Diffstat (limited to 'contrib/llvm-project/llvm/lib/CodeGen')
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp993
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h186
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AllocationOrder.cpp53
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AllocationOrder.h124
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/Analysis.cpp870
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp93
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp132
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp714
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp73
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AddressPool.h65
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp4166
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp305
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp519
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h145
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp3444
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h530
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIE.cpp872
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp440
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHash.h112
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def55
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp602
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp427
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h290
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp47
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h200
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp151
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp1708
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h380
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp3638
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h849
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfException.h110
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp740
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h439
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp132
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h185
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp128
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h66
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp1851
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h390
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp850
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h165
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp117
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp182
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp56
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h47
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp98
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WasmException.h44
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp125
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h57
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinException.cpp1345
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinException.h121
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp2576
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/AtomicExpandPass.cpp1974
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSections.cpp406
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp200
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp34
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp2046
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.h200
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp637
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/BreakFalseDeps.cpp305
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CFGuardLongjmp.cpp120
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CFIFixup.cpp225
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CFIInstrInserter.cpp449
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CalcSpillWeights.cpp323
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CallBrPrepare.cpp231
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CallingConvLower.cpp292
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp143
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CodeGenCommonISel.cpp293
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CodeGenPassBuilder.cpp25
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp8660
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp727
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp2077
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp698
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.h112
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/DFAPacketizer.cpp288
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp151
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/DetectDeadLanes.cpp566
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/DwarfEHPrepare.cpp380
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/EHContGuardCatchret.cpp82
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/EarlyIfConversion.cpp1244
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/EdgeBundles.cpp101
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ExecutionDomainFix.cpp470
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ExpandLargeDivRem.cpp139
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp664
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ExpandMemCmp.cpp916
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp161
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ExpandReductions.cpp240
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ExpandVectorPredication.cpp769
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/FEntryInserter.cpp50
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/FaultMaps.cpp114
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/FinalizeISel.cpp75
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp628
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/FuncletLayout.cpp62
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GCMetadata.cpp150
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GCMetadataPrinter.cpp21
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GCRootLowering.cpp328
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp452
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp354
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp1241
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Combiner.cpp166
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp6029
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp68
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp48
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp771
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp24
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp3698
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp687
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp330
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp16
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp383
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp213
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp112
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp385
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp8119
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp435
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp971
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Localizer.cpp220
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp113
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp1318
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp1110
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp1381
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp706
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/HardwareLoops.cpp606
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/IfConversion.cpp2360
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ImplicitNullChecks.cpp818
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp270
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp1691
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/InterferenceCache.cpp258
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/InterferenceCache.h243
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/InterleavedAccessPass.cpp538
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp1363
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/IntrinsicLowering.cpp474
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/JMCInstrumenter.cpp233
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/KCFI.cpp111
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LLVMTargetMachine.cpp301
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp147
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp98
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LexicalScopes.cpp347
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp4230
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h1441
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp139
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h43
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp2405
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp1970
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.h68
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveInterval.cpp1409
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveIntervalCalc.cpp196
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveIntervalUnion.cpp215
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveIntervals.cpp1748
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LivePhysRegs.cpp340
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveRangeCalc.cpp451
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp508
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveRangeShrink.cpp245
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h61
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveRegMatrix.cpp248
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveRegUnits.cpp159
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveStacks.cpp85
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp888
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp442
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LoopTraversal.cpp75
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LowLevelType.cpp66
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LowLevelTypeUtils.cpp85
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/LowerEmuTLS.cpp158
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MBFIWrapper.cpp62
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp423
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRFSDiscriminator.cpp202
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRNamerPass.cpp75
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MILexer.cpp767
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MILexer.h253
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MIParser.cpp3620
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MIRParser.cpp1133
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRPrinter.cpp989
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRPrintingPass.cpp70
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp406
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp174
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRVRegNamerUtils.h97
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MIRYamlMapping.cpp43
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp1164
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MLRegallocEvictAdvisor.h93
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MLRegallocPriorityAdvisor.cpp357
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineBasicBlock.cpp1740
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp291
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineBlockPlacement.cpp3701
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp79
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineCFGPrinter.cpp95
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineCSE.cpp947
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineCheckDebugify.cpp127
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp769
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineCopyPropagation.cpp1424
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineCycleAnalysis.cpp151
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineDebugify.cpp207
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineDominanceFrontier.cpp53
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineDominators.cpp152
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineFrameInfo.cpp256
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp1521
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionPass.cpp188
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp71
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionSplitter.cpp224
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp2462
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineInstrBundle.cpp387
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineLICM.cpp1522
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp249
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineLoopInfo.cpp214
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineLoopUtils.cpp134
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineModuleInfo.cpp247
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp43
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineModuleSlotTracker.cpp80
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp1256
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp97
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp1213
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachinePassManager.cpp108
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp3276
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachinePostDominators.cpp79
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineRegionInfo.cpp149
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineRegisterInfo.cpp667
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineSSAContext.cpp82
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineSSAUpdater.cpp373
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineScheduler.cpp4332
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp1892
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineSizeOpts.cpp52
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineStableHash.cpp236
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineStripDebug.cpp108
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineTraceMetrics.cpp1356
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp264
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp3465
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MacroFusion.cpp213
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp2208
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/MultiHazardRecognizer.cpp92
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/NonRelocatableStringpool.cpp55
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/OptimizePHIs.cpp206
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PHIElimination.cpp759
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PHIEliminationUtils.cpp64
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PHIEliminationUtils.h24
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ParallelCG.cpp97
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PatchableFunction.cpp98
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PeepholeOptimizer.cpp2128
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp96
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PostRASchedulerList.cpp696
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp415
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ProcessImplicitDefs.cpp168
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp1580
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PseudoProbeInserter.cpp150
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/PseudoSourceValue.cpp146
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp1799
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp1177
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp444
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ReachingDefAnalysis.cpp712
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocBase.cpp192
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocBase.h131
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocBasic.cpp339
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp311
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h223
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp1673
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp2669
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.h447
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocPBQP.cpp954
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp112
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h96
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocScore.cpp121
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegAllocScore.h73
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegUsageInfoCollector.cpp215
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp154
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegisterBank.cpp112
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegisterBankInfo.cpp817
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegisterClassInfo.cpp236
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp4220
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.h114
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegisterPressure.cpp1392
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegisterScavenging.cpp686
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RegisterUsageInfo.cpp99
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp227
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/RenameIndependentSubregs.cpp405
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ReplaceWithVeclib.cpp251
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp97
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SafeStack.cpp939
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SafeStackLayout.cpp152
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SafeStackLayout.h84
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp87
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAG.cpp754
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp1531
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp92
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp241
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp1046
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp27593
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp2382
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp560
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp1414
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h169
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp5533
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp3207
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp5977
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp1060
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h1137
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp601
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp1771
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp7262
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp624
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h264
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp819
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp3210
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp1086
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h193
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp271
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp12710
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp324
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp11977
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h803
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp1096
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp3894
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp314
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp17
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp1313
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h126
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp10800
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ShadowStackGCLowering.cpp386
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ShrinkWrap.cpp997
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SjLjEHPrepare.cpp507
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SlotIndexes.cpp272
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SpillPlacement.cpp398
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SpillPlacement.h172
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SplitKit.cpp1888
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SplitKit.h557
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/StackColoring.cpp1379
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp254
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp171
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/StackMaps.cpp760
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/StackProtector.cpp660
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp550
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp311
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/SwitchLoweringUtils.cpp494
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TailDuplication.cpp102
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp1071
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp168
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp1726
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringBase.cpp2405
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp2680
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetOptionsImpl.cpp56
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetPassConfig.cpp1569
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetRegisterInfo.cpp678
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetSchedule.cpp343
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TargetSubtargetInfo.cpp60
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp1967
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/TypePromotion.cpp1047
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/UnreachableBlockElim.cpp196
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/VLIWMachineScheduler.cpp1007
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/ValueTypes.cpp642
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/VirtRegMap.cpp647
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/WasmEHPrepare.cpp377
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/WinEHPrepare.cpp1396
-rw-r--r--contrib/llvm-project/llvm/lib/CodeGen/XRayInstrumentation.cpp269
350 files changed, 334221 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
new file mode 100644
index 000000000000..886c4db069f1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -0,0 +1,993 @@
+//===- AggressiveAntiDepBreaker.cpp - Anti-dep breaker --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AggressiveAntiDepBreaker class, which
+// implements register anti-dependence breaking during post-RA
+// scheduling. It attempts to break all anti-dependencies within a
+// block.
+//
+//===----------------------------------------------------------------------===//
+
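+// Editor's note (illustrative only, not part of the upstream file or this
+// patch): an anti-dependence is a write-after-read ordering constraint.
+// With hypothetical registers it looks like:
+//
+//   %r1 = LDR ...          ; def of %r1
+//   ...  = ADD ..., %r1    ; last use of %r1
+//   %r1 = MOV ...          ; anti-dependence: must stay below the ADD
+//
+// Renaming the second definition (and its later uses) to a free register,
+// say %r2, removes the ordering constraint and gives the post-RA scheduler
+// more freedom to reorder the instructions.
+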
+#include "AggressiveAntiDepBreaker.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-sched"
+
+// If DebugDiv > 0 then only break antidep with (ID % DebugDiv) == DebugMod
+static cl::opt<int>
+DebugDiv("agg-antidep-debugdiv",
+ cl::desc("Debug control for aggressive anti-dep breaker"),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<int>
+DebugMod("agg-antidep-debugmod",
+ cl::desc("Debug control for aggressive anti-dep breaker"),
+ cl::init(0), cl::Hidden);
+
+AggressiveAntiDepState::AggressiveAntiDepState(const unsigned TargetRegs,
+ MachineBasicBlock *BB)
+ : NumTargetRegs(TargetRegs), GroupNodes(TargetRegs, 0),
+ GroupNodeIndices(TargetRegs, 0), KillIndices(TargetRegs, 0),
+ DefIndices(TargetRegs, 0) {
+ const unsigned BBSize = BB->size();
+ for (unsigned i = 0; i < NumTargetRegs; ++i) {
+ // Initialize all registers to be in their own group. Initially we
+ // assign the register to the same-indexed GroupNode.
+ GroupNodeIndices[i] = i;
+ // Initialize the indices to indicate that no registers are live.
+ KillIndices[i] = ~0u;
+ DefIndices[i] = BBSize;
+ }
+}
+
+unsigned AggressiveAntiDepState::GetGroup(unsigned Reg) {
+ unsigned Node = GroupNodeIndices[Reg];
+ while (GroupNodes[Node] != Node)
+ Node = GroupNodes[Node];
+
+ return Node;
+}
+
+void AggressiveAntiDepState::GetGroupRegs(
+ unsigned Group,
+ std::vector<unsigned> &Regs,
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference> *RegRefs)
+{
+ for (unsigned Reg = 0; Reg != NumTargetRegs; ++Reg) {
+ if ((GetGroup(Reg) == Group) && (RegRefs->count(Reg) > 0))
+ Regs.push_back(Reg);
+ }
+}
+
+unsigned AggressiveAntiDepState::UnionGroups(unsigned Reg1, unsigned Reg2) {
+ assert(GroupNodes[0] == 0 && "GroupNode 0 not parent!");
+ assert(GroupNodeIndices[0] == 0 && "Reg 0 not in Group 0!");
+
+ // find group for each register
+ unsigned Group1 = GetGroup(Reg1);
+ unsigned Group2 = GetGroup(Reg2);
+
+ // if either group is 0, then that must become the parent
+ unsigned Parent = (Group1 == 0) ? Group1 : Group2;
+ unsigned Other = (Parent == Group1) ? Group2 : Group1;
+ GroupNodes.at(Other) = Parent;
+ return Parent;
+}
+
+unsigned AggressiveAntiDepState::LeaveGroup(unsigned Reg) {
+ // Create a new GroupNode for Reg. Reg's existing GroupNode must
+ // stay as is because there could be other GroupNodes referring to
+ // it.
+ unsigned idx = GroupNodes.size();
+ GroupNodes.push_back(idx);
+ GroupNodeIndices[Reg] = idx;
+ return idx;
+}
+
+bool AggressiveAntiDepState::IsLive(unsigned Reg) {
+ // KillIndex must be defined and DefIndex not defined for a register
+ // to be live.
+ return((KillIndices[Reg] != ~0u) && (DefIndices[Reg] == ~0u));
+}
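+
+// Editor's sketch (illustrative only, not part of the upstream file or this
+// patch): GetGroup/UnionGroups/LeaveGroup above form a small union-find
+// forest without path compression, with group 0 reserved as the
+// "live-out / not renameable" sentinel. With hypothetical register numbers:
+//
+//   GetGroup(5) == 5;      // every register starts in its own group
+//   UnionGroups(5, 0);     // group 0 always becomes the parent, so
+//                          // GetGroup(5) == 0 and reg 5 is never renamed
+//   LeaveGroup(5);         // allocates a fresh GroupNode, so reg 5 is again
+//                          // in a singleton group of its own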
+
+AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
+ MachineFunction &MFi, const RegisterClassInfo &RCI,
+ TargetSubtargetInfo::RegClassVector &CriticalPathRCs)
+ : MF(MFi), MRI(MF.getRegInfo()), TII(MF.getSubtarget().getInstrInfo()),
+ TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI) {
+ /* Collect a bitset of all registers whose anti-dependencies are only
+ broken if they are on the critical path. */
+ for (unsigned i = 0, e = CriticalPathRCs.size(); i < e; ++i) {
+ BitVector CPSet = TRI->getAllocatableSet(MF, CriticalPathRCs[i]);
+ if (CriticalPathSet.none())
+ CriticalPathSet = CPSet;
+ else
+ CriticalPathSet |= CPSet;
+ }
+
+ LLVM_DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
+ LLVM_DEBUG(for (unsigned r
+ : CriticalPathSet.set_bits()) dbgs()
+ << " " << printReg(r, TRI));
+ LLVM_DEBUG(dbgs() << '\n');
+}
+
+AggressiveAntiDepBreaker::~AggressiveAntiDepBreaker() {
+ delete State;
+}
+
+void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
+ assert(!State);
+ State = new AggressiveAntiDepState(TRI->getNumRegs(), BB);
+
+ bool IsReturnBlock = BB->isReturnBlock();
+ std::vector<unsigned> &KillIndices = State->GetKillIndices();
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+
+ // Examine the live-in regs of all successors.
+ for (MachineBasicBlock *Succ : BB->successors())
+ for (const auto &LI : Succ->liveins()) {
+ for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ State->UnionGroups(Reg, 0);
+ KillIndices[Reg] = BB->size();
+ DefIndices[Reg] = ~0u;
+ }
+ }
+
+ // Mark live-out callee-saved registers. In a return block this is
+ // all callee-saved registers. In non-return this is any
+ // callee-saved register that is not saved in the prolog.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ BitVector Pristine = MFI.getPristineRegs(MF);
+ for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
+ ++I) {
+ unsigned Reg = *I;
+ if (!IsReturnBlock && !Pristine.test(Reg))
+ continue;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ State->UnionGroups(AliasReg, 0);
+ KillIndices[AliasReg] = BB->size();
+ DefIndices[AliasReg] = ~0u;
+ }
+ }
+}
+
+void AggressiveAntiDepBreaker::FinishBlock() {
+ delete State;
+ State = nullptr;
+}
+
+void AggressiveAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) {
+ assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+ std::set<unsigned> PassthruRegs;
+ GetPassthruRegs(MI, PassthruRegs);
+ PrescanInstruction(MI, Count, PassthruRegs);
+ ScanInstruction(MI, Count);
+
+ LLVM_DEBUG(dbgs() << "Observe: ");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "\tRegs:");
+
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ for (unsigned Reg = 1; Reg != TRI->getNumRegs(); ++Reg) {
+ // If Reg is currently live, then mark that it can't be renamed as
+ // we don't know the extent of its live-range anymore (now that it
+ // has been scheduled). If it is not live but was defined in the
+ // previous schedule region, then set its def index to the most
+ // conservative location (i.e. the beginning of the previous
+ // schedule region).
+ if (State->IsLive(Reg)) {
+ LLVM_DEBUG(if (State->GetGroup(Reg) != 0) dbgs()
+ << " " << printReg(Reg, TRI) << "=g" << State->GetGroup(Reg)
+ << "->g0(region live-out)");
+ State->UnionGroups(Reg, 0);
+ } else if ((DefIndices[Reg] < InsertPosIndex)
+ && (DefIndices[Reg] >= Count)) {
+ DefIndices[Reg] = Count;
+ }
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+}
+
+bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr &MI,
+ MachineOperand &MO) {
+ if (!MO.isReg() || !MO.isImplicit())
+ return false;
+
+ Register Reg = MO.getReg();
+ if (Reg == 0)
+ return false;
+
+ MachineOperand *Op = nullptr;
+ if (MO.isDef())
+ Op = MI.findRegisterUseOperand(Reg, true);
+ else
+ Op = MI.findRegisterDefOperand(Reg);
+
+ return(Op && Op->isImplicit());
+}
+
+void AggressiveAntiDepBreaker::GetPassthruRegs(
+ MachineInstr &MI, std::set<unsigned> &PassthruRegs) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ if ((MO.isDef() && MI.isRegTiedToUseOperand(i)) ||
+ IsImplicitDefUse(MI, MO)) {
+ const Register Reg = MO.getReg();
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
+ PassthruRegs.insert(SubReg);
+ }
+ }
+}
+
+/// AntiDepEdges - Return in Edges the anti- and output-dependencies
+/// in SU that we want to consider for breaking.
+static void AntiDepEdges(const SUnit *SU, std::vector<const SDep *> &Edges) {
+ SmallSet<unsigned, 4> RegSet;
+ for (const SDep &Pred : SU->Preds) {
+ if ((Pred.getKind() == SDep::Anti) || (Pred.getKind() == SDep::Output)) {
+ if (RegSet.insert(Pred.getReg()).second)
+ Edges.push_back(&Pred);
+ }
+ }
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static const SUnit *CriticalPathStep(const SUnit *SU) {
+ const SDep *Next = nullptr;
+ unsigned NextDepth = 0;
+ // Find the predecessor edge with the greatest depth.
+ if (SU) {
+ for (const SDep &Pred : SU->Preds) {
+ const SUnit *PredSU = Pred.getSUnit();
+ unsigned PredLatency = Pred.getLatency();
+ unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+ // In the case of a latency tie, prefer an anti-dependency edge over
+ // other types of edges.
+ if (NextDepth < PredTotalLatency ||
+ (NextDepth == PredTotalLatency && Pred.getKind() == SDep::Anti)) {
+ NextDepth = PredTotalLatency;
+ Next = &Pred;
+ }
+ }
+ }
+
+ return (Next) ? Next->getSUnit() : nullptr;
+}
+
+void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
+ const char *tag,
+ const char *header,
+ const char *footer) {
+ std::vector<unsigned> &KillIndices = State->GetKillIndices();
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // FIXME: We must leave subregisters of live super registers as live, so that
+ // we don't clear out the register tracking information for subregisters of
+ // super registers we're still tracking (and with which we're unioning
+ // subregister definitions).
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ if (TRI->isSuperRegister(Reg, *AI) && State->IsLive(*AI)) {
+ LLVM_DEBUG(if (!header && footer) dbgs() << footer);
+ return;
+ }
+
+ if (!State->IsLive(Reg)) {
+ KillIndices[Reg] = KillIdx;
+ DefIndices[Reg] = ~0u;
+ RegRefs.erase(Reg);
+ State->LeaveGroup(Reg);
+ LLVM_DEBUG(if (header) {
+ dbgs() << header << printReg(Reg, TRI);
+ header = nullptr;
+ });
+ LLVM_DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag);
+ // Repeat for subregisters. Note that we only do this if the superregister
+ // was not live because otherwise, regardless of whether we have an explicit
+ // use of the subregister, the subregister's contents are needed for the
+ // uses of the superregister.
+ for (MCPhysReg SubregReg : TRI->subregs(Reg)) {
+ if (!State->IsLive(SubregReg)) {
+ KillIndices[SubregReg] = KillIdx;
+ DefIndices[SubregReg] = ~0u;
+ RegRefs.erase(SubregReg);
+ State->LeaveGroup(SubregReg);
+ LLVM_DEBUG(if (header) {
+ dbgs() << header << printReg(Reg, TRI);
+ header = nullptr;
+ });
+ LLVM_DEBUG(dbgs() << " " << printReg(SubregReg, TRI) << "->g"
+ << State->GetGroup(SubregReg) << tag);
+ }
+ }
+ }
+
+ LLVM_DEBUG(if (!header && footer) dbgs() << footer);
+}
+
+void AggressiveAntiDepBreaker::PrescanInstruction(
+ MachineInstr &MI, unsigned Count, std::set<unsigned> &PassthruRegs) {
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // Handle dead defs by simulating a last-use of the register just
+ // after the def. A dead def can occur because the def is truly
+ // dead, or because only a subregister is live at the def. If we
+ // don't do this the dead def will be incorrectly merged into the
+ // previous def.
+ for (const MachineOperand &MO : MI.all_defs()) {
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ HandleLastUse(Reg, Count + 1, "", "\tDead Def: ", "\n");
+ }
+
+ LLVM_DEBUG(dbgs() << "\tDef Groups:");
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g"
+ << State->GetGroup(Reg));
+
+ // If MI's defs have a special allocation requirement, don't allow
+ // any def registers to be changed. Also assume all registers
+ // defined in a call must not be changed (ABI). Inline assembly may
+ // reference either system calls or the register directly. Skip it until we
+ // can tell user-specified registers from compiler-specified ones.
+ if (MI.isCall() || MI.hasExtraDefRegAllocReq() || TII->isPredicated(MI) ||
+ MI.isInlineAsm()) {
+ LLVM_DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+ State->UnionGroups(Reg, 0);
+ }
+
+ // Any aliases that are live at this point are completely or
+ // partially defined here, so group those aliases with Reg.
+ for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ if (State->IsLive(AliasReg)) {
+ State->UnionGroups(Reg, AliasReg);
+ LLVM_DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via "
+ << printReg(AliasReg, TRI) << ")");
+ }
+ }
+
+ // Note register reference...
+ const TargetRegisterClass *RC = nullptr;
+ if (i < MI.getDesc().getNumOperands())
+ RC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
+ AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
+ RegRefs.insert(std::make_pair(Reg, RR));
+ }
+
+ LLVM_DEBUG(dbgs() << '\n');
+
+ // Scan the register defs for this instruction and update
+ // live-ranges.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+ // Ignore KILLs and passthru registers for liveness...
+ if (MI.isKill() || (PassthruRegs.count(Reg) != 0))
+ continue;
+
+ // Update def for Reg and aliases.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ // We need to be careful here not to define already-live super registers.
+ // If the super register is already live, then this definition is not
+ // a definition of the whole super register (just a partial insertion
+ // into it). Earlier subregister definitions (which we've not yet visited
+ // because we're iterating bottom-up) need to be linked to the same group
+ // as this definition.
+ if (TRI->isSuperRegister(Reg, *AI) && State->IsLive(*AI))
+ continue;
+
+ DefIndices[*AI] = Count;
+ }
+ }
+}
+
+void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
+ unsigned Count) {
+ LLVM_DEBUG(dbgs() << "\tUse Groups:");
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // If MI's uses have a special allocation requirement, don't allow
+ // any use registers to be changed. Also assume all registers
+ // used in a call must not be changed (ABI).
+ // Inline Assembly register uses also cannot be safely changed.
+ // FIXME: The issue with predicated instructions is more complex. We are being
+ // conservative here because the kill markers cannot be trusted after
+ // if-conversion:
+ // %r6 = LDR %sp, %reg0, 92, 14, %reg0; mem:LD4[FixedStack14]
+ // ...
+ // STR %r0, killed %r6, %reg0, 0, 0, %cpsr; mem:ST4[%395]
+ // %r6 = LDR %sp, %reg0, 100, 0, %cpsr; mem:LD4[FixedStack12]
+ // STR %r0, killed %r6, %reg0, 0, 14, %reg0; mem:ST4[%396](align=8)
+ //
+ // The first R6 kill is not really a kill since it's killed by a predicated
+ // instruction which may not be executed. The second R6 def may or may not
+ // re-define R6 so it's not safe to change it since the last R6 use cannot be
+ // changed.
+ bool Special = MI.isCall() || MI.hasExtraSrcRegAllocReq() ||
+ TII->isPredicated(MI) || MI.isInlineAsm();
+
+ // Scan the register uses for this instruction and update
+ // live-ranges, groups and RegRefs.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g"
+ << State->GetGroup(Reg));
+
+ // If it wasn't previously live but now it is, this is a kill. Forget
+ // the previous live-range information and start a new live-range
+ // for the register.
+ HandleLastUse(Reg, Count, "(last-use)");
+
+ if (Special) {
+ LLVM_DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+ State->UnionGroups(Reg, 0);
+ }
+
+ // Note register reference...
+ const TargetRegisterClass *RC = nullptr;
+ if (i < MI.getDesc().getNumOperands())
+ RC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
+ AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
+ RegRefs.insert(std::make_pair(Reg, RR));
+ }
+
+ LLVM_DEBUG(dbgs() << '\n');
+
+ // Form a group of all defs and uses of a KILL instruction to ensure
+ // that all registers are renamed as a group.
+ if (MI.isKill()) {
+ LLVM_DEBUG(dbgs() << "\tKill Group:");
+
+ unsigned FirstReg = 0;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ if (FirstReg != 0) {
+ LLVM_DEBUG(dbgs() << "=" << printReg(Reg, TRI));
+ State->UnionGroups(FirstReg, Reg);
+ } else {
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI));
+ FirstReg = Reg;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "->g" << State->GetGroup(FirstReg) << '\n');
+ }
+}
+
+BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) {
+ BitVector BV(TRI->getNumRegs(), false);
+ bool first = true;
+
+ // Check all references that need rewriting for Reg. For each, use
+ // the corresponding register class to narrow the set of registers
+ // that are appropriate for renaming.
+ for (const auto &Q : make_range(State->GetRegRefs().equal_range(Reg))) {
+ const TargetRegisterClass *RC = Q.second.RC;
+ if (!RC) continue;
+
+ BitVector RCBV = TRI->getAllocatableSet(MF, RC);
+ if (first) {
+ BV |= RCBV;
+ first = false;
+ } else {
+ BV &= RCBV;
+ }
+
+ LLVM_DEBUG(dbgs() << " " << TRI->getRegClassName(RC));
+ }
+
+ return BV;
+}
+
+bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
+ unsigned AntiDepGroupIndex,
+ RenameOrderType& RenameOrder,
+ std::map<unsigned, unsigned> &RenameMap) {
+ std::vector<unsigned> &KillIndices = State->GetKillIndices();
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // Collect all referenced registers in the same group as
+ // AntiDepReg. These all need to be renamed together if we are to
+ // break the anti-dependence.
+ std::vector<unsigned> Regs;
+ State->GetGroupRegs(AntiDepGroupIndex, Regs, &RegRefs);
+ assert(!Regs.empty() && "Empty register group!");
+ if (Regs.empty())
+ return false;
+
+ // Find the "superest" register in the group. At the same time,
+ // collect the BitVector of registers that can be used to rename
+ // each register.
+ LLVM_DEBUG(dbgs() << "\tRename Candidates for Group g" << AntiDepGroupIndex
+ << ":\n");
+ std::map<unsigned, BitVector> RenameRegisterMap;
+ unsigned SuperReg = 0;
+ for (unsigned Reg : Regs) {
+ if ((SuperReg == 0) || TRI->isSuperRegister(SuperReg, Reg))
+ SuperReg = Reg;
+
+ // If Reg has any references, then collect possible rename regs
+ if (RegRefs.count(Reg) > 0) {
+ LLVM_DEBUG(dbgs() << "\t\t" << printReg(Reg, TRI) << ":");
+
+ BitVector &BV = RenameRegisterMap[Reg];
+ assert(BV.empty());
+ BV = GetRenameRegisters(Reg);
+
+ LLVM_DEBUG({
+ dbgs() << " ::";
+ for (unsigned r : BV.set_bits())
+ dbgs() << " " << printReg(r, TRI);
+ dbgs() << "\n";
+ });
+ }
+ }
+
+ // All group registers should be a subreg of SuperReg.
+ for (unsigned Reg : Regs) {
+ if (Reg == SuperReg) continue;
+ bool IsSub = TRI->isSubRegister(SuperReg, Reg);
+ // FIXME: remove this once PR18663 has been properly fixed. For now,
+ // return a conservative answer:
+ // assert(IsSub && "Expecting group subregister");
+ if (!IsSub)
+ return false;
+ }
+
+#ifndef NDEBUG
+ // If DebugDiv > 0 then only rename (renamecnt % DebugDiv) == DebugMod
+ if (DebugDiv > 0) {
+ static int renamecnt = 0;
+ if (renamecnt++ % DebugDiv != DebugMod)
+ return false;
+
+ dbgs() << "*** Performing rename " << printReg(SuperReg, TRI)
+ << " for debug ***\n";
+ }
+#endif
+
+ // Check each possible rename register for SuperReg in round-robin
+ // order. If that register is available, and the corresponding
+ // registers are available for the other group subregisters, then we
+ // can use those registers to rename.
+
+ // FIXME: Using getMinimalPhysRegClass is very conservative. We should
+ // check every use of the register and find the largest register class
+ // that can be used in all of them.
+ const TargetRegisterClass *SuperRC =
+ TRI->getMinimalPhysRegClass(SuperReg, MVT::Other);
+
+ ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(SuperRC);
+ if (Order.empty()) {
+ LLVM_DEBUG(dbgs() << "\tEmpty Super Regclass!!\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "\tFind Registers:");
+
+ RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
+
+ unsigned OrigR = RenameOrder[SuperRC];
+ unsigned EndR = ((OrigR == Order.size()) ? 0 : OrigR);
+ unsigned R = OrigR;
+ do {
+ if (R == 0) R = Order.size();
+ --R;
+ const unsigned NewSuperReg = Order[R];
+ // Don't consider non-allocatable registers
+ if (!MRI.isAllocatable(NewSuperReg)) continue;
+ // Don't replace a register with itself.
+ if (NewSuperReg == SuperReg) continue;
+
+ LLVM_DEBUG(dbgs() << " [" << printReg(NewSuperReg, TRI) << ':');
+ RenameMap.clear();
+
+ // For each referenced group register (which must be a SuperReg or
+ // a subregister of SuperReg), find the corresponding subregister
+ // of NewSuperReg and make sure it is free to be renamed.
+ for (unsigned Reg : Regs) {
+ unsigned NewReg = 0;
+ if (Reg == SuperReg) {
+ NewReg = NewSuperReg;
+ } else {
+ unsigned NewSubRegIdx = TRI->getSubRegIndex(SuperReg, Reg);
+ if (NewSubRegIdx != 0)
+ NewReg = TRI->getSubReg(NewSuperReg, NewSubRegIdx);
+ }
+
+ LLVM_DEBUG(dbgs() << " " << printReg(NewReg, TRI));
+
+ // Check if Reg can be renamed to NewReg.
+ if (!RenameRegisterMap[Reg].test(NewReg)) {
+ LLVM_DEBUG(dbgs() << "(no rename)");
+ goto next_super_reg;
+ }
+
+ // If NewReg is dead and NewReg's most recent def is not before
+ // Reg's kill, it's safe to replace Reg with NewReg. We
+ // must also check all aliases of NewReg, because we can't define a
+ // register when any sub or super is already live.
+ if (State->IsLive(NewReg) || (KillIndices[Reg] > DefIndices[NewReg])) {
+ LLVM_DEBUG(dbgs() << "(live)");
+ goto next_super_reg;
+ } else {
+ bool found = false;
+ for (MCRegAliasIterator AI(NewReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ if (State->IsLive(AliasReg) ||
+ (KillIndices[Reg] > DefIndices[AliasReg])) {
+ LLVM_DEBUG(dbgs()
+ << "(alias " << printReg(AliasReg, TRI) << " live)");
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ goto next_super_reg;
+ }
+
+ // We cannot rename 'Reg' to 'NewReg' if one of the uses of 'Reg' also
+ // defines 'NewReg' via an early-clobber operand.
+ for (const auto &Q : make_range(RegRefs.equal_range(Reg))) {
+ MachineInstr *UseMI = Q.second.Operand->getParent();
+ int Idx = UseMI->findRegisterDefOperandIdx(NewReg, false, true, TRI);
+ if (Idx == -1)
+ continue;
+
+ if (UseMI->getOperand(Idx).isEarlyClobber()) {
+ LLVM_DEBUG(dbgs() << "(ec)");
+ goto next_super_reg;
+ }
+ }
+
+ // Also, we cannot rename 'Reg' to 'NewReg' if the instruction defining
+ // 'Reg' is an early-clobber define and that instruction also uses
+ // 'NewReg'.
+ for (const auto &Q : make_range(RegRefs.equal_range(Reg))) {
+ if (!Q.second.Operand->isDef() || !Q.second.Operand->isEarlyClobber())
+ continue;
+
+ MachineInstr *DefMI = Q.second.Operand->getParent();
+ if (DefMI->readsRegister(NewReg, TRI)) {
+ LLVM_DEBUG(dbgs() << "(ec)");
+ goto next_super_reg;
+ }
+ }
+
+ // Record that 'Reg' can be renamed to 'NewReg'.
+ RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg));
+ }
+
+ // If we fall-out here, then every register in the group can be
+ // renamed, as recorded in RenameMap.
+ RenameOrder.erase(SuperRC);
+ RenameOrder.insert(RenameOrderType::value_type(SuperRC, R));
+ LLVM_DEBUG(dbgs() << "]\n");
+ return true;
+
+ next_super_reg:
+ LLVM_DEBUG(dbgs() << ']');
+ } while (R != EndR);
+
+ LLVM_DEBUG(dbgs() << '\n');
+
+ // No registers are free and available!
+ return false;
+}
+
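+// Editor's note (illustrative only, not part of the upstream file or this
+// patch): RenameOrder acts as a per-register-class round-robin cursor. For a
+// hypothetical class with allocation order { R0, R1, R2, R3 }, the first
+// search walks R3, R2, R1, R0; if it succeeds at, say, R1 (index 1),
+// RenameOrder remembers index 1 and the next rename for the same class
+// resumes just below it (wrapping around), spreading renames across the
+// class instead of reusing the same register repeatedly.
+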
+/// BreakAntiDependencies - Identify anti-dependencies within the
+/// ScheduleDAG and break them by renaming registers.
+unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
+ const std::vector<SUnit> &SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) {
+ std::vector<unsigned> &KillIndices = State->GetKillIndices();
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // The code below assumes that there is at least one instruction,
+ // so just duck out immediately if the block is empty.
+ if (SUnits.empty()) return 0;
+
+ // For each regclass the next register to use for renaming.
+ RenameOrderType RenameOrder;
+
+ // ...need a map from MI to SUnit.
+ std::map<MachineInstr *, const SUnit *> MISUnitMap;
+ for (const SUnit &SU : SUnits)
+ MISUnitMap.insert(std::make_pair(SU.getInstr(), &SU));
+
+ // Track progress along the critical path through the SUnit graph as
+ // we walk the instructions. This is needed for regclasses that only
+ // break critical-path anti-dependencies.
+ const SUnit *CriticalPathSU = nullptr;
+ MachineInstr *CriticalPathMI = nullptr;
+ if (CriticalPathSet.any()) {
+ for (const SUnit &SU : SUnits) {
+ if (!CriticalPathSU ||
+ ((SU.getDepth() + SU.Latency) >
+ (CriticalPathSU->getDepth() + CriticalPathSU->Latency))) {
+ CriticalPathSU = &SU;
+ }
+ }
+ assert(CriticalPathSU && "Failed to find SUnit critical path");
+ CriticalPathMI = CriticalPathSU->getInstr();
+ }
+
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "\n===== Aggressive anti-dependency breaking\n");
+ LLVM_DEBUG(dbgs() << "Available regs:");
+ for (unsigned Reg = 1; Reg < TRI->getNumRegs(); ++Reg) {
+ if (!State->IsLive(Reg))
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI));
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+#endif
+
+ BitVector RegAliases(TRI->getNumRegs());
+
+ // Attempt to break anti-dependence edges. Walk the instructions
+ // from the bottom up, tracking information about liveness as we go
+ // to help determine which registers are available.
+ unsigned Broken = 0;
+ unsigned Count = InsertPosIndex - 1;
+ for (MachineBasicBlock::iterator I = End, E = Begin;
+ I != E; --Count) {
+ MachineInstr &MI = *--I;
+
+ if (MI.isDebugInstr())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Anti: ");
+ LLVM_DEBUG(MI.dump());
+
+ std::set<unsigned> PassthruRegs;
+ GetPassthruRegs(MI, PassthruRegs);
+
+ // Process the defs in MI...
+ PrescanInstruction(MI, Count, PassthruRegs);
+
+ // The dependence edges that represent anti- and output-
+ // dependencies that are candidates for breaking.
+ std::vector<const SDep *> Edges;
+ const SUnit *PathSU = MISUnitMap[&MI];
+ AntiDepEdges(PathSU, Edges);
+
+ // If MI is not on the critical path, then we don't rename
+ // registers in the CriticalPathSet.
+ BitVector *ExcludeRegs = nullptr;
+ if (&MI == CriticalPathMI) {
+ CriticalPathSU = CriticalPathStep(CriticalPathSU);
+ CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : nullptr;
+ } else if (CriticalPathSet.any()) {
+ ExcludeRegs = &CriticalPathSet;
+ }
+
+ // Ignore KILL instructions (they form a group in ScanInstruction
+ // but don't cause any anti-dependence breaking themselves)
+ if (!MI.isKill()) {
+ // Attempt to break each anti-dependency...
+ for (const SDep *Edge : Edges) {
+ SUnit *NextSU = Edge->getSUnit();
+
+ if ((Edge->getKind() != SDep::Anti) &&
+ (Edge->getKind() != SDep::Output)) continue;
+
+ unsigned AntiDepReg = Edge->getReg();
+ LLVM_DEBUG(dbgs() << "\tAntidep reg: " << printReg(AntiDepReg, TRI));
+ assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+
+ if (!MRI.isAllocatable(AntiDepReg)) {
+ // Don't break anti-dependencies on non-allocatable registers.
+ LLVM_DEBUG(dbgs() << " (non-allocatable)\n");
+ continue;
+ } else if (ExcludeRegs && ExcludeRegs->test(AntiDepReg)) {
+ // Don't break anti-dependencies for critical path registers
+ // if not on the critical path
+ LLVM_DEBUG(dbgs() << " (not critical-path)\n");
+ continue;
+ } else if (PassthruRegs.count(AntiDepReg) != 0) {
+ // If the anti-dep register liveness "passes-thru", then
+ // don't try to change it. It will be changed along with
+ // the use if required to break an earlier antidep.
+ LLVM_DEBUG(dbgs() << " (passthru)\n");
+ continue;
+ } else {
+ // No anti-dep breaking for implicit deps
+ MachineOperand *AntiDepOp = MI.findRegisterDefOperand(AntiDepReg);
+ assert(AntiDepOp && "Can't find index for defined register operand");
+ if (!AntiDepOp || AntiDepOp->isImplicit()) {
+ LLVM_DEBUG(dbgs() << " (implicit)\n");
+ continue;
+ }
+
+ // If the SUnit has other dependencies on the SUnit that
+ // it anti-depends on, don't bother breaking the
+ // anti-dependency since those edges would prevent such
+ // units from being scheduled past each other
+ // regardless.
+ //
+ // Also, if there are dependencies on other SUnits with the
+ // same register as the anti-dependency, don't attempt to
+ // break it.
+ for (const SDep &Pred : PathSU->Preds) {
+ if (Pred.getSUnit() == NextSU ? (Pred.getKind() != SDep::Anti ||
+ Pred.getReg() != AntiDepReg)
+ : (Pred.getKind() == SDep::Data &&
+ Pred.getReg() == AntiDepReg)) {
+ AntiDepReg = 0;
+ break;
+ }
+ }
+ for (const SDep &Pred : PathSU->Preds) {
+ if ((Pred.getSUnit() == NextSU) && (Pred.getKind() != SDep::Anti) &&
+ (Pred.getKind() != SDep::Output)) {
+ LLVM_DEBUG(dbgs() << " (real dependency)\n");
+ AntiDepReg = 0;
+ break;
+ } else if ((Pred.getSUnit() != NextSU) &&
+ (Pred.getKind() == SDep::Data) &&
+ (Pred.getReg() == AntiDepReg)) {
+ LLVM_DEBUG(dbgs() << " (other dependency)\n");
+ AntiDepReg = 0;
+ break;
+ }
+ }
+
+ if (AntiDepReg == 0) continue;
+
+ // If the definition of the anti-dependency register does not start
+ // a new live range, bail out. This can happen if the anti-dep
+ // register is a sub-register of another register whose live range
+          // spans over PathSU. In such a case, PathSU defines only a part of
+ // the larger register.
+ RegAliases.reset();
+ for (MCRegAliasIterator AI(AntiDepReg, TRI, true); AI.isValid(); ++AI)
+ RegAliases.set(*AI);
+ for (SDep S : PathSU->Succs) {
+ SDep::Kind K = S.getKind();
+ if (K != SDep::Data && K != SDep::Output && K != SDep::Anti)
+ continue;
+ unsigned R = S.getReg();
+ if (!RegAliases[R])
+ continue;
+ if (R == AntiDepReg || TRI->isSubRegister(AntiDepReg, R))
+ continue;
+ AntiDepReg = 0;
+ break;
+ }
+
+ if (AntiDepReg == 0) continue;
+ }
+
+ assert(AntiDepReg != 0);
+
+ // Determine AntiDepReg's register group.
+ const unsigned GroupIndex = State->GetGroup(AntiDepReg);
+ if (GroupIndex == 0) {
+ LLVM_DEBUG(dbgs() << " (zero group)\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << '\n');
+
+ // Look for a suitable register to use to break the anti-dependence.
+ std::map<unsigned, unsigned> RenameMap;
+ if (FindSuitableFreeRegisters(GroupIndex, RenameOrder, RenameMap)) {
+ LLVM_DEBUG(dbgs() << "\tBreaking anti-dependence edge on "
+ << printReg(AntiDepReg, TRI) << ":");
+
+ // Handle each group register...
+ for (const auto &P : RenameMap) {
+ unsigned CurrReg = P.first;
+ unsigned NewReg = P.second;
+
+ LLVM_DEBUG(dbgs() << " " << printReg(CurrReg, TRI) << "->"
+ << printReg(NewReg, TRI) << "("
+ << RegRefs.count(CurrReg) << " refs)");
+
+ // Update the references to the old register CurrReg to
+ // refer to the new register NewReg.
+ for (const auto &Q : make_range(RegRefs.equal_range(CurrReg))) {
+ Q.second.Operand->setReg(NewReg);
+ // If the SU for the instruction being updated has debug
+ // information related to the anti-dependency register, make
+ // sure to update that as well.
+ const SUnit *SU = MISUnitMap[Q.second.Operand->getParent()];
+ if (!SU) continue;
+ UpdateDbgValues(DbgValues, Q.second.Operand->getParent(),
+ AntiDepReg, NewReg);
+ }
+
+ // We just went back in time and modified history; the
+ // liveness information for CurrReg is now inconsistent. Set
+ // the state as if it were dead.
+ State->UnionGroups(NewReg, 0);
+ RegRefs.erase(NewReg);
+ DefIndices[NewReg] = DefIndices[CurrReg];
+ KillIndices[NewReg] = KillIndices[CurrReg];
+
+ State->UnionGroups(CurrReg, 0);
+ RegRefs.erase(CurrReg);
+ DefIndices[CurrReg] = KillIndices[CurrReg];
+ KillIndices[CurrReg] = ~0u;
+ assert(((KillIndices[CurrReg] == ~0u) !=
+ (DefIndices[CurrReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for AntiDepReg!");
+ }
+
+ ++Broken;
+ LLVM_DEBUG(dbgs() << '\n');
+ }
+ }
+ }
+
+ ScanInstruction(MI, Count);
+ }
+
+ return Broken;
+}
+
+AntiDepBreaker *llvm::createAggressiveAntiDepBreaker(
+ MachineFunction &MFi, const RegisterClassInfo &RCI,
+ TargetSubtargetInfo::RegClassVector &CriticalPathRCs) {
+ return new AggressiveAntiDepBreaker(MFi, RCI, CriticalPathRCs);
+}
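
The loop above breaks a write-after-read (anti) dependence by renaming the later definition, and every reference in its live range, to a free register. A minimal standalone sketch of just that renaming idea, in plain C++ with invented register names rather than the LLVM data structures:

// Standalone illustration (not LLVM code): a write-after-read (anti)
// dependence on r1 is removed by renaming the later definition.
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct ToyInst {
  std::string Def;               // register written ("" if none)
  std::vector<std::string> Uses; // registers read
};

static void dump(const std::vector<ToyInst> &Block) {
  for (const ToyInst &TI : Block) {
    std::cout << (TI.Def.empty() ? std::string("--") : TI.Def) << " = op(";
    for (const std::string &U : TI.Uses)
      std::cout << ' ' << U;
    std::cout << " )\n";
  }
}

int main() {
  // r2 = op(r1)   -- reads r1
  // r1 = op()     -- redefines r1: anti-dependence on the read above
  std::vector<ToyInst> Block = {{"r2", {"r1"}}, {"r1", {}}};
  std::set<std::string> FreeRegs = {"r3", "r4"};

  // Rename the redefinition to any free register; the two instructions no
  // longer share r1, so a scheduler may now reorder them.
  if (!FreeRegs.empty()) {
    Block[1].Def = *FreeRegs.begin();
    FreeRegs.erase(FreeRegs.begin());
  }
  dump(Block);
}

The real pass additionally consults the register groups and the def/kill indices tracked above so that all references in a live range are rewritten consistently and the liveness maps stay valid.
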
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h b/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
new file mode 100644
index 000000000000..cece217e645c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -0,0 +1,186 @@
+//==- llvm/CodeGen/AggressiveAntiDepBreaker.h - Anti-Dep Support -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AggressiveAntiDepBreaker class, which
+// implements register anti-dependence breaking during post-RA
+// scheduling. It attempts to break all anti-dependencies within a
+// block.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/AntiDepBreaker.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/Compiler.h"
+#include <map>
+#include <set>
+#include <vector>
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineFunction;
+class MachineInstr;
+class MachineOperand;
+class MachineRegisterInfo;
+class RegisterClassInfo;
+class TargetInstrInfo;
+class TargetRegisterClass;
+class TargetRegisterInfo;
+
+ /// Contains all the state necessary for anti-dep breaking.
+class LLVM_LIBRARY_VISIBILITY AggressiveAntiDepState {
+ public:
+ /// Information about a register reference within a liverange
+ struct RegisterReference {
+    /// The register's operand
+ MachineOperand *Operand;
+
+ /// The register class
+ const TargetRegisterClass *RC;
+ };
+
+ private:
+ /// Number of non-virtual target registers (i.e. TRI->getNumRegs()).
+ const unsigned NumTargetRegs;
+
+ /// Implements a disjoint-union data structure to
+ /// form register groups. A node is represented by an index into
+ /// the vector. A node can "point to" itself to indicate that it
+ /// is the parent of a group, or point to another node to indicate
+ /// that it is a member of the same group as that node.
+ std::vector<unsigned> GroupNodes;
+
+ /// For each register, the index of the GroupNode
+ /// currently representing the group that the register belongs to.
+ /// Register 0 is always represented by the 0 group, a group
+  /// composed of registers that are not eligible for renaming.
+ std::vector<unsigned> GroupNodeIndices;
+
+ /// Map registers to all their references within a live range.
+ std::multimap<unsigned, RegisterReference> RegRefs;
+
+ /// The index of the most recent kill (proceeding bottom-up),
+ /// or ~0u if the register is not live.
+ std::vector<unsigned> KillIndices;
+
+ /// The index of the most recent complete def (proceeding bottom
+ /// up), or ~0u if the register is live.
+ std::vector<unsigned> DefIndices;
+
+ public:
+ AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB);
+
+ /// Return the kill indices.
+ std::vector<unsigned> &GetKillIndices() { return KillIndices; }
+
+ /// Return the define indices.
+ std::vector<unsigned> &GetDefIndices() { return DefIndices; }
+
+ /// Return the RegRefs map.
+ std::multimap<unsigned, RegisterReference>& GetRegRefs() { return RegRefs; }
+
+ // Get the group for a register. The returned value is
+ // the index of the GroupNode representing the group.
+ unsigned GetGroup(unsigned Reg);
+
+ // Return a vector of the registers belonging to a group.
+  // If RegRefs is non-NULL then only include referenced registers.
+ void GetGroupRegs(
+ unsigned Group,
+ std::vector<unsigned> &Regs,
+ std::multimap<unsigned,
+ AggressiveAntiDepState::RegisterReference> *RegRefs);
+
+ // Union Reg1's and Reg2's groups to form a new group.
+ // Return the index of the GroupNode representing the group.
+ unsigned UnionGroups(unsigned Reg1, unsigned Reg2);
+
+ // Remove a register from its current group and place
+ // it alone in its own group. Return the index of the GroupNode
+  // representing the register's new group.
+ unsigned LeaveGroup(unsigned Reg);
+
+ /// Return true if Reg is live.
+ bool IsLive(unsigned Reg);
+ };
+
+ class LLVM_LIBRARY_VISIBILITY AggressiveAntiDepBreaker
+ : public AntiDepBreaker {
+ MachineFunction &MF;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const RegisterClassInfo &RegClassInfo;
+
+ /// The set of registers that should only be
+ /// renamed if they are on the critical path.
+ BitVector CriticalPathSet;
+
+ /// The state used to identify and rename anti-dependence registers.
+ AggressiveAntiDepState *State = nullptr;
+
+ public:
+ AggressiveAntiDepBreaker(MachineFunction &MFi,
+ const RegisterClassInfo &RCI,
+ TargetSubtargetInfo::RegClassVector& CriticalPathRCs);
+ AggressiveAntiDepBreaker &
+ operator=(const AggressiveAntiDepBreaker &other) = delete;
+ AggressiveAntiDepBreaker(const AggressiveAntiDepBreaker &other) = delete;
+ ~AggressiveAntiDepBreaker() override;
+
+ /// Initialize anti-dep breaking for a new basic block.
+ void StartBlock(MachineBasicBlock *BB) override;
+
+  /// Identify anti-dependencies along the critical path
+ /// of the ScheduleDAG and break them by renaming registers.
+ unsigned BreakAntiDependencies(const std::vector<SUnit> &SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) override;
+
+ /// Update liveness information to account for the current
+ /// instruction, which will not be scheduled.
+ void Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) override;
+
+ /// Finish anti-dep breaking for a basic block.
+ void FinishBlock() override;
+
+ private:
+ /// Keep track of a position in the allocation order for each regclass.
+ using RenameOrderType = std::map<const TargetRegisterClass *, unsigned>;
+
+ /// Return true if MO represents a register
+ /// that is both implicitly used and defined in MI
+ bool IsImplicitDefUse(MachineInstr &MI, MachineOperand &MO);
+
+ /// If MI implicitly def/uses a register, then
+ /// return that register and all subregisters.
+ void GetPassthruRegs(MachineInstr &MI, std::set<unsigned> &PassthruRegs);
+
+ void HandleLastUse(unsigned Reg, unsigned KillIdx, const char *tag,
+ const char *header = nullptr,
+ const char *footer = nullptr);
+
+ void PrescanInstruction(MachineInstr &MI, unsigned Count,
+ std::set<unsigned> &PassthruRegs);
+ void ScanInstruction(MachineInstr &MI, unsigned Count);
+ BitVector GetRenameRegisters(unsigned Reg);
+ bool FindSuitableFreeRegisters(unsigned AntiDepGroupIndex,
+ RenameOrderType& RenameOrder,
+ std::map<unsigned, unsigned> &RenameMap);
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
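
GroupNodes and GroupNodeIndices above describe a small disjoint-union (union-find) structure over registers. A self-contained sketch of that bookkeeping, with hypothetical names and without the special treatment the real class gives to group 0:

// Minimal sketch of the disjoint-union "register group" bookkeeping the
// header describes; illustrative only, not the LLVM class.
#include <cassert>
#include <vector>

struct ToyGroups {
  std::vector<unsigned> GroupNodes;       // parent links (index == value -> root)
  std::vector<unsigned> GroupNodeIndices; // register -> its group node

  explicit ToyGroups(unsigned NumRegs) {
    for (unsigned R = 0; R < NumRegs; ++R) {
      GroupNodes.push_back(R);       // every node starts as its own root
      GroupNodeIndices.push_back(R); // every register in its own group
    }
  }

  unsigned GetGroup(unsigned Reg) {
    unsigned Node = GroupNodeIndices[Reg];
    while (GroupNodes[Node] != Node)
      Node = GroupNodes[Node]; // chase parents to the root
    return Node;
  }

  unsigned UnionGroups(unsigned Reg1, unsigned Reg2) {
    unsigned G1 = GetGroup(Reg1), G2 = GetGroup(Reg2);
    if (G1 != G2)
      GroupNodes[G2] = G1; // attach one root under the other
    return G1;
  }

  unsigned LeaveGroup(unsigned Reg) {
    // Give Reg a brand-new singleton group node.
    unsigned NewNode = static_cast<unsigned>(GroupNodes.size());
    GroupNodes.push_back(NewNode);
    GroupNodeIndices[Reg] = NewNode;
    return NewNode;
  }
};

int main() {
  ToyGroups G(8);
  G.UnionGroups(3, 5);
  assert(G.GetGroup(3) == G.GetGroup(5));
  G.LeaveGroup(5);
  assert(G.GetGroup(3) != G.GetGroup(5));
}
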
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AllocationOrder.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AllocationOrder.cpp
new file mode 100644
index 000000000000..2aef1234ac0e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -0,0 +1,53 @@
+//===-- llvm/CodeGen/AllocationOrder.cpp - Allocation Order ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an allocation order for virtual registers.
+//
+// The preferred allocation order for a virtual register depends on allocation
+// hints and target hooks. The AllocationOrder class encapsulates all of that.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AllocationOrder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+// Compare VirtRegMap::getRegAllocPref().
+AllocationOrder AllocationOrder::create(unsigned VirtReg, const VirtRegMap &VRM,
+ const RegisterClassInfo &RegClassInfo,
+ const LiveRegMatrix *Matrix) {
+ const MachineFunction &MF = VRM.getMachineFunction();
+ const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo();
+ auto Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg));
+ SmallVector<MCPhysReg, 16> Hints;
+ bool HardHints =
+ TRI->getRegAllocationHints(VirtReg, Order, Hints, MF, &VRM, Matrix);
+
+ LLVM_DEBUG({
+ if (!Hints.empty()) {
+ dbgs() << "hints:";
+ for (unsigned I = 0, E = Hints.size(); I != E; ++I)
+ dbgs() << ' ' << printReg(Hints[I], TRI);
+ dbgs() << '\n';
+ }
+ });
+#ifndef NDEBUG
+ for (unsigned I = 0, E = Hints.size(); I != E; ++I)
+ assert(is_contained(Order, Hints[I]) &&
+ "Target hint is outside allocation order.");
+#endif
+ return AllocationOrder(std::move(Hints), Order, HardHints);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AllocationOrder.h b/contrib/llvm-project/llvm/lib/CodeGen/AllocationOrder.h
new file mode 100644
index 000000000000..0701e6810100
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AllocationOrder.h
@@ -0,0 +1,124 @@
+//===-- llvm/CodeGen/AllocationOrder.h - Allocation Order -*- C++ -*-------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an allocation order for virtual registers.
+//
+// The preferred allocation order for a virtual register depends on allocation
+// hints and target hooks. The AllocationOrder class encapsulates all of that.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
+#define LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Register.h"
+
+namespace llvm {
+
+class RegisterClassInfo;
+class VirtRegMap;
+class LiveRegMatrix;
+
+class LLVM_LIBRARY_VISIBILITY AllocationOrder {
+ const SmallVector<MCPhysReg, 16> Hints;
+ ArrayRef<MCPhysReg> Order;
+ // How far into the Order we can iterate. This is 0 if the AllocationOrder is
+ // constructed with HardHints = true, Order.size() otherwise. While
+ // technically a size_t, it will participate in comparisons with the
+ // Iterator's Pos, which must be signed, so it's typed here as signed, too, to
+ // avoid warnings and under the assumption that the size of Order is
+ // relatively small.
+ // IterationLimit defines an invalid iterator position.
+ const int IterationLimit;
+
+public:
+ /// Forward iterator for an AllocationOrder.
+ class Iterator final {
+ const AllocationOrder &AO;
+ int Pos = 0;
+
+ public:
+ Iterator(const AllocationOrder &AO, int Pos) : AO(AO), Pos(Pos) {}
+
+    /// Return true if the current position is that of a preferred register.
+ bool isHint() const { return Pos < 0; }
+
+ /// Return the next physical register in the allocation order.
+ MCRegister operator*() const {
+ if (Pos < 0)
+ return AO.Hints.end()[Pos];
+ assert(Pos < AO.IterationLimit);
+ return AO.Order[Pos];
+ }
+
+ /// Advance the iterator to the next position. If that's past the Hints
+ /// list, advance to the first value that's not also in the Hints list.
+ Iterator &operator++() {
+ if (Pos < AO.IterationLimit)
+ ++Pos;
+ while (Pos >= 0 && Pos < AO.IterationLimit && AO.isHint(AO.Order[Pos]))
+ ++Pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &Other) const {
+ assert(&AO == &Other.AO);
+ return Pos == Other.Pos;
+ }
+
+ bool operator!=(const Iterator &Other) const { return !(*this == Other); }
+ };
+
+ /// Create a new AllocationOrder for VirtReg.
+ /// @param VirtReg Virtual register to allocate for.
+ /// @param VRM Virtual register map for function.
+ /// @param RegClassInfo Information about reserved and allocatable registers.
+ static AllocationOrder create(unsigned VirtReg, const VirtRegMap &VRM,
+ const RegisterClassInfo &RegClassInfo,
+ const LiveRegMatrix *Matrix);
+
+  /// Create an AllocationOrder given the Hints, Order, and HardHints values.
+ /// Use the create method above - the ctor is for unittests.
+ AllocationOrder(SmallVector<MCPhysReg, 16> &&Hints, ArrayRef<MCPhysReg> Order,
+ bool HardHints)
+ : Hints(std::move(Hints)), Order(Order),
+ IterationLimit(HardHints ? 0 : static_cast<int>(Order.size())) {}
+
+ Iterator begin() const {
+ return Iterator(*this, -(static_cast<int>(Hints.size())));
+ }
+
+ Iterator end() const { return Iterator(*this, IterationLimit); }
+
+ Iterator getOrderLimitEnd(unsigned OrderLimit) const {
+ assert(OrderLimit <= Order.size());
+ if (OrderLimit == 0)
+ return end();
+ Iterator Ret(*this,
+ std::min(static_cast<int>(OrderLimit) - 1, IterationLimit));
+ return ++Ret;
+ }
+
+ /// Get the allocation order without reordered hints.
+ ArrayRef<MCPhysReg> getOrder() const { return Order; }
+
+ /// Return true if Reg is a preferred physical register.
+ bool isHint(Register Reg) const {
+ assert(!Reg.isPhysical() ||
+ Reg.id() <
+ static_cast<uint32_t>(std::numeric_limits<MCPhysReg>::max()));
+ return Reg.isPhysical() && is_contained(Hints, Reg.id());
+ }
+};
+
+} // end namespace llvm
+
+#endif
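
The Iterator above visits hint registers first (negative positions index Hints from its end) and then the remaining allocation order, skipping registers already tried as hints; with HardHints only the hints are visited. A plain-C++ model of that visiting order, using made-up register numbers:

// Rough model of the visiting order provided by AllocationOrder; a plain
// stand-in, not the LLVM type.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  using Reg = uint16_t;
  std::vector<Reg> Hints = {7};           // preferred registers, tried first
  std::vector<Reg> Order = {3, 7, 9, 12}; // register-class allocation order
  bool HardHints = false;                 // if true, only the hints are tried

  std::vector<Reg> Visit(Hints);
  if (!HardHints)
    for (Reg R : Order)
      if (std::find(Hints.begin(), Hints.end(), R) == Hints.end())
        Visit.push_back(R); // order entries that are not also hints
  for (Reg R : Visit)
    std::cout << R << ' '; // prints: 7 3 9 12
  std::cout << '\n';
}
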
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm-project/llvm/lib/CodeGen/Analysis.cpp
new file mode 100644
index 000000000000..2065bfbd1c44
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/Analysis.cpp
@@ -0,0 +1,870 @@
+//===-- Analysis.cpp - CodeGen LLVM IR Analysis Utilities -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines several CodeGen-specific LLVM IR analysis utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+/// Compute the linearized index of a member in a nested aggregate/struct/array
+/// by recursing and accumulating CurIndex as long as there are indices in the
+/// index list.
+unsigned llvm::ComputeLinearIndex(Type *Ty,
+ const unsigned *Indices,
+ const unsigned *IndicesEnd,
+ unsigned CurIndex) {
+ // Base case: We're done.
+ if (Indices && Indices == IndicesEnd)
+ return CurIndex;
+
+ // Given a struct type, recursively traverse the elements.
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (auto I : llvm::enumerate(STy->elements())) {
+ Type *ET = I.value();
+ if (Indices && *Indices == I.index())
+ return ComputeLinearIndex(ET, Indices + 1, IndicesEnd, CurIndex);
+ CurIndex = ComputeLinearIndex(ET, nullptr, nullptr, CurIndex);
+ }
+ assert(!Indices && "Unexpected out of bound");
+ return CurIndex;
+ }
+ // Given an array type, recursively traverse the elements.
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ Type *EltTy = ATy->getElementType();
+ unsigned NumElts = ATy->getNumElements();
+ // Compute the Linear offset when jumping one element of the array
+ unsigned EltLinearOffset = ComputeLinearIndex(EltTy, nullptr, nullptr, 0);
+ if (Indices) {
+ assert(*Indices < NumElts && "Unexpected out of bound");
+      // If the index is inside the array, compute the linear index of the
+      // requested element and recurse into it with the rest of the indices.
+ CurIndex += EltLinearOffset* *Indices;
+ return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
+ }
+ CurIndex += EltLinearOffset*NumElts;
+ return CurIndex;
+ }
+ // We haven't found the type we're looking for, so keep searching.
+ return CurIndex + 1;
+}
+
+/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
+/// EVTs that represent all the individual underlying
+/// non-aggregate types that comprise it.
+///
+/// If Offsets is non-null, it points to a vector to be filled in
+/// with the in-memory offsets of each of the individual values.
+///
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<EVT> *MemVTs,
+ SmallVectorImpl<TypeSize> *Offsets,
+ TypeSize StartingOffset) {
+ // Given a struct type, recursively traverse the elements.
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ // If the Offsets aren't needed, don't query the struct layout. This allows
+ // us to support structs with scalable vectors for operations that don't
+ // need offsets.
+ const StructLayout *SL = Offsets ? DL.getStructLayout(STy) : nullptr;
+ for (StructType::element_iterator EB = STy->element_begin(),
+ EI = EB,
+ EE = STy->element_end();
+ EI != EE; ++EI) {
+ // Don't compute the element offset if we didn't get a StructLayout above.
+ TypeSize EltOffset = SL ? SL->getElementOffset(EI - EB)
+ : TypeSize::get(0, StartingOffset.isScalable());
+ ComputeValueVTs(TLI, DL, *EI, ValueVTs, MemVTs, Offsets,
+ StartingOffset + EltOffset);
+ }
+ return;
+ }
+ // Given an array type, recursively traverse the elements.
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ Type *EltTy = ATy->getElementType();
+ TypeSize EltSize = DL.getTypeAllocSize(EltTy);
+ for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
+ ComputeValueVTs(TLI, DL, EltTy, ValueVTs, MemVTs, Offsets,
+ StartingOffset + i * EltSize);
+ return;
+ }
+ // Interpret void as zero return values.
+ if (Ty->isVoidTy())
+ return;
+ // Base case: we can get an EVT for this LLVM IR type.
+ ValueVTs.push_back(TLI.getValueType(DL, Ty));
+ if (MemVTs)
+ MemVTs->push_back(TLI.getMemValueType(DL, Ty));
+ if (Offsets)
+ Offsets->push_back(StartingOffset);
+}
+
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<TypeSize> *Offsets,
+ TypeSize StartingOffset) {
+ return ComputeValueVTs(TLI, DL, Ty, ValueVTs, /*MemVTs=*/nullptr, Offsets,
+ StartingOffset);
+}
+
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<TypeSize> *Offsets,
+ uint64_t StartingOffset) {
+ TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy());
+ return ComputeValueVTs(TLI, DL, Ty, ValueVTs, Offsets, Offset);
+}
+
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<uint64_t> *FixedOffsets,
+ uint64_t StartingOffset) {
+ TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy());
+ SmallVector<TypeSize, 4> Offsets;
+ if (FixedOffsets)
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, &Offsets, Offset);
+ else
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, nullptr, Offset);
+
+ if (FixedOffsets)
+ for (TypeSize Offset : Offsets)
+ FixedOffsets->push_back(Offset.getKnownMinValue());
+}
+
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<EVT> *MemVTs,
+ SmallVectorImpl<TypeSize> *Offsets,
+ uint64_t StartingOffset) {
+ TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy());
+ return ComputeValueVTs(TLI, DL, Ty, ValueVTs, MemVTs, Offsets, Offset);
+}
+
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<EVT> *MemVTs,
+ SmallVectorImpl<uint64_t> *FixedOffsets,
+ uint64_t StartingOffset) {
+ TypeSize Offset = TypeSize::get(StartingOffset, Ty->isScalableTy());
+ SmallVector<TypeSize, 4> Offsets;
+ if (FixedOffsets)
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, MemVTs, &Offsets, Offset);
+ else
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs, MemVTs, nullptr, Offset);
+
+ if (FixedOffsets)
+ for (TypeSize Offset : Offsets)
+ FixedOffsets->push_back(Offset.getKnownMinValue());
+}
+
+void llvm::computeValueLLTs(const DataLayout &DL, Type &Ty,
+ SmallVectorImpl<LLT> &ValueTys,
+ SmallVectorImpl<uint64_t> *Offsets,
+ uint64_t StartingOffset) {
+ // Given a struct type, recursively traverse the elements.
+ if (StructType *STy = dyn_cast<StructType>(&Ty)) {
+ // If the Offsets aren't needed, don't query the struct layout. This allows
+ // us to support structs with scalable vectors for operations that don't
+ // need offsets.
+ const StructLayout *SL = Offsets ? DL.getStructLayout(STy) : nullptr;
+ for (unsigned I = 0, E = STy->getNumElements(); I != E; ++I) {
+ uint64_t EltOffset = SL ? SL->getElementOffset(I) : 0;
+ computeValueLLTs(DL, *STy->getElementType(I), ValueTys, Offsets,
+ StartingOffset + EltOffset);
+ }
+ return;
+ }
+ // Given an array type, recursively traverse the elements.
+ if (ArrayType *ATy = dyn_cast<ArrayType>(&Ty)) {
+ Type *EltTy = ATy->getElementType();
+ uint64_t EltSize = DL.getTypeAllocSize(EltTy).getFixedValue();
+ for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
+ computeValueLLTs(DL, *EltTy, ValueTys, Offsets,
+ StartingOffset + i * EltSize);
+ return;
+ }
+ // Interpret void as zero return values.
+ if (Ty.isVoidTy())
+ return;
+ // Base case: we can get an LLT for this LLVM IR type.
+ ValueTys.push_back(getLLTForType(Ty, DL));
+ if (Offsets != nullptr)
+ Offsets->push_back(StartingOffset * 8);
+}
+
+/// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V.
+GlobalValue *llvm::ExtractTypeInfo(Value *V) {
+ V = V->stripPointerCasts();
+ GlobalValue *GV = dyn_cast<GlobalValue>(V);
+ GlobalVariable *Var = dyn_cast<GlobalVariable>(V);
+
+ if (Var && Var->getName() == "llvm.eh.catch.all.value") {
+ assert(Var->hasInitializer() &&
+ "The EH catch-all value must have an initializer");
+ Value *Init = Var->getInitializer();
+ GV = dyn_cast<GlobalValue>(Init);
+ if (!GV) V = cast<ConstantPointerNull>(Init);
+ }
+
+ assert((GV || isa<ConstantPointerNull>(V)) &&
+ "TypeInfo must be a global variable or NULL");
+ return GV;
+}
+
+/// getFCmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR floating-point condition code. This includes
+/// consideration of global floating-point math flags.
+///
+ISD::CondCode llvm::getFCmpCondCode(FCmpInst::Predicate Pred) {
+ switch (Pred) {
+ case FCmpInst::FCMP_FALSE: return ISD::SETFALSE;
+ case FCmpInst::FCMP_OEQ: return ISD::SETOEQ;
+ case FCmpInst::FCMP_OGT: return ISD::SETOGT;
+ case FCmpInst::FCMP_OGE: return ISD::SETOGE;
+ case FCmpInst::FCMP_OLT: return ISD::SETOLT;
+ case FCmpInst::FCMP_OLE: return ISD::SETOLE;
+ case FCmpInst::FCMP_ONE: return ISD::SETONE;
+ case FCmpInst::FCMP_ORD: return ISD::SETO;
+ case FCmpInst::FCMP_UNO: return ISD::SETUO;
+ case FCmpInst::FCMP_UEQ: return ISD::SETUEQ;
+ case FCmpInst::FCMP_UGT: return ISD::SETUGT;
+ case FCmpInst::FCMP_UGE: return ISD::SETUGE;
+ case FCmpInst::FCMP_ULT: return ISD::SETULT;
+ case FCmpInst::FCMP_ULE: return ISD::SETULE;
+ case FCmpInst::FCMP_UNE: return ISD::SETUNE;
+ case FCmpInst::FCMP_TRUE: return ISD::SETTRUE;
+ default: llvm_unreachable("Invalid FCmp predicate opcode!");
+ }
+}
+
+ISD::CondCode llvm::getFCmpCodeWithoutNaN(ISD::CondCode CC) {
+ switch (CC) {
+ case ISD::SETOEQ: case ISD::SETUEQ: return ISD::SETEQ;
+ case ISD::SETONE: case ISD::SETUNE: return ISD::SETNE;
+ case ISD::SETOLT: case ISD::SETULT: return ISD::SETLT;
+ case ISD::SETOLE: case ISD::SETULE: return ISD::SETLE;
+ case ISD::SETOGT: case ISD::SETUGT: return ISD::SETGT;
+ case ISD::SETOGE: case ISD::SETUGE: return ISD::SETGE;
+ default: return CC;
+ }
+}
+
+ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) {
+ switch (Pred) {
+ case ICmpInst::ICMP_EQ: return ISD::SETEQ;
+ case ICmpInst::ICMP_NE: return ISD::SETNE;
+ case ICmpInst::ICMP_SLE: return ISD::SETLE;
+ case ICmpInst::ICMP_ULE: return ISD::SETULE;
+ case ICmpInst::ICMP_SGE: return ISD::SETGE;
+ case ICmpInst::ICMP_UGE: return ISD::SETUGE;
+ case ICmpInst::ICMP_SLT: return ISD::SETLT;
+ case ICmpInst::ICMP_ULT: return ISD::SETULT;
+ case ICmpInst::ICMP_SGT: return ISD::SETGT;
+ case ICmpInst::ICMP_UGT: return ISD::SETUGT;
+ default:
+ llvm_unreachable("Invalid ICmp predicate opcode!");
+ }
+}
+
+ICmpInst::Predicate llvm::getICmpCondCode(ISD::CondCode Pred) {
+ switch (Pred) {
+ case ISD::SETEQ:
+ return ICmpInst::ICMP_EQ;
+ case ISD::SETNE:
+ return ICmpInst::ICMP_NE;
+ case ISD::SETLE:
+ return ICmpInst::ICMP_SLE;
+ case ISD::SETULE:
+ return ICmpInst::ICMP_ULE;
+ case ISD::SETGE:
+ return ICmpInst::ICMP_SGE;
+ case ISD::SETUGE:
+ return ICmpInst::ICMP_UGE;
+ case ISD::SETLT:
+ return ICmpInst::ICMP_SLT;
+ case ISD::SETULT:
+ return ICmpInst::ICMP_ULT;
+ case ISD::SETGT:
+ return ICmpInst::ICMP_SGT;
+ case ISD::SETUGT:
+ return ICmpInst::ICMP_UGT;
+ default:
+ llvm_unreachable("Invalid ISD integer condition code!");
+ }
+}
+
+static bool isNoopBitcast(Type *T1, Type *T2,
+ const TargetLoweringBase& TLI) {
+ return T1 == T2 || (T1->isPointerTy() && T2->isPointerTy()) ||
+ (isa<VectorType>(T1) && isa<VectorType>(T2) &&
+ TLI.isTypeLegal(EVT::getEVT(T1)) && TLI.isTypeLegal(EVT::getEVT(T2)));
+}
+
+/// Look through operations that will be free to find the earliest source of
+/// this value.
+///
+/// @param ValLoc If V has aggregate type, we will be interested in a particular
+/// scalar component. This records its address; the reverse of this list gives a
+/// sequence of indices appropriate for an extractvalue to locate the important
+/// value. This value is updated during the function and on exit will indicate
+/// similar information for the Value returned.
+///
+/// @param DataBits If this function looks through truncate instructions, this
+/// will record the smallest size attained.
+static const Value *getNoopInput(const Value *V,
+ SmallVectorImpl<unsigned> &ValLoc,
+ unsigned &DataBits,
+ const TargetLoweringBase &TLI,
+ const DataLayout &DL) {
+ while (true) {
+    // Try to look through V; if V is not an instruction, it can't be looked
+ // through.
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || I->getNumOperands() == 0) return V;
+ const Value *NoopInput = nullptr;
+
+ Value *Op = I->getOperand(0);
+ if (isa<BitCastInst>(I)) {
+ // Look through truly no-op bitcasts.
+ if (isNoopBitcast(Op->getType(), I->getType(), TLI))
+ NoopInput = Op;
+ } else if (isa<GetElementPtrInst>(I)) {
+ // Look through getelementptr
+ if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
+ NoopInput = Op;
+ } else if (isa<IntToPtrInst>(I)) {
+ // Look through inttoptr.
+ // Make sure this isn't a truncating or extending cast. We could
+ // support this eventually, but don't bother for now.
+ if (!isa<VectorType>(I->getType()) &&
+ DL.getPointerSizeInBits() ==
+ cast<IntegerType>(Op->getType())->getBitWidth())
+ NoopInput = Op;
+ } else if (isa<PtrToIntInst>(I)) {
+ // Look through ptrtoint.
+ // Make sure this isn't a truncating or extending cast. We could
+ // support this eventually, but don't bother for now.
+ if (!isa<VectorType>(I->getType()) &&
+ DL.getPointerSizeInBits() ==
+ cast<IntegerType>(I->getType())->getBitWidth())
+ NoopInput = Op;
+ } else if (isa<TruncInst>(I) &&
+ TLI.allowTruncateForTailCall(Op->getType(), I->getType())) {
+ DataBits =
+ std::min((uint64_t)DataBits,
+ I->getType()->getPrimitiveSizeInBits().getFixedValue());
+ NoopInput = Op;
+ } else if (auto *CB = dyn_cast<CallBase>(I)) {
+ const Value *ReturnedOp = CB->getReturnedArgOperand();
+ if (ReturnedOp && isNoopBitcast(ReturnedOp->getType(), I->getType(), TLI))
+ NoopInput = ReturnedOp;
+ } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(V)) {
+ // Value may come from either the aggregate or the scalar
+ ArrayRef<unsigned> InsertLoc = IVI->getIndices();
+ if (ValLoc.size() >= InsertLoc.size() &&
+ std::equal(InsertLoc.begin(), InsertLoc.end(), ValLoc.rbegin())) {
+ // The type being inserted is a nested sub-type of the aggregate; we
+ // have to remove those initial indices to get the location we're
+ // interested in for the operand.
+ ValLoc.resize(ValLoc.size() - InsertLoc.size());
+ NoopInput = IVI->getInsertedValueOperand();
+ } else {
+ // The struct we're inserting into has the value we're interested in, no
+ // change of address.
+ NoopInput = Op;
+ }
+ } else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+ // The part we're interested in will inevitably be some sub-section of the
+ // previous aggregate. Combine the two paths to obtain the true address of
+ // our element.
+ ArrayRef<unsigned> ExtractLoc = EVI->getIndices();
+ ValLoc.append(ExtractLoc.rbegin(), ExtractLoc.rend());
+ NoopInput = Op;
+ }
+ // Terminate if we couldn't find anything to look through.
+ if (!NoopInput)
+ return V;
+
+ V = NoopInput;
+ }
+}
+
+/// Return true if this scalar return value only has bits discarded on its path
+/// from the "tail call" to the "ret". This includes the obvious noop
+/// instructions handled by getNoopInput above as well as free truncations (or
+/// extensions prior to the call).
+static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
+ SmallVectorImpl<unsigned> &RetIndices,
+ SmallVectorImpl<unsigned> &CallIndices,
+ bool AllowDifferingSizes,
+ const TargetLoweringBase &TLI,
+ const DataLayout &DL) {
+
+ // Trace the sub-value needed by the return value as far back up the graph as
+ // possible, in the hope that it will intersect with the value produced by the
+ // call. In the simple case with no "returned" attribute, the hope is actually
+ // that we end up back at the tail call instruction itself.
+ unsigned BitsRequired = UINT_MAX;
+ RetVal = getNoopInput(RetVal, RetIndices, BitsRequired, TLI, DL);
+
+ // If this slot in the value returned is undef, it doesn't matter what the
+  // call puts there; it'll be fine.
+ if (isa<UndefValue>(RetVal))
+ return true;
+
+ // Now do a similar search up through the graph to find where the value
+ // actually returned by the "tail call" comes from. In the simple case without
+ // a "returned" attribute, the search will be blocked immediately and the loop
+ // a Noop.
+ unsigned BitsProvided = UINT_MAX;
+ CallVal = getNoopInput(CallVal, CallIndices, BitsProvided, TLI, DL);
+
+ // There's no hope if we can't actually trace them to (the same part of!) the
+ // same value.
+ if (CallVal != RetVal || CallIndices != RetIndices)
+ return false;
+
+ // However, intervening truncates may have made the call non-tail. Make sure
+ // all the bits that are needed by the "ret" have been provided by the "tail
+ // call". FIXME: with sufficiently cunning bit-tracking, we could look through
+ // extensions too.
+ if (BitsProvided < BitsRequired ||
+ (!AllowDifferingSizes && BitsProvided != BitsRequired))
+ return false;
+
+ return true;
+}
+
+/// For an aggregate type, determine whether a given index is within bounds or
+/// not.
+static bool indexReallyValid(Type *T, unsigned Idx) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
+ return Idx < AT->getNumElements();
+
+ return Idx < cast<StructType>(T)->getNumElements();
+}
+
+/// Move the given iterators to the next leaf type in depth first traversal.
+///
+/// Performs a depth-first traversal of the type as specified by its arguments,
+/// stopping at the next leaf node (which may be a legitimate scalar type or an
+/// empty struct or array).
+///
+/// @param SubTypes List of the partial components making up the type from
+/// outermost to innermost non-empty aggregate. The element currently
+/// represented is SubTypes.back()->getTypeAtIndex(Path.back() - 1).
+///
+/// @param Path Set of extractvalue indices leading from the outermost type
+/// (SubTypes[0]) to the leaf node currently represented.
+///
+/// @returns true if a new type was found, false otherwise. Calling this
+/// function again on a finished iterator will repeatedly return
+/// false. SubTypes.back()->getTypeAtIndex(Path.back()) is either an empty
+/// aggregate or a non-aggregate
+static bool advanceToNextLeafType(SmallVectorImpl<Type *> &SubTypes,
+ SmallVectorImpl<unsigned> &Path) {
+ // First march back up the tree until we can successfully increment one of the
+ // coordinates in Path.
+ while (!Path.empty() && !indexReallyValid(SubTypes.back(), Path.back() + 1)) {
+ Path.pop_back();
+ SubTypes.pop_back();
+ }
+
+ // If we reached the top, then the iterator is done.
+ if (Path.empty())
+ return false;
+
+ // We know there's *some* valid leaf now, so march back down the tree picking
+ // out the left-most element at each node.
+ ++Path.back();
+ Type *DeeperType =
+ ExtractValueInst::getIndexedType(SubTypes.back(), Path.back());
+ while (DeeperType->isAggregateType()) {
+ if (!indexReallyValid(DeeperType, 0))
+ return true;
+
+ SubTypes.push_back(DeeperType);
+ Path.push_back(0);
+
+ DeeperType = ExtractValueInst::getIndexedType(DeeperType, 0);
+ }
+
+ return true;
+}
+
+/// Find the first non-empty, scalar-like type in Next and setup the iterator
+/// components.
+///
+/// Assuming Next is an aggregate of some kind, this function will traverse the
+/// tree from left to right (i.e. depth-first) looking for the first
+/// non-aggregate type which will play a role in function return.
+///
+/// For example, if Next was {[0 x i64], {{}, i32, {}}, i32} then we would setup
+/// Path as [1, 1] and SubTypes as [Next, {{}, i32, {}}] to represent the first
+/// i32 in that type.
+static bool firstRealType(Type *Next, SmallVectorImpl<Type *> &SubTypes,
+ SmallVectorImpl<unsigned> &Path) {
+ // First initialise the iterator components to the first "leaf" node
+ // (i.e. node with no valid sub-type at any index, so {} does count as a leaf
+ // despite nominally being an aggregate).
+ while (Type *FirstInner = ExtractValueInst::getIndexedType(Next, 0)) {
+ SubTypes.push_back(Next);
+ Path.push_back(0);
+ Next = FirstInner;
+ }
+
+ // If there's no Path now, Next was originally scalar already (or empty
+ // leaf). We're done.
+ if (Path.empty())
+ return true;
+
+ // Otherwise, use normal iteration to keep looking through the tree until we
+ // find a non-aggregate type.
+ while (ExtractValueInst::getIndexedType(SubTypes.back(), Path.back())
+ ->isAggregateType()) {
+ if (!advanceToNextLeafType(SubTypes, Path))
+ return false;
+ }
+
+ return true;
+}
+
+/// Set the iterator data-structures to the next non-empty, non-aggregate
+/// subtype.
+static bool nextRealType(SmallVectorImpl<Type *> &SubTypes,
+ SmallVectorImpl<unsigned> &Path) {
+ do {
+ if (!advanceToNextLeafType(SubTypes, Path))
+ return false;
+
+ assert(!Path.empty() && "found a leaf but didn't set the path?");
+ } while (ExtractValueInst::getIndexedType(SubTypes.back(), Path.back())
+ ->isAggregateType());
+
+ return true;
+}
+
+
+/// Test if the given instruction is in a position to be optimized
+/// with a tail-call. This roughly means that it's in a block with
+/// a return and there's nothing that needs to be scheduled
+/// between it and the return.
+///
+/// This function only tests target-independent requirements.
+bool llvm::isInTailCallPosition(const CallBase &Call, const TargetMachine &TM) {
+ const BasicBlock *ExitBB = Call.getParent();
+ const Instruction *Term = ExitBB->getTerminator();
+ const ReturnInst *Ret = dyn_cast<ReturnInst>(Term);
+
+ // The block must end in a return statement or unreachable.
+ //
+ // FIXME: Decline tailcall if it's not guaranteed and if the block ends in
+ // an unreachable, for now. The way tailcall optimization is currently
+ // implemented means it will add an epilogue followed by a jump. That is
+ // not profitable. Also, if the callee is a special function (e.g.
+ // longjmp on x86), it can end up causing miscompilation that has not
+ // been fully understood.
+ if (!Ret && ((!TM.Options.GuaranteedTailCallOpt &&
+ Call.getCallingConv() != CallingConv::Tail &&
+ Call.getCallingConv() != CallingConv::SwiftTail) ||
+ !isa<UnreachableInst>(Term)))
+ return false;
+
+ // If I will have a chain, make sure no other instruction that will have a
+ // chain interposes between I and the return.
+ // Check for all calls including speculatable functions.
+ for (BasicBlock::const_iterator BBI = std::prev(ExitBB->end(), 2);; --BBI) {
+ if (&*BBI == &Call)
+ break;
+ // Debug info intrinsics do not get in the way of tail call optimization.
+ // Pseudo probe intrinsics do not block tail call optimization either.
+ if (BBI->isDebugOrPseudoInst())
+ continue;
+ // A lifetime end, assume or noalias.decl intrinsic should not stop tail
+ // call optimization.
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI))
+ if (II->getIntrinsicID() == Intrinsic::lifetime_end ||
+ II->getIntrinsicID() == Intrinsic::assume ||
+ II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl)
+ continue;
+ if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
+ !isSafeToSpeculativelyExecute(&*BBI))
+ return false;
+ }
+
+ const Function *F = ExitBB->getParent();
+ return returnTypeIsEligibleForTailCall(
+ F, &Call, Ret, *TM.getSubtargetImpl(*F)->getTargetLowering());
+}
+
+bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,
+ const ReturnInst *Ret,
+ const TargetLoweringBase &TLI,
+ bool *AllowDifferingSizes) {
+ // ADS may be null, so don't write to it directly.
+ bool DummyADS;
+ bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS;
+ ADS = true;
+
+ AttrBuilder CallerAttrs(F->getContext(), F->getAttributes().getRetAttrs());
+ AttrBuilder CalleeAttrs(F->getContext(),
+ cast<CallInst>(I)->getAttributes().getRetAttrs());
+
+ // Following attributes are completely benign as far as calling convention
+ // goes, they shouldn't affect whether the call is a tail call.
+ for (const auto &Attr : {Attribute::Alignment, Attribute::Dereferenceable,
+ Attribute::DereferenceableOrNull, Attribute::NoAlias,
+ Attribute::NonNull, Attribute::NoUndef}) {
+ CallerAttrs.removeAttribute(Attr);
+ CalleeAttrs.removeAttribute(Attr);
+ }
+
+ if (CallerAttrs.contains(Attribute::ZExt)) {
+ if (!CalleeAttrs.contains(Attribute::ZExt))
+ return false;
+
+ ADS = false;
+ CallerAttrs.removeAttribute(Attribute::ZExt);
+ CalleeAttrs.removeAttribute(Attribute::ZExt);
+ } else if (CallerAttrs.contains(Attribute::SExt)) {
+ if (!CalleeAttrs.contains(Attribute::SExt))
+ return false;
+
+ ADS = false;
+ CallerAttrs.removeAttribute(Attribute::SExt);
+ CalleeAttrs.removeAttribute(Attribute::SExt);
+ }
+
+ // Drop sext and zext return attributes if the result is not used.
+ // This enables tail calls for code like:
+ //
+ // define void @caller() {
+ // entry:
+ // %unused_result = tail call zeroext i1 @callee()
+ // br label %retlabel
+ // retlabel:
+ // ret void
+ // }
+ if (I->use_empty()) {
+ CalleeAttrs.removeAttribute(Attribute::SExt);
+ CalleeAttrs.removeAttribute(Attribute::ZExt);
+ }
+
+ // If they're still different, there's some facet we don't understand
+ // (currently only "inreg", but in future who knows). It may be OK but the
+ // only safe option is to reject the tail call.
+ return CallerAttrs == CalleeAttrs;
+}
+
+/// Check whether B is a bitcast of a pointer type to another pointer type,
+/// which is equal to A.
+static bool isPointerBitcastEqualTo(const Value *A, const Value *B) {
+ assert(A && B && "Expected non-null inputs!");
+
+ auto *BitCastIn = dyn_cast<BitCastInst>(B);
+
+ if (!BitCastIn)
+ return false;
+
+ if (!A->getType()->isPointerTy() || !B->getType()->isPointerTy())
+ return false;
+
+ return A == BitCastIn->getOperand(0);
+}
+
+bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
+ const Instruction *I,
+ const ReturnInst *Ret,
+ const TargetLoweringBase &TLI) {
+ // If the block ends with a void return or unreachable, it doesn't matter
+ // what the call's return type is.
+ if (!Ret || Ret->getNumOperands() == 0) return true;
+
+ // If the return value is undef, it doesn't matter what the call's
+ // return type is.
+ if (isa<UndefValue>(Ret->getOperand(0))) return true;
+
+ // Make sure the attributes attached to each return are compatible.
+ bool AllowDifferingSizes;
+ if (!attributesPermitTailCall(F, I, Ret, TLI, &AllowDifferingSizes))
+ return false;
+
+ const Value *RetVal = Ret->getOperand(0), *CallVal = I;
+  // An intrinsic like llvm.memcpy has no return value, but the expanded
+  // libcall may or may not have one. On most platforms, it will be expanded
+  // as memcpy in libc, which returns the first argument. On other platforms,
+  // like arm-none-eabi, memcpy may be expanded as a library call without a
+  // return value, like __aeabi_memcpy.
+ const CallInst *Call = cast<CallInst>(I);
+ if (Function *F = Call->getCalledFunction()) {
+ Intrinsic::ID IID = F->getIntrinsicID();
+ if (((IID == Intrinsic::memcpy &&
+ TLI.getLibcallName(RTLIB::MEMCPY) == StringRef("memcpy")) ||
+ (IID == Intrinsic::memmove &&
+ TLI.getLibcallName(RTLIB::MEMMOVE) == StringRef("memmove")) ||
+ (IID == Intrinsic::memset &&
+ TLI.getLibcallName(RTLIB::MEMSET) == StringRef("memset"))) &&
+ (RetVal == Call->getArgOperand(0) ||
+ isPointerBitcastEqualTo(RetVal, Call->getArgOperand(0))))
+ return true;
+ }
+
+ SmallVector<unsigned, 4> RetPath, CallPath;
+ SmallVector<Type *, 4> RetSubTypes, CallSubTypes;
+
+ bool RetEmpty = !firstRealType(RetVal->getType(), RetSubTypes, RetPath);
+ bool CallEmpty = !firstRealType(CallVal->getType(), CallSubTypes, CallPath);
+
+  // Nothing's actually returned, so it doesn't matter what the callee put
+  // there; it's a valid tail call.
+ if (RetEmpty)
+ return true;
+
+ // Iterate pairwise through each of the value types making up the tail call
+ // and the corresponding return. For each one we want to know whether it's
+ // essentially going directly from the tail call to the ret, via operations
+ // that end up not generating any code.
+ //
+ // We allow a certain amount of covariance here. For example it's permitted
+ // for the tail call to define more bits than the ret actually cares about
+ // (e.g. via a truncate).
+ do {
+ if (CallEmpty) {
+ // We've exhausted the values produced by the tail call instruction, the
+ // rest are essentially undef. The type doesn't really matter, but we need
+ // *something*.
+ Type *SlotType =
+ ExtractValueInst::getIndexedType(RetSubTypes.back(), RetPath.back());
+ CallVal = UndefValue::get(SlotType);
+ }
+
+ // The manipulations performed when we're looking through an insertvalue or
+ // an extractvalue would happen at the front of the RetPath list, so since
+ // we have to copy it anyway it's more efficient to create a reversed copy.
+ SmallVector<unsigned, 4> TmpRetPath(llvm::reverse(RetPath));
+ SmallVector<unsigned, 4> TmpCallPath(llvm::reverse(CallPath));
+
+ // Finally, we can check whether the value produced by the tail call at this
+ // index is compatible with the value we return.
+ if (!slotOnlyDiscardsData(RetVal, CallVal, TmpRetPath, TmpCallPath,
+ AllowDifferingSizes, TLI,
+ F->getParent()->getDataLayout()))
+ return false;
+
+ CallEmpty = !nextRealType(CallSubTypes, CallPath);
+ } while(nextRealType(RetSubTypes, RetPath));
+
+ return true;
+}
+
+static void collectEHScopeMembers(
+ DenseMap<const MachineBasicBlock *, int> &EHScopeMembership, int EHScope,
+ const MachineBasicBlock *MBB) {
+ SmallVector<const MachineBasicBlock *, 16> Worklist = {MBB};
+ while (!Worklist.empty()) {
+ const MachineBasicBlock *Visiting = Worklist.pop_back_val();
+ // Don't follow blocks which start new scopes.
+ if (Visiting->isEHPad() && Visiting != MBB)
+ continue;
+
+ // Add this MBB to our scope.
+ auto P = EHScopeMembership.insert(std::make_pair(Visiting, EHScope));
+
+ // Don't revisit blocks.
+ if (!P.second) {
+ assert(P.first->second == EHScope && "MBB is part of two scopes!");
+ continue;
+ }
+
+ // Returns are boundaries where scope transfer can occur, don't follow
+ // successors.
+ if (Visiting->isEHScopeReturnBlock())
+ continue;
+
+ append_range(Worklist, Visiting->successors());
+ }
+}
+
+DenseMap<const MachineBasicBlock *, int>
+llvm::getEHScopeMembership(const MachineFunction &MF) {
+ DenseMap<const MachineBasicBlock *, int> EHScopeMembership;
+
+ // We don't have anything to do if there aren't any EH pads.
+ if (!MF.hasEHScopes())
+ return EHScopeMembership;
+
+ int EntryBBNumber = MF.front().getNumber();
+ bool IsSEH = isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction().getPersonalityFn()));
+
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SmallVector<const MachineBasicBlock *, 16> EHScopeBlocks;
+ SmallVector<const MachineBasicBlock *, 16> UnreachableBlocks;
+ SmallVector<const MachineBasicBlock *, 16> SEHCatchPads;
+ SmallVector<std::pair<const MachineBasicBlock *, int>, 16> CatchRetSuccessors;
+ for (const MachineBasicBlock &MBB : MF) {
+ if (MBB.isEHScopeEntry()) {
+ EHScopeBlocks.push_back(&MBB);
+ } else if (IsSEH && MBB.isEHPad()) {
+ SEHCatchPads.push_back(&MBB);
+ } else if (MBB.pred_empty()) {
+ UnreachableBlocks.push_back(&MBB);
+ }
+
+ MachineBasicBlock::const_iterator MBBI = MBB.getFirstTerminator();
+
+ // CatchPads are not scopes for SEH so do not consider CatchRet to
+ // transfer control to another scope.
+ if (MBBI == MBB.end() || MBBI->getOpcode() != TII->getCatchReturnOpcode())
+ continue;
+
+ // FIXME: SEH CatchPads are not necessarily in the parent function:
+ // they could be inside a finally block.
+ const MachineBasicBlock *Successor = MBBI->getOperand(0).getMBB();
+ const MachineBasicBlock *SuccessorColor = MBBI->getOperand(1).getMBB();
+ CatchRetSuccessors.push_back(
+ {Successor, IsSEH ? EntryBBNumber : SuccessorColor->getNumber()});
+ }
+
+ // We don't have anything to do if there aren't any EH pads.
+ if (EHScopeBlocks.empty())
+ return EHScopeMembership;
+
+ // Identify all the basic blocks reachable from the function entry.
+ collectEHScopeMembers(EHScopeMembership, EntryBBNumber, &MF.front());
+ // All blocks not part of a scope are in the parent function.
+ for (const MachineBasicBlock *MBB : UnreachableBlocks)
+ collectEHScopeMembers(EHScopeMembership, EntryBBNumber, MBB);
+ // Next, identify all the blocks inside the scopes.
+ for (const MachineBasicBlock *MBB : EHScopeBlocks)
+ collectEHScopeMembers(EHScopeMembership, MBB->getNumber(), MBB);
+ // SEH CatchPads aren't really scopes, handle them separately.
+ for (const MachineBasicBlock *MBB : SEHCatchPads)
+ collectEHScopeMembers(EHScopeMembership, EntryBBNumber, MBB);
+ // Finally, identify all the targets of a catchret.
+ for (std::pair<const MachineBasicBlock *, int> CatchRetPair :
+ CatchRetSuccessors)
+ collectEHScopeMembers(EHScopeMembership, CatchRetPair.second,
+ CatchRetPair.first);
+ return EHScopeMembership;
+}
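
collectEHScopeMembers above is a worklist flood-fill: it assigns every block reachable from a scope entry to that scope, refuses to cross into blocks that start new scopes, and never revisits a block. A standalone sketch of the same walk on a toy CFG (plain integers instead of MachineBasicBlocks):

// Simplified scope-coloring walk; a sketch, not the LLVM implementation.
#include <algorithm>
#include <iostream>
#include <map>
#include <vector>

int main() {
  // Toy CFG: block -> successors. Block 0 is the function entry; block 3
  // starts a separate scope.
  std::map<int, std::vector<int>> Succs = {
      {0, {1, 2}}, {1, {2}}, {2, {}}, {3, {4}}, {4, {}}};
  std::vector<int> ScopeEntries = {0, 3};

  std::map<int, int> Membership; // block -> scope (identified by its entry)
  for (int Entry : ScopeEntries) {
    std::vector<int> Worklist = {Entry};
    while (!Worklist.empty()) {
      int BB = Worklist.back();
      Worklist.pop_back();
      // Don't follow blocks which start new scopes...
      if (BB != Entry &&
          std::find(ScopeEntries.begin(), ScopeEntries.end(), BB) !=
              ScopeEntries.end())
        continue;
      // ...and don't revisit blocks already assigned to a scope.
      if (!Membership.insert({BB, Entry}).second)
        continue;
      for (int S : Succs[BB])
        Worklist.push_back(S);
    }
  }

  for (const auto &KV : Membership)
    std::cout << "bb" << KV.first << " -> scope bb" << KV.second << '\n';
}
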
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
new file mode 100644
index 000000000000..82b5ccdc70ea
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -0,0 +1,93 @@
+//===-- CodeGen/AsmPrinter/AIXException.cpp - AIX Exception Impl ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing AIX exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCSectionXCOFF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+AIXException::AIXException(AsmPrinter *A) : EHStreamer(A) {}
+
+void AIXException::emitExceptionInfoTable(const MCSymbol *LSDA,
+ const MCSymbol *PerSym) {
+ // Generate EH Info Table.
+  // The EH Info Table, a.k.a. the 'compat unwind section' on AIX, has the
+  // following format: struct eh_info_t {
+  // unsigned version; /* EH info version 0 */
+ // #if defined(__64BIT__)
+ // char _pad[4]; /* padding */
+ // #endif
+ // unsigned long lsda; /* Pointer to LSDA */
+ // unsigned long personality; /* Pointer to the personality routine */
+ // }
+
+ auto *EHInfo =
+ cast<MCSectionXCOFF>(Asm->getObjFileLowering().getCompactUnwindSection());
+ if (Asm->TM.getFunctionSections()) {
+ // If option -ffunction-sections is on, append the function name to the
+ // name of EH Info Table csect so that each function has its own EH Info
+ // Table csect. This helps the linker to garbage-collect EH info of unused
+ // functions.
+ SmallString<128> NameStr = EHInfo->getName();
+ raw_svector_ostream(NameStr) << '.' << Asm->MF->getFunction().getName();
+ EHInfo = Asm->OutContext.getXCOFFSection(NameStr, EHInfo->getKind(),
+ EHInfo->getCsectProp());
+ }
+ Asm->OutStreamer->switchSection(EHInfo);
+ MCSymbol *EHInfoLabel =
+ TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(Asm->MF);
+ Asm->OutStreamer->emitLabel(EHInfoLabel);
+
+ // Version number.
+ Asm->emitInt32(0);
+
+ const DataLayout &DL = MMI->getModule()->getDataLayout();
+ const unsigned PointerSize = DL.getPointerSize();
+
+  // Add the necessary padding in 64-bit mode.
+ Asm->OutStreamer->emitValueToAlignment(Align(PointerSize));
+
+ // LSDA location.
+ Asm->OutStreamer->emitValue(MCSymbolRefExpr::create(LSDA, Asm->OutContext),
+ PointerSize);
+
+ // Personality routine.
+ Asm->OutStreamer->emitValue(MCSymbolRefExpr::create(PerSym, Asm->OutContext),
+ PointerSize);
+}
+
+void AIXException::endFunction(const MachineFunction *MF) {
+  // There is no easy way to access register information in the `AIXException`
+  // class. When ShouldEmitEHBlock is false and VRs are saved, a dummy EH info
+  // table is emitted in PPCAIXAsmPrinter::emitFunctionBodyEnd.
+ if (!TargetLoweringObjectFileXCOFF::ShouldEmitEHBlock(MF))
+ return;
+
+ const MCSymbol *LSDALabel = emitExceptionTable();
+
+ const Function &F = MF->getFunction();
+ assert(F.hasPersonalityFn() &&
+ "Landingpads are presented, but no personality routine is found.");
+ const auto *Per =
+ cast<GlobalValue>(F.getPersonalityFn()->stripPointerCasts());
+ const MCSymbol *PerSym = Asm->TM.getSymbol(Per);
+
+ emitExceptionInfoTable(LSDALabel, PerSym);
+}
+
+} // End of namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
new file mode 100644
index 000000000000..de6ebcf0c341
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -0,0 +1,132 @@
+//===-- CodeGen/AsmPrinter/ARMException.cpp - ARM EHABI Exception Impl ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+ARMException::ARMException(AsmPrinter *A) : EHStreamer(A) {}
+
+ARMException::~ARMException() = default;
+
+ARMTargetStreamer &ARMException::getTargetStreamer() {
+ MCTargetStreamer &TS = *Asm->OutStreamer->getTargetStreamer();
+ return static_cast<ARMTargetStreamer &>(TS);
+}
+
+void ARMException::beginFunction(const MachineFunction *MF) {
+ if (Asm->MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+ getTargetStreamer().emitFnStart();
+ // See if we need call frame info.
+ AsmPrinter::CFISection CFISecType = Asm->getFunctionCFISectionType(*MF);
+ assert(CFISecType != AsmPrinter::CFISection::EH &&
+ "non-EH CFI not yet supported in prologue with EHABI lowering");
+
+ if (CFISecType == AsmPrinter::CFISection::Debug) {
+ if (!hasEmittedCFISections) {
+ if (Asm->getModuleCFISectionType() == AsmPrinter::CFISection::Debug)
+ Asm->OutStreamer->emitCFISections(false, true);
+ hasEmittedCFISections = true;
+ }
+
+ shouldEmitCFI = true;
+ Asm->OutStreamer->emitCFIStartProc(false);
+ }
+}
+
+void ARMException::markFunctionEnd() {
+ if (shouldEmitCFI)
+ Asm->OutStreamer->emitCFIEndProc();
+}
+
+/// endFunction - Gather and emit post-function exception information.
+///
+void ARMException::endFunction(const MachineFunction *MF) {
+ ARMTargetStreamer &ATS = getTargetStreamer();
+ const Function &F = MF->getFunction();
+ const Function *Per = nullptr;
+ if (F.hasPersonalityFn())
+ Per = dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+ bool forceEmitPersonality =
+ F.hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) &&
+ F.needsUnwindTableEntry();
+ bool shouldEmitPersonality = forceEmitPersonality ||
+ !MF->getLandingPads().empty();
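+  // Functions that need neither an unwind table entry nor a personality can be
+  // marked .cantunwind; those with a personality also get .handlerdata and an
+  // LSDA.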
+ if (!Asm->MF->getFunction().needsUnwindTableEntry() &&
+ !shouldEmitPersonality)
+ ATS.emitCantUnwind();
+ else if (shouldEmitPersonality) {
+ // Emit references to personality.
+ if (Per) {
+ MCSymbol *PerSym = Asm->getSymbol(Per);
+ ATS.emitPersonality(PerSym);
+ }
+
+ // Emit .handlerdata directive.
+ ATS.emitHandlerData();
+
+ // Emit actual exception table
+ emitExceptionTable();
+ }
+
+ if (Asm->MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+ ATS.emitFnEnd();
+}
+
+void ARMException::emitTypeInfos(unsigned TTypeEncoding,
+ MCSymbol *TTBaseLabel) {
+ const MachineFunction *MF = Asm->MF;
+ const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MF->getFilterIds();
+
+ bool VerboseAsm = Asm->OutStreamer->isVerboseAsm();
+
+ int Entry = 0;
+ // Emit the Catch TypeInfos.
+ if (VerboseAsm && !TypeInfos.empty()) {
+ Asm->OutStreamer->AddComment(">> Catch TypeInfos <<");
+ Asm->OutStreamer->addBlankLine();
+ Entry = TypeInfos.size();
+ }
+
+ for (const GlobalValue *GV : reverse(TypeInfos)) {
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--));
+ Asm->emitTTypeReference(GV, TTypeEncoding);
+ }
+
+ Asm->OutStreamer->emitLabel(TTBaseLabel);
+
+ // Emit the Exception Specifications.
+ if (VerboseAsm && !FilterIds.empty()) {
+ Asm->OutStreamer->AddComment(">> Filter TypeInfos <<");
+ Asm->OutStreamer->addBlankLine();
+ Entry = 0;
+ }
+ for (std::vector<unsigned>::const_iterator
+ I = FilterIds.begin(), E = FilterIds.end(); I < E; ++I) {
+ unsigned TypeID = *I;
+ if (VerboseAsm) {
+ --Entry;
+ if (TypeID != 0)
+ Asm->OutStreamer->AddComment("FilterInfo " + Twine(Entry));
+ }
+
+ Asm->emitTTypeReference((TypeID == 0 ? nullptr : TypeInfos[TypeID - 1]),
+ TTypeEncoding);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
new file mode 100644
index 000000000000..aab3c2681339
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -0,0 +1,714 @@
+//===- llvm/CodeGen/AsmPrinter/AccelTable.cpp - Accelerator Tables --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing accelerator tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AccelTable.h"
+#include "DwarfCompileUnit.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <vector>
+
+using namespace llvm;
+
+void AccelTableBase::computeBucketCount() {
+ // First get the number of unique hashes.
+ std::vector<uint32_t> Uniques;
+ Uniques.reserve(Entries.size());
+ for (const auto &E : Entries)
+ Uniques.push_back(E.second.HashValue);
+ array_pod_sort(Uniques.begin(), Uniques.end());
+ std::vector<uint32_t>::iterator P =
+ std::unique(Uniques.begin(), Uniques.end());
+
+ UniqueHashCount = std::distance(Uniques.begin(), P);
+
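+  // Then pick a bucket count: use fewer buckets than unique hashes to keep the
+  // table compact, scaling the divisor with the table size, but always emit at
+  // least one bucket.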
+ if (UniqueHashCount > 1024)
+ BucketCount = UniqueHashCount / 4;
+ else if (UniqueHashCount > 16)
+ BucketCount = UniqueHashCount / 2;
+ else
+ BucketCount = std::max<uint32_t>(UniqueHashCount, 1);
+}
+
+void AccelTableBase::finalize(AsmPrinter *Asm, StringRef Prefix) {
+ // Create the individual hash data outputs.
+ for (auto &E : Entries) {
+ // Unique the entries.
+ llvm::stable_sort(E.second.Values,
+ [](const AccelTableData *A, const AccelTableData *B) {
+ return *A < *B;
+ });
+ E.second.Values.erase(
+ std::unique(E.second.Values.begin(), E.second.Values.end()),
+ E.second.Values.end());
+ }
+
+ // Figure out how many buckets we need, then compute the bucket contents and
+ // the final ordering. The hashes and offsets can be emitted by walking these
+ // data structures. We add temporary symbols to the data so they can be
+ // referenced when emitting the offsets.
+ computeBucketCount();
+
+ // Compute bucket contents and final ordering.
+ Buckets.resize(BucketCount);
+ for (auto &E : Entries) {
+ uint32_t Bucket = E.second.HashValue % BucketCount;
+ Buckets[Bucket].push_back(&E.second);
+ E.second.Sym = Asm->createTempSymbol(Prefix);
+ }
+
+ // Sort the contents of the buckets by hash value so that hash collisions end
+ // up together. Stable sort makes testing easier and doesn't cost much more.
+ for (auto &Bucket : Buckets)
+ llvm::stable_sort(Bucket, [](HashData *LHS, HashData *RHS) {
+ return LHS->HashValue < RHS->HashValue;
+ });
+}
+
+namespace {
+/// Base class for writing out Accelerator tables. It holds the common
+/// functionality for the two Accelerator table types.
+class AccelTableWriter {
+protected:
+ AsmPrinter *const Asm; ///< Destination.
+ const AccelTableBase &Contents; ///< Data to emit.
+
+ /// Controls whether to emit duplicate hash and offset table entries for names
+  /// with identical hashes. Apple tables don't emit duplicate entries; DWARF v5
+ /// tables do.
+ const bool SkipIdenticalHashes;
+
+ void emitHashes() const;
+
+ /// Emit offsets to lists of entries with identical names. The offsets are
+ /// relative to the Base argument.
+ void emitOffsets(const MCSymbol *Base) const;
+
+public:
+ AccelTableWriter(AsmPrinter *Asm, const AccelTableBase &Contents,
+ bool SkipIdenticalHashes)
+ : Asm(Asm), Contents(Contents), SkipIdenticalHashes(SkipIdenticalHashes) {
+ }
+};
+
+class AppleAccelTableWriter : public AccelTableWriter {
+ using Atom = AppleAccelTableData::Atom;
+
+ /// The fixed header of an Apple Accelerator Table.
+ struct Header {
+ uint32_t Magic = MagicHash;
+ uint16_t Version = 1;
+ uint16_t HashFunction = dwarf::DW_hash_function_djb;
+ uint32_t BucketCount;
+ uint32_t HashCount;
+ uint32_t HeaderDataLength;
+
+ /// 'HASH' magic value to detect endianness.
+ static const uint32_t MagicHash = 0x48415348;
+
+ Header(uint32_t BucketCount, uint32_t UniqueHashCount, uint32_t DataLength)
+ : BucketCount(BucketCount), HashCount(UniqueHashCount),
+ HeaderDataLength(DataLength) {}
+
+ void emit(AsmPrinter *Asm) const;
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+ };
+
+ /// The HeaderData describes the structure of an Apple accelerator table
+ /// through a list of Atoms.
+ struct HeaderData {
+ /// In the case of data that is referenced via DW_FORM_ref_* the offset
+ /// base is used to describe the offset for all forms in the list of atoms.
+ uint32_t DieOffsetBase;
+
+ const SmallVector<Atom, 4> Atoms;
+
+ HeaderData(ArrayRef<Atom> AtomList, uint32_t Offset = 0)
+ : DieOffsetBase(Offset), Atoms(AtomList.begin(), AtomList.end()) {}
+
+ void emit(AsmPrinter *Asm) const;
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+ };
+
+ Header Header;
+ HeaderData HeaderData;
+ const MCSymbol *SecBegin;
+
+ void emitBuckets() const;
+ void emitData() const;
+
+public:
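+  /// The header data length passed to Header is 8 bytes (DieOffsetBase plus
+  /// the atom count) plus 4 bytes per atom (a 2-byte type and a 2-byte form).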
+ AppleAccelTableWriter(AsmPrinter *Asm, const AccelTableBase &Contents,
+ ArrayRef<Atom> Atoms, const MCSymbol *SecBegin)
+ : AccelTableWriter(Asm, Contents, true),
+ Header(Contents.getBucketCount(), Contents.getUniqueHashCount(),
+ 8 + (Atoms.size() * 4)),
+ HeaderData(Atoms), SecBegin(SecBegin) {}
+
+ void emit() const;
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const;
+ void dump() const { print(dbgs()); }
+#endif
+};
+
+/// Class responsible for emitting a DWARF v5 Accelerator Table. The only
+/// public function is emit(), which performs the actual emission.
+///
+/// The class is templated on its data type. This allows us to emit both dynamic
+/// and static data entries. A callback abstracts the logic of providing a CU
+/// index for a given entry, which is different per data type, but identical
+/// for every entry in the same table.
+template <typename DataT>
+class Dwarf5AccelTableWriter : public AccelTableWriter {
+ struct Header {
+ uint16_t Version = 5;
+ uint16_t Padding = 0;
+ uint32_t CompUnitCount;
+ uint32_t LocalTypeUnitCount = 0;
+ uint32_t ForeignTypeUnitCount = 0;
+ uint32_t BucketCount = 0;
+ uint32_t NameCount = 0;
+ uint32_t AbbrevTableSize = 0;
+ uint32_t AugmentationStringSize = sizeof(AugmentationString);
+ char AugmentationString[8] = {'L', 'L', 'V', 'M', '0', '7', '0', '0'};
+
+ Header(uint32_t CompUnitCount, uint32_t BucketCount, uint32_t NameCount)
+ : CompUnitCount(CompUnitCount), BucketCount(BucketCount),
+ NameCount(NameCount) {}
+
+ void emit(Dwarf5AccelTableWriter &Ctx);
+ };
+ struct AttributeEncoding {
+ dwarf::Index Index;
+ dwarf::Form Form;
+ };
+
+ Header Header;
+ DenseMap<uint32_t, SmallVector<AttributeEncoding, 2>> Abbreviations;
+ ArrayRef<MCSymbol *> CompUnits;
+ llvm::function_ref<unsigned(const DataT &)> getCUIndexForEntry;
+ MCSymbol *ContributionEnd = nullptr;
+ MCSymbol *AbbrevStart = Asm->createTempSymbol("names_abbrev_start");
+ MCSymbol *AbbrevEnd = Asm->createTempSymbol("names_abbrev_end");
+ MCSymbol *EntryPool = Asm->createTempSymbol("names_entries");
+
+ DenseSet<uint32_t> getUniqueTags() const;
+
+ // Right now, we emit uniform attributes for all tags.
+ SmallVector<AttributeEncoding, 2> getUniformAttributes() const;
+
+ void emitCUList() const;
+ void emitBuckets() const;
+ void emitStringOffsets() const;
+ void emitAbbrevs() const;
+ void emitEntry(const DataT &Entry) const;
+ void emitData() const;
+
+public:
+ Dwarf5AccelTableWriter(
+ AsmPrinter *Asm, const AccelTableBase &Contents,
+ ArrayRef<MCSymbol *> CompUnits,
+ llvm::function_ref<unsigned(const DataT &)> GetCUIndexForEntry);
+
+ void emit();
+};
+} // namespace
+
+void AccelTableWriter::emitHashes() const {
+ uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
+ unsigned BucketIdx = 0;
+ for (const auto &Bucket : Contents.getBuckets()) {
+ for (const auto &Hash : Bucket) {
+ uint32_t HashValue = Hash->HashValue;
+ if (SkipIdenticalHashes && PrevHash == HashValue)
+ continue;
+ Asm->OutStreamer->AddComment("Hash in Bucket " + Twine(BucketIdx));
+ Asm->emitInt32(HashValue);
+ PrevHash = HashValue;
+ }
+ BucketIdx++;
+ }
+}
+
+void AccelTableWriter::emitOffsets(const MCSymbol *Base) const {
+ const auto &Buckets = Contents.getBuckets();
+ uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ for (auto *Hash : Buckets[i]) {
+ uint32_t HashValue = Hash->HashValue;
+ if (SkipIdenticalHashes && PrevHash == HashValue)
+ continue;
+ PrevHash = HashValue;
+ Asm->OutStreamer->AddComment("Offset in Bucket " + Twine(i));
+ Asm->emitLabelDifference(Hash->Sym, Base, Asm->getDwarfOffsetByteSize());
+ }
+ }
+}
+
+void AppleAccelTableWriter::Header::emit(AsmPrinter *Asm) const {
+ Asm->OutStreamer->AddComment("Header Magic");
+ Asm->emitInt32(Magic);
+ Asm->OutStreamer->AddComment("Header Version");
+ Asm->emitInt16(Version);
+ Asm->OutStreamer->AddComment("Header Hash Function");
+ Asm->emitInt16(HashFunction);
+ Asm->OutStreamer->AddComment("Header Bucket Count");
+ Asm->emitInt32(BucketCount);
+ Asm->OutStreamer->AddComment("Header Hash Count");
+ Asm->emitInt32(HashCount);
+ Asm->OutStreamer->AddComment("Header Data Length");
+ Asm->emitInt32(HeaderDataLength);
+}
+
+void AppleAccelTableWriter::HeaderData::emit(AsmPrinter *Asm) const {
+ Asm->OutStreamer->AddComment("HeaderData Die Offset Base");
+ Asm->emitInt32(DieOffsetBase);
+ Asm->OutStreamer->AddComment("HeaderData Atom Count");
+ Asm->emitInt32(Atoms.size());
+
+ for (const Atom &A : Atoms) {
+ Asm->OutStreamer->AddComment(dwarf::AtomTypeString(A.Type));
+ Asm->emitInt16(A.Type);
+ Asm->OutStreamer->AddComment(dwarf::FormEncodingString(A.Form));
+ Asm->emitInt16(A.Form);
+ }
+}
+
+void AppleAccelTableWriter::emitBuckets() const {
+ const auto &Buckets = Contents.getBuckets();
+ unsigned index = 0;
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ Asm->OutStreamer->AddComment("Bucket " + Twine(i));
+ if (!Buckets[i].empty())
+ Asm->emitInt32(index);
+ else
+ Asm->emitInt32(std::numeric_limits<uint32_t>::max());
+    // Buckets point into the list of hashes, not to the data. Do not increment
+ // the index multiple times in case of hash collisions.
+ uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
+ for (auto *HD : Buckets[i]) {
+ uint32_t HashValue = HD->HashValue;
+ if (PrevHash != HashValue)
+ ++index;
+ PrevHash = HashValue;
+ }
+ }
+}
+
+void AppleAccelTableWriter::emitData() const {
+ const auto &Buckets = Contents.getBuckets();
+ for (const AccelTableBase::HashList &Bucket : Buckets) {
+ uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
+ for (const auto &Hash : Bucket) {
+ // Terminate the previous entry if there is no hash collision with the
+ // current one.
+ if (PrevHash != std::numeric_limits<uint64_t>::max() &&
+ PrevHash != Hash->HashValue)
+ Asm->emitInt32(0);
+ // Remember to emit the label for our offset.
+ Asm->OutStreamer->emitLabel(Hash->Sym);
+ Asm->OutStreamer->AddComment(Hash->Name.getString());
+ Asm->emitDwarfStringOffset(Hash->Name);
+ Asm->OutStreamer->AddComment("Num DIEs");
+ Asm->emitInt32(Hash->Values.size());
+ for (const auto *V : Hash->Values)
+ static_cast<const AppleAccelTableData *>(V)->emit(Asm);
+ PrevHash = Hash->HashValue;
+ }
+ // Emit the final end marker for the bucket.
+ if (!Bucket.empty())
+ Asm->emitInt32(0);
+ }
+}
+
+void AppleAccelTableWriter::emit() const {
+ Header.emit(Asm);
+ HeaderData.emit(Asm);
+ emitBuckets();
+ emitHashes();
+ emitOffsets(SecBegin);
+ emitData();
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::Header::emit(Dwarf5AccelTableWriter &Ctx) {
+ assert(CompUnitCount > 0 && "Index must have at least one CU.");
+
+ AsmPrinter *Asm = Ctx.Asm;
+ Ctx.ContributionEnd =
+ Asm->emitDwarfUnitLength("names", "Header: unit length");
+ Asm->OutStreamer->AddComment("Header: version");
+ Asm->emitInt16(Version);
+ Asm->OutStreamer->AddComment("Header: padding");
+ Asm->emitInt16(Padding);
+ Asm->OutStreamer->AddComment("Header: compilation unit count");
+ Asm->emitInt32(CompUnitCount);
+ Asm->OutStreamer->AddComment("Header: local type unit count");
+ Asm->emitInt32(LocalTypeUnitCount);
+ Asm->OutStreamer->AddComment("Header: foreign type unit count");
+ Asm->emitInt32(ForeignTypeUnitCount);
+ Asm->OutStreamer->AddComment("Header: bucket count");
+ Asm->emitInt32(BucketCount);
+ Asm->OutStreamer->AddComment("Header: name count");
+ Asm->emitInt32(NameCount);
+ Asm->OutStreamer->AddComment("Header: abbreviation table size");
+ Asm->emitLabelDifference(Ctx.AbbrevEnd, Ctx.AbbrevStart, sizeof(uint32_t));
+ Asm->OutStreamer->AddComment("Header: augmentation string size");
+ assert(AugmentationStringSize % 4 == 0);
+ Asm->emitInt32(AugmentationStringSize);
+ Asm->OutStreamer->AddComment("Header: augmentation string");
+ Asm->OutStreamer->emitBytes({AugmentationString, AugmentationStringSize});
+}
+
+template <typename DataT>
+DenseSet<uint32_t> Dwarf5AccelTableWriter<DataT>::getUniqueTags() const {
+ DenseSet<uint32_t> UniqueTags;
+ for (auto &Bucket : Contents.getBuckets()) {
+ for (auto *Hash : Bucket) {
+ for (auto *Value : Hash->Values) {
+ unsigned Tag = static_cast<const DataT *>(Value)->getDieTag();
+ UniqueTags.insert(Tag);
+ }
+ }
+ }
+ return UniqueTags;
+}
+
+template <typename DataT>
+SmallVector<typename Dwarf5AccelTableWriter<DataT>::AttributeEncoding, 2>
+Dwarf5AccelTableWriter<DataT>::getUniformAttributes() const {
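+  // Only emit a DW_IDX_compile_unit attribute when the index covers more than
+  // one CU, using the smallest form that can hold the largest CU index.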
+ SmallVector<AttributeEncoding, 2> UA;
+ if (CompUnits.size() > 1) {
+ size_t LargestCUIndex = CompUnits.size() - 1;
+ dwarf::Form Form = DIEInteger::BestForm(/*IsSigned*/ false, LargestCUIndex);
+ UA.push_back({dwarf::DW_IDX_compile_unit, Form});
+ }
+ UA.push_back({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4});
+ return UA;
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitCUList() const {
+ for (const auto &CU : enumerate(CompUnits)) {
+ Asm->OutStreamer->AddComment("Compilation unit " + Twine(CU.index()));
+ Asm->emitDwarfSymbolReference(CU.value());
+ }
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitBuckets() const {
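+  // Each bucket holds the 1-based index of its first name entry, or 0 if the
+  // bucket is empty.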
+ uint32_t Index = 1;
+ for (const auto &Bucket : enumerate(Contents.getBuckets())) {
+ Asm->OutStreamer->AddComment("Bucket " + Twine(Bucket.index()));
+ Asm->emitInt32(Bucket.value().empty() ? 0 : Index);
+ Index += Bucket.value().size();
+ }
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitStringOffsets() const {
+ for (const auto &Bucket : enumerate(Contents.getBuckets())) {
+ for (auto *Hash : Bucket.value()) {
+ DwarfStringPoolEntryRef String = Hash->Name;
+ Asm->OutStreamer->AddComment("String in Bucket " + Twine(Bucket.index()) +
+ ": " + String.getString());
+ Asm->emitDwarfStringOffset(String);
+ }
+ }
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitAbbrevs() const {
+ Asm->OutStreamer->emitLabel(AbbrevStart);
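+  // The abbreviation code is the DIE tag itself, so the same value is emitted
+  // twice: once as the code and once as the tag.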
+ for (const auto &Abbrev : Abbreviations) {
+ Asm->OutStreamer->AddComment("Abbrev code");
+ assert(Abbrev.first != 0);
+ Asm->emitULEB128(Abbrev.first);
+ Asm->OutStreamer->AddComment(dwarf::TagString(Abbrev.first));
+ Asm->emitULEB128(Abbrev.first);
+ for (const auto &AttrEnc : Abbrev.second) {
+ Asm->emitULEB128(AttrEnc.Index, dwarf::IndexString(AttrEnc.Index).data());
+ Asm->emitULEB128(AttrEnc.Form,
+ dwarf::FormEncodingString(AttrEnc.Form).data());
+ }
+ Asm->emitULEB128(0, "End of abbrev");
+ Asm->emitULEB128(0, "End of abbrev");
+ }
+ Asm->emitULEB128(0, "End of abbrev list");
+ Asm->OutStreamer->emitLabel(AbbrevEnd);
+}
+
+template <typename DataT>
+void Dwarf5AccelTableWriter<DataT>::emitEntry(const DataT &Entry) const {
+ auto AbbrevIt = Abbreviations.find(Entry.getDieTag());
+ assert(AbbrevIt != Abbreviations.end() &&
+ "Why wasn't this abbrev generated?");
+
+ Asm->emitULEB128(AbbrevIt->first, "Abbreviation code");
+ for (const auto &AttrEnc : AbbrevIt->second) {
+ Asm->OutStreamer->AddComment(dwarf::IndexString(AttrEnc.Index));
+ switch (AttrEnc.Index) {
+ case dwarf::DW_IDX_compile_unit: {
+ DIEInteger ID(getCUIndexForEntry(Entry));
+ ID.emitValue(Asm, AttrEnc.Form);
+ break;
+ }
+ case dwarf::DW_IDX_die_offset:
+ assert(AttrEnc.Form == dwarf::DW_FORM_ref4);
+ Asm->emitInt32(Entry.getDieOffset());
+ break;
+ default:
+ llvm_unreachable("Unexpected index attribute!");
+ }
+ }
+}
+
+template <typename DataT> void Dwarf5AccelTableWriter<DataT>::emitData() const {
+ Asm->OutStreamer->emitLabel(EntryPool);
+ for (auto &Bucket : Contents.getBuckets()) {
+ for (auto *Hash : Bucket) {
+ // Remember to emit the label for our offset.
+ Asm->OutStreamer->emitLabel(Hash->Sym);
+ for (const auto *Value : Hash->Values)
+ emitEntry(*static_cast<const DataT *>(Value));
+ Asm->OutStreamer->AddComment("End of list: " + Hash->Name.getString());
+ Asm->emitInt8(0);
+ }
+ }
+}
+
+template <typename DataT>
+Dwarf5AccelTableWriter<DataT>::Dwarf5AccelTableWriter(
+ AsmPrinter *Asm, const AccelTableBase &Contents,
+ ArrayRef<MCSymbol *> CompUnits,
+ llvm::function_ref<unsigned(const DataT &)> getCUIndexForEntry)
+ : AccelTableWriter(Asm, Contents, false),
+ Header(CompUnits.size(), Contents.getBucketCount(),
+ Contents.getUniqueNameCount()),
+ CompUnits(CompUnits), getCUIndexForEntry(std::move(getCUIndexForEntry)) {
+ DenseSet<uint32_t> UniqueTags = getUniqueTags();
+ SmallVector<AttributeEncoding, 2> UniformAttributes = getUniformAttributes();
+
+ Abbreviations.reserve(UniqueTags.size());
+ for (uint32_t Tag : UniqueTags)
+ Abbreviations.try_emplace(Tag, UniformAttributes);
+}
+
+template <typename DataT> void Dwarf5AccelTableWriter<DataT>::emit() {
+ Header.emit(*this);
+ emitCUList();
+ emitBuckets();
+ emitHashes();
+ emitStringOffsets();
+ emitOffsets(EntryPool);
+ emitAbbrevs();
+ emitData();
+ Asm->OutStreamer->emitValueToAlignment(Align(4), 0);
+ Asm->OutStreamer->emitLabel(ContributionEnd);
+}
+
+void llvm::emitAppleAccelTableImpl(AsmPrinter *Asm, AccelTableBase &Contents,
+ StringRef Prefix, const MCSymbol *SecBegin,
+ ArrayRef<AppleAccelTableData::Atom> Atoms) {
+ Contents.finalize(Asm, Prefix);
+ AppleAccelTableWriter(Asm, Contents, Atoms, SecBegin).emit();
+}
+
+void llvm::emitDWARF5AccelTable(
+ AsmPrinter *Asm, AccelTable<DWARF5AccelTableData> &Contents,
+ const DwarfDebug &DD, ArrayRef<std::unique_ptr<DwarfCompileUnit>> CUs) {
+ std::vector<MCSymbol *> CompUnits;
+ SmallVector<unsigned, 1> CUIndex(CUs.size());
+ int Count = 0;
+ for (const auto &CU : enumerate(CUs)) {
+ switch (CU.value()->getCUNode()->getNameTableKind()) {
+ case DICompileUnit::DebugNameTableKind::Default:
+ case DICompileUnit::DebugNameTableKind::Apple:
+ break;
+ default:
+ continue;
+ }
+ CUIndex[CU.index()] = Count++;
+ assert(CU.index() == CU.value()->getUniqueID());
+ const DwarfCompileUnit *MainCU =
+ DD.useSplitDwarf() ? CU.value()->getSkeleton() : CU.value().get();
+ CompUnits.push_back(MainCU->getLabelBegin());
+ }
+
+ if (CompUnits.empty())
+ return;
+
+ Asm->OutStreamer->switchSection(
+ Asm->getObjFileLowering().getDwarfDebugNamesSection());
+
+ Contents.finalize(Asm, "names");
+ Dwarf5AccelTableWriter<DWARF5AccelTableData>(
+ Asm, Contents, CompUnits,
+ [&](const DWARF5AccelTableData &Entry) {
+ const DIE *CUDie = Entry.getDie().getUnitDie();
+ return CUIndex[DD.lookupCU(CUDie)->getUniqueID()];
+ })
+ .emit();
+}
+
+void llvm::emitDWARF5AccelTable(
+ AsmPrinter *Asm, AccelTable<DWARF5AccelTableStaticData> &Contents,
+ ArrayRef<MCSymbol *> CUs,
+ llvm::function_ref<unsigned(const DWARF5AccelTableStaticData &)>
+ getCUIndexForEntry) {
+ Contents.finalize(Asm, "names");
+ Dwarf5AccelTableWriter<DWARF5AccelTableStaticData>(Asm, Contents, CUs,
+ getCUIndexForEntry)
+ .emit();
+}
+
+void AppleAccelTableOffsetData::emit(AsmPrinter *Asm) const {
+ assert(Die.getDebugSectionOffset() <= UINT32_MAX &&
+ "The section offset exceeds the limit.");
+ Asm->emitInt32(Die.getDebugSectionOffset());
+}
+
+void AppleAccelTableTypeData::emit(AsmPrinter *Asm) const {
+ assert(Die.getDebugSectionOffset() <= UINT32_MAX &&
+ "The section offset exceeds the limit.");
+ Asm->emitInt32(Die.getDebugSectionOffset());
+ Asm->emitInt16(Die.getTag());
+ Asm->emitInt8(0);
+}
+
+void AppleAccelTableStaticOffsetData::emit(AsmPrinter *Asm) const {
+ Asm->emitInt32(Offset);
+}
+
+void AppleAccelTableStaticTypeData::emit(AsmPrinter *Asm) const {
+ Asm->emitInt32(Offset);
+ Asm->emitInt16(Tag);
+ Asm->emitInt8(ObjCClassIsImplementation ? dwarf::DW_FLAG_type_implementation
+ : 0);
+ Asm->emitInt32(QualifiedNameHash);
+}
+
+constexpr AppleAccelTableData::Atom AppleAccelTableTypeData::Atoms[];
+constexpr AppleAccelTableData::Atom AppleAccelTableOffsetData::Atoms[];
+constexpr AppleAccelTableData::Atom AppleAccelTableStaticOffsetData::Atoms[];
+constexpr AppleAccelTableData::Atom AppleAccelTableStaticTypeData::Atoms[];
+
+#ifndef NDEBUG
+void AppleAccelTableWriter::Header::print(raw_ostream &OS) const {
+ OS << "Magic: " << format("0x%x", Magic) << "\n"
+ << "Version: " << Version << "\n"
+ << "Hash Function: " << HashFunction << "\n"
+ << "Bucket Count: " << BucketCount << "\n"
+ << "Header Data Length: " << HeaderDataLength << "\n";
+}
+
+void AppleAccelTableData::Atom::print(raw_ostream &OS) const {
+ OS << "Type: " << dwarf::AtomTypeString(Type) << "\n"
+ << "Form: " << dwarf::FormEncodingString(Form) << "\n";
+}
+
+void AppleAccelTableWriter::HeaderData::print(raw_ostream &OS) const {
+ OS << "DIE Offset Base: " << DieOffsetBase << "\n";
+ for (auto Atom : Atoms)
+ Atom.print(OS);
+}
+
+void AppleAccelTableWriter::print(raw_ostream &OS) const {
+ Header.print(OS);
+ HeaderData.print(OS);
+ Contents.print(OS);
+ SecBegin->print(OS, nullptr);
+}
+
+void AccelTableBase::HashData::print(raw_ostream &OS) const {
+ OS << "Name: " << Name.getString() << "\n";
+ OS << " Hash Value: " << format("0x%x", HashValue) << "\n";
+ OS << " Symbol: ";
+ if (Sym)
+ OS << *Sym;
+ else
+ OS << "<none>";
+ OS << "\n";
+ for (auto *Value : Values)
+ Value->print(OS);
+}
+
+void AccelTableBase::print(raw_ostream &OS) const {
+ // Print Content.
+ OS << "Entries: \n";
+ for (const auto &[Name, Data] : Entries) {
+ OS << "Name: " << Name << "\n";
+ for (auto *V : Data.Values)
+ V->print(OS);
+ }
+
+ OS << "Buckets and Hashes: \n";
+ for (const auto &Bucket : Buckets)
+ for (const auto &Hash : Bucket)
+ Hash->print(OS);
+
+ OS << "Data: \n";
+ for (const auto &E : Entries)
+ E.second.print(OS);
+}
+
+void DWARF5AccelTableData::print(raw_ostream &OS) const {
+ OS << " Offset: " << getDieOffset() << "\n";
+ OS << " Tag: " << dwarf::TagString(getDieTag()) << "\n";
+}
+
+void DWARF5AccelTableStaticData::print(raw_ostream &OS) const {
+ OS << " Offset: " << getDieOffset() << "\n";
+ OS << " Tag: " << dwarf::TagString(getDieTag()) << "\n";
+}
+
+void AppleAccelTableOffsetData::print(raw_ostream &OS) const {
+ OS << " Offset: " << Die.getOffset() << "\n";
+}
+
+void AppleAccelTableTypeData::print(raw_ostream &OS) const {
+ OS << " Offset: " << Die.getOffset() << "\n";
+ OS << " Tag: " << dwarf::TagString(Die.getTag()) << "\n";
+}
+
+void AppleAccelTableStaticOffsetData::print(raw_ostream &OS) const {
+ OS << " Static Offset: " << Offset << "\n";
+}
+
+void AppleAccelTableStaticTypeData::print(raw_ostream &OS) const {
+ OS << " Static Offset: " << Offset << "\n";
+ OS << " QualifiedNameHash: " << format("%x\n", QualifiedNameHash) << "\n";
+ OS << " Tag: " << dwarf::TagString(Tag) << "\n";
+ OS << " ObjCClassIsImplementation: "
+ << (ObjCClassIsImplementation ? "true" : "false");
+ OS << "\n";
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
new file mode 100644
index 000000000000..00ee4e1b47a8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
@@ -0,0 +1,73 @@
+//===- llvm/CodeGen/AddressPool.cpp - Dwarf Debug Framework ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddressPool.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <utility>
+
+using namespace llvm;
+
+unsigned AddressPool::getIndex(const MCSymbol *Sym, bool TLS) {
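+  // Mark the pool as used (see HasBeenUsed in AddressPool.h), and return the
+  // existing index if the symbol is already in the pool.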
+ resetUsedFlag(true);
+ auto IterBool =
+ Pool.insert(std::make_pair(Sym, AddressPoolEntry(Pool.size(), TLS)));
+ return IterBool.first->second.Number;
+}
+
+MCSymbol *AddressPool::emitHeader(AsmPrinter &Asm, MCSection *Section) {
+ static const uint8_t AddrSize = Asm.MAI->getCodePointerSize();
+
+ MCSymbol *EndLabel =
+ Asm.emitDwarfUnitLength("debug_addr", "Length of contribution");
+ Asm.OutStreamer->AddComment("DWARF version number");
+ Asm.emitInt16(Asm.getDwarfVersion());
+ Asm.OutStreamer->AddComment("Address size");
+ Asm.emitInt8(AddrSize);
+ Asm.OutStreamer->AddComment("Segment selector size");
+ Asm.emitInt8(0); // TODO: Support non-zero segment_selector_size.
+
+ return EndLabel;
+}
+
+// Emit addresses into the section given.
+void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
+ if (isEmpty())
+ return;
+
+ // Start the dwarf addr section.
+ Asm.OutStreamer->switchSection(AddrSection);
+
+ MCSymbol *EndLabel = nullptr;
+
+ if (Asm.getDwarfVersion() >= 5)
+ EndLabel = emitHeader(Asm, AddrSection);
+
+ // Define the symbol that marks the start of the contribution.
+ // It is referenced via DW_AT_addr_base.
+ Asm.OutStreamer->emitLabel(AddressTableBaseSym);
+
+ // Order the address pool entries by ID
+ SmallVector<const MCExpr *, 64> Entries(Pool.size());
+
+ for (const auto &I : Pool)
+ Entries[I.second.Number] =
+ I.second.TLS
+ ? Asm.getObjFileLowering().getDebugThreadLocalSymbol(I.first)
+ : MCSymbolRefExpr::create(I.first, Asm.OutContext);
+
+ for (const MCExpr *Entry : Entries)
+ Asm.OutStreamer->emitValue(Entry, Asm.MAI->getCodePointerSize());
+
+ if (EndLabel)
+ Asm.OutStreamer->emitLabel(EndLabel);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AddressPool.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AddressPool.h
new file mode 100644
index 000000000000..f1edc6c330d5
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AddressPool.h
@@ -0,0 +1,65 @@
+//===- llvm/CodeGen/AddressPool.h - Dwarf Debug Framework -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class MCSection;
+class MCSymbol;
+
+// Collection of addresses for this unit and assorted labels.
+// A Symbol->unsigned mapping of addresses used by indirect
+// references.
+class AddressPool {
+ struct AddressPoolEntry {
+ unsigned Number;
+ bool TLS;
+
+ AddressPoolEntry(unsigned Number, bool TLS) : Number(Number), TLS(TLS) {}
+ };
+ DenseMap<const MCSymbol *, AddressPoolEntry> Pool;
+
+ /// Record whether the AddressPool has been queried for an address index since
+ /// the last "resetUsedFlag" call. Used to implement type unit fallback - a
+ /// type that references addresses cannot be placed in a type unit when using
+ /// fission.
+ bool HasBeenUsed = false;
+
+public:
+ AddressPool() = default;
+
+ /// Returns the index into the address pool with the given
+ /// label/symbol.
+ unsigned getIndex(const MCSymbol *Sym, bool TLS = false);
+
+ void emit(AsmPrinter &Asm, MCSection *AddrSection);
+
+ bool isEmpty() { return Pool.empty(); }
+
+ bool hasBeenUsed() const { return HasBeenUsed; }
+
+ void resetUsedFlag(bool HasBeenUsed = false) { this->HasBeenUsed = HasBeenUsed; }
+
+ MCSymbol *getLabel() { return AddressTableBaseSym; }
+ void setLabel(MCSymbol *Sym) { AddressTableBaseSym = Sym; }
+
+private:
+ MCSymbol *emitHeader(AsmPrinter &Asm, MCSection *Section);
+
+ /// Symbol designates the start of the contribution to the address table.
+ MCSymbol *AddressTableBaseSym = nullptr;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
new file mode 100644
index 000000000000..5381dfdd184c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -0,0 +1,4166 @@
+//===- AsmPrinter.cpp - Common AsmPrinter code ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AsmPrinter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "CodeViewDebug.h"
+#include "DwarfDebug.h"
+#include "DwarfException.h"
+#include "PseudoProbePrinter.h"
+#include "WasmException.h"
+#include "WinCFGuard.h"
+#include "WinException.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/config.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GCStrategy.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PseudoProbe.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Pass.h"
+#include "llvm/Remarks/RemarkStreamer.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+static cl::opt<std::string> BasicBlockProfileDump(
+ "mbb-profile-dump", cl::Hidden,
+ cl::desc("Basic block profile dump for external cost modelling. If "
+ "matching up BBs with afterwards, the compilation must be "
+ "performed with -basic-block-sections=labels. Enabling this "
+ "flag during in-process ThinLTO is not supported."));
+
+const char DWARFGroupName[] = "dwarf";
+const char DWARFGroupDescription[] = "DWARF Emission";
+const char DbgTimerName[] = "emit";
+const char DbgTimerDescription[] = "Debug Info Emission";
+const char EHTimerName[] = "write_exception";
+const char EHTimerDescription[] = "DWARF Exception Writer";
+const char CFGuardName[] = "Control Flow Guard";
+const char CFGuardDescription[] = "Control Flow Guard";
+const char CodeViewLineTablesGroupName[] = "linetables";
+const char CodeViewLineTablesGroupDescription[] = "CodeView Line Tables";
+const char PPTimerName[] = "emit";
+const char PPTimerDescription[] = "Pseudo Probe Emission";
+const char PPGroupName[] = "pseudo probe";
+const char PPGroupDescription[] = "Pseudo Probe Emission";
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+char AsmPrinter::ID = 0;
+
+namespace {
+class AddrLabelMapCallbackPtr final : CallbackVH {
+ AddrLabelMap *Map = nullptr;
+
+public:
+ AddrLabelMapCallbackPtr() = default;
+ AddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {}
+
+ void setPtr(BasicBlock *BB) {
+ ValueHandleBase::operator=(BB);
+ }
+
+ void setMap(AddrLabelMap *map) { Map = map; }
+
+ void deleted() override;
+ void allUsesReplacedWith(Value *V2) override;
+};
+} // namespace
+
+class llvm::AddrLabelMap {
+ MCContext &Context;
+ struct AddrLabelSymEntry {
+ /// The symbols for the label.
+ TinyPtrVector<MCSymbol *> Symbols;
+
+ Function *Fn; // The containing function of the BasicBlock.
+ unsigned Index; // The index in BBCallbacks for the BasicBlock.
+ };
+
+ DenseMap<AssertingVH<BasicBlock>, AddrLabelSymEntry> AddrLabelSymbols;
+
+  /// Callbacks for the BasicBlocks that we have entries for. We use this so
+ /// we get notified if a block is deleted or RAUWd.
+ std::vector<AddrLabelMapCallbackPtr> BBCallbacks;
+
+ /// This is a per-function list of symbols whose corresponding BasicBlock got
+ /// deleted. These symbols need to be emitted at some point in the file, so
+ /// AsmPrinter emits them after the function body.
+ DenseMap<AssertingVH<Function>, std::vector<MCSymbol *>>
+ DeletedAddrLabelsNeedingEmission;
+
+public:
+ AddrLabelMap(MCContext &context) : Context(context) {}
+
+ ~AddrLabelMap() {
+ assert(DeletedAddrLabelsNeedingEmission.empty() &&
+ "Some labels for deleted blocks never got emitted");
+ }
+
+ ArrayRef<MCSymbol *> getAddrLabelSymbolToEmit(BasicBlock *BB);
+
+ void takeDeletedSymbolsForFunction(Function *F,
+ std::vector<MCSymbol *> &Result);
+
+ void UpdateForDeletedBlock(BasicBlock *BB);
+ void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New);
+};
+
+ArrayRef<MCSymbol *> AddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
+ assert(BB->hasAddressTaken() &&
+ "Shouldn't get label for block without address taken");
+ AddrLabelSymEntry &Entry = AddrLabelSymbols[BB];
+
+ // If we already had an entry for this block, just return it.
+ if (!Entry.Symbols.empty()) {
+ assert(BB->getParent() == Entry.Fn && "Parent changed");
+ return Entry.Symbols;
+ }
+
+ // Otherwise, this is a new entry, create a new symbol for it and add an
+ // entry to BBCallbacks so we can be notified if the BB is deleted or RAUWd.
+ BBCallbacks.emplace_back(BB);
+ BBCallbacks.back().setMap(this);
+ Entry.Index = BBCallbacks.size() - 1;
+ Entry.Fn = BB->getParent();
+ MCSymbol *Sym = BB->hasAddressTaken() ? Context.createNamedTempSymbol()
+ : Context.createTempSymbol();
+ Entry.Symbols.push_back(Sym);
+ return Entry.Symbols;
+}
+
+/// If we have any deleted symbols for F, return them.
+void AddrLabelMap::takeDeletedSymbolsForFunction(
+ Function *F, std::vector<MCSymbol *> &Result) {
+ DenseMap<AssertingVH<Function>, std::vector<MCSymbol *>>::iterator I =
+ DeletedAddrLabelsNeedingEmission.find(F);
+
+ // If there are no entries for the function, just return.
+ if (I == DeletedAddrLabelsNeedingEmission.end())
+ return;
+
+ // Otherwise, take the list.
+ std::swap(Result, I->second);
+ DeletedAddrLabelsNeedingEmission.erase(I);
+}
+
+//===- Address of Block Management ----------------------------------------===//
+
+ArrayRef<MCSymbol *>
+AsmPrinter::getAddrLabelSymbolToEmit(const BasicBlock *BB) {
+ // Lazily create AddrLabelSymbols.
+ if (!AddrLabelSymbols)
+ AddrLabelSymbols = std::make_unique<AddrLabelMap>(OutContext);
+ return AddrLabelSymbols->getAddrLabelSymbolToEmit(
+ const_cast<BasicBlock *>(BB));
+}
+
+void AsmPrinter::takeDeletedSymbolsForFunction(
+ const Function *F, std::vector<MCSymbol *> &Result) {
+ // If no blocks have had their addresses taken, we're done.
+ if (!AddrLabelSymbols)
+ return;
+ return AddrLabelSymbols->takeDeletedSymbolsForFunction(
+ const_cast<Function *>(F), Result);
+}
+
+void AddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
+ // If the block got deleted, there is no need for the symbol. If the symbol
+ // was already emitted, we can just forget about it, otherwise we need to
+ // queue it up for later emission when the function is output.
+ AddrLabelSymEntry Entry = std::move(AddrLabelSymbols[BB]);
+ AddrLabelSymbols.erase(BB);
+ assert(!Entry.Symbols.empty() && "Didn't have a symbol, why a callback?");
+ BBCallbacks[Entry.Index] = nullptr; // Clear the callback.
+
+#if !LLVM_MEMORY_SANITIZER_BUILD
+ // BasicBlock is destroyed already, so this access is UB detectable by msan.
+ assert((BB->getParent() == nullptr || BB->getParent() == Entry.Fn) &&
+ "Block/parent mismatch");
+#endif
+
+ for (MCSymbol *Sym : Entry.Symbols) {
+ if (Sym->isDefined())
+ return;
+
+ // If the block is not yet defined, we need to emit it at the end of the
+ // function. Add the symbol to the DeletedAddrLabelsNeedingEmission list
+ // for the containing Function. Since the block is being deleted, its
+    // parent may already be removed, so we have to get the function from 'Entry'.
+ DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
+ }
+}
+
+void AddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) {
+ // Get the entry for the RAUW'd block and remove it from our map.
+ AddrLabelSymEntry OldEntry = std::move(AddrLabelSymbols[Old]);
+ AddrLabelSymbols.erase(Old);
+ assert(!OldEntry.Symbols.empty() && "Didn't have a symbol, why a callback?");
+
+ AddrLabelSymEntry &NewEntry = AddrLabelSymbols[New];
+
+ // If New is not address taken, just move our symbol over to it.
+ if (NewEntry.Symbols.empty()) {
+ BBCallbacks[OldEntry.Index].setPtr(New); // Update the callback.
+ NewEntry = std::move(OldEntry); // Set New's entry.
+ return;
+ }
+
+ BBCallbacks[OldEntry.Index] = nullptr; // Update the callback.
+
+ // Otherwise, we need to add the old symbols to the new block's set.
+ llvm::append_range(NewEntry.Symbols, OldEntry.Symbols);
+}
+
+void AddrLabelMapCallbackPtr::deleted() {
+ Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr()));
+}
+
+void AddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
+ Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2));
+}
+
+/// getGVAlignment - Return the alignment to use for the specified global
+/// value. This rounds up to the preferred alignment if possible and legal.
+Align AsmPrinter::getGVAlignment(const GlobalObject *GV, const DataLayout &DL,
+ Align InAlign) {
+ Align Alignment;
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ Alignment = DL.getPreferredAlign(GVar);
+
+  // If InAlign is specified, round up to it.
+ if (InAlign > Alignment)
+ Alignment = InAlign;
+
+ // If the GV has a specified alignment, take it into account.
+ const MaybeAlign GVAlign(GV->getAlign());
+ if (!GVAlign)
+ return Alignment;
+
+ assert(GVAlign && "GVAlign must be set");
+
+  // If GVAlign is larger than the alignment computed so far, or if we must
+  // obey GVAlign because the GV has an assigned section, obey it.
+ if (*GVAlign > Alignment || GV->hasSection())
+ Alignment = *GVAlign;
+ return Alignment;
+}
+
+AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
+ : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
+ OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)),
+ SM(*this) {
+ VerboseAsm = OutStreamer->isVerboseAsm();
+ DwarfUsesRelocationsAcrossSections =
+ MAI->doesDwarfUseRelocationsAcrossSections();
+}
+
+AsmPrinter::~AsmPrinter() {
+ assert(!DD && Handlers.size() == NumUserHandlers &&
+ "Debug/EH info didn't get finalized");
+}
+
+bool AsmPrinter::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
+
+/// getFunctionNumber - Return a unique ID for the current function.
+unsigned AsmPrinter::getFunctionNumber() const {
+ return MF->getFunctionNumber();
+}
+
+const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const {
+ return *TM.getObjFileLowering();
+}
+
+const DataLayout &AsmPrinter::getDataLayout() const {
+ return MMI->getModule()->getDataLayout();
+}
+
+// Do not use the cached DataLayout because some clients use it without a Module
+// (dsymutil, llvm-dwarfdump).
+unsigned AsmPrinter::getPointerSize() const {
+ return TM.getPointerSize(0); // FIXME: Default address space
+}
+
+const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const {
+ assert(MF && "getSubtargetInfo requires a valid MachineFunction!");
+ return MF->getSubtarget<MCSubtargetInfo>();
+}
+
+void AsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
+ S.emitInstruction(Inst, getSubtargetInfo());
+}
+
+void AsmPrinter::emitInitialRawDwarfLocDirective(const MachineFunction &MF) {
+ if (DD) {
+ assert(OutStreamer->hasRawTextSupport() &&
+ "Expected assembly output mode.");
+ // This is NVPTX specific and it's unclear why.
+ // PR51079: If we have code without debug information we need to give up.
+ DISubprogram *MFSP = MF.getFunction().getSubprogram();
+ if (!MFSP)
+ return;
+ (void)DD->emitInitialLocDirective(MF, /*CUID=*/0);
+ }
+}
+
+/// getCurrentSection() - Return the current section we are emitting to.
+const MCSection *AsmPrinter::getCurrentSection() const {
+ return OutStreamer->getCurrentSectionOnly();
+}
+
+void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ AU.addRequired<GCModuleInfo>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+}
+
+bool AsmPrinter::doInitialization(Module &M) {
+ auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
+ MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
+ HasSplitStack = false;
+ HasNoSplitStack = false;
+
+ AddrLabelSymbols = nullptr;
+
+ // Initialize TargetLoweringObjectFile.
+ const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
+ .Initialize(OutContext, TM);
+
+ const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
+ .getModuleMetadata(M);
+
+ OutStreamer->initSections(false, *TM.getMCSubtargetInfo());
+
+ // Emit the version-min deployment target directive if needed.
+ //
+ // FIXME: If we end up with a collection of these sorts of Darwin-specific
+ // or ELF-specific things, it may make sense to have a platform helper class
+ // that will work with the target helper class. For now keep it here, as the
+ // alternative is duplicated code in each of the target asm printers that
+ // use the directive, where it would need the same conditionalization
+ // anyway.
+ const Triple &Target = TM.getTargetTriple();
+ Triple TVT(M.getDarwinTargetVariantTriple());
+ OutStreamer->emitVersionForTarget(
+ Target, M.getSDKVersion(),
+ M.getDarwinTargetVariantTriple().empty() ? nullptr : &TVT,
+ M.getDarwinTargetVariantSDKVersion());
+
+ // Allow the target to emit any magic that it wants at the start of the file.
+ emitStartOfAsmFile(M);
+
+ // Very minimal debug info. It is ignored if we emit actual debug info. If we
+ // don't, this at least helps the user find where a global came from.
+ if (MAI->hasSingleParameterDotFile()) {
+ // .file "foo.c"
+
+ SmallString<128> FileName;
+ if (MAI->hasBasenameOnlyForFileDirective())
+ FileName = llvm::sys::path::filename(M.getSourceFileName());
+ else
+ FileName = M.getSourceFileName();
+ if (MAI->hasFourStringsDotFile()) {
+#ifdef PACKAGE_VENDOR
+ const char VerStr[] =
+ PACKAGE_VENDOR " " PACKAGE_NAME " version " PACKAGE_VERSION;
+#else
+ const char VerStr[] = PACKAGE_NAME " version " PACKAGE_VERSION;
+#endif
+ // TODO: Add timestamp and description.
+ OutStreamer->emitFileDirective(FileName, VerStr, "", "");
+ } else {
+ OutStreamer->emitFileDirective(FileName);
+ }
+ }
+
+ // On AIX, emit bytes for llvm.commandline metadata after .file so that the
+ // C_INFO symbol is preserved if any csect is kept by the linker.
+ if (TM.getTargetTriple().isOSBinFormatXCOFF())
+ emitModuleCommandLines(M);
+
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+ for (const auto &I : *MI)
+ if (GCMetadataPrinter *MP = getOrCreateGCPrinter(*I))
+ MP->beginAssembly(M, *MI, *this);
+
+ // Emit module-level inline asm if it exists.
+ if (!M.getModuleInlineAsm().empty()) {
+ OutStreamer->AddComment("Start of file scope inline assembly");
+ OutStreamer->addBlankLine();
+ emitInlineAsm(M.getModuleInlineAsm() + "\n", *TM.getMCSubtargetInfo(),
+ TM.Options.MCOptions);
+ OutStreamer->AddComment("End of file scope inline assembly");
+ OutStreamer->addBlankLine();
+ }
+
+ if (MAI->doesSupportDebugInformation()) {
+ bool EmitCodeView = M.getCodeViewFlag();
+ if (EmitCodeView && TM.getTargetTriple().isOSWindows()) {
+ Handlers.emplace_back(std::make_unique<CodeViewDebug>(this),
+ DbgTimerName, DbgTimerDescription,
+ CodeViewLineTablesGroupName,
+ CodeViewLineTablesGroupDescription);
+ }
+ if (!EmitCodeView || M.getDwarfVersion()) {
+ if (MMI->hasDebugInfo()) {
+ DD = new DwarfDebug(this);
+ Handlers.emplace_back(std::unique_ptr<DwarfDebug>(DD), DbgTimerName,
+ DbgTimerDescription, DWARFGroupName,
+ DWARFGroupDescription);
+ }
+ }
+ }
+
+ if (M.getNamedMetadata(PseudoProbeDescMetadataName)) {
+ PP = new PseudoProbeHandler(this);
+ Handlers.emplace_back(std::unique_ptr<PseudoProbeHandler>(PP), PPTimerName,
+ PPTimerDescription, PPGroupName, PPGroupDescription);
+ }
+
+ switch (MAI->getExceptionHandlingType()) {
+ case ExceptionHandling::None:
+ // We may want to emit CFI for debug.
+ [[fallthrough]];
+ case ExceptionHandling::SjLj:
+ case ExceptionHandling::DwarfCFI:
+ case ExceptionHandling::ARM:
+ for (auto &F : M.getFunctionList()) {
+ if (getFunctionCFISectionType(F) != CFISection::None)
+ ModuleCFISection = getFunctionCFISectionType(F);
+ // If any function needsUnwindTableEntry(), it needs .eh_frame and hence
+ // the module needs .eh_frame. If we have found that case, we are done.
+ if (ModuleCFISection == CFISection::EH)
+ break;
+ }
+ assert(MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI ||
+ usesCFIWithoutEH() || ModuleCFISection != CFISection::EH);
+ break;
+ default:
+ break;
+ }
+
+ EHStreamer *ES = nullptr;
+ switch (MAI->getExceptionHandlingType()) {
+ case ExceptionHandling::None:
+ if (!usesCFIWithoutEH())
+ break;
+ [[fallthrough]];
+ case ExceptionHandling::SjLj:
+ case ExceptionHandling::DwarfCFI:
+ ES = new DwarfCFIException(this);
+ break;
+ case ExceptionHandling::ARM:
+ ES = new ARMException(this);
+ break;
+ case ExceptionHandling::WinEH:
+ switch (MAI->getWinEHEncodingType()) {
+ default: llvm_unreachable("unsupported unwinding information encoding");
+ case WinEH::EncodingType::Invalid:
+ break;
+ case WinEH::EncodingType::X86:
+ case WinEH::EncodingType::Itanium:
+ ES = new WinException(this);
+ break;
+ }
+ break;
+ case ExceptionHandling::Wasm:
+ ES = new WasmException(this);
+ break;
+ case ExceptionHandling::AIX:
+ ES = new AIXException(this);
+ break;
+ }
+ if (ES)
+ Handlers.emplace_back(std::unique_ptr<EHStreamer>(ES), EHTimerName,
+ EHTimerDescription, DWARFGroupName,
+ DWARFGroupDescription);
+
+ // Emit tables for any value of cfguard flag (i.e. cfguard=1 or cfguard=2).
+ if (mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard")))
+ Handlers.emplace_back(std::make_unique<WinCFGuard>(this), CFGuardName,
+ CFGuardDescription, DWARFGroupName,
+ DWARFGroupDescription);
+
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->beginModule(&M);
+ }
+
+ if (!BasicBlockProfileDump.empty()) {
+ std::error_code PossibleFileError;
+ MBBProfileDumpFileOutput = std::make_unique<raw_fd_ostream>(
+ BasicBlockProfileDump, PossibleFileError);
+ if (PossibleFileError) {
+ M.getContext().emitError("Failed to open file for MBB Profile Dump: " +
+ PossibleFileError.message() + "\n");
+ }
+ }
+
+ return false;
+}
+
+static bool canBeHidden(const GlobalValue *GV, const MCAsmInfo &MAI) {
+ if (!MAI.hasWeakDefCanBeHiddenDirective())
+ return false;
+
+ return GV->canBeOmittedFromSymbolTable();
+}
+
+void AsmPrinter::emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
+ GlobalValue::LinkageTypes Linkage = GV->getLinkage();
+ switch (Linkage) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ if (MAI->hasWeakDefDirective()) {
+ // .globl _foo
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Global);
+
+ if (!canBeHidden(GV, *MAI))
+ // .weak_definition _foo
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_WeakDefinition);
+ else
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_WeakDefAutoPrivate);
+ } else if (MAI->avoidWeakIfComdat() && GV->hasComdat()) {
+ // .globl _foo
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Global);
+      // NOTE: linkonce is handled by the section the symbol was assigned to.
+ } else {
+ // .weak _foo
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Weak);
+ }
+ return;
+ case GlobalValue::ExternalLinkage:
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Global);
+ return;
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ return;
+ case GlobalValue::ExternalWeakLinkage:
+ case GlobalValue::AvailableExternallyLinkage:
+ case GlobalValue::AppendingLinkage:
+ llvm_unreachable("Should never emit this");
+ }
+ llvm_unreachable("Unknown linkage type!");
+}
+
+void AsmPrinter::getNameWithPrefix(SmallVectorImpl<char> &Name,
+ const GlobalValue *GV) const {
+ TM.getNameWithPrefix(Name, GV, getObjFileLowering().getMangler());
+}
+
+MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const {
+ return TM.getSymbol(GV);
+}
+
+MCSymbol *AsmPrinter::getSymbolPreferLocal(const GlobalValue &GV) const {
+ // On ELF, use .Lfoo$local if GV is a non-interposable GlobalObject with an
+  // exact definition (intersection of GlobalValue::hasExactDefinition() and
+ // !isInterposable()). These linkages include: external, appending, internal,
+ // private. It may be profitable to use a local alias for external. The
+ // assembler would otherwise be conservative and assume a global default
+ // visibility symbol can be interposable, even if the code generator already
+ // assumed it.
+ if (TM.getTargetTriple().isOSBinFormatELF() && GV.canBenefitFromLocalAlias()) {
+ const Module &M = *GV.getParent();
+ if (TM.getRelocationModel() != Reloc::Static &&
+ M.getPIELevel() == PIELevel::Default && GV.isDSOLocal())
+ return getSymbolWithGlobalValueBase(&GV, "$local");
+ }
+ return TM.getSymbol(&GV);
+}
+
+/// EmitGlobalVariable - Emit the specified global variable to the .s file.
+void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
+ bool IsEmuTLSVar = TM.useEmulatedTLS() && GV->isThreadLocal();
+ assert(!(IsEmuTLSVar && GV->hasCommonLinkage()) &&
+ "No emulated TLS variables in the common section");
+
+ // Never emit TLS variable xyz in emulated TLS model.
+ // The initialization value is in __emutls_t.xyz instead of xyz.
+ if (IsEmuTLSVar)
+ return;
+
+ if (GV->hasInitializer()) {
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (emitSpecialLLVMGlobal(GV))
+ return;
+
+ // Skip the emission of global equivalents. The symbol can be emitted later
+ // on by emitGlobalGOTEquivs in case it turns out to be needed.
+ if (GlobalGOTEquivs.count(getSymbol(GV)))
+ return;
+
+ if (isVerbose()) {
+ // When printing the control variable __emutls_v.*,
+ // we don't need to print the original TLS variable name.
+ GV->printAsOperand(OutStreamer->getCommentOS(),
+ /*PrintType=*/false, GV->getParent());
+ OutStreamer->getCommentOS() << '\n';
+ }
+ }
+
+ MCSymbol *GVSym = getSymbol(GV);
+ MCSymbol *EmittedSym = GVSym;
+
+ // getOrCreateEmuTLSControlSym only creates the symbol with name and default
+ // attributes.
+ // GV's or GVSym's attributes will be used for the EmittedSym.
+ emitVisibility(EmittedSym, GV->getVisibility(), !GV->isDeclaration());
+
+ if (GV->isTagged()) {
+ Triple T = TM.getTargetTriple();
+
+ if (T.getArch() != Triple::aarch64 || !T.isAndroid())
+ OutContext.reportError(SMLoc(),
+ "tagged symbols (-fsanitize=memtag-globals) are "
+ "only supported on AArch64 Android");
+ OutStreamer->emitSymbolAttribute(EmittedSym, MAI->getMemtagAttr());
+ }
+
+ if (!GV->hasInitializer()) // External globals require no extra code.
+ return;
+
+ GVSym->redefineIfPossible();
+ if (GVSym->isDefined() || GVSym->isVariable())
+ OutContext.reportError(SMLoc(), "symbol '" + Twine(GVSym->getName()) +
+ "' is already defined");
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ OutStreamer->emitSymbolAttribute(EmittedSym, MCSA_ELF_TypeObject);
+
+ SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
+
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
+
+ // If the alignment is specified, we *must* obey it. Overaligning a global
+ // with a specified alignment is a prompt way to break globals emitted to
+ // sections and expected to be contiguous (e.g. ObjC metadata).
+ const Align Alignment = getGVAlignment(GV, DL);
+
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription,
+ HI.TimerGroupName, HI.TimerGroupDescription,
+ TimePassesIsEnabled);
+ HI.Handler->setSymbolSize(GVSym, Size);
+ }
+
+ // Handle common symbols
+ if (GVKind.isCommon()) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+ // .comm _foo, 42, 4
+ OutStreamer->emitCommonSymbol(GVSym, Size, Alignment);
+ return;
+ }
+
+ // Determine to which section this global should be emitted.
+ MCSection *TheSection = getObjFileLowering().SectionForGlobal(GV, GVKind, TM);
+
+ // If we have a bss global going to a section that supports the
+ // zerofill directive, do so here.
+ if (GVKind.isBSS() && MAI->hasMachoZeroFillDirective() &&
+ TheSection->isVirtualSection()) {
+ if (Size == 0)
+ Size = 1; // zerofill of 0 bytes is undefined.
+ emitLinkage(GV, GVSym);
+ // .zerofill __DATA, __bss, _foo, 400, 5
+ OutStreamer->emitZerofill(TheSection, GVSym, Size, Alignment);
+ return;
+ }
+
+ // If this is a BSS local symbol and we are emitting in the BSS
+ // section use .lcomm/.comm directive.
+ if (GVKind.isBSSLocal() &&
+ getObjFileLowering().getBSSSection() == TheSection) {
+ if (Size == 0)
+ Size = 1; // .comm Foo, 0 is undefined, avoid it.
+
+ // Use .lcomm only if it supports user-specified alignment.
+ // Otherwise, while it would still be correct to use .lcomm in some
+ // cases (e.g. when Align == 1), the external assembler might enforce
+ // some -unknown- default alignment behavior, which could cause
+ // spurious differences between external and integrated assembler.
+ // Prefer to simply fall back to .local / .comm in this case.
+ if (MAI->getLCOMMDirectiveAlignmentType() != LCOMM::NoAlignment) {
+ // .lcomm _foo, 42
+ OutStreamer->emitLocalCommonSymbol(GVSym, Size, Alignment);
+ return;
+ }
+
+ // .local _foo
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Local);
+ // .comm _foo, 42, 4
+ OutStreamer->emitCommonSymbol(GVSym, Size, Alignment);
+ return;
+ }
+
+ // Handle thread local data for mach-o which requires us to output an
+ // additional structure of data and mangle the original symbol so that we
+ // can reference it later.
+ //
+ // TODO: This should become an "emit thread local global" method on TLOF.
+ // All of this macho specific stuff should be sunk down into TLOFMachO and
+ // stuff like "TLSExtraDataSection" should no longer be part of the parent
+ // TLOF class. This will also make it more obvious that stuff like
+ // MCStreamer::EmitTBSSSymbol is macho specific and only called from macho
+ // specific code.
+ if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective()) {
+ // Emit the .tbss symbol
+ MCSymbol *MangSym =
+ OutContext.getOrCreateSymbol(GVSym->getName() + Twine("$tlv$init"));
+
+ if (GVKind.isThreadBSS()) {
+ TheSection = getObjFileLowering().getTLSBSSSection();
+ OutStreamer->emitTBSSSymbol(TheSection, MangSym, Size, Alignment);
+ } else if (GVKind.isThreadData()) {
+ OutStreamer->switchSection(TheSection);
+
+ emitAlignment(Alignment, GV);
+ OutStreamer->emitLabel(MangSym);
+
+ emitGlobalConstant(GV->getParent()->getDataLayout(),
+ GV->getInitializer());
+ }
+
+ OutStreamer->addBlankLine();
+
+ // Emit the variable struct for the runtime.
+ MCSection *TLVSect = getObjFileLowering().getTLSExtraDataSection();
+
+ OutStreamer->switchSection(TLVSect);
+ // Emit the linkage here.
+ emitLinkage(GV, GVSym);
+ OutStreamer->emitLabel(GVSym);
+
+ // Three pointers in size:
+ // - __tlv_bootstrap - used to make sure support exists
+ // - spare pointer, used when mapped by the runtime
+ // - pointer to mangled symbol above with initializer
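+ // For a thread-local global "x" on Mach-O this emits, roughly:
+ //   _x: .quad __tlv_bootstrap
+ //       .quad 0
+ //       .quad _x$tlv$init
+ // (shown with .quad for a 64-bit target; "x" is illustrative).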
+ unsigned PtrSize = DL.getPointerTypeSize(GV->getType());
+ OutStreamer->emitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
+ PtrSize);
+ OutStreamer->emitIntValue(0, PtrSize);
+ OutStreamer->emitSymbolValue(MangSym, PtrSize);
+
+ OutStreamer->addBlankLine();
+ return;
+ }
+
+ MCSymbol *EmittedInitSym = GVSym;
+
+ OutStreamer->switchSection(TheSection);
+
+ emitLinkage(GV, EmittedInitSym);
+ emitAlignment(Alignment, GV);
+
+ OutStreamer->emitLabel(EmittedInitSym);
+ MCSymbol *LocalAlias = getSymbolPreferLocal(*GV);
+ if (LocalAlias != EmittedInitSym)
+ OutStreamer->emitLabel(LocalAlias);
+
+ emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ // .size foo, 42
+ OutStreamer->emitELFSize(EmittedInitSym,
+ MCConstantExpr::create(Size, OutContext));
+
+ OutStreamer->addBlankLine();
+}
+
+/// Emit the directive and value for debug thread local expression
+///
+/// \p Value - The value to emit.
+/// \p Size - The size of the integer (in bytes) to emit.
+void AsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const {
+ OutStreamer->emitValue(Value, Size);
+}
+
+void AsmPrinter::emitFunctionHeaderComment() {}
+
+/// EmitFunctionHeader - This method emits the header for the current
+/// function.
+void AsmPrinter::emitFunctionHeader() {
+ const Function &F = MF->getFunction();
+
+ if (isVerbose())
+ OutStreamer->getCommentOS()
+ << "-- Begin function "
+ << GlobalValue::dropLLVMManglingEscape(F.getName()) << '\n';
+
+ // Print out constants referenced by the function
+ emitConstantPool();
+
+ // Print the 'header' of the function.
+ // If basic block sections are desired, explicitly request a unique section
+ // for this function's entry block.
+ if (MF->front().isBeginSection())
+ MF->setSection(getObjFileLowering().getUniqueSectionForFunction(F, TM));
+ else
+ MF->setSection(getObjFileLowering().SectionForGlobal(&F, TM));
+ OutStreamer->switchSection(MF->getSection());
+
+ if (!MAI->hasVisibilityOnlyWithLinkage())
+ emitVisibility(CurrentFnSym, F.getVisibility());
+
+ if (MAI->needsFunctionDescriptors())
+ emitLinkage(&F, CurrentFnDescSym);
+
+ emitLinkage(&F, CurrentFnSym);
+ if (MAI->hasFunctionAlignment())
+ emitAlignment(MF->getAlignment(), &F);
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_ELF_TypeFunction);
+
+ if (F.hasFnAttribute(Attribute::Cold))
+ OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_Cold);
+
+ // Emit the prefix data.
+ if (F.hasPrefixData()) {
+ if (MAI->hasSubsectionsViaSymbols()) {
+ // Preserving prefix data on platforms which use subsections-via-symbols
+ // is a bit tricky. Here we introduce a symbol for the prefix data
+ // and use the .alt_entry attribute to mark the function's real entry point
+ // as an alternative entry point to the prefix-data symbol.
+ MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol();
+ OutStreamer->emitLabel(PrefixSym);
+
+ emitGlobalConstant(F.getParent()->getDataLayout(), F.getPrefixData());
+
+ // Emit an .alt_entry directive for the actual function symbol.
+ OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_AltEntry);
+ } else {
+ emitGlobalConstant(F.getParent()->getDataLayout(), F.getPrefixData());
+ }
+ }
+
+ // Emit KCFI type information before patchable-function-prefix nops.
+ emitKCFITypeId(*MF);
+
+ // Emit M NOPs for -fpatchable-function-entry=N,M where M>0. We arbitrarily
+ // place prefix data before NOPs.
+ unsigned PatchableFunctionPrefix = 0;
+ unsigned PatchableFunctionEntry = 0;
+ (void)F.getFnAttribute("patchable-function-prefix")
+ .getValueAsString()
+ .getAsInteger(10, PatchableFunctionPrefix);
+ (void)F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, PatchableFunctionEntry);
+ if (PatchableFunctionPrefix) {
+ CurrentPatchableFunctionEntrySym =
+ OutContext.createLinkerPrivateTempSymbol();
+ OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
+ emitNops(PatchableFunctionPrefix);
+ } else if (PatchableFunctionEntry) {
+ // May be reassigned when emitting the body, to reference the label after
+ // the initial BTI (AArch64) or endbr32/endbr64 (x86).
+ CurrentPatchableFunctionEntrySym = CurrentFnBegin;
+ }
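+ // For example, "patchable-function-prefix"="2" emits two NOPs here, ahead
+ // of the function's entry label, and CurrentPatchableFunctionEntrySym marks
+ // where they start; emitPatchableFunctionEntries() later references that
+ // symbol when emitting the __patchable_function_entries section (on ELF).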
+
+ // Emit the function prologue data for the indirect call sanitizer.
+ if (const MDNode *MD = F.getMetadata(LLVMContext::MD_func_sanitize)) {
+ assert(MD->getNumOperands() == 2);
+
+ auto *PrologueSig = mdconst::extract<Constant>(MD->getOperand(0));
+ auto *TypeHash = mdconst::extract<Constant>(MD->getOperand(1));
+ emitGlobalConstant(F.getParent()->getDataLayout(), PrologueSig);
+ emitGlobalConstant(F.getParent()->getDataLayout(), TypeHash);
+ }
+
+ if (isVerbose()) {
+ F.printAsOperand(OutStreamer->getCommentOS(),
+ /*PrintType=*/false, F.getParent());
+ emitFunctionHeaderComment();
+ OutStreamer->getCommentOS() << '\n';
+ }
+
+ // Emit the function descriptor. This is a virtual function to allow targets
+ // to emit their specific function descriptor. Right now it is only used by
+ // the AIX target. The PowerPC 64-bit V1 ELF target also uses function
+ // descriptors and should be converted to use this hook as well.
+ if (MAI->needsFunctionDescriptors())
+ emitFunctionDescriptor();
+
+ // Emit the CurrentFnSym. This is a virtual function to allow targets to do
+ // their wild and crazy things as required.
+ emitFunctionEntryLabel();
+
+ // If the function had address-taken blocks that got deleted, then we have
+ // references to the dangling symbols. Emit them at the start of the function
+ // so that we don't get references to undefined symbols.
+ std::vector<MCSymbol*> DeadBlockSyms;
+ takeDeletedSymbolsForFunction(&F, DeadBlockSyms);
+ for (MCSymbol *DeadBlockSym : DeadBlockSyms) {
+ OutStreamer->AddComment("Address taken block that was later removed");
+ OutStreamer->emitLabel(DeadBlockSym);
+ }
+
+ if (CurrentFnBegin) {
+ if (MAI->useAssignmentForEHBegin()) {
+ MCSymbol *CurPos = OutContext.createTempSymbol();
+ OutStreamer->emitLabel(CurPos);
+ OutStreamer->emitAssignment(CurrentFnBegin,
+ MCSymbolRefExpr::create(CurPos, OutContext));
+ } else {
+ OutStreamer->emitLabel(CurrentFnBegin);
+ }
+ }
+
+ // Emit pre-function debug and/or EH information.
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->beginFunction(MF);
+ }
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->beginBasicBlockSection(MF->front());
+ }
+
+ // Emit the prologue data.
+ if (F.hasPrologueData())
+ emitGlobalConstant(F.getParent()->getDataLayout(), F.getPrologueData());
+}
+
+/// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
+/// function. This can be overridden by targets as required to do custom stuff.
+void AsmPrinter::emitFunctionEntryLabel() {
+ CurrentFnSym->redefineIfPossible();
+
+ // The function label could have already been emitted if two symbols end up
+ // conflicting due to asm renaming. Detect this and emit an error.
+ if (CurrentFnSym->isVariable())
+ report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+ "' is a protected alias");
+
+ OutStreamer->emitLabel(CurrentFnSym);
+
+ if (TM.getTargetTriple().isOSBinFormatELF()) {
+ MCSymbol *Sym = getSymbolPreferLocal(MF->getFunction());
+ if (Sym != CurrentFnSym) {
+ cast<MCSymbolELF>(Sym)->setType(ELF::STT_FUNC);
+ CurrentFnBeginLocal = Sym;
+ OutStreamer->emitLabel(Sym);
+ if (MAI->hasDotTypeDotSizeDirective())
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
+ }
+ }
+}
+
+/// emitComments - Pretty-print comments for instructions.
+static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+ const MachineFunction *MF = MI.getMF();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ // Check for spills and reloads
+
+ // We assume a single instruction only has a spill or reload, not
+ // both.
+ std::optional<unsigned> Size;
+ if ((Size = MI.getRestoreSize(TII))) {
+ CommentOS << *Size << "-byte Reload\n";
+ } else if ((Size = MI.getFoldedRestoreSize(TII))) {
+ if (*Size) {
+ if (*Size == unsigned(MemoryLocation::UnknownSize))
+ CommentOS << "Unknown-size Folded Reload\n";
+ else
+ CommentOS << *Size << "-byte Folded Reload\n";
+ }
+ } else if ((Size = MI.getSpillSize(TII))) {
+ CommentOS << *Size << "-byte Spill\n";
+ } else if ((Size = MI.getFoldedSpillSize(TII))) {
+ if (*Size) {
+ if (*Size == unsigned(MemoryLocation::UnknownSize))
+ CommentOS << "Unknown-size Folded Spill\n";
+ else
+ CommentOS << *Size << "-byte Folded Spill\n";
+ }
+ }
+
+ // Check for spill-induced copies
+ if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
+ CommentOS << " Reload Reuse\n";
+}
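+ // In verbose assembly these show up as trailing comments on the
+ // instruction, e.g. "8-byte Spill" or "16-byte Folded Reload" (the comment
+ // marker itself is target dependent).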
+
+/// emitImplicitDef - This method emits the specified machine instruction
+/// that is an implicit def.
+void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+ Register RegNo = MI->getOperand(0).getReg();
+
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "implicit-def: "
+ << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
+
+ OutStreamer->AddComment(OS.str());
+ OutStreamer->addBlankLine();
+}
+
+static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << "kill:";
+ for (const MachineOperand &Op : MI->operands()) {
+ assert(Op.isReg() && "KILL instruction must have only register operands");
+ OS << ' ' << (Op.isDef() ? "def " : "killed ")
+ << printReg(Op.getReg(), AP.MF->getSubtarget().getRegisterInfo());
+ }
+ AP.OutStreamer->AddComment(OS.str());
+ AP.OutStreamer->addBlankLine();
+}
+
+/// emitDebugValueComment - This method handles the target-independent form
+/// of DBG_VALUE, returning true if it was able to do so. A false return
+/// means the target will need to handle MI in EmitInstruction.
+static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
+ // This code handles only the 4-operand target-independent form.
+ if (MI->isNonListDebugValue() && MI->getNumOperands() != 4)
+ return false;
+
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "DEBUG_VALUE: ";
+
+ const DILocalVariable *V = MI->getDebugVariable();
+ if (auto *SP = dyn_cast<DISubprogram>(V->getScope())) {
+ StringRef Name = SP->getName();
+ if (!Name.empty())
+ OS << Name << ":";
+ }
+ OS << V->getName();
+ OS << " <- ";
+
+ const DIExpression *Expr = MI->getDebugExpression();
+ // First convert this to a non-variadic expression if possible, to simplify
+ // the output.
+ if (auto NonVariadicExpr = DIExpression::convertToNonVariadicExpression(Expr))
+ Expr = *NonVariadicExpr;
+ // Then, output the possibly-simplified expression.
+ if (Expr->getNumElements()) {
+ OS << '[';
+ ListSeparator LS;
+ for (auto &Op : Expr->expr_ops()) {
+ OS << LS << dwarf::OperationEncodingString(Op.getOp());
+ for (unsigned I = 0; I < Op.getNumArgs(); ++I)
+ OS << ' ' << Op.getArg(I);
+ }
+ OS << "] ";
+ }
+
+ // Register or immediate value. Register 0 means undef.
+ for (const MachineOperand &Op : MI->debug_operands()) {
+ if (&Op != MI->debug_operands().begin())
+ OS << ", ";
+ switch (Op.getType()) {
+ case MachineOperand::MO_FPImmediate: {
+ APFloat APF = APFloat(Op.getFPImm()->getValueAPF());
+ Type *ImmTy = Op.getFPImm()->getType();
+ if (ImmTy->isBFloatTy() || ImmTy->isHalfTy() || ImmTy->isFloatTy() ||
+ ImmTy->isDoubleTy()) {
+ OS << APF.convertToDouble();
+ } else {
+ // There is no good way to print long double. Convert a copy to
+ // double. Ah well, it's only a comment.
+ bool ignored;
+ APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+ &ignored);
+ OS << "(long double) " << APF.convertToDouble();
+ }
+ break;
+ }
+ case MachineOperand::MO_Immediate: {
+ OS << Op.getImm();
+ break;
+ }
+ case MachineOperand::MO_CImmediate: {
+ Op.getCImm()->getValue().print(OS, false /*isSigned*/);
+ break;
+ }
+ case MachineOperand::MO_TargetIndex: {
+ OS << "!target-index(" << Op.getIndex() << "," << Op.getOffset() << ")";
+ break;
+ }
+ case MachineOperand::MO_Register:
+ case MachineOperand::MO_FrameIndex: {
+ Register Reg;
+ std::optional<StackOffset> Offset;
+ if (Op.isReg()) {
+ Reg = Op.getReg();
+ } else {
+ const TargetFrameLowering *TFI =
+ AP.MF->getSubtarget().getFrameLowering();
+ Offset = TFI->getFrameIndexReference(*AP.MF, Op.getIndex(), Reg);
+ }
+ if (!Reg) {
+ // Suppress offset, it is not meaningful here.
+ OS << "undef";
+ break;
+ }
+ // The second operand is only an offset if it's an immediate.
+ if (MI->isIndirectDebugValue())
+ Offset = StackOffset::getFixed(MI->getDebugOffset().getImm());
+ if (Offset)
+ OS << '[';
+ OS << printReg(Reg, AP.MF->getSubtarget().getRegisterInfo());
+ if (Offset)
+ OS << '+' << Offset->getFixed() << ']';
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown operand type");
+ }
+ }
+
+ // NOTE: Want this comment at start of line, don't emit with AddComment.
+ AP.OutStreamer->emitRawComment(OS.str());
+ return true;
+}
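+ // The resulting raw comment looks roughly like:
+ //   # DEBUG_VALUE: foo:x <- $edi
+ // or, with a DIExpression:
+ //   # DEBUG_VALUE: foo:p <- [DW_OP_plus_uconst 8] $rsp
+ // ("foo", "x", "p" and the registers are illustrative; the comment marker
+ // is target dependent).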
+
+/// This method handles the target-independent form of DBG_LABEL, returning
+/// true if it was able to do so. A false return means the target will need
+/// to handle MI in EmitInstruction.
+static bool emitDebugLabelComment(const MachineInstr *MI, AsmPrinter &AP) {
+ if (MI->getNumOperands() != 1)
+ return false;
+
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "DEBUG_LABEL: ";
+
+ const DILabel *V = MI->getDebugLabel();
+ if (auto *SP = dyn_cast<DISubprogram>(
+ V->getScope()->getNonLexicalBlockFileScope())) {
+ StringRef Name = SP->getName();
+ if (!Name.empty())
+ OS << Name << ":";
+ }
+ OS << V->getName();
+
+ // NOTE: Want this comment at start of line, don't emit with AddComment.
+ AP.OutStreamer->emitRawComment(OS.str());
+ return true;
+}
+
+AsmPrinter::CFISection
+AsmPrinter::getFunctionCFISectionType(const Function &F) const {
+ // Ignore functions that won't get emitted.
+ if (F.isDeclarationForLinker())
+ return CFISection::None;
+
+ if (MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI &&
+ F.needsUnwindTableEntry())
+ return CFISection::EH;
+
+ if (MAI->usesCFIWithoutEH() && F.hasUWTable())
+ return CFISection::EH;
+
+ assert(MMI != nullptr && "Invalid machine module info");
+ if (MMI->hasDebugInfo() || TM.Options.ForceDwarfFrameSection)
+ return CFISection::Debug;
+
+ return CFISection::None;
+}
+
+AsmPrinter::CFISection
+AsmPrinter::getFunctionCFISectionType(const MachineFunction &MF) const {
+ return getFunctionCFISectionType(MF.getFunction());
+}
+
+bool AsmPrinter::needsSEHMoves() {
+ return MAI->usesWindowsCFI() && MF->getFunction().needsUnwindTableEntry();
+}
+
+bool AsmPrinter::usesCFIWithoutEH() const {
+ return MAI->usesCFIWithoutEH() && ModuleCFISection != CFISection::None;
+}
+
+void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
+ ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType();
+ if (!usesCFIWithoutEH() &&
+ ExceptionHandlingType != ExceptionHandling::DwarfCFI &&
+ ExceptionHandlingType != ExceptionHandling::ARM)
+ return;
+
+ if (getFunctionCFISectionType(*MF) == CFISection::None)
+ return;
+
+ // If there is no "real" instruction following this CFI instruction, skip
+ // emitting it; it would be beyond the end of the function's FDE range.
+ auto *MBB = MI.getParent();
+ auto I = std::next(MI.getIterator());
+ while (I != MBB->end() && I->isTransient())
+ ++I;
+ if (I == MBB->instr_end() &&
+ MBB->getReverseIterator() == MBB->getParent()->rbegin())
+ return;
+
+ const std::vector<MCCFIInstruction> &Instrs = MF->getFrameInstructions();
+ unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
+ const MCCFIInstruction &CFI = Instrs[CFIIndex];
+ emitCFIInstruction(CFI);
+}
+
+void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
+ // The operands are the MCSymbol and the frame offset of the allocation.
+ MCSymbol *FrameAllocSym = MI.getOperand(0).getMCSymbol();
+ int FrameOffset = MI.getOperand(1).getImm();
+
+ // Emit a symbol assignment.
+ OutStreamer->emitAssignment(FrameAllocSym,
+ MCConstantExpr::create(FrameOffset, OutContext));
+}
+
+/// Returns the BB metadata to be emitted in the SHT_LLVM_BB_ADDR_MAP section
+/// for a given basic block. This can be used to capture more precise profile
+/// information.
+static uint32_t getBBAddrMapMetadata(const MachineBasicBlock &MBB) {
+ const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+ return object::BBAddrMap::BBEntry::Metadata{
+ MBB.isReturnBlock(), !MBB.empty() && TII->isTailCall(MBB.back()),
+ MBB.isEHPad(), const_cast<MachineBasicBlock &>(MBB).canFallThrough(),
+ !MBB.empty() && MBB.rbegin()->isIndirectBranch()}
+ .encode();
+}
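+ // The metadata packs five flags of the block (is-return, has-tail-call,
+ // is-EH-pad, can-fall-through, ends-with-indirect-branch) into one integer
+ // via BBEntry::Metadata::encode().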
+
+void AsmPrinter::emitBBAddrMapSection(const MachineFunction &MF) {
+ MCSection *BBAddrMapSection =
+ getObjFileLowering().getBBAddrMapSection(*MF.getSection());
+ assert(BBAddrMapSection && ".llvm_bb_addr_map section is not initialized.");
+
+ const MCSymbol *FunctionSymbol = getFunctionBegin();
+
+ OutStreamer->pushSection();
+ OutStreamer->switchSection(BBAddrMapSection);
+ OutStreamer->AddComment("version");
+ uint8_t BBAddrMapVersion = OutStreamer->getContext().getBBAddrMapVersion();
+ OutStreamer->emitInt8(BBAddrMapVersion);
+ OutStreamer->AddComment("feature");
+ OutStreamer->emitInt8(0);
+ OutStreamer->AddComment("function address");
+ OutStreamer->emitSymbolValue(FunctionSymbol, getPointerSize());
+ OutStreamer->AddComment("number of basic blocks");
+ OutStreamer->emitULEB128IntValue(MF.size());
+ const MCSymbol *PrevMBBEndSymbol = FunctionSymbol;
+ // Emit BB Information for each basic block in the function.
+ for (const MachineBasicBlock &MBB : MF) {
+ const MCSymbol *MBBSymbol =
+ MBB.isEntryBlock() ? FunctionSymbol : MBB.getSymbol();
+ // TODO: Remove this check when version 1 is deprecated.
+ if (BBAddrMapVersion > 1) {
+ OutStreamer->AddComment("BB id");
+ // Emit the BB ID for this basic block.
+ OutStreamer->emitULEB128IntValue(*MBB.getBBID());
+ }
+ // Emit the basic block offset relative to the end of the previous block.
+ // This is zero unless the block is padded due to alignment.
+ emitLabelDifferenceAsULEB128(MBBSymbol, PrevMBBEndSymbol);
+ // Emit the basic block size. When BBs have alignments, their size cannot
+ // always be computed from their offsets.
+ emitLabelDifferenceAsULEB128(MBB.getEndSymbol(), MBBSymbol);
+ // Emit the Metadata.
+ OutStreamer->emitULEB128IntValue(getBBAddrMapMetadata(MBB));
+ PrevMBBEndSymbol = MBB.getEndSymbol();
+ }
+ OutStreamer->popSection();
+}
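+ // Overall, each function's entry in the section is: a version byte, a
+ // feature byte, the function address, a ULEB128 block count, and then per
+ // block: the ULEB128 BB id (version > 1), the ULEB128 offset from the end
+ // of the previous block, the ULEB128 block size, and the ULEB128 metadata.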
+
+void AsmPrinter::emitKCFITrapEntry(const MachineFunction &MF,
+ const MCSymbol *Symbol) {
+ MCSection *Section =
+ getObjFileLowering().getKCFITrapSection(*MF.getSection());
+ if (!Section)
+ return;
+
+ OutStreamer->pushSection();
+ OutStreamer->switchSection(Section);
+
+ MCSymbol *Loc = OutContext.createLinkerPrivateTempSymbol();
+ OutStreamer->emitLabel(Loc);
+ OutStreamer->emitAbsoluteSymbolDiff(Symbol, Loc, 4);
+
+ OutStreamer->popSection();
+}
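+ // Each entry in the KCFI trap section (".kcfi_traps" on ELF) is a 4-byte
+ // difference "Symbol - Loc", i.e. the offset from the entry's own location
+ // to the trap instruction it describes.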
+
+void AsmPrinter::emitKCFITypeId(const MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+ if (const MDNode *MD = F.getMetadata(LLVMContext::MD_kcfi_type))
+ emitGlobalConstant(F.getParent()->getDataLayout(),
+ mdconst::extract<ConstantInt>(MD->getOperand(0)));
+}
+
+void AsmPrinter::emitPseudoProbe(const MachineInstr &MI) {
+ if (PP) {
+ auto GUID = MI.getOperand(0).getImm();
+ auto Index = MI.getOperand(1).getImm();
+ auto Type = MI.getOperand(2).getImm();
+ auto Attr = MI.getOperand(3).getImm();
+ DILocation *DebugLoc = MI.getDebugLoc();
+ PP->emitPseudoProbe(GUID, Index, Type, Attr, DebugLoc);
+ }
+}
+
+void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) {
+ if (!MF.getTarget().Options.EmitStackSizeSection)
+ return;
+
+ MCSection *StackSizeSection =
+ getObjFileLowering().getStackSizesSection(*getCurrentSection());
+ if (!StackSizeSection)
+ return;
+
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ // Don't emit functions with dynamic stack allocations.
+ if (FrameInfo.hasVarSizedObjects())
+ return;
+
+ OutStreamer->pushSection();
+ OutStreamer->switchSection(StackSizeSection);
+
+ const MCSymbol *FunctionSymbol = getFunctionBegin();
+ uint64_t StackSize =
+ FrameInfo.getStackSize() + FrameInfo.getUnsafeStackSize();
+ OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
+ OutStreamer->emitULEB128IntValue(StackSize);
+
+ OutStreamer->popSection();
+}
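+ // Each record in the stack-sizes section (".stack_sizes" on ELF) is the
+ // function's begin address at program-pointer width followed by the
+ // ULEB128-encoded frame size computed above.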
+
+void AsmPrinter::emitStackUsage(const MachineFunction &MF) {
+ const std::string &OutputFilename = MF.getTarget().Options.StackUsageOutput;
+
+ // OutputFilename empty implies -fstack-usage is not passed.
+ if (OutputFilename.empty())
+ return;
+
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ uint64_t StackSize =
+ FrameInfo.getStackSize() + FrameInfo.getUnsafeStackSize();
+
+ if (StackUsageStream == nullptr) {
+ std::error_code EC;
+ StackUsageStream =
+ std::make_unique<raw_fd_ostream>(OutputFilename, EC, sys::fs::OF_Text);
+ if (EC) {
+ errs() << "Could not open file: " << EC.message();
+ return;
+ }
+ }
+
+ *StackUsageStream << MF.getFunction().getParent()->getName();
+ if (const DISubprogram *DSP = MF.getFunction().getSubprogram())
+ *StackUsageStream << ':' << DSP->getLine();
+
+ *StackUsageStream << ':' << MF.getName() << '\t' << StackSize << '\t';
+ if (FrameInfo.hasVarSizedObjects())
+ *StackUsageStream << "dynamic\n";
+ else
+ *StackUsageStream << "static\n";
+}
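+ // A resulting .su line looks roughly like (fields are tab-separated):
+ //   foo.c:42:myfunc   128   static
+ // ("foo.c", "myfunc", and the numbers are illustrative; "dynamic" is
+ // printed when the frame has variable-sized objects).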
+
+void AsmPrinter::emitPCSectionsLabel(const MachineFunction &MF,
+ const MDNode &MD) {
+ MCSymbol *S = MF.getContext().createTempSymbol("pcsection");
+ OutStreamer->emitLabel(S);
+ PCSectionsSymbols[&MD].emplace_back(S);
+}
+
+void AsmPrinter::emitPCSections(const MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+ if (PCSectionsSymbols.empty() && !F.hasMetadata(LLVMContext::MD_pcsections))
+ return;
+
+ const CodeModel::Model CM = MF.getTarget().getCodeModel();
+ const unsigned RelativeRelocSize =
+ (CM == CodeModel::Medium || CM == CodeModel::Large) ? getPointerSize()
+ : 4;
+
+ // Switch to PCSection, short-circuiting the common case where the current
+ // section is still valid (assume most MD_pcsections contain just 1 section).
+ auto SwitchSection = [&, Prev = StringRef()](const StringRef &Sec) mutable {
+ if (Sec == Prev)
+ return;
+ MCSection *S = getObjFileLowering().getPCSection(Sec, MF.getSection());
+ assert(S && "PC section is not initialized");
+ OutStreamer->switchSection(S);
+ Prev = Sec;
+ };
+ // Emit symbols into sections and data as specified in the pcsections MDNode.
+ auto EmitForMD = [&](const MDNode &MD, ArrayRef<const MCSymbol *> Syms,
+ bool Deltas) {
+ // Expect the first operand to be a section name. After that, a tuple of
+ // constants may appear, which will simply be emitted into the current
+ // section (the user of MD_pcsections decides the format of encoded data).
+ assert(isa<MDString>(MD.getOperand(0)) && "first operand not a string");
+ bool ConstULEB128 = false;
+ for (const MDOperand &MDO : MD.operands()) {
+ if (auto *S = dyn_cast<MDString>(MDO)) {
+ // Found string, start of new section!
+ // Find options for this section "<section>!<opts>" - supported options:
+ // C = Compress constant integers of size 2-8 bytes as ULEB128.
+ const StringRef SecWithOpt = S->getString();
+ const size_t OptStart = SecWithOpt.find('!'); // likely npos
+ const StringRef Sec = SecWithOpt.substr(0, OptStart);
+ const StringRef Opts = SecWithOpt.substr(OptStart); // likely empty
+ ConstULEB128 = Opts.find('C') != StringRef::npos;
+#ifndef NDEBUG
+ for (char O : Opts)
+ assert((O == '!' || O == 'C') && "Invalid !pcsections options");
+#endif
+ SwitchSection(Sec);
+ const MCSymbol *Prev = Syms.front();
+ for (const MCSymbol *Sym : Syms) {
+ if (Sym == Prev || !Deltas) {
+ // Use the entry itself as the base of the relative offset.
+ MCSymbol *Base = MF.getContext().createTempSymbol("pcsection_base");
+ OutStreamer->emitLabel(Base);
+ // Emit relative relocation `addr - base`, which avoids a dynamic
+ // relocation in the final binary. User will get the address with
+ // `base + addr`.
+ emitLabelDifference(Sym, Base, RelativeRelocSize);
+ } else {
+ // Emit delta between symbol and previous symbol.
+ if (ConstULEB128)
+ emitLabelDifferenceAsULEB128(Sym, Prev);
+ else
+ emitLabelDifference(Sym, Prev, 4);
+ }
+ Prev = Sym;
+ }
+ } else {
+ // Emit auxiliary data after PC.
+ assert(isa<MDNode>(MDO) && "expecting either string or tuple");
+ const auto *AuxMDs = cast<MDNode>(MDO);
+ for (const MDOperand &AuxMDO : AuxMDs->operands()) {
+ assert(isa<ConstantAsMetadata>(AuxMDO) && "expecting a constant");
+ const Constant *C = cast<ConstantAsMetadata>(AuxMDO)->getValue();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const uint64_t Size = DL.getTypeStoreSize(C->getType());
+
+ if (auto *CI = dyn_cast<ConstantInt>(C);
+ CI && ConstULEB128 && Size > 1 && Size <= 8) {
+ emitULEB128(CI->getZExtValue());
+ } else {
+ emitGlobalConstant(DL, C);
+ }
+ }
+ }
+ }
+ };
+
+ OutStreamer->pushSection();
+ // Emit PCs for function start and function size.
+ if (const MDNode *MD = F.getMetadata(LLVMContext::MD_pcsections))
+ EmitForMD(*MD, {getFunctionBegin(), getFunctionEnd()}, true);
+ // Emit PCs for instructions collected.
+ for (const auto &MS : PCSectionsSymbols)
+ EmitForMD(*MS.first, MS.second, false);
+ OutStreamer->popSection();
+ PCSectionsSymbols.clear();
+}
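+ // Note that a section name in !pcsections may carry options after a '!'
+ // separator; per the parsing above, the only recognized option is 'C',
+ // which requests ULEB128 encoding for constant operands of 2 to 8 bytes.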
+
+/// Returns true if function begin and end labels should be emitted.
+static bool needFuncLabels(const MachineFunction &MF) {
+ MachineModuleInfo &MMI = MF.getMMI();
+ if (!MF.getLandingPads().empty() || MF.hasEHFunclets() ||
+ MMI.hasDebugInfo() ||
+ MF.getFunction().hasMetadata(LLVMContext::MD_pcsections))
+ return true;
+
+ // We might emit an EH table that uses function begin and end labels even if
+ // we don't have any landingpads.
+ if (!MF.getFunction().hasPersonalityFn())
+ return false;
+ return !isNoOpWithoutInvoke(
+ classifyEHPersonality(MF.getFunction().getPersonalityFn()));
+}
+
+/// EmitFunctionBody - This method emits the body and trailer for a
+/// function.
+void AsmPrinter::emitFunctionBody() {
+ emitFunctionHeader();
+
+ // Emit target-specific gunk before the function body.
+ emitFunctionBodyStart();
+
+ if (isVerbose()) {
+ // Get MachineDominatorTree or compute it on the fly if it's unavailable
+ MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ if (!MDT) {
+ OwnedMDT = std::make_unique<MachineDominatorTree>();
+ OwnedMDT->getBase().recalculate(*MF);
+ MDT = OwnedMDT.get();
+ }
+
+ // Get MachineLoopInfo or compute it on the fly if it's unavailable
+ MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ if (!MLI) {
+ OwnedMLI = std::make_unique<MachineLoopInfo>();
+ OwnedMLI->getBase().analyze(MDT->getBase());
+ MLI = OwnedMLI.get();
+ }
+ }
+
+ // Print out code for the function.
+ bool HasAnyRealCode = false;
+ int NumInstsInFunction = 0;
+ bool IsEHa = MMI->getModule()->getModuleFlag("eh-asynch");
+
+ bool CanDoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ for (auto &MBB : *MF) {
+ // Print a label for the basic block.
+ emitBasicBlockStart(MBB);
+ DenseMap<StringRef, unsigned> MnemonicCounts;
+ for (auto &MI : MBB) {
+ // Print the assembly for the instruction.
+ if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
+ !MI.isDebugInstr()) {
+ HasAnyRealCode = true;
+ ++NumInstsInFunction;
+ }
+
+ // If there is a pre-instruction symbol, emit a label for it here.
+ if (MCSymbol *S = MI.getPreInstrSymbol())
+ OutStreamer->emitLabel(S);
+
+ if (MDNode *MD = MI.getPCSections())
+ emitPCSectionsLabel(*MF, *MD);
+
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->beginInstruction(&MI);
+ }
+
+ if (isVerbose())
+ emitComments(MI, OutStreamer->getCommentOS());
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::CFI_INSTRUCTION:
+ emitCFIInstruction(MI);
+ break;
+ case TargetOpcode::LOCAL_ESCAPE:
+ emitFrameAlloc(MI);
+ break;
+ case TargetOpcode::ANNOTATION_LABEL:
+ case TargetOpcode::GC_LABEL:
+ OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
+ break;
+ case TargetOpcode::EH_LABEL:
+ OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
+ // For async EH, insert a NOP if the label is followed by a trapping
+ // instruction, or the exception won't be caught.
+ // (see MCConstantExpr::create(1,..) in WinException.cpp)
+ // Ignore SDiv/UDiv because a DIV with a constant-zero divisor
+ // must have been turned into an UndefValue.
+ // A DIV with variable operands won't be the first instruction in
+ // an EH region, as it must be preceded by at least a load.
+ {
+ auto MI2 = std::next(MI.getIterator());
+ if (IsEHa && MI2 != MBB.end() &&
+ (MI2->mayLoadOrStore() || MI2->mayRaiseFPException()))
+ emitNops(1);
+ }
+ break;
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR:
+ emitInlineAsm(&MI);
+ break;
+ case TargetOpcode::DBG_VALUE:
+ case TargetOpcode::DBG_VALUE_LIST:
+ if (isVerbose()) {
+ if (!emitDebugValueComment(&MI, *this))
+ emitInstruction(&MI);
+ }
+ break;
+ case TargetOpcode::DBG_INSTR_REF:
+ // This instruction reference will have been resolved to a machine
+ // location, and a nearby DBG_VALUE created. We can safely ignore
+ // the instruction reference.
+ break;
+ case TargetOpcode::DBG_PHI:
+ // This instruction is only used to label a program point, it's purely
+ // meta information.
+ break;
+ case TargetOpcode::DBG_LABEL:
+ if (isVerbose()) {
+ if (!emitDebugLabelComment(&MI, *this))
+ emitInstruction(&MI);
+ }
+ break;
+ case TargetOpcode::IMPLICIT_DEF:
+ if (isVerbose()) emitImplicitDef(&MI);
+ break;
+ case TargetOpcode::KILL:
+ if (isVerbose()) emitKill(&MI, *this);
+ break;
+ case TargetOpcode::PSEUDO_PROBE:
+ emitPseudoProbe(MI);
+ break;
+ case TargetOpcode::ARITH_FENCE:
+ if (isVerbose())
+ OutStreamer->emitRawComment("ARITH_FENCE");
+ break;
+ case TargetOpcode::MEMBARRIER:
+ OutStreamer->emitRawComment("MEMBARRIER");
+ break;
+ default:
+ emitInstruction(&MI);
+ if (CanDoExtraAnalysis) {
+ MCInst MCI;
+ MCI.setOpcode(MI.getOpcode());
+ auto Name = OutStreamer->getMnemonic(MCI);
+ auto I = MnemonicCounts.insert({Name, 0u});
+ I.first->second++;
+ }
+ break;
+ }
+
+ // If there is a post-instruction symbol, emit a label for it here.
+ if (MCSymbol *S = MI.getPostInstrSymbol())
+ OutStreamer->emitLabel(S);
+
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->endInstruction();
+ }
+ }
+
+ // We must emit a temporary symbol for the end of this basic block, if
+ // either we have BBLabels enabled or this basic block marks the end of a
+ // section.
+ if (MF->hasBBLabels() ||
+ (MAI->hasDotTypeDotSizeDirective() && MBB.isEndSection()))
+ OutStreamer->emitLabel(MBB.getEndSymbol());
+
+ if (MBB.isEndSection()) {
+ // The size directive for the section containing the entry block is
+ // handled separately by the function section.
+ if (!MBB.sameSection(&MF->front())) {
+ if (MAI->hasDotTypeDotSizeDirective()) {
+ // Emit the size directive for the basic block section.
+ const MCExpr *SizeExp = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(MBB.getEndSymbol(), OutContext),
+ MCSymbolRefExpr::create(CurrentSectionBeginSym, OutContext),
+ OutContext);
+ OutStreamer->emitELFSize(CurrentSectionBeginSym, SizeExp);
+ }
+ MBBSectionRanges[MBB.getSectionIDNum()] =
+ MBBSectionRange{CurrentSectionBeginSym, MBB.getEndSymbol()};
+ }
+ }
+ emitBasicBlockEnd(MBB);
+
+ if (CanDoExtraAnalysis) {
+ // Skip empty blocks.
+ if (MBB.empty())
+ continue;
+
+ MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionMix",
+ MBB.begin()->getDebugLoc(), &MBB);
+
+ // Generate instruction mix remark. First, sort counts in descending order
+ // by count and name.
+ SmallVector<std::pair<StringRef, unsigned>, 128> MnemonicVec;
+ for (auto &KV : MnemonicCounts)
+ MnemonicVec.emplace_back(KV.first, KV.second);
+
+ sort(MnemonicVec, [](const std::pair<StringRef, unsigned> &A,
+ const std::pair<StringRef, unsigned> &B) {
+ if (A.second > B.second)
+ return true;
+ if (A.second == B.second)
+ return StringRef(A.first) < StringRef(B.first);
+ return false;
+ });
+ R << "BasicBlock: " << ore::NV("BasicBlock", MBB.getName()) << "\n";
+ for (auto &KV : MnemonicVec) {
+ auto Name = (Twine("INST_") + getToken(KV.first.trim()).first).str();
+ R << KV.first << ": " << ore::NV(Name, KV.second) << "\n";
+ }
+ ORE->emit(R);
+ }
+ }
+
+ EmittedInsts += NumInstsInFunction;
+ MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionCount",
+ MF->getFunction().getSubprogram(),
+ &MF->front());
+ R << ore::NV("NumInstructions", NumInstsInFunction)
+ << " instructions in function";
+ ORE->emit(R);
+
+ // If the function is empty and the object file uses .subsections_via_symbols,
+ // then we need to emit *something* to the function body to prevent the
+ // labels from collapsing together. Just emit a noop.
+ // Similarly, don't emit empty functions on Windows either. It can lead to
+ // duplicate entries (two functions with the same RVA) in the Guard CF Table
+ // after linking, causing the kernel not to load the binary:
+ // https://developercommunity.visualstudio.com/content/problem/45366/vc-linker-creates-invalid-dll-with-clang-cl.html
+ // FIXME: Hide this behind some API in e.g. MCAsmInfo or MCTargetStreamer.
+ const Triple &TT = TM.getTargetTriple();
+ if (!HasAnyRealCode && (MAI->hasSubsectionsViaSymbols() ||
+ (TT.isOSWindows() && TT.isOSBinFormatCOFF()))) {
+ MCInst Noop = MF->getSubtarget().getInstrInfo()->getNop();
+
+ // Targets can opt-out of emitting the noop here by leaving the opcode
+ // unspecified.
+ if (Noop.getOpcode()) {
+ OutStreamer->AddComment("avoids zero-length function");
+ emitNops(1);
+ }
+ }
+
+ // Switch to the original section in case basic block sections was used.
+ OutStreamer->switchSection(MF->getSection());
+
+ const Function &F = MF->getFunction();
+ for (const auto &BB : F) {
+ if (!BB.hasAddressTaken())
+ continue;
+ MCSymbol *Sym = GetBlockAddressSymbol(&BB);
+ if (Sym->isDefined())
+ continue;
+ OutStreamer->AddComment("Address of block that was removed by CodeGen");
+ OutStreamer->emitLabel(Sym);
+ }
+
+ // Emit target-specific gunk after the function body.
+ emitFunctionBodyEnd();
+
+ // Even though wasm supports .type and .size in general, function symbols
+ // are automatically sized.
+ bool EmitFunctionSize = MAI->hasDotTypeDotSizeDirective() && !TT.isWasm();
+
+ if (needFuncLabels(*MF) || EmitFunctionSize) {
+ // Create a symbol for the end of function.
+ CurrentFnEnd = createTempSymbol("func_end");
+ OutStreamer->emitLabel(CurrentFnEnd);
+ }
+
+ // If the target wants a .size directive for the size of the function, emit
+ // it.
+ if (EmitFunctionSize) {
+ // We can get the size as difference between the function label and the
+ // temp label.
+ const MCExpr *SizeExp = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(CurrentFnEnd, OutContext),
+ MCSymbolRefExpr::create(CurrentFnSymForSize, OutContext), OutContext);
+ OutStreamer->emitELFSize(CurrentFnSym, SizeExp);
+ if (CurrentFnBeginLocal)
+ OutStreamer->emitELFSize(CurrentFnBeginLocal, SizeExp);
+ }
+
+ // Call endBasicBlockSection on the last block now, if it wasn't already
+ // called.
+ if (!MF->back().isEndSection()) {
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->endBasicBlockSection(MF->back());
+ }
+ }
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->markFunctionEnd();
+ }
+
+ MBBSectionRanges[MF->front().getSectionIDNum()] =
+ MBBSectionRange{CurrentFnBegin, CurrentFnEnd};
+
+ // Print out jump tables referenced by the function.
+ emitJumpTableInfo();
+
+ // Emit post-function debug and/or EH information.
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->endFunction(MF);
+ }
+
+ // Emit section containing BB address offsets and their metadata, when
+ // BB labels are requested for this function. Skip empty functions.
+ if (MF->hasBBLabels() && HasAnyRealCode)
+ emitBBAddrMapSection(*MF);
+
+ // Emit sections containing instruction and function PCs.
+ emitPCSections(*MF);
+
+ // Emit section containing stack size metadata.
+ emitStackSizeSection(*MF);
+
+ // Emit .su file containing function stack size information.
+ emitStackUsage(*MF);
+
+ emitPatchableFunctionEntries();
+
+ if (isVerbose())
+ OutStreamer->getCommentOS() << "-- End function\n";
+
+ OutStreamer->addBlankLine();
+
+ // Output MBB ids, function names, and frequencies if the flag to dump
+ // MBB profile information has been set
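+ // (one CSV-style line per block:
+ //  "<function-name>,<bb-id>,<frequency relative to the entry block>").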
+ if (MBBProfileDumpFileOutput) {
+ if (!MF->hasBBLabels())
+ MF->getContext().reportError(
+ SMLoc(),
+ "Unable to find BB labels for MBB profile dump. -mbb-profile-dump "
+ "must be called with -basic-block-sections=labels");
+ MachineBlockFrequencyInfo &MBFI =
+ getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
+ for (const auto &MBB : *MF) {
+ *MBBProfileDumpFileOutput.get()
+ << MF->getName() << "," << MBB.getBBID() << ","
+ << MBFI.getBlockFreqRelativeToEntryBlock(&MBB) << "\n";
+ }
+ }
+}
+
+ /// Compute the number of Global Variables that use a Constant.
+static unsigned getNumGlobalVariableUses(const Constant *C) {
+ if (!C)
+ return 0;
+
+ if (isa<GlobalVariable>(C))
+ return 1;
+
+ unsigned NumUses = 0;
+ for (const auto *CU : C->users())
+ NumUses += getNumGlobalVariableUses(dyn_cast<Constant>(CU));
+
+ return NumUses;
+}
+
+/// Only consider global GOT equivalents if at least one user is a
+ /// cstexpr inside an initializer of another global variable. Also, don't
+/// handle cstexpr inside instructions. During global variable emission,
+/// candidates are skipped and are emitted later in case at least one cstexpr
+/// isn't replaced by a PC relative GOT entry access.
+static bool isGOTEquivalentCandidate(const GlobalVariable *GV,
+ unsigned &NumGOTEquivUsers) {
+ // Global GOT equivalents are unnamed private globals with a constant
+ // pointer initializer to another global symbol. They must point to a
+ // GlobalVariable or Function, i.e., as GlobalValue.
+ if (!GV->hasGlobalUnnamedAddr() || !GV->hasInitializer() ||
+ !GV->isConstant() || !GV->isDiscardableIfUnused() ||
+ !isa<GlobalValue>(GV->getOperand(0)))
+ return false;
+
+ // To be a got equivalent, at least one of its users need to be a constant
+ // expression used by another global variable.
+ for (const auto *U : GV->users())
+ NumGOTEquivUsers += getNumGlobalVariableUses(dyn_cast<Constant>(U));
+
+ return NumGOTEquivUsers > 0;
+}
+
+ /// An unnamed constant global variable solely containing a pointer to
+ /// another global variable is equivalent to a GOT table entry: it contains
+ /// the address of another symbol. Optimize it and replace accesses to these
+ /// "GOT equivalents" by using the GOT entry for the final global instead.
+ /// Compute GOT equivalent candidates among all global variables so that
+ /// they can be skipped during emission and only emitted later if their use
+ /// is not replaced by a GOT entry access.
+void AsmPrinter::computeGlobalGOTEquivs(Module &M) {
+ if (!getObjFileLowering().supportIndirectSymViaGOTPCRel())
+ return;
+
+ for (const auto &G : M.globals()) {
+ unsigned NumGOTEquivUsers = 0;
+ if (!isGOTEquivalentCandidate(&G, NumGOTEquivUsers))
+ continue;
+
+ const MCSymbol *GOTEquivSym = getSymbol(&G);
+ GlobalGOTEquivs[GOTEquivSym] = std::make_pair(&G, NumGOTEquivUsers);
+ }
+}
+
+/// Constant expressions using GOT equivalent globals may not be eligible
+/// for PC relative GOT entry conversion, in such cases we need to emit such
+/// globals we previously omitted in EmitGlobalVariable.
+void AsmPrinter::emitGlobalGOTEquivs() {
+ if (!getObjFileLowering().supportIndirectSymViaGOTPCRel())
+ return;
+
+ SmallVector<const GlobalVariable *, 8> FailedCandidates;
+ for (auto &I : GlobalGOTEquivs) {
+ const GlobalVariable *GV = I.second.first;
+ unsigned Cnt = I.second.second;
+ if (Cnt)
+ FailedCandidates.push_back(GV);
+ }
+ GlobalGOTEquivs.clear();
+
+ for (const auto *GV : FailedCandidates)
+ emitGlobalVariable(GV);
+}
+
+void AsmPrinter::emitGlobalAlias(Module &M, const GlobalAlias &GA) {
+ MCSymbol *Name = getSymbol(&GA);
+ bool IsFunction = GA.getValueType()->isFunctionTy();
+ // Treat bitcasts of functions as functions also. This is important at least
+ // on WebAssembly where object and function addresses can't alias each other.
+ if (!IsFunction)
+ IsFunction = isa<Function>(GA.getAliasee()->stripPointerCasts());
+
+ // AIX's assembly directive `.set` is not usable for aliasing purposes,
+ // so AIX has to use the extra-label-at-definition strategy. At this
+ // point, all the extra labels have been emitted; we just have to emit
+ // linkage for those labels.
+ if (TM.getTargetTriple().isOSBinFormatXCOFF()) {
+ assert(MAI->hasVisibilityOnlyWithLinkage() &&
+ "Visibility should be handled with emitLinkage() on AIX.");
+
+ // Linkage for alias of global variable has been emitted.
+ if (isa<GlobalVariable>(GA.getAliaseeObject()))
+ return;
+
+ emitLinkage(&GA, Name);
+ // If it's a function, also emit linkage for aliases of function entry
+ // point.
+ if (IsFunction)
+ emitLinkage(&GA,
+ getObjFileLowering().getFunctionEntryPointSymbol(&GA, TM));
+ return;
+ }
+
+ if (GA.hasExternalLinkage() || !MAI->getWeakRefDirective())
+ OutStreamer->emitSymbolAttribute(Name, MCSA_Global);
+ else if (GA.hasWeakLinkage() || GA.hasLinkOnceLinkage())
+ OutStreamer->emitSymbolAttribute(Name, MCSA_WeakReference);
+ else
+ assert(GA.hasLocalLinkage() && "Invalid alias linkage");
+
+ // Set the symbol type to function if the alias has a function type.
+ // This affects codegen when the aliasee is not a function.
+ if (IsFunction) {
+ OutStreamer->emitSymbolAttribute(Name, MCSA_ELF_TypeFunction);
+ if (TM.getTargetTriple().isOSBinFormatCOFF()) {
+ OutStreamer->beginCOFFSymbolDef(Name);
+ OutStreamer->emitCOFFSymbolStorageClass(
+ GA.hasLocalLinkage() ? COFF::IMAGE_SYM_CLASS_STATIC
+ : COFF::IMAGE_SYM_CLASS_EXTERNAL);
+ OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+ << COFF::SCT_COMPLEX_TYPE_SHIFT);
+ OutStreamer->endCOFFSymbolDef();
+ }
+ }
+
+ emitVisibility(Name, GA.getVisibility());
+
+ const MCExpr *Expr = lowerConstant(GA.getAliasee());
+
+ if (MAI->hasAltEntry() && isa<MCBinaryExpr>(Expr))
+ OutStreamer->emitSymbolAttribute(Name, MCSA_AltEntry);
+
+ // Emit the directives as assignments aka .set:
+ OutStreamer->emitAssignment(Name, Expr);
+ MCSymbol *LocalAlias = getSymbolPreferLocal(GA);
+ if (LocalAlias != Name)
+ OutStreamer->emitAssignment(LocalAlias, Expr);
+
+ // If the aliasee does not correspond to a symbol in the output, i.e. the
+ // alias is not of an object or the aliased object is private, then set the
+ // size of the alias symbol from the type of the alias. We don't do this in
+ // other situations, as an alias and aliasee having differing types but the
+ // same size may be intentional.
+ const GlobalObject *BaseObject = GA.getAliaseeObject();
+ if (MAI->hasDotTypeDotSizeDirective() && GA.getValueType()->isSized() &&
+ (!BaseObject || BaseObject->hasPrivateLinkage())) {
+ const DataLayout &DL = M.getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GA.getValueType());
+ OutStreamer->emitELFSize(Name, MCConstantExpr::create(Size, OutContext));
+ }
+}
+
+void AsmPrinter::emitGlobalIFunc(Module &M, const GlobalIFunc &GI) {
+ assert(!TM.getTargetTriple().isOSBinFormatXCOFF() &&
+ "IFunc is not supported on AIX.");
+
+ MCSymbol *Name = getSymbol(&GI);
+
+ if (GI.hasExternalLinkage() || !MAI->getWeakRefDirective())
+ OutStreamer->emitSymbolAttribute(Name, MCSA_Global);
+ else if (GI.hasWeakLinkage() || GI.hasLinkOnceLinkage())
+ OutStreamer->emitSymbolAttribute(Name, MCSA_WeakReference);
+ else
+ assert(GI.hasLocalLinkage() && "Invalid ifunc linkage");
+
+ OutStreamer->emitSymbolAttribute(Name, MCSA_ELF_TypeIndFunction);
+ emitVisibility(Name, GI.getVisibility());
+
+ // Emit the directives as assignments aka .set:
+ const MCExpr *Expr = lowerConstant(GI.getResolver());
+ OutStreamer->emitAssignment(Name, Expr);
+ MCSymbol *LocalAlias = getSymbolPreferLocal(GI);
+ if (LocalAlias != Name)
+ OutStreamer->emitAssignment(LocalAlias, Expr);
+}
+
+void AsmPrinter::emitRemarksSection(remarks::RemarkStreamer &RS) {
+ if (!RS.needsSection())
+ return;
+
+ remarks::RemarkSerializer &RemarkSerializer = RS.getSerializer();
+
+ std::optional<SmallString<128>> Filename;
+ if (std::optional<StringRef> FilenameRef = RS.getFilename()) {
+ Filename = *FilenameRef;
+ sys::fs::make_absolute(*Filename);
+ assert(!Filename->empty() && "The filename can't be empty.");
+ }
+
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ std::unique_ptr<remarks::MetaSerializer> MetaSerializer =
+ Filename ? RemarkSerializer.metaSerializer(OS, Filename->str())
+ : RemarkSerializer.metaSerializer(OS);
+ MetaSerializer->emit();
+
+ // Switch to the remarks section.
+ MCSection *RemarksSection =
+ OutContext.getObjectFileInfo()->getRemarksSection();
+ OutStreamer->switchSection(RemarksSection);
+
+ OutStreamer->emitBinaryData(OS.str());
+}
+
+bool AsmPrinter::doFinalization(Module &M) {
+ // Set the MachineFunction to nullptr so that we can catch attempted
+ // accesses to MF specific features at the module level and so that
+ // we can conditionalize accesses based on whether or not it is nullptr.
+ MF = nullptr;
+
+ // Gather all GOT equivalent globals in the module. We really need two
+ // passes over the globals: one to compute the candidates and another to
+ // avoid emitting them in emitGlobalVariable; otherwise we would not be
+ // able to handle cases where the GOT equivalent shows up before its use.
+ computeGlobalGOTEquivs(M);
+
+ // Emit global variables.
+ for (const auto &G : M.globals())
+ emitGlobalVariable(&G);
+
+ // Emit remaining GOT equivalent globals.
+ emitGlobalGOTEquivs();
+
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+
+ // Emit linkage(XCOFF) and visibility info for declarations
+ for (const Function &F : M) {
+ if (!F.isDeclarationForLinker())
+ continue;
+
+ MCSymbol *Name = getSymbol(&F);
+ // Function getSymbol gives us the function descriptor symbol for XCOFF.
+
+ if (!TM.getTargetTriple().isOSBinFormatXCOFF()) {
+ GlobalValue::VisibilityTypes V = F.getVisibility();
+ if (V == GlobalValue::DefaultVisibility)
+ continue;
+
+ emitVisibility(Name, V, false);
+ continue;
+ }
+
+ if (F.isIntrinsic())
+ continue;
+
+ // Handle the XCOFF case.
+ // Variable `Name` is the function descriptor symbol (see above). Get the
+ // function entry point symbol.
+ MCSymbol *FnEntryPointSym = TLOF.getFunctionEntryPointSymbol(&F, TM);
+ // Emit linkage for the function entry point.
+ emitLinkage(&F, FnEntryPointSym);
+
+ // Emit linkage for the function descriptor.
+ emitLinkage(&F, Name);
+ }
+
+ // Emit the remarks section contents.
+ // FIXME: Figure out when is the safest time to emit this section. It should
+ // not come after debug info.
+ if (remarks::RemarkStreamer *RS = M.getContext().getMainRemarkStreamer())
+ emitRemarksSection(*RS);
+
+ TLOF.emitModuleMetadata(*OutStreamer, M);
+
+ if (TM.getTargetTriple().isOSBinFormatELF()) {
+ MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ // Output stubs for external and common global variables.
+ MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer->switchSection(TLOF.getDataSection());
+ const DataLayout &DL = M.getDataLayout();
+
+ emitAlignment(Align(DL.getPointerSize()));
+ for (const auto &Stub : Stubs) {
+ OutStreamer->emitLabel(Stub.first);
+ OutStreamer->emitSymbolValue(Stub.second.getPointer(),
+ DL.getPointerSize());
+ }
+ }
+ }
+
+ if (TM.getTargetTriple().isOSBinFormatCOFF()) {
+ MachineModuleInfoCOFF &MMICOFF =
+ MMI->getObjFileInfo<MachineModuleInfoCOFF>();
+
+ // Output stubs for external and common global variables.
+ MachineModuleInfoCOFF::SymbolListTy Stubs = MMICOFF.GetGVStubList();
+ if (!Stubs.empty()) {
+ const DataLayout &DL = M.getDataLayout();
+
+ for (const auto &Stub : Stubs) {
+ SmallString<256> SectionName = StringRef(".rdata$");
+ SectionName += Stub.first->getName();
+ OutStreamer->switchSection(OutContext.getCOFFSection(
+ SectionName,
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_LNK_COMDAT,
+ SectionKind::getReadOnly(), Stub.first->getName(),
+ COFF::IMAGE_COMDAT_SELECT_ANY));
+ emitAlignment(Align(DL.getPointerSize()));
+ OutStreamer->emitSymbolAttribute(Stub.first, MCSA_Global);
+ OutStreamer->emitLabel(Stub.first);
+ OutStreamer->emitSymbolValue(Stub.second.getPointer(),
+ DL.getPointerSize());
+ }
+ }
+ }
+
+ // This needs to happen before emitting debug information since that can end
+ // arbitrary sections.
+ if (auto *TS = OutStreamer->getTargetStreamer())
+ TS->emitConstantPools();
+
+ // Emit Stack maps before any debug info. Mach-O requires that no data or
+ // text sections come after debug info has been emitted. This matters for
+ // stack maps as they are arbitrary data, and may even have a custom format
+ // through user plugins.
+ emitStackMaps();
+
+ // Finalize debug and EH information.
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerDescription, HI.TimerGroupName,
+ HI.TimerGroupDescription, TimePassesIsEnabled);
+ HI.Handler->endModule();
+ }
+
+ // This deletes all the ephemeral handlers that AsmPrinter added, while
+ // keeping all the user-added handlers alive until the AsmPrinter is
+ // destroyed.
+ Handlers.erase(Handlers.begin() + NumUserHandlers, Handlers.end());
+ DD = nullptr;
+
+ // If the target wants to know about weak references, print them all.
+ if (MAI->getWeakRefDirective()) {
+ // FIXME: This is not lazy, it would be nice to only print weak references
+ // to stuff that is actually used. Note that doing so would require targets
+ // to notice uses in operands (due to constant exprs etc). This should
+ // happen with the MC stuff eventually.
+
+ // Print out module-level global objects here.
+ for (const auto &GO : M.global_objects()) {
+ if (!GO.hasExternalWeakLinkage())
+ continue;
+ OutStreamer->emitSymbolAttribute(getSymbol(&GO), MCSA_WeakReference);
+ }
+ if (shouldEmitWeakSwiftAsyncExtendedFramePointerFlags()) {
+ auto SymbolName = "swift_async_extendedFramePointerFlags";
+ auto Global = M.getGlobalVariable(SymbolName);
+ if (!Global) {
+ auto Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+ Global = new GlobalVariable(M, Int8PtrTy, false,
+ GlobalValue::ExternalWeakLinkage, nullptr,
+ SymbolName);
+ OutStreamer->emitSymbolAttribute(getSymbol(Global), MCSA_WeakReference);
+ }
+ }
+ }
+
+ // Print aliases in topological order, that is, for each alias a = b,
+ // b must be printed before a.
+ // This is because on some targets (e.g. PowerPC) the linker expects aliases in
+ // such an order to generate correct TOC information.
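+ // For illustration (hypothetical IR; the names are made up): given
+ //   @c = global i32 0
+ //   @b = alias i32, i32* @c
+ //   @a = alias i32, i32* @b
+ // the loop below walks the chain from @a and then emits the collected stack
+ // in reverse, so @b is printed before @a.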
+ SmallVector<const GlobalAlias *, 16> AliasStack;
+ SmallPtrSet<const GlobalAlias *, 16> AliasVisited;
+ for (const auto &Alias : M.aliases()) {
+ if (Alias.hasAvailableExternallyLinkage())
+ continue;
+ for (const GlobalAlias *Cur = &Alias; Cur;
+ Cur = dyn_cast<GlobalAlias>(Cur->getAliasee())) {
+ if (!AliasVisited.insert(Cur).second)
+ break;
+ AliasStack.push_back(Cur);
+ }
+ for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack))
+ emitGlobalAlias(M, *AncestorAlias);
+ AliasStack.clear();
+ }
+ for (const auto &IFunc : M.ifuncs())
+ emitGlobalIFunc(M, IFunc);
+
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+ for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; )
+ if (GCMetadataPrinter *MP = getOrCreateGCPrinter(**--I))
+ MP->finishAssembly(M, *MI, *this);
+
+ // Emit llvm.ident metadata in an '.ident' directive.
+ emitModuleIdents(M);
+
+ // Emit bytes for llvm.commandline metadata.
+ // The command line metadata is emitted earlier on XCOFF.
+ if (!TM.getTargetTriple().isOSBinFormatXCOFF())
+ emitModuleCommandLines(M);
+
+ // Emit .note.GNU-split-stack and .note.GNU-no-split-stack sections if
+ // split-stack is used.
+ if (TM.getTargetTriple().isOSBinFormatELF() && HasSplitStack) {
+ OutStreamer->switchSection(OutContext.getELFSection(".note.GNU-split-stack",
+ ELF::SHT_PROGBITS, 0));
+ if (HasNoSplitStack)
+ OutStreamer->switchSection(OutContext.getELFSection(
+ ".note.GNU-no-split-stack", ELF::SHT_PROGBITS, 0));
+ }
+
+ // If we don't have any trampolines, then we don't require stack memory
+ // to be executable. Some targets have a directive to declare this.
+ Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
+ if (!InitTrampolineIntrinsic || InitTrampolineIntrinsic->use_empty())
+ if (MCSection *S = MAI->getNonexecutableStackSection(OutContext))
+ OutStreamer->switchSection(S);
+
+ if (TM.Options.EmitAddrsig) {
+ // Emit address-significance attributes for all globals.
+ OutStreamer->emitAddrsig();
+ for (const GlobalValue &GV : M.global_values()) {
+ if (!GV.use_empty() && !GV.isThreadLocal() &&
+ !GV.hasDLLImportStorageClass() && !GV.getName().startswith("llvm.") &&
+ !GV.hasAtLeastLocalUnnamedAddr())
+ OutStreamer->emitAddrsigSym(getSymbol(&GV));
+ }
+ }
+
+ // Emit symbol partition specifications (ELF only).
+ if (TM.getTargetTriple().isOSBinFormatELF()) {
+ unsigned UniqueID = 0;
+ for (const GlobalValue &GV : M.global_values()) {
+ if (!GV.hasPartition() || GV.isDeclarationForLinker() ||
+ GV.getVisibility() != GlobalValue::DefaultVisibility)
+ continue;
+
+ OutStreamer->switchSection(
+ OutContext.getELFSection(".llvm_sympart", ELF::SHT_LLVM_SYMPART, 0, 0,
+ "", false, ++UniqueID, nullptr));
+ OutStreamer->emitBytes(GV.getPartition());
+ OutStreamer->emitZeros(1);
+ OutStreamer->emitValue(
+ MCSymbolRefExpr::create(getSymbol(&GV), OutContext),
+ MAI->getCodePointerSize());
+ }
+ }
+
+ // Allow the target to emit any magic that it wants at the end of the file,
+ // after everything else has gone out.
+ emitEndOfAsmFile(M);
+
+ MMI = nullptr;
+ AddrLabelSymbols = nullptr;
+
+ OutStreamer->finish();
+ OutStreamer->reset();
+ OwnedMLI.reset();
+ OwnedMDT.reset();
+
+ return false;
+}
+
+MCSymbol *AsmPrinter::getMBBExceptionSym(const MachineBasicBlock &MBB) {
+ auto Res = MBBSectionExceptionSyms.try_emplace(MBB.getSectionIDNum());
+ if (Res.second)
+ Res.first->second = createTempSymbol("exception");
+ return Res.first->second;
+}
+
+void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ const Function &F = MF.getFunction();
+
+ // Record that there are split-stack functions, so we will emit a special
+ // section to tell the linker.
+ if (MF.shouldSplitStack()) {
+ HasSplitStack = true;
+
+ if (!MF.getFrameInfo().needsSplitStackProlog())
+ HasNoSplitStack = true;
+ } else
+ HasNoSplitStack = true;
+
+ // Get the function symbol.
+ if (!MAI->needsFunctionDescriptors()) {
+ CurrentFnSym = getSymbol(&MF.getFunction());
+ } else {
+ assert(TM.getTargetTriple().isOSAIX() &&
+ "Only AIX uses the function descriptor hooks.");
+ // AIX is unique here in that the symbol emitted for the function body does
+ // not have the same name as the source function's C-linkage name.
+ assert(CurrentFnDescSym && "The function descriptor symbol needs to be"
+ " initialized first.");
+
+ // Get the function entry point symbol.
+ CurrentFnSym = getObjFileLowering().getFunctionEntryPointSymbol(&F, TM);
+ }
+
+ CurrentFnSymForSize = CurrentFnSym;
+ CurrentFnBegin = nullptr;
+ CurrentFnBeginLocal = nullptr;
+ CurrentSectionBeginSym = nullptr;
+ MBBSectionRanges.clear();
+ MBBSectionExceptionSyms.clear();
+ bool NeedsLocalForSize = MAI->needsLocalForSize();
+ if (F.hasFnAttribute("patchable-function-entry") ||
+ F.hasFnAttribute("function-instrument") ||
+ F.hasFnAttribute("xray-instruction-threshold") ||
+ needFuncLabels(MF) || NeedsLocalForSize ||
+ MF.getTarget().Options.EmitStackSizeSection || MF.hasBBLabels()) {
+ CurrentFnBegin = createTempSymbol("func_begin");
+ if (NeedsLocalForSize)
+ CurrentFnSymForSize = CurrentFnBegin;
+ }
+
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+}
+
+namespace {
+
+// Keep track of the alignment and the constant pool entries per section.
+ struct SectionCPs {
+ MCSection *S;
+ Align Alignment;
+ SmallVector<unsigned, 4> CPEs;
+
+ SectionCPs(MCSection *s, Align a) : S(s), Alignment(a) {}
+ };
+
+} // end anonymous namespace
+
+/// EmitConstantPool - Print to the current output stream assembly
+/// representations of the constants in the constant pool MCP. This is
+/// used to print out constants which have been "spilled to memory" by
+/// the code generator.
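+/// For illustration, a spilled 4-byte float constant is typically emitted as
+/// something like the following (the exact label and directive vary by
+/// target):
+///   .LCPI0_0:
+///           .long 0x40490fdb          # float 3.14159274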
+void AsmPrinter::emitConstantPool() {
+ const MachineConstantPool *MCP = MF->getConstantPool();
+ const std::vector<MachineConstantPoolEntry> &CP = MCP->getConstants();
+ if (CP.empty()) return;
+
+ // Calculate sections for constant pool entries. We collect entries that go
+ // into the same section together to reduce the number of section switches.
+ SmallVector<SectionCPs, 4> CPSections;
+ for (unsigned i = 0, e = CP.size(); i != e; ++i) {
+ const MachineConstantPoolEntry &CPE = CP[i];
+ Align Alignment = CPE.getAlign();
+
+ SectionKind Kind = CPE.getSectionKind(&getDataLayout());
+
+ const Constant *C = nullptr;
+ if (!CPE.isMachineConstantPoolEntry())
+ C = CPE.Val.ConstVal;
+
+ MCSection *S = getObjFileLowering().getSectionForConstant(
+ getDataLayout(), Kind, C, Alignment);
+
+ // The number of sections is small, so just do a linear search from the
+ // last section to the first.
+ bool Found = false;
+ unsigned SecIdx = CPSections.size();
+ while (SecIdx != 0) {
+ if (CPSections[--SecIdx].S == S) {
+ Found = true;
+ break;
+ }
+ }
+ if (!Found) {
+ SecIdx = CPSections.size();
+ CPSections.push_back(SectionCPs(S, Alignment));
+ }
+
+ if (Alignment > CPSections[SecIdx].Alignment)
+ CPSections[SecIdx].Alignment = Alignment;
+ CPSections[SecIdx].CPEs.push_back(i);
+ }
+
+ // Now print stuff into the calculated sections.
+ const MCSection *CurSection = nullptr;
+ unsigned Offset = 0;
+ for (unsigned i = 0, e = CPSections.size(); i != e; ++i) {
+ for (unsigned j = 0, ee = CPSections[i].CPEs.size(); j != ee; ++j) {
+ unsigned CPI = CPSections[i].CPEs[j];
+ MCSymbol *Sym = GetCPISymbol(CPI);
+ if (!Sym->isUndefined())
+ continue;
+
+ if (CurSection != CPSections[i].S) {
+ OutStreamer->switchSection(CPSections[i].S);
+ emitAlignment(Align(CPSections[i].Alignment));
+ CurSection = CPSections[i].S;
+ Offset = 0;
+ }
+
+ MachineConstantPoolEntry CPE = CP[CPI];
+
+ // Emit inter-object padding for alignment.
+ unsigned NewOffset = alignTo(Offset, CPE.getAlign());
+ OutStreamer->emitZeros(NewOffset - Offset);
+
+ Offset = NewOffset + CPE.getSizeInBytes(getDataLayout());
+
+ OutStreamer->emitLabel(Sym);
+ if (CPE.isMachineConstantPoolEntry())
+ emitMachineConstantPoolValue(CPE.Val.MachineCPVal);
+ else
+ emitGlobalConstant(getDataLayout(), CPE.Val.ConstVal);
+ }
+ }
+}
+
+// Print assembly representations of the jump tables used by the current
+// function.
+void AsmPrinter::emitJumpTableInfo() {
+ const DataLayout &DL = MF->getDataLayout();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ if (!MJTI) return;
+ if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return;
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ if (JT.empty()) return;
+
+ // Pick the directive to use to print the jump table entries, and switch to
+ // the appropriate section.
+ const Function &F = MF->getFunction();
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+ bool JTInDiffSection = !TLOF.shouldPutJumpTableInFunctionSection(
+ MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32,
+ F);
+ if (JTInDiffSection) {
+ // Drop it in the readonly section.
+ MCSection *ReadOnlySection = TLOF.getSectionForJumpTable(F, TM);
+ OutStreamer->switchSection(ReadOnlySection);
+ }
+
+ emitAlignment(Align(MJTI->getEntryAlignment(DL)));
+
+ // Jump tables in code sections are marked with a data_region directive
+ // where that's supported.
+ if (!JTInDiffSection)
+ OutStreamer->emitDataRegion(MCDR_DataRegionJT32);
+
+ for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+ // If this jump table was deleted, ignore it.
+ if (JTBBs.empty()) continue;
+
+ // For the EK_LabelDifference32 entry, if using .set avoids a relocation,
+ // emit a .set directive for each unique entry.
+ if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 &&
+ MAI->doesSetDirectiveSuppressReloc()) {
+ SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets;
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext);
+ for (const MachineBasicBlock *MBB : JTBBs) {
+ if (!EmittedSets.insert(MBB).second)
+ continue;
+
+ // .set LJTSet, LBB32-base
+ const MCExpr *LHS =
+ MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ OutStreamer->emitAssignment(GetJTSetSymbol(JTI, MBB->getNumber()),
+ MCBinaryExpr::createSub(LHS, Base,
+ OutContext));
+ }
+ }
+
+ // On some targets (e.g. Darwin) we want to emit two consecutive labels
+ // before each jump table. The first label is never referenced, but tells
+ // the assembler and linker the extents of the jump table object. The
+ // second label is actually referenced by the code.
+ if (JTInDiffSection && DL.hasLinkerPrivateGlobalPrefix())
+ // FIXME: This doesn't have to have any specific name; any randomly named
+ // and numbered local label starting with 'l' would work. Simplify
+ // GetJTISymbol.
+ OutStreamer->emitLabel(GetJTISymbol(JTI, true));
+
+ MCSymbol* JTISymbol = GetJTISymbol(JTI);
+ OutStreamer->emitLabel(JTISymbol);
+
+ for (const MachineBasicBlock *MBB : JTBBs)
+ emitJumpTableEntry(MJTI, MBB, JTI);
+ }
+ if (!JTInDiffSection)
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
+}
+
+/// EmitJumpTableEntry - Emit a jump table entry for the specified MBB to the
+/// current stream.
+void AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned UID) const {
+ assert(MBB && MBB->getNumber() >= 0 && "Invalid basic block");
+ const MCExpr *Value = nullptr;
+ switch (MJTI->getEntryKind()) {
+ case MachineJumpTableInfo::EK_Inline:
+ llvm_unreachable("Cannot emit EK_Inline jump table entry");
+ case MachineJumpTableInfo::EK_Custom32:
+ Value = MF->getSubtarget().getTargetLowering()->LowerCustomJumpTableEntry(
+ MJTI, MBB, UID, OutContext);
+ break;
+ case MachineJumpTableInfo::EK_BlockAddress:
+ // EK_BlockAddress - Each entry is a plain address of block, e.g.:
+ // .word LBB123
+ Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ break;
+ case MachineJumpTableInfo::EK_GPRel32BlockAddress: {
+ // EK_GPRel32BlockAddress - Each entry is an address of block, encoded
+ // with a relocation as gp-relative, e.g.:
+ // .gprel32 LBB123
+ MCSymbol *MBBSym = MBB->getSymbol();
+ OutStreamer->emitGPRel32Value(MCSymbolRefExpr::create(MBBSym, OutContext));
+ return;
+ }
+
+ case MachineJumpTableInfo::EK_GPRel64BlockAddress: {
+ // EK_GPRel64BlockAddress - Each entry is an address of block, encoded
+ // with a relocation as gp-relative, e.g.:
+ // .gpdword LBB123
+ MCSymbol *MBBSym = MBB->getSymbol();
+ OutStreamer->emitGPRel64Value(MCSymbolRefExpr::create(MBBSym, OutContext));
+ return;
+ }
+
+ case MachineJumpTableInfo::EK_LabelDifference32: {
+ // Each entry is the address of the block minus the address of the jump
+ // table. This is used for PIC jump tables where gprel32 is not supported.
+ // e.g.:
+ // .word LBB123 - LJTI1_2
+ // If the .set directive avoids relocations, this is emitted as:
+ // .set L4_5_set_123, LBB123 - LJTI1_2
+ // .word L4_5_set_123
+ if (MAI->doesSetDirectiveSuppressReloc()) {
+ Value = MCSymbolRefExpr::create(GetJTSetSymbol(UID, MBB->getNumber()),
+ OutContext);
+ break;
+ }
+ Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, UID, OutContext);
+ Value = MCBinaryExpr::createSub(Value, Base, OutContext);
+ break;
+ }
+ }
+
+ assert(Value && "Unknown entry kind!");
+
+ unsigned EntrySize = MJTI->getEntrySize(getDataLayout());
+ OutStreamer->emitValue(Value, EntrySize);
+}
+
+/// EmitSpecialLLVMGlobal - Check to see if the specified global is a
+/// special global used by LLVM. If so, emit it and return true, otherwise
+/// do nothing and return false.
+bool AsmPrinter::emitSpecialLLVMGlobal(const GlobalVariable *GV) {
+ if (GV->getName() == "llvm.used") {
+ if (MAI->hasNoDeadStrip()) // No need to emit this at all.
+ emitLLVMUsedList(cast<ConstantArray>(GV->getInitializer()));
+ return true;
+ }
+
+ // Ignore debug and non-emitted data. This handles llvm.compiler.used.
+ if (GV->getSection() == "llvm.metadata" ||
+ GV->hasAvailableExternallyLinkage())
+ return true;
+
+ if (!GV->hasAppendingLinkage()) return false;
+
+ assert(GV->hasInitializer() && "Not a special LLVM global!");
+
+ if (GV->getName() == "llvm.global_ctors") {
+ emitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(),
+ /* isCtor */ true);
+
+ return true;
+ }
+
+ if (GV->getName() == "llvm.global_dtors") {
+ emitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(),
+ /* isCtor */ false);
+
+ return true;
+ }
+
+ report_fatal_error("unknown special variable");
+}
+
+/// EmitLLVMUsedList - For targets that define a MAI::UsedDirective, mark each
+/// global in the specified llvm.used list.
+void AsmPrinter::emitLLVMUsedList(const ConstantArray *InitList) {
+ // Should be an array of 'i8*'.
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+ const GlobalValue *GV =
+ dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
+ if (GV)
+ OutStreamer->emitSymbolAttribute(getSymbol(GV), MCSA_NoDeadStrip);
+ }
+}
+
+void AsmPrinter::preprocessXXStructorList(const DataLayout &DL,
+ const Constant *List,
+ SmallVector<Structor, 8> &Structors) {
+ // Should be an array of '{ i32, void ()*, i8* }' structs. The first value is
+ // the init priority.
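+ // For illustration, a hypothetical module might contain:
+ //   @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }]
+ //     [{ i32, void ()*, i8* } { i32 65535, void ()* @init, i8* null }]
+ // which yields a single Structor with Priority 65535, Func @init, and no
+ // comdat key.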
+ if (!isa<ConstantArray>(List))
+ return;
+
+ // Gather the structors in a form that's convenient for sorting by priority.
+ for (Value *O : cast<ConstantArray>(List)->operands()) {
+ auto *CS = cast<ConstantStruct>(O);
+ if (CS->getOperand(1)->isNullValue())
+ break; // Found a null terminator, skip the rest.
+ ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
+ if (!Priority)
+ continue; // Malformed.
+ Structors.push_back(Structor());
+ Structor &S = Structors.back();
+ S.Priority = Priority->getLimitedValue(65535);
+ S.Func = CS->getOperand(1);
+ if (!CS->getOperand(2)->isNullValue()) {
+ if (TM.getTargetTriple().isOSAIX())
+ llvm::report_fatal_error(
+ "associated data of XXStructor list is not yet supported on AIX");
+ S.ComdatKey =
+ dyn_cast<GlobalValue>(CS->getOperand(2)->stripPointerCasts());
+ }
+ }
+
+ // Sort the structors by ascending init priority; the caller emits them in
+ // the target-specific order.
+ llvm::stable_sort(Structors, [](const Structor &L, const Structor &R) {
+ return L.Priority < R.Priority;
+ });
+}
+
+/// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
+/// priority.
+void AsmPrinter::emitXXStructorList(const DataLayout &DL, const Constant *List,
+ bool IsCtor) {
+ SmallVector<Structor, 8> Structors;
+ preprocessXXStructorList(DL, List, Structors);
+ if (Structors.empty())
+ return;
+
+ // Emit the structors in reverse order if we are using the .ctor/.dtor
+ // initialization scheme.
+ if (!TM.Options.UseInitArray)
+ std::reverse(Structors.begin(), Structors.end());
+
+ const Align Align = DL.getPointerPrefAlignment();
+ for (Structor &S : Structors) {
+ const TargetLoweringObjectFile &Obj = getObjFileLowering();
+ const MCSymbol *KeySym = nullptr;
+ if (GlobalValue *GV = S.ComdatKey) {
+ if (GV->isDeclarationForLinker())
+ // If the associated variable is not defined in this module
+ // (it might be available_externally, or have been an
+ // available_externally definition that was dropped by the
+ // EliminateAvailableExternally pass), some other TU
+ // will provide its dynamic initializer.
+ continue;
+
+ KeySym = getSymbol(GV);
+ }
+
+ MCSection *OutputSection =
+ (IsCtor ? Obj.getStaticCtorSection(S.Priority, KeySym)
+ : Obj.getStaticDtorSection(S.Priority, KeySym));
+ OutStreamer->switchSection(OutputSection);
+ if (OutStreamer->getCurrentSection() != OutStreamer->getPreviousSection())
+ emitAlignment(Align);
+ emitXXStructor(DL, S.Func);
+ }
+}
+
+void AsmPrinter::emitModuleIdents(Module &M) {
+ if (!MAI->hasIdentDirective())
+ return;
+
+ if (const NamedMDNode *NMD = M.getNamedMetadata("llvm.ident")) {
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ const MDNode *N = NMD->getOperand(i);
+ assert(N->getNumOperands() == 1 &&
+ "llvm.ident metadata entry can have only one operand");
+ const MDString *S = cast<MDString>(N->getOperand(0));
+ OutStreamer->emitIdent(S->getString());
+ }
+ }
+}
+
+void AsmPrinter::emitModuleCommandLines(Module &M) {
+ MCSection *CommandLine = getObjFileLowering().getSectionForCommandLines();
+ if (!CommandLine)
+ return;
+
+ const NamedMDNode *NMD = M.getNamedMetadata("llvm.commandline");
+ if (!NMD || !NMD->getNumOperands())
+ return;
+
+ OutStreamer->pushSection();
+ OutStreamer->switchSection(CommandLine);
+ OutStreamer->emitZeros(1);
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ const MDNode *N = NMD->getOperand(i);
+ assert(N->getNumOperands() == 1 &&
+ "llvm.commandline metadata entry can have only one operand");
+ const MDString *S = cast<MDString>(N->getOperand(0));
+ OutStreamer->emitBytes(S->getString());
+ OutStreamer->emitZeros(1);
+ }
+ OutStreamer->popSection();
+}
+
+//===--------------------------------------------------------------------===//
+// Emission and print routines
+//
+
+/// Emit a byte directive and value.
+///
+void AsmPrinter::emitInt8(int Value) const { OutStreamer->emitInt8(Value); }
+
+/// Emit a short directive and value.
+void AsmPrinter::emitInt16(int Value) const { OutStreamer->emitInt16(Value); }
+
+/// Emit a long directive and value.
+void AsmPrinter::emitInt32(int Value) const { OutStreamer->emitInt32(Value); }
+
+/// EmitSLEB128 - emit the specified signed leb128 value.
+void AsmPrinter::emitSLEB128(int64_t Value, const char *Desc) const {
+ if (isVerbose() && Desc)
+ OutStreamer->AddComment(Desc);
+
+ OutStreamer->emitSLEB128IntValue(Value);
+}
+
+void AsmPrinter::emitULEB128(uint64_t Value, const char *Desc,
+ unsigned PadTo) const {
+ if (isVerbose() && Desc)
+ OutStreamer->AddComment(Desc);
+
+ OutStreamer->emitULEB128IntValue(Value, PadTo);
+}
+
+/// Emit a long long directive and value.
+void AsmPrinter::emitInt64(uint64_t Value) const {
+ OutStreamer->emitInt64(Value);
+}
+
+/// Emit something like ".long Hi-Lo" where the size in bytes of the directive
+/// is specified by Size and Hi/Lo specify the labels. This implicitly uses
+/// .set if it avoids relocations.
+void AsmPrinter::emitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
+ unsigned Size) const {
+ OutStreamer->emitAbsoluteSymbolDiff(Hi, Lo, Size);
+}
+
+/// Emit something like ".uleb128 Hi-Lo".
+void AsmPrinter::emitLabelDifferenceAsULEB128(const MCSymbol *Hi,
+ const MCSymbol *Lo) const {
+ OutStreamer->emitAbsoluteSymbolDiffAsULEB128(Hi, Lo);
+}
+
+/// EmitLabelPlusOffset - Emit something like ".long Label+Offset"
+/// where the size in bytes of the directive is specified by Size and Label
+/// specifies the label. This implicitly uses .set if it is available.
+void AsmPrinter::emitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
+ unsigned Size,
+ bool IsSectionRelative) const {
+ if (MAI->needsDwarfSectionOffsetDirective() && IsSectionRelative) {
+ OutStreamer->emitCOFFSecRel32(Label, Offset);
+ if (Size > 4)
+ OutStreamer->emitZeros(Size - 4);
+ return;
+ }
+
+ // Emit Label+Offset (or just Label if Offset is zero)
+ const MCExpr *Expr = MCSymbolRefExpr::create(Label, OutContext);
+ if (Offset)
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
+
+ OutStreamer->emitValue(Expr, Size);
+}
+
+//===----------------------------------------------------------------------===//
+
+// EmitAlignment - Emit an alignment directive to the specified power of
+// two boundary. If a global value is specified, and if that global has
+// an explicit alignment requested, it will override the alignment request
+// if required for correctness.
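+// For example, emitAlignment(Align(16)) in a data section typically lowers to
+// a ".p2align 4" directive on GAS-style targets, while Align(1) emits nothing.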
+void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV,
+ unsigned MaxBytesToEmit) const {
+ if (GV)
+ Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment);
+
+ if (Alignment == Align(1))
+ return; // 1-byte aligned: no need to emit alignment.
+
+ if (getCurrentSection()->getKind().isText()) {
+ const MCSubtargetInfo *STI = nullptr;
+ if (this->MF)
+ STI = &getSubtargetInfo();
+ else
+ STI = TM.getMCSubtargetInfo();
+ OutStreamer->emitCodeAlignment(Alignment, STI, MaxBytesToEmit);
+ } else
+ OutStreamer->emitValueToAlignment(Alignment, 0, 1, MaxBytesToEmit);
+}
+
+//===----------------------------------------------------------------------===//
+// Constant emission.
+//===----------------------------------------------------------------------===//
+
+const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
+ MCContext &Ctx = OutContext;
+
+ if (CV->isNullValue() || isa<UndefValue>(CV))
+ return MCConstantExpr::create(0, Ctx);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+ return MCConstantExpr::create(CI->getZExtValue(), Ctx);
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+ return MCSymbolRefExpr::create(getSymbol(GV), Ctx);
+
+ if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
+ return MCSymbolRefExpr::create(GetBlockAddressSymbol(BA), Ctx);
+
+ if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(CV))
+ return getObjFileLowering().lowerDSOLocalEquivalent(Equiv, TM);
+
+ if (const NoCFIValue *NC = dyn_cast<NoCFIValue>(CV))
+ return MCSymbolRefExpr::create(getSymbol(NC->getGlobalValue()), Ctx);
+
+ const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+ if (!CE) {
+ llvm_unreachable("Unknown constant value to lower!");
+ }
+
+ // The constant expression opcodes are limited to those that are necessary
+ // to represent relocations on supported targets. Expressions involving only
+ // constant addresses are constant folded instead.
+ switch (CE->getOpcode()) {
+ default:
+ break; // Error
+ case Instruction::AddrSpaceCast: {
+ const Constant *Op = CE->getOperand(0);
+ unsigned DstAS = CE->getType()->getPointerAddressSpace();
+ unsigned SrcAS = Op->getType()->getPointerAddressSpace();
+ if (TM.isNoopAddrSpaceCast(SrcAS, DstAS))
+ return lowerConstant(Op);
+
+ break; // Error
+ }
+ case Instruction::GetElementPtr: {
+ // Generate a symbolic expression for the byte address
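+ // For example, a constant GEP over a hypothetical global @arr such as
+ //   getelementptr inbounds ([10 x i32], [10 x i32]* @arr, i64 0, i64 4)
+ // folds to the MCExpr "arr+16" (assuming 4-byte i32).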
+ APInt OffsetAI(getDataLayout().getPointerTypeSizeInBits(CE->getType()), 0);
+ cast<GEPOperator>(CE)->accumulateConstantOffset(getDataLayout(), OffsetAI);
+
+ const MCExpr *Base = lowerConstant(CE->getOperand(0));
+ if (!OffsetAI)
+ return Base;
+
+ int64_t Offset = OffsetAI.getSExtValue();
+ return MCBinaryExpr::createAdd(Base, MCConstantExpr::create(Offset, Ctx),
+ Ctx);
+ }
+
+ case Instruction::Trunc:
+ // We emit the value and depend on the assembler to truncate the generated
+ // expression properly. This is important for differences between
+ // blockaddress labels. Since the two labels are in the same function, it
+ // is reasonable to treat their delta as a 32-bit value.
+ [[fallthrough]];
+ case Instruction::BitCast:
+ return lowerConstant(CE->getOperand(0));
+
+ case Instruction::IntToPtr: {
+ const DataLayout &DL = getDataLayout();
+
+ // Handle casts to pointers by changing them into casts to the appropriate
+ // integer type. This promotes constant folding and simplifies this code.
+ Constant *Op = CE->getOperand(0);
+ Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
+ false/*ZExt*/);
+ return lowerConstant(Op);
+ }
+
+ case Instruction::PtrToInt: {
+ const DataLayout &DL = getDataLayout();
+
+ // Support only foldable casts to/from pointers that can be eliminated by
+ // changing the pointer to the appropriately sized integer type.
+ Constant *Op = CE->getOperand(0);
+ Type *Ty = CE->getType();
+
+ const MCExpr *OpExpr = lowerConstant(Op);
+
+ // We can emit the pointer value into this slot if the slot is an
+ // integer slot equal to the size of the pointer.
+ //
+ // If the pointer is larger than the resultant integer, then
+ // as with Trunc just depend on the assembler to truncate it.
+ if (DL.getTypeAllocSize(Ty).getFixedValue() <=
+ DL.getTypeAllocSize(Op->getType()).getFixedValue())
+ return OpExpr;
+
+ break; // Error
+ }
+
+ case Instruction::Sub: {
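+ // For illustration: a constant such as sub(ptrtoint @a, ptrtoint @b), where
+ // both operands are constant offsets from globals, typically lowers to the
+ // assembler expression "a-b" plus any constant addend, as used for
+ // PIC-relative tables.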
+ GlobalValue *LHSGV;
+ APInt LHSOffset;
+ DSOLocalEquivalent *DSOEquiv;
+ if (IsConstantOffsetFromGlobal(CE->getOperand(0), LHSGV, LHSOffset,
+ getDataLayout(), &DSOEquiv)) {
+ GlobalValue *RHSGV;
+ APInt RHSOffset;
+ if (IsConstantOffsetFromGlobal(CE->getOperand(1), RHSGV, RHSOffset,
+ getDataLayout())) {
+ const MCExpr *RelocExpr =
+ getObjFileLowering().lowerRelativeReference(LHSGV, RHSGV, TM);
+ if (!RelocExpr) {
+ const MCExpr *LHSExpr =
+ MCSymbolRefExpr::create(getSymbol(LHSGV), Ctx);
+ if (DSOEquiv &&
+ getObjFileLowering().supportDSOLocalEquivalentLowering())
+ LHSExpr =
+ getObjFileLowering().lowerDSOLocalEquivalent(DSOEquiv, TM);
+ RelocExpr = MCBinaryExpr::createSub(
+ LHSExpr, MCSymbolRefExpr::create(getSymbol(RHSGV), Ctx), Ctx);
+ }
+ int64_t Addend = (LHSOffset - RHSOffset).getSExtValue();
+ if (Addend != 0)
+ RelocExpr = MCBinaryExpr::createAdd(
+ RelocExpr, MCConstantExpr::create(Addend, Ctx), Ctx);
+ return RelocExpr;
+ }
+ }
+
+ const MCExpr *LHS = lowerConstant(CE->getOperand(0));
+ const MCExpr *RHS = lowerConstant(CE->getOperand(1));
+ return MCBinaryExpr::createSub(LHS, RHS, Ctx);
+ break;
+ }
+
+ case Instruction::Add: {
+ const MCExpr *LHS = lowerConstant(CE->getOperand(0));
+ const MCExpr *RHS = lowerConstant(CE->getOperand(1));
+ return MCBinaryExpr::createAdd(LHS, RHS, Ctx);
+ }
+ }
+
+ // If the code isn't optimized, there may be outstanding folding
+ // opportunities. Attempt to fold the expression using DataLayout as a
+ // last resort before giving up.
+ Constant *C = ConstantFoldConstant(CE, getDataLayout());
+ if (C != CE)
+ return lowerConstant(C);
+
+ // Otherwise report the problem to the user.
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "Unsupported expression in static initializer: ";
+ CE->printAsOperand(OS, /*PrintType=*/false,
+ !MF ? nullptr : MF->getFunction().getParent());
+ report_fatal_error(Twine(OS.str()));
+}
+
+static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C,
+ AsmPrinter &AP,
+ const Constant *BaseCV = nullptr,
+ uint64_t Offset = 0,
+ AsmPrinter::AliasMapTy *AliasList = nullptr);
+
+static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP);
+static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP);
+
+/// isRepeatedByteSequence - Determine whether the given value is
+/// composed of a repeated sequence of identical bytes and return the
+/// byte value. If it is not a repeated sequence, return -1.
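+/// For example, the bytes of c"aaaa" yield 97 ('a'), while c"abcd" yields -1.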
+static int isRepeatedByteSequence(const ConstantDataSequential *V) {
+ StringRef Data = V->getRawDataValues();
+ assert(!Data.empty() && "Empty aggregates should be CAZ node");
+ char C = Data[0];
+ for (unsigned i = 1, e = Data.size(); i != e; ++i)
+ if (Data[i] != C) return -1;
+ return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.
+}
+
+/// isRepeatedByteSequence - Determine whether the given value is
+/// composed of a repeated sequence of identical bytes and return the
+/// byte value. If it is not a repeated sequence, return -1.
+static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ uint64_t Size = DL.getTypeAllocSizeInBits(V->getType());
+ assert(Size % 8 == 0);
+
+ // Extend the element to take zero padding into account.
+ APInt Value = CI->getValue().zext(Size);
+ if (!Value.isSplat(8))
+ return -1;
+
+ return Value.zextOrTrunc(8).getZExtValue();
+ }
+ if (const ConstantArray *CA = dyn_cast<ConstantArray>(V)) {
+ // Make sure all array elements are sequences of the same repeated
+ // byte.
+ assert(CA->getNumOperands() != 0 && "Should be a CAZ");
+ Constant *Op0 = CA->getOperand(0);
+ int Byte = isRepeatedByteSequence(Op0, DL);
+ if (Byte == -1)
+ return -1;
+
+ // All array elements must be equal.
+ for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i)
+ if (CA->getOperand(i) != Op0)
+ return -1;
+ return Byte;
+ }
+
+ if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V))
+ return isRepeatedByteSequence(CDS);
+
+ return -1;
+}
+
+static void emitGlobalAliasInline(AsmPrinter &AP, uint64_t Offset,
+ AsmPrinter::AliasMapTy *AliasList) {
+ if (AliasList) {
+ auto AliasIt = AliasList->find(Offset);
+ if (AliasIt != AliasList->end()) {
+ for (const GlobalAlias *GA : AliasIt->second)
+ AP.OutStreamer->emitLabel(AP.getSymbol(GA));
+ AliasList->erase(Offset);
+ }
+ }
+}
+
+static void emitGlobalConstantDataSequential(
+ const DataLayout &DL, const ConstantDataSequential *CDS, AsmPrinter &AP,
+ AsmPrinter::AliasMapTy *AliasList) {
+ // See if we can aggregate this into a .fill; if so, emit it as such.
+ int Value = isRepeatedByteSequence(CDS, DL);
+ if (Value != -1) {
+ uint64_t Bytes = DL.getTypeAllocSize(CDS->getType());
+ // Don't emit a 1-byte object as a .fill.
+ if (Bytes > 1)
+ return AP.OutStreamer->emitFill(Bytes, Value);
+ }
+
+ // If this can be emitted with .ascii/.asciz, emit it as such.
+ if (CDS->isString())
+ return AP.OutStreamer->emitBytes(CDS->getAsString());
+
+ // Otherwise, emit the values in successive locations.
+ unsigned ElementByteSize = CDS->getElementByteSize();
+ if (isa<IntegerType>(CDS->getElementType())) {
+ for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) {
+ emitGlobalAliasInline(AP, ElementByteSize * I, AliasList);
+ if (AP.isVerbose())
+ AP.OutStreamer->getCommentOS()
+ << format("0x%" PRIx64 "\n", CDS->getElementAsInteger(I));
+ AP.OutStreamer->emitIntValue(CDS->getElementAsInteger(I),
+ ElementByteSize);
+ }
+ } else {
+ Type *ET = CDS->getElementType();
+ for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) {
+ emitGlobalAliasInline(AP, ElementByteSize * I, AliasList);
+ emitGlobalConstantFP(CDS->getElementAsAPFloat(I), ET, AP);
+ }
+ }
+
+ unsigned Size = DL.getTypeAllocSize(CDS->getType());
+ unsigned EmittedSize =
+ DL.getTypeAllocSize(CDS->getElementType()) * CDS->getNumElements();
+ assert(EmittedSize <= Size && "Size cannot be less than EmittedSize!");
+ if (unsigned Padding = Size - EmittedSize)
+ AP.OutStreamer->emitZeros(Padding);
+}
+
+static void emitGlobalConstantArray(const DataLayout &DL,
+ const ConstantArray *CA, AsmPrinter &AP,
+ const Constant *BaseCV, uint64_t Offset,
+ AsmPrinter::AliasMapTy *AliasList) {
+ // See if we can aggregate some values. Make sure it can be
+ // represented as a series of bytes of the constant value.
+ int Value = isRepeatedByteSequence(CA, DL);
+
+ if (Value != -1) {
+ uint64_t Bytes = DL.getTypeAllocSize(CA->getType());
+ AP.OutStreamer->emitFill(Bytes, Value);
+ } else {
+ for (unsigned I = 0, E = CA->getNumOperands(); I != E; ++I) {
+ emitGlobalConstantImpl(DL, CA->getOperand(I), AP, BaseCV, Offset,
+ AliasList);
+ Offset += DL.getTypeAllocSize(CA->getOperand(I)->getType());
+ }
+ }
+}
+
+static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP);
+
+static void emitGlobalConstantVector(const DataLayout &DL,
+ const ConstantVector *CV, AsmPrinter &AP,
+ AsmPrinter::AliasMapTy *AliasList) {
+ Type *ElementType = CV->getType()->getElementType();
+ uint64_t ElementSizeInBits = DL.getTypeSizeInBits(ElementType);
+ uint64_t ElementAllocSizeInBits = DL.getTypeAllocSizeInBits(ElementType);
+ uint64_t EmittedSize;
+ if (ElementSizeInBits != ElementAllocSizeInBits) {
+ // If the allocation size of an element is different from the size in bits,
+ // printing each element separately will insert incorrect padding.
+ //
+ // The general algorithm here is complicated; instead of writing it out
+ // here, just use the existing code in ConstantFolding.
+ Type *IntT =
+ IntegerType::get(CV->getContext(), DL.getTypeSizeInBits(CV->getType()));
+ ConstantInt *CI = dyn_cast_or_null<ConstantInt>(ConstantFoldConstant(
+ ConstantExpr::getBitCast(const_cast<ConstantVector *>(CV), IntT), DL));
+ if (!CI) {
+ report_fatal_error(
+ "Cannot lower vector global with unusual element type");
+ }
+ emitGlobalAliasInline(AP, 0, AliasList);
+ emitGlobalConstantLargeInt(CI, AP);
+ EmittedSize = DL.getTypeStoreSize(CV->getType());
+ } else {
+ for (unsigned I = 0, E = CV->getType()->getNumElements(); I != E; ++I) {
+ emitGlobalAliasInline(AP, DL.getTypeAllocSize(CV->getType()) * I, AliasList);
+ emitGlobalConstantImpl(DL, CV->getOperand(I), AP);
+ }
+ EmittedSize =
+ DL.getTypeAllocSize(ElementType) * CV->getType()->getNumElements();
+ }
+
+ unsigned Size = DL.getTypeAllocSize(CV->getType());
+ if (unsigned Padding = Size - EmittedSize)
+ AP.OutStreamer->emitZeros(Padding);
+}
+
+static void emitGlobalConstantStruct(const DataLayout &DL,
+ const ConstantStruct *CS, AsmPrinter &AP,
+ const Constant *BaseCV, uint64_t Offset,
+ AsmPrinter::AliasMapTy *AliasList) {
+ // Print the fields in successive locations. Pad to align if needed!
+ unsigned Size = DL.getTypeAllocSize(CS->getType());
+ const StructLayout *Layout = DL.getStructLayout(CS->getType());
+ uint64_t SizeSoFar = 0;
+ for (unsigned I = 0, E = CS->getNumOperands(); I != E; ++I) {
+ const Constant *Field = CS->getOperand(I);
+
+ // Print the actual field value.
+ emitGlobalConstantImpl(DL, Field, AP, BaseCV, Offset + SizeSoFar,
+ AliasList);
+
+ // Check if padding is needed and insert one or more 0s.
+ uint64_t FieldSize = DL.getTypeAllocSize(Field->getType());
+ uint64_t PadSize = ((I == E - 1 ? Size : Layout->getElementOffset(I + 1)) -
+ Layout->getElementOffset(I)) -
+ FieldSize;
+ SizeSoFar += FieldSize + PadSize;
+
+ // Insert padding - this may include padding to increase the size of the
+ // current field up to the ABI size (if the struct is not packed) as well
+ // as padding to ensure that the next field starts at the right offset.
+ AP.OutStreamer->emitZeros(PadSize);
+ }
+ assert(SizeSoFar == Layout->getSizeInBytes() &&
+ "Layout of constant struct may be incorrect!");
+}
+
+static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) {
+ assert(ET && "Unknown float type");
+ APInt API = APF.bitcastToAPInt();
+
+ // First print a comment with what we think the original floating-point value
+ // should have been.
+ if (AP.isVerbose()) {
+ SmallString<8> StrVal;
+ APF.toString(StrVal);
+ ET->print(AP.OutStreamer->getCommentOS());
+ AP.OutStreamer->getCommentOS() << ' ' << StrVal << '\n';
+ }
+
+ // Now iterate through the APInt chunks, emitting them in endian-correct
+ // order, possibly with a smaller chunk at beginning/end (e.g. for x87 80-bit
+ // floats).
+ unsigned NumBytes = API.getBitWidth() / 8;
+ unsigned TrailingBytes = NumBytes % sizeof(uint64_t);
+ const uint64_t *p = API.getRawData();
+
+ // PPC's long double has odd notions of endianness compared to how LLVM
+ // handles it: p[0] goes first for *big* endian on PPC.
+ if (AP.getDataLayout().isBigEndian() && !ET->isPPC_FP128Ty()) {
+ int Chunk = API.getNumWords() - 1;
+
+ if (TrailingBytes)
+ AP.OutStreamer->emitIntValueInHexWithPadding(p[Chunk--], TrailingBytes);
+
+ for (; Chunk >= 0; --Chunk)
+ AP.OutStreamer->emitIntValueInHexWithPadding(p[Chunk], sizeof(uint64_t));
+ } else {
+ unsigned Chunk;
+ for (Chunk = 0; Chunk < NumBytes / sizeof(uint64_t); ++Chunk)
+ AP.OutStreamer->emitIntValueInHexWithPadding(p[Chunk], sizeof(uint64_t));
+
+ if (TrailingBytes)
+ AP.OutStreamer->emitIntValueInHexWithPadding(p[Chunk], TrailingBytes);
+ }
+
+ // Emit the tail padding for the long double.
+ const DataLayout &DL = AP.getDataLayout();
+ AP.OutStreamer->emitZeros(DL.getTypeAllocSize(ET) - DL.getTypeStoreSize(ET));
+}
+
+static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
+ emitGlobalConstantFP(CFP->getValueAPF(), CFP->getType(), AP);
+}
+
+static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
+ const DataLayout &DL = AP.getDataLayout();
+ unsigned BitWidth = CI->getBitWidth();
+
+ // Copy the value as we may massage the layout for constants whose bit width
+ // is not a multiple of 64 bits.
+ APInt Realigned(CI->getValue());
+ uint64_t ExtraBits = 0;
+ unsigned ExtraBitsSize = BitWidth & 63;
+
+ if (ExtraBitsSize) {
+ // The bit width of the data is not a multiple of 64 bits.
+ // The extra bits are expected to be at the end of the chunk of memory.
+ // Little endian:
+ // * Nothing to be done, just record the extra bits to emit.
+ // Big endian:
+ // * Record the extra bits to emit.
+ // * Realign the raw data to emit the 64-bit chunks.
+ if (DL.isBigEndian()) {
+ // Basically the structure of the raw data is a chunk of 64-bits cells:
+ // 0 1 BitWidth / 64
+ // [chunk1][chunk2] ... [chunkN].
+ // The most significant chunk is chunkN and it should be emitted first.
+ // However, due to the alignment issue chunkN contains useless bits.
+ // Realign the chunks so that they contain only useful information:
+ // ExtraBits 0 1 (BitWidth / 64) - 1
+ // chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN]
+ ExtraBitsSize = alignTo(ExtraBitsSize, 8);
+ ExtraBits = Realigned.getRawData()[0] &
+ (((uint64_t)-1) >> (64 - ExtraBitsSize));
+ if (BitWidth >= 64)
+ Realigned.lshrInPlace(ExtraBitsSize);
+ } else
+ ExtraBits = Realigned.getRawData()[BitWidth / 64];
+ }
+
+ // We don't expect assemblers to support integer data directives
+ // for more than 64 bits, so we emit the data in at most 64-bit
+ // quantities at a time.
+ const uint64_t *RawData = Realigned.getRawData();
+ for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
+ uint64_t Val = DL.isBigEndian() ? RawData[e - i - 1] : RawData[i];
+ AP.OutStreamer->emitIntValue(Val, 8);
+ }
+
+ if (ExtraBitsSize) {
+ // Emit the extra bits after the 64-bit chunks.
+
+ // Emit a directive that fills the expected size.
+ uint64_t Size = AP.getDataLayout().getTypeStoreSize(CI->getType());
+ Size -= (BitWidth / 64) * 8;
+ assert(Size && Size * 8 >= ExtraBitsSize &&
+ (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize)))
+ == ExtraBits && "Directive too small for extra bits.");
+ AP.OutStreamer->emitIntValue(ExtraBits, Size);
+ }
+}
+
+/// Transform a non-absolute MCExpr containing a reference to a GOT-equivalent
+/// global into a target-specific GOT PC-relative access to the final symbol.
+static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME,
+ const Constant *BaseCst,
+ uint64_t Offset) {
+ // The global @foo below illustrates a global that uses a got equivalent.
+ //
+ // @bar = global i32 42
+ // @gotequiv = private unnamed_addr constant i32* @bar
+ // @foo = i32 trunc (i64 sub (i64 ptrtoint (i32** @gotequiv to i64),
+ // i64 ptrtoint (i32* @foo to i64))
+ // to i32)
+ //
+ // The cstexpr in @foo is converted into the MCExpr `ME`, where we actually
+ // check whether @foo is suitable to use a GOTPCREL. `ME` is usually in the
+ // form:
+ //
+ // foo = cstexpr, where
+ // cstexpr := <gotequiv> - "." + <cst>
+ // cstexpr := <gotequiv> - (<foo> - <offset from @foo base>) + <cst>
+ //
+ // After canonicalization by evaluateAsRelocatable `ME` turns into:
+ //
+ // cstexpr := <gotequiv> - <foo> + gotpcrelcst, where
+ // gotpcrelcst := <offset from @foo base> + <cst>
+ MCValue MV;
+ if (!(*ME)->evaluateAsRelocatable(MV, nullptr, nullptr) || MV.isAbsolute())
+ return;
+ const MCSymbolRefExpr *SymA = MV.getSymA();
+ if (!SymA)
+ return;
+
+ // Check that GOT equivalent symbol is cached.
+ const MCSymbol *GOTEquivSym = &SymA->getSymbol();
+ if (!AP.GlobalGOTEquivs.count(GOTEquivSym))
+ return;
+
+ const GlobalValue *BaseGV = dyn_cast_or_null<GlobalValue>(BaseCst);
+ if (!BaseGV)
+ return;
+
+ // Check for a valid base symbol
+ const MCSymbol *BaseSym = AP.getSymbol(BaseGV);
+ const MCSymbolRefExpr *SymB = MV.getSymB();
+
+ if (!SymB || BaseSym != &SymB->getSymbol())
+ return;
+
+ // Make sure to match:
+ //
+ // gotpcrelcst := <offset from @foo base> + <cst>
+ //
+ // If gotpcrelcst is positive it means that we can safely fold the pc rel
+ // displacement into the GOTPCREL. We can also have an extra offset <cst>
+ // if the target knows how to encode it.
+ int64_t GOTPCRelCst = Offset + MV.getConstant();
+ if (GOTPCRelCst < 0)
+ return;
+ if (!AP.getObjFileLowering().supportGOTPCRelWithOffset() && GOTPCRelCst != 0)
+ return;
+
+ // Emit the GOT PC relative to replace the got equivalent global, i.e.:
+ //
+ // bar:
+ // .long 42
+ // gotequiv:
+ // .quad bar
+ // foo:
+ // .long gotequiv - "." + <cst>
+ //
+ // is replaced by the target specific equivalent to:
+ //
+ // bar:
+ // .long 42
+ // foo:
+ // .long bar@GOTPCREL+<gotpcrelcst>
+ AsmPrinter::GOTEquivUsePair Result = AP.GlobalGOTEquivs[GOTEquivSym];
+ const GlobalVariable *GV = Result.first;
+ int NumUses = (int)Result.second;
+ const GlobalValue *FinalGV = dyn_cast<GlobalValue>(GV->getOperand(0));
+ const MCSymbol *FinalSym = AP.getSymbol(FinalGV);
+ *ME = AP.getObjFileLowering().getIndirectSymViaGOTPCRel(
+ FinalGV, FinalSym, MV, Offset, AP.MMI, *AP.OutStreamer);
+
+ // Update GOT equivalent usage information
+ --NumUses;
+ if (NumUses >= 0)
+ AP.GlobalGOTEquivs[GOTEquivSym] = std::make_pair(GV, NumUses);
+}
+
+static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
+ AsmPrinter &AP, const Constant *BaseCV,
+ uint64_t Offset,
+ AsmPrinter::AliasMapTy *AliasList) {
+ emitGlobalAliasInline(AP, Offset, AliasList);
+ uint64_t Size = DL.getTypeAllocSize(CV->getType());
+
+ // Globals with sub-elements such as combinations of arrays and structs
+ // are handled recursively by emitGlobalConstantImpl. Keep track of the
+ // constant symbol base and the current position with BaseCV and Offset.
+ if (!BaseCV && CV->hasOneUse())
+ BaseCV = dyn_cast<Constant>(CV->user_back());
+
+ if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
+ return AP.OutStreamer->emitZeros(Size);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ const uint64_t StoreSize = DL.getTypeStoreSize(CV->getType());
+
+ if (StoreSize <= 8) {
+ if (AP.isVerbose())
+ AP.OutStreamer->getCommentOS()
+ << format("0x%" PRIx64 "\n", CI->getZExtValue());
+ AP.OutStreamer->emitIntValue(CI->getZExtValue(), StoreSize);
+ } else {
+ emitGlobalConstantLargeInt(CI, AP);
+ }
+
+ // Emit tail padding if needed
+ if (Size != StoreSize)
+ AP.OutStreamer->emitZeros(Size - StoreSize);
+
+ return;
+ }
+
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV))
+ return emitGlobalConstantFP(CFP, AP);
+
+ if (isa<ConstantPointerNull>(CV)) {
+ AP.OutStreamer->emitIntValue(0, Size);
+ return;
+ }
+
+ if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV))
+ return emitGlobalConstantDataSequential(DL, CDS, AP, AliasList);
+
+ if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV))
+ return emitGlobalConstantArray(DL, CVA, AP, BaseCV, Offset, AliasList);
+
+ if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
+ return emitGlobalConstantStruct(DL, CVS, AP, BaseCV, Offset, AliasList);
+
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+ // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of
+ // vectors).
+ if (CE->getOpcode() == Instruction::BitCast)
+ return emitGlobalConstantImpl(DL, CE->getOperand(0), AP);
+
+ if (Size > 8) {
+ // If the constant expression's size is greater than 64-bits, then we have
+ // to emit the value in chunks. Try to constant fold the value and emit it
+ // that way.
+ Constant *New = ConstantFoldConstant(CE, DL);
+ if (New != CE)
+ return emitGlobalConstantImpl(DL, New, AP);
+ }
+ }
+
+ if (const ConstantVector *V = dyn_cast<ConstantVector>(CV))
+ return emitGlobalConstantVector(DL, V, AP, AliasList);
+
+ // Otherwise, it must be a ConstantExpr. Lower it to an MCExpr, then emit it
+ // through the streamer with emitValue.
+ const MCExpr *ME = AP.lowerConstant(CV);
+
+ // Since lowerConstant already folded and got rid of all IR pointer and
+ // integer casts, detect GOT equivalent accesses by looking into the MCExpr
+ // directly.
+ if (AP.getObjFileLowering().supportIndirectSymViaGOTPCRel())
+ handleIndirectSymViaGOTPCRel(AP, &ME, BaseCV, Offset);
+
+ AP.OutStreamer->emitValue(ME, Size);
+}
+
+/// EmitGlobalConstant - Print a general LLVM constant to the .s file.
+void AsmPrinter::emitGlobalConstant(const DataLayout &DL, const Constant *CV,
+ AliasMapTy *AliasList) {
+ uint64_t Size = DL.getTypeAllocSize(CV->getType());
+ if (Size)
+ emitGlobalConstantImpl(DL, CV, *this, nullptr, 0, AliasList);
+ else if (MAI->hasSubsectionsViaSymbols()) {
+ // If the global has zero size, emit a single byte so that two labels don't
+ // look like they are at the same location.
+ OutStreamer->emitIntValue(0, 1);
+ }
+ if (!AliasList)
+ return;
+ // TODO: These remaining aliases are not emitted in the correct location. Need
+ // to handle the case where the alias offset doesn't refer to any sub-element.
+ for (auto &AliasPair : *AliasList) {
+ for (const GlobalAlias *GA : AliasPair.second)
+ OutStreamer->emitLabel(getSymbol(GA));
+ }
+}
+
+void AsmPrinter::emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+ // Target doesn't support this yet!
+ llvm_unreachable("Target does not support EmitMachineConstantPoolValue");
+}
+
+void AsmPrinter::printOffset(int64_t Offset, raw_ostream &OS) const {
+ if (Offset > 0)
+ OS << '+' << Offset;
+ else if (Offset < 0)
+ OS << Offset;
+}
+
+void AsmPrinter::emitNops(unsigned N) {
+ MCInst Nop = MF->getSubtarget().getInstrInfo()->getNop();
+ for (; N; --N)
+ EmitToStreamer(*OutStreamer, Nop);
+}
+
+//===----------------------------------------------------------------------===//
+// Symbol Lowering Routines.
+//===----------------------------------------------------------------------===//
+
+MCSymbol *AsmPrinter::createTempSymbol(const Twine &Name) const {
+ return OutContext.createTempSymbol(Name, true);
+}
+
+MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BlockAddress *BA) const {
+ return const_cast<AsmPrinter *>(this)->getAddrLabelSymbol(
+ BA->getBasicBlock());
+}
+
+MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const {
+ return const_cast<AsmPrinter *>(this)->getAddrLabelSymbol(BB);
+}
+
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
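+/// With the usual ELF private prefix this produces a name like ".LCPI3_7"
+/// (function 3, constant pool index 7); MSVC environments may instead return
+/// the COMDAT symbol of the constant's section, as handled below.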
+MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
+ if (getSubtargetInfo().getTargetTriple().isWindowsMSVCEnvironment()) {
+ const MachineConstantPoolEntry &CPE =
+ MF->getConstantPool()->getConstants()[CPID];
+ if (!CPE.isMachineConstantPoolEntry()) {
+ const DataLayout &DL = MF->getDataLayout();
+ SectionKind Kind = CPE.getSectionKind(&DL);
+ const Constant *C = CPE.Val.ConstVal;
+ Align Alignment = CPE.Alignment;
+ if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
+ getObjFileLowering().getSectionForConstant(DL, Kind, C,
+ Alignment))) {
+ if (MCSymbol *Sym = S->getCOMDATSymbol()) {
+ if (Sym->isUndefined())
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Global);
+ return Sym;
+ }
+ }
+ }
+ }
+
+ const DataLayout &DL = getDataLayout();
+ return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ "CPI" + Twine(getFunctionNumber()) + "_" +
+ Twine(CPID));
+}
+
+/// GetJTISymbol - Return the symbol for the specified jump table entry.
+MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const {
+ return MF->getJTISymbol(JTID, OutContext, isLinkerPrivate);
+}
+
+/// GetJTSetSymbol - Return the symbol for the specified jump table .set
+/// FIXME: privatize to AsmPrinter.
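+/// With the usual ELF private prefix this produces a name like ".L2_0_set_5"
+/// (function 2, jump table 0, basic block 5).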
+MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const {
+ const DataLayout &DL = getDataLayout();
+ return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ Twine(getFunctionNumber()) + "_" +
+ Twine(UID) + "_set_" + Twine(MBBID));
+}
+
+MCSymbol *AsmPrinter::getSymbolWithGlobalValueBase(const GlobalValue *GV,
+ StringRef Suffix) const {
+ return getObjFileLowering().getSymbolWithGlobalValueBase(GV, Suffix, TM);
+}
+
+/// Return the MCSymbol for the specified ExternalSymbol.
+MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const {
+ SmallString<60> NameStr;
+ Mangler::getNameWithPrefix(NameStr, Sym, getDataLayout());
+ return OutContext.getOrCreateSymbol(NameStr);
+}
+
+/// PrintParentLoopComment - Print comments about parent loops of this one.
+static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,
+ unsigned FunctionNumber) {
+ if (!Loop) return;
+ PrintParentLoopComment(OS, Loop->getParentLoop(), FunctionNumber);
+ OS.indent(Loop->getLoopDepth()*2)
+ << "Parent Loop BB" << FunctionNumber << "_"
+ << Loop->getHeader()->getNumber()
+ << " Depth=" << Loop->getLoopDepth() << '\n';
+}
+
+/// PrintChildLoopComment - Print comments about child loops within
+/// the loop for this basic block, with nesting.
+static void PrintChildLoopComment(raw_ostream &OS, const MachineLoop *Loop,
+ unsigned FunctionNumber) {
+ // Add child loop information
+ for (const MachineLoop *CL : *Loop) {
+ OS.indent(CL->getLoopDepth()*2)
+ << "Child Loop BB" << FunctionNumber << "_"
+ << CL->getHeader()->getNumber() << " Depth " << CL->getLoopDepth()
+ << '\n';
+ PrintChildLoopComment(OS, CL, FunctionNumber);
+ }
+}
+
+/// emitBasicBlockLoopComments - Pretty-print comments for basic blocks.
+static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
+ const MachineLoopInfo *LI,
+ const AsmPrinter &AP) {
+ // Add loop depth information
+ const MachineLoop *Loop = LI->getLoopFor(&MBB);
+ if (!Loop) return;
+
+ MachineBasicBlock *Header = Loop->getHeader();
+ assert(Header && "No header for loop");
+
+ // If this block is not a loop header, just print out which block is the loop
+ // header and return.
+ if (Header != &MBB) {
+ AP.OutStreamer->AddComment(" in Loop: Header=BB" +
+ Twine(AP.getFunctionNumber())+"_" +
+ Twine(Loop->getHeader()->getNumber())+
+ " Depth="+Twine(Loop->getLoopDepth()));
+ return;
+ }
+
+ // Otherwise, it is a loop header. Print out information about child and
+ // parent loops.
+ raw_ostream &OS = AP.OutStreamer->getCommentOS();
+
+ PrintParentLoopComment(OS, Loop->getParentLoop(), AP.getFunctionNumber());
+
+ OS << "=>";
+ OS.indent(Loop->getLoopDepth()*2-2);
+
+ OS << "This ";
+ if (Loop->isInnermost())
+ OS << "Inner ";
+ OS << "Loop Header: Depth=" + Twine(Loop->getLoopDepth()) << '\n';
+
+ PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());
+}
+
+/// emitBasicBlockStart - This method prints the label for the specified
+/// MachineBasicBlock, an alignment (if present) and a comment describing
+/// it if appropriate.
+void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
+ // End the previous funclet and start a new one.
+ if (MBB.isEHFuncletEntry()) {
+ for (const HandlerInfo &HI : Handlers) {
+ HI.Handler->endFunclet();
+ HI.Handler->beginFunclet(MBB);
+ }
+ }
+
+ // Switch to a new section if this basic block must begin a section. The
+ // entry block is always placed in the function section and is handled
+ // separately.
+ if (MBB.isBeginSection() && !MBB.isEntryBlock()) {
+ OutStreamer->switchSection(
+ getObjFileLowering().getSectionForMachineBasicBlock(MF->getFunction(),
+ MBB, TM));
+ CurrentSectionBeginSym = MBB.getSymbol();
+ }
+
+ // Emit an alignment directive for this block, if needed.
+ const Align Alignment = MBB.getAlignment();
+ if (Alignment != Align(1))
+ emitAlignment(Alignment, nullptr, MBB.getMaxBytesForAlignment());
+
+ // If the block has its address taken, emit any labels that were used to
+ // reference the block. It is possible that there is more than one label
+ // here, because multiple LLVM BB's may have been RAUW'd to this block after
+ // the references were generated.
+ if (MBB.isIRBlockAddressTaken()) {
+ if (isVerbose())
+ OutStreamer->AddComment("Block address taken");
+
+ BasicBlock *BB = MBB.getAddressTakenIRBlock();
+ assert(BB && BB->hasAddressTaken() && "Missing BB");
+ for (MCSymbol *Sym : getAddrLabelSymbolToEmit(BB))
+ OutStreamer->emitLabel(Sym);
+ } else if (isVerbose() && MBB.isMachineBlockAddressTaken()) {
+ OutStreamer->AddComment("Block address taken");
+ }
+
+ // Print some verbose block comments.
+ if (isVerbose()) {
+ if (const BasicBlock *BB = MBB.getBasicBlock()) {
+ if (BB->hasName()) {
+ BB->printAsOperand(OutStreamer->getCommentOS(),
+ /*PrintType=*/false, BB->getModule());
+ OutStreamer->getCommentOS() << '\n';
+ }
+ }
+
+ assert(MLI != nullptr && "MachineLoopInfo should have been computed");
+ emitBasicBlockLoopComments(MBB, MLI, *this);
+ }
+
+ // Print the main label for the block.
+ if (shouldEmitLabelForBasicBlock(MBB)) {
+ if (isVerbose() && MBB.hasLabelMustBeEmitted())
+ OutStreamer->AddComment("Label of block must be emitted");
+ OutStreamer->emitLabel(MBB.getSymbol());
+ } else {
+ if (isVerbose()) {
+ // NOTE: Want this comment at start of line, don't emit with AddComment.
+ OutStreamer->emitRawComment(" %bb." + Twine(MBB.getNumber()) + ":",
+ false);
+ }
+ }
+
+ if (MBB.isEHCatchretTarget() &&
+ MAI->getExceptionHandlingType() == ExceptionHandling::WinEH) {
+ OutStreamer->emitLabel(MBB.getEHCatchretSymbol());
+ }
+
+ // With BB sections, each basic block must handle CFI information on its own
+ // if it begins a section (Entry block call is handled separately, next to
+ // beginFunction).
+ if (MBB.isBeginSection() && !MBB.isEntryBlock())
+ for (const HandlerInfo &HI : Handlers)
+ HI.Handler->beginBasicBlockSection(MBB);
+}
+
+void AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
+ // Check if CFI information needs to be updated for this MBB with basic block
+ // sections.
+ if (MBB.isEndSection())
+ for (const HandlerInfo &HI : Handlers)
+ HI.Handler->endBasicBlockSection(MBB);
+}
+
+void AsmPrinter::emitVisibility(MCSymbol *Sym, unsigned Visibility,
+ bool IsDefinition) const {
+ MCSymbolAttr Attr = MCSA_Invalid;
+
+ switch (Visibility) {
+ default: break;
+ case GlobalValue::HiddenVisibility:
+ if (IsDefinition)
+ Attr = MAI->getHiddenVisibilityAttr();
+ else
+ Attr = MAI->getHiddenDeclarationVisibilityAttr();
+ break;
+ case GlobalValue::ProtectedVisibility:
+ Attr = MAI->getProtectedVisibilityAttr();
+ break;
+ }
+
+ if (Attr != MCSA_Invalid)
+ OutStreamer->emitSymbolAttribute(Sym, Attr);
+}
+
+bool AsmPrinter::shouldEmitLabelForBasicBlock(
+ const MachineBasicBlock &MBB) const {
+ // With `-fbasic-block-sections=`, a label is needed for every non-entry block
+ // in the labels mode (option `=labels`) and every section beginning in the
+ // sections mode (`=all` and `=list=`).
+ if ((MF->hasBBLabels() || MBB.isBeginSection()) && !MBB.isEntryBlock())
+ return true;
+ // A label is needed for any block with at least one predecessor (when that
+ // predecessor is not the fallthrough predecessor, or if it is an EH funclet
+ // entry, or if a label is forced).
+ return !MBB.pred_empty() &&
+ (!isBlockOnlyReachableByFallthrough(&MBB) || MBB.isEHFuncletEntry() ||
+ MBB.hasLabelMustBeEmitted());
+}
+
+/// isBlockOnlyReachableByFallthrough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+bool AsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+ // If this is a landing pad, it isn't a fall through. If it has no preds,
+ // then nothing falls through to it.
+ if (MBB->isEHPad() || MBB->pred_empty())
+ return false;
+
+ // If there isn't exactly one predecessor, it can't be a fall through.
+ if (MBB->pred_size() > 1)
+ return false;
+
+ // The predecessor has to be immediately before this block.
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ if (!Pred->isLayoutSuccessor(MBB))
+ return false;
+
+ // If the block is completely empty, then it definitely does fall through.
+ if (Pred->empty())
+ return true;
+
+ // Check the terminators in the previous block.
+ for (const auto &MI : Pred->terminators()) {
+ // If it is not a simple branch, we are in a table somewhere.
+ if (!MI.isBranch() || MI.isIndirectBranch())
+ return false;
+
+ // If this block is an operand of one of the branches, this is not a fall
+ // through. Note that targets with delay slots will usually bundle
+ // terminators with the delay slot instruction.
+ for (ConstMIBundleOperands OP(MI); OP.isValid(); ++OP) {
+ if (OP->isJTI())
+ return false;
+ if (OP->isMBB() && OP->getMBB() == MBB)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+GCMetadataPrinter *AsmPrinter::getOrCreateGCPrinter(GCStrategy &S) {
+ if (!S.usesMetadata())
+ return nullptr;
+
+ auto [GCPI, Inserted] = GCMetadataPrinters.insert({&S, nullptr});
+ if (!Inserted)
+ return GCPI->second.get();
+
+ auto Name = S.getName();
+
+ for (const GCMetadataPrinterRegistry::entry &GCMetaPrinter :
+ GCMetadataPrinterRegistry::entries())
+ if (Name == GCMetaPrinter.getName()) {
+ std::unique_ptr<GCMetadataPrinter> GMP = GCMetaPrinter.instantiate();
+ GMP->S = &S;
+ GCPI->second = std::move(GMP);
+ return GCPI->second.get();
+ }
+
+ report_fatal_error("no GCMetadataPrinter registered for GC: " + Twine(Name));
+}
+
+void AsmPrinter::emitStackMaps() {
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+ bool NeedsDefault = false;
+ if (MI->begin() == MI->end())
+ // No GC strategy, use the default format.
+ NeedsDefault = true;
+ else
+ for (const auto &I : *MI) {
+ if (GCMetadataPrinter *MP = getOrCreateGCPrinter(*I))
+ if (MP->emitStackMaps(SM, *this))
+ continue;
+ // The strategy doesn't have a printer or doesn't emit custom stack maps.
+ // Use the default format.
+ NeedsDefault = true;
+ }
+
+ if (NeedsDefault)
+ SM.serializeToStackMapSection();
+}
+
+/// Pin vtable to this file.
+AsmPrinterHandler::~AsmPrinterHandler() = default;
+
+void AsmPrinterHandler::markFunctionEnd() {}
+
+// In the binary's "xray_instr_map" section, an array of these function entries
+// describes each instrumentation point. When XRay patches your code, the index
+// into this table will be given to your handler as a patch point identifier.
+void AsmPrinter::XRayFunctionEntry::emit(int Bytes, MCStreamer *Out) const {
+ auto Kind8 = static_cast<uint8_t>(Kind);
+ Out->emitBinaryData(StringRef(reinterpret_cast<const char *>(&Kind8), 1));
+ Out->emitBinaryData(
+ StringRef(reinterpret_cast<const char *>(&AlwaysInstrument), 1));
+ Out->emitBinaryData(StringRef(reinterpret_cast<const char *>(&Version), 1));
+ auto Padding = (4 * Bytes) - ((2 * Bytes) + 3);
+ assert(Padding >= 0 && "Instrumentation map entry > 4 * Word Size");
+ Out->emitZeros(Padding);
+}
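+
+ // As a rough illustration (a sketch, not part of the emitted output): taken
+ // together with the two PC-relative words written by emitXRayTable below,
+ // each xray_instr_map entry spans 4 * word-size bytes and can be pictured as
+ // the following hypothetical layout:
+ //
+ //   struct XRaySledEntryOnDisk {
+ //     intptr_t SledAddrPCRel;     // sled label minus the entry's address
+ //     intptr_t FunctionAddrPCRel; // CurrentFnBegin minus (entry + one word)
+ //     uint8_t  Kind;              // SledKind, e.g. FUNCTION_ENTER
+ //     uint8_t  AlwaysInstrument;  // from the "function-instrument" attribute
+ //     uint8_t  Version;
+ //     // padded with zeros up to 4 * word size
+ //   };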
+
+void AsmPrinter::emitXRayTable() {
+ if (Sleds.empty())
+ return;
+
+ auto PrevSection = OutStreamer->getCurrentSectionOnly();
+ const Function &F = MF->getFunction();
+ MCSection *InstMap = nullptr;
+ MCSection *FnSledIndex = nullptr;
+ const Triple &TT = TM.getTargetTriple();
+ // Use PC-relative addresses on all targets.
+ if (TT.isOSBinFormatELF()) {
+ auto LinkedToSym = cast<MCSymbolELF>(CurrentFnSym);
+ auto Flags = ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER;
+ StringRef GroupName;
+ if (F.hasComdat()) {
+ Flags |= ELF::SHF_GROUP;
+ GroupName = F.getComdat()->getName();
+ }
+ InstMap = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ Flags, 0, GroupName, F.hasComdat(),
+ MCSection::NonUniqueID, LinkedToSym);
+
+ if (TM.Options.XRayFunctionIndex)
+ FnSledIndex = OutContext.getELFSection(
+ "xray_fn_idx", ELF::SHT_PROGBITS, Flags, 0, GroupName, F.hasComdat(),
+ MCSection::NonUniqueID, LinkedToSym);
+ } else if (MF->getSubtarget().getTargetTriple().isOSBinFormatMachO()) {
+ InstMap = OutContext.getMachOSection("__DATA", "xray_instr_map",
+ MachO::S_ATTR_LIVE_SUPPORT,
+ SectionKind::getReadOnlyWithRel());
+ if (TM.Options.XRayFunctionIndex)
+ FnSledIndex = OutContext.getMachOSection("__DATA", "xray_fn_idx",
+ MachO::S_ATTR_LIVE_SUPPORT,
+ SectionKind::getReadOnly());
+ } else {
+ llvm_unreachable("Unsupported target");
+ }
+
+ auto WordSizeBytes = MAI->getCodePointerSize();
+
+ // Now we switch to the instrumentation map section. Because this is done
+ // per-function, we are able to create an index entry that will represent the
+ // range of sleds associated with a function.
+ auto &Ctx = OutContext;
+ MCSymbol *SledsStart =
+ OutContext.createLinkerPrivateSymbol("xray_sleds_start");
+ OutStreamer->switchSection(InstMap);
+ OutStreamer->emitLabel(SledsStart);
+ for (const auto &Sled : Sleds) {
+ MCSymbol *Dot = Ctx.createTempSymbol();
+ OutStreamer->emitLabel(Dot);
+ OutStreamer->emitValueImpl(
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(Sled.Sled, Ctx),
+ MCSymbolRefExpr::create(Dot, Ctx), Ctx),
+ WordSizeBytes);
+ OutStreamer->emitValueImpl(
+ MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(CurrentFnBegin, Ctx),
+ MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Dot, Ctx),
+ MCConstantExpr::create(WordSizeBytes, Ctx),
+ Ctx),
+ Ctx),
+ WordSizeBytes);
+ Sled.emit(WordSizeBytes, OutStreamer.get());
+ }
+ MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_sleds_end", true);
+ OutStreamer->emitLabel(SledsEnd);
+
+ // We then emit a single entry in the index per function. We use the symbols
+ // that bound the instrumentation map as the range for a specific function.
+ // Each entry here will be 2 * word size aligned, as we're writing down two
+ // pointers. This should work for both 32-bit and 64-bit platforms.
+ if (FnSledIndex) {
+ OutStreamer->switchSection(FnSledIndex);
+ OutStreamer->emitCodeAlignment(Align(2 * WordSizeBytes),
+ &getSubtargetInfo());
+ // For Mach-O, use an "l" symbol as the atom of this subsection. The label
+ // difference uses a SUBTRACTOR external relocation which references the
+ // symbol.
+ MCSymbol *Dot = Ctx.createLinkerPrivateSymbol("xray_fn_idx");
+ OutStreamer->emitLabel(Dot);
+ OutStreamer->emitValueImpl(
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(SledsStart, Ctx),
+ MCSymbolRefExpr::create(Dot, Ctx), Ctx),
+ WordSizeBytes);
+ OutStreamer->emitValueImpl(MCConstantExpr::create(Sleds.size(), Ctx),
+ WordSizeBytes);
+ OutStreamer->switchSection(PrevSection);
+ }
+ Sleds.clear();
+}
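+
+ // For illustration only: each xray_fn_idx entry emitted above is two words,
+ // roughly equivalent to this hypothetical view of the section contents:
+ //
+ //   struct XRayFnIdxEntry {
+ //     intptr_t  SledsStartPCRel; // xray_sleds_start label minus this entry
+ //     uintptr_t NumSleds;        // number of sleds recorded for the function
+ //   };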
+
+void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
+ SledKind Kind, uint8_t Version) {
+ const Function &F = MI.getMF()->getFunction();
+ auto Attr = F.getFnAttribute("function-instrument");
+ bool LogArgs = F.hasFnAttribute("xray-log-args");
+ bool AlwaysInstrument =
+ Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
+ if (Kind == SledKind::FUNCTION_ENTER && LogArgs)
+ Kind = SledKind::LOG_ARGS_ENTER;
+ Sleds.emplace_back(XRayFunctionEntry{Sled, CurrentFnSym, Kind,
+ AlwaysInstrument, &F, Version});
+}
+
+void AsmPrinter::emitPatchableFunctionEntries() {
+ const Function &F = MF->getFunction();
+ unsigned PatchableFunctionPrefix = 0, PatchableFunctionEntry = 0;
+ (void)F.getFnAttribute("patchable-function-prefix")
+ .getValueAsString()
+ .getAsInteger(10, PatchableFunctionPrefix);
+ (void)F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, PatchableFunctionEntry);
+ if (!PatchableFunctionPrefix && !PatchableFunctionEntry)
+ return;
+ const unsigned PointerSize = getPointerSize();
+ if (TM.getTargetTriple().isOSBinFormatELF()) {
+ auto Flags = ELF::SHF_WRITE | ELF::SHF_ALLOC;
+ const MCSymbolELF *LinkedToSym = nullptr;
+ StringRef GroupName;
+
+ // GNU as < 2.35 did not support section flag 'o'. GNU ld < 2.36 did not
+ // support mixed SHF_LINK_ORDER and non-SHF_LINK_ORDER sections.
+ if (MAI->useIntegratedAssembler() || MAI->binutilsIsAtLeast(2, 36)) {
+ Flags |= ELF::SHF_LINK_ORDER;
+ if (F.hasComdat()) {
+ Flags |= ELF::SHF_GROUP;
+ GroupName = F.getComdat()->getName();
+ }
+ LinkedToSym = cast<MCSymbolELF>(CurrentFnSym);
+ }
+ OutStreamer->switchSection(OutContext.getELFSection(
+ "__patchable_function_entries", ELF::SHT_PROGBITS, Flags, 0, GroupName,
+ F.hasComdat(), MCSection::NonUniqueID, LinkedToSym));
+ emitAlignment(Align(PointerSize));
+ OutStreamer->emitSymbolValue(CurrentPatchableFunctionEntrySym, PointerSize);
+ }
+}
+
+uint16_t AsmPrinter::getDwarfVersion() const {
+ return OutStreamer->getContext().getDwarfVersion();
+}
+
+void AsmPrinter::setDwarfVersion(uint16_t Version) {
+ OutStreamer->getContext().setDwarfVersion(Version);
+}
+
+bool AsmPrinter::isDwarf64() const {
+ return OutStreamer->getContext().getDwarfFormat() == dwarf::DWARF64;
+}
+
+unsigned int AsmPrinter::getDwarfOffsetByteSize() const {
+ return dwarf::getDwarfOffsetByteSize(
+ OutStreamer->getContext().getDwarfFormat());
+}
+
+dwarf::FormParams AsmPrinter::getDwarfFormParams() const {
+ return {getDwarfVersion(), uint8_t(MAI->getCodePointerSize()),
+ OutStreamer->getContext().getDwarfFormat(),
+ doesDwarfUseRelocationsAcrossSections()};
+}
+
+unsigned int AsmPrinter::getUnitLengthFieldByteSize() const {
+ return dwarf::getUnitLengthFieldByteSize(
+ OutStreamer->getContext().getDwarfFormat());
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
new file mode 100644
index 000000000000..21d0d070c247
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -0,0 +1,305 @@
+//===-- AsmPrinterDwarf.cpp - AsmPrinter Dwarf Support --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dwarf emissions parts of AsmPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <cstdint>
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+//===----------------------------------------------------------------------===//
+// Dwarf Emission Helper Routines
+//===----------------------------------------------------------------------===//
+
+static const char *DecodeDWARFEncoding(unsigned Encoding) {
+ switch (Encoding) {
+ case dwarf::DW_EH_PE_absptr:
+ return "absptr";
+ case dwarf::DW_EH_PE_omit:
+ return "omit";
+ case dwarf::DW_EH_PE_pcrel:
+ return "pcrel";
+ case dwarf::DW_EH_PE_uleb128:
+ return "uleb128";
+ case dwarf::DW_EH_PE_sleb128:
+ return "sleb128";
+ case dwarf::DW_EH_PE_udata4:
+ return "udata4";
+ case dwarf::DW_EH_PE_udata8:
+ return "udata8";
+ case dwarf::DW_EH_PE_sdata4:
+ return "sdata4";
+ case dwarf::DW_EH_PE_sdata8:
+ return "sdata8";
+ case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata4:
+ return "pcrel udata4";
+ case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4:
+ return "pcrel sdata4";
+ case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8:
+ return "pcrel udata8";
+ case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8:
+ return "pcrel sdata8";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata4:
+ return "indirect pcrel udata4";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4:
+ return "indirect pcrel sdata4";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8:
+ return "indirect pcrel udata8";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8:
+ return "indirect pcrel sdata8";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_datarel |
+ dwarf::DW_EH_PE_sdata4:
+ return "indirect datarel sdata4";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_datarel |
+ dwarf::DW_EH_PE_sdata8:
+ return "indirect datarel sdata8";
+ }
+
+ return "<unknown encoding>";
+}
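+
+ // For example, the common personality/LSDA encoding 0x9b decodes as
+ // DW_EH_PE_indirect (0x80) | DW_EH_PE_pcrel (0x10) | DW_EH_PE_sdata4 (0x0b),
+ // which the switch above renders as "indirect pcrel sdata4".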
+
+/// EmitEncodingByte - Emit a .byte 42 directive that corresponds to an
+/// encoding. If verbose assembly output is enabled, we output comments
+/// describing the encoding. Desc is an optional string saying what the
+/// encoding is specifying (e.g. "LSDA").
+void AsmPrinter::emitEncodingByte(unsigned Val, const char *Desc) const {
+ if (isVerbose()) {
+ if (Desc)
+ OutStreamer->AddComment(Twine(Desc) + " Encoding = " +
+ Twine(DecodeDWARFEncoding(Val)));
+ else
+ OutStreamer->AddComment(Twine("Encoding = ") + DecodeDWARFEncoding(Val));
+ }
+
+ OutStreamer->emitIntValue(Val, 1);
+}
+
+/// GetSizeOfEncodedValue - Return the size of the encoding in bytes.
+unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const {
+ if (Encoding == dwarf::DW_EH_PE_omit)
+ return 0;
+
+ switch (Encoding & 0x07) {
+ default:
+ llvm_unreachable("Invalid encoded value.");
+ case dwarf::DW_EH_PE_absptr:
+ return MAI->getCodePointerSize();
+ case dwarf::DW_EH_PE_udata2:
+ return 2;
+ case dwarf::DW_EH_PE_udata4:
+ return 4;
+ case dwarf::DW_EH_PE_udata8:
+ return 8;
+ }
+}
+
+void AsmPrinter::emitTTypeReference(const GlobalValue *GV, unsigned Encoding) {
+ if (GV) {
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+
+ const MCExpr *Exp =
+ TLOF.getTTypeGlobalReference(GV, Encoding, TM, MMI, *OutStreamer);
+ OutStreamer->emitValue(Exp, GetSizeOfEncodedValue(Encoding));
+ } else
+ OutStreamer->emitIntValue(0, GetSizeOfEncodedValue(Encoding));
+}
+
+void AsmPrinter::emitDwarfSymbolReference(const MCSymbol *Label,
+ bool ForceOffset) const {
+ if (!ForceOffset) {
+ // On COFF targets, we have to emit the special .secrel32 directive.
+ if (MAI->needsDwarfSectionOffsetDirective()) {
+ assert(!isDwarf64() &&
+ "emitting DWARF64 is not implemented for COFF targets");
+ OutStreamer->emitCOFFSecRel32(Label, /*Offset=*/0);
+ return;
+ }
+
+ // If the format uses relocations with dwarf, refer to the symbol directly.
+ if (doesDwarfUseRelocationsAcrossSections()) {
+ OutStreamer->emitSymbolValue(Label, getDwarfOffsetByteSize());
+ return;
+ }
+ }
+
+ // Otherwise, emit it as a label difference from the start of the section.
+ emitLabelDifference(Label, Label->getSection().getBeginSymbol(),
+ getDwarfOffsetByteSize());
+}
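+
+ // As a rough sketch of the three cases above (directive spellings vary by
+ // target assembler), a DWARF32 reference to .Lsym would come out as one of:
+ //
+ //   .secrel32 .Lsym                    // COFF section-relative relocation
+ //   .long     .Lsym                    // direct relocation against the symbol
+ //   .long     .Lsym - .Lsection_start  // offset from the section begin label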
+
+void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntry S) const {
+ if (doesDwarfUseRelocationsAcrossSections()) {
+ assert(S.Symbol && "No symbol available");
+ emitDwarfSymbolReference(S.Symbol);
+ return;
+ }
+
+ // Just emit the offset directly; no need for symbol math.
+ OutStreamer->emitIntValue(S.Offset, getDwarfOffsetByteSize());
+}
+
+void AsmPrinter::emitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const {
+ emitLabelPlusOffset(Label, Offset, getDwarfOffsetByteSize());
+}
+
+void AsmPrinter::emitDwarfLengthOrOffset(uint64_t Value) const {
+ assert(isDwarf64() || Value <= UINT32_MAX);
+ OutStreamer->emitIntValue(Value, getDwarfOffsetByteSize());
+}
+
+void AsmPrinter::emitDwarfUnitLength(uint64_t Length,
+ const Twine &Comment) const {
+ OutStreamer->emitDwarfUnitLength(Length, Comment);
+}
+
+MCSymbol *AsmPrinter::emitDwarfUnitLength(const Twine &Prefix,
+ const Twine &Comment) const {
+ return OutStreamer->emitDwarfUnitLength(Prefix, Comment);
+}
+
+void AsmPrinter::emitCallSiteOffset(const MCSymbol *Hi, const MCSymbol *Lo,
+ unsigned Encoding) const {
+ // The least significant 3 bits specify the width of the encoding
+ if ((Encoding & 0x7) == dwarf::DW_EH_PE_uleb128)
+ emitLabelDifferenceAsULEB128(Hi, Lo);
+ else
+ emitLabelDifference(Hi, Lo, GetSizeOfEncodedValue(Encoding));
+}
+
+void AsmPrinter::emitCallSiteValue(uint64_t Value, unsigned Encoding) const {
+ // The least significant 3 bits specify the width of the encoding
+ if ((Encoding & 0x7) == dwarf::DW_EH_PE_uleb128)
+ emitULEB128(Value);
+ else
+ OutStreamer->emitIntValue(Value, GetSizeOfEncodedValue(Encoding));
+}
+
+//===----------------------------------------------------------------------===//
+// Dwarf Lowering Routines
+//===----------------------------------------------------------------------===//
+
+void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
+ SMLoc Loc = Inst.getLoc();
+ switch (Inst.getOperation()) {
+ default:
+ llvm_unreachable("Unexpected instruction");
+ case MCCFIInstruction::OpDefCfaOffset:
+ OutStreamer->emitCFIDefCfaOffset(Inst.getOffset(), Loc);
+ break;
+ case MCCFIInstruction::OpAdjustCfaOffset:
+ OutStreamer->emitCFIAdjustCfaOffset(Inst.getOffset(), Loc);
+ break;
+ case MCCFIInstruction::OpDefCfa:
+ OutStreamer->emitCFIDefCfa(Inst.getRegister(), Inst.getOffset(), Loc);
+ break;
+ case MCCFIInstruction::OpDefCfaRegister:
+ OutStreamer->emitCFIDefCfaRegister(Inst.getRegister(), Loc);
+ break;
+ case MCCFIInstruction::OpLLVMDefAspaceCfa:
+ OutStreamer->emitCFILLVMDefAspaceCfa(Inst.getRegister(), Inst.getOffset(),
+ Inst.getAddressSpace(), Loc);
+ break;
+ case MCCFIInstruction::OpOffset:
+ OutStreamer->emitCFIOffset(Inst.getRegister(), Inst.getOffset(), Loc);
+ break;
+ case MCCFIInstruction::OpRegister:
+ OutStreamer->emitCFIRegister(Inst.getRegister(), Inst.getRegister2(), Loc);
+ break;
+ case MCCFIInstruction::OpWindowSave:
+ OutStreamer->emitCFIWindowSave(Loc);
+ break;
+ case MCCFIInstruction::OpNegateRAState:
+ OutStreamer->emitCFINegateRAState(Loc);
+ break;
+ case MCCFIInstruction::OpSameValue:
+ OutStreamer->emitCFISameValue(Inst.getRegister(), Loc);
+ break;
+ case MCCFIInstruction::OpGnuArgsSize:
+ OutStreamer->emitCFIGnuArgsSize(Inst.getOffset(), Loc);
+ break;
+ case MCCFIInstruction::OpEscape:
+ OutStreamer->AddComment(Inst.getComment());
+ OutStreamer->emitCFIEscape(Inst.getValues(), Loc);
+ break;
+ case MCCFIInstruction::OpRestore:
+ OutStreamer->emitCFIRestore(Inst.getRegister(), Loc);
+ break;
+ case MCCFIInstruction::OpUndefined:
+ OutStreamer->emitCFIUndefined(Inst.getRegister(), Loc);
+ break;
+ case MCCFIInstruction::OpRememberState:
+ OutStreamer->emitCFIRememberState(Loc);
+ break;
+ case MCCFIInstruction::OpRestoreState:
+ OutStreamer->emitCFIRestoreState(Loc);
+ break;
+ }
+}
+
+void AsmPrinter::emitDwarfDIE(const DIE &Die) const {
+ // Emit the code (index) for the abbreviation.
+ if (isVerbose())
+ OutStreamer->AddComment("Abbrev [" + Twine(Die.getAbbrevNumber()) + "] 0x" +
+ Twine::utohexstr(Die.getOffset()) + ":0x" +
+ Twine::utohexstr(Die.getSize()) + " " +
+ dwarf::TagString(Die.getTag()));
+ emitULEB128(Die.getAbbrevNumber());
+
+ // Emit the DIE attribute values.
+ for (const auto &V : Die.values()) {
+ dwarf::Attribute Attr = V.getAttribute();
+ assert(V.getForm() && "Too many attributes for DIE (check abbreviation)");
+
+ if (isVerbose()) {
+ OutStreamer->AddComment(dwarf::AttributeString(Attr));
+ if (Attr == dwarf::DW_AT_accessibility)
+ OutStreamer->AddComment(
+ dwarf::AccessibilityString(V.getDIEInteger().getValue()));
+ }
+
+ // Emit an attribute using the defined form.
+ V.emitValue(this);
+ }
+
+ // Emit the DIE children if any.
+ if (Die.hasChildren()) {
+ for (const auto &Child : Die.children())
+ emitDwarfDIE(Child);
+
+ OutStreamer->AddComment("End Of Children Mark");
+ emitInt8(0);
+ }
+}
+
+void AsmPrinter::emitDwarfAbbrev(const DIEAbbrev &Abbrev) const {
+ // Emit the abbreviations code (base 1 index.)
+ emitULEB128(Abbrev.getNumber(), "Abbreviation Code");
+
+ // Emit the abbreviations data.
+ Abbrev.Emit(this);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
new file mode 100644
index 000000000000..32674bbeb061
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -0,0 +1,519 @@
+//===-- AsmPrinterInlineAsm.cpp - AsmPrinter Inline Asm Handling ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the inline assembler pieces of the AsmPrinter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+unsigned AsmPrinter::addInlineAsmDiagBuffer(StringRef AsmStr,
+ const MDNode *LocMDNode) const {
+ MCContext &Context = MMI->getContext();
+ Context.initInlineSourceManager();
+ SourceMgr &SrcMgr = *Context.getInlineSourceManager();
+ std::vector<const MDNode *> &LocInfos = Context.getLocInfos();
+
+ std::unique_ptr<MemoryBuffer> Buffer;
+ // The inline asm source manager will outlive AsmStr, so make a copy of the
+ // string for SourceMgr to own.
+ Buffer = MemoryBuffer::getMemBufferCopy(AsmStr, "<inline asm>");
+
+ // Tell SrcMgr about this buffer, it takes ownership of the buffer.
+ unsigned BufNum = SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+
+ // Store LocMDNode in DiagInfo, using BufNum as an identifier.
+ if (LocMDNode) {
+ LocInfos.resize(BufNum);
+ LocInfos[BufNum - 1] = LocMDNode;
+ }
+
+ return BufNum;
+}
+
+
+/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
+void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
+ const MCTargetOptions &MCOptions,
+ const MDNode *LocMDNode,
+ InlineAsm::AsmDialect Dialect) const {
+ assert(!Str.empty() && "Can't emit empty inline asm block");
+
+ // Remember if the buffer is nul terminated or not so we can avoid a copy.
+ bool isNullTerminated = Str.back() == 0;
+ if (isNullTerminated)
+ Str = Str.substr(0, Str.size()-1);
+
+ // If the output streamer does not have mature MC support or the integrated
+ // assembler has been disabled or not required, just emit the blob textually.
+ // Otherwise parse the asm and emit it via MC support.
+ // This is useful in case the asm parser doesn't handle something but the
+ // system assembler does.
+ const MCAsmInfo *MCAI = TM.getMCAsmInfo();
+ assert(MCAI && "No MCAsmInfo");
+ if (!MCAI->useIntegratedAssembler() &&
+ !MCAI->parseInlineAsmUsingAsmParser() &&
+ !OutStreamer->isIntegratedAssemblerRequired()) {
+ emitInlineAsmStart();
+ OutStreamer->emitRawText(Str);
+ emitInlineAsmEnd(STI, nullptr);
+ return;
+ }
+
+ unsigned BufNum = addInlineAsmDiagBuffer(Str, LocMDNode);
+ SourceMgr &SrcMgr = *MMI->getContext().getInlineSourceManager();
+ SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
+
+ std::unique_ptr<MCAsmParser> Parser(
+ createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
+
+ // Do not use assembler-level information for parsing inline assembly.
+ OutStreamer->setUseAssemblerInfoForParsing(false);
+
+ // We create a new MCInstrInfo here since we might be at the module level
+ // and not have a MachineFunction to initialize the TargetInstrInfo from and
+ // we only need MCInstrInfo for asm parsing. We create one unconditionally
+ // because it's not subtarget dependent.
+ std::unique_ptr<MCInstrInfo> MII(TM.getTarget().createMCInstrInfo());
+ assert(MII && "Failed to create instruction info");
+ std::unique_ptr<MCTargetAsmParser> TAP(TM.getTarget().createMCAsmParser(
+ STI, *Parser, *MII, MCOptions));
+ if (!TAP)
+ report_fatal_error("Inline asm not supported by this streamer because"
+ " we don't have an asm parser for this target\n");
+ Parser->setAssemblerDialect(Dialect);
+ Parser->setTargetParser(*TAP);
+ // Enable lexing MASM binary and hex integer literals in Intel inline
+ // assembly.
+ if (Dialect == InlineAsm::AD_Intel)
+ Parser->getLexer().setLexMasmIntegers(true);
+
+ emitInlineAsmStart();
+ // Don't implicitly switch to the text section before the asm.
+ (void)Parser->Run(/*NoInitialTextSection*/ true,
+ /*NoFinalize*/ true);
+ emitInlineAsmEnd(STI, &TAP->getSTI());
+}
+
+static void EmitInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+ MachineModuleInfo *MMI, const MCAsmInfo *MAI,
+ AsmPrinter *AP, uint64_t LocCookie,
+ raw_ostream &OS) {
+ bool InputIsIntelDialect = MI->getInlineAsmDialect() == InlineAsm::AD_Intel;
+
+ if (InputIsIntelDialect) {
+ // Switch to the inline assembly variant.
+ OS << "\t.intel_syntax\n\t";
+ }
+
+ int CurVariant = -1; // The number of the {.|.|.} region we are in.
+ const char *LastEmitted = AsmStr; // One past the last character emitted.
+ unsigned NumOperands = MI->getNumOperands();
+
+ int AsmPrinterVariant;
+ if (InputIsIntelDialect)
+ AsmPrinterVariant = 1; // X86MCAsmInfo.cpp's AsmWriterFlavorTy::Intel.
+ else
+ AsmPrinterVariant = MMI->getTarget().unqualifiedInlineAsmVariant();
+
+ // FIXME: Should this happen for `asm inteldialect` as well?
+ if (!InputIsIntelDialect && MAI->getEmitGNUAsmStartIndentationMarker())
+ OS << '\t';
+
+ while (*LastEmitted) {
+ switch (*LastEmitted) {
+ default: {
+ // Not a special case, emit the string section literally.
+ const char *LiteralEnd = LastEmitted+1;
+ while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
+ *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
+ ++LiteralEnd;
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ OS.write(LastEmitted, LiteralEnd - LastEmitted);
+ LastEmitted = LiteralEnd;
+ break;
+ }
+ case '\n':
+ ++LastEmitted; // Consume newline character.
+ OS << '\n'; // Indent code with newline.
+ break;
+ case '$': {
+ ++LastEmitted; // Consume '$' character.
+ bool Done = true;
+
+ // Handle escapes.
+ switch (*LastEmitted) {
+ default: Done = false; break;
+ case '$': // $$ -> $
+ if (!InputIsIntelDialect)
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ OS << '$';
+ ++LastEmitted; // Consume second '$' character.
+ break;
+ case '(': // $( -> same as GCC's { character.
+ ++LastEmitted; // Consume '(' character.
+ if (CurVariant != -1)
+ report_fatal_error("Nested variants found in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ CurVariant = 0; // We're in the first variant now.
+ break;
+ case '|':
+ ++LastEmitted; // Consume '|' character.
+ if (CurVariant == -1)
+ OS << '|'; // This is gcc's behavior for | outside a variant.
+ else
+ ++CurVariant; // We're in the next variant.
+ break;
+ case ')': // $) -> same as GCC's } char.
+ ++LastEmitted; // Consume ')' character.
+ if (CurVariant == -1)
+ OS << '}'; // This is gcc's behavior for } outside a variant.
+ else
+ CurVariant = -1;
+ break;
+ }
+ if (Done) break;
+
+ bool HasCurlyBraces = false;
+ if (*LastEmitted == '{') { // ${variable}
+ ++LastEmitted; // Consume '{' character.
+ HasCurlyBraces = true;
+ }
+
+ // If we have ${:foo}, then this is not a real operand reference, it is a
+ // "magic" string reference, just like in .td files. Arrange to call
+ // PrintSpecial.
+ if (HasCurlyBraces && *LastEmitted == ':') {
+ ++LastEmitted;
+ const char *StrStart = LastEmitted;
+ const char *StrEnd = strchr(StrStart, '}');
+ if (!StrEnd)
+ report_fatal_error("Unterminated ${:foo} operand in inline asm"
+ " string: '" + Twine(AsmStr) + "'");
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ AP->PrintSpecial(MI, OS, StringRef(StrStart, StrEnd - StrStart));
+ LastEmitted = StrEnd+1;
+ break;
+ }
+
+ const char *IDStart = LastEmitted;
+ const char *IDEnd = IDStart;
+ while (isDigit(*IDEnd))
+ ++IDEnd;
+
+ unsigned Val;
+ if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
+ report_fatal_error("Bad $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ LastEmitted = IDEnd;
+
+ if (Val >= NumOperands - 1)
+ report_fatal_error("Invalid $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+
+ char Modifier[2] = { 0, 0 };
+
+ if (HasCurlyBraces) {
+ // If we have curly braces, check for a modifier character. This
+ // supports syntax like ${0:u}, which corresponds to "%u0" in GCC asm.
+ if (*LastEmitted == ':') {
+ ++LastEmitted; // Consume ':' character.
+ if (*LastEmitted == 0)
+ report_fatal_error("Bad ${:} expression in inline asm string: '" +
+ Twine(AsmStr) + "'");
+
+ Modifier[0] = *LastEmitted;
+ ++LastEmitted; // Consume modifier character.
+ }
+
+ if (*LastEmitted != '}')
+ report_fatal_error("Bad ${} expression in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ ++LastEmitted; // Consume '}' character.
+ }
+
+ // Okay, we finally have a value number. Ask the target to print this
+ // operand!
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant) {
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+
+ bool Error = false;
+
+ // Scan to find the machine operand number for the operand.
+ for (; Val; --Val) {
+ if (OpNo >= MI->getNumOperands())
+ break;
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+ }
+
+ // We may have location metadata attached to the end of the
+ // instruction, and we should never see metadata at any other
+ // point while processing. It's an error if so.
+ if (OpNo >= MI->getNumOperands() || MI->getOperand(OpNo).isMetadata()) {
+ Error = true;
+ } else {
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ ++OpNo; // Skip over the ID number.
+
+ // FIXME: Shouldn't arch-independent output template handling go into
+ // PrintAsmOperand?
+ // Labels are target independent.
+ if (MI->getOperand(OpNo).isBlockAddress()) {
+ const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress();
+ MCSymbol *Sym = AP->GetBlockAddressSymbol(BA);
+ Sym->print(OS, AP->MAI);
+ MMI->getContext().registerInlineAsmLabel(Sym);
+ } else if (MI->getOperand(OpNo).isMBB()) {
+ const MCSymbol *Sym = MI->getOperand(OpNo).getMBB()->getSymbol();
+ Sym->print(OS, AP->MAI);
+ } else if (InlineAsm::isMemKind(OpFlags)) {
+ Error = AP->PrintAsmMemoryOperand(
+ MI, OpNo, Modifier[0] ? Modifier : nullptr, OS);
+ } else {
+ Error = AP->PrintAsmOperand(MI, OpNo,
+ Modifier[0] ? Modifier : nullptr, OS);
+ }
+ }
+ if (Error) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "invalid operand in inline asm: '" << AsmStr << "'";
+ MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
+ }
+ }
+ break;
+ }
+ }
+ }
+ if (InputIsIntelDialect)
+ OS << "\n\t.att_syntax";
+ OS << '\n' << (char)0; // null terminate string.
+}
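+
+ // A small worked example of the escapes handled above (illustrative only):
+ // with AsmPrinterVariant == 1, the template "mov $0, $(42$|#42$)" prints
+ // operand 0 (via PrintAsmOperand or PrintAsmMemoryOperand, depending on its
+ // kind) followed by the second variant's text "#42", while "${0:c}" prints
+ // operand 0 with the 'c' modifier (GCC's "%c0").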
+
+/// This method formats and emits the specified machine instruction that is an
+/// inline asm.
+void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const {
+ assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
+
+ // Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
+ const char *AsmStr = MI->getOperand(0).getSymbolName();
+
+ // If this asmstr is empty, just print the #APP/#NOAPP markers.
+ // These are useful to see where empty asms wound up.
+ if (AsmStr[0] == 0) {
+ OutStreamer->emitRawComment(MAI->getInlineAsmStart());
+ OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
+ return;
+ }
+
+ // Emit the #APP start marker. This has to happen even if verbose-asm isn't
+ // enabled, so we use emitRawComment.
+ OutStreamer->emitRawComment(MAI->getInlineAsmStart());
+
+ // Get the !srcloc metadata node if we have it, and decode the loc cookie from
+ // it.
+ uint64_t LocCookie = 0;
+ const MDNode *LocMD = nullptr;
+ for (const MachineOperand &MO : llvm::reverse(MI->operands())) {
+ if (MO.isMetadata() && (LocMD = MO.getMetadata()) &&
+ LocMD->getNumOperands() != 0) {
+ if (const ConstantInt *CI =
+ mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
+ LocCookie = CI->getZExtValue();
+ break;
+ }
+ }
+ }
+
+ // Emit the inline asm to a temporary string so we can emit it through
+ // EmitInlineAsm.
+ SmallString<256> StringData;
+ raw_svector_ostream OS(StringData);
+
+ AsmPrinter *AP = const_cast<AsmPrinter*>(this);
+ EmitInlineAsmStr(AsmStr, MI, MMI, MAI, AP, LocCookie, OS);
+
+ // Emit warnings if we use reserved registers on the clobber list, as
+ // that might lead to undefined behaviour.
+ SmallVector<Register, 8> RestrRegs;
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ // Start with the first operand descriptor, and iterate over them.
+ for (unsigned I = InlineAsm::MIOp_FirstOperand, NumOps = MI->getNumOperands();
+ I < NumOps; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ if (!MO.isImm())
+ continue;
+ unsigned Flags = MO.getImm();
+ if (InlineAsm::getKind(Flags) == InlineAsm::Kind_Clobber) {
+ Register Reg = MI->getOperand(I + 1).getReg();
+ if (!TRI->isAsmClobberable(*MF, Reg))
+ RestrRegs.push_back(Reg);
+ }
+ // Skip to one before the next operand descriptor, if it exists.
+ I += InlineAsm::getNumOperandRegisters(Flags);
+ }
+
+ if (!RestrRegs.empty()) {
+ std::string Msg = "inline asm clobber list contains reserved registers: ";
+ ListSeparator LS;
+ for (const Register RR : RestrRegs) {
+ Msg += LS;
+ Msg += TRI->getRegAsmName(RR);
+ }
+ const char *Note =
+ "Reserved registers on the clobber list may not be "
+ "preserved across the asm statement, and clobbering them may "
+ "lead to undefined behaviour.";
+ MMI->getModule()->getContext().diagnose(DiagnosticInfoInlineAsm(
+ LocCookie, Msg, DiagnosticSeverity::DS_Warning));
+ MMI->getModule()->getContext().diagnose(
+ DiagnosticInfoInlineAsm(LocCookie, Note, DiagnosticSeverity::DS_Note));
+
+ for (const Register RR : RestrRegs) {
+ if (std::optional<std::string> reason =
+ TRI->explainReservedReg(*MF, RR)) {
+ MMI->getModule()->getContext().diagnose(DiagnosticInfoInlineAsm(
+ LocCookie, *reason, DiagnosticSeverity::DS_Note));
+ }
+ }
+ }
+
+ emitInlineAsm(OS.str(), getSubtargetInfo(), TM.Options.MCOptions, LocMD,
+ MI->getInlineAsmDialect());
+
+ // Emit the #NOAPP end marker. This has to happen even if verbose-asm isn't
+ // enabled, so we use emitRawComment.
+ OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
+}
+
+/// PrintSpecial - Print information related to the specified machine instr
+/// that is independent of the operand, and may be independent of the instr
+/// itself. This can be useful for portably encoding the comment character
+/// or other bits of target-specific knowledge into the asmstrings. The
+/// syntax used is ${:comment}. Targets can override this to add support
+/// for their own strange codes.
+void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
+ StringRef Code) const {
+ if (Code == "private") {
+ const DataLayout &DL = MF->getDataLayout();
+ OS << DL.getPrivateGlobalPrefix();
+ } else if (Code == "comment") {
+ OS << MAI->getCommentString();
+ } else if (Code == "uid") {
+ // Comparing the address of MI isn't sufficient, because machineinstrs may
+ // be allocated to the same address across functions.
+
+ // If this is a new LastFn instruction, bump the counter.
+ if (LastMI != MI || LastFn != getFunctionNumber()) {
+ ++Counter;
+ LastMI = MI;
+ LastFn = getFunctionNumber();
+ }
+ OS << Counter;
+ } else {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "Unknown special formatter '" << Code
+ << "' for machine instr: " << *MI;
+ report_fatal_error(Twine(Msg.str()));
+ }
+}
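+
+ // For instance (illustrative only), on a target whose comment string is "#"
+ // and whose private global prefix is ".L", the template
+ //   "${:comment} spin ${:uid}\n\tjmp ${:private}retry"
+ // would print roughly as "# spin <uid>" followed by "jmp .Lretry", where
+ // <uid> is the per-instruction counter value maintained above.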
+
+void AsmPrinter::PrintSymbolOperand(const MachineOperand &MO, raw_ostream &OS) {
+ assert(MO.isGlobal() && "caller should check MO.isGlobal");
+ getSymbolPreferLocal(*MO.getGlobal())->print(OS, MAI);
+ printOffset(MO.getOffset(), OS);
+}
+
+/// PrintAsmOperand - Print the specified operand of MI, an INLINEASM
+/// instruction, using the specified assembler variant. Targets should
+/// override this to format as appropriate for machine specific ExtraCodes
+/// or when the arch-independent handling would be too complex otherwise.
+bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ // https://gcc.gnu.org/onlinedocs/gccint/Output-Template.html
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (ExtraCode[0]) {
+ default:
+ return true; // Unknown modifier.
+ case 'a': // Print as memory address.
+ if (MO.isReg()) {
+ PrintAsmMemoryOperand(MI, OpNo, nullptr, O);
+ return false;
+ }
+ [[fallthrough]]; // GCC allows '%a' to behave like '%c' with immediates.
+ case 'c': // Substitute immediate value without immediate syntax
+ if (MO.isImm()) {
+ O << MO.getImm();
+ return false;
+ }
+ if (MO.isGlobal()) {
+ PrintSymbolOperand(MO, O);
+ return false;
+ }
+ return true;
+ case 'n': // Negate the immediate constant.
+ if (!MO.isImm())
+ return true;
+ O << -MO.getImm();
+ return false;
+ case 's': // The deprecated GCC 's' modifier.
+ if (!MO.isImm())
+ return true;
+ O << ((32 - MO.getImm()) & 31);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *ExtraCode, raw_ostream &O) {
+ // Target doesn't support this yet!
+ return true;
+}
+
+void AsmPrinter::emitInlineAsmStart() const {}
+
+void AsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+ const MCSubtargetInfo *EndInfo) const {}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
new file mode 100644
index 000000000000..bd2c60eadd61
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -0,0 +1,145 @@
+//===-- llvm/CodeGen/ByteStreamer.h - ByteStreamer class --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a class that can take bytes that would normally be
+// streamed via the AsmPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
+
+#include "DIEHash.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/LEB128.h"
+#include <string>
+
+namespace llvm {
+class ByteStreamer {
+ protected:
+ ~ByteStreamer() = default;
+ ByteStreamer(const ByteStreamer&) = default;
+ ByteStreamer() = default;
+
+ public:
+ // For now we're just handling the calls we need for dwarf emission/hashing.
+ virtual void emitInt8(uint8_t Byte, const Twine &Comment = "") = 0;
+ virtual void emitSLEB128(uint64_t DWord, const Twine &Comment = "") = 0;
+ virtual void emitULEB128(uint64_t DWord, const Twine &Comment = "",
+ unsigned PadTo = 0) = 0;
+ virtual unsigned emitDIERef(const DIE &D) = 0;
+};
+
+class APByteStreamer final : public ByteStreamer {
+private:
+ AsmPrinter &AP;
+
+public:
+ APByteStreamer(AsmPrinter &Asm) : AP(Asm) {}
+ void emitInt8(uint8_t Byte, const Twine &Comment) override {
+ AP.OutStreamer->AddComment(Comment);
+ AP.emitInt8(Byte);
+ }
+ void emitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ AP.OutStreamer->AddComment(Comment);
+ AP.emitSLEB128(DWord);
+ }
+ void emitULEB128(uint64_t DWord, const Twine &Comment,
+ unsigned PadTo) override {
+ AP.OutStreamer->AddComment(Comment);
+ AP.emitULEB128(DWord, nullptr, PadTo);
+ }
+ unsigned emitDIERef(const DIE &D) override {
+ uint64_t Offset = D.getOffset();
+ static constexpr unsigned ULEB128PadSize = 4;
+ assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset won't fit");
+ emitULEB128(Offset, "", ULEB128PadSize);
+ // Return how many comments to skip in DwarfDebug::emitDebugLocEntry to keep
+ // comments aligned with debug loc entries.
+ return ULEB128PadSize;
+ }
+};
+
+class HashingByteStreamer final : public ByteStreamer {
+ private:
+ DIEHash &Hash;
+ public:
+ HashingByteStreamer(DIEHash &H) : Hash(H) {}
+ void emitInt8(uint8_t Byte, const Twine &Comment) override {
+ Hash.update(Byte);
+ }
+ void emitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ Hash.addSLEB128(DWord);
+ }
+ void emitULEB128(uint64_t DWord, const Twine &Comment,
+ unsigned PadTo) override {
+ Hash.addULEB128(DWord);
+ }
+ unsigned emitDIERef(const DIE &D) override {
+ Hash.hashRawTypeReference(D);
+ return 0; // Only used together with the APByteStreamer.
+ }
+};
+
+class BufferByteStreamer final : public ByteStreamer {
+private:
+ SmallVectorImpl<char> &Buffer;
+ std::vector<std::string> &Comments;
+
+public:
+ /// Only verbose textual output needs comments. This will be set to
+ /// true for that case, and false otherwise. If false, comments passed in to
+ /// the emit methods will be ignored.
+ const bool GenerateComments;
+
+ BufferByteStreamer(SmallVectorImpl<char> &Buffer,
+ std::vector<std::string> &Comments, bool GenerateComments)
+ : Buffer(Buffer), Comments(Comments), GenerateComments(GenerateComments) {
+ }
+ void emitInt8(uint8_t Byte, const Twine &Comment) override {
+ Buffer.push_back(Byte);
+ if (GenerateComments)
+ Comments.push_back(Comment.str());
+ }
+ void emitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ raw_svector_ostream OSE(Buffer);
+ unsigned Length = encodeSLEB128(DWord, OSE);
+ if (GenerateComments) {
+ Comments.push_back(Comment.str());
+ // Add some empty comments to keep the Buffer and Comments vectors aligned
+ // with each other.
+ for (size_t i = 1; i < Length; ++i)
+ Comments.push_back("");
+
+ }
+ }
+ void emitULEB128(uint64_t DWord, const Twine &Comment,
+ unsigned PadTo) override {
+ raw_svector_ostream OSE(Buffer);
+ unsigned Length = encodeULEB128(DWord, OSE, PadTo);
+ if (GenerateComments) {
+ Comments.push_back(Comment.str());
+ // Add some empty comments to keep the Buffer and Comments vectors aligned
+ // with each other.
+ for (size_t i = 1; i < Length; ++i)
+ Comments.push_back("");
+ }
+ }
+ unsigned emitDIERef(const DIE &D) override {
+ uint64_t Offset = D.getOffset();
+ static constexpr unsigned ULEB128PadSize = 4;
+ assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset won't fit");
+ emitULEB128(Offset, "", ULEB128PadSize);
+ return 0; // Only used together with the APByteStreamer.
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
new file mode 100644
index 000000000000..8161de57b58e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -0,0 +1,3444 @@
+//===- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Microsoft CodeView debug info.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeViewDebug.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/CodeViewRecordIO.h"
+#include "llvm/DebugInfo/CodeView/ContinuationRecordBuilder.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/EnumTables.h"
+#include "llvm/DebugInfo/CodeView/Line.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <iterator>
+#include <limits>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+namespace {
+class CVMCAdapter : public CodeViewRecordStreamer {
+public:
+ CVMCAdapter(MCStreamer &OS, TypeCollection &TypeTable)
+ : OS(&OS), TypeTable(TypeTable) {}
+
+ void emitBytes(StringRef Data) override { OS->emitBytes(Data); }
+
+ void emitIntValue(uint64_t Value, unsigned Size) override {
+ OS->emitIntValueInHex(Value, Size);
+ }
+
+ void emitBinaryData(StringRef Data) override { OS->emitBinaryData(Data); }
+
+ void AddComment(const Twine &T) override { OS->AddComment(T); }
+
+ void AddRawComment(const Twine &T) override { OS->emitRawComment(T); }
+
+ bool isVerboseAsm() override { return OS->isVerboseAsm(); }
+
+ std::string getTypeName(TypeIndex TI) override {
+ std::string TypeName;
+ if (!TI.isNoneType()) {
+ if (TI.isSimple())
+ TypeName = std::string(TypeIndex::simpleTypeName(TI));
+ else
+ TypeName = std::string(TypeTable.getTypeName(TI));
+ }
+ return TypeName;
+ }
+
+private:
+ MCStreamer *OS = nullptr;
+ TypeCollection &TypeTable;
+};
+} // namespace
+
+static CPUType mapArchToCVCPUType(Triple::ArchType Type) {
+ switch (Type) {
+ case Triple::ArchType::x86:
+ return CPUType::Pentium3;
+ case Triple::ArchType::x86_64:
+ return CPUType::X64;
+ case Triple::ArchType::thumb:
+ // LLVM currently doesn't support Windows CE and so thumb
+ // here is indiscriminately mapped to ARMNT specifically.
+ return CPUType::ARMNT;
+ case Triple::ArchType::aarch64:
+ return CPUType::ARM64;
+ default:
+ report_fatal_error("target architecture doesn't map to a CodeView CPUType");
+ }
+}
+
+CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
+ : DebugHandlerBase(AP), OS(*Asm->OutStreamer), TypeTable(Allocator) {}
+
+StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
+ std::string &Filepath = FileToFilepathMap[File];
+ if (!Filepath.empty())
+ return Filepath;
+
+ StringRef Dir = File->getDirectory(), Filename = File->getFilename();
+
+ // If this is a Unix-style path, just use it as is. Don't try to canonicalize
+ // it textually because one of the path components could be a symlink.
+ if (Dir.startswith("/") || Filename.startswith("/")) {
+ if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix))
+ return Filename;
+ Filepath = std::string(Dir);
+ if (Dir.back() != '/')
+ Filepath += '/';
+ Filepath += Filename;
+ return Filepath;
+ }
+
+ // Clang emits directory and relative filename info into the IR, but CodeView
+ // operates on full paths. We could change Clang to emit full paths too, but
+ // that would increase the IR size and is probably not needed for other users.
+ // For now, just concatenate and canonicalize the path here.
+ if (Filename.find(':') == 1)
+ Filepath = std::string(Filename);
+ else
+ Filepath = (Dir + "\\" + Filename).str();
+
+ // Canonicalize the path. We have to do it textually because we may no longer
+ // have access to the file in the filesystem.
+ // First, replace all slashes with backslashes.
+ std::replace(Filepath.begin(), Filepath.end(), '/', '\\');
+
+ // Replace all "\.\" with "\".
+ size_t Cursor = 0;
+ while ((Cursor = Filepath.find("\\.\\", Cursor)) != std::string::npos)
+ Filepath.erase(Cursor, 2);
+
+ // Replace all "\XXX\..\" with "\". Don't try too hard though as the original
+ // path should be well-formatted, e.g. start with a drive letter, etc.
+ Cursor = 0;
+ while ((Cursor = Filepath.find("\\..\\", Cursor)) != std::string::npos) {
+ // Something's wrong if the path starts with "\..\", abort.
+ if (Cursor == 0)
+ break;
+
+ size_t PrevSlash = Filepath.rfind('\\', Cursor - 1);
+ if (PrevSlash == std::string::npos)
+ // Something's wrong, abort.
+ break;
+
+ Filepath.erase(PrevSlash, Cursor + 3 - PrevSlash);
+ // The next ".." might be following the one we've just erased.
+ Cursor = PrevSlash;
+ }
+
+ // Remove all duplicate backslashes.
+ Cursor = 0;
+ while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos)
+ Filepath.erase(Cursor, 1);
+
+ return Filepath;
+}
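+
+ // As an illustrative example: with Dir = "D:\src" and
+ // Filename = "sub/./other\..\file.cpp", the canonicalization above yields
+ // "D:\src\sub\file.cpp": slashes become backslashes, "\.\" is dropped,
+ // "\other\..\" collapses to "\", and duplicate backslashes are merged.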
+
+unsigned CodeViewDebug::maybeRecordFile(const DIFile *F) {
+ StringRef FullPath = getFullFilepath(F);
+ unsigned NextId = FileIdMap.size() + 1;
+ auto Insertion = FileIdMap.insert(std::make_pair(FullPath, NextId));
+ if (Insertion.second) {
+ // We have to compute the full filepath and emit a .cv_file directive.
+ ArrayRef<uint8_t> ChecksumAsBytes;
+ FileChecksumKind CSKind = FileChecksumKind::None;
+ if (F->getChecksum()) {
+ std::string Checksum = fromHex(F->getChecksum()->Value);
+ void *CKMem = OS.getContext().allocate(Checksum.size(), 1);
+ memcpy(CKMem, Checksum.data(), Checksum.size());
+ ChecksumAsBytes = ArrayRef<uint8_t>(
+ reinterpret_cast<const uint8_t *>(CKMem), Checksum.size());
+ switch (F->getChecksum()->Kind) {
+ case DIFile::CSK_MD5:
+ CSKind = FileChecksumKind::MD5;
+ break;
+ case DIFile::CSK_SHA1:
+ CSKind = FileChecksumKind::SHA1;
+ break;
+ case DIFile::CSK_SHA256:
+ CSKind = FileChecksumKind::SHA256;
+ break;
+ }
+ }
+ bool Success = OS.emitCVFileDirective(NextId, FullPath, ChecksumAsBytes,
+ static_cast<unsigned>(CSKind));
+ (void)Success;
+ assert(Success && ".cv_file directive failed");
+ }
+ return Insertion.first->second;
+}
+
+CodeViewDebug::InlineSite &
+CodeViewDebug::getInlineSite(const DILocation *InlinedAt,
+ const DISubprogram *Inlinee) {
+ auto SiteInsertion = CurFn->InlineSites.insert({InlinedAt, InlineSite()});
+ InlineSite *Site = &SiteInsertion.first->second;
+ if (SiteInsertion.second) {
+ unsigned ParentFuncId = CurFn->FuncId;
+ if (const DILocation *OuterIA = InlinedAt->getInlinedAt())
+ ParentFuncId =
+ getInlineSite(OuterIA, InlinedAt->getScope()->getSubprogram())
+ .SiteFuncId;
+
+ Site->SiteFuncId = NextFuncId++;
+ OS.emitCVInlineSiteIdDirective(
+ Site->SiteFuncId, ParentFuncId, maybeRecordFile(InlinedAt->getFile()),
+ InlinedAt->getLine(), InlinedAt->getColumn(), SMLoc());
+ Site->Inlinee = Inlinee;
+ InlinedSubprograms.insert(Inlinee);
+ getFuncIdForSubprogram(Inlinee);
+ }
+ return *Site;
+}
+
+static StringRef getPrettyScopeName(const DIScope *Scope) {
+ StringRef ScopeName = Scope->getName();
+ if (!ScopeName.empty())
+ return ScopeName;
+
+ switch (Scope->getTag()) {
+ case dwarf::DW_TAG_enumeration_type:
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ return "<unnamed-tag>";
+ case dwarf::DW_TAG_namespace:
+ return "`anonymous namespace'";
+ default:
+ return StringRef();
+ }
+}
+
+const DISubprogram *CodeViewDebug::collectParentScopeNames(
+ const DIScope *Scope, SmallVectorImpl<StringRef> &QualifiedNameComponents) {
+ const DISubprogram *ClosestSubprogram = nullptr;
+ while (Scope != nullptr) {
+ if (ClosestSubprogram == nullptr)
+ ClosestSubprogram = dyn_cast<DISubprogram>(Scope);
+
+ // If a type appears in a scope chain, make sure it gets emitted. The
+ // frontend will be responsible for deciding if this should be a forward
+ // declaration or a complete type.
+ if (const auto *Ty = dyn_cast<DICompositeType>(Scope))
+ DeferredCompleteTypes.push_back(Ty);
+
+ StringRef ScopeName = getPrettyScopeName(Scope);
+ if (!ScopeName.empty())
+ QualifiedNameComponents.push_back(ScopeName);
+ Scope = Scope->getScope();
+ }
+ return ClosestSubprogram;
+}
+
+static std::string formatNestedName(ArrayRef<StringRef> QualifiedNameComponents,
+ StringRef TypeName) {
+ std::string FullyQualifiedName;
+ for (StringRef QualifiedNameComponent :
+ llvm::reverse(QualifiedNameComponents)) {
+ FullyQualifiedName.append(std::string(QualifiedNameComponent));
+ FullyQualifiedName.append("::");
+ }
+ FullyQualifiedName.append(std::string(TypeName));
+ return FullyQualifiedName;
+}
+
+struct CodeViewDebug::TypeLoweringScope {
+ TypeLoweringScope(CodeViewDebug &CVD) : CVD(CVD) { ++CVD.TypeEmissionLevel; }
+ ~TypeLoweringScope() {
+ // Don't decrement TypeEmissionLevel until after emitting deferred types, so
+ // inner TypeLoweringScopes don't attempt to emit deferred types.
+ if (CVD.TypeEmissionLevel == 1)
+ CVD.emitDeferredCompleteTypes();
+ --CVD.TypeEmissionLevel;
+ }
+ CodeViewDebug &CVD;
+};
+
+std::string CodeViewDebug::getFullyQualifiedName(const DIScope *Scope,
+ StringRef Name) {
+ // Ensure types in the scope chain are emitted as soon as possible.
+ // Otherwise, this can create a situation where S_UDTs are emitted while
+ // looping in emitDebugInfoForUDTs.
+ TypeLoweringScope S(*this);
+ SmallVector<StringRef, 5> QualifiedNameComponents;
+ collectParentScopeNames(Scope, QualifiedNameComponents);
+ return formatNestedName(QualifiedNameComponents, Name);
+}
+
+std::string CodeViewDebug::getFullyQualifiedName(const DIScope *Ty) {
+ const DIScope *Scope = Ty->getScope();
+ return getFullyQualifiedName(Scope, getPrettyScopeName(Ty));
+}
+
+TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) {
+ // No scope means global scope and that uses the zero index.
+ //
+ // We also use zero index when the scope is a DISubprogram
+ // to suppress the emission of LF_STRING_ID for the function,
+ // which can trigger a link-time error with the linker in
+ // VS2019 version 16.11.2 or newer.
+ // Note, however, skipping the debug info emission for the DISubprogram
+ // is a temporary fix. The root issue here is that we need to figure out
+ // the proper way to encode a function nested in another function
+ // (as introduced by the Fortran 'contains' keyword) in CodeView.
+ if (!Scope || isa<DIFile>(Scope) || isa<DISubprogram>(Scope))
+ return TypeIndex();
+
+ assert(!isa<DIType>(Scope) && "shouldn't make a namespace scope for a type");
+
+ // Check if we've already translated this scope.
+ auto I = TypeIndices.find({Scope, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // Build the fully qualified name of the scope.
+ std::string ScopeName = getFullyQualifiedName(Scope);
+ StringIdRecord SID(TypeIndex(), ScopeName);
+ auto TI = TypeTable.writeLeafType(SID);
+ return recordTypeIndexForDINode(Scope, TI);
+}
+
+static StringRef removeTemplateArgs(StringRef Name) {
+ // Remove template args from the display name. Assume that the template args
+ // are the last thing in the name.
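+  // For example, "make_pair<int, Foo<int>>" becomes "make_pair".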
+ if (Name.empty() || Name.back() != '>')
+ return Name;
+
+ int OpenBrackets = 0;
+ for (int i = Name.size() - 1; i >= 0; --i) {
+ if (Name[i] == '>')
+ ++OpenBrackets;
+ else if (Name[i] == '<') {
+ --OpenBrackets;
+ if (OpenBrackets == 0)
+ return Name.substr(0, i);
+ }
+ }
+ return Name;
+}
+
+TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) {
+ assert(SP);
+
+ // Check if we've already translated this subprogram.
+ auto I = TypeIndices.find({SP, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // The display name includes function template arguments. Drop them to match
+ // MSVC. We need to have the template arguments in the DISubprogram name
+ // because they are used in other symbol records, such as S_GPROC32_IDs.
+ StringRef DisplayName = removeTemplateArgs(SP->getName());
+
+ const DIScope *Scope = SP->getScope();
+ TypeIndex TI;
+ if (const auto *Class = dyn_cast_or_null<DICompositeType>(Scope)) {
+ // If the scope is a DICompositeType, then this must be a method. Member
+ // function types take some special handling, and require access to the
+ // subprogram.
+ TypeIndex ClassType = getTypeIndex(Class);
+ MemberFuncIdRecord MFuncId(ClassType, getMemberFunctionType(SP, Class),
+ DisplayName);
+ TI = TypeTable.writeLeafType(MFuncId);
+ } else {
+ // Otherwise, this must be a free function.
+ TypeIndex ParentScope = getScopeIndex(Scope);
+ FuncIdRecord FuncId(ParentScope, getTypeIndex(SP->getType()), DisplayName);
+ TI = TypeTable.writeLeafType(FuncId);
+ }
+
+ return recordTypeIndexForDINode(SP, TI);
+}
+
+static bool isNonTrivial(const DICompositeType *DCTy) {
+ return ((DCTy->getFlags() & DINode::FlagNonTrivial) == DINode::FlagNonTrivial);
+}
+
+static FunctionOptions
+getFunctionOptions(const DISubroutineType *Ty,
+ const DICompositeType *ClassTy = nullptr,
+ StringRef SPName = StringRef("")) {
+ FunctionOptions FO = FunctionOptions::None;
+ const DIType *ReturnTy = nullptr;
+ if (auto TypeArray = Ty->getTypeArray()) {
+ if (TypeArray.size())
+ ReturnTy = TypeArray[0];
+ }
+
+ // Add CxxReturnUdt option to functions that return nontrivial record types
+ // or methods that return record types.
+ if (auto *ReturnDCTy = dyn_cast_or_null<DICompositeType>(ReturnTy))
+ if (isNonTrivial(ReturnDCTy) || ClassTy)
+ FO |= FunctionOptions::CxxReturnUdt;
+
+  // DISubroutineType is unnamed. Use the DISubprogram's name, i.e. SPName,
+  // for the comparison.
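+  // For example, a method named "Foo" on a non-trivial class also named "Foo"
+  // is recognized as a constructor here.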
+ if (ClassTy && isNonTrivial(ClassTy) && SPName == ClassTy->getName()) {
+ FO |= FunctionOptions::Constructor;
+
+    // TODO: Set the FunctionOptions::ConstructorWithVirtualBases flag.
+
+ }
+ return FO;
+}
+
+TypeIndex CodeViewDebug::getMemberFunctionType(const DISubprogram *SP,
+ const DICompositeType *Class) {
+ // Always use the method declaration as the key for the function type. The
+ // method declaration contains the this adjustment.
+ if (SP->getDeclaration())
+ SP = SP->getDeclaration();
+ assert(!SP->getDeclaration() && "should use declaration as key");
+
+ // Key the MemberFunctionRecord into the map as {SP, Class}. It won't collide
+ // with the MemberFuncIdRecord, which is keyed in as {SP, nullptr}.
+ auto I = TypeIndices.find({SP, Class});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // Make sure complete type info for the class is emitted *after* the member
+ // function type, as the complete class type is likely to reference this
+ // member function type.
+ TypeLoweringScope S(*this);
+ const bool IsStaticMethod = (SP->getFlags() & DINode::FlagStaticMember) != 0;
+
+ FunctionOptions FO = getFunctionOptions(SP->getType(), Class, SP->getName());
+ TypeIndex TI = lowerTypeMemberFunction(
+ SP->getType(), Class, SP->getThisAdjustment(), IsStaticMethod, FO);
+ return recordTypeIndexForDINode(SP, TI, Class);
+}
+
+TypeIndex CodeViewDebug::recordTypeIndexForDINode(const DINode *Node,
+ TypeIndex TI,
+ const DIType *ClassTy) {
+ auto InsertResult = TypeIndices.insert({{Node, ClassTy}, TI});
+ (void)InsertResult;
+ assert(InsertResult.second && "DINode was already assigned a type index");
+ return TI;
+}
+
+unsigned CodeViewDebug::getPointerSizeInBytes() {
+ return MMI->getModule()->getDataLayout().getPointerSizeInBits() / 8;
+}
+
+void CodeViewDebug::recordLocalVariable(LocalVariable &&Var,
+ const LexicalScope *LS) {
+ if (const DILocation *InlinedAt = LS->getInlinedAt()) {
+ // This variable was inlined. Associate it with the InlineSite.
+ const DISubprogram *Inlinee = Var.DIVar->getScope()->getSubprogram();
+ InlineSite &Site = getInlineSite(InlinedAt, Inlinee);
+ Site.InlinedLocals.emplace_back(std::move(Var));
+ } else {
+ // This variable goes into the corresponding lexical scope.
+ ScopeVariables[LS].emplace_back(std::move(Var));
+ }
+}
+
+static void addLocIfNotPresent(SmallVectorImpl<const DILocation *> &Locs,
+ const DILocation *Loc) {
+ if (!llvm::is_contained(Locs, Loc))
+ Locs.push_back(Loc);
+}
+
+void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL,
+ const MachineFunction *MF) {
+ // Skip this instruction if it has the same location as the previous one.
+ if (!DL || DL == PrevInstLoc)
+ return;
+
+ const DIScope *Scope = DL->getScope();
+ if (!Scope)
+ return;
+
+ // Skip this line if it is longer than the maximum we can record.
+ LineInfo LI(DL.getLine(), DL.getLine(), /*IsStatement=*/true);
+ if (LI.getStartLine() != DL.getLine() || LI.isAlwaysStepInto() ||
+ LI.isNeverStepInto())
+ return;
+
+ ColumnInfo CI(DL.getCol(), /*EndColumn=*/0);
+ if (CI.getStartColumn() != DL.getCol())
+ return;
+
+ if (!CurFn->HaveLineInfo)
+ CurFn->HaveLineInfo = true;
+ unsigned FileId = 0;
+ if (PrevInstLoc.get() && PrevInstLoc->getFile() == DL->getFile())
+ FileId = CurFn->LastFileId;
+ else
+ FileId = CurFn->LastFileId = maybeRecordFile(DL->getFile());
+ PrevInstLoc = DL;
+
+ unsigned FuncId = CurFn->FuncId;
+ if (const DILocation *SiteLoc = DL->getInlinedAt()) {
+ const DILocation *Loc = DL.get();
+
+ // If this location was actually inlined from somewhere else, give it the ID
+ // of the inline call site.
+ FuncId =
+ getInlineSite(SiteLoc, Loc->getScope()->getSubprogram()).SiteFuncId;
+
+ // Ensure we have links in the tree of inline call sites.
+ bool FirstLoc = true;
+ while ((SiteLoc = Loc->getInlinedAt())) {
+ InlineSite &Site =
+ getInlineSite(SiteLoc, Loc->getScope()->getSubprogram());
+ if (!FirstLoc)
+ addLocIfNotPresent(Site.ChildSites, Loc);
+ FirstLoc = false;
+ Loc = SiteLoc;
+ }
+ addLocIfNotPresent(CurFn->ChildSites, Loc);
+ }
+
+ OS.emitCVLocDirective(FuncId, FileId, DL.getLine(), DL.getCol(),
+ /*PrologueEnd=*/false, /*IsStmt=*/false,
+ DL->getFilename(), SMLoc());
+}
+
+void CodeViewDebug::emitCodeViewMagicVersion() {
+ OS.emitValueToAlignment(Align(4));
+ OS.AddComment("Debug section magic");
+ OS.emitInt32(COFF::DEBUG_SECTION_MAGIC);
+}
+
+static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
+ switch (DWLang) {
+ case dwarf::DW_LANG_C:
+ case dwarf::DW_LANG_C89:
+ case dwarf::DW_LANG_C99:
+ case dwarf::DW_LANG_C11:
+ return SourceLanguage::C;
+ case dwarf::DW_LANG_C_plus_plus:
+ case dwarf::DW_LANG_C_plus_plus_03:
+ case dwarf::DW_LANG_C_plus_plus_11:
+ case dwarf::DW_LANG_C_plus_plus_14:
+ return SourceLanguage::Cpp;
+ case dwarf::DW_LANG_Fortran77:
+ case dwarf::DW_LANG_Fortran90:
+ case dwarf::DW_LANG_Fortran95:
+ case dwarf::DW_LANG_Fortran03:
+ case dwarf::DW_LANG_Fortran08:
+ return SourceLanguage::Fortran;
+ case dwarf::DW_LANG_Pascal83:
+ return SourceLanguage::Pascal;
+ case dwarf::DW_LANG_Cobol74:
+ case dwarf::DW_LANG_Cobol85:
+ return SourceLanguage::Cobol;
+ case dwarf::DW_LANG_Java:
+ return SourceLanguage::Java;
+ case dwarf::DW_LANG_D:
+ return SourceLanguage::D;
+ case dwarf::DW_LANG_Swift:
+ return SourceLanguage::Swift;
+ case dwarf::DW_LANG_Rust:
+ return SourceLanguage::Rust;
+ case dwarf::DW_LANG_ObjC:
+ return SourceLanguage::ObjC;
+ case dwarf::DW_LANG_ObjC_plus_plus:
+ return SourceLanguage::ObjCpp;
+ default:
+ // There's no CodeView representation for this language, and CV doesn't
+ // have an "unknown" option for the language field, so we'll use MASM,
+ // as it's very low level.
+ return SourceLanguage::Masm;
+ }
+}
+
+void CodeViewDebug::beginModule(Module *M) {
+ // If module doesn't have named metadata anchors or COFF debug section
+ // is not available, skip any debug info related stuff.
+ if (!MMI->hasDebugInfo() ||
+ !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
+ Asm = nullptr;
+ return;
+ }
+
+ TheCPU = mapArchToCVCPUType(Triple(M->getTargetTriple()).getArch());
+
+ // Get the current source language.
+ const MDNode *Node = *M->debug_compile_units_begin();
+ const auto *CU = cast<DICompileUnit>(Node);
+
+ CurrentSourceLanguage = MapDWLangToCVLang(CU->getSourceLanguage());
+
+ collectGlobalVariableInfo();
+
+ // Check if we should emit type record hashes.
+ ConstantInt *GH =
+ mdconst::extract_or_null<ConstantInt>(M->getModuleFlag("CodeViewGHash"));
+ EmitDebugGlobalHashes = GH && !GH->isZero();
+}
+
+void CodeViewDebug::endModule() {
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ // The COFF .debug$S section consists of several subsections, each starting
+ // with a 4-byte control code (e.g. 0xF1, 0xF2, etc) and then a 4-byte length
+ // of the payload followed by the payload itself. The subsections are 4-byte
+ // aligned.
+
+ // Use the generic .debug$S section, and make a subsection for all the inlined
+ // subprograms.
+ switchToDebugSectionForSymbol(nullptr);
+
+ MCSymbol *CompilerInfo = beginCVSubsection(DebugSubsectionKind::Symbols);
+ emitObjName();
+ emitCompilerInformation();
+ endCVSubsection(CompilerInfo);
+
+ emitInlineeLinesSubsection();
+
+ // Emit per-function debug information.
+ for (auto &P : FnDebugInfo)
+ if (!P.first->isDeclarationForLinker())
+ emitDebugInfoForFunction(P.first, *P.second);
+
+ // Get types used by globals without emitting anything.
+ // This is meant to collect all static const data members so they can be
+ // emitted as globals.
+ collectDebugInfoForGlobals();
+
+ // Emit retained types.
+ emitDebugInfoForRetainedTypes();
+
+ // Emit global variable debug information.
+ setCurrentSubprogram(nullptr);
+ emitDebugInfoForGlobals();
+
+ // Switch back to the generic .debug$S section after potentially processing
+ // comdat symbol sections.
+ switchToDebugSectionForSymbol(nullptr);
+
+ // Emit UDT records for any types used by global variables.
+ if (!GlobalUDTs.empty()) {
+ MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
+ emitDebugInfoForUDTs(GlobalUDTs);
+ endCVSubsection(SymbolsEnd);
+ }
+
+  // This subsection holds the file index to string table offset table.
+ OS.AddComment("File index to string table offset subsection");
+ OS.emitCVFileChecksumsDirective();
+
+ // This subsection holds the string table.
+ OS.AddComment("String table");
+ OS.emitCVStringTableDirective();
+
+ // Emit S_BUILDINFO, which points to LF_BUILDINFO. Put this in its own symbol
+ // subsection in the generic .debug$S section at the end. There is no
+ // particular reason for this ordering other than to match MSVC.
+ emitBuildInfo();
+
+ // Emit type information and hashes last, so that any types we translate while
+ // emitting function info are included.
+ emitTypeInformation();
+
+ if (EmitDebugGlobalHashes)
+ emitTypeGlobalHashes();
+
+ clear();
+}
+
+static void
+emitNullTerminatedSymbolName(MCStreamer &OS, StringRef S,
+ unsigned MaxFixedRecordLength = 0xF00) {
+ // The maximum CV record length is 0xFF00. Most of the strings we emit appear
+ // after a fixed length portion of the record. The fixed length portion should
+ // always be less than 0xF00 (3840) bytes, so truncate the string so that the
+ // overall record size is less than the maximum allowed.
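+  // With the default MaxFixedRecordLength of 0xF00, names are truncated to
+  // 0xEFFF (61439) bytes before the null terminator is appended.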
+ SmallString<32> NullTerminatedString(
+ S.take_front(MaxRecordLength - MaxFixedRecordLength - 1));
+ NullTerminatedString.push_back('\0');
+ OS.emitBytes(NullTerminatedString);
+}
+
+void CodeViewDebug::emitTypeInformation() {
+ if (TypeTable.empty())
+ return;
+
+ // Start the .debug$T or .debug$P section with 0x4.
+ OS.switchSection(Asm->getObjFileLowering().getCOFFDebugTypesSection());
+ emitCodeViewMagicVersion();
+
+ TypeTableCollection Table(TypeTable.records());
+ TypeVisitorCallbackPipeline Pipeline;
+
+  // Emit type records using the CodeView MCStreamer adapter.
+ CVMCAdapter CVMCOS(OS, Table);
+ TypeRecordMapping typeMapping(CVMCOS);
+ Pipeline.addCallbackToPipeline(typeMapping);
+
+ std::optional<TypeIndex> B = Table.getFirst();
+ while (B) {
+ // This will fail if the record data is invalid.
+ CVType Record = Table.getType(*B);
+
+ Error E = codeview::visitTypeRecord(Record, *B, Pipeline);
+
+ if (E) {
+ logAllUnhandledErrors(std::move(E), errs(), "error: ");
+ llvm_unreachable("produced malformed type record");
+ }
+
+ B = Table.getNext(*B);
+ }
+}
+
+void CodeViewDebug::emitTypeGlobalHashes() {
+ if (TypeTable.empty())
+ return;
+
+  // Start the .debug$H section with the version and hash algorithm, currently
+  // hardcoded to version 0 and the truncated BLAKE3 hash.
+ OS.switchSection(Asm->getObjFileLowering().getCOFFGlobalTypeHashesSection());
+
+ OS.emitValueToAlignment(Align(4));
+ OS.AddComment("Magic");
+ OS.emitInt32(COFF::DEBUG_HASHES_SECTION_MAGIC);
+ OS.AddComment("Section Version");
+ OS.emitInt16(0);
+ OS.AddComment("Hash Algorithm");
+ OS.emitInt16(uint16_t(GlobalTypeHashAlg::BLAKE3));
+
+ TypeIndex TI(TypeIndex::FirstNonSimpleIndex);
+ for (const auto &GHR : TypeTable.hashes()) {
+ if (OS.isVerboseAsm()) {
+      // Emit an EOL-comment describing which TypeIndex this hash corresponds
+      // to, as well as the stringified hash value.
+ SmallString<32> Comment;
+ raw_svector_ostream CommentOS(Comment);
+ CommentOS << formatv("{0:X+} [{1}]", TI.getIndex(), GHR);
+ OS.AddComment(Comment);
+ ++TI;
+ }
+ assert(GHR.Hash.size() == 8);
+ StringRef S(reinterpret_cast<const char *>(GHR.Hash.data()),
+ GHR.Hash.size());
+ OS.emitBinaryData(S);
+ }
+}
+
+void CodeViewDebug::emitObjName() {
+ MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_OBJNAME);
+
+ StringRef PathRef(Asm->TM.Options.ObjectFilenameForDebug);
+ llvm::SmallString<256> PathStore(PathRef);
+
+ if (PathRef.empty() || PathRef == "-") {
+ // Don't emit the filename if we're writing to stdout or to /dev/null.
+ PathRef = {};
+ } else {
+ PathRef = PathStore;
+ }
+
+ OS.AddComment("Signature");
+ OS.emitIntValue(0, 4);
+
+ OS.AddComment("Object name");
+ emitNullTerminatedSymbolName(OS, PathRef);
+
+ endSymbolRecord(CompilerEnd);
+}
+
+namespace {
+struct Version {
+ int Part[4];
+};
+} // end anonymous namespace
+
+// Takes a StringRef like "clang 4.0.0.0 (other nonsense 123)" and parses out
+// the version number.
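+// For example, "clang version 15.0.7" yields {15, 0, 7, 0}.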
+static Version parseVersion(StringRef Name) {
+ Version V = {{0}};
+ int N = 0;
+ for (const char C : Name) {
+ if (isdigit(C)) {
+ V.Part[N] *= 10;
+ V.Part[N] += C - '0';
+ V.Part[N] =
+ std::min<int>(V.Part[N], std::numeric_limits<uint16_t>::max());
+ } else if (C == '.') {
+ ++N;
+ if (N >= 4)
+ return V;
+ } else if (N > 0)
+ return V;
+ }
+ return V;
+}
+
+void CodeViewDebug::emitCompilerInformation() {
+ MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_COMPILE3);
+ uint32_t Flags = 0;
+
+ // The low byte of the flags indicates the source language.
+ Flags = CurrentSourceLanguage;
+ // TODO: Figure out which other flags need to be set.
+ if (MMI->getModule()->getProfileSummary(/*IsCS*/ false) != nullptr) {
+ Flags |= static_cast<uint32_t>(CompileSym3Flags::PGO);
+ }
+ using ArchType = llvm::Triple::ArchType;
+ ArchType Arch = Triple(MMI->getModule()->getTargetTriple()).getArch();
+ if (Asm->TM.Options.Hotpatch || Arch == ArchType::thumb ||
+ Arch == ArchType::aarch64) {
+ Flags |= static_cast<uint32_t>(CompileSym3Flags::HotPatch);
+ }
+
+ OS.AddComment("Flags and language");
+ OS.emitInt32(Flags);
+
+ OS.AddComment("CPUType");
+ OS.emitInt16(static_cast<uint64_t>(TheCPU));
+
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ const MDNode *Node = *CUs->operands().begin();
+ const auto *CU = cast<DICompileUnit>(Node);
+
+ StringRef CompilerVersion = CU->getProducer();
+ Version FrontVer = parseVersion(CompilerVersion);
+ OS.AddComment("Frontend version");
+ for (int N : FrontVer.Part) {
+ OS.emitInt16(N);
+ }
+
+ // Some Microsoft tools, like Binscope, expect a backend version number of at
+ // least 8.something, so we'll coerce the LLVM version into a form that
+ // guarantees it'll be big enough without really lying about the version.
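+  // For example, LLVM 17.0.6 becomes a backend major version of
+  // 1000 * 17 + 10 * 0 + 6 = 17006.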
+ int Major = 1000 * LLVM_VERSION_MAJOR +
+ 10 * LLVM_VERSION_MINOR +
+ LLVM_VERSION_PATCH;
+ // Clamp it for builds that use unusually large version numbers.
+ Major = std::min<int>(Major, std::numeric_limits<uint16_t>::max());
+ Version BackVer = {{ Major, 0, 0, 0 }};
+ OS.AddComment("Backend version");
+ for (int N : BackVer.Part)
+ OS.emitInt16(N);
+
+ OS.AddComment("Null-terminated compiler version string");
+ emitNullTerminatedSymbolName(OS, CompilerVersion);
+
+ endSymbolRecord(CompilerEnd);
+}
+
+static TypeIndex getStringIdTypeIdx(GlobalTypeTableBuilder &TypeTable,
+ StringRef S) {
+ StringIdRecord SIR(TypeIndex(0x0), S);
+ return TypeTable.writeLeafType(SIR);
+}
+
+static std::string flattenCommandLine(ArrayRef<std::string> Args,
+ StringRef MainFilename) {
+ std::string FlatCmdLine;
+ raw_string_ostream OS(FlatCmdLine);
+ bool PrintedOneArg = false;
+ if (!StringRef(Args[0]).contains("-cc1")) {
+ llvm::sys::printArg(OS, "-cc1", /*Quote=*/true);
+ PrintedOneArg = true;
+ }
+ for (unsigned i = 0; i < Args.size(); i++) {
+ StringRef Arg = Args[i];
+ if (Arg.empty())
+ continue;
+ if (Arg == "-main-file-name" || Arg == "-o") {
+ i++; // Skip this argument and next one.
+ continue;
+ }
+ if (Arg.startswith("-object-file-name") || Arg == MainFilename)
+ continue;
+    // Skip -fmessage-length for reproducibility.
+ if (Arg.startswith("-fmessage-length"))
+ continue;
+ if (PrintedOneArg)
+ OS << " ";
+ llvm::sys::printArg(OS, Arg, /*Quote=*/true);
+ PrintedOneArg = true;
+ }
+ OS.flush();
+ return FlatCmdLine;
+}
+
+void CodeViewDebug::emitBuildInfo() {
+ // First, make LF_BUILDINFO. It's a sequence of strings with various bits of
+ // build info. The known prefix is:
+ // - Absolute path of current directory
+ // - Compiler path
+ // - Main source file path, relative to CWD or absolute
+ // - Type server PDB file
+ // - Canonical compiler command line
+ // If frontend and backend compilation are separated (think llc or LTO), it's
+ // not clear if the compiler path should refer to the executable for the
+ // frontend or the backend. Leave it blank for now.
+ TypeIndex BuildInfoArgs[BuildInfoRecord::MaxArgs] = {};
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ const MDNode *Node = *CUs->operands().begin(); // FIXME: Multiple CUs.
+ const auto *CU = cast<DICompileUnit>(Node);
+ const DIFile *MainSourceFile = CU->getFile();
+ BuildInfoArgs[BuildInfoRecord::CurrentDirectory] =
+ getStringIdTypeIdx(TypeTable, MainSourceFile->getDirectory());
+ BuildInfoArgs[BuildInfoRecord::SourceFile] =
+ getStringIdTypeIdx(TypeTable, MainSourceFile->getFilename());
+ // FIXME: PDB is intentionally blank unless we implement /Zi type servers.
+ BuildInfoArgs[BuildInfoRecord::TypeServerPDB] =
+ getStringIdTypeIdx(TypeTable, "");
+ if (Asm->TM.Options.MCOptions.Argv0 != nullptr) {
+ BuildInfoArgs[BuildInfoRecord::BuildTool] =
+ getStringIdTypeIdx(TypeTable, Asm->TM.Options.MCOptions.Argv0);
+ BuildInfoArgs[BuildInfoRecord::CommandLine] = getStringIdTypeIdx(
+ TypeTable, flattenCommandLine(Asm->TM.Options.MCOptions.CommandLineArgs,
+ MainSourceFile->getFilename()));
+ }
+ BuildInfoRecord BIR(BuildInfoArgs);
+ TypeIndex BuildInfoIndex = TypeTable.writeLeafType(BIR);
+
+ // Make a new .debug$S subsection for the S_BUILDINFO record, which points
+ // from the module symbols into the type stream.
+ MCSymbol *BISubsecEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
+ MCSymbol *BIEnd = beginSymbolRecord(SymbolKind::S_BUILDINFO);
+ OS.AddComment("LF_BUILDINFO index");
+ OS.emitInt32(BuildInfoIndex.getIndex());
+ endSymbolRecord(BIEnd);
+ endCVSubsection(BISubsecEnd);
+}
+
+void CodeViewDebug::emitInlineeLinesSubsection() {
+ if (InlinedSubprograms.empty())
+ return;
+
+ OS.AddComment("Inlinee lines subsection");
+ MCSymbol *InlineEnd = beginCVSubsection(DebugSubsectionKind::InlineeLines);
+
+ // We emit the checksum info for files. This is used by debuggers to
+ // determine if a pdb matches the source before loading it. Visual Studio,
+ // for instance, will display a warning that the breakpoints are not valid if
+ // the pdb does not match the source.
+ OS.AddComment("Inlinee lines signature");
+ OS.emitInt32(unsigned(InlineeLinesSignature::Normal));
+
+ for (const DISubprogram *SP : InlinedSubprograms) {
+ assert(TypeIndices.count({SP, nullptr}));
+ TypeIndex InlineeIdx = TypeIndices[{SP, nullptr}];
+
+ OS.addBlankLine();
+ unsigned FileId = maybeRecordFile(SP->getFile());
+ OS.AddComment("Inlined function " + SP->getName() + " starts at " +
+ SP->getFilename() + Twine(':') + Twine(SP->getLine()));
+ OS.addBlankLine();
+ OS.AddComment("Type index of inlined function");
+ OS.emitInt32(InlineeIdx.getIndex());
+ OS.AddComment("Offset into filechecksum table");
+ OS.emitCVFileChecksumOffsetDirective(FileId);
+ OS.AddComment("Starting line number");
+ OS.emitInt32(SP->getLine());
+ }
+
+ endCVSubsection(InlineEnd);
+}
+
+void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI,
+ const DILocation *InlinedAt,
+ const InlineSite &Site) {
+ assert(TypeIndices.count({Site.Inlinee, nullptr}));
+ TypeIndex InlineeIdx = TypeIndices[{Site.Inlinee, nullptr}];
+
+ // SymbolRecord
+ MCSymbol *InlineEnd = beginSymbolRecord(SymbolKind::S_INLINESITE);
+
+ OS.AddComment("PtrParent");
+ OS.emitInt32(0);
+ OS.AddComment("PtrEnd");
+ OS.emitInt32(0);
+ OS.AddComment("Inlinee type index");
+ OS.emitInt32(InlineeIdx.getIndex());
+
+ unsigned FileId = maybeRecordFile(Site.Inlinee->getFile());
+ unsigned StartLineNum = Site.Inlinee->getLine();
+
+ OS.emitCVInlineLinetableDirective(Site.SiteFuncId, FileId, StartLineNum,
+ FI.Begin, FI.End);
+
+ endSymbolRecord(InlineEnd);
+
+ emitLocalVariableList(FI, Site.InlinedLocals);
+
+ // Recurse on child inlined call sites before closing the scope.
+ for (const DILocation *ChildSite : Site.ChildSites) {
+ auto I = FI.InlineSites.find(ChildSite);
+ assert(I != FI.InlineSites.end() &&
+ "child site not in function inline site map");
+ emitInlinedCallSite(FI, ChildSite, I->second);
+ }
+
+ // Close the scope.
+ emitEndSymbolRecord(SymbolKind::S_INLINESITE_END);
+}
+
+void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
+ // If we have a symbol, it may be in a section that is COMDAT. If so, find the
+ // comdat key. A section may be comdat because of -ffunction-sections or
+ // because it is comdat in the IR.
+ MCSectionCOFF *GVSec =
+ GVSym ? dyn_cast<MCSectionCOFF>(&GVSym->getSection()) : nullptr;
+ const MCSymbol *KeySym = GVSec ? GVSec->getCOMDATSymbol() : nullptr;
+
+ MCSectionCOFF *DebugSec = cast<MCSectionCOFF>(
+ Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
+ DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym);
+
+ OS.switchSection(DebugSec);
+
+ // Emit the magic version number if this is the first time we've switched to
+ // this section.
+ if (ComdatDebugSections.insert(DebugSec).second)
+ emitCodeViewMagicVersion();
+}
+
+// Emit an S_THUNK32/S_END symbol pair for a thunk routine.
+// The only supported thunk ordinal is currently the standard type.
+void CodeViewDebug::emitDebugInfoForThunk(const Function *GV,
+ FunctionInfo &FI,
+ const MCSymbol *Fn) {
+ std::string FuncName =
+ std::string(GlobalValue::dropLLVMManglingEscape(GV->getName()));
+ const ThunkOrdinal ordinal = ThunkOrdinal::Standard; // Only supported kind.
+
+ OS.AddComment("Symbol subsection for " + Twine(FuncName));
+ MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
+
+ // Emit S_THUNK32
+ MCSymbol *ThunkRecordEnd = beginSymbolRecord(SymbolKind::S_THUNK32);
+ OS.AddComment("PtrParent");
+ OS.emitInt32(0);
+ OS.AddComment("PtrEnd");
+ OS.emitInt32(0);
+ OS.AddComment("PtrNext");
+ OS.emitInt32(0);
+ OS.AddComment("Thunk section relative address");
+ OS.emitCOFFSecRel32(Fn, /*Offset=*/0);
+ OS.AddComment("Thunk section index");
+ OS.emitCOFFSectionIndex(Fn);
+ OS.AddComment("Code size");
+ OS.emitAbsoluteSymbolDiff(FI.End, Fn, 2);
+ OS.AddComment("Ordinal");
+ OS.emitInt8(unsigned(ordinal));
+ OS.AddComment("Function name");
+ emitNullTerminatedSymbolName(OS, FuncName);
+ // Additional fields specific to the thunk ordinal would go here.
+ endSymbolRecord(ThunkRecordEnd);
+
+ // Local variables/inlined routines are purposely omitted here. The point of
+ // marking this as a thunk is so Visual Studio will NOT stop in this routine.
+
+ // Emit S_PROC_ID_END
+ emitEndSymbolRecord(SymbolKind::S_PROC_ID_END);
+
+ endCVSubsection(SymbolsEnd);
+}
+
+void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
+ FunctionInfo &FI) {
+ // For each function there is a separate subsection which holds the PC to
+ // file:line table.
+ const MCSymbol *Fn = Asm->getSymbol(GV);
+ assert(Fn);
+
+  // Switch to the comdat section, if appropriate.
+ switchToDebugSectionForSymbol(Fn);
+
+ std::string FuncName;
+ auto *SP = GV->getSubprogram();
+ assert(SP);
+ setCurrentSubprogram(SP);
+
+ if (SP->isThunk()) {
+ emitDebugInfoForThunk(GV, FI, Fn);
+ return;
+ }
+
+ // If we have a display name, build the fully qualified name by walking the
+ // chain of scopes.
+ if (!SP->getName().empty())
+ FuncName = getFullyQualifiedName(SP->getScope(), SP->getName());
+
+ // If our DISubprogram name is empty, use the mangled name.
+ if (FuncName.empty())
+ FuncName = std::string(GlobalValue::dropLLVMManglingEscape(GV->getName()));
+
+ // Emit FPO data, but only on 32-bit x86. No other platforms use it.
+ if (Triple(MMI->getModule()->getTargetTriple()).getArch() == Triple::x86)
+ OS.emitCVFPOData(Fn);
+
+ // Emit a symbol subsection, required by VS2012+ to find function boundaries.
+ OS.AddComment("Symbol subsection for " + Twine(FuncName));
+ MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
+ {
+ SymbolKind ProcKind = GV->hasLocalLinkage() ? SymbolKind::S_LPROC32_ID
+ : SymbolKind::S_GPROC32_ID;
+ MCSymbol *ProcRecordEnd = beginSymbolRecord(ProcKind);
+
+ // These fields are filled in by tools like CVPACK which run after the fact.
+ OS.AddComment("PtrParent");
+ OS.emitInt32(0);
+ OS.AddComment("PtrEnd");
+ OS.emitInt32(0);
+ OS.AddComment("PtrNext");
+ OS.emitInt32(0);
+    // This is the important bit that tells the debugger where the function
+    // code is located and what its size is:
+ OS.AddComment("Code size");
+ OS.emitAbsoluteSymbolDiff(FI.End, Fn, 4);
+ OS.AddComment("Offset after prologue");
+ OS.emitInt32(0);
+ OS.AddComment("Offset before epilogue");
+ OS.emitInt32(0);
+ OS.AddComment("Function type index");
+ OS.emitInt32(getFuncIdForSubprogram(GV->getSubprogram()).getIndex());
+ OS.AddComment("Function section relative address");
+ OS.emitCOFFSecRel32(Fn, /*Offset=*/0);
+ OS.AddComment("Function section index");
+ OS.emitCOFFSectionIndex(Fn);
+ OS.AddComment("Flags");
+ ProcSymFlags ProcFlags = ProcSymFlags::HasOptimizedDebugInfo;
+ if (FI.HasFramePointer)
+ ProcFlags |= ProcSymFlags::HasFP;
+ if (GV->hasFnAttribute(Attribute::NoReturn))
+ ProcFlags |= ProcSymFlags::IsNoReturn;
+ if (GV->hasFnAttribute(Attribute::NoInline))
+ ProcFlags |= ProcSymFlags::IsNoInline;
+ OS.emitInt8(static_cast<uint8_t>(ProcFlags));
+ // Emit the function display name as a null-terminated string.
+ OS.AddComment("Function name");
+ // Truncate the name so we won't overflow the record length field.
+ emitNullTerminatedSymbolName(OS, FuncName);
+ endSymbolRecord(ProcRecordEnd);
+
+ MCSymbol *FrameProcEnd = beginSymbolRecord(SymbolKind::S_FRAMEPROC);
+ // Subtract out the CSR size since MSVC excludes that and we include it.
+ OS.AddComment("FrameSize");
+ OS.emitInt32(FI.FrameSize - FI.CSRSize);
+ OS.AddComment("Padding");
+ OS.emitInt32(0);
+ OS.AddComment("Offset of padding");
+ OS.emitInt32(0);
+ OS.AddComment("Bytes of callee saved registers");
+ OS.emitInt32(FI.CSRSize);
+ OS.AddComment("Exception handler offset");
+ OS.emitInt32(0);
+ OS.AddComment("Exception handler section");
+ OS.emitInt16(0);
+ OS.AddComment("Flags (defines frame register)");
+ OS.emitInt32(uint32_t(FI.FrameProcOpts));
+ endSymbolRecord(FrameProcEnd);
+
+ emitLocalVariableList(FI, FI.Locals);
+ emitGlobalVariableList(FI.Globals);
+ emitLexicalBlockList(FI.ChildBlocks, FI);
+
+ // Emit inlined call site information. Only emit functions inlined directly
+ // into the parent function. We'll emit the other sites recursively as part
+ // of their parent inline site.
+ for (const DILocation *InlinedAt : FI.ChildSites) {
+ auto I = FI.InlineSites.find(InlinedAt);
+ assert(I != FI.InlineSites.end() &&
+ "child site not in function inline site map");
+ emitInlinedCallSite(FI, InlinedAt, I->second);
+ }
+
+ for (auto Annot : FI.Annotations) {
+ MCSymbol *Label = Annot.first;
+ MDTuple *Strs = cast<MDTuple>(Annot.second);
+ MCSymbol *AnnotEnd = beginSymbolRecord(SymbolKind::S_ANNOTATION);
+ OS.emitCOFFSecRel32(Label, /*Offset=*/0);
+ // FIXME: Make sure we don't overflow the max record size.
+ OS.emitCOFFSectionIndex(Label);
+ OS.emitInt16(Strs->getNumOperands());
+ for (Metadata *MD : Strs->operands()) {
+      // MDStrings are null terminated, so we can use emitBytes and get the
+ // nice .asciz directive.
+ StringRef Str = cast<MDString>(MD)->getString();
+ assert(Str.data()[Str.size()] == '\0' && "non-nullterminated MDString");
+ OS.emitBytes(StringRef(Str.data(), Str.size() + 1));
+ }
+ endSymbolRecord(AnnotEnd);
+ }
+
+ for (auto HeapAllocSite : FI.HeapAllocSites) {
+ const MCSymbol *BeginLabel = std::get<0>(HeapAllocSite);
+ const MCSymbol *EndLabel = std::get<1>(HeapAllocSite);
+ const DIType *DITy = std::get<2>(HeapAllocSite);
+ MCSymbol *HeapAllocEnd = beginSymbolRecord(SymbolKind::S_HEAPALLOCSITE);
+ OS.AddComment("Call site offset");
+ OS.emitCOFFSecRel32(BeginLabel, /*Offset=*/0);
+ OS.AddComment("Call site section index");
+ OS.emitCOFFSectionIndex(BeginLabel);
+ OS.AddComment("Call instruction length");
+ OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 2);
+ OS.AddComment("Type index");
+ OS.emitInt32(getCompleteTypeIndex(DITy).getIndex());
+ endSymbolRecord(HeapAllocEnd);
+ }
+
+ if (SP != nullptr)
+ emitDebugInfoForUDTs(LocalUDTs);
+
+ // We're done with this function.
+ emitEndSymbolRecord(SymbolKind::S_PROC_ID_END);
+ }
+ endCVSubsection(SymbolsEnd);
+
+ // We have an assembler directive that takes care of the whole line table.
+ OS.emitCVLinetableDirective(FI.FuncId, Fn, FI.End);
+}
+
+CodeViewDebug::LocalVarDef
+CodeViewDebug::createDefRangeMem(uint16_t CVRegister, int Offset) {
+ LocalVarDef DR;
+ DR.InMemory = -1;
+ DR.DataOffset = Offset;
+ assert(DR.DataOffset == Offset && "truncation");
+ DR.IsSubfield = 0;
+ DR.StructOffset = 0;
+ DR.CVRegister = CVRegister;
+ return DR;
+}
+
+void CodeViewDebug::collectVariableInfoFromMFTable(
+ DenseSet<InlinedEntity> &Processed) {
+ const MachineFunction &MF = *Asm->MF;
+ const TargetSubtargetInfo &TSI = MF.getSubtarget();
+ const TargetFrameLowering *TFI = TSI.getFrameLowering();
+ const TargetRegisterInfo *TRI = TSI.getRegisterInfo();
+
+ for (const MachineFunction::VariableDbgInfo &VI :
+ MF.getInStackSlotVariableDbgInfo()) {
+ if (!VI.Var)
+ continue;
+ assert(VI.Var->isValidLocationForIntrinsic(VI.Loc) &&
+ "Expected inlined-at fields to agree");
+
+ Processed.insert(InlinedEntity(VI.Var, VI.Loc->getInlinedAt()));
+ LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc);
+
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ // If the variable has an attached offset expression, extract it.
+ // FIXME: Try to handle DW_OP_deref as well.
+ int64_t ExprOffset = 0;
+ bool Deref = false;
+ if (VI.Expr) {
+ // If there is one DW_OP_deref element, use offset of 0 and keep going.
+ if (VI.Expr->getNumElements() == 1 &&
+ VI.Expr->getElement(0) == llvm::dwarf::DW_OP_deref)
+ Deref = true;
+ else if (!VI.Expr->extractIfOffset(ExprOffset))
+ continue;
+ }
+
+ // Get the frame register used and the offset.
+ Register FrameReg;
+ StackOffset FrameOffset =
+ TFI->getFrameIndexReference(*Asm->MF, VI.getStackSlot(), FrameReg);
+ uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
+
+ assert(!FrameOffset.getScalable() &&
+ "Frame offsets with a scalable component are not supported");
+
+ // Calculate the label ranges.
+ LocalVarDef DefRange =
+ createDefRangeMem(CVReg, FrameOffset.getFixed() + ExprOffset);
+
+ LocalVariable Var;
+ Var.DIVar = VI.Var;
+
+ for (const InsnRange &Range : Scope->getRanges()) {
+ const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
+ const MCSymbol *End = getLabelAfterInsn(Range.second);
+ End = End ? End : Asm->getFunctionEnd();
+ Var.DefRanges[DefRange].emplace_back(Begin, End);
+ }
+
+ if (Deref)
+ Var.UseReferenceType = true;
+
+ recordLocalVariable(std::move(Var), Scope);
+ }
+}
+
+static bool canUseReferenceType(const DbgVariableLocation &Loc) {
+ return !Loc.LoadChain.empty() && Loc.LoadChain.back() == 0;
+}
+
+static bool needsReferenceType(const DbgVariableLocation &Loc) {
+ return Loc.LoadChain.size() == 2 && Loc.LoadChain.back() == 0;
+}
+
+void CodeViewDebug::calculateRanges(
+ LocalVariable &Var, const DbgValueHistoryMap::Entries &Entries) {
+ const TargetRegisterInfo *TRI = Asm->MF->getSubtarget().getRegisterInfo();
+
+ // Calculate the definition ranges.
+ for (auto I = Entries.begin(), E = Entries.end(); I != E; ++I) {
+ const auto &Entry = *I;
+ if (!Entry.isDbgValue())
+ continue;
+ const MachineInstr *DVInst = Entry.getInstr();
+ assert(DVInst->isDebugValue() && "Invalid History entry");
+ // FIXME: Find a way to represent constant variables, since they are
+ // relatively common.
+ std::optional<DbgVariableLocation> Location =
+ DbgVariableLocation::extractFromMachineInstruction(*DVInst);
+    if (!Location) {
+ // When we don't have a location this is usually because LLVM has
+ // transformed it into a constant and we only have an llvm.dbg.value. We
+ // can't represent these well in CodeView since S_LOCAL only works on
+ // registers and memory locations. Instead, we will pretend this to be a
+ // constant value to at least have it show up in the debugger.
+ auto Op = DVInst->getDebugOperand(0);
+ if (Op.isImm())
+ Var.ConstantValue = APSInt(APInt(64, Op.getImm()), false);
+ continue;
+ }
+
+    // CodeView can only express variables in registers and variables in
+    // memory at a constant offset from a register. However, for variables
+    // passed indirectly by pointer, it is common for that pointer to be
+    // spilled to a stack location. For the special case of a load at a
+    // nonzero offset followed by a zero-offset load (a pointer spilled to
+    // the stack), we change the local variable's type from a value type to
+    // a reference type. This tricks the debugger into doing the load for us.
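+    // For example, a LoadChain of {16, 0} (load the pointer at [reg+16], then
+    // load through it at offset 0) is instead described as a reference
+    // variable located at [reg+16].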
+ if (Var.UseReferenceType) {
+ // We're using a reference type. Drop the last zero offset load.
+ if (canUseReferenceType(*Location))
+ Location->LoadChain.pop_back();
+ else
+ continue;
+ } else if (needsReferenceType(*Location)) {
+ // This location can't be expressed without switching to a reference type.
+ // Start over using that.
+ Var.UseReferenceType = true;
+ Var.DefRanges.clear();
+ calculateRanges(Var, Entries);
+ return;
+ }
+
+    // We can only handle a register or a single offset load of a register.
+ if (Location->Register == 0 || Location->LoadChain.size() > 1)
+ continue;
+
+ LocalVarDef DR;
+ DR.CVRegister = TRI->getCodeViewRegNum(Location->Register);
+ DR.InMemory = !Location->LoadChain.empty();
+ DR.DataOffset =
+ !Location->LoadChain.empty() ? Location->LoadChain.back() : 0;
+ if (Location->FragmentInfo) {
+ DR.IsSubfield = true;
+ DR.StructOffset = Location->FragmentInfo->OffsetInBits / 8;
+ } else {
+ DR.IsSubfield = false;
+ DR.StructOffset = 0;
+ }
+
+ // Compute the label range.
+ const MCSymbol *Begin = getLabelBeforeInsn(Entry.getInstr());
+ const MCSymbol *End;
+ if (Entry.getEndIndex() != DbgValueHistoryMap::NoEntry) {
+ auto &EndingEntry = Entries[Entry.getEndIndex()];
+ End = EndingEntry.isDbgValue()
+ ? getLabelBeforeInsn(EndingEntry.getInstr())
+ : getLabelAfterInsn(EndingEntry.getInstr());
+ } else
+ End = Asm->getFunctionEnd();
+
+ // If the last range end is our begin, just extend the last range.
+ // Otherwise make a new range.
+ SmallVectorImpl<std::pair<const MCSymbol *, const MCSymbol *>> &R =
+ Var.DefRanges[DR];
+ if (!R.empty() && R.back().second == Begin)
+ R.back().second = End;
+ else
+ R.emplace_back(Begin, End);
+
+ // FIXME: Do more range combining.
+ }
+}
+
+void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
+ DenseSet<InlinedEntity> Processed;
+ // Grab the variable info that was squirreled away in the MMI side-table.
+ collectVariableInfoFromMFTable(Processed);
+
+ for (const auto &I : DbgValues) {
+ InlinedEntity IV = I.first;
+ if (Processed.count(IV))
+ continue;
+ const DILocalVariable *DIVar = cast<DILocalVariable>(IV.first);
+ const DILocation *InlinedAt = IV.second;
+
+ // Instruction ranges, specifying where IV is accessible.
+ const auto &Entries = I.second;
+
+ LexicalScope *Scope = nullptr;
+ if (InlinedAt)
+ Scope = LScopes.findInlinedScope(DIVar->getScope(), InlinedAt);
+ else
+ Scope = LScopes.findLexicalScope(DIVar->getScope());
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ LocalVariable Var;
+ Var.DIVar = DIVar;
+
+ calculateRanges(Var, Entries);
+ recordLocalVariable(std::move(Var), Scope);
+ }
+}
+
+void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
+ const TargetSubtargetInfo &TSI = MF->getSubtarget();
+ const TargetRegisterInfo *TRI = TSI.getRegisterInfo();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ const Function &GV = MF->getFunction();
+ auto Insertion = FnDebugInfo.insert({&GV, std::make_unique<FunctionInfo>()});
+ assert(Insertion.second && "function already has info");
+ CurFn = Insertion.first->second.get();
+ CurFn->FuncId = NextFuncId++;
+ CurFn->Begin = Asm->getFunctionBegin();
+
+ // The S_FRAMEPROC record reports the stack size, and how many bytes of
+ // callee-saved registers were used. For targets that don't use a PUSH
+ // instruction (AArch64), this will be zero.
+ CurFn->CSRSize = MFI.getCVBytesOfCalleeSavedRegisters();
+ CurFn->FrameSize = MFI.getStackSize();
+ CurFn->OffsetAdjustment = MFI.getOffsetAdjustment();
+ CurFn->HasStackRealignment = TRI->hasStackRealignment(*MF);
+
+ // For this function S_FRAMEPROC record, figure out which codeview register
+ // will be the frame pointer.
+ CurFn->EncodedParamFramePtrReg = EncodedFramePtrReg::None; // None.
+ CurFn->EncodedLocalFramePtrReg = EncodedFramePtrReg::None; // None.
+ if (CurFn->FrameSize > 0) {
+ if (!TSI.getFrameLowering()->hasFP(*MF)) {
+ CurFn->EncodedLocalFramePtrReg = EncodedFramePtrReg::StackPtr;
+ CurFn->EncodedParamFramePtrReg = EncodedFramePtrReg::StackPtr;
+ } else {
+ CurFn->HasFramePointer = true;
+ // If there is an FP, parameters are always relative to it.
+ CurFn->EncodedParamFramePtrReg = EncodedFramePtrReg::FramePtr;
+ if (CurFn->HasStackRealignment) {
+ // If the stack needs realignment, locals are relative to SP or VFRAME.
+ CurFn->EncodedLocalFramePtrReg = EncodedFramePtrReg::StackPtr;
+ } else {
+ // Otherwise, locals are relative to EBP, and we probably have VLAs or
+ // other stack adjustments.
+ CurFn->EncodedLocalFramePtrReg = EncodedFramePtrReg::FramePtr;
+ }
+ }
+ }
+
+ // Compute other frame procedure options.
+ FrameProcedureOptions FPO = FrameProcedureOptions::None;
+ if (MFI.hasVarSizedObjects())
+ FPO |= FrameProcedureOptions::HasAlloca;
+ if (MF->exposesReturnsTwice())
+ FPO |= FrameProcedureOptions::HasSetJmp;
+ // FIXME: Set HasLongJmp if we ever track that info.
+ if (MF->hasInlineAsm())
+ FPO |= FrameProcedureOptions::HasInlineAssembly;
+ if (GV.hasPersonalityFn()) {
+ if (isAsynchronousEHPersonality(
+ classifyEHPersonality(GV.getPersonalityFn())))
+ FPO |= FrameProcedureOptions::HasStructuredExceptionHandling;
+ else
+ FPO |= FrameProcedureOptions::HasExceptionHandling;
+ }
+ if (GV.hasFnAttribute(Attribute::InlineHint))
+ FPO |= FrameProcedureOptions::MarkedInline;
+ if (GV.hasFnAttribute(Attribute::Naked))
+ FPO |= FrameProcedureOptions::Naked;
+ if (MFI.hasStackProtectorIndex()) {
+ FPO |= FrameProcedureOptions::SecurityChecks;
+ if (GV.hasFnAttribute(Attribute::StackProtectStrong) ||
+ GV.hasFnAttribute(Attribute::StackProtectReq)) {
+ FPO |= FrameProcedureOptions::StrictSecurityChecks;
+ }
+ } else if (!GV.hasStackProtectorFnAttr()) {
+ // __declspec(safebuffers) disables stack guards.
+ FPO |= FrameProcedureOptions::SafeBuffers;
+ }
+ FPO |= FrameProcedureOptions(uint32_t(CurFn->EncodedLocalFramePtrReg) << 14U);
+ FPO |= FrameProcedureOptions(uint32_t(CurFn->EncodedParamFramePtrReg) << 16U);
+ if (Asm->TM.getOptLevel() != CodeGenOpt::None &&
+ !GV.hasOptSize() && !GV.hasOptNone())
+ FPO |= FrameProcedureOptions::OptimizedForSpeed;
+ if (GV.hasProfileData()) {
+ FPO |= FrameProcedureOptions::ValidProfileCounts;
+ FPO |= FrameProcedureOptions::ProfileGuidedOptimization;
+ }
+ // FIXME: Set GuardCfg when it is implemented.
+ CurFn->FrameProcOpts = FPO;
+
+ OS.emitCVFuncIdDirective(CurFn->FuncId);
+
+  // Find the end of the function prologue. The first known non-DBG_VALUE,
+  // non-frame-setup location marks the beginning of the function body.
+  // FIXME: is there a simpler way to do this? Can we just search
+  // for the first instruction of the function, not the last of the prologue?
+ DebugLoc PrologEndLoc;
+ bool EmptyPrologue = true;
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) &&
+ MI.getDebugLoc()) {
+ PrologEndLoc = MI.getDebugLoc();
+ break;
+ } else if (!MI.isMetaInstruction()) {
+ EmptyPrologue = false;
+ }
+ }
+ }
+
+ // Record beginning of function if we have a non-empty prologue.
+ if (PrologEndLoc && !EmptyPrologue) {
+ DebugLoc FnStartDL = PrologEndLoc.getFnDebugLoc();
+ maybeRecordLocation(FnStartDL, MF);
+ }
+
+ // Find heap alloc sites and emit labels around them.
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (MI.getHeapAllocMarker()) {
+ requestLabelBeforeInsn(&MI);
+ requestLabelAfterInsn(&MI);
+ }
+ }
+ }
+}
+
+static bool shouldEmitUdt(const DIType *T) {
+ if (!T)
+ return false;
+
+ // MSVC does not emit UDTs for typedefs that are scoped to classes.
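+  // For example, the nested typedef "T" in "struct S { typedef int T; };"
+  // gets no UDT record.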
+ if (T->getTag() == dwarf::DW_TAG_typedef) {
+ if (DIScope *Scope = T->getScope()) {
+ switch (Scope->getTag()) {
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_union_type:
+ return false;
+ default:
+        // Do nothing.
+        break;
+ }
+ }
+ }
+
+ while (true) {
+ if (!T || T->isForwardDecl())
+ return false;
+
+ const DIDerivedType *DT = dyn_cast<DIDerivedType>(T);
+ if (!DT)
+ return true;
+ T = DT->getBaseType();
+ }
+ return true;
+}
+
+void CodeViewDebug::addToUDTs(const DIType *Ty) {
+ // Don't record empty UDTs.
+ if (Ty->getName().empty())
+ return;
+ if (!shouldEmitUdt(Ty))
+ return;
+
+ SmallVector<StringRef, 5> ParentScopeNames;
+ const DISubprogram *ClosestSubprogram =
+ collectParentScopeNames(Ty->getScope(), ParentScopeNames);
+
+ std::string FullyQualifiedName =
+ formatNestedName(ParentScopeNames, getPrettyScopeName(Ty));
+
+ if (ClosestSubprogram == nullptr) {
+ GlobalUDTs.emplace_back(std::move(FullyQualifiedName), Ty);
+ } else if (ClosestSubprogram == CurrentSubprogram) {
+ LocalUDTs.emplace_back(std::move(FullyQualifiedName), Ty);
+ }
+
+ // TODO: What if the ClosestSubprogram is neither null or the current
+ // subprogram? Currently, the UDT just gets dropped on the floor.
+ //
+ // The current behavior is not desirable. To get maximal fidelity, we would
+ // need to perform all type translation before beginning emission of .debug$S
+ // and then make LocalUDTs a member of FunctionInfo
+}
+
+TypeIndex CodeViewDebug::lowerType(const DIType *Ty, const DIType *ClassTy) {
+ // Generic dispatch for lowering an unknown type.
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_array_type:
+ return lowerTypeArray(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_typedef:
+ return lowerTypeAlias(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_base_type:
+ return lowerTypeBasic(cast<DIBasicType>(Ty));
+ case dwarf::DW_TAG_pointer_type:
+ if (cast<DIDerivedType>(Ty)->getName() == "__vtbl_ptr_type")
+ return lowerTypeVFTableShape(cast<DIDerivedType>(Ty));
+ [[fallthrough]];
+ case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_rvalue_reference_type:
+ return lowerTypePointer(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_ptr_to_member_type:
+ return lowerTypeMemberPointer(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_restrict_type:
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ // TODO: add support for DW_TAG_atomic_type here
+ return lowerTypeModifier(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_subroutine_type:
+ if (ClassTy) {
+ // The member function type of a member function pointer has no
+ // ThisAdjustment.
+ return lowerTypeMemberFunction(cast<DISubroutineType>(Ty), ClassTy,
+ /*ThisAdjustment=*/0,
+ /*IsStaticMethod=*/false);
+ }
+ return lowerTypeFunction(cast<DISubroutineType>(Ty));
+ case dwarf::DW_TAG_enumeration_type:
+ return lowerTypeEnum(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ return lowerTypeClass(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_union_type:
+ return lowerTypeUnion(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_string_type:
+ return lowerTypeString(cast<DIStringType>(Ty));
+ case dwarf::DW_TAG_unspecified_type:
+ if (Ty->getName() == "decltype(nullptr)")
+ return TypeIndex::NullptrT();
+ return TypeIndex::None();
+ default:
+ // Use the null type index.
+ return TypeIndex();
+ }
+}
+
+TypeIndex CodeViewDebug::lowerTypeAlias(const DIDerivedType *Ty) {
+ TypeIndex UnderlyingTypeIndex = getTypeIndex(Ty->getBaseType());
+ StringRef TypeName = Ty->getName();
+
+ addToUDTs(Ty);
+
+ if (UnderlyingTypeIndex == TypeIndex(SimpleTypeKind::Int32Long) &&
+ TypeName == "HRESULT")
+ return TypeIndex(SimpleTypeKind::HResult);
+ if (UnderlyingTypeIndex == TypeIndex(SimpleTypeKind::UInt16Short) &&
+ TypeName == "wchar_t")
+ return TypeIndex(SimpleTypeKind::WideCharacter);
+
+ return UnderlyingTypeIndex;
+}
+
+TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
+ const DIType *ElementType = Ty->getBaseType();
+ TypeIndex ElementTypeIndex = getTypeIndex(ElementType);
+ // IndexType is size_t, which depends on the bitness of the target.
+ TypeIndex IndexType = getPointerSizeInBytes() == 8
+ ? TypeIndex(SimpleTypeKind::UInt64Quad)
+ : TypeIndex(SimpleTypeKind::UInt32Long);
+
+ uint64_t ElementSize = getBaseTypeSize(ElementType) / 8;
+
+ // Add subranges to array type.
+ DINodeArray Elements = Ty->getElements();
+ for (int i = Elements.size() - 1; i >= 0; --i) {
+ const DINode *Element = Elements[i];
+ assert(Element->getTag() == dwarf::DW_TAG_subrange_type);
+
+ const DISubrange *Subrange = cast<DISubrange>(Element);
+ int64_t Count = -1;
+
+ // If Subrange has a Count field, use it.
+    // Otherwise, if it has an upperbound, use (upperbound - lowerbound + 1),
+ // where lowerbound is from the LowerBound field of the Subrange,
+ // or the language default lowerbound if that field is unspecified.
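+    // For example, a subrange with lowerbound 2 and upperbound 5 yields a
+    // count of 5 - 2 + 1 = 4.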
+ if (auto *CI = dyn_cast_if_present<ConstantInt *>(Subrange->getCount()))
+ Count = CI->getSExtValue();
+ else if (auto *UI = dyn_cast_if_present<ConstantInt *>(
+ Subrange->getUpperBound())) {
+ // Fortran uses 1 as the default lowerbound; other languages use 0.
+ int64_t Lowerbound = (moduleIsInFortran()) ? 1 : 0;
+ auto *LI = dyn_cast_if_present<ConstantInt *>(Subrange->getLowerBound());
+ Lowerbound = (LI) ? LI->getSExtValue() : Lowerbound;
+ Count = UI->getSExtValue() - Lowerbound + 1;
+ }
+
+ // Forward declarations of arrays without a size and VLAs use a count of -1.
+ // Emit a count of zero in these cases to match what MSVC does for arrays
+ // without a size. MSVC doesn't support VLAs, so it's not clear what we
+ // should do for them even if we could distinguish them.
+ if (Count == -1)
+ Count = 0;
+
+ // Update the element size and element type index for subsequent subranges.
+ ElementSize *= Count;
+
+ // If this is the outermost array, use the size from the array. It will be
+ // more accurate if we had a VLA or an incomplete element type size.
+ uint64_t ArraySize =
+ (i == 0 && ElementSize == 0) ? Ty->getSizeInBits() / 8 : ElementSize;
+
+ StringRef Name = (i == 0) ? Ty->getName() : "";
+ ArrayRecord AR(ElementTypeIndex, IndexType, ArraySize, Name);
+ ElementTypeIndex = TypeTable.writeLeafType(AR);
+ }
+
+ return ElementTypeIndex;
+}
+
+// This function lowers a Fortran character type (DIStringType).
+// Note that it handles only the character*n variant (using SizeInBits
+// field in DIString to describe the type size) at the moment.
+// Other variants (leveraging the StringLength and StringLengthExp
+// fields in DIStringType) remain TBD.
+TypeIndex CodeViewDebug::lowerTypeString(const DIStringType *Ty) {
+ TypeIndex CharType = TypeIndex(SimpleTypeKind::NarrowCharacter);
+ uint64_t ArraySize = Ty->getSizeInBits() >> 3;
+ StringRef Name = Ty->getName();
+ // IndexType is size_t, which depends on the bitness of the target.
+ TypeIndex IndexType = getPointerSizeInBytes() == 8
+ ? TypeIndex(SimpleTypeKind::UInt64Quad)
+ : TypeIndex(SimpleTypeKind::UInt32Long);
+
+ // Create a type of character array of ArraySize.
+ ArrayRecord AR(CharType, IndexType, ArraySize, Name);
+
+ return TypeTable.writeLeafType(AR);
+}
+
+TypeIndex CodeViewDebug::lowerTypeBasic(const DIBasicType *Ty) {
+ TypeIndex Index;
+ dwarf::TypeKind Kind;
+ uint32_t ByteSize;
+
+ Kind = static_cast<dwarf::TypeKind>(Ty->getEncoding());
+ ByteSize = Ty->getSizeInBits() / 8;
+
+ SimpleTypeKind STK = SimpleTypeKind::None;
+ switch (Kind) {
+ case dwarf::DW_ATE_address:
+ // FIXME: Translate
+ break;
+ case dwarf::DW_ATE_boolean:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::Boolean8; break;
+ case 2: STK = SimpleTypeKind::Boolean16; break;
+ case 4: STK = SimpleTypeKind::Boolean32; break;
+ case 8: STK = SimpleTypeKind::Boolean64; break;
+ case 16: STK = SimpleTypeKind::Boolean128; break;
+ }
+ break;
+ case dwarf::DW_ATE_complex_float:
+ // The CodeView size for a complex represents the size of
+ // an individual component.
+ switch (ByteSize) {
+ case 4: STK = SimpleTypeKind::Complex16; break;
+ case 8: STK = SimpleTypeKind::Complex32; break;
+ case 16: STK = SimpleTypeKind::Complex64; break;
+ case 20: STK = SimpleTypeKind::Complex80; break;
+ case 32: STK = SimpleTypeKind::Complex128; break;
+ }
+ break;
+ case dwarf::DW_ATE_float:
+ switch (ByteSize) {
+ case 2: STK = SimpleTypeKind::Float16; break;
+ case 4: STK = SimpleTypeKind::Float32; break;
+ case 6: STK = SimpleTypeKind::Float48; break;
+ case 8: STK = SimpleTypeKind::Float64; break;
+ case 10: STK = SimpleTypeKind::Float80; break;
+ case 16: STK = SimpleTypeKind::Float128; break;
+ }
+ break;
+ case dwarf::DW_ATE_signed:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::SignedCharacter; break;
+ case 2: STK = SimpleTypeKind::Int16Short; break;
+ case 4: STK = SimpleTypeKind::Int32; break;
+ case 8: STK = SimpleTypeKind::Int64Quad; break;
+ case 16: STK = SimpleTypeKind::Int128Oct; break;
+ }
+ break;
+ case dwarf::DW_ATE_unsigned:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::UnsignedCharacter; break;
+ case 2: STK = SimpleTypeKind::UInt16Short; break;
+ case 4: STK = SimpleTypeKind::UInt32; break;
+ case 8: STK = SimpleTypeKind::UInt64Quad; break;
+ case 16: STK = SimpleTypeKind::UInt128Oct; break;
+ }
+ break;
+ case dwarf::DW_ATE_UTF:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::Character8; break;
+ case 2: STK = SimpleTypeKind::Character16; break;
+ case 4: STK = SimpleTypeKind::Character32; break;
+ }
+ break;
+ case dwarf::DW_ATE_signed_char:
+ if (ByteSize == 1)
+ STK = SimpleTypeKind::SignedCharacter;
+ break;
+ case dwarf::DW_ATE_unsigned_char:
+ if (ByteSize == 1)
+ STK = SimpleTypeKind::UnsignedCharacter;
+ break;
+ default:
+ break;
+ }
+
+ // Apply some fixups based on the source-level type name.
+ // Include some amount of canonicalization from an old naming scheme Clang
+ // used to use for integer types (in an outdated effort to be compatible with
+ // GCC's debug info/GDB's behavior, which has since been addressed).
+ if (STK == SimpleTypeKind::Int32 &&
+ (Ty->getName() == "long int" || Ty->getName() == "long"))
+ STK = SimpleTypeKind::Int32Long;
+ if (STK == SimpleTypeKind::UInt32 && (Ty->getName() == "long unsigned int" ||
+ Ty->getName() == "unsigned long"))
+ STK = SimpleTypeKind::UInt32Long;
+ if (STK == SimpleTypeKind::UInt16Short &&
+ (Ty->getName() == "wchar_t" || Ty->getName() == "__wchar_t"))
+ STK = SimpleTypeKind::WideCharacter;
+ if ((STK == SimpleTypeKind::SignedCharacter ||
+ STK == SimpleTypeKind::UnsignedCharacter) &&
+ Ty->getName() == "char")
+ STK = SimpleTypeKind::NarrowCharacter;
+
+ return TypeIndex(STK);
+}
+
+TypeIndex CodeViewDebug::lowerTypePointer(const DIDerivedType *Ty,
+ PointerOptions PO) {
+ TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType());
+
+ // Pointers to simple types without any options can use SimpleTypeMode, rather
+ // than having a dedicated pointer type record.
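+  // For example, a plain 64-bit "int *" is encoded as the Int32 simple kind
+  // with the NearPointer64 mode instead of a separate LF_POINTER record.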
+ if (PointeeTI.isSimple() && PO == PointerOptions::None &&
+ PointeeTI.getSimpleMode() == SimpleTypeMode::Direct &&
+ Ty->getTag() == dwarf::DW_TAG_pointer_type) {
+ SimpleTypeMode Mode = Ty->getSizeInBits() == 64
+ ? SimpleTypeMode::NearPointer64
+ : SimpleTypeMode::NearPointer32;
+ return TypeIndex(PointeeTI.getSimpleKind(), Mode);
+ }
+
+ PointerKind PK =
+ Ty->getSizeInBits() == 64 ? PointerKind::Near64 : PointerKind::Near32;
+ PointerMode PM = PointerMode::Pointer;
+ switch (Ty->getTag()) {
+ default: llvm_unreachable("not a pointer tag type");
+ case dwarf::DW_TAG_pointer_type:
+ PM = PointerMode::Pointer;
+ break;
+ case dwarf::DW_TAG_reference_type:
+ PM = PointerMode::LValueReference;
+ break;
+ case dwarf::DW_TAG_rvalue_reference_type:
+ PM = PointerMode::RValueReference;
+ break;
+ }
+
+ if (Ty->isObjectPointer())
+ PO |= PointerOptions::Const;
+
+ PointerRecord PR(PointeeTI, PK, PM, PO, Ty->getSizeInBits() / 8);
+ return TypeTable.writeLeafType(PR);
+}
+
+static PointerToMemberRepresentation
+translatePtrToMemberRep(unsigned SizeInBytes, bool IsPMF, unsigned Flags) {
+ // SizeInBytes being zero generally implies that the member pointer type was
+ // incomplete, which can happen if it is part of a function prototype. In this
+ // case, use the unknown model instead of the general model.
+ if (IsPMF) {
+ switch (Flags & DINode::FlagPtrToMemberRep) {
+ case 0:
+ return SizeInBytes == 0 ? PointerToMemberRepresentation::Unknown
+ : PointerToMemberRepresentation::GeneralFunction;
+ case DINode::FlagSingleInheritance:
+ return PointerToMemberRepresentation::SingleInheritanceFunction;
+ case DINode::FlagMultipleInheritance:
+ return PointerToMemberRepresentation::MultipleInheritanceFunction;
+ case DINode::FlagVirtualInheritance:
+ return PointerToMemberRepresentation::VirtualInheritanceFunction;
+ }
+ } else {
+ switch (Flags & DINode::FlagPtrToMemberRep) {
+ case 0:
+ return SizeInBytes == 0 ? PointerToMemberRepresentation::Unknown
+ : PointerToMemberRepresentation::GeneralData;
+ case DINode::FlagSingleInheritance:
+ return PointerToMemberRepresentation::SingleInheritanceData;
+ case DINode::FlagMultipleInheritance:
+ return PointerToMemberRepresentation::MultipleInheritanceData;
+ case DINode::FlagVirtualInheritance:
+ return PointerToMemberRepresentation::VirtualInheritanceData;
+ }
+ }
+ llvm_unreachable("invalid ptr to member representation");
+}
+
+TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty,
+ PointerOptions PO) {
+ assert(Ty->getTag() == dwarf::DW_TAG_ptr_to_member_type);
+ bool IsPMF = isa<DISubroutineType>(Ty->getBaseType());
+ TypeIndex ClassTI = getTypeIndex(Ty->getClassType());
+ TypeIndex PointeeTI =
+ getTypeIndex(Ty->getBaseType(), IsPMF ? Ty->getClassType() : nullptr);
+ PointerKind PK = getPointerSizeInBytes() == 8 ? PointerKind::Near64
+ : PointerKind::Near32;
+ PointerMode PM = IsPMF ? PointerMode::PointerToMemberFunction
+ : PointerMode::PointerToDataMember;
+
+ assert(Ty->getSizeInBits() / 8 <= 0xff && "pointer size too big");
+ uint8_t SizeInBytes = Ty->getSizeInBits() / 8;
+ MemberPointerInfo MPI(
+ ClassTI, translatePtrToMemberRep(SizeInBytes, IsPMF, Ty->getFlags()));
+ PointerRecord PR(PointeeTI, PK, PM, PO, SizeInBytes, MPI);
+ return TypeTable.writeLeafType(PR);
+}
+
+/// Given a DWARF calling convention, get the CodeView equivalent. If we don't
+/// have a translation, use the NearC convention.
+static CallingConvention dwarfCCToCodeView(unsigned DwarfCC) {
+ switch (DwarfCC) {
+ case dwarf::DW_CC_normal: return CallingConvention::NearC;
+ case dwarf::DW_CC_BORLAND_msfastcall: return CallingConvention::NearFast;
+ case dwarf::DW_CC_BORLAND_thiscall: return CallingConvention::ThisCall;
+ case dwarf::DW_CC_BORLAND_stdcall: return CallingConvention::NearStdCall;
+ case dwarf::DW_CC_BORLAND_pascal: return CallingConvention::NearPascal;
+ case dwarf::DW_CC_LLVM_vectorcall: return CallingConvention::NearVector;
+ }
+ return CallingConvention::NearC;
+}
+
+TypeIndex CodeViewDebug::lowerTypeModifier(const DIDerivedType *Ty) {
+ ModifierOptions Mods = ModifierOptions::None;
+ PointerOptions PO = PointerOptions::None;
+ bool IsModifier = true;
+ const DIType *BaseTy = Ty;
+ while (IsModifier && BaseTy) {
+ // FIXME: Need to add DWARF tags for __unaligned and _Atomic
+ switch (BaseTy->getTag()) {
+ case dwarf::DW_TAG_const_type:
+ Mods |= ModifierOptions::Const;
+ PO |= PointerOptions::Const;
+ break;
+ case dwarf::DW_TAG_volatile_type:
+ Mods |= ModifierOptions::Volatile;
+ PO |= PointerOptions::Volatile;
+ break;
+ case dwarf::DW_TAG_restrict_type:
+ // Only pointer types can be marked with __restrict. There is no known flag
+ // for __restrict in LF_MODIFIER records.
+ PO |= PointerOptions::Restrict;
+ break;
+ default:
+ IsModifier = false;
+ break;
+ }
+ if (IsModifier)
+ BaseTy = cast<DIDerivedType>(BaseTy)->getBaseType();
+ }
+
+ // Check if the inner type will use an LF_POINTER record. If so, the
+ // qualifiers will go in the LF_POINTER record. This comes up for types like
+ // 'int *const' and 'int *__restrict', not the more common cases like 'const
+ // char *'.
+ if (BaseTy) {
+ switch (BaseTy->getTag()) {
+ case dwarf::DW_TAG_pointer_type:
+ case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_rvalue_reference_type:
+ return lowerTypePointer(cast<DIDerivedType>(BaseTy), PO);
+ case dwarf::DW_TAG_ptr_to_member_type:
+ return lowerTypeMemberPointer(cast<DIDerivedType>(BaseTy), PO);
+ default:
+ break;
+ }
+ }
+
+ TypeIndex ModifiedTI = getTypeIndex(BaseTy);
+
+ // Return the base type index if there aren't any modifiers. For example, the
+ // metadata could contain restrict wrappers around non-pointer types.
+ if (Mods == ModifierOptions::None)
+ return ModifiedTI;
+
+ ModifierRecord MR(ModifiedTI, Mods);
+ return TypeTable.writeLeafType(MR);
+}
+
+TypeIndex CodeViewDebug::lowerTypeFunction(const DISubroutineType *Ty) {
+ SmallVector<TypeIndex, 8> ReturnAndArgTypeIndices;
+ for (const DIType *ArgType : Ty->getTypeArray())
+ ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgType));
+
+ // MSVC uses the None type index for the variadic argument.
+ if (ReturnAndArgTypeIndices.size() > 1 &&
+ ReturnAndArgTypeIndices.back() == TypeIndex::Void()) {
+ ReturnAndArgTypeIndices.back() = TypeIndex::None();
+ }
+ TypeIndex ReturnTypeIndex = TypeIndex::Void();
+ ArrayRef<TypeIndex> ArgTypeIndices = std::nullopt;
+ if (!ReturnAndArgTypeIndices.empty()) {
+ auto ReturnAndArgTypesRef = ArrayRef(ReturnAndArgTypeIndices);
+ ReturnTypeIndex = ReturnAndArgTypesRef.front();
+ ArgTypeIndices = ReturnAndArgTypesRef.drop_front();
+ }
+
+ ArgListRecord ArgListRec(TypeRecordKind::ArgList, ArgTypeIndices);
+ TypeIndex ArgListIndex = TypeTable.writeLeafType(ArgListRec);
+
+ CallingConvention CC = dwarfCCToCodeView(Ty->getCC());
+
+ FunctionOptions FO = getFunctionOptions(Ty);
+ ProcedureRecord Procedure(ReturnTypeIndex, CC, FO, ArgTypeIndices.size(),
+ ArgListIndex);
+ return TypeTable.writeLeafType(Procedure);
+}
+
+TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty,
+ const DIType *ClassTy,
+ int ThisAdjustment,
+ bool IsStaticMethod,
+ FunctionOptions FO) {
+ // Lower the containing class type.
+ TypeIndex ClassType = getTypeIndex(ClassTy);
+
+ DITypeRefArray ReturnAndArgs = Ty->getTypeArray();
+
+ unsigned Index = 0;
+ SmallVector<TypeIndex, 8> ArgTypeIndices;
+ TypeIndex ReturnTypeIndex = TypeIndex::Void();
+ if (ReturnAndArgs.size() > Index) {
+ ReturnTypeIndex = getTypeIndex(ReturnAndArgs[Index++]);
+ }
+
+ // If the first argument is a pointer type and this isn't a static method,
+ // treat it as the special 'this' parameter, which is encoded separately from
+ // the arguments.
+ TypeIndex ThisTypeIndex;
+ if (!IsStaticMethod && ReturnAndArgs.size() > Index) {
+ if (const DIDerivedType *PtrTy =
+ dyn_cast_or_null<DIDerivedType>(ReturnAndArgs[Index])) {
+ if (PtrTy->getTag() == dwarf::DW_TAG_pointer_type) {
+ ThisTypeIndex = getTypeIndexForThisPtr(PtrTy, Ty);
+ Index++;
+ }
+ }
+ }
+
+ while (Index < ReturnAndArgs.size())
+ ArgTypeIndices.push_back(getTypeIndex(ReturnAndArgs[Index++]));
+
+ // MSVC uses the None type index for the variadic argument.
+ if (!ArgTypeIndices.empty() && ArgTypeIndices.back() == TypeIndex::Void())
+ ArgTypeIndices.back() = TypeIndex::None();
+
+ ArgListRecord ArgListRec(TypeRecordKind::ArgList, ArgTypeIndices);
+ TypeIndex ArgListIndex = TypeTable.writeLeafType(ArgListRec);
+
+ CallingConvention CC = dwarfCCToCodeView(Ty->getCC());
+
+ MemberFunctionRecord MFR(ReturnTypeIndex, ClassType, ThisTypeIndex, CC, FO,
+ ArgTypeIndices.size(), ArgListIndex, ThisAdjustment);
+ return TypeTable.writeLeafType(MFR);
+}
+
+TypeIndex CodeViewDebug::lowerTypeVFTableShape(const DIDerivedType *Ty) {
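+ // The shape of a vftable is just its slot count: the vtable's size in bits
+ // divided by the code pointer size. Every slot is emitted as a near slot.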
+ unsigned VSlotCount =
+ Ty->getSizeInBits() / (8 * Asm->MAI->getCodePointerSize());
+ SmallVector<VFTableSlotKind, 4> Slots(VSlotCount, VFTableSlotKind::Near);
+
+ VFTableShapeRecord VFTSR(Slots);
+ return TypeTable.writeLeafType(VFTSR);
+}
+
+static MemberAccess translateAccessFlags(unsigned RecordTag, unsigned Flags) {
+ switch (Flags & DINode::FlagAccessibility) {
+ case DINode::FlagPrivate: return MemberAccess::Private;
+ case DINode::FlagPublic: return MemberAccess::Public;
+ case DINode::FlagProtected: return MemberAccess::Protected;
+ case 0:
+ // If there was no explicit access control, provide the default for the tag.
+ return RecordTag == dwarf::DW_TAG_class_type ? MemberAccess::Private
+ : MemberAccess::Public;
+ }
+ llvm_unreachable("access flags are exclusive");
+}
+
+static MethodOptions translateMethodOptionFlags(const DISubprogram *SP) {
+ if (SP->isArtificial())
+ return MethodOptions::CompilerGenerated;
+
+ // FIXME: Handle other MethodOptions.
+
+ return MethodOptions::None;
+}
+
+static MethodKind translateMethodKindFlags(const DISubprogram *SP,
+ bool Introduced) {
+ if (SP->getFlags() & DINode::FlagStaticMember)
+ return MethodKind::Static;
+
+ switch (SP->getVirtuality()) {
+ case dwarf::DW_VIRTUALITY_none:
+ break;
+ case dwarf::DW_VIRTUALITY_virtual:
+ return Introduced ? MethodKind::IntroducingVirtual : MethodKind::Virtual;
+ case dwarf::DW_VIRTUALITY_pure_virtual:
+ return Introduced ? MethodKind::PureIntroducingVirtual
+ : MethodKind::PureVirtual;
+ default:
+ llvm_unreachable("unhandled virtuality case");
+ }
+
+ return MethodKind::Vanilla;
+}
+
+static TypeRecordKind getRecordKind(const DICompositeType *Ty) {
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ return TypeRecordKind::Class;
+ case dwarf::DW_TAG_structure_type:
+ return TypeRecordKind::Struct;
+ default:
+ llvm_unreachable("unexpected tag");
+ }
+}
+
+/// Return ClassOptions that should be present on both the forward declaration
+ /// and the definition of a tag type.
+static ClassOptions getCommonClassOptions(const DICompositeType *Ty) {
+ ClassOptions CO = ClassOptions::None;
+
+ // MSVC always sets this flag, even for local types. Clang doesn't always
+ // appear to give every type a linkage name, which may be problematic for us.
+ // FIXME: Investigate the consequences of not following them here.
+ if (!Ty->getIdentifier().empty())
+ CO |= ClassOptions::HasUniqueName;
+
+ // Put the Nested flag on a type if it appears immediately inside a tag type.
+ // Do not walk the scope chain. Do not attempt to compute ContainsNestedClass
+ // here. That flag is only set on definitions, and not forward declarations.
+ const DIScope *ImmediateScope = Ty->getScope();
+ if (ImmediateScope && isa<DICompositeType>(ImmediateScope))
+ CO |= ClassOptions::Nested;
+
+ // Put the Scoped flag on function-local types. MSVC sets this flag on enum
+ // types only when they have an immediate function scope. Clang never puts enums
+ // inside DILexicalBlock scopes. Enum types, as generated by clang, are
+ // always in function, class, or file scopes.
+ if (Ty->getTag() == dwarf::DW_TAG_enumeration_type) {
+ if (ImmediateScope && isa<DISubprogram>(ImmediateScope))
+ CO |= ClassOptions::Scoped;
+ } else {
+ for (const DIScope *Scope = ImmediateScope; Scope != nullptr;
+ Scope = Scope->getScope()) {
+ if (isa<DISubprogram>(Scope)) {
+ CO |= ClassOptions::Scoped;
+ break;
+ }
+ }
+ }
+
+ return CO;
+}
+
+void CodeViewDebug::addUDTSrcLine(const DIType *Ty, TypeIndex TI) {
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_enumeration_type:
+ break;
+ default:
+ return;
+ }
+
+ if (const auto *File = Ty->getFile()) {
+ StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(File));
+ TypeIndex SIDI = TypeTable.writeLeafType(SIDR);
+
+ UdtSourceLineRecord USLR(TI, SIDI, Ty->getLine());
+ TypeTable.writeLeafType(USLR);
+ }
+}
+
+TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) {
+ ClassOptions CO = getCommonClassOptions(Ty);
+ TypeIndex FTI;
+ unsigned EnumeratorCount = 0;
+
+ if (Ty->isForwardDecl()) {
+ CO |= ClassOptions::ForwardReference;
+ } else {
+ ContinuationRecordBuilder ContinuationBuilder;
+ ContinuationBuilder.begin(ContinuationRecordKind::FieldList);
+ for (const DINode *Element : Ty->getElements()) {
+ // We assume that the frontend provides all members in source declaration
+ // order, which is what MSVC does.
+ if (auto *Enumerator = dyn_cast_or_null<DIEnumerator>(Element)) {
+ // FIXME: Is it correct to always emit these as unsigned here?
+ EnumeratorRecord ER(MemberAccess::Public,
+ APSInt(Enumerator->getValue(), true),
+ Enumerator->getName());
+ ContinuationBuilder.writeMemberType(ER);
+ EnumeratorCount++;
+ }
+ }
+ FTI = TypeTable.insertRecord(ContinuationBuilder);
+ }
+
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ EnumRecord ER(EnumeratorCount, CO, FTI, FullName, Ty->getIdentifier(),
+ getTypeIndex(Ty->getBaseType()));
+ TypeIndex EnumTI = TypeTable.writeLeafType(ER);
+
+ addUDTSrcLine(Ty, EnumTI);
+
+ return EnumTI;
+}
+
+//===----------------------------------------------------------------------===//
+// ClassInfo
+//===----------------------------------------------------------------------===//
+
+struct llvm::ClassInfo {
+ struct MemberInfo {
+ const DIDerivedType *MemberTypeNode;
+ uint64_t BaseOffset;
+ };
+ // [MemberInfo]
+ using MemberList = std::vector<MemberInfo>;
+
+ using MethodsList = TinyPtrVector<const DISubprogram *>;
+ // MethodName -> MethodsList
+ using MethodsMap = MapVector<MDString *, MethodsList>;
+
+ /// Base classes.
+ std::vector<const DIDerivedType *> Inheritance;
+
+ /// Direct members.
+ MemberList Members;
+ // Direct overloaded methods gathered by name.
+ MethodsMap Methods;
+
+ TypeIndex VShapeTI;
+
+ std::vector<const DIType *> NestedTypes;
+};
+
+void CodeViewDebug::clear() {
+ assert(CurFn == nullptr);
+ FileIdMap.clear();
+ FnDebugInfo.clear();
+ FileToFilepathMap.clear();
+ LocalUDTs.clear();
+ GlobalUDTs.clear();
+ TypeIndices.clear();
+ CompleteTypeIndices.clear();
+ ScopeGlobals.clear();
+ CVGlobalVariableOffsets.clear();
+}
+
+void CodeViewDebug::collectMemberInfo(ClassInfo &Info,
+ const DIDerivedType *DDTy) {
+ if (!DDTy->getName().empty()) {
+ Info.Members.push_back({DDTy, 0});
+
+ // Collect static const data members with values.
+ if ((DDTy->getFlags() & DINode::FlagStaticMember) ==
+ DINode::FlagStaticMember) {
+ if (DDTy->getConstant() && (isa<ConstantInt>(DDTy->getConstant()) ||
+ isa<ConstantFP>(DDTy->getConstant())))
+ StaticConstMembers.push_back(DDTy);
+ }
+
+ return;
+ }
+
+ // An unnamed member may represent a nested struct or union. Attempt to
+ // interpret the unnamed member as a DICompositeType possibly wrapped in
+ // qualifier types. Add all the indirect fields to the current record if that
+ // succeeds, and drop the member if that fails.
+ assert((DDTy->getOffsetInBits() % 8) == 0 && "Unnamed bitfield member!");
+ uint64_t Offset = DDTy->getOffsetInBits();
+ const DIType *Ty = DDTy->getBaseType();
+ bool FullyResolved = false;
+ while (!FullyResolved) {
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ // FIXME: we should apply the qualifier types to the indirect fields
+ // rather than dropping them.
+ Ty = cast<DIDerivedType>(Ty)->getBaseType();
+ break;
+ default:
+ FullyResolved = true;
+ break;
+ }
+ }
+
+ const DICompositeType *DCTy = dyn_cast<DICompositeType>(Ty);
+ if (!DCTy)
+ return;
+
+ ClassInfo NestedInfo = collectClassInfo(DCTy);
+ for (const ClassInfo::MemberInfo &IndirectField : NestedInfo.Members)
+ Info.Members.push_back(
+ {IndirectField.MemberTypeNode, IndirectField.BaseOffset + Offset});
+}
+
+ClassInfo CodeViewDebug::collectClassInfo(const DICompositeType *Ty) {
+ ClassInfo Info;
+ // Add elements to structure type.
+ DINodeArray Elements = Ty->getElements();
+ for (auto *Element : Elements) {
+ // We assume that the frontend provides all members in source declaration
+ // order, which is what MSVC does.
+ if (!Element)
+ continue;
+ if (auto *SP = dyn_cast<DISubprogram>(Element)) {
+ Info.Methods[SP->getRawName()].push_back(SP);
+ } else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
+ if (DDTy->getTag() == dwarf::DW_TAG_member) {
+ collectMemberInfo(Info, DDTy);
+ } else if (DDTy->getTag() == dwarf::DW_TAG_inheritance) {
+ Info.Inheritance.push_back(DDTy);
+ } else if (DDTy->getTag() == dwarf::DW_TAG_pointer_type &&
+ DDTy->getName() == "__vtbl_ptr_type") {
+ Info.VShapeTI = getTypeIndex(DDTy);
+ } else if (DDTy->getTag() == dwarf::DW_TAG_typedef) {
+ Info.NestedTypes.push_back(DDTy);
+ } else if (DDTy->getTag() == dwarf::DW_TAG_friend) {
+ // Ignore friend members. It appears that MSVC emitted info about
+ // friends in the past, but modern versions do not.
+ }
+ } else if (auto *Composite = dyn_cast<DICompositeType>(Element)) {
+ Info.NestedTypes.push_back(Composite);
+ }
+ // Skip other unrecognized kinds of elements.
+ }
+ return Info;
+}
+
+static bool shouldAlwaysEmitCompleteClassType(const DICompositeType *Ty) {
+ // This routine is used by lowerTypeClass and lowerTypeUnion to determine
+ // if a complete type should be emitted instead of a forward reference.
+ return Ty->getName().empty() && Ty->getIdentifier().empty() &&
+ !Ty->isForwardDecl();
+}
+
+TypeIndex CodeViewDebug::lowerTypeClass(const DICompositeType *Ty) {
+ // Emit the complete type for unnamed structs. C++ classes with methods
+ // which have a circular reference back to the class type are expected to
+ // be named by the front-end and should not be "unnamed". C unnamed
+ // structs should not have circular references.
+ if (shouldAlwaysEmitCompleteClassType(Ty)) {
+ // If this unnamed complete type is already in the process of being defined,
+ // then the description of the type is malformed and cannot be emitted
+ // into CodeView correctly, so report a fatal error.
+ auto I = CompleteTypeIndices.find(Ty);
+ if (I != CompleteTypeIndices.end() && I->second == TypeIndex())
+ report_fatal_error("cannot debug circular reference to unnamed type");
+ return getCompleteTypeIndex(Ty);
+ }
+
+ // First, construct the forward decl. Don't look into Ty to compute the
+ // forward decl options, since it might not be available in all TUs.
+ TypeRecordKind Kind = getRecordKind(Ty);
+ ClassOptions CO =
+ ClassOptions::ForwardReference | getCommonClassOptions(Ty);
+ std::string FullName = getFullyQualifiedName(Ty);
+ ClassRecord CR(Kind, 0, CO, TypeIndex(), TypeIndex(), TypeIndex(), 0,
+ FullName, Ty->getIdentifier());
+ TypeIndex FwdDeclTI = TypeTable.writeLeafType(CR);
+ if (!Ty->isForwardDecl())
+ DeferredCompleteTypes.push_back(Ty);
+ return FwdDeclTI;
+}
+
+TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) {
+ // Construct the field list and complete type record.
+ TypeRecordKind Kind = getRecordKind(Ty);
+ ClassOptions CO = getCommonClassOptions(Ty);
+ TypeIndex FieldTI;
+ TypeIndex VShapeTI;
+ unsigned FieldCount;
+ bool ContainsNestedClass;
+ std::tie(FieldTI, VShapeTI, FieldCount, ContainsNestedClass) =
+ lowerRecordFieldList(Ty);
+
+ if (ContainsNestedClass)
+ CO |= ClassOptions::ContainsNestedClass;
+
+ // MSVC appears to set this flag by searching for any destructor or method
+ // with FunctionOptions::Constructor among the emitted members. The Clang AST
+ // has all the members, but special member functions are not yet emitted into
+ // debug information. For now, checking a class's non-triviality seems enough.
+ // FIXME: not true for a nested unnamed struct.
+ if (isNonTrivial(Ty))
+ CO |= ClassOptions::HasConstructorOrDestructor;
+
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ uint64_t SizeInBytes = Ty->getSizeInBits() / 8;
+
+ ClassRecord CR(Kind, FieldCount, CO, FieldTI, TypeIndex(), VShapeTI,
+ SizeInBytes, FullName, Ty->getIdentifier());
+ TypeIndex ClassTI = TypeTable.writeLeafType(CR);
+
+ addUDTSrcLine(Ty, ClassTI);
+
+ addToUDTs(Ty);
+
+ return ClassTI;
+}
+
+TypeIndex CodeViewDebug::lowerTypeUnion(const DICompositeType *Ty) {
+ // Emit the complete type for unnamed unions.
+ if (shouldAlwaysEmitCompleteClassType(Ty))
+ return getCompleteTypeIndex(Ty);
+
+ ClassOptions CO =
+ ClassOptions::ForwardReference | getCommonClassOptions(Ty);
+ std::string FullName = getFullyQualifiedName(Ty);
+ UnionRecord UR(0, CO, TypeIndex(), 0, FullName, Ty->getIdentifier());
+ TypeIndex FwdDeclTI = TypeTable.writeLeafType(UR);
+ if (!Ty->isForwardDecl())
+ DeferredCompleteTypes.push_back(Ty);
+ return FwdDeclTI;
+}
+
+TypeIndex CodeViewDebug::lowerCompleteTypeUnion(const DICompositeType *Ty) {
+ ClassOptions CO = ClassOptions::Sealed | getCommonClassOptions(Ty);
+ TypeIndex FieldTI;
+ unsigned FieldCount;
+ bool ContainsNestedClass;
+ std::tie(FieldTI, std::ignore, FieldCount, ContainsNestedClass) =
+ lowerRecordFieldList(Ty);
+
+ if (ContainsNestedClass)
+ CO |= ClassOptions::ContainsNestedClass;
+
+ uint64_t SizeInBytes = Ty->getSizeInBits() / 8;
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ UnionRecord UR(FieldCount, CO, FieldTI, SizeInBytes, FullName,
+ Ty->getIdentifier());
+ TypeIndex UnionTI = TypeTable.writeLeafType(UR);
+
+ addUDTSrcLine(Ty, UnionTI);
+
+ addToUDTs(Ty);
+
+ return UnionTI;
+}
+
+std::tuple<TypeIndex, TypeIndex, unsigned, bool>
+CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) {
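+ // The result is (field list index, vftable shape index, member count,
+ // whether the record contains nested types).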
+ // Manually count members. MSVC appears to count everything that generates a
+ // field list record. Each individual overload in a method overload group
+ // contributes to this count, even though the overload group is a single field
+ // list record.
+ unsigned MemberCount = 0;
+ ClassInfo Info = collectClassInfo(Ty);
+ ContinuationRecordBuilder ContinuationBuilder;
+ ContinuationBuilder.begin(ContinuationRecordKind::FieldList);
+
+ // Create base classes.
+ for (const DIDerivedType *I : Info.Inheritance) {
+ if (I->getFlags() & DINode::FlagVirtual) {
+ // Virtual base.
+ unsigned VBPtrOffset = I->getVBPtrOffset();
+ // FIXME: Despite the accessor name, the offset is really in bytes.
+ unsigned VBTableIndex = I->getOffsetInBits() / 4;
+ auto RecordKind = (I->getFlags() & DINode::FlagIndirectVirtualBase) == DINode::FlagIndirectVirtualBase
+ ? TypeRecordKind::IndirectVirtualBaseClass
+ : TypeRecordKind::VirtualBaseClass;
+ VirtualBaseClassRecord VBCR(
+ RecordKind, translateAccessFlags(Ty->getTag(), I->getFlags()),
+ getTypeIndex(I->getBaseType()), getVBPTypeIndex(), VBPtrOffset,
+ VBTableIndex);
+
+ ContinuationBuilder.writeMemberType(VBCR);
+ MemberCount++;
+ } else {
+ assert(I->getOffsetInBits() % 8 == 0 &&
+ "bases must be on byte boundaries");
+ BaseClassRecord BCR(translateAccessFlags(Ty->getTag(), I->getFlags()),
+ getTypeIndex(I->getBaseType()),
+ I->getOffsetInBits() / 8);
+ ContinuationBuilder.writeMemberType(BCR);
+ MemberCount++;
+ }
+ }
+
+ // Create members.
+ for (ClassInfo::MemberInfo &MemberInfo : Info.Members) {
+ const DIDerivedType *Member = MemberInfo.MemberTypeNode;
+ TypeIndex MemberBaseType = getTypeIndex(Member->getBaseType());
+ StringRef MemberName = Member->getName();
+ MemberAccess Access =
+ translateAccessFlags(Ty->getTag(), Member->getFlags());
+
+ if (Member->isStaticMember()) {
+ StaticDataMemberRecord SDMR(Access, MemberBaseType, MemberName);
+ ContinuationBuilder.writeMemberType(SDMR);
+ MemberCount++;
+ continue;
+ }
+
+ // Virtual function pointer member.
+ if ((Member->getFlags() & DINode::FlagArtificial) &&
+ Member->getName().startswith("_vptr$")) {
+ VFPtrRecord VFPR(getTypeIndex(Member->getBaseType()));
+ ContinuationBuilder.writeMemberType(VFPR);
+ MemberCount++;
+ continue;
+ }
+
+ // Data member.
+ uint64_t MemberOffsetInBits =
+ Member->getOffsetInBits() + MemberInfo.BaseOffset;
+ if (Member->isBitField()) {
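+ // For bit-fields, the member offset is the bit offset of the field itself,
+ // while the storage offset (when present) is the bit offset of the storage
+ // unit holding it. Their difference is the start bit within that unit, which
+ // goes into the LF_BITFIELD record; the data member offset below then points
+ // at the storage unit.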
+ uint64_t StartBitOffset = MemberOffsetInBits;
+ if (const auto *CI =
+ dyn_cast_or_null<ConstantInt>(Member->getStorageOffsetInBits())) {
+ MemberOffsetInBits = CI->getZExtValue() + MemberInfo.BaseOffset;
+ }
+ StartBitOffset -= MemberOffsetInBits;
+ BitFieldRecord BFR(MemberBaseType, Member->getSizeInBits(),
+ StartBitOffset);
+ MemberBaseType = TypeTable.writeLeafType(BFR);
+ }
+ uint64_t MemberOffsetInBytes = MemberOffsetInBits / 8;
+ DataMemberRecord DMR(Access, MemberBaseType, MemberOffsetInBytes,
+ MemberName);
+ ContinuationBuilder.writeMemberType(DMR);
+ MemberCount++;
+ }
+
+ // Create methods
+ for (auto &MethodItr : Info.Methods) {
+ StringRef Name = MethodItr.first->getString();
+
+ std::vector<OneMethodRecord> Methods;
+ for (const DISubprogram *SP : MethodItr.second) {
+ TypeIndex MethodType = getMemberFunctionType(SP, Ty);
+ bool Introduced = SP->getFlags() & DINode::FlagIntroducedVirtual;
+
+ unsigned VFTableOffset = -1;
+ if (Introduced)
+ VFTableOffset = SP->getVirtualIndex() * getPointerSizeInBytes();
+
+ Methods.push_back(OneMethodRecord(
+ MethodType, translateAccessFlags(Ty->getTag(), SP->getFlags()),
+ translateMethodKindFlags(SP, Introduced),
+ translateMethodOptionFlags(SP), VFTableOffset, Name));
+ MemberCount++;
+ }
+ assert(!Methods.empty() && "Empty methods map entry");
+ if (Methods.size() == 1)
+ ContinuationBuilder.writeMemberType(Methods[0]);
+ else {
+ // FIXME: Make this use its own ContinuationBuilder so that
+ // MethodOverloadList can be split correctly.
+ MethodOverloadListRecord MOLR(Methods);
+ TypeIndex MethodList = TypeTable.writeLeafType(MOLR);
+
+ OverloadedMethodRecord OMR(Methods.size(), MethodList, Name);
+ ContinuationBuilder.writeMemberType(OMR);
+ }
+ }
+
+ // Create nested classes.
+ for (const DIType *Nested : Info.NestedTypes) {
+ NestedTypeRecord R(getTypeIndex(Nested), Nested->getName());
+ ContinuationBuilder.writeMemberType(R);
+ MemberCount++;
+ }
+
+ TypeIndex FieldTI = TypeTable.insertRecord(ContinuationBuilder);
+ return std::make_tuple(FieldTI, Info.VShapeTI, MemberCount,
+ !Info.NestedTypes.empty());
+}
+
+TypeIndex CodeViewDebug::getVBPTypeIndex() {
+ if (!VBPType.getIndex()) {
+ // Make a 'const int *' type.
+ ModifierRecord MR(TypeIndex::Int32(), ModifierOptions::Const);
+ TypeIndex ModifiedTI = TypeTable.writeLeafType(MR);
+
+ PointerKind PK = getPointerSizeInBytes() == 8 ? PointerKind::Near64
+ : PointerKind::Near32;
+ PointerMode PM = PointerMode::Pointer;
+ PointerOptions PO = PointerOptions::None;
+ PointerRecord PR(ModifiedTI, PK, PM, PO, getPointerSizeInBytes());
+ VBPType = TypeTable.writeLeafType(PR);
+ }
+
+ return VBPType;
+}
+
+TypeIndex CodeViewDebug::getTypeIndex(const DIType *Ty, const DIType *ClassTy) {
+ // The null DIType is the void type. Don't try to hash it.
+ if (!Ty)
+ return TypeIndex::Void();
+
+ // Check if we've already translated this type. Don't try to do a
+ // get-or-create style insertion that caches the hash lookup across the
+ // lowerType call. It will update the TypeIndices map.
+ auto I = TypeIndices.find({Ty, ClassTy});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ TypeLoweringScope S(*this);
+ TypeIndex TI = lowerType(Ty, ClassTy);
+ return recordTypeIndexForDINode(Ty, TI, ClassTy);
+}
+
+codeview::TypeIndex
+CodeViewDebug::getTypeIndexForThisPtr(const DIDerivedType *PtrTy,
+ const DISubroutineType *SubroutineTy) {
+ assert(PtrTy->getTag() == dwarf::DW_TAG_pointer_type &&
+ "this type must be a pointer type");
+
+ PointerOptions Options = PointerOptions::None;
+ if (SubroutineTy->getFlags() & DINode::DIFlags::FlagLValueReference)
+ Options = PointerOptions::LValueRefThisPointer;
+ else if (SubroutineTy->getFlags() & DINode::DIFlags::FlagRValueReference)
+ Options = PointerOptions::RValueRefThisPointer;
+
+ // Check if we've already translated this type. If there is no ref qualifier
+ // on the function then we look up this pointer type with no associated class
+ // so that the TypeIndex for the this pointer can be shared with the type
+ // index for other pointers to this class type. If there is a ref qualifier
+ // then we lookup the pointer using the subroutine as the parent type.
+ auto I = TypeIndices.find({PtrTy, SubroutineTy});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ TypeLoweringScope S(*this);
+ TypeIndex TI = lowerTypePointer(PtrTy, Options);
+ return recordTypeIndexForDINode(PtrTy, TI, SubroutineTy);
+}
+
+TypeIndex CodeViewDebug::getTypeIndexForReferenceTo(const DIType *Ty) {
+ PointerRecord PR(getTypeIndex(Ty),
+ getPointerSizeInBytes() == 8 ? PointerKind::Near64
+ : PointerKind::Near32,
+ PointerMode::LValueReference, PointerOptions::None,
+ Ty->getSizeInBits() / 8);
+ return TypeTable.writeLeafType(PR);
+}
+
+TypeIndex CodeViewDebug::getCompleteTypeIndex(const DIType *Ty) {
+ // The null DIType is the void type. Don't try to hash it.
+ if (!Ty)
+ return TypeIndex::Void();
+
+ // Look through typedefs when getting the complete type index. Call
+ // getTypeIndex on the typedef to ensure that any UDTs are accumulated and are
+ // emitted only once.
+ if (Ty->getTag() == dwarf::DW_TAG_typedef)
+ (void)getTypeIndex(Ty);
+ while (Ty->getTag() == dwarf::DW_TAG_typedef)
+ Ty = cast<DIDerivedType>(Ty)->getBaseType();
+
+ // If this is a non-record type, the complete type index is the same as the
+ // normal type index. Just call getTypeIndex.
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ break;
+ default:
+ return getTypeIndex(Ty);
+ }
+
+ const auto *CTy = cast<DICompositeType>(Ty);
+
+ TypeLoweringScope S(*this);
+
+ // Make sure the forward declaration is emitted first. It's unclear if this
+ // is necessary, but MSVC does it, and we should follow suit until we can show
+ // otherwise.
+ // We only emit a forward declaration for named types.
+ if (!CTy->getName().empty() || !CTy->getIdentifier().empty()) {
+ TypeIndex FwdDeclTI = getTypeIndex(CTy);
+
+ // Just use the forward decl if we don't have complete type info. This
+ // might happen if the frontend is using modules and expects the complete
+ // definition to be emitted elsewhere.
+ if (CTy->isForwardDecl())
+ return FwdDeclTI;
+ }
+
+ // Check if we've already translated the complete record type.
+ // Insert the type with a null TypeIndex to signify that the type is currently
+ // being lowered.
+ auto InsertResult = CompleteTypeIndices.insert({CTy, TypeIndex()});
+ if (!InsertResult.second)
+ return InsertResult.first->second;
+
+ TypeIndex TI;
+ switch (CTy->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ TI = lowerCompleteTypeClass(CTy);
+ break;
+ case dwarf::DW_TAG_union_type:
+ TI = lowerCompleteTypeUnion(CTy);
+ break;
+ default:
+ llvm_unreachable("not a record");
+ }
+
+ // Update the type index associated with this CompositeType. This cannot
+ // use the 'InsertResult' iterator above because it is potentially
+ // invalidated by map insertions which can occur while lowering the class
+ // type above.
+ CompleteTypeIndices[CTy] = TI;
+ return TI;
+}
+
+/// Emit all the deferred complete record types. Try to do this in FIFO order,
+/// and repeat until a fixpoint is reached, as each complete record type
+/// typically references many other record types.
+void CodeViewDebug::emitDeferredCompleteTypes() {
+ SmallVector<const DICompositeType *, 4> TypesToEmit;
+ while (!DeferredCompleteTypes.empty()) {
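+ // Swap the worklist out so that any types deferred while lowering this batch
+ // accumulate in DeferredCompleteTypes and are picked up on the next iteration.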
+ std::swap(DeferredCompleteTypes, TypesToEmit);
+ for (const DICompositeType *RecordTy : TypesToEmit)
+ getCompleteTypeIndex(RecordTy);
+ TypesToEmit.clear();
+ }
+}
+
+void CodeViewDebug::emitLocalVariableList(const FunctionInfo &FI,
+ ArrayRef<LocalVariable> Locals) {
+ // Get the sorted list of parameters and emit them first.
+ SmallVector<const LocalVariable *, 6> Params;
+ for (const LocalVariable &L : Locals)
+ if (L.DIVar->isParameter())
+ Params.push_back(&L);
+ llvm::sort(Params, [](const LocalVariable *L, const LocalVariable *R) {
+ return L->DIVar->getArg() < R->DIVar->getArg();
+ });
+ for (const LocalVariable *L : Params)
+ emitLocalVariable(FI, *L);
+
+ // Next emit all non-parameters in the order that we found them.
+ for (const LocalVariable &L : Locals) {
+ if (!L.DIVar->isParameter()) {
+ if (L.ConstantValue) {
+ // If ConstantValue is set we will emit it as a S_CONSTANT instead of a
+ // S_LOCAL in order to be able to represent it at all.
+ const DIType *Ty = L.DIVar->getType();
+ APSInt Val(*L.ConstantValue);
+ emitConstantSymbolRecord(Ty, Val, std::string(L.DIVar->getName()));
+ } else {
+ emitLocalVariable(FI, L);
+ }
+ }
+ }
+}
+
+void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
+ const LocalVariable &Var) {
+ // LocalSym record, see SymbolRecord.h for more info.
+ MCSymbol *LocalEnd = beginSymbolRecord(SymbolKind::S_LOCAL);
+
+ LocalSymFlags Flags = LocalSymFlags::None;
+ if (Var.DIVar->isParameter())
+ Flags |= LocalSymFlags::IsParameter;
+ if (Var.DefRanges.empty())
+ Flags |= LocalSymFlags::IsOptimizedOut;
+
+ OS.AddComment("TypeIndex");
+ TypeIndex TI = Var.UseReferenceType
+ ? getTypeIndexForReferenceTo(Var.DIVar->getType())
+ : getCompleteTypeIndex(Var.DIVar->getType());
+ OS.emitInt32(TI.getIndex());
+ OS.AddComment("Flags");
+ OS.emitInt16(static_cast<uint16_t>(Flags));
+ // Truncate the name so we won't overflow the record length field.
+ emitNullTerminatedSymbolName(OS, Var.DIVar->getName());
+ endSymbolRecord(LocalEnd);
+
+ // Calculate the on disk prefix of the appropriate def range record. The
+ // records and on disk formats are described in SymbolRecords.h. BytePrefix
+ // should be big enough to hold all forms without memory allocation.
+ SmallString<20> BytePrefix;
+ for (const auto &Pair : Var.DefRanges) {
+ LocalVarDef DefRange = Pair.first;
+ const auto &Ranges = Pair.second;
+ BytePrefix.clear();
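+ // Variables that live in memory are described relative to a frame pointer or
+ // a base register; enregistered variables name the register directly,
+ // optionally with a subfield offset for pieces of an aggregate.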
+ if (DefRange.InMemory) {
+ int Offset = DefRange.DataOffset;
+ unsigned Reg = DefRange.CVRegister;
+
+ // 32-bit x86 call sequences often use PUSH instructions, which disrupt
+ // ESP-relative offsets. Use the virtual frame pointer, VFRAME or $T0,
+ // instead. In frames without stack realignment, $T0 will be the CFA.
+ if (RegisterId(Reg) == RegisterId::ESP) {
+ Reg = unsigned(RegisterId::VFRAME);
+ Offset += FI.OffsetAdjustment;
+ }
+
+ // If we can use the chosen frame pointer for the frame and this isn't a
+ // sliced aggregate, use the smaller S_DEFRANGE_FRAMEPOINTER_REL record.
+ // Otherwise, use S_DEFRANGE_REGISTER_REL.
+ EncodedFramePtrReg EncFP = encodeFramePtrReg(RegisterId(Reg), TheCPU);
+ if (!DefRange.IsSubfield && EncFP != EncodedFramePtrReg::None &&
+ (bool(Flags & LocalSymFlags::IsParameter)
+ ? (EncFP == FI.EncodedParamFramePtrReg)
+ : (EncFP == FI.EncodedLocalFramePtrReg))) {
+ DefRangeFramePointerRelHeader DRHdr;
+ DRHdr.Offset = Offset;
+ OS.emitCVDefRangeDirective(Ranges, DRHdr);
+ } else {
+ uint16_t RegRelFlags = 0;
+ if (DefRange.IsSubfield) {
+ RegRelFlags = DefRangeRegisterRelSym::IsSubfieldFlag |
+ (DefRange.StructOffset
+ << DefRangeRegisterRelSym::OffsetInParentShift);
+ }
+ DefRangeRegisterRelHeader DRHdr;
+ DRHdr.Register = Reg;
+ DRHdr.Flags = RegRelFlags;
+ DRHdr.BasePointerOffset = Offset;
+ OS.emitCVDefRangeDirective(Ranges, DRHdr);
+ }
+ } else {
+ assert(DefRange.DataOffset == 0 && "unexpected offset into register");
+ if (DefRange.IsSubfield) {
+ DefRangeSubfieldRegisterHeader DRHdr;
+ DRHdr.Register = DefRange.CVRegister;
+ DRHdr.MayHaveNoName = 0;
+ DRHdr.OffsetInParent = DefRange.StructOffset;
+ OS.emitCVDefRangeDirective(Ranges, DRHdr);
+ } else {
+ DefRangeRegisterHeader DRHdr;
+ DRHdr.Register = DefRange.CVRegister;
+ DRHdr.MayHaveNoName = 0;
+ OS.emitCVDefRangeDirective(Ranges, DRHdr);
+ }
+ }
+ }
+}
+
+void CodeViewDebug::emitLexicalBlockList(ArrayRef<LexicalBlock *> Blocks,
+ const FunctionInfo& FI) {
+ for (LexicalBlock *Block : Blocks)
+ emitLexicalBlock(*Block, FI);
+}
+
+/// Emit an S_BLOCK32 and S_END record pair delimiting the contents of a
+/// lexical block scope.
+void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block,
+ const FunctionInfo& FI) {
+ MCSymbol *RecordEnd = beginSymbolRecord(SymbolKind::S_BLOCK32);
+ OS.AddComment("PtrParent");
+ OS.emitInt32(0); // PtrParent
+ OS.AddComment("PtrEnd");
+ OS.emitInt32(0); // PtrEnd
+ OS.AddComment("Code size");
+ OS.emitAbsoluteSymbolDiff(Block.End, Block.Begin, 4); // Code Size
+ OS.AddComment("Function section relative address");
+ OS.emitCOFFSecRel32(Block.Begin, /*Offset=*/0); // Func Offset
+ OS.AddComment("Function section index");
+ OS.emitCOFFSectionIndex(FI.Begin); // Func Symbol
+ OS.AddComment("Lexical block name");
+ emitNullTerminatedSymbolName(OS, Block.Name); // Name
+ endSymbolRecord(RecordEnd);
+
+ // Emit variables local to this lexical block.
+ emitLocalVariableList(FI, Block.Locals);
+ emitGlobalVariableList(Block.Globals);
+
+ // Emit lexical blocks contained within this block.
+ emitLexicalBlockList(Block.Children, FI);
+
+ // Close the lexical block scope.
+ emitEndSymbolRecord(SymbolKind::S_END);
+}
+
+/// Convenience routine for collecting lexical block information for a list
+/// of lexical scopes.
+void CodeViewDebug::collectLexicalBlockInfo(
+ SmallVectorImpl<LexicalScope *> &Scopes,
+ SmallVectorImpl<LexicalBlock *> &Blocks,
+ SmallVectorImpl<LocalVariable> &Locals,
+ SmallVectorImpl<CVGlobalVariable> &Globals) {
+ for (LexicalScope *Scope : Scopes)
+ collectLexicalBlockInfo(*Scope, Blocks, Locals, Globals);
+}
+
+/// Populate the lexical blocks and local variable lists of the parent with
+/// information about the specified lexical scope.
+void CodeViewDebug::collectLexicalBlockInfo(
+ LexicalScope &Scope,
+ SmallVectorImpl<LexicalBlock *> &ParentBlocks,
+ SmallVectorImpl<LocalVariable> &ParentLocals,
+ SmallVectorImpl<CVGlobalVariable> &ParentGlobals) {
+ if (Scope.isAbstractScope())
+ return;
+
+ // Gather information about the lexical scope including local variables,
+ // global variables, and address ranges.
+ bool IgnoreScope = false;
+ auto LI = ScopeVariables.find(&Scope);
+ SmallVectorImpl<LocalVariable> *Locals =
+ LI != ScopeVariables.end() ? &LI->second : nullptr;
+ auto GI = ScopeGlobals.find(Scope.getScopeNode());
+ SmallVectorImpl<CVGlobalVariable> *Globals =
+ GI != ScopeGlobals.end() ? GI->second.get() : nullptr;
+ const DILexicalBlock *DILB = dyn_cast<DILexicalBlock>(Scope.getScopeNode());
+ const SmallVectorImpl<InsnRange> &Ranges = Scope.getRanges();
+
+ // Ignore lexical scopes which do not contain variables.
+ if (!Locals && !Globals)
+ IgnoreScope = true;
+
+ // Ignore lexical scopes which are not lexical blocks.
+ if (!DILB)
+ IgnoreScope = true;
+
+ // Ignore scopes which have too many address ranges to represent in the
+ // current CodeView format or do not have a valid address range.
+ //
+ // For lexical scopes with multiple address ranges you may be tempted to
+ // construct a single range covering every instruction where the block is
+ // live and everything in between. Unfortunately, Visual Studio only
+ // displays variables from the first matching lexical block scope. If the
+ // first lexical block contains exception handling code or cold code that is
+ // moved to the bottom of the routine, creating a single range covering
+ // nearly the entire routine, then it will hide all other lexical blocks
+ // and the variables they contain.
+ if (Ranges.size() != 1 || !getLabelAfterInsn(Ranges.front().second))
+ IgnoreScope = true;
+
+ if (IgnoreScope) {
+ // This scope can be safely ignored and eliminating it will reduce the
+ // size of the debug information. Be sure to collect any variable and scope
+ // information from this scope or any of its children and collapse them
+ // into the parent scope.
+ if (Locals)
+ ParentLocals.append(Locals->begin(), Locals->end());
+ if (Globals)
+ ParentGlobals.append(Globals->begin(), Globals->end());
+ collectLexicalBlockInfo(Scope.getChildren(),
+ ParentBlocks,
+ ParentLocals,
+ ParentGlobals);
+ return;
+ }
+
+ // Create a new CodeView lexical block for this lexical scope. If we've
+ // seen this DILexicalBlock before then the scope tree is malformed and
+ // we can handle this gracefully by not processing it a second time.
+ auto BlockInsertion = CurFn->LexicalBlocks.insert({DILB, LexicalBlock()});
+ if (!BlockInsertion.second)
+ return;
+
+ // Create a lexical block containing the variables and collect the
+ // lexical block information for the children.
+ const InsnRange &Range = Ranges.front();
+ assert(Range.first && Range.second);
+ LexicalBlock &Block = BlockInsertion.first->second;
+ Block.Begin = getLabelBeforeInsn(Range.first);
+ Block.End = getLabelAfterInsn(Range.second);
+ assert(Block.Begin && "missing label for scope begin");
+ assert(Block.End && "missing label for scope end");
+ Block.Name = DILB->getName();
+ if (Locals)
+ Block.Locals = std::move(*Locals);
+ if (Globals)
+ Block.Globals = std::move(*Globals);
+ ParentBlocks.push_back(&Block);
+ collectLexicalBlockInfo(Scope.getChildren(),
+ Block.Children,
+ Block.Locals,
+ Block.Globals);
+}
+
+void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) {
+ const Function &GV = MF->getFunction();
+ assert(FnDebugInfo.count(&GV));
+ assert(CurFn == FnDebugInfo[&GV].get());
+
+ collectVariableInfo(GV.getSubprogram());
+
+ // Build the lexical block structure to emit for this routine.
+ if (LexicalScope *CFS = LScopes.getCurrentFunctionScope())
+ collectLexicalBlockInfo(*CFS,
+ CurFn->ChildBlocks,
+ CurFn->Locals,
+ CurFn->Globals);
+
+ // Clear the scope and variable information from the map which will not be
+ // valid after we have finished processing this routine. This also prepares
+ // the map for the subsequent routine.
+ ScopeVariables.clear();
+
+ // Don't emit anything if we don't have any line tables.
+ // Thunks are compiler-generated and probably won't have source correlation.
+ if (!CurFn->HaveLineInfo && !GV.getSubprogram()->isThunk()) {
+ FnDebugInfo.erase(&GV);
+ CurFn = nullptr;
+ return;
+ }
+
+ // Find heap alloc sites and add to list.
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (MDNode *MD = MI.getHeapAllocMarker()) {
+ CurFn->HeapAllocSites.push_back(std::make_tuple(getLabelBeforeInsn(&MI),
+ getLabelAfterInsn(&MI),
+ dyn_cast<DIType>(MD)));
+ }
+ }
+ }
+
+ CurFn->Annotations = MF->getCodeViewAnnotations();
+
+ CurFn->End = Asm->getFunctionEnd();
+
+ CurFn = nullptr;
+}
+
+// Usable locations are valid locations with non-zero line numbers. A line number of zero
+// corresponds to optimized code that doesn't have a distinct source location.
+// In this case, we try to use the previous or next source location depending on
+// the context.
+static bool isUsableDebugLoc(DebugLoc DL) {
+ return DL && DL.getLine() != 0;
+}
+
+void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
+ DebugHandlerBase::beginInstruction(MI);
+
+ // Ignore DBG_VALUE and DBG_LABEL locations and function prologue.
+ if (!Asm || !CurFn || MI->isDebugInstr() ||
+ MI->getFlag(MachineInstr::FrameSetup))
+ return;
+
+ // If the first instruction of a new MBB has no location, find the first
+ // instruction with a location and use that.
+ DebugLoc DL = MI->getDebugLoc();
+ if (!isUsableDebugLoc(DL) && MI->getParent() != PrevInstBB) {
+ for (const auto &NextMI : *MI->getParent()) {
+ if (NextMI.isDebugInstr())
+ continue;
+ DL = NextMI.getDebugLoc();
+ if (isUsableDebugLoc(DL))
+ break;
+ }
+ // FIXME: Handle the case where the BB has no valid locations. This would
+ // probably require doing a real dataflow analysis.
+ }
+ PrevInstBB = MI->getParent();
+
+ // If we still don't have a debug location, don't record a location.
+ if (!isUsableDebugLoc(DL))
+ return;
+
+ maybeRecordLocation(DL, Asm->MF);
+}
+
+MCSymbol *CodeViewDebug::beginCVSubsection(DebugSubsectionKind Kind) {
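+ // A subsection is a 4-byte kind followed by a 4-byte size and the contents;
+ // the size is measured between the begin and end labels, so it excludes this
+ // header.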
+ MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(),
+ *EndLabel = MMI->getContext().createTempSymbol();
+ OS.emitInt32(unsigned(Kind));
+ OS.AddComment("Subsection size");
+ OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
+ OS.emitLabel(BeginLabel);
+ return EndLabel;
+}
+
+void CodeViewDebug::endCVSubsection(MCSymbol *EndLabel) {
+ OS.emitLabel(EndLabel);
+ // Every subsection must be aligned to a 4-byte boundary.
+ OS.emitValueToAlignment(Align(4));
+}
+
+static StringRef getSymbolName(SymbolKind SymKind) {
+ for (const EnumEntry<SymbolKind> &EE : getSymbolTypeNames())
+ if (EE.Value == SymKind)
+ return EE.Name;
+ return "";
+}
+
+MCSymbol *CodeViewDebug::beginSymbolRecord(SymbolKind SymKind) {
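+ // A symbol record is a 2-byte length (excluding the length field itself)
+ // followed by a 2-byte kind and the contents; endSymbolRecord pads the record
+ // to a 4-byte boundary, and that padding is covered by the length.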
+ MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(),
+ *EndLabel = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 2);
+ OS.emitLabel(BeginLabel);
+ if (OS.isVerboseAsm())
+ OS.AddComment("Record kind: " + getSymbolName(SymKind));
+ OS.emitInt16(unsigned(SymKind));
+ return EndLabel;
+}
+
+void CodeViewDebug::endSymbolRecord(MCSymbol *SymEnd) {
+ // MSVC does not pad out symbol records to four bytes, but LLVM does to avoid
+ // an extra copy of every symbol record in LLD. This increases object file
+ // size by less than 1% in the clang build, and is compatible with the Visual
+ // C++ linker.
+ OS.emitValueToAlignment(Align(4));
+ OS.emitLabel(SymEnd);
+}
+
+void CodeViewDebug::emitEndSymbolRecord(SymbolKind EndKind) {
+ OS.AddComment("Record length");
+ OS.emitInt16(2);
+ if (OS.isVerboseAsm())
+ OS.AddComment("Record kind: " + getSymbolName(EndKind));
+ OS.emitInt16(uint16_t(EndKind)); // Record Kind
+}
+
+void CodeViewDebug::emitDebugInfoForUDTs(
+ const std::vector<std::pair<std::string, const DIType *>> &UDTs) {
+#ifndef NDEBUG
+ size_t OriginalSize = UDTs.size();
+#endif
+ for (const auto &UDT : UDTs) {
+ const DIType *T = UDT.second;
+ assert(shouldEmitUdt(T));
+ MCSymbol *UDTRecordEnd = beginSymbolRecord(SymbolKind::S_UDT);
+ OS.AddComment("Type");
+ OS.emitInt32(getCompleteTypeIndex(T).getIndex());
+ assert(OriginalSize == UDTs.size() &&
+ "getCompleteTypeIndex found new UDTs!");
+ emitNullTerminatedSymbolName(OS, UDT.first);
+ endSymbolRecord(UDTRecordEnd);
+ }
+}
+
+void CodeViewDebug::collectGlobalVariableInfo() {
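+ // Map each DIGlobalVariableExpression back to the GlobalVariable that carries
+ // it, so the compile units' global variable lists below can be matched with
+ // actual definitions.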
+ DenseMap<const DIGlobalVariableExpression *, const GlobalVariable *>
+ GlobalMap;
+ for (const GlobalVariable &GV : MMI->getModule()->globals()) {
+ SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+ GV.getDebugInfo(GVEs);
+ for (const auto *GVE : GVEs)
+ GlobalMap[GVE] = &GV;
+ }
+
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ for (const MDNode *Node : CUs->operands()) {
+ const auto *CU = cast<DICompileUnit>(Node);
+ for (const auto *GVE : CU->getGlobalVariables()) {
+ const DIGlobalVariable *DIGV = GVE->getVariable();
+ const DIExpression *DIE = GVE->getExpression();
+ // Don't emit string literals in CodeView, as the only useful parts are
+ // generally the filename and line number, which aren't possible to output
+ // in CodeView. String literals should be the only unnamed GlobalVariable
+ // with debug info.
+ if (DIGV->getName().empty()) continue;
+
+ if ((DIE->getNumElements() == 2) &&
+ (DIE->getElement(0) == dwarf::DW_OP_plus_uconst))
+ // Record the constant offset for the variable.
+ //
+ // A Fortran common block uses this idiom to encode the offset
+ // of a variable from the common block's starting address.
+ CVGlobalVariableOffsets.insert(
+ std::make_pair(DIGV, DIE->getElement(1)));
+
+ // Emit constant global variables in a global symbol section.
+ if (GlobalMap.count(GVE) == 0 && DIE->isConstant()) {
+ CVGlobalVariable CVGV = {DIGV, DIE};
+ GlobalVariables.emplace_back(std::move(CVGV));
+ }
+
+ const auto *GV = GlobalMap.lookup(GVE);
+ if (!GV || GV->isDeclarationForLinker())
+ continue;
+
+ DIScope *Scope = DIGV->getScope();
+ SmallVector<CVGlobalVariable, 1> *VariableList;
+ if (Scope && isa<DILocalScope>(Scope)) {
+ // Locate a global variable list for this scope, creating one if
+ // necessary.
+ auto Insertion = ScopeGlobals.insert(
+ {Scope, std::unique_ptr<GlobalVariableList>()});
+ if (Insertion.second)
+ Insertion.first->second = std::make_unique<GlobalVariableList>();
+ VariableList = Insertion.first->second.get();
+ } else if (GV->hasComdat())
+ // Emit this global variable into a COMDAT section.
+ VariableList = &ComdatVariables;
+ else
+ // Emit this global variable in a single global symbol section.
+ VariableList = &GlobalVariables;
+ CVGlobalVariable CVGV = {DIGV, GV};
+ VariableList->emplace_back(std::move(CVGV));
+ }
+ }
+}
+
+void CodeViewDebug::collectDebugInfoForGlobals() {
+ for (const CVGlobalVariable &CVGV : GlobalVariables) {
+ const DIGlobalVariable *DIGV = CVGV.DIGV;
+ const DIScope *Scope = DIGV->getScope();
+ getCompleteTypeIndex(DIGV->getType());
+ getFullyQualifiedName(Scope, DIGV->getName());
+ }
+
+ for (const CVGlobalVariable &CVGV : ComdatVariables) {
+ const DIGlobalVariable *DIGV = CVGV.DIGV;
+ const DIScope *Scope = DIGV->getScope();
+ getCompleteTypeIndex(DIGV->getType());
+ getFullyQualifiedName(Scope, DIGV->getName());
+ }
+}
+
+void CodeViewDebug::emitDebugInfoForGlobals() {
+ // First, emit all globals that are not in a comdat in a single symbol
+ // substream. MSVC doesn't like it if the substream is empty, so only open
+ // it if we have at least one global to emit.
+ switchToDebugSectionForSymbol(nullptr);
+ if (!GlobalVariables.empty() || !StaticConstMembers.empty()) {
+ OS.AddComment("Symbol subsection for globals");
+ MCSymbol *EndLabel = beginCVSubsection(DebugSubsectionKind::Symbols);
+ emitGlobalVariableList(GlobalVariables);
+ emitStaticConstMemberList();
+ endCVSubsection(EndLabel);
+ }
+
+ // Second, emit each global that is in a comdat into its own .debug$S
+ // section along with its own symbol substream.
+ for (const CVGlobalVariable &CVGV : ComdatVariables) {
+ const GlobalVariable *GV = cast<const GlobalVariable *>(CVGV.GVInfo);
+ MCSymbol *GVSym = Asm->getSymbol(GV);
+ OS.AddComment("Symbol subsection for " +
+ Twine(GlobalValue::dropLLVMManglingEscape(GV->getName())));
+ switchToDebugSectionForSymbol(GVSym);
+ MCSymbol *EndLabel = beginCVSubsection(DebugSubsectionKind::Symbols);
+ // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
+ emitDebugInfoForGlobal(CVGV);
+ endCVSubsection(EndLabel);
+ }
+}
+
+void CodeViewDebug::emitDebugInfoForRetainedTypes() {
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ for (const MDNode *Node : CUs->operands()) {
+ for (auto *Ty : cast<DICompileUnit>(Node)->getRetainedTypes()) {
+ if (DIType *RT = dyn_cast<DIType>(Ty)) {
+ getTypeIndex(RT);
+ // FIXME: Add to global/local DTU list.
+ }
+ }
+ }
+}
+
+// Emit each global variable in the specified array.
+void CodeViewDebug::emitGlobalVariableList(ArrayRef<CVGlobalVariable> Globals) {
+ for (const CVGlobalVariable &CVGV : Globals) {
+ // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
+ emitDebugInfoForGlobal(CVGV);
+ }
+}
+
+void CodeViewDebug::emitConstantSymbolRecord(const DIType *DTy, APSInt &Value,
+ const std::string &QualifiedName) {
+ MCSymbol *SConstantEnd = beginSymbolRecord(SymbolKind::S_CONSTANT);
+ OS.AddComment("Type");
+ OS.emitInt32(getTypeIndex(DTy).getIndex());
+
+ OS.AddComment("Value");
+
+ // Encoded integers shouldn't need more than 10 bytes.
+ uint8_t Data[10];
+ BinaryStreamWriter Writer(Data, llvm::support::endianness::little);
+ CodeViewRecordIO IO(Writer);
+ cantFail(IO.mapEncodedInteger(Value));
+ StringRef SRef((char *)Data, Writer.getOffset());
+ OS.emitBinaryData(SRef);
+
+ OS.AddComment("Name");
+ emitNullTerminatedSymbolName(OS, QualifiedName);
+ endSymbolRecord(SConstantEnd);
+}
+
+void CodeViewDebug::emitStaticConstMemberList() {
+ for (const DIDerivedType *DTy : StaticConstMembers) {
+ const DIScope *Scope = DTy->getScope();
+
+ APSInt Value;
+ if (const ConstantInt *CI =
+ dyn_cast_or_null<ConstantInt>(DTy->getConstant()))
+ Value = APSInt(CI->getValue(),
+ DebugHandlerBase::isUnsignedDIType(DTy->getBaseType()));
+ else if (const ConstantFP *CFP =
+ dyn_cast_or_null<ConstantFP>(DTy->getConstant()))
+ Value = APSInt(CFP->getValueAPF().bitcastToAPInt(), true);
+ else
+ llvm_unreachable("cannot emit a constant without a value");
+
+ emitConstantSymbolRecord(DTy->getBaseType(), Value,
+ getFullyQualifiedName(Scope, DTy->getName()));
+ }
+}
+
+static bool isFloatDIType(const DIType *Ty) {
+ if (isa<DICompositeType>(Ty))
+ return false;
+
+ if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ dwarf::Tag T = (dwarf::Tag)Ty->getTag();
+ if (T == dwarf::DW_TAG_pointer_type ||
+ T == dwarf::DW_TAG_ptr_to_member_type ||
+ T == dwarf::DW_TAG_reference_type ||
+ T == dwarf::DW_TAG_rvalue_reference_type)
+ return false;
+ assert(DTy->getBaseType() && "Expected valid base type");
+ return isFloatDIType(DTy->getBaseType());
+ }
+
+ auto *BTy = cast<DIBasicType>(Ty);
+ return (BTy->getEncoding() == dwarf::DW_ATE_float);
+}
+
+void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
+ const DIGlobalVariable *DIGV = CVGV.DIGV;
+
+ const DIScope *Scope = DIGV->getScope();
+ // For static data members, get the scope from the declaration.
+ if (const auto *MemberDecl = dyn_cast_or_null<DIDerivedType>(
+ DIGV->getRawStaticDataMemberDeclaration()))
+ Scope = MemberDecl->getScope();
+ // For static local variables and Fortran, the scoping portion is elided
+ // from the name so that we can reference the variable in the command line
+ // of the VS debugger.
+ std::string QualifiedName =
+ (moduleIsInFortran() || (Scope && isa<DILocalScope>(Scope)))
+ ? std::string(DIGV->getName())
+ : getFullyQualifiedName(Scope, DIGV->getName());
+
+ if (const GlobalVariable *GV =
+ dyn_cast_if_present<const GlobalVariable *>(CVGV.GVInfo)) {
+ // DataSym record, see SymbolRecord.h for more info. Thread local data
+ // happens to have the same format as global data.
+ MCSymbol *GVSym = Asm->getSymbol(GV);
+ SymbolKind DataSym = GV->isThreadLocal()
+ ? (DIGV->isLocalToUnit() ? SymbolKind::S_LTHREAD32
+ : SymbolKind::S_GTHREAD32)
+ : (DIGV->isLocalToUnit() ? SymbolKind::S_LDATA32
+ : SymbolKind::S_GDATA32);
+ MCSymbol *DataEnd = beginSymbolRecord(DataSym);
+ OS.AddComment("Type");
+ OS.emitInt32(getCompleteTypeIndex(DIGV->getType()).getIndex());
+ OS.AddComment("DataOffset");
+
+ uint64_t Offset = 0;
+ if (CVGlobalVariableOffsets.contains(DIGV))
+ // Use the offset seen while collecting info on globals.
+ Offset = CVGlobalVariableOffsets[DIGV];
+ OS.emitCOFFSecRel32(GVSym, Offset);
+
+ OS.AddComment("Segment");
+ OS.emitCOFFSectionIndex(GVSym);
+ OS.AddComment("Name");
+ const unsigned LengthOfDataRecord = 12;
+ emitNullTerminatedSymbolName(OS, QualifiedName, LengthOfDataRecord);
+ endSymbolRecord(DataEnd);
+ } else {
+ const DIExpression *DIE = cast<const DIExpression *>(CVGV.GVInfo);
+ assert(DIE->isConstant() &&
+ "Global constant variables must contain a constant expression.");
+
+ // Use unsigned for floats.
+ bool isUnsigned = isFloatDIType(DIGV->getType())
+ ? true
+ : DebugHandlerBase::isUnsignedDIType(DIGV->getType());
+ APSInt Value(APInt(/*BitWidth=*/64, DIE->getElement(1)), isUnsigned);
+ emitConstantSymbolRecord(DIGV->getType(), Value, QualifiedName);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
new file mode 100644
index 000000000000..1455ac417824
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -0,0 +1,530 @@
+//===- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Microsoft CodeView debug info.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
+
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
+#include "llvm/CodeGen/DebugHandlerBase.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
+#include <cstdint>
+#include <map>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+struct ClassInfo;
+class StringRef;
+class AsmPrinter;
+class Function;
+class GlobalVariable;
+class MCSectionCOFF;
+class MCStreamer;
+class MCSymbol;
+class MachineFunction;
+
+/// Collects and handles line tables information in a CodeView format.
+class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
+public:
+ struct LocalVarDef {
+ /// Indicates that variable data is stored in memory relative to the
+ /// specified register.
+ int InMemory : 1;
+
+ /// Offset of variable data in memory.
+ int DataOffset : 31;
+
+ /// Non-zero if this is a piece of an aggregate.
+ uint16_t IsSubfield : 1;
+
+ /// Offset into aggregate.
+ uint16_t StructOffset : 15;
+
+ /// Register containing the data or the register base of the memory
+ /// location containing the data.
+ uint16_t CVRegister;
+
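+ // Pack the bit-field struct into a single integer and back. The
+ // static_assert below guarantees the sizes match, so the memcpy round-trips
+ // losslessly; this lets a LocalVarDef be treated as an opaque 64-bit key.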
+ uint64_t static toOpaqueValue(const LocalVarDef DR) {
+ uint64_t Val = 0;
+ std::memcpy(&Val, &DR, sizeof(Val));
+ return Val;
+ }
+
+ LocalVarDef static createFromOpaqueValue(uint64_t Val) {
+ LocalVarDef DR;
+ std::memcpy(&DR, &Val, sizeof(Val));
+ return DR;
+ }
+ };
+
+ static_assert(sizeof(uint64_t) == sizeof(LocalVarDef));
+
+private:
+ MCStreamer &OS;
+ BumpPtrAllocator Allocator;
+ codeview::GlobalTypeTableBuilder TypeTable;
+
+ /// Whether to emit type record hashes into .debug$H.
+ bool EmitDebugGlobalHashes = false;
+
+ /// The codeview CPU type used by the translation unit.
+ codeview::CPUType TheCPU;
+
+ static LocalVarDef createDefRangeMem(uint16_t CVRegister, int Offset);
+
+ /// Similar to DbgVariable in DwarfDebug, but not dwarf-specific.
+ struct LocalVariable {
+ const DILocalVariable *DIVar = nullptr;
+ MapVector<LocalVarDef,
+ SmallVector<std::pair<const MCSymbol *, const MCSymbol *>, 1>>
+ DefRanges;
+ bool UseReferenceType = false;
+ std::optional<APSInt> ConstantValue;
+ };
+
+ struct CVGlobalVariable {
+ const DIGlobalVariable *DIGV;
+ PointerUnion<const GlobalVariable *, const DIExpression *> GVInfo;
+ };
+
+ struct InlineSite {
+ SmallVector<LocalVariable, 1> InlinedLocals;
+ SmallVector<const DILocation *, 1> ChildSites;
+ const DISubprogram *Inlinee = nullptr;
+
+ /// The ID of the inline site or function used with .cv_loc. Not a type
+ /// index.
+ unsigned SiteFuncId = 0;
+ };
+
+ // Combines information from DILexicalBlock and LexicalScope.
+ struct LexicalBlock {
+ SmallVector<LocalVariable, 1> Locals;
+ SmallVector<CVGlobalVariable, 1> Globals;
+ SmallVector<LexicalBlock *, 1> Children;
+ const MCSymbol *Begin;
+ const MCSymbol *End;
+ StringRef Name;
+ };
+
+ // For each function, store a vector of labels to its instructions, as well
+ // as a label for the end of the function.
+ struct FunctionInfo {
+ FunctionInfo() = default;
+
+ // Uncopyable.
+ FunctionInfo(const FunctionInfo &FI) = delete;
+
+ /// Map from inlined call site to inlined instructions and child inlined
+ /// call sites. Listed in program order.
+ std::unordered_map<const DILocation *, InlineSite> InlineSites;
+
+ /// Ordered list of top-level inlined call sites.
+ SmallVector<const DILocation *, 1> ChildSites;
+
+ SmallVector<LocalVariable, 1> Locals;
+ SmallVector<CVGlobalVariable, 1> Globals;
+
+ std::unordered_map<const DILexicalBlockBase*, LexicalBlock> LexicalBlocks;
+
+ // Lexical blocks containing local variables.
+ SmallVector<LexicalBlock *, 1> ChildBlocks;
+
+ std::vector<std::pair<MCSymbol *, MDNode *>> Annotations;
+ std::vector<std::tuple<const MCSymbol *, const MCSymbol *, const DIType *>>
+ HeapAllocSites;
+
+ const MCSymbol *Begin = nullptr;
+ const MCSymbol *End = nullptr;
+ unsigned FuncId = 0;
+ unsigned LastFileId = 0;
+
+ /// Number of bytes allocated in the prologue for all local stack objects.
+ unsigned FrameSize = 0;
+
+ /// Number of bytes of parameters on the stack.
+ unsigned ParamSize = 0;
+
+ /// Number of bytes pushed to save CSRs.
+ unsigned CSRSize = 0;
+
+ /// Adjustment to apply on x86 when using the VFRAME frame pointer.
+ int OffsetAdjustment = 0;
+
+ /// Two-bit value indicating which register is the designated frame pointer
+ /// register for local variables. Included in S_FRAMEPROC.
+ codeview::EncodedFramePtrReg EncodedLocalFramePtrReg =
+ codeview::EncodedFramePtrReg::None;
+
+ /// Two-bit value indicating which register is the designated frame pointer
+ /// register for stack parameters. Included in S_FRAMEPROC.
+ codeview::EncodedFramePtrReg EncodedParamFramePtrReg =
+ codeview::EncodedFramePtrReg::None;
+
+ codeview::FrameProcedureOptions FrameProcOpts;
+
+ bool HasStackRealignment = false;
+
+ bool HaveLineInfo = false;
+
+ bool HasFramePointer = false;
+ };
+ FunctionInfo *CurFn = nullptr;
+
+ codeview::SourceLanguage CurrentSourceLanguage =
+ codeview::SourceLanguage::Masm;
+
+ // This map records the constant offset in DIExpression of the
+ // DIGlobalVariableExpression referencing the DIGlobalVariable.
+ DenseMap<const DIGlobalVariable *, uint64_t> CVGlobalVariableOffsets;
+
+ // Map used to separate variables according to the lexical scope they belong
+ // in. This is populated by recordLocalVariable() before
+ // collectLexicalBlocks() separates the variables between the FunctionInfo
+ // and LexicalBlocks.
+ DenseMap<const LexicalScope *, SmallVector<LocalVariable, 1>> ScopeVariables;
+
+ // Map to separate global variables according to the lexical scope they
+ // belong in. A null local scope represents the global scope.
+ typedef SmallVector<CVGlobalVariable, 1> GlobalVariableList;
+ DenseMap<const DIScope *, std::unique_ptr<GlobalVariableList>> ScopeGlobals;
+
+ // Array of global variables which need to be emitted into a COMDAT section.
+ SmallVector<CVGlobalVariable, 1> ComdatVariables;
+
+ // Array of non-COMDAT global variables.
+ SmallVector<CVGlobalVariable, 1> GlobalVariables;
+
+ /// List of static const data members to be emitted as S_CONSTANTs.
+ SmallVector<const DIDerivedType *, 4> StaticConstMembers;
+
+ /// The set of comdat .debug$S sections that we've seen so far. Each section
+ /// must start with a magic version number that must only be emitted once.
+ /// This set tracks which sections we've already opened.
+ DenseSet<MCSectionCOFF *> ComdatDebugSections;
+
+ /// Switch to the appropriate .debug$S section for GVSym. If GVSym, the symbol
+ /// of an emitted global value, is in a comdat COFF section, this will switch
+ /// to a new .debug$S section in that comdat. This method ensures that the
+ /// section starts with the magic version number on first use. If GVSym is
+ /// null, uses the main .debug$S section.
+ void switchToDebugSectionForSymbol(const MCSymbol *GVSym);
+
+ /// The next available function index for use with our .cv_* directives. Not
+ /// to be confused with type indices for LF_FUNC_ID records.
+ unsigned NextFuncId = 0;
+
+ InlineSite &getInlineSite(const DILocation *InlinedAt,
+ const DISubprogram *Inlinee);
+
+ codeview::TypeIndex getFuncIdForSubprogram(const DISubprogram *SP);
+
+ void calculateRanges(LocalVariable &Var,
+ const DbgValueHistoryMap::Entries &Entries);
+
+ /// Remember some debug info about each function. Keep it in a stable order to
+ /// emit at the end of the TU.
+ MapVector<const Function *, std::unique_ptr<FunctionInfo>> FnDebugInfo;
+
+ /// Map from full file path to .cv_file id. Full paths are built from DIFiles
+ /// and are stored in FileToFilepathMap.
+ DenseMap<StringRef, unsigned> FileIdMap;
+
+ /// All inlined subprograms in the order they should be emitted.
+ SmallSetVector<const DISubprogram *, 4> InlinedSubprograms;
+
+ /// Map from a pair of a DI metadata node and its DI type (or scope), which
+ /// may be nullptr, to CodeView type indices. Primarily indexed by
+ /// {DIType*, DIType*} and {DISubprogram*, DIType*}.
+ ///
+ /// The second entry in the key is needed for methods, as the
+ /// DISubroutineType representing a static method type is shared with the
+ /// equivalent non-method function type.
+ DenseMap<std::pair<const DINode *, const DIType *>, codeview::TypeIndex>
+ TypeIndices;
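+
+ // Illustrative example (hypothetical): a static method and a free function
+ // may share a single DISubroutineType node, but they are keyed here as
+ // {SubTy, ClassTy} and {SubTy, nullptr} respectively, so each one receives
+ // its own CodeView type index.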
+
+ /// Map from DICompositeType* to complete type index. Non-record types are
+ /// always looked up in the normal TypeIndices map.
+ DenseMap<const DICompositeType *, codeview::TypeIndex> CompleteTypeIndices;
+
+ /// Complete record types to emit after all active type lowerings are
+ /// finished.
+ SmallVector<const DICompositeType *, 4> DeferredCompleteTypes;
+
+ /// Number of type lowering frames active on the stack.
+ unsigned TypeEmissionLevel = 0;
+
+ codeview::TypeIndex VBPType;
+
+ const DISubprogram *CurrentSubprogram = nullptr;
+
+ // The UDTs we have seen while processing types; each entry is a pair of the
+ // qualified type name and the DIType.
+ std::vector<std::pair<std::string, const DIType *>> LocalUDTs;
+ std::vector<std::pair<std::string, const DIType *>> GlobalUDTs;
+
+ using FileToFilepathMapTy = std::map<const DIFile *, std::string>;
+ FileToFilepathMapTy FileToFilepathMap;
+
+ StringRef getFullFilepath(const DIFile *File);
+
+ unsigned maybeRecordFile(const DIFile *F);
+
+ void maybeRecordLocation(const DebugLoc &DL, const MachineFunction *MF);
+
+ void clear();
+
+ void setCurrentSubprogram(const DISubprogram *SP) {
+ CurrentSubprogram = SP;
+ LocalUDTs.clear();
+ }
+
+ /// Emit the magic version number at the start of a CodeView type or symbol
+ /// section. Appears at the front of every .debug$S or .debug$T or .debug$P
+ /// section.
+ void emitCodeViewMagicVersion();
+
+ void emitTypeInformation();
+
+ void emitTypeGlobalHashes();
+
+ void emitObjName();
+
+ void emitCompilerInformation();
+
+ void emitBuildInfo();
+
+ void emitInlineeLinesSubsection();
+
+ void emitDebugInfoForThunk(const Function *GV,
+ FunctionInfo &FI,
+ const MCSymbol *Fn);
+
+ void emitDebugInfoForFunction(const Function *GV, FunctionInfo &FI);
+
+ void emitDebugInfoForRetainedTypes();
+
+ void emitDebugInfoForUDTs(
+ const std::vector<std::pair<std::string, const DIType *>> &UDTs);
+
+ void collectDebugInfoForGlobals();
+ void emitDebugInfoForGlobals();
+ void emitGlobalVariableList(ArrayRef<CVGlobalVariable> Globals);
+ void emitConstantSymbolRecord(const DIType *DTy, APSInt &Value,
+ const std::string &QualifiedName);
+ void emitDebugInfoForGlobal(const CVGlobalVariable &CVGV);
+ void emitStaticConstMemberList();
+
+ /// Opens a subsection of the given kind in a .debug$S codeview section.
+ /// Returns an end label for use with endCVSubsection when the subsection is
+ /// finished.
+ MCSymbol *beginCVSubsection(codeview::DebugSubsectionKind Kind);
+ void endCVSubsection(MCSymbol *EndLabel);
+
+ /// Opens a symbol record of the given kind. Returns an end label for use with
+ /// endSymbolRecord.
+ MCSymbol *beginSymbolRecord(codeview::SymbolKind Kind);
+ void endSymbolRecord(MCSymbol *SymEnd);
+
+ /// Emits an S_END, S_INLINESITE_END, or S_PROC_ID_END record. These records
+ /// are empty, so we emit them with a simpler assembly sequence that doesn't
+ /// involve labels.
+ void emitEndSymbolRecord(codeview::SymbolKind EndKind);
+
+ void emitInlinedCallSite(const FunctionInfo &FI, const DILocation *InlinedAt,
+ const InlineSite &Site);
+
+ using InlinedEntity = DbgValueHistoryMap::InlinedEntity;
+
+ void collectGlobalVariableInfo();
+ void collectVariableInfo(const DISubprogram *SP);
+
+ void collectVariableInfoFromMFTable(DenseSet<InlinedEntity> &Processed);
+
+ // Construct the lexical block tree for a routine, pruning empty lexical
+ // scopes, and populate it with local variables.
+ void collectLexicalBlockInfo(SmallVectorImpl<LexicalScope *> &Scopes,
+ SmallVectorImpl<LexicalBlock *> &Blocks,
+ SmallVectorImpl<LocalVariable> &Locals,
+ SmallVectorImpl<CVGlobalVariable> &Globals);
+ void collectLexicalBlockInfo(LexicalScope &Scope,
+ SmallVectorImpl<LexicalBlock *> &ParentBlocks,
+ SmallVectorImpl<LocalVariable> &ParentLocals,
+ SmallVectorImpl<CVGlobalVariable> &ParentGlobals);
+
+ /// Records information about a local variable in the appropriate scope. In
+ /// particular, locals from inlined code live inside the inlining site.
+ void recordLocalVariable(LocalVariable &&Var, const LexicalScope *LS);
+
+ /// Emits local variables in the appropriate order.
+ void emitLocalVariableList(const FunctionInfo &FI,
+ ArrayRef<LocalVariable> Locals);
+
+ /// Emits an S_LOCAL record and its associated defined ranges.
+ void emitLocalVariable(const FunctionInfo &FI, const LocalVariable &Var);
+
+ /// Emits a sequence of lexical block scopes and their children.
+ void emitLexicalBlockList(ArrayRef<LexicalBlock *> Blocks,
+ const FunctionInfo& FI);
+
+ /// Emit a lexical block scope and its children.
+ void emitLexicalBlock(const LexicalBlock &Block, const FunctionInfo& FI);
+
+ /// Translates the DIType to codeview if necessary and returns a type index
+ /// for it.
+ codeview::TypeIndex getTypeIndex(const DIType *Ty,
+ const DIType *ClassTy = nullptr);
+
+ codeview::TypeIndex
+ getTypeIndexForThisPtr(const DIDerivedType *PtrTy,
+ const DISubroutineType *SubroutineTy);
+
+ codeview::TypeIndex getTypeIndexForReferenceTo(const DIType *Ty);
+
+ codeview::TypeIndex getMemberFunctionType(const DISubprogram *SP,
+ const DICompositeType *Class);
+
+ codeview::TypeIndex getScopeIndex(const DIScope *Scope);
+
+ codeview::TypeIndex getVBPTypeIndex();
+
+ void addToUDTs(const DIType *Ty);
+
+ void addUDTSrcLine(const DIType *Ty, codeview::TypeIndex TI);
+
+ codeview::TypeIndex lowerType(const DIType *Ty, const DIType *ClassTy);
+ codeview::TypeIndex lowerTypeAlias(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeArray(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeString(const DIStringType *Ty);
+ codeview::TypeIndex lowerTypeBasic(const DIBasicType *Ty);
+ codeview::TypeIndex lowerTypePointer(
+ const DIDerivedType *Ty,
+ codeview::PointerOptions PO = codeview::PointerOptions::None);
+ codeview::TypeIndex lowerTypeMemberPointer(
+ const DIDerivedType *Ty,
+ codeview::PointerOptions PO = codeview::PointerOptions::None);
+ codeview::TypeIndex lowerTypeModifier(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeFunction(const DISubroutineType *Ty);
+ codeview::TypeIndex lowerTypeVFTableShape(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeMemberFunction(
+ const DISubroutineType *Ty, const DIType *ClassTy, int ThisAdjustment,
+ bool IsStaticMethod,
+ codeview::FunctionOptions FO = codeview::FunctionOptions::None);
+ codeview::TypeIndex lowerTypeEnum(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeClass(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeUnion(const DICompositeType *Ty);
+
+ /// Symbol records should point to complete types, but type records should
+ /// always point to incomplete types to avoid cycles in the type graph. Only
+ /// use this entry point when generating symbol records. The complete and
+ /// incomplete type indices only differ for record types. All other types use
+ /// the same index.
+ codeview::TypeIndex getCompleteTypeIndex(const DIType *Ty);
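+
+ // Illustrative example for getCompleteTypeIndex (assumed output shape): for
+ //   struct S { S *Next; };
+ // the LF_POINTER type record for "S*" refers to the forward declaration of
+ // S (the incomplete index), breaking the cycle, while a symbol record such
+ // as S_GDATA32 describing a global of type S uses the complete index so the
+ // debugger can see the full member list.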
+
+ codeview::TypeIndex lowerCompleteTypeClass(const DICompositeType *Ty);
+ codeview::TypeIndex lowerCompleteTypeUnion(const DICompositeType *Ty);
+
+ struct TypeLoweringScope;
+
+ void emitDeferredCompleteTypes();
+
+ void collectMemberInfo(ClassInfo &Info, const DIDerivedType *DDTy);
+ ClassInfo collectClassInfo(const DICompositeType *Ty);
+
+ /// Common record member lowering functionality for record types, which are
+ /// structs, classes, and unions. Returns the field list index and the member
+ /// count.
+ std::tuple<codeview::TypeIndex, codeview::TypeIndex, unsigned, bool>
+ lowerRecordFieldList(const DICompositeType *Ty);
+
+ /// Inserts {{Node, ClassTy}, TI} into TypeIndices and checks for duplicates.
+ codeview::TypeIndex recordTypeIndexForDINode(const DINode *Node,
+ codeview::TypeIndex TI,
+ const DIType *ClassTy = nullptr);
+
+ /// Collect the names of parent scopes, innermost to outermost. Return the
+ /// innermost subprogram scope if present. Ensure that parent type scopes are
+ /// inserted into the type table.
+ const DISubprogram *
+ collectParentScopeNames(const DIScope *Scope,
+ SmallVectorImpl<StringRef> &ParentScopeNames);
+ std::string getFullyQualifiedName(const DIScope *Scope, StringRef Name);
+ std::string getFullyQualifiedName(const DIScope *Scope);
+
+ unsigned getPointerSizeInBytes();
+
+protected:
+ /// Gather pre-function debug information.
+ void beginFunctionImpl(const MachineFunction *MF) override;
+
+ /// Gather post-function debug information.
+ void endFunctionImpl(const MachineFunction *) override;
+
+ /// Check if the current module is in Fortran.
+ bool moduleIsInFortran() {
+ return CurrentSourceLanguage == codeview::SourceLanguage::Fortran;
+ }
+
+public:
+ CodeViewDebug(AsmPrinter *AP);
+
+ void beginModule(Module *M) override;
+
+ void setSymbolSize(const MCSymbol *, uint64_t) override {}
+
+ /// Emit the COFF section that holds the line table information.
+ void endModule() override;
+
+ /// Process beginning of an instruction.
+ void beginInstruction(const MachineInstr *MI) override;
+};
+
+template <> struct DenseMapInfo<CodeViewDebug::LocalVarDef> {
+
+ static inline CodeViewDebug::LocalVarDef getEmptyKey() {
+ return CodeViewDebug::LocalVarDef::createFromOpaqueValue(~0ULL);
+ }
+
+ static inline CodeViewDebug::LocalVarDef getTombstoneKey() {
+ return CodeViewDebug::LocalVarDef::createFromOpaqueValue(~0ULL - 1ULL);
+ }
+
+ static unsigned getHashValue(const CodeViewDebug::LocalVarDef &DR) {
+ return CodeViewDebug::LocalVarDef::toOpaqueValue(DR) * 37ULL;
+ }
+
+ static bool isEqual(const CodeViewDebug::LocalVarDef &LHS,
+ const CodeViewDebug::LocalVarDef &RHS) {
+ return CodeViewDebug::LocalVarDef::toOpaqueValue(LHS) ==
+ CodeViewDebug::LocalVarDef::toOpaqueValue(RHS);
+ }
+};
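+
+// Note (informal): this specialization is what allows LocalVarDef to key the
+// MapVector used for LocalVariable::DefRanges above. The all-ones and
+// all-ones-minus-one bit patterns are reserved as the empty and tombstone
+// keys and are assumed never to describe a real def range.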
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
new file mode 100644
index 000000000000..619155cafe92
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -0,0 +1,872 @@
+//===--- lib/CodeGen/DIE.cpp - DWARF Info Entries -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structures for DWARF info entries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DIE.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrevData Implementation
+//===----------------------------------------------------------------------===//
+
+/// Profile - Used to gather unique data for the abbreviation folding set.
+///
+void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
+ // Explicitly cast to an integer type for which FoldingSetNodeID has
+ // overloads. Otherwise MSVC 2010 thinks this call is ambiguous.
+ ID.AddInteger(unsigned(Attribute));
+ ID.AddInteger(unsigned(Form));
+ if (Form == dwarf::DW_FORM_implicit_const)
+ ID.AddInteger(Value);
+}
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrev Implementation
+//===----------------------------------------------------------------------===//
+
+/// Profile - Used to gather unique data for the abbreviation folding set.
+///
+void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
+ ID.AddInteger(unsigned(Tag));
+ ID.AddInteger(unsigned(Children));
+
+ // For each attribute description.
+ for (unsigned i = 0, N = Data.size(); i < N; ++i)
+ Data[i].Profile(ID);
+}
+
+/// Emit - Print the abbreviation using the specified asm printer.
+///
+void DIEAbbrev::Emit(const AsmPrinter *AP) const {
+ // Emit its Dwarf tag type.
+ AP->emitULEB128(Tag, dwarf::TagString(Tag).data());
+
+ // Emit whether it has children DIEs.
+ AP->emitULEB128((unsigned)Children, dwarf::ChildrenString(Children).data());
+
+ // For each attribute description.
+ for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+ const DIEAbbrevData &AttrData = Data[i];
+
+ // Emit attribute type.
+ AP->emitULEB128(AttrData.getAttribute(),
+ dwarf::AttributeString(AttrData.getAttribute()).data());
+
+ // Emit form type.
+#ifndef NDEBUG
+ // Could be an assertion, but this way we can see the failing form code
+ // easily, which helps track down where it came from.
+ if (!dwarf::isValidFormForVersion(AttrData.getForm(),
+ AP->getDwarfVersion())) {
+ LLVM_DEBUG(dbgs() << "Invalid form " << format("0x%x", AttrData.getForm())
+ << " for DWARF version " << AP->getDwarfVersion()
+ << "\n");
+ llvm_unreachable("Invalid form for specified DWARF version");
+ }
+#endif
+ AP->emitULEB128(AttrData.getForm(),
+ dwarf::FormEncodingString(AttrData.getForm()).data());
+
+ // Emit value for DW_FORM_implicit_const.
+ if (AttrData.getForm() == dwarf::DW_FORM_implicit_const)
+ AP->emitSLEB128(AttrData.getValue());
+ }
+
+ // Mark end of abbreviation.
+ AP->emitULEB128(0, "EOM(1)");
+ AP->emitULEB128(0, "EOM(2)");
+}
+
+LLVM_DUMP_METHOD
+void DIEAbbrev::print(raw_ostream &O) const {
+ O << "Abbreviation @"
+ << format("0x%lx", (long)(intptr_t)this)
+ << " "
+ << dwarf::TagString(Tag)
+ << " "
+ << dwarf::ChildrenString(Children)
+ << '\n';
+
+ for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+ O << " "
+ << dwarf::AttributeString(Data[i].getAttribute())
+ << " "
+ << dwarf::FormEncodingString(Data[i].getForm());
+
+ if (Data[i].getForm() == dwarf::DW_FORM_implicit_const)
+ O << " " << Data[i].getValue();
+
+ O << '\n';
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIEAbbrev::dump() const {
+ print(dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrevSet Implementation
+//===----------------------------------------------------------------------===//
+
+DIEAbbrevSet::~DIEAbbrevSet() {
+ for (DIEAbbrev *Abbrev : Abbreviations)
+ Abbrev->~DIEAbbrev();
+}
+
+DIEAbbrev &DIEAbbrevSet::uniqueAbbreviation(DIE &Die) {
+
+ FoldingSetNodeID ID;
+ DIEAbbrev Abbrev = Die.generateAbbrev();
+ Abbrev.Profile(ID);
+
+ void *InsertPos;
+ if (DIEAbbrev *Existing =
+ AbbreviationsSet.FindNodeOrInsertPos(ID, InsertPos)) {
+ Die.setAbbrevNumber(Existing->getNumber());
+ return *Existing;
+ }
+
+ // Move the abbreviation to the heap and assign a number.
+ DIEAbbrev *New = new (Alloc) DIEAbbrev(std::move(Abbrev));
+ Abbreviations.push_back(New);
+ New->setNumber(Abbreviations.size());
+ Die.setAbbrevNumber(Abbreviations.size());
+
+ // Store it for lookup.
+ AbbreviationsSet.InsertNode(New, InsertPos);
+ return *New;
+}
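+
+// Illustrative consequence of the uniquing above: two DIEs with the same tag,
+// children flag, and attribute/form list (for example, most DW_TAG_variable
+// entries in a CU) end up sharing one abbreviation number, so .debug_abbrev
+// stays small no matter how many DIEs reference it.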
+
+void DIEAbbrevSet::Emit(const AsmPrinter *AP, MCSection *Section) const {
+ if (!Abbreviations.empty()) {
+ // Start the debug abbrev section.
+ AP->OutStreamer->switchSection(Section);
+ AP->emitDwarfAbbrevs(Abbreviations);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// DIE Implementation
+//===----------------------------------------------------------------------===//
+
+DIE *DIE::getParent() const { return dyn_cast_if_present<DIE *>(Owner); }
+
+DIEAbbrev DIE::generateAbbrev() const {
+ DIEAbbrev Abbrev(Tag, hasChildren());
+ for (const DIEValue &V : values())
+ if (V.getForm() == dwarf::DW_FORM_implicit_const)
+ Abbrev.AddImplicitConstAttribute(V.getAttribute(),
+ V.getDIEInteger().getValue());
+ else
+ Abbrev.AddAttribute(V.getAttribute(), V.getForm());
+ return Abbrev;
+}
+
+uint64_t DIE::getDebugSectionOffset() const {
+ const DIEUnit *Unit = getUnit();
+ assert(Unit && "DIE must be owned by a DIEUnit to get its absolute offset");
+ return Unit->getDebugSectionOffset() + getOffset();
+}
+
+const DIE *DIE::getUnitDie() const {
+ const DIE *p = this;
+ while (p) {
+ if (p->getTag() == dwarf::DW_TAG_compile_unit ||
+ p->getTag() == dwarf::DW_TAG_skeleton_unit ||
+ p->getTag() == dwarf::DW_TAG_type_unit)
+ return p;
+ p = p->getParent();
+ }
+ return nullptr;
+}
+
+DIEUnit *DIE::getUnit() const {
+ const DIE *UnitDie = getUnitDie();
+ if (UnitDie)
+ return dyn_cast_if_present<DIEUnit *>(UnitDie->Owner);
+ return nullptr;
+}
+
+DIEValue DIE::findAttribute(dwarf::Attribute Attribute) const {
+ // Iterate through all the attributes until we find the one we're
+ // looking for; if we can't find it, return an empty DIEValue.
+ for (const auto &V : values())
+ if (V.getAttribute() == Attribute)
+ return V;
+ return DIEValue();
+}
+
+LLVM_DUMP_METHOD
+static void printValues(raw_ostream &O, const DIEValueList &Values,
+ StringRef Type, unsigned Size, unsigned IndentCount) {
+ O << Type << ": Size: " << Size << "\n";
+
+ unsigned I = 0;
+ const std::string Indent(IndentCount, ' ');
+ for (const auto &V : Values.values()) {
+ O << Indent;
+ O << "Blk[" << I++ << "]";
+ O << " " << dwarf::FormEncodingString(V.getForm()) << " ";
+ V.print(O);
+ O << "\n";
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIE::print(raw_ostream &O, unsigned IndentCount) const {
+ const std::string Indent(IndentCount, ' ');
+ O << Indent << "Die: " << format("0x%lx", (long)(intptr_t) this)
+ << ", Offset: " << Offset << ", Size: " << Size << "\n";
+
+ O << Indent << dwarf::TagString(getTag()) << " "
+ << dwarf::ChildrenString(hasChildren()) << "\n";
+
+ IndentCount += 2;
+ for (const auto &V : values()) {
+ O << Indent;
+ O << dwarf::AttributeString(V.getAttribute());
+ O << " " << dwarf::FormEncodingString(V.getForm()) << " ";
+ V.print(O);
+ O << "\n";
+ }
+ IndentCount -= 2;
+
+ for (const auto &Child : children())
+ Child.print(O, IndentCount + 4);
+
+ O << "\n";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIE::dump() const {
+ print(dbgs());
+}
+#endif
+
+unsigned DIE::computeOffsetsAndAbbrevs(const dwarf::FormParams &FormParams,
+ DIEAbbrevSet &AbbrevSet,
+ unsigned CUOffset) {
+ // Unique the abbreviation and fill in the abbreviation number so this DIE
+ // can be emitted.
+ const DIEAbbrev &Abbrev = AbbrevSet.uniqueAbbreviation(*this);
+
+ // Set compile/type unit relative offset of this DIE.
+ setOffset(CUOffset);
+
+ // Add the byte size of the abbreviation code.
+ CUOffset += getULEB128Size(getAbbrevNumber());
+
+ // Add the byte size of all the DIE attribute values.
+ for (const auto &V : values())
+ CUOffset += V.sizeOf(FormParams);
+
+ // Let the children compute their offsets and abbreviation numbers.
+ if (hasChildren()) {
+ (void)Abbrev;
+ assert(Abbrev.hasChildren() && "Children flag not set");
+
+ for (auto &Child : children())
+ CUOffset =
+ Child.computeOffsetsAndAbbrevs(FormParams, AbbrevSet, CUOffset);
+
+ // Each child chain is terminated with a zero byte, adjust the offset.
+ CUOffset += sizeof(int8_t);
+ }
+
+ // Compute the byte size of this DIE and all of its children correctly. This
+ // is needed so that the top-level DIE can help the compile unit set its length
+ // correctly.
+ setSize(CUOffset - getOffset());
+ return CUOffset;
+}
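+
+// Worked example with hypothetical sizes: for a DIE whose abbreviation code
+// is 3 (one ULEB128 byte) and whose attribute values total 10 bytes, with a
+// single 6-byte child, the children start at CUOffset + 11 and the returned
+// offset is CUOffset + 11 + 6 + 1, where the trailing byte is the zero
+// terminator of the child chain; the DIE's size is therefore 18 bytes.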
+
+//===----------------------------------------------------------------------===//
+// DIEUnit Implementation
+//===----------------------------------------------------------------------===//
+DIEUnit::DIEUnit(dwarf::Tag UnitTag) : Die(UnitTag) {
+ Die.Owner = this;
+ assert((UnitTag == dwarf::DW_TAG_compile_unit ||
+ UnitTag == dwarf::DW_TAG_skeleton_unit ||
+ UnitTag == dwarf::DW_TAG_type_unit ||
+ UnitTag == dwarf::DW_TAG_partial_unit) &&
+ "expected a unit TAG");
+}
+
+void DIEValue::emitValue(const AsmPrinter *AP) const {
+ switch (Ty) {
+ case isNone:
+ llvm_unreachable("Expected valid DIEValue");
+#define HANDLE_DIEVALUE(T) \
+ case is##T: \
+ getDIE##T().emitValue(AP, Form); \
+ break;
+#include "llvm/CodeGen/DIEValue.def"
+ }
+}
+
+unsigned DIEValue::sizeOf(const dwarf::FormParams &FormParams) const {
+ switch (Ty) {
+ case isNone:
+ llvm_unreachable("Expected valid DIEValue");
+#define HANDLE_DIEVALUE(T) \
+ case is##T: \
+ return getDIE##T().sizeOf(FormParams, Form);
+#include "llvm/CodeGen/DIEValue.def"
+ }
+ llvm_unreachable("Unknown DIE kind");
+}
+
+LLVM_DUMP_METHOD
+void DIEValue::print(raw_ostream &O) const {
+ switch (Ty) {
+ case isNone:
+ llvm_unreachable("Expected valid DIEValue");
+#define HANDLE_DIEVALUE(T) \
+ case is##T: \
+ getDIE##T().print(O); \
+ break;
+#include "llvm/CodeGen/DIEValue.def"
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIEValue::dump() const {
+ print(dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEInteger Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit integer of appropriate size.
+///
+void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_implicit_const:
+ case dwarf::DW_FORM_flag_present:
+ // Emit something to keep the lines and comments in sync.
+ // FIXME: Is there a better way to do this?
+ Asm->OutStreamer->addBlankLine();
+ return;
+ case dwarf::DW_FORM_flag:
+ case dwarf::DW_FORM_ref1:
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_addrx1:
+ case dwarf::DW_FORM_ref2:
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_addrx2:
+ case dwarf::DW_FORM_strx3:
+ case dwarf::DW_FORM_addrx3:
+ case dwarf::DW_FORM_strp:
+ case dwarf::DW_FORM_ref4:
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_ref_sup4:
+ case dwarf::DW_FORM_strx4:
+ case dwarf::DW_FORM_addrx4:
+ case dwarf::DW_FORM_ref8:
+ case dwarf::DW_FORM_ref_sig8:
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_ref_sup8:
+ case dwarf::DW_FORM_GNU_ref_alt:
+ case dwarf::DW_FORM_GNU_strp_alt:
+ case dwarf::DW_FORM_line_strp:
+ case dwarf::DW_FORM_sec_offset:
+ case dwarf::DW_FORM_strp_sup:
+ case dwarf::DW_FORM_addr:
+ case dwarf::DW_FORM_ref_addr:
+ Asm->OutStreamer->emitIntValue(Integer,
+ sizeOf(Asm->getDwarfFormParams(), Form));
+ return;
+ case dwarf::DW_FORM_GNU_str_index:
+ case dwarf::DW_FORM_GNU_addr_index:
+ case dwarf::DW_FORM_ref_udata:
+ case dwarf::DW_FORM_strx:
+ case dwarf::DW_FORM_addrx:
+ case dwarf::DW_FORM_rnglistx:
+ case dwarf::DW_FORM_udata:
+ Asm->emitULEB128(Integer);
+ return;
+ case dwarf::DW_FORM_sdata:
+ Asm->emitSLEB128(Integer);
+ return;
+ default: llvm_unreachable("DIE Value form not supported yet");
+ }
+}
+
+/// sizeOf - Determine size of integer value in bytes.
+///
+unsigned DIEInteger::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
+ if (std::optional<uint8_t> FixedSize =
+ dwarf::getFixedFormByteSize(Form, FormParams))
+ return *FixedSize;
+
+ switch (Form) {
+ case dwarf::DW_FORM_GNU_str_index:
+ case dwarf::DW_FORM_GNU_addr_index:
+ case dwarf::DW_FORM_ref_udata:
+ case dwarf::DW_FORM_strx:
+ case dwarf::DW_FORM_addrx:
+ case dwarf::DW_FORM_rnglistx:
+ case dwarf::DW_FORM_udata:
+ return getULEB128Size(Integer);
+ case dwarf::DW_FORM_sdata:
+ return getSLEB128Size(Integer);
+ default: llvm_unreachable("DIE Value form not supported yet");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEInteger::print(raw_ostream &O) const {
+ O << "Int: " << (int64_t)Integer << " 0x";
+ O.write_hex(Integer);
+}
+
+//===----------------------------------------------------------------------===//
+// DIEExpr Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit expression value.
+///
+void DIEExpr::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ AP->emitDebugValue(Expr, sizeOf(AP->getDwarfFormParams(), Form));
+}
+
+/// SizeOf - Determine size of expression value in bytes.
+///
+unsigned DIEExpr::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_data4:
+ return 4;
+ case dwarf::DW_FORM_data8:
+ return 8;
+ case dwarf::DW_FORM_sec_offset:
+ return FormParams.getDwarfOffsetByteSize();
+ default:
+ llvm_unreachable("DIE Value form not supported yet");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEExpr::print(raw_ostream &O) const { O << "Expr: " << *Expr; }
+
+//===----------------------------------------------------------------------===//
+// DIELabel Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit label value.
+///
+void DIELabel::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ bool IsSectionRelative = Form != dwarf::DW_FORM_addr;
+ AP->emitLabelReference(Label, sizeOf(AP->getDwarfFormParams(), Form),
+ IsSectionRelative);
+}
+
+/// sizeOf - Determine size of label value in bytes.
+///
+unsigned DIELabel::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_data4:
+ return 4;
+ case dwarf::DW_FORM_data8:
+ return 8;
+ case dwarf::DW_FORM_sec_offset:
+ case dwarf::DW_FORM_strp:
+ return FormParams.getDwarfOffsetByteSize();
+ case dwarf::DW_FORM_addr:
+ return FormParams.AddrSize;
+ default:
+ llvm_unreachable("DIE Value form not supported yet");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIELabel::print(raw_ostream &O) const { O << "Lbl: " << Label->getName(); }
+
+//===----------------------------------------------------------------------===//
+// DIEBaseTypeRef Implementation
+//===----------------------------------------------------------------------===//
+
+void DIEBaseTypeRef::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ uint64_t Offset = CU->ExprRefedBaseTypes[Index].Die->getOffset();
+ assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset won't fit");
+ AP->emitULEB128(Offset, nullptr, ULEB128PadSize);
+}
+
+unsigned DIEBaseTypeRef::sizeOf(const dwarf::FormParams &, dwarf::Form) const {
+ return ULEB128PadSize;
+}
+
+LLVM_DUMP_METHOD
+void DIEBaseTypeRef::print(raw_ostream &O) const { O << "BaseTypeRef: " << Index; }
+
+//===----------------------------------------------------------------------===//
+// DIEDelta Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit delta value.
+///
+void DIEDelta::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ AP->emitLabelDifference(LabelHi, LabelLo,
+ sizeOf(AP->getDwarfFormParams(), Form));
+}
+
+/// SizeOf - Determine size of delta value in bytes.
+///
+unsigned DIEDelta::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_data4:
+ return 4;
+ case dwarf::DW_FORM_data8:
+ return 8;
+ case dwarf::DW_FORM_sec_offset:
+ return FormParams.getDwarfOffsetByteSize();
+ default:
+ llvm_unreachable("DIE Value form not supported yet");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEDelta::print(raw_ostream &O) const {
+ O << "Del: " << LabelHi->getName() << "-" << LabelLo->getName();
+}
+
+//===----------------------------------------------------------------------===//
+// DIEString Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit string value.
+///
+void DIEString::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ // Index of the string in the string table.
+ switch (Form) {
+ case dwarf::DW_FORM_GNU_str_index:
+ case dwarf::DW_FORM_strx:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_strx3:
+ case dwarf::DW_FORM_strx4:
+ DIEInteger(S.getIndex()).emitValue(AP, Form);
+ return;
+ case dwarf::DW_FORM_strp:
+ if (AP->doesDwarfUseRelocationsAcrossSections())
+ DIELabel(S.getSymbol()).emitValue(AP, Form);
+ else
+ DIEInteger(S.getOffset()).emitValue(AP, Form);
+ return;
+ default:
+ llvm_unreachable("Expected valid string form");
+ }
+}
+
+/// sizeOf - Determine size of string value in bytes.
+///
+unsigned DIEString::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
+ // Index of the string in the string table.
+ switch (Form) {
+ case dwarf::DW_FORM_GNU_str_index:
+ case dwarf::DW_FORM_strx:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_strx3:
+ case dwarf::DW_FORM_strx4:
+ return DIEInteger(S.getIndex()).sizeOf(FormParams, Form);
+ case dwarf::DW_FORM_strp:
+ if (FormParams.DwarfUsesRelocationsAcrossSections)
+ return DIELabel(S.getSymbol()).sizeOf(FormParams, Form);
+ return DIEInteger(S.getOffset()).sizeOf(FormParams, Form);
+ default:
+ llvm_unreachable("Expected valid string form");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEString::print(raw_ostream &O) const {
+ O << "String: " << S.getString();
+}
+
+//===----------------------------------------------------------------------===//
+// DIEInlineString Implementation
+//===----------------------------------------------------------------------===//
+void DIEInlineString::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ if (Form == dwarf::DW_FORM_string) {
+ AP->OutStreamer->emitBytes(S);
+ AP->emitInt8(0);
+ return;
+ }
+ llvm_unreachable("Expected valid string form");
+}
+
+unsigned DIEInlineString::sizeOf(const dwarf::FormParams &, dwarf::Form) const {
+ // Emit string bytes + NULL byte.
+ return S.size() + 1;
+}
+
+LLVM_DUMP_METHOD
+void DIEInlineString::print(raw_ostream &O) const {
+ O << "InlineString: " << S;
+}
+
+//===----------------------------------------------------------------------===//
+// DIEEntry Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit debug information entry offset.
+///
+void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+
+ switch (Form) {
+ case dwarf::DW_FORM_ref1:
+ case dwarf::DW_FORM_ref2:
+ case dwarf::DW_FORM_ref4:
+ case dwarf::DW_FORM_ref8:
+ AP->OutStreamer->emitIntValue(Entry->getOffset(),
+ sizeOf(AP->getDwarfFormParams(), Form));
+ return;
+
+ case dwarf::DW_FORM_ref_udata:
+ AP->emitULEB128(Entry->getOffset());
+ return;
+
+ case dwarf::DW_FORM_ref_addr: {
+ // Get the absolute offset for this DIE within the debug info/types section.
+ uint64_t Addr = Entry->getDebugSectionOffset();
+ if (const MCSymbol *SectionSym =
+ Entry->getUnit()->getCrossSectionRelativeBaseAddress()) {
+ AP->emitLabelPlusOffset(SectionSym, Addr,
+ sizeOf(AP->getDwarfFormParams(), Form), true);
+ return;
+ }
+
+ AP->OutStreamer->emitIntValue(Addr, sizeOf(AP->getDwarfFormParams(), Form));
+ return;
+ }
+ default:
+ llvm_unreachable("Improper form for DIE reference");
+ }
+}
+
+unsigned DIEEntry::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_ref1:
+ return 1;
+ case dwarf::DW_FORM_ref2:
+ return 2;
+ case dwarf::DW_FORM_ref4:
+ return 4;
+ case dwarf::DW_FORM_ref8:
+ return 8;
+ case dwarf::DW_FORM_ref_udata:
+ return getULEB128Size(Entry->getOffset());
+ case dwarf::DW_FORM_ref_addr:
+ return FormParams.getRefAddrByteSize();
+
+ default:
+ llvm_unreachable("Improper form for DIE reference");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEEntry::print(raw_ostream &O) const {
+ O << format("Die: 0x%lx", (long)(intptr_t)&Entry);
+}
+
+//===----------------------------------------------------------------------===//
+// DIELoc Implementation
+//===----------------------------------------------------------------------===//
+
+unsigned DIELoc::computeSize(const dwarf::FormParams &FormParams) const {
+ if (!Size) {
+ for (const auto &V : values())
+ Size += V.sizeOf(FormParams);
+ }
+
+ return Size;
+}
+
+/// EmitValue - Emit location data.
+///
+void DIELoc::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+ switch (Form) {
+ default: llvm_unreachable("Improper form for block");
+ case dwarf::DW_FORM_block1: Asm->emitInt8(Size); break;
+ case dwarf::DW_FORM_block2: Asm->emitInt16(Size); break;
+ case dwarf::DW_FORM_block4: Asm->emitInt32(Size); break;
+ case dwarf::DW_FORM_block:
+ case dwarf::DW_FORM_exprloc:
+ Asm->emitULEB128(Size);
+ break;
+ }
+
+ for (const auto &V : values())
+ V.emitValue(Asm);
+}
+
+/// sizeOf - Determine size of location data in bytes.
+///
+unsigned DIELoc::sizeOf(const dwarf::FormParams &, dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
+ case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
+ case dwarf::DW_FORM_block4: return Size + sizeof(int32_t);
+ case dwarf::DW_FORM_block:
+ case dwarf::DW_FORM_exprloc:
+ return Size + getULEB128Size(Size);
+ default: llvm_unreachable("Improper form for block");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIELoc::print(raw_ostream &O) const {
+ printValues(O, *this, "ExprLoc", Size, 5);
+}
+
+//===----------------------------------------------------------------------===//
+// DIEBlock Implementation
+//===----------------------------------------------------------------------===//
+
+unsigned DIEBlock::computeSize(const dwarf::FormParams &FormParams) const {
+ if (!Size) {
+ for (const auto &V : values())
+ Size += V.sizeOf(FormParams);
+ }
+
+ return Size;
+}
+
+/// EmitValue - Emit block data.
+///
+void DIEBlock::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+ switch (Form) {
+ default: llvm_unreachable("Improper form for block");
+ case dwarf::DW_FORM_block1: Asm->emitInt8(Size); break;
+ case dwarf::DW_FORM_block2: Asm->emitInt16(Size); break;
+ case dwarf::DW_FORM_block4: Asm->emitInt32(Size); break;
+ case dwarf::DW_FORM_exprloc:
+ case dwarf::DW_FORM_block:
+ Asm->emitULEB128(Size);
+ break;
+ case dwarf::DW_FORM_string: break;
+ case dwarf::DW_FORM_data16: break;
+ }
+
+ for (const auto &V : values())
+ V.emitValue(Asm);
+}
+
+/// sizeOf - Determine size of block data in bytes.
+///
+unsigned DIEBlock::sizeOf(const dwarf::FormParams &, dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
+ case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
+ case dwarf::DW_FORM_block4: return Size + sizeof(int32_t);
+ case dwarf::DW_FORM_exprloc:
+ case dwarf::DW_FORM_block: return Size + getULEB128Size(Size);
+ case dwarf::DW_FORM_data16: return 16;
+ default: llvm_unreachable("Improper form for block");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEBlock::print(raw_ostream &O) const {
+ printValues(O, *this, "Blk", Size, 5);
+}
+
+//===----------------------------------------------------------------------===//
+// DIELocList Implementation
+//===----------------------------------------------------------------------===//
+
+unsigned DIELocList::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_loclistx:
+ return getULEB128Size(Index);
+ case dwarf::DW_FORM_data4:
+ assert(FormParams.Format != dwarf::DWARF64 &&
+ "DW_FORM_data4 is not suitable to emit a pointer to a location list "
+ "in the 64-bit DWARF format");
+ return 4;
+ case dwarf::DW_FORM_data8:
+ assert(FormParams.Format == dwarf::DWARF64 &&
+ "DW_FORM_data8 is not suitable to emit a pointer to a location list "
+ "in the 32-bit DWARF format");
+ return 8;
+ case dwarf::DW_FORM_sec_offset:
+ return FormParams.getDwarfOffsetByteSize();
+ default:
+ llvm_unreachable("DIE Value form not supported yet");
+ }
+}
+
+/// EmitValue - Emit location list value.
+///
+void DIELocList::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ if (Form == dwarf::DW_FORM_loclistx) {
+ AP->emitULEB128(Index);
+ return;
+ }
+ DwarfDebug *DD = AP->getDwarfDebug();
+ MCSymbol *Label = DD->getDebugLocs().getList(Index).Label;
+ AP->emitDwarfSymbolReference(Label, /*ForceOffset*/ DD->useSplitDwarf());
+}
+
+LLVM_DUMP_METHOD
+void DIELocList::print(raw_ostream &O) const { O << "LocList: " << Index; }
+
+//===----------------------------------------------------------------------===//
+// DIEAddrOffset Implementation
+//===----------------------------------------------------------------------===//
+
+unsigned DIEAddrOffset::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form) const {
+ return Addr.sizeOf(FormParams, dwarf::DW_FORM_addrx) +
+ Offset.sizeOf(FormParams, dwarf::DW_FORM_data4);
+}
+
+/// EmitValue - Emit address/offset value.
+///
+void DIEAddrOffset::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ Addr.emitValue(AP, dwarf::DW_FORM_addrx);
+ Offset.emitValue(AP, dwarf::DW_FORM_data4);
+}
+
+LLVM_DUMP_METHOD
+void DIEAddrOffset::print(raw_ostream &O) const {
+ O << "AddrOffset: ";
+ Addr.print(O);
+ O << " + ";
+ Offset.print(O);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
new file mode 100644
index 000000000000..08ed78eb20a1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -0,0 +1,440 @@
+//===-- llvm/CodeGen/DIEHash.cpp - Dwarf Hashing Framework ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DIEHash.h"
+#include "ByteStreamer.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+/// Grabs the string in whichever attribute is passed in and returns
+/// a reference to it.
+static StringRef getDIEStringAttr(const DIE &Die, uint16_t Attr) {
+ // Iterate through all the attributes until we find the one we're
+ // looking for, if we can't find it return an empty string.
+ for (const auto &V : Die.values())
+ if (V.getAttribute() == Attr)
+ return V.getDIEString().getString();
+
+ return StringRef("");
+}
+
+/// Adds the string in \p Str to the hash. This also hashes
+/// a trailing NULL with the string.
+void DIEHash::addString(StringRef Str) {
+ LLVM_DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
+ Hash.update(Str);
+ Hash.update(ArrayRef((uint8_t)'\0'));
+}
+
+// FIXME: The LEB128 routines are copied and only slightly modified out of
+// LEB128.h.
+
+/// Adds the unsigned in \p Value to the hash encoded as a ULEB128.
+void DIEHash::addULEB128(uint64_t Value) {
+ LLVM_DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
+ do {
+ uint8_t Byte = Value & 0x7f;
+ Value >>= 7;
+ if (Value != 0)
+ Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+ Hash.update(Byte);
+ } while (Value != 0);
+}
+
+void DIEHash::addSLEB128(int64_t Value) {
+ LLVM_DEBUG(dbgs() << "Adding SLEB128 " << Value << " to hash.\n");
+ bool More;
+ do {
+ uint8_t Byte = Value & 0x7f;
+ Value >>= 7;
+ More = !((((Value == 0) && ((Byte & 0x40) == 0)) ||
+ ((Value == -1) && ((Byte & 0x40) != 0))));
+ if (More)
+ Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+ Hash.update(Byte);
+ } while (More);
+}
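+
+// Worked examples of the two encodings above (standard LEB128 behaviour):
+//   addULEB128(300) hashes the bytes 0xAC 0x02 (300 = 0b10'0101100);
+//   addSLEB128(-2)  hashes the single byte 0x7E (sign bit 0x40 set).
+// The hash consumes these byte sequences, not the raw integer values.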
+
+/// Adds the context of \p Parent to the hash.
+void DIEHash::addParentContext(const DIE &Parent) {
+
+ LLVM_DEBUG(dbgs() << "Adding parent context to hash...\n");
+
+ // [7.27.2] For each surrounding type or namespace beginning with the
+ // outermost such construct...
+ SmallVector<const DIE *, 1> Parents;
+ const DIE *Cur = &Parent;
+ while (Cur->getParent()) {
+ Parents.push_back(Cur);
+ Cur = Cur->getParent();
+ }
+ assert(Cur->getTag() == dwarf::DW_TAG_compile_unit ||
+ Cur->getTag() == dwarf::DW_TAG_type_unit);
+
+ // Reverse iterate over our list to go from the outermost construct to the
+ // innermost.
+ for (const DIE *Die : llvm::reverse(Parents)) {
+ // ... Append the letter "C" to the sequence...
+ addULEB128('C');
+
+ // ... Followed by the DWARF tag of the construct...
+ addULEB128(Die->getTag());
+
+ // ... Then the name, taken from the DW_AT_name attribute.
+ StringRef Name = getDIEStringAttr(*Die, dwarf::DW_AT_name);
+ LLVM_DEBUG(dbgs() << "... adding context: " << Name << "\n");
+ if (!Name.empty())
+ addString(Name);
+ }
+}
+
+// Collect all of the attributes for a particular DIE in a single structure.
+void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) {
+
+ for (const auto &V : Die.values()) {
+ LLVM_DEBUG(dbgs() << "Attribute: "
+ << dwarf::AttributeString(V.getAttribute())
+ << " added.\n");
+ switch (V.getAttribute()) {
+#define HANDLE_DIE_HASH_ATTR(NAME) \
+ case dwarf::NAME: \
+ Attrs.NAME = V; \
+ break;
+#include "DIEHashAttributes.def"
+ default:
+ break;
+ }
+ }
+}
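+
+// Illustrative expansion: with HANDLE_DIE_HASH_ATTR(DW_AT_name) listed in
+// DIEHashAttributes.def, the switch above gains
+//   case dwarf::DW_AT_name: Attrs.DW_AT_name = V; break;
+// so only the attributes named in that .def file ever participate in the
+// hash; everything else falls through to the default case and is ignored.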
+
+void DIEHash::hashShallowTypeReference(dwarf::Attribute Attribute,
+ const DIE &Entry, StringRef Name) {
+ // append the letter 'N'
+ addULEB128('N');
+
+ // the DWARF attribute code (DW_AT_type or DW_AT_friend),
+ addULEB128(Attribute);
+
+ // the context of the tag,
+ if (const DIE *Parent = Entry.getParent())
+ addParentContext(*Parent);
+
+ // the letter 'E',
+ addULEB128('E');
+
+ // and the name of the type.
+ addString(Name);
+
+ // Currently DW_TAG_friends are not used by Clang, but if they do become so,
+ // here's the relevant spec text to implement:
+ //
+ // For DW_TAG_friend, if the referenced entry is the DW_TAG_subprogram,
+ // the context is omitted and the name to be used is the ABI-specific name
+ // of the subprogram (e.g., the mangled linker name).
+}
+
+void DIEHash::hashRepeatedTypeReference(dwarf::Attribute Attribute,
+ unsigned DieNumber) {
+ // a) If T is in the list of [previously hashed types], use the letter
+ // 'R' as the marker
+ addULEB128('R');
+
+ addULEB128(Attribute);
+
+ // and use the unsigned LEB128 encoding of [the index of T in the
+ // list] as the attribute value;
+ addULEB128(DieNumber);
+}
+
+void DIEHash::hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+ const DIE &Entry) {
+ assert(Tag != dwarf::DW_TAG_friend && "No current LLVM clients emit friend "
+ "tags. Add support here when there's "
+ "a use case");
+ // Step 5
+ // If the tag in Step 3 is one of [the below tags]
+ if ((Tag == dwarf::DW_TAG_pointer_type ||
+ Tag == dwarf::DW_TAG_reference_type ||
+ Tag == dwarf::DW_TAG_rvalue_reference_type ||
+ Tag == dwarf::DW_TAG_ptr_to_member_type) &&
+ // and the referenced type (via the [below attributes])
+ // FIXME: This seems overly restrictive, and causes hash mismatches when
+ // there's a decl/def difference in the containing type of a
+ // ptr_to_member_type, but it's what DWARF says, for some reason.
+ Attribute == dwarf::DW_AT_type) {
+ // ... has a DW_AT_name attribute,
+ StringRef Name = getDIEStringAttr(Entry, dwarf::DW_AT_name);
+ if (!Name.empty()) {
+ hashShallowTypeReference(Attribute, Entry, Name);
+ return;
+ }
+ }
+
+ unsigned &DieNumber = Numbering[&Entry];
+ if (DieNumber) {
+ hashRepeatedTypeReference(Attribute, DieNumber);
+ return;
+ }
+
+ // otherwise, b) use the letter 'T' as the marker, ...
+ addULEB128('T');
+
+ addULEB128(Attribute);
+
+ // ... process the type T recursively by performing Steps 2 through 7, and
+ // use the result as the attribute value.
+ DieNumber = Numbering.size();
+ computeHash(Entry);
+}
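+
+// Illustrative example: for "struct S { S *Next; };" the pointer type's
+// DW_AT_type refers back to S, which has a DW_AT_name, so the shallow path
+// above hashes only 'N', DW_AT_type, S's context, 'E', "S" instead of
+// recursing into S again; a later reference to an already-numbered type DIE
+// would instead hash 'R' followed by its index.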
+
+void DIEHash::hashRawTypeReference(const DIE &Entry) {
+ unsigned &DieNumber = Numbering[&Entry];
+ if (DieNumber) {
+ addULEB128('R');
+ addULEB128(DieNumber);
+ return;
+ }
+ DieNumber = Numbering.size();
+ addULEB128('T');
+ computeHash(Entry);
+}
+
+// Hash all of the values in a block-like set of values. This assumes that
+// all of the data is going to be added as integers.
+void DIEHash::hashBlockData(const DIE::const_value_range &Values) {
+ for (const auto &V : Values)
+ if (V.getType() == DIEValue::isBaseTypeRef) {
+ const DIE &C =
+ *CU->ExprRefedBaseTypes[V.getDIEBaseTypeRef().getIndex()].Die;
+ StringRef Name = getDIEStringAttr(C, dwarf::DW_AT_name);
+ assert(!Name.empty() &&
+ "Base types referenced from DW_OP_convert should have a name");
+ hashNestedType(C, Name);
+ } else
+ Hash.update((uint64_t)V.getDIEInteger().getValue());
+}
+
+// Hash the contents of a loclistptr class.
+void DIEHash::hashLocList(const DIELocList &LocList) {
+ HashingByteStreamer Streamer(*this);
+ DwarfDebug &DD = *AP->getDwarfDebug();
+ const DebugLocStream &Locs = DD.getDebugLocs();
+ const DebugLocStream::List &List = Locs.getList(LocList.getValue());
+ for (const DebugLocStream::Entry &Entry : Locs.getEntries(List))
+ DD.emitDebugLocEntry(Streamer, Entry, List.CU);
+}
+
+// Hash an individual attribute \param Value based on the type of attribute and
+// the form.
+void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) {
+ dwarf::Attribute Attribute = Value.getAttribute();
+
+ // Other attribute values use the letter 'A' as the marker, and the value
+ // consists of the form code (encoded as an unsigned LEB128 value) followed by
+ // the encoding of the value according to the form code. To ensure
+ // reproducibility of the signature, the set of forms used in the signature
+ // computation is limited to the following: DW_FORM_sdata, DW_FORM_flag,
+ // DW_FORM_string, and DW_FORM_block.
+
+ switch (Value.getType()) {
+ case DIEValue::isNone:
+ llvm_unreachable("Expected valid DIEValue");
+
+ // 7.27 Step 3
+ // ... An attribute that refers to another type entry T is processed as
+ // follows:
+ case DIEValue::isEntry:
+ hashDIEEntry(Attribute, Tag, Value.getDIEEntry().getEntry());
+ break;
+ case DIEValue::isInteger: {
+ addULEB128('A');
+ addULEB128(Attribute);
+ switch (Value.getForm()) {
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_udata:
+ case dwarf::DW_FORM_sdata:
+ addULEB128(dwarf::DW_FORM_sdata);
+ addSLEB128((int64_t)Value.getDIEInteger().getValue());
+ break;
+ // DW_FORM_flag_present is just a flag with a value of one. We still give it
+ // a value, so just use the value.
+ case dwarf::DW_FORM_flag_present:
+ case dwarf::DW_FORM_flag:
+ addULEB128(dwarf::DW_FORM_flag);
+ addULEB128((int64_t)Value.getDIEInteger().getValue());
+ break;
+ default:
+ llvm_unreachable("Unknown integer form!");
+ }
+ break;
+ }
+ case DIEValue::isString:
+ addULEB128('A');
+ addULEB128(Attribute);
+ addULEB128(dwarf::DW_FORM_string);
+ addString(Value.getDIEString().getString());
+ break;
+ case DIEValue::isInlineString:
+ addULEB128('A');
+ addULEB128(Attribute);
+ addULEB128(dwarf::DW_FORM_string);
+ addString(Value.getDIEInlineString().getString());
+ break;
+ case DIEValue::isBlock:
+ case DIEValue::isLoc:
+ case DIEValue::isLocList:
+ addULEB128('A');
+ addULEB128(Attribute);
+ addULEB128(dwarf::DW_FORM_block);
+ if (Value.getType() == DIEValue::isBlock) {
+ addULEB128(Value.getDIEBlock().computeSize(AP->getDwarfFormParams()));
+ hashBlockData(Value.getDIEBlock().values());
+ } else if (Value.getType() == DIEValue::isLoc) {
+ addULEB128(Value.getDIELoc().computeSize(AP->getDwarfFormParams()));
+ hashBlockData(Value.getDIELoc().values());
+ } else {
+ // We could add the block length, but that would take
+ // a bit of work and not add a lot of uniqueness
+ // to the hash in any way we could test.
+ hashLocList(Value.getDIELocList());
+ }
+ break;
+ // FIXME: It's uncertain whether or not we should handle this at the moment.
+ case DIEValue::isExpr:
+ case DIEValue::isLabel:
+ case DIEValue::isBaseTypeRef:
+ case DIEValue::isDelta:
+ case DIEValue::isAddrOffset:
+ llvm_unreachable("Add support for additional value types.");
+ }
+}
+
+// Go through the attributes from \param Attrs in the order specified in 7.27.4
+// and hash them.
+void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) {
+#define HANDLE_DIE_HASH_ATTR(NAME) \
+ { \
+ if (Attrs.NAME) \
+ hashAttribute(Attrs.NAME, Tag); \
+ }
+#include "DIEHashAttributes.def"
+ // FIXME: Add the extended attributes.
+}
+
+// Add all of the attributes for \param Die to the hash.
+void DIEHash::addAttributes(const DIE &Die) {
+ DIEAttrs Attrs = {};
+ collectAttributes(Die, Attrs);
+ hashAttributes(Attrs, Die.getTag());
+}
+
+void DIEHash::hashNestedType(const DIE &Die, StringRef Name) {
+ // 7.27 Step 7
+ // ... append the letter 'S',
+ addULEB128('S');
+
+ // the tag of C,
+ addULEB128(Die.getTag());
+
+ // and the name.
+ addString(Name);
+}
+
+// Compute the hash of a DIE. This is based on the type signature computation
+// given in section 7.27 of the DWARF4 standard. It is the md5 hash of a
+// flattened description of the DIE.
+void DIEHash::computeHash(const DIE &Die) {
+ // Append the letter 'D', followed by the DWARF tag of the DIE.
+ addULEB128('D');
+ addULEB128(Die.getTag());
+
+ // Add each of the attributes of the DIE.
+ addAttributes(Die);
+
+ // Then hash each of the children of the DIE.
+ for (const auto &C : Die.children()) {
+ // 7.27 Step 7
+ // If C is a nested type entry or a member function entry, ...
+ if (isType(C.getTag()) || (C.getTag() == dwarf::DW_TAG_subprogram && isType(C.getParent()->getTag()))) {
+ StringRef Name = getDIEStringAttr(C, dwarf::DW_AT_name);
+ // ... and has a DW_AT_name attribute
+ if (!Name.empty()) {
+ hashNestedType(C, Name);
+ continue;
+ }
+ }
+ computeHash(C);
+ }
+
+ // Following the last child (or if there are no children), append a zero byte.
+ Hash.update(ArrayRef((uint8_t)'\0'));
+}
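+
+// Informal sketch of the flattening (hypothetical DIE): hashing a
+// DW_TAG_structure_type named "S" with one DW_TAG_member child "m" feeds the
+// hash roughly
+//   'D', DW_TAG_structure_type, 'A', DW_AT_name, DW_FORM_string, "S\0",
+//   'D', DW_TAG_member, 'A', DW_AT_name, DW_FORM_string, "m\0", ..., '\0'
+// i.e. markers, tags, and attribute encodings in document order, with a zero
+// byte terminating each child list.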
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of the full CU and all top level CU entities.
+// TODO: Initialize the type chain at 0 instead of 1 for CU signatures.
+uint64_t DIEHash::computeCUSignature(StringRef DWOName, const DIE &Die) {
+ Numbering.clear();
+ Numbering[&Die] = 1;
+
+ if (!DWOName.empty())
+ Hash.update(DWOName);
+ // Hash the DIE.
+ computeHash(Die);
+
+ // Now return the result.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
+ return Result.high();
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of additional forms not specifically called out in the
+/// standard.
+uint64_t DIEHash::computeTypeSignature(const DIE &Die) {
+ Numbering.clear();
+ Numbering[&Die] = 1;
+
+ if (const DIE *Parent = Die.getParent())
+ addParentContext(*Parent);
+
+ // Hash the DIE.
+ computeHash(Die);
+
+ // Now return the result.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
+ return Result.high();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
new file mode 100644
index 000000000000..24a973b39271
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -0,0 +1,112 @@
+//===-- llvm/CodeGen/DIEHash.h - Dwarf Hashing Framework -------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/Support/MD5.h"
+
+namespace llvm {
+
+class AsmPrinter;
+
+/// An object containing the capability of hashing and adding hash
+/// attributes onto a DIE.
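+///
+/// Typical (illustrative) use:
+///   uint64_t Sig = DIEHash(Asm, &CU).computeTypeSignature(TypeDie);
+/// where Asm, CU, and TypeDie stand in for the caller's AsmPrinter, compile
+/// unit, and type DIE.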
+class DIEHash {
+ // Collection of all attributes used in hashing a particular DIE.
+ struct DIEAttrs {
+#define HANDLE_DIE_HASH_ATTR(NAME) DIEValue NAME;
+#include "DIEHashAttributes.def"
+ };
+
+public:
+ DIEHash(AsmPrinter *A = nullptr, DwarfCompileUnit *CU = nullptr)
+ : AP(A), CU(CU) {}
+
+ /// Computes the CU signature.
+ uint64_t computeCUSignature(StringRef DWOName, const DIE &Die);
+
+ /// Computes the type signature.
+ uint64_t computeTypeSignature(const DIE &Die);
+
+ // Helper routines to process parts of a DIE.
+private:
+ /// Adds the parent context of \param Parent to the hash.
+ void addParentContext(const DIE &Parent);
+
+ /// Adds the attributes of \param Die to the hash.
+ void addAttributes(const DIE &Die);
+
+ /// Computes the full DWARF4 7.27 hash of the DIE.
+ void computeHash(const DIE &Die);
+
+ // Routines that add DIEValues to the hash.
+public:
+ /// Adds \param Value to the hash.
+ void update(uint8_t Value) { Hash.update(Value); }
+
+ /// Encodes and adds \param Value to the hash as a ULEB128.
+ void addULEB128(uint64_t Value);
+
+ /// Encodes and adds \param Value to the hash as a SLEB128.
+ void addSLEB128(int64_t Value);
+
+ void hashRawTypeReference(const DIE &Entry);
+
+private:
+ /// Adds \param Str to the hash and includes a NULL byte.
+ void addString(StringRef Str);
+
+ /// Collects the attributes of DIE \param Die into the \param Attrs
+ /// structure.
+ void collectAttributes(const DIE &Die, DIEAttrs &Attrs);
+
+ /// Hashes the attributes in \param Attrs in order.
+ void hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag);
+
+ /// Hashes the data in a block like DIEValue, e.g. DW_FORM_block or
+ /// DW_FORM_exprloc.
+ void hashBlockData(const DIE::const_value_range &Values);
+
+ /// Hashes the contents pointed to in the .debug_loc section.
+ void hashLocList(const DIELocList &LocList);
+
+ /// Hashes an individual attribute.
+ void hashAttribute(const DIEValue &Value, dwarf::Tag Tag);
+
+ /// Hashes an attribute that refers to another DIE.
+ void hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+ const DIE &Entry);
+
+ /// Hashes a reference to a named type in such a way that is
+ /// independent of whether that type is described by a declaration or a
+ /// definition.
+ void hashShallowTypeReference(dwarf::Attribute Attribute, const DIE &Entry,
+ StringRef Name);
+
+ /// Hashes a reference to a previously referenced type DIE.
+ void hashRepeatedTypeReference(dwarf::Attribute Attribute,
+ unsigned DieNumber);
+
+ void hashNestedType(const DIE &Die, StringRef Name);
+
+private:
+ MD5 Hash;
+ AsmPrinter *AP;
+ DwarfCompileUnit *CU;
+ DenseMap<const DIE *, unsigned> Numbering;
+};
+}
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def
new file mode 100644
index 000000000000..c872d0dd2dfa
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def
@@ -0,0 +1,55 @@
+#ifndef HANDLE_DIE_HASH_ATTR
+#error "Missing macro definition of HANDLE_DIE_HASH_ATTR"
+#endif
+
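+// Consumers define HANDLE_DIE_HASH_ATTR(NAME) before including this file; for
+// example, DIEHash.h uses it to declare one DIEValue member per hashed
+// attribute:
+//   #define HANDLE_DIE_HASH_ATTR(NAME) DIEValue NAME;
+//   #include "DIEHashAttributes.def"
+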
+HANDLE_DIE_HASH_ATTR(DW_AT_name)
+HANDLE_DIE_HASH_ATTR(DW_AT_accessibility)
+HANDLE_DIE_HASH_ATTR(DW_AT_address_class)
+HANDLE_DIE_HASH_ATTR(DW_AT_allocated)
+HANDLE_DIE_HASH_ATTR(DW_AT_artificial)
+HANDLE_DIE_HASH_ATTR(DW_AT_associated)
+HANDLE_DIE_HASH_ATTR(DW_AT_binary_scale)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_offset)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_size)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_stride)
+HANDLE_DIE_HASH_ATTR(DW_AT_byte_size)
+HANDLE_DIE_HASH_ATTR(DW_AT_byte_stride)
+HANDLE_DIE_HASH_ATTR(DW_AT_const_expr)
+HANDLE_DIE_HASH_ATTR(DW_AT_const_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_containing_type)
+HANDLE_DIE_HASH_ATTR(DW_AT_count)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_bit_offset)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_member_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_decimal_scale)
+HANDLE_DIE_HASH_ATTR(DW_AT_decimal_sign)
+HANDLE_DIE_HASH_ATTR(DW_AT_default_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_digit_count)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr_list)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_encoding)
+HANDLE_DIE_HASH_ATTR(DW_AT_enum_class)
+HANDLE_DIE_HASH_ATTR(DW_AT_endianity)
+HANDLE_DIE_HASH_ATTR(DW_AT_explicit)
+HANDLE_DIE_HASH_ATTR(DW_AT_is_optional)
+HANDLE_DIE_HASH_ATTR(DW_AT_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_lower_bound)
+HANDLE_DIE_HASH_ATTR(DW_AT_mutable)
+HANDLE_DIE_HASH_ATTR(DW_AT_ordering)
+HANDLE_DIE_HASH_ATTR(DW_AT_picture_string)
+HANDLE_DIE_HASH_ATTR(DW_AT_prototyped)
+HANDLE_DIE_HASH_ATTR(DW_AT_small)
+HANDLE_DIE_HASH_ATTR(DW_AT_segment)
+HANDLE_DIE_HASH_ATTR(DW_AT_string_length)
+HANDLE_DIE_HASH_ATTR(DW_AT_threads_scaled)
+HANDLE_DIE_HASH_ATTR(DW_AT_upper_bound)
+HANDLE_DIE_HASH_ATTR(DW_AT_use_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_use_UTF8)
+HANDLE_DIE_HASH_ATTR(DW_AT_variable_parameter)
+HANDLE_DIE_HASH_ATTR(DW_AT_virtuality)
+HANDLE_DIE_HASH_ATTR(DW_AT_visibility)
+HANDLE_DIE_HASH_ATTR(DW_AT_vtable_elem_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_type)
+HANDLE_DIE_HASH_ATTR(DW_AT_linkage_name)
+#undef HANDLE_DIE_HASH_ATTR
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
new file mode 100644
index 000000000000..55a0afcf7a33
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -0,0 +1,602 @@
+//===- llvm/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <map>
+#include <optional>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+namespace {
+using EntryIndex = DbgValueHistoryMap::EntryIndex;
+}
+
+void InstructionOrdering::initialize(const MachineFunction &MF) {
+ // We give meta instructions the same ordinal as the preceding instruction
+ // because this class is written for the task of comparing positions of
+ // variable location ranges against scope ranges. To reflect what we'll see
+ // in the binary, when we look at location ranges we must consider all
+ // DBG_VALUEs between two real instructions at the same position. And a
+ // scope range which ends on a meta instruction should be considered to end
+ // at the last seen real instruction. E.g.
+ //
+ // 1 instruction p Both the variable location for x and for y start
+ // 1 DBG_VALUE for "x" after instruction p so we give them all the same
+ // 1 DBG_VALUE for "y" number. If a scope range ends at DBG_VALUE for "y",
+ // 2 instruction q we should treat it as ending after instruction p
+ // because it will be the last real instruction in the
+ // range. DBG_VALUEs at or after this position for
+ // variables declared in the scope will have no effect.
+ clear();
+ unsigned Position = 0;
+ for (const MachineBasicBlock &MBB : MF)
+ for (const MachineInstr &MI : MBB)
+ InstNumberMap[&MI] = MI.isMetaInstruction() ? Position : ++Position;
+}
+
+bool InstructionOrdering::isBefore(const MachineInstr *A,
+ const MachineInstr *B) const {
+ assert(A->getParent() && B->getParent() && "Operands must have a parent");
+ assert(A->getMF() == B->getMF() &&
+ "Operands must be in the same MachineFunction");
+ return InstNumberMap.lookup(A) < InstNumberMap.lookup(B);
+}
+
+bool DbgValueHistoryMap::startDbgValue(InlinedEntity Var,
+ const MachineInstr &MI,
+ EntryIndex &NewIndex) {
+ // Instruction range should start with a DBG_VALUE instruction for the
+ // variable.
+ assert(MI.isDebugValue() && "not a DBG_VALUE");
+ auto &Entries = VarEntries[Var];
+ if (!Entries.empty() && Entries.back().isDbgValue() &&
+ !Entries.back().isClosed() &&
+ Entries.back().getInstr()->isEquivalentDbgInstr(MI)) {
+ LLVM_DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
+ << "\t" << Entries.back().getInstr() << "\t" << MI
+ << "\n");
+ return false;
+ }
+ Entries.emplace_back(&MI, Entry::DbgValue);
+ NewIndex = Entries.size() - 1;
+ return true;
+}
+
+EntryIndex DbgValueHistoryMap::startClobber(InlinedEntity Var,
+ const MachineInstr &MI) {
+ auto &Entries = VarEntries[Var];
+ // If an instruction clobbers multiple registers that the variable is
+ // described by, then we may have already created a clobbering instruction.
+ if (Entries.back().isClobber() && Entries.back().getInstr() == &MI)
+ return Entries.size() - 1;
+ Entries.emplace_back(&MI, Entry::Clobber);
+ return Entries.size() - 1;
+}
+
+void DbgValueHistoryMap::Entry::endEntry(EntryIndex Index) {
+ // For now, instruction ranges are not allowed to cross basic block
+ // boundaries.
+ assert(isDbgValue() && "Setting end index for non-debug value");
+ assert(!isClosed() && "End index has already been set");
+ EndIndex = Index;
+}
+
+/// Check if the instruction range [StartMI, EndMI] intersects any instruction
+/// range in Ranges. EndMI can be nullptr to indicate that the range is
+/// unbounded. Assumes Ranges is ordered and disjoint. Returns an iterator to
+/// the first intersecting scope range if one exists, otherwise std::nullopt.
+static std::optional<ArrayRef<InsnRange>::iterator>
+intersects(const MachineInstr *StartMI, const MachineInstr *EndMI,
+ const ArrayRef<InsnRange> &Ranges,
+ const InstructionOrdering &Ordering) {
+ for (auto RangesI = Ranges.begin(), RangesE = Ranges.end();
+ RangesI != RangesE; ++RangesI) {
+ if (EndMI && Ordering.isBefore(EndMI, RangesI->first))
+ return std::nullopt;
+ if (EndMI && !Ordering.isBefore(RangesI->second, EndMI))
+ return RangesI;
+ if (Ordering.isBefore(StartMI, RangesI->second))
+ return RangesI;
+ }
+ return std::nullopt;
+}
+
+void DbgValueHistoryMap::trimLocationRanges(
+ const MachineFunction &MF, LexicalScopes &LScopes,
+ const InstructionOrdering &Ordering) {
+ // The indices of the entries we're going to remove for each variable.
+ SmallVector<EntryIndex, 4> ToRemove;
+ // Entry reference count for each variable. Clobbers left with no references
+ // will be removed.
+ SmallVector<int, 4> ReferenceCount;
+ // Entries reference other entries by index. Offsets is used to remap these
+ // references if any entries are removed.
+ SmallVector<size_t, 4> Offsets;
+
+ LLVM_DEBUG(dbgs() << "Trimming location ranges for function '" << MF.getName()
+ << "'\n");
+
+ for (auto &Record : VarEntries) {
+ auto &HistoryMapEntries = Record.second;
+ if (HistoryMapEntries.empty())
+ continue;
+
+ InlinedEntity Entity = Record.first;
+ const DILocalVariable *LocalVar = cast<DILocalVariable>(Entity.first);
+
+ LexicalScope *Scope = nullptr;
+ if (const DILocation *InlinedAt = Entity.second) {
+ Scope = LScopes.findInlinedScope(LocalVar->getScope(), InlinedAt);
+ } else {
+ Scope = LScopes.findLexicalScope(LocalVar->getScope());
+ // Ignore variables for non-inlined function level scopes. The scope
+ // ranges (from scope->getRanges()) will not include any instructions
+ // before the first one with a debug-location, which could cause us to
+ // incorrectly drop a location. We could introduce special casing for
+ // these variables, but it doesn't seem worth it because no out-of-scope
+ // locations have been observed for variables declared in function level
+ // scopes.
+ if (Scope &&
+ (Scope->getScopeNode() == Scope->getScopeNode()->getSubprogram()) &&
+ (Scope->getScopeNode() == LocalVar->getScope()))
+ continue;
+ }
+
+ // If there is no scope for the variable then something has probably gone
+ // wrong.
+ if (!Scope)
+ continue;
+
+ ToRemove.clear();
+ // Zero the reference counts.
+ ReferenceCount.assign(HistoryMapEntries.size(), 0);
+ // Index of the DBG_VALUE which marks the start of the current location
+ // range.
+ EntryIndex StartIndex = 0;
+ ArrayRef<InsnRange> ScopeRanges(Scope->getRanges());
+ for (auto EI = HistoryMapEntries.begin(), EE = HistoryMapEntries.end();
+ EI != EE; ++EI, ++StartIndex) {
+ // Only DBG_VALUEs can open location ranges so skip anything else.
+ if (!EI->isDbgValue())
+ continue;
+
+ // Index of the entry which closes this range.
+ EntryIndex EndIndex = EI->getEndIndex();
+ // If this range is closed bump the reference count of the closing entry.
+ if (EndIndex != NoEntry)
+ ReferenceCount[EndIndex] += 1;
+ // Skip this location range if the opening entry is still referenced. It
+ // may close a location range which intersects a scope range.
+ // TODO: We could be 'smarter' and trim these kinds of ranges such that
+ // they do not leak out of the scope ranges if they partially overlap.
+ if (ReferenceCount[StartIndex] > 0)
+ continue;
+
+ const MachineInstr *StartMI = EI->getInstr();
+ const MachineInstr *EndMI = EndIndex != NoEntry
+ ? HistoryMapEntries[EndIndex].getInstr()
+ : nullptr;
+ // Check if the location range [StartMI, EndMI] intersects with any scope
+ // range for the variable.
+ if (auto R = intersects(StartMI, EndMI, ScopeRanges, Ordering)) {
+ // Adjust ScopeRanges to exclude ranges which subsequent location ranges
+ // cannot possibly intersect.
+ ScopeRanges = ArrayRef<InsnRange>(*R, ScopeRanges.end());
+ } else {
+ // If the location range does not intersect any scope range then the
+ // DBG_VALUE which opened this location range is useless; mark it for
+ // removal.
+ ToRemove.push_back(StartIndex);
+ // Because we'll be removing this entry we need to update the reference
+ // count of the closing entry, if one exists.
+ if (EndIndex != NoEntry)
+ ReferenceCount[EndIndex] -= 1;
+ LLVM_DEBUG(dbgs() << "Dropping value outside scope range of variable: ";
+ StartMI->print(llvm::dbgs()););
+ }
+ }
+
+ // If there is nothing to remove then jump to next variable.
+ if (ToRemove.empty())
+ continue;
+
+ // Mark clobbers that will no longer close any location ranges for removal.
+ for (size_t i = 0; i < HistoryMapEntries.size(); ++i)
+ if (ReferenceCount[i] <= 0 && HistoryMapEntries[i].isClobber())
+ ToRemove.push_back(i);
+
+ llvm::sort(ToRemove);
+
+ // Build an offset map so we can update the EndIndex of the remaining
+ // entries.
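+ // E.g. with five entries and ToRemove = {1, 3}, Offsets becomes
+ // {0, 1, 1, 2, 2}, so a surviving EndIndex of 4 is later remapped to 2.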
+ // Zero the offsets.
+ Offsets.assign(HistoryMapEntries.size(), 0);
+ size_t CurOffset = 0;
+ auto ToRemoveItr = ToRemove.begin();
+ for (size_t EntryIdx = *ToRemoveItr; EntryIdx < HistoryMapEntries.size();
+ ++EntryIdx) {
+ // Check if this is an entry which will be removed.
+ if (ToRemoveItr != ToRemove.end() && *ToRemoveItr == EntryIdx) {
+ ++ToRemoveItr;
+ ++CurOffset;
+ }
+ Offsets[EntryIdx] = CurOffset;
+ }
+
+ // Update the EndIndex of the entries to account for those which will be
+ // removed.
+ for (auto &Entry : HistoryMapEntries)
+ if (Entry.isClosed())
+ Entry.EndIndex -= Offsets[Entry.EndIndex];
+
+ // Now actually remove the entries. Iterate backwards so that our remaining
+ // ToRemove indices are valid after each erase.
+ for (EntryIndex Idx : llvm::reverse(ToRemove))
+ HistoryMapEntries.erase(HistoryMapEntries.begin() + Idx);
+ LLVM_DEBUG(llvm::dbgs() << "New HistoryMap('" << LocalVar->getName()
+ << "') size: " << HistoryMapEntries.size() << "\n");
+ }
+}
+
+bool DbgValueHistoryMap::hasNonEmptyLocation(const Entries &Entries) const {
+ for (const auto &Entry : Entries) {
+ if (!Entry.isDbgValue())
+ continue;
+
+ const MachineInstr *MI = Entry.getInstr();
+ assert(MI->isDebugValue());
+ // A DBG_VALUE $noreg is an empty variable location.
+ if (MI->isUndefDebugValue())
+ continue;
+
+ return true;
+ }
+
+ return false;
+}
+
+void DbgLabelInstrMap::addInstr(InlinedEntity Label, const MachineInstr &MI) {
+ assert(MI.isDebugLabel() && "not a DBG_LABEL");
+ LabelInstr[Label] = &MI;
+}
+
+namespace {
+
+// Maps physreg numbers to the variables they describe.
+using InlinedEntity = DbgValueHistoryMap::InlinedEntity;
+using RegDescribedVarsMap = std::map<unsigned, SmallVector<InlinedEntity, 1>>;
+
+// Keeps track of the debug value entries that are currently live for each
+// inlined entity. As the history map entries are stored in a SmallVector, they
+// may be moved at insertion of new entries, so store indices rather than
+// pointers.
+using DbgValueEntriesMap = std::map<InlinedEntity, SmallSet<EntryIndex, 1>>;
+
+} // end anonymous namespace
+
+// Claim that @Var is not described by @RegNo anymore.
+static void dropRegDescribedVar(RegDescribedVarsMap &RegVars, unsigned RegNo,
+ InlinedEntity Var) {
+ const auto &I = RegVars.find(RegNo);
+ assert(RegNo != 0U && I != RegVars.end());
+ auto &VarSet = I->second;
+ const auto &VarPos = llvm::find(VarSet, Var);
+ assert(VarPos != VarSet.end());
+ VarSet.erase(VarPos);
+ // Don't keep empty sets in a map to keep it as small as possible.
+ if (VarSet.empty())
+ RegVars.erase(I);
+}
+
+// Claim that @Var is now described by @RegNo.
+static void addRegDescribedVar(RegDescribedVarsMap &RegVars, unsigned RegNo,
+ InlinedEntity Var) {
+ assert(RegNo != 0U);
+ auto &VarSet = RegVars[RegNo];
+ assert(!is_contained(VarSet, Var));
+ VarSet.push_back(Var);
+}
+
+/// Create a clobbering entry and end all open debug value entries
+/// for \p Var that are described by \p RegNo using that entry. Inserts into \p
+/// FellowRegisters the set of Registers that were also used to describe \p Var
+/// alongside \p RegNo.
+static void clobberRegEntries(InlinedEntity Var, unsigned RegNo,
+ const MachineInstr &ClobberingInstr,
+ DbgValueEntriesMap &LiveEntries,
+ DbgValueHistoryMap &HistMap,
+ SmallVectorImpl<Register> &FellowRegisters) {
+ EntryIndex ClobberIndex = HistMap.startClobber(Var, ClobberingInstr);
+ // Close all entries whose values are described by the register.
+ SmallVector<EntryIndex, 4> IndicesToErase;
+ // If a given register appears in a live DBG_VALUE_LIST for Var alongside the
+ // clobbered register, and never appears in a live DBG_VALUE* for Var without
+ // the clobbered register, then it is no longer linked to the variable.
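+ // E.g. (illustrative registers) if $rbx only ever appears in live
+ // DBG_VALUE_LISTs together with the clobbered $rax, $rbx is reported as a
+ // fellow register so the caller can unlink it as well.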
+ SmallSet<Register, 4> MaybeRemovedRegisters;
+ SmallSet<Register, 4> KeepRegisters;
+ for (auto Index : LiveEntries[Var]) {
+ auto &Entry = HistMap.getEntry(Var, Index);
+ assert(Entry.isDbgValue() && "Not a DBG_VALUE in LiveEntries");
+ if (Entry.getInstr()->isDebugEntryValue())
+ continue;
+ if (Entry.getInstr()->hasDebugOperandForReg(RegNo)) {
+ IndicesToErase.push_back(Index);
+ Entry.endEntry(ClobberIndex);
+ for (const auto &MO : Entry.getInstr()->debug_operands())
+ if (MO.isReg() && MO.getReg() && MO.getReg() != RegNo)
+ MaybeRemovedRegisters.insert(MO.getReg());
+ } else {
+ for (const auto &MO : Entry.getInstr()->debug_operands())
+ if (MO.isReg() && MO.getReg())
+ KeepRegisters.insert(MO.getReg());
+ }
+ }
+
+ for (Register Reg : MaybeRemovedRegisters)
+ if (!KeepRegisters.contains(Reg))
+ FellowRegisters.push_back(Reg);
+
+ // Drop all entries that have ended.
+ for (auto Index : IndicesToErase)
+ LiveEntries[Var].erase(Index);
+}
+
+/// Add a new debug value for \p Var. Closes all overlapping debug values.
+static void handleNewDebugValue(InlinedEntity Var, const MachineInstr &DV,
+ RegDescribedVarsMap &RegVars,
+ DbgValueEntriesMap &LiveEntries,
+ DbgValueHistoryMap &HistMap) {
+ EntryIndex NewIndex;
+ if (HistMap.startDbgValue(Var, DV, NewIndex)) {
+ SmallDenseMap<unsigned, bool, 4> TrackedRegs;
+
+ // If we have created a new debug value entry, close all preceding
+ // live entries that overlap.
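+ // E.g. a new DBG_VALUE for fragment [0, 32) closes a live entry for
+ // fragment [16, 48) but leaves one for [32, 64) open.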
+ SmallVector<EntryIndex, 4> IndicesToErase;
+ const DIExpression *DIExpr = DV.getDebugExpression();
+ for (auto Index : LiveEntries[Var]) {
+ auto &Entry = HistMap.getEntry(Var, Index);
+ assert(Entry.isDbgValue() && "Not a DBG_VALUE in LiveEntries");
+ const MachineInstr &DV = *Entry.getInstr();
+ bool Overlaps = DIExpr->fragmentsOverlap(DV.getDebugExpression());
+ if (Overlaps) {
+ IndicesToErase.push_back(Index);
+ Entry.endEntry(NewIndex);
+ }
+ if (!DV.isDebugEntryValue())
+ for (const MachineOperand &Op : DV.debug_operands())
+ if (Op.isReg() && Op.getReg())
+ TrackedRegs[Op.getReg()] |= !Overlaps;
+ }
+
+ // If the new debug value is described by a register, add tracking of
+ // that register if it is not already tracked.
+ if (!DV.isDebugEntryValue()) {
+ for (const MachineOperand &Op : DV.debug_operands()) {
+ if (Op.isReg() && Op.getReg()) {
+ Register NewReg = Op.getReg();
+ if (!TrackedRegs.count(NewReg))
+ addRegDescribedVar(RegVars, NewReg, Var);
+ LiveEntries[Var].insert(NewIndex);
+ TrackedRegs[NewReg] = true;
+ }
+ }
+ }
+
+ // Drop tracking of registers that are no longer used.
+ for (auto I : TrackedRegs)
+ if (!I.second)
+ dropRegDescribedVar(RegVars, I.first, Var);
+
+ // Drop all entries that have ended, and mark the new entry as live.
+ for (auto Index : IndicesToErase)
+ LiveEntries[Var].erase(Index);
+ LiveEntries[Var].insert(NewIndex);
+ }
+}
+
+// Terminate the location ranges for the variables described by the register
+// at @I by inserting @ClobberingInstr into their history.
+static void clobberRegisterUses(RegDescribedVarsMap &RegVars,
+ RegDescribedVarsMap::iterator I,
+ DbgValueHistoryMap &HistMap,
+ DbgValueEntriesMap &LiveEntries,
+ const MachineInstr &ClobberingInstr) {
+ // Iterate over all variables described by this register and add this
+ // instruction to their history, clobbering it. All registers that also
+ // describe the clobbered variables (i.e. in variadic debug values) will have
+ // those Variables removed from their DescribedVars.
+ for (const auto &Var : I->second) {
+ SmallVector<Register, 4> FellowRegisters;
+ clobberRegEntries(Var, I->first, ClobberingInstr, LiveEntries, HistMap,
+ FellowRegisters);
+ for (Register RegNo : FellowRegisters)
+ dropRegDescribedVar(RegVars, RegNo, Var);
+ }
+ RegVars.erase(I);
+}
+
+// Terminate the location ranges for the variables described by register
+// @RegNo by inserting @ClobberingInstr into their history.
+static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo,
+ DbgValueHistoryMap &HistMap,
+ DbgValueEntriesMap &LiveEntries,
+ const MachineInstr &ClobberingInstr) {
+ const auto &I = RegVars.find(RegNo);
+ if (I == RegVars.end())
+ return;
+ clobberRegisterUses(RegVars, I, HistMap, LiveEntries, ClobberingInstr);
+}
+
+void llvm::calculateDbgEntityHistory(const MachineFunction *MF,
+ const TargetRegisterInfo *TRI,
+ DbgValueHistoryMap &DbgValues,
+ DbgLabelInstrMap &DbgLabels) {
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ Register SP = TLI->getStackPointerRegisterToSaveRestore();
+ Register FrameReg = TRI->getFrameRegister(*MF);
+ RegDescribedVarsMap RegVars;
+ DbgValueEntriesMap LiveEntries;
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (MI.isDebugValue()) {
+ assert(MI.getNumOperands() > 1 && "Invalid DBG_VALUE instruction!");
+ // Use the base variable (without any DW_OP_piece expressions)
+ // as the index into the history. The full variable, including the
+ // piece expressions, is attached to the MI.
+ const DILocalVariable *RawVar = MI.getDebugVariable();
+ assert(RawVar->isValidLocationForIntrinsic(MI.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ InlinedEntity Var(RawVar, MI.getDebugLoc()->getInlinedAt());
+
+ handleNewDebugValue(Var, MI, RegVars, LiveEntries, DbgValues);
+ } else if (MI.isDebugLabel()) {
+ assert(MI.getNumOperands() == 1 && "Invalid DBG_LABEL instruction!");
+ const DILabel *RawLabel = MI.getDebugLabel();
+ assert(RawLabel->isValidLocationForIntrinsic(MI.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ // When collecting debug information for labels, no MCSymbol is generated
+ // for them, so we keep the MachineInstr in DbgLabels in order to query
+ // the MCSymbol afterward.
+ InlinedEntity L(RawLabel, MI.getDebugLoc()->getInlinedAt());
+ DbgLabels.addInstr(L, MI);
+ }
+
+ // Meta instructions have no output, do not change any values, and so
+ // can be safely ignored.
+ if (MI.isMetaInstruction())
+ continue;
+
+ // Not a DBG_VALUE instruction. It may clobber registers which describe
+ // some variables.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg()) {
+ // Ignore call instructions that claim to clobber SP. The AArch64
+ // backend does this for aggregate function arguments.
+ if (MI.isCall() && MO.getReg() == SP)
+ continue;
+ // If this is a virtual register, only clobber it since it doesn't
+ // have aliases.
+ if (MO.getReg().isVirtual())
+ clobberRegisterUses(RegVars, MO.getReg(), DbgValues, LiveEntries,
+ MI);
+ // If this is a register def operand, it may end a debug value
+ // range. Ignore frame-register defs in the epilogue and prologue,
+ // we expect debuggers to understand that stack-locations are
+ // invalid outside of the function body.
+ else if (MO.getReg() != FrameReg ||
+ (!MI.getFlag(MachineInstr::FrameDestroy) &&
+ !MI.getFlag(MachineInstr::FrameSetup))) {
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid();
+ ++AI)
+ clobberRegisterUses(RegVars, *AI, DbgValues, LiveEntries, MI);
+ }
+ } else if (MO.isRegMask()) {
+ // If this is a register mask operand, clobber all debug values in
+ // non-CSRs.
+ SmallVector<unsigned, 32> RegsToClobber;
+ // Don't consider SP to be clobbered by register masks.
+ for (auto It : RegVars) {
+ unsigned int Reg = It.first;
+ if (Reg != SP && Register::isPhysicalRegister(Reg) &&
+ MO.clobbersPhysReg(Reg))
+ RegsToClobber.push_back(Reg);
+ }
+
+ for (unsigned Reg : RegsToClobber) {
+ clobberRegisterUses(RegVars, Reg, DbgValues, LiveEntries, MI);
+ }
+ }
+ } // End MO loop.
+ } // End instr loop.
+
+ // Make sure locations for all variables are valid only until the end of
+ // the basic block (unless it's the last basic block, in which case let
+ // their liveness run off to the end of the function).
+ if (!MBB.empty() && &MBB != &MF->back()) {
+ // Iterate over all variables that have open debug values.
+ for (auto &Pair : LiveEntries) {
+ if (Pair.second.empty())
+ continue;
+
+ // Create a clobbering entry.
+ EntryIndex ClobIdx = DbgValues.startClobber(Pair.first, MBB.back());
+
+ // End all entries.
+ for (EntryIndex Idx : Pair.second) {
+ DbgValueHistoryMap::Entry &Ent = DbgValues.getEntry(Pair.first, Idx);
+ assert(Ent.isDbgValue() && !Ent.isClosed());
+ Ent.endEntry(ClobIdx);
+ }
+ }
+
+ LiveEntries.clear();
+ RegVars.clear();
+ }
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DbgValueHistoryMap::dump(StringRef FuncName) const {
+ dbgs() << "DbgValueHistoryMap('" << FuncName << "'):\n";
+ for (const auto &VarRangePair : *this) {
+ const InlinedEntity &Var = VarRangePair.first;
+ const Entries &Entries = VarRangePair.second;
+
+ const DILocalVariable *LocalVar = cast<DILocalVariable>(Var.first);
+ const DILocation *Location = Var.second;
+
+ dbgs() << " - " << LocalVar->getName() << " at ";
+
+ if (Location)
+ dbgs() << Location->getFilename() << ":" << Location->getLine() << ":"
+ << Location->getColumn();
+ else
+ dbgs() << "<unknown location>";
+
+ dbgs() << " --\n";
+
+ for (const auto &E : enumerate(Entries)) {
+ const auto &Entry = E.value();
+ dbgs() << " Entry[" << E.index() << "]: ";
+ if (Entry.isDbgValue())
+ dbgs() << "Debug value\n";
+ else
+ dbgs() << "Clobber\n";
+ dbgs() << " Instr: " << *Entry.getInstr();
+ if (Entry.isDbgValue()) {
+ if (Entry.getEndIndex() == NoEntry)
+ dbgs() << " - Valid until end of function\n";
+ else
+ dbgs() << " - Closed by Entry[" << Entry.getEndIndex() << "]\n";
+ }
+ dbgs() << "\n";
+ }
+ }
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
new file mode 100644
index 000000000000..eb2d992c7e75
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -0,0 +1,427 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp -------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Common functionality for different debug information format backends.
+// LLVM currently supports DWARF and CodeView.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DebugHandlerBase.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+/// If true, we drop variable location ranges which exist entirely outside the
+/// variable's lexical scope instruction ranges.
+static cl::opt<bool> TrimVarLocs("trim-var-locs", cl::Hidden, cl::init(true));
+
+std::optional<DbgVariableLocation>
+DbgVariableLocation::extractFromMachineInstruction(
+ const MachineInstr &Instruction) {
+ DbgVariableLocation Location;
+ // Variables calculated from multiple locations can't be represented here.
+ if (Instruction.getNumDebugOperands() != 1)
+ return std::nullopt;
+ if (!Instruction.getDebugOperand(0).isReg())
+ return std::nullopt;
+ Location.Register = Instruction.getDebugOperand(0).getReg();
+ Location.FragmentInfo.reset();
+ // We only handle expressions generated by DIExpression::appendOffset,
+ // which doesn't require a full stack machine.
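+ // E.g. the expression (DW_OP_plus_uconst 8, DW_OP_deref) folds the offset 8
+ // into LoadChain = {8} and resets Offset to 0.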
+ int64_t Offset = 0;
+ const DIExpression *DIExpr = Instruction.getDebugExpression();
+ auto Op = DIExpr->expr_op_begin();
+ // We can handle a DBG_VALUE_LIST iff it has exactly one location operand that
+ // appears exactly once at the start of the expression.
+ if (Instruction.isDebugValueList()) {
+ if (Instruction.getNumDebugOperands() == 1 &&
+ Op->getOp() == dwarf::DW_OP_LLVM_arg)
+ ++Op;
+ else
+ return std::nullopt;
+ }
+ while (Op != DIExpr->expr_op_end()) {
+ switch (Op->getOp()) {
+ case dwarf::DW_OP_constu: {
+ int Value = Op->getArg(0);
+ ++Op;
+ if (Op != DIExpr->expr_op_end()) {
+ switch (Op->getOp()) {
+ case dwarf::DW_OP_minus:
+ Offset -= Value;
+ break;
+ case dwarf::DW_OP_plus:
+ Offset += Value;
+ break;
+ default:
+ continue;
+ }
+ }
+ } break;
+ case dwarf::DW_OP_plus_uconst:
+ Offset += Op->getArg(0);
+ break;
+ case dwarf::DW_OP_LLVM_fragment:
+ Location.FragmentInfo = {Op->getArg(1), Op->getArg(0)};
+ break;
+ case dwarf::DW_OP_deref:
+ Location.LoadChain.push_back(Offset);
+ Offset = 0;
+ break;
+ default:
+ return std::nullopt;
+ }
+ ++Op;
+ }
+
+ // Do one final implicit DW_OP_deref if this was an indirect DBG_VALUE
+ // instruction.
+ // FIXME: Replace these with DIExpression.
+ if (Instruction.isIndirectDebugValue())
+ Location.LoadChain.push_back(Offset);
+
+ return Location;
+}
+
+DebugHandlerBase::DebugHandlerBase(AsmPrinter *A) : Asm(A), MMI(Asm->MMI) {}
+
+void DebugHandlerBase::beginModule(Module *M) {
+ if (M->debug_compile_units().empty())
+ Asm = nullptr;
+}
+
+// Each LexicalScope has a first and a last instruction marking the beginning
+// and end of the scope, respectively. Create an inverse map that records which
+// scopes start (and end) at each instruction. One instruction may start (or
+// end) multiple scopes. Ignore scopes that are not reachable.
+void DebugHandlerBase::identifyScopeMarkers() {
+ SmallVector<LexicalScope *, 4> WorkList;
+ WorkList.push_back(LScopes.getCurrentFunctionScope());
+ while (!WorkList.empty()) {
+ LexicalScope *S = WorkList.pop_back_val();
+
+ const SmallVectorImpl<LexicalScope *> &Children = S->getChildren();
+ if (!Children.empty())
+ WorkList.append(Children.begin(), Children.end());
+
+ if (S->isAbstractScope())
+ continue;
+
+ for (const InsnRange &R : S->getRanges()) {
+ assert(R.first && "InsnRange does not have first instruction!");
+ assert(R.second && "InsnRange does not have second instruction!");
+ requestLabelBeforeInsn(R.first);
+ requestLabelAfterInsn(R.second);
+ }
+ }
+}
+
+// Return Label preceding the instruction.
+MCSymbol *DebugHandlerBase::getLabelBeforeInsn(const MachineInstr *MI) {
+ MCSymbol *Label = LabelsBeforeInsn.lookup(MI);
+ assert(Label && "Didn't insert label before instruction");
+ return Label;
+}
+
+// Return Label immediately following the instruction.
+MCSymbol *DebugHandlerBase::getLabelAfterInsn(const MachineInstr *MI) {
+ return LabelsAfterInsn.lookup(MI);
+}
+
+/// If this type is derived from a base type then return base type size.
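+/// For example, for a member declared as 'typedef const int T' this walks
+/// through the typedef and const wrappers and returns the size of 'int'.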
+uint64_t DebugHandlerBase::getBaseTypeSize(const DIType *Ty) {
+ assert(Ty);
+ const DIDerivedType *DDTy = dyn_cast<DIDerivedType>(Ty);
+ if (!DDTy)
+ return Ty->getSizeInBits();
+
+ unsigned Tag = DDTy->getTag();
+
+ if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
+ Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
+ Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_atomic_type &&
+ Tag != dwarf::DW_TAG_immutable_type)
+ return DDTy->getSizeInBits();
+
+ DIType *BaseType = DDTy->getBaseType();
+
+ if (!BaseType)
+ return 0;
+
+ // If this is a derived type, recurse into the base type, unless it's a
+ // reference, in which case the size is just the size of the field. Pointer
+ // types have no need of this since they're a different kind of qualification
+ // on the type.
+ if (BaseType->getTag() == dwarf::DW_TAG_reference_type ||
+ BaseType->getTag() == dwarf::DW_TAG_rvalue_reference_type)
+ return Ty->getSizeInBits();
+
+ return getBaseTypeSize(BaseType);
+}
+
+bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
+ if (isa<DIStringType>(Ty)) {
+ // Some transformations (e.g. instcombine) may decide to turn a Fortran
+ // character object into an integer, and later ones (e.g. SROA) may
+ // further inject a constant integer in a llvm.dbg.value call to track
+ // the object's value. Here we trust the transformations are doing the
+ // right thing, and treat the constant as unsigned to preserve that value
+ // (i.e. avoid sign extension).
+ return true;
+ }
+
+ if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
+ if (CTy->getTag() == dwarf::DW_TAG_enumeration_type) {
+ if (!(Ty = CTy->getBaseType()))
+ // FIXME: Enums without a fixed underlying type have unknown signedness
+ // here, leading to incorrectly emitted constants.
+ return false;
+ } else
+ // (Pieces of) aggregate types that get hacked apart by SROA may be
+ // represented by a constant. Encode them as unsigned bytes.
+ return true;
+ }
+
+ if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ dwarf::Tag T = (dwarf::Tag)Ty->getTag();
+ // Encode pointer constants as unsigned bytes. This is used at least for
+ // null pointer constant emission.
+ // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed
+ // here, but accept them for now due to a bug in SROA producing bogus
+ // dbg.values.
+ if (T == dwarf::DW_TAG_pointer_type ||
+ T == dwarf::DW_TAG_ptr_to_member_type ||
+ T == dwarf::DW_TAG_reference_type ||
+ T == dwarf::DW_TAG_rvalue_reference_type)
+ return true;
+ assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type ||
+ T == dwarf::DW_TAG_volatile_type ||
+ T == dwarf::DW_TAG_restrict_type || T == dwarf::DW_TAG_atomic_type ||
+ T == dwarf::DW_TAG_immutable_type);
+ assert(DTy->getBaseType() && "Expected valid base type");
+ return isUnsignedDIType(DTy->getBaseType());
+ }
+
+ auto *BTy = cast<DIBasicType>(Ty);
+ unsigned Encoding = BTy->getEncoding();
+ assert((Encoding == dwarf::DW_ATE_unsigned ||
+ Encoding == dwarf::DW_ATE_unsigned_char ||
+ Encoding == dwarf::DW_ATE_signed ||
+ Encoding == dwarf::DW_ATE_signed_char ||
+ Encoding == dwarf::DW_ATE_float || Encoding == dwarf::DW_ATE_UTF ||
+ Encoding == dwarf::DW_ATE_boolean ||
+ Encoding == dwarf::DW_ATE_complex_float ||
+ (Ty->getTag() == dwarf::DW_TAG_unspecified_type &&
+ Ty->getName() == "decltype(nullptr)")) &&
+ "Unsupported encoding");
+ return Encoding == dwarf::DW_ATE_unsigned ||
+ Encoding == dwarf::DW_ATE_unsigned_char ||
+ Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean ||
+ Ty->getTag() == dwarf::DW_TAG_unspecified_type;
+}
+
+static bool hasDebugInfo(const MachineModuleInfo *MMI,
+ const MachineFunction *MF) {
+ if (!MMI->hasDebugInfo())
+ return false;
+ auto *SP = MF->getFunction().getSubprogram();
+ if (!SP)
+ return false;
+ assert(SP->getUnit());
+ auto EK = SP->getUnit()->getEmissionKind();
+ if (EK == DICompileUnit::NoDebug)
+ return false;
+ return true;
+}
+
+void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
+ PrevInstBB = nullptr;
+
+ if (!Asm || !hasDebugInfo(MMI, MF)) {
+ skippedNonDebugFunction();
+ return;
+ }
+
+ // Grab the lexical scopes for the function, if we don't have any of those
+ // then we're not going to be able to do anything.
+ LScopes.initialize(*MF);
+ if (LScopes.empty()) {
+ beginFunctionImpl(MF);
+ return;
+ }
+
+ // Make sure that each lexical scope will have a begin/end label.
+ identifyScopeMarkers();
+
+ // Calculate history for local variables.
+ assert(DbgValues.empty() && "DbgValues map wasn't cleaned!");
+ assert(DbgLabels.empty() && "DbgLabels map wasn't cleaned!");
+ calculateDbgEntityHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(),
+ DbgValues, DbgLabels);
+ InstOrdering.initialize(*MF);
+ if (TrimVarLocs)
+ DbgValues.trimLocationRanges(*MF, LScopes, InstOrdering);
+ LLVM_DEBUG(DbgValues.dump(MF->getName()));
+
+ // Request labels for the full history.
+ for (const auto &I : DbgValues) {
+ const auto &Entries = I.second;
+ if (Entries.empty())
+ continue;
+
+ auto IsDescribedByReg = [](const MachineInstr *MI) {
+ return any_of(MI->debug_operands(),
+ [](auto &MO) { return MO.isReg() && MO.getReg(); });
+ };
+
+ // The first mention of a function argument gets the CurrentFnBegin label,
+ // so arguments are visible when breaking at function entry.
+ //
+ // We do not change the label for values that are described by registers,
+ // as that could place them above their defining instructions. We should
+ // ideally not change the labels for constant debug values either, since
+ // doing that violates the ranges that are calculated in the history map.
+ // However, we currently do not emit debug values for constant arguments
+ // directly at the start of the function, so this code is still useful.
+ const DILocalVariable *DIVar =
+ Entries.front().getInstr()->getDebugVariable();
+ if (DIVar->isParameter() &&
+ getDISubprogram(DIVar->getScope())->describes(&MF->getFunction())) {
+ if (!IsDescribedByReg(Entries.front().getInstr()))
+ LabelsBeforeInsn[Entries.front().getInstr()] = Asm->getFunctionBegin();
+ if (Entries.front().getInstr()->getDebugExpression()->isFragment()) {
+ // Mark all non-overlapping initial fragments.
+ for (const auto *I = Entries.begin(); I != Entries.end(); ++I) {
+ if (!I->isDbgValue())
+ continue;
+ const DIExpression *Fragment = I->getInstr()->getDebugExpression();
+ if (std::any_of(Entries.begin(), I,
+ [&](DbgValueHistoryMap::Entry Pred) {
+ return Pred.isDbgValue() &&
+ Fragment->fragmentsOverlap(
+ Pred.getInstr()->getDebugExpression());
+ }))
+ break;
+ // The code that generates location lists for DWARF assumes that the
+ // entries' start labels are monotonically increasing, and since we
+ // don't change the label for fragments that are described by
+ // registers, we must bail out when encountering such a fragment.
+ if (IsDescribedByReg(I->getInstr()))
+ break;
+ LabelsBeforeInsn[I->getInstr()] = Asm->getFunctionBegin();
+ }
+ }
+ }
+
+ for (const auto &Entry : Entries) {
+ if (Entry.isDbgValue())
+ requestLabelBeforeInsn(Entry.getInstr());
+ else
+ requestLabelAfterInsn(Entry.getInstr());
+ }
+ }
+
+ // Ensure there is a symbol before DBG_LABEL.
+ for (const auto &I : DbgLabels) {
+ const MachineInstr *MI = I.second;
+ requestLabelBeforeInsn(MI);
+ }
+
+ PrevInstLoc = DebugLoc();
+ PrevLabel = Asm->getFunctionBegin();
+ beginFunctionImpl(MF);
+}
+
+void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ assert(CurMI == nullptr);
+ CurMI = MI;
+
+ // Insert labels where requested.
+ DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
+ LabelsBeforeInsn.find(MI);
+
+ // No label needed.
+ if (I == LabelsBeforeInsn.end())
+ return;
+
+ // Label already assigned.
+ if (I->second)
+ return;
+
+ if (!PrevLabel) {
+ PrevLabel = MMI->getContext().createTempSymbol();
+ Asm->OutStreamer->emitLabel(PrevLabel);
+ }
+ I->second = PrevLabel;
+}
+
+void DebugHandlerBase::endInstruction() {
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ assert(CurMI != nullptr);
+ // Don't create a new label after DBG_VALUE and other instructions that don't
+ // generate code.
+ if (!CurMI->isMetaInstruction()) {
+ PrevLabel = nullptr;
+ PrevInstBB = CurMI->getParent();
+ }
+
+ DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
+ LabelsAfterInsn.find(CurMI);
+
+ // No label needed or label already assigned.
+ if (I == LabelsAfterInsn.end() || I->second) {
+ CurMI = nullptr;
+ return;
+ }
+
+ // We need a label after this instruction. With basic block sections, just
+ // use the end symbol of the section if this is the last instruction of the
+ // section. This reduces the need for an additional label and also helps
+ // merging ranges.
+ if (CurMI->getParent()->isEndSection() && CurMI->getNextNode() == nullptr) {
+ PrevLabel = CurMI->getParent()->getEndSymbol();
+ } else if (!PrevLabel) {
+ PrevLabel = MMI->getContext().createTempSymbol();
+ Asm->OutStreamer->emitLabel(PrevLabel);
+ }
+ I->second = PrevLabel;
+ CurMI = nullptr;
+}
+
+void DebugHandlerBase::endFunction(const MachineFunction *MF) {
+ if (Asm && hasDebugInfo(MMI, MF))
+ endFunctionImpl(MF);
+ DbgValues.clear();
+ DbgLabels.clear();
+ LabelsBeforeInsn.clear();
+ LabelsAfterInsn.clear();
+ InstOrdering.clear();
+}
+
+void DebugHandlerBase::beginBasicBlockSection(const MachineBasicBlock &MBB) {
+ EpilogBeginBlock = nullptr;
+ if (!MBB.isEntryBlock())
+ PrevLabel = MBB.getSymbol();
+}
+
+void DebugHandlerBase::endBasicBlockSection(const MachineBasicBlock &MBB) {
+ PrevLabel = nullptr;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
new file mode 100644
index 000000000000..726aba18bb80
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -0,0 +1,290 @@
+//===-- llvm/CodeGen/DebugLocEntry.h - Entry in debug_loc list -*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
+
+#include "DebugLocStream.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+class AsmPrinter;
+
+/// This struct describes a target-specific location.
+struct TargetIndexLocation {
+ int Index;
+ int Offset;
+
+ TargetIndexLocation() = default;
+ TargetIndexLocation(unsigned Idx, int64_t Offset)
+ : Index(Idx), Offset(Offset) {}
+
+ bool operator==(const TargetIndexLocation &Other) const {
+ return Index == Other.Index && Offset == Other.Offset;
+ }
+};
+
+/// A single location or constant within a variable location description, with
+/// either a single entry (with an optional DIExpression) used for a DBG_VALUE,
+/// or a list of entries used for a DBG_VALUE_LIST.
+class DbgValueLocEntry {
+
+ /// Type of entry that this represents.
+ enum EntryType {
+ E_Location,
+ E_Integer,
+ E_ConstantFP,
+ E_ConstantInt,
+ E_TargetIndexLocation
+ };
+ enum EntryType EntryKind;
+
+ /// Either a constant,
+ union {
+ int64_t Int;
+ const ConstantFP *CFP;
+ const ConstantInt *CIP;
+ } Constant;
+
+ union {
+ /// Or a location in the machine frame.
+ MachineLocation Loc;
+ /// Or a location from target specific location.
+ TargetIndexLocation TIL;
+ };
+
+public:
+ DbgValueLocEntry(int64_t i) : EntryKind(E_Integer) { Constant.Int = i; }
+ DbgValueLocEntry(const ConstantFP *CFP) : EntryKind(E_ConstantFP) {
+ Constant.CFP = CFP;
+ }
+ DbgValueLocEntry(const ConstantInt *CIP) : EntryKind(E_ConstantInt) {
+ Constant.CIP = CIP;
+ }
+ DbgValueLocEntry(MachineLocation Loc) : EntryKind(E_Location), Loc(Loc) {}
+ DbgValueLocEntry(TargetIndexLocation Loc)
+ : EntryKind(E_TargetIndexLocation), TIL(Loc) {}
+
+ bool isLocation() const { return EntryKind == E_Location; }
+ bool isIndirectLocation() const {
+ return EntryKind == E_Location && Loc.isIndirect();
+ }
+ bool isTargetIndexLocation() const {
+ return EntryKind == E_TargetIndexLocation;
+ }
+ bool isInt() const { return EntryKind == E_Integer; }
+ bool isConstantFP() const { return EntryKind == E_ConstantFP; }
+ bool isConstantInt() const { return EntryKind == E_ConstantInt; }
+ int64_t getInt() const { return Constant.Int; }
+ const ConstantFP *getConstantFP() const { return Constant.CFP; }
+ const ConstantInt *getConstantInt() const { return Constant.CIP; }
+ MachineLocation getLoc() const { return Loc; }
+ TargetIndexLocation getTargetIndexLocation() const { return TIL; }
+ friend bool operator==(const DbgValueLocEntry &, const DbgValueLocEntry &);
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const {
+ if (isLocation()) {
+ llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " ";
+ if (Loc.isIndirect())
+ llvm::dbgs() << "+0";
+ llvm::dbgs() << "} ";
+ } else if (isConstantInt())
+ Constant.CIP->dump();
+ else if (isConstantFP())
+ Constant.CFP->dump();
+ }
+#endif
+};
+
+/// The location of a single variable, composed of an expression and 0 or more
+/// DbgValueLocEntries.
+class DbgValueLoc {
+ /// Any complex address location expression for this DbgValueLoc.
+ const DIExpression *Expression;
+
+ SmallVector<DbgValueLocEntry, 2> ValueLocEntries;
+
+ bool IsVariadic;
+
+public:
+ DbgValueLoc(const DIExpression *Expr, ArrayRef<DbgValueLocEntry> Locs)
+ : Expression(Expr), ValueLocEntries(Locs.begin(), Locs.end()),
+ IsVariadic(true) {}
+
+ DbgValueLoc(const DIExpression *Expr, ArrayRef<DbgValueLocEntry> Locs,
+ bool IsVariadic)
+ : Expression(Expr), ValueLocEntries(Locs.begin(), Locs.end()),
+ IsVariadic(IsVariadic) {
+#ifndef NDEBUG
+ assert(Expr->isValid() ||
+ !any_of(Locs, [](auto LE) { return LE.isLocation(); }));
+ if (!IsVariadic) {
+ assert(ValueLocEntries.size() == 1);
+ }
+#endif
+ }
+
+ DbgValueLoc(const DIExpression *Expr, DbgValueLocEntry Loc)
+ : Expression(Expr), ValueLocEntries(1, Loc), IsVariadic(false) {
+ assert(((Expr && Expr->isValid()) || !Loc.isLocation()) &&
+ "DBG_VALUE with a machine location must have a valid expression.");
+ }
+
+ bool isFragment() const { return getExpression()->isFragment(); }
+ bool isEntryVal() const { return getExpression()->isEntryValue(); }
+ bool isVariadic() const { return IsVariadic; }
+ bool isEquivalent(const DbgValueLoc &Other) const {
+ // Cannot be equivalent with different numbers of entries.
+ if (ValueLocEntries.size() != Other.ValueLocEntries.size())
+ return false;
+ bool ThisIsIndirect =
+ !IsVariadic && ValueLocEntries[0].isIndirectLocation();
+ bool OtherIsIndirect =
+ !Other.IsVariadic && Other.ValueLocEntries[0].isIndirectLocation();
+ // Check equivalence of DIExpressions + Directness together.
+ if (!DIExpression::isEqualExpression(Expression, ThisIsIndirect,
+ Other.Expression, OtherIsIndirect))
+ return false;
+ // Indirectness should have been accounted for in the above check, so just
+ // compare register values directly here.
+ if (ThisIsIndirect || OtherIsIndirect) {
+ DbgValueLocEntry ThisOp = ValueLocEntries[0];
+ DbgValueLocEntry OtherOp = Other.ValueLocEntries[0];
+ return ThisOp.isLocation() && OtherOp.isLocation() &&
+ ThisOp.getLoc().getReg() == OtherOp.getLoc().getReg();
+ }
+ // If neither are indirect, then just compare the loc entries directly.
+ return ValueLocEntries == Other.ValueLocEntries;
+ }
+ const DIExpression *getExpression() const { return Expression; }
+ ArrayRef<DbgValueLocEntry> getLocEntries() const { return ValueLocEntries; }
+ friend bool operator==(const DbgValueLoc &, const DbgValueLoc &);
+ friend bool operator<(const DbgValueLoc &, const DbgValueLoc &);
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const {
+ for (const DbgValueLocEntry &DV : ValueLocEntries)
+ DV.dump();
+ if (Expression)
+ Expression->dump();
+ }
+#endif
+};
+
+/// This struct describes location entries emitted in the .debug_loc
+/// section.
+class DebugLocEntry {
+ /// Begin and end symbols for the address range that this location is valid.
+ const MCSymbol *Begin;
+ const MCSymbol *End;
+
+ /// A nonempty list of locations/constants belonging to this entry,
+ /// sorted by offset.
+ SmallVector<DbgValueLoc, 1> Values;
+
+public:
+ /// Create a location list entry for the range [\p Begin, \p End).
+ ///
+ /// \param Vals One or more values describing (parts of) the variable.
+ DebugLocEntry(const MCSymbol *Begin, const MCSymbol *End,
+ ArrayRef<DbgValueLoc> Vals)
+ : Begin(Begin), End(End) {
+ addValues(Vals);
+ }
+
+ /// Attempt to merge this DebugLocEntry with Next and return
+ /// true if the merge was successful. Entries can be merged if they
+ /// share the same Loc/Constant and if Next immediately follows this
+ /// Entry.
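+ /// E.g. the ranges [A, B) and [B, C) merge into [A, C) when their value
+ /// lists are equivalent.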
+ bool MergeRanges(const DebugLocEntry &Next) {
+ // If this and Next are describing the same variable, merge them.
+ if (End != Next.Begin)
+ return false;
+ if (Values.size() != Next.Values.size())
+ return false;
+ for (unsigned EntryIdx = 0; EntryIdx < Values.size(); ++EntryIdx)
+ if (!Values[EntryIdx].isEquivalent(Next.Values[EntryIdx]))
+ return false;
+ End = Next.End;
+ return true;
+ }
+
+ const MCSymbol *getBeginSym() const { return Begin; }
+ const MCSymbol *getEndSym() const { return End; }
+ ArrayRef<DbgValueLoc> getValues() const { return Values; }
+ void addValues(ArrayRef<DbgValueLoc> Vals) {
+ Values.append(Vals.begin(), Vals.end());
+ sortUniqueValues();
+ assert((Values.size() == 1 || all_of(Values, [](DbgValueLoc V) {
+ return V.isFragment();
+ })) && "must either have a single value or multiple pieces");
+ }
+
+ // Sort the pieces by offset.
+ // Remove any duplicate entries by dropping all but the first.
+ void sortUniqueValues() {
+ // Values is either one item that does not have a fragment, or many items
+ // that all do. There is no need to sort in the former case, and skipping the
+ // sort also prevents operator< being called on a non-fragment item when
+ // _GLIBCXX_DEBUG is defined.
+ if (Values.size() == 1)
+ return;
+ llvm::sort(Values);
+ Values.erase(std::unique(Values.begin(), Values.end(),
+ [](const DbgValueLoc &A, const DbgValueLoc &B) {
+ return A.getExpression() == B.getExpression();
+ }),
+ Values.end());
+ }
+
+ /// Lower this entry into a DWARF expression.
+ void finalize(const AsmPrinter &AP,
+ DebugLocStream::ListBuilder &List,
+ const DIBasicType *BT,
+ DwarfCompileUnit &TheCU);
+};
+
+/// Compare two DbgValueLocEntries for equality.
+inline bool operator==(const DbgValueLocEntry &A, const DbgValueLocEntry &B) {
+ if (A.EntryKind != B.EntryKind)
+ return false;
+
+ switch (A.EntryKind) {
+ case DbgValueLocEntry::E_Location:
+ return A.Loc == B.Loc;
+ case DbgValueLocEntry::E_TargetIndexLocation:
+ return A.TIL == B.TIL;
+ case DbgValueLocEntry::E_Integer:
+ return A.Constant.Int == B.Constant.Int;
+ case DbgValueLocEntry::E_ConstantFP:
+ return A.Constant.CFP == B.Constant.CFP;
+ case DbgValueLocEntry::E_ConstantInt:
+ return A.Constant.CIP == B.Constant.CIP;
+ }
+ llvm_unreachable("unhandled EntryKind");
+}
+
+/// Compare two DbgValueLocs for equality.
+inline bool operator==(const DbgValueLoc &A, const DbgValueLoc &B) {
+ return A.ValueLocEntries == B.ValueLocEntries &&
+ A.Expression == B.Expression && A.IsVariadic == B.IsVariadic;
+}
+
+/// Compare two fragments based on their offset.
+inline bool operator<(const DbgValueLoc &A,
+ const DbgValueLoc &B) {
+ return A.getExpression()->getFragmentInfo()->OffsetInBits <
+ B.getExpression()->getFragmentInfo()->OffsetInBits;
+}
+
+}
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp
new file mode 100644
index 000000000000..8c6109880afc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp
@@ -0,0 +1,47 @@
+//===- DebugLocStream.cpp - DWARF debug_loc stream --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "DebugLocStream.h"
+#include "DwarfDebug.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+
+using namespace llvm;
+
+bool DebugLocStream::finalizeList(AsmPrinter &Asm) {
+ if (Lists.back().EntryOffset == Entries.size()) {
+ // Empty list. Delete it.
+ Lists.pop_back();
+ return false;
+ }
+
+ // Real list. Generate a label for it.
+ Lists.back().Label = Asm.createTempSymbol("debug_loc");
+ return true;
+}
+
+void DebugLocStream::finalizeEntry() {
+ if (Entries.back().ByteOffset != DWARFBytes.size())
+ return;
+
+ // The last entry was empty. Delete it.
+ Comments.erase(Comments.begin() + Entries.back().CommentOffset,
+ Comments.end());
+ Entries.pop_back();
+
+ assert(Lists.back().EntryOffset <= Entries.size() &&
+ "Popped off more entries than are in the list");
+}
+
+DebugLocStream::ListBuilder::~ListBuilder() {
+ if (!Locs.finalizeList(Asm))
+ return;
+ V.initializeDbgValue(&MI);
+ V.setDebugLocListIndex(ListIndex);
+ if (TagOffset)
+ V.setDebugLocListTagOffset(*TagOffset);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
new file mode 100644
index 000000000000..a96bdd034918
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
@@ -0,0 +1,200 @@
+//===--- lib/CodeGen/DebugLocStream.h - DWARF debug_loc stream --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCSTREAM_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCSTREAM_H
+
+#include "ByteStreamer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class DbgVariable;
+class DwarfCompileUnit;
+class MachineInstr;
+class MCSymbol;
+
+/// Byte stream of .debug_loc entries.
+///
+/// Stores a unified stream of .debug_loc entries. There's \a List for each
+/// variable/inlined-at pair, and an \a Entry for each \a DebugLocEntry.
+///
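+/// Lists and entries are appended through the RAII \a ListBuilder and
+/// \a EntryBuilder helpers declared below.
+///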
+/// FIXME: Do we need all these temp symbols?
+/// FIXME: Why not output directly to the output stream?
+class DebugLocStream {
+public:
+ struct List {
+ DwarfCompileUnit *CU;
+ MCSymbol *Label = nullptr;
+ size_t EntryOffset;
+ List(DwarfCompileUnit *CU, size_t EntryOffset)
+ : CU(CU), EntryOffset(EntryOffset) {}
+ };
+ struct Entry {
+ const MCSymbol *Begin;
+ const MCSymbol *End;
+ size_t ByteOffset;
+ size_t CommentOffset;
+ };
+
+private:
+ SmallVector<List, 4> Lists;
+ SmallVector<Entry, 32> Entries;
+ SmallString<256> DWARFBytes;
+ std::vector<std::string> Comments;
+ MCSymbol *Sym = nullptr;
+
+ /// Only verbose textual output needs comments. This will be set to
+ /// true for that case, and false otherwise.
+ bool GenerateComments;
+
+public:
+ DebugLocStream(bool GenerateComments) : GenerateComments(GenerateComments) { }
+ size_t getNumLists() const { return Lists.size(); }
+ const List &getList(size_t LI) const { return Lists[LI]; }
+ ArrayRef<List> getLists() const { return Lists; }
+ MCSymbol *getSym() const {
+ return Sym;
+ }
+ void setSym(MCSymbol *Sym) {
+ this->Sym = Sym;
+ }
+
+ class ListBuilder;
+ class EntryBuilder;
+
+private:
+ /// Start a new .debug_loc entry list.
+ ///
+ /// Start a new .debug_loc entry list. Return the new list's index so it can
+ /// be retrieved later via \a getList().
+ ///
+ /// Until the next call, \a startEntry() will add entries to this list.
+ size_t startList(DwarfCompileUnit *CU) {
+ size_t LI = Lists.size();
+ Lists.emplace_back(CU, Entries.size());
+ return LI;
+ }
+
+ /// Finalize a .debug_loc entry list.
+ ///
+ /// If there are no entries in this list, delete it outright. Otherwise,
+ /// create a label with \a Asm.
+ ///
+ /// \return false iff the list is deleted.
+ bool finalizeList(AsmPrinter &Asm);
+
+ /// Start a new .debug_loc entry.
+ ///
+ /// Until the next call, bytes added to the stream will be added to this
+ /// entry.
+ void startEntry(const MCSymbol *BeginSym, const MCSymbol *EndSym) {
+ Entries.push_back({BeginSym, EndSym, DWARFBytes.size(), Comments.size()});
+ }
+
+ /// Finalize a .debug_loc entry, deleting if it's empty.
+ void finalizeEntry();
+
+public:
+ BufferByteStreamer getStreamer() {
+ return BufferByteStreamer(DWARFBytes, Comments, GenerateComments);
+ }
+
+ ArrayRef<Entry> getEntries(const List &L) const {
+ size_t LI = getIndex(L);
+ return ArrayRef(Entries).slice(Lists[LI].EntryOffset, getNumEntries(LI));
+ }
+
+ ArrayRef<char> getBytes(const Entry &E) const {
+ size_t EI = getIndex(E);
+ return ArrayRef(DWARFBytes.begin(), DWARFBytes.end())
+ .slice(Entries[EI].ByteOffset, getNumBytes(EI));
+ }
+ ArrayRef<std::string> getComments(const Entry &E) const {
+ size_t EI = getIndex(E);
+ return ArrayRef(Comments).slice(Entries[EI].CommentOffset,
+ getNumComments(EI));
+ }
+
+private:
+ size_t getIndex(const List &L) const {
+ assert(&Lists.front() <= &L && &L <= &Lists.back() &&
+ "Expected valid list");
+ return &L - &Lists.front();
+ }
+ size_t getIndex(const Entry &E) const {
+ assert(&Entries.front() <= &E && &E <= &Entries.back() &&
+ "Expected valid entry");
+ return &E - &Entries.front();
+ }
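+  // Extent helpers: an element's length is the distance to the next element's
+  // recorded offset; the final element extends to the end of its container.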
+ size_t getNumEntries(size_t LI) const {
+ if (LI + 1 == Lists.size())
+ return Entries.size() - Lists[LI].EntryOffset;
+ return Lists[LI + 1].EntryOffset - Lists[LI].EntryOffset;
+ }
+ size_t getNumBytes(size_t EI) const {
+ if (EI + 1 == Entries.size())
+ return DWARFBytes.size() - Entries[EI].ByteOffset;
+ return Entries[EI + 1].ByteOffset - Entries[EI].ByteOffset;
+ }
+ size_t getNumComments(size_t EI) const {
+ if (EI + 1 == Entries.size())
+ return Comments.size() - Entries[EI].CommentOffset;
+ return Entries[EI + 1].CommentOffset - Entries[EI].CommentOffset;
+ }
+};
+
+/// Builder for DebugLocStream lists.
+class DebugLocStream::ListBuilder {
+ DebugLocStream &Locs;
+ AsmPrinter &Asm;
+ DbgVariable &V;
+ const MachineInstr &MI;
+ size_t ListIndex;
+ std::optional<uint8_t> TagOffset;
+
+public:
+ ListBuilder(DebugLocStream &Locs, DwarfCompileUnit &CU, AsmPrinter &Asm,
+ DbgVariable &V, const MachineInstr &MI)
+ : Locs(Locs), Asm(Asm), V(V), MI(MI), ListIndex(Locs.startList(&CU)),
+ TagOffset(std::nullopt) {}
+
+ void setTagOffset(uint8_t TO) {
+ TagOffset = TO;
+ }
+
+ /// Finalize the list.
+ ///
+ /// If the list is empty, delete it. Otherwise, finalize it by creating a
+ /// temp symbol in \a Asm and setting up the \a DbgVariable.
+ ~ListBuilder();
+
+ DebugLocStream &getLocs() { return Locs; }
+};
+
+/// Builder for DebugLocStream entries.
+class DebugLocStream::EntryBuilder {
+ DebugLocStream &Locs;
+
+public:
+ EntryBuilder(ListBuilder &List, const MCSymbol *Begin, const MCSymbol *End)
+ : Locs(List.getLocs()) {
+ Locs.startEntry(Begin, End);
+ }
+
+ /// Finalize the entry, deleting it if it's empty.
+ ~EntryBuilder() { Locs.finalizeEntry(); }
+
+ BufferByteStreamer getStreamer() { return Locs.getStreamer(); }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
new file mode 100644
index 000000000000..10c844ddb14a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -0,0 +1,151 @@
+//===-- CodeGen/AsmPrinter/DwarfException.cpp - Dwarf Exception Impl ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+DwarfCFIException::DwarfCFIException(AsmPrinter *A) : EHStreamer(A) {}
+
+DwarfCFIException::~DwarfCFIException() = default;
+
+void DwarfCFIException::addPersonality(const GlobalValue *Personality) {
+ if (!llvm::is_contained(Personalities, Personality))
+ Personalities.push_back(Personality);
+}
+
+/// endModule - Emit all exception information that should come after the
+/// content.
+void DwarfCFIException::endModule() {
+ // SjLj uses this pass and it doesn't need this info.
+ if (!Asm->MAI->usesCFIForEH())
+ return;
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
+
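+  // The 0x80 bit is DW_EH_PE_indirect; if the personality encoding is not
+  // indirect, there is no reference table to emit.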
+ if ((PerEncoding & 0x80) != dwarf::DW_EH_PE_indirect)
+ return;
+
+ // Emit indirect reference table for all used personality functions
+ for (const GlobalValue *Personality : Personalities) {
+ MCSymbol *Sym = Asm->getSymbol(Personality);
+ TLOF.emitPersonalityValue(*Asm->OutStreamer, Asm->getDataLayout(), Sym);
+ }
+ Personalities.clear();
+}
+
+void DwarfCFIException::beginFunction(const MachineFunction *MF) {
+ shouldEmitPersonality = shouldEmitLSDA = false;
+ const Function &F = MF->getFunction();
+
+ // If any landing pads survive, we need an EH table.
+ bool hasLandingPads = !MF->getLandingPads().empty();
+
+ // See if we need frame move info.
+ bool shouldEmitMoves =
+ Asm->getFunctionCFISectionType(*MF) != AsmPrinter::CFISection::None;
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
+ const GlobalValue *Per = nullptr;
+ if (F.hasPersonalityFn())
+ Per = dyn_cast<GlobalValue>(F.getPersonalityFn()->stripPointerCasts());
+
+ // Emit a personality function even when there are no landing pads
+ forceEmitPersonality =
+ // ...if a personality function is explicitly specified
+ F.hasPersonalityFn() &&
+ // ... and it's not known to be a noop in the absence of invokes
+ !isNoOpWithoutInvoke(classifyEHPersonality(Per)) &&
+ // ... and we're not explicitly asked not to emit it
+ F.needsUnwindTableEntry();
+
+ shouldEmitPersonality =
+ (forceEmitPersonality ||
+ (hasLandingPads && PerEncoding != dwarf::DW_EH_PE_omit)) &&
+ Per;
+
+ unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+ shouldEmitLSDA = shouldEmitPersonality &&
+ LSDAEncoding != dwarf::DW_EH_PE_omit;
+
+ const MCAsmInfo &MAI = *MF->getMMI().getContext().getAsmInfo();
+ if (MAI.getExceptionHandlingType() != ExceptionHandling::None)
+ shouldEmitCFI =
+ MAI.usesCFIForEH() && (shouldEmitPersonality || shouldEmitMoves);
+ else
+ shouldEmitCFI = Asm->usesCFIWithoutEH() && shouldEmitMoves;
+}
+
+void DwarfCFIException::beginBasicBlockSection(const MachineBasicBlock &MBB) {
+ if (!shouldEmitCFI)
+ return;
+
+ if (!hasEmittedCFISections) {
+ AsmPrinter::CFISection CFISecType = Asm->getModuleCFISectionType();
+ // If we don't say anything it implies `.cfi_sections .eh_frame`, so we
+ // chose not to be verbose in that case. And with `ForceDwarfFrameSection`,
+ // we should always emit .debug_frame.
+ if (CFISecType == AsmPrinter::CFISection::Debug ||
+ Asm->TM.Options.ForceDwarfFrameSection)
+ Asm->OutStreamer->emitCFISections(
+ CFISecType == AsmPrinter::CFISection::EH, true);
+ hasEmittedCFISections = true;
+ }
+
+ Asm->OutStreamer->emitCFIStartProc(/*IsSimple=*/false);
+
+ // Indicate personality routine, if any.
+ if (!shouldEmitPersonality)
+ return;
+
+ auto &F = MBB.getParent()->getFunction();
+ auto *P = dyn_cast<GlobalValue>(F.getPersonalityFn()->stripPointerCasts());
+ assert(P && "Expected personality function");
+ // Record the personality function.
+ addPersonality(P);
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
+ const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(P, Asm->TM, MMI);
+ Asm->OutStreamer->emitCFIPersonality(Sym, PerEncoding);
+
+ // Provide LSDA information.
+ if (shouldEmitLSDA)
+ Asm->OutStreamer->emitCFILsda(Asm->getMBBExceptionSym(MBB),
+ TLOF.getLSDAEncoding());
+}
+
+void DwarfCFIException::endBasicBlockSection(const MachineBasicBlock &MBB) {
+ if (shouldEmitCFI)
+ Asm->OutStreamer->emitCFIEndProc();
+}
+
+/// endFunction - Gather and emit post-function exception information.
+///
+void DwarfCFIException::endFunction(const MachineFunction *MF) {
+ if (!shouldEmitPersonality)
+ return;
+
+ emitExceptionTable();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
new file mode 100644
index 000000000000..58ed21379d29
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -0,0 +1,1708 @@
+//===- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Units ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing a dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfCompileUnit.h"
+#include "AddressPool.h"
+#include "DwarfExpression.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <iterator>
+#include <optional>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+static dwarf::Tag GetCompileUnitType(UnitKind Kind, DwarfDebug *DW) {
+
+ // According to DWARF Debugging Information Format Version 5,
+ // 3.1.2 Skeleton Compilation Unit Entries:
+ // "When generating a split DWARF object file (see Section 7.3.2
+ // on page 187), the compilation unit in the .debug_info section
+ // is a "skeleton" compilation unit with the tag DW_TAG_skeleton_unit"
+ if (DW->getDwarfVersion() >= 5 && Kind == UnitKind::Skeleton)
+ return dwarf::DW_TAG_skeleton_unit;
+
+ return dwarf::DW_TAG_compile_unit;
+}
+
+DwarfCompileUnit::DwarfCompileUnit(unsigned UID, const DICompileUnit *Node,
+ AsmPrinter *A, DwarfDebug *DW,
+ DwarfFile *DWU, UnitKind Kind)
+ : DwarfUnit(GetCompileUnitType(Kind, DW), Node, A, DW, DWU), UniqueID(UID) {
+ insertDIE(Node, &getUnitDie());
+ MacroLabelBegin = Asm->createTempSymbol("cu_macro_begin");
+}
+
+/// addLabelAddress - Add a DWARF label attribute's data and value, using
+/// DW_FORM_addr or DW_FORM_GNU_addr_index.
+void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label) {
+ if ((Skeleton || !DD->useSplitDwarf()) && Label)
+ DD->addArangeLabel(SymbolCU(this, Label));
+
+ // Don't use the address pool in non-fission or in the skeleton unit itself.
+ if ((!DD->useSplitDwarf() || !Skeleton) && DD->getDwarfVersion() < 5)
+ return addLocalLabelAddress(Die, Attribute, Label);
+
+ bool UseAddrOffsetFormOrExpressions =
+ DD->useAddrOffsetForm() || DD->useAddrOffsetExpressions();
+
+ const MCSymbol *Base = nullptr;
+ if (Label->isInSection() && UseAddrOffsetFormOrExpressions)
+ Base = DD->getSectionLabel(&Label->getSection());
+
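+  // Without a distinct base symbol, refer to the label through the address
+  // pool (DW_FORM_addrx for DWARFv5, DW_FORM_GNU_addr_index before that).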
+ if (!Base || Base == Label) {
+ unsigned idx = DD->getAddressPool().getIndex(Label);
+ addAttribute(Die, Attribute,
+ DD->getDwarfVersion() >= 5 ? dwarf::DW_FORM_addrx
+ : dwarf::DW_FORM_GNU_addr_index,
+ DIEInteger(idx));
+ return;
+ }
+
+ // Could be extended to work with DWARFv4 Split DWARF if that's important for
+  // someone. In that case DW_FORM_data<N> would be used.
+ assert(DD->getDwarfVersion() >= 5 &&
+ "Addr+offset expressions are only valuable when using debug_addr (to "
+ "reduce relocations) available in DWARFv5 or higher");
+ if (DD->useAddrOffsetExpressions()) {
+ auto *Loc = new (DIEValueAllocator) DIEBlock();
+ addPoolOpAddress(*Loc, Label);
+ addBlock(Die, Attribute, dwarf::DW_FORM_exprloc, Loc);
+ } else
+ addAttribute(Die, Attribute, dwarf::DW_FORM_LLVM_addrx_offset,
+ new (DIEValueAllocator) DIEAddrOffset(
+ DD->getAddressPool().getIndex(Base), Label, Base));
+}
+
+void DwarfCompileUnit::addLocalLabelAddress(DIE &Die,
+ dwarf::Attribute Attribute,
+ const MCSymbol *Label) {
+ if (Label)
+ addAttribute(Die, Attribute, dwarf::DW_FORM_addr, DIELabel(Label));
+ else
+ addAttribute(Die, Attribute, dwarf::DW_FORM_addr, DIEInteger(0));
+}
+
+unsigned DwarfCompileUnit::getOrCreateSourceID(const DIFile *File) {
+ // If we print assembly, we can't separate .file entries according to
+ // compile units. Thus all files will belong to the default compile unit.
+
+ // FIXME: add a better feature test than hasRawTextSupport. Even better,
+ // extend .file to support this.
+ unsigned CUID = Asm->OutStreamer->hasRawTextSupport() ? 0 : getUniqueID();
+ if (!File)
+ return Asm->OutStreamer->emitDwarfFileDirective(0, "", "", std::nullopt,
+ std::nullopt, CUID);
+
+ if (LastFile != File) {
+ LastFile = File;
+ LastFileID = Asm->OutStreamer->emitDwarfFileDirective(
+ 0, File->getDirectory(), File->getFilename(), DD->getMD5AsBytes(File),
+ File->getSource(), CUID);
+ }
+ return LastFileID;
+}
+
+DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
+ const DIGlobalVariable *GV, ArrayRef<GlobalExpr> GlobalExprs) {
+ // Check for pre-existence.
+ if (DIE *Die = getDIE(GV))
+ return Die;
+
+ assert(GV);
+
+ auto *GVContext = GV->getScope();
+ const DIType *GTy = GV->getType();
+
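+  // A variable in a Fortran common block gets a DW_TAG_common_block DIE as
+  // its context.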
+ auto *CB = GVContext ? dyn_cast<DICommonBlock>(GVContext) : nullptr;
+ DIE *ContextDIE = CB ? getOrCreateCommonBlock(CB, GlobalExprs)
+ : getOrCreateContextDIE(GVContext);
+
+ // Add to map.
+ DIE *VariableDIE = &createAndAddDIE(GV->getTag(), *ContextDIE, GV);
+ DIScope *DeclContext;
+ if (auto *SDMDecl = GV->getStaticDataMemberDeclaration()) {
+ DeclContext = SDMDecl->getScope();
+ assert(SDMDecl->isStaticMember() && "Expected static member decl");
+ assert(GV->isDefinition());
+ // We need the declaration DIE that is in the static member's class.
+ DIE *VariableSpecDIE = getOrCreateStaticMemberDIE(SDMDecl);
+ addDIEEntry(*VariableDIE, dwarf::DW_AT_specification, *VariableSpecDIE);
+ // If the global variable's type is different from the one in the class
+ // member type, assume that it's more specific and also emit it.
+ if (GTy != SDMDecl->getBaseType())
+ addType(*VariableDIE, GTy);
+ } else {
+ DeclContext = GV->getScope();
+ // Add name and type.
+ StringRef DisplayName = GV->getDisplayName();
+ if (!DisplayName.empty())
+ addString(*VariableDIE, dwarf::DW_AT_name, GV->getDisplayName());
+ if (GTy)
+ addType(*VariableDIE, GTy);
+
+ // Add scoping info.
+ if (!GV->isLocalToUnit())
+ addFlag(*VariableDIE, dwarf::DW_AT_external);
+
+ // Add line number info.
+ addSourceLine(*VariableDIE, GV);
+ }
+
+ if (!GV->isDefinition())
+ addFlag(*VariableDIE, dwarf::DW_AT_declaration);
+ else
+ addGlobalName(GV->getName(), *VariableDIE, DeclContext);
+
+ addAnnotation(*VariableDIE, GV->getAnnotations());
+
+ if (uint32_t AlignInBytes = GV->getAlignInBytes())
+ addUInt(*VariableDIE, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
+ AlignInBytes);
+
+ if (MDTuple *TP = GV->getTemplateParams())
+ addTemplateParams(*VariableDIE, DINodeArray(TP));
+
+ // Add location.
+ addLocationAttribute(VariableDIE, GV, GlobalExprs);
+
+ return VariableDIE;
+}
+
+void DwarfCompileUnit::addLocationAttribute(
+ DIE *VariableDIE, const DIGlobalVariable *GV, ArrayRef<GlobalExpr> GlobalExprs) {
+ bool addToAccelTable = false;
+ DIELoc *Loc = nullptr;
+ std::optional<unsigned> NVPTXAddressSpace;
+ std::unique_ptr<DIEDwarfExpression> DwarfExpr;
+ for (const auto &GE : GlobalExprs) {
+ const GlobalVariable *Global = GE.Var;
+ const DIExpression *Expr = GE.Expr;
+
+ // For compatibility with DWARF 3 and earlier,
+ // DW_AT_location(DW_OP_constu, X, DW_OP_stack_value) or
+ // DW_AT_location(DW_OP_consts, X, DW_OP_stack_value) becomes
+ // DW_AT_const_value(X).
+ if (GlobalExprs.size() == 1 && Expr && Expr->isConstant()) {
+ addToAccelTable = true;
+ addConstantValue(
+ *VariableDIE,
+ DIExpression::SignedOrUnsignedConstant::UnsignedConstant ==
+ *Expr->isConstant(),
+ Expr->getElement(1));
+ break;
+ }
+
+ // We cannot describe the location of dllimport'd variables: the
+ // computation of their address requires loads from the IAT.
+ if (Global && Global->hasDLLImportStorageClass())
+ continue;
+
+ // Nothing to describe without address or constant.
+ if (!Global && (!Expr || !Expr->isConstant()))
+ continue;
+
+ if (Global && Global->isThreadLocal() &&
+ !Asm->getObjFileLowering().supportDebugThreadLocalLocation())
+ continue;
+
+ if (!Loc) {
+ addToAccelTable = true;
+ Loc = new (DIEValueAllocator) DIELoc;
+ DwarfExpr = std::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
+ }
+
+ if (Expr) {
+ // According to
+ // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
+ // cuda-gdb requires DW_AT_address_class for all variables to be able to
+ // correctly interpret address space of the variable address.
+ // Decode DW_OP_constu <DWARF Address Space> DW_OP_swap DW_OP_xderef
+ // sequence for the NVPTX + gdb target.
+ unsigned LocalNVPTXAddressSpace;
+ if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) {
+ const DIExpression *NewExpr =
+ DIExpression::extractAddressClass(Expr, LocalNVPTXAddressSpace);
+ if (NewExpr != Expr) {
+ Expr = NewExpr;
+ NVPTXAddressSpace = LocalNVPTXAddressSpace;
+ }
+ }
+ DwarfExpr->addFragmentOffset(Expr);
+ }
+
+ if (Global) {
+ const MCSymbol *Sym = Asm->getSymbol(Global);
+ // 16-bit platforms like MSP430 and AVR take this path, so sink this
+ // assert to platforms that use it.
+ auto GetPointerSizedFormAndOp = [this]() {
+ unsigned PointerSize = Asm->MAI->getCodePointerSize();
+ assert((PointerSize == 4 || PointerSize == 8) &&
+ "Add support for other sizes if necessary");
+ struct FormAndOp {
+ dwarf::Form Form;
+ dwarf::LocationAtom Op;
+ };
+ return PointerSize == 4
+ ? FormAndOp{dwarf::DW_FORM_data4, dwarf::DW_OP_const4u}
+ : FormAndOp{dwarf::DW_FORM_data8, dwarf::DW_OP_const8u};
+ };
+ if (Global->isThreadLocal()) {
+ if (Asm->TM.getTargetTriple().isWasm()) {
+ // FIXME This is not guaranteed, but in practice, in static linking,
+ // if present, __tls_base's index is 1. This doesn't hold for dynamic
+ // linking, so TLS variables used in dynamic linking won't have
+ // correct debug info for now. See
+ // https://github.com/llvm/llvm-project/blob/19afbfe33156d211fa959dadeea46cd17b9c723c/lld/wasm/Driver.cpp#L786-L823
+ addWasmRelocBaseGlobal(Loc, "__tls_base", 1);
+ addOpAddress(*Loc, Sym);
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ } else if (Asm->TM.useEmulatedTLS()) {
+ // TODO: add debug info for emulated thread local mode.
+ } else {
+ // FIXME: Make this work with -gsplit-dwarf.
+ // Based on GCC's support for TLS:
+ if (!DD->useSplitDwarf()) {
+ auto FormAndOp = GetPointerSizedFormAndOp();
+            // 1) Start with a const4u/const8u of the appropriate pointer size
+ addUInt(*Loc, dwarf::DW_FORM_data1, FormAndOp.Op);
+ // 2) containing the (relocated) offset of the TLS variable
+ // within the module's TLS block.
+ addExpr(*Loc, FormAndOp.Form,
+ Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
+ } else {
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+ addUInt(*Loc, dwarf::DW_FORM_udata,
+ DD->getAddressPool().getIndex(Sym, /* TLS */ true));
+ }
+ // 3) followed by an OP to make the debugger do a TLS lookup.
+ addUInt(*Loc, dwarf::DW_FORM_data1,
+ DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
+ : dwarf::DW_OP_form_tls_address);
+ }
+ } else if (Asm->TM.getTargetTriple().isWasm() &&
+ Asm->TM.getRelocationModel() == Reloc::PIC_) {
+ // FIXME This is not guaranteed, but in practice, if present,
+ // __memory_base's index is 1. See
+ // https://github.com/llvm/llvm-project/blob/19afbfe33156d211fa959dadeea46cd17b9c723c/lld/wasm/Driver.cpp#L786-L823
+ addWasmRelocBaseGlobal(Loc, "__memory_base", 1);
+ addOpAddress(*Loc, Sym);
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ } else if ((Asm->TM.getRelocationModel() == Reloc::RWPI ||
+ Asm->TM.getRelocationModel() == Reloc::ROPI_RWPI) &&
+ !Asm->getObjFileLowering()
+ .getKindForGlobal(Global, Asm->TM)
+ .isReadOnly()) {
+ auto FormAndOp = GetPointerSizedFormAndOp();
+ // Constant
+ addUInt(*Loc, dwarf::DW_FORM_data1, FormAndOp.Op);
+ // Relocation offset
+ addExpr(*Loc, FormAndOp.Form,
+ Asm->getObjFileLowering().getIndirectSymViaRWPI(Sym));
+ // Base register
+ Register BaseReg = Asm->getObjFileLowering().getStaticBase();
+ BaseReg = Asm->TM.getMCRegisterInfo()->getDwarfRegNum(BaseReg, false);
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + BaseReg);
+ // Offset from base register
+ addSInt(*Loc, dwarf::DW_FORM_sdata, 0);
+ // Operation
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ } else {
+ DD->addArangeLabel(SymbolCU(this, Sym));
+ addOpAddress(*Loc, Sym);
+ }
+ }
+ // Global variables attached to symbols are memory locations.
+ // It would be better if this were unconditional, but malformed input that
+ // mixes non-fragments and fragments for the same variable is too expensive
+ // to detect in the verifier.
+ if (DwarfExpr->isUnknownLocation())
+ DwarfExpr->setMemoryLocationKind();
+ DwarfExpr->addExpression(Expr);
+ }
+ if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) {
+ // According to
+ // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
+ // cuda-gdb requires DW_AT_address_class for all variables to be able to
+ // correctly interpret address space of the variable address.
+ const unsigned NVPTX_ADDR_global_space = 5;
+ addUInt(*VariableDIE, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1,
+ NVPTXAddressSpace.value_or(NVPTX_ADDR_global_space));
+ }
+ if (Loc)
+ addBlock(*VariableDIE, dwarf::DW_AT_location, DwarfExpr->finalize());
+
+ if (DD->useAllLinkageNames())
+ addLinkageName(*VariableDIE, GV->getLinkageName());
+
+ if (addToAccelTable) {
+ DD->addAccelName(*CUNode, GV->getName(), *VariableDIE);
+
+ // If the linkage name is different than the name, go ahead and output
+ // that as well into the name table.
+ if (GV->getLinkageName() != "" && GV->getName() != GV->getLinkageName() &&
+ DD->useAllLinkageNames())
+ DD->addAccelName(*CUNode, GV->getLinkageName(), *VariableDIE);
+ }
+}
+
+DIE *DwarfCompileUnit::getOrCreateCommonBlock(
+ const DICommonBlock *CB, ArrayRef<GlobalExpr> GlobalExprs) {
+ // Check for pre-existence.
+ if (DIE *NDie = getDIE(CB))
+ return NDie;
+ DIE *ContextDIE = getOrCreateContextDIE(CB->getScope());
+ DIE &NDie = createAndAddDIE(dwarf::DW_TAG_common_block, *ContextDIE, CB);
+ StringRef Name = CB->getName().empty() ? "_BLNK_" : CB->getName();
+ addString(NDie, dwarf::DW_AT_name, Name);
+ addGlobalName(Name, NDie, CB->getScope());
+ if (CB->getFile())
+ addSourceLine(NDie, CB->getLineNo(), CB->getFile());
+ if (DIGlobalVariable *V = CB->getDecl())
+ getCU().addLocationAttribute(&NDie, V, GlobalExprs);
+ return &NDie;
+}
+
+void DwarfCompileUnit::addRange(RangeSpan Range) {
+ DD->insertSectionLabel(Range.Begin);
+
+ auto *PrevCU = DD->getPrevCU();
+ bool SameAsPrevCU = this == PrevCU;
+ DD->setPrevCU(this);
+  // If we have no current ranges, just add the range and return. Otherwise,
+  // check the current section and CU against the previous section and CU we
+  // emitted into and that the subprogram was contained within. If these are
+  // the same, extend our current range; otherwise add this as a new range.
+ if (CURanges.empty() || !SameAsPrevCU ||
+ (&CURanges.back().End->getSection() !=
+ &Range.End->getSection())) {
+ // Before a new range is added, always terminate the prior line table.
+ if (PrevCU)
+ DD->terminateLineTable(PrevCU);
+ CURanges.push_back(Range);
+ return;
+ }
+
+ CURanges.back().End = Range.End;
+}
+
+void DwarfCompileUnit::initStmtList() {
+ if (CUNode->isDebugDirectivesOnly())
+ return;
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ if (DD->useSectionsAsReferences()) {
+ LineTableStartSym = TLOF.getDwarfLineSection()->getBeginSymbol();
+ } else {
+ LineTableStartSym =
+ Asm->OutStreamer->getDwarfLineTableSymbol(getUniqueID());
+ }
+
+  // DW_AT_stmt_list is an offset into the line number information for this
+  // compile unit in the debug_line section. For split dwarf this is
+ // left in the skeleton CU and so not included.
+ // The line table entries are not always emitted in assembly, so it
+ // is not okay to use line_table_start here.
+ addSectionLabel(getUnitDie(), dwarf::DW_AT_stmt_list, LineTableStartSym,
+ TLOF.getDwarfLineSection()->getBeginSymbol());
+}
+
+void DwarfCompileUnit::applyStmtList(DIE &D) {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ addSectionLabel(D, dwarf::DW_AT_stmt_list, LineTableStartSym,
+ TLOF.getDwarfLineSection()->getBeginSymbol());
+}
+
+void DwarfCompileUnit::attachLowHighPC(DIE &D, const MCSymbol *Begin,
+ const MCSymbol *End) {
+ assert(Begin && "Begin label should not be null!");
+ assert(End && "End label should not be null!");
+ assert(Begin->isDefined() && "Invalid starting label");
+ assert(End->isDefined() && "Invalid end label");
+
+ addLabelAddress(D, dwarf::DW_AT_low_pc, Begin);
+ if (DD->getDwarfVersion() < 4)
+ addLabelAddress(D, dwarf::DW_AT_high_pc, End);
+ else
+ addLabelDelta(D, dwarf::DW_AT_high_pc, End, Begin);
+}
+
+// Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
+// and DW_AT_high_pc attributes. If there are global variables in this
+// scope then create and insert DIEs for these variables.
+DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
+ DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes());
+ auto *ContextCU = static_cast<DwarfCompileUnit *>(SPDie->getUnit());
+ return ContextCU->updateSubprogramScopeDIEImpl(SP, SPDie);
+}
+
+// Add info for Wasm-global-based relocation.
+// 'GlobalIndex' is used for split dwarf, which currently relies on a few
+// assumptions that are not guaranteed in a formal way but work in practice.
+void DwarfCompileUnit::addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
+ uint64_t GlobalIndex) {
+ // FIXME: duplicated from Target/WebAssembly/WebAssembly.h
+ // don't want to depend on target specific headers in this code?
+ const unsigned TI_GLOBAL_RELOC = 3;
+ unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+ auto *Sym = cast<MCSymbolWasm>(Asm->GetExternalSymbolSymbol(GlobalName));
+ // FIXME: this repeats what WebAssemblyMCInstLower::
+ // GetExternalSymbolSymbol does, since if there's no code that
+ // refers to this symbol, we have to set it here.
+ Sym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+ Sym->setGlobalType(wasm::WasmGlobalType{
+ static_cast<uint8_t>(PointerSize == 4 ? wasm::WASM_TYPE_I32
+ : wasm::WASM_TYPE_I64),
+ true});
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_WASM_location);
+ addSInt(*Loc, dwarf::DW_FORM_sdata, TI_GLOBAL_RELOC);
+ if (!isDwoUnit()) {
+ addLabel(*Loc, dwarf::DW_FORM_data4, Sym);
+ } else {
+ // FIXME: when writing dwo, we need to avoid relocations. Probably
+ // the "right" solution is to treat globals the way func and data
+ // symbols are (with entries in .debug_addr).
+ // For now we hardcode the indices in the callsites. Global indices are not
+ // fixed, but in practice a few are fixed; for example, __stack_pointer is
+ // always index 0.
+ addUInt(*Loc, dwarf::DW_FORM_data4, GlobalIndex);
+ }
+}
+
+DIE &DwarfCompileUnit::updateSubprogramScopeDIEImpl(const DISubprogram *SP,
+ DIE *SPDie) {
+ SmallVector<RangeSpan, 2> BB_List;
+  // If basic block sections are on, ranges for each basic block section have
+  // to be emitted separately.
+ for (const auto &R : Asm->MBBSectionRanges)
+ BB_List.push_back({R.second.BeginLabel, R.second.EndLabel});
+
+ attachRangesOrLowHighPC(*SPDie, BB_List);
+
+ if (DD->useAppleExtensionAttributes() &&
+ !DD->getCurrentFunction()->getTarget().Options.DisableFramePointerElim(
+ *DD->getCurrentFunction()))
+ addFlag(*SPDie, dwarf::DW_AT_APPLE_omit_frame_ptr);
+
+ // Only include DW_AT_frame_base in full debug info
+ if (!includeMinimalInlineScopes()) {
+ const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
+ TargetFrameLowering::DwarfFrameBase FrameBase =
+ TFI->getDwarfFrameBase(*Asm->MF);
+ switch (FrameBase.Kind) {
+ case TargetFrameLowering::DwarfFrameBase::Register: {
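+      // Only a physical register can be described as a DWARF machine
+      // location; otherwise DW_AT_frame_base is simply omitted.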
+ if (Register::isPhysicalRegister(FrameBase.Location.Reg)) {
+ MachineLocation Location(FrameBase.Location.Reg);
+ addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+ }
+ break;
+ }
+ case TargetFrameLowering::DwarfFrameBase::CFA: {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_call_frame_cfa);
+ if (FrameBase.Location.Offset != 0) {
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_consts);
+ addSInt(*Loc, dwarf::DW_FORM_sdata, FrameBase.Location.Offset);
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ }
+ addBlock(*SPDie, dwarf::DW_AT_frame_base, Loc);
+ break;
+ }
+ case TargetFrameLowering::DwarfFrameBase::WasmFrameBase: {
+ // FIXME: duplicated from Target/WebAssembly/WebAssembly.h
+ const unsigned TI_GLOBAL_RELOC = 3;
+ if (FrameBase.Location.WasmLoc.Kind == TI_GLOBAL_RELOC) {
+ // These need to be relocatable.
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ assert(FrameBase.Location.WasmLoc.Index == 0); // Only SP so far.
+ // For now, since we only ever use index 0, this should work as-is.
+ addWasmRelocBaseGlobal(Loc, "__stack_pointer",
+ FrameBase.Location.WasmLoc.Index);
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+ addBlock(*SPDie, dwarf::DW_AT_frame_base, Loc);
+ } else {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ DIExpressionCursor Cursor({});
+ DwarfExpr.addWasmLocation(FrameBase.Location.WasmLoc.Kind,
+ FrameBase.Location.WasmLoc.Index);
+ DwarfExpr.addExpression(std::move(Cursor));
+ addBlock(*SPDie, dwarf::DW_AT_frame_base, DwarfExpr.finalize());
+ }
+ break;
+ }
+ }
+ }
+
+  // Add the name to the name table; we do this here because we're guaranteed
+  // to have concrete versions of our DW_TAG_subprogram nodes.
+ DD->addSubprogramNames(*CUNode, SP, *SPDie);
+
+ return *SPDie;
+}
+
+// Construct a DIE for this scope.
+void DwarfCompileUnit::constructScopeDIE(LexicalScope *Scope,
+ DIE &ParentScopeDIE) {
+ if (!Scope || !Scope->getScopeNode())
+ return;
+
+ auto *DS = Scope->getScopeNode();
+
+ assert((Scope->getInlinedAt() || !isa<DISubprogram>(DS)) &&
+ "Only handle inlined subprograms here, use "
+ "constructSubprogramScopeDIE for non-inlined "
+ "subprograms");
+
+ // Emit inlined subprograms.
+ if (Scope->getParent() && isa<DISubprogram>(DS)) {
+ DIE *ScopeDIE = constructInlinedScopeDIE(Scope, ParentScopeDIE);
+ assert(ScopeDIE && "Scope DIE should not be null.");
+ createAndAddScopeChildren(Scope, *ScopeDIE);
+ return;
+ }
+
+ // Early exit when we know the scope DIE is going to be null.
+ if (DD->isLexicalScopeDIENull(Scope))
+ return;
+
+ // Emit lexical blocks.
+ DIE *ScopeDIE = constructLexicalScopeDIE(Scope);
+ assert(ScopeDIE && "Scope DIE should not be null.");
+
+ ParentScopeDIE.addChild(ScopeDIE);
+ createAndAddScopeChildren(Scope, *ScopeDIE);
+}
+
+void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
+ SmallVector<RangeSpan, 2> Range) {
+
+ HasRangeLists = true;
+
+ // Add the range list to the set of ranges to be emitted.
+ auto IndexAndList =
+ (DD->getDwarfVersion() < 5 && Skeleton ? Skeleton->DU : DU)
+ ->addRange(*(Skeleton ? Skeleton : this), std::move(Range));
+
+ uint32_t Index = IndexAndList.first;
+ auto &List = *IndexAndList.second;
+
+ // Under fission, ranges are specified by constant offsets relative to the
+ // CU's DW_AT_GNU_ranges_base.
+ // FIXME: For DWARF v5, do not generate the DW_AT_ranges attribute under
+ // fission until we support the forms using the .debug_addr section
+ // (DW_RLE_startx_endx etc.).
+ if (DD->getDwarfVersion() >= 5)
+ addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_rnglistx, Index);
+ else {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ const MCSymbol *RangeSectionSym =
+ TLOF.getDwarfRangesSection()->getBeginSymbol();
+ if (isDwoUnit())
+ addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.Label,
+ RangeSectionSym);
+ else
+ addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.Label,
+ RangeSectionSym);
+ }
+}
+
+void DwarfCompileUnit::attachRangesOrLowHighPC(
+ DIE &Die, SmallVector<RangeSpan, 2> Ranges) {
+ assert(!Ranges.empty());
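+  // Prefer a single low_pc/high_pc pair when the ranges section is not used,
+  // or when there is one contiguous range that either is not forced into a
+  // range list or starts at its section's begin label.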
+ if (!DD->useRangesSection() ||
+ (Ranges.size() == 1 &&
+ (!DD->alwaysUseRanges(*this) ||
+ DD->getSectionLabel(&Ranges.front().Begin->getSection()) ==
+ Ranges.front().Begin))) {
+ const RangeSpan &Front = Ranges.front();
+ const RangeSpan &Back = Ranges.back();
+ attachLowHighPC(Die, Front.Begin, Back.End);
+ } else
+ addScopeRangeList(Die, std::move(Ranges));
+}
+
+void DwarfCompileUnit::attachRangesOrLowHighPC(
+ DIE &Die, const SmallVectorImpl<InsnRange> &Ranges) {
+ SmallVector<RangeSpan, 2> List;
+ List.reserve(Ranges.size());
+ for (const InsnRange &R : Ranges) {
+ auto *BeginLabel = DD->getLabelBeforeInsn(R.first);
+ auto *EndLabel = DD->getLabelAfterInsn(R.second);
+
+ const auto *BeginMBB = R.first->getParent();
+ const auto *EndMBB = R.second->getParent();
+
+ const auto *MBB = BeginMBB;
+    // Basic block sections allow basic block subsets to be placed in unique
+ // sections. For each section, the begin and end label must be added to the
+ // list. If there is more than one range, debug ranges must be used.
+ // Otherwise, low/high PC can be used.
+ // FIXME: Debug Info Emission depends on block order and this assumes that
+ // the order of blocks will be frozen beyond this point.
+ do {
+ if (MBB->sameSection(EndMBB) || MBB->isEndSection()) {
+ auto MBBSectionRange = Asm->MBBSectionRanges[MBB->getSectionIDNum()];
+ List.push_back(
+ {MBB->sameSection(BeginMBB) ? BeginLabel
+ : MBBSectionRange.BeginLabel,
+ MBB->sameSection(EndMBB) ? EndLabel : MBBSectionRange.EndLabel});
+ }
+ if (MBB->sameSection(EndMBB))
+ break;
+ MBB = MBB->getNextNode();
+ } while (true);
+ }
+ attachRangesOrLowHighPC(Die, std::move(List));
+}
+
+DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope,
+ DIE &ParentScopeDIE) {
+ assert(Scope->getScopeNode());
+ auto *DS = Scope->getScopeNode();
+ auto *InlinedSP = getDISubprogram(DS);
+ // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
+ // was inlined from another compile unit.
+ DIE *OriginDIE = getAbstractScopeDIEs()[InlinedSP];
+ assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
+
+ auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine);
+ ParentScopeDIE.addChild(ScopeDIE);
+ addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE);
+
+ attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges());
+
+ // Add the call site information to the DIE.
+ const DILocation *IA = Scope->getInlinedAt();
+ addUInt(*ScopeDIE, dwarf::DW_AT_call_file, std::nullopt,
+ getOrCreateSourceID(IA->getFile()));
+ addUInt(*ScopeDIE, dwarf::DW_AT_call_line, std::nullopt, IA->getLine());
+ if (IA->getColumn())
+ addUInt(*ScopeDIE, dwarf::DW_AT_call_column, std::nullopt, IA->getColumn());
+ if (IA->getDiscriminator() && DD->getDwarfVersion() >= 4)
+ addUInt(*ScopeDIE, dwarf::DW_AT_GNU_discriminator, std::nullopt,
+ IA->getDiscriminator());
+
+  // Add the name to the name table; we do this here because we're guaranteed
+  // to have concrete versions of our DW_TAG_inlined_subroutine nodes.
+ DD->addSubprogramNames(*CUNode, InlinedSP, *ScopeDIE);
+
+ return ScopeDIE;
+}
+
+// Construct a new DW_TAG_lexical_block for this scope and attach
+// DW_AT_low_pc/DW_AT_high_pc labels.
+DIE *DwarfCompileUnit::constructLexicalScopeDIE(LexicalScope *Scope) {
+ if (DD->isLexicalScopeDIENull(Scope))
+ return nullptr;
+ const auto *DS = Scope->getScopeNode();
+
+ auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_lexical_block);
+ if (Scope->isAbstractScope()) {
+ assert(!getAbstractScopeDIEs().count(DS) &&
+ "Abstract DIE for this scope exists!");
+ getAbstractScopeDIEs()[DS] = ScopeDIE;
+ return ScopeDIE;
+ }
+ if (!Scope->getInlinedAt()) {
+ assert(!LexicalBlockDIEs.count(DS) &&
+ "Concrete out-of-line DIE for this scope exists!");
+ LexicalBlockDIEs[DS] = ScopeDIE;
+ }
+
+ attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges());
+
+ return ScopeDIE;
+}
+
+/// constructVariableDIE - Construct a DIE for the given DbgVariable.
+DIE *DwarfCompileUnit::constructVariableDIE(DbgVariable &DV, bool Abstract) {
+ auto D = constructVariableDIEImpl(DV, Abstract);
+ DV.setDIE(*D);
+ return D;
+}
+
+DIE *DwarfCompileUnit::constructLabelDIE(DbgLabel &DL,
+ const LexicalScope &Scope) {
+ auto LabelDie = DIE::get(DIEValueAllocator, DL.getTag());
+ insertDIE(DL.getLabel(), LabelDie);
+ DL.setDIE(*LabelDie);
+
+ if (Scope.isAbstractScope())
+ applyLabelAttributes(DL, *LabelDie);
+
+ return LabelDie;
+}
+
+DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
+ bool Abstract) {
+ // Define variable debug information entry.
+ auto VariableDie = DIE::get(DIEValueAllocator, DV.getTag());
+ insertDIE(DV.getVariable(), VariableDie);
+
+ if (Abstract) {
+ applyVariableAttributes(DV, *VariableDie);
+ return VariableDie;
+ }
+
+ // Add variable address.
+
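+  // A debug location list index takes precedence: the variable's location
+  // varies over its lifetime and is described by a list emitted separately.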
+ unsigned Index = DV.getDebugLocListIndex();
+ if (Index != ~0U) {
+ addLocationList(*VariableDie, dwarf::DW_AT_location, Index);
+ auto TagOffset = DV.getDebugLocListTagOffset();
+ if (TagOffset)
+ addUInt(*VariableDie, dwarf::DW_AT_LLVM_tag_offset, dwarf::DW_FORM_data1,
+ *TagOffset);
+ return VariableDie;
+ }
+
+ // Check if variable has a single location description.
+ if (auto *DVal = DV.getValueLoc()) {
+ if (!DVal->isVariadic()) {
+ const DbgValueLocEntry *Entry = DVal->getLocEntries().begin();
+ if (Entry->isLocation()) {
+ addVariableAddress(DV, *VariableDie, Entry->getLoc());
+ } else if (Entry->isInt()) {
+ auto *Expr = DV.getSingleExpression();
+ if (Expr && Expr->getNumElements()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ // If there is an expression, emit raw unsigned bytes.
+ DwarfExpr.addFragmentOffset(Expr);
+ DwarfExpr.addUnsignedConstant(Entry->getInt());
+ DwarfExpr.addExpression(Expr);
+ addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
+ if (DwarfExpr.TagOffset)
+ addUInt(*VariableDie, dwarf::DW_AT_LLVM_tag_offset,
+ dwarf::DW_FORM_data1, *DwarfExpr.TagOffset);
+ } else
+ addConstantValue(*VariableDie, Entry->getInt(), DV.getType());
+ } else if (Entry->isConstantFP()) {
+ addConstantFPValue(*VariableDie, Entry->getConstantFP());
+ } else if (Entry->isConstantInt()) {
+ addConstantValue(*VariableDie, Entry->getConstantInt(), DV.getType());
+ } else if (Entry->isTargetIndexLocation()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ const DIBasicType *BT = dyn_cast<DIBasicType>(
+ static_cast<const Metadata *>(DV.getVariable()->getType()));
+ DwarfDebug::emitDebugLocValue(*Asm, BT, *DVal, DwarfExpr);
+ addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
+ }
+ return VariableDie;
+ }
+ // If any of the location entries are registers with the value 0, then the
+ // location is undefined.
+ if (any_of(DVal->getLocEntries(), [](const DbgValueLocEntry &Entry) {
+ return Entry.isLocation() && !Entry.getLoc().getReg();
+ }))
+ return VariableDie;
+ const DIExpression *Expr = DV.getSingleExpression();
+ assert(Expr && "Variadic Debug Value must have an Expression.");
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ DwarfExpr.addFragmentOffset(Expr);
+ DIExpressionCursor Cursor(Expr);
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+
+ auto AddEntry = [&](const DbgValueLocEntry &Entry,
+ DIExpressionCursor &Cursor) {
+ if (Entry.isLocation()) {
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor,
+ Entry.getLoc().getReg()))
+ return false;
+ } else if (Entry.isInt()) {
+ // If there is an expression, emit raw unsigned bytes.
+ DwarfExpr.addUnsignedConstant(Entry.getInt());
+ } else if (Entry.isConstantFP()) {
+ // DwarfExpression does not support arguments wider than 64 bits
+ // (see PR52584).
+ // TODO: Consider chunking expressions containing overly wide
+ // arguments into separate pointer-sized fragment expressions.
+ APInt RawBytes = Entry.getConstantFP()->getValueAPF().bitcastToAPInt();
+ if (RawBytes.getBitWidth() > 64)
+ return false;
+ DwarfExpr.addUnsignedConstant(RawBytes.getZExtValue());
+ } else if (Entry.isConstantInt()) {
+ APInt RawBytes = Entry.getConstantInt()->getValue();
+ if (RawBytes.getBitWidth() > 64)
+ return false;
+ DwarfExpr.addUnsignedConstant(RawBytes.getZExtValue());
+ } else if (Entry.isTargetIndexLocation()) {
+ TargetIndexLocation Loc = Entry.getTargetIndexLocation();
+      // TODO: TargetIndexLocation is target-independent. Currently only the
+ // WebAssembly-specific encoding is supported.
+ assert(Asm->TM.getTargetTriple().isWasm());
+ DwarfExpr.addWasmLocation(Loc.Index, static_cast<uint64_t>(Loc.Offset));
+ } else {
+ llvm_unreachable("Unsupported Entry type.");
+ }
+ return true;
+ };
+
+ if (!DwarfExpr.addExpression(
+ std::move(Cursor),
+ [&](unsigned Idx, DIExpressionCursor &Cursor) -> bool {
+ return AddEntry(DVal->getLocEntries()[Idx], Cursor);
+ }))
+ return VariableDie;
+
+ // Now attach the location information to the DIE.
+ addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
+ if (DwarfExpr.TagOffset)
+ addUInt(*VariableDie, dwarf::DW_AT_LLVM_tag_offset, dwarf::DW_FORM_data1,
+ *DwarfExpr.TagOffset);
+
+ return VariableDie;
+ }
+
+ // .. else use frame index.
+ if (!DV.hasFrameIndexExprs())
+ return VariableDie;
+
+ std::optional<unsigned> NVPTXAddressSpace;
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ for (const auto &Fragment : DV.getFrameIndexExprs()) {
+ Register FrameReg;
+ const DIExpression *Expr = Fragment.Expr;
+ const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
+ StackOffset Offset =
+ TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg);
+ DwarfExpr.addFragmentOffset(Expr);
+
+ auto *TRI = Asm->MF->getSubtarget().getRegisterInfo();
+ SmallVector<uint64_t, 8> Ops;
+ TRI->getOffsetOpcodes(Offset, Ops);
+
+ // According to
+ // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
+ // cuda-gdb requires DW_AT_address_class for all variables to be able to
+ // correctly interpret address space of the variable address.
+ // Decode DW_OP_constu <DWARF Address Space> DW_OP_swap DW_OP_xderef
+ // sequence for the NVPTX + gdb target.
+ unsigned LocalNVPTXAddressSpace;
+ if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) {
+ const DIExpression *NewExpr =
+ DIExpression::extractAddressClass(Expr, LocalNVPTXAddressSpace);
+ if (NewExpr != Expr) {
+ Expr = NewExpr;
+ NVPTXAddressSpace = LocalNVPTXAddressSpace;
+ }
+ }
+ if (Expr)
+ Ops.append(Expr->elements_begin(), Expr->elements_end());
+ DIExpressionCursor Cursor(Ops);
+ DwarfExpr.setMemoryLocationKind();
+ if (const MCSymbol *FrameSymbol = Asm->getFunctionFrameSymbol())
+ addOpAddress(*Loc, FrameSymbol);
+ else
+ DwarfExpr.addMachineRegExpression(
+ *Asm->MF->getSubtarget().getRegisterInfo(), Cursor, FrameReg);
+ DwarfExpr.addExpression(std::move(Cursor));
+ }
+ if (Asm->TM.getTargetTriple().isNVPTX() && DD->tuneForGDB()) {
+ // According to
+ // https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
+ // cuda-gdb requires DW_AT_address_class for all variables to be able to
+ // correctly interpret address space of the variable address.
+ const unsigned NVPTX_ADDR_local_space = 6;
+ addUInt(*VariableDie, dwarf::DW_AT_address_class, dwarf::DW_FORM_data1,
+ NVPTXAddressSpace.value_or(NVPTX_ADDR_local_space));
+ }
+ addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
+ if (DwarfExpr.TagOffset)
+ addUInt(*VariableDie, dwarf::DW_AT_LLVM_tag_offset, dwarf::DW_FORM_data1,
+ *DwarfExpr.TagOffset);
+
+ return VariableDie;
+}
+
+DIE *DwarfCompileUnit::constructVariableDIE(DbgVariable &DV,
+ const LexicalScope &Scope,
+ DIE *&ObjectPointer) {
+ auto Var = constructVariableDIE(DV, Scope.isAbstractScope());
+ if (DV.isObjectPointer())
+ ObjectPointer = Var;
+ return Var;
+}
+
+/// Return all DIVariables that appear in \p Var's array type: in its
+/// dataLocation, associated and allocated fields, and in the count, bound and
+/// stride expressions of its subranges.
+static SmallVector<const DIVariable *, 2> dependencies(DbgVariable *Var) {
+ SmallVector<const DIVariable *, 2> Result;
+ auto *Array = dyn_cast<DICompositeType>(Var->getType());
+ if (!Array || Array->getTag() != dwarf::DW_TAG_array_type)
+ return Result;
+ if (auto *DLVar = Array->getDataLocation())
+ Result.push_back(DLVar);
+ if (auto *AsVar = Array->getAssociated())
+ Result.push_back(AsVar);
+ if (auto *AlVar = Array->getAllocated())
+ Result.push_back(AlVar);
+ for (auto *El : Array->getElements()) {
+ if (auto *Subrange = dyn_cast<DISubrange>(El)) {
+ if (auto Count = Subrange->getCount())
+ if (auto *Dependency = dyn_cast_if_present<DIVariable *>(Count))
+ Result.push_back(Dependency);
+ if (auto LB = Subrange->getLowerBound())
+ if (auto *Dependency = dyn_cast_if_present<DIVariable *>(LB))
+ Result.push_back(Dependency);
+ if (auto UB = Subrange->getUpperBound())
+ if (auto *Dependency = dyn_cast_if_present<DIVariable *>(UB))
+ Result.push_back(Dependency);
+ if (auto ST = Subrange->getStride())
+ if (auto *Dependency = dyn_cast_if_present<DIVariable *>(ST))
+ Result.push_back(Dependency);
+ } else if (auto *GenericSubrange = dyn_cast<DIGenericSubrange>(El)) {
+ if (auto Count = GenericSubrange->getCount())
+ if (auto *Dependency = dyn_cast_if_present<DIVariable *>(Count))
+ Result.push_back(Dependency);
+ if (auto LB = GenericSubrange->getLowerBound())
+ if (auto *Dependency = dyn_cast_if_present<DIVariable *>(LB))
+ Result.push_back(Dependency);
+ if (auto UB = GenericSubrange->getUpperBound())
+ if (auto *Dependency = dyn_cast_if_present<DIVariable *>(UB))
+ Result.push_back(Dependency);
+ if (auto ST = GenericSubrange->getStride())
+ if (auto *Dependency = dyn_cast_if_present<DIVariable *>(ST))
+ Result.push_back(Dependency);
+ }
+ }
+ return Result;
+}
+
+/// Sort local variables so that variables appearing inside of helper
+/// expressions come first.
+static SmallVector<DbgVariable *, 8>
+sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
+ SmallVector<DbgVariable *, 8> Result;
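+  // Worklist of (variable, dependencies-already-pushed) pairs for an
+  // iterative DFS.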
+ SmallVector<PointerIntPair<DbgVariable *, 1>, 8> WorkList;
+ // Map back from a DIVariable to its containing DbgVariable.
+ SmallDenseMap<const DILocalVariable *, DbgVariable *> DbgVar;
+ // Set of DbgVariables in Result.
+ SmallDenseSet<DbgVariable *, 8> Visited;
+ // For cycle detection.
+ SmallDenseSet<DbgVariable *, 8> Visiting;
+
+ // Initialize the worklist and the DIVariable lookup table.
+ for (auto *Var : reverse(Input)) {
+ DbgVar.insert({Var->getVariable(), Var});
+ WorkList.push_back({Var, 0});
+ }
+
+ // Perform a stable topological sort by doing a DFS.
+ while (!WorkList.empty()) {
+ auto Item = WorkList.back();
+ DbgVariable *Var = Item.getPointer();
+ bool visitedAllDependencies = Item.getInt();
+ WorkList.pop_back();
+
+ assert(Var);
+
+ // Already handled.
+ if (Visited.count(Var))
+ continue;
+
+ // Add to Result if all dependencies are visited.
+ if (visitedAllDependencies) {
+ Visited.insert(Var);
+ Result.push_back(Var);
+ continue;
+ }
+
+ // Detect cycles.
+ auto Res = Visiting.insert(Var);
+ if (!Res.second) {
+ assert(false && "dependency cycle in local variables");
+ return Result;
+ }
+
+ // Push dependencies and this node onto the worklist, so that this node is
+ // visited again after all of its dependencies are handled.
+ WorkList.push_back({Var, 1});
+ for (const auto *Dependency : dependencies(Var)) {
+ // Don't add dependency if it is in a different lexical scope or a global.
+ if (const auto *Dep = dyn_cast<const DILocalVariable>(Dependency))
+ if (DbgVariable *Var = DbgVar.lookup(Dep))
+ WorkList.push_back({Var, 0});
+ }
+ }
+ return Result;
+}
+
+DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub,
+ LexicalScope *Scope) {
+ DIE &ScopeDIE = updateSubprogramScopeDIE(Sub);
+ auto *ContextCU = static_cast<DwarfCompileUnit *>(ScopeDIE.getUnit());
+
+ if (Scope) {
+ assert(!Scope->getInlinedAt());
+ assert(!Scope->isAbstractScope());
+ // Collect lexical scope children first.
+    // ObjectPointer might be a non-argument local variable if it's a
+ // block's synthetic this pointer.
+ if (DIE *ObjectPointer =
+ ContextCU->createAndAddScopeChildren(Scope, ScopeDIE))
+ ContextCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer,
+ *ObjectPointer);
+ }
+
+ // If this is a variadic function, add an unspecified parameter.
+ DITypeRefArray FnArgs = Sub->getType()->getTypeArray();
+
+ // If we have a single element of null, it is a function that returns void.
+  // If we have more than one element and the last one is null, it is a
+ // variadic function.
+ if (FnArgs.size() > 1 && !FnArgs[FnArgs.size() - 1] &&
+ !includeMinimalInlineScopes())
+ ScopeDIE.addChild(
+ DIE::get(DIEValueAllocator, dwarf::DW_TAG_unspecified_parameters));
+
+ return ScopeDIE;
+}
+
+DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
+ DIE &ScopeDIE) {
+ DIE *ObjectPointer = nullptr;
+
+ // Emit function arguments (order is significant).
+ auto Vars = DU->getScopeVariables().lookup(Scope);
+ for (auto &DV : Vars.Args)
+ ScopeDIE.addChild(constructVariableDIE(*DV.second, *Scope, ObjectPointer));
+
+ // Emit local variables.
+ auto Locals = sortLocalVars(Vars.Locals);
+ for (DbgVariable *DV : Locals)
+ ScopeDIE.addChild(constructVariableDIE(*DV, *Scope, ObjectPointer));
+
+ // Emit labels.
+ for (DbgLabel *DL : DU->getScopeLabels().lookup(Scope))
+ ScopeDIE.addChild(constructLabelDIE(*DL, *Scope));
+
+ // Track other local entities (skipped in gmlt-like data).
+ // This creates mapping between CU and a set of local declarations that
+ // should be emitted for subprograms in this CU.
+ if (!includeMinimalInlineScopes() && !Scope->getInlinedAt()) {
+ auto &LocalDecls = DD->getLocalDeclsForScope(Scope->getScopeNode());
+ DeferredLocalDecls.insert(LocalDecls.begin(), LocalDecls.end());
+ }
+
+ // Emit inner lexical scopes.
+ auto skipLexicalScope = [this](LexicalScope *S) -> bool {
+ if (isa<DISubprogram>(S->getScopeNode()))
+ return false;
+ auto Vars = DU->getScopeVariables().lookup(S);
+ if (!Vars.Args.empty() || !Vars.Locals.empty())
+ return false;
+ return includeMinimalInlineScopes() ||
+ DD->getLocalDeclsForScope(S->getScopeNode()).empty();
+ };
+ for (LexicalScope *LS : Scope->getChildren()) {
+ // If the lexical block doesn't have non-scope children, skip
+    // its emission and put its children directly into the parent scope.
+ if (skipLexicalScope(LS))
+ createAndAddScopeChildren(LS, ScopeDIE);
+ else
+ constructScopeDIE(LS, ScopeDIE);
+ }
+
+ return ObjectPointer;
+}
+
+void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
+ LexicalScope *Scope) {
+ auto *SP = cast<DISubprogram>(Scope->getScopeNode());
+ if (getAbstractScopeDIEs().count(SP))
+ return;
+
+ DIE *ContextDIE;
+ DwarfCompileUnit *ContextCU = this;
+
+ if (includeMinimalInlineScopes())
+ ContextDIE = &getUnitDie();
+ // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with
+ // the important distinction that the debug node is not associated with the
+ // DIE (since the debug node will be associated with the concrete DIE, if
+ // any). It could be refactored to some common utility function.
+ else if (auto *SPDecl = SP->getDeclaration()) {
+ ContextDIE = &getUnitDie();
+ getOrCreateSubprogramDIE(SPDecl);
+ } else {
+ ContextDIE = getOrCreateContextDIE(SP->getScope());
+ // The scope may be shared with a subprogram that has already been
+ // constructed in another CU, in which case we need to construct this
+ // subprogram in the same CU.
+ ContextCU = DD->lookupCU(ContextDIE->getUnitDie());
+ }
+
+ // Passing null as the associated node because the abstract definition
+ // shouldn't be found by lookup.
+ DIE &AbsDef = ContextCU->createAndAddDIE(dwarf::DW_TAG_subprogram,
+ *ContextDIE, nullptr);
+
+ // Store the DIE before creating children.
+ ContextCU->getAbstractScopeDIEs()[SP] = &AbsDef;
+
+ ContextCU->applySubprogramAttributesToDefinition(SP, AbsDef);
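+  // DWARF v5 can encode DW_INL_inlined via DW_FORM_implicit_const in the
+  // abbreviation; for v4 and earlier, let addSInt choose a suitable form.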
+ ContextCU->addSInt(AbsDef, dwarf::DW_AT_inline,
+ DD->getDwarfVersion() <= 4 ? std::optional<dwarf::Form>()
+ : dwarf::DW_FORM_implicit_const,
+ dwarf::DW_INL_inlined);
+ if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, AbsDef))
+ ContextCU->addDIEEntry(AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
+}
+
+bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const {
+ return DD->getDwarfVersion() == 4 && !DD->tuneForLLDB();
+}
+
+dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const {
+ if (!useGNUAnalogForDwarf5Feature())
+ return Tag;
+ switch (Tag) {
+ case dwarf::DW_TAG_call_site:
+ return dwarf::DW_TAG_GNU_call_site;
+ case dwarf::DW_TAG_call_site_parameter:
+ return dwarf::DW_TAG_GNU_call_site_parameter;
+ default:
+ llvm_unreachable("DWARF5 tag with no GNU analog");
+ }
+}
+
+dwarf::Attribute
+DwarfCompileUnit::getDwarf5OrGNUAttr(dwarf::Attribute Attr) const {
+ if (!useGNUAnalogForDwarf5Feature())
+ return Attr;
+ switch (Attr) {
+ case dwarf::DW_AT_call_all_calls:
+ return dwarf::DW_AT_GNU_all_call_sites;
+ case dwarf::DW_AT_call_target:
+ return dwarf::DW_AT_GNU_call_site_target;
+ case dwarf::DW_AT_call_origin:
+ return dwarf::DW_AT_abstract_origin;
+ case dwarf::DW_AT_call_return_pc:
+ return dwarf::DW_AT_low_pc;
+ case dwarf::DW_AT_call_value:
+ return dwarf::DW_AT_GNU_call_site_value;
+ case dwarf::DW_AT_call_tail_call:
+ return dwarf::DW_AT_GNU_tail_call;
+ default:
+ llvm_unreachable("DWARF5 attribute with no GNU analog");
+ }
+}
+
+dwarf::LocationAtom
+DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
+ if (!useGNUAnalogForDwarf5Feature())
+ return Loc;
+ switch (Loc) {
+ case dwarf::DW_OP_entry_value:
+ return dwarf::DW_OP_GNU_entry_value;
+ default:
+ llvm_unreachable("DWARF5 location atom with no GNU analog");
+ }
+}
+
+DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
+ const DISubprogram *CalleeSP,
+ bool IsTail,
+ const MCSymbol *PCAddr,
+ const MCSymbol *CallAddr,
+ unsigned CallReg) {
+ // Insert a call site entry DIE within ScopeDIE.
+ DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site),
+ ScopeDIE, nullptr);
+
+ if (CallReg) {
+ // Indirect call.
+ addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target),
+ MachineLocation(CallReg));
+ } else {
+ DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP);
+ assert(CalleeDIE && "Could not create DIE for call site entry origin");
+ addDIEEntry(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_origin),
+ *CalleeDIE);
+ }
+
+ if (IsTail) {
+ // Attach DW_AT_call_tail_call to tail calls for standards compliance.
+ addFlag(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_tail_call));
+
+ // Attach the address of the branch instruction to allow the debugger to
+ // show where the tail call occurred. This attribute has no GNU analog.
+ //
+ // GDB works backwards from non-standard usage of DW_AT_low_pc (in DWARF4
+ // mode -- equivalently, in DWARF5 mode, DW_AT_call_return_pc) at tail-call
+ // site entries to figure out the PC of tail-calling branch instructions.
+ // This means it doesn't need the compiler to emit DW_AT_call_pc, so we
+ // don't emit it here.
+ //
+ // There's no need to tie non-GDB debuggers to this non-standardness, as it
+ // adds unnecessary complexity to the debugger. For non-GDB debuggers, emit
+ // the standard DW_AT_call_pc info.
+ if (!useGNUAnalogForDwarf5Feature())
+ addLabelAddress(CallSiteDIE, dwarf::DW_AT_call_pc, CallAddr);
+ }
+
+ // Attach the return PC to allow the debugger to disambiguate call paths
+ // from one function to another.
+ //
+ // The return PC is only really needed when the call /isn't/ a tail call, but
+ // GDB expects it in DWARF4 mode, even for tail calls (see the comment above
+ // the DW_AT_call_pc emission logic for an explanation).
+ if (!IsTail || useGNUAnalogForDwarf5Feature()) {
+ assert(PCAddr && "Missing return PC information for a call");
+ addLabelAddress(CallSiteDIE,
+ getDwarf5OrGNUAttr(dwarf::DW_AT_call_return_pc), PCAddr);
+ }
+
+ return CallSiteDIE;
+}
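// Illustrative sketch, not from the upstream sources; callee/caller are
// hypothetical names. For a direct tail call like the one below,
// constructCallSiteEntryDIE() is invoked with CallReg == 0 (direct call) and
// IsTail == true, so the entry gets DW_AT_call_origin (DW_AT_abstract_origin
// under the GNU analogs) plus DW_AT_call_tail_call, and the DW_AT_call_pc vs.
// DW_AT_call_return_pc choice follows the GDB-tuning logic documented above.
// Whether call-site entries are emitted at all depends on the target's debug
// entry-value support and the optimization settings.
int callee(int X);
int caller(int X) {
  return callee(X + 1); // Commonly lowered to a tail call when optimizing.
}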
+
+void DwarfCompileUnit::constructCallSiteParmEntryDIEs(
+ DIE &CallSiteDIE, SmallVector<DbgCallSiteParam, 4> &Params) {
+ for (const auto &Param : Params) {
+ unsigned Register = Param.getRegister();
+ auto CallSiteDieParam =
+ DIE::get(DIEValueAllocator,
+ getDwarf5OrGNUTag(dwarf::DW_TAG_call_site_parameter));
+ insertDIE(CallSiteDieParam);
+ addAddress(*CallSiteDieParam, dwarf::DW_AT_location,
+ MachineLocation(Register));
+
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ DwarfExpr.setCallSiteParamValueFlag();
+
+ DwarfDebug::emitDebugLocValue(*Asm, nullptr, Param.getValue(), DwarfExpr);
+
+ addBlock(*CallSiteDieParam, getDwarf5OrGNUAttr(dwarf::DW_AT_call_value),
+ DwarfExpr.finalize());
+
+ CallSiteDIE.addChild(CallSiteDieParam);
+ }
+}
+
+DIE *DwarfCompileUnit::constructImportedEntityDIE(
+ const DIImportedEntity *Module) {
+ DIE *IMDie = DIE::get(DIEValueAllocator, (dwarf::Tag)Module->getTag());
+ insertDIE(Module, IMDie);
+ DIE *EntityDie;
+ auto *Entity = Module->getEntity();
+ if (auto *NS = dyn_cast<DINamespace>(Entity))
+ EntityDie = getOrCreateNameSpace(NS);
+ else if (auto *M = dyn_cast<DIModule>(Entity))
+ EntityDie = getOrCreateModule(M);
+ else if (auto *SP = dyn_cast<DISubprogram>(Entity)) {
+ // If there is an abstract subprogram, refer to it. Note that this assumes
+ // that all the abstract subprograms have already been created (which is
+ // correct until imported entities get emitted in DwarfDebug::endModule()).
+ if (auto *AbsSPDie = getAbstractScopeDIEs().lookup(SP))
+ EntityDie = AbsSPDie;
+ else
+ EntityDie = getOrCreateSubprogramDIE(SP);
+ } else if (auto *T = dyn_cast<DIType>(Entity))
+ EntityDie = getOrCreateTypeDIE(T);
+ else if (auto *GV = dyn_cast<DIGlobalVariable>(Entity))
+ EntityDie = getOrCreateGlobalVariableDIE(GV, {});
+ else if (auto *IE = dyn_cast<DIImportedEntity>(Entity))
+ EntityDie = getOrCreateImportedEntityDIE(IE);
+ else
+ EntityDie = getDIE(Entity);
+ assert(EntityDie);
+ addSourceLine(*IMDie, Module->getLine(), Module->getFile());
+ addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie);
+ StringRef Name = Module->getName();
+ if (!Name.empty()) {
+ addString(*IMDie, dwarf::DW_AT_name, Name);
+
+ // FIXME: if consumers ever start caring about handling
+ // unnamed import declarations such as `using ::nullptr_t`
+ // or `using namespace std::ranges`, we could add the
+ // import declaration into the accelerator table with the
+ // name being the one of the entity being imported.
+ DD->addAccelNamespace(*CUNode, Name, *IMDie);
+ }
+
+ // This is for an imported module with renamed entities (such as variables
+ // and subprograms).
+ DINodeArray Elements = Module->getElements();
+ for (const auto *Element : Elements) {
+ if (!Element)
+ continue;
+ IMDie->addChild(
+ constructImportedEntityDIE(cast<DIImportedEntity>(Element)));
+ }
+
+ return IMDie;
+}
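// Illustrative sketch, not from the upstream sources. The DIImportedEntity
// nodes handled above come from constructs like these: a using-directive is
// described as DW_TAG_imported_module, while a using-declaration and a
// namespace alias become DW_TAG_imported_declaration, with DW_AT_import
// pointing at the DIE of the namespace or entity created elsewhere in this
// unit.
namespace detail { int Value = 0; }
using namespace detail; // DW_TAG_imported_module
using detail::Value;    // DW_TAG_imported_declaration
namespace dt = detail;  // DW_TAG_imported_declaration with DW_AT_name ("dt")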
+
+DIE *DwarfCompileUnit::getOrCreateImportedEntityDIE(
+ const DIImportedEntity *IE) {
+
+ // Check for pre-existence.
+ if (DIE *Die = getDIE(IE))
+ return Die;
+
+ DIE *ContextDIE = getOrCreateContextDIE(IE->getScope());
+ assert(ContextDIE && "Empty scope for the imported entity!");
+
+ DIE *IMDie = constructImportedEntityDIE(IE);
+ ContextDIE->addChild(IMDie);
+ return IMDie;
+}
+
+void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
+ DIE *D = getDIE(SP);
+ if (DIE *AbsSPDIE = getAbstractScopeDIEs().lookup(SP)) {
+ if (D)
+ // If this subprogram has an abstract definition, reference that
+ addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
+ } else {
+ assert(D || includeMinimalInlineScopes());
+ if (D)
+ // And attach the attributes
+ applySubprogramAttributesToDefinition(SP, *D);
+ }
+}
+
+void DwarfCompileUnit::finishEntityDefinition(const DbgEntity *Entity) {
+ DbgEntity *AbsEntity = getExistingAbstractEntity(Entity->getEntity());
+
+ auto *Die = Entity->getDIE();
+ /// Label may be used to generate DW_AT_low_pc, so put it outside the
+ /// if/else block.
+ const DbgLabel *Label = nullptr;
+ if (AbsEntity && AbsEntity->getDIE()) {
+ addDIEEntry(*Die, dwarf::DW_AT_abstract_origin, *AbsEntity->getDIE());
+ Label = dyn_cast<const DbgLabel>(Entity);
+ } else {
+ if (const DbgVariable *Var = dyn_cast<const DbgVariable>(Entity))
+ applyVariableAttributes(*Var, *Die);
+ else if ((Label = dyn_cast<const DbgLabel>(Entity)))
+ applyLabelAttributes(*Label, *Die);
+ else
+ llvm_unreachable("DbgEntity must be DbgVariable or DbgLabel.");
+ }
+
+ if (Label)
+ if (const auto *Sym = Label->getSymbol())
+ addLabelAddress(*Die, dwarf::DW_AT_low_pc, Sym);
+}
+
+DbgEntity *DwarfCompileUnit::getExistingAbstractEntity(const DINode *Node) {
+ auto &AbstractEntities = getAbstractEntities();
+ auto I = AbstractEntities.find(Node);
+ if (I != AbstractEntities.end())
+ return I->second.get();
+ return nullptr;
+}
+
+void DwarfCompileUnit::createAbstractEntity(const DINode *Node,
+ LexicalScope *Scope) {
+ assert(Scope && Scope->isAbstractScope());
+ auto &Entity = getAbstractEntities()[Node];
+ if (isa<const DILocalVariable>(Node)) {
+ Entity = std::make_unique<DbgVariable>(cast<const DILocalVariable>(Node),
+ nullptr /* IA */);
+ DU->addScopeVariable(Scope, cast<DbgVariable>(Entity.get()));
+ } else if (isa<const DILabel>(Node)) {
+ Entity = std::make_unique<DbgLabel>(
+ cast<const DILabel>(Node), nullptr /* IA */);
+ DU->addScopeLabel(Scope, cast<DbgLabel>(Entity.get()));
+ }
+}
+
+void DwarfCompileUnit::emitHeader(bool UseOffsets) {
+ // Don't bother labeling the .dwo unit, as its offset isn't used.
+ if (!Skeleton && !DD->useSectionsAsReferences()) {
+ LabelBegin = Asm->createTempSymbol("cu_begin");
+ Asm->OutStreamer->emitLabel(LabelBegin);
+ }
+
+ dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile
+ : DD->useSplitDwarf() ? dwarf::DW_UT_skeleton
+ : dwarf::DW_UT_compile;
+ DwarfUnit::emitCommonHeader(UseOffsets, UT);
+ if (DD->getDwarfVersion() >= 5 && UT != dwarf::DW_UT_compile)
+ Asm->emitInt64(getDWOId());
+}
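// Illustrative sketch, not from the upstream sources; the struct is only a
// visualization of the encoding, not a type used by the emitter. With split
// DWARF and DWARF v5, emitHeader() produces a skeleton or split-compile unit
// header laid out roughly as below (DWARF v5, section 7.5.1.2); the trailing
// 8-byte dwo_id is the extra word that DwarfCompileUnit::getHeaderSize() adds
// on top of the base DwarfUnit header size.
#include <cstdint>
struct DWARFv5SkeletonOrSplitUnitHeaderSketch {
  uint32_t UnitLength;        // DWARF64 uses a 0xffffffff escape + 8-byte length.
  uint16_t Version;           // 5
  uint8_t UnitType;           // DW_UT_skeleton or DW_UT_split_compile
  uint8_t AddressSize;        // Size of a target address in bytes.
  uint32_t DebugAbbrevOffset; // 8 bytes under DWARF64.
  uint64_t DwoId;             // Correlates the skeleton with its .dwo unit.
};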
+
+bool DwarfCompileUnit::hasDwarfPubSections() const {
+ switch (CUNode->getNameTableKind()) {
+ case DICompileUnit::DebugNameTableKind::None:
+ return false;
+ // Opting in to GNU Pubnames/types overrides the default to ensure these are
+ // generated for things like Gold's gdb_index generation.
+ case DICompileUnit::DebugNameTableKind::GNU:
+ return true;
+ case DICompileUnit::DebugNameTableKind::Apple:
+ return false;
+ case DICompileUnit::DebugNameTableKind::Default:
+ return DD->tuneForGDB() && !includeMinimalInlineScopes() &&
+ !CUNode->isDebugDirectivesOnly() &&
+ DD->getAccelTableKind() != AccelTableKind::Apple &&
+ DD->getDwarfVersion() < 5;
+ }
+ llvm_unreachable("Unhandled DICompileUnit::DebugNameTableKind enum");
+}
+
+/// addGlobalName - Add a new global name to the compile unit.
+void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) {
+ if (!hasDwarfPubSections())
+ return;
+ std::string FullName = getParentContextString(Context) + Name.str();
+ GlobalNames[FullName] = &Die;
+}
+
+void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name,
+ const DIScope *Context) {
+ if (!hasDwarfPubSections())
+ return;
+ std::string FullName = getParentContextString(Context) + Name.str();
+ // Insert, allowing the entry to remain as-is if it's already present.
+ // This way the CU-level type DIE is preferred over the "can't describe this
+ // type as a unit offset because it's not really in the CU at all, it's only
+ // in a type unit" case.
+ GlobalNames.insert(std::make_pair(std::move(FullName), &getUnitDie()));
+}
+
+/// Add a new global type to the unit.
+void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
+ if (!hasDwarfPubSections())
+ return;
+ std::string FullName = getParentContextString(Context) + Ty->getName().str();
+ GlobalTypes[FullName] = &Die;
+}
+
+void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty,
+ const DIScope *Context) {
+ if (!hasDwarfPubSections())
+ return;
+ std::string FullName = getParentContextString(Context) + Ty->getName().str();
+ // Insert, allowing the entry to remain as-is if it's already present.
+ // This way the CU-level type DIE is preferred over the "can't describe this
+ // type as a unit offset because it's not really in the CU at all, it's only
+ // in a type unit" case.
+ GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie()));
+}
+
+void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
+ MachineLocation Location) {
+ if (DV.hasComplexAddress())
+ addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
+ else
+ addAddress(Die, dwarf::DW_AT_location, Location);
+}
+
+/// Add an address attribute to a die based on the location provided.
+void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MachineLocation &Location) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ if (Location.isIndirect())
+ DwarfExpr.setMemoryLocationKind();
+
+ DIExpressionCursor Cursor({});
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+ return;
+ DwarfExpr.addExpression(std::move(Cursor));
+
+ // Now attach the location information to the DIE.
+ addBlock(Die, Attribute, DwarfExpr.finalize());
+
+ if (DwarfExpr.TagOffset)
+ addUInt(Die, dwarf::DW_AT_LLVM_tag_offset, dwarf::DW_FORM_data1,
+ *DwarfExpr.TagOffset);
+}
+
+/// Start with the address based on the location provided, and generate the
+/// DWARF information necessary to find the actual variable given the extra
+/// address information encoded in the DbgVariable, starting from the starting
+/// location. Add the DWARF information to the die.
+void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
+ dwarf::Attribute Attribute,
+ const MachineLocation &Location) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ const DIExpression *DIExpr = DV.getSingleExpression();
+ DwarfExpr.addFragmentOffset(DIExpr);
+ DwarfExpr.setLocation(Location, DIExpr);
+
+ DIExpressionCursor Cursor(DIExpr);
+
+ if (DIExpr->isEntryValue())
+ DwarfExpr.beginEntryValueExpression(Cursor);
+
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+ return;
+ DwarfExpr.addExpression(std::move(Cursor));
+
+ // Now attach the location information to the DIE.
+ addBlock(Die, Attribute, DwarfExpr.finalize());
+
+ if (DwarfExpr.TagOffset)
+ addUInt(Die, dwarf::DW_AT_LLVM_tag_offset, dwarf::DW_FORM_data1,
+ *DwarfExpr.TagOffset);
+}
+
+/// Add a Dwarf loclistptr attribute data and value.
+void DwarfCompileUnit::addLocationList(DIE &Die, dwarf::Attribute Attribute,
+ unsigned Index) {
+ dwarf::Form Form = (DD->getDwarfVersion() >= 5)
+ ? dwarf::DW_FORM_loclistx
+ : DD->getDwarfSectionOffsetForm();
+ addAttribute(Die, Attribute, Form, DIELocList(Index));
+}
+
+void DwarfCompileUnit::applyVariableAttributes(const DbgVariable &Var,
+ DIE &VariableDie) {
+ StringRef Name = Var.getName();
+ if (!Name.empty())
+ addString(VariableDie, dwarf::DW_AT_name, Name);
+ const auto *DIVar = Var.getVariable();
+ if (DIVar) {
+ if (uint32_t AlignInBytes = DIVar->getAlignInBytes())
+ addUInt(VariableDie, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
+ AlignInBytes);
+ addAnnotation(VariableDie, DIVar->getAnnotations());
+ }
+
+ addSourceLine(VariableDie, DIVar);
+ addType(VariableDie, Var.getType());
+ if (Var.isArtificial())
+ addFlag(VariableDie, dwarf::DW_AT_artificial);
+}
+
+void DwarfCompileUnit::applyLabelAttributes(const DbgLabel &Label,
+ DIE &LabelDie) {
+ StringRef Name = Label.getName();
+ if (!Name.empty())
+ addString(LabelDie, dwarf::DW_AT_name, Name);
+ const auto *DILabel = Label.getLabel();
+ addSourceLine(LabelDie, DILabel);
+}
+
+/// Add a Dwarf expression attribute data and value.
+void DwarfCompileUnit::addExpr(DIELoc &Die, dwarf::Form Form,
+ const MCExpr *Expr) {
+ addAttribute(Die, (dwarf::Attribute)0, Form, DIEExpr(Expr));
+}
+
+void DwarfCompileUnit::applySubprogramAttributesToDefinition(
+ const DISubprogram *SP, DIE &SPDie) {
+ auto *SPDecl = SP->getDeclaration();
+ auto *Context = SPDecl ? SPDecl->getScope() : SP->getScope();
+ applySubprogramAttributes(SP, SPDie, includeMinimalInlineScopes());
+ addGlobalName(SP->getName(), SPDie, Context);
+}
+
+bool DwarfCompileUnit::isDwoUnit() const {
+ return DD->useSplitDwarf() && Skeleton;
+}
+
+void DwarfCompileUnit::finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) {
+ constructTypeDIE(D, CTy);
+}
+
+bool DwarfCompileUnit::includeMinimalInlineScopes() const {
+ return getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly ||
+ (DD->useSplitDwarf() && !Skeleton);
+}
+
+void DwarfCompileUnit::addAddrTableBase() {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ MCSymbol *Label = DD->getAddressPool().getLabel();
+ addSectionLabel(getUnitDie(),
+ DD->getDwarfVersion() >= 5 ? dwarf::DW_AT_addr_base
+ : dwarf::DW_AT_GNU_addr_base,
+ Label, TLOF.getDwarfAddrSection()->getBeginSymbol());
+}
+
+void DwarfCompileUnit::addBaseTypeRef(DIEValueList &Die, int64_t Idx) {
+ addAttribute(Die, (dwarf::Attribute)0, dwarf::DW_FORM_udata,
+ new (DIEValueAllocator) DIEBaseTypeRef(this, Idx));
+}
+
+void DwarfCompileUnit::createBaseTypeDIEs() {
+ // Insert the base_type DIEs directly after the CU so that their offsets will
+ // fit in the fixed-size ULEB128 used inside the location expressions.
+ // Maintain order by iterating backwards and inserting at the front of the
+ // CU's child list.
+ for (auto &Btr : reverse(ExprRefedBaseTypes)) {
+ DIE &Die = getUnitDie().addChildFront(
+ DIE::get(DIEValueAllocator, dwarf::DW_TAG_base_type));
+ SmallString<32> Str;
+ addString(Die, dwarf::DW_AT_name,
+ Twine(dwarf::AttributeEncodingString(Btr.Encoding) +
+ "_" + Twine(Btr.BitSize)).toStringRef(Str));
+ addUInt(Die, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, Btr.Encoding);
+ // Round up to smallest number of bytes that contains this number of bits.
+ addUInt(Die, dwarf::DW_AT_byte_size, std::nullopt,
+ divideCeil(Btr.BitSize, 8));
+
+ Btr.Die = &Die;
+ }
+}
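// Illustrative sketch, not from the upstream sources. The base_type DIEs
// created above back DW_OP_convert / DW_OP_regval_type operands, whose DIE
// offsets are encoded as fixed-width ULEB128s and therefore must stay small;
// that is why they are inserted at the front of the CU's children. For a
// 32-bit signed conversion, llvm-dwarfdump output would look roughly like:
//
//   DW_TAG_compile_unit
//     DW_TAG_base_type
//       DW_AT_name      ("DW_ATE_signed_32")
//       DW_AT_encoding  (DW_ATE_signed)
//       DW_AT_byte_size (0x04)
//     ... remaining children of the compile unit ...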
+
+DIE *DwarfCompileUnit::getLexicalBlockDIE(const DILexicalBlock *LB) {
+ // Assume that if there is an abstract tree, all its DIEs are already emitted.
+ bool isAbstract = getAbstractScopeDIEs().count(LB->getSubprogram());
+ if (isAbstract && getAbstractScopeDIEs().count(LB))
+ return getAbstractScopeDIEs()[LB];
+ assert(!isAbstract && "Missed lexical block DIE in abstract tree!");
+
+ // Return a concrete DIE if it exists or nullptr otherwise.
+ return LexicalBlockDIEs.lookup(LB);
+}
+
+DIE *DwarfCompileUnit::getOrCreateContextDIE(const DIScope *Context) {
+ if (isa_and_nonnull<DILocalScope>(Context)) {
+ if (auto *LFScope = dyn_cast<DILexicalBlockFile>(Context))
+ Context = LFScope->getNonLexicalBlockFileScope();
+ if (auto *LScope = dyn_cast<DILexicalBlock>(Context))
+ return getLexicalBlockDIE(LScope);
+
+ // Otherwise the context must be a DISubprogram.
+ auto *SPScope = cast<DISubprogram>(Context);
+ if (getAbstractScopeDIEs().count(SPScope))
+ return getAbstractScopeDIEs()[SPScope];
+ }
+ return DwarfUnit::getOrCreateContextDIE(Context);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
new file mode 100644
index 000000000000..6ef73ebd4f7f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -0,0 +1,380 @@
+//===- llvm/CodeGen/DwarfCompileUnit.h - Dwarf Compile Unit -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF compile units.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+
+#include "DwarfDebug.h"
+#include "DwarfUnit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+
+namespace llvm {
+
+class AsmPrinter;
+class DIE;
+class DIELoc;
+class DIEValueList;
+class DwarfFile;
+class GlobalVariable;
+class MCExpr;
+class MCSymbol;
+class MDNode;
+
+enum class UnitKind { Skeleton, Full };
+
+class DwarfCompileUnit final : public DwarfUnit {
+ /// A numeric ID unique among all CUs in the module
+ unsigned UniqueID;
+ bool HasRangeLists = false;
+
+ /// The start of the unit line section; this is also
+ /// reused in applyStmtList.
+ MCSymbol *LineTableStartSym;
+
+ /// Skeleton unit associated with this unit.
+ DwarfCompileUnit *Skeleton = nullptr;
+
+ /// The start of the unit within its section.
+ MCSymbol *LabelBegin = nullptr;
+
+ /// The start of the unit macro info within macro section.
+ MCSymbol *MacroLabelBegin;
+
+ /// GlobalNames - A map of globally visible named entities for this unit.
+ StringMap<const DIE *> GlobalNames;
+
+ /// GlobalTypes - A map of globally visible types for this unit.
+ StringMap<const DIE *> GlobalTypes;
+
+ // List of ranges for a given compile unit.
+ SmallVector<RangeSpan, 2> CURanges;
+
+ // The base address of this unit, if any. Used for relative references in
+ // ranges/locs.
+ const MCSymbol *BaseAddress = nullptr;
+
+ using MDNodeSetVector =
+ SetVector<const MDNode *, SmallVector<const MDNode *, 4>,
+ SmallPtrSet<const MDNode *, 4>>;
+
+ // List of entities (either static locals, types or imports) that
+ // belong to subprograms within this CU.
+ MDNodeSetVector DeferredLocalDecls;
+
+ // List of concrete lexical block scopes that belong to subprograms within
+ // this CU.
+ DenseMap<const DILocalScope *, DIE *> LexicalBlockDIEs;
+
+ // List of abstract local scopes (either DISubprogram or DILexicalBlock).
+ DenseMap<const DILocalScope *, DIE *> AbstractLocalScopeDIEs;
+
+ DenseMap<const DINode *, std::unique_ptr<DbgEntity>> AbstractEntities;
+
+ /// DWO ID for correlating skeleton and split units.
+ uint64_t DWOId = 0;
+
+ const DIFile *LastFile = nullptr;
+ unsigned LastFileID;
+
+ /// Construct a DIE for the given DbgVariable without initializing the
+ /// DbgVariable's DIE reference.
+ DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract);
+
+ bool isDwoUnit() const override;
+
+ DenseMap<const DILocalScope *, DIE *> &getAbstractScopeDIEs() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractLocalScopeDIEs;
+ return DU->getAbstractScopeDIEs();
+ }
+
+ DenseMap<const DINode *, std::unique_ptr<DbgEntity>> &getAbstractEntities() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractEntities;
+ return DU->getAbstractEntities();
+ }
+
+ void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) override;
+
+ /// Add info for Wasm-global-based relocation.
+ void addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
+ uint64_t GlobalIndex);
+
+public:
+ DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
+ DwarfDebug *DW, DwarfFile *DWU,
+ UnitKind Kind = UnitKind::Full);
+
+ bool hasRangeLists() const { return HasRangeLists; }
+ unsigned getUniqueID() const { return UniqueID; }
+
+ DwarfCompileUnit *getSkeleton() const {
+ return Skeleton;
+ }
+
+ bool includeMinimalInlineScopes() const;
+
+ void initStmtList();
+
+ /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE.
+ void applyStmtList(DIE &D);
+
+ /// Get line table start symbol for this unit.
+ MCSymbol *getLineTableStartSym() const { return LineTableStartSym; }
+
+ /// A pair of GlobalVariable and DIExpression.
+ struct GlobalExpr {
+ const GlobalVariable *Var;
+ const DIExpression *Expr;
+ };
+
+ struct BaseTypeRef {
+ BaseTypeRef(unsigned BitSize, dwarf::TypeKind Encoding) :
+ BitSize(BitSize), Encoding(Encoding) {}
+ unsigned BitSize;
+ dwarf::TypeKind Encoding;
+ DIE *Die = nullptr;
+ };
+
+ std::vector<BaseTypeRef> ExprRefedBaseTypes;
+
+ /// Get or create global variable DIE.
+ DIE *
+ getOrCreateGlobalVariableDIE(const DIGlobalVariable *GV,
+ ArrayRef<GlobalExpr> GlobalExprs);
+
+ DIE *getOrCreateCommonBlock(const DICommonBlock *CB,
+ ArrayRef<GlobalExpr> GlobalExprs);
+
+ void addLocationAttribute(DIE *ToDIE, const DIGlobalVariable *GV,
+ ArrayRef<GlobalExpr> GlobalExprs);
+
+ /// addLabelAddress - Add a dwarf label attribute data and value using
+ /// either DW_FORM_addr or DW_FORM_GNU_addr_index.
+ void addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label);
+
+ /// addLocalLabelAddress - Add a dwarf label attribute data and value using
+ /// DW_FORM_addr only.
+ void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label);
+
+ DwarfCompileUnit &getCU() override { return *this; }
+
+ unsigned getOrCreateSourceID(const DIFile *File) override;
+
+ /// addRange - Add an address range to the list of ranges for this unit.
+ void addRange(RangeSpan Range);
+
+ void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End);
+
+ /// Find DIE for the given subprogram and attach appropriate
+ /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
+ /// variables in this scope then create and insert DIEs for these
+ /// variables.
+ DIE &updateSubprogramScopeDIE(const DISubprogram *SP);
+ DIE &updateSubprogramScopeDIEImpl(const DISubprogram *SP, DIE *SPDie);
+
+ void constructScopeDIE(LexicalScope *Scope, DIE &ParentScopeDIE);
+
+ /// A helper function to construct a RangeSpanList for a given
+ /// lexical scope.
+ void addScopeRangeList(DIE &ScopeDIE, SmallVector<RangeSpan, 2> Range);
+
+ void attachRangesOrLowHighPC(DIE &D, SmallVector<RangeSpan, 2> Ranges);
+
+ void attachRangesOrLowHighPC(DIE &D,
+ const SmallVectorImpl<InsnRange> &Ranges);
+
+ /// This scope represents an inlined body of a function. Construct a
+ /// DIE to represent this concrete inlined copy of the function.
+ DIE *constructInlinedScopeDIE(LexicalScope *Scope, DIE &ParentScopeDIE);
+
+ /// Construct new DW_TAG_lexical_block for this scope and
+ /// attach DW_AT_low_pc/DW_AT_high_pc labels.
+ DIE *constructLexicalScopeDIE(LexicalScope *Scope);
+
+ /// Get a DIE for the given DILexicalBlock.
+ /// Note that this function assumes that the DIE has already been created;
+ /// it is an error if it has not.
+ DIE *getLexicalBlockDIE(const DILexicalBlock *LB);
+
+ /// constructVariableDIE - Construct a DIE for the given DbgVariable.
+ DIE *constructVariableDIE(DbgVariable &DV, bool Abstract = false);
+
+ DIE *constructVariableDIE(DbgVariable &DV, const LexicalScope &Scope,
+ DIE *&ObjectPointer);
+
+ /// Construct a DIE for the given DbgLabel.
+ DIE *constructLabelDIE(DbgLabel &DL, const LexicalScope &Scope);
+
+ void createBaseTypeDIEs();
+
+ /// Construct a DIE for a given scope.
+ /// This instance of 'getOrCreateContextDIE()' can handle DILocalScope.
+ DIE *getOrCreateContextDIE(const DIScope *Ty) override;
+
+ /// Construct a DIE for this subprogram scope.
+ DIE &constructSubprogramScopeDIE(const DISubprogram *Sub,
+ LexicalScope *Scope);
+
+ DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE);
+
+ void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+
+ /// Whether to use the GNU analog for a DWARF5 tag, attribute, or location
+ /// atom. Only applicable when emitting otherwise DWARF4-compliant debug info.
+ bool useGNUAnalogForDwarf5Feature() const;
+
+ /// This takes a DWARF 5 tag and returns it or a GNU analog.
+ dwarf::Tag getDwarf5OrGNUTag(dwarf::Tag Tag) const;
+
+ /// This takes a DWARF 5 attribute and returns it or a GNU analog.
+ dwarf::Attribute getDwarf5OrGNUAttr(dwarf::Attribute Attr) const;
+
+ /// This takes a DWARF 5 location atom and either returns it or a GNU analog.
+ dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const;
+
+ /// Construct a call site entry DIE describing a call within \p Scope to a
+ /// callee described by \p CalleeSP.
+ /// \p IsTail specifies whether the call is a tail call.
+ /// \p PCAddr points to the PC value after the call instruction.
+ /// \p CallAddr points to the PC value at the call instruction (or is null).
+ /// \p CallReg is a register location for an indirect call. For direct calls
+ /// the \p CallReg is set to 0.
+ DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP,
+ bool IsTail, const MCSymbol *PCAddr,
+ const MCSymbol *CallAddr, unsigned CallReg);
+ /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params
+ /// were collected by the \ref collectCallSiteParameters.
+ /// Note: The order of parameters does not matter, since debuggers recognize
+ /// call site parameters by the DW_AT_location attribute.
+ void constructCallSiteParmEntryDIEs(DIE &CallSiteDIE,
+ SmallVector<DbgCallSiteParam, 4> &Params);
+
+ /// Get or create a DIE for an imported entity.
+ DIE *getOrCreateImportedEntityDIE(const DIImportedEntity *IE);
+ DIE *constructImportedEntityDIE(const DIImportedEntity *IE);
+
+ void finishSubprogramDefinition(const DISubprogram *SP);
+ void finishEntityDefinition(const DbgEntity *Entity);
+
+ /// Find abstract variable associated with Var.
+ using InlinedEntity = DbgValueHistoryMap::InlinedEntity;
+ DbgEntity *getExistingAbstractEntity(const DINode *Node);
+ void createAbstractEntity(const DINode *Node, LexicalScope *Scope);
+
+ /// Set the skeleton unit associated with this unit.
+ void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
+
+ unsigned getHeaderSize() const override {
+ // DWARF v5 added the DWO ID to the header for split/skeleton units.
+ unsigned DWOIdSize =
+ DD->getDwarfVersion() >= 5 && DD->useSplitDwarf() ? sizeof(uint64_t)
+ : 0;
+ return DwarfUnit::getHeaderSize() + DWOIdSize;
+ }
+ unsigned getLength() {
+ return Asm->getUnitLengthFieldByteSize() + // Length field
+ getHeaderSize() + getUnitDie().getSize();
+ }
+
+ void emitHeader(bool UseOffsets) override;
+
+ /// Add the DW_AT_addr_base attribute to the unit DIE.
+ void addAddrTableBase();
+
+ MCSymbol *getLabelBegin() const {
+ assert(LabelBegin && "LabelBegin is not initialized");
+ return LabelBegin;
+ }
+
+ MCSymbol *getMacroLabelBegin() const {
+ return MacroLabelBegin;
+ }
+
+ /// Add a new global name to the compile unit.
+ void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) override;
+
+ /// Add a new global name present in a type unit to this compile unit.
+ void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context);
+
+ /// Add a new global type to the compile unit.
+ void addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
+
+ /// Add a new global type present in a type unit to this compile unit.
+ void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context);
+
+ const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; }
+ const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; }
+
+ /// Add DW_AT_location attribute for a DbgVariable based on provided
+ /// MachineLocation.
+ void addVariableAddress(const DbgVariable &DV, DIE &Die,
+ MachineLocation Location);
+ /// Add an address attribute to a die based on the location provided.
+ void addAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MachineLocation &Location);
+
+ /// Start with the address based on the location provided, and generate the
+ /// DWARF information necessary to find the actual variable (navigating the
+ /// extra location information encoded in the type) based on the starting
+ /// location. Add the DWARF information to the die.
+ void addComplexAddress(const DbgVariable &DV, DIE &Die,
+ dwarf::Attribute Attribute,
+ const MachineLocation &Location);
+
+ /// Add a Dwarf loclistptr attribute data and value.
+ void addLocationList(DIE &Die, dwarf::Attribute Attribute, unsigned Index);
+ void applyVariableAttributes(const DbgVariable &Var, DIE &VariableDie);
+
+ /// Add a Dwarf expression attribute data and value.
+ void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr);
+
+ void applySubprogramAttributesToDefinition(const DISubprogram *SP,
+ DIE &SPDie);
+
+ void applyLabelAttributes(const DbgLabel &Label, DIE &LabelDie);
+
+ /// getRanges - Get the list of ranges for this unit.
+ const SmallVectorImpl<RangeSpan> &getRanges() const { return CURanges; }
+ SmallVector<RangeSpan, 2> takeRanges() { return std::move(CURanges); }
+
+ void setBaseAddress(const MCSymbol *Base) { BaseAddress = Base; }
+ const MCSymbol *getBaseAddress() const { return BaseAddress; }
+
+ uint64_t getDWOId() const { return DWOId; }
+ void setDWOId(uint64_t DwoId) { DWOId = DwoId; }
+
+ bool hasDwarfPubSections() const;
+
+ void addBaseTypeRef(DIEValueList &Die, int64_t Idx);
+
+ MDNodeSetVector &getDeferredLocalDecls() { return DeferredLocalDecls; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
new file mode 100644
index 000000000000..1ae17ec9b874
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -0,0 +1,3638 @@
+//===- llvm/CodeGen/DwarfDebug.cpp - Dwarf Debug Framework ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfDebug.h"
+#include "ByteStreamer.h"
+#include "DIEHash.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfExpression.h"
+#include "DwarfUnit.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <optional>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+STATISTIC(NumCSParams, "Number of dbg call site params created");
+
+static cl::opt<bool> UseDwarfRangesBaseAddressSpecifier(
+ "use-dwarf-ranges-base-address-specifier", cl::Hidden,
+ cl::desc("Use base address specifiers in debug_ranges"), cl::init(false));
+
+static cl::opt<bool> GenerateARangeSection("generate-arange-section",
+ cl::Hidden,
+ cl::desc("Generate dwarf aranges"),
+ cl::init(false));
+
+static cl::opt<bool>
+ GenerateDwarfTypeUnits("generate-type-units", cl::Hidden,
+ cl::desc("Generate DWARF4 type units."),
+ cl::init(false));
+
+static cl::opt<bool> SplitDwarfCrossCuReferences(
+ "split-dwarf-cross-cu-references", cl::Hidden,
+ cl::desc("Enable cross-cu references in DWO files"), cl::init(false));
+
+enum DefaultOnOff { Default, Enable, Disable };
+
+static cl::opt<DefaultOnOff> UnknownLocations(
+ "use-unknown-locations", cl::Hidden,
+ cl::desc("Make an absence of debug location information explicit."),
+ cl::values(clEnumVal(Default, "At top of block or after label"),
+ clEnumVal(Enable, "In all cases"), clEnumVal(Disable, "Never")),
+ cl::init(Default));
+
+static cl::opt<AccelTableKind> AccelTables(
+ "accel-tables", cl::Hidden, cl::desc("Output dwarf accelerator tables."),
+ cl::values(clEnumValN(AccelTableKind::Default, "Default",
+ "Default for platform"),
+ clEnumValN(AccelTableKind::None, "Disable", "Disabled."),
+ clEnumValN(AccelTableKind::Apple, "Apple", "Apple"),
+ clEnumValN(AccelTableKind::Dwarf, "Dwarf", "DWARF")),
+ cl::init(AccelTableKind::Default));
+
+static cl::opt<DefaultOnOff>
+DwarfInlinedStrings("dwarf-inlined-strings", cl::Hidden,
+ cl::desc("Use inlined strings rather than string section."),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled")),
+ cl::init(Default));
+
+static cl::opt<bool>
+ NoDwarfRangesSection("no-dwarf-ranges-section", cl::Hidden,
+ cl::desc("Disable emission .debug_ranges section."),
+ cl::init(false));
+
+static cl::opt<DefaultOnOff> DwarfSectionsAsReferences(
+ "dwarf-sections-as-references", cl::Hidden,
+ cl::desc("Use sections+offset as references rather than labels."),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"), clEnumVal(Disable, "Disabled")),
+ cl::init(Default));
+
+static cl::opt<bool>
+ UseGNUDebugMacro("use-gnu-debug-macro", cl::Hidden,
+ cl::desc("Emit the GNU .debug_macro format with DWARF <5"),
+ cl::init(false));
+
+static cl::opt<DefaultOnOff> DwarfOpConvert(
+ "dwarf-op-convert", cl::Hidden,
+ cl::desc("Enable use of the DWARFv5 DW_OP_convert operator"),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"), clEnumVal(Disable, "Disabled")),
+ cl::init(Default));
+
+enum LinkageNameOption {
+ DefaultLinkageNames,
+ AllLinkageNames,
+ AbstractLinkageNames
+};
+
+static cl::opt<LinkageNameOption>
+ DwarfLinkageNames("dwarf-linkage-names", cl::Hidden,
+ cl::desc("Which DWARF linkage-name attributes to emit."),
+ cl::values(clEnumValN(DefaultLinkageNames, "Default",
+ "Default for platform"),
+ clEnumValN(AllLinkageNames, "All", "All"),
+ clEnumValN(AbstractLinkageNames, "Abstract",
+ "Abstract subprograms")),
+ cl::init(DefaultLinkageNames));
+
+static cl::opt<DwarfDebug::MinimizeAddrInV5> MinimizeAddrInV5Option(
+ "minimize-addr-in-v5", cl::Hidden,
+ cl::desc("Always use DW_AT_ranges in DWARFv5 whenever it could allow more "
+ "address pool entry sharing to reduce relocations/object size"),
+ cl::values(clEnumValN(DwarfDebug::MinimizeAddrInV5::Default, "Default",
+ "Default address minimization strategy"),
+ clEnumValN(DwarfDebug::MinimizeAddrInV5::Ranges, "Ranges",
+ "Use rnglists for contiguous ranges if that allows "
+ "using a pre-existing base address"),
+ clEnumValN(DwarfDebug::MinimizeAddrInV5::Expressions,
+ "Expressions",
+ "Use exprloc addrx+offset expressions for any "
+ "address with a prior base address"),
+ clEnumValN(DwarfDebug::MinimizeAddrInV5::Form, "Form",
+ "Use addrx+offset extension form for any address "
+ "with a prior base address"),
+ clEnumValN(DwarfDebug::MinimizeAddrInV5::Disabled, "Disabled",
+ "Stuff")),
+ cl::init(DwarfDebug::MinimizeAddrInV5::Default));
+
+static constexpr unsigned ULEB128PadSize = 4;
+
+void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) {
+ getActiveStreamer().emitInt8(
+ Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)
+ : dwarf::OperationEncodingString(Op));
+}
+
+void DebugLocDwarfExpression::emitSigned(int64_t Value) {
+ getActiveStreamer().emitSLEB128(Value, Twine(Value));
+}
+
+void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) {
+ getActiveStreamer().emitULEB128(Value, Twine(Value));
+}
+
+void DebugLocDwarfExpression::emitData1(uint8_t Value) {
+ getActiveStreamer().emitInt8(Value, Twine(Value));
+}
+
+void DebugLocDwarfExpression::emitBaseTypeRef(uint64_t Idx) {
+ assert(Idx < (1ULL << (ULEB128PadSize * 7)) && "Idx won't fit");
+ getActiveStreamer().emitULEB128(Idx, Twine(Idx), ULEB128PadSize);
+}
+
+bool DebugLocDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
+ llvm::Register MachineReg) {
+ // This information is not available while emitting .debug_loc entries.
+ return false;
+}
+
+void DebugLocDwarfExpression::enableTemporaryBuffer() {
+ assert(!IsBuffering && "Already buffering?");
+ if (!TmpBuf)
+ TmpBuf = std::make_unique<TempBuffer>(OutBS.GenerateComments);
+ IsBuffering = true;
+}
+
+void DebugLocDwarfExpression::disableTemporaryBuffer() { IsBuffering = false; }
+
+unsigned DebugLocDwarfExpression::getTemporaryBufferSize() {
+ return TmpBuf ? TmpBuf->Bytes.size() : 0;
+}
+
+void DebugLocDwarfExpression::commitTemporaryBuffer() {
+ if (!TmpBuf)
+ return;
+ for (auto Byte : enumerate(TmpBuf->Bytes)) {
+ const char *Comment = (Byte.index() < TmpBuf->Comments.size())
+ ? TmpBuf->Comments[Byte.index()].c_str()
+ : "";
+ OutBS.emitInt8(Byte.value(), Comment);
+ }
+ TmpBuf->Bytes.clear();
+ TmpBuf->Comments.clear();
+}
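// Illustrative note, not from the upstream sources. The temporary-buffer
// hooks above exist mainly for DW_OP_entry_value (or its GNU analog): that
// operator is followed by a ULEB128 byte count and then a nested expression,
// so the nested operation bytes are staged in TmpBuf first, their size is
// read back via getTemporaryBufferSize() to emit the count, and
// commitTemporaryBuffer() then flushes the staged bytes to the output
// streamer.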
+
+const DIType *DbgVariable::getType() const {
+ return getVariable()->getType();
+}
+
+/// Get .debug_loc entry for the instruction range starting at MI.
+static DbgValueLoc getDebugLocValue(const MachineInstr *MI) {
+ const DIExpression *Expr = MI->getDebugExpression();
+ const bool IsVariadic = MI->isDebugValueList();
+ assert(MI->getNumOperands() >= 3);
+ SmallVector<DbgValueLocEntry, 4> DbgValueLocEntries;
+ for (const MachineOperand &Op : MI->debug_operands()) {
+ if (Op.isReg()) {
+ MachineLocation MLoc(Op.getReg(),
+ MI->isNonListDebugValue() && MI->isDebugOffsetImm());
+ DbgValueLocEntries.push_back(DbgValueLocEntry(MLoc));
+ } else if (Op.isTargetIndex()) {
+ DbgValueLocEntries.push_back(
+ DbgValueLocEntry(TargetIndexLocation(Op.getIndex(), Op.getOffset())));
+ } else if (Op.isImm())
+ DbgValueLocEntries.push_back(DbgValueLocEntry(Op.getImm()));
+ else if (Op.isFPImm())
+ DbgValueLocEntries.push_back(DbgValueLocEntry(Op.getFPImm()));
+ else if (Op.isCImm())
+ DbgValueLocEntries.push_back(DbgValueLocEntry(Op.getCImm()));
+ else
+ llvm_unreachable("Unexpected debug operand in DBG_VALUE* instruction!");
+ }
+ return DbgValueLoc(Expr, DbgValueLocEntries, IsVariadic);
+}
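// Illustrative sketch, not from the upstream sources; metadata operands are
// shown symbolically rather than as real numbered nodes. A single-location
// MIR instruction such as
//   DBG_VALUE $edi, $noreg, !var, !DIExpression(), debug-location !loc
// yields one register DbgValueLocEntry with IsVariadic == false, while a
// variadic one such as
//   DBG_VALUE_LIST !var, !DIExpression(DW_OP_LLVM_arg, 0), $edi, debug-location !loc
// yields a DbgValueLoc whose entries come from the trailing debug operands
// and which has IsVariadic == true.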
+
+void DbgVariable::initializeDbgValue(const MachineInstr *DbgValue) {
+ assert(FrameIndexExprs.empty() && "Already initialized?");
+ assert(!ValueLoc.get() && "Already initialized?");
+
+ assert(getVariable() == DbgValue->getDebugVariable() && "Wrong variable");
+ assert(getInlinedAt() == DbgValue->getDebugLoc()->getInlinedAt() &&
+ "Wrong inlined-at");
+
+ ValueLoc = std::make_unique<DbgValueLoc>(getDebugLocValue(DbgValue));
+ if (auto *E = DbgValue->getDebugExpression())
+ if (E->getNumElements())
+ FrameIndexExprs.push_back({0, E});
+}
+
+ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const {
+ if (FrameIndexExprs.size() == 1)
+ return FrameIndexExprs;
+
+ assert(llvm::all_of(FrameIndexExprs,
+ [](const FrameIndexExpr &A) {
+ return A.Expr->isFragment();
+ }) &&
+ "multiple FI expressions without DW_OP_LLVM_fragment");
+ llvm::sort(FrameIndexExprs,
+ [](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool {
+ return A.Expr->getFragmentInfo()->OffsetInBits <
+ B.Expr->getFragmentInfo()->OffsetInBits;
+ });
+
+ return FrameIndexExprs;
+}
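// Illustrative sketch, not from the upstream sources. A 64-bit variable
// spilled as two 32-bit halves would carry two frame-index expressions such
// as
//   !DIExpression(DW_OP_LLVM_fragment, 0, 32)   // bits [0, 32)  in one slot
//   !DIExpression(DW_OP_LLVM_fragment, 32, 32)  // bits [32, 64) in another
// and getFrameIndexExprs() returns them sorted by fragment offset so that
// callers emit the fragments in ascending bit order.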
+
+void DbgVariable::addMMIEntry(const DbgVariable &V) {
+ assert(DebugLocListIndex == ~0U && !ValueLoc.get() && "not an MMI entry");
+ assert(V.DebugLocListIndex == ~0U && !V.ValueLoc.get() && "not an MMI entry");
+ assert(V.getVariable() == getVariable() && "conflicting variable");
+ assert(V.getInlinedAt() == getInlinedAt() && "conflicting inlined-at location");
+
+ assert(!FrameIndexExprs.empty() && "Expected an MMI entry");
+ assert(!V.FrameIndexExprs.empty() && "Expected an MMI entry");
+
+ // FIXME: This logic should not be necessary anymore, as we now have proper
+ // deduplication. However, without it, we currently run into the assertion
+ // below, which means that we are likely dealing with broken input, i.e. two
+ // non-fragment entries for the same variable at different frame indices.
+ if (FrameIndexExprs.size()) {
+ auto *Expr = FrameIndexExprs.back().Expr;
+ if (!Expr || !Expr->isFragment())
+ return;
+ }
+
+ for (const auto &FIE : V.FrameIndexExprs)
+ // Ignore duplicate entries.
+ if (llvm::none_of(FrameIndexExprs, [&](const FrameIndexExpr &Other) {
+ return FIE.FI == Other.FI && FIE.Expr == Other.Expr;
+ }))
+ FrameIndexExprs.push_back(FIE);
+
+ assert((FrameIndexExprs.size() == 1 ||
+ llvm::all_of(FrameIndexExprs,
+ [](FrameIndexExpr &FIE) {
+ return FIE.Expr && FIE.Expr->isFragment();
+ })) &&
+ "conflicting locations for variable");
+}
+
+static AccelTableKind computeAccelTableKind(unsigned DwarfVersion,
+ bool GenerateTypeUnits,
+ DebuggerKind Tuning,
+ const Triple &TT) {
+ // Honor an explicit request.
+ if (AccelTables != AccelTableKind::Default)
+ return AccelTables;
+
+ // Accelerator tables with type units are currently not supported.
+ if (GenerateTypeUnits)
+ return AccelTableKind::None;
+
+ // Accelerator tables get emitted if targeting DWARF v5 or LLDB. DWARF v5
+ // always implies debug_names. For lower standard versions we use Apple
+ // accelerator tables on Apple platforms and debug_names elsewhere.
+ if (DwarfVersion >= 5)
+ return AccelTableKind::Dwarf;
+ if (Tuning == DebuggerKind::LLDB)
+ return TT.isOSBinFormatMachO() ? AccelTableKind::Apple
+ : AccelTableKind::Dwarf;
+ return AccelTableKind::None;
+}
+
+DwarfDebug::DwarfDebug(AsmPrinter *A)
+ : DebugHandlerBase(A), DebugLocs(A->OutStreamer->isVerboseAsm()),
+ InfoHolder(A, "info_string", DIEValueAllocator),
+ SkeletonHolder(A, "skel_string", DIEValueAllocator),
+ IsDarwin(A->TM.getTargetTriple().isOSDarwin()) {
+ const Triple &TT = Asm->TM.getTargetTriple();
+
+ // Make sure we know our "debugger tuning". The target option takes
+ // precedence; fall back to triple-based defaults.
+ if (Asm->TM.Options.DebuggerTuning != DebuggerKind::Default)
+ DebuggerTuning = Asm->TM.Options.DebuggerTuning;
+ else if (IsDarwin)
+ DebuggerTuning = DebuggerKind::LLDB;
+ else if (TT.isPS())
+ DebuggerTuning = DebuggerKind::SCE;
+ else if (TT.isOSAIX())
+ DebuggerTuning = DebuggerKind::DBX;
+ else
+ DebuggerTuning = DebuggerKind::GDB;
+
+ if (DwarfInlinedStrings == Default)
+ UseInlineStrings = TT.isNVPTX() || tuneForDBX();
+ else
+ UseInlineStrings = DwarfInlinedStrings == Enable;
+
+ UseLocSection = !TT.isNVPTX();
+
+ HasAppleExtensionAttributes = tuneForLLDB();
+
+ // Handle split DWARF.
+ HasSplitDwarf = !Asm->TM.Options.MCOptions.SplitDwarfFile.empty();
+
+ // SCE defaults to linkage names only for abstract subprograms.
+ if (DwarfLinkageNames == DefaultLinkageNames)
+ UseAllLinkageNames = !tuneForSCE();
+ else
+ UseAllLinkageNames = DwarfLinkageNames == AllLinkageNames;
+
+ unsigned DwarfVersionNumber = Asm->TM.Options.MCOptions.DwarfVersion;
+ unsigned DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber
+ : MMI->getModule()->getDwarfVersion();
+ // Use dwarf 4 by default if nothing is requested. For NVPTX, use dwarf 2.
+ DwarfVersion =
+ TT.isNVPTX() ? 2 : (DwarfVersion ? DwarfVersion : dwarf::DWARF_VERSION);
+
+ bool Dwarf64 = DwarfVersion >= 3 && // DWARF64 was introduced in DWARFv3.
+ TT.isArch64Bit(); // DWARF64 requires 64-bit relocations.
+
+ // Support DWARF64
+ // 1: For ELF when requested.
+ // 2: For XCOFF64: the AIX assembler will fill in debug section lengths
+ // according to the DWARF64 format for 64-bit assembly, so we must use
+ // DWARF64 in the compiler too for 64-bit mode.
+ Dwarf64 &=
+ ((Asm->TM.Options.MCOptions.Dwarf64 || MMI->getModule()->isDwarf64()) &&
+ TT.isOSBinFormatELF()) ||
+ TT.isOSBinFormatXCOFF();
+
+ if (!Dwarf64 && TT.isArch64Bit() && TT.isOSBinFormatXCOFF())
+ report_fatal_error("XCOFF requires DWARF64 for 64-bit mode!");
+
+ UseRangesSection = !NoDwarfRangesSection && !TT.isNVPTX();
+
+ // Use sections as references. Force for NVPTX.
+ if (DwarfSectionsAsReferences == Default)
+ UseSectionsAsReferences = TT.isNVPTX();
+ else
+ UseSectionsAsReferences = DwarfSectionsAsReferences == Enable;
+
+ // Don't generate type units for unsupported object file formats.
+ GenerateTypeUnits = (A->TM.getTargetTriple().isOSBinFormatELF() ||
+ A->TM.getTargetTriple().isOSBinFormatWasm()) &&
+ GenerateDwarfTypeUnits;
+
+ TheAccelTableKind = computeAccelTableKind(
+ DwarfVersion, GenerateTypeUnits, DebuggerTuning, A->TM.getTargetTriple());
+
+ // Work around a GDB bug. GDB doesn't support the standard opcode;
+ // SCE doesn't support GNU's; LLDB prefers the standard opcode, which
+ // is defined as of DWARF 3.
+ // See GDB bug 11616 - DW_OP_form_tls_address is unimplemented
+ // https://sourceware.org/bugzilla/show_bug.cgi?id=11616
+ UseGNUTLSOpcode = tuneForGDB() || DwarfVersion < 3;
+
+ UseDWARF2Bitfields = DwarfVersion < 4;
+
+ // The DWARF v5 string offsets table has (possibly shared) contributions
+ // from each compile and type unit, each preceded by a header. The string
+ // offsets table used by the pre-DWARF v5 split-DWARF implementation is
+ // monolithic and has no header.
+ UseSegmentedStringOffsetsTable = DwarfVersion >= 5;
+
+ // Emit call-site-param debug info for GDB and LLDB, if the target supports
+ // the debug entry values feature. It can also be enabled explicitly.
+ EmitDebugEntryValues = Asm->TM.Options.ShouldEmitDebugEntryValues();
+
+ // It is unclear if the GCC .debug_macro extension is well-specified
+ // for split DWARF. For now, do not allow LLVM to emit it.
+ UseDebugMacroSection =
+ DwarfVersion >= 5 || (UseGNUDebugMacro && !useSplitDwarf());
+ if (DwarfOpConvert == Default)
+ EnableOpConvert = !((tuneForGDB() && useSplitDwarf()) || (tuneForLLDB() && !TT.isOSBinFormatMachO()));
+ else
+ EnableOpConvert = (DwarfOpConvert == Enable);
+
+ // Split DWARF would benefit object size significantly by trading reductions
+ // in address pool usage for slightly increased range list encodings.
+ if (DwarfVersion >= 5)
+ MinimizeAddr = MinimizeAddrInV5Option;
+
+ Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion);
+ Asm->OutStreamer->getContext().setDwarfFormat(Dwarf64 ? dwarf::DWARF64
+ : dwarf::DWARF32);
+}
+
+// Define out of line so we don't have to include DwarfUnit.h in DwarfDebug.h.
+DwarfDebug::~DwarfDebug() = default;
+
+static bool isObjCClass(StringRef Name) {
+ return Name.startswith("+") || Name.startswith("-");
+}
+
+static bool hasObjCCategory(StringRef Name) {
+ if (!isObjCClass(Name))
+ return false;
+
+ return Name.contains(") ");
+}
+
+static void getObjCClassCategory(StringRef In, StringRef &Class,
+ StringRef &Category) {
+ if (!hasObjCCategory(In)) {
+ Class = In.slice(In.find('[') + 1, In.find(' '));
+ Category = "";
+ return;
+ }
+
+ Class = In.slice(In.find('[') + 1, In.find('('));
+ Category = In.slice(In.find('[') + 1, In.find(' '));
+}
+
+static StringRef getObjCMethodName(StringRef In) {
+ return In.slice(In.find(' ') + 1, In.find(']'));
+}
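// Illustrative sketch, not from the upstream sources; the function name is
// hypothetical, and the snippet only needs llvm/ADT/StringRef.h. For a
// debug-info name such as "+[NSString(MyCategory) myMethod:]" the helpers
// above slice out the class, the class+category prefix, and the selector, as
// the assertions show.
#include "llvm/ADT/StringRef.h"
#include <cassert>
inline void objcNameSplitExample() {
  llvm::StringRef In = "+[NSString(MyCategory) myMethod:]";
  llvm::StringRef Class = In.slice(In.find('[') + 1, In.find('('));
  llvm::StringRef Category = In.slice(In.find('[') + 1, In.find(' '));
  llvm::StringRef Method = In.slice(In.find(' ') + 1, In.find(']'));
  assert(Class == "NSString");
  assert(Category == "NSString(MyCategory)");
  assert(Method == "myMethod:");
  (void)Class; (void)Category; (void)Method;
}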
+
+// Add the various names to the Dwarf accelerator table names.
+void DwarfDebug::addSubprogramNames(const DICompileUnit &CU,
+ const DISubprogram *SP, DIE &Die) {
+ if (getAccelTableKind() != AccelTableKind::Apple &&
+ CU.getNameTableKind() != DICompileUnit::DebugNameTableKind::Apple &&
+ CU.getNameTableKind() == DICompileUnit::DebugNameTableKind::None)
+ return;
+
+ if (!SP->isDefinition())
+ return;
+
+ if (SP->getName() != "")
+ addAccelName(CU, SP->getName(), Die);
+
+ // If the linkage name is different from the name, output it into the name
+ // table as well. Only do that if we are going to actually emit that name.
+ if (SP->getLinkageName() != "" && SP->getName() != SP->getLinkageName() &&
+ (useAllLinkageNames() || InfoHolder.getAbstractScopeDIEs().lookup(SP)))
+ addAccelName(CU, SP->getLinkageName(), Die);
+
+ // If this is an Objective-C selector name add it to the ObjC accelerator
+ // too.
+ if (isObjCClass(SP->getName())) {
+ StringRef Class, Category;
+ getObjCClassCategory(SP->getName(), Class, Category);
+ addAccelObjC(CU, Class, Die);
+ if (Category != "")
+ addAccelObjC(CU, Category, Die);
+ // Also add the base method name to the name table.
+ addAccelName(CU, getObjCMethodName(SP->getName()), Die);
+ }
+}
+
+/// Check whether we should create a DIE for the given Scope; return true
+/// if we don't create a DIE (the corresponding DIE is null).
+bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) {
+ if (Scope->isAbstractScope())
+ return false;
+
+ // We don't create a DIE if there is no Range.
+ const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
+ if (Ranges.empty())
+ return true;
+
+ if (Ranges.size() > 1)
+ return false;
+
+ // We don't create a DIE if we have a single Range and the end label
+ // is null.
+ return !getLabelAfterInsn(Ranges.front().second);
+}
+
+template <typename Func> static void forBothCUs(DwarfCompileUnit &CU, Func F) {
+ F(CU);
+ if (auto *SkelCU = CU.getSkeleton())
+ if (CU.getCUNode()->getSplitDebugInlining())
+ F(*SkelCU);
+}
+
+bool DwarfDebug::shareAcrossDWOCUs() const {
+ return SplitDwarfCrossCuReferences;
+}
+
+void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
+ LexicalScope *Scope) {
+ assert(Scope && Scope->getScopeNode());
+ assert(Scope->isAbstractScope());
+ assert(!Scope->getInlinedAt());
+
+ auto *SP = cast<DISubprogram>(Scope->getScopeNode());
+
+ // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
+ // was inlined from another compile unit.
+ if (useSplitDwarf() && !shareAcrossDWOCUs() && !SP->getUnit()->getSplitDebugInlining())
+ // Avoid building the original CU if it won't be used
+ SrcCU.constructAbstractSubprogramScopeDIE(Scope);
+ else {
+ auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
+ if (auto *SkelCU = CU.getSkeleton()) {
+ (shareAcrossDWOCUs() ? CU : SrcCU)
+ .constructAbstractSubprogramScopeDIE(Scope);
+ if (CU.getCUNode()->getSplitDebugInlining())
+ SkelCU->constructAbstractSubprogramScopeDIE(Scope);
+ } else
+ CU.constructAbstractSubprogramScopeDIE(Scope);
+ }
+}
+
+/// Represents a parameter whose call site value can be described by applying a
+/// debug expression to a register in the forwarded register worklist.
+struct FwdRegParamInfo {
+ /// The described parameter register.
+ unsigned ParamReg;
+
+ /// Debug expression that has been built up when walking through the
+ /// instruction chain that produces the parameter's value.
+ const DIExpression *Expr;
+};
+
+/// Register worklist for finding call site values.
+using FwdRegWorklist = MapVector<unsigned, SmallVector<FwdRegParamInfo, 2>>;
+/// Container for the set of registers known to be clobbered on the path to a
+/// call site.
+using ClobberedRegSet = SmallSet<Register, 16>;
+
+/// Append the expression \p Addition to \p Original and return the result.
+static const DIExpression *combineDIExpressions(const DIExpression *Original,
+ const DIExpression *Addition) {
+ std::vector<uint64_t> Elts = Addition->getElements().vec();
+ // Avoid multiple DW_OP_stack_values.
+ if (Original->isImplicit() && Addition->isImplicit())
+ erase_value(Elts, dwarf::DW_OP_stack_value);
+ const DIExpression *CombinedExpr =
+ (Elts.size() > 0) ? DIExpression::append(Original, Elts) : Original;
+ return CombinedExpr;
+}
+
+/// Emit call site parameter entries that are described by the given value and
+/// debug expression.
+template <typename ValT>
+static void finishCallSiteParams(ValT Val, const DIExpression *Expr,
+ ArrayRef<FwdRegParamInfo> DescribedParams,
+ ParamSet &Params) {
+ for (auto Param : DescribedParams) {
+ bool ShouldCombineExpressions = Expr && Param.Expr->getNumElements() > 0;
+
+ // TODO: Entry value operations can currently not be combined with any
+ // other expressions, so we can't emit call site entries in those cases.
+ if (ShouldCombineExpressions && Expr->isEntryValue())
+ continue;
+
+ // If a parameter's call site value is produced by a chain of
+ // instructions we may have already created an expression for the
+ // parameter when walking through the instructions. Append that to the
+ // base expression.
+ const DIExpression *CombinedExpr =
+ ShouldCombineExpressions ? combineDIExpressions(Expr, Param.Expr)
+ : Expr;
+ assert((!CombinedExpr || CombinedExpr->isValid()) &&
+ "Combined debug expression is invalid");
+
+ DbgValueLoc DbgLocVal(CombinedExpr, DbgValueLocEntry(Val));
+ DbgCallSiteParam CSParm(Param.ParamReg, DbgLocVal);
+ Params.push_back(CSParm);
+ ++NumCSParams;
+ }
+}
+
+/// Add \p Reg to the worklist, if it's not already present, and mark that the
+/// given parameter registers' values can (potentially) be described using
+/// that register and a debug expression.
+static void addToFwdRegWorklist(FwdRegWorklist &Worklist, unsigned Reg,
+ const DIExpression *Expr,
+ ArrayRef<FwdRegParamInfo> ParamsToAdd) {
+ auto I = Worklist.insert({Reg, {}});
+ auto &ParamsForFwdReg = I.first->second;
+ for (auto Param : ParamsToAdd) {
+ assert(none_of(ParamsForFwdReg,
+ [Param](const FwdRegParamInfo &D) {
+ return D.ParamReg == Param.ParamReg;
+ }) &&
+ "Same parameter described twice by forwarding reg");
+
+ // If a parameter's call site value is produced by a chain of
+ // instructions we may have already created an expression for the
+ // parameter when walking through the instructions. Append that to the
+ // new expression.
+ const DIExpression *CombinedExpr = combineDIExpressions(Expr, Param.Expr);
+ ParamsForFwdReg.push_back({Param.ParamReg, CombinedExpr});
+ }
+}
+
+/// Interpret values loaded into registers by \p CurMI.
+static void interpretValues(const MachineInstr *CurMI,
+ FwdRegWorklist &ForwardedRegWorklist,
+ ParamSet &Params,
+ ClobberedRegSet &ClobberedRegUnits) {
+
+ const MachineFunction *MF = CurMI->getMF();
+ const DIExpression *EmptyExpr =
+ DIExpression::get(MF->getFunction().getContext(), {});
+ const auto &TRI = *MF->getSubtarget().getRegisterInfo();
+ const auto &TII = *MF->getSubtarget().getInstrInfo();
+ const auto &TLI = *MF->getSubtarget().getTargetLowering();
+
+ // If an instruction defines more than one item in the worklist, we may run
+ // into situations where a worklist register's value is (potentially)
+ // described by the previous value of another register that is also defined
+ // by that instruction.
+ //
+ // This can, for example, occur in cases like this:
+ //
+ // $r1 = mov 123
+ // $r0, $r1 = mvrr $r1, 456
+ // call @foo, $r0, $r1
+ //
+ // When describing $r1's value for the mvrr instruction, we need to make sure
+ // that we don't finalize an entry value for $r0, as that is dependent on the
+ // previous value of $r1 (123 rather than 456).
+ //
+ // In order to not have to distinguish between those cases when finalizing
+ // entry values, we simply postpone adding new parameter registers to the
+ // worklist, by first keeping them in this temporary container until the
+ // instruction has been handled.
+ FwdRegWorklist TmpWorklistItems;
+
+ // If the MI is an instruction defining one or more parameters' forwarding
+ // registers, add those defines.
+ ClobberedRegSet NewClobberedRegUnits;
+ auto getForwardingRegsDefinedByMI = [&](const MachineInstr &MI,
+ SmallSetVector<unsigned, 4> &Defs) {
+ if (MI.isDebugInstr())
+ return;
+
+ for (const MachineOperand &MO : MI.all_defs()) {
+ if (MO.getReg().isPhysical()) {
+ for (auto &FwdReg : ForwardedRegWorklist)
+ if (TRI.regsOverlap(FwdReg.first, MO.getReg()))
+ Defs.insert(FwdReg.first);
+ for (MCRegUnit Unit : TRI.regunits(MO.getReg()))
+ NewClobberedRegUnits.insert(Unit);
+ }
+ }
+ };
+
+ // Set of worklist registers that are defined by this instruction.
+ SmallSetVector<unsigned, 4> FwdRegDefs;
+
+ getForwardingRegsDefinedByMI(*CurMI, FwdRegDefs);
+ if (FwdRegDefs.empty()) {
+ // Any definitions by this instruction will clobber earlier reg movements.
+ ClobberedRegUnits.insert(NewClobberedRegUnits.begin(),
+ NewClobberedRegUnits.end());
+ return;
+ }
+
+ // It's possible that we find a copy from a non-volatile register to the param
+ // register, which is clobbered in the meantime. Test for clobbered reg unit
+ // overlaps before completing.
+ auto IsRegClobberedInMeantime = [&](Register Reg) -> bool {
+ for (auto &RegUnit : ClobberedRegUnits)
+ if (TRI.hasRegUnit(Reg, RegUnit))
+ return true;
+ return false;
+ };
+
+ for (auto ParamFwdReg : FwdRegDefs) {
+ if (auto ParamValue = TII.describeLoadedValue(*CurMI, ParamFwdReg)) {
+ if (ParamValue->first.isImm()) {
+ int64_t Val = ParamValue->first.getImm();
+ finishCallSiteParams(Val, ParamValue->second,
+ ForwardedRegWorklist[ParamFwdReg], Params);
+ } else if (ParamValue->first.isReg()) {
+ Register RegLoc = ParamValue->first.getReg();
+ Register SP = TLI.getStackPointerRegisterToSaveRestore();
+ Register FP = TRI.getFrameRegister(*MF);
+ bool IsSPorFP = (RegLoc == SP) || (RegLoc == FP);
+ if (!IsRegClobberedInMeantime(RegLoc) &&
+ (TRI.isCalleeSavedPhysReg(RegLoc, *MF) || IsSPorFP)) {
+ MachineLocation MLoc(RegLoc, /*Indirect=*/IsSPorFP);
+ finishCallSiteParams(MLoc, ParamValue->second,
+ ForwardedRegWorklist[ParamFwdReg], Params);
+ } else {
+ // ParamFwdReg was described by the non-callee saved register
+ // RegLoc. Mark that the call site values for the parameters are
+ // dependent on that register instead of ParamFwdReg. Since RegLoc
+ // may be a register that will be handled in this iteration, we
+ // postpone adding the items to the worklist, and instead keep them
+ // in a temporary container.
+ addToFwdRegWorklist(TmpWorklistItems, RegLoc, ParamValue->second,
+ ForwardedRegWorklist[ParamFwdReg]);
+ }
+ }
+ }
+ }
+
+ // Remove all registers that this instruction defines from the worklist.
+ for (auto ParamFwdReg : FwdRegDefs)
+ ForwardedRegWorklist.erase(ParamFwdReg);
+
+ // Any definitions by this instruction will clobber earlier reg movements.
+ ClobberedRegUnits.insert(NewClobberedRegUnits.begin(),
+ NewClobberedRegUnits.end());
+
+ // Now that we are done handling this instruction, add items from the
+ // temporary worklist to the real one.
+ for (auto &New : TmpWorklistItems)
+ addToFwdRegWorklist(ForwardedRegWorklist, New.first, EmptyExpr, New.second);
+ TmpWorklistItems.clear();
+}
+
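+ /// Interpret a single instruction \p CurMI while walking backwards from a
+ /// call, updating \p ForwardedRegWorklist and \p Params. Returns false when
+ /// the walk should stop: another call is reached or the worklist is empty.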
+static bool interpretNextInstr(const MachineInstr *CurMI,
+ FwdRegWorklist &ForwardedRegWorklist,
+ ParamSet &Params,
+ ClobberedRegSet &ClobberedRegUnits) {
+ // Skip bundle headers.
+ if (CurMI->isBundle())
+ return true;
+
+ // Stop if the next instruction is a call, since we cannot interpret the
+ // parameters' forwarding registers across it, or if we have already
+ // finished interpreting all parameters (the worklist is empty).
+ if (CurMI->isCall())
+ return false;
+
+ if (ForwardedRegWorklist.empty())
+ return false;
+
+ // Avoid NOP description.
+ if (CurMI->getNumOperands() == 0)
+ return true;
+
+ interpretValues(CurMI, ForwardedRegWorklist, Params, ClobberedRegUnits);
+
+ return true;
+}
+
+/// Try to interpret values loaded into registers that forward parameters
+/// for \p CallMI. Store parameters with interpreted value into \p Params.
+static void collectCallSiteParameters(const MachineInstr *CallMI,
+ ParamSet &Params) {
+ const MachineFunction *MF = CallMI->getMF();
+ const auto &CalleesMap = MF->getCallSitesInfo();
+ auto CallFwdRegsInfo = CalleesMap.find(CallMI);
+
+ // There is no information for the call instruction.
+ if (CallFwdRegsInfo == CalleesMap.end())
+ return;
+
+ const MachineBasicBlock *MBB = CallMI->getParent();
+
+ // Skip the call instruction.
+ auto I = std::next(CallMI->getReverseIterator());
+
+ FwdRegWorklist ForwardedRegWorklist;
+
+ const DIExpression *EmptyExpr =
+ DIExpression::get(MF->getFunction().getContext(), {});
+
+ // Add all the forwarding registers into the ForwardedRegWorklist.
+ for (const auto &ArgReg : CallFwdRegsInfo->second) {
+ bool InsertedReg =
+ ForwardedRegWorklist.insert({ArgReg.Reg, {{ArgReg.Reg, EmptyExpr}}})
+ .second;
+ assert(InsertedReg && "Single register used to forward two arguments?");
+ (void)InsertedReg;
+ }
+
+ // Do not emit CSInfo for undef forwarding registers.
+ for (const auto &MO : CallMI->uses())
+ if (MO.isReg() && MO.isUndef())
+ ForwardedRegWorklist.erase(MO.getReg());
+
+ // We erase from the ForwardedRegWorklist those forwarding registers for
+ // which we successfully describe a loaded value using describeLoadedValue().
+ // For the arguments remaining in the worklist, whose loaded values could not
+ // be described, we try to generate an entry value expression for their call
+ // site value description, if the call is within the entry MBB.
+ // TODO: Handle situations when call site parameter value can be described
+ // as the entry value within basic blocks other than the first one.
+ bool ShouldTryEmitEntryVals = MBB->getIterator() == MF->begin();
+
+ // Search for values loaded into the forwarding registers inside the call
+ // delay slot.
+ ClobberedRegSet ClobberedRegUnits;
+ if (CallMI->hasDelaySlot()) {
+ auto Suc = std::next(CallMI->getIterator());
+ // Only one-instruction delay slot is supported.
+ auto BundleEnd = llvm::getBundleEnd(CallMI->getIterator());
+ (void)BundleEnd;
+ assert(std::next(Suc) == BundleEnd &&
+ "More than one instruction in call delay slot");
+ // Try to interpret value loaded by instruction.
+ if (!interpretNextInstr(&*Suc, ForwardedRegWorklist, Params, ClobberedRegUnits))
+ return;
+ }
+
+ // Search for values loaded into the forwarding registers.
+ for (; I != MBB->rend(); ++I) {
+ // Try to interpret values loaded by instruction.
+ if (!interpretNextInstr(&*I, ForwardedRegWorklist, Params, ClobberedRegUnits))
+ return;
+ }
+
+ // Emit the call site parameter's value as an entry value.
+ if (ShouldTryEmitEntryVals) {
+ // Create an expression where the register's entry value is used.
+ DIExpression *EntryExpr = DIExpression::get(
+ MF->getFunction().getContext(), {dwarf::DW_OP_LLVM_entry_value, 1});
+ for (auto &RegEntry : ForwardedRegWorklist) {
+ MachineLocation MLoc(RegEntry.first);
+ finishCallSiteParams(MLoc, EntryExpr, RegEntry.second, Params);
+ }
+ }
+}
+
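+ // Emit DW_TAG_call_site entries (or their GNU analogs) for every call and
+ // tail call in \p MF, attaching them to the subprogram's \p ScopeDIE in
+ // \p CU, optionally followed by call site parameter entries.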
+void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
+ DwarfCompileUnit &CU, DIE &ScopeDIE,
+ const MachineFunction &MF) {
+ // Add a call site-related attribute (DWARF5, Sec. 3.3.1.3). Do this only if
+ // the subprogram is required to have one.
+ if (!SP.areAllCallsDescribed() || !SP.isDefinition())
+ return;
+
+ // Use DW_AT_call_all_calls to express that call site entries are present
+ // for both tail and non-tail calls. Don't use DW_AT_call_all_source_calls
+ // because one of its requirements is not met: call site entries for
+ // optimized-out calls are elided.
+ CU.addFlag(ScopeDIE, CU.getDwarf5OrGNUAttr(dwarf::DW_AT_call_all_calls));
+
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ assert(TII && "TargetInstrInfo not found: cannot label tail calls");
+
+ // Delay slot support check.
+ auto delaySlotSupported = [&](const MachineInstr &MI) {
+ if (!MI.isBundledWithSucc())
+ return false;
+ auto Suc = std::next(MI.getIterator());
+ auto CallInstrBundle = getBundleStart(MI.getIterator());
+ (void)CallInstrBundle;
+ auto DelaySlotBundle = getBundleStart(Suc);
+ (void)DelaySlotBundle;
+ // Ensure that the label after the call follows the delay slot instruction.
+ // Ex. CALL_INSTRUCTION {
+ // DELAY_SLOT_INSTRUCTION }
+ // LABEL_AFTER_CALL
+ assert(getLabelAfterInsn(&*CallInstrBundle) ==
+ getLabelAfterInsn(&*DelaySlotBundle) &&
+ "Call and its successor instruction don't have same label after.");
+ return true;
+ };
+
+ // Emit call site entries for each call or tail call in the function.
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB.instrs()) {
+ // Bundles with a call in them will pass the isCall() test below but do not
+ // have callee operand information, so skip them here. The iterator will
+ // eventually reach the call MI.
+ if (MI.isBundle())
+ continue;
+
+ // Skip instructions which aren't calls. Both calls and tail-calling jump
+ // instructions (e.g. TAILJMPd64) are classified correctly here.
+ if (!MI.isCandidateForCallSiteEntry())
+ continue;
+
+ // Skip instructions marked as frame setup, as they are not interesting to
+ // the user.
+ if (MI.getFlag(MachineInstr::FrameSetup))
+ continue;
+
+ // If the call has a delay slot, bail out unless the bundled delay-slot form
+ // is supported.
+ if (MI.hasDelaySlot() && !delaySlotSupported(MI))
+ return;
+
+ // If this is a direct call, find the callee's subprogram.
+ // In the case of an indirect call find the register that holds
+ // the callee.
+ const MachineOperand &CalleeOp = TII->getCalleeOperand(MI);
+ if (!CalleeOp.isGlobal() &&
+ (!CalleeOp.isReg() || !CalleeOp.getReg().isPhysical()))
+ continue;
+
+ unsigned CallReg = 0;
+ const DISubprogram *CalleeSP = nullptr;
+ const Function *CalleeDecl = nullptr;
+ if (CalleeOp.isReg()) {
+ CallReg = CalleeOp.getReg();
+ if (!CallReg)
+ continue;
+ } else {
+ CalleeDecl = dyn_cast<Function>(CalleeOp.getGlobal());
+ if (!CalleeDecl || !CalleeDecl->getSubprogram())
+ continue;
+ CalleeSP = CalleeDecl->getSubprogram();
+ }
+
+ // TODO: Omit call site entries for runtime calls (objc_msgSend, etc).
+
+ bool IsTail = TII->isTailCall(MI);
+
+ // If MI is in a bundle, the label was created after the bundle since
+ // EmitFunctionBody iterates over top-level MIs. Get that top-level MI
+ // to search for that label below.
+ const MachineInstr *TopLevelCallMI =
+ MI.isInsideBundle() ? &*getBundleStart(MI.getIterator()) : &MI;
+
+ // For non-tail calls, the return PC is needed to disambiguate paths in
+ // the call graph which could lead to some target function. For tail
+ // calls, no return PC information is needed, unless tuning for GDB in
+ // DWARF4 mode in which case we fake a return PC for compatibility.
+ const MCSymbol *PCAddr =
+ (!IsTail || CU.useGNUAnalogForDwarf5Feature())
+ ? const_cast<MCSymbol *>(getLabelAfterInsn(TopLevelCallMI))
+ : nullptr;
+
+ // For tail calls, it's necessary to record the address of the branch
+ // instruction so that the debugger can show where the tail call occurred.
+ const MCSymbol *CallAddr =
+ IsTail ? getLabelBeforeInsn(TopLevelCallMI) : nullptr;
+
+ assert((IsTail || PCAddr) && "Non-tail call without return PC");
+
+ LLVM_DEBUG(dbgs() << "CallSiteEntry: " << MF.getName() << " -> "
+ << (CalleeDecl ? CalleeDecl->getName()
+ : StringRef(MF.getSubtarget()
+ .getRegisterInfo()
+ ->getName(CallReg)))
+ << (IsTail ? " [IsTail]" : "") << "\n");
+
+ DIE &CallSiteDIE = CU.constructCallSiteEntryDIE(
+ ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg);
+
+ // Optionally emit call-site-param debug info.
+ if (emitDebugEntryValues()) {
+ ParamSet Params;
+ // Try to interpret values of call site parameters.
+ collectCallSiteParameters(&MI, Params);
+ CU.constructCallSiteParmEntryDIEs(CallSiteDIE, Params);
+ }
+ }
+ }
+}
+
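+ // Add the DW_AT_GNU_pubnames flag to \p D when the unit \p U emits
+ // GNU-style pubnames/pubtypes sections.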
+void DwarfDebug::addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const {
+ if (!U.hasDwarfPubSections())
+ return;
+
+ U.addFlag(D, dwarf::DW_AT_GNU_pubnames);
+}
+
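+ // Populate the unit DIE of \p NewCU with the attributes derived from
+ // \p DIUnit: producer, language, name, sysroot/SDK, statement list,
+ // compilation directory, Apple extensions, and DWO id/name when present.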
+void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit,
+ DwarfCompileUnit &NewCU) {
+ DIE &Die = NewCU.getUnitDie();
+ StringRef FN = DIUnit->getFilename();
+
+ StringRef Producer = DIUnit->getProducer();
+ StringRef Flags = DIUnit->getFlags();
+ if (!Flags.empty() && !useAppleExtensionAttributes()) {
+ std::string ProducerWithFlags = Producer.str() + " " + Flags.str();
+ NewCU.addString(Die, dwarf::DW_AT_producer, ProducerWithFlags);
+ } else
+ NewCU.addString(Die, dwarf::DW_AT_producer, Producer);
+
+ NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
+ DIUnit->getSourceLanguage());
+ NewCU.addString(Die, dwarf::DW_AT_name, FN);
+ StringRef SysRoot = DIUnit->getSysRoot();
+ if (!SysRoot.empty())
+ NewCU.addString(Die, dwarf::DW_AT_LLVM_sysroot, SysRoot);
+ StringRef SDK = DIUnit->getSDK();
+ if (!SDK.empty())
+ NewCU.addString(Die, dwarf::DW_AT_APPLE_sdk, SDK);
+
+ if (!useSplitDwarf()) {
+ // Add DW_AT_str_offsets_base to the unit DIE, except for split units.
+ if (useSegmentedStringOffsetsTable())
+ NewCU.addStringOffsetsStart();
+
+ NewCU.initStmtList();
+
+ // If we're using split dwarf the compilation dir is going to be in the
+ // skeleton CU and so we don't need to duplicate it here.
+ if (!CompilationDir.empty())
+ NewCU.addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+ addGnuPubAttributes(NewCU, Die);
+ }
+
+ if (useAppleExtensionAttributes()) {
+ if (DIUnit->isOptimized())
+ NewCU.addFlag(Die, dwarf::DW_AT_APPLE_optimized);
+
+ StringRef Flags = DIUnit->getFlags();
+ if (!Flags.empty())
+ NewCU.addString(Die, dwarf::DW_AT_APPLE_flags, Flags);
+
+ if (unsigned RVer = DIUnit->getRuntimeVersion())
+ NewCU.addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
+ dwarf::DW_FORM_data1, RVer);
+ }
+
+ if (DIUnit->getDWOId()) {
+ // This CU is either a clang module DWO or a skeleton CU.
+ NewCU.addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8,
+ DIUnit->getDWOId());
+ if (!DIUnit->getSplitDebugFilename().empty()) {
+ // This is a prefabricated skeleton CU.
+ dwarf::Attribute attrDWOName = getDwarfVersion() >= 5
+ ? dwarf::DW_AT_dwo_name
+ : dwarf::DW_AT_GNU_dwo_name;
+ NewCU.addString(Die, attrDWOName, DIUnit->getSplitDebugFilename());
+ }
+ }
+}
+
+ // Create a new DwarfCompileUnit for the given metadata node with tag
+ // DW_TAG_compile_unit.
+DwarfCompileUnit &
+DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
+ if (auto *CU = CUMap.lookup(DIUnit))
+ return *CU;
+
+ if (useSplitDwarf() &&
+ !shareAcrossDWOCUs() &&
+ (!DIUnit->getSplitDebugInlining() ||
+ DIUnit->getEmissionKind() == DICompileUnit::FullDebug) &&
+ !CUMap.empty()) {
+ return *CUMap.begin()->second;
+ }
+ CompilationDir = DIUnit->getDirectory();
+
+ auto OwnedUnit = std::make_unique<DwarfCompileUnit>(
+ InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder);
+ DwarfCompileUnit &NewCU = *OwnedUnit;
+ InfoHolder.addUnit(std::move(OwnedUnit));
+
+ // LTO with assembly output shares a single line table amongst multiple CUs.
+ // To avoid the compilation directory being ambiguous, let the line table
+ // explicitly describe the directory of all files, never relying on the
+ // compilation directory.
+ if (!Asm->OutStreamer->hasRawTextSupport() || SingleCU)
+ Asm->OutStreamer->emitDwarfFile0Directive(
+ CompilationDir, DIUnit->getFilename(), getMD5AsBytes(DIUnit->getFile()),
+ DIUnit->getSource(), NewCU.getUniqueID());
+
+ if (useSplitDwarf()) {
+ NewCU.setSkeleton(constructSkeletonCU(NewCU));
+ NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoDWOSection());
+ } else {
+ finishUnitAttributes(DIUnit, NewCU);
+ NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection());
+ }
+
+ CUMap.insert({DIUnit, &NewCU});
+ CUDieMap.insert({&NewCU.getUnitDie(), &NewCU});
+ return NewCU;
+}
+
+/// Sort and unique GVEs by comparing their fragment offset.
+static SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &
+sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) {
+ llvm::sort(
+ GVEs, [](DwarfCompileUnit::GlobalExpr A, DwarfCompileUnit::GlobalExpr B) {
+ // Sort order: first null exprs, then exprs without fragment
+ // info, then sort by fragment offset in bits.
+ // FIXME: Come up with a more comprehensive comparator so
+ // the sorting isn't non-deterministic, and so the following
+ // std::unique call works correctly.
+ if (!A.Expr || !B.Expr)
+ return !!B.Expr;
+ auto FragmentA = A.Expr->getFragmentInfo();
+ auto FragmentB = B.Expr->getFragmentInfo();
+ if (!FragmentA || !FragmentB)
+ return !!FragmentB;
+ return FragmentA->OffsetInBits < FragmentB->OffsetInBits;
+ });
+ GVEs.erase(std::unique(GVEs.begin(), GVEs.end(),
+ [](DwarfCompileUnit::GlobalExpr A,
+ DwarfCompileUnit::GlobalExpr B) {
+ return A.Expr == B.Expr;
+ }),
+ GVEs.end());
+ return GVEs;
+}
+
+// Emit all Dwarf sections that should come prior to the content. Create
+// global DIEs and emit initial debug info sections. This is invoked by
+// the target AsmPrinter.
+void DwarfDebug::beginModule(Module *M) {
+ DebugHandlerBase::beginModule(M);
+
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
+ M->debug_compile_units_end());
+ assert(NumDebugCUs > 0 && "Asm unexpectedly initialized");
+ assert(MMI->hasDebugInfo() &&
+ "DebugInfoAvailabilty unexpectedly not initialized");
+ SingleCU = NumDebugCUs == 1;
+ DenseMap<DIGlobalVariable *, SmallVector<DwarfCompileUnit::GlobalExpr, 1>>
+ GVMap;
+ for (const GlobalVariable &Global : M->globals()) {
+ SmallVector<DIGlobalVariableExpression *, 1> GVs;
+ Global.getDebugInfo(GVs);
+ for (auto *GVE : GVs)
+ GVMap[GVE->getVariable()].push_back({&Global, GVE->getExpression()});
+ }
+
+ // Create the symbol that designates the start of the unit's contribution
+ // to the string offsets table. In a split DWARF scenario, only the skeleton
+ // unit has the DW_AT_str_offsets_base attribute (and hence needs the symbol).
+ if (useSegmentedStringOffsetsTable())
+ (useSplitDwarf() ? SkeletonHolder : InfoHolder)
+ .setStringOffsetsStartSym(Asm->createTempSymbol("str_offsets_base"));
+
+ // Create the symbols that designate the start of the DWARF v5 range list
+ // and location list tables. They are located past the table headers.
+ if (getDwarfVersion() >= 5) {
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ Holder.setRnglistsTableBaseSym(
+ Asm->createTempSymbol("rnglists_table_base"));
+
+ if (useSplitDwarf())
+ InfoHolder.setRnglistsTableBaseSym(
+ Asm->createTempSymbol("rnglists_dwo_table_base"));
+ }
+
+ // Create the symbol that points to the first entry following the debug
+ // address table (.debug_addr) header.
+ AddrPool.setLabel(Asm->createTempSymbol("addr_table_base"));
+ DebugLocs.setSym(Asm->createTempSymbol("loclists_table_base"));
+
+ for (DICompileUnit *CUNode : M->debug_compile_units()) {
+ if (CUNode->getImportedEntities().empty() &&
+ CUNode->getEnumTypes().empty() && CUNode->getRetainedTypes().empty() &&
+ CUNode->getGlobalVariables().empty() && CUNode->getMacros().empty())
+ continue;
+
+ DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(CUNode);
+
+ // Global Variables.
+ for (auto *GVE : CUNode->getGlobalVariables()) {
+ // Don't bother adding DIGlobalVariableExpressions listed in the CU if we
+ // already know about the variable and it isn't adding a constant
+ // expression.
+ auto &GVMapEntry = GVMap[GVE->getVariable()];
+ auto *Expr = GVE->getExpression();
+ if (!GVMapEntry.size() || (Expr && Expr->isConstant()))
+ GVMapEntry.push_back({nullptr, Expr});
+ }
+
+ DenseSet<DIGlobalVariable *> Processed;
+ for (auto *GVE : CUNode->getGlobalVariables()) {
+ DIGlobalVariable *GV = GVE->getVariable();
+ if (Processed.insert(GV).second)
+ CU.getOrCreateGlobalVariableDIE(GV, sortGlobalExprs(GVMap[GV]));
+ }
+
+ for (auto *Ty : CUNode->getEnumTypes())
+ CU.getOrCreateTypeDIE(cast<DIType>(Ty));
+
+ for (auto *Ty : CUNode->getRetainedTypes()) {
+ // The retained types array by design contains pointers to
+ // MDNodes rather than DIRefs. Unique them here.
+ if (DIType *RT = dyn_cast<DIType>(Ty))
+ // There is no point in force-emitting a forward declaration.
+ CU.getOrCreateTypeDIE(RT);
+ }
+ }
+}
+
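+ // Finish the DIE definitions of all concrete variables and labels collected
+ // for this module by dispatching to their owning compile units.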
+void DwarfDebug::finishEntityDefinitions() {
+ for (const auto &Entity : ConcreteEntities) {
+ DIE *Die = Entity->getDIE();
+ assert(Die);
+ // FIXME: Consider the time-space tradeoff of just storing the unit pointer
+ // in the ConcreteEntities list, rather than looking it up again here.
+ // DIE::getUnit isn't simple - it walks parent pointers, etc.
+ DwarfCompileUnit *Unit = CUDieMap.lookup(Die->getUnitDie());
+ assert(Unit);
+ Unit->finishEntityDefinition(Entity.get());
+ }
+}
+
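+ // Finish subprogram DIE definitions for all processed subprograms, in both
+ // the main and skeleton units when split DWARF is in use.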
+void DwarfDebug::finishSubprogramDefinitions() {
+ for (const DISubprogram *SP : ProcessedSPNodes) {
+ assert(SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug);
+ forBothCUs(
+ getOrCreateDwarfCompileUnit(SP->getUnit()),
+ [&](DwarfCompileUnit &CU) { CU.finishSubprogramDefinition(SP); });
+ }
+}
+
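+ // Perform per-unit finalization after all DIEs have been created: attach
+ // unit-level attributes (DWO ids, ranges, table bases, macros) and compute
+ // DIE offsets and sizes.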
+void DwarfDebug::finalizeModuleInfo() {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+
+ finishSubprogramDefinitions();
+
+ finishEntityDefinitions();
+
+ // Include the DWO file name in the hash if there's more than one CU.
+ // This handles ThinLTO's situation where imported CUs may very easily be
+ // duplicates, with the same CU partially imported into another ThinLTO unit.
+ StringRef DWOName;
+ if (CUMap.size() > 1)
+ DWOName = Asm->TM.Options.MCOptions.SplitDwarfFile;
+
+ bool HasEmittedSplitCU = false;
+
+ // Handle anything that needs to be done on a per-unit basis after
+ // all other generation.
+ for (const auto &P : CUMap) {
+ auto &TheCU = *P.second;
+ if (TheCU.getCUNode()->isDebugDirectivesOnly())
+ continue;
+ // Emit DW_AT_containing_type attribute to connect types with their
+ // vtable holding type.
+ TheCU.constructContainingTypeDIEs();
+
+ // Add CU specific attributes if we need to add any.
+ // If we're splitting the dwarf out now that we've got the entire
+ // CU then add the dwo id to it.
+ auto *SkCU = TheCU.getSkeleton();
+
+ bool HasSplitUnit = SkCU && !TheCU.getUnitDie().children().empty();
+
+ if (HasSplitUnit) {
+ (void)HasEmittedSplitCU;
+ assert((shareAcrossDWOCUs() || !HasEmittedSplitCU) &&
+ "Multiple CUs emitted into a single dwo file");
+ HasEmittedSplitCU = true;
+ dwarf::Attribute attrDWOName = getDwarfVersion() >= 5
+ ? dwarf::DW_AT_dwo_name
+ : dwarf::DW_AT_GNU_dwo_name;
+ finishUnitAttributes(TheCU.getCUNode(), TheCU);
+ TheCU.addString(TheCU.getUnitDie(), attrDWOName,
+ Asm->TM.Options.MCOptions.SplitDwarfFile);
+ SkCU->addString(SkCU->getUnitDie(), attrDWOName,
+ Asm->TM.Options.MCOptions.SplitDwarfFile);
+ // Emit a unique identifier for this CU.
+ uint64_t ID =
+ DIEHash(Asm, &TheCU).computeCUSignature(DWOName, TheCU.getUnitDie());
+ if (getDwarfVersion() >= 5) {
+ TheCU.setDWOId(ID);
+ SkCU->setDWOId(ID);
+ } else {
+ TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
+ dwarf::DW_FORM_data8, ID);
+ SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
+ dwarf::DW_FORM_data8, ID);
+ }
+
+ if (getDwarfVersion() < 5 && !SkeletonHolder.getRangeLists().empty()) {
+ const MCSymbol *Sym = TLOF.getDwarfRangesSection()->getBeginSymbol();
+ SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base,
+ Sym, Sym);
+ }
+ } else if (SkCU) {
+ finishUnitAttributes(SkCU->getCUNode(), *SkCU);
+ }
+
+ // If we have code split among multiple sections or non-contiguous
+ // ranges of code then emit a DW_AT_ranges attribute on the unit that will
+ // remain in the .o file, otherwise add a DW_AT_low_pc.
+ // FIXME: We should use ranges to allow reordering of code, a la
+ // .subsections_via_symbols in Mach-O. This would mean turning on
+ // ranges for all subprogram DIEs for Mach-O.
+ DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
+
+ if (unsigned NumRanges = TheCU.getRanges().size()) {
+ if (NumRanges > 1 && useRangesSection())
+ // A DW_AT_low_pc attribute may also be specified in combination with
+ // DW_AT_ranges to specify the default base address for use in
+ // location lists (see Section 2.6.2) and range lists (see Section
+ // 2.17.3).
+ U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
+ else
+ U.setBaseAddress(TheCU.getRanges().front().Begin);
+ U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
+ }
+
+ // We don't keep track of which addresses are used in which CU so this
+ // is a bit pessimistic under LTO.
+ if ((HasSplitUnit || getDwarfVersion() >= 5) && !AddrPool.isEmpty())
+ U.addAddrTableBase();
+
+ if (getDwarfVersion() >= 5) {
+ if (U.hasRangeLists())
+ U.addRnglistsBase();
+
+ if (!DebugLocs.getLists().empty() && !useSplitDwarf()) {
+ U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_loclists_base,
+ DebugLocs.getSym(),
+ TLOF.getDwarfLoclistsSection()->getBeginSymbol());
+ }
+ }
+
+ auto *CUNode = cast<DICompileUnit>(P.first);
+ // If the compile unit has macros, emit the DW_AT_macro_info/DW_AT_macros
+ // attribute.
+ if (CUNode->getMacros()) {
+ if (UseDebugMacroSection) {
+ if (useSplitDwarf())
+ TheCU.addSectionDelta(
+ TheCU.getUnitDie(), dwarf::DW_AT_macros, U.getMacroLabelBegin(),
+ TLOF.getDwarfMacroDWOSection()->getBeginSymbol());
+ else {
+ dwarf::Attribute MacrosAttr = getDwarfVersion() >= 5
+ ? dwarf::DW_AT_macros
+ : dwarf::DW_AT_GNU_macros;
+ U.addSectionLabel(U.getUnitDie(), MacrosAttr, U.getMacroLabelBegin(),
+ TLOF.getDwarfMacroSection()->getBeginSymbol());
+ }
+ } else {
+ if (useSplitDwarf())
+ TheCU.addSectionDelta(
+ TheCU.getUnitDie(), dwarf::DW_AT_macro_info,
+ U.getMacroLabelBegin(),
+ TLOF.getDwarfMacinfoDWOSection()->getBeginSymbol());
+ else
+ U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_macro_info,
+ U.getMacroLabelBegin(),
+ TLOF.getDwarfMacinfoSection()->getBeginSymbol());
+ }
+ }
+ }
+
+ // Emit all frontend-produced Skeleton CUs, i.e., Clang modules.
+ for (auto *CUNode : MMI->getModule()->debug_compile_units())
+ if (CUNode->getDWOId())
+ getOrCreateDwarfCompileUnit(CUNode);
+
+ // Compute DIE offsets and sizes.
+ InfoHolder.computeSizeAndOffsets();
+ if (useSplitDwarf())
+ SkeletonHolder.computeSizeAndOffsets();
+}
+
+// Emit all Dwarf sections that should come after the content.
+void DwarfDebug::endModule() {
+ // Terminate the pending line table.
+ if (PrevCU)
+ terminateLineTable(PrevCU);
+ PrevCU = nullptr;
+ assert(CurFn == nullptr);
+ assert(CurMI == nullptr);
+
+ for (const auto &P : CUMap) {
+ const auto *CUNode = cast<DICompileUnit>(P.first);
+ DwarfCompileUnit *CU = &*P.second;
+
+ // Emit imported entities.
+ for (auto *IE : CUNode->getImportedEntities()) {
+ assert(!isa_and_nonnull<DILocalScope>(IE->getScope()) &&
+ "Unexpected function-local entity in 'imports' CU field.");
+ CU->getOrCreateImportedEntityDIE(IE);
+ }
+ for (const auto *D : CU->getDeferredLocalDecls()) {
+ if (auto *IE = dyn_cast<DIImportedEntity>(D))
+ CU->getOrCreateImportedEntityDIE(IE);
+ else
+ llvm_unreachable("Unexpected local retained node!");
+ }
+
+ // Emit base types.
+ CU->createBaseTypeDIEs();
+ }
+
+ // If we aren't actually generating debug info (beginModule checks this,
+ // conditionalized on the presence of the llvm.dbg.cu metadata node), there
+ // is nothing more to do.
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ // Finalize the debug info for the module.
+ finalizeModuleInfo();
+
+ if (useSplitDwarf())
+ // Emit debug_loc.dwo/debug_loclists.dwo section.
+ emitDebugLocDWO();
+ else
+ // Emit debug_loc/debug_loclists section.
+ emitDebugLoc();
+
+ // Emit the corresponding abbreviations into an abbrev section.
+ emitAbbreviations();
+
+ // Emit all the DIEs into a debug info section.
+ emitDebugInfo();
+
+ // Emit info into a debug aranges section.
+ if (GenerateARangeSection)
+ emitDebugARanges();
+
+ // Emit info into a debug ranges section.
+ emitDebugRanges();
+
+ if (useSplitDwarf())
+ // Emit info into a debug macinfo.dwo section.
+ emitDebugMacinfoDWO();
+ else
+ // Emit info into a debug macinfo/macro section.
+ emitDebugMacinfo();
+
+ emitDebugStr();
+
+ if (useSplitDwarf()) {
+ emitDebugStrDWO();
+ emitDebugInfoDWO();
+ emitDebugAbbrevDWO();
+ emitDebugLineDWO();
+ emitDebugRangesDWO();
+ }
+
+ emitDebugAddr();
+
+ // Emit info into the dwarf accelerator table sections.
+ switch (getAccelTableKind()) {
+ case AccelTableKind::Apple:
+ emitAccelNames();
+ emitAccelObjC();
+ emitAccelNamespaces();
+ emitAccelTypes();
+ break;
+ case AccelTableKind::Dwarf:
+ emitAccelDebugNames();
+ break;
+ case AccelTableKind::None:
+ break;
+ case AccelTableKind::Default:
+ llvm_unreachable("Default should have already been resolved.");
+ }
+
+ // Emit the pubnames and pubtypes sections if requested.
+ emitDebugPubSections();
+
+ // clean up.
+ // FIXME: AbstractVariables.clear();
+}
+
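+ // Create an abstract entity for \p Node in \p CU if one does not exist yet
+ // and \p ScopeNode corresponds to a known abstract scope.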
+void DwarfDebug::ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU,
+ const DINode *Node, const MDNode *ScopeNode) {
+ if (CU.getExistingAbstractEntity(Node))
+ return;
+
+ if (LexicalScope *Scope =
+ LScopes.findAbstractScope(cast_or_null<DILocalScope>(ScopeNode)))
+ CU.createAbstractEntity(Node, Scope);
+}
+
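+ /// Return the local scope that the retained node \p N (a local variable,
+ /// label, or imported entity) belongs to.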
+static const DILocalScope *getRetainedNodeScope(const MDNode *N) {
+ const DIScope *S;
+ if (const auto *LV = dyn_cast<DILocalVariable>(N))
+ S = LV->getScope();
+ else if (const auto *L = dyn_cast<DILabel>(N))
+ S = L->getScope();
+ else if (const auto *IE = dyn_cast<DIImportedEntity>(N))
+ S = IE->getScope();
+ else
+ llvm_unreachable("Unexpected retained node!");
+
+ // Ensure the scope is not a DILexicalBlockFile.
+ return cast<DILocalScope>(S)->getNonLexicalBlockFileScope();
+}
+
+// Collect variable information from side table maintained by MF.
+void DwarfDebug::collectVariableInfoFromMFTable(
+ DwarfCompileUnit &TheCU, DenseSet<InlinedEntity> &Processed) {
+ SmallDenseMap<InlinedEntity, DbgVariable *> MFVars;
+ LLVM_DEBUG(dbgs() << "DwarfDebug: collecting variables from MF side table\n");
+ for (const auto &VI : Asm->MF->getVariableDbgInfo()) {
+ if (!VI.Var)
+ continue;
+ assert(VI.Var->isValidLocationForIntrinsic(VI.Loc) &&
+ "Expected inlined-at fields to agree");
+
+ InlinedEntity Var(VI.Var, VI.Loc->getInlinedAt());
+ Processed.insert(Var);
+ LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc);
+
+ // If variable scope is not found then skip this variable.
+ if (!Scope) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << VI.Var->getName()
+ << ", no variable scope found\n");
+ continue;
+ }
+
+ ensureAbstractEntityIsCreatedIfScoped(TheCU, Var.first, Scope->getScopeNode());
+ auto RegVar = std::make_unique<DbgVariable>(
+ cast<DILocalVariable>(Var.first), Var.second);
+ if (VI.inStackSlot())
+ RegVar->initializeMMI(VI.Expr, VI.getStackSlot());
+ else {
+ MachineLocation MLoc(VI.getEntryValueRegister(), /*IsIndirect*/ true);
+ auto LocEntry = DbgValueLocEntry(MLoc);
+ RegVar->initializeDbgValue(DbgValueLoc(VI.Expr, LocEntry));
+ }
+ LLVM_DEBUG(dbgs() << "Created DbgVariable for " << VI.Var->getName()
+ << "\n");
+
+ if (DbgVariable *DbgVar = MFVars.lookup(Var)) {
+ if (DbgVar->getValueLoc())
+ LLVM_DEBUG(dbgs() << "Dropping repeated entry value debug info for "
+ "variable "
+ << VI.Var->getName() << "\n");
+ else
+ DbgVar->addMMIEntry(*RegVar);
+ } else if (InfoHolder.addScopeVariable(Scope, RegVar.get())) {
+ MFVars.insert({Var, RegVar.get()});
+ ConcreteEntities.push_back(std::move(RegVar));
+ }
+ }
+}
+
+/// Determine whether a *singular* DBG_VALUE is valid for the entirety of its
+/// enclosing lexical scope. The check ensures there are no other instructions
+/// in the same lexical scope preceding the DBG_VALUE and that its range is
+/// either open or otherwise rolls off the end of the scope.
+static bool validThroughout(LexicalScopes &LScopes,
+ const MachineInstr *DbgValue,
+ const MachineInstr *RangeEnd,
+ const InstructionOrdering &Ordering) {
+ assert(DbgValue->getDebugLoc() && "DBG_VALUE without a debug location");
+ auto MBB = DbgValue->getParent();
+ auto DL = DbgValue->getDebugLoc();
+ auto *LScope = LScopes.findLexicalScope(DL);
+ // Scope doesn't exist; this is a dead DBG_VALUE.
+ if (!LScope)
+ return false;
+ auto &LSRange = LScope->getRanges();
+ if (LSRange.size() == 0)
+ return false;
+
+ const MachineInstr *LScopeBegin = LSRange.front().first;
+ // If the scope starts before the DBG_VALUE then we may have a negative
+ // result. Otherwise the location is live coming into the scope and we
+ // can skip the following checks.
+ if (!Ordering.isBefore(DbgValue, LScopeBegin)) {
+ // Exit if the lexical scope begins outside of the current block.
+ if (LScopeBegin->getParent() != MBB)
+ return false;
+
+ MachineBasicBlock::const_reverse_iterator Pred(DbgValue);
+ for (++Pred; Pred != MBB->rend(); ++Pred) {
+ if (Pred->getFlag(MachineInstr::FrameSetup))
+ break;
+ auto PredDL = Pred->getDebugLoc();
+ if (!PredDL || Pred->isMetaInstruction())
+ continue;
+ // Check whether the instruction preceding the DBG_VALUE is in the same
+ // (sub)scope as the DBG_VALUE.
+ if (DL->getScope() == PredDL->getScope())
+ return false;
+ auto *PredScope = LScopes.findLexicalScope(PredDL);
+ if (!PredScope || LScope->dominates(PredScope))
+ return false;
+ }
+ }
+
+ // If the range of the DBG_VALUE is open-ended, report success.
+ if (!RangeEnd)
+ return true;
+
+ // Single, constant DBG_VALUEs in the prologue are promoted to be live
+ // throughout the function. This is a hack, presumably for DWARF v2 and not
+ // necessarily correct. It would be much better to use a dbg.declare instead
+ // if we know the constant is live throughout the scope.
+ if (MBB->pred_empty() &&
+ all_of(DbgValue->debug_operands(),
+ [](const MachineOperand &Op) { return Op.isImm(); }))
+ return true;
+
+ // Test if the location terminates before the end of the scope.
+ const MachineInstr *LScopeEnd = LSRange.back().second;
+ if (Ordering.isBefore(RangeEnd, LScopeEnd))
+ return false;
+
+ // There's a single location which starts at the scope start, and ends at or
+ // after the scope end.
+ return true;
+}
+
+/// Build the location list for all DBG_VALUEs in the function that
+/// describe the same variable. The resulting DebugLocEntries will have
+ /// strictly monotonically increasing begin addresses and will never
+ /// overlap. If the resulting list has only one entry that is valid
+ /// throughout the variable's scope, return true.
+//
+// See the definition of DbgValueHistoryMap::Entry for an explanation of the
+// different kinds of history map entries. One thing to be aware of is that if
+// a debug value is ended by another entry (rather than being valid until the
+// end of the function), that entry's instruction may or may not be included in
+// the range, depending on if the entry is a clobbering entry (it has an
+// instruction that clobbers one or more preceding locations), or if it is an
+// (overlapping) debug value entry. This distinction can be seen in the example
+// below. The first debug value is ended by the clobbering entry 2, and the
+// second and third debug values are ended by the overlapping debug value entry
+// 4.
+//
+// Input:
+//
+// History map entries [type, end index, mi]
+//
+// 0 | [DbgValue, 2, DBG_VALUE $reg0, [...] (fragment 0, 32)]
+// 1 | | [DbgValue, 4, DBG_VALUE $reg1, [...] (fragment 32, 32)]
+// 2 | | [Clobber, $reg0 = [...], -, -]
+// 3 | | [DbgValue, 4, DBG_VALUE 123, [...] (fragment 64, 32)]
+// 4 [DbgValue, ~0, DBG_VALUE @g, [...] (fragment 0, 96)]
+//
+// Output [start, end) [Value...]:
+//
+// [0-1) [(reg0, fragment 0, 32)]
+// [1-3) [(reg0, fragment 0, 32), (reg1, fragment 32, 32)]
+// [3-4) [(reg1, fragment 32, 32), (123, fragment 64, 32)]
+// [4-) [(@g, fragment 0, 96)]
+bool DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
+ const DbgValueHistoryMap::Entries &Entries) {
+ using OpenRange =
+ std::pair<DbgValueHistoryMap::EntryIndex, DbgValueLoc>;
+ SmallVector<OpenRange, 4> OpenRanges;
+ bool isSafeForSingleLocation = true;
+ const MachineInstr *StartDebugMI = nullptr;
+ const MachineInstr *EndMI = nullptr;
+
+ for (auto EB = Entries.begin(), EI = EB, EE = Entries.end(); EI != EE; ++EI) {
+ const MachineInstr *Instr = EI->getInstr();
+
+ // Remove all values that are no longer live.
+ size_t Index = std::distance(EB, EI);
+ erase_if(OpenRanges, [&](OpenRange &R) { return R.first <= Index; });
+
+ // If we are dealing with a clobbering entry, this iteration will result in
+ // a location list entry starting after the clobbering instruction.
+ const MCSymbol *StartLabel =
+ EI->isClobber() ? getLabelAfterInsn(Instr) : getLabelBeforeInsn(Instr);
+ assert(StartLabel &&
+ "Forgot label before/after instruction starting a range!");
+
+ const MCSymbol *EndLabel;
+ if (std::next(EI) == Entries.end()) {
+ const MachineBasicBlock &EndMBB = Asm->MF->back();
+ EndLabel = Asm->MBBSectionRanges[EndMBB.getSectionIDNum()].EndLabel;
+ if (EI->isClobber())
+ EndMI = EI->getInstr();
+ } else if (std::next(EI)->isClobber())
+ EndLabel = getLabelAfterInsn(std::next(EI)->getInstr());
+ else
+ EndLabel = getLabelBeforeInsn(std::next(EI)->getInstr());
+ assert(EndLabel && "Forgot label after instruction ending a range!");
+
+ if (EI->isDbgValue())
+ LLVM_DEBUG(dbgs() << "DotDebugLoc: " << *Instr << "\n");
+
+ // If this history map entry has a debug value, add that to the list of
+ // open ranges and check if its location is valid for a single value
+ // location.
+ if (EI->isDbgValue()) {
+ // Do not add undef debug values, as they are redundant information in
+ // the location list entries. An undef debug value results in an empty
+ // location description. If there are any non-undef fragments then padding
+ // pieces with empty location descriptions will automatically be inserted,
+ // and if all fragments are undef then the whole location list entry is
+ // redundant.
+ if (!Instr->isUndefDebugValue()) {
+ auto Value = getDebugLocValue(Instr);
+ OpenRanges.emplace_back(EI->getEndIndex(), Value);
+
+ // TODO: Add support for single value fragment locations.
+ if (Instr->getDebugExpression()->isFragment())
+ isSafeForSingleLocation = false;
+
+ if (!StartDebugMI)
+ StartDebugMI = Instr;
+ } else {
+ isSafeForSingleLocation = false;
+ }
+ }
+
+ // Location list entries with empty location descriptions are redundant
+ // information in DWARF, so do not emit those.
+ if (OpenRanges.empty())
+ continue;
+
+ // Omit entries with empty ranges as they do not have any effect in DWARF.
+ if (StartLabel == EndLabel) {
+ LLVM_DEBUG(dbgs() << "Omitting location list entry with empty range.\n");
+ continue;
+ }
+
+ SmallVector<DbgValueLoc, 4> Values;
+ for (auto &R : OpenRanges)
+ Values.push_back(R.second);
+
+ // With basic block sections, it is possible that the StartLabel and the
+ // Instr are not in the same section. This happens when the StartLabel is
+ // the function begin label and the dbg value appears in a basic block
+ // that is not the entry. In this case, the range needs to be split to
+ // span each individual section in the range from StartLabel to EndLabel.
+ if (Asm->MF->hasBBSections() && StartLabel == Asm->getFunctionBegin() &&
+ !Instr->getParent()->sameSection(&Asm->MF->front())) {
+ const MCSymbol *BeginSectionLabel = StartLabel;
+
+ for (const MachineBasicBlock &MBB : *Asm->MF) {
+ if (MBB.isBeginSection() && &MBB != &Asm->MF->front())
+ BeginSectionLabel = MBB.getSymbol();
+
+ if (MBB.sameSection(Instr->getParent())) {
+ DebugLoc.emplace_back(BeginSectionLabel, EndLabel, Values);
+ break;
+ }
+ if (MBB.isEndSection())
+ DebugLoc.emplace_back(BeginSectionLabel, MBB.getEndSymbol(), Values);
+ }
+ } else {
+ DebugLoc.emplace_back(StartLabel, EndLabel, Values);
+ }
+
+ // Attempt to coalesce the ranges of two otherwise identical
+ // DebugLocEntries.
+ auto CurEntry = DebugLoc.rbegin();
+ LLVM_DEBUG({
+ dbgs() << CurEntry->getValues().size() << " Values:\n";
+ for (auto &Value : CurEntry->getValues())
+ Value.dump();
+ dbgs() << "-----\n";
+ });
+
+ auto PrevEntry = std::next(CurEntry);
+ if (PrevEntry != DebugLoc.rend() && PrevEntry->MergeRanges(*CurEntry))
+ DebugLoc.pop_back();
+ }
+
+ if (!isSafeForSingleLocation ||
+ !validThroughout(LScopes, StartDebugMI, EndMI, getInstOrdering()))
+ return false;
+
+ if (DebugLoc.size() == 1)
+ return true;
+
+ if (!Asm->MF->hasBBSections())
+ return false;
+
+ // Check here to see if the location list can be merged into a single range.
+ // If not, we must keep the split location lists per section. This does
+ // exactly what MergeRanges does without sections; we don't actually merge
+ // the ranges, as the split ranges must be kept intact if they cannot be
+ // collapsed into a single range.
+ const MachineBasicBlock *RangeMBB = nullptr;
+ if (DebugLoc[0].getBeginSym() == Asm->getFunctionBegin())
+ RangeMBB = &Asm->MF->front();
+ else
+ RangeMBB = Entries.begin()->getInstr()->getParent();
+ auto *CurEntry = DebugLoc.begin();
+ auto *NextEntry = std::next(CurEntry);
+ while (NextEntry != DebugLoc.end()) {
+ // Get the last machine basic block of this section.
+ while (!RangeMBB->isEndSection())
+ RangeMBB = RangeMBB->getNextNode();
+ if (!RangeMBB->getNextNode())
+ return false;
+ // CurEntry should end the current section and NextEntry should start
+ // the next section and the Values must match for these two ranges to be
+ // merged.
+ if (CurEntry->getEndSym() != RangeMBB->getEndSymbol() ||
+ NextEntry->getBeginSym() != RangeMBB->getNextNode()->getSymbol() ||
+ CurEntry->getValues() != NextEntry->getValues())
+ return false;
+ RangeMBB = RangeMBB->getNextNode();
+ CurEntry = NextEntry;
+ NextEntry = std::next(CurEntry);
+ }
+ return true;
+}
+
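+ // Create a concrete DbgVariable or DbgLabel for \p Node in \p Scope, ensure
+ // its abstract counterpart exists in \p TheCU if the scope requires one, and
+ // register the new entity with the info holder.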
+DbgEntity *DwarfDebug::createConcreteEntity(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope,
+ const DINode *Node,
+ const DILocation *Location,
+ const MCSymbol *Sym) {
+ ensureAbstractEntityIsCreatedIfScoped(TheCU, Node, Scope.getScopeNode());
+ if (isa<const DILocalVariable>(Node)) {
+ ConcreteEntities.push_back(
+ std::make_unique<DbgVariable>(cast<const DILocalVariable>(Node),
+ Location));
+ InfoHolder.addScopeVariable(&Scope,
+ cast<DbgVariable>(ConcreteEntities.back().get()));
+ } else if (isa<const DILabel>(Node)) {
+ ConcreteEntities.push_back(
+ std::make_unique<DbgLabel>(cast<const DILabel>(Node),
+ Location, Sym));
+ InfoHolder.addScopeLabel(&Scope,
+ cast<DbgLabel>(ConcreteEntities.back().get()));
+ }
+ return ConcreteEntities.back().get();
+}
+
+// Find variables for each lexical scope.
+void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU,
+ const DISubprogram *SP,
+ DenseSet<InlinedEntity> &Processed) {
+ // Grab the variable info that was squirreled away in the MMI side-table.
+ collectVariableInfoFromMFTable(TheCU, Processed);
+
+ for (const auto &I : DbgValues) {
+ InlinedEntity IV = I.first;
+ if (Processed.count(IV))
+ continue;
+
+ // Instruction ranges, specifying where IV is accessible.
+ const auto &HistoryMapEntries = I.second;
+
+ // Try to find any non-empty variable location. Do not create a concrete
+ // entity if there are no locations.
+ if (!DbgValues.hasNonEmptyLocation(HistoryMapEntries))
+ continue;
+
+ LexicalScope *Scope = nullptr;
+ const DILocalVariable *LocalVar = cast<DILocalVariable>(IV.first);
+ if (const DILocation *IA = IV.second)
+ Scope = LScopes.findInlinedScope(LocalVar->getScope(), IA);
+ else
+ Scope = LScopes.findLexicalScope(LocalVar->getScope());
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ Processed.insert(IV);
+ DbgVariable *RegVar = cast<DbgVariable>(createConcreteEntity(TheCU,
+ *Scope, LocalVar, IV.second));
+
+ const MachineInstr *MInsn = HistoryMapEntries.front().getInstr();
+ assert(MInsn->isDebugValue() && "History must begin with debug value");
+
+ // Check if there is a single DBG_VALUE, valid throughout the var's scope.
+ // If the history map contains a single debug value, there may be an
+ // additional entry which clobbers the debug value.
+ size_t HistSize = HistoryMapEntries.size();
+ bool SingleValueWithClobber =
+ HistSize == 2 && HistoryMapEntries[1].isClobber();
+ if (HistSize == 1 || SingleValueWithClobber) {
+ const auto *End =
+ SingleValueWithClobber ? HistoryMapEntries[1].getInstr() : nullptr;
+ if (validThroughout(LScopes, MInsn, End, getInstOrdering())) {
+ RegVar->initializeDbgValue(MInsn);
+ continue;
+ }
+ }
+
+ // Do not emit location lists if the .debug_loc section is disabled.
+ if (!useLocSection())
+ continue;
+
+ // Handle multiple DBG_VALUE instructions describing one variable.
+ DebugLocStream::ListBuilder List(DebugLocs, TheCU, *Asm, *RegVar, *MInsn);
+
+ // Build the location list for this variable.
+ SmallVector<DebugLocEntry, 8> Entries;
+ bool isValidSingleLocation = buildLocationList(Entries, HistoryMapEntries);
+
+ // Check whether buildLocationList managed to merge all locations to one
+ // that is valid throughout the variable's scope. If so, produce a single
+ // value location.
+ if (isValidSingleLocation) {
+ RegVar->initializeDbgValue(Entries[0].getValues()[0]);
+ continue;
+ }
+
+ // If the variable has a DIBasicType, extract it. Basic types cannot have
+ // unique identifiers, so don't bother resolving the type with the
+ // identifier map.
+ const DIBasicType *BT = dyn_cast<DIBasicType>(
+ static_cast<const Metadata *>(LocalVar->getType()));
+
+ // Finalize the entry by lowering it into a DWARF bytestream.
+ for (auto &Entry : Entries)
+ Entry.finalize(*Asm, List, BT, TheCU);
+ }
+
+ // For each InlinedEntity collected from DBG_LABEL instructions, convert it
+ // to a DWARF-related DbgLabel.
+ for (const auto &I : DbgLabels) {
+ InlinedEntity IL = I.first;
+ const MachineInstr *MI = I.second;
+ if (MI == nullptr)
+ continue;
+
+ LexicalScope *Scope = nullptr;
+ const DILabel *Label = cast<DILabel>(IL.first);
+ // The scope could have an extra lexical block file.
+ const DILocalScope *LocalScope =
+ Label->getScope()->getNonLexicalBlockFileScope();
+ // Get the inlined DILocation if this is an inlined label.
+ if (const DILocation *IA = IL.second)
+ Scope = LScopes.findInlinedScope(LocalScope, IA);
+ else
+ Scope = LScopes.findLexicalScope(LocalScope);
+ // If label scope is not found then skip this label.
+ if (!Scope)
+ continue;
+
+ Processed.insert(IL);
+ // At this point, the temporary label is created.
+ // Save the temporary label in the DbgLabel entity so the actual address can
+ // be retrieved when generating the DWARF DIE.
+ MCSymbol *Sym = getLabelBeforeInsn(MI);
+ createConcreteEntity(TheCU, *Scope, Label, IL.second, Sym);
+ }
+
+ // Collect info for retained nodes.
+ for (const DINode *DN : SP->getRetainedNodes()) {
+ const auto *LS = getRetainedNodeScope(DN);
+ if (isa<DILocalVariable>(DN) || isa<DILabel>(DN)) {
+ if (!Processed.insert(InlinedEntity(DN, nullptr)).second)
+ continue;
+ LexicalScope *LexS = LScopes.findLexicalScope(LS);
+ if (LexS)
+ createConcreteEntity(TheCU, *LexS, DN, nullptr);
+ } else {
+ LocalDeclsPerLS[LS].insert(DN);
+ }
+ }
+}
+
+// Process beginning of an instruction.
+void DwarfDebug::beginInstruction(const MachineInstr *MI) {
+ const MachineFunction &MF = *MI->getMF();
+ const auto *SP = MF.getFunction().getSubprogram();
+ bool NoDebug =
+ !SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug;
+
+ // Delay slot support check.
+ auto delaySlotSupported = [](const MachineInstr &MI) {
+ if (!MI.isBundledWithSucc())
+ return false;
+ auto Suc = std::next(MI.getIterator());
+ (void)Suc;
+ // Ensure that the delay slot instruction is the successor of the call
+ // instruction.
+ // Ex. CALL_INSTRUCTION {
+ // DELAY_SLOT_INSTRUCTION }
+ assert(Suc->isBundledWithPred() &&
+ "Call bundle instructions are out of order");
+ return true;
+ };
+
+ // When describing calls, we need a label for the call instruction.
+ if (!NoDebug && SP->areAllCallsDescribed() &&
+ MI->isCandidateForCallSiteEntry(MachineInstr::AnyInBundle) &&
+ (!MI->hasDelaySlot() || delaySlotSupported(*MI))) {
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ bool IsTail = TII->isTailCall(*MI);
+ // For tail calls, we need the address of the branch instruction for
+ // DW_AT_call_pc.
+ if (IsTail)
+ requestLabelBeforeInsn(MI);
+ // For non-tail calls, we need the return address for the call for
+ // DW_AT_call_return_pc. Under GDB tuning, this information is needed for
+ // tail calls as well.
+ requestLabelAfterInsn(MI);
+ }
+
+ DebugHandlerBase::beginInstruction(MI);
+ if (!CurMI)
+ return;
+
+ if (NoDebug)
+ return;
+
+ // Check if source location changes, but ignore DBG_VALUE and CFI locations.
+ // If the instruction is part of the function frame setup code, do not emit
+ // any line record, as there is no correspondence with any user code.
+ if (MI->isMetaInstruction() || MI->getFlag(MachineInstr::FrameSetup))
+ return;
+ const DebugLoc &DL = MI->getDebugLoc();
+ unsigned Flags = 0;
+
+ if (MI->getFlag(MachineInstr::FrameDestroy) && DL) {
+ const MachineBasicBlock *MBB = MI->getParent();
+ if (MBB && (MBB != EpilogBeginBlock)) {
+ // First time FrameDestroy has been seen in this basic block
+ EpilogBeginBlock = MBB;
+ Flags |= DWARF2_FLAG_EPILOGUE_BEGIN;
+ }
+ }
+
+ // When we emit a line-0 record, we don't update PrevInstLoc; so look at
+ // the last line number actually emitted, to see if it was line 0.
+ unsigned LastAsmLine =
+ Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine();
+
+ bool PrevInstInSameSection =
+ (!PrevInstBB ||
+ PrevInstBB->getSectionIDNum() == MI->getParent()->getSectionIDNum());
+ if (DL == PrevInstLoc && PrevInstInSameSection) {
+ // If we have an ongoing unspecified location, nothing to do here.
+ if (!DL)
+ return;
+ // We have an explicit location, same as the previous location.
+ // But we might be coming back to it after a line 0 record.
+ if ((LastAsmLine == 0 && DL.getLine() != 0) || Flags) {
+ // Reinstate the source location but not marked as a statement.
+ const MDNode *Scope = DL.getScope();
+ recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
+ }
+ return;
+ }
+
+ if (!DL) {
+ // We have an unspecified location, which might want to be line 0.
+ // If we have already emitted a line-0 record, don't repeat it.
+ if (LastAsmLine == 0)
+ return;
+ // If user said Don't Do That, don't do that.
+ if (UnknownLocations == Disable)
+ return;
+ // See if we have a reason to emit a line-0 record now.
+ // Reasons to emit a line-0 record include:
+ // - User asked for it (UnknownLocations).
+ // - Instruction has a label, so it's referenced from somewhere else,
+ // possibly debug information; we want it to have a source location.
+ // - Instruction is at the top of a block; we don't want to inherit the
+ // location from the physically previous (maybe unrelated) block.
+ if (UnknownLocations == Enable || PrevLabel ||
+ (PrevInstBB && PrevInstBB != MI->getParent())) {
+ // Preserve the file and column numbers, if we can, to save space in
+ // the encoded line table.
+ // Do not update PrevInstLoc, it remembers the last non-0 line.
+ const MDNode *Scope = nullptr;
+ unsigned Column = 0;
+ if (PrevInstLoc) {
+ Scope = PrevInstLoc.getScope();
+ Column = PrevInstLoc.getCol();
+ }
+ recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
+ }
+ return;
+ }
+
+ // We have an explicit location, different from the previous location.
+ // Don't repeat a line-0 record, but otherwise emit the new location.
+ // (The new location might be an explicit line 0, which we do emit.)
+ if (DL.getLine() == 0 && LastAsmLine == 0)
+ return;
+ if (DL == PrologEndLoc) {
+ Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT;
+ PrologEndLoc = DebugLoc();
+ }
+ // If the line changed, we call that a new statement; unless we went to
+ // line 0 and came back, in which case it is not a new statement.
+ unsigned OldLine = PrevInstLoc ? PrevInstLoc.getLine() : LastAsmLine;
+ if (DL.getLine() && DL.getLine() != OldLine)
+ Flags |= DWARF2_FLAG_IS_STMT;
+
+ const MDNode *Scope = DL.getScope();
+ recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
+
+ // If we're not at line 0, remember this location.
+ if (DL.getLine())
+ PrevInstLoc = DL;
+}
+
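+ /// Find the location that should receive the prologue_end flag: the first
+ /// non-frame-setup location after the frame setup, preferring a non-zero
+ /// line. Also report whether the prologue is empty.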
+static std::pair<DebugLoc, bool> findPrologueEndLoc(const MachineFunction *MF) {
+ // First known non-DBG_VALUE and non-frame setup location marks
+ // the beginning of the function body.
+ DebugLoc LineZeroLoc;
+ const Function &F = MF->getFunction();
+
+ // Some instructions may be inserted into the prologue after this function
+ // runs, so the prologue must be kept (treated as non-empty) in those cases.
+ bool IsEmptyPrologue =
+ !(F.hasPrologueData() || F.getMetadata(LLVMContext::MD_func_sanitize));
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (!MI.isMetaInstruction()) {
+ if (!MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) {
+ // Scan forward to try to find a non-zero line number. The
+ // prologue_end marks the first breakpoint in the function after the
+ // frame setup, and a compiler-generated line 0 location is not a
+ // meaningful breakpoint. If none is found, return the first
+ // location after the frame setup.
+ if (MI.getDebugLoc().getLine())
+ return std::make_pair(MI.getDebugLoc(), IsEmptyPrologue);
+
+ LineZeroLoc = MI.getDebugLoc();
+ }
+ IsEmptyPrologue = false;
+ }
+ }
+ }
+ return std::make_pair(LineZeroLoc, IsEmptyPrologue);
+}
+
+ /// Register a source line with debug info by emitting a .loc directive for
+ /// the given line, column, scope, and flags.
+static void recordSourceLine(AsmPrinter &Asm, unsigned Line, unsigned Col,
+ const MDNode *S, unsigned Flags, unsigned CUID,
+ uint16_t DwarfVersion,
+ ArrayRef<std::unique_ptr<DwarfCompileUnit>> DCUs) {
+ StringRef Fn;
+ unsigned FileNo = 1;
+ unsigned Discriminator = 0;
+ if (auto *Scope = cast_or_null<DIScope>(S)) {
+ Fn = Scope->getFilename();
+ if (Line != 0 && DwarfVersion >= 4)
+ if (auto *LBF = dyn_cast<DILexicalBlockFile>(Scope))
+ Discriminator = LBF->getDiscriminator();
+
+ FileNo = static_cast<DwarfCompileUnit &>(*DCUs[CUID])
+ .getOrCreateSourceID(Scope->getFile());
+ }
+ Asm.OutStreamer->emitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
+ Discriminator, Fn);
+}
+
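+ // Emit the function's scope line as the initial .loc directive unless the
+ // prologue is empty, and return the location where prologue_end will be set.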
+DebugLoc DwarfDebug::emitInitialLocDirective(const MachineFunction &MF,
+ unsigned CUID) {
+ std::pair<DebugLoc, bool> PrologEnd = findPrologueEndLoc(&MF);
+ DebugLoc PrologEndLoc = PrologEnd.first;
+ bool IsEmptyPrologue = PrologEnd.second;
+
+ // Get beginning of function.
+ if (PrologEndLoc) {
+ // If the prologue is empty, there is no need to generate a scope line for
+ // the function.
+ if (IsEmptyPrologue)
+ return PrologEndLoc;
+
+ // Ensure the compile unit is created if the function is called before
+ // beginFunction().
+ (void)getOrCreateDwarfCompileUnit(
+ MF.getFunction().getSubprogram()->getUnit());
+ // We'd like to list the prologue as "not statements" but GDB behaves
+ // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
+ const DISubprogram *SP = PrologEndLoc->getInlinedAtScope()->getSubprogram();
+ ::recordSourceLine(*Asm, SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT,
+ CUID, getDwarfVersion(), getUnits());
+ return PrologEndLoc;
+ }
+ return DebugLoc();
+}
+
+// Gather pre-function debug information. Assumes it is called immediately
+// after the function entry point has been emitted.
+void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
+ CurFn = MF;
+
+ auto *SP = MF->getFunction().getSubprogram();
+ assert(LScopes.empty() || SP == LScopes.getCurrentFunctionScope()->getScopeNode());
+ if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug)
+ return;
+
+ DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
+
+ Asm->OutStreamer->getContext().setDwarfCompileUnitID(
+ getDwarfCompileUnitIDForLineTable(CU));
+
+ // Record beginning of function.
+ PrologEndLoc = emitInitialLocDirective(
+ *MF, Asm->OutStreamer->getContext().getDwarfCompileUnitID());
+}
+
+unsigned
+DwarfDebug::getDwarfCompileUnitIDForLineTable(const DwarfCompileUnit &CU) {
+ // Set DwarfCompileUnitID in MCContext to the compile unit this function
+ // belongs to so that we add to the correct per-CU line table in the
+ // non-asm case.
+ if (Asm->OutStreamer->hasRawTextSupport())
+ // Use a single line table if we are generating assembly.
+ return 0;
+ else
+ return CU.getUniqueID();
+}
+
+void DwarfDebug::terminateLineTable(const DwarfCompileUnit *CU) {
+ const auto &CURanges = CU->getRanges();
+ auto &LineTable = Asm->OutStreamer->getContext().getMCDwarfLineTable(
+ getDwarfCompileUnitIDForLineTable(*CU));
+ // Add the last range label for the given CU.
+ LineTable.getMCLineSections().addEndEntry(
+ const_cast<MCSymbol *>(CURanges.back().End));
+}
+
+void DwarfDebug::skippedNonDebugFunction() {
+ // If we don't have a subprogram for this function then there will be a hole
+ // in the range information. Keep note of this by clearing PrevCU, after
+ // first terminating its pending line table.
+ if (PrevCU)
+ terminateLineTable(PrevCU);
+ PrevCU = nullptr;
+ CurFn = nullptr;
+}
+
+// Gather and emit post-function debug information.
+void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
+ const DISubprogram *SP = MF->getFunction().getSubprogram();
+
+ assert(CurFn == MF &&
+ "endFunction should be called with the same function as beginFunction");
+
+ // Set DwarfCompileUnitID in MCContext back to its default value.
+ Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
+
+ LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+ assert(!FnScope || SP == FnScope->getScopeNode());
+ DwarfCompileUnit &TheCU = getOrCreateDwarfCompileUnit(SP->getUnit());
+ if (TheCU.getCUNode()->isDebugDirectivesOnly()) {
+ PrevLabel = nullptr;
+ CurFn = nullptr;
+ return;
+ }
+
+ DenseSet<InlinedEntity> Processed;
+ collectEntityInfo(TheCU, SP, Processed);
+
+ // Add the range of this function to the list of ranges for the CU.
+ // With basic block sections, add ranges for all basic block sections.
+ for (const auto &R : Asm->MBBSectionRanges)
+ TheCU.addRange({R.second.BeginLabel, R.second.EndLabel});
+
+ // Under -gmlt, skip building the subprogram if there are no inlined
+ // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram
+ // is still needed as we need its source location.
+ if (!TheCU.getCUNode()->getDebugInfoForProfiling() &&
+ TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
+ LScopes.getAbstractScopesList().empty() && !IsDarwin) {
+ for (const auto &R : Asm->MBBSectionRanges)
+ addArangeLabel(SymbolCU(&TheCU, R.second.BeginLabel));
+
+ assert(InfoHolder.getScopeVariables().empty());
+ PrevLabel = nullptr;
+ CurFn = nullptr;
+ return;
+ }
+
+#ifndef NDEBUG
+ size_t NumAbstractSubprograms = LScopes.getAbstractScopesList().size();
+#endif
+ for (LexicalScope *AScope : LScopes.getAbstractScopesList()) {
+ const auto *SP = cast<DISubprogram>(AScope->getScopeNode());
+ for (const DINode *DN : SP->getRetainedNodes()) {
+ const auto *LS = getRetainedNodeScope(DN);
+ // Ensure LexicalScope is created for the scope of this node.
+ auto *LexS = LScopes.getOrCreateAbstractScope(LS);
+ assert(LexS && "Expected the LexicalScope to be created.");
+ if (isa<DILocalVariable>(DN) || isa<DILabel>(DN)) {
+ // Collect info for variables/labels that were optimized out.
+ if (!Processed.insert(InlinedEntity(DN, nullptr)).second ||
+ TheCU.getExistingAbstractEntity(DN))
+ continue;
+ TheCU.createAbstractEntity(DN, LexS);
+ } else {
+ // Remember the node if this is a local declaration.
+ LocalDeclsPerLS[LS].insert(DN);
+ }
+ assert(
+ LScopes.getAbstractScopesList().size() == NumAbstractSubprograms &&
+ "getOrCreateAbstractScope() inserted an abstract subprogram scope");
+ }
+ constructAbstractSubprogramScopeDIE(TheCU, AScope);
+ }
+
+ ProcessedSPNodes.insert(SP);
+ DIE &ScopeDIE = TheCU.constructSubprogramScopeDIE(SP, FnScope);
+ if (auto *SkelCU = TheCU.getSkeleton())
+ if (!LScopes.getAbstractScopesList().empty() &&
+ TheCU.getCUNode()->getSplitDebugInlining())
+ SkelCU->constructSubprogramScopeDIE(SP, FnScope);
+
+ // Construct call site entries.
+ constructCallSiteEntryDIEs(*SP, TheCU, ScopeDIE, *MF);
+
+ // Clear debug info
+ // Ownership of DbgVariables is a bit subtle - ScopeVariables owns all the
+ // DbgVariables except those that are also in AbstractVariables (since they
+ // can be used cross-function)
+ InfoHolder.getScopeVariables().clear();
+ InfoHolder.getScopeLabels().clear();
+ LocalDeclsPerLS.clear();
+ PrevLabel = nullptr;
+ CurFn = nullptr;
+}
+
+// Register a source line with debug info, emitting the line-table entry that
+// provides correspondence to the source line list.
+void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
+ unsigned Flags) {
+ ::recordSourceLine(*Asm, Line, Col, S, Flags,
+ Asm->OutStreamer->getContext().getDwarfCompileUnitID(),
+ getDwarfVersion(), getUnits());
+}
+
+//===----------------------------------------------------------------------===//
+// Emit Methods
+//===----------------------------------------------------------------------===//
+
+// Emit the debug info section.
+void DwarfDebug::emitDebugInfo() {
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ Holder.emitUnits(/* UseOffsets */ false);
+}
+
+// Emit the abbreviation section.
+void DwarfDebug::emitAbbreviations() {
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+
+ Holder.emitAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevSection());
+}
+
+void DwarfDebug::emitStringOffsetsTableHeader() {
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ Holder.getStringPool().emitStringOffsetsTableHeader(
+ *Asm, Asm->getObjFileLowering().getDwarfStrOffSection(),
+ Holder.getStringOffsetsStartSym());
+}
+
+template <typename AccelTableT>
+void DwarfDebug::emitAccel(AccelTableT &Accel, MCSection *Section,
+ StringRef TableName) {
+ Asm->OutStreamer->switchSection(Section);
+
+ // Emit the full data.
+ emitAppleAccelTable(Asm, Accel, TableName, Section->getBeginSymbol());
+}
+
+void DwarfDebug::emitAccelDebugNames() {
+ // Don't emit anything if we have no compilation units to index.
+ if (getUnits().empty())
+ return;
+
+ emitDWARF5AccelTable(Asm, AccelDebugNames, *this, getUnits());
+}
+
+// Emit visible names into a hashed accelerator table section.
+void DwarfDebug::emitAccelNames() {
+ emitAccel(AccelNames, Asm->getObjFileLowering().getDwarfAccelNamesSection(),
+ "Names");
+}
+
+// Emit objective C classes and categories into a hashed accelerator table
+// section.
+void DwarfDebug::emitAccelObjC() {
+ emitAccel(AccelObjC, Asm->getObjFileLowering().getDwarfAccelObjCSection(),
+ "ObjC");
+}
+
+// Emit namespace dies into a hashed accelerator table.
+void DwarfDebug::emitAccelNamespaces() {
+ emitAccel(AccelNamespace,
+ Asm->getObjFileLowering().getDwarfAccelNamespaceSection(),
+ "namespac");
+}
+
+// Emit type dies into a hashed accelerator table.
+void DwarfDebug::emitAccelTypes() {
+ emitAccel(AccelTypes, Asm->getObjFileLowering().getDwarfAccelTypesSection(),
+ "types");
+}
+
+// Public name handling.
+// The format for the various pubnames:
+//
+// dwarf pubnames - offset/name pairs where the offset is the offset into the CU
+// for the DIE that is named.
+//
+// gnu pubnames - offset/index value/name tuples where the offset is the offset
+// into the CU and the index value is computed according to the type of value
+// for the DIE that is named.
+//
+// For type units the offset is the offset of the skeleton DIE. For split dwarf
+// it's the offset within the debug_info/debug_types dwo section, however, the
+// reference in the pubname header doesn't change.
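+//
+// For illustration, a single GNU-style pubnames entry as produced by
+// emitDebugPubSection below consists of (values hypothetical):
+//   an offset-sized DIE offset within the CU,
+//   a one-byte kind/linkage descriptor (e.g. "FUNCTION, EXTERNAL"),
+//   and the NUL-terminated name string;
+// the non-GNU form omits the descriptor byte, and the table ends with a zero
+// offset ("End Mark").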
+
+/// computeIndexValue - Compute the gdb index value for the DIE and CU.
+static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
+ const DIE *Die) {
+ // Entities that ended up only in a Type Unit reference the CU instead (since
+ // the pub entry has offsets within the CU there's no real offset that can be
+ // provided anyway). As it happens all such entities (namespaces and types,
+ // types only in C++ at that) are rendered as TYPE+EXTERNAL. If this turns out
+ // not to be true it would be necessary to persist this information from the
+ // point at which the entry is added to the index data structure - since by
+ // the time the index is built from that, the original type/namespace DIE in a
+ // type unit has already been destroyed so it can't be queried for properties
+ // like tag, etc.
+ if (Die->getTag() == dwarf::DW_TAG_compile_unit)
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE,
+ dwarf::GIEL_EXTERNAL);
+ dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
+
+ // We could have a specification DIE that has most of our knowledge;
+ // look for that now.
+ if (DIEValue SpecVal = Die->findAttribute(dwarf::DW_AT_specification)) {
+ DIE &SpecDIE = SpecVal.getDIEEntry().getEntry();
+ if (SpecDIE.findAttribute(dwarf::DW_AT_external))
+ Linkage = dwarf::GIEL_EXTERNAL;
+ } else if (Die->findAttribute(dwarf::DW_AT_external))
+ Linkage = dwarf::GIEL_EXTERNAL;
+
+ switch (Die->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_enumeration_type:
+ return dwarf::PubIndexEntryDescriptor(
+ dwarf::GIEK_TYPE,
+ dwarf::isCPlusPlus((dwarf::SourceLanguage)CU->getLanguage())
+ ? dwarf::GIEL_EXTERNAL
+ : dwarf::GIEL_STATIC);
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_base_type:
+ case dwarf::DW_TAG_subrange_type:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE, dwarf::GIEL_STATIC);
+ case dwarf::DW_TAG_namespace:
+ return dwarf::GIEK_TYPE;
+ case dwarf::DW_TAG_subprogram:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_FUNCTION, Linkage);
+ case dwarf::DW_TAG_variable:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE, Linkage);
+ case dwarf::DW_TAG_enumerator:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE,
+ dwarf::GIEL_STATIC);
+ default:
+ return dwarf::GIEK_NONE;
+ }
+}
+
+/// emitDebugPubSections - Emit visible names and types into debug pubnames and
+/// pubtypes sections.
+void DwarfDebug::emitDebugPubSections() {
+ for (const auto &NU : CUMap) {
+ DwarfCompileUnit *TheU = NU.second;
+ if (!TheU->hasDwarfPubSections())
+ continue;
+
+ bool GnuStyle = TheU->getCUNode()->getNameTableKind() ==
+ DICompileUnit::DebugNameTableKind::GNU;
+
+ Asm->OutStreamer->switchSection(
+ GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubNamesSection()
+ : Asm->getObjFileLowering().getDwarfPubNamesSection());
+ emitDebugPubSection(GnuStyle, "Names", TheU, TheU->getGlobalNames());
+
+ Asm->OutStreamer->switchSection(
+ GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubTypesSection()
+ : Asm->getObjFileLowering().getDwarfPubTypesSection());
+ emitDebugPubSection(GnuStyle, "Types", TheU, TheU->getGlobalTypes());
+ }
+}
+
+void DwarfDebug::emitSectionReference(const DwarfCompileUnit &CU) {
+ if (useSectionsAsReferences())
+ Asm->emitDwarfOffset(CU.getSection()->getBeginSymbol(),
+ CU.getDebugSectionOffset());
+ else
+ Asm->emitDwarfSymbolReference(CU.getLabelBegin());
+}
+
+void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name,
+ DwarfCompileUnit *TheU,
+ const StringMap<const DIE *> &Globals) {
+ if (auto *Skeleton = TheU->getSkeleton())
+ TheU = Skeleton;
+
+ // Emit the header.
+ MCSymbol *EndLabel = Asm->emitDwarfUnitLength(
+ "pub" + Name, "Length of Public " + Name + " Info");
+
+ Asm->OutStreamer->AddComment("DWARF Version");
+ Asm->emitInt16(dwarf::DW_PUBNAMES_VERSION);
+
+ Asm->OutStreamer->AddComment("Offset of Compilation Unit Info");
+ emitSectionReference(*TheU);
+
+ Asm->OutStreamer->AddComment("Compilation Unit Length");
+ Asm->emitDwarfLengthOrOffset(TheU->getLength());
+
+ // Emit the pubnames for this compilation unit.
+ SmallVector<std::pair<StringRef, const DIE *>, 0> Vec;
+ for (const auto &GI : Globals)
+ Vec.emplace_back(GI.first(), GI.second);
+ llvm::sort(Vec, [](auto &A, auto &B) {
+ return A.second->getOffset() < B.second->getOffset();
+ });
+ for (const auto &[Name, Entity] : Vec) {
+ Asm->OutStreamer->AddComment("DIE offset");
+ Asm->emitDwarfLengthOrOffset(Entity->getOffset());
+
+ if (GnuStyle) {
+ dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheU, Entity);
+ Asm->OutStreamer->AddComment(
+ Twine("Attributes: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) +
+ ", " + dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+ Asm->emitInt8(Desc.toBits());
+ }
+
+ Asm->OutStreamer->AddComment("External Name");
+ Asm->OutStreamer->emitBytes(StringRef(Name.data(), Name.size() + 1));
+ }
+
+ Asm->OutStreamer->AddComment("End Mark");
+ Asm->emitDwarfLengthOrOffset(0);
+ Asm->OutStreamer->emitLabel(EndLabel);
+}
+
+/// Emit null-terminated strings into a debug str section.
+void DwarfDebug::emitDebugStr() {
+ MCSection *StringOffsetsSection = nullptr;
+ if (useSegmentedStringOffsetsTable()) {
+ emitStringOffsetsTableHeader();
+ StringOffsetsSection = Asm->getObjFileLowering().getDwarfStrOffSection();
+ }
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection(),
+ StringOffsetsSection, /* UseRelativeOffsets = */ true);
+}
+
+void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
+ const DebugLocStream::Entry &Entry,
+ const DwarfCompileUnit *CU) {
+ auto &&Comments = DebugLocs.getComments(Entry);
+ auto Comment = Comments.begin();
+ auto End = Comments.end();
+
+ // The expressions are inserted into a byte stream rather early (see
+ // DwarfExpression::addExpression) so for those ops (e.g. DW_OP_convert) that
+ // need to reference a base_type DIE the offset of that DIE is not yet known.
+ // To deal with this we instead insert a placeholder early and then extract
+ // it here and replace it with the real reference.
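+ // For example, a DW_OP_convert whose operand was recorded as a placeholder
+ // index is re-emitted below as the ULEB128 offset of the corresponding
+ // base_type DIE once that offset is known.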
+ unsigned PtrSize = Asm->MAI->getCodePointerSize();
+ DWARFDataExtractor Data(StringRef(DebugLocs.getBytes(Entry).data(),
+ DebugLocs.getBytes(Entry).size()),
+ Asm->getDataLayout().isLittleEndian(), PtrSize);
+ DWARFExpression Expr(Data, PtrSize, Asm->OutContext.getDwarfFormat());
+
+ using Encoding = DWARFExpression::Operation::Encoding;
+ uint64_t Offset = 0;
+ for (const auto &Op : Expr) {
+ assert(Op.getCode() != dwarf::DW_OP_const_type &&
+ "3 operand ops not yet supported");
+ assert(!Op.getSubCode() && "SubOps not yet supported");
+ Streamer.emitInt8(Op.getCode(), Comment != End ? *(Comment++) : "");
+ Offset++;
+ for (unsigned I = 0; I < Op.getDescription().Op.size(); ++I) {
+ if (Op.getDescription().Op[I] == Encoding::BaseTypeRef) {
+ unsigned Length =
+ Streamer.emitDIERef(*CU->ExprRefedBaseTypes[Op.getRawOperand(I)].Die);
+ // Make sure comments stay aligned.
+ for (unsigned J = 0; J < Length; ++J)
+ if (Comment != End)
+ Comment++;
+ } else {
+ for (uint64_t J = Offset; J < Op.getOperandEndOffset(I); ++J)
+ Streamer.emitInt8(Data.getData()[J], Comment != End ? *(Comment++) : "");
+ }
+ Offset = Op.getOperandEndOffset(I);
+ }
+ assert(Offset == Op.getEndOffset());
+ }
+}
+
+void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
+ const DbgValueLoc &Value,
+ DwarfExpression &DwarfExpr) {
+ auto *DIExpr = Value.getExpression();
+ DIExpressionCursor ExprCursor(DIExpr);
+ DwarfExpr.addFragmentOffset(DIExpr);
+
+ // If the DIExpr is an Entry Value, we want to follow the same code path
+ // regardless of whether the DBG_VALUE is variadic or not.
+ if (DIExpr && DIExpr->isEntryValue()) {
+ // Entry values can only be a single register with no additional DIExpr,
+ // so just add it directly.
+ assert(Value.getLocEntries().size() == 1);
+ assert(Value.getLocEntries()[0].isLocation());
+ MachineLocation Location = Value.getLocEntries()[0].getLoc();
+ DwarfExpr.setLocation(Location, DIExpr);
+
+ DwarfExpr.beginEntryValueExpression(ExprCursor);
+
+ const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
+ if (!DwarfExpr.addMachineRegExpression(TRI, ExprCursor, Location.getReg()))
+ return;
+ return DwarfExpr.addExpression(std::move(ExprCursor));
+ }
+
+ // Regular entry.
+ auto EmitValueLocEntry = [&DwarfExpr, &BT,
+ &AP](const DbgValueLocEntry &Entry,
+ DIExpressionCursor &Cursor) -> bool {
+ if (Entry.isInt()) {
+ if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
+ BT->getEncoding() == dwarf::DW_ATE_signed_char))
+ DwarfExpr.addSignedConstant(Entry.getInt());
+ else
+ DwarfExpr.addUnsignedConstant(Entry.getInt());
+ } else if (Entry.isLocation()) {
+ MachineLocation Location = Entry.getLoc();
+ if (Location.isIndirect())
+ DwarfExpr.setMemoryLocationKind();
+
+ const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+ return false;
+ } else if (Entry.isTargetIndexLocation()) {
+ TargetIndexLocation Loc = Entry.getTargetIndexLocation();
+ // TODO: TargetIndexLocation is target-independent. Currently only the
+ // WebAssembly-specific encoding is supported.
+ assert(AP.TM.getTargetTriple().isWasm());
+ DwarfExpr.addWasmLocation(Loc.Index, static_cast<uint64_t>(Loc.Offset));
+ } else if (Entry.isConstantFP()) {
+ if (AP.getDwarfVersion() >= 4 && !AP.getDwarfDebug()->tuneForSCE() &&
+ !Cursor) {
+ DwarfExpr.addConstantFP(Entry.getConstantFP()->getValueAPF(), AP);
+ } else if (Entry.getConstantFP()
+ ->getValueAPF()
+ .bitcastToAPInt()
+ .getBitWidth() <= 64 /*bits*/) {
+ DwarfExpr.addUnsignedConstant(
+ Entry.getConstantFP()->getValueAPF().bitcastToAPInt());
+ } else {
+ LLVM_DEBUG(
+ dbgs() << "Skipped DwarfExpression creation for ConstantFP of size "
+ << Entry.getConstantFP()
+ ->getValueAPF()
+ .bitcastToAPInt()
+ .getBitWidth()
+ << " bits\n");
+ return false;
+ }
+ }
+ return true;
+ };
+
+ if (!Value.isVariadic()) {
+ if (!EmitValueLocEntry(Value.getLocEntries()[0], ExprCursor))
+ return;
+ DwarfExpr.addExpression(std::move(ExprCursor));
+ return;
+ }
+
+ // If any of the location entries are registers with the value 0, then the
+ // location is undefined.
+ if (any_of(Value.getLocEntries(), [](const DbgValueLocEntry &Entry) {
+ return Entry.isLocation() && !Entry.getLoc().getReg();
+ }))
+ return;
+
+ DwarfExpr.addExpression(
+ std::move(ExprCursor),
+ [EmitValueLocEntry, &Value](unsigned Idx,
+ DIExpressionCursor &Cursor) -> bool {
+ return EmitValueLocEntry(Value.getLocEntries()[Idx], Cursor);
+ });
+}
+
+void DebugLocEntry::finalize(const AsmPrinter &AP,
+ DebugLocStream::ListBuilder &List,
+ const DIBasicType *BT,
+ DwarfCompileUnit &TheCU) {
+ assert(!Values.empty() &&
+ "location list entries without values are redundant");
+ assert(Begin != End && "unexpected location list entry with empty range");
+ DebugLocStream::EntryBuilder Entry(List, Begin, End);
+ BufferByteStreamer Streamer = Entry.getStreamer();
+ DebugLocDwarfExpression DwarfExpr(AP.getDwarfVersion(), Streamer, TheCU);
+ const DbgValueLoc &Value = Values[0];
+ if (Value.isFragment()) {
+ // Emit all fragments that belong to the same variable and range.
+ assert(llvm::all_of(Values, [](DbgValueLoc P) {
+ return P.isFragment();
+ }) && "all values are expected to be fragments");
+ assert(llvm::is_sorted(Values) && "fragments are expected to be sorted");
+
+ for (const auto &Fragment : Values)
+ DwarfDebug::emitDebugLocValue(AP, BT, Fragment, DwarfExpr);
+
+ } else {
+ assert(Values.size() == 1 && "only fragments may have >1 value");
+ DwarfDebug::emitDebugLocValue(AP, BT, Value, DwarfExpr);
+ }
+ DwarfExpr.finalize();
+ if (DwarfExpr.TagOffset)
+ List.setTagOffset(*DwarfExpr.TagOffset);
+}
+
+void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry,
+ const DwarfCompileUnit *CU) {
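+ // In the DWARF v5 path the caller has already emitted the DW_LLE_* opcode
+ // and its operands; this function then emits the expression size (ULEB128
+ // for v5, a 2-byte value otherwise) followed by the raw expression bytes.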
+ // Emit the size.
+ Asm->OutStreamer->AddComment("Loc expr size");
+ if (getDwarfVersion() >= 5)
+ Asm->emitULEB128(DebugLocs.getBytes(Entry).size());
+ else if (DebugLocs.getBytes(Entry).size() <= std::numeric_limits<uint16_t>::max())
+ Asm->emitInt16(DebugLocs.getBytes(Entry).size());
+ else {
+ // The entry is too big to fit into 16 bits; drop it, as there is nothing
+ // we can do.
+ Asm->emitInt16(0);
+ return;
+ }
+ // Emit the entry.
+ APByteStreamer Streamer(*Asm);
+ emitDebugLocEntry(Streamer, Entry, CU);
+}
+
+// Emit the header of a DWARF 5 range list table. Returns the symbol that
+// designates the end of the table for the caller to emit when the table is
+// complete.
+static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
+ const DwarfFile &Holder) {
+ MCSymbol *TableEnd = mcdwarf::emitListsTableHeaderStart(*Asm->OutStreamer);
+
+ Asm->OutStreamer->AddComment("Offset entry count");
+ Asm->emitInt32(Holder.getRangeLists().size());
+ Asm->OutStreamer->emitLabel(Holder.getRnglistsTableBaseSym());
+
+ for (const RangeSpanList &List : Holder.getRangeLists())
+ Asm->emitLabelDifference(List.Label, Holder.getRnglistsTableBaseSym(),
+ Asm->getDwarfOffsetByteSize());
+
+ return TableEnd;
+}
+
+// Emit the header of a DWARF 5 locations list table. Returns the symbol that
+// designates the end of the table for the caller to emit when the table is
+// complete.
+static MCSymbol *emitLoclistsTableHeader(AsmPrinter *Asm,
+ const DwarfDebug &DD) {
+ MCSymbol *TableEnd = mcdwarf::emitListsTableHeaderStart(*Asm->OutStreamer);
+
+ const auto &DebugLocs = DD.getDebugLocs();
+
+ Asm->OutStreamer->AddComment("Offset entry count");
+ Asm->emitInt32(DebugLocs.getLists().size());
+ Asm->OutStreamer->emitLabel(DebugLocs.getSym());
+
+ for (const auto &List : DebugLocs.getLists())
+ Asm->emitLabelDifference(List.Label, DebugLocs.getSym(),
+ Asm->getDwarfOffsetByteSize());
+
+ return TableEnd;
+}
+
+template <typename Ranges, typename PayloadEmitter>
+static void emitRangeList(
+ DwarfDebug &DD, AsmPrinter *Asm, MCSymbol *Sym, const Ranges &R,
+ const DwarfCompileUnit &CU, unsigned BaseAddressx, unsigned OffsetPair,
+ unsigned StartxLength, unsigned EndOfList,
+ StringRef (*StringifyEnum)(unsigned),
+ bool ShouldUseBaseAddress,
+ PayloadEmitter EmitPayload) {
+
+ auto Size = Asm->MAI->getCodePointerSize();
+ bool UseDwarf5 = DD.getDwarfVersion() >= 5;
+
+ // Emit our symbol so we can find the beginning of the range.
+ Asm->OutStreamer->emitLabel(Sym);
+
+ // Gather all the ranges that apply to the same section so they can share
+ // a base address entry.
+ MapVector<const MCSection *, std::vector<decltype(&*R.begin())>> SectionRanges;
+
+ for (const auto &Range : R)
+ SectionRanges[&Range.Begin->getSection()].push_back(&Range);
+
+ const MCSymbol *CUBase = CU.getBaseAddress();
+ bool BaseIsSet = false;
+ for (const auto &P : SectionRanges) {
+ auto *Base = CUBase;
+ if (!Base && ShouldUseBaseAddress) {
+ const MCSymbol *Begin = P.second.front()->Begin;
+ const MCSymbol *NewBase = DD.getSectionLabel(&Begin->getSection());
+ if (!UseDwarf5) {
+ Base = NewBase;
+ BaseIsSet = true;
+ Asm->OutStreamer->emitIntValue(-1, Size);
+ Asm->OutStreamer->AddComment(" base address");
+ Asm->OutStreamer->emitSymbolValue(Base, Size);
+ } else if (NewBase != Begin || P.second.size() > 1) {
+ // Only use a base address if
+ // * the existing pool address doesn't match (NewBase != Begin)
+ // * or, there's more than one entry to share the base address
+ Base = NewBase;
+ BaseIsSet = true;
+ Asm->OutStreamer->AddComment(StringifyEnum(BaseAddressx));
+ Asm->emitInt8(BaseAddressx);
+ Asm->OutStreamer->AddComment(" base address index");
+ Asm->emitULEB128(DD.getAddressPool().getIndex(Base));
+ }
+ } else if (BaseIsSet && !UseDwarf5) {
+ BaseIsSet = false;
+ assert(!Base);
+ Asm->OutStreamer->emitIntValue(-1, Size);
+ Asm->OutStreamer->emitIntValue(0, Size);
+ }
+
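+ // For DWARF v5 the emitted shapes are (opcodes are the DW_RLE_*/DW_LLE_*
+ // values passed in by the caller): a BaseAddressx entry carrying an address
+ // pool index when a base is shared (emitted above), then per range either
+ //   OffsetPair   <begin - base> <end - base>   (ULEB128 offsets), or
+ //   StartxLength <address pool index> <length> when no base is used.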
+ for (const auto *RS : P.second) {
+ const MCSymbol *Begin = RS->Begin;
+ const MCSymbol *End = RS->End;
+ assert(Begin && "Range without a begin symbol?");
+ assert(End && "Range without an end symbol?");
+ if (Base) {
+ if (UseDwarf5) {
+ // Emit offset_pair when we have a base.
+ Asm->OutStreamer->AddComment(StringifyEnum(OffsetPair));
+ Asm->emitInt8(OffsetPair);
+ Asm->OutStreamer->AddComment(" starting offset");
+ Asm->emitLabelDifferenceAsULEB128(Begin, Base);
+ Asm->OutStreamer->AddComment(" ending offset");
+ Asm->emitLabelDifferenceAsULEB128(End, Base);
+ } else {
+ Asm->emitLabelDifference(Begin, Base, Size);
+ Asm->emitLabelDifference(End, Base, Size);
+ }
+ } else if (UseDwarf5) {
+ Asm->OutStreamer->AddComment(StringifyEnum(StartxLength));
+ Asm->emitInt8(StartxLength);
+ Asm->OutStreamer->AddComment(" start index");
+ Asm->emitULEB128(DD.getAddressPool().getIndex(Begin));
+ Asm->OutStreamer->AddComment(" length");
+ Asm->emitLabelDifferenceAsULEB128(End, Begin);
+ } else {
+ Asm->OutStreamer->emitSymbolValue(Begin, Size);
+ Asm->OutStreamer->emitSymbolValue(End, Size);
+ }
+ EmitPayload(*RS);
+ }
+ }
+
+ if (UseDwarf5) {
+ Asm->OutStreamer->AddComment(StringifyEnum(EndOfList));
+ Asm->emitInt8(EndOfList);
+ } else {
+ // Terminate the list with two 0 values.
+ Asm->OutStreamer->emitIntValue(0, Size);
+ Asm->OutStreamer->emitIntValue(0, Size);
+ }
+}
+
+// Handles emission of both debug_loclists and debug_loclists.dwo.
+static void emitLocList(DwarfDebug &DD, AsmPrinter *Asm, const DebugLocStream::List &List) {
+ emitRangeList(DD, Asm, List.Label, DD.getDebugLocs().getEntries(List),
+ *List.CU, dwarf::DW_LLE_base_addressx,
+ dwarf::DW_LLE_offset_pair, dwarf::DW_LLE_startx_length,
+ dwarf::DW_LLE_end_of_list, llvm::dwarf::LocListEncodingString,
+ /* ShouldUseBaseAddress */ true,
+ [&](const DebugLocStream::Entry &E) {
+ DD.emitDebugLocEntryLocation(E, List.CU);
+ });
+}
+
+void DwarfDebug::emitDebugLocImpl(MCSection *Sec) {
+ if (DebugLocs.getLists().empty())
+ return;
+
+ Asm->OutStreamer->switchSection(Sec);
+
+ MCSymbol *TableEnd = nullptr;
+ if (getDwarfVersion() >= 5)
+ TableEnd = emitLoclistsTableHeader(Asm, *this);
+
+ for (const auto &List : DebugLocs.getLists())
+ emitLocList(*this, Asm, List);
+
+ if (TableEnd)
+ Asm->OutStreamer->emitLabel(TableEnd);
+}
+
+// Emit locations into the .debug_loc/.debug_loclists section.
+void DwarfDebug::emitDebugLoc() {
+ emitDebugLocImpl(
+ getDwarfVersion() >= 5
+ ? Asm->getObjFileLowering().getDwarfLoclistsSection()
+ : Asm->getObjFileLowering().getDwarfLocSection());
+}
+
+// Emit locations into the .debug_loc.dwo/.debug_loclists.dwo section.
+void DwarfDebug::emitDebugLocDWO() {
+ if (getDwarfVersion() >= 5) {
+ emitDebugLocImpl(
+ Asm->getObjFileLowering().getDwarfLoclistsDWOSection());
+
+ return;
+ }
+
+ for (const auto &List : DebugLocs.getLists()) {
+ Asm->OutStreamer->switchSection(
+ Asm->getObjFileLowering().getDwarfLocDWOSection());
+ Asm->OutStreamer->emitLabel(List.Label);
+
+ for (const auto &Entry : DebugLocs.getEntries(List)) {
+ // GDB only supports startx_length in pre-standard split-DWARF.
+ // (in v5 standard loclists, it currently* /only/ supports base_address +
+ // offset_pair, so the implementations can't really share much since they
+ // need to use different representations)
+ // * as of October 2018, at least
+ //
+ // In v5 (see emitLocList), this uses SectionLabels to reuse existing
+ // addresses in the address pool to minimize object size/relocations.
+ Asm->emitInt8(dwarf::DW_LLE_startx_length);
+ unsigned idx = AddrPool.getIndex(Entry.Begin);
+ Asm->emitULEB128(idx);
+ // The pre-standard encoding also differs slightly: the length is emitted
+ // here as a fixed 4-byte value, whereas it is a ULEB128 in DWARFv5
+ // loclists.
+ Asm->emitLabelDifference(Entry.End, Entry.Begin, 4);
+ emitDebugLocEntryLocation(Entry, List.CU);
+ }
+ Asm->emitInt8(dwarf::DW_LLE_end_of_list);
+ }
+}
+
+struct ArangeSpan {
+ const MCSymbol *Start, *End;
+};
+
+// Emit a debug aranges section, containing a CU lookup for any
+// address we can tie back to a CU.
+void DwarfDebug::emitDebugARanges() {
+ // Provides a unique id per text section.
+ MapVector<MCSection *, SmallVector<SymbolCU, 8>> SectionMap;
+
+ // Filter labels by section.
+ for (const SymbolCU &SCU : ArangeLabels) {
+ if (SCU.Sym->isInSection()) {
+ // Make a note of this symbol and its section.
+ MCSection *Section = &SCU.Sym->getSection();
+ if (!Section->getKind().isMetadata())
+ SectionMap[Section].push_back(SCU);
+ } else {
+ // Some symbols (e.g. common/bss on mach-o) can have no section but still
+ // appear in the output. This is unfortunate, as we rely on sections to
+ // build arange spans; we can manage without, but it is awkward.
+ SectionMap[nullptr].push_back(SCU);
+ }
+ }
+
+ DenseMap<DwarfCompileUnit *, std::vector<ArangeSpan>> Spans;
+
+ for (auto &I : SectionMap) {
+ MCSection *Section = I.first;
+ SmallVector<SymbolCU, 8> &List = I.second;
+ if (List.empty())
+ continue;
+
+ // If we have no section (e.g. common), just write out
+ // individual spans for each symbol.
+ if (!Section) {
+ for (const SymbolCU &Cur : List) {
+ ArangeSpan Span;
+ Span.Start = Cur.Sym;
+ Span.End = nullptr;
+ assert(Cur.CU);
+ Spans[Cur.CU].push_back(Span);
+ }
+ continue;
+ }
+
+ // Sort the symbols by offset within the section.
+ llvm::stable_sort(List, [&](const SymbolCU &A, const SymbolCU &B) {
+ unsigned IA = A.Sym ? Asm->OutStreamer->getSymbolOrder(A.Sym) : 0;
+ unsigned IB = B.Sym ? Asm->OutStreamer->getSymbolOrder(B.Sym) : 0;
+
+ // Symbols with no order assigned should be placed at the end.
+ // (e.g. section end labels)
+ if (IA == 0)
+ return false;
+ if (IB == 0)
+ return true;
+ return IA < IB;
+ });
+
+ // Insert a final terminator.
+ List.push_back(SymbolCU(nullptr, Asm->OutStreamer->endSection(Section)));
+
+ // Build spans between each label.
+ const MCSymbol *StartSym = List[0].Sym;
+ for (size_t n = 1, e = List.size(); n < e; n++) {
+ const SymbolCU &Prev = List[n - 1];
+ const SymbolCU &Cur = List[n];
+
+ // Try and build the longest span we can within the same CU.
+ if (Cur.CU != Prev.CU) {
+ ArangeSpan Span;
+ Span.Start = StartSym;
+ Span.End = Cur.Sym;
+ assert(Prev.CU);
+ Spans[Prev.CU].push_back(Span);
+ StartSym = Cur.Sym;
+ }
+ }
+ }
+
+ // Start the dwarf aranges section.
+ Asm->OutStreamer->switchSection(
+ Asm->getObjFileLowering().getDwarfARangesSection());
+
+ unsigned PtrSize = Asm->MAI->getCodePointerSize();
+
+ // Build a list of CUs used.
+ std::vector<DwarfCompileUnit *> CUs;
+ for (const auto &it : Spans) {
+ DwarfCompileUnit *CU = it.first;
+ CUs.push_back(CU);
+ }
+
+ // Sort the CU list (again, to ensure consistent output order).
+ llvm::sort(CUs, [](const DwarfCompileUnit *A, const DwarfCompileUnit *B) {
+ return A->getUniqueID() < B->getUniqueID();
+ });
+
+ // Emit an arange table for each CU we used.
+ for (DwarfCompileUnit *CU : CUs) {
+ std::vector<ArangeSpan> &List = Spans[CU];
+
+ // Describe the skeleton CU's offset and length, not the dwo file's.
+ if (auto *Skel = CU->getSkeleton())
+ CU = Skel;
+
+ // Emit size of content not including length itself.
+ unsigned ContentSize =
+ sizeof(int16_t) + // DWARF ARange version number
+ Asm->getDwarfOffsetByteSize() + // Offset of CU in the .debug_info
+ // section
+ sizeof(int8_t) + // Pointer Size (in bytes)
+ sizeof(int8_t); // Segment Size (in bytes)
+
+ unsigned TupleSize = PtrSize * 2;
+
+ // Section 7.20 of the DWARF spec requires the table to be aligned to a
+ // tuple boundary.
+ unsigned Padding = offsetToAlignment(
+ Asm->getUnitLengthFieldByteSize() + ContentSize, Align(TupleSize));
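+ // Worked example (assuming DWARF32 and 8-byte pointers): the header is
+ // 4 (unit length) + 2 (version) + 4 (CU offset) + 1 + 1 = 12 bytes and
+ // TupleSize is 16, so Padding is 4, placing the first address/length tuple
+ // on a 16-byte boundary.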
+
+ ContentSize += Padding;
+ ContentSize += (List.size() + 1) * TupleSize;
+
+ // For each compile unit, write the list of spans it covers.
+ Asm->emitDwarfUnitLength(ContentSize, "Length of ARange Set");
+ Asm->OutStreamer->AddComment("DWARF Arange version number");
+ Asm->emitInt16(dwarf::DW_ARANGES_VERSION);
+ Asm->OutStreamer->AddComment("Offset Into Debug Info Section");
+ emitSectionReference(*CU);
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->emitInt8(PtrSize);
+ Asm->OutStreamer->AddComment("Segment Size (in bytes)");
+ Asm->emitInt8(0);
+
+ Asm->OutStreamer->emitFill(Padding, 0xff);
+
+ for (const ArangeSpan &Span : List) {
+ Asm->emitLabelReference(Span.Start, PtrSize);
+
+ // Calculate the size as being from the span start to its end.
+ //
+ // If the size is zero, then round it up to one byte. The DWARF
+ // specification requires that entries in this table have nonzero
+ // lengths.
+ auto SizeRef = SymSize.find(Span.Start);
+ if ((SizeRef == SymSize.end() || SizeRef->second != 0) && Span.End) {
+ Asm->emitLabelDifference(Span.End, Span.Start, PtrSize);
+ } else {
+ // For symbols without an end marker (e.g. common), we
+ // write a single arange entry containing just that one symbol.
+ uint64_t Size;
+ if (SizeRef == SymSize.end() || SizeRef->second == 0)
+ Size = 1;
+ else
+ Size = SizeRef->second;
+
+ Asm->OutStreamer->emitIntValue(Size, PtrSize);
+ }
+ }
+
+ Asm->OutStreamer->AddComment("ARange terminator");
+ Asm->OutStreamer->emitIntValue(0, PtrSize);
+ Asm->OutStreamer->emitIntValue(0, PtrSize);
+ }
+}
+
+/// Emit a single range list. We handle both DWARF v5 and earlier.
+static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
+ const RangeSpanList &List) {
+ emitRangeList(DD, Asm, List.Label, List.Ranges, *List.CU,
+ dwarf::DW_RLE_base_addressx, dwarf::DW_RLE_offset_pair,
+ dwarf::DW_RLE_startx_length, dwarf::DW_RLE_end_of_list,
+ llvm::dwarf::RangeListEncodingString,
+ List.CU->getCUNode()->getRangesBaseAddress() ||
+ DD.getDwarfVersion() >= 5,
+ [](auto) {});
+}
+
+void DwarfDebug::emitDebugRangesImpl(const DwarfFile &Holder, MCSection *Section) {
+ if (Holder.getRangeLists().empty())
+ return;
+
+ assert(useRangesSection());
+ assert(!CUMap.empty());
+ assert(llvm::any_of(CUMap, [](const decltype(CUMap)::value_type &Pair) {
+ return !Pair.second->getCUNode()->isDebugDirectivesOnly();
+ }));
+
+ Asm->OutStreamer->switchSection(Section);
+
+ MCSymbol *TableEnd = nullptr;
+ if (getDwarfVersion() >= 5)
+ TableEnd = emitRnglistsTableHeader(Asm, Holder);
+
+ for (const RangeSpanList &List : Holder.getRangeLists())
+ emitRangeList(*this, Asm, List);
+
+ if (TableEnd)
+ Asm->OutStreamer->emitLabel(TableEnd);
+}
+
+/// Emit address ranges into the .debug_ranges section or into the DWARF v5
+/// .debug_rnglists section.
+void DwarfDebug::emitDebugRanges() {
+ const auto &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+
+ emitDebugRangesImpl(Holder,
+ getDwarfVersion() >= 5
+ ? Asm->getObjFileLowering().getDwarfRnglistsSection()
+ : Asm->getObjFileLowering().getDwarfRangesSection());
+}
+
+void DwarfDebug::emitDebugRangesDWO() {
+ emitDebugRangesImpl(InfoHolder,
+ Asm->getObjFileLowering().getDwarfRnglistsDWOSection());
+}
+
+/// Emit the header of a DWARF 5 macro section, or the GNU extension for
+/// DWARF 4.
+static void emitMacroHeader(AsmPrinter *Asm, const DwarfDebug &DD,
+ const DwarfCompileUnit &CU, uint16_t DwarfVersion) {
+ enum HeaderFlagMask {
+#define HANDLE_MACRO_FLAG(ID, NAME) MACRO_FLAG_##NAME = ID,
+#include "llvm/BinaryFormat/Dwarf.def"
+ };
+ Asm->OutStreamer->AddComment("Macro information version");
+ Asm->emitInt16(DwarfVersion >= 5 ? DwarfVersion : 4);
+ // We emit the line offset flag unconditionally here, since the line offset
+ // is almost always present.
+ if (Asm->isDwarf64()) {
+ Asm->OutStreamer->AddComment("Flags: 64 bit, debug_line_offset present");
+ Asm->emitInt8(MACRO_FLAG_OFFSET_SIZE | MACRO_FLAG_DEBUG_LINE_OFFSET);
+ } else {
+ Asm->OutStreamer->AddComment("Flags: 32 bit, debug_line_offset present");
+ Asm->emitInt8(MACRO_FLAG_DEBUG_LINE_OFFSET);
+ }
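+ // At this point the 2-byte version and 1-byte flags have been emitted; the
+ // remainder of the header is the offset-sized debug_line offset below.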
+ Asm->OutStreamer->AddComment("debug_line_offset");
+ if (DD.useSplitDwarf())
+ Asm->emitDwarfLengthOrOffset(0);
+ else
+ Asm->emitDwarfSymbolReference(CU.getLineTableStartSym());
+}
+
+void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
+ for (auto *MN : Nodes) {
+ if (auto *M = dyn_cast<DIMacro>(MN))
+ emitMacro(*M);
+ else if (auto *F = dyn_cast<DIMacroFile>(MN))
+ emitMacroFile(*F, U);
+ else
+ llvm_unreachable("Unexpected DI type!");
+ }
+}
+
+void DwarfDebug::emitMacro(DIMacro &M) {
+ StringRef Name = M.getName();
+ StringRef Value = M.getValue();
+
+ // There should be one space between the macro name and the macro value in
+ // define entries. In undef entries, only the macro name is emitted.
+ std::string Str = Value.empty() ? Name.str() : (Name + " " + Value).str();
+
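+ // In the DWARF v5 strx forms emitted below, each define/undef entry is the
+ // opcode, a ULEB128 line number, and a ULEB128 index of Str in the string
+ // offsets table; the older forms emit a string offset or the inline string
+ // instead.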
+ if (UseDebugMacroSection) {
+ if (getDwarfVersion() >= 5) {
+ unsigned Type = M.getMacinfoType() == dwarf::DW_MACINFO_define
+ ? dwarf::DW_MACRO_define_strx
+ : dwarf::DW_MACRO_undef_strx;
+ Asm->OutStreamer->AddComment(dwarf::MacroString(Type));
+ Asm->emitULEB128(Type);
+ Asm->OutStreamer->AddComment("Line Number");
+ Asm->emitULEB128(M.getLine());
+ Asm->OutStreamer->AddComment("Macro String");
+ Asm->emitULEB128(
+ InfoHolder.getStringPool().getIndexedEntry(*Asm, Str).getIndex());
+ } else {
+ unsigned Type = M.getMacinfoType() == dwarf::DW_MACINFO_define
+ ? dwarf::DW_MACRO_GNU_define_indirect
+ : dwarf::DW_MACRO_GNU_undef_indirect;
+ Asm->OutStreamer->AddComment(dwarf::GnuMacroString(Type));
+ Asm->emitULEB128(Type);
+ Asm->OutStreamer->AddComment("Line Number");
+ Asm->emitULEB128(M.getLine());
+ Asm->OutStreamer->AddComment("Macro String");
+ Asm->emitDwarfSymbolReference(
+ InfoHolder.getStringPool().getEntry(*Asm, Str).getSymbol());
+ }
+ } else {
+ Asm->OutStreamer->AddComment(dwarf::MacinfoString(M.getMacinfoType()));
+ Asm->emitULEB128(M.getMacinfoType());
+ Asm->OutStreamer->AddComment("Line Number");
+ Asm->emitULEB128(M.getLine());
+ Asm->OutStreamer->AddComment("Macro String");
+ Asm->OutStreamer->emitBytes(Str);
+ Asm->emitInt8('\0');
+ }
+}
+
+void DwarfDebug::emitMacroFileImpl(
+ DIMacroFile &MF, DwarfCompileUnit &U, unsigned StartFile, unsigned EndFile,
+ StringRef (*MacroFormToString)(unsigned Form)) {
+
+ Asm->OutStreamer->AddComment(MacroFormToString(StartFile));
+ Asm->emitULEB128(StartFile);
+ Asm->OutStreamer->AddComment("Line Number");
+ Asm->emitULEB128(MF.getLine());
+ Asm->OutStreamer->AddComment("File Number");
+ DIFile &F = *MF.getFile();
+ if (useSplitDwarf())
+ Asm->emitULEB128(getDwoLineTable(U)->getFile(
+ F.getDirectory(), F.getFilename(), getMD5AsBytes(&F),
+ Asm->OutContext.getDwarfVersion(), F.getSource()));
+ else
+ Asm->emitULEB128(U.getOrCreateSourceID(&F));
+ handleMacroNodes(MF.getElements(), U);
+ Asm->OutStreamer->AddComment(MacroFormToString(EndFile));
+ Asm->emitULEB128(EndFile);
+}
+
+void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) {
+ // DWARFv5 macro and DWARFv4 macinfo share some common encodings; for
+ // readability/uniformity, we emit those explicitly.
+ assert(F.getMacinfoType() == dwarf::DW_MACINFO_start_file);
+ if (UseDebugMacroSection)
+ emitMacroFileImpl(
+ F, U, dwarf::DW_MACRO_start_file, dwarf::DW_MACRO_end_file,
+ (getDwarfVersion() >= 5) ? dwarf::MacroString : dwarf::GnuMacroString);
+ else
+ emitMacroFileImpl(F, U, dwarf::DW_MACINFO_start_file,
+ dwarf::DW_MACINFO_end_file, dwarf::MacinfoString);
+}
+
+void DwarfDebug::emitDebugMacinfoImpl(MCSection *Section) {
+ for (const auto &P : CUMap) {
+ auto &TheCU = *P.second;
+ auto *SkCU = TheCU.getSkeleton();
+ DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
+ auto *CUNode = cast<DICompileUnit>(P.first);
+ DIMacroNodeArray Macros = CUNode->getMacros();
+ if (Macros.empty())
+ continue;
+ Asm->OutStreamer->switchSection(Section);
+ Asm->OutStreamer->emitLabel(U.getMacroLabelBegin());
+ if (UseDebugMacroSection)
+ emitMacroHeader(Asm, *this, U, getDwarfVersion());
+ handleMacroNodes(Macros, U);
+ Asm->OutStreamer->AddComment("End Of Macro List Mark");
+ Asm->emitInt8(0);
+ }
+}
+
+/// Emit macros into a debug macinfo/macro section.
+void DwarfDebug::emitDebugMacinfo() {
+ auto &ObjLower = Asm->getObjFileLowering();
+ emitDebugMacinfoImpl(UseDebugMacroSection
+ ? ObjLower.getDwarfMacroSection()
+ : ObjLower.getDwarfMacinfoSection());
+}
+
+void DwarfDebug::emitDebugMacinfoDWO() {
+ auto &ObjLower = Asm->getObjFileLowering();
+ emitDebugMacinfoImpl(UseDebugMacroSection
+ ? ObjLower.getDwarfMacroDWOSection()
+ : ObjLower.getDwarfMacinfoDWOSection());
+}
+
+// DWARF5 Experimental Separate Dwarf emitters.
+
+void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
+ std::unique_ptr<DwarfCompileUnit> NewU) {
+
+ if (!CompilationDir.empty())
+ NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+ addGnuPubAttributes(*NewU, Die);
+
+ SkeletonHolder.addUnit(std::move(NewU));
+}
+
+DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) {
+
+ auto OwnedUnit = std::make_unique<DwarfCompileUnit>(
+ CU.getUniqueID(), CU.getCUNode(), Asm, this, &SkeletonHolder,
+ UnitKind::Skeleton);
+ DwarfCompileUnit &NewCU = *OwnedUnit;
+ NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection());
+
+ NewCU.initStmtList();
+
+ if (useSegmentedStringOffsetsTable())
+ NewCU.addStringOffsetsStart();
+
+ initSkeletonUnit(CU, NewCU.getUnitDie(), std::move(OwnedUnit));
+
+ return NewCU;
+}
+
+// Emit the .debug_info.dwo section for separated dwarf. This contains the
+// compile units that would normally be in debug_info.
+void DwarfDebug::emitDebugInfoDWO() {
+ assert(useSplitDwarf() && "No split dwarf debug info?");
+ // Don't emit relocations into the dwo file.
+ InfoHolder.emitUnits(/* UseOffsets */ true);
+}
+
+// Emit the .debug_abbrev.dwo section for separated dwarf. This contains the
+// abbreviations for the .debug_info.dwo section.
+void DwarfDebug::emitDebugAbbrevDWO() {
+ assert(useSplitDwarf() && "No split dwarf?");
+ InfoHolder.emitAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevDWOSection());
+}
+
+void DwarfDebug::emitDebugLineDWO() {
+ assert(useSplitDwarf() && "No split dwarf?");
+ SplitTypeUnitFileTable.Emit(
+ *Asm->OutStreamer, MCDwarfLineTableParams(),
+ Asm->getObjFileLowering().getDwarfLineDWOSection());
+}
+
+void DwarfDebug::emitStringOffsetsTableHeaderDWO() {
+ assert(useSplitDwarf() && "No split dwarf?");
+ InfoHolder.getStringPool().emitStringOffsetsTableHeader(
+ *Asm, Asm->getObjFileLowering().getDwarfStrOffDWOSection(),
+ InfoHolder.getStringOffsetsStartSym());
+}
+
+// Emit the .debug_str.dwo section for separated dwarf. This contains the
+// string section and is identical in format to traditional .debug_str
+// sections.
+void DwarfDebug::emitDebugStrDWO() {
+ if (useSegmentedStringOffsetsTable())
+ emitStringOffsetsTableHeaderDWO();
+ assert(useSplitDwarf() && "No split dwarf?");
+ MCSection *OffSec = Asm->getObjFileLowering().getDwarfStrOffDWOSection();
+ InfoHolder.emitStrings(Asm->getObjFileLowering().getDwarfStrDWOSection(),
+ OffSec, /* UseRelativeOffsets = */ false);
+}
+
+// Emit address pool.
+void DwarfDebug::emitDebugAddr() {
+ AddrPool.emit(*Asm, Asm->getObjFileLowering().getDwarfAddrSection());
+}
+
+MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) {
+ if (!useSplitDwarf())
+ return nullptr;
+ const DICompileUnit *DIUnit = CU.getCUNode();
+ SplitTypeUnitFileTable.maybeSetRootFile(
+ DIUnit->getDirectory(), DIUnit->getFilename(),
+ getMD5AsBytes(DIUnit->getFile()), DIUnit->getSource());
+ return &SplitTypeUnitFileTable;
+}
+
+uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
+ MD5 Hash;
+ Hash.update(Identifier);
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+ return Result.high();
+}
+
+void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
+ StringRef Identifier, DIE &RefDie,
+ const DICompositeType *CTy) {
+ // Fast path: if we're building some type units and one has already used the
+ // address pool, we know we're going to throw away all this work anyway, so
+ // don't bother building dependent types.
+ if (!TypeUnitsUnderConstruction.empty() && AddrPool.hasBeenUsed())
+ return;
+
+ auto Ins = TypeSignatures.insert(std::make_pair(CTy, 0));
+ if (!Ins.second) {
+ CU.addDIETypeSignature(RefDie, Ins.first->second);
+ return;
+ }
+
+ bool TopLevelType = TypeUnitsUnderConstruction.empty();
+ AddrPool.resetUsedFlag();
+
+ auto OwnedUnit = std::make_unique<DwarfTypeUnit>(CU, Asm, this, &InfoHolder,
+ getDwoLineTable(CU));
+ DwarfTypeUnit &NewTU = *OwnedUnit;
+ DIE &UnitDie = NewTU.getUnitDie();
+ TypeUnitsUnderConstruction.emplace_back(std::move(OwnedUnit), CTy);
+
+ NewTU.addUInt(UnitDie, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
+ CU.getLanguage());
+
+ uint64_t Signature = makeTypeSignature(Identifier);
+ NewTU.setTypeSignature(Signature);
+ Ins.first->second = Signature;
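+ // This 64-bit signature is also what references to the type unit use (via
+ // addDIETypeSignature below), and it is written into the type unit header.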
+
+ if (useSplitDwarf()) {
+ MCSection *Section =
+ getDwarfVersion() <= 4
+ ? Asm->getObjFileLowering().getDwarfTypesDWOSection()
+ : Asm->getObjFileLowering().getDwarfInfoDWOSection();
+ NewTU.setSection(Section);
+ } else {
+ MCSection *Section =
+ getDwarfVersion() <= 4
+ ? Asm->getObjFileLowering().getDwarfTypesSection(Signature)
+ : Asm->getObjFileLowering().getDwarfInfoSection(Signature);
+ NewTU.setSection(Section);
+ // Non-split type units reuse the compile unit's line table.
+ CU.applyStmtList(UnitDie);
+ }
+
+ // Add DW_AT_str_offsets_base to the type unit DIE, but not for split type
+ // units.
+ if (useSegmentedStringOffsetsTable() && !useSplitDwarf())
+ NewTU.addStringOffsetsStart();
+
+ NewTU.setType(NewTU.createTypeDIE(CTy));
+
+ if (TopLevelType) {
+ auto TypeUnitsToAdd = std::move(TypeUnitsUnderConstruction);
+ TypeUnitsUnderConstruction.clear();
+
+ // Types referencing entries in the address table cannot be placed in type
+ // units.
+ if (AddrPool.hasBeenUsed()) {
+
+ // Remove all the types built while building this type.
+ // This is pessimistic as some of these types might not be dependent on
+ // the type that used an address.
+ for (const auto &TU : TypeUnitsToAdd)
+ TypeSignatures.erase(TU.second);
+
+ // Construct this type in the CU directly.
+ // This is inefficient because all the dependent types will be rebuilt
+ // from scratch, including building them in type units, discovering that
+ // they depend on addresses, throwing them out and rebuilding them.
+ CU.constructTypeDIE(RefDie, cast<DICompositeType>(CTy));
+ return;
+ }
+
+ // If the type wasn't dependent on fission addresses, finish adding the type
+ // and all its dependent types.
+ for (auto &TU : TypeUnitsToAdd) {
+ InfoHolder.computeSizeAndOffsetsForUnit(TU.first.get());
+ InfoHolder.emitUnit(TU.first.get(), useSplitDwarf());
+ }
+ }
+ CU.addDIETypeSignature(RefDie, Signature);
+}
+
+// Add the Name along with its companion DIE to the appropriate accelerator
+// table (for AccelTableKind::Dwarf it's always AccelDebugNames, for
+// AccelTableKind::Apple, we use the table we got as an argument). If
+// accelerator tables are disabled, this function does nothing.
+template <typename DataT>
+void DwarfDebug::addAccelNameImpl(const DICompileUnit &CU,
+ AccelTable<DataT> &AppleAccel, StringRef Name,
+ const DIE &Die) {
+ if (getAccelTableKind() == AccelTableKind::None || Name.empty())
+ return;
+
+ if (getAccelTableKind() != AccelTableKind::Apple &&
+ CU.getNameTableKind() != DICompileUnit::DebugNameTableKind::Apple &&
+ CU.getNameTableKind() != DICompileUnit::DebugNameTableKind::Default)
+ return;
+
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ DwarfStringPoolEntryRef Ref = Holder.getStringPool().getEntry(*Asm, Name);
+
+ switch (getAccelTableKind()) {
+ case AccelTableKind::Apple:
+ AppleAccel.addName(Ref, Die);
+ break;
+ case AccelTableKind::Dwarf:
+ AccelDebugNames.addName(Ref, Die);
+ break;
+ case AccelTableKind::Default:
+ llvm_unreachable("Default should have already been resolved.");
+ case AccelTableKind::None:
+ llvm_unreachable("None handled above");
+ }
+}
+
+void DwarfDebug::addAccelName(const DICompileUnit &CU, StringRef Name,
+ const DIE &Die) {
+ addAccelNameImpl(CU, AccelNames, Name, Die);
+}
+
+void DwarfDebug::addAccelObjC(const DICompileUnit &CU, StringRef Name,
+ const DIE &Die) {
+ // ObjC names go only into the Apple accelerator tables.
+ if (getAccelTableKind() == AccelTableKind::Apple)
+ addAccelNameImpl(CU, AccelObjC, Name, Die);
+}
+
+void DwarfDebug::addAccelNamespace(const DICompileUnit &CU, StringRef Name,
+ const DIE &Die) {
+ addAccelNameImpl(CU, AccelNamespace, Name, Die);
+}
+
+void DwarfDebug::addAccelType(const DICompileUnit &CU, StringRef Name,
+ const DIE &Die, char Flags) {
+ addAccelNameImpl(CU, AccelTypes, Name, Die);
+}
+
+uint16_t DwarfDebug::getDwarfVersion() const {
+ return Asm->OutStreamer->getContext().getDwarfVersion();
+}
+
+dwarf::Form DwarfDebug::getDwarfSectionOffsetForm() const {
+ if (Asm->getDwarfVersion() >= 4)
+ return dwarf::Form::DW_FORM_sec_offset;
+ assert((!Asm->isDwarf64() || (Asm->getDwarfVersion() == 3)) &&
+ "DWARF64 is not defined prior to DWARFv3");
+ return Asm->isDwarf64() ? dwarf::Form::DW_FORM_data8
+ : dwarf::Form::DW_FORM_data4;
+}
+
+const MCSymbol *DwarfDebug::getSectionLabel(const MCSection *S) {
+ return SectionLabels.lookup(S);
+}
+
+void DwarfDebug::insertSectionLabel(const MCSymbol *S) {
+ if (SectionLabels.insert(std::make_pair(&S->getSection(), S)).second)
+ if (useSplitDwarf() || getDwarfVersion() >= 5)
+ AddrPool.getIndex(S);
+}
+
+std::optional<MD5::MD5Result>
+DwarfDebug::getMD5AsBytes(const DIFile *File) const {
+ assert(File);
+ if (getDwarfVersion() < 5)
+ return std::nullopt;
+ std::optional<DIFile::ChecksumInfo<StringRef>> Checksum = File->getChecksum();
+ if (!Checksum || Checksum->Kind != DIFile::CSK_MD5)
+ return std::nullopt;
+
+ // Convert the string checksum to an MD5Result for the streamer.
+ // The verifier validates the checksum so we assume it's okay.
+ // An MD5 checksum is 16 bytes.
+ std::string ChecksumString = fromHex(Checksum->Value);
+ MD5::MD5Result CKMem;
+ std::copy(ChecksumString.begin(), ChecksumString.end(), CKMem.data());
+ return CKMem;
+}
+
+bool DwarfDebug::alwaysUseRanges(const DwarfCompileUnit &CU) const {
+ if (MinimizeAddr == MinimizeAddrInV5::Ranges)
+ return true;
+ if (MinimizeAddr != MinimizeAddrInV5::Default)
+ return false;
+ if (useSplitDwarf())
+ return true;
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
new file mode 100644
index 000000000000..1af4b643eb17
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -0,0 +1,849 @@
+//===- llvm/CodeGen/DwarfDebug.h - Dwarf Debug Framework --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
+
+#include "AddressPool.h"
+#include "DebugLocEntry.h"
+#include "DebugLocStream.h"
+#include "DwarfFile.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AccelTable.h"
+#include "llvm/CodeGen/DbgEntityHistoryCalculator.h"
+#include "llvm/CodeGen/DebugHandlerBase.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+class AsmPrinter;
+class ByteStreamer;
+class DIE;
+class DwarfCompileUnit;
+class DwarfExpression;
+class DwarfTypeUnit;
+class DwarfUnit;
+class LexicalScope;
+class MachineFunction;
+class MCSection;
+class MCSymbol;
+class Module;
+
+//===----------------------------------------------------------------------===//
+/// This class is defined as the common parent of DbgVariable and DbgLabel
+/// so that it can leverage polymorphism to share common code between
+/// DbgVariable and DbgLabel.
+class DbgEntity {
+public:
+ enum DbgEntityKind {
+ DbgVariableKind,
+ DbgLabelKind
+ };
+
+private:
+ const DINode *Entity;
+ const DILocation *InlinedAt;
+ DIE *TheDIE = nullptr;
+ const DbgEntityKind SubclassID;
+
+public:
+ DbgEntity(const DINode *N, const DILocation *IA, DbgEntityKind ID)
+ : Entity(N), InlinedAt(IA), SubclassID(ID) {}
+ virtual ~DbgEntity() = default;
+
+ /// Accessors.
+ /// @{
+ const DINode *getEntity() const { return Entity; }
+ const DILocation *getInlinedAt() const { return InlinedAt; }
+ DIE *getDIE() const { return TheDIE; }
+ DbgEntityKind getDbgEntityID() const { return SubclassID; }
+ /// @}
+
+ void setDIE(DIE &D) { TheDIE = &D; }
+
+ static bool classof(const DbgEntity *N) {
+ switch (N->getDbgEntityID()) {
+ case DbgVariableKind:
+ case DbgLabelKind:
+ return true;
+ }
+ llvm_unreachable("Invalid DbgEntityKind");
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// This class is used to track local variable information.
+///
+/// Variables can be created from allocas, in which case they're generated from
+/// the MMI table. Such variables can have multiple expressions and frame
+/// indices.
+///
+/// Variables can be created from \c DBG_VALUE instructions. Those whose
+/// location changes over time use \a DebugLocListIndex, while those with a
+/// single location use \a ValueLoc and (optionally) a single entry of \a Expr.
+///
+/// Variables that have been optimized out use none of these fields.
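+///
+/// A rough usage sketch (names here are illustrative only):
+/// \code
+///   DbgVariable Var(DIVar, InlinedAt);
+///   Var.initializeMMI(Expr, FrameIndex);   // or initializeDbgValue(...)
+/// \endcode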
+class DbgVariable : public DbgEntity {
+ /// Index of the entry list in DebugLocs.
+ unsigned DebugLocListIndex = ~0u;
+ /// DW_OP_LLVM_tag_offset value from DebugLocs.
+ std::optional<uint8_t> DebugLocListTagOffset;
+
+ /// Single value location description.
+ std::unique_ptr<DbgValueLoc> ValueLoc = nullptr;
+
+ struct FrameIndexExpr {
+ int FI;
+ const DIExpression *Expr;
+ };
+ mutable SmallVector<FrameIndexExpr, 1>
+ FrameIndexExprs; /// Frame index + expression.
+
+public:
+ /// Construct a DbgVariable.
+ ///
+ /// Creates a variable without any DW_AT_location. Call \a initializeMMI()
+ /// for MMI entries, or \a initializeDbgValue() for DBG_VALUE instructions.
+ DbgVariable(const DILocalVariable *V, const DILocation *IA)
+ : DbgEntity(V, IA, DbgVariableKind) {}
+
+ /// Initialize from the MMI table.
+ void initializeMMI(const DIExpression *E, int FI) {
+ assert(FrameIndexExprs.empty() && "Already initialized?");
+ assert(!ValueLoc.get() && "Already initialized?");
+
+ assert((!E || E->isValid()) && "Expected valid expression");
+ assert(FI != std::numeric_limits<int>::max() && "Expected valid index");
+
+ FrameIndexExprs.push_back({FI, E});
+ }
+
+ /// Initialize the variable's location.
+ void initializeDbgValue(DbgValueLoc Value) {
+ assert(FrameIndexExprs.empty() && "Already initialized?");
+ assert(!ValueLoc && "Already initialized?");
+ assert(!Value.getExpression()->isFragment() && "Fragments not supported.");
+
+ ValueLoc = std::make_unique<DbgValueLoc>(Value);
+ if (auto *E = ValueLoc->getExpression())
+ if (E->getNumElements())
+ FrameIndexExprs.push_back({0, E});
+ }
+
+ /// Initialize from a DBG_VALUE instruction.
+ void initializeDbgValue(const MachineInstr *DbgValue);
+
+ // Accessors.
+ const DILocalVariable *getVariable() const {
+ return cast<DILocalVariable>(getEntity());
+ }
+
+ const DIExpression *getSingleExpression() const {
+ assert(ValueLoc.get() && FrameIndexExprs.size() <= 1);
+ return FrameIndexExprs.size() ? FrameIndexExprs[0].Expr : nullptr;
+ }
+
+ void setDebugLocListIndex(unsigned O) { DebugLocListIndex = O; }
+ unsigned getDebugLocListIndex() const { return DebugLocListIndex; }
+ void setDebugLocListTagOffset(uint8_t O) { DebugLocListTagOffset = O; }
+ std::optional<uint8_t> getDebugLocListTagOffset() const {
+ return DebugLocListTagOffset;
+ }
+ StringRef getName() const { return getVariable()->getName(); }
+ const DbgValueLoc *getValueLoc() const { return ValueLoc.get(); }
+ /// Get the FI entries, sorted by fragment offset.
+ ArrayRef<FrameIndexExpr> getFrameIndexExprs() const;
+ bool hasFrameIndexExprs() const { return !FrameIndexExprs.empty(); }
+ void addMMIEntry(const DbgVariable &V);
+
+ // Translate tag to proper Dwarf tag.
+ dwarf::Tag getTag() const {
+ // FIXME: Why don't we just infer this tag and store it all along?
+ if (getVariable()->isParameter())
+ return dwarf::DW_TAG_formal_parameter;
+
+ return dwarf::DW_TAG_variable;
+ }
+
+ /// Return true if DbgVariable is artificial.
+ bool isArtificial() const {
+ if (getVariable()->isArtificial())
+ return true;
+ if (getType()->isArtificial())
+ return true;
+ return false;
+ }
+
+ bool isObjectPointer() const {
+ if (getVariable()->isObjectPointer())
+ return true;
+ if (getType()->isObjectPointer())
+ return true;
+ return false;
+ }
+
+ bool hasComplexAddress() const {
+ assert(ValueLoc.get() && "Expected DBG_VALUE, not MMI variable");
+ assert((FrameIndexExprs.empty() ||
+ (FrameIndexExprs.size() == 1 &&
+ FrameIndexExprs[0].Expr->getNumElements())) &&
+ "Invalid Expr for DBG_VALUE");
+ return !FrameIndexExprs.empty();
+ }
+
+ const DIType *getType() const;
+
+ static bool classof(const DbgEntity *N) {
+ return N->getDbgEntityID() == DbgVariableKind;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+/// This class is used to track label information.
+///
+/// Labels are collected from \c DBG_LABEL instructions.
+class DbgLabel : public DbgEntity {
+ const MCSymbol *Sym; /// Symbol before DBG_LABEL instruction.
+
+public:
+ /// We need MCSymbol information to generate DW_AT_low_pc.
+ DbgLabel(const DILabel *L, const DILocation *IA, const MCSymbol *Sym = nullptr)
+ : DbgEntity(L, IA, DbgLabelKind), Sym(Sym) {}
+
+ /// Accessors.
+ /// @{
+ const DILabel *getLabel() const { return cast<DILabel>(getEntity()); }
+ const MCSymbol *getSymbol() const { return Sym; }
+
+ StringRef getName() const { return getLabel()->getName(); }
+ /// @}
+
+ /// Translate tag to proper Dwarf tag.
+ dwarf::Tag getTag() const {
+ return dwarf::DW_TAG_label;
+ }
+
+ static bool classof(const DbgEntity *N) {
+ return N->getDbgEntityID() == DbgLabelKind;
+ }
+};
+
+/// Used for tracking debug info about call site parameters.
+class DbgCallSiteParam {
+private:
+ unsigned Register; ///< Parameter register at the callee entry point.
+ DbgValueLoc Value; ///< Corresponding location for the parameter value at
+ ///< the call site.
+public:
+ DbgCallSiteParam(unsigned Reg, DbgValueLoc Val)
+ : Register(Reg), Value(Val) {
+ assert(Reg && "Parameter register cannot be undef");
+ }
+
+ unsigned getRegister() const { return Register; }
+ DbgValueLoc getValue() const { return Value; }
+};
+
+/// Collection used for storing debug call site parameters.
+using ParamSet = SmallVector<DbgCallSiteParam, 4>;
+
+/// Helper used to pair up a symbol and its DWARF compile unit.
+struct SymbolCU {
+ SymbolCU(DwarfCompileUnit *CU, const MCSymbol *Sym) : Sym(Sym), CU(CU) {}
+
+ const MCSymbol *Sym;
+ DwarfCompileUnit *CU;
+};
+
+/// The kind of accelerator tables we should emit.
+enum class AccelTableKind {
+ Default, ///< Platform default.
+ None, ///< None.
+ Apple, ///< .apple_names, .apple_namespaces, .apple_types, .apple_objc.
+ Dwarf, ///< DWARF v5 .debug_names.
+};
+
+/// Collects and handles dwarf debug information.
+class DwarfDebug : public DebugHandlerBase {
+ /// All DIEValues are allocated through this allocator.
+ BumpPtrAllocator DIEValueAllocator;
+
+ /// Maps MDNode with its corresponding DwarfCompileUnit.
+ MapVector<const MDNode *, DwarfCompileUnit *> CUMap;
+
+ /// Maps a CU DIE with its corresponding DwarfCompileUnit.
+ DenseMap<const DIE *, DwarfCompileUnit *> CUDieMap;
+
+ /// List of all labels used in aranges generation.
+ std::vector<SymbolCU> ArangeLabels;
+
+ /// Size of each symbol emitted (for those symbols that have a specific size).
+ DenseMap<const MCSymbol *, uint64_t> SymSize;
+
+ /// Collection of concrete variables/labels.
+ SmallVector<std::unique_ptr<DbgEntity>, 64> ConcreteEntities;
+
+ /// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
+ /// can refer to them in spite of insertions into this list.
+ DebugLocStream DebugLocs;
+
+ /// This is a collection of subprogram MDNodes that are processed to
+ /// create DIEs.
+ SmallSetVector<const DISubprogram *, 16> ProcessedSPNodes;
+
+ /// Map function-local imported entities to their parent local scope
+ /// (either DILexicalBlock or DISubprogram) for a processed function
+ /// (including inlined subprograms).
+ using MDNodeSet = SetVector<const MDNode *, SmallVector<const MDNode *, 2>,
+ SmallPtrSet<const MDNode *, 2>>;
+ DenseMap<const DILocalScope *, MDNodeSet> LocalDeclsPerLS;
+
+ /// If nonnull, stores the current machine function we're processing.
+ const MachineFunction *CurFn = nullptr;
+
+ /// If nonnull, stores the CU in which the previous subprogram was contained.
+ const DwarfCompileUnit *PrevCU = nullptr;
+
+ /// As an optimization, there is no need to emit an entry in the directory
+ /// table for the same directory as DW_AT_comp_dir.
+ StringRef CompilationDir;
+
+ /// Holder for the file specific debug information.
+ DwarfFile InfoHolder;
+
+ /// Holders for the various debug information flags that we might need to
+ /// have exposed. See accessor functions below for description.
+
+ /// Map from MDNodes for user-defined types to their type signatures. Also
+ /// used to keep track of which types we have emitted type units for.
+ DenseMap<const MDNode *, uint64_t> TypeSignatures;
+
+ DenseMap<const MCSection *, const MCSymbol *> SectionLabels;
+
+ SmallVector<
+ std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1>
+ TypeUnitsUnderConstruction;
+
+ /// Whether to use the GNU TLS opcode (instead of the standard opcode).
+ bool UseGNUTLSOpcode;
+
+ /// Whether to use DWARF 2 bitfields (instead of the DWARF 4 format).
+ bool UseDWARF2Bitfields;
+
+ /// Whether to emit all linkage names, or just abstract subprograms.
+ bool UseAllLinkageNames;
+
+ /// Use inlined strings.
+ bool UseInlineStrings = false;
+
+ /// Allow emission of .debug_ranges section.
+ bool UseRangesSection = true;
+
+ /// True if the sections themselves must be used as references, rather than
+ /// creating temporary symbols inside DWARF sections.
+ bool UseSectionsAsReferences = false;
+
+ /// Allow emission of the .debug_loc section.
+ bool UseLocSection = true;
+
+ /// Generate DWARF v4 type units.
+ bool GenerateTypeUnits;
+
+ /// Emit a .debug_macro section instead of .debug_macinfo.
+ bool UseDebugMacroSection;
+
+ /// Whether to use DW_OP_convert; it is avoided for consumers with known
+ /// incompatibilities.
+ bool EnableOpConvert;
+
+public:
+ enum class MinimizeAddrInV5 {
+ Default,
+ Disabled,
+ Ranges,
+ Expressions,
+ Form,
+ };
+
+private:
+ /// Force the use of DW_AT_ranges even for single-entry range lists.
+ MinimizeAddrInV5 MinimizeAddr = MinimizeAddrInV5::Disabled;
+
+ /// DWARF5 Experimental Options
+ /// @{
+ AccelTableKind TheAccelTableKind;
+ bool HasAppleExtensionAttributes;
+ bool HasSplitDwarf;
+
+ /// Whether to generate the DWARF v5 string offsets table.
+ /// It consists of a series of contributions, each preceded by a header.
+ /// The pre-DWARF v5 string offsets table for split dwarf is, in contrast,
+ /// a monolithic sequence of string offsets.
+ bool UseSegmentedStringOffsetsTable;
+
+ /// Enable production of call site parameters needed to print the debug entry
+ /// values. Useful for testing purposes when a debugger does not support the
+ /// feature yet.
+ bool EmitDebugEntryValues;
+
+ /// Separated Dwarf Variables
+ /// In general these will all be for bits that are left in the
+ /// original object file, rather than things that are meant
+ /// to be in the .dwo sections.
+
+ /// Holder for the skeleton information.
+ DwarfFile SkeletonHolder;
+
+ /// Store file names for type units under fission in a line table
+ /// header that will be emitted into debug_line.dwo.
+ // FIXME: replace this with a map from comp_dir to table so that we
+ // can emit multiple tables during LTO each of which uses directory
+ // 0, referencing the comp_dir of all the type units that use it.
+ MCDwarfDwoLineTable SplitTypeUnitFileTable;
+ /// @}
+
+ /// True iff there is only a single CU in this module.
+ bool SingleCU;
+ bool IsDarwin;
+
+ /// Map for tracking Fortran deferred CHARACTER lengths.
+ DenseMap<const DIStringType *, unsigned> StringTypeLocMap;
+
+ AddressPool AddrPool;
+
+ /// Accelerator tables.
+ AccelTable<DWARF5AccelTableData> AccelDebugNames;
+ AccelTable<AppleAccelTableOffsetData> AccelNames;
+ AccelTable<AppleAccelTableOffsetData> AccelObjC;
+ AccelTable<AppleAccelTableOffsetData> AccelNamespace;
+ AccelTable<AppleAccelTableTypeData> AccelTypes;
+
+ /// Identify a debugger for "tuning" the debug info.
+ ///
+ /// The "tuning" should be used to set defaults for individual feature flags
+ /// in DwarfDebug; if a given feature has a more specific command-line option,
+ /// that option should take precedence over the tuning.
+ DebuggerKind DebuggerTuning = DebuggerKind::Default;
+
+ MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &);
+
+ const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
+ return InfoHolder.getUnits();
+ }
+
+ using InlinedEntity = DbgValueHistoryMap::InlinedEntity;
+
+ void ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU,
+ const DINode *Node,
+ const MDNode *Scope);
+
+ DbgEntity *createConcreteEntity(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope,
+ const DINode *Node,
+ const DILocation *Location,
+ const MCSymbol *Sym = nullptr);
+
+ /// Construct a DIE for this abstract scope.
+ void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope);
+
+ /// Construct DIEs for call site entries describing the calls in \p MF.
+ void constructCallSiteEntryDIEs(const DISubprogram &SP, DwarfCompileUnit &CU,
+ DIE &ScopeDIE, const MachineFunction &MF);
+
+ template <typename DataT>
+ void addAccelNameImpl(const DICompileUnit &CU, AccelTable<DataT> &AppleAccel,
+ StringRef Name, const DIE &Die);
+
+ void finishEntityDefinitions();
+
+ void finishSubprogramDefinitions();
+
+ /// Finish off debug information after all functions have been
+ /// processed.
+ void finalizeModuleInfo();
+
+ /// Emit the debug info section.
+ void emitDebugInfo();
+
+ /// Emit the abbreviation section.
+ void emitAbbreviations();
+
+ /// Emit the string offsets table header.
+ void emitStringOffsetsTableHeader();
+
+ /// Emit a specified accelerator table.
+ template <typename AccelTableT>
+ void emitAccel(AccelTableT &Accel, MCSection *Section, StringRef TableName);
+
+ /// Emit DWARF v5 accelerator table.
+ void emitAccelDebugNames();
+
+ /// Emit visible names into a hashed accelerator table section.
+ void emitAccelNames();
+
+ /// Emit objective C classes and categories into a hashed
+ /// accelerator table section.
+ void emitAccelObjC();
+
+ /// Emit namespace dies into a hashed accelerator table.
+ void emitAccelNamespaces();
+
+ /// Emit type dies into a hashed accelerator table.
+ void emitAccelTypes();
+
+ /// Emit visible names and types into debug pubnames and pubtypes sections.
+ void emitDebugPubSections();
+
+ void emitDebugPubSection(bool GnuStyle, StringRef Name,
+ DwarfCompileUnit *TheU,
+ const StringMap<const DIE *> &Globals);
+
+ /// Emit null-terminated strings into a debug str section.
+ void emitDebugStr();
+
+ /// Emit variable locations into a debug loc section.
+ void emitDebugLoc();
+
+ /// Emit variable locations into a debug loc dwo section.
+ void emitDebugLocDWO();
+
+ void emitDebugLocImpl(MCSection *Sec);
+
+ /// Emit address ranges into a debug aranges section.
+ void emitDebugARanges();
+
+ /// Emit address ranges into a debug ranges section.
+ void emitDebugRanges();
+ void emitDebugRangesDWO();
+ void emitDebugRangesImpl(const DwarfFile &Holder, MCSection *Section);
+
+ /// Emit macros into a debug macinfo section.
+ void emitDebugMacinfo();
+ /// Emit macros into a debug macinfo.dwo section.
+ void emitDebugMacinfoDWO();
+ void emitDebugMacinfoImpl(MCSection *Section);
+ void emitMacro(DIMacro &M);
+ void emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U);
+ void emitMacroFileImpl(DIMacroFile &F, DwarfCompileUnit &U,
+ unsigned StartFile, unsigned EndFile,
+ StringRef (*MacroFormToString)(unsigned Form));
+ void handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U);
+
+ /// DWARF 5 Experimental Split Dwarf Emitters
+
+ /// Initialize common features of skeleton units.
+ void initSkeletonUnit(const DwarfUnit &U, DIE &Die,
+ std::unique_ptr<DwarfCompileUnit> NewU);
+
+ /// Construct the split debug info compile unit for the debug info section.
+ /// In DWARF v5, the skeleton unit DIE may have the following attributes:
+ /// DW_AT_addr_base, DW_AT_comp_dir, DW_AT_dwo_name, DW_AT_high_pc,
+ /// DW_AT_low_pc, DW_AT_ranges, DW_AT_stmt_list, and DW_AT_str_offsets_base.
+ /// Prior to DWARF v5 it may also have DW_AT_GNU_dwo_id. DW_AT_GNU_dwo_name
+ /// is used instead of DW_AT_dwo_name, DW_AT_GNU_addr_base instead of
+ /// DW_AT_addr_base, and DW_AT_GNU_ranges_base instead of DW_AT_rnglists_base.
+ DwarfCompileUnit &constructSkeletonCU(const DwarfCompileUnit &CU);
+
+ /// Emit the debug info dwo section.
+ void emitDebugInfoDWO();
+
+ /// Emit the debug abbrev dwo section.
+ void emitDebugAbbrevDWO();
+
+ /// Emit the debug line dwo section.
+ void emitDebugLineDWO();
+
+ /// Emit the dwo string offsets table header.
+ void emitStringOffsetsTableHeaderDWO();
+
+ /// Emit the debug str dwo section.
+ void emitDebugStrDWO();
+
+ /// Emit DWO addresses.
+ void emitDebugAddr();
+
+ /// Flags to let the linker know we have emitted new style pubnames. Only
+ /// emit it here if we don't have a skeleton CU for split dwarf.
+ void addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const;
+
+ /// Create new DwarfCompileUnit for the given metadata node with tag
+ /// DW_TAG_compile_unit.
+ DwarfCompileUnit &getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit);
+ void finishUnitAttributes(const DICompileUnit *DIUnit,
+ DwarfCompileUnit &NewCU);
+
+ /// Register a source line with debug info. The label that is emitted
+ /// provides correspondence to the source line list.
+ void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
+ unsigned Flags);
+
+ /// Populate LexicalScope entries with variables' info.
+ void collectEntityInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP,
+ DenseSet<InlinedEntity> &ProcessedVars);
+
+ /// Build the location list for all DBG_VALUEs in the function that
+ /// describe the same variable. Returns true if the resulting list has
+ /// only one entry that is valid for the entire variable's scope.
+ bool buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
+ const DbgValueHistoryMap::Entries &Entries);
+
+ /// Collect variable information from the side table maintained by MF.
+ void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU,
+ DenseSet<InlinedEntity> &P);
+
+ /// Emit the reference to the section.
+ void emitSectionReference(const DwarfCompileUnit &CU);
+
+protected:
+ /// Gather pre-function debug information.
+ void beginFunctionImpl(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function debug information.
+ void endFunctionImpl(const MachineFunction *MF) override;
+
+ /// Get Dwarf compile unit ID for line table.
+ unsigned getDwarfCompileUnitIDForLineTable(const DwarfCompileUnit &CU);
+
+ void skippedNonDebugFunction() override;
+
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ DwarfDebug(AsmPrinter *A);
+
+ ~DwarfDebug() override;
+
+ /// Emit all Dwarf sections that should come prior to the
+ /// content.
+ void beginModule(Module *M) override;
+
+ /// Emit all Dwarf sections that should come after the content.
+ void endModule() override;
+
+ /// Emits the initial debug location directive.
+ DebugLoc emitInitialLocDirective(const MachineFunction &MF, unsigned CUID);
+
+ /// Process beginning of an instruction.
+ void beginInstruction(const MachineInstr *MI) override;
+
+ /// Perform an MD5 checksum of \p Identifier and return the lower 64 bits.
+ static uint64_t makeTypeSignature(StringRef Identifier);
+
+ /// Add a DIE to the set of types that we're going to pull into
+ /// type units.
+ void addDwarfTypeUnitType(DwarfCompileUnit &CU, StringRef Identifier,
+ DIE &Die, const DICompositeType *CTy);
+
+ /// Add a label so that arange data can be generated for it.
+ void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); }
+
+ /// For symbols that have a size designated (e.g. common symbols),
+ /// this tracks that size.
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {
+ SymSize[Sym] = Size;
+ }
+
+ /// Returns whether we should emit all DW_AT_[MIPS_]linkage_name.
+ /// If not, we still might emit certain cases.
+ bool useAllLinkageNames() const { return UseAllLinkageNames; }
+
+ /// Returns whether to use DW_OP_GNU_push_tls_address instead of the
+ /// standard DW_OP_form_tls_address opcode.
+ bool useGNUTLSOpcode() const { return UseGNUTLSOpcode; }
+
+ /// Returns whether to use the DWARF2 format for bitfields instead of the
+ /// DWARF4 format.
+ bool useDWARF2Bitfields() const { return UseDWARF2Bitfields; }
+
+ /// Returns whether to use inline strings.
+ bool useInlineStrings() const { return UseInlineStrings; }
+
+ /// Returns whether ranges section should be emitted.
+ bool useRangesSection() const { return UseRangesSection; }
+
+ /// Returns whether range encodings should be used for single entry range
+ /// lists.
+ bool alwaysUseRanges(const DwarfCompileUnit &) const;
+
+ // Returns whether novel exprloc addrx+offset encodings should be used to
+ // reduce debug_addr size.
+ bool useAddrOffsetExpressions() const {
+ return MinimizeAddr == MinimizeAddrInV5::Expressions;
+ }
+
+ // Returns whether addrx+offset LLVM extension form should be used to reduce
+ // debug_addr size.
+ bool useAddrOffsetForm() const {
+ return MinimizeAddr == MinimizeAddrInV5::Form;
+ }
+
+ /// Returns whether to use sections as labels rather than temp symbols.
+ bool useSectionsAsReferences() const {
+ return UseSectionsAsReferences;
+ }
+
+ /// Returns whether .debug_loc section should be emitted.
+ bool useLocSection() const { return UseLocSection; }
+
+ /// Returns whether to generate DWARF v4 type units.
+ bool generateTypeUnits() const { return GenerateTypeUnits; }
+
+ // Experimental DWARF5 features.
+
+ /// Returns what kind (if any) of accelerator tables to emit.
+ AccelTableKind getAccelTableKind() const { return TheAccelTableKind; }
+
+ bool useAppleExtensionAttributes() const {
+ return HasAppleExtensionAttributes;
+ }
+
+ /// Returns whether the debug info is emitted using the split DWARF
+ /// (Fission/.dwo) scheme.
+ bool useSplitDwarf() const { return HasSplitDwarf; }
+
+ /// Returns whether to generate a string offsets table with (possibly shared)
+ /// contributions from each CU and type unit. This implies the use of
+ /// DW_FORM_strx* indirect references with DWARF v5 and beyond. Note that
+ /// DW_FORM_GNU_str_index is also an indirect reference, but it is used with
+ /// a pre-DWARF v5 implementation of split DWARF sections, which uses a
+ /// monolithic string offsets table.
+ bool useSegmentedStringOffsetsTable() const {
+ return UseSegmentedStringOffsetsTable;
+ }
+
+ bool emitDebugEntryValues() const {
+ return EmitDebugEntryValues;
+ }
+
+ bool useOpConvert() const {
+ return EnableOpConvert;
+ }
+
+ bool shareAcrossDWOCUs() const;
+
+ /// Returns the Dwarf Version.
+ uint16_t getDwarfVersion() const;
+
+ /// Returns a suitable DWARF form to represent a section offset, i.e.
+ /// * DW_FORM_sec_offset for DWARF version >= 4;
+ /// * DW_FORM_data8 for 64-bit DWARFv3;
+ /// * DW_FORM_data4 for 32-bit DWARFv3 and DWARFv2.
+ dwarf::Form getDwarfSectionOffsetForm() const;
+
+ /// Returns the previous CU that was being updated
+ const DwarfCompileUnit *getPrevCU() const { return PrevCU; }
+ void setPrevCU(const DwarfCompileUnit *PrevCU) { this->PrevCU = PrevCU; }
+
+ /// Terminate the line table by adding the last range label.
+ void terminateLineTable(const DwarfCompileUnit *CU);
+
+ /// Returns the entries for the .debug_loc section.
+ const DebugLocStream &getDebugLocs() const { return DebugLocs; }
+
+ /// Emit an entry for the debug loc section. This can be used to
+ /// handle an entry that's going to be emitted into the debug loc section.
+ void emitDebugLocEntry(ByteStreamer &Streamer,
+ const DebugLocStream::Entry &Entry,
+ const DwarfCompileUnit *CU);
+
+ /// Emit the location for a debug loc entry, including the size header.
+ void emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry,
+ const DwarfCompileUnit *CU);
+
+ void addSubprogramNames(const DICompileUnit &CU, const DISubprogram *SP,
+ DIE &Die);
+
+ AddressPool &getAddressPool() { return AddrPool; }
+
+ void addAccelName(const DICompileUnit &CU, StringRef Name, const DIE &Die);
+
+ void addAccelObjC(const DICompileUnit &CU, StringRef Name, const DIE &Die);
+
+ void addAccelNamespace(const DICompileUnit &CU, StringRef Name,
+ const DIE &Die);
+
+ void addAccelType(const DICompileUnit &CU, StringRef Name, const DIE &Die,
+ char Flags);
+
+ const MachineFunction *getCurrentFunction() const { return CurFn; }
+
+ /// A helper function to check whether the DIE for a given Scope is
+ /// going to be null.
+ bool isLexicalScopeDIENull(LexicalScope *Scope);
+
+ /// Find the matching DwarfCompileUnit for the given CU DIE.
+ DwarfCompileUnit *lookupCU(const DIE *Die) { return CUDieMap.lookup(Die); }
+ const DwarfCompileUnit *lookupCU(const DIE *Die) const {
+ return CUDieMap.lookup(Die);
+ }
+
+ unsigned getStringTypeLoc(const DIStringType *ST) const {
+ return StringTypeLocMap.lookup(ST);
+ }
+
+ void addStringTypeLoc(const DIStringType *ST, unsigned Loc) {
+ assert(ST);
+ if (Loc)
+ StringTypeLocMap[ST] = Loc;
+ }
+
+ /// \defgroup DebuggerTuning Predicates to tune DWARF for a given debugger.
+ ///
+ /// Returns whether we are "tuning" for a given debugger.
+ /// @{
+ bool tuneForGDB() const { return DebuggerTuning == DebuggerKind::GDB; }
+ bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; }
+ bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; }
+ bool tuneForDBX() const { return DebuggerTuning == DebuggerKind::DBX; }
+ /// @}
+
+ const MCSymbol *getSectionLabel(const MCSection *S);
+ void insertSectionLabel(const MCSymbol *S);
+
+ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
+ const DbgValueLoc &Value,
+ DwarfExpression &DwarfExpr);
+
+ /// If the \p File has an MD5 checksum, return it as an MD5Result
+ /// allocated in the MCContext.
+ std::optional<MD5::MD5Result> getMD5AsBytes(const DIFile *File) const;
+
+ MDNodeSet &getLocalDeclsForScope(const DILocalScope *S) {
+ return LocalDeclsPerLS[S];
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfException.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
new file mode 100644
index 000000000000..c2c11c7bc14d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -0,0 +1,110 @@
+//===-- DwarfException.h - Dwarf Exception Framework -----------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+
+#include "EHStreamer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCDwarf.h"
+
+namespace llvm {
+class MachineFunction;
+class ARMTargetStreamer;
+
+class LLVM_LIBRARY_VISIBILITY DwarfCFIException : public EHStreamer {
+ /// Per-function flag to indicate if .cfi_personality should be emitted.
+ bool shouldEmitPersonality = false;
+
+ /// Per-function flag to indicate if .cfi_personality must be emitted.
+ bool forceEmitPersonality = false;
+
+ /// Per-function flag to indicate if .cfi_lsda should be emitted.
+ bool shouldEmitLSDA = false;
+
+ /// Per-function flag to indicate if frame CFI info should be emitted.
+ bool shouldEmitCFI = false;
+
+ /// Per-module flag to indicate if .cfi_section has been emitted.
+ bool hasEmittedCFISections = false;
+
+ /// Vector of all personality functions seen so far in the module.
+ std::vector<const GlobalValue *> Personalities;
+
+ void addPersonality(const GlobalValue *Personality);
+
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ DwarfCFIException(AsmPrinter *A);
+ ~DwarfCFIException() override;
+
+ /// Emit all exception information that should come after the content.
+ void endModule() override;
+
+ /// Gather pre-function exception information. Assumes it is emitted
+ /// immediately after the function entry point.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function exception information.
+ void endFunction(const MachineFunction *) override;
+
+ void beginBasicBlockSection(const MachineBasicBlock &MBB) override;
+ void endBasicBlockSection(const MachineBasicBlock &MBB) override;
+};
+
+class LLVM_LIBRARY_VISIBILITY ARMException : public EHStreamer {
+ /// Per-function flag to indicate if frame CFI info should be emitted.
+ bool shouldEmitCFI = false;
+
+ /// Per-module flag to indicate if .cfi_section has been emitted.
+ bool hasEmittedCFISections = false;
+
+ void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) override;
+ ARMTargetStreamer &getTargetStreamer();
+
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ ARMException(AsmPrinter *A);
+ ~ARMException() override;
+
+ /// Emit all exception information that should come after the content.
+ void endModule() override {}
+
+ /// Gather pre-function exception information. Assumes it is emitted
+ /// immediately after the function entry point.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function exception information.
+ void endFunction(const MachineFunction *) override;
+
+ void markFunctionEnd() override;
+};
+
+class LLVM_LIBRARY_VISIBILITY AIXException : public EHStreamer {
+ /// This is AIX's compat unwind section, which the unwinder uses to find
+ /// the location of the LSDA area and the personality routine.
+ void emitExceptionInfoTable(const MCSymbol *LSDA, const MCSymbol *PerSym);
+
+public:
+ AIXException(AsmPrinter *A);
+
+ void endModule() override {}
+ void beginFunction(const MachineFunction *MF) override {}
+ void endFunction(const MachineFunction *MF) override;
+};
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
new file mode 100644
index 000000000000..7623b7fb7c5d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -0,0 +1,740 @@
+//===- llvm/CodeGen/DwarfExpression.cpp - Dwarf Debug Framework -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfExpression.h"
+#include "DwarfCompileUnit.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+void DwarfExpression::emitConstu(uint64_t Value) {
+ if (Value < 32)
+ emitOp(dwarf::DW_OP_lit0 + Value);
+ else if (Value == std::numeric_limits<uint64_t>::max()) {
+ // Only do this for 64-bit values as the DWARF expression stack uses
+ // target-address-size values.
+ emitOp(dwarf::DW_OP_lit0);
+ emitOp(dwarf::DW_OP_not);
+ } else {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(Value);
+ }
+}
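A minimal, self-contained sketch (not LLVM code; the helper name is invented) of the size trade-off emitConstu makes above: values below 32 encode as a single DW_OP_lit<n> byte, the all-ones value as the two-byte pair DW_OP_lit0 DW_OP_not, and everything else as DW_OP_constu followed by a ULEB128 operand.

#include <cstdint>
#include <cstdio>

// Byte count of the encoding the logic above would pick for Value.
unsigned encodedSize(uint64_t Value) {
  if (Value < 32)
    return 1;                                   // single DW_OP_lit<n> opcode
  if (Value == UINT64_MAX)
    return 2;                                   // DW_OP_lit0, DW_OP_not
  unsigned Bytes = 1;                           // DW_OP_constu opcode
  do { Value >>= 7; ++Bytes; } while (Value);   // ULEB128 operand bytes
  return Bytes;
}

int main() {
  std::printf("%u %u %u\n", encodedSize(5), encodedSize(1000),
              encodedSize(UINT64_MAX));         // prints: 1 3 2
}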
+
+void DwarfExpression::addReg(int DwarfReg, const char *Comment) {
+ assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+ assert((isUnknownLocation() || isRegisterLocation()) &&
+ "location description already locked down");
+ LocationKind = Register;
+ if (DwarfReg < 32) {
+ emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
+ } else {
+ emitOp(dwarf::DW_OP_regx, Comment);
+ emitUnsigned(DwarfReg);
+ }
+}
+
+void DwarfExpression::addBReg(int DwarfReg, int Offset) {
+ assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+ assert(!isRegisterLocation() && "location description already locked down");
+ if (DwarfReg < 32) {
+ emitOp(dwarf::DW_OP_breg0 + DwarfReg);
+ } else {
+ emitOp(dwarf::DW_OP_bregx);
+ emitUnsigned(DwarfReg);
+ }
+ emitSigned(Offset);
+}
+
+void DwarfExpression::addFBReg(int Offset) {
+ emitOp(dwarf::DW_OP_fbreg);
+ emitSigned(Offset);
+}
+
+void DwarfExpression::addOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
+ if (!SizeInBits)
+ return;
+
+ const unsigned SizeOfByte = 8;
+ if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
+ emitOp(dwarf::DW_OP_bit_piece);
+ emitUnsigned(SizeInBits);
+ emitUnsigned(OffsetInBits);
+ } else {
+ emitOp(dwarf::DW_OP_piece);
+ unsigned ByteSize = SizeInBits / SizeOfByte;
+ emitUnsigned(ByteSize);
+ }
+ this->OffsetInBits += SizeInBits;
+}
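A self-contained sketch of the opcode choice addOpPiece makes above; the opcode values come from the DWARF specification and the helper name is invented for illustration.

#include <cstdint>
#include <cstdio>

// DWARF opcode values (per the DWARF spec), for illustration only.
constexpr uint8_t DW_OP_piece = 0x93;
constexpr uint8_t DW_OP_bit_piece = 0x9d;

// A fragment that starts at a bit offset, or whose size is not a whole
// number of bytes, needs DW_OP_bit_piece; otherwise DW_OP_piece suffices.
uint8_t pickPieceOp(unsigned SizeInBits, unsigned OffsetInBits) {
  return (OffsetInBits > 0 || SizeInBits % 8) ? DW_OP_bit_piece : DW_OP_piece;
}

int main() {
  std::printf("%#x\n", pickPieceOp(32, 0)); // 0x93: 4 whole bytes at offset 0
  std::printf("%#x\n", pickPieceOp(12, 4)); // 0x9d: bit granularity required
}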
+
+void DwarfExpression::addShr(unsigned ShiftBy) {
+ emitConstu(ShiftBy);
+ emitOp(dwarf::DW_OP_shr);
+}
+
+void DwarfExpression::addAnd(unsigned Mask) {
+ emitConstu(Mask);
+ emitOp(dwarf::DW_OP_and);
+}
+
+bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
+ llvm::Register MachineReg,
+ unsigned MaxSize) {
+ if (!MachineReg.isPhysical()) {
+ if (isFrameRegister(TRI, MachineReg)) {
+ DwarfRegs.push_back(Register::createRegister(-1, nullptr));
+ return true;
+ }
+ return false;
+ }
+
+ int Reg = TRI.getDwarfRegNum(MachineReg, false);
+
+ // If this is a valid register number, emit it.
+ if (Reg >= 0) {
+ DwarfRegs.push_back(Register::createRegister(Reg, nullptr));
+ return true;
+ }
+
+ // Walk up the super-register chain until we find a valid number.
+ // For example, EAX on x86_64 is a 32-bit fragment of RAX with offset 0.
+ for (MCPhysReg SR : TRI.superregs(MachineReg)) {
+ Reg = TRI.getDwarfRegNum(SR, false);
+ if (Reg >= 0) {
+ unsigned Idx = TRI.getSubRegIndex(SR, MachineReg);
+ unsigned Size = TRI.getSubRegIdxSize(Idx);
+ unsigned RegOffset = TRI.getSubRegIdxOffset(Idx);
+ DwarfRegs.push_back(Register::createRegister(Reg, "super-register"));
+ // Use a DW_OP_bit_piece to describe the sub-register.
+ setSubRegisterPiece(Size, RegOffset);
+ return true;
+ }
+ }
+
+ // Otherwise, attempt to find a covering set of sub-register numbers.
+ // For example, Q0 on ARM is a composition of D0+D1.
+ unsigned CurPos = 0;
+ // The size of the register in bits.
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(MachineReg);
+ unsigned RegSize = TRI.getRegSizeInBits(*RC);
+ // Keep track of the bits in the register we already emitted, so we
+ // can avoid emitting redundant aliasing subregs. Because this is
+ // just doing a greedy scan of all subregisters, it is possible that
+ // this doesn't find a combination of subregisters that fully cover
+ // the register (even though one may exist).
+ SmallBitVector Coverage(RegSize, false);
+ for (MCPhysReg SR : TRI.subregs(MachineReg)) {
+ unsigned Idx = TRI.getSubRegIndex(MachineReg, SR);
+ unsigned Size = TRI.getSubRegIdxSize(Idx);
+ unsigned Offset = TRI.getSubRegIdxOffset(Idx);
+ Reg = TRI.getDwarfRegNum(SR, false);
+ if (Reg < 0)
+ continue;
+
+ // Used to build the intersection between the bits we already
+ // emitted and the bits covered by this subregister.
+ SmallBitVector CurSubReg(RegSize, false);
+ CurSubReg.set(Offset, Offset + Size);
+
+ // If this sub-register has a DWARF number and we haven't covered
+ // its range, and its range covers the value, emit a DWARF piece for it.
+ if (Offset < MaxSize && CurSubReg.test(Coverage)) {
+ // Emit a piece for any gap in the coverage.
+ if (Offset > CurPos)
+ DwarfRegs.push_back(Register::createSubRegister(
+ -1, Offset - CurPos, "no DWARF register encoding"));
+ if (Offset == 0 && Size >= MaxSize)
+ DwarfRegs.push_back(Register::createRegister(Reg, "sub-register"));
+ else
+ DwarfRegs.push_back(Register::createSubRegister(
+ Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"));
+ }
+ // Mark it as emitted.
+ Coverage.set(Offset, Offset + Size);
+ CurPos = Offset + Size;
+ }
+ // Failed to find any DWARF encoding.
+ if (CurPos == 0)
+ return false;
+ // Found a partial or complete DWARF encoding.
+ if (CurPos < RegSize)
+ DwarfRegs.push_back(Register::createSubRegister(
+ -1, RegSize - CurPos, "no DWARF register encoding"));
+ return true;
+}
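A toy, self-contained sketch of the greedy sub-register scan above. The sub-register table and DWARF numbers are invented; the real code queries TargetRegisterInfo and uses SmallBitVector for the coverage mask.

#include <cstdio>
#include <vector>

struct SubReg { int DwarfNum; unsigned Offset, Size; }; // hypothetical data

int main() {
  const unsigned RegSize = 128; // a 128-bit register built from two halves
  // Candidate sub-registers in scan order; the third adds no new bits.
  std::vector<SubReg> Subs = {{256, 0, 64}, {257, 64, 64}, {300, 0, 128}};
  std::vector<bool> Covered(RegSize, false);
  unsigned CurPos = 0;
  for (const SubReg &S : Subs) {
    bool AddsNewBits = false;
    for (unsigned B = S.Offset; B < S.Offset + S.Size; ++B)
      AddsNewBits |= !Covered[B];
    if (AddsNewBits) {
      if (S.Offset > CurPos) // gap with no DWARF encoding
        std::printf("piece (undefined): %u bits\n", S.Offset - CurPos);
      std::printf("reg %d: piece %u bits\n", S.DwarfNum, S.Size);
    }
    for (unsigned B = S.Offset; B < S.Offset + S.Size; ++B)
      Covered[B] = true; // mark as covered, even when skipped
    CurPos = S.Offset + S.Size;
  }
}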
+
+void DwarfExpression::addStackValue() {
+ if (DwarfVersion >= 4)
+ emitOp(dwarf::DW_OP_stack_value);
+}
+
+void DwarfExpression::addSignedConstant(int64_t Value) {
+ assert(isImplicitLocation() || isUnknownLocation());
+ LocationKind = Implicit;
+ emitOp(dwarf::DW_OP_consts);
+ emitSigned(Value);
+}
+
+void DwarfExpression::addUnsignedConstant(uint64_t Value) {
+ assert(isImplicitLocation() || isUnknownLocation());
+ LocationKind = Implicit;
+ emitConstu(Value);
+}
+
+void DwarfExpression::addUnsignedConstant(const APInt &Value) {
+ assert(isImplicitLocation() || isUnknownLocation());
+ LocationKind = Implicit;
+
+ unsigned Size = Value.getBitWidth();
+ const uint64_t *Data = Value.getRawData();
+
+ // Chop it up into 64-bit pieces, because that's the maximum that
+ // addUnsignedConstant takes.
+ unsigned Offset = 0;
+ while (Offset < Size) {
+ addUnsignedConstant(*Data++);
+ if (Offset == 0 && Size <= 64)
+ break;
+ addStackValue();
+ addOpPiece(std::min(Size - Offset, 64u), Offset);
+ Offset += 64;
+ }
+}
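A self-contained sketch of the chopping loop above, with a plain word array standing in for APInt::getRawData(): once the value is wider than 64 bits, each word gets its own constant / stack-value / piece triple. The 128-bit constant is invented.

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical 128-bit constant, least-significant word first.
  const uint64_t Words[] = {0x1122334455667788ULL, 0x99aabbccddeeff00ULL};
  const unsigned SizeInBits = 128;
  for (unsigned Offset = 0; Offset < SizeInBits; Offset += 64)
    std::printf("DW_OP_constu %#llx; DW_OP_stack_value; DW_OP_piece 8\n",
                (unsigned long long)Words[Offset / 64]);
}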
+
+void DwarfExpression::addConstantFP(const APFloat &APF, const AsmPrinter &AP) {
+ assert(isImplicitLocation() || isUnknownLocation());
+ APInt API = APF.bitcastToAPInt();
+ int NumBytes = API.getBitWidth() / 8;
+ if (NumBytes == 4 /*float*/ || NumBytes == 8 /*double*/) {
+ // FIXME: Add support for `long double`.
+ emitOp(dwarf::DW_OP_implicit_value);
+ emitUnsigned(NumBytes /*Size of the block in bytes*/);
+
+ // The loop below is emitting the value starting at least significant byte,
+ // so we need to perform a byte-swap to get the byte order correct in case
+ // of a big-endian target.
+ if (AP.getDataLayout().isBigEndian())
+ API = API.byteSwap();
+
+ for (int i = 0; i < NumBytes; ++i) {
+ emitData1(API.getZExtValue() & 0xFF);
+ API = API.lshr(8);
+ }
+
+ return;
+ }
+ LLVM_DEBUG(
+ dbgs() << "Skipped DW_OP_implicit_value creation for ConstantFP of size: "
+ << API.getBitWidth() << " bits\n");
+}
+
+bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
+ DIExpressionCursor &ExprCursor,
+ llvm::Register MachineReg,
+ unsigned FragmentOffsetInBits) {
+ auto Fragment = ExprCursor.getFragmentInfo();
+ if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) {
+ LocationKind = Unknown;
+ return false;
+ }
+
+ bool HasComplexExpression = false;
+ auto Op = ExprCursor.peek();
+ if (Op && Op->getOp() != dwarf::DW_OP_LLVM_fragment)
+ HasComplexExpression = true;
+
+ // If the register can only be described by a complex expression (i.e.,
+ // multiple subregisters) it doesn't safely compose with another complex
+ // expression. For example, it is not possible to apply a DW_OP_deref
+ // operation to multiple DW_OP_pieces, since composite location descriptions
+ // do not push anything on the DWARF stack.
+ //
+ // DW_OP_entry_value operations can only hold a DWARF expression or a
+ // register location description, so we can't emit a single entry value
+ // covering a composite location description. In the future we may want to
+ // emit entry value operations for each register location in the composite
+ // location, but until that is supported do not emit anything.
+ if ((HasComplexExpression || IsEmittingEntryValue) && DwarfRegs.size() > 1) {
+ if (IsEmittingEntryValue)
+ cancelEntryValue();
+ DwarfRegs.clear();
+ LocationKind = Unknown;
+ return false;
+ }
+
+ // Handle simple register locations. If we are supposed to emit
+ // a call site parameter expression and if that expression is just a register
+ // location, emit it with addBReg and offset 0, because we should emit a DWARF
+ // expression representing a value, rather than a location.
+ if ((!isParameterValue() && !isMemoryLocation() && !HasComplexExpression) ||
+ isEntryValue()) {
+ auto FragmentInfo = ExprCursor.getFragmentInfo();
+ unsigned RegSize = 0;
+ for (auto &Reg : DwarfRegs) {
+ RegSize += Reg.SubRegSize;
+ if (Reg.DwarfRegNo >= 0)
+ addReg(Reg.DwarfRegNo, Reg.Comment);
+ if (FragmentInfo)
+ if (RegSize > FragmentInfo->SizeInBits)
+ // If the register is larger than the current fragment stop
+ // once the fragment is covered.
+ break;
+ addOpPiece(Reg.SubRegSize);
+ }
+
+ if (isEntryValue()) {
+ finalizeEntryValue();
+
+ if (!isIndirect() && !isParameterValue() && !HasComplexExpression &&
+ DwarfVersion >= 4)
+ emitOp(dwarf::DW_OP_stack_value);
+ }
+
+ DwarfRegs.clear();
+ // If we need to mask out a subregister, do it now, unless the next
+ // operation would emit an OpPiece anyway.
+ auto NextOp = ExprCursor.peek();
+ if (SubRegisterSizeInBits && NextOp &&
+ (NextOp->getOp() != dwarf::DW_OP_LLVM_fragment))
+ maskSubRegister();
+ return true;
+ }
+
+ // Don't emit locations that cannot be expressed without DW_OP_stack_value.
+ if (DwarfVersion < 4)
+ if (any_of(ExprCursor, [](DIExpression::ExprOperand Op) -> bool {
+ return Op.getOp() == dwarf::DW_OP_stack_value;
+ })) {
+ DwarfRegs.clear();
+ LocationKind = Unknown;
+ return false;
+ }
+
+ // TODO: We should not give up here but the following code needs to be changed
+ // to deal with multiple (sub)registers first.
+ if (DwarfRegs.size() > 1) {
+ LLVM_DEBUG(dbgs() << "TODO: giving up on debug information due to "
+ "multi-register usage.\n");
+ DwarfRegs.clear();
+ LocationKind = Unknown;
+ return false;
+ }
+
+ auto Reg = DwarfRegs[0];
+ bool FBReg = isFrameRegister(TRI, MachineReg);
+ int SignedOffset = 0;
+ assert(!Reg.isSubRegister() && "full register expected");
+
+ // Pattern-match combinations for which more efficient representations exist.
+ // [Reg, DW_OP_plus_uconst, Offset] --> [DW_OP_breg, Offset].
+ if (Op && (Op->getOp() == dwarf::DW_OP_plus_uconst)) {
+ uint64_t Offset = Op->getArg(0);
+ uint64_t IntMax = static_cast<uint64_t>(std::numeric_limits<int>::max());
+ if (Offset <= IntMax) {
+ SignedOffset = Offset;
+ ExprCursor.take();
+ }
+ }
+
+ // [Reg, DW_OP_constu, Offset, DW_OP_plus] --> [DW_OP_breg, Offset]
+ // [Reg, DW_OP_constu, Offset, DW_OP_minus] --> [DW_OP_breg,-Offset]
+ // If Reg is a subregister we need to mask it out before subtracting.
+ if (Op && Op->getOp() == dwarf::DW_OP_constu) {
+ uint64_t Offset = Op->getArg(0);
+ uint64_t IntMax = static_cast<uint64_t>(std::numeric_limits<int>::max());
+ auto N = ExprCursor.peekNext();
+ if (N && N->getOp() == dwarf::DW_OP_plus && Offset <= IntMax) {
+ SignedOffset = Offset;
+ ExprCursor.consume(2);
+ } else if (N && N->getOp() == dwarf::DW_OP_minus &&
+ !SubRegisterSizeInBits && Offset <= IntMax + 1) {
+ SignedOffset = -static_cast<int64_t>(Offset);
+ ExprCursor.consume(2);
+ }
+ }
+
+ if (FBReg)
+ addFBReg(SignedOffset);
+ else
+ addBReg(Reg.DwarfRegNo, SignedOffset);
+ DwarfRegs.clear();
+
+ // If we need to mask out a subregister, do it now, unless the next
+ // operation would emit an OpPiece anyway.
+ auto NextOp = ExprCursor.peek();
+ if (SubRegisterSizeInBits && NextOp &&
+ (NextOp->getOp() != dwarf::DW_OP_LLVM_fragment))
+ maskSubRegister();
+
+ return true;
+}
+
+void DwarfExpression::setEntryValueFlags(const MachineLocation &Loc) {
+ LocationFlags |= EntryValue;
+ if (Loc.isIndirect())
+ LocationFlags |= Indirect;
+}
+
+void DwarfExpression::setLocation(const MachineLocation &Loc,
+ const DIExpression *DIExpr) {
+ if (Loc.isIndirect())
+ setMemoryLocationKind();
+
+ if (DIExpr->isEntryValue())
+ setEntryValueFlags(Loc);
+}
+
+void DwarfExpression::beginEntryValueExpression(
+ DIExpressionCursor &ExprCursor) {
+ auto Op = ExprCursor.take();
+ (void)Op;
+ assert(Op && Op->getOp() == dwarf::DW_OP_LLVM_entry_value);
+ assert(!IsEmittingEntryValue && "Already emitting entry value?");
+ assert(Op->getArg(0) == 1 &&
+ "Can currently only emit entry values covering a single operation");
+
+ SavedLocationKind = LocationKind;
+ LocationKind = Register;
+ IsEmittingEntryValue = true;
+ enableTemporaryBuffer();
+}
+
+void DwarfExpression::finalizeEntryValue() {
+ assert(IsEmittingEntryValue && "Entry value not open?");
+ disableTemporaryBuffer();
+
+ emitOp(CU.getDwarf5OrGNULocationAtom(dwarf::DW_OP_entry_value));
+
+ // Emit the entry value's size operand.
+ unsigned Size = getTemporaryBufferSize();
+ emitUnsigned(Size);
+
+ // Emit the entry value's DWARF block operand.
+ commitTemporaryBuffer();
+
+ LocationFlags &= ~EntryValue;
+ LocationKind = SavedLocationKind;
+ IsEmittingEntryValue = false;
+}
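A self-contained sketch of the size-prefixed block pattern that the temporary buffer supports above: bytes for the operand block are collected on the side so their count can be written before the bytes themselves. Opcode values follow DWARF 5; the byte layout is illustrative only.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint8_t> Main, Temp;
  std::vector<uint8_t> *Out = &Temp;          // enableTemporaryBuffer()
  Out->push_back(0x55);                       // e.g. DW_OP_reg5
  Out = &Main;                                // disableTemporaryBuffer()
  Main.push_back(0xa3);                       // DW_OP_entry_value (DWARF 5)
  Main.push_back((uint8_t)Temp.size());       // block size (ULEB128; one byte here)
  Main.insert(Main.end(), Temp.begin(), Temp.end()); // commitTemporaryBuffer()
  for (uint8_t B : Main)
    std::printf("%#04x ", B);                 // 0xa3 0x01 0x55
  std::printf("\n");
}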
+
+void DwarfExpression::cancelEntryValue() {
+ assert(IsEmittingEntryValue && "Entry value not open?");
+ disableTemporaryBuffer();
+
+ // The temporary buffer can't be emptied, so for now just assert that nothing
+ // has been emitted to it.
+ assert(getTemporaryBufferSize() == 0 &&
+ "Began emitting entry value block before cancelling entry value");
+
+ LocationKind = SavedLocationKind;
+ IsEmittingEntryValue = false;
+}
+
+unsigned DwarfExpression::getOrCreateBaseType(unsigned BitSize,
+ dwarf::TypeKind Encoding) {
+ // Reuse the base_type if we already have one in this CU otherwise we
+ // create a new one.
+ unsigned I = 0, E = CU.ExprRefedBaseTypes.size();
+ for (; I != E; ++I)
+ if (CU.ExprRefedBaseTypes[I].BitSize == BitSize &&
+ CU.ExprRefedBaseTypes[I].Encoding == Encoding)
+ break;
+
+ if (I == E)
+ CU.ExprRefedBaseTypes.emplace_back(BitSize, Encoding);
+ return I;
+}
+
+/// Assuming a well-formed expression, match "DW_OP_deref*
+/// DW_OP_LLVM_fragment?".
+static bool isMemoryLocation(DIExpressionCursor ExprCursor) {
+ while (ExprCursor) {
+ auto Op = ExprCursor.take();
+ switch (Op->getOp()) {
+ case dwarf::DW_OP_deref:
+ case dwarf::DW_OP_LLVM_fragment:
+ break;
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
+void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor) {
+ addExpression(std::move(ExprCursor),
+ [](unsigned Idx, DIExpressionCursor &Cursor) -> bool {
+ llvm_unreachable("unhandled opcode found in expression");
+ });
+}
+
+bool DwarfExpression::addExpression(
+ DIExpressionCursor &&ExprCursor,
+ llvm::function_ref<bool(unsigned, DIExpressionCursor &)> InsertArg) {
+ // Entry values can currently only cover the initial register location,
+ // and not any other parts of the following DWARF expression.
+ assert(!IsEmittingEntryValue && "Can't emit entry value around expression");
+
+ std::optional<DIExpression::ExprOperand> PrevConvertOp;
+
+ while (ExprCursor) {
+ auto Op = ExprCursor.take();
+ uint64_t OpNum = Op->getOp();
+
+ if (OpNum >= dwarf::DW_OP_reg0 && OpNum <= dwarf::DW_OP_reg31) {
+ emitOp(OpNum);
+ continue;
+ } else if (OpNum >= dwarf::DW_OP_breg0 && OpNum <= dwarf::DW_OP_breg31) {
+ addBReg(OpNum - dwarf::DW_OP_breg0, Op->getArg(0));
+ continue;
+ }
+
+ switch (OpNum) {
+ case dwarf::DW_OP_LLVM_arg:
+ if (!InsertArg(Op->getArg(0), ExprCursor)) {
+ LocationKind = Unknown;
+ return false;
+ }
+ break;
+ case dwarf::DW_OP_LLVM_fragment: {
+ unsigned SizeInBits = Op->getArg(1);
+ unsigned FragmentOffset = Op->getArg(0);
+ // The fragment offset must have already been adjusted by emitting an
+ // empty DW_OP_piece / DW_OP_bit_piece before we emitted the base
+ // location.
+ assert(OffsetInBits >= FragmentOffset && "fragment offset not added?");
+ assert(SizeInBits >= OffsetInBits - FragmentOffset && "size underflow");
+
+ // If addMachineReg already emitted DW_OP_piece operations to represent
+ // a super-register by splicing together sub-registers, subtract the size
+ // of the pieces that was already emitted.
+ SizeInBits -= OffsetInBits - FragmentOffset;
+
+ // If addMachineReg requested a DW_OP_bit_piece to stencil out a
+ // sub-register that is smaller than the current fragment's size, use it.
+ if (SubRegisterSizeInBits)
+ SizeInBits = std::min<unsigned>(SizeInBits, SubRegisterSizeInBits);
+
+ // Emit a DW_OP_stack_value for implicit location descriptions.
+ if (isImplicitLocation())
+ addStackValue();
+
+ // Emit the DW_OP_piece.
+ addOpPiece(SizeInBits, SubRegisterOffsetInBits);
+ setSubRegisterPiece(0, 0);
+ // Reset the location description kind.
+ LocationKind = Unknown;
+ return true;
+ }
+ case dwarf::DW_OP_plus_uconst:
+ assert(!isRegisterLocation());
+ emitOp(dwarf::DW_OP_plus_uconst);
+ emitUnsigned(Op->getArg(0));
+ break;
+ case dwarf::DW_OP_plus:
+ case dwarf::DW_OP_minus:
+ case dwarf::DW_OP_mul:
+ case dwarf::DW_OP_div:
+ case dwarf::DW_OP_mod:
+ case dwarf::DW_OP_or:
+ case dwarf::DW_OP_and:
+ case dwarf::DW_OP_xor:
+ case dwarf::DW_OP_shl:
+ case dwarf::DW_OP_shr:
+ case dwarf::DW_OP_shra:
+ case dwarf::DW_OP_lit0:
+ case dwarf::DW_OP_not:
+ case dwarf::DW_OP_dup:
+ case dwarf::DW_OP_push_object_address:
+ case dwarf::DW_OP_over:
+ case dwarf::DW_OP_eq:
+ case dwarf::DW_OP_ne:
+ case dwarf::DW_OP_gt:
+ case dwarf::DW_OP_ge:
+ case dwarf::DW_OP_lt:
+ case dwarf::DW_OP_le:
+ emitOp(OpNum);
+ break;
+ case dwarf::DW_OP_deref:
+ assert(!isRegisterLocation());
+ if (!isMemoryLocation() && ::isMemoryLocation(ExprCursor))
+ // Turning this into a memory location description makes the deref
+ // implicit.
+ LocationKind = Memory;
+ else
+ emitOp(dwarf::DW_OP_deref);
+ break;
+ case dwarf::DW_OP_constu:
+ assert(!isRegisterLocation());
+ emitConstu(Op->getArg(0));
+ break;
+ case dwarf::DW_OP_consts:
+ assert(!isRegisterLocation());
+ emitOp(dwarf::DW_OP_consts);
+ emitSigned(Op->getArg(0));
+ break;
+ case dwarf::DW_OP_LLVM_convert: {
+ unsigned BitSize = Op->getArg(0);
+ dwarf::TypeKind Encoding = static_cast<dwarf::TypeKind>(Op->getArg(1));
+ if (DwarfVersion >= 5 && CU.getDwarfDebug().useOpConvert()) {
+ emitOp(dwarf::DW_OP_convert);
+ // If targeting a location list, simply emit the index into the raw
+ // byte stream as ULEB128; DwarfDebug::emitDebugLocEntry has been
+ // fitted with means to extract it later.
+ // If targeting an inlined DW_AT_location, insert a DIEBaseTypeRef
+ // (containing the index and a resolve mechanism during emit) into the
+ // DIE value list.
+ emitBaseTypeRef(getOrCreateBaseType(BitSize, Encoding));
+ } else {
+ if (PrevConvertOp && PrevConvertOp->getArg(0) < BitSize) {
+ if (Encoding == dwarf::DW_ATE_signed)
+ emitLegacySExt(PrevConvertOp->getArg(0));
+ else if (Encoding == dwarf::DW_ATE_unsigned)
+ emitLegacyZExt(PrevConvertOp->getArg(0));
+ PrevConvertOp = std::nullopt;
+ } else {
+ PrevConvertOp = Op;
+ }
+ }
+ break;
+ }
+ case dwarf::DW_OP_stack_value:
+ LocationKind = Implicit;
+ break;
+ case dwarf::DW_OP_swap:
+ assert(!isRegisterLocation());
+ emitOp(dwarf::DW_OP_swap);
+ break;
+ case dwarf::DW_OP_xderef:
+ assert(!isRegisterLocation());
+ emitOp(dwarf::DW_OP_xderef);
+ break;
+ case dwarf::DW_OP_deref_size:
+ emitOp(dwarf::DW_OP_deref_size);
+ emitData1(Op->getArg(0));
+ break;
+ case dwarf::DW_OP_LLVM_tag_offset:
+ TagOffset = Op->getArg(0);
+ break;
+ case dwarf::DW_OP_regx:
+ emitOp(dwarf::DW_OP_regx);
+ emitUnsigned(Op->getArg(0));
+ break;
+ case dwarf::DW_OP_bregx:
+ emitOp(dwarf::DW_OP_bregx);
+ emitUnsigned(Op->getArg(0));
+ emitSigned(Op->getArg(1));
+ break;
+ default:
+ llvm_unreachable("unhandled opcode found in expression");
+ }
+ }
+
+ if (isImplicitLocation() && !isParameterValue())
+ // Turn this into an implicit location description.
+ addStackValue();
+
+ return true;
+}
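The DW_OP_LLVM_convert case above mentions emitting the base-type index as ULEB128. Below is a self-contained encoder sketch of that variable-length format; LLVM provides encodeULEB128 in llvm/Support/LEB128.h, so this is purely illustrative.

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<uint8_t> encodeULEB128(uint64_t Value) {
  std::vector<uint8_t> Out;
  do {
    uint8_t Byte = Value & 0x7f; // low 7 bits of the value
    Value >>= 7;
    if (Value)
      Byte |= 0x80;              // continuation bit: more bytes follow
    Out.push_back(Byte);
  } while (Value);
  return Out;
}

int main() {
  for (uint8_t B : encodeULEB128(624485)) // classic example: e5 8e 26
    std::printf("%02x ", B);
  std::printf("\n");
}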
+
+/// Add masking operations to stencil out a subregister.
+void DwarfExpression::maskSubRegister() {
+ assert(SubRegisterSizeInBits && "no subregister was registered");
+ if (SubRegisterOffsetInBits > 0)
+ addShr(SubRegisterOffsetInBits);
+ uint64_t Mask = (1ULL << (uint64_t)SubRegisterSizeInBits) - 1ULL;
+ addAnd(Mask);
+}
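A self-contained sketch of what the DW_OP_shr / DW_OP_and pair emitted above computes on the DWARF stack; the register value is invented, and sub-register sizes of 64 bits or more are not handled by the sketch.

#include <cstdint>
#include <cstdio>

uint64_t extractSubReg(uint64_t FullReg, unsigned OffsetInBits,
                       unsigned SizeInBits) {
  if (OffsetInBits > 0)
    FullReg >>= OffsetInBits;                         // DW_OP_shr
  uint64_t Mask = (1ULL << (uint64_t)SizeInBits) - 1; // same mask as above
  return FullReg & Mask;                              // DW_OP_and
}

int main() {
  // A 16-bit sub-register at bit offset 16 of a 64-bit register value.
  std::printf("%#llx\n",
              (unsigned long long)extractSubReg(0xdeadbeefcafef00dULL, 16, 16));
  // prints 0xcafe
}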
+
+void DwarfExpression::finalize() {
+ assert(DwarfRegs.size() == 0 && "dwarf registers not emitted");
+ // Emit any outstanding DW_OP_piece operations to mask out subregisters.
+ if (SubRegisterSizeInBits == 0)
+ return;
+ // Don't emit a DW_OP_piece for a subregister at offset 0.
+ if (SubRegisterOffsetInBits == 0)
+ return;
+ addOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits);
+}
+
+void DwarfExpression::addFragmentOffset(const DIExpression *Expr) {
+ if (!Expr || !Expr->isFragment())
+ return;
+
+ uint64_t FragmentOffset = Expr->getFragmentInfo()->OffsetInBits;
+ assert(FragmentOffset >= OffsetInBits &&
+ "overlapping or duplicate fragments");
+ if (FragmentOffset > OffsetInBits)
+ addOpPiece(FragmentOffset - OffsetInBits);
+ OffsetInBits = FragmentOffset;
+}
+
+void DwarfExpression::emitLegacySExt(unsigned FromBits) {
+ // (((X >> (FromBits - 1)) * (~0)) << FromBits) | X
+ emitOp(dwarf::DW_OP_dup);
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(FromBits - 1);
+ emitOp(dwarf::DW_OP_shr);
+ emitOp(dwarf::DW_OP_lit0);
+ emitOp(dwarf::DW_OP_not);
+ emitOp(dwarf::DW_OP_mul);
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(FromBits);
+ emitOp(dwarf::DW_OP_shl);
+ emitOp(dwarf::DW_OP_or);
+}
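A self-contained check of the stack formula quoted in the comment above, evaluated in ordinary 64-bit arithmetic (the usual width of a DWARF stack element); the helper name is invented.

#include <cstdint>
#include <cstdio>

uint64_t legacySExt(uint64_t X, unsigned FromBits) {
  uint64_t SignBit = X >> (FromBits - 1); // DW_OP_dup / constu / shr
  uint64_t AllOnes = SignBit * ~0ULL;     // DW_OP_lit0 / not / mul
  return (AllOnes << FromBits) | X;       // DW_OP_constu / shl / or
}

int main() {
  std::printf("%lld\n", (long long)legacySExt(0xFFu, 8)); // -1
  std::printf("%lld\n", (long long)legacySExt(0x7Fu, 8)); // 127
}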
+
+void DwarfExpression::emitLegacyZExt(unsigned FromBits) {
+ // Heuristic to decide the most efficient encoding.
+ // A ULEB can encode 7 1-bits per byte.
+ if (FromBits / 7 < 1+1+1+1+1) {
+ // (X & ((1 << FromBits) - 1))
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned((1ULL << FromBits) - 1);
+ } else {
+ // Note that the DWARF 4 stack consists of pointer-sized elements,
+ // so technically it doesn't make sense to shift left more than 64
+ // bits. We leave that for the consumer to decide though. LLDB for
+ // example uses APInt for the stack elements and can still deal
+ // with this.
+ emitOp(dwarf::DW_OP_lit1);
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(FromBits);
+ emitOp(dwarf::DW_OP_shl);
+ emitOp(dwarf::DW_OP_lit1);
+ emitOp(dwarf::DW_OP_minus);
+ }
+ emitOp(dwarf::DW_OP_and);
+}
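A self-contained sketch of the trade-off behind the heuristic above: a ULEB128 byte carries 7 value bits, so the direct DW_OP_constu mask grows with FromBits while the shift-based sequence stays near six bytes; the FromBits / 7 < 5 test in the code is a rough cut of this comparison. Byte counts below ignore the trailing DW_OP_and, which both forms share.

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Number of bytes a value occupies as ULEB128 (7 value bits per byte).
unsigned ulebLength(uint64_t Value) {
  unsigned Len = 0;
  do { Value >>= 7; ++Len; } while (Value);
  return Len;
}

int main() {
  for (unsigned FromBits : {8u, 16u, 32u, 48u}) {
    uint64_t Mask = (1ULL << FromBits) - 1;       // all FromBits < 64 here
    unsigned ConstuForm = 1 + ulebLength(Mask);   // DW_OP_constu <mask>
    unsigned ShiftForm =                          // lit1 constu <n> shl lit1 minus
        1 + 1 + ulebLength(FromBits) + 1 + 1 + 1;
    std::printf("FromBits=%u: constu form %u bytes, shift form %u bytes\n",
                FromBits, ConstuForm, ShiftForm);
  }
}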
+
+void DwarfExpression::addWasmLocation(unsigned Index, uint64_t Offset) {
+ emitOp(dwarf::DW_OP_WASM_location);
+ emitUnsigned(Index == 4/*TI_LOCAL_INDIRECT*/ ? 0/*TI_LOCAL*/ : Index);
+ emitUnsigned(Offset);
+ if (Index == 4 /*TI_LOCAL_INDIRECT*/) {
+ assert(LocationKind == Unknown);
+ LocationKind = Memory;
+ } else {
+ assert(LocationKind == Implicit || LocationKind == Unknown);
+ LocationKind = Implicit;
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
new file mode 100644
index 000000000000..667a9efc6f6c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -0,0 +1,439 @@
+//===- llvm/CodeGen/DwarfExpression.h - Dwarf Expression Builder -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing DWARF expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+
+#include "ByteStreamer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <optional>
+
+namespace llvm {
+
+class AsmPrinter;
+class APInt;
+class DwarfCompileUnit;
+class DIELoc;
+class TargetRegisterInfo;
+class MachineLocation;
+
+/// Holds a DIExpression and keeps track of how many operands have been consumed
+/// so far.
+class DIExpressionCursor {
+ DIExpression::expr_op_iterator Start, End;
+
+public:
+ DIExpressionCursor(const DIExpression *Expr) {
+ if (!Expr) {
+ assert(Start == End);
+ return;
+ }
+ Start = Expr->expr_op_begin();
+ End = Expr->expr_op_end();
+ }
+
+ DIExpressionCursor(ArrayRef<uint64_t> Expr)
+ : Start(Expr.begin()), End(Expr.end()) {}
+
+ DIExpressionCursor(const DIExpressionCursor &) = default;
+
+ /// Consume one operation.
+ std::optional<DIExpression::ExprOperand> take() {
+ if (Start == End)
+ return std::nullopt;
+ return *(Start++);
+ }
+
+ /// Consume N operations.
+ void consume(unsigned N) { std::advance(Start, N); }
+
+ /// Return the current operation.
+ std::optional<DIExpression::ExprOperand> peek() const {
+ if (Start == End)
+ return std::nullopt;
+ return *(Start);
+ }
+
+ /// Return the next operation.
+ std::optional<DIExpression::ExprOperand> peekNext() const {
+ if (Start == End)
+ return std::nullopt;
+
+ auto Next = Start.getNext();
+ if (Next == End)
+ return std::nullopt;
+
+ return *Next;
+ }
+
+ /// Determine whether there are any operations left in this expression.
+ operator bool() const { return Start != End; }
+
+ DIExpression::expr_op_iterator begin() const { return Start; }
+ DIExpression::expr_op_iterator end() const { return End; }
+
+ /// Retrieve the fragment information, if any.
+ std::optional<DIExpression::FragmentInfo> getFragmentInfo() const {
+ return DIExpression::getFragmentInfo(Start, End);
+ }
+};
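A toy stand-in (plain integers rather than DIExpression operands, with an invented type name) illustrating the take/peek/peekNext contract of the cursor above.

#include <cstdio>
#include <iterator>
#include <optional>
#include <vector>

struct ToyCursor {
  std::vector<int>::const_iterator Start, End;

  std::optional<int> take() {           // consume one operation
    if (Start == End)
      return std::nullopt;
    return *Start++;
  }
  std::optional<int> peek() const {     // current operation, not consumed
    if (Start == End)
      return std::nullopt;
    return *Start;
  }
  std::optional<int> peekNext() const { // operation after the current one
    if (Start == End)
      return std::nullopt;
    auto Next = std::next(Start);
    if (Next == End)
      return std::nullopt;
    return *Next;
  }
  explicit operator bool() const { return Start != End; }
};

int main() {
  std::vector<int> Ops = {1, 2, 3};
  ToyCursor C{Ops.begin(), Ops.end()};
  while (C) {
    int Op = *C.take();
    std::printf("took %d, next is %d\n", Op, C.peek().value_or(-1));
  }
}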
+
+/// Base class containing the logic for constructing DWARF expressions
+/// independently of whether they are emitted into a DIE or into a .debug_loc
+/// entry.
+///
+/// Some DWARF operations, e.g. DW_OP_entry_value, need to calculate the size
+/// of a succeeding DWARF block before the latter is emitted to the output.
+/// To handle such cases, data can conditionally be emitted to a temporary
+/// buffer, which can later on be committed to the main output. The size of the
+/// temporary buffer is queryable, allowing for the size of the data to be
+/// emitted before the data is committed.
+class DwarfExpression {
+protected:
+ /// Holds information about all subregisters comprising a register location.
+ struct Register {
+ int DwarfRegNo;
+ unsigned SubRegSize;
+ const char *Comment;
+
+ /// Create a full register, no extra DW_OP_piece operators necessary.
+ static Register createRegister(int RegNo, const char *Comment) {
+ return {RegNo, 0, Comment};
+ }
+
+ /// Create a subregister that needs a DW_OP_piece operator with SizeInBits.
+ static Register createSubRegister(int RegNo, unsigned SizeInBits,
+ const char *Comment) {
+ return {RegNo, SizeInBits, Comment};
+ }
+
+ bool isSubRegister() const { return SubRegSize; }
+ };
+
+ /// Whether we are currently emitting an entry value operation.
+ bool IsEmittingEntryValue = false;
+
+ DwarfCompileUnit &CU;
+
+ /// The register location, if any.
+ SmallVector<Register, 2> DwarfRegs;
+
+ /// Current Fragment Offset in Bits.
+ uint64_t OffsetInBits = 0;
+
+ /// Sometimes we need to add a DW_OP_bit_piece to describe a subregister.
+ unsigned SubRegisterSizeInBits : 16;
+ unsigned SubRegisterOffsetInBits : 16;
+
+ /// The kind of location description being produced.
+ enum { Unknown = 0, Register, Memory, Implicit };
+
+ /// Additional location flags which may be combined with any location kind.
+ /// Currently, entry values are not supported for the Memory location kind.
+ enum { EntryValue = 1 << 0, Indirect = 1 << 1, CallSiteParamValue = 1 << 2 };
+
+ unsigned LocationKind : 3;
+ unsigned SavedLocationKind : 3;
+ unsigned LocationFlags : 3;
+ unsigned DwarfVersion : 4;
+
+public:
+ /// Set the location (\p Loc) and \ref DIExpression (\p DIExpr) to describe.
+ void setLocation(const MachineLocation &Loc, const DIExpression *DIExpr);
+
+ bool isUnknownLocation() const { return LocationKind == Unknown; }
+
+ bool isMemoryLocation() const { return LocationKind == Memory; }
+
+ bool isRegisterLocation() const { return LocationKind == Register; }
+
+ bool isImplicitLocation() const { return LocationKind == Implicit; }
+
+ bool isEntryValue() const { return LocationFlags & EntryValue; }
+
+ bool isIndirect() const { return LocationFlags & Indirect; }
+
+ bool isParameterValue() { return LocationFlags & CallSiteParamValue; }
+
+ std::optional<uint8_t> TagOffset;
+
+protected:
+ /// Push a DW_OP_piece / DW_OP_bit_piece for emitting later, if one is needed
+ /// to represent a subregister.
+ void setSubRegisterPiece(unsigned SizeInBits, unsigned OffsetInBits) {
+ assert(SizeInBits < 65536 && OffsetInBits < 65536);
+ SubRegisterSizeInBits = SizeInBits;
+ SubRegisterOffsetInBits = OffsetInBits;
+ }
+
+ /// Add masking operations to stencil out a subregister.
+ void maskSubRegister();
+
+ /// Output a dwarf operand and an optional assembler comment.
+ virtual void emitOp(uint8_t Op, const char *Comment = nullptr) = 0;
+
+ /// Emit a raw signed value.
+ virtual void emitSigned(int64_t Value) = 0;
+
+ /// Emit a raw unsigned value.
+ virtual void emitUnsigned(uint64_t Value) = 0;
+
+ virtual void emitData1(uint8_t Value) = 0;
+
+ virtual void emitBaseTypeRef(uint64_t Idx) = 0;
+
+ /// Start emitting data to the temporary buffer. The data stored in the
+ /// temporary buffer can be committed to the main output using
+ /// commitTemporaryBuffer().
+ virtual void enableTemporaryBuffer() = 0;
+
+ /// Disable emission to the temporary buffer. This does not commit data
+ /// in the temporary buffer to the main output.
+ virtual void disableTemporaryBuffer() = 0;
+
+ /// Return the emitted size, in number of bytes, for the data stored in the
+ /// temporary buffer.
+ virtual unsigned getTemporaryBufferSize() = 0;
+
+ /// Commit the data stored in the temporary buffer to the main output.
+ virtual void commitTemporaryBuffer() = 0;
+
+ /// Emit a normalized unsigned constant.
+ void emitConstu(uint64_t Value);
+
+ /// Return whether the given machine register is the frame register in the
+ /// current function.
+ virtual bool isFrameRegister(const TargetRegisterInfo &TRI,
+ llvm::Register MachineReg) = 0;
+
+ /// Emit a DW_OP_reg operation. Note that this is only legal inside a DWARF
+ /// register location description.
+ void addReg(int DwarfReg, const char *Comment = nullptr);
+
+ /// Emit a DW_OP_breg operation.
+ void addBReg(int DwarfReg, int Offset);
+
+ /// Emit DW_OP_fbreg <Offset>.
+ void addFBReg(int Offset);
+
+ /// Emit a partial DWARF register operation.
+ ///
+ /// \param MachineReg The register number.
+ /// \param MaxSize If the register must be composed from
+ /// sub-registers this is an upper bound
+ /// for how many bits the emitted DW_OP_piece
+ /// may cover.
+ ///
+  /// If size and offset are zero, an operation for the entire register is
+  /// emitted. Some targets do not provide a DWARF register number for every
+  /// register; in that case, this function will attempt to emit a DWARF
+  /// register by emitting a fragment of a super-register or by piecing together
+  /// multiple subregisters that alias the register.
+ ///
+ /// \return false if no DWARF register exists for MachineReg.
+ bool addMachineReg(const TargetRegisterInfo &TRI, llvm::Register MachineReg,
+ unsigned MaxSize = ~1U);
+
+ /// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment.
+ /// \param OffsetInBits This is an optional offset into the location that
+ /// is at the top of the DWARF stack.
+ void addOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+
+ /// Emit a shift-right dwarf operation.
+ void addShr(unsigned ShiftBy);
+
+ /// Emit a bitwise and dwarf operation.
+ void addAnd(unsigned Mask);
+
+ /// Emit a DW_OP_stack_value, if supported.
+ ///
+ /// The proper way to describe a constant value is DW_OP_constu <const>,
+ /// DW_OP_stack_value. Unfortunately, DW_OP_stack_value was not available
+ /// until DWARF 4, so we will continue to generate DW_OP_constu <const> for
+  /// DWARF 2 and DWARF 3. Technically, this is incorrect since DW_OP_constu
+  /// <const> actually describes a value at a constant address, not a constant
+ /// value. However, in the past there was no better way to describe a
+ /// constant value, so the producers and consumers started to rely on
+ /// heuristics to disambiguate the value vs. location status of the
+ /// expression. See PR21176 for more details.
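+  ///
+  /// For example, on DWARF 4 and later a constant value of 42 is typically
+  /// described as: DW_OP_constu 42, DW_OP_stack_value.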
+ void addStackValue();
+
+ /// Finalize an entry value by emitting its size operand, and committing the
+ /// DWARF block which has been emitted to the temporary buffer.
+ void finalizeEntryValue();
+
+ /// Cancel the emission of an entry value.
+ void cancelEntryValue();
+
+ ~DwarfExpression() = default;
+
+public:
+ DwarfExpression(unsigned DwarfVersion, DwarfCompileUnit &CU)
+ : CU(CU), SubRegisterSizeInBits(0), SubRegisterOffsetInBits(0),
+ LocationKind(Unknown), SavedLocationKind(Unknown),
+ LocationFlags(Unknown), DwarfVersion(DwarfVersion) {}
+
+ /// This needs to be called last to commit any pending changes.
+ void finalize();
+
+ /// Emit a signed constant.
+ void addSignedConstant(int64_t Value);
+
+ /// Emit an unsigned constant.
+ void addUnsignedConstant(uint64_t Value);
+
+ /// Emit an unsigned constant.
+ void addUnsignedConstant(const APInt &Value);
+
+  /// Emit a floating point constant.
+ void addConstantFP(const APFloat &Value, const AsmPrinter &AP);
+
+ /// Lock this down to become a memory location description.
+ void setMemoryLocationKind() {
+ assert(isUnknownLocation());
+ LocationKind = Memory;
+ }
+
+ /// Lock this down to become an entry value location.
+ void setEntryValueFlags(const MachineLocation &Loc);
+
+ /// Lock this down to become a call site parameter location.
+ void setCallSiteParamValueFlag() { LocationFlags |= CallSiteParamValue; }
+
+ /// Emit a machine register location. As an optimization this may also consume
+ /// the prefix of a DwarfExpression if a more efficient representation for
+ /// combining the register location and the first operation exists.
+ ///
+ /// \param FragmentOffsetInBits If this is one fragment out of a
+ /// fragmented
+ /// location, this is the offset of the
+ /// fragment inside the entire variable.
+ /// \return false if no DWARF register exists
+ /// for MachineReg.
+ bool addMachineRegExpression(const TargetRegisterInfo &TRI,
+ DIExpressionCursor &Expr,
+ llvm::Register MachineReg,
+ unsigned FragmentOffsetInBits = 0);
+
+ /// Begin emission of an entry value dwarf operation. The entry value's
+ /// first operand is the size of the DWARF block (its second operand),
+ /// which needs to be calculated at time of emission, so we don't emit
+ /// any operands here.
+ void beginEntryValueExpression(DIExpressionCursor &ExprCursor);
+
+ /// Return the index of a base type with the given properties and
+ /// create one if necessary.
+ unsigned getOrCreateBaseType(unsigned BitSize, dwarf::TypeKind Encoding);
+
+ /// Emit all remaining operations in the DIExpressionCursor. The
+ /// cursor must not contain any DW_OP_LLVM_arg operations.
+ void addExpression(DIExpressionCursor &&Expr);
+
+ /// Emit all remaining operations in the DIExpressionCursor.
+ /// DW_OP_LLVM_arg operations are resolved by calling (\p InsertArg).
+  ///
+ /// \return false if any call to (\p InsertArg) returns false.
+ bool addExpression(
+ DIExpressionCursor &&Expr,
+ llvm::function_ref<bool(unsigned, DIExpressionCursor &)> InsertArg);
+
+ /// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to
+ /// the fragment described by \c Expr.
+ void addFragmentOffset(const DIExpression *Expr);
+
+ void emitLegacySExt(unsigned FromBits);
+ void emitLegacyZExt(unsigned FromBits);
+
+ /// Emit location information expressed via WebAssembly location + offset
+ /// The Index is an identifier for locals, globals or operand stack.
+ void addWasmLocation(unsigned Index, uint64_t Offset);
+};
+
+/// DwarfExpression implementation for .debug_loc entries.
+class DebugLocDwarfExpression final : public DwarfExpression {
+
+ struct TempBuffer {
+ SmallString<32> Bytes;
+ std::vector<std::string> Comments;
+ BufferByteStreamer BS;
+
+ TempBuffer(bool GenerateComments) : BS(Bytes, Comments, GenerateComments) {}
+ };
+
+ std::unique_ptr<TempBuffer> TmpBuf;
+ BufferByteStreamer &OutBS;
+ bool IsBuffering = false;
+
+ /// Return the byte streamer that currently is being emitted to.
+ ByteStreamer &getActiveStreamer() { return IsBuffering ? TmpBuf->BS : OutBS; }
+
+ void emitOp(uint8_t Op, const char *Comment = nullptr) override;
+ void emitSigned(int64_t Value) override;
+ void emitUnsigned(uint64_t Value) override;
+ void emitData1(uint8_t Value) override;
+ void emitBaseTypeRef(uint64_t Idx) override;
+
+ void enableTemporaryBuffer() override;
+ void disableTemporaryBuffer() override;
+ unsigned getTemporaryBufferSize() override;
+ void commitTemporaryBuffer() override;
+
+ bool isFrameRegister(const TargetRegisterInfo &TRI,
+ llvm::Register MachineReg) override;
+
+public:
+ DebugLocDwarfExpression(unsigned DwarfVersion, BufferByteStreamer &BS,
+ DwarfCompileUnit &CU)
+ : DwarfExpression(DwarfVersion, CU), OutBS(BS) {}
+};
+
+/// DwarfExpression implementation for singular DW_AT_location.
+class DIEDwarfExpression final : public DwarfExpression {
+ const AsmPrinter &AP;
+ DIELoc &OutDIE;
+ DIELoc TmpDIE;
+ bool IsBuffering = false;
+
+ /// Return the DIE that currently is being emitted to.
+ DIELoc &getActiveDIE() { return IsBuffering ? TmpDIE : OutDIE; }
+
+ void emitOp(uint8_t Op, const char *Comment = nullptr) override;
+ void emitSigned(int64_t Value) override;
+ void emitUnsigned(uint64_t Value) override;
+ void emitData1(uint8_t Value) override;
+ void emitBaseTypeRef(uint64_t Idx) override;
+
+ void enableTemporaryBuffer() override;
+ void disableTemporaryBuffer() override;
+ unsigned getTemporaryBufferSize() override;
+ void commitTemporaryBuffer() override;
+
+ bool isFrameRegister(const TargetRegisterInfo &TRI,
+ llvm::Register MachineReg) override;
+
+public:
+ DIEDwarfExpression(const AsmPrinter &AP, DwarfCompileUnit &CU, DIELoc &DIE);
+
+ DIELoc *finalize() {
+ DwarfExpression::finalize();
+ return &OutDIE;
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
new file mode 100644
index 000000000000..3fe437a07c92
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -0,0 +1,132 @@
+//===- llvm/CodeGen/DwarfFile.cpp - Dwarf Debug Framework -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfFile.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "DwarfUnit.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/MC/MCStreamer.h"
+#include <cstdint>
+
+using namespace llvm;
+
+DwarfFile::DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA)
+ : Asm(AP), Abbrevs(AbbrevAllocator), StrPool(DA, *Asm, Pref) {}
+
+void DwarfFile::addUnit(std::unique_ptr<DwarfCompileUnit> U) {
+ CUs.push_back(std::move(U));
+}
+
+// Emit each of the DWARF units to its assigned section, with the
+// abbreviations emitted separately via emitAbbrevs().
+void DwarfFile::emitUnits(bool UseOffsets) {
+ for (const auto &TheU : CUs)
+ emitUnit(TheU.get(), UseOffsets);
+}
+
+void DwarfFile::emitUnit(DwarfUnit *TheU, bool UseOffsets) {
+ if (TheU->getCUNode()->isDebugDirectivesOnly())
+ return;
+
+ MCSection *S = TheU->getSection();
+
+ if (!S)
+ return;
+
+ // Skip CUs that ended up not being needed (split CUs that were abandoned
+ // because they added no information beyond the non-split CU)
+ if (TheU->getUnitDie().values().empty())
+ return;
+
+ Asm->OutStreamer->switchSection(S);
+ TheU->emitHeader(UseOffsets);
+ Asm->emitDwarfDIE(TheU->getUnitDie());
+
+ if (MCSymbol *EndLabel = TheU->getEndLabel())
+ Asm->OutStreamer->emitLabel(EndLabel);
+}
+
+// Compute the size and offset for each DIE.
+void DwarfFile::computeSizeAndOffsets() {
+ // Offset from the first CU in the debug info section is 0 initially.
+ uint64_t SecOffset = 0;
+
+ // Iterate over each compile unit and set the size and offsets for each
+ // DIE within each compile unit. All offsets are CU relative.
+ for (const auto &TheU : CUs) {
+ if (TheU->getCUNode()->isDebugDirectivesOnly())
+ continue;
+
+ // Skip CUs that ended up not being needed (split CUs that were abandoned
+ // because they added no information beyond the non-split CU)
+ if (TheU->getUnitDie().values().empty())
+ return;
+
+ TheU->setDebugSectionOffset(SecOffset);
+ SecOffset += computeSizeAndOffsetsForUnit(TheU.get());
+ }
+ if (SecOffset > UINT32_MAX && !Asm->isDwarf64())
+ report_fatal_error("The generated debug information is too large "
+ "for the 32-bit DWARF format.");
+}
+
+unsigned DwarfFile::computeSizeAndOffsetsForUnit(DwarfUnit *TheU) {
+ // CU-relative offset is reset to 0 here.
+ unsigned Offset = Asm->getUnitLengthFieldByteSize() + // Length of Unit Info
+ TheU->getHeaderSize(); // Unit-specific headers
+
+ // The return value here is CU-relative, after laying out
+ // all of the CU DIE.
+ return computeSizeAndOffset(TheU->getUnitDie(), Offset);
+}
+
+// Compute the size and offset of a DIE. The offset is relative to start of the
+// CU. It returns the offset after laying out the DIE.
+unsigned DwarfFile::computeSizeAndOffset(DIE &Die, unsigned Offset) {
+ return Die.computeOffsetsAndAbbrevs(Asm->getDwarfFormParams(), Abbrevs,
+ Offset);
+}
+
+void DwarfFile::emitAbbrevs(MCSection *Section) { Abbrevs.Emit(Asm, Section); }
+
+// Emit strings into a string section.
+void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection,
+ bool UseRelativeOffsets) {
+ StrPool.emit(*Asm, StrSection, OffsetSection, UseRelativeOffsets);
+}
+
+bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
+ auto &ScopeVars = ScopeVariables[LS];
+ const DILocalVariable *DV = Var->getVariable();
+ if (unsigned ArgNum = DV->getArg()) {
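+    // Formal parameters are keyed by argument number; a second DbgVariable
+    // for the same argument is merged into the cached entry rather than added
+    // again, and the false return value tells the caller about the merge.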
+ auto Cached = ScopeVars.Args.find(ArgNum);
+ if (Cached == ScopeVars.Args.end())
+ ScopeVars.Args[ArgNum] = Var;
+ else {
+ Cached->second->addMMIEntry(*Var);
+ return false;
+ }
+ } else {
+ ScopeVars.Locals.push_back(Var);
+ }
+ return true;
+}
+
+void DwarfFile::addScopeLabel(LexicalScope *LS, DbgLabel *Label) {
+ SmallVectorImpl<DbgLabel *> &Labels = ScopeLabels[LS];
+ Labels.push_back(Label);
+}
+
+std::pair<uint32_t, RangeSpanList *>
+DwarfFile::addRange(const DwarfCompileUnit &CU, SmallVector<RangeSpan, 2> R) {
+ CURangeLists.push_back(
+ RangeSpanList{Asm->createTempSymbol("debug_ranges"), &CU, std::move(R)});
+ return std::make_pair(CURangeLists.size() - 1, &CURangeLists.back());
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
new file mode 100644
index 000000000000..464f4f048016
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -0,0 +1,185 @@
+//===- llvm/CodeGen/DwarfFile.h - Dwarf Debug Framework ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
+
+#include "DwarfStringPool.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/Support/Allocator.h"
+#include <map>
+#include <memory>
+#include <utility>
+
+namespace llvm {
+
+class AsmPrinter;
+class DbgEntity;
+class DbgVariable;
+class DbgLabel;
+class DINode;
+class DILocalScope;
+class DwarfCompileUnit;
+class DwarfUnit;
+class LexicalScope;
+class MCSection;
+class MDNode;
+
+// Data structure to hold a range for range lists.
+struct RangeSpan {
+ const MCSymbol *Begin;
+ const MCSymbol *End;
+};
+
+struct RangeSpanList {
+  // Label used to locate this particular span within the debug_range section.
+ MCSymbol *Label;
+ const DwarfCompileUnit *CU;
+ // List of ranges.
+ SmallVector<RangeSpan, 2> Ranges;
+};
+
+class DwarfFile {
+ // Target of Dwarf emission, used for sizing of abbreviations.
+ AsmPrinter *Asm;
+
+ BumpPtrAllocator AbbrevAllocator;
+
+ // Used to uniquely define abbreviations.
+ DIEAbbrevSet Abbrevs;
+
+ // A pointer to all units in the section.
+ SmallVector<std::unique_ptr<DwarfCompileUnit>, 1> CUs;
+
+ DwarfStringPool StrPool;
+
+ // List of range lists for a given compile unit, separate from the ranges for
+ // the CU itself.
+ SmallVector<RangeSpanList, 1> CURangeLists;
+
+ /// DWARF v5: The symbol that designates the start of the contribution to
+ /// the string offsets table. The contribution is shared by all units.
+ MCSymbol *StringOffsetsStartSym = nullptr;
+
+ /// DWARF v5: The symbol that designates the base of the range list table.
+ /// The table is shared by all units.
+ MCSymbol *RnglistsTableBaseSym = nullptr;
+
+ /// The variables of a lexical scope.
+ struct ScopeVars {
+ /// We need to sort Args by ArgNo and check for duplicates. This could also
+ /// be implemented as a list or vector + std::lower_bound().
+ std::map<unsigned, DbgVariable *> Args;
+ SmallVector<DbgVariable *, 8> Locals;
+ };
+ /// Collection of DbgVariables of each lexical scope.
+ DenseMap<LexicalScope *, ScopeVars> ScopeVariables;
+
+ /// Collection of DbgLabels of each lexical scope.
+ using LabelList = SmallVector<DbgLabel *, 4>;
+ DenseMap<LexicalScope *, LabelList> ScopeLabels;
+
+ // Collection of abstract subprogram DIEs.
+ DenseMap<const DILocalScope *, DIE *> AbstractLocalScopeDIEs;
+ DenseMap<const DINode *, std::unique_ptr<DbgEntity>> AbstractEntities;
+
+ /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
+ /// be shared across CUs, that is why we keep the map here instead
+ /// of in DwarfCompileUnit.
+ DenseMap<const MDNode *, DIE *> DITypeNodeToDieMap;
+
+public:
+ DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA);
+
+ const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
+ return CUs;
+ }
+
+ std::pair<uint32_t, RangeSpanList *> addRange(const DwarfCompileUnit &CU,
+ SmallVector<RangeSpan, 2> R);
+
+ /// getRangeLists - Get the vector of range lists.
+ const SmallVectorImpl<RangeSpanList> &getRangeLists() const {
+ return CURangeLists;
+ }
+
+ /// Compute the size and offset of a DIE given an incoming Offset.
+ unsigned computeSizeAndOffset(DIE &Die, unsigned Offset);
+
+ /// Compute the size and offset of all the DIEs.
+ void computeSizeAndOffsets();
+
+ /// Compute the size and offset of all the DIEs in the given unit.
+ /// \returns The size of the root DIE.
+ unsigned computeSizeAndOffsetsForUnit(DwarfUnit *TheU);
+
+ /// Add a unit to the list of CUs.
+ void addUnit(std::unique_ptr<DwarfCompileUnit> U);
+
+ /// Emit all of the units to the section listed with the given
+ /// abbreviation section.
+ void emitUnits(bool UseOffsets);
+
+ /// Emit the given unit to its section.
+ void emitUnit(DwarfUnit *TheU, bool UseOffsets);
+
+ /// Emit a set of abbreviations to the specific section.
+ void emitAbbrevs(MCSection *);
+
+ /// Emit all of the strings to the section given. If OffsetSection is
+ /// non-null, emit a table of string offsets to it. If UseRelativeOffsets
+ /// is false, emit absolute offsets to the strings. Otherwise, emit
+ /// relocatable references to the strings if they are supported by the target.
+ void emitStrings(MCSection *StrSection, MCSection *OffsetSection = nullptr,
+ bool UseRelativeOffsets = false);
+
+ /// Returns the string pool.
+ DwarfStringPool &getStringPool() { return StrPool; }
+
+ MCSymbol *getStringOffsetsStartSym() const { return StringOffsetsStartSym; }
+ void setStringOffsetsStartSym(MCSymbol *Sym) { StringOffsetsStartSym = Sym; }
+
+ MCSymbol *getRnglistsTableBaseSym() const { return RnglistsTableBaseSym; }
+ void setRnglistsTableBaseSym(MCSymbol *Sym) { RnglistsTableBaseSym = Sym; }
+
+ /// \returns false if the variable was merged with a previous one.
+ bool addScopeVariable(LexicalScope *LS, DbgVariable *Var);
+
+ void addScopeLabel(LexicalScope *LS, DbgLabel *Label);
+
+ DenseMap<LexicalScope *, ScopeVars> &getScopeVariables() {
+ return ScopeVariables;
+ }
+
+ DenseMap<LexicalScope *, LabelList> &getScopeLabels() {
+ return ScopeLabels;
+ }
+
+ DenseMap<const DILocalScope *, DIE *> &getAbstractScopeDIEs() {
+ return AbstractLocalScopeDIEs;
+ }
+
+ DenseMap<const DINode *, std::unique_ptr<DbgEntity>> &getAbstractEntities() {
+ return AbstractEntities;
+ }
+
+ void insertDIE(const MDNode *TypeMD, DIE *Die) {
+ DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
+ }
+
+ DIE *getDIE(const MDNode *TypeMD) {
+ return DITypeNodeToDieMap.lookup(TypeMD);
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
new file mode 100644
index 000000000000..2292590b135e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -0,0 +1,128 @@
+//===- llvm/CodeGen/DwarfStringPool.cpp - Dwarf Debug Framework -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfStringPool.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+DwarfStringPool::DwarfStringPool(BumpPtrAllocator &A, AsmPrinter &Asm,
+ StringRef Prefix)
+ : Pool(A), Prefix(Prefix),
+ ShouldCreateSymbols(Asm.doesDwarfUseRelocationsAcrossSections()) {}
+
+StringMapEntry<DwarfStringPool::EntryTy> &
+DwarfStringPool::getEntryImpl(AsmPrinter &Asm, StringRef Str) {
+ auto I = Pool.insert(std::make_pair(Str, EntryTy()));
+ auto &Entry = I.first->second;
+ if (I.second) {
+ Entry.Index = EntryTy::NotIndexed;
+ Entry.Offset = NumBytes;
+ Entry.Symbol = ShouldCreateSymbols ? Asm.createTempSymbol(Prefix) : nullptr;
+
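+    // Advance past the string and its terminating NUL so the next entry's
+    // Offset points at the start of the following string.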
+ NumBytes += Str.size() + 1;
+ }
+ return *I.first;
+}
+
+DwarfStringPool::EntryRef DwarfStringPool::getEntry(AsmPrinter &Asm,
+ StringRef Str) {
+ auto &MapEntry = getEntryImpl(Asm, Str);
+ return EntryRef(MapEntry);
+}
+
+DwarfStringPool::EntryRef DwarfStringPool::getIndexedEntry(AsmPrinter &Asm,
+ StringRef Str) {
+ auto &MapEntry = getEntryImpl(Asm, Str);
+ if (!MapEntry.getValue().isIndexed())
+ MapEntry.getValue().Index = NumIndexedStrings++;
+ return EntryRef(MapEntry);
+}
+
+void DwarfStringPool::emitStringOffsetsTableHeader(AsmPrinter &Asm,
+ MCSection *Section,
+ MCSymbol *StartSym) {
+ if (getNumIndexedStrings() == 0)
+ return;
+ Asm.OutStreamer->switchSection(Section);
+ unsigned EntrySize = Asm.getDwarfOffsetByteSize();
+ // We are emitting the header for a contribution to the string offsets
+ // table. The header consists of an entry with the contribution's
+ // size (not including the size of the length field), the DWARF version and
+ // 2 bytes of padding.
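+  // For example, a 32-bit DWARF contribution (EntrySize == 4) with N indexed
+  // strings emits a length of N * 4 + 4 below; the extra 4 bytes cover the
+  // version and padding fields, which are counted by the length.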
+ Asm.emitDwarfUnitLength(getNumIndexedStrings() * EntrySize + 4,
+ "Length of String Offsets Set");
+ Asm.emitInt16(Asm.getDwarfVersion());
+ Asm.emitInt16(0);
+ // Define the symbol that marks the start of the contribution. It is
+ // referenced by most unit headers via DW_AT_str_offsets_base.
+ // Split units do not use the attribute.
+ if (StartSym)
+ Asm.OutStreamer->emitLabel(StartSym);
+}
+
+void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
+ MCSection *OffsetSection, bool UseRelativeOffsets) {
+ if (Pool.empty())
+ return;
+
+ // Start the dwarf str section.
+ Asm.OutStreamer->switchSection(StrSection);
+
+ // Get all of the string pool entries and sort them by their offset.
+ SmallVector<const StringMapEntry<EntryTy> *, 64> Entries;
+ Entries.reserve(Pool.size());
+
+ for (const auto &E : Pool)
+ Entries.push_back(&E);
+
+ llvm::sort(Entries, [](const StringMapEntry<EntryTy> *A,
+ const StringMapEntry<EntryTy> *B) {
+ return A->getValue().Offset < B->getValue().Offset;
+ });
+
+ for (const auto &Entry : Entries) {
+ assert(ShouldCreateSymbols == static_cast<bool>(Entry->getValue().Symbol) &&
+ "Mismatch between setting and entry");
+
+ // Emit a label for reference from debug information entries.
+ if (ShouldCreateSymbols)
+ Asm.OutStreamer->emitLabel(Entry->getValue().Symbol);
+
+ // Emit the string itself with a terminating null byte.
+ Asm.OutStreamer->AddComment("string offset=" +
+ Twine(Entry->getValue().Offset));
+ Asm.OutStreamer->emitBytes(
+ StringRef(Entry->getKeyData(), Entry->getKeyLength() + 1));
+ }
+
+ // If we've got an offset section go ahead and emit that now as well.
+ if (OffsetSection) {
+ // Now only take the indexed entries and put them in an array by their ID so
+ // we can emit them in order.
+ Entries.resize(NumIndexedStrings);
+ for (const auto &Entry : Pool) {
+ if (Entry.getValue().isIndexed())
+ Entries[Entry.getValue().Index] = &Entry;
+ }
+
+ Asm.OutStreamer->switchSection(OffsetSection);
+ unsigned size = Asm.getDwarfOffsetByteSize();
+ for (const auto &Entry : Entries)
+ if (UseRelativeOffsets)
+ Asm.emitDwarfStringOffset(Entry->getValue());
+ else
+ Asm.OutStreamer->emitIntValue(Entry->getValue().Offset, size);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h
new file mode 100644
index 000000000000..79b5df89e338
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h
@@ -0,0 +1,66 @@
+//===- llvm/CodeGen/DwarfStringPool.h - Dwarf Debug Framework ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/DwarfStringPoolEntry.h"
+#include "llvm/Support/Allocator.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class MCSection;
+class MCSymbol;
+
+// Collection of strings for this unit, along with assorted symbols: a
+// String->Symbol mapping for strings that are referenced indirectly.
+class DwarfStringPool {
+ using EntryTy = DwarfStringPoolEntry;
+
+ StringMap<EntryTy, BumpPtrAllocator &> Pool;
+ StringRef Prefix;
+ uint64_t NumBytes = 0;
+ unsigned NumIndexedStrings = 0;
+ bool ShouldCreateSymbols;
+
+ StringMapEntry<EntryTy> &getEntryImpl(AsmPrinter &Asm, StringRef Str);
+
+public:
+ using EntryRef = DwarfStringPoolEntryRef;
+
+ DwarfStringPool(BumpPtrAllocator &A, AsmPrinter &Asm, StringRef Prefix);
+
+ void emitStringOffsetsTableHeader(AsmPrinter &Asm, MCSection *OffsetSection,
+ MCSymbol *StartSym);
+
+ void emit(AsmPrinter &Asm, MCSection *StrSection,
+ MCSection *OffsetSection = nullptr,
+ bool UseRelativeOffsets = false);
+
+ bool empty() const { return Pool.empty(); }
+
+ unsigned size() const { return Pool.size(); }
+
+ unsigned getNumIndexedStrings() const { return NumIndexedStrings; }
+
+ /// Get a reference to an entry in the string pool.
+ EntryRef getEntry(AsmPrinter &Asm, StringRef Str);
+
+ /// Same as getEntry, except that you can use EntryRef::getIndex to obtain a
+ /// unique ID of this entry (e.g., for use in indexed forms like
+ /// DW_FORM_strx).
+ EntryRef getIndexedEntry(AsmPrinter &Asm, StringRef Str);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
new file mode 100644
index 000000000000..d30f0ef7af34
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -0,0 +1,1851 @@
+//===-- llvm/CodeGen/DwarfUnit.cpp - Dwarf Type and Compile Units ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing a dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfUnit.h"
+#include "AddressPool.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfExpression.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP,
+ DwarfCompileUnit &CU, DIELoc &DIE)
+ : DwarfExpression(AP.getDwarfVersion(), CU), AP(AP), OutDIE(DIE) {}
+
+void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) {
+ CU.addUInt(getActiveDIE(), dwarf::DW_FORM_data1, Op);
+}
+
+void DIEDwarfExpression::emitSigned(int64_t Value) {
+ CU.addSInt(getActiveDIE(), dwarf::DW_FORM_sdata, Value);
+}
+
+void DIEDwarfExpression::emitUnsigned(uint64_t Value) {
+ CU.addUInt(getActiveDIE(), dwarf::DW_FORM_udata, Value);
+}
+
+void DIEDwarfExpression::emitData1(uint8_t Value) {
+ CU.addUInt(getActiveDIE(), dwarf::DW_FORM_data1, Value);
+}
+
+void DIEDwarfExpression::emitBaseTypeRef(uint64_t Idx) {
+ CU.addBaseTypeRef(getActiveDIE(), Idx);
+}
+
+void DIEDwarfExpression::enableTemporaryBuffer() {
+ assert(!IsBuffering && "Already buffering?");
+ IsBuffering = true;
+}
+
+void DIEDwarfExpression::disableTemporaryBuffer() { IsBuffering = false; }
+
+unsigned DIEDwarfExpression::getTemporaryBufferSize() {
+ return TmpDIE.computeSize(AP.getDwarfFormParams());
+}
+
+void DIEDwarfExpression::commitTemporaryBuffer() { OutDIE.takeValues(TmpDIE); }
+
+bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
+ llvm::Register MachineReg) {
+ return MachineReg == TRI.getFrameRegister(*AP.MF);
+}
+
+DwarfUnit::DwarfUnit(dwarf::Tag UnitTag, const DICompileUnit *Node,
+ AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU)
+ : DIEUnit(UnitTag), CUNode(Node), Asm(A), DD(DW), DU(DWU) {}
+
+DwarfTypeUnit::DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A,
+ DwarfDebug *DW, DwarfFile *DWU,
+ MCDwarfDwoLineTable *SplitLineTable)
+ : DwarfUnit(dwarf::DW_TAG_type_unit, CU.getCUNode(), A, DW, DWU), CU(CU),
+ SplitLineTable(SplitLineTable) {
+}
+
+DwarfUnit::~DwarfUnit() {
+ for (DIEBlock *B : DIEBlocks)
+ B->~DIEBlock();
+ for (DIELoc *L : DIELocs)
+ L->~DIELoc();
+}
+
+int64_t DwarfUnit::getDefaultLowerBound() const {
+ switch (getLanguage()) {
+ default:
+ break;
+
+ // The languages below have valid values in all DWARF versions.
+ case dwarf::DW_LANG_C:
+ case dwarf::DW_LANG_C89:
+ case dwarf::DW_LANG_C_plus_plus:
+ return 0;
+
+ case dwarf::DW_LANG_Fortran77:
+ case dwarf::DW_LANG_Fortran90:
+ return 1;
+
+ // The languages below have valid values only if the DWARF version >= 3.
+ case dwarf::DW_LANG_C99:
+ case dwarf::DW_LANG_ObjC:
+ case dwarf::DW_LANG_ObjC_plus_plus:
+ if (DD->getDwarfVersion() >= 3)
+ return 0;
+ break;
+
+ case dwarf::DW_LANG_Fortran95:
+ if (DD->getDwarfVersion() >= 3)
+ return 1;
+ break;
+
+ // Starting with DWARF v4, all defined languages have valid values.
+ case dwarf::DW_LANG_D:
+ case dwarf::DW_LANG_Java:
+ case dwarf::DW_LANG_Python:
+ case dwarf::DW_LANG_UPC:
+ if (DD->getDwarfVersion() >= 4)
+ return 0;
+ break;
+
+ case dwarf::DW_LANG_Ada83:
+ case dwarf::DW_LANG_Ada95:
+ case dwarf::DW_LANG_Cobol74:
+ case dwarf::DW_LANG_Cobol85:
+ case dwarf::DW_LANG_Modula2:
+ case dwarf::DW_LANG_Pascal83:
+ case dwarf::DW_LANG_PLI:
+ if (DD->getDwarfVersion() >= 4)
+ return 1;
+ break;
+
+ // The languages below are new in DWARF v5.
+ case dwarf::DW_LANG_BLISS:
+ case dwarf::DW_LANG_C11:
+ case dwarf::DW_LANG_C_plus_plus_03:
+ case dwarf::DW_LANG_C_plus_plus_11:
+ case dwarf::DW_LANG_C_plus_plus_14:
+ case dwarf::DW_LANG_Dylan:
+ case dwarf::DW_LANG_Go:
+ case dwarf::DW_LANG_Haskell:
+ case dwarf::DW_LANG_OCaml:
+ case dwarf::DW_LANG_OpenCL:
+ case dwarf::DW_LANG_RenderScript:
+ case dwarf::DW_LANG_Rust:
+ case dwarf::DW_LANG_Swift:
+ if (DD->getDwarfVersion() >= 5)
+ return 0;
+ break;
+
+ case dwarf::DW_LANG_Fortran03:
+ case dwarf::DW_LANG_Fortran08:
+ case dwarf::DW_LANG_Julia:
+ case dwarf::DW_LANG_Modula3:
+ if (DD->getDwarfVersion() >= 5)
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+/// Check whether the DIE for this MDNode can be shared across CUs.
+bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
+ // When the MDNode can be part of the type system, the DIE can be shared
+ // across CUs.
+  // Combining type units and cross-CU DIE sharing is of limited value (since
+  // cross-CU DIE sharing is used in LTO, which already removes type redundancy
+  // at that level), but it may still be worth implementing for projects that
+  // build multiple independent libraries with LTO and then link them together.
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return false;
+ return (isa<DIType>(D) ||
+ (isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
+ !DD->generateTypeUnits();
+}
+
+DIE *DwarfUnit::getDIE(const DINode *D) const {
+ if (isShareableAcrossCUs(D))
+ return DU->getDIE(D);
+ return MDNodeToDieMap.lookup(D);
+}
+
+void DwarfUnit::insertDIE(const DINode *Desc, DIE *D) {
+ if (isShareableAcrossCUs(Desc)) {
+ DU->insertDIE(Desc, D);
+ return;
+ }
+ MDNodeToDieMap.insert(std::make_pair(Desc, D));
+}
+
+void DwarfUnit::insertDIE(DIE *D) {
+ MDNodeToDieMap.insert(std::make_pair(nullptr, D));
+}
+
+void DwarfUnit::addFlag(DIE &Die, dwarf::Attribute Attribute) {
+ if (DD->getDwarfVersion() >= 4)
+ addAttribute(Die, Attribute, dwarf::DW_FORM_flag_present, DIEInteger(1));
+ else
+ addAttribute(Die, Attribute, dwarf::DW_FORM_flag, DIEInteger(1));
+}
+
+void DwarfUnit::addUInt(DIEValueList &Die, dwarf::Attribute Attribute,
+ std::optional<dwarf::Form> Form, uint64_t Integer) {
+ if (!Form)
+ Form = DIEInteger::BestForm(false, Integer);
+ assert(Form != dwarf::DW_FORM_implicit_const &&
+ "DW_FORM_implicit_const is used only for signed integers");
+ addAttribute(Die, Attribute, *Form, DIEInteger(Integer));
+}
+
+void DwarfUnit::addUInt(DIEValueList &Block, dwarf::Form Form,
+ uint64_t Integer) {
+ addUInt(Block, (dwarf::Attribute)0, Form, Integer);
+}
+
+void DwarfUnit::addSInt(DIEValueList &Die, dwarf::Attribute Attribute,
+ std::optional<dwarf::Form> Form, int64_t Integer) {
+ if (!Form)
+ Form = DIEInteger::BestForm(true, Integer);
+ addAttribute(Die, Attribute, *Form, DIEInteger(Integer));
+}
+
+void DwarfUnit::addSInt(DIELoc &Die, std::optional<dwarf::Form> Form,
+ int64_t Integer) {
+ addSInt(Die, (dwarf::Attribute)0, Form, Integer);
+}
+
+void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute,
+ StringRef String) {
+ if (CUNode->isDebugDirectivesOnly())
+ return;
+
+ if (DD->useInlineStrings()) {
+ addAttribute(Die, Attribute, dwarf::DW_FORM_string,
+ new (DIEValueAllocator)
+ DIEInlineString(String, DIEValueAllocator));
+ return;
+ }
+ dwarf::Form IxForm =
+ isDwoUnit() ? dwarf::DW_FORM_GNU_str_index : dwarf::DW_FORM_strp;
+
+ auto StringPoolEntry =
+ useSegmentedStringOffsetsTable() || IxForm == dwarf::DW_FORM_GNU_str_index
+ ? DU->getStringPool().getIndexedEntry(*Asm, String)
+ : DU->getStringPool().getEntry(*Asm, String);
+
+ // For DWARF v5 and beyond, use the smallest strx? form possible.
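+  // Per the checks below: an index up to 0xff fits DW_FORM_strx1, up to
+  // 0xffff DW_FORM_strx2, up to 0xffffff DW_FORM_strx3, and anything larger
+  // falls back to DW_FORM_strx4.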
+ if (useSegmentedStringOffsetsTable()) {
+ IxForm = dwarf::DW_FORM_strx1;
+ unsigned Index = StringPoolEntry.getIndex();
+ if (Index > 0xffffff)
+ IxForm = dwarf::DW_FORM_strx4;
+ else if (Index > 0xffff)
+ IxForm = dwarf::DW_FORM_strx3;
+ else if (Index > 0xff)
+ IxForm = dwarf::DW_FORM_strx2;
+ }
+ addAttribute(Die, Attribute, IxForm, DIEString(StringPoolEntry));
+}
+
+void DwarfUnit::addLabel(DIEValueList &Die, dwarf::Attribute Attribute,
+ dwarf::Form Form, const MCSymbol *Label) {
+ addAttribute(Die, Attribute, Form, DIELabel(Label));
+}
+
+void DwarfUnit::addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label) {
+ addLabel(Die, (dwarf::Attribute)0, Form, Label);
+}
+
+void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute,
+ uint64_t Integer) {
+ addUInt(Die, Attribute, DD->getDwarfSectionOffsetForm(), Integer);
+}
+
+unsigned DwarfTypeUnit::getOrCreateSourceID(const DIFile *File) {
+ if (!SplitLineTable)
+ return getCU().getOrCreateSourceID(File);
+ if (!UsedLineTable) {
+ UsedLineTable = true;
+ // This is a split type unit that needs a line table.
+ addSectionOffset(getUnitDie(), dwarf::DW_AT_stmt_list, 0);
+ }
+ return SplitLineTable->getFile(
+ File->getDirectory(), File->getFilename(), DD->getMD5AsBytes(File),
+ Asm->OutContext.getDwarfVersion(), File->getSource());
+}
+
+void DwarfUnit::addPoolOpAddress(DIEValueList &Die, const MCSymbol *Label) {
+ bool UseAddrOffsetFormOrExpressions =
+ DD->useAddrOffsetForm() || DD->useAddrOffsetExpressions();
+
+ const MCSymbol *Base = nullptr;
+ if (Label->isInSection() && UseAddrOffsetFormOrExpressions)
+ Base = DD->getSectionLabel(&Label->getSection());
+
+ uint32_t Index = DD->getAddressPool().getIndex(Base ? Base : Label);
+
+ if (DD->getDwarfVersion() >= 5) {
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addrx);
+ addUInt(Die, dwarf::DW_FORM_addrx, Index);
+ } else {
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
+ addUInt(Die, dwarf::DW_FORM_GNU_addr_index, Index);
+ }
+
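+  // When a section base is used, the location computed below is effectively
+  // addrx(Base) + (Label - Base), so many labels within one section can share
+  // a single address-pool entry.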
+ if (Base && Base != Label) {
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_const4u);
+ addLabelDelta(Die, (dwarf::Attribute)0, Label, Base);
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ }
+}
+
+void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) {
+ if (DD->getDwarfVersion() >= 5) {
+ addPoolOpAddress(Die, Sym);
+ return;
+ }
+
+ if (DD->useSplitDwarf()) {
+ addPoolOpAddress(Die, Sym);
+ return;
+ }
+
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+ addLabel(Die, dwarf::DW_FORM_addr, Sym);
+}
+
+void DwarfUnit::addLabelDelta(DIEValueList &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Hi, const MCSymbol *Lo) {
+ addAttribute(Die, Attribute, dwarf::DW_FORM_data4,
+ new (DIEValueAllocator) DIEDelta(Hi, Lo));
+}
+
+void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry) {
+ addDIEEntry(Die, Attribute, DIEEntry(Entry));
+}
+
+void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) {
+ // Flag the type unit reference as a declaration so that if it contains
+ // members (implicit special members, static data member definitions, member
+ // declarations for definitions in this CU, etc) consumers don't get confused
+ // and think this is a full definition.
+ addFlag(Die, dwarf::DW_AT_declaration);
+
+ addAttribute(Die, dwarf::DW_AT_signature, dwarf::DW_FORM_ref_sig8,
+ DIEInteger(Signature));
+}
+
+void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute,
+ DIEEntry Entry) {
+ const DIEUnit *CU = Die.getUnit();
+ const DIEUnit *EntryCU = Entry.getEntry().getUnit();
+ if (!CU)
+    // We assume that Die belongs to this CU if it is not linked to any CU yet.
+ CU = getUnitDie().getUnit();
+ if (!EntryCU)
+ EntryCU = getUnitDie().getUnit();
+ assert(EntryCU == CU || !DD->useSplitDwarf() || DD->shareAcrossDWOCUs() ||
+ !static_cast<const DwarfUnit*>(CU)->isDwoUnit());
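+  // A reference within the same unit can use the compact, unit-relative
+  // DW_FORM_ref4; a reference into a different unit needs the section-relative
+  // DW_FORM_ref_addr.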
+ addAttribute(Die, Attribute,
+ EntryCU == CU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr,
+ Entry);
+}
+
+DIE &DwarfUnit::createAndAddDIE(dwarf::Tag Tag, DIE &Parent, const DINode *N) {
+ DIE &Die = Parent.addChild(DIE::get(DIEValueAllocator, Tag));
+ if (N)
+ insertDIE(N, &Die);
+ return Die;
+}
+
+void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Loc) {
+ Loc->computeSize(Asm->getDwarfFormParams());
+ DIELocs.push_back(Loc); // Memoize so we can call the destructor later on.
+ addAttribute(Die, Attribute, Loc->BestForm(DD->getDwarfVersion()), Loc);
+}
+
+void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form,
+ DIEBlock *Block) {
+ Block->computeSize(Asm->getDwarfFormParams());
+ DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
+ addAttribute(Die, Attribute, Form, Block);
+}
+
+void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute,
+ DIEBlock *Block) {
+ addBlock(Die, Attribute, Block->BestForm(), Block);
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, const DIFile *File) {
+ if (Line == 0)
+ return;
+
+ unsigned FileID = getOrCreateSourceID(File);
+ addUInt(Die, dwarf::DW_AT_decl_file, std::nullopt, FileID);
+ addUInt(Die, dwarf::DW_AT_decl_line, std::nullopt, Line);
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DILocalVariable *V) {
+ assert(V);
+
+ addSourceLine(Die, V->getLine(), V->getFile());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DIGlobalVariable *G) {
+ assert(G);
+
+ addSourceLine(Die, G->getLine(), G->getFile());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DISubprogram *SP) {
+ assert(SP);
+
+ addSourceLine(Die, SP->getLine(), SP->getFile());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DILabel *L) {
+ assert(L);
+
+ addSourceLine(Die, L->getLine(), L->getFile());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DIType *Ty) {
+ assert(Ty);
+
+ addSourceLine(Die, Ty->getLine(), Ty->getFile());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DIObjCProperty *Ty) {
+ assert(Ty);
+
+ addSourceLine(Die, Ty->getLine(), Ty->getFile());
+}
+
+void DwarfUnit::addConstantFPValue(DIE &Die, const ConstantFP *CFP) {
+ // Pass this down to addConstantValue as an unsigned bag of bits.
+ addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), true);
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, const ConstantInt *CI,
+ const DIType *Ty) {
+ addConstantValue(Die, CI->getValue(), Ty);
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, uint64_t Val, const DIType *Ty) {
+ addConstantValue(Die, DD->isUnsignedDIType(Ty), Val);
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, bool Unsigned, uint64_t Val) {
+  // FIXME: This is a bit conservative/simple - it always emits negative values
+  // sign-extended to 64 bits rather than minimizing the number of bytes.
+ addUInt(Die, dwarf::DW_AT_const_value,
+ Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata, Val);
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, const DIType *Ty) {
+ addConstantValue(Die, Val, DD->isUnsignedDIType(Ty));
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, bool Unsigned) {
+ unsigned CIBitWidth = Val.getBitWidth();
+ if (CIBitWidth <= 64) {
+ addConstantValue(Die, Unsigned,
+ Unsigned ? Val.getZExtValue() : Val.getSExtValue());
+ return;
+ }
+
+ DIEBlock *Block = new (DIEValueAllocator) DIEBlock;
+
+ // Get the raw data form of the large APInt.
+ const uint64_t *Ptr64 = Val.getRawData();
+
+ int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
+ bool LittleEndian = Asm->getDataLayout().isLittleEndian();
+
+ // Output the constant to DWARF one byte at a time.
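+  // The raw APInt words are 64 bits wide, so byte i comes from word i / 8,
+  // shifted right by 8 * (i % 8) bits; big-endian targets walk the bytes in
+  // reverse so the block matches the target's memory order.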
+ for (int i = 0; i < NumBytes; i++) {
+ uint8_t c;
+ if (LittleEndian)
+ c = Ptr64[i / 8] >> (8 * (i & 7));
+ else
+ c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
+ addUInt(*Block, dwarf::DW_FORM_data1, c);
+ }
+
+ addBlock(Die, dwarf::DW_AT_const_value, Block);
+}
+
+void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
+ if (!LinkageName.empty())
+ addString(Die,
+ DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
+ : dwarf::DW_AT_MIPS_linkage_name,
+ GlobalValue::dropLLVMManglingEscape(LinkageName));
+}
+
+void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) {
+ // Add template parameters.
+ for (const auto *Element : TParams) {
+ if (auto *TTP = dyn_cast<DITemplateTypeParameter>(Element))
+ constructTemplateTypeParameterDIE(Buffer, TTP);
+ else if (auto *TVP = dyn_cast<DITemplateValueParameter>(Element))
+ constructTemplateValueParameterDIE(Buffer, TVP);
+ }
+}
+
+/// Add thrown types.
+void DwarfUnit::addThrownTypes(DIE &Die, DINodeArray ThrownTypes) {
+ for (const auto *Ty : ThrownTypes) {
+ DIE &TT = createAndAddDIE(dwarf::DW_TAG_thrown_type, Die);
+ addType(TT, cast<DIType>(Ty));
+ }
+}
+
+void DwarfUnit::addAccess(DIE &Die, DINode::DIFlags Flags) {
+ if ((Flags & DINode::FlagAccessibility) == DINode::FlagProtected)
+ addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_protected);
+ else if ((Flags & DINode::FlagAccessibility) == DINode::FlagPrivate)
+ addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_private);
+ else if ((Flags & DINode::FlagAccessibility) == DINode::FlagPublic)
+ addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_public);
+}
+
+DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
+ if (!Context || isa<DIFile>(Context) || isa<DICompileUnit>(Context))
+ return &getUnitDie();
+ if (auto *T = dyn_cast<DIType>(Context))
+ return getOrCreateTypeDIE(T);
+ if (auto *NS = dyn_cast<DINamespace>(Context))
+ return getOrCreateNameSpace(NS);
+ if (auto *SP = dyn_cast<DISubprogram>(Context))
+ return getOrCreateSubprogramDIE(SP);
+ if (auto *M = dyn_cast<DIModule>(Context))
+ return getOrCreateModule(M);
+ return getDIE(Context);
+}
+
+DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {
+ auto *Context = Ty->getScope();
+ DIE *ContextDIE = getOrCreateContextDIE(Context);
+
+ if (DIE *TyDIE = getDIE(Ty))
+ return TyDIE;
+
+ // Create new type.
+ DIE &TyDIE = createAndAddDIE(Ty->getTag(), *ContextDIE, Ty);
+
+ constructTypeDIE(TyDIE, cast<DICompositeType>(Ty));
+
+ updateAcceleratorTables(Context, Ty, TyDIE);
+ return &TyDIE;
+}
+
+DIE *DwarfUnit::createTypeDIE(const DIScope *Context, DIE &ContextDIE,
+ const DIType *Ty) {
+ // Create new type.
+ DIE &TyDIE = createAndAddDIE(Ty->getTag(), ContextDIE, Ty);
+
+ updateAcceleratorTables(Context, Ty, TyDIE);
+
+ if (auto *BT = dyn_cast<DIBasicType>(Ty))
+ constructTypeDIE(TyDIE, BT);
+ else if (auto *ST = dyn_cast<DIStringType>(Ty))
+ constructTypeDIE(TyDIE, ST);
+ else if (auto *STy = dyn_cast<DISubroutineType>(Ty))
+ constructTypeDIE(TyDIE, STy);
+ else if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
+ if (DD->generateTypeUnits() && !Ty->isForwardDecl() &&
+ (Ty->getRawName() || CTy->getRawIdentifier())) {
+ // Skip updating the accelerator tables since this is not the full type.
+ if (MDString *TypeId = CTy->getRawIdentifier())
+ DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy);
+ else
+ finishNonUnitTypeDIE(TyDIE, CTy);
+ return &TyDIE;
+ }
+ constructTypeDIE(TyDIE, CTy);
+ } else {
+ constructTypeDIE(TyDIE, cast<DIDerivedType>(Ty));
+ }
+
+ return &TyDIE;
+}
+
+DIE *DwarfUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
+ if (!TyNode)
+ return nullptr;
+
+ auto *Ty = cast<DIType>(TyNode);
+
+ // DW_TAG_restrict_type is not supported in DWARF2
+ if (Ty->getTag() == dwarf::DW_TAG_restrict_type && DD->getDwarfVersion() <= 2)
+ return getOrCreateTypeDIE(cast<DIDerivedType>(Ty)->getBaseType());
+
+ // DW_TAG_atomic_type is not supported in DWARF < 5
+ if (Ty->getTag() == dwarf::DW_TAG_atomic_type && DD->getDwarfVersion() < 5)
+ return getOrCreateTypeDIE(cast<DIDerivedType>(Ty)->getBaseType());
+
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ auto *Context = Ty->getScope();
+ DIE *ContextDIE = getOrCreateContextDIE(Context);
+ assert(ContextDIE);
+
+ if (DIE *TyDIE = getDIE(Ty))
+ return TyDIE;
+
+ return static_cast<DwarfUnit *>(ContextDIE->getUnit())
+ ->createTypeDIE(Context, *ContextDIE, Ty);
+}
+
+void DwarfUnit::updateAcceleratorTables(const DIScope *Context,
+ const DIType *Ty, const DIE &TyDIE) {
+ if (!Ty->getName().empty() && !Ty->isForwardDecl()) {
+ bool IsImplementation = false;
+ if (auto *CT = dyn_cast<DICompositeType>(Ty)) {
+      // A runtime language of 0 actually means C/C++, and any non-negative
+      // value is some version of Objective-C/C++.
+ IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete();
+ }
+ unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
+ DD->addAccelType(*CUNode, Ty->getName(), TyDIE, Flags);
+
+ if (!Context || isa<DICompileUnit>(Context) || isa<DIFile>(Context) ||
+ isa<DINamespace>(Context) || isa<DICommonBlock>(Context))
+ addGlobalType(Ty, TyDIE, Context);
+ }
+}
+
+void DwarfUnit::addType(DIE &Entity, const DIType *Ty,
+ dwarf::Attribute Attribute) {
+ assert(Ty && "Trying to add a type that doesn't exist?");
+ addDIEEntry(Entity, Attribute, DIEEntry(*getOrCreateTypeDIE(Ty)));
+}
+
+std::string DwarfUnit::getParentContextString(const DIScope *Context) const {
+ if (!Context)
+ return "";
+
+ // FIXME: Decide whether to implement this for non-C++ languages.
+ if (!dwarf::isCPlusPlus((dwarf::SourceLanguage)getLanguage()))
+ return "";
+
+ std::string CS;
+ SmallVector<const DIScope *, 1> Parents;
+ while (!isa<DICompileUnit>(Context)) {
+ Parents.push_back(Context);
+ if (const DIScope *S = Context->getScope())
+ Context = S;
+ else
+ // Structure, etc types will have a NULL context if they're at the top
+ // level.
+ break;
+ }
+
+ // Reverse iterate over our list to go from the outermost construct to the
+ // innermost.
+ for (const DIScope *Ctx : llvm::reverse(Parents)) {
+ StringRef Name = Ctx->getName();
+ if (Name.empty() && isa<DINamespace>(Ctx))
+ Name = "(anonymous namespace)";
+ if (!Name.empty()) {
+ CS += Name;
+ CS += "::";
+ }
+ }
+ return CS;
+}
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIBasicType *BTy) {
+ // Get core information.
+ StringRef Name = BTy->getName();
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ addString(Buffer, dwarf::DW_AT_name, Name);
+
+ // An unspecified type only has a name attribute.
+ if (BTy->getTag() == dwarf::DW_TAG_unspecified_type)
+ return;
+
+ if (BTy->getTag() != dwarf::DW_TAG_string_type)
+ addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+ BTy->getEncoding());
+
+ uint64_t Size = BTy->getSizeInBits() >> 3;
+ addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size);
+
+ if (BTy->isBigEndian())
+ addUInt(Buffer, dwarf::DW_AT_endianity, std::nullopt, dwarf::DW_END_big);
+ else if (BTy->isLittleEndian())
+ addUInt(Buffer, dwarf::DW_AT_endianity, std::nullopt, dwarf::DW_END_little);
+}
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIStringType *STy) {
+ // Get core information.
+ StringRef Name = STy->getName();
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ addString(Buffer, dwarf::DW_AT_name, Name);
+
+ if (DIVariable *Var = STy->getStringLength()) {
+ if (auto *VarDIE = getDIE(Var))
+ addDIEEntry(Buffer, dwarf::DW_AT_string_length, *VarDIE);
+ } else if (DIExpression *Expr = STy->getStringLengthExp()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ // This is to describe the memory location of the
+ // length of a Fortran deferred length string, so
+ // lock it down as such.
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(Expr);
+ addBlock(Buffer, dwarf::DW_AT_string_length, DwarfExpr.finalize());
+ } else {
+ uint64_t Size = STy->getSizeInBits() >> 3;
+ addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size);
+ }
+
+ if (DIExpression *Expr = STy->getStringLocationExp()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ // This is to describe the memory location of the
+ // string, so lock it down as such.
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(Expr);
+ addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize());
+ }
+
+ if (STy->getEncoding()) {
+ // For eventual Unicode support.
+ addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+ STy->getEncoding());
+ }
+}
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
+ // Get core information.
+ StringRef Name = DTy->getName();
+ uint64_t Size = DTy->getSizeInBits() >> 3;
+ uint16_t Tag = Buffer.getTag();
+
+ // Map to main type, void will not have a type.
+ const DIType *FromTy = DTy->getBaseType();
+ if (FromTy)
+ addType(Buffer, FromTy);
+
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ addString(Buffer, dwarf::DW_AT_name, Name);
+
+ addAnnotation(Buffer, DTy->getAnnotations());
+
+  // If alignment is specified for a typedef, create and insert the
+  // DW_AT_alignment attribute in the DW_TAG_typedef DIE.
+ if (Tag == dwarf::DW_TAG_typedef && DD->getDwarfVersion() >= 5) {
+ uint32_t AlignInBytes = DTy->getAlignInBytes();
+ if (AlignInBytes > 0)
+ addUInt(Buffer, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
+ AlignInBytes);
+ }
+
+ // Add size if non-zero (derived types might be zero-sized.)
+ if (Size && Tag != dwarf::DW_TAG_pointer_type
+ && Tag != dwarf::DW_TAG_ptr_to_member_type
+ && Tag != dwarf::DW_TAG_reference_type
+ && Tag != dwarf::DW_TAG_rvalue_reference_type)
+ addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size);
+
+ if (Tag == dwarf::DW_TAG_ptr_to_member_type)
+ addDIEEntry(Buffer, dwarf::DW_AT_containing_type,
+ *getOrCreateTypeDIE(cast<DIDerivedType>(DTy)->getClassType()));
+
+ addAccess(Buffer, DTy->getFlags());
+
+  // Add source line info if available and the type is not a forward declaration.
+ if (!DTy->isForwardDecl())
+ addSourceLine(Buffer, DTy);
+
+ // If DWARF address space value is other than None, add it. The IR
+ // verifier checks that DWARF address space only exists for pointer
+ // or reference types.
+ if (DTy->getDWARFAddressSpace())
+ addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4,
+ *DTy->getDWARFAddressSpace());
+}
+
+void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
+ for (unsigned i = 1, N = Args.size(); i < N; ++i) {
+ const DIType *Ty = Args[i];
+ if (!Ty) {
+ assert(i == N-1 && "Unspecified parameter must be the last argument");
+ createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer);
+ } else {
+ DIE &Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
+ addType(Arg, Ty);
+ if (Ty->isArtificial())
+ addFlag(Arg, dwarf::DW_AT_artificial);
+ }
+ }
+}
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
+ // Add return type. A void return won't have a type.
+ auto Elements = cast<DISubroutineType>(CTy)->getTypeArray();
+ if (Elements.size())
+ if (auto RTy = Elements[0])
+ addType(Buffer, RTy);
+
+ bool isPrototyped = true;
+ if (Elements.size() == 2 && !Elements[1])
+ isPrototyped = false;
+
+ constructSubprogramArguments(Buffer, Elements);
+
+ // Add prototype flag if we're dealing with a C language and the function has
+ // been prototyped.
+ if (isPrototyped && dwarf::isC((dwarf::SourceLanguage)getLanguage()))
+ addFlag(Buffer, dwarf::DW_AT_prototyped);
+
+ // Add a DW_AT_calling_convention if this has an explicit convention.
+ if (CTy->getCC() && CTy->getCC() != dwarf::DW_CC_normal)
+ addUInt(Buffer, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1,
+ CTy->getCC());
+
+ if (CTy->isLValueReference())
+ addFlag(Buffer, dwarf::DW_AT_reference);
+
+ if (CTy->isRValueReference())
+ addFlag(Buffer, dwarf::DW_AT_rvalue_reference);
+}
+
+void DwarfUnit::addAnnotation(DIE &Buffer, DINodeArray Annotations) {
+ if (!Annotations)
+ return;
+
+ for (const Metadata *Annotation : Annotations->operands()) {
+ const MDNode *MD = cast<MDNode>(Annotation);
+ const MDString *Name = cast<MDString>(MD->getOperand(0));
+ const auto &Value = MD->getOperand(1);
+
+ DIE &AnnotationDie = createAndAddDIE(dwarf::DW_TAG_LLVM_annotation, Buffer);
+ addString(AnnotationDie, dwarf::DW_AT_name, Name->getString());
+ if (const auto *Data = dyn_cast<MDString>(Value))
+ addString(AnnotationDie, dwarf::DW_AT_const_value, Data->getString());
+ else if (const auto *Data = dyn_cast<ConstantAsMetadata>(Value))
+ addConstantValue(AnnotationDie, Data->getValue()->getUniqueInteger(),
+ /*Unsigned=*/true);
+ else
+ assert(false && "Unsupported annotation value type");
+ }
+}
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
+ // Add name if not anonymous or intermediate type.
+ StringRef Name = CTy->getName();
+
+ uint64_t Size = CTy->getSizeInBits() >> 3;
+ uint16_t Tag = Buffer.getTag();
+
+ switch (Tag) {
+ case dwarf::DW_TAG_array_type:
+ constructArrayTypeDIE(Buffer, CTy);
+ break;
+ case dwarf::DW_TAG_enumeration_type:
+ constructEnumTypeDIE(Buffer, CTy);
+ break;
+ case dwarf::DW_TAG_variant_part:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_namelist: {
+ // Emit the discriminator for a variant part.
+ DIDerivedType *Discriminator = nullptr;
+ if (Tag == dwarf::DW_TAG_variant_part) {
+ Discriminator = CTy->getDiscriminator();
+ if (Discriminator) {
+ // DWARF says:
+ // If the variant part has a discriminant, the discriminant is
+ // represented by a separate debugging information entry which is
+ // a child of the variant part entry.
+ DIE &DiscMember = constructMemberDIE(Buffer, Discriminator);
+ addDIEEntry(Buffer, dwarf::DW_AT_discr, DiscMember);
+ }
+ }
+
+ // Add template parameters to class, structure, and union types.
+ if (Tag == dwarf::DW_TAG_class_type ||
+ Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
+ addTemplateParams(Buffer, CTy->getTemplateParams());
+
+ // Add elements to structure type.
+ DINodeArray Elements = CTy->getElements();
+ for (const auto *Element : Elements) {
+ if (!Element)
+ continue;
+ if (auto *SP = dyn_cast<DISubprogram>(Element))
+ getOrCreateSubprogramDIE(SP);
+ else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
+ if (DDTy->getTag() == dwarf::DW_TAG_friend) {
+ DIE &ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
+ addType(ElemDie, DDTy->getBaseType(), dwarf::DW_AT_friend);
+ } else if (DDTy->isStaticMember()) {
+ getOrCreateStaticMemberDIE(DDTy);
+ } else if (Tag == dwarf::DW_TAG_variant_part) {
+ // When emitting a variant part, wrap each member in
+ // DW_TAG_variant.
+ DIE &Variant = createAndAddDIE(dwarf::DW_TAG_variant, Buffer);
+ if (const ConstantInt *CI =
+ dyn_cast_or_null<ConstantInt>(DDTy->getDiscriminantValue())) {
+ if (DD->isUnsignedDIType(Discriminator->getBaseType()))
+ addUInt(Variant, dwarf::DW_AT_discr_value, std::nullopt,
+ CI->getZExtValue());
+ else
+ addSInt(Variant, dwarf::DW_AT_discr_value, std::nullopt,
+ CI->getSExtValue());
+ }
+ constructMemberDIE(Variant, DDTy);
+ } else {
+ constructMemberDIE(Buffer, DDTy);
+ }
+ } else if (auto *Property = dyn_cast<DIObjCProperty>(Element)) {
+ DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer);
+ StringRef PropertyName = Property->getName();
+ addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
+ if (Property->getType())
+ addType(ElemDie, Property->getType());
+ addSourceLine(ElemDie, Property);
+ StringRef GetterName = Property->getGetterName();
+ if (!GetterName.empty())
+ addString(ElemDie, dwarf::DW_AT_APPLE_property_getter, GetterName);
+ StringRef SetterName = Property->getSetterName();
+ if (!SetterName.empty())
+ addString(ElemDie, dwarf::DW_AT_APPLE_property_setter, SetterName);
+ if (unsigned PropertyAttributes = Property->getAttributes())
+ addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, std::nullopt,
+ PropertyAttributes);
+ } else if (auto *Composite = dyn_cast<DICompositeType>(Element)) {
+ if (Composite->getTag() == dwarf::DW_TAG_variant_part) {
+ DIE &VariantPart = createAndAddDIE(Composite->getTag(), Buffer);
+ constructTypeDIE(VariantPart, Composite);
+ }
+ } else if (Tag == dwarf::DW_TAG_namelist) {
+ auto *Var = dyn_cast<DINode>(Element);
+ auto *VarDIE = getDIE(Var);
+ if (VarDIE) {
+ DIE &ItemDie = createAndAddDIE(dwarf::DW_TAG_namelist_item, Buffer);
+ addDIEEntry(ItemDie, dwarf::DW_AT_namelist_item, *VarDIE);
+ }
+ }
+ }
+
+ if (CTy->isAppleBlockExtension())
+ addFlag(Buffer, dwarf::DW_AT_APPLE_block);
+
+ if (CTy->getExportSymbols())
+ addFlag(Buffer, dwarf::DW_AT_export_symbols);
+
+ // This is outside the DWARF spec, but GDB expects a DW_AT_containing_type
+ // inside C++ composite types to point to the base class with the vtable.
+ // Rust uses DW_AT_containing_type to link a vtable to the type
+ // for which it was created.
+ if (auto *ContainingType = CTy->getVTableHolder())
+ addDIEEntry(Buffer, dwarf::DW_AT_containing_type,
+ *getOrCreateTypeDIE(ContainingType));
+
+ if (CTy->isObjcClassComplete())
+ addFlag(Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
+
+ // Add the type's non-standard calling convention.
+ // DW_CC_pass_by_value/DW_CC_pass_by_reference are introduced in DWARF 5.
+ if (!Asm->TM.Options.DebugStrictDwarf || DD->getDwarfVersion() >= 5) {
+ uint8_t CC = 0;
+ if (CTy->isTypePassByValue())
+ CC = dwarf::DW_CC_pass_by_value;
+ else if (CTy->isTypePassByReference())
+ CC = dwarf::DW_CC_pass_by_reference;
+ if (CC)
+ addUInt(Buffer, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1,
+ CC);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ addString(Buffer, dwarf::DW_AT_name, Name);
+
+ addAnnotation(Buffer, CTy->getAnnotations());
+
+ if (Tag == dwarf::DW_TAG_enumeration_type ||
+ Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type ||
+ Tag == dwarf::DW_TAG_union_type) {
+ // Add size if non-zero (derived types might be zero-sized.)
+ // Ignore the size if it's a non-enum forward decl.
+ // TODO: Do we care about size for enum forward declarations?
+ if (Size &&
+ (!CTy->isForwardDecl() || Tag == dwarf::DW_TAG_enumeration_type))
+ addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size);
+ else if (!CTy->isForwardDecl())
+ // Add zero size if it is not a forward declaration.
+ addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, 0);
+
+ // If we're a forward decl, say so.
+ if (CTy->isForwardDecl())
+ addFlag(Buffer, dwarf::DW_AT_declaration);
+
+ // Add accessibility info if available.
+ addAccess(Buffer, CTy->getFlags());
+
+ // Add source line info if available.
+ if (!CTy->isForwardDecl())
+ addSourceLine(Buffer, CTy);
+
+ // No harm in adding the runtime language to the declaration.
+ unsigned RLang = CTy->getRuntimeLang();
+ if (RLang)
+ addUInt(Buffer, dwarf::DW_AT_APPLE_runtime_class, dwarf::DW_FORM_data1,
+ RLang);
+
+ // Add align info if available.
+ if (uint32_t AlignInBytes = CTy->getAlignInBytes())
+ addUInt(Buffer, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
+ AlignInBytes);
+ }
+}
+
+void DwarfUnit::constructTemplateTypeParameterDIE(
+ DIE &Buffer, const DITemplateTypeParameter *TP) {
+ DIE &ParamDIE =
+ createAndAddDIE(dwarf::DW_TAG_template_type_parameter, Buffer);
+ // Add the type if it exists, it could be void and therefore no type.
+ if (TP->getType())
+ addType(ParamDIE, TP->getType());
+ if (!TP->getName().empty())
+ addString(ParamDIE, dwarf::DW_AT_name, TP->getName());
+ if (TP->isDefault() && isCompatibleWithVersion(5))
+ addFlag(ParamDIE, dwarf::DW_AT_default_value);
+}
+
+void DwarfUnit::constructTemplateValueParameterDIE(
+ DIE &Buffer, const DITemplateValueParameter *VP) {
+ DIE &ParamDIE = createAndAddDIE(VP->getTag(), Buffer);
+
+ // Add the type if there is one; template template parameters and template
+ // parameter packs will not have a type.
+ if (VP->getTag() == dwarf::DW_TAG_template_value_parameter)
+ addType(ParamDIE, VP->getType());
+ if (!VP->getName().empty())
+ addString(ParamDIE, dwarf::DW_AT_name, VP->getName());
+ if (VP->isDefault() && isCompatibleWithVersion(5))
+ addFlag(ParamDIE, dwarf::DW_AT_default_value);
+ if (Metadata *Val = VP->getValue()) {
+ if (ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Val))
+ addConstantValue(ParamDIE, CI, VP->getType());
+ else if (GlobalValue *GV = mdconst::dyn_extract<GlobalValue>(Val)) {
+ // We cannot describe the location of dllimport'd entities: the
+ // computation of their address requires loads from the IAT.
+ if (!GV->hasDLLImportStorageClass()) {
+ // For declaration non-type template parameters (such as global values
+ // and functions), emit the address of the entity as the value.
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ addOpAddress(*Loc, Asm->getSymbol(GV));
+ // Emit DW_OP_stack_value to use the address as the immediate value of
+ // the parameter, rather than a pointer to it.
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+ addBlock(ParamDIE, dwarf::DW_AT_location, Loc);
+ }
+ } else if (VP->getTag() == dwarf::DW_TAG_GNU_template_template_param) {
+ assert(isa<MDString>(Val));
+ addString(ParamDIE, dwarf::DW_AT_GNU_template_name,
+ cast<MDString>(Val)->getString());
+ } else if (VP->getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
+ addTemplateParams(ParamDIE, cast<MDTuple>(Val));
+ }
+ }
+}
+
+DIE *DwarfUnit::getOrCreateNameSpace(const DINamespace *NS) {
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(NS->getScope());
+
+ if (DIE *NDie = getDIE(NS))
+ return NDie;
+ DIE &NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);
+
+ StringRef Name = NS->getName();
+ if (!Name.empty())
+ addString(NDie, dwarf::DW_AT_name, NS->getName());
+ else
+ Name = "(anonymous namespace)";
+ DD->addAccelNamespace(*CUNode, Name, NDie);
+ addGlobalName(Name, NDie, NS->getScope());
+ if (NS->getExportSymbols())
+ addFlag(NDie, dwarf::DW_AT_export_symbols);
+ return &NDie;
+}
+
+DIE *DwarfUnit::getOrCreateModule(const DIModule *M) {
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(M->getScope());
+
+ if (DIE *MDie = getDIE(M))
+ return MDie;
+ DIE &MDie = createAndAddDIE(dwarf::DW_TAG_module, *ContextDIE, M);
+
+ if (!M->getName().empty()) {
+ addString(MDie, dwarf::DW_AT_name, M->getName());
+ addGlobalName(M->getName(), MDie, M->getScope());
+ }
+ if (!M->getConfigurationMacros().empty())
+ addString(MDie, dwarf::DW_AT_LLVM_config_macros,
+ M->getConfigurationMacros());
+ if (!M->getIncludePath().empty())
+ addString(MDie, dwarf::DW_AT_LLVM_include_path, M->getIncludePath());
+ if (!M->getAPINotesFile().empty())
+ addString(MDie, dwarf::DW_AT_LLVM_apinotes, M->getAPINotesFile());
+ if (M->getFile())
+ addUInt(MDie, dwarf::DW_AT_decl_file, std::nullopt,
+ getOrCreateSourceID(M->getFile()));
+ if (M->getLineNo())
+ addUInt(MDie, dwarf::DW_AT_decl_line, std::nullopt, M->getLineNo());
+ if (M->getIsDecl())
+ addFlag(MDie, dwarf::DW_AT_declaration);
+
+ return &MDie;
+}
+
+DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) {
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE (as is the case for member function
+ // declarations).
+ DIE *ContextDIE =
+ Minimal ? &getUnitDie() : getOrCreateContextDIE(SP->getScope());
+
+ if (DIE *SPDie = getDIE(SP))
+ return SPDie;
+
+ if (auto *SPDecl = SP->getDeclaration()) {
+ if (!Minimal) {
+ // Add subprogram definitions to the CU die directly.
+ ContextDIE = &getUnitDie();
+ // Build the decl now to ensure it precedes the definition.
+ getOrCreateSubprogramDIE(SPDecl);
+ }
+ }
+
+ // DW_TAG_inlined_subroutine may refer to this DIE.
+ DIE &SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);
+
+ // Stop here and fill this in later, depending on whether or not this
+ // subprogram turns out to have inlined instances.
+ if (SP->isDefinition())
+ return &SPDie;
+
+ static_cast<DwarfUnit *>(SPDie.getUnit())
+ ->applySubprogramAttributes(SP, SPDie);
+ return &SPDie;
+}
+
+bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
+ DIE &SPDie, bool Minimal) {
+ DIE *DeclDie = nullptr;
+ StringRef DeclLinkageName;
+ if (auto *SPDecl = SP->getDeclaration()) {
+ if (!Minimal) {
+ DITypeRefArray DeclArgs, DefinitionArgs;
+ DeclArgs = SPDecl->getType()->getTypeArray();
+ DefinitionArgs = SP->getType()->getTypeArray();
+
+ if (DeclArgs.size() && DefinitionArgs.size())
+ if (DefinitionArgs[0] != nullptr && DeclArgs[0] != DefinitionArgs[0])
+ addType(SPDie, DefinitionArgs[0]);
+
+ DeclDie = getDIE(SPDecl);
+ assert(DeclDie && "This DIE should've already been constructed when the "
+ "definition DIE was created in "
+ "getOrCreateSubprogramDIE");
+ // Look at the Decl's linkage name only if we emitted it.
+ if (DD->useAllLinkageNames())
+ DeclLinkageName = SPDecl->getLinkageName();
+ unsigned DeclID = getOrCreateSourceID(SPDecl->getFile());
+ unsigned DefID = getOrCreateSourceID(SP->getFile());
+ if (DeclID != DefID)
+ addUInt(SPDie, dwarf::DW_AT_decl_file, std::nullopt, DefID);
+
+ if (SP->getLine() != SPDecl->getLine())
+ addUInt(SPDie, dwarf::DW_AT_decl_line, std::nullopt, SP->getLine());
+ }
+ }
+
+ // Add function template parameters.
+ addTemplateParams(SPDie, SP->getTemplateParams());
+
+ // Add the linkage name if we have one and it isn't in the Decl.
+ StringRef LinkageName = SP->getLinkageName();
+ assert(((LinkageName.empty() || DeclLinkageName.empty()) ||
+ LinkageName == DeclLinkageName) &&
+ "decl has a linkage name and it is different");
+ if (DeclLinkageName.empty() &&
+ // Always emit it for abstract subprograms.
+ (DD->useAllLinkageNames() || DU->getAbstractScopeDIEs().lookup(SP)))
+ addLinkageName(SPDie, LinkageName);
+
+ if (!DeclDie)
+ return false;
+
+ // Refer to the function declaration where all the other attributes will be
+ // found.
+ addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie);
+ return true;
+}
+
+void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
+ bool SkipSPAttributes) {
+ // If -fdebug-info-for-profiling is enabled, we need to emit the subprogram
+ // and its source location.
+ bool SkipSPSourceLocation = SkipSPAttributes &&
+ !CUNode->getDebugInfoForProfiling();
+ if (!SkipSPSourceLocation)
+ if (applySubprogramDefinitionAttributes(SP, SPDie, SkipSPAttributes))
+ return;
+
+ // Constructors and operators for anonymous aggregates do not have names.
+ if (!SP->getName().empty())
+ addString(SPDie, dwarf::DW_AT_name, SP->getName());
+
+ addAnnotation(SPDie, SP->getAnnotations());
+
+ if (!SkipSPSourceLocation)
+ addSourceLine(SPDie, SP);
+
+ // Skip the rest of the attributes under -gmlt to save space.
+ if (SkipSPAttributes)
+ return;
+
+ // Add the prototype flag if we have a prototype and the language is
+ // C-like.
+ if (SP->isPrototyped() && dwarf::isC((dwarf::SourceLanguage)getLanguage()))
+ addFlag(SPDie, dwarf::DW_AT_prototyped);
+
+ if (SP->isObjCDirect())
+ addFlag(SPDie, dwarf::DW_AT_APPLE_objc_direct);
+
+ unsigned CC = 0;
+ DITypeRefArray Args;
+ if (const DISubroutineType *SPTy = SP->getType()) {
+ Args = SPTy->getTypeArray();
+ CC = SPTy->getCC();
+ }
+
+ // Add a DW_AT_calling_convention if this has an explicit convention.
+ if (CC && CC != dwarf::DW_CC_normal)
+ addUInt(SPDie, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1, CC);
+
+ // Add a return type. A C/C++-style void return has no type, so nothing is
+ // added in that case.
+ if (Args.size())
+ if (auto Ty = Args[0])
+ addType(SPDie, Ty);
+
+ unsigned VK = SP->getVirtuality();
+ if (VK) {
+ addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
+ if (SP->getVirtualIndex() != -1u) {
+ DIELoc *Block = getDIELoc();
+ addUInt(*Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ addUInt(*Block, dwarf::DW_FORM_udata, SP->getVirtualIndex());
+ addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
+ }
+ ContainingTypeMap.insert(std::make_pair(&SPDie, SP->getContainingType()));
+ }
+
+ if (!SP->isDefinition()) {
+ addFlag(SPDie, dwarf::DW_AT_declaration);
+
+ // Add arguments. We do not add arguments for a subprogram definition; they
+ // are handled while processing variables.
+ constructSubprogramArguments(SPDie, Args);
+ }
+
+ addThrownTypes(SPDie, SP->getThrownTypes());
+
+ if (SP->isArtificial())
+ addFlag(SPDie, dwarf::DW_AT_artificial);
+
+ if (!SP->isLocalToUnit())
+ addFlag(SPDie, dwarf::DW_AT_external);
+
+ if (DD->useAppleExtensionAttributes()) {
+ if (SP->isOptimized())
+ addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
+
+ if (unsigned isa = Asm->getISAEncoding())
+ addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
+ }
+
+ if (SP->isLValueReference())
+ addFlag(SPDie, dwarf::DW_AT_reference);
+
+ if (SP->isRValueReference())
+ addFlag(SPDie, dwarf::DW_AT_rvalue_reference);
+
+ if (SP->isNoReturn())
+ addFlag(SPDie, dwarf::DW_AT_noreturn);
+
+ addAccess(SPDie, SP->getFlags());
+
+ if (SP->isExplicit())
+ addFlag(SPDie, dwarf::DW_AT_explicit);
+
+ if (SP->isMainSubprogram())
+ addFlag(SPDie, dwarf::DW_AT_main_subprogram);
+ if (SP->isPure())
+ addFlag(SPDie, dwarf::DW_AT_pure);
+ if (SP->isElemental())
+ addFlag(SPDie, dwarf::DW_AT_elemental);
+ if (SP->isRecursive())
+ addFlag(SPDie, dwarf::DW_AT_recursive);
+
+ if (!SP->getTargetFuncName().empty())
+ addString(SPDie, dwarf::DW_AT_trampoline, SP->getTargetFuncName());
+
+ if (DD->getDwarfVersion() >= 5 && SP->isDeleted())
+ addFlag(SPDie, dwarf::DW_AT_deleted);
+}
+
+void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR,
+ DIE *IndexTy) {
+ DIE &DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
+ addDIEEntry(DW_Subrange, dwarf::DW_AT_type, *IndexTy);
+
+ // The LowerBound value defines the lower bound, which is typically zero for
+ // C/C++. The Count value is the number of elements. Values are 64 bit. If
+ // Count == -1 then the array is unbounded and we do not emit
+ // DW_AT_lower_bound and DW_AT_count attributes.
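+ // For example, for a C array declared as int a[10] the front end typically
+ // provides a constant count of 10 and no lower bound beyond the language
+ // default, so only DW_AT_count is emitted for the subrange below.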
+ int64_t DefaultLowerBound = getDefaultLowerBound();
+
+ auto AddBoundTypeEntry = [&](dwarf::Attribute Attr,
+ DISubrange::BoundType Bound) -> void {
+ if (auto *BV = dyn_cast_if_present<DIVariable *>(Bound)) {
+ if (auto *VarDIE = getDIE(BV))
+ addDIEEntry(DW_Subrange, Attr, *VarDIE);
+ } else if (auto *BE = dyn_cast_if_present<DIExpression *>(Bound)) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(BE);
+ addBlock(DW_Subrange, Attr, DwarfExpr.finalize());
+ } else if (auto *BI = dyn_cast_if_present<ConstantInt *>(Bound)) {
+ if (Attr == dwarf::DW_AT_count) {
+ if (BI->getSExtValue() != -1)
+ addUInt(DW_Subrange, Attr, std::nullopt, BI->getSExtValue());
+ } else if (Attr != dwarf::DW_AT_lower_bound || DefaultLowerBound == -1 ||
+ BI->getSExtValue() != DefaultLowerBound)
+ addSInt(DW_Subrange, Attr, dwarf::DW_FORM_sdata, BI->getSExtValue());
+ }
+ };
+
+ AddBoundTypeEntry(dwarf::DW_AT_lower_bound, SR->getLowerBound());
+
+ AddBoundTypeEntry(dwarf::DW_AT_count, SR->getCount());
+
+ AddBoundTypeEntry(dwarf::DW_AT_upper_bound, SR->getUpperBound());
+
+ AddBoundTypeEntry(dwarf::DW_AT_byte_stride, SR->getStride());
+}
+
+void DwarfUnit::constructGenericSubrangeDIE(DIE &Buffer,
+ const DIGenericSubrange *GSR,
+ DIE *IndexTy) {
+ DIE &DwGenericSubrange =
+ createAndAddDIE(dwarf::DW_TAG_generic_subrange, Buffer);
+ addDIEEntry(DwGenericSubrange, dwarf::DW_AT_type, *IndexTy);
+
+ int64_t DefaultLowerBound = getDefaultLowerBound();
+
+ auto AddBoundTypeEntry = [&](dwarf::Attribute Attr,
+ DIGenericSubrange::BoundType Bound) -> void {
+ if (auto *BV = dyn_cast_if_present<DIVariable *>(Bound)) {
+ if (auto *VarDIE = getDIE(BV))
+ addDIEEntry(DwGenericSubrange, Attr, *VarDIE);
+ } else if (auto *BE = dyn_cast_if_present<DIExpression *>(Bound)) {
+ if (BE->isConstant() &&
+ DIExpression::SignedOrUnsignedConstant::SignedConstant ==
+ *BE->isConstant()) {
+ if (Attr != dwarf::DW_AT_lower_bound || DefaultLowerBound == -1 ||
+ static_cast<int64_t>(BE->getElement(1)) != DefaultLowerBound)
+ addSInt(DwGenericSubrange, Attr, dwarf::DW_FORM_sdata,
+ BE->getElement(1));
+ } else {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(BE);
+ addBlock(DwGenericSubrange, Attr, DwarfExpr.finalize());
+ }
+ }
+ };
+
+ AddBoundTypeEntry(dwarf::DW_AT_lower_bound, GSR->getLowerBound());
+ AddBoundTypeEntry(dwarf::DW_AT_count, GSR->getCount());
+ AddBoundTypeEntry(dwarf::DW_AT_upper_bound, GSR->getUpperBound());
+ AddBoundTypeEntry(dwarf::DW_AT_byte_stride, GSR->getStride());
+}
+
+DIE *DwarfUnit::getIndexTyDie() {
+ if (IndexTyDie)
+ return IndexTyDie;
+ // Construct an integer type to use for indexes.
+ IndexTyDie = &createAndAddDIE(dwarf::DW_TAG_base_type, getUnitDie());
+ StringRef Name = "__ARRAY_SIZE_TYPE__";
+ addString(*IndexTyDie, dwarf::DW_AT_name, Name);
+ addUInt(*IndexTyDie, dwarf::DW_AT_byte_size, std::nullopt, sizeof(int64_t));
+ addUInt(*IndexTyDie, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+ dwarf::getArrayIndexTypeEncoding(
+ (dwarf::SourceLanguage)getLanguage()));
+ DD->addAccelType(*CUNode, Name, *IndexTyDie, /*Flags*/ 0);
+ return IndexTyDie;
+}
+
+/// Returns true if the vector's size differs from the sum of sizes of elements
+/// the user specified. This can occur if the vector has been rounded up to
+/// fit memory alignment constraints.
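+/// For example, a vector of three 32-bit floats that the target rounds up to
+/// 128 bits reports ActualSize = 128 while NumVecElements * ElementSize = 96,
+/// so it is considered padded.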
+static bool hasVectorBeenPadded(const DICompositeType *CTy) {
+ assert(CTy && CTy->isVector() && "Composite type is not a vector");
+ const uint64_t ActualSize = CTy->getSizeInBits();
+
+ // Obtain the size of each element in the vector.
+ DIType *BaseTy = CTy->getBaseType();
+ assert(BaseTy && "Unknown vector element type.");
+ const uint64_t ElementSize = BaseTy->getSizeInBits();
+
+ // Locate the number of elements in the vector.
+ const DINodeArray Elements = CTy->getElements();
+ assert(Elements.size() == 1 &&
+ Elements[0]->getTag() == dwarf::DW_TAG_subrange_type &&
+ "Invalid vector element array, expected one element of type subrange");
+ const auto Subrange = cast<DISubrange>(Elements[0]);
+ const auto NumVecElements =
+ Subrange->getCount()
+ ? cast<ConstantInt *>(Subrange->getCount())->getSExtValue()
+ : 0;
+
+ // Ensure we found the element count and that the actual size is wide
+ // enough to contain the requested size.
+ assert(ActualSize >= (NumVecElements * ElementSize) && "Invalid vector size");
+ return ActualSize != (NumVecElements * ElementSize);
+}
+
+void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
+ if (CTy->isVector()) {
+ addFlag(Buffer, dwarf::DW_AT_GNU_vector);
+ if (hasVectorBeenPadded(CTy))
+ addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt,
+ CTy->getSizeInBits() / CHAR_BIT);
+ }
+
+ if (DIVariable *Var = CTy->getDataLocation()) {
+ if (auto *VarDIE = getDIE(Var))
+ addDIEEntry(Buffer, dwarf::DW_AT_data_location, *VarDIE);
+ } else if (DIExpression *Expr = CTy->getDataLocationExp()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(Expr);
+ addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize());
+ }
+
+ if (DIVariable *Var = CTy->getAssociated()) {
+ if (auto *VarDIE = getDIE(Var))
+ addDIEEntry(Buffer, dwarf::DW_AT_associated, *VarDIE);
+ } else if (DIExpression *Expr = CTy->getAssociatedExp()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(Expr);
+ addBlock(Buffer, dwarf::DW_AT_associated, DwarfExpr.finalize());
+ }
+
+ if (DIVariable *Var = CTy->getAllocated()) {
+ if (auto *VarDIE = getDIE(Var))
+ addDIEEntry(Buffer, dwarf::DW_AT_allocated, *VarDIE);
+ } else if (DIExpression *Expr = CTy->getAllocatedExp()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(Expr);
+ addBlock(Buffer, dwarf::DW_AT_allocated, DwarfExpr.finalize());
+ }
+
+ if (auto *RankConst = CTy->getRankConst()) {
+ addSInt(Buffer, dwarf::DW_AT_rank, dwarf::DW_FORM_sdata,
+ RankConst->getSExtValue());
+ } else if (auto *RankExpr = CTy->getRankExp()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(RankExpr);
+ addBlock(Buffer, dwarf::DW_AT_rank, DwarfExpr.finalize());
+ }
+
+ // Emit the element type.
+ addType(Buffer, CTy->getBaseType());
+
+ // Get an anonymous type for index type.
+ // FIXME: This type should be passed down from the front end
+ // as different languages may have different sizes for indexes.
+ DIE *IdxTy = getIndexTyDie();
+
+ // Add subranges to array type.
+ DINodeArray Elements = CTy->getElements();
+ for (DINode *E : Elements) {
+ // FIXME: Should this really be such a loose cast?
+ if (auto *Element = dyn_cast_or_null<DINode>(E)) {
+ if (Element->getTag() == dwarf::DW_TAG_subrange_type)
+ constructSubrangeDIE(Buffer, cast<DISubrange>(Element), IdxTy);
+ else if (Element->getTag() == dwarf::DW_TAG_generic_subrange)
+ constructGenericSubrangeDIE(Buffer, cast<DIGenericSubrange>(Element),
+ IdxTy);
+ }
+ }
+}
+
+void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
+ const DIType *DTy = CTy->getBaseType();
+ bool IsUnsigned = DTy && DD->isUnsignedDIType(DTy);
+ if (DTy) {
+ if (DD->getDwarfVersion() >= 3)
+ addType(Buffer, DTy);
+ if (DD->getDwarfVersion() >= 4 && (CTy->getFlags() & DINode::FlagEnumClass))
+ addFlag(Buffer, dwarf::DW_AT_enum_class);
+ }
+
+ auto *Context = CTy->getScope();
+ bool IndexEnumerators = !Context || isa<DICompileUnit>(Context) || isa<DIFile>(Context) ||
+ isa<DINamespace>(Context) || isa<DICommonBlock>(Context);
+ DINodeArray Elements = CTy->getElements();
+
+ // Add enumerators to enumeration type.
+ for (const DINode *E : Elements) {
+ auto *Enum = dyn_cast_or_null<DIEnumerator>(E);
+ if (Enum) {
+ DIE &Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
+ StringRef Name = Enum->getName();
+ addString(Enumerator, dwarf::DW_AT_name, Name);
+ addConstantValue(Enumerator, Enum->getValue(), IsUnsigned);
+ if (IndexEnumerators)
+ addGlobalName(Name, Enumerator, Context);
+ }
+ }
+}
+
+void DwarfUnit::constructContainingTypeDIEs() {
+ for (auto &P : ContainingTypeMap) {
+ DIE &SPDie = *P.first;
+ const DINode *D = P.second;
+ if (!D)
+ continue;
+ DIE *NDie = getDIE(D);
+ if (!NDie)
+ continue;
+ addDIEEntry(SPDie, dwarf::DW_AT_containing_type, *NDie);
+ }
+}
+
+DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
+ DIE &MemberDie = createAndAddDIE(DT->getTag(), Buffer);
+ StringRef Name = DT->getName();
+ if (!Name.empty())
+ addString(MemberDie, dwarf::DW_AT_name, Name);
+
+ addAnnotation(MemberDie, DT->getAnnotations());
+
+ if (DIType *Resolved = DT->getBaseType())
+ addType(MemberDie, Resolved);
+
+ addSourceLine(MemberDie, DT);
+
+ if (DT->getTag() == dwarf::DW_TAG_inheritance && DT->isVirtual()) {
+
+ // For C++, virtual base classes are not at a fixed offset. Use the
+ // following expression to extract the appropriate offset from the vtable:
+ // BaseAddr = ObAddr + *((*ObAddr) - Offset)
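+ //
+ // Read as DWARF stack operations: DW_OP_dup and DW_OP_deref leave
+ // [ObAddr, *ObAddr] (the vtable pointer); DW_OP_constu Offset and
+ // DW_OP_minus compute the slot holding the virtual base offset; the second
+ // DW_OP_deref loads that offset; DW_OP_plus adds it to ObAddr.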
+
+ DIELoc *VBaseLocationDie = new (DIEValueAllocator) DIELoc;
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_udata, DT->getOffsetInBits());
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+
+ addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie);
+ } else {
+ uint64_t Size = DT->getSizeInBits();
+ uint64_t FieldSize = DD->getBaseTypeSize(DT);
+ uint32_t AlignInBytes = DT->getAlignInBytes();
+ uint64_t OffsetInBytes;
+
+ bool IsBitfield = DT->isBitField();
+ if (IsBitfield) {
+ // Handle bitfields; assume bytes are 8 bits.
+ if (DD->useDWARF2Bitfields())
+ addUInt(MemberDie, dwarf::DW_AT_byte_size, std::nullopt, FieldSize / 8);
+ addUInt(MemberDie, dwarf::DW_AT_bit_size, std::nullopt, Size);
+
+ uint64_t Offset = DT->getOffsetInBits();
+ // We can't use DT->getAlignInBits() here: AlignInBits for member type
+ // is non-zero if and only if alignment was forced (e.g. _Alignas()),
+ // which can't be done with bitfields. Thus we use FieldSize here.
+ uint32_t AlignInBits = FieldSize;
+ uint32_t AlignMask = ~(AlignInBits - 1);
+ // The bits from the start of the storage unit to the start of the field.
+ uint64_t StartBitOffset = Offset - (Offset & AlignMask);
+ // The byte offset of the field's aligned storage unit inside the struct.
+ OffsetInBytes = (Offset - StartBitOffset) / 8;
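+ // Worked example with hypothetical values: a 3-bit field at Offset = 13 in
+ // a 32-bit storage unit gives AlignInBits = 32, StartBitOffset = 13 and
+ // OffsetInBytes = 0. The DWARF 4+ path emits DW_AT_data_bit_offset = 13,
+ // while the DWARF 2/3 path below emits DW_AT_bit_offset = 16 on a
+ // little-endian target (13 on big-endian).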
+
+ if (DD->useDWARF2Bitfields()) {
+ uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+ uint64_t FieldOffset = (HiMark - FieldSize);
+ Offset -= FieldOffset;
+
+ // Maybe we need to work from the other end.
+ if (Asm->getDataLayout().isLittleEndian())
+ Offset = FieldSize - (Offset + Size);
+
+ addUInt(MemberDie, dwarf::DW_AT_bit_offset, std::nullopt, Offset);
+ OffsetInBytes = FieldOffset >> 3;
+ } else {
+ addUInt(MemberDie, dwarf::DW_AT_data_bit_offset, std::nullopt, Offset);
+ }
+ } else {
+ // This is not a bitfield.
+ OffsetInBytes = DT->getOffsetInBits() / 8;
+ if (AlignInBytes)
+ addUInt(MemberDie, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
+ AlignInBytes);
+ }
+
+ if (DD->getDwarfVersion() <= 2) {
+ DIELoc *MemLocationDie = new (DIEValueAllocator) DIELoc;
+ addUInt(*MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+ addUInt(*MemLocationDie, dwarf::DW_FORM_udata, OffsetInBytes);
+ addBlock(MemberDie, dwarf::DW_AT_data_member_location, MemLocationDie);
+ } else if (!IsBitfield || DD->useDWARF2Bitfields()) {
+ // In DWARF v3, DW_FORM_data4/8 in DW_AT_data_member_location are
+ // interpreted as location-list pointers. Interpreting constants as
+ // pointers is not expected, so we use DW_FORM_udata to encode the
+ // constants here.
+ if (DD->getDwarfVersion() == 3)
+ addUInt(MemberDie, dwarf::DW_AT_data_member_location,
+ dwarf::DW_FORM_udata, OffsetInBytes);
+ else
+ addUInt(MemberDie, dwarf::DW_AT_data_member_location, std::nullopt,
+ OffsetInBytes);
+ }
+ }
+
+ addAccess(MemberDie, DT->getFlags());
+
+ if (DT->isVirtual())
+ addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1,
+ dwarf::DW_VIRTUALITY_virtual);
+
+ // Objective-C properties.
+ if (DINode *PNode = DT->getObjCProperty())
+ if (DIE *PDie = getDIE(PNode))
+ addAttribute(MemberDie, dwarf::DW_AT_APPLE_property,
+ dwarf::DW_FORM_ref4, DIEEntry(*PDie));
+
+ if (DT->isArtificial())
+ addFlag(MemberDie, dwarf::DW_AT_artificial);
+
+ return MemberDie;
+}
+
+DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
+ if (!DT)
+ return nullptr;
+
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(DT->getScope());
+ assert(dwarf::isType(ContextDIE->getTag()) &&
+ "Static member should belong to a type.");
+
+ if (DIE *StaticMemberDIE = getDIE(DT))
+ return StaticMemberDIE;
+
+ DIE &StaticMemberDIE = createAndAddDIE(DT->getTag(), *ContextDIE, DT);
+
+ const DIType *Ty = DT->getBaseType();
+
+ addString(StaticMemberDIE, dwarf::DW_AT_name, DT->getName());
+ addType(StaticMemberDIE, Ty);
+ addSourceLine(StaticMemberDIE, DT);
+ addFlag(StaticMemberDIE, dwarf::DW_AT_external);
+ addFlag(StaticMemberDIE, dwarf::DW_AT_declaration);
+
+ // FIXME: We could omit private if the parent is a class_type, and
+ // public if the parent is something else.
+ addAccess(StaticMemberDIE, DT->getFlags());
+
+ if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT->getConstant()))
+ addConstantValue(StaticMemberDIE, CI, Ty);
+ if (const ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(DT->getConstant()))
+ addConstantFPValue(StaticMemberDIE, CFP);
+
+ if (uint32_t AlignInBytes = DT->getAlignInBytes())
+ addUInt(StaticMemberDIE, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
+ AlignInBytes);
+
+ return &StaticMemberDIE;
+}
+
+void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
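+ // For a 32-bit format DWARF v5 unit, the header emitted here is, e.g.:
+ // unit length (4 bytes), version = 5 (2 bytes), unit type (1 byte),
+ // address size (1 byte), offset into .debug_abbrev (4 bytes). For DWARF v4
+ // and earlier, the address size instead follows the abbreviation offset.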
+ // Emit the size of the content, not including the length field itself.
+ if (!DD->useSectionsAsReferences())
+ EndLabel = Asm->emitDwarfUnitLength(
+ isDwoUnit() ? "debug_info_dwo" : "debug_info", "Length of Unit");
+ else
+ Asm->emitDwarfUnitLength(getHeaderSize() + getUnitDie().getSize(),
+ "Length of Unit");
+
+ Asm->OutStreamer->AddComment("DWARF version number");
+ unsigned Version = DD->getDwarfVersion();
+ Asm->emitInt16(Version);
+
+ // DWARF v5 reorders the address size and adds a unit type.
+ if (Version >= 5) {
+ Asm->OutStreamer->AddComment("DWARF Unit Type");
+ Asm->emitInt8(UT);
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->emitInt8(Asm->MAI->getCodePointerSize());
+ }
+
+ // We share one abbreviations table across all units so it's always at the
+ // start of the section. Use a relocatable offset where needed to ensure
+ // linking doesn't invalidate that offset.
+ Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ if (UseOffsets)
+ Asm->emitDwarfLengthOrOffset(0);
+ else
+ Asm->emitDwarfSymbolReference(
+ TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false);
+
+ if (Version <= 4) {
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->emitInt8(Asm->MAI->getCodePointerSize());
+ }
+}
+
+void DwarfTypeUnit::emitHeader(bool UseOffsets) {
+ DwarfUnit::emitCommonHeader(UseOffsets,
+ DD->useSplitDwarf() ? dwarf::DW_UT_split_type
+ : dwarf::DW_UT_type);
+ Asm->OutStreamer->AddComment("Type Signature");
+ Asm->OutStreamer->emitIntValue(TypeSignature, sizeof(TypeSignature));
+ Asm->OutStreamer->AddComment("Type DIE Offset");
+ // In a skeleton type unit there is no type DIE so emit a zero offset.
+ Asm->emitDwarfLengthOrOffset(Ty ? Ty->getOffset() : 0);
+}
+
+void DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Hi, const MCSymbol *Lo) {
+ addAttribute(Die, Attribute, DD->getDwarfSectionOffsetForm(),
+ new (DIEValueAllocator) DIEDelta(Hi, Lo));
+}
+
+void DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label, const MCSymbol *Sec) {
+ if (Asm->doesDwarfUseRelocationsAcrossSections())
+ addLabel(Die, Attribute, DD->getDwarfSectionOffsetForm(), Label);
+ else
+ addSectionDelta(Die, Attribute, Label, Sec);
+}
+
+bool DwarfTypeUnit::isDwoUnit() const {
+ // Since there are no skeleton type units, all type units are dwo type units
+ // when split DWARF is being used.
+ return DD->useSplitDwarf();
+}
+
+void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) {
+ getCU().addGlobalNameForTypeUnit(Name, Context);
+}
+
+void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
+ getCU().addGlobalTypeUnitType(Ty, Context);
+}
+
+const MCSymbol *DwarfUnit::getCrossSectionRelativeBaseAddress() const {
+ if (!Asm->doesDwarfUseRelocationsAcrossSections())
+ return nullptr;
+ if (isDwoUnit())
+ return nullptr;
+ return getSection()->getBeginSymbol();
+}
+
+void DwarfUnit::addStringOffsetsStart() {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ addSectionLabel(getUnitDie(), dwarf::DW_AT_str_offsets_base,
+ DU->getStringOffsetsStartSym(),
+ TLOF.getDwarfStrOffSection()->getBeginSymbol());
+}
+
+void DwarfUnit::addRnglistsBase() {
+ assert(DD->getDwarfVersion() >= 5 &&
+ "DW_AT_rnglists_base requires DWARF version 5 or later");
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ addSectionLabel(getUnitDie(), dwarf::DW_AT_rnglists_base,
+ DU->getRnglistsTableBaseSym(),
+ TLOF.getDwarfRnglistsSection()->getBeginSymbol());
+}
+
+void DwarfTypeUnit::finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) {
+ DD->getAddressPool().resetUsedFlag(true);
+}
+
+bool DwarfUnit::isCompatibleWithVersion(uint16_t Version) const {
+ return !Asm->TM.Options.DebugStrictDwarf || DD->getDwarfVersion() >= Version;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
new file mode 100644
index 000000000000..8f17e94c2d1c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -0,0 +1,390 @@
+//===-- llvm/CodeGen/DwarfUnit.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF compile units.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
+
+#include "DwarfDebug.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/Target/TargetMachine.h"
+#include <optional>
+#include <string>
+
+namespace llvm {
+
+class ConstantFP;
+class ConstantInt;
+class DwarfCompileUnit;
+class MCDwarfDwoLineTable;
+class MCSymbol;
+
+//===----------------------------------------------------------------------===//
+/// This dwarf writer support class manages information associated with a
+/// source file.
+class DwarfUnit : public DIEUnit {
+protected:
+ /// MDNode for the compile unit.
+ const DICompileUnit *CUNode;
+
+ // All DIEValues are allocated through this allocator.
+ BumpPtrAllocator DIEValueAllocator;
+
+ /// Target of Dwarf emission.
+ AsmPrinter *Asm;
+
+ /// Emitted at the end of the CU and used to compute the CU Length field.
+ MCSymbol *EndLabel = nullptr;
+
+ // Holders for some common dwarf information.
+ DwarfDebug *DD;
+ DwarfFile *DU;
+
+ /// An anonymous type for index type. Owned by DIEUnit.
+ DIE *IndexTyDie = nullptr;
+
+ /// Tracks the mapping of unit level debug information variables to debug
+ /// information entries.
+ DenseMap<const MDNode *, DIE *> MDNodeToDieMap;
+
+ /// A list of all the DIEBlocks in use.
+ std::vector<DIEBlock *> DIEBlocks;
+
+ /// A list of all the DIELocs in use.
+ std::vector<DIELoc *> DIELocs;
+
+ /// This map is used to keep track of subprogram DIEs that need
+ /// DW_AT_containing_type attribute. This attribute points to a DIE that
+ /// corresponds to the MDNode mapped with the subprogram DIE.
+ DenseMap<DIE *, const DINode *> ContainingTypeMap;
+
+ DwarfUnit(dwarf::Tag, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW,
+ DwarfFile *DWU);
+
+ bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie, bool Minimal);
+
+ bool isShareableAcrossCUs(const DINode *D) const;
+
+ template <typename T>
+ void addAttribute(DIEValueList &Die, dwarf::Attribute Attribute,
+ dwarf::Form Form, T &&Value) {
+ // For strict DWARF mode, only generate attributes available to current
+ // DWARF version.
+ // Attribute 0 is used when emitting form-encoded values in blocks, which
+ // don't have attributes (only forms) so we cannot detect their DWARF
+ // version compatibility here and assume they are compatible.
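+ // For example, under strict DWARF targeting version 4, an attempt to add
+ // DW_AT_noreturn (introduced in DWARF v5) is dropped here.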
+ if (Attribute != 0 && Asm->TM.Options.DebugStrictDwarf &&
+ DD->getDwarfVersion() < dwarf::AttributeVersion(Attribute))
+ return;
+
+ Die.addValue(DIEValueAllocator,
+ DIEValue(Attribute, Form, std::forward<T>(Value)));
+ }
+
+public:
+ // Accessors.
+ AsmPrinter* getAsmPrinter() const { return Asm; }
+ MCSymbol *getEndLabel() const { return EndLabel; }
+ uint16_t getLanguage() const { return CUNode->getSourceLanguage(); }
+ const DICompileUnit *getCUNode() const { return CUNode; }
+ DwarfDebug &getDwarfDebug() const { return *DD; }
+
+ /// Return true if this compile unit has something to write out.
+ bool hasContent() const { return getUnitDie().hasChildren(); }
+
+ /// Get string containing language specific context for a global name.
+ ///
+ /// Walks the metadata parent chain in a language specific manner (using the
+ /// compile unit language) and returns it as a string. This is done at the
+ /// metadata level because DIEs may not currently have been added to the
+ /// parent context and walking the DIEs looking for names is more expensive
+ /// than walking the metadata.
+ std::string getParentContextString(const DIScope *Context) const;
+
+ /// Add a new global name to the compile unit.
+ virtual void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) = 0;
+
+ /// Add a new global type to the compile unit.
+ virtual void addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) = 0;
+
+ /// Returns the DIE map slot for the specified debug variable.
+ ///
+ /// We delegate the request to DwarfDebug when the MDNode can be part of the
+ /// type system, since DIEs for the type system can be shared across CUs and
+ /// the mappings are kept in DwarfDebug.
+ DIE *getDIE(const DINode *D) const;
+
+ /// Returns a fresh newly allocated DIELoc.
+ DIELoc *getDIELoc() { return new (DIEValueAllocator) DIELoc; }
+
+ /// Insert DIE into the map.
+ ///
+ /// We delegate the request to DwarfDebug when the MDNode can be part of the
+ /// type system, since DIEs for the type system can be shared across CUs and
+ /// the mappings are kept in DwarfDebug.
+ void insertDIE(const DINode *Desc, DIE *D);
+
+ void insertDIE(DIE *D);
+
+ /// Add a flag that is true to the DIE.
+ void addFlag(DIE &Die, dwarf::Attribute Attribute);
+
+ /// Add an unsigned integer attribute data and value.
+ void addUInt(DIEValueList &Die, dwarf::Attribute Attribute,
+ std::optional<dwarf::Form> Form, uint64_t Integer);
+
+ void addUInt(DIEValueList &Block, dwarf::Form Form, uint64_t Integer);
+
+ /// Add a signed integer attribute data and value.
+ void addSInt(DIEValueList &Die, dwarf::Attribute Attribute,
+ std::optional<dwarf::Form> Form, int64_t Integer);
+
+ void addSInt(DIELoc &Die, std::optional<dwarf::Form> Form, int64_t Integer);
+
+ /// Add a string attribute data and value.
+ ///
+ /// We always emit a reference to the string pool instead of immediate
+ /// strings so that DIEs have more predictable sizes. In the case of split
+ /// dwarf we emit an index into another table which gets us the static offset
+ /// into the string table.
+ void addString(DIE &Die, dwarf::Attribute Attribute, StringRef Str);
+
+ /// Add a Dwarf label attribute data and value.
+ void addLabel(DIEValueList &Die, dwarf::Attribute Attribute, dwarf::Form Form,
+ const MCSymbol *Label);
+
+ void addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label);
+
+ /// Add an offset into a section attribute data and value.
+ void addSectionOffset(DIE &Die, dwarf::Attribute Attribute, uint64_t Integer);
+
+ /// Add a dwarf op address data and value using the form given and an
+ /// op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
+ void addOpAddress(DIELoc &Die, const MCSymbol *Sym);
+ void addPoolOpAddress(DIEValueList &Die, const MCSymbol *Label);
+
+ /// Add a label delta attribute data and value.
+ void addLabelDelta(DIEValueList &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Hi, const MCSymbol *Lo);
+
+ /// Add a DIE attribute data and value.
+ void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry);
+
+ /// Add a DIE attribute data and value.
+ void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry);
+
+ /// Add a type's DW_AT_signature and set the declaration flag.
+ void addDIETypeSignature(DIE &Die, uint64_t Signature);
+
+ /// Add block data.
+ void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Loc);
+
+ /// Add block data.
+ void addBlock(DIE &Die, dwarf::Attribute Attribute, DIEBlock *Block);
+ void addBlock(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form,
+ DIEBlock *Block);
+
+ /// Add location information to specified debug information entry.
+ void addSourceLine(DIE &Die, unsigned Line, const DIFile *File);
+ void addSourceLine(DIE &Die, const DILocalVariable *V);
+ void addSourceLine(DIE &Die, const DIGlobalVariable *G);
+ void addSourceLine(DIE &Die, const DISubprogram *SP);
+ void addSourceLine(DIE &Die, const DILabel *L);
+ void addSourceLine(DIE &Die, const DIType *Ty);
+ void addSourceLine(DIE &Die, const DIObjCProperty *Ty);
+
+ /// Add constant value entry in variable DIE.
+ void addConstantValue(DIE &Die, const ConstantInt *CI, const DIType *Ty);
+ void addConstantValue(DIE &Die, const APInt &Val, const DIType *Ty);
+ void addConstantValue(DIE &Die, const APInt &Val, bool Unsigned);
+ void addConstantValue(DIE &Die, uint64_t Val, const DIType *Ty);
+ void addConstantValue(DIE &Die, bool Unsigned, uint64_t Val);
+
+ /// Add constant value entry in variable DIE.
+ void addConstantFPValue(DIE &Die, const ConstantFP *CFP);
+
+ /// Add a linkage name, if it isn't empty.
+ void addLinkageName(DIE &Die, StringRef LinkageName);
+
+ /// Add template parameters in buffer.
+ void addTemplateParams(DIE &Buffer, DINodeArray TParams);
+
+ /// Add thrown types.
+ void addThrownTypes(DIE &Die, DINodeArray ThrownTypes);
+
+ /// Add the accessibility attribute.
+ void addAccess(DIE &Die, DINode::DIFlags Flags);
+
+ /// Add a new type attribute to the specified entity.
+ ///
+ /// This takes an attribute parameter because DW_AT_friend attributes are
+ /// also type references.
+ void addType(DIE &Entity, const DIType *Ty,
+ dwarf::Attribute Attribute = dwarf::DW_AT_type);
+
+ DIE *getOrCreateNameSpace(const DINamespace *NS);
+ DIE *getOrCreateModule(const DIModule *M);
+ DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false);
+
+ void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
+ bool SkipSPAttributes = false);
+
+ /// Creates type DIE with specific context.
+ DIE *createTypeDIE(const DIScope *Context, DIE &ContextDIE, const DIType *Ty);
+
+ /// Find existing DIE or create new DIE for the given type.
+ virtual DIE *getOrCreateTypeDIE(const MDNode *TyNode);
+
+ /// Get context owner's DIE.
+ virtual DIE *getOrCreateContextDIE(const DIScope *Context);
+
+ /// Construct DIEs for types that contain vtables.
+ void constructContainingTypeDIEs();
+
+ /// Construct function argument DIEs.
+ void constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args);
+
+ /// Create a DIE with the given Tag, add the DIE to its parent, and
+ /// call insertDIE if N is not null.
+ DIE &createAndAddDIE(dwarf::Tag Tag, DIE &Parent, const DINode *N = nullptr);
+
+ bool useSegmentedStringOffsetsTable() const {
+ return DD->useSegmentedStringOffsetsTable();
+ }
+
+ /// Compute the size of a header for this unit, not including the initial
+ /// length field.
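+ /// For example, a 32-bit format DWARF v5 unit contributes 2 + 4 + 1 + 1 = 8
+ /// bytes here; DwarfTypeUnit::getHeaderSize adds the type signature and the
+ /// type DIE offset on top of that.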
+ virtual unsigned getHeaderSize() const {
+ return sizeof(int16_t) + // DWARF version number
+ Asm->getDwarfOffsetByteSize() + // Offset Into Abbrev. Section
+ sizeof(int8_t) + // Pointer Size (in bytes)
+ (DD->getDwarfVersion() >= 5 ? sizeof(int8_t)
+ : 0); // DWARF v5 unit type
+ }
+
+ /// Emit the header for this unit, not including the initial length field.
+ virtual void emitHeader(bool UseOffsets) = 0;
+
+ /// Add the DW_AT_str_offsets_base attribute to the unit DIE.
+ void addStringOffsetsStart();
+
+ /// Add the DW_AT_rnglists_base attribute to the unit DIE.
+ void addRnglistsBase();
+
+ virtual DwarfCompileUnit &getCU() = 0;
+
+ void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy);
+
+ /// addSectionDelta - Add a label delta attribute data and value.
+ void addSectionDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi,
+ const MCSymbol *Lo);
+
+ /// Add a Dwarf section label attribute data and value.
+ void addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label, const MCSymbol *Sec);
+
+ /// Add DW_TAG_LLVM_annotation.
+ void addAnnotation(DIE &Buffer, DINodeArray Annotations);
+
+ /// Create a type DIE for the given composite type.
+ DIE *createTypeDIE(const DICompositeType *Ty);
+
+protected:
+ ~DwarfUnit();
+
+ /// Create new static data member DIE.
+ DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT);
+
+ /// Look up the source ID for the given file. If none currently exists,
+ /// create a new ID and insert it in the line table.
+ virtual unsigned getOrCreateSourceID(const DIFile *File) = 0;
+
+ /// Emit the common part of the header for this unit.
+ void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT);
+
+private:
+ void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy);
+ void constructTypeDIE(DIE &Buffer, const DIStringType *BTy);
+ void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy);
+ void constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy);
+ void constructSubrangeDIE(DIE &Buffer, const DISubrange *SR, DIE *IndexTy);
+ void constructGenericSubrangeDIE(DIE &Buffer, const DIGenericSubrange *SR,
+ DIE *IndexTy);
+ void constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy);
+ void constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy);
+ DIE &constructMemberDIE(DIE &Buffer, const DIDerivedType *DT);
+ void constructTemplateTypeParameterDIE(DIE &Buffer,
+ const DITemplateTypeParameter *TP);
+ void constructTemplateValueParameterDIE(DIE &Buffer,
+ const DITemplateValueParameter *TVP);
+
+ /// Return the default lower bound for an array.
+ ///
+ /// If the DWARF version doesn't handle the language, return -1.
+ int64_t getDefaultLowerBound() const;
+
+ /// Get an anonymous type for index type.
+ DIE *getIndexTyDie();
+
+ /// Set D as anonymous type for index which can be reused later.
+ void setIndexTyDie(DIE *D) { IndexTyDie = D; }
+
+ virtual void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) = 0;
+
+ /// If this is a named finished type then include it in the list of types for
+ /// the accelerator tables.
+ void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
+ const DIE &TyDIE);
+
+ virtual bool isDwoUnit() const = 0;
+ const MCSymbol *getCrossSectionRelativeBaseAddress() const override;
+
+ /// Returns 'true' if the current DwarfVersion is compatible
+ /// with the specified \p Version.
+ bool isCompatibleWithVersion(uint16_t Version) const;
+};
+
+class DwarfTypeUnit final : public DwarfUnit {
+ uint64_t TypeSignature;
+ const DIE *Ty;
+ DwarfCompileUnit &CU;
+ MCDwarfDwoLineTable *SplitLineTable;
+ bool UsedLineTable = false;
+
+ unsigned getOrCreateSourceID(const DIFile *File) override;
+ void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) override;
+ bool isDwoUnit() const override;
+
+public:
+ DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A, DwarfDebug *DW,
+ DwarfFile *DWU, MCDwarfDwoLineTable *SplitLineTable = nullptr);
+
+ void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; }
+ void setType(const DIE *Ty) { this->Ty = Ty; }
+
+ /// Emit the header for this unit, not including the initial length field.
+ void emitHeader(bool UseOffsets) override;
+ unsigned getHeaderSize() const override {
+ return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature
+ Asm->getDwarfOffsetByteSize(); // Type DIE Offset
+ }
+ void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) override;
+ void addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
+ DwarfCompileUnit &getCU() override { return CU; }
+};
+} // end llvm namespace
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
new file mode 100644
index 000000000000..eef6b1d93f36
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -0,0 +1,850 @@
+//===- CodeGen/AsmPrinter/EHStreamer.cpp - Exception Directive Streamer ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing exception info into assembly files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "EHStreamer.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+EHStreamer::EHStreamer(AsmPrinter *A) : Asm(A), MMI(Asm->MMI) {}
+
+EHStreamer::~EHStreamer() = default;
+
+/// How many leading type ids two landing pads have in common.
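+/// For example, type id lists {1, 2, 3} and {1, 2, 5} share their first two
+/// entries, so this returns 2.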
+unsigned EHStreamer::sharedTypeIDs(const LandingPadInfo *L,
+ const LandingPadInfo *R) {
+ const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+ return std::mismatch(LIds.begin(), LIds.end(), RIds.begin(), RIds.end())
+ .first -
+ LIds.begin();
+}
+
+/// Compute the actions table and gather the first action index for each landing
+/// pad site.
+void EHStreamer::computeActionsTable(
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ SmallVectorImpl<ActionEntry> &Actions,
+ SmallVectorImpl<unsigned> &FirstActions) {
+ // The action table follows the call-site table in the LSDA. The individual
+ // records are of two types:
+ //
+ // * Catch clause
+ // * Exception specification
+ //
+ // The two record kinds have the same format, with only small differences.
+ // They are distinguished by the "switch value" field: Catch clauses
+ // (TypeInfos) have strictly positive switch values, and exception
+ // specifications (FilterIds) have strictly negative switch values. Value 0
+ // indicates a catch-all clause.
+ //
+ // Negative type IDs index into FilterIds. Positive type IDs index into
+ // TypeInfos. The value written for a positive type ID is just the type ID
+ // itself. For a negative type ID, however, the value written is the
+ // (negative) byte offset of the corresponding FilterIds entry. The byte
+ // offset is usually equal to the type ID (because the FilterIds entries are
+ // written using a variable width encoding, which outputs one byte per entry
+ // as long as the value written is not too large) but can differ. This kind
+ // of complication does not occur for positive type IDs because type infos are
+ // output using a fixed width encoding. FilterOffsets[i] holds the byte
+ // offset corresponding to FilterIds[i].
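+ //
+ // As a hypothetical example, with FilterIds = {2, 200} the FilterOffsets are
+ // {-1, -2}, so a type ID of -1 is written as -1 and a type ID of -2 as -2.
+ // Had the order been {200, 2}, the ULEB128 encoding of 200 takes two bytes,
+ // so the second offset would be -3 even though the second type ID is -2.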
+
+ const std::vector<unsigned> &FilterIds = Asm->MF->getFilterIds();
+ SmallVector<int, 16> FilterOffsets;
+ FilterOffsets.reserve(FilterIds.size());
+ int Offset = -1;
+
+ for (unsigned FilterId : FilterIds) {
+ FilterOffsets.push_back(Offset);
+ Offset -= getULEB128Size(FilterId);
+ }
+
+ FirstActions.reserve(LandingPads.size());
+
+ int FirstAction = 0;
+ unsigned SizeActions = 0; // Total size of all action entries for a function
+ const LandingPadInfo *PrevLPI = nullptr;
+
+ for (const LandingPadInfo *LPI : LandingPads) {
+ const std::vector<int> &TypeIds = LPI->TypeIds;
+ unsigned NumShared = PrevLPI ? sharedTypeIDs(LPI, PrevLPI) : 0;
+ unsigned SizeSiteActions = 0; // Total size of all entries for a landingpad
+
+ if (NumShared < TypeIds.size()) {
+ // Size of one action entry (typeid + next action)
+ unsigned SizeActionEntry = 0;
+ unsigned PrevAction = (unsigned)-1;
+
+ if (NumShared) {
+ unsigned SizePrevIds = PrevLPI->TypeIds.size();
+ assert(Actions.size());
+ PrevAction = Actions.size() - 1;
+ SizeActionEntry = getSLEB128Size(Actions[PrevAction].NextAction) +
+ getSLEB128Size(Actions[PrevAction].ValueForTypeID);
+
+ for (unsigned j = NumShared; j != SizePrevIds; ++j) {
+ assert(PrevAction != (unsigned)-1 && "PrevAction is invalid!");
+ SizeActionEntry -= getSLEB128Size(Actions[PrevAction].ValueForTypeID);
+ SizeActionEntry += -Actions[PrevAction].NextAction;
+ PrevAction = Actions[PrevAction].Previous;
+ }
+ }
+
+ // Compute the actions.
+ for (unsigned J = NumShared, M = TypeIds.size(); J != M; ++J) {
+ int TypeID = TypeIds[J];
+ assert(-1 - TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
+ int ValueForTypeID =
+ isFilterEHSelector(TypeID) ? FilterOffsets[-1 - TypeID] : TypeID;
+ unsigned SizeTypeID = getSLEB128Size(ValueForTypeID);
+
+ int NextAction = SizeActionEntry ? -(SizeActionEntry + SizeTypeID) : 0;
+ SizeActionEntry = SizeTypeID + getSLEB128Size(NextAction);
+ SizeSiteActions += SizeActionEntry;
+
+ ActionEntry Action = { ValueForTypeID, NextAction, PrevAction };
+ Actions.push_back(Action);
+ PrevAction = Actions.size() - 1;
+ }
+
+ // Record the first action of the landing pad site.
+ FirstAction = SizeActions + SizeSiteActions - SizeActionEntry + 1;
+ } // else identical - re-use previous FirstAction
+
+ // Information used when creating the call-site table. The action record
+ // field of the call site record is the offset of the first associated
+ // action record, relative to the start of the actions table. This value is
+ // biased by 1 (1 indicating the start of the actions table), and 0
+ // indicates that there are no actions.
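+ //
+ // Worked example (hypothetical sizes, not from the source): if previous
+ // sites contributed 6 bytes of actions and this site added two 2-byte
+ // entries, FirstAction = 6 + 4 - 2 + 1 = 9, the 1-biased byte offset of
+ // the entry that starts this site's backward-linked action chain.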
+ FirstActions.push_back(FirstAction);
+
+ // Compute this site's contribution to size.
+ SizeActions += SizeSiteActions;
+
+ PrevLPI = LPI;
+ }
+}
+
+/// Return `true' if this is a call to a function marked `nounwind'. Return
+/// `false' otherwise.
+bool EHStreamer::callToNoUnwindFunction(const MachineInstr *MI) {
+ assert(MI->isCall() && "This should be a call instruction!");
+
+ bool MarkedNoUnwind = false;
+ bool SawFunc = false;
+
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isGlobal()) continue;
+
+ const Function *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F) continue;
+
+ if (SawFunc) {
+ // Be conservative. If we have more than one function operand for this
+ // call, then we can't make the assumption that it's the callee and
+ // not a parameter to the call.
+ //
+ // FIXME: Determine if there's a way to say that `F' is the callee or
+ // parameter.
+ MarkedNoUnwind = false;
+ break;
+ }
+
+ MarkedNoUnwind = F->doesNotThrow();
+ SawFunc = true;
+ }
+
+ return MarkedNoUnwind;
+}
+
+void EHStreamer::computePadMap(
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ RangeMapType &PadMap) {
+ // Invokes and nounwind calls have entries in PadMap (due to being bracketed
+ // by try-range labels when lowered). Ordinary calls do not, so appropriate
+ // try-ranges for them need to be deduced so we can put them in the LSDA.
+ for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+ const LandingPadInfo *LandingPad = LandingPads[i];
+ for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+ MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
+ MCSymbol *EndLabel = LandingPad->EndLabels[j];
+ // If we have deleted the code for a given invoke after registering it in
+ // the LandingPad label list, the associated symbols will not have been
+ // emitted. In that case, ignore this callsite entry.
+ if (!BeginLabel->isDefined() || !EndLabel->isDefined())
+ continue;
+ assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+ PadRange P = { i, j };
+ PadMap[BeginLabel] = P;
+ }
+ }
+}
+
+/// Compute the call-site table. The entry for an invoke has a try-range
+/// containing the call, a non-zero landing pad, and an appropriate action. The
+/// entry for an ordinary call has a try-range containing the call and zero for
+/// the landing pad and the action. Calls marked 'nounwind' have no entry and
+/// must not be contained in the try-range of any entry - they form gaps in the
+/// table. Entries must be ordered by try-range address.
+///
+/// Call-sites are split into one or more call-site ranges associated with
+/// different sections of the function.
+///
+/// - Without -basic-block-sections, all call-sites are grouped into one
+/// call-site-range corresponding to the function section.
+///
+/// - With -basic-block-sections, one call-site range is created for each
+/// section, with its FragmentBeginLabel and FragmentEndLabel respectively
+/// set to the beginning and ending of the corresponding section and its
+/// ExceptionLabel set to the exception symbol dedicated for this section.
+/// Later, one LSDA header will be emitted for each call-site range with its
+/// call-sites following. The action table and type info table will be
+/// shared across all ranges.
+void EHStreamer::computeCallSiteTable(
+ SmallVectorImpl<CallSiteEntry> &CallSites,
+ SmallVectorImpl<CallSiteRange> &CallSiteRanges,
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ const SmallVectorImpl<unsigned> &FirstActions) {
+ RangeMapType PadMap;
+ computePadMap(LandingPads, PadMap);
+
+ // The end label of the previous invoke or nounwind try-range.
+ MCSymbol *LastLabel = Asm->getFunctionBegin();
+
+ // Whether there is a potentially throwing instruction (currently this means
+ // an ordinary call) between the end of the previous try-range and now.
+ bool SawPotentiallyThrowing = false;
+
+ // Whether the last CallSite entry was for an invoke.
+ bool PreviousIsInvoke = false;
+
+ bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+
+ // Visit all instructions in order of address.
+ for (const auto &MBB : *Asm->MF) {
+ if (&MBB == &Asm->MF->front() || MBB.isBeginSection()) {
+ // We start a call-site range upon function entry and at the beginning of
+ // every basic block section.
+ CallSiteRanges.push_back(
+ {Asm->MBBSectionRanges[MBB.getSectionIDNum()].BeginLabel,
+ Asm->MBBSectionRanges[MBB.getSectionIDNum()].EndLabel,
+ Asm->getMBBExceptionSym(MBB), CallSites.size()});
+ PreviousIsInvoke = false;
+ SawPotentiallyThrowing = false;
+ LastLabel = nullptr;
+ }
+
+ if (MBB.isEHPad())
+ CallSiteRanges.back().IsLPRange = true;
+
+ for (const auto &MI : MBB) {
+ if (!MI.isEHLabel()) {
+ if (MI.isCall())
+ SawPotentiallyThrowing |= !callToNoUnwindFunction(&MI);
+ continue;
+ }
+
+ // End of the previous try-range?
+ MCSymbol *BeginLabel = MI.getOperand(0).getMCSymbol();
+ if (BeginLabel == LastLabel)
+ SawPotentiallyThrowing = false;
+
+ // Beginning of a new try-range?
+ RangeMapType::const_iterator L = PadMap.find(BeginLabel);
+ if (L == PadMap.end())
+ // Nope, it was just some random label.
+ continue;
+
+ const PadRange &P = L->second;
+ const LandingPadInfo *LandingPad = LandingPads[P.PadIndex];
+ assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] &&
+ "Inconsistent landing pad map!");
+
+ // For Dwarf and AIX exception handling (SjLj handling doesn't use this).
+ // If some instruction between the previous try-range and this one may
+ // throw, create a call-site entry with no landing pad for the region
+ // between the try-ranges.
+ if (SawPotentiallyThrowing &&
+ (Asm->MAI->usesCFIForEH() ||
+ Asm->MAI->getExceptionHandlingType() == ExceptionHandling::AIX)) {
+ CallSites.push_back({LastLabel, BeginLabel, nullptr, 0});
+ PreviousIsInvoke = false;
+ }
+
+ LastLabel = LandingPad->EndLabels[P.RangeIndex];
+ assert(BeginLabel && LastLabel && "Invalid landing pad!");
+
+ if (!LandingPad->LandingPadLabel) {
+ // Create a gap.
+ PreviousIsInvoke = false;
+ } else {
+ // This try-range is for an invoke.
+ CallSiteEntry Site = {
+ BeginLabel,
+ LastLabel,
+ LandingPad,
+ FirstActions[P.PadIndex]
+ };
+
+ // Try to merge with the previous call-site. SJLJ doesn't do this.
+ if (PreviousIsInvoke && !IsSJLJ) {
+ CallSiteEntry &Prev = CallSites.back();
+ if (Site.LPad == Prev.LPad && Site.Action == Prev.Action) {
+ // Extend the range of the previous entry.
+ Prev.EndLabel = Site.EndLabel;
+ continue;
+ }
+ }
+
+ // Otherwise, create a new call-site.
+ if (!IsSJLJ)
+ CallSites.push_back(Site);
+ else {
+ // SjLj EH must maintain the call sites in the order assigned
+ // to them by the SjLjPrepare pass.
+ unsigned SiteNo = Asm->MF->getCallSiteBeginLabel(BeginLabel);
+ if (CallSites.size() < SiteNo)
+ CallSites.resize(SiteNo);
+ CallSites[SiteNo - 1] = Site;
+ }
+ PreviousIsInvoke = true;
+ }
+ }
+
+ // We end the call-site range upon function exit and at the end of every
+ // basic block section.
+ if (&MBB == &Asm->MF->back() || MBB.isEndSection()) {
+ // If some instruction between the previous try-range and the end of the
+ // function may throw, create a call-site entry with no landing pad for
+ // the region following the try-range.
+ if (SawPotentiallyThrowing && !IsSJLJ) {
+ CallSiteEntry Site = {LastLabel, CallSiteRanges.back().FragmentEndLabel,
+ nullptr, 0};
+ CallSites.push_back(Site);
+ SawPotentiallyThrowing = false;
+ }
+ CallSiteRanges.back().CallSiteEndIdx = CallSites.size();
+ }
+ }
+}
+
+/// Emit landing pads and actions.
+///
+/// The general organization of the table is complex, but the basic concepts are
+/// easy. First there is a header which describes the location and organization
+/// of the three components that follow.
+///
+/// 1. The landing pad site information describes the range of code covered by
+/// the try. In our case it's an accumulation of the ranges covered by the
+/// invokes in the try. There is also a reference to the landing pad that
+/// handles the exception once processed. Finally an index into the actions
+/// table.
+/// 2. The action table, in our case, is composed of pairs of type IDs and next
+/// action offset. Starting with the action index from the landing pad
+/// site, each type ID is checked for a match to the current exception. If
+/// it matches then the exception and type id are passed on to the landing
+/// pad. Otherwise the next action is looked up. This chain is terminated
+/// with a next action of zero. If no type id is found then the frame is
+/// unwound and handling continues.
+/// 3. Type ID table contains references to all the C++ typeinfo for all
+/// catches in the function. This table is reverse indexed, base 1.
+///
+/// Returns the starting symbol of an exception table.
+MCSymbol *EHStreamer::emitExceptionTable() {
+ const MachineFunction *MF = Asm->MF;
+ const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MF->getFilterIds();
+ const std::vector<LandingPadInfo> &PadInfos = MF->getLandingPads();
+
+ // Sort the landing pads in order of their type ids. This is used to fold
+ // duplicate actions.
+ SmallVector<const LandingPadInfo *, 64> LandingPads;
+ LandingPads.reserve(PadInfos.size());
+
+ for (const LandingPadInfo &LPI : PadInfos) {
+ // If a landing-pad has an associated label, but the label wasn't ever
+ // emitted, then skip it. (This can occur if the landingpad's MBB was
+ // deleted).
+ if (LPI.LandingPadLabel && !LPI.LandingPadLabel->isDefined())
+ continue;
+ LandingPads.push_back(&LPI);
+ }
+
+ // Order landing pads lexicographically by type id.
+ llvm::sort(LandingPads, [](const LandingPadInfo *L, const LandingPadInfo *R) {
+ return L->TypeIds < R->TypeIds;
+ });
+
+ // Compute the actions table and gather the first action index for each
+ // landing pad site.
+ SmallVector<ActionEntry, 32> Actions;
+ SmallVector<unsigned, 64> FirstActions;
+ computeActionsTable(LandingPads, Actions, FirstActions);
+
+ // Compute the call-site table and call-site ranges. Normally, there is only
+ // one call-site-range which covers the whole function. With
+ // -basic-block-sections, there is one call-site-range per basic block
+ // section.
+ SmallVector<CallSiteEntry, 64> CallSites;
+ SmallVector<CallSiteRange, 4> CallSiteRanges;
+ computeCallSiteTable(CallSites, CallSiteRanges, LandingPads, FirstActions);
+
+ bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+ bool IsWasm = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::Wasm;
+ bool HasLEB128Directives = Asm->MAI->hasLEB128Directives();
+ unsigned CallSiteEncoding =
+ IsSJLJ ? static_cast<unsigned>(dwarf::DW_EH_PE_udata4) :
+ Asm->getObjFileLowering().getCallSiteEncoding();
+ bool HaveTTData = !TypeInfos.empty() || !FilterIds.empty();
+
+ // Type infos.
+ MCSection *LSDASection = Asm->getObjFileLowering().getSectionForLSDA(
+ MF->getFunction(), *Asm->CurrentFnSym, Asm->TM);
+ unsigned TTypeEncoding;
+
+ if (!HaveTTData) {
+ // If there is no TypeInfo, then we just explicitly say that we're omitting
+ // that bit.
+ TTypeEncoding = dwarf::DW_EH_PE_omit;
+ } else {
+ // Okay, we have actual filters or typeinfos to emit. As such, we need to
+ // pick a type encoding for them. We're about to emit a list of pointers to
+ // typeinfo objects at the end of the LSDA. However, unless we're in static
+ // mode, this reference will require a relocation by the dynamic linker.
+ //
+ // Because of this, we have a couple of options:
+ //
+ // 1) If we are in -static mode, we can always use an absolute reference
+ // from the LSDA, because the static linker will resolve it.
+ //
+ // 2) Otherwise, if the LSDA section is writable, we can output the direct
+ // reference to the typeinfo and allow the dynamic linker to relocate
+ // it. Since it is in a writable section, the dynamic linker won't
+ // have a problem.
+ //
+ // 3) Finally, if we're in PIC mode and the LSDA section isn't writable,
+ // we need to use some form of indirection. For example, on Darwin,
+ // we can output a statically-relocatable reference to a dyld stub. The
+ // offset to the stub is constant, but the contents are in a section
+ // that is updated by the dynamic linker. This is easy enough, but we
+ // need to tell the personality function of the unwinder to indirect
+ // through the dyld stub.
+ //
+ // FIXME: When (3) is actually implemented, we'll have to emit the stubs
+ // somewhere. This predicate should be moved to a shared location that is
+ // in target-independent code.
+ //
+ TTypeEncoding = Asm->getObjFileLowering().getTTypeEncoding();
+ }
+
+ // Begin the exception table.
+ // Sometimes we do not want to emit the data into a separate section (e.g.
+ // ARM EHABI). In that case LSDASection will be NULL.
+ if (LSDASection)
+ Asm->OutStreamer->switchSection(LSDASection);
+ Asm->emitAlignment(Align(4));
+
+ // Emit the LSDA.
+ MCSymbol *GCCETSym =
+ Asm->OutContext.getOrCreateSymbol(Twine("GCC_except_table")+
+ Twine(Asm->getFunctionNumber()));
+ Asm->OutStreamer->emitLabel(GCCETSym);
+ MCSymbol *CstEndLabel = Asm->createTempSymbol(
+ CallSiteRanges.size() > 1 ? "action_table_base" : "cst_end");
+
+ MCSymbol *TTBaseLabel = nullptr;
+ if (HaveTTData)
+ TTBaseLabel = Asm->createTempSymbol("ttbase");
+
+ const bool VerboseAsm = Asm->OutStreamer->isVerboseAsm();
+
+ // Helper for emitting references (offsets) for type table and the end of the
+ // call-site table (which marks the beginning of the action table).
+ // * For Itanium, these references will be emitted for every callsite range.
+ // * For SJLJ and Wasm, they will be emitted only once in the LSDA header.
+ auto EmitTypeTableRefAndCallSiteTableEndRef = [&]() {
+ Asm->emitEncodingByte(TTypeEncoding, "@TType");
+ if (HaveTTData) {
+ // N.B.: There is a dependency loop between the size of the TTBase uleb128
+ // here and the amount of padding before the aligned type table. The
+ // assembler must sometimes pad this uleb128 or insert extra padding
+ // before the type table. See PR35809 or GNU as bug 4029.
+ MCSymbol *TTBaseRefLabel = Asm->createTempSymbol("ttbaseref");
+ Asm->emitLabelDifferenceAsULEB128(TTBaseLabel, TTBaseRefLabel);
+ Asm->OutStreamer->emitLabel(TTBaseRefLabel);
+ }
+
+ // The Action table follows the call-site table. So we emit the
+ // label difference from here (start of the call-site table for SJLJ and
+ // Wasm, and start of a call-site range for Itanium) to the end of the
+ // whole call-site table (end of the last call-site range for Itanium).
+ MCSymbol *CstBeginLabel = Asm->createTempSymbol("cst_begin");
+ Asm->emitEncodingByte(CallSiteEncoding, "Call site");
+ Asm->emitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel);
+ Asm->OutStreamer->emitLabel(CstBeginLabel);
+ };
+
+ // An alternative path to EmitTypeTableRefAndCallSiteTableEndRef.
+ // For some platforms, the system assembler does not accept the form of
+ // `.uleb128 label2 - label1`. In those situations, we would need to calculate
+ // the size between label1 and label2 manually.
+ // In this case, we would need to calculate the LSDA size and the call
+ // site table size.
+ auto EmitTypeTableOffsetAndCallSiteTableOffset = [&]() {
+ assert(CallSiteEncoding == dwarf::DW_EH_PE_udata4 && !HasLEB128Directives &&
+ "Targets supporting .uleb128 do not need to take this path.");
+ if (CallSiteRanges.size() > 1)
+ report_fatal_error(
+ "-fbasic-block-sections is not yet supported on "
+ "platforms that do not have general LEB128 directive support.");
+
+ uint64_t CallSiteTableSize = 0;
+ const CallSiteRange &CSRange = CallSiteRanges.back();
+ for (size_t CallSiteIdx = CSRange.CallSiteBeginIdx;
+ CallSiteIdx < CSRange.CallSiteEndIdx; ++CallSiteIdx) {
+ const CallSiteEntry &S = CallSites[CallSiteIdx];
+ // Each call site entry consists of 3 udata4 fields (12 bytes) and
+ // 1 ULEB128 field.
+ CallSiteTableSize += 12 + getULEB128Size(S.Action);
+ assert(isUInt<32>(CallSiteTableSize) && "CallSiteTableSize overflows.");
+ }
+
+ Asm->emitEncodingByte(TTypeEncoding, "@TType");
+ if (HaveTTData) {
+ const unsigned ByteSizeOfCallSiteOffset =
+ getULEB128Size(CallSiteTableSize);
+ uint64_t ActionTableSize = 0;
+ for (const ActionEntry &Action : Actions) {
+ // Each action entry consists of two SLEB128 fields.
+ ActionTableSize += getSLEB128Size(Action.ValueForTypeID) +
+ getSLEB128Size(Action.NextAction);
+ assert(isUInt<32>(ActionTableSize) && "ActionTableSize overflows.");
+ }
+
+ const unsigned TypeInfoSize =
+ Asm->GetSizeOfEncodedValue(TTypeEncoding) * MF->getTypeInfos().size();
+
+ const uint64_t LSDASizeBeforeAlign =
+ 1 // Call site encoding byte.
+ + ByteSizeOfCallSiteOffset // ULEB128 encoding of CallSiteTableSize.
+ + CallSiteTableSize // Call site table content.
+ + ActionTableSize; // Action table content.
+
+ const uint64_t LSDASizeWithoutAlign = LSDASizeBeforeAlign + TypeInfoSize;
+ const unsigned ByteSizeOfLSDAWithoutAlign =
+ getULEB128Size(LSDASizeWithoutAlign);
+ const uint64_t DisplacementBeforeAlign =
+ 2 // LPStartEncoding and TypeTableEncoding.
+ + ByteSizeOfLSDAWithoutAlign + LSDASizeBeforeAlign;
+
+ // The type info area starts with 4 byte alignment.
+ const unsigned NeedAlignVal = (4 - DisplacementBeforeAlign % 4) % 4;
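+ // For instance (illustrative value only), a DisplacementBeforeAlign of 10
+ // gives NeedAlignVal == 2, padding the type info area out to offset 12.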
+ uint64_t LSDASizeWithAlign = LSDASizeWithoutAlign + NeedAlignVal;
+ const unsigned ByteSizeOfLSDAWithAlign =
+ getULEB128Size(LSDASizeWithAlign);
+
+ // The LSDASizeWithAlign could use 1 byte less padding for alignment
+ // when the data we use to represent the LSDA Size "needs" to be 1 byte
+ // larger than the one previously calculated without alignment.
+ if (ByteSizeOfLSDAWithAlign > ByteSizeOfLSDAWithoutAlign)
+ LSDASizeWithAlign -= 1;
+
+ Asm->OutStreamer->emitULEB128IntValue(LSDASizeWithAlign,
+ ByteSizeOfLSDAWithAlign);
+ }
+
+ Asm->emitEncodingByte(CallSiteEncoding, "Call site");
+ Asm->OutStreamer->emitULEB128IntValue(CallSiteTableSize);
+ };
+
+ // SjLj / Wasm Exception handling
+ if (IsSJLJ || IsWasm) {
+ Asm->OutStreamer->emitLabel(Asm->getMBBExceptionSym(Asm->MF->front()));
+
+ // emit the LSDA header.
+ Asm->emitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart");
+ EmitTypeTableRefAndCallSiteTableEndRef();
+
+ unsigned idx = 0;
+ for (SmallVectorImpl<CallSiteEntry>::const_iterator
+ I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
+ const CallSiteEntry &S = *I;
+
+ // Index of the call site entry.
+ if (VerboseAsm) {
+ Asm->OutStreamer->AddComment(">> Call Site " + Twine(idx) + " <<");
+ Asm->OutStreamer->AddComment(" On exception at call site "+Twine(idx));
+ }
+ Asm->emitULEB128(idx);
+
+ // Offset of the first associated action record, relative to the start of
+ // the action table. This value is biased by 1 (1 indicates the start of
+ // the action table), and 0 indicates that there are no actions.
+ if (VerboseAsm) {
+ if (S.Action == 0)
+ Asm->OutStreamer->AddComment(" Action: cleanup");
+ else
+ Asm->OutStreamer->AddComment(" Action: " +
+ Twine((S.Action - 1) / 2 + 1));
+ }
+ Asm->emitULEB128(S.Action);
+ }
+ Asm->OutStreamer->emitLabel(CstEndLabel);
+ } else {
+ // Itanium LSDA exception handling
+
+ // The call-site table is a list of all call sites that may throw an
+ // exception (including C++ 'throw' statements) in the procedure
+ // fragment. It immediately follows the LSDA header. Each entry indicates,
+ // for a given call, the first corresponding action record and corresponding
+ // landing pad.
+ //
+ // The table begins with the number of bytes, stored as an LEB128
+ // compressed, unsigned integer. The records immediately follow the record
+ // count. They are sorted in increasing call-site address. Each record
+ // indicates:
+ //
+ // * The position of the call-site.
+ // * The position of the landing pad.
+ // * The first action record for that call site.
+ //
+ // A missing entry in the call-site table indicates that a call is not
+ // supposed to throw.
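+ //
+ // Sketch of one record (hypothetical numbers, assuming a uleb128 call-site
+ // encoding; not emitted verbatim by this code): a call at offset 0x10 of
+ // length 0x8 that unwinds to a landing pad at offset 0x40 with first
+ // action 1 would come out roughly as:
+ // .uleb128 0x10 # call-site offset from the fragment start
+ // .uleb128 0x8 # call-site length
+ // .uleb128 0x40 # landing pad offset (0 means no landing pad)
+ // .uleb128 1 # first action, 1-biased (0 means no actions)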
+
+ assert(CallSiteRanges.size() != 0 && "No call-site ranges!");
+
+ // There should be only one call-site range which includes all the landing
+ // pads. Find that call-site range here.
+ const CallSiteRange *LandingPadRange = nullptr;
+ for (const CallSiteRange &CSRange : CallSiteRanges) {
+ if (CSRange.IsLPRange) {
+ assert(LandingPadRange == nullptr &&
+ "All landing pads must be in a single callsite range.");
+ LandingPadRange = &CSRange;
+ }
+ }
+
+ // The call-site table is split into its call-site ranges, each being
+ // emitted as:
+ // [ LPStartEncoding | LPStart ]
+ // [ TypeTableEncoding | TypeTableOffset ]
+ // [ CallSiteEncoding | CallSiteTableEndOffset ]
+ // cst_begin -> { call-site entries contained in this range }
+ //
+ // and is followed by the next call-site range.
+ //
+ // For each call-site range, CallSiteTableEndOffset is computed as the
+ // difference between cst_begin of that range and the last call-site-table's
+ // end label. This offset is used to find the action table.
+
+ unsigned Entry = 0;
+ for (const CallSiteRange &CSRange : CallSiteRanges) {
+ if (CSRange.CallSiteBeginIdx != 0) {
+ // Align the call-site range for all ranges except the first. The
+ // first range is already aligned due to the exception table alignment.
+ Asm->emitAlignment(Align(4));
+ }
+ Asm->OutStreamer->emitLabel(CSRange.ExceptionLabel);
+
+ // Emit the LSDA header.
+ // LPStart is omitted if either we have a single call-site range (in which
+ // case the function entry is treated as @LPStart) or if this function has
+ // no landing pads (in which case @LPStart is undefined).
+ if (CallSiteRanges.size() == 1 || LandingPadRange == nullptr) {
+ Asm->emitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart");
+ } else if (!Asm->isPositionIndependent()) {
+ // For more than one call-site range, LPStart must be explicitly
+ // specified.
+ // For non-PIC we can simply use the absolute value.
+ Asm->emitEncodingByte(dwarf::DW_EH_PE_absptr, "@LPStart");
+ Asm->OutStreamer->emitSymbolValue(LandingPadRange->FragmentBeginLabel,
+ Asm->MAI->getCodePointerSize());
+ } else {
+ // For PIC mode, we emit a PC-relative address for LPStart.
+ Asm->emitEncodingByte(dwarf::DW_EH_PE_pcrel, "@LPStart");
+ MCContext &Context = Asm->OutStreamer->getContext();
+ MCSymbol *Dot = Context.createTempSymbol();
+ Asm->OutStreamer->emitLabel(Dot);
+ Asm->OutStreamer->emitValue(
+ MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(LandingPadRange->FragmentBeginLabel,
+ Context),
+ MCSymbolRefExpr::create(Dot, Context), Context),
+ Asm->MAI->getCodePointerSize());
+ }
+
+ if (HasLEB128Directives)
+ EmitTypeTableRefAndCallSiteTableEndRef();
+ else
+ EmitTypeTableOffsetAndCallSiteTableOffset();
+
+ for (size_t CallSiteIdx = CSRange.CallSiteBeginIdx;
+ CallSiteIdx != CSRange.CallSiteEndIdx; ++CallSiteIdx) {
+ const CallSiteEntry &S = CallSites[CallSiteIdx];
+
+ MCSymbol *EHFuncBeginSym = CSRange.FragmentBeginLabel;
+ MCSymbol *EHFuncEndSym = CSRange.FragmentEndLabel;
+
+ MCSymbol *BeginLabel = S.BeginLabel;
+ if (!BeginLabel)
+ BeginLabel = EHFuncBeginSym;
+ MCSymbol *EndLabel = S.EndLabel;
+ if (!EndLabel)
+ EndLabel = EHFuncEndSym;
+
+ // Offset of the call site relative to the start of the procedure.
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment(">> Call Site " + Twine(++Entry) +
+ " <<");
+ Asm->emitCallSiteOffset(BeginLabel, EHFuncBeginSym, CallSiteEncoding);
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment(Twine(" Call between ") +
+ BeginLabel->getName() + " and " +
+ EndLabel->getName());
+ Asm->emitCallSiteOffset(EndLabel, BeginLabel, CallSiteEncoding);
+
+ // Offset of the landing pad relative to the start of the landing pad
+ // fragment.
+ if (!S.LPad) {
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment(" has no landing pad");
+ Asm->emitCallSiteValue(0, CallSiteEncoding);
+ } else {
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment(Twine(" jumps to ") +
+ S.LPad->LandingPadLabel->getName());
+ Asm->emitCallSiteOffset(S.LPad->LandingPadLabel,
+ LandingPadRange->FragmentBeginLabel,
+ CallSiteEncoding);
+ }
+
+ // Offset of the first associated action record, relative to the start
+ // of the action table. This value is biased by 1 (1 indicates the start
+ // of the action table), and 0 indicates that there are no actions.
+ if (VerboseAsm) {
+ if (S.Action == 0)
+ Asm->OutStreamer->AddComment(" On action: cleanup");
+ else
+ Asm->OutStreamer->AddComment(" On action: " +
+ Twine((S.Action - 1) / 2 + 1));
+ }
+ Asm->emitULEB128(S.Action);
+ }
+ }
+ Asm->OutStreamer->emitLabel(CstEndLabel);
+ }
+
+ // Emit the Action Table.
+ int Entry = 0;
+ for (const ActionEntry &Action : Actions) {
+ if (VerboseAsm) {
+ // Emit comments that decode the action table.
+ Asm->OutStreamer->AddComment(">> Action Record " + Twine(++Entry) + " <<");
+ }
+
+ // Type Filter
+ //
+ // Used by the runtime to match the type of the thrown exception to the
+ // type of the catch clauses or the types in the exception specification.
+ if (VerboseAsm) {
+ if (Action.ValueForTypeID > 0)
+ Asm->OutStreamer->AddComment(" Catch TypeInfo " +
+ Twine(Action.ValueForTypeID));
+ else if (Action.ValueForTypeID < 0)
+ Asm->OutStreamer->AddComment(" Filter TypeInfo " +
+ Twine(Action.ValueForTypeID));
+ else
+ Asm->OutStreamer->AddComment(" Cleanup");
+ }
+ Asm->emitSLEB128(Action.ValueForTypeID);
+
+ // Action Record
+ if (VerboseAsm) {
+ if (Action.Previous == unsigned(-1)) {
+ Asm->OutStreamer->AddComment(" No further actions");
+ } else {
+ Asm->OutStreamer->AddComment(" Continue to action " +
+ Twine(Action.Previous + 1));
+ }
+ }
+ Asm->emitSLEB128(Action.NextAction);
+ }
+
+ if (HaveTTData) {
+ Asm->emitAlignment(Align(4));
+ emitTypeInfos(TTypeEncoding, TTBaseLabel);
+ }
+
+ Asm->emitAlignment(Align(4));
+ return GCCETSym;
+}
+
+void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
+ const MachineFunction *MF = Asm->MF;
+ const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MF->getFilterIds();
+
+ const bool VerboseAsm = Asm->OutStreamer->isVerboseAsm();
+
+ int Entry = 0;
+ // Emit the Catch TypeInfos.
+ if (VerboseAsm && !TypeInfos.empty()) {
+ Asm->OutStreamer->AddComment(">> Catch TypeInfos <<");
+ Asm->OutStreamer->addBlankLine();
+ Entry = TypeInfos.size();
+ }
+
+ for (const GlobalValue *GV : llvm::reverse(TypeInfos)) {
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--));
+ Asm->emitTTypeReference(GV, TTypeEncoding);
+ }
+
+ Asm->OutStreamer->emitLabel(TTBaseLabel);
+
+ // Emit the Exception Specifications.
+ if (VerboseAsm && !FilterIds.empty()) {
+ Asm->OutStreamer->AddComment(">> Filter TypeInfos <<");
+ Asm->OutStreamer->addBlankLine();
+ Entry = 0;
+ }
+ for (std::vector<unsigned>::const_iterator
+ I = FilterIds.begin(), E = FilterIds.end(); I < E; ++I) {
+ unsigned TypeID = *I;
+ if (VerboseAsm) {
+ --Entry;
+ if (isFilterEHSelector(TypeID))
+ Asm->OutStreamer->AddComment("FilterInfo " + Twine(Entry));
+ }
+
+ Asm->emitULEB128(TypeID);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h
new file mode 100644
index 000000000000..234e62506a56
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -0,0 +1,165 @@
+//===- EHStreamer.h - Exception Handling Directive Streamer -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing exception info into assembly files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinterHandler.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class AsmPrinter;
+struct LandingPadInfo;
+class MachineInstr;
+class MachineModuleInfo;
+class MCSymbol;
+template <typename T> class SmallVectorImpl;
+
+/// Emits exception handling directives.
+class LLVM_LIBRARY_VISIBILITY EHStreamer : public AsmPrinterHandler {
+protected:
+ /// Target of directive emission.
+ AsmPrinter *Asm;
+
+ /// Collected machine module information.
+ MachineModuleInfo *MMI;
+
+ /// How many leading type ids two landing pads have in common.
+ static unsigned sharedTypeIDs(const LandingPadInfo *L,
+ const LandingPadInfo *R);
+
+ /// Structure holding a try-range and the associated landing pad.
+ struct PadRange {
+ // The index of the landing pad.
+ unsigned PadIndex;
+
+ // The index of the begin and end labels in the landing pad's label lists.
+ unsigned RangeIndex;
+ };
+
+ using RangeMapType = DenseMap<MCSymbol *, PadRange>;
+
+ /// Structure describing an entry in the actions table.
+ struct ActionEntry {
+ int ValueForTypeID; // The value to write - may not be equal to the type id.
+ int NextAction;
+ unsigned Previous;
+ };
+
+ /// Structure describing an entry in the call-site table.
+ struct CallSiteEntry {
+ // The 'try-range' is BeginLabel .. EndLabel.
+ MCSymbol *BeginLabel; // Null indicates the start of the function.
+ MCSymbol *EndLabel; // Null indicates the end of the function.
+
+ // LPad contains the landing pad start labels.
+ const LandingPadInfo *LPad; // Null indicates that there is no landing pad.
+
+ unsigned Action;
+ };
+
+ /// Structure describing a contiguous range of call-sites which reside
+ /// in the same procedure fragment. With -fbasic-block-sections, there will
+ /// be one call site range per basic block section. Otherwise, we will have
+ /// one call site range containing all the call sites in the function.
+ struct CallSiteRange {
+ // Symbol marking the beginning of the procedure fragment.
+ MCSymbol *FragmentBeginLabel = nullptr;
+ // Symbol marking the end of the procedure fragment.
+ MCSymbol *FragmentEndLabel = nullptr;
+ // LSDA symbol for this call-site range.
+ MCSymbol *ExceptionLabel = nullptr;
+ // Index of the first call-site entry in the call-site table which
+ // belongs to this range.
+ size_t CallSiteBeginIdx = 0;
+ // Index just after the last call-site entry in the call-site table which
+ // belongs to this range.
+ size_t CallSiteEndIdx = 0;
+ // Whether this is the call-site range containing all the landing pads.
+ bool IsLPRange = false;
+ };
+
+ /// Compute the actions table and gather the first action index for each
+ /// landing pad site.
+ void computeActionsTable(
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ SmallVectorImpl<ActionEntry> &Actions,
+ SmallVectorImpl<unsigned> &FirstActions);
+
+ void computePadMap(const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ RangeMapType &PadMap);
+
+ /// Compute the call-site table and the call-site ranges. The entry for an
+ /// invoke has a try-range containing the call, a non-zero landing pad and an
+ /// appropriate action. The entry for an ordinary call has a try-range
+ /// containing the call and zero for the landing pad and the action. Calls
+ /// marked 'nounwind' have no entry and must not be contained in the try-range
+ /// of any entry - they form gaps in the table. Entries must be ordered by
+ /// try-range address. CallSiteRanges vector is only populated for Itanium
+ /// exception handling.
+ virtual void computeCallSiteTable(
+ SmallVectorImpl<CallSiteEntry> &CallSites,
+ SmallVectorImpl<CallSiteRange> &CallSiteRanges,
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ const SmallVectorImpl<unsigned> &FirstActions);
+
+ /// Emit landing pads and actions.
+ ///
+ /// The general organization of the table is complex, but the basic concepts
+ /// are easy. First there is a header which describes the location and
+ /// organization of the three components that follow.
+ /// 1. The landing pad site information describes the range of code covered
+ /// by the try. In our case it's an accumulation of the ranges covered
+ /// by the invokes in the try. There is also a reference to the landing
+ /// pad that handles the exception once processed. Finally an index into
+ /// the actions table.
+ /// 2. The action table, in our case, is composed of pairs of type ids
+ /// and next action offset. Starting with the action index from the
+ /// landing pad site, each type Id is checked for a match to the current
+ /// exception. If it matches then the exception and type id are passed
+ /// on to the landing pad. Otherwise the next action is looked up. This
+ /// chain is terminated with a next action of zero. If no type id is
+ /// found the frame is unwound and handling continues.
+ /// 3. Type id table contains references to all the C++ typeinfo for all
+ /// catches in the function. This table is reverse indexed, base 1.
+ ///
+ /// Returns the starting symbol of an exception table.
+ MCSymbol *emitExceptionTable();
+
+ virtual void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel);
+
+ // Helpers for identifying what kind of clause an EH typeid or selector
+ // corresponds to. Negative selectors are for filter clauses, the zero
+ // selector is for cleanups, and positive selectors are for catch clauses.
+ static bool isFilterEHSelector(int Selector) { return Selector < 0; }
+ static bool isCleanupEHSelector(int Selector) { return Selector == 0; }
+ static bool isCatchEHSelector(int Selector) { return Selector > 0; }
+
+public:
+ EHStreamer(AsmPrinter *A);
+ ~EHStreamer() override;
+
+ // Unused.
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
+ void beginInstruction(const MachineInstr *MI) override {}
+ void endInstruction() override {}
+
+ /// Return `true' if this is a call to a function marked `nounwind'. Return
+ /// `false' otherwise.
+ static bool callToNoUnwindFunction(const MachineInstr *MI);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
new file mode 100644
index 000000000000..62fd15d89512
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -0,0 +1,117 @@
+//===- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the compiler plugin that is used in order to emit
+// garbage collection information in a convenient layout for parsing and
+// loading in the Erlang/OTP runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/IR/BuiltinGCs.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+namespace {
+
+class ErlangGCPrinter : public GCMetadataPrinter {
+public:
+ void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+};
+
+} // end anonymous namespace
+
+static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>
+ X("erlang", "erlang-compatible garbage collector");
+
+void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+ AsmPrinter &AP) {
+ MCStreamer &OS = *AP.OutStreamer;
+ unsigned IntPtrSize = M.getDataLayout().getPointerSize();
+
+ // Put this in a custom .note section.
+ OS.switchSection(AP.getObjFileLowering().getContext().getELFSection(
+ ".note.gc", ELF::SHT_PROGBITS, 0));
+
+ // For each function...
+ for (GCModuleInfo::FuncInfoVec::iterator FI = Info.funcinfo_begin(),
+ IE = Info.funcinfo_end();
+ FI != IE; ++FI) {
+ GCFunctionInfo &MD = **FI;
+ if (MD.getStrategy().getName() != getStrategy().getName())
+ // this function is managed by some other GC
+ continue;
+ /** A compact GC layout. Emit this data structure:
+ *
+ * struct {
+ * int16_t PointCount;
+ * void *SafePointAddress[PointCount];
+ * int16_t StackFrameSize; (in words)
+ * int16_t StackArity;
+ * int16_t LiveCount;
+ * int16_t LiveOffsets[LiveCount];
+ * } __gcmap_<FUNCTIONNAME>;
+ **/
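+ //
+ // Purely illustrative instance: on a 64-bit target, a function with two
+ // safe points, a 48-byte frame and one live root at stack offset 16 would
+ // be emitted as PointCount=2, two safe point addresses, StackFrameSize=6
+ // (words), StackArity=0, LiveCount=1 and LiveOffsets={2}.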
+
+ // Align to address width.
+ AP.emitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
+
+ // Emit PointCount.
+ OS.AddComment("safe point count");
+ AP.emitInt16(MD.size());
+
+ // And each safe point...
+ for (const GCPoint &P : MD) {
+ // Emit the address of the safe point.
+ OS.AddComment("safe point address");
+ MCSymbol *Label = P.Label;
+ AP.emitLabelPlusOffset(Label /*Hi*/, 0 /*Offset*/, 4 /*Size*/);
+ }
+
+ // Stack information never changes in safe points! Only print info from the
+ // first call-site.
+ GCFunctionInfo::iterator PI = MD.begin();
+
+ // Emit the stack frame size.
+ OS.AddComment("stack frame size (in words)");
+ AP.emitInt16(MD.getFrameSize() / IntPtrSize);
+
+ // Emit stack arity, i.e. the number of stacked arguments.
+ unsigned RegisteredArgs = IntPtrSize == 4 ? 5 : 6;
+ unsigned StackArity = MD.getFunction().arg_size() > RegisteredArgs
+ ? MD.getFunction().arg_size() - RegisteredArgs
+ : 0;
+ OS.AddComment("stack arity");
+ AP.emitInt16(StackArity);
+
+ // Emit the number of live roots in the function.
+ OS.AddComment("live root count");
+ AP.emitInt16(MD.live_size(PI));
+
+ // And for each live root...
+ for (GCFunctionInfo::live_iterator LI = MD.live_begin(PI),
+ LE = MD.live_end(PI);
+ LI != LE; ++LI) {
+ // Emit live root's offset within the stack frame.
+ OS.AddComment("stack index (offset / wordsize)");
+ AP.emitInt16(LI->StackOffset / IntPtrSize);
+ }
+ }
+}
+
+void llvm::linkErlangGCPrinter() {}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
new file mode 100644
index 000000000000..74fa30ab321b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -0,0 +1,182 @@
+//===- OcamlGCPrinter.cpp - Ocaml frametable emitter ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements printing the assembly code for an Ocaml frametable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/IR/BuiltinGCs.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+
+namespace {
+
+class OcamlGCMetadataPrinter : public GCMetadataPrinter {
+public:
+ void beginAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+ void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+};
+
+} // end anonymous namespace
+
+static GCMetadataPrinterRegistry::Add<OcamlGCMetadataPrinter>
+ Y("ocaml", "ocaml 3.10-compatible collector");
+
+void llvm::linkOcamlGCPrinter() {}
+
+static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) {
+ const std::string &MId = M.getModuleIdentifier();
+
+ std::string SymName;
+ SymName += "caml";
+ size_t Letter = SymName.size();
+ SymName.append(MId.begin(), llvm::find(MId, '.'));
+ SymName += "__";
+ SymName += Id;
+
+ // Capitalize the first letter of the module name.
+ SymName[Letter] = toupper(SymName[Letter]);
+
+ SmallString<128> TmpStr;
+ Mangler::getNameWithPrefix(TmpStr, SymName, M.getDataLayout());
+
+ MCSymbol *Sym = AP.OutContext.getOrCreateSymbol(TmpStr);
+
+ AP.OutStreamer->emitSymbolAttribute(Sym, MCSA_Global);
+ AP.OutStreamer->emitLabel(Sym);
+}
+
+void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info,
+ AsmPrinter &AP) {
+ AP.OutStreamer->switchSection(AP.getObjFileLowering().getTextSection());
+ EmitCamlGlobal(M, AP, "code_begin");
+
+ AP.OutStreamer->switchSection(AP.getObjFileLowering().getDataSection());
+ EmitCamlGlobal(M, AP, "data_begin");
+}
+
+/// emitAssembly - Print the frametable. The ocaml frametable format is thus:
+///
+/// extern "C" struct align(sizeof(intptr_t)) {
+/// uint16_t NumDescriptors;
+/// struct align(sizeof(intptr_t)) {
+/// void *ReturnAddress;
+/// uint16_t FrameSize;
+/// uint16_t NumLiveOffsets;
+/// uint16_t LiveOffsets[NumLiveOffsets];
+/// } Descriptors[NumDescriptors];
+/// } caml${module}__frametable;
+///
+/// Note that this precludes programs from using stack frames larger than 64K
+/// (FrameSize and LiveOffsets would overflow). FrameTablePrinter will abort if
+/// either condition is detected in a function which uses the GC.
+///
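+/// Purely illustrative instance (names and numbers are hypothetical): a
+/// module "foo" with one function containing a single call site, a 32-byte
+/// frame and live roots at byte offsets 8 and 16 yields camlFoo__frametable
+/// with NumDescriptors=1 followed by {ReturnAddress, FrameSize=32,
+/// NumLiveOffsets=2, LiveOffsets={8, 16}}.
+///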
+void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+ AsmPrinter &AP) {
+ unsigned IntPtrSize = M.getDataLayout().getPointerSize();
+
+ AP.OutStreamer->switchSection(AP.getObjFileLowering().getTextSection());
+ EmitCamlGlobal(M, AP, "code_end");
+
+ AP.OutStreamer->switchSection(AP.getObjFileLowering().getDataSection());
+ EmitCamlGlobal(M, AP, "data_end");
+
+ // FIXME: Why does ocaml emit this??
+ AP.OutStreamer->emitIntValue(0, IntPtrSize);
+
+ AP.OutStreamer->switchSection(AP.getObjFileLowering().getDataSection());
+ EmitCamlGlobal(M, AP, "frametable");
+
+ int NumDescriptors = 0;
+ for (std::unique_ptr<GCFunctionInfo> &FI :
+ llvm::make_range(Info.funcinfo_begin(), Info.funcinfo_end())) {
+ if (FI->getStrategy().getName() != getStrategy().getName())
+ // this function is managed by some other GC
+ continue;
+ NumDescriptors += FI->size();
+ }
+
+ if (NumDescriptors >= 1 << 16) {
+ // Very rude!
+ report_fatal_error(" Too much descriptor for ocaml GC");
+ }
+ AP.emitInt16(NumDescriptors);
+ AP.emitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
+
+ for (std::unique_ptr<GCFunctionInfo> &FI :
+ llvm::make_range(Info.funcinfo_begin(), Info.funcinfo_end())) {
+ if (FI->getStrategy().getName() != getStrategy().getName())
+ // this function is managed by some other GC
+ continue;
+
+ uint64_t FrameSize = FI->getFrameSize();
+ if (FrameSize >= 1 << 16) {
+ // Very rude!
+ report_fatal_error("Function '" + FI->getFunction().getName() +
+ "' is too large for the ocaml GC! "
+ "Frame size " +
+ Twine(FrameSize) +
+ ">= 65536.\n"
+ "(" +
+ Twine(reinterpret_cast<uintptr_t>(FI.get())) + ")");
+ }
+
+ AP.OutStreamer->AddComment("live roots for " +
+ Twine(FI->getFunction().getName()));
+ AP.OutStreamer->addBlankLine();
+
+ for (GCFunctionInfo::iterator J = FI->begin(), JE = FI->end(); J != JE;
+ ++J) {
+ size_t LiveCount = FI->live_size(J);
+ if (LiveCount >= 1 << 16) {
+ // Very rude!
+ report_fatal_error("Function '" + FI->getFunction().getName() +
+ "' is too large for the ocaml GC! "
+ "Live root count " +
+ Twine(LiveCount) + " >= 65536.");
+ }
+
+ AP.OutStreamer->emitSymbolValue(J->Label, IntPtrSize);
+ AP.emitInt16(FrameSize);
+ AP.emitInt16(LiveCount);
+
+ for (GCFunctionInfo::live_iterator K = FI->live_begin(J),
+ KE = FI->live_end(J);
+ K != KE; ++K) {
+ if (K->StackOffset >= 1 << 16) {
+ // Very rude!
+ report_fatal_error(
+ "GC root stack offset is outside of fixed stack frame and out "
+ "of range for ocaml GC!");
+ }
+ AP.emitInt16(K->StackOffset);
+ }
+
+ AP.emitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
+ }
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
new file mode 100644
index 000000000000..59c3fa15885e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp
@@ -0,0 +1,56 @@
+//===- llvm/CodeGen/PseudoProbePrinter.cpp - Pseudo Probe Emission -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing pseudo probe info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PseudoProbePrinter.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PseudoProbe.h"
+#include "llvm/MC/MCPseudoProbe.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+PseudoProbeHandler::~PseudoProbeHandler() = default;
+
+void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index,
+ uint64_t Type, uint64_t Attr,
+ const DILocation *DebugLoc) {
+ // Gather all the inlined-at nodes.
+ // When it's done ReversedInlineStack looks like ([66, B], [88, A])
+ // which means function A inlines function B at a callsite with probe id 88,
+ // and B inlines C at probe 66 where C is represented by Guid.
+ SmallVector<InlineSite, 8> ReversedInlineStack;
+ auto *InlinedAt = DebugLoc ? DebugLoc->getInlinedAt() : nullptr;
+ while (InlinedAt) {
+ auto Name = InlinedAt->getSubprogramLinkageName();
+ // Use caching to avoid redundant md5 computation for build speed.
+ uint64_t &CallerGuid = NameGuidMap[Name];
+ if (!CallerGuid)
+ CallerGuid = Function::getGUID(Name);
+ uint64_t CallerProbeId = PseudoProbeDwarfDiscriminator::extractProbeIndex(
+ InlinedAt->getDiscriminator());
+ ReversedInlineStack.emplace_back(CallerGuid, CallerProbeId);
+ InlinedAt = InlinedAt->getInlinedAt();
+ }
+ uint64_t Discriminator = 0;
+ // For now only block probes have FS discriminators. See
+ // MIRFSDiscriminator.cpp for more details.
+ if (EnableFSDiscriminator && DebugLoc &&
+ (Type == (uint64_t)PseudoProbeType::Block))
+ Discriminator = DebugLoc->getDiscriminator();
+ assert((EnableFSDiscriminator || Discriminator == 0) &&
+ "Discriminator should not be set in non-FSAFDO mode");
+ SmallVector<InlineSite, 8> InlineStack(llvm::reverse(ReversedInlineStack));
+ Asm->OutStreamer->emitPseudoProbe(Guid, Index, Type, Attr, Discriminator,
+ InlineStack, Asm->CurrentFnSym);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
new file mode 100644
index 000000000000..a92a89084cad
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
@@ -0,0 +1,47 @@
+//===- PseudoProbePrinter.h - Pseudo probe encoding support -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing pseudo probe info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_PSEUDOPROBEPRINTER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_PSEUDOPROBEPRINTER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinterHandler.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class DILocation;
+
+class PseudoProbeHandler : public AsmPrinterHandler {
+ // Target of pseudo probe emission.
+ AsmPrinter *Asm;
+ // Name to GUID map, used as caching/memoization for speed.
+ DenseMap<StringRef, uint64_t> NameGuidMap;
+
+public:
+ PseudoProbeHandler(AsmPrinter *A) : Asm(A) {}
+ ~PseudoProbeHandler() override;
+
+ void emitPseudoProbe(uint64_t Guid, uint64_t Index, uint64_t Type,
+ uint64_t Attr, const DILocation *DebugLoc);
+
+ // Unused.
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
+ void endModule() override {}
+ void beginFunction(const MachineFunction *MF) override {}
+ void endFunction(const MachineFunction *MF) override {}
+ void beginInstruction(const MachineInstr *MI) override {}
+ void endInstruction() override {}
+};
+
+} // namespace llvm
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_PSEUDOPROBEPRINTER_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp
new file mode 100644
index 000000000000..bf65e525dde1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp
@@ -0,0 +1,98 @@
+//===-- CodeGen/AsmPrinter/WasmException.cpp - Wasm Exception Impl --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing WebAssembly exception info into asm
+// files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WasmException.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+void WasmException::endModule() {
+ // These are symbols used to throw/catch C++ exceptions and C longjmps. These
+ // symbols have to be emitted somewhere once in the module. Check if each of
+ // the symbols has already been created, i.e., we have at least one 'throw' or
+ // 'catch' instruction with the symbol in the module, and emit the symbol only
+ // if so.
+ //
+ // But in dynamic linking, it is in general not possible to come up with a
+ // module instantiating order in which tag-defining modules are loaded before
+ // the importing modules. So we make them undefined symbols here, define tags
+ // in the JS side, and feed them to each importing module.
+ if (!Asm->isPositionIndependent()) {
+ for (const char *SymName : {"__cpp_exception", "__c_longjmp"}) {
+ SmallString<60> NameStr;
+ Mangler::getNameWithPrefix(NameStr, SymName, Asm->getDataLayout());
+ if (Asm->OutContext.lookupSymbol(NameStr)) {
+ MCSymbol *ExceptionSym = Asm->GetExternalSymbolSymbol(SymName);
+ Asm->OutStreamer->emitLabel(ExceptionSym);
+ }
+ }
+ }
+}
+
+void WasmException::endFunction(const MachineFunction *MF) {
+ bool ShouldEmitExceptionTable = false;
+ for (const LandingPadInfo &Info : MF->getLandingPads()) {
+ if (MF->hasWasmLandingPadIndex(Info.LandingPadBlock)) {
+ ShouldEmitExceptionTable = true;
+ break;
+ }
+ }
+ if (!ShouldEmitExceptionTable)
+ return;
+ MCSymbol *LSDALabel = emitExceptionTable();
+ assert(LSDALabel && "GCC_except_table has not been emitted!");
+
+ // Wasm requires every data section symbol to have a .size set. So we emit an
+ // end marker and set the size as the difference between the start and the end
+ // marker.
+ MCSymbol *LSDAEndLabel = Asm->createTempSymbol("GCC_except_table_end");
+ Asm->OutStreamer->emitLabel(LSDAEndLabel);
+ MCContext &OutContext = Asm->OutStreamer->getContext();
+ const MCExpr *SizeExp = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(LSDAEndLabel, OutContext),
+ MCSymbolRefExpr::create(LSDALabel, OutContext), OutContext);
+ Asm->OutStreamer->emitELFSize(LSDALabel, SizeExp);
+}
+
+// Compute the call-site table for wasm EH. Even though we use the same function
+// name to share the common routines, a call site entry in the table
+// corresponds not to a call site for a possibly-throwing function but to a
+// landing pad. In wasm
+// EH the VM is responsible for stack unwinding. After an exception occurs and
+// the stack is unwound, the control flow is transferred to wasm 'catch'
+// instruction by the VM, after which the personality function is called from
+// the compiler-generated code. Refer to WasmEHPrepare pass for more
+// information.
+void WasmException::computeCallSiteTable(
+ SmallVectorImpl<CallSiteEntry> &CallSites,
+ SmallVectorImpl<CallSiteRange> &CallSiteRanges,
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ const SmallVectorImpl<unsigned> &FirstActions) {
+ MachineFunction &MF = *Asm->MF;
+ for (unsigned I = 0, N = LandingPads.size(); I < N; ++I) {
+ const LandingPadInfo *Info = LandingPads[I];
+ MachineBasicBlock *LPad = Info->LandingPadBlock;
+ // We don't emit LSDA for single catch (...).
+ if (!MF.hasWasmLandingPadIndex(LPad))
+ continue;
+ // Wasm EH must maintain the EH pads in the order assigned to them by the
+ // WasmEHPrepare pass.
+ unsigned LPadIndex = MF.getWasmLandingPadIndex(LPad);
+ CallSiteEntry Site = {nullptr, nullptr, Info, FirstActions[I]};
+ if (CallSites.size() < LPadIndex + 1)
+ CallSites.resize(LPadIndex + 1);
+ CallSites[LPadIndex] = Site;
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WasmException.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WasmException.h
new file mode 100644
index 000000000000..86cc37dfde07
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WasmException.h
@@ -0,0 +1,44 @@
+//===-- WasmException.h - Wasm Exception Framework -------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing WebAssembly exception info into asm
+// files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
+
+#include "EHStreamer.h"
+
+namespace llvm {
+class AsmPrinter;
+class MachineFunction;
+struct LandingPadInfo;
+template <typename T> class SmallVectorImpl;
+
+class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer {
+public:
+ WasmException(AsmPrinter *A) : EHStreamer(A) {}
+
+ void endModule() override;
+ void beginFunction(const MachineFunction *MF) override {}
+ void endFunction(const MachineFunction *MF) override;
+
+protected:
+ // Compute the call site table for wasm EH.
+ void computeCallSiteTable(
+ SmallVectorImpl<CallSiteEntry> &CallSites,
+ SmallVectorImpl<CallSiteRange> &CallSiteRanges,
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ const SmallVectorImpl<unsigned> &FirstActions) override;
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp
new file mode 100644
index 000000000000..5d813b72c0b7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp
@@ -0,0 +1,125 @@
+//===-- CodeGen/AsmPrinter/WinCFGuard.cpp - Control Flow Guard Impl ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing the metadata for Windows Control Flow
+// Guard, including address-taken functions and valid longjmp targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WinCFGuard.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCStreamer.h"
+
+#include <vector>
+
+using namespace llvm;
+
+WinCFGuard::WinCFGuard(AsmPrinter *A) : Asm(A) {}
+
+WinCFGuard::~WinCFGuard() = default;
+
+void WinCFGuard::endFunction(const MachineFunction *MF) {
+
+ // Skip functions without any longjmp targets.
+ if (MF->getLongjmpTargets().empty())
+ return;
+
+ // Copy the function's longjmp targets to a module-level list.
+ llvm::append_range(LongjmpTargets, MF->getLongjmpTargets());
+}
+
+/// Returns true if this function's address escapes in a way that might make
+/// it an indirect call target. This differs from Function::hasAddressTaken,
+/// which also reports the address as taken when the function is called
+/// directly through a prototype-mismatching cast.
+static bool isPossibleIndirectCallTarget(const Function *F) {
+ SmallVector<const Value *, 4> Users{F};
+ while (!Users.empty()) {
+ const Value *FnOrCast = Users.pop_back_val();
+ for (const Use &U : FnOrCast->uses()) {
+ const User *FnUser = U.getUser();
+ if (isa<BlockAddress>(FnUser))
+ continue;
+ if (const auto *Call = dyn_cast<CallBase>(FnUser)) {
+ if (!Call->isCallee(&U))
+ return true;
+ } else if (isa<Instruction>(FnUser)) {
+ // Consider any other instruction to be an escape. This has some weird
+ // consequences like no-op intrinsics being an escape or a store *to* a
+ // function address being an escape.
+ return true;
+ } else if (const auto *C = dyn_cast<Constant>(FnUser)) {
+ // If this is a constant pointer cast of the function, don't consider
+ // this escape. Analyze the uses of the cast as well. This ensures that
+ // direct calls with mismatched prototypes don't end up in the CFG
+ // table. Consider other constants, such as vtable initializers, to
+ // escape the function.
+ if (C->stripPointerCasts() == F)
+ Users.push_back(FnUser);
+ else
+ return true;
+ }
+ }
+ }
+ return false;
+}
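+
+// A rough IR-level illustration of the cases handled above (all names are
+// hypothetical):
+//
+//   call void @f()            ; used as the callee: not an escape
+//   store ptr @f, ptr %slot   ; any other instruction use: escape
+//   @tbl = constant ptr @f    ; non-cast constant use (e.g. a vtable
+//                             ; initializer): escape
+//
+// Constant pointer casts of @f are looked through, so a direct call through
+// such a cast is also not treated as an escape.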
+
+MCSymbol *WinCFGuard::lookupImpSymbol(const MCSymbol *Sym) {
+ if (Sym->getName().startswith("__imp_"))
+ return nullptr;
+ return Asm->OutContext.lookupSymbol(Twine("__imp_") + Sym->getName());
+}
+
+void WinCFGuard::endModule() {
+ const Module *M = Asm->MMI->getModule();
+ std::vector<const MCSymbol *> GFIDsEntries;
+ std::vector<const MCSymbol *> GIATsEntries;
+ for (const Function &F : *M) {
+ if (isPossibleIndirectCallTarget(&F)) {
+ // If F is a dllimport function and an "__imp_" symbol is already defined
+ // for it, add that "__imp_" symbol to the .giats section.
+ if (F.hasDLLImportStorageClass()) {
+ if (MCSymbol *impSym = lookupImpSymbol(Asm->getSymbol(&F))) {
+ GIATsEntries.push_back(impSym);
+ }
+ }
+ // Add the function's symbol to the .gfids section.
+ // Note: For dllimport functions, MSVC sometimes does not add this symbol
+ // to the .gfids section, but only adds the corresponding "__imp_" symbol
+ // to the .giats section. Here we always add the symbol to the .gfids
+ // section, since this does not introduce security risks.
+ GFIDsEntries.push_back(Asm->getSymbol(&F));
+ }
+ }
+
+ if (GFIDsEntries.empty() && GIATsEntries.empty() && LongjmpTargets.empty())
+ return;
+
+ // Emit the symbol index of each GFIDs entry to form the .gfids section.
+ auto &OS = *Asm->OutStreamer;
+ OS.switchSection(Asm->OutContext.getObjectFileInfo()->getGFIDsSection());
+ for (const MCSymbol *S : GFIDsEntries)
+ OS.emitCOFFSymbolIndex(S);
+
+ // Emit the symbol index of each GIATs entry to form the .giats section.
+ OS.switchSection(Asm->OutContext.getObjectFileInfo()->getGIATsSection());
+ for (const MCSymbol *S : GIATsEntries) {
+ OS.emitCOFFSymbolIndex(S);
+ }
+
+ // Emit the symbol index of each longjmp target to form the .gljmp section.
+ OS.switchSection(Asm->OutContext.getObjectFileInfo()->getGLJMPSection());
+ for (const MCSymbol *S : LongjmpTargets) {
+ OS.emitCOFFSymbolIndex(S);
+ }
+}
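+
+// For reference, the output of endModule() looks roughly like the following
+// assembly (symbol names are hypothetical; the section names come from
+// MCObjectFileInfo):
+//
+//   .section .gfids$y
+//   .symidx  targetFn         # one entry per possible indirect call target
+//   .section .giats$y
+//   .symidx  __imp_importedFn # one entry per address-taken dllimport
+//   .section .gljmp$y
+//   .symidx  setjmpRetLabel   # one entry per valid longjmp target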
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h
new file mode 100644
index 000000000000..0e472af52c8f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.h
@@ -0,0 +1,57 @@
+//===-- WinCFGuard.h - Windows Control Flow Guard Handling ----*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing the metadata for Windows Control Flow
+// Guard, including address-taken functions and valid longjmp targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WINCFGUARD_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WINCFGUARD_H
+
+#include "llvm/CodeGen/AsmPrinterHandler.h"
+#include "llvm/Support/Compiler.h"
+#include <vector>
+
+namespace llvm {
+
+class LLVM_LIBRARY_VISIBILITY WinCFGuard : public AsmPrinterHandler {
+ /// Target of directive emission.
+ AsmPrinter *Asm;
+ std::vector<const MCSymbol *> LongjmpTargets;
+ MCSymbol *lookupImpSymbol(const MCSymbol *Sym);
+
+public:
+ WinCFGuard(AsmPrinter *A);
+ ~WinCFGuard() override;
+
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
+
+ /// Emit the Control Flow Guard function ID table.
+ void endModule() override;
+
+ /// Gather pre-function debug information.
+ /// Every beginFunction(MF) call should be followed by an endFunction(MF)
+ /// call.
+ void beginFunction(const MachineFunction *MF) override {}
+
+ /// Gather post-function debug information.
+ /// Please note that some AsmPrinter implementations may not call
+ /// beginFunction at all.
+ void endFunction(const MachineFunction *MF) override;
+
+ /// Process beginning of an instruction.
+ void beginInstruction(const MachineInstr *MI) override {}
+
+ /// Process end of an instruction.
+ void endInstruction() override {}
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
new file mode 100644
index 000000000000..6d6432b61f2d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -0,0 +1,1345 @@
+//===-- CodeGen/AsmPrinter/WinException.cpp - Windows Exception Impl ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Win64 exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WinException.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+WinException::WinException(AsmPrinter *A) : EHStreamer(A) {
+ // MSVC's EH tables are always composed of 32-bit words. All known 64-bit
+ // platforms use an imagerel32 relocation to refer to symbols.
+ useImageRel32 = (A->getDataLayout().getPointerSizeInBits() == 64);
+ isAArch64 = Asm->TM.getTargetTriple().isAArch64();
+ isThumb = Asm->TM.getTargetTriple().isThumb();
+}
+
+WinException::~WinException() = default;
+
+/// endModule - Emit all exception information that should come after the
+/// content.
+void WinException::endModule() {
+ auto &OS = *Asm->OutStreamer;
+ const Module *M = MMI->getModule();
+ for (const Function &F : *M)
+ if (F.hasFnAttribute("safeseh"))
+ OS.emitCOFFSafeSEH(Asm->getSymbol(&F));
+
+ if (M->getModuleFlag("ehcontguard") && !EHContTargets.empty()) {
+ // Emit the symbol index of each ehcont target.
+ OS.switchSection(Asm->OutContext.getObjectFileInfo()->getGEHContSection());
+ for (const MCSymbol *S : EHContTargets) {
+ OS.emitCOFFSymbolIndex(S);
+ }
+ }
+}
+
+void WinException::beginFunction(const MachineFunction *MF) {
+ shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
+
+ // If any landing pads survive, we need an EH table.
+ bool hasLandingPads = !MF->getLandingPads().empty();
+ bool hasEHFunclets = MF->hasEHFunclets();
+
+ const Function &F = MF->getFunction();
+
+ shouldEmitMoves = Asm->needsSEHMoves() && MF->hasWinCFI();
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
+
+ EHPersonality Per = EHPersonality::Unknown;
+ const Function *PerFn = nullptr;
+ if (F.hasPersonalityFn()) {
+ PerFn = dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+ Per = classifyEHPersonality(PerFn);
+ }
+
+ bool forceEmitPersonality = F.hasPersonalityFn() &&
+ !isNoOpWithoutInvoke(Per) &&
+ F.needsUnwindTableEntry();
+
+ shouldEmitPersonality =
+ forceEmitPersonality || ((hasLandingPads || hasEHFunclets) &&
+ PerEncoding != dwarf::DW_EH_PE_omit && PerFn);
+
+ unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+ shouldEmitLSDA = shouldEmitPersonality &&
+ LSDAEncoding != dwarf::DW_EH_PE_omit;
+
+ // If we're not using CFI, we don't want the CFI or the personality, but we
+ // might want EH tables if we had EH pads.
+ if (!Asm->MAI->usesWindowsCFI()) {
+ if (Per == EHPersonality::MSVC_X86SEH && !hasEHFunclets) {
+ // If this is 32-bit SEH and we don't have any funclets (really invokes),
+ // make sure we emit the parent offset label. Some unreferenced filter
+ // functions may still refer to it.
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+ StringRef FLinkageName =
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction().getName());
+ emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName);
+ }
+ shouldEmitLSDA = hasEHFunclets;
+ shouldEmitPersonality = false;
+ return;
+ }
+
+ beginFunclet(MF->front(), Asm->CurrentFnSym);
+}
+
+void WinException::markFunctionEnd() {
+ if (isAArch64 && CurrentFuncletEntry &&
+ (shouldEmitMoves || shouldEmitPersonality))
+ Asm->OutStreamer->emitWinCFIFuncletOrFuncEnd();
+}
+
+/// endFunction - Gather and emit post-function exception information.
+///
+void WinException::endFunction(const MachineFunction *MF) {
+ if (!shouldEmitPersonality && !shouldEmitMoves && !shouldEmitLSDA)
+ return;
+
+ const Function &F = MF->getFunction();
+ EHPersonality Per = EHPersonality::Unknown;
+ if (F.hasPersonalityFn())
+ Per = classifyEHPersonality(F.getPersonalityFn()->stripPointerCasts());
+
+ endFuncletImpl();
+
+ // endFunclet will emit the necessary .xdata tables for table-based SEH.
+ if (Per == EHPersonality::MSVC_TableSEH && MF->hasEHFunclets())
+ return;
+
+ if (shouldEmitPersonality || shouldEmitLSDA) {
+ Asm->OutStreamer->pushSection();
+
+ // Just switch sections to the right xdata section.
+ MCSection *XData = Asm->OutStreamer->getAssociatedXDataSection(
+ Asm->OutStreamer->getCurrentSectionOnly());
+ Asm->OutStreamer->switchSection(XData);
+
+ // Emit the tables appropriate to the personality function in use. If we
+ // don't recognize the personality, assume it uses an Itanium-style LSDA.
+ if (Per == EHPersonality::MSVC_TableSEH)
+ emitCSpecificHandlerTable(MF);
+ else if (Per == EHPersonality::MSVC_X86SEH)
+ emitExceptHandlerTable(MF);
+ else if (Per == EHPersonality::MSVC_CXX)
+ emitCXXFrameHandler3Table(MF);
+ else if (Per == EHPersonality::CoreCLR)
+ emitCLRExceptionTable(MF);
+ else
+ emitExceptionTable();
+
+ Asm->OutStreamer->popSection();
+ }
+
+ if (!MF->getCatchretTargets().empty()) {
+ // Copy the function's catchret targets to a module-level list.
+ EHContTargets.insert(EHContTargets.end(), MF->getCatchretTargets().begin(),
+ MF->getCatchretTargets().end());
+ }
+}
+
+/// Retrieve the MCSymbol naming the funclet (catch or cleanup handler) whose
+/// entry block is the given MachineBasicBlock.
+static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm,
+ const MachineBasicBlock *MBB) {
+ if (!MBB)
+ return nullptr;
+
+ assert(MBB->isEHFuncletEntry());
+
+ // Give catches and cleanups a name based on their parent function and
+ // their funclet entry block's number.
+ const MachineFunction *MF = MBB->getParent();
+ const Function &F = MF->getFunction();
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F.getName());
+ MCContext &Ctx = MF->getContext();
+ StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch";
+ return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" +
+ Twine(MBB->getNumber()) + "@?0?" +
+ FuncLinkageName + "@4HA");
+}
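+
+// For example, a catch funclet whose entry block is #3 in a function with
+// linkage name "foo" (name chosen for illustration) is given the symbol
+// "?catch$3@?0?foo@4HA", and a cleanup funclet at the same position would be
+// "?dtor$3@?0?foo@4HA".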
+
+void WinException::beginFunclet(const MachineBasicBlock &MBB,
+ MCSymbol *Sym) {
+ CurrentFuncletEntry = &MBB;
+
+ const Function &F = Asm->MF->getFunction();
+ // If a symbol was not provided for the funclet, invent one.
+ if (!Sym) {
+ Sym = getMCSymbolForMBB(Asm, &MBB);
+
+ // Describe our funclet symbol as a function with internal linkage.
+ Asm->OutStreamer->beginCOFFSymbolDef(Sym);
+ Asm->OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+ Asm->OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+ << COFF::SCT_COMPLEX_TYPE_SHIFT);
+ Asm->OutStreamer->endCOFFSymbolDef();
+
+ // We want our funclet's entry point to be aligned such that no nops will be
+ // present after the label.
+ Asm->emitAlignment(std::max(Asm->MF->getAlignment(), MBB.getAlignment()),
+ &F);
+
+ // Now that we've emitted the alignment directive, point at our funclet.
+ Asm->OutStreamer->emitLabel(Sym);
+ }
+
+ // Mark 'Sym' as starting our funclet.
+ if (shouldEmitMoves || shouldEmitPersonality) {
+ CurrentFuncletTextSection = Asm->OutStreamer->getCurrentSectionOnly();
+ Asm->OutStreamer->emitWinCFIStartProc(Sym);
+ }
+
+ if (shouldEmitPersonality) {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ const Function *PerFn = nullptr;
+
+ // Determine which personality routine we are using for this funclet.
+ if (F.hasPersonalityFn())
+ PerFn = dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+ const MCSymbol *PersHandlerSym =
+ TLOF.getCFIPersonalitySymbol(PerFn, Asm->TM, MMI);
+
+ // Do not emit a .seh_handler directive for cleanup funclets.
+ // FIXME: This means cleanup funclets cannot handle exceptions. Given that
+ // Clang doesn't produce EH constructs inside cleanup funclets and LLVM's
+ // inliner doesn't allow inlining them, this isn't a major problem in
+ // practice.
+ if (!CurrentFuncletEntry->isCleanupFuncletEntry())
+ Asm->OutStreamer->emitWinEHHandler(PersHandlerSym, true, true);
+ }
+}
+
+void WinException::endFunclet() {
+ if (isAArch64 && CurrentFuncletEntry &&
+ (shouldEmitMoves || shouldEmitPersonality)) {
+ Asm->OutStreamer->switchSection(CurrentFuncletTextSection);
+ Asm->OutStreamer->emitWinCFIFuncletOrFuncEnd();
+ }
+ endFuncletImpl();
+}
+
+void WinException::endFuncletImpl() {
+ // No funclet to process? Great, we have nothing to do.
+ if (!CurrentFuncletEntry)
+ return;
+
+ const MachineFunction *MF = Asm->MF;
+ if (shouldEmitMoves || shouldEmitPersonality) {
+ const Function &F = MF->getFunction();
+ EHPersonality Per = EHPersonality::Unknown;
+ if (F.hasPersonalityFn())
+ Per = classifyEHPersonality(F.getPersonalityFn()->stripPointerCasts());
+
+ if (Per == EHPersonality::MSVC_CXX && shouldEmitPersonality &&
+ !CurrentFuncletEntry->isCleanupFuncletEntry()) {
+ // Emit an UNWIND_INFO struct describing the prologue.
+ Asm->OutStreamer->emitWinEHHandlerData();
+
+ // If this is a C++ catch funclet (or the parent function),
+ // emit a reference to the LSDA for the parent function.
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F.getName());
+ MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol(
+ Twine("$cppxdata$", FuncLinkageName));
+ Asm->OutStreamer->emitValue(create32bitRef(FuncInfoXData), 4);
+ } else if (Per == EHPersonality::MSVC_TableSEH && MF->hasEHFunclets() &&
+ !CurrentFuncletEntry->isEHFuncletEntry()) {
+ // Emit an UNWIND_INFO struct describing the prologue.
+ Asm->OutStreamer->emitWinEHHandlerData();
+
+ // If this is the parent function in Win64 SEH, emit the LSDA immediately
+ // following .seh_handlerdata.
+ emitCSpecificHandlerTable(MF);
+ } else if (shouldEmitPersonality || shouldEmitLSDA) {
+ // Emit an UNWIND_INFO struct describing the prologue.
+ Asm->OutStreamer->emitWinEHHandlerData();
+ // In these cases, no further info is written to the .xdata section
+ // right here, but is written by e.g. emitExceptionTable in endFunction()
+ // above.
+ } else {
+ // No need to emit the EH handler data right here if nothing needs
+ // writing to the .xdata section; it will be emitted for all
+ // functions that need it in the end anyway.
+ }
+
+ // Switch back to the funclet start .text section now that we are done
+ // writing to .xdata, and emit an .seh_endproc directive to mark the end of
+ // the function.
+ Asm->OutStreamer->switchSection(CurrentFuncletTextSection);
+ Asm->OutStreamer->emitWinCFIEndProc();
+ }
+
+ // Let's make sure we don't try to end the same funclet twice.
+ CurrentFuncletEntry = nullptr;
+}
+
+const MCExpr *WinException::create32bitRef(const MCSymbol *Value) {
+ if (!Value)
+ return MCConstantExpr::create(0, Asm->OutContext);
+ return MCSymbolRefExpr::create(Value, useImageRel32
+ ? MCSymbolRefExpr::VK_COFF_IMGREL32
+ : MCSymbolRefExpr::VK_None,
+ Asm->OutContext);
+}
+
+const MCExpr *WinException::create32bitRef(const GlobalValue *GV) {
+ if (!GV)
+ return MCConstantExpr::create(0, Asm->OutContext);
+ return create32bitRef(Asm->getSymbol(GV));
+}
+
+const MCExpr *WinException::getLabel(const MCSymbol *Label) {
+ return MCSymbolRefExpr::create(Label, MCSymbolRefExpr::VK_COFF_IMGREL32,
+ Asm->OutContext);
+}
+
+const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) {
+ return MCBinaryExpr::createAdd(getLabel(Label),
+ MCConstantExpr::create(1, Asm->OutContext),
+ Asm->OutContext);
+}
+
+const MCExpr *WinException::getOffset(const MCSymbol *OffsetOf,
+ const MCSymbol *OffsetFrom) {
+ return MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(OffsetOf, Asm->OutContext),
+ MCSymbolRefExpr::create(OffsetFrom, Asm->OutContext), Asm->OutContext);
+}
+
+const MCExpr *WinException::getOffsetPlusOne(const MCSymbol *OffsetOf,
+ const MCSymbol *OffsetFrom) {
+ return MCBinaryExpr::createAdd(getOffset(OffsetOf, OffsetFrom),
+ MCConstantExpr::create(1, Asm->OutContext),
+ Asm->OutContext);
+}
+
+int WinException::getFrameIndexOffset(int FrameIndex,
+ const WinEHFuncInfo &FuncInfo) {
+ const TargetFrameLowering &TFI = *Asm->MF->getSubtarget().getFrameLowering();
+ Register UnusedReg;
+ if (Asm->MAI->usesWindowsCFI()) {
+ StackOffset Offset =
+ TFI.getFrameIndexReferencePreferSP(*Asm->MF, FrameIndex, UnusedReg,
+ /*IgnoreSPUpdates*/ true);
+ assert(UnusedReg ==
+ Asm->MF->getSubtarget()
+ .getTargetLowering()
+ ->getStackPointerRegisterToSaveRestore());
+ return Offset.getFixed();
+ }
+
+ // For 32-bit, offsets should be relative to the end of the EH registration
+ // node. For 64-bit, it's relative to SP at the end of the prologue.
+ assert(FuncInfo.EHRegNodeEndOffset != INT_MAX);
+ StackOffset Offset = TFI.getFrameIndexReference(*Asm->MF, FrameIndex, UnusedReg);
+ Offset += StackOffset::getFixed(FuncInfo.EHRegNodeEndOffset);
+ assert(!Offset.getScalable() &&
+ "Frame offsets with a scalable component are not supported");
+ return Offset.getFixed();
+}
+
+namespace {
+
+/// Top-level state used to represent unwind to caller
+const int NullState = -1;
+
+struct InvokeStateChange {
+ /// EH Label immediately after the last invoke in the previous state, or
+ /// nullptr if the previous state was the null state.
+ const MCSymbol *PreviousEndLabel;
+
+ /// EH label immediately before the first invoke in the new state, or nullptr
+ /// if the new state is the null state.
+ const MCSymbol *NewStartLabel;
+
+ /// State of the invoke following NewStartLabel, or NullState to indicate
+ /// the presence of calls which may unwind to caller.
+ int NewState;
+};
+
+/// Iterator that reports all the invoke state changes in a range of machine
+/// basic blocks. Changes to the null state are reported whenever a call that
+/// may unwind to caller is encountered. The MBB range is expected to be an
+/// entire function or funclet, and the start and end of the range are treated
+/// as being in the NullState even if there's not an unwind-to-caller call
+/// before the first invoke or after the last one (i.e., the first state change
+/// reported is the first change to something other than NullState, and a
+/// change back to NullState is always reported at the end of iteration).
+class InvokeStateChangeIterator {
+ InvokeStateChangeIterator(const WinEHFuncInfo &EHInfo,
+ MachineFunction::const_iterator MFI,
+ MachineFunction::const_iterator MFE,
+ MachineBasicBlock::const_iterator MBBI,
+ int BaseState)
+ : EHInfo(EHInfo), MFI(MFI), MFE(MFE), MBBI(MBBI), BaseState(BaseState) {
+ LastStateChange.PreviousEndLabel = nullptr;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ scan();
+ }
+
+public:
+ static iterator_range<InvokeStateChangeIterator>
+ range(const WinEHFuncInfo &EHInfo, MachineFunction::const_iterator Begin,
+ MachineFunction::const_iterator End, int BaseState = NullState) {
+ // Reject empty ranges to simplify bookkeeping by ensuring that we can get
+ // the end of the last block.
+ assert(Begin != End);
+ auto BlockBegin = Begin->begin();
+ auto BlockEnd = std::prev(End)->end();
+ return make_range(
+ InvokeStateChangeIterator(EHInfo, Begin, End, BlockBegin, BaseState),
+ InvokeStateChangeIterator(EHInfo, End, End, BlockEnd, BaseState));
+ }
+
+ // Iterator methods.
+ bool operator==(const InvokeStateChangeIterator &O) const {
+ assert(BaseState == O.BaseState);
+ // Must be visiting same block.
+ if (MFI != O.MFI)
+ return false;
+ // Must be visiting the same instr.
+ if (MBBI != O.MBBI)
+ return false;
+ // At end of block/instr iteration, we can still have two distinct states:
+ // one to report the final EndLabel, and another indicating the end of the
+ // state change iteration. Check for CurrentEndLabel equality to
+ // distinguish these.
+ return CurrentEndLabel == O.CurrentEndLabel;
+ }
+
+ bool operator!=(const InvokeStateChangeIterator &O) const {
+ return !operator==(O);
+ }
+ InvokeStateChange &operator*() { return LastStateChange; }
+ InvokeStateChange *operator->() { return &LastStateChange; }
+ InvokeStateChangeIterator &operator++() { return scan(); }
+
+private:
+ InvokeStateChangeIterator &scan();
+
+ const WinEHFuncInfo &EHInfo;
+ const MCSymbol *CurrentEndLabel = nullptr;
+ MachineFunction::const_iterator MFI;
+ MachineFunction::const_iterator MFE;
+ MachineBasicBlock::const_iterator MBBI;
+ InvokeStateChange LastStateChange;
+ bool VisitingInvoke = false;
+ int BaseState;
+};
+
+} // end anonymous namespace
+
+InvokeStateChangeIterator &InvokeStateChangeIterator::scan() {
+ bool IsNewBlock = false;
+ for (; MFI != MFE; ++MFI, IsNewBlock = true) {
+ if (IsNewBlock)
+ MBBI = MFI->begin();
+ for (auto MBBE = MFI->end(); MBBI != MBBE; ++MBBI) {
+ const MachineInstr &MI = *MBBI;
+ if (!VisitingInvoke && LastStateChange.NewState != BaseState &&
+ MI.isCall() && !EHStreamer::callToNoUnwindFunction(&MI)) {
+ // Indicate a change of state to the null state. We don't have
+ // start/end EH labels handy but the caller won't expect them for
+ // null state regions.
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ CurrentEndLabel = nullptr;
+ // Don't re-visit this instr on the next scan
+ ++MBBI;
+ return *this;
+ }
+
+ // All other state changes are at EH labels before/after invokes.
+ if (!MI.isEHLabel())
+ continue;
+ MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+ if (Label == CurrentEndLabel) {
+ VisitingInvoke = false;
+ continue;
+ }
+ auto InvokeMapIter = EHInfo.LabelToStateMap.find(Label);
+ // Ignore EH labels that aren't the ones inserted before an invoke
+ if (InvokeMapIter == EHInfo.LabelToStateMap.end())
+ continue;
+ auto &StateAndEnd = InvokeMapIter->second;
+ int NewState = StateAndEnd.first;
+ // Keep track of the fact that we're between EH start/end labels so
+ // we know not to treat the invoke we'll see as unwinding to the caller.
+ VisitingInvoke = true;
+ if (NewState == LastStateChange.NewState) {
+ // The state isn't actually changing here. Record the new end and
+ // keep going.
+ CurrentEndLabel = StateAndEnd.second;
+ continue;
+ }
+ // Found a state change to report
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = Label;
+ LastStateChange.NewState = NewState;
+ // Start keeping track of the new current end
+ CurrentEndLabel = StateAndEnd.second;
+ // Don't re-visit this instr on the next scan
+ ++MBBI;
+ return *this;
+ }
+ }
+ // Iteration hit the end of the block range.
+ if (LastStateChange.NewState != BaseState) {
+ // Report the end of the last new state
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ // Leave CurrentEndLabel non-null to distinguish this state from end.
+ assert(CurrentEndLabel != nullptr);
+ return *this;
+ }
+ // We've reported all state changes and hit the end state.
+ CurrentEndLabel = nullptr;
+ return *this;
+}
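+
+// As a rough example of the sequence this iterator produces: for a range
+// containing an invoke in state 0, then a plain call that may unwind to the
+// caller, then an invoke in state 1, iteration reports the state changes
+// (null -> 0), (0 -> null), (null -> 1), and finally (1 -> null) when the
+// end of the range is reached.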
+
+/// Emit the language-specific data that __C_specific_handler expects. This
+/// handler lives in the x64 Microsoft C runtime and allows catching or cleaning
+/// up after faults with __try, __except, and __finally. The typeinfo values
+/// are not really RTTI data, but pointers to filter functions that return an
+/// integer (1, 0, or -1) indicating how to handle the exception. For __finally
+/// blocks and other cleanups, the landing pad label is zero, and the filter
+/// function is actually a cleanup handler with the same prototype. A catch-all
+/// entry is modeled with a null filter function field and a non-zero landing
+/// pad label.
+///
+/// Possible filter function return values:
+/// EXCEPTION_EXECUTE_HANDLER (1):
+/// Jump to the landing pad label after cleanups.
+/// EXCEPTION_CONTINUE_SEARCH (0):
+/// Continue searching this table or continue unwinding.
+/// EXCEPTION_CONTINUE_EXECUTION (-1):
+/// Resume execution at the trapping PC.
+///
+/// Inferred table structure:
+/// struct Table {
+/// int NumEntries;
+/// struct Entry {
+/// imagerel32 LabelStart; // Inclusive
+/// imagerel32 LabelEnd; // Exclusive
+/// imagerel32 FilterOrFinally; // One means catch-all.
+/// imagerel32 LabelLPad; // Zero means __finally.
+/// } Entries[NumEntries];
+/// };
+void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
+ auto &OS = *Asm->OutStreamer;
+ MCContext &Ctx = Asm->OutContext;
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+
+ bool VerboseAsm = OS.isVerboseAsm();
+ auto AddComment = [&](const Twine &Comment) {
+ if (VerboseAsm)
+ OS.AddComment(Comment);
+ };
+
+ if (!isAArch64) {
+ // Emit a label assignment with the SEH frame offset so we can use it for
+ // llvm.eh.recoverfp.
+ StringRef FLinkageName =
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction().getName());
+ MCSymbol *ParentFrameOffset =
+ Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
+ const MCExpr *MCOffset =
+ MCConstantExpr::create(FuncInfo.SEHSetFrameOffset, Ctx);
+ Asm->OutStreamer->emitAssignment(ParentFrameOffset, MCOffset);
+ }
+
+ // Use the assembler to compute the number of table entries through label
+ // difference and division.
+ MCSymbol *TableBegin =
+ Ctx.createTempSymbol("lsda_begin", /*AlwaysAddSuffix=*/true);
+ MCSymbol *TableEnd =
+ Ctx.createTempSymbol("lsda_end", /*AlwaysAddSuffix=*/true);
+ const MCExpr *LabelDiff = getOffset(TableEnd, TableBegin);
+ const MCExpr *EntrySize = MCConstantExpr::create(16, Ctx);
+ const MCExpr *EntryCount = MCBinaryExpr::createDiv(LabelDiff, EntrySize, Ctx);
+ AddComment("Number of call sites");
+ OS.emitValue(EntryCount, 4);
+
+ OS.emitLabel(TableBegin);
+
+ // Iterate over all the invoke try ranges. Unlike MSVC, LLVM currently only
+ // models exceptions from invokes. LLVM also allows arbitrary reordering of
+ // the code, so our tables end up looking a bit different. Rather than
+ // trying to match MSVC's tables exactly, we emit a denormalized table. For
+ // each range of invokes in the same state, we emit table entries for all
+ // the actions that would be taken in that state. This means our tables are
+ // slightly bigger, which is OK.
+ const MCSymbol *LastStartLabel = nullptr;
+ int LastEHState = -1;
+ // Break out before we enter into a finally funclet.
+ // FIXME: We need to emit separate EH tables for cleanups.
+ MachineFunction::const_iterator End = MF->end();
+ MachineFunction::const_iterator Stop = std::next(MF->begin());
+ while (Stop != End && !Stop->isEHFuncletEntry())
+ ++Stop;
+ for (const auto &StateChange :
+ InvokeStateChangeIterator::range(FuncInfo, MF->begin(), Stop)) {
+ // Emit all the actions for the state we just transitioned out of
+ // if it was not the null state
+ if (LastEHState != -1)
+ emitSEHActionsForRange(FuncInfo, LastStartLabel,
+ StateChange.PreviousEndLabel, LastEHState);
+ LastStartLabel = StateChange.NewStartLabel;
+ LastEHState = StateChange.NewState;
+ }
+
+ OS.emitLabel(TableEnd);
+}
+
+void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
+ const MCSymbol *BeginLabel,
+ const MCSymbol *EndLabel, int State) {
+ auto &OS = *Asm->OutStreamer;
+ MCContext &Ctx = Asm->OutContext;
+ bool VerboseAsm = OS.isVerboseAsm();
+ auto AddComment = [&](const Twine &Comment) {
+ if (VerboseAsm)
+ OS.AddComment(Comment);
+ };
+
+ assert(BeginLabel && EndLabel);
+ while (State != -1) {
+ const SEHUnwindMapEntry &UME = FuncInfo.SEHUnwindMap[State];
+ const MCExpr *FilterOrFinally;
+ const MCExpr *ExceptOrNull;
+ auto *Handler = cast<MachineBasicBlock *>(UME.Handler);
+ if (UME.IsFinally) {
+ FilterOrFinally = create32bitRef(getMCSymbolForMBB(Asm, Handler));
+ ExceptOrNull = MCConstantExpr::create(0, Ctx);
+ } else {
+ // For an except, the filter can be 1 (catch-all) or a function
+ // label.
+ FilterOrFinally = UME.Filter ? create32bitRef(UME.Filter)
+ : MCConstantExpr::create(1, Ctx);
+ ExceptOrNull = create32bitRef(Handler->getSymbol());
+ }
+
+ AddComment("LabelStart");
+ OS.emitValue(getLabel(BeginLabel), 4);
+ AddComment("LabelEnd");
+ OS.emitValue(getLabelPlusOne(EndLabel), 4);
+ AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction"
+ : "CatchAll");
+ OS.emitValue(FilterOrFinally, 4);
+ AddComment(UME.IsFinally ? "Null" : "ExceptionHandler");
+ OS.emitValue(ExceptOrNull, 4);
+
+ assert(UME.ToState < State && "states should decrease");
+ State = UME.ToState;
+ }
+}
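+
+// For instance, if an invoke range is in state 2 and the unwind map chains
+// 2 -> 0 -> -1, the loop above emits two table entries covering the same
+// [BeginLabel, EndLabel) range: one for state 2 and one for state 0. This is
+// the denormalization described in emitCSpecificHandlerTable.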
+
+void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
+ const Function &F = MF->getFunction();
+ auto &OS = *Asm->OutStreamer;
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F.getName());
+
+ SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable;
+ MCSymbol *FuncInfoXData = nullptr;
+ if (shouldEmitPersonality) {
+ // If we're 64-bit, emit a pointer to the C++ EH data, and build a map from
+ // IPs to state numbers.
+ FuncInfoXData =
+ Asm->OutContext.getOrCreateSymbol(Twine("$cppxdata$", FuncLinkageName));
+ computeIP2StateTable(MF, FuncInfo, IPToStateTable);
+ } else {
+ FuncInfoXData = Asm->OutContext.getOrCreateLSDASymbol(FuncLinkageName);
+ }
+
+ int UnwindHelpOffset = 0;
+ // TODO: The check for UnwindHelpFrameIdx against max() below (and the
+ // second check further below) can be removed once MS C++ unwinding is
+ // implemented for ARM and test/CodeGen/ARM/Windows/wineh-basic.ll passes
+ // without the check.
+ if (Asm->MAI->usesWindowsCFI() &&
+ FuncInfo.UnwindHelpFrameIdx != std::numeric_limits<int>::max())
+ UnwindHelpOffset =
+ getFrameIndexOffset(FuncInfo.UnwindHelpFrameIdx, FuncInfo);
+
+ MCSymbol *UnwindMapXData = nullptr;
+ MCSymbol *TryBlockMapXData = nullptr;
+ MCSymbol *IPToStateXData = nullptr;
+ if (!FuncInfo.CxxUnwindMap.empty())
+ UnwindMapXData = Asm->OutContext.getOrCreateSymbol(
+ Twine("$stateUnwindMap$", FuncLinkageName));
+ if (!FuncInfo.TryBlockMap.empty())
+ TryBlockMapXData =
+ Asm->OutContext.getOrCreateSymbol(Twine("$tryMap$", FuncLinkageName));
+ if (!IPToStateTable.empty())
+ IPToStateXData =
+ Asm->OutContext.getOrCreateSymbol(Twine("$ip2state$", FuncLinkageName));
+
+ bool VerboseAsm = OS.isVerboseAsm();
+ auto AddComment = [&](const Twine &Comment) {
+ if (VerboseAsm)
+ OS.AddComment(Comment);
+ };
+
+ // FuncInfo {
+ // uint32_t MagicNumber
+ // int32_t MaxState;
+ // UnwindMapEntry *UnwindMap;
+ // uint32_t NumTryBlocks;
+ // TryBlockMapEntry *TryBlockMap;
+ // uint32_t IPMapEntries; // always 0 for x86
+ // IPToStateMapEntry *IPToStateMap; // always 0 for x86
+ // uint32_t UnwindHelp; // non-x86 only
+ // ESTypeList *ESTypeList;
+ // int32_t EHFlags;
+ // }
+ // EHFlags & 1 -> Synchronous exceptions only, no async exceptions.
+ // EHFlags & 2 -> ???
+ // EHFlags & 4 -> The function is noexcept(true), unwinding can't continue.
+ OS.emitValueToAlignment(Align(4));
+ OS.emitLabel(FuncInfoXData);
+
+ AddComment("MagicNumber");
+ OS.emitInt32(0x19930522);
+
+ AddComment("MaxState");
+ OS.emitInt32(FuncInfo.CxxUnwindMap.size());
+
+ AddComment("UnwindMap");
+ OS.emitValue(create32bitRef(UnwindMapXData), 4);
+
+ AddComment("NumTryBlocks");
+ OS.emitInt32(FuncInfo.TryBlockMap.size());
+
+ AddComment("TryBlockMap");
+ OS.emitValue(create32bitRef(TryBlockMapXData), 4);
+
+ AddComment("IPMapEntries");
+ OS.emitInt32(IPToStateTable.size());
+
+ AddComment("IPToStateXData");
+ OS.emitValue(create32bitRef(IPToStateXData), 4);
+
+ if (Asm->MAI->usesWindowsCFI() &&
+ FuncInfo.UnwindHelpFrameIdx != std::numeric_limits<int>::max()) {
+ AddComment("UnwindHelp");
+ OS.emitInt32(UnwindHelpOffset);
+ }
+
+ AddComment("ESTypeList");
+ OS.emitInt32(0);
+
+ AddComment("EHFlags");
+ if (MMI->getModule()->getModuleFlag("eh-asynch")) {
+ OS.emitInt32(0);
+ } else {
+ OS.emitInt32(1);
+ }
+
+ // UnwindMapEntry {
+ // int32_t ToState;
+ // void (*Action)();
+ // };
+ if (UnwindMapXData) {
+ OS.emitLabel(UnwindMapXData);
+ for (const CxxUnwindMapEntry &UME : FuncInfo.CxxUnwindMap) {
+ MCSymbol *CleanupSym = getMCSymbolForMBB(
+ Asm, dyn_cast_if_present<MachineBasicBlock *>(UME.Cleanup));
+ AddComment("ToState");
+ OS.emitInt32(UME.ToState);
+
+ AddComment("Action");
+ OS.emitValue(create32bitRef(CleanupSym), 4);
+ }
+ }
+
+ // TryBlockMap {
+ // int32_t TryLow;
+ // int32_t TryHigh;
+ // int32_t CatchHigh;
+ // int32_t NumCatches;
+ // HandlerType *HandlerArray;
+ // };
+ if (TryBlockMapXData) {
+ OS.emitLabel(TryBlockMapXData);
+ SmallVector<MCSymbol *, 1> HandlerMaps;
+ for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) {
+ const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I];
+
+ MCSymbol *HandlerMapXData = nullptr;
+ if (!TBME.HandlerArray.empty())
+ HandlerMapXData =
+ Asm->OutContext.getOrCreateSymbol(Twine("$handlerMap$")
+ .concat(Twine(I))
+ .concat("$")
+ .concat(FuncLinkageName));
+ HandlerMaps.push_back(HandlerMapXData);
+
+ // TBMEs should form intervals.
+ assert(0 <= TBME.TryLow && "bad trymap interval");
+ assert(TBME.TryLow <= TBME.TryHigh && "bad trymap interval");
+ assert(TBME.TryHigh < TBME.CatchHigh && "bad trymap interval");
+ assert(TBME.CatchHigh < int(FuncInfo.CxxUnwindMap.size()) &&
+ "bad trymap interval");
+
+ AddComment("TryLow");
+ OS.emitInt32(TBME.TryLow);
+
+ AddComment("TryHigh");
+ OS.emitInt32(TBME.TryHigh);
+
+ AddComment("CatchHigh");
+ OS.emitInt32(TBME.CatchHigh);
+
+ AddComment("NumCatches");
+ OS.emitInt32(TBME.HandlerArray.size());
+
+ AddComment("HandlerArray");
+ OS.emitValue(create32bitRef(HandlerMapXData), 4);
+ }
+
+ // All funclets use the same parent frame offset currently.
+ unsigned ParentFrameOffset = 0;
+ if (shouldEmitPersonality) {
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ ParentFrameOffset = TFI->getWinEHParentFrameOffset(*MF);
+ }
+
+ for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) {
+ const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I];
+ MCSymbol *HandlerMapXData = HandlerMaps[I];
+ if (!HandlerMapXData)
+ continue;
+ // HandlerType {
+ // int32_t Adjectives;
+ // TypeDescriptor *Type;
+ // int32_t CatchObjOffset;
+ // void (*Handler)();
+ // int32_t ParentFrameOffset; // x64 and AArch64 only
+ // };
+ OS.emitLabel(HandlerMapXData);
+ for (const WinEHHandlerType &HT : TBME.HandlerArray) {
+ // Get the frame escape label with the offset of the catch object. If
+ // the index is INT_MAX, then there is no catch object, and we should
+ // emit an offset of zero, indicating that no copy will occur.
+ const MCExpr *FrameAllocOffsetRef = nullptr;
+ if (HT.CatchObj.FrameIndex != INT_MAX) {
+ int Offset = getFrameIndexOffset(HT.CatchObj.FrameIndex, FuncInfo);
+ assert(Offset != 0 && "Illegal offset for catch object!");
+ FrameAllocOffsetRef = MCConstantExpr::create(Offset, Asm->OutContext);
+ } else {
+ FrameAllocOffsetRef = MCConstantExpr::create(0, Asm->OutContext);
+ }
+
+ MCSymbol *HandlerSym = getMCSymbolForMBB(
+ Asm, dyn_cast_if_present<MachineBasicBlock *>(HT.Handler));
+
+ AddComment("Adjectives");
+ OS.emitInt32(HT.Adjectives);
+
+ AddComment("Type");
+ OS.emitValue(create32bitRef(HT.TypeDescriptor), 4);
+
+ AddComment("CatchObjOffset");
+ OS.emitValue(FrameAllocOffsetRef, 4);
+
+ AddComment("Handler");
+ OS.emitValue(create32bitRef(HandlerSym), 4);
+
+ if (shouldEmitPersonality) {
+ AddComment("ParentFrameOffset");
+ OS.emitInt32(ParentFrameOffset);
+ }
+ }
+ }
+ }
+
+ // IPToStateMapEntry {
+ // void *IP;
+ // int32_t State;
+ // };
+ if (IPToStateXData) {
+ OS.emitLabel(IPToStateXData);
+ for (auto &IPStatePair : IPToStateTable) {
+ AddComment("IP");
+ OS.emitValue(IPStatePair.first, 4);
+ AddComment("ToState");
+ OS.emitInt32(IPStatePair.second);
+ }
+ }
+}
+
+void WinException::computeIP2StateTable(
+ const MachineFunction *MF, const WinEHFuncInfo &FuncInfo,
+ SmallVectorImpl<std::pair<const MCExpr *, int>> &IPToStateTable) {
+
+ for (MachineFunction::const_iterator FuncletStart = MF->begin(),
+ FuncletEnd = MF->begin(),
+ End = MF->end();
+ FuncletStart != End; FuncletStart = FuncletEnd) {
+ // Find the end of the funclet
+ while (++FuncletEnd != End) {
+ if (FuncletEnd->isEHFuncletEntry()) {
+ break;
+ }
+ }
+
+ // Don't emit ip2state entries for cleanup funclets. Any interesting
+ // exceptional actions in cleanups must be handled in a separate IR
+ // function.
+ if (FuncletStart->isCleanupFuncletEntry())
+ continue;
+
+ MCSymbol *StartLabel;
+ int BaseState;
+ if (FuncletStart == MF->begin()) {
+ BaseState = NullState;
+ StartLabel = Asm->getFunctionBegin();
+ } else {
+ auto *FuncletPad =
+ cast<FuncletPadInst>(FuncletStart->getBasicBlock()->getFirstNonPHI());
+ assert(FuncInfo.FuncletBaseStateMap.count(FuncletPad) != 0);
+ BaseState = FuncInfo.FuncletBaseStateMap.find(FuncletPad)->second;
+ StartLabel = getMCSymbolForMBB(Asm, &*FuncletStart);
+ }
+ assert(StartLabel && "need local function start label");
+ IPToStateTable.push_back(
+ std::make_pair(create32bitRef(StartLabel), BaseState));
+
+ for (const auto &StateChange : InvokeStateChangeIterator::range(
+ FuncInfo, FuncletStart, FuncletEnd, BaseState)) {
+ // Compute the label to report as the start of this entry: use the EH
+ // start label of the invoke if we have one; otherwise this is a call that
+ // may unwind to our caller and has no EH start label, so use the previous
+ // end label.
+ const MCSymbol *ChangeLabel = StateChange.NewStartLabel;
+ if (!ChangeLabel)
+ ChangeLabel = StateChange.PreviousEndLabel;
+ // Emit an entry indicating that PCs after 'Label' have this EH state.
+ // NOTE: On ARM architectures, the StateFromIp lookup automatically takes
+ // into account that the return address is after the call instruction
+ // (whose EH state we should be using), but on other platforms we need to
+ // add 1 to the label so that we use the correct EH state.
+ const MCExpr *LabelExpression = (isAArch64 || isThumb)
+ ? getLabel(ChangeLabel)
+ : getLabelPlusOne(ChangeLabel);
+ IPToStateTable.push_back(
+ std::make_pair(LabelExpression, StateChange.NewState));
+ // FIXME: assert that NewState is between CatchLow and CatchHigh.
+ }
+ }
+}
+
+void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
+ StringRef FLinkageName) {
+ // Outlined helpers called by the EH runtime need to know the offset of the
+ // EH registration in order to recover the parent frame pointer. Now that we
+ // have code-generated the parent, we can emit the label assignment that
+ // those helpers use to get the offset of the registration node.
+
+ // Compute the parent frame offset. The EHRegNodeFrameIndex will be invalid if
+ // after optimization all the invokes were eliminated. We still need to emit
+ // the parent frame offset label, but it should be garbage and should never be
+ // used.
+ int64_t Offset = 0;
+ int FI = FuncInfo.EHRegNodeFrameIndex;
+ if (FI != INT_MAX) {
+ const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
+ Offset = TFI->getNonLocalFrameIndexReference(*Asm->MF, FI).getFixed();
+ }
+
+ MCContext &Ctx = Asm->OutContext;
+ MCSymbol *ParentFrameOffset =
+ Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
+ Asm->OutStreamer->emitAssignment(ParentFrameOffset,
+ MCConstantExpr::create(Offset, Ctx));
+}
+
+/// Emit the language-specific data that _except_handler3 and 4 expect. This is
+/// functionally equivalent to the __C_specific_handler table, except it is
+/// indexed by state number instead of IP.
+void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
+ MCStreamer &OS = *Asm->OutStreamer;
+ const Function &F = MF->getFunction();
+ StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F.getName());
+
+ bool VerboseAsm = OS.isVerboseAsm();
+ auto AddComment = [&](const Twine &Comment) {
+ if (VerboseAsm)
+ OS.AddComment(Comment);
+ };
+
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+ emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName);
+
+ // Emit the __ehtable label that we use for llvm.x86.seh.lsda.
+ MCSymbol *LSDALabel = Asm->OutContext.getOrCreateLSDASymbol(FLinkageName);
+ OS.emitValueToAlignment(Align(4));
+ OS.emitLabel(LSDALabel);
+
+ const auto *Per = cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+ StringRef PerName = Per->getName();
+ int BaseState = -1;
+ if (PerName == "_except_handler4") {
+ // The LSDA for _except_handler4 starts with this struct, followed by the
+ // scope table:
+ //
+ // struct EH4ScopeTable {
+ // int32_t GSCookieOffset;
+ // int32_t GSCookieXOROffset;
+ // int32_t EHCookieOffset;
+ // int32_t EHCookieXOROffset;
+ // ScopeTableEntry ScopeRecord[];
+ // };
+ //
+ // Offsets are %ebp relative.
+ //
+ // The GS cookie is present only if the function needs stack protection.
+ // GSCookieOffset = -2 means that GS cookie is not used.
+ //
+ // The EH cookie is always present.
+ //
+ // Check is done the following way:
+ // (ebp+CookieXOROffset) ^ [ebp+CookieOffset] == _security_cookie
+
+ // Retrieve the Guard Stack slot.
+ int GSCookieOffset = -2;
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ if (MFI.hasStackProtectorIndex()) {
+ Register UnusedReg;
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ int SSPIdx = MFI.getStackProtectorIndex();
+ GSCookieOffset =
+ TFI->getFrameIndexReference(*MF, SSPIdx, UnusedReg).getFixed();
+ }
+
+ // Retrieve the EH Guard slot.
+ // TODO(etienneb): Get rid of this value and replace it with an assertion.
+ int EHCookieOffset = 9999;
+ if (FuncInfo.EHGuardFrameIndex != INT_MAX) {
+ Register UnusedReg;
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ int EHGuardIdx = FuncInfo.EHGuardFrameIndex;
+ EHCookieOffset =
+ TFI->getFrameIndexReference(*MF, EHGuardIdx, UnusedReg).getFixed();
+ }
+
+ AddComment("GSCookieOffset");
+ OS.emitInt32(GSCookieOffset);
+ AddComment("GSCookieXOROffset");
+ OS.emitInt32(0);
+ AddComment("EHCookieOffset");
+ OS.emitInt32(EHCookieOffset);
+ AddComment("EHCookieXOROffset");
+ OS.emitInt32(0);
+ BaseState = -2;
+ }
+
+ assert(!FuncInfo.SEHUnwindMap.empty());
+ for (const SEHUnwindMapEntry &UME : FuncInfo.SEHUnwindMap) {
+ auto *Handler = cast<MachineBasicBlock *>(UME.Handler);
+ const MCSymbol *ExceptOrFinally =
+ UME.IsFinally ? getMCSymbolForMBB(Asm, Handler) : Handler->getSymbol();
+ // -1 is usually the base state for "unwind to caller", but for
+ // _except_handler4 it's -2. Do that replacement here if necessary.
+ int ToState = UME.ToState == -1 ? BaseState : UME.ToState;
+ AddComment("ToState");
+ OS.emitInt32(ToState);
+ AddComment(UME.IsFinally ? "Null" : "FilterFunction");
+ OS.emitValue(create32bitRef(UME.Filter), 4);
+ AddComment(UME.IsFinally ? "FinallyFunclet" : "ExceptionHandler");
+ OS.emitValue(create32bitRef(ExceptOrFinally), 4);
+ }
+}
+
+static int getTryRank(const WinEHFuncInfo &FuncInfo, int State) {
+ int Rank = 0;
+ while (State != -1) {
+ ++Rank;
+ State = FuncInfo.ClrEHUnwindMap[State].TryParentState;
+ }
+ return Rank;
+}
+
+static int getTryAncestor(const WinEHFuncInfo &FuncInfo, int Left, int Right) {
+ int LeftRank = getTryRank(FuncInfo, Left);
+ int RightRank = getTryRank(FuncInfo, Right);
+
+ while (LeftRank < RightRank) {
+ Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState;
+ --RightRank;
+ }
+
+ while (RightRank < LeftRank) {
+ Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState;
+ --LeftRank;
+ }
+
+ while (Left != Right) {
+ Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState;
+ Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState;
+ }
+
+ return Left;
+}
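+
+// Worked example with hypothetical TryParentState chains 5 -> 3 -> 1 -> -1
+// and 4 -> 1 -> -1: getTryRank returns 3 and 2 respectively, so the deeper
+// side is first walked up from 5 to 3, and then both sides step upward in
+// lockstep until they meet at the common ancestor state 1.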
+
+void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
+ // CLR EH "states" are really just IDs that identify handlers/funclets;
+ // states, handlers, and funclets all have 1:1 mappings between them, and a
+ // handler/funclet's "state" is its index in the ClrEHUnwindMap.
+ MCStreamer &OS = *Asm->OutStreamer;
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+ MCSymbol *FuncBeginSym = Asm->getFunctionBegin();
+ MCSymbol *FuncEndSym = Asm->getFunctionEnd();
+
+ // A ClrClause describes a protected region.
+ struct ClrClause {
+ const MCSymbol *StartLabel; // Start of protected region
+ const MCSymbol *EndLabel; // End of protected region
+ int State; // Index of handler protecting the protected region
+ int EnclosingState; // Index of funclet enclosing the protected region
+ };
+ SmallVector<ClrClause, 8> Clauses;
+
+ // Build a map from handler MBBs to their corresponding states (i.e. their
+ // indices in the ClrEHUnwindMap).
+ int NumStates = FuncInfo.ClrEHUnwindMap.size();
+ assert(NumStates > 0 && "Don't need exception table!");
+ DenseMap<const MachineBasicBlock *, int> HandlerStates;
+ for (int State = 0; State < NumStates; ++State) {
+ MachineBasicBlock *HandlerBlock =
+ cast<MachineBasicBlock *>(FuncInfo.ClrEHUnwindMap[State].Handler);
+ HandlerStates[HandlerBlock] = State;
+ // Use this loop through all handlers to verify our assumption (used in
+ // the MinEnclosingState computation) that enclosing funclets have lower
+ // state numbers than their enclosed funclets.
+ assert(FuncInfo.ClrEHUnwindMap[State].HandlerParentState < State &&
+ "ill-formed state numbering");
+ }
+ // Map the main function to the NullState.
+ HandlerStates[&MF->front()] = NullState;
+
+ // Write out a sentinel indicating the end of the standard (Windows) xdata
+ // and the start of the additional (CLR) info.
+ OS.emitInt32(0xffffffff);
+ // Write out the number of funclets
+ OS.emitInt32(NumStates);
+
+ // Walk the machine blocks/instrs, computing and emitting a few things:
+ // 1. Emit a list of the offsets to each handler entry, in lexical order.
+ // 2. Compute a map (EndSymbolMap) from each funclet to the symbol at its end.
+ // 3. Compute the list of ClrClauses, in the required order (inner before
+ // outer, earlier before later; the order by which a forward scan with
+ // early termination will find the innermost enclosing clause covering
+ // a given address).
+ // 4. A map (MinClauseMap) from each handler index to the index of the
+ // outermost funclet/function which contains a try clause targeting the
+ // key handler. This will be used to determine IsDuplicate-ness when
+ // emitting ClrClauses. The NullState value is used to indicate that the
+ // top-level function contains a try clause targeting the key handler.
+ // HandlerStack is a stack of (PendingStartLabel, PendingState) pairs for
+ // try regions we entered before entering the PendingState try but which
+ // we haven't yet exited.
+ SmallVector<std::pair<const MCSymbol *, int>, 4> HandlerStack;
+ // EndSymbolMap and MinClauseMap are maps described above.
+ std::unique_ptr<MCSymbol *[]> EndSymbolMap(new MCSymbol *[NumStates]);
+ SmallVector<int, 4> MinClauseMap((size_t)NumStates, NumStates);
+
+ // Visit the root function and each funclet.
+ for (MachineFunction::const_iterator FuncletStart = MF->begin(),
+ FuncletEnd = MF->begin(),
+ End = MF->end();
+ FuncletStart != End; FuncletStart = FuncletEnd) {
+ int FuncletState = HandlerStates[&*FuncletStart];
+ // Find the end of the funclet
+ MCSymbol *EndSymbol = FuncEndSym;
+ while (++FuncletEnd != End) {
+ if (FuncletEnd->isEHFuncletEntry()) {
+ EndSymbol = getMCSymbolForMBB(Asm, &*FuncletEnd);
+ break;
+ }
+ }
+ // Emit the function/funclet end and, if this is a funclet (and not the
+ // root function), record it in the EndSymbolMap.
+ OS.emitValue(getOffset(EndSymbol, FuncBeginSym), 4);
+ if (FuncletState != NullState) {
+ // Record the end of the handler.
+ EndSymbolMap[FuncletState] = EndSymbol;
+ }
+
+ // Walk the state changes in this function/funclet and compute its clauses.
+ // Funclets always start in the null state.
+ const MCSymbol *CurrentStartLabel = nullptr;
+ int CurrentState = NullState;
+ assert(HandlerStack.empty());
+ for (const auto &StateChange :
+ InvokeStateChangeIterator::range(FuncInfo, FuncletStart, FuncletEnd)) {
+ // Close any try regions we're not still under
+ int StillPendingState =
+ getTryAncestor(FuncInfo, CurrentState, StateChange.NewState);
+ while (CurrentState != StillPendingState) {
+ assert(CurrentState != NullState &&
+ "Failed to find still-pending state!");
+ // Close the pending clause
+ Clauses.push_back({CurrentStartLabel, StateChange.PreviousEndLabel,
+ CurrentState, FuncletState});
+ // Now the next-outer try region is current
+ CurrentState = FuncInfo.ClrEHUnwindMap[CurrentState].TryParentState;
+ // Pop the new start label from the handler stack if we've exited all
+ // inner try regions of the corresponding try region.
+ if (HandlerStack.back().second == CurrentState)
+ CurrentStartLabel = HandlerStack.pop_back_val().first;
+ }
+
+ if (StateChange.NewState != CurrentState) {
+ // For each clause we're starting, update the MinClauseMap so we can
+ // know which is the topmost funclet containing a clause targeting
+ // it.
+ for (int EnteredState = StateChange.NewState;
+ EnteredState != CurrentState;
+ EnteredState =
+ FuncInfo.ClrEHUnwindMap[EnteredState].TryParentState) {
+ int &MinEnclosingState = MinClauseMap[EnteredState];
+ if (FuncletState < MinEnclosingState)
+ MinEnclosingState = FuncletState;
+ }
+ // Save the previous current start/label on the stack and update to
+ // the newly-current start/state.
+ HandlerStack.emplace_back(CurrentStartLabel, CurrentState);
+ CurrentStartLabel = StateChange.NewStartLabel;
+ CurrentState = StateChange.NewState;
+ }
+ }
+ assert(HandlerStack.empty());
+ }
+
+ // Now emit the clause info, starting with the number of clauses.
+ OS.emitInt32(Clauses.size());
+ for (ClrClause &Clause : Clauses) {
+ // Emit a CORINFO_EH_CLAUSE :
+ /*
+ struct CORINFO_EH_CLAUSE
+ {
+ CORINFO_EH_CLAUSE_FLAGS Flags; // actually a CorExceptionFlag
+ DWORD TryOffset;
+ DWORD TryLength; // actually TryEndOffset
+ DWORD HandlerOffset;
+ DWORD HandlerLength; // actually HandlerEndOffset
+ union
+ {
+ DWORD ClassToken; // use for catch clauses
+ DWORD FilterOffset; // use for filter clauses
+ };
+ };
+
+ enum CORINFO_EH_CLAUSE_FLAGS
+ {
+ CORINFO_EH_CLAUSE_NONE = 0,
+ CORINFO_EH_CLAUSE_FILTER = 0x0001, // This clause is for a filter
+ CORINFO_EH_CLAUSE_FINALLY = 0x0002, // This clause is a finally clause
+ CORINFO_EH_CLAUSE_FAULT = 0x0004, // This clause is a fault clause
+ };
+ typedef enum CorExceptionFlag
+ {
+ COR_ILEXCEPTION_CLAUSE_NONE,
+ COR_ILEXCEPTION_CLAUSE_FILTER = 0x0001, // This is a filter clause
+ COR_ILEXCEPTION_CLAUSE_FINALLY = 0x0002, // This is a finally clause
+ COR_ILEXCEPTION_CLAUSE_FAULT = 0x0004, // This is a fault clause
+ COR_ILEXCEPTION_CLAUSE_DUPLICATED = 0x0008, // duplicated clause. This
+ // clause was duplicated
+ // to a funclet which was
+ // pulled out of line
+ } CorExceptionFlag;
+ */
+ // Add 1 to the start/end of the EH clause; the IP associated with a
+ // call when the runtime does its scan is the IP of the next instruction
+ // (the one to which control will return after the call), so we need
+ // to add 1 to the end of the clause to cover that offset. We also add
+ // 1 to the start of the clause to make sure that the ranges reported
+ // for all clauses are disjoint. Note that we'll need some additional
+ // logic when machine traps are supported, since in that case the IP
+ // that the runtime uses is the offset of the faulting instruction
+ // itself; if such an instruction immediately follows a call but the
+ // two belong to different clauses, we'll need to insert a nop between
+ // them so the runtime can distinguish the point to which the call will
+ // return from the point at which the fault occurs.
+
+ const MCExpr *ClauseBegin =
+ getOffsetPlusOne(Clause.StartLabel, FuncBeginSym);
+ const MCExpr *ClauseEnd = getOffsetPlusOne(Clause.EndLabel, FuncBeginSym);
+
+ const ClrEHUnwindMapEntry &Entry = FuncInfo.ClrEHUnwindMap[Clause.State];
+ MachineBasicBlock *HandlerBlock = cast<MachineBasicBlock *>(Entry.Handler);
+ MCSymbol *BeginSym = getMCSymbolForMBB(Asm, HandlerBlock);
+ const MCExpr *HandlerBegin = getOffset(BeginSym, FuncBeginSym);
+ MCSymbol *EndSym = EndSymbolMap[Clause.State];
+ const MCExpr *HandlerEnd = getOffset(EndSym, FuncBeginSym);
+
+ uint32_t Flags = 0;
+ switch (Entry.HandlerType) {
+ case ClrHandlerType::Catch:
+ // Leaving bits 0-2 clear indicates catch.
+ break;
+ case ClrHandlerType::Filter:
+ Flags |= 1;
+ break;
+ case ClrHandlerType::Finally:
+ Flags |= 2;
+ break;
+ case ClrHandlerType::Fault:
+ Flags |= 4;
+ break;
+ }
+ if (Clause.EnclosingState != MinClauseMap[Clause.State]) {
+ // This is a "duplicate" clause; the handler needs to be entered from a
+ // frame above the one holding the invoke.
+ assert(Clause.EnclosingState > MinClauseMap[Clause.State]);
+ Flags |= 8;
+ }
+ OS.emitInt32(Flags);
+
+ // Write the clause start/end
+ OS.emitValue(ClauseBegin, 4);
+ OS.emitValue(ClauseEnd, 4);
+
+ // Write out the handler start/end
+ OS.emitValue(HandlerBegin, 4);
+ OS.emitValue(HandlerEnd, 4);
+
+ // Write out the type token or filter offset
+ assert(Entry.HandlerType != ClrHandlerType::Filter && "NYI: filters");
+ OS.emitInt32(Entry.TypeToken);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinException.h b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinException.h
new file mode 100644
index 000000000000..638589adf0dd
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AsmPrinter/WinException.h
@@ -0,0 +1,121 @@
+//===-- WinException.h - Windows Exception Handling ----------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Windows exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WIN64EXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WIN64EXCEPTION_H
+
+#include "EHStreamer.h"
+#include <vector>
+
+namespace llvm {
+class GlobalValue;
+class MachineFunction;
+class MCExpr;
+class MCSection;
+struct WinEHFuncInfo;
+
+class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
+ /// Per-function flag to indicate if personality info should be emitted.
+ bool shouldEmitPersonality = false;
+
+ /// Per-function flag to indicate if the LSDA should be emitted.
+ bool shouldEmitLSDA = false;
+
+ /// Per-function flag to indicate if frame moves info should be emitted.
+ bool shouldEmitMoves = false;
+
+ /// True if this is a 64-bit target and we should use image relative offsets.
+ bool useImageRel32 = false;
+
+ /// True if we are generating exception handling on Windows for ARM64.
+ bool isAArch64 = false;
+
+ /// True if we are generating exception handling on Windows for ARM (Thumb).
+ bool isThumb = false;
+
+ /// Pointer to the current funclet entry BB.
+ const MachineBasicBlock *CurrentFuncletEntry = nullptr;
+
+ /// The section of the last funclet start.
+ MCSection *CurrentFuncletTextSection = nullptr;
+
+ /// The list of symbols to add to the ehcont section
+ std::vector<const MCSymbol *> EHContTargets;
+
+ void emitCSpecificHandlerTable(const MachineFunction *MF);
+
+ void emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
+ const MCSymbol *BeginLabel,
+ const MCSymbol *EndLabel, int State);
+
+ /// Emit the EH table data for 32-bit and 64-bit functions using
+ /// the __CxxFrameHandler3 personality.
+ void emitCXXFrameHandler3Table(const MachineFunction *MF);
+
+ /// Emit the EH table data for _except_handler3 and _except_handler4
+ /// personality functions. These are only used on 32-bit and do not use CFI
+ /// tables.
+ void emitExceptHandlerTable(const MachineFunction *MF);
+
+ void emitCLRExceptionTable(const MachineFunction *MF);
+
+ void computeIP2StateTable(
+ const MachineFunction *MF, const WinEHFuncInfo &FuncInfo,
+ SmallVectorImpl<std::pair<const MCExpr *, int>> &IPToStateTable);
+
+ /// Emits the label used with llvm.eh.recoverfp, which is used by
+ /// outlined funclets.
+ void emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
+ StringRef FLinkageName);
+
+ const MCExpr *create32bitRef(const MCSymbol *Value);
+ const MCExpr *create32bitRef(const GlobalValue *GV);
+ const MCExpr *getLabel(const MCSymbol *Label);
+ const MCExpr *getLabelPlusOne(const MCSymbol *Label);
+ const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom);
+ const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf,
+ const MCSymbol *OffsetFrom);
+
+ /// Gets the offset that we should use in a table for a stack object with the
+ /// given index. For targets using CFI (Win64, etc), this is relative to the
+ /// established SP at the end of the prologue. For targets without CFI (Win32
+ /// only), it is relative to the frame pointer.
+ int getFrameIndexOffset(int FrameIndex, const WinEHFuncInfo &FuncInfo);
+
+ void endFuncletImpl();
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ WinException(AsmPrinter *A);
+ ~WinException() override;
+
+ /// Emit all exception information that should come after the content.
+ void endModule() override;
+
+ /// Gather pre-function exception information. Assumes being emitted
+ /// immediately after the function entry point.
+ void beginFunction(const MachineFunction *MF) override;
+
+ void markFunctionEnd() override;
+
+ /// Gather and emit post-function exception information.
+ void endFunction(const MachineFunction *) override;
+
+ /// Emit target-specific EH funclet machinery.
+ void beginFunclet(const MachineBasicBlock &MBB, MCSymbol *Sym) override;
+ void endFunclet() override;
+};
+}
+
+#endif
+
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
new file mode 100644
index 000000000000..5ef850d09d92
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -0,0 +1,2576 @@
+#include "llvm/CodeGen/AssignmentTrackingAnalysis.h"
+#include "LiveDebugValues/LiveDebugValues.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/Analysis/Interval.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PrintPasses.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <assert.h>
+#include <cstdint>
+#include <optional>
+#include <sstream>
+#include <unordered_map>
+
+using namespace llvm;
+#define DEBUG_TYPE "debug-ata"
+
+STATISTIC(NumDefsScanned, "Number of dbg locs that get scanned for removal");
+STATISTIC(NumDefsRemoved, "Number of dbg locs removed");
+STATISTIC(NumWedgesScanned, "Number of dbg wedges scanned");
+STATISTIC(NumWedgesChanged, "Number of dbg wedges changed");
+
+static cl::opt<unsigned>
+ MaxNumBlocks("debug-ata-max-blocks", cl::init(10000),
+ cl::desc("Maximum num basic blocks before debug info dropped"),
+ cl::Hidden);
+/// Option for debugging the pass; determines if the memory location fragment
+/// filling happens after generating the variable locations.
+static cl::opt<bool> EnableMemLocFragFill("mem-loc-frag-fill", cl::init(true),
+ cl::Hidden);
+/// Print the results of the analysis. Respects -filter-print-funcs.
+static cl::opt<bool> PrintResults("print-debug-ata", cl::init(false),
+ cl::Hidden);
+
+/// Coalesce adjacent dbg locs describing memory locations that have contiguous
+/// fragments. This reduces the cost of LiveDebugValues which does SSA
+/// construction for each explicitly stated variable fragment.
+static cl::opt<cl::boolOrDefault>
+ CoalesceAdjacentFragmentsOpt("debug-ata-coalesce-frags", cl::Hidden);
+
+// Implicit conversions are disabled for enum class types, so unfortunately we
+// need to create a DenseMapInfo wrapper around the specified underlying type.
+template <> struct llvm::DenseMapInfo<VariableID> {
+ using Wrapped = DenseMapInfo<unsigned>;
+ static inline VariableID getEmptyKey() {
+ return static_cast<VariableID>(Wrapped::getEmptyKey());
+ }
+ static inline VariableID getTombstoneKey() {
+ return static_cast<VariableID>(Wrapped::getTombstoneKey());
+ }
+ static unsigned getHashValue(const VariableID &Val) {
+ return Wrapped::getHashValue(static_cast<unsigned>(Val));
+ }
+ static bool isEqual(const VariableID &LHS, const VariableID &RHS) {
+ return LHS == RHS;
+ }
+};
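+
+// Illustrative note (not upstream code): with the specialization above in
+// place, VariableID can be used directly as a DenseMap key; the wrapper just
+// forwards to DenseMapInfo<unsigned> for the empty/tombstone sentinels and
+// the hash. A minimal, hypothetical usage sketch:
+//
+//   llvm::DenseMap<VariableID, unsigned> UseCounts;
+//   void countUse(VariableID ID) { ++UseCounts[ID]; }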
+
+/// Helper class to build FunctionVarLocs, since that class isn't easy to
+/// modify. TODO: There's not a great deal of value in the split; it could be
+/// worth merging the two classes.
+class FunctionVarLocsBuilder {
+ friend FunctionVarLocs;
+ UniqueVector<DebugVariable> Variables;
+ // Use an unordered_map so we don't invalidate iterators after
+ // insert/modifications.
+ std::unordered_map<const Instruction *, SmallVector<VarLocInfo>>
+ VarLocsBeforeInst;
+
+ SmallVector<VarLocInfo> SingleLocVars;
+
+public:
+ unsigned getNumVariables() const { return Variables.size(); }
+
+ /// Find or insert \p V and return the ID.
+ VariableID insertVariable(DebugVariable V) {
+ return static_cast<VariableID>(Variables.insert(V));
+ }
+
+ /// Get a variable from its \p ID.
+ const DebugVariable &getVariable(VariableID ID) const {
+ return Variables[static_cast<unsigned>(ID)];
+ }
+
+  /// Return ptr to wedge of defs or nullptr if no defs come just before \p
+ /// Before.
+ const SmallVectorImpl<VarLocInfo> *getWedge(const Instruction *Before) const {
+ auto R = VarLocsBeforeInst.find(Before);
+ if (R == VarLocsBeforeInst.end())
+ return nullptr;
+ return &R->second;
+ }
+
+  /// Replace the defs that come just before \p Before with \p Wedge.
+ void setWedge(const Instruction *Before, SmallVector<VarLocInfo> &&Wedge) {
+ VarLocsBeforeInst[Before] = std::move(Wedge);
+ }
+
+ /// Add a def for a variable that is valid for its lifetime.
+ void addSingleLocVar(DebugVariable Var, DIExpression *Expr, DebugLoc DL,
+ RawLocationWrapper R) {
+ VarLocInfo VarLoc;
+ VarLoc.VariableID = insertVariable(Var);
+ VarLoc.Expr = Expr;
+ VarLoc.DL = DL;
+ VarLoc.Values = R;
+ SingleLocVars.emplace_back(VarLoc);
+ }
+
+  /// Add a def to the wedge of defs just before \p Before.
+ void addVarLoc(Instruction *Before, DebugVariable Var, DIExpression *Expr,
+ DebugLoc DL, RawLocationWrapper R) {
+ VarLocInfo VarLoc;
+ VarLoc.VariableID = insertVariable(Var);
+ VarLoc.Expr = Expr;
+ VarLoc.DL = DL;
+ VarLoc.Values = R;
+ VarLocsBeforeInst[Before].emplace_back(VarLoc);
+ }
+};
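+
+// Illustrative sketch (hypothetical, not upstream code): a client of the
+// builder records per-instruction location definitions roughly like this,
+// after which FunctionVarLocs::init flattens the per-instruction wedges into
+// a single vector indexed by [start, end) pairs.
+//
+//   void recordDef(FunctionVarLocsBuilder &Builder, Instruction *Before,
+//                  const DbgValueInst *DVI) {
+//     Builder.addVarLoc(Before, DebugVariable(DVI), DVI->getExpression(),
+//                       DVI->getDebugLoc(),
+//                       RawLocationWrapper(DVI->getRawLocation()));
+//   }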
+
+void FunctionVarLocs::print(raw_ostream &OS, const Function &Fn) const {
+ // Print the variable table first. TODO: Sorting by variable could make the
+ // output more stable?
+ unsigned Counter = -1;
+ OS << "=== Variables ===\n";
+ for (const DebugVariable &V : Variables) {
+ ++Counter;
+ // Skip first entry because it is a dummy entry.
+ if (Counter == 0) {
+ continue;
+ }
+ OS << "[" << Counter << "] " << V.getVariable()->getName();
+ if (auto F = V.getFragment())
+ OS << " bits [" << F->OffsetInBits << ", "
+ << F->OffsetInBits + F->SizeInBits << ")";
+ if (const auto *IA = V.getInlinedAt())
+ OS << " inlined-at " << *IA;
+ OS << "\n";
+ }
+
+ auto PrintLoc = [&OS](const VarLocInfo &Loc) {
+ OS << "DEF Var=[" << (unsigned)Loc.VariableID << "]"
+ << " Expr=" << *Loc.Expr << " Values=(";
+ for (auto *Op : Loc.Values.location_ops()) {
+      OS << Op->getName() << " ";
+    }
+    OS << ")\n";
+ };
+
+ // Print the single location variables.
+ OS << "=== Single location vars ===\n";
+ for (auto It = single_locs_begin(), End = single_locs_end(); It != End;
+ ++It) {
+ PrintLoc(*It);
+ }
+
+ // Print the non-single-location defs in line with IR.
+ OS << "=== In-line variable defs ===";
+ for (const BasicBlock &BB : Fn) {
+ OS << "\n" << BB.getName() << ":\n";
+ for (const Instruction &I : BB) {
+ for (auto It = locs_begin(&I), End = locs_end(&I); It != End; ++It) {
+ PrintLoc(*It);
+ }
+ OS << I << "\n";
+ }
+ }
+}
+
+void FunctionVarLocs::init(FunctionVarLocsBuilder &Builder) {
+ // Add the single-location variables first.
+ for (const auto &VarLoc : Builder.SingleLocVars)
+ VarLocRecords.emplace_back(VarLoc);
+ // Mark the end of the section.
+ SingleVarLocEnd = VarLocRecords.size();
+
+ // Insert a contiguous block of VarLocInfos for each instruction, mapping it
+ // to the start and end position in the vector with VarLocsBeforeInst.
+ for (auto &P : Builder.VarLocsBeforeInst) {
+ unsigned BlockStart = VarLocRecords.size();
+ for (const VarLocInfo &VarLoc : P.second)
+ VarLocRecords.emplace_back(VarLoc);
+ unsigned BlockEnd = VarLocRecords.size();
+ // Record the start and end indices.
+ if (BlockEnd != BlockStart)
+ VarLocsBeforeInst[P.first] = {BlockStart, BlockEnd};
+ }
+
+ // Copy the Variables vector from the builder's UniqueVector.
+ assert(Variables.empty() && "Expect clear before init");
+  // UniqueVector's IDs are one-based (which means the VarLocInfo VarID values
+ // are one-based) so reserve an extra and insert a dummy.
+ Variables.reserve(Builder.Variables.size() + 1);
+ Variables.push_back(DebugVariable(nullptr, std::nullopt, nullptr));
+ Variables.append(Builder.Variables.begin(), Builder.Variables.end());
+}
+
+void FunctionVarLocs::clear() {
+ Variables.clear();
+ VarLocRecords.clear();
+ VarLocsBeforeInst.clear();
+ SingleVarLocEnd = 0;
+}
+
+/// Walk backwards along constant GEPs and bitcasts to the base storage from \p
+/// Start as far as possible. Prepend \p Expression with the offset and append
+/// it with a DW_OP_deref that has been implicit until now. Returns the
+/// walked-to value and modified expression.
+static std::pair<Value *, DIExpression *>
+walkToAllocaAndPrependOffsetDeref(const DataLayout &DL, Value *Start,
+ DIExpression *Expression) {
+ APInt OffsetInBytes(DL.getTypeSizeInBits(Start->getType()), false);
+ Value *End =
+ Start->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetInBytes);
+ SmallVector<uint64_t, 3> Ops;
+ if (OffsetInBytes.getBoolValue()) {
+ Ops = {dwarf::DW_OP_plus_uconst, OffsetInBytes.getZExtValue()};
+ Expression = DIExpression::prependOpcodes(
+ Expression, Ops, /*StackValue=*/false, /*EntryValue=*/false);
+ }
+ Expression = DIExpression::append(Expression, {dwarf::DW_OP_deref});
+ return {End, Expression};
+}
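+
+// Worked example (illustrative only): given IR such as
+//
+//   %a = alloca { i32, i32 }
+//   %f1 = getelementptr inbounds { i32, i32 }, ptr %a, i32 0, i32 1
+//
+// walkToAllocaAndPrependOffsetDeref(DL, %f1, !DIExpression()) returns
+// {%a, !DIExpression(DW_OP_plus_uconst, 4, DW_OP_deref)}: the 4-byte constant
+// GEP offset is folded into the expression and the previously implicit
+// dereference is made explicit.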
+
+/// Extract the offset used in \p DIExpr. Returns std::nullopt if the expression
+/// doesn't explicitly describe a memory location with DW_OP_deref or if the
+/// expression is too complex to interpret.
+static std::optional<int64_t>
+getDerefOffsetInBytes(const DIExpression *DIExpr) {
+ int64_t Offset = 0;
+ const unsigned NumElements = DIExpr->getNumElements();
+ const auto Elements = DIExpr->getElements();
+ unsigned ExpectedDerefIdx = 0;
+ // Extract the offset.
+ if (NumElements > 2 && Elements[0] == dwarf::DW_OP_plus_uconst) {
+ Offset = Elements[1];
+ ExpectedDerefIdx = 2;
+ } else if (NumElements > 3 && Elements[0] == dwarf::DW_OP_constu) {
+ ExpectedDerefIdx = 3;
+ if (Elements[2] == dwarf::DW_OP_plus)
+ Offset = Elements[1];
+ else if (Elements[2] == dwarf::DW_OP_minus)
+ Offset = -Elements[1];
+ else
+ return std::nullopt;
+ }
+
+ // If that's all there is it means there's no deref.
+ if (ExpectedDerefIdx >= NumElements)
+ return std::nullopt;
+
+ // Check the next element is DW_OP_deref - otherwise this is too complex or
+ // isn't a deref expression.
+ if (Elements[ExpectedDerefIdx] != dwarf::DW_OP_deref)
+ return std::nullopt;
+
+ // Check the final operation is either the DW_OP_deref or is a fragment.
+ if (NumElements == ExpectedDerefIdx + 1)
+ return Offset; // Ends with deref.
+ unsigned ExpectedFragFirstIdx = ExpectedDerefIdx + 1;
+ unsigned ExpectedFragFinalIdx = ExpectedFragFirstIdx + 2;
+ if (NumElements == ExpectedFragFinalIdx + 1 &&
+ Elements[ExpectedFragFirstIdx] == dwarf::DW_OP_LLVM_fragment)
+ return Offset; // Ends with deref + fragment.
+
+ // Don't bother trying to interpret anything more complex.
+ return std::nullopt;
+}
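+
+// Illustrative examples (not upstream code) of what getDerefOffsetInBytes
+// returns for a few expressions:
+//
+//   !DIExpression(DW_OP_deref)                                 --> 0
+//   !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref)           --> 8
+//   !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref)   --> -8
+//   !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref,
+//                 DW_OP_LLVM_fragment, 0, 32)                  --> 8
+//   !DIExpression(DW_OP_plus_uconst, 8)                        --> std::nullopt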
+
+/// A whole (unfragmented) source variable.
+using DebugAggregate = std::pair<const DILocalVariable *, const DILocation *>;
+static DebugAggregate getAggregate(const DbgVariableIntrinsic *DII) {
+ return DebugAggregate(DII->getVariable(), DII->getDebugLoc().getInlinedAt());
+}
+static DebugAggregate getAggregate(const DebugVariable &Var) {
+ return DebugAggregate(Var.getVariable(), Var.getInlinedAt());
+}
+
+static bool shouldCoalesceFragments(Function &F) {
+ // Enabling fragment coalescing reduces compiler run time when instruction
+ // referencing is enabled. However, it may cause LiveDebugVariables to create
+ // incorrect locations. Since instruction-referencing mode effectively
+ // bypasses LiveDebugVariables we only enable coalescing if the cl::opt flag
+ // has not been explicitly set and instruction-referencing is turned on.
+ switch (CoalesceAdjacentFragmentsOpt) {
+ case cl::boolOrDefault::BOU_UNSET:
+ return debuginfoShouldUseDebugInstrRef(
+ Triple(F.getParent()->getTargetTriple()));
+ case cl::boolOrDefault::BOU_TRUE:
+ return true;
+ case cl::boolOrDefault::BOU_FALSE:
+ return false;
+ }
+ llvm_unreachable("Unknown boolOrDefault value");
+}
+
+namespace {
+/// In dwarf emission, the following sequence
+/// 1. dbg.value ... Fragment(0, 64)
+/// 2. dbg.value ... Fragment(0, 32)
+/// effectively sets Fragment(32, 32) to undef (each def sets all bits not in
+/// the intersection of the fragments to having "no location"). This makes
+/// sense for implicit location values because splitting the computed values
+/// could be troublesome, and is probably quite uncommon. When we convert
+/// dbg.assigns to dbg.value+deref this kind of thing is common, and describing
+/// a location (memory) rather than a value means we don't need to worry about
+/// splitting any values, so we try to recover the rest of the fragment
+/// location here.
+/// This class performs a(nother) dataflow analysis over the function, adding
+/// variable locations so that any bits of a variable with a memory location
+/// have that location explicitly reinstated at each subsequent variable
+/// location definition that doesn't overwrite those bits. i.e. after a
+/// variable location def, insert new defs for the memory location with
+/// fragments for the difference of "all bits currently in memory" and "the
+/// fragment of the second def".
+class MemLocFragmentFill {
+ Function &Fn;
+ FunctionVarLocsBuilder *FnVarLocs;
+ const DenseSet<DebugAggregate> *VarsWithStackSlot;
+ bool CoalesceAdjacentFragments;
+
+ // 0 = no memory location.
+ using BaseAddress = unsigned;
+ using OffsetInBitsTy = unsigned;
+ using FragTraits = IntervalMapHalfOpenInfo<OffsetInBitsTy>;
+ using FragsInMemMap = IntervalMap<
+ OffsetInBitsTy, BaseAddress,
+ IntervalMapImpl::NodeSizer<OffsetInBitsTy, BaseAddress>::LeafSize,
+ FragTraits>;
+ FragsInMemMap::Allocator IntervalMapAlloc;
+ using VarFragMap = DenseMap<unsigned, FragsInMemMap>;
+
+ /// IDs for memory location base addresses in maps. Use 0 to indicate that
+ /// there's no memory location.
+ UniqueVector<RawLocationWrapper> Bases;
+ UniqueVector<DebugAggregate> Aggregates;
+ DenseMap<const BasicBlock *, VarFragMap> LiveIn;
+ DenseMap<const BasicBlock *, VarFragMap> LiveOut;
+
+ struct FragMemLoc {
+ unsigned Var;
+ unsigned Base;
+ unsigned OffsetInBits;
+ unsigned SizeInBits;
+ DebugLoc DL;
+ };
+ using InsertMap = MapVector<Instruction *, SmallVector<FragMemLoc>>;
+
+ /// BBInsertBeforeMap holds a description for the set of location defs to be
+ /// inserted after the analysis is complete. It is updated during the dataflow
+ /// and the entry for a block is CLEARED each time it is (re-)visited. After
+ /// the dataflow is complete, each block entry will contain the set of defs
+ /// calculated during the final (fixed-point) iteration.
+ DenseMap<const BasicBlock *, InsertMap> BBInsertBeforeMap;
+
+ static bool intervalMapsAreEqual(const FragsInMemMap &A,
+ const FragsInMemMap &B) {
+ auto AIt = A.begin(), AEnd = A.end();
+ auto BIt = B.begin(), BEnd = B.end();
+ for (; AIt != AEnd; ++AIt, ++BIt) {
+ if (BIt == BEnd)
+ return false; // B has fewer elements than A.
+ if (AIt.start() != BIt.start() || AIt.stop() != BIt.stop())
+ return false; // Interval is different.
+ if (*AIt != *BIt)
+ return false; // Value at interval is different.
+ }
+ // AIt == AEnd. Check BIt is also now at end.
+ return BIt == BEnd;
+ }
+
+ static bool varFragMapsAreEqual(const VarFragMap &A, const VarFragMap &B) {
+ if (A.size() != B.size())
+ return false;
+ for (const auto &APair : A) {
+ auto BIt = B.find(APair.first);
+ if (BIt == B.end())
+ return false;
+ if (!intervalMapsAreEqual(APair.second, BIt->second))
+ return false;
+ }
+ return true;
+ }
+
+ /// Return a string for the value that \p BaseID represents.
+ std::string toString(unsigned BaseID) {
+ if (BaseID)
+ return Bases[BaseID].getVariableLocationOp(0)->getName().str();
+ else
+ return "None";
+ }
+
+  /// Format a string describing a FragsInMemMap (IntervalMap) interval.
+ std::string toString(FragsInMemMap::const_iterator It, bool Newline = true) {
+ std::string String;
+ std::stringstream S(String);
+ if (It.valid()) {
+ S << "[" << It.start() << ", " << It.stop()
+ << "): " << toString(It.value());
+ } else {
+ S << "invalid iterator (end)";
+ }
+ if (Newline)
+ S << "\n";
+ return S.str();
+  }
+
+ FragsInMemMap meetFragments(const FragsInMemMap &A, const FragsInMemMap &B) {
+ FragsInMemMap Result(IntervalMapAlloc);
+ for (auto AIt = A.begin(), AEnd = A.end(); AIt != AEnd; ++AIt) {
+ LLVM_DEBUG(dbgs() << "a " << toString(AIt));
+ // This is basically copied from process() and inverted (process is
+ // performing something like a union whereas this is more of an
+ // intersect).
+
+ // There's no work to do if interval `a` overlaps no fragments in map `B`.
+ if (!B.overlaps(AIt.start(), AIt.stop()))
+ continue;
+
+ // Does StartBit intersect an existing fragment?
+ auto FirstOverlap = B.find(AIt.start());
+ assert(FirstOverlap != B.end());
+ bool IntersectStart = FirstOverlap.start() < AIt.start();
+ LLVM_DEBUG(dbgs() << "- FirstOverlap " << toString(FirstOverlap, false)
+ << ", IntersectStart: " << IntersectStart << "\n");
+
+ // Does EndBit intersect an existing fragment?
+ auto LastOverlap = B.find(AIt.stop());
+ bool IntersectEnd =
+ LastOverlap != B.end() && LastOverlap.start() < AIt.stop();
+ LLVM_DEBUG(dbgs() << "- LastOverlap " << toString(LastOverlap, false)
+ << ", IntersectEnd: " << IntersectEnd << "\n");
+
+ // Check if both ends of `a` intersect the same interval `b`.
+ if (IntersectStart && IntersectEnd && FirstOverlap == LastOverlap) {
+ // Insert `a` (`a` is contained in `b`) if the values match.
+ // [ a ]
+ // [ - b - ]
+ // -
+ // [ r ]
+ LLVM_DEBUG(dbgs() << "- a is contained within "
+ << toString(FirstOverlap));
+ if (*AIt && *AIt == *FirstOverlap)
+ Result.insert(AIt.start(), AIt.stop(), *AIt);
+ } else {
+ // There's an overlap but `a` is not fully contained within
+ // `b`. Shorten any end-point intersections.
+ // [ - a - ]
+ // [ - b - ]
+ // -
+ // [ r ]
+ auto Next = FirstOverlap;
+ if (IntersectStart) {
+ LLVM_DEBUG(dbgs() << "- insert intersection of a and "
+ << toString(FirstOverlap));
+ if (*AIt && *AIt == *FirstOverlap)
+ Result.insert(AIt.start(), FirstOverlap.stop(), *AIt);
+ ++Next;
+ }
+ // [ - a - ]
+ // [ - b - ]
+ // -
+ // [ r ]
+ if (IntersectEnd) {
+ LLVM_DEBUG(dbgs() << "- insert intersection of a and "
+ << toString(LastOverlap));
+ if (*AIt && *AIt == *LastOverlap)
+ Result.insert(LastOverlap.start(), AIt.stop(), *AIt);
+ }
+
+ // Insert all intervals in map `B` that are contained within interval
+ // `a` where the values match.
+ // [ - - a - - ]
+ // [ b1 ] [ b2 ]
+ // -
+ // [ r1 ] [ r2 ]
+ while (Next != B.end() && Next.start() < AIt.stop() &&
+ Next.stop() <= AIt.stop()) {
+ LLVM_DEBUG(dbgs()
+ << "- insert intersection of a and " << toString(Next));
+ if (*AIt && *AIt == *Next)
+ Result.insert(Next.start(), Next.stop(), *Next);
+ ++Next;
+ }
+ }
+ }
+ return Result;
+ }
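+
+  // Worked example (illustrative only): with base IDs x and y,
+  //   A = { [0, 32) -> x, [32, 64) -> y }
+  //   B = { [0, 64) -> x }
+  // meetFragments(A, B) yields { [0, 32) -> x }: the [0, 32) slice agrees on
+  // base x in both maps, while [32, 64) maps to different bases and is
+  // therefore dropped from the result.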
+
+ /// Meet \p A and \p B, storing the result in \p A.
+ void meetVars(VarFragMap &A, const VarFragMap &B) {
+ // Meet A and B.
+ //
+ // Result = meet(a, b) for a in A, b in B where Var(a) == Var(b)
+ for (auto It = A.begin(), End = A.end(); It != End; ++It) {
+ unsigned AVar = It->first;
+ FragsInMemMap &AFrags = It->second;
+ auto BIt = B.find(AVar);
+ if (BIt == B.end()) {
+ A.erase(It);
+ continue; // Var has no bits defined in B.
+ }
+ LLVM_DEBUG(dbgs() << "meet fragment maps for "
+ << Aggregates[AVar].first->getName() << "\n");
+ AFrags = meetFragments(AFrags, BIt->second);
+ }
+ }
+
+ bool meet(const BasicBlock &BB,
+ const SmallPtrSet<BasicBlock *, 16> &Visited) {
+ LLVM_DEBUG(dbgs() << "meet block info from preds of " << BB.getName()
+ << "\n");
+
+ VarFragMap BBLiveIn;
+ bool FirstMeet = true;
+ // LiveIn locs for BB is the meet of the already-processed preds' LiveOut
+ // locs.
+ for (auto I = pred_begin(&BB), E = pred_end(&BB); I != E; I++) {
+ // Ignore preds that haven't been processed yet. This is essentially the
+ // same as initialising all variables to implicit top value (⊤) which is
+ // the identity value for the meet operation.
+ const BasicBlock *Pred = *I;
+ if (!Visited.count(Pred))
+ continue;
+
+ auto PredLiveOut = LiveOut.find(Pred);
+ assert(PredLiveOut != LiveOut.end());
+
+ if (FirstMeet) {
+ LLVM_DEBUG(dbgs() << "BBLiveIn = " << Pred->getName() << "\n");
+ BBLiveIn = PredLiveOut->second;
+ FirstMeet = false;
+ } else {
+ LLVM_DEBUG(dbgs() << "BBLiveIn = meet BBLiveIn, " << Pred->getName()
+ << "\n");
+ meetVars(BBLiveIn, PredLiveOut->second);
+ }
+
+ // An empty set is ⊥ for the intersect-like meet operation. If we've
+ // already got ⊥ there's no need to run the code - we know the result is
+ // ⊥ since `meet(a, ⊥) = ⊥`.
+ if (BBLiveIn.size() == 0)
+ break;
+ }
+
+ auto CurrentLiveInEntry = LiveIn.find(&BB);
+ // If there's no LiveIn entry for the block yet, add it.
+ if (CurrentLiveInEntry == LiveIn.end()) {
+ LLVM_DEBUG(dbgs() << "change=true (first) on meet on " << BB.getName()
+ << "\n");
+ LiveIn[&BB] = std::move(BBLiveIn);
+ return /*Changed=*/true;
+ }
+
+ // If the LiveIn set has changed (expensive check) update it and return
+ // true.
+ if (!varFragMapsAreEqual(BBLiveIn, CurrentLiveInEntry->second)) {
+ LLVM_DEBUG(dbgs() << "change=true on meet on " << BB.getName() << "\n");
+ CurrentLiveInEntry->second = std::move(BBLiveIn);
+ return /*Changed=*/true;
+ }
+
+ LLVM_DEBUG(dbgs() << "change=false on meet on " << BB.getName() << "\n");
+ return /*Changed=*/false;
+ }
+
+ void insertMemLoc(BasicBlock &BB, Instruction &Before, unsigned Var,
+ unsigned StartBit, unsigned EndBit, unsigned Base,
+ DebugLoc DL) {
+ assert(StartBit < EndBit && "Cannot create fragment of size <= 0");
+ if (!Base)
+ return;
+ FragMemLoc Loc;
+ Loc.Var = Var;
+ Loc.OffsetInBits = StartBit;
+ Loc.SizeInBits = EndBit - StartBit;
+ assert(Base && "Expected a non-zero ID for Base address");
+ Loc.Base = Base;
+ Loc.DL = DL;
+ BBInsertBeforeMap[&BB][&Before].push_back(Loc);
+ LLVM_DEBUG(dbgs() << "Add mem def for " << Aggregates[Var].first->getName()
+ << " bits [" << StartBit << ", " << EndBit << ")\n");
+ }
+
+ /// Inserts a new dbg def if the interval found when looking up \p StartBit
+ /// in \p FragMap starts before \p StartBit or ends after \p EndBit (which
+ /// indicates - assuming StartBit->EndBit has just been inserted - that the
+ /// slice has been coalesced in the map).
+ void coalesceFragments(BasicBlock &BB, Instruction &Before, unsigned Var,
+ unsigned StartBit, unsigned EndBit, unsigned Base,
+ DebugLoc DL, const FragsInMemMap &FragMap) {
+ if (!CoalesceAdjacentFragments)
+ return;
+ // We've inserted the location into the map. The map will have coalesced
+ // adjacent intervals (variable fragments) that describe the same memory
+ // location. Use this knowledge to insert a debug location that describes
+ // that coalesced fragment. This may eclipse other locs we've just
+ // inserted. This is okay as redundant locs will be cleaned up later.
+ auto CoalescedFrag = FragMap.find(StartBit);
+ // Bail if no coalescing has taken place.
+ if (CoalescedFrag.start() == StartBit && CoalescedFrag.stop() == EndBit)
+ return;
+
+ LLVM_DEBUG(dbgs() << "- Insert loc for bits " << CoalescedFrag.start()
+ << " to " << CoalescedFrag.stop() << "\n");
+ insertMemLoc(BB, Before, Var, CoalescedFrag.start(), CoalescedFrag.stop(),
+ Base, DL);
+ }
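+
+  // Worked example (illustrative only): if FragMap already contained
+  // [0, 32) -> base1 and [32, 64) -> base1 has just been inserted, IntervalMap
+  // coalesces the two into [0, 64) -> base1. find(StartBit = 32) then returns
+  // that wider interval, so an additional def covering bits [0, 64) is
+  // emitted.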
+
+ void addDef(const VarLocInfo &VarLoc, Instruction &Before, BasicBlock &BB,
+ VarFragMap &LiveSet) {
+ DebugVariable DbgVar = FnVarLocs->getVariable(VarLoc.VariableID);
+ if (skipVariable(DbgVar.getVariable()))
+ return;
+    // Don't bother doing anything for this variable if we know it's fully
+ // promoted. We're only interested in variables that (sometimes) live on
+ // the stack here.
+ if (!VarsWithStackSlot->count(getAggregate(DbgVar)))
+ return;
+ unsigned Var = Aggregates.insert(
+ DebugAggregate(DbgVar.getVariable(), VarLoc.DL.getInlinedAt()));
+
+ // [StartBit: EndBit) are the bits affected by this def.
+ const DIExpression *DIExpr = VarLoc.Expr;
+ unsigned StartBit;
+ unsigned EndBit;
+ if (auto Frag = DIExpr->getFragmentInfo()) {
+ StartBit = Frag->OffsetInBits;
+ EndBit = StartBit + Frag->SizeInBits;
+ } else {
+ assert(static_cast<bool>(DbgVar.getVariable()->getSizeInBits()));
+ StartBit = 0;
+ EndBit = *DbgVar.getVariable()->getSizeInBits();
+ }
+
+ // We will only fill fragments for simple memory-describing dbg.value
+ // intrinsics. If the fragment offset is the same as the offset from the
+ // base pointer, do The Thing, otherwise fall back to normal dbg.value
+ // behaviour. AssignmentTrackingLowering has generated DIExpressions
+ // written in terms of the base pointer.
+ // TODO: Remove this condition since the fragment offset doesn't always
+ // equal the offset from base pointer (e.g. for a SROA-split variable).
+ const auto DerefOffsetInBytes = getDerefOffsetInBytes(DIExpr);
+ const unsigned Base =
+ DerefOffsetInBytes && *DerefOffsetInBytes * 8 == StartBit
+ ? Bases.insert(VarLoc.Values)
+ : 0;
+ LLVM_DEBUG(dbgs() << "DEF " << DbgVar.getVariable()->getName() << " ["
+ << StartBit << ", " << EndBit << "): " << toString(Base)
+ << "\n");
+
+ // First of all, any locs that use mem that are disrupted need reinstating.
+ // Unfortunately, IntervalMap doesn't let us insert intervals that overlap
+ // with existing intervals so this code involves a lot of fiddling around
+ // with intervals to do that manually.
+ auto FragIt = LiveSet.find(Var);
+
+ // Check if the variable does not exist in the map.
+ if (FragIt == LiveSet.end()) {
+ // Add this variable to the BB map.
+ auto P = LiveSet.try_emplace(Var, FragsInMemMap(IntervalMapAlloc));
+ assert(P.second && "Var already in map?");
+ // Add the interval to the fragment map.
+ P.first->second.insert(StartBit, EndBit, Base);
+ return;
+ }
+ // The variable has an entry in the map.
+
+ FragsInMemMap &FragMap = FragIt->second;
+ // First check the easy case: the new fragment `f` doesn't overlap with any
+ // intervals.
+ if (!FragMap.overlaps(StartBit, EndBit)) {
+ LLVM_DEBUG(dbgs() << "- No overlaps\n");
+ FragMap.insert(StartBit, EndBit, Base);
+ coalesceFragments(BB, Before, Var, StartBit, EndBit, Base, VarLoc.DL,
+ FragMap);
+ return;
+ }
+ // There is at least one overlap.
+
+ // Does StartBit intersect an existing fragment?
+ auto FirstOverlap = FragMap.find(StartBit);
+ assert(FirstOverlap != FragMap.end());
+ bool IntersectStart = FirstOverlap.start() < StartBit;
+
+ // Does EndBit intersect an existing fragment?
+ auto LastOverlap = FragMap.find(EndBit);
+ bool IntersectEnd = LastOverlap.valid() && LastOverlap.start() < EndBit;
+
+ // Check if both ends of `f` intersect the same interval `i`.
+ if (IntersectStart && IntersectEnd && FirstOverlap == LastOverlap) {
+ LLVM_DEBUG(dbgs() << "- Intersect single interval @ both ends\n");
+ // Shorten `i` so that there's space to insert `f`.
+ // [ f ]
+ // [ - i - ]
+ // +
+ // [ i ][ f ][ i ]
+
+ // Save values for use after inserting a new interval.
+ auto EndBitOfOverlap = FirstOverlap.stop();
+ unsigned OverlapValue = FirstOverlap.value();
+
+ // Shorten the overlapping interval.
+ FirstOverlap.setStop(StartBit);
+ insertMemLoc(BB, Before, Var, FirstOverlap.start(), StartBit,
+ OverlapValue, VarLoc.DL);
+
+ // Insert a new interval to represent the end part.
+ FragMap.insert(EndBit, EndBitOfOverlap, OverlapValue);
+ insertMemLoc(BB, Before, Var, EndBit, EndBitOfOverlap, OverlapValue,
+ VarLoc.DL);
+
+ // Insert the new (middle) fragment now there is space.
+ FragMap.insert(StartBit, EndBit, Base);
+ } else {
+ // There's an overlap but `f` may not be fully contained within
+ // `i`. Shorten any end-point intersections so that we can then
+ // insert `f`.
+ // [ - f - ]
+ // [ - i - ]
+ // | |
+ // [ i ]
+ // Shorten any end-point intersections.
+ if (IntersectStart) {
+ LLVM_DEBUG(dbgs() << "- Intersect interval at start\n");
+ // Split off at the intersection.
+ FirstOverlap.setStop(StartBit);
+ insertMemLoc(BB, Before, Var, FirstOverlap.start(), StartBit,
+ *FirstOverlap, VarLoc.DL);
+ }
+ // [ - f - ]
+ // [ - i - ]
+ // | |
+ // [ i ]
+ if (IntersectEnd) {
+ LLVM_DEBUG(dbgs() << "- Intersect interval at end\n");
+ // Split off at the intersection.
+ LastOverlap.setStart(EndBit);
+ insertMemLoc(BB, Before, Var, EndBit, LastOverlap.stop(), *LastOverlap,
+ VarLoc.DL);
+ }
+
+ LLVM_DEBUG(dbgs() << "- Erase intervals contained within\n");
+ // FirstOverlap and LastOverlap have been shortened such that they're
+ // no longer overlapping with [StartBit, EndBit). Delete any overlaps
+ // that remain (these will be fully contained within `f`).
+ // [ - f - ] }
+ // [ - i - ] } Intersection shortening that has happened above.
+ // | | }
+ // [ i ] }
+ // -----------------
+ // [i2 ] } Intervals fully contained within `f` get erased.
+ // -----------------
+ // [ - f - ][ i ] } Completed insertion.
+ auto It = FirstOverlap;
+ if (IntersectStart)
+ ++It; // IntersectStart: first overlap has been shortened.
+ while (It.valid() && It.start() >= StartBit && It.stop() <= EndBit) {
+ LLVM_DEBUG(dbgs() << "- Erase " << toString(It));
+ It.erase(); // This increments It after removing the interval.
+ }
+ // We've dealt with all the overlaps now!
+ assert(!FragMap.overlaps(StartBit, EndBit));
+ LLVM_DEBUG(dbgs() << "- Insert DEF into now-empty space\n");
+ FragMap.insert(StartBit, EndBit, Base);
+ }
+
+ coalesceFragments(BB, Before, Var, StartBit, EndBit, Base, VarLoc.DL,
+ FragMap);
+ }
+
+ bool skipVariable(const DILocalVariable *V) { return !V->getSizeInBits(); }
+
+ void process(BasicBlock &BB, VarFragMap &LiveSet) {
+ BBInsertBeforeMap[&BB].clear();
+ for (auto &I : BB) {
+ if (const auto *Locs = FnVarLocs->getWedge(&I)) {
+ for (const VarLocInfo &Loc : *Locs) {
+ addDef(Loc, I, *I.getParent(), LiveSet);
+ }
+ }
+ }
+ }
+
+public:
+ MemLocFragmentFill(Function &Fn,
+ const DenseSet<DebugAggregate> *VarsWithStackSlot,
+ bool CoalesceAdjacentFragments)
+ : Fn(Fn), VarsWithStackSlot(VarsWithStackSlot),
+ CoalesceAdjacentFragments(CoalesceAdjacentFragments) {}
+
+ /// Add variable locations to \p FnVarLocs so that any bits of a variable
+ /// with a memory location have that location explicitly reinstated at each
+  /// subsequent variable location definition that doesn't overwrite those
+ /// bits. i.e. after a variable location def, insert new defs for the memory
+ /// location with fragments for the difference of "all bits currently in
+ /// memory" and "the fragment of the second def". e.g.
+ ///
+ /// Before:
+ ///
+ /// var x bits 0 to 63: value in memory
+ /// more instructions
+ /// var x bits 0 to 31: value is %0
+ ///
+ /// After:
+ ///
+ /// var x bits 0 to 63: value in memory
+ /// more instructions
+ /// var x bits 0 to 31: value is %0
+  ///    var x bits 32 to 63: value in memory ; <-- new loc def
+ ///
+ void run(FunctionVarLocsBuilder *FnVarLocs) {
+ if (!EnableMemLocFragFill)
+ return;
+
+ this->FnVarLocs = FnVarLocs;
+
+ // Prepare for traversal.
+ //
+ ReversePostOrderTraversal<Function *> RPOT(&Fn);
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Worklist;
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Pending;
+ DenseMap<unsigned int, BasicBlock *> OrderToBB;
+ DenseMap<BasicBlock *, unsigned int> BBToOrder;
+ { // Init OrderToBB and BBToOrder.
+ unsigned int RPONumber = 0;
+ for (auto RI = RPOT.begin(), RE = RPOT.end(); RI != RE; ++RI) {
+ OrderToBB[RPONumber] = *RI;
+ BBToOrder[*RI] = RPONumber;
+ Worklist.push(RPONumber);
+ ++RPONumber;
+ }
+ LiveIn.init(RPONumber);
+ LiveOut.init(RPONumber);
+ }
+
+ // Perform the traversal.
+ //
+ // This is a standard "intersect of predecessor outs" dataflow problem. To
+ // solve it, we perform meet() and process() using the two worklist method
+ // until the LiveIn data for each block becomes unchanging.
+ //
+ // This dataflow is essentially working on maps of sets and at each meet we
+ // intersect the maps and the mapped sets. So, initialized live-in maps
+ // monotonically decrease in value throughout the dataflow.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ while (!Worklist.empty() || !Pending.empty()) {
+ // We track what is on the pending worklist to avoid inserting the same
+ // thing twice. We could avoid this with a custom priority queue, but
+ // this is probably not worth it.
+ SmallPtrSet<BasicBlock *, 16> OnPending;
+ LLVM_DEBUG(dbgs() << "Processing Worklist\n");
+ while (!Worklist.empty()) {
+ BasicBlock *BB = OrderToBB[Worklist.top()];
+ LLVM_DEBUG(dbgs() << "\nPop BB " << BB->getName() << "\n");
+ Worklist.pop();
+ bool InChanged = meet(*BB, Visited);
+ // Always consider LiveIn changed on the first visit.
+ InChanged |= Visited.insert(BB).second;
+ if (InChanged) {
+ LLVM_DEBUG(dbgs()
+ << BB->getName() << " has new InLocs, process it\n");
+ // Mutate a copy of LiveIn while processing BB. Once we've processed
+ // the terminator LiveSet is the LiveOut set for BB.
+ // This is an expensive copy!
+ VarFragMap LiveSet = LiveIn[BB];
+
+ // Process the instructions in the block.
+ process(*BB, LiveSet);
+
+ // Relatively expensive check: has anything changed in LiveOut for BB?
+ if (!varFragMapsAreEqual(LiveOut[BB], LiveSet)) {
+ LLVM_DEBUG(dbgs() << BB->getName()
+ << " has new OutLocs, add succs to worklist: [ ");
+ LiveOut[BB] = std::move(LiveSet);
+ for (auto I = succ_begin(BB), E = succ_end(BB); I != E; I++) {
+ if (OnPending.insert(*I).second) {
+ LLVM_DEBUG(dbgs() << I->getName() << " ");
+ Pending.push(BBToOrder[*I]);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "]\n");
+ }
+ }
+ }
+ Worklist.swap(Pending);
+      // At this point, Pending must be empty, since it was just swapped with
+      // the (now empty) Worklist.
+ assert(Pending.empty() && "Pending should be empty");
+ }
+
+ // Insert new location defs.
+ for (auto &Pair : BBInsertBeforeMap) {
+ InsertMap &Map = Pair.second;
+ for (auto &Pair : Map) {
+ Instruction *InsertBefore = Pair.first;
+ assert(InsertBefore && "should never be null");
+ auto FragMemLocs = Pair.second;
+ auto &Ctx = Fn.getContext();
+
+ for (auto &FragMemLoc : FragMemLocs) {
+ DIExpression *Expr = DIExpression::get(Ctx, std::nullopt);
+ if (FragMemLoc.SizeInBits !=
+ *Aggregates[FragMemLoc.Var].first->getSizeInBits())
+ Expr = *DIExpression::createFragmentExpression(
+ Expr, FragMemLoc.OffsetInBits, FragMemLoc.SizeInBits);
+ Expr = DIExpression::prepend(Expr, DIExpression::DerefAfter,
+ FragMemLoc.OffsetInBits / 8);
+ DebugVariable Var(Aggregates[FragMemLoc.Var].first, Expr,
+ FragMemLoc.DL.getInlinedAt());
+ FnVarLocs->addVarLoc(InsertBefore, Var, Expr, FragMemLoc.DL,
+ Bases[FragMemLoc.Base]);
+ }
+ }
+ }
+ }
+};
+
+/// AssignmentTrackingLowering encapsulates a dataflow analysis over a function
+/// that interprets assignment tracking debug info metadata and stores in IR to
+/// create a map of variable locations.
+class AssignmentTrackingLowering {
+public:
+ /// The kind of location in use for a variable, where Mem is the stack home,
+ /// Val is an SSA value or const, and None means that there is not one single
+ /// kind (either because there are multiple or because there is none; it may
+ /// prove useful to split this into two values in the future).
+ ///
+ /// LocKind is a join-semilattice with the partial order:
+ /// None > Mem, Val
+ ///
+ /// i.e.
+ /// join(Mem, Mem) = Mem
+ /// join(Val, Val) = Val
+ /// join(Mem, Val) = None
+ /// join(None, Mem) = None
+ /// join(None, Val) = None
+ /// join(None, None) = None
+ ///
+ /// Note: the order is not `None > Val > Mem` because we're using DIAssignID
+ /// to name assignments and are not tracking the actual stored values.
+ /// Therefore currently there's no way to ensure that Mem values and Val
+ /// values are the same. This could be a future extension, though it's not
+ /// clear that many additional locations would be recovered that way in
+  /// practice as the likelihood of this situation arising naturally seems
+ /// incredibly low.
+ enum class LocKind { Mem, Val, None };
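+
+  // Illustrative sketch (an assumption, not the upstream definition): a join
+  // consistent with the table above keeps a kind only when both operands
+  // agree and collapses any disagreement to None:
+  //
+  //   static LocKind joinKind(LocKind A, LocKind B) {
+  //     return A == B ? A : LocKind::None;
+  //   }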
+
+ /// An abstraction of the assignment of a value to a variable or memory
+ /// location.
+ ///
+ /// An Assignment is Known or NoneOrPhi. A Known Assignment means we have a
+ /// DIAssignID ptr that represents it. NoneOrPhi means that we don't (or
+ /// can't) know the ID of the last assignment that took place.
+ ///
+ /// The Status of the Assignment (Known or NoneOrPhi) is another
+ /// join-semilattice. The partial order is:
+ /// NoneOrPhi > Known {id_0, id_1, ...id_N}
+ ///
+ /// i.e. for all values x and y where x != y:
+ /// join(x, x) = x
+ /// join(x, y) = NoneOrPhi
+ struct Assignment {
+ enum S { Known, NoneOrPhi } Status;
+ /// ID of the assignment. nullptr if Status is not Known.
+ DIAssignID *ID;
+ /// The dbg.assign that marks this dbg-def. Mem-defs don't use this field.
+ /// May be nullptr.
+ DbgAssignIntrinsic *Source;
+
+ bool isSameSourceAssignment(const Assignment &Other) const {
+ // Don't include Source in the equality check. Assignments are
+ // defined by their ID, not debug intrinsic(s).
+ return std::tie(Status, ID) == std::tie(Other.Status, Other.ID);
+ }
+ void dump(raw_ostream &OS) {
+ static const char *LUT[] = {"Known", "NoneOrPhi"};
+ OS << LUT[Status] << "(id=";
+ if (ID)
+ OS << ID;
+ else
+ OS << "null";
+ OS << ", s=";
+ if (Source)
+ OS << *Source;
+ else
+ OS << "null";
+ OS << ")";
+ }
+
+ static Assignment make(DIAssignID *ID, DbgAssignIntrinsic *Source) {
+ return Assignment(Known, ID, Source);
+ }
+ static Assignment makeFromMemDef(DIAssignID *ID) {
+ return Assignment(Known, ID, nullptr);
+ }
+ static Assignment makeNoneOrPhi() {
+ return Assignment(NoneOrPhi, nullptr, nullptr);
+ }
+ // Again, need a Top value?
+ Assignment()
+ : Status(NoneOrPhi), ID(nullptr), Source(nullptr) {
+ } // Can we delete this?
+ Assignment(S Status, DIAssignID *ID, DbgAssignIntrinsic *Source)
+ : Status(Status), ID(ID), Source(Source) {
+ // If the Status is Known then we expect there to be an assignment ID.
+ assert(Status == NoneOrPhi || ID);
+ }
+ };
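+
+  // Illustrative sketch (an assumption, not the upstream definition): a join
+  // on Assignment consistent with the partial order above keeps an assignment
+  // only when both sides agree on it:
+  //
+  //   static Assignment joinAssignment(const Assignment &A,
+  //                                    const Assignment &B) {
+  //     if (A.Status == Assignment::NoneOrPhi || !A.isSameSourceAssignment(B))
+  //       return Assignment::makeNoneOrPhi();
+  //     return A; // Same ID; which Source to keep is a policy detail.
+  //   }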
+
+ using AssignmentMap = SmallVector<Assignment>;
+ using LocMap = SmallVector<LocKind>;
+ using OverlapMap = DenseMap<VariableID, SmallVector<VariableID>>;
+ using UntaggedStoreAssignmentMap =
+ DenseMap<const Instruction *,
+ SmallVector<std::pair<VariableID, at::AssignmentInfo>>>;
+
+private:
+ /// The highest numbered VariableID for partially promoted variables plus 1,
+ /// the values for which start at 1.
+ unsigned TrackedVariablesVectorSize = 0;
+ /// Map a variable to the set of variables that it fully contains.
+ OverlapMap VarContains;
+ /// Map untagged stores to the variable fragments they assign to. Used by
+ /// processUntaggedInstruction.
+ UntaggedStoreAssignmentMap UntaggedStoreVars;
+
+ // Machinery to defer inserting dbg.values.
+ using InsertMap = MapVector<Instruction *, SmallVector<VarLocInfo>>;
+ InsertMap InsertBeforeMap;
+  /// Clear the location definitions currently cached for insertion after \p
+ /// After.
+ void resetInsertionPoint(Instruction &After);
+ void emitDbgValue(LocKind Kind, const DbgVariableIntrinsic *Source,
+ Instruction *After);
+
+ static bool mapsAreEqual(const BitVector &Mask, const AssignmentMap &A,
+ const AssignmentMap &B) {
+ return llvm::all_of(Mask.set_bits(), [&](unsigned VarID) {
+ return A[VarID].isSameSourceAssignment(B[VarID]);
+ });
+ }
+
+ /// Represents the stack and debug assignments in a block. Used to describe
+ /// the live-in and live-out values for blocks, as well as the "current"
+ /// value as we process each instruction in a block.
+ struct BlockInfo {
+ /// The set of variables (VariableID) being tracked in this block.
+ BitVector VariableIDsInBlock;
+ /// Dominating assignment to memory for each variable, indexed by
+ /// VariableID.
+ AssignmentMap StackHomeValue;
+    /// Dominating assignment to each variable, indexed by VariableID.
+ AssignmentMap DebugValue;
+ /// Location kind for each variable. LiveLoc indicates whether the
+ /// dominating assignment in StackHomeValue (LocKind::Mem), DebugValue
+ /// (LocKind::Val), or neither (LocKind::None) is valid, in that order of
+ /// preference. This cannot be derived by inspecting DebugValue and
+ /// StackHomeValue due to the fact that there's no distinction in
+ /// Assignment (the class) between whether an assignment is unknown or a
+ /// merge of multiple assignments (both are Status::NoneOrPhi). In other
+ /// words, the memory location may well be valid while both DebugValue and
+ /// StackHomeValue contain Assignments that have a Status of NoneOrPhi.
+ /// Indexed by VariableID.
+ LocMap LiveLoc;
+
+ public:
+ enum AssignmentKind { Stack, Debug };
+ const AssignmentMap &getAssignmentMap(AssignmentKind Kind) const {
+ switch (Kind) {
+ case Stack:
+ return StackHomeValue;
+ case Debug:
+ return DebugValue;
+ }
+ llvm_unreachable("Unknown AssignmentKind");
+ }
+ AssignmentMap &getAssignmentMap(AssignmentKind Kind) {
+ return const_cast<AssignmentMap &>(
+ const_cast<const BlockInfo *>(this)->getAssignmentMap(Kind));
+ }
+
+ bool isVariableTracked(VariableID Var) const {
+ return VariableIDsInBlock[static_cast<unsigned>(Var)];
+ }
+
+ const Assignment &getAssignment(AssignmentKind Kind, VariableID Var) const {
+ assert(isVariableTracked(Var) && "Var not tracked in block");
+ return getAssignmentMap(Kind)[static_cast<unsigned>(Var)];
+ }
+
+ LocKind getLocKind(VariableID Var) const {
+ assert(isVariableTracked(Var) && "Var not tracked in block");
+ return LiveLoc[static_cast<unsigned>(Var)];
+ }
+
+ /// Set LocKind for \p Var only: does not set LocKind for VariableIDs of
+    /// fragments contained within \p Var.
+ void setLocKind(VariableID Var, LocKind K) {
+ VariableIDsInBlock.set(static_cast<unsigned>(Var));
+ LiveLoc[static_cast<unsigned>(Var)] = K;
+ }
+
+ /// Set the assignment in the \p Kind assignment map for \p Var only: does
+    /// not set the assignment for VariableIDs of fragments contained within
+    /// \p Var.
+ void setAssignment(AssignmentKind Kind, VariableID Var,
+ const Assignment &AV) {
+ VariableIDsInBlock.set(static_cast<unsigned>(Var));
+ getAssignmentMap(Kind)[static_cast<unsigned>(Var)] = AV;
+ }
+
+ /// Return true if there is an assignment matching \p AV in the \p Kind
+    /// assignment map. Does not consider assignments for VariableIDs of
+    /// fragments contained within \p Var.
+ bool hasAssignment(AssignmentKind Kind, VariableID Var,
+ const Assignment &AV) const {
+ if (!isVariableTracked(Var))
+ return false;
+ return AV.isSameSourceAssignment(getAssignment(Kind, Var));
+ }
+
+ /// Compare every element in each map to determine structural equality
+ /// (slow).
+ bool operator==(const BlockInfo &Other) const {
+ return VariableIDsInBlock == Other.VariableIDsInBlock &&
+ LiveLoc == Other.LiveLoc &&
+ mapsAreEqual(VariableIDsInBlock, StackHomeValue,
+ Other.StackHomeValue) &&
+ mapsAreEqual(VariableIDsInBlock, DebugValue, Other.DebugValue);
+ }
+ bool operator!=(const BlockInfo &Other) const { return !(*this == Other); }
+ bool isValid() {
+ return LiveLoc.size() == DebugValue.size() &&
+ LiveLoc.size() == StackHomeValue.size();
+ }
+
+ /// Clear everything and initialise with ⊤-values for all variables.
+ void init(int NumVars) {
+ StackHomeValue.clear();
+ DebugValue.clear();
+ LiveLoc.clear();
+ VariableIDsInBlock = BitVector(NumVars);
+ StackHomeValue.insert(StackHomeValue.begin(), NumVars,
+ Assignment::makeNoneOrPhi());
+ DebugValue.insert(DebugValue.begin(), NumVars,
+ Assignment::makeNoneOrPhi());
+ LiveLoc.insert(LiveLoc.begin(), NumVars, LocKind::None);
+ }
+
+ /// Helper for join.
+ template <typename ElmtType, typename FnInputType>
+ static void joinElmt(int Index, SmallVector<ElmtType> &Target,
+ const SmallVector<ElmtType> &A,
+ const SmallVector<ElmtType> &B,
+ ElmtType (*Fn)(FnInputType, FnInputType)) {
+ Target[Index] = Fn(A[Index], B[Index]);
+ }
+
+ /// See comment for AssignmentTrackingLowering::joinBlockInfo.
+ static BlockInfo join(const BlockInfo &A, const BlockInfo &B, int NumVars) {
+ // Join A and B.
+ //
+ // Intersect = join(a, b) for a in A, b in B where Var(a) == Var(b)
+ // Difference = join(x, ⊤) for x where Var(x) is in A xor B
+ // Join = Intersect ∪ Difference
+ //
+ // This is achieved by performing a join on elements from A and B with
+ // variables common to both A and B (join elements indexed by var
+ // intersect), then adding ⊤-value elements for vars in A xor B. The
+ // latter part is equivalent to performing join on elements with variables
+ // in A xor B with the ⊤-value for the map element since join(x, ⊤) = ⊤.
+ // BlockInfo::init initializes all variable entries to the ⊤ value so we
+ // don't need to explicitly perform that step as Join.VariableIDsInBlock
+ // is set to the union of the variables in A and B at the end of this
+ // function.
+ BlockInfo Join;
+ Join.init(NumVars);
+
+ BitVector Intersect = A.VariableIDsInBlock;
+ Intersect &= B.VariableIDsInBlock;
+
+ for (auto VarID : Intersect.set_bits()) {
+ joinElmt(VarID, Join.LiveLoc, A.LiveLoc, B.LiveLoc, joinKind);
+ joinElmt(VarID, Join.DebugValue, A.DebugValue, B.DebugValue,
+ joinAssignment);
+ joinElmt(VarID, Join.StackHomeValue, A.StackHomeValue, B.StackHomeValue,
+ joinAssignment);
+ }
+
+ Join.VariableIDsInBlock = A.VariableIDsInBlock;
+ Join.VariableIDsInBlock |= B.VariableIDsInBlock;
+ assert(Join.isValid());
+ return Join;
+ }
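+
+    // Worked example (illustrative only): if A tracks {x: Mem, Known id1} and
+    // B tracks {x: Mem, Known id1; y: Val, Known id2}, then join(A, B, N)
+    // keeps x as {Mem, Known id1} (both operands agree) and leaves y at the
+    // top value (LocKind::None / NoneOrPhi) because it is only tracked on one
+    // path, while VariableIDsInBlock becomes the union {x, y}.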
+ };
+
+ Function &Fn;
+ const DataLayout &Layout;
+ const DenseSet<DebugAggregate> *VarsWithStackSlot;
+ FunctionVarLocsBuilder *FnVarLocs;
+ DenseMap<const BasicBlock *, BlockInfo> LiveIn;
+ DenseMap<const BasicBlock *, BlockInfo> LiveOut;
+
+ /// Helper for process methods to track variables touched each frame.
+ DenseSet<VariableID> VarsTouchedThisFrame;
+
+ /// The set of variables that sometimes are not located in their stack home.
+ DenseSet<DebugAggregate> NotAlwaysStackHomed;
+
+ VariableID getVariableID(const DebugVariable &Var) {
+ return static_cast<VariableID>(FnVarLocs->insertVariable(Var));
+ }
+
+ /// Join the LiveOut values of preds that are contained in \p Visited into
+ /// LiveIn[BB]. Return True if LiveIn[BB] has changed as a result. LiveIn[BB]
+ /// values monotonically increase. See the @link joinMethods join methods
+ /// @endlink documentation for more info.
+ bool join(const BasicBlock &BB, const SmallPtrSet<BasicBlock *, 16> &Visited);
+ ///@name joinMethods
+ /// Functions that implement `join` (the least upper bound) for the
+ /// join-semilattice types used in the dataflow. There is an explicit bottom
+  /// value (⊥) for some types and an explicit top value (⊤) for all types.
+ /// By definition:
+ ///
+ /// Join(A, B) >= A && Join(A, B) >= B
+ /// Join(A, ⊥) = A
+ /// Join(A, ⊤) = ⊤
+ ///
+ /// These invariants are important for monotonicity.
+ ///
+ /// For the map-type functions, all unmapped keys in an empty map are
+ /// associated with a bottom value (⊥). This represents their values being
+ /// unknown. Unmapped keys in non-empty maps (joining two maps with a key
+ /// only present in one) represents either a variable going out of scope or
+ /// dropped debug info. It is assumed the key is associated with a top value
+ /// (⊤) in this case (unknown location / assignment).
+ ///@{
+ static LocKind joinKind(LocKind A, LocKind B);
+ static Assignment joinAssignment(const Assignment &A, const Assignment &B);
+ BlockInfo joinBlockInfo(const BlockInfo &A, const BlockInfo &B);
+ ///@}
+
+ /// Process the instructions in \p BB updating \p LiveSet along the way. \p
+ /// LiveSet must be initialized with the current live-in locations before
+ /// calling this.
+ void process(BasicBlock &BB, BlockInfo *LiveSet);
+ ///@name processMethods
+ /// Methods to process instructions in order to update the LiveSet (current
+ /// location information).
+ ///@{
+ void processNonDbgInstruction(Instruction &I, BlockInfo *LiveSet);
+ void processDbgInstruction(DbgInfoIntrinsic &I, BlockInfo *LiveSet);
+ /// Update \p LiveSet after encountering an instruction with a DIAssignID
+ /// attachment, \p I.
+ void processTaggedInstruction(Instruction &I, BlockInfo *LiveSet);
+  /// Update \p LiveSet after encountering an instruction without a DIAssignID
+ /// attachment, \p I.
+ void processUntaggedInstruction(Instruction &I, BlockInfo *LiveSet);
+ void processDbgAssign(DbgAssignIntrinsic &DAI, BlockInfo *LiveSet);
+ void processDbgValue(DbgValueInst &DVI, BlockInfo *LiveSet);
+  /// Add an assignment to memory for the variable \p Var.
+ void addMemDef(BlockInfo *LiveSet, VariableID Var, const Assignment &AV);
+  /// Add an assignment to the variable \p Var.
+ void addDbgDef(BlockInfo *LiveSet, VariableID Var, const Assignment &AV);
+ ///@}
+
+ /// Set the LocKind for \p Var.
+ void setLocKind(BlockInfo *LiveSet, VariableID Var, LocKind K);
+ /// Get the live LocKind for a \p Var. Requires addMemDef or addDbgDef to
+ /// have been called for \p Var first.
+ LocKind getLocKind(BlockInfo *LiveSet, VariableID Var);
+ /// Return true if \p Var has an assignment in \p M matching \p AV.
+ bool hasVarWithAssignment(BlockInfo *LiveSet, BlockInfo::AssignmentKind Kind,
+ VariableID Var, const Assignment &AV);
+  /// Return the set of VariableIDs corresponding to the fragments contained fully
+ /// within the variable/fragment \p Var.
+ ArrayRef<VariableID> getContainedFragments(VariableID Var) const;
+
+ /// Mark \p Var as having been touched this frame. Note, this applies only
+ /// to the exact fragment \p Var and not to any fragments contained within.
+ void touchFragment(VariableID Var);
+
+ /// Emit info for variables that are fully promoted.
+ bool emitPromotedVarLocs(FunctionVarLocsBuilder *FnVarLocs);
+
+public:
+ AssignmentTrackingLowering(Function &Fn, const DataLayout &Layout,
+ const DenseSet<DebugAggregate> *VarsWithStackSlot)
+ : Fn(Fn), Layout(Layout), VarsWithStackSlot(VarsWithStackSlot) {}
+ /// Run the analysis, adding variable location info to \p FnVarLocs. Returns
+ /// true if any variable locations have been added to FnVarLocs.
+ bool run(FunctionVarLocsBuilder *FnVarLocs);
+};
+} // namespace
+
+ArrayRef<VariableID>
+AssignmentTrackingLowering::getContainedFragments(VariableID Var) const {
+ auto R = VarContains.find(Var);
+ if (R == VarContains.end())
+ return std::nullopt;
+ return R->second;
+}
+
+void AssignmentTrackingLowering::touchFragment(VariableID Var) {
+ VarsTouchedThisFrame.insert(Var);
+}
+
+void AssignmentTrackingLowering::setLocKind(BlockInfo *LiveSet, VariableID Var,
+ LocKind K) {
+ auto SetKind = [this](BlockInfo *LiveSet, VariableID Var, LocKind K) {
+ LiveSet->setLocKind(Var, K);
+ touchFragment(Var);
+ };
+ SetKind(LiveSet, Var, K);
+
+ // Update the LocKind for all fragments contained within Var.
+ for (VariableID Frag : getContainedFragments(Var))
+ SetKind(LiveSet, Frag, K);
+}
+
+AssignmentTrackingLowering::LocKind
+AssignmentTrackingLowering::getLocKind(BlockInfo *LiveSet, VariableID Var) {
+ return LiveSet->getLocKind(Var);
+}
+
+void AssignmentTrackingLowering::addMemDef(BlockInfo *LiveSet, VariableID Var,
+ const Assignment &AV) {
+ LiveSet->setAssignment(BlockInfo::Stack, Var, AV);
+
+  // Use this assignment for all fragments contained within Var, but do not
+ // provide a Source because we cannot convert Var's value to a value for the
+ // fragment.
+ Assignment FragAV = AV;
+ FragAV.Source = nullptr;
+ for (VariableID Frag : getContainedFragments(Var))
+ LiveSet->setAssignment(BlockInfo::Stack, Frag, FragAV);
+}
+
+void AssignmentTrackingLowering::addDbgDef(BlockInfo *LiveSet, VariableID Var,
+ const Assignment &AV) {
+ LiveSet->setAssignment(BlockInfo::Debug, Var, AV);
+
+  // Use this assignment for all fragments contained within Var, but do not
+ // provide a Source because we cannot convert Var's value to a value for the
+ // fragment.
+ Assignment FragAV = AV;
+ FragAV.Source = nullptr;
+ for (VariableID Frag : getContainedFragments(Var))
+ LiveSet->setAssignment(BlockInfo::Debug, Frag, FragAV);
+}
+
+static DIAssignID *getIDFromInst(const Instruction &I) {
+ return cast<DIAssignID>(I.getMetadata(LLVMContext::MD_DIAssignID));
+}
+
+static DIAssignID *getIDFromMarker(const DbgAssignIntrinsic &DAI) {
+ return cast<DIAssignID>(DAI.getAssignID());
+}
+
+/// Return true if \p Var has an assignment in \p M matching \p AV.
+bool AssignmentTrackingLowering::hasVarWithAssignment(
+ BlockInfo *LiveSet, BlockInfo::AssignmentKind Kind, VariableID Var,
+ const Assignment &AV) {
+ if (!LiveSet->hasAssignment(Kind, Var, AV))
+ return false;
+
+ // Check all the frags contained within Var as these will have all been
+ // mapped to AV at the last store to Var.
+ for (VariableID Frag : getContainedFragments(Var))
+ if (!LiveSet->hasAssignment(Kind, Frag, AV))
+ return false;
+ return true;
+}
+
+#ifndef NDEBUG
+const char *locStr(AssignmentTrackingLowering::LocKind Loc) {
+ using LocKind = AssignmentTrackingLowering::LocKind;
+ switch (Loc) {
+ case LocKind::Val:
+ return "Val";
+ case LocKind::Mem:
+ return "Mem";
+ case LocKind::None:
+ return "None";
+ };
+ llvm_unreachable("unknown LocKind");
+}
+#endif
+
+void AssignmentTrackingLowering::emitDbgValue(
+ AssignmentTrackingLowering::LocKind Kind,
+ const DbgVariableIntrinsic *Source, Instruction *After) {
+
+ DILocation *DL = Source->getDebugLoc();
+ auto Emit = [this, Source, After, DL](Metadata *Val, DIExpression *Expr) {
+ assert(Expr);
+ if (!Val)
+ Val = ValueAsMetadata::get(
+ PoisonValue::get(Type::getInt1Ty(Source->getContext())));
+
+ // Find a suitable insert point.
+ Instruction *InsertBefore = After->getNextNode();
+ assert(InsertBefore && "Shouldn't be inserting after a terminator");
+
+ VariableID Var = getVariableID(DebugVariable(Source));
+ VarLocInfo VarLoc;
+ VarLoc.VariableID = static_cast<VariableID>(Var);
+ VarLoc.Expr = Expr;
+ VarLoc.Values = RawLocationWrapper(Val);
+ VarLoc.DL = DL;
+ // Insert it into the map for later.
+ InsertBeforeMap[InsertBefore].push_back(VarLoc);
+ };
+
+ // NOTE: This block can mutate Kind.
+ if (Kind == LocKind::Mem) {
+ const auto *DAI = cast<DbgAssignIntrinsic>(Source);
+ // Check the address hasn't been dropped (e.g. the debug uses may not have
+ // been replaced before deleting a Value).
+ if (DAI->isKillAddress()) {
+ // The address isn't valid so treat this as a non-memory def.
+ Kind = LocKind::Val;
+ } else {
+ Value *Val = DAI->getAddress();
+ DIExpression *Expr = DAI->getAddressExpression();
+ assert(!Expr->getFragmentInfo() &&
+ "fragment info should be stored in value-expression only");
+ // Copy the fragment info over from the value-expression to the new
+ // DIExpression.
+ if (auto OptFragInfo = Source->getExpression()->getFragmentInfo()) {
+ auto FragInfo = *OptFragInfo;
+ Expr = *DIExpression::createFragmentExpression(
+ Expr, FragInfo.OffsetInBits, FragInfo.SizeInBits);
+ }
+ // The address-expression has an implicit deref, add it now.
+ std::tie(Val, Expr) =
+ walkToAllocaAndPrependOffsetDeref(Layout, Val, Expr);
+ Emit(ValueAsMetadata::get(Val), Expr);
+ return;
+ }
+ }
+
+ if (Kind == LocKind::Val) {
+ Emit(Source->getRawLocation(), Source->getExpression());
+ return;
+ }
+
+ if (Kind == LocKind::None) {
+ Emit(nullptr, Source->getExpression());
+ return;
+ }
+}
+
+void AssignmentTrackingLowering::processNonDbgInstruction(
+ Instruction &I, AssignmentTrackingLowering::BlockInfo *LiveSet) {
+ if (I.hasMetadata(LLVMContext::MD_DIAssignID))
+ processTaggedInstruction(I, LiveSet);
+ else
+ processUntaggedInstruction(I, LiveSet);
+}
+
+void AssignmentTrackingLowering::processUntaggedInstruction(
+ Instruction &I, AssignmentTrackingLowering::BlockInfo *LiveSet) {
+ // Interpret stack stores that are not tagged as an assignment in memory for
+ // the variables associated with that address. These stores may not be tagged
+ // because a) the store cannot be represented using dbg.assigns (non-const
+ // length or offset) or b) the tag was accidentally dropped during
+ // optimisations. For these stores we fall back to assuming that the stack
+ // home is a valid location for the variables. The benefit is that this
+ // prevents us missing an assignment and therefore incorrectly maintaining
+ // earlier location definitions, and in many cases it should be a reasonable
+ // assumption. However, this will occasionally lead to slight
+ // inaccuracies. The value of a hoisted untagged store will be visible
+ // "early", for example.
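+ // For example (illustrative): a memset or memcpy with a non-constant
+ // length, or a store whose DIAssignID was dropped during optimisation,
+ // ends up being handled here.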
+ assert(!I.hasMetadata(LLVMContext::MD_DIAssignID));
+ auto It = UntaggedStoreVars.find(&I);
+ if (It == UntaggedStoreVars.end())
+ return; // No variables associated with the store destination.
+
+ LLVM_DEBUG(dbgs() << "processUntaggedInstruction on UNTAGGED INST " << I
+ << "\n");
+ // Iterate over the variables that this store affects, add a NoneOrPhi dbg
+ // and mem def, set the LocKind to Mem, and emit a location def for each.
+ for (auto [Var, Info] : It->second) {
+ // This instruction is treated as both a debug and memory assignment,
+ // meaning the memory location should be used. We don't have an assignment
+ // ID though so use Assignment::makeNoneOrPhi() to create an imaginary one.
+ addMemDef(LiveSet, Var, Assignment::makeNoneOrPhi());
+ addDbgDef(LiveSet, Var, Assignment::makeNoneOrPhi());
+ setLocKind(LiveSet, Var, LocKind::Mem);
+ LLVM_DEBUG(dbgs() << " setting Stack LocKind to: " << locStr(LocKind::Mem)
+ << "\n");
+ // Build the dbg location def to insert.
+ //
+ // DIExpression: Add fragment and offset.
+ DebugVariable V = FnVarLocs->getVariable(Var);
+ DIExpression *DIE = DIExpression::get(I.getContext(), std::nullopt);
+ if (auto Frag = V.getFragment()) {
+ auto R = DIExpression::createFragmentExpression(DIE, Frag->OffsetInBits,
+ Frag->SizeInBits);
+ assert(R && "unexpected createFragmentExpression failure");
+ DIE = *R;
+ }
+ SmallVector<uint64_t, 3> Ops;
+ if (Info.OffsetInBits)
+ Ops = {dwarf::DW_OP_plus_uconst, Info.OffsetInBits / 8};
+ Ops.push_back(dwarf::DW_OP_deref);
+ DIE = DIExpression::prependOpcodes(DIE, Ops, /*StackValue=*/false,
+ /*EntryValue=*/false);
+ // Find a suitable insert point.
+ Instruction *InsertBefore = I.getNextNode();
+ assert(InsertBefore && "Shouldn't be inserting after a terminator");
+
+ // Get DILocation for this unrecorded assignment.
+ DILocation *InlinedAt = const_cast<DILocation *>(V.getInlinedAt());
+ const DILocation *DILoc = DILocation::get(
+ Fn.getContext(), 0, 0, V.getVariable()->getScope(), InlinedAt);
+
+ VarLocInfo VarLoc;
+ VarLoc.VariableID = static_cast<VariableID>(Var);
+ VarLoc.Expr = DIE;
+ VarLoc.Values = RawLocationWrapper(
+ ValueAsMetadata::get(const_cast<AllocaInst *>(Info.Base)));
+ VarLoc.DL = DILoc;
+ // Insert it into the map for later.
+ InsertBeforeMap[InsertBefore].push_back(VarLoc);
+ }
+}
+
+void AssignmentTrackingLowering::processTaggedInstruction(
+ Instruction &I, AssignmentTrackingLowering::BlockInfo *LiveSet) {
+ auto Linked = at::getAssignmentMarkers(&I);
+ // No dbg.assign intrinsics linked.
+ // FIXME: All vars that have a stack slot this store modifies that don't have
+ // a dbg.assign linked to it should probably treat this like an untagged
+ // store.
+ if (Linked.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "processTaggedInstruction on " << I << "\n");
+ for (DbgAssignIntrinsic *DAI : Linked) {
+ VariableID Var = getVariableID(DebugVariable(DAI));
+ // Something has gone wrong if VarsWithStackSlot doesn't contain a variable
+ // that is linked to a store.
+ assert(VarsWithStackSlot->count(getAggregate(DAI)) &&
+ "expected DAI's variable to have stack slot");
+
+ Assignment AV = Assignment::makeFromMemDef(getIDFromInst(I));
+ addMemDef(LiveSet, Var, AV);
+
+ LLVM_DEBUG(dbgs() << " linked to " << *DAI << "\n");
+ LLVM_DEBUG(dbgs() << " LiveLoc " << locStr(getLocKind(LiveSet, Var))
+ << " -> ");
+
+ // The last assignment to the stack is now AV. Check if the last debug
+ // assignment has a matching Assignment.
+ if (hasVarWithAssignment(LiveSet, BlockInfo::Debug, Var, AV)) {
+ // The StackHomeValue and DebugValue for this variable match so we can
+ // emit a stack home location here.
+ LLVM_DEBUG(dbgs() << "Mem, Stack matches Debug program\n";);
+ LLVM_DEBUG(dbgs() << " Stack val: "; AV.dump(dbgs()); dbgs() << "\n");
+ LLVM_DEBUG(dbgs() << " Debug val: ";
+ LiveSet->DebugValue[static_cast<unsigned>(Var)].dump(dbgs());
+ dbgs() << "\n");
+ setLocKind(LiveSet, Var, LocKind::Mem);
+ emitDbgValue(LocKind::Mem, DAI, &I);
+ continue;
+ }
+
+ // The StackHomeValue and DebugValue for this variable do not match, i.e.
+ // the value currently stored in the stack is not what we'd expect to
+ // see, so we cannot emit a stack home location here. Now we will
+ // look at the live LocKind for the variable and determine an appropriate
+ // dbg.value to emit.
+ LocKind PrevLoc = getLocKind(LiveSet, Var);
+ switch (PrevLoc) {
+ case LocKind::Val: {
+ // The value in memory has changed but we're not currently
+ // using the memory location. Do nothing.
+ LLVM_DEBUG(dbgs() << "Val, (unchanged)\n";);
+ setLocKind(LiveSet, Var, LocKind::Val);
+ } break;
+ case LocKind::Mem: {
+ // There's been an assignment to memory that we were using as a
+ // location for this variable, and the Assignment doesn't match what
+ // we'd expect to see in memory.
+ Assignment DbgAV = LiveSet->getAssignment(BlockInfo::Debug, Var);
+ if (DbgAV.Status == Assignment::NoneOrPhi) {
+ // We need to terminate any previously open location now.
+ LLVM_DEBUG(dbgs() << "None, No Debug value available\n";);
+ setLocKind(LiveSet, Var, LocKind::None);
+ emitDbgValue(LocKind::None, DAI, &I);
+ } else {
+ // The previous DebugValue Value can be used here.
+ LLVM_DEBUG(dbgs() << "Val, Debug value is Known\n";);
+ setLocKind(LiveSet, Var, LocKind::Val);
+ if (DbgAV.Source) {
+ emitDbgValue(LocKind::Val, DbgAV.Source, &I);
+ } else {
+ // DbgAV.Source is nullptr so we must emit undef here.
+ emitDbgValue(LocKind::None, DAI, &I);
+ }
+ }
+ } break;
+ case LocKind::None: {
+ // There's been an assignment to memory and we currently are
+ // not tracking a location for the variable. Do not emit anything.
+ LLVM_DEBUG(dbgs() << "None, (unchanged)\n";);
+ setLocKind(LiveSet, Var, LocKind::None);
+ } break;
+ }
+ }
+}
+
+void AssignmentTrackingLowering::processDbgAssign(DbgAssignIntrinsic &DAI,
+ BlockInfo *LiveSet) {
+ // Only bother tracking variables that are at some point stack homed. Other
+ // variables can be dealt with trivially later.
+ if (!VarsWithStackSlot->count(getAggregate(&DAI)))
+ return;
+
+ VariableID Var = getVariableID(DebugVariable(&DAI));
+ Assignment AV = Assignment::make(getIDFromMarker(DAI), &DAI);
+ addDbgDef(LiveSet, Var, AV);
+
+ LLVM_DEBUG(dbgs() << "processDbgAssign on " << DAI << "\n";);
+ LLVM_DEBUG(dbgs() << " LiveLoc " << locStr(getLocKind(LiveSet, Var))
+ << " -> ");
+
+ // Check if the DebugValue and StackHomeValue both hold the same
+ // Assignment.
+ if (hasVarWithAssignment(LiveSet, BlockInfo::Stack, Var, AV)) {
+ // They match. We can use the stack home because the debug intrinsics state
+ // that an assignment happened here, and we know that specific assignment
+ // was the last one to take place in memory for this variable.
+ LocKind Kind;
+ if (DAI.isKillAddress()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Val, Stack matches Debug program but address is killed\n";);
+ Kind = LocKind::Val;
+ } else {
+ LLVM_DEBUG(dbgs() << "Mem, Stack matches Debug program\n";);
+ Kind = LocKind::Mem;
+ };
+ setLocKind(LiveSet, Var, Kind);
+ emitDbgValue(Kind, &DAI, &DAI);
+ } else {
+ // The last assignment to the memory location isn't the one that we want to
+ // show to the user so emit a dbg.value(Value). Value may be undef.
+ LLVM_DEBUG(dbgs() << "Val, Stack contents is unknown\n";);
+ setLocKind(LiveSet, Var, LocKind::Val);
+ emitDbgValue(LocKind::Val, &DAI, &DAI);
+ }
+}
+
+void AssignmentTrackingLowering::processDbgValue(DbgValueInst &DVI,
+ BlockInfo *LiveSet) {
+ // Only bother tracking variables that are at some point stack homed.
+ // Other variables can be dealt with trivially later.
+ if (!VarsWithStackSlot->count(getAggregate(&DVI)))
+ return;
+
+ VariableID Var = getVariableID(DebugVariable(&DVI));
+ // We have no ID to create an Assignment with so we mark this assignment as
+ // NoneOrPhi. Note that the dbg.value still exists, we just cannot determine
+ // the assignment responsible for setting this value.
+ // This is fine; dbg.values are essentially interchangeable with unlinked
+ // dbg.assigns, and some passes such as mem2reg and instcombine add them to
+ // PHIs for promoted variables.
+ Assignment AV = Assignment::makeNoneOrPhi();
+ addDbgDef(LiveSet, Var, AV);
+
+ LLVM_DEBUG(dbgs() << "processDbgValue on " << DVI << "\n";);
+ LLVM_DEBUG(dbgs() << " LiveLoc " << locStr(getLocKind(LiveSet, Var))
+ << " -> Val, dbg.value override");
+
+ setLocKind(LiveSet, Var, LocKind::Val);
+ emitDbgValue(LocKind::Val, &DVI, &DVI);
+}
+
+static bool hasZeroSizedFragment(DbgVariableIntrinsic &DVI) {
+ if (auto F = DVI.getExpression()->getFragmentInfo())
+ return F->SizeInBits == 0;
+ return false;
+}
+
+void AssignmentTrackingLowering::processDbgInstruction(
+ DbgInfoIntrinsic &I, AssignmentTrackingLowering::BlockInfo *LiveSet) {
+ auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I);
+ if (!DVI)
+ return;
+
+ // Ignore assignments to zero bits of the variable.
+ if (hasZeroSizedFragment(*DVI))
+ return;
+
+ if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
+ processDbgAssign(*DAI, LiveSet);
+ else if (auto *DVI = dyn_cast<DbgValueInst>(&I))
+ processDbgValue(*DVI, LiveSet);
+}
+
+void AssignmentTrackingLowering::resetInsertionPoint(Instruction &After) {
+ assert(!After.isTerminator() && "Can't insert after a terminator");
+ auto R = InsertBeforeMap.find(After.getNextNode());
+ if (R == InsertBeforeMap.end())
+ return;
+ R->second.clear();
+}
+
+void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
+ for (auto II = BB.begin(), EI = BB.end(); II != EI;) {
+ assert(VarsTouchedThisFrame.empty());
+ // Process the instructions in "frames". A "frame" includes a single
+ // non-debug instruction followed by any debug instructions before the
+ // next non-debug instruction.
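+ // For example, in the sequence "store; dbg.assign; dbg.value; add", the
+ // first frame is the store together with the two debug intrinsics that
+ // follow it.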
+ if (!isa<DbgInfoIntrinsic>(&*II)) {
+ if (II->isTerminator())
+ break;
+ resetInsertionPoint(*II);
+ processNonDbgInstruction(*II, LiveSet);
+ assert(LiveSet->isValid());
+ ++II;
+ }
+ while (II != EI) {
+ auto *Dbg = dyn_cast<DbgInfoIntrinsic>(&*II);
+ if (!Dbg)
+ break;
+ resetInsertionPoint(*II);
+ processDbgInstruction(*Dbg, LiveSet);
+ assert(LiveSet->isValid());
+ ++II;
+ }
+
+ // We've processed everything in the "frame". Now determine which variables
+ // cannot be represented by a dbg.declare.
+ for (auto Var : VarsTouchedThisFrame) {
+ LocKind Loc = getLocKind(LiveSet, Var);
+ // If a variable's LocKind is anything other than LocKind::Mem then we
+ // must note that it cannot be represented with a dbg.declare.
+ // Note that this check is enough without having to check the result of
+ // joins() because for join to produce anything other than Mem after
+ // we've already seen a Mem we'd be joining None or Val with Mem. In that
+ // case, we've already hit this codepath when we set the LocKind to Val
+ // or None in that block.
+ if (Loc != LocKind::Mem) {
+ DebugVariable DbgVar = FnVarLocs->getVariable(Var);
+ DebugAggregate Aggr{DbgVar.getVariable(), DbgVar.getInlinedAt()};
+ NotAlwaysStackHomed.insert(Aggr);
+ }
+ }
+ VarsTouchedThisFrame.clear();
+ }
+}
+
+AssignmentTrackingLowering::LocKind
+AssignmentTrackingLowering::joinKind(LocKind A, LocKind B) {
+ // Partial order:
+ // None > Mem, Val
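+ // For illustration, given the implementation below:
+ //   joinKind(Mem, Mem) == Mem, joinKind(Val, Val) == Val,
+ //   joinKind(Mem, Val) == None, joinKind(None, X) == None.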
+ return A == B ? A : LocKind::None;
+}
+
+AssignmentTrackingLowering::Assignment
+AssignmentTrackingLowering::joinAssignment(const Assignment &A,
+ const Assignment &B) {
+ // Partial order:
+ // NoneOrPhi(null, null) > Known(v, ?s)
+
+ // If either is NoneOrPhi the join is NoneOrPhi.
+ // If the values are different then the result is
+ // NoneOrPhi (joining two different values is a Phi).
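+ // Illustrative examples (see the implementation below):
+ //   join(Known(!1, S), Known(!1, S))   == Known(!1, S)
+ //   join(Known(!1, S1), Known(!1, S2)) == Known(!1, nullptr) when S1 and S2
+ //                                         are not identical
+ //   join(Known(!1, _), Known(!2, _))   == NoneOrPhi
+ //   join(NoneOrPhi, _)                 == NoneOrPhi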
+ if (!A.isSameSourceAssignment(B))
+ return Assignment::makeNoneOrPhi();
+ if (A.Status == Assignment::NoneOrPhi)
+ return Assignment::makeNoneOrPhi();
+
+ // Source is used to lookup the value + expression in the debug program if
+ // the stack slot gets assigned a value earlier than expected. Because
+ // we're only tracking the one dbg.assign, we can't capture debug PHIs.
+ // It's unlikely that we're losing out on much coverage by avoiding that
+ // extra work.
+ // The Source may differ in this situation:
+ // Pred.1:
+ // dbg.assign i32 0, ..., !1, ...
+ // Pred.2:
+ // dbg.assign i32 1, ..., !1, ...
+ // Here the same assignment (!1) was performed in both preds in the source,
+ // but we can't use either one unless they are identical (e.g. we don't
+ // want to arbitrarily pick between constant values).
+ auto JoinSource = [&]() -> DbgAssignIntrinsic * {
+ if (A.Source == B.Source)
+ return A.Source;
+ if (A.Source == nullptr || B.Source == nullptr)
+ return nullptr;
+ if (A.Source->isIdenticalTo(B.Source))
+ return A.Source;
+ return nullptr;
+ };
+ DbgAssignIntrinsic *Source = JoinSource();
+ assert(A.Status == B.Status && A.Status == Assignment::Known);
+ assert(A.ID == B.ID);
+ return Assignment::make(A.ID, Source);
+}
+
+AssignmentTrackingLowering::BlockInfo
+AssignmentTrackingLowering::joinBlockInfo(const BlockInfo &A,
+ const BlockInfo &B) {
+ return BlockInfo::join(A, B, TrackedVariablesVectorSize);
+}
+
+bool AssignmentTrackingLowering::join(
+ const BasicBlock &BB, const SmallPtrSet<BasicBlock *, 16> &Visited) {
+
+ SmallVector<const BasicBlock *> VisitedPreds;
+ // Ignore backedges if we have not visited the predecessor yet. As the
+ // predecessor hasn't yet had locations propagated into it, most locations
+ // will not yet be valid, so treat them as all being uninitialized and
+ // potentially valid. If a location guessed to be correct here is
+ // invalidated later, we will remove it when we revisit this block. This
+ // is essentially the same as initialising all LocKinds and Assignments to
+ // an implicit ⊥ value which is the identity value for the join operation.
+ for (auto I = pred_begin(&BB), E = pred_end(&BB); I != E; I++) {
+ const BasicBlock *Pred = *I;
+ if (Visited.count(Pred))
+ VisitedPreds.push_back(Pred);
+ }
+
+ // No preds visited yet.
+ if (VisitedPreds.empty()) {
+ auto It = LiveIn.try_emplace(&BB, BlockInfo());
+ bool DidInsert = It.second;
+ if (DidInsert)
+ It.first->second.init(TrackedVariablesVectorSize);
+ return /*Changed*/ DidInsert;
+ }
+
+ // Exactly one visited pred. Copy the LiveOut from that pred into BB LiveIn.
+ if (VisitedPreds.size() == 1) {
+ const BlockInfo &PredLiveOut = LiveOut.find(VisitedPreds[0])->second;
+ auto CurrentLiveInEntry = LiveIn.find(&BB);
+
+ // Check if there isn't an entry, or there is but the LiveIn set has
+ // changed (expensive check).
+ if (CurrentLiveInEntry == LiveIn.end())
+ LiveIn.insert(std::make_pair(&BB, PredLiveOut));
+ else if (PredLiveOut != CurrentLiveInEntry->second)
+ CurrentLiveInEntry->second = PredLiveOut;
+ else
+ return /*Changed*/ false;
+ return /*Changed*/ true;
+ }
+
+ // More than one pred. Join LiveOuts of blocks 1 and 2.
+ assert(VisitedPreds.size() > 1);
+ const BlockInfo &PredLiveOut0 = LiveOut.find(VisitedPreds[0])->second;
+ const BlockInfo &PredLiveOut1 = LiveOut.find(VisitedPreds[1])->second;
+ BlockInfo BBLiveIn = joinBlockInfo(PredLiveOut0, PredLiveOut1);
+
+ // Join the LiveOuts of subsequent blocks.
+ ArrayRef Tail = ArrayRef(VisitedPreds).drop_front(2);
+ for (const BasicBlock *Pred : Tail) {
+ const auto &PredLiveOut = LiveOut.find(Pred);
+ assert(PredLiveOut != LiveOut.end() &&
+ "block should have been processed already");
+ BBLiveIn = joinBlockInfo(std::move(BBLiveIn), PredLiveOut->second);
+ }
+
+ // Save the joined result for BB.
+ auto CurrentLiveInEntry = LiveIn.find(&BB);
+ // Check if there isn't an entry, or there is but the LiveIn set has changed
+ // (expensive check).
+ if (CurrentLiveInEntry == LiveIn.end())
+ LiveIn.try_emplace(&BB, std::move(BBLiveIn));
+ else if (BBLiveIn != CurrentLiveInEntry->second)
+ CurrentLiveInEntry->second = std::move(BBLiveIn);
+ else
+ return /*Changed*/ false;
+ return /*Changed*/ true;
+}
+
+/// Return true if A fully contains B.
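+/// For example, a 64-bit fragment at offset 0 fully contains a 16-bit
+/// fragment at offset 16, but not a 32-bit fragment at offset 48.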
+static bool fullyContains(DIExpression::FragmentInfo A,
+ DIExpression::FragmentInfo B) {
+ auto ALeft = A.OffsetInBits;
+ auto BLeft = B.OffsetInBits;
+ if (BLeft < ALeft)
+ return false;
+
+ auto ARight = ALeft + A.SizeInBits;
+ auto BRight = BLeft + B.SizeInBits;
+ if (BRight > ARight)
+ return false;
+ return true;
+}
+
+static std::optional<at::AssignmentInfo>
+getUntaggedStoreAssignmentInfo(const Instruction &I, const DataLayout &Layout) {
+ // Don't bother checking if this is an AllocaInst. We know this
+ // instruction has no tag which means there are no variables associated
+ // with it.
+ if (const auto *SI = dyn_cast<StoreInst>(&I))
+ return at::getAssignmentInfo(Layout, SI);
+ if (const auto *MI = dyn_cast<MemIntrinsic>(&I))
+ return at::getAssignmentInfo(Layout, MI);
+ // Alloca or non-store-like inst.
+ return std::nullopt;
+}
+
+/// Build a map of {Variable x: Variables y} where all variable fragments
+/// contained within the variable fragment x are in set y. This means that
+/// y does not contain all overlaps because partial overlaps are excluded.
+///
+/// While we're iterating over the function, add single location defs for
+/// dbg.declares to \p FnVarLocs.
+///
+/// Variables that are interesting to this pass are added to
+/// FnVarLocs->Variables first. TrackedVariablesVectorSize is set to the ID of
+/// the last interesting variable plus 1, meaning variables with ID 1
+/// (inclusive) to TrackedVariablesVectorSize (exclusive) are interesting. The
+/// subsequent variables are either stack homed or fully promoted.
+///
+/// Finally, populate UntaggedStoreVars with a mapping of untagged stores to
+/// the stored-to variable fragments.
+///
+/// These tasks are bundled together to reduce the number of times we need
+/// to iterate over the function as they can be achieved together in one pass.
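+///
+/// For illustration, if a variable has fragments covering bits [0, 64),
+/// [0, 32) and [32, 64), the resulting map is
+///   [0, 64) -> { [0, 32), [32, 64) }
+/// (keyed by VariableID); the smaller fragments map to nothing because they
+/// contain no other fragment, and partial overlaps are not recorded.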
+static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
+ Function &Fn, FunctionVarLocsBuilder *FnVarLocs,
+ const DenseSet<DebugAggregate> &VarsWithStackSlot,
+ AssignmentTrackingLowering::UntaggedStoreAssignmentMap &UntaggedStoreVars,
+ unsigned &TrackedVariablesVectorSize) {
+ DenseSet<DebugVariable> Seen;
+ // Map of Variable: [Fragments].
+ DenseMap<DebugAggregate, SmallVector<DebugVariable, 8>> FragmentMap;
+ // Iterate over all instructions:
+ // - dbg.declare -> add single location variable record
+ // - dbg.* -> Add fragments to FragmentMap
+ // - untagged store -> Add fragments to FragmentMap and update
+ // UntaggedStoreVars.
+ // We need to add fragments for untagged stores too so that we can correctly
+ // clobber overlapped fragment locations later.
+ SmallVector<DbgDeclareInst *> Declares;
+ for (auto &BB : Fn) {
+ for (auto &I : BB) {
+ if (auto *DDI = dyn_cast<DbgDeclareInst>(&I)) {
+ Declares.push_back(DDI);
+ } else if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
+ DebugVariable DV = DebugVariable(DII);
+ DebugAggregate DA = {DV.getVariable(), DV.getInlinedAt()};
+ if (!VarsWithStackSlot.contains(DA))
+ continue;
+ if (Seen.insert(DV).second)
+ FragmentMap[DA].push_back(DV);
+ } else if (auto Info = getUntaggedStoreAssignmentInfo(
+ I, Fn.getParent()->getDataLayout())) {
+ // Find markers linked to this alloca.
+ for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(Info->Base)) {
+ // Discard the fragment if it covers the entire variable.
+ std::optional<DIExpression::FragmentInfo> FragInfo =
+ [&Info, DAI]() -> std::optional<DIExpression::FragmentInfo> {
+ DIExpression::FragmentInfo F;
+ F.OffsetInBits = Info->OffsetInBits;
+ F.SizeInBits = Info->SizeInBits;
+ if (auto ExistingFrag = DAI->getExpression()->getFragmentInfo())
+ F.OffsetInBits += ExistingFrag->OffsetInBits;
+ if (auto Sz = DAI->getVariable()->getSizeInBits()) {
+ if (F.OffsetInBits == 0 && F.SizeInBits == *Sz)
+ return std::nullopt;
+ }
+ return F;
+ }();
+
+ DebugVariable DV = DebugVariable(DAI->getVariable(), FragInfo,
+ DAI->getDebugLoc().getInlinedAt());
+ DebugAggregate DA = {DV.getVariable(), DV.getInlinedAt()};
+ if (!VarsWithStackSlot.contains(DA))
+ continue;
+
+ // Cache this info for later.
+ UntaggedStoreVars[&I].push_back(
+ {FnVarLocs->insertVariable(DV), *Info});
+
+ if (Seen.insert(DV).second)
+ FragmentMap[DA].push_back(DV);
+ }
+ }
+ }
+ }
+
+ // Sort the fragment map for each DebugAggregate in ascending
+ // order of fragment size - there should be no duplicates.
+ for (auto &Pair : FragmentMap) {
+ SmallVector<DebugVariable, 8> &Frags = Pair.second;
+ std::sort(Frags.begin(), Frags.end(),
+ [](const DebugVariable &Next, const DebugVariable &Elmt) {
+ return Elmt.getFragmentOrDefault().SizeInBits >
+ Next.getFragmentOrDefault().SizeInBits;
+ });
+ // Check for duplicates.
+ assert(std::adjacent_find(Frags.begin(), Frags.end()) == Frags.end());
+ }
+
+ // Build the map.
+ AssignmentTrackingLowering::OverlapMap Map;
+ for (auto &Pair : FragmentMap) {
+ auto &Frags = Pair.second;
+ for (auto It = Frags.begin(), IEnd = Frags.end(); It != IEnd; ++It) {
+ DIExpression::FragmentInfo Frag = It->getFragmentOrDefault();
+ // Find the frags that this is contained within.
+ //
+ // Because Frags is sorted by size and none have the same offset and
+ // size, we know that this frag can only be contained by subsequent
+ // elements.
+ SmallVector<DebugVariable, 8>::iterator OtherIt = It;
+ ++OtherIt;
+ VariableID ThisVar = FnVarLocs->insertVariable(*It);
+ for (; OtherIt != IEnd; ++OtherIt) {
+ DIExpression::FragmentInfo OtherFrag = OtherIt->getFragmentOrDefault();
+ VariableID OtherVar = FnVarLocs->insertVariable(*OtherIt);
+ if (fullyContains(OtherFrag, Frag))
+ Map[OtherVar].push_back(ThisVar);
+ }
+ }
+ }
+
+ // VariableIDs are 1-based so the variable-tracking bitvector needs
+ // NumVariables plus 1 bits.
+ TrackedVariablesVectorSize = FnVarLocs->getNumVariables() + 1;
+
+ // Finally, insert the declares afterwards, so the first IDs are all
+ // partially stack homed vars.
+ for (auto *DDI : Declares)
+ FnVarLocs->addSingleLocVar(DebugVariable(DDI), DDI->getExpression(),
+ DDI->getDebugLoc(), DDI->getWrappedLocation());
+ return Map;
+}
+
+bool AssignmentTrackingLowering::run(FunctionVarLocsBuilder *FnVarLocsBuilder) {
+ if (Fn.size() > MaxNumBlocks) {
+ LLVM_DEBUG(dbgs() << "[AT] Dropping var locs in: " << Fn.getName()
+ << ": too many blocks (" << Fn.size() << ")\n");
+ at::deleteAll(&Fn);
+ return false;
+ }
+
+ FnVarLocs = FnVarLocsBuilder;
+
+ // The general structure here is inspired by VarLocBasedImpl.cpp
+ // (LiveDebugValues).
+
+ // Build the variable fragment overlap map.
+ // Note that this pass doesn't handle partial overlaps correctly (FWIW
+ // neither does LiveDebugVariables) because that is difficult to do and
+ // appears to be a rare occurrence.
+ VarContains = buildOverlapMapAndRecordDeclares(
+ Fn, FnVarLocs, *VarsWithStackSlot, UntaggedStoreVars,
+ TrackedVariablesVectorSize);
+
+ // Prepare for traversal.
+ ReversePostOrderTraversal<Function *> RPOT(&Fn);
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Worklist;
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Pending;
+ DenseMap<unsigned int, BasicBlock *> OrderToBB;
+ DenseMap<BasicBlock *, unsigned int> BBToOrder;
+ { // Init OrderToBB and BBToOrder.
+ unsigned int RPONumber = 0;
+ for (auto RI = RPOT.begin(), RE = RPOT.end(); RI != RE; ++RI) {
+ OrderToBB[RPONumber] = *RI;
+ BBToOrder[*RI] = RPONumber;
+ Worklist.push(RPONumber);
+ ++RPONumber;
+ }
+ LiveIn.init(RPONumber);
+ LiveOut.init(RPONumber);
+ }
+
+ // Perform the traversal.
+ //
+ // This is a standard "union of predecessor outs" dataflow problem. To solve
+ // it, we perform join() and process() using the two worklist method until
+ // the LiveIn data for each block becomes unchanging. The "proof" that this
+ // terminates can be put together by looking at the comments around LocKind,
+ // Assignment, and the various join methods, which show that all the elements
+ // involved are made up of join-semilattices; LiveIn(n) can only
+ // monotonically increase in value throughout the dataflow.
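+ //
+ // Concretely: blocks are visited in RPO order; when a block's LiveOut
+ // changes, its successors are pushed onto the pending worklist, which
+ // becomes the active worklist for the next sweep, until a fixed point is
+ // reached.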
+ //
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ while (!Worklist.empty()) {
+ // We track what is on the pending worklist to avoid inserting the same
+ // thing twice.
+ SmallPtrSet<BasicBlock *, 16> OnPending;
+ LLVM_DEBUG(dbgs() << "Processing Worklist\n");
+ while (!Worklist.empty()) {
+ BasicBlock *BB = OrderToBB[Worklist.top()];
+ LLVM_DEBUG(dbgs() << "\nPop BB " << BB->getName() << "\n");
+ Worklist.pop();
+ bool InChanged = join(*BB, Visited);
+ // Always consider LiveIn changed on the first visit.
+ InChanged |= Visited.insert(BB).second;
+ if (InChanged) {
+ LLVM_DEBUG(dbgs() << BB->getName() << " has new InLocs, process it\n");
+ // Mutate a copy of LiveIn while processing BB. After calling process,
+ // LiveSet is the LiveOut set for BB.
+ BlockInfo LiveSet = LiveIn[BB];
+
+ // Process the instructions in the block.
+ process(*BB, &LiveSet);
+
+ // Relatively expensive check: has anything changed in LiveOut for BB?
+ if (LiveOut[BB] != LiveSet) {
+ LLVM_DEBUG(dbgs() << BB->getName()
+ << " has new OutLocs, add succs to worklist: [ ");
+ LiveOut[BB] = std::move(LiveSet);
+ for (auto I = succ_begin(BB), E = succ_end(BB); I != E; I++) {
+ if (OnPending.insert(*I).second) {
+ LLVM_DEBUG(dbgs() << I->getName() << " ");
+ Pending.push(BBToOrder[*I]);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "]\n");
+ }
+ }
+ }
+ Worklist.swap(Pending);
+ // At this point, Pending must be empty, since it was just swapped with the
+ // (now empty) Worklist.
+ assert(Pending.empty() && "Pending should be empty");
+ }
+
+ // That's the hard part over. Now we just have some admin to do.
+
+ // Record whether we inserted any intrinsics.
+ bool InsertedAnyIntrinsics = false;
+
+ // Identify and add defs for single location variables.
+ //
+ // Go through all of the defs that we plan to add. If the aggregate variable
+ // it's a part of is not in the NotAlwaysStackHomed set we can emit a single
+ // location def and omit the rest. Add an entry to AlwaysStackHomed so that
+ // we can identify those unneeded defs later.
+ DenseSet<DebugAggregate> AlwaysStackHomed;
+ for (const auto &Pair : InsertBeforeMap) {
+ const auto &Vec = Pair.second;
+ for (VarLocInfo VarLoc : Vec) {
+ DebugVariable Var = FnVarLocs->getVariable(VarLoc.VariableID);
+ DebugAggregate Aggr{Var.getVariable(), Var.getInlinedAt()};
+
+ // Skip this Var if it's not always stack homed.
+ if (NotAlwaysStackHomed.contains(Aggr))
+ continue;
+
+ // Skip complex cases such as when different fragments of a variable have
+ // been split into different allocas. Skipping in this case means falling
+ // back to using a list of defs (which could reduce coverage, but is no
+ // less correct).
+ bool Simple =
+ VarLoc.Expr->getNumElements() == 1 && VarLoc.Expr->startsWithDeref();
+ if (!Simple) {
+ NotAlwaysStackHomed.insert(Aggr);
+ continue;
+ }
+
+ // All source assignments to this variable remain and all stores to any
+ // part of the variable store to the same address (with varying
+ // offsets). We can just emit a single location for the whole variable.
+ //
+ // Unless we've already done so, create the single location def now.
+ if (AlwaysStackHomed.insert(Aggr).second) {
+ assert(!VarLoc.Values.hasArgList());
+ // TODO: When more complex cases are handled VarLoc.Expr should be
+ // built appropriately rather than always using an empty DIExpression.
+ // The assert below is a reminder.
+ assert(Simple);
+ VarLoc.Expr = DIExpression::get(Fn.getContext(), std::nullopt);
+ DebugVariable Var = FnVarLocs->getVariable(VarLoc.VariableID);
+ FnVarLocs->addSingleLocVar(Var, VarLoc.Expr, VarLoc.DL, VarLoc.Values);
+ InsertedAnyIntrinsics = true;
+ }
+ }
+ }
+
+ // Insert the other DEFs.
+ for (const auto &[InsertBefore, Vec] : InsertBeforeMap) {
+ SmallVector<VarLocInfo> NewDefs;
+ for (const VarLocInfo &VarLoc : Vec) {
+ DebugVariable Var = FnVarLocs->getVariable(VarLoc.VariableID);
+ DebugAggregate Aggr{Var.getVariable(), Var.getInlinedAt()};
+ // If this variable is always stack homed then we have already inserted a
+ // dbg.declare and deleted this dbg.value.
+ if (AlwaysStackHomed.contains(Aggr))
+ continue;
+ NewDefs.push_back(VarLoc);
+ InsertedAnyIntrinsics = true;
+ }
+
+ FnVarLocs->setWedge(InsertBefore, std::move(NewDefs));
+ }
+
+ InsertedAnyIntrinsics |= emitPromotedVarLocs(FnVarLocs);
+
+ return InsertedAnyIntrinsics;
+}
+
+bool AssignmentTrackingLowering::emitPromotedVarLocs(
+ FunctionVarLocsBuilder *FnVarLocs) {
+ bool InsertedAnyIntrinsics = false;
+ // Go through every block, translating debug intrinsics for fully promoted
+ // variables into FnVarLocs location defs. No analysis required for these.
+ for (auto &BB : Fn) {
+ for (auto &I : BB) {
+ // Skip instructions other than dbg.values and dbg.assigns.
+ auto *DVI = dyn_cast<DbgValueInst>(&I);
+ if (!DVI)
+ continue;
+ // Skip variables that haven't been promoted - we've dealt with those
+ // already.
+ if (VarsWithStackSlot->contains(getAggregate(DVI)))
+ continue;
+ Instruction *InsertBefore = I.getNextNode();
+ assert(InsertBefore && "Unexpected: debug intrinsics after a terminator");
+ FnVarLocs->addVarLoc(InsertBefore, DebugVariable(DVI),
+ DVI->getExpression(), DVI->getDebugLoc(),
+ DVI->getWrappedLocation());
+ InsertedAnyIntrinsics = true;
+ }
+ }
+ return InsertedAnyIntrinsics;
+}
+
+/// Remove redundant definitions within sequences of consecutive location defs.
+/// This is done using a backward scan to keep the last def describing a
+/// specific variable/fragment.
+///
+/// This implements removeRedundantDbgInstrsUsingBackwardScan from
+/// lib/Transforms/Utils/BasicBlockUtils.cpp for locations described with
+/// FunctionVarLocsBuilder instead of with intrinsics.
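+///
+/// For example (illustrative), if a single wedge defines bits [0, 32) of a
+/// variable and later defines bits [0, 64), the earlier def is fully eclipsed
+/// by the later one and is removed.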
+static bool
+removeRedundantDbgLocsUsingBackwardScan(const BasicBlock *BB,
+ FunctionVarLocsBuilder &FnVarLocs) {
+ bool Changed = false;
+ SmallDenseMap<DebugAggregate, BitVector> VariableDefinedBits;
+ // Scan over the entire block, not just over the instructions mapped by
+ // FnVarLocs, because wedges in FnVarLocs may only be separated by debug
+ // instructions.
+ for (const Instruction &I : reverse(*BB)) {
+ if (!isa<DbgVariableIntrinsic>(I)) {
+ // Sequence of consecutive defs ended. Clear map for the next one.
+ VariableDefinedBits.clear();
+ }
+
+ // Get the location defs that start just before this instruction.
+ const auto *Locs = FnVarLocs.getWedge(&I);
+ if (!Locs)
+ continue;
+
+ NumWedgesScanned++;
+ bool ChangedThisWedge = false;
+ // The new pruned set of defs, reversed because we're scanning backwards.
+ SmallVector<VarLocInfo> NewDefsReversed;
+
+ // Iterate over the existing defs in reverse.
+ for (auto RIt = Locs->rbegin(), REnd = Locs->rend(); RIt != REnd; ++RIt) {
+ NumDefsScanned++;
+ DebugAggregate Aggr =
+ getAggregate(FnVarLocs.getVariable(RIt->VariableID));
+ uint64_t SizeInBits = Aggr.first->getSizeInBits().value_or(0);
+
+ if (SizeInBits == 0) {
+ // If the size is unknown (0) then keep this location def to be safe.
+ NewDefsReversed.push_back(*RIt);
+ continue;
+ }
+
+ // Only keep this location definition if it is not fully eclipsed by
+ // other definitions in this wedge that come after it.
+
+ // Insert the bits that the location definition defines.
+ auto InsertResult =
+ VariableDefinedBits.try_emplace(Aggr, BitVector(SizeInBits));
+ bool FirstDefinition = InsertResult.second;
+ BitVector &DefinedBits = InsertResult.first->second;
+
+ DIExpression::FragmentInfo Fragment =
+ RIt->Expr->getFragmentInfo().value_or(
+ DIExpression::FragmentInfo(SizeInBits, 0));
+ bool InvalidFragment = Fragment.endInBits() > SizeInBits;
+
+ // If this defines any previously undefined bits, keep it.
+ if (FirstDefinition || InvalidFragment ||
+ DefinedBits.find_first_unset_in(Fragment.startInBits(),
+ Fragment.endInBits()) != -1) {
+ if (!InvalidFragment)
+ DefinedBits.set(Fragment.startInBits(), Fragment.endInBits());
+ NewDefsReversed.push_back(*RIt);
+ continue;
+ }
+
+ // Redundant def found: throw it away. Since the wedge of defs is being
+ // rebuilt, doing nothing is the same as deleting an entry.
+ ChangedThisWedge = true;
+ NumDefsRemoved++;
+ }
+
+ // Un-reverse the defs and replace the wedge with the pruned version.
+ if (ChangedThisWedge) {
+ std::reverse(NewDefsReversed.begin(), NewDefsReversed.end());
+ FnVarLocs.setWedge(&I, std::move(NewDefsReversed));
+ NumWedgesChanged++;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+/// Remove redundant location defs using a forward scan. This can remove a
+/// location definition that is redundant due to indicating that a variable has
+/// the same value as is already being indicated by an earlier def.
+///
+/// This implements removeRedundantDbgInstrsUsingForwardScan from
+/// lib/Transforms/Utils/BasicBlockUtils.cpp for locations described with
+/// FunctionVarLocsBuilder instead of with intrinsics.
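+///
+/// For example (illustrative), a def that re-states the same value and
+/// expression already in effect for a variable at that point is dropped.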
+static bool
+removeRedundantDbgLocsUsingForwardScan(const BasicBlock *BB,
+ FunctionVarLocsBuilder &FnVarLocs) {
+ bool Changed = false;
+ DenseMap<DebugVariable, std::pair<RawLocationWrapper, DIExpression *>>
+ VariableMap;
+
+ // Scan over the entire block, not just over the instructions mapped by
+ // FnVarLocs, because wedges in FnVarLocs may only be separated by debug
+ // instructions.
+ for (const Instruction &I : *BB) {
+ // Get the defs that come just before this instruction.
+ const auto *Locs = FnVarLocs.getWedge(&I);
+ if (!Locs)
+ continue;
+
+ NumWedgesScanned++;
+ bool ChangedThisWedge = false;
+ // The new pruned set of defs.
+ SmallVector<VarLocInfo> NewDefs;
+
+ // Iterate over the existing defs.
+ for (const VarLocInfo &Loc : *Locs) {
+ NumDefsScanned++;
+ DebugVariable Key(FnVarLocs.getVariable(Loc.VariableID).getVariable(),
+ std::nullopt, Loc.DL.getInlinedAt());
+ auto VMI = VariableMap.find(Key);
+
+ // Update the map if we found a new value/expression describing the
+ // variable, or if the variable wasn't mapped already.
+ if (VMI == VariableMap.end() || VMI->second.first != Loc.Values ||
+ VMI->second.second != Loc.Expr) {
+ VariableMap[Key] = {Loc.Values, Loc.Expr};
+ NewDefs.push_back(Loc);
+ continue;
+ }
+
+ // Did not insert this Loc, which is the same as removing it.
+ ChangedThisWedge = true;
+ NumDefsRemoved++;
+ }
+
+ // Replace the existing wedge with the pruned version.
+ if (ChangedThisWedge) {
+ FnVarLocs.setWedge(&I, std::move(NewDefs));
+ NumWedgesChanged++;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+static bool
+removeUndefDbgLocsFromEntryBlock(const BasicBlock *BB,
+ FunctionVarLocsBuilder &FnVarLocs) {
+ assert(BB->isEntryBlock());
+ // Do extra work to ensure that we remove semantically unimportant undefs.
+ //
+ // This is to work around the fact that SelectionDAG will hoist dbg.values
+ // using argument values to the top of the entry block. That can move arg
+ // dbg.values before undef and constant dbg.values which they previously
+ // followed. The easiest thing to do is to just try to feed SelectionDAG
+ // input it's happy with.
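+ //
+ // For example (illustrative), a kill (undef) location def for a variable
+ // that appears before any non-undef def of that variable in the entry block
+ // is removed; undef defs that follow a non-undef def of overlapping bits
+ // are kept.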
+ //
+ // Map of {Variable x: Fragments y} where the fragments y of variable x
+ // have at least one non-undef location defined already. Don't use directly,
+ // instead call DefineBits and HasDefinedBits.
+ SmallDenseMap<DebugAggregate, SmallDenseSet<DIExpression::FragmentInfo>>
+ VarsWithDef;
+ // Specify that V (a fragment of A) has a non-undef location.
+ auto DefineBits = [&VarsWithDef](DebugAggregate A, DebugVariable V) {
+ VarsWithDef[A].insert(V.getFragmentOrDefault());
+ };
+ // Return true if a non-undef location has been defined for V (a fragment of
+ // A). Doesn't imply that the location is currently non-undef, just that a
+ // non-undef location has been seen previously.
+ auto HasDefinedBits = [&VarsWithDef](DebugAggregate A, DebugVariable V) {
+ auto FragsIt = VarsWithDef.find(A);
+ if (FragsIt == VarsWithDef.end())
+ return false;
+ return llvm::any_of(FragsIt->second, [V](auto Frag) {
+ return DIExpression::fragmentsOverlap(Frag, V.getFragmentOrDefault());
+ });
+ };
+
+ bool Changed = false;
+ DenseMap<DebugVariable, std::pair<Value *, DIExpression *>> VariableMap;
+
+ // Scan over the entire block, not just over the instructions mapped by
+ // FnVarLocs, because wedges in FnVarLocs may only be separated by debug
+ // instructions.
+ for (const Instruction &I : *BB) {
+ // Get the defs that come just before this instruction.
+ const auto *Locs = FnVarLocs.getWedge(&I);
+ if (!Locs)
+ continue;
+
+ NumWedgesScanned++;
+ bool ChangedThisWedge = false;
+ // The new pruned set of defs.
+ SmallVector<VarLocInfo> NewDefs;
+
+ // Iterate over the existing defs.
+ for (const VarLocInfo &Loc : *Locs) {
+ NumDefsScanned++;
+ DebugAggregate Aggr{FnVarLocs.getVariable(Loc.VariableID).getVariable(),
+ Loc.DL.getInlinedAt()};
+ DebugVariable Var = FnVarLocs.getVariable(Loc.VariableID);
+
+ // Remove undef entries that are encountered before any non-undef
+ // intrinsics from the entry block.
+ if (Loc.Values.isKillLocation(Loc.Expr) && !HasDefinedBits(Aggr, Var)) {
+ // Did not insert this Loc, which is the same as removing it.
+ NumDefsRemoved++;
+ ChangedThisWedge = true;
+ continue;
+ }
+
+ DefineBits(Aggr, Var);
+ NewDefs.push_back(Loc);
+ }
+
+ // Replace the existing wedge with the pruned version.
+ if (ChangedThisWedge) {
+ FnVarLocs.setWedge(&I, std::move(NewDefs));
+ NumWedgesChanged++;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+static bool removeRedundantDbgLocs(const BasicBlock *BB,
+ FunctionVarLocsBuilder &FnVarLocs) {
+ bool MadeChanges = false;
+ MadeChanges |= removeRedundantDbgLocsUsingBackwardScan(BB, FnVarLocs);
+ if (BB->isEntryBlock())
+ MadeChanges |= removeUndefDbgLocsFromEntryBlock(BB, FnVarLocs);
+ MadeChanges |= removeRedundantDbgLocsUsingForwardScan(BB, FnVarLocs);
+
+ if (MadeChanges)
+ LLVM_DEBUG(dbgs() << "Removed redundant dbg locs from: " << BB->getName()
+ << "\n");
+ return MadeChanges;
+}
+
+static DenseSet<DebugAggregate> findVarsWithStackSlot(Function &Fn) {
+ DenseSet<DebugAggregate> Result;
+ for (auto &BB : Fn) {
+ for (auto &I : BB) {
+ // Any variable linked to an instruction is considered
+ // interesting. Ideally we only need to check Allocas, however, a
+ // DIAssignID might get dropped from an alloca but not stores. In that
+ // case, we need to consider the variable interesting for NFC behaviour
+ // with this change. TODO: Consider only looking at allocas.
+ for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(&I)) {
+ Result.insert({DAI->getVariable(), DAI->getDebugLoc().getInlinedAt()});
+ }
+ }
+ }
+ return Result;
+}
+
+static void analyzeFunction(Function &Fn, const DataLayout &Layout,
+ FunctionVarLocsBuilder *FnVarLocs) {
+ // The analysis will generate location definitions for all variables, but we
+ // only need to perform a dataflow on the set of variables which have a stack
+ // slot. Find those now.
+ DenseSet<DebugAggregate> VarsWithStackSlot = findVarsWithStackSlot(Fn);
+
+ bool Changed = false;
+
+ // Use a scope block to clean up AssignmentTrackingLowering before running
+ // MemLocFragmentFill to reduce peak memory consumption.
+ {
+ AssignmentTrackingLowering Pass(Fn, Layout, &VarsWithStackSlot);
+ Changed = Pass.run(FnVarLocs);
+ }
+
+ if (Changed) {
+ MemLocFragmentFill Pass(Fn, &VarsWithStackSlot,
+ shouldCoalesceFragments(Fn));
+ Pass.run(FnVarLocs);
+
+ // Remove redundant entries. As well as reducing memory consumption and
+ // burning some cycles now to avoid waiting later, this has another
+ // important job: working around some SelectionDAG quirks. See the
+ // removeRedundantDbgLocsUsingForwardScan comments for more info on that.
+ for (auto &BB : Fn)
+ removeRedundantDbgLocs(&BB, *FnVarLocs);
+ }
+}
+
+bool AssignmentTrackingAnalysis::runOnFunction(Function &F) {
+ if (!isAssignmentTrackingEnabled(*F.getParent()))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "AssignmentTrackingAnalysis run on " << F.getName()
+ << "\n");
+ auto DL = std::make_unique<DataLayout>(F.getParent());
+
+ // Clear previous results.
+ Results->clear();
+
+ FunctionVarLocsBuilder Builder;
+ analyzeFunction(F, *DL.get(), &Builder);
+
+ // Save these results.
+ Results->init(Builder);
+
+ if (PrintResults && isFunctionInPrintList(F.getName()))
+ Results->print(errs(), F);
+
+ // Return false because this pass does not modify the function.
+ return false;
+}
+
+AssignmentTrackingAnalysis::AssignmentTrackingAnalysis()
+ : FunctionPass(ID), Results(std::make_unique<FunctionVarLocs>()) {}
+
+char AssignmentTrackingAnalysis::ID = 0;
+
+INITIALIZE_PASS(AssignmentTrackingAnalysis, DEBUG_TYPE,
+ "Assignment Tracking Analysis", false, true)
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/AtomicExpandPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/AtomicExpandPass.cpp
new file mode 100644
index 000000000000..80a0bb957cfc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -0,0 +1,1974 @@
+//===- AtomicExpandPass.cpp - Expand atomic instructions ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass (at IR level) to replace atomic instructions with
+// __atomic_* library calls, or target specific instructions which implement
+// same semantics in a way which better fits the target backend. This can
+// include the use of (intrinsic-based) load-linked/store-conditional loops,
+// AtomicCmpXchg, or type coercions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/AtomicExpandUtils.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/LowerAtomic.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "atomic-expand"
+
+namespace {
+
+class AtomicExpand : public FunctionPass {
+ const TargetLowering *TLI = nullptr;
+ const DataLayout *DL = nullptr;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ AtomicExpand() : FunctionPass(ID) {
+ initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
+ LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
+ bool tryExpandAtomicLoad(LoadInst *LI);
+ bool expandAtomicLoadToLL(LoadInst *LI);
+ bool expandAtomicLoadToCmpXchg(LoadInst *LI);
+ StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
+ bool tryExpandAtomicStore(StoreInst *SI);
+ void expandAtomicStore(StoreInst *SI);
+ bool tryExpandAtomicRMW(AtomicRMWInst *AI);
+ AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
+ Value *
+ insertRMWLLSCLoop(IRBuilderBase &Builder, Type *ResultTy, Value *Addr,
+ Align AddrAlign, AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);
+ void expandAtomicOpToLLSC(
+ Instruction *I, Type *ResultTy, Value *Addr, Align AddrAlign,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilderBase &, Value *)> PerformOp);
+ void expandPartwordAtomicRMW(
+ AtomicRMWInst *I, TargetLoweringBase::AtomicExpansionKind ExpansionKind);
+ AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
+ bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
+ void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
+ void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);
+
+ AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
+ static Value *insertRMWCmpXchgLoop(
+ IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
+ AtomicOrdering MemOpOrder, SyncScope::ID SSID,
+ function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
+ CreateCmpXchgInstFun CreateCmpXchg);
+ bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
+
+ bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
+ bool isIdempotentRMW(AtomicRMWInst *RMWI);
+ bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);
+
+ bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment,
+ Value *PointerOperand, Value *ValueOperand,
+ Value *CASExpected, AtomicOrdering Ordering,
+ AtomicOrdering Ordering2,
+ ArrayRef<RTLIB::Libcall> Libcalls);
+ void expandAtomicLoadToLibcall(LoadInst *LI);
+ void expandAtomicStoreToLibcall(StoreInst *LI);
+ void expandAtomicRMWToLibcall(AtomicRMWInst *I);
+ void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
+
+ friend bool
+ llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
+ CreateCmpXchgInstFun CreateCmpXchg);
+};
+
+// IRBuilder to be used for replacement atomic instructions.
+struct ReplacementIRBuilder : IRBuilder<InstSimplifyFolder> {
+ // Preserves the DebugLoc from I, and preserves still valid metadata.
+ explicit ReplacementIRBuilder(Instruction *I, const DataLayout &DL)
+ : IRBuilder(I->getContext(), DL) {
+ SetInsertPoint(I);
+ this->CollectMetadataToCopy(I, {LLVMContext::MD_pcsections});
+ }
+};
+
+} // end anonymous namespace
+
+char AtomicExpand::ID = 0;
+
+char &llvm::AtomicExpandID = AtomicExpand::ID;
+
+INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", false,
+ false)
+
+FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); }
+
+// Helper functions to retrieve the size of atomic instructions.
+static unsigned getAtomicOpSize(LoadInst *LI) {
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(LI->getType());
+}
+
+static unsigned getAtomicOpSize(StoreInst *SI) {
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(SI->getValueOperand()->getType());
+}
+
+static unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
+ const DataLayout &DL = RMWI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
+}
+
+static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
+ const DataLayout &DL = CASI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
+}
+
+// Determine if a particular atomic operation has a supported size,
+// and is of appropriate alignment, to be passed through for target
+// lowering. (Versus turning into a __atomic libcall)
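+// For example (illustrative), an 8-byte atomic load that is only 4-byte
+// aligned, or one wider than getMaxAtomicSizeInBitsSupported(), fails this
+// check and is expanded to a libcall by the caller.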
+template <typename Inst>
+static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
+ unsigned Size = getAtomicOpSize(I);
+ Align Alignment = I->getAlign();
+ return Alignment >= Size &&
+ Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
+}
+
+bool AtomicExpand::runOnFunction(Function &F) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<TargetMachine>();
+ const auto *Subtarget = TM.getSubtargetImpl(F);
+ if (!Subtarget->enableAtomicExpand())
+ return false;
+ TLI = Subtarget->getTargetLowering();
+ DL = &F.getParent()->getDataLayout();
+
+ SmallVector<Instruction *, 1> AtomicInsts;
+
+ // Changing control-flow while iterating through it is a bad idea, so gather a
+ // list of all atomic instructions before we start.
+ for (Instruction &I : instructions(F))
+ if (I.isAtomic() && !isa<FenceInst>(&I))
+ AtomicInsts.push_back(&I);
+
+ bool MadeChange = false;
+ for (auto *I : AtomicInsts) {
+ auto LI = dyn_cast<LoadInst>(I);
+ auto SI = dyn_cast<StoreInst>(I);
+ auto RMWI = dyn_cast<AtomicRMWInst>(I);
+ auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
+ assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
+
+ // If the Size/Alignment is not supported, replace with a libcall.
+ if (LI) {
+ if (!atomicSizeSupported(TLI, LI)) {
+ expandAtomicLoadToLibcall(LI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (SI) {
+ if (!atomicSizeSupported(TLI, SI)) {
+ expandAtomicStoreToLibcall(SI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (RMWI) {
+ if (!atomicSizeSupported(TLI, RMWI)) {
+ expandAtomicRMWToLibcall(RMWI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (CASI) {
+ if (!atomicSizeSupported(TLI, CASI)) {
+ expandAtomicCASToLibcall(CASI);
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ if (LI && TLI->shouldCastAtomicLoadInIR(LI) ==
+ TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+ I = LI = convertAtomicLoadToIntegerType(LI);
+ MadeChange = true;
+ } else if (SI &&
+ TLI->shouldCastAtomicStoreInIR(SI) ==
+ TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+ I = SI = convertAtomicStoreToIntegerType(SI);
+ MadeChange = true;
+ } else if (RMWI &&
+ TLI->shouldCastAtomicRMWIInIR(RMWI) ==
+ TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+ I = RMWI = convertAtomicXchgToIntegerType(RMWI);
+ MadeChange = true;
+ } else if (CASI) {
+ // TODO: when we're ready to make the change at the IR level, we can
+ // extend convertCmpXchgToInteger for floating point too.
+ if (CASI->getCompareOperand()->getType()->isPointerTy()) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ I = CASI = convertCmpXchgToIntegerType(CASI);
+ MadeChange = true;
+ }
+ }
+
+ if (TLI->shouldInsertFencesForAtomic(I)) {
+ auto FenceOrdering = AtomicOrdering::Monotonic;
+ if (LI && isAcquireOrStronger(LI->getOrdering())) {
+ FenceOrdering = LI->getOrdering();
+ LI->setOrdering(AtomicOrdering::Monotonic);
+ } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
+ FenceOrdering = SI->getOrdering();
+ SI->setOrdering(AtomicOrdering::Monotonic);
+ } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
+ isAcquireOrStronger(RMWI->getOrdering()))) {
+ FenceOrdering = RMWI->getOrdering();
+ RMWI->setOrdering(AtomicOrdering::Monotonic);
+ } else if (CASI &&
+ TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
+ TargetLoweringBase::AtomicExpansionKind::None &&
+ (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
+ isAcquireOrStronger(CASI->getSuccessOrdering()) ||
+ isAcquireOrStronger(CASI->getFailureOrdering()))) {
+ // If a compare and swap is lowered to LL/SC, we can do smarter fence
+ // insertion, with a stronger one on the success path than on the
+ // failure path. As a result, fence insertion is directly done by
+ // expandAtomicCmpXchg in that case.
+ FenceOrdering = CASI->getMergedOrdering();
+ CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
+ CASI->setFailureOrdering(AtomicOrdering::Monotonic);
+ }
+
+ if (FenceOrdering != AtomicOrdering::Monotonic) {
+ MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ }
+ } else if (I->hasAtomicStore() &&
+ TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
+ auto FenceOrdering = AtomicOrdering::Monotonic;
+ if (SI)
+ FenceOrdering = SI->getOrdering();
+ else if (RMWI)
+ FenceOrdering = RMWI->getOrdering();
+ else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
+ TargetLoweringBase::AtomicExpansionKind::LLSC)
+ // LLSC is handled in expandAtomicCmpXchg().
+ FenceOrdering = CASI->getSuccessOrdering();
+
+ IRBuilder Builder(I);
+ if (auto TrailingFence =
+ TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
+ TrailingFence->moveAfter(I);
+ MadeChange = true;
+ }
+ }
+
+ if (LI)
+ MadeChange |= tryExpandAtomicLoad(LI);
+ else if (SI)
+ MadeChange |= tryExpandAtomicStore(SI);
+ else if (RMWI) {
+ // There are two different ways of expanding RMW instructions:
+ // - into a load if it is idempotent
+ // - into a Cmpxchg/LL-SC loop otherwise
+ // we try them in that order.
+
+ if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
+ MadeChange = true;
+ } else {
+ AtomicRMWInst::BinOp Op = RMWI->getOperation();
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(RMWI);
+ if (ValueSize < MinCASSize &&
+ (Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
+ Op == AtomicRMWInst::And)) {
+ RMWI = widenPartwordAtomicRMW(RMWI);
+ MadeChange = true;
+ }
+
+ MadeChange |= tryExpandAtomicRMW(RMWI);
+ }
+ } else if (CASI)
+ MadeChange |= tryExpandAtomicCmpXchg(CASI);
+ }
+ return MadeChange;
+}
+
+bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
+ ReplacementIRBuilder Builder(I, *DL);
+
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ // We have a guard here because not every atomic operation generates a
+ // trailing fence.
+ if (TrailingFence)
+ TrailingFence->moveAfter(I);
+
+ return (LeadingFence || TrailingFence);
+}
+
+/// Get the iX type with the same bitwidth as T.
+IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
+ const DataLayout &DL) {
+ EVT VT = TLI->getMemValueType(DL, T);
+ unsigned BitWidth = VT.getStoreSizeInBits();
+ assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
+ return IntegerType::get(T->getContext(), BitWidth);
+}
+
+/// Convert an atomic load of a non-integral type to an integer load of the
+/// equivalent bitwidth. See the function comment on
+/// convertAtomicStoreToIntegerType for background.
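+///
+/// Rough sketch (illustrative, not the exact value names emitted): an atomic
+/// float load such as
+///   %v = load atomic float, float* %addr acquire, align 4
+/// becomes
+///   %addr.i = bitcast float* %addr to i32*
+///   %v.i = load atomic i32, i32* %addr.i acquire, align 4
+///   %v = bitcast i32 %v.i to float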
+LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
+ auto *M = LI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(LI->getType(), M->getDataLayout());
+
+ ReplacementIRBuilder Builder(LI, *DL);
+
+ Value *Addr = LI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy, Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ auto *NewLI = Builder.CreateLoad(NewTy, NewAddr);
+ NewLI->setAlignment(LI->getAlign());
+ NewLI->setVolatile(LI->isVolatile());
+ NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
+ LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
+
+ Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+ LI->replaceAllUsesWith(NewVal);
+ LI->eraseFromParent();
+ return NewLI;
+}
+
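+// Convert an atomic xchg of a non-integral type (floating point or pointer)
+// to an integer xchg of the equivalent bitwidth, mirroring
+// convertAtomicLoadToIntegerType above. Rough sketch for a float operand
+// (illustrative only):
+//   %old = atomicrmw xchg float* %addr, float %val seq_cst
+// becomes
+//   %addr.i = bitcast float* %addr to i32*
+//   %val.i = bitcast float %val to i32
+//   %old.i = atomicrmw xchg i32* %addr.i, i32 %val.i seq_cst
+//   %old = bitcast i32 %old.i to float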
+AtomicRMWInst *
+AtomicExpand::convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI) {
+ auto *M = RMWI->getModule();
+ Type *NewTy =
+ getCorrespondingIntegerType(RMWI->getType(), M->getDataLayout());
+
+ ReplacementIRBuilder Builder(RMWI, *DL);
+
+ Value *Addr = RMWI->getPointerOperand();
+ Value *Val = RMWI->getValOperand();
+ Type *PT = PointerType::get(NewTy, RMWI->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+ Value *NewVal = Val->getType()->isPointerTy()
+ ? Builder.CreatePtrToInt(Val, NewTy)
+ : Builder.CreateBitCast(Val, NewTy);
+
+ auto *NewRMWI =
+ Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, NewAddr, NewVal,
+ RMWI->getAlign(), RMWI->getOrdering());
+ NewRMWI->setVolatile(RMWI->isVolatile());
+ LLVM_DEBUG(dbgs() << "Replaced " << *RMWI << " with " << *NewRMWI << "\n");
+
+ Value *NewRVal = RMWI->getType()->isPointerTy()
+ ? Builder.CreateIntToPtr(NewRMWI, RMWI->getType())
+ : Builder.CreateBitCast(NewRMWI, RMWI->getType());
+ RMWI->replaceAllUsesWith(NewRVal);
+ RMWI->eraseFromParent();
+ return NewRMWI;
+}
+
+bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
+ switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
+ case TargetLoweringBase::AtomicExpansionKind::None:
+ return false;
+ case TargetLoweringBase::AtomicExpansionKind::LLSC:
+ expandAtomicOpToLLSC(
+ LI, LI->getType(), LI->getPointerOperand(), LI->getAlign(),
+ LI->getOrdering(),
+ [](IRBuilderBase &Builder, Value *Loaded) { return Loaded; });
+ return true;
+ case TargetLoweringBase::AtomicExpansionKind::LLOnly:
+ return expandAtomicLoadToLL(LI);
+ case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
+ return expandAtomicLoadToCmpXchg(LI);
+ case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
+ LI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+ default:
+ llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
+ }
+}
+
+bool AtomicExpand::tryExpandAtomicStore(StoreInst *SI) {
+ switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
+ case TargetLoweringBase::AtomicExpansionKind::None:
+ return false;
+ case TargetLoweringBase::AtomicExpansionKind::Expand:
+ expandAtomicStore(SI);
+ return true;
+ case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
+ SI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+ default:
+ llvm_unreachable("Unhandled case in tryExpandAtomicStore");
+ }
+}
+
+bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
+ ReplacementIRBuilder Builder(LI, *DL);
+
+ // On some architectures, load-linked instructions are atomic for larger
+ // sizes than normal loads. For example, the only 64-bit load guaranteed
+ // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
+ Value *Val = TLI->emitLoadLinked(Builder, LI->getType(),
+ LI->getPointerOperand(), LI->getOrdering());
+ TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
+
+ LI->replaceAllUsesWith(Val);
+ LI->eraseFromParent();
+
+ return true;
+}
+
+bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
+ ReplacementIRBuilder Builder(LI, *DL);
+ AtomicOrdering Order = LI->getOrdering();
+ if (Order == AtomicOrdering::Unordered)
+ Order = AtomicOrdering::Monotonic;
+
+ Value *Addr = LI->getPointerOperand();
+ Type *Ty = LI->getType();
+ Constant *DummyVal = Constant::getNullValue(Ty);
+
+ Value *Pair = Builder.CreateAtomicCmpXchg(
+ Addr, DummyVal, DummyVal, LI->getAlign(), Order,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
+ Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");
+
+ LI->replaceAllUsesWith(Loaded);
+ LI->eraseFromParent();
+
+ return true;
+}
+
+/// Convert an atomic store of a non-integral type to an integer store of the
+/// equivalent bitwidth. We used to not support floating point or vector
+/// atomics in the IR at all. The backends learned to deal with the bitcast
+/// idiom because that was the only way of expressing the notion of an atomic
+/// float or vector store. The long term plan is to teach each backend to
+/// instruction select from the original atomic store, but as a migration
+/// mechanism, we convert back to the old format which the backends understand.
+/// Each backend will need individual work to recognize the new format.
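+///
+/// Rough sketch (illustrative): on a target requesting this cast,
+///   store atomic float %v, float* %addr release, align 4
+/// becomes
+///   %v.i = bitcast float %v to i32
+///   %addr.i = bitcast float* %addr to i32*
+///   store atomic i32 %v.i, i32* %addr.i release, align 4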
+StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
+ ReplacementIRBuilder Builder(SI, *DL);
+ auto *M = SI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
+ M->getDataLayout());
+ Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
+
+ Value *Addr = SI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy, Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
+ NewSI->setAlignment(SI->getAlign());
+ NewSI->setVolatile(SI->isVolatile());
+ NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
+ LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
+ SI->eraseFromParent();
+ return NewSI;
+}
+
+void AtomicExpand::expandAtomicStore(StoreInst *SI) {
+ // This function is only called on atomic stores that are too large to be
+ // atomic if implemented as a native store. So we replace them by an atomic
+ // swap that can be implemented, for example, as ldrex/strex on ARM or as
+ // lock cmpxchg8b/16b on X86, since these are atomic for larger sizes.
+ // It is the responsibility of the target to only signal expansion via
+ // shouldExpandAtomicRMW in cases where this is required and possible.
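+ //
+ // Rough sketch of the rewrite (illustrative only):
+ //   store atomic i64 %v, i64* %addr seq_cst, align 8
+ // becomes
+ //   atomicrmw xchg i64* %addr, i64 %v seq_cst
+ // with the xchg result unused; tryExpandAtomicRMW below then lowers the
+ // swap like any other atomicrmw.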
+ ReplacementIRBuilder Builder(SI, *DL);
+ AtomicOrdering Ordering = SI->getOrdering();
+ assert(Ordering != AtomicOrdering::NotAtomic);
+ AtomicOrdering RMWOrdering = Ordering == AtomicOrdering::Unordered
+ ? AtomicOrdering::Monotonic
+ : Ordering;
+ AtomicRMWInst *AI = Builder.CreateAtomicRMW(
+ AtomicRMWInst::Xchg, SI->getPointerOperand(), SI->getValueOperand(),
+ SI->getAlign(), RMWOrdering);
+ SI->eraseFromParent();
+
+ // Now we have an appropriate swap instruction, lower it as usual.
+ tryExpandAtomicRMW(AI);
+}
+
+static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
+ Value *Loaded, Value *NewVal, Align AddrAlign,
+ AtomicOrdering MemOpOrder, SyncScope::ID SSID,
+ Value *&Success, Value *&NewLoaded) {
+ Type *OrigTy = NewVal->getType();
+
+ // This code can go away when cmpxchg supports FP types.
+ assert(!OrigTy->isPointerTy());
+ bool NeedBitcast = OrigTy->isFloatingPointTy();
+ if (NeedBitcast) {
+ IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
+ unsigned AS = Addr->getType()->getPointerAddressSpace();
+ Addr = Builder.CreateBitCast(Addr, IntTy->getPointerTo(AS));
+ NewVal = Builder.CreateBitCast(NewVal, IntTy);
+ Loaded = Builder.CreateBitCast(Loaded, IntTy);
+ }
+
+ Value *Pair = Builder.CreateAtomicCmpXchg(
+ Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+ Success = Builder.CreateExtractValue(Pair, 1, "success");
+ NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+
+ if (NeedBitcast)
+ NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy);
+}
+
+bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
+ LLVMContext &Ctx = AI->getModule()->getContext();
+ TargetLowering::AtomicExpansionKind Kind = TLI->shouldExpandAtomicRMWInIR(AI);
+ switch (Kind) {
+ case TargetLoweringBase::AtomicExpansionKind::None:
+ return false;
+ case TargetLoweringBase::AtomicExpansionKind::LLSC: {
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(AI);
+ if (ValueSize < MinCASSize) {
+ expandPartwordAtomicRMW(AI,
+ TargetLoweringBase::AtomicExpansionKind::LLSC);
+ } else {
+ auto PerformOp = [&](IRBuilderBase &Builder, Value *Loaded) {
+ return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
+ AI->getValOperand());
+ };
+ expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
+ AI->getAlign(), AI->getOrdering(), PerformOp);
+ }
+ return true;
+ }
+ case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(AI);
+ if (ValueSize < MinCASSize) {
+ expandPartwordAtomicRMW(AI,
+ TargetLoweringBase::AtomicExpansionKind::CmpXChg);
+ } else {
+ SmallVector<StringRef> SSNs;
+ Ctx.getSyncScopeNames(SSNs);
+ auto MemScope = SSNs[AI->getSyncScopeID()].empty()
+ ? "system"
+ : SSNs[AI->getSyncScopeID()];
+ OptimizationRemarkEmitter ORE(AI->getFunction());
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Passed", AI)
+ << "A compare and swap loop was generated for an atomic "
+ << AI->getOperationName(AI->getOperation()) << " operation at "
+ << MemScope << " memory scope";
+ });
+ expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
+ }
+ return true;
+ }
+ case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic: {
+ expandAtomicRMWToMaskedIntrinsic(AI);
+ return true;
+ }
+ case TargetLoweringBase::AtomicExpansionKind::BitTestIntrinsic: {
+ TLI->emitBitTestAtomicRMWIntrinsic(AI);
+ return true;
+ }
+ case TargetLoweringBase::AtomicExpansionKind::CmpArithIntrinsic: {
+ TLI->emitCmpArithAtomicRMWIntrinsic(AI);
+ return true;
+ }
+ case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
+ return lowerAtomicRMWInst(AI);
+ case TargetLoweringBase::AtomicExpansionKind::Expand:
+ TLI->emitExpandAtomicRMW(AI);
+ return true;
+ default:
+ llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
+ }
+}
+
+namespace {
+
+struct PartwordMaskValues {
+ // These three fields are guaranteed to be set by createMaskInstrs.
+ Type *WordType = nullptr;
+ Type *ValueType = nullptr;
+ Type *IntValueType = nullptr;
+ Value *AlignedAddr = nullptr;
+ Align AlignedAddrAlignment;
+ // The remaining fields can be null.
+ Value *ShiftAmt = nullptr;
+ Value *Mask = nullptr;
+ Value *Inv_Mask = nullptr;
+};
+
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
+ auto PrintObj = [&O](auto *V) {
+ if (V)
+ O << *V;
+ else
+ O << "nullptr";
+ O << '\n';
+ };
+ O << "PartwordMaskValues {\n";
+ O << " WordType: ";
+ PrintObj(PMV.WordType);
+ O << " ValueType: ";
+ PrintObj(PMV.ValueType);
+ O << " AlignedAddr: ";
+ PrintObj(PMV.AlignedAddr);
+ O << " AlignedAddrAlignment: " << PMV.AlignedAddrAlignment.value() << '\n';
+ O << " ShiftAmt: ";
+ PrintObj(PMV.ShiftAmt);
+ O << " Mask: ";
+ PrintObj(PMV.Mask);
+ O << " Inv_Mask: ";
+ PrintObj(PMV.Inv_Mask);
+ O << "}\n";
+ return O;
+}
+
+} // end anonymous namespace
+
+/// This is a helper function which builds instructions to provide
+/// values necessary for partword atomic operations. It takes an
+/// incoming address, Addr, and ValueType, and constructs the address,
+/// shift-amounts and masks needed to work with a larger value of size
+/// WordSize.
+///
+/// AlignedAddr: Addr rounded down to a multiple of WordSize
+///
+/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
+/// from AlignedAddr for it to have the same value as if
+/// ValueType was loaded from Addr.
+///
+/// Mask: Value to mask with the value loaded from AlignedAddr to
+/// include only the part that would've been loaded from Addr.
+///
+/// Inv_Mask: The inverse of Mask.
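+///
+/// Worked example (assuming, for illustration, a little-endian target with a
+/// 32-bit minimum cmpxchg width and an i8 located at byte offset 2 of its
+/// containing word):
+///   AlignedAddr = Addr with the low two bits cleared
+///   ShiftAmt    = 16
+///   Mask        = 0x00FF0000
+///   Inv_Mask    = 0xFF00FFFF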
+static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
+ Instruction *I, Type *ValueType,
+ Value *Addr, Align AddrAlign,
+ unsigned MinWordSize) {
+ PartwordMaskValues PMV;
+
+ Module *M = I->getModule();
+ LLVMContext &Ctx = M->getContext();
+ const DataLayout &DL = M->getDataLayout();
+ unsigned ValueSize = DL.getTypeStoreSize(ValueType);
+
+ PMV.ValueType = PMV.IntValueType = ValueType;
+ if (PMV.ValueType->isFloatingPointTy())
+ PMV.IntValueType =
+ Type::getIntNTy(Ctx, ValueType->getPrimitiveSizeInBits());
+
+ PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(Ctx, MinWordSize * 8)
+ : ValueType;
+ if (PMV.ValueType == PMV.WordType) {
+ PMV.AlignedAddr = Addr;
+ PMV.AlignedAddrAlignment = AddrAlign;
+ PMV.ShiftAmt = ConstantInt::get(PMV.ValueType, 0);
+ PMV.Mask = ConstantInt::get(PMV.ValueType, ~0, /*isSigned*/ true);
+ return PMV;
+ }
+
+ PMV.AlignedAddrAlignment = Align(MinWordSize);
+
+ assert(ValueSize < MinWordSize);
+
+ PointerType *PtrTy = cast<PointerType>(Addr->getType());
+ Type *WordPtrType = PMV.WordType->getPointerTo(PtrTy->getAddressSpace());
+ IntegerType *IntTy = DL.getIntPtrType(Ctx, PtrTy->getAddressSpace());
+ Value *PtrLSB;
+
+ if (AddrAlign < MinWordSize) {
+ PMV.AlignedAddr = Builder.CreateIntrinsic(
+ Intrinsic::ptrmask, {PtrTy, IntTy},
+ {Addr, ConstantInt::get(IntTy, ~(uint64_t)(MinWordSize - 1))}, nullptr,
+ "AlignedAddr");
+
+ Value *AddrInt = Builder.CreatePtrToInt(Addr, IntTy);
+ PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB");
+ } else {
+ // If the alignment is high enough, the low bits are known to be zero.
+ PMV.AlignedAddr = Addr;
+ PtrLSB = ConstantInt::getNullValue(IntTy);
+ }
+
+ if (DL.isLittleEndian()) {
+ // turn bytes into bits
+ PMV.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
+ } else {
+ // turn bytes into bits, and count from the other side.
+ PMV.ShiftAmt = Builder.CreateShl(
+ Builder.CreateXor(PtrLSB, MinWordSize - ValueSize), 3);
+ }
+
+ PMV.ShiftAmt = Builder.CreateTrunc(PMV.ShiftAmt, PMV.WordType, "ShiftAmt");
+ PMV.Mask = Builder.CreateShl(
+ ConstantInt::get(PMV.WordType, ((uint64_t)1 << (ValueSize * 8)) - 1), PMV.ShiftAmt,
+ "Mask");
+
+ PMV.Inv_Mask = Builder.CreateNot(PMV.Mask, "Inv_Mask");
+
+ // Cast for typed pointers.
+ PMV.AlignedAddr =
+ Builder.CreateBitCast(PMV.AlignedAddr, WordPtrType, "AlignedAddr");
+
+ return PMV;
+}
+
+static Value *extractMaskedValue(IRBuilderBase &Builder, Value *WideWord,
+ const PartwordMaskValues &PMV) {
+ assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
+ if (PMV.WordType == PMV.ValueType)
+ return WideWord;
+
+ Value *Shift = Builder.CreateLShr(WideWord, PMV.ShiftAmt, "shifted");
+ Value *Trunc = Builder.CreateTrunc(Shift, PMV.IntValueType, "extracted");
+ return Builder.CreateBitCast(Trunc, PMV.ValueType);
+}
+
+static Value *insertMaskedValue(IRBuilderBase &Builder, Value *WideWord,
+ Value *Updated, const PartwordMaskValues &PMV) {
+ assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
+ assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
+ if (PMV.WordType == PMV.ValueType)
+ return Updated;
+
+ Updated = Builder.CreateBitCast(Updated, PMV.IntValueType);
+
+ Value *ZExt = Builder.CreateZExt(Updated, PMV.WordType, "extended");
+ Value *Shift =
+ Builder.CreateShl(ZExt, PMV.ShiftAmt, "shifted", /*HasNUW*/ true);
+ Value *And = Builder.CreateAnd(WideWord, PMV.Inv_Mask, "unmasked");
+ Value *Or = Builder.CreateOr(And, Shift, "inserted");
+ return Or;
+}
+
+/// Emit IR to implement a masked version of a given atomicrmw
+/// operation. (That is, only the bits under the Mask should be
+/// affected by the operation)
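+///
+/// For example (informally), the masked Xchg keeps the bits outside the mask
+/// and substitutes the shifted operand inside it:
+///   FinalVal = (Loaded & Inv_Mask) | Shifted_Inc
+/// Add/Sub/Nand compute on the shifted operand and merge the result back
+/// under the mask; the remaining operations extract the narrow value,
+/// operate on it, and re-insert it.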
+static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
+ IRBuilderBase &Builder, Value *Loaded,
+ Value *Shifted_Inc, Value *Inc,
+ const PartwordMaskValues &PMV) {
+ // TODO: update to use
+ // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge in order
+ // to merge bits from two values without requiring PMV.Inv_Mask.
+ switch (Op) {
+ case AtomicRMWInst::Xchg: {
+ Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
+ Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
+ return FinalVal;
+ }
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ case AtomicRMWInst::And:
+ llvm_unreachable("Or/Xor/And handled by widenPartwordAtomicRMW");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Nand: {
+ // The other arithmetic ops need to be masked into place.
+ Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded, Shifted_Inc);
+ Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
+ Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
+ Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
+ return FinalVal;
+ }
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
+ case AtomicRMWInst::FMin:
+ case AtomicRMWInst::FMax:
+ case AtomicRMWInst::UIncWrap:
+ case AtomicRMWInst::UDecWrap: {
+ // Finally, other ops will operate on the full value, so truncate down to
+ // the original size, and expand out again after doing the
+ // operation. Bitcasts will be inserted for FP values.
+ Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV);
+ Value *NewVal = buildAtomicRMWValue(Op, Builder, Loaded_Extract, Inc);
+ Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV);
+ return FinalVal;
+ }
+ default:
+ llvm_unreachable("Unknown atomic op");
+ }
+}
+
+/// Expand a sub-word atomicrmw operation into an appropriate
+/// word-sized operation.
+///
+/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
+/// way as a typical atomicrmw expansion. The only difference here is
+/// that the operation inside of the loop may operate upon only a
+/// part of the value.
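+///
+/// For example (a rough sketch): an i8 "atomicrmw add" on a target with a
+/// 32-bit minimum cmpxchg width becomes a loop that loads the containing i32
+/// word, adds the zero-extended, shifted operand, merges the sum back under
+/// the byte's mask, and retries the cmpxchg or store-conditional until the
+/// full word is stored successfully.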
+void AtomicExpand::expandPartwordAtomicRMW(
+ AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
+ AtomicOrdering MemOpOrder = AI->getOrdering();
+ SyncScope::ID SSID = AI->getSyncScopeID();
+
+ ReplacementIRBuilder Builder(AI, *DL);
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
+ AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
+
+ Value *ValOperand_Shifted = nullptr;
+ if (AI->getOperation() == AtomicRMWInst::Xchg ||
+ AI->getOperation() == AtomicRMWInst::Add ||
+ AI->getOperation() == AtomicRMWInst::Sub ||
+ AI->getOperation() == AtomicRMWInst::Nand) {
+ ValOperand_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
+ PMV.ShiftAmt, "ValOperand_Shifted");
+ }
+
+ auto PerformPartwordOp = [&](IRBuilderBase &Builder, Value *Loaded) {
+ return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
+ ValOperand_Shifted, AI->getValOperand(), PMV);
+ };
+
+ Value *OldResult;
+ if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
+ OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr,
+ PMV.AlignedAddrAlignment, MemOpOrder, SSID,
+ PerformPartwordOp, createCmpXchgInstFun);
+ } else {
+ assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
+ OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
+ PMV.AlignedAddrAlignment, MemOpOrder,
+ PerformPartwordOp);
+ }
+
+ Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
+ AI->replaceAllUsesWith(FinalOldResult);
+ AI->eraseFromParent();
+}
+
+// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
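+// For example (informal sketch): an i8 "atomicrmw or" becomes an i32
+// "atomicrmw or" of the zero-extended, shifted operand on the aligned word,
+// since the zero bits outside the i8 lane leave the neighbouring bytes
+// unchanged. For "and", the shifted operand is first OR'd with Inv_Mask so
+// that every bit outside the lane is one and is therefore preserved.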
+AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
+ ReplacementIRBuilder Builder(AI, *DL);
+ AtomicRMWInst::BinOp Op = AI->getOperation();
+
+ assert((Op == AtomicRMWInst::Or || Op == AtomicRMWInst::Xor ||
+ Op == AtomicRMWInst::And) &&
+ "Unable to widen operation");
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
+ AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
+
+ Value *ValOperand_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
+ PMV.ShiftAmt, "ValOperand_Shifted");
+
+ Value *NewOperand;
+
+ if (Op == AtomicRMWInst::And)
+ NewOperand =
+ Builder.CreateOr(PMV.Inv_Mask, ValOperand_Shifted, "AndOperand");
+ else
+ NewOperand = ValOperand_Shifted;
+
+ AtomicRMWInst *NewAI =
+ Builder.CreateAtomicRMW(Op, PMV.AlignedAddr, NewOperand,
+ PMV.AlignedAddrAlignment, AI->getOrdering());
+
+ Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV);
+ AI->replaceAllUsesWith(FinalOldResult);
+ AI->eraseFromParent();
+ return NewAI;
+}
+
+bool AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
+ // The basic idea here is that we're expanding a cmpxchg of a
+ // smaller memory size up to a word-sized cmpxchg. To do this, we
+ // need to add a retry-loop for strong cmpxchg, so that
+ // modifications to other parts of the word don't cause a spurious
+ // failure.
+
+ // This generates code like the following:
+ // [[Setup mask values PMV.*]]
+ // %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
+ // %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
+ // %InitLoaded = load i32* %addr
+ // %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
+ // br partword.cmpxchg.loop
+ // partword.cmpxchg.loop:
+ // %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
+ // [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
+ // %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
+ // %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
+ // %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
+ // i32 %FullWord_NewVal success_ordering failure_ordering
+ // %OldVal = extractvalue { i32, i1 } %NewCI, 0
+ // %Success = extractvalue { i32, i1 } %NewCI, 1
+ // br i1 %Success, label %partword.cmpxchg.end,
+ // label %partword.cmpxchg.failure
+ // partword.cmpxchg.failure:
+ // %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
+ // %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
+ // br i1 %ShouldContinue, label %partword.cmpxchg.loop,
+ // label %partword.cmpxchg.end
+ // partword.cmpxchg.end:
+ // %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
+ // %FinalOldVal = trunc i32 %tmp1 to i8
+ // %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
+ // %Res = insertvalue { i8, i1 } %tmp2, i1 %Success, 1
+
+ Value *Addr = CI->getPointerOperand();
+ Value *Cmp = CI->getCompareOperand();
+ Value *NewVal = CI->getNewValOperand();
+
+ BasicBlock *BB = CI->getParent();
+ Function *F = BB->getParent();
+ ReplacementIRBuilder Builder(CI, *DL);
+ LLVMContext &Ctx = Builder.getContext();
+
+ BasicBlock *EndBB =
+ BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
+ auto FailureBB =
+ BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
+ auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);
+
+ // The split call above "helpfully" added a branch at the end of BB
+ // (to the wrong place).
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,
+ CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
+
+ // Shift the incoming values over, into the right location in the word.
+ Value *NewVal_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
+ Value *Cmp_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);
+
+ // Load the entire current word, and mask into place the expected and new
+ // values
+ LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
+ InitLoaded->setVolatile(CI->isVolatile());
+ Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
+ Builder.CreateBr(LoopBB);
+
+ // partword.cmpxchg.loop:
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
+ Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);
+
+ // Mask/Or the expected and new values into place in the loaded word.
+ Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
+ Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
+ AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
+ PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, PMV.AlignedAddrAlignment,
+ CI->getSuccessOrdering(), CI->getFailureOrdering(), CI->getSyncScopeID());
+ NewCI->setVolatile(CI->isVolatile());
+ // When we're building a strong cmpxchg, we need a retry loop, so it might
+ // seem natural to use a weak cmpxchg inside it. Using a strong one instead
+ // is what allows the ShouldContinue comparison below, and the underlying
+ // cmpxchg is expected to be a machine instruction, which is strong anyway.
+ NewCI->setWeak(CI->isWeak());
+
+ Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
+ Value *Success = Builder.CreateExtractValue(NewCI, 1);
+
+ if (CI->isWeak())
+ Builder.CreateBr(EndBB);
+ else
+ Builder.CreateCondBr(Success, EndBB, FailureBB);
+
+ // partword.cmpxchg.failure:
+ Builder.SetInsertPoint(FailureBB);
+ // Upon failure, check whether the masked-out part of the loaded value has
+ // changed. If it has, some other part of the word was modified underneath
+ // us, so retry the loop; if it has not, the masked-in part must have
+ // mismatched, so the cmpxchg has genuinely failed and we exit.
+ Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
+ Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
+ Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);
+
+ // Add the second value to the phi from above
+ Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);
+
+ // partword.cmpxchg.end:
+ Builder.SetInsertPoint(CI);
+
+ Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
+ Value *Res = PoisonValue::get(CI->getType());
+ Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
+ Res = Builder.CreateInsertValue(Res, Success, 1);
+
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+ return true;
+}
+
+void AtomicExpand::expandAtomicOpToLLSC(
+ Instruction *I, Type *ResultType, Value *Addr, Align AddrAlign,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {
+ ReplacementIRBuilder Builder(I, *DL);
+ Value *Loaded = insertRMWLLSCLoop(Builder, ResultType, Addr, AddrAlign,
+ MemOpOrder, PerformOp);
+
+ I->replaceAllUsesWith(Loaded);
+ I->eraseFromParent();
+}
+
+void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
+ ReplacementIRBuilder Builder(AI, *DL);
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
+ AI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
+
+ // The value operand must be sign-extended for signed min/max so that the
+ // target's signed comparison instructions can be used. Otherwise, just
+ // zero-ext.
+ Instruction::CastOps CastOp = Instruction::ZExt;
+ AtomicRMWInst::BinOp RMWOp = AI->getOperation();
+ if (RMWOp == AtomicRMWInst::Max || RMWOp == AtomicRMWInst::Min)
+ CastOp = Instruction::SExt;
+
+ Value *ValOperand_Shifted = Builder.CreateShl(
+ Builder.CreateCast(CastOp, AI->getValOperand(), PMV.WordType),
+ PMV.ShiftAmt, "ValOperand_Shifted");
+ Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
+ Builder, AI, PMV.AlignedAddr, ValOperand_Shifted, PMV.Mask, PMV.ShiftAmt,
+ AI->getOrdering());
+ Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
+ AI->replaceAllUsesWith(FinalOldResult);
+ AI->eraseFromParent();
+}
+
+void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) {
+ ReplacementIRBuilder Builder(CI, *DL);
+
+ PartwordMaskValues PMV = createMaskInstrs(
+ Builder, CI, CI->getCompareOperand()->getType(), CI->getPointerOperand(),
+ CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
+
+ Value *CmpVal_Shifted = Builder.CreateShl(
+ Builder.CreateZExt(CI->getCompareOperand(), PMV.WordType), PMV.ShiftAmt,
+ "CmpVal_Shifted");
+ Value *NewVal_Shifted = Builder.CreateShl(
+ Builder.CreateZExt(CI->getNewValOperand(), PMV.WordType), PMV.ShiftAmt,
+ "NewVal_Shifted");
+ Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
+ Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask,
+ CI->getMergedOrdering());
+ Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
+ Value *Res = PoisonValue::get(CI->getType());
+ Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
+ Value *Success = Builder.CreateICmpEQ(
+ CmpVal_Shifted, Builder.CreateAnd(OldVal, PMV.Mask), "Success");
+ Res = Builder.CreateInsertValue(Res, Success, 1);
+
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+}
+
+Value *AtomicExpand::insertRMWLLSCLoop(
+ IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilderBase &, Value *)> PerformOp) {
+ LLVMContext &Ctx = Builder.getContext();
+ BasicBlock *BB = Builder.GetInsertBlock();
+ Function *F = BB->getParent();
+
+ assert(AddrAlign >=
+ F->getParent()->getDataLayout().getTypeStoreSize(ResultTy) &&
+ "Expected at least natural alignment at this point.");
+
+ // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+ //
+ // The standard expansion we produce is:
+ // [...]
+ // atomicrmw.start:
+ // %loaded = @load.linked(%addr)
+ // %new = some_op iN %loaded, %incr
+ // %stored = @store_conditional(%new, %addr)
+ // %try_again = icmp ne i32 %stored, 0
+ // br i1 %try_again, label %loop, label %atomicrmw.end
+ // atomicrmw.end:
+ // [...]
+ BasicBlock *ExitBB =
+ BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place).
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ Value *Loaded = TLI->emitLoadLinked(Builder, ResultTy, Addr, MemOpOrder);
+
+ Value *NewVal = PerformOp(Builder, Loaded);
+
+ Value *StoreSuccess =
+ TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
+ Value *TryAgain = Builder.CreateICmpNE(
+ StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
+ Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ return Loaded;
+}
+
+/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
+/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
+/// IR. As a migration step, we convert back to what used to be the standard
+/// way to represent a pointer cmpxchg so that we can update backends one by
+/// one.
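+///
+/// Rough sketch (illustrative): a cmpxchg of i8* values
+///   %r = cmpxchg i8** %addr, i8* %cmp, i8* %new seq_cst seq_cst
+/// becomes, on a 64-bit target,
+///   %addr.i = bitcast i8** %addr to i64*
+///   %cmp.i = ptrtoint i8* %cmp to i64
+///   %new.i = ptrtoint i8* %new to i64
+///   %r.i = cmpxchg i64* %addr.i, i64 %cmp.i, i64 %new.i seq_cst seq_cst
+/// with the loaded i64 converted back via inttoptr and the { i64, i1 }
+/// result repacked into { i8*, i1 }.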
+AtomicCmpXchgInst *
+AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
+ auto *M = CI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
+ M->getDataLayout());
+
+ ReplacementIRBuilder Builder(CI, *DL);
+
+ Value *Addr = CI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy, Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
+ Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
+
+ auto *NewCI = Builder.CreateAtomicCmpXchg(
+ NewAddr, NewCmp, NewNewVal, CI->getAlign(), CI->getSuccessOrdering(),
+ CI->getFailureOrdering(), CI->getSyncScopeID());
+ NewCI->setVolatile(CI->isVolatile());
+ NewCI->setWeak(CI->isWeak());
+ LLVM_DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
+
+ Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
+ Value *Succ = Builder.CreateExtractValue(NewCI, 1);
+
+ OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
+
+ Value *Res = PoisonValue::get(CI->getType());
+ Res = Builder.CreateInsertValue(Res, OldVal, 0);
+ Res = Builder.CreateInsertValue(Res, Succ, 1);
+
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+ return NewCI;
+}
+
+bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
+ AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
+ AtomicOrdering FailureOrder = CI->getFailureOrdering();
+ Value *Addr = CI->getPointerOperand();
+ BasicBlock *BB = CI->getParent();
+ Function *F = BB->getParent();
+ LLVMContext &Ctx = F->getContext();
+ // If shouldInsertFencesForAtomic() returns true, then the target does not
+ // want to deal with memory orders, and emitLeading/TrailingFence should take
+ // care of everything. Otherwise, emitLeading/TrailingFence are no-ops and we
+ // should preserve the ordering.
+ bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
+ AtomicOrdering MemOpOrder = ShouldInsertFencesForAtomic
+ ? AtomicOrdering::Monotonic
+ : CI->getMergedOrdering();
+
+ // In implementations which use a barrier to achieve release semantics, we can
+ // delay emitting this barrier until we know a store is actually going to be
+ // attempted. The cost of this delay is that we need 2 copies of the block
+ // emitting the load-linked, affecting code size.
+ //
+ // Ideally, this logic would be unconditional except for the minsize check
+ // since in other cases the extra blocks naturally collapse down to the
+ // minimal loop. Unfortunately, this puts too much stress on later
+ // optimisations so we avoid emitting the extra logic in those cases too.
+ bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
+ SuccessOrder != AtomicOrdering::Monotonic &&
+ SuccessOrder != AtomicOrdering::Acquire &&
+ !F->hasMinSize();
+
+ // There's no overhead for sinking the release barrier in a weak cmpxchg, so
+ // do it even on minsize.
+ bool UseUnconditionalReleaseBarrier = F->hasMinSize() && !CI->isWeak();
+
+ // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
+ //
+ // The full expansion we produce is:
+ // [...]
+ // %aligned.addr = ...
+ // cmpxchg.start:
+ // %unreleasedload = @load.linked(%aligned.addr)
+ // %unreleasedload.extract = extract value from %unreleasedload
+ // %should_store = icmp eq %unreleasedload.extract, %desired
+ // br i1 %should_store, label %cmpxchg.releasingstore,
+ // label %cmpxchg.nostore
+ // cmpxchg.releasingstore:
+ // fence?
+ // br label cmpxchg.trystore
+ // cmpxchg.trystore:
+ // %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],
+ // [%releasedload, %cmpxchg.releasedload]
+ // %updated.new = insert %new into %loaded.trystore
+ // %stored = @store_conditional(%updated.new, %aligned.addr)
+ // %success = icmp eq i32 %stored, 0
+ // br i1 %success, label %cmpxchg.success,
+ // label %cmpxchg.releasedload/%cmpxchg.failure
+ // cmpxchg.releasedload:
+ // %releasedload = @load.linked(%aligned.addr)
+ // %releasedload.extract = extract value from %releasedload
+ // %should_store = icmp eq %releasedload.extract, %desired
+ // br i1 %should_store, label %cmpxchg.trystore,
+ // label %cmpxchg.failure
+ // cmpxchg.success:
+ // fence?
+ // br label %cmpxchg.end
+ // cmpxchg.nostore:
+ // %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
+ // [%releasedload,
+ // %cmpxchg.releasedload/%cmpxchg.trystore]
+ // @load_linked_fail_balance()?
+ // br label %cmpxchg.failure
+ // cmpxchg.failure:
+ // fence?
+ // br label %cmpxchg.end
+ // cmpxchg.end:
+ // %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],
+ // [%loaded.trystore, %cmpxchg.trystore]
+ // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
+ // %loaded = extract value from %loaded.exit
+ // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
+ // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
+ // [...]
+ BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
+ auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
+ auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
+ auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
+ auto ReleasedLoadBB =
+ BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
+ auto TryStoreBB =
+ BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
+ auto ReleasingStoreBB =
+ BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
+ auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);
+
+ ReplacementIRBuilder Builder(CI, *DL);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we might want a fence too. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,
+ CI->getAlign(), TLI->getMinCmpXchgSizeInBits() / 8);
+ Builder.CreateBr(StartBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(StartBB);
+ Value *UnreleasedLoad =
+ TLI->emitLoadLinked(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder);
+ Value *UnreleasedLoadExtract =
+ extractMaskedValue(Builder, UnreleasedLoad, PMV);
+ Value *ShouldStore = Builder.CreateICmpEQ(
+ UnreleasedLoadExtract, CI->getCompareOperand(), "should_store");
+
+ // If the cmpxchg doesn't actually need any ordering when it fails, we can
+ // jump straight past that fence instruction (if it exists).
+ Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
+
+ Builder.SetInsertPoint(ReleasingStoreBB);
+ if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
+ Builder.CreateBr(TryStoreBB);
+
+ Builder.SetInsertPoint(TryStoreBB);
+ PHINode *LoadedTryStore =
+ Builder.CreatePHI(PMV.WordType, 2, "loaded.trystore");
+ LoadedTryStore->addIncoming(UnreleasedLoad, ReleasingStoreBB);
+ Value *NewValueInsert =
+ insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV);
+ Value *StoreSuccess = TLI->emitStoreConditional(Builder, NewValueInsert,
+ PMV.AlignedAddr, MemOpOrder);
+ StoreSuccess = Builder.CreateICmpEQ(
+ StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
+ BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
+ Builder.CreateCondBr(StoreSuccess, SuccessBB,
+ CI->isWeak() ? FailureBB : RetryBB);
+
+ Builder.SetInsertPoint(ReleasedLoadBB);
+ Value *SecondLoad;
+ if (HasReleasedLoadBB) {
+ SecondLoad =
+ TLI->emitLoadLinked(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder);
+ Value *SecondLoadExtract = extractMaskedValue(Builder, SecondLoad, PMV);
+ ShouldStore = Builder.CreateICmpEQ(SecondLoadExtract,
+ CI->getCompareOperand(), "should_store");
+
+ // If the cmpxchg doesn't actually need any ordering when it fails, we can
+ // jump straight past that fence instruction (if it exists).
+ Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
+ // Update PHI node in TryStoreBB.
+ LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB);
+ } else
+ Builder.CreateUnreachable();
+
+ // Make sure later instructions don't get reordered with a fence if
+ // necessary.
+ Builder.SetInsertPoint(SuccessBB);
+ if (ShouldInsertFencesForAtomic ||
+ TLI->shouldInsertTrailingFenceForAtomicStore(CI))
+ TLI->emitTrailingFence(Builder, CI, SuccessOrder);
+ Builder.CreateBr(ExitBB);
+
+ Builder.SetInsertPoint(NoStoreBB);
+ PHINode *LoadedNoStore =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.nostore");
+ LoadedNoStore->addIncoming(UnreleasedLoad, StartBB);
+ if (HasReleasedLoadBB)
+ LoadedNoStore->addIncoming(SecondLoad, ReleasedLoadBB);
+
+ // In the failing case, where we don't execute the store-conditional, the
+ // target might want to balance out the load-linked with a dedicated
+ // instruction (e.g., on ARM, clearing the exclusive monitor).
+ TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
+ Builder.CreateBr(FailureBB);
+
+ Builder.SetInsertPoint(FailureBB);
+ PHINode *LoadedFailure =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.failure");
+ LoadedFailure->addIncoming(LoadedNoStore, NoStoreBB);
+ if (CI->isWeak())
+ LoadedFailure->addIncoming(LoadedTryStore, TryStoreBB);
+ if (ShouldInsertFencesForAtomic)
+ TLI->emitTrailingFence(Builder, CI, FailureOrder);
+ Builder.CreateBr(ExitBB);
+
+ // Finally, we have control-flow based knowledge of whether the cmpxchg
+ // succeeded or not. We expose this to later passes by converting any
+ // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
+ // PHI.
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ PHINode *LoadedExit =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.exit");
+ LoadedExit->addIncoming(LoadedTryStore, SuccessBB);
+ LoadedExit->addIncoming(LoadedFailure, FailureBB);
+ PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2, "success");
+ Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
+ Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
+
+ // This is the "exit value" from the cmpxchg expansion. It may be of
+ // a type wider than the one in the cmpxchg instruction.
+ Value *LoadedFull = LoadedExit;
+
+ Builder.SetInsertPoint(ExitBB, std::next(Success->getIterator()));
+ Value *Loaded = extractMaskedValue(Builder, LoadedFull, PMV);
+
+ // Look for any users of the cmpxchg that are just comparing the loaded value
+ // against the desired one, and replace them with the CFG-derived version.
+ SmallVector<ExtractValueInst *, 2> PrunedInsts;
+ for (auto *User : CI->users()) {
+ ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
+ if (!EV)
+ continue;
+
+ assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
+ "weird extraction from { iN, i1 }");
+
+ if (EV->getIndices()[0] == 0)
+ EV->replaceAllUsesWith(Loaded);
+ else
+ EV->replaceAllUsesWith(Success);
+
+ PrunedInsts.push_back(EV);
+ }
+
+ // We can remove the instructions now we're no longer iterating through them.
+ for (auto *EV : PrunedInsts)
+ EV->eraseFromParent();
+
+ if (!CI->use_empty()) {
+ // Some use of the full struct return that we don't understand has happened,
+ // so we've got to reconstruct it properly.
+ Value *Res;
+ Res = Builder.CreateInsertValue(PoisonValue::get(CI->getType()), Loaded, 0);
+ Res = Builder.CreateInsertValue(Res, Success, 1);
+
+ CI->replaceAllUsesWith(Res);
+ }
+
+ CI->eraseFromParent();
+ return true;
+}
+
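+// Return true if this atomicrmw can never change the value in memory, making
+// it effectively an (expensive) atomic load. For example, an "or" or "add"
+// of constant 0, or an "and" of -1, always writes back the value that was
+// already there.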
+bool AtomicExpand::isIdempotentRMW(AtomicRMWInst *RMWI) {
+ auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
+ if (!C)
+ return false;
+
+ AtomicRMWInst::BinOp Op = RMWI->getOperation();
+ switch (Op) {
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ return C->isZero();
+ case AtomicRMWInst::And:
+ return C->isMinusOne();
+ // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
+ default:
+ return false;
+ }
+}
+
+bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
+ if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
+ tryExpandAtomicLoad(ResultingLoad);
+ return true;
+ }
+ return false;
+}
+
+Value *AtomicExpand::insertRMWCmpXchgLoop(
+ IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
+ AtomicOrdering MemOpOrder, SyncScope::ID SSID,
+ function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
+ CreateCmpXchgInstFun CreateCmpXchg) {
+ LLVMContext &Ctx = Builder.getContext();
+ BasicBlock *BB = Builder.GetInsertBlock();
+ Function *F = BB->getParent();
+
+ // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+ //
+ // The standard expansion we produce is:
+ // [...]
+ // %init_loaded = load atomic iN* %addr
+ // br label %loop
+ // loop:
+ // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
+ // %new = some_op iN %loaded, %incr
+ // %pair = cmpxchg iN* %addr, iN %loaded, iN %new
+ // %new_loaded = extractvalue { iN, i1 } %pair, 0
+ // %success = extractvalue { iN, i1 } %pair, 1
+ // br i1 %success, label %atomicrmw.end, label %loop
+ // atomicrmw.end:
+ // [...]
+ BasicBlock *ExitBB =
+ BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we want a load. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ LoadInst *InitLoaded = Builder.CreateAlignedLoad(ResultTy, Addr, AddrAlign);
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
+ Loaded->addIncoming(InitLoaded, BB);
+
+ Value *NewVal = PerformOp(Builder, Loaded);
+
+ Value *NewLoaded = nullptr;
+ Value *Success = nullptr;
+
+ CreateCmpXchg(Builder, Addr, Loaded, NewVal, AddrAlign,
+ MemOpOrder == AtomicOrdering::Unordered
+ ? AtomicOrdering::Monotonic
+ : MemOpOrder,
+ SSID, Success, NewLoaded);
+ assert(Success && NewLoaded);
+
+ Loaded->addIncoming(NewLoaded, LoopBB);
+
+ Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ return NewLoaded;
+}
+
+bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(CI);
+
+ switch (TLI->shouldExpandAtomicCmpXchgInIR(CI)) {
+ default:
+ llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
+ case TargetLoweringBase::AtomicExpansionKind::None:
+ if (ValueSize < MinCASSize)
+ return expandPartwordCmpXchg(CI);
+ return false;
+ case TargetLoweringBase::AtomicExpansionKind::LLSC: {
+ return expandAtomicCmpXchg(CI);
+ }
+ case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
+ expandAtomicCmpXchgToMaskedIntrinsic(CI);
+ return true;
+ case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
+ return lowerAtomicCmpXchgInst(CI);
+ }
+}
+
+// Note: This function is exposed externally by AtomicExpandUtils.h
+bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
+ CreateCmpXchgInstFun CreateCmpXchg) {
+ ReplacementIRBuilder Builder(AI, AI->getModule()->getDataLayout());
+ Builder.setIsFPConstrained(
+ AI->getFunction()->hasFnAttribute(Attribute::StrictFP));
+
+ // FIXME: If FP exceptions are observable, we should force them off for the
+ // loop for the FP atomics.
+ Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
+ Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(),
+ AI->getOrdering(), AI->getSyncScopeID(),
+ [&](IRBuilderBase &Builder, Value *Loaded) {
+ return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
+ AI->getValOperand());
+ },
+ CreateCmpXchg);
+
+ AI->replaceAllUsesWith(Loaded);
+ AI->eraseFromParent();
+ return true;
+}
+
+// In order to use one of the sized library calls such as
+// __atomic_fetch_add_4, the alignment must be sufficient, the size
+// must be one of the potentially-specialized sizes, and the value
+// type must actually exist in C on the target (otherwise, the
+// function wouldn't actually be defined).
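+// For example (illustrative): a naturally aligned 4-byte operation can use a
+// sized call such as __atomic_fetch_add_4, whereas an under-aligned access,
+// a 3-byte access, or a 16-byte access on a target whose largest legal
+// integer type is only 32 bits cannot, and is handled by the generic
+// libcalls (or, for RMW operations, by the cmpxchg-loop fallback below).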
+static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
+ const DataLayout &DL) {
+ // TODO: "LargestSize" is an approximation for "largest type that
+ // you can express in C". It seems to be the case that int128 is
+ // supported on all 64-bit platforms, otherwise only up to 64-bit
+ // integers are supported. If we get this wrong, then we'll try to
+ // call a sized libcall that doesn't actually exist. There should
+ // really be some more reliable way in LLVM of determining integer
+ // sizes which are valid in the target's C ABI...
+ unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
+ return Alignment >= Size &&
+ (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
+ Size <= LargestSize;
+}
+
+void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
+ RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
+ unsigned Size = getAtomicOpSize(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
+ I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ if (!expanded)
+ report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Load");
+}
+
+void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
+ RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
+ unsigned Size = getAtomicOpSize(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
+ nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ if (!expanded)
+ report_fatal_error("expandAtomicOpToLibcall shouldn't fail for Store");
+}
+
+void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
+ RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
+ RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
+ unsigned Size = getAtomicOpSize(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getNewValOperand(),
+ I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
+ Libcalls);
+ if (!expanded)
+ report_fatal_error("expandAtomicOpToLibcall shouldn't fail for CAS");
+}
+
+static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
+ static const RTLIB::Libcall LibcallsXchg[6] = {
+ RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1,
+ RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
+ RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
+ static const RTLIB::Libcall LibcallsAdd[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1,
+ RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
+ RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
+ static const RTLIB::Libcall LibcallsSub[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1,
+ RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
+ RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
+ static const RTLIB::Libcall LibcallsAnd[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1,
+ RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
+ RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
+ static const RTLIB::Libcall LibcallsOr[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1,
+ RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
+ RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
+ static const RTLIB::Libcall LibcallsXor[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1,
+ RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
+ RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
+ static const RTLIB::Libcall LibcallsNand[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_NAND_1,
+ RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
+ RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
+
+ switch (Op) {
+ case AtomicRMWInst::BAD_BINOP:
+ llvm_unreachable("Should not have BAD_BINOP.");
+ case AtomicRMWInst::Xchg:
+ return ArrayRef(LibcallsXchg);
+ case AtomicRMWInst::Add:
+ return ArrayRef(LibcallsAdd);
+ case AtomicRMWInst::Sub:
+ return ArrayRef(LibcallsSub);
+ case AtomicRMWInst::And:
+ return ArrayRef(LibcallsAnd);
+ case AtomicRMWInst::Or:
+ return ArrayRef(LibcallsOr);
+ case AtomicRMWInst::Xor:
+ return ArrayRef(LibcallsXor);
+ case AtomicRMWInst::Nand:
+ return ArrayRef(LibcallsNand);
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMin:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
+ case AtomicRMWInst::UIncWrap:
+ case AtomicRMWInst::UDecWrap:
+ // No atomic libcalls are available for these operations (min/max, the
+ // floating-point operations, and the wrap operations).
+ return {};
+ }
+ llvm_unreachable("Unexpected AtomicRMW operation.");
+}
+
+void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
+ ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
+
+ unsigned Size = getAtomicOpSize(I);
+
+ bool Success = false;
+ if (!Libcalls.empty())
+ Success = expandAtomicOpToLibcall(
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getValOperand(),
+ nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+
+ // The expansion failed: either there were no libcalls at all for
+ // the operation (min/max), or there were only size-specialized
+ // libcalls (add/sub/etc) and we needed a generic. So, expand to a
+ // CAS libcall, via a CAS loop, instead.
+ if (!Success) {
+ expandAtomicRMWToCmpXchg(
+ I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
+ Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
+ SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) {
+ // Create the CAS instruction normally...
+ AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
+ Addr, Loaded, NewVal, Alignment, MemOpOrder,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+ Success = Builder.CreateExtractValue(Pair, 1, "success");
+ NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+
+ // ...and then expand the CAS into a libcall.
+ expandAtomicCASToLibcall(Pair);
+ });
+ }
+}
+
+// A helper routine for the above expandAtomic*ToLibcall functions.
+//
+// 'Libcalls' contains an array of enum values for the particular
+// ATOMIC libcalls to be emitted. All of the other arguments besides
+// 'I' are extracted from the Instruction subclass by the
+// caller. Depending on the particular call, some will be null.
+bool AtomicExpand::expandAtomicOpToLibcall(
+ Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
+ Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
+ AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
+ assert(Libcalls.size() == 6);
+
+ LLVMContext &Ctx = I->getContext();
+ Module *M = I->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ IRBuilder<> Builder(I);
+ IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
+
+ bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
+ Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
+
+ const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);
+
+ // TODO: the "order" argument type is "int", not int32. So
+ // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
+ ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
+ assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
+ Constant *OrderingVal =
+ ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
+ Constant *Ordering2Val = nullptr;
+ if (CASExpected) {
+ assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
+ Ordering2Val =
+ ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
+ }
+ bool HasResult = I->getType() != Type::getVoidTy(Ctx);
+
+ RTLIB::Libcall RTLibType;
+ if (UseSizedLibcall) {
+ switch (Size) {
+ case 1:
+ RTLibType = Libcalls[1];
+ break;
+ case 2:
+ RTLibType = Libcalls[2];
+ break;
+ case 4:
+ RTLibType = Libcalls[3];
+ break;
+ case 8:
+ RTLibType = Libcalls[4];
+ break;
+ case 16:
+ RTLibType = Libcalls[5];
+ break;
+ }
+ } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
+ RTLibType = Libcalls[0];
+ } else {
+ // Can't use sized function, and there's no generic for this
+ // operation, so give up.
+ return false;
+ }
+
+ if (!TLI->getLibcallName(RTLibType)) {
+ // This target does not implement the requested atomic libcall so give up.
+ return false;
+ }
+
+ // Build up the function call. There are two kinds. First, the sized
+ // variants. These calls are going to be one of the following (with
+ // N=1,2,4,8,16):
+ // iN __atomic_load_N(iN *ptr, int ordering)
+ // void __atomic_store_N(iN *ptr, iN val, int ordering)
+ // iN __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
+ // bool __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
+ // int success_order, int failure_order)
+ //
+ // Note that these functions can be used for non-integer atomic
+ // operations, the values just need to be bitcast to integers on the
+ // way in and out.
+ //
+ // And, then, the generic variants. They look like the following:
+ // void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
+ // void __atomic_store(size_t size, void *ptr, void *val, int ordering)
+ // void __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
+ // int ordering)
+ // bool __atomic_compare_exchange(size_t size, void *ptr, void *expected,
+ // void *desired, int success_order,
+ // int failure_order)
+ //
+ // The different signatures are built up depending on the
+ // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
+ // variables.
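+  //
+  // For example (a sketch, not verbatim output): an 'atomicrmw add i32' that
+  // can use a sized libcall becomes a call to
+  //   i32 __atomic_fetch_add_4(i32 *ptr, i32 val, int ordering)
+  // whereas a 16-byte cmpxchg on a target without a sized variant becomes
+  //   bool __atomic_compare_exchange(16, ptr, expected, desired,
+  //                                  success_order, failure_order)
+  // with 'expected' and 'desired' spilled to stack allocas as done below.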
+
+ AllocaInst *AllocaCASExpected = nullptr;
+ Value *AllocaCASExpected_i8 = nullptr;
+ AllocaInst *AllocaValue = nullptr;
+ Value *AllocaValue_i8 = nullptr;
+ AllocaInst *AllocaResult = nullptr;
+ Value *AllocaResult_i8 = nullptr;
+
+ Type *ResultTy;
+ SmallVector<Value *, 6> Args;
+ AttributeList Attr;
+
+ // 'size' argument.
+ if (!UseSizedLibcall) {
+ // Note, getIntPtrType is assumed equivalent to size_t.
+ Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
+ }
+
+ // 'ptr' argument.
+  // Note: This assumes all address spaces share a common libfunc
+  // implementation and that addresses are convertible. For systems without
+ // that property, we'd need to extend this mechanism to support AS-specific
+ // families of atomic intrinsics.
+ auto PtrTypeAS = PointerOperand->getType()->getPointerAddressSpace();
+ Value *PtrVal =
+ Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx, PtrTypeAS));
+ PtrVal = Builder.CreateAddrSpaceCast(PtrVal, Type::getInt8PtrTy(Ctx));
+ Args.push_back(PtrVal);
+
+ // 'expected' argument, if present.
+ if (CASExpected) {
+ AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
+ AllocaCASExpected->setAlignment(AllocaAlignment);
+ unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace();
+
+ AllocaCASExpected_i8 = Builder.CreateBitCast(
+ AllocaCASExpected, Type::getInt8PtrTy(Ctx, AllocaAS));
+ Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
+ Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
+ Args.push_back(AllocaCASExpected_i8);
+ }
+
+ // 'val' argument ('desired' for cas), if present.
+ if (ValueOperand) {
+ if (UseSizedLibcall) {
+ Value *IntValue =
+ Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
+ Args.push_back(IntValue);
+ } else {
+ AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
+ AllocaValue->setAlignment(AllocaAlignment);
+ AllocaValue_i8 =
+ Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
+ Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
+ Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
+ Args.push_back(AllocaValue_i8);
+ }
+ }
+
+ // 'ret' argument.
+ if (!CASExpected && HasResult && !UseSizedLibcall) {
+ AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
+ AllocaResult->setAlignment(AllocaAlignment);
+ unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace();
+ AllocaResult_i8 =
+ Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS));
+ Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
+ Args.push_back(AllocaResult_i8);
+ }
+
+ // 'ordering' ('success_order' for cas) argument.
+ Args.push_back(OrderingVal);
+
+ // 'failure_order' argument, if present.
+ if (Ordering2Val)
+ Args.push_back(Ordering2Val);
+
+ // Now, the return type.
+ if (CASExpected) {
+ ResultTy = Type::getInt1Ty(Ctx);
+ Attr = Attr.addRetAttribute(Ctx, Attribute::ZExt);
+ } else if (HasResult && UseSizedLibcall)
+ ResultTy = SizedIntTy;
+ else
+ ResultTy = Type::getVoidTy(Ctx);
+
+ // Done with setting up arguments and return types, create the call:
+ SmallVector<Type *, 6> ArgTys;
+ for (Value *Arg : Args)
+ ArgTys.push_back(Arg->getType());
+ FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
+ FunctionCallee LibcallFn =
+ M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
+ CallInst *Call = Builder.CreateCall(LibcallFn, Args);
+ Call->setAttributes(Attr);
+ Value *Result = Call;
+
+ // And then, extract the results...
+ if (ValueOperand && !UseSizedLibcall)
+ Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);
+
+ if (CASExpected) {
+ // The final result from the CAS is {load of 'expected' alloca, bool result
+ // from call}
+ Type *FinalResultTy = I->getType();
+ Value *V = PoisonValue::get(FinalResultTy);
+ Value *ExpectedOut = Builder.CreateAlignedLoad(
+ CASExpected->getType(), AllocaCASExpected, AllocaAlignment);
+ Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
+ V = Builder.CreateInsertValue(V, ExpectedOut, 0);
+ V = Builder.CreateInsertValue(V, Result, 1);
+ I->replaceAllUsesWith(V);
+ } else if (HasResult) {
+ Value *V;
+ if (UseSizedLibcall)
+ V = Builder.CreateBitOrPointerCast(Result, I->getType());
+ else {
+ V = Builder.CreateAlignedLoad(I->getType(), AllocaResult,
+ AllocaAlignment);
+ Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
+ }
+ I->replaceAllUsesWith(V);
+ }
+ I->eraseFromParent();
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSections.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSections.cpp
new file mode 100644
index 000000000000..6967ca5160c0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSections.cpp
@@ -0,0 +1,406 @@
+//===-- BasicBlockSections.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// BasicBlockSections implementation.
+//
+// The purpose of this pass is to assign sections to basic blocks when the
+// -fbasic-block-sections= option is used. Further, with profile information,
+// only the subset of basic blocks with profiles are placed in separate sections
+// and the rest are grouped in a cold section. The exception handling blocks are
+// treated specially to ensure they are all in one section.
+//
+// Basic Block Sections
+// ====================
+//
+// With the option -fbasic-block-sections=list, every function may be split into
+// clusters of basic blocks. Every cluster will be emitted into a separate
+// section with its basic blocks sequenced in the given order. To get the
+// optimized performance, the clusters must form an optimal BB layout for the
+// function. We insert a symbol at the beginning of every cluster's section to
+// allow the linker to reorder the sections in any arbitrary sequence. A global
+// order of these sections would encapsulate the function layout.
+// For example, consider the following clusters for a function foo (consisting
+// of 6 basic blocks 0, 1, ..., 5).
+//
+// 0 2
+// 1 3 5
+//
+// * Basic blocks 0 and 2 are placed in one section with symbol `foo`
+// referencing the beginning of this section.
+// * Basic blocks 1, 3, 5 are placed in a separate section. A new symbol
+// `foo.__part.1` will reference the beginning of this section.
+// * Basic block 4 (note that it is not referenced in the list) is placed in
+// one section, and a new symbol `foo.cold` will point to it.
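+//
+// With -fbasic-block-sections=list=<file>, a profile describing this
+// clustering (a sketch in the format parsed by
+// BasicBlockSectionsProfileReader) could be:
+//   !foo
+//   !!0 2
+//   !!1 3 5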
+//
+// There are a couple of challenges to be addressed:
+//
+// 1. The last basic block of every cluster should not have any implicit
+// fallthrough to its next basic block, as it can be reordered by the linker.
+// The compiler should make these fallthroughs explicit by adding
+//    unconditional jumps.
+//
+// 2. All inter-cluster branch targets would now need to be resolved by the
+// linker as they cannot be calculated during compile time. This is done
+// using static relocations. Further, the compiler tries to use short branch
+// instructions on some ISAs for small branch offsets. This is not possible
+// for inter-cluster branches as the offset is not determined at compile
+// time, and therefore, long branch instructions have to be used for those.
+//
+// 3. Debug Information (DebugInfo) and Call Frame Information (CFI) emission
+// needs special handling with basic block sections. DebugInfo needs to be
+// emitted with more relocations as basic block sections can break a
+// function into potentially several disjoint pieces, and CFI needs to be
+// emitted per cluster. This also bloats the object file and binary sizes.
+//
+// Basic Block Labels
+// ==================
+//
+// With -fbasic-block-sections=labels, we encode the offsets of BB addresses of
+// every function into the .llvm_bb_addr_map section. Along with the function
+// symbols, this allows for mapping of virtual addresses in PMU profiles back to
+// the corresponding basic blocks. This logic is implemented in AsmPrinter. This
+// pass only assigns the BBSectionType of every function to ``labels``.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/BasicBlockSectionUtils.h"
+#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include <optional>
+
+using namespace llvm;
+
+// Placing the cold clusters in a separate section mitigates the impact of poor
+// profiles and allows optimizations such as hugepage mapping to be applied at a
+// section granularity. Defaults to ".text.split." which is recognized by lld
+// via the `-z keep-text-section-prefix` flag.
+cl::opt<std::string> llvm::BBSectionsColdTextPrefix(
+ "bbsections-cold-text-prefix",
+ cl::desc("The text prefix to use for cold basic block clusters"),
+ cl::init(".text.split."), cl::Hidden);
+
+static cl::opt<bool> BBSectionsDetectSourceDrift(
+ "bbsections-detect-source-drift",
+    cl::desc("This checks if there is an FDO instr. profile hash "
+ "mismatch for this function"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class BasicBlockSections : public MachineFunctionPass {
+public:
+ static char ID;
+
+ BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr;
+
+ BasicBlockSections() : MachineFunctionPass(ID) {
+ initializeBasicBlockSectionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Basic Block Sections Analysis";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Identify basic blocks that need separate sections and prepare to emit them
+ /// accordingly.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char BasicBlockSections::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ BasicBlockSections, "bbsections-prepare",
+ "Prepares for basic block sections, by splitting functions "
+ "into clusters of basic blocks.",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReader)
+INITIALIZE_PASS_END(BasicBlockSections, "bbsections-prepare",
+ "Prepares for basic block sections, by splitting functions "
+ "into clusters of basic blocks.",
+ false, false)
+
+// This function updates and optimizes the branching instructions of every basic
+// block in a given function to account for changes in the layout.
+static void
+updateBranches(MachineFunction &MF,
+ const SmallVector<MachineBasicBlock *> &PreLayoutFallThroughs) {
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SmallVector<MachineOperand, 4> Cond;
+ for (auto &MBB : MF) {
+ auto NextMBBI = std::next(MBB.getIterator());
+ auto *FTMBB = PreLayoutFallThroughs[MBB.getNumber()];
+    // If this block had a fallthrough before, we need an explicit unconditional
+    // branch to that block if either
+    //     1- the block ends a section, which means its next block may be
+    //        reordered by the linker, or
+ // 2- the fallthrough block is not adjacent to the block in the new
+ // order.
+ if (FTMBB && (MBB.isEndSection() || &*NextMBBI != FTMBB))
+ TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
+
+ // We do not optimize branches for machine basic blocks ending sections, as
+ // their adjacent block might be reordered by the linker.
+ if (MBB.isEndSection())
+ continue;
+
+ // It might be possible to optimize branches by flipping the branch
+ // condition.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+ if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
+ continue;
+ MBB.updateTerminator(FTMBB);
+ }
+}
+
+// This function provides the BBCluster information associated with a function.
+// Returns true if a valid association exists and false otherwise.
+bool getBBClusterInfoForFunction(
+ const MachineFunction &MF,
+ BasicBlockSectionsProfileReader *BBSectionsProfileReader,
+ DenseMap<unsigned, BBClusterInfo> &V) {
+
+  // Find the associated cluster information.
+ std::pair<bool, SmallVector<BBClusterInfo, 4>> P =
+ BBSectionsProfileReader->getBBClusterInfoForFunction(MF.getName());
+ if (!P.first)
+ return false;
+
+ if (P.second.empty()) {
+    // This indicates that sections are desired for all basic blocks of this
+    // function. We clear the BBClusterInfo map to denote this.
+ V.clear();
+ return true;
+ }
+
+ for (const BBClusterInfo &BBCI : P.second)
+ V[BBCI.BBID] = BBCI;
+ return true;
+}
+
+// This function assigns sections to basic blocks according to the cluster
+// information. All explicitly specified clusters of basic blocks will be
+// ordered accordingly. All non-specified BBs go into a separate "Cold" section.
+// Additionally, if exception handling landing pads end up in more than one
+// cluster, they are moved into a single "Exception" section. Finally, clusters
+// are ordered in increasing order of their IDs, with the "Exception" and "Cold"
+// sections succeeding all other clusters.
+// FuncBBClusterInfo represents the cluster information for basic blocks. It
+// maps the BBID of each basic block to its cluster information. If it is
+// empty, it means unique sections are used for all basic blocks in the function.
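+//
+// Continuing the example from the file header (with the 'list' option and
+// clusters {0, 2} and {1, 3, 5}, block 4 unlisted), this would roughly assign
+// section ID 0 to blocks 0 and 2, section ID 1 to blocks 1, 3 and 5, and the
+// special ColdSectionID to block 4.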
+static void
+assignSections(MachineFunction &MF,
+ const DenseMap<unsigned, BBClusterInfo> &FuncBBClusterInfo) {
+ assert(MF.hasBBSections() && "BB Sections is not set for function.");
+ // This variable stores the section ID of the cluster containing eh_pads (if
+ // all eh_pads are one cluster). If more than one cluster contain eh_pads, we
+ // set it equal to ExceptionSectionID.
+ std::optional<MBBSectionID> EHPadsSectionID;
+
+ for (auto &MBB : MF) {
+ // With the 'all' option, every basic block is placed in a unique section.
+ // With the 'list' option, every basic block is placed in a section
+ // associated with its cluster, unless we want individual unique sections
+ // for every basic block in this function (if FuncBBClusterInfo is empty).
+ if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All ||
+ FuncBBClusterInfo.empty()) {
+ // If unique sections are desired for all basic blocks of the function, we
+ // set every basic block's section ID equal to its original position in
+ // the layout (which is equal to its number). This ensures that basic
+ // blocks are ordered canonically.
+ MBB.setSectionID(MBB.getNumber());
+ } else {
+ // TODO: Replace `getBBIDOrNumber` with `getBBID` once version 1 is
+ // deprecated.
+ auto I = FuncBBClusterInfo.find(MBB.getBBIDOrNumber());
+ if (I != FuncBBClusterInfo.end()) {
+ MBB.setSectionID(I->second.ClusterID);
+ } else {
+ // BB goes into the special cold section if it is not specified in the
+ // cluster info map.
+ MBB.setSectionID(MBBSectionID::ColdSectionID);
+ }
+ }
+
+ if (MBB.isEHPad() && EHPadsSectionID != MBB.getSectionID() &&
+ EHPadsSectionID != MBBSectionID::ExceptionSectionID) {
+ // If we already have one cluster containing eh_pads, this must be updated
+ // to ExceptionSectionID. Otherwise, we set it equal to the current
+ // section ID.
+ EHPadsSectionID = EHPadsSectionID ? MBBSectionID::ExceptionSectionID
+ : MBB.getSectionID();
+ }
+ }
+
+ // If EHPads are in more than one section, this places all of them in the
+ // special exception section.
+ if (EHPadsSectionID == MBBSectionID::ExceptionSectionID)
+ for (auto &MBB : MF)
+ if (MBB.isEHPad())
+ MBB.setSectionID(*EHPadsSectionID);
+}
+
+void llvm::sortBasicBlocksAndUpdateBranches(
+ MachineFunction &MF, MachineBasicBlockComparator MBBCmp) {
+ [[maybe_unused]] const MachineBasicBlock *EntryBlock = &MF.front();
+ SmallVector<MachineBasicBlock *> PreLayoutFallThroughs(MF.getNumBlockIDs());
+ for (auto &MBB : MF)
+ PreLayoutFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
+
+ MF.sort(MBBCmp);
+ assert(&MF.front() == EntryBlock &&
+ "Entry block should not be displaced by basic block sections");
+
+ // Set IsBeginSection and IsEndSection according to the assigned section IDs.
+ MF.assignBeginEndSections();
+
+ // After reordering basic blocks, we must update basic block branches to
+ // insert explicit fallthrough branches when required and optimize branches
+ // when possible.
+ updateBranches(MF, PreLayoutFallThroughs);
+}
+
+// If the exception section begins with a landing pad, that landing pad will
+// assume a zero offset (relative to @LPStart) in the LSDA. However, a value of
+// zero implies "no landing pad." This function inserts a NOP just before the EH
+// pad label to ensure a nonzero offset.
+void llvm::avoidZeroOffsetLandingPad(MachineFunction &MF) {
+ for (auto &MBB : MF) {
+ if (MBB.isBeginSection() && MBB.isEHPad()) {
+ MachineBasicBlock::iterator MI = MBB.begin();
+ while (!MI->isEHLabel())
+ ++MI;
+ MCInst Nop = MF.getSubtarget().getInstrInfo()->getNop();
+ BuildMI(MBB, MI, DebugLoc(),
+ MF.getSubtarget().getInstrInfo()->get(Nop.getOpcode()));
+ }
+ }
+}
+
+// This checks if the source of this function has drifted since this binary was
+// profiled previously. For now, we are piggybacking on what PGO does to
+// detect this with instrumented profiles. PGO emits a hash of the IR and
+// checks if the hash has changed. Advanced basic block layout is usually done
+// on top of PGO-optimized binaries, and hence this check works well in practice.
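+//
+// Concretely (an illustrative IR sketch, not something this pass emits), the
+// instrumentation-based PGO pipeline marks such functions roughly as:
+//   define void @foo() !annotation !0 { ... }
+//   !0 = !{!"instr_prof_hash_mismatch"}
+// and the helper below simply looks for that operand on the function's
+// MD_annotation node.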
+static bool hasInstrProfHashMismatch(MachineFunction &MF) {
+ if (!BBSectionsDetectSourceDrift)
+ return false;
+
+ const char MetadataName[] = "instr_prof_hash_mismatch";
+ auto *Existing = MF.getFunction().getMetadata(LLVMContext::MD_annotation);
+ if (Existing) {
+ MDTuple *Tuple = cast<MDTuple>(Existing);
+ for (const auto &N : Tuple->operands())
+ if (N.equalsStr(MetadataName))
+ return true;
+ }
+
+ return false;
+}
+
+bool BasicBlockSections::runOnMachineFunction(MachineFunction &MF) {
+ auto BBSectionsType = MF.getTarget().getBBSectionsType();
+ assert(BBSectionsType != BasicBlockSection::None &&
+ "BB Sections not enabled!");
+
+ // Check for source drift. If the source has changed since the profiles
+ // were obtained, optimizing basic blocks might be sub-optimal.
+ // This only applies to BasicBlockSection::List as it creates
+ // clusters of basic blocks using basic block ids. Source drift can
+  // invalidate these groupings, leading to sub-optimal code generation with
+  // regard to performance.
+ if (BBSectionsType == BasicBlockSection::List &&
+ hasInstrProfHashMismatch(MF))
+ return true;
+  // Renumber blocks before sorting them. This is useful during sorting:
+  // basic blocks in the same section will retain the default order.
+ // This renumbering should also be done for basic block labels to match the
+ // profiles with the correct blocks.
+ // For LLVM_BB_ADDR_MAP versions 2 and higher, this renumbering serves
+ // the different purpose of accessing the original layout positions and
+ // finding the original fallthroughs.
+ // TODO: Change the above comment accordingly when version 1 is deprecated.
+ MF.RenumberBlocks();
+
+ if (BBSectionsType == BasicBlockSection::Labels) {
+ MF.setBBSectionsType(BBSectionsType);
+ return true;
+ }
+
+ BBSectionsProfileReader = &getAnalysis<BasicBlockSectionsProfileReader>();
+
+ // Map from BBID of blocks to their cluster information.
+ DenseMap<unsigned, BBClusterInfo> FuncBBClusterInfo;
+ if (BBSectionsType == BasicBlockSection::List &&
+ !getBBClusterInfoForFunction(MF, BBSectionsProfileReader,
+ FuncBBClusterInfo))
+ return true;
+ MF.setBBSectionsType(BBSectionsType);
+ assignSections(MF, FuncBBClusterInfo);
+
+ // We make sure that the cluster including the entry basic block precedes all
+ // other clusters.
+ auto EntryBBSectionID = MF.front().getSectionID();
+
+ // Helper function for ordering BB sections as follows:
+ // * Entry section (section including the entry block).
+ // * Regular sections (in increasing order of their Number).
+ // ...
+ // * Exception section
+ // * Cold section
+ auto MBBSectionOrder = [EntryBBSectionID](const MBBSectionID &LHS,
+ const MBBSectionID &RHS) {
+ // We make sure that the section containing the entry block precedes all the
+ // other sections.
+ if (LHS == EntryBBSectionID || RHS == EntryBBSectionID)
+ return LHS == EntryBBSectionID;
+ return LHS.Type == RHS.Type ? LHS.Number < RHS.Number : LHS.Type < RHS.Type;
+ };
+
+ // We sort all basic blocks to make sure the basic blocks of every cluster are
+ // contiguous and ordered accordingly. Furthermore, clusters are ordered in
+ // increasing order of their section IDs, with the exception and the
+ // cold section placed at the end of the function.
+ auto Comparator = [&](const MachineBasicBlock &X,
+ const MachineBasicBlock &Y) {
+ auto XSectionID = X.getSectionID();
+ auto YSectionID = Y.getSectionID();
+ if (XSectionID != YSectionID)
+ return MBBSectionOrder(XSectionID, YSectionID);
+    // If the two basic blocks are in the same section, the order is decided by
+ // their position within the section.
+ if (XSectionID.Type == MBBSectionID::SectionType::Default)
+ return FuncBBClusterInfo.lookup(X.getBBIDOrNumber()).PositionInCluster <
+ FuncBBClusterInfo.lookup(Y.getBBIDOrNumber()).PositionInCluster;
+ return X.getNumber() < Y.getNumber();
+ };
+
+ sortBasicBlocksAndUpdateBranches(MF, Comparator);
+ avoidZeroOffsetLandingPad(MF);
+ return true;
+}
+
+void BasicBlockSections::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<BasicBlockSectionsProfileReader>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineFunctionPass *llvm::createBasicBlockSectionsPass() {
+ return new BasicBlockSections();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
new file mode 100644
index 000000000000..5dede452ec34
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp
@@ -0,0 +1,200 @@
+//===-- BasicBlockSectionsProfileReader.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the basic block sections profile reader pass. It parses
+// and stores the basic block sections profile file (which is specified via the
+// `-basic-block-sections` flag).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include <llvm/ADT/STLExtras.h>
+
+using namespace llvm;
+
+char BasicBlockSectionsProfileReader::ID = 0;
+INITIALIZE_PASS(BasicBlockSectionsProfileReader, "bbsections-profile-reader",
+ "Reads and parses a basic block sections profile.", false,
+ false)
+
+bool BasicBlockSectionsProfileReader::isFunctionHot(StringRef FuncName) const {
+ return getBBClusterInfoForFunction(FuncName).first;
+}
+
+std::pair<bool, SmallVector<BBClusterInfo>>
+BasicBlockSectionsProfileReader::getBBClusterInfoForFunction(
+ StringRef FuncName) const {
+ auto R = ProgramBBClusterInfo.find(getAliasName(FuncName));
+ return R != ProgramBBClusterInfo.end()
+ ? std::pair(true, R->second)
+ : std::pair(false, SmallVector<BBClusterInfo>{});
+}
+
+// Basic Block Sections can be enabled for a subset of machine basic blocks.
+// This is done by passing a file containing names of functions for which basic
+// block sections are desired. Additionally, machine basic block ids of the
+// functions can also be specified for a finer granularity. Moreover, a cluster
+// of basic blocks can be assigned to the same section.
+// Optionally, a debug-info filename can be specified for each function to allow
+// distinguishing internal-linkage functions of the same name.
+// A file with basic block sections for all of function main and three blocks
+// for function foo (of which 1 and 2 are placed in a cluster) looks like this
+// (the profile for function foo is only loaded when its debug-info filename
+// matches 'path/to/foo_file.cc'):
+// ----------------------------
+// list.txt:
+// !main
+// !foo M=path/to/foo_file.cc
+// !!1 2
+// !!4
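+//
+// For the example above, this yields two clusters for foo: cluster 0 holding
+// blocks 1 and 2 (in that order) and cluster 1 holding block 4. Every other
+// block of foo is left for the cold section, while main, having no explicit
+// cluster lines, gets a separate section for each of its basic blocks.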
+Error BasicBlockSectionsProfileReader::ReadProfile() {
+ assert(MBuf);
+ line_iterator LineIt(*MBuf, /*SkipBlanks=*/true, /*CommentMarker=*/'#');
+
+ auto invalidProfileError = [&](auto Message) {
+ return make_error<StringError>(
+ Twine("Invalid profile " + MBuf->getBufferIdentifier() + " at line " +
+ Twine(LineIt.line_number()) + ": " + Message),
+ inconvertibleErrorCode());
+ };
+
+ auto FI = ProgramBBClusterInfo.end();
+
+ // Current cluster ID corresponding to this function.
+ unsigned CurrentCluster = 0;
+ // Current position in the current cluster.
+ unsigned CurrentPosition = 0;
+
+ // Temporary set to ensure every basic block ID appears once in the clusters
+ // of a function.
+ SmallSet<unsigned, 4> FuncBBIDs;
+
+ for (; !LineIt.is_at_eof(); ++LineIt) {
+ StringRef S(*LineIt);
+ if (S[0] == '@')
+ continue;
+ // Check for the leading "!"
+ if (!S.consume_front("!") || S.empty())
+ break;
+ // Check for second "!" which indicates a cluster of basic blocks.
+ if (S.consume_front("!")) {
+      // Skip the profile when the profile iterator (FI) refers to the
+ // past-the-end element.
+ if (FI == ProgramBBClusterInfo.end())
+ continue;
+ SmallVector<StringRef, 4> BBIDs;
+ S.split(BBIDs, ' ');
+ // Reset current cluster position.
+ CurrentPosition = 0;
+ for (auto BBIDStr : BBIDs) {
+ unsigned long long BBID;
+ if (getAsUnsignedInteger(BBIDStr, 10, BBID))
+ return invalidProfileError(Twine("Unsigned integer expected: '") +
+ BBIDStr + "'.");
+ if (!FuncBBIDs.insert(BBID).second)
+ return invalidProfileError(Twine("Duplicate basic block id found '") +
+ BBIDStr + "'.");
+ if (BBID == 0 && CurrentPosition)
+ return invalidProfileError("Entry BB (0) does not begin a cluster.");
+
+ FI->second.emplace_back(
+ BBClusterInfo{((unsigned)BBID), CurrentCluster, CurrentPosition++});
+ }
+ CurrentCluster++;
+ } else {
+ // This is a function name specifier. It may include a debug info filename
+ // specifier starting with `M=`.
+ auto [AliasesStr, DIFilenameStr] = S.split(' ');
+ SmallString<128> DIFilename;
+ if (DIFilenameStr.startswith("M=")) {
+ DIFilename =
+ sys::path::remove_leading_dotslash(DIFilenameStr.substr(2));
+ if (DIFilename.empty())
+ return invalidProfileError("Empty module name specifier.");
+ } else if (!DIFilenameStr.empty()) {
+ return invalidProfileError("Unknown string found: '" + DIFilenameStr +
+ "'.");
+ }
+ // Function aliases are separated using '/'. We use the first function
+ // name for the cluster info mapping and delegate all other aliases to
+ // this one.
+ SmallVector<StringRef, 4> Aliases;
+ AliasesStr.split(Aliases, '/');
+ bool FunctionFound = any_of(Aliases, [&](StringRef Alias) {
+ auto It = FunctionNameToDIFilename.find(Alias);
+ // No match if this function name is not found in this module.
+ if (It == FunctionNameToDIFilename.end())
+ return false;
+ // Return a match if debug-info-filename is not specified. Otherwise,
+ // check for equality.
+ return DIFilename.empty() || It->second.equals(DIFilename);
+ });
+ if (!FunctionFound) {
+ // Skip the following profile by setting the profile iterator (FI) to
+ // the past-the-end element.
+ FI = ProgramBBClusterInfo.end();
+ continue;
+ }
+ for (size_t i = 1; i < Aliases.size(); ++i)
+ FuncAliasMap.try_emplace(Aliases[i], Aliases.front());
+
+ // Prepare for parsing clusters of this function name.
+ // Start a new cluster map for this function name.
+ auto R = ProgramBBClusterInfo.try_emplace(Aliases.front());
+ // Report error when multiple profiles have been specified for the same
+ // function.
+ if (!R.second)
+ return invalidProfileError("Duplicate profile for function '" +
+ Aliases.front() + "'.");
+ FI = R.first;
+ CurrentCluster = 0;
+ FuncBBIDs.clear();
+ }
+ }
+ return Error::success();
+}
+
+bool BasicBlockSectionsProfileReader::doInitialization(Module &M) {
+ if (!MBuf)
+ return false;
+ // Get the function name to debug info filename mapping.
+ FunctionNameToDIFilename.clear();
+ for (const Function &F : M) {
+ SmallString<128> DIFilename;
+ if (F.isDeclaration())
+ continue;
+ DISubprogram *Subprogram = F.getSubprogram();
+ if (Subprogram) {
+ llvm::DICompileUnit *CU = Subprogram->getUnit();
+ if (CU)
+ DIFilename = sys::path::remove_leading_dotslash(CU->getFilename());
+ }
+ [[maybe_unused]] bool inserted =
+ FunctionNameToDIFilename.try_emplace(F.getName(), DIFilename).second;
+ assert(inserted);
+ }
+ if (auto Err = ReadProfile())
+ report_fatal_error(std::move(Err));
+ return false;
+}
+
+ImmutablePass *
+llvm::createBasicBlockSectionsProfileReaderPass(const MemoryBuffer *Buf) {
+ return new BasicBlockSectionsProfileReader(Buf);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
new file mode 100644
index 000000000000..57cefae2066a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -0,0 +1,34 @@
+//===- BasicTargetTransformInfo.cpp - Basic target-independent TTI impl ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides the implementation of a basic TargetTransformInfo pass
+/// predicated on the target abstractions present in the target independent
+/// code generator. It uses these (primarily TargetLowering) to model as much
+/// of the TTI query interface as possible. It is included by most targets so
+/// that they can specialize only a small subset of the query space.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+// This flag is used by the template base class for BasicTTIImpl, and here to
+// provide a definition.
+cl::opt<unsigned>
+llvm::PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+ cl::desc("Threshold for partial unrolling"),
+ cl::Hidden);
+
+BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp
new file mode 100644
index 000000000000..3830f25debaf
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.cpp
@@ -0,0 +1,2046 @@
+//===- BranchFolding.cpp - Fold machine code branch instructions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass forwards branches to unconditional branches to make them branch
+// directly to the target block. This pass often results in dead MBBs, which
+// it then removes.
+//
+// Note that this pass must be run after register allocation; it cannot handle
+// SSA form. It also must handle virtual registers for targets that emit virtual
+// ISA (e.g. NVPTX).
+//
+//===----------------------------------------------------------------------===//
+
+#include "BranchFolding.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <numeric>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "branch-folder"
+
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumBranchOpts, "Number of branches optimized");
+STATISTIC(NumTailMerge , "Number of block tails merged");
+STATISTIC(NumHoist , "Number of times common instructions are hoisted");
+STATISTIC(NumTailCalls, "Number of tail calls optimized");
+
+static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
+ cl::init(cl::BOU_UNSET), cl::Hidden);
+
+// Throttle for huge numbers of predecessors (compile speed problems)
+static cl::opt<unsigned>
+TailMergeThreshold("tail-merge-threshold",
+ cl::desc("Max number of predecessors to consider tail merging"),
+ cl::init(150), cl::Hidden);
+
+// Heuristic for tail merging (and, inversely, tail duplication).
+// TODO: This should be replaced with a target query.
+static cl::opt<unsigned>
+TailMergeSize("tail-merge-size",
+ cl::desc("Min number of instructions to consider tail merging"),
+ cl::init(3), cl::Hidden);
+
+namespace {
+
+ /// BranchFolderPass - Wrap branch folder in a machine function pass.
+ class BranchFolderPass : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ explicit BranchFolderPass(): MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+ };
+
+} // end anonymous namespace
+
+char BranchFolderPass::ID = 0;
+
+char &llvm::BranchFolderPassID = BranchFolderPass::ID;
+
+INITIALIZE_PASS(BranchFolderPass, DEBUG_TYPE,
+ "Control Flow Optimizer", false, false)
+
+bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+  // TailMerge can create jumps into the middle of if-branches, which makes the
+  // CFG irreducible for HW that requires a structured CFG.
+ bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
+ PassConfig->getEnableTailMerge();
+ MBFIWrapper MBBFreqInfo(
+ getAnalysis<MachineBlockFrequencyInfo>());
+ BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
+ getAnalysis<MachineBranchProbabilityInfo>(),
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
+ return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
+ MF.getSubtarget().getRegisterInfo());
+}
+
+BranchFolder::BranchFolder(bool DefaultEnableTailMerge, bool CommonHoist,
+ MBFIWrapper &FreqInfo,
+ const MachineBranchProbabilityInfo &ProbInfo,
+ ProfileSummaryInfo *PSI, unsigned MinTailLength)
+ : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength),
+ MBBFreqInfo(FreqInfo), MBPI(ProbInfo), PSI(PSI) {
+ if (MinCommonTailLength == 0)
+ MinCommonTailLength = TailMergeSize;
+ switch (FlagEnableTailMerge) {
+ case cl::BOU_UNSET:
+ EnableTailMerge = DefaultEnableTailMerge;
+ break;
+ case cl::BOU_TRUE: EnableTailMerge = true; break;
+ case cl::BOU_FALSE: EnableTailMerge = false; break;
+ }
+}
+
+void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
+ assert(MBB->pred_empty() && "MBB must be dead!");
+ LLVM_DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+ MachineFunction *MF = MBB->getParent();
+ // drop all successors.
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_end()-1);
+
+ // Avoid matching if this pointer gets reused.
+ TriedMerging.erase(MBB);
+
+ // Update call site info.
+ for (const MachineInstr &MI : *MBB)
+ if (MI.shouldUpdateCallSiteInfo())
+ MF->eraseCallSiteInfo(&MI);
+
+ // Remove the block.
+ MF->erase(MBB);
+ EHScopeMembership.erase(MBB);
+ if (MLI)
+ MLI->removeBlock(MBB);
+}
+
+bool BranchFolder::OptimizeFunction(MachineFunction &MF,
+ const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri,
+ MachineLoopInfo *mli, bool AfterPlacement) {
+ if (!tii) return false;
+
+ TriedMerging.clear();
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ AfterBlockPlacement = AfterPlacement;
+ TII = tii;
+ TRI = tri;
+ MLI = mli;
+ this->MRI = &MRI;
+
+ UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
+ if (!UpdateLiveIns)
+ MRI.invalidateLiveness();
+
+ bool MadeChange = false;
+
+ // Recalculate EH scope membership.
+ EHScopeMembership = getEHScopeMembership(MF);
+
+ bool MadeChangeThisIteration = true;
+ while (MadeChangeThisIteration) {
+ MadeChangeThisIteration = TailMergeBlocks(MF);
+ // No need to clean up if tail merging does not change anything after the
+ // block placement.
+ if (!AfterBlockPlacement || MadeChangeThisIteration)
+ MadeChangeThisIteration |= OptimizeBranches(MF);
+ if (EnableHoistCommonCode)
+ MadeChangeThisIteration |= HoistCommonCode(MF);
+ MadeChange |= MadeChangeThisIteration;
+ }
+
+ // See if any jump tables have become dead as the code generator
+ // did its thing.
+ MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
+ if (!JTI)
+ return MadeChange;
+
+ // Walk the function to find jump tables that are live.
+ BitVector JTIsLive(JTI->getJumpTables().size());
+ for (const MachineBasicBlock &BB : MF) {
+ for (const MachineInstr &I : BB)
+ for (const MachineOperand &Op : I.operands()) {
+ if (!Op.isJTI()) continue;
+
+ // Remember that this JT is live.
+ JTIsLive.set(Op.getIndex());
+ }
+ }
+
+ // Finally, remove dead jump tables. This happens when the
+ // indirect jump was unreachable (and thus deleted).
+ for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i)
+ if (!JTIsLive.test(i)) {
+ JTI->RemoveJumpTable(i);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Tail Merging of Blocks
+//===----------------------------------------------------------------------===//
+
+/// HashMachineInstr - Compute a hash value for MI and its operands.
+static unsigned HashMachineInstr(const MachineInstr &MI) {
+ unsigned Hash = MI.getOpcode();
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &Op = MI.getOperand(i);
+
+ // Merge in bits from the operand if easy. We can't use MachineOperand's
+ // hash_code here because it's not deterministic and we sort by hash value
+ // later.
+ unsigned OperandHash = 0;
+ switch (Op.getType()) {
+ case MachineOperand::MO_Register:
+ OperandHash = Op.getReg();
+ break;
+ case MachineOperand::MO_Immediate:
+ OperandHash = Op.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ OperandHash = Op.getMBB()->getNumber();
+ break;
+ case MachineOperand::MO_FrameIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ OperandHash = Op.getIndex();
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ // Global address / external symbol are too hard, don't bother, but do
+ // pull in the offset.
+ OperandHash = Op.getOffset();
+ break;
+ default:
+ break;
+ }
+
+ Hash += ((OperandHash << 3) | Op.getType()) << (i & 31);
+ }
+ return Hash;
+}
+
+/// HashEndOfMBB - Hash the last instruction in the MBB.
+static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
+ MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr(false);
+ if (I == MBB.end())
+ return 0;
+
+ return HashMachineInstr(*I);
+}
+
+/// Whether MI should be counted as an instruction when calculating common tail.
+static bool countsAsInstruction(const MachineInstr &MI) {
+ return !(MI.isDebugInstr() || MI.isCFIInstruction());
+}
+
+/// Iterate backwards from the given iterator \p I, towards the beginning of the
+/// block. If an MI satisfying 'countsAsInstruction' is found, return an iterator
+/// pointing to that MI. If no such MI is found, return the end iterator.
+static MachineBasicBlock::iterator
+skipBackwardPastNonInstructions(MachineBasicBlock::iterator I,
+ MachineBasicBlock *MBB) {
+ while (I != MBB->begin()) {
+ --I;
+ if (countsAsInstruction(*I))
+ return I;
+ }
+ return MBB->end();
+}
+
+/// Given two machine basic blocks, return the number of instructions they
+/// actually have in common at their end. If a common tail is found (at
+/// least by one instruction), then iterators for the first shared instruction
+/// in each block are returned as well.
+///
+/// Non-instructions according to countsAsInstruction are ignored.
+static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
+ MachineBasicBlock *MBB2,
+ MachineBasicBlock::iterator &I1,
+ MachineBasicBlock::iterator &I2) {
+ MachineBasicBlock::iterator MBBI1 = MBB1->end();
+ MachineBasicBlock::iterator MBBI2 = MBB2->end();
+
+ unsigned TailLen = 0;
+ while (true) {
+ MBBI1 = skipBackwardPastNonInstructions(MBBI1, MBB1);
+ MBBI2 = skipBackwardPastNonInstructions(MBBI2, MBB2);
+ if (MBBI1 == MBB1->end() || MBBI2 == MBB2->end())
+ break;
+ if (!MBBI1->isIdenticalTo(*MBBI2) ||
+ // FIXME: This check is dubious. It's used to get around a problem where
+ // people incorrectly expect inline asm directives to remain in the same
+ // relative order. This is untenable because normal compiler
+ // optimizations (like this one) may reorder and/or merge these
+ // directives.
+ MBBI1->isInlineAsm()) {
+ break;
+ }
+ if (MBBI1->getFlag(MachineInstr::NoMerge) ||
+ MBBI2->getFlag(MachineInstr::NoMerge))
+ break;
+ ++TailLen;
+ I1 = MBBI1;
+ I2 = MBBI2;
+ }
+
+ return TailLen;
+}
+
+void BranchFolder::replaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+ MachineBasicBlock &NewDest) {
+ if (UpdateLiveIns) {
+ // OldInst should always point to an instruction.
+ MachineBasicBlock &OldMBB = *OldInst->getParent();
+ LiveRegs.clear();
+ LiveRegs.addLiveOuts(OldMBB);
+    // Move backward to the place where we will insert the jump.
+ MachineBasicBlock::iterator I = OldMBB.end();
+ do {
+ --I;
+ LiveRegs.stepBackward(*I);
+ } while (I != OldInst);
+
+    // Merging the tails may have switched some undef operands to non-undef ones.
+ // Add IMPLICIT_DEFS into OldMBB as necessary to have a definition of the
+ // register.
+ for (MachineBasicBlock::RegisterMaskPair P : NewDest.liveins()) {
+ // We computed the liveins with computeLiveIn earlier and should only see
+ // full registers:
+ assert(P.LaneMask == LaneBitmask::getAll() &&
+ "Can only handle full register.");
+ MCPhysReg Reg = P.PhysReg;
+ if (!LiveRegs.available(*MRI, Reg))
+ continue;
+ DebugLoc DL;
+ BuildMI(OldMBB, OldInst, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Reg);
+ }
+ }
+
+ TII->ReplaceTailWithBranchTo(OldInst, &NewDest);
+ ++NumTailMerge;
+}
+
+MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
+ MachineBasicBlock::iterator BBI1,
+ const BasicBlock *BB) {
+ if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1))
+ return nullptr;
+
+ MachineFunction &MF = *CurMBB.getParent();
+
+ // Create the fall-through block.
+ MachineFunction::iterator MBBI = CurMBB.getIterator();
+ MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(BB);
+ CurMBB.getParent()->insert(++MBBI, NewMBB);
+
+ // Move all the successors of this block to the specified block.
+ NewMBB->transferSuccessors(&CurMBB);
+
+ // Add an edge from CurMBB to NewMBB for the fall-through.
+ CurMBB.addSuccessor(NewMBB);
+
+ // Splice the code over.
+ NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
+
+ // NewMBB belongs to the same loop as CurMBB.
+ if (MLI)
+ if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))
+ ML->addBasicBlockToLoop(NewMBB, MLI->getBase());
+
+ // NewMBB inherits CurMBB's block frequency.
+ MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB));
+
+ if (UpdateLiveIns)
+ computeAndAddLiveIns(LiveRegs, *NewMBB);
+
+ // Add the new block to the EH scope.
+ const auto &EHScopeI = EHScopeMembership.find(&CurMBB);
+ if (EHScopeI != EHScopeMembership.end()) {
+ auto n = EHScopeI->second;
+ EHScopeMembership[NewMBB] = n;
+ }
+
+ return NewMBB;
+}
+
+/// EstimateRuntime - Make a rough estimate for how long it will take to run
+/// the specified code.
+static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator E) {
+ unsigned Time = 0;
+ for (; I != E; ++I) {
+ if (!countsAsInstruction(*I))
+ continue;
+ if (I->isCall())
+ Time += 10;
+ else if (I->mayLoadOrStore())
+ Time += 2;
+ else
+ ++Time;
+ }
+ return Time;
+}
+
+// CurMBB needs to add an unconditional branch to SuccBB (we removed these
+// branches temporarily for tail merging). In the case where CurMBB ends
+// with a conditional branch to the next block, optimize by reversing the
+// test and conditionally branching to SuccBB instead.
+static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
+ const TargetInstrInfo *TII) {
+ MachineFunction *MF = CurMBB->getParent();
+ MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ DebugLoc dl = CurMBB->findBranchDebugLoc();
+ if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
+ MachineBasicBlock *NextBB = &*I;
+ if (TBB == NextBB && !Cond.empty() && !FBB) {
+ if (!TII->reverseBranchCondition(Cond)) {
+ TII->removeBranch(*CurMBB);
+ TII->insertBranch(*CurMBB, SuccBB, nullptr, Cond, dl);
+ return;
+ }
+ }
+ }
+ TII->insertBranch(*CurMBB, SuccBB, nullptr,
+ SmallVector<MachineOperand, 0>(), dl);
+}
+
+bool
+BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
+ if (getHash() < o.getHash())
+ return true;
+ if (getHash() > o.getHash())
+ return false;
+ if (getBlock()->getNumber() < o.getBlock()->getNumber())
+ return true;
+ if (getBlock()->getNumber() > o.getBlock()->getNumber())
+ return false;
+ // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
+ // an object with itself.
+#ifndef _GLIBCXX_DEBUG
+ llvm_unreachable("Predecessor appears twice");
+#else
+ return false;
+#endif
+}
+
+/// CountTerminators - Count the number of terminators in the given
+/// block and set I to the position of the first non-terminator, if there
+/// is one, or MBB->end() otherwise.
+static unsigned CountTerminators(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &I) {
+ I = MBB->end();
+ unsigned NumTerms = 0;
+ while (true) {
+ if (I == MBB->begin()) {
+ I = MBB->end();
+ break;
+ }
+ --I;
+ if (!I->isTerminator()) break;
+ ++NumTerms;
+ }
+ return NumTerms;
+}
+
+/// A block with no successors that does not return probably ends in
+/// unreachable and is cold. Also consider a block that ends in an indirect
+/// branch to be a return block, since many targets use plain indirect branches
+/// to return.
+static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) {
+ if (!MBB->succ_empty())
+ return false;
+ if (MBB->empty())
+ return true;
+ return !(MBB->back().isReturn() || MBB->back().isIndirectBranch());
+}
+
+/// ProfitableToMerge - Check if two machine basic blocks have a common tail
+/// and decide if it would be profitable to merge those tails. Return the
+/// length of the common tail and iterators to the first common instruction
+/// in each block.
+/// MBB1, MBB2 The blocks to check
+/// MinCommonTailLength Minimum size of tail block to be merged.
+/// CommonTailLen Out parameter to record the size of the shared tail between
+/// MBB1 and MBB2
+/// I1, I2 Iterator references that will be changed to point to the first
+/// instruction in the common tail shared by MBB1,MBB2
+/// SuccBB A common successor of MBB1, MBB2 which are in a canonical form
+/// relative to SuccBB
+/// PredBB The layout predecessor of SuccBB, if any.
+/// EHScopeMembership map from block to EH scope #.
+/// AfterPlacement True if we are merging blocks after layout. Stricter
+/// thresholds apply to prevent undoing tail-duplication.
+static bool
+ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
+ unsigned MinCommonTailLength, unsigned &CommonTailLen,
+ MachineBasicBlock::iterator &I1,
+ MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB,
+ DenseMap<const MachineBasicBlock *, int> &EHScopeMembership,
+ bool AfterPlacement,
+ MBFIWrapper &MBBFreqInfo,
+ ProfileSummaryInfo *PSI) {
+ // It is never profitable to tail-merge blocks from two different EH scopes.
+ if (!EHScopeMembership.empty()) {
+ auto EHScope1 = EHScopeMembership.find(MBB1);
+ assert(EHScope1 != EHScopeMembership.end());
+ auto EHScope2 = EHScopeMembership.find(MBB2);
+ assert(EHScope2 != EHScopeMembership.end());
+ if (EHScope1->second != EHScope2->second)
+ return false;
+ }
+
+ CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2);
+ if (CommonTailLen == 0)
+ return false;
+ LLVM_DEBUG(dbgs() << "Common tail length of " << printMBBReference(*MBB1)
+ << " and " << printMBBReference(*MBB2) << " is "
+ << CommonTailLen << '\n');
+
+  // Move the iterators to the beginning of the MBB if we only got debug
+  // instructions before the tail. This avoids splitting such a block and keeps
+  // the result invariant with respect to -g.
+ if (skipDebugInstructionsForward(MBB1->begin(), MBB1->end(), false) == I1)
+ I1 = MBB1->begin();
+ if (skipDebugInstructionsForward(MBB2->begin(), MBB2->end(), false) == I2)
+ I2 = MBB2->begin();
+
+ bool FullBlockTail1 = I1 == MBB1->begin();
+ bool FullBlockTail2 = I2 == MBB2->begin();
+
+ // It's almost always profitable to merge any number of non-terminator
+ // instructions with the block that falls through into the common successor.
+ // This is true only for a single successor. For multiple successors, we are
+ // trading a conditional branch for an unconditional one.
+ // TODO: Re-visit successor size for non-layout tail merging.
+ if ((MBB1 == PredBB || MBB2 == PredBB) &&
+ (!AfterPlacement || MBB1->succ_size() == 1)) {
+ MachineBasicBlock::iterator I;
+ unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I);
+ if (CommonTailLen > NumTerms)
+ return true;
+ }
+
+ // If these are identical non-return blocks with no successors, merge them.
+ // Such blocks are typically cold calls to noreturn functions like abort, and
+ // are unlikely to become a fallthrough target after machine block placement.
+ // Tail merging these blocks is unlikely to create additional unconditional
+ // branches, and will reduce the size of this cold code.
+ if (FullBlockTail1 && FullBlockTail2 &&
+ blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2))
+ return true;
+
+ // If one of the blocks can be completely merged and happens to be in
+ // a position where the other could fall through into it, merge any number
+ // of instructions, because it can be done without a branch.
+ // TODO: If the blocks are not adjacent, move one of them so that they are?
+ if (MBB1->isLayoutSuccessor(MBB2) && FullBlockTail2)
+ return true;
+ if (MBB2->isLayoutSuccessor(MBB1) && FullBlockTail1)
+ return true;
+
+ // If both blocks are identical and end in a branch, merge them unless they
+ // both have a fallthrough predecessor and successor.
+ // We can only do this after block placement because it depends on whether
+ // there are fallthroughs, and we don't know until after layout.
+ if (AfterPlacement && FullBlockTail1 && FullBlockTail2) {
+ auto BothFallThrough = [](MachineBasicBlock *MBB) {
+ if (!MBB->succ_empty() && !MBB->canFallThrough())
+ return false;
+ MachineFunction::iterator I(MBB);
+ MachineFunction *MF = MBB->getParent();
+ return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough();
+ };
+ if (!BothFallThrough(MBB1) || !BothFallThrough(MBB2))
+ return true;
+ }
+
+ // If both blocks have an unconditional branch temporarily stripped out,
+ // count that as an additional common instruction for the following
+  // heuristics. This heuristic is only accurate for single-successor blocks,
+  // so to make sure that merging and duplicating during layout don't crash, we
+  // check for that when merging during layout.
+ unsigned EffectiveTailLen = CommonTailLen;
+ if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
+ (MBB1->succ_size() == 1 || !AfterPlacement) &&
+ !MBB1->back().isBarrier() &&
+ !MBB2->back().isBarrier())
+ ++EffectiveTailLen;
+
+ // Check if the common tail is long enough to be worthwhile.
+ if (EffectiveTailLen >= MinCommonTailLength)
+ return true;
+
+ // If we are optimizing for code size, 2 instructions in common is enough if
+ // we don't have to split a block. At worst we will be introducing 1 new
+ // branch instruction, which is likely to be smaller than the 2
+ // instructions that would be deleted in the merge.
+ MachineFunction *MF = MBB1->getParent();
+ bool OptForSize =
+ MF->getFunction().hasOptSize() ||
+ (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo) &&
+ llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo));
+ return EffectiveTailLen >= 2 && OptForSize &&
+ (FullBlockTail1 || FullBlockTail2);
+}
+
+unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
+ unsigned MinCommonTailLength,
+ MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB) {
+ unsigned maxCommonTailLength = 0U;
+ SameTails.clear();
+ MachineBasicBlock::iterator TrialBBI1, TrialBBI2;
+ MPIterator HighestMPIter = std::prev(MergePotentials.end());
+ for (MPIterator CurMPIter = std::prev(MergePotentials.end()),
+ B = MergePotentials.begin();
+ CurMPIter != B && CurMPIter->getHash() == CurHash; --CurMPIter) {
+ for (MPIterator I = std::prev(CurMPIter); I->getHash() == CurHash; --I) {
+ unsigned CommonTailLen;
+ if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(),
+ MinCommonTailLength,
+ CommonTailLen, TrialBBI1, TrialBBI2,
+ SuccBB, PredBB,
+ EHScopeMembership,
+ AfterBlockPlacement, MBBFreqInfo, PSI)) {
+ if (CommonTailLen > maxCommonTailLength) {
+ SameTails.clear();
+ maxCommonTailLength = CommonTailLen;
+ HighestMPIter = CurMPIter;
+ SameTails.push_back(SameTailElt(CurMPIter, TrialBBI1));
+ }
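+        // Record the second block of the pair only while CurMPIter is still
+        // the block that established the current maximum, so every SameTails
+        // entry shares the same (maximal) tail length.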
+ if (HighestMPIter == CurMPIter &&
+ CommonTailLen == maxCommonTailLength)
+ SameTails.push_back(SameTailElt(I, TrialBBI2));
+ }
+ if (I == B)
+ break;
+ }
+ }
+ return maxCommonTailLength;
+}
+
+void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
+ MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB) {
+ MPIterator CurMPIter, B;
+ for (CurMPIter = std::prev(MergePotentials.end()),
+ B = MergePotentials.begin();
+ CurMPIter->getHash() == CurHash; --CurMPIter) {
+ // Put the unconditional branch back, if we need one.
+ MachineBasicBlock *CurMBB = CurMPIter->getBlock();
+ if (SuccBB && CurMBB != PredBB)
+ FixTail(CurMBB, SuccBB, TII);
+ if (CurMPIter == B)
+ break;
+ }
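+  // The loop either stepped just past the group of blocks with hash CurHash
+  // or stopped at MergePotentials.begin(); adjust so the erase below removes
+  // exactly the blocks with hash CurHash.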
+ if (CurMPIter->getHash() != CurHash)
+ CurMPIter++;
+ MergePotentials.erase(CurMPIter, MergePotentials.end());
+}
+
+bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+ MachineBasicBlock *SuccBB,
+ unsigned maxCommonTailLength,
+ unsigned &commonTailIndex) {
+ commonTailIndex = 0;
+ unsigned TimeEstimate = ~0U;
+ for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+ // Use PredBB if possible; that doesn't require a new branch.
+ if (SameTails[i].getBlock() == PredBB) {
+ commonTailIndex = i;
+ break;
+ }
+    // Otherwise, make a (fairly bogus) choice based on an estimate of
+ // how long it will take the various blocks to execute.
+ unsigned t = EstimateRuntime(SameTails[i].getBlock()->begin(),
+ SameTails[i].getTailStartPos());
+ if (t <= TimeEstimate) {
+ TimeEstimate = t;
+ commonTailIndex = i;
+ }
+ }
+
+ MachineBasicBlock::iterator BBI =
+ SameTails[commonTailIndex].getTailStartPos();
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+ LLVM_DEBUG(dbgs() << "\nSplitting " << printMBBReference(*MBB) << ", size "
+ << maxCommonTailLength);
+
+  // If the split block unconditionally falls through to SuccBB, it will be
+  // merged. In control-flow terms it should then take SuccBB's name; e.g., if
+  // SuccBB is an inner loop, the common tail is still part of the inner loop.
+ const BasicBlock *BB = (SuccBB && MBB->succ_size() == 1) ?
+ SuccBB->getBasicBlock() : MBB->getBasicBlock();
+ MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI, BB);
+ if (!newMBB) {
+ LLVM_DEBUG(dbgs() << "... failed!");
+ return false;
+ }
+
+ SameTails[commonTailIndex].setBlock(newMBB);
+ SameTails[commonTailIndex].setTailStartPos(newMBB->begin());
+
+ // If we split PredBB, newMBB is the new predecessor.
+ if (PredBB == MBB)
+ PredBB = newMBB;
+
+ return true;
+}
+
+static void
+mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
+ MachineBasicBlock &MBBCommon) {
+ MachineBasicBlock *MBB = MBBIStartPos->getParent();
+  // Note that CommonTailLen does not necessarily match the size of the common
+  // BB, nor count all of its instructions, because of differences in debug
+  // instructions.
+ unsigned CommonTailLen = 0;
+ for (auto E = MBB->end(); MBBIStartPos != E; ++MBBIStartPos)
+ ++CommonTailLen;
+
+ MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin();
+ MachineBasicBlock::reverse_iterator MBBIE = MBB->rend();
+ MachineBasicBlock::reverse_iterator MBBICommon = MBBCommon.rbegin();
+ MachineBasicBlock::reverse_iterator MBBIECommon = MBBCommon.rend();
+
+ while (CommonTailLen--) {
+ assert(MBBI != MBBIE && "Reached BB end within common tail length!");
+ (void)MBBIE;
+
+ if (!countsAsInstruction(*MBBI)) {
+ ++MBBI;
+ continue;
+ }
+
+ while ((MBBICommon != MBBIECommon) && !countsAsInstruction(*MBBICommon))
+ ++MBBICommon;
+
+ assert(MBBICommon != MBBIECommon &&
+ "Reached BB end within common tail length!");
+ assert(MBBICommon->isIdenticalTo(*MBBI) && "Expected matching MIIs!");
+
+ // Merge MMOs from memory operations in the common block.
+ if (MBBICommon->mayLoadOrStore())
+ MBBICommon->cloneMergedMemRefs(*MBB->getParent(), {&*MBBICommon, &*MBBI});
+ // Drop undef flags if they aren't present in all merged instructions.
+ for (unsigned I = 0, E = MBBICommon->getNumOperands(); I != E; ++I) {
+ MachineOperand &MO = MBBICommon->getOperand(I);
+ if (MO.isReg() && MO.isUndef()) {
+ const MachineOperand &OtherMO = MBBI->getOperand(I);
+ if (!OtherMO.isUndef())
+ MO.setIsUndef(false);
+ }
+ }
+
+ ++MBBI;
+ ++MBBICommon;
+ }
+}
+
+void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+ std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size());
+  for (unsigned int i = 0; i != SameTails.size(); ++i) {
+ if (i != commonTailIndex) {
+ NextCommonInsts[i] = SameTails[i].getTailStartPos();
+ mergeOperations(SameTails[i].getTailStartPos(), *MBB);
+ } else {
+ assert(SameTails[i].getTailStartPos() == MBB->begin() &&
+ "MBB is not a common tail only block");
+ }
+ }
+
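+  // Walk the common tail; for each real instruction, merge its DebugLoc with
+  // the corresponding instruction in every other block, advancing the
+  // per-block cursors in NextCommonInsts in lockstep.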
+ for (auto &MI : *MBB) {
+ if (!countsAsInstruction(MI))
+ continue;
+ DebugLoc DL = MI.getDebugLoc();
+    for (unsigned int i = 0; i < NextCommonInsts.size(); ++i) {
+ if (i == commonTailIndex)
+ continue;
+
+ auto &Pos = NextCommonInsts[i];
+ assert(Pos != SameTails[i].getBlock()->end() &&
+ "Reached BB end within common tail");
+ while (!countsAsInstruction(*Pos)) {
+ ++Pos;
+ assert(Pos != SameTails[i].getBlock()->end() &&
+ "Reached BB end within common tail");
+ }
+ assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!");
+ DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc());
+ NextCommonInsts[i] = ++Pos;
+ }
+ MI.setDebugLoc(DL);
+ }
+
+ if (UpdateLiveIns) {
+ LivePhysRegs NewLiveIns(*TRI);
+ computeLiveIns(NewLiveIns, *MBB);
+ LiveRegs.init(*TRI);
+
+ // The flag merging may lead to some register uses no longer using the
+    // <undef> flag; add IMPLICIT_DEFs in the predecessors as necessary.
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ LiveRegs.clear();
+ LiveRegs.addLiveOuts(*Pred);
+ MachineBasicBlock::iterator InsertBefore = Pred->getFirstTerminator();
+ for (Register Reg : NewLiveIns) {
+ if (!LiveRegs.available(*MRI, Reg))
+ continue;
+
+ // Skip the register if we are about to add one of its super registers.
+        // TODO: Common this up with the same logic in addLiveIns().
+ if (any_of(TRI->superregs(Reg), [&](MCPhysReg SReg) {
+ return NewLiveIns.contains(SReg) && !MRI->isReserved(SReg);
+ }))
+ continue;
+
+ DebugLoc DL;
+ BuildMI(*Pred, InsertBefore, DL, TII->get(TargetOpcode::IMPLICIT_DEF),
+ Reg);
+ }
+ }
+
+ MBB->clearLiveIns();
+ addLiveIns(*MBB, NewLiveIns);
+ }
+}
+
+// See if any of the blocks in MergePotentials (which all have SuccBB as a
+// successor, or all have no successor if it is null) can be tail-merged.
+// If there is a successor, any blocks in MergePotentials that are not
+// tail-merged and are not immediately before Succ must have an unconditional
+// branch to Succ added (but the predecessor/successor lists need no
+// adjustment). The lone predecessor of Succ that falls through into Succ,
+// if any, is given in PredBB.
+// MinCommonTailLength - Except for the special cases below, tail-merge if
+// there are at least this many instructions in common.
+bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB,
+ unsigned MinCommonTailLength) {
+ bool MadeChange = false;
+
+ LLVM_DEBUG(
+ dbgs() << "\nTryTailMergeBlocks: ";
+ for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) dbgs()
+ << printMBBReference(*MergePotentials[i].getBlock())
+ << (i == e - 1 ? "" : ", ");
+ dbgs() << "\n"; if (SuccBB) {
+ dbgs() << " with successor " << printMBBReference(*SuccBB) << '\n';
+ if (PredBB)
+ dbgs() << " which has fall-through from "
+ << printMBBReference(*PredBB) << "\n";
+ } dbgs() << "Looking for common tails of at least "
+ << MinCommonTailLength << " instruction"
+ << (MinCommonTailLength == 1 ? "" : "s") << '\n';);
+
+ // Sort by hash value so that blocks with identical end sequences sort
+ // together.
+ array_pod_sort(MergePotentials.begin(), MergePotentials.end());
+
+ // Walk through equivalence sets looking for actual exact matches.
+ while (MergePotentials.size() > 1) {
+ unsigned CurHash = MergePotentials.back().getHash();
+
+ // Build SameTails, identifying the set of blocks with this hash code
+ // and with the maximum number of instructions in common.
+ unsigned maxCommonTailLength = ComputeSameTails(CurHash,
+ MinCommonTailLength,
+ SuccBB, PredBB);
+
+ // If we didn't find any pair that has at least MinCommonTailLength
+ // instructions in common, remove all blocks with this hash code and retry.
+ if (SameTails.empty()) {
+ RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+ continue;
+ }
+
+ // If one of the blocks is the entire common tail (and is not the entry
+ // block/an EH pad, which we can't jump to), we can treat all blocks with
+ // this same tail at once. Use PredBB if that is one of the possibilities,
+ // as that will not introduce any extra branches.
+ MachineBasicBlock *EntryBB =
+ &MergePotentials.front().getBlock()->getParent()->front();
+ unsigned commonTailIndex = SameTails.size();
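+    // commonTailIndex == SameTails.size() means no suitable block has been
+    // chosen yet.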
+ // If there are two blocks, check to see if one can be made to fall through
+ // into the other.
+ if (SameTails.size() == 2 &&
+ SameTails[0].getBlock()->isLayoutSuccessor(SameTails[1].getBlock()) &&
+ SameTails[1].tailIsWholeBlock() && !SameTails[1].getBlock()->isEHPad())
+ commonTailIndex = 1;
+ else if (SameTails.size() == 2 &&
+ SameTails[1].getBlock()->isLayoutSuccessor(
+ SameTails[0].getBlock()) &&
+ SameTails[0].tailIsWholeBlock() &&
+ !SameTails[0].getBlock()->isEHPad())
+ commonTailIndex = 0;
+ else {
+ // Otherwise just pick one, favoring the fall-through predecessor if
+ // there is one.
+ for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = SameTails[i].getBlock();
+ if ((MBB == EntryBB || MBB->isEHPad()) &&
+ SameTails[i].tailIsWholeBlock())
+ continue;
+ if (MBB == PredBB) {
+ commonTailIndex = i;
+ break;
+ }
+ if (SameTails[i].tailIsWholeBlock())
+ commonTailIndex = i;
+ }
+ }
+
+ if (commonTailIndex == SameTails.size() ||
+ (SameTails[commonTailIndex].getBlock() == PredBB &&
+ !SameTails[commonTailIndex].tailIsWholeBlock())) {
+ // None of the blocks consist entirely of the common tail.
+ // Split a block so that one does.
+ if (!CreateCommonTailOnlyBlock(PredBB, SuccBB,
+ maxCommonTailLength, commonTailIndex)) {
+ RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+ continue;
+ }
+ }
+
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+ // Recompute common tail MBB's edge weights and block frequency.
+ setCommonTailEdgeWeights(*MBB);
+
+ // Merge debug locations, MMOs and undef flags across identical instructions
+    // for the common tail.
+ mergeCommonTails(commonTailIndex);
+
+    // MBB is the common tail. Adjust all other BBs to jump to this one.
+ // Traversal must be forwards so erases work.
+ LLVM_DEBUG(dbgs() << "\nUsing common tail in " << printMBBReference(*MBB)
+ << " for ");
+    for (unsigned int i = 0, e = SameTails.size(); i != e; ++i) {
+ if (commonTailIndex == i)
+ continue;
+ LLVM_DEBUG(dbgs() << printMBBReference(*SameTails[i].getBlock())
+ << (i == e - 1 ? "" : ", "));
+ // Hack the end off BB i, making it jump to BB commonTailIndex instead.
+ replaceTailWithBranchTo(SameTails[i].getTailStartPos(), *MBB);
+ // BB i is no longer a predecessor of SuccBB; remove it from the worklist.
+ MergePotentials.erase(SameTails[i].getMPIter());
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ // We leave commonTailIndex in the worklist in case there are other blocks
+ // that match it with a smaller number of instructions.
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
+ bool MadeChange = false;
+ if (!EnableTailMerge)
+ return MadeChange;
+
+ // First find blocks with no successors.
+ // Block placement may create new tail merging opportunities for these blocks.
+ MergePotentials.clear();
+ for (MachineBasicBlock &MBB : MF) {
+ if (MergePotentials.size() == TailMergeThreshold)
+ break;
+ if (!TriedMerging.count(&MBB) && MBB.succ_empty())
+ MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(MBB), &MBB));
+ }
+
+ // If this is a large problem, avoid visiting the same basic blocks
+ // multiple times.
+ if (MergePotentials.size() == TailMergeThreshold)
+ for (const MergePotentialsElt &Elt : MergePotentials)
+ TriedMerging.insert(Elt.getBlock());
+
+ // See if we can do any tail merging on those.
+ if (MergePotentials.size() >= 2)
+ MadeChange |= TryTailMergeBlocks(nullptr, nullptr, MinCommonTailLength);
+
+ // Look at blocks (IBB) with multiple predecessors (PBB).
+ // We change each predecessor to a canonical form, by
+ // (1) temporarily removing any unconditional branch from the predecessor
+ // to IBB, and
+  // (2) altering conditional branches so they branch to the other block,
+  // not IBB; this may require adding back an unconditional branch to IBB
+ // later, where there wasn't one coming in. E.g.
+ // Bcc IBB
+ // fallthrough to QBB
+ // here becomes
+ // Bncc QBB
+ // with a conceptual B to IBB after that, which never actually exists.
+ // With those changes, we see whether the predecessors' tails match,
+ // and merge them if so. We change things out of canonical form and
+ // back to the way they were later in the process. (OptimizeBranches
+ // would undo some of this, but we can't use it, because we'd get into
+ // a compile-time infinite loop repeatedly doing and undoing the same
+ // transformations.)
+
+ for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
+ I != E; ++I) {
+ if (I->pred_size() < 2) continue;
+ SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
+ MachineBasicBlock *IBB = &*I;
+ MachineBasicBlock *PredBB = &*std::prev(I);
+ MergePotentials.clear();
+ MachineLoop *ML;
+
+ // Bail if merging after placement and IBB is the loop header because
+ // -- If merging predecessors that belong to the same loop as IBB, the
+    // common tail of merged predecessors may become the loop top if block
+    // placement is called again, and the predecessors may then branch to this
+    // common tail and require more branches. This can be relaxed if
+    // MachineBlockPlacement::findBestLoopTop is more flexible.
+    // -- If merging predecessors that do not belong to the same loop as IBB,
+    // the loop info of IBB's loop and the other loops may be affected.
+    // Calling block placement again may make a big change to the layout and
+    // eliminate the reason to do tail merging here.
+ if (AfterBlockPlacement && MLI) {
+ ML = MLI->getLoopFor(IBB);
+ if (ML && IBB == ML->getHeader())
+ continue;
+ }
+
+ for (MachineBasicBlock *PBB : I->predecessors()) {
+ if (MergePotentials.size() == TailMergeThreshold)
+ break;
+
+ if (TriedMerging.count(PBB))
+ continue;
+
+      // Skip blocks that loop to themselves; we can't tail merge these.
+ if (PBB == IBB)
+ continue;
+
+ // Visit each predecessor only once.
+ if (!UniquePreds.insert(PBB).second)
+ continue;
+
+ // Skip blocks which may jump to a landing pad or jump from an asm blob.
+ // Can't tail merge these.
+ if (PBB->hasEHPadSuccessor() || PBB->mayHaveInlineAsmBr())
+ continue;
+
+ // After block placement, only consider predecessors that belong to the
+      // same loop as IBB. The reason is the same as above for skipping the
+      // loop header.
+ if (AfterBlockPlacement && MLI)
+ if (ML != MLI->getLoopFor(PBB))
+ continue;
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->analyzeBranch(*PBB, TBB, FBB, Cond, true)) {
+ // Failing case: IBB is the target of a cbr, and we cannot reverse the
+ // branch.
+ SmallVector<MachineOperand, 4> NewCond(Cond);
+ if (!Cond.empty() && TBB == IBB) {
+ if (TII->reverseBranchCondition(NewCond))
+ continue;
+ // This is the QBB case described above
+ if (!FBB) {
+ auto Next = ++PBB->getIterator();
+ if (Next != MF.end())
+ FBB = &*Next;
+ }
+ }
+
+ // Remove the unconditional branch at the end, if any.
+ if (TBB && (Cond.empty() || FBB)) {
+ DebugLoc dl = PBB->findBranchDebugLoc();
+ TII->removeBranch(*PBB);
+ if (!Cond.empty())
+ // reinsert conditional branch only, for now
+ TII->insertBranch(*PBB, (TBB == IBB) ? FBB : TBB, nullptr,
+ NewCond, dl);
+ }
+
+ MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(*PBB), PBB));
+ }
+ }
+
+ // If this is a large problem, avoid visiting the same basic blocks multiple
+ // times.
+ if (MergePotentials.size() == TailMergeThreshold)
+ for (MergePotentialsElt &Elt : MergePotentials)
+ TriedMerging.insert(Elt.getBlock());
+
+ if (MergePotentials.size() >= 2)
+ MadeChange |= TryTailMergeBlocks(IBB, PredBB, MinCommonTailLength);
+
+ // Reinsert an unconditional branch if needed. The 1 below can occur as a
+ // result of removing blocks in TryTailMergeBlocks.
+ PredBB = &*std::prev(I); // this may have been changed in TryTailMergeBlocks
+ if (MergePotentials.size() == 1 &&
+ MergePotentials.begin()->getBlock() != PredBB)
+ FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
+ }
+
+ return MadeChange;
+}
+
+void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
+ SmallVector<BlockFrequency, 2> EdgeFreqLs(TailMBB.succ_size());
+ BlockFrequency AccumulatedMBBFreq;
+
+ // Aggregate edge frequency of successor edge j:
+ // edgeFreq(j) = sum (freq(bb) * edgeProb(bb, j)),
+ // where bb is a basic block that is in SameTails.
+ for (const auto &Src : SameTails) {
+ const MachineBasicBlock *SrcMBB = Src.getBlock();
+ BlockFrequency BlockFreq = MBBFreqInfo.getBlockFreq(SrcMBB);
+ AccumulatedMBBFreq += BlockFreq;
+
+    // It is not necessary to recompute edge weights if TailMBB has fewer than
+    // two successors.
+ if (TailMBB.succ_size() <= 1)
+ continue;
+
+ auto EdgeFreq = EdgeFreqLs.begin();
+
+ for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
+ SuccI != SuccE; ++SuccI, ++EdgeFreq)
+ *EdgeFreq += BlockFreq * MBPI.getEdgeProbability(SrcMBB, *SuccI);
+ }
+
+ MBBFreqInfo.setBlockFreq(&TailMBB, AccumulatedMBBFreq);
+
+ if (TailMBB.succ_size() <= 1)
+ return;
+
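+  // Normalize the accumulated edge frequencies into branch probabilities on
+  // the common tail's successor edges.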
+ auto SumEdgeFreq =
+ std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0))
+ .getFrequency();
+ auto EdgeFreq = EdgeFreqLs.begin();
+
+ if (SumEdgeFreq > 0) {
+ for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
+ SuccI != SuccE; ++SuccI, ++EdgeFreq) {
+ auto Prob = BranchProbability::getBranchProbability(
+ EdgeFreq->getFrequency(), SumEdgeFreq);
+ TailMBB.setSuccProbability(SuccI, Prob);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Optimization
+//===----------------------------------------------------------------------===//
+
+bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
+ bool MadeChange = false;
+
+ // Make sure blocks are numbered in order
+ MF.RenumberBlocks();
+ // Renumbering blocks alters EH scope membership, recalculate it.
+ EHScopeMembership = getEHScopeMembership(MF);
+
+ for (MachineBasicBlock &MBB :
+ llvm::make_early_inc_range(llvm::drop_begin(MF))) {
+ MadeChange |= OptimizeBlock(&MBB);
+
+ // If it is dead, remove it.
+ if (MBB.pred_empty() && !MBB.isMachineBlockAddressTaken()) {
+ RemoveDeadBlock(&MBB);
+ MadeChange = true;
+ ++NumDeadBlocks;
+ }
+ }
+
+ return MadeChange;
+}
+
+// Blocks should be considered empty if they contain only debug info;
+// else the debug info would affect codegen.
+static bool IsEmptyBlock(MachineBasicBlock *MBB) {
+ return MBB->getFirstNonDebugInstr(true) == MBB->end();
+}
+
+// Blocks with only debug info and branches should be considered the same
+// as blocks with only branches.
+static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) {
+ MachineBasicBlock::iterator I = MBB->getFirstNonDebugInstr();
+ assert(I != MBB->end() && "empty block!");
+ return I->isBranch();
+}
+
+/// IsBetterFallthrough - Return true if it would be clearly better to
+/// fall-through to MBB1 than to fall through into MBB2. This has to return
+/// a strict ordering; returning true for both (MBB1,MBB2) and (MBB2,MBB1)
+/// would result in infinite loops.
+static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
+ MachineBasicBlock *MBB2) {
+ assert(MBB1 && MBB2 && "Unknown MachineBasicBlock");
+
+ // Right now, we use a simple heuristic. If MBB2 ends with a call, and
+ // MBB1 doesn't, we prefer to fall through into MBB1. This allows us to
+ // optimize branches that branch to either a return block or an assert block
+ // into a fallthrough to the return.
+ MachineBasicBlock::iterator MBB1I = MBB1->getLastNonDebugInstr();
+ MachineBasicBlock::iterator MBB2I = MBB2->getLastNonDebugInstr();
+ if (MBB1I == MBB1->end() || MBB2I == MBB2->end())
+ return false;
+
+  // If there is a clear successor ordering, we make sure that one block
+  // will fall through to the next.
+ if (MBB1->isSuccessor(MBB2)) return true;
+ if (MBB2->isSuccessor(MBB1)) return false;
+
+ return MBB2I->isCall() && !MBB1I->isCall();
+}
+
+/// getBranchDebugLoc - Find and return, if any, the DebugLoc of the branch
+/// instructions on the block.
+static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I != MBB.end() && I->isBranch())
+ return I->getDebugLoc();
+ return DebugLoc();
+}
+
+static void copyDebugInfoToPredecessor(const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock &PredMBB) {
+ auto InsertBefore = PredMBB.getFirstTerminator();
+ for (MachineInstr &MI : MBB.instrs())
+ if (MI.isDebugInstr()) {
+ TII->duplicate(PredMBB, InsertBefore, MI);
+ LLVM_DEBUG(dbgs() << "Copied debug entity from empty block to pred: "
+ << MI);
+ }
+}
+
+static void copyDebugInfoToSuccessor(const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock &SuccMBB) {
+ auto InsertBefore = SuccMBB.SkipPHIsAndLabels(SuccMBB.begin());
+ for (MachineInstr &MI : MBB.instrs())
+ if (MI.isDebugInstr()) {
+ TII->duplicate(SuccMBB, InsertBefore, MI);
+ LLVM_DEBUG(dbgs() << "Copied debug entity from empty block to succ: "
+ << MI);
+ }
+}
+
+// Try to salvage DBG_VALUE instructions from an otherwise empty block. If such
+// a basic block is removed we would lose the debug information unless we have
+// copied the information to a predecessor/successor.
+//
+// TODO: This function only handles some simple cases. An alternative would be
+// to run a heavier analysis, such as the LiveDebugValues pass, before we do
+// branch folding.
+static void salvageDebugInfoFromEmptyBlock(const TargetInstrInfo *TII,
+ MachineBasicBlock &MBB) {
+ assert(IsEmptyBlock(&MBB) && "Expected an empty block (except debug info).");
+ // If this MBB is the only predecessor of a successor it is legal to copy
+ // DBG_VALUE instructions to the beginning of the successor.
+ for (MachineBasicBlock *SuccBB : MBB.successors())
+ if (SuccBB->pred_size() == 1)
+ copyDebugInfoToSuccessor(TII, MBB, *SuccBB);
+ // If this MBB is the only successor of a predecessor it is legal to copy the
+ // DBG_VALUE instructions to the end of the predecessor (just before the
+ // terminators, assuming that the terminator isn't affecting the DBG_VALUE).
+ for (MachineBasicBlock *PredBB : MBB.predecessors())
+ if (PredBB->succ_size() == 1)
+ copyDebugInfoToPredecessor(TII, MBB, *PredBB);
+}
+
+bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
+ bool MadeChange = false;
+ MachineFunction &MF = *MBB->getParent();
+ReoptimizeBlock:
+
+ MachineFunction::iterator FallThrough = MBB->getIterator();
+ ++FallThrough;
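+  // FallThrough now points at the block laid out after MBB, or at MF.end()
+  // if MBB is the last block in the function.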
+
+ // Make sure MBB and FallThrough belong to the same EH scope.
+ bool SameEHScope = true;
+ if (!EHScopeMembership.empty() && FallThrough != MF.end()) {
+ auto MBBEHScope = EHScopeMembership.find(MBB);
+ assert(MBBEHScope != EHScopeMembership.end());
+ auto FallThroughEHScope = EHScopeMembership.find(&*FallThrough);
+ assert(FallThroughEHScope != EHScopeMembership.end());
+ SameEHScope = MBBEHScope->second == FallThroughEHScope->second;
+ }
+
+ // Analyze the branch in the current block. As a side-effect, this may cause
+ // the block to become empty.
+ MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr;
+ SmallVector<MachineOperand, 4> CurCond;
+ bool CurUnAnalyzable =
+ TII->analyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
+
+ // If this block is empty, make everyone use its fall-through, not the block
+ // explicitly. Landing pads should not do this since the landing-pad table
+ // points to this block. Blocks with their addresses taken shouldn't be
+ // optimized away.
+ if (IsEmptyBlock(MBB) && !MBB->isEHPad() && !MBB->hasAddressTaken() &&
+ SameEHScope) {
+ salvageDebugInfoFromEmptyBlock(TII, *MBB);
+ // Dead block? Leave for cleanup later.
+ if (MBB->pred_empty()) return MadeChange;
+
+ if (FallThrough == MF.end()) {
+ // TODO: Simplify preds to not branch here if possible!
+ } else if (FallThrough->isEHPad()) {
+      // Don't rewrite to a landing pad fallthrough. That could lead to the case
+ // where a BB jumps to more than one landing pad.
+ // TODO: Is it ever worth rewriting predecessors which don't already
+ // jump to a landing pad, and so can safely jump to the fallthrough?
+ } else if (MBB->isSuccessor(&*FallThrough)) {
+ // Rewrite all predecessors of the old block to go to the fallthrough
+ // instead.
+ while (!MBB->pred_empty()) {
+ MachineBasicBlock *Pred = *(MBB->pred_end()-1);
+ Pred->ReplaceUsesOfBlockWith(MBB, &*FallThrough);
+ }
+ // If MBB was the target of a jump table, update jump tables to go to the
+ // fallthrough instead.
+ if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
+ MJTI->ReplaceMBBInJumpTables(MBB, &*FallThrough);
+ MadeChange = true;
+ }
+ return MadeChange;
+ }
+
+ // Check to see if we can simplify the terminator of the block before this
+ // one.
+ MachineBasicBlock &PrevBB = *std::prev(MachineFunction::iterator(MBB));
+
+ MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
+ SmallVector<MachineOperand, 4> PriorCond;
+ bool PriorUnAnalyzable =
+ TII->analyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+ if (!PriorUnAnalyzable) {
+ // If the previous branch is conditional and both conditions go to the same
+ // destination, remove the branch, replacing it with an unconditional one or
+ // a fall-through.
+ if (PriorTBB && PriorTBB == PriorFBB) {
+ DebugLoc dl = getBranchDebugLoc(PrevBB);
+ TII->removeBranch(PrevBB);
+ PriorCond.clear();
+ if (PriorTBB != MBB)
+ TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+
+ // If the previous block unconditionally falls through to this block and
+ // this block has no other predecessors, move the contents of this block
+ // into the prior block. This doesn't usually happen when SimplifyCFG
+ // has been used, but it can happen if tail merging splits a fall-through
+ // predecessor of a block.
+ // This has to check PrevBB->succ_size() because EH edges are ignored by
+ // analyzeBranch.
+ if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
+ PrevBB.succ_size() == 1 &&
+ !MBB->hasAddressTaken() && !MBB->isEHPad()) {
+ LLVM_DEBUG(dbgs() << "\nMerging into block: " << PrevBB
+ << "From MBB: " << *MBB);
+ // Remove redundant DBG_VALUEs first.
+ if (!PrevBB.empty()) {
+ MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
+ --PrevBBIter;
+ MachineBasicBlock::iterator MBBIter = MBB->begin();
+ // Check if DBG_VALUE at the end of PrevBB is identical to the
+ // DBG_VALUE at the beginning of MBB.
+ while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
+ && PrevBBIter->isDebugInstr() && MBBIter->isDebugInstr()) {
+ if (!MBBIter->isIdenticalTo(*PrevBBIter))
+ break;
+ MachineInstr &DuplicateDbg = *MBBIter;
+          ++MBBIter; --PrevBBIter;
+ DuplicateDbg.eraseFromParent();
+ }
+ }
+ PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
+ PrevBB.removeSuccessor(PrevBB.succ_begin());
+ assert(PrevBB.succ_empty());
+ PrevBB.transferSuccessors(MBB);
+ MadeChange = true;
+ return MadeChange;
+ }
+
+ // If the previous branch *only* branches to *this* block (conditional or
+ // not) remove the branch.
+ if (PriorTBB == MBB && !PriorFBB) {
+ TII->removeBranch(PrevBB);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+
+ // If the prior block branches somewhere else on the condition and here if
+ // the condition is false, remove the uncond second branch.
+ if (PriorFBB == MBB) {
+ DebugLoc dl = getBranchDebugLoc(PrevBB);
+ TII->removeBranch(PrevBB);
+ TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+
+ // If the prior block branches here on true and somewhere else on false, and
+ // if the branch condition is reversible, reverse the branch to create a
+ // fall-through.
+ if (PriorTBB == MBB) {
+ SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+ if (!TII->reverseBranchCondition(NewPriorCond)) {
+ DebugLoc dl = getBranchDebugLoc(PrevBB);
+ TII->removeBranch(PrevBB);
+ TII->insertBranch(PrevBB, PriorFBB, nullptr, NewPriorCond, dl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+ }
+
+ // If this block has no successors (e.g. it is a return block or ends with
+ // a call to a no-return function like abort or __cxa_throw) and if the pred
+ // falls through into this block, and if it would otherwise fall through
+ // into the block after this, move this block to the end of the function.
+ //
+ // We consider it more likely that execution will stay in the function (e.g.
+  // due to loops) than it is to exit it. This helps with asserts in loops
+  // etc., by moving the assert block out of the loop body.
+ if (MBB->succ_empty() && !PriorCond.empty() && !PriorFBB &&
+ MachineFunction::iterator(PriorTBB) == FallThrough &&
+ !MBB->canFallThrough()) {
+ bool DoTransform = true;
+
+ // We have to be careful that the succs of PredBB aren't both no-successor
+      // blocks. If neither has successors and PredBB is the second-to-last
+      // block in the function, we'd just keep swapping the two blocks for the
+      // last position. Only do the swap if one is clearly better to fall
+      // through than the other.
+ if (FallThrough == --MF.end() &&
+ !IsBetterFallthrough(PriorTBB, MBB))
+ DoTransform = false;
+
+ if (DoTransform) {
+ // Reverse the branch so we will fall through on the previous true cond.
+ SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+ if (!TII->reverseBranchCondition(NewPriorCond)) {
+ LLVM_DEBUG(dbgs() << "\nMoving MBB: " << *MBB
+ << "To make fallthrough to: " << *PriorTBB << "\n");
+
+ DebugLoc dl = getBranchDebugLoc(PrevBB);
+ TII->removeBranch(PrevBB);
+ TII->insertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl);
+
+ // Move this block to the end of the function.
+ MBB->moveAfter(&MF.back());
+ MadeChange = true;
+ ++NumBranchOpts;
+ return MadeChange;
+ }
+ }
+ }
+ }
+
+ if (!IsEmptyBlock(MBB)) {
+ MachineInstr &TailCall = *MBB->getFirstNonDebugInstr();
+ if (TII->isUnconditionalTailCall(TailCall)) {
+ SmallVector<MachineBasicBlock *> PredsChanged;
+ for (auto &Pred : MBB->predecessors()) {
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ bool PredAnalyzable =
+ !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);
+
+ // Only eliminate if MBB == TBB (Taken Basic Block)
+ if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB &&
+ PredTBB != PredFBB) {
+ // The predecessor has a conditional branch to this block which
+ // consists of only a tail call. Try to fold the tail call into the
+ // conditional branch.
+ if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
+ // TODO: It would be nice if analyzeBranch() could provide a pointer
+ // to the branch instruction so replaceBranchWithTailCall() doesn't
+ // have to search for it.
+ TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
+ PredsChanged.push_back(Pred);
+ }
+ }
+ // If the predecessor is falling through to this block, we could reverse
+ // the branch condition and fold the tail call into that. However, after
+ // that we might have to re-arrange the CFG to fall through to the other
+ // block and there is a high risk of regressing code size rather than
+ // improving it.
+ }
+ if (!PredsChanged.empty()) {
+ NumTailCalls += PredsChanged.size();
+ for (auto &Pred : PredsChanged)
+ Pred->removeSuccessor(MBB);
+
+ return true;
+ }
+ }
+ }
+
+ if (!CurUnAnalyzable) {
+ // If this is a two-way branch, and the FBB branches to this block, reverse
+ // the condition so the single-basic-block loop is faster. Instead of:
+ // Loop: xxx; jcc Out; jmp Loop
+ // we want:
+ // Loop: xxx; jncc Loop; jmp Out
+ if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) {
+ SmallVector<MachineOperand, 4> NewCond(CurCond);
+ if (!TII->reverseBranchCondition(NewCond)) {
+ DebugLoc dl = getBranchDebugLoc(*MBB);
+ TII->removeBranch(*MBB);
+ TII->insertBranch(*MBB, CurFBB, CurTBB, NewCond, dl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+ }
+
+ // If this branch is the only thing in its block, see if we can forward
+ // other blocks across it.
+ if (CurTBB && CurCond.empty() && !CurFBB &&
+ IsBranchOnlyBlock(MBB) && CurTBB != MBB &&
+ !MBB->hasAddressTaken() && !MBB->isEHPad()) {
+ DebugLoc dl = getBranchDebugLoc(*MBB);
+ // This block may contain just an unconditional branch. Because there can
+ // be 'non-branch terminators' in the block, try removing the branch and
+ // then seeing if the block is empty.
+ TII->removeBranch(*MBB);
+ // If the only things remaining in the block are debug info, remove these
+ // as well, so this will behave the same as an empty block in non-debug
+ // mode.
+ if (IsEmptyBlock(MBB)) {
+ // Make the block empty, losing the debug info (we could probably
+ // improve this in some cases.)
+ MBB->erase(MBB->begin(), MBB->end());
+ }
+ // If this block is just an unconditional branch to CurTBB, we can
+ // usually completely eliminate the block. The only case we cannot
+ // completely eliminate the block is when the block before this one
+ // falls through into MBB and we can't understand the prior block's branch
+ // condition.
+ if (MBB->empty()) {
+ bool PredHasNoFallThrough = !PrevBB.canFallThrough();
+ if (PredHasNoFallThrough || !PriorUnAnalyzable ||
+ !PrevBB.isSuccessor(MBB)) {
+ // If the prior block falls through into us, turn it into an
+ // explicit branch to us to make updates simpler.
+ if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) &&
+ PriorTBB != MBB && PriorFBB != MBB) {
+ if (!PriorTBB) {
+ assert(PriorCond.empty() && !PriorFBB &&
+ "Bad branch analysis");
+ PriorTBB = MBB;
+ } else {
+ assert(!PriorFBB && "Machine CFG out of date!");
+ PriorFBB = MBB;
+ }
+ DebugLoc pdl = getBranchDebugLoc(PrevBB);
+ TII->removeBranch(PrevBB);
+ TII->insertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, pdl);
+ }
+
+ // Iterate through all the predecessors, revectoring each in-turn.
+ size_t PI = 0;
+ bool DidChange = false;
+ bool HasBranchToSelf = false;
+        while (PI != MBB->pred_size()) {
+ MachineBasicBlock *PMBB = *(MBB->pred_begin() + PI);
+ if (PMBB == MBB) {
+ // If this block has an uncond branch to itself, leave it.
+ ++PI;
+ HasBranchToSelf = true;
+ } else {
+ DidChange = true;
+ PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
+ // If this change resulted in PMBB ending in a conditional
+ // branch where both conditions go to the same destination,
+ // change this to an unconditional branch.
+ MachineBasicBlock *NewCurTBB = nullptr, *NewCurFBB = nullptr;
+ SmallVector<MachineOperand, 4> NewCurCond;
+ bool NewCurUnAnalyzable = TII->analyzeBranch(
+ *PMBB, NewCurTBB, NewCurFBB, NewCurCond, true);
+ if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
+ DebugLoc pdl = getBranchDebugLoc(*PMBB);
+ TII->removeBranch(*PMBB);
+ NewCurCond.clear();
+ TII->insertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, pdl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ }
+ }
+ }
+
+ // Change any jumptables to go to the new MBB.
+ if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
+ MJTI->ReplaceMBBInJumpTables(MBB, CurTBB);
+ if (DidChange) {
+ ++NumBranchOpts;
+ MadeChange = true;
+ if (!HasBranchToSelf) return MadeChange;
+ }
+ }
+ }
+
+ // Add the branch back if the block is more than just an uncond branch.
+ TII->insertBranch(*MBB, CurTBB, nullptr, CurCond, dl);
+ }
+ }
+
+ // If the prior block doesn't fall through into this block, and if this
+ // block doesn't fall through into some other block, see if we can find a
+ // place to move this block where a fall-through will happen.
+ if (!PrevBB.canFallThrough()) {
+ // Now we know that there was no fall-through into this block, check to
+ // see if it has a fall-through into its successor.
+ bool CurFallsThru = MBB->canFallThrough();
+
+ if (!MBB->isEHPad()) {
+ // Check all the predecessors of this block. If one of them has no fall
+ // throughs, and analyzeBranch thinks it _could_ fallthrough to this
+ // block, move this block right after it.
+ for (MachineBasicBlock *PredBB : MBB->predecessors()) {
+ // Analyze the branch at the end of the pred.
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (PredBB != MBB && !PredBB->canFallThrough() &&
+ !TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true) &&
+ (PredTBB == MBB || PredFBB == MBB) &&
+ (!CurFallsThru || !CurTBB || !CurFBB) &&
+ (!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
+ // If the current block doesn't fall through, just move it.
+ // If the current block can fall through and does not end with a
+ // conditional branch, we need to append an unconditional jump to
+ // the (current) next block. To avoid a possible compile-time
+ // infinite loop, move blocks only backward in this case.
+ // Also, if there are already 2 branches here, we cannot add a third;
+ // this means we have the case
+ // Bcc next
+ // B elsewhere
+ // next:
+ if (CurFallsThru) {
+ MachineBasicBlock *NextBB = &*std::next(MBB->getIterator());
+ CurCond.clear();
+ TII->insertBranch(*MBB, NextBB, nullptr, CurCond, DebugLoc());
+ }
+ MBB->moveAfter(PredBB);
+ MadeChange = true;
+ goto ReoptimizeBlock;
+ }
+ }
+ }
+
+ if (!CurFallsThru) {
+ // Check analyzable branch-successors to see if we can move this block
+ // before one.
+ if (!CurUnAnalyzable) {
+ for (MachineBasicBlock *SuccBB : {CurFBB, CurTBB}) {
+ if (!SuccBB)
+ continue;
+ // Analyze the branch at the end of the block before the succ.
+ MachineFunction::iterator SuccPrev = --SuccBB->getIterator();
+
+ // If this block doesn't already fall-through to that successor, and
+ // if the succ doesn't already have a block that can fall through into
+ // it, we can arrange for the fallthrough to happen.
+ if (SuccBB != MBB && &*SuccPrev != MBB &&
+ !SuccPrev->canFallThrough()) {
+ MBB->moveBefore(SuccBB);
+ MadeChange = true;
+ goto ReoptimizeBlock;
+ }
+ }
+ }
+
+ // Okay, there is no really great place to put this block. If, however,
+ // the block before this one would be a fall-through if this block were
+ // removed, move this block to the end of the function. There is no real
+ // advantage in "falling through" to an EH block, so we don't want to
+ // perform this transformation for that case.
+ //
+ // Also, Windows EH introduced the possibility of an arbitrary number of
+ // successors to a given block. The analyzeBranch call does not consider
+ // exception handling and so we can get in a state where a block
+ // containing a call is followed by multiple EH blocks that would be
+ // rotated infinitely at the end of the function if the transformation
+ // below were performed for EH "FallThrough" blocks. Therefore, even if
+ // that appears not to be happening anymore, we should assume that it is
+      // possible and not remove the "!FallThrough->isEHPad()" condition below.
+ MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr;
+ SmallVector<MachineOperand, 4> PrevCond;
+ if (FallThrough != MF.end() &&
+ !FallThrough->isEHPad() &&
+ !TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
+ PrevBB.isSuccessor(&*FallThrough)) {
+ MBB->moveAfter(&MF.back());
+ MadeChange = true;
+ return MadeChange;
+ }
+ }
+ }
+
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Hoist Common Code
+//===----------------------------------------------------------------------===//
+
+bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
+ bool MadeChange = false;
+ for (MachineBasicBlock &MBB : llvm::make_early_inc_range(MF))
+ MadeChange |= HoistCommonCodeInSuccs(&MBB);
+
+ return MadeChange;
+}
+
+/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
+/// its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *TrueBB) {
+ for (MachineBasicBlock *SuccBB : BB->successors())
+ if (SuccBB != TrueBB)
+ return SuccBB;
+ return nullptr;
+}
+
+template <class Container>
+static void addRegAndItsAliases(Register Reg, const TargetRegisterInfo *TRI,
+ Container &Set) {
+ if (Reg.isPhysical()) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ Set.insert(*AI);
+ } else {
+ Set.insert(Reg);
+ }
+}
+
+/// findHoistingInsertPosAndDeps - Find the location to move common instructions
+/// in successors to. The location is usually just before the terminator,
+/// however if the terminator is a conditional branch and its previous
+/// instruction is the flag setting instruction, the previous instruction is
+/// the preferred location. This function also gathers uses and defs of the
+/// instructions from the insertion point to the end of the block. The data is
+/// used by HoistCommonCodeInSuccs to ensure safety.
+static
+MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ SmallSet<Register, 4> &Uses,
+ SmallSet<Register, 4> &Defs) {
+ MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
+ if (!TII->isUnpredicatedTerminator(*Loc))
+ return MBB->end();
+
+ for (const MachineOperand &MO : Loc->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isUse()) {
+ addRegAndItsAliases(Reg, TRI, Uses);
+ } else {
+ if (!MO.isDead())
+ // Don't try to hoist code in the rare case the terminator defines a
+ // register that is later used.
+ return MBB->end();
+
+ // If the terminator defines a register, make sure we don't hoist
+ // the instruction whose def might be clobbered by the terminator.
+ addRegAndItsAliases(Reg, TRI, Defs);
+ }
+ }
+
+ if (Uses.empty())
+ return Loc;
+ // If the terminator is the only instruction in the block and Uses is not
+ // empty (or we would have returned above), we can still safely hoist
+ // instructions just before the terminator as long as the Defs/Uses are not
+ // violated (which is checked in HoistCommonCodeInSuccs).
+ if (Loc == MBB->begin())
+ return Loc;
+
+  // The terminator is probably a conditional branch; try not to separate the
+  // branch from the condition-setting instruction.
+ MachineBasicBlock::iterator PI = prev_nodbg(Loc, MBB->begin());
+
+ bool IsDef = false;
+ for (const MachineOperand &MO : PI->operands()) {
+ // If PI has a regmask operand, it is probably a call. Separate away.
+ if (MO.isRegMask())
+ return Loc;
+ if (!MO.isReg() || MO.isUse())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (Uses.count(Reg)) {
+ IsDef = true;
+ break;
+ }
+ }
+ if (!IsDef)
+ // The condition setting instruction is not just before the conditional
+ // branch.
+ return Loc;
+
+  // Be conservative: don't insert an instruction above something that may
+  // have side effects. And since it's potentially bad to separate a
+  // flag-setting instruction from the conditional branch, just abort the
+  // optimization completely.
+  // Also avoid moving code above a predicated instruction, since it's hard to
+  // reason about register liveness with predicated instructions.
+ bool DontMoveAcrossStore = true;
+ if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
+ return MBB->end();
+
+ // Find out what registers are live. Note this routine is ignoring other live
+ // registers which are only used by instructions in successor blocks.
+ for (const MachineOperand &MO : PI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isUse()) {
+ addRegAndItsAliases(Reg, TRI, Uses);
+ } else {
+ if (Uses.erase(Reg)) {
+ if (Reg.isPhysical()) {
+ for (MCPhysReg SubReg : TRI->subregs(Reg))
+ Uses.erase(SubReg); // Use sub-registers to be conservative
+ }
+ }
+ addRegAndItsAliases(Reg, TRI, Defs);
+ }
+ }
+
+ return PI;
+}
+
+bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (TII->analyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
+ return false;
+
+ if (!FBB) FBB = findFalseBlock(MBB, TBB);
+ if (!FBB)
+ // Malformed bcc? True and false blocks are the same?
+ return false;
+
+  // Restrict the optimization to cases where MBB is the only predecessor;
+  // there it is an obvious win.
+ if (TBB->pred_size() > 1 || FBB->pred_size() > 1)
+ return false;
+
+ // Find a suitable position to hoist the common instructions to. Also figure
+ // out which registers are used or defined by instructions from the insertion
+ // point to the end of the block.
+ SmallSet<Register, 4> Uses, Defs;
+ MachineBasicBlock::iterator Loc =
+ findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
+ if (Loc == MBB->end())
+ return false;
+
+ bool HasDups = false;
+ SmallSet<Register, 4> ActiveDefsSet, AllDefsSet;
+ MachineBasicBlock::iterator TIB = TBB->begin();
+ MachineBasicBlock::iterator FIB = FBB->begin();
+ MachineBasicBlock::iterator TIE = TBB->end();
+ MachineBasicBlock::iterator FIE = FBB->end();
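+  // Walk the two successors in lockstep, identifying the longest identical
+  // prefix of instructions that is safe to hoist above the branch in MBB.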
+ while (TIB != TIE && FIB != FIE) {
+ // Skip dbg_value instructions. These do not count.
+ TIB = skipDebugInstructionsForward(TIB, TIE, false);
+ FIB = skipDebugInstructionsForward(FIB, FIE, false);
+ if (TIB == TIE || FIB == FIE)
+ break;
+
+ if (!TIB->isIdenticalTo(*FIB, MachineInstr::CheckKillDead))
+ break;
+
+ if (TII->isPredicated(*TIB))
+      // Hard to reason about register liveness with predicated instructions.
+ break;
+
+ bool IsSafe = true;
+ for (MachineOperand &MO : TIB->operands()) {
+ // Don't attempt to hoist instructions with register masks.
+ if (MO.isRegMask()) {
+ IsSafe = false;
+ break;
+ }
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef()) {
+ if (Uses.count(Reg)) {
+ // Avoid clobbering a register that's used by the instruction at
+ // the point of insertion.
+ IsSafe = false;
+ break;
+ }
+
+ if (Defs.count(Reg) && !MO.isDead()) {
+          // Don't hoist the instruction if the def would be clobbered by the
+          // instruction at the point of insertion. FIXME: This is overly
+ // conservative. It should be possible to hoist the instructions
+ // in BB2 in the following example:
+ // BB1:
+ // r1, eflag = op1 r2, r3
+ // brcc eflag
+ //
+ // BB2:
+ // r1 = op2, ...
+ // = op3, killed r1
+ IsSafe = false;
+ break;
+ }
+ } else if (!ActiveDefsSet.count(Reg)) {
+ if (Defs.count(Reg)) {
+ // Use is defined by the instruction at the point of insertion.
+ IsSafe = false;
+ break;
+ }
+
+ if (MO.isKill() && Uses.count(Reg))
+ // Kills a register that's read by the instruction at the point of
+ // insertion. Remove the kill marker.
+ MO.setIsKill(false);
+ }
+ }
+ if (!IsSafe)
+ break;
+
+ bool DontMoveAcrossStore = true;
+ if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
+ break;
+
+    // Remove kills from ActiveDefsSet; these registers had short live ranges.
+ for (const MachineOperand &MO : TIB->all_uses()) {
+ if (!MO.isKill())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (!AllDefsSet.count(Reg)) {
+ continue;
+ }
+ if (Reg.isPhysical()) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ ActiveDefsSet.erase(*AI);
+ } else {
+ ActiveDefsSet.erase(Reg);
+ }
+ }
+
+ // Track local defs so we can update liveins.
+ for (const MachineOperand &MO : TIB->all_defs()) {
+ if (MO.isDead())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg || Reg.isVirtual())
+ continue;
+ addRegAndItsAliases(Reg, TRI, ActiveDefsSet);
+ addRegAndItsAliases(Reg, TRI, AllDefsSet);
+ }
+
+ HasDups = true;
+ ++TIB;
+ ++FIB;
+ }
+
+ if (!HasDups)
+ return false;
+
+ MBB->splice(Loc, TBB, TBB->begin(), TIB);
+ FBB->erase(FBB->begin(), FIB);
+
+ if (UpdateLiveIns) {
+ recomputeLiveIns(*TBB);
+ recomputeLiveIns(*FBB);
+ }
+
+ ++NumHoist;
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.h
new file mode 100644
index 000000000000..63b2ef04b21b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/BranchFolding.h
@@ -0,0 +1,200 @@
+//===- BranchFolding.h - Fold machine code branch instructions --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_BRANCHFOLDING_H
+#define LLVM_LIB_CODEGEN_BRANCHFOLDING_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Support/Compiler.h"
+#include <vector>
+
+namespace llvm {
+
+class BasicBlock;
+class MachineBranchProbabilityInfo;
+class MachineFunction;
+class MachineLoopInfo;
+class MachineRegisterInfo;
+class MBFIWrapper;
+class ProfileSummaryInfo;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+ class LLVM_LIBRARY_VISIBILITY BranchFolder {
+ public:
+ explicit BranchFolder(bool DefaultEnableTailMerge, bool CommonHoist,
+ MBFIWrapper &FreqInfo,
+ const MachineBranchProbabilityInfo &ProbInfo,
+ ProfileSummaryInfo *PSI,
+ // Min tail length to merge. Defaults to commandline
+ // flag. Ignored for optsize.
+ unsigned MinTailLength = 0);
+
+    /// Perform branch folding, tail merging and other CFG optimizations on the
+ /// given function. Block placement changes the layout and may create new
+ /// tail merging opportunities.
+ bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri,
+ MachineLoopInfo *mli = nullptr,
+ bool AfterPlacement = false);
+
+ private:
+ class MergePotentialsElt {
+ unsigned Hash;
+ MachineBasicBlock *Block;
+
+ public:
+ MergePotentialsElt(unsigned h, MachineBasicBlock *b)
+ : Hash(h), Block(b) {}
+
+ unsigned getHash() const { return Hash; }
+ MachineBasicBlock *getBlock() const { return Block; }
+
+ void setBlock(MachineBasicBlock *MBB) {
+ Block = MBB;
+ }
+
+ bool operator<(const MergePotentialsElt &) const;
+ };
+
+ using MPIterator = std::vector<MergePotentialsElt>::iterator;
+
+ std::vector<MergePotentialsElt> MergePotentials;
+ SmallPtrSet<const MachineBasicBlock*, 2> TriedMerging;
+ DenseMap<const MachineBasicBlock *, int> EHScopeMembership;
+
+ class SameTailElt {
+ MPIterator MPIter;
+ MachineBasicBlock::iterator TailStartPos;
+
+ public:
+ SameTailElt(MPIterator mp, MachineBasicBlock::iterator tsp)
+ : MPIter(mp), TailStartPos(tsp) {}
+
+ MPIterator getMPIter() const {
+ return MPIter;
+ }
+
+ MergePotentialsElt &getMergePotentialsElt() const {
+ return *getMPIter();
+ }
+
+ MachineBasicBlock::iterator getTailStartPos() const {
+ return TailStartPos;
+ }
+
+ unsigned getHash() const {
+ return getMergePotentialsElt().getHash();
+ }
+
+ MachineBasicBlock *getBlock() const {
+ return getMergePotentialsElt().getBlock();
+ }
+
+ bool tailIsWholeBlock() const {
+ return TailStartPos == getBlock()->begin();
+ }
+
+ void setBlock(MachineBasicBlock *MBB) {
+ getMergePotentialsElt().setBlock(MBB);
+ }
+
+ void setTailStartPos(MachineBasicBlock::iterator Pos) {
+ TailStartPos = Pos;
+ }
+ };
+ std::vector<SameTailElt> SameTails;
+
+ bool AfterBlockPlacement = false;
+ bool EnableTailMerge = false;
+ bool EnableHoistCommonCode = false;
+ bool UpdateLiveIns = false;
+ unsigned MinCommonTailLength;
+ const TargetInstrInfo *TII = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ MachineLoopInfo *MLI = nullptr;
+ LivePhysRegs LiveRegs;
+
+ private:
+ MBFIWrapper &MBBFreqInfo;
+ const MachineBranchProbabilityInfo &MBPI;
+ ProfileSummaryInfo *PSI;
+
+ bool TailMergeBlocks(MachineFunction &MF);
+ bool TryTailMergeBlocks(MachineBasicBlock* SuccBB,
+ MachineBasicBlock* PredBB,
+ unsigned MinCommonTailLength);
+ void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB);
+
+ /// Delete the instruction OldInst and everything after it, replacing it
+ /// with an unconditional branch to NewDest.
+ void replaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+ MachineBasicBlock &NewDest);
+
+ /// Given a machine basic block and an iterator into it, split the MBB so
+    /// that the part before the iterator falls through into the part starting
+    /// at the iterator. This returns the new MBB.
+ MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
+ MachineBasicBlock::iterator BBI1,
+ const BasicBlock *BB);
+
+ /// Look through all the blocks in MergePotentials that have hash CurHash
+ /// (guaranteed to match the last element). Build the vector SameTails of
+ /// all those that have the (same) largest number of instructions in common
+ /// of any pair of these blocks. SameTails entries contain an iterator into
+ /// MergePotentials (from which the MachineBasicBlock can be found) and a
+ /// MachineBasicBlock::iterator into that MBB indicating the instruction
+ /// where the matching code sequence begins. Order of elements in SameTails
+ /// is the reverse of the order in which those blocks appear in
+ /// MergePotentials (where they are not necessarily consecutive).
+ unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,
+ MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB);
+
+ /// Remove all blocks with hash CurHash from MergePotentials, restoring
+ /// branches at ends of blocks as appropriate.
+ void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
+ MachineBasicBlock* PredBB);
+
+ /// None of the blocks to be tail-merged consist only of the common tail.
+ /// Create a block that does by splitting one.
+ bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+ MachineBasicBlock *SuccBB,
+ unsigned maxCommonTailLength,
+ unsigned &commonTailIndex);
+
+ /// Create merged DebugLocs of identical instructions across SameTails and
+    /// assign them to the instructions in the common tail; also merge MMOs
+    /// and undef flags.
+ void mergeCommonTails(unsigned commonTailIndex);
+
+ bool OptimizeBranches(MachineFunction &MF);
+
+ /// Analyze and optimize control flow related to the specified block. This
+ /// is never called on the entry block.
+ bool OptimizeBlock(MachineBasicBlock *MBB);
+
+ /// Remove the specified dead machine basic block from the function,
+ /// updating the CFG.
+ void RemoveDeadBlock(MachineBasicBlock *MBB);
+
+ /// Hoist common instruction sequences at the start of basic blocks to their
+ /// common predecessor.
+ bool HoistCommonCode(MachineFunction &MF);
+
+    /// If the successors of MBB have a common instruction sequence at their
+    /// start, move those instructions before MBB's terminator if it's legal.
+ bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_BRANCHFOLDING_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp
new file mode 100644
index 000000000000..05494f1ddc67
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -0,0 +1,637 @@
+//===- BranchRelaxation.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "branch-relaxation"
+
+STATISTIC(NumSplit, "Number of basic blocks split");
+STATISTIC(NumConditionalRelaxed, "Number of conditional branches relaxed");
+STATISTIC(NumUnconditionalRelaxed, "Number of unconditional branches relaxed");
+
+#define BRANCH_RELAX_NAME "Branch relaxation pass"
+
+namespace {
+
+class BranchRelaxation : public MachineFunctionPass {
+ /// BasicBlockInfo - Information about the offset and size of a single
+ /// basic block.
+ struct BasicBlockInfo {
+ /// Offset - Distance from the beginning of the function to the beginning
+ /// of this basic block.
+ ///
+ /// The offset is always aligned as required by the basic block.
+ unsigned Offset = 0;
+
+ /// Size - Size of the basic block in bytes. If the block contains
+ /// inline assembly, this is a worst case estimate.
+ ///
+ /// The size does not include any alignment padding whether from the
+ /// beginning of the block, or from an aligned jump table at the end.
+ unsigned Size = 0;
+
+ BasicBlockInfo() = default;
+
+ /// Compute the offset immediately following this block. \p MBB is the next
+ /// block.
+ unsigned postOffset(const MachineBasicBlock &MBB) const {
+ const unsigned PO = Offset + Size;
+ const Align Alignment = MBB.getAlignment();
+ const Align ParentAlign = MBB.getParent()->getAlignment();
+ if (Alignment <= ParentAlign)
+ return alignTo(PO, Alignment);
+
+ // The alignment of this MBB is larger than the function's alignment, so we
+ // can't tell whether or not it will insert nops. Assume that it will.
+ return alignTo(PO, Alignment) + Alignment.value() - ParentAlign.value();
+ }
+ };
+
+ SmallVector<BasicBlockInfo, 16> BlockInfo;
+ std::unique_ptr<RegScavenger> RS;
+ LivePhysRegs LiveRegs;
+
+ MachineFunction *MF = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+
+ bool relaxBranchInstructions();
+ void scanFunction();
+
+ MachineBasicBlock *createNewBlockAfter(MachineBasicBlock &OrigMBB);
+ MachineBasicBlock *createNewBlockAfter(MachineBasicBlock &OrigMBB,
+ const BasicBlock *BB);
+
+ MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI,
+ MachineBasicBlock *DestBB);
+ void adjustBlockOffsets(MachineBasicBlock &Start);
+ bool isBlockInRange(const MachineInstr &MI, const MachineBasicBlock &BB) const;
+
+ bool fixupConditionalBranch(MachineInstr &MI);
+ bool fixupUnconditionalBranch(MachineInstr &MI);
+ uint64_t computeBlockSize(const MachineBasicBlock &MBB) const;
+ unsigned getInstrOffset(const MachineInstr &MI) const;
+ void dumpBBs();
+ void verify();
+
+public:
+ static char ID;
+
+ BranchRelaxation() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return BRANCH_RELAX_NAME; }
+};
+
+} // end anonymous namespace
+
+char BranchRelaxation::ID = 0;
+
+char &llvm::BranchRelaxationPassID = BranchRelaxation::ID;
+
+INITIALIZE_PASS(BranchRelaxation, DEBUG_TYPE, BRANCH_RELAX_NAME, false, false)
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void BranchRelaxation::verify() {
+#ifndef NDEBUG
+ unsigned PrevNum = MF->begin()->getNumber();
+ for (MachineBasicBlock &MBB : *MF) {
+ const unsigned Num = MBB.getNumber();
+ assert(!Num || BlockInfo[PrevNum].postOffset(MBB) <= BlockInfo[Num].Offset);
+ assert(BlockInfo[Num].Size == computeBlockSize(MBB));
+ PrevNum = Num;
+ }
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineBasicBlock::iterator J = MBB.getFirstTerminator();
+ J != MBB.end(); J = std::next(J)) {
+ MachineInstr &MI = *J;
+ if (!MI.isConditionalBranch() && !MI.isUnconditionalBranch())
+ continue;
+ if (MI.getOpcode() == TargetOpcode::FAULTING_OP)
+ continue;
+ MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
+ assert(isBlockInRange(MI, *DestBB));
+ }
+ }
+#endif
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// print block size and offset information - debugging
+LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() {
+ for (auto &MBB : *MF) {
+ const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
+ dbgs() << format("%%bb.%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
+ << format("size=%#x\n", BBI.Size);
+ }
+}
+#endif
+
+/// scanFunction - Do the initial scan of the function, building up
+/// information about each block.
+void BranchRelaxation::scanFunction() {
+ BlockInfo.clear();
+ BlockInfo.resize(MF->getNumBlockIDs());
+
+ // First thing, compute the size of all basic blocks, and see if the function
+ // has any inline assembly in it. If so, we have to be conservative about
+ // alignment assumptions, as we don't know for sure the size of any
+ // instructions in the inline assembly.
+ for (MachineBasicBlock &MBB : *MF)
+ BlockInfo[MBB.getNumber()].Size = computeBlockSize(MBB);
+
+ // Compute block offsets and known bits.
+ adjustBlockOffsets(*MF->begin());
+}
+
+/// computeBlockSize - Compute the size for MBB.
+uint64_t BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) const {
+ uint64_t Size = 0;
+ for (const MachineInstr &MI : MBB)
+ Size += TII->getInstSizeInBytes(MI);
+ return Size;
+}
+
+/// getInstrOffset - Return the current offset of the specified machine
+/// instruction from the start of the function. This offset changes as stuff is
+/// moved around inside the function.
+unsigned BranchRelaxation::getInstrOffset(const MachineInstr &MI) const {
+ const MachineBasicBlock *MBB = MI.getParent();
+
+ // The offset is composed of two things: the sum of the sizes of all MBB's
+ // before this instruction's block, and the offset from the start of the block
+ // it is in.
+ unsigned Offset = BlockInfo[MBB->getNumber()].Offset;
+
+ // Sum instructions before MI in MBB.
+ for (MachineBasicBlock::const_iterator I = MBB->begin(); &*I != &MI; ++I) {
+ assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+ Offset += TII->getInstSizeInBytes(*I);
+ }
+
+ return Offset;
+}
+
+void BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) {
+ unsigned PrevNum = Start.getNumber();
+ for (auto &MBB :
+ make_range(std::next(MachineFunction::iterator(Start)), MF->end())) {
+ unsigned Num = MBB.getNumber();
+ // Get the offset and known bits at the end of the layout predecessor.
+ // Include the alignment of the current block.
+ BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(MBB);
+
+ PrevNum = Num;
+ }
+}
+
+/// Create a new empty MachineBasicBlock and insert it after \p OrigBB.
+MachineBasicBlock *
+BranchRelaxation::createNewBlockAfter(MachineBasicBlock &OrigBB) {
+ return createNewBlockAfter(OrigBB, OrigBB.getBasicBlock());
+}
+
+/// Create a new empty MachineBasicBlock with \p BB as its BasicBlock
+/// and insert it after \p OrigMBB.
+MachineBasicBlock *
+BranchRelaxation::createNewBlockAfter(MachineBasicBlock &OrigMBB,
+ const BasicBlock *BB) {
+ // Create a new MBB for the code after the OrigBB.
+ MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(++OrigMBB.getIterator(), NewBB);
+
+ // Insert an entry into BlockInfo to align it properly with the block numbers.
+ BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+ return NewBB;
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch. Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *BranchRelaxation::splitBlockBeforeInstr(MachineInstr &MI,
+ MachineBasicBlock *DestBB) {
+ MachineBasicBlock *OrigBB = MI.getParent();
+
+ // Create a new MBB for the code after the OrigBB.
+ MachineBasicBlock *NewBB =
+ MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+ MF->insert(++OrigBB->getIterator(), NewBB);
+
+ // Splice the instructions starting with MI over to NewBB.
+ NewBB->splice(NewBB->end(), OrigBB, MI.getIterator(), OrigBB->end());
+
+ // Add an unconditional branch from OrigBB to NewBB.
+ // Note the new unconditional branch is not being recorded.
+ // There doesn't seem to be meaningful DebugInfo available; this doesn't
+ // correspond to anything in the source.
+ TII->insertUnconditionalBranch(*OrigBB, NewBB, DebugLoc());
+
+ // Insert an entry into BlockInfo to align it properly with the block numbers.
+ BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+ NewBB->transferSuccessors(OrigBB);
+ OrigBB->addSuccessor(NewBB);
+ OrigBB->addSuccessor(DestBB);
+
+ // Cleanup potential unconditional branch to successor block.
+ // Note that updateTerminator may change the size of the blocks.
+ OrigBB->updateTerminator(NewBB);
+
+ // Figure out how large the OrigBB is. As the first half of the original
+ // block, it cannot contain a tablejump. The size includes
+ // the new jump we added. (It should be possible to do this without
+ // recounting everything, but it's very confusing, and this is rarely
+ // executed.)
+ BlockInfo[OrigBB->getNumber()].Size = computeBlockSize(*OrigBB);
+
+ // Figure out how large the NewBB is. As the second half of the original
+ // block, it may contain a tablejump.
+ BlockInfo[NewBB->getNumber()].Size = computeBlockSize(*NewBB);
+
+ // All BBOffsets following these blocks must be modified.
+ adjustBlockOffsets(*OrigBB);
+
+ // Need to fix live-in lists if we track liveness.
+ if (TRI->trackLivenessAfterRegAlloc(*MF))
+ computeAndAddLiveIns(LiveRegs, *NewBB);
+
+ ++NumSplit;
+
+ return NewBB;
+}
+
+/// isBlockInRange - Returns true if the distance between the specified MI and
+/// the specified BB can fit in MI's displacement field.
+bool BranchRelaxation::isBlockInRange(
+ const MachineInstr &MI, const MachineBasicBlock &DestBB) const {
+ int64_t BrOffset = getInstrOffset(MI);
+ int64_t DestOffset = BlockInfo[DestBB.getNumber()].Offset;
+
+ if (TII->isBranchOffsetInRange(MI.getOpcode(), DestOffset - BrOffset))
+ return true;
+
+ LLVM_DEBUG(dbgs() << "Out of range branch to destination "
+ << printMBBReference(DestBB) << " from "
+ << printMBBReference(*MI.getParent()) << " to "
+ << DestOffset << " offset " << DestOffset - BrOffset << '\t'
+ << MI);
+
+ return false;
+}
+
+/// fixupConditionalBranch - Fix up a conditional branch whose destination is
+/// too far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ MachineBasicBlock *NewBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+
+ auto insertUncondBranch = [&](MachineBasicBlock *MBB,
+ MachineBasicBlock *DestBB) {
+ unsigned &BBSize = BlockInfo[MBB->getNumber()].Size;
+ int NewBrSize = 0;
+ TII->insertUnconditionalBranch(*MBB, DestBB, DL, &NewBrSize);
+ BBSize += NewBrSize;
+ };
+ auto insertBranch = [&](MachineBasicBlock *MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ SmallVectorImpl<MachineOperand>& Cond) {
+ unsigned &BBSize = BlockInfo[MBB->getNumber()].Size;
+ int NewBrSize = 0;
+ TII->insertBranch(*MBB, TBB, FBB, Cond, DL, &NewBrSize);
+ BBSize += NewBrSize;
+ };
+ auto removeBranch = [&](MachineBasicBlock *MBB) {
+ unsigned &BBSize = BlockInfo[MBB->getNumber()].Size;
+ int RemovedSize = 0;
+ TII->removeBranch(*MBB, &RemovedSize);
+ BBSize -= RemovedSize;
+ };
+
+ auto finalizeBlockChanges = [&](MachineBasicBlock *MBB,
+ MachineBasicBlock *NewBB) {
+ // Keep the block offsets up to date.
+ adjustBlockOffsets(*MBB);
+
+ // Need to fix live-in lists if we track liveness.
+ if (NewBB && TRI->trackLivenessAfterRegAlloc(*MF))
+ computeAndAddLiveIns(LiveRegs, *NewBB);
+ };
+
+ bool Fail = TII->analyzeBranch(*MBB, TBB, FBB, Cond);
+ assert(!Fail && "branches to be relaxed must be analyzable");
+ (void)Fail;
+
+ // Add an unconditional branch to the destination and invert the branch
+ // condition to jump over it:
+ // tbz L1
+ // =>
+ // tbnz L2
+ // b L1
+ // L2:
+
+ bool ReversedCond = !TII->reverseBranchCondition(Cond);
+ if (ReversedCond) {
+ if (FBB && isBlockInRange(MI, *FBB)) {
+ // Last MI in the BB is an unconditional branch. We can simply invert the
+ // condition and swap destinations:
+ // beq L1
+ // b L2
+ // =>
+ // bne L2
+ // b L1
+ LLVM_DEBUG(dbgs() << " Invert condition and swap "
+ "its destination with "
+ << MBB->back());
+
+ removeBranch(MBB);
+ insertBranch(MBB, FBB, TBB, Cond);
+ finalizeBlockChanges(MBB, nullptr);
+ return true;
+ }
+ if (FBB) {
+ // We need to split the basic block here to obtain two long-range
+ // unconditional branches.
+ NewBB = createNewBlockAfter(*MBB);
+
+ insertUncondBranch(NewBB, FBB);
+ // Update the successor lists according to the transformation to follow.
+ // Do it here since if there's no split, no update is needed.
+ MBB->replaceSuccessor(FBB, NewBB);
+ NewBB->addSuccessor(FBB);
+ }
+
+ // We now have an appropriate fall-through block in place (either naturally or
+ // just created), so we can use the inverted condition.
+ MachineBasicBlock &NextBB = *std::next(MachineFunction::iterator(MBB));
+
+ LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*TBB)
+ << ", invert condition and change dest. to "
+ << printMBBReference(NextBB) << '\n');
+
+ removeBranch(MBB);
+ // Insert a new conditional branch and a new unconditional branch.
+ insertBranch(MBB, &NextBB, TBB, Cond);
+
+ finalizeBlockChanges(MBB, NewBB);
+ return true;
+ }
+ // Branch cond can't be inverted.
+ // In this case we always add a block after the MBB.
+ LLVM_DEBUG(dbgs() << " The branch condition can't be inverted. "
+ << " Insert a new BB after " << MBB->back());
+
+ if (!FBB)
+ FBB = &(*std::next(MachineFunction::iterator(MBB)));
+
+ // This is the block with cond. branch and the distance to TBB is too long.
+ // beq L1
+ // L2:
+
+ // We do the following transformation:
+ // beq NewBB
+ // b L2
+ // NewBB:
+ // b L1
+ // L2:
+
+ NewBB = createNewBlockAfter(*MBB);
+ insertUncondBranch(NewBB, TBB);
+
+ LLVM_DEBUG(dbgs() << " Insert cond B to the new BB "
+ << printMBBReference(*NewBB)
+ << " Keep the exiting condition.\n"
+ << " Insert B to " << printMBBReference(*FBB) << ".\n"
+ << " In the new BB: Insert B to "
+ << printMBBReference(*TBB) << ".\n");
+
+ // Update the successor lists according to the transformation to follow.
+ MBB->replaceSuccessor(TBB, NewBB);
+ NewBB->addSuccessor(TBB);
+
+ // Replace branch in the current (MBB) block.
+ removeBranch(MBB);
+ insertBranch(MBB, NewBB, FBB, Cond);
+
+ finalizeBlockChanges(MBB, NewBB);
+ return true;
+}
+
+bool BranchRelaxation::fixupUnconditionalBranch(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getParent();
+ SmallVector<MachineOperand, 4> Cond;
+ unsigned OldBrSize = TII->getInstSizeInBytes(MI);
+ MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
+
+ int64_t DestOffset = BlockInfo[DestBB->getNumber()].Offset;
+ int64_t SrcOffset = getInstrOffset(MI);
+
+ assert(!TII->isBranchOffsetInRange(MI.getOpcode(), DestOffset - SrcOffset));
+
+ BlockInfo[MBB->getNumber()].Size -= OldBrSize;
+
+ MachineBasicBlock *BranchBB = MBB;
+
+ // If this was an expanded conditional branch, there is already a single
+ // unconditional branch in a block.
+ if (!MBB->empty()) {
+ BranchBB = createNewBlockAfter(*MBB);
+
+ // Add live outs.
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ for (const MachineBasicBlock::RegisterMaskPair &LiveIn : Succ->liveins())
+ BranchBB->addLiveIn(LiveIn);
+ }
+
+ BranchBB->sortUniqueLiveIns();
+ BranchBB->addSuccessor(DestBB);
+ MBB->replaceSuccessor(DestBB, BranchBB);
+ }
+
+ DebugLoc DL = MI.getDebugLoc();
+ MI.eraseFromParent();
+
+ // Create the optional restore block and, initially, place it at the end of
+ // function. That block will be placed later if it's used; otherwise, it will
+ // be erased.
+ MachineBasicBlock *RestoreBB = createNewBlockAfter(MF->back(),
+ DestBB->getBasicBlock());
+
+ TII->insertIndirectBranch(*BranchBB, *DestBB, *RestoreBB, DL,
+ DestOffset - SrcOffset, RS.get());
+
+ BlockInfo[BranchBB->getNumber()].Size = computeBlockSize(*BranchBB);
+ adjustBlockOffsets(*MBB);
+
+ // If RestoreBB is required, try to place just before DestBB.
+ if (!RestoreBB->empty()) {
+ // TODO: For multiple far branches to the same destination, there are
+ // chances that some restore blocks could be shared if they clobber the
+ // same registers and share the same restore sequence. So far, those
+ // restore blocks are just duplicated for each far branch.
+ assert(!DestBB->isEntryBlock());
+ MachineBasicBlock *PrevBB = &*std::prev(DestBB->getIterator());
+ // Fall through only if PrevBB has no unconditional branch as one of its
+ // terminators.
+ if (auto *FT = PrevBB->getLogicalFallThrough()) {
+ assert(FT == DestBB);
+ TII->insertUnconditionalBranch(*PrevBB, FT, DebugLoc());
+ BlockInfo[PrevBB->getNumber()].Size = computeBlockSize(*PrevBB);
+ }
+ // Now, RestoreBB could be placed directly before DestBB.
+ MF->splice(DestBB->getIterator(), RestoreBB->getIterator());
+ // Update successors and predecessors.
+ RestoreBB->addSuccessor(DestBB);
+ BranchBB->replaceSuccessor(DestBB, RestoreBB);
+ if (TRI->trackLivenessAfterRegAlloc(*MF))
+ computeAndAddLiveIns(LiveRegs, *RestoreBB);
+ // Compute the restore block size.
+ BlockInfo[RestoreBB->getNumber()].Size = computeBlockSize(*RestoreBB);
+ // Update the offset starting from the previous block.
+ adjustBlockOffsets(*PrevBB);
+ } else {
+ // Remove restore block if it's not required.
+ MF->erase(RestoreBB);
+ }
+
+ return true;
+}
+
+bool BranchRelaxation::relaxBranchInstructions() {
+ bool Changed = false;
+
+ // Relaxing branches involves creating new basic blocks, so re-eval
+ // end() for termination.
+ for (MachineBasicBlock &MBB : *MF) {
+ // Empty block?
+ MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr();
+ if (Last == MBB.end())
+ continue;
+
+ // Expand the unconditional branch first if necessary. If there is a
+ // conditional branch, this will end up changing the branch destination of
+ // it to be over the newly inserted indirect branch block, which may avoid
+ // the need to try expanding the conditional branch first, saving an extra
+ // jump.
+ if (Last->isUnconditionalBranch()) {
+ // The unconditional branch destination might be unanalyzable; assume these
+ // are OK.
+ if (MachineBasicBlock *DestBB = TII->getBranchDestBlock(*Last)) {
+ if (!isBlockInRange(*Last, *DestBB)) {
+ fixupUnconditionalBranch(*Last);
+ ++NumUnconditionalRelaxed;
+ Changed = true;
+ }
+ }
+ }
+
+ // Loop over the conditional branches.
+ MachineBasicBlock::iterator Next;
+ for (MachineBasicBlock::iterator J = MBB.getFirstTerminator();
+ J != MBB.end(); J = Next) {
+ Next = std::next(J);
+ MachineInstr &MI = *J;
+
+ if (!MI.isConditionalBranch())
+ continue;
+
+ if (MI.getOpcode() == TargetOpcode::FAULTING_OP)
+ // FAULTING_OP's destination is not encoded in the instruction stream
+ // and thus never needs to be relaxed.
+ continue;
+
+ MachineBasicBlock *DestBB = TII->getBranchDestBlock(MI);
+ if (!isBlockInRange(MI, *DestBB)) {
+ if (Next != MBB.end() && Next->isConditionalBranch()) {
+ // If there are multiple conditional branches, this isn't an
+ // analyzable block. Split later terminators into a new block so
+ // each one will be analyzable.
+
+ splitBlockBeforeInstr(*Next, DestBB);
+ } else {
+ fixupConditionalBranch(MI);
+ ++NumConditionalRelaxed;
+ }
+
+ Changed = true;
+
+ // This may have modified all of the terminators, so start over.
+ Next = MBB.getFirstTerminator();
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+
+ LLVM_DEBUG(dbgs() << "***** BranchRelaxation *****\n");
+
+ const TargetSubtargetInfo &ST = MF->getSubtarget();
+ TII = ST.getInstrInfo();
+
+ TRI = ST.getRegisterInfo();
+ if (TRI->trackLivenessAfterRegAlloc(*MF))
+ RS.reset(new RegScavenger());
+
+ // Renumber all of the machine basic blocks in the function, guaranteeing that
+ // the numbers agree with the position of the block in the function.
+ MF->RenumberBlocks();
+
+ // Do the initial scan of the function, building up information about the
+ // sizes of each block.
+ scanFunction();
+
+ LLVM_DEBUG(dbgs() << " Basic blocks before relaxation\n"; dumpBBs(););
+
+ bool MadeChange = false;
+ while (relaxBranchInstructions())
+ MadeChange = true;
+
+ // After a while, this might be made debug-only, but it is not expensive.
+ verify();
+
+ LLVM_DEBUG(dbgs() << " Basic blocks after relaxation\n\n"; dumpBBs());
+
+ BlockInfo.clear();
+
+ return MadeChange;
+}
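
For illustration, a minimal standalone sketch of the worst-case alignment padding estimate made by BasicBlockInfo::postOffset above; the plain-integer alignment model and helper names are assumptions for this sketch, not LLVM's Align API:

#include <cassert>
#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Offset/Size describe the current block; BlockAlign/FuncAlign are the byte
// alignments (powers of two) of the next block and of the function.
static uint64_t postOffset(uint64_t Offset, uint64_t Size, uint64_t BlockAlign,
                           uint64_t FuncAlign) {
  const uint64_t PO = Offset + Size;
  if (BlockAlign <= FuncAlign)
    return alignTo(PO, BlockAlign); // Padding is exactly known.
  // The block is aligned more strictly than the function guarantees, so
  // assume the maximum possible nop padding is inserted.
  return alignTo(PO, BlockAlign) + BlockAlign - FuncAlign;
}

int main() {
  assert(postOffset(0, 6, 4, 16) == 8);       // 2 bytes of known padding.
  assert(postOffset(0, 6, 16, 4) == 16 + 12); // plus 12 bytes of worst case.
  return 0;
}
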
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/BreakFalseDeps.cpp b/contrib/llvm-project/llvm/lib/CodeGen/BreakFalseDeps.cpp
new file mode 100644
index 000000000000..618e41894b29
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -0,0 +1,305 @@
+//==- llvm/CodeGen/BreakFalseDeps.cpp - Break False Dependency Fix -*- C++ -*==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Break False Dependency pass.
+///
+/// Some instructions have false dependencies which cause unnecessary stalls.
+/// For example, instructions may write part of a register and implicitly
+/// need to read the other parts of the register. This may cause unwanted
+/// stalls preventing otherwise unrelated instructions from executing in
+/// parallel in an out-of-order CPU.
+/// This pass is aimed at identifying and avoiding these dependencies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/ReachingDefAnalysis.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegister.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+class BreakFalseDeps : public MachineFunctionPass {
+private:
+ MachineFunction *MF = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ RegisterClassInfo RegClassInfo;
+
+ /// List of undefined register reads in this block in forward order.
+ std::vector<std::pair<MachineInstr *, unsigned>> UndefReads;
+
+ /// Storage for register unit liveness.
+ LivePhysRegs LiveRegSet;
+
+ ReachingDefAnalysis *RDA = nullptr;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ BreakFalseDeps() : MachineFunctionPass(ID) {
+ initializeBreakFalseDepsPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<ReachingDefAnalysis>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ /// Process the given basic block.
+ void processBasicBlock(MachineBasicBlock *MBB);
+
+ /// Update def-ages for registers defined by MI.
+ /// Also break dependencies on partial defs and undef uses.
+ void processDefs(MachineInstr *MI);
+
+ /// Helps avoid false dependencies on undef registers by updating the
+ /// machine instructions' undef operand to use a register that the instruction
+ /// is truly dependent on, or use a register with clearance higher than Pref.
+ /// Returns true if it was able to find a true dependency, thus not requiring
+ /// a dependency breaking instruction regardless of clearance.
+ bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref);
+
+ /// Return true if it makes sense to break the dependence on a partial
+ /// def or undef use.
+ bool shouldBreakDependence(MachineInstr *, unsigned OpIdx, unsigned Pref);
+
+ /// Break false dependencies on undefined register reads.
+ /// Walk the block backward computing precise liveness. This is expensive, so
+ /// we only do it on demand. Note that the occurrence of undefined register
+ /// reads that should be broken is very rare, but when they occur we may have
+ /// many in a single block.
+ void processUndefReads(MachineBasicBlock *);
+};
+
+} // namespace llvm
+
+#define DEBUG_TYPE "break-false-deps"
+
+char BreakFalseDeps::ID = 0;
+INITIALIZE_PASS_BEGIN(BreakFalseDeps, DEBUG_TYPE, "BreakFalseDeps", false, false)
+INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis)
+INITIALIZE_PASS_END(BreakFalseDeps, DEBUG_TYPE, "BreakFalseDeps", false, false)
+
+FunctionPass *llvm::createBreakFalseDeps() { return new BreakFalseDeps(); }
+
+bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref) {
+
+ // We can't change tied operands.
+ if (MI->isRegTiedToDefOperand(OpIdx))
+ return false;
+
+ MachineOperand &MO = MI->getOperand(OpIdx);
+ assert(MO.isUndef() && "Expected undef machine operand");
+
+ // We can't change registers that aren't renamable.
+ if (!MO.isRenamable())
+ return false;
+
+ MCRegister OriginalReg = MO.getReg().asMCReg();
+
+ // Update only undef operands that have reg units that are mapped to one root.
+ for (MCRegUnit Unit : TRI->regunits(OriginalReg)) {
+ unsigned NumRoots = 0;
+ for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
+ NumRoots++;
+ if (NumRoots > 1)
+ return false;
+ }
+ }
+
+ // Get the undef operand's register class
+ const TargetRegisterClass *OpRC =
+ TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF);
+ assert(OpRC && "Not a valid register class");
+
+ // If the instruction has a true dependency, we can hide the false dependency
+ // behind it.
+ for (MachineOperand &CurrMO : MI->all_uses()) {
+ if (CurrMO.isUndef() || !OpRC->contains(CurrMO.getReg()))
+ continue;
+ // We found a true dependency - replace the undef register with the true
+ // dependency.
+ MO.setReg(CurrMO.getReg());
+ return true;
+ }
+
+ // Go over all registers in the register class and find the register with
+ // max clearance or clearance higher than Pref.
+ unsigned MaxClearance = 0;
+ unsigned MaxClearanceReg = OriginalReg;
+ ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(OpRC);
+ for (MCPhysReg Reg : Order) {
+ unsigned Clearance = RDA->getClearance(MI, Reg);
+ if (Clearance <= MaxClearance)
+ continue;
+ MaxClearance = Clearance;
+ MaxClearanceReg = Reg;
+
+ if (MaxClearance > Pref)
+ break;
+ }
+
+ // Update the operand if we found a register with better clearance.
+ if (MaxClearanceReg != OriginalReg)
+ MO.setReg(MaxClearanceReg);
+
+ return false;
+}
+
+bool BreakFalseDeps::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref) {
+ MCRegister Reg = MI->getOperand(OpIdx).getReg().asMCReg();
+ unsigned Clearance = RDA->getClearance(MI, Reg);
+ LLVM_DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+
+ if (Pref > Clearance) {
+ LLVM_DEBUG(dbgs() << ": Break dependency.\n");
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << ": OK .\n");
+ return false;
+}
+
+void BreakFalseDeps::processDefs(MachineInstr *MI) {
+ assert(!MI->isDebugInstr() && "Won't process debug values");
+
+ const MCInstrDesc &MCID = MI->getDesc();
+
+ // Break dependence on undef uses. Do this before updating LiveRegs below.
+ // This can remove a false dependence with no additional instructions.
+ for (unsigned i = MCID.getNumDefs(), e = MCID.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.getReg() || !MO.isUse() || !MO.isUndef())
+ continue;
+
+ unsigned Pref = TII->getUndefRegClearance(*MI, i, TRI);
+ if (Pref) {
+ bool HadTrueDependency = pickBestRegisterForUndef(MI, i, Pref);
+ // We don't need to bother trying to break a dependency if this
+ // instruction has a true dependency on that register through another
+ // operand - we'll have to wait for it to be available regardless.
+ if (!HadTrueDependency && shouldBreakDependence(MI, i, Pref))
+ UndefReads.push_back(std::make_pair(MI, i));
+ }
+ }
+
+ // The code below allows the target to create a new instruction to break the
+ // dependence. That opposes the goal of minimizing size, so bail out now.
+ if (MF->getFunction().hasMinSize())
+ return;
+
+ for (unsigned i = 0,
+ e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
+ i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isUse())
+ continue;
+ // Check clearance before partial register updates.
+ unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
+ if (Pref && shouldBreakDependence(MI, i, Pref))
+ TII->breakPartialRegDependency(*MI, i, TRI);
+ }
+}
+
+void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) {
+ if (UndefReads.empty())
+ return;
+
+ // The code below allows the target to create a new instruction to break the
+ // dependence. That opposes the goal of minimizing size, so bail out now.
+ if (MF->getFunction().hasMinSize())
+ return;
+
+ // Collect this block's live out register units.
+ LiveRegSet.init(*TRI);
+ // We do not need to care about pristine registers as they are just preserved
+ // but not actually used in the function.
+ LiveRegSet.addLiveOutsNoPristines(*MBB);
+
+ MachineInstr *UndefMI = UndefReads.back().first;
+ unsigned OpIdx = UndefReads.back().second;
+
+ for (MachineInstr &I : llvm::reverse(*MBB)) {
+ // Update liveness, including the current instruction's defs.
+ LiveRegSet.stepBackward(I);
+
+ if (UndefMI == &I) {
+ if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg()))
+ TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI);
+
+ UndefReads.pop_back();
+ if (UndefReads.empty())
+ return;
+
+ UndefMI = UndefReads.back().first;
+ OpIdx = UndefReads.back().second;
+ }
+ }
+}
+
+void BreakFalseDeps::processBasicBlock(MachineBasicBlock *MBB) {
+ UndefReads.clear();
+ // If this block is not done, it makes little sense to make any decisions
+ // based on clearance information. We need to make a second pass anyway,
+ // and by then we'll have better information, so we can avoid doing the work
+ // to try and break dependencies now.
+ for (MachineInstr &MI : *MBB) {
+ if (!MI.isDebugInstr())
+ processDefs(&MI);
+ }
+ processUndefReads(MBB);
+}
+
+bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(mf.getFunction()))
+ return false;
+ MF = &mf;
+ TII = MF->getSubtarget().getInstrInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ RDA = &getAnalysis<ReachingDefAnalysis>();
+
+ RegClassInfo.runOnMachineFunction(mf);
+
+ LLVM_DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n");
+
+ // Skip dead blocks, since ReachingDefAnalysis has no idea about the
+ // instructions in them.
+ df_iterator_default_set<MachineBasicBlock *> Reachable;
+ for (MachineBasicBlock *MBB : depth_first_ext(&mf, Reachable))
+ (void)MBB /* Mark all reachable blocks */;
+
+ // Traverse the basic blocks.
+ for (MachineBasicBlock &MBB : mf)
+ if (Reachable.count(&MBB))
+ processBasicBlock(&MBB);
+
+ return false;
+}
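
A rough standalone model of the clearance test applied by shouldBreakDependence above; the register-name map and instruction indices are invented for illustration and are not the ReachingDefAnalysis API:

#include <cstdio>
#include <map>
#include <string>

struct ClearanceModel {
  // Instruction index of the most recent def of each register.
  std::map<std::string, unsigned> LastDef;

  // Distance (in instructions) since Reg was last written.
  unsigned clearance(unsigned CurInstr, const std::string &Reg) const {
    auto It = LastDef.find(Reg);
    return It == LastDef.end() ? ~0u : CurInstr - It->second;
  }

  // Break the false dependency only if the preferred clearance is not met.
  bool shouldBreakDependence(unsigned CurInstr, const std::string &Reg,
                             unsigned Pref) const {
    return Pref > clearance(CurInstr, Reg);
  }
};

int main() {
  ClearanceModel M;
  M.LastDef["xmm0"] = 10; // xmm0 written at instruction 10.
  // At instruction 14 the clearance is only 4; with a preference of 16 a
  // dependency-breaking instruction (e.g. a self-xor) would be inserted.
  std::printf("break? %d\n", M.shouldBreakDependence(14, "xmm0", 16));
  return 0;
}
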
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CFGuardLongjmp.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CFGuardLongjmp.cpp
new file mode 100644
index 000000000000..c3bf93855111
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CFGuardLongjmp.cpp
@@ -0,0 +1,120 @@
+//===-- CFGuardLongjmp.cpp - Longjmp symbols for CFGuard --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains a machine function pass to insert a symbol after each
+/// call to _setjmp and store this in the MachineFunction's LongjmpTargets
+/// vector. This will be used to emit the table of valid longjmp targets used
+/// by Control Flow Guard.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "cfguard-longjmp"
+
+STATISTIC(CFGuardLongjmpTargets,
+ "Number of Control Flow Guard longjmp targets");
+
+namespace {
+
+/// MachineFunction pass to insert a symbol after each call to _setjmp and store
+/// this in the MachineFunction's LongjmpTargets vector.
+class CFGuardLongjmp : public MachineFunctionPass {
+public:
+ static char ID;
+
+ CFGuardLongjmp() : MachineFunctionPass(ID) {
+ initializeCFGuardLongjmpPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Control Flow Guard longjmp targets";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char CFGuardLongjmp::ID = 0;
+
+INITIALIZE_PASS(CFGuardLongjmp, "CFGuardLongjmp",
+ "Insert symbols at valid longjmp targets for /guard:cf", false,
+ false)
+FunctionPass *llvm::createCFGuardLongjmpPass() { return new CFGuardLongjmp(); }
+
+bool CFGuardLongjmp::runOnMachineFunction(MachineFunction &MF) {
+
+ // Skip modules for which the cfguard flag is not set.
+ if (!MF.getMMI().getModule()->getModuleFlag("cfguard"))
+ return false;
+
+ // Skip functions that do not have calls to _setjmp.
+ if (!MF.getFunction().callsFunctionThatReturnsTwice())
+ return false;
+
+ SmallVector<MachineInstr *, 8> SetjmpCalls;
+
+ // Iterate over all instructions in the function and add calls to functions
+ // that return twice to the list of targets.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+
+ // Skip instructions that are not calls.
+ if (!MI.isCall() || MI.getNumOperands() < 1)
+ continue;
+
+ // Iterate over operands to find calls to global functions.
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isGlobal())
+ continue;
+
+ auto *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F)
+ continue;
+
+ // If the instruction calls a function that returns twice, add
+ // it to the list of targets.
+ if (F->hasFnAttribute(Attribute::ReturnsTwice)) {
+ SetjmpCalls.push_back(&MI);
+ break;
+ }
+ }
+ }
+ }
+
+ if (SetjmpCalls.empty())
+ return false;
+
+ unsigned SetjmpNum = 0;
+
+ // For each possible target, create a new symbol and insert it immediately
+ // after the call to setjmp. Add this symbol to the MachineFunction's list
+ // of longjmp targets.
+ for (MachineInstr *Setjmp : SetjmpCalls) {
+ SmallString<128> SymbolName;
+ raw_svector_ostream(SymbolName) << "$cfgsj_" << MF.getName() << SetjmpNum++;
+ MCSymbol *SjSymbol = MF.getContext().getOrCreateSymbol(SymbolName);
+
+ Setjmp->setPostInstrSymbol(MF, SjSymbol);
+ MF.addLongjmpTarget(SjSymbol);
+ CFGuardLongjmpTargets++;
+ }
+
+ return true;
+}
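
A small standalone sketch of the symbol-naming scheme used in the loop above; the helper function is hypothetical and only mirrors the "$cfgsj_<function><n>" names built with raw_svector_ostream:

#include <cstdio>
#include <string>
#include <vector>

// One symbol per setjmp call site, numbered in the order the calls are found.
static std::vector<std::string>
makeLongjmpTargetSymbols(const std::string &FuncName, unsigned NumSetjmpCalls) {
  std::vector<std::string> Symbols;
  for (unsigned I = 0; I < NumSetjmpCalls; ++I)
    Symbols.push_back("$cfgsj_" + FuncName + std::to_string(I));
  return Symbols;
}

int main() {
  for (const std::string &S : makeLongjmpTargetSymbols("main", 2))
    std::puts(S.c_str()); // prints $cfgsj_main0 and $cfgsj_main1
  return 0;
}
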
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CFIFixup.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CFIFixup.cpp
new file mode 100644
index 000000000000..837dbd77d073
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CFIFixup.cpp
@@ -0,0 +1,225 @@
+//===------ CFIFixup.cpp - Insert CFI remember/restore instructions -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+
+// This pass inserts the necessary instructions to adjust for the inconsistency
+// of the call-frame information caused by final machine basic block layout.
+// The pass relies on constraints LLVM imposes on the placement of
+// save/restore points (cf. ShrinkWrap):
+// * there is a single basic block, containing the function prologue
+// * possibly multiple epilogue blocks, where each epilogue block is
+// complete and self-contained, i.e. CSR restore instructions (and the
+// corresponding CFI instructions) are not split across two or more blocks.
+// * prologue and epilogue blocks are outside of any loops
+// Thus, during execution, at the beginning and at the end of each basic block
+// the function can be in one of two states:
+// - "has a call frame", if the function has executed the prologue, and
+// has not executed any epilogue
+// - "does not have a call frame", if the function has not executed the
+// prologue, or has executed an epilogue
+// which can be computed by a single RPO traversal.
+
+// In order to accommodate backends which do not generate unwind info in
+// epilogues we compute an additional property "strong no call frame on entry",
+// which is set for the entry point of the function and for every block
+// reachable from the entry along a path that does not execute the prologue. If
+// this property holds, it takes precedence over the "has a call frame"
+// property.
+
+// From the point of view of the unwind tables, the "has/does not have call
+// frame" state at beginning of each block is determined by the state at the end
+// of the previous block, in layout order. Where these states differ, we insert
+// compensating CFI instructions, which come in two flavours:
+
+// - CFI instructions, which reset the unwind table state to the initial one.
+// This is done by a target specific hook and is expected to be trivial
+// to implement, for example it could be:
+// .cfi_def_cfa <sp>, 0
+// .cfi_same_value <rN>
+// .cfi_same_value <rN-1>
+// ...
+// where <rN> are the callee-saved registers.
+// - CFI instructions, which reset the unwind table state to the one
+// created by the function prologue. These are
+// .cfi_restore_state
+// .cfi_remember_state
+// In this case we also insert a `.cfi_remember_state` after the last CFI
+// instruction in the function prologue.
+//
+// Known limitations:
+// * the pass cannot handle an epilogue preceding the prologue in the basic
+// block layout
+// * the pass does not handle functions where SP is used as a frame pointer and
+// SP adjustments up and down are done in different basic blocks (TODO)
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CFIFixup.h"
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "cfi-fixup"
+
+char CFIFixup::ID = 0;
+
+INITIALIZE_PASS(CFIFixup, "cfi-fixup",
+ "Insert CFI remember/restore state instructions", false, false)
+FunctionPass *llvm::createCFIFixup() { return new CFIFixup(); }
+
+static bool isPrologueCFIInstruction(const MachineInstr &MI) {
+ return MI.getOpcode() == TargetOpcode::CFI_INSTRUCTION &&
+ MI.getFlag(MachineInstr::FrameSetup);
+}
+
+static bool containsPrologue(const MachineBasicBlock &MBB) {
+ return llvm::any_of(MBB.instrs(), isPrologueCFIInstruction);
+}
+
+static bool containsEpilogue(const MachineBasicBlock &MBB) {
+ return llvm::any_of(llvm::reverse(MBB), [](const auto &MI) {
+ return MI.getOpcode() == TargetOpcode::CFI_INSTRUCTION &&
+ MI.getFlag(MachineInstr::FrameDestroy);
+ });
+}
+
+bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
+ const TargetFrameLowering &TFL = *MF.getSubtarget().getFrameLowering();
+ if (!TFL.enableCFIFixup(MF))
+ return false;
+
+ const unsigned NumBlocks = MF.getNumBlockIDs();
+ if (NumBlocks < 2)
+ return false;
+
+ struct BlockFlags {
+ bool Reachable : 1;
+ bool StrongNoFrameOnEntry : 1;
+ bool HasFrameOnEntry : 1;
+ bool HasFrameOnExit : 1;
+ };
+ SmallVector<BlockFlags, 32> BlockInfo(NumBlocks, {false, false, false, false});
+ BlockInfo[0].Reachable = true;
+ BlockInfo[0].StrongNoFrameOnEntry = true;
+
+ // Compute the presence/absence of frame at each basic block.
+ MachineBasicBlock *PrologueBlock = nullptr;
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+ for (MachineBasicBlock *MBB : RPOT) {
+ BlockFlags &Info = BlockInfo[MBB->getNumber()];
+
+ // Set to true if the current block contains the prologue or the epilogue,
+ // respectively.
+ bool HasPrologue = false;
+ bool HasEpilogue = false;
+
+ if (!PrologueBlock && !Info.HasFrameOnEntry && containsPrologue(*MBB)) {
+ PrologueBlock = MBB;
+ HasPrologue = true;
+ }
+
+ if (Info.HasFrameOnEntry || HasPrologue)
+ HasEpilogue = containsEpilogue(*MBB);
+
+ // If the function has a call frame at the entry of the current block or the
+ // current block contains the prologue, then the function has a call frame
+ // at the exit of the block, unless the block contains the epilogue.
+ Info.HasFrameOnExit = (Info.HasFrameOnEntry || HasPrologue) && !HasEpilogue;
+
+ // Set the successors' state on entry.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ BlockFlags &SuccInfo = BlockInfo[Succ->getNumber()];
+ SuccInfo.Reachable = true;
+ SuccInfo.StrongNoFrameOnEntry |=
+ Info.StrongNoFrameOnEntry && !HasPrologue;
+ SuccInfo.HasFrameOnEntry = Info.HasFrameOnExit;
+ }
+ }
+
+ if (!PrologueBlock)
+ return false;
+
+ // Walk the blocks of the function in "physical" order.
+ // Every block inherits the frame state (as recorded in the unwind tables)
+ // of the previous block. If the intended frame state is different, insert
+ // compensating CFI instructions.
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ bool Change = false;
+ // `InsertPt` always points to the point in a preceding block where we have to
+ // insert a `.cfi_remember_state`, in the case that the current block needs a
+ // `.cfi_restore_state`.
+ MachineBasicBlock *InsertMBB = PrologueBlock;
+ MachineBasicBlock::iterator InsertPt = PrologueBlock->begin();
+ for (MachineInstr &MI : *PrologueBlock)
+ if (isPrologueCFIInstruction(MI))
+ InsertPt = std::next(MI.getIterator());
+
+ assert(InsertPt != PrologueBlock->begin() &&
+ "Inconsistent notion of \"prologue block\"");
+
+ // No point starting before the prologue block.
+ // TODO: the unwind tables will still be incorrect if an epilogue physically
+// precedes the prologue.
+ MachineFunction::iterator CurrBB = std::next(PrologueBlock->getIterator());
+ bool HasFrame = BlockInfo[PrologueBlock->getNumber()].HasFrameOnExit;
+ while (CurrBB != MF.end()) {
+ const BlockFlags &Info = BlockInfo[CurrBB->getNumber()];
+ if (!Info.Reachable) {
+ ++CurrBB;
+ continue;
+ }
+
+#ifndef NDEBUG
+ if (!Info.StrongNoFrameOnEntry) {
+ for (auto *Pred : CurrBB->predecessors()) {
+ BlockFlags &PredInfo = BlockInfo[Pred->getNumber()];
+ assert((!PredInfo.Reachable ||
+ Info.HasFrameOnEntry == PredInfo.HasFrameOnExit) &&
+ "Inconsistent call frame state");
+ }
+ }
+#endif
+ if (!Info.StrongNoFrameOnEntry && Info.HasFrameOnEntry && !HasFrame) {
+ // Reset to the "after prologue" state.
+
+ // Insert a `.cfi_remember_state` into the last block known to have a
+ // stack frame.
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createRememberState(nullptr));
+ BuildMI(*InsertMBB, InsertPt, DebugLoc(),
+ TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ // Insert a `.cfi_restore_state` at the beginning of the current block.
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestoreState(nullptr));
+ InsertPt = BuildMI(*CurrBB, CurrBB->begin(), DebugLoc(),
+ TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ ++InsertPt;
+ InsertMBB = &*CurrBB;
+ Change = true;
+ } else if ((Info.StrongNoFrameOnEntry || !Info.HasFrameOnEntry) &&
+ HasFrame) {
+ // Reset to the state upon function entry.
+ TFL.resetCFIToInitialState(*CurrBB);
+ Change = true;
+ }
+
+ HasFrame = Info.HasFrameOnExit;
+ ++CurrBB;
+ }
+
+ return Change;
+}
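
A standalone sketch of the layout walk at the end of CFIFixup::runOnMachineFunction above; the three-block layout and the BlockState struct are assumptions for illustration (the StrongNoFrameOnEntry refinement is omitted), not the pass's own data structures:

#include <cstdio>
#include <vector>

struct BlockState {
  bool HasFrameOnEntry; // Intended state, from the RPO dataflow.
  bool HasFrameOnExit;
};

int main() {
  // Layout: prologue block, an epilogue block, then a cold block that still
  // expects the frame (e.g. sunk error-handling code).
  std::vector<BlockState> Layout = {{false, true}, {true, false}, {true, true}};
  bool HasFrame = Layout.front().HasFrameOnExit;
  for (unsigned I = 1; I < Layout.size(); ++I) {
    const BlockState &B = Layout[I];
    if (B.HasFrameOnEntry && !HasFrame)
      std::printf("block %u: insert .cfi_restore_state (and a matching "
                  ".cfi_remember_state in an earlier frame block)\n", I);
    else if (!B.HasFrameOnEntry && HasFrame)
      std::printf("block %u: reset CFI to the initial (no-frame) state\n", I);
    HasFrame = B.HasFrameOnExit;
  }
  return 0;
}
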
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CFIInstrInserter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CFIInstrInserter.cpp
new file mode 100644
index 000000000000..6a024287f002
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CFIInstrInserter.cpp
@@ -0,0 +1,449 @@
+//===------ CFIInstrInserter.cpp - Insert additional CFI instructions -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass verifies incoming and outgoing CFA information of basic
+/// blocks. CFA information describes the offset and register set by CFI
+/// directives, valid at the start and end of a basic block. This pass checks
+/// that outgoing information of predecessors matches incoming information of
+/// their successors. Then it checks if blocks have correct CFA calculation rule
+/// set and inserts additional CFI instructions at their beginnings if they
+/// don't. CFI instructions are inserted if basic blocks have incorrect offset
+/// or register set by previous blocks, as a result of a non-linear layout of
+/// blocks in a function.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCDwarf.h"
+using namespace llvm;
+
+static cl::opt<bool> VerifyCFI("verify-cfiinstrs",
+ cl::desc("Verify Call Frame Information instructions"),
+ cl::init(false),
+ cl::Hidden);
+
+namespace {
+class CFIInstrInserter : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ CFIInstrInserter() : MachineFunctionPass(ID) {
+ initializeCFIInstrInserterPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (!MF.needsFrameMoves())
+ return false;
+
+ MBBVector.resize(MF.getNumBlockIDs());
+ calculateCFAInfo(MF);
+
+ if (VerifyCFI) {
+ if (unsigned ErrorNum = verify(MF))
+ report_fatal_error("Found " + Twine(ErrorNum) +
+ " in/out CFI information errors.");
+ }
+ bool insertedCFI = insertCFIInstrs(MF);
+ MBBVector.clear();
+ return insertedCFI;
+ }
+
+ private:
+ struct MBBCFAInfo {
+ MachineBasicBlock *MBB;
+ /// Value of cfa offset valid at basic block entry.
+ int IncomingCFAOffset = -1;
+ /// Value of cfa offset valid at basic block exit.
+ int OutgoingCFAOffset = -1;
+ /// Value of cfa register valid at basic block entry.
+ unsigned IncomingCFARegister = 0;
+ /// Value of cfa register valid at basic block exit.
+ unsigned OutgoingCFARegister = 0;
+ /// Set of callee saved registers saved at basic block entry.
+ BitVector IncomingCSRSaved;
+ /// Set of callee saved registers saved at basic block exit.
+ BitVector OutgoingCSRSaved;
+ /// If in/out cfa offset and register values for this block have already
+ /// been set or not.
+ bool Processed = false;
+ };
+
+#define INVALID_REG UINT_MAX
+#define INVALID_OFFSET INT_MAX
+ /// Contains the location where a CSR register is saved.
+ struct CSRSavedLocation {
+ CSRSavedLocation(std::optional<unsigned> R, std::optional<int> O)
+ : Reg(R), Offset(O) {}
+ std::optional<unsigned> Reg;
+ std::optional<int> Offset;
+ };
+
+ /// Contains cfa offset and register values valid at entry and exit of basic
+ /// blocks.
+ std::vector<MBBCFAInfo> MBBVector;
+
+ /// Map the callee save registers to the locations where they are saved.
+ SmallDenseMap<unsigned, CSRSavedLocation, 16> CSRLocMap;
+
+ /// Calculate cfa offset and register values valid at entry and exit for all
+ /// basic blocks in a function.
+ void calculateCFAInfo(MachineFunction &MF);
+ /// Calculate cfa offset and register values valid at basic block exit by
+ /// checking the block for CFI instructions. Block's incoming CFA info remains
+ /// the same.
+ void calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo);
+ /// Update in/out cfa offset and register values for successors of the basic
+ /// block.
+ void updateSuccCFAInfo(MBBCFAInfo &MBBInfo);
+
+ /// Check if incoming CFA information of a basic block matches outgoing CFA
+ /// information of the previous block. If it doesn't, insert CFI instruction
+ /// at the beginning of the block that corrects the CFA calculation rule for
+ /// that block.
+ bool insertCFIInstrs(MachineFunction &MF);
+ /// Return the cfa offset value that should be set at the beginning of a MBB
+ /// if needed. The negated value is needed when creating CFI instructions that
+ /// set absolute offset.
+ int getCorrectCFAOffset(MachineBasicBlock *MBB) {
+ return MBBVector[MBB->getNumber()].IncomingCFAOffset;
+ }
+
+ void reportCFAError(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ);
+ void reportCSRError(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ);
+ /// Go through each MBB in a function and check that outgoing offset and
+ /// register of its predecessors match incoming offset and register of that
+ /// MBB, as well as that incoming offset and register of its successors match
+ /// outgoing offset and register of the MBB.
+ unsigned verify(MachineFunction &MF);
+};
+} // namespace
+
+char CFIInstrInserter::ID = 0;
+INITIALIZE_PASS(CFIInstrInserter, "cfi-instr-inserter",
+ "Check CFA info and insert CFI instructions if needed", false,
+ false)
+FunctionPass *llvm::createCFIInstrInserter() { return new CFIInstrInserter(); }
+
+void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ // Initial CFA offset value i.e. the one valid at the beginning of the
+ // function.
+ int InitialOffset =
+ MF.getSubtarget().getFrameLowering()->getInitialCFAOffset(MF);
+ // Initial CFA register value i.e. the one valid at the beginning of the
+ // function.
+ Register InitialRegister =
+ MF.getSubtarget().getFrameLowering()->getInitialCFARegister(MF);
+ InitialRegister = TRI.getDwarfRegNum(InitialRegister, true);
+ unsigned NumRegs = TRI.getNumRegs();
+
+ // Initialize MBBMap.
+ for (MachineBasicBlock &MBB : MF) {
+ MBBCFAInfo &MBBInfo = MBBVector[MBB.getNumber()];
+ MBBInfo.MBB = &MBB;
+ MBBInfo.IncomingCFAOffset = InitialOffset;
+ MBBInfo.OutgoingCFAOffset = InitialOffset;
+ MBBInfo.IncomingCFARegister = InitialRegister;
+ MBBInfo.OutgoingCFARegister = InitialRegister;
+ MBBInfo.IncomingCSRSaved.resize(NumRegs);
+ MBBInfo.OutgoingCSRSaved.resize(NumRegs);
+ }
+ CSRLocMap.clear();
+
+ // Set in/out cfa info for all blocks in the function. This traversal is based
+ // on the assumption that the first block in the function is the entry block
+ // i.e. that it has initial cfa offset and register values as incoming CFA
+ // information.
+ updateSuccCFAInfo(MBBVector[MF.front().getNumber()]);
+}
+
+void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
+ // Outgoing cfa offset set by the block.
+ int SetOffset = MBBInfo.IncomingCFAOffset;
+ // Outgoing cfa register set by the block.
+ unsigned SetRegister = MBBInfo.IncomingCFARegister;
+ MachineFunction *MF = MBBInfo.MBB->getParent();
+ const std::vector<MCCFIInstruction> &Instrs = MF->getFrameInstructions();
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ unsigned NumRegs = TRI.getNumRegs();
+ BitVector CSRSaved(NumRegs), CSRRestored(NumRegs);
+
+ // Determine cfa offset and register set by the block.
+ for (MachineInstr &MI : *MBBInfo.MBB) {
+ if (MI.isCFIInstruction()) {
+ std::optional<unsigned> CSRReg;
+ std::optional<int> CSROffset;
+ unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
+ const MCCFIInstruction &CFI = Instrs[CFIIndex];
+ switch (CFI.getOperation()) {
+ case MCCFIInstruction::OpDefCfaRegister:
+ SetRegister = CFI.getRegister();
+ break;
+ case MCCFIInstruction::OpDefCfaOffset:
+ SetOffset = CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpAdjustCfaOffset:
+ SetOffset += CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfa:
+ SetRegister = CFI.getRegister();
+ SetOffset = CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpOffset:
+ CSROffset = CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpRegister:
+ CSRReg = CFI.getRegister2();
+ break;
+ case MCCFIInstruction::OpRelOffset:
+ CSROffset = CFI.getOffset() - SetOffset;
+ break;
+ case MCCFIInstruction::OpRestore:
+ CSRRestored.set(CFI.getRegister());
+ break;
+ case MCCFIInstruction::OpLLVMDefAspaceCfa:
+ // TODO: Add support for handling cfi_def_aspace_cfa.
+#ifndef NDEBUG
+ report_fatal_error(
+ "Support for cfi_llvm_def_aspace_cfa not implemented! Value of CFA "
+ "may be incorrect!\n");
+#endif
+ break;
+ case MCCFIInstruction::OpRememberState:
+ // TODO: Add support for handling cfi_remember_state.
+#ifndef NDEBUG
+ report_fatal_error(
+ "Support for cfi_remember_state not implemented! Value of CFA "
+ "may be incorrect!\n");
+#endif
+ break;
+ case MCCFIInstruction::OpRestoreState:
+ // TODO: Add support for handling cfi_restore_state.
+#ifndef NDEBUG
+ report_fatal_error(
+ "Support for cfi_restore_state not implemented! Value of CFA may "
+ "be incorrect!\n");
+#endif
+ break;
+ // Other CFI directives do not affect CFA value.
+ case MCCFIInstruction::OpUndefined:
+ case MCCFIInstruction::OpSameValue:
+ case MCCFIInstruction::OpEscape:
+ case MCCFIInstruction::OpWindowSave:
+ case MCCFIInstruction::OpNegateRAState:
+ case MCCFIInstruction::OpGnuArgsSize:
+ break;
+ }
+ if (CSRReg || CSROffset) {
+ auto It = CSRLocMap.find(CFI.getRegister());
+ if (It == CSRLocMap.end()) {
+ CSRLocMap.insert(
+ {CFI.getRegister(), CSRSavedLocation(CSRReg, CSROffset)});
+ } else if (It->second.Reg != CSRReg || It->second.Offset != CSROffset) {
+ llvm_unreachable("Different saved locations for the same CSR");
+ }
+ CSRSaved.set(CFI.getRegister());
+ }
+ }
+ }
+
+ MBBInfo.Processed = true;
+
+ // Update outgoing CFA info.
+ MBBInfo.OutgoingCFAOffset = SetOffset;
+ MBBInfo.OutgoingCFARegister = SetRegister;
+
+ // Update outgoing CSR info.
+ BitVector::apply([](auto x, auto y, auto z) { return (x | y) & ~z; },
+ MBBInfo.OutgoingCSRSaved, MBBInfo.IncomingCSRSaved, CSRSaved,
+ CSRRestored);
+}
+
+void CFIInstrInserter::updateSuccCFAInfo(MBBCFAInfo &MBBInfo) {
+ SmallVector<MachineBasicBlock *, 4> Stack;
+ Stack.push_back(MBBInfo.MBB);
+
+ do {
+ MachineBasicBlock *Current = Stack.pop_back_val();
+ MBBCFAInfo &CurrentInfo = MBBVector[Current->getNumber()];
+ calculateOutgoingCFAInfo(CurrentInfo);
+ for (auto *Succ : CurrentInfo.MBB->successors()) {
+ MBBCFAInfo &SuccInfo = MBBVector[Succ->getNumber()];
+ if (!SuccInfo.Processed) {
+ SuccInfo.IncomingCFAOffset = CurrentInfo.OutgoingCFAOffset;
+ SuccInfo.IncomingCFARegister = CurrentInfo.OutgoingCFARegister;
+ SuccInfo.IncomingCSRSaved = CurrentInfo.OutgoingCSRSaved;
+ Stack.push_back(Succ);
+ }
+ }
+ } while (!Stack.empty());
+}
+
+bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) {
+ const MBBCFAInfo *PrevMBBInfo = &MBBVector[MF.front().getNumber()];
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ bool InsertedCFIInstr = false;
+
+ BitVector SetDifference;
+ for (MachineBasicBlock &MBB : MF) {
+ // Skip the first MBB in a function
+ if (MBB.getNumber() == MF.front().getNumber()) continue;
+
+ const MBBCFAInfo &MBBInfo = MBBVector[MBB.getNumber()];
+ auto MBBI = MBBInfo.MBB->begin();
+ DebugLoc DL = MBBInfo.MBB->findDebugLoc(MBBI);
+
+ // If the current MBB will be placed in a unique section, a full DefCfa
+ // must be emitted.
+ const bool ForceFullCFA = MBB.isBeginSection();
+
+ if ((PrevMBBInfo->OutgoingCFAOffset != MBBInfo.IncomingCFAOffset &&
+ PrevMBBInfo->OutgoingCFARegister != MBBInfo.IncomingCFARegister) ||
+ ForceFullCFA) {
+ // If both outgoing offset and register of a previous block don't match
+ // incoming offset and register of this block, or if this block begins a
+ // section, add a def_cfa instruction with the correct offset and
+ // register for this block.
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
+ nullptr, MBBInfo.IncomingCFARegister, getCorrectCFAOffset(&MBB)));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ } else if (PrevMBBInfo->OutgoingCFAOffset != MBBInfo.IncomingCFAOffset) {
+ // If outgoing offset of a previous block doesn't match incoming offset
+ // of this block, add a def_cfa_offset instruction with the correct
+ // offset for this block.
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(
+ nullptr, getCorrectCFAOffset(&MBB)));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ } else if (PrevMBBInfo->OutgoingCFARegister !=
+ MBBInfo.IncomingCFARegister) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, MBBInfo.IncomingCFARegister));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ }
+
+ if (ForceFullCFA) {
+ MF.getSubtarget().getFrameLowering()->emitCalleeSavedFrameMovesFullCFA(
+ *MBBInfo.MBB, MBBI);
+ InsertedCFIInstr = true;
+ PrevMBBInfo = &MBBInfo;
+ continue;
+ }
+
+ BitVector::apply([](auto x, auto y) { return x & ~y; }, SetDifference,
+ PrevMBBInfo->OutgoingCSRSaved, MBBInfo.IncomingCSRSaved);
+ for (int Reg : SetDifference.set_bits()) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, Reg));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ }
+
+ BitVector::apply([](auto x, auto y) { return x & ~y; }, SetDifference,
+ MBBInfo.IncomingCSRSaved, PrevMBBInfo->OutgoingCSRSaved);
+ for (int Reg : SetDifference.set_bits()) {
+ auto it = CSRLocMap.find(Reg);
+ assert(it != CSRLocMap.end() && "Reg should have an entry in CSRLocMap");
+ unsigned CFIIndex;
+ CSRSavedLocation RO = it->second;
+ if (!RO.Reg && RO.Offset) {
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, *RO.Offset));
+ } else if (RO.Reg && !RO.Offset) {
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createRegister(nullptr, Reg, *RO.Reg));
+ } else {
+ llvm_unreachable("RO.Reg and RO.Offset cannot both be valid/invalid");
+ }
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ }
+
+ PrevMBBInfo = &MBBInfo;
+ }
+ return InsertedCFIInstr;
+}
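The chain of branches above reduces to a small decision table; this is only a
restatement of the logic already implemented, not additional behavior:

    // difference vs. predecessor's outgoing state | directive emitted
    // --------------------------------------------+-------------------------------
    // offset and register both differ             | .cfi_def_cfa <reg>, <offset>
    // block begins a new section (ForceFullCFA)   | .cfi_def_cfa + full CSR moves
    // only the offset differs                     | .cfi_def_cfa_offset <offset>
    // only the register differs                   | .cfi_def_cfa_register <reg>
    // CSR saved before but not incoming here      | .cfi_restore <reg>
    // CSR incoming here but not saved before      | .cfi_offset or .cfi_register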
+
+void CFIInstrInserter::reportCFAError(const MBBCFAInfo &Pred,
+ const MBBCFAInfo &Succ) {
+ errs() << "*** Inconsistent CFA register and/or offset between pred and succ "
+ "***\n";
+ errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber()
+ << " in " << Pred.MBB->getParent()->getName()
+ << " outgoing CFA Reg:" << Pred.OutgoingCFARegister << "\n";
+ errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber()
+ << " in " << Pred.MBB->getParent()->getName()
+ << " outgoing CFA Offset:" << Pred.OutgoingCFAOffset << "\n";
+ errs() << "Succ: " << Succ.MBB->getName() << " #" << Succ.MBB->getNumber()
+ << " incoming CFA Reg:" << Succ.IncomingCFARegister << "\n";
+ errs() << "Succ: " << Succ.MBB->getName() << " #" << Succ.MBB->getNumber()
+ << " incoming CFA Offset:" << Succ.IncomingCFAOffset << "\n";
+}
+
+void CFIInstrInserter::reportCSRError(const MBBCFAInfo &Pred,
+ const MBBCFAInfo &Succ) {
+ errs() << "*** Inconsistent CSR Saved between pred and succ in function "
+ << Pred.MBB->getParent()->getName() << " ***\n";
+ errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber()
+ << " outgoing CSR Saved: ";
+ for (int Reg : Pred.OutgoingCSRSaved.set_bits())
+ errs() << Reg << " ";
+ errs() << "\n";
+ errs() << "Succ: " << Succ.MBB->getName() << " #" << Succ.MBB->getNumber()
+ << " incoming CSR Saved: ";
+ for (int Reg : Succ.IncomingCSRSaved.set_bits())
+ errs() << Reg << " ";
+ errs() << "\n";
+}
+
+unsigned CFIInstrInserter::verify(MachineFunction &MF) {
+ unsigned ErrorNum = 0;
+ for (auto *CurrMBB : depth_first(&MF)) {
+ const MBBCFAInfo &CurrMBBInfo = MBBVector[CurrMBB->getNumber()];
+ for (MachineBasicBlock *Succ : CurrMBB->successors()) {
+ const MBBCFAInfo &SuccMBBInfo = MBBVector[Succ->getNumber()];
+ // Check that incoming offset and register values of successors match the
+ // outgoing offset and register values of CurrMBB
+ if (SuccMBBInfo.IncomingCFAOffset != CurrMBBInfo.OutgoingCFAOffset ||
+ SuccMBBInfo.IncomingCFARegister != CurrMBBInfo.OutgoingCFARegister) {
+ // Inconsistent offsets/registers are ok for 'noreturn' blocks because
+ // we don't generate epilogues inside such blocks.
+ if (SuccMBBInfo.MBB->succ_empty() && !SuccMBBInfo.MBB->isReturnBlock())
+ continue;
+ reportCFAError(CurrMBBInfo, SuccMBBInfo);
+ ErrorNum++;
+ }
+ // Check that IncomingCSRSaved of every successor matches the
+ // OutgoingCSRSaved of CurrMBB
+ if (SuccMBBInfo.IncomingCSRSaved != CurrMBBInfo.OutgoingCSRSaved) {
+ reportCSRError(CurrMBBInfo, SuccMBBInfo);
+ ErrorNum++;
+ }
+ }
+ }
+ return ErrorNum;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CalcSpillWeights.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CalcSpillWeights.cpp
new file mode 100644
index 000000000000..5a005ba7b414
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -0,0 +1,323 @@
+//===- CalcSpillWeights.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "calcspillweights"
+
+void VirtRegAuxInfo::calculateSpillWeightsAndHints() {
+ LLVM_DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ calculateSpillWeightAndHint(LIS.getInterval(Reg));
+ }
+}
+
+// Return the preferred allocation register for reg, given a COPY instruction.
+Register VirtRegAuxInfo::copyHint(const MachineInstr *MI, unsigned Reg,
+ const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) {
+ unsigned Sub, HSub;
+ Register HReg;
+ if (MI->getOperand(0).getReg() == Reg) {
+ Sub = MI->getOperand(0).getSubReg();
+ HReg = MI->getOperand(1).getReg();
+ HSub = MI->getOperand(1).getSubReg();
+ } else {
+ Sub = MI->getOperand(1).getSubReg();
+ HReg = MI->getOperand(0).getReg();
+ HSub = MI->getOperand(0).getSubReg();
+ }
+
+ if (!HReg)
+ return 0;
+
+ if (HReg.isVirtual())
+ return Sub == HSub ? HReg : Register();
+
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ MCRegister CopiedPReg = HSub ? TRI.getSubReg(HReg, HSub) : HReg.asMCReg();
+ if (RC->contains(CopiedPReg))
+ return CopiedPReg;
+
+ // Check if reg:sub matches so that a super register could be hinted.
+ if (Sub)
+ return TRI.getMatchingSuperReg(CopiedPReg, Sub, RC);
+
+ return 0;
+}
+
+// Check if all values in LI are rematerializable
+bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI,
+ const LiveIntervals &LIS,
+ const VirtRegMap &VRM,
+ const TargetInstrInfo &TII) {
+ Register Reg = LI.reg();
+ Register Original = VRM.getOriginal(Reg);
+ for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+ I != E; ++I) {
+ const VNInfo *VNI = *I;
+ if (VNI->isUnused())
+ continue;
+ if (VNI->isPHIDef())
+ return false;
+
+ MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+ assert(MI && "Dead valno in interval");
+
+ // Trace copies introduced by live range splitting. The inline
+ // spiller can rematerialize through these copies, so the spill
+ // weight must reflect this.
+ while (MI->isFullCopy()) {
+ // The copy destination must match the interval register.
+ if (MI->getOperand(0).getReg() != Reg)
+ return false;
+
+ // Get the source register.
+ Reg = MI->getOperand(1).getReg();
+
+      // If the original (pre-splitting) registers match, this copy came
+      // from a split; otherwise give up.
+ if (!Reg.isVirtual() || VRM.getOriginal(Reg) != Original)
+ return false;
+
+ // Follow the copy live-in value.
+ const LiveInterval &SrcLI = LIS.getInterval(Reg);
+ LiveQueryResult SrcQ = SrcLI.Query(VNI->def);
+ VNI = SrcQ.valueIn();
+ assert(VNI && "Copy from non-existing value");
+ if (VNI->isPHIDef())
+ return false;
+ MI = LIS.getInstructionFromIndex(VNI->def);
+ assert(MI && "Dead valno in interval");
+ }
+
+ if (!TII.isTriviallyReMaterializable(*MI))
+ return false;
+ }
+ return true;
+}
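A toy restatement of the copy-chain walk above, over a deliberately simplified
value representation (the struct and its fields are hypothetical, not LLVM
types), showing why every full copy must lead back to a rematerializable def
of the same original register:

    struct ToyDef {
      bool IsFullCopy;        // introduced by live range splitting
      bool IsRematerializable;
      const ToyDef *CopySrc;  // defining value of the copy source, if any
    };

    bool allCopiesLeadToRemat(const ToyDef *D) {
      while (D && D->IsFullCopy)  // trace copies introduced by splitting
        D = D->CopySrc;           // follow the copy's live-in value
      return D && D->IsRematerializable;
    }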
+
+bool VirtRegAuxInfo::isLiveAtStatepointVarArg(LiveInterval &LI) {
+ return any_of(VRM.getRegInfo().reg_operands(LI.reg()),
+ [](MachineOperand &MO) {
+ MachineInstr *MI = MO.getParent();
+ if (MI->getOpcode() != TargetOpcode::STATEPOINT)
+ return false;
+ return StatepointOpers(MI).getVarIdx() <= MO.getOperandNo();
+ });
+}
+
+void VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &LI) {
+ float Weight = weightCalcHelper(LI);
+ // Check if unspillable.
+ if (Weight < 0)
+ return;
+ LI.setWeight(Weight);
+}
+
+float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
+ SlotIndex *End) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ MachineBasicBlock *MBB = nullptr;
+ MachineLoop *Loop = nullptr;
+ bool IsExiting = false;
+ float TotalWeight = 0;
+ unsigned NumInstr = 0; // Number of instructions using LI
+ SmallPtrSet<MachineInstr *, 8> Visited;
+
+ std::pair<unsigned, Register> TargetHint = MRI.getRegAllocationHint(LI.reg());
+
+ if (LI.isSpillable()) {
+ Register Reg = LI.reg();
+ Register Original = VRM.getOriginal(Reg);
+ const LiveInterval &OrigInt = LIS.getInterval(Original);
+ // li comes from a split of OrigInt. If OrigInt was marked
+ // as not spillable, make sure the new interval is marked
+ // as not spillable as well.
+ if (!OrigInt.isSpillable())
+ LI.markNotSpillable();
+ }
+
+ // Don't recompute spill weight for an unspillable register.
+ bool IsSpillable = LI.isSpillable();
+
+ bool IsLocalSplitArtifact = Start && End;
+
+ // Do not update future local split artifacts.
+ bool ShouldUpdateLI = !IsLocalSplitArtifact;
+
+ if (IsLocalSplitArtifact) {
+ MachineBasicBlock *LocalMBB = LIS.getMBBFromIndex(*End);
+ assert(LocalMBB == LIS.getMBBFromIndex(*Start) &&
+ "start and end are expected to be in the same basic block");
+
+ // Local split artifact will have 2 additional copy instructions and they
+ // will be in the same BB.
+ // localLI = COPY other
+ // ...
+ // other = COPY localLI
+ TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB);
+ TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB);
+
+ NumInstr += 2;
+ }
+
+ // CopyHint is a sortable hint derived from a COPY instruction.
+ struct CopyHint {
+ const Register Reg;
+ const float Weight;
+ CopyHint(Register R, float W) : Reg(R), Weight(W) {}
+ bool operator<(const CopyHint &Rhs) const {
+ // Always prefer any physreg hint.
+ if (Reg.isPhysical() != Rhs.Reg.isPhysical())
+ return Reg.isPhysical();
+ if (Weight != Rhs.Weight)
+ return (Weight > Rhs.Weight);
+ return Reg.id() < Rhs.Reg.id(); // Tie-breaker.
+ }
+ };
+
+ std::set<CopyHint> CopyHints;
+ DenseMap<unsigned, float> Hint;
+ for (MachineRegisterInfo::reg_instr_nodbg_iterator
+ I = MRI.reg_instr_nodbg_begin(LI.reg()),
+ E = MRI.reg_instr_nodbg_end();
+ I != E;) {
+ MachineInstr *MI = &*(I++);
+
+ // For local split artifacts, we are interested only in instructions between
+ // the expected start and end of the range.
+ SlotIndex SI = LIS.getInstructionIndex(*MI);
+ if (IsLocalSplitArtifact && ((SI < *Start) || (SI > *End)))
+ continue;
+
+ NumInstr++;
+ if (MI->isIdentityCopy() || MI->isImplicitDef())
+ continue;
+ if (!Visited.insert(MI).second)
+ continue;
+
+ // For terminators that produce values, ask the backend if the register is
+ // not spillable.
+ if (TII.isUnspillableTerminator(MI) && MI->definesRegister(LI.reg())) {
+ LI.markNotSpillable();
+ return -1.0f;
+ }
+
+ float Weight = 1.0f;
+ if (IsSpillable) {
+ // Get loop info for mi.
+ if (MI->getParent() != MBB) {
+ MBB = MI->getParent();
+ Loop = Loops.getLoopFor(MBB);
+ IsExiting = Loop ? Loop->isLoopExiting(MBB) : false;
+ }
+
+ // Calculate instr weight.
+ bool Reads, Writes;
+ std::tie(Reads, Writes) = MI->readsWritesVirtualRegister(LI.reg());
+ Weight = LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI);
+
+ // Give extra weight to what looks like a loop induction variable update.
+ if (Writes && IsExiting && LIS.isLiveOutOfMBB(LI, MBB))
+ Weight *= 3;
+
+ TotalWeight += Weight;
+ }
+
+ // Get allocation hints from copies.
+ if (!MI->isCopy())
+ continue;
+ Register HintReg = copyHint(MI, LI.reg(), TRI, MRI);
+ if (!HintReg)
+ continue;
+ // Force hweight onto the stack so that x86 doesn't add hidden precision,
+ // making the comparison incorrectly pass (i.e., 1 > 1 == true??).
+ //
+ // FIXME: we probably shouldn't use floats at all.
+ volatile float HWeight = Hint[HintReg] += Weight;
+ if (HintReg.isVirtual() || MRI.isAllocatable(HintReg))
+ CopyHints.insert(CopyHint(HintReg, HWeight));
+ }
+
+ // Pass all the sorted copy hints to mri.
+ if (ShouldUpdateLI && CopyHints.size()) {
+ // Remove a generic hint if previously added by target.
+ if (TargetHint.first == 0 && TargetHint.second)
+ MRI.clearSimpleHint(LI.reg());
+
+ SmallSet<Register, 4> HintedRegs;
+ for (const auto &Hint : CopyHints) {
+ if (!HintedRegs.insert(Hint.Reg).second ||
+ (TargetHint.first != 0 && Hint.Reg == TargetHint.second))
+ // Don't add the same reg twice or the target-type hint again.
+ continue;
+ MRI.addRegAllocationHint(LI.reg(), Hint.Reg);
+ }
+
+ // Weakly boost the spill weight of hinted registers.
+ TotalWeight *= 1.01F;
+ }
+
+ // If the live interval was already unspillable, leave it that way.
+ if (!IsSpillable)
+ return -1.0;
+
+  // Mark LI as unspillable if all live ranges are tiny and the interval is
+  // not live at any reg mask. If the interval is live at a reg mask, spilling
+  // may be required. If LI is used by a STATEPOINT instruction, spilling may
+  // also be required: marking such an interval as unspillable would risk
+  // leaving us with no register to allocate. At the same time, a STATEPOINT
+  // is perfectly fine with having this operand on the stack, so spilling such
+  // an interval and folding its load from the stack into the instruction
+  // itself makes perfect sense.
+ if (ShouldUpdateLI && LI.isZeroLength(LIS.getSlotIndexes()) &&
+ !LI.isLiveAtIndexes(LIS.getRegMaskSlots()) &&
+ !isLiveAtStatepointVarArg(LI)) {
+ LI.markNotSpillable();
+ return -1.0;
+ }
+
+ // If all of the definitions of the interval are re-materializable,
+ // it is a preferred candidate for spilling.
+ // FIXME: this gets much more complicated once we support non-trivial
+ // re-materialization.
+ if (isRematerializable(LI, LIS, VRM, *MF.getSubtarget().getInstrInfo()))
+ TotalWeight *= 0.5F;
+
+ if (IsLocalSplitArtifact)
+ return normalize(TotalWeight, Start->distance(*End), NumInstr);
+ return normalize(TotalWeight, LI.getSize(), NumInstr);
+}
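The final normalize() step converts the accumulated use/def frequency into a
density, so that a short, hot interval outranks a long, rarely used one. A
rough self-contained sketch of that idea (the exact formula used by LLVM's
normalize() hook may differ):

    float normalizedWeightSketch(float TotalWeight, unsigned RangeSize) {
      // Weight per unit of live-range length; guard against an empty range.
      return RangeSize ? TotalWeight / static_cast<float>(RangeSize)
                       : TotalWeight;
    }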
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CallBrPrepare.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CallBrPrepare.cpp
new file mode 100644
index 000000000000..db243a0bfebe
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CallBrPrepare.cpp
@@ -0,0 +1,231 @@
+//===-- CallBrPrepare - Prepare callbr for code generation ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers callbrs in LLVM IR in order to assist SelectionDAG's
+// codegen.
+//
+// In particular, this pass assists in inserting register copies for the output
+// values of a callbr along the edges leading to the indirect target blocks.
+// Though the output SSA value is defined by the callbr instruction itself in
+// the IR representation, the value cannot be copied to the appropriate virtual
+// registers prior to jumping to an indirect label, since the jump occurs
+// within the user-provided assembly blob.
+//
+// Instead, those copies must occur separately at the beginning of each
+// indirect target. That requires that we create a separate SSA definition in
+// each of them (via llvm.callbr.landingpad), and may require splitting
+// critical edges so we have a location to place the intrinsic. Finally, we
+// remap users of the original callbr output SSA value to instead point to the
+// appropriate llvm.callbr.landingpad value.
+//
+// Ideally, this could be done inside SelectionDAG, or in the
+// MachineInstruction representation, without the use of an IR-level intrinsic.
+// But, within the current framework, it's simpler to implement as an IR pass.
+// (If support for callbr in GlobalISel is implemented, it's worth considering
+// whether this is still required.)
+//
+//===----------------------------------------------------------------------===//
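A rough sketch of the shape of the rewrite described above, in IR form (labels,
asm constraints, and the exact intrinsic name mangling are illustrative only):

    //   %ret = callbr i32 asm "...", "=r,!i"()
    //             to label %fallthrough [label %indirect]
    //   ...
    // indirect:
    //   %landed = call i32 @llvm.callbr.landingpad.i32(i32 %ret)
    //   ; uses of %ret that are reached through %indirect are rewritten to
    //   ; use %landed instead.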
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "callbrprepare"
+
+namespace {
+
+class CallBrPrepare : public FunctionPass {
+ bool SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs, DominatorTree &DT);
+ bool InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs,
+ DominatorTree &DT) const;
+ void UpdateSSA(DominatorTree &DT, CallBrInst *CBR, CallInst *Intrinsic,
+ SSAUpdater &SSAUpdate) const;
+
+public:
+ CallBrPrepare() : FunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &Fn) override;
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char CallBrPrepare::ID = 0;
+INITIALIZE_PASS_BEGIN(CallBrPrepare, DEBUG_TYPE, "Prepare callbr", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(CallBrPrepare, DEBUG_TYPE, "Prepare callbr", false, false)
+
+FunctionPass *llvm::createCallBrPass() { return new CallBrPrepare(); }
+
+void CallBrPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+static SmallVector<CallBrInst *, 2> FindCallBrs(Function &Fn) {
+ SmallVector<CallBrInst *, 2> CBRs;
+ for (BasicBlock &BB : Fn)
+ if (auto *CBR = dyn_cast<CallBrInst>(BB.getTerminator()))
+ if (!CBR->getType()->isVoidTy() && !CBR->use_empty())
+ CBRs.push_back(CBR);
+ return CBRs;
+}
+
+bool CallBrPrepare::SplitCriticalEdges(ArrayRef<CallBrInst *> CBRs,
+ DominatorTree &DT) {
+ bool Changed = false;
+ CriticalEdgeSplittingOptions Options(&DT);
+ Options.setMergeIdenticalEdges();
+
+  // The indirect destination might be duplicated in another label operand...
+  //   %0 = callbr ... [label %x, label %x]
+  // ...hence MergeIdenticalEdges and AllowIdenticalEdges, but we don't need
+  // to split the default destination if it's duplicated as an indirect
+  // destination...
+  //   %1 = callbr ... to label %x [label %x]
+  // ...hence starting at 1 and checking against successor 0 (aka the default
+  // destination).
+ for (CallBrInst *CBR : CBRs)
+ for (unsigned i = 1, e = CBR->getNumSuccessors(); i != e; ++i)
+ if (CBR->getSuccessor(i) == CBR->getSuccessor(0) ||
+ isCriticalEdge(CBR, i, /*AllowIdenticalEdges*/ true))
+ if (SplitKnownCriticalEdge(CBR, i, Options))
+ Changed = true;
+ return Changed;
+}
+
+bool CallBrPrepare::InsertIntrinsicCalls(ArrayRef<CallBrInst *> CBRs,
+ DominatorTree &DT) const {
+ bool Changed = false;
+ SmallPtrSet<const BasicBlock *, 4> Visited;
+ IRBuilder<> Builder(CBRs[0]->getContext());
+ for (CallBrInst *CBR : CBRs) {
+ if (!CBR->getNumIndirectDests())
+ continue;
+
+ SSAUpdater SSAUpdate;
+ SSAUpdate.Initialize(CBR->getType(), CBR->getName());
+ SSAUpdate.AddAvailableValue(CBR->getParent(), CBR);
+ SSAUpdate.AddAvailableValue(CBR->getDefaultDest(), CBR);
+
+ for (BasicBlock *IndDest : CBR->getIndirectDests()) {
+ if (!Visited.insert(IndDest).second)
+ continue;
+ Builder.SetInsertPoint(&*IndDest->begin());
+ CallInst *Intrinsic = Builder.CreateIntrinsic(
+ CBR->getType(), Intrinsic::callbr_landingpad, {CBR});
+ SSAUpdate.AddAvailableValue(IndDest, Intrinsic);
+ UpdateSSA(DT, CBR, Intrinsic, SSAUpdate);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+static bool IsInSameBasicBlock(const Use &U, const BasicBlock *BB) {
+ const auto *I = dyn_cast<Instruction>(U.getUser());
+ return I && I->getParent() == BB;
+}
+
+#ifndef NDEBUG
+static void PrintDebugDomInfo(const DominatorTree &DT, const Use &U,
+ const BasicBlock *BB, bool IsDefaultDest) {
+ if (!isa<Instruction>(U.getUser()))
+ return;
+ LLVM_DEBUG(dbgs() << "Use: " << *U.getUser() << ", in block "
+ << cast<Instruction>(U.getUser())->getParent()->getName()
+ << ", is " << (DT.dominates(BB, U) ? "" : "NOT ")
+ << "dominated by " << BB->getName() << " ("
+ << (IsDefaultDest ? "in" : "") << "direct)\n");
+}
+#endif
+
+void CallBrPrepare::UpdateSSA(DominatorTree &DT, CallBrInst *CBR,
+ CallInst *Intrinsic,
+ SSAUpdater &SSAUpdate) const {
+
+ SmallPtrSet<Use *, 4> Visited;
+ BasicBlock *DefaultDest = CBR->getDefaultDest();
+ BasicBlock *LandingPad = Intrinsic->getParent();
+
+ SmallVector<Use *, 4> Uses(make_pointer_range(CBR->uses()));
+ for (Use *U : Uses) {
+ if (!Visited.insert(U).second)
+ continue;
+
+#ifndef NDEBUG
+ PrintDebugDomInfo(DT, *U, LandingPad, /*IsDefaultDest*/ false);
+ PrintDebugDomInfo(DT, *U, DefaultDest, /*IsDefaultDest*/ true);
+#endif
+
+ // Don't rewrite the use in the newly inserted intrinsic.
+ if (const auto *II = dyn_cast<IntrinsicInst>(U->getUser()))
+ if (II->getIntrinsicID() == Intrinsic::callbr_landingpad)
+ continue;
+
+ // If the Use is in the same BasicBlock as the Intrinsic call, replace
+ // the Use with the value of the Intrinsic call.
+ if (IsInSameBasicBlock(*U, LandingPad)) {
+ U->set(Intrinsic);
+ continue;
+ }
+
+ // If the Use is dominated by the default dest, do not touch it.
+ if (DT.dominates(DefaultDest, *U))
+ continue;
+
+ SSAUpdate.RewriteUse(*U);
+ }
+}
+
+bool CallBrPrepare::runOnFunction(Function &Fn) {
+ bool Changed = false;
+ SmallVector<CallBrInst *, 2> CBRs = FindCallBrs(Fn);
+
+ if (CBRs.empty())
+ return Changed;
+
+ // It's highly likely that most programs do not contain CallBrInsts. Follow a
+  // pattern similar to SafeStackLegacyPass::runOnFunction to reuse a previous
+  // domtree analysis if available, otherwise compute it lazily. This avoids
+ // forcing Dominator Tree Construction at -O0 for programs that likely do not
+ // contain CallBrInsts. It does pessimize programs with callbr at higher
+ // optimization levels, as the DominatorTree created here is not reused by
+ // subsequent passes.
+ DominatorTree *DT;
+ std::optional<DominatorTree> LazilyComputedDomTree;
+ if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DT = &DTWP->getDomTree();
+ else {
+ LazilyComputedDomTree.emplace(Fn);
+ DT = &*LazilyComputedDomTree;
+ }
+
+ if (SplitCriticalEdges(CBRs, *DT))
+ Changed = true;
+
+ if (InsertIntrinsicCalls(CBRs, *DT))
+ Changed = true;
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CallingConvLower.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CallingConvLower.cpp
new file mode 100644
index 000000000000..b7152587a9fa
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -0,0 +1,292 @@
+//===-- CallingConvLower.cpp - Calling Conventions ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CCState class, used for lowering and implementing
+// calling conventions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+CCState::CCState(CallingConv::ID CC, bool IsVarArg, MachineFunction &MF,
+ SmallVectorImpl<CCValAssign> &Locs, LLVMContext &Context,
+ bool NegativeOffsets)
+ : CallingConv(CC), IsVarArg(IsVarArg), MF(MF),
+ TRI(*MF.getSubtarget().getRegisterInfo()), Locs(Locs), Context(Context),
+ NegativeOffsets(NegativeOffsets) {
+
+ // No stack is used.
+ StackSize = 0;
+
+ clearByValRegsInfo();
+ UsedRegs.resize((TRI.getNumRegs()+31)/32);
+}
+
+/// Allocate space on the stack large enough to pass an argument by value.
+/// The size and alignment information of the argument is encoded in
+/// its parameter attribute.
+void CCState::HandleByVal(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, int MinSize,
+ Align MinAlign, ISD::ArgFlagsTy ArgFlags) {
+ Align Alignment = ArgFlags.getNonZeroByValAlign();
+ unsigned Size = ArgFlags.getByValSize();
+ if (MinSize > (int)Size)
+ Size = MinSize;
+ if (MinAlign > Alignment)
+ Alignment = MinAlign;
+ ensureMaxAlignment(Alignment);
+ MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Alignment);
+ Size = unsigned(alignTo(Size, MinAlign));
+ uint64_t Offset = AllocateStack(Size, Alignment);
+ addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+}
+
+/// Mark a register and all of its aliases as allocated.
+void CCState::MarkAllocated(MCPhysReg Reg) {
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+ UsedRegs[*AI / 32] |= 1 << (*AI & 31);
+}
+
+void CCState::MarkUnallocated(MCPhysReg Reg) {
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+ UsedRegs[*AI / 32] &= ~(1 << (*AI & 31));
+}
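MarkAllocated and MarkUnallocated pack one bit per physical register (and its
aliases) into the 32-bit words of UsedRegs. A self-contained sketch of the same
indexing, with a plain std::vector standing in for the member:

    #include <cstdint>
    #include <vector>

    bool isRegMarked(const std::vector<uint32_t> &Words, unsigned Reg) {
      // Register Reg lives in word Reg / 32, at bit position Reg % 32.
      return (Words[Reg / 32] >> (Reg & 31)) & 1u;
    }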
+
+bool CCState::IsShadowAllocatedReg(MCRegister Reg) const {
+ if (!isAllocated(Reg))
+ return false;
+
+ for (auto const &ValAssign : Locs)
+ if (ValAssign.isRegLoc() && TRI.regsOverlap(ValAssign.getLocReg(), Reg))
+ return false;
+ return true;
+}
+
+/// Analyze an array of argument values,
+/// incorporating info about the formals into this state.
+void
+CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
+ unsigned NumArgs = Ins.size();
+
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = Ins[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this))
+ report_fatal_error("unable to allocate function argument #" + Twine(i));
+ }
+}
+
+/// Analyze the return values of a function, returning true if the return can
+/// be performed without sret-demotion and false otherwise.
+bool CCState::CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ // Determine which register each value should be copied into.
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ MVT VT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this))
+ return false;
+ }
+ return true;
+}
+
+/// Analyze the returned values of a return,
+/// incorporating info about the result values into this state.
+void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ // Determine which register each value should be copied into.
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ MVT VT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this))
+ report_fatal_error("unable to allocate function return #" + Twine(i));
+ }
+}
+
+/// Analyze the outgoing arguments to a call,
+/// incorporating info about the passed values into this state.
+void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ unsigned NumOps = Outs.size();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MVT ArgVT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call operand #" << i << " has unhandled type "
+ << ArgVT << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+}
+
+/// Same as above except it takes vectors of types and argument flags.
+void CCState::AnalyzeCallOperands(SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+ CCAssignFn Fn) {
+ unsigned NumOps = ArgVTs.size();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MVT ArgVT = ArgVTs[i];
+ ISD::ArgFlagsTy ArgFlags = Flags[i];
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call operand #" << i << " has unhandled type "
+ << ArgVT << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+}
+
+/// Analyze the return values of a call, incorporating info about the passed
+/// values into this state.
+void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ MVT VT = Ins[i].VT;
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call result #" << i << " has unhandled type "
+ << VT << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+}
+
+/// Same as above except it's specialized for calls that produce a single value.
+void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) {
+ if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call result has unhandled type "
+ << VT << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+}
+
+void CCState::ensureMaxAlignment(Align Alignment) {
+ if (!AnalyzingMustTailForwardedRegs)
+ MF.getFrameInfo().ensureMaxAlignment(Alignment);
+}
+
+static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) {
+ if (VT.isVector())
+ return true; // Assume -msse-regparm might be in effect.
+ if (!VT.isInteger())
+ return false;
+ return (CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall);
+}
+
+void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
+ MVT VT, CCAssignFn Fn) {
+ uint64_t SavedStackSize = StackSize;
+ Align SavedMaxStackArgAlign = MaxStackArgAlign;
+ unsigned NumLocs = Locs.size();
+
+ // Set the 'inreg' flag if it is used for this calling convention.
+ ISD::ArgFlagsTy Flags;
+ if (isValueTypeInRegForCC(CallingConv, VT))
+ Flags.setInReg();
+
+ // Allocate something of this value type repeatedly until we get assigned a
+ // location in memory.
+ bool HaveRegParm;
+ do {
+ if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call has unhandled type " << VT
+ << " while computing remaining regparms\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+ HaveRegParm = Locs.back().isRegLoc();
+ } while (HaveRegParm);
+
+ // Copy all the registers from the value locations we added.
+ assert(NumLocs < Locs.size() && "CC assignment failed to add location");
+ for (unsigned I = NumLocs, E = Locs.size(); I != E; ++I)
+ if (Locs[I].isRegLoc())
+ Regs.push_back(MCPhysReg(Locs[I].getLocReg()));
+
+ // Clear the assigned values and stack memory. We leave the registers marked
+ // as allocated so that future queries don't return the same registers, i.e.
+ // when i64 and f64 are both passed in GPRs.
+ StackSize = SavedStackSize;
+ MaxStackArgAlign = SavedMaxStackArgAlign;
+ Locs.truncate(NumLocs);
+}
+
+void CCState::analyzeMustTailForwardedRegisters(
+ SmallVectorImpl<ForwardedRegister> &Forwards, ArrayRef<MVT> RegParmTypes,
+ CCAssignFn Fn) {
+  // Oftentimes calling conventions will not use register parameters for
+ // variadic functions, so we need to assume we're not variadic so that we get
+ // all the registers that might be used in a non-variadic call.
+ SaveAndRestore SavedVarArg(IsVarArg, false);
+ SaveAndRestore SavedMustTail(AnalyzingMustTailForwardedRegs, true);
+
+ for (MVT RegVT : RegParmTypes) {
+ SmallVector<MCPhysReg, 8> RemainingRegs;
+ getRemainingRegParmsForType(RemainingRegs, RegVT, Fn);
+ const TargetLowering *TL = MF.getSubtarget().getTargetLowering();
+ const TargetRegisterClass *RC = TL->getRegClassFor(RegVT);
+ for (MCPhysReg PReg : RemainingRegs) {
+ Register VReg = MF.addLiveIn(PReg, RC);
+ Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT));
+ }
+ }
+}
+
+bool CCState::resultsCompatible(CallingConv::ID CalleeCC,
+ CallingConv::ID CallerCC, MachineFunction &MF,
+ LLVMContext &C,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn CalleeFn, CCAssignFn CallerFn) {
+ if (CalleeCC == CallerCC)
+ return true;
+ SmallVector<CCValAssign, 4> RVLocs1;
+ CCState CCInfo1(CalleeCC, false, MF, RVLocs1, C);
+ CCInfo1.AnalyzeCallResult(Ins, CalleeFn);
+
+ SmallVector<CCValAssign, 4> RVLocs2;
+ CCState CCInfo2(CallerCC, false, MF, RVLocs2, C);
+ CCInfo2.AnalyzeCallResult(Ins, CallerFn);
+
+ auto AreCompatible = [](const CCValAssign &Loc1, const CCValAssign &Loc2) {
+ assert(!Loc1.isPendingLoc() && !Loc2.isPendingLoc() &&
+ "The location must have been decided by now");
+ // Must fill the same part of their locations.
+ if (Loc1.getLocInfo() != Loc2.getLocInfo())
+ return false;
+ // Must both be in the same registers, or both in memory at the same offset.
+ if (Loc1.isRegLoc() && Loc2.isRegLoc())
+ return Loc1.getLocReg() == Loc2.getLocReg();
+ if (Loc1.isMemLoc() && Loc2.isMemLoc())
+ return Loc1.getLocMemOffset() == Loc2.getLocMemOffset();
+ llvm_unreachable("Unknown location kind");
+ };
+
+ return std::equal(RVLocs1.begin(), RVLocs1.end(), RVLocs2.begin(),
+ RVLocs2.end(), AreCompatible);
+}
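These Analyze* entry points are normally driven from a target's call lowering.
A minimal usage sketch, where CC_MyTarget stands in for a hypothetical
target-generated CCAssignFn (the other calls match signatures defined in this
file):

    // SmallVector<CCValAssign, 16> ArgLocs;
    // CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, Context);
    // CCInfo.AnalyzeFormalArguments(Ins, CC_MyTarget);
    // for (const CCValAssign &VA : ArgLocs) {
    //   if (VA.isRegLoc())
    //     ... copy the incoming argument out of VA.getLocReg() ...
    //   else
    //     ... load it from the stack slot at VA.getLocMemOffset() ...
    // }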
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp
new file mode 100644
index 000000000000..6272b654b329
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CodeGen.cpp
@@ -0,0 +1,143 @@
+//===-- CodeGen.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common initialization routines for the
+// CodeGen library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// initializeCodeGen - Initialize all passes linked into the CodeGen library.
+void llvm::initializeCodeGen(PassRegistry &Registry) {
+ initializeAssignmentTrackingAnalysisPass(Registry);
+ initializeAtomicExpandPass(Registry);
+ initializeBasicBlockSectionsPass(Registry);
+ initializeBranchFolderPassPass(Registry);
+ initializeBranchRelaxationPass(Registry);
+ initializeBreakFalseDepsPass(Registry);
+ initializeCallBrPreparePass(Registry);
+ initializeCFGuardLongjmpPass(Registry);
+ initializeCFIFixupPass(Registry);
+ initializeCFIInstrInserterPass(Registry);
+ initializeCheckDebugMachineModulePass(Registry);
+ initializeCodeGenPreparePass(Registry);
+ initializeDeadMachineInstructionElimPass(Registry);
+ initializeDebugifyMachineModulePass(Registry);
+ initializeDetectDeadLanesPass(Registry);
+ initializeDwarfEHPrepareLegacyPassPass(Registry);
+ initializeEarlyIfConverterPass(Registry);
+ initializeEarlyIfPredicatorPass(Registry);
+ initializeEarlyMachineLICMPass(Registry);
+ initializeEarlyTailDuplicatePass(Registry);
+ initializeExpandLargeDivRemLegacyPassPass(Registry);
+ initializeExpandLargeFpConvertLegacyPassPass(Registry);
+ initializeExpandMemCmpPassPass(Registry);
+ initializeExpandPostRAPass(Registry);
+ initializeFEntryInserterPass(Registry);
+ initializeFinalizeISelPass(Registry);
+ initializeFinalizeMachineBundlesPass(Registry);
+ initializeFixupStatepointCallerSavedPass(Registry);
+ initializeFuncletLayoutPass(Registry);
+ initializeGCMachineCodeAnalysisPass(Registry);
+ initializeGCModuleInfoPass(Registry);
+ initializeHardwareLoopsLegacyPass(Registry);
+ initializeIfConverterPass(Registry);
+ initializeImplicitNullChecksPass(Registry);
+ initializeIndirectBrExpandPassPass(Registry);
+ initializeInterleavedLoadCombinePass(Registry);
+ initializeInterleavedAccessPass(Registry);
+ initializeJMCInstrumenterPass(Registry);
+ initializeLiveDebugValuesPass(Registry);
+ initializeLiveDebugVariablesPass(Registry);
+ initializeLiveIntervalsPass(Registry);
+ initializeLiveRangeShrinkPass(Registry);
+ initializeLiveStacksPass(Registry);
+ initializeLiveVariablesPass(Registry);
+ initializeLocalStackSlotPassPass(Registry);
+ initializeLowerGlobalDtorsLegacyPassPass(Registry);
+ initializeLowerIntrinsicsPass(Registry);
+ initializeMIRAddFSDiscriminatorsPass(Registry);
+ initializeMIRCanonicalizerPass(Registry);
+ initializeMIRNamerPass(Registry);
+ initializeMIRProfileLoaderPassPass(Registry);
+ initializeMachineBlockFrequencyInfoPass(Registry);
+ initializeMachineBlockPlacementPass(Registry);
+ initializeMachineBlockPlacementStatsPass(Registry);
+ initializeMachineCFGPrinterPass(Registry);
+ initializeMachineCSEPass(Registry);
+ initializeMachineCombinerPass(Registry);
+ initializeMachineCopyPropagationPass(Registry);
+ initializeMachineCycleInfoPrinterPassPass(Registry);
+ initializeMachineCycleInfoWrapperPassPass(Registry);
+ initializeMachineDominatorTreePass(Registry);
+ initializeMachineFunctionPrinterPassPass(Registry);
+ initializeMachineLateInstrsCleanupPass(Registry);
+ initializeMachineLICMPass(Registry);
+ initializeMachineLoopInfoPass(Registry);
+ initializeMachineModuleInfoWrapperPassPass(Registry);
+ initializeMachineOptimizationRemarkEmitterPassPass(Registry);
+ initializeMachineOutlinerPass(Registry);
+ initializeMachinePipelinerPass(Registry);
+ initializeMachineSanitizerBinaryMetadataPass(Registry);
+ initializeModuloScheduleTestPass(Registry);
+ initializeMachinePostDominatorTreePass(Registry);
+ initializeMachineRegionInfoPassPass(Registry);
+ initializeMachineSchedulerPass(Registry);
+ initializeMachineSinkingPass(Registry);
+ initializeMachineUniformityAnalysisPassPass(Registry);
+ initializeMachineUniformityInfoPrinterPassPass(Registry);
+ initializeMachineVerifierPassPass(Registry);
+ initializeObjCARCContractLegacyPassPass(Registry);
+ initializeOptimizePHIsPass(Registry);
+ initializePEIPass(Registry);
+ initializePHIEliminationPass(Registry);
+ initializePatchableFunctionPass(Registry);
+ initializePeepholeOptimizerPass(Registry);
+ initializePostMachineSchedulerPass(Registry);
+ initializePostRAHazardRecognizerPass(Registry);
+ initializePostRAMachineSinkingPass(Registry);
+ initializePostRASchedulerPass(Registry);
+ initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
+ initializeProcessImplicitDefsPass(Registry);
+ initializeRABasicPass(Registry);
+ initializeRAGreedyPass(Registry);
+ initializeRegAllocFastPass(Registry);
+ initializeRegUsageInfoCollectorPass(Registry);
+ initializeRegUsageInfoPropagationPass(Registry);
+ initializeRegisterCoalescerPass(Registry);
+ initializeRemoveRedundantDebugValuesPass(Registry);
+ initializeRenameIndependentSubregsPass(Registry);
+ initializeSafeStackLegacyPassPass(Registry);
+ initializeSelectOptimizePass(Registry);
+ initializeShadowStackGCLoweringPass(Registry);
+ initializeShrinkWrapPass(Registry);
+ initializeSjLjEHPreparePass(Registry);
+ initializeSlotIndexesPass(Registry);
+ initializeStackColoringPass(Registry);
+ initializeStackFrameLayoutAnalysisPassPass(Registry);
+ initializeStackMapLivenessPass(Registry);
+ initializeStackProtectorPass(Registry);
+ initializeStackSlotColoringPass(Registry);
+ initializeStripDebugMachineModulePass(Registry);
+ initializeTailDuplicatePass(Registry);
+ initializeTargetPassConfigPass(Registry);
+ initializeTwoAddressInstructionPassPass(Registry);
+ initializeTypePromotionLegacyPass(Registry);
+ initializeUnpackMachineBundlesPass(Registry);
+ initializeUnreachableBlockElimLegacyPassPass(Registry);
+ initializeUnreachableMachineBlockElimPass(Registry);
+ initializeVirtRegMapPass(Registry);
+ initializeVirtRegRewriterPass(Registry);
+ initializeWasmEHPreparePass(Registry);
+ initializeWinEHPreparePass(Registry);
+ initializeXRayInstrumentationPass(Registry);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CodeGenCommonISel.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CodeGenCommonISel.cpp
new file mode 100644
index 000000000000..577c5dbc8e2d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CodeGenCommonISel.cpp
@@ -0,0 +1,293 @@
+//===-- CodeGenCommonISel.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines common utilities that are shared between SelectionDAG and
+// GlobalISel frameworks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CodeGenCommonISel.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+
+#define DEBUG_TYPE "codegen-common"
+
+using namespace llvm;
+
+/// Add a successor MBB to ParentMBB, creating a new MachineBasicBlock for BB
+/// if SuccMBB is null.
+MachineBasicBlock *
+StackProtectorDescriptor::addSuccessorMBB(
+ const BasicBlock *BB, MachineBasicBlock *ParentMBB, bool IsLikely,
+ MachineBasicBlock *SuccMBB) {
+ // If SuccBB has not been created yet, create it.
+ if (!SuccMBB) {
+ MachineFunction *MF = ParentMBB->getParent();
+ MachineFunction::iterator BBI(ParentMBB);
+ SuccMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(++BBI, SuccMBB);
+ }
+ // Add it as a successor of ParentMBB.
+ ParentMBB->addSuccessor(
+ SuccMBB, BranchProbabilityInfo::getBranchProbStackProtector(IsLikely));
+ return SuccMBB;
+}
+
+/// Given that the input MI is before a partial terminator sequence TSeq, return
+/// true if MI + TSeq is also a partial terminator sequence.
+///
+/// A terminator sequence is a sequence of MachineInstrs which at this point in
+/// lowering copy vregs into physical registers, which are then passed into
+/// terminator instructions so we can satisfy ABI constraints. A partial
+/// terminator sequence is an improper subset of a terminator sequence (i.e. it
+/// may be the whole terminator sequence).
+static bool MIIsInTerminatorSequence(const MachineInstr &MI) {
+ // If we do not have a copy or an implicit def, we return true if and only if
+ // MI is a debug value.
+ if (!MI.isCopy() && !MI.isImplicitDef()) {
+    // Sometimes DBG_VALUE MIs sneak in between the copies from the vregs to the
+ // physical registers if there is debug info associated with the terminator
+ // of our mbb. We want to include said debug info in our terminator
+ // sequence, so we return true in that case.
+ if (MI.isDebugInstr())
+ return true;
+
+ // For GlobalISel, we may have extension instructions for arguments within
+ // copy sequences. Allow these.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_MERGE_VALUES:
+ case TargetOpcode::G_UNMERGE_VALUES:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_EXTRACT:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ // We have left the terminator sequence if we are not doing one of the
+ // following:
+ //
+ // 1. Copying a vreg into a physical register.
+ // 2. Copying a vreg into a vreg.
+ // 3. Defining a register via an implicit def.
+
+ // OPI should always be a register definition...
+ MachineInstr::const_mop_iterator OPI = MI.operands_begin();
+ if (!OPI->isReg() || !OPI->isDef())
+ return false;
+
+ // Defining any register via an implicit def is always ok.
+ if (MI.isImplicitDef())
+ return true;
+
+ // Grab the copy source...
+ MachineInstr::const_mop_iterator OPI2 = OPI;
+ ++OPI2;
+ assert(OPI2 != MI.operands_end()
+ && "Should have a copy implying we should have 2 arguments.");
+
+ // Make sure that the copy dest is not a vreg when the copy source is a
+ // physical register.
+ if (!OPI2->isReg() ||
+ (!OPI->getReg().isPhysical() && OPI2->getReg().isPhysical()))
+ return false;
+
+ return true;
+}
+
+/// Find the split point at which to splice the end of BB into its successor's
+/// stack protector check machine basic block.
+///
+/// On many platforms, due to ABI constraints, terminators, even before register
+/// allocation, use physical registers. This creates an issue for us since
+/// physical registers at this point cannot travel across basic
+/// blocks. Luckily, SelectionDAG always moves physical registers into vregs
+/// when they enter functions and moves them through a sequence of copies back
+/// into the physical registers right before the terminator creating a
+/// ``Terminator Sequence''. This function is searching for the beginning of the
+/// terminator sequence so that we can ensure that we splice off not just the
+/// terminator, but additionally the copies that move the vregs into the
+/// physical registers.
+MachineBasicBlock::iterator
+llvm::findSplitPointForStackProtector(MachineBasicBlock *BB,
+ const TargetInstrInfo &TII) {
+ MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
+ if (SplitPoint == BB->begin())
+ return SplitPoint;
+
+ MachineBasicBlock::iterator Start = BB->begin();
+ MachineBasicBlock::iterator Previous = SplitPoint;
+ do {
+ --Previous;
+ } while (Previous != Start && Previous->isDebugInstr());
+
+ if (TII.isTailCall(*SplitPoint) &&
+ Previous->getOpcode() == TII.getCallFrameDestroyOpcode()) {
+ // Call frames cannot be nested, so if this frame is describing the tail
+ // call itself, then we must insert before the sequence even starts. For
+ // example:
+ // <split point>
+ // ADJCALLSTACKDOWN ...
+ // <Moves>
+ // ADJCALLSTACKUP ...
+ // TAILJMP somewhere
+ // On the other hand, it could be an unrelated call in which case this tail
+ // call has no register moves of its own and should be the split point. For
+ // example:
+ // ADJCALLSTACKDOWN
+ // CALL something_else
+ // ADJCALLSTACKUP
+ // <split point>
+ // TAILJMP somewhere
+ do {
+ --Previous;
+ if (Previous->isCall())
+ return SplitPoint;
+ } while(Previous->getOpcode() != TII.getCallFrameSetupOpcode());
+
+ return Previous;
+ }
+
+ while (MIIsInTerminatorSequence(*Previous)) {
+ SplitPoint = Previous;
+ if (Previous == Start)
+ break;
+ --Previous;
+ }
+
+ return SplitPoint;
+}
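An illustrative shape of the terminator sequence this search walks backwards
over (pseudo-MIR; register names are hypothetical):

    //   %vreg42 = ...                ; value computed earlier in the block
    //   $physreg0 = COPY %vreg42     ; ABI copy -- part of the sequence
    //   DBG_VALUE $physreg0, ...     ; debug info stays with the sequence
    //   TAILJMP / RET ...            ; the terminator itself
    //
    // The returned split point is the first COPY, so the whole sequence is
    // spliced into the stack protector check block together with the
    // terminator.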
+
+FPClassTest llvm::invertFPClassTestIfSimpler(FPClassTest Test) {
+ FPClassTest InvertedTest = ~Test;
+ // Pick the direction with fewer tests
+ // TODO: Handle more combinations of cases that can be handled together
+ switch (static_cast<unsigned>(InvertedTest)) {
+ case fcNan:
+ case fcSNan:
+ case fcQNan:
+ case fcInf:
+ case fcPosInf:
+ case fcNegInf:
+ case fcNormal:
+ case fcPosNormal:
+ case fcNegNormal:
+ case fcSubnormal:
+ case fcPosSubnormal:
+ case fcNegSubnormal:
+ case fcZero:
+ case fcPosZero:
+ case fcNegZero:
+ case fcFinite:
+ case fcPosFinite:
+ case fcNegFinite:
+ case fcZero | fcNan:
+ case fcSubnormal | fcZero:
+ case fcSubnormal | fcZero | fcNan:
+ return InvertedTest;
+ default:
+ return fcNone;
+ }
+
+ llvm_unreachable("covered FPClassTest");
+}
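For example, a caller asking to test "everything except NaN" (Test == ~fcNan)
gets fcNan back, because the inverted mask hits one of the single-class cases
above; it can then emit the cheaper NaN check and invert the result. A sketch
of that caller-side pattern (emitClassTest is a hypothetical helper, not an
API in this file):

    // FPClassTest Simpler = invertFPClassTestIfSimpler(Test);
    // if (Simpler != fcNone) {
    //   Value *Inverted = emitClassTest(X, Simpler);   // hypothetical
    //   return Builder.CreateNot(Inverted);            // flip the answer
    // }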
+
+static MachineOperand *getSalvageOpsForCopy(const MachineRegisterInfo &MRI,
+ MachineInstr &Copy) {
+ assert(Copy.getOpcode() == TargetOpcode::COPY && "Must be a COPY");
+
+ return &Copy.getOperand(1);
+}
+
+static MachineOperand *getSalvageOpsForTrunc(const MachineRegisterInfo &MRI,
+ MachineInstr &Trunc,
+ SmallVectorImpl<uint64_t> &Ops) {
+ assert(Trunc.getOpcode() == TargetOpcode::G_TRUNC && "Must be a G_TRUNC");
+
+ const auto FromLLT = MRI.getType(Trunc.getOperand(1).getReg());
+ const auto ToLLT = MRI.getType(Trunc.defs().begin()->getReg());
+
+ // TODO: Support non-scalar types.
+ if (!FromLLT.isScalar()) {
+ return nullptr;
+ }
+
+ auto ExtOps = DIExpression::getExtOps(FromLLT.getSizeInBits(),
+ ToLLT.getSizeInBits(), false);
+ Ops.append(ExtOps.begin(), ExtOps.end());
+ return &Trunc.getOperand(1);
+}
+
+static MachineOperand *salvageDebugInfoImpl(const MachineRegisterInfo &MRI,
+ MachineInstr &MI,
+ SmallVectorImpl<uint64_t> &Ops) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_TRUNC:
+ return getSalvageOpsForTrunc(MRI, MI, Ops);
+ case TargetOpcode::COPY:
+ return getSalvageOpsForCopy(MRI, MI);
+ default:
+ return nullptr;
+ }
+}
+
+void llvm::salvageDebugInfoForDbgValue(const MachineRegisterInfo &MRI,
+ MachineInstr &MI,
+ ArrayRef<MachineOperand *> DbgUsers) {
+  // This is an arbitrarily chosen limit on the maximum size of a debug
+  // expression we can salvage up to, used for performance reasons.
+ const unsigned MaxExpressionSize = 128;
+
+ for (auto *DefMO : DbgUsers) {
+ MachineInstr *DbgMI = DefMO->getParent();
+ if (DbgMI->isIndirectDebugValue()) {
+ continue;
+ }
+
+ int UseMOIdx = DbgMI->findRegisterUseOperandIdx(DefMO->getReg());
+ assert(UseMOIdx != -1 && DbgMI->hasDebugOperandForReg(DefMO->getReg()) &&
+ "Must use salvaged instruction as its location");
+
+ // TODO: Support DBG_VALUE_LIST.
+ if (DbgMI->getOpcode() != TargetOpcode::DBG_VALUE) {
+ assert(DbgMI->getOpcode() == TargetOpcode::DBG_VALUE_LIST &&
+ "Must be either DBG_VALUE or DBG_VALUE_LIST");
+ continue;
+ }
+
+ const DIExpression *SalvagedExpr = DbgMI->getDebugExpression();
+
+ SmallVector<uint64_t, 16> Ops;
+ auto Op0 = salvageDebugInfoImpl(MRI, MI, Ops);
+ if (!Op0)
+ continue;
+ SalvagedExpr = DIExpression::appendOpsToArg(SalvagedExpr, Ops, 0, true);
+
+ bool IsValidSalvageExpr =
+ SalvagedExpr->getNumElements() <= MaxExpressionSize;
+ if (IsValidSalvageExpr) {
+ auto &UseMO = DbgMI->getOperand(UseMOIdx);
+ UseMO.setReg(Op0->getReg());
+ UseMO.setSubReg(Op0->getSubReg());
+ DbgMI->getDebugExpressionOp().setMetadata(SalvagedExpr);
+
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DbgMI << '\n');
+ }
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPassBuilder.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPassBuilder.cpp
new file mode 100644
index 000000000000..7f37f2069a3b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPassBuilder.cpp
@@ -0,0 +1,25 @@
+//===--- CodeGenPassBuilder.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines interfaces to access the target independent code
+// generation passes provided by the LLVM backend.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CodeGenPassBuilder.h"
+
+using namespace llvm;
+
+namespace llvm {
+#define DUMMY_MACHINE_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \
+ AnalysisKey PASS_NAME::Key;
+#include "llvm/CodeGen/MachinePassRegistry.def"
+#define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \
+ AnalysisKey PASS_NAME::Key;
+#include "llvm/CodeGen/MachinePassRegistry.def"
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp
new file mode 100644
index 000000000000..b00df0b6c6cb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -0,0 +1,8660 @@
+//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation. This works around limitations in its
+// basic-block-at-a-time approach. It should eventually be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "codegenprepare"
+
+STATISTIC(NumBlocksElim, "Number of blocks eliminated");
+STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");
+STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");
+STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
+ "sunken Cmps");
+STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
+ "of sunken Casts");
+STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
+ "computations were sunk");
+STATISTIC(NumMemoryInstsPhiCreated,
+ "Number of phis created when address "
+ "computations were sunk to memory instructions");
+STATISTIC(NumMemoryInstsSelectCreated,
+ "Number of select created when address "
+ "computations were sunk to memory instructions");
+STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
+STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
+STATISTIC(NumAndsAdded,
+ "Number of and mask instructions added to form ext loads");
+STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
+STATISTIC(NumRetsDup, "Number of return instructions duplicated");
+STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
+STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
+STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
+
+static cl::opt<bool> DisableBranchOpts(
+ "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
+ cl::desc("Disable branch optimizations in CodeGenPrepare"));
+
+static cl::opt<bool>
+ DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
+ cl::desc("Disable GC optimizations in CodeGenPrepare"));
+
+static cl::opt<bool>
+ DisableSelectToBranch("disable-cgp-select2branch", cl::Hidden,
+ cl::init(false),
+ cl::desc("Disable select to branch conversion."));
+
+static cl::opt<bool>
+ AddrSinkUsingGEPs("addr-sink-using-gep", cl::Hidden, cl::init(true),
+ cl::desc("Address sinking in CGP using GEPs."));
+
+static cl::opt<bool>
+ EnableAndCmpSinking("enable-andcmp-sinking", cl::Hidden, cl::init(true),
+ cl::desc("Enable sinkinig and/cmp into branches."));
+
+static cl::opt<bool> DisableStoreExtract(
+ "disable-cgp-store-extract", cl::Hidden, cl::init(false),
+ cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> StressStoreExtract(
+ "stress-cgp-store-extract", cl::Hidden, cl::init(false),
+ cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> DisableExtLdPromotion(
+ "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+ cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
+ "CodeGenPrepare"));
+
+static cl::opt<bool> StressExtLdPromotion(
+ "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+ cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
+ "optimization in CodeGenPrepare"));
+
+static cl::opt<bool> DisablePreheaderProtect(
+ "disable-preheader-prot", cl::Hidden, cl::init(false),
+ cl::desc("Disable protection against removing loop preheaders"));
+
+static cl::opt<bool> ProfileGuidedSectionPrefix(
+ "profile-guided-section-prefix", cl::Hidden, cl::init(true),
+ cl::desc("Use profile info to add section prefix for hot/cold functions"));
+
+static cl::opt<bool> ProfileUnknownInSpecialSection(
+ "profile-unknown-in-special-section", cl::Hidden,
+ cl::desc("In profiling mode like sampleFDO, if a function doesn't have "
+ "profile, we cannot tell the function is cold for sure because "
+ "it may be a function newly added without ever being sampled. "
+ "With the flag enabled, compiler can put such profile unknown "
+ "functions into a special section, so runtime system can choose "
+ "to handle it in a different way than .text section, to save "
+ "RAM for example. "));
+
+static cl::opt<bool> BBSectionsGuidedSectionPrefix(
+ "bbsections-guided-section-prefix", cl::Hidden, cl::init(true),
+ cl::desc("Use the basic-block-sections profile to determine the text "
+ "section prefix for hot functions. Functions with "
+ "basic-block-sections profile will be placed in `.text.hot` "
+ "regardless of their FDO profile info. Other functions won't be "
+ "impacted, i.e., their prefixes will be decided by FDO/sampleFDO "
+ "profiles."));
+
+static cl::opt<unsigned> FreqRatioToSkipMerge(
+ "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
+ cl::desc("Skip merging empty blocks if (frequency of empty block) / "
+ "(frequency of destination block) is greater than this ratio"));
+
+static cl::opt<bool> ForceSplitStore(
+ "force-split-store", cl::Hidden, cl::init(false),
+ cl::desc("Force store splitting no matter what the target query says."));
+
+static cl::opt<bool> EnableTypePromotionMerge(
+ "cgp-type-promotion-merge", cl::Hidden,
+ cl::desc("Enable merging of redundant sexts when one is dominating"
+ " the other."),
+ cl::init(true));
+
+static cl::opt<bool> DisableComplexAddrModes(
+ "disable-complex-addr-modes", cl::Hidden, cl::init(false),
+ cl::desc("Disables combining addressing modes with different parts "
+ "in optimizeMemoryInst."));
+
+static cl::opt<bool>
+ AddrSinkNewPhis("addr-sink-new-phis", cl::Hidden, cl::init(false),
+ cl::desc("Allow creation of Phis in Address sinking."));
+
+static cl::opt<bool> AddrSinkNewSelects(
+ "addr-sink-new-select", cl::Hidden, cl::init(true),
+ cl::desc("Allow creation of selects in Address sinking."));
+
+static cl::opt<bool> AddrSinkCombineBaseReg(
+ "addr-sink-combine-base-reg", cl::Hidden, cl::init(true),
+ cl::desc("Allow combining of BaseReg field in Address sinking."));
+
+static cl::opt<bool> AddrSinkCombineBaseGV(
+ "addr-sink-combine-base-gv", cl::Hidden, cl::init(true),
+ cl::desc("Allow combining of BaseGV field in Address sinking."));
+
+static cl::opt<bool> AddrSinkCombineBaseOffs(
+ "addr-sink-combine-base-offs", cl::Hidden, cl::init(true),
+ cl::desc("Allow combining of BaseOffs field in Address sinking."));
+
+static cl::opt<bool> AddrSinkCombineScaledReg(
+ "addr-sink-combine-scaled-reg", cl::Hidden, cl::init(true),
+ cl::desc("Allow combining of ScaledReg field in Address sinking."));
+
+static cl::opt<bool>
+ EnableGEPOffsetSplit("cgp-split-large-offset-gep", cl::Hidden,
+ cl::init(true),
+ cl::desc("Enable splitting large offset of GEP."));
+
+static cl::opt<bool> EnableICMP_EQToICMP_ST(
+ "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false),
+ cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));
+
+static cl::opt<bool>
+ VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false),
+ cl::desc("Enable BFI update verification for "
+ "CodeGenPrepare."));
+
+static cl::opt<bool>
+ OptimizePhiTypes("cgp-optimize-phi-types", cl::Hidden, cl::init(true),
+ cl::desc("Enable converting phi types in CodeGenPrepare"));
+
+static cl::opt<unsigned>
+ HugeFuncThresholdInCGPP("cgpp-huge-func", cl::init(10000), cl::Hidden,
+ cl::desc("Least BB number of huge function."));
+
+static cl::opt<unsigned>
+ MaxAddressUsersToScan("cgp-max-address-users-to-scan", cl::init(100),
+ cl::Hidden,
+ cl::desc("Max number of address users to look at"));
+namespace {
+
+enum ExtType {
+ ZeroExtension, // Zero extension has been seen.
+ SignExtension, // Sign extension has been seen.
+ BothExtension // This extension type is used if we saw sext after
+ // ZeroExtension had been set, or if we saw zext after
+ // SignExtension had been set. It makes the type
+ // information of a promoted instruction invalid.
+};
+
+enum ModifyDT {
+ NotModifyDT, // Does not modify any DT.
+ ModifyBBDT, // Modifies the basic block dominator tree.
+ ModifyInstDT // Modifies instruction dominance within a basic block.
+ // This usually means we move/delete/insert an instruction
+ // in a basic block, so we should re-iterate the instructions
+ // in that basic block.
+};
+
+using SetOfInstrs = SmallPtrSet<Instruction *, 16>;
+using TypeIsSExt = PointerIntPair<Type *, 2, ExtType>;
+using InstrToOrigTy = DenseMap<Instruction *, TypeIsSExt>;
+using SExts = SmallVector<Instruction *, 16>;
+using ValueToSExts = MapVector<Value *, SExts>;
+
+class TypePromotionTransaction;
+
+class CodeGenPrepare : public FunctionPass {
+ const TargetMachine *TM = nullptr;
+ const TargetSubtargetInfo *SubtargetInfo = nullptr;
+ const TargetLowering *TLI = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetTransformInfo *TTI = nullptr;
+ const BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr;
+ const TargetLibraryInfo *TLInfo = nullptr;
+ LoopInfo *LI = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ ProfileSummaryInfo *PSI = nullptr;
+
+ /// As we scan instructions optimizing them, this is the next instruction
+ /// to optimize. Transforms that can invalidate this should update it.
+ BasicBlock::iterator CurInstIterator;
+
+ /// Keeps track of non-local addresses that have been sunk into a block.
+ /// This allows us to avoid inserting duplicate code for blocks with
+ /// multiple load/stores of the same address. The usage of WeakTrackingVH
+ /// enables SunkAddrs to be treated as a cache whose entries can be
+ /// invalidated if a sunken address computation has been erased.
+ ValueMap<Value *, WeakTrackingVH> SunkAddrs;
+
+ /// Keeps track of all instructions inserted for the current function.
+ SetOfInstrs InsertedInsts;
+
+ /// Keeps track of the type of each related instruction before its
+ /// promotion for the current function.
+ InstrToOrigTy PromotedInsts;
+
+ /// Keep track of instructions removed during promotion.
+ SetOfInstrs RemovedInsts;
+
+ /// Keep track of sext chains based on their initial value.
+ DenseMap<Value *, Instruction *> SeenChainsForSExt;
+
+ /// Keep track of GEPs accessing the same data structures such as structs or
+ /// arrays that are candidates to be split later because of their large
+ /// size.
+ MapVector<AssertingVH<Value>,
+ SmallVector<std::pair<AssertingVH<GetElementPtrInst>, int64_t>, 32>>
+ LargeOffsetGEPMap;
+
+ /// Keep track of new GEP base after splitting the GEPs having large offset.
+ SmallSet<AssertingVH<Value>, 2> NewGEPBases;
+
+ /// Map large offset GEPs to their serial numbers.
+ DenseMap<AssertingVH<GetElementPtrInst>, int> LargeOffsetGEPID;
+
+ /// Keep track of promoted SExts.
+ ValueToSExts ValToSExtendedUses;
+
+ /// True if the function has the OptSize attribute.
+ bool OptSize;
+
+ /// DataLayout for the Function being processed.
+ const DataLayout *DL = nullptr;
+
+ /// Building the dominator tree can be expensive, so we only build it
+ /// lazily and update it when required.
+ std::unique_ptr<DominatorTree> DT;
+
+public:
+ /// If we encounter a huge function, we need to limit the build time.
+ bool IsHugeFunc = false;
+
+ /// FreshBBs is like a worklist: it collects the updated BBs which need
+ /// to be optimized again.
+ /// Note: To keep the build time of this pass in check, whenever a BB is
+ /// updated we need to insert it into FreshBBs for huge functions.
+ SmallSet<BasicBlock *, 32> FreshBBs;
+
+ static char ID; // Pass identification, replacement for typeid
+
+ CodeGenPrepare() : FunctionPass(ID) {
+ initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void releaseMemory() override {
+ // Clear per function information.
+ InsertedInsts.clear();
+ PromotedInsts.clear();
+ FreshBBs.clear();
+ BPI.reset();
+ BFI.reset();
+ }
+
+ StringRef getPassName() const override { return "CodeGen Prepare"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // FIXME: When we can selectively preserve passes, preserve the domtree.
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addUsedIfAvailable<BasicBlockSectionsProfileReader>();
+ }
+
+private:
+ template <typename F>
+ void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
+ // Substituting can cause recursive simplifications, which can invalidate
+ // our iterator. Use a WeakTrackingVH to hold onto it in case this
+ // happens.
+ Value *CurValue = &*CurInstIterator;
+ WeakTrackingVH IterHandle(CurValue);
+
+ f();
+
+ // If the iterator instruction was recursively deleted, start over at the
+ // start of the block.
+ if (IterHandle != CurValue) {
+ CurInstIterator = BB->begin();
+ SunkAddrs.clear();
+ }
+ }
+
+ // Get the DominatorTree, building if necessary.
+ DominatorTree &getDT(Function &F) {
+ if (!DT)
+ DT = std::make_unique<DominatorTree>(F);
+ return *DT;
+ }
+
+ void removeAllAssertingVHReferences(Value *V);
+ bool eliminateAssumptions(Function &F);
+ bool eliminateFallThrough(Function &F, DominatorTree *DT = nullptr);
+ bool eliminateMostlyEmptyBlocks(Function &F);
+ BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
+ bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
+ void eliminateMostlyEmptyBlock(BasicBlock *BB);
+ bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
+ bool isPreheader);
+ bool makeBitReverse(Instruction &I);
+ bool optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT);
+ bool optimizeInst(Instruction *I, ModifyDT &ModifiedDT);
+ bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
+ unsigned AddrSpace);
+ bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
+ bool optimizeInlineAsmInst(CallInst *CS);
+ bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
+ bool optimizeExt(Instruction *&I);
+ bool optimizeExtUses(Instruction *I);
+ bool optimizeLoadExt(LoadInst *Load);
+ bool optimizeShiftInst(BinaryOperator *BO);
+ bool optimizeFunnelShift(IntrinsicInst *Fsh);
+ bool optimizeSelectInst(SelectInst *SI);
+ bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
+ bool optimizeSwitchType(SwitchInst *SI);
+ bool optimizeSwitchPhiConstants(SwitchInst *SI);
+ bool optimizeSwitchInst(SwitchInst *SI);
+ bool optimizeExtractElementInst(Instruction *Inst);
+ bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT);
+ bool fixupDbgValue(Instruction *I);
+ bool placeDbgValues(Function &F);
+ bool placePseudoProbes(Function &F);
+ bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
+ LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
+ bool tryToPromoteExts(TypePromotionTransaction &TPT,
+ const SmallVectorImpl<Instruction *> &Exts,
+ SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
+ unsigned CreatedInstsCost = 0);
+ bool mergeSExts(Function &F);
+ bool splitLargeGEPOffsets();
+ bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
+ SmallPtrSetImpl<Instruction *> &DeletedInstrs);
+ bool optimizePhiTypes(Function &F);
+ bool performAddressTypePromotion(
+ Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
+ bool HasPromoted, TypePromotionTransaction &TPT,
+ SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
+ bool splitBranchCondition(Function &F, ModifyDT &ModifiedDT);
+ bool simplifyOffsetableRelocate(GCStatepointInst &I);
+
+ bool tryToSinkFreeOperands(Instruction *I);
+ bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, Value *Arg1,
+ CmpInst *Cmp, Intrinsic::ID IID);
+ bool optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ void verifyBFIUpdates(Function &F);
+};
+
+} // end anonymous namespace
+
+char CodeGenPrepare::ID = 0;
+
+INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
+ "Optimize for code generation", false, false)
+INITIALIZE_PASS_DEPENDENCY(BasicBlockSectionsProfileReader)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE, "Optimize for code generation",
+ false, false)
+
+FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
+
+bool CodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool EverMadeChange = false;
+
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ SubtargetInfo = TM->getSubtargetImpl(F);
+ TLI = SubtargetInfo->getTargetLowering();
+ TRI = SubtargetInfo->getRegisterInfo();
+ TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ BPI.reset(new BranchProbabilityInfo(F, *LI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ BBSectionsProfileReader =
+ getAnalysisIfAvailable<BasicBlockSectionsProfileReader>();
+ OptSize = F.hasOptSize();
+ // Use the basic-block-sections profile to promote hot functions to .text.hot
+ // if requested.
+ if (BBSectionsGuidedSectionPrefix && BBSectionsProfileReader &&
+ BBSectionsProfileReader->isFunctionHot(F.getName())) {
+ F.setSectionPrefix("hot");
+ } else if (ProfileGuidedSectionPrefix) {
+ // The hot attribute overrides profile-count-based hotness, while
+ // profile-count-based hotness overrides the cold attribute.
+ // This is conservative behavior.
+ if (F.hasFnAttribute(Attribute::Hot) ||
+ PSI->isFunctionHotInCallGraph(&F, *BFI))
+ F.setSectionPrefix("hot");
+ // If PSI shows this function is not hot, we place the function into
+ // the unlikely section if (1) PSI shows this is a cold function, or
+ // (2) the function has the cold attribute.
+ else if (PSI->isFunctionColdInCallGraph(&F, *BFI) ||
+ F.hasFnAttribute(Attribute::Cold))
+ F.setSectionPrefix("unlikely");
+ else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
+ PSI->isFunctionHotnessUnknown(F))
+ F.setSectionPrefix("unknown");
+ }
+
+ /// This optimization identifies DIV instructions that can be
+ /// profitably bypassed and carried out with a shorter, faster divide.
+ if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
+ const DenseMap<unsigned int, unsigned int> &BypassWidths =
+ TLI->getBypassSlowDivWidths();
+ BasicBlock *BB = &*F.begin();
+ while (BB != nullptr) {
+ // bypassSlowDivision may create new BBs, but we don't want to reapply the
+ // optimization to those blocks.
+ BasicBlock *Next = BB->getNextNode();
+ // F.hasOptSize is already checked in the outer if statement.
+ if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
+ EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
+ BB = Next;
+ }
+ }
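+
+ // For illustration only (the rewrite is performed by the bypassSlowDivision
+ // utility, not by this pass itself): on a target reporting a 64-bit ->
+ // 32-bit bypass width, a division such as
+ //   %q = udiv i64 %a, %b
+ // is conceptually turned into a guarded fast path, roughly:
+ //   %or   = or i64 %a, %b
+ //   %fits = icmp ult i64 %or, 4294967296   ; do both operands fit in 32 bits?
+ //   br i1 %fits, label %fast, label %slow
+ // fast:                                    ; narrow, cheaper divide
+ //   %a32 = trunc i64 %a to i32
+ //   %b32 = trunc i64 %b to i32
+ //   %q32 = udiv i32 %a32, %b32
+ //   %qf  = zext i32 %q32 to i64
+ //   br label %join
+ // slow:                                    ; original full-width divide
+ //   %qs = udiv i64 %a, %b
+ //   br label %join
+ // join:
+ //   %q = phi i64 [ %qf, %fast ], [ %qs, %slow ]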
+
+ // Get rid of @llvm.assume builtins before attempting to eliminate empty
+ // blocks, since there might be blocks that only contain @llvm.assume calls
+ // (plus arguments that we can get rid of).
+ EverMadeChange |= eliminateAssumptions(F);
+
+ // Eliminate blocks that contain only PHI nodes and an
+ // unconditional branch.
+ EverMadeChange |= eliminateMostlyEmptyBlocks(F);
+
+ ModifyDT ModifiedDT = ModifyDT::NotModifyDT;
+ if (!DisableBranchOpts)
+ EverMadeChange |= splitBranchCondition(F, ModifiedDT);
+
+ // Split some critical edges where one of the sources is an indirect branch,
+ // to help generate sane code for PHIs involving such edges.
+ EverMadeChange |=
+ SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/true);
+
+ // If we are optimizing a huge function, we need to consider the build time,
+ // because the basic algorithm's complexity is close to O(N!).
+ IsHugeFunc = F.size() > HugeFuncThresholdInCGPP;
+
+ // Transformations above may invalidate dominator tree and/or loop info.
+ DT.reset();
+ LI->releaseMemory();
+ LI->analyze(getDT(F));
+
+ bool MadeChange = true;
+ bool FuncIterated = false;
+ while (MadeChange) {
+ MadeChange = false;
+
+ for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
+ if (FuncIterated && !FreshBBs.contains(&BB))
+ continue;
+
+ ModifyDT ModifiedDTOnIteration = ModifyDT::NotModifyDT;
+ bool Changed = optimizeBlock(BB, ModifiedDTOnIteration);
+
+ if (ModifiedDTOnIteration == ModifyDT::ModifyBBDT)
+ DT.reset();
+
+ MadeChange |= Changed;
+ if (IsHugeFunc) {
+ // If the BB is updated, it may still have a chance to be optimized.
+ // This usually happens during sink optimizations.
+ // For example:
+ //
+ // bb0:
+ // %and = and i32 %a, 4
+ // %cmp = icmp eq i32 %and, 0
+ //
+ // If %cmp sinks to another BB, %and will have a chance to sink as well.
+ if (Changed)
+ FreshBBs.insert(&BB);
+ else if (FuncIterated)
+ FreshBBs.erase(&BB);
+ } else {
+ // For small/normal functions, we restart BB iteration if the dominator
+ // tree of the Function was changed.
+ if (ModifiedDTOnIteration != ModifyDT::NotModifyDT)
+ break;
+ }
+ }
+ // We have iterated over all the BBs in the function (only relevant for huge functions).
+ FuncIterated = IsHugeFunc;
+
+ if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
+ MadeChange |= mergeSExts(F);
+ if (!LargeOffsetGEPMap.empty())
+ MadeChange |= splitLargeGEPOffsets();
+ MadeChange |= optimizePhiTypes(F);
+
+ if (MadeChange)
+ eliminateFallThrough(F, DT.get());
+
+#ifndef NDEBUG
+ if (MadeChange && VerifyLoopInfo)
+ LI->verify(getDT(F));
+#endif
+
+ // Really free removed instructions during promotion.
+ for (Instruction *I : RemovedInsts)
+ I->deleteValue();
+
+ EverMadeChange |= MadeChange;
+ SeenChainsForSExt.clear();
+ ValToSExtendedUses.clear();
+ RemovedInsts.clear();
+ LargeOffsetGEPMap.clear();
+ LargeOffsetGEPID.clear();
+ }
+
+ NewGEPBases.clear();
+ SunkAddrs.clear();
+
+ if (!DisableBranchOpts) {
+ MadeChange = false;
+ // Use a set vector to get deterministic iteration order. The order the
+ // blocks are removed may affect whether or not PHI nodes in successors
+ // are removed.
+ SmallSetVector<BasicBlock *, 8> WorkList;
+ for (BasicBlock &BB : F) {
+ SmallVector<BasicBlock *, 2> Successors(successors(&BB));
+ MadeChange |= ConstantFoldTerminator(&BB, true);
+ if (!MadeChange)
+ continue;
+
+ for (BasicBlock *Succ : Successors)
+ if (pred_empty(Succ))
+ WorkList.insert(Succ);
+ }
+
+ // Delete the dead blocks and any of their dead successors.
+ MadeChange |= !WorkList.empty();
+ while (!WorkList.empty()) {
+ BasicBlock *BB = WorkList.pop_back_val();
+ SmallVector<BasicBlock *, 2> Successors(successors(BB));
+
+ DeleteDeadBlock(BB);
+
+ for (BasicBlock *Succ : Successors)
+ if (pred_empty(Succ))
+ WorkList.insert(Succ);
+ }
+
+ // Merge pairs of basic blocks with unconditional branches, connected by
+ // a single edge.
+ if (EverMadeChange || MadeChange)
+ MadeChange |= eliminateFallThrough(F);
+
+ EverMadeChange |= MadeChange;
+ }
+
+ if (!DisableGCOpts) {
+ SmallVector<GCStatepointInst *, 2> Statepoints;
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ if (auto *SP = dyn_cast<GCStatepointInst>(&I))
+ Statepoints.push_back(SP);
+ for (auto &I : Statepoints)
+ EverMadeChange |= simplifyOffsetableRelocate(*I);
+ }
+
+ // Do this last to clean up use-before-def scenarios introduced by other
+ // preparatory transforms.
+ EverMadeChange |= placeDbgValues(F);
+ EverMadeChange |= placePseudoProbes(F);
+
+#ifndef NDEBUG
+ if (VerifyBFIUpdates)
+ verifyBFIUpdates(F);
+#endif
+
+ return EverMadeChange;
+}
+
+bool CodeGenPrepare::eliminateAssumptions(Function &F) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ CurInstIterator = BB.begin();
+ while (CurInstIterator != BB.end()) {
+ Instruction *I = &*(CurInstIterator++);
+ if (auto *Assume = dyn_cast<AssumeInst>(I)) {
+ MadeChange = true;
+ Value *Operand = Assume->getOperand(0);
+ Assume->eraseFromParent();
+
+ resetIteratorIfInvalidatedWhileCalling(&BB, [&]() {
+ RecursivelyDeleteTriviallyDeadInstructions(Operand, TLInfo, nullptr);
+ });
+ }
+ }
+ }
+ return MadeChange;
+}
+
+/// An instruction is about to be deleted, so remove all references to it in our
+/// GEP-tracking data structures.
+void CodeGenPrepare::removeAllAssertingVHReferences(Value *V) {
+ LargeOffsetGEPMap.erase(V);
+ NewGEPBases.erase(V);
+
+ auto GEP = dyn_cast<GetElementPtrInst>(V);
+ if (!GEP)
+ return;
+
+ LargeOffsetGEPID.erase(GEP);
+
+ auto VecI = LargeOffsetGEPMap.find(GEP->getPointerOperand());
+ if (VecI == LargeOffsetGEPMap.end())
+ return;
+
+ auto &GEPVector = VecI->second;
+ llvm::erase_if(GEPVector, [=](auto &Elt) { return Elt.first == GEP; });
+
+ if (GEPVector.empty())
+ LargeOffsetGEPMap.erase(VecI);
+}
+
+// Verify BFI has been updated correctly by recomputing BFI and comparing them.
+void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {
+ DominatorTree NewDT(F);
+ LoopInfo NewLI(NewDT);
+ BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
+ BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);
+ NewBFI.verifyMatch(*BFI);
+}
+
+/// Merge basic blocks which are connected by a single edge, where one of the
+/// basic blocks has a single successor pointing to the other basic block,
+/// which has a single predecessor.
+bool CodeGenPrepare::eliminateFallThrough(Function &F, DominatorTree *DT) {
+ bool Changed = false;
+ // Scan all of the blocks in the function, except for the entry block.
+ // Use a temporary array to avoid the iterator being invalidated when
+ // deleting blocks.
+ SmallVector<WeakTrackingVH, 16> Blocks;
+ for (auto &Block : llvm::drop_begin(F))
+ Blocks.push_back(&Block);
+
+ SmallSet<WeakTrackingVH, 16> Preds;
+ for (auto &Block : Blocks) {
+ auto *BB = cast_or_null<BasicBlock>(Block);
+ if (!BB)
+ continue;
+ // If the destination block has a single pred, then this is a trivial
+ // edge, just collapse it.
+ BasicBlock *SinglePred = BB->getSinglePredecessor();
+
+ // Don't merge if BB's address is taken.
+ if (!SinglePred || SinglePred == BB || BB->hasAddressTaken())
+ continue;
+
+ // Make an effort to skip unreachable blocks.
+ if (DT && !DT->isReachableFromEntry(BB))
+ continue;
+
+ BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
+ if (Term && !Term->isConditional()) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "To merge:\n" << *BB << "\n\n\n");
+
+ // Merge BB into SinglePred and delete it.
+ MergeBlockIntoPredecessor(BB, /* DTU */ nullptr, LI, /* MSSAU */ nullptr,
+ /* MemDep */ nullptr,
+ /* PredecessorWithTwoSuccessors */ false, DT);
+ Preds.insert(SinglePred);
+
+ if (IsHugeFunc) {
+ // Update FreshBBs to optimize the merged BB.
+ FreshBBs.insert(SinglePred);
+ FreshBBs.erase(BB);
+ }
+ }
+ }
+
+ // (Repeatedly) merging blocks into their predecessors can create redundant
+ // debug intrinsics.
+ for (const auto &Pred : Preds)
+ if (auto *BB = cast_or_null<BasicBlock>(Pred))
+ RemoveRedundantDbgInstrs(BB);
+
+ return Changed;
+}
+
+/// Find a destination block from BB if BB is a mergeable empty block.
+BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
+ // If this block doesn't end with an uncond branch, ignore it.
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isUnconditional())
+ return nullptr;
+
+ // If the instruction before the branch (skipping debug info) isn't a phi
+ // node, then other stuff is happening here.
+ BasicBlock::iterator BBI = BI->getIterator();
+ if (BBI != BB->begin()) {
+ --BBI;
+ while (isa<DbgInfoIntrinsic>(BBI)) {
+ if (BBI == BB->begin())
+ break;
+ --BBI;
+ }
+ if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+ return nullptr;
+ }
+
+ // Do not break infinite loops.
+ BasicBlock *DestBB = BI->getSuccessor(0);
+ if (DestBB == BB)
+ return nullptr;
+
+ if (!canMergeBlocks(BB, DestBB))
+ DestBB = nullptr;
+
+ return DestBB;
+}
+
+/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
+/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
+/// edges in ways that are non-optimal for isel. Start by eliminating these
+/// blocks so we can split them the way we want them.
+bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
+ SmallPtrSet<BasicBlock *, 16> Preheaders;
+ SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
+ while (!LoopList.empty()) {
+ Loop *L = LoopList.pop_back_val();
+ llvm::append_range(LoopList, *L);
+ if (BasicBlock *Preheader = L->getLoopPreheader())
+ Preheaders.insert(Preheader);
+ }
+
+ bool MadeChange = false;
+ // Copy blocks into a temporary array to avoid iterator invalidation issues
+ // as we remove them.
+ // Note that this intentionally skips the entry block.
+ SmallVector<WeakTrackingVH, 16> Blocks;
+ for (auto &Block : llvm::drop_begin(F))
+ Blocks.push_back(&Block);
+
+ for (auto &Block : Blocks) {
+ BasicBlock *BB = cast_or_null<BasicBlock>(Block);
+ if (!BB)
+ continue;
+ BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
+ if (!DestBB ||
+ !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
+ continue;
+
+ eliminateMostlyEmptyBlock(BB);
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
+ BasicBlock *DestBB,
+ bool isPreheader) {
+ // Do not delete loop preheaders if doing so would create a critical edge.
+ // Loop preheaders can be good locations to spill registers. If the
+ // preheader is deleted and we create a critical edge, registers may be
+ // spilled in the loop body instead.
+ if (!DisablePreheaderProtect && isPreheader &&
+ !(BB->getSinglePredecessor() &&
+ BB->getSinglePredecessor()->getSingleSuccessor()))
+ return false;
+
+ // Skip merging if the block's successor is also a successor to any callbr
+ // that leads to this block.
+ // FIXME: Is this really needed? Is this a correctness issue?
+ for (BasicBlock *Pred : predecessors(BB)) {
+ if (auto *CBI = dyn_cast<CallBrInst>((Pred)->getTerminator()))
+ for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
+ if (DestBB == CBI->getSuccessor(i))
+ return false;
+ }
+
+ // Try to skip merging if the unique predecessor of BB is terminated by a
+ // switch or indirect branch instruction, and BB is used as an incoming block
+ // of PHIs in DestBB. In such a case, merging BB and DestBB would cause ISel to
+ // add COPY instructions in the predecessor of BB instead of BB (if it is not
+ // merged). Note that the critical edge created by merging such blocks won't be
+ // split in MachineSink because the jump table is not analyzable. By keeping
+ // such empty block (BB), ISel will place COPY instructions in BB, not in the
+ // predecessor of BB.
+ BasicBlock *Pred = BB->getUniquePredecessor();
+ if (!Pred || !(isa<SwitchInst>(Pred->getTerminator()) ||
+ isa<IndirectBrInst>(Pred->getTerminator())))
+ return true;
+
+ if (BB->getTerminator() != BB->getFirstNonPHIOrDbg())
+ return true;
+
+ // We use a simple cost heuristic which determines that skipping merging is
+ // profitable if the cost of skipping merging is less than the cost of
+ // merging: Cost(skipping merging) < Cost(merging BB), where the
+ // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and
+ // the Cost(merging BB) is Freq(Pred) * Cost(Copy).
+ // Assuming Cost(Copy) == Cost(Branch), we could simplify it to :
+ // Freq(Pred) / Freq(BB) > 2.
+ // Note that if there are multiple empty blocks sharing the same incoming
+ // value for the PHIs in the DestBB, we consider them together. In such
+ // case, Cost(merging BB) will be the sum of their frequencies.
+
+ if (!isa<PHINode>(DestBB->begin()))
+ return true;
+
+ SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;
+
+ // Find all other incoming blocks from which incoming values of all PHIs in
+ // DestBB are the same as the ones from BB.
+ for (BasicBlock *DestBBPred : predecessors(DestBB)) {
+ if (DestBBPred == BB)
+ continue;
+
+ if (llvm::all_of(DestBB->phis(), [&](const PHINode &DestPN) {
+ return DestPN.getIncomingValueForBlock(BB) ==
+ DestPN.getIncomingValueForBlock(DestBBPred);
+ }))
+ SameIncomingValueBBs.insert(DestBBPred);
+ }
+
+ // See if all of BB's incoming values are the same as the value from Pred.
+ // In this case, there is no reason to skip merging because COPYs are
+ // expected to be placed in Pred already.
+ if (SameIncomingValueBBs.count(Pred))
+ return true;
+
+ BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
+ BlockFrequency BBFreq = BFI->getBlockFreq(BB);
+
+ for (auto *SameValueBB : SameIncomingValueBBs)
+ if (SameValueBB->getUniquePredecessor() == Pred &&
+ DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
+ BBFreq += BFI->getBlockFreq(SameValueBB);
+
+ return PredFreq.getFrequency() <=
+ BBFreq.getFrequency() * FreqRatioToSkipMerge;
+}
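+
+// A small worked example of the heuristic above, with illustrative numbers
+// and the default -cgp-freq-ratio-to-skip-merge=2: if Freq(Pred) = 300 and
+// Freq(BB) = 100, then 300 > 100 * 2, so this returns false and the empty
+// block is kept; the copy + branch executed at BB's frequency (~200 cost
+// units) is cheaper than a COPY executed at Pred's frequency (~300 units).
+// With Freq(Pred) = 150 instead, 150 <= 200 and the merge goes ahead.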
+
+/// Return true if we can merge BB into DestBB if there is a single
+/// unconditional branch between them, and BB contains no other non-phi
+/// instructions.
+bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
+ const BasicBlock *DestBB) const {
+ // We only want to eliminate blocks whose phi nodes are used by phi nodes in
+ // the successor. If there are more complex conditions (e.g. preheaders),
+ // don't mess around with them.
+ for (const PHINode &PN : BB->phis()) {
+ for (const User *U : PN.users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != DestBB || !isa<PHINode>(UI))
+ return false;
+ // If User is inside DestBB block and it is a PHINode then check
+ // incoming value. If incoming value is not from BB then this is
+ // a complex condition (e.g. preheaders) we want to avoid here.
+ if (UI->getParent() == DestBB) {
+ if (const PHINode *UPN = dyn_cast<PHINode>(UI))
+ for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
+ Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
+ if (Insn && Insn->getParent() == BB &&
+ Insn->getParent() != UPN->getIncomingBlock(I))
+ return false;
+ }
+ }
+ }
+ }
+
+ // If BB and DestBB contain any common predecessors, then the phi nodes in BB
+ // and DestBB may have conflicting incoming values for the block. If so, we
+ // can't merge the block.
+ const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
+ if (!DestBBPN)
+ return true; // no conflict.
+
+ // Collect the preds of BB.
+ SmallPtrSet<const BasicBlock *, 16> BBPreds;
+ if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+ // It is faster to get preds from a PHI than with pred_iterator.
+ for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+ BBPreds.insert(BBPN->getIncomingBlock(i));
+ } else {
+ BBPreds.insert(pred_begin(BB), pred_end(BB));
+ }
+
+ // Walk the preds of DestBB.
+ for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
+ if (BBPreds.count(Pred)) { // Common predecessor?
+ for (const PHINode &PN : DestBB->phis()) {
+ const Value *V1 = PN.getIncomingValueForBlock(Pred);
+ const Value *V2 = PN.getIncomingValueForBlock(BB);
+
+ // If V2 is a phi node in BB, look up what the mapped value will be.
+ if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
+ if (V2PN->getParent() == BB)
+ V2 = V2PN->getIncomingValueForBlock(Pred);
+
+ // If there is a conflict, bail out.
+ if (V1 != V2)
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/// Replace all old uses with new ones, and push the updated BBs into FreshBBs.
+static void replaceAllUsesWith(Value *Old, Value *New,
+ SmallSet<BasicBlock *, 32> &FreshBBs,
+ bool IsHuge) {
+ auto *OldI = dyn_cast<Instruction>(Old);
+ if (OldI) {
+ for (Value::user_iterator UI = OldI->user_begin(), E = OldI->user_end();
+ UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (IsHuge)
+ FreshBBs.insert(User->getParent());
+ }
+ }
+ Old->replaceAllUsesWith(New);
+}
+
+/// Eliminate a basic block that has only phi's and an unconditional branch in
+/// it.
+void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
+ BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+ BasicBlock *DestBB = BI->getSuccessor(0);
+
+ LLVM_DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n"
+ << *BB << *DestBB);
+
+ // If the destination block has a single pred, then this is a trivial edge,
+ // just collapse it.
+ if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
+ if (SinglePred != DestBB) {
+ assert(SinglePred == BB &&
+ "Single predecessor not the same as predecessor");
+ // Merge DestBB into SinglePred/BB and delete it.
+ MergeBlockIntoPredecessor(DestBB);
+ // Note: BB(=SinglePred) will not be deleted on this path.
+ // DestBB(=its single successor) is the one that was deleted.
+ LLVM_DEBUG(dbgs() << "AFTER:\n" << *SinglePred << "\n\n\n");
+
+ if (IsHugeFunc) {
+ // Update FreshBBs to optimize the merged BB.
+ FreshBBs.insert(SinglePred);
+ FreshBBs.erase(DestBB);
+ }
+ return;
+ }
+ }
+
+ // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB
+ // to handle the new incoming edges it is about to have.
+ for (PHINode &PN : DestBB->phis()) {
+ // Remove the incoming value for BB, and remember it.
+ Value *InVal = PN.removeIncomingValue(BB, false);
+
+ // Two options: either the InVal is a phi node defined in BB or it is some
+ // value that dominates BB.
+ PHINode *InValPhi = dyn_cast<PHINode>(InVal);
+ if (InValPhi && InValPhi->getParent() == BB) {
+ // Add all of the input values of the input PHI as inputs of this phi.
+ for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
+ PN.addIncoming(InValPhi->getIncomingValue(i),
+ InValPhi->getIncomingBlock(i));
+ } else {
+ // Otherwise, add one instance of the dominating value for each edge that
+ // we will be adding.
+ if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+ for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+ PN.addIncoming(InVal, BBPN->getIncomingBlock(i));
+ } else {
+ for (BasicBlock *Pred : predecessors(BB))
+ PN.addIncoming(InVal, Pred);
+ }
+ }
+ }
+
+ // The PHIs are now updated, change everything that refers to BB to use
+ // DestBB and remove BB.
+ BB->replaceAllUsesWith(DestBB);
+ BB->eraseFromParent();
+ ++NumBlocksElim;
+
+ LLVM_DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+}
+
+// Computes a map of base pointer relocation instructions to corresponding
+// derived pointer relocation instructions given a vector of all relocate calls
+static void computeBaseDerivedRelocateMap(
+ const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
+ DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
+ &RelocateInstMap) {
+ // Collect information in two maps: one primarily for locating the base object
+ // while filling the second map; the second map is the final structure holding
+ // a mapping between Base and corresponding Derived relocate calls
+ DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
+ for (auto *ThisRelocate : AllRelocateCalls) {
+ auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
+ ThisRelocate->getDerivedPtrIndex());
+ RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
+ }
+ for (auto &Item : RelocateIdxMap) {
+ std::pair<unsigned, unsigned> Key = Item.first;
+ if (Key.first == Key.second)
+ // Base relocation: nothing to insert
+ continue;
+
+ GCRelocateInst *I = Item.second;
+ auto BaseKey = std::make_pair(Key.first, Key.first);
+
+ // We're iterating over RelocateIdxMap so we cannot modify it.
+ auto MaybeBase = RelocateIdxMap.find(BaseKey);
+ if (MaybeBase == RelocateIdxMap.end())
+ // TODO: We might want to insert a new base object relocate and gep off
+ // that, if there are enough derived object relocates.
+ continue;
+
+ RelocateInstMap[MaybeBase->second].push_back(I);
+ }
+}
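+
+// For example (indices chosen for illustration): given a statepoint whose gc
+// arguments hold the base pointer at index 4 and a derived pointer at index
+// 5, the relocates
+//   %base'    = gc.relocate(%tok, i32 4, i32 4)   ; key (4, 4), base
+//   %derived' = gc.relocate(%tok, i32 4, i32 5)   ; key (4, 5), derived
+// yield RelocateInstMap[%base'] = { %derived' }: the entry whose two indices
+// match is the base relocation, and every other entry sharing base index 4
+// is attached to it.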
+
+// Accepts a GEP and extracts the operands into a vector provided they're all
+// small integer constants
+static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
+ SmallVectorImpl<Value *> &OffsetV) {
+ for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
+ // Only accept small constant integer operands
+ auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!Op || Op->getZExtValue() > 20)
+ return false;
+ }
+
+ for (unsigned i = 1; i < GEP->getNumOperands(); i++)
+ OffsetV.push_back(GEP->getOperand(i));
+ return true;
+}
+
+// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
+// replace, computes a replacement, and applies it.
+static bool
+simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
+ const SmallVectorImpl<GCRelocateInst *> &Targets) {
+ bool MadeChange = false;
+ // We must ensure that the relocation of the derived pointer is defined after
+ // the relocation of the base pointer. If we find a relocation corresponding
+ // to the base that is defined earlier than the relocation of the base, then
+ // we move the relocation of the base right before the found relocation. We
+ // consider only relocations in the same basic block as the relocation of the
+ // base; relocations from other basic blocks are skipped by this optimization
+ // and we do not care about them.
+ for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
+ &*R != RelocatedBase; ++R)
+ if (auto *RI = dyn_cast<GCRelocateInst>(R))
+ if (RI->getStatepoint() == RelocatedBase->getStatepoint())
+ if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
+ RelocatedBase->moveBefore(RI);
+ break;
+ }
+
+ for (GCRelocateInst *ToReplace : Targets) {
+ assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
+ "Not relocating a derived object of the original base object");
+ if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
+ // A duplicate relocate call. TODO: coalesce duplicates.
+ continue;
+ }
+
+ if (RelocatedBase->getParent() != ToReplace->getParent()) {
+ // Base and derived relocates are in different basic blocks.
+ // In this case transform is only valid when base dominates derived
+ // relocate. However it would be too expensive to check dominance
+ // for each such relocate, so we skip the whole transformation.
+ continue;
+ }
+
+ Value *Base = ToReplace->getBasePtr();
+ auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
+ if (!Derived || Derived->getPointerOperand() != Base)
+ continue;
+
+ SmallVector<Value *, 2> OffsetV;
+ if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
+ continue;
+
+ // Create a Builder and replace the target callsite with a gep
+ assert(RelocatedBase->getNextNode() &&
+ "Should always have one since it's not a terminator");
+
+ // Insert after RelocatedBase
+ IRBuilder<> Builder(RelocatedBase->getNextNode());
+ Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+
+ // If gc_relocate does not match the actual type, cast it to the right type.
+ // In theory, there must be a bitcast after gc_relocate if the type does not
+ // match, and we should reuse it to get the derived pointer. But there could be
+ // cases like this:
+ // bb1:
+ // ...
+ // %g1 = call coldcc i8 addrspace(1)*
+ // @llvm.experimental.gc.relocate.p1i8(...) br label %merge
+ //
+ // bb2:
+ // ...
+ // %g2 = call coldcc i8 addrspace(1)*
+ // @llvm.experimental.gc.relocate.p1i8(...) br label %merge
+ //
+ // merge:
+ // %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
+ // %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
+ //
+ // In this case, we can no longer find the bitcast. So we insert a new
+ // bitcast whether there is already one or not. In this way, we can handle
+ // all cases, and the extra bitcast should be optimized away in later
+ // passes.
+ Value *ActualRelocatedBase = RelocatedBase;
+ if (RelocatedBase->getType() != Base->getType()) {
+ ActualRelocatedBase =
+ Builder.CreateBitCast(RelocatedBase, Base->getType());
+ }
+ Value *Replacement =
+ Builder.CreateGEP(Derived->getSourceElementType(), ActualRelocatedBase,
+ ArrayRef(OffsetV));
+ Replacement->takeName(ToReplace);
+ // If the newly generated derived pointer's type does not match the original
+ // derived pointer's type, cast the new derived pointer to match it. Same
+ // reasoning as above.
+ Value *ActualReplacement = Replacement;
+ if (Replacement->getType() != ToReplace->getType()) {
+ ActualReplacement =
+ Builder.CreateBitCast(Replacement, ToReplace->getType());
+ }
+ ToReplace->replaceAllUsesWith(ActualReplacement);
+ ToReplace->eraseFromParent();
+
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+// Turns this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = relocate(%tok, i32 4, i32 4)
+// %ptr' = relocate(%tok, i32 4, i32 5)
+// %val = load %ptr'
+//
+// into this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = gc.relocate(%tok, i32 4, i32 4)
+// %ptr' = gep %base' + 15
+// %val = load %ptr'
+bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {
+ bool MadeChange = false;
+ SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
+ for (auto *U : I.users())
+ if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
+ // Collect all the relocate calls associated with a statepoint
+ AllRelocateCalls.push_back(Relocate);
+
+ // We need at least one base pointer relocation + one derived pointer
+ // relocation to mangle
+ if (AllRelocateCalls.size() < 2)
+ return false;
+
+ // RelocateInstMap is a mapping from the base relocate instruction to the
+ // corresponding derived relocate instructions
+ DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
+ computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
+ if (RelocateInstMap.empty())
+ return false;
+
+ for (auto &Item : RelocateInstMap)
+ // Item.first is the RelocatedBase to offset against
+ // Item.second is the vector of Targets to replace
+ MadeChange = simplifyRelocatesOffABase(Item.first, Item.second);
+ return MadeChange;
+}
+
+/// Sink the specified cast instruction into its user blocks.
+static bool SinkCast(CastInst *CI) {
+ BasicBlock *DefBB = CI->getParent();
+
+ /// InsertedCasts - Only insert a cast in each block once.
+ DenseMap<BasicBlock *, CastInst *> InsertedCasts;
+
+ bool MadeChange = false;
+ for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
+ UI != E;) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Figure out which BB this cast is used in. For PHI's this is the
+ // appropriate predecessor block.
+ BasicBlock *UserBB = User->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ UserBB = PN->getIncomingBlock(TheUse);
+ }
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ // The first insertion point of a block containing an EH pad is after the
+ // pad. If the pad is the user, we cannot sink the cast past the pad.
+ if (User->isEHPad())
+ continue;
+
+ // If the block selected to receive the cast is an EH pad that does not
+ // allow non-PHI instructions before the terminator, we can't sink the
+ // cast.
+ if (UserBB->getTerminator()->isEHPad())
+ continue;
+
+ // If this user is in the same block as the cast, don't change the cast.
+ if (UserBB == DefBB)
+ continue;
+
+ // If we have already inserted a cast into this block, use it.
+ CastInst *&InsertedCast = InsertedCasts[UserBB];
+
+ if (!InsertedCast) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+ InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
+ CI->getType(), "", &*InsertPt);
+ InsertedCast->setDebugLoc(CI->getDebugLoc());
+ }
+
+ // Replace a use of the cast with a use of the new cast.
+ TheUse = InsertedCast;
+ MadeChange = true;
+ ++NumCastUses;
+ }
+
+ // If we removed all uses, nuke the cast.
+ if (CI->use_empty()) {
+ salvageDebugInfo(*CI);
+ CI->eraseFromParent();
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// If the specified cast instruction is a noop copy (e.g. it's casting from
+/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
+/// reduce the number of virtual registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
+ const DataLayout &DL) {
+ // Sink only "cheap" (or nop) address-space casts. This is a weaker condition
+ // than sinking only nop casts, but is helpful on some platforms.
+ if (auto *ASC = dyn_cast<AddrSpaceCastInst>(CI)) {
+ if (!TLI.isFreeAddrSpaceCast(ASC->getSrcAddressSpace(),
+ ASC->getDestAddressSpace()))
+ return false;
+ }
+
+ // If this is a noop copy,
+ EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, CI->getType());
+
+ // Is this an fp<->int conversion?
+ if (SrcVT.isInteger() != DstVT.isInteger())
+ return false;
+
+ // If this is an extension, it will be a zero or sign extension, which
+ // isn't a noop.
+ if (SrcVT.bitsLT(DstVT))
+ return false;
+
+ // If these values will be promoted, find out what they will be promoted
+ // to. This helps us consider truncates on PPC as noop copies when they
+ // are.
+ if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
+ TargetLowering::TypePromoteInteger)
+ SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
+ if (TLI.getTypeAction(CI->getContext(), DstVT) ==
+ TargetLowering::TypePromoteInteger)
+ DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
+
+ // If, after promotion, these are the same types, this is a noop copy.
+ if (SrcVT != DstVT)
+ return false;
+
+ return SinkCast(CI);
+}
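+
+// For illustration (hypothetical blocks %def and %use): on a target where i8
+// is promoted to i32, %t = trunc i32 %x to i8 is a noop copy. If %t is
+// defined in %def but only used in %use, SinkCast recreates the trunc in
+// %use and rewrites the use there, so no extra virtual register needs to be
+// live across the block boundary and coalesced later.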
+
+// Match a simple increment by constant operation. Note that if a sub is
+// matched, the step is negated (as if the step had been canonicalized to
+// an add, even though we leave the instruction alone.)
+bool matchIncrement(const Instruction *IVInc, Instruction *&LHS,
+ Constant *&Step) {
+ if (match(IVInc, m_Add(m_Instruction(LHS), m_Constant(Step))) ||
+ match(IVInc, m_ExtractValue<0>(m_Intrinsic<Intrinsic::uadd_with_overflow>(
+ m_Instruction(LHS), m_Constant(Step)))))
+ return true;
+ if (match(IVInc, m_Sub(m_Instruction(LHS), m_Constant(Step))) ||
+ match(IVInc, m_ExtractValue<0>(m_Intrinsic<Intrinsic::usub_with_overflow>(
+ m_Instruction(LHS), m_Constant(Step))))) {
+ Step = ConstantExpr::getNeg(Step);
+ return true;
+ }
+ return false;
+}
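+
+// For example (illustrative IR): %iv.next = add i32 %iv, 1 matches with
+// LHS = %iv and Step = i32 1, while %iv.next = sub i32 %iv, 4 matches with
+// LHS = %iv and Step = i32 -4, i.e. the step is reported as if the sub had
+// been rewritten as an add.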
+
+/// If given \p PN is an inductive variable with value IVInc coming from the
+/// backedge, and on each iteration it gets increased by Step, return pair
+/// <IVInc, Step>. Otherwise, return std::nullopt.
+static std::optional<std::pair<Instruction *, Constant *>>
+getIVIncrement(const PHINode *PN, const LoopInfo *LI) {
+ const Loop *L = LI->getLoopFor(PN->getParent());
+ if (!L || L->getHeader() != PN->getParent() || !L->getLoopLatch())
+ return std::nullopt;
+ auto *IVInc =
+ dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
+ if (!IVInc || LI->getLoopFor(IVInc->getParent()) != L)
+ return std::nullopt;
+ Instruction *LHS = nullptr;
+ Constant *Step = nullptr;
+ if (matchIncrement(IVInc, LHS, Step) && LHS == PN)
+ return std::make_pair(IVInc, Step);
+ return std::nullopt;
+}
+
+static bool isIVIncrement(const Value *V, const LoopInfo *LI) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+ Instruction *LHS = nullptr;
+ Constant *Step = nullptr;
+ if (!matchIncrement(I, LHS, Step))
+ return false;
+ if (auto *PN = dyn_cast<PHINode>(LHS))
+ if (auto IVInc = getIVIncrement(PN, LI))
+ return IVInc->first == I;
+ return false;
+}
+
+bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
+ Value *Arg0, Value *Arg1,
+ CmpInst *Cmp,
+ Intrinsic::ID IID) {
+ auto IsReplacableIVIncrement = [this, &Cmp](BinaryOperator *BO) {
+ if (!isIVIncrement(BO, LI))
+ return false;
+ const Loop *L = LI->getLoopFor(BO->getParent());
+ assert(L && "L should not be null after isIVIncrement()");
+ // Do not risk moving the increment into a child loop.
+ if (LI->getLoopFor(Cmp->getParent()) != L)
+ return false;
+
+ // Finally, we need to ensure that the insert point will dominate all
+ // existing uses of the increment.
+
+ auto &DT = getDT(*BO->getParent()->getParent());
+ if (DT.dominates(Cmp->getParent(), BO->getParent()))
+ // If we're moving up the dom tree, all uses are trivially dominated.
+ // (This is the common case for code produced by LSR.)
+ return true;
+
+ // Otherwise, special case the single use in the phi recurrence.
+ return BO->hasOneUse() && DT.dominates(Cmp->getParent(), L->getLoopLatch());
+ };
+ if (BO->getParent() != Cmp->getParent() && !IsReplacableIVIncrement(BO)) {
+ // We used to use a dominator tree here to allow multi-block optimization.
+ // But that was problematic because:
+ // 1. It could cause a perf regression by hoisting the math op into the
+ // critical path.
+ // 2. It could cause a perf regression by creating a value that was live
+ // across multiple blocks and increasing register pressure.
+ // 3. Use of a dominator tree could cause large compile-time regression.
+ // This is because we recompute the DT on every change in the main CGP
+ // run-loop. The recomputing is probably unnecessary in many cases, so if
+ // that was fixed, using a DT here would be ok.
+ //
+ // There is one important particular case we still want to handle: if BO is
+ // the IV increment. Important properties that make it profitable:
+ // - We can speculate IV increment anywhere in the loop (as long as the
+ // indvar Phi is its only user);
+ // - Upon computing Cmp, we effectively compute something equivalent to the
+ // IV increment (despite it being spelled differently in the IR). So moving
+ // it up to the cmp point does not really increase register pressure.
+ return false;
+ }
+
+ // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
+ if (BO->getOpcode() == Instruction::Add &&
+ IID == Intrinsic::usub_with_overflow) {
+ assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
+ Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
+ }
+
+ // Insert at the first instruction of the pair.
+ Instruction *InsertPt = nullptr;
+ for (Instruction &Iter : *Cmp->getParent()) {
+ // If BO is an XOR, it is not guaranteed that it comes after both inputs to
+ // the overflow intrinsic are defined.
+ if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
+ InsertPt = &Iter;
+ break;
+ }
+ }
+ assert(InsertPt != nullptr && "Parent block did not contain cmp or binop");
+
+ IRBuilder<> Builder(InsertPt);
+ Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
+ if (BO->getOpcode() != Instruction::Xor) {
+ Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
+ replaceAllUsesWith(BO, Math, FreshBBs, IsHugeFunc);
+ } else
+ assert(BO->hasOneUse() &&
+ "Patterns with XOr should use the BO only in the compare");
+ Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
+ replaceAllUsesWith(Cmp, OV, FreshBBs, IsHugeFunc);
+ Cmp->eraseFromParent();
+ BO->eraseFromParent();
+ return true;
+}
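+
+// A sketch of the rewrite performed above (value names are illustrative),
+// here for IID == Intrinsic::uadd_with_overflow: the pair
+//   %math = add i32 %x, %y
+//   %ov   = icmp ult i32 %math, %x
+// becomes
+//   %mo   = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
+//   %math = extractvalue { i32, i1 } %mo, 0
+//   %ov   = extractvalue { i32, i1 } %mo, 1
+// after which the original add and icmp are erased.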
+
+/// Match special-case patterns that check for unsigned add overflow.
+static bool matchUAddWithOverflowConstantEdgeCases(CmpInst *Cmp,
+ BinaryOperator *&Add) {
+ // Add = add A, 1; Cmp = icmp eq A,-1 (overflow if A is max val)
+ // Add = add A,-1; Cmp = icmp ne A, 0 (overflow if A is non-zero)
+ Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
+
+ // We are not expecting non-canonical/degenerate code. Just bail out.
+ if (isa<Constant>(A))
+ return false;
+
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
+ if (Pred == ICmpInst::ICMP_EQ && match(B, m_AllOnes()))
+ B = ConstantInt::get(B->getType(), 1);
+ else if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt()))
+ B = ConstantInt::get(B->getType(), -1);
+ else
+ return false;
+
+ // Check the users of the variable operand of the compare looking for an add
+ // with the adjusted constant.
+ for (User *U : A->users()) {
+ if (match(U, m_Add(m_Specific(A), m_Specific(B)))) {
+ Add = cast<BinaryOperator>(U);
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Try to combine the compare into a call to the llvm.uadd.with.overflow
+/// intrinsic. Return true if any changes were made.
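+///
+/// As an illustrative sketch (value names are made up here), the rewrite turns
+///   %add = add i64 %a, %b
+///   %ov  = icmp ult i64 %add, %a
+/// into
+///   %m   = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+///   %add = extractvalue { i64, i1 } %m, 0
+///   %ov  = extractvalue { i64, i1 } %m, 1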
+bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
+ ModifyDT &ModifiedDT) {
+ bool EdgeCase = false;
+ Value *A, *B;
+ BinaryOperator *Add;
+ if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
+ if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
+ return false;
+    // A and B come from the add matched by
+    // matchUAddWithOverflowConstantEdgeCases above.
+ A = Add->getOperand(0);
+ B = Add->getOperand(1);
+ EdgeCase = true;
+ }
+
+ if (!TLI->shouldFormOverflowOp(ISD::UADDO,
+ TLI->getValueType(*DL, Add->getType()),
+ Add->hasNUsesOrMore(EdgeCase ? 1 : 2)))
+ return false;
+
+ // We don't want to move around uses of condition values this late, so we
+ // check if it is legal to create the call to the intrinsic in the basic
+ // block containing the icmp.
+ if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
+ return false;
+
+ if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp,
+ Intrinsic::uadd_with_overflow))
+ return false;
+
+ // Reset callers - do not crash by iterating over a dead instruction.
+ ModifiedDT = ModifyDT::ModifyInstDT;
+ return true;
+}
+
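+/// Try to combine the compare into a call to the llvm.usub.with.overflow
+/// intrinsic, mirroring the uadd case above. As an illustrative sketch (value
+/// names are made up here), the rewrite turns
+///   %sub = sub i32 %a, %b
+///   %ov  = icmp ult i32 %a, %b
+/// into
+///   %m   = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+///   %sub = extractvalue { i32, i1 } %m, 0
+///   %ov  = extractvalue { i32, i1 } %m, 1
+/// Return true if any changes were made.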
+bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
+ ModifyDT &ModifiedDT) {
+ // We are not expecting non-canonical/degenerate code. Just bail out.
+ Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
+ if (isa<Constant>(A) && isa<Constant>(B))
+ return false;
+
+ // Convert (A u> B) to (A u< B) to simplify pattern matching.
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
+ if (Pred == ICmpInst::ICMP_UGT) {
+ std::swap(A, B);
+ Pred = ICmpInst::ICMP_ULT;
+ }
+ // Convert special-case: (A == 0) is the same as (A u< 1).
+ if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
+ B = ConstantInt::get(B->getType(), 1);
+ Pred = ICmpInst::ICMP_ULT;
+ }
+ // Convert special-case: (A != 0) is the same as (0 u< A).
+ if (Pred == ICmpInst::ICMP_NE && match(B, m_ZeroInt())) {
+ std::swap(A, B);
+ Pred = ICmpInst::ICMP_ULT;
+ }
+ if (Pred != ICmpInst::ICMP_ULT)
+ return false;
+
+ // Walk the users of a variable operand of a compare looking for a subtract or
+ // add with that same operand. Also match the 2nd operand of the compare to
+ // the add/sub, but that may be a negated constant operand of an add.
+ Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
+ BinaryOperator *Sub = nullptr;
+ for (User *U : CmpVariableOperand->users()) {
+ // A - B, A u< B --> usubo(A, B)
+ if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) {
+ Sub = cast<BinaryOperator>(U);
+ break;
+ }
+
+ // A + (-C), A u< C (canonicalized form of (sub A, C))
+ const APInt *CmpC, *AddC;
+ if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
+ match(B, m_APInt(CmpC)) && *AddC == -(*CmpC)) {
+ Sub = cast<BinaryOperator>(U);
+ break;
+ }
+ }
+ if (!Sub)
+ return false;
+
+ if (!TLI->shouldFormOverflowOp(ISD::USUBO,
+ TLI->getValueType(*DL, Sub->getType()),
+ Sub->hasNUsesOrMore(1)))
+ return false;
+
+ if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1),
+ Cmp, Intrinsic::usub_with_overflow))
+ return false;
+
+ // Reset callers - do not crash by iterating over a dead instruction.
+ ModifiedDT = ModifyDT::ModifyInstDT;
+ return true;
+}
+
+/// Sink the given CmpInst into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced. This is a clear win except on
+/// targets with multiple condition code registers (PowerPC), where it might
+/// lose; some adjustment may be wanted there.
+///
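+/// As an illustrative sketch (block and value names are made up here), a
+/// compare defined in one block and used in another
+///   DefBB:
+///     %c = icmp eq i32 %x, %y
+///     ...
+///   UseBB:
+///     br i1 %c, label %T, label %F
+/// is duplicated next to its user
+///   UseBB:
+///     %c.sunk = icmp eq i32 %x, %y
+///     br i1 %c.sunk, label %T, label %F
+/// and the original compare is erased once all of its uses have been rewritten.
+///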
+/// Return true if any changes are made.
+static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
+ if (TLI.hasMultipleConditionRegisters())
+ return false;
+
+ // Avoid sinking soft-FP comparisons, since this can move them into a loop.
+ if (TLI.useSoftFloat() && isa<FCmpInst>(Cmp))
+ return false;
+
+ // Only insert a cmp in each block once.
+ DenseMap<BasicBlock *, CmpInst *> InsertedCmps;
+
+ bool MadeChange = false;
+ for (Value::user_iterator UI = Cmp->user_begin(), E = Cmp->user_end();
+ UI != E;) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ // Don't bother for PHI nodes.
+ if (isa<PHINode>(User))
+ continue;
+
+ // Figure out which BB this cmp is used in.
+ BasicBlock *UserBB = User->getParent();
+ BasicBlock *DefBB = Cmp->getParent();
+
+ // If this user is in the same block as the cmp, don't change the cmp.
+ if (UserBB == DefBB)
+ continue;
+
+ // If we have already inserted a cmp into this block, use it.
+ CmpInst *&InsertedCmp = InsertedCmps[UserBB];
+
+ if (!InsertedCmp) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+ InsertedCmp = CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(),
+ Cmp->getOperand(0), Cmp->getOperand(1), "",
+ &*InsertPt);
+ // Propagate the debug info.
+ InsertedCmp->setDebugLoc(Cmp->getDebugLoc());
+ }
+
+ // Replace a use of the cmp with a use of the new cmp.
+ TheUse = InsertedCmp;
+ MadeChange = true;
+ ++NumCmpUses;
+ }
+
+ // If we removed all uses, nuke the cmp.
+ if (Cmp->use_empty()) {
+ Cmp->eraseFromParent();
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// For a pattern like:
+///
+/// DomCond = icmp sgt/slt CmpOp0, CmpOp1 (might not be in DomBB)
+/// ...
+/// DomBB:
+/// ...
+/// br DomCond, TrueBB, CmpBB
+/// CmpBB: (with DomBB being the single predecessor)
+/// ...
+/// Cmp = icmp eq CmpOp0, CmpOp1
+/// ...
+///
+/// On targets where the lowering of icmp sgt/slt differs from the lowering of
+/// icmp eq (PowerPC), this would require two comparisons. This function tries
+/// to convert 'Cmp = icmp eq CmpOp0, CmpOp1' into 'Cmp = icmp slt/sgt CmpOp0,
+/// CmpOp1'. After that, DomCond and Cmp can share the same comparison,
+/// eliminating one of them.
+///
+/// Return true if any changes are made.
+static bool foldICmpWithDominatingICmp(CmpInst *Cmp,
+ const TargetLowering &TLI) {
+ if (!EnableICMP_EQToICMP_ST && TLI.isEqualityCmpFoldedWithSignedCmp())
+ return false;
+
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
+ if (Pred != ICmpInst::ICMP_EQ)
+ return false;
+
+ // If icmp eq has users other than BranchInst and SelectInst, converting it to
+ // icmp slt/sgt would introduce more redundant LLVM IR.
+ for (User *U : Cmp->users()) {
+ if (isa<BranchInst>(U))
+ continue;
+ if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == Cmp)
+ continue;
+ return false;
+ }
+
+ // This is a cheap/incomplete check for dominance - just match a single
+ // predecessor with a conditional branch.
+ BasicBlock *CmpBB = Cmp->getParent();
+ BasicBlock *DomBB = CmpBB->getSinglePredecessor();
+ if (!DomBB)
+ return false;
+
+ // We want to ensure that the only way control gets to the comparison of
+ // interest is that a less/greater than comparison on the same operands is
+ // false.
+ Value *DomCond;
+ BasicBlock *TrueBB, *FalseBB;
+ if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
+ return false;
+ if (CmpBB != FalseBB)
+ return false;
+
+ Value *CmpOp0 = Cmp->getOperand(0), *CmpOp1 = Cmp->getOperand(1);
+ ICmpInst::Predicate DomPred;
+ if (!match(DomCond, m_ICmp(DomPred, m_Specific(CmpOp0), m_Specific(CmpOp1))))
+ return false;
+ if (DomPred != ICmpInst::ICMP_SGT && DomPred != ICmpInst::ICMP_SLT)
+ return false;
+
+ // Convert the equality comparison to the opposite of the dominating
+ // comparison and swap the direction for all branch/select users.
+ // We have conceptually converted:
+ // Res = (a < b) ? <LT_RES> : (a == b) ? <EQ_RES> : <GT_RES>;
+ // to
+ // Res = (a < b) ? <LT_RES> : (a > b) ? <GT_RES> : <EQ_RES>;
+ // And similarly for branches.
+ for (User *U : Cmp->users()) {
+ if (auto *BI = dyn_cast<BranchInst>(U)) {
+ assert(BI->isConditional() && "Must be conditional");
+ BI->swapSuccessors();
+ continue;
+ }
+ if (auto *SI = dyn_cast<SelectInst>(U)) {
+ // Swap operands
+ SI->swapValues();
+ SI->swapProfMetadata();
+ continue;
+ }
+ llvm_unreachable("Must be a branch or a select");
+ }
+ Cmp->setPredicate(CmpInst::getSwappedPredicate(DomPred));
+ return true;
+}
+
+/// Many architectures use the same instruction for both subtract and cmp. Try
+/// to swap cmp operands to match subtract operations to allow for CSE.
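+///
+/// As an illustrative sketch (value names are made up here), given
+///   %d = sub i32 %b, %a
+///   %c = icmp ult i32 %a, %b
+/// swapping the compare into 'icmp ugt i32 %b, %a' gives it the same operand
+/// order as the subtract, so the two can be CSE'd on such targets.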
+static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {
+ Value *Op0 = Cmp->getOperand(0);
+ Value *Op1 = Cmp->getOperand(1);
+ if (!Op0->getType()->isIntegerTy() || isa<Constant>(Op0) ||
+ isa<Constant>(Op1) || Op0 == Op1)
+ return false;
+
+ // If a subtract already has the same operands as a compare, swapping would be
+ // bad. If a subtract has the same operands as a compare but in reverse order,
+ // then swapping is good.
+ int GoodToSwap = 0;
+ unsigned NumInspected = 0;
+ for (const User *U : Op0->users()) {
+ // Avoid walking many users.
+ if (++NumInspected > 128)
+ return false;
+ if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
+ GoodToSwap++;
+ else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
+ GoodToSwap--;
+ }
+
+ if (GoodToSwap > 0) {
+ Cmp->swapOperands();
+ return true;
+ }
+ return false;
+}
+
+bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
+ if (sinkCmpExpression(Cmp, *TLI))
+ return true;
+
+ if (combineToUAddWithOverflow(Cmp, ModifiedDT))
+ return true;
+
+ if (combineToUSubWithOverflow(Cmp, ModifiedDT))
+ return true;
+
+ if (foldICmpWithDominatingICmp(Cmp, *TLI))
+ return true;
+
+ if (swapICmpOperandsToExposeCSEOpportunities(Cmp))
+ return true;
+
+ return false;
+}
+
+/// Duplicate and sink the given 'and' instruction into user blocks where it is
+/// used in a compare to allow isel to generate better code for targets where
+/// this operation can be combined.
+///
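+/// As an illustrative sketch (block and value names are made up here), an
+/// 'and' feeding only zero-compares in other blocks
+///   DefBB:
+///     %m = and i32 %x, 255
+///   UseBB:
+///     %c = icmp eq i32 %m, 0
+/// is duplicated into each user block
+///   UseBB:
+///     %m.dup = and i32 %x, 255
+///     %c = icmp eq i32 %m.dup, 0
+/// so isel can fold the mask and the compare together.
+///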
+/// Return true if any changes are made.
+static bool sinkAndCmp0Expression(Instruction *AndI, const TargetLowering &TLI,
+ SetOfInstrs &InsertedInsts) {
+ // Double-check that we're not trying to optimize an instruction that was
+ // already optimized by some other part of this pass.
+ assert(!InsertedInsts.count(AndI) &&
+ "Attempting to optimize already optimized and instruction");
+ (void)InsertedInsts;
+
+ // Nothing to do for single use in same basic block.
+ if (AndI->hasOneUse() &&
+ AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
+ return false;
+
+ // Try to avoid cases where sinking/duplicating is likely to increase register
+ // pressure.
+ if (!isa<ConstantInt>(AndI->getOperand(0)) &&
+ !isa<ConstantInt>(AndI->getOperand(1)) &&
+ AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
+ return false;
+
+ for (auto *U : AndI->users()) {
+ Instruction *User = cast<Instruction>(U);
+
+ // Only sink 'and' feeding icmp with 0.
+ if (!isa<ICmpInst>(User))
+ return false;
+
+ auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
+ if (!CmpC || !CmpC->isZero())
+ return false;
+ }
+
+ if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+ LLVM_DEBUG(AndI->getParent()->dump());
+
+ // Push the 'and' into the same block as the icmp 0. There should only be
+ // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
+ // others, so we don't need to keep track of which BBs we insert into.
+ for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
+ UI != E;) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ LLVM_DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+
+ // Keep the 'and' in the same place if the use is already in the same block.
+ Instruction *InsertPt =
+ User->getParent() == AndI->getParent() ? AndI : User;
+ Instruction *InsertedAnd =
+ BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
+ AndI->getOperand(1), "", InsertPt);
+ // Propagate the debug info.
+ InsertedAnd->setDebugLoc(AndI->getDebugLoc());
+
+ // Replace a use of the 'and' with a use of the new 'and'.
+ TheUse = InsertedAnd;
+ ++NumAndUses;
+ LLVM_DEBUG(User->getParent()->dump());
+ }
+
+ // We removed all uses, nuke the and.
+ AndI->eraseFromParent();
+ return true;
+}
+
+/// Check if the candidates could be combined with a shift instruction, which
+/// includes:
+/// 1. A truncate instruction
+/// 2. An 'and' instruction where the immediate is a mask of the low bits:
+///    imm & (imm+1) == 0
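+///    For example, 0x00ff qualifies as a low-bit mask (0x00ff & 0x0100 == 0),
+///    while 0x00f0 does not (0x00f0 & 0x00f1 == 0x00f0).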
+static bool isExtractBitsCandidateUse(Instruction *User) {
+ if (!isa<TruncInst>(User)) {
+ if (User->getOpcode() != Instruction::And ||
+ !isa<ConstantInt>(User->getOperand(1)))
+ return false;
+
+ const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();
+
+ if ((Cimm & (Cimm + 1)).getBoolValue())
+ return false;
+ }
+ return true;
+}
+
+/// Sink both shift and truncate instruction to the use of truncate's BB.
+static bool
+SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
+ DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
+ const TargetLowering &TLI, const DataLayout &DL) {
+ BasicBlock *UserBB = User->getParent();
+ DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
+ auto *TruncI = cast<TruncInst>(User);
+ bool MadeChange = false;
+
+ for (Value::user_iterator TruncUI = TruncI->user_begin(),
+ TruncE = TruncI->user_end();
+ TruncUI != TruncE;) {
+
+ Use &TruncTheUse = TruncUI.getUse();
+ Instruction *TruncUser = cast<Instruction>(*TruncUI);
+ // Preincrement use iterator so we don't invalidate it.
+
+ ++TruncUI;
+
+ int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
+ if (!ISDOpcode)
+ continue;
+
+ // If the use is actually a legal node, there will not be an
+ // implicit truncate.
+ // FIXME: always querying the result type is just an
+ // approximation; some nodes' legality is determined by the
+ // operand or other means. There's no good way to find out though.
+ if (TLI.isOperationLegalOrCustom(
+ ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
+ continue;
+
+ // Don't bother for PHI nodes.
+ if (isa<PHINode>(TruncUser))
+ continue;
+
+ BasicBlock *TruncUserBB = TruncUser->getParent();
+
+ if (UserBB == TruncUserBB)
+ continue;
+
+ BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
+ CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
+
+ if (!InsertedShift && !InsertedTrunc) {
+ BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
+ assert(InsertPt != TruncUserBB->end());
+ // Sink the shift
+ if (ShiftI->getOpcode() == Instruction::AShr)
+ InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
+ "", &*InsertPt);
+ else
+ InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
+ "", &*InsertPt);
+ InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
+
+ // Sink the trunc
+ BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
+ TruncInsertPt++;
+ assert(TruncInsertPt != TruncUserBB->end());
+
+ InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
+ TruncI->getType(), "", &*TruncInsertPt);
+ InsertedTrunc->setDebugLoc(TruncI->getDebugLoc());
+
+ MadeChange = true;
+
+ TruncTheUse = InsertedTrunc;
+ }
+ }
+ return MadeChange;
+}
+
+/// Sink the shift *right* instruction into user blocks if the uses could
+/// potentially be combined with this shift instruction to generate a
+/// BitExtract instruction. This is only applied if the architecture supports
+/// BitExtract instructions. Here is an example:
+/// BB1:
+/// %x.extract.shift = lshr i64 %arg1, 32
+/// BB2:
+/// %x.extract.trunc = trunc i64 %x.extract.shift to i16
+/// ==>
+///
+/// BB2:
+/// %x.extract.shift.1 = lshr i64 %arg1, 32
+/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
+///
+/// CodeGen will recognize the pattern in BB2 and generate BitExtract
+/// instruction.
+/// Return true if any changes are made.
+static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
+ const TargetLowering &TLI,
+ const DataLayout &DL) {
+ BasicBlock *DefBB = ShiftI->getParent();
+
+ /// Only insert instructions in each block once.
+ DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
+
+ bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
+
+ bool MadeChange = false;
+ for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
+ UI != E;) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ // Don't bother for PHI nodes.
+ if (isa<PHINode>(User))
+ continue;
+
+ if (!isExtractBitsCandidateUse(User))
+ continue;
+
+ BasicBlock *UserBB = User->getParent();
+
+ if (UserBB == DefBB) {
+      // If the shift and truncate instructions are in the same BB, the use of
+      // the truncate (TruncUse) may still introduce another truncate if its
+      // type is not legal. In this case, we would like to sink both the shift
+      // and the truncate into the BB of TruncUse.
+ // for example:
+ // BB1:
+ // i64 shift.result = lshr i64 opnd, imm
+ // trunc.result = trunc shift.result to i16
+ //
+ // BB2:
+ // ----> We will have an implicit truncate here if the architecture does
+ // not have i16 compare.
+ // cmp i16 trunc.result, opnd2
+ //
+ if (isa<TruncInst>(User) &&
+ shiftIsLegal
+ // If the type of the truncate is legal, no truncate will be
+ // introduced in other basic blocks.
+ && (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
+ MadeChange =
+ SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
+
+ continue;
+ }
+ // If we have already inserted a shift into this block, use it.
+ BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
+
+ if (!InsertedShift) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+
+ if (ShiftI->getOpcode() == Instruction::AShr)
+ InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
+ "", &*InsertPt);
+ else
+ InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
+ "", &*InsertPt);
+ InsertedShift->setDebugLoc(ShiftI->getDebugLoc());
+
+ MadeChange = true;
+ }
+
+ // Replace a use of the shift with a use of the new shift.
+ TheUse = InsertedShift;
+ }
+
+ // If we removed all uses, or there are none, nuke the shift.
+ if (ShiftI->use_empty()) {
+ salvageDebugInfo(*ShiftI);
+ ShiftI->eraseFromParent();
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// If counting leading or trailing zeros is an expensive operation and a zero
+/// input is defined, add a check for zero to avoid calling the intrinsic.
+///
+/// We want to transform:
+/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
+///
+/// into:
+/// entry:
+/// %cmpz = icmp eq i64 %A, 0
+/// br i1 %cmpz, label %cond.end, label %cond.false
+/// cond.false:
+/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
+/// br label %cond.end
+/// cond.end:
+/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
+///
+/// If the transform is performed, return true and set ModifiedDT to true.
+static bool despeculateCountZeros(IntrinsicInst *CountZeros,
+ LoopInfo &LI,
+ const TargetLowering *TLI,
+ const DataLayout *DL, ModifyDT &ModifiedDT,
+ SmallSet<BasicBlock *, 32> &FreshBBs,
+ bool IsHugeFunc) {
+ // If a zero input is undefined, it doesn't make sense to despeculate that.
+ if (match(CountZeros->getOperand(1), m_One()))
+ return false;
+
+ // If it's cheap to speculate, there's nothing to do.
+ Type *Ty = CountZeros->getType();
+ auto IntrinsicID = CountZeros->getIntrinsicID();
+ if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) ||
+ (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
+ return false;
+
+ // Only handle legal scalar cases. Anything else requires too much work.
+ unsigned SizeInBits = Ty->getScalarSizeInBits();
+ if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
+ return false;
+
+ // Bail if the value is never zero.
+ Use &Op = CountZeros->getOperandUse(0);
+ if (isKnownNonZero(Op, *DL))
+ return false;
+
+ // The intrinsic will be sunk behind a compare against zero and branch.
+ BasicBlock *StartBlock = CountZeros->getParent();
+ BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
+ if (IsHugeFunc)
+ FreshBBs.insert(CallBlock);
+
+ // Create another block after the count zero intrinsic. A PHI will be added
+ // in this block to select the result of the intrinsic or the bit-width
+ // constant if the input to the intrinsic is zero.
+ BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
+ BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
+ if (IsHugeFunc)
+ FreshBBs.insert(EndBlock);
+
+ // Update the LoopInfo. The new blocks are in the same loop as the start
+ // block.
+ if (Loop *L = LI.getLoopFor(StartBlock)) {
+ L->addBasicBlockToLoop(CallBlock, LI);
+ L->addBasicBlockToLoop(EndBlock, LI);
+ }
+
+ // Set up a builder to create a compare, conditional branch, and PHI.
+ IRBuilder<> Builder(CountZeros->getContext());
+ Builder.SetInsertPoint(StartBlock->getTerminator());
+ Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
+
+ // Replace the unconditional branch that was created by the first split with
+ // a compare against zero and a conditional branch.
+ Value *Zero = Constant::getNullValue(Ty);
+ // Avoid introducing branch on poison. This also replaces the ctz operand.
+ if (!isGuaranteedNotToBeUndefOrPoison(Op))
+ Op = Builder.CreateFreeze(Op, Op->getName() + ".fr");
+ Value *Cmp = Builder.CreateICmpEQ(Op, Zero, "cmpz");
+ Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
+ StartBlock->getTerminator()->eraseFromParent();
+
+ // Create a PHI in the end block to select either the output of the intrinsic
+ // or the bit width of the operand.
+ Builder.SetInsertPoint(&EndBlock->front());
+ PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
+ replaceAllUsesWith(CountZeros, PN, FreshBBs, IsHugeFunc);
+ Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
+ PN->addIncoming(BitWidth, StartBlock);
+ PN->addIncoming(CountZeros, CallBlock);
+
+ // We are explicitly handling the zero case, so we can set the intrinsic's
+ // undefined zero argument to 'true'. This will also prevent reprocessing the
+ // intrinsic; we only despeculate when a zero input is defined.
+ CountZeros->setArgOperand(1, Builder.getTrue());
+ ModifiedDT = ModifyDT::ModifyBBDT;
+ return true;
+}
+
+bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
+ BasicBlock *BB = CI->getParent();
+
+ // Lower inline assembly if we can.
+  // If we found an inline asm expression, and if the target knows how to
+ // lower it to normal LLVM code, do so now.
+ if (CI->isInlineAsm()) {
+ if (TLI->ExpandInlineAsm(CI)) {
+ // Avoid invalidating the iterator.
+ CurInstIterator = BB->begin();
+ // Avoid processing instructions out of order, which could cause
+ // reuse before a value is defined.
+ SunkAddrs.clear();
+ return true;
+ }
+ // Sink address computing for memory operands into the block.
+ if (optimizeInlineAsmInst(CI))
+ return true;
+ }
+
+ // Align the pointer arguments to this call if the target thinks it's a good
+  // idea.
+ unsigned MinSize;
+ Align PrefAlign;
+ if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
+ for (auto &Arg : CI->args()) {
+ // We want to align both objects whose address is used directly and
+ // objects whose address is used in casts and GEPs, though it only makes
+ // sense for GEPs if the offset is a multiple of the desired alignment and
+ // if size - offset meets the size threshold.
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ APInt Offset(DL->getIndexSizeInBits(
+ cast<PointerType>(Arg->getType())->getAddressSpace()),
+ 0);
+ Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
+ uint64_t Offset2 = Offset.getLimitedValue();
+ if (!isAligned(PrefAlign, Offset2))
+ continue;
+ AllocaInst *AI;
+ if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlign() < PrefAlign &&
+ DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
+ AI->setAlignment(PrefAlign);
+ // Global variables can only be aligned if they are defined in this
+ // object (i.e. they are uniquely initialized in this object), and
+ // over-aligning global variables that have an explicit section is
+ // forbidden.
+ GlobalVariable *GV;
+ if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
+ GV->getPointerAlignment(*DL) < PrefAlign &&
+ DL->getTypeAllocSize(GV->getValueType()) >= MinSize + Offset2)
+ GV->setAlignment(PrefAlign);
+ }
+ }
+ // If this is a memcpy (or similar) then we may be able to improve the
+ // alignment.
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+ Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
+ MaybeAlign MIDestAlign = MI->getDestAlign();
+ if (!MIDestAlign || DestAlign > *MIDestAlign)
+ MI->setDestAlignment(DestAlign);
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ MaybeAlign MTISrcAlign = MTI->getSourceAlign();
+ Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
+ if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
+ MTI->setSourceAlignment(SrcAlign);
+ }
+ }
+
+ // If we have a cold call site, try to sink addressing computation into the
+ // cold block. This interacts with our handling for loads and stores to
+ // ensure that we can fold all uses of a potential addressing computation
+ // into their uses. TODO: generalize this to work over profiling data
+ if (CI->hasFnAttr(Attribute::Cold) && !OptSize &&
+ !llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
+ for (auto &Arg : CI->args()) {
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned AS = Arg->getType()->getPointerAddressSpace();
+ if (optimizeMemoryInst(CI, Arg, Arg->getType(), AS))
+ return true;
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::assume:
+ llvm_unreachable("llvm.assume should have been removed already");
+ case Intrinsic::experimental_widenable_condition: {
+      // Give up on future widening opportunities so that we can fold away dead
+ // paths and merge blocks before going into block-local instruction
+ // selection.
+ if (II->use_empty()) {
+ II->eraseFromParent();
+ return true;
+ }
+ Constant *RetVal = ConstantInt::getTrue(II->getContext());
+ resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
+ replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
+ });
+ return true;
+ }
+ case Intrinsic::objectsize:
+ llvm_unreachable("llvm.objectsize.* should have been lowered already");
+ case Intrinsic::is_constant:
+ llvm_unreachable("llvm.is.constant.* should have been lowered already");
+ case Intrinsic::aarch64_stlxr:
+ case Intrinsic::aarch64_stxr: {
+ ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
+ if (!ExtVal || !ExtVal->hasOneUse() ||
+ ExtVal->getParent() == CI->getParent())
+ return false;
+ // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
+ ExtVal->moveBefore(CI);
+ // Mark this instruction as "inserted by CGP", so that other
+ // optimizations don't touch it.
+ InsertedInsts.insert(ExtVal);
+ return true;
+ }
+
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group: {
+ Value *ArgVal = II->getArgOperand(0);
+ auto it = LargeOffsetGEPMap.find(II);
+ if (it != LargeOffsetGEPMap.end()) {
+ // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
+ // Make sure not to have to deal with iterator invalidation
+ // after possibly adding ArgVal to LargeOffsetGEPMap.
+ auto GEPs = std::move(it->second);
+ LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());
+ LargeOffsetGEPMap.erase(II);
+ }
+
+ replaceAllUsesWith(II, ArgVal, FreshBBs, IsHugeFunc);
+ II->eraseFromParent();
+ return true;
+ }
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz:
+ // If counting zeros is expensive, try to avoid it.
+ return despeculateCountZeros(II, *LI, TLI, DL, ModifiedDT, FreshBBs,
+ IsHugeFunc);
+ case Intrinsic::fshl:
+ case Intrinsic::fshr:
+ return optimizeFunnelShift(II);
+ case Intrinsic::dbg_assign:
+ case Intrinsic::dbg_value:
+ return fixupDbgValue(II);
+ case Intrinsic::masked_gather:
+ return optimizeGatherScatterInst(II, II->getArgOperand(0));
+ case Intrinsic::masked_scatter:
+ return optimizeGatherScatterInst(II, II->getArgOperand(1));
+ }
+
+ SmallVector<Value *, 2> PtrOps;
+ Type *AccessTy;
+ if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
+ while (!PtrOps.empty()) {
+ Value *PtrVal = PtrOps.pop_back_val();
+ unsigned AS = PtrVal->getType()->getPointerAddressSpace();
+ if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
+ return true;
+ }
+ }
+
+ // From here on out we're working with named functions.
+ if (!CI->getCalledFunction())
+ return false;
+
+ // Lower all default uses of _chk calls. This is very similar
+ // to what InstCombineCalls does, but here we are only lowering calls
+ // to fortified library functions (e.g. __memcpy_chk) that have the default
+ // "don't know" as the objectsize. Anything else should be left alone.
+ FortifiedLibCallSimplifier Simplifier(TLInfo, true);
+ IRBuilder<> Builder(CI);
+ if (Value *V = Simplifier.optimizeCall(CI, Builder)) {
+ replaceAllUsesWith(CI, V, FreshBBs, IsHugeFunc);
+ CI->eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+/// Look for opportunities to duplicate return instructions to the predecessor
+/// to enable tail call optimizations. The case it is currently looking for is:
+/// @code
+/// bb0:
+/// %tmp0 = tail call i32 @f0()
+/// br label %return
+/// bb1:
+/// %tmp1 = tail call i32 @f1()
+/// br label %return
+/// bb2:
+/// %tmp2 = tail call i32 @f2()
+/// br label %return
+/// return:
+/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
+/// ret i32 %retval
+/// @endcode
+///
+/// =>
+///
+/// @code
+/// bb0:
+/// %tmp0 = tail call i32 @f0()
+/// ret i32 %tmp0
+/// bb1:
+/// %tmp1 = tail call i32 @f1()
+/// ret i32 %tmp1
+/// bb2:
+/// %tmp2 = tail call i32 @f2()
+/// ret i32 %tmp2
+/// @endcode
+bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
+ ModifyDT &ModifiedDT) {
+ if (!BB->getTerminator())
+ return false;
+
+ ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
+ if (!RetI)
+ return false;
+
+ assert(LI->getLoopFor(BB) == nullptr && "A return block cannot be in a loop");
+
+ PHINode *PN = nullptr;
+ ExtractValueInst *EVI = nullptr;
+ BitCastInst *BCI = nullptr;
+ Value *V = RetI->getReturnValue();
+ if (V) {
+ BCI = dyn_cast<BitCastInst>(V);
+ if (BCI)
+ V = BCI->getOperand(0);
+
+ EVI = dyn_cast<ExtractValueInst>(V);
+ if (EVI) {
+ V = EVI->getOperand(0);
+ if (!llvm::all_of(EVI->indices(), [](unsigned idx) { return idx == 0; }))
+ return false;
+ }
+
+ PN = dyn_cast<PHINode>(V);
+ if (!PN)
+ return false;
+ }
+
+ if (PN && PN->getParent() != BB)
+ return false;
+
+ auto isLifetimeEndOrBitCastFor = [](const Instruction *Inst) {
+ const BitCastInst *BC = dyn_cast<BitCastInst>(Inst);
+ if (BC && BC->hasOneUse())
+ Inst = BC->user_back();
+
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+ return II->getIntrinsicID() == Intrinsic::lifetime_end;
+ return false;
+ };
+
+ // Make sure there are no instructions between the first instruction
+ // and return.
+ const Instruction *BI = BB->getFirstNonPHI();
+ // Skip over debug and the bitcast.
+ while (isa<DbgInfoIntrinsic>(BI) || BI == BCI || BI == EVI ||
+ isa<PseudoProbeInst>(BI) || isLifetimeEndOrBitCastFor(BI))
+ BI = BI->getNextNode();
+ if (BI != RetI)
+ return false;
+
+ /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
+ /// call.
+ const Function *F = BB->getParent();
+ SmallVector<BasicBlock *, 4> TailCallBBs;
+ if (PN) {
+ for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
+ // Look through bitcasts.
+ Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts();
+ CallInst *CI = dyn_cast<CallInst>(IncomingVal);
+ BasicBlock *PredBB = PN->getIncomingBlock(I);
+ // Make sure the phi value is indeed produced by the tail call.
+ if (CI && CI->hasOneUse() && CI->getParent() == PredBB &&
+ TLI->mayBeEmittedAsTailCall(CI) &&
+ attributesPermitTailCall(F, CI, RetI, *TLI))
+ TailCallBBs.push_back(PredBB);
+ }
+ } else {
+ SmallPtrSet<BasicBlock *, 4> VisitedBBs;
+ for (BasicBlock *Pred : predecessors(BB)) {
+ if (!VisitedBBs.insert(Pred).second)
+ continue;
+ if (Instruction *I = Pred->rbegin()->getPrevNonDebugInstruction(true)) {
+ CallInst *CI = dyn_cast<CallInst>(I);
+ if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
+ attributesPermitTailCall(F, CI, RetI, *TLI))
+ TailCallBBs.push_back(Pred);
+ }
+ }
+ }
+
+ bool Changed = false;
+ for (auto const &TailCallBB : TailCallBBs) {
+ // Make sure the call instruction is followed by an unconditional branch to
+ // the return block.
+ BranchInst *BI = dyn_cast<BranchInst>(TailCallBB->getTerminator());
+ if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
+ continue;
+
+ // Duplicate the return into TailCallBB.
+ (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);
+ assert(!VerifyBFIUpdates ||
+ BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));
+ BFI->setBlockFreq(
+ BB,
+ (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency());
+ ModifiedDT = ModifyDT::ModifyBBDT;
+ Changed = true;
+ ++NumRetsDup;
+ }
+
+ // If we eliminated all predecessors of the block, delete the block now.
+ if (Changed && !BB->hasAddressTaken() && pred_empty(BB))
+ BB->eraseFromParent();
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Optimization
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// This is an extended version of TargetLowering::AddrMode
+/// which holds actual Value*'s for register values.
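+/// Conceptually the matched address has the form
+///   BaseGV + BaseReg + BaseOffs + ScaledReg * Scale
+/// where the Value* members below record the SSA values feeding the register
+/// operands.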
+struct ExtAddrMode : public TargetLowering::AddrMode {
+ Value *BaseReg = nullptr;
+ Value *ScaledReg = nullptr;
+ Value *OriginalValue = nullptr;
+ bool InBounds = true;
+
+ enum FieldName {
+ NoField = 0x00,
+ BaseRegField = 0x01,
+ BaseGVField = 0x02,
+ BaseOffsField = 0x04,
+ ScaledRegField = 0x08,
+ ScaleField = 0x10,
+ MultipleFields = 0xff
+ };
+
+ ExtAddrMode() = default;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+
+ FieldName compare(const ExtAddrMode &other) {
+    // First check that the types are the same on each field, as differing
+    // types are something we cannot cope with later on.
+ if (BaseReg && other.BaseReg &&
+ BaseReg->getType() != other.BaseReg->getType())
+ return MultipleFields;
+ if (BaseGV && other.BaseGV && BaseGV->getType() != other.BaseGV->getType())
+ return MultipleFields;
+ if (ScaledReg && other.ScaledReg &&
+ ScaledReg->getType() != other.ScaledReg->getType())
+ return MultipleFields;
+
+ // Conservatively reject 'inbounds' mismatches.
+ if (InBounds != other.InBounds)
+ return MultipleFields;
+
+ // Check each field to see if it differs.
+ unsigned Result = NoField;
+ if (BaseReg != other.BaseReg)
+ Result |= BaseRegField;
+ if (BaseGV != other.BaseGV)
+ Result |= BaseGVField;
+ if (BaseOffs != other.BaseOffs)
+ Result |= BaseOffsField;
+ if (ScaledReg != other.ScaledReg)
+ Result |= ScaledRegField;
+ // Don't count 0 as being a different scale, because that actually means
+ // unscaled (which will already be counted by having no ScaledReg).
+ if (Scale && other.Scale && Scale != other.Scale)
+ Result |= ScaleField;
+
+ if (llvm::popcount(Result) > 1)
+ return MultipleFields;
+ else
+ return static_cast<FieldName>(Result);
+ }
+
+  // An AddrMode is trivial if it involves no calculation, i.e. it is just a
+  // base with no offset.
+ bool isTrivial() {
+ // An AddrMode is (BaseGV + BaseReg + BaseOffs + ScaleReg * Scale) so it is
+ // trivial if at most one of these terms is nonzero, except that BaseGV and
+ // BaseReg both being zero actually means a null pointer value, which we
+ // consider to be 'non-zero' here.
+ return !BaseOffs && !Scale && !(BaseGV && BaseReg);
+ }
+
+ Value *GetFieldAsValue(FieldName Field, Type *IntPtrTy) {
+ switch (Field) {
+ default:
+ return nullptr;
+ case BaseRegField:
+ return BaseReg;
+ case BaseGVField:
+ return BaseGV;
+ case ScaledRegField:
+ return ScaledReg;
+ case BaseOffsField:
+ return ConstantInt::get(IntPtrTy, BaseOffs);
+ }
+ }
+
+ void SetCombinedField(FieldName Field, Value *V,
+ const SmallVectorImpl<ExtAddrMode> &AddrModes) {
+ switch (Field) {
+ default:
+ llvm_unreachable("Unhandled fields are expected to be rejected earlier");
+ break;
+ case ExtAddrMode::BaseRegField:
+ BaseReg = V;
+ break;
+ case ExtAddrMode::BaseGVField:
+ // A combined BaseGV is an Instruction, not a GlobalValue, so it goes
+ // in the BaseReg field.
+ assert(BaseReg == nullptr);
+ BaseReg = V;
+ BaseGV = nullptr;
+ break;
+ case ExtAddrMode::ScaledRegField:
+ ScaledReg = V;
+ // If we have a mix of scaled and unscaled addrmodes then we want scale
+ // to be the scale and not zero.
+ if (!Scale)
+ for (const ExtAddrMode &AM : AddrModes)
+ if (AM.Scale) {
+ Scale = AM.Scale;
+ break;
+ }
+ break;
+ case ExtAddrMode::BaseOffsField:
+ // The offset is no longer a constant, so it goes in ScaledReg with a
+ // scale of 1.
+ assert(ScaledReg == nullptr);
+ ScaledReg = V;
+ Scale = 1;
+ BaseOffs = 0;
+ break;
+ }
+ }
+};
+
+#ifndef NDEBUG
+static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
+ AM.print(OS);
+ return OS;
+}
+#endif
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ExtAddrMode::print(raw_ostream &OS) const {
+ bool NeedPlus = false;
+ OS << "[";
+ if (InBounds)
+ OS << "inbounds ";
+ if (BaseGV) {
+ OS << "GV:";
+ BaseGV->printAsOperand(OS, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+
+ if (BaseOffs) {
+ OS << (NeedPlus ? " + " : "") << BaseOffs;
+ NeedPlus = true;
+ }
+
+ if (BaseReg) {
+ OS << (NeedPlus ? " + " : "") << "Base:";
+ BaseReg->printAsOperand(OS, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+ if (Scale) {
+ OS << (NeedPlus ? " + " : "") << Scale << "*";
+ ScaledReg->printAsOperand(OS, /*PrintType=*/false);
+ }
+
+ OS << ']';
+}
+
+LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+#endif
+
+} // end anonymous namespace
+
+namespace {
+
+/// This class provides transaction based operation on the IR.
+/// Every change made through this class is recorded in the internal state and
+/// can be undone (rollback) until commit is called.
+/// CGP does not check if instructions could be speculatively executed when
+/// moved. Preserving the original location would pessimize the debugging
+/// experience, as well as negatively impact the quality of sample PGO.
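+///
+/// A minimal usage sketch (the variable names and the profitability check are
+/// made up here; the transaction API is the one declared below):
+///   TypePromotionTransaction TPT(RemovedInsts);
+///   auto RestorePt = TPT.getRestorationPoint();
+///   TPT.mutateType(Inst, PromotedTy);     // speculative IR change
+///   if (!WorthIt)
+///     TPT.rollback(RestorePt);            // undo everything since RestorePt
+///   else
+///     TPT.commit();                       // keep the recorded changes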
+class TypePromotionTransaction {
+ /// This represents the common interface of the individual transaction.
+ /// Each class implements the logic for doing one specific modification on
+ /// the IR via the TypePromotionTransaction.
+ class TypePromotionAction {
+ protected:
+ /// The Instruction modified.
+ Instruction *Inst;
+
+ public:
+ /// Constructor of the action.
+ /// The constructor performs the related action on the IR.
+ TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
+
+ virtual ~TypePromotionAction() = default;
+
+ /// Undo the modification done by this action.
+ /// When this method is called, the IR must be in the same state as it was
+ /// before this action was applied.
+ /// \pre Undoing the action works if and only if the IR is in the exact same
+ /// state as it was directly after this action was applied.
+ virtual void undo() = 0;
+
+    /// Commit every change made by this action.
+ /// When the results on the IR of the action are to be kept, it is important
+ /// to call this function, otherwise hidden information may be kept forever.
+ virtual void commit() {
+ // Nothing to be done, this action is not doing anything.
+ }
+ };
+
+ /// Utility to remember the position of an instruction.
+ class InsertionHandler {
+ /// Position of an instruction.
+    /// Either the instruction:
+    /// - is the first in its basic block, in which case BB is used, or
+    /// - has a previous instruction, in which case PrevInst is used.
+ union {
+ Instruction *PrevInst;
+ BasicBlock *BB;
+ } Point;
+
+ /// Remember whether or not the instruction had a previous instruction.
+ bool HasPrevInstruction;
+
+ public:
+ /// Record the position of \p Inst.
+ InsertionHandler(Instruction *Inst) {
+ BasicBlock::iterator It = Inst->getIterator();
+ HasPrevInstruction = (It != (Inst->getParent()->begin()));
+ if (HasPrevInstruction)
+ Point.PrevInst = &*--It;
+ else
+ Point.BB = Inst->getParent();
+ }
+
+ /// Insert \p Inst at the recorded position.
+ void insert(Instruction *Inst) {
+ if (HasPrevInstruction) {
+ if (Inst->getParent())
+ Inst->removeFromParent();
+ Inst->insertAfter(Point.PrevInst);
+ } else {
+ Instruction *Position = &*Point.BB->getFirstInsertionPt();
+ if (Inst->getParent())
+ Inst->moveBefore(Position);
+ else
+ Inst->insertBefore(Position);
+ }
+ }
+ };
+
+ /// Move an instruction before another.
+ class InstructionMoveBefore : public TypePromotionAction {
+ /// Original position of the instruction.
+ InsertionHandler Position;
+
+ public:
+ /// Move \p Inst before \p Before.
+ InstructionMoveBefore(Instruction *Inst, Instruction *Before)
+ : TypePromotionAction(Inst), Position(Inst) {
+ LLVM_DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before
+ << "\n");
+ Inst->moveBefore(Before);
+ }
+
+ /// Move the instruction back to its original position.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
+ Position.insert(Inst);
+ }
+ };
+
+ /// Set the operand of an instruction with a new value.
+ class OperandSetter : public TypePromotionAction {
+ /// Original operand of the instruction.
+ Value *Origin;
+
+ /// Index of the modified instruction.
+ unsigned Idx;
+
+ public:
+ /// Set \p Idx operand of \p Inst with \p NewVal.
+ OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
+ : TypePromotionAction(Inst), Idx(Idx) {
+ LLVM_DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
+ << "for:" << *Inst << "\n"
+ << "with:" << *NewVal << "\n");
+ Origin = Inst->getOperand(Idx);
+ Inst->setOperand(Idx, NewVal);
+ }
+
+ /// Restore the original value of the instruction.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
+ << "for: " << *Inst << "\n"
+ << "with: " << *Origin << "\n");
+ Inst->setOperand(Idx, Origin);
+ }
+ };
+
+ /// Hide the operands of an instruction.
+  /// Pretend that this instruction is not using any of its operands.
+ class OperandsHider : public TypePromotionAction {
+ /// The list of original operands.
+ SmallVector<Value *, 4> OriginalValues;
+
+ public:
+ /// Remove \p Inst from the uses of the operands of \p Inst.
+ OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
+ LLVM_DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
+ unsigned NumOpnds = Inst->getNumOperands();
+ OriginalValues.reserve(NumOpnds);
+ for (unsigned It = 0; It < NumOpnds; ++It) {
+ // Save the current operand.
+ Value *Val = Inst->getOperand(It);
+ OriginalValues.push_back(Val);
+ // Set a dummy one.
+ // We could use OperandSetter here, but that would imply an overhead
+ // that we are not willing to pay.
+ Inst->setOperand(It, UndefValue::get(Val->getType()));
+ }
+ }
+
+ /// Restore the original list of uses.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
+ for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
+ Inst->setOperand(It, OriginalValues[It]);
+ }
+ };
+
+ /// Build a truncate instruction.
+ class TruncBuilder : public TypePromotionAction {
+ Value *Val;
+
+ public:
+ /// Build a truncate instruction of \p Opnd producing a \p Ty
+ /// result.
+ /// trunc Opnd to Ty.
+ TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
+ IRBuilder<> Builder(Opnd);
+ Builder.SetCurrentDebugLocation(DebugLoc());
+ Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
+ LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
+ }
+
+ /// Get the built value.
+ Value *getBuiltValue() { return Val; }
+
+ /// Remove the built instruction.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
+ if (Instruction *IVal = dyn_cast<Instruction>(Val))
+ IVal->eraseFromParent();
+ }
+ };
+
+ /// Build a sign extension instruction.
+ class SExtBuilder : public TypePromotionAction {
+ Value *Val;
+
+ public:
+ /// Build a sign extension instruction of \p Opnd producing a \p Ty
+ /// result.
+ /// sext Opnd to Ty.
+ SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
+ : TypePromotionAction(InsertPt) {
+ IRBuilder<> Builder(InsertPt);
+ Val = Builder.CreateSExt(Opnd, Ty, "promoted");
+ LLVM_DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
+ }
+
+ /// Get the built value.
+ Value *getBuiltValue() { return Val; }
+
+ /// Remove the built instruction.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
+ if (Instruction *IVal = dyn_cast<Instruction>(Val))
+ IVal->eraseFromParent();
+ }
+ };
+
+ /// Build a zero extension instruction.
+ class ZExtBuilder : public TypePromotionAction {
+ Value *Val;
+
+ public:
+ /// Build a zero extension instruction of \p Opnd producing a \p Ty
+ /// result.
+ /// zext Opnd to Ty.
+ ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
+ : TypePromotionAction(InsertPt) {
+ IRBuilder<> Builder(InsertPt);
+ Builder.SetCurrentDebugLocation(DebugLoc());
+ Val = Builder.CreateZExt(Opnd, Ty, "promoted");
+ LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
+ }
+
+ /// Get the built value.
+ Value *getBuiltValue() { return Val; }
+
+ /// Remove the built instruction.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
+ if (Instruction *IVal = dyn_cast<Instruction>(Val))
+ IVal->eraseFromParent();
+ }
+ };
+
+ /// Mutate an instruction to another type.
+ class TypeMutator : public TypePromotionAction {
+ /// Record the original type.
+ Type *OrigTy;
+
+ public:
+ /// Mutate the type of \p Inst into \p NewTy.
+ TypeMutator(Instruction *Inst, Type *NewTy)
+ : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
+ LLVM_DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
+ << "\n");
+ Inst->mutateType(NewTy);
+ }
+
+ /// Mutate the instruction back to its original type.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
+ << "\n");
+ Inst->mutateType(OrigTy);
+ }
+ };
+
+ /// Replace the uses of an instruction by another instruction.
+ class UsesReplacer : public TypePromotionAction {
+ /// Helper structure to keep track of the replaced uses.
+ struct InstructionAndIdx {
+      /// The instruction that uses the original (replaced) instruction.
+ Instruction *Inst;
+
+      /// The operand index at which the original instruction is used by Inst.
+ unsigned Idx;
+
+ InstructionAndIdx(Instruction *Inst, unsigned Idx)
+ : Inst(Inst), Idx(Idx) {}
+ };
+
+ /// Keep track of the original uses (pair Instruction, Index).
+ SmallVector<InstructionAndIdx, 4> OriginalUses;
+ /// Keep track of the debug users.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+
+ /// Keep track of the new value so that we can undo it by replacing
+ /// instances of the new value with the original value.
+ Value *New;
+
+ using use_iterator = SmallVectorImpl<InstructionAndIdx>::iterator;
+
+ public:
+ /// Replace all the use of \p Inst by \p New.
+ UsesReplacer(Instruction *Inst, Value *New)
+ : TypePromotionAction(Inst), New(New) {
+ LLVM_DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
+ << "\n");
+ // Record the original uses.
+ for (Use &U : Inst->uses()) {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
+ }
+ // Record the debug uses separately. They are not in the instruction's
+ // use list, but they are replaced by RAUW.
+ findDbgValues(DbgValues, Inst);
+
+ // Now, we can replace the uses.
+ Inst->replaceAllUsesWith(New);
+ }
+
+ /// Reassign the original uses of Inst to Inst.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
+ for (InstructionAndIdx &Use : OriginalUses)
+ Use.Inst->setOperand(Use.Idx, Inst);
+ // RAUW has replaced all original uses with references to the new value,
+ // including the debug uses. Since we are undoing the replacements,
+ // the original debug uses must also be reinstated to maintain the
+ // correctness and utility of debug value instructions.
+ for (auto *DVI : DbgValues)
+ DVI->replaceVariableLocationOp(New, Inst);
+ }
+ };
+
+ /// Remove an instruction from the IR.
+ class InstructionRemover : public TypePromotionAction {
+ /// Original position of the instruction.
+ InsertionHandler Inserter;
+
+    /// Helper structure to hide all the links to the instruction. In other
+    /// words, this helps to pretend that the instruction was removed.
+ OperandsHider Hider;
+
+ /// Keep track of the uses replaced, if any.
+ UsesReplacer *Replacer = nullptr;
+
+ /// Keep track of instructions removed.
+ SetOfInstrs &RemovedInsts;
+
+ public:
+    /// Remove all references to \p Inst and optionally replace all its
+ /// uses with New.
+ /// \p RemovedInsts Keep track of the instructions removed by this Action.
+ /// \pre If !Inst->use_empty(), then New != nullptr
+ InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
+ Value *New = nullptr)
+ : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
+ RemovedInsts(RemovedInsts) {
+ if (New)
+ Replacer = new UsesReplacer(Inst, New);
+ LLVM_DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
+ RemovedInsts.insert(Inst);
+ /// The instructions removed here will be freed after completing
+ /// optimizeBlock() for all blocks as we need to keep track of the
+ /// removed instructions during promotion.
+ Inst->removeFromParent();
+ }
+
+ ~InstructionRemover() override { delete Replacer; }
+
+ InstructionRemover &operator=(const InstructionRemover &other) = delete;
+ InstructionRemover(const InstructionRemover &other) = delete;
+
+    /// Resurrect the instruction and reassign it to the proper uses if a
+    /// new value was provided when building this action.
+ void undo() override {
+ LLVM_DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
+ Inserter.insert(Inst);
+ if (Replacer)
+ Replacer->undo();
+ Hider.undo();
+ RemovedInsts.erase(Inst);
+ }
+ };
+
+public:
+ /// Restoration point.
+ /// The restoration point is a pointer to an action instead of an iterator
+ /// because the iterator may be invalidated but not the pointer.
+ using ConstRestorationPt = const TypePromotionAction *;
+
+ TypePromotionTransaction(SetOfInstrs &RemovedInsts)
+ : RemovedInsts(RemovedInsts) {}
+
+  /// Commit every change made in this transaction. Return true if any change
+  /// happened.
+ bool commit();
+
+ /// Undo all the changes made after the given point.
+ void rollback(ConstRestorationPt Point);
+
+ /// Get the current restoration point.
+ ConstRestorationPt getRestorationPoint() const;
+
+ /// \name API for IR modification with state keeping to support rollback.
+ /// @{
+ /// Same as Instruction::setOperand.
+ void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
+
+ /// Same as Instruction::eraseFromParent.
+ void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);
+
+ /// Same as Value::replaceAllUsesWith.
+ void replaceAllUsesWith(Instruction *Inst, Value *New);
+
+ /// Same as Value::mutateType.
+ void mutateType(Instruction *Inst, Type *NewTy);
+
+ /// Same as IRBuilder::createTrunc.
+ Value *createTrunc(Instruction *Opnd, Type *Ty);
+
+ /// Same as IRBuilder::createSExt.
+ Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
+
+ /// Same as IRBuilder::createZExt.
+ Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
+
+ /// Same as Instruction::moveBefore.
+ void moveBefore(Instruction *Inst, Instruction *Before);
+ /// @}
+
+private:
+ /// The ordered list of actions made so far.
+ SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
+
+ using CommitPt =
+ SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator;
+
+ SetOfInstrs &RemovedInsts;
+};
+
+} // end anonymous namespace
+
+void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
+ Value *NewVal) {
+ Actions.push_back(std::make_unique<TypePromotionTransaction::OperandSetter>(
+ Inst, Idx, NewVal));
+}
+
+void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
+ Value *NewVal) {
+ Actions.push_back(
+ std::make_unique<TypePromotionTransaction::InstructionRemover>(
+ Inst, RemovedInsts, NewVal));
+}
+
+void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
+ Value *New) {
+ Actions.push_back(
+ std::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
+}
+
+void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
+ Actions.push_back(
+ std::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
+}
+
+Value *TypePromotionTransaction::createTrunc(Instruction *Opnd, Type *Ty) {
+ std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
+ Value *Val = Ptr->getBuiltValue();
+ Actions.push_back(std::move(Ptr));
+ return Val;
+}
+
+Value *TypePromotionTransaction::createSExt(Instruction *Inst, Value *Opnd,
+ Type *Ty) {
+ std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
+ Value *Val = Ptr->getBuiltValue();
+ Actions.push_back(std::move(Ptr));
+ return Val;
+}
+
+Value *TypePromotionTransaction::createZExt(Instruction *Inst, Value *Opnd,
+ Type *Ty) {
+ std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
+ Value *Val = Ptr->getBuiltValue();
+ Actions.push_back(std::move(Ptr));
+ return Val;
+}
+
+void TypePromotionTransaction::moveBefore(Instruction *Inst,
+ Instruction *Before) {
+ Actions.push_back(
+ std::make_unique<TypePromotionTransaction::InstructionMoveBefore>(
+ Inst, Before));
+}
+
+TypePromotionTransaction::ConstRestorationPt
+TypePromotionTransaction::getRestorationPoint() const {
+ return !Actions.empty() ? Actions.back().get() : nullptr;
+}
+
+bool TypePromotionTransaction::commit() {
+ for (std::unique_ptr<TypePromotionAction> &Action : Actions)
+ Action->commit();
+ bool Modified = !Actions.empty();
+ Actions.clear();
+ return Modified;
+}
+
+void TypePromotionTransaction::rollback(
+ TypePromotionTransaction::ConstRestorationPt Point) {
+ while (!Actions.empty() && Point != Actions.back().get()) {
+ std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();
+ Curr->undo();
+ }
+}
+
+namespace {
+
+/// A helper class for matching addressing modes.
+///
+/// This encapsulates the logic for matching the target-legal addressing modes.
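+///
+/// As an illustrative sketch (value names are made up here), for a load such
+/// as
+///   %p = getelementptr inbounds i32, ptr %base, i64 %idx
+///   %v = load i32, ptr %p
+/// the matcher may, if the target supports it, fold the GEP into the access
+/// roughly as the addressing mode [inbounds Base:%base + 4*%idx].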
+class AddressingModeMatcher {
+ SmallVectorImpl<Instruction *> &AddrModeInsts;
+ const TargetLowering &TLI;
+ const TargetRegisterInfo &TRI;
+ const DataLayout &DL;
+ const LoopInfo &LI;
+ const std::function<const DominatorTree &()> getDTFn;
+
+ /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
+ /// the memory instruction that we're computing this address for.
+ Type *AccessTy;
+ unsigned AddrSpace;
+ Instruction *MemoryInst;
+
+ /// This is the addressing mode that we're building up. This is
+ /// part of the return value of this addressing mode matching stuff.
+ ExtAddrMode &AddrMode;
+
+ /// The instructions inserted by other CodeGenPrepare optimizations.
+ const SetOfInstrs &InsertedInsts;
+
+ /// A map from the instructions to their type before promotion.
+ InstrToOrigTy &PromotedInsts;
+
+ /// The ongoing transaction where every action should be registered.
+ TypePromotionTransaction &TPT;
+
+ // A GEP which has too large offset to be folded into the addressing mode.
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP;
+
+ /// This is set to true when we should not do profitability checks.
+ /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
+ bool IgnoreProfitability;
+
+ /// True if we are optimizing for size.
+ bool OptSize = false;
+
+ ProfileSummaryInfo *PSI;
+ BlockFrequencyInfo *BFI;
+
+ AddressingModeMatcher(
+ SmallVectorImpl<Instruction *> &AMI, const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI, const LoopInfo &LI,
+ const std::function<const DominatorTree &()> getDTFn, Type *AT,
+ unsigned AS, Instruction *MI, ExtAddrMode &AM,
+ const SetOfInstrs &InsertedInsts, InstrToOrigTy &PromotedInsts,
+ TypePromotionTransaction &TPT,
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
+ bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
+ : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
+ DL(MI->getModule()->getDataLayout()), LI(LI), getDTFn(getDTFn),
+ AccessTy(AT), AddrSpace(AS), MemoryInst(MI), AddrMode(AM),
+ InsertedInsts(InsertedInsts), PromotedInsts(PromotedInsts), TPT(TPT),
+ LargeOffsetGEP(LargeOffsetGEP), OptSize(OptSize), PSI(PSI), BFI(BFI) {
+ IgnoreProfitability = false;
+ }
+
+public:
+ /// Find the maximal addressing mode that a load/store of V can fold,
+  /// given an access type of AccessTy. This returns a list of involved
+ /// instructions in AddrModeInsts.
+ /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
+ /// optimizations.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+  /// \p TPT The ongoing transaction where every action should be registered.
+ static ExtAddrMode
+ Match(Value *V, Type *AccessTy, unsigned AS, Instruction *MemoryInst,
+ SmallVectorImpl<Instruction *> &AddrModeInsts,
+ const TargetLowering &TLI, const LoopInfo &LI,
+ const std::function<const DominatorTree &()> getDTFn,
+ const TargetRegisterInfo &TRI, const SetOfInstrs &InsertedInsts,
+ InstrToOrigTy &PromotedInsts, TypePromotionTransaction &TPT,
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> &LargeOffsetGEP,
+ bool OptSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+ ExtAddrMode Result;
+
+ bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI, LI, getDTFn,
+ AccessTy, AS, MemoryInst, Result,
+ InsertedInsts, PromotedInsts, TPT,
+ LargeOffsetGEP, OptSize, PSI, BFI)
+ .matchAddr(V, 0);
+ (void)Success;
+ assert(Success && "Couldn't select *anything*?");
+ return Result;
+ }
+
+private:
+ bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
+ bool matchAddr(Value *Addr, unsigned Depth);
+ bool matchOperationAddr(User *AddrInst, unsigned Opcode, unsigned Depth,
+ bool *MovedAway = nullptr);
+ bool isProfitableToFoldIntoAddressingMode(Instruction *I,
+ ExtAddrMode &AMBefore,
+ ExtAddrMode &AMAfter);
+ bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
+ bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
+ Value *PromotedOperand) const;
+};
+
+class PhiNodeSet;
+
+/// An iterator for PhiNodeSet.
+class PhiNodeSetIterator {
+ PhiNodeSet *const Set;
+ size_t CurrentIndex = 0;
+
+public:
+ /// The constructor. Start should point to either a valid element, or be equal
+ /// to the size of the underlying SmallVector of the PhiNodeSet.
+ PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start);
+ PHINode *operator*() const;
+ PhiNodeSetIterator &operator++();
+ bool operator==(const PhiNodeSetIterator &RHS) const;
+ bool operator!=(const PhiNodeSetIterator &RHS) const;
+};
+
+/// Keeps a set of PHINodes.
+///
+/// This is a minimal set implementation for a specific use case:
+/// It is very fast when there are very few elements, but also provides good
+/// performance when there are many. It is similar to SmallPtrSet, but also
+/// provides iteration by insertion order, which is deterministic and stable
+/// across runs. It is also similar to SmallSetVector, but it removes elements
+/// in O(1) time. This is achieved by not actually removing the element from
+/// the underlying vector, so it comes at the cost of using more memory, but
+/// that is fine, since PhiNodeSets are used as short-lived objects.
+class PhiNodeSet {
+ friend class PhiNodeSetIterator;
+
+ using MapType = SmallDenseMap<PHINode *, size_t, 32>;
+ using iterator = PhiNodeSetIterator;
+
+ /// Keeps the elements in the order of their insertion in the underlying
+ /// vector. To achieve constant time removal, it never deletes any element.
+ SmallVector<PHINode *, 32> NodeList;
+
+ /// Keeps the elements in the underlying set implementation. This (and not the
+ /// NodeList defined above) is the source of truth on whether an element
+ /// is actually in the collection.
+ MapType NodeMap;
+
+ /// Points to the first valid (not deleted) element when the set is not empty
+  /// and the value is not zero. Equals the size of the underlying vector
+ /// when the set is empty. When the value is 0, as in the beginning, the
+ /// first element may or may not be valid.
+ size_t FirstValidElement = 0;
+
+public:
+ /// Inserts a new element to the collection.
+ /// \returns true if the element is actually added, i.e. was not in the
+ /// collection before the operation.
+ bool insert(PHINode *Ptr) {
+ if (NodeMap.insert(std::make_pair(Ptr, NodeList.size())).second) {
+ NodeList.push_back(Ptr);
+ return true;
+ }
+ return false;
+ }
+
+ /// Removes the element from the collection.
+ /// \returns whether the element is actually removed, i.e. was in the
+ /// collection before the operation.
+ bool erase(PHINode *Ptr) {
+ if (NodeMap.erase(Ptr)) {
+ SkipRemovedElements(FirstValidElement);
+ return true;
+ }
+ return false;
+ }
+
+ /// Removes all elements and clears the collection.
+ void clear() {
+ NodeMap.clear();
+ NodeList.clear();
+ FirstValidElement = 0;
+ }
+
+ /// \returns an iterator that will iterate the elements in the order of
+ /// insertion.
+ iterator begin() {
+ if (FirstValidElement == 0)
+ SkipRemovedElements(FirstValidElement);
+ return PhiNodeSetIterator(this, FirstValidElement);
+ }
+
+ /// \returns an iterator that points to the end of the collection.
+ iterator end() { return PhiNodeSetIterator(this, NodeList.size()); }
+
+ /// Returns the number of elements in the collection.
+ size_t size() const { return NodeMap.size(); }
+
+  /// \returns 1 if the given element is in the collection, and 0 otherwise.
+ size_t count(PHINode *Ptr) const { return NodeMap.count(Ptr); }
+
+private:
+ /// Updates the CurrentIndex so that it will point to a valid element.
+ ///
+ /// If the element of NodeList at CurrentIndex is valid, it does not
+ /// change it. If there are no more valid elements, it updates CurrentIndex
+ /// to point to the end of the NodeList.
+ void SkipRemovedElements(size_t &CurrentIndex) {
+ while (CurrentIndex < NodeList.size()) {
+ auto it = NodeMap.find(NodeList[CurrentIndex]);
+ // If the element has been deleted and added again later, NodeMap will
+ // point to a different index, so CurrentIndex will still be invalid.
+ if (it != NodeMap.end() && it->second == CurrentIndex)
+ break;
+ ++CurrentIndex;
+ }
+ }
+};
+
+PhiNodeSetIterator::PhiNodeSetIterator(PhiNodeSet *const Set, size_t Start)
+ : Set(Set), CurrentIndex(Start) {}
+
+PHINode *PhiNodeSetIterator::operator*() const {
+ assert(CurrentIndex < Set->NodeList.size() &&
+ "PhiNodeSet access out of range");
+ return Set->NodeList[CurrentIndex];
+}
+
+PhiNodeSetIterator &PhiNodeSetIterator::operator++() {
+ assert(CurrentIndex < Set->NodeList.size() &&
+ "PhiNodeSet access out of range");
+ ++CurrentIndex;
+ Set->SkipRemovedElements(CurrentIndex);
+ return *this;
+}
+
+bool PhiNodeSetIterator::operator==(const PhiNodeSetIterator &RHS) const {
+ return CurrentIndex == RHS.CurrentIndex;
+}
+
+bool PhiNodeSetIterator::operator!=(const PhiNodeSetIterator &RHS) const {
+ return !((*this) == RHS);
+}
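+
+// For illustration, a minimal sketch of how PhiNodeSet is meant to be used
+// (P1 and P2 are hypothetical PHINode pointers, not values from this file):
+//   PhiNodeSet Phis;
+//   Phis.insert(P1);
+//   Phis.insert(P2);
+//   Phis.erase(P1);          // O(1): only NodeMap forgets the element
+//   for (PHINode *P : Phis)  // iterates the remaining elements in insertion
+//     P->dump();             // order, skipping erased slots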
+
+/// Keeps track of the simplification of Phi nodes.
+/// Accepts the set of all phi nodes and erases a phi node from this set
+/// if it is simplified.
+class SimplificationTracker {
+ DenseMap<Value *, Value *> Storage;
+ const SimplifyQuery &SQ;
+ // Tracks newly created Phi nodes. The elements are iterated by insertion
+ // order.
+ PhiNodeSet AllPhiNodes;
+ // Tracks newly created Select nodes.
+ SmallPtrSet<SelectInst *, 32> AllSelectNodes;
+
+public:
+ SimplificationTracker(const SimplifyQuery &sq) : SQ(sq) {}
+
+ Value *Get(Value *V) {
+ do {
+ auto SV = Storage.find(V);
+ if (SV == Storage.end())
+ return V;
+ V = SV->second;
+ } while (true);
+ }
+
+ Value *Simplify(Value *Val) {
+ SmallVector<Value *, 32> WorkList;
+ SmallPtrSet<Value *, 32> Visited;
+ WorkList.push_back(Val);
+ while (!WorkList.empty()) {
+ auto *P = WorkList.pop_back_val();
+ if (!Visited.insert(P).second)
+ continue;
+ if (auto *PI = dyn_cast<Instruction>(P))
+ if (Value *V = simplifyInstruction(cast<Instruction>(PI), SQ)) {
+ for (auto *U : PI->users())
+ WorkList.push_back(cast<Value>(U));
+ Put(PI, V);
+ PI->replaceAllUsesWith(V);
+ if (auto *PHI = dyn_cast<PHINode>(PI))
+ AllPhiNodes.erase(PHI);
+ if (auto *Select = dyn_cast<SelectInst>(PI))
+ AllSelectNodes.erase(Select);
+ PI->eraseFromParent();
+ }
+ }
+ return Get(Val);
+ }
+
+ void Put(Value *From, Value *To) { Storage.insert({From, To}); }
+
+ void ReplacePhi(PHINode *From, PHINode *To) {
+ Value *OldReplacement = Get(From);
+ while (OldReplacement != From) {
+ From = To;
+ To = dyn_cast<PHINode>(OldReplacement);
+ OldReplacement = Get(From);
+ }
+ assert(To && Get(To) == To && "Replacement PHI node is already replaced.");
+ Put(From, To);
+ From->replaceAllUsesWith(To);
+ AllPhiNodes.erase(From);
+ From->eraseFromParent();
+ }
+
+ PhiNodeSet &newPhiNodes() { return AllPhiNodes; }
+
+ void insertNewPhi(PHINode *PN) { AllPhiNodes.insert(PN); }
+
+ void insertNewSelect(SelectInst *SI) { AllSelectNodes.insert(SI); }
+
+ unsigned countNewPhiNodes() const { return AllPhiNodes.size(); }
+
+ unsigned countNewSelectNodes() const { return AllSelectNodes.size(); }
+
+ void destroyNewNodes(Type *CommonType) {
+    // For safe erasing, replace the uses with a dummy value first.
+ auto *Dummy = PoisonValue::get(CommonType);
+ for (auto *I : AllPhiNodes) {
+ I->replaceAllUsesWith(Dummy);
+ I->eraseFromParent();
+ }
+ AllPhiNodes.clear();
+ for (auto *I : AllSelectNodes) {
+ I->replaceAllUsesWith(Dummy);
+ I->eraseFromParent();
+ }
+ AllSelectNodes.clear();
+ }
+};
+
+/// A helper class for combining addressing modes.
+class AddressingModeCombiner {
+ typedef DenseMap<Value *, Value *> FoldAddrToValueMapping;
+ typedef std::pair<PHINode *, PHINode *> PHIPair;
+
+private:
+ /// The addressing modes we've collected.
+ SmallVector<ExtAddrMode, 16> AddrModes;
+
+ /// The field in which the AddrModes differ, when we have more than one.
+ ExtAddrMode::FieldName DifferentField = ExtAddrMode::NoField;
+
+ /// Are the AddrModes that we have all just equal to their original values?
+ bool AllAddrModesTrivial = true;
+
+ /// Common Type for all different fields in addressing modes.
+ Type *CommonType = nullptr;
+
+ /// SimplifyQuery for simplifyInstruction utility.
+ const SimplifyQuery &SQ;
+
+ /// Original Address.
+ Value *Original;
+
+ /// Common value among addresses
+ Value *CommonValue = nullptr;
+
+public:
+ AddressingModeCombiner(const SimplifyQuery &_SQ, Value *OriginalValue)
+ : SQ(_SQ), Original(OriginalValue) {}
+
+ ~AddressingModeCombiner() { eraseCommonValueIfDead(); }
+
+ /// Get the combined AddrMode
+ const ExtAddrMode &getAddrMode() const { return AddrModes[0]; }
+
+ /// Add a new AddrMode if it's compatible with the AddrModes we already
+ /// have.
+ /// \return True iff we succeeded in doing so.
+ bool addNewAddrMode(ExtAddrMode &NewAddrMode) {
+    // Take note of whether we have any non-trivial AddrModes: we need to
+    // detect when all AddrModes are trivial, since then we would introduce a
+    // phi or select which just duplicates what's already there.
+ AllAddrModesTrivial = AllAddrModesTrivial && NewAddrMode.isTrivial();
+
+ // If this is the first addrmode then everything is fine.
+ if (AddrModes.empty()) {
+ AddrModes.emplace_back(NewAddrMode);
+ return true;
+ }
+
+ // Figure out how different this is from the other address modes, which we
+ // can do just by comparing against the first one given that we only care
+ // about the cumulative difference.
+ ExtAddrMode::FieldName ThisDifferentField =
+ AddrModes[0].compare(NewAddrMode);
+ if (DifferentField == ExtAddrMode::NoField)
+ DifferentField = ThisDifferentField;
+ else if (DifferentField != ThisDifferentField)
+ DifferentField = ExtAddrMode::MultipleFields;
+
+ // If NewAddrMode differs in more than one dimension we cannot handle it.
+ bool CanHandle = DifferentField != ExtAddrMode::MultipleFields;
+
+ // If Scale Field is different then we reject.
+ CanHandle = CanHandle && DifferentField != ExtAddrMode::ScaleField;
+
+    // We also must reject the case when the base offset is different and the
+    // scale reg is not null: we cannot handle it because the merge of the
+    // different offsets would have to be used as the ScaleReg.
+ CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseOffsField ||
+ !NewAddrMode.ScaledReg);
+
+    // We also must reject the case when the GV is different and a BaseReg is
+    // installed, because we want to use the base reg as a merge of the GV values.
+ CanHandle = CanHandle && (DifferentField != ExtAddrMode::BaseGVField ||
+ !NewAddrMode.HasBaseReg);
+
+    // Even if NewAddrMode is the same, we still need to collect it because the
+    // original value is different, and later we will need all original values
+    // as anchors when finding the common Phi node.
+ if (CanHandle)
+ AddrModes.emplace_back(NewAddrMode);
+ else
+ AddrModes.clear();
+
+ return CanHandle;
+ }
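+
+  // For illustration of the rules above, with hypothetical addressing modes
+  // (only the differing field is shown):
+  //  * {BaseReg=%a} then {BaseReg=%b}: accepted, only the BaseReg differs.
+  //  * {Scale=2, ScaledReg=%i} then {Scale=4, ScaledReg=%i}: rejected, the
+  //    ScaleField differs.
+  //  * A new mode differing in BaseOffs that also carries a ScaledReg:
+  //    rejected, since the merged offsets would have to go through the
+  //    ScaleReg.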
+
+ /// Combine the addressing modes we've collected into a single
+ /// addressing mode.
+ /// \return True iff we successfully combined them or we only had one so
+ /// didn't need to combine them anyway.
+ bool combineAddrModes() {
+ // If we have no AddrModes then they can't be combined.
+ if (AddrModes.size() == 0)
+ return false;
+
+ // A single AddrMode can trivially be combined.
+ if (AddrModes.size() == 1 || DifferentField == ExtAddrMode::NoField)
+ return true;
+
+ // If the AddrModes we collected are all just equal to the value they are
+ // derived from then combining them wouldn't do anything useful.
+ if (AllAddrModesTrivial)
+ return false;
+
+ if (!addrModeCombiningAllowed())
+ return false;
+
+    // Build a map from <original value, basic block where we saw it> to the
+    // value of the base register.
+ // Bail out if there is no common type.
+ FoldAddrToValueMapping Map;
+ if (!initializeMap(Map))
+ return false;
+
+ CommonValue = findCommon(Map);
+ if (CommonValue)
+ AddrModes[0].SetCombinedField(DifferentField, CommonValue, AddrModes);
+ return CommonValue != nullptr;
+ }
+
+private:
+ /// `CommonValue` may be a placeholder inserted by us.
+ /// If the placeholder is not used, we should remove this dead instruction.
+ void eraseCommonValueIfDead() {
+ if (CommonValue && CommonValue->getNumUses() == 0)
+ if (Instruction *CommonInst = dyn_cast<Instruction>(CommonValue))
+ CommonInst->eraseFromParent();
+ }
+
+  /// Initialize Map with anchor values. For each address seen,
+  /// we record the value of the differing field seen in that address.
+  /// At the same time we find a common type for the differing fields, which we
+  /// will use to create new Phi/Select nodes. Keep it in the CommonType field.
+  /// Return false if no common type is found.
+ bool initializeMap(FoldAddrToValueMapping &Map) {
+ // Keep track of keys where the value is null. We will need to replace it
+ // with constant null when we know the common type.
+ SmallVector<Value *, 2> NullValue;
+ Type *IntPtrTy = SQ.DL.getIntPtrType(AddrModes[0].OriginalValue->getType());
+ for (auto &AM : AddrModes) {
+ Value *DV = AM.GetFieldAsValue(DifferentField, IntPtrTy);
+ if (DV) {
+ auto *Type = DV->getType();
+ if (CommonType && CommonType != Type)
+ return false;
+ CommonType = Type;
+ Map[AM.OriginalValue] = DV;
+ } else {
+ NullValue.push_back(AM.OriginalValue);
+ }
+ }
+ assert(CommonType && "At least one non-null value must be!");
+ for (auto *V : NullValue)
+ Map[V] = Constant::getNullValue(CommonType);
+ return true;
+ }
+
+  /// We have a mapping from value A to value B, where B was a field in the
+  /// addressing mode represented by A. We also have an original value C
+  /// representing the address we start with. Traversing from C through phis
+  /// and selects, we ended up with the A's in the map. This utility function
+  /// tries to find a value V which is a field in addressing mode C such that,
+  /// traversing through phi nodes and selects, we end up in the corresponding
+  /// values B in the map. The utility will create new Phis/Selects if needed.
+ // The simple example looks as follows:
+ // BB1:
+ // p1 = b1 + 40
+ // br cond BB2, BB3
+ // BB2:
+ // p2 = b2 + 40
+ // br BB3
+ // BB3:
+ // p = phi [p1, BB1], [p2, BB2]
+ // v = load p
+ // Map is
+ // p1 -> b1
+ // p2 -> b2
+ // Request is
+ // p -> ?
+ // The function tries to find or build phi [b1, BB1], [b2, BB2] in BB3.
+ Value *findCommon(FoldAddrToValueMapping &Map) {
+    // Tracks the simplification of newly created phi nodes. We use this
+    // mapping because we will add newly created Phi nodes to AddrToBase.
+    // Simplification of Phi nodes is recursive, so some Phi node may
+    // be simplified after we add it to AddrToBase. In reality this
+    // simplification is possible only if the original phis/selects were not
+    // simplified yet.
+    // Using this mapping we can find the current value in AddrToBase.
+ SimplificationTracker ST(SQ);
+
+ // First step, DFS to create PHI nodes for all intermediate blocks.
+ // Also fill traverse order for the second step.
+ SmallVector<Value *, 32> TraverseOrder;
+ InsertPlaceholders(Map, TraverseOrder, ST);
+
+ // Second Step, fill new nodes by merged values and simplify if possible.
+ FillPlaceholders(Map, TraverseOrder, ST);
+
+ if (!AddrSinkNewSelects && ST.countNewSelectNodes() > 0) {
+ ST.destroyNewNodes(CommonType);
+ return nullptr;
+ }
+
+    // Now we'd like to match the new Phi nodes to existing ones.
+ unsigned PhiNotMatchedCount = 0;
+ if (!MatchPhiSet(ST, AddrSinkNewPhis, PhiNotMatchedCount)) {
+ ST.destroyNewNodes(CommonType);
+ return nullptr;
+ }
+
+ auto *Result = ST.Get(Map.find(Original)->second);
+ if (Result) {
+ NumMemoryInstsPhiCreated += ST.countNewPhiNodes() + PhiNotMatchedCount;
+ NumMemoryInstsSelectCreated += ST.countNewSelectNodes();
+ }
+ return Result;
+ }
+
+ /// Try to match PHI node to Candidate.
+ /// Matcher tracks the matched Phi nodes.
+ bool MatchPhiNode(PHINode *PHI, PHINode *Candidate,
+ SmallSetVector<PHIPair, 8> &Matcher,
+ PhiNodeSet &PhiNodesToMatch) {
+ SmallVector<PHIPair, 8> WorkList;
+ Matcher.insert({PHI, Candidate});
+ SmallSet<PHINode *, 8> MatchedPHIs;
+ MatchedPHIs.insert(PHI);
+ WorkList.push_back({PHI, Candidate});
+ SmallSet<PHIPair, 8> Visited;
+ while (!WorkList.empty()) {
+ auto Item = WorkList.pop_back_val();
+ if (!Visited.insert(Item).second)
+ continue;
+      // We iterate over all incoming values of the Phi to compare them.
+      // If the values are different, both of them are Phis, the first one is
+      // a Phi we added (subject to match) and both of them are in the same
+      // basic block, then we can match our pair if the values match. So we
+      // state that these values match and add them to the work list to verify
+      // that.
+ for (auto *B : Item.first->blocks()) {
+ Value *FirstValue = Item.first->getIncomingValueForBlock(B);
+ Value *SecondValue = Item.second->getIncomingValueForBlock(B);
+ if (FirstValue == SecondValue)
+ continue;
+
+ PHINode *FirstPhi = dyn_cast<PHINode>(FirstValue);
+ PHINode *SecondPhi = dyn_cast<PHINode>(SecondValue);
+
+        // If one of them is not a Phi, or
+        // the first one is not a Phi node from the set we'd like to match, or
+        // the Phi nodes are from different basic blocks, then
+        // we will not be able to match.
+ if (!FirstPhi || !SecondPhi || !PhiNodesToMatch.count(FirstPhi) ||
+ FirstPhi->getParent() != SecondPhi->getParent())
+ return false;
+
+ // If we already matched them then continue.
+ if (Matcher.count({FirstPhi, SecondPhi}))
+ continue;
+        // So the values are different and do not match. So we need them to
+ // match. (But we register no more than one match per PHI node, so that
+ // we won't later try to replace them twice.)
+ if (MatchedPHIs.insert(FirstPhi).second)
+ Matcher.insert({FirstPhi, SecondPhi});
+        // But we must check it.
+ WorkList.push_back({FirstPhi, SecondPhi});
+ }
+ }
+ return true;
+ }
+
+ /// For the given set of PHI nodes (in the SimplificationTracker) try
+ /// to find their equivalents.
+  /// Returns false if this matching fails and creation of new Phi nodes is disabled.
+ bool MatchPhiSet(SimplificationTracker &ST, bool AllowNewPhiNodes,
+ unsigned &PhiNotMatchedCount) {
+ // Matched and PhiNodesToMatch iterate their elements in a deterministic
+ // order, so the replacements (ReplacePhi) are also done in a deterministic
+ // order.
+ SmallSetVector<PHIPair, 8> Matched;
+ SmallPtrSet<PHINode *, 8> WillNotMatch;
+ PhiNodeSet &PhiNodesToMatch = ST.newPhiNodes();
+ while (PhiNodesToMatch.size()) {
+ PHINode *PHI = *PhiNodesToMatch.begin();
+
+      // Add ourselves; if no Phi node in the basic block matches us, we do not match.
+ WillNotMatch.clear();
+ WillNotMatch.insert(PHI);
+
+      // Traverse all Phis until we find an equivalent or fail to do so.
+ bool IsMatched = false;
+ for (auto &P : PHI->getParent()->phis()) {
+ // Skip new Phi nodes.
+ if (PhiNodesToMatch.count(&P))
+ continue;
+ if ((IsMatched = MatchPhiNode(PHI, &P, Matched, PhiNodesToMatch)))
+ break;
+        // If it does not match, collect all Phi nodes from the matcher:
+        // if we end up with no match, then all these Phi nodes will not match
+        // later.
+ for (auto M : Matched)
+ WillNotMatch.insert(M.first);
+ Matched.clear();
+ }
+ if (IsMatched) {
+ // Replace all matched values and erase them.
+ for (auto MV : Matched)
+ ST.ReplacePhi(MV.first, MV.second);
+ Matched.clear();
+ continue;
+ }
+ // If we are not allowed to create new nodes then bail out.
+ if (!AllowNewPhiNodes)
+ return false;
+ // Just remove all seen values in matcher. They will not match anything.
+ PhiNotMatchedCount += WillNotMatch.size();
+ for (auto *P : WillNotMatch)
+ PhiNodesToMatch.erase(P);
+ }
+ return true;
+  }
+
+  /// Fill the placeholders with values from predecessors and simplify them.
+ void FillPlaceholders(FoldAddrToValueMapping &Map,
+ SmallVectorImpl<Value *> &TraverseOrder,
+ SimplificationTracker &ST) {
+ while (!TraverseOrder.empty()) {
+ Value *Current = TraverseOrder.pop_back_val();
+ assert(Map.contains(Current) && "No node to fill!!!");
+ Value *V = Map[Current];
+
+ if (SelectInst *Select = dyn_cast<SelectInst>(V)) {
+ // CurrentValue also must be Select.
+ auto *CurrentSelect = cast<SelectInst>(Current);
+ auto *TrueValue = CurrentSelect->getTrueValue();
+ assert(Map.contains(TrueValue) && "No True Value!");
+ Select->setTrueValue(ST.Get(Map[TrueValue]));
+ auto *FalseValue = CurrentSelect->getFalseValue();
+ assert(Map.contains(FalseValue) && "No False Value!");
+ Select->setFalseValue(ST.Get(Map[FalseValue]));
+ } else {
+ // Must be a Phi node then.
+ auto *PHI = cast<PHINode>(V);
+ // Fill the Phi node with values from predecessors.
+ for (auto *B : predecessors(PHI->getParent())) {
+ Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B);
+ assert(Map.contains(PV) && "No predecessor Value!");
+ PHI->addIncoming(ST.Get(Map[PV]), B);
+ }
+ }
+ Map[Current] = ST.Simplify(V);
+ }
+ }
+
+  /// Starting from the original value, recursively iterates over the def-use
+  /// chain up to known ending values represented in the map. For each traversed
+  /// phi/select, inserts a placeholder Phi or Select.
+  /// Reports all newly created Phi/Select nodes by adding them to the set.
+  /// Also reports the order in which the values have been traversed.
+ void InsertPlaceholders(FoldAddrToValueMapping &Map,
+ SmallVectorImpl<Value *> &TraverseOrder,
+ SimplificationTracker &ST) {
+ SmallVector<Value *, 32> Worklist;
+ assert((isa<PHINode>(Original) || isa<SelectInst>(Original)) &&
+ "Address must be a Phi or Select node");
+ auto *Dummy = PoisonValue::get(CommonType);
+ Worklist.push_back(Original);
+ while (!Worklist.empty()) {
+ Value *Current = Worklist.pop_back_val();
+      // If it is already visited or it is an ending value, then skip it.
+ if (Map.contains(Current))
+ continue;
+ TraverseOrder.push_back(Current);
+
+ // CurrentValue must be a Phi node or select. All others must be covered
+ // by anchors.
+ if (SelectInst *CurrentSelect = dyn_cast<SelectInst>(Current)) {
+ // Is it OK to get metadata from OrigSelect?!
+ // Create a Select placeholder with dummy value.
+ SelectInst *Select = SelectInst::Create(
+ CurrentSelect->getCondition(), Dummy, Dummy,
+ CurrentSelect->getName(), CurrentSelect, CurrentSelect);
+ Map[Current] = Select;
+ ST.insertNewSelect(Select);
+ // We are interested in True and False values.
+ Worklist.push_back(CurrentSelect->getTrueValue());
+ Worklist.push_back(CurrentSelect->getFalseValue());
+ } else {
+ // It must be a Phi node then.
+ PHINode *CurrentPhi = cast<PHINode>(Current);
+ unsigned PredCount = CurrentPhi->getNumIncomingValues();
+ PHINode *PHI =
+ PHINode::Create(CommonType, PredCount, "sunk_phi", CurrentPhi);
+ Map[Current] = PHI;
+ ST.insertNewPhi(PHI);
+ append_range(Worklist, CurrentPhi->incoming_values());
+ }
+ }
+ }
+
+ bool addrModeCombiningAllowed() {
+ if (DisableComplexAddrModes)
+ return false;
+ switch (DifferentField) {
+ default:
+ return false;
+ case ExtAddrMode::BaseRegField:
+ return AddrSinkCombineBaseReg;
+ case ExtAddrMode::BaseGVField:
+ return AddrSinkCombineBaseGV;
+ case ExtAddrMode::BaseOffsField:
+ return AddrSinkCombineBaseOffs;
+ case ExtAddrMode::ScaledRegField:
+ return AddrSinkCombineScaledReg;
+ }
+ }
+};
+} // end anonymous namespace
+
+/// Try adding ScaleReg*Scale to the current addressing mode.
+/// Return true and update AddrMode if this addr mode is legal for the target,
+/// false if not.
+bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
+ unsigned Depth) {
+ // If Scale is 1, then this is the same as adding ScaleReg to the addressing
+ // mode. Just process that directly.
+ if (Scale == 1)
+ return matchAddr(ScaleReg, Depth);
+
+ // If the scale is 0, it takes nothing to add this.
+ if (Scale == 0)
+ return true;
+
+ // If we already have a scale of this value, we can add to it, otherwise, we
+ // need an available scale field.
+ if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
+ return false;
+
+ ExtAddrMode TestAddrMode = AddrMode;
+
+ // Add scale to turn X*4+X*3 -> X*7. This could also do things like
+ // [A+B + A*7] -> [B+A*8].
+ TestAddrMode.Scale += Scale;
+ TestAddrMode.ScaledReg = ScaleReg;
+
+ // If the new address isn't legal, bail out.
+ if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
+ return false;
+
+ // It was legal, so commit it.
+ AddrMode = TestAddrMode;
+
+ // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
+ // to see if ScaleReg is actually X+C. If so, we can turn this into adding
+  // X*Scale + C*Scale to the addr mode. If we found an available IV increment, do not
+ // go any further: we can reuse it and cannot eliminate it.
+ ConstantInt *CI = nullptr;
+ Value *AddLHS = nullptr;
+ if (isa<Instruction>(ScaleReg) && // not a constant expr.
+ match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
+ !isIVIncrement(ScaleReg, &LI) && CI->getValue().isSignedIntN(64)) {
+ TestAddrMode.InBounds = false;
+ TestAddrMode.ScaledReg = AddLHS;
+ TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;
+
+ // If this addressing mode is legal, commit it and remember that we folded
+ // this instruction.
+ if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
+ AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
+ AddrMode = TestAddrMode;
+ return true;
+ }
+ // Restore status quo.
+ TestAddrMode = AddrMode;
+ }
+
+ // If this is an add recurrence with a constant step, return the increment
+ // instruction and the canonicalized step.
+ auto GetConstantStep =
+ [this](const Value *V) -> std::optional<std::pair<Instruction *, APInt>> {
+ auto *PN = dyn_cast<PHINode>(V);
+ if (!PN)
+ return std::nullopt;
+ auto IVInc = getIVIncrement(PN, &LI);
+ if (!IVInc)
+ return std::nullopt;
+    // TODO: The result of the intrinsics above is two's complement. However,
+    // when the IV inc is expressed as an add or sub, iv.next is potentially a
+    // poison value. If it has nuw or nsw flags, we need to make sure that these
+    // flags are inferrable at the point of the memory instruction. Otherwise we
+    // would be replacing a well-defined two's complement computation with
+    // poison. Currently, to avoid the potentially complex analysis needed to
+    // prove this, we reject such cases.
+ if (auto *OIVInc = dyn_cast<OverflowingBinaryOperator>(IVInc->first))
+ if (OIVInc->hasNoSignedWrap() || OIVInc->hasNoUnsignedWrap())
+ return std::nullopt;
+ if (auto *ConstantStep = dyn_cast<ConstantInt>(IVInc->second))
+ return std::make_pair(IVInc->first, ConstantStep->getValue());
+ return std::nullopt;
+ };
+
+ // Try to account for the following special case:
+ // 1. ScaleReg is an inductive variable;
+ // 2. We use it with non-zero offset;
+ // 3. IV's increment is available at the point of memory instruction.
+ //
+ // In this case, we may reuse the IV increment instead of the IV Phi to
+ // achieve the following advantages:
+ // 1. If IV step matches the offset, we will have no need in the offset;
+ // 2. Even if they don't match, we will reduce the overlap of living IV
+ // and IV increment, that will potentially lead to better register
+ // assignment.
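+  //
+  // A hypothetical IR sketch of this case (Scale = 1, BaseOffs = 8):
+  //   %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
+  //   %iv.next = add i64 %iv, 8
+  //   ... memory access at (%base + 8 + %iv * 1) ...
+  // can instead address (%base + %iv.next * 1): the offset is absorbed into
+  // the already-available increment.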
+ if (AddrMode.BaseOffs) {
+ if (auto IVStep = GetConstantStep(ScaleReg)) {
+ Instruction *IVInc = IVStep->first;
+ // The following assert is important to ensure a lack of infinite loops.
+      // This transform is (intentionally) the inverse of the one just above.
+ // If they don't agree on the definition of an increment, we'd alternate
+ // back and forth indefinitely.
+ assert(isIVIncrement(IVInc, &LI) && "implied by GetConstantStep");
+ APInt Step = IVStep->second;
+ APInt Offset = Step * AddrMode.Scale;
+ if (Offset.isSignedIntN(64)) {
+ TestAddrMode.InBounds = false;
+ TestAddrMode.ScaledReg = IVInc;
+ TestAddrMode.BaseOffs -= Offset.getLimitedValue();
+        // If this addressing mode is legal, commit it.
+        // (Note that we defer the (expensive) domtree-based legality check
+ // to the very last possible point.)
+ if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace) &&
+ getDTFn().dominates(IVInc, MemoryInst)) {
+ AddrModeInsts.push_back(cast<Instruction>(IVInc));
+ AddrMode = TestAddrMode;
+ return true;
+ }
+ // Restore status quo.
+ TestAddrMode = AddrMode;
+ }
+ }
+ }
+
+ // Otherwise, just return what we have.
+ return true;
+}
+
+/// This is a little filter, which returns true if an addressing computation
+/// involving I might be folded into a load/store accessing it.
+/// This doesn't need to be perfect, but needs to accept at least
+/// the set of instructions that MatchOperationAddr can.
+static bool MightBeFoldableInst(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ // Don't touch identity bitcasts.
+ if (I->getType() == I->getOperand(0)->getType())
+ return false;
+ return I->getType()->isIntOrPtrTy();
+ case Instruction::PtrToInt:
+ // PtrToInt is always a noop, as we know that the int type is pointer sized.
+ return true;
+ case Instruction::IntToPtr:
+ // We know the input is intptr_t, so this is foldable.
+ return true;
+ case Instruction::Add:
+ return true;
+ case Instruction::Mul:
+ case Instruction::Shl:
+ // Can only handle X*C and X << C.
+ return isa<ConstantInt>(I->getOperand(1));
+ case Instruction::GetElementPtr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// Check whether or not \p Val is a legal instruction for \p TLI.
+/// \note \p Val is assumed to be the product of some type promotion.
+/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
+/// to be legal, as the non-promoted value would have had the same state.
+static bool isPromotedInstructionLegal(const TargetLowering &TLI,
+ const DataLayout &DL, Value *Val) {
+ Instruction *PromotedInst = dyn_cast<Instruction>(Val);
+ if (!PromotedInst)
+ return false;
+ int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
+ // If the ISDOpcode is undefined, it was undefined before the promotion.
+ if (!ISDOpcode)
+ return true;
+ // Otherwise, check if the promoted instruction is legal or not.
+ return TLI.isOperationLegalOrCustom(
+ ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
+}
+
+namespace {
+
+/// Helper class to perform type promotion.
+class TypePromotionHelper {
+ /// Utility function to add a promoted instruction \p ExtOpnd to
+ /// \p PromotedInsts and record the type of extension we have seen.
+ static void addPromotedInst(InstrToOrigTy &PromotedInsts,
+ Instruction *ExtOpnd, bool IsSExt) {
+ ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
+ InstrToOrigTy::iterator It = PromotedInsts.find(ExtOpnd);
+ if (It != PromotedInsts.end()) {
+      // If the new extension is the same as the original, the information in
+ // PromotedInsts[ExtOpnd] is still correct.
+ if (It->second.getInt() == ExtTy)
+ return;
+
+      // Now that the new extension is different from the old extension, we
+      // make the type information invalid by setting the extension type to
+      // BothExtension.
+ ExtTy = BothExtension;
+ }
+ PromotedInsts[ExtOpnd] = TypeIsSExt(ExtOpnd->getType(), ExtTy);
+ }
+
+ /// Utility function to query the original type of instruction \p Opnd
+ /// with a matched extension type. If the extension doesn't match, we
+ /// cannot use the information we had on the original type.
+ /// BothExtension doesn't match any extension type.
+ static const Type *getOrigType(const InstrToOrigTy &PromotedInsts,
+ Instruction *Opnd, bool IsSExt) {
+ ExtType ExtTy = IsSExt ? SignExtension : ZeroExtension;
+ InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
+ if (It != PromotedInsts.end() && It->second.getInt() == ExtTy)
+ return It->second.getPointer();
+ return nullptr;
+ }
+
+ /// Utility function to check whether or not a sign or zero extension
+ /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
+ /// either using the operands of \p Inst or promoting \p Inst.
+ /// The type of the extension is defined by \p IsSExt.
+ /// In other words, check if:
+ /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
+ /// #1 Promotion applies:
+ /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
+ /// #2 Operand reuses:
+ /// ext opnd1 to ConsideredExtType.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
+ const InstrToOrigTy &PromotedInsts, bool IsSExt);
+
+ /// Utility function to determine if \p OpIdx should be promoted when
+ /// promoting \p Inst.
+ static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
+ return !(isa<SelectInst>(Inst) && OpIdx == 0);
+ }
+
+ /// Utility function to promote the operand of \p Ext when this
+ /// operand is a promotable trunc or sext or zext.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ /// \p CreatedInstsCost[out] contains the cost of all instructions
+ /// created to promote the operand of Ext.
+ /// Newly added extensions are inserted in \p Exts.
+ /// Newly added truncates are inserted in \p Truncs.
+ /// Should never be called directly.
+ /// \return The promoted value which is used instead of Ext.
+ static Value *promoteOperandForTruncAndAnyExt(
+ Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
+
+ /// Utility function to promote the operand of \p Ext when this
+ /// operand is promotable and is not a supported trunc or sext.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ /// \p CreatedInstsCost[out] contains the cost of all the instructions
+ /// created to promote the operand of Ext.
+ /// Newly added extensions are inserted in \p Exts.
+ /// Newly added truncates are inserted in \p Truncs.
+ /// Should never be called directly.
+ /// \return The promoted value which is used instead of Ext.
+ static Value *promoteOperandForOther(Instruction *Ext,
+ TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts,
+ unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs,
+ const TargetLowering &TLI, bool IsSExt);
+
+ /// \see promoteOperandForOther.
+ static Value *signExtendOperandForOther(
+ Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
+ return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
+ Exts, Truncs, TLI, true);
+ }
+
+ /// \see promoteOperandForOther.
+ static Value *zeroExtendOperandForOther(
+ Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
+ return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
+ Exts, Truncs, TLI, false);
+ }
+
+public:
+ /// Type for the utility function that promotes the operand of Ext.
+ using Action = Value *(*)(Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts,
+ unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs,
+ const TargetLowering &TLI);
+
+ /// Given a sign/zero extend instruction \p Ext, return the appropriate
+ /// action to promote the operand of \p Ext instead of using Ext.
+ /// \return NULL if no promotable action is possible with the current
+ /// sign extension.
+ /// \p InsertedInsts keeps track of all the instructions inserted by the
+ /// other CodeGenPrepare optimizations. This information is important
+ /// because we do not want to promote these instructions as CodeGenPrepare
+ /// will reinsert them later. Thus creating an infinite loop: create/remove.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
+ const TargetLowering &TLI,
+ const InstrToOrigTy &PromotedInsts);
+};
+
+} // end anonymous namespace
+
+bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
+ Type *ConsideredExtType,
+ const InstrToOrigTy &PromotedInsts,
+ bool IsSExt) {
+ // The promotion helper does not know how to deal with vector types yet.
+ // To be able to fix that, we would need to fix the places where we
+ // statically extend, e.g., constants and such.
+ if (Inst->getType()->isVectorTy())
+ return false;
+
+ // We can always get through zext.
+ if (isa<ZExtInst>(Inst))
+ return true;
+
+ // sext(sext) is ok too.
+ if (IsSExt && isa<SExtInst>(Inst))
+ return true;
+
+  // We can get through a binary operator if it is legal, i.e. if the
+  // binary operator has a nuw or nsw flag.
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(Inst))
+ if (isa<OverflowingBinaryOperator>(BinOp) &&
+ ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
+ (IsSExt && BinOp->hasNoSignedWrap())))
+ return true;
+
+ // ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
+ if ((Inst->getOpcode() == Instruction::And ||
+ Inst->getOpcode() == Instruction::Or))
+ return true;
+
+ // ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
+ if (Inst->getOpcode() == Instruction::Xor) {
+ // Make sure it is not a NOT.
+ if (const auto *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1)))
+ if (!Cst->getValue().isAllOnes())
+ return true;
+ }
+
+  // zext(lshr(opnd, cst)) --> lshr(zext(opnd), zext(cst))
+  // It may change a poisoned value into a regular value, like
+  //     zext i32 (lshr i8 %val, 12) --> lshr i32 (zext i8 %val), 12
+  //          poisoned value                 regular value
+  // It should be OK since undef covers a valid value.
+ if (Inst->getOpcode() == Instruction::LShr && !IsSExt)
+ return true;
+
+ // and(ext(shl(opnd, cst)), cst) --> and(shl(ext(opnd), ext(cst)), cst)
+ // It may change a poisoned value into a regular value, like
+ // zext i32 (shl i8 %val, 12) --> shl i32 (zext i8 %val), 12
+ // poisoned value regular value
+ // It should be OK since undef covers valid value.
+ if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
+ const auto *ExtInst = cast<const Instruction>(*Inst->user_begin());
+ if (ExtInst->hasOneUse()) {
+ const auto *AndInst = dyn_cast<const Instruction>(*ExtInst->user_begin());
+ if (AndInst && AndInst->getOpcode() == Instruction::And) {
+ const auto *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
+ if (Cst &&
+ Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))
+ return true;
+ }
+ }
+ }
+
+ // Check if we can do the following simplification.
+ // ext(trunc(opnd)) --> ext(opnd)
+ if (!isa<TruncInst>(Inst))
+ return false;
+
+ Value *OpndVal = Inst->getOperand(0);
+ // Check if we can use this operand in the extension.
+ // If the type is larger than the result type of the extension, we cannot.
+ if (!OpndVal->getType()->isIntegerTy() ||
+ OpndVal->getType()->getIntegerBitWidth() >
+ ConsideredExtType->getIntegerBitWidth())
+ return false;
+
+ // If the operand of the truncate is not an instruction, we will not have
+ // any information on the dropped bits.
+  // (Actually we could for constants, but it is not worth the extra logic.)
+ Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
+ if (!Opnd)
+ return false;
+
+  // Check if the source of the truncate is narrow enough.
+  // I.e., check that the trunc just drops extended bits of the same kind as
+  // the extension.
+ // #1 get the type of the operand and check the kind of the extended bits.
+  const Type *OpndType = getOrigType(PromotedInsts, Opnd, IsSExt);
+  if (!OpndType) {
+    if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
+      OpndType = Opnd->getOperand(0)->getType();
+    else
+      return false;
+  }
+
+ // #2 check that the truncate just drops extended bits.
+ return Inst->getType()->getIntegerBitWidth() >=
+ OpndType->getIntegerBitWidth();
+}
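+
+// For illustration, a hypothetical case that canGetThrough accepts for a sign
+// extension (the nsw flag is what makes the promotion legal):
+//   %a = add nsw i32 %x, %y
+//   %e = sext i32 %a to i64
+// can be promoted to
+//   %x64 = sext i32 %x to i64
+//   %y64 = sext i32 %y to i64
+//   %a64 = add nsw i64 %x64, %y64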
+
+TypePromotionHelper::Action TypePromotionHelper::getAction(
+ Instruction *Ext, const SetOfInstrs &InsertedInsts,
+ const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
+ assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ "Unexpected instruction type");
+ Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
+ Type *ExtTy = Ext->getType();
+ bool IsSExt = isa<SExtInst>(Ext);
+ // If the operand of the extension is not an instruction, we cannot
+ // get through.
+  // If it is, check that we can get through.
+ if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
+ return nullptr;
+
+ // Do not promote if the operand has been added by codegenprepare.
+ // Otherwise, it means we are undoing an optimization that is likely to be
+ // redone, thus causing potential infinite loop.
+ if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
+ return nullptr;
+
+  // SExt, ZExt or Trunc instructions.
+ // Return the related handler.
+ if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
+ isa<ZExtInst>(ExtOpnd))
+ return promoteOperandForTruncAndAnyExt;
+
+ // Regular instruction.
+ // Abort early if we will have to insert non-free instructions.
+ if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
+ return nullptr;
+ return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
+}
+
+Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
+ Instruction *SExt, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
+ // By construction, the operand of SExt is an instruction. Otherwise we cannot
+ // get through it and this method should not be called.
+ Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
+ Value *ExtVal = SExt;
+ bool HasMergedNonFreeExt = false;
+ if (isa<ZExtInst>(SExtOpnd)) {
+ // Replace s|zext(zext(opnd))
+ // => zext(opnd).
+ HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
+ Value *ZExt =
+ TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
+ TPT.replaceAllUsesWith(SExt, ZExt);
+ TPT.eraseInstruction(SExt);
+ ExtVal = ZExt;
+ } else {
+ // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
+ // => z|sext(opnd).
+ TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
+ }
+ CreatedInstsCost = 0;
+
+ // Remove dead code.
+ if (SExtOpnd->use_empty())
+ TPT.eraseInstruction(SExtOpnd);
+
+ // Check if the extension is still needed.
+ Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
+ if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
+ if (ExtInst) {
+ if (Exts)
+ Exts->push_back(ExtInst);
+ CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
+ }
+ return ExtVal;
+ }
+
+ // At this point we have: ext ty opnd to ty.
+ // Reassign the uses of ExtInst to the opnd and remove ExtInst.
+ Value *NextVal = ExtInst->getOperand(0);
+ TPT.eraseInstruction(ExtInst, NextVal);
+ return NextVal;
+}
+
+Value *TypePromotionHelper::promoteOperandForOther(
+ Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
+ bool IsSExt) {
+ // By construction, the operand of Ext is an instruction. Otherwise we cannot
+ // get through it and this method should not be called.
+ Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
+ CreatedInstsCost = 0;
+ if (!ExtOpnd->hasOneUse()) {
+ // ExtOpnd will be promoted.
+ // All its uses, but Ext, will need to use a truncated value of the
+ // promoted version.
+ // Create the truncate now.
+ Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
+ if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
+ // Insert it just after the definition.
+ ITrunc->moveAfter(ExtOpnd);
+ if (Truncs)
+ Truncs->push_back(ITrunc);
+ }
+
+ TPT.replaceAllUsesWith(ExtOpnd, Trunc);
+ // Restore the operand of Ext (which has been replaced by the previous call
+ // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
+ TPT.setOperand(Ext, 0, ExtOpnd);
+ }
+
+ // Get through the Instruction:
+ // 1. Update its type.
+ // 2. Replace the uses of Ext by Inst.
+ // 3. Extend each operand that needs to be extended.
+
+ // Remember the original type of the instruction before promotion.
+ // This is useful to know that the high bits are sign extended bits.
+ addPromotedInst(PromotedInsts, ExtOpnd, IsSExt);
+ // Step #1.
+ TPT.mutateType(ExtOpnd, Ext->getType());
+ // Step #2.
+ TPT.replaceAllUsesWith(Ext, ExtOpnd);
+ // Step #3.
+ Instruction *ExtForOpnd = Ext;
+
+ LLVM_DEBUG(dbgs() << "Propagate Ext to operands\n");
+ for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
+ ++OpIdx) {
+ LLVM_DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
+ if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
+ !shouldExtOperand(ExtOpnd, OpIdx)) {
+ LLVM_DEBUG(dbgs() << "No need to propagate\n");
+ continue;
+ }
+ // Check if we can statically extend the operand.
+ Value *Opnd = ExtOpnd->getOperand(OpIdx);
+ if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
+ LLVM_DEBUG(dbgs() << "Statically extend\n");
+ unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
+ APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
+ : Cst->getValue().zext(BitWidth);
+ TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
+ continue;
+ }
+    // UndefValues are typed, so we have to statically sign extend them.
+ if (isa<UndefValue>(Opnd)) {
+ LLVM_DEBUG(dbgs() << "Statically extend\n");
+ TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
+ continue;
+ }
+
+ // Otherwise we have to explicitly sign extend the operand.
+    // Check whether Ext has already been reused to extend another operand.
+    if (!ExtForOpnd) {
+      // If so, create a new extension.
+ LLVM_DEBUG(dbgs() << "More operands to ext\n");
+ Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
+ : TPT.createZExt(Ext, Opnd, Ext->getType());
+ if (!isa<Instruction>(ValForExtOpnd)) {
+ TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
+ continue;
+ }
+ ExtForOpnd = cast<Instruction>(ValForExtOpnd);
+ }
+ if (Exts)
+ Exts->push_back(ExtForOpnd);
+ TPT.setOperand(ExtForOpnd, 0, Opnd);
+
+ // Move the sign extension before the insertion point.
+ TPT.moveBefore(ExtForOpnd, ExtOpnd);
+ TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
+ CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
+ // If more sext are required, new instructions will have to be created.
+ ExtForOpnd = nullptr;
+ }
+ if (ExtForOpnd == Ext) {
+ LLVM_DEBUG(dbgs() << "Extension is useless now\n");
+ TPT.eraseInstruction(Ext);
+ }
+ return ExtOpnd;
+}
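+
+// For illustration (hypothetical IR): when the promoted operand has users
+// other than the extension, promoteOperandForOther inserts a truncate for
+// them, roughly turning
+//   %a = add nsw i32 %x, 1   ; %a also has another i32 user
+//   %e = sext i32 %a to i64
+// into
+//   %x64 = sext i32 %x to i64
+//   %a   = add nsw i64 %x64, 1
+//   %t   = trunc i64 %a to i32   ; feeds the remaining i32 users of the old %a
+// while the former users of %e now use the widened %a directly.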
+
+/// Check whether or not promoting an instruction to a wider type is profitable.
+/// \p NewCost gives the cost of extension instructions created by the
+/// promotion.
+/// \p OldCost gives the cost of extension instructions before the promotion
+/// plus the number of instructions that have been
+/// matched in the addressing mode thanks to the promotion.
+/// \p PromotedOperand is the value that has been promoted.
+/// \return True if the promotion is profitable, false otherwise.
+bool AddressingModeMatcher::isPromotionProfitable(
+ unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
+ LLVM_DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost
+ << '\n');
+ // The cost of the new extensions is greater than the cost of the
+ // old extension plus what we folded.
+ // This is not profitable.
+ if (NewCost > OldCost)
+ return false;
+ if (NewCost < OldCost)
+ return true;
+ // The promotion is neutral but it may help folding the sign extension in
+ // loads for instance.
+ // Check that we did not create an illegal instruction.
+ return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
+}
+
+/// Given an instruction or constant expr, see if we can fold the operation
+/// into the addressing mode. If so, update the addressing mode and return
+/// true, otherwise return false without modifying AddrMode.
+/// If \p MovedAway is not NULL, it contains information on whether or
+/// not AddrInst has to be folded into the addressing mode on success.
+/// If \p MovedAway == true, \p AddrInst will not be part of the addressing
+/// mode because it has been moved away.
+/// Thus AddrInst must not be added to the matched instructions.
+/// This state can happen when AddrInst is a sext, since it may be moved away.
+/// Therefore, AddrInst may not be valid when MovedAway is true and it must
+/// not be referenced anymore.
+bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
+ unsigned Depth,
+ bool *MovedAway) {
+ // Avoid exponential behavior on extremely deep expression trees.
+ if (Depth >= 5)
+ return false;
+
+ // By default, all matched instructions stay in place.
+ if (MovedAway)
+ *MovedAway = false;
+
+ switch (Opcode) {
+ case Instruction::PtrToInt:
+ // PtrToInt is always a noop, as we know that the int type is pointer sized.
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ case Instruction::IntToPtr: {
+ auto AS = AddrInst->getType()->getPointerAddressSpace();
+ auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
+ // This inttoptr is a no-op if the integer type is pointer sized.
+ if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ }
+ case Instruction::BitCast:
+ // BitCast is always a noop, and we can handle it as long as it is
+ // int->int or pointer->pointer (we don't want int<->fp or something).
+ if (AddrInst->getOperand(0)->getType()->isIntOrPtrTy() &&
+ // Don't touch identity bitcasts. These were probably put here by LSR,
+ // and we don't want to mess around with them. Assume it knows what it
+ // is doing.
+ AddrInst->getOperand(0)->getType() != AddrInst->getType())
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ case Instruction::AddrSpaceCast: {
+ unsigned SrcAS =
+ AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
+ unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
+ if (TLI.getTargetMachine().isNoopAddrSpaceCast(SrcAS, DestAS))
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ }
+ case Instruction::Add: {
+ // Check to see if we can merge in one operand, then the other. If so, we
+ // win.
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+ // Start a transaction at this point.
+ // The LHS may match but not the RHS.
+ // Therefore, we need a higher level restoration point to undo partially
+ // matched operation.
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+
+ // Try to match an integer constant second to increase its chance of ending
+ // up in `BaseOffs`, resp. decrease its chance of ending up in `BaseReg`.
+ int First = 0, Second = 1;
+ if (isa<ConstantInt>(AddrInst->getOperand(First))
+ && !isa<ConstantInt>(AddrInst->getOperand(Second)))
+ std::swap(First, Second);
+ AddrMode.InBounds = false;
+ if (matchAddr(AddrInst->getOperand(First), Depth + 1) &&
+ matchAddr(AddrInst->getOperand(Second), Depth + 1))
+ return true;
+
+ // Restore the old addr mode info.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ TPT.rollback(LastKnownGood);
+
+ // Otherwise this was over-aggressive. Try merging operands in the opposite
+ // order.
+ if (matchAddr(AddrInst->getOperand(Second), Depth + 1) &&
+ matchAddr(AddrInst->getOperand(First), Depth + 1))
+ return true;
+
+ // Otherwise we definitely can't merge the ADD in.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ TPT.rollback(LastKnownGood);
+ break;
+ }
+ // case Instruction::Or:
+ // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
+ // break;
+ case Instruction::Mul:
+ case Instruction::Shl: {
+ // Can only handle X*C and X << C.
+ AddrMode.InBounds = false;
+ ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+ if (!RHS || RHS->getBitWidth() > 64)
+ return false;
+ int64_t Scale = Opcode == Instruction::Shl
+ ? 1LL << RHS->getLimitedValue(RHS->getBitWidth() - 1)
+ : RHS->getSExtValue();
+
+ return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+ }
+ case Instruction::GetElementPtr: {
+    // Scan the GEP. We check whether it contains constant offsets and at most
+ // one variable offset.
+ int VariableOperand = -1;
+ unsigned VariableScale = 0;
+
+ int64_t ConstantOffset = 0;
+ gep_type_iterator GTI = gep_type_begin(AddrInst);
+ for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx =
+ cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
+ ConstantOffset += SL->getElementOffset(Idx);
+ } else {
+ TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType());
+ if (TS.isNonZero()) {
+ // The optimisations below currently only work for fixed offsets.
+ if (TS.isScalable())
+ return false;
+ int64_t TypeSize = TS.getFixedValue();
+ if (ConstantInt *CI =
+ dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+ const APInt &CVal = CI->getValue();
+ if (CVal.getSignificantBits() <= 64) {
+ ConstantOffset += CVal.getSExtValue() * TypeSize;
+ continue;
+ }
+ }
+ // We only allow one variable index at the moment.
+ if (VariableOperand != -1)
+ return false;
+
+ // Remember the variable index.
+ VariableOperand = i;
+ VariableScale = TypeSize;
+ }
+ }
+ }
+
+ // A common case is for the GEP to only do a constant offset. In this case,
+ // just add it to the disp field and check validity.
+ if (VariableOperand == -1) {
+ AddrMode.BaseOffs += ConstantOffset;
+ if (matchAddr(AddrInst->getOperand(0), Depth + 1)) {
+ if (!cast<GEPOperator>(AddrInst)->isInBounds())
+ AddrMode.InBounds = false;
+ return true;
+ }
+ AddrMode.BaseOffs -= ConstantOffset;
+
+ if (EnableGEPOffsetSplit && isa<GetElementPtrInst>(AddrInst) &&
+ TLI.shouldConsiderGEPOffsetSplit() && Depth == 0 &&
+ ConstantOffset > 0) {
+ // Record GEPs with non-zero offsets as candidates for splitting in
+ // the event that the offset cannot fit into the r+i addressing mode.
+ // Simple and common case that only one GEP is used in calculating the
+ // address for the memory access.
+ Value *Base = AddrInst->getOperand(0);
+ auto *BaseI = dyn_cast<Instruction>(Base);
+ auto *GEP = cast<GetElementPtrInst>(AddrInst);
+ if (isa<Argument>(Base) || isa<GlobalValue>(Base) ||
+ (BaseI && !isa<CastInst>(BaseI) &&
+ !isa<GetElementPtrInst>(BaseI))) {
+ // Make sure the parent block allows inserting non-PHI instructions
+ // before the terminator.
+ BasicBlock *Parent = BaseI ? BaseI->getParent()
+ : &GEP->getFunction()->getEntryBlock();
+ if (!Parent->getTerminator()->isEHPad())
+ LargeOffsetGEP = std::make_pair(GEP, ConstantOffset);
+ }
+ }
+
+ return false;
+ }
+
+ // Save the valid addressing mode in case we can't match.
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ // See if the scale and offset amount is valid for this target.
+ AddrMode.BaseOffs += ConstantOffset;
+ if (!cast<GEPOperator>(AddrInst)->isInBounds())
+ AddrMode.InBounds = false;
+
+ // Match the base operand of the GEP.
+ if (!matchAddr(AddrInst->getOperand(0), Depth + 1)) {
+ // If it couldn't be matched, just stuff the value in a register.
+ if (AddrMode.HasBaseReg) {
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ return false;
+ }
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = AddrInst->getOperand(0);
+ }
+
+ // Match the remaining variable portion of the GEP.
+ if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+ Depth)) {
+ // If it couldn't be matched, try stuffing the base into a register
+ // instead of matching it, and retrying the match of the scale.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ if (AddrMode.HasBaseReg)
+ return false;
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = AddrInst->getOperand(0);
+ AddrMode.BaseOffs += ConstantOffset;
+ if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
+ VariableScale, Depth)) {
+ // If even that didn't work, bail.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ return false;
+ }
+ }
+
+ return true;
+ }
+ case Instruction::SExt:
+ case Instruction::ZExt: {
+ Instruction *Ext = dyn_cast<Instruction>(AddrInst);
+ if (!Ext)
+ return false;
+
+ // Try to move this ext out of the way of the addressing mode.
+ // Ask for a method for doing so.
+ TypePromotionHelper::Action TPH =
+ TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
+ if (!TPH)
+ return false;
+
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ unsigned CreatedInstsCost = 0;
+ unsigned ExtCost = !TLI.isExtFree(Ext);
+ Value *PromotedOperand =
+ TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
+ // SExt has been moved away.
+ // Thus either it will be rematched later in the recursive calls or it is
+ // gone. Anyway, we must not fold it into the addressing mode at this point.
+ // E.g.,
+ // op = add opnd, 1
+ // idx = ext op
+ // addr = gep base, idx
+ // is now:
+ // promotedOpnd = ext opnd <- no match here
+ // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls)
+ // addr = gep base, op <- match
+ if (MovedAway)
+ *MovedAway = true;
+
+ assert(PromotedOperand &&
+ "TypePromotionHelper should have filtered out those cases");
+
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ if (!matchAddr(PromotedOperand, Depth) ||
+ // The total of the new cost is equal to the cost of the created
+ // instructions.
+ // The total of the old cost is equal to the cost of the extension plus
+ // what we have saved in the addressing mode.
+ !isPromotionProfitable(CreatedInstsCost,
+ ExtCost + (AddrModeInsts.size() - OldSize),
+ PromotedOperand)) {
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ LLVM_DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
+ TPT.rollback(LastKnownGood);
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/// If we can, try to add the value of 'Addr' into the current addressing mode.
+/// If Addr can't be added to AddrMode this returns false and leaves AddrMode
+/// unmodified. This assumes that Addr is either a pointer type or intptr_t
+/// for the target.
+///
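+/// As a rough example (illustrative IR): for an address such as
+/// "getelementptr i8, ptr %p, i64 16", the constant 16 can be folded into
+/// AddrMode.BaseOffs with %p as the base register, provided the target
+/// reports [reg + 16] as a legal addressing mode.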
+bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
+ // Start a transaction at this point that we will rollback if the matching
+ // fails.
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
+ if (CI->getValue().isSignedIntN(64)) {
+ // Fold in immediates if legal for the target.
+ AddrMode.BaseOffs += CI->getSExtValue();
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.BaseOffs -= CI->getSExtValue();
+ }
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
+ // If this is a global variable, try to fold it into the addressing mode.
+ if (!AddrMode.BaseGV) {
+ AddrMode.BaseGV = GV;
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.BaseGV = nullptr;
+ }
+ } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ // Check to see if it is possible to fold this operation.
+ bool MovedAway = false;
+ if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
+ // This instruction may have been moved away. If so, there is nothing
+ // to check here.
+ if (MovedAway)
+ return true;
+ // Okay, it's possible to fold this. Check to see if it is actually
+ // *profitable* to do so. We use a simple cost model to avoid increasing
+ // register pressure too much.
+ if (I->hasOneUse() ||
+ isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
+ AddrModeInsts.push_back(I);
+ return true;
+ }
+
+ // It isn't profitable to do this, roll back.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ TPT.rollback(LastKnownGood);
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+ if (matchOperationAddr(CE, CE->getOpcode(), Depth))
+ return true;
+ TPT.rollback(LastKnownGood);
+ } else if (isa<ConstantPointerNull>(Addr)) {
+ // Null pointer gets folded without affecting the addressing mode.
+ return true;
+ }
+
+ // Worst case, the target should support [reg] addressing modes. :)
+ if (!AddrMode.HasBaseReg) {
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = Addr;
+ // Still check for legality in case the target supports [imm] but not [i+r].
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.HasBaseReg = false;
+ AddrMode.BaseReg = nullptr;
+ }
+
+ // If the base register is already taken, see if we can do [r+r].
+ if (AddrMode.Scale == 0) {
+ AddrMode.Scale = 1;
+ AddrMode.ScaledReg = Addr;
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.Scale = 0;
+ AddrMode.ScaledReg = nullptr;
+ }
+ // Couldn't match.
+ TPT.rollback(LastKnownGood);
+ return false;
+}
+
+/// Check to see if all uses of OpVal by the specified inline asm call are due
+/// to memory operands. If so, return true, otherwise return false.
+static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
+ const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI) {
+ const Function *F = CI->getFunction();
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI);
+
+ for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) {
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(OpInfo, SDValue());
+
+ // If this asm operand is our Value*, and if it isn't an indirect memory
+ // operand, we can't fold it! TODO: Also handle C_Address?
+ if (OpInfo.CallOperandVal == OpVal &&
+ (OpInfo.ConstraintType != TargetLowering::C_Memory ||
+ !OpInfo.isIndirect))
+ return false;
+ }
+
+ return true;
+}
+
+/// Recursively walk all the uses of I until we find a memory use.
+/// If we find an obviously non-foldable instruction, return true.
+/// Add accessed addresses and types to MemoryUses.
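+/// For instance (a hypothetical example): if I feeds both a load and the
+/// pointer operand of a store, both uses are recorded in MemoryUses; if I is
+/// instead stored as a value (rather than used as the address), the walk
+/// stops and reports a non-foldable use.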
+static bool FindAllMemoryUses(
+ Instruction *I, SmallVectorImpl<std::pair<Use *, Type *>> &MemoryUses,
+ SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI, bool OptSize, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI, unsigned &SeenInsts) {
+ // If we already considered this instruction, we're done.
+ if (!ConsideredInsts.insert(I).second)
+ return false;
+
+ // If this is an obviously unfoldable instruction, bail out.
+ if (!MightBeFoldableInst(I))
+ return true;
+
+ // Loop over all the uses, recursively processing them.
+ for (Use &U : I->uses()) {
+ // Conservatively return true if we're seeing a large number or a deep chain
+ // of users. This avoids excessive compilation times in pathological cases.
+ if (SeenInsts++ >= MaxAddressUsersToScan)
+ return true;
+
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
+ MemoryUses.push_back({&U, LI->getType()});
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
+ if (U.getOperandNo() != StoreInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
+ MemoryUses.push_back({&U, SI->getValueOperand()->getType()});
+ continue;
+ }
+
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
+ if (U.getOperandNo() != AtomicRMWInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
+ MemoryUses.push_back({&U, RMW->getValOperand()->getType()});
+ continue;
+ }
+
+ if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
+ if (U.getOperandNo() != AtomicCmpXchgInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
+ MemoryUses.push_back({&U, CmpX->getCompareOperand()->getType()});
+ continue;
+ }
+
+ if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
+ if (CI->hasFnAttr(Attribute::Cold)) {
+ // If this is a cold call, we can sink the addressing calculation into
+ // the cold path. See optimizeCallInst
+ bool OptForSize =
+ OptSize || llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
+ if (!OptForSize)
+ continue;
+ }
+
+ InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand());
+ if (!IA)
+ return true;
+
+ // If this is a memory operand, we're cool, otherwise bail out.
+ if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
+ return true;
+ continue;
+ }
+
+ if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
+ PSI, BFI, SeenInsts))
+ return true;
+ }
+
+ return false;
+}
+
+static bool FindAllMemoryUses(
+ Instruction *I, SmallVectorImpl<std::pair<Use *, Type *>> &MemoryUses,
+ const TargetLowering &TLI, const TargetRegisterInfo &TRI, bool OptSize,
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+ unsigned SeenInsts = 0;
+ SmallPtrSet<Instruction *, 16> ConsideredInsts;
+ return FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI, OptSize,
+ PSI, BFI, SeenInsts);
+}
+
+/// Return true if Val is already known to be live at the use site that we're
+/// folding it into. If so, there is no cost to include it in the addressing
+/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
+/// instruction already.
+bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,
+ Value *KnownLive1,
+ Value *KnownLive2) {
+ // If Val is either of the known-live values, we know it is live!
+ if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
+ return true;
+
+ // All values other than instructions and arguments (e.g. constants) are live.
+ if (!isa<Instruction>(Val) && !isa<Argument>(Val))
+ return true;
+
+ // If Val is a constant-sized alloca in the entry block, it is live; this is
+ // true because it is just a reference to the stack/frame pointer, which is
+ // live for the whole function.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
+ if (AI->isStaticAlloca())
+ return true;
+
+ // Check to see if this value is already used in the memory instruction's
+ // block. If so, it's already live into the block at the very least, so we
+ // can reasonably fold it.
+ return Val->isUsedInBasicBlock(MemoryInst->getParent());
+}
+
+/// It is possible for the addressing mode of the machine to fold the specified
+/// instruction into a load or store that ultimately uses it.
+/// However, the specified instruction has multiple uses.
+/// Given this, it may actually increase register pressure to fold it
+/// into the load. For example, consider this code:
+///
+/// X = ...
+/// Y = X+1
+/// use(Y) -> nonload/store
+/// Z = Y+1
+/// load Z
+///
+/// In this case, Y has multiple uses, and can be folded into the load of Z
+/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
+/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
+/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
+/// number of computations either.
+///
+/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
+/// X was live across 'load Z' for other reasons, we actually *would* want to
+/// fold the addressing mode in the Z case. This would make Y die earlier.
+bool AddressingModeMatcher::isProfitableToFoldIntoAddressingMode(
+ Instruction *I, ExtAddrMode &AMBefore, ExtAddrMode &AMAfter) {
+ if (IgnoreProfitability)
+ return true;
+
+ // AMBefore is the addressing mode before this instruction was folded into it,
+ // and AMAfter is the addressing mode after the instruction was folded. Get
+ // the set of registers referenced by AMAfter and subtract out those
+ // referenced by AMBefore: this is the set of values which folding in this
+ // address extends the lifetime of.
+ //
+ // Note that there are only two potential values being referenced here,
+ // BaseReg and ScaleReg (global addresses are always available, as are any
+ // folded immediates).
+ Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
+
+ // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
+ // lifetime wasn't extended by adding this instruction.
+ if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+ BaseReg = nullptr;
+ if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+ ScaledReg = nullptr;
+
+ // If folding this instruction (and its subexprs) didn't extend any live
+ // ranges, we're ok with it.
+ if (!BaseReg && !ScaledReg)
+ return true;
+
+ // If all uses of this instruction can have the address mode sunk into them,
+ // we can remove the addressing mode and effectively trade one live register
+ // for another (at worst). In this context, folding an addressing mode into
+ // the use is just a particularly nice way of sinking it.
+ SmallVector<std::pair<Use *, Type *>, 16> MemoryUses;
+ if (FindAllMemoryUses(I, MemoryUses, TLI, TRI, OptSize, PSI, BFI))
+ return false; // Has a non-memory, non-foldable use!
+
+ // Now that we know that all uses of this instruction are part of a chain of
+ // computation involving only operations that could theoretically be folded
+ // into a memory use, loop over each of these memory operation uses and see
+ // if they could *actually* fold the instruction. The assumption is that
+ // addressing modes are cheap and that duplicating the computation involved
+ // many times is worthwhile, even on a fastpath. For sinking candidates
+ // (i.e. cold call sites), this serves as a way to prevent excessive code
+ // growth since most architectures have some reasonably small and fast way
+ // to compute an effective address (e.g., LEA on x86).
+ SmallVector<Instruction *, 32> MatchedAddrModeInsts;
+ for (const std::pair<Use *, Type *> &Pair : MemoryUses) {
+ Value *Address = Pair.first->get();
+ Instruction *UserI = cast<Instruction>(Pair.first->getUser());
+ Type *AddressAccessTy = Pair.second;
+ unsigned AS = Address->getType()->getPointerAddressSpace();
+
+ // Do a match against the root of this address, ignoring profitability. This
+ // will tell us if the addressing mode for the memory operation will
+ // *actually* cover the shared instruction.
+ ExtAddrMode Result;
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
+ 0);
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI, LI, getDTFn,
+ AddressAccessTy, AS, UserI, Result,
+ InsertedInsts, PromotedInsts, TPT,
+ LargeOffsetGEP, OptSize, PSI, BFI);
+ Matcher.IgnoreProfitability = true;
+ bool Success = Matcher.matchAddr(Address, 0);
+ (void)Success;
+ assert(Success && "Couldn't select *anything*?");
+
+ // The match was only done to check profitability; the changes made are not
+ // part of the original matcher. Therefore, they should be dropped,
+ // otherwise the original matcher will not be left in the right state.
+ TPT.rollback(LastKnownGood);
+
+ // If the match didn't cover I, then it won't be shared by it.
+ if (!is_contained(MatchedAddrModeInsts, I))
+ return false;
+
+ MatchedAddrModeInsts.clear();
+ }
+
+ return true;
+}
+
+/// Return true if the specified value is defined in a
+/// different basic block than BB.
+static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() != BB;
+ return false;
+}
+
+/// Sink addressing mode computation immediately before MemoryInst if doing so
+/// can be done without increasing register pressure. The need for the
+/// register pressure constraint means this can end up being an all or nothing
+/// decision for all uses of the same addressing computation.
+///
+/// Load and Store Instructions often have addressing modes that can do
+/// significant amounts of computation. As such, instruction selection will try
+/// to get the load or store to do as much computation as possible for the
+/// program. The problem is that isel can only see within a single block. As
+/// such, we sink as much legal addressing mode work into the block as possible.
+///
+/// This method is used to optimize both load/store and inline asms with memory
+/// operands. It's also used to sink addressing computations feeding into cold
+/// call sites into their (cold) basic block.
+///
+/// The motivation for handling sinking into cold blocks is that doing so can
+/// both enable other address mode sinking (by satisfying the register pressure
+/// constraint above), and reduce register pressure globally (by removing the
+/// addressing mode computation from the fast path entirely).
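+/// A minimal sketch of the effect (illustrative IR): if another block
+/// computes "%a = getelementptr i8, ptr %p, i64 42" and its only use is a
+/// load here, the offset computation is re-emitted as a "sunkaddr" value
+/// immediately before the load, the load is rewritten to use it, and the
+/// original GEP is deleted if it becomes dead.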
+bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+ Type *AccessTy, unsigned AddrSpace) {
+ Value *Repl = Addr;
+
+ // Try to collapse single-value PHI nodes. This is necessary to undo
+ // unprofitable PRE transformations.
+ SmallVector<Value *, 8> worklist;
+ SmallPtrSet<Value *, 16> Visited;
+ worklist.push_back(Addr);
+
+ // Use a worklist to iteratively look through PHI and select nodes, and
+ // ensure that the addressing mode obtained from the non-PHI/select roots of
+ // the graph are compatible.
+ bool PhiOrSelectSeen = false;
+ SmallVector<Instruction *, 16> AddrModeInsts;
+ const SimplifyQuery SQ(*DL, TLInfo);
+ AddressingModeCombiner AddrModes(SQ, Addr);
+ TypePromotionTransaction TPT(RemovedInsts);
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ while (!worklist.empty()) {
+ Value *V = worklist.pop_back_val();
+
+ // We allow traversing cyclic Phi nodes.
+ // If this loop succeeds, then every path through the Phi nodes computes
+ // an address of the form
+ // BaseGV + Base + Scale * Index + Offset
+ // where Scale and Offset are constants and BaseGV, Base and Index are
+ // exactly the same Values in all cases.
+ // This means that BaseGV, Scale and Offset dominate our memory instruction
+ // and have the same values as they had in the address computation
+ // represented as a Phi, so we can safely sink the address computation to
+ // the memory instruction.
+ if (!Visited.insert(V).second)
+ continue;
+
+ // For a PHI node, push all of its incoming values.
+ if (PHINode *P = dyn_cast<PHINode>(V)) {
+ append_range(worklist, P->incoming_values());
+ PhiOrSelectSeen = true;
+ continue;
+ }
+ // Similar for select.
+ if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+ worklist.push_back(SI->getFalseValue());
+ worklist.push_back(SI->getTrueValue());
+ PhiOrSelectSeen = true;
+ continue;
+ }
+
+ // For non-PHIs, determine the addressing mode being computed. Note that
+ // the result may differ depending on what other uses our candidate
+ // addressing instructions might have.
+ AddrModeInsts.clear();
+ std::pair<AssertingVH<GetElementPtrInst>, int64_t> LargeOffsetGEP(nullptr,
+ 0);
+ // Defer the query (and possible computation) of the dom tree to the point
+ // of actual use. It's expected that most address matches don't actually
+ // need the domtree.
+ auto getDTFn = [MemoryInst, this]() -> const DominatorTree & {
+ Function *F = MemoryInst->getParent()->getParent();
+ return this->getDT(*F);
+ };
+ ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
+ V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDTFn,
+ *TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
+ BFI.get());
+
+ GetElementPtrInst *GEP = LargeOffsetGEP.first;
+ if (GEP && !NewGEPBases.count(GEP)) {
+ // If splitting the underlying data structure can reduce the offset of a
+ // GEP, collect the GEP. Skip the GEPs that are the new bases of
+ // previously split data structures.
+ LargeOffsetGEPMap[GEP->getPointerOperand()].push_back(LargeOffsetGEP);
+ LargeOffsetGEPID.insert(std::make_pair(GEP, LargeOffsetGEPID.size()));
+ }
+
+ NewAddrMode.OriginalValue = V;
+ if (!AddrModes.addNewAddrMode(NewAddrMode))
+ break;
+ }
+
+ // Try to combine the AddrModes we've collected. If we couldn't collect any,
+ // or we have multiple but either couldn't combine them or combining them
+ // wouldn't do anything useful, bail out now.
+ if (!AddrModes.combineAddrModes()) {
+ TPT.rollback(LastKnownGood);
+ return false;
+ }
+ bool Modified = TPT.commit();
+
+ // Get the combined AddrMode (or the only AddrMode, if we only had one).
+ ExtAddrMode AddrMode = AddrModes.getAddrMode();
+
+ // If all the instructions matched are already in this BB, don't do anything.
+ // If we saw a Phi node then it is definitely not local, and if we saw a
+ // select then we want to push the address calculation past it even if it's
+ // already in this BB.
+ if (!PhiOrSelectSeen && none_of(AddrModeInsts, [&](Value *V) {
+ return IsNonLocalValue(V, MemoryInst->getParent());
+ })) {
+ LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode
+ << "\n");
+ return Modified;
+ }
+
+ // Insert this computation right after this user. Since our caller is
+ // scanning from the top of the BB to the bottom, reuses of the expr are
+ // guaranteed to happen later.
+ IRBuilder<> Builder(MemoryInst);
+
+ // Now that we've determined the addressing expression we want to use and
+ // know that we have to sink it into this block, check to see if we have
+ // already done this for some other load/store instr in this block. If so,
+ // reuse the computation. Before attempting reuse, check if the address is
+ // valid, as it may have been erased.
+
+ WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];
+
+ Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
+ Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
+ if (SunkAddr) {
+ LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
+ << " for " << *MemoryInst << "\n");
+ if (SunkAddr->getType() != Addr->getType()) {
+ if (SunkAddr->getType()->getPointerAddressSpace() !=
+ Addr->getType()->getPointerAddressSpace() &&
+ !DL->isNonIntegralPointerType(Addr->getType())) {
+ // There are two reasons the address spaces might not match: a no-op
+ // addrspacecast, or a ptrtoint/inttoptr pair. Either way, we emit a
+ // ptrtoint/inttoptr pair to ensure we match the original semantics.
+ // TODO: allow bitcast between different address space pointers with the
+ // same size.
+ SunkAddr = Builder.CreatePtrToInt(SunkAddr, IntPtrTy, "sunkaddr");
+ SunkAddr =
+ Builder.CreateIntToPtr(SunkAddr, Addr->getType(), "sunkaddr");
+ } else
+ SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
+ }
+ } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
+ SubtargetInfo->addrSinkUsingGEPs())) {
+ // By default, we use the GEP-based method when AA is used later. This
+ // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
+ LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
+ << " for " << *MemoryInst << "\n");
+ Value *ResultPtr = nullptr, *ResultIndex = nullptr;
+
+ // First, find the pointer.
+ if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
+ ResultPtr = AddrMode.BaseReg;
+ AddrMode.BaseReg = nullptr;
+ }
+
+ if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
+ // We can't add more than one pointer together, nor can we scale a
+ // pointer (both of which seem meaningless).
+ if (ResultPtr || AddrMode.Scale != 1)
+ return Modified;
+
+ ResultPtr = AddrMode.ScaledReg;
+ AddrMode.Scale = 0;
+ }
+
+ // It is only safe to sign extend the BaseReg if we know that the math
+ // required to create it did not overflow before we extend it. Since
+ // the original IR value was tossed in favor of a constant back when
+ // the AddrMode was created we need to bail out gracefully if widths
+ // do not match instead of extending it.
+ //
+ // (See below for code to add the scale.)
+ if (AddrMode.Scale) {
+ Type *ScaledRegTy = AddrMode.ScaledReg->getType();
+ if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
+ cast<IntegerType>(ScaledRegTy)->getBitWidth())
+ return Modified;
+ }
+
+ if (AddrMode.BaseGV) {
+ if (ResultPtr)
+ return Modified;
+
+ ResultPtr = AddrMode.BaseGV;
+ }
+
+ // If the real base value actually came from an inttoptr, then the matcher
+ // will look through it and provide only the integer value. In that case,
+ // use it here.
+ if (!DL->isNonIntegralPointerType(Addr->getType())) {
+ if (!ResultPtr && AddrMode.BaseReg) {
+ ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
+ "sunkaddr");
+ AddrMode.BaseReg = nullptr;
+ } else if (!ResultPtr && AddrMode.Scale == 1) {
+ ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
+ "sunkaddr");
+ AddrMode.Scale = 0;
+ }
+ }
+
+ if (!ResultPtr && !AddrMode.BaseReg && !AddrMode.Scale &&
+ !AddrMode.BaseOffs) {
+ SunkAddr = Constant::getNullValue(Addr->getType());
+ } else if (!ResultPtr) {
+ return Modified;
+ } else {
+ Type *I8PtrTy =
+ Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
+ Type *I8Ty = Builder.getInt8Ty();
+
+ // Start with the base register. Do this first so that subsequent address
+ // matching finds it last, which will prevent it from trying to match it
+ // as the scaled value in case it happens to be a mul. That would be
+ // problematic if we've sunk a different mul for the scale, because then
+ // we'd end up sinking both muls.
+ if (AddrMode.BaseReg) {
+ Value *V = AddrMode.BaseReg;
+ if (V->getType() != IntPtrTy)
+ V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
+
+ ResultIndex = V;
+ }
+
+ // Add the scale value.
+ if (AddrMode.Scale) {
+ Value *V = AddrMode.ScaledReg;
+ if (V->getType() == IntPtrTy) {
+ // done.
+ } else {
+ assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
+ cast<IntegerType>(V->getType())->getBitWidth() &&
+ "We can't transform if ScaledReg is too narrow");
+ V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
+ }
+
+ if (AddrMode.Scale != 1)
+ V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
+ "sunkaddr");
+ if (ResultIndex)
+ ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
+ else
+ ResultIndex = V;
+ }
+
+ // Add in the Base Offset if present.
+ if (AddrMode.BaseOffs) {
+ Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+ if (ResultIndex) {
+ // We need to add this separately from the scale above to help with
+ // SDAG consecutive load/store merging.
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
+ ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex,
+ "sunkaddr", AddrMode.InBounds);
+ }
+
+ ResultIndex = V;
+ }
+
+ if (!ResultIndex) {
+ SunkAddr = ResultPtr;
+ } else {
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
+ SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr",
+ AddrMode.InBounds);
+ }
+
+ if (SunkAddr->getType() != Addr->getType()) {
+ if (SunkAddr->getType()->getPointerAddressSpace() !=
+ Addr->getType()->getPointerAddressSpace() &&
+ !DL->isNonIntegralPointerType(Addr->getType())) {
+ // There are two reasons the address spaces might not match: a no-op
+ // addrspacecast, or a ptrtoint/inttoptr pair. Either way, we emit a
+ // ptrtoint/inttoptr pair to ensure we match the original semantics.
+ // TODO: allow bitcast between different address space pointers with
+ // the same size.
+ SunkAddr = Builder.CreatePtrToInt(SunkAddr, IntPtrTy, "sunkaddr");
+ SunkAddr =
+ Builder.CreateIntToPtr(SunkAddr, Addr->getType(), "sunkaddr");
+ } else
+ SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
+ }
+ }
+ } else {
+ // We'd require a ptrtoint/inttoptr down the line, which we can't do for
+ // non-integral pointers, so in that case bail out now.
+ Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
+ Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
+ PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
+ PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
+ if (DL->isNonIntegralPointerType(Addr->getType()) ||
+ (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||
+ (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
+ (AddrMode.BaseGV &&
+ DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
+ return Modified;
+
+ LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
+ << " for " << *MemoryInst << "\n");
+ Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
+ Value *Result = nullptr;
+
+ // Start with the base register. Do this first so that subsequent address
+ // matching finds it last, which will prevent it from trying to match it
+ // as the scaled value in case it happens to be a mul. That would be
+ // problematic if we've sunk a different mul for the scale, because then
+ // we'd end up sinking both muls.
+ if (AddrMode.BaseReg) {
+ Value *V = AddrMode.BaseReg;
+ if (V->getType()->isPointerTy())
+ V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
+ if (V->getType() != IntPtrTy)
+ V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
+ Result = V;
+ }
+
+ // Add the scale value.
+ if (AddrMode.Scale) {
+ Value *V = AddrMode.ScaledReg;
+ if (V->getType() == IntPtrTy) {
+ // done.
+ } else if (V->getType()->isPointerTy()) {
+ V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
+ } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+ cast<IntegerType>(V->getType())->getBitWidth()) {
+ V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
+ } else {
+ // It is only safe to sign extend the BaseReg if we know that the math
+ // required to create it did not overflow before we extend it. Since
+ // the original IR value was tossed in favor of a constant back when
+ // the AddrMode was created we need to bail out gracefully if widths
+ // do not match instead of extending it.
+ Instruction *I = dyn_cast_or_null<Instruction>(Result);
+ if (I && (Result != AddrMode.BaseReg))
+ I->eraseFromParent();
+ return Modified;
+ }
+ if (AddrMode.Scale != 1)
+ V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
+ "sunkaddr");
+ if (Result)
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
+ else
+ Result = V;
+ }
+
+ // Add in the BaseGV if present.
+ if (AddrMode.BaseGV) {
+ Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
+ if (Result)
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
+ else
+ Result = V;
+ }
+
+ // Add in the Base Offset if present.
+ if (AddrMode.BaseOffs) {
+ Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+ if (Result)
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
+ else
+ Result = V;
+ }
+
+ if (!Result)
+ SunkAddr = Constant::getNullValue(Addr->getType());
+ else
+ SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
+ }
+
+ MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
+ // Store the newly computed address into the cache. In the case we reused a
+ // value, this should be idempotent.
+ SunkAddrs[Addr] = WeakTrackingVH(SunkAddr);
+
+ // If we have no uses, recursively delete the value and all dead instructions
+ // using it.
+ if (Repl->use_empty()) {
+ resetIteratorIfInvalidatedWhileCalling(CurInstIterator->getParent(), [&]() {
+ RecursivelyDeleteTriviallyDeadInstructions(
+ Repl, TLInfo, nullptr,
+ [&](Value *V) { removeAllAssertingVHReferences(V); });
+ });
+ }
+ ++NumMemoryInsts;
+ return true;
+}
+
+/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
+/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
+/// only handle a 2 operand GEP in the same basic block or a splat constant
+/// vector. The 2 operands to the GEP must have a scalar pointer and a vector
+/// index.
+///
+/// If the existing GEP has a vector base pointer that is splat, we can look
+/// through the splat to find the scalar pointer. If we can't find a scalar
+/// pointer there's nothing we can do.
+///
+/// If we have a GEP with more than 2 indices where the middle indices are all
+/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
+///
+/// If the final index isn't a vector or is a splat, we can emit a scalar GEP
+/// followed by a GEP with an all zeroes vector index. This will enable
+/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
+/// zero index.
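+/// For example (an illustrative sketch), a gather whose pointer operand is
+/// "getelementptr [4 x i32], ptr %base, i64 0, <8 x i64> %idx" can be split
+/// into a scalar GEP over the leading zero indices followed by a two-operand
+/// GEP with the vector index, giving SelectionDAGBuilder a uniform scalar
+/// base.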
+bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
+ Value *Ptr) {
+ Value *NewAddr;
+
+ if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ // Don't optimize GEPs that don't have indices.
+ if (!GEP->hasIndices())
+ return false;
+
+ // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
+ // FIXME: We should support this by sinking the GEP.
+ if (MemoryInst->getParent() != GEP->getParent())
+ return false;
+
+ SmallVector<Value *, 2> Ops(GEP->operands());
+
+ bool RewriteGEP = false;
+
+ if (Ops[0]->getType()->isVectorTy()) {
+ Ops[0] = getSplatValue(Ops[0]);
+ if (!Ops[0])
+ return false;
+ RewriteGEP = true;
+ }
+
+ unsigned FinalIndex = Ops.size() - 1;
+
+ // Ensure all but the last index is 0.
+ // FIXME: This isn't strictly required. All that's required is that they are
+ // all scalars or splats.
+ for (unsigned i = 1; i < FinalIndex; ++i) {
+ auto *C = dyn_cast<Constant>(Ops[i]);
+ if (!C)
+ return false;
+ if (isa<VectorType>(C->getType()))
+ C = C->getSplatValue();
+ auto *CI = dyn_cast_or_null<ConstantInt>(C);
+ if (!CI || !CI->isZero())
+ return false;
+ // Scalarize the index if needed.
+ Ops[i] = CI;
+ }
+
+ // Try to scalarize the final index.
+ if (Ops[FinalIndex]->getType()->isVectorTy()) {
+ if (Value *V = getSplatValue(Ops[FinalIndex])) {
+ auto *C = dyn_cast<ConstantInt>(V);
+ // Don't scalarize an all-zeros vector.
+ if (!C || !C->isZero()) {
+ Ops[FinalIndex] = V;
+ RewriteGEP = true;
+ }
+ }
+ }
+
+ // If we made any changes or we have extra operands, we need to generate
+ // new instructions.
+ if (!RewriteGEP && Ops.size() == 2)
+ return false;
+
+ auto NumElts = cast<VectorType>(Ptr->getType())->getElementCount();
+
+ IRBuilder<> Builder(MemoryInst);
+
+ Type *SourceTy = GEP->getSourceElementType();
+ Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
+
+ // If the final index isn't a vector, emit a scalar GEP containing all ops
+ // and a vector GEP with all zeroes final index.
+ if (!Ops[FinalIndex]->getType()->isVectorTy()) {
+ NewAddr = Builder.CreateGEP(SourceTy, Ops[0], ArrayRef(Ops).drop_front());
+ auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts);
+ auto *SecondTy = GetElementPtrInst::getIndexedType(
+ SourceTy, ArrayRef(Ops).drop_front());
+ NewAddr =
+ Builder.CreateGEP(SecondTy, NewAddr, Constant::getNullValue(IndexTy));
+ } else {
+ Value *Base = Ops[0];
+ Value *Index = Ops[FinalIndex];
+
+ // Create a scalar GEP if there are more than 2 operands.
+ if (Ops.size() != 2) {
+ // Replace the last index with 0.
+ Ops[FinalIndex] =
+ Constant::getNullValue(Ops[FinalIndex]->getType()->getScalarType());
+ Base = Builder.CreateGEP(SourceTy, Base, ArrayRef(Ops).drop_front());
+ SourceTy = GetElementPtrInst::getIndexedType(
+ SourceTy, ArrayRef(Ops).drop_front());
+ }
+
+ // Now create the GEP with scalar pointer and vector index.
+ NewAddr = Builder.CreateGEP(SourceTy, Base, Index);
+ }
+ } else if (!isa<Constant>(Ptr)) {
+ // Not a GEP, but maybe it's a splat and we can create a GEP to enable
+ // SelectionDAGBuilder to use it as a uniform base.
+ Value *V = getSplatValue(Ptr);
+ if (!V)
+ return false;
+
+ auto NumElts = cast<VectorType>(Ptr->getType())->getElementCount();
+
+ IRBuilder<> Builder(MemoryInst);
+
+ // Emit a vector GEP with a scalar pointer and all 0s vector index.
+ Type *ScalarIndexTy = DL->getIndexType(V->getType()->getScalarType());
+ auto *IndexTy = VectorType::get(ScalarIndexTy, NumElts);
+ Type *ScalarTy;
+ if (cast<IntrinsicInst>(MemoryInst)->getIntrinsicID() ==
+ Intrinsic::masked_gather) {
+ ScalarTy = MemoryInst->getType()->getScalarType();
+ } else {
+ assert(cast<IntrinsicInst>(MemoryInst)->getIntrinsicID() ==
+ Intrinsic::masked_scatter);
+ ScalarTy = MemoryInst->getOperand(0)->getType()->getScalarType();
+ }
+ NewAddr = Builder.CreateGEP(ScalarTy, V, Constant::getNullValue(IndexTy));
+ } else {
+ // Constant; SelectionDAGBuilder knows to check if it's a splat.
+ return false;
+ }
+
+ MemoryInst->replaceUsesOfWith(Ptr, NewAddr);
+
+ // If we have no uses, recursively delete the value and all dead instructions
+ // using it.
+ if (Ptr->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(
+ Ptr, TLInfo, nullptr,
+ [&](Value *V) { removeAllAssertingVHReferences(V); });
+
+ return true;
+}
+
+/// If there are any memory operands, use OptimizeMemoryInst to sink their
+/// address computing into the block when possible / profitable.
+bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
+ bool MadeChange = false;
+
+ const TargetRegisterInfo *TRI =
+ TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI->ParseConstraints(*DL, TRI, *CS);
+ unsigned ArgNo = 0;
+ for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) {
+ // Compute the constraint code and ConstraintType to use.
+ TLI->ComputeConstraintToUse(OpInfo, SDValue());
+
+ // TODO: Also handle C_Address?
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+ OpInfo.isIndirect) {
+ Value *OpVal = CS->getArgOperand(ArgNo++);
+ MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
+ } else if (OpInfo.Type == InlineAsm::isInput)
+ ArgNo++;
+ }
+
+ return MadeChange;
+}
+
+/// Check if all the uses of \p Val are equivalent (or free) zero or
+/// sign extensions.
+static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
+ assert(!Val->use_empty() && "Input must have at least one use");
+ const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
+ bool IsSExt = isa<SExtInst>(FirstUser);
+ Type *ExtTy = FirstUser->getType();
+ for (const User *U : Val->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
+ return false;
+ Type *CurTy = UI->getType();
+ // Same input and output types: Same instruction after CSE.
+ if (CurTy == ExtTy)
+ continue;
+
+ // If IsSExt is true, we are in this situation:
+ // a = Val
+ // b = sext ty1 a to ty2
+ // c = sext ty1 a to ty3
+ // Assuming ty2 is shorter than ty3, this could be turned into:
+ // a = Val
+ // b = sext ty1 a to ty2
+ // c = sext ty2 b to ty3
+ // However, the last sext is not free.
+ if (IsSExt)
+ return false;
+
+ // This is a ZExt; maybe it is free to extend from one type to another.
+ // In that case, we would not account for a different use.
+ Type *NarrowTy;
+ Type *LargeTy;
+ if (ExtTy->getScalarType()->getIntegerBitWidth() >
+ CurTy->getScalarType()->getIntegerBitWidth()) {
+ NarrowTy = CurTy;
+ LargeTy = ExtTy;
+ } else {
+ NarrowTy = ExtTy;
+ LargeTy = CurTy;
+ }
+
+ if (!TLI.isZExtFree(NarrowTy, LargeTy))
+ return false;
+ }
+ // All uses are the same or can be derived from one another for free.
+ return true;
+}
+
+/// Try to speculatively promote extensions in \p Exts and continue
+/// promoting through newly promoted operands recursively as far as doing so is
+/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
+/// When some promotion happened, \p TPT contains the proper state to revert
+/// them.
+///
+/// \return true if some promotion happened, false otherwise.
+bool CodeGenPrepare::tryToPromoteExts(
+ TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
+ SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
+ unsigned CreatedInstsCost) {
+ bool Promoted = false;
+
+ // Iterate over all the extensions to try to promote them.
+ for (auto *I : Exts) {
+ // Early check if we directly have ext(load).
+ if (isa<LoadInst>(I->getOperand(0))) {
+ ProfitablyMovedExts.push_back(I);
+ continue;
+ }
+
+ // Check whether or not we want to do any promotion. The reason we have
+ // this check inside the for loop is to catch the case where an extension
+ // is directly fed by a load, because in such a case the extension can be
+ // moved up without any promotion on its operands.
+ if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion)
+ return false;
+
+ // Get the action to perform the promotion.
+ TypePromotionHelper::Action TPH =
+ TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
+ // Check if we can promote.
+ if (!TPH) {
+ // Save the current extension as we cannot move up through its operand.
+ ProfitablyMovedExts.push_back(I);
+ continue;
+ }
+
+ // Save the current state.
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ SmallVector<Instruction *, 4> NewExts;
+ unsigned NewCreatedInstsCost = 0;
+ unsigned ExtCost = !TLI->isExtFree(I);
+ // Promote.
+ Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
+ &NewExts, nullptr, *TLI);
+ assert(PromotedVal &&
+ "TypePromotionHelper should have filtered out those cases");
+
+ // We would be able to merge only one extension into a load.
+ // Therefore, if we have more than 1 new extension we heuristically
+ // cut this search path, because it means we degrade the code quality.
+ // With exactly 2, the transformation is neutral, because we will merge
+ // one extension but leave one. However, we optimistically keep going,
+ // because the new extension may be removed too.
+ long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
+ // FIXME: It would be possible to propagate a negative value instead of
+ // conservatively ceiling it to 0.
+ TotalCreatedInstsCost =
+ std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
+ if (!StressExtLdPromotion &&
+ (TotalCreatedInstsCost > 1 ||
+ !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
+ // This promotion is not profitable, rollback to the previous state, and
+ // save the current extension in ProfitablyMovedExts as the latest
+ // speculative promotion turned out to be unprofitable.
+ TPT.rollback(LastKnownGood);
+ ProfitablyMovedExts.push_back(I);
+ continue;
+ }
+ // Continue promoting NewExts as far as doing so is profitable.
+ SmallVector<Instruction *, 2> NewlyMovedExts;
+ (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
+ bool NewPromoted = false;
+ for (auto *ExtInst : NewlyMovedExts) {
+ Instruction *MovedExt = cast<Instruction>(ExtInst);
+ Value *ExtOperand = MovedExt->getOperand(0);
+ // If we have reached to a load, we need this extra profitability check
+ // as it could potentially be merged into an ext(load).
+ if (isa<LoadInst>(ExtOperand) &&
+ !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
+ (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
+ continue;
+
+ ProfitablyMovedExts.push_back(MovedExt);
+ NewPromoted = true;
+ }
+
+ // If none of speculative promotions for NewExts is profitable, rollback
+ // and save the current extension (I) as the last profitable extension.
+ if (!NewPromoted) {
+ TPT.rollback(LastKnownGood);
+ ProfitablyMovedExts.push_back(I);
+ continue;
+ }
+ // The promotion is profitable.
+ Promoted = true;
+ }
+ return Promoted;
+}
+
+/// Merge redundant sexts when one dominates the other.
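+/// For example (illustrative): if two copies of "sext i32 %x to i64" were
+/// recorded for %x and one dominates the other, the dominated copy is
+/// replaced by the dominating one and removed.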
+bool CodeGenPrepare::mergeSExts(Function &F) {
+ bool Changed = false;
+ for (auto &Entry : ValToSExtendedUses) {
+ SExts &Insts = Entry.second;
+ SExts CurPts;
+ for (Instruction *Inst : Insts) {
+ if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
+ Inst->getOperand(0) != Entry.first)
+ continue;
+ bool inserted = false;
+ for (auto &Pt : CurPts) {
+ if (getDT(F).dominates(Inst, Pt)) {
+ replaceAllUsesWith(Pt, Inst, FreshBBs, IsHugeFunc);
+ RemovedInsts.insert(Pt);
+ Pt->removeFromParent();
+ Pt = Inst;
+ inserted = true;
+ Changed = true;
+ break;
+ }
+ if (!getDT(F).dominates(Pt, Inst))
+ // Give up if we need to merge in a common dominator as the
+ // experiments show it is not profitable.
+ continue;
+ replaceAllUsesWith(Inst, Pt, FreshBBs, IsHugeFunc);
+ RemovedInsts.insert(Inst);
+ Inst->removeFromParent();
+ inserted = true;
+ Changed = true;
+ break;
+ }
+ if (!inserted)
+ CurPts.push_back(Inst);
+ }
+ }
+ return Changed;
+}
+
+// Split large data structures so that the GEPs accessing them can have
+// smaller offsets, allowing them to be sunk to the same blocks as their users.
+// For example, a large struct starting from %base is split into two parts
+// where the second part starts from %new_base.
+//
+// Before:
+// BB0:
+// %base =
+//
+// BB1:
+// %gep0 = gep %base, off0
+// %gep1 = gep %base, off1
+// %gep2 = gep %base, off2
+//
+// BB2:
+// %load1 = load %gep0
+// %load2 = load %gep1
+// %load3 = load %gep2
+//
+// After:
+// BB0:
+// %base =
+// %new_base = gep %base, off0
+//
+// BB1:
+// %new_gep0 = %new_base
+// %new_gep1 = gep %new_base, off1 - off0
+// %new_gep2 = gep %new_base, off2 - off0
+//
+// BB2:
+// %load1 = load i32, i32* %new_gep0
+// %load2 = load i32, i32* %new_gep1
+// %load3 = load i32, i32* %new_gep2
+//
+// %new_gep1 and %new_gep2 can be sunk to BB2 now after the splitting because
+// their offsets are small enough to fit into the addressing mode.
+bool CodeGenPrepare::splitLargeGEPOffsets() {
+ bool Changed = false;
+ for (auto &Entry : LargeOffsetGEPMap) {
+ Value *OldBase = Entry.first;
+ SmallVectorImpl<std::pair<AssertingVH<GetElementPtrInst>, int64_t>>
+ &LargeOffsetGEPs = Entry.second;
+ auto compareGEPOffset =
+ [&](const std::pair<GetElementPtrInst *, int64_t> &LHS,
+ const std::pair<GetElementPtrInst *, int64_t> &RHS) {
+ if (LHS.first == RHS.first)
+ return false;
+ if (LHS.second != RHS.second)
+ return LHS.second < RHS.second;
+ return LargeOffsetGEPID[LHS.first] < LargeOffsetGEPID[RHS.first];
+ };
+ // Sort all the GEPs of the same data structure based on the offsets.
+ llvm::sort(LargeOffsetGEPs, compareGEPOffset);
+ LargeOffsetGEPs.erase(
+ std::unique(LargeOffsetGEPs.begin(), LargeOffsetGEPs.end()),
+ LargeOffsetGEPs.end());
+ // Skip if all the GEPs have the same offsets.
+ if (LargeOffsetGEPs.front().second == LargeOffsetGEPs.back().second)
+ continue;
+ GetElementPtrInst *BaseGEP = LargeOffsetGEPs.begin()->first;
+ int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
+ Value *NewBaseGEP = nullptr;
+
+ auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
+ while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
+ GetElementPtrInst *GEP = LargeOffsetGEP->first;
+ int64_t Offset = LargeOffsetGEP->second;
+ if (Offset != BaseOffset) {
+ TargetLowering::AddrMode AddrMode;
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseOffs = Offset - BaseOffset;
+ // The result type of the GEP might not be the type of the memory
+ // access.
+ if (!TLI->isLegalAddressingMode(*DL, AddrMode,
+ GEP->getResultElementType(),
+ GEP->getAddressSpace())) {
+ // We need to create a new base if the offset to the current base is
+ // too large to fit into the addressing mode. So, a very large struct
+ // may be split into several parts.
+ BaseGEP = GEP;
+ BaseOffset = Offset;
+ NewBaseGEP = nullptr;
+ }
+ }
+
+ // Generate a new GEP to replace the current one.
+ LLVMContext &Ctx = GEP->getContext();
+ Type *PtrIdxTy = DL->getIndexType(GEP->getType());
+ Type *I8PtrTy =
+ Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace());
+ Type *I8Ty = Type::getInt8Ty(Ctx);
+
+ if (!NewBaseGEP) {
+ // Create a new base if we don't have one yet. Find the insertion
+ // point for the new base first.
+ BasicBlock::iterator NewBaseInsertPt;
+ BasicBlock *NewBaseInsertBB;
+ if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
+ // If the base of the struct is an instruction, the new base will be
+ // inserted close to it.
+ NewBaseInsertBB = BaseI->getParent();
+ if (isa<PHINode>(BaseI))
+ NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+ else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
+ NewBaseInsertBB =
+ SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
+ NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+ } else
+ NewBaseInsertPt = std::next(BaseI->getIterator());
+ } else {
+ // If the current base is an argument or global value, the new base
+ // will be inserted to the entry block.
+ NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
+ NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+ }
+ IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
+ // Create a new base.
+ Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
+ NewBaseGEP = OldBase;
+ if (NewBaseGEP->getType() != I8PtrTy)
+ NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
+ NewBaseGEP =
+ NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
+ NewGEPBases.insert(NewBaseGEP);
+ }
+
+ IRBuilder<> Builder(GEP);
+ Value *NewGEP = NewBaseGEP;
+ if (Offset == BaseOffset) {
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+ } else {
+ // Calculate the new offset for the new GEP.
+ Value *Index = ConstantInt::get(PtrIdxTy, Offset - BaseOffset);
+ NewGEP = Builder.CreateGEP(I8Ty, NewBaseGEP, Index);
+
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = Builder.CreatePointerCast(NewGEP, GEP->getType());
+ }
+ replaceAllUsesWith(GEP, NewGEP, FreshBBs, IsHugeFunc);
+ LargeOffsetGEPID.erase(GEP);
+ LargeOffsetGEP = LargeOffsetGEPs.erase(LargeOffsetGEP);
+ GEP->eraseFromParent();
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+bool CodeGenPrepare::optimizePhiType(
+ PHINode *I, SmallPtrSetImpl<PHINode *> &Visited,
+ SmallPtrSetImpl<Instruction *> &DeletedInstrs) {
+ // We are looking for a collection of interconnected phi nodes that together
+ // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts
+ // are of the same type. Convert the whole set of nodes to the type of the
+ // bitcast.
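+ // For instance (illustrative IR): an i32 phi whose incoming values are
+ // "bitcast float ... to i32" and whose only uses are "bitcast ... to float"
+ // can be rewritten as a float phi, removing the bitcasts, provided the
+ // target's shouldConvertPhiType hook allows it.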
+ Type *PhiTy = I->getType();
+ Type *ConvertTy = nullptr;
+ if (Visited.count(I) ||
+ (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy()))
+ return false;
+
+ SmallVector<Instruction *, 4> Worklist;
+ Worklist.push_back(cast<Instruction>(I));
+ SmallPtrSet<PHINode *, 4> PhiNodes;
+ SmallPtrSet<ConstantData *, 4> Constants;
+ PhiNodes.insert(I);
+ Visited.insert(I);
+ SmallPtrSet<Instruction *, 4> Defs;
+ SmallPtrSet<Instruction *, 4> Uses;
+ // This works by adding extra bitcasts between loads/stores and removing
+ // existing bitcasts. If we have a phi(bitcast(load)) or a store(bitcast(phi)),
+ // we can get into the situation where we remove a bitcast in one iteration
+ // just to add it again in the next. We need to ensure that at least one
+ // bitcast we remove is anchored to something that will not change back.
+ bool AnyAnchored = false;
+
+ while (!Worklist.empty()) {
+ Instruction *II = Worklist.pop_back_val();
+
+ if (auto *Phi = dyn_cast<PHINode>(II)) {
+ // Handle Defs, which might also be PHIs.
+ for (Value *V : Phi->incoming_values()) {
+ if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+ if (!PhiNodes.count(OpPhi)) {
+ if (!Visited.insert(OpPhi).second)
+ return false;
+ PhiNodes.insert(OpPhi);
+ Worklist.push_back(OpPhi);
+ }
+ } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) {
+ if (!OpLoad->isSimple())
+ return false;
+ if (Defs.insert(OpLoad).second)
+ Worklist.push_back(OpLoad);
+ } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) {
+ if (Defs.insert(OpEx).second)
+ Worklist.push_back(OpEx);
+ } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
+ if (!ConvertTy)
+ ConvertTy = OpBC->getOperand(0)->getType();
+ if (OpBC->getOperand(0)->getType() != ConvertTy)
+ return false;
+ if (Defs.insert(OpBC).second) {
+ Worklist.push_back(OpBC);
+ AnyAnchored |= !isa<LoadInst>(OpBC->getOperand(0)) &&
+ !isa<ExtractElementInst>(OpBC->getOperand(0));
+ }
+ } else if (auto *OpC = dyn_cast<ConstantData>(V))
+ Constants.insert(OpC);
+ else
+ return false;
+ }
+ }
+
+ // Handle uses, which might also be PHIs.
+ for (User *V : II->users()) {
+ if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+ if (!PhiNodes.count(OpPhi)) {
+ if (Visited.count(OpPhi))
+ return false;
+ PhiNodes.insert(OpPhi);
+ Visited.insert(OpPhi);
+ Worklist.push_back(OpPhi);
+ }
+ } else if (auto *OpStore = dyn_cast<StoreInst>(V)) {
+ if (!OpStore->isSimple() || OpStore->getOperand(0) != II)
+ return false;
+ Uses.insert(OpStore);
+ } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
+ if (!ConvertTy)
+ ConvertTy = OpBC->getType();
+ if (OpBC->getType() != ConvertTy)
+ return false;
+ Uses.insert(OpBC);
+ AnyAnchored |=
+ any_of(OpBC->users(), [](User *U) { return !isa<StoreInst>(U); });
+ } else {
+ return false;
+ }
+ }
+ }
+
+ if (!ConvertTy || !AnyAnchored ||
+ !TLI->shouldConvertPhiType(PhiTy, ConvertTy))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to "
+ << *ConvertTy << "\n");
+
+ // Create all the new phi nodes of the new type, and bitcast any loads to the
+ // correct type.
+ ValueToValueMap ValMap;
+ for (ConstantData *C : Constants)
+ ValMap[C] = ConstantExpr::getCast(Instruction::BitCast, C, ConvertTy);
+ for (Instruction *D : Defs) {
+ if (isa<BitCastInst>(D)) {
+ ValMap[D] = D->getOperand(0);
+ DeletedInstrs.insert(D);
+ } else {
+ ValMap[D] =
+ new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode());
+ }
+ }
+ for (PHINode *Phi : PhiNodes)
+ ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(),
+ Phi->getName() + ".tc", Phi);
+ // Pipe together all the PhiNodes.
+ for (PHINode *Phi : PhiNodes) {
+ PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
+ for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++)
+ NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)],
+ Phi->getIncomingBlock(i));
+ Visited.insert(NewPhi);
+ }
+ // And finally pipe up the stores and bitcasts
+ for (Instruction *U : Uses) {
+ if (isa<BitCastInst>(U)) {
+ DeletedInstrs.insert(U);
+ replaceAllUsesWith(U, ValMap[U->getOperand(0)], FreshBBs, IsHugeFunc);
+ } else {
+ U->setOperand(0,
+ new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
+ }
+ }
+
+ // Save the removed phis to be deleted later.
+ for (PHINode *Phi : PhiNodes)
+ DeletedInstrs.insert(Phi);
+ return true;
+}
+
+bool CodeGenPrepare::optimizePhiTypes(Function &F) {
+ if (!OptimizePhiTypes)
+ return false;
+
+ bool Changed = false;
+ SmallPtrSet<PHINode *, 4> Visited;
+ SmallPtrSet<Instruction *, 4> DeletedInstrs;
+
+ // Attempt to optimize all the phis in the functions to the correct type.
+ for (auto &BB : F)
+ for (auto &Phi : BB.phis())
+ Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs);
+
+ // Remove any old phis that have been converted.
+ for (auto *I : DeletedInstrs) {
+ replaceAllUsesWith(I, PoisonValue::get(I->getType()), FreshBBs, IsHugeFunc);
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+/// Return true, if an ext(load) can be formed from an extension in
+/// \p MovedExts.
+bool CodeGenPrepare::canFormExtLd(
+ const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
+ Instruction *&Inst, bool HasPromoted) {
+ for (auto *MovedExtInst : MovedExts) {
+ if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
+ LI = cast<LoadInst>(MovedExtInst->getOperand(0));
+ Inst = MovedExtInst;
+ break;
+ }
+ }
+ if (!LI)
+ return false;
+
+ // If they're already in the same block, there's nothing to do.
+ // Make the cheap checks first if we did not promote.
+ // If we promoted, we need to check if it is indeed profitable.
+ if (!HasPromoted && LI->getParent() == Inst->getParent())
+ return false;
+
+ return TLI->isExtLoad(LI, Inst, *DL);
+}
+
+/// Move a zext or sext fed by a load into the same basic block as the load,
+/// unless conditions are unfavorable. This allows SelectionDAG to fold the
+/// extend into the load.
+///
+/// E.g.,
+/// \code
+/// %ld = load i32* %addr
+/// %add = add nuw i32 %ld, 4
+/// %zext = zext i32 %add to i64
+/// \endcode
+/// =>
+/// \code
+/// %ld = load i32* %addr
+/// %zext = zext i32 %ld to i64
+/// %add = add nuw i64 %zext, 4
+/// \endcode
+/// Note that the promotion of %add to i64 is done in tryToPromoteExts(), which
+/// allows us to match zext(load i32*) to i64.
+///
+/// Also, try to promote the computations used to obtain a sign extended
+/// value used in memory accesses.
+/// E.g.,
+/// \code
+/// a = add nsw i32 b, 3
+/// d = sext i32 a to i64
+/// e = getelementptr ..., i64 d
+/// \endcode
+/// =>
+/// \code
+/// f = sext i32 b to i64
+/// a = add nsw i64 f, 3
+/// e = getelementptr ..., i64 a
+/// \endcode
+///
+/// \p Inst[in/out] the extension may be modified during the process if some
+/// promotions apply.
+bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
+ bool AllowPromotionWithoutCommonHeader = false;
+  /// See if it is an interesting sext operation for the address type
+ /// promotion before trying to promote it, e.g., the ones with the right
+ /// type and used in memory accesses.
+ bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
+ *Inst, AllowPromotionWithoutCommonHeader);
+ TypePromotionTransaction TPT(RemovedInsts);
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ SmallVector<Instruction *, 1> Exts;
+ SmallVector<Instruction *, 2> SpeculativelyMovedExts;
+ Exts.push_back(Inst);
+
+ bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
+
+ // Look for a load being extended.
+ LoadInst *LI = nullptr;
+ Instruction *ExtFedByLoad;
+
+  // Try to promote a chain of computation if it allows forming an extended
+  // load.
+ if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
+ assert(LI && ExtFedByLoad && "Expect a valid load and extension");
+ TPT.commit();
+ // Move the extend into the same block as the load.
+ ExtFedByLoad->moveAfter(LI);
+ ++NumExtsMoved;
+ Inst = ExtFedByLoad;
+ return true;
+ }
+
+  // Continue promoting SExts if the target reported this one as worth
+  // considering for address type promotion.
+ if (ATPConsiderable &&
+ performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
+ HasPromoted, TPT, SpeculativelyMovedExts))
+ return true;
+
+ TPT.rollback(LastKnownGood);
+ return false;
+}
+
+// Perform address type promotion if doing so is profitable.
+// If AllowPromotionWithoutCommonHeader == false, we should find other sext
+// instructions that sign extended the same initial value. However, if
+// AllowPromotionWithoutCommonHeader == true, we expect promoting the
+// extension to be profitable on its own.
+bool CodeGenPrepare::performAddressTypePromotion(
+ Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
+ bool HasPromoted, TypePromotionTransaction &TPT,
+ SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
+ bool Promoted = false;
+ SmallPtrSet<Instruction *, 1> UnhandledExts;
+ bool AllSeenFirst = true;
+ for (auto *I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+ SeenChainsForSExt.find(HeadOfChain);
+ // If there is an unhandled SExt which has the same header, try to promote
+ // it as well.
+ if (AlreadySeen != SeenChainsForSExt.end()) {
+ if (AlreadySeen->second != nullptr)
+ UnhandledExts.insert(AlreadySeen->second);
+ AllSeenFirst = false;
+ }
+ }
+
+ if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
+ SpeculativelyMovedExts.size() == 1)) {
+ TPT.commit();
+ if (HasPromoted)
+ Promoted = true;
+ for (auto *I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ SeenChainsForSExt[HeadOfChain] = nullptr;
+ ValToSExtendedUses[HeadOfChain].push_back(I);
+ }
+    // Update Inst since a promotion happened.
+ Inst = SpeculativelyMovedExts.pop_back_val();
+ } else {
+    // This is the first chain visited from the header, keep the current chain
+    // as unhandled. Defer promoting it until we encounter another SExt
+    // chain derived from the same header.
+ for (auto *I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ SeenChainsForSExt[HeadOfChain] = Inst;
+ }
+ return false;
+ }
+
+ if (!AllSeenFirst && !UnhandledExts.empty())
+ for (auto *VisitedSExt : UnhandledExts) {
+ if (RemovedInsts.count(VisitedSExt))
+ continue;
+ TypePromotionTransaction TPT(RemovedInsts);
+ SmallVector<Instruction *, 1> Exts;
+ SmallVector<Instruction *, 2> Chains;
+ Exts.push_back(VisitedSExt);
+ bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
+ TPT.commit();
+ if (HasPromoted)
+ Promoted = true;
+ for (auto *I : Chains) {
+ Value *HeadOfChain = I->getOperand(0);
+ // Mark this as handled.
+ SeenChainsForSExt[HeadOfChain] = nullptr;
+ ValToSExtendedUses[HeadOfChain].push_back(I);
+ }
+ }
+ return Promoted;
+}
+
+bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
+ BasicBlock *DefBB = I->getParent();
+
+ // If the result of a {s|z}ext and its source are both live out, rewrite all
+ // other uses of the source with result of extension.
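+  // For illustration (sketch): with a free truncate,
+  //   %src = add i32 ...            ; in DefBB
+  //   %x = zext i32 %src to i64     ; in DefBB, used in another block
+  //   use of i32 %src               ; in another block
+  // is rewritten so the other block uses "trunc i64 %x to i32" instead of
+  // %src, leaving only %x live across the blocks.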
+ Value *Src = I->getOperand(0);
+ if (Src->hasOneUse())
+ return false;
+
+ // Only do this xform if truncating is free.
+ if (!TLI->isTruncateFree(I->getType(), Src->getType()))
+ return false;
+
+ // Only safe to perform the optimization if the source is also defined in
+ // this block.
+ if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
+ return false;
+
+ bool DefIsLiveOut = false;
+ for (User *U : I->users()) {
+ Instruction *UI = cast<Instruction>(U);
+
+ // Figure out which BB this ext is used in.
+ BasicBlock *UserBB = UI->getParent();
+ if (UserBB == DefBB)
+ continue;
+ DefIsLiveOut = true;
+ break;
+ }
+ if (!DefIsLiveOut)
+ return false;
+
+ // Make sure none of the uses are PHI nodes.
+ for (User *U : Src->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ BasicBlock *UserBB = UI->getParent();
+ if (UserBB == DefBB)
+ continue;
+ // Be conservative. We don't want this xform to end up introducing
+ // reloads just before load / store instructions.
+ if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
+ return false;
+ }
+
+  // InsertedTruncs - insert only one trunc in each block.
+ DenseMap<BasicBlock *, Instruction *> InsertedTruncs;
+
+ bool MadeChange = false;
+ for (Use &U : Src->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ // Figure out which BB this ext is used in.
+ BasicBlock *UserBB = User->getParent();
+ if (UserBB == DefBB)
+ continue;
+
+ // Both src and def are live in this block. Rewrite the use.
+ Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
+
+ if (!InsertedTrunc) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+ InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
+ InsertedInsts.insert(InsertedTrunc);
+ }
+
+ // Replace a use of the {s|z}ext source with a use of the result.
+ U = InsertedTrunc;
+ ++NumExtUses;
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+// Find loads whose uses only use some of the loaded value's bits. Add an "and"
+// just after the load if the target can fold this into one extload instruction,
+// with the hope of eliminating some of the other later "and" instructions using
+// the loaded value. "and"s that are made trivially redundant by the insertion
+// of the new "and" are removed by this function, while others (e.g. those whose
+// path from the load goes through a phi) are left for isel to potentially
+// remove.
+//
+// For example:
+//
+// b0:
+// x = load i32
+// ...
+// b1:
+// y = and x, 0xff
+// z = use y
+//
+// becomes:
+//
+// b0:
+// x = load i32
+// x' = and x, 0xff
+// ...
+// b1:
+// z = use x'
+//
+// whereas:
+//
+// b0:
+// x1 = load i32
+// ...
+// b1:
+// x2 = load i32
+// ...
+// b2:
+// x = phi x1, x2
+// y = and x, 0xff
+//
+// becomes (after a call to optimizeLoadExt for each load):
+//
+// b0:
+// x1 = load i32
+// x1' = and x1, 0xff
+// ...
+// b1:
+// x2 = load i32
+// x2' = and x2, 0xff
+// ...
+// b2:
+// x = phi x1', x2'
+// y = and x, 0xff
+bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
+ if (!Load->isSimple() || !Load->getType()->isIntOrPtrTy())
+ return false;
+
+ // Skip loads we've already transformed.
+ if (Load->hasOneUse() &&
+ InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
+ return false;
+
+ // Look at all uses of Load, looking through phis, to determine how many bits
+ // of the loaded value are needed.
+ SmallVector<Instruction *, 8> WorkList;
+ SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<Instruction *, 8> AndsToMaybeRemove;
+ for (auto *U : Load->users())
+ WorkList.push_back(cast<Instruction>(U));
+
+ EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
+ unsigned BitWidth = LoadResultVT.getSizeInBits();
+ // If the BitWidth is 0, do not try to optimize the type
+ if (BitWidth == 0)
+ return false;
+
+ APInt DemandBits(BitWidth, 0);
+ APInt WidestAndBits(BitWidth, 0);
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+
+ // Break use-def graph loops.
+ if (!Visited.insert(I).second)
+ continue;
+
+ // For a PHI node, push all of its users.
+ if (auto *Phi = dyn_cast<PHINode>(I)) {
+ for (auto *U : Phi->users())
+ WorkList.push_back(cast<Instruction>(U));
+ continue;
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::And: {
+ auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!AndC)
+ return false;
+ APInt AndBits = AndC->getValue();
+ DemandBits |= AndBits;
+ // Keep track of the widest and mask we see.
+ if (AndBits.ugt(WidestAndBits))
+ WidestAndBits = AndBits;
+ if (AndBits == WidestAndBits && I->getOperand(0) == Load)
+ AndsToMaybeRemove.push_back(I);
+ break;
+ }
+
+ case Instruction::Shl: {
+ auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!ShlC)
+ return false;
+ uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
+ DemandBits.setLowBits(BitWidth - ShiftAmt);
+ break;
+ }
+
+ case Instruction::Trunc: {
+ EVT TruncVT = TLI->getValueType(*DL, I->getType());
+ unsigned TruncBitWidth = TruncVT.getSizeInBits();
+ DemandBits.setLowBits(TruncBitWidth);
+ break;
+ }
+
+ default:
+ return false;
+ }
+ }
+
+ uint32_t ActiveBits = DemandBits.getActiveBits();
+ // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
+ // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
+ // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
+ // (and (load x) 1) is not matched as a single instruction, rather as a LDR
+ // followed by an AND.
+ // TODO: Look into removing this restriction by fixing backends to either
+ // return false for isLoadExtLegal for i1 or have them select this pattern to
+ // a single instruction.
+ //
+ // Also avoid hoisting if we didn't see any ands with the exact DemandBits
+ // mask, since these are the only ands that will be removed by isel.
+ if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
+ WidestAndBits != DemandBits)
+ return false;
+
+ LLVMContext &Ctx = Load->getType()->getContext();
+ Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
+ EVT TruncVT = TLI->getValueType(*DL, TruncTy);
+
+ // Reject cases that won't be matched as extloads.
+ if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
+ !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
+ return false;
+
+ IRBuilder<> Builder(Load->getNextNode());
+ auto *NewAnd = cast<Instruction>(
+ Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+ // Mark this instruction as "inserted by CGP", so that other
+ // optimizations don't touch it.
+ InsertedInsts.insert(NewAnd);
+
+ // Replace all uses of load with new and (except for the use of load in the
+ // new and itself).
+ replaceAllUsesWith(Load, NewAnd, FreshBBs, IsHugeFunc);
+ NewAnd->setOperand(0, Load);
+
+ // Remove any and instructions that are now redundant.
+ for (auto *And : AndsToMaybeRemove)
+ // Check that the and mask is the same as the one we decided to put on the
+ // new and.
+ if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
+ replaceAllUsesWith(And, NewAnd, FreshBBs, IsHugeFunc);
+ if (&*CurInstIterator == And)
+ CurInstIterator = std::next(And->getIterator());
+ And->eraseFromParent();
+ ++NumAndUses;
+ }
+
+ ++NumAndsAdded;
+ return true;
+}
+
+/// Check if V (an operand of a select instruction) is an expensive instruction
+/// that is only used once.
+static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ // If it's safe to speculatively execute, then it should not have side
+ // effects; therefore, it's safe to sink and possibly *not* execute.
+ return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
+ TTI->isExpensiveToSpeculativelyExecute(I);
+}
+
+/// Returns true if a SelectInst should be turned into an explicit branch.
+static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
+ const TargetLowering *TLI,
+ SelectInst *SI) {
+ // If even a predictable select is cheap, then a branch can't be cheaper.
+ if (!TLI->isPredictableSelectExpensive())
+ return false;
+
+ // FIXME: This should use the same heuristics as IfConversion to determine
+ // whether a select is better represented as a branch.
+
+ // If metadata tells us that the select condition is obviously predictable,
+ // then we want to replace the select with a branch.
+ uint64_t TrueWeight, FalseWeight;
+ if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
+ uint64_t Max = std::max(TrueWeight, FalseWeight);
+ uint64_t Sum = TrueWeight + FalseWeight;
+ if (Sum != 0) {
+ auto Probability = BranchProbability::getBranchProbability(Max, Sum);
+ if (Probability > TTI->getPredictableBranchThreshold())
+ return true;
+ }
+ }
+
+ CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+
+ // If a branch is predictable, an out-of-order CPU can avoid blocking on its
+ // comparison condition. If the compare has more than one use, there's
+ // probably another cmov or setcc around, so it's not worth emitting a branch.
+ if (!Cmp || !Cmp->hasOneUse())
+ return false;
+
+ // If either operand of the select is expensive and only needed on one side
+ // of the select, we should form a branch.
+ if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
+ sinkSelectOperand(TTI, SI->getFalseValue()))
+ return true;
+
+ return false;
+}
+
+/// If \p isTrue is true, return the true value of \p SI, otherwise return the
+/// false value of \p SI. If the true/false value of \p SI is defined by any
+/// select instructions in \p Selects, look through the defining select
+/// instruction until the true/false value is not defined in \p Selects.
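+///
+/// For example (sketch; values invented), with both selects in \p Selects:
+///   %s1 = select i1 %c, i32 %a, i32 %b
+///   %s2 = select i1 %c, i32 %s1, i32 %d
+/// a query for the true value of %s2 looks through %s1 and returns %a.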
+static Value *
+getTrueOrFalseValue(SelectInst *SI, bool isTrue,
+ const SmallPtrSet<const Instruction *, 2> &Selects) {
+ Value *V = nullptr;
+
+ for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
+ DefSI = dyn_cast<SelectInst>(V)) {
+ assert(DefSI->getCondition() == SI->getCondition() &&
+ "The condition of DefSI does not match with SI");
+ V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+ }
+
+ assert(V && "Failed to get select true/false value");
+ return V;
+}
+
+bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
+ assert(Shift->isShift() && "Expected a shift");
+
+ // If this is (1) a vector shift, (2) shifts by scalars are cheaper than
+ // general vector shifts, and (3) the shift amount is a select-of-splatted
+ // values, hoist the shifts before the select:
+ // shift Op0, (select Cond, TVal, FVal) -->
+ // select Cond, (shift Op0, TVal), (shift Op0, FVal)
+ //
+ // This is inverting a generic IR transform when we know that the cost of a
+ // general vector shift is more than the cost of 2 shift-by-scalars.
+ // We can't do this effectively in SDAG because we may not be able to
+ // determine if the select operands are splats from within a basic block.
+ Type *Ty = Shift->getType();
+ if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
+ return false;
+ Value *Cond, *TVal, *FVal;
+ if (!match(Shift->getOperand(1),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+ return false;
+ if (!isSplatValue(TVal) || !isSplatValue(FVal))
+ return false;
+
+ IRBuilder<> Builder(Shift);
+ BinaryOperator::BinaryOps Opcode = Shift->getOpcode();
+ Value *NewTVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), TVal);
+ Value *NewFVal = Builder.CreateBinOp(Opcode, Shift->getOperand(0), FVal);
+ Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
+ replaceAllUsesWith(Shift, NewSel, FreshBBs, IsHugeFunc);
+ Shift->eraseFromParent();
+ return true;
+}
+
+bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
+ Intrinsic::ID Opcode = Fsh->getIntrinsicID();
+ assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&
+ "Expected a funnel shift");
+
+ // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper
+ // than general vector shifts, and (3) the shift amount is select-of-splatted
+ // values, hoist the funnel shifts before the select:
+ // fsh Op0, Op1, (select Cond, TVal, FVal) -->
+ // select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
+ //
+ // This is inverting a generic IR transform when we know that the cost of a
+ // general vector shift is more than the cost of 2 shift-by-scalars.
+ // We can't do this effectively in SDAG because we may not be able to
+ // determine if the select operands are splats from within a basic block.
+ Type *Ty = Fsh->getType();
+ if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
+ return false;
+ Value *Cond, *TVal, *FVal;
+ if (!match(Fsh->getOperand(2),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+ return false;
+ if (!isSplatValue(TVal) || !isSplatValue(FVal))
+ return false;
+
+ IRBuilder<> Builder(Fsh);
+ Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1);
+ Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, {X, Y, TVal});
+ Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, {X, Y, FVal});
+ Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
+ replaceAllUsesWith(Fsh, NewSel, FreshBBs, IsHugeFunc);
+ Fsh->eraseFromParent();
+ return true;
+}
+
+/// If we have a SelectInst that will likely profit from branch prediction,
+/// turn it into a branch.
+bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
+ if (DisableSelectToBranch)
+ return false;
+
+ // If the SelectOptimize pass is enabled, selects have already been optimized.
+ if (!getCGPassBuilderOption().DisableSelectOptimize)
+ return false;
+
+ // Find all consecutive select instructions that share the same condition.
+ SmallVector<SelectInst *, 2> ASI;
+ ASI.push_back(SI);
+ for (BasicBlock::iterator It = ++BasicBlock::iterator(SI);
+ It != SI->getParent()->end(); ++It) {
+ SelectInst *I = dyn_cast<SelectInst>(&*It);
+ if (I && SI->getCondition() == I->getCondition()) {
+ ASI.push_back(I);
+ } else {
+ break;
+ }
+ }
+
+ SelectInst *LastSI = ASI.back();
+  // Increment the current iterator to skip the rest of the select
+  // instructions, because they will either all be lowered to branches or none
+  // of them will be.
+ CurInstIterator = std::next(LastSI->getIterator());
+
+ bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
+
+  // Can we convert the 'select' to control flow?
+ if (VectorCond || SI->getMetadata(LLVMContext::MD_unpredictable))
+ return false;
+
+ TargetLowering::SelectSupportKind SelectKind;
+ if (SI->getType()->isVectorTy())
+ SelectKind = TargetLowering::ScalarCondVectorVal;
+ else
+ SelectKind = TargetLowering::ScalarValSelect;
+
+ if (TLI->isSelectSupported(SelectKind) &&
+ (!isFormingBranchFromSelectProfitable(TTI, TLI, SI) || OptSize ||
+ llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())))
+ return false;
+
+ // The DominatorTree needs to be rebuilt by any consumers after this
+ // transformation. We simply reset here rather than setting the ModifiedDT
+ // flag to avoid restarting the function walk in runOnFunction for each
+ // select optimized.
+ DT.reset();
+
+ // Transform a sequence like this:
+ // start:
+ // %cmp = cmp uge i32 %a, %b
+ // %sel = select i1 %cmp, i32 %c, i32 %d
+ //
+ // Into:
+ // start:
+ // %cmp = cmp uge i32 %a, %b
+ // %cmp.frozen = freeze %cmp
+ // br i1 %cmp.frozen, label %select.true, label %select.false
+ // select.true:
+ // br label %select.end
+ // select.false:
+ // br label %select.end
+ // select.end:
+ // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
+ //
+ // %cmp should be frozen, otherwise it may introduce undefined behavior.
+ // In addition, we may sink instructions that produce %c or %d from
+ // the entry block into the destination(s) of the new branch.
+ // If the true or false blocks do not contain a sunken instruction, that
+ // block and its branch may be optimized away. In that case, one side of the
+ // first branch will point directly to select.end, and the corresponding PHI
+ // predecessor block will be the start block.
+
+ // Collect values that go on the true side and the values that go on the false
+ // side.
+ SmallVector<Instruction *> TrueInstrs, FalseInstrs;
+ for (SelectInst *SI : ASI) {
+ if (Value *V = SI->getTrueValue(); sinkSelectOperand(TTI, V))
+ TrueInstrs.push_back(cast<Instruction>(V));
+ if (Value *V = SI->getFalseValue(); sinkSelectOperand(TTI, V))
+ FalseInstrs.push_back(cast<Instruction>(V));
+ }
+
+ // Split the select block, according to how many (if any) values go on each
+ // side.
+ BasicBlock *StartBlock = SI->getParent();
+ BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
+
+ IRBuilder<> IB(SI);
+ auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
+
+ BasicBlock *TrueBlock = nullptr;
+ BasicBlock *FalseBlock = nullptr;
+ BasicBlock *EndBlock = nullptr;
+ BranchInst *TrueBranch = nullptr;
+ BranchInst *FalseBranch = nullptr;
+ if (TrueInstrs.size() == 0) {
+ FalseBranch = cast<BranchInst>(SplitBlockAndInsertIfElse(
+ CondFr, &*SplitPt, false, nullptr, nullptr, LI));
+ FalseBlock = FalseBranch->getParent();
+ EndBlock = cast<BasicBlock>(FalseBranch->getOperand(0));
+ } else if (FalseInstrs.size() == 0) {
+ TrueBranch = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ CondFr, &*SplitPt, false, nullptr, nullptr, LI));
+ TrueBlock = TrueBranch->getParent();
+ EndBlock = cast<BasicBlock>(TrueBranch->getOperand(0));
+ } else {
+ Instruction *ThenTerm = nullptr;
+ Instruction *ElseTerm = nullptr;
+ SplitBlockAndInsertIfThenElse(CondFr, &*SplitPt, &ThenTerm, &ElseTerm,
+ nullptr, nullptr, LI);
+ TrueBranch = cast<BranchInst>(ThenTerm);
+ FalseBranch = cast<BranchInst>(ElseTerm);
+ TrueBlock = TrueBranch->getParent();
+ FalseBlock = FalseBranch->getParent();
+ EndBlock = cast<BasicBlock>(TrueBranch->getOperand(0));
+ }
+
+ EndBlock->setName("select.end");
+ if (TrueBlock)
+ TrueBlock->setName("select.true.sink");
+ if (FalseBlock)
+ FalseBlock->setName(FalseInstrs.size() == 0 ? "select.false"
+ : "select.false.sink");
+
+ if (IsHugeFunc) {
+ if (TrueBlock)
+ FreshBBs.insert(TrueBlock);
+ if (FalseBlock)
+ FreshBBs.insert(FalseBlock);
+ FreshBBs.insert(EndBlock);
+ }
+
+ BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
+
+ static const unsigned MD[] = {
+ LLVMContext::MD_prof, LLVMContext::MD_unpredictable,
+ LLVMContext::MD_make_implicit, LLVMContext::MD_dbg};
+ StartBlock->getTerminator()->copyMetadata(*SI, MD);
+
+ // Sink expensive instructions into the conditional blocks to avoid executing
+ // them speculatively.
+ for (Instruction *I : TrueInstrs)
+ I->moveBefore(TrueBranch);
+ for (Instruction *I : FalseInstrs)
+ I->moveBefore(FalseBranch);
+
+ // If we did not create a new block for one of the 'true' or 'false' paths
+ // of the condition, it means that side of the branch goes to the end block
+ // directly and the path originates from the start block from the point of
+ // view of the new PHI.
+ if (TrueBlock == nullptr)
+ TrueBlock = StartBlock;
+ else if (FalseBlock == nullptr)
+ FalseBlock = StartBlock;
+
+ SmallPtrSet<const Instruction *, 2> INS;
+ INS.insert(ASI.begin(), ASI.end());
+  // Use a reverse iterator because a later select may use the value of an
+  // earlier select, and we need to propagate the value through the earlier
+  // select to get the PHI operand.
+ for (SelectInst *SI : llvm::reverse(ASI)) {
+ // The select itself is replaced with a PHI Node.
+ PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
+ PN->takeName(SI);
+ PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
+ PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
+ PN->setDebugLoc(SI->getDebugLoc());
+
+ replaceAllUsesWith(SI, PN, FreshBBs, IsHugeFunc);
+ SI->eraseFromParent();
+ INS.erase(SI);
+ ++NumSelectsExpanded;
+ }
+
+ // Instruct OptimizeBlock to skip to the next block.
+ CurInstIterator = StartBlock->end();
+ return true;
+}
+
+/// Some targets only accept certain types for splat inputs. For example a VDUP
+/// in MVE takes a GPR (integer) register, and instructions that incorporate a
+/// VDUP (such as a VADD qd, qm, rm) also require a GPR register.
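+///
+/// As a rough sketch of the rewrite below (the actual scalar type comes from
+/// the target's shouldConvertSplatType), a splat such as
+///   %i = insertelement <4 x float> poison, float %v, i64 0
+///   %s = shufflevector <4 x float> %i, <4 x float> poison, zeroinitializer
+/// may be rebuilt as a bitcast of %v to i32, an i32 splat, and a bitcast of
+/// the splat back to <4 x float>.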
+bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+ // Accept shuf(insertelem(undef/poison, val, 0), undef/poison, <0,0,..>) only
+ if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_ZeroMask())))
+ return false;
+ Type *NewType = TLI->shouldConvertSplatType(SVI);
+ if (!NewType)
+ return false;
+
+ auto *SVIVecType = cast<FixedVectorType>(SVI->getType());
+ assert(!NewType->isVectorTy() && "Expected a scalar type!");
+ assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&
+ "Expected a type of the same size!");
+ auto *NewVecType =
+ FixedVectorType::get(NewType, SVIVecType->getNumElements());
+
+ // Create a bitcast (shuffle (insert (bitcast(..))))
+ IRBuilder<> Builder(SVI->getContext());
+ Builder.SetInsertPoint(SVI);
+ Value *BC1 = Builder.CreateBitCast(
+ cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
+ Value *Shuffle = Builder.CreateVectorSplat(NewVecType->getNumElements(), BC1);
+ Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
+
+ replaceAllUsesWith(SVI, BC2, FreshBBs, IsHugeFunc);
+ RecursivelyDeleteTriviallyDeadInstructions(
+ SVI, TLInfo, nullptr,
+ [&](Value *V) { removeAllAssertingVHReferences(V); });
+
+  // Also hoist the bitcast up to its operand if they are not in the same
+  // block.
+ if (auto *BCI = dyn_cast<Instruction>(BC1))
+ if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))
+ if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&
+ !Op->isTerminator() && !Op->isEHPad())
+ BCI->moveAfter(Op);
+
+ return true;
+}
+
+bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
+ // If the operands of I can be folded into a target instruction together with
+ // I, duplicate and sink them.
+ SmallVector<Use *, 4> OpsToSink;
+ if (!TLI->shouldSinkOperands(I, OpsToSink))
+ return false;
+
+ // OpsToSink can contain multiple uses in a use chain (e.g.
+ // (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
+ // uses must come first, so we process the ops in reverse order so as to not
+ // create invalid IR.
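+  // E.g. (sketch, assuming the target reports both uses as sinkable), for
+  //   %s = shufflevector ...   ; in another block
+  //   %z = zext ... %s ...     ; in another block
+  //   %a = add ... %z          ; I, in TargetBB
+  // the zext is cloned into TargetBB first, then the shufflevector, and the
+  // cloned zext is rewired to use the cloned shufflevector.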
+ BasicBlock *TargetBB = I->getParent();
+ bool Changed = false;
+ SmallVector<Use *, 4> ToReplace;
+ Instruction *InsertPoint = I;
+ DenseMap<const Instruction *, unsigned long> InstOrdering;
+ unsigned long InstNumber = 0;
+ for (const auto &I : *TargetBB)
+ InstOrdering[&I] = InstNumber++;
+
+ for (Use *U : reverse(OpsToSink)) {
+ auto *UI = cast<Instruction>(U->get());
+ if (isa<PHINode>(UI))
+ continue;
+ if (UI->getParent() == TargetBB) {
+ if (InstOrdering[UI] < InstOrdering[InsertPoint])
+ InsertPoint = UI;
+ continue;
+ }
+ ToReplace.push_back(U);
+ }
+
+ SetVector<Instruction *> MaybeDead;
+ DenseMap<Instruction *, Instruction *> NewInstructions;
+ for (Use *U : ToReplace) {
+ auto *UI = cast<Instruction>(U->get());
+ Instruction *NI = UI->clone();
+
+ if (IsHugeFunc) {
+      // Since we are cloning an instruction, the defs of its operands may now
+      // sink into this BB, so put those defs' BBs into FreshBBs for later
+      // optimization.
+ for (unsigned I = 0; I < NI->getNumOperands(); ++I) {
+ auto *OpDef = dyn_cast<Instruction>(NI->getOperand(I));
+ if (!OpDef)
+ continue;
+ FreshBBs.insert(OpDef->getParent());
+ }
+ }
+
+ NewInstructions[UI] = NI;
+ MaybeDead.insert(UI);
+ LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
+ NI->insertBefore(InsertPoint);
+ InsertPoint = NI;
+ InsertedInsts.insert(NI);
+
+ // Update the use for the new instruction, making sure that we update the
+ // sunk instruction uses, if it is part of a chain that has already been
+ // sunk.
+ Instruction *OldI = cast<Instruction>(U->getUser());
+ if (NewInstructions.count(OldI))
+ NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);
+ else
+ U->set(NI);
+ Changed = true;
+ }
+
+ // Remove instructions that are dead after sinking.
+ for (auto *I : MaybeDead) {
+ if (!I->hasNUsesOrMore(1)) {
+ LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");
+ I->eraseFromParent();
+ }
+ }
+
+ return Changed;
+}
+
+bool CodeGenPrepare::optimizeSwitchType(SwitchInst *SI) {
+ Value *Cond = SI->getCondition();
+ Type *OldType = Cond->getType();
+ LLVMContext &Context = Cond->getContext();
+ EVT OldVT = TLI->getValueType(*DL, OldType);
+ MVT RegType = TLI->getPreferredSwitchConditionType(Context, OldVT);
+ unsigned RegWidth = RegType.getSizeInBits();
+
+ if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
+ return false;
+
+ // If the register width is greater than the type width, expand the condition
+ // of the switch instruction and each case constant to the width of the
+ // register. By widening the type of the switch condition, subsequent
+ // comparisons (for case comparisons) will not need to be extended to the
+ // preferred register width, so we will potentially eliminate N-1 extends,
+ // where N is the number of cases in the switch.
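+  // E.g. (illustrative; assuming a 32-bit preferred condition type), a
+  // "switch i8 %c" with cases i8 1 and i8 2 becomes "switch i32 %c.ext" with
+  // cases i32 1 and i32 2, where %c.ext = zext i8 %c to i32.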
+ auto *NewType = Type::getIntNTy(Context, RegWidth);
+
+ // Extend the switch condition and case constants using the target preferred
+ // extend unless the switch condition is a function argument with an extend
+ // attribute. In that case, we can avoid an unnecessary mask/extension by
+ // matching the argument extension instead.
+ Instruction::CastOps ExtType = Instruction::ZExt;
+ // Some targets prefer SExt over ZExt.
+ if (TLI->isSExtCheaperThanZExt(OldVT, RegType))
+ ExtType = Instruction::SExt;
+
+ if (auto *Arg = dyn_cast<Argument>(Cond)) {
+ if (Arg->hasSExtAttr())
+ ExtType = Instruction::SExt;
+ if (Arg->hasZExtAttr())
+ ExtType = Instruction::ZExt;
+ }
+
+ auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
+ ExtInst->insertBefore(SI);
+ ExtInst->setDebugLoc(SI->getDebugLoc());
+ SI->setCondition(ExtInst);
+ for (auto Case : SI->cases()) {
+ const APInt &NarrowConst = Case.getCaseValue()->getValue();
+ APInt WideConst = (ExtType == Instruction::ZExt)
+ ? NarrowConst.zext(RegWidth)
+ : NarrowConst.sext(RegWidth);
+ Case.setValue(ConstantInt::get(Context, WideConst));
+ }
+
+ return true;
+}
+
+bool CodeGenPrepare::optimizeSwitchPhiConstants(SwitchInst *SI) {
+ // The SCCP optimization tends to produce code like this:
+ // switch(x) { case 42: phi(42, ...) }
+  // Materializing the constant for the phi-argument needs instructions, so we
+  // change the code to:
+ // switch(x) { case 42: phi(x, ...) }
+
+ Value *Condition = SI->getCondition();
+ // Avoid endless loop in degenerate case.
+ if (isa<ConstantInt>(*Condition))
+ return false;
+
+ bool Changed = false;
+ BasicBlock *SwitchBB = SI->getParent();
+ Type *ConditionType = Condition->getType();
+
+ for (const SwitchInst::CaseHandle &Case : SI->cases()) {
+ ConstantInt *CaseValue = Case.getCaseValue();
+ BasicBlock *CaseBB = Case.getCaseSuccessor();
+ // Set to true if we previously checked that `CaseBB` is only reached by
+ // a single case from this switch.
+ bool CheckedForSinglePred = false;
+ for (PHINode &PHI : CaseBB->phis()) {
+ Type *PHIType = PHI.getType();
+ // If ZExt is free then we can also catch patterns like this:
+ // switch((i32)x) { case 42: phi((i64)42, ...); }
+ // and replace `(i64)42` with `zext i32 %x to i64`.
+ bool TryZExt =
+ PHIType->isIntegerTy() &&
+ PHIType->getIntegerBitWidth() > ConditionType->getIntegerBitWidth() &&
+ TLI->isZExtFree(ConditionType, PHIType);
+ if (PHIType == ConditionType || TryZExt) {
+ // Set to true to skip this case because of multiple preds.
+ bool SkipCase = false;
+ Value *Replacement = nullptr;
+ for (unsigned I = 0, E = PHI.getNumIncomingValues(); I != E; I++) {
+ Value *PHIValue = PHI.getIncomingValue(I);
+ if (PHIValue != CaseValue) {
+ if (!TryZExt)
+ continue;
+ ConstantInt *PHIValueInt = dyn_cast<ConstantInt>(PHIValue);
+ if (!PHIValueInt ||
+ PHIValueInt->getValue() !=
+ CaseValue->getValue().zext(PHIType->getIntegerBitWidth()))
+ continue;
+ }
+ if (PHI.getIncomingBlock(I) != SwitchBB)
+ continue;
+ // We cannot optimize if there are multiple case labels jumping to
+ // this block. This check may get expensive when there are many
+ // case labels so we test for it last.
+ if (!CheckedForSinglePred) {
+ CheckedForSinglePred = true;
+ if (SI->findCaseDest(CaseBB) == nullptr) {
+ SkipCase = true;
+ break;
+ }
+ }
+
+ if (Replacement == nullptr) {
+ if (PHIValue == CaseValue) {
+ Replacement = Condition;
+ } else {
+ IRBuilder<> Builder(SI);
+ Replacement = Builder.CreateZExt(Condition, PHIType);
+ }
+ }
+ PHI.setIncomingValue(I, Replacement);
+ Changed = true;
+ }
+ if (SkipCase)
+ break;
+ }
+ }
+ }
+ return Changed;
+}
+
+bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
+ bool Changed = optimizeSwitchType(SI);
+ Changed |= optimizeSwitchPhiConstants(SI);
+ return Changed;
+}
+
+namespace {
+
+/// Helper class to promote a scalar operation to a vector one.
+/// This class is used to move an extractelement transition downward.
+/// E.g.,
+/// a = vector_op <2 x i32>
+/// b = extractelement <2 x i32> a, i32 0
+/// c = scalar_op b
+/// store c
+///
+/// =>
+/// a = vector_op <2 x i32>
+/// c = vector_op a (equivalent to scalar_op on the related lane)
+/// * d = extractelement <2 x i32> c, i32 0
+/// * store d
+/// Assuming both extractelement and store can be combined, we get rid of the
+/// transition.
+class VectorPromoteHelper {
+ /// DataLayout associated with the current module.
+ const DataLayout &DL;
+
+ /// Used to perform some checks on the legality of vector operations.
+ const TargetLowering &TLI;
+
+  /// Used to estimate the cost of the promoted chain.
+ const TargetTransformInfo &TTI;
+
+ /// The transition being moved downwards.
+ Instruction *Transition;
+
+ /// The sequence of instructions to be promoted.
+ SmallVector<Instruction *, 4> InstsToBePromoted;
+
+ /// Cost of combining a store and an extract.
+ unsigned StoreExtractCombineCost;
+
+ /// Instruction that will be combined with the transition.
+ Instruction *CombineInst = nullptr;
+
+ /// The instruction that represents the current end of the transition.
+ /// Since we are faking the promotion until we reach the end of the chain
+ /// of computation, we need a way to get the current end of the transition.
+ Instruction *getEndOfTransition() const {
+ if (InstsToBePromoted.empty())
+ return Transition;
+ return InstsToBePromoted.back();
+ }
+
+ /// Return the index of the original value in the transition.
+ /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
+ /// c, is at index 0.
+ unsigned getTransitionOriginalValueIdx() const {
+ assert(isa<ExtractElementInst>(Transition) &&
+ "Other kind of transitions are not supported yet");
+ return 0;
+ }
+
+ /// Return the index of the index in the transition.
+ /// E.g., for "extractelement <2 x i32> c, i32 0" the index
+ /// is at index 1.
+ unsigned getTransitionIdx() const {
+ assert(isa<ExtractElementInst>(Transition) &&
+ "Other kind of transitions are not supported yet");
+ return 1;
+ }
+
+ /// Get the type of the transition.
+ /// This is the type of the original value.
+ /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
+ /// transition is <2 x i32>.
+ Type *getTransitionType() const {
+ return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
+ }
+
+  /// Promote \p ToBePromoted by moving \p Def downward past it.
+ /// I.e., we have the following sequence:
+ /// Def = Transition <ty1> a to <ty2>
+ /// b = ToBePromoted <ty2> Def, ...
+ /// =>
+ /// b = ToBePromoted <ty1> a, ...
+ /// Def = Transition <ty1> ToBePromoted to <ty2>
+ void promoteImpl(Instruction *ToBePromoted);
+
+ /// Check whether or not it is profitable to promote all the
+ /// instructions enqueued to be promoted.
+ bool isProfitableToPromote() {
+ Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
+ unsigned Index = isa<ConstantInt>(ValIdx)
+ ? cast<ConstantInt>(ValIdx)->getZExtValue()
+ : -1;
+ Type *PromotedType = getTransitionType();
+
+ StoreInst *ST = cast<StoreInst>(CombineInst);
+ unsigned AS = ST->getPointerAddressSpace();
+ // Check if this store is supported.
+ if (!TLI.allowsMisalignedMemoryAccesses(
+ TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
+ ST->getAlign())) {
+ // If this is not supported, there is no way we can combine
+ // the extract with the store.
+ return false;
+ }
+
+ // The scalar chain of computation has to pay for the transition
+ // scalar to vector.
+ // The vector chain has to account for the combining cost.
+ enum TargetTransformInfo::TargetCostKind CostKind =
+ TargetTransformInfo::TCK_RecipThroughput;
+ InstructionCost ScalarCost =
+ TTI.getVectorInstrCost(*Transition, PromotedType, CostKind, Index);
+ InstructionCost VectorCost = StoreExtractCombineCost;
+ for (const auto &Inst : InstsToBePromoted) {
+ // Compute the cost.
+ // By construction, all instructions being promoted are arithmetic ones.
+ // Moreover, one argument is a constant that can be viewed as a splat
+ // constant.
+ Value *Arg0 = Inst->getOperand(0);
+ bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
+ isa<ConstantFP>(Arg0);
+ TargetTransformInfo::OperandValueInfo Arg0Info, Arg1Info;
+ if (IsArg0Constant)
+ Arg0Info.Kind = TargetTransformInfo::OK_UniformConstantValue;
+ else
+ Arg1Info.Kind = TargetTransformInfo::OK_UniformConstantValue;
+
+ ScalarCost += TTI.getArithmeticInstrCost(
+ Inst->getOpcode(), Inst->getType(), CostKind, Arg0Info, Arg1Info);
+ VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+ CostKind, Arg0Info, Arg1Info);
+ }
+ LLVM_DEBUG(
+ dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
+ << ScalarCost << "\nVector: " << VectorCost << '\n');
+ return ScalarCost > VectorCost;
+ }
+
+ /// Generate a constant vector with \p Val with the same
+ /// number of elements as the transition.
+ /// \p UseSplat defines whether or not \p Val should be replicated
+ /// across the whole vector.
+ /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
+ /// otherwise we generate a vector with as many undef as possible:
+ /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
+ /// used at the index of the extract.
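+  /// E.g. (sketch), for a <4 x i32> transition extracting index 1 and
+  /// \p Val == i32 7: UseSplat yields <7, 7, 7, 7>, otherwise the result is
+  /// <undef, 7, undef, undef>.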
+ Value *getConstantVector(Constant *Val, bool UseSplat) const {
+ unsigned ExtractIdx = std::numeric_limits<unsigned>::max();
+ if (!UseSplat) {
+ // If we cannot determine where the constant must be, we have to
+ // use a splat constant.
+ Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
+ if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
+ ExtractIdx = CstVal->getSExtValue();
+ else
+ UseSplat = true;
+ }
+
+ ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount();
+ if (UseSplat)
+ return ConstantVector::getSplat(EC, Val);
+
+ if (!EC.isScalable()) {
+ SmallVector<Constant *, 4> ConstVec;
+ UndefValue *UndefVal = UndefValue::get(Val->getType());
+ for (unsigned Idx = 0; Idx != EC.getKnownMinValue(); ++Idx) {
+ if (Idx == ExtractIdx)
+ ConstVec.push_back(Val);
+ else
+ ConstVec.push_back(UndefVal);
+ }
+ return ConstantVector::get(ConstVec);
+ } else
+ llvm_unreachable(
+ "Generate scalable vector for non-splat is unimplemented");
+ }
+
+ /// Check if promoting to a vector type an operand at \p OperandIdx
+ /// in \p Use can trigger undefined behavior.
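+  /// For instance (sketch), promoting the divisor of a udiv by filling the
+  /// unused lanes with undef could introduce a division by undef/zero, so
+  /// such operands must use a splat constant instead.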
+ static bool canCauseUndefinedBehavior(const Instruction *Use,
+ unsigned OperandIdx) {
+    // It is not safe to introduce undef when the operand is on the
+    // right-hand side of a division-like instruction.
+ if (OperandIdx != 1)
+ return false;
+ switch (Use->getOpcode()) {
+ default:
+ return false;
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return true;
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ return !Use->hasNoNaNs();
+ }
+ llvm_unreachable(nullptr);
+ }
+
+public:
+ VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
+ const TargetTransformInfo &TTI, Instruction *Transition,
+ unsigned CombineCost)
+ : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
+ StoreExtractCombineCost(CombineCost) {
+ assert(Transition && "Do not know how to promote null");
+ }
+
+ /// Check if we can promote \p ToBePromoted to \p Type.
+ bool canPromote(const Instruction *ToBePromoted) const {
+ // We could support CastInst too.
+ return isa<BinaryOperator>(ToBePromoted);
+ }
+
+ /// Check if it is profitable to promote \p ToBePromoted
+ /// by moving downward the transition through.
+ bool shouldPromote(const Instruction *ToBePromoted) const {
+ // Promote only if all the operands can be statically expanded.
+ // Indeed, we do not want to introduce any new kind of transitions.
+ for (const Use &U : ToBePromoted->operands()) {
+ const Value *Val = U.get();
+ if (Val == getEndOfTransition()) {
+ // If the use is a division and the transition is on the rhs,
+ // we cannot promote the operation, otherwise we may create a
+ // division by zero.
+ if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
+ return false;
+ continue;
+ }
+ if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
+ !isa<ConstantFP>(Val))
+ return false;
+ }
+ // Check that the resulting operation is legal.
+ int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
+ if (!ISDOpcode)
+ return false;
+ return StressStoreExtract ||
+ TLI.isOperationLegalOrCustom(
+ ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
+ }
+
+ /// Check whether or not \p Use can be combined
+ /// with the transition.
+ /// I.e., is it possible to do Use(Transition) => AnotherUse?
+ bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
+
+ /// Record \p ToBePromoted as part of the chain to be promoted.
+ void enqueueForPromotion(Instruction *ToBePromoted) {
+ InstsToBePromoted.push_back(ToBePromoted);
+ }
+
+ /// Set the instruction that will be combined with the transition.
+ void recordCombineInstruction(Instruction *ToBeCombined) {
+ assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
+ CombineInst = ToBeCombined;
+ }
+
+  /// Promote all the instructions enqueued for promotion if it is
+  /// profitable.
+ /// \return True if the promotion happened, false otherwise.
+ bool promote() {
+ // Check if there is something to promote.
+ // Right now, if we do not have anything to combine with,
+ // we assume the promotion is not profitable.
+ if (InstsToBePromoted.empty() || !CombineInst)
+ return false;
+
+ // Check cost.
+ if (!StressStoreExtract && !isProfitableToPromote())
+ return false;
+
+ // Promote.
+ for (auto &ToBePromoted : InstsToBePromoted)
+ promoteImpl(ToBePromoted);
+ InstsToBePromoted.clear();
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
+ // At this point, we know that all the operands of ToBePromoted but Def
+ // can be statically promoted.
+ // For Def, we need to use its parameter in ToBePromoted:
+ // b = ToBePromoted ty1 a
+ // Def = Transition ty1 b to ty2
+ // Move the transition down.
+ // 1. Replace all uses of the promoted operation by the transition.
+ // = ... b => = ... Def.
+ assert(ToBePromoted->getType() == Transition->getType() &&
+ "The type of the result of the transition does not match "
+ "the final type");
+ ToBePromoted->replaceAllUsesWith(Transition);
+ // 2. Update the type of the uses.
+ // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
+ Type *TransitionTy = getTransitionType();
+ ToBePromoted->mutateType(TransitionTy);
+ // 3. Update all the operands of the promoted operation with promoted
+ // operands.
+ // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
+ for (Use &U : ToBePromoted->operands()) {
+ Value *Val = U.get();
+ Value *NewVal = nullptr;
+ if (Val == Transition)
+ NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
+ else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
+ isa<ConstantFP>(Val)) {
+ // Use a splat constant if it is not safe to use undef.
+ NewVal = getConstantVector(
+ cast<Constant>(Val),
+ isa<UndefValue>(Val) ||
+ canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
+ } else
+ llvm_unreachable("Did you modified shouldPromote and forgot to update "
+ "this?");
+ ToBePromoted->setOperand(U.getOperandNo(), NewVal);
+ }
+ Transition->moveAfter(ToBePromoted);
+ Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
+}
+
+/// Some targets can do store(extractelement) with one instruction.
+/// Try to push the extractelement towards the stores when the target
+/// has this feature and this is profitable.
+bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
+ unsigned CombineCost = std::numeric_limits<unsigned>::max();
+ if (DisableStoreExtract ||
+ (!StressStoreExtract &&
+ !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
+ Inst->getOperand(1), CombineCost)))
+ return false;
+
+ // At this point we know that Inst is a vector to scalar transition.
+ // Try to move it down the def-use chain, until:
+ // - We can combine the transition with its single use
+ // => we got rid of the transition.
+ // - We escape the current basic block
+  //     => we would need to check that we are moving it to a cheaper place,
+  //        and we do not do that for now.
+ BasicBlock *Parent = Inst->getParent();
+ LLVM_DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
+ VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
+ // If the transition has more than one use, assume this is not going to be
+ // beneficial.
+ while (Inst->hasOneUse()) {
+ Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
+ LLVM_DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
+
+ if (ToBePromoted->getParent() != Parent) {
+ LLVM_DEBUG(dbgs() << "Instruction to promote is in a different block ("
+ << ToBePromoted->getParent()->getName()
+ << ") than the transition (" << Parent->getName()
+ << ").\n");
+ return false;
+ }
+
+ if (VPH.canCombine(ToBePromoted)) {
+ LLVM_DEBUG(dbgs() << "Assume " << *Inst << '\n'
+ << "will be combined with: " << *ToBePromoted << '\n');
+ VPH.recordCombineInstruction(ToBePromoted);
+ bool Changed = VPH.promote();
+ NumStoreExtractExposed += Changed;
+ return Changed;
+ }
+
+ LLVM_DEBUG(dbgs() << "Try promoting.\n");
+ if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
+
+ VPH.enqueueForPromotion(ToBePromoted);
+ Inst = ToBePromoted;
+ }
+ return false;
+}
+
+/// In the store sequence below, the values F and I are bundled together as an
+/// i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+/// (store (or (zext (bitcast F to i32) to i64),
+/// (shl (zext I to i64), 32)), addr) -->
+/// (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other merged stores can also be beneficial, like:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8}, i32 store --> two i16 stores.
+/// For pair of {i8, i8}, i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// These store patterns are commonly seen in the simple code snippet below
+/// when only std::make_pair(...) is SROA-transformed before being inlined
+/// into hoo.
+/// void goo(const std::pair<int, float> &);
+/// hoo() {
+/// ...
+/// goo(std::make_pair(tmp, ftmp));
+/// ...
+/// }
+///
+/// Although we already have similar splitting in DAG Combine, we duplicate
+/// it in CodeGenPrepare to catch the case in which the pattern spans
+/// multiple BBs. The logic in DAG Combine is kept to catch cases generated
+/// during code expansion.
+static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
+ const TargetLowering &TLI) {
+ // Handle simple but common cases only.
+ Type *StoreType = SI.getValueOperand()->getType();
+
+ // The code below assumes shifting a value by <number of bits>,
+ // whereas scalable vectors would have to be shifted by
+ // <2log(vscale) + number of bits> in order to store the
+ // low/high parts. Bailing out for now.
+ if (StoreType->isScalableTy())
+ return false;
+
+ if (!DL.typeSizeEqualsStoreSize(StoreType) ||
+ DL.getTypeSizeInBits(StoreType) == 0)
+ return false;
+
+ unsigned HalfValBitSize = DL.getTypeSizeInBits(StoreType) / 2;
+ Type *SplitStoreType = Type::getIntNTy(SI.getContext(), HalfValBitSize);
+ if (!DL.typeSizeEqualsStoreSize(SplitStoreType))
+ return false;
+
+ // Don't split the store if it is volatile.
+ if (SI.isVolatile())
+ return false;
+
+  // Match the following patterns:
+  // (store (or (zext LValue to i64),
+  //            (shl (zext HValue to i64), HalfValBitSize)), addr)
+  // or
+  // (store (or (shl (zext HValue to i64), HalfValBitSize),
+  //            (zext LValue to i64)), addr)
+  // Expect both operands of the OR and the first operand of the SHL to have
+  // only one use.
+ Value *LValue, *HValue;
+ if (!match(SI.getValueOperand(),
+ m_c_Or(m_OneUse(m_ZExt(m_Value(LValue))),
+ m_OneUse(m_Shl(m_OneUse(m_ZExt(m_Value(HValue))),
+ m_SpecificInt(HalfValBitSize))))))
+ return false;
+
+  // Check that LValue and HValue are integers with size less than or equal
+  // to HalfValBitSize.
+ if (!LValue->getType()->isIntegerTy() ||
+ DL.getTypeSizeInBits(LValue->getType()) > HalfValBitSize ||
+ !HValue->getType()->isIntegerTy() ||
+ DL.getTypeSizeInBits(HValue->getType()) > HalfValBitSize)
+ return false;
+
+ // If LValue/HValue is a bitcast instruction, use the EVT before bitcast
+ // as the input of target query.
+ auto *LBC = dyn_cast<BitCastInst>(LValue);
+ auto *HBC = dyn_cast<BitCastInst>(HValue);
+ EVT LowTy = LBC ? EVT::getEVT(LBC->getOperand(0)->getType())
+ : EVT::getEVT(LValue->getType());
+ EVT HighTy = HBC ? EVT::getEVT(HBC->getOperand(0)->getType())
+ : EVT::getEVT(HValue->getType());
+ if (!ForceSplitStore && !TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
+ return false;
+
+ // Start to split store.
+ IRBuilder<> Builder(SI.getContext());
+ Builder.SetInsertPoint(&SI);
+
+  // If LValue/HValue is a bitcast in another BB, create a new one in the
+  // current BB so it may be merged with the split stores by the DAG combiner.
+ if (LBC && LBC->getParent() != SI.getParent())
+ LValue = Builder.CreateBitCast(LBC->getOperand(0), LBC->getType());
+ if (HBC && HBC->getParent() != SI.getParent())
+ HValue = Builder.CreateBitCast(HBC->getOperand(0), HBC->getType());
+
+ bool IsLE = SI.getModule()->getDataLayout().isLittleEndian();
+ auto CreateSplitStore = [&](Value *V, bool Upper) {
+ V = Builder.CreateZExtOrBitCast(V, SplitStoreType);
+ Value *Addr = Builder.CreateBitCast(
+ SI.getOperand(1),
+ SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
+ Align Alignment = SI.getAlign();
+ const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
+ if (IsOffsetStore) {
+ Addr = Builder.CreateGEP(
+ SplitStoreType, Addr,
+ ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
+
+ // When splitting the store in half, naturally one half will retain the
+ // alignment of the original wider store, regardless of whether it was
+ // over-aligned or not, while the other will require adjustment.
+ Alignment = commonAlignment(Alignment, HalfValBitSize / 8);
+ }
+ Builder.CreateAlignedStore(V, Addr, Alignment);
+ };
+
+ CreateSplitStore(LValue, false);
+ CreateSplitStore(HValue, true);
+
+ // Delete the old store.
+ SI.eraseFromParent();
+ return true;
+}
+
+// Return true if the GEP has two operands, the first operand is of a sequential
+// type, and the second operand is a constant.
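+// E.g. (sketch) "getelementptr i32, ptr %p, i64 7" qualifies, whereas a GEP
+// with more than one index or a non-constant index does not.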
+static bool GEPSequentialConstIndexed(GetElementPtrInst *GEP) {
+ gep_type_iterator I = gep_type_begin(*GEP);
+ return GEP->getNumOperands() == 2 && I.isSequential() &&
+ isa<ConstantInt>(GEP->getOperand(1));
+}
+
+// Try unmerging GEPs to reduce liveness interference (register pressure) across
+// IndirectBr edges. Since IndirectBr edges tend to touch on many blocks,
+// reducing liveness interference across those edges benefits global register
+// allocation. Currently handles only certain cases.
+//
+// For example, unmerge %GEPI and %UGEPI as below.
+//
+// ---------- BEFORE ----------
+// SrcBlock:
+// ...
+// %GEPIOp = ...
+// ...
+// %GEPI = gep %GEPIOp, Idx
+// ...
+// indirectbr ... [ label %DstB0, label %DstB1, ... label %DstBi ... ]
+// (* %GEPI is alive on the indirectbr edges due to other uses ahead)
+// (* %GEPIOp is alive on the indirectbr edges only because it is used by
+//    %UGEPI)
+//
+// DstB0: ... (there may be a gep similar to %UGEPI to be unmerged)
+// DstB1: ... (there may be a gep similar to %UGEPI to be unmerged)
+// ...
+//
+// DstBi:
+// ...
+// %UGEPI = gep %GEPIOp, UIdx
+// ...
+// ---------------------------
+//
+// ---------- AFTER ----------
+// SrcBlock:
+// ... (same as above)
+// (* %GEPI is still alive on the indirectbr edges)
+// (* %GEPIOp is no longer alive on the indirectbr edges as a result of the
+// unmerging)
+// ...
+//
+// DstBi:
+// ...
+// %UGEPI = gep %GEPI, (UIdx-Idx)
+// ...
+// ---------------------------
+//
+// The register pressure on the IndirectBr edges is reduced because %GEPIOp is
+// no longer alive on them.
+//
+// We try to unmerge GEPs here in CodeGenPrepare, as opposed to limiting merging
+// of GEPs in the first place in InstCombiner::visitGetElementPtrInst(), so as
+// not to disable further simplifications and optimizations as a result of GEP
+// merging.
+//
+// Note this unmerging may increase the length of the data flow critical path
+// (the path from %GEPIOp to %UGEPI would go through %GEPI), which is a tradeoff
+// between the register pressure and the length of data-flow critical
+// path. Restricting this to the uncommon IndirectBr case would minimize the
+// impact of potentially longer critical path, if any, and the impact on compile
+// time.
+static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
+ const TargetTransformInfo *TTI) {
+ BasicBlock *SrcBlock = GEPI->getParent();
+ // Check that SrcBlock ends with an IndirectBr. If not, give up. The common
+ // (non-IndirectBr) cases exit early here.
+ if (!isa<IndirectBrInst>(SrcBlock->getTerminator()))
+ return false;
+ // Check that GEPI is a simple gep with a single constant index.
+ if (!GEPSequentialConstIndexed(GEPI))
+ return false;
+ ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
+ // Check that GEPI is a cheap one.
+ if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+ Value *GEPIOp = GEPI->getOperand(0);
+ // Check that GEPIOp is an instruction that's also defined in SrcBlock.
+ if (!isa<Instruction>(GEPIOp))
+ return false;
+ auto *GEPIOpI = cast<Instruction>(GEPIOp);
+ if (GEPIOpI->getParent() != SrcBlock)
+ return false;
+ // Check that GEP is used outside the block, meaning it's alive on the
+ // IndirectBr edge(s).
+ if (llvm::none_of(GEPI->users(), [&](User *Usr) {
+ if (auto *I = dyn_cast<Instruction>(Usr)) {
+ if (I->getParent() != SrcBlock) {
+ return true;
+ }
+ }
+ return false;
+ }))
+ return false;
+ // The second elements of the GEP chains to be unmerged.
+ std::vector<GetElementPtrInst *> UGEPIs;
+  // Check each user of GEPIOp to see whether unmerging would make GEPIOp not
+  // alive on IndirectBr edges.
+ for (User *Usr : GEPIOp->users()) {
+ if (Usr == GEPI)
+ continue;
+ // Check if Usr is an Instruction. If not, give up.
+ if (!isa<Instruction>(Usr))
+ return false;
+ auto *UI = cast<Instruction>(Usr);
+    // If Usr is in the same block as GEPIOp, that is fine; skip it.
+ if (UI->getParent() == SrcBlock)
+ continue;
+ // Check if Usr is a GEP. If not, give up.
+ if (!isa<GetElementPtrInst>(Usr))
+ return false;
+ auto *UGEPI = cast<GetElementPtrInst>(Usr);
+ // Check if UGEPI is a simple gep with a single constant index and GEPIOp is
+ // the pointer operand to it. If so, record it in the vector. If not, give
+ // up.
+ if (!GEPSequentialConstIndexed(UGEPI))
+ return false;
+ if (UGEPI->getOperand(0) != GEPIOp)
+ return false;
+ if (GEPIIdx->getType() !=
+ cast<ConstantInt>(UGEPI->getOperand(1))->getType())
+ return false;
+ ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
+ if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+ UGEPIs.push_back(UGEPI);
+ }
+ if (UGEPIs.size() == 0)
+ return false;
+ // Check the materialization cost of (UIdx-Idx).
+ for (GetElementPtrInst *UGEPI : UGEPIs) {
+ ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
+ APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
+ InstructionCost ImmCost = TTI->getIntImmCost(
+ NewIdx, GEPIIdx->getType(), TargetTransformInfo::TCK_SizeAndLatency);
+ if (ImmCost > TargetTransformInfo::TCC_Basic)
+ return false;
+ }
+ // Now unmerge between GEPI and UGEPIs.
+ for (GetElementPtrInst *UGEPI : UGEPIs) {
+ UGEPI->setOperand(0, GEPI);
+ ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
+ Constant *NewUGEPIIdx = ConstantInt::get(
+ GEPIIdx->getType(), UGEPIIdx->getValue() - GEPIIdx->getValue());
+ UGEPI->setOperand(1, NewUGEPIIdx);
+ // If GEPI is not inbounds but UGEPI is inbounds, change UGEPI to not
+ // inbounds to avoid UB.
+ if (!GEPI->isInBounds()) {
+ UGEPI->setIsInBounds(false);
+ }
+ }
+ // After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
+ // alive on IndirectBr edges).
+ assert(llvm::none_of(GEPIOp->users(),
+ [&](User *Usr) {
+ return cast<Instruction>(Usr)->getParent() != SrcBlock;
+ }) &&
+ "GEPIOp is used outside SrcBlock");
+ return true;
+}
+
+static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
+ SmallSet<BasicBlock *, 32> &FreshBBs,
+ bool IsHugeFunc) {
+ // Try and convert
+ // %c = icmp ult %x, 8
+ // br %c, bla, blb
+ // %tc = lshr %x, 3
+ // to
+ // %tc = lshr %x, 3
+ // %c = icmp eq %tc, 0
+ // br %c, bla, blb
+ // Creating the cmp to zero can be better for the backend, especially if the
+ // lshr produces flags that can be used automatically.
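+ // The equality form below is handled the same way; as a hypothetical
+ // example,
+ //   %c = icmp eq %x, 8
+ //   br %c, bla, blb
+ //   %t = sub %x, 8
+ // becomes
+ //   %t = sub %x, 8
+ //   %c = icmp eq %t, 0
+ //   br %c, bla, blb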
+ if (!TLI.preferZeroCompareBranch() || !Branch->isConditional())
+ return false;
+
+ ICmpInst *Cmp = dyn_cast<ICmpInst>(Branch->getCondition());
+ if (!Cmp || !isa<ConstantInt>(Cmp->getOperand(1)) || !Cmp->hasOneUse())
+ return false;
+
+ Value *X = Cmp->getOperand(0);
+ APInt CmpC = cast<ConstantInt>(Cmp->getOperand(1))->getValue();
+
+ for (auto *U : X->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ // A quick dominance check
+ if (!UI ||
+ (UI->getParent() != Branch->getParent() &&
+ UI->getParent() != Branch->getSuccessor(0) &&
+ UI->getParent() != Branch->getSuccessor(1)) ||
+ (UI->getParent() != Branch->getParent() &&
+ !UI->getParent()->getSinglePredecessor()))
+ continue;
+
+ if (CmpC.isPowerOf2() && Cmp->getPredicate() == ICmpInst::ICMP_ULT &&
+ match(UI, m_Shr(m_Specific(X), m_SpecificInt(CmpC.logBase2())))) {
+ IRBuilder<> Builder(Branch);
+ if (UI->getParent() != Branch->getParent())
+ UI->moveBefore(Branch);
+ Value *NewCmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, UI,
+ ConstantInt::get(UI->getType(), 0));
+ LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
+ LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n");
+ replaceAllUsesWith(Cmp, NewCmp, FreshBBs, IsHugeFunc);
+ return true;
+ }
+ if (Cmp->isEquality() &&
+ (match(UI, m_Add(m_Specific(X), m_SpecificInt(-CmpC))) ||
+ match(UI, m_Sub(m_Specific(X), m_SpecificInt(CmpC))))) {
+ IRBuilder<> Builder(Branch);
+ if (UI->getParent() != Branch->getParent())
+ UI->moveBefore(Branch);
+ Value *NewCmp = Builder.CreateCmp(Cmp->getPredicate(), UI,
+ ConstantInt::get(UI->getType(), 0));
+ LLVM_DEBUG(dbgs() << "Converting " << *Cmp << "\n");
+ LLVM_DEBUG(dbgs() << " to compare on zero: " << *NewCmp << "\n");
+ replaceAllUsesWith(Cmp, NewCmp, FreshBBs, IsHugeFunc);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
+ // Bail out if we inserted the instruction to prevent optimizations from
+ // stepping on each other's toes.
+ if (InsertedInsts.count(I))
+ return false;
+
+ // TODO: Move into the switch on opcode below here.
+ if (PHINode *P = dyn_cast<PHINode>(I)) {
+ // It is possible for very late stage optimizations (such as SimplifyCFG)
+ // to introduce PHI nodes too late to be cleaned up. If we detect such a
+ // trivial PHI, go ahead and zap it here.
+ if (Value *V = simplifyInstruction(P, {*DL, TLInfo})) {
+ LargeOffsetGEPMap.erase(P);
+ replaceAllUsesWith(P, V, FreshBBs, IsHugeFunc);
+ P->eraseFromParent();
+ ++NumPHIsElim;
+ return true;
+ }
+ return false;
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ // If the source of the cast is a constant, then this should have
+ // already been constant folded. The only reason NOT to constant fold
+ // it is if something (e.g. LSR) was careful to place the constant
+ // evaluation in a block other than the one that uses it (e.g. to hoist
+ // the address of globals out of a loop). If this is the case, we don't
+ // want to forward-subst the cast.
+ if (isa<Constant>(CI->getOperand(0)))
+ return false;
+
+ if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
+ return true;
+
+ if ((isa<UIToFPInst>(I) || isa<FPToUIInst>(I) || isa<TruncInst>(I)) &&
+ TLI->optimizeExtendOrTruncateConversion(
+ I, LI->getLoopFor(I->getParent()), *TTI))
+ return true;
+
+ if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
+ /// Sink a zext or sext into its user blocks if the target type doesn't
+ /// fit in one register
+ if (TLI->getTypeAction(CI->getContext(),
+ TLI->getValueType(*DL, CI->getType())) ==
+ TargetLowering::TypeExpandInteger) {
+ return SinkCast(CI);
+ } else {
+ if (TLI->optimizeExtendOrTruncateConversion(
+ I, LI->getLoopFor(I->getParent()), *TTI))
+ return true;
+
+ bool MadeChange = optimizeExt(I);
+ return MadeChange | optimizeExtUses(I);
+ }
+ }
+ return false;
+ }
+
+ if (auto *Cmp = dyn_cast<CmpInst>(I))
+ if (optimizeCmp(Cmp, ModifiedDT))
+ return true;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
+ bool Modified = optimizeLoadExt(LI);
+ unsigned AS = LI->getPointerAddressSpace();
+ Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+ return Modified;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (splitMergedValStore(*SI, *DL, *TLI))
+ return true;
+ SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
+ unsigned AS = SI->getPointerAddressSpace();
+ return optimizeMemoryInst(I, SI->getOperand(1),
+ SI->getOperand(0)->getType(), AS);
+ }
+
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ unsigned AS = RMW->getPointerAddressSpace();
+ return optimizeMemoryInst(I, RMW->getPointerOperand(), RMW->getType(), AS);
+ }
+
+ if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
+ unsigned AS = CmpX->getPointerAddressSpace();
+ return optimizeMemoryInst(I, CmpX->getPointerOperand(),
+ CmpX->getCompareOperand()->getType(), AS);
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
+
+ if (BinOp && BinOp->getOpcode() == Instruction::And && EnableAndCmpSinking &&
+ sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts))
+ return true;
+
+ // TODO: Move this into the switch on opcode - it handles shifts already.
+ if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
+ BinOp->getOpcode() == Instruction::LShr)) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (CI && TLI->hasExtractBitsInsn())
+ if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
+ return true;
+ }
+
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ if (GEPI->hasAllZeroIndices()) {
+ /// The GEP operand must be a pointer, so must its result -> BitCast
+ Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
+ GEPI->getName(), GEPI);
+ NC->setDebugLoc(GEPI->getDebugLoc());
+ replaceAllUsesWith(GEPI, NC, FreshBBs, IsHugeFunc);
+ RecursivelyDeleteTriviallyDeadInstructions(
+ GEPI, TLInfo, nullptr,
+ [&](Value *V) { removeAllAssertingVHReferences(V); });
+ ++NumGEPsElim;
+ optimizeInst(NC, ModifiedDT);
+ return true;
+ }
+ if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
+ return true;
+ }
+ return false;
+ }
+
+ if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
+ // freeze(icmp a, const) -> icmp (freeze a), const
+ // This helps generate efficient conditional jumps.
+ Instruction *CmpI = nullptr;
+ if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0)))
+ CmpI = II;
+ else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0)))
+ CmpI = F->getFastMathFlags().none() ? F : nullptr;
+
+ if (CmpI && CmpI->hasOneUse()) {
+ auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1);
+ bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) ||
+ isa<ConstantPointerNull>(Op0);
+ bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) ||
+ isa<ConstantPointerNull>(Op1);
+ if (Const0 || Const1) {
+ if (!Const0 || !Const1) {
+ auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
+ F->takeName(FI);
+ CmpI->setOperand(Const0 ? 1 : 0, F);
+ }
+ replaceAllUsesWith(FI, CmpI, FreshBBs, IsHugeFunc);
+ FI->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+ }
+
+ if (tryToSinkFreeOperands(I))
+ return true;
+
+ switch (I->getOpcode()) {
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return optimizeShiftInst(cast<BinaryOperator>(I));
+ case Instruction::Call:
+ return optimizeCallInst(cast<CallInst>(I), ModifiedDT);
+ case Instruction::Select:
+ return optimizeSelectInst(cast<SelectInst>(I));
+ case Instruction::ShuffleVector:
+ return optimizeShuffleVectorInst(cast<ShuffleVectorInst>(I));
+ case Instruction::Switch:
+ return optimizeSwitchInst(cast<SwitchInst>(I));
+ case Instruction::ExtractElement:
+ return optimizeExtractElementInst(cast<ExtractElementInst>(I));
+ case Instruction::Br:
+ return optimizeBranch(cast<BranchInst>(I), *TLI, FreshBBs, IsHugeFunc);
+ }
+
+ return false;
+}
+
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
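+///
+/// As a schematic illustration (whether this exact shape is matched and is
+/// profitable depends on the target's legal types), a hand-written 4-bit
+/// reversal such as
+///   %b0 = shl i4 %x, 3        ; bit 0 -> bit 3
+///   %t1 = and i4 %x, 2
+///   %b1 = shl i4 %t1, 1       ; bit 1 -> bit 2
+///   %t2 = lshr i4 %x, 1
+///   %b2 = and i4 %t2, 2       ; bit 2 -> bit 1
+///   %b3 = lshr i4 %x, 3       ; bit 3 -> bit 0
+///   %lo = or i4 %b0, %b1
+///   %hi = or i4 %b2, %b3
+///   %r  = or i4 %lo, %hi
+/// is equivalent to a single @llvm.bitreverse.i4(%x) call.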
+bool CodeGenPrepare::makeBitReverse(Instruction &I) {
+ if (!I.getType()->isIntegerTy() ||
+ !TLI->isOperationLegalOrCustom(ISD::BITREVERSE,
+ TLI->getValueType(*DL, I.getType(), true)))
+ return false;
+
+ SmallVector<Instruction *, 4> Insts;
+ if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
+ return false;
+ Instruction *LastInst = Insts.back();
+ replaceAllUsesWith(&I, LastInst, FreshBBs, IsHugeFunc);
+ RecursivelyDeleteTriviallyDeadInstructions(
+ &I, TLInfo, nullptr,
+ [&](Value *V) { removeAllAssertingVHReferences(V); });
+ return true;
+}
+
+// In this pass we look for GEP and cast instructions that are used
+// across basic blocks and rewrite them to improve basic-block-at-a-time
+// selection.
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT) {
+ SunkAddrs.clear();
+ bool MadeChange = false;
+
+ do {
+ CurInstIterator = BB.begin();
+ ModifiedDT = ModifyDT::NotModifyDT;
+ while (CurInstIterator != BB.end()) {
+ MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
+ if (ModifiedDT != ModifyDT::NotModifyDT) {
+ // For a huge function we tend to quickly go through the inner optimization
+ // opportunities in the BB. So we go back to the BB head to re-optimize
+ // each instruction instead of going back to the function head.
+ if (IsHugeFunc) {
+ DT.reset();
+ getDT(*BB.getParent());
+ break;
+ } else {
+ return true;
+ }
+ }
+ }
+ } while (ModifiedDT == ModifyDT::ModifyInstDT);
+
+ bool MadeBitReverse = true;
+ while (MadeBitReverse) {
+ MadeBitReverse = false;
+ for (auto &I : reverse(BB)) {
+ if (makeBitReverse(I)) {
+ MadeBitReverse = MadeChange = true;
+ break;
+ }
+ }
+ }
+ MadeChange |= dupRetToEnableTailCallOpts(&BB, ModifiedDT);
+
+ return MadeChange;
+}
+
+// Some CGP optimizations may move or alter what's computed in a block. Check
+// whether a dbg.value intrinsic could be pointed at a more appropriate operand.
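+// For illustration (the names are hypothetical): if an address computation
+// %addr was sunk into this block as %sunkaddr by optimizeMemoryInst, then
+//   call void @llvm.dbg.value(metadata ptr %addr, ...)
+// is retargeted to use %sunkaddr as its location operand instead of %addr.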
+bool CodeGenPrepare::fixupDbgValue(Instruction *I) {
+ assert(isa<DbgValueInst>(I));
+ DbgValueInst &DVI = *cast<DbgValueInst>(I);
+
+ // Does this dbg.value refer to a sunk address calculation?
+ bool AnyChange = false;
+ SmallDenseSet<Value *> LocationOps(DVI.location_ops().begin(),
+ DVI.location_ops().end());
+ for (Value *Location : LocationOps) {
+ WeakTrackingVH SunkAddrVH = SunkAddrs[Location];
+ Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
+ if (SunkAddr) {
+ // Point dbg.value at locally computed address, which should give the best
+ // opportunity to be accurately lowered. This update may change the type
+ // of pointer being referred to; however this makes no difference to
+ // debugging information, and we can't generate bitcasts that may affect
+ // codegen.
+ DVI.replaceVariableLocationOp(Location, SunkAddr);
+ AnyChange = true;
+ }
+ }
+ return AnyChange;
+}
+
+// An llvm.dbg.value may be using a value before its definition, due to
+// optimizations in this pass and others. Scan for such dbg.values, and rescue
+// them by moving the dbg.value to immediately after the value definition.
+// FIXME: Ideally this should never be necessary, and this has the potential
+// to re-order dbg.value intrinsics.
+bool CodeGenPrepare::placeDbgValues(Function &F) {
+ bool MadeChange = false;
+ DominatorTree DT(F);
+
+ for (BasicBlock &BB : F) {
+ for (Instruction &Insn : llvm::make_early_inc_range(BB)) {
+ DbgValueInst *DVI = dyn_cast<DbgValueInst>(&Insn);
+ if (!DVI)
+ continue;
+
+ SmallVector<Instruction *, 4> VIs;
+ for (Value *V : DVI->getValues())
+ if (Instruction *VI = dyn_cast_or_null<Instruction>(V))
+ VIs.push_back(VI);
+
+ // This DVI may depend on multiple instructions, complicating any
+ // potential sink. This block takes the defensive approach, opting to
+ // "undef" the DVI if it has more than one instruction and any of them do
+ // not dominate DVI.
+ for (Instruction *VI : VIs) {
+ if (VI->isTerminator())
+ continue;
+
+ // If VI is a phi in a block with an EHPad terminator, we can't insert
+ // after it.
+ if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
+ continue;
+
+ // If the defining instruction dominates the dbg.value, we do not need
+ // to move the dbg.value.
+ if (DT.dominates(VI, DVI))
+ continue;
+
+ // If we depend on multiple instructions and any of them doesn't
+ // dominate this DVI, we probably can't salvage it: moving it to
+ // after any of the instructions could cause us to lose the others.
+ if (VIs.size() > 1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Unable to find valid location for Debug Value, undefing:\n"
+ << *DVI);
+ DVI->setKillLocation();
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "Moving Debug Value before :\n"
+ << *DVI << ' ' << *VI);
+ DVI->removeFromParent();
+ if (isa<PHINode>(VI))
+ DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
+ else
+ DVI->insertAfter(VI);
+ MadeChange = true;
+ ++NumDbgValueMoved;
+ }
+ }
+ }
+ return MadeChange;
+}
+
+// Group scattered pseudo probes in a block to favor SelectionDAG. Scattered
+// probes can be chained dependencies of other regular DAG nodes and block DAG
+// combine optimizations.
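+// For illustration (schematic IR, probe operands elided), a block such as
+//   %x = add i32 %a, %b
+//   call void @llvm.pseudoprobe(...)   ; probe 1
+//   %y = mul i32 %x, %c
+//   call void @llvm.pseudoprobe(...)   ; probe 2
+// is rewritten so that both probes are placed in front of %x, leaving the
+// rest of the block free of interleaved probe dependencies.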
+bool CodeGenPrepare::placePseudoProbes(Function &F) {
+ bool MadeChange = false;
+ for (auto &Block : F) {
+ // Move the remaining probes to the beginning of the block.
+ auto FirstInst = Block.getFirstInsertionPt();
+ while (FirstInst != Block.end() && FirstInst->isDebugOrPseudoInst())
+ ++FirstInst;
+ BasicBlock::iterator I(FirstInst);
+ I++;
+ while (I != Block.end()) {
+ if (auto *II = dyn_cast<PseudoProbeInst>(I++)) {
+ II->moveBefore(&*FirstInst);
+ MadeChange = true;
+ }
+ }
+ }
+ return MadeChange;
+}
+
+/// Scale down both weights to fit into uint32_t.
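+/// For example (illustrative numbers): NewTrue = 6e9, NewFalse = 2e9 gives
+/// Scale = 6e9 / UINT32_MAX + 1 = 2, so the weights become 3e9 and 1e9, both
+/// of which fit in uint32_t while preserving the 3:1 ratio.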
+static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
+ uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
+ uint32_t Scale = (NewMax / std::numeric_limits<uint32_t>::max()) + 1;
+ NewTrue = NewTrue / Scale;
+ NewFalse = NewFalse / Scale;
+}
+
+/// Some targets prefer to split a conditional branch like:
+/// \code
+/// %0 = icmp ne i32 %a, 0
+/// %1 = icmp ne i32 %b, 0
+/// %or.cond = or i1 %0, %1
+/// br i1 %or.cond, label %TrueBB, label %FalseBB
+/// \endcode
+/// into multiple branch instructions like:
+/// \code
+/// bb1:
+/// %0 = icmp ne i32 %a, 0
+/// br i1 %0, label %TrueBB, label %bb2
+/// bb2:
+/// %1 = icmp ne i32 %b, 0
+/// br i1 %1, label %TrueBB, label %FalseBB
+/// \endcode
+/// This usually allows instruction selection to do even further optimizations
+/// and combine the compare with the branch instruction. Currently this is
+/// applied for targets which have "cheap" jump instructions.
+///
+/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
+///
+bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
+ if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
+ return false;
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ // Does this BB end with the following?
+ // %cond1 = icmp|fcmp|binary instruction ...
+ // %cond2 = icmp|fcmp|binary instruction ...
+ // %cond.or = or|and i1 %cond1, %cond2
+ // br i1 %cond.or, label %dest1, label %dest2
+ Instruction *LogicOp;
+ BasicBlock *TBB, *FBB;
+ if (!match(BB.getTerminator(),
+ m_Br(m_OneUse(m_Instruction(LogicOp)), TBB, FBB)))
+ continue;
+
+ auto *Br1 = cast<BranchInst>(BB.getTerminator());
+ if (Br1->getMetadata(LLVMContext::MD_unpredictable))
+ continue;
+
+ // The merging of mostly empty BBs can cause a degenerate branch.
+ if (TBB == FBB)
+ continue;
+
+ unsigned Opc;
+ Value *Cond1, *Cond2;
+ if (match(LogicOp,
+ m_LogicalAnd(m_OneUse(m_Value(Cond1)), m_OneUse(m_Value(Cond2)))))
+ Opc = Instruction::And;
+ else if (match(LogicOp, m_LogicalOr(m_OneUse(m_Value(Cond1)),
+ m_OneUse(m_Value(Cond2)))))
+ Opc = Instruction::Or;
+ else
+ continue;
+
+ auto IsGoodCond = [](Value *Cond) {
+ return match(
+ Cond,
+ m_CombineOr(m_Cmp(), m_CombineOr(m_LogicalAnd(m_Value(), m_Value()),
+ m_LogicalOr(m_Value(), m_Value()))));
+ };
+ if (!IsGoodCond(Cond1) || !IsGoodCond(Cond2))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
+
+ // Create a new BB.
+ auto *TmpBB =
+ BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
+ BB.getParent(), BB.getNextNode());
+ if (IsHugeFunc)
+ FreshBBs.insert(TmpBB);
+
+ // Update the original basic block to use the first condition directly in the
+ // branch instruction, and remove the no-longer-needed and/or instruction.
+ Br1->setCondition(Cond1);
+ LogicOp->eraseFromParent();
+
+ // Depending on the condition we have to either replace the true or the
+ // false successor of the original branch instruction.
+ if (Opc == Instruction::And)
+ Br1->setSuccessor(0, TmpBB);
+ else
+ Br1->setSuccessor(1, TmpBB);
+
+ // Fill in the new basic block.
+ auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
+ if (auto *I = dyn_cast<Instruction>(Cond2)) {
+ I->removeFromParent();
+ I->insertBefore(Br2);
+ }
+
+ // Update PHI nodes in both successors. The original BB needs to be
+ // replaced in one successor's PHI nodes, because the branch now comes from
+ // the newly generated BB (TmpBB). In the other successor we need to add one
+ // incoming edge to the PHI nodes, because both branch instructions target
+ // now the same successor. Depending on the original branch condition
+ // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
+ // we perform the correct update for the PHI nodes.
+ // This doesn't change the successor order of the just created branch
+ // instruction (or any other instruction).
+ if (Opc == Instruction::Or)
+ std::swap(TBB, FBB);
+
+ // Replace the old BB with the new BB.
+ TBB->replacePhiUsesWith(&BB, TmpBB);
+
+ // Add another incoming edge from the new BB.
+ for (PHINode &PN : FBB->phis()) {
+ auto *Val = PN.getIncomingValueForBlock(&BB);
+ PN.addIncoming(Val, TmpBB);
+ }
+
+ // Update the branch weights (from SelectionDAGBuilder::
+ // FindMergedConditions).
+ if (Opc == Instruction::Or) {
+ // Codegen X | Y as:
+ // BB1:
+ // jmp_if_X TBB
+ // jmp TmpBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
+ // = TrueProb for original BB.
+ // Assuming the original weights are A and B, one choice is to set BB1's
+ // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
+ // assumes that
+ // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
+ // Another choice is to assume TrueProb for BB1 equals to TrueProb for
+ // TmpBB, but the math is more complicated.
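+ // For example (illustrative weights): with original weights A = 1, B = 3,
+ // BB1 gets weights (1, 7) and TmpBB gets (1, 6), and indeed
+ //   TrueProb(BB1) + FalseProb(BB1) * TrueProb(TmpBB)
+ //     = 1/8 + (7/8) * (1/7) = 1/4 = TrueProb of the original block.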
+ uint64_t TrueWeight, FalseWeight;
+ if (extractBranchWeights(*Br1, TrueWeight, FalseWeight)) {
+ uint64_t NewTrueWeight = TrueWeight;
+ uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
+ scaleWeights(NewTrueWeight, NewFalseWeight);
+ Br1->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(Br1->getContext())
+ .createBranchWeights(NewTrueWeight, NewFalseWeight));
+
+ NewTrueWeight = TrueWeight;
+ NewFalseWeight = 2 * FalseWeight;
+ scaleWeights(NewTrueWeight, NewFalseWeight);
+ Br2->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(Br2->getContext())
+ .createBranchWeights(NewTrueWeight, NewFalseWeight));
+ }
+ } else {
+ // Codegen X & Y as:
+ // BB1:
+ // jmp_if_X TmpBB
+ // jmp FBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+ // This requires creation of TmpBB after CurBB.
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
+ // = FalseProb for original BB.
+ // Assuming the original weights are A and B, one choice is to set BB1's
+ // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
+ // assumes that
+ // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
+ uint64_t TrueWeight, FalseWeight;
+ if (extractBranchWeights(*Br1, TrueWeight, FalseWeight)) {
+ uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
+ uint64_t NewFalseWeight = FalseWeight;
+ scaleWeights(NewTrueWeight, NewFalseWeight);
+ Br1->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(Br1->getContext())
+ .createBranchWeights(NewTrueWeight, NewFalseWeight));
+
+ NewTrueWeight = 2 * TrueWeight;
+ NewFalseWeight = FalseWeight;
+ scaleWeights(NewTrueWeight, NewFalseWeight);
+ Br2->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(Br2->getContext())
+ .createBranchWeights(NewTrueWeight, NewFalseWeight));
+ }
+ }
+
+ ModifiedDT = ModifyDT::ModifyBBDT;
+ MadeChange = true;
+
+ LLVM_DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
+ TmpBB->dump());
+ }
+ return MadeChange;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp
new file mode 100644
index 000000000000..c34a52a6f2de
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CommandFlags.cpp
@@ -0,0 +1,727 @@
+//===-- CommandFlags.cpp - Command Line Flags Interface ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains codegen-specific flags that are shared between different
+// command line tools. The tools "llc" and "opt" both use this file to prevent
+// flag duplication.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
+#include "llvm/TargetParser/Triple.h"
+#include <optional>
+
+using namespace llvm;
+
+#define CGOPT(TY, NAME) \
+ static cl::opt<TY> *NAME##View; \
+ TY codegen::get##NAME() { \
+ assert(NAME##View && "RegisterCodeGenFlags not created."); \
+ return *NAME##View; \
+ }
+
+#define CGLIST(TY, NAME) \
+ static cl::list<TY> *NAME##View; \
+ std::vector<TY> codegen::get##NAME() { \
+ assert(NAME##View && "RegisterCodeGenFlags not created."); \
+ return *NAME##View; \
+ }
+
+// Temporary macro for incremental transition to std::optional.
+#define CGOPT_EXP(TY, NAME) \
+ CGOPT(TY, NAME) \
+ std::optional<TY> codegen::getExplicit##NAME() { \
+ if (NAME##View->getNumOccurrences()) { \
+ TY res = *NAME##View; \
+ return res; \
+ } \
+ return std::nullopt; \
+ }
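+
+// For illustration, CGOPT(bool, StackRealign) expands to roughly:
+//   static cl::opt<bool> *StackRealignView;
+//   bool codegen::getStackRealign() {
+//     assert(StackRealignView && "RegisterCodeGenFlags not created.");
+//     return *StackRealignView;
+//   }
+// i.e. the cl::opt itself is a local static inside RegisterCodeGenFlags()
+// below, reachable only through the *View pointer bound by CGBINDOPT.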
+
+CGOPT(std::string, MArch)
+CGOPT(std::string, MCPU)
+CGLIST(std::string, MAttrs)
+CGOPT_EXP(Reloc::Model, RelocModel)
+CGOPT(ThreadModel::Model, ThreadModel)
+CGOPT_EXP(CodeModel::Model, CodeModel)
+CGOPT(ExceptionHandling, ExceptionModel)
+CGOPT_EXP(CodeGenFileType, FileType)
+CGOPT(FramePointerKind, FramePointerUsage)
+CGOPT(bool, EnableUnsafeFPMath)
+CGOPT(bool, EnableNoInfsFPMath)
+CGOPT(bool, EnableNoNaNsFPMath)
+CGOPT(bool, EnableNoSignedZerosFPMath)
+CGOPT(bool, EnableApproxFuncFPMath)
+CGOPT(bool, EnableNoTrappingFPMath)
+CGOPT(bool, EnableAIXExtendedAltivecABI)
+CGOPT(DenormalMode::DenormalModeKind, DenormalFPMath)
+CGOPT(DenormalMode::DenormalModeKind, DenormalFP32Math)
+CGOPT(bool, EnableHonorSignDependentRoundingFPMath)
+CGOPT(FloatABI::ABIType, FloatABIForCalls)
+CGOPT(FPOpFusion::FPOpFusionMode, FuseFPOps)
+CGOPT(SwiftAsyncFramePointerMode, SwiftAsyncFramePointer)
+CGOPT(bool, DontPlaceZerosInBSS)
+CGOPT(bool, EnableGuaranteedTailCallOpt)
+CGOPT(bool, DisableTailCalls)
+CGOPT(bool, StackSymbolOrdering)
+CGOPT(bool, StackRealign)
+CGOPT(std::string, TrapFuncName)
+CGOPT(bool, UseCtors)
+CGOPT(bool, DisableIntegratedAS)
+CGOPT(bool, RelaxELFRelocations)
+CGOPT_EXP(bool, DataSections)
+CGOPT_EXP(bool, FunctionSections)
+CGOPT(bool, IgnoreXCOFFVisibility)
+CGOPT(bool, XCOFFTracebackTable)
+CGOPT(std::string, BBSections)
+CGOPT(unsigned, TLSSize)
+CGOPT_EXP(bool, EmulatedTLS)
+CGOPT(bool, UniqueSectionNames)
+CGOPT(bool, UniqueBasicBlockSectionNames)
+CGOPT(EABI, EABIVersion)
+CGOPT(DebuggerKind, DebuggerTuningOpt)
+CGOPT(bool, EnableStackSizeSection)
+CGOPT(bool, EnableAddrsig)
+CGOPT(bool, EmitCallSiteInfo)
+CGOPT(bool, EnableMachineFunctionSplitter)
+CGOPT(bool, EnableDebugEntryValues)
+CGOPT(bool, ForceDwarfFrameSection)
+CGOPT(bool, XRayFunctionIndex)
+CGOPT(bool, DebugStrictDwarf)
+CGOPT(unsigned, AlignLoops)
+CGOPT(bool, JMCInstrument)
+CGOPT(bool, XCOFFReadOnlyPointers)
+
+codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
+#define CGBINDOPT(NAME) \
+ do { \
+ NAME##View = std::addressof(NAME); \
+ } while (0)
+
+ static cl::opt<std::string> MArch(
+ "march", cl::desc("Architecture to generate code for (see --version)"));
+ CGBINDOPT(MArch);
+
+ static cl::opt<std::string> MCPU(
+ "mcpu", cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+ cl::value_desc("cpu-name"), cl::init(""));
+ CGBINDOPT(MCPU);
+
+ static cl::list<std::string> MAttrs(
+ "mattr", cl::CommaSeparated,
+ cl::desc("Target specific attributes (-mattr=help for details)"),
+ cl::value_desc("a1,+a2,-a3,..."));
+ CGBINDOPT(MAttrs);
+
+ static cl::opt<Reloc::Model> RelocModel(
+ "relocation-model", cl::desc("Choose relocation model"),
+ cl::values(
+ clEnumValN(Reloc::Static, "static", "Non-relocatable code"),
+ clEnumValN(Reloc::PIC_, "pic",
+ "Fully relocatable, position independent code"),
+ clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
+ "Relocatable external references, non-relocatable code"),
+ clEnumValN(
+ Reloc::ROPI, "ropi",
+ "Code and read-only data relocatable, accessed PC-relative"),
+ clEnumValN(
+ Reloc::RWPI, "rwpi",
+ "Read-write data relocatable, accessed relative to static base"),
+ clEnumValN(Reloc::ROPI_RWPI, "ropi-rwpi",
+ "Combination of ropi and rwpi")));
+ CGBINDOPT(RelocModel);
+
+ static cl::opt<ThreadModel::Model> ThreadModel(
+ "thread-model", cl::desc("Choose threading model"),
+ cl::init(ThreadModel::POSIX),
+ cl::values(
+ clEnumValN(ThreadModel::POSIX, "posix", "POSIX thread model"),
+ clEnumValN(ThreadModel::Single, "single", "Single thread model")));
+ CGBINDOPT(ThreadModel);
+
+ static cl::opt<CodeModel::Model> CodeModel(
+ "code-model", cl::desc("Choose code model"),
+ cl::values(clEnumValN(CodeModel::Tiny, "tiny", "Tiny code model"),
+ clEnumValN(CodeModel::Small, "small", "Small code model"),
+ clEnumValN(CodeModel::Kernel, "kernel", "Kernel code model"),
+ clEnumValN(CodeModel::Medium, "medium", "Medium code model"),
+ clEnumValN(CodeModel::Large, "large", "Large code model")));
+ CGBINDOPT(CodeModel);
+
+ static cl::opt<ExceptionHandling> ExceptionModel(
+ "exception-model", cl::desc("exception model"),
+ cl::init(ExceptionHandling::None),
+ cl::values(
+ clEnumValN(ExceptionHandling::None, "default",
+ "default exception handling model"),
+ clEnumValN(ExceptionHandling::DwarfCFI, "dwarf",
+ "DWARF-like CFI based exception handling"),
+ clEnumValN(ExceptionHandling::SjLj, "sjlj",
+ "SjLj exception handling"),
+ clEnumValN(ExceptionHandling::ARM, "arm", "ARM EHABI exceptions"),
+ clEnumValN(ExceptionHandling::WinEH, "wineh",
+ "Windows exception model"),
+ clEnumValN(ExceptionHandling::Wasm, "wasm",
+ "WebAssembly exception handling")));
+ CGBINDOPT(ExceptionModel);
+
+ static cl::opt<CodeGenFileType> FileType(
+ "filetype", cl::init(CGFT_AssemblyFile),
+ cl::desc(
+ "Choose a file type (not all types are supported by all targets):"),
+ cl::values(
+ clEnumValN(CGFT_AssemblyFile, "asm", "Emit an assembly ('.s') file"),
+ clEnumValN(CGFT_ObjectFile, "obj",
+ "Emit a native object ('.o') file"),
+ clEnumValN(CGFT_Null, "null",
+ "Emit nothing, for performance testing")));
+ CGBINDOPT(FileType);
+
+ static cl::opt<FramePointerKind> FramePointerUsage(
+ "frame-pointer",
+ cl::desc("Specify frame pointer elimination optimization"),
+ cl::init(FramePointerKind::None),
+ cl::values(
+ clEnumValN(FramePointerKind::All, "all",
+ "Disable frame pointer elimination"),
+ clEnumValN(FramePointerKind::NonLeaf, "non-leaf",
+ "Disable frame pointer elimination for non-leaf frame"),
+ clEnumValN(FramePointerKind::None, "none",
+ "Enable frame pointer elimination")));
+ CGBINDOPT(FramePointerUsage);
+
+ static cl::opt<bool> EnableUnsafeFPMath(
+ "enable-unsafe-fp-math",
+ cl::desc("Enable optimizations that may decrease FP precision"),
+ cl::init(false));
+ CGBINDOPT(EnableUnsafeFPMath);
+
+ static cl::opt<bool> EnableNoInfsFPMath(
+ "enable-no-infs-fp-math",
+ cl::desc("Enable FP math optimizations that assume no +-Infs"),
+ cl::init(false));
+ CGBINDOPT(EnableNoInfsFPMath);
+
+ static cl::opt<bool> EnableNoNaNsFPMath(
+ "enable-no-nans-fp-math",
+ cl::desc("Enable FP math optimizations that assume no NaNs"),
+ cl::init(false));
+ CGBINDOPT(EnableNoNaNsFPMath);
+
+ static cl::opt<bool> EnableNoSignedZerosFPMath(
+ "enable-no-signed-zeros-fp-math",
+ cl::desc("Enable FP math optimizations that assume "
+ "the sign of 0 is insignificant"),
+ cl::init(false));
+ CGBINDOPT(EnableNoSignedZerosFPMath);
+
+ static cl::opt<bool> EnableApproxFuncFPMath(
+ "enable-approx-func-fp-math",
+ cl::desc("Enable FP math optimizations that assume approx func"),
+ cl::init(false));
+ CGBINDOPT(EnableApproxFuncFPMath);
+
+ static cl::opt<bool> EnableNoTrappingFPMath(
+ "enable-no-trapping-fp-math",
+ cl::desc("Enable setting the FP exceptions build "
+ "attribute not to use exceptions"),
+ cl::init(false));
+ CGBINDOPT(EnableNoTrappingFPMath);
+
+ static const auto DenormFlagEnumOptions = cl::values(
+ clEnumValN(DenormalMode::IEEE, "ieee", "IEEE 754 denormal numbers"),
+ clEnumValN(DenormalMode::PreserveSign, "preserve-sign",
+ "the sign of a flushed-to-zero number is preserved "
+ "in the sign of 0"),
+ clEnumValN(DenormalMode::PositiveZero, "positive-zero",
+ "denormals are flushed to positive zero"),
+ clEnumValN(DenormalMode::Dynamic, "dynamic",
+ "denormals have unknown treatment"));
+
+ // FIXME: Doesn't have a way to specify separate input and output modes.
+ static cl::opt<DenormalMode::DenormalModeKind> DenormalFPMath(
+ "denormal-fp-math",
+ cl::desc("Select which denormal numbers the code is permitted to require"),
+ cl::init(DenormalMode::IEEE),
+ DenormFlagEnumOptions);
+ CGBINDOPT(DenormalFPMath);
+
+ static cl::opt<DenormalMode::DenormalModeKind> DenormalFP32Math(
+ "denormal-fp-math-f32",
+ cl::desc("Select which denormal numbers the code is permitted to require for float"),
+ cl::init(DenormalMode::Invalid),
+ DenormFlagEnumOptions);
+ CGBINDOPT(DenormalFP32Math);
+
+ static cl::opt<bool> EnableHonorSignDependentRoundingFPMath(
+ "enable-sign-dependent-rounding-fp-math", cl::Hidden,
+ cl::desc("Force codegen to assume rounding mode can change dynamically"),
+ cl::init(false));
+ CGBINDOPT(EnableHonorSignDependentRoundingFPMath);
+
+ static cl::opt<FloatABI::ABIType> FloatABIForCalls(
+ "float-abi", cl::desc("Choose float ABI type"),
+ cl::init(FloatABI::Default),
+ cl::values(clEnumValN(FloatABI::Default, "default",
+ "Target default float ABI type"),
+ clEnumValN(FloatABI::Soft, "soft",
+ "Soft float ABI (implied by -soft-float)"),
+ clEnumValN(FloatABI::Hard, "hard",
+ "Hard float ABI (uses FP registers)")));
+ CGBINDOPT(FloatABIForCalls);
+
+ static cl::opt<FPOpFusion::FPOpFusionMode> FuseFPOps(
+ "fp-contract", cl::desc("Enable aggressive formation of fused FP ops"),
+ cl::init(FPOpFusion::Standard),
+ cl::values(
+ clEnumValN(FPOpFusion::Fast, "fast",
+ "Fuse FP ops whenever profitable"),
+ clEnumValN(FPOpFusion::Standard, "on", "Only fuse 'blessed' FP ops."),
+ clEnumValN(FPOpFusion::Strict, "off",
+ "Only fuse FP ops when the result won't be affected.")));
+ CGBINDOPT(FuseFPOps);
+
+ static cl::opt<SwiftAsyncFramePointerMode> SwiftAsyncFramePointer(
+ "swift-async-fp",
+ cl::desc("Determine when the Swift async frame pointer should be set"),
+ cl::init(SwiftAsyncFramePointerMode::Always),
+ cl::values(clEnumValN(SwiftAsyncFramePointerMode::DeploymentBased, "auto",
+ "Determine based on deployment target"),
+ clEnumValN(SwiftAsyncFramePointerMode::Always, "always",
+ "Always set the bit"),
+ clEnumValN(SwiftAsyncFramePointerMode::Never, "never",
+ "Never set the bit")));
+ CGBINDOPT(SwiftAsyncFramePointer);
+
+ static cl::opt<bool> DontPlaceZerosInBSS(
+ "nozero-initialized-in-bss",
+ cl::desc("Don't place zero-initialized symbols into bss section"),
+ cl::init(false));
+ CGBINDOPT(DontPlaceZerosInBSS);
+
+ static cl::opt<bool> EnableAIXExtendedAltivecABI(
+ "vec-extabi", cl::desc("Enable the AIX Extended Altivec ABI."),
+ cl::init(false));
+ CGBINDOPT(EnableAIXExtendedAltivecABI);
+
+ static cl::opt<bool> EnableGuaranteedTailCallOpt(
+ "tailcallopt",
+ cl::desc(
+ "Turn fastcc calls into tail calls by (potentially) changing ABI."),
+ cl::init(false));
+ CGBINDOPT(EnableGuaranteedTailCallOpt);
+
+ static cl::opt<bool> DisableTailCalls(
+ "disable-tail-calls", cl::desc("Never emit tail calls"), cl::init(false));
+ CGBINDOPT(DisableTailCalls);
+
+ static cl::opt<bool> StackSymbolOrdering(
+ "stack-symbol-ordering", cl::desc("Order local stack symbols."),
+ cl::init(true));
+ CGBINDOPT(StackSymbolOrdering);
+
+ static cl::opt<bool> StackRealign(
+ "stackrealign",
+ cl::desc("Force align the stack to the minimum alignment"),
+ cl::init(false));
+ CGBINDOPT(StackRealign);
+
+ static cl::opt<std::string> TrapFuncName(
+ "trap-func", cl::Hidden,
+ cl::desc("Emit a call to trap function rather than a trap instruction"),
+ cl::init(""));
+ CGBINDOPT(TrapFuncName);
+
+ static cl::opt<bool> UseCtors("use-ctors",
+ cl::desc("Use .ctors instead of .init_array."),
+ cl::init(false));
+ CGBINDOPT(UseCtors);
+
+ static cl::opt<bool> RelaxELFRelocations(
+ "relax-elf-relocations",
+ cl::desc(
+ "Emit GOTPCRELX/REX_GOTPCRELX instead of GOTPCREL on x86-64 ELF"),
+ cl::init(true));
+ CGBINDOPT(RelaxELFRelocations);
+
+ static cl::opt<bool> DataSections(
+ "data-sections", cl::desc("Emit data into separate sections"),
+ cl::init(false));
+ CGBINDOPT(DataSections);
+
+ static cl::opt<bool> FunctionSections(
+ "function-sections", cl::desc("Emit functions into separate sections"),
+ cl::init(false));
+ CGBINDOPT(FunctionSections);
+
+ static cl::opt<bool> IgnoreXCOFFVisibility(
+ "ignore-xcoff-visibility",
+ cl::desc("Not emit the visibility attribute for asm in AIX OS or give "
+ "all symbols 'unspecified' visibility in XCOFF object file"),
+ cl::init(false));
+ CGBINDOPT(IgnoreXCOFFVisibility);
+
+ static cl::opt<bool> XCOFFTracebackTable(
+ "xcoff-traceback-table", cl::desc("Emit the XCOFF traceback table"),
+ cl::init(true));
+ CGBINDOPT(XCOFFTracebackTable);
+
+ static cl::opt<std::string> BBSections(
+ "basic-block-sections",
+ cl::desc("Emit basic blocks into separate sections"),
+ cl::value_desc("all | <function list (file)> | labels | none"),
+ cl::init("none"));
+ CGBINDOPT(BBSections);
+
+ static cl::opt<unsigned> TLSSize(
+ "tls-size", cl::desc("Bit size of immediate TLS offsets"), cl::init(0));
+ CGBINDOPT(TLSSize);
+
+ static cl::opt<bool> EmulatedTLS(
+ "emulated-tls", cl::desc("Use emulated TLS model"), cl::init(false));
+ CGBINDOPT(EmulatedTLS);
+
+ static cl::opt<bool> UniqueSectionNames(
+ "unique-section-names", cl::desc("Give unique names to every section"),
+ cl::init(true));
+ CGBINDOPT(UniqueSectionNames);
+
+ static cl::opt<bool> UniqueBasicBlockSectionNames(
+ "unique-basic-block-section-names",
+ cl::desc("Give unique names to every basic block section"),
+ cl::init(false));
+ CGBINDOPT(UniqueBasicBlockSectionNames);
+
+ static cl::opt<EABI> EABIVersion(
+ "meabi", cl::desc("Set EABI type (default depends on triple):"),
+ cl::init(EABI::Default),
+ cl::values(
+ clEnumValN(EABI::Default, "default", "Triple default EABI version"),
+ clEnumValN(EABI::EABI4, "4", "EABI version 4"),
+ clEnumValN(EABI::EABI5, "5", "EABI version 5"),
+ clEnumValN(EABI::GNU, "gnu", "EABI GNU")));
+ CGBINDOPT(EABIVersion);
+
+ static cl::opt<DebuggerKind> DebuggerTuningOpt(
+ "debugger-tune", cl::desc("Tune debug info for a particular debugger"),
+ cl::init(DebuggerKind::Default),
+ cl::values(
+ clEnumValN(DebuggerKind::GDB, "gdb", "gdb"),
+ clEnumValN(DebuggerKind::LLDB, "lldb", "lldb"),
+ clEnumValN(DebuggerKind::DBX, "dbx", "dbx"),
+ clEnumValN(DebuggerKind::SCE, "sce", "SCE targets (e.g. PS4)")));
+ CGBINDOPT(DebuggerTuningOpt);
+
+ static cl::opt<bool> EnableStackSizeSection(
+ "stack-size-section",
+ cl::desc("Emit a section containing stack size metadata"),
+ cl::init(false));
+ CGBINDOPT(EnableStackSizeSection);
+
+ static cl::opt<bool> EnableAddrsig(
+ "addrsig", cl::desc("Emit an address-significance table"),
+ cl::init(false));
+ CGBINDOPT(EnableAddrsig);
+
+ static cl::opt<bool> EmitCallSiteInfo(
+ "emit-call-site-info",
+ cl::desc(
+ "Emit call site debug information, if debug information is enabled."),
+ cl::init(false));
+ CGBINDOPT(EmitCallSiteInfo);
+
+ static cl::opt<bool> EnableDebugEntryValues(
+ "debug-entry-values",
+ cl::desc("Enable debug info for the debug entry values."),
+ cl::init(false));
+ CGBINDOPT(EnableDebugEntryValues);
+
+ static cl::opt<bool> EnableMachineFunctionSplitter(
+ "split-machine-functions",
+ cl::desc("Split out cold basic blocks from machine functions based on "
+ "profile information"),
+ cl::init(false));
+ CGBINDOPT(EnableMachineFunctionSplitter);
+
+ static cl::opt<bool> ForceDwarfFrameSection(
+ "force-dwarf-frame-section",
+ cl::desc("Always emit a debug frame section."), cl::init(false));
+ CGBINDOPT(ForceDwarfFrameSection);
+
+ static cl::opt<bool> XRayFunctionIndex("xray-function-index",
+ cl::desc("Emit xray_fn_idx section"),
+ cl::init(true));
+ CGBINDOPT(XRayFunctionIndex);
+
+ static cl::opt<bool> DebugStrictDwarf(
+ "strict-dwarf", cl::desc("use strict dwarf"), cl::init(false));
+ CGBINDOPT(DebugStrictDwarf);
+
+ static cl::opt<unsigned> AlignLoops("align-loops",
+ cl::desc("Default alignment for loops"));
+ CGBINDOPT(AlignLoops);
+
+ static cl::opt<bool> JMCInstrument(
+ "enable-jmc-instrument",
+ cl::desc("Instrument functions with a call to __CheckForDebuggerJustMyCode"),
+ cl::init(false));
+ CGBINDOPT(JMCInstrument);
+
+ static cl::opt<bool> XCOFFReadOnlyPointers(
+ "mxcoff-roptr",
+ cl::desc("When set to true, const objects with relocatable address "
+ "values are put into the RO data section."),
+ cl::init(false));
+ CGBINDOPT(XCOFFReadOnlyPointers);
+
+ static cl::opt<bool> DisableIntegratedAS(
+ "no-integrated-as", cl::desc("Disable integrated assembler"),
+ cl::init(false));
+ CGBINDOPT(DisableIntegratedAS);
+
+#undef CGBINDOPT
+
+ mc::RegisterMCTargetOptionsFlags();
+}
+
+llvm::BasicBlockSection
+codegen::getBBSectionsMode(llvm::TargetOptions &Options) {
+ if (getBBSections() == "all")
+ return BasicBlockSection::All;
+ else if (getBBSections() == "labels")
+ return BasicBlockSection::Labels;
+ else if (getBBSections() == "none")
+ return BasicBlockSection::None;
+ else {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getFile(getBBSections());
+ if (!MBOrErr) {
+ errs() << "Error loading basic block sections function list file: "
+ << MBOrErr.getError().message() << "\n";
+ } else {
+ Options.BBSectionsFuncListBuf = std::move(*MBOrErr);
+ }
+ return BasicBlockSection::List;
+ }
+}
+
+// Common utility function tightly tied to the options listed here. Initializes
+// a TargetOptions object with CodeGen flags and returns it.
+TargetOptions
+codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
+ TargetOptions Options;
+ Options.AllowFPOpFusion = getFuseFPOps();
+ Options.UnsafeFPMath = getEnableUnsafeFPMath();
+ Options.NoInfsFPMath = getEnableNoInfsFPMath();
+ Options.NoNaNsFPMath = getEnableNoNaNsFPMath();
+ Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath();
+ Options.ApproxFuncFPMath = getEnableApproxFuncFPMath();
+ Options.NoTrappingFPMath = getEnableNoTrappingFPMath();
+
+ DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath();
+
+ // FIXME: Should have separate input and output flags
+ Options.setFPDenormalMode(DenormalMode(DenormKind, DenormKind));
+
+ Options.HonorSignDependentRoundingFPMathOption =
+ getEnableHonorSignDependentRoundingFPMath();
+ if (getFloatABIForCalls() != FloatABI::Default)
+ Options.FloatABIType = getFloatABIForCalls();
+ Options.EnableAIXExtendedAltivecABI = getEnableAIXExtendedAltivecABI();
+ Options.NoZerosInBSS = getDontPlaceZerosInBSS();
+ Options.GuaranteedTailCallOpt = getEnableGuaranteedTailCallOpt();
+ Options.StackSymbolOrdering = getStackSymbolOrdering();
+ Options.UseInitArray = !getUseCtors();
+ Options.DisableIntegratedAS = getDisableIntegratedAS();
+ Options.RelaxELFRelocations = getRelaxELFRelocations();
+ Options.DataSections =
+ getExplicitDataSections().value_or(TheTriple.hasDefaultDataSections());
+ Options.FunctionSections = getFunctionSections();
+ Options.IgnoreXCOFFVisibility = getIgnoreXCOFFVisibility();
+ Options.XCOFFTracebackTable = getXCOFFTracebackTable();
+ Options.BBSections = getBBSectionsMode(Options);
+ Options.UniqueSectionNames = getUniqueSectionNames();
+ Options.UniqueBasicBlockSectionNames = getUniqueBasicBlockSectionNames();
+ Options.TLSSize = getTLSSize();
+ Options.EmulatedTLS =
+ getExplicitEmulatedTLS().value_or(TheTriple.hasDefaultEmulatedTLS());
+ Options.ExceptionModel = getExceptionModel();
+ Options.EmitStackSizeSection = getEnableStackSizeSection();
+ Options.EnableMachineFunctionSplitter = getEnableMachineFunctionSplitter();
+ Options.EmitAddrsig = getEnableAddrsig();
+ Options.EmitCallSiteInfo = getEmitCallSiteInfo();
+ Options.EnableDebugEntryValues = getEnableDebugEntryValues();
+ Options.ForceDwarfFrameSection = getForceDwarfFrameSection();
+ Options.XRayFunctionIndex = getXRayFunctionIndex();
+ Options.DebugStrictDwarf = getDebugStrictDwarf();
+ Options.LoopAlignment = getAlignLoops();
+ Options.JMCInstrument = getJMCInstrument();
+ Options.XCOFFReadOnlyPointers = getXCOFFReadOnlyPointers();
+
+ Options.MCOptions = mc::InitMCTargetOptionsFromFlags();
+
+ Options.ThreadModel = getThreadModel();
+ Options.EABIVersion = getEABIVersion();
+ Options.DebuggerTuning = getDebuggerTuningOpt();
+ Options.SwiftAsyncFramePointer = getSwiftAsyncFramePointer();
+ return Options;
+}
+
+std::string codegen::getCPUStr() {
+ // If user asked for the 'native' CPU, autodetect here. If autodetection
+ // fails, this will set the CPU to an empty string which tells the target to
+ // pick a basic default.
+ if (getMCPU() == "native")
+ return std::string(sys::getHostCPUName());
+
+ return getMCPU();
+}
+
+std::string codegen::getFeaturesStr() {
+ SubtargetFeatures Features;
+
+ // If user asked for the 'native' CPU, we need to autodetect features.
+ // This is necessary for x86 where the CPU might not support all the
+ // features the autodetected CPU name lists in the target. For example,
+ // not all Sandybridge processors support AVX.
+ if (getMCPU() == "native") {
+ StringMap<bool> HostFeatures;
+ if (sys::getHostCPUFeatures(HostFeatures))
+ for (const auto &[Feature, IsEnabled] : HostFeatures)
+ Features.AddFeature(Feature, IsEnabled);
+ }
+
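+ // For illustration: with -mcpu=native -mattr=+avx2,-sse4.2 the returned
+ // string is the autodetected host feature list with ",+avx2,-sse4.2"
+ // appended at the end.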
+ for (auto const &MAttr : getMAttrs())
+ Features.AddFeature(MAttr);
+
+ return Features.getString();
+}
+
+std::vector<std::string> codegen::getFeatureList() {
+ SubtargetFeatures Features;
+
+ // If user asked for the 'native' CPU, we need to autodetect features.
+ // This is necessary for x86 where the CPU might not support all the
+ // features the autodetected CPU name lists in the target. For example,
+ // not all Sandybridge processors support AVX.
+ if (getMCPU() == "native") {
+ StringMap<bool> HostFeatures;
+ if (sys::getHostCPUFeatures(HostFeatures))
+ for (const auto &[Feature, IsEnabled] : HostFeatures)
+ Features.AddFeature(Feature, IsEnabled);
+ }
+
+ for (auto const &MAttr : getMAttrs())
+ Features.AddFeature(MAttr);
+
+ return Features.getFeatures();
+}
+
+void codegen::renderBoolStringAttr(AttrBuilder &B, StringRef Name, bool Val) {
+ B.addAttribute(Name, Val ? "true" : "false");
+}
+
+#define HANDLE_BOOL_ATTR(CL, AttrName) \
+ do { \
+ if (CL->getNumOccurrences() > 0 && !F.hasFnAttribute(AttrName)) \
+ renderBoolStringAttr(NewAttrs, AttrName, *CL); \
+ } while (0)
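+
+// For example, HANDLE_BOOL_ATTR(EnableUnsafeFPMathView, "unsafe-fp-math")
+// below records "unsafe-fp-math"="true" (or "false") on the function, but
+// only when the flag was given on the command line and the function does not
+// already carry that attribute.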
+
+/// Set function attributes of function \p F based on CPU, Features, and command
+/// line flags.
+void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
+ Function &F) {
+ auto &Ctx = F.getContext();
+ AttributeList Attrs = F.getAttributes();
+ AttrBuilder NewAttrs(Ctx);
+
+ if (!CPU.empty() && !F.hasFnAttribute("target-cpu"))
+ NewAttrs.addAttribute("target-cpu", CPU);
+ if (!Features.empty()) {
+ // Append the command line features to any that are already on the function.
+ StringRef OldFeatures =
+ F.getFnAttribute("target-features").getValueAsString();
+ if (OldFeatures.empty())
+ NewAttrs.addAttribute("target-features", Features);
+ else {
+ SmallString<256> Appended(OldFeatures);
+ Appended.push_back(',');
+ Appended.append(Features);
+ NewAttrs.addAttribute("target-features", Appended);
+ }
+ }
+ if (FramePointerUsageView->getNumOccurrences() > 0 &&
+ !F.hasFnAttribute("frame-pointer")) {
+ if (getFramePointerUsage() == FramePointerKind::All)
+ NewAttrs.addAttribute("frame-pointer", "all");
+ else if (getFramePointerUsage() == FramePointerKind::NonLeaf)
+ NewAttrs.addAttribute("frame-pointer", "non-leaf");
+ else if (getFramePointerUsage() == FramePointerKind::None)
+ NewAttrs.addAttribute("frame-pointer", "none");
+ }
+ if (DisableTailCallsView->getNumOccurrences() > 0)
+ NewAttrs.addAttribute("disable-tail-calls",
+ toStringRef(getDisableTailCalls()));
+ if (getStackRealign())
+ NewAttrs.addAttribute("stackrealign");
+
+ HANDLE_BOOL_ATTR(EnableUnsafeFPMathView, "unsafe-fp-math");
+ HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math");
+ HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math");
+ HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math");
+ HANDLE_BOOL_ATTR(EnableApproxFuncFPMathView, "approx-func-fp-math");
+
+ if (DenormalFPMathView->getNumOccurrences() > 0 &&
+ !F.hasFnAttribute("denormal-fp-math")) {
+ DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath();
+
+ // FIXME: Command line flag should expose separate input/output modes.
+ NewAttrs.addAttribute("denormal-fp-math",
+ DenormalMode(DenormKind, DenormKind).str());
+ }
+
+ if (DenormalFP32MathView->getNumOccurrences() > 0 &&
+ !F.hasFnAttribute("denormal-fp-math-f32")) {
+ // FIXME: Command line flag should expose separate input/output modes.
+ DenormalMode::DenormalModeKind DenormKind = getDenormalFP32Math();
+
+ NewAttrs.addAttribute(
+ "denormal-fp-math-f32",
+ DenormalMode(DenormKind, DenormKind).str());
+ }
+
+ if (TrapFuncNameView->getNumOccurrences() > 0)
+ for (auto &B : F)
+ for (auto &I : B)
+ if (auto *Call = dyn_cast<CallInst>(&I))
+ if (const auto *F = Call->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::debugtrap ||
+ F->getIntrinsicID() == Intrinsic::trap)
+ Call->addFnAttr(
+ Attribute::get(Ctx, "trap-func-name", getTrapFuncName()));
+
+ // Let NewAttrs override Attrs.
+ F.setAttributes(Attrs.addFnAttributes(Ctx, NewAttrs));
+}
+
+/// Set function attributes of functions in Module M based on CPU,
+/// Features, and command line flags.
+void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
+ Module &M) {
+ for (Function &F : M)
+ setFunctionAttributes(CPU, Features, F);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
new file mode 100644
index 000000000000..7979ac9a5fb7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -0,0 +1,2077 @@
+//===- ComplexDeinterleavingPass.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Identification:
+// This step is responsible for finding the patterns that can be lowered to
+// complex instructions, and building a graph to represent the complex
+// structures. Starting from the "Converging Shuffle" (a shuffle that
+// reinterleaves the complex components, with a mask of <0, 2, 1, 3>), the
+// operands are evaluated and identified as "Composite Nodes" (collections of
+// instructions that can potentially be lowered to a single complex
+// instruction). This is performed by checking the real and imaginary components
+// and tracking the data flow for each component while following the operand
+// pairs. Validity of each node is expected to be done upon creation, and any
+// validation errors should halt traversal and prevent further graph
+// construction.
+// Instead of relying on Shuffle operations, vector interleaving and
+// deinterleaving can be represented by vector.interleave2 and
+// vector.deinterleave2 intrinsics. Scalable vectors can be represented only by
+// these intrinsics, whereas fixed-width vectors are recognized via both the
+// shufflevector instruction and the intrinsics.
+//
+// Replacement:
+// This step traverses the graph built up by identification, delegating to the
+// target to validate and generate the correct intrinsics, and plumbs them
+// together connecting each end of the new intrinsics graph to the existing
+// use-def chain. This step is assumed to finish successfully, as all
+// information is expected to be correct by this point.
+//
+//
+// Internal data structure:
+// ComplexDeinterleavingGraph:
+// Keeps references to all the valid CompositeNodes formed as part of the
+// transformation, and every Instruction contained within said nodes. It also
+// holds onto a reference to the root Instruction, and the root node that should
+// replace it.
+//
+// ComplexDeinterleavingCompositeNode:
+// A CompositeNode represents a single transformation point; each node should
+// transform into a single complex instruction (ignoring vector splitting, which
+// would generate more instructions per node). They are identified in a
+// depth-first manner, traversing and identifying the operands of each
+// instruction in the order they appear in the IR.
+// Each node maintains a reference to its Real and Imaginary instructions,
+// as well as any additional instructions that make up the identified operation
+// (Internal instructions should only have uses within their containing node).
+// A Node also contains the rotation and operation type that it represents.
+// Operands contains pointers to other CompositeNodes, acting as the edges in
+// the graph. ReplacementValue is the transformed Value* that has been emitted
+// to the IR.
+//
+// Note: If the operation of a Node is Shuffle, only the Real, Imaginary, and
+// ReplacementValue fields of that Node are relevant, where the ReplacementValue
+// should be pre-populated.
+//
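+// As a schematic fixed-width example (illustrative IR), the pass looks for
+// patterns of the shape:
+//   %real = shufflevector <4 x float> %v, <4 x float> poison,
+//                         <2 x i32> <i32 0, i32 2>
+//   %imag = shufflevector <4 x float> %v, <4 x float> poison,
+//                         <2 x i32> <i32 1, i32 3>
+//   ... complex arithmetic on %real / %imag ...
+//   %out = shufflevector <2 x float> %re, <2 x float> %im,
+//                        <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+// where %out is the "Converging Shuffle" from which identification starts.
+//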
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "complex-deinterleaving"
+
+STATISTIC(NumComplexTransformations, "Amount of complex patterns transformed");
+
+static cl::opt<bool> ComplexDeinterleavingEnabled(
+ "enable-complex-deinterleaving",
+ cl::desc("Enable generation of complex instructions"), cl::init(true),
+ cl::Hidden);
+
+/// Checks the given mask, and determines whether said mask is interleaving.
+///
+/// To be interleaving, a mask must alternate between `i` and `i + (Length /
+/// 2)`, and must contain all numbers within the range of `[0..Length)` (e.g. a
+/// 4x vector interleaving mask would be <0, 2, 1, 3>).
+static bool isInterleavingMask(ArrayRef<int> Mask);
+
+/// Checks the given mask, and determines whether said mask is deinterleaving.
+///
+/// To be deinterleaving, a mask must increment in steps of 2, and either start
+/// with 0 or 1.
+/// (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or
+/// <1, 3, 5, 7>).
+static bool isDeinterleavingMask(ArrayRef<int> Mask);
+
+/// Returns true if V is a negation operation; this works for both integers
+/// and floats.
+static bool isNeg(Value *V);
+
+/// Returns the operand for negation operation.
+static Value *getNegOperand(Value *V);
+
+namespace {
+
+class ComplexDeinterleavingLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {
+ initializeComplexDeinterleavingLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Complex Deinterleaving Pass";
+ }
+
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+private:
+ const TargetMachine *TM;
+};
+
+class ComplexDeinterleavingGraph;
+struct ComplexDeinterleavingCompositeNode {
+
+ ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op,
+ Value *R, Value *I)
+ : Operation(Op), Real(R), Imag(I) {}
+
+private:
+ friend class ComplexDeinterleavingGraph;
+ using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
+ using RawNodePtr = ComplexDeinterleavingCompositeNode *;
+
+public:
+ ComplexDeinterleavingOperation Operation;
+ Value *Real;
+ Value *Imag;
+
+ // These two members are required exclusively for generating
+ // ComplexDeinterleavingOperation::Symmetric operations.
+ unsigned Opcode;
+ std::optional<FastMathFlags> Flags;
+
+ ComplexDeinterleavingRotation Rotation =
+ ComplexDeinterleavingRotation::Rotation_0;
+ SmallVector<RawNodePtr> Operands;
+ Value *ReplacementNode = nullptr;
+
+ void addOperand(NodePtr Node) { Operands.push_back(Node.get()); }
+
+ void dump() { dump(dbgs()); }
+ void dump(raw_ostream &OS) {
+ auto PrintValue = [&](Value *V) {
+ if (V) {
+ OS << "\"";
+ V->print(OS, true);
+ OS << "\"\n";
+ } else
+ OS << "nullptr\n";
+ };
+ auto PrintNodeRef = [&](RawNodePtr Ptr) {
+ if (Ptr)
+ OS << Ptr << "\n";
+ else
+ OS << "nullptr\n";
+ };
+
+ OS << "- CompositeNode: " << this << "\n";
+ OS << " Real: ";
+ PrintValue(Real);
+ OS << " Imag: ";
+ PrintValue(Imag);
+ OS << " ReplacementNode: ";
+ PrintValue(ReplacementNode);
+ OS << " Operation: " << (int)Operation << "\n";
+ OS << " Rotation: " << ((int)Rotation * 90) << "\n";
+ OS << " Operands: \n";
+ for (const auto &Op : Operands) {
+ OS << " - ";
+ PrintNodeRef(Op);
+ }
+ }
+};
+
+class ComplexDeinterleavingGraph {
+public:
+ struct Product {
+ Value *Multiplier;
+ Value *Multiplicand;
+ bool IsPositive;
+ };
+
+ using Addend = std::pair<Value *, bool>;
+ using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr;
+ using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr;
+
+ // Helper struct for holding info about potential partial multiplication
+ // candidates
+ struct PartialMulCandidate {
+ Value *Common;
+ NodePtr Node;
+ unsigned RealIdx;
+ unsigned ImagIdx;
+ bool IsNodeInverted;
+ };
+
+ explicit ComplexDeinterleavingGraph(const TargetLowering *TL,
+ const TargetLibraryInfo *TLI)
+ : TL(TL), TLI(TLI) {}
+
+private:
+ const TargetLowering *TL = nullptr;
+ const TargetLibraryInfo *TLI = nullptr;
+ SmallVector<NodePtr> CompositeNodes;
+ DenseMap<std::pair<Value *, Value *>, NodePtr> CachedResult;
+
+ SmallPtrSet<Instruction *, 16> FinalInstructions;
+
+ /// Root instructions are instructions from which the complex computation
+ /// starts.
+ std::map<Instruction *, NodePtr> RootToNode;
+
+ /// Topologically sorted root instructions
+ SmallVector<Instruction *, 1> OrderedRoots;
+
+ /// When examining a basic block for complex deinterleaving, if it is a
+ /// simple one-block loop, then the only incoming block is 'Incoming' and the
+ /// 'BackEdge' block is the block itself.
+ BasicBlock *BackEdge = nullptr;
+ BasicBlock *Incoming = nullptr;
+
+ /// ReductionInfo maps from %ReductionOp to %PHInode and Instruction
+ /// %OutsideUser as it is shown in the IR:
+ ///
+ /// vector.body:
+ /// %PHInode = phi <vector type> [ zeroinitializer, %entry ],
+ /// [ %ReductionOp, %vector.body ]
+ /// ...
+ /// %ReductionOp = fadd i64 ...
+ /// ...
+ /// br i1 %condition, label %vector.body, %middle.block
+ ///
+ /// middle.block:
+ /// %OutsideUser = llvm.vector.reduce.fadd(..., %ReductionOp)
+ ///
+ /// %OutsideUser can be `llvm.vector.reduce.fadd` or `fadd` preceding
+ /// `llvm.vector.reduce.fadd` when the unroll factor isn't one.
+ std::map<Instruction *, std::pair<PHINode *, Instruction *>> ReductionInfo;
+
+ /// In the process of detecting a reduction, we consider a pair of
+ /// %ReductionOPs, which we refer to as real and imag (or vice versa), and
+ /// traverse the use-tree to detect complex operations. As this is a
+ /// reduction operation, it will eventually reach the RealPHI and ImagPHI,
+ /// which correspond to the %ReductionOPs that we suspect to be complex.
+ /// RealPHI and ImagPHI are used by the identifyPHINode method.
+ PHINode *RealPHI = nullptr;
+ PHINode *ImagPHI = nullptr;
+
+ /// Set this flag to true if RealPHI and ImagPHI were reached during reduction
+ /// detection.
+ bool PHIsFound = false;
+
+ /// OldToNewPHI maps the original real PHINode to a new, double-sized PHINode.
+ /// The new PHINode corresponds to a vector of deinterleaved complex numbers.
+ /// This mapping is populated during
+ /// ComplexDeinterleavingOperation::ReductionPHI node replacement. It is then
+ /// used in the ComplexDeinterleavingOperation::ReductionOperation node
+ /// replacement process.
+ std::map<PHINode *, PHINode *> OldToNewPHI;
+
+ NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation,
+ Value *R, Value *I) {
+ assert(((Operation != ComplexDeinterleavingOperation::ReductionPHI &&
+ Operation != ComplexDeinterleavingOperation::ReductionOperation) ||
+ (R && I)) &&
+ "Reduction related nodes must have Real and Imaginary parts");
+ return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation, R,
+ I);
+ }
+
+ NodePtr submitCompositeNode(NodePtr Node) {
+ CompositeNodes.push_back(Node);
+ if (Node->Real && Node->Imag)
+ CachedResult[{Node->Real, Node->Imag}] = Node;
+ return Node;
+ }
+
+ /// Identifies a complex partial multiply pattern and its rotation, based on
+ /// the following patterns
+ ///
+ /// 0: r: cr + ar * br
+ /// i: ci + ar * bi
+ /// 90: r: cr - ai * bi
+ /// i: ci + ai * br
+ /// 180: r: cr - ar * br
+ /// i: ci - ar * bi
+ /// 270: r: cr + ai * bi
+ /// i: ci - ai * br
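+ ///
+ /// For illustration (example IR, not from the upstream comment), the
+ /// rotation 0 pattern for the real part could look like:
+ ///   %ar_br = fmul contract <4 x float> %ar, %br
+ ///   %real  = fadd contract <4 x float> %cr, %ar_br
+ /// For floating point, the `contract` fast-math flag is required on the
+ /// add/sub instructions (checked in the implementation below).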
+ NodePtr identifyPartialMul(Instruction *Real, Instruction *Imag);
+
+ /// Identify the other branch of a Partial Mul, taking the CommonOperandI that
+ /// is partially known from identifyPartialMul, filling in the other half of
+ /// the complex pair.
+ NodePtr
+ identifyNodeWithImplicitAdd(Instruction *I, Instruction *J,
+ std::pair<Value *, Value *> &CommonOperandI);
+
+ /// Identifies a complex add pattern and its rotation, based on the following
+ /// patterns.
+ ///
+ /// 90: r: ar - bi
+ /// i: ai + br
+ /// 270: r: ar + bi
+ /// i: ai - br
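+ ///
+ /// As an illustration (not part of the upstream comment), the 90 case is
+ /// simply a + i*b: with a = ar + i*ai and b = br + i*bi,
+ ///   a + i*b = (ar - bi) + i*(ai + br)
+ /// while the 270 case is a - i*b = (ar + bi) + i*(ai - br).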
+ NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
+ NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+
+ NodePtr identifyNode(Value *R, Value *I);
+
+ /// Determine if a sum of complex numbers can be formed from \p RealAddends
+ /// and \p ImagAddends. If \p Accumulator is not null, add the result to it.
+ /// Return nullptr if it is not possible to construct a complex number.
+ /// \p Flags are needed to generate symmetric Add and Sub operations.
+ NodePtr identifyAdditions(std::list<Addend> &RealAddends,
+ std::list<Addend> &ImagAddends,
+ std::optional<FastMathFlags> Flags,
+ NodePtr Accumulator);
+
+ /// Extract one addend that has both real and imaginary parts positive.
+ NodePtr extractPositiveAddend(std::list<Addend> &RealAddends,
+ std::list<Addend> &ImagAddends);
+
+ /// Determine if sum of multiplications of complex numbers can be formed from
+ /// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result
+ /// to it. Return nullptr if it is not possible to construct a complex number.
+ NodePtr identifyMultiplications(std::vector<Product> &RealMuls,
+ std::vector<Product> &ImagMuls,
+ NodePtr Accumulator);
+
+ /// Go through pairs of multiplications (one Real and one Imag) and find all
+ /// possible candidates for partial multiplication, putting them into \p
+ /// Candidates. Returns true if every Product has a pair with a common
+ /// operand.
+ bool collectPartialMuls(const std::vector<Product> &RealMuls,
+ const std::vector<Product> &ImagMuls,
+ std::vector<PartialMulCandidate> &Candidates);
+
+ /// If the code is compiled with -Ofast or expressions have `reassoc` flag,
+ /// the order of complex computation operations may be significantly altered,
+ /// and the real and imaginary parts may not be executed in parallel. This
+ /// function takes this into consideration and employs a more general approach
+ /// to identify complex computations. Initially, it gathers all the addends
+ /// and multiplicands and then constructs a complex expression from them.
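+ ///
+ /// For illustration (example IR, not from the upstream comment), such
+ /// reassociation-friendly expressions carry the `reassoc` fast-math flag:
+ ///   %t = fadd reassoc contract <4 x float> %a, %b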
+ NodePtr identifyReassocNodes(Instruction *I, Instruction *J);
+
+ NodePtr identifyRoot(Instruction *I);
+
+ /// Identifies the Deinterleave operation applied to a vector containing
+ /// complex numbers. There are two ways to represent the Deinterleave
+ /// operation:
+ /// * Using two shufflevectors with even indices for the \p Real instruction
+ ///   and odd indices for the \p Imag instruction (only for fixed-width
+ ///   vectors)
+ /// * Using two extractvalue instructions applied to `vector.deinterleave2`
+ /// intrinsic (for both fixed and scalable vectors)
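+ ///
+ /// For illustration (example IR with assumed element types, not from the
+ /// upstream comment), the intrinsic form looks like:
+ ///   %de   = call { <4 x float>, <4 x float> }
+ ///           @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec)
+ ///   %real = extractvalue { <4 x float>, <4 x float> } %de, 0
+ ///   %imag = extractvalue { <4 x float>, <4 x float> } %de, 1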
+ NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag);
+
+ /// Identifies the operation that represents a complex number repeated in a
+ /// splat vector. There are two possible types of splats: a ConstantExpr with
+ /// the ShuffleVector opcode and a ShuffleVectorInst. Both should have an
+ /// initialization mask with all values set to zero.
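+ ///
+ /// For illustration (example IR, not from the upstream comment), a
+ /// non-constant splat typically looks like:
+ ///   %ins   = insertelement <4 x float> poison, float %v, i64 0
+ ///   %splat = shufflevector <4 x float> %ins, <4 x float> poison,
+ ///            <4 x i32> zeroinitializer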
+ NodePtr identifySplat(Value *Real, Value *Imag);
+
+ NodePtr identifyPHINode(Instruction *Real, Instruction *Imag);
+
+ /// Identifies SelectInsts in a loop that has a reduction with predication
+ /// masks and/or predicated tail folding.
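+ ///
+ /// For illustration (example IR, not from the upstream comment), the pattern
+ /// being matched is a pair of selects on the same predicate mask:
+ ///   %real = select <4 x i1> %mask, <4 x float> %a.re, <4 x float> %b.re
+ ///   %imag = select <4 x i1> %mask, <4 x float> %a.im, <4 x float> %b.im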
+ NodePtr identifySelectNode(Instruction *Real, Instruction *Imag);
+
+ Value *replaceNode(IRBuilderBase &Builder, RawNodePtr Node);
+
+ /// Complete IR modifications after producing new reduction operation:
+ /// * Populate the PHINode generated for
+ /// ComplexDeinterleavingOperation::ReductionPHI
+ /// * Deinterleave the final value outside of the loop and repurpose original
+ /// reduction users
+ void processReductionOperation(Value *OperationReplacement, RawNodePtr Node);
+
+public:
+ void dump() { dump(dbgs()); }
+ void dump(raw_ostream &OS) {
+ for (const auto &Node : CompositeNodes)
+ Node->dump(OS);
+ }
+
+ /// Returns false if the deinterleaving operation should be cancelled for the
+ /// current graph.
+ bool identifyNodes(Instruction *RootI);
+
+ /// In case \p B is a one-block loop, this function seeks potential reductions
+ /// and populates ReductionInfo. Returns true if any reductions were
+ /// identified.
+ bool collectPotentialReductions(BasicBlock *B);
+
+ void identifyReductionNodes();
+
+ /// Check that every instruction, from the roots to the leaves, has internal
+ /// uses.
+ bool checkNodes();
+
+ /// Perform the actual replacement of the underlying instruction graph.
+ void replaceNodes();
+};
+
+class ComplexDeinterleaving {
+public:
+ ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli)
+ : TL(tl), TLI(tli) {}
+ bool runOnFunction(Function &F);
+
+private:
+ bool evaluateBasicBlock(BasicBlock *B);
+
+ const TargetLowering *TL = nullptr;
+ const TargetLibraryInfo *TLI = nullptr;
+};
+
+} // namespace
+
+char ComplexDeinterleavingLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+ "Complex Deinterleaving", false, false)
+INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+ "Complex Deinterleaving", false, false)
+
+PreservedAnalyses ComplexDeinterleavingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+ auto &TLI = AM.getResult<llvm::TargetLibraryAnalysis>(F);
+ if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<FunctionAnalysisManagerModuleProxy>();
+ return PA;
+}
+
+FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) {
+ return new ComplexDeinterleavingLegacyPass(TM);
+}
+
+bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) {
+ const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+ auto TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return ComplexDeinterleaving(TL, &TLI).runOnFunction(F);
+}
+
+bool ComplexDeinterleaving::runOnFunction(Function &F) {
+ if (!ComplexDeinterleavingEnabled) {
+ LLVM_DEBUG(
+ dbgs() << "Complex deinterleaving has been explicitly disabled.\n");
+ return false;
+ }
+
+ if (!TL->isComplexDeinterleavingSupported()) {
+ LLVM_DEBUG(
+ dbgs() << "Complex deinterleaving has been disabled, target does "
+ "not support lowering of complex number operations.\n");
+ return false;
+ }
+
+ bool Changed = false;
+ for (auto &B : F)
+ Changed |= evaluateBasicBlock(&B);
+
+ return Changed;
+}
+
+static bool isInterleavingMask(ArrayRef<int> Mask) {
+ // If the size is not even, it's not an interleaving mask
+ if ((Mask.size() & 1))
+ return false;
+
+ int HalfNumElements = Mask.size() / 2;
+ for (int Idx = 0; Idx < HalfNumElements; ++Idx) {
+ int MaskIdx = Idx * 2;
+ if (Mask[MaskIdx] != Idx || Mask[MaskIdx + 1] != (Idx + HalfNumElements))
+ return false;
+ }
+
+ return true;
+}
+
+static bool isDeinterleavingMask(ArrayRef<int> Mask) {
+ int Offset = Mask[0];
+ int HalfNumElements = Mask.size() / 2;
+
+ for (int Idx = 1; Idx < HalfNumElements; ++Idx) {
+ if (Mask[Idx] != (Idx * 2) + Offset)
+ return false;
+ }
+
+ return true;
+}
+
+bool isNeg(Value *V) {
+ return match(V, m_FNeg(m_Value())) || match(V, m_Neg(m_Value()));
+}
+
+Value *getNegOperand(Value *V) {
+ assert(isNeg(V));
+ auto *I = cast<Instruction>(V);
+ if (I->getOpcode() == Instruction::FNeg)
+ return I->getOperand(0);
+
+ return I->getOperand(1);
+}
+
+bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
+ ComplexDeinterleavingGraph Graph(TL, TLI);
+ if (Graph.collectPotentialReductions(B))
+ Graph.identifyReductionNodes();
+
+ for (auto &I : *B)
+ Graph.identifyNodes(&I);
+
+ if (Graph.checkNodes()) {
+ Graph.replaceNodes();
+ return true;
+ }
+
+ return false;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd(
+ Instruction *Real, Instruction *Imag,
+ std::pair<Value *, Value *> &PartialMatch) {
+ LLVM_DEBUG(dbgs() << "identifyNodeWithImplicitAdd " << *Real << " / " << *Imag
+ << "\n");
+
+ if (!Real->hasOneUse() || !Imag->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << " - Mul operand has multiple uses.\n");
+ return nullptr;
+ }
+
+ if ((Real->getOpcode() != Instruction::FMul &&
+ Real->getOpcode() != Instruction::Mul) ||
+ (Imag->getOpcode() != Instruction::FMul &&
+ Imag->getOpcode() != Instruction::Mul)) {
+ LLVM_DEBUG(
+ dbgs() << " - Real or imaginary instruction is not fmul or mul\n");
+ return nullptr;
+ }
+
+ Value *R0 = Real->getOperand(0);
+ Value *R1 = Real->getOperand(1);
+ Value *I0 = Imag->getOperand(0);
+ Value *I1 = Imag->getOperand(1);
+
+ // A +/+ has a rotation of 0. If any of the operands are fneg, we flip the
+ // rotations and use the operand.
+ unsigned Negs = 0;
+ Value *Op;
+ if (match(R0, m_Neg(m_Value(Op)))) {
+ Negs |= 1;
+ R0 = Op;
+ } else if (match(R1, m_Neg(m_Value(Op)))) {
+ Negs |= 1;
+ R1 = Op;
+ }
+
+ if (isNeg(I0)) {
+ Negs |= 2;
+ Negs ^= 1;
+ I0 = getNegOperand(I0);
+ } else if (match(I1, m_Neg(m_Value(Op)))) {
+ Negs |= 2;
+ Negs ^= 1;
+ I1 = Op;
+ }
+
+ ComplexDeinterleavingRotation Rotation = (ComplexDeinterleavingRotation)Negs;
+
+ Value *CommonOperand;
+ Value *UncommonRealOp;
+ Value *UncommonImagOp;
+
+ if (R0 == I0 || R0 == I1) {
+ CommonOperand = R0;
+ UncommonRealOp = R1;
+ } else if (R1 == I0 || R1 == I1) {
+ CommonOperand = R1;
+ UncommonRealOp = R0;
+ } else {
+ LLVM_DEBUG(dbgs() << " - No equal operand\n");
+ return nullptr;
+ }
+
+ UncommonImagOp = (CommonOperand == I0) ? I1 : I0;
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_270)
+ std::swap(UncommonRealOp, UncommonImagOp);
+
+ // Between identifyPartialMul and here we need to have found a complete valid
+ // pair from the CommonOperand of each part.
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_0 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_180)
+ PartialMatch.first = CommonOperand;
+ else
+ PartialMatch.second = CommonOperand;
+
+ if (!PartialMatch.first || !PartialMatch.second) {
+ LLVM_DEBUG(dbgs() << " - Incomplete partial match\n");
+ return nullptr;
+ }
+
+ NodePtr CommonNode = identifyNode(PartialMatch.first, PartialMatch.second);
+ if (!CommonNode) {
+ LLVM_DEBUG(dbgs() << " - No CommonNode identified\n");
+ return nullptr;
+ }
+
+ NodePtr UncommonNode = identifyNode(UncommonRealOp, UncommonImagOp);
+ if (!UncommonNode) {
+ LLVM_DEBUG(dbgs() << " - No UncommonNode identified\n");
+ return nullptr;
+ }
+
+ NodePtr Node = prepareCompositeNode(
+ ComplexDeinterleavingOperation::CMulPartial, Real, Imag);
+ Node->Rotation = Rotation;
+ Node->addOperand(CommonNode);
+ Node->addOperand(UncommonNode);
+ return submitCompositeNode(Node);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real,
+ Instruction *Imag) {
+ LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag
+ << "\n");
+ // Determine rotation
+ auto IsAdd = [](unsigned Op) {
+ return Op == Instruction::FAdd || Op == Instruction::Add;
+ };
+ auto IsSub = [](unsigned Op) {
+ return Op == Instruction::FSub || Op == Instruction::Sub;
+ };
+ ComplexDeinterleavingRotation Rotation;
+ if (IsAdd(Real->getOpcode()) && IsAdd(Imag->getOpcode()))
+ Rotation = ComplexDeinterleavingRotation::Rotation_0;
+ else if (IsSub(Real->getOpcode()) && IsAdd(Imag->getOpcode()))
+ Rotation = ComplexDeinterleavingRotation::Rotation_90;
+ else if (IsSub(Real->getOpcode()) && IsSub(Imag->getOpcode()))
+ Rotation = ComplexDeinterleavingRotation::Rotation_180;
+ else if (IsAdd(Real->getOpcode()) && IsSub(Imag->getOpcode()))
+ Rotation = ComplexDeinterleavingRotation::Rotation_270;
+ else {
+ LLVM_DEBUG(dbgs() << " - Unhandled rotation.\n");
+ return nullptr;
+ }
+
+ if (isa<FPMathOperator>(Real) &&
+ (!Real->getFastMathFlags().allowContract() ||
+ !Imag->getFastMathFlags().allowContract())) {
+ LLVM_DEBUG(dbgs() << " - Contract is missing from the FastMath flags.\n");
+ return nullptr;
+ }
+
+ Value *CR = Real->getOperand(0);
+ Instruction *RealMulI = dyn_cast<Instruction>(Real->getOperand(1));
+ if (!RealMulI)
+ return nullptr;
+ Value *CI = Imag->getOperand(0);
+ Instruction *ImagMulI = dyn_cast<Instruction>(Imag->getOperand(1));
+ if (!ImagMulI)
+ return nullptr;
+
+ if (!RealMulI->hasOneUse() || !ImagMulI->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << " - Mul instruction has multiple uses\n");
+ return nullptr;
+ }
+
+ Value *R0 = RealMulI->getOperand(0);
+ Value *R1 = RealMulI->getOperand(1);
+ Value *I0 = ImagMulI->getOperand(0);
+ Value *I1 = ImagMulI->getOperand(1);
+
+ Value *CommonOperand;
+ Value *UncommonRealOp;
+ Value *UncommonImagOp;
+
+ if (R0 == I0 || R0 == I1) {
+ CommonOperand = R0;
+ UncommonRealOp = R1;
+ } else if (R1 == I0 || R1 == I1) {
+ CommonOperand = R1;
+ UncommonRealOp = R0;
+ } else {
+ LLVM_DEBUG(dbgs() << " - No equal operand\n");
+ return nullptr;
+ }
+
+ UncommonImagOp = (CommonOperand == I0) ? I1 : I0;
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_270)
+ std::swap(UncommonRealOp, UncommonImagOp);
+
+ std::pair<Value *, Value *> PartialMatch(
+ (Rotation == ComplexDeinterleavingRotation::Rotation_0 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_180)
+ ? CommonOperand
+ : nullptr,
+ (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_270)
+ ? CommonOperand
+ : nullptr);
+
+ auto *CRInst = dyn_cast<Instruction>(CR);
+ auto *CIInst = dyn_cast<Instruction>(CI);
+
+ if (!CRInst || !CIInst) {
+ LLVM_DEBUG(dbgs() << " - Common operands are not instructions.\n");
+ return nullptr;
+ }
+
+ NodePtr CNode = identifyNodeWithImplicitAdd(CRInst, CIInst, PartialMatch);
+ if (!CNode) {
+ LLVM_DEBUG(dbgs() << " - No cnode identified\n");
+ return nullptr;
+ }
+
+ NodePtr UncommonRes = identifyNode(UncommonRealOp, UncommonImagOp);
+ if (!UncommonRes) {
+ LLVM_DEBUG(dbgs() << " - No UncommonRes identified\n");
+ return nullptr;
+ }
+
+ assert(PartialMatch.first && PartialMatch.second);
+ NodePtr CommonRes = identifyNode(PartialMatch.first, PartialMatch.second);
+ if (!CommonRes) {
+ LLVM_DEBUG(dbgs() << " - No CommonRes identified\n");
+ return nullptr;
+ }
+
+ NodePtr Node = prepareCompositeNode(
+ ComplexDeinterleavingOperation::CMulPartial, Real, Imag);
+ Node->Rotation = Rotation;
+ Node->addOperand(CommonRes);
+ Node->addOperand(UncommonRes);
+ Node->addOperand(CNode);
+ return submitCompositeNode(Node);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyAdd(Instruction *Real, Instruction *Imag) {
+ LLVM_DEBUG(dbgs() << "identifyAdd " << *Real << " / " << *Imag << "\n");
+
+ // Determine rotation
+ ComplexDeinterleavingRotation Rotation;
+ if ((Real->getOpcode() == Instruction::FSub &&
+ Imag->getOpcode() == Instruction::FAdd) ||
+ (Real->getOpcode() == Instruction::Sub &&
+ Imag->getOpcode() == Instruction::Add))
+ Rotation = ComplexDeinterleavingRotation::Rotation_90;
+ else if ((Real->getOpcode() == Instruction::FAdd &&
+ Imag->getOpcode() == Instruction::FSub) ||
+ (Real->getOpcode() == Instruction::Add &&
+ Imag->getOpcode() == Instruction::Sub))
+ Rotation = ComplexDeinterleavingRotation::Rotation_270;
+ else {
+ LLVM_DEBUG(dbgs() << " - Unhandled case, rotation is not assigned.\n");
+ return nullptr;
+ }
+
+ auto *AR = dyn_cast<Instruction>(Real->getOperand(0));
+ auto *BI = dyn_cast<Instruction>(Real->getOperand(1));
+ auto *AI = dyn_cast<Instruction>(Imag->getOperand(0));
+ auto *BR = dyn_cast<Instruction>(Imag->getOperand(1));
+
+ if (!AR || !AI || !BR || !BI) {
+ LLVM_DEBUG(dbgs() << " - Not all operands are instructions.\n");
+ return nullptr;
+ }
+
+ NodePtr ResA = identifyNode(AR, AI);
+ if (!ResA) {
+ LLVM_DEBUG(dbgs() << " - AR/AI is not identified as a composite node.\n");
+ return nullptr;
+ }
+ NodePtr ResB = identifyNode(BR, BI);
+ if (!ResB) {
+ LLVM_DEBUG(dbgs() << " - BR/BI is not identified as a composite node.\n");
+ return nullptr;
+ }
+
+ NodePtr Node =
+ prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, Real, Imag);
+ Node->Rotation = Rotation;
+ Node->addOperand(ResA);
+ Node->addOperand(ResB);
+ return submitCompositeNode(Node);
+}
+
+static bool isInstructionPairAdd(Instruction *A, Instruction *B) {
+ unsigned OpcA = A->getOpcode();
+ unsigned OpcB = B->getOpcode();
+
+ return (OpcA == Instruction::FSub && OpcB == Instruction::FAdd) ||
+ (OpcA == Instruction::FAdd && OpcB == Instruction::FSub) ||
+ (OpcA == Instruction::Sub && OpcB == Instruction::Add) ||
+ (OpcA == Instruction::Add && OpcB == Instruction::Sub);
+}
+
+static bool isInstructionPairMul(Instruction *A, Instruction *B) {
+ auto Pattern =
+ m_BinOp(m_FMul(m_Value(), m_Value()), m_FMul(m_Value(), m_Value()));
+
+ return match(A, Pattern) && match(B, Pattern);
+}
+
+static bool isInstructionPotentiallySymmetric(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ return true;
+ default:
+ return false;
+ }
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
+ Instruction *Imag) {
+ if (Real->getOpcode() != Imag->getOpcode())
+ return nullptr;
+
+ if (!isInstructionPotentiallySymmetric(Real) ||
+ !isInstructionPotentiallySymmetric(Imag))
+ return nullptr;
+
+ auto *R0 = Real->getOperand(0);
+ auto *I0 = Imag->getOperand(0);
+
+ NodePtr Op0 = identifyNode(R0, I0);
+ NodePtr Op1 = nullptr;
+ if (Op0 == nullptr)
+ return nullptr;
+
+ if (Real->isBinaryOp()) {
+ auto *R1 = Real->getOperand(1);
+ auto *I1 = Imag->getOperand(1);
+ Op1 = identifyNode(R1, I1);
+ if (Op1 == nullptr)
+ return nullptr;
+ }
+
+ if (isa<FPMathOperator>(Real) &&
+ Real->getFastMathFlags() != Imag->getFastMathFlags())
+ return nullptr;
+
+ auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric,
+ Real, Imag);
+ Node->Opcode = Real->getOpcode();
+ if (isa<FPMathOperator>(Real))
+ Node->Flags = Real->getFastMathFlags();
+
+ Node->addOperand(Op0);
+ if (Real->isBinaryOp())
+ Node->addOperand(Op1);
+
+ return submitCompositeNode(Node);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
+ LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n");
+ assert(R->getType() == I->getType() &&
+ "Real and imaginary parts should not have different types");
+
+ auto It = CachedResult.find({R, I});
+ if (It != CachedResult.end()) {
+ LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
+ return It->second;
+ }
+
+ if (NodePtr CN = identifySplat(R, I))
+ return CN;
+
+ auto *Real = dyn_cast<Instruction>(R);
+ auto *Imag = dyn_cast<Instruction>(I);
+ if (!Real || !Imag)
+ return nullptr;
+
+ if (NodePtr CN = identifyDeinterleave(Real, Imag))
+ return CN;
+
+ if (NodePtr CN = identifyPHINode(Real, Imag))
+ return CN;
+
+ if (NodePtr CN = identifySelectNode(Real, Imag))
+ return CN;
+
+ auto *VTy = cast<VectorType>(Real->getType());
+ auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
+
+ bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation::CMulPartial, NewVTy);
+ bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation::CAdd, NewVTy);
+
+ if (HasCMulSupport && isInstructionPairMul(Real, Imag)) {
+ if (NodePtr CN = identifyPartialMul(Real, Imag))
+ return CN;
+ }
+
+ if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) {
+ if (NodePtr CN = identifyAdd(Real, Imag))
+ return CN;
+ }
+
+ if (HasCMulSupport && HasCAddSupport) {
+ if (NodePtr CN = identifyReassocNodes(Real, Imag))
+ return CN;
+ }
+
+ if (NodePtr CN = identifySymmetricOperation(Real, Imag))
+ return CN;
+
+ LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n");
+ CachedResult[{R, I}] = nullptr;
+ return nullptr;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
+ Instruction *Imag) {
+ auto IsOperationSupported = [](unsigned Opcode) -> bool {
+ return Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FNeg || Opcode == Instruction::Add ||
+ Opcode == Instruction::Sub;
+ };
+
+ if (!IsOperationSupported(Real->getOpcode()) ||
+ !IsOperationSupported(Imag->getOpcode()))
+ return nullptr;
+
+ std::optional<FastMathFlags> Flags;
+ if (isa<FPMathOperator>(Real)) {
+ if (Real->getFastMathFlags() != Imag->getFastMathFlags()) {
+ LLVM_DEBUG(dbgs() << "The flags in Real and Imaginary instructions are "
+ "not identical\n");
+ return nullptr;
+ }
+
+ Flags = Real->getFastMathFlags();
+ if (!Flags->allowReassoc()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "the 'Reassoc' attribute is missing in the FastMath flags\n");
+ return nullptr;
+ }
+ }
+
+ // Collect multiplications and addend instructions from the given instruction
+ // while traversing its operands. Additionally, verify that all instructions
+ // have the same fast math flags.
+ auto Collect = [&Flags](Instruction *Insn, std::vector<Product> &Muls,
+ std::list<Addend> &Addends) -> bool {
+ SmallVector<PointerIntPair<Value *, 1, bool>> Worklist = {{Insn, true}};
+ SmallPtrSet<Value *, 8> Visited;
+ while (!Worklist.empty()) {
+ auto [V, IsPositive] = Worklist.back();
+ Worklist.pop_back();
+ if (!Visited.insert(V).second)
+ continue;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ Addends.emplace_back(V, IsPositive);
+ continue;
+ }
+
+ // If an instruction has more than one user, it indicates that it either
+ // has an external user, which will be later checked by the checkNodes
+ // function, or it is a subexpression utilized by multiple expressions. In
+ // the latter case, we will attempt to separately identify the complex
+ // operation from here in order to create a shared
+ // ComplexDeinterleavingCompositeNode.
+ if (I != Insn && I->getNumUses() > 1) {
+ LLVM_DEBUG(dbgs() << "Found potential sub-expression: " << *I << "\n");
+ Addends.emplace_back(I, IsPositive);
+ continue;
+ }
+ switch (I->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::Add:
+ Worklist.emplace_back(I->getOperand(1), IsPositive);
+ Worklist.emplace_back(I->getOperand(0), IsPositive);
+ break;
+ case Instruction::FSub:
+ Worklist.emplace_back(I->getOperand(1), !IsPositive);
+ Worklist.emplace_back(I->getOperand(0), IsPositive);
+ break;
+ case Instruction::Sub:
+ if (isNeg(I)) {
+ Worklist.emplace_back(getNegOperand(I), !IsPositive);
+ } else {
+ Worklist.emplace_back(I->getOperand(1), !IsPositive);
+ Worklist.emplace_back(I->getOperand(0), IsPositive);
+ }
+ break;
+ case Instruction::FMul:
+ case Instruction::Mul: {
+ Value *A, *B;
+ if (isNeg(I->getOperand(0))) {
+ A = getNegOperand(I->getOperand(0));
+ IsPositive = !IsPositive;
+ } else {
+ A = I->getOperand(0);
+ }
+
+ if (isNeg(I->getOperand(1))) {
+ B = getNegOperand(I->getOperand(1));
+ IsPositive = !IsPositive;
+ } else {
+ B = I->getOperand(1);
+ }
+ Muls.push_back(Product{A, B, IsPositive});
+ break;
+ }
+ case Instruction::FNeg:
+ Worklist.emplace_back(I->getOperand(0), !IsPositive);
+ break;
+ default:
+ Addends.emplace_back(I, IsPositive);
+ continue;
+ }
+
+ if (Flags && I->getFastMathFlags() != *Flags) {
+ LLVM_DEBUG(dbgs() << "The instruction's fast math flags are "
+ "inconsistent with the root instructions' flags: "
+ << *I << "\n");
+ return false;
+ }
+ }
+ return true;
+ };
+
+ std::vector<Product> RealMuls, ImagMuls;
+ std::list<Addend> RealAddends, ImagAddends;
+ if (!Collect(Real, RealMuls, RealAddends) ||
+ !Collect(Imag, ImagMuls, ImagAddends))
+ return nullptr;
+
+ if (RealAddends.size() != ImagAddends.size())
+ return nullptr;
+
+ NodePtr FinalNode;
+ if (!RealMuls.empty() || !ImagMuls.empty()) {
+ // If there are multiplicands, extract a positive addend and use it as an
+ // accumulator.
+ FinalNode = extractPositiveAddend(RealAddends, ImagAddends);
+ FinalNode = identifyMultiplications(RealMuls, ImagMuls, FinalNode);
+ if (!FinalNode)
+ return nullptr;
+ }
+
+ // Identify and process remaining additions
+ if (!RealAddends.empty() || !ImagAddends.empty()) {
+ FinalNode = identifyAdditions(RealAddends, ImagAddends, Flags, FinalNode);
+ if (!FinalNode)
+ return nullptr;
+ }
+ assert(FinalNode && "FinalNode can not be nullptr here");
+ // Set the Real and Imag fields of the final node and submit it
+ FinalNode->Real = Real;
+ FinalNode->Imag = Imag;
+ submitCompositeNode(FinalNode);
+ return FinalNode;
+}
+
+bool ComplexDeinterleavingGraph::collectPartialMuls(
+ const std::vector<Product> &RealMuls, const std::vector<Product> &ImagMuls,
+ std::vector<PartialMulCandidate> &PartialMulCandidates) {
+ // Helper function to extract a common operand from two products
+ auto FindCommonInstruction = [](const Product &Real,
+ const Product &Imag) -> Value * {
+ if (Real.Multiplicand == Imag.Multiplicand ||
+ Real.Multiplicand == Imag.Multiplier)
+ return Real.Multiplicand;
+
+ if (Real.Multiplier == Imag.Multiplicand ||
+ Real.Multiplier == Imag.Multiplier)
+ return Real.Multiplier;
+
+ return nullptr;
+ };
+
+ // Iterate over real and imaginary multiplications to find common operands.
+ // If a common operand is found, a partial multiplication candidate is
+ // created and added to the candidates vector. The function returns false if
+ // no common operands are found for any product.
+ for (unsigned i = 0; i < RealMuls.size(); ++i) {
+ bool FoundCommon = false;
+ for (unsigned j = 0; j < ImagMuls.size(); ++j) {
+ auto *Common = FindCommonInstruction(RealMuls[i], ImagMuls[j]);
+ if (!Common)
+ continue;
+
+ auto *A = RealMuls[i].Multiplicand == Common ? RealMuls[i].Multiplier
+ : RealMuls[i].Multiplicand;
+ auto *B = ImagMuls[j].Multiplicand == Common ? ImagMuls[j].Multiplier
+ : ImagMuls[j].Multiplicand;
+
+ auto Node = identifyNode(A, B);
+ if (Node) {
+ FoundCommon = true;
+ PartialMulCandidates.push_back({Common, Node, i, j, false});
+ }
+
+ Node = identifyNode(B, A);
+ if (Node) {
+ FoundCommon = true;
+ PartialMulCandidates.push_back({Common, Node, i, j, true});
+ }
+ }
+ if (!FoundCommon)
+ return false;
+ }
+ return true;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyMultiplications(
+ std::vector<Product> &RealMuls, std::vector<Product> &ImagMuls,
+ NodePtr Accumulator = nullptr) {
+ if (RealMuls.size() != ImagMuls.size())
+ return nullptr;
+
+ std::vector<PartialMulCandidate> Info;
+ if (!collectPartialMuls(RealMuls, ImagMuls, Info))
+ return nullptr;
+
+ // Map to store common instruction to node pointers
+ std::map<Value *, NodePtr> CommonToNode;
+ std::vector<bool> Processed(Info.size(), false);
+ for (unsigned I = 0; I < Info.size(); ++I) {
+ if (Processed[I])
+ continue;
+
+ PartialMulCandidate &InfoA = Info[I];
+ for (unsigned J = I + 1; J < Info.size(); ++J) {
+ if (Processed[J])
+ continue;
+
+ PartialMulCandidate &InfoB = Info[J];
+ auto *InfoReal = &InfoA;
+ auto *InfoImag = &InfoB;
+
+ auto NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common);
+ if (!NodeFromCommon) {
+ std::swap(InfoReal, InfoImag);
+ NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common);
+ }
+ if (!NodeFromCommon)
+ continue;
+
+ CommonToNode[InfoReal->Common] = NodeFromCommon;
+ CommonToNode[InfoImag->Common] = NodeFromCommon;
+ Processed[I] = true;
+ Processed[J] = true;
+ }
+ }
+
+ std::vector<bool> ProcessedReal(RealMuls.size(), false);
+ std::vector<bool> ProcessedImag(ImagMuls.size(), false);
+ NodePtr Result = Accumulator;
+ for (auto &PMI : Info) {
+ if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx])
+ continue;
+
+ auto It = CommonToNode.find(PMI.Common);
+ // TODO: Process independent complex multiplications. Cases like this:
+ // A.real() * B where both A and B are complex numbers.
+ if (It == CommonToNode.end()) {
+ LLVM_DEBUG({
+ dbgs() << "Unprocessed independent partial multiplication:\n";
+ for (auto *Mul : {&RealMuls[PMI.RealIdx], &ImagMuls[PMI.ImagIdx]})
+ dbgs().indent(4) << (Mul->IsPositive ? "+" : "-") << *Mul->Multiplier
+ << " multiplied by " << *Mul->Multiplicand << "\n";
+ });
+ return nullptr;
+ }
+
+ auto &RealMul = RealMuls[PMI.RealIdx];
+ auto &ImagMul = ImagMuls[PMI.ImagIdx];
+
+ auto NodeA = It->second;
+ auto NodeB = PMI.Node;
+ auto IsMultiplicandReal = PMI.Common == NodeA->Real;
+ // The following table illustrates the relationship between multiplications
+ // and rotations. If we consider the multiplication (X + iY) * (U + iV), we
+ // can see:
+ //
+ // Rotation | Real | Imag |
+ // ---------+--------+--------+
+ // 0 | x * u | x * v |
+ // 90 | -y * v | y * u |
+ // 180 | -x * u | -x * v |
+ // 270 | y * v | -y * u |
+ //
+ // Check if the candidate can indeed be represented by partial
+ // multiplication
+ // TODO: Add support for multiplication by complex one
+ if ((IsMultiplicandReal && PMI.IsNodeInverted) ||
+ (!IsMultiplicandReal && !PMI.IsNodeInverted))
+ continue;
+
+ // Determine the rotation based on the multiplications
+ ComplexDeinterleavingRotation Rotation;
+ if (IsMultiplicandReal) {
+ // Detect 0 and 180 degrees rotation
+ if (RealMul.IsPositive && ImagMul.IsPositive)
+ Rotation = llvm::ComplexDeinterleavingRotation::Rotation_0;
+ else if (!RealMul.IsPositive && !ImagMul.IsPositive)
+ Rotation = llvm::ComplexDeinterleavingRotation::Rotation_180;
+ else
+ continue;
+
+ } else {
+ // Detect 90 and 270 degrees rotation
+ if (!RealMul.IsPositive && ImagMul.IsPositive)
+ Rotation = llvm::ComplexDeinterleavingRotation::Rotation_90;
+ else if (RealMul.IsPositive && !ImagMul.IsPositive)
+ Rotation = llvm::ComplexDeinterleavingRotation::Rotation_270;
+ else
+ continue;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n";
+ dbgs().indent(4) << "X: " << *NodeA->Real << "\n";
+ dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n";
+ dbgs().indent(4) << "U: " << *NodeB->Real << "\n";
+ dbgs().indent(4) << "V: " << *NodeB->Imag << "\n";
+ dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n";
+ });
+
+ NodePtr NodeMul = prepareCompositeNode(
+ ComplexDeinterleavingOperation::CMulPartial, nullptr, nullptr);
+ NodeMul->Rotation = Rotation;
+ NodeMul->addOperand(NodeA);
+ NodeMul->addOperand(NodeB);
+ if (Result)
+ NodeMul->addOperand(Result);
+ submitCompositeNode(NodeMul);
+ Result = NodeMul;
+ ProcessedReal[PMI.RealIdx] = true;
+ ProcessedImag[PMI.ImagIdx] = true;
+ }
+
+ // Ensure all products have been processed; if not, return nullptr.
+ if (!all_of(ProcessedReal, [](bool V) { return V; }) ||
+ !all_of(ProcessedImag, [](bool V) { return V; })) {
+
+ // Dump debug information about which partial multiplications are not
+ // processed.
+ LLVM_DEBUG({
+ dbgs() << "Unprocessed products (Real):\n";
+ for (size_t i = 0; i < ProcessedReal.size(); ++i) {
+ if (!ProcessedReal[i])
+ dbgs().indent(4) << (RealMuls[i].IsPositive ? "+" : "-")
+ << *RealMuls[i].Multiplier << " multiplied by "
+ << *RealMuls[i].Multiplicand << "\n";
+ }
+ dbgs() << "Unprocessed products (Imag):\n";
+ for (size_t i = 0; i < ProcessedImag.size(); ++i) {
+ if (!ProcessedImag[i])
+ dbgs().indent(4) << (ImagMuls[i].IsPositive ? "+" : "-")
+ << *ImagMuls[i].Multiplier << " multiplied by "
+ << *ImagMuls[i].Multiplicand << "\n";
+ }
+ });
+ return nullptr;
+ }
+
+ return Result;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyAdditions(
+ std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends,
+ std::optional<FastMathFlags> Flags, NodePtr Accumulator = nullptr) {
+ if (RealAddends.size() != ImagAddends.size())
+ return nullptr;
+
+ NodePtr Result;
+ // If we have an accumulator, use it as the first addend.
+ if (Accumulator)
+ Result = Accumulator;
+ // Otherwise find an element with both positive real and imaginary parts.
+ else
+ Result = extractPositiveAddend(RealAddends, ImagAddends);
+
+ if (!Result)
+ return nullptr;
+
+ while (!RealAddends.empty()) {
+ auto ItR = RealAddends.begin();
+ auto [R, IsPositiveR] = *ItR;
+
+ bool FoundImag = false;
+ for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) {
+ auto [I, IsPositiveI] = *ItI;
+ ComplexDeinterleavingRotation Rotation;
+ if (IsPositiveR && IsPositiveI)
+ Rotation = ComplexDeinterleavingRotation::Rotation_0;
+ else if (!IsPositiveR && IsPositiveI)
+ Rotation = ComplexDeinterleavingRotation::Rotation_90;
+ else if (!IsPositiveR && !IsPositiveI)
+ Rotation = ComplexDeinterleavingRotation::Rotation_180;
+ else
+ Rotation = ComplexDeinterleavingRotation::Rotation_270;
+
+ NodePtr AddNode;
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_0 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_180) {
+ AddNode = identifyNode(R, I);
+ } else {
+ AddNode = identifyNode(I, R);
+ }
+ if (AddNode) {
+ LLVM_DEBUG({
+ dbgs() << "Identified addition:\n";
+ dbgs().indent(4) << "X: " << *R << "\n";
+ dbgs().indent(4) << "Y: " << *I << "\n";
+ dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n";
+ });
+
+ NodePtr TmpNode;
+ if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_0) {
+ TmpNode = prepareCompositeNode(
+ ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr);
+ if (Flags) {
+ TmpNode->Opcode = Instruction::FAdd;
+ TmpNode->Flags = *Flags;
+ } else {
+ TmpNode->Opcode = Instruction::Add;
+ }
+ } else if (Rotation ==
+ llvm::ComplexDeinterleavingRotation::Rotation_180) {
+ TmpNode = prepareCompositeNode(
+ ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr);
+ if (Flags) {
+ TmpNode->Opcode = Instruction::FSub;
+ TmpNode->Flags = *Flags;
+ } else {
+ TmpNode->Opcode = Instruction::Sub;
+ }
+ } else {
+ TmpNode = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd,
+ nullptr, nullptr);
+ TmpNode->Rotation = Rotation;
+ }
+
+ TmpNode->addOperand(Result);
+ TmpNode->addOperand(AddNode);
+ submitCompositeNode(TmpNode);
+ Result = TmpNode;
+ RealAddends.erase(ItR);
+ ImagAddends.erase(ItI);
+ FoundImag = true;
+ break;
+ }
+ }
+ if (!FoundImag)
+ return nullptr;
+ }
+ return Result;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::extractPositiveAddend(
+ std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends) {
+ for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) {
+ for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) {
+ auto [R, IsPositiveR] = *ItR;
+ auto [I, IsPositiveI] = *ItI;
+ if (IsPositiveR && IsPositiveI) {
+ auto Result = identifyNode(R, I);
+ if (Result) {
+ RealAddends.erase(ItR);
+ ImagAddends.erase(ItI);
+ return Result;
+ }
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
+ // This potential root instruction might already have been recognized as a
+ // reduction. Because RootToNode maps both the Real and Imaginary parts to
+ // the same CompositeNode, we should choose only one of them, either the Real
+ // or the Imag instruction, to use as an anchor for generating the complex
+ // instruction.
+ auto It = RootToNode.find(RootI);
+ if (It != RootToNode.end()) {
+ auto RootNode = It->second;
+ assert(RootNode->Operation ==
+ ComplexDeinterleavingOperation::ReductionOperation);
+ // Find out which part, Real or Imag, comes later, and only when we reach
+ // the later part, add it to OrderedRoots.
+ auto *R = cast<Instruction>(RootNode->Real);
+ auto *I = cast<Instruction>(RootNode->Imag);
+ auto *ReplacementAnchor = R->comesBefore(I) ? I : R;
+ if (ReplacementAnchor != RootI)
+ return false;
+ OrderedRoots.push_back(RootI);
+ return true;
+ }
+
+ auto RootNode = identifyRoot(RootI);
+ if (!RootNode)
+ return false;
+
+ LLVM_DEBUG({
+ Function *F = RootI->getFunction();
+ BasicBlock *B = RootI->getParent();
+ dbgs() << "Complex deinterleaving graph for " << F->getName()
+ << "::" << B->getName() << ".\n";
+ dump(dbgs());
+ dbgs() << "\n";
+ });
+ RootToNode[RootI] = RootNode;
+ OrderedRoots.push_back(RootI);
+ return true;
+}
+
+bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
+ bool FoundPotentialReduction = false;
+
+ auto *Br = dyn_cast<BranchInst>(B->getTerminator());
+ if (!Br || Br->getNumSuccessors() != 2)
+ return false;
+
+ // Identify simple one-block loop
+ if (Br->getSuccessor(0) != B && Br->getSuccessor(1) != B)
+ return false;
+
+ SmallVector<PHINode *> PHIs;
+ for (auto &PHI : B->phis()) {
+ if (PHI.getNumIncomingValues() != 2)
+ continue;
+
+ if (!PHI.getType()->isVectorTy())
+ continue;
+
+ auto *ReductionOp = dyn_cast<Instruction>(PHI.getIncomingValueForBlock(B));
+ if (!ReductionOp)
+ continue;
+
+ // Check if the final instruction is reduced outside of the current block.
+ Instruction *FinalReduction = nullptr;
+ auto NumUsers = 0u;
+ for (auto *U : ReductionOp->users()) {
+ ++NumUsers;
+ if (U == &PHI)
+ continue;
+ FinalReduction = dyn_cast<Instruction>(U);
+ }
+
+ if (NumUsers != 2 || !FinalReduction || FinalReduction->getParent() == B ||
+ isa<PHINode>(FinalReduction))
+ continue;
+
+ ReductionInfo[ReductionOp] = {&PHI, FinalReduction};
+ BackEdge = B;
+ auto BackEdgeIdx = PHI.getBasicBlockIndex(B);
+ auto IncomingIdx = BackEdgeIdx == 0 ? 1 : 0;
+ Incoming = PHI.getIncomingBlock(IncomingIdx);
+ FoundPotentialReduction = true;
+
+ // If the initial value of PHINode is an Instruction, consider it a leaf
+ // value of a complex deinterleaving graph.
+ if (auto *InitPHI =
+ dyn_cast<Instruction>(PHI.getIncomingValueForBlock(Incoming)))
+ FinalInstructions.insert(InitPHI);
+ }
+ return FoundPotentialReduction;
+}
+
+void ComplexDeinterleavingGraph::identifyReductionNodes() {
+ SmallVector<bool> Processed(ReductionInfo.size(), false);
+ SmallVector<Instruction *> OperationInstruction;
+ for (auto &P : ReductionInfo)
+ OperationInstruction.push_back(P.first);
+
+ // Identify a complex computation by evaluating two reduction operations that
+ // could potentially be involved.
+ for (size_t i = 0; i < OperationInstruction.size(); ++i) {
+ if (Processed[i])
+ continue;
+ for (size_t j = i + 1; j < OperationInstruction.size(); ++j) {
+ if (Processed[j])
+ continue;
+
+ auto *Real = OperationInstruction[i];
+ auto *Imag = OperationInstruction[j];
+ if (Real->getType() != Imag->getType())
+ continue;
+
+ RealPHI = ReductionInfo[Real].first;
+ ImagPHI = ReductionInfo[Imag].first;
+ PHIsFound = false;
+ auto Node = identifyNode(Real, Imag);
+ if (!Node) {
+ std::swap(Real, Imag);
+ std::swap(RealPHI, ImagPHI);
+ Node = identifyNode(Real, Imag);
+ }
+
+ // If a node is identified and the reduction PHINode is used in the chain
+ // of operations, mark its operation instructions as used to prevent
+ // re-identification and attach the node to the real part.
+ if (Node && PHIsFound) {
+ LLVM_DEBUG(dbgs() << "Identified reduction starting from instructions: "
+ << *Real << " / " << *Imag << "\n");
+ Processed[i] = true;
+ Processed[j] = true;
+ auto RootNode = prepareCompositeNode(
+ ComplexDeinterleavingOperation::ReductionOperation, Real, Imag);
+ RootNode->addOperand(Node);
+ RootToNode[Real] = RootNode;
+ RootToNode[Imag] = RootNode;
+ submitCompositeNode(RootNode);
+ break;
+ }
+ }
+ }
+
+ RealPHI = nullptr;
+ ImagPHI = nullptr;
+}
+
+bool ComplexDeinterleavingGraph::checkNodes() {
+ // Collect all instructions from roots to leaves
+ SmallPtrSet<Instruction *, 16> AllInstructions;
+ SmallVector<Instruction *, 8> Worklist;
+ for (auto &Pair : RootToNode)
+ Worklist.push_back(Pair.first);
+
+ // Extract all instructions that are used by all XCMLA/XCADD/ADD/SUB/NEG
+ // chains
+ while (!Worklist.empty()) {
+ auto *I = Worklist.back();
+ Worklist.pop_back();
+
+ if (!AllInstructions.insert(I).second)
+ continue;
+
+ for (Value *Op : I->operands()) {
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ if (!FinalInstructions.count(I))
+ Worklist.emplace_back(OpI);
+ }
+ }
+ }
+
+ // Find instructions that have users outside of chain
+ SmallVector<Instruction *, 2> OuterInstructions;
+ for (auto *I : AllInstructions) {
+ // Skip root nodes
+ if (RootToNode.count(I))
+ continue;
+
+ for (User *U : I->users()) {
+ if (AllInstructions.count(cast<Instruction>(U)))
+ continue;
+
+ // Found an instruction that is not used by XCMLA/XCADD chain
+ Worklist.emplace_back(I);
+ break;
+ }
+ }
+
+ // If any instructions are found to be used outside, find and remove roots
+ // that somehow connect to those instructions.
+ SmallPtrSet<Instruction *, 16> Visited;
+ while (!Worklist.empty()) {
+ auto *I = Worklist.back();
+ Worklist.pop_back();
+ if (!Visited.insert(I).second)
+ continue;
+
+ // Found an impacted root node. Removing it from the nodes to be
+ // deinterleaved
+ if (RootToNode.count(I)) {
+ LLVM_DEBUG(dbgs() << "Instruction " << *I
+ << " could be deinterleaved but its chain of complex "
+ "operations have an outside user\n");
+ RootToNode.erase(I);
+ }
+
+ if (!AllInstructions.count(I) || FinalInstructions.count(I))
+ continue;
+
+ for (User *U : I->users())
+ Worklist.emplace_back(cast<Instruction>(U));
+
+ for (Value *Op : I->operands()) {
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ Worklist.emplace_back(OpI);
+ }
+ }
+ return !RootToNode.empty();
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) {
+ if (Intrinsic->getIntrinsicID() !=
+ Intrinsic::experimental_vector_interleave2)
+ return nullptr;
+
+ auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0));
+ auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(1));
+ if (!Real || !Imag)
+ return nullptr;
+
+ return identifyNode(Real, Imag);
+ }
+
+ auto *SVI = dyn_cast<ShuffleVectorInst>(RootI);
+ if (!SVI)
+ return nullptr;
+
+ // Look for a shufflevector that takes separate vectors of the real and
+ // imaginary components and recombines them into a single vector.
+ if (!isInterleavingMask(SVI->getShuffleMask()))
+ return nullptr;
+
+ Instruction *Real;
+ Instruction *Imag;
+ if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag))))
+ return nullptr;
+
+ return identifyNode(Real, Imag);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
+ Instruction *Imag) {
+ Instruction *I = nullptr;
+ Value *FinalValue = nullptr;
+ if (match(Real, m_ExtractValue<0>(m_Instruction(I))) &&
+ match(Imag, m_ExtractValue<1>(m_Specific(I))) &&
+ match(I, m_Intrinsic<Intrinsic::experimental_vector_deinterleave2>(
+ m_Value(FinalValue)))) {
+ NodePtr PlaceholderNode = prepareCompositeNode(
+ llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag);
+ PlaceholderNode->ReplacementNode = FinalValue;
+ FinalInstructions.insert(Real);
+ FinalInstructions.insert(Imag);
+ return submitCompositeNode(PlaceholderNode);
+ }
+
+ auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real);
+ auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag);
+ if (!RealShuffle || !ImagShuffle) {
+ if (RealShuffle || ImagShuffle)
+ LLVM_DEBUG(dbgs() << " - There's a shuffle where there shouldn't be.\n");
+ return nullptr;
+ }
+
+ Value *RealOp1 = RealShuffle->getOperand(1);
+ if (!isa<UndefValue>(RealOp1) && !isa<ConstantAggregateZero>(RealOp1)) {
+ LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n");
+ return nullptr;
+ }
+ Value *ImagOp1 = ImagShuffle->getOperand(1);
+ if (!isa<UndefValue>(ImagOp1) && !isa<ConstantAggregateZero>(ImagOp1)) {
+ LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n");
+ return nullptr;
+ }
+
+ Value *RealOp0 = RealShuffle->getOperand(0);
+ Value *ImagOp0 = ImagShuffle->getOperand(0);
+
+ if (RealOp0 != ImagOp0) {
+ LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n");
+ return nullptr;
+ }
+
+ ArrayRef<int> RealMask = RealShuffle->getShuffleMask();
+ ArrayRef<int> ImagMask = ImagShuffle->getShuffleMask();
+ if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) {
+ LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n");
+ return nullptr;
+ }
+
+ if (RealMask[0] != 0 || ImagMask[0] != 1) {
+ LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n");
+ return nullptr;
+ }
+
+ // Type checking: the shuffle type should be a vector type with the same
+ // scalar type, but half the size.
+ auto CheckType = [&](ShuffleVectorInst *Shuffle) {
+ Value *Op = Shuffle->getOperand(0);
+ auto *ShuffleTy = cast<FixedVectorType>(Shuffle->getType());
+ auto *OpTy = cast<FixedVectorType>(Op->getType());
+
+ if (OpTy->getScalarType() != ShuffleTy->getScalarType())
+ return false;
+ if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements())
+ return false;
+
+ return true;
+ };
+
+ auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool {
+ if (!CheckType(Shuffle))
+ return false;
+
+ ArrayRef<int> Mask = Shuffle->getShuffleMask();
+ int Last = *Mask.rbegin();
+
+ Value *Op = Shuffle->getOperand(0);
+ auto *OpTy = cast<FixedVectorType>(Op->getType());
+ int NumElements = OpTy->getNumElements();
+
+ // Ensure that the deinterleaving shuffle only pulls from the first
+ // shuffle operand.
+ return Last < NumElements;
+ };
+
+ if (RealShuffle->getType() != ImagShuffle->getType()) {
+ LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n");
+ return nullptr;
+ }
+ if (!CheckDeinterleavingShuffle(RealShuffle)) {
+ LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n");
+ return nullptr;
+ }
+ if (!CheckDeinterleavingShuffle(ImagShuffle)) {
+ LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n");
+ return nullptr;
+ }
+
+ NodePtr PlaceholderNode =
+ prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Deinterleave,
+ RealShuffle, ImagShuffle);
+ PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0);
+ FinalInstructions.insert(RealShuffle);
+ FinalInstructions.insert(ImagShuffle);
+ return submitCompositeNode(PlaceholderNode);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
+ auto IsSplat = [](Value *V) -> bool {
+ // Fixed-width vector with constants
+ if (isa<ConstantDataVector>(V))
+ return true;
+
+ VectorType *VTy;
+ ArrayRef<int> Mask;
+ // Splats are represented differently depending on whether the repeated
+ // value is a constant or an Instruction
+ if (auto *Const = dyn_cast<ConstantExpr>(V)) {
+ if (Const->getOpcode() != Instruction::ShuffleVector)
+ return false;
+ VTy = cast<VectorType>(Const->getType());
+ Mask = Const->getShuffleMask();
+ } else if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) {
+ VTy = Shuf->getType();
+ Mask = Shuf->getShuffleMask();
+ } else {
+ return false;
+ }
+
+ // When the data type is <1 x Type>, it's not possible to differentiate
+ // between the ComplexDeinterleavingOperation::Deinterleave and
+ // ComplexDeinterleavingOperation::Splat operations.
+ if (!VTy->isScalableTy() && VTy->getElementCount().getKnownMinValue() == 1)
+ return false;
+
+ return all_equal(Mask) && Mask[0] == 0;
+ };
+
+ if (!IsSplat(R) || !IsSplat(I))
+ return nullptr;
+
+ auto *Real = dyn_cast<Instruction>(R);
+ auto *Imag = dyn_cast<Instruction>(I);
+ if ((!Real && Imag) || (Real && !Imag))
+ return nullptr;
+
+ if (Real && Imag) {
+ // Non-constant splats should be in the same basic block
+ if (Real->getParent() != Imag->getParent())
+ return nullptr;
+
+ FinalInstructions.insert(Real);
+ FinalInstructions.insert(Imag);
+ }
+ NodePtr PlaceholderNode =
+ prepareCompositeNode(ComplexDeinterleavingOperation::Splat, R, I);
+ return submitCompositeNode(PlaceholderNode);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real,
+ Instruction *Imag) {
+ if (Real != RealPHI || Imag != ImagPHI)
+ return nullptr;
+
+ PHIsFound = true;
+ NodePtr PlaceholderNode = prepareCompositeNode(
+ ComplexDeinterleavingOperation::ReductionPHI, Real, Imag);
+ return submitCompositeNode(PlaceholderNode);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifySelectNode(Instruction *Real,
+ Instruction *Imag) {
+ auto *SelectReal = dyn_cast<SelectInst>(Real);
+ auto *SelectImag = dyn_cast<SelectInst>(Imag);
+ if (!SelectReal || !SelectImag)
+ return nullptr;
+
+ Instruction *MaskA, *MaskB;
+ Instruction *AR, *AI, *RA, *BI;
+ if (!match(Real, m_Select(m_Instruction(MaskA), m_Instruction(AR),
+ m_Instruction(RA))) ||
+ !match(Imag, m_Select(m_Instruction(MaskB), m_Instruction(AI),
+ m_Instruction(BI))))
+ return nullptr;
+
+ if (MaskA != MaskB && !MaskA->isIdenticalTo(MaskB))
+ return nullptr;
+
+ if (!MaskA->getType()->isVectorTy())
+ return nullptr;
+
+ auto NodeA = identifyNode(AR, AI);
+ if (!NodeA)
+ return nullptr;
+
+ auto NodeB = identifyNode(RA, BI);
+ if (!NodeB)
+ return nullptr;
+
+ NodePtr PlaceholderNode = prepareCompositeNode(
+ ComplexDeinterleavingOperation::ReductionSelect, Real, Imag);
+ PlaceholderNode->addOperand(NodeA);
+ PlaceholderNode->addOperand(NodeB);
+ FinalInstructions.insert(MaskA);
+ FinalInstructions.insert(MaskB);
+ return submitCompositeNode(PlaceholderNode);
+}
+
+static Value *replaceSymmetricNode(IRBuilderBase &B, unsigned Opcode,
+ std::optional<FastMathFlags> Flags,
+ Value *InputA, Value *InputB) {
+ Value *I;
+ switch (Opcode) {
+ case Instruction::FNeg:
+ I = B.CreateFNeg(InputA);
+ break;
+ case Instruction::FAdd:
+ I = B.CreateFAdd(InputA, InputB);
+ break;
+ case Instruction::Add:
+ I = B.CreateAdd(InputA, InputB);
+ break;
+ case Instruction::FSub:
+ I = B.CreateFSub(InputA, InputB);
+ break;
+ case Instruction::Sub:
+ I = B.CreateSub(InputA, InputB);
+ break;
+ case Instruction::FMul:
+ I = B.CreateFMul(InputA, InputB);
+ break;
+ case Instruction::Mul:
+ I = B.CreateMul(InputA, InputB);
+ break;
+ default:
+ llvm_unreachable("Incorrect symmetric opcode");
+ }
+ if (Flags)
+ cast<Instruction>(I)->setFastMathFlags(*Flags);
+ return I;
+}
+
+Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
+ RawNodePtr Node) {
+ if (Node->ReplacementNode)
+ return Node->ReplacementNode;
+
+ auto ReplaceOperandIfExist = [&](RawNodePtr &Node, unsigned Idx) -> Value * {
+ return Node->Operands.size() > Idx
+ ? replaceNode(Builder, Node->Operands[Idx])
+ : nullptr;
+ };
+
+ Value *ReplacementNode;
+ switch (Node->Operation) {
+ case ComplexDeinterleavingOperation::CAdd:
+ case ComplexDeinterleavingOperation::CMulPartial:
+ case ComplexDeinterleavingOperation::Symmetric: {
+ Value *Input0 = ReplaceOperandIfExist(Node, 0);
+ Value *Input1 = ReplaceOperandIfExist(Node, 1);
+ Value *Accumulator = ReplaceOperandIfExist(Node, 2);
+ assert(!Input1 || (Input0->getType() == Input1->getType() &&
+ "Node inputs need to be of the same type"));
+ assert(!Accumulator ||
+ (Input0->getType() == Accumulator->getType() &&
+ "Accumulator and input need to be of the same type"));
+ if (Node->Operation == ComplexDeinterleavingOperation::Symmetric)
+ ReplacementNode = replaceSymmetricNode(Builder, Node->Opcode, Node->Flags,
+ Input0, Input1);
+ else
+ ReplacementNode = TL->createComplexDeinterleavingIR(
+ Builder, Node->Operation, Node->Rotation, Input0, Input1,
+ Accumulator);
+ break;
+ }
+ case ComplexDeinterleavingOperation::Deinterleave:
+ llvm_unreachable("Deinterleave node should already have ReplacementNode");
+ break;
+ case ComplexDeinterleavingOperation::Splat: {
+ auto *NewTy = VectorType::getDoubleElementsVectorType(
+ cast<VectorType>(Node->Real->getType()));
+ auto *R = dyn_cast<Instruction>(Node->Real);
+ auto *I = dyn_cast<Instruction>(Node->Imag);
+ if (R && I) {
+ // Splats that are not constant are interleaved where they are located
+ Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
+ IRBuilder<> IRB(InsertPoint);
+ ReplacementNode =
+ IRB.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, NewTy,
+ {Node->Real, Node->Imag});
+ } else {
+ ReplacementNode =
+ Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
+ NewTy, {Node->Real, Node->Imag});
+ }
+ break;
+ }
+ case ComplexDeinterleavingOperation::ReductionPHI: {
+ // If Operation is ReductionPHI, a new empty PHINode is created.
+ // It is filled later when the ReductionOperation is processed.
+ auto *VTy = cast<VectorType>(Node->Real->getType());
+ auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
+ auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHI());
+ OldToNewPHI[dyn_cast<PHINode>(Node->Real)] = NewPHI;
+ ReplacementNode = NewPHI;
+ break;
+ }
+ case ComplexDeinterleavingOperation::ReductionOperation:
+ ReplacementNode = replaceNode(Builder, Node->Operands[0]);
+ processReductionOperation(ReplacementNode, Node);
+ break;
+ case ComplexDeinterleavingOperation::ReductionSelect: {
+ auto *MaskReal = cast<Instruction>(Node->Real)->getOperand(0);
+ auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0);
+ auto *A = replaceNode(Builder, Node->Operands[0]);
+ auto *B = replaceNode(Builder, Node->Operands[1]);
+ auto *NewMaskTy = VectorType::getDoubleElementsVectorType(
+ cast<VectorType>(MaskReal->getType()));
+ auto *NewMask =
+ Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2,
+ NewMaskTy, {MaskReal, MaskImag});
+ ReplacementNode = Builder.CreateSelect(NewMask, A, B);
+ break;
+ }
+ }
+
+ assert(ReplacementNode && "Target failed to create Intrinsic call.");
+ NumComplexTransformations += 1;
+ Node->ReplacementNode = ReplacementNode;
+ return ReplacementNode;
+}
+
+void ComplexDeinterleavingGraph::processReductionOperation(
+ Value *OperationReplacement, RawNodePtr Node) {
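+ // The rewritten reduction has roughly the following shape (names here are
+ // illustrative only, not taken from the actual IR):
+ //   incoming block:  %init  = interleave2(InitReal, InitImag)
+ //   loop:            %phi   = phi [ %init, %incoming ], [ <new op>, %backedge ]
+ //   exit block:      %deint = deinterleave2(<new op>)
+ //                    the final real/imag reductions then read
+ //                    extractvalue %deint, 0 and extractvalue %deint, 1.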
+ auto *Real = cast<Instruction>(Node->Real);
+ auto *Imag = cast<Instruction>(Node->Imag);
+ auto *OldPHIReal = ReductionInfo[Real].first;
+ auto *OldPHIImag = ReductionInfo[Imag].first;
+ auto *NewPHI = OldToNewPHI[OldPHIReal];
+
+ auto *VTy = cast<VectorType>(Real->getType());
+ auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
+
+ // We have to interleave initial origin values coming from IncomingBlock
+ Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming);
+ Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming);
+
+ IRBuilder<> Builder(Incoming->getTerminator());
+ auto *NewInit = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_interleave2, NewVTy, {InitReal, InitImag});
+
+ NewPHI->addIncoming(NewInit, Incoming);
+ NewPHI->addIncoming(OperationReplacement, BackEdge);
+
+ // Deinterleave the complex vector outside the loop so that it can finally
+ // be reduced
+ auto *FinalReductionReal = ReductionInfo[Real].second;
+ auto *FinalReductionImag = ReductionInfo[Imag].second;
+
+ Builder.SetInsertPoint(
+ &*FinalReductionReal->getParent()->getFirstInsertionPt());
+ auto *Deinterleave = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_deinterleave2,
+ OperationReplacement->getType(), OperationReplacement);
+
+ auto *NewReal = Builder.CreateExtractValue(Deinterleave, (uint64_t)0);
+ FinalReductionReal->replaceUsesOfWith(Real, NewReal);
+
+ Builder.SetInsertPoint(FinalReductionImag);
+ auto *NewImag = Builder.CreateExtractValue(Deinterleave, 1);
+ FinalReductionImag->replaceUsesOfWith(Imag, NewImag);
+}
+
+void ComplexDeinterleavingGraph::replaceNodes() {
+ SmallVector<Instruction *, 16> DeadInstrRoots;
+ for (auto *RootInstruction : OrderedRoots) {
+ // Check whether this potential root made it through the checking process
+ // and can be deinterleaved
+ if (!RootToNode.count(RootInstruction))
+ continue;
+
+ IRBuilder<> Builder(RootInstruction);
+ auto RootNode = RootToNode[RootInstruction];
+ Value *R = replaceNode(Builder, RootNode.get());
+
+ if (RootNode->Operation ==
+ ComplexDeinterleavingOperation::ReductionOperation) {
+ auto *RootReal = cast<Instruction>(RootNode->Real);
+ auto *RootImag = cast<Instruction>(RootNode->Imag);
+ ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
+ ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
+ DeadInstrRoots.push_back(cast<Instruction>(RootReal));
+ DeadInstrRoots.push_back(cast<Instruction>(RootImag));
+ } else {
+ assert(R && "Unable to find replacement for RootInstruction");
+ DeadInstrRoots.push_back(RootInstruction);
+ RootInstruction->replaceAllUsesWith(R);
+ }
+ }
+
+ for (auto *I : DeadInstrRoots)
+ RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
new file mode 100644
index 000000000000..106db7c51f27
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -0,0 +1,698 @@
+//===- CriticalAntiDepBreaker.cpp - Anti-dep breaker ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CriticalAntiDepBreaker class, which
+// implements register anti-dependence breaking along a block's
+// critical path during the post-RA scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CriticalAntiDepBreaker.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-sched"
+
+CriticalAntiDepBreaker::CriticalAntiDepBreaker(MachineFunction &MFi,
+ const RegisterClassInfo &RCI)
+ : MF(MFi), MRI(MF.getRegInfo()), TII(MF.getSubtarget().getInstrInfo()),
+ TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI),
+ Classes(TRI->getNumRegs(), nullptr), KillIndices(TRI->getNumRegs(), 0),
+ DefIndices(TRI->getNumRegs(), 0), KeepRegs(TRI->getNumRegs(), false) {}
+
+CriticalAntiDepBreaker::~CriticalAntiDepBreaker() = default;
+
+void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
+ const unsigned BBSize = BB->size();
+ for (unsigned i = 1, e = TRI->getNumRegs(); i != e; ++i) {
+ // Clear out the register class data.
+ Classes[i] = nullptr;
+
+ // Initialize the indices to indicate that no registers are live.
+ KillIndices[i] = ~0u;
+ DefIndices[i] = BBSize;
+ }
+
+ // Clear "do not change" set.
+ KeepRegs.reset();
+
+ bool IsReturnBlock = BB->isReturnBlock();
+
+ // Examine the live-in regs of all successors.
+ for (const MachineBasicBlock *Succ : BB->successors())
+ for (const auto &LI : Succ->liveins()) {
+ for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BBSize;
+ DefIndices[Reg] = ~0u;
+ }
+ }
+
+ // Mark live-out callee-saved registers. In a return block this is
+ // all callee-saved registers. In a non-return block this is any
+ // callee-saved register that is not saved in the prolog.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ BitVector Pristine = MFI.getPristineRegs(MF);
+ for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
+ ++I) {
+ unsigned Reg = *I;
+ if (!IsReturnBlock && !Pristine.test(Reg))
+ continue;
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BBSize;
+ DefIndices[Reg] = ~0u;
+ }
+ }
+}
+
+void CriticalAntiDepBreaker::FinishBlock() {
+ RegRefs.clear();
+ KeepRegs.reset();
+}
+
+void CriticalAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) {
+ // Kill instructions can define registers but are really nops, and there might
+ // be a real definition earlier that needs to be paired with uses dominated by
+ // this kill.
+
+ // FIXME: It may be possible to remove the isKill() restriction once PR18663
+ // has been properly fixed. There can be value in processing kills as seen in
+ // the AggressiveAntiDepBreaker class.
+ if (MI.isDebugInstr() || MI.isKill())
+ return;
+ assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+ for (unsigned Reg = 1; Reg != TRI->getNumRegs(); ++Reg) {
+ if (KillIndices[Reg] != ~0u) {
+ // If Reg is currently live, then mark that it can't be renamed as
+ // we don't know the extent of its live-range anymore (now that it
+ // has been scheduled).
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = Count;
+ } else if (DefIndices[Reg] < InsertPosIndex && DefIndices[Reg] >= Count) {
+ // Any register which was defined within the previous scheduling region
+ // may have been rescheduled and its lifetime may overlap with registers
+ // in ways not reflected in our current liveness state. For each such
+ // register, adjust the liveness state to be conservatively correct.
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+ // Move the def index to the end of the previous region, to reflect
+ // that the def could theoretically have been scheduled at the end.
+ DefIndices[Reg] = InsertPosIndex;
+ }
+ }
+
+ PrescanInstruction(MI);
+ ScanInstruction(MI, Count);
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static const SDep *CriticalPathStep(const SUnit *SU) {
+ const SDep *Next = nullptr;
+ unsigned NextDepth = 0;
+ // Find the predecessor edge with the greatest depth.
+ for (const SDep &P : SU->Preds) {
+ const SUnit *PredSU = P.getSUnit();
+ unsigned PredLatency = P.getLatency();
+ unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+ // In the case of a latency tie, prefer an anti-dependency edge over
+ // other types of edges.
+ if (NextDepth < PredTotalLatency ||
+ (NextDepth == PredTotalLatency && P.getKind() == SDep::Anti)) {
+ NextDepth = PredTotalLatency;
+ Next = &P;
+ }
+ }
+ return Next;
+}
+
+void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
+ // It's not safe to change register allocation for source operands of
+ // instructions that have special allocation requirements. Also assume all
+ // registers used in a call must not be changed (ABI).
+ // FIXME: The issue with predicated instruction is more complex. We are being
+ // conservative here because the kill markers cannot be trusted after
+ // if-conversion:
+ // %r6 = LDR %sp, %reg0, 92, 14, %reg0; mem:LD4[FixedStack14]
+ // ...
+ // STR %r0, killed %r6, %reg0, 0, 0, %cpsr; mem:ST4[%395]
+ // %r6 = LDR %sp, %reg0, 100, 0, %cpsr; mem:LD4[FixedStack12]
+ // STR %r0, killed %r6, %reg0, 0, 14, %reg0; mem:ST4[%396](align=8)
+ //
+ // The first R6 kill is not really a kill since it's killed by a predicated
+ // instruction which may not be executed. The second R6 def may or may not
+ // re-define R6 so it's not safe to change it since the last R6 use cannot be
+ // changed.
+ bool Special =
+ MI.isCall() || MI.hasExtraSrcRegAllocReq() || TII->isPredicated(MI);
+
+ // Scan the register operands for this instruction and update
+ // Classes and RegRefs.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+ const TargetRegisterClass *NewRC = nullptr;
+
+ if (i < MI.getDesc().getNumOperands())
+ NewRC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
+
+ // For now, only allow the register to be changed if its register
+ // class is consistent across all uses.
+ if (!Classes[Reg] && NewRC)
+ Classes[Reg] = NewRC;
+ else if (!NewRC || Classes[Reg] != NewRC)
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+ // Now check for aliases.
+ for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+ // If an alias of the reg is used during the live range, give up.
+ // Note that this allows us to skip checking if AntiDepReg
+ // overlaps with any of the aliases, among other things.
+ unsigned AliasReg = *AI;
+ if (Classes[AliasReg]) {
+ Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ }
+ }
+
+ // If we're still willing to consider this register, note the reference.
+ if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
+ RegRefs.insert(std::make_pair(Reg, &MO));
+
+ if (MO.isUse() && Special) {
+ if (!KeepRegs.test(Reg)) {
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
+ KeepRegs.set(SubReg);
+ }
+ }
+ }
+
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg()) continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isValid())
+ continue;
+ // If this reg is tied and live (Classes[Reg] is set to -1), we can't change
+ // it or any of its sub or super regs. We need to use KeepRegs to mark the
+ // reg because not all uses of the same reg within an instruction are
+ // necessarily tagged as tied.
+ // Example: an x86 "xor %eax, %eax" will have one source operand tied to the
+ // def register but not the second (see PR20020 for details).
+ // FIXME: can this check be relaxed to account for undef uses
+ // of a register? In the above 'xor' example, the uses of %eax are undef, so
+ // earlier instructions could still replace %eax even though the 'xor'
+ // itself can't be changed.
+ if (MI.isRegTiedToUseOperand(I) &&
+ Classes[Reg] == reinterpret_cast<TargetRegisterClass *>(-1)) {
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) {
+ KeepRegs.set(SubReg);
+ }
+ for (MCPhysReg SuperReg : TRI->superregs(Reg)) {
+ KeepRegs.set(SuperReg);
+ }
+ }
+ }
+}
+
+void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) {
+ // Update liveness.
+ // Proceeding upwards, registers that are defined but not used in this
+ // instruction are now dead.
+ assert(!MI.isKill() && "Attempting to scan a kill instruction");
+
+ if (!TII->isPredicated(MI)) {
+ // Predicated defs are modeled as read + write, i.e. similar to two
+ // address updates.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+
+ if (MO.isRegMask()) {
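+ // A register is treated as fully clobbered (and thus dead from here on)
+ // only if the mask clobbers it and all of its sub-registers.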
+ auto ClobbersPhysRegAndSubRegs = [&](unsigned PhysReg) {
+ return all_of(TRI->subregs_inclusive(PhysReg),
+ [&](MCPhysReg SR) { return MO.clobbersPhysReg(SR); });
+ };
+
+ for (unsigned i = 1, e = TRI->getNumRegs(); i != e; ++i) {
+ if (ClobbersPhysRegAndSubRegs(i)) {
+ DefIndices[i] = Count;
+ KillIndices[i] = ~0u;
+ KeepRegs.reset(i);
+ Classes[i] = nullptr;
+ RegRefs.erase(i);
+ }
+ }
+ }
+
+ if (!MO.isReg()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (!MO.isDef()) continue;
+
+ // Ignore two-addr defs.
+ if (MI.isRegTiedToUseOperand(i))
+ continue;
+
+ // If we've already marked this reg as unchangeable, don't remove
+ // it or any of its subregs from KeepRegs.
+ bool Keep = KeepRegs.test(Reg);
+
+ // For the reg itself and all subregs: update the def to current;
+ // reset the kill state, any restrictions, and references.
+ for (MCPhysReg SubregReg : TRI->subregs_inclusive(Reg)) {
+ DefIndices[SubregReg] = Count;
+ KillIndices[SubregReg] = ~0u;
+ Classes[SubregReg] = nullptr;
+ RegRefs.erase(SubregReg);
+ if (!Keep)
+ KeepRegs.reset(SubregReg);
+ }
+ // Conservatively mark super-registers as unusable.
+ for (MCPhysReg SR : TRI->superregs(Reg))
+ Classes[SR] = reinterpret_cast<TargetRegisterClass *>(-1);
+ }
+ }
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (!MO.isUse()) continue;
+
+ const TargetRegisterClass *NewRC = nullptr;
+ if (i < MI.getDesc().getNumOperands())
+ NewRC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
+
+ // For now, only allow the register to be changed if its register
+ // class is consistent across all uses.
+ if (!Classes[Reg] && NewRC)
+ Classes[Reg] = NewRC;
+ else if (!NewRC || Classes[Reg] != NewRC)
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+ RegRefs.insert(std::make_pair(Reg, &MO));
+
+ // If it wasn't previously live but now is, this is a kill.
+ // Repeat for all aliases.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ if (KillIndices[AliasReg] == ~0u) {
+ KillIndices[AliasReg] = Count;
+ DefIndices[AliasReg] = ~0u;
+ }
+ }
+ }
+}
+
+// Check all machine operands that reference the anti-dependent register, all
+// of which would have to be replaced by NewReg. Return true if any of their
+// parent instructions may clobber the new register.
+//
+// Note: AntiDepReg may be referenced by a two-address instruction such that
+// its use operand is tied to a def operand. We guard against the case in which
+// the two-address instruction also defines NewReg, as may happen with
+// pre/postincrement loads. In this case, both the use and def operands are in
+// RegRefs because the def is inserted by PrescanInstruction and not erased
+// during ScanInstruction. So checking for an instruction with definitions of
+// both NewReg and AntiDepReg covers it.
+bool
+CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin,
+ RegRefIter RegRefEnd,
+ unsigned NewReg) {
+ for (RegRefIter I = RegRefBegin; I != RegRefEnd; ++I ) {
+ MachineOperand *RefOper = I->second;
+
+ // Don't allow the instruction defining AntiDepReg to earlyclobber its
+ // operands, in case they may be assigned to NewReg. In this case antidep
+ // breaking must fail, but it's too rare to bother optimizing.
+ if (RefOper->isDef() && RefOper->isEarlyClobber())
+ return true;
+
+ // Handle cases in which this instruction defines NewReg.
+ MachineInstr *MI = RefOper->getParent();
+ for (const MachineOperand &CheckOper : MI->operands()) {
+ if (CheckOper.isRegMask() && CheckOper.clobbersPhysReg(NewReg))
+ return true;
+
+ if (!CheckOper.isReg() || !CheckOper.isDef() ||
+ CheckOper.getReg() != NewReg)
+ continue;
+
+ // Don't allow the instruction to define NewReg and AntiDepReg.
+ // When AntiDepReg is renamed it will be an illegal op.
+ if (RefOper->isDef())
+ return true;
+
+ // Don't allow an instruction using AntiDepReg to be earlyclobbered by
+ // NewReg.
+ if (CheckOper.isEarlyClobber())
+ return true;
+
+ // Don't allow inline asm to define NewReg at all. Who knows what it's
+ // doing with it.
+ if (MI->isInlineAsm())
+ return true;
+ }
+ }
+ return false;
+}
+
+unsigned CriticalAntiDepBreaker::
+findSuitableFreeRegister(RegRefIter RegRefBegin,
+ RegRefIter RegRefEnd,
+ unsigned AntiDepReg,
+ unsigned LastNewReg,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<unsigned> &Forbid) {
+ ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(RC);
+ for (unsigned NewReg : Order) {
+ // Don't replace a register with itself.
+ if (NewReg == AntiDepReg) continue;
+ // Don't replace a register with one that was recently used to repair
+ // an anti-dependence with this AntiDepReg, because that would
+ // re-introduce that anti-dependence.
+ if (NewReg == LastNewReg) continue;
+ // If any instruction that defines AntiDepReg also defines NewReg, it's
+ // not suitable. For example, instructions with multiple definitions can
+ // result in this condition.
+ if (isNewRegClobberedByRefs(RegRefBegin, RegRefEnd, NewReg)) continue;
+ // If NewReg is dead and NewReg's most recent def is not before
+ // AntiDepReg's kill, it's safe to replace AntiDepReg with NewReg.
+ assert(((KillIndices[AntiDepReg] == ~0u) != (DefIndices[AntiDepReg] == ~0u))
+ && "Kill and Def maps aren't consistent for AntiDepReg!");
+ assert(((KillIndices[NewReg] == ~0u) != (DefIndices[NewReg] == ~0u))
+ && "Kill and Def maps aren't consistent for NewReg!");
+ if (KillIndices[NewReg] != ~0u ||
+ Classes[NewReg] == reinterpret_cast<TargetRegisterClass *>(-1) ||
+ KillIndices[AntiDepReg] > DefIndices[NewReg])
+ continue;
+ // If NewReg overlaps any of the forbidden registers, we can't use it.
+ bool Forbidden = false;
+ for (unsigned R : Forbid)
+ if (TRI->regsOverlap(NewReg, R)) {
+ Forbidden = true;
+ break;
+ }
+ if (Forbidden) continue;
+ return NewReg;
+ }
+
+ // No registers are free and available!
+ return 0;
+}
+
+unsigned CriticalAntiDepBreaker::
+BreakAntiDependencies(const std::vector<SUnit> &SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) {
+ // The code below assumes that there is at least one instruction,
+ // so just duck out immediately if the block is empty.
+ if (SUnits.empty()) return 0;
+
+ // Keep a map of the MachineInstr*'s back to the SUnit representing them.
+ // This is used for updating debug information.
+ //
+ // FIXME: Replace this with the existing map in ScheduleDAGInstrs::MISUnitMap
+ DenseMap<MachineInstr *, const SUnit *> MISUnitMap;
+
+ // Find the node at the bottom of the critical path.
+ const SUnit *Max = nullptr;
+ for (const SUnit &SU : SUnits) {
+ MISUnitMap[SU.getInstr()] = &SU;
+ if (!Max || SU.getDepth() + SU.Latency > Max->getDepth() + Max->Latency)
+ Max = &SU;
+ }
+ assert(Max && "Failed to find bottom of the critical path");
+
+#ifndef NDEBUG
+ {
+ LLVM_DEBUG(dbgs() << "Critical path has total latency "
+ << (Max->getDepth() + Max->Latency) << "\n");
+ LLVM_DEBUG(dbgs() << "Available regs:");
+ for (unsigned Reg = 1; Reg < TRI->getNumRegs(); ++Reg) {
+ if (KillIndices[Reg] == ~0u)
+ LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI));
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+ }
+#endif
+
+ // Track progress along the critical path through the SUnit graph as we walk
+ // the instructions.
+ const SUnit *CriticalPathSU = Max;
+ MachineInstr *CriticalPathMI = CriticalPathSU->getInstr();
+
+ // Consider this pattern:
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // There are three anti-dependencies here, and without special care,
+ // we'd break all of them using the same register:
+ // A = ...
+ // ... = A
+ // B = ...
+ // ... = B
+ // B = ...
+ // ... = B
+ // B = ...
+ // ... = B
+ // because at each anti-dependence, B is the first register that
+ // isn't A which is free. This re-introduces anti-dependencies
+ // at all but one of the original anti-dependencies that we were
+ // trying to break. To avoid this, keep track of the most recent
+ // register that each register was replaced with, and avoid
+ // using it to repair an anti-dependence on the same register.
+ // This lets us produce this:
+ // A = ...
+ // ... = A
+ // B = ...
+ // ... = B
+ // C = ...
+ // ... = C
+ // B = ...
+ // ... = B
+ // This still has an anti-dependence on B, but at least it isn't on the
+ // original critical path.
+ //
+ // TODO: If we tracked more than one register here, we could potentially
+ // fix that remaining critical edge too. This is a little more involved,
+ // because unlike the most recent register, less recent registers should
+ // still be considered, though only if no other registers are available.
+ std::vector<unsigned> LastNewReg(TRI->getNumRegs(), 0);
+
+ // Attempt to break anti-dependence edges on the critical path. Walk the
+ // instructions from the bottom up, tracking information about liveness
+ // as we go to help determine which registers are available.
+ unsigned Broken = 0;
+ unsigned Count = InsertPosIndex - 1;
+ for (MachineBasicBlock::iterator I = End, E = Begin; I != E; --Count) {
+ MachineInstr &MI = *--I;
+ // Kill instructions can define registers but are really nops, and there
+ // might be a real definition earlier that needs to be paired with uses
+ // dominated by this kill.
+
+ // FIXME: It may be possible to remove the isKill() restriction once PR18663
+ // has been properly fixed. There can be value in processing kills as seen
+ // in the AggressiveAntiDepBreaker class.
+ if (MI.isDebugInstr() || MI.isKill())
+ continue;
+
+ // Check if this instruction has a dependence on the critical path that
+ // is an anti-dependence that we may be able to break. If it is, set
+ // AntiDepReg to the non-zero register associated with the anti-dependence.
+ //
+ // We limit our attention to the critical path as a heuristic to avoid
+ // breaking anti-dependence edges that aren't going to significantly
+ // impact the overall schedule. There are a limited number of registers
+ // and we want to save them for the important edges.
+ //
+ // TODO: Instructions with multiple defs could have multiple
+ // anti-dependencies. The current code here only knows how to break one
+ // edge per instruction. Note that we'd have to be able to break all of
+ // the anti-dependencies in an instruction in order to be effective.
+ unsigned AntiDepReg = 0;
+ if (&MI == CriticalPathMI) {
+ if (const SDep *Edge = CriticalPathStep(CriticalPathSU)) {
+ const SUnit *NextSU = Edge->getSUnit();
+
+ // Only consider anti-dependence edges.
+ if (Edge->getKind() == SDep::Anti) {
+ AntiDepReg = Edge->getReg();
+ assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+ if (!MRI.isAllocatable(AntiDepReg))
+ // Don't break anti-dependencies on non-allocatable registers.
+ AntiDepReg = 0;
+ else if (KeepRegs.test(AntiDepReg))
+ // Don't break anti-dependencies if a use down below requires
+ // this exact register.
+ AntiDepReg = 0;
+ else {
+ // If the SUnit has other dependencies on the SUnit that it
+ // anti-depends on, don't bother breaking the anti-dependency
+ // since those edges would prevent such units from being
+ // scheduled past each other regardless.
+ //
+ // Also, if there are dependencies on other SUnits with the
+ // same register as the anti-dependency, don't attempt to
+ // break it.
+ for (const SDep &P : CriticalPathSU->Preds)
+ if (P.getSUnit() == NextSU
+ ? (P.getKind() != SDep::Anti || P.getReg() != AntiDepReg)
+ : (P.getKind() == SDep::Data &&
+ P.getReg() == AntiDepReg)) {
+ AntiDepReg = 0;
+ break;
+ }
+ }
+ }
+ CriticalPathSU = NextSU;
+ CriticalPathMI = CriticalPathSU->getInstr();
+ } else {
+ // We've reached the end of the critical path.
+ CriticalPathSU = nullptr;
+ CriticalPathMI = nullptr;
+ }
+ }
+
+ PrescanInstruction(MI);
+
+ SmallVector<unsigned, 2> ForbidRegs;
+
+ // If MI's defs have a special allocation requirement, don't allow
+ // any def registers to be changed. Also assume all registers
+ // defined in a call must not be changed (ABI).
+ if (MI.isCall() || MI.hasExtraDefRegAllocReq() || TII->isPredicated(MI))
+ // If this instruction's defs have special allocation requirement, don't
+ // break this anti-dependency.
+ AntiDepReg = 0;
+ else if (AntiDepReg) {
+ // If this instruction has a use of AntiDepReg, breaking it
+ // is invalid. If the instruction defines other registers,
+ // save a list of them so that we don't pick a new register
+ // that overlaps any of them.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (MO.isUse() && TRI->regsOverlap(AntiDepReg, Reg)) {
+ AntiDepReg = 0;
+ break;
+ }
+ if (MO.isDef() && Reg != AntiDepReg)
+ ForbidRegs.push_back(Reg);
+ }
+ }
+
+ // Determine AntiDepReg's register class, if it is live and is
+ // consistently used within a single class.
+ const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg]
+ : nullptr;
+ assert((AntiDepReg == 0 || RC != nullptr) &&
+ "Register should be live if it's causing an anti-dependence!");
+ if (RC == reinterpret_cast<TargetRegisterClass *>(-1))
+ AntiDepReg = 0;
+
+ // Look for a suitable register to use to break the anti-dependence.
+ //
+ // TODO: Instead of picking the first free register, consider which might
+ // be the best.
+ if (AntiDepReg != 0) {
+ std::pair<std::multimap<unsigned, MachineOperand *>::iterator,
+ std::multimap<unsigned, MachineOperand *>::iterator>
+ Range = RegRefs.equal_range(AntiDepReg);
+ if (unsigned NewReg = findSuitableFreeRegister(Range.first, Range.second,
+ AntiDepReg,
+ LastNewReg[AntiDepReg],
+ RC, ForbidRegs)) {
+ LLVM_DEBUG(dbgs() << "Breaking anti-dependence edge on "
+ << printReg(AntiDepReg, TRI) << " with "
+ << RegRefs.count(AntiDepReg) << " references"
+ << " using " << printReg(NewReg, TRI) << "!\n");
+
+ // Update the references to the old register to refer to the new
+ // register.
+ for (std::multimap<unsigned, MachineOperand *>::iterator
+ Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+ Q->second->setReg(NewReg);
+ // If the SU for the instruction being updated has debug information
+ // related to the anti-dependency register, make sure to update that
+ // as well.
+ const SUnit *SU = MISUnitMap[Q->second->getParent()];
+ if (!SU) continue;
+ UpdateDbgValues(DbgValues, Q->second->getParent(),
+ AntiDepReg, NewReg);
+ }
+
+ // We just went back in time and modified history; the
+ // liveness information for the anti-dependence reg is now
+ // inconsistent. Set the state as if it were dead.
+ Classes[NewReg] = Classes[AntiDepReg];
+ DefIndices[NewReg] = DefIndices[AntiDepReg];
+ KillIndices[NewReg] = KillIndices[AntiDepReg];
+ assert(((KillIndices[NewReg] == ~0u) !=
+ (DefIndices[NewReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for NewReg!");
+
+ Classes[AntiDepReg] = nullptr;
+ DefIndices[AntiDepReg] = KillIndices[AntiDepReg];
+ KillIndices[AntiDepReg] = ~0u;
+ assert(((KillIndices[AntiDepReg] == ~0u) !=
+ (DefIndices[AntiDepReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for AntiDepReg!");
+
+ RegRefs.erase(AntiDepReg);
+ LastNewReg[AntiDepReg] = NewReg;
+ ++Broken;
+ }
+ }
+
+ ScanInstruction(MI, Count);
+ }
+
+ return Broken;
+}
+
+AntiDepBreaker *
+llvm::createCriticalAntiDepBreaker(MachineFunction &MFi,
+ const RegisterClassInfo &RCI) {
+ return new CriticalAntiDepBreaker(MFi, RCI);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.h b/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
new file mode 100644
index 000000000000..640506b6e9ed
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -0,0 +1,112 @@
+//===- llvm/CodeGen/CriticalAntiDepBreaker.h - Anti-Dep Support -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CriticalAntiDepBreaker class, which
+// implements register anti-dependence breaking along a block's
+// critical path during the post-RA scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/AntiDepBreaker.h"
+#include "llvm/Support/Compiler.h"
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineFunction;
+class MachineInstr;
+class MachineOperand;
+class MachineRegisterInfo;
+class RegisterClassInfo;
+class TargetInstrInfo;
+class TargetRegisterClass;
+class TargetRegisterInfo;
+
+class LLVM_LIBRARY_VISIBILITY CriticalAntiDepBreaker : public AntiDepBreaker {
+ MachineFunction& MF;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const RegisterClassInfo &RegClassInfo;
+
+ /// The set of allocatable registers.
+ /// We'll be ignoring anti-dependencies on non-allocatable registers,
+ /// because they may not be safe to break.
+ const BitVector AllocatableSet;
+
+ /// For live regs that are only used in one register class in a
+ /// live range, the register class. If the register is not live, the
+ /// corresponding value is null. If the register is live but used in
+ /// multiple register classes, the corresponding value is -1 cast to a
+ /// pointer.
+ std::vector<const TargetRegisterClass *> Classes;
+
+ /// Map registers to all their references within a live range.
+ std::multimap<unsigned, MachineOperand *> RegRefs;
+
+ using RegRefIter =
+ std::multimap<unsigned, MachineOperand *>::const_iterator;
+
+ /// The index of the most recent kill (proceeding bottom-up),
+ /// or ~0u if the register is not live.
+ std::vector<unsigned> KillIndices;
+
+ /// The index of the most recent complete def (proceeding
+ /// bottom up), or ~0u if the register is live.
+ std::vector<unsigned> DefIndices;
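+ // Note: for each register, exactly one of KillIndices[Reg] and
+ // DefIndices[Reg] is ~0u at any point; live registers have a valid kill
+ // index and dead registers a valid def index. The breaker asserts this
+ // invariant for the registers it examines.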
+
+ /// A set of registers which are live and cannot be changed to
+ /// break anti-dependencies.
+ BitVector KeepRegs;
+
+ public:
+ CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI);
+ ~CriticalAntiDepBreaker() override;
+
+ /// Initialize anti-dep breaking for a new basic block.
+ void StartBlock(MachineBasicBlock *BB) override;
+
+ /// Identify anti-dependencies along the critical path
+ /// of the ScheduleDAG and break them by renaming registers.
+ unsigned BreakAntiDependencies(const std::vector<SUnit> &SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) override;
+
+ /// Update liveness information to account for the current
+ /// instruction, which will not be scheduled.
+ void Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) override;
+
+ /// Finish anti-dep breaking for a basic block.
+ void FinishBlock() override;
+
+ private:
+ void PrescanInstruction(MachineInstr &MI);
+ void ScanInstruction(MachineInstr &MI, unsigned Count);
+ bool isNewRegClobberedByRefs(RegRefIter RegRefBegin,
+ RegRefIter RegRefEnd,
+ unsigned NewReg);
+ unsigned findSuitableFreeRegister(RegRefIter RegRefBegin,
+ RegRefIter RegRefEnd,
+ unsigned AntiDepReg,
+ unsigned LastNewReg,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<unsigned> &Forbid);
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/DFAPacketizer.cpp
new file mode 100644
index 000000000000..48bb4a07662e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -0,0 +1,288 @@
+//=- llvm/CodeGen/DFAPacketizer.cpp - DFA Packetizer for VLIW -*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This class implements a deterministic finite automaton (DFA) based
+// packetizing mechanism for VLIW architectures. It provides APIs to
+// determine whether there exists a legal mapping of instructions to
+// functional unit assignments in a packet. The DFA is auto-generated from
+// the target's Schedule.td file.
+//
+// A DFA consists of 3 major elements: states, inputs, and transitions. For
+// the packetizing mechanism, the input is the set of instruction classes for
+// a target. The state models all possible combinations of functional unit
+// consumption for a given set of instructions in a packet. A transition
+// models the addition of an instruction to a packet. In the DFA constructed
+// by this class, if an instruction can be added to a packet, then a valid
+// transition exists from the corresponding state. Invalid transitions
+// indicate that the instruction cannot be added to the current packet.
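+//
+// A typical client drives the DFA roughly as follows (sketch only; the
+// variable names are illustrative):
+//
+//   DFAPacketizer *RT = TII->CreateTargetScheduleState(MF.getSubtarget());
+//   for (MachineInstr &MI : Candidates)
+//     if (RT->canReserveResources(MI))
+//       RT->reserveResources(MI);  // MI joins the current packet.
+//     else
+//       RT->clearResources();      // Start a new packet.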
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "packets"
+
+static cl::opt<unsigned> InstrLimit("dfa-instr-limit", cl::Hidden,
+ cl::init(0), cl::desc("If present, stops packetizing after N instructions"));
+
+static unsigned InstrCount = 0;
+
+// Check if the resources occupied by a MCInstrDesc are available in the
+// current state.
+bool DFAPacketizer::canReserveResources(const MCInstrDesc *MID) {
+ unsigned Action = ItinActions[MID->getSchedClass()];
+ if (MID->getSchedClass() == 0 || Action == 0)
+ return false;
+ return A.canAdd(Action);
+}
+
+// Reserve the resources occupied by a MCInstrDesc and change the current
+// state to reflect that change.
+void DFAPacketizer::reserveResources(const MCInstrDesc *MID) {
+ unsigned Action = ItinActions[MID->getSchedClass()];
+ if (MID->getSchedClass() == 0 || Action == 0)
+ return;
+ A.add(Action);
+}
+
+// Check if the resources occupied by a machine instruction are available
+// in the current state.
+bool DFAPacketizer::canReserveResources(MachineInstr &MI) {
+ const MCInstrDesc &MID = MI.getDesc();
+ return canReserveResources(&MID);
+}
+
+// Reserve the resources occupied by a machine instruction and change the
+// current state to reflect that change.
+void DFAPacketizer::reserveResources(MachineInstr &MI) {
+ const MCInstrDesc &MID = MI.getDesc();
+ reserveResources(&MID);
+}
+
+unsigned DFAPacketizer::getUsedResources(unsigned InstIdx) {
+ ArrayRef<NfaPath> NfaPaths = A.getNfaPaths();
+ assert(!NfaPaths.empty() && "Invalid bundle!");
+ const NfaPath &RS = NfaPaths.front();
+
+ // RS stores the cumulative resources used up to and including the I'th
+ // instruction. The 0th instruction is the base case.
+ if (InstIdx == 0)
+ return RS[0];
+ // Return the difference between the cumulative resources used by InstIdx and
+ // its predecessor.
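+ // For example (values purely illustrative), with cumulative resources
+ // RS = {0x1, 0x3, 0x7}, instruction 1 used 0x3 ^ 0x1 = 0x2 and
+ // instruction 2 used 0x7 ^ 0x3 = 0x4.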
+ return RS[InstIdx] ^ RS[InstIdx - 1];
+}
+
+DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
+ MachineLoopInfo &MLI,
+ AAResults *AA)
+ : ScheduleDAGInstrs(MF, &MLI), AA(AA) {
+ CanHandleTerminators = true;
+}
+
+/// Apply each ScheduleDAGMutation step in order.
+void DefaultVLIWScheduler::postProcessDAG() {
+ for (auto &M : Mutations)
+ M->apply(this);
+}
+
+void DefaultVLIWScheduler::schedule() {
+ // Build the scheduling graph.
+ buildSchedGraph(AA);
+ postProcessDAG();
+}
+
+VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf,
+ MachineLoopInfo &mli, AAResults *aa)
+ : MF(mf), TII(mf.getSubtarget().getInstrInfo()), AA(aa) {
+ ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget());
+ ResourceTracker->setTrackResources(true);
+ VLIWScheduler = new DefaultVLIWScheduler(MF, mli, AA);
+}
+
+VLIWPacketizerList::~VLIWPacketizerList() {
+ delete VLIWScheduler;
+ delete ResourceTracker;
+}
+
+// End the current packet, bundle packet instructions and reset DFA state.
+void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MI) {
+ LLVM_DEBUG({
+ if (!CurrentPacketMIs.empty()) {
+ dbgs() << "Finalizing packet:\n";
+ unsigned Idx = 0;
+ for (MachineInstr *MI : CurrentPacketMIs) {
+ unsigned R = ResourceTracker->getUsedResources(Idx++);
+ dbgs() << " * [res:0x" << utohexstr(R) << "] " << *MI;
+ }
+ }
+ });
+ if (CurrentPacketMIs.size() > 1) {
+ MachineInstr &MIFirst = *CurrentPacketMIs.front();
+ finalizeBundle(*MBB, MIFirst.getIterator(), MI.getInstrIterator());
+ }
+ CurrentPacketMIs.clear();
+ ResourceTracker->clearResources();
+ LLVM_DEBUG(dbgs() << "End packet\n");
+}
+
+// Bundle machine instructions into packets.
+void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator BeginItr,
+ MachineBasicBlock::iterator EndItr) {
+ assert(VLIWScheduler && "VLIW Scheduler is not initialized!");
+ VLIWScheduler->startBlock(MBB);
+ VLIWScheduler->enterRegion(MBB, BeginItr, EndItr,
+ std::distance(BeginItr, EndItr));
+ VLIWScheduler->schedule();
+
+ LLVM_DEBUG({
+ dbgs() << "Scheduling DAG of the packetize region\n";
+ VLIWScheduler->dump();
+ });
+
+ // Generate MI -> SU map.
+ MIToSUnit.clear();
+ for (SUnit &SU : VLIWScheduler->SUnits)
+ MIToSUnit[SU.getInstr()] = &SU;
+
+ bool LimitPresent = InstrLimit.getPosition();
+
+ // The main packetizer loop.
+ for (; BeginItr != EndItr; ++BeginItr) {
+ if (LimitPresent) {
+ if (InstrCount >= InstrLimit) {
+ EndItr = BeginItr;
+ break;
+ }
+ InstrCount++;
+ }
+ MachineInstr &MI = *BeginItr;
+ initPacketizerState();
+
+ // End the current packet if needed.
+ if (isSoloInstruction(MI)) {
+ endPacket(MBB, MI);
+ continue;
+ }
+
+ // Ignore pseudo instructions.
+ if (ignorePseudoInstruction(MI, MBB))
+ continue;
+
+ SUnit *SUI = MIToSUnit[&MI];
+ assert(SUI && "Missing SUnit Info!");
+
+ // Ask the DFA whether machine resources are available for MI.
+ LLVM_DEBUG(dbgs() << "Checking resources for adding MI to packet " << MI);
+
+ bool ResourceAvail = ResourceTracker->canReserveResources(MI);
+ LLVM_DEBUG({
+ if (ResourceAvail)
+ dbgs() << " Resources are available for adding MI to packet\n";
+ else
+ dbgs() << " Resources NOT available\n";
+ });
+ if (ResourceAvail && shouldAddToPacket(MI)) {
+ // Dependency check for MI with instructions in CurrentPacketMIs.
+ for (auto *MJ : CurrentPacketMIs) {
+ SUnit *SUJ = MIToSUnit[MJ];
+ assert(SUJ && "Missing SUnit Info!");
+
+ LLVM_DEBUG(dbgs() << " Checking against MJ " << *MJ);
+ // Is it legal to packetize SUI and SUJ together?
+ if (!isLegalToPacketizeTogether(SUI, SUJ)) {
+ LLVM_DEBUG(dbgs() << " Not legal to add MI, try to prune\n");
+ // Allow packetization if dependency can be pruned.
+ if (!isLegalToPruneDependencies(SUI, SUJ)) {
+ // End the packet if dependency cannot be pruned.
+ LLVM_DEBUG(dbgs()
+ << " Could not prune dependencies for adding MI\n");
+ endPacket(MBB, MI);
+ break;
+ }
+ LLVM_DEBUG(dbgs() << " Pruned dependence for adding MI\n");
+ }
+ }
+ } else {
+ LLVM_DEBUG(if (ResourceAvail) dbgs()
+ << "Resources are available, but instruction should not be "
+ "added to packet\n "
+ << MI);
+ // End the packet if resource is not available, or if the instruction
+ // should not be added to the current packet.
+ endPacket(MBB, MI);
+ }
+
+ // Add MI to the current packet.
+ LLVM_DEBUG(dbgs() << "* Adding MI to packet " << MI << '\n');
+ BeginItr = addToPacket(MI);
+ } // For all instructions in the packetization range.
+
+ // End any packet left behind.
+ endPacket(MBB, EndItr);
+ VLIWScheduler->exitRegion();
+ VLIWScheduler->finishBlock();
+}
+
+bool VLIWPacketizerList::alias(const MachineMemOperand &Op1,
+ const MachineMemOperand &Op2,
+ bool UseTBAA) const {
+ if (!Op1.getValue() || !Op2.getValue())
+ return true;
+
+ int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
+ int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset;
+ int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset;
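+ // For example (illustrative numbers): Op1 at offset 8 with size 4 and Op2
+ // at offset 4 with size 8 give MinOffset = 4, Overlapa = 4 + 8 - 4 = 8 and
+ // Overlapb = 8 + 4 - 4 = 8, i.e. each location is widened back to the
+ // common minimum offset.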
+
+ AliasResult AAResult =
+ AA->alias(MemoryLocation(Op1.getValue(), Overlapa,
+ UseTBAA ? Op1.getAAInfo() : AAMDNodes()),
+ MemoryLocation(Op2.getValue(), Overlapb,
+ UseTBAA ? Op2.getAAInfo() : AAMDNodes()));
+
+ return AAResult != AliasResult::NoAlias;
+}
+
+bool VLIWPacketizerList::alias(const MachineInstr &MI1,
+ const MachineInstr &MI2,
+ bool UseTBAA) const {
+ if (MI1.memoperands_empty() || MI2.memoperands_empty())
+ return true;
+
+ for (const MachineMemOperand *Op1 : MI1.memoperands())
+ for (const MachineMemOperand *Op2 : MI2.memoperands())
+ if (alias(*Op1, *Op2, UseTBAA))
+ return true;
+ return false;
+}
+
+// Add a DAG mutation object to the ordered list.
+void VLIWPacketizerList::addMutation(
+ std::unique_ptr<ScheduleDAGMutation> Mutation) {
+ VLIWScheduler->addMutation(std::move(Mutation));
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
new file mode 100644
index 000000000000..6a7de3b241fe
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -0,0 +1,151 @@
+//===- DeadMachineInstructionElim.cpp - Remove dead machine instructions --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an extremely simple MachineInstr-level dead-code-elimination pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dead-mi-elimination"
+
+STATISTIC(NumDeletes, "Number of dead instructions deleted");
+
+namespace {
+ class DeadMachineInstructionElim : public MachineFunctionPass {
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const MachineRegisterInfo *MRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ LiveRegUnits LivePhysRegs;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ DeadMachineInstructionElim() : MachineFunctionPass(ID) {
+ initializeDeadMachineInstructionElimPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ bool isDead(const MachineInstr *MI) const;
+
+ bool eliminateDeadMI(MachineFunction &MF);
+ };
+}
+char DeadMachineInstructionElim::ID = 0;
+char &llvm::DeadMachineInstructionElimID = DeadMachineInstructionElim::ID;
+
+INITIALIZE_PASS(DeadMachineInstructionElim, DEBUG_TYPE,
+ "Remove dead machine instructions", false, false)
+
+bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
+ // Technically speaking, inline asm with no side effects and no defs can
+ // still be deleted. But there is so much bad inline asm code out there,
+ // we should just leave it alone.
+ if (MI->isInlineAsm())
+ return false;
+
+ // Don't delete frame allocation labels.
+ if (MI->getOpcode() == TargetOpcode::LOCAL_ESCAPE)
+ return false;
+
+ // Don't delete instructions with side effects.
+ bool SawStore = false;
+ if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI())
+ return false;
+
+ // Examine each operand.
+ for (const MachineOperand &MO : MI->all_defs()) {
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ // Don't delete live physreg defs, or any reserved register defs.
+ if (!LivePhysRegs.available(Reg) || MRI->isReserved(Reg))
+ return false;
+ } else {
+ if (MO.isDead()) {
+#ifndef NDEBUG
+ // Basic check on the register. All of its uses should be 'undef'.
+ for (auto &U : MRI->use_nodbg_operands(Reg))
+ assert(U.isUndef() && "'Undef' use on a 'dead' register is found!");
+#endif
+ continue;
+ }
+ for (const MachineInstr &Use : MRI->use_nodbg_instructions(Reg)) {
+ if (&Use != MI)
+ // This def has a non-debug use. Don't delete the instruction!
+ return false;
+ }
+ }
+ }
+
+ // If there are no defs with uses, the instruction is dead.
+ return true;
+}
+
+bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ MRI = &MF.getRegInfo();
+
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ TII = ST.getInstrInfo();
+ LivePhysRegs.init(*ST.getRegisterInfo());
+
+ bool AnyChanges = eliminateDeadMI(MF);
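+ // Deleting an instruction can make the instructions that feed it dead as
+ // well, so iterate until a pass over the function makes no further changes.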
+ while (AnyChanges && eliminateDeadMI(MF))
+ ;
+ return AnyChanges;
+}
+
+bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) {
+ bool AnyChanges = false;
+
+ // Loop over all instructions in all blocks, from bottom to top, so that it's
+ // more likely that chains of dependent but ultimately dead instructions will
+ // be cleaned up.
+ for (MachineBasicBlock *MBB : post_order(&MF)) {
+ LivePhysRegs.addLiveOuts(*MBB);
+
+ // Now scan the instructions and delete dead ones, tracking physreg
+ // liveness as we go.
+ for (MachineInstr &MI : make_early_inc_range(reverse(*MBB))) {
+ // If the instruction is dead, delete it!
+ if (isDead(&MI)) {
+ LLVM_DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << MI);
+ // It is possible that some DBG_VALUE instructions refer to this
+ // instruction. They will be deleted in the live debug variable
+ // analysis.
+ MI.eraseFromParent();
+ AnyChanges = true;
+ ++NumDeletes;
+ continue;
+ }
+
+ LivePhysRegs.stepBackward(MI);
+ }
+ }
+
+ LivePhysRegs.clear();
+ return AnyChanges;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/DetectDeadLanes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/DetectDeadLanes.cpp
new file mode 100644
index 000000000000..86e9f3abe010
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/DetectDeadLanes.cpp
@@ -0,0 +1,566 @@
+//===- DetectDeadLanes.cpp - SubRegister Lane Usage Analysis --*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Analysis that tracks defined/used subregister lanes across COPY instructions
+/// and instructions that get lowered to a COPY (PHI, REG_SEQUENCE,
+/// INSERT_SUBREG, EXTRACT_SUBREG).
+/// The information is used to detect dead definitions and the usage of
+/// (completely) undefined values and mark the operands as such.
+/// This pass is necessary because the dead/undef status is not obvious anymore
+/// when subregisters are involved.
+///
+/// Example:
+/// %0 = some definition
+/// %1 = IMPLICIT_DEF
+/// %2 = REG_SEQUENCE %0, sub0, %1, sub1
+/// %3 = EXTRACT_SUBREG %2, sub1
+/// = use %3
+/// The %0 definition is dead and %3 contains an undefined value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DetectDeadLanes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "detect-dead-lanes"
+
+DeadLaneDetector::DeadLaneDetector(const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI)
+ : MRI(MRI), TRI(TRI) {
+ unsigned NumVirtRegs = MRI->getNumVirtRegs();
+ VRegInfos = std::unique_ptr<VRegInfo[]>(new VRegInfo[NumVirtRegs]);
+ WorklistMembers.resize(NumVirtRegs);
+ DefinedByCopy.resize(NumVirtRegs);
+}
+
+/// Returns true if \p MI will get lowered to a series of COPY instructions.
+/// We call this a COPY-like instruction.
+static bool lowersToCopies(const MachineInstr &MI) {
+ // Note: We could support instructions with MCInstrDesc::isRegSequenceLike(),
+ // isExtractSubRegLike(), isInsertSubregLike() in the future even though they
+ // are not lowered to a COPY.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::EXTRACT_SUBREG:
+ return true;
+ }
+ return false;
+}
+
+static bool isCrossCopy(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ const TargetRegisterClass *DstRC,
+ const MachineOperand &MO) {
+ assert(lowersToCopies(MI));
+ Register SrcReg = MO.getReg();
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ if (DstRC == SrcRC)
+ return false;
+
+ unsigned SrcSubIdx = MO.getSubReg();
+
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ unsigned DstSubIdx = 0;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::INSERT_SUBREG:
+ if (MO.getOperandNo() == 2)
+ DstSubIdx = MI.getOperand(3).getImm();
+ break;
+ case TargetOpcode::REG_SEQUENCE: {
+ unsigned OpNum = MO.getOperandNo();
+ DstSubIdx = MI.getOperand(OpNum+1).getImm();
+ break;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ unsigned SubReg = MI.getOperand(2).getImm();
+ SrcSubIdx = TRI.composeSubRegIndices(SubReg, SrcSubIdx);
+ }
+ }
+
+ unsigned PreA, PreB; // Unused.
+ if (SrcSubIdx && DstSubIdx)
+ return !TRI.getCommonSuperRegClass(SrcRC, SrcSubIdx, DstRC, DstSubIdx, PreA,
+ PreB);
+ if (SrcSubIdx)
+ return !TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSubIdx);
+ if (DstSubIdx)
+ return !TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSubIdx);
+ return !TRI.getCommonSubClass(SrcRC, DstRC);
+}
+
+void DeadLaneDetector::addUsedLanesOnOperand(const MachineOperand &MO,
+ LaneBitmask UsedLanes) {
+ if (!MO.readsReg())
+ return;
+ Register MOReg = MO.getReg();
+ if (!MOReg.isVirtual())
+ return;
+
+ unsigned MOSubReg = MO.getSubReg();
+ if (MOSubReg != 0)
+ UsedLanes = TRI->composeSubRegIndexLaneMask(MOSubReg, UsedLanes);
+ UsedLanes &= MRI->getMaxLaneMaskForVReg(MOReg);
+
+ unsigned MORegIdx = Register::virtReg2Index(MOReg);
+ DeadLaneDetector::VRegInfo &MORegInfo = VRegInfos[MORegIdx];
+ LaneBitmask PrevUsedLanes = MORegInfo.UsedLanes;
+ // Any change at all?
+ if ((UsedLanes & ~PrevUsedLanes).none())
+ return;
+
+ // Set UsedLanes and remember instruction for further propagation.
+ MORegInfo.UsedLanes = PrevUsedLanes | UsedLanes;
+ if (DefinedByCopy.test(MORegIdx))
+ PutInWorklist(MORegIdx);
+}
+
+void DeadLaneDetector::transferUsedLanesStep(const MachineInstr &MI,
+ LaneBitmask UsedLanes) {
+ for (const MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+ LaneBitmask UsedOnMO = transferUsedLanes(MI, UsedLanes, MO);
+ addUsedLanesOnOperand(MO, UsedOnMO);
+ }
+}
+
+LaneBitmask
+DeadLaneDetector::transferUsedLanes(const MachineInstr &MI,
+ LaneBitmask UsedLanes,
+ const MachineOperand &MO) const {
+ unsigned OpNum = MO.getOperandNo();
+ assert(lowersToCopies(MI) &&
+ DefinedByCopy[Register::virtReg2Index(MI.getOperand(0).getReg())]);
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ return UsedLanes;
+ case TargetOpcode::REG_SEQUENCE: {
+ assert(OpNum % 2 == 1);
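+ // E.g. for %2 = REG_SEQUENCE %0, sub0, %1, sub1, a use of only the sub1
+ // lanes of %2 maps back to a (full) use of the operand placed at sub1
+ // (here %1); the reverse composition below performs that mapping.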
+ unsigned SubIdx = MI.getOperand(OpNum + 1).getImm();
+ return TRI->reverseComposeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ }
+ case TargetOpcode::INSERT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(3).getImm();
+ LaneBitmask MO2UsedLanes =
+ TRI->reverseComposeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ if (OpNum == 2)
+ return MO2UsedLanes;
+
+ const MachineOperand &Def = MI.getOperand(0);
+ Register DefReg = Def.getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ LaneBitmask MO1UsedLanes;
+ if (RC->CoveredBySubRegs)
+ MO1UsedLanes = UsedLanes & ~TRI->getSubRegIndexLaneMask(SubIdx);
+ else
+ MO1UsedLanes = RC->LaneMask;
+
+ assert(OpNum == 1);
+ return MO1UsedLanes;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ assert(OpNum == 1);
+ unsigned SubIdx = MI.getOperand(2).getImm();
+ return TRI->composeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ }
+ default:
+ llvm_unreachable("function must be called with COPY-like instruction");
+ }
+}
+
+void DeadLaneDetector::transferDefinedLanesStep(const MachineOperand &Use,
+ LaneBitmask DefinedLanes) {
+ if (!Use.readsReg())
+ return;
+ // Check whether the operand writes a vreg and is part of a COPY-like
+ // instruction.
+ const MachineInstr &MI = *Use.getParent();
+ if (MI.getDesc().getNumDefs() != 1)
+ return;
+ // FIXME: PATCHPOINT instructions announce a Def that does not always exist,
+ // they really need to be modeled differently!
+ if (MI.getOpcode() == TargetOpcode::PATCHPOINT)
+ return;
+ const MachineOperand &Def = *MI.defs().begin();
+ Register DefReg = Def.getReg();
+ if (!DefReg.isVirtual())
+ return;
+ unsigned DefRegIdx = Register::virtReg2Index(DefReg);
+ if (!DefinedByCopy.test(DefRegIdx))
+ return;
+
+ unsigned OpNum = Use.getOperandNo();
+ DefinedLanes =
+ TRI->reverseComposeSubRegIndexLaneMask(Use.getSubReg(), DefinedLanes);
+ DefinedLanes = transferDefinedLanes(Def, OpNum, DefinedLanes);
+
+ VRegInfo &RegInfo = VRegInfos[DefRegIdx];
+ LaneBitmask PrevDefinedLanes = RegInfo.DefinedLanes;
+ // Any change at all?
+ if ((DefinedLanes & ~PrevDefinedLanes).none())
+ return;
+
+ RegInfo.DefinedLanes = PrevDefinedLanes | DefinedLanes;
+ PutInWorklist(DefRegIdx);
+}
+
+LaneBitmask DeadLaneDetector::transferDefinedLanes(
+ const MachineOperand &Def, unsigned OpNum, LaneBitmask DefinedLanes) const {
+ const MachineInstr &MI = *Def.getParent();
+ // Translate DefinedLanes if necessary.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::REG_SEQUENCE: {
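+ // Lanes defined through a REG_SEQUENCE input end up in the subregister
+ // named by the accompanying index operand, so compose into that
+ // subregister and clip to its lane mask.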
+ unsigned SubIdx = MI.getOperand(OpNum + 1).getImm();
+ DefinedLanes = TRI->composeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ DefinedLanes &= TRI->getSubRegIndexLaneMask(SubIdx);
+ break;
+ }
+ case TargetOpcode::INSERT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(3).getImm();
+ if (OpNum == 2) {
+ DefinedLanes = TRI->composeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ DefinedLanes &= TRI->getSubRegIndexLaneMask(SubIdx);
+ } else {
+ assert(OpNum == 1 && "INSERT_SUBREG must have two operands");
+ // Ignore lanes defined by operand 2.
+ DefinedLanes &= ~TRI->getSubRegIndexLaneMask(SubIdx);
+ }
+ break;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(2).getImm();
+ assert(OpNum == 1 && "EXTRACT_SUBREG must have one register operand only");
+ DefinedLanes = TRI->reverseComposeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ break;
+ }
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ break;
+ default:
+ llvm_unreachable("function must be called with COPY-like instruction");
+ }
+
+ assert(Def.getSubReg() == 0 &&
+ "Should not have subregister defs in machine SSA phase");
+ DefinedLanes &= MRI->getMaxLaneMaskForVReg(Def.getReg());
+ return DefinedLanes;
+}
+
+LaneBitmask DeadLaneDetector::determineInitialDefinedLanes(unsigned Reg) {
+ // Live-In or unused registers have no definition but are considered fully
+ // defined.
+ if (!MRI->hasOneDef(Reg))
+ return LaneBitmask::getAll();
+
+ const MachineOperand &Def = *MRI->def_begin(Reg);
+ const MachineInstr &DefMI = *Def.getParent();
+ if (lowersToCopies(DefMI)) {
+ // Start optimistically with no used or defined lanes for copy
+ // instructions. The following dataflow analysis will add more bits.
+ unsigned RegIdx = Register::virtReg2Index(Reg);
+ DefinedByCopy.set(RegIdx);
+ PutInWorklist(RegIdx);
+
+ if (Def.isDead())
+ return LaneBitmask::getNone();
+
+ // COPY/PHI can copy across unrelated register classes (example: float/int)
+ // with incompatible subregister structure. Do not include these in the
+ // dataflow analysis since we cannot transfer lanemasks in a meaningful way.
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+
+ // Determine initially DefinedLanes.
+ LaneBitmask DefinedLanes;
+ for (const MachineOperand &MO : DefMI.uses()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+ Register MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+
+ LaneBitmask MODefinedLanes;
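+ // Physical registers and copies across incompatible register classes are
+ // conservatively treated as fully defined.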
+ if (MOReg.isPhysical()) {
+ MODefinedLanes = LaneBitmask::getAll();
+ } else if (isCrossCopy(*MRI, DefMI, DefRC, MO)) {
+ MODefinedLanes = LaneBitmask::getAll();
+ } else {
+ assert(MOReg.isVirtual());
+ if (MRI->hasOneDef(MOReg)) {
+ const MachineOperand &MODef = *MRI->def_begin(MOReg);
+ const MachineInstr &MODefMI = *MODef.getParent();
+ // Bits from copy-like operations will be added later.
+ if (lowersToCopies(MODefMI) || MODefMI.isImplicitDef())
+ continue;
+ }
+ unsigned MOSubReg = MO.getSubReg();
+ MODefinedLanes = MRI->getMaxLaneMaskForVReg(MOReg);
+ MODefinedLanes = TRI->reverseComposeSubRegIndexLaneMask(
+ MOSubReg, MODefinedLanes);
+ }
+
+ unsigned OpNum = MO.getOperandNo();
+ DefinedLanes |= transferDefinedLanes(Def, OpNum, MODefinedLanes);
+ }
+ return DefinedLanes;
+ }
+ if (DefMI.isImplicitDef() || Def.isDead())
+ return LaneBitmask::getNone();
+
+ assert(Def.getSubReg() == 0 &&
+ "Should not have subregister defs in machine SSA phase");
+ return MRI->getMaxLaneMaskForVReg(Reg);
+}
+
+LaneBitmask DeadLaneDetector::determineInitialUsedLanes(unsigned Reg) {
+ LaneBitmask UsedLanes = LaneBitmask::getNone();
+ for (const MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ if (!MO.readsReg())
+ continue;
+
+ const MachineInstr &UseMI = *MO.getParent();
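+ // A KILL pseudo-instruction is not a real read of any lanes.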
+ if (UseMI.isKill())
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (lowersToCopies(UseMI)) {
+ assert(UseMI.getDesc().getNumDefs() == 1);
+ const MachineOperand &Def = *UseMI.defs().begin();
+ Register DefReg = Def.getReg();
+ // The used lanes of COPY-like instruction operands are determined by the
+ // following dataflow analysis.
+ if (DefReg.isVirtual()) {
+ // But ignore copies across incompatible register classes.
+ bool CrossCopy = false;
+ if (lowersToCopies(UseMI)) {
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
+ CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO);
+ if (CrossCopy)
+ LLVM_DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI);
+ }
+
+ if (!CrossCopy)
+ continue;
+ }
+ }
+
+ // Shortcut: All lanes are used.
+ if (SubReg == 0)
+ return MRI->getMaxLaneMaskForVReg(Reg);
+
+ UsedLanes |= TRI->getSubRegIndexLaneMask(SubReg);
+ }
+ return UsedLanes;
+}
+
+namespace {
+
+class DetectDeadLanes : public MachineFunctionPass {
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+ DetectDeadLanes() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "Detect Dead Lanes"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ /// Update the operand status.
+ /// The first return value shows whether MF has been changed.
+ /// The second return value indicates that we need to call
+ /// DeadLaneDetector::computeSubRegisterLaneBitInfo and this function again
+ /// to propagate changes.
+ std::pair<bool, bool>
+ modifySubRegisterOperandStatus(const DeadLaneDetector &DLD,
+ MachineFunction &MF);
+
+ bool isUndefRegAtInput(const MachineOperand &MO,
+ const DeadLaneDetector::VRegInfo &RegInfo) const;
+
+ bool isUndefInput(const DeadLaneDetector &DLD, const MachineOperand &MO,
+ bool *CrossCopy) const;
+
+ const MachineRegisterInfo *MRI = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+};
+
+} // end anonymous namespace
+
+char DetectDeadLanes::ID = 0;
+char &llvm::DetectDeadLanesID = DetectDeadLanes::ID;
+
+INITIALIZE_PASS(DetectDeadLanes, DEBUG_TYPE, "Detect Dead Lanes", false, false)
+
+bool DetectDeadLanes::isUndefRegAtInput(
+ const MachineOperand &MO, const DeadLaneDetector::VRegInfo &RegInfo) const {
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg);
+ return (RegInfo.DefinedLanes & RegInfo.UsedLanes & Mask).none();
+}
+
+bool DetectDeadLanes::isUndefInput(const DeadLaneDetector &DLD,
+ const MachineOperand &MO,
+ bool *CrossCopy) const {
+ if (!MO.isUse())
+ return false;
+ const MachineInstr &MI = *MO.getParent();
+ if (!lowersToCopies(MI))
+ return false;
+ const MachineOperand &Def = MI.getOperand(0);
+ Register DefReg = Def.getReg();
+ if (!DefReg.isVirtual())
+ return false;
+ unsigned DefRegIdx = Register::virtReg2Index(DefReg);
+ if (!DLD.isDefinedByCopy(DefRegIdx))
+ return false;
+
+ const DeadLaneDetector::VRegInfo &DefRegInfo = DLD.getVRegInfo(DefRegIdx);
+ LaneBitmask UsedLanes = DLD.transferUsedLanes(MI, DefRegInfo.UsedLanes, MO);
+ if (UsedLanes.any())
+ return false;
+
+ Register MOReg = MO.getReg();
+ if (MOReg.isVirtual()) {
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
+ *CrossCopy = isCrossCopy(*MRI, MI, DstRC, MO);
+ }
+ return true;
+}
+
+void DeadLaneDetector::computeSubRegisterLaneBitInfo() {
+ // First pass: Populate defs/uses of vregs with initial values
+ unsigned NumVirtRegs = MRI->getNumVirtRegs();
+ for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
+ Register Reg = Register::index2VirtReg(RegIdx);
+
+ // Determine used/defined lanes and add copy instructions to worklist.
+ VRegInfo &Info = VRegInfos[RegIdx];
+ Info.DefinedLanes = determineInitialDefinedLanes(Reg);
+ Info.UsedLanes = determineInitialUsedLanes(Reg);
+ }
+
+ // Iterate as long as defined lanes/used lanes keep changing.
+ while (!Worklist.empty()) {
+ unsigned RegIdx = Worklist.front();
+ Worklist.pop_front();
+ WorklistMembers.reset(RegIdx);
+ VRegInfo &Info = VRegInfos[RegIdx];
+ Register Reg = Register::index2VirtReg(RegIdx);
+
+ // Transfer UsedLanes to operands of DefMI (backwards dataflow).
+ MachineOperand &Def = *MRI->def_begin(Reg);
+ const MachineInstr &MI = *Def.getParent();
+ transferUsedLanesStep(MI, Info.UsedLanes);
+ // Transfer DefinedLanes to users of Reg (forward dataflow).
+ for (const MachineOperand &MO : MRI->use_nodbg_operands(Reg))
+ transferDefinedLanesStep(MO, Info.DefinedLanes);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Defined/Used lanes:\n";
+ for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
+ Register Reg = Register::index2VirtReg(RegIdx);
+ const VRegInfo &Info = VRegInfos[RegIdx];
+ dbgs() << printReg(Reg, nullptr)
+ << " Used: " << PrintLaneMask(Info.UsedLanes)
+ << " Def: " << PrintLaneMask(Info.DefinedLanes) << '\n';
+ }
+ dbgs() << "\n";
+ });
+}
+
+std::pair<bool, bool>
+DetectDeadLanes::modifySubRegisterOperandStatus(const DeadLaneDetector &DLD,
+ MachineFunction &MF) {
+ bool Changed = false;
+ bool Again = false;
+ // Mark operands as dead/unused.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+ unsigned RegIdx = Register::virtReg2Index(Reg);
+ const DeadLaneDetector::VRegInfo &RegInfo = DLD.getVRegInfo(RegIdx);
+ if (MO.isDef() && !MO.isDead() && RegInfo.UsedLanes.none()) {
+ LLVM_DEBUG(dbgs()
+ << "Marking operand '" << MO << "' as dead in " << MI);
+ MO.setIsDead();
+ Changed = true;
+ }
+ if (MO.readsReg()) {
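+ // A read is undef either when none of its lanes are both defined and
+ // used, or when a COPY-like user needs none of the lanes this operand
+ // provides.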
+ bool CrossCopy = false;
+ if (isUndefRegAtInput(MO, RegInfo)) {
+ LLVM_DEBUG(dbgs()
+ << "Marking operand '" << MO << "' as undef in " << MI);
+ MO.setIsUndef();
+ Changed = true;
+ } else if (isUndefInput(DLD, MO, &CrossCopy)) {
+ LLVM_DEBUG(dbgs()
+ << "Marking operand '" << MO << "' as undef in " << MI);
+ MO.setIsUndef();
+ Changed = true;
+ if (CrossCopy)
+ Again = true;
+ }
+ }
+ }
+ }
+ }
+
+ return std::make_pair(Changed, Again);
+}
+
+bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) {
+ // Don't bother if we won't track subregister liveness later. This pass is
+ // required for correctness if subregister liveness is enabled because the
+ // register coalescer cannot deal with hidden dead defs. However without
+ // subregister liveness enabled, the expected benefits of this pass are small
+ // so we save compile time.
+ MRI = &MF.getRegInfo();
+ if (!MRI->subRegLivenessEnabled()) {
+ LLVM_DEBUG(dbgs() << "Skipping Detect dead lanes pass\n");
+ return false;
+ }
+
+ TRI = MRI->getTargetRegisterInfo();
+
+ DeadLaneDetector DLD(MRI, TRI);
+
+ bool Changed = false;
+ bool Again;
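+ // Marking cross-class copy inputs undef can expose further dead lanes, so
+ // recompute the lane info and rescan until nothing changes.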
+ do {
+ DLD.computeSubRegisterLaneBitInfo();
+ bool LocalChanged;
+ std::tie(LocalChanged, Again) = modifySubRegisterOperandStatus(DLD, MF);
+ Changed |= LocalChanged;
+ } while (Again);
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm-project/llvm/lib/CodeGen/DwarfEHPrepare.cpp
new file mode 100644
index 000000000000..32c94de7280c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -0,0 +1,380 @@
+//===- DwarfEHPrepare - Prepare exception handling for code generation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass mulches exception handling code into a form adapted to code
+// generation. Required if using dwarf exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cstddef>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfehprepare"
+
+STATISTIC(NumResumesLowered, "Number of resume calls lowered");
+STATISTIC(NumCleanupLandingPadsUnreachable,
+ "Number of cleanup landing pads found unreachable");
+STATISTIC(NumCleanupLandingPadsRemaining,
+ "Number of cleanup landing pads remaining");
+STATISTIC(NumNoUnwind, "Number of functions with nounwind");
+STATISTIC(NumUnwind, "Number of functions with unwind");
+
+namespace {
+
+class DwarfEHPrepare {
+ CodeGenOpt::Level OptLevel;
+
+ Function &F;
+ const TargetLowering &TLI;
+ DomTreeUpdater *DTU;
+ const TargetTransformInfo *TTI;
+ const Triple &TargetTriple;
+
+ /// Return the exception object from the value passed into
+ /// the 'resume' instruction (typically an aggregate). Clean up any dead
+ /// instructions, including the 'resume' instruction.
+ Value *GetExceptionObject(ResumeInst *RI);
+
+ /// Replace resumes that are not reachable from a cleanup landing pad with
+ /// unreachable and then simplify those blocks.
+ size_t
+ pruneUnreachableResumes(SmallVectorImpl<ResumeInst *> &Resumes,
+ SmallVectorImpl<LandingPadInst *> &CleanupLPads);
+
+ /// Convert the ResumeInsts that are still present
+ /// into calls to the appropriate _Unwind_Resume function.
+ bool InsertUnwindResumeCalls();
+
+public:
+ DwarfEHPrepare(CodeGenOpt::Level OptLevel_, Function &F_,
+ const TargetLowering &TLI_, DomTreeUpdater *DTU_,
+ const TargetTransformInfo *TTI_, const Triple &TargetTriple_)
+ : OptLevel(OptLevel_), F(F_), TLI(TLI_), DTU(DTU_), TTI(TTI_),
+ TargetTriple(TargetTriple_) {}
+
+ bool run();
+};
+
+} // namespace
+
+Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
+ Value *V = RI->getOperand(0);
+ Value *ExnObj = nullptr;
+ InsertValueInst *SelIVI = dyn_cast<InsertValueInst>(V);
+ LoadInst *SelLoad = nullptr;
+ InsertValueInst *ExcIVI = nullptr;
+ bool EraseIVIs = false;
+
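+ // Recognize the common pattern where the resume operand was built by two
+ // insertvalue instructions: the exception object inserted at index 0 and
+ // the selector at index 1. If matched, take the exception object directly
+ // and erase the now-dead insertvalues (and selector load) below.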
+ if (SelIVI) {
+ if (SelIVI->getNumIndices() == 1 && *SelIVI->idx_begin() == 1) {
+ ExcIVI = dyn_cast<InsertValueInst>(SelIVI->getOperand(0));
+ if (ExcIVI && isa<UndefValue>(ExcIVI->getOperand(0)) &&
+ ExcIVI->getNumIndices() == 1 && *ExcIVI->idx_begin() == 0) {
+ ExnObj = ExcIVI->getOperand(1);
+ SelLoad = dyn_cast<LoadInst>(SelIVI->getOperand(1));
+ EraseIVIs = true;
+ }
+ }
+ }
+
+ if (!ExnObj)
+ ExnObj = ExtractValueInst::Create(RI->getOperand(0), 0, "exn.obj", RI);
+
+ RI->eraseFromParent();
+
+ if (EraseIVIs) {
+ if (SelIVI->use_empty())
+ SelIVI->eraseFromParent();
+ if (ExcIVI->use_empty())
+ ExcIVI->eraseFromParent();
+ if (SelLoad && SelLoad->use_empty())
+ SelLoad->eraseFromParent();
+ }
+
+ return ExnObj;
+}
+
+size_t DwarfEHPrepare::pruneUnreachableResumes(
+ SmallVectorImpl<ResumeInst *> &Resumes,
+ SmallVectorImpl<LandingPadInst *> &CleanupLPads) {
+ assert(DTU && "Should have DomTreeUpdater here.");
+
+ BitVector ResumeReachable(Resumes.size());
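+ // Record which resumes are reachable from at least one cleanup landing pad.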
+ size_t ResumeIndex = 0;
+ for (auto *RI : Resumes) {
+ for (auto *LP : CleanupLPads) {
+ if (isPotentiallyReachable(LP, RI, nullptr, &DTU->getDomTree())) {
+ ResumeReachable.set(ResumeIndex);
+ break;
+ }
+ }
+ ++ResumeIndex;
+ }
+
+ // If everything is reachable, there is no change.
+ if (ResumeReachable.all())
+ return Resumes.size();
+
+ LLVMContext &Ctx = F.getContext();
+
+ // Otherwise, insert unreachable instructions and call simplifycfg.
+ size_t ResumesLeft = 0;
+ for (size_t I = 0, E = Resumes.size(); I < E; ++I) {
+ ResumeInst *RI = Resumes[I];
+ if (ResumeReachable[I]) {
+ Resumes[ResumesLeft++] = RI;
+ } else {
+ BasicBlock *BB = RI->getParent();
+ new UnreachableInst(Ctx, RI);
+ RI->eraseFromParent();
+ simplifyCFG(BB, *TTI, DTU);
+ }
+ }
+ Resumes.resize(ResumesLeft);
+ return ResumesLeft;
+}
+
+bool DwarfEHPrepare::InsertUnwindResumeCalls() {
+ SmallVector<ResumeInst *, 16> Resumes;
+ SmallVector<LandingPadInst *, 16> CleanupLPads;
+ if (F.doesNotThrow())
+ NumNoUnwind++;
+ else
+ NumUnwind++;
+ for (BasicBlock &BB : F) {
+ if (auto *RI = dyn_cast<ResumeInst>(BB.getTerminator()))
+ Resumes.push_back(RI);
+ if (auto *LP = BB.getLandingPadInst())
+ if (LP->isCleanup())
+ CleanupLPads.push_back(LP);
+ }
+
+ NumCleanupLandingPadsRemaining += CleanupLPads.size();
+
+ if (Resumes.empty())
+ return false;
+
+ // Check the personality, don't do anything if it's scope-based.
+ EHPersonality Pers = classifyEHPersonality(F.getPersonalityFn());
+ if (isScopedEHPersonality(Pers))
+ return false;
+
+ LLVMContext &Ctx = F.getContext();
+
+ size_t ResumesLeft = Resumes.size();
+ if (OptLevel != CodeGenOpt::None) {
+ ResumesLeft = pruneUnreachableResumes(Resumes, CleanupLPads);
+#if LLVM_ENABLE_STATS
+ unsigned NumRemainingLPs = 0;
+ for (BasicBlock &BB : F) {
+ if (auto *LP = BB.getLandingPadInst())
+ if (LP->isCleanup())
+ NumRemainingLPs++;
+ }
+ NumCleanupLandingPadsUnreachable += CleanupLPads.size() - NumRemainingLPs;
+ NumCleanupLandingPadsRemaining -= CleanupLPads.size() - NumRemainingLPs;
+#endif
+ }
+
+ if (ResumesLeft == 0)
+ return true; // We pruned them all.
+
+ // RewindFunction - _Unwind_Resume or the target equivalent.
+ FunctionCallee RewindFunction;
+ CallingConv::ID RewindFunctionCallingConv;
+ FunctionType *FTy;
+ const char *RewindName;
+ bool DoesRewindFunctionNeedExceptionObject;
+
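+ // EHABI-compatible targets with a GNU C++ personality end the cleanup path
+ // with the CXA_END_CLEANUP libcall, which takes no arguments; every other
+ // configuration calls the UNWIND_RESUME libcall with the exception object.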
+ if ((Pers == EHPersonality::GNU_CXX || Pers == EHPersonality::GNU_CXX_SjLj) &&
+ TargetTriple.isTargetEHABICompatible()) {
+ RewindName = TLI.getLibcallName(RTLIB::CXA_END_CLEANUP);
+ FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
+ RewindFunctionCallingConv =
+ TLI.getLibcallCallingConv(RTLIB::CXA_END_CLEANUP);
+ DoesRewindFunctionNeedExceptionObject = false;
+ } else {
+ RewindName = TLI.getLibcallName(RTLIB::UNWIND_RESUME);
+ FTy =
+ FunctionType::get(Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx), false);
+ RewindFunctionCallingConv = TLI.getLibcallCallingConv(RTLIB::UNWIND_RESUME);
+ DoesRewindFunctionNeedExceptionObject = true;
+ }
+ RewindFunction = F.getParent()->getOrInsertFunction(RewindName, FTy);
+
+ // Create the basic block where the _Unwind_Resume call will live.
+ if (ResumesLeft == 1) {
+ // Instead of creating a new BB and PHI node, just append the call to
+ // _Unwind_Resume to the end of the single resume block.
+ ResumeInst *RI = Resumes.front();
+ BasicBlock *UnwindBB = RI->getParent();
+ Value *ExnObj = GetExceptionObject(RI);
+ llvm::SmallVector<Value *, 1> RewindFunctionArgs;
+ if (DoesRewindFunctionNeedExceptionObject)
+ RewindFunctionArgs.push_back(ExnObj);
+
+ // Call the rewind function.
+ CallInst *CI =
+ CallInst::Create(RewindFunction, RewindFunctionArgs, "", UnwindBB);
+ // The verifier requires that all calls of debug-info-bearing functions
+ // from debug-info-bearing functions have a debug location (for inlining
+ // purposes). Assign a dummy location to satisfy the constraint.
+ Function *RewindFn = dyn_cast<Function>(RewindFunction.getCallee());
+ if (RewindFn && RewindFn->getSubprogram())
+ if (DISubprogram *SP = F.getSubprogram())
+ CI->setDebugLoc(DILocation::get(SP->getContext(), 0, 0, SP));
+ CI->setCallingConv(RewindFunctionCallingConv);
+
+ // We never expect _Unwind_Resume to return.
+ CI->setDoesNotReturn();
+ new UnreachableInst(Ctx, UnwindBB);
+ return true;
+ }
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve(Resumes.size());
+
+ llvm::SmallVector<Value *, 1> RewindFunctionArgs;
+
+ BasicBlock *UnwindBB = BasicBlock::Create(Ctx, "unwind_resume", &F);
+ PHINode *PN = PHINode::Create(Type::getInt8PtrTy(Ctx), ResumesLeft, "exn.obj",
+ UnwindBB);
+
+ // Extract the exception object from the ResumeInst and add it to the PHI node
+ // that feeds the _Unwind_Resume call.
+ for (ResumeInst *RI : Resumes) {
+ BasicBlock *Parent = RI->getParent();
+ BranchInst::Create(UnwindBB, Parent);
+ Updates.push_back({DominatorTree::Insert, Parent, UnwindBB});
+
+ Value *ExnObj = GetExceptionObject(RI);
+ PN->addIncoming(ExnObj, Parent);
+
+ ++NumResumesLowered;
+ }
+
+ if (DoesRewindFunctionNeedExceptionObject)
+ RewindFunctionArgs.push_back(PN);
+
+ // Call the function.
+ CallInst *CI =
+ CallInst::Create(RewindFunction, RewindFunctionArgs, "", UnwindBB);
+ CI->setCallingConv(RewindFunctionCallingConv);
+
+ // We never expect _Unwind_Resume to return.
+ CI->setDoesNotReturn();
+ new UnreachableInst(Ctx, UnwindBB);
+
+ if (DTU)
+ DTU->applyUpdates(Updates);
+
+ return true;
+}
+
+bool DwarfEHPrepare::run() {
+ bool Changed = InsertUnwindResumeCalls();
+
+ return Changed;
+}
+
+static bool prepareDwarfEH(CodeGenOpt::Level OptLevel, Function &F,
+ const TargetLowering &TLI, DominatorTree *DT,
+ const TargetTransformInfo *TTI,
+ const Triple &TargetTriple) {
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ return DwarfEHPrepare(OptLevel, F, TLI, DT ? &DTU : nullptr, TTI,
+ TargetTriple)
+ .run();
+}
+
+namespace {
+
+class DwarfEHPrepareLegacyPass : public FunctionPass {
+
+ CodeGenOpt::Level OptLevel;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ DwarfEHPrepareLegacyPass(CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
+ : FunctionPass(ID), OptLevel(OptLevel) {}
+
+ bool runOnFunction(Function &F) override {
+ const TargetMachine &TM =
+ getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ const TargetLowering &TLI = *TM.getSubtargetImpl(F)->getTargetLowering();
+ DominatorTree *DT = nullptr;
+ const TargetTransformInfo *TTI = nullptr;
+ if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DT = &DTWP->getDomTree();
+ if (OptLevel != CodeGenOpt::None) {
+ if (!DT)
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ }
+ return prepareDwarfEH(OptLevel, F, TLI, DT, TTI, TM.getTargetTriple());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (OptLevel != CodeGenOpt::None) {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ StringRef getPassName() const override {
+ return "Exception handling preparation";
+ }
+};
+
+} // end anonymous namespace
+
+char DwarfEHPrepareLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(DwarfEHPrepareLegacyPass, DEBUG_TYPE,
+ "Prepare DWARF exceptions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(DwarfEHPrepareLegacyPass, DEBUG_TYPE,
+ "Prepare DWARF exceptions", false, false)
+
+FunctionPass *llvm::createDwarfEHPass(CodeGenOpt::Level OptLevel) {
+ return new DwarfEHPrepareLegacyPass(OptLevel);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/EHContGuardCatchret.cpp b/contrib/llvm-project/llvm/lib/CodeGen/EHContGuardCatchret.cpp
new file mode 100644
index 000000000000..b26aa792bb93
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/EHContGuardCatchret.cpp
@@ -0,0 +1,82 @@
+//===-- EHContGuardCatchret.cpp - Catchret target symbols -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains a machine function pass to insert a symbol before each
+/// valid catchret target and store this in the MachineFunction's
+/// CatchRetTargets vector. This will be used to emit the table of valid targets
+/// used by EHCont Guard.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ehcontguard-catchret"
+
+STATISTIC(EHContGuardCatchretTargets,
+ "Number of EHCont Guard catchret targets");
+
+namespace {
+
+/// MachineFunction pass to insert a symbol before each valid catchret target
+/// and store these in the MachineFunction's CatchRetTargets vector.
+class EHContGuardCatchret : public MachineFunctionPass {
+public:
+ static char ID;
+
+ EHContGuardCatchret() : MachineFunctionPass(ID) {
+ initializeEHContGuardCatchretPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "EH Cont Guard catchret targets";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char EHContGuardCatchret::ID = 0;
+
+INITIALIZE_PASS(EHContGuardCatchret, "EHContGuardCatchret",
+ "Insert symbols at valid catchret targets for /guard:ehcont",
+ false, false)
+FunctionPass *llvm::createEHContGuardCatchretPass() {
+ return new EHContGuardCatchret();
+}
+
+bool EHContGuardCatchret::runOnMachineFunction(MachineFunction &MF) {
+
+ // Skip modules for which the ehcontguard flag is not set.
+ if (!MF.getMMI().getModule()->getModuleFlag("ehcontguard"))
+ return false;
+
+ // Skip functions that do not have catchret
+ if (!MF.hasEHCatchret())
+ return false;
+
+ bool Result = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.isEHCatchretTarget()) {
+ MF.addCatchretTarget(MBB.getEHCatchretSymbol());
+ EHContGuardCatchretTargets++;
+ Result = true;
+ }
+ }
+
+ return Result;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/EarlyIfConversion.cpp b/contrib/llvm-project/llvm/lib/CodeGen/EarlyIfConversion.cpp
new file mode 100644
index 000000000000..61867d74bfa2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -0,0 +1,1244 @@
+//===-- EarlyIfConversion.cpp - If-conversion on SSA form machine code ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Early if-conversion is for out-of-order CPUs that don't have a lot of
+// predicable instructions. The goal is to eliminate conditional branches that
+// may mispredict.
+//
+// Instructions from both sides of the branch are executed speculatively, and a
+// cmov instruction selects the result.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "early-ifcvt"
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned>
+BlockInstrLimit("early-ifcvt-limit", cl::init(30), cl::Hidden,
+ cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("stress-early-ifcvt", cl::Hidden,
+ cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumDiamondsSeen, "Number of diamonds");
+STATISTIC(NumDiamondsConv, "Number of diamonds converted");
+STATISTIC(NumTrianglesSeen, "Number of triangles");
+STATISTIC(NumTrianglesConv, "Number of triangles converted");
+
+//===----------------------------------------------------------------------===//
+// SSAIfConv
+//===----------------------------------------------------------------------===//
+//
+// The SSAIfConv class performs if-conversion on SSA form machine code after
+// determining if it is possible. The class contains no heuristics; external
+// code should be used to determine when if-conversion is a good idea.
+//
+// SSAIfConv can convert both triangles and diamonds:
+//
+// Triangle: Head Diamond: Head
+// | \ / \_
+// | \ / |
+// | [TF]BB FBB TBB
+// | / \ /
+// | / \ /
+// Tail Tail
+//
+// Instructions in the conditional blocks TBB and/or FBB are spliced into the
+// Head block, and phis in the Tail block are converted to select instructions.
+//
+namespace {
+class SSAIfConv {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ /// The block containing the conditional branch.
+ MachineBasicBlock *Head;
+
+ /// The block containing phis after the if-then-else.
+ MachineBasicBlock *Tail;
+
+ /// The 'true' conditional block as determined by analyzeBranch.
+ MachineBasicBlock *TBB;
+
+ /// The 'false' conditional block as determined by analyzeBranch.
+ MachineBasicBlock *FBB;
+
+ /// isTriangle - When there is no 'else' block, either TBB or FBB will be
+ /// equal to Tail.
+ bool isTriangle() const { return TBB == Tail || FBB == Tail; }
+
+ /// Returns the Tail predecessor for the True side.
+ MachineBasicBlock *getTPred() const { return TBB == Tail ? Head : TBB; }
+
+ /// Returns the Tail predecessor for the False side.
+ MachineBasicBlock *getFPred() const { return FBB == Tail ? Head : FBB; }
+
+ /// Information about each phi in the Tail block.
+ struct PHIInfo {
+ MachineInstr *PHI;
+ unsigned TReg = 0, FReg = 0;
+ // Latencies from Cond+Branch, TReg, and FReg to DstReg.
+ int CondCycles = 0, TCycles = 0, FCycles = 0;
+
+ PHIInfo(MachineInstr *phi) : PHI(phi) {}
+ };
+
+ SmallVector<PHIInfo, 8> PHIs;
+
+ /// The branch condition determined by analyzeBranch.
+ SmallVector<MachineOperand, 4> Cond;
+
+private:
+ /// Instructions in Head that define values used by the conditional blocks.
+ /// The hoisted instructions must be inserted after these instructions.
+ SmallPtrSet<MachineInstr*, 8> InsertAfter;
+
+ /// Register units clobbered by the conditional blocks.
+ BitVector ClobberedRegUnits;
+
+ // Scratch pad for findInsertionPoint.
+ SparseSet<unsigned> LiveRegUnits;
+
+ /// Insertion point in Head for speculatively executed instructions from TBB
+ /// and FBB.
+ MachineBasicBlock::iterator InsertionPoint;
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// speculated.
+ bool canSpeculateInstrs(MachineBasicBlock *MBB);
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// predicated.
+ bool canPredicateInstrs(MachineBasicBlock *MBB);
+
+ /// Scan through instruction dependencies and update InsertAfter array.
+ /// Return false if any dependency is incompatible with if conversion.
+ bool InstrDependenciesAllowIfConv(MachineInstr *I);
+
+ /// Predicate all instructions of the basic block with current condition
+ /// except for terminators. Reverse the condition if ReversePredicate is set.
+ void PredicateBlock(MachineBasicBlock *MBB, bool ReversePredicate);
+
+ /// Find a valid insertion point in Head.
+ bool findInsertionPoint();
+
+ /// Replace PHI instructions in Tail with selects.
+ void replacePHIInstrs();
+
+ /// Insert selects and rewrite PHI operands to use them.
+ void rewritePHIOperands();
+
+public:
+ /// runOnMachineFunction - Initialize per-function data structures.
+ void runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ LiveRegUnits.clear();
+ LiveRegUnits.setUniverse(TRI->getNumRegUnits());
+ ClobberedRegUnits.clear();
+ ClobberedRegUnits.resize(TRI->getNumRegUnits());
+ }
+
+ /// canConvertIf - If the sub-CFG headed by MBB can be if-converted,
+ /// initialize the internal state, and return true.
+ /// If predicate is set try to predicate the block otherwise try to
+ /// speculatively execute it.
+ bool canConvertIf(MachineBasicBlock *MBB, bool Predicate = false);
+
+ /// convertIf - If-convert the last block passed to canConvertIf(), assuming
+ /// it is possible. Add any erased blocks to RemovedBlocks.
+ void convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
+ bool Predicate = false);
+};
+} // end anonymous namespace
+
+
+/// canSpeculateInstrs - Returns true if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// If instructions use any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+///
+/// Any clobbered regunits are added to ClobberedRegUnits.
+///
+bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
+ // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (MachineInstr &MI :
+ llvm::make_range(MBB->begin(), MBB->getFirstTerminator())) {
+ if (MI.isDebugInstr())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (MI.isPHI()) {
+ LLVM_DEBUG(dbgs() << "Can't hoist: " << MI);
+ return false;
+ }
+
+ // Don't speculate loads. Note that it may be possible and desirable to
+ // speculate GOT or constant pool loads that are guaranteed not to trap,
+ // but we don't support that for now.
+ if (MI.mayLoad()) {
+ LLVM_DEBUG(dbgs() << "Won't speculate load: " << MI);
+ return false;
+ }
+
+ // We never speculate stores, so an AA pointer isn't necessary.
+ bool DontMoveAcrossStore = true;
+ if (!MI.isSafeToMove(nullptr, DontMoveAcrossStore)) {
+ LLVM_DEBUG(dbgs() << "Can't speculate: " << MI);
+ return false;
+ }
+
+ // Check for any dependencies on Head instructions.
+ if (!InstrDependenciesAllowIfConv(&MI))
+ return false;
+ }
+ return true;
+}
+
+/// Check that there are no dependencies preventing if-conversion.
+///
+/// If the instruction uses any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+bool SSAIfConv::InstrDependenciesAllowIfConv(MachineInstr *I) {
+ for (const MachineOperand &MO : I->operands()) {
+ if (MO.isRegMask()) {
+ LLVM_DEBUG(dbgs() << "Won't speculate regmask: " << *I);
+ return false;
+ }
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+
+ // Remember clobbered regunits.
+ if (MO.isDef() && Reg.isPhysical())
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+ ClobberedRegUnits.set(Unit);
+
+ if (!MO.readsReg() || !Reg.isVirtual())
+ continue;
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI || DefMI->getParent() != Head)
+ continue;
+ if (InsertAfter.insert(DefMI).second)
+ LLVM_DEBUG(dbgs() << printMBBReference(*I->getParent()) << " depends on "
+ << *DefMI);
+ if (DefMI->isTerminator()) {
+ LLVM_DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+/// canPredicateInstrs - Returns true if all the instructions in MBB can safely
+/// be predicates. The terminators are not considered.
+///
+/// If instructions use any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+///
+/// Any clobbered regunits are added to ClobberedRegUnits.
+///
+bool SSAIfConv::canPredicateInstrs(MachineBasicBlock *MBB) {
+ // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I->isPHI()) {
+ LLVM_DEBUG(dbgs() << "Can't predicate: " << *I);
+ return false;
+ }
+
+ // Check that instruction is predicable
+ if (!TII->isPredicable(*I)) {
+ LLVM_DEBUG(dbgs() << "Isn't predicable: " << *I);
+ return false;
+ }
+
+ // Check that instruction is not already predicated.
+ if (TII->isPredicated(*I) && !TII->canPredicatePredicatedInstr(*I)) {
+ LLVM_DEBUG(dbgs() << "Is already predicated: " << *I);
+ return false;
+ }
+
+ // Check for any dependencies on Head instructions.
+ if (!InstrDependenciesAllowIfConv(&(*I)))
+ return false;
+ }
+ return true;
+}
+
+// Apply predicate to all instructions in the machine block.
+void SSAIfConv::PredicateBlock(MachineBasicBlock *MBB, bool ReversePredicate) {
+ auto Condition = Cond;
+ if (ReversePredicate) {
+ bool CanRevCond = !TII->reverseBranchCondition(Condition);
+ assert(CanRevCond && "Reversed predicate is not supported");
+ (void)CanRevCond;
+ }
+ // Terminators don't need to be predicated as they will be removed.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I) {
+ if (I->isDebugInstr())
+ continue;
+ TII->PredicateInstruction(*I, Condition);
+ }
+}
+
+/// Find an insertion point in Head for the speculated instructions. The
+/// insertion point must be:
+///
+/// 1. Before any terminators.
+/// 2. After any instructions in InsertAfter.
+/// 3. Not have any clobbered regunits live.
+///
+/// This function sets InsertionPoint and returns true when successful, it
+/// returns false if no valid insertion point could be found.
+///
+bool SSAIfConv::findInsertionPoint() {
+ // Keep track of live regunits before the current position.
+ // Only track RegUnits that are also in ClobberedRegUnits.
+ LiveRegUnits.clear();
+ SmallVector<MCRegister, 8> Reads;
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ MachineBasicBlock::iterator I = Head->end();
+ MachineBasicBlock::iterator B = Head->begin();
+ while (I != B) {
+ --I;
+ // Some of the conditional code depends on I.
+ if (InsertAfter.count(&*I)) {
+ LLVM_DEBUG(dbgs() << "Can't insert code after " << *I);
+ return false;
+ }
+
+ // Update live regunits.
+ for (const MachineOperand &MO : I->operands()) {
+ // We're ignoring regmask operands. That is conservatively correct.
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isPhysical())
+ continue;
+ // I clobbers Reg, so it isn't live before I.
+ if (MO.isDef())
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+ LiveRegUnits.erase(Unit);
+ // Unless I reads Reg.
+ if (MO.readsReg())
+ Reads.push_back(Reg.asMCReg());
+ }
+ // Anything read by I is live before I.
+ while (!Reads.empty())
+ for (MCRegUnit Unit : TRI->regunits(Reads.pop_back_val()))
+ if (ClobberedRegUnits.test(Unit))
+ LiveRegUnits.insert(Unit);
+
+ // We can't insert before a terminator.
+ if (I != FirstTerm && I->isTerminator())
+ continue;
+
+ // Some of the clobbered registers are live before I, not a valid insertion
+ // point.
+ if (!LiveRegUnits.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "Would clobber";
+ for (unsigned LRU : LiveRegUnits)
+ dbgs() << ' ' << printRegUnit(LRU, TRI);
+ dbgs() << " live before " << *I;
+ });
+ continue;
+ }
+
+ // This is a valid insertion point.
+ InsertionPoint = I;
+ LLVM_DEBUG(dbgs() << "Can insert before " << *I);
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << "No legal insertion point found.\n");
+ return false;
+}
+
+
+
+/// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is
+/// a potential candidate for if-conversion. Fill out the internal state.
+///
+bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB, bool Predicate) {
+ Head = MBB;
+ TBB = FBB = Tail = nullptr;
+
+ if (Head->succ_size() != 2)
+ return false;
+ MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+ MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+ // Canonicalize so Succ0 has MBB as its single predecessor.
+ if (Succ0->pred_size() != 1)
+ std::swap(Succ0, Succ1);
+
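+ // The conditional block on the Succ0 side must have Head as its only
+ // predecessor and a single successor, the prospective Tail.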
+ if (Succ0->pred_size() != 1 || Succ0->succ_size() != 1)
+ return false;
+
+ Tail = Succ0->succ_begin()[0];
+
+ // This is not a triangle.
+ if (Tail != Succ1) {
+ // Check for a diamond. We won't deal with any critical edges.
+ if (Succ1->pred_size() != 1 || Succ1->succ_size() != 1 ||
+ Succ1->succ_begin()[0] != Tail)
+ return false;
+ LLVM_DEBUG(dbgs() << "\nDiamond: " << printMBBReference(*Head) << " -> "
+ << printMBBReference(*Succ0) << "/"
+ << printMBBReference(*Succ1) << " -> "
+ << printMBBReference(*Tail) << '\n');
+
+ // Live-in physregs are tricky to get right when speculating code.
+ if (!Tail->livein_empty()) {
+ LLVM_DEBUG(dbgs() << "Tail has live-ins.\n");
+ return false;
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << "\nTriangle: " << printMBBReference(*Head) << " -> "
+ << printMBBReference(*Succ0) << " -> "
+ << printMBBReference(*Tail) << '\n');
+ }
+
+ // This is a triangle or a diamond.
+ // If we cannot predicate and there are no phis, skip: any side effects
+ // could only be handled with predication.
+ if (!Predicate && (Tail->empty() || !Tail->front().isPHI())) {
+ LLVM_DEBUG(dbgs() << "No phis in tail.\n");
+ return false;
+ }
+
+ // The branch we're looking to eliminate must be analyzable.
+ Cond.clear();
+ if (TII->analyzeBranch(*Head, TBB, FBB, Cond)) {
+ LLVM_DEBUG(dbgs() << "Branch not analyzable.\n");
+ return false;
+ }
+
+ // This is weird, probably some sort of degenerate CFG.
+ if (!TBB) {
+ LLVM_DEBUG(dbgs() << "analyzeBranch didn't find conditional branch.\n");
+ return false;
+ }
+
+ // Make sure the analyzed branch is conditional; one of the successors
+ // could be a landing pad. (Empty landing pads can be generated on Windows.)
+ if (Cond.empty()) {
+ LLVM_DEBUG(dbgs() << "analyzeBranch found an unconditional branch.\n");
+ return false;
+ }
+
+ // analyzeBranch doesn't set FBB on a fall-through branch.
+ // Make sure it is always set.
+ FBB = TBB == Succ0 ? Succ1 : Succ0;
+
+ // Any phis in the tail block must be convertible to selects.
+ PHIs.clear();
+ MachineBasicBlock *TPred = getTPred();
+ MachineBasicBlock *FPred = getFPred();
+ for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
+ I != E && I->isPHI(); ++I) {
+ PHIs.push_back(&*I);
+ PHIInfo &PI = PHIs.back();
+ // Find PHI operands corresponding to TPred and FPred.
+ for (unsigned i = 1; i != PI.PHI->getNumOperands(); i += 2) {
+ if (PI.PHI->getOperand(i+1).getMBB() == TPred)
+ PI.TReg = PI.PHI->getOperand(i).getReg();
+ if (PI.PHI->getOperand(i+1).getMBB() == FPred)
+ PI.FReg = PI.PHI->getOperand(i).getReg();
+ }
+ assert(Register::isVirtualRegister(PI.TReg) && "Bad PHI");
+ assert(Register::isVirtualRegister(PI.FReg) && "Bad PHI");
+
+ // Get target information.
+ if (!TII->canInsertSelect(*Head, Cond, PI.PHI->getOperand(0).getReg(),
+ PI.TReg, PI.FReg, PI.CondCycles, PI.TCycles,
+ PI.FCycles)) {
+ LLVM_DEBUG(dbgs() << "Can't convert: " << *PI.PHI);
+ return false;
+ }
+ }
+
+ // Check that the conditional instructions can be speculated.
+ InsertAfter.clear();
+ ClobberedRegUnits.reset();
+ if (Predicate) {
+ if (TBB != Tail && !canPredicateInstrs(TBB))
+ return false;
+ if (FBB != Tail && !canPredicateInstrs(FBB))
+ return false;
+ } else {
+ if (TBB != Tail && !canSpeculateInstrs(TBB))
+ return false;
+ if (FBB != Tail && !canSpeculateInstrs(FBB))
+ return false;
+ }
+
+ // Try to find a valid insertion point for the speculated instructions in the
+ // head basic block.
+ if (!findInsertionPoint())
+ return false;
+
+ if (isTriangle())
+ ++NumTrianglesSeen;
+ else
+ ++NumDiamondsSeen;
+ return true;
+}
+
+/// \return true iff the two registers are known to have the same value.
+static bool hasSameValue(const MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, Register TReg,
+ Register FReg) {
+ if (TReg == FReg)
+ return true;
+
+ if (!TReg.isVirtual() || !FReg.isVirtual())
+ return false;
+
+ const MachineInstr *TDef = MRI.getUniqueVRegDef(TReg);
+ const MachineInstr *FDef = MRI.getUniqueVRegDef(FReg);
+ if (!TDef || !FDef)
+ return false;
+
+ // If there are side-effects, all bets are off.
+ if (TDef->hasUnmodeledSideEffects())
+ return false;
+
+ // If the instruction could modify memory, or there may be some intervening
+ // store between the two, we can't consider them to be equal.
+ if (TDef->mayLoadOrStore() && !TDef->isDereferenceableInvariantLoad())
+ return false;
+
+ // We also can't guarantee that they are the same if, for example, the
+ // instructions are both a copy from a physical reg, because some other
+ // instruction may have modified the value in that reg between the two
+ // defining insts.
+ if (any_of(TDef->uses(), [](const MachineOperand &MO) {
+ return MO.isReg() && MO.getReg().isPhysical();
+ }))
+ return false;
+
+ // Check whether the two defining instructions produce the same value(s).
+ if (!TII->produceSameValue(*TDef, *FDef, &MRI))
+ return false;
+
+ // Further, check that the two defs come from corresponding operands.
+ int TIdx = TDef->findRegisterDefOperandIdx(TReg);
+ int FIdx = FDef->findRegisterDefOperandIdx(FReg);
+ if (TIdx == -1 || FIdx == -1)
+ return false;
+
+ return TIdx == FIdx;
+}
+
+/// replacePHIInstrs - Completely replace PHI instructions with selects.
+/// This is possible when the only Tail predecessors are the if-converted
+/// blocks.
+void SSAIfConv::replacePHIInstrs() {
+ assert(Tail->pred_size() == 2 && "Cannot replace PHIs");
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ assert(FirstTerm != Head->end() && "No terminators");
+ DebugLoc HeadDL = FirstTerm->getDebugLoc();
+
+ // Convert all PHIs to select instructions inserted before FirstTerm.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+ PHIInfo &PI = PHIs[i];
+ LLVM_DEBUG(dbgs() << "If-converting " << *PI.PHI);
+ Register DstReg = PI.PHI->getOperand(0).getReg();
+ if (hasSameValue(*MRI, TII, PI.TReg, PI.FReg)) {
+ // We do not need the select instruction if both incoming values are
+ // equal, but we do need a COPY.
+ BuildMI(*Head, FirstTerm, HeadDL, TII->get(TargetOpcode::COPY), DstReg)
+ .addReg(PI.TReg);
+ } else {
+ TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg,
+ PI.FReg);
+ }
+ LLVM_DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
+ PI.PHI->eraseFromParent();
+ PI.PHI = nullptr;
+ }
+}
+
+/// rewritePHIOperands - When there are additional Tail predecessors, insert
+/// select instructions in Head and rewrite PHI operands to use the selects.
+/// Keep the PHI instructions in Tail to handle the other predecessors.
+void SSAIfConv::rewritePHIOperands() {
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ assert(FirstTerm != Head->end() && "No terminators");
+ DebugLoc HeadDL = FirstTerm->getDebugLoc();
+
+ // Convert all PHIs to select instructions inserted before FirstTerm.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+ PHIInfo &PI = PHIs[i];
+ unsigned DstReg = 0;
+
+ LLVM_DEBUG(dbgs() << "If-converting " << *PI.PHI);
+ if (hasSameValue(*MRI, TII, PI.TReg, PI.FReg)) {
+ // We do not need the select instruction if both incoming values are
+ // equal.
+ DstReg = PI.TReg;
+ } else {
+ Register PHIDst = PI.PHI->getOperand(0).getReg();
+ DstReg = MRI->createVirtualRegister(MRI->getRegClass(PHIDst));
+ TII->insertSelect(*Head, FirstTerm, HeadDL,
+ DstReg, Cond, PI.TReg, PI.FReg);
+ LLVM_DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
+ }
+
+ // Rewrite PHI operands TPred -> (DstReg, Head), remove FPred.
+ for (unsigned i = PI.PHI->getNumOperands(); i != 1; i -= 2) {
+ MachineBasicBlock *MBB = PI.PHI->getOperand(i-1).getMBB();
+ if (MBB == getTPred()) {
+ PI.PHI->getOperand(i-1).setMBB(Head);
+ PI.PHI->getOperand(i-2).setReg(DstReg);
+ } else if (MBB == getFPred()) {
+ PI.PHI->removeOperand(i-1);
+ PI.PHI->removeOperand(i-2);
+ }
+ }
+ LLVM_DEBUG(dbgs() << " --> " << *PI.PHI);
+ }
+}
+
+/// convertIf - Execute the if conversion after canConvertIf has determined the
+/// feasibility.
+///
+/// Any basic blocks erased will be added to RemovedBlocks.
+///
+void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
+ bool Predicate) {
+ assert(Head && Tail && TBB && FBB && "Call canConvertIf first.");
+
+ // Update statistics.
+ if (isTriangle())
+ ++NumTrianglesConv;
+ else
+ ++NumDiamondsConv;
+
+ // Move all instructions into Head, except for the terminators.
+ if (TBB != Tail) {
+ if (Predicate)
+ PredicateBlock(TBB, /*ReversePredicate=*/false);
+ Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator());
+ }
+ if (FBB != Tail) {
+ if (Predicate)
+ PredicateBlock(FBB, /*ReversePredicate=*/true);
+ Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator());
+ }
+ // Are there extra Tail predecessors?
+ bool ExtraPreds = Tail->pred_size() != 2;
+ if (ExtraPreds)
+ rewritePHIOperands();
+ else
+ replacePHIInstrs();
+
+ // Fix up the CFG, temporarily leave Head without any successors.
+ Head->removeSuccessor(TBB);
+ Head->removeSuccessor(FBB, true);
+ if (TBB != Tail)
+ TBB->removeSuccessor(Tail, true);
+ if (FBB != Tail)
+ FBB->removeSuccessor(Tail, true);
+
+ // Fix up Head's terminators.
+ // It should become a single branch or a fallthrough.
+ DebugLoc HeadDL = Head->getFirstTerminator()->getDebugLoc();
+ TII->removeBranch(*Head);
+
+ // Erase the now empty conditional blocks. It is likely that Head can fall
+ // through to Tail, and we can join the two blocks.
+ if (TBB != Tail) {
+ RemovedBlocks.push_back(TBB);
+ TBB->eraseFromParent();
+ }
+ if (FBB != Tail) {
+ RemovedBlocks.push_back(FBB);
+ FBB->eraseFromParent();
+ }
+
+ assert(Head->succ_empty() && "Additional head successors?");
+ if (!ExtraPreds && Head->isLayoutSuccessor(Tail)) {
+ // Splice Tail onto the end of Head.
+ LLVM_DEBUG(dbgs() << "Joining tail " << printMBBReference(*Tail)
+ << " into head " << printMBBReference(*Head) << '\n');
+ Head->splice(Head->end(), Tail,
+ Tail->begin(), Tail->end());
+ Head->transferSuccessorsAndUpdatePHIs(Tail);
+ RemovedBlocks.push_back(Tail);
+ Tail->eraseFromParent();
+ } else {
+ // We need a branch to Tail, let code placement work it out later.
+ LLVM_DEBUG(dbgs() << "Converting to unconditional branch.\n");
+ SmallVector<MachineOperand, 0> EmptyCond;
+ TII->insertBranch(*Head, Tail, nullptr, EmptyCond, HeadDL);
+ Head->addSuccessor(Tail);
+ }
+ LLVM_DEBUG(dbgs() << *Head);
+}
+
+//===----------------------------------------------------------------------===//
+// EarlyIfConverter Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class EarlyIfConverter : public MachineFunctionPass {
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ MCSchedModel SchedModel;
+ MachineRegisterInfo *MRI = nullptr;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineLoopInfo *Loops = nullptr;
+ MachineTraceMetrics *Traces = nullptr;
+ MachineTraceMetrics::Ensemble *MinInstr = nullptr;
+ SSAIfConv IfConv;
+
+public:
+ static char ID;
+ EarlyIfConverter() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return "Early If-Conversion"; }
+
+private:
+ bool tryConvertIf(MachineBasicBlock*);
+ void invalidateTraces();
+ bool shouldConvertIf();
+};
+} // end anonymous namespace
+
+char EarlyIfConverter::ID = 0;
+char &llvm::EarlyIfConverterID = EarlyIfConverter::ID;
+
+INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE,
+ "Early If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE,
+ "Early If Converter", false, false)
+
+void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+namespace {
+/// Update the dominator tree after if-conversion erased some blocks.
+void updateDomTree(MachineDominatorTree *DomTree, const SSAIfConv &IfConv,
+ ArrayRef<MachineBasicBlock *> Removed) {
+ // convertIf can remove TBB and FBB, and Tail can be merged into Head.
+ // TBB and FBB should not dominate any blocks.
+ // Tail children should be transferred to Head.
+ MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head);
+ for (auto *B : Removed) {
+ MachineDomTreeNode *Node = DomTree->getNode(B);
+ assert(Node != HeadNode && "Cannot erase the head node");
+ while (Node->getNumChildren()) {
+ assert(Node->getBlock() == IfConv.Tail && "Unexpected children");
+ DomTree->changeImmediateDominator(Node->back(), HeadNode);
+ }
+ DomTree->eraseNode(B);
+ }
+}
+
+/// Update LoopInfo after if-conversion.
+void updateLoops(MachineLoopInfo *Loops,
+ ArrayRef<MachineBasicBlock *> Removed) {
+ if (!Loops)
+ return;
+ // If-conversion doesn't change loop structure, and it doesn't mess with back
+ // edges, so updating LoopInfo is simply removing the dead blocks.
+ for (auto *B : Removed)
+ Loops->removeBlock(B);
+}
+} // namespace
+
+/// Invalidate MachineTraceMetrics before if-conversion.
+void EarlyIfConverter::invalidateTraces() {
+ Traces->verifyAnalysis();
+ Traces->invalidate(IfConv.Head);
+ Traces->invalidate(IfConv.Tail);
+ Traces->invalidate(IfConv.TBB);
+ Traces->invalidate(IfConv.FBB);
+ Traces->verifyAnalysis();
+}
+
+// Adjust cycles with downward saturation.
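+// If Delta is negative and larger in magnitude than Cyc, the unsigned
+// addition would wrap around; clamp the result to 0 instead.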
+static unsigned adjCycles(unsigned Cyc, int Delta) {
+ if (Delta < 0 && Cyc + Delta > Cyc)
+ return 0;
+ return Cyc + Delta;
+}
+
+namespace {
+/// Helper class to simplify emission of cycle counts into optimization remarks.
+struct Cycles {
+ const char *Key;
+ unsigned Value;
+};
+template <typename Remark> Remark &operator<<(Remark &R, Cycles C) {
+ return R << ore::NV(C.Key, C.Value) << (C.Value == 1 ? " cycle" : " cycles");
+}
+} // anonymous namespace
+
+/// Apply cost model and heuristics to the if-conversion in IfConv.
+/// Return true if the conversion is a good idea.
+///
+bool EarlyIfConverter::shouldConvertIf() {
+ // Stress testing mode disables all cost considerations.
+ if (Stress)
+ return true;
+
+ // Do not try to if-convert if the condition has a high chance of being
+ // predictable.
+ MachineLoop *CurrentLoop = Loops->getLoopFor(IfConv.Head);
+ // If the condition is in a loop, consider it predictable if the condition
+ // itself or all its operands are loop-invariant. E.g. this considers a load
+ // from a loop-invariant address predictable; we were unable to prove that it
+ // doesn't alias any of the memory-writes in the loop, but it is likely to
+ // read the same value multiple times.
+ if (CurrentLoop && any_of(IfConv.Cond, [&](MachineOperand &MO) {
+ if (!MO.isReg() || !MO.isUse())
+ return false;
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(Reg);
+ return CurrentLoop->isLoopInvariant(*Def) ||
+ all_of(Def->operands(), [&](MachineOperand &Op) {
+ if (Op.isImm())
+ return true;
+ if (!Op.isReg() || !Op.isUse())
+ return false;
+ Register Reg = Op.getReg();
+ if (Register::isPhysicalRegister(Reg))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(Reg);
+ return CurrentLoop->isLoopInvariant(*Def);
+ });
+ }))
+ return false;
+
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceStrategy::TS_MinInstrCount);
+
+ MachineTraceMetrics::Trace TBBTrace = MinInstr->getTrace(IfConv.getTPred());
+ MachineTraceMetrics::Trace FBBTrace = MinInstr->getTrace(IfConv.getFPred());
+ LLVM_DEBUG(dbgs() << "TBB: " << TBBTrace << "FBB: " << FBBTrace);
+ unsigned MinCrit = std::min(TBBTrace.getCriticalPath(),
+ FBBTrace.getCriticalPath());
+
+ // Set a somewhat arbitrary limit on the critical path extension we accept.
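+ // For example, a 16-cycle misprediction penalty allows extending the critical
+ // path by up to 8 cycles.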
+ unsigned CritLimit = SchedModel.MispredictPenalty/2;
+
+ MachineBasicBlock &MBB = *IfConv.Head;
+ MachineOptimizationRemarkEmitter MORE(*MBB.getParent(), nullptr);
+
+ // If-conversion only makes sense when there is unexploited ILP. Compute the
+ // maximum-ILP resource length of the trace after if-conversion. Compare it
+ // to the shortest critical path.
+ SmallVector<const MachineBasicBlock*, 1> ExtraBlocks;
+ if (IfConv.TBB != IfConv.Tail)
+ ExtraBlocks.push_back(IfConv.TBB);
+ unsigned ResLength = FBBTrace.getResourceLength(ExtraBlocks);
+ LLVM_DEBUG(dbgs() << "Resource length " << ResLength
+ << ", minimal critical path " << MinCrit << '\n');
+ if (ResLength > MinCrit + CritLimit) {
+ LLVM_DEBUG(dbgs() << "Not enough available ILP.\n");
+ MORE.emit([&]() {
+ MachineOptimizationRemarkMissed R(DEBUG_TYPE, "IfConversion",
+ MBB.findDebugLoc(MBB.back()), &MBB);
+ R << "did not if-convert branch: the resulting critical path ("
+ << Cycles{"ResLength", ResLength}
+ << ") would extend the shorter leg's critical path ("
+ << Cycles{"MinCrit", MinCrit} << ") by more than the threshold of "
+ << Cycles{"CritLimit", CritLimit}
+ << ", which cannot be hidden by available ILP.";
+ return R;
+ });
+ return false;
+ }
+
+ // Assume that the depth of the first head terminator will also be the depth
+ // of the select instruction inserted, as determined by the flag dependency.
+ // TBB / FBB data dependencies may delay the select even more.
+ MachineTraceMetrics::Trace HeadTrace = MinInstr->getTrace(IfConv.Head);
+ unsigned BranchDepth =
+ HeadTrace.getInstrCycles(*IfConv.Head->getFirstTerminator()).Depth;
+ LLVM_DEBUG(dbgs() << "Branch depth: " << BranchDepth << '\n');
+
+ // Look at all the tail phis, and compute the critical path extension caused
+ // by inserting select instructions.
+ MachineTraceMetrics::Trace TailTrace = MinInstr->getTrace(IfConv.Tail);
+ struct CriticalPathInfo {
+ unsigned Extra; // Count of extra cycles that the component adds.
+ unsigned Depth; // Absolute depth of the component in cycles.
+ };
+ CriticalPathInfo Cond{};
+ CriticalPathInfo TBlock{};
+ CriticalPathInfo FBlock{};
+ bool ShouldConvert = true;
+ for (unsigned i = 0, e = IfConv.PHIs.size(); i != e; ++i) {
+ SSAIfConv::PHIInfo &PI = IfConv.PHIs[i];
+ unsigned Slack = TailTrace.getInstrSlack(*PI.PHI);
+ unsigned MaxDepth = Slack + TailTrace.getInstrCycles(*PI.PHI).Depth;
+ LLVM_DEBUG(dbgs() << "Slack " << Slack << ":\t" << *PI.PHI);
+
+ // The condition is pulled into the critical path.
+ unsigned CondDepth = adjCycles(BranchDepth, PI.CondCycles);
+ if (CondDepth > MaxDepth) {
+ unsigned Extra = CondDepth - MaxDepth;
+ LLVM_DEBUG(dbgs() << "Condition adds " << Extra << " cycles.\n");
+ if (Extra > Cond.Extra)
+ Cond = {Extra, CondDepth};
+ if (Extra > CritLimit) {
+ LLVM_DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ ShouldConvert = false;
+ }
+ }
+
+ // The TBB value is pulled into the critical path.
+ unsigned TDepth = adjCycles(TBBTrace.getPHIDepth(*PI.PHI), PI.TCycles);
+ if (TDepth > MaxDepth) {
+ unsigned Extra = TDepth - MaxDepth;
+ LLVM_DEBUG(dbgs() << "TBB data adds " << Extra << " cycles.\n");
+ if (Extra > TBlock.Extra)
+ TBlock = {Extra, TDepth};
+ if (Extra > CritLimit) {
+ LLVM_DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ ShouldConvert = false;
+ }
+ }
+
+ // The FBB value is pulled into the critical path.
+ unsigned FDepth = adjCycles(FBBTrace.getPHIDepth(*PI.PHI), PI.FCycles);
+ if (FDepth > MaxDepth) {
+ unsigned Extra = FDepth - MaxDepth;
+ LLVM_DEBUG(dbgs() << "FBB data adds " << Extra << " cycles.\n");
+ if (Extra > FBlock.Extra)
+ FBlock = {Extra, FDepth};
+ if (Extra > CritLimit) {
+ LLVM_DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ ShouldConvert = false;
+ }
+ }
+ }
+
+ // Organize by "short" and "long" legs, since the diagnostics get confusing
+ // when referring to the "true" and "false" sides of the branch, given that
+ // those don't always correlate with what the user wrote in source-terms.
+ const CriticalPathInfo Short = TBlock.Extra > FBlock.Extra ? FBlock : TBlock;
+ const CriticalPathInfo Long = TBlock.Extra > FBlock.Extra ? TBlock : FBlock;
+
+ if (ShouldConvert) {
+ MORE.emit([&]() {
+ MachineOptimizationRemark R(DEBUG_TYPE, "IfConversion",
+ MBB.back().getDebugLoc(), &MBB);
+ R << "performing if-conversion on branch: the condition adds "
+ << Cycles{"CondCycles", Cond.Extra} << " to the critical path";
+ if (Short.Extra > 0)
+ R << ", and the short leg adds another "
+ << Cycles{"ShortCycles", Short.Extra};
+ if (Long.Extra > 0)
+ R << ", and the long leg adds another "
+ << Cycles{"LongCycles", Long.Extra};
+ R << ", each staying under the threshold of "
+ << Cycles{"CritLimit", CritLimit} << ".";
+ return R;
+ });
+ } else {
+ MORE.emit([&]() {
+ MachineOptimizationRemarkMissed R(DEBUG_TYPE, "IfConversion",
+ MBB.back().getDebugLoc(), &MBB);
+ R << "did not if-convert branch: the condition would add "
+ << Cycles{"CondCycles", Cond.Extra} << " to the critical path";
+ if (Cond.Extra > CritLimit)
+ R << " exceeding the limit of " << Cycles{"CritLimit", CritLimit};
+ if (Short.Extra > 0) {
+ R << ", and the short leg would add another "
+ << Cycles{"ShortCycles", Short.Extra};
+ if (Short.Extra > CritLimit)
+ R << " exceeding the limit of " << Cycles{"CritLimit", CritLimit};
+ }
+ if (Long.Extra > 0) {
+ R << ", and the long leg would add another "
+ << Cycles{"LongCycles", Long.Extra};
+ if (Long.Extra > CritLimit)
+ R << " exceeding the limit of " << Cycles{"CritLimit", CritLimit};
+ }
+ R << ".";
+ return R;
+ });
+ }
+
+ return ShouldConvert;
+}
+
+/// Attempt repeated if-conversion on MBB, return true if successful.
+///
+bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ while (IfConv.canConvertIf(MBB) && shouldConvertIf()) {
+ // If-convert MBB and update analyses.
+ invalidateTraces();
+ SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
+ IfConv.convertIf(RemovedBlocks);
+ Changed = true;
+ updateDomTree(DomTree, IfConv, RemovedBlocks);
+ updateLoops(Loops, RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ // Only run if-conversion if the target wants it.
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ if (!STI.enableEarlyIfConversion())
+ return false;
+
+ TII = STI.getInstrInfo();
+ TRI = STI.getRegisterInfo();
+ SchedModel = STI.getSchedModel();
+ MRI = &MF.getRegInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = nullptr;
+
+ bool Changed = false;
+ IfConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree post-order. The post-order enables nested
+ // if-conversion in a single pass. The tryConvertIf() function may erase
+ // blocks, but only blocks dominated by the head block. This makes it safe to
+ // update the dominator tree while the post-order iterator is still active.
+ for (auto *DomNode : post_order(DomTree))
+ if (tryConvertIf(DomNode->getBlock()))
+ Changed = true;
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// EarlyIfPredicator Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class EarlyIfPredicator : public MachineFunctionPass {
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ TargetSchedModel SchedModel;
+ MachineRegisterInfo *MRI = nullptr;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineBranchProbabilityInfo *MBPI = nullptr;
+ MachineLoopInfo *Loops = nullptr;
+ SSAIfConv IfConv;
+
+public:
+ static char ID;
+ EarlyIfPredicator() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return "Early If-predicator"; }
+
+protected:
+ bool tryConvertIf(MachineBasicBlock *);
+ bool shouldConvertIf();
+};
+} // end anonymous namespace
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "early-if-predicator"
+
+char EarlyIfPredicator::ID = 0;
+char &llvm::EarlyIfPredicatorID = EarlyIfPredicator::ID;
+
+INITIALIZE_PASS_BEGIN(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator", false,
+ false)
+
+void EarlyIfPredicator::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Apply the target heuristic to decide if the transformation is profitable.
+bool EarlyIfPredicator::shouldConvertIf() {
+ auto TrueProbability = MBPI->getEdgeProbability(IfConv.Head, IfConv.TBB);
+ if (IfConv.isTriangle()) {
+ MachineBasicBlock &IfBlock =
+ (IfConv.TBB == IfConv.Tail) ? *IfConv.FBB : *IfConv.TBB;
+
+ unsigned ExtraPredCost = 0;
+ unsigned Cycles = 0;
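+ // Sum the latency beyond one cycle per instruction, plus the target's cost of
+ // predicating each instruction in the conditional block.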
+ for (MachineInstr &I : IfBlock) {
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
+ if (NumCycles > 1)
+ Cycles += NumCycles - 1;
+ ExtraPredCost += TII->getPredicationCost(I);
+ }
+
+ return TII->isProfitableToIfCvt(IfBlock, Cycles, ExtraPredCost,
+ TrueProbability);
+ }
+ unsigned TExtra = 0;
+ unsigned FExtra = 0;
+ unsigned TCycle = 0;
+ unsigned FCycle = 0;
+ for (MachineInstr &I : *IfConv.TBB) {
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
+ if (NumCycles > 1)
+ TCycle += NumCycles - 1;
+ TExtra += TII->getPredicationCost(I);
+ }
+ for (MachineInstr &I : *IfConv.FBB) {
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
+ if (NumCycles > 1)
+ FCycle += NumCycles - 1;
+ FExtra += TII->getPredicationCost(I);
+ }
+ return TII->isProfitableToIfCvt(*IfConv.TBB, TCycle, TExtra, *IfConv.FBB,
+ FCycle, FExtra, TrueProbability);
+}
+
+/// Attempt repeated if-conversion on MBB, return true if successful.
+///
+bool EarlyIfPredicator::tryConvertIf(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ while (IfConv.canConvertIf(MBB, /*Predicate*/ true) && shouldConvertIf()) {
+ // If-convert MBB and update analyses.
+ SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
+ IfConv.convertIf(RemovedBlocks, /*Predicate*/ true);
+ Changed = true;
+ updateDomTree(DomTree, IfConv, RemovedBlocks);
+ updateLoops(Loops, RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool EarlyIfPredicator::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** EARLY IF-PREDICATOR **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ TII = STI.getInstrInfo();
+ TRI = STI.getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ SchedModel.init(&STI);
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+ bool Changed = false;
+ IfConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree post-order. The post-order enables nested
+ // if-conversion in a single pass. The tryConvertIf() function may erase
+ // blocks, but only blocks dominated by the head block. This makes it safe to
+ // update the dominator tree while the post-order iterator is still active.
+ for (auto *DomNode : post_order(DomTree))
+ if (tryConvertIf(DomNode->getBlock()))
+ Changed = true;
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/EdgeBundles.cpp b/contrib/llvm-project/llvm/lib/CodeGen/EdgeBundles.cpp
new file mode 100644
index 000000000000..3dd354e8ab7e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -0,0 +1,101 @@
+//===-------- EdgeBundles.cpp - Bundles of CFG edges ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the implementation of the EdgeBundles analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ViewEdgeBundles("view-edge-bundles", cl::Hidden,
+ cl::desc("Pop up a window to show edge bundle graphs"));
+
+char EdgeBundles::ID = 0;
+
+INITIALIZE_PASS(EdgeBundles, "edge-bundles", "Bundle Machine CFG Edges",
+ /* cfg = */true, /* is_analysis = */ true)
+
+char &llvm::EdgeBundlesID = EdgeBundles::ID;
+
+void EdgeBundles::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ EC.clear();
+ EC.grow(2 * MF->getNumBlockIDs());
+
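+ // Each block N owns two equivalence-class nodes: 2*N for the bundle of edges
+ // entering the block and 2*N+1 for the bundle of edges leaving it.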
+ for (const auto &MBB : *MF) {
+ unsigned OutE = 2 * MBB.getNumber() + 1;
+ // Join the outgoing bundle with the ingoing bundles of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ EC.join(OutE, 2 * Succ->getNumber());
+ }
+ EC.compress();
+ if (ViewEdgeBundles)
+ view();
+
+ // Compute the reverse mapping.
+ Blocks.clear();
+ Blocks.resize(getNumBundles());
+
+ for (unsigned i = 0, e = MF->getNumBlockIDs(); i != e; ++i) {
+ unsigned b0 = getBundle(i, false);
+ unsigned b1 = getBundle(i, true);
+ Blocks[b0].push_back(i);
+ if (b1 != b0)
+ Blocks[b1].push_back(i);
+ }
+
+ return false;
+}
+
+namespace llvm {
+
+/// Specialize WriteGraph; the standard implementation won't work.
+template<>
+raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
+ bool ShortNames,
+ const Twine &Title) {
+ const MachineFunction *MF = G.getMachineFunction();
+
+ O << "digraph {\n";
+ for (const auto &MBB : *MF) {
+ unsigned BB = MBB.getNumber();
+ O << "\t\"" << printMBBReference(MBB) << "\" [ shape=box ]\n"
+ << '\t' << G.getBundle(BB, false) << " -> \"" << printMBBReference(MBB)
+ << "\"\n"
+ << "\t\"" << printMBBReference(MBB) << "\" -> " << G.getBundle(BB, true)
+ << '\n';
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ O << "\t\"" << printMBBReference(MBB) << "\" -> \""
+ << printMBBReference(*Succ) << "\" [ color=lightgray ]\n";
+ }
+ O << "}\n";
+ return O;
+}
+
+} // end namespace llvm
+
+/// view - Visualize the annotated bipartite CFG with Graphviz.
+void EdgeBundles::view() const {
+ ViewGraph(*this, "EdgeBundles");
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ExecutionDomainFix.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ExecutionDomainFix.cpp
new file mode 100644
index 000000000000..21a7d02a320c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ExecutionDomainFix.cpp
@@ -0,0 +1,470 @@
+//===- ExecutionDomainFix.cpp - Fix execution domain issues ----*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ExecutionDomainFix.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "execution-deps-fix"
+
+iterator_range<SmallVectorImpl<int>::const_iterator>
+ExecutionDomainFix::regIndices(unsigned Reg) const {
+ assert(Reg < AliasMap.size() && "Invalid register");
+ const auto &Entry = AliasMap[Reg];
+ return make_range(Entry.begin(), Entry.end());
+}
+
+DomainValue *ExecutionDomainFix::alloc(int domain) {
+ DomainValue *dv = Avail.empty() ? new (Allocator.Allocate()) DomainValue
+ : Avail.pop_back_val();
+ if (domain >= 0)
+ dv->addDomain(domain);
+ assert(dv->Refs == 0 && "Reference count wasn't cleared");
+ assert(!dv->Next && "Chained DomainValue shouldn't have been recycled");
+ return dv;
+}
+
+void ExecutionDomainFix::release(DomainValue *DV) {
+ while (DV) {
+ assert(DV->Refs && "Bad DomainValue");
+ if (--DV->Refs)
+ return;
+
+ // There are no more DV references. Collapse any contained instructions.
+ if (DV->AvailableDomains && !DV->isCollapsed())
+ collapse(DV, DV->getFirstDomain());
+
+ DomainValue *Next = DV->Next;
+ DV->clear();
+ Avail.push_back(DV);
+ // Also release the next DomainValue in the chain.
+ DV = Next;
+ }
+}
+
+DomainValue *ExecutionDomainFix::resolve(DomainValue *&DVRef) {
+ DomainValue *DV = DVRef;
+ if (!DV || !DV->Next)
+ return DV;
+
+ // DV has a chain. Find the end.
+ do
+ DV = DV->Next;
+ while (DV->Next);
+
+ // Update DVRef to point to DV.
+ retain(DV);
+ release(DVRef);
+ DVRef = DV;
+ return DV;
+}
+
+void ExecutionDomainFix::setLiveReg(int rx, DomainValue *dv) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+
+ if (LiveRegs[rx] == dv)
+ return;
+ if (LiveRegs[rx])
+ release(LiveRegs[rx]);
+ LiveRegs[rx] = retain(dv);
+}
+
+void ExecutionDomainFix::kill(int rx) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+ if (!LiveRegs[rx])
+ return;
+
+ release(LiveRegs[rx]);
+ LiveRegs[rx] = nullptr;
+}
+
+void ExecutionDomainFix::force(int rx, unsigned domain) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+ if (DomainValue *dv = LiveRegs[rx]) {
+ if (dv->isCollapsed())
+ dv->addDomain(domain);
+ else if (dv->hasDomain(domain))
+ collapse(dv, domain);
+ else {
+ // This is an incompatible open DomainValue. Collapse it to whatever and
+ // force the new value into domain. This costs a domain crossing.
+ collapse(dv, dv->getFirstDomain());
+ assert(LiveRegs[rx] && "Not live after collapse?");
+ LiveRegs[rx]->addDomain(domain);
+ }
+ } else {
+ // Set up basic collapsed DomainValue.
+ setLiveReg(rx, alloc(domain));
+ }
+}
+
+void ExecutionDomainFix::collapse(DomainValue *dv, unsigned domain) {
+ assert(dv->hasDomain(domain) && "Cannot collapse");
+
+ // Collapse all the instructions.
+ while (!dv->Instrs.empty())
+ TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain);
+ dv->setSingleDomain(domain);
+
+ // If there are multiple users, give them new, unique DomainValues.
+ if (!LiveRegs.empty() && dv->Refs > 1)
+ for (unsigned rx = 0; rx != NumRegs; ++rx)
+ if (LiveRegs[rx] == dv)
+ setLiveReg(rx, alloc(domain));
+}
+
+bool ExecutionDomainFix::merge(DomainValue *A, DomainValue *B) {
+ assert(!A->isCollapsed() && "Cannot merge into collapsed");
+ assert(!B->isCollapsed() && "Cannot merge from collapsed");
+ if (A == B)
+ return true;
+ // Restrict to the domains that A and B have in common.
+ unsigned common = A->getCommonDomains(B->AvailableDomains);
+ if (!common)
+ return false;
+ A->AvailableDomains = common;
+ A->Instrs.append(B->Instrs.begin(), B->Instrs.end());
+
+ // Clear the old DomainValue so we won't try to swizzle instructions twice.
+ B->clear();
+ // All uses of B are referred to A.
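+ // resolve() follows this Next chain, so stale references to B are lazily
+ // redirected to A.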
+ B->Next = retain(A);
+
+ for (unsigned rx = 0; rx != NumRegs; ++rx) {
+ assert(!LiveRegs.empty() && "no space allocated for live registers");
+ if (LiveRegs[rx] == B)
+ setLiveReg(rx, A);
+ }
+ return true;
+}
+
+void ExecutionDomainFix::enterBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+
+ MachineBasicBlock *MBB = TraversedMBB.MBB;
+
+ // Set up LiveRegs to represent registers entering MBB.
+ // Set default domain values to 'no domain' (nullptr)
+ if (LiveRegs.empty())
+ LiveRegs.assign(NumRegs, nullptr);
+
+ // This is the entry block.
+ if (MBB->pred_empty()) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ": entry\n");
+ return;
+ }
+
+ // Try to coalesce live-out registers from predecessors.
+ for (MachineBasicBlock *pred : MBB->predecessors()) {
+ assert(unsigned(pred->getNumber()) < MBBOutRegsInfos.size() &&
+ "Should have pre-allocated MBBInfos for all MBBs");
+ LiveRegsDVInfo &Incoming = MBBOutRegsInfos[pred->getNumber()];
+ // Incoming is empty if this is a backedge from a BB
+ // we haven't processed yet.
+ if (Incoming.empty())
+ continue;
+
+ for (unsigned rx = 0; rx != NumRegs; ++rx) {
+ DomainValue *pdv = resolve(Incoming[rx]);
+ if (!pdv)
+ continue;
+ if (!LiveRegs[rx]) {
+ setLiveReg(rx, pdv);
+ continue;
+ }
+
+ // We have a live DomainValue from more than one predecessor.
+ if (LiveRegs[rx]->isCollapsed()) {
+ // We are already collapsed, but predecessor is not. Force it.
+ unsigned Domain = LiveRegs[rx]->getFirstDomain();
+ if (!pdv->isCollapsed() && pdv->hasDomain(Domain))
+ collapse(pdv, Domain);
+ continue;
+ }
+
+ // Currently open, merge in predecessor.
+ if (!pdv->isCollapsed())
+ merge(LiveRegs[rx], pdv);
+ else
+ force(rx, pdv->getFirstDomain());
+ }
+ }
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB)
+ << (!TraversedMBB.IsDone ? ": incomplete\n"
+ : ": all preds known\n"));
+}
+
+void ExecutionDomainFix::leaveBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+ unsigned MBBNumber = TraversedMBB.MBB->getNumber();
+ assert(MBBNumber < MBBOutRegsInfos.size() &&
+ "Unexpected basic block number.");
+ // Save register clearances at end of MBB - used by enterBasicBlock().
+ for (DomainValue *OldLiveReg : MBBOutRegsInfos[MBBNumber]) {
+ release(OldLiveReg);
+ }
+ MBBOutRegsInfos[MBBNumber] = LiveRegs;
+ LiveRegs.clear();
+}
+
+bool ExecutionDomainFix::visitInstr(MachineInstr *MI) {
+ // Update instructions with explicit execution domains.
+ std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
+ if (DomP.first) {
+ if (DomP.second)
+ visitSoftInstr(MI, DomP.second);
+ else
+ visitHardInstr(MI, DomP.first);
+ }
+
+ return !DomP.first;
+}
+
+void ExecutionDomainFix::processDefs(MachineInstr *MI, bool Kill) {
+ assert(!MI->isDebugInstr() && "Won't process debug values");
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (unsigned i = 0,
+ e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
+ i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse())
+ continue;
+ for (int rx : regIndices(MO.getReg())) {
+ // This instruction explicitly defines rx.
+ LLVM_DEBUG(dbgs() << printReg(RC->getRegister(rx), TRI) << ":\t" << *MI);
+
+ // Kill off domains redefined by generic instructions.
+ if (Kill)
+ kill(rx);
+ }
+ }
+}
+
+void ExecutionDomainFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
+ // Collapse all uses.
+ for (unsigned i = mi->getDesc().getNumDefs(),
+ e = mi->getDesc().getNumOperands();
+ i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg())
+ continue;
+ for (int rx : regIndices(mo.getReg())) {
+ force(rx, domain);
+ }
+ }
+
+ // Kill all defs and force them.
+ for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg())
+ continue;
+ for (int rx : regIndices(mo.getReg())) {
+ kill(rx);
+ force(rx, domain);
+ }
+ }
+}
+
+void ExecutionDomainFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+ // Bitmask of available domains for this instruction after taking collapsed
+ // operands into account.
+ unsigned available = mask;
+
+ // Scan the explicit use operands for incoming domains.
+ SmallVector<int, 4> used;
+ if (!LiveRegs.empty())
+ for (unsigned i = mi->getDesc().getNumDefs(),
+ e = mi->getDesc().getNumOperands();
+ i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg())
+ continue;
+ for (int rx : regIndices(mo.getReg())) {
+ DomainValue *dv = LiveRegs[rx];
+ if (dv == nullptr)
+ continue;
+ // Bitmask of domains that dv and available have in common.
+ unsigned common = dv->getCommonDomains(available);
+ // Is it possible to use this collapsed register for free?
+ if (dv->isCollapsed()) {
+ // Restrict available domains to the ones in common with the operand.
+ // If there are no common domains, we must pay the cross-domain
+ // penalty for this operand.
+ if (common)
+ available = common;
+ } else if (common)
+ // Open DomainValue is compatible, save it for merging.
+ used.push_back(rx);
+ else
+ // Open DomainValue is not compatible with instruction. It is useless
+ // now.
+ kill(rx);
+ }
+ }
+
+ // If the collapsed operands force a single domain, propagate the collapse.
+ if (isPowerOf2_32(available)) {
+ unsigned domain = llvm::countr_zero(available);
+ TII->setExecutionDomain(*mi, domain);
+ visitHardInstr(mi, domain);
+ return;
+ }
+
+ // Kill off any remaining uses that don't match available, and build a list of
+ // incoming DomainValues that we want to merge.
+ SmallVector<int, 4> Regs;
+ for (int rx : used) {
+ assert(!LiveRegs.empty() && "no space allocated for live registers");
+ DomainValue *&LR = LiveRegs[rx];
+ // This useless DomainValue could have been missed above.
+ if (!LR->getCommonDomains(available)) {
+ kill(rx);
+ continue;
+ }
+ // Sorted insertion.
+ // Enables giving priority to the latest domains during merging.
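+ // Keeping Regs ordered by reaching def means pop_back_val() below visits the
+ // most recently defined registers first.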
+ const int Def = RDA->getReachingDef(mi, RC->getRegister(rx));
+ auto I = partition_point(Regs, [&](int I) {
+ return RDA->getReachingDef(mi, RC->getRegister(I)) <= Def;
+ });
+ Regs.insert(I, rx);
+ }
+
+ // Regs is now sorted in order of appearance. Try to merge the corresponding
+ // DomainValues, giving priority to the latest ones.
+ DomainValue *dv = nullptr;
+ while (!Regs.empty()) {
+ if (!dv) {
+ dv = LiveRegs[Regs.pop_back_val()];
+ // Force the first dv to match the current instruction.
+ dv->AvailableDomains = dv->getCommonDomains(available);
+ assert(dv->AvailableDomains && "Domain should have been filtered");
+ continue;
+ }
+
+ DomainValue *Latest = LiveRegs[Regs.pop_back_val()];
+ // Skip already merged values.
+ if (Latest == dv || Latest->Next)
+ continue;
+ if (merge(dv, Latest))
+ continue;
+
+ // If latest didn't merge, it is useless now. Kill all registers using it.
+ for (int i : used) {
+ assert(!LiveRegs.empty() && "no space allocated for live registers");
+ if (LiveRegs[i] == Latest)
+ kill(i);
+ }
+ }
+
+ // dv is the DomainValue we are going to use for this instruction.
+ if (!dv) {
+ dv = alloc();
+ dv->AvailableDomains = available;
+ }
+ dv->Instrs.push_back(mi);
+
+ // Finally set all defs and non-collapsed uses to dv. We must iterate through
+ // all the operands, including imp-def ones.
+ for (const MachineOperand &mo : mi->operands()) {
+ if (!mo.isReg())
+ continue;
+ for (int rx : regIndices(mo.getReg())) {
+ if (!LiveRegs[rx] || (mo.isDef() && LiveRegs[rx] != dv)) {
+ kill(rx);
+ setLiveReg(rx, dv);
+ }
+ }
+ }
+}
+
+void ExecutionDomainFix::processBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+ enterBasicBlock(TraversedMBB);
+ // If this block is not done, it makes little sense to make any decisions
+ // based on clearance information. We need to make a second pass anyway,
+ // and by then we'll have better information, so we can avoid doing the work
+ // to try and break dependencies now.
+ for (MachineInstr &MI : *TraversedMBB.MBB) {
+ if (!MI.isDebugInstr()) {
+ bool Kill = false;
+ if (TraversedMBB.PrimaryPass)
+ Kill = visitInstr(&MI);
+ processDefs(&MI, Kill);
+ }
+ }
+ leaveBasicBlock(TraversedMBB);
+}
+
+bool ExecutionDomainFix::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(mf.getFunction()))
+ return false;
+ MF = &mf;
+ TII = MF->getSubtarget().getInstrInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ LiveRegs.clear();
+ assert(NumRegs == RC->getNumRegs() && "Bad regclass");
+
+ LLVM_DEBUG(dbgs() << "********** FIX EXECUTION DOMAIN: "
+ << TRI->getRegClassName(RC) << " **********\n");
+
+ // If no relevant registers are used in the function, we can skip it
+ // completely.
+ bool anyregs = false;
+ const MachineRegisterInfo &MRI = mf.getRegInfo();
+ for (unsigned Reg : *RC) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ anyregs = true;
+ break;
+ }
+ }
+ if (!anyregs)
+ return false;
+
+ RDA = &getAnalysis<ReachingDefAnalysis>();
+
+ // Initialize the AliasMap on the first use.
+ if (AliasMap.empty()) {
+ // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and
+ // therefore the LiveRegs array.
+ AliasMap.resize(TRI->getNumRegs());
+ for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
+ for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); AI.isValid();
+ ++AI)
+ AliasMap[*AI].push_back(i);
+ }
+
+ // Initialize the MBBOutRegsInfos
+ MBBOutRegsInfos.resize(mf.getNumBlockIDs());
+
+ // Traverse the basic blocks.
+ LoopTraversal Traversal;
+ LoopTraversal::TraversalOrder TraversedMBBOrder = Traversal.traverse(mf);
+ for (const LoopTraversal::TraversedMBBInfo &TraversedMBB : TraversedMBBOrder)
+ processBasicBlock(TraversedMBB);
+
+ for (const LiveRegsDVInfo &OutLiveRegs : MBBOutRegsInfos)
+ for (DomainValue *OutLiveReg : OutLiveRegs)
+ if (OutLiveReg)
+ release(OutLiveReg);
+
+ MBBOutRegsInfos.clear();
+ Avail.clear();
+ Allocator.DestroyAll();
+
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ExpandLargeDivRem.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
new file mode 100644
index 000000000000..057b5311db70
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ExpandLargeDivRem.cpp
@@ -0,0 +1,139 @@
+//===--- ExpandLargeDivRem.cpp - Expand large div/rem ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands div/rem instructions with a bitwidth above a threshold
+// into a call to auto-generated functions.
+// This is useful for targets like x86_64 that cannot lower divisions
+// with more than 128 bits or targets like x86_32 that cannot lower divisions
+// with more than 64 bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+ ExpandDivRemBits("expand-div-rem-bits", cl::Hidden,
+ cl::init(llvm::IntegerType::MAX_INT_BITS),
+ cl::desc("div and rem instructions on integers with "
+ "more than <N> bits are expanded."));
+
+static bool isConstantPowerOfTwo(llvm::Value *V, bool SignedOp) {
+ auto *C = dyn_cast<ConstantInt>(V);
+ if (!C)
+ return false;
+
+ APInt Val = C->getValue();
+ if (SignedOp && Val.isNegative())
+ Val = -Val;
+ return Val.isPowerOf2();
+}
+
+static bool isSigned(unsigned int Opcode) {
+ return Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+}
+
+static bool runImpl(Function &F, const TargetLowering &TLI) {
+ SmallVector<BinaryOperator *, 4> Replace;
+ bool Modified = false;
+
+ unsigned MaxLegalDivRemBitWidth = TLI.getMaxDivRemBitWidthSupported();
+ if (ExpandDivRemBits != llvm::IntegerType::MAX_INT_BITS)
+ MaxLegalDivRemBitWidth = ExpandDivRemBits;
+
+ if (MaxLegalDivRemBitWidth >= llvm::IntegerType::MAX_INT_BITS)
+ return false;
+
+ for (auto &I : instructions(F)) {
+ switch (I.getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem: {
+ // TODO: This doesn't handle vectors.
+ auto *IntTy = dyn_cast<IntegerType>(I.getType());
+ if (!IntTy || IntTy->getIntegerBitWidth() <= MaxLegalDivRemBitWidth)
+ continue;
+
+ // The backend has peephole optimizations for powers of two.
+ if (isConstantPowerOfTwo(I.getOperand(1), isSigned(I.getOpcode())))
+ continue;
+
+ Replace.push_back(&cast<BinaryOperator>(I));
+ Modified = true;
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (Replace.empty())
+ return false;
+
+ while (!Replace.empty()) {
+ BinaryOperator *I = Replace.pop_back_val();
+
+ if (I->getOpcode() == Instruction::UDiv ||
+ I->getOpcode() == Instruction::SDiv) {
+ expandDivision(I);
+ } else {
+ expandRemainder(I);
+ }
+ }
+
+ return Modified;
+}
+
+namespace {
+class ExpandLargeDivRemLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ ExpandLargeDivRemLegacyPass() : FunctionPass(ID) {
+ initializeExpandLargeDivRemLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ return runImpl(F, *TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+char ExpandLargeDivRemLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ExpandLargeDivRemLegacyPass, "expand-large-div-rem",
+ "Expand large div/rem", false, false)
+INITIALIZE_PASS_END(ExpandLargeDivRemLegacyPass, "expand-large-div-rem",
+ "Expand large div/rem", false, false)
+
+FunctionPass *llvm::createExpandLargeDivRemPass() {
+ return new ExpandLargeDivRemLegacyPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
new file mode 100644
index 000000000000..ca8056a53139
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp
@@ -0,0 +1,664 @@
+//===--- ExpandLargeFpConvert.cpp - Expand large fp convert----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands ‘fptoui .. to’, ‘fptosi .. to’, ‘uitofp .. to’,
+// ‘sitofp .. to’ instructions with a bitwidth above a threshold into
+// auto-generated functions. This is useful for targets like x86_64 that cannot
+// lower fp conversions with more than 128 bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+ ExpandFpConvertBits("expand-fp-convert-bits", cl::Hidden,
+ cl::init(llvm::IntegerType::MAX_INT_BITS),
+ cl::desc("fp convert instructions on integers with "
+ "more than <N> bits are expanded."));
+
+/// Generate code to convert a fp number to integer, replacing FPToS(U)I with
+/// the generated code. This currently generates code similarly to compiler-rt's
+/// implementations.
+///
+/// An example IR generated from compiler-rt/fixsfdi.c looks like below:
+/// define dso_local i64 @foo(float noundef %a) local_unnamed_addr #0 {
+/// entry:
+/// %0 = bitcast float %a to i32
+/// %conv.i = zext i32 %0 to i64
+/// %tobool.not = icmp sgt i32 %0, -1
+/// %conv = select i1 %tobool.not, i64 1, i64 -1
+/// %and = lshr i64 %conv.i, 23
+/// %shr = and i64 %and, 255
+/// %and2 = and i64 %conv.i, 8388607
+/// %or = or i64 %and2, 8388608
+/// %cmp = icmp ult i64 %shr, 127
+/// br i1 %cmp, label %cleanup, label %if.end
+///
+/// if.end: ; preds = %entry
+/// %sub = add nuw nsw i64 %shr, 4294967169
+/// %conv5 = and i64 %sub, 4294967232
+/// %cmp6.not = icmp eq i64 %conv5, 0
+/// br i1 %cmp6.not, label %if.end12, label %if.then8
+///
+/// if.then8: ; preds = %if.end
+/// %cond11 = select i1 %tobool.not, i64 9223372036854775807, i64 -9223372036854775808
+/// br label %cleanup
+///
+/// if.end12: ; preds = %if.end
+/// %cmp13 = icmp ult i64 %shr, 150
+/// br i1 %cmp13, label %if.then15, label %if.else
+///
+/// if.then15: ; preds = %if.end12
+/// %sub16 = sub nuw nsw i64 150, %shr
+/// %shr17 = lshr i64 %or, %sub16
+/// %mul = mul nsw i64 %shr17, %conv
+/// br label %cleanup
+///
+/// if.else: ; preds = %if.end12
+/// %sub18 = add nsw i64 %shr, -150
+/// %shl = shl i64 %or, %sub18
+/// %mul19 = mul nsw i64 %shl, %conv
+/// br label %cleanup
+///
+/// cleanup: ; preds = %entry, %if.else, %if.then15, %if.then8
+/// %retval.0 = phi i64 [ %cond11, %if.then8 ], [ %mul, %if.then15 ], [ %mul19, %if.else ], [ 0, %entry ]
+/// ret i64 %retval.0
+/// }
+///
+/// Replace fp to integer with generated code.
+static void expandFPToI(Instruction *FPToI) {
+ IRBuilder<> Builder(FPToI);
+ auto *FloatVal = FPToI->getOperand(0);
+ IntegerType *IntTy = cast<IntegerType>(FPToI->getType());
+
+ unsigned BitWidth = FPToI->getType()->getIntegerBitWidth();
+ unsigned FPMantissaWidth = FloatVal->getType()->getFPMantissaWidth() - 1;
+
+ // FIXME: fp16's range is covered by i32. So `fptoi half` can be converted
+ // to i32 first, followed by a sext/zext to the target integer type.
+ Value *A1 = nullptr;
+ if (FloatVal->getType()->isHalfTy()) {
+ if (FPToI->getOpcode() == Instruction::FPToUI) {
+ Value *A0 = Builder.CreateFPToUI(FloatVal, Builder.getIntNTy(32));
+ A1 = Builder.CreateZExt(A0, IntTy);
+ } else { // FPToSI
+ Value *A0 = Builder.CreateFPToSI(FloatVal, Builder.getIntNTy(32));
+ A1 = Builder.CreateSExt(A0, IntTy);
+ }
+ FPToI->replaceAllUsesWith(A1);
+ FPToI->dropAllReferences();
+ FPToI->eraseFromParent();
+ return;
+ }
+
+ // fp80 conversion is implemented by first extending to fp128 and then doing
+ // the conversion.
+ FPMantissaWidth = FPMantissaWidth == 63 ? 112 : FPMantissaWidth;
+ unsigned FloatWidth = PowerOf2Ceil(FPMantissaWidth);
+ unsigned ExponentWidth = FloatWidth - FPMantissaWidth - 1;
+ unsigned ExponentBias = (1 << (ExponentWidth - 1)) - 1;
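+ // E.g. for an IEEE single-precision source: FPMantissaWidth = 23,
+ // FloatWidth = 32, ExponentWidth = 8, ExponentBias = 127.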
+ Value *ImplicitBit = Builder.CreateShl(
+ Builder.getIntN(BitWidth, 1), Builder.getIntN(BitWidth, FPMantissaWidth));
+ Value *SignificandMask =
+ Builder.CreateSub(ImplicitBit, Builder.getIntN(BitWidth, 1));
+ Value *NegOne = Builder.CreateSExt(
+ ConstantInt::getSigned(Builder.getInt32Ty(), -1), IntTy);
+ Value *NegInf =
+ Builder.CreateShl(ConstantInt::getSigned(IntTy, 1),
+ ConstantInt::getSigned(IntTy, BitWidth - 1));
+
+ BasicBlock *Entry = Builder.GetInsertBlock();
+ Function *F = Entry->getParent();
+ Entry->setName(Twine(Entry->getName(), "fp-to-i-entry"));
+ BasicBlock *End =
+ Entry->splitBasicBlock(Builder.GetInsertPoint(), "fp-to-i-cleanup");
+ BasicBlock *IfEnd =
+ BasicBlock::Create(Builder.getContext(), "fp-to-i-if-end", F, End);
+ BasicBlock *IfThen5 =
+ BasicBlock::Create(Builder.getContext(), "fp-to-i-if-then5", F, End);
+ BasicBlock *IfEnd9 =
+ BasicBlock::Create(Builder.getContext(), "fp-to-i-if-end9", F, End);
+ BasicBlock *IfThen12 =
+ BasicBlock::Create(Builder.getContext(), "fp-to-i-if-then12", F, End);
+ BasicBlock *IfElse =
+ BasicBlock::Create(Builder.getContext(), "fp-to-i-if-else", F, End);
+
+ Entry->getTerminator()->eraseFromParent();
+
+ // entry:
+ Builder.SetInsertPoint(Entry);
+ Value *FloatVal0 = FloatVal;
+ // fp80 conversion is implemented by first extending to fp128 and then doing
+ // the conversion.
+ if (FloatVal->getType()->isX86_FP80Ty())
+ FloatVal0 =
+ Builder.CreateFPExt(FloatVal, Type::getFP128Ty(Builder.getContext()));
+ Value *ARep0 =
+ Builder.CreateBitCast(FloatVal0, Builder.getIntNTy(FloatWidth));
+ Value *ARep = Builder.CreateZExt(ARep0, FPToI->getType());
+ Value *PosOrNeg = Builder.CreateICmpSGT(
+ ARep0, ConstantInt::getSigned(Builder.getIntNTy(FloatWidth), -1));
+ Value *Sign = Builder.CreateSelect(PosOrNeg, ConstantInt::getSigned(IntTy, 1),
+ ConstantInt::getSigned(IntTy, -1));
+ Value *And =
+ Builder.CreateLShr(ARep, Builder.getIntN(BitWidth, FPMantissaWidth));
+ Value *And2 = Builder.CreateAnd(
+ And, Builder.getIntN(BitWidth, (1 << ExponentWidth) - 1));
+ Value *Abs = Builder.CreateAnd(ARep, SignificandMask);
+ Value *Or = Builder.CreateOr(Abs, ImplicitBit);
+ Value *Cmp =
+ Builder.CreateICmpULT(And2, Builder.getIntN(BitWidth, ExponentBias));
+ Builder.CreateCondBr(Cmp, End, IfEnd);
+
+ // if.end:
+ Builder.SetInsertPoint(IfEnd);
+ Value *Add1 = Builder.CreateAdd(
+ And2, ConstantInt::getSigned(IntTy, -int64_t(ExponentBias + BitWidth)));
+ Value *Cmp3 =
+ Builder.CreateICmpULT(Add1, ConstantInt::getSigned(IntTy, -BitWidth));
+ Builder.CreateCondBr(Cmp3, IfThen5, IfEnd9);
+
+ // if.then5:
+ Builder.SetInsertPoint(IfThen5);
+ Value *PosInf = Builder.CreateXor(NegOne, NegInf);
+ Value *Cond8 = Builder.CreateSelect(PosOrNeg, PosInf, NegInf);
+ Builder.CreateBr(End);
+
+ // if.end9:
+ Builder.SetInsertPoint(IfEnd9);
+ Value *Cmp10 = Builder.CreateICmpULT(
+ And2, Builder.getIntN(BitWidth, ExponentBias + FPMantissaWidth));
+ Builder.CreateCondBr(Cmp10, IfThen12, IfElse);
+
+ // if.then12:
+ Builder.SetInsertPoint(IfThen12);
+ Value *Sub13 = Builder.CreateSub(
+ Builder.getIntN(BitWidth, ExponentBias + FPMantissaWidth), And2);
+ Value *Shr14 = Builder.CreateLShr(Or, Sub13);
+ Value *Mul = Builder.CreateMul(Shr14, Sign);
+ Builder.CreateBr(End);
+
+ // if.else:
+ Builder.SetInsertPoint(IfElse);
+ Value *Sub15 = Builder.CreateAdd(
+ And2,
+ ConstantInt::getSigned(IntTy, -(ExponentBias + FPMantissaWidth)));
+ Value *Shl = Builder.CreateShl(Or, Sub15);
+ Value *Mul16 = Builder.CreateMul(Shl, Sign);
+ Builder.CreateBr(End);
+
+ // cleanup:
+ Builder.SetInsertPoint(End, End->begin());
+ PHINode *Retval0 = Builder.CreatePHI(FPToI->getType(), 4);
+
+ Retval0->addIncoming(Cond8, IfThen5);
+ Retval0->addIncoming(Mul, IfThen12);
+ Retval0->addIncoming(Mul16, IfElse);
+ Retval0->addIncoming(Builder.getIntN(BitWidth, 0), Entry);
+
+ FPToI->replaceAllUsesWith(Retval0);
+ FPToI->dropAllReferences();
+ FPToI->eraseFromParent();
+}
+
+/// Generate code to convert an integer to a fp number, replacing S(U)IToFP with
+/// the generated code. This currently generates code similarly to compiler-rt's
+/// implementations. This implementation has an implicit assumption that the
+/// integer width is larger than the fp width.
+///
+/// An example IR generated from compiler-rt/floatdisf.c looks like below:
+/// define dso_local float @__floatdisf(i64 noundef %a) local_unnamed_addr #0 {
+/// entry:
+/// %cmp = icmp eq i64 %a, 0
+/// br i1 %cmp, label %return, label %if.end
+///
+/// if.end: ; preds = %entry
+/// %shr = ashr i64 %a, 63
+/// %xor = xor i64 %shr, %a
+/// %sub = sub nsw i64 %xor, %shr
+/// %0 = tail call i64 @llvm.ctlz.i64(i64 %sub, i1 true), !range !5
+/// %cast = trunc i64 %0 to i32
+/// %sub1 = sub nuw nsw i32 64, %cast
+/// %sub2 = xor i32 %cast, 63
+/// %cmp3 = icmp ult i32 %cast, 40
+/// br i1 %cmp3, label %if.then4, label %if.else
+///
+/// if.then4: ; preds = %if.end
+/// switch i32 %sub1, label %sw.default [
+/// i32 25, label %sw.bb
+/// i32 26, label %sw.epilog
+/// ]
+///
+/// sw.bb: ; preds = %if.then4
+/// %shl = shl i64 %sub, 1
+/// br label %sw.epilog
+///
+/// sw.default: ; preds = %if.then4
+/// %sub5 = sub nsw i64 38, %0
+/// %sh_prom = and i64 %sub5, 4294967295
+/// %shr6 = lshr i64 %sub, %sh_prom
+/// %shr9 = lshr i64 274877906943, %0
+/// %and = and i64 %shr9, %sub
+/// %cmp10 = icmp ne i64 %and, 0
+/// %conv11 = zext i1 %cmp10 to i64
+/// %or = or i64 %shr6, %conv11
+/// br label %sw.epilog
+///
+/// sw.epilog: ; preds = %sw.default, %if.then4, %sw.bb
+/// %a.addr.0 = phi i64 [ %or, %sw.default ], [ %sub, %if.then4 ], [ %shl, %sw.bb ]
+/// %1 = lshr i64 %a.addr.0, 2
+/// %2 = and i64 %1, 1
+/// %or16 = or i64 %2, %a.addr.0
+/// %inc = add nsw i64 %or16, 1
+/// %3 = and i64 %inc, 67108864
+/// %tobool.not = icmp eq i64 %3, 0
+/// %spec.select.v = select i1 %tobool.not, i64 2, i64 3
+/// %spec.select = ashr i64 %inc, %spec.select.v
+/// %spec.select56 = select i1 %tobool.not, i32 %sub2, i32 %sub1
+/// br label %if.end26
+///
+/// if.else: ; preds = %if.end
+/// %sub23 = add nuw nsw i64 %0, 4294967256
+/// %sh_prom24 = and i64 %sub23, 4294967295
+/// %shl25 = shl i64 %sub, %sh_prom24
+/// br label %if.end26
+///
+/// if.end26: ; preds = %sw.epilog, %if.else
+/// %a.addr.1 = phi i64 [ %shl25, %if.else ], [ %spec.select, %sw.epilog ]
+/// %e.0 = phi i32 [ %sub2, %if.else ], [ %spec.select56, %sw.epilog ]
+/// %conv27 = trunc i64 %shr to i32
+/// %and28 = and i32 %conv27, -2147483648
+/// %add = shl nuw nsw i32 %e.0, 23
+/// %shl29 = add nuw nsw i32 %add, 1065353216
+/// %conv31 = trunc i64 %a.addr.1 to i32
+/// %and32 = and i32 %conv31, 8388607
+/// %or30 = or i32 %and32, %and28
+/// %or33 = or i32 %or30, %shl29
+/// %4 = bitcast i32 %or33 to float
+/// br label %return
+///
+/// return: ; preds = %entry, %if.end26
+/// %retval.0 = phi float [ %4, %if.end26 ], [ 0.000000e+00, %entry ]
+/// ret float %retval.0
+/// }
+///
+/// Replace integer to fp with generated code.
+static void expandIToFP(Instruction *IToFP) {
+ IRBuilder<> Builder(IToFP);
+ auto *IntVal = IToFP->getOperand(0);
+ IntegerType *IntTy = cast<IntegerType>(IntVal->getType());
+
+ unsigned BitWidth = IntVal->getType()->getIntegerBitWidth();
+ unsigned FPMantissaWidth = IToFP->getType()->getFPMantissaWidth() - 1;
+ // fp80 conversion is implemented by first converting to fp128, followed by
+ // a fptrunc to fp80.
+ FPMantissaWidth = FPMantissaWidth == 63 ? 112 : FPMantissaWidth;
+ // FIXME: As there are no related builtins added in compiler-rt, we currently
+ // use the fp32 <-> fp16 lib calls to implement this.
+ FPMantissaWidth = FPMantissaWidth == 10 ? 23 : FPMantissaWidth;
+ unsigned FloatWidth = PowerOf2Ceil(FPMantissaWidth);
+ bool IsSigned = IToFP->getOpcode() == Instruction::SIToFP;
+
+ assert(BitWidth > FloatWidth && "Unexpected conversion. expandIToFP() "
+ "assumes integer width is larger than fp.");
+
+ Value *Temp1 =
+ Builder.CreateShl(Builder.getIntN(BitWidth, 1),
+ Builder.getIntN(BitWidth, FPMantissaWidth + 3));
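+ // Temp1 is the bit just above the kept significand bits; if the rounding
+ // increment below carries into it, the itofp-if-then20 path shifts by one
+ // extra bit.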
+
+ BasicBlock *Entry = Builder.GetInsertBlock();
+ Function *F = Entry->getParent();
+ Entry->setName(Twine(Entry->getName(), "itofp-entry"));
+ BasicBlock *End =
+ Entry->splitBasicBlock(Builder.GetInsertPoint(), "itofp-return");
+ BasicBlock *IfEnd =
+ BasicBlock::Create(Builder.getContext(), "itofp-if-end", F, End);
+ BasicBlock *IfThen4 =
+ BasicBlock::Create(Builder.getContext(), "itofp-if-then4", F, End);
+ BasicBlock *SwBB =
+ BasicBlock::Create(Builder.getContext(), "itofp-sw-bb", F, End);
+ BasicBlock *SwDefault =
+ BasicBlock::Create(Builder.getContext(), "itofp-sw-default", F, End);
+ BasicBlock *SwEpilog =
+ BasicBlock::Create(Builder.getContext(), "itofp-sw-epilog", F, End);
+ BasicBlock *IfThen20 =
+ BasicBlock::Create(Builder.getContext(), "itofp-if-then20", F, End);
+ BasicBlock *IfElse =
+ BasicBlock::Create(Builder.getContext(), "itofp-if-else", F, End);
+ BasicBlock *IfEnd26 =
+ BasicBlock::Create(Builder.getContext(), "itofp-if-end26", F, End);
+
+ Entry->getTerminator()->eraseFromParent();
+
+ Function *CTLZ =
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, IntTy);
+ ConstantInt *True = Builder.getTrue();
+
+ // entry:
+ Builder.SetInsertPoint(Entry);
+ Value *Cmp = Builder.CreateICmpEQ(IntVal, ConstantInt::getSigned(IntTy, 0));
+ Builder.CreateCondBr(Cmp, End, IfEnd);
+
+ // if.end:
+ Builder.SetInsertPoint(IfEnd);
+ Value *Shr =
+ Builder.CreateAShr(IntVal, Builder.getIntN(BitWidth, BitWidth - 1));
+ Value *Xor = Builder.CreateXor(Shr, IntVal);
+ Value *Sub = Builder.CreateSub(Xor, Shr);
+ Value *Call = Builder.CreateCall(CTLZ, {IsSigned ? Sub : IntVal, True});
+ Value *Cast = Builder.CreateTrunc(Call, Builder.getInt32Ty());
+ int BitWidthNew = FloatWidth == 128 ? BitWidth : 32;
+ Value *Sub1 = Builder.CreateSub(Builder.getIntN(BitWidthNew, BitWidth),
+ FloatWidth == 128 ? Call : Cast);
+ Value *Sub2 = Builder.CreateSub(Builder.getIntN(BitWidthNew, BitWidth - 1),
+ FloatWidth == 128 ? Call : Cast);
+ Value *Cmp3 = Builder.CreateICmpSGT(
+ Sub2, Builder.getIntN(BitWidthNew, FPMantissaWidth + 1));
+ Builder.CreateCondBr(Cmp3, IfThen4, IfElse);
+
+ // if.then4:
+ Builder.SetInsertPoint(IfThen4);
+ llvm::SwitchInst *SI = Builder.CreateSwitch(Sub1, SwDefault);
+ SI->addCase(Builder.getIntN(BitWidthNew, FPMantissaWidth + 2), SwBB);
+ SI->addCase(Builder.getIntN(BitWidthNew, FPMantissaWidth + 3), SwEpilog);
+
+ // sw.bb:
+ Builder.SetInsertPoint(SwBB);
+ Value *Shl =
+ Builder.CreateShl(IsSigned ? Sub : IntVal, Builder.getIntN(BitWidth, 1));
+ Builder.CreateBr(SwEpilog);
+
+ // sw.default:
+ Builder.SetInsertPoint(SwDefault);
+ Value *Sub5 = Builder.CreateSub(
+ Builder.getIntN(BitWidthNew, BitWidth - FPMantissaWidth - 3),
+ FloatWidth == 128 ? Call : Cast);
+ Value *ShProm = Builder.CreateZExt(Sub5, IntTy);
+ Value *Shr6 = Builder.CreateLShr(IsSigned ? Sub : IntVal,
+ FloatWidth == 128 ? Sub5 : ShProm);
+ Value *Sub8 =
+ Builder.CreateAdd(FloatWidth == 128 ? Call : Cast,
+ Builder.getIntN(BitWidthNew, FPMantissaWidth + 3));
+ Value *ShProm9 = Builder.CreateZExt(Sub8, IntTy);
+ Value *Shr9 = Builder.CreateLShr(ConstantInt::getSigned(IntTy, -1),
+ FloatWidth == 128 ? Sub8 : ShProm9);
+ Value *And = Builder.CreateAnd(Shr9, IsSigned ? Sub : IntVal);
+ Value *Cmp10 = Builder.CreateICmpNE(And, Builder.getIntN(BitWidth, 0));
+ Value *Conv11 = Builder.CreateZExt(Cmp10, IntTy);
+ Value *Or = Builder.CreateOr(Shr6, Conv11);
+ Builder.CreateBr(SwEpilog);
+
+ // sw.epilog:
+ Builder.SetInsertPoint(SwEpilog);
+ PHINode *AAddr0 = Builder.CreatePHI(IntTy, 3);
+ AAddr0->addIncoming(Or, SwDefault);
+ AAddr0->addIncoming(IsSigned ? Sub : IntVal, IfThen4);
+ AAddr0->addIncoming(Shl, SwBB);
+ Value *A0 = Builder.CreateTrunc(AAddr0, Builder.getInt32Ty());
+ Value *A1 = Builder.CreateLShr(A0, Builder.getIntN(32, 2));
+ Value *A2 = Builder.CreateAnd(A1, Builder.getIntN(32, 1));
+ Value *Conv16 = Builder.CreateZExt(A2, IntTy);
+ Value *Or17 = Builder.CreateOr(AAddr0, Conv16);
+ Value *Inc = Builder.CreateAdd(Or17, Builder.getIntN(BitWidth, 1));
+ Value *Shr18 = nullptr;
+ if (IsSigned)
+ Shr18 = Builder.CreateAShr(Inc, Builder.getIntN(BitWidth, 2));
+ else
+ Shr18 = Builder.CreateLShr(Inc, Builder.getIntN(BitWidth, 2));
+ Value *A3 = Builder.CreateAnd(Inc, Temp1, "a3");
+ Value *PosOrNeg = Builder.CreateICmpEQ(A3, Builder.getIntN(BitWidth, 0));
+ Value *ExtractT60 = Builder.CreateTrunc(Shr18, Builder.getIntNTy(FloatWidth));
+ Value *Extract63 = Builder.CreateLShr(Shr18, Builder.getIntN(BitWidth, 32));
+ Value *ExtractT64 = nullptr;
+ if (FloatWidth > 80)
+ ExtractT64 = Builder.CreateTrunc(Sub2, Builder.getInt64Ty());
+ else
+ ExtractT64 = Builder.CreateTrunc(Extract63, Builder.getInt32Ty());
+ Builder.CreateCondBr(PosOrNeg, IfEnd26, IfThen20);
+
+ // if.then20
+ Builder.SetInsertPoint(IfThen20);
+ Value *Shr21 = nullptr;
+ if (IsSigned)
+ Shr21 = Builder.CreateAShr(Inc, Builder.getIntN(BitWidth, 3));
+ else
+ Shr21 = Builder.CreateLShr(Inc, Builder.getIntN(BitWidth, 3));
+ Value *ExtractT = Builder.CreateTrunc(Shr21, Builder.getIntNTy(FloatWidth));
+ Value *Extract = Builder.CreateLShr(Shr21, Builder.getIntN(BitWidth, 32));
+ Value *ExtractT62 = nullptr;
+ if (FloatWidth > 80)
+ ExtractT62 = Builder.CreateTrunc(Sub1, Builder.getIntNTy(64));
+ else
+ ExtractT62 = Builder.CreateTrunc(Extract, Builder.getIntNTy(32));
+ Builder.CreateBr(IfEnd26);
+
+ // if.else:
+ Builder.SetInsertPoint(IfElse);
+ Value *Sub24 = Builder.CreateAdd(
+ FloatWidth == 128 ? Call : Cast,
+ ConstantInt::getSigned(Builder.getIntNTy(BitWidthNew),
+ -(BitWidth - FPMantissaWidth - 1)));
+ Value *ShProm25 = Builder.CreateZExt(Sub24, IntTy);
+ Value *Shl26 = Builder.CreateShl(IsSigned ? Sub : IntVal,
+ FloatWidth == 128 ? Sub24 : ShProm25);
+ Value *ExtractT61 = Builder.CreateTrunc(Shl26, Builder.getIntNTy(FloatWidth));
+ Value *Extract65 = Builder.CreateLShr(Shl26, Builder.getIntN(BitWidth, 32));
+ Value *ExtractT66 = nullptr;
+ if (FloatWidth > 80)
+ ExtractT66 = Builder.CreateTrunc(Sub2, Builder.getIntNTy(64));
+ else
+ ExtractT66 = Builder.CreateTrunc(Extract65, Builder.getInt32Ty());
+ Builder.CreateBr(IfEnd26);
+
+ // if.end26:
+ Builder.SetInsertPoint(IfEnd26);
+ PHINode *AAddr1Off0 = Builder.CreatePHI(Builder.getIntNTy(FloatWidth), 3);
+ AAddr1Off0->addIncoming(ExtractT, IfThen20);
+ AAddr1Off0->addIncoming(ExtractT60, SwEpilog);
+ AAddr1Off0->addIncoming(ExtractT61, IfElse);
+ PHINode *AAddr1Off32 = nullptr;
+ if (FloatWidth > 32) {
+ AAddr1Off32 =
+ Builder.CreatePHI(Builder.getIntNTy(FloatWidth > 80 ? 64 : 32), 3);
+ AAddr1Off32->addIncoming(ExtractT62, IfThen20);
+ AAddr1Off32->addIncoming(ExtractT64, SwEpilog);
+ AAddr1Off32->addIncoming(ExtractT66, IfElse);
+ }
+ PHINode *E0 = nullptr;
+ if (FloatWidth <= 80) {
+ E0 = Builder.CreatePHI(Builder.getIntNTy(BitWidthNew), 3);
+ E0->addIncoming(Sub1, IfThen20);
+ E0->addIncoming(Sub2, SwEpilog);
+ E0->addIncoming(Sub2, IfElse);
+ }
+ Value *And29 = nullptr;
+ if (FloatWidth > 80) {
+ Value *Temp2 = Builder.CreateShl(Builder.getIntN(BitWidth, 1),
+ Builder.getIntN(BitWidth, 63));
+ And29 = Builder.CreateAnd(Shr, Temp2, "and29");
+ } else {
+ Value *Conv28 = Builder.CreateTrunc(Shr, Builder.getIntNTy(32));
+ And29 = Builder.CreateAnd(
+ Conv28, ConstantInt::getSigned(Builder.getIntNTy(32), 0x80000000));
+ }
+ unsigned TempMod = FPMantissaWidth % 32;
+ Value *And34 = nullptr;
+ Value *Shl30 = nullptr;
+ if (FloatWidth > 80) {
+ TempMod += 32;
+ Value *Add = Builder.CreateShl(AAddr1Off32, Builder.getIntN(64, TempMod));
+ Shl30 = Builder.CreateAdd(
+ Add,
+ Builder.getIntN(64, ((1ull << (62ull - TempMod)) - 1ull) << TempMod));
+ And34 = Builder.CreateZExt(Shl30, Builder.getIntNTy(128));
+ } else {
+ Value *Add = Builder.CreateShl(E0, Builder.getIntN(32, TempMod));
+ Shl30 = Builder.CreateAdd(
+ Add, Builder.getIntN(32, ((1 << (30 - TempMod)) - 1) << TempMod));
+ And34 = Builder.CreateAnd(FloatWidth > 32 ? AAddr1Off32 : AAddr1Off0,
+ Builder.getIntN(32, (1 << TempMod) - 1));
+ }
+ Value *Or35 = nullptr;
+ if (FloatWidth > 80) {
+ Value *And29Trunc = Builder.CreateTrunc(And29, Builder.getIntNTy(128));
+ Value *Or31 = Builder.CreateOr(And29Trunc, And34);
+ Value *Or34 = Builder.CreateShl(Or31, Builder.getIntN(128, 64));
+ Value *Temp3 = Builder.CreateShl(Builder.getIntN(128, 1),
+ Builder.getIntN(128, FPMantissaWidth));
+ Value *Temp4 = Builder.CreateSub(Temp3, Builder.getIntN(128, 1));
+ Value *A6 = Builder.CreateAnd(AAddr1Off0, Temp4);
+ Or35 = Builder.CreateOr(Or34, A6);
+ } else {
+ Value *Or31 = Builder.CreateOr(And34, And29);
+ Or35 = Builder.CreateOr(IsSigned ? Or31 : And34, Shl30);
+ }
+ Value *A4 = nullptr;
+ if (IToFP->getType()->isDoubleTy()) {
+ Value *ZExt1 = Builder.CreateZExt(Or35, Builder.getIntNTy(FloatWidth));
+ Value *Shl1 = Builder.CreateShl(ZExt1, Builder.getIntN(FloatWidth, 32));
+ Value *And1 =
+ Builder.CreateAnd(AAddr1Off0, Builder.getIntN(FloatWidth, 0xFFFFFFFF));
+ Value *Or1 = Builder.CreateOr(Shl1, And1);
+ A4 = Builder.CreateBitCast(Or1, IToFP->getType());
+ } else if (IToFP->getType()->isX86_FP80Ty()) {
+ Value *A40 =
+ Builder.CreateBitCast(Or35, Type::getFP128Ty(Builder.getContext()));
+ A4 = Builder.CreateFPTrunc(A40, IToFP->getType());
+ } else if (IToFP->getType()->isHalfTy()) {
+ // Handle the "half" case. This is a workaround, since we don't currently
+ // have a floattihf.c reference implementation to follow.
+ Value *A40 =
+ Builder.CreateBitCast(Or35, Type::getFloatTy(Builder.getContext()));
+ A4 = Builder.CreateFPTrunc(A40, IToFP->getType());
+ } else // float type
+ A4 = Builder.CreateBitCast(Or35, IToFP->getType());
+ Builder.CreateBr(End);
+
+ // return:
+ Builder.SetInsertPoint(End, End->begin());
+ PHINode *Retval0 = Builder.CreatePHI(IToFP->getType(), 2);
+ Retval0->addIncoming(A4, IfEnd26);
+ Retval0->addIncoming(ConstantFP::getZero(IToFP->getType(), false), Entry);
+
+ IToFP->replaceAllUsesWith(Retval0);
+ IToFP->dropAllReferences();
+ IToFP->eraseFromParent();
+}
+
+static bool runImpl(Function &F, const TargetLowering &TLI) {
+ SmallVector<Instruction *, 4> Replace;
+ bool Modified = false;
+
+ unsigned MaxLegalFpConvertBitWidth =
+ TLI.getMaxLargeFPConvertBitWidthSupported();
+ if (ExpandFpConvertBits != llvm::IntegerType::MAX_INT_BITS)
+ MaxLegalFpConvertBitWidth = ExpandFpConvertBits;
+
+ if (MaxLegalFpConvertBitWidth >= llvm::IntegerType::MAX_INT_BITS)
+ return false;
+
+ for (auto &I : instructions(F)) {
+ switch (I.getOpcode()) {
+ case Instruction::FPToUI:
+ case Instruction::FPToSI: {
+ // TODO: This pass doesn't handle vectors.
+ if (I.getOperand(0)->getType()->isVectorTy())
+ continue;
+
+ auto *IntTy = dyn_cast<IntegerType>(I.getType());
+ if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
+ continue;
+
+ Replace.push_back(&I);
+ Modified = true;
+ break;
+ }
+ case Instruction::UIToFP:
+ case Instruction::SIToFP: {
+ // TODO: This pass doesn't handle vectors.
+ if (I.getOperand(0)->getType()->isVectorTy())
+ continue;
+
+ auto *IntTy = dyn_cast<IntegerType>(I.getOperand(0)->getType());
+ if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
+ continue;
+
+ Replace.push_back(&I);
+ Modified = true;
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (Replace.empty())
+ return false;
+
+ while (!Replace.empty()) {
+ Instruction *I = Replace.pop_back_val();
+ if (I->getOpcode() == Instruction::FPToUI ||
+ I->getOpcode() == Instruction::FPToSI) {
+ expandFPToI(I);
+ } else {
+ expandIToFP(I);
+ }
+ }
+
+ return Modified;
+}
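+// For illustration (a hypothetical module, not part of this file): with a
+// target whose getMaxLargeFPConvertBitWidthSupported() returns 128,
+//   %a = fptoui fp128 %x to i129
+//   %b = uitofp i256 %y to double
+// would both be queued for expansion above, while a plain
+//   %c = fptoui double %d to i64
+// is left alone because its integer width does not exceed the maximum.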
+
+namespace {
+class ExpandLargeFpConvertLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ ExpandLargeFpConvertLegacyPass() : FunctionPass(ID) {
+ initializeExpandLargeFpConvertLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ return runImpl(F, *TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+char ExpandLargeFpConvertLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ExpandLargeFpConvertLegacyPass, "expand-large-fp-convert",
+ "Expand large fp convert", false, false)
+INITIALIZE_PASS_END(ExpandLargeFpConvertLegacyPass, "expand-large-fp-convert",
+ "Expand large fp convert", false, false)
+
+FunctionPass *llvm::createExpandLargeFpConvertPass() {
+ return new ExpandLargeFpConvertLegacyPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ExpandMemCmp.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ExpandMemCmp.cpp
new file mode 100644
index 000000000000..500f31bd8e89
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -0,0 +1,916 @@
+//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to expand memcmp() calls into optimally-sized loads and
+// compares for the target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <optional>
+
+using namespace llvm;
+
+namespace llvm {
+class TargetLowering;
+}
+
+#define DEBUG_TYPE "expandmemcmp"
+
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+ "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
+static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(
+ "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+ cl::desc("The number of loads per basic block for inline expansion of "
+ "memcmp that is only being compared against zero."));
+
+static cl::opt<unsigned> MaxLoadsPerMemcmp(
+ "max-loads-per-memcmp", cl::Hidden,
+ cl::desc("Set maximum number of loads used in expanded memcmp"));
+
+static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(
+ "max-loads-per-memcmp-opt-size", cl::Hidden,
+ cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz"));
+
+namespace {
+
+
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+ struct ResultBlock {
+ BasicBlock *BB = nullptr;
+ PHINode *PhiSrc1 = nullptr;
+ PHINode *PhiSrc2 = nullptr;
+
+ ResultBlock() = default;
+ };
+
+ CallInst *const CI = nullptr;
+ ResultBlock ResBlock;
+ const uint64_t Size;
+ unsigned MaxLoadSize = 0;
+ uint64_t NumLoadsNonOneByte = 0;
+ const uint64_t NumLoadsPerBlockForZeroCmp;
+ std::vector<BasicBlock *> LoadCmpBlocks;
+ BasicBlock *EndBlock = nullptr;
+ PHINode *PhiRes = nullptr;
+ const bool IsUsedForZeroCmp;
+ const DataLayout &DL;
+ DomTreeUpdater *DTU = nullptr;
+ IRBuilder<> Builder;
+ // Represents the decomposition in blocks of the expansion. For example,
+ // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
+ // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
+ struct LoadEntry {
+ LoadEntry(unsigned LoadSize, uint64_t Offset)
+ : LoadSize(LoadSize), Offset(Offset) {
+ }
+
+ // The size of the load for this block, in bytes.
+ unsigned LoadSize;
+ // The offset of this load from the base pointer, in bytes.
+ uint64_t Offset;
+ };
+ using LoadEntryVector = SmallVector<LoadEntry, 8>;
+ LoadEntryVector LoadSequence;
+
+ void createLoadCmpBlocks();
+ void createResultBlock();
+ void setupResultBlockPHINodes();
+ void setupEndBlockPHINodes();
+ Value *getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex);
+ void emitLoadCompareBlock(unsigned BlockIndex);
+ void emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+ unsigned &LoadIndex);
+ void emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes);
+ void emitMemCmpResultBlock();
+ Value *getMemCmpExpansionZeroCase();
+ Value *getMemCmpEqZeroOneBlock();
+ Value *getMemCmpOneBlock();
+ struct LoadPair {
+ Value *Lhs = nullptr;
+ Value *Rhs = nullptr;
+ };
+ LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *CmpSizeType,
+ unsigned OffsetBytes);
+
+ static LoadEntryVector
+ computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
+ unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte);
+ static LoadEntryVector
+ computeOverlappingLoadSequence(uint64_t Size, unsigned MaxLoadSize,
+ unsigned MaxNumLoads,
+ unsigned &NumLoadsNonOneByte);
+
+public:
+ MemCmpExpansion(CallInst *CI, uint64_t Size,
+ const TargetTransformInfo::MemCmpExpansionOptions &Options,
+ const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout,
+ DomTreeUpdater *DTU);
+
+ unsigned getNumBlocks();
+ uint64_t getNumLoads() const { return LoadSequence.size(); }
+
+ Value *getMemCmpExpansion();
+};
+
+MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence(
+ uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
+ const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) {
+ NumLoadsNonOneByte = 0;
+ LoadEntryVector LoadSequence;
+ uint64_t Offset = 0;
+ while (Size && !LoadSizes.empty()) {
+ const unsigned LoadSize = LoadSizes.front();
+ const uint64_t NumLoadsForThisSize = Size / LoadSize;
+ if (LoadSequence.size() + NumLoadsForThisSize > MaxNumLoads) {
+ // Do not expand if the total number of loads is larger than what the
+ // target allows. Note that it's important that we exit before completing
+ // the expansion to avoid using a ton of memory to store the expansion for
+ // large sizes.
+ return {};
+ }
+ if (NumLoadsForThisSize > 0) {
+ for (uint64_t I = 0; I < NumLoadsForThisSize; ++I) {
+ LoadSequence.push_back({LoadSize, Offset});
+ Offset += LoadSize;
+ }
+ if (LoadSize > 1)
+ ++NumLoadsNonOneByte;
+ Size = Size % LoadSize;
+ }
+ LoadSizes = LoadSizes.drop_front();
+ }
+ return LoadSequence;
+}
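+// A worked example (assuming MaxNumLoads is large enough): for Size = 15 and
+// LoadSizes = {8, 4, 2, 1}, the greedy decomposition is
+//   [{8, 0}, {4, 8}, {2, 12}, {1, 14}]
+// with NumLoadsNonOneByte = 3 (the 8-, 4- and 2-byte loads).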
+
+MemCmpExpansion::LoadEntryVector
+MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
+ const unsigned MaxLoadSize,
+ const unsigned MaxNumLoads,
+ unsigned &NumLoadsNonOneByte) {
+ // These are already handled by the greedy approach.
+ if (Size < 2 || MaxLoadSize < 2)
+ return {};
+
+ // We try to do as many non-overlapping loads as possible starting from the
+ // beginning.
+ const uint64_t NumNonOverlappingLoads = Size / MaxLoadSize;
+ assert(NumNonOverlappingLoads && "there must be at least one load");
+ // There remain 0 to (MaxLoadSize - 1) bytes to load; this will be done with
+ // an overlapping load.
+ Size = Size - NumNonOverlappingLoads * MaxLoadSize;
+ // Bail if we do not need an overlapping load; this is already handled by
+ // the greedy approach.
+ if (Size == 0)
+ return {};
+ // Bail if the number of loads (non-overlapping + potential overlapping one)
+ // is larger than the max allowed.
+ if ((NumNonOverlappingLoads + 1) > MaxNumLoads)
+ return {};
+
+ // Add non-overlapping loads.
+ LoadEntryVector LoadSequence;
+ uint64_t Offset = 0;
+ for (uint64_t I = 0; I < NumNonOverlappingLoads; ++I) {
+ LoadSequence.push_back({MaxLoadSize, Offset});
+ Offset += MaxLoadSize;
+ }
+
+ // Add the last overlapping load.
+ assert(Size > 0 && Size < MaxLoadSize && "broken invariant");
+ LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)});
+ NumLoadsNonOneByte = 1;
+ return LoadSequence;
+}
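+// A worked example: for Size = 15 and MaxLoadSize = 8 this returns
+//   [{8, 0}, {8, 7}]
+// i.e. one full-width load at offset 0 plus one overlapping full-width load
+// ending exactly at byte 15, instead of the greedy 8+4+2+1 split.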
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(
+ CallInst *const CI, uint64_t Size,
+ const TargetTransformInfo::MemCmpExpansionOptions &Options,
+ const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout,
+ DomTreeUpdater *DTU)
+ : CI(CI), Size(Size), NumLoadsPerBlockForZeroCmp(Options.NumLoadsPerBlock),
+ IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), DTU(DTU),
+ Builder(CI) {
+ assert(Size > 0 && "zero blocks");
+ // Scale the max size down if the target can load more bytes than we need.
+ llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes);
+ while (!LoadSizes.empty() && LoadSizes.front() > Size) {
+ LoadSizes = LoadSizes.drop_front();
+ }
+ assert(!LoadSizes.empty() && "cannot load Size bytes");
+ MaxLoadSize = LoadSizes.front();
+ // Compute the decomposition.
+ unsigned GreedyNumLoadsNonOneByte = 0;
+ LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, Options.MaxNumLoads,
+ GreedyNumLoadsNonOneByte);
+ NumLoadsNonOneByte = GreedyNumLoadsNonOneByte;
+ assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
+ // If we allow overlapping loads and the load sequence is not already optimal,
+ // use overlapping loads.
+ if (Options.AllowOverlappingLoads &&
+ (LoadSequence.empty() || LoadSequence.size() > 2)) {
+ unsigned OverlappingNumLoadsNonOneByte = 0;
+ auto OverlappingLoads = computeOverlappingLoadSequence(
+ Size, MaxLoadSize, Options.MaxNumLoads, OverlappingNumLoadsNonOneByte);
+ if (!OverlappingLoads.empty() &&
+ (LoadSequence.empty() ||
+ OverlappingLoads.size() < LoadSequence.size())) {
+ LoadSequence = OverlappingLoads;
+ NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte;
+ }
+ }
+ assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
+}
+
+unsigned MemCmpExpansion::getNumBlocks() {
+ if (IsUsedForZeroCmp)
+ return getNumLoads() / NumLoadsPerBlockForZeroCmp +
+ (getNumLoads() % NumLoadsPerBlockForZeroCmp != 0 ? 1 : 0);
+ return getNumLoads();
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+ for (unsigned i = 0; i < getNumBlocks(); i++) {
+ BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+ EndBlock->getParent(), EndBlock);
+ LoadCmpBlocks.push_back(BB);
+ }
+}
+
+void MemCmpExpansion::createResultBlock() {
+ ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+ EndBlock->getParent(), EndBlock);
+}
+
+MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
+ bool NeedsBSwap,
+ Type *CmpSizeType,
+ unsigned OffsetBytes) {
+ // Get the memory source at offset `OffsetBytes`.
+ Value *LhsSource = CI->getArgOperand(0);
+ Value *RhsSource = CI->getArgOperand(1);
+ Align LhsAlign = LhsSource->getPointerAlignment(DL);
+ Align RhsAlign = RhsSource->getPointerAlignment(DL);
+ if (OffsetBytes > 0) {
+ auto *ByteType = Type::getInt8Ty(CI->getContext());
+ LhsSource = Builder.CreateConstGEP1_64(ByteType, LhsSource, OffsetBytes);
+ RhsSource = Builder.CreateConstGEP1_64(ByteType, RhsSource, OffsetBytes);
+ LhsAlign = commonAlignment(LhsAlign, OffsetBytes);
+ RhsAlign = commonAlignment(RhsAlign, OffsetBytes);
+ }
+
+ // Create a constant or a load from the source.
+ Value *Lhs = nullptr;
+ if (auto *C = dyn_cast<Constant>(LhsSource))
+ Lhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);
+ if (!Lhs)
+ Lhs = Builder.CreateAlignedLoad(LoadSizeType, LhsSource, LhsAlign);
+
+ Value *Rhs = nullptr;
+ if (auto *C = dyn_cast<Constant>(RhsSource))
+ Rhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);
+ if (!Rhs)
+ Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
+
+ // Swap bytes if required.
+ if (NeedsBSwap) {
+ Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::bswap, LoadSizeType);
+ Lhs = Builder.CreateCall(Bswap, Lhs);
+ Rhs = Builder.CreateCall(Bswap, Rhs);
+ }
+
+ // Zero extend if required.
+ if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+ Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
+ Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
+ }
+ return {Lhs, Rhs};
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
+ unsigned OffsetBytes) {
+ BasicBlock *BB = LoadCmpBlocks[BlockIndex];
+ Builder.SetInsertPoint(BB);
+ const LoadPair Loads =
+ getLoadPair(Type::getInt8Ty(CI->getContext()), /*NeedsBSwap=*/false,
+ Type::getInt32Ty(CI->getContext()), OffsetBytes);
+ Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);
+
+ PhiRes->addIncoming(Diff, BB);
+
+ if (BlockIndex < (LoadCmpBlocks.size() - 1)) {
+ // Branch to EndBlock for an early exit if a difference is found. Otherwise,
+ // continue to the next LoadCmpBlock.
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+ ConstantInt::get(Diff->getType(), 0));
+ BranchInst *CmpBr =
+ BranchInst::Create(EndBlock, LoadCmpBlocks[BlockIndex + 1], Cmp);
+ Builder.Insert(CmpBr);
+ if (DTU)
+ DTU->applyUpdates(
+ {{DominatorTree::Insert, BB, EndBlock},
+ {DominatorTree::Insert, BB, LoadCmpBlocks[BlockIndex + 1]}});
+ } else {
+ // The last block has an unconditional branch to EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(EndBlock);
+ Builder.Insert(CmpBr);
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Insert, BB, EndBlock}});
+ }
+}
+
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
+ unsigned &LoadIndex) {
+ assert(LoadIndex < getNumLoads() &&
+ "getCompareLoadPairs() called with no remaining loads");
+ std::vector<Value *> XorList, OrList;
+ Value *Diff = nullptr;
+
+ const unsigned NumLoads =
+ std::min(getNumLoads() - LoadIndex, NumLoadsPerBlockForZeroCmp);
+
+ // For a single-block expansion, start inserting before the memcmp call.
+ if (LoadCmpBlocks.empty())
+ Builder.SetInsertPoint(CI);
+ else
+ Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+ Value *Cmp = nullptr;
+ // If we have multiple loads per block, we need to generate a composite
+ // comparison using xor+or. The type for the combinations is the largest load
+ // type.
+ IntegerType *const MaxLoadType =
+ NumLoads == 1 ? nullptr
+ : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
+ const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
+ const LoadPair Loads = getLoadPair(
+ IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8),
+ /*NeedsBSwap=*/false, MaxLoadType, CurLoadEntry.Offset);
+
+ if (NumLoads != 1) {
+ // If we have multiple loads per block, we need to generate a composite
+ // comparison using xor+or.
+ Diff = Builder.CreateXor(Loads.Lhs, Loads.Rhs);
+ Diff = Builder.CreateZExt(Diff, MaxLoadType);
+ XorList.push_back(Diff);
+ } else {
+ // If there's only one load per block, we just compare the loaded values.
+ Cmp = Builder.CreateICmpNE(Loads.Lhs, Loads.Rhs);
+ }
+ }
+
+ auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+ std::vector<Value *> OutList;
+ for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+ Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+ OutList.push_back(Or);
+ }
+ if (InList.size() % 2 != 0)
+ OutList.push_back(InList.back());
+ return OutList;
+ };
+
+ if (!Cmp) {
+ // Pairwise OR the XOR results.
+ OrList = pairWiseOr(XorList);
+
+ // Pairwise OR the OR results until one result left.
+ while (OrList.size() != 1) {
+ OrList = pairWiseOr(OrList);
+ }
+
+ assert(Diff && "Failed to find comparison diff");
+ Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+ }
+
+ return Cmp;
+}
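+// Sketch of the zero-equality comparison built above for a block with three
+// loads (value names are illustrative only):
+//   %x0  = xor %lhs0, %rhs0   ; each xor zext'd to the widest load type
+//   %x1  = xor %lhs1, %rhs1
+//   %x2  = xor %lhs2, %rhs2
+//   %o0  = or  %x0, %x1       ; first pairwise OR round
+//   %o1  = or  %o0, %x2       ; second round, down to a single value
+//   %cmp = icmp ne %o1, 0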
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex,
+ unsigned &LoadIndex) {
+ Value *Cmp = getCompareLoadPairs(BlockIndex, LoadIndex);
+
+ BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+ ? EndBlock
+ : LoadCmpBlocks[BlockIndex + 1];
+ // Early exit branch if difference found to ResultBlock. Otherwise,
+ // continue to next LoadCmpBlock or EndBlock.
+ BasicBlock *BB = Builder.GetInsertBlock();
+ BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+ Builder.Insert(CmpBr);
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB},
+ {DominatorTree::Insert, BB, NextBB}});
+
+ // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+ // since early exit to ResultBlock was not taken (no difference was found in
+ // any of the bytes).
+ if (BlockIndex == LoadCmpBlocks.size() - 1) {
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+ PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+ }
+}
+
+// This function creates the IR instructions for loading and comparing using the
+// given LoadSize. It loads the number of bytes specified by LoadSize from each
+// source of the memcmp parameters. It then does a subtract to see if there was
+// a difference in the loaded values. If a difference is found, it branches
+// with an early exit to the ResultBlock for calculating which source was
+// larger. Otherwise, it falls through to either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled as
+// a special case through emitLoadCompareByteBlock, which simply subtracts
+// the loaded values and adds the difference to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
+ // There is one load per block in this case, BlockIndex == LoadIndex.
+ const LoadEntry &CurLoadEntry = LoadSequence[BlockIndex];
+
+ if (CurLoadEntry.LoadSize == 1) {
+ MemCmpExpansion::emitLoadCompareByteBlock(BlockIndex, CurLoadEntry.Offset);
+ return;
+ }
+
+ Type *LoadSizeType =
+ IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type");
+
+ Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
+
+ const LoadPair Loads =
+ getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(), MaxLoadType,
+ CurLoadEntry.Offset);
+
+ // Add the loaded values to the phi nodes for calculating memcmp result only
+ // if result is not used in a zero equality.
+ if (!IsUsedForZeroCmp) {
+ ResBlock.PhiSrc1->addIncoming(Loads.Lhs, LoadCmpBlocks[BlockIndex]);
+ ResBlock.PhiSrc2->addIncoming(Loads.Rhs, LoadCmpBlocks[BlockIndex]);
+ }
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Loads.Lhs, Loads.Rhs);
+ BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
+ ? EndBlock
+ : LoadCmpBlocks[BlockIndex + 1];
+ // Early exit branch if difference found to ResultBlock. Otherwise, continue
+ // to next LoadCmpBlock or EndBlock.
+ BasicBlock *BB = Builder.GetInsertBlock();
+ BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
+ Builder.Insert(CmpBr);
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Insert, BB, NextBB},
+ {DominatorTree::Insert, BB, ResBlock.BB}});
+
+ // Add a phi edge for the last LoadCmpBlock to EndBlock with a value of 0
+ // since early exit to ResultBlock was not taken (no difference was found in
+ // any of the bytes).
+ if (BlockIndex == LoadCmpBlocks.size() - 1) {
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+ PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]);
+ }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock() {
+ // Special case: if memcmp result is used in a zero equality, result does not
+ // need to be calculated and can simply return 1.
+ if (IsUsedForZeroCmp) {
+ BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+ Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+ Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+ PhiRes->addIncoming(Res, ResBlock.BB);
+ BranchInst *NewBr = BranchInst::Create(EndBlock);
+ Builder.Insert(NewBr);
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}});
+ return;
+ }
+ BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+ Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+ ResBlock.PhiSrc2);
+
+ Value *Res =
+ Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+ ConstantInt::get(Builder.getInt32Ty(), 1));
+
+ PhiRes->addIncoming(Res, ResBlock.BB);
+ BranchInst *NewBr = BranchInst::Create(EndBlock);
+ Builder.Insert(NewBr);
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}});
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ Builder.SetInsertPoint(ResBlock.BB);
+ // Note: this assumes one load per block.
+ ResBlock.PhiSrc1 =
+ Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src1");
+ ResBlock.PhiSrc2 =
+ Builder.CreatePHI(MaxLoadType, NumLoadsNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+ Builder.SetInsertPoint(&EndBlock->front());
+ PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase() {
+ unsigned LoadIndex = 0;
+ // This loop populates each of the LoadCmpBlocks with the IR sequence to
+ // handle multiple loads per block.
+ for (unsigned I = 0; I < getNumBlocks(); ++I) {
+ emitLoadCompareBlockMultipleLoads(I, LoadIndex);
+ }
+
+ emitMemCmpResultBlock();
+ return PhiRes;
+}
+
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
+ unsigned LoadIndex = 0;
+ Value *Cmp = getCompareLoadPairs(0, LoadIndex);
+ assert(LoadIndex == getNumLoads() && "some entries were not consumed");
+ return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
+/// A memcmp expansion that only has one block of load and compare can bypass
+/// the compare, branch, and phi IR that is required in the general case.
+Value *MemCmpExpansion::getMemCmpOneBlock() {
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+ bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
+
+ // The i8 and i16 cases don't need compares. We zext the loaded values and
+ // subtract them to get the suitable negative, zero, or positive i32 result.
+ if (Size < 4) {
+ const LoadPair Loads =
+ getLoadPair(LoadSizeType, NeedsBSwap, Builder.getInt32Ty(),
+ /*Offset*/ 0);
+ return Builder.CreateSub(Loads.Lhs, Loads.Rhs);
+ }
+
+ const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, LoadSizeType,
+ /*Offset*/ 0);
+ // The result of memcmp is negative, zero, or positive, so produce that by
+ // subtracting 2 extended compare bits: sub (ugt, ult).
+ // If a target prefers to use selects to get -1/0/1, they should be able
+ // to transform this later. The inverse transform (going from selects to math)
+ // may not be possible in the DAG because the selects got converted into
+ // branches before we got there.
+ Value *CmpUGT = Builder.CreateICmpUGT(Loads.Lhs, Loads.Rhs);
+ Value *CmpULT = Builder.CreateICmpULT(Loads.Lhs, Loads.Rhs);
+ Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
+ Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
+ return Builder.CreateSub(ZextUGT, ZextULT);
+}
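+// The sub(zext(ugt), zext(ult)) computation above produces the expected sign:
+//   Lhs <  Rhs : 0 - 1 = -1
+//   Lhs == Rhs : 0 - 0 =  0
+//   Lhs >  Rhs : 1 - 0 =  1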
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion() {
+ // Create the basic block framework for a multi-block expansion.
+ if (getNumBlocks() != 1) {
+ BasicBlock *StartBlock = CI->getParent();
+ EndBlock = SplitBlock(StartBlock, CI, DTU, /*LI=*/nullptr,
+ /*MSSAU=*/nullptr, "endblock");
+ setupEndBlockPHINodes();
+ createResultBlock();
+
+ // If return value of memcmp is not used in a zero equality, we need to
+ // calculate which source was larger. The calculation requires the
+ // two loaded source values of each load compare block.
+ // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+ if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+
+ // Create the number of required load compare basic blocks.
+ createLoadCmpBlocks();
+
+ // Update the terminator added by SplitBlock to branch to the first
+ // LoadCmpBlock.
+ StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Insert, StartBlock, LoadCmpBlocks[0]},
+ {DominatorTree::Delete, StartBlock, EndBlock}});
+ }
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ if (IsUsedForZeroCmp)
+ return getNumBlocks() == 1 ? getMemCmpEqZeroOneBlock()
+ : getMemCmpExpansionZeroCase();
+
+ if (getNumBlocks() == 1)
+ return getMemCmpOneBlock();
+
+ for (unsigned I = 0; I < getNumBlocks(); ++I) {
+ emitLoadCompareBlock(I);
+ }
+
+ emitMemCmpResultBlock();
+ return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave as a library call.
+// Otherwise, the library call is replaced with a new IR instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+/// %0 = bitcast i32* %buffer2 to i8*
+/// %1 = bitcast i32* %buffer1 to i8*
+/// %2 = bitcast i8* %1 to i64*
+/// %3 = bitcast i8* %0 to i64*
+/// %4 = load i64, i64* %2
+/// %5 = load i64, i64* %3
+/// %6 = call i64 @llvm.bswap.i64(i64 %4)
+/// %7 = call i64 @llvm.bswap.i64(i64 %5)
+/// %8 = sub i64 %6, %7
+/// %9 = icmp ne i64 %8, 0
+/// br i1 %9, label %res_block, label %loadbb1
+/// res_block: ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+/// %10 = icmp ult i64 %phi.src1, %phi.src2
+/// %11 = select i1 %10, i32 -1, i32 1
+/// br label %endblock
+/// loadbb1: ; preds = %loadbb
+/// %12 = bitcast i32* %buffer2 to i8*
+/// %13 = bitcast i32* %buffer1 to i8*
+/// %14 = bitcast i8* %13 to i32*
+/// %15 = bitcast i8* %12 to i32*
+/// %16 = getelementptr i32, i32* %14, i32 2
+/// %17 = getelementptr i32, i32* %15, i32 2
+/// %18 = load i32, i32* %16
+/// %19 = load i32, i32* %17
+/// %20 = call i32 @llvm.bswap.i32(i32 %18)
+/// %21 = call i32 @llvm.bswap.i32(i32 %19)
+/// %22 = zext i32 %20 to i64
+/// %23 = zext i32 %21 to i64
+/// %24 = sub i64 %22, %23
+/// %25 = icmp ne i64 %24, 0
+/// br i1 %25, label %res_block, label %loadbb2
+/// loadbb2: ; preds = %loadbb1
+/// %26 = bitcast i32* %buffer2 to i8*
+/// %27 = bitcast i32* %buffer1 to i8*
+/// %28 = bitcast i8* %27 to i16*
+/// %29 = bitcast i8* %26 to i16*
+/// %30 = getelementptr i16, i16* %28, i16 6
+/// %31 = getelementptr i16, i16* %29, i16 6
+/// %32 = load i16, i16* %30
+/// %33 = load i16, i16* %31
+/// %34 = call i16 @llvm.bswap.i16(i16 %32)
+/// %35 = call i16 @llvm.bswap.i16(i16 %33)
+/// %36 = zext i16 %34 to i64
+/// %37 = zext i16 %35 to i64
+/// %38 = sub i64 %36, %37
+/// %39 = icmp ne i64 %38, 0
+/// br i1 %39, label %res_block, label %loadbb3
+/// loadbb3: ; preds = %loadbb2
+/// %40 = bitcast i32* %buffer2 to i8*
+/// %41 = bitcast i32* %buffer1 to i8*
+/// %42 = getelementptr i8, i8* %41, i8 14
+/// %43 = getelementptr i8, i8* %40, i8 14
+/// %44 = load i8, i8* %42
+/// %45 = load i8, i8* %43
+/// %46 = zext i8 %44 to i32
+/// %47 = zext i8 %45 to i32
+/// %48 = sub i32 %46, %47
+/// br label %endblock
+/// endblock: ; preds = %res_block,
+/// %loadbb3
+/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+/// ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+ const TargetLowering *TLI, const DataLayout *DL,
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+ DomTreeUpdater *DTU, const bool IsBCmp) {
+ NumMemCmpCalls++;
+
+ // Early exit from expansion if -Oz.
+ if (CI->getFunction()->hasMinSize())
+ return false;
+
+ // Early exit from expansion if size is not a constant.
+ ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!SizeCast) {
+ NumMemCmpNotConstant++;
+ return false;
+ }
+ const uint64_t SizeVal = SizeCast->getZExtValue();
+
+ if (SizeVal == 0) {
+ return false;
+ }
+ // TTI call to check if target would like to expand memcmp. Also, get the
+ // available load sizes.
+ const bool IsUsedForZeroCmp =
+ IsBCmp || isOnlyUsedInZeroEqualityComparison(CI);
+ bool OptForSize = CI->getFunction()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
+ auto Options = TTI->enableMemCmpExpansion(OptForSize,
+ IsUsedForZeroCmp);
+ if (!Options) return false;
+
+ if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
+ Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
+
+ if (OptForSize &&
+ MaxLoadsPerMemcmpOptSize.getNumOccurrences())
+ Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
+
+ if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
+ Options.MaxNumLoads = MaxLoadsPerMemcmp;
+
+ MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL, DTU);
+
+ // Don't expand if this will require more loads than desired by the target.
+ if (Expansion.getNumLoads() == 0) {
+ NumMemCmpGreaterThanMax++;
+ return false;
+ }
+
+ NumMemCmpInlined++;
+
+ Value *Res = Expansion.getMemCmpExpansion();
+
+ // Replace call with result of expansion and erase call.
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+
+ return true;
+}
+
+class ExpandMemCmpPass : public FunctionPass {
+public:
+ static char ID;
+
+ ExpandMemCmpPass() : FunctionPass(ID) {
+ initializeExpandMemCmpPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F)) return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC) {
+ return false;
+ }
+ const TargetLowering* TL =
+ TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
+
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+ DominatorTree *DT = nullptr;
+ if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DT = &DTWP->getDomTree();
+ auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI, DT);
+ return !PA.areAllPreserved();
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ const TargetLowering *TL, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI, DominatorTree *DT);
+ // Returns true if a change was made.
+ bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, const TargetLowering *TL,
+ const DataLayout &DL, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI, DomTreeUpdater *DTU);
+};
+
+bool ExpandMemCmpPass::runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ const TargetLowering *TL,
+ const DataLayout &DL, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI,
+ DomTreeUpdater *DTU) {
+ for (Instruction& I : BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI) {
+ continue;
+ }
+ LibFunc Func;
+ if (TLI->getLibFunc(*CI, Func) &&
+ (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
+ expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU, Func == LibFunc_bcmp)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+PreservedAnalyses
+ExpandMemCmpPass::runImpl(Function &F, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ const TargetLowering *TL, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI, DominatorTree *DT) {
+ std::optional<DomTreeUpdater> DTU;
+ if (DT)
+ DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ const DataLayout& DL = F.getParent()->getDataLayout();
+ bool MadeChanges = false;
+ for (auto BBIt = F.begin(); BBIt != F.end();) {
+ if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {
+ MadeChanges = true;
+ // If changes were made, restart the function from the beginning, since
+ // the structure of the function was changed.
+ BBIt = F.begin();
+ } else {
+ ++BBIt;
+ }
+ }
+ if (MadeChanges)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+ if (!MadeChanges)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+} // namespace
+
+char ExpandMemCmpPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ExpandMemCmpPass, "expandmemcmp",
+ "Expand memcmp() to load/stores", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ExpandMemCmpPass, "expandmemcmp",
+ "Expand memcmp() to load/stores", false, false)
+
+FunctionPass *llvm::createExpandMemCmpPass() {
+ return new ExpandMemCmpPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
new file mode 100644
index 000000000000..3a79f20f4732
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -0,0 +1,161 @@
+//===-- ExpandPostRAPseudos.cpp - Pseudo instruction expansion pass -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that expands COPY and SUBREG_TO_REG pseudo
+// instructions after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "postrapseudos"
+
+namespace {
+struct ExpandPostRA : public MachineFunctionPass {
+private:
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ ExpandPostRA() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// runOnMachineFunction - pass entry point
+ bool runOnMachineFunction(MachineFunction&) override;
+
+private:
+ bool LowerSubregToReg(MachineInstr *MI);
+};
+} // end anonymous namespace
+
+char ExpandPostRA::ID = 0;
+char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID;
+
+INITIALIZE_PASS(ExpandPostRA, DEBUG_TYPE,
+ "Post-RA pseudo instruction expansion pass", false, false)
+
+bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
+ MachineBasicBlock *MBB = MI->getParent();
+ assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) &&
+ MI->getOperand(1).isImm() &&
+ (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) &&
+ MI->getOperand(3).isImm() && "Invalid subreg_to_reg");
+
+ Register DstReg = MI->getOperand(0).getReg();
+ Register InsReg = MI->getOperand(2).getReg();
+ assert(!MI->getOperand(2).getSubReg() && "SubIdx on physreg?");
+ unsigned SubIdx = MI->getOperand(3).getImm();
+
+ assert(SubIdx != 0 && "Invalid index for insert_subreg");
+ Register DstSubReg = TRI->getSubReg(DstReg, SubIdx);
+
+ assert(DstReg.isPhysical() &&
+ "Insert destination must be in a physical register");
+ assert(InsReg.isPhysical() &&
+ "Inserted value must be in a physical register");
+
+ LLVM_DEBUG(dbgs() << "subreg: CONVERTING: " << *MI);
+
+ if (MI->allDefsAreDead()) {
+ MI->setDesc(TII->get(TargetOpcode::KILL));
+ MI->removeOperand(3); // SubIdx
+ MI->removeOperand(1); // Imm
+ LLVM_DEBUG(dbgs() << "subreg: replaced by: " << *MI);
+ return true;
+ }
+
+ if (DstSubReg == InsReg) {
+ // No need to insert an identity copy instruction.
+ // Watch out for case like this:
+ // %rax = SUBREG_TO_REG 0, killed %eax, 3
+ // We must leave %rax live.
+ if (DstReg != InsReg) {
+ MI->setDesc(TII->get(TargetOpcode::KILL));
+ MI->removeOperand(3); // SubIdx
+ MI->removeOperand(1); // Imm
+ LLVM_DEBUG(dbgs() << "subreg: replace by: " << *MI);
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << "subreg: eliminated!");
+ } else {
+ TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg,
+ MI->getOperand(2).isKill());
+
+ // Implicitly define DstReg for subsequent uses.
+ MachineBasicBlock::iterator CopyMI = MI;
+ --CopyMI;
+ CopyMI->addRegisterDefined(DstReg);
+ LLVM_DEBUG(dbgs() << "subreg: " << *CopyMI);
+ }
+
+ LLVM_DEBUG(dbgs() << '\n');
+ MBB->erase(MI);
+ return true;
+}
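+// Illustrative sketch (hypothetical X86 registers): a pseudo such as
+//   $rax = SUBREG_TO_REG 0, $ecx, %subreg.sub_32bit
+// is lowered above to a copy into the destination's sub-register, with an
+// implicit def added so the full register stays live:
+//   $eax = COPY $ecx, implicit-def $rax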
+
+/// runOnMachineFunction - Reduce subregister inserts and extracts to register
+/// copies.
+///
+bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "Machine Function\n"
+ << "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ bool MadeChange = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ // Only expand pseudos.
+ if (!MI.isPseudo())
+ continue;
+
+ // Give targets a chance to expand even standard pseudos.
+ if (TII->expandPostRAPseudo(MI)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // Expand standard pseudos.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::SUBREG_TO_REG:
+ MadeChange |= LowerSubregToReg(&MI);
+ break;
+ case TargetOpcode::COPY:
+ TII->lowerCopy(&MI, TRI);
+ MadeChange = true;
+ break;
+ case TargetOpcode::DBG_VALUE:
+ continue;
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::EXTRACT_SUBREG:
+ llvm_unreachable("Sub-register pseudos should have been eliminated.");
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ExpandReductions.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ExpandReductions.cpp
new file mode 100644
index 000000000000..79b6dc9154b3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -0,0 +1,240 @@
+//===- ExpandReductions.cpp - Expand reduction intrinsics -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for reduction intrinsics, allowing targets
+// to enable the intrinsics until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ExpandReductions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+namespace {
+
+unsigned getOpcode(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::vector_reduce_fadd:
+ return Instruction::FAdd;
+ case Intrinsic::vector_reduce_fmul:
+ return Instruction::FMul;
+ case Intrinsic::vector_reduce_add:
+ return Instruction::Add;
+ case Intrinsic::vector_reduce_mul:
+ return Instruction::Mul;
+ case Intrinsic::vector_reduce_and:
+ return Instruction::And;
+ case Intrinsic::vector_reduce_or:
+ return Instruction::Or;
+ case Intrinsic::vector_reduce_xor:
+ return Instruction::Xor;
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_umin:
+ return Instruction::ICmp;
+ case Intrinsic::vector_reduce_fmax:
+ case Intrinsic::vector_reduce_fmin:
+ return Instruction::FCmp;
+ default:
+ llvm_unreachable("Unexpected ID");
+ }
+}
+
+RecurKind getRK(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::vector_reduce_smax:
+ return RecurKind::SMax;
+ case Intrinsic::vector_reduce_smin:
+ return RecurKind::SMin;
+ case Intrinsic::vector_reduce_umax:
+ return RecurKind::UMax;
+ case Intrinsic::vector_reduce_umin:
+ return RecurKind::UMin;
+ case Intrinsic::vector_reduce_fmax:
+ return RecurKind::FMax;
+ case Intrinsic::vector_reduce_fmin:
+ return RecurKind::FMin;
+ default:
+ return RecurKind::None;
+ }
+}
+
+bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
+ bool Changed = false;
+ SmallVector<IntrinsicInst *, 4> Worklist;
+ for (auto &I : instructions(F)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::vector_reduce_fadd:
+ case Intrinsic::vector_reduce_fmul:
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_fmax:
+ case Intrinsic::vector_reduce_fmin:
+ if (TTI->shouldExpandReduction(II))
+ Worklist.push_back(II);
+
+ break;
+ }
+ }
+ }
+
+ for (auto *II : Worklist) {
+ FastMathFlags FMF =
+ isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
+ Intrinsic::ID ID = II->getIntrinsicID();
+ RecurKind RK = getRK(ID);
+
+ Value *Rdx = nullptr;
+ IRBuilder<> Builder(II);
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(FMF);
+ switch (ID) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::vector_reduce_fadd:
+ case Intrinsic::vector_reduce_fmul: {
+ // Unless the call carries the 'reassoc' fast-math flag, this is an ordered
+ // reduction and cannot be handled by generating a shuffle sequence.
+ Value *Acc = II->getArgOperand(0);
+ Value *Vec = II->getArgOperand(1);
+ if (!FMF.allowReassoc())
+ Rdx = getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), RK);
+ else {
+ if (!isPowerOf2_32(
+ cast<FixedVectorType>(Vec->getType())->getNumElements()))
+ continue;
+
+ Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), RK);
+ Rdx = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(ID),
+ Acc, Rdx, "bin.rdx");
+ }
+ break;
+ }
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or: {
+ // Canonicalize logical or/and reductions:
+ // Or reduction for i1 is represented as:
+ // %val = bitcast <ReduxWidth x i1> to iReduxWidth
+ // %res = cmp ne iReduxWidth %val, 0
+ // And reduction for i1 is represented as:
+ // %val = bitcast <ReduxWidth x i1> to iReduxWidth
+ // %res = cmp eq iReduxWidth %val, -1 (all bits set)
+ Value *Vec = II->getArgOperand(0);
+ auto *FTy = cast<FixedVectorType>(Vec->getType());
+ unsigned NumElts = FTy->getNumElements();
+ if (!isPowerOf2_32(NumElts))
+ continue;
+
+ if (FTy->getElementType() == Builder.getInt1Ty()) {
+ Rdx = Builder.CreateBitCast(Vec, Builder.getIntNTy(NumElts));
+ if (ID == Intrinsic::vector_reduce_and) {
+ Rdx = Builder.CreateICmpEQ(
+ Rdx, ConstantInt::getAllOnesValue(Rdx->getType()));
+ } else {
+ assert(ID == Intrinsic::vector_reduce_or && "Expected or reduction.");
+ Rdx = Builder.CreateIsNotNull(Rdx);
+ }
+ break;
+ }
+
+ Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), RK);
+ break;
+ }
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_umin: {
+ Value *Vec = II->getArgOperand(0);
+ if (!isPowerOf2_32(
+ cast<FixedVectorType>(Vec->getType())->getNumElements()))
+ continue;
+
+ Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), RK);
+ break;
+ }
+ case Intrinsic::vector_reduce_fmax:
+ case Intrinsic::vector_reduce_fmin: {
+ // We require "nnan" to use a shuffle reduction; "nsz" is implied by the
+ // semantics of the reduction.
+ Value *Vec = II->getArgOperand(0);
+ if (!isPowerOf2_32(
+ cast<FixedVectorType>(Vec->getType())->getNumElements()) ||
+ !FMF.noNaNs())
+ continue;
+
+ Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), RK);
+ break;
+ }
+ }
+ II->replaceAllUsesWith(Rdx);
+ II->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
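+// Rough sketch of what the shuffle-reduction expansion looks like for
+//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
+// (log2(N) shuffle/op steps followed by an element extract):
+//   %s1 = shufflevector <4 x i32> %v,  <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+//   %a1 = add <4 x i32> %v, %s1
+//   %s2 = shufflevector <4 x i32> %a1, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+//   %a2 = add <4 x i32> %a1, %s2
+//   %r  = extractelement <4 x i32> %a2, i32 0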
+
+class ExpandReductions : public FunctionPass {
+public:
+ static char ID;
+ ExpandReductions() : FunctionPass(ID) {
+ initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return expandReductions(F, TTI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char ExpandReductions::ID;
+INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+ return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!expandReductions(F, &TTI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ExpandVectorPredication.cpp
new file mode 100644
index 000000000000..9807be0bea39
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -0,0 +1,769 @@
+//===----- CodeGen/ExpandVectorPredication.cpp - Expand VP intrinsics -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for vector predication intrinsics, allowing
+// targets to enable vector predication until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ExpandVectorPredication.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <optional>
+
+using namespace llvm;
+
+using VPLegalization = TargetTransformInfo::VPLegalization;
+using VPTransform = TargetTransformInfo::VPLegalization::VPTransform;
+
+// Keep this in sync with TargetTransformInfo::VPLegalization.
+#define VPINTERNAL_VPLEGAL_CASES \
+ VPINTERNAL_CASE(Legal) \
+ VPINTERNAL_CASE(Discard) \
+ VPINTERNAL_CASE(Convert)
+
+#define VPINTERNAL_CASE(X) "|" #X
+
+// Override options.
+static cl::opt<std::string> EVLTransformOverride(
+ "expandvp-override-evl-transform", cl::init(""), cl::Hidden,
+ cl::desc("Options: <empty>" VPINTERNAL_VPLEGAL_CASES
+ ". If non-empty, ignore "
+ "TargetTransformInfo and "
+ "always use this transformation for the %evl parameter (Used in "
+ "testing)."));
+
+static cl::opt<std::string> MaskTransformOverride(
+ "expandvp-override-mask-transform", cl::init(""), cl::Hidden,
+ cl::desc("Options: <empty>" VPINTERNAL_VPLEGAL_CASES
+ ". If non-empty, Ignore "
+ "TargetTransformInfo and "
+ "always use this transformation for the %mask parameter (Used in "
+ "testing)."));
+
+#undef VPINTERNAL_CASE
+#define VPINTERNAL_CASE(X) .Case(#X, VPLegalization::X)
+
+static VPTransform parseOverrideOption(const std::string &TextOpt) {
+ return StringSwitch<VPTransform>(TextOpt) VPINTERNAL_VPLEGAL_CASES;
+}
+
+#undef VPINTERNAL_VPLEGAL_CASES
+
+// Whether any override options are set.
+static bool anyExpandVPOverridesSet() {
+ return !EVLTransformOverride.empty() || !MaskTransformOverride.empty();
+}
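+// For example, a hypothetical testing invocation that forces every %evl
+// parameter to be folded into the mask while leaving the mask itself alone:
+//   llc -expandvp-override-evl-transform=Convert \
+//       -expandvp-override-mask-transform=Legal input.ll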
+
+#define DEBUG_TYPE "expandvp"
+
+STATISTIC(NumFoldedVL, "Number of folded vector length params");
+STATISTIC(NumLoweredVPOps, "Number of folded vector predication operations");
+
+///// Helpers {
+
+/// \returns Whether the vector mask \p MaskVal has all lane bits set.
+static bool isAllTrueMask(Value *MaskVal) {
+ if (Value *SplattedVal = getSplatValue(MaskVal))
+ if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+ return ConstValue->isAllOnesValue();
+
+ return false;
+}
+
+/// \returns A non-excepting divisor constant for this type.
+static Constant *getSafeDivisor(Type *DivTy) {
+ assert(DivTy->isIntOrIntVectorTy() && "Unsupported divisor type");
+ return ConstantInt::get(DivTy, 1u, false);
+}
+
+/// Transfer operation properties from \p OldVPI to \p NewVal.
+static void transferDecorations(Value &NewVal, VPIntrinsic &VPI) {
+ auto *NewInst = dyn_cast<Instruction>(&NewVal);
+ if (!NewInst || !isa<FPMathOperator>(NewVal))
+ return;
+
+ auto *OldFMOp = dyn_cast<FPMathOperator>(&VPI);
+ if (!OldFMOp)
+ return;
+
+ NewInst->setFastMathFlags(OldFMOp->getFastMathFlags());
+}
+
+/// Transfer all properties from \p OldOp to \p NewOp and replace all uses.
+/// OldVP gets erased.
+static void replaceOperation(Value &NewOp, VPIntrinsic &OldOp) {
+ transferDecorations(NewOp, OldOp);
+ OldOp.replaceAllUsesWith(&NewOp);
+ OldOp.eraseFromParent();
+}
+
+static bool maySpeculateLanes(VPIntrinsic &VPI) {
+ // The result of VP reductions depends on the mask and evl.
+ if (isa<VPReductionIntrinsic>(VPI))
+ return false;
+ // Fallback to whether the intrinsic is speculatable.
+ std::optional<unsigned> OpcOpt = VPI.getFunctionalOpcode();
+ unsigned FunctionalOpc = OpcOpt.value_or((unsigned)Instruction::Call);
+ return isSafeToSpeculativelyExecuteWithOpcode(FunctionalOpc, &VPI);
+}
+
+//// } Helpers
+
+namespace {
+
+// Expansion pass state at function scope.
+struct CachingVPExpander {
+ Function &F;
+ const TargetTransformInfo &TTI;
+
+ /// \returns A (fixed length) vector with ascending integer indices
+ /// (<0, 1, ..., NumElems-1>).
+ /// \p Builder
+ /// Used for instruction creation.
+ /// \p LaneTy
+ /// Integer element type of the result vector.
+ /// \p NumElems
+ /// Number of vector elements.
+ Value *createStepVector(IRBuilder<> &Builder, Type *LaneTy,
+ unsigned NumElems);
+
+ /// \returns A bitmask that is true where the lane position is less-than \p
+ /// EVLParam
+ ///
+ /// \p Builder
+ /// Used for instruction creation.
+ /// \p EVLParam
+ /// The explicit vector length parameter to test against the lane
+ /// positions.
+ /// \p ElemCount
+ /// Static (potentially scalable) number of vector elements.
+ Value *convertEVLToMask(IRBuilder<> &Builder, Value *EVLParam,
+ ElementCount ElemCount);
+
+ Value *foldEVLIntoMask(VPIntrinsic &VPI);
+
+ /// "Remove" the %evl parameter of \p PI by setting it to the static vector
+ /// length of the operation.
+ void discardEVLParameter(VPIntrinsic &PI);
+
+ /// Lower this VP binary operator to an unpredicated binary operator.
+ Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder,
+ VPIntrinsic &PI);
+
+ /// Lower this VP fp call to an unpredicated fp call.
+ Value *expandPredicationToFPCall(IRBuilder<> &Builder, VPIntrinsic &PI,
+ unsigned UnpredicatedIntrinsicID);
+
+ /// Lower this VP reduction to a call to an unpredicated reduction intrinsic.
+ Value *expandPredicationInReduction(IRBuilder<> &Builder,
+ VPReductionIntrinsic &PI);
+
+ /// Lower this VP memory operation to a non-VP intrinsic.
+ Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+ VPIntrinsic &VPI);
+
+ /// Lower this VP comparison to a call to an unpredicated comparison.
+ Value *expandPredicationInComparison(IRBuilder<> &Builder,
+ VPCmpIntrinsic &PI);
+
+ /// Query TTI and expand the vector predication in \p PI accordingly.
+ Value *expandPredication(VPIntrinsic &PI);
+
+ /// Determine how and whether the VPIntrinsic \p VPI shall be expanded. This
+ /// overrides TTI with the cl::opts listed at the top of this file.
+ VPLegalization getVPLegalizationStrategy(const VPIntrinsic &VPI) const;
+ bool UsingTTIOverrides;
+
+public:
+ CachingVPExpander(Function &F, const TargetTransformInfo &TTI)
+ : F(F), TTI(TTI), UsingTTIOverrides(anyExpandVPOverridesSet()) {}
+
+ bool expandVectorPredication();
+};
+
+//// CachingVPExpander {
+
+Value *CachingVPExpander::createStepVector(IRBuilder<> &Builder, Type *LaneTy,
+ unsigned NumElems) {
+ // TODO add caching
+ SmallVector<Constant *, 16> ConstElems;
+
+ for (unsigned Idx = 0; Idx < NumElems; ++Idx)
+ ConstElems.push_back(ConstantInt::get(LaneTy, Idx, false));
+
+ return ConstantVector::get(ConstElems);
+}
+
+Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder,
+ Value *EVLParam,
+ ElementCount ElemCount) {
+ // TODO add caching
+ // Scalable vector %evl conversion.
+ if (ElemCount.isScalable()) {
+ auto *M = Builder.GetInsertBlock()->getModule();
+ Type *BoolVecTy = VectorType::get(Builder.getInt1Ty(), ElemCount);
+ Function *ActiveMaskFunc = Intrinsic::getDeclaration(
+ M, Intrinsic::get_active_lane_mask, {BoolVecTy, EVLParam->getType()});
+ // `get_active_lane_mask` performs an implicit less-than comparison.
+ Value *ConstZero = Builder.getInt32(0);
+ return Builder.CreateCall(ActiveMaskFunc, {ConstZero, EVLParam});
+ }
+
+ // Fixed vector %evl conversion.
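+ // For example, with %evl = 3 on a 4-element vector this compares
+ // <0, 1, 2, 3> u< <3, 3, 3, 3>, producing the mask <1, 1, 1, 0>.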
+ Type *LaneTy = EVLParam->getType();
+ unsigned NumElems = ElemCount.getFixedValue();
+ Value *VLSplat = Builder.CreateVectorSplat(NumElems, EVLParam);
+ Value *IdxVec = createStepVector(Builder, LaneTy, NumElems);
+ return Builder.CreateICmp(CmpInst::ICMP_ULT, IdxVec, VLSplat);
+}
+
+Value *
+CachingVPExpander::expandPredicationInBinaryOperator(IRBuilder<> &Builder,
+ VPIntrinsic &VPI) {
+ assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
+ "Implicitly dropping %evl in non-speculatable operator!");
+
+ auto OC = static_cast<Instruction::BinaryOps>(*VPI.getFunctionalOpcode());
+ assert(Instruction::isBinaryOp(OC));
+
+ Value *Op0 = VPI.getOperand(0);
+ Value *Op1 = VPI.getOperand(1);
+ Value *Mask = VPI.getMaskParam();
+
+ // Blend in safe operands.
+ if (Mask && !isAllTrueMask(Mask)) {
+ switch (OC) {
+ default:
+ // Can safely ignore the predicate.
+ break;
+
+ // Division operators need a safe divisor on masked-off lanes (1).
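+ // For example, a masked-off lane of a vp.sdiv may hold a zero divisor;
+ // once the predicate is dropped that lane would divide by zero, so a
+ // divisor of 1 is selected into all disabled lanes first.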
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // 2nd operand must not be zero.
+ Value *SafeDivisor = getSafeDivisor(VPI.getType());
+ Op1 = Builder.CreateSelect(Mask, Op1, SafeDivisor);
+ }
+ }
+
+ Value *NewBinOp = Builder.CreateBinOp(OC, Op0, Op1, VPI.getName());
+
+ replaceOperation(*NewBinOp, VPI);
+ return NewBinOp;
+}
+
+Value *CachingVPExpander::expandPredicationToFPCall(
+ IRBuilder<> &Builder, VPIntrinsic &VPI, unsigned UnpredicatedIntrinsicID) {
+ assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
+ "Implicitly dropping %evl in non-speculatable operator!");
+
+ switch (UnpredicatedIntrinsicID) {
+ case Intrinsic::fabs:
+ case Intrinsic::sqrt: {
+ Value *Op0 = VPI.getOperand(0);
+ Function *Fn = Intrinsic::getDeclaration(
+ VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()});
+ Value *NewOp = Builder.CreateCall(Fn, {Op0}, VPI.getName());
+ replaceOperation(*NewOp, VPI);
+ return NewOp;
+ }
+ case Intrinsic::experimental_constrained_fma:
+ case Intrinsic::experimental_constrained_fmuladd: {
+ Value *Op0 = VPI.getOperand(0);
+ Value *Op1 = VPI.getOperand(1);
+ Value *Op2 = VPI.getOperand(2);
+ Function *Fn = Intrinsic::getDeclaration(
+ VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()});
+ Value *NewOp =
+ Builder.CreateConstrainedFPCall(Fn, {Op0, Op1, Op2}, VPI.getName());
+ replaceOperation(*NewOp, VPI);
+ return NewOp;
+ }
+ }
+
+ return nullptr;
+}
+
+static Value *getNeutralReductionElement(const VPReductionIntrinsic &VPI,
+ Type *EltTy) {
+ bool Negative = false;
+ unsigned EltBits = EltTy->getScalarSizeInBits();
+ switch (VPI.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Expecting a VP reduction intrinsic");
+ case Intrinsic::vp_reduce_add:
+ case Intrinsic::vp_reduce_or:
+ case Intrinsic::vp_reduce_xor:
+ case Intrinsic::vp_reduce_umax:
+ return Constant::getNullValue(EltTy);
+ case Intrinsic::vp_reduce_mul:
+ return ConstantInt::get(EltTy, 1, /*IsSigned*/ false);
+ case Intrinsic::vp_reduce_and:
+ case Intrinsic::vp_reduce_umin:
+ return ConstantInt::getAllOnesValue(EltTy);
+ case Intrinsic::vp_reduce_smin:
+ return ConstantInt::get(EltTy->getContext(),
+ APInt::getSignedMaxValue(EltBits));
+ case Intrinsic::vp_reduce_smax:
+ return ConstantInt::get(EltTy->getContext(),
+ APInt::getSignedMinValue(EltBits));
+ case Intrinsic::vp_reduce_fmax:
+ Negative = true;
+ [[fallthrough]];
+ case Intrinsic::vp_reduce_fmin: {
+ FastMathFlags Flags = VPI.getFastMathFlags();
+ const fltSemantics &Semantics = EltTy->getFltSemantics();
+ return !Flags.noNaNs() ? ConstantFP::getQNaN(EltTy, Negative)
+ : !Flags.noInfs()
+ ? ConstantFP::getInfinity(EltTy, Negative)
+ : ConstantFP::get(EltTy,
+ APFloat::getLargest(Semantics, Negative));
+ }
+ case Intrinsic::vp_reduce_fadd:
+ return ConstantFP::getNegativeZero(EltTy);
+ case Intrinsic::vp_reduce_fmul:
+ return ConstantFP::get(EltTy, 1.0);
+ }
+}
+
+Value *
+CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
+ VPReductionIntrinsic &VPI) {
+ assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
+ "Implicitly dropping %evl in non-speculatable operator!");
+
+ Value *Mask = VPI.getMaskParam();
+ Value *RedOp = VPI.getOperand(VPI.getVectorParamPos());
+
+ // Insert neutral element in masked-out positions
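+ // (e.g. 0 for vp.reduce.add, 1 for vp.reduce.mul, -0.0 for vp.reduce.fadd)
+ // so that disabled lanes do not affect the reduction result.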
+ if (Mask && !isAllTrueMask(Mask)) {
+ auto *NeutralElt = getNeutralReductionElement(VPI, VPI.getType());
+ auto *NeutralVector = Builder.CreateVectorSplat(
+ cast<VectorType>(RedOp->getType())->getElementCount(), NeutralElt);
+ RedOp = Builder.CreateSelect(Mask, RedOp, NeutralVector);
+ }
+
+ Value *Reduction;
+ Value *Start = VPI.getOperand(VPI.getStartParamPos());
+
+ switch (VPI.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Impossible reduction kind");
+ case Intrinsic::vp_reduce_add:
+ Reduction = Builder.CreateAddReduce(RedOp);
+ Reduction = Builder.CreateAdd(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_mul:
+ Reduction = Builder.CreateMulReduce(RedOp);
+ Reduction = Builder.CreateMul(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_and:
+ Reduction = Builder.CreateAndReduce(RedOp);
+ Reduction = Builder.CreateAnd(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_or:
+ Reduction = Builder.CreateOrReduce(RedOp);
+ Reduction = Builder.CreateOr(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_xor:
+ Reduction = Builder.CreateXorReduce(RedOp);
+ Reduction = Builder.CreateXor(Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_smax:
+ Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ true);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::smax, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_smin:
+ Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ true);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::smin, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_umax:
+ Reduction = Builder.CreateIntMaxReduce(RedOp, /*IsSigned*/ false);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::umax, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_umin:
+ Reduction = Builder.CreateIntMinReduce(RedOp, /*IsSigned*/ false);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::umin, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_fmax:
+ Reduction = Builder.CreateFPMaxReduce(RedOp);
+ transferDecorations(*Reduction, VPI);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_fmin:
+ Reduction = Builder.CreateFPMinReduce(RedOp);
+ transferDecorations(*Reduction, VPI);
+ Reduction =
+ Builder.CreateBinaryIntrinsic(Intrinsic::minnum, Reduction, Start);
+ break;
+ case Intrinsic::vp_reduce_fadd:
+ Reduction = Builder.CreateFAddReduce(Start, RedOp);
+ break;
+ case Intrinsic::vp_reduce_fmul:
+ Reduction = Builder.CreateFMulReduce(Start, RedOp);
+ break;
+ }
+
+ replaceOperation(*Reduction, VPI);
+ return Reduction;
+}
+
+Value *
+CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+ VPIntrinsic &VPI) {
+ assert(VPI.canIgnoreVectorLengthParam());
+
+ const auto &DL = F.getParent()->getDataLayout();
+
+ Value *MaskParam = VPI.getMaskParam();
+ Value *PtrParam = VPI.getMemoryPointerParam();
+ Value *DataParam = VPI.getMemoryDataParam();
+ bool IsUnmasked = isAllTrueMask(MaskParam);
+
+ MaybeAlign AlignOpt = VPI.getPointerAlignment();
+
+ Value *NewMemoryInst = nullptr;
+ switch (VPI.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Not a VP memory intrinsic");
+ case Intrinsic::vp_store:
+ if (IsUnmasked) {
+ StoreInst *NewStore =
+ Builder.CreateStore(DataParam, PtrParam, /*IsVolatile*/ false);
+ if (AlignOpt.has_value())
+ NewStore->setAlignment(*AlignOpt);
+ NewMemoryInst = NewStore;
+ } else
+ NewMemoryInst = Builder.CreateMaskedStore(
+ DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+ break;
+ case Intrinsic::vp_load:
+ if (IsUnmasked) {
+ LoadInst *NewLoad =
+ Builder.CreateLoad(VPI.getType(), PtrParam, /*IsVolatile*/ false);
+ if (AlignOpt.has_value())
+ NewLoad->setAlignment(*AlignOpt);
+ NewMemoryInst = NewLoad;
+ } else
+ NewMemoryInst = Builder.CreateMaskedLoad(
+ VPI.getType(), PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+ break;
+ case Intrinsic::vp_scatter: {
+ auto *ElementType =
+ cast<VectorType>(DataParam->getType())->getElementType();
+ NewMemoryInst = Builder.CreateMaskedScatter(
+ DataParam, PtrParam,
+ AlignOpt.value_or(DL.getPrefTypeAlign(ElementType)), MaskParam);
+ break;
+ }
+ case Intrinsic::vp_gather: {
+ auto *ElementType = cast<VectorType>(VPI.getType())->getElementType();
+ NewMemoryInst = Builder.CreateMaskedGather(
+ VPI.getType(), PtrParam,
+ AlignOpt.value_or(DL.getPrefTypeAlign(ElementType)), MaskParam, nullptr,
+ VPI.getName());
+ break;
+ }
+ }
+
+ assert(NewMemoryInst);
+ replaceOperation(*NewMemoryInst, VPI);
+ return NewMemoryInst;
+}
+
+Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder,
+ VPCmpIntrinsic &VPI) {
+ assert((maySpeculateLanes(VPI) || VPI.canIgnoreVectorLengthParam()) &&
+ "Implicitly dropping %evl in non-speculatable operator!");
+
+ assert(*VPI.getFunctionalOpcode() == Instruction::ICmp ||
+ *VPI.getFunctionalOpcode() == Instruction::FCmp);
+
+ Value *Op0 = VPI.getOperand(0);
+ Value *Op1 = VPI.getOperand(1);
+ auto Pred = VPI.getPredicate();
+
+ auto *NewCmp = Builder.CreateCmp(Pred, Op0, Op1);
+
+ replaceOperation(*NewCmp, VPI);
+ return NewCmp;
+}
+
+void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
+ LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");
+
+ if (VPI.canIgnoreVectorLengthParam())
+ return;
+
+ Value *EVLParam = VPI.getVectorLengthParam();
+ if (!EVLParam)
+ return;
+
+ ElementCount StaticElemCount = VPI.getStaticVectorLength();
+ Value *MaxEVL = nullptr;
+ Type *Int32Ty = Type::getInt32Ty(VPI.getContext());
+ if (StaticElemCount.isScalable()) {
+ // TODO add caching
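+ // For a scalable type such as <vscale x 4 x i32>, the maximum EVL is
+ // 4 * vscale, so it has to be computed at runtime via llvm.vscale.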
+ auto *M = VPI.getModule();
+ Function *VScaleFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::vscale, Int32Ty);
+ IRBuilder<> Builder(VPI.getParent(), VPI.getIterator());
+ Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue());
+ Value *VScale = Builder.CreateCall(VScaleFunc, {}, "vscale");
+ MaxEVL = Builder.CreateMul(VScale, FactorConst, "scalable_size",
+ /*NUW*/ true, /*NSW*/ false);
+ } else {
+ MaxEVL = ConstantInt::get(Int32Ty, StaticElemCount.getFixedValue(), false);
+ }
+ VPI.setVectorLengthParam(MaxEVL);
+}
+
+Value *CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) {
+ LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n');
+
+ IRBuilder<> Builder(&VPI);
+
+ // The %evl parameter is ineffective, so there is nothing to do here.
+ if (VPI.canIgnoreVectorLengthParam())
+ return &VPI;
+
+ // Only VP intrinsics can have an %evl parameter.
+ Value *OldMaskParam = VPI.getMaskParam();
+ Value *OldEVLParam = VPI.getVectorLengthParam();
+ assert(OldMaskParam && "no mask param to fold the vl param into");
+ assert(OldEVLParam && "no EVL param to fold away");
+
+ LLVM_DEBUG(dbgs() << "OLD evl: " << *OldEVLParam << '\n');
+ LLVM_DEBUG(dbgs() << "OLD mask: " << *OldMaskParam << '\n');
+
+ // Convert the %evl predication into vector mask predication.
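+ // A lane is enabled in the new mask only if it is both below %evl and
+ // enabled in the original %mask.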
+ ElementCount ElemCount = VPI.getStaticVectorLength();
+ Value *VLMask = convertEVLToMask(Builder, OldEVLParam, ElemCount);
+ Value *NewMaskParam = Builder.CreateAnd(VLMask, OldMaskParam);
+ VPI.setMaskParam(NewMaskParam);
+
+ // Drop the %evl parameter.
+ discardEVLParameter(VPI);
+ assert(VPI.canIgnoreVectorLengthParam() &&
+ "transformation did not render the evl param ineffective!");
+
+ // Reassess the modified instruction.
+ return &VPI;
+}
+
+Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
+ LLVM_DEBUG(dbgs() << "Lowering to unpredicated op: " << VPI << '\n');
+
+ IRBuilder<> Builder(&VPI);
+
+ // Try lowering to an LLVM instruction first.
+ auto OC = VPI.getFunctionalOpcode();
+
+ if (OC && Instruction::isBinaryOp(*OC))
+ return expandPredicationInBinaryOperator(Builder, VPI);
+
+ if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI))
+ return expandPredicationInReduction(Builder, *VPRI);
+
+ if (auto *VPCmp = dyn_cast<VPCmpIntrinsic>(&VPI))
+ return expandPredicationInComparison(Builder, *VPCmp);
+
+ switch (VPI.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::vp_fneg: {
+ Value *NewNegOp = Builder.CreateFNeg(VPI.getOperand(0), VPI.getName());
+ replaceOperation(*NewNegOp, VPI);
+ return NewNegOp;
+ }
+ case Intrinsic::vp_fabs:
+ return expandPredicationToFPCall(Builder, VPI, Intrinsic::fabs);
+ case Intrinsic::vp_sqrt:
+ return expandPredicationToFPCall(Builder, VPI, Intrinsic::sqrt);
+ case Intrinsic::vp_load:
+ case Intrinsic::vp_store:
+ case Intrinsic::vp_gather:
+ case Intrinsic::vp_scatter:
+ return expandPredicationInMemoryIntrinsic(Builder, VPI);
+ }
+
+ if (auto CID = VPI.getConstrainedIntrinsicID())
+ if (Value *Call = expandPredicationToFPCall(Builder, VPI, *CID))
+ return Call;
+
+ return &VPI;
+}
+
+//// } CachingVPExpander
+
+struct TransformJob {
+ VPIntrinsic *PI;
+ TargetTransformInfo::VPLegalization Strategy;
+ TransformJob(VPIntrinsic *PI, TargetTransformInfo::VPLegalization InitStrat)
+ : PI(PI), Strategy(InitStrat) {}
+
+ bool isDone() const { return Strategy.shouldDoNothing(); }
+};
+
+void sanitizeStrategy(VPIntrinsic &VPI, VPLegalization &LegalizeStrat) {
+ // Operations with speculatable lanes do not strictly need predication.
+ if (maySpeculateLanes(VPI)) {
+ // Converting a speculatable VP intrinsic means dropping %mask and %evl.
+ // No need to expand %evl into the %mask only to ignore that code.
+ if (LegalizeStrat.OpStrategy == VPLegalization::Convert)
+ LegalizeStrat.EVLParamStrategy = VPLegalization::Discard;
+ return;
+ }
+
+ // We have to preserve the predicating effect of %evl for this
+ // non-speculatable VP intrinsic.
+ // 1) Never discard %evl.
+ // 2) If this VP intrinsic will be expanded to non-VP code, make sure that
+ // %evl gets folded into %mask.
+ if ((LegalizeStrat.EVLParamStrategy == VPLegalization::Discard) ||
+ (LegalizeStrat.OpStrategy == VPLegalization::Convert)) {
+ LegalizeStrat.EVLParamStrategy = VPLegalization::Convert;
+ }
+}
+
+VPLegalization
+CachingVPExpander::getVPLegalizationStrategy(const VPIntrinsic &VPI) const {
+ auto VPStrat = TTI.getVPLegalizationStrategy(VPI);
+ if (LLVM_LIKELY(!UsingTTIOverrides)) {
+ // No overrides - we are in production.
+ return VPStrat;
+ }
+
+ // Overrides set - we are in testing, the following does not need to be
+ // efficient.
+ VPStrat.EVLParamStrategy = parseOverrideOption(EVLTransformOverride);
+ VPStrat.OpStrategy = parseOverrideOption(MaskTransformOverride);
+ return VPStrat;
+}
+
+/// Expand llvm.vp.* intrinsics as requested by \p TTI.
+bool CachingVPExpander::expandVectorPredication() {
+ SmallVector<TransformJob, 16> Worklist;
+
+ // Collect all VPIntrinsics that need expansion and determine their expansion
+ // strategy.
+ for (auto &I : instructions(F)) {
+ auto *VPI = dyn_cast<VPIntrinsic>(&I);
+ if (!VPI)
+ continue;
+ auto VPStrat = getVPLegalizationStrategy(*VPI);
+ sanitizeStrategy(*VPI, VPStrat);
+ if (!VPStrat.shouldDoNothing())
+ Worklist.emplace_back(VPI, VPStrat);
+ }
+ if (Worklist.empty())
+ return false;
+
+ // Transform all VPIntrinsics on the worklist.
+ LLVM_DEBUG(dbgs() << "\n:::: Transforming " << Worklist.size()
+ << " instructions ::::\n");
+ for (TransformJob Job : Worklist) {
+ // Transform the EVL parameter.
+ switch (Job.Strategy.EVLParamStrategy) {
+ case VPLegalization::Legal:
+ break;
+ case VPLegalization::Discard:
+ discardEVLParameter(*Job.PI);
+ break;
+ case VPLegalization::Convert:
+ if (foldEVLIntoMask(*Job.PI))
+ ++NumFoldedVL;
+ break;
+ }
+ Job.Strategy.EVLParamStrategy = VPLegalization::Legal;
+
+ // Replace with a non-predicated operation.
+ switch (Job.Strategy.OpStrategy) {
+ case VPLegalization::Legal:
+ break;
+ case VPLegalization::Discard:
+ llvm_unreachable("Invalid strategy for operators.");
+ case VPLegalization::Convert:
+ expandPredication(*Job.PI);
+ ++NumLoweredVPOps;
+ break;
+ }
+ Job.Strategy.OpStrategy = VPLegalization::Legal;
+
+ assert(Job.isDone() && "incomplete transformation");
+ }
+
+ return true;
+}
+class ExpandVectorPredication : public FunctionPass {
+public:
+ static char ID;
+ ExpandVectorPredication() : FunctionPass(ID) {
+ initializeExpandVectorPredicationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ CachingVPExpander VPExpander(F, *TTI);
+ return VPExpander.expandVectorPredication();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+} // namespace
+
+char ExpandVectorPredication::ID;
+INITIALIZE_PASS_BEGIN(ExpandVectorPredication, "expandvp",
+ "Expand vector predication intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ExpandVectorPredication, "expandvp",
+ "Expand vector predication intrinsics", false, false)
+
+FunctionPass *llvm::createExpandVectorPredicationPass() {
+ return new ExpandVectorPredication();
+}
+
+PreservedAnalyses
+ExpandVectorPredicationPass::run(Function &F, FunctionAnalysisManager &AM) {
+ const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ CachingVPExpander VPExpander(F, TTI);
+ if (!VPExpander.expandVectorPredication())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/FEntryInserter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/FEntryInserter.cpp
new file mode 100644
index 000000000000..68304dd41db0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/FEntryInserter.cpp
@@ -0,0 +1,50 @@
+//===-- FEntryInserter.cpp - Patchable prologues for LLVM --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file edits function bodies to insert fentry calls.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+namespace {
+struct FEntryInserter : public MachineFunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ FEntryInserter() : MachineFunctionPass(ID) {
+ initializeFEntryInserterPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+};
+}
+
+bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) {
+ const std::string FEntryName = std::string(
+ MF.getFunction().getFnAttribute("fentry-call").getValueAsString());
+ if (FEntryName != "true")
+ return false;
+
+ auto &FirstMBB = *MF.begin();
+ auto *TII = MF.getSubtarget().getInstrInfo();
+ BuildMI(FirstMBB, FirstMBB.begin(), DebugLoc(),
+ TII->get(TargetOpcode::FENTRY_CALL));
+ return true;
+}
+
+char FEntryInserter::ID = 0;
+char &llvm::FEntryInserterID = FEntryInserter::ID;
+INITIALIZE_PASS(FEntryInserter, "fentry-insert", "Insert fentry calls", false,
+ false)
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/FaultMaps.cpp b/contrib/llvm-project/llvm/lib/CodeGen/FaultMaps.cpp
new file mode 100644
index 000000000000..3f8fe2402d65
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/FaultMaps.cpp
@@ -0,0 +1,114 @@
+//===- FaultMaps.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/FaultMaps.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "faultmaps"
+
+static const int FaultMapVersion = 1;
+const char *FaultMaps::WFMP = "Fault Maps: ";
+
+FaultMaps::FaultMaps(AsmPrinter &AP) : AP(AP) {}
+
+void FaultMaps::recordFaultingOp(FaultKind FaultTy,
+ const MCSymbol *FaultingLabel,
+ const MCSymbol *HandlerLabel) {
+ MCContext &OutContext = AP.OutStreamer->getContext();
+
+ const MCExpr *FaultingOffset = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(FaultingLabel, OutContext),
+ MCSymbolRefExpr::create(AP.CurrentFnSymForSize, OutContext), OutContext);
+
+ const MCExpr *HandlerOffset = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(HandlerLabel, OutContext),
+ MCSymbolRefExpr::create(AP.CurrentFnSymForSize, OutContext), OutContext);
+
+ FunctionInfos[AP.CurrentFnSym].emplace_back(FaultTy, FaultingOffset,
+ HandlerOffset);
+}
+
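+// Emit the fault map section. As laid out below, it starts with a header
+// (uint8 version, uint8 and uint16 reserved fields, uint32 function count),
+// followed by one FunctionInfo record per function with recorded faults.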
+void FaultMaps::serializeToFaultMapSection() {
+ if (FunctionInfos.empty())
+ return;
+
+ MCContext &OutContext = AP.OutStreamer->getContext();
+ MCStreamer &OS = *AP.OutStreamer;
+
+ // Create the section.
+ MCSection *FaultMapSection =
+ OutContext.getObjectFileInfo()->getFaultMapSection();
+ OS.switchSection(FaultMapSection);
+
+ // Emit a dummy symbol to force section inclusion.
+ OS.emitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_FaultMaps")));
+
+ LLVM_DEBUG(dbgs() << "********** Fault Map Output **********\n");
+
+ // Header
+ OS.emitIntValue(FaultMapVersion, 1); // Version.
+ OS.emitIntValue(0, 1); // Reserved.
+ OS.emitInt16(0); // Reserved.
+
+ LLVM_DEBUG(dbgs() << WFMP << "#functions = " << FunctionInfos.size() << "\n");
+ OS.emitInt32(FunctionInfos.size());
+
+ LLVM_DEBUG(dbgs() << WFMP << "functions:\n");
+
+ for (const auto &FFI : FunctionInfos)
+ emitFunctionInfo(FFI.first, FFI.second);
+}
+
+void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,
+ const FunctionFaultInfos &FFI) {
+ MCStreamer &OS = *AP.OutStreamer;
+
+ LLVM_DEBUG(dbgs() << WFMP << " function addr: " << *FnLabel << "\n");
+ OS.emitSymbolValue(FnLabel, 8);
+
+ LLVM_DEBUG(dbgs() << WFMP << " #faulting PCs: " << FFI.size() << "\n");
+ OS.emitInt32(FFI.size());
+
+ OS.emitInt32(0); // Reserved
+
+ for (const auto &Fault : FFI) {
+ LLVM_DEBUG(dbgs() << WFMP << " fault type: "
+ << faultTypeToString(Fault.Kind) << "\n");
+ OS.emitInt32(Fault.Kind);
+
+ LLVM_DEBUG(dbgs() << WFMP << " faulting PC offset: "
+ << *Fault.FaultingOffsetExpr << "\n");
+ OS.emitValue(Fault.FaultingOffsetExpr, 4);
+
+ LLVM_DEBUG(dbgs() << WFMP << " fault handler PC offset: "
+ << *Fault.HandlerOffsetExpr << "\n");
+ OS.emitValue(Fault.HandlerOffsetExpr, 4);
+ }
+}
+
+const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) {
+ switch (FT) {
+ default:
+ llvm_unreachable("unhandled fault type!");
+ case FaultMaps::FaultingLoad:
+ return "FaultingLoad";
+ case FaultMaps::FaultingLoadStore:
+ return "FaultingLoadStore";
+ case FaultMaps::FaultingStore:
+ return "FaultingStore";
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/FinalizeISel.cpp b/contrib/llvm-project/llvm/lib/CodeGen/FinalizeISel.cpp
new file mode 100644
index 000000000000..329c9587e321
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/FinalizeISel.cpp
@@ -0,0 +1,75 @@
+//===-- llvm/CodeGen/FinalizeISel.cpp ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// This pass expands Pseudo-instructions produced by ISel, fixes register
+/// reservations and may do machine frame information adjustments.
+/// The pseudo instructions are used to allow the expansion to contain control
+/// flow, such as a conditional move implemented with a conditional branch and a
+/// phi, or an atomic operation implemented with a loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "finalize-isel"
+
+namespace {
+ class FinalizeISel : public MachineFunctionPass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ FinalizeISel() : MachineFunctionPass(ID) {}
+
+ private:
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+} // end anonymous namespace
+
+char FinalizeISel::ID = 0;
+char &llvm::FinalizeISelID = FinalizeISel::ID;
+INITIALIZE_PASS(FinalizeISel, DEBUG_TYPE,
+ "Finalize ISel and expand pseudo-instructions", false, false)
+
+bool FinalizeISel::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
+
+ // Iterate through each instruction in the function, looking for pseudos.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock *MBB = &*I;
+ for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end();
+ MBBI != MBBE; ) {
+ MachineInstr &MI = *MBBI++;
+
+ // If MI is a pseudo, expand it.
+ if (MI.usesCustomInsertionHook()) {
+ Changed = true;
+ MachineBasicBlock *NewMBB = TLI->EmitInstrWithCustomInserter(MI, MBB);
+ // The expansion may involve new basic blocks.
+ if (NewMBB != MBB) {
+ MBB = NewMBB;
+ I = NewMBB->getIterator();
+ MBBI = NewMBB->begin();
+ MBBE = NewMBB->end();
+ }
+ }
+ }
+ }
+
+ TLI->finalizeLowering(MF);
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/contrib/llvm-project/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
new file mode 100644
index 000000000000..75504ef32250
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -0,0 +1,628 @@
+//===-- FixupStatepointCallerSaved.cpp - Fixup caller saved registers ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// A statepoint instruction's deopt parameters contain values that are
+/// meaningful to the runtime and must still be readable at the moment the
+/// call returns. In other words, we need to encode the fact that these
+/// values are read "late" by the runtime. If we could express this notion to
+/// the register allocator, it would produce the right form for us.
+/// The need for a fixup (i.e. this pass) comes from the fact that we cannot
+/// describe such a late read to the register allocator, which may therefore
+/// place a value in a register that is clobbered by the call.
+/// This pass forces such registers to be spilled and rewrites the
+/// corresponding statepoint operands to refer to the added spill slots.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "fixup-statepoint-caller-saved"
+STATISTIC(NumSpilledRegisters, "Number of spilled registers");
+STATISTIC(NumSpillSlotsAllocated, "Number of spill slots allocated");
+STATISTIC(NumSpillSlotsExtended, "Number of spill slots extended");
+
+static cl::opt<bool> FixupSCSExtendSlotSize(
+ "fixup-scs-extend-slot-size", cl::Hidden, cl::init(false),
+ cl::desc("Allow spill in spill slot of greater size than register size"),
+ cl::Hidden);
+
+static cl::opt<bool> PassGCPtrInCSR(
+ "fixup-allow-gcptr-in-csr", cl::Hidden, cl::init(false),
+ cl::desc("Allow passing GC Pointer arguments in callee saved registers"));
+
+static cl::opt<bool> EnableCopyProp(
+ "fixup-scs-enable-copy-propagation", cl::Hidden, cl::init(true),
+ cl::desc("Enable simple copy propagation during register reloading"));
+
+// This is purely a debugging option.
+// It may be handy for investigating statepoint spilling issues.
+static cl::opt<unsigned> MaxStatepointsWithRegs(
+ "fixup-max-csr-statepoints", cl::Hidden,
+ cl::desc("Max number of statepoints allowed to pass GC Ptrs in registers"));
+
+namespace {
+
+class FixupStatepointCallerSaved : public MachineFunctionPass {
+public:
+ static char ID;
+
+ FixupStatepointCallerSaved() : MachineFunctionPass(ID) {
+ initializeFixupStatepointCallerSavedPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "Fixup Statepoint Caller Saved";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // End anonymous namespace.
+
+char FixupStatepointCallerSaved::ID = 0;
+char &llvm::FixupStatepointCallerSavedID = FixupStatepointCallerSaved::ID;
+
+INITIALIZE_PASS_BEGIN(FixupStatepointCallerSaved, DEBUG_TYPE,
+ "Fixup Statepoint Caller Saved", false, false)
+INITIALIZE_PASS_END(FixupStatepointCallerSaved, DEBUG_TYPE,
+ "Fixup Statepoint Caller Saved", false, false)
+
+// Utility function to get size of the register.
+static unsigned getRegisterSize(const TargetRegisterInfo &TRI, Register Reg) {
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+ return TRI.getSpillSize(*RC);
+}
+
+// Try to eliminate redundant copy to register which we're going to
+// spill, i.e. try to change:
+// X = COPY Y
+// SPILL X
+// to
+// SPILL Y
+// If there are no uses of X between copy and STATEPOINT, that COPY
+// may be eliminated.
+// Reg - register we're about to spill
+// RI - On entry points to statepoint.
+// On successful copy propagation set to new spill point.
+// IsKill - set to true if COPY is Kill (there are no uses of Y)
+// Returns either found source copy register or original one.
+static Register performCopyPropagation(Register Reg,
+ MachineBasicBlock::iterator &RI,
+ bool &IsKill, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI) {
+ // First check if statepoint itself uses Reg in non-meta operands.
+ int Idx = RI->findRegisterUseOperandIdx(Reg, false, &TRI);
+ if (Idx >= 0 && (unsigned)Idx < StatepointOpers(&*RI).getNumDeoptArgsIdx()) {
+ IsKill = false;
+ return Reg;
+ }
+
+ if (!EnableCopyProp)
+ return Reg;
+
+ MachineBasicBlock *MBB = RI->getParent();
+ MachineBasicBlock::reverse_iterator E = MBB->rend();
+ MachineInstr *Def = nullptr, *Use = nullptr;
+ for (auto It = ++(RI.getReverse()); It != E; ++It) {
+ if (It->readsRegister(Reg, &TRI) && !Use)
+ Use = &*It;
+ if (It->modifiesRegister(Reg, &TRI)) {
+ Def = &*It;
+ break;
+ }
+ }
+
+ if (!Def)
+ return Reg;
+
+ auto DestSrc = TII.isCopyInstr(*Def);
+ if (!DestSrc || DestSrc->Destination->getReg() != Reg)
+ return Reg;
+
+ Register SrcReg = DestSrc->Source->getReg();
+
+ if (getRegisterSize(TRI, Reg) != getRegisterSize(TRI, SrcReg))
+ return Reg;
+
+ LLVM_DEBUG(dbgs() << "spillRegisters: perform copy propagation "
+ << printReg(Reg, &TRI) << " -> " << printReg(SrcReg, &TRI)
+ << "\n");
+
+ // Insert spill immediately after Def
+ RI = ++MachineBasicBlock::iterator(Def);
+ IsKill = DestSrc->Source->isKill();
+
+ if (!Use) {
+ // There are no uses of original register between COPY and STATEPOINT.
+ // There can't be any after STATEPOINT, so we can eliminate Def.
+ LLVM_DEBUG(dbgs() << "spillRegisters: removing dead copy " << *Def);
+ Def->eraseFromParent();
+ } else if (IsKill) {
+ // COPY will remain in place, spill will be inserted *after* it, so it is
+ // not a kill of source anymore.
+ const_cast<MachineOperand *>(DestSrc->Source)->setIsKill(false);
+ }
+
+ return SrcReg;
+}
+
+namespace {
+// Pair {Register, FrameIndex}
+using RegSlotPair = std::pair<Register, int>;
+
+// Keeps track of what reloads were inserted in MBB.
+class RegReloadCache {
+ using ReloadSet = SmallSet<RegSlotPair, 8>;
+ DenseMap<const MachineBasicBlock *, ReloadSet> Reloads;
+
+public:
+ RegReloadCache() = default;
+
+ // Record reload of Reg from FI in block MBB
+ void recordReload(Register Reg, int FI, const MachineBasicBlock *MBB) {
+ RegSlotPair RSP(Reg, FI);
+ auto Res = Reloads[MBB].insert(RSP);
+ (void)Res;
+ assert(Res.second && "reload already exists");
+ }
+
+ // Does basic block MBB contain a reload of Reg from FI?
+ bool hasReload(Register Reg, int FI, const MachineBasicBlock *MBB) {
+ RegSlotPair RSP(Reg, FI);
+ return Reloads.count(MBB) && Reloads[MBB].count(RSP);
+ }
+};
+
+// Cache the frame indexes used while rewriting a statepoint so they can be
+// re-used when processing the next statepoint instruction.
+// There are two strategies: one preserves the size of each spill slot, the
+// other extends spill slots to reduce their number, resulting in a smaller
+// total frame size. With the latter, reloads implicitly any-extend.
+class FrameIndexesCache {
+private:
+ struct FrameIndexesPerSize {
+ // List of used frame indexes during processing previous statepoints.
+ SmallVector<int, 8> Slots;
+ // Index of the first frame index that has not been used yet.
+ unsigned Index = 0;
+ };
+ MachineFrameInfo &MFI;
+ const TargetRegisterInfo &TRI;
+ // Map a size to the list of frame indexes of that size. In
+ // FixupSCSExtendSlotSize mode the key 0 is used to keep all frame indexes.
+ // If the required spill slot is larger than the cached one, the slot size
+ // is increased.
+ DenseMap<unsigned, FrameIndexesPerSize> Cache;
+
+ // Keeps track of slots reserved for the shared landing pad processing.
+ // Initialized from GlobalIndices for the current EHPad.
+ SmallSet<int, 8> ReservedSlots;
+
+ // A landing pad can be the destination of several statepoints. Every
+ // register defined by such statepoints must be spilled to the same stack
+ // slot. This map keeps that information.
+ DenseMap<const MachineBasicBlock *, SmallVector<RegSlotPair, 8>>
+ GlobalIndices;
+
+ FrameIndexesPerSize &getCacheBucket(unsigned Size) {
+ // In FixupSCSExtendSlotSize mode the bucket with 0 index is used
+ // for all sizes.
+ return Cache[FixupSCSExtendSlotSize ? 0 : Size];
+ }
+
+public:
+ FrameIndexesCache(MachineFrameInfo &MFI, const TargetRegisterInfo &TRI)
+ : MFI(MFI), TRI(TRI) {}
+ // Reset the current state of used frame indexes. After invocation of
+ // this function all frame indexes are available for allocation with
+ // the exception of slots reserved for landing pad processing (if any).
+ void reset(const MachineBasicBlock *EHPad) {
+ for (auto &It : Cache)
+ It.second.Index = 0;
+
+ ReservedSlots.clear();
+ if (EHPad && GlobalIndices.count(EHPad))
+ for (auto &RSP : GlobalIndices[EHPad])
+ ReservedSlots.insert(RSP.second);
+ }
+
+ // Get frame index to spill the register.
+ int getFrameIndex(Register Reg, MachineBasicBlock *EHPad) {
+ // Check if slot for Reg is already reserved at EHPad.
+ auto It = GlobalIndices.find(EHPad);
+ if (It != GlobalIndices.end()) {
+ auto &Vec = It->second;
+ auto Idx = llvm::find_if(
+ Vec, [Reg](RegSlotPair &RSP) { return Reg == RSP.first; });
+ if (Idx != Vec.end()) {
+ int FI = Idx->second;
+ LLVM_DEBUG(dbgs() << "Found global FI " << FI << " for register "
+ << printReg(Reg, &TRI) << " at "
+ << printMBBReference(*EHPad) << "\n");
+ assert(ReservedSlots.count(FI) && "using unreserved slot");
+ return FI;
+ }
+ }
+
+ unsigned Size = getRegisterSize(TRI, Reg);
+ FrameIndexesPerSize &Line = getCacheBucket(Size);
+ while (Line.Index < Line.Slots.size()) {
+ int FI = Line.Slots[Line.Index++];
+ if (ReservedSlots.count(FI))
+ continue;
+ // If all sizes are kept together we probably need to extend the
+ // spill slot size.
+ if (MFI.getObjectSize(FI) < Size) {
+ MFI.setObjectSize(FI, Size);
+ MFI.setObjectAlignment(FI, Align(Size));
+ NumSpillSlotsExtended++;
+ }
+ return FI;
+ }
+ int FI = MFI.CreateSpillStackObject(Size, Align(Size));
+ NumSpillSlotsAllocated++;
+ Line.Slots.push_back(FI);
+ ++Line.Index;
+
+ // Remember assignment {Reg, FI} for EHPad
+ if (EHPad) {
+ GlobalIndices[EHPad].push_back(std::make_pair(Reg, FI));
+ LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling reg "
+ << printReg(Reg, &TRI) << " at landing pad "
+ << printMBBReference(*EHPad) << "\n");
+ }
+
+ return FI;
+ }
+
+ // Sort all registers to spill in descending order. In
+ // FixupSCSExtendSlotSize mode this minimizes the total frame size.
+ // In non-FixupSCSExtendSlotSize mode this step can be skipped.
+ void sortRegisters(SmallVectorImpl<Register> &Regs) {
+ if (!FixupSCSExtendSlotSize)
+ return;
+ llvm::sort(Regs, [&](Register &A, Register &B) {
+ return getRegisterSize(TRI, A) > getRegisterSize(TRI, B);
+ });
+ }
+};
+
+// Describes the state of the statepoint instruction currently being processed.
+class StatepointState {
+private:
+ // statepoint instruction.
+ MachineInstr &MI;
+ MachineFunction &MF;
+ // If non-null then statepoint is invoke, and this points to the landing pad.
+ MachineBasicBlock *EHPad;
+ const TargetRegisterInfo &TRI;
+ const TargetInstrInfo &TII;
+ MachineFrameInfo &MFI;
+ // Mask with callee saved registers.
+ const uint32_t *Mask;
+ // Cache of frame indexes used while processing previous statepoints.
+ FrameIndexesCache &CacheFI;
+ bool AllowGCPtrInCSR;
+ // Operands with physical registers requiring spilling.
+ SmallVector<unsigned, 8> OpsToSpill;
+ // Set of registers to spill.
+ SmallVector<Register, 8> RegsToSpill;
+ // Set of registers to reload after statepoint.
+ SmallVector<Register, 8> RegsToReload;
+ // Map Register to Frame Slot index.
+ DenseMap<Register, int> RegToSlotIdx;
+
+public:
+ StatepointState(MachineInstr &MI, const uint32_t *Mask,
+ FrameIndexesCache &CacheFI, bool AllowGCPtrInCSR)
+ : MI(MI), MF(*MI.getMF()), TRI(*MF.getSubtarget().getRegisterInfo()),
+ TII(*MF.getSubtarget().getInstrInfo()), MFI(MF.getFrameInfo()),
+ Mask(Mask), CacheFI(CacheFI), AllowGCPtrInCSR(AllowGCPtrInCSR) {
+
+ // Find statepoint's landing pad, if any.
+ EHPad = nullptr;
+ MachineBasicBlock *MBB = MI.getParent();
+ // An invoke statepoint must be the last statepoint in its block.
+ bool Last = std::none_of(++MI.getIterator(), MBB->end().getInstrIterator(),
+ [](MachineInstr &I) {
+ return I.getOpcode() == TargetOpcode::STATEPOINT;
+ });
+
+ if (!Last)
+ return;
+
+ auto IsEHPad = [](MachineBasicBlock *B) { return B->isEHPad(); };
+
+ assert(llvm::count_if(MBB->successors(), IsEHPad) < 2 && "multiple EHPads");
+
+ auto It = llvm::find_if(MBB->successors(), IsEHPad);
+ if (It != MBB->succ_end())
+ EHPad = *It;
+ }
+
+ MachineBasicBlock *getEHPad() const { return EHPad; }
+
+ // Return true if register is callee saved.
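+ // The mask is a call-preserved regmask: one bit per physical register,
+ // packed into 32-bit words, where a set bit means the register is preserved
+ // (callee saved) across the call.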
+ bool isCalleeSaved(Register Reg) { return (Mask[Reg / 32] >> Reg % 32) & 1; }
+
+ // Iterates over statepoint meta args to find caller saved registers.
+ // Also caches the size of the found registers.
+ // Returns true if any caller saved registers were found.
+ bool findRegistersToSpill() {
+ SmallSet<Register, 8> GCRegs;
+ // All GC pointer operands assigned to registers produce new value.
+ // Since they're tied to their defs, it is enough to collect def registers.
+ for (const auto &Def : MI.defs())
+ GCRegs.insert(Def.getReg());
+
+ SmallSet<Register, 8> VisitedRegs;
+ for (unsigned Idx = StatepointOpers(&MI).getVarIdx(),
+ EndIdx = MI.getNumOperands();
+ Idx < EndIdx; ++Idx) {
+ MachineOperand &MO = MI.getOperand(Idx);
+ // Leave `undef` operands as is, StackMaps will rewrite them
+ // into a constant.
+ if (!MO.isReg() || MO.isImplicit() || MO.isUndef())
+ continue;
+ Register Reg = MO.getReg();
+ assert(Reg.isPhysical() && "Only physical regs are expected");
+
+ if (isCalleeSaved(Reg) && (AllowGCPtrInCSR || !GCRegs.contains(Reg)))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Will spill " << printReg(Reg, &TRI) << " at index "
+ << Idx << "\n");
+
+ if (VisitedRegs.insert(Reg).second)
+ RegsToSpill.push_back(Reg);
+ OpsToSpill.push_back(Idx);
+ }
+ CacheFI.sortRegisters(RegsToSpill);
+ return !RegsToSpill.empty();
+ }
+
+ // Spill all caller saved registers right before the statepoint instruction.
+ // Remember the frame index where each register is spilled.
+ void spillRegisters() {
+ for (Register Reg : RegsToSpill) {
+ int FI = CacheFI.getFrameIndex(Reg, EHPad);
+
+ NumSpilledRegisters++;
+ RegToSlotIdx[Reg] = FI;
+
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(Reg, &TRI) << " to FI " << FI
+ << "\n");
+
+ // Perform trivial copy propagation
+ bool IsKill = true;
+ MachineBasicBlock::iterator InsertBefore(MI);
+ Reg = performCopyPropagation(Reg, InsertBefore, IsKill, TII, TRI);
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+
+ LLVM_DEBUG(dbgs() << "Insert spill before " << *InsertBefore);
+ TII.storeRegToStackSlot(*MI.getParent(), InsertBefore, Reg, IsKill, FI,
+ RC, &TRI, Register());
+ }
+ }
+
+ void insertReloadBefore(unsigned Reg, MachineBasicBlock::iterator It,
+ MachineBasicBlock *MBB) {
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+ int FI = RegToSlotIdx[Reg];
+ if (It != MBB->end()) {
+ TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register());
+ return;
+ }
+
+ // To insert reload at the end of MBB, insert it before last instruction
+ // and then swap them.
+ assert(!MBB->empty() && "Empty block");
+ --It;
+ TII.loadRegFromStackSlot(*MBB, It, Reg, FI, RC, &TRI, Register());
+ MachineInstr *Reload = It->getPrevNode();
+ int Dummy = 0;
+ (void)Dummy;
+ assert(TII.isLoadFromStackSlot(*Reload, Dummy) == Reg);
+ assert(Dummy == FI);
+ MBB->remove(Reload);
+ MBB->insertAfter(It, Reload);
+ }
+
+ // Insert reloads of (relocated) registers spilled in statepoint.
+ void insertReloads(MachineInstr *NewStatepoint, RegReloadCache &RC) {
+ MachineBasicBlock *MBB = NewStatepoint->getParent();
+ auto InsertPoint = std::next(NewStatepoint->getIterator());
+
+ for (auto Reg : RegsToReload) {
+ insertReloadBefore(Reg, InsertPoint, MBB);
+ LLVM_DEBUG(dbgs() << "Reloading " << printReg(Reg, &TRI) << " from FI "
+ << RegToSlotIdx[Reg] << " after statepoint\n");
+
+ if (EHPad && !RC.hasReload(Reg, RegToSlotIdx[Reg], EHPad)) {
+ RC.recordReload(Reg, RegToSlotIdx[Reg], EHPad);
+ auto EHPadInsertPoint = EHPad->SkipPHIsLabelsAndDebug(EHPad->begin());
+ insertReloadBefore(Reg, EHPadInsertPoint, EHPad);
+ LLVM_DEBUG(dbgs() << "...also reload at EHPad "
+ << printMBBReference(*EHPad) << "\n");
+ }
+ }
+ }
+
+ // Re-write the statepoint machine instruction to replace caller saved
+ // operands with indirect memory locations (frame indexes).
+ MachineInstr *rewriteStatepoint() {
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(MI.getOpcode()), MI.getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+ unsigned NumOps = MI.getNumOperands();
+
+ // New indices for the remaining defs.
+ SmallVector<unsigned, 8> NewIndices;
+ unsigned NumDefs = MI.getNumDefs();
+ for (unsigned I = 0; I < NumDefs; ++I) {
+ MachineOperand &DefMO = MI.getOperand(I);
+ assert(DefMO.isReg() && DefMO.isDef() && "Expected Reg Def operand");
+ Register Reg = DefMO.getReg();
+ assert(DefMO.isTied() && "Def is expected to be tied");
+ // We skipped undef uses and did not spill them, so we should not
+ // proceed with defs here.
+ if (MI.getOperand(MI.findTiedOperandIdx(I)).isUndef()) {
+ if (AllowGCPtrInCSR) {
+ NewIndices.push_back(NewMI->getNumOperands());
+ MIB.addReg(Reg, RegState::Define);
+ }
+ continue;
+ }
+ if (!AllowGCPtrInCSR) {
+ assert(is_contained(RegsToSpill, Reg));
+ RegsToReload.push_back(Reg);
+ } else {
+ if (isCalleeSaved(Reg)) {
+ NewIndices.push_back(NewMI->getNumOperands());
+ MIB.addReg(Reg, RegState::Define);
+ } else {
+ NewIndices.push_back(NumOps);
+ RegsToReload.push_back(Reg);
+ }
+ }
+ }
+
+ // Add End marker.
+ OpsToSpill.push_back(MI.getNumOperands());
+ unsigned CurOpIdx = 0;
+
+ for (unsigned I = NumDefs; I < MI.getNumOperands(); ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (I == OpsToSpill[CurOpIdx]) {
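+ // Replace the register operand with an indirect memory reference:
+ // <IndirectMemRefOp, byte size, frame index, offset 0>.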
+ int FI = RegToSlotIdx[MO.getReg()];
+ MIB.addImm(StackMaps::IndirectMemRefOp);
+ MIB.addImm(getRegisterSize(TRI, MO.getReg()));
+ assert(MO.isReg() && "Should be register");
+ assert(MO.getReg().isPhysical() && "Should be physical register");
+ MIB.addFrameIndex(FI);
+ MIB.addImm(0);
+ ++CurOpIdx;
+ } else {
+ MIB.add(MO);
+ unsigned OldDef;
+ if (AllowGCPtrInCSR && MI.isRegTiedToDefOperand(I, &OldDef)) {
+ assert(OldDef < NumDefs);
+ assert(NewIndices[OldDef] < NumOps);
+ MIB->tieOperands(NewIndices[OldDef], MIB->getNumOperands() - 1);
+ }
+ }
+ }
+ assert(CurOpIdx == (OpsToSpill.size() - 1) && "Not all operands processed");
+ // Add mem operands.
+ NewMI->setMemRefs(MF, MI.memoperands());
+ for (auto It : RegToSlotIdx) {
+ Register R = It.first;
+ int FrameIndex = It.second;
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+ MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad;
+ if (is_contained(RegsToReload, R))
+ Flags |= MachineMemOperand::MOStore;
+ auto *MMO =
+ MF.getMachineMemOperand(PtrInfo, Flags, getRegisterSize(TRI, R),
+ MFI.getObjectAlign(FrameIndex));
+ NewMI->addMemOperand(MF, MMO);
+ }
+
+ // Insert new statepoint and erase old one.
+ MI.getParent()->insert(MI, NewMI);
+
+ LLVM_DEBUG(dbgs() << "rewritten statepoint to : " << *NewMI << "\n");
+ MI.eraseFromParent();
+ return NewMI;
+ }
+};
+
+class StatepointProcessor {
+private:
+ MachineFunction &MF;
+ const TargetRegisterInfo &TRI;
+ FrameIndexesCache CacheFI;
+ RegReloadCache ReloadCache;
+
+public:
+ StatepointProcessor(MachineFunction &MF)
+ : MF(MF), TRI(*MF.getSubtarget().getRegisterInfo()),
+ CacheFI(MF.getFrameInfo(), TRI) {}
+
+ bool process(MachineInstr &MI, bool AllowGCPtrInCSR) {
+ StatepointOpers SO(&MI);
+ uint64_t Flags = SO.getFlags();
+ // Do nothing for LiveIn; it supports all registers.
+ if (Flags & (uint64_t)StatepointFlags::DeoptLiveIn)
+ return false;
+ LLVM_DEBUG(dbgs() << "\nMBB " << MI.getParent()->getNumber() << " "
+ << MI.getParent()->getName() << " : process statepoint "
+ << MI);
+ CallingConv::ID CC = SO.getCallingConv();
+ const uint32_t *Mask = TRI.getCallPreservedMask(MF, CC);
+ StatepointState SS(MI, Mask, CacheFI, AllowGCPtrInCSR);
+ CacheFI.reset(SS.getEHPad());
+
+ if (!SS.findRegistersToSpill())
+ return false;
+
+ SS.spillRegisters();
+ auto *NewStatepoint = SS.rewriteStatepoint();
+ SS.insertReloads(NewStatepoint, ReloadCache);
+ return true;
+ }
+};
+} // namespace
+
+bool FixupStatepointCallerSaved::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const Function &F = MF.getFunction();
+ if (!F.hasGC())
+ return false;
+
+ SmallVector<MachineInstr *, 16> Statepoints;
+ for (MachineBasicBlock &BB : MF)
+ for (MachineInstr &I : BB)
+ if (I.getOpcode() == TargetOpcode::STATEPOINT)
+ Statepoints.push_back(&I);
+
+ if (Statepoints.empty())
+ return false;
+
+ bool Changed = false;
+ StatepointProcessor SPP(MF);
+ unsigned NumStatepoints = 0;
+ bool AllowGCPtrInCSR = PassGCPtrInCSR;
+ for (MachineInstr *I : Statepoints) {
+ ++NumStatepoints;
+ if (MaxStatepointsWithRegs.getNumOccurrences() &&
+ NumStatepoints >= MaxStatepointsWithRegs)
+ AllowGCPtrInCSR = false;
+ Changed |= SPP.process(*I, AllowGCPtrInCSR);
+ }
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/FuncletLayout.cpp b/contrib/llvm-project/llvm/lib/CodeGen/FuncletLayout.cpp
new file mode 100644
index 000000000000..f1222a88b054
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/FuncletLayout.cpp
@@ -0,0 +1,62 @@
+//===-- FuncletLayout.cpp - Contiguously lay out funclets -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements basic block placement transformations which result in
+// funclets being contiguous.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "funclet-layout"
+
+namespace {
+class FuncletLayout : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ FuncletLayout() : MachineFunctionPass(ID) {
+ initializeFuncletLayoutPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+};
+}
+
+char FuncletLayout::ID = 0;
+char &llvm::FuncletLayoutID = FuncletLayout::ID;
+INITIALIZE_PASS(FuncletLayout, DEBUG_TYPE,
+ "Contiguously Lay Out Funclets", false, false)
+
+bool FuncletLayout::runOnMachineFunction(MachineFunction &F) {
+ // Even though this gets information from getEHScopeMembership(), this pass is
+ // only necessary for funclet-based EH personalities, in which these EH scopes
+ // are outlined at the end.
+ DenseMap<const MachineBasicBlock *, int> FuncletMembership =
+ getEHScopeMembership(F);
+ if (FuncletMembership.empty())
+ return false;
+
+ F.sort([&](MachineBasicBlock &X, MachineBasicBlock &Y) {
+ auto FuncletX = FuncletMembership.find(&X);
+ auto FuncletY = FuncletMembership.find(&Y);
+ assert(FuncletX != FuncletMembership.end());
+ assert(FuncletY != FuncletMembership.end());
+ return FuncletX->second < FuncletY->second;
+ });
+
+ // Conservatively assume we changed something.
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GCMetadata.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GCMetadata.cpp
new file mode 100644
index 000000000000..4d27143c5298
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GCMetadata.cpp
@@ -0,0 +1,150 @@
+//===-- GCMetadata.cpp - Garbage collector metadata -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GCFunctionInfo class and GCModuleInfo pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
+using namespace llvm;
+
+namespace {
+
+class Printer : public FunctionPass {
+ static char ID;
+
+ raw_ostream &OS;
+
+public:
+ explicit Printer(raw_ostream &OS) : FunctionPass(ID), OS(OS) {}
+
+ StringRef getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnFunction(Function &F) override;
+ bool doFinalization(Module &M) override;
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
+ "Create Garbage Collector Module Metadata", false, false)
+
+// -----------------------------------------------------------------------------
+
+GCFunctionInfo::GCFunctionInfo(const Function &F, GCStrategy &S)
+ : F(F), S(S), FrameSize(~0LL) {}
+
+GCFunctionInfo::~GCFunctionInfo() = default;
+
+// -----------------------------------------------------------------------------
+
+char GCModuleInfo::ID = 0;
+
+GCModuleInfo::GCModuleInfo() : ImmutablePass(ID) {
+ initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
+}
+
+GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) {
+ assert(!F.isDeclaration() && "Can only get GCFunctionInfo for a definition!");
+ assert(F.hasGC());
+
+ finfo_map_type::iterator I = FInfoMap.find(&F);
+ if (I != FInfoMap.end())
+ return *I->second;
+
+ GCStrategy *S = getGCStrategy(F.getGC());
+ Functions.push_back(std::make_unique<GCFunctionInfo>(F, *S));
+ GCFunctionInfo *GFI = Functions.back().get();
+ FInfoMap[&F] = GFI;
+ return *GFI;
+}
+
+void GCModuleInfo::clear() {
+ Functions.clear();
+ FInfoMap.clear();
+ GCStrategyList.clear();
+}
+
+// -----------------------------------------------------------------------------
+
+char Printer::ID = 0;
+
+FunctionPass *llvm::createGCInfoPrinter(raw_ostream &OS) {
+ return new Printer(OS);
+}
+
+StringRef Printer::getPassName() const {
+ return "Print Garbage Collector Information";
+}
+
+void Printer::getAnalysisUsage(AnalysisUsage &AU) const {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<GCModuleInfo>();
+}
+
+bool Printer::runOnFunction(Function &F) {
+ if (!F.hasGC())
+ return false;
+
+ GCFunctionInfo *FD = &getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+
+ OS << "GC roots for " << FD->getFunction().getName() << ":\n";
+ for (GCFunctionInfo::roots_iterator RI = FD->roots_begin(),
+ RE = FD->roots_end();
+ RI != RE; ++RI)
+ OS << "\t" << RI->Num << "\t" << RI->StackOffset << "[sp]\n";
+
+ OS << "GC safe points for " << FD->getFunction().getName() << ":\n";
+ for (GCFunctionInfo::iterator PI = FD->begin(), PE = FD->end(); PI != PE;
+ ++PI) {
+
+ OS << "\t" << PI->Label->getName() << ": " << "post-call"
+ << ", live = {";
+
+ ListSeparator LS(",");
+ for (const GCRoot &R : make_range(FD->live_begin(PI), FD->live_end(PI)))
+ OS << LS << " " << R.Num;
+
+ OS << " }\n";
+ }
+
+ return false;
+}
+
+bool Printer::doFinalization(Module &M) {
+ GCModuleInfo *GMI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(GMI && "Printer didn't require GCModuleInfo?!");
+ GMI->clear();
+ return false;
+}
+
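+// Return the cached GCStrategy registered under Name, constructing it via
+// llvm::getGCStrategy() and caching it on first use.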
+GCStrategy *GCModuleInfo::getGCStrategy(const StringRef Name) {
+ // TODO: Arguably, just doing a linear search would be faster for small N
+ auto NMI = GCStrategyMap.find(Name);
+ if (NMI != GCStrategyMap.end())
+ return NMI->getValue();
+
+ std::unique_ptr<GCStrategy> S = llvm::getGCStrategy(Name);
+ S->Name = std::string(Name);
+ GCStrategyMap[Name] = S.get();
+ GCStrategyList.push_back(std::move(S));
+ return GCStrategyList.back().get();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GCMetadataPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GCMetadataPrinter.cpp
new file mode 100644
index 000000000000..500dba9aea37
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GCMetadataPrinter.cpp
@@ -0,0 +1,21 @@
+//===- GCMetadataPrinter.cpp - Garbage collection infrastructure ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the abstract base class GCMetadataPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+
+using namespace llvm;
+
+LLVM_INSTANTIATE_REGISTRY(GCMetadataPrinterRegistry)
+
+GCMetadataPrinter::GCMetadataPrinter() = default;
+
+GCMetadataPrinter::~GCMetadataPrinter() = default;
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GCRootLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GCRootLowering.cpp
new file mode 100644
index 000000000000..c0ce37091933
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GCRootLowering.cpp
@@ -0,0 +1,328 @@
+//===-- GCRootLowering.cpp - Garbage collection infrastructure ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering for the gc.root mechanism.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCContext.h"
+
+using namespace llvm;
+
+namespace {
+
+/// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or
+/// llvm.gcwrite intrinsics, replacing them with simple loads and stores as
+/// directed by the GCStrategy. It also performs automatic root initialization
+/// and custom intrinsic lowering.
+class LowerIntrinsics : public FunctionPass {
+ bool DoLowering(Function &F, GCStrategy &S);
+
+public:
+ static char ID;
+
+ LowerIntrinsics();
+ StringRef getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+};
+
+/// GCMachineCodeAnalysis - This is a target-independent pass over the machine
+/// function representation to identify safe points for the garbage collector
+/// in the machine code. It inserts labels at safe points and populates a
+/// GCMetadata record for each function.
+class GCMachineCodeAnalysis : public MachineFunctionPass {
+ GCFunctionInfo *FI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+
+ void FindSafePoints(MachineFunction &MF);
+ void VisitCallPoint(MachineBasicBlock::iterator CI);
+ MCSymbol *InsertLabel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL) const;
+
+ void FindStackOffsets(MachineFunction &MF);
+
+public:
+ static char ID;
+
+ GCMachineCodeAnalysis();
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // end anonymous namespace
+
+// -----------------------------------------------------------------------------
+
+INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false)
+
+FunctionPass *llvm::createGCLoweringPass() { return new LowerIntrinsics(); }
+
+char LowerIntrinsics::ID = 0;
+char &llvm::GCLoweringID = LowerIntrinsics::ID;
+
+LowerIntrinsics::LowerIntrinsics() : FunctionPass(ID) {
+ initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry());
+}
+
+StringRef LowerIntrinsics::getPassName() const {
+ return "Lower Garbage Collection Instructions";
+}
+
+void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<GCModuleInfo>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now.
+bool LowerIntrinsics::doInitialization(Module &M) {
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?");
+ for (Function &F : M)
+ if (!F.isDeclaration() && F.hasGC())
+ MI->getFunctionInfo(F); // Instantiate the GC strategy.
+
+ return false;
+}
+
+/// CouldBecomeSafePoint - Predicate to conservatively determine whether the
+/// instruction could introduce a safe point.
+static bool CouldBecomeSafePoint(Instruction *I) {
+ // The natural definition of instructions which could introduce safe points
+ // are:
+ //
+ // - call, invoke (AfterCall, BeforeCall)
+ // - phis (Loops)
+ // - invoke, ret, unwind (Exit)
+ //
+  // However, instructions as seemingly innocuous as arithmetic can become
+ // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead
+ // it is necessary to take a conservative approach.
+
+ if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) || isa<StoreInst>(I) ||
+ isa<LoadInst>(I))
+ return false;
+
+ // llvm.gcroot is safe because it doesn't do anything at runtime.
+ if (CallInst *CI = dyn_cast<CallInst>(I))
+ if (Function *F = CI->getCalledFunction())
+ if (Intrinsic::ID IID = F->getIntrinsicID())
+ if (IID == Intrinsic::gcroot)
+ return false;
+
+ return true;
+}
+
+static bool InsertRootInitializers(Function &F, ArrayRef<AllocaInst *> Roots) {
+ // Scroll past alloca instructions.
+ BasicBlock::iterator IP = F.getEntryBlock().begin();
+ while (isa<AllocaInst>(IP))
+ ++IP;
+
+ // Search for initializers in the initial BB.
+ SmallPtrSet<AllocaInst *, 16> InitedRoots;
+ for (; !CouldBecomeSafePoint(&*IP); ++IP)
+ if (StoreInst *SI = dyn_cast<StoreInst>(IP))
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts()))
+ InitedRoots.insert(AI);
+
+ // Add root initializers.
+ bool MadeChange = false;
+
+ for (AllocaInst *Root : Roots)
+ if (!InitedRoots.count(Root)) {
+ new StoreInst(
+ ConstantPointerNull::get(cast<PointerType>(Root->getAllocatedType())),
+ Root, Root->getNextNode());
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores.
+/// Leave gcroot intrinsics; the code generator needs to see those.
+bool LowerIntrinsics::runOnFunction(Function &F) {
+ // Quick exit for functions that do not use GC.
+ if (!F.hasGC())
+ return false;
+
+ GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+ GCStrategy &S = FI.getStrategy();
+
+ return DoLowering(F, S);
+}
+
+/// Lower barriers out of existence (if the associated GCStrategy hasn't
+/// already done so...), and insert initializing stores to roots as a defensive
+/// measure. Given we're going to report all roots live at all safepoints, we
+/// need to be able to ensure each root has been initialized by the point the
+/// first safepoint is reached. This really should have been done by the
+/// frontend, but the old API made this non-obvious, so we do a potentially
+/// redundant store just in case.
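+///
+/// For illustration only: a write barrier call llvm.gcwrite(%val, %obj, %slot)
+/// is replaced below with a plain store of %val to %slot, a read barrier
+/// llvm.gcread(%obj, %slot) becomes a plain load from %slot, and any root
+/// declared via llvm.gcroot that is not already stored to before the first
+/// potential safe point gets a null store inserted right after its alloca.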
+bool LowerIntrinsics::DoLowering(Function &F, GCStrategy &S) {
+ SmallVector<AllocaInst *, 32> Roots;
+
+ bool MadeChange = false;
+ for (BasicBlock &BB : F)
+ for (Instruction &I : llvm::make_early_inc_range(BB)) {
+ IntrinsicInst *CI = dyn_cast<IntrinsicInst>(&I);
+ if (!CI)
+ continue;
+
+ Function *F = CI->getCalledFunction();
+ switch (F->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::gcwrite: {
+ // Replace a write barrier with a simple store.
+ Value *St = new StoreInst(CI->getArgOperand(0),
+ CI->getArgOperand(2), CI);
+ CI->replaceAllUsesWith(St);
+ CI->eraseFromParent();
+ MadeChange = true;
+ break;
+ }
+ case Intrinsic::gcread: {
+ // Replace a read barrier with a simple load.
+ Value *Ld = new LoadInst(CI->getType(), CI->getArgOperand(1), "", CI);
+ Ld->takeName(CI);
+ CI->replaceAllUsesWith(Ld);
+ CI->eraseFromParent();
+ MadeChange = true;
+ break;
+ }
+ case Intrinsic::gcroot: {
+ // Initialize the GC root, but do not delete the intrinsic. The
+ // backend needs the intrinsic to flag the stack slot.
+ Roots.push_back(
+ cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+ break;
+ }
+ }
+ }
+
+ if (Roots.size())
+ MadeChange |= InsertRootInitializers(F, Roots);
+
+ return MadeChange;
+}
+
+// -----------------------------------------------------------------------------
+
+char GCMachineCodeAnalysis::ID = 0;
+char &llvm::GCMachineCodeAnalysisID = GCMachineCodeAnalysis::ID;
+
+INITIALIZE_PASS(GCMachineCodeAnalysis, "gc-analysis",
+ "Analyze Machine Code For Garbage Collection", false, false)
+
+GCMachineCodeAnalysis::GCMachineCodeAnalysis() : MachineFunctionPass(ID) {}
+
+void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<GCModuleInfo>();
+}
+
+MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL) const {
+ MCSymbol *Label = MBB.getParent()->getContext().createTempSymbol();
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
+ return Label;
+}
+
+void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
+ // Find the return address (next instruction), since that's what will be on
+ // the stack when the call is suspended and we need to inspect the stack.
+ MachineBasicBlock::iterator RAI = CI;
+ ++RAI;
+
+ MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
+ FI->addSafePoint(Label, CI->getDebugLoc());
+}
+
+void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF)
+ for (MachineInstr &MI : MBB)
+ if (MI.isCall()) {
+ // Do not treat tail or sibling call sites as safe points. This is
+ // legal since any arguments passed to the callee which live in the
+        // remnants of the caller's frame will be owned and updated by the
+ // callee if required.
+ if (MI.isTerminator())
+ continue;
+ VisitCallPoint(&MI);
+ }
+}
+
+void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ assert(TFI && "TargetRegisterInfo not available!");
+
+ for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
+ RI != FI->roots_end();) {
+ // If the root references a dead object, no need to keep it.
+ if (MF.getFrameInfo().isDeadObjectIndex(RI->Num)) {
+ RI = FI->removeStackRoot(RI);
+ } else {
+ Register FrameReg; // FIXME: surely GCRoot ought to store the
+ // register that the offset is from?
+ auto FrameOffset = TFI->getFrameIndexReference(MF, RI->Num, FrameReg);
+ assert(!FrameOffset.getScalable() &&
+ "Frame offsets with a scalable component are not supported");
+ RI->StackOffset = FrameOffset.getFixed();
+ ++RI;
+ }
+ }
+}
+
+bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
+ // Quick exit for functions that do not use GC.
+ if (!MF.getFunction().hasGC())
+ return false;
+
+ FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(MF.getFunction());
+ TII = MF.getSubtarget().getInstrInfo();
+
+ // Find the size of the stack frame. There may be no correct static frame
+  // size; we use UINT64_MAX to represent this.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ const bool DynamicFrameSize =
+ MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF);
+ FI->setFrameSize(DynamicFrameSize ? UINT64_MAX : MFI.getStackSize());
+
+ // Find all safe points.
+ if (FI->getStrategy().needsSafePoints())
+ FindSafePoints(MF);
+
+ // Find the concrete stack offsets for all roots (stack slots)
+ FindStackOffsets(MF);
+
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
new file mode 100644
index 000000000000..e047996f9aa8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
@@ -0,0 +1,452 @@
+//===- CSEInfo.cpp ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GISelCSEInfo analysis, which keeps track of
+// CSE-able generic machine instructions so that MIR builders can reuse them
+// instead of emitting duplicates.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Error.h"
+
+#define DEBUG_TYPE "cseinfo"
+
+using namespace llvm;
+char llvm::GISelCSEAnalysisWrapperPass::ID = 0;
+GISelCSEAnalysisWrapperPass::GISelCSEAnalysisWrapperPass()
+ : MachineFunctionPass(ID) {
+ initializeGISelCSEAnalysisWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+INITIALIZE_PASS_BEGIN(GISelCSEAnalysisWrapperPass, DEBUG_TYPE,
+ "Analysis containing CSE Info", false, true)
+INITIALIZE_PASS_END(GISelCSEAnalysisWrapperPass, DEBUG_TYPE,
+ "Analysis containing CSE Info", false, true)
+
+/// -------- UniqueMachineInstr -------------//
+
+void UniqueMachineInstr::Profile(FoldingSetNodeID &ID) {
+ GISelInstProfileBuilder(ID, MI->getMF()->getRegInfo()).addNodeID(MI);
+}
+/// -----------------------------------------
+
+/// --------- CSEConfigFull ---------- ///
+bool CSEConfigFull::shouldCSEOpc(unsigned Opc) {
+ switch (Opc) {
+ default:
+ break;
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_XOR:
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UREM:
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_UNMERGE_VALUES:
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_PTR_ADD:
+ case TargetOpcode::G_EXTRACT:
+ case TargetOpcode::G_SELECT:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC:
+ case TargetOpcode::G_SEXT_INREG:
+ return true;
+ }
+ return false;
+}
+
+bool CSEConfigConstantOnly::shouldCSEOpc(unsigned Opc) {
+ return Opc == TargetOpcode::G_CONSTANT || Opc == TargetOpcode::G_FCONSTANT ||
+ Opc == TargetOpcode::G_IMPLICIT_DEF;
+}
+
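+// At CodeGenOpt::None only constants (and G_IMPLICIT_DEF) are CSE'd; at higher
+// optimization levels the full opcode list above is used.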
+std::unique_ptr<CSEConfigBase>
+llvm::getStandardCSEConfigForOpt(CodeGenOpt::Level Level) {
+ std::unique_ptr<CSEConfigBase> Config;
+ if (Level == CodeGenOpt::None)
+ Config = std::make_unique<CSEConfigConstantOnly>();
+ else
+ Config = std::make_unique<CSEConfigFull>();
+ return Config;
+}
+
+/// -----------------------------------------
+
+/// -------- GISelCSEInfo -------------//
+void GISelCSEInfo::setMF(MachineFunction &MF) {
+ this->MF = &MF;
+ this->MRI = &MF.getRegInfo();
+}
+
+GISelCSEInfo::~GISelCSEInfo() = default;
+
+bool GISelCSEInfo::isUniqueMachineInstValid(
+ const UniqueMachineInstr &UMI) const {
+ // Should we check here and assert that the instruction has been fully
+ // constructed?
+ // FIXME: Any other checks required to be done here? Remove this method if
+ // none.
+ return true;
+}
+
+void GISelCSEInfo::invalidateUniqueMachineInstr(UniqueMachineInstr *UMI) {
+ bool Removed = CSEMap.RemoveNode(UMI);
+ (void)Removed;
+ assert(Removed && "Invalidation called on invalid UMI");
+ // FIXME: Should UMI be deallocated/destroyed?
+}
+
+UniqueMachineInstr *GISelCSEInfo::getNodeIfExists(FoldingSetNodeID &ID,
+ MachineBasicBlock *MBB,
+ void *&InsertPos) {
+ auto *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+ if (Node) {
+ if (!isUniqueMachineInstValid(*Node)) {
+ invalidateUniqueMachineInstr(Node);
+ return nullptr;
+ }
+
+ if (Node->MI->getParent() != MBB)
+ return nullptr;
+ }
+ return Node;
+}
+
+void GISelCSEInfo::insertNode(UniqueMachineInstr *UMI, void *InsertPos) {
+ handleRecordedInsts();
+ assert(UMI);
+ UniqueMachineInstr *MaybeNewNode = UMI;
+ if (InsertPos)
+ CSEMap.InsertNode(UMI, InsertPos);
+ else
+ MaybeNewNode = CSEMap.GetOrInsertNode(UMI);
+ if (MaybeNewNode != UMI) {
+ // A similar node exists in the folding set. Let's ignore this one.
+ return;
+ }
+ assert(InstrMapping.count(UMI->MI) == 0 &&
+ "This instruction should not be in the map");
+ InstrMapping[UMI->MI] = MaybeNewNode;
+}
+
+UniqueMachineInstr *GISelCSEInfo::getUniqueInstrForMI(const MachineInstr *MI) {
+ assert(shouldCSE(MI->getOpcode()) && "Trying to CSE an unsupported Node");
+ auto *Node = new (UniqueInstrAllocator) UniqueMachineInstr(MI);
+ return Node;
+}
+
+void GISelCSEInfo::insertInstr(MachineInstr *MI, void *InsertPos) {
+ assert(MI);
+ // If it exists in temporary insts, remove it.
+ TemporaryInsts.remove(MI);
+ auto *Node = getUniqueInstrForMI(MI);
+ insertNode(Node, InsertPos);
+}
+
+MachineInstr *GISelCSEInfo::getMachineInstrIfExists(FoldingSetNodeID &ID,
+ MachineBasicBlock *MBB,
+ void *&InsertPos) {
+ handleRecordedInsts();
+ if (auto *Inst = getNodeIfExists(ID, MBB, InsertPos)) {
+ LLVM_DEBUG(dbgs() << "CSEInfo::Found Instr " << *Inst->MI;);
+ return const_cast<MachineInstr *>(Inst->MI);
+ }
+ return nullptr;
+}
+
+void GISelCSEInfo::countOpcodeHit(unsigned Opc) {
+#ifndef NDEBUG
+ if (OpcodeHitTable.count(Opc))
+ OpcodeHitTable[Opc] += 1;
+ else
+ OpcodeHitTable[Opc] = 1;
+#endif
+ // Else do nothing.
+}
+
+void GISelCSEInfo::recordNewInstruction(MachineInstr *MI) {
+ if (shouldCSE(MI->getOpcode())) {
+ TemporaryInsts.insert(MI);
+ LLVM_DEBUG(dbgs() << "CSEInfo::Recording new MI " << *MI);
+ }
+}
+
+void GISelCSEInfo::handleRecordedInst(MachineInstr *MI) {
+ assert(shouldCSE(MI->getOpcode()) && "Invalid instruction for CSE");
+ auto *UMI = InstrMapping.lookup(MI);
+ LLVM_DEBUG(dbgs() << "CSEInfo::Handling recorded MI " << *MI);
+ if (UMI) {
+ // Invalidate this MI.
+ invalidateUniqueMachineInstr(UMI);
+ InstrMapping.erase(MI);
+ }
+ /// Now insert the new instruction.
+ if (UMI) {
+ /// We'll reuse the same UniqueMachineInstr to avoid the new
+ /// allocation.
+ *UMI = UniqueMachineInstr(MI);
+ insertNode(UMI, nullptr);
+ } else {
+ /// This is a new instruction. Allocate a new UniqueMachineInstr and
+ /// Insert.
+ insertInstr(MI);
+ }
+}
+
+void GISelCSEInfo::handleRemoveInst(MachineInstr *MI) {
+ if (auto *UMI = InstrMapping.lookup(MI)) {
+ invalidateUniqueMachineInstr(UMI);
+ InstrMapping.erase(MI);
+ }
+ TemporaryInsts.remove(MI);
+}
+
+void GISelCSEInfo::handleRecordedInsts() {
+ if (HandlingRecordedInstrs)
+ return;
+ HandlingRecordedInstrs = true;
+ while (!TemporaryInsts.empty()) {
+ auto *MI = TemporaryInsts.pop_back_val();
+ handleRecordedInst(MI);
+ }
+ HandlingRecordedInstrs = false;
+}
+
+bool GISelCSEInfo::shouldCSE(unsigned Opc) const {
+ assert(CSEOpt.get() && "CSEConfig not set");
+ return CSEOpt->shouldCSEOpc(Opc);
+}
+
+void GISelCSEInfo::erasingInstr(MachineInstr &MI) { handleRemoveInst(&MI); }
+void GISelCSEInfo::createdInstr(MachineInstr &MI) { recordNewInstruction(&MI); }
+void GISelCSEInfo::changingInstr(MachineInstr &MI) {
+ // For now, perform erase, followed by insert.
+ erasingInstr(MI);
+ createdInstr(MI);
+}
+void GISelCSEInfo::changedInstr(MachineInstr &MI) { changingInstr(MI); }
+
+void GISelCSEInfo::analyze(MachineFunction &MF) {
+ setMF(MF);
+ for (auto &MBB : MF) {
+ if (MBB.empty())
+ continue;
+ for (MachineInstr &MI : MBB) {
+ if (!shouldCSE(MI.getOpcode()))
+ continue;
+ LLVM_DEBUG(dbgs() << "CSEInfo::Add MI: " << MI);
+ insertInstr(&MI);
+ }
+ }
+}
+
+void GISelCSEInfo::releaseMemory() {
+ print();
+ CSEMap.clear();
+ InstrMapping.clear();
+ UniqueInstrAllocator.Reset();
+ TemporaryInsts.clear();
+ CSEOpt.reset();
+ MRI = nullptr;
+ MF = nullptr;
+#ifndef NDEBUG
+ OpcodeHitTable.clear();
+#endif
+}
+
+#ifndef NDEBUG
+static const char *stringify(const MachineInstr *MI, std::string &S) {
+ raw_string_ostream OS(S);
+ OS << *MI;
+ return OS.str().c_str();
+}
+#endif
+
+Error GISelCSEInfo::verify() {
+#ifndef NDEBUG
+ std::string S1, S2;
+ handleRecordedInsts();
+ // For each instruction in map from MI -> UMI,
+ // Profile(MI) and make sure UMI is found for that profile.
+ for (auto &It : InstrMapping) {
+ FoldingSetNodeID TmpID;
+ GISelInstProfileBuilder(TmpID, *MRI).addNodeID(It.first);
+ void *InsertPos;
+ UniqueMachineInstr *FoundNode =
+ CSEMap.FindNodeOrInsertPos(TmpID, InsertPos);
+ if (FoundNode != It.second)
+ return createStringError(std::errc::not_supported,
+ "CSEMap mismatch, InstrMapping has MIs without "
+ "corresponding Nodes in CSEMap:\n%s",
+ stringify(It.second->MI, S1));
+ }
+
+ // For every node in the CSEMap, make sure that the InstrMapping
+ // points to it.
+ for (const UniqueMachineInstr &UMI : CSEMap) {
+ if (!InstrMapping.count(UMI.MI))
+ return createStringError(std::errc::not_supported,
+ "Node in CSE without InstrMapping:\n%s",
+ stringify(UMI.MI, S1));
+
+ if (InstrMapping[UMI.MI] != &UMI)
+ return createStringError(std::make_error_code(std::errc::not_supported),
+ "Mismatch in CSE mapping:\n%s\n%s",
+ stringify(InstrMapping[UMI.MI]->MI, S1),
+ stringify(UMI.MI, S2));
+ }
+#endif
+ return Error::success();
+}
+
+void GISelCSEInfo::print() {
+  LLVM_DEBUG(for (auto &It : OpcodeHitTable) {
+ dbgs() << "CSEInfo::CSE Hit for Opc " << It.first << " : " << It.second
+ << "\n";
+ };);
+}
+/// -----------------------------------------
+// ---- Profiling methods for FoldingSetNode --- //
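+// An instruction's profile consists of its parent basic block, its opcode,
+// each of its operands (registers along with their LLT and register
+// class/bank), and its MI flags; instructions with identical profiles in the
+// same block are candidates for CSE.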
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeID(const MachineInstr *MI) const {
+ addNodeIDMBB(MI->getParent());
+ addNodeIDOpcode(MI->getOpcode());
+ for (const auto &Op : MI->operands())
+ addNodeIDMachineOperand(Op);
+ addNodeIDFlag(MI->getFlags());
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDOpcode(unsigned Opc) const {
+ ID.AddInteger(Opc);
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDRegType(const LLT Ty) const {
+ uint64_t Val = Ty.getUniqueRAWLLTData();
+ ID.AddInteger(Val);
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDRegType(const TargetRegisterClass *RC) const {
+ ID.AddPointer(RC);
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDRegType(const RegisterBank *RB) const {
+ ID.AddPointer(RB);
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDImmediate(int64_t Imm) const {
+ ID.AddInteger(Imm);
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDRegNum(Register Reg) const {
+ ID.AddInteger(Reg);
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDRegType(const Register Reg) const {
+ addNodeIDMachineOperand(MachineOperand::CreateReg(Reg, false));
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDMBB(const MachineBasicBlock *MBB) const {
+ ID.AddPointer(MBB);
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDFlag(unsigned Flag) const {
+ if (Flag)
+ ID.AddInteger(Flag);
+ return *this;
+}
+
+const GISelInstProfileBuilder &
+GISelInstProfileBuilder::addNodeIDReg(Register Reg) const {
+ LLT Ty = MRI.getType(Reg);
+ if (Ty.isValid())
+ addNodeIDRegType(Ty);
+
+ if (const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(Reg)) {
+ if (const auto *RB = dyn_cast_if_present<const RegisterBank *>(RCOrRB))
+ addNodeIDRegType(RB);
+ else if (const auto *RC =
+ dyn_cast_if_present<const TargetRegisterClass *>(RCOrRB))
+ addNodeIDRegType(RC);
+ }
+ return *this;
+}
+
+const GISelInstProfileBuilder &GISelInstProfileBuilder::addNodeIDMachineOperand(
+ const MachineOperand &MO) const {
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ if (!MO.isDef())
+ addNodeIDRegNum(Reg);
+
+ // Profile the register properties.
+ addNodeIDReg(Reg);
+ assert(!MO.isImplicit() && "Unhandled case");
+ } else if (MO.isImm())
+ ID.AddInteger(MO.getImm());
+ else if (MO.isCImm())
+ ID.AddPointer(MO.getCImm());
+ else if (MO.isFPImm())
+ ID.AddPointer(MO.getFPImm());
+ else if (MO.isPredicate())
+ ID.AddInteger(MO.getPredicate());
+ else
+ llvm_unreachable("Unhandled operand type");
+ // Handle other types
+ return *this;
+}
+
+GISelCSEInfo &
+GISelCSEAnalysisWrapper::get(std::unique_ptr<CSEConfigBase> CSEOpt,
+ bool Recompute) {
+ if (!AlreadyComputed || Recompute) {
+ Info.releaseMemory();
+ Info.setCSEConfig(std::move(CSEOpt));
+ Info.analyze(*MF);
+ AlreadyComputed = true;
+ }
+ return Info;
+}
+void GISelCSEAnalysisWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool GISelCSEAnalysisWrapperPass::runOnMachineFunction(MachineFunction &MF) {
+ releaseMemory();
+ Wrapper.setMF(MF);
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
new file mode 100644
index 000000000000..64e2d517e3b9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -0,0 +1,354 @@
+//===-- llvm/CodeGen/GlobalISel/CSEMIRBuilder.cpp - MIBuilder--*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the CSEMIRBuilder class which CSEs as it builds
+/// instructions.
+//===----------------------------------------------------------------------===//
+//
+
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+
+using namespace llvm;
+
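+// Returns true if A dominates B within their shared basic block, i.e. A is
+// reached before B when scanning the block from the top (B == end() is
+// dominated by everything).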
+bool CSEMIRBuilder::dominates(MachineBasicBlock::const_iterator A,
+ MachineBasicBlock::const_iterator B) const {
+ auto MBBEnd = getMBB().end();
+ if (B == MBBEnd)
+ return true;
+ assert(A->getParent() == B->getParent() &&
+ "Iterators should be in same block");
+ const MachineBasicBlock *BBA = A->getParent();
+ MachineBasicBlock::const_iterator I = BBA->begin();
+ for (; &*I != A && &*I != B; ++I)
+ ;
+ return &*I == A;
+}
+
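+// Look up an instruction with the given profile in the current basic block.
+// On a hit, either advance the insertion point past the existing instruction
+// or splice the instruction up to the insertion point, so its def is available
+// to subsequent uses of this builder.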
+MachineInstrBuilder
+CSEMIRBuilder::getDominatingInstrForID(FoldingSetNodeID &ID,
+ void *&NodeInsertPos) {
+ GISelCSEInfo *CSEInfo = getCSEInfo();
+ assert(CSEInfo && "Can't get here without setting CSEInfo");
+ MachineBasicBlock *CurMBB = &getMBB();
+ MachineInstr *MI =
+ CSEInfo->getMachineInstrIfExists(ID, CurMBB, NodeInsertPos);
+ if (MI) {
+ CSEInfo->countOpcodeHit(MI->getOpcode());
+ auto CurrPos = getInsertPt();
+ auto MII = MachineBasicBlock::iterator(MI);
+ if (MII == CurrPos) {
+ // Move the insert point ahead of the instruction so any future uses of
+ // this builder will have the def ready.
+ setInsertPt(*CurMBB, std::next(MII));
+ } else if (!dominates(MI, CurrPos)) {
+ CurMBB->splice(CurrPos, CurMBB, MI);
+ }
+ return MachineInstrBuilder(getMF(), MI);
+ }
+ return MachineInstrBuilder();
+}
+
+bool CSEMIRBuilder::canPerformCSEForOpc(unsigned Opc) const {
+ const GISelCSEInfo *CSEInfo = getCSEInfo();
+ if (!CSEInfo || !CSEInfo->shouldCSE(Opc))
+ return false;
+ return true;
+}
+
+void CSEMIRBuilder::profileDstOp(const DstOp &Op,
+ GISelInstProfileBuilder &B) const {
+ switch (Op.getDstOpKind()) {
+ case DstOp::DstType::Ty_RC:
+ B.addNodeIDRegType(Op.getRegClass());
+ break;
+ case DstOp::DstType::Ty_Reg: {
+ // Regs can have LLT&(RB|RC). If those exist, profile them as well.
+ B.addNodeIDReg(Op.getReg());
+ break;
+ }
+ default:
+ B.addNodeIDRegType(Op.getLLTTy(*getMRI()));
+ break;
+ }
+}
+
+void CSEMIRBuilder::profileSrcOp(const SrcOp &Op,
+ GISelInstProfileBuilder &B) const {
+ switch (Op.getSrcOpKind()) {
+ case SrcOp::SrcType::Ty_Imm:
+ B.addNodeIDImmediate(static_cast<int64_t>(Op.getImm()));
+ break;
+ case SrcOp::SrcType::Ty_Predicate:
+ B.addNodeIDImmediate(static_cast<int64_t>(Op.getPredicate()));
+ break;
+ default:
+ B.addNodeIDRegType(Op.getReg());
+ break;
+ }
+}
+
+void CSEMIRBuilder::profileMBBOpcode(GISelInstProfileBuilder &B,
+ unsigned Opc) const {
+ // First add the MBB (Local CSE).
+ B.addNodeIDMBB(&getMBB());
+ // Then add the opcode.
+ B.addNodeIDOpcode(Opc);
+}
+
+void CSEMIRBuilder::profileEverything(unsigned Opc, ArrayRef<DstOp> DstOps,
+ ArrayRef<SrcOp> SrcOps,
+ std::optional<unsigned> Flags,
+ GISelInstProfileBuilder &B) const {
+
+ profileMBBOpcode(B, Opc);
+ // Then add the DstOps.
+ profileDstOps(DstOps, B);
+ // Then add the SrcOps.
+ profileSrcOps(SrcOps, B);
+ // Add Flags if passed in.
+ if (Flags)
+ B.addNodeIDFlag(*Flags);
+}
+
+MachineInstrBuilder CSEMIRBuilder::memoizeMI(MachineInstrBuilder MIB,
+ void *NodeInsertPos) {
+ assert(canPerformCSEForOpc(MIB->getOpcode()) &&
+ "Attempting to CSE illegal op");
+ MachineInstr *MIBInstr = MIB;
+ getCSEInfo()->insertInstr(MIBInstr, NodeInsertPos);
+ return MIB;
+}
+
+bool CSEMIRBuilder::checkCopyToDefsPossible(ArrayRef<DstOp> DstOps) {
+ if (DstOps.size() == 1)
+ return true; // always possible to emit copy to just 1 vreg.
+
+ return llvm::all_of(DstOps, [](const DstOp &Op) {
+ DstOp::DstType DT = Op.getDstOpKind();
+ return DT == DstOp::DstType::Ty_LLT || DT == DstOp::DstType::Ty_RC;
+ });
+}
+
+MachineInstrBuilder
+CSEMIRBuilder::generateCopiesIfRequired(ArrayRef<DstOp> DstOps,
+ MachineInstrBuilder &MIB) {
+ assert(checkCopyToDefsPossible(DstOps) &&
+         "Impossible to return a single MIB with copies to multiple defs");
+ if (DstOps.size() == 1) {
+ const DstOp &Op = DstOps[0];
+ if (Op.getDstOpKind() == DstOp::DstType::Ty_Reg)
+ return buildCopy(Op.getReg(), MIB.getReg(0));
+ }
+
+ // If we didn't generate a copy then we're re-using an existing node directly
+ // instead of emitting any code. Merge the debug location we wanted to emit
+  // into the instruction we're CSE'ing with. Debug locations aren't part of the
+ // profile so we don't need to recompute it.
+ if (getDebugLoc()) {
+ GISelChangeObserver *Observer = getState().Observer;
+ if (Observer)
+ Observer->changingInstr(*MIB);
+ MIB->setDebugLoc(
+ DILocation::getMergedLocation(MIB->getDebugLoc(), getDebugLoc()));
+ if (Observer)
+ Observer->changedInstr(*MIB);
+ }
+
+ return MIB;
+}
+
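+// buildInstr first tries to constant fold the requested operation, then looks
+// for an equivalent instruction already recorded by GISelCSEInfo, and only
+// builds (and memoizes) a fresh instruction when both fail. Illustrative
+// sketch, names hypothetical:
+//   auto A = CSEB.buildInstr(TargetOpcode::G_ADD, {S32Ty}, {X, Y});
+//   auto B = CSEB.buildInstr(TargetOpcode::G_ADD, {S32Ty}, {X, Y});
+// issued back to back in the same block return the same G_ADD rather than a
+// duplicate instruction.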
+MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
+ ArrayRef<DstOp> DstOps,
+ ArrayRef<SrcOp> SrcOps,
+ std::optional<unsigned> Flag) {
+ switch (Opc) {
+ default:
+ break;
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_PTR_ADD:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_XOR:
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UREM:
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SMAX:
+ case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UMAX: {
+ // Try to constant fold these.
+ assert(SrcOps.size() == 2 && "Invalid sources");
+ assert(DstOps.size() == 1 && "Invalid dsts");
+ LLT SrcTy = SrcOps[0].getLLTTy(*getMRI());
+
+ if (Opc == TargetOpcode::G_PTR_ADD &&
+ getDataLayout().isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
+ break;
+
+ if (SrcTy.isVector()) {
+ // Try to constant fold vector constants.
+ SmallVector<APInt> VecCst = ConstantFoldVectorBinop(
+ Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI());
+ if (!VecCst.empty())
+ return buildBuildVectorConstant(DstOps[0], VecCst);
+ break;
+ }
+
+ if (std::optional<APInt> Cst = ConstantFoldBinOp(
+ Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI()))
+ return buildConstant(DstOps[0], *Cst);
+ break;
+ }
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FREM:
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMAXNUM_IEEE:
+ case TargetOpcode::G_FMINIMUM:
+ case TargetOpcode::G_FMAXIMUM:
+ case TargetOpcode::G_FCOPYSIGN: {
+ // Try to constant fold these.
+ assert(SrcOps.size() == 2 && "Invalid sources");
+ assert(DstOps.size() == 1 && "Invalid dsts");
+ if (std::optional<APFloat> Cst = ConstantFoldFPBinOp(
+ Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI()))
+ return buildFConstant(DstOps[0], *Cst);
+ break;
+ }
+ case TargetOpcode::G_SEXT_INREG: {
+ assert(DstOps.size() == 1 && "Invalid dst ops");
+ assert(SrcOps.size() == 2 && "Invalid src ops");
+ const DstOp &Dst = DstOps[0];
+ const SrcOp &Src0 = SrcOps[0];
+ const SrcOp &Src1 = SrcOps[1];
+ if (auto MaybeCst =
+ ConstantFoldExtOp(Opc, Src0.getReg(), Src1.getImm(), *getMRI()))
+ return buildConstant(Dst, *MaybeCst);
+ break;
+ }
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP: {
+ // Try to constant fold these.
+ assert(SrcOps.size() == 1 && "Invalid sources");
+ assert(DstOps.size() == 1 && "Invalid dsts");
+ if (std::optional<APFloat> Cst = ConstantFoldIntToFloat(
+ Opc, DstOps[0].getLLTTy(*getMRI()), SrcOps[0].getReg(), *getMRI()))
+ return buildFConstant(DstOps[0], *Cst);
+ break;
+ }
+ case TargetOpcode::G_CTLZ: {
+ assert(SrcOps.size() == 1 && "Expected one source");
+ assert(DstOps.size() == 1 && "Expected one dest");
+ auto MaybeCsts = ConstantFoldCTLZ(SrcOps[0].getReg(), *getMRI());
+ if (!MaybeCsts)
+ break;
+ if (MaybeCsts->size() == 1)
+ return buildConstant(DstOps[0], (*MaybeCsts)[0]);
+ // This was a vector constant. Build a G_BUILD_VECTOR for them.
+ SmallVector<Register> ConstantRegs;
+ LLT VecTy = DstOps[0].getLLTTy(*getMRI());
+ for (unsigned Cst : *MaybeCsts)
+ ConstantRegs.emplace_back(
+ buildConstant(VecTy.getScalarType(), Cst).getReg(0));
+ return buildBuildVector(DstOps[0], ConstantRegs);
+ }
+ }
+ bool CanCopy = checkCopyToDefsPossible(DstOps);
+ if (!canPerformCSEForOpc(Opc))
+ return MachineIRBuilder::buildInstr(Opc, DstOps, SrcOps, Flag);
+  // If we can CSE this instruction, but doing so involves generating copies to
+  // multiple regs, give up. This frequently happens to UNMERGEs.
+ if (!CanCopy) {
+ auto MIB = MachineIRBuilder::buildInstr(Opc, DstOps, SrcOps, Flag);
+ // CSEInfo would have tracked this instruction. Remove it from the temporary
+ // insts.
+ getCSEInfo()->handleRemoveInst(&*MIB);
+ return MIB;
+ }
+ FoldingSetNodeID ID;
+ GISelInstProfileBuilder ProfBuilder(ID, *getMRI());
+ void *InsertPos = nullptr;
+ profileEverything(Opc, DstOps, SrcOps, Flag, ProfBuilder);
+ MachineInstrBuilder MIB = getDominatingInstrForID(ID, InsertPos);
+ if (MIB) {
+ // Handle generating copies here.
+ return generateCopiesIfRequired(DstOps, MIB);
+ }
+ // This instruction does not exist in the CSEInfo. Build it and CSE it.
+ MachineInstrBuilder NewMIB =
+ MachineIRBuilder::buildInstr(Opc, DstOps, SrcOps, Flag);
+ return memoizeMI(NewMIB, InsertPos);
+}
+
+MachineInstrBuilder CSEMIRBuilder::buildConstant(const DstOp &Res,
+ const ConstantInt &Val) {
+ constexpr unsigned Opc = TargetOpcode::G_CONSTANT;
+ if (!canPerformCSEForOpc(Opc))
+ return MachineIRBuilder::buildConstant(Res, Val);
+
+ // For vectors, CSE the element only for now.
+ LLT Ty = Res.getLLTTy(*getMRI());
+ if (Ty.isVector())
+ return buildSplatVector(Res, buildConstant(Ty.getElementType(), Val));
+
+ FoldingSetNodeID ID;
+ GISelInstProfileBuilder ProfBuilder(ID, *getMRI());
+ void *InsertPos = nullptr;
+ profileMBBOpcode(ProfBuilder, Opc);
+ profileDstOp(Res, ProfBuilder);
+ ProfBuilder.addNodeIDMachineOperand(MachineOperand::CreateCImm(&Val));
+ MachineInstrBuilder MIB = getDominatingInstrForID(ID, InsertPos);
+ if (MIB) {
+ // Handle generating copies here.
+ return generateCopiesIfRequired({Res}, MIB);
+ }
+
+ MachineInstrBuilder NewMIB = MachineIRBuilder::buildConstant(Res, Val);
+ return memoizeMI(NewMIB, InsertPos);
+}
+
+MachineInstrBuilder CSEMIRBuilder::buildFConstant(const DstOp &Res,
+ const ConstantFP &Val) {
+ constexpr unsigned Opc = TargetOpcode::G_FCONSTANT;
+ if (!canPerformCSEForOpc(Opc))
+ return MachineIRBuilder::buildFConstant(Res, Val);
+
+ // For vectors, CSE the element only for now.
+ LLT Ty = Res.getLLTTy(*getMRI());
+ if (Ty.isVector())
+ return buildSplatVector(Res, buildFConstant(Ty.getElementType(), Val));
+
+ FoldingSetNodeID ID;
+ GISelInstProfileBuilder ProfBuilder(ID, *getMRI());
+ void *InsertPos = nullptr;
+ profileMBBOpcode(ProfBuilder, Opc);
+ profileDstOp(Res, ProfBuilder);
+ ProfBuilder.addNodeIDMachineOperand(MachineOperand::CreateFPImm(&Val));
+ MachineInstrBuilder MIB = getDominatingInstrForID(ID, InsertPos);
+ if (MIB) {
+ // Handle generating copies here.
+ return generateCopiesIfRequired({Res}, MIB);
+ }
+ MachineInstrBuilder NewMIB = MachineIRBuilder::buildFConstant(Res, Val);
+ return memoizeMI(NewMIB, InsertPos);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
new file mode 100644
index 000000000000..28c33e2038e4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -0,0 +1,1241 @@
+//===-- lib/CodeGen/GlobalISel/CallLowering.cpp - Call lowering -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements some simple delegations needed for call lowering.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "call-lowering"
+
+using namespace llvm;
+
+void CallLowering::anchor() {}
+
+/// Helper function which updates \p Flags when \p AttrFn returns true.
+static void
+addFlagsUsingAttrFn(ISD::ArgFlagsTy &Flags,
+ const std::function<bool(Attribute::AttrKind)> &AttrFn) {
+ if (AttrFn(Attribute::SExt))
+ Flags.setSExt();
+ if (AttrFn(Attribute::ZExt))
+ Flags.setZExt();
+ if (AttrFn(Attribute::InReg))
+ Flags.setInReg();
+ if (AttrFn(Attribute::StructRet))
+ Flags.setSRet();
+ if (AttrFn(Attribute::Nest))
+ Flags.setNest();
+ if (AttrFn(Attribute::ByVal))
+ Flags.setByVal();
+ if (AttrFn(Attribute::Preallocated))
+ Flags.setPreallocated();
+ if (AttrFn(Attribute::InAlloca))
+ Flags.setInAlloca();
+ if (AttrFn(Attribute::Returned))
+ Flags.setReturned();
+ if (AttrFn(Attribute::SwiftSelf))
+ Flags.setSwiftSelf();
+ if (AttrFn(Attribute::SwiftAsync))
+ Flags.setSwiftAsync();
+ if (AttrFn(Attribute::SwiftError))
+ Flags.setSwiftError();
+}
+
+ISD::ArgFlagsTy CallLowering::getAttributesForArgIdx(const CallBase &Call,
+ unsigned ArgIdx) const {
+ ISD::ArgFlagsTy Flags;
+ addFlagsUsingAttrFn(Flags, [&Call, &ArgIdx](Attribute::AttrKind Attr) {
+ return Call.paramHasAttr(ArgIdx, Attr);
+ });
+ return Flags;
+}
+
+ISD::ArgFlagsTy
+CallLowering::getAttributesForReturn(const CallBase &Call) const {
+ ISD::ArgFlagsTy Flags;
+ addFlagsUsingAttrFn(Flags, [&Call](Attribute::AttrKind Attr) {
+ return Call.hasRetAttr(Attr);
+ });
+ return Flags;
+}
+
+void CallLowering::addArgFlagsFromAttributes(ISD::ArgFlagsTy &Flags,
+ const AttributeList &Attrs,
+ unsigned OpIdx) const {
+ addFlagsUsingAttrFn(Flags, [&Attrs, &OpIdx](Attribute::AttrKind Attr) {
+ return Attrs.hasAttributeAtIndex(OpIdx, Attr);
+ });
+}
+
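+// Generic driver for lowering a call site: gather argument and return info
+// from the CallBase, decide whether a tail call is possible, then hand the
+// populated CallLoweringInfo to the target-specific lowerCall() overload.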
+bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
+ ArrayRef<Register> ResRegs,
+ ArrayRef<ArrayRef<Register>> ArgRegs,
+ Register SwiftErrorVReg,
+ std::function<unsigned()> GetCalleeReg) const {
+ CallLoweringInfo Info;
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool CanBeTailCalled = CB.isTailCall() &&
+ isInTailCallPosition(CB, MF.getTarget()) &&
+ (MF.getFunction()
+ .getFnAttribute("disable-tail-calls")
+ .getValueAsString() != "true");
+
+ CallingConv::ID CallConv = CB.getCallingConv();
+ Type *RetTy = CB.getType();
+ bool IsVarArg = CB.getFunctionType()->isVarArg();
+
+ SmallVector<BaseArgInfo, 4> SplitArgs;
+ getReturnInfo(CallConv, RetTy, CB.getAttributes(), SplitArgs, DL);
+ Info.CanLowerReturn = canLowerReturn(MF, CallConv, SplitArgs, IsVarArg);
+
+ if (!Info.CanLowerReturn) {
+ // Callee requires sret demotion.
+ insertSRetOutgoingArgument(MIRBuilder, CB, Info);
+
+ // The sret demotion isn't compatible with tail-calls, since the sret
+ // argument points into the caller's stack frame.
+ CanBeTailCalled = false;
+ }
+
+ // First step is to marshall all the function's parameters into the correct
+ // physregs and memory locations. Gather the sequence of argument types that
+ // we'll pass to the assigner function.
+ unsigned i = 0;
+ unsigned NumFixedArgs = CB.getFunctionType()->getNumParams();
+ for (const auto &Arg : CB.args()) {
+ ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i),
+ i < NumFixedArgs};
+ setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB);
+
+ // If we have an explicit sret argument that is an Instruction, (i.e., it
+ // might point to function-local memory), we can't meaningfully tail-call.
+ if (OrigArg.Flags[0].isSRet() && isa<Instruction>(&Arg))
+ CanBeTailCalled = false;
+
+ Info.OrigArgs.push_back(OrigArg);
+ ++i;
+ }
+
+ // Try looking through a bitcast from one function type to another.
+ // Commonly happens with calls to objc_msgSend().
+ const Value *CalleeV = CB.getCalledOperand()->stripPointerCasts();
+ if (const Function *F = dyn_cast<Function>(CalleeV))
+ Info.Callee = MachineOperand::CreateGA(F, 0);
+ else
+ Info.Callee = MachineOperand::CreateReg(GetCalleeReg(), false);
+
+ Register ReturnHintAlignReg;
+ Align ReturnHintAlign;
+
+ Info.OrigRet = ArgInfo{ResRegs, RetTy, 0, getAttributesForReturn(CB)};
+
+ if (!Info.OrigRet.Ty->isVoidTy()) {
+ setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CB);
+
+ if (MaybeAlign Alignment = CB.getRetAlign()) {
+ if (*Alignment > Align(1)) {
+ ReturnHintAlignReg = MRI.cloneVirtualRegister(ResRegs[0]);
+ Info.OrigRet.Regs[0] = ReturnHintAlignReg;
+ ReturnHintAlign = *Alignment;
+ }
+ }
+ }
+
+ auto Bundle = CB.getOperandBundle(LLVMContext::OB_kcfi);
+ if (Bundle && CB.isIndirectCall()) {
+ Info.CFIType = cast<ConstantInt>(Bundle->Inputs[0]);
+ assert(Info.CFIType->getType()->isIntegerTy(32) && "Invalid CFI type");
+ }
+
+ Info.CB = &CB;
+ Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees);
+ Info.CallConv = CallConv;
+ Info.SwiftErrorVReg = SwiftErrorVReg;
+ Info.IsMustTailCall = CB.isMustTailCall();
+ Info.IsTailCall = CanBeTailCalled;
+ Info.IsVarArg = IsVarArg;
+ if (!lowerCall(MIRBuilder, Info))
+ return false;
+
+ if (ReturnHintAlignReg && !Info.IsTailCall) {
+ MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg,
+ ReturnHintAlign);
+ }
+
+ return true;
+}
+
+template <typename FuncInfoTy>
+void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
+ const DataLayout &DL,
+ const FuncInfoTy &FuncInfo) const {
+ auto &Flags = Arg.Flags[0];
+ const AttributeList &Attrs = FuncInfo.getAttributes();
+ addArgFlagsFromAttributes(Flags, Attrs, OpIdx);
+
+ PointerType *PtrTy = dyn_cast<PointerType>(Arg.Ty->getScalarType());
+ if (PtrTy) {
+ Flags.setPointer();
+ Flags.setPointerAddrSpace(PtrTy->getPointerAddressSpace());
+ }
+
+ Align MemAlign = DL.getABITypeAlign(Arg.Ty);
+ if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
+ assert(OpIdx >= AttributeList::FirstArgIndex);
+ unsigned ParamIdx = OpIdx - AttributeList::FirstArgIndex;
+
+ Type *ElementTy = FuncInfo.getParamByValType(ParamIdx);
+ if (!ElementTy)
+ ElementTy = FuncInfo.getParamInAllocaType(ParamIdx);
+ if (!ElementTy)
+ ElementTy = FuncInfo.getParamPreallocatedType(ParamIdx);
+ assert(ElementTy && "Must have byval, inalloca or preallocated type");
+ Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
+
+    // For ByVal, the alignment should be passed from the frontend. The backend
+    // will guess it if this info is not there, but there are cases it cannot
+    // get right.
+ if (auto ParamAlign = FuncInfo.getParamStackAlign(ParamIdx))
+ MemAlign = *ParamAlign;
+ else if ((ParamAlign = FuncInfo.getParamAlign(ParamIdx)))
+ MemAlign = *ParamAlign;
+ else
+ MemAlign = Align(getTLI()->getByValTypeAlignment(ElementTy, DL));
+ } else if (OpIdx >= AttributeList::FirstArgIndex) {
+ if (auto ParamAlign =
+ FuncInfo.getParamStackAlign(OpIdx - AttributeList::FirstArgIndex))
+ MemAlign = *ParamAlign;
+ }
+ Flags.setMemAlign(MemAlign);
+ Flags.setOrigAlign(DL.getABITypeAlign(Arg.Ty));
+
+ // Don't try to use the returned attribute if the argument is marked as
+ // swiftself, since it won't be passed in x0.
+ if (Flags.isSwiftSelf())
+ Flags.setReturned(false);
+}
+
+template void
+CallLowering::setArgFlags<Function>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
+ const DataLayout &DL,
+ const Function &FuncInfo) const;
+
+template void
+CallLowering::setArgFlags<CallBase>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
+ const DataLayout &DL,
+ const CallBase &FuncInfo) const;
+
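+// Split OrigArg into one ArgInfo per legal value type computed by
+// ComputeValueVTs, copying the original flags onto every piece and marking
+// pieces that must be placed in consecutive registers.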
+void CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL,
+ CallingConv::ID CallConv,
+ SmallVectorImpl<uint64_t> *Offsets) const {
+ LLVMContext &Ctx = OrigArg.Ty->getContext();
+
+ SmallVector<EVT, 4> SplitVTs;
+ ComputeValueVTs(*TLI, DL, OrigArg.Ty, SplitVTs, Offsets, 0);
+
+ if (SplitVTs.size() == 0)
+ return;
+
+ if (SplitVTs.size() == 1) {
+ // No splitting to do, but we want to replace the original type (e.g. [1 x
+ // double] -> double).
+ SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
+ OrigArg.OrigArgIndex, OrigArg.Flags[0],
+ OrigArg.IsFixed, OrigArg.OrigValue);
+ return;
+ }
+
+ // Create one ArgInfo for each virtual register in the original ArgInfo.
+ assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
+
+ bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
+ OrigArg.Ty, CallConv, false, DL);
+ for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
+ Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
+ SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.OrigArgIndex,
+ OrigArg.Flags[0], OrigArg.IsFixed);
+ if (NeedsRegBlock)
+ SplitArgs.back().Flags[0].setInConsecutiveRegs();
+ }
+
+ SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
+}
+
+/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
+static MachineInstrBuilder
+mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
+ ArrayRef<Register> SrcRegs) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT LLTy = MRI.getType(DstRegs[0]);
+ LLT PartLLT = MRI.getType(SrcRegs[0]);
+
+ // Deal with v3s16 split into v2s16
+ LLT LCMTy = getCoverTy(LLTy, PartLLT);
+ if (LCMTy == LLTy) {
+ // Common case where no padding is needed.
+ assert(DstRegs.size() == 1);
+ return B.buildConcatVectors(DstRegs[0], SrcRegs);
+ }
+
+ // We need to create an unmerge to the result registers, which may require
+ // widening the original value.
+ Register UnmergeSrcReg;
+ if (LCMTy != PartLLT) {
+ assert(DstRegs.size() == 1);
+ return B.buildDeleteTrailingVectorElements(
+ DstRegs[0], B.buildMergeLikeInstr(LCMTy, SrcRegs));
+ } else {
+ // We don't need to widen anything if we're extracting a scalar which was
+ // promoted to a vector e.g. s8 -> v4s8 -> s8
+ assert(SrcRegs.size() == 1);
+ UnmergeSrcReg = SrcRegs[0];
+ }
+
+ int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
+
+ SmallVector<Register, 8> PadDstRegs(NumDst);
+ std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
+
+ // Create the excess dead defs for the unmerge.
+ for (int I = DstRegs.size(); I != NumDst; ++I)
+ PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
+
+ if (PadDstRegs.size() == 1)
+ return B.buildDeleteTrailingVectorElements(DstRegs[0], UnmergeSrcReg);
+ return B.buildUnmerge(PadDstRegs, UnmergeSrcReg);
+}
+
+/// Create a sequence of instructions to combine pieces split into register
+/// typed values to the original IR value. \p OrigRegs contains the destination
+/// value registers of type \p LLTy, and \p Regs contains the legalized pieces
+/// with type \p PartLLT. This is used for incoming values (physregs to vregs).
+static void buildCopyFromRegs(MachineIRBuilder &B, ArrayRef<Register> OrigRegs,
+ ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT,
+ const ISD::ArgFlagsTy Flags) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+
+ if (PartLLT == LLTy) {
+ // We should have avoided introducing a new virtual register, and just
+ // directly assigned here.
+ assert(OrigRegs[0] == Regs[0]);
+ return;
+ }
+
+ if (PartLLT.getSizeInBits() == LLTy.getSizeInBits() && OrigRegs.size() == 1 &&
+ Regs.size() == 1) {
+ B.buildBitcast(OrigRegs[0], Regs[0]);
+ return;
+ }
+
+ // A vector PartLLT needs extending to LLTy's element size.
+ // E.g. <2 x s64> = G_SEXT <2 x s32>.
+ if (PartLLT.isVector() == LLTy.isVector() &&
+ PartLLT.getScalarSizeInBits() > LLTy.getScalarSizeInBits() &&
+ (!PartLLT.isVector() ||
+ PartLLT.getNumElements() == LLTy.getNumElements()) &&
+ OrigRegs.size() == 1 && Regs.size() == 1) {
+ Register SrcReg = Regs[0];
+
+ LLT LocTy = MRI.getType(SrcReg);
+
+ if (Flags.isSExt()) {
+ SrcReg = B.buildAssertSExt(LocTy, SrcReg, LLTy.getScalarSizeInBits())
+ .getReg(0);
+ } else if (Flags.isZExt()) {
+ SrcReg = B.buildAssertZExt(LocTy, SrcReg, LLTy.getScalarSizeInBits())
+ .getReg(0);
+ }
+
+ // Sometimes pointers are passed zero extended.
+ LLT OrigTy = MRI.getType(OrigRegs[0]);
+ if (OrigTy.isPointer()) {
+ LLT IntPtrTy = LLT::scalar(OrigTy.getSizeInBits());
+ B.buildIntToPtr(OrigRegs[0], B.buildTrunc(IntPtrTy, SrcReg));
+ return;
+ }
+
+ B.buildTrunc(OrigRegs[0], SrcReg);
+ return;
+ }
+
+ if (!LLTy.isVector() && !PartLLT.isVector()) {
+ assert(OrigRegs.size() == 1);
+ LLT OrigTy = MRI.getType(OrigRegs[0]);
+
+ unsigned SrcSize = PartLLT.getSizeInBits().getFixedValue() * Regs.size();
+ if (SrcSize == OrigTy.getSizeInBits())
+ B.buildMergeValues(OrigRegs[0], Regs);
+ else {
+ auto Widened = B.buildMergeLikeInstr(LLT::scalar(SrcSize), Regs);
+ B.buildTrunc(OrigRegs[0], Widened);
+ }
+
+ return;
+ }
+
+ if (PartLLT.isVector()) {
+ assert(OrigRegs.size() == 1);
+ SmallVector<Register> CastRegs(Regs.begin(), Regs.end());
+
+ // If PartLLT is a mismatched vector in both number of elements and element
+ // size, e.g. PartLLT == v2s64 and LLTy is v3s32, then first coerce it to
+ // have the same elt type, i.e. v4s32.
+ if (PartLLT.getSizeInBits() > LLTy.getSizeInBits() &&
+ PartLLT.getScalarSizeInBits() == LLTy.getScalarSizeInBits() * 2 &&
+ Regs.size() == 1) {
+ LLT NewTy = PartLLT.changeElementType(LLTy.getElementType())
+ .changeElementCount(PartLLT.getElementCount() * 2);
+ CastRegs[0] = B.buildBitcast(NewTy, Regs[0]).getReg(0);
+ PartLLT = NewTy;
+ }
+
+ if (LLTy.getScalarType() == PartLLT.getElementType()) {
+ mergeVectorRegsToResultRegs(B, OrigRegs, CastRegs);
+ } else {
+ unsigned I = 0;
+ LLT GCDTy = getGCDType(LLTy, PartLLT);
+
+ // We are both splitting a vector, and bitcasting its element types. Cast
+ // the source pieces into the appropriate number of pieces with the result
+ // element type.
+ for (Register SrcReg : CastRegs)
+ CastRegs[I++] = B.buildBitcast(GCDTy, SrcReg).getReg(0);
+ mergeVectorRegsToResultRegs(B, OrigRegs, CastRegs);
+ }
+
+ return;
+ }
+
+ assert(LLTy.isVector() && !PartLLT.isVector());
+
+ LLT DstEltTy = LLTy.getElementType();
+
+ // Pointer information was discarded. We'll need to coerce some register types
+ // to avoid violating type constraints.
+ LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();
+
+ assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());
+
+ if (DstEltTy == PartLLT) {
+ // Vector was trivially scalarized.
+
+ if (RealDstEltTy.isPointer()) {
+ for (Register Reg : Regs)
+ MRI.setType(Reg, RealDstEltTy);
+ }
+
+ B.buildBuildVector(OrigRegs[0], Regs);
+ } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
+ // Deal with vector with 64-bit elements decomposed to 32-bit
+ // registers. Need to create intermediate 64-bit elements.
+ SmallVector<Register, 8> EltMerges;
+ int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
+
+ assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
+
+ for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
+ auto Merge =
+ B.buildMergeLikeInstr(RealDstEltTy, Regs.take_front(PartsPerElt));
+ // Fix the type in case this is really a vector of pointers.
+ MRI.setType(Merge.getReg(0), RealDstEltTy);
+ EltMerges.push_back(Merge.getReg(0));
+ Regs = Regs.drop_front(PartsPerElt);
+ }
+
+ B.buildBuildVector(OrigRegs[0], EltMerges);
+ } else {
+ // Vector was split, and elements promoted to a wider type.
+ // FIXME: Should handle floating point promotions.
+ LLT BVType = LLT::fixed_vector(LLTy.getNumElements(), PartLLT);
+ auto BV = B.buildBuildVector(BVType, Regs);
+ B.buildTrunc(OrigRegs[0], BV);
+ }
+}
+
+/// Create a sequence of instructions to expand the value in \p SrcReg (of type
+/// \p SrcTy) to the types in \p DstRegs (of type \p PartTy). \p ExtendOp should
+/// contain the type of scalar value extension if necessary.
+///
+/// This is used for outgoing values (vregs to physregs)
+static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
+ Register SrcReg, LLT SrcTy, LLT PartTy,
+ unsigned ExtendOp = TargetOpcode::G_ANYEXT) {
+ // We could just insert a regular copy, but this is unreachable at the moment.
+ assert(SrcTy != PartTy && "identical part types shouldn't reach here");
+
+ const unsigned PartSize = PartTy.getSizeInBits();
+
+ if (PartTy.isVector() == SrcTy.isVector() &&
+ PartTy.getScalarSizeInBits() > SrcTy.getScalarSizeInBits()) {
+ assert(DstRegs.size() == 1);
+ B.buildInstr(ExtendOp, {DstRegs[0]}, {SrcReg});
+ return;
+ }
+
+ if (SrcTy.isVector() && !PartTy.isVector() &&
+ PartSize > SrcTy.getElementType().getSizeInBits()) {
+ // Vector was scalarized, and the elements extended.
+ auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
+ for (int i = 0, e = DstRegs.size(); i != e; ++i)
+ B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
+ return;
+ }
+
+ if (SrcTy.isVector() && PartTy.isVector() &&
+ PartTy.getScalarSizeInBits() == SrcTy.getScalarSizeInBits() &&
+ SrcTy.getNumElements() < PartTy.getNumElements()) {
+ // A coercion like: v2f32 -> v4f32.
+ Register DstReg = DstRegs.front();
+ B.buildPadVectorWithUndefElements(DstReg, SrcReg);
+ return;
+ }
+
+ LLT GCDTy = getGCDType(SrcTy, PartTy);
+ if (GCDTy == PartTy) {
+ // If this is already evenly divisible, we can create a simple unmerge.
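+ // E.g. (sketch) an s64 source split into two s32 parts:
+ //   %d0:_(s32), %d1:_(s32) = G_UNMERGE_VALUES %src:_(s64)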
+ B.buildUnmerge(DstRegs, SrcReg);
+ return;
+ }
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT DstTy = MRI.getType(DstRegs[0]);
+ LLT LCMTy = getCoverTy(SrcTy, PartTy);
+
+ if (PartTy.isVector() && LCMTy == PartTy) {
+ assert(DstRegs.size() == 1);
+ B.buildPadVectorWithUndefElements(DstRegs[0], SrcReg);
+ return;
+ }
+
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ unsigned CoveringSize = LCMTy.getSizeInBits();
+
+ Register UnmergeSrc = SrcReg;
+
+ if (!LCMTy.isVector() && CoveringSize != SrcSize) {
+ // For scalars, it's common to be able to use a simple extension.
+ if (SrcTy.isScalar() && DstTy.isScalar()) {
+ CoveringSize = alignTo(SrcSize, DstSize);
+ LLT CoverTy = LLT::scalar(CoveringSize);
+ UnmergeSrc = B.buildInstr(ExtendOp, {CoverTy}, {SrcReg}).getReg(0);
+ } else {
+ // Widen to the common type.
+ // FIXME: This should respect the extend type
+ Register Undef = B.buildUndef(SrcTy).getReg(0);
+ SmallVector<Register, 8> MergeParts(1, SrcReg);
+ for (unsigned Size = SrcSize; Size != CoveringSize; Size += SrcSize)
+ MergeParts.push_back(Undef);
+ UnmergeSrc = B.buildMergeLikeInstr(LCMTy, MergeParts).getReg(0);
+ }
+ }
+
+ if (LCMTy.isVector() && CoveringSize != SrcSize)
+ UnmergeSrc = B.buildPadVectorWithUndefElements(LCMTy, SrcReg).getReg(0);
+
+ B.buildUnmerge(DstRegs, UnmergeSrc);
+}
+
+bool CallLowering::determineAndHandleAssignments(
+ ValueHandler &Handler, ValueAssigner &Assigner,
+ SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv, bool IsVarArg,
+ ArrayRef<Register> ThisReturnRegs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ SmallVector<CCValAssign, 16> ArgLocs;
+
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, F.getContext());
+ if (!determineAssignments(Assigner, Args, CCInfo))
+ return false;
+
+ return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder,
+ ThisReturnRegs);
+}
+
+static unsigned extendOpFromFlags(llvm::ISD::ArgFlagsTy Flags) {
+ if (Flags.isSExt())
+ return TargetOpcode::G_SEXT;
+ if (Flags.isZExt())
+ return TargetOpcode::G_ZEXT;
+ return TargetOpcode::G_ANYEXT;
+}
+
+bool CallLowering::determineAssignments(ValueAssigner &Assigner,
+ SmallVectorImpl<ArgInfo> &Args,
+ CCState &CCInfo) const {
+ LLVMContext &Ctx = CCInfo.getContext();
+ const CallingConv::ID CallConv = CCInfo.getCallingConv();
+
+ unsigned NumArgs = Args.size();
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ EVT CurVT = EVT::getEVT(Args[i].Ty);
+
+ MVT NewVT = TLI->getRegisterTypeForCallingConv(Ctx, CallConv, CurVT);
+
+ // If we need to split the type over multiple regs, check it's a scenario
+ // we currently support.
+ unsigned NumParts =
+ TLI->getNumRegistersForCallingConv(Ctx, CallConv, CurVT);
+
+ if (NumParts == 1) {
+ // Try to use the register type if we couldn't assign the VT.
+ if (Assigner.assignArg(i, CurVT, NewVT, NewVT, CCValAssign::Full, Args[i],
+ Args[i].Flags[0], CCInfo))
+ return false;
+ continue;
+ }
+
+ // For incoming arguments (physregs to vregs), we could have values in
+ // physregs (or memlocs) which we want to extract and copy to vregs.
+ // During this, we might have to deal with the LLT being split across
+ // multiple regs, so we have to record this information for later.
+ //
+ // If we have outgoing args, then we have the opposite case. We have a
+ // vreg with an LLT which we want to assign to a physical location, and
+ // we might have to record that the value has to be split later.
+
+ // We're handling an incoming arg which is split over multiple regs.
+ // E.g. passing an s128 on AArch64.
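+ // In that case the value is split into two s64 parts. Part 0 is tagged with
+ // setSplit(), every later part has its original alignment reset to 1, and
+ // the final part is additionally tagged with setSplitEnd().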
+ ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0];
+ Args[i].Flags.clear();
+
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ ISD::ArgFlagsTy Flags = OrigFlags;
+ if (Part == 0) {
+ Flags.setSplit();
+ } else {
+ Flags.setOrigAlign(Align(1));
+ if (Part == NumParts - 1)
+ Flags.setSplitEnd();
+ }
+
+ Args[i].Flags.push_back(Flags);
+ if (Assigner.assignArg(i, CurVT, NewVT, NewVT, CCValAssign::Full, Args[i],
+ Args[i].Flags[Part], CCInfo)) {
+ // Still couldn't assign this smaller part type for some reason.
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+bool CallLowering::handleAssignments(ValueHandler &Handler,
+ SmallVectorImpl<ArgInfo> &Args,
+ CCState &CCInfo,
+ SmallVectorImpl<CCValAssign> &ArgLocs,
+ MachineIRBuilder &MIRBuilder,
+ ArrayRef<Register> ThisReturnRegs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ const unsigned NumArgs = Args.size();
+
+ // Stores thunks for outgoing register assignments. This is used so we delay
+ // generating register copies until mem loc assignments are done. We do this
+ // so that if the target is using the delayed stack protector feature, we can
+ // find the split point of the block accurately. E.g. if we have:
+ // G_STORE %val, %memloc
+ // $x0 = COPY %foo
+ // $x1 = COPY %bar
+ // CALL func
+ // ... then the split point for the block will correctly be at, and including,
+ // the copy to $x0. If instead the G_STORE instruction immediately precedes
+ // the CALL, then we'd prematurely choose the CALL as the split point, thus
+ // generating a split block with a CALL that uses undefined physregs.
+ SmallVector<std::function<void()>> DelayedOutgoingRegAssignments;
+
+ for (unsigned i = 0, j = 0; i != NumArgs; ++i, ++j) {
+ assert(j < ArgLocs.size() && "Skipped too many arg locs");
+ CCValAssign &VA = ArgLocs[j];
+ assert(VA.getValNo() == i && "Location doesn't correspond to current arg");
+
+ if (VA.needsCustom()) {
+ std::function<void()> Thunk;
+ unsigned NumArgRegs = Handler.assignCustomValue(
+ Args[i], ArrayRef(ArgLocs).slice(j), &Thunk);
+ if (Thunk)
+ DelayedOutgoingRegAssignments.emplace_back(Thunk);
+ if (!NumArgRegs)
+ return false;
+ j += NumArgRegs;
+ continue;
+ }
+
+ const MVT ValVT = VA.getValVT();
+ const MVT LocVT = VA.getLocVT();
+
+ const LLT LocTy(LocVT);
+ const LLT ValTy(ValVT);
+ const LLT NewLLT = Handler.isIncomingArgumentHandler() ? LocTy : ValTy;
+ const EVT OrigVT = EVT::getEVT(Args[i].Ty);
+ const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
+
+ // Expected to be multiple regs for a single incoming arg.
+ // There should be Regs.size() ArgLocs per argument.
+ // This should be the same as getNumRegistersForCallingConv
+ const unsigned NumParts = Args[i].Flags.size();
+
+ // Now split the registers into the assigned types.
+ Args[i].OrigRegs.assign(Args[i].Regs.begin(), Args[i].Regs.end());
+
+ if (NumParts != 1 || NewLLT != OrigTy) {
+ // If we can't directly assign the register, we need one or more
+ // intermediate values.
+ Args[i].Regs.resize(NumParts);
+
+ // For each split register, create and assign a vreg that will store
+ // the incoming component of the larger value. These will later be
+ // merged to form the final vreg.
+ for (unsigned Part = 0; Part < NumParts; ++Part)
+ Args[i].Regs[Part] = MRI.createGenericVirtualRegister(NewLLT);
+ }
+
+ assert((j + (NumParts - 1)) < ArgLocs.size() &&
+ "Too many regs for number of args");
+
+ // Coerce into outgoing value types before register assignment.
+ if (!Handler.isIncomingArgumentHandler() && OrigTy != ValTy) {
+ assert(Args[i].OrigRegs.size() == 1);
+ buildCopyToRegs(MIRBuilder, Args[i].Regs, Args[i].OrigRegs[0], OrigTy,
+ ValTy, extendOpFromFlags(Args[i].Flags[0]));
+ }
+
+ bool BigEndianPartOrdering = TLI->hasBigEndianPartOrdering(OrigVT, DL);
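+ // On big-endian targets the part order is reversed relative to the ArgLoc
+ // order, so part 0 of the value is paired with the last of its ArgLocs.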
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ Register ArgReg = Args[i].Regs[Part];
+ // There should be Regs.size() ArgLocs per argument.
+ unsigned Idx = BigEndianPartOrdering ? NumParts - 1 - Part : Part;
+ CCValAssign &VA = ArgLocs[j + Idx];
+ const ISD::ArgFlagsTy Flags = Args[i].Flags[Part];
+
+ if (VA.isMemLoc() && !Flags.isByVal()) {
+ // Individual pieces may have been spilled to the stack and others
+ // passed in registers.
+
+ // TODO: The memory size may be larger than the value we need to
+ // store. We may need to adjust the offset for big endian targets.
+ LLT MemTy = Handler.getStackValueStoreType(DL, VA, Flags);
+
+ MachinePointerInfo MPO;
+ Register StackAddr = Handler.getStackAddress(
+ MemTy.getSizeInBytes(), VA.getLocMemOffset(), MPO, Flags);
+
+ Handler.assignValueToAddress(Args[i], Part, StackAddr, MemTy, MPO, VA);
+ continue;
+ }
+
+ if (VA.isMemLoc() && Flags.isByVal()) {
+ assert(Args[i].Regs.size() == 1 &&
+ "didn't expect split byval pointer");
+
+ if (Handler.isIncomingArgumentHandler()) {
+ // We just need to copy the frame index value to the pointer.
+ MachinePointerInfo MPO;
+ Register StackAddr = Handler.getStackAddress(
+ Flags.getByValSize(), VA.getLocMemOffset(), MPO, Flags);
+ MIRBuilder.buildCopy(Args[i].Regs[0], StackAddr);
+ } else {
+ // For outgoing byval arguments, insert the implicit copy byval
+ // implies, such that writes in the callee do not modify the caller's
+ // value.
+ uint64_t MemSize = Flags.getByValSize();
+ int64_t Offset = VA.getLocMemOffset();
+
+ MachinePointerInfo DstMPO;
+ Register StackAddr =
+ Handler.getStackAddress(MemSize, Offset, DstMPO, Flags);
+
+ MachinePointerInfo SrcMPO(Args[i].OrigValue);
+ if (!Args[i].OrigValue) {
+ // We still need to accurately track the stack address space if we
+ // don't know the underlying value.
+ const LLT PtrTy = MRI.getType(StackAddr);
+ SrcMPO = MachinePointerInfo(PtrTy.getAddressSpace());
+ }
+
+ Align DstAlign = std::max(Flags.getNonZeroByValAlign(),
+ inferAlignFromPtrInfo(MF, DstMPO));
+
+ Align SrcAlign = std::max(Flags.getNonZeroByValAlign(),
+ inferAlignFromPtrInfo(MF, SrcMPO));
+
+ Handler.copyArgumentMemory(Args[i], StackAddr, Args[i].Regs[0],
+ DstMPO, DstAlign, SrcMPO, SrcAlign,
+ MemSize, VA);
+ }
+ continue;
+ }
+
+ assert(!VA.needsCustom() && "custom loc should have been handled already");
+
+ if (i == 0 && !ThisReturnRegs.empty() &&
+ Handler.isIncomingArgumentHandler() &&
+ isTypeIsValidForThisReturn(ValVT)) {
+ Handler.assignValueToReg(ArgReg, ThisReturnRegs[Part], VA);
+ continue;
+ }
+
+ if (Handler.isIncomingArgumentHandler())
+ Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
+ else {
+ DelayedOutgoingRegAssignments.emplace_back([=, &Handler]() {
+ Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
+ });
+ }
+ }
+
+ // Now that all pieces have been assigned, re-pack the register typed values
+ // into the original value typed registers.
+ if (Handler.isIncomingArgumentHandler() && OrigVT != LocVT) {
+ // Merge the split registers into the expected larger result vregs of
+ // the original call.
+ buildCopyFromRegs(MIRBuilder, Args[i].OrigRegs, Args[i].Regs, OrigTy,
+ LocTy, Args[i].Flags[0]);
+ }
+
+ j += NumParts - 1;
+ }
+ for (auto &Fn : DelayedOutgoingRegAssignments)
+ Fn();
+
+ return true;
+}
+
+void CallLowering::insertSRetLoads(MachineIRBuilder &MIRBuilder, Type *RetTy,
+ ArrayRef<Register> VRegs, Register DemoteReg,
+ int FI) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const DataLayout &DL = MF.getDataLayout();
+
+ SmallVector<EVT, 4> SplitVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0);
+
+ assert(VRegs.size() == SplitVTs.size());
+
+ unsigned NumValues = SplitVTs.size();
+ Align BaseAlign = DL.getPrefTypeAlign(RetTy);
+ Type *RetPtrTy = RetTy->getPointerTo(DL.getAllocaAddrSpace());
+ LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetPtrTy), DL);
+
+ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+
+ for (unsigned I = 0; I < NumValues; ++I) {
+ Register Addr;
+ MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
+ auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ MRI.getType(VRegs[I]),
+ commonAlignment(BaseAlign, Offsets[I]));
+ MIRBuilder.buildLoad(VRegs[I], Addr, *MMO);
+ }
+}
+
+void CallLowering::insertSRetStores(MachineIRBuilder &MIRBuilder, Type *RetTy,
+ ArrayRef<Register> VRegs,
+ Register DemoteReg) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const DataLayout &DL = MF.getDataLayout();
+
+ SmallVector<EVT, 4> SplitVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(*TLI, DL, RetTy, SplitVTs, &Offsets, 0);
+
+ assert(VRegs.size() == SplitVTs.size());
+
+ unsigned NumValues = SplitVTs.size();
+ Align BaseAlign = DL.getPrefTypeAlign(RetTy);
+ unsigned AS = DL.getAllocaAddrSpace();
+ LLT OffsetLLTy = getLLTForType(*DL.getIndexType(RetTy->getPointerTo(AS)), DL);
+
+ MachinePointerInfo PtrInfo(AS);
+
+ for (unsigned I = 0; I < NumValues; ++I) {
+ Register Addr;
+ MIRBuilder.materializePtrAdd(Addr, DemoteReg, OffsetLLTy, Offsets[I]);
+ auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ MRI.getType(VRegs[I]),
+ commonAlignment(BaseAlign, Offsets[I]));
+ MIRBuilder.buildStore(VRegs[I], Addr, *MMO);
+ }
+}
+
+void CallLowering::insertSRetIncomingArgument(
+ const Function &F, SmallVectorImpl<ArgInfo> &SplitArgs, Register &DemoteReg,
+ MachineRegisterInfo &MRI, const DataLayout &DL) const {
+ unsigned AS = DL.getAllocaAddrSpace();
+ DemoteReg = MRI.createGenericVirtualRegister(
+ LLT::pointer(AS, DL.getPointerSizeInBits(AS)));
+
+ Type *PtrTy = PointerType::get(F.getReturnType(), AS);
+
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(*TLI, DL, PtrTy, ValueVTs);
+
+ // NOTE: Assume that a pointer won't get split into more than one VT.
+ assert(ValueVTs.size() == 1);
+
+ ArgInfo DemoteArg(DemoteReg, ValueVTs[0].getTypeForEVT(PtrTy->getContext()),
+ ArgInfo::NoArgIndex);
+ setArgFlags(DemoteArg, AttributeList::ReturnIndex, DL, F);
+ DemoteArg.Flags[0].setSRet();
+ SplitArgs.insert(SplitArgs.begin(), DemoteArg);
+}
+
+void CallLowering::insertSRetOutgoingArgument(MachineIRBuilder &MIRBuilder,
+ const CallBase &CB,
+ CallLoweringInfo &Info) const {
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ Type *RetTy = CB.getType();
+ unsigned AS = DL.getAllocaAddrSpace();
+ LLT FramePtrTy = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
+
+ int FI = MIRBuilder.getMF().getFrameInfo().CreateStackObject(
+ DL.getTypeAllocSize(RetTy), DL.getPrefTypeAlign(RetTy), false);
+
+ Register DemoteReg = MIRBuilder.buildFrameIndex(FramePtrTy, FI).getReg(0);
+ ArgInfo DemoteArg(DemoteReg, PointerType::get(RetTy, AS),
+ ArgInfo::NoArgIndex);
+ setArgFlags(DemoteArg, AttributeList::ReturnIndex, DL, CB);
+ DemoteArg.Flags[0].setSRet();
+
+ Info.OrigArgs.insert(Info.OrigArgs.begin(), DemoteArg);
+ Info.DemoteStackIndex = FI;
+ Info.DemoteRegister = DemoteReg;
+}
+
+bool CallLowering::checkReturn(CCState &CCInfo,
+ SmallVectorImpl<BaseArgInfo> &Outs,
+ CCAssignFn *Fn) const {
+ for (unsigned I = 0, E = Outs.size(); I < E; ++I) {
+ MVT VT = MVT::getVT(Outs[I].Ty);
+ if (Fn(I, VT, VT, CCValAssign::Full, Outs[I].Flags[0], CCInfo))
+ return false;
+ }
+ return true;
+}
+
+void CallLowering::getReturnInfo(CallingConv::ID CallConv, Type *RetTy,
+ AttributeList Attrs,
+ SmallVectorImpl<BaseArgInfo> &Outs,
+ const DataLayout &DL) const {
+ LLVMContext &Context = RetTy->getContext();
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+
+ SmallVector<EVT, 4> SplitVTs;
+ ComputeValueVTs(*TLI, DL, RetTy, SplitVTs);
+ addArgFlagsFromAttributes(Flags, Attrs, AttributeList::ReturnIndex);
+
+ for (EVT VT : SplitVTs) {
+ unsigned NumParts =
+ TLI->getNumRegistersForCallingConv(Context, CallConv, VT);
+ MVT RegVT = TLI->getRegisterTypeForCallingConv(Context, CallConv, VT);
+ Type *PartTy = EVT(RegVT).getTypeForEVT(Context);
+
+ for (unsigned I = 0; I < NumParts; ++I) {
+ Outs.emplace_back(PartTy, Flags);
+ }
+ }
+}
+
+bool CallLowering::checkReturnTypeForCallConv(MachineFunction &MF) const {
+ const auto &F = MF.getFunction();
+ Type *ReturnType = F.getReturnType();
+ CallingConv::ID CallConv = F.getCallingConv();
+
+ SmallVector<BaseArgInfo, 4> SplitArgs;
+ getReturnInfo(CallConv, ReturnType, F.getAttributes(), SplitArgs,
+ MF.getDataLayout());
+ return canLowerReturn(MF, CallConv, SplitArgs, F.isVarArg());
+}
+
+bool CallLowering::parametersInCSRMatch(
+ const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask,
+ const SmallVectorImpl<CCValAssign> &OutLocs,
+ const SmallVectorImpl<ArgInfo> &OutArgs) const {
+ for (unsigned i = 0; i < OutLocs.size(); ++i) {
+ const auto &ArgLoc = OutLocs[i];
+ // If it's not a register, it's fine.
+ if (!ArgLoc.isRegLoc())
+ continue;
+
+ MCRegister PhysReg = ArgLoc.getLocReg();
+
+ // Only look at callee-saved registers.
+ if (MachineOperand::clobbersPhysReg(CallerPreservedMask, PhysReg))
+ continue;
+
+ LLVM_DEBUG(
+ dbgs()
+ << "... Call has an argument passed in a callee-saved register.\n");
+
+ // Check if it was copied from.
+ const ArgInfo &OutInfo = OutArgs[i];
+
+ if (OutInfo.Regs.size() > 1) {
+ LLVM_DEBUG(
+ dbgs() << "... Cannot handle arguments in multiple registers.\n");
+ return false;
+ }
+
+ // Check if we copy the register, walking through copies from virtual
+ // registers. Note that getDefIgnoringCopies does not ignore copies from
+ // physical registers.
+ MachineInstr *RegDef = getDefIgnoringCopies(OutInfo.Regs[0], MRI);
+ if (!RegDef || RegDef->getOpcode() != TargetOpcode::COPY) {
+ LLVM_DEBUG(
+ dbgs()
+ << "... Parameter was not copied into a VReg, cannot tail call.\n");
+ return false;
+ }
+
+ // Got a copy. Verify that it's the same as the register we want.
+ Register CopyRHS = RegDef->getOperand(1).getReg();
+ if (CopyRHS != PhysReg) {
+ LLVM_DEBUG(dbgs() << "... Callee-saved register was not copied into "
+ "VReg, cannot tail call.\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool CallLowering::resultsCompatible(CallLoweringInfo &Info,
+ MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs,
+ ValueAssigner &CalleeAssigner,
+ ValueAssigner &CallerAssigner) const {
+ const Function &F = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = F.getCallingConv();
+
+ if (CallerCC == CalleeCC)
+ return true;
+
+ SmallVector<CCValAssign, 16> ArgLocs1;
+ CCState CCInfo1(CalleeCC, Info.IsVarArg, MF, ArgLocs1, F.getContext());
+ if (!determineAssignments(CalleeAssigner, InArgs, CCInfo1))
+ return false;
+
+ SmallVector<CCValAssign, 16> ArgLocs2;
+ CCState CCInfo2(CallerCC, F.isVarArg(), MF, ArgLocs2, F.getContext());
+ if (!determineAssignments(CallerAssigner, InArgs, CCInfo2))
+ return false;
+
+ // We need the argument locations to match up exactly. If there's more in
+ // one than the other, then we are done.
+ if (ArgLocs1.size() != ArgLocs2.size())
+ return false;
+
+ // Make sure that each location is passed in exactly the same way.
+ for (unsigned i = 0, e = ArgLocs1.size(); i < e; ++i) {
+ const CCValAssign &Loc1 = ArgLocs1[i];
+ const CCValAssign &Loc2 = ArgLocs2[i];
+
+ // We need both of them to be the same. So if one is a register and one
+ // isn't, we're done.
+ if (Loc1.isRegLoc() != Loc2.isRegLoc())
+ return false;
+
+ if (Loc1.isRegLoc()) {
+ // If they don't have the same register location, we're done.
+ if (Loc1.getLocReg() != Loc2.getLocReg())
+ return false;
+
+ // They matched, so we can move to the next ArgLoc.
+ continue;
+ }
+
+ // Loc1 wasn't a RegLoc, so they both must be MemLocs. Check if they match.
+ if (Loc1.getLocMemOffset() != Loc2.getLocMemOffset())
+ return false;
+ }
+
+ return true;
+}
+
+LLT CallLowering::ValueHandler::getStackValueStoreType(
+ const DataLayout &DL, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const {
+ const MVT ValVT = VA.getValVT();
+ if (ValVT != MVT::iPTR) {
+ LLT ValTy(ValVT);
+
+ // We lost the pointeriness going through CCValAssign, so try to restore it
+ // based on the flags.
+ if (Flags.isPointer()) {
+ LLT PtrTy = LLT::pointer(Flags.getPointerAddrSpace(),
+ ValTy.getScalarSizeInBits());
+ if (ValVT.isVector())
+ return LLT::vector(ValTy.getElementCount(), PtrTy);
+ return PtrTy;
+ }
+
+ return ValTy;
+ }
+
+ unsigned AddrSpace = Flags.getPointerAddrSpace();
+ return LLT::pointer(AddrSpace, DL.getPointerSize(AddrSpace));
+}
+
+void CallLowering::ValueHandler::copyArgumentMemory(
+ const ArgInfo &Arg, Register DstPtr, Register SrcPtr,
+ const MachinePointerInfo &DstPtrInfo, Align DstAlign,
+ const MachinePointerInfo &SrcPtrInfo, Align SrcAlign, uint64_t MemSize,
+ CCValAssign &VA) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineMemOperand *SrcMMO = MF.getMachineMemOperand(
+ SrcPtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable, MemSize,
+ SrcAlign);
+
+ MachineMemOperand *DstMMO = MF.getMachineMemOperand(
+ DstPtrInfo,
+ MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable,
+ MemSize, DstAlign);
+
+ const LLT PtrTy = MRI.getType(DstPtr);
+ const LLT SizeTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ auto SizeConst = MIRBuilder.buildConstant(SizeTy, MemSize);
+ MIRBuilder.buildMemCpy(DstPtr, SrcPtr, SizeConst, *DstMMO, *SrcMMO);
+}
+
+Register CallLowering::ValueHandler::extendRegister(Register ValReg,
+ CCValAssign &VA,
+ unsigned MaxSizeBits) {
+ LLT LocTy{VA.getLocVT()};
+ LLT ValTy{VA.getValVT()};
+
+ if (LocTy.getSizeInBits() == ValTy.getSizeInBits())
+ return ValReg;
+
+ if (LocTy.isScalar() && MaxSizeBits && MaxSizeBits < LocTy.getSizeInBits()) {
+ if (MaxSizeBits <= ValTy.getSizeInBits())
+ return ValReg;
+ LocTy = LLT::scalar(MaxSizeBits);
+ }
+
+ const LLT ValRegTy = MRI.getType(ValReg);
+ if (ValRegTy.isPointer()) {
+ // The x32 ABI wants to zero extend 32-bit pointers to 64-bit registers, so
+ // we have to cast to do the extension.
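+ // E.g. (sketch) for a 32-bit p0 value:
+ //   %int:_(s32) = G_PTRTOINT %ptr:_(p0)
+ // and %int is then extended to the wider location type by the switch below.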
+ LLT IntPtrTy = LLT::scalar(ValRegTy.getSizeInBits());
+ ValReg = MIRBuilder.buildPtrToInt(IntPtrTy, ValReg).getReg(0);
+ }
+
+ switch (VA.getLocInfo()) {
+ default: break;
+ case CCValAssign::Full:
+ case CCValAssign::BCvt:
+ // FIXME: bitconverting between vector types may or may not be a
+ // nop in big-endian situations.
+ return ValReg;
+ case CCValAssign::AExt: {
+ auto MIB = MIRBuilder.buildAnyExt(LocTy, ValReg);
+ return MIB.getReg(0);
+ }
+ case CCValAssign::SExt: {
+ Register NewReg = MRI.createGenericVirtualRegister(LocTy);
+ MIRBuilder.buildSExt(NewReg, ValReg);
+ return NewReg;
+ }
+ case CCValAssign::ZExt: {
+ Register NewReg = MRI.createGenericVirtualRegister(LocTy);
+ MIRBuilder.buildZExt(NewReg, ValReg);
+ return NewReg;
+ }
+ }
+ llvm_unreachable("unable to extend register");
+}
+
+void CallLowering::ValueAssigner::anchor() {}
+
+Register CallLowering::IncomingValueHandler::buildExtensionHint(CCValAssign &VA,
+ Register SrcReg,
+ LLT NarrowTy) {
+ switch (VA.getLocInfo()) {
+ case CCValAssign::LocInfo::ZExt: {
+ return MIRBuilder
+ .buildAssertZExt(MRI.cloneVirtualRegister(SrcReg), SrcReg,
+ NarrowTy.getScalarSizeInBits())
+ .getReg(0);
+ }
+ case CCValAssign::LocInfo::SExt: {
+ return MIRBuilder
+ .buildAssertSExt(MRI.cloneVirtualRegister(SrcReg), SrcReg,
+ NarrowTy.getScalarSizeInBits())
+ .getReg(0);
+ break;
+ }
+ default:
+ return SrcReg;
+ }
+}
+
+/// Check if we can use a basic COPY instruction between the two types.
+///
+/// We're currently building on top of the infrastructure using MVT, which loses
+/// pointer information in the CCValAssign. We accept copies from physical
+/// registers that have been reported as integers if the destination is an
+/// equivalently sized pointer LLT.
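+///
+/// E.g. an incoming physical register reported as s64 can be copied directly
+/// into a 64-bit p0 virtual register, and vice versa.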
+static bool isCopyCompatibleType(LLT SrcTy, LLT DstTy) {
+ if (SrcTy == DstTy)
+ return true;
+
+ if (SrcTy.getSizeInBits() != DstTy.getSizeInBits())
+ return false;
+
+ SrcTy = SrcTy.getScalarType();
+ DstTy = DstTy.getScalarType();
+
+ return (SrcTy.isPointer() && DstTy.isScalar()) ||
+ (DstTy.isPointer() && SrcTy.isScalar());
+}
+
+void CallLowering::IncomingValueHandler::assignValueToReg(Register ValVReg,
+ Register PhysReg,
+ CCValAssign VA) {
+ const MVT LocVT = VA.getLocVT();
+ const LLT LocTy(LocVT);
+ const LLT RegTy = MRI.getType(ValVReg);
+
+ if (isCopyCompatibleType(RegTy, LocTy)) {
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ return;
+ }
+
+ auto Copy = MIRBuilder.buildCopy(LocTy, PhysReg);
+ auto Hint = buildExtensionHint(VA, Copy.getReg(0), RegTy);
+ MIRBuilder.buildTrunc(ValVReg, Hint);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
new file mode 100644
index 000000000000..748fa273d499
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -0,0 +1,166 @@
+//===-- lib/CodeGen/GlobalISel/Combiner.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains common code to combine machine functions at the generic
+// level.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/GISelWorkList.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+
+namespace llvm {
+cl::OptionCategory GICombinerOptionCategory(
+ "GlobalISel Combiner",
+ "Control the rules which are enabled. These options all take a comma "
+ "separated list of rules to disable and may be specified by number "
+ "or number range (e.g. 1-10)."
+#ifndef NDEBUG
+ " They may also be specified by name."
+#endif
+);
+} // end namespace llvm
+
+namespace {
+/// This class acts as the glue that joins the CombinerHelper to the overall
+/// Combine algorithm. The CombinerHelper is intended to report the
+/// modifications it makes to the MIR to the GISelChangeObserver and the
+/// observer subclass will act on these events. In this case, instruction
+/// erasure will cancel any future visits to the erased instruction and
+/// instruction creation will schedule that instruction for a future visit.
+/// Other Combiner implementations may require more complex behaviour from
+/// their GISelChangeObserver subclass.
+class WorkListMaintainer : public GISelChangeObserver {
+ using WorkListTy = GISelWorkList<512>;
+ WorkListTy &WorkList;
+ /// Instructions that have been created and that we want to report once they
+ /// have their operands. This is only maintained if debug output is requested.
+#ifndef NDEBUG
+ SetVector<const MachineInstr *> CreatedInstrs;
+#endif
+
+public:
+ WorkListMaintainer(WorkListTy &WorkList) : WorkList(WorkList) {}
+ virtual ~WorkListMaintainer() = default;
+
+ void erasingInstr(MachineInstr &MI) override {
+ LLVM_DEBUG(dbgs() << "Erasing: " << MI << "\n");
+ WorkList.remove(&MI);
+ }
+ void createdInstr(MachineInstr &MI) override {
+ LLVM_DEBUG(dbgs() << "Creating: " << MI << "\n");
+ WorkList.insert(&MI);
+ LLVM_DEBUG(CreatedInstrs.insert(&MI));
+ }
+ void changingInstr(MachineInstr &MI) override {
+ LLVM_DEBUG(dbgs() << "Changing: " << MI << "\n");
+ WorkList.insert(&MI);
+ }
+ void changedInstr(MachineInstr &MI) override {
+ LLVM_DEBUG(dbgs() << "Changed: " << MI << "\n");
+ WorkList.insert(&MI);
+ }
+
+ void reportFullyCreatedInstrs() {
+ LLVM_DEBUG(for (const auto *MI
+ : CreatedInstrs) {
+ dbgs() << "Created: ";
+ MI->print(dbgs());
+ });
+ LLVM_DEBUG(CreatedInstrs.clear());
+ }
+};
+}
+
+Combiner::Combiner(CombinerInfo &Info, const TargetPassConfig *TPC)
+ : CInfo(Info), TPC(TPC) {
+ (void)this->TPC; // FIXME: Remove when used.
+}
+
+bool Combiner::combineMachineInstrs(MachineFunction &MF,
+ GISelCSEInfo *CSEInfo) {
+ // If the ISel pipeline failed, do not bother running this pass.
+ // FIXME: Should this be here or in individual combiner passes?
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ Builder = CSEInfo ? std::make_unique<CSEMIRBuilder>()
+ : std::make_unique<MachineIRBuilder>();
+ MRI = &MF.getRegInfo();
+ Builder->setMF(MF);
+ if (CSEInfo)
+ Builder->setCSEInfo(CSEInfo);
+
+ LLVM_DEBUG(dbgs() << "Generic MI Combiner for: " << MF.getName() << '\n');
+
+ MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
+
+ bool MFChanged = false;
+ bool Changed;
+ MachineIRBuilder &B = *Builder;
+
+ do {
+ // Collect all instructions. Visit the basic blocks in post order and insert
+ // instructions into the list bottom up, so that popping from the back of the
+ // work list walks them in reverse post order (top down).
+ Changed = false;
+ GISelWorkList<512> WorkList;
+ WorkListMaintainer Observer(WorkList);
+ GISelObserverWrapper WrapperObserver(&Observer);
+ if (CSEInfo)
+ WrapperObserver.addObserver(CSEInfo);
+ RAIIDelegateInstaller DelInstall(MF, &WrapperObserver);
+ for (MachineBasicBlock *MBB : post_order(&MF)) {
+ for (MachineInstr &CurMI :
+ llvm::make_early_inc_range(llvm::reverse(*MBB))) {
+ // Erase dead insts before even adding to the list.
+ if (isTriviallyDead(CurMI, *MRI)) {
+ LLVM_DEBUG(dbgs() << CurMI << "Is dead; erasing.\n");
+ llvm::salvageDebugInfo(*MRI, CurMI);
+ CurMI.eraseFromParent();
+ continue;
+ }
+ WorkList.deferred_insert(&CurMI);
+ }
+ }
+ WorkList.finalize();
+ // Main Loop. Process the instructions here.
+ while (!WorkList.empty()) {
+ MachineInstr *CurrInst = WorkList.pop_back_val();
+ LLVM_DEBUG(dbgs() << "\nTry combining " << *CurrInst;);
+ Changed |= CInfo.combine(WrapperObserver, *CurrInst, B);
+ Observer.reportFullyCreatedInstrs();
+ }
+ MFChanged |= Changed;
+ } while (Changed);
+
+#ifndef NDEBUG
+ if (CSEInfo) {
+ if (auto E = CSEInfo->verify()) {
+ errs() << E << '\n';
+ assert(false && "CSEInfo is not consistent. Likely missing calls to "
+ "observer on mutations.");
+ }
+ }
+#endif
+ return MFChanged;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
new file mode 100644
index 000000000000..cc7fb3ee1109
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -0,0 +1,6029 @@
+//===-- lib/CodeGen/GlobalISel/GICombinerHelper.cpp -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DivisionByConstantInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cmath>
+#include <optional>
+#include <tuple>
+
+#define DEBUG_TYPE "gi-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+// Option to allow testing of the combiner while no targets know about indexed
+// addressing.
+static cl::opt<bool>
+ ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
+ cl::desc("Force all indexed operations to be "
+ "legal for the GlobalISel combiner"));
+
+CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
+ MachineIRBuilder &B, bool IsPreLegalize,
+ GISelKnownBits *KB, MachineDominatorTree *MDT,
+ const LegalizerInfo *LI)
+ : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer), KB(KB),
+ MDT(MDT), IsPreLegalize(IsPreLegalize), LI(LI),
+ RBI(Builder.getMF().getSubtarget().getRegBankInfo()),
+ TRI(Builder.getMF().getSubtarget().getRegisterInfo()) {
+ (void)this->KB;
+}
+
+const TargetLowering &CombinerHelper::getTargetLowering() const {
+ return *Builder.getMF().getSubtarget().getTargetLowering();
+}
+
+/// \returns The little endian in-memory byte position of byte \p I in a
+/// \p ByteWidth bytes wide type.
+///
+/// E.g. Given a 4-byte type x, x[0] -> byte 0
+static unsigned littleEndianByteAt(const unsigned ByteWidth, const unsigned I) {
+ assert(I < ByteWidth && "I must be in [0, ByteWidth)");
+ return I;
+}
+
+/// Determines the LogBase2 value for a non-null input value using the
+/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
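+///
+/// E.g. for V = 8 in a 32-bit type, ctlz(8) = 28, so LogBase2 = 31 - 28 = 3.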
+static Register buildLogBase2(Register V, MachineIRBuilder &MIB) {
+ auto &MRI = *MIB.getMRI();
+ LLT Ty = MRI.getType(V);
+ auto Ctlz = MIB.buildCTLZ(Ty, V);
+ auto Base = MIB.buildConstant(Ty, Ty.getScalarSizeInBits() - 1);
+ return MIB.buildSub(Ty, Base, Ctlz).getReg(0);
+}
+
+/// \returns The big endian in-memory byte position of byte \p I in a
+/// \p ByteWidth bytes wide type.
+///
+/// E.g. Given a 4-byte type x, x[0] -> byte 3
+static unsigned bigEndianByteAt(const unsigned ByteWidth, const unsigned I) {
+ assert(I < ByteWidth && "I must be in [0, ByteWidth)");
+ return ByteWidth - I - 1;
+}
+
+/// Given a map from byte offsets in memory to indices in a load/store,
+/// determine if that map corresponds to a little or big endian byte pattern.
+///
+/// \param MemOffset2Idx maps memory offsets to address offsets.
+/// \param LowestIdx is the lowest index in \p MemOffset2Idx.
+///
+/// \returns true if the map corresponds to a big endian byte pattern, false if
+/// it corresponds to a little endian byte pattern, and std::nullopt otherwise.
+///
+/// E.g. given a 32-bit type x, and x[AddrOffset], the in-memory byte patterns
+/// are as follows:
+///
+/// AddrOffset Little endian Big endian
+/// 0 0 3
+/// 1 1 2
+/// 2 2 1
+/// 3 3 0
+static std::optional<bool>
+isBigEndian(const SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+ int64_t LowestIdx) {
+ // Need at least two byte positions to decide on endianness.
+ unsigned Width = MemOffset2Idx.size();
+ if (Width < 2)
+ return std::nullopt;
+ bool BigEndian = true, LittleEndian = true;
+ for (unsigned MemOffset = 0; MemOffset < Width; ++MemOffset) {
+ auto MemOffsetAndIdx = MemOffset2Idx.find(MemOffset);
+ if (MemOffsetAndIdx == MemOffset2Idx.end())
+ return std::nullopt;
+ const int64_t Idx = MemOffsetAndIdx->second - LowestIdx;
+ assert(Idx >= 0 && "Expected non-negative byte offset?");
+ LittleEndian &= Idx == littleEndianByteAt(Width, MemOffset);
+ BigEndian &= Idx == bigEndianByteAt(Width, MemOffset);
+ if (!BigEndian && !LittleEndian)
+ return std::nullopt;
+ }
+
+ assert((BigEndian != LittleEndian) &&
+ "Pattern cannot be both big and little endian!");
+ return BigEndian;
+}
+
+bool CombinerHelper::isPreLegalize() const { return IsPreLegalize; }
+
+bool CombinerHelper::isLegal(const LegalityQuery &Query) const {
+ assert(LI && "Must have LegalizerInfo to query isLegal!");
+ return LI->getAction(Query).Action == LegalizeActions::Legal;
+}
+
+bool CombinerHelper::isLegalOrBeforeLegalizer(
+ const LegalityQuery &Query) const {
+ return isPreLegalize() || isLegal(Query);
+}
+
+bool CombinerHelper::isConstantLegalOrBeforeLegalizer(const LLT Ty) const {
+ if (!Ty.isVector())
+ return isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {Ty}});
+ // Vector constants are represented as a G_BUILD_VECTOR of scalar G_CONSTANTs.
+ if (isPreLegalize())
+ return true;
+ LLT EltTy = Ty.getElementType();
+ return isLegal({TargetOpcode::G_BUILD_VECTOR, {Ty, EltTy}}) &&
+ isLegal({TargetOpcode::G_CONSTANT, {EltTy}});
+}
+
+void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
+ Register ToReg) const {
+ Observer.changingAllUsesOfReg(MRI, FromReg);
+
+ if (MRI.constrainRegAttrs(ToReg, FromReg))
+ MRI.replaceRegWith(FromReg, ToReg);
+ else
+ Builder.buildCopy(ToReg, FromReg);
+
+ Observer.finishedChangingAllUsesOfReg();
+}
+
+void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
+ MachineOperand &FromRegOp,
+ Register ToReg) const {
+ assert(FromRegOp.getParent() && "Expected an operand in an MI");
+ Observer.changingInstr(*FromRegOp.getParent());
+
+ FromRegOp.setReg(ToReg);
+
+ Observer.changedInstr(*FromRegOp.getParent());
+}
+
+void CombinerHelper::replaceOpcodeWith(MachineInstr &FromMI,
+ unsigned ToOpcode) const {
+ Observer.changingInstr(FromMI);
+
+ FromMI.setDesc(Builder.getTII().get(ToOpcode));
+
+ Observer.changedInstr(FromMI);
+}
+
+const RegisterBank *CombinerHelper::getRegBank(Register Reg) const {
+ return RBI->getRegBank(Reg, MRI, *TRI);
+}
+
+void CombinerHelper::setRegBank(Register Reg, const RegisterBank *RegBank) {
+ if (RegBank)
+ MRI.setRegBank(Reg, *RegBank);
+}
+
+bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
+ if (matchCombineCopy(MI)) {
+ applyCombineCopy(MI);
+ return true;
+ }
+ return false;
+}
+bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
+ if (MI.getOpcode() != TargetOpcode::COPY)
+ return false;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ return canReplaceReg(DstReg, SrcReg, MRI);
+}
+void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ MI.eraseFromParent();
+ replaceRegWith(MRI, DstReg, SrcReg);
+}
+
+bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
+ bool IsUndef = false;
+ SmallVector<Register, 4> Ops;
+ if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
+ applyCombineConcatVectors(MI, IsUndef, Ops);
+ return true;
+ }
+ return false;
+}
+
+bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
+ SmallVectorImpl<Register> &Ops) {
+ assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
+ "Invalid instruction");
+ IsUndef = true;
+ MachineInstr *Undef = nullptr;
+
+ // Walk over all the operands of concat vectors and check if they are
+ // build_vector themselves or undef.
+ // Then collect their operands in Ops.
+ for (const MachineOperand &MO : MI.uses()) {
+ Register Reg = MO.getReg();
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ assert(Def && "Operand not defined");
+ switch (Def->getOpcode()) {
+ case TargetOpcode::G_BUILD_VECTOR:
+ IsUndef = false;
+ // Remember the operands of the build_vector to fold
+ // them into the yet-to-build flattened concat vectors.
+ for (const MachineOperand &BuildVecMO : Def->uses())
+ Ops.push_back(BuildVecMO.getReg());
+ break;
+ case TargetOpcode::G_IMPLICIT_DEF: {
+ LLT OpType = MRI.getType(Reg);
+ // Keep one undef value for all the undef operands.
+ if (!Undef) {
+ Builder.setInsertPt(*MI.getParent(), MI);
+ Undef = Builder.buildUndef(OpType.getScalarType());
+ }
+ assert(MRI.getType(Undef->getOperand(0).getReg()) ==
+ OpType.getScalarType() &&
+ "All undefs should have the same type");
+ // Break the undef vector in as many scalar elements as needed
+ // for the flattening.
+ for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
+ EltIdx != EltEnd; ++EltIdx)
+ Ops.push_back(Undef->getOperand(0).getReg());
+ break;
+ }
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+void CombinerHelper::applyCombineConcatVectors(
+ MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
+ // We determined that the concat_vectors can be flattened.
+ // Generate the flattened build_vector.
+ Register DstReg = MI.getOperand(0).getReg();
+ Builder.setInsertPt(*MI.getParent(), MI);
+ Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
+
+ // Note: IsUndef is sort of redundant. We could have determined it by
+ // checking that all Ops are undef. Alternatively, we could have
+ // generated a build_vector of undefs and rely on another combine to
+ // clean that up. For now, given we already gather this information
+ // in tryCombineConcatVectors, just save compile time and issue the
+ // right thing.
+ if (IsUndef)
+ Builder.buildUndef(NewDstReg);
+ else
+ Builder.buildBuildVector(NewDstReg, Ops);
+ MI.eraseFromParent();
+ replaceRegWith(MRI, DstReg, NewDstReg);
+}
+
+bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
+ SmallVector<Register, 4> Ops;
+ if (matchCombineShuffleVector(MI, Ops)) {
+ applyCombineShuffleVector(MI, Ops);
+ return true;
+ }
+ return false;
+}
+
+bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
+ SmallVectorImpl<Register> &Ops) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+ "Invalid instruction kind");
+ LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+ Register Src1 = MI.getOperand(1).getReg();
+ LLT SrcType = MRI.getType(Src1);
+ // As bizarre as it may look, shuffle vector can actually produce
+ // scalar! This is because at the IR level a <1 x ty> shuffle
+ // vector is perfectly valid.
+ unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1;
+ unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1;
+
+ // If the resulting vector is smaller than the size of the source
+ // vectors being concatenated, we won't be able to replace the
+ // shuffle vector into a concat_vectors.
+ //
+ // Note: We may still be able to produce a concat_vectors fed by
+ // extract_vector_elt and so on. It is less clear that would
+ // be better though, so don't bother for now.
+ //
+ // If the destination is a scalar, the size of the sources doesn't
+ // matter. We will lower the shuffle to a plain copy. This will
+ // work only if the source and destination have the same size. But
+ // that's covered by the next condition.
+ //
+ // TODO: If the sizes of the source and destination don't match
+ // we could still emit an extract vector element in that case.
+ if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
+ return false;
+
+ // Check that the shuffle mask can be broken evenly between the
+ // different sources.
+ if (DstNumElts % SrcNumElts != 0)
+ return false;
+
+ // Mask length is a multiple of the source vector length.
+ // Check if the shuffle is some kind of concatenation of the input
+ // vectors.
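+ // E.g. (sketch) with <2 x s32> sources, a mask of <0, 1, 2, 3> concatenates
+ // Src1 and Src2, and ConcatSrcs below ends up as {0, 1}.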
+ unsigned NumConcat = DstNumElts / SrcNumElts;
+ SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ for (unsigned i = 0; i != DstNumElts; ++i) {
+ int Idx = Mask[i];
+ // Undef value.
+ if (Idx < 0)
+ continue;
+ // Ensure the indices in each SrcType sized piece are sequential and that
+ // the same source is used for the whole piece.
+ if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
+ (ConcatSrcs[i / SrcNumElts] >= 0 &&
+ ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
+ return false;
+ // Remember which source this index came from.
+ ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
+ }
+
+ // The shuffle is concatenating multiple vectors together.
+ // Collect the different operands for that.
+ Register UndefReg;
+ Register Src2 = MI.getOperand(2).getReg();
+ for (auto Src : ConcatSrcs) {
+ if (Src < 0) {
+ if (!UndefReg) {
+ Builder.setInsertPt(*MI.getParent(), MI);
+ UndefReg = Builder.buildUndef(SrcType).getReg(0);
+ }
+ Ops.push_back(UndefReg);
+ } else if (Src == 0)
+ Ops.push_back(Src1);
+ else
+ Ops.push_back(Src2);
+ }
+ return true;
+}
+
+void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
+ const ArrayRef<Register> Ops) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Builder.setInsertPt(*MI.getParent(), MI);
+ Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
+
+ if (Ops.size() == 1)
+ Builder.buildCopy(NewDstReg, Ops[0]);
+ else
+ Builder.buildMergeLikeInstr(NewDstReg, Ops);
+
+ MI.eraseFromParent();
+ replaceRegWith(MRI, DstReg, NewDstReg);
+}
+
+namespace {
+
+/// Select a preference between two uses. CurrentUse is the current preference
+/// while *ForCandidate is attributes of the candidate under consideration.
+PreferredTuple ChoosePreferredUse(MachineInstr &LoadMI,
+ PreferredTuple &CurrentUse,
+ const LLT TyForCandidate,
+ unsigned OpcodeForCandidate,
+ MachineInstr *MIForCandidate) {
+ if (!CurrentUse.Ty.isValid()) {
+ if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
+ CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
+ return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+ return CurrentUse;
+ }
+
+ // We permit the extend to hoist through basic blocks but this is only
+ // sensible if the target has extending loads. If you end up lowering back
+ // into a load and extend during the legalizer then the end result is
+ // hoisting the extend up to the load.
+
+ // Prefer defined extensions to undefined extensions as these are more
+ // likely to reduce the number of instructions.
+ if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
+ CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
+ return CurrentUse;
+ else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
+ OpcodeForCandidate != TargetOpcode::G_ANYEXT)
+ return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+
+ // Prefer sign extensions to zero extensions as sign-extensions tend to be
+ // more expensive. Don't do this if the load is already a zero-extend load
+ // though, otherwise we'll rewrite a zero-extend load into a sign-extend
+ // later.
+ if (!isa<GZExtLoad>(LoadMI) && CurrentUse.Ty == TyForCandidate) {
+ if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
+ OpcodeForCandidate == TargetOpcode::G_ZEXT)
+ return CurrentUse;
+ else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
+ OpcodeForCandidate == TargetOpcode::G_SEXT)
+ return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+ }
+
+ // This is potentially target specific. We've chosen the largest type
+ // because G_TRUNC is usually free. One potential catch with this is that
+ // some targets have fewer large registers than small ones, and this choice
+ // potentially increases the live-range for the
+ // larger value.
+ if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
+ return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
+ }
+ return CurrentUse;
+}
+
+/// Find a suitable place to insert some instructions and insert them. This
+/// function accounts for special cases like inserting before a PHI node.
+/// The current strategy for inserting before PHI's is to duplicate the
+/// instructions for each predecessor. However, while that's ok for G_TRUNC
+/// on most targets since it generally requires no code, other targets/cases may
+/// want to try harder to find a dominating block.
+static void InsertInsnsWithoutSideEffectsBeforeUse(
+ MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
+ std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
+ MachineOperand &UseMO)>
+ Inserter) {
+ MachineInstr &UseMI = *UseMO.getParent();
+
+ MachineBasicBlock *InsertBB = UseMI.getParent();
+
+ // If the use is a PHI then we want the predecessor block instead.
+ if (UseMI.isPHI()) {
+ MachineOperand *PredBB = std::next(&UseMO);
+ InsertBB = PredBB->getMBB();
+ }
+
+ // If the block is the same block as the def then we want to insert just after
+ // the def instead of at the start of the block.
+ if (InsertBB == DefMI.getParent()) {
+ MachineBasicBlock::iterator InsertPt = &DefMI;
+ Inserter(InsertBB, std::next(InsertPt), UseMO);
+ return;
+ }
+
+ // Otherwise we want the start of the BB
+ Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
+}
+} // end anonymous namespace
+
+bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
+ PreferredTuple Preferred;
+ if (matchCombineExtendingLoads(MI, Preferred)) {
+ applyCombineExtendingLoads(MI, Preferred);
+ return true;
+ }
+ return false;
+}
+
+static unsigned getExtLoadOpcForExtend(unsigned ExtOpc) {
+ unsigned CandidateLoadOpc;
+ switch (ExtOpc) {
+ case TargetOpcode::G_ANYEXT:
+ CandidateLoadOpc = TargetOpcode::G_LOAD;
+ break;
+ case TargetOpcode::G_SEXT:
+ CandidateLoadOpc = TargetOpcode::G_SEXTLOAD;
+ break;
+ case TargetOpcode::G_ZEXT:
+ CandidateLoadOpc = TargetOpcode::G_ZEXTLOAD;
+ break;
+ default:
+ llvm_unreachable("Unexpected extend opc");
+ }
+ return CandidateLoadOpc;
+}
+
+bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
+ PreferredTuple &Preferred) {
+ // We match the loads and follow the uses to the extend instead of matching
+ // the extends and following the def to the load. This is because the load
+ // must remain in the same position for correctness (unless we also add code
+ // to find a safe place to sink it) whereas the extend is freely movable.
+ // It also prevents us from duplicating the load for the volatile case or just
+ // for performance.
+ GAnyLoad *LoadMI = dyn_cast<GAnyLoad>(&MI);
+ if (!LoadMI)
+ return false;
+
+ Register LoadReg = LoadMI->getDstReg();
+
+ LLT LoadValueTy = MRI.getType(LoadReg);
+ if (!LoadValueTy.isScalar())
+ return false;
+
+ // Most architectures are going to legalize loads narrower than s8 into at
+ // least a 1-byte load, and the MMOs can only describe memory accesses in
+ // multiples of bytes.
+ // If we try to perform extload combining on those, we can end up with
+ // %a(s8) = extload %ptr (load 1 byte from %ptr)
+ // ... which is an illegal extload instruction.
+ if (LoadValueTy.getSizeInBits() < 8)
+ return false;
+
+ // Non-power-of-2 types will very likely be legalized into multiple loads.
+ // Don't bother trying to match them into extending loads.
+ if (!llvm::has_single_bit<uint32_t>(LoadValueTy.getSizeInBits()))
+ return false;
+
+ // Find the preferred type aside from the any-extends (unless it's the only
+ // one) and non-extending ops. We'll emit an extending load to that type and
+ // emit a variant of (extend (trunc X)) for the others according to the
+ // relative type sizes. At the same time, pick an extend to use based on the
+ // extend involved in the chosen type.
+ unsigned PreferredOpcode =
+ isa<GLoad>(&MI)
+ ? TargetOpcode::G_ANYEXT
+ : isa<GSExtLoad>(&MI) ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+ Preferred = {LLT(), PreferredOpcode, nullptr};
+ for (auto &UseMI : MRI.use_nodbg_instructions(LoadReg)) {
+ if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
+ UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
+ (UseMI.getOpcode() == TargetOpcode::G_ANYEXT)) {
+ const auto &MMO = LoadMI->getMMO();
+ // For atomics, only form anyextending loads.
+ if (MMO.isAtomic() && UseMI.getOpcode() != TargetOpcode::G_ANYEXT)
+ continue;
+ // Check for legality.
+ if (!isPreLegalize()) {
+ LegalityQuery::MemDesc MMDesc(MMO);
+ unsigned CandidateLoadOpc = getExtLoadOpcForExtend(UseMI.getOpcode());
+ LLT UseTy = MRI.getType(UseMI.getOperand(0).getReg());
+ LLT SrcTy = MRI.getType(LoadMI->getPointerReg());
+ if (LI->getAction({CandidateLoadOpc, {UseTy, SrcTy}, {MMDesc}})
+ .Action != LegalizeActions::Legal)
+ continue;
+ }
+ Preferred = ChoosePreferredUse(MI, Preferred,
+ MRI.getType(UseMI.getOperand(0).getReg()),
+ UseMI.getOpcode(), &UseMI);
+ }
+ }
+
+ // There were no extends
+ if (!Preferred.MI)
+ return false;
+ // It should be impossible to choose an extend without selecting a different
+ // type since by definition the result of an extend is larger.
+ assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
+
+ LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
+ return true;
+}
+
+void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
+ PreferredTuple &Preferred) {
+ // Rewrite the load to the chosen extending load.
+ Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();
+
+ // Inserter to insert a truncate back to the original type at a given point
+ // with some basic CSE to limit truncate duplication to one per BB.
+ DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
+ auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
+ MachineBasicBlock::iterator InsertBefore,
+ MachineOperand &UseMO) {
+ MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
+ if (PreviouslyEmitted) {
+ Observer.changingInstr(*UseMO.getParent());
+ UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
+ Observer.changedInstr(*UseMO.getParent());
+ return;
+ }
+
+ Builder.setInsertPt(*InsertIntoBB, InsertBefore);
+ Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
+ MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
+ EmittedInsns[InsertIntoBB] = NewMI;
+ replaceRegOpWith(MRI, UseMO, NewDstReg);
+ };
+
+ Observer.changingInstr(MI);
+ unsigned LoadOpc = getExtLoadOpcForExtend(Preferred.ExtendOpcode);
+ MI.setDesc(Builder.getTII().get(LoadOpc));
+
+ // Rewrite all the uses to fix up the types.
+ auto &LoadValue = MI.getOperand(0);
+ SmallVector<MachineOperand *, 4> Uses;
+ for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
+ Uses.push_back(&UseMO);
+
+ for (auto *UseMO : Uses) {
+ MachineInstr *UseMI = UseMO->getParent();
+
+ // If the extend is compatible with the preferred extend then we should fix
+ // up the type and extend so that it uses the preferred use.
+ if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
+ UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
+ Register UseDstReg = UseMI->getOperand(0).getReg();
+ MachineOperand &UseSrcMO = UseMI->getOperand(1);
+ const LLT UseDstTy = MRI.getType(UseDstReg);
+ if (UseDstReg != ChosenDstReg) {
+ if (Preferred.Ty == UseDstTy) {
+ // If the use has the same type as the preferred use, then merge
+ // the vregs and erase the extend. For example:
+ // %1:_(s8) = G_LOAD ...
+ // %2:_(s32) = G_SEXT %1(s8)
+ // %3:_(s32) = G_ANYEXT %1(s8)
+ // ... = ... %3(s32)
+ // rewrites to:
+ // %2:_(s32) = G_SEXTLOAD ...
+ // ... = ... %2(s32)
+ replaceRegWith(MRI, UseDstReg, ChosenDstReg);
+ Observer.erasingInstr(*UseMO->getParent());
+ UseMO->getParent()->eraseFromParent();
+ } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
+ // If the preferred size is smaller, then keep the extend but extend
+ // from the result of the extending load. For example:
+ // %1:_(s8) = G_LOAD ...
+ // %2:_(s32) = G_SEXT %1(s8)
+ // %3:_(s64) = G_ANYEXT %1(s8)
+ // ... = ... %3(s64)
+ // rewrites to:
+ // %2:_(s32) = G_SEXTLOAD ...
+ // %3:_(s64) = G_ANYEXT %2:_(s32)
+ // ... = ... %3(s64)
+ replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
+ } else {
+ // If the preferred size is large, then insert a truncate. For
+ // example:
+ // %1:_(s8) = G_LOAD ...
+ // %2:_(s64) = G_SEXT %1(s8)
+ // %3:_(s32) = G_ZEXT %1(s8)
+ // ... = ... %3(s32)
+ // rewrites to:
+ // %2:_(s64) = G_SEXTLOAD ...
+ // %4:_(s8) = G_TRUNC %2:_(s64)
+ // %3:_(s32) = G_ZEXT %4:_(s8)
+ // ... = ... %3(s32)
+ InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
+ InsertTruncAt);
+ }
+ continue;
+ }
+ // The use is (one of) the uses of the preferred use we chose earlier.
+ // We're going to update the load to def this value later so just erase
+ // the old extend.
+ Observer.erasingInstr(*UseMO->getParent());
+ UseMO->getParent()->eraseFromParent();
+ continue;
+ }
+
+ // The use isn't an extend. Truncate back to the type we originally loaded.
+ // This is free on many targets.
+ InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
+ }
+
+ MI.getOperand(0).setReg(ChosenDstReg);
+ Observer.changedInstr(MI);
+}
+
+bool CombinerHelper::matchCombineLoadWithAndMask(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+
+ // If we have the following code:
+ // %mask = G_CONSTANT 255
+ // %ld = G_LOAD %ptr, (load s16)
+ // %and = G_AND %ld, %mask
+ //
+ // Try to fold it into
+ // %ld = G_ZEXTLOAD %ptr, (load s8)
+
+ Register Dst = MI.getOperand(0).getReg();
+ if (MRI.getType(Dst).isVector())
+ return false;
+
+ auto MaybeMask =
+ getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeMask)
+ return false;
+
+ APInt MaskVal = MaybeMask->Value;
+
+ if (!MaskVal.isMask())
+ return false;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ // Don't use getOpcodeDef() here since intermediate instructions may have
+ // multiple users.
+ GAnyLoad *LoadMI = dyn_cast<GAnyLoad>(MRI.getVRegDef(SrcReg));
+ if (!LoadMI || !MRI.hasOneNonDBGUse(LoadMI->getDstReg()))
+ return false;
+
+ Register LoadReg = LoadMI->getDstReg();
+ LLT RegTy = MRI.getType(LoadReg);
+ Register PtrReg = LoadMI->getPointerReg();
+ unsigned RegSize = RegTy.getSizeInBits();
+ uint64_t LoadSizeBits = LoadMI->getMemSizeInBits();
+ unsigned MaskSizeBits = MaskVal.countr_one();
+
+ // The mask may not be larger than the in-memory type, as it might cover
+ // sign-extended bits.
+ if (MaskSizeBits > LoadSizeBits)
+ return false;
+
+ // If the mask covers the whole destination register, there's nothing to
+ // extend
+ if (MaskSizeBits >= RegSize)
+ return false;
+
+ // Most targets cannot deal with loads of size < 8 and need to re-legalize to
+ // at least byte loads. Avoid creating such loads here
+ if (MaskSizeBits < 8 || !isPowerOf2_32(MaskSizeBits))
+ return false;
+
+ const MachineMemOperand &MMO = LoadMI->getMMO();
+ LegalityQuery::MemDesc MemDesc(MMO);
+
+ // Don't modify the memory access size if this is atomic/volatile, but we can
+ // still adjust the opcode to indicate the high bit behavior.
+ if (LoadMI->isSimple())
+ MemDesc.MemoryTy = LLT::scalar(MaskSizeBits);
+ else if (LoadSizeBits > MaskSizeBits || LoadSizeBits == RegSize)
+ return false;
+
+ // TODO: Could check if it's legal with the reduced or original memory size.
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_ZEXTLOAD, {RegTy, MRI.getType(PtrReg)}, {MemDesc}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.setInstrAndDebugLoc(*LoadMI);
+ auto &MF = B.getMF();
+ auto PtrInfo = MMO.getPointerInfo();
+ auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, MemDesc.MemoryTy);
+ B.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, Dst, PtrReg, *NewMMO);
+ LoadMI->eraseFromParent();
+ };
+ return true;
+}
+
+bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
+ const MachineInstr &UseMI) {
+ assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() &&
+ "shouldn't consider debug uses");
+ assert(DefMI.getParent() == UseMI.getParent());
+ if (&DefMI == &UseMI)
+ return true;
+ const MachineBasicBlock &MBB = *DefMI.getParent();
+ auto DefOrUse = find_if(MBB, [&DefMI, &UseMI](const MachineInstr &MI) {
+ return &MI == &DefMI || &MI == &UseMI;
+ });
+ if (DefOrUse == MBB.end())
+ llvm_unreachable("Block must contain both DefMI and UseMI!");
+ return &*DefOrUse == &DefMI;
+}
+
+bool CombinerHelper::dominates(const MachineInstr &DefMI,
+ const MachineInstr &UseMI) {
+ assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() &&
+ "shouldn't consider debug uses");
+ if (MDT)
+ return MDT->dominates(&DefMI, &UseMI);
+ else if (DefMI.getParent() != UseMI.getParent())
+ return false;
+
+ return isPredecessor(DefMI, UseMI);
+}
+
+bool CombinerHelper::matchSextTruncSextLoad(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
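+ // Match a G_SEXT_INREG whose source is (possibly a truncate of) a
+ // G_SEXTLOAD that already sign-extended from the same bit width, e.g.:
+ // %ld:_(s32) = G_SEXTLOAD %ptr (load 1)
+ // %ext:_(s32) = G_SEXT_INREG %ld, 8
+ // In that case the G_SEXT_INREG is redundant and can be replaced by a copy
+ // of its source.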
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register LoadUser = SrcReg;
+
+ if (MRI.getType(SrcReg).isVector())
+ return false;
+
+ Register TruncSrc;
+ if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc))))
+ LoadUser = TruncSrc;
+
+ uint64_t SizeInBits = MI.getOperand(2).getImm();
+ // If the source is a G_SEXTLOAD from the same bit width, then we don't
+ // need any extend at all, just a truncate.
+ if (auto *LoadMI = getOpcodeDef<GSExtLoad>(LoadUser, MRI)) {
+ // If truncating more than the original extended value, abort.
+ auto LoadSizeBits = LoadMI->getMemSizeInBits();
+ if (TruncSrc && MRI.getType(TruncSrc).getSizeInBits() < LoadSizeBits)
+ return false;
+ if (LoadSizeBits == SizeInBits)
+ return true;
+ }
+ return false;
+}
+
+void CombinerHelper::applySextTruncSextLoad(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
+ Builder.setInstrAndDebugLoc(MI);
+ Builder.buildCopy(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchSextInRegOfLoad(
+ MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT RegTy = MRI.getType(DstReg);
+
+ // Only supports scalars for now.
+ if (RegTy.isVector())
+ return false;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ auto *LoadDef = getOpcodeDef<GLoad>(SrcReg, MRI);
+ if (!LoadDef || !MRI.hasOneNonDBGUse(DstReg))
+ return false;
+
+ uint64_t MemBits = LoadDef->getMemSizeInBits();
+
+ // If the sign extend extends from a narrower width than the load's width,
+ // then we can narrow the load width when we combine to a G_SEXTLOAD.
+ // Avoid widening the load at all.
+ unsigned NewSizeBits = std::min((uint64_t)MI.getOperand(2).getImm(), MemBits);
+
+ // Don't generate G_SEXTLOADs with a < 1 byte width.
+ if (NewSizeBits < 8)
+ return false;
+ // Don't bother creating a non-power-2 sextload, it will likely be broken up
+ // anyway for most targets.
+ if (!isPowerOf2_32(NewSizeBits))
+ return false;
+
+ const MachineMemOperand &MMO = LoadDef->getMMO();
+ LegalityQuery::MemDesc MMDesc(MMO);
+
+ // Don't modify the memory access size if this is atomic/volatile, but we can
+ // still adjust the opcode to indicate the high bit behavior.
+ if (LoadDef->isSimple())
+ MMDesc.MemoryTy = LLT::scalar(NewSizeBits);
+ else if (MemBits > NewSizeBits || MemBits == RegTy.getSizeInBits())
+ return false;
+
+ // TODO: Could check if it's legal with the reduced or original memory size.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SEXTLOAD,
+ {MRI.getType(LoadDef->getDstReg()),
+ MRI.getType(LoadDef->getPointerReg())},
+ {MMDesc}}))
+ return false;
+
+ MatchInfo = std::make_tuple(LoadDef->getDstReg(), NewSizeBits);
+ return true;
+}
+
+void CombinerHelper::applySextInRegOfLoad(
+ MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
+ Register LoadReg;
+ unsigned ScalarSizeBits;
+ std::tie(LoadReg, ScalarSizeBits) = MatchInfo;
+ GLoad *LoadDef = cast<GLoad>(MRI.getVRegDef(LoadReg));
+
+ // If we have the following:
+ // %ld = G_LOAD %ptr, (load 2)
+ // %ext = G_SEXT_INREG %ld, 8
+ // ==>
+ // %ld = G_SEXTLOAD %ptr (load 1)
+
+ auto &MMO = LoadDef->getMMO();
+ Builder.setInstrAndDebugLoc(*LoadDef);
+ auto &MF = Builder.getMF();
+ auto PtrInfo = MMO.getPointerInfo();
+ auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, ScalarSizeBits / 8);
+ Builder.buildLoadInstr(TargetOpcode::G_SEXTLOAD, MI.getOperand(0).getReg(),
+ LoadDef->getPointerReg(), *NewMMO);
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
+ Register &Base, Register &Offset) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+
+#ifndef NDEBUG
+ unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
+ Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
+#endif
+
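+ // A post-index candidate is a G_PTR_ADD of this operation's base pointer
+ // whose result could serve as the written-back address, e.g.:
+ // ... = G_LOAD %base
+ // %addr:_(p0) = G_PTR_ADD %base, %offset
+ // provided the indexed form is legal for the target and the memory
+ // operation dominates all uses of %addr.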
+ Base = MI.getOperand(1).getReg();
+ MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
+ if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
+ // FIXME: The following use traversal needs a bail-out for pathological cases.
+ for (auto &Use : MRI.use_nodbg_instructions(Base)) {
+ if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
+ continue;
+
+ Offset = Use.getOperand(2).getReg();
+ if (!ForceLegalIndexing &&
+ !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
+ LLVM_DEBUG(dbgs() << " Ignoring candidate with illegal addrmode: "
+ << Use);
+ continue;
+ }
+
+ // Make sure the offset calculation is before the potentially indexed op.
+ // FIXME: we really care about dependency here. The offset calculation might
+ // be movable.
+ MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
+ if (!OffsetDef || !dominates(*OffsetDef, MI)) {
+ LLVM_DEBUG(dbgs() << " Ignoring candidate with offset after mem-op: "
+ << Use);
+ continue;
+ }
+
+ // FIXME: check whether all uses of Base are load/store with foldable
+ // addressing modes. If so, using the normal addr-modes is better than
+ // forming an indexed one.
+
+ bool MemOpDominatesAddrUses = true;
+ for (auto &PtrAddUse :
+ MRI.use_nodbg_instructions(Use.getOperand(0).getReg())) {
+ if (!dominates(MI, PtrAddUse)) {
+ MemOpDominatesAddrUses = false;
+ break;
+ }
+ }
+
+ if (!MemOpDominatesAddrUses) {
+ LLVM_DEBUG(
+ dbgs() << " Ignoring candidate as memop does not dominate uses: "
+ << Use);
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " Found match: " << Use);
+ Addr = Use.getOperand(0).getReg();
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
+ Register &Base, Register &Offset) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+
+#ifndef NDEBUG
+ unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
+ Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
+#endif
+
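+ // A pre-index candidate is a load/store whose address is itself a G_PTR_ADD
+ // of a base and an offset, e.g.:
+ // %addr:_(p0) = G_PTR_ADD %base, %offset
+ // ... = G_LOAD %addr
+ // provided the indexed form is legal for the target and the memory
+ // operation dominates all other uses of %addr.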
+ Addr = MI.getOperand(1).getReg();
+ MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_PTR_ADD, Addr, MRI);
+ if (!AddrDef || MRI.hasOneNonDBGUse(Addr))
+ return false;
+
+ Base = AddrDef->getOperand(1).getReg();
+ Offset = AddrDef->getOperand(2).getReg();
+
+ LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);
+
+ if (!ForceLegalIndexing &&
+ !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
+ LLVM_DEBUG(dbgs() << " Skipping, not legal for target");
+ return false;
+ }
+
+ MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
+ if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ LLVM_DEBUG(dbgs() << " Skipping, frame index would need copy anyway.");
+ return false;
+ }
+
+ if (MI.getOpcode() == TargetOpcode::G_STORE) {
+ // Would require a copy.
+ if (Base == MI.getOperand(0).getReg()) {
+ LLVM_DEBUG(dbgs() << " Skipping, storing base so need copy anyway.");
+ return false;
+ }
+
+ // We're expecting one use of Addr in MI, but it could also be the
+ // value stored, which isn't actually dominated by the instruction.
+ if (MI.getOperand(0).getReg() == Addr) {
+ LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses");
+ return false;
+ }
+ }
+
+ // FIXME: check whether all uses of the base pointer are constant PtrAdds.
+ // That might allow us to end base's liveness here by adjusting the constant.
+
+ for (auto &UseMI : MRI.use_nodbg_instructions(Addr)) {
+ if (!dominates(MI, UseMI)) {
+ LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses.");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
+ IndexedLoadStoreMatchInfo MatchInfo;
+ if (matchCombineIndexedLoadStore(MI, MatchInfo)) {
+ applyCombineIndexedLoadStore(MI, MatchInfo);
+ return true;
+ }
+ return false;
+}
+
+bool CombinerHelper::matchCombineIndexedLoadStore(
+ MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
+ Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
+ return false;
+
+ // For now, no targets actually support these opcodes, so don't waste time
+ // running these unless we're forced to for testing.
+ if (!ForceLegalIndexing)
+ return false;
+
+ MatchInfo.IsPre = findPreIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
+ MatchInfo.Offset);
+ if (!MatchInfo.IsPre &&
+ !findPostIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
+ MatchInfo.Offset))
+ return false;
+
+ return true;
+}
+
+void CombinerHelper::applyCombineIndexedLoadStore(
+ MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
+ MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr);
+ MachineIRBuilder MIRBuilder(MI);
+ unsigned Opcode = MI.getOpcode();
+ bool IsStore = Opcode == TargetOpcode::G_STORE;
+ unsigned NewOpcode;
+ switch (Opcode) {
+ case TargetOpcode::G_LOAD:
+ NewOpcode = TargetOpcode::G_INDEXED_LOAD;
+ break;
+ case TargetOpcode::G_SEXTLOAD:
+ NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
+ break;
+ case TargetOpcode::G_ZEXTLOAD:
+ NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
+ break;
+ case TargetOpcode::G_STORE:
+ NewOpcode = TargetOpcode::G_INDEXED_STORE;
+ break;
+ default:
+ llvm_unreachable("Unknown load/store opcode");
+ }
+
+ auto MIB = MIRBuilder.buildInstr(NewOpcode);
+ if (IsStore) {
+ MIB.addDef(MatchInfo.Addr);
+ MIB.addUse(MI.getOperand(0).getReg());
+ } else {
+ MIB.addDef(MI.getOperand(0).getReg());
+ MIB.addDef(MatchInfo.Addr);
+ }
+
+ MIB.addUse(MatchInfo.Base);
+ MIB.addUse(MatchInfo.Offset);
+ MIB.addImm(MatchInfo.IsPre);
+ MI.eraseFromParent();
+ AddrDef.eraseFromParent();
+
+ LLVM_DEBUG(dbgs() << " Combined to indexed operation");
+}
+
+bool CombinerHelper::matchCombineDivRem(MachineInstr &MI,
+ MachineInstr *&OtherMI) {
+ unsigned Opcode = MI.getOpcode();
+ bool IsDiv, IsSigned;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UDIV: {
+ IsDiv = true;
+ IsSigned = Opcode == TargetOpcode::G_SDIV;
+ break;
+ }
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_UREM: {
+ IsDiv = false;
+ IsSigned = Opcode == TargetOpcode::G_SREM;
+ break;
+ }
+ }
+
+ Register Src1 = MI.getOperand(1).getReg();
+ unsigned DivOpcode, RemOpcode, DivremOpcode;
+ if (IsSigned) {
+ DivOpcode = TargetOpcode::G_SDIV;
+ RemOpcode = TargetOpcode::G_SREM;
+ DivremOpcode = TargetOpcode::G_SDIVREM;
+ } else {
+ DivOpcode = TargetOpcode::G_UDIV;
+ RemOpcode = TargetOpcode::G_UREM;
+ DivremOpcode = TargetOpcode::G_UDIVREM;
+ }
+
+ if (!isLegalOrBeforeLegalizer({DivremOpcode, {MRI.getType(Src1)}}))
+ return false;
+
+ // Combine:
+ // %div:_ = G_[SU]DIV %src1:_, %src2:_
+ // %rem:_ = G_[SU]REM %src1:_, %src2:_
+ // into:
+ // %div:_, %rem:_ = G_[SU]DIVREM %src1:_, %src2:_
+
+ // Combine:
+ // %rem:_ = G_[SU]REM %src1:_, %src2:_
+ // %div:_ = G_[SU]DIV %src1:_, %src2:_
+ // into:
+ // %div:_, %rem:_ = G_[SU]DIVREM %src1:_, %src2:_
+
+ for (auto &UseMI : MRI.use_nodbg_instructions(Src1)) {
+ if (MI.getParent() == UseMI.getParent() &&
+ ((IsDiv && UseMI.getOpcode() == RemOpcode) ||
+ (!IsDiv && UseMI.getOpcode() == DivOpcode)) &&
+ matchEqualDefs(MI.getOperand(2), UseMI.getOperand(2)) &&
+ matchEqualDefs(MI.getOperand(1), UseMI.getOperand(1))) {
+ OtherMI = &UseMI;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void CombinerHelper::applyCombineDivRem(MachineInstr &MI,
+ MachineInstr *&OtherMI) {
+ unsigned Opcode = MI.getOpcode();
+ assert(OtherMI && "OtherMI shouldn't be empty.");
+
+ Register DestDivReg, DestRemReg;
+ if (Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_UDIV) {
+ DestDivReg = MI.getOperand(0).getReg();
+ DestRemReg = OtherMI->getOperand(0).getReg();
+ } else {
+ DestDivReg = OtherMI->getOperand(0).getReg();
+ DestRemReg = MI.getOperand(0).getReg();
+ }
+
+ bool IsSigned =
+ Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM;
+
+ // Check which instruction is first in the block so we don't break def-use
+ // deps by "moving" the instruction incorrectly. Also keep track of which
+ // instruction is first so we pick its operands, avoiding use-before-def
+ // bugs.
+ MachineInstr *FirstInst;
+ if (dominates(MI, *OtherMI)) {
+ Builder.setInstrAndDebugLoc(MI);
+ FirstInst = &MI;
+ } else {
+ Builder.setInstrAndDebugLoc(*OtherMI);
+ FirstInst = OtherMI;
+ }
+
+ Builder.buildInstr(IsSigned ? TargetOpcode::G_SDIVREM
+ : TargetOpcode::G_UDIVREM,
+ {DestDivReg, DestRemReg},
+ {FirstInst->getOperand(1), FirstInst->getOperand(2)});
+ MI.eraseFromParent();
+ OtherMI->eraseFromParent();
+}
+
+bool CombinerHelper::matchOptBrCondByInvertingCond(MachineInstr &MI,
+ MachineInstr *&BrCond) {
+ assert(MI.getOpcode() == TargetOpcode::G_BR);
+
+ // Try to match the following:
+ // bb1:
+ // G_BRCOND %c1, %bb2
+ // G_BR %bb3
+ // bb2:
+ // ...
+ // bb3:
+
+ // The above pattern does not have a fall-through to the successor bb2, so a
+ // branch is always taken no matter which path is followed. Here we try to
+ // find and replace that pattern with a conditional branch to bb3 and a
+ // fall-through to bb2. This is generally better for branch predictors.
+
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::iterator BrIt(MI);
+ if (BrIt == MBB->begin())
+ return false;
+ assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");
+
+ BrCond = &*std::prev(BrIt);
+ if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
+ return false;
+
+ // Check that the next block is the conditional branch target. Also make sure
+ // that it isn't the same as the G_BR's target (otherwise, this will loop.)
+ MachineBasicBlock *BrCondTarget = BrCond->getOperand(1).getMBB();
+ return BrCondTarget != MI.getOperand(0).getMBB() &&
+ MBB->isLayoutSuccessor(BrCondTarget);
+}
+
+void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI,
+ MachineInstr *&BrCond) {
+ MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
+ Builder.setInstrAndDebugLoc(*BrCond);
+ LLT Ty = MRI.getType(BrCond->getOperand(0).getReg());
+ // FIXME: Does int/fp matter for this? If so, we might need to restrict
+ // this to i1 only since we might not know for sure what kind of
+ // compare generated the condition value.
+ auto True = Builder.buildConstant(
+ Ty, getICmpTrueVal(getTargetLowering(), false, false));
+ auto Xor = Builder.buildXor(Ty, BrCond->getOperand(0), True);
+
+ auto *FallthroughBB = BrCond->getOperand(1).getMBB();
+ Observer.changingInstr(MI);
+ MI.getOperand(0).setMBB(FallthroughBB);
+ Observer.changedInstr(MI);
+
+ // Change the conditional branch to use the inverted condition and
+ // new target block.
+ Observer.changingInstr(*BrCond);
+ BrCond->getOperand(0).setReg(Xor.getReg(0));
+ BrCond->getOperand(1).setMBB(BrTarget);
+ Observer.changedInstr(*BrCond);
+}
+
+static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
+ if (Ty.isVector())
+ return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
+ Ty.getNumElements());
+ return IntegerType::get(C, Ty.getSizeInBits());
+}
+
+bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI) {
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder);
+ return Helper.lowerMemcpyInline(MI) ==
+ LegalizerHelper::LegalizeResult::Legalized;
+}
+
+bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(HelperBuilder.getMF(), DummyObserver, HelperBuilder);
+ return Helper.lowerMemCpyFamily(MI, MaxLen) ==
+ LegalizerHelper::LegalizeResult::Legalized;
+}
+
+static APFloat constantFoldFpUnary(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ const APFloat &Val) {
+ APFloat Result(Val);
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case TargetOpcode::G_FNEG: {
+ Result.changeSign();
+ return Result;
+ }
+ case TargetOpcode::G_FABS: {
+ Result.clearSign();
+ return Result;
+ }
+ case TargetOpcode::G_FPTRUNC: {
+ bool Unused;
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ Result.convert(getFltSemanticForLLT(DstTy), APFloat::rmNearestTiesToEven,
+ &Unused);
+ return Result;
+ }
+ case TargetOpcode::G_FSQRT: {
+ bool Unused;
+ Result.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+ &Unused);
+ Result = APFloat(sqrt(Result.convertToDouble()));
+ break;
+ }
+ case TargetOpcode::G_FLOG2: {
+ bool Unused;
+ Result.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
+ &Unused);
+ Result = APFloat(log2(Result.convertToDouble()));
+ break;
+ }
+ }
+ // Convert the result back to the semantics of the original value; otherwise
+ // `buildFConstant` will assert on a size mismatch. Only `G_FSQRT` and
+ // `G_FLOG2` reach here.
+ bool Unused;
+ Result.convert(Val.getSemantics(), APFloat::rmNearestTiesToEven, &Unused);
+ return Result;
+}
+
+void CombinerHelper::applyCombineConstantFoldFpUnary(MachineInstr &MI,
+ const ConstantFP *Cst) {
+ Builder.setInstrAndDebugLoc(MI);
+ APFloat Folded = constantFoldFpUnary(MI, MRI, Cst->getValue());
+ const ConstantFP *NewCst = ConstantFP::get(Builder.getContext(), Folded);
+ Builder.buildFConstant(MI.getOperand(0), *NewCst);
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
+ PtrAddChain &MatchInfo) {
+ // We're trying to match the following pattern:
+ // %t1 = G_PTR_ADD %base, G_CONSTANT imm1
+ // %root = G_PTR_ADD %t1, G_CONSTANT imm2
+ // -->
+ // %root = G_PTR_ADD %base, G_CONSTANT (imm1 + imm2)
+
+ if (MI.getOpcode() != TargetOpcode::G_PTR_ADD)
+ return false;
+
+ Register Add2 = MI.getOperand(1).getReg();
+ Register Imm1 = MI.getOperand(2).getReg();
+ auto MaybeImmVal = getIConstantVRegValWithLookThrough(Imm1, MRI);
+ if (!MaybeImmVal)
+ return false;
+
+ MachineInstr *Add2Def = MRI.getVRegDef(Add2);
+ if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD)
+ return false;
+
+ Register Base = Add2Def->getOperand(1).getReg();
+ Register Imm2 = Add2Def->getOperand(2).getReg();
+ auto MaybeImm2Val = getIConstantVRegValWithLookThrough(Imm2, MRI);
+ if (!MaybeImm2Val)
+ return false;
+
+ // Check if the new combined immediate would form an illegal addressing
+ // mode. Do not combine if it was legal before but would become illegal.
+ // To do so, we need to find a load/store user of the pointer to get
+ // the access type.
+ Type *AccessTy = nullptr;
+ auto &MF = *MI.getMF();
+ for (auto &UseMI : MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) {
+ if (auto *LdSt = dyn_cast<GLoadStore>(&UseMI)) {
+ AccessTy = getTypeForLLT(MRI.getType(LdSt->getReg(0)),
+ MF.getFunction().getContext());
+ break;
+ }
+ }
+ TargetLoweringBase::AddrMode AMNew;
+ APInt CombinedImm = MaybeImmVal->Value + MaybeImm2Val->Value;
+ AMNew.BaseOffs = CombinedImm.getSExtValue();
+ if (AccessTy) {
+ AMNew.HasBaseReg = true;
+ TargetLoweringBase::AddrMode AMOld;
+ AMOld.BaseOffs = MaybeImm2Val->Value.getSExtValue();
+ AMOld.HasBaseReg = true;
+ unsigned AS = MRI.getType(Add2).getAddressSpace();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ if (TLI.isLegalAddressingMode(MF.getDataLayout(), AMOld, AccessTy, AS) &&
+ !TLI.isLegalAddressingMode(MF.getDataLayout(), AMNew, AccessTy, AS))
+ return false;
+ }
+
+ // Pass the combined immediate to the apply function.
+ MatchInfo.Imm = AMNew.BaseOffs;
+ MatchInfo.Base = Base;
+ MatchInfo.Bank = getRegBank(Imm2);
+ return true;
+}
+
+void CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
+ PtrAddChain &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
+ MachineIRBuilder MIB(MI);
+ LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg());
+ auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm);
+ setRegBank(NewOffset.getReg(0), MatchInfo.Bank);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(MatchInfo.Base);
+ MI.getOperand(2).setReg(NewOffset.getReg(0));
+ Observer.changedInstr(MI);
+}
+
+bool CombinerHelper::matchShiftImmedChain(MachineInstr &MI,
+ RegisterImmPair &MatchInfo) {
+ // We're trying to match the following pattern with any of
+ // G_SHL/G_ASHR/G_LSHR/G_SSHLSAT/G_USHLSAT shift instructions:
+ // %t1 = SHIFT %base, G_CONSTANT imm1
+ // %root = SHIFT %t1, G_CONSTANT imm2
+ // -->
+ // %root = SHIFT %base, G_CONSTANT (imm1 + imm2)
+
+ unsigned Opcode = MI.getOpcode();
+ assert((Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_ASHR ||
+ Opcode == TargetOpcode::G_LSHR || Opcode == TargetOpcode::G_SSHLSAT ||
+ Opcode == TargetOpcode::G_USHLSAT) &&
+ "Expected G_SHL, G_ASHR, G_LSHR, G_SSHLSAT or G_USHLSAT");
+
+ Register Shl2 = MI.getOperand(1).getReg();
+ Register Imm1 = MI.getOperand(2).getReg();
+ auto MaybeImmVal = getIConstantVRegValWithLookThrough(Imm1, MRI);
+ if (!MaybeImmVal)
+ return false;
+
+ MachineInstr *Shl2Def = MRI.getUniqueVRegDef(Shl2);
+ if (Shl2Def->getOpcode() != Opcode)
+ return false;
+
+ Register Base = Shl2Def->getOperand(1).getReg();
+ Register Imm2 = Shl2Def->getOperand(2).getReg();
+ auto MaybeImm2Val = getIConstantVRegValWithLookThrough(Imm2, MRI);
+ if (!MaybeImm2Val)
+ return false;
+
+ // Pass the combined immediate to the apply function.
+ MatchInfo.Imm =
+ (MaybeImmVal->Value.getSExtValue() + MaybeImm2Val->Value).getSExtValue();
+ MatchInfo.Reg = Base;
+
+ // There is no simple replacement for a saturating unsigned left shift that
+ // exceeds the scalar size.
+ if (Opcode == TargetOpcode::G_USHLSAT &&
+ MatchInfo.Imm >= MRI.getType(Shl2).getScalarSizeInBits())
+ return false;
+
+ return true;
+}
+
+void CombinerHelper::applyShiftImmedChain(MachineInstr &MI,
+ RegisterImmPair &MatchInfo) {
+ unsigned Opcode = MI.getOpcode();
+ assert((Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_ASHR ||
+ Opcode == TargetOpcode::G_LSHR || Opcode == TargetOpcode::G_SSHLSAT ||
+ Opcode == TargetOpcode::G_USHLSAT) &&
+ "Expected G_SHL, G_ASHR, G_LSHR, G_SSHLSAT or G_USHLSAT");
+
+ Builder.setInstrAndDebugLoc(MI);
+ LLT Ty = MRI.getType(MI.getOperand(1).getReg());
+ unsigned const ScalarSizeInBits = Ty.getScalarSizeInBits();
+ auto Imm = MatchInfo.Imm;
+
+ if (Imm >= ScalarSizeInBits) {
+ // Any logical shift that exceeds scalar size will produce zero.
+ if (Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_LSHR) {
+ Builder.buildConstant(MI.getOperand(0), 0);
+ MI.eraseFromParent();
+ return;
+ }
+ // Arithmetic shift and saturating signed left shift have no effect beyond
+ // scalar size.
+ Imm = ScalarSizeInBits - 1;
+ }
+
+ LLT ImmTy = MRI.getType(MI.getOperand(2).getReg());
+ Register NewImm = Builder.buildConstant(ImmTy, Imm).getReg(0);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(MatchInfo.Reg);
+ MI.getOperand(2).setReg(NewImm);
+ Observer.changedInstr(MI);
+}
+
+bool CombinerHelper::matchShiftOfShiftedLogic(MachineInstr &MI,
+ ShiftOfShiftedLogic &MatchInfo) {
+ // We're trying to match the following pattern with any of
+ // G_SHL/G_ASHR/G_LSHR/G_USHLSAT/G_SSHLSAT shift instructions in combination
+ // with any of G_AND/G_OR/G_XOR logic instructions.
+ // %t1 = SHIFT %X, G_CONSTANT C0
+ // %t2 = LOGIC %t1, %Y
+ // %root = SHIFT %t2, G_CONSTANT C1
+ // -->
+ // %t3 = SHIFT %X, G_CONSTANT (C0+C1)
+ // %t4 = SHIFT %Y, G_CONSTANT C1
+ // %root = LOGIC %t3, %t4
+ unsigned ShiftOpcode = MI.getOpcode();
+ assert((ShiftOpcode == TargetOpcode::G_SHL ||
+ ShiftOpcode == TargetOpcode::G_ASHR ||
+ ShiftOpcode == TargetOpcode::G_LSHR ||
+ ShiftOpcode == TargetOpcode::G_USHLSAT ||
+ ShiftOpcode == TargetOpcode::G_SSHLSAT) &&
+ "Expected G_SHL, G_ASHR, G_LSHR, G_USHLSAT and G_SSHLSAT");
+
+ // Match a one-use bitwise logic op.
+ Register LogicDest = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(LogicDest))
+ return false;
+
+ MachineInstr *LogicMI = MRI.getUniqueVRegDef(LogicDest);
+ unsigned LogicOpcode = LogicMI->getOpcode();
+ if (LogicOpcode != TargetOpcode::G_AND && LogicOpcode != TargetOpcode::G_OR &&
+ LogicOpcode != TargetOpcode::G_XOR)
+ return false;
+
+ // Find a matching one-use shift by constant.
+ const Register C1 = MI.getOperand(2).getReg();
+ auto MaybeImmVal = getIConstantVRegValWithLookThrough(C1, MRI);
+ if (!MaybeImmVal)
+ return false;
+
+ const uint64_t C1Val = MaybeImmVal->Value.getZExtValue();
+
+ auto matchFirstShift = [&](const MachineInstr *MI, uint64_t &ShiftVal) {
+ // Shift should match the previous one and should have only one use.
+ if (MI->getOpcode() != ShiftOpcode ||
+ !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+ return false;
+
+ // Must be a constant.
+ auto MaybeImmVal =
+ getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
+ if (!MaybeImmVal)
+ return false;
+
+ ShiftVal = MaybeImmVal->Value.getSExtValue();
+ return true;
+ };
+
+ // Logic ops are commutative, so check each operand for a match.
+ Register LogicMIReg1 = LogicMI->getOperand(1).getReg();
+ MachineInstr *LogicMIOp1 = MRI.getUniqueVRegDef(LogicMIReg1);
+ Register LogicMIReg2 = LogicMI->getOperand(2).getReg();
+ MachineInstr *LogicMIOp2 = MRI.getUniqueVRegDef(LogicMIReg2);
+ uint64_t C0Val;
+
+ if (matchFirstShift(LogicMIOp1, C0Val)) {
+ MatchInfo.LogicNonShiftReg = LogicMIReg2;
+ MatchInfo.Shift2 = LogicMIOp1;
+ } else if (matchFirstShift(LogicMIOp2, C0Val)) {
+ MatchInfo.LogicNonShiftReg = LogicMIReg1;
+ MatchInfo.Shift2 = LogicMIOp2;
+ } else
+ return false;
+
+ MatchInfo.ValSum = C0Val + C1Val;
+
+ // The fold is not valid if the sum of the shift values exceeds bitwidth.
+ if (MatchInfo.ValSum >= MRI.getType(LogicDest).getScalarSizeInBits())
+ return false;
+
+ MatchInfo.Logic = LogicMI;
+ return true;
+}
+
+void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI,
+ ShiftOfShiftedLogic &MatchInfo) {
+ unsigned Opcode = MI.getOpcode();
+ assert((Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_ASHR ||
+ Opcode == TargetOpcode::G_LSHR || Opcode == TargetOpcode::G_USHLSAT ||
+ Opcode == TargetOpcode::G_SSHLSAT) &&
+ "Expected G_SHL, G_ASHR, G_LSHR, G_USHLSAT and G_SSHLSAT");
+
+ LLT ShlType = MRI.getType(MI.getOperand(2).getReg());
+ LLT DestType = MRI.getType(MI.getOperand(0).getReg());
+ Builder.setInstrAndDebugLoc(MI);
+
+ Register Const = Builder.buildConstant(ShlType, MatchInfo.ValSum).getReg(0);
+
+ Register Shift1Base = MatchInfo.Shift2->getOperand(1).getReg();
+ Register Shift1 =
+ Builder.buildInstr(Opcode, {DestType}, {Shift1Base, Const}).getReg(0);
+
+ // If LogicNonShiftReg is the same as Shift1Base and the shift1 constant is
+ // the same as the MatchInfo.Shift2 constant, CSEMIRBuilder will reuse the
+ // old shift1 when building shift2. In that case, erasing MatchInfo.Shift2 at
+ // the end would actually remove the reused shift1 and cause a crash later.
+ // So erase it earlier to avoid the crash.
+ MatchInfo.Shift2->eraseFromParent();
+
+ Register Shift2Const = MI.getOperand(2).getReg();
+ Register Shift2 = Builder
+ .buildInstr(Opcode, {DestType},
+ {MatchInfo.LogicNonShiftReg, Shift2Const})
+ .getReg(0);
+
+ Register Dest = MI.getOperand(0).getReg();
+ Builder.buildInstr(MatchInfo.Logic->getOpcode(), {Dest}, {Shift1, Shift2});
+
+ // This was one use so it's safe to remove it.
+ MatchInfo.Logic->eraseFromParent();
+
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCommuteShift(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected G_SHL");
+ // Combine (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+ // Combine (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+ auto &Shl = cast<GenericMachineInstr>(MI);
+ Register DstReg = Shl.getReg(0);
+ Register SrcReg = Shl.getReg(1);
+ Register ShiftReg = Shl.getReg(2);
+ Register X, C1;
+
+ if (!getTargetLowering().isDesirableToCommuteWithShift(MI, !isPreLegalize()))
+ return false;
+
+ if (!mi_match(SrcReg, MRI,
+ m_OneNonDBGUse(m_any_of(m_GAdd(m_Reg(X), m_Reg(C1)),
+ m_GOr(m_Reg(X), m_Reg(C1))))))
+ return false;
+
+ APInt C1Val, C2Val;
+ if (!mi_match(C1, MRI, m_ICstOrSplat(C1Val)) ||
+ !mi_match(ShiftReg, MRI, m_ICstOrSplat(C2Val)))
+ return false;
+
+ auto *SrcDef = MRI.getVRegDef(SrcReg);
+ assert((SrcDef->getOpcode() == TargetOpcode::G_ADD ||
+ SrcDef->getOpcode() == TargetOpcode::G_OR) && "Unexpected op");
+ LLT SrcTy = MRI.getType(SrcReg);
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto S1 = B.buildShl(SrcTy, X, ShiftReg);
+ auto S2 = B.buildShl(SrcTy, C1, ShiftReg);
+ B.buildInstr(SrcDef->getOpcode(), {DstReg}, {S1, S2});
+ };
+ return true;
+}
+
+bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
+ unsigned &ShiftVal) {
+ assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
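+ // Match a multiplication by a power-of-two constant so it can be rewritten
+ // as a left shift, e.g.:
+ // %c:_(s32) = G_CONSTANT i32 8
+ // %r:_(s32) = G_MUL %x, %c
+ // -->
+ // %s:_(s32) = G_CONSTANT i32 3
+ // %r:_(s32) = G_SHL %x, %s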
+ auto MaybeImmVal =
+ getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeImmVal)
+ return false;
+
+ ShiftVal = MaybeImmVal->Value.exactLogBase2();
+ return (static_cast<int32_t>(ShiftVal) != -1);
+}
+
+void CombinerHelper::applyCombineMulToShl(MachineInstr &MI,
+ unsigned &ShiftVal) {
+ assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
+ MachineIRBuilder MIB(MI);
+ LLT ShiftTy = MRI.getType(MI.getOperand(0).getReg());
+ auto ShiftCst = MIB.buildConstant(ShiftTy, ShiftVal);
+ Observer.changingInstr(MI);
+ MI.setDesc(MIB.getTII().get(TargetOpcode::G_SHL));
+ MI.getOperand(2).setReg(ShiftCst.getReg(0));
+ Observer.changedInstr(MI);
+}
+
+// shl ([sza]ext x), y => zext (shl x, y), if shift does not overflow source
+bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI,
+ RegisterImmPair &MatchData) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHL && KB);
+
+ Register LHS = MI.getOperand(1).getReg();
+
+ Register ExtSrc;
+ if (!mi_match(LHS, MRI, m_GAnyExt(m_Reg(ExtSrc))) &&
+ !mi_match(LHS, MRI, m_GZExt(m_Reg(ExtSrc))) &&
+ !mi_match(LHS, MRI, m_GSExt(m_Reg(ExtSrc))))
+ return false;
+
+ Register RHS = MI.getOperand(2).getReg();
+ MachineInstr *MIShiftAmt = MRI.getVRegDef(RHS);
+ auto MaybeShiftAmtVal = isConstantOrConstantSplatVector(*MIShiftAmt, MRI);
+ if (!MaybeShiftAmtVal)
+ return false;
+
+ if (LI) {
+ LLT SrcTy = MRI.getType(ExtSrc);
+
+ // We only really care about the legality with the shifted value. We can
+ // pick any type for the constant shift amount, so ask the target what to
+ // use. Otherwise we would have to guess and hope it is reported as legal.
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(SrcTy);
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SHL, {SrcTy, ShiftAmtTy}}))
+ return false;
+ }
+
+ int64_t ShiftAmt = MaybeShiftAmtVal->getSExtValue();
+ MatchData.Reg = ExtSrc;
+ MatchData.Imm = ShiftAmt;
+
+ unsigned MinLeadingZeros = KB->getKnownZeroes(ExtSrc).countl_one();
+ unsigned SrcTySize = MRI.getType(ExtSrc).getScalarSizeInBits();
+ return MinLeadingZeros >= ShiftAmt && ShiftAmt < SrcTySize;
+}
+
+void CombinerHelper::applyCombineShlOfExtend(MachineInstr &MI,
+ const RegisterImmPair &MatchData) {
+ Register ExtSrcReg = MatchData.Reg;
+ int64_t ShiftAmtVal = MatchData.Imm;
+
+ LLT ExtSrcTy = MRI.getType(ExtSrcReg);
+ Builder.setInstrAndDebugLoc(MI);
+ auto ShiftAmt = Builder.buildConstant(ExtSrcTy, ShiftAmtVal);
+ auto NarrowShift =
+ Builder.buildShl(ExtSrcTy, ExtSrcReg, ShiftAmt, MI.getFlags());
+ Builder.buildZExt(MI.getOperand(0), NarrowShift);
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineMergeUnmerge(MachineInstr &MI,
+ Register &MatchInfo) {
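+ // Fold away a merge whose sources are exactly the results of a single
+ // unmerge, in the original order:
+ // %a:_(s32), %b:_(s32) = G_UNMERGE_VALUES %x:_(s64)
+ // %m:_(s64) = G_MERGE_VALUES %a:_(s32), %b:_(s32)
+ // Here %m can simply be replaced by %x.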
+ GMerge &Merge = cast<GMerge>(MI);
+ SmallVector<Register, 16> MergedValues;
+ for (unsigned I = 0; I < Merge.getNumSources(); ++I)
+ MergedValues.emplace_back(Merge.getSourceReg(I));
+
+ auto *Unmerge = getOpcodeDef<GUnmerge>(MergedValues[0], MRI);
+ if (!Unmerge || Unmerge->getNumDefs() != Merge.getNumSources())
+ return false;
+
+ for (unsigned I = 0; I < MergedValues.size(); ++I)
+ if (MergedValues[I] != Unmerge->getReg(I))
+ return false;
+
+ MatchInfo = Unmerge->getSourceReg();
+ return true;
+}
+
+static Register peekThroughBitcast(Register Reg,
+ const MachineRegisterInfo &MRI) {
+ while (mi_match(Reg, MRI, m_GBitcast(m_Reg(Reg))))
+ ;
+
+ return Reg;
+}
+
+bool CombinerHelper::matchCombineUnmergeMergeToPlainValues(
+ MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
+ assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+ "Expected an unmerge");
+ auto &Unmerge = cast<GUnmerge>(MI);
+ Register SrcReg = peekThroughBitcast(Unmerge.getSourceReg(), MRI);
+
+ auto *SrcInstr = getOpcodeDef<GMergeLikeInstr>(SrcReg, MRI);
+ if (!SrcInstr)
+ return false;
+
+ // Check the source type of the merge.
+ LLT SrcMergeTy = MRI.getType(SrcInstr->getSourceReg(0));
+ LLT Dst0Ty = MRI.getType(Unmerge.getReg(0));
+ bool SameSize = Dst0Ty.getSizeInBits() == SrcMergeTy.getSizeInBits();
+ if (SrcMergeTy != Dst0Ty && !SameSize)
+ return false;
+ // They are the same now (modulo a bitcast).
+ // We can collect all the src registers.
+ for (unsigned Idx = 0; Idx < SrcInstr->getNumSources(); ++Idx)
+ Operands.push_back(SrcInstr->getSourceReg(Idx));
+ return true;
+}
+
+void CombinerHelper::applyCombineUnmergeMergeToPlainValues(
+ MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
+ assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+ "Expected an unmerge");
+ assert((MI.getNumOperands() - 1 == Operands.size()) &&
+ "Not enough operands to replace all defs");
+ unsigned NumElems = MI.getNumOperands() - 1;
+
+ LLT SrcTy = MRI.getType(Operands[0]);
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ bool CanReuseInputDirectly = DstTy == SrcTy;
+ Builder.setInstrAndDebugLoc(MI);
+ for (unsigned Idx = 0; Idx < NumElems; ++Idx) {
+ Register DstReg = MI.getOperand(Idx).getReg();
+ Register SrcReg = Operands[Idx];
+
+ // This combine may run after RegBankSelect, so we need to be aware of
+ // register banks.
+ const auto &DstCB = MRI.getRegClassOrRegBank(DstReg);
+ if (!DstCB.isNull() && DstCB != MRI.getRegClassOrRegBank(SrcReg)) {
+ SrcReg = Builder.buildCopy(MRI.getType(SrcReg), SrcReg).getReg(0);
+ MRI.setRegClassOrRegBank(SrcReg, DstCB);
+ }
+
+ if (CanReuseInputDirectly)
+ replaceRegWith(MRI, DstReg, SrcReg);
+ else
+ Builder.buildCast(DstReg, SrcReg);
+ }
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineUnmergeConstant(MachineInstr &MI,
+ SmallVectorImpl<APInt> &Csts) {
+ unsigned SrcIdx = MI.getNumOperands() - 1;
+ Register SrcReg = MI.getOperand(SrcIdx).getReg();
+ MachineInstr *SrcInstr = MRI.getVRegDef(SrcReg);
+ if (SrcInstr->getOpcode() != TargetOpcode::G_CONSTANT &&
+ SrcInstr->getOpcode() != TargetOpcode::G_FCONSTANT)
+ return false;
+ // Break down the big constant into smaller ones.
+ const MachineOperand &CstVal = SrcInstr->getOperand(1);
+ APInt Val = SrcInstr->getOpcode() == TargetOpcode::G_CONSTANT
+ ? CstVal.getCImm()->getValue()
+ : CstVal.getFPImm()->getValueAPF().bitcastToAPInt();
+
+ LLT Dst0Ty = MRI.getType(MI.getOperand(0).getReg());
+ unsigned ShiftAmt = Dst0Ty.getSizeInBits();
+ // Unmerge a constant.
+ for (unsigned Idx = 0; Idx != SrcIdx; ++Idx) {
+ Csts.emplace_back(Val.trunc(ShiftAmt));
+ Val = Val.lshr(ShiftAmt);
+ }
+
+ return true;
+}
+
+void CombinerHelper::applyCombineUnmergeConstant(MachineInstr &MI,
+ SmallVectorImpl<APInt> &Csts) {
+ assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+ "Expected an unmerge");
+ assert((MI.getNumOperands() - 1 == Csts.size()) &&
+ "Not enough operands to replace all defs");
+ unsigned NumElems = MI.getNumOperands() - 1;
+ Builder.setInstrAndDebugLoc(MI);
+ for (unsigned Idx = 0; Idx < NumElems; ++Idx) {
+ Register DstReg = MI.getOperand(Idx).getReg();
+ Builder.buildConstant(DstReg, Csts[Idx]);
+ }
+
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineUnmergeUndef(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ unsigned SrcIdx = MI.getNumOperands() - 1;
+ Register SrcReg = MI.getOperand(SrcIdx).getReg();
+ MatchInfo = [&MI](MachineIRBuilder &B) {
+ unsigned NumElems = MI.getNumOperands() - 1;
+ for (unsigned Idx = 0; Idx < NumElems; ++Idx) {
+ Register DstReg = MI.getOperand(Idx).getReg();
+ B.buildUndef(DstReg);
+ }
+ };
+ return isa<GImplicitDef>(MRI.getVRegDef(SrcReg));
+}
+
+bool CombinerHelper::matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+ "Expected an unmerge");
+ // Check that all the lanes are dead except the first one.
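+ // If so, the whole unmerge is equivalent to truncating the source down to
+ // the type of the first definition (see the apply below).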
+ for (unsigned Idx = 1, EndIdx = MI.getNumDefs(); Idx != EndIdx; ++Idx) {
+ if (!MRI.use_nodbg_empty(MI.getOperand(Idx).getReg()))
+ return false;
+ }
+ return true;
+}
+
+void CombinerHelper::applyCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) {
+ Builder.setInstrAndDebugLoc(MI);
+ Register SrcReg = MI.getOperand(MI.getNumDefs()).getReg();
+ // Truncating a vector is going to truncate every single lane,
+ // whereas we want the full low bits.
+ // Do the operation on a scalar instead.
+ LLT SrcTy = MRI.getType(SrcReg);
+ if (SrcTy.isVector())
+ SrcReg =
+ Builder.buildCast(LLT::scalar(SrcTy.getSizeInBits()), SrcReg).getReg(0);
+
+ Register Dst0Reg = MI.getOperand(0).getReg();
+ LLT Dst0Ty = MRI.getType(Dst0Reg);
+ if (Dst0Ty.isVector()) {
+ auto MIB = Builder.buildTrunc(LLT::scalar(Dst0Ty.getSizeInBits()), SrcReg);
+ Builder.buildCast(Dst0Reg, MIB);
+ } else
+ Builder.buildTrunc(Dst0Reg, SrcReg);
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineUnmergeZExtToZExt(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+ "Expected an unmerge");
+ Register Dst0Reg = MI.getOperand(0).getReg();
+ LLT Dst0Ty = MRI.getType(Dst0Reg);
+ // G_ZEXT on a vector applies to each lane, so it will
+ // affect all destinations. Therefore we won't be able
+ // to simplify the unmerge to just the first definition.
+ if (Dst0Ty.isVector())
+ return false;
+ Register SrcReg = MI.getOperand(MI.getNumDefs()).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ if (SrcTy.isVector())
+ return false;
+
+ Register ZExtSrcReg;
+ if (!mi_match(SrcReg, MRI, m_GZExt(m_Reg(ZExtSrcReg))))
+ return false;
+
+ // Finally, we can replace the first definition with
+ // a zext of the source if the definition is big enough to hold
+ // all of ZExtSrc's bits.
+ LLT ZExtSrcTy = MRI.getType(ZExtSrcReg);
+ return ZExtSrcTy.getSizeInBits() <= Dst0Ty.getSizeInBits();
+}
+
+void CombinerHelper::applyCombineUnmergeZExtToZExt(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
+ "Expected an unmerge");
+
+ Register Dst0Reg = MI.getOperand(0).getReg();
+
+ MachineInstr *ZExtInstr =
+ MRI.getVRegDef(MI.getOperand(MI.getNumDefs()).getReg());
+ assert(ZExtInstr && ZExtInstr->getOpcode() == TargetOpcode::G_ZEXT &&
+ "Expecting a G_ZEXT");
+
+ Register ZExtSrcReg = ZExtInstr->getOperand(1).getReg();
+ LLT Dst0Ty = MRI.getType(Dst0Reg);
+ LLT ZExtSrcTy = MRI.getType(ZExtSrcReg);
+
+ Builder.setInstrAndDebugLoc(MI);
+
+ if (Dst0Ty.getSizeInBits() > ZExtSrcTy.getSizeInBits()) {
+ Builder.buildZExt(Dst0Reg, ZExtSrcReg);
+ } else {
+ assert(Dst0Ty.getSizeInBits() == ZExtSrcTy.getSizeInBits() &&
+ "ZExt src doesn't fit in destination");
+ replaceRegWith(MRI, Dst0Reg, ZExtSrcReg);
+ }
+
+ Register ZeroReg;
+ for (unsigned Idx = 1, EndIdx = MI.getNumDefs(); Idx != EndIdx; ++Idx) {
+ if (!ZeroReg)
+ ZeroReg = Builder.buildConstant(Dst0Ty, 0).getReg(0);
+ replaceRegWith(MRI, MI.getOperand(Idx).getReg(), ZeroReg);
+ }
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
+ unsigned TargetShiftSize,
+ unsigned &ShiftVal) {
+ assert((MI.getOpcode() == TargetOpcode::G_SHL ||
+ MI.getOpcode() == TargetOpcode::G_LSHR ||
+ MI.getOpcode() == TargetOpcode::G_ASHR) && "Expected a shift");
+
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if (Ty.isVector()) // TODO:
+ return false;
+
+ // Don't narrow further than the requested size.
+ unsigned Size = Ty.getSizeInBits();
+ if (Size <= TargetShiftSize)
+ return false;
+
+ auto MaybeImmVal =
+ getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeImmVal)
+ return false;
+
+ ShiftVal = MaybeImmVal->Value.getSExtValue();
+ return ShiftVal >= Size / 2 && ShiftVal < Size;
+}
+
+void CombinerHelper::applyCombineShiftToUnmerge(MachineInstr &MI,
+ const unsigned &ShiftVal) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(SrcReg);
+ unsigned Size = Ty.getSizeInBits();
+ unsigned HalfSize = Size / 2;
+ assert(ShiftVal >= HalfSize);
+
+ LLT HalfTy = LLT::scalar(HalfSize);
+
+ Builder.setInstr(MI);
+ auto Unmerge = Builder.buildUnmerge(HalfTy, SrcReg);
+ unsigned NarrowShiftAmt = ShiftVal - HalfSize;
+
+ if (MI.getOpcode() == TargetOpcode::G_LSHR) {
+ Register Narrowed = Unmerge.getReg(1);
+
+ // dst = G_LSHR s64:x, C for C >= 32
+ // =>
+ // lo, hi = G_UNMERGE_VALUES x
+ // dst = G_MERGE_VALUES (G_LSHR hi, C - 32), 0
+
+ if (NarrowShiftAmt != 0) {
+ Narrowed = Builder.buildLShr(HalfTy, Narrowed,
+ Builder.buildConstant(HalfTy, NarrowShiftAmt)).getReg(0);
+ }
+
+ auto Zero = Builder.buildConstant(HalfTy, 0);
+ Builder.buildMergeLikeInstr(DstReg, {Narrowed, Zero});
+ } else if (MI.getOpcode() == TargetOpcode::G_SHL) {
+ Register Narrowed = Unmerge.getReg(0);
+ // dst = G_SHL s64:x, C for C >= 32
+ // =>
+ // lo, hi = G_UNMERGE_VALUES x
+ // dst = G_MERGE_VALUES 0, (G_SHL hi, C - 32)
+ if (NarrowShiftAmt != 0) {
+ Narrowed = Builder.buildShl(HalfTy, Narrowed,
+ Builder.buildConstant(HalfTy, NarrowShiftAmt)).getReg(0);
+ }
+
+ auto Zero = Builder.buildConstant(HalfTy, 0);
+ Builder.buildMergeLikeInstr(DstReg, {Zero, Narrowed});
+ } else {
+ assert(MI.getOpcode() == TargetOpcode::G_ASHR);
+ auto Hi = Builder.buildAShr(
+ HalfTy, Unmerge.getReg(1),
+ Builder.buildConstant(HalfTy, HalfSize - 1));
+
+ if (ShiftVal == HalfSize) {
+ // (G_ASHR i64:x, 32) ->
+ // G_MERGE_VALUES hi_32(x), (G_ASHR hi_32(x), 31)
+ Builder.buildMergeLikeInstr(DstReg, {Unmerge.getReg(1), Hi});
+ } else if (ShiftVal == Size - 1) {
+ // Don't need a second shift.
+ // (G_ASHR i64:x, 63) ->
+ // %narrowed = (G_ASHR hi_32(x), 31)
+ // G_MERGE_VALUES %narrowed, %narrowed
+ Builder.buildMergeLikeInstr(DstReg, {Hi, Hi});
+ } else {
+ auto Lo = Builder.buildAShr(
+ HalfTy, Unmerge.getReg(1),
+ Builder.buildConstant(HalfTy, ShiftVal - HalfSize));
+
+ // (G_ASHR i64:x, C) ->, for C >= 32
+ // G_MERGE_VALUES (G_ASHR hi_32(x), C - 32), (G_ASHR hi_32(x), 31)
+ Builder.buildMergeLikeInstr(DstReg, {Lo, Hi});
+ }
+ }
+
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::tryCombineShiftToUnmerge(MachineInstr &MI,
+ unsigned TargetShiftAmount) {
+ unsigned ShiftAmt;
+ if (matchCombineShiftToUnmerge(MI, TargetShiftAmount, ShiftAmt)) {
+ applyCombineShiftToUnmerge(MI, ShiftAmt);
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchCombineI2PToP2I(MachineInstr &MI, Register &Reg) {
+ assert(MI.getOpcode() == TargetOpcode::G_INTTOPTR && "Expected a G_INTTOPTR");
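+ // Fold inttoptr(ptrtoint x) -> x when the result pointer type matches the
+ // type of x, e.g.:
+ // %i:_(s64) = G_PTRTOINT %x:_(p0)
+ // %p:_(p0) = G_INTTOPTR %i:_(s64)
+ // Here %p is just a copy of %x.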
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ Register SrcReg = MI.getOperand(1).getReg();
+ return mi_match(SrcReg, MRI,
+ m_GPtrToInt(m_all_of(m_SpecificType(DstTy), m_Reg(Reg))));
+}
+
+void CombinerHelper::applyCombineI2PToP2I(MachineInstr &MI, Register &Reg) {
+ assert(MI.getOpcode() == TargetOpcode::G_INTTOPTR && "Expected a G_INTTOPTR");
+ Register DstReg = MI.getOperand(0).getReg();
+ Builder.setInstr(MI);
+ Builder.buildCopy(DstReg, Reg);
+ MI.eraseFromParent();
+}
+
+void CombinerHelper::applyCombineP2IToI2P(MachineInstr &MI, Register &Reg) {
+ assert(MI.getOpcode() == TargetOpcode::G_PTRTOINT && "Expected a G_PTRTOINT");
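+ // Fold ptrtoint(inttoptr x) -> x, zero-extending or truncating x if the
+ // integer widths differ.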
+ Register DstReg = MI.getOperand(0).getReg();
+ Builder.setInstr(MI);
+ Builder.buildZExtOrTrunc(DstReg, Reg);
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineAddP2IToPtrAdd(
+ MachineInstr &MI, std::pair<Register, bool> &PtrReg) {
+ assert(MI.getOpcode() == TargetOpcode::G_ADD);
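+ // Match (add (ptrtoint %ptr), %int) so that it can be rewritten as a
+ // ptrtoint of a G_PTR_ADD, e.g.:
+ // %i:_(s64) = G_PTRTOINT %ptr:_(p0)
+ // %a:_(s64) = G_ADD %i, %int
+ // -->
+ // %q:_(p0) = G_PTR_ADD %ptr, %int
+ // %a:_(s64) = G_PTRTOINT %q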
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ LLT IntTy = MRI.getType(LHS);
+
+ // G_PTR_ADD always has the pointer in the LHS, so we may need to commute the
+ // instruction.
+ PtrReg.second = false;
+ for (Register SrcReg : {LHS, RHS}) {
+ if (mi_match(SrcReg, MRI, m_GPtrToInt(m_Reg(PtrReg.first)))) {
+ // Don't handle cases where the integer is implicitly converted to the
+ // pointer width.
+ LLT PtrTy = MRI.getType(PtrReg.first);
+ if (PtrTy.getScalarSizeInBits() == IntTy.getScalarSizeInBits())
+ return true;
+ }
+
+ PtrReg.second = true;
+ }
+
+ return false;
+}
+
+void CombinerHelper::applyCombineAddP2IToPtrAdd(
+ MachineInstr &MI, std::pair<Register, bool> &PtrReg) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+
+ const bool DoCommute = PtrReg.second;
+ if (DoCommute)
+ std::swap(LHS, RHS);
+ LHS = PtrReg.first;
+
+ LLT PtrTy = MRI.getType(LHS);
+
+ Builder.setInstrAndDebugLoc(MI);
+ auto PtrAdd = Builder.buildPtrAdd(PtrTy, LHS, RHS);
+ Builder.buildPtrToInt(Dst, PtrAdd);
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI,
+ APInt &NewCst) {
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ Register LHS = PtrAdd.getBaseReg();
+ Register RHS = PtrAdd.getOffsetReg();
+ MachineRegisterInfo &MRI = Builder.getMF().getRegInfo();
+
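+ // Fold a G_PTR_ADD whose base is a constant G_INTTOPTR and whose offset is
+ // a constant into a single constant address, e.g.:
+ // %p:_(p0) = G_INTTOPTR %c1
+ // %q:_(p0) = G_PTR_ADD %p, %c2
+ // can be replaced by a constant with value (zext(c1) + sext(c2)).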
+ if (auto RHSCst = getIConstantVRegVal(RHS, MRI)) {
+ APInt Cst;
+ if (mi_match(LHS, MRI, m_GIntToPtr(m_ICst(Cst)))) {
+ auto DstTy = MRI.getType(PtrAdd.getReg(0));
+ // G_INTTOPTR uses zero-extension
+ NewCst = Cst.zextOrTrunc(DstTy.getSizeInBits());
+ NewCst += RHSCst->sextOrTrunc(DstTy.getSizeInBits());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void CombinerHelper::applyCombineConstPtrAddToI2P(MachineInstr &MI,
+ APInt &NewCst) {
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ Register Dst = PtrAdd.getReg(0);
+
+ Builder.setInstrAndDebugLoc(MI);
+ Builder.buildConstant(Dst, NewCst);
+ PtrAdd.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineAnyExtTrunc(MachineInstr &MI, Register &Reg) {
+ assert(MI.getOpcode() == TargetOpcode::G_ANYEXT && "Expected a G_ANYEXT");
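+ // Fold anyext(trunc x) -> x when the result type matches the type of x:
+ // %t:_(s32) = G_TRUNC %x:_(s64)
+ // %e:_(s64) = G_ANYEXT %t:_(s32)
+ // Here %e has the same value as %x.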
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ return mi_match(SrcReg, MRI,
+ m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))));
+}
+
+bool CombinerHelper::matchCombineZextTrunc(MachineInstr &MI, Register &Reg) {
+ assert(MI.getOpcode() == TargetOpcode::G_ZEXT && "Expected a G_ZEXT");
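+ // Fold zext(trunc x) -> x when the result type matches the type of x and
+ // the known leading zero bits of x already cover every bit the truncate
+ // would discard.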
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (mi_match(SrcReg, MRI,
+ m_GTrunc(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))))) {
+ unsigned DstSize = DstTy.getScalarSizeInBits();
+ unsigned SrcSize = MRI.getType(SrcReg).getScalarSizeInBits();
+ return KB->getKnownBits(Reg).countMinLeadingZeros() >= DstSize - SrcSize;
+ }
+ return false;
+}
+
+bool CombinerHelper::matchCombineExtOfExt(
+ MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo) {
+ assert((MI.getOpcode() == TargetOpcode::G_ANYEXT ||
+ MI.getOpcode() == TargetOpcode::G_SEXT ||
+ MI.getOpcode() == TargetOpcode::G_ZEXT) &&
+ "Expected a G_[ASZ]EXT");
+ Register SrcReg = MI.getOperand(1).getReg();
+ MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+ // Match exts with the same opcode, anyext([sz]ext) and sext(zext).
+ unsigned Opc = MI.getOpcode();
+ unsigned SrcOpc = SrcMI->getOpcode();
+ if (Opc == SrcOpc ||
+ (Opc == TargetOpcode::G_ANYEXT &&
+ (SrcOpc == TargetOpcode::G_SEXT || SrcOpc == TargetOpcode::G_ZEXT)) ||
+ (Opc == TargetOpcode::G_SEXT && SrcOpc == TargetOpcode::G_ZEXT)) {
+ MatchInfo = std::make_tuple(SrcMI->getOperand(1).getReg(), SrcOpc);
+ return true;
+ }
+ return false;
+}
+
+void CombinerHelper::applyCombineExtOfExt(
+ MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo) {
+ assert((MI.getOpcode() == TargetOpcode::G_ANYEXT ||
+ MI.getOpcode() == TargetOpcode::G_SEXT ||
+ MI.getOpcode() == TargetOpcode::G_ZEXT) &&
+ "Expected a G_[ASZ]EXT");
+
+ Register Reg = std::get<0>(MatchInfo);
+ unsigned SrcExtOp = std::get<1>(MatchInfo);
+
+ // Combine exts with the same opcode.
+ if (MI.getOpcode() == SrcExtOp) {
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Reg);
+ Observer.changedInstr(MI);
+ return;
+ }
+
+ // Combine:
+ // - anyext([sz]ext x) to [sz]ext x
+ // - sext(zext x) to zext x
+ if (MI.getOpcode() == TargetOpcode::G_ANYEXT ||
+ (MI.getOpcode() == TargetOpcode::G_SEXT &&
+ SrcExtOp == TargetOpcode::G_ZEXT)) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Builder.setInstrAndDebugLoc(MI);
+ Builder.buildInstr(SrcExtOp, {DstReg}, {Reg});
+ MI.eraseFromParent();
+ }
+}
+
+void CombinerHelper::applyCombineMulByNegativeOne(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
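+ // Replace (mul x, -1) with (sub 0, x).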
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ Builder.setInstrAndDebugLoc(MI);
+ Builder.buildSub(DstReg, Builder.buildConstant(DstTy, 0), SrcReg,
+ MI.getFlags());
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchCombineFAbsOfFNeg(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FABS && "Expected a G_FABS");
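+ // Fold fabs(fneg x) -> fabs(x); the negation is irrelevant under fabs.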
+ Register Src = MI.getOperand(1).getReg();
+ Register NegSrc;
+
+ if (!mi_match(Src, MRI, m_GFNeg(m_Reg(NegSrc))))
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(NegSrc);
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
+bool CombinerHelper::matchCombineTruncOfExt(
+ MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC");
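+ // Match trunc([asz]ext x). Depending on the relative sizes, the pair can be
+ // replaced by x itself, a narrower extend of x, or a single truncate of x
+ // (see applyCombineTruncOfExt below).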
+ Register SrcReg = MI.getOperand(1).getReg();
+ MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+ unsigned SrcOpc = SrcMI->getOpcode();
+ if (SrcOpc == TargetOpcode::G_ANYEXT || SrcOpc == TargetOpcode::G_SEXT ||
+ SrcOpc == TargetOpcode::G_ZEXT) {
+ MatchInfo = std::make_pair(SrcMI->getOperand(1).getReg(), SrcOpc);
+ return true;
+ }
+ return false;
+}
+
+void CombinerHelper::applyCombineTruncOfExt(
+ MachineInstr &MI, std::pair<Register, unsigned> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC");
+ Register SrcReg = MatchInfo.first;
+ unsigned SrcExtOp = MatchInfo.second;
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT DstTy = MRI.getType(DstReg);
+ if (SrcTy == DstTy) {
+ MI.eraseFromParent();
+ replaceRegWith(MRI, DstReg, SrcReg);
+ return;
+ }
+ Builder.setInstrAndDebugLoc(MI);
+ if (SrcTy.getSizeInBits() < DstTy.getSizeInBits())
+ Builder.buildInstr(SrcExtOp, {DstReg}, {SrcReg});
+ else
+ Builder.buildTrunc(DstReg, SrcReg);
+ MI.eraseFromParent();
+}
+
+static LLT getMidVTForTruncRightShiftCombine(LLT ShiftTy, LLT TruncTy) {
+ const unsigned ShiftSize = ShiftTy.getScalarSizeInBits();
+ const unsigned TruncSize = TruncTy.getScalarSizeInBits();
+
+ // ShiftTy > 32 > TruncTy -> 32
+ if (ShiftSize > 32 && TruncSize < 32)
+ return ShiftTy.changeElementSize(32);
+
+ // TODO: We could also reduce to 16 bits, but that's more target-dependent.
+ // Some targets like it, some don't, some only like it under certain
+ // conditions/processor versions, etc.
+ // A TargetLowering hook might be needed for this.
+
+ // Don't combine
+ return ShiftTy;
+}
+
+bool CombinerHelper::matchCombineTruncOfShift(
+ MachineInstr &MI, std::pair<MachineInstr *, LLT> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Expected a G_TRUNC");
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ if (!MRI.hasOneNonDBGUse(SrcReg))
+ return false;
+
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT DstTy = MRI.getType(DstReg);
+
+ MachineInstr *SrcMI = getDefIgnoringCopies(SrcReg, MRI);
+ const auto &TL = getTargetLowering();
+
+ LLT NewShiftTy;
+ switch (SrcMI->getOpcode()) {
+ default:
+ return false;
+ case TargetOpcode::G_SHL: {
+ NewShiftTy = DstTy;
+
+ // Make sure new shift amount is legal.
+ KnownBits Known = KB->getKnownBits(SrcMI->getOperand(2).getReg());
+ if (Known.getMaxValue().uge(NewShiftTy.getScalarSizeInBits()))
+ return false;
+ break;
+ }
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR: {
+ // For right shifts, we conservatively do not do the transform if the TRUNC
+ // has any STORE users. The reason is that if we change the type of the
+ // shift, we may break the truncstore combine.
+ //
+ // TODO: Fix truncstore combine to handle (trunc(lshr (trunc x), k)).
+ for (auto &User : MRI.use_instructions(DstReg))
+ if (User.getOpcode() == TargetOpcode::G_STORE)
+ return false;
+
+ NewShiftTy = getMidVTForTruncRightShiftCombine(SrcTy, DstTy);
+ if (NewShiftTy == SrcTy)
+ return false;
+
+ // Make sure we won't lose information by truncating the high bits.
+ KnownBits Known = KB->getKnownBits(SrcMI->getOperand(2).getReg());
+ if (Known.getMaxValue().ugt(NewShiftTy.getScalarSizeInBits() -
+ DstTy.getScalarSizeInBits()))
+ return false;
+ break;
+ }
+ }
+
+ if (!isLegalOrBeforeLegalizer(
+ {SrcMI->getOpcode(),
+ {NewShiftTy, TL.getPreferredShiftAmountTy(NewShiftTy)}}))
+ return false;
+
+ MatchInfo = std::make_pair(SrcMI, NewShiftTy);
+ return true;
+}
+
+void CombinerHelper::applyCombineTruncOfShift(
+ MachineInstr &MI, std::pair<MachineInstr *, LLT> &MatchInfo) {
+ Builder.setInstrAndDebugLoc(MI);
+
+ MachineInstr *ShiftMI = MatchInfo.first;
+ LLT NewShiftTy = MatchInfo.second;
+
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+
+ Register ShiftAmt = ShiftMI->getOperand(2).getReg();
+ Register ShiftSrc = ShiftMI->getOperand(1).getReg();
+ ShiftSrc = Builder.buildTrunc(NewShiftTy, ShiftSrc).getReg(0);
+
+ Register NewShift =
+ Builder
+ .buildInstr(ShiftMI->getOpcode(), {NewShiftTy}, {ShiftSrc, ShiftAmt})
+ .getReg(0);
+
+ if (NewShiftTy == DstTy)
+ replaceRegWith(MRI, Dst, NewShift);
+ else
+ Builder.buildTrunc(Dst, NewShift);
+
+ eraseInst(MI);
+}
+
+bool CombinerHelper::matchAnyExplicitUseIsUndef(MachineInstr &MI) {
+ return any_of(MI.explicit_uses(), [this](const MachineOperand &MO) {
+ return MO.isReg() &&
+ getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
+ });
+}
+
+bool CombinerHelper::matchAllExplicitUsesAreUndef(MachineInstr &MI) {
+ return all_of(MI.explicit_uses(), [this](const MachineOperand &MO) {
+ return !MO.isReg() ||
+ getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
+ });
+}
+
+bool CombinerHelper::matchUndefShuffleVectorMask(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ return all_of(Mask, [](int Elt) { return Elt < 0; });
+}
+
+bool CombinerHelper::matchUndefStore(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_STORE);
+ return getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MI.getOperand(0).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::matchUndefSelectCmp(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SELECT);
+ return getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MI.getOperand(1).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::matchInsertExtractVecEltOutOfBounds(MachineInstr &MI) {
+ assert((MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT ||
+ MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) &&
+ "Expected an insert/extract element op");
+ LLT VecTy = MRI.getType(MI.getOperand(1).getReg());
+ unsigned IdxIdx =
+ MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
+ auto Idx = getIConstantVRegVal(MI.getOperand(IdxIdx).getReg(), MRI);
+ if (!Idx)
+ return false;
+ return Idx->getZExtValue() >= VecTy.getNumElements();
+}
+
+bool CombinerHelper::matchConstantSelectCmp(MachineInstr &MI, unsigned &OpIdx) {
+ GSelect &SelMI = cast<GSelect>(MI);
+ auto Cst =
+ isConstantOrConstantSplatVector(*MRI.getVRegDef(SelMI.getCondReg()), MRI);
+ if (!Cst)
+ return false;
+ OpIdx = Cst->isZero() ? 3 : 2;
+ return true;
+}
+
+void CombinerHelper::eraseInst(MachineInstr &MI) { MI.eraseFromParent(); }
+
+bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
+ const MachineOperand &MOP2) {
+ if (!MOP1.isReg() || !MOP2.isReg())
+ return false;
+ auto InstAndDef1 = getDefSrcRegIgnoringCopies(MOP1.getReg(), MRI);
+ if (!InstAndDef1)
+ return false;
+ auto InstAndDef2 = getDefSrcRegIgnoringCopies(MOP2.getReg(), MRI);
+ if (!InstAndDef2)
+ return false;
+ MachineInstr *I1 = InstAndDef1->MI;
+ MachineInstr *I2 = InstAndDef2->MI;
+
+ // Handle a case like this:
+ //
+ // %0:_(s64), %1:_(s64) = G_UNMERGE_VALUES %2:_(<2 x s64>)
+ //
+ // Even though %0 and %1 are produced by the same instruction they are not
+ // the same values.
+ if (I1 == I2)
+ return MOP1.getReg() == MOP2.getReg();
+
+ // If we have an instruction which loads or stores, we can't guarantee that
+ // it is identical.
+ //
+ // For example, we may have
+ //
+ // %x1 = G_LOAD %addr (load N from @somewhere)
+ // ...
+ // call @foo
+ // ...
+ // %x2 = G_LOAD %addr (load N from @somewhere)
+ // ...
+ // %or = G_OR %x1, %x2
+ //
+ // It's possible that @foo will modify whatever lives at the address we're
+ // loading from. To be safe, let's just assume that all loads and stores
+ // are different (unless we have something which is guaranteed to not
+ // change.)
+ if (I1->mayLoadOrStore() && !I1->isDereferenceableInvariantLoad())
+ return false;
+
+ // If both instructions are loads or stores, they are equal only if both
+ // are dereferenceable invariant loads with the same number of bits.
+ if (I1->mayLoadOrStore() && I2->mayLoadOrStore()) {
+ GLoadStore *LS1 = dyn_cast<GLoadStore>(I1);
+ GLoadStore *LS2 = dyn_cast<GLoadStore>(I2);
+ if (!LS1 || !LS2)
+ return false;
+
+ if (!I2->isDereferenceableInvariantLoad() ||
+ (LS1->getMemSizeInBits() != LS2->getMemSizeInBits()))
+ return false;
+ }
+
+ // Check for physical registers on the instructions first to avoid cases
+ // like this:
+ //
+ // %a = COPY $physreg
+ // ...
+ // SOMETHING implicit-def $physreg
+ // ...
+ // %b = COPY $physreg
+ //
+ // These copies are not equivalent.
+ if (any_of(I1->uses(), [](const MachineOperand &MO) {
+ return MO.isReg() && MO.getReg().isPhysical();
+ })) {
+ // Check if we have a case like this:
+ //
+ // %a = COPY $physreg
+ // %b = COPY %a
+ //
+ // In this case, I1 and I2 will both be equal to %a = COPY $physreg.
+ // From that, we know that they must have the same value, since they must
+ // have come from the same COPY.
+ return I1->isIdenticalTo(*I2);
+ }
+
+ // We don't have any physical registers, so we don't necessarily need the
+ // same vreg defs.
+ //
+ // On the off-chance that there's some target instruction feeding into the
+ // instruction, let's use produceSameValue instead of isIdenticalTo.
+ if (Builder.getTII().produceSameValue(*I1, *I2, &MRI)) {
+ // Handle instructions with multiple defs that produce the same values. The
+ // values are the same for operands with the same index.
+ // %0:_(s8), %1:_(s8), %2:_(s8), %3:_(s8) = G_UNMERGE_VALUES %4:_(<4 x s8>)
+ // %5:_(s8), %6:_(s8), %7:_(s8), %8:_(s8) = G_UNMERGE_VALUES %4:_(<4 x s8>)
+ // Here I1 and I2 are different instructions that produce the same values:
+ // %1 and %6 are the same value, while %1 and %7 are not.
+ return I1->findRegisterDefOperandIdx(InstAndDef1->Reg) ==
+ I2->findRegisterDefOperandIdx(InstAndDef2->Reg);
+ }
+ return false;
+}
+
+bool CombinerHelper::matchConstantOp(const MachineOperand &MOP, int64_t C) {
+ if (!MOP.isReg())
+ return false;
+ auto *MI = MRI.getVRegDef(MOP.getReg());
+ auto MaybeCst = isConstantOrConstantSplatVector(*MI, MRI);
+ return MaybeCst && MaybeCst->getBitWidth() <= 64 &&
+ MaybeCst->getSExtValue() == C;
+}
+
+void CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI,
+ unsigned OpIdx) {
+ assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?");
+ Register OldReg = MI.getOperand(0).getReg();
+ Register Replacement = MI.getOperand(OpIdx).getReg();
+ assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?");
+ MI.eraseFromParent();
+ replaceRegWith(MRI, OldReg, Replacement);
+}
+
+void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI,
+ Register Replacement) {
+ assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?");
+ Register OldReg = MI.getOperand(0).getReg();
+ assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?");
+ MI.eraseFromParent();
+ replaceRegWith(MRI, OldReg, Replacement);
+}
+
+bool CombinerHelper::matchSelectSameVal(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SELECT);
+ // Match (cond ? x : x)
+ return matchEqualDefs(MI.getOperand(2), MI.getOperand(3)) &&
+ canReplaceReg(MI.getOperand(0).getReg(), MI.getOperand(2).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::matchBinOpSameVal(MachineInstr &MI) {
+ return matchEqualDefs(MI.getOperand(1), MI.getOperand(2)) &&
+ canReplaceReg(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::matchOperandIsZero(MachineInstr &MI, unsigned OpIdx) {
+ return matchConstantOp(MI.getOperand(OpIdx), 0) &&
+ canReplaceReg(MI.getOperand(0).getReg(), MI.getOperand(OpIdx).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::matchOperandIsUndef(MachineInstr &MI, unsigned OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ return MO.isReg() &&
+ getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
+}
+
+bool CombinerHelper::matchOperandIsKnownToBeAPowerOfTwo(MachineInstr &MI,
+ unsigned OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ return isKnownToBeAPowerOfTwo(MO.getReg(), MRI, KB);
+}
+
+void CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) {
+ assert(MI.getNumDefs() == 1 && "Expected only one def?");
+ Builder.setInstr(MI);
+ Builder.buildFConstant(MI.getOperand(0), C);
+ MI.eraseFromParent();
+}
+
+void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) {
+ assert(MI.getNumDefs() == 1 && "Expected only one def?");
+ Builder.setInstr(MI);
+ Builder.buildConstant(MI.getOperand(0), C);
+ MI.eraseFromParent();
+}
+
+void CombinerHelper::replaceInstWithConstant(MachineInstr &MI, APInt C) {
+ assert(MI.getNumDefs() == 1 && "Expected only one def?");
+ Builder.setInstr(MI);
+ Builder.buildConstant(MI.getOperand(0), C);
+ MI.eraseFromParent();
+}
+
+void CombinerHelper::replaceInstWithUndef(MachineInstr &MI) {
+ assert(MI.getNumDefs() == 1 && "Expected only one def?");
+ Builder.setInstr(MI);
+ Builder.buildUndef(MI.getOperand(0));
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchSimplifyAddToSub(
+ MachineInstr &MI, std::tuple<Register, Register> &MatchInfo) {
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ Register &NewLHS = std::get<0>(MatchInfo);
+ Register &NewRHS = std::get<1>(MatchInfo);
+
+ // Helper lambda to check for opportunities for
+ // ((0-A) + B) -> B - A
+ // (A + (0-B)) -> A - B
+ auto CheckFold = [&](Register &MaybeSub, Register &MaybeNewLHS) {
+ if (!mi_match(MaybeSub, MRI, m_Neg(m_Reg(NewRHS))))
+ return false;
+ NewLHS = MaybeNewLHS;
+ return true;
+ };
+
+ return CheckFold(LHS, RHS) || CheckFold(RHS, LHS);
+}
+
+bool CombinerHelper::matchCombineInsertVecElts(
+ MachineInstr &MI, SmallVectorImpl<Register> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT &&
+ "Invalid opcode");
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ assert(DstTy.isVector() && "Invalid G_INSERT_VECTOR_ELT?");
+ unsigned NumElts = DstTy.getNumElements();
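+ // Illustrative example for a <2 x s32> destination: a chain
+ //   %v1 = G_INSERT_VECTOR_ELT %undef, %a, 0
+ //   %v2 = G_INSERT_VECTOR_ELT %v1, %b, 1
+ // can be rebuilt as %v2 = G_BUILD_VECTOR %a, %b once every lane is either
+ // written or known to be undef.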
+ // If this MI is part of a sequence of insert_vec_elts, then
+ // don't do the combine in the middle of the sequence.
+ if (MRI.hasOneUse(DstReg) && MRI.use_instr_begin(DstReg)->getOpcode() ==
+ TargetOpcode::G_INSERT_VECTOR_ELT)
+ return false;
+ MachineInstr *CurrInst = &MI;
+ MachineInstr *TmpInst;
+ int64_t IntImm;
+ Register TmpReg;
+ MatchInfo.resize(NumElts);
+ while (mi_match(
+ CurrInst->getOperand(0).getReg(), MRI,
+ m_GInsertVecElt(m_MInstr(TmpInst), m_Reg(TmpReg), m_ICst(IntImm)))) {
+ if (IntImm >= NumElts || IntImm < 0)
+ return false;
+ if (!MatchInfo[IntImm])
+ MatchInfo[IntImm] = TmpReg;
+ CurrInst = TmpInst;
+ }
+ // Variable index.
+ if (CurrInst->getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
+ return false;
+ if (TmpInst->getOpcode() == TargetOpcode::G_BUILD_VECTOR) {
+ for (unsigned I = 1; I < TmpInst->getNumOperands(); ++I) {
+ if (!MatchInfo[I - 1].isValid())
+ MatchInfo[I - 1] = TmpInst->getOperand(I).getReg();
+ }
+ return true;
+ }
+ // If we didn't end in a G_IMPLICIT_DEF, bail out.
+ return TmpInst->getOpcode() == TargetOpcode::G_IMPLICIT_DEF;
+}
+
+void CombinerHelper::applyCombineInsertVecElts(
+ MachineInstr &MI, SmallVectorImpl<Register> &MatchInfo) {
+ Builder.setInstr(MI);
+ Register UndefReg;
+ auto GetUndef = [&]() {
+ if (UndefReg)
+ return UndefReg;
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ UndefReg = Builder.buildUndef(DstTy.getScalarType()).getReg(0);
+ return UndefReg;
+ };
+ for (unsigned I = 0; I < MatchInfo.size(); ++I) {
+ if (!MatchInfo[I])
+ MatchInfo[I] = GetUndef();
+ }
+ Builder.buildBuildVector(MI.getOperand(0).getReg(), MatchInfo);
+ MI.eraseFromParent();
+}
+
+void CombinerHelper::applySimplifyAddToSub(
+ MachineInstr &MI, std::tuple<Register, Register> &MatchInfo) {
+ Builder.setInstr(MI);
+ Register SubLHS, SubRHS;
+ std::tie(SubLHS, SubRHS) = MatchInfo;
+ Builder.buildSub(MI.getOperand(0).getReg(), SubLHS, SubRHS);
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchHoistLogicOpWithSameOpcodeHands(
+ MachineInstr &MI, InstructionStepsMatchInfo &MatchInfo) {
+ // Matches: logic (hand x, ...), (hand y, ...) -> hand (logic x, y), ...
+ //
+ // Creates the new hand and logic instructions (but does not insert them).
+ //
+ // On success, MatchInfo is populated with the new instructions. These are
+ // inserted in applyHoistLogicOpWithSameOpcodeHands.
+ unsigned LogicOpcode = MI.getOpcode();
+ assert(LogicOpcode == TargetOpcode::G_AND ||
+ LogicOpcode == TargetOpcode::G_OR ||
+ LogicOpcode == TargetOpcode::G_XOR);
+ MachineIRBuilder MIB(MI);
+ Register Dst = MI.getOperand(0).getReg();
+ Register LHSReg = MI.getOperand(1).getReg();
+ Register RHSReg = MI.getOperand(2).getReg();
+
+ // Don't recompute anything.
+ if (!MRI.hasOneNonDBGUse(LHSReg) || !MRI.hasOneNonDBGUse(RHSReg))
+ return false;
+
+ // Make sure we have (hand x, ...), (hand y, ...)
+ MachineInstr *LeftHandInst = getDefIgnoringCopies(LHSReg, MRI);
+ MachineInstr *RightHandInst = getDefIgnoringCopies(RHSReg, MRI);
+ if (!LeftHandInst || !RightHandInst)
+ return false;
+ unsigned HandOpcode = LeftHandInst->getOpcode();
+ if (HandOpcode != RightHandInst->getOpcode())
+ return false;
+ if (!LeftHandInst->getOperand(1).isReg() ||
+ !RightHandInst->getOperand(1).isReg())
+ return false;
+
+ // Make sure the types match up, and if we're doing this post-legalization,
+ // we end up with legal types.
+ Register X = LeftHandInst->getOperand(1).getReg();
+ Register Y = RightHandInst->getOperand(1).getReg();
+ LLT XTy = MRI.getType(X);
+ LLT YTy = MRI.getType(Y);
+ if (!XTy.isValid() || XTy != YTy)
+ return false;
+
+ // Optional extra source register.
+ Register ExtraHandOpSrcReg;
+ switch (HandOpcode) {
+ default:
+ return false;
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT: {
+ // Match: logic (ext X), (ext Y) --> ext (logic X, Y)
+ break;
+ }
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_SHL: {
+ // Match: logic (binop x, z), (binop y, z) -> binop (logic x, y), z
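+ // e.g. (and (lshr %x, %z), (lshr %y, %z)) -> (lshr (and %x, %y), %z)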
+ MachineOperand &ZOp = LeftHandInst->getOperand(2);
+ if (!matchEqualDefs(ZOp, RightHandInst->getOperand(2)))
+ return false;
+ ExtraHandOpSrcReg = ZOp.getReg();
+ break;
+ }
+ }
+
+ if (!isLegalOrBeforeLegalizer({LogicOpcode, {XTy, YTy}}))
+ return false;
+
+ // Record the steps to build the new instructions.
+ //
+ // Steps to build (logic x, y)
+ auto NewLogicDst = MRI.createGenericVirtualRegister(XTy);
+ OperandBuildSteps LogicBuildSteps = {
+ [=](MachineInstrBuilder &MIB) { MIB.addDef(NewLogicDst); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(X); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Y); }};
+ InstructionBuildSteps LogicSteps(LogicOpcode, LogicBuildSteps);
+
+ // Steps to build hand (logic x, y), ...z
+ OperandBuildSteps HandBuildSteps = {
+ [=](MachineInstrBuilder &MIB) { MIB.addDef(Dst); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(NewLogicDst); }};
+ if (ExtraHandOpSrcReg.isValid())
+ HandBuildSteps.push_back(
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(ExtraHandOpSrcReg); });
+ InstructionBuildSteps HandSteps(HandOpcode, HandBuildSteps);
+
+ MatchInfo = InstructionStepsMatchInfo({LogicSteps, HandSteps});
+ return true;
+}
+
+void CombinerHelper::applyBuildInstructionSteps(
+ MachineInstr &MI, InstructionStepsMatchInfo &MatchInfo) {
+ assert(MatchInfo.InstrsToBuild.size() &&
+ "Expected at least one instr to build?");
+ Builder.setInstr(MI);
+ for (auto &InstrToBuild : MatchInfo.InstrsToBuild) {
+ assert(InstrToBuild.Opcode && "Expected a valid opcode?");
+ assert(InstrToBuild.OperandFns.size() && "Expected at least one operand?");
+ MachineInstrBuilder Instr = Builder.buildInstr(InstrToBuild.Opcode);
+ for (auto &OperandFn : InstrToBuild.OperandFns)
+ OperandFn(Instr);
+ }
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchAshrShlToSextInreg(
+ MachineInstr &MI, std::tuple<Register, int64_t> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ASHR);
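+ // Fold (ashr (shl %x, C), C) -> (sext_inreg %x, Width - C). For instance,
+ // with illustrative widths, (ashr (shl %x:s32, 24), 24) becomes
+ // G_SEXT_INREG %x, 8.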
+ int64_t ShlCst, AshrCst;
+ Register Src;
+ if (!mi_match(MI.getOperand(0).getReg(), MRI,
+ m_GAShr(m_GShl(m_Reg(Src), m_ICstOrSplat(ShlCst)),
+ m_ICstOrSplat(AshrCst))))
+ return false;
+ if (ShlCst != AshrCst)
+ return false;
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_SEXT_INREG, {MRI.getType(Src)}}))
+ return false;
+ MatchInfo = std::make_tuple(Src, ShlCst);
+ return true;
+}
+
+void CombinerHelper::applyAshShlToSextInreg(
+ MachineInstr &MI, std::tuple<Register, int64_t> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ASHR);
+ Register Src;
+ int64_t ShiftAmt;
+ std::tie(Src, ShiftAmt) = MatchInfo;
+ unsigned Size = MRI.getType(Src).getScalarSizeInBits();
+ Builder.setInstrAndDebugLoc(MI);
+ Builder.buildSExtInReg(MI.getOperand(0).getReg(), Src, Size - ShiftAmt);
+ MI.eraseFromParent();
+}
+
+/// and(and(x, C1), C2) -> C1&C2 ? and(x, C1&C2) : 0
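+ /// For example, with illustrative constants: and(and(x, 0x0ff0), 0x00ff)
+ /// becomes and(x, 0x00f0), while and(and(x, 0xff00), 0x00ff) becomes 0.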
+bool CombinerHelper::matchOverlappingAnd(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+
+ Register R;
+ int64_t C1;
+ int64_t C2;
+ if (!mi_match(
+ Dst, MRI,
+ m_GAnd(m_GAnd(m_Reg(R), m_ICst(C1)), m_ICst(C2))))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ if (C1 & C2) {
+ B.buildAnd(Dst, R, B.buildConstant(Ty, C1 & C2));
+ return;
+ }
+ auto Zero = B.buildConstant(Ty, 0);
+ replaceRegWith(MRI, Dst, Zero->getOperand(0).getReg());
+ };
+ return true;
+}
+
+bool CombinerHelper::matchRedundantAnd(MachineInstr &MI,
+ Register &Replacement) {
+ // Given
+ //
+ // %y:_(sN) = G_SOMETHING
+ // %x:_(sN) = G_SOMETHING
+ // %res:_(sN) = G_AND %x, %y
+ //
+ // Eliminate the G_AND when it is known that x & y == x or x & y == y.
+ //
+ // Patterns like this can appear as a result of legalization. E.g.
+ //
+ // %cmp:_(s32) = G_ICMP intpred(pred), %x(s32), %y
+ // %one:_(s32) = G_CONSTANT i32 1
+ // %and:_(s32) = G_AND %cmp, %one
+ //
+ // In this case, G_ICMP only produces a single bit, so x & 1 == x.
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+ if (!KB)
+ return false;
+
+ Register AndDst = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ KnownBits LHSBits = KB->getKnownBits(LHS);
+ KnownBits RHSBits = KB->getKnownBits(RHS);
+
+ // Check that x & Mask == x.
+ // x & 1 == x, always
+ // x & 0 == x, only if x is also 0
+ // Meaning Mask has no effect if every bit is either one in Mask or zero in x.
+ //
+ // Check if we can replace AndDst with the LHS of the G_AND
+ if (canReplaceReg(AndDst, LHS, MRI) &&
+ (LHSBits.Zero | RHSBits.One).isAllOnes()) {
+ Replacement = LHS;
+ return true;
+ }
+
+ // Check if we can replace AndDst with the RHS of the G_AND
+ if (canReplaceReg(AndDst, RHS, MRI) &&
+ (LHSBits.One | RHSBits.Zero).isAllOnes()) {
+ Replacement = RHS;
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchRedundantOr(MachineInstr &MI, Register &Replacement) {
+ // Given
+ //
+ // %y:_(sN) = G_SOMETHING
+ // %x:_(sN) = G_SOMETHING
+ // %res:_(sN) = G_OR %x, %y
+ //
+ // Eliminate the G_OR when it is known that x | y == x or x | y == y.
+ assert(MI.getOpcode() == TargetOpcode::G_OR);
+ if (!KB)
+ return false;
+
+ Register OrDst = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ KnownBits LHSBits = KB->getKnownBits(LHS);
+ KnownBits RHSBits = KB->getKnownBits(RHS);
+
+ // Check that x | Mask == x.
+ // x | 0 == x, always
+ // x | 1 == x, only if x is also 1
+ // Meaning Mask has no effect if every bit is either zero in Mask or one in x.
+ //
+ // Check if we can replace OrDst with the LHS of the G_OR
+ if (canReplaceReg(OrDst, LHS, MRI) &&
+ (LHSBits.One | RHSBits.Zero).isAllOnes()) {
+ Replacement = LHS;
+ return true;
+ }
+
+ // Check if we can replace OrDst with the RHS of the G_OR
+ if (canReplaceReg(OrDst, RHS, MRI) &&
+ (LHSBits.Zero | RHSBits.One).isAllOnes()) {
+ Replacement = RHS;
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchRedundantSExtInReg(MachineInstr &MI) {
+ // If the input is already sign extended, just drop the extension.
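+ // e.g. for an s32 source, G_SEXT_INREG %src, 8 is redundant whenever %src is
+ // known to have at least 32 - 8 + 1 = 25 sign bits.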
+ Register Src = MI.getOperand(1).getReg();
+ unsigned ExtBits = MI.getOperand(2).getImm();
+ unsigned TypeSize = MRI.getType(Src).getScalarSizeInBits();
+ return KB->computeNumSignBits(Src) >= (TypeSize - ExtBits + 1);
+}
+
+static bool isConstValidTrue(const TargetLowering &TLI, unsigned ScalarSizeBits,
+ int64_t Cst, bool IsVector, bool IsFP) {
+ // For i1, Cst will always be -1 regardless of boolean contents.
+ return (ScalarSizeBits == 1 && Cst == -1) ||
+ isConstTrueVal(TLI, Cst, IsVector, IsFP);
+}
+
+bool CombinerHelper::matchNotCmp(MachineInstr &MI,
+ SmallVectorImpl<Register> &RegsToNegate) {
+ assert(MI.getOpcode() == TargetOpcode::G_XOR);
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ const auto &TLI = *Builder.getMF().getSubtarget().getTargetLowering();
+ Register XorSrc;
+ Register CstReg;
+ // We match xor(src, true) here.
+ if (!mi_match(MI.getOperand(0).getReg(), MRI,
+ m_GXor(m_Reg(XorSrc), m_Reg(CstReg))))
+ return false;
+
+ if (!MRI.hasOneNonDBGUse(XorSrc))
+ return false;
+
+ // Check that XorSrc is the root of a tree of comparisons combined with ANDs
+ // and ORs. The suffix of RegsToNegate starting from index I is used as a
+ // work list of tree nodes to visit.
+ RegsToNegate.push_back(XorSrc);
+ // Remember whether the comparisons are all integer or all floating point.
+ bool IsInt = false;
+ bool IsFP = false;
+ for (unsigned I = 0; I < RegsToNegate.size(); ++I) {
+ Register Reg = RegsToNegate[I];
+ if (!MRI.hasOneNonDBGUse(Reg))
+ return false;
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ switch (Def->getOpcode()) {
+ default:
+ // Don't match if the tree contains anything other than ANDs, ORs and
+ // comparisons.
+ return false;
+ case TargetOpcode::G_ICMP:
+ if (IsFP)
+ return false;
+ IsInt = true;
+ // When we apply the combine we will invert the predicate.
+ break;
+ case TargetOpcode::G_FCMP:
+ if (IsInt)
+ return false;
+ IsFP = true;
+ // When we apply the combine we will invert the predicate.
+ break;
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ // Implement De Morgan's laws:
+ // ~(x & y) -> ~x | ~y
+ // ~(x | y) -> ~x & ~y
+ // When we apply the combine we will change the opcode and recursively
+ // negate the operands.
+ RegsToNegate.push_back(Def->getOperand(1).getReg());
+ RegsToNegate.push_back(Def->getOperand(2).getReg());
+ break;
+ }
+ }
+
+ // Now we know whether the comparisons are integer or floating point, check
+ // the constant in the xor.
+ int64_t Cst;
+ if (Ty.isVector()) {
+ MachineInstr *CstDef = MRI.getVRegDef(CstReg);
+ auto MaybeCst = getIConstantSplatSExtVal(*CstDef, MRI);
+ if (!MaybeCst)
+ return false;
+ if (!isConstValidTrue(TLI, Ty.getScalarSizeInBits(), *MaybeCst, true, IsFP))
+ return false;
+ } else {
+ if (!mi_match(CstReg, MRI, m_ICst(Cst)))
+ return false;
+ if (!isConstValidTrue(TLI, Ty.getSizeInBits(), Cst, false, IsFP))
+ return false;
+ }
+
+ return true;
+}
+
+void CombinerHelper::applyNotCmp(MachineInstr &MI,
+ SmallVectorImpl<Register> &RegsToNegate) {
+ for (Register Reg : RegsToNegate) {
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ Observer.changingInstr(*Def);
+ // For each comparison, invert the opcode. For each AND and OR, change the
+ // opcode.
+ switch (Def->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case TargetOpcode::G_ICMP:
+ case TargetOpcode::G_FCMP: {
+ MachineOperand &PredOp = Def->getOperand(1);
+ CmpInst::Predicate NewP = CmpInst::getInversePredicate(
+ (CmpInst::Predicate)PredOp.getPredicate());
+ PredOp.setPredicate(NewP);
+ break;
+ }
+ case TargetOpcode::G_AND:
+ Def->setDesc(Builder.getTII().get(TargetOpcode::G_OR));
+ break;
+ case TargetOpcode::G_OR:
+ Def->setDesc(Builder.getTII().get(TargetOpcode::G_AND));
+ break;
+ }
+ Observer.changedInstr(*Def);
+ }
+
+ replaceRegWith(MRI, MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchXorOfAndWithSameReg(
+ MachineInstr &MI, std::pair<Register, Register> &MatchInfo) {
+ // Match (xor (and x, y), y) (or any of its commuted cases)
+ assert(MI.getOpcode() == TargetOpcode::G_XOR);
+ Register &X = MatchInfo.first;
+ Register &Y = MatchInfo.second;
+ Register AndReg = MI.getOperand(1).getReg();
+ Register SharedReg = MI.getOperand(2).getReg();
+
+ // Find a G_AND on either side of the G_XOR.
+ // Look for one of
+ //
+ // (xor (and x, y), SharedReg)
+ // (xor SharedReg, (and x, y))
+ if (!mi_match(AndReg, MRI, m_GAnd(m_Reg(X), m_Reg(Y)))) {
+ std::swap(AndReg, SharedReg);
+ if (!mi_match(AndReg, MRI, m_GAnd(m_Reg(X), m_Reg(Y))))
+ return false;
+ }
+
+ // Only do this if we'll eliminate the G_AND.
+ if (!MRI.hasOneNonDBGUse(AndReg))
+ return false;
+
+ // We can combine if SharedReg is the same as either the LHS or RHS of the
+ // G_AND.
+ if (Y != SharedReg)
+ std::swap(X, Y);
+ return Y == SharedReg;
+}
+
+void CombinerHelper::applyXorOfAndWithSameReg(
+ MachineInstr &MI, std::pair<Register, Register> &MatchInfo) {
+ // Fold (xor (and x, y), y) -> (and (not x), y)
+ Builder.setInstrAndDebugLoc(MI);
+ Register X, Y;
+ std::tie(X, Y) = MatchInfo;
+ auto Not = Builder.buildNot(MRI.getType(X), X);
+ Observer.changingInstr(MI);
+ MI.setDesc(Builder.getTII().get(TargetOpcode::G_AND));
+ MI.getOperand(1).setReg(Not->getOperand(0).getReg());
+ MI.getOperand(2).setReg(Y);
+ Observer.changedInstr(MI);
+}
+
+bool CombinerHelper::matchPtrAddZero(MachineInstr &MI) {
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ Register DstReg = PtrAdd.getReg(0);
+ LLT Ty = MRI.getType(DstReg);
+ const DataLayout &DL = Builder.getMF().getDataLayout();
+
+ if (DL.isNonIntegralAddressSpace(Ty.getScalarType().getAddressSpace()))
+ return false;
+
+ if (Ty.isPointer()) {
+ auto ConstVal = getIConstantVRegVal(PtrAdd.getBaseReg(), MRI);
+ return ConstVal && *ConstVal == 0;
+ }
+
+ assert(Ty.isVector() && "Expecting a vector type");
+ const MachineInstr *VecMI = MRI.getVRegDef(PtrAdd.getBaseReg());
+ return isBuildVectorAllZeros(*VecMI, MRI);
+}
+
+void CombinerHelper::applyPtrAddZero(MachineInstr &MI) {
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ Builder.setInstrAndDebugLoc(PtrAdd);
+ Builder.buildIntToPtr(PtrAdd.getReg(0), PtrAdd.getOffsetReg());
+ PtrAdd.eraseFromParent();
+}
+
+/// The second source operand is known to be a power of 2.
+void CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Pow2Src1 = MI.getOperand(2).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ Builder.setInstrAndDebugLoc(MI);
+
+ // Fold (urem x, pow2) -> (and x, pow2-1)
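+ // e.g. if %p is known to be a power of two, (urem %x, %p) -> (and %x, %p - 1);
+ // for %p == 8 that is (and %x, 7).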
+ auto NegOne = Builder.buildConstant(Ty, -1);
+ auto Add = Builder.buildAdd(Ty, Pow2Src1, NegOne);
+ Builder.buildAnd(DstReg, Src0, Add);
+ MI.eraseFromParent();
+}
+
+bool CombinerHelper::matchFoldBinOpIntoSelect(MachineInstr &MI,
+ unsigned &SelectOpNo) {
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+
+ Register OtherOperandReg = RHS;
+ SelectOpNo = 1;
+ MachineInstr *Select = MRI.getVRegDef(LHS);
+
+ // Don't do this unless the old select is going away. We want to eliminate the
+ // binary operator, not replace a binop with a select.
+ if (Select->getOpcode() != TargetOpcode::G_SELECT ||
+ !MRI.hasOneNonDBGUse(LHS)) {
+ OtherOperandReg = LHS;
+ SelectOpNo = 2;
+ Select = MRI.getVRegDef(RHS);
+ if (Select->getOpcode() != TargetOpcode::G_SELECT ||
+ !MRI.hasOneNonDBGUse(RHS))
+ return false;
+ }
+
+ MachineInstr *SelectLHS = MRI.getVRegDef(Select->getOperand(2).getReg());
+ MachineInstr *SelectRHS = MRI.getVRegDef(Select->getOperand(3).getReg());
+
+ if (!isConstantOrConstantVector(*SelectLHS, MRI,
+ /*AllowFP*/ true,
+ /*AllowOpaqueConstants*/ false))
+ return false;
+ if (!isConstantOrConstantVector(*SelectRHS, MRI,
+ /*AllowFP*/ true,
+ /*AllowOpaqueConstants*/ false))
+ return false;
+
+ unsigned BinOpcode = MI.getOpcode();
+
+ // We now know one of the operands is a select of constants. Now verify that
+ // the other binary operator operand is either a constant, or we can handle a
+ // variable.
+ bool CanFoldNonConst =
+ (BinOpcode == TargetOpcode::G_AND || BinOpcode == TargetOpcode::G_OR) &&
+ (isNullOrNullSplat(*SelectLHS, MRI) ||
+ isAllOnesOrAllOnesSplat(*SelectLHS, MRI)) &&
+ (isNullOrNullSplat(*SelectRHS, MRI) ||
+ isAllOnesOrAllOnesSplat(*SelectRHS, MRI));
+ if (CanFoldNonConst)
+ return true;
+
+ return isConstantOrConstantVector(*MRI.getVRegDef(OtherOperandReg), MRI,
+ /*AllowFP*/ true,
+ /*AllowOpaqueConstants*/ false);
+}
+
+/// \p SelectOperand is the operand in binary operator \p MI that is the select
+/// to fold.
+void CombinerHelper::applyFoldBinOpIntoSelect(MachineInstr &MI,
+ const unsigned &SelectOperand) {
+ Builder.setInstrAndDebugLoc(MI);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ MachineInstr *Select = MRI.getVRegDef(MI.getOperand(SelectOperand).getReg());
+
+ Register SelectCond = Select->getOperand(1).getReg();
+ Register SelectTrue = Select->getOperand(2).getReg();
+ Register SelectFalse = Select->getOperand(3).getReg();
+
+ LLT Ty = MRI.getType(Dst);
+ unsigned BinOpcode = MI.getOpcode();
+
+ Register FoldTrue, FoldFalse;
+
+ // We have a select-of-constants followed by a binary operator with a
+ // constant. Eliminate the binop by pulling the constant math into the select.
+ // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
+ if (SelectOperand == 1) {
+ // TODO: SelectionDAG verifies this actually constant folds before
+ // committing to the combine.
+
+ FoldTrue = Builder.buildInstr(BinOpcode, {Ty}, {SelectTrue, RHS}).getReg(0);
+ FoldFalse =
+ Builder.buildInstr(BinOpcode, {Ty}, {SelectFalse, RHS}).getReg(0);
+ } else {
+ FoldTrue = Builder.buildInstr(BinOpcode, {Ty}, {LHS, SelectTrue}).getReg(0);
+ FoldFalse =
+ Builder.buildInstr(BinOpcode, {Ty}, {LHS, SelectFalse}).getReg(0);
+ }
+
+ Builder.buildSelect(Dst, SelectCond, FoldTrue, FoldFalse, MI.getFlags());
+ MI.eraseFromParent();
+}
+
+std::optional<SmallVector<Register, 8>>
+CombinerHelper::findCandidatesForLoadOrCombine(const MachineInstr *Root) const {
+ assert(Root->getOpcode() == TargetOpcode::G_OR && "Expected G_OR only!");
+ // We want to detect if Root is part of a tree which represents a bunch
+ // of loads being merged into a larger load. We'll try to recognize patterns
+ // like, for example:
+ //
+ // Reg Reg
+ // \ /
+ // OR_1 Reg
+ // \ /
+ // OR_2
+ // \ Reg
+ // .. /
+ // Root
+ //
+ // Reg Reg Reg Reg
+ // \ / \ /
+ // OR_1 OR_2
+ // \ /
+ // \ /
+ // ...
+ // Root
+ //
+ // Each "Reg" may have been produced by a load + some arithmetic. This
+ // function will save each of them.
+ SmallVector<Register, 8> RegsToVisit;
+ SmallVector<const MachineInstr *, 7> Ors = {Root};
+
+ // In the "worst" case, we're dealing with a load for each byte. So, there
+ // are at most #bytes - 1 ORs.
+ const unsigned MaxIter =
+ MRI.getType(Root->getOperand(0).getReg()).getSizeInBytes() - 1;
+ for (unsigned Iter = 0; Iter < MaxIter; ++Iter) {
+ if (Ors.empty())
+ break;
+ const MachineInstr *Curr = Ors.pop_back_val();
+ Register OrLHS = Curr->getOperand(1).getReg();
+ Register OrRHS = Curr->getOperand(2).getReg();
+
+ // In the combine, we want to eliminate the entire tree.
+ if (!MRI.hasOneNonDBGUse(OrLHS) || !MRI.hasOneNonDBGUse(OrRHS))
+ return std::nullopt;
+
+ // If it's a G_OR, save it and continue to walk. If it's not, then it's
+ // something that may be a load + arithmetic.
+ if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrLHS, MRI))
+ Ors.push_back(Or);
+ else
+ RegsToVisit.push_back(OrLHS);
+ if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrRHS, MRI))
+ Ors.push_back(Or);
+ else
+ RegsToVisit.push_back(OrRHS);
+ }
+
+ // We're going to try and merge each register into a wider power-of-2 type,
+ // so we ought to have an even number of registers.
+ if (RegsToVisit.empty() || RegsToVisit.size() % 2 != 0)
+ return std::nullopt;
+ return RegsToVisit;
+}
+
+/// Helper function for findLoadOffsetsForLoadOrCombine.
+///
+/// Check if \p Reg is the result of loading a \p MemSizeInBits wide value,
+/// and then moving that value into a specific byte offset.
+///
+/// e.g. x[i] << 24
+///
+/// \returns The load instruction and the byte offset it is moved into.
+static std::optional<std::pair<GZExtLoad *, int64_t>>
+matchLoadAndBytePosition(Register Reg, unsigned MemSizeInBits,
+ const MachineRegisterInfo &MRI) {
+ assert(MRI.hasOneNonDBGUse(Reg) &&
+ "Expected Reg to only have one non-debug use?");
+ Register MaybeLoad;
+ int64_t Shift;
+ if (!mi_match(Reg, MRI,
+ m_OneNonDBGUse(m_GShl(m_Reg(MaybeLoad), m_ICst(Shift))))) {
+ Shift = 0;
+ MaybeLoad = Reg;
+ }
+
+ if (Shift % MemSizeInBits != 0)
+ return std::nullopt;
+
+ // TODO: Handle other types of loads.
+ auto *Load = getOpcodeDef<GZExtLoad>(MaybeLoad, MRI);
+ if (!Load)
+ return std::nullopt;
+
+ if (!Load->isUnordered() || Load->getMemSizeInBits() != MemSizeInBits)
+ return std::nullopt;
+
+ return std::make_pair(Load, Shift / MemSizeInBits);
+}
+
+std::optional<std::tuple<GZExtLoad *, int64_t, GZExtLoad *>>
+CombinerHelper::findLoadOffsetsForLoadOrCombine(
+ SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+ const SmallVector<Register, 8> &RegsToVisit, const unsigned MemSizeInBits) {
+
+ // Each load found for the pattern. There should be one for each RegsToVisit.
+ SmallSetVector<const MachineInstr *, 8> Loads;
+
+ // The lowest index used in any load. (The lowest "i" for each x[i].)
+ int64_t LowestIdx = INT64_MAX;
+
+ // The load which uses the lowest index.
+ GZExtLoad *LowestIdxLoad = nullptr;
+
+ // Keeps track of the load indices we see. We shouldn't see any indices twice.
+ SmallSet<int64_t, 8> SeenIdx;
+
+ // Ensure each load is in the same MBB.
+ // TODO: Support multiple MachineBasicBlocks.
+ MachineBasicBlock *MBB = nullptr;
+ const MachineMemOperand *MMO = nullptr;
+
+ // Earliest instruction-order load in the pattern.
+ GZExtLoad *EarliestLoad = nullptr;
+
+ // Latest instruction-order load in the pattern.
+ GZExtLoad *LatestLoad = nullptr;
+
+ // Base pointer which every load should share.
+ Register BasePtr;
+
+ // We want to find a load for each register. Each load should have some
+ // appropriate bit twiddling arithmetic. During this loop, we will also keep
+ // track of the load which uses the lowest index. Later, we will check if we
+ // can use its pointer in the final, combined load.
+ for (auto Reg : RegsToVisit) {
+ // Find the load, and find the position it will end up at within the
+ // (possibly shifted) combined value.
+ auto LoadAndPos = matchLoadAndBytePosition(Reg, MemSizeInBits, MRI);
+ if (!LoadAndPos)
+ return std::nullopt;
+ GZExtLoad *Load;
+ int64_t DstPos;
+ std::tie(Load, DstPos) = *LoadAndPos;
+
+ // TODO: Handle multiple MachineBasicBlocks. Currently not handled because
+ // it is difficult to check for stores/calls/etc between loads.
+ MachineBasicBlock *LoadMBB = Load->getParent();
+ if (!MBB)
+ MBB = LoadMBB;
+ if (LoadMBB != MBB)
+ return std::nullopt;
+
+ // Make sure that the MachineMemOperands of every seen load are compatible.
+ auto &LoadMMO = Load->getMMO();
+ if (!MMO)
+ MMO = &LoadMMO;
+ if (MMO->getAddrSpace() != LoadMMO.getAddrSpace())
+ return std::nullopt;
+
+ // Find out what the base pointer and index for the load is.
+ Register LoadPtr;
+ int64_t Idx;
+ if (!mi_match(Load->getOperand(1).getReg(), MRI,
+ m_GPtrAdd(m_Reg(LoadPtr), m_ICst(Idx)))) {
+ LoadPtr = Load->getOperand(1).getReg();
+ Idx = 0;
+ }
+
+ // Don't combine things like a[i], a[i] -> a bigger load.
+ if (!SeenIdx.insert(Idx).second)
+ return std::nullopt;
+
+ // Every load must share the same base pointer; don't combine things like:
+ //
+ // a[i], b[i + 1] -> a bigger load.
+ if (!BasePtr.isValid())
+ BasePtr = LoadPtr;
+ if (BasePtr != LoadPtr)
+ return std::nullopt;
+
+ if (Idx < LowestIdx) {
+ LowestIdx = Idx;
+ LowestIdxLoad = Load;
+ }
+
+ // Keep track of the byte offset that this load ends up at. If we have seen
+ // the byte offset, then stop here. We do not want to combine:
+ //
+ // a[i] << 16, a[i + k] << 16 -> a bigger load.
+ if (!MemOffset2Idx.try_emplace(DstPos, Idx).second)
+ return std::nullopt;
+ Loads.insert(Load);
+
+ // Keep track of the position of the earliest/latest loads in the pattern.
+ // We will check that there are no load fold barriers between them later
+ // on.
+ //
+ // FIXME: Is there a better way to check for load fold barriers?
+ if (!EarliestLoad || dominates(*Load, *EarliestLoad))
+ EarliestLoad = Load;
+ if (!LatestLoad || dominates(*LatestLoad, *Load))
+ LatestLoad = Load;
+ }
+
+ // We found a load for each register. Let's check if each load satisfies the
+ // pattern.
+ assert(Loads.size() == RegsToVisit.size() &&
+ "Expected to find a load for each register?");
+ assert(EarliestLoad != LatestLoad && EarliestLoad &&
+ LatestLoad && "Expected at least two loads?");
+
+ // Check if there are any stores, calls, etc. between any of the loads. If
+ // there are, then we can't safely perform the combine.
+ //
+ // MaxIter is chosen based off the (worst case) number of iterations it
+ // typically takes to succeed in the LLVM test suite plus some padding.
+ //
+ // FIXME: Is there a better way to check for load fold barriers?
+ const unsigned MaxIter = 20;
+ unsigned Iter = 0;
+ for (const auto &MI : instructionsWithoutDebug(EarliestLoad->getIterator(),
+ LatestLoad->getIterator())) {
+ if (Loads.count(&MI))
+ continue;
+ if (MI.isLoadFoldBarrier())
+ return std::nullopt;
+ if (Iter++ == MaxIter)
+ return std::nullopt;
+ }
+
+ return std::make_tuple(LowestIdxLoad, LowestIdx, LatestLoad);
+}
+
+bool CombinerHelper::matchLoadOrCombine(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_OR);
+ MachineFunction &MF = *MI.getMF();
+ // Assuming a little-endian target, transform:
+ // s8 *a = ...
+ // s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+ // =>
+ // s32 val = *((i32)a)
+ //
+ // s8 *a = ...
+ // s32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+ // =>
+ // s32 val = BSWAP(*((s32)a))
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ if (Ty.isVector())
+ return false;
+
+ // We need to combine at least two loads into this type. Since the smallest
+ // possible load is into a byte, we need at least a 16-bit wide type.
+ const unsigned WideMemSizeInBits = Ty.getSizeInBits();
+ if (WideMemSizeInBits < 16 || WideMemSizeInBits % 8 != 0)
+ return false;
+
+ // Match a collection of non-OR instructions in the pattern.
+ auto RegsToVisit = findCandidatesForLoadOrCombine(&MI);
+ if (!RegsToVisit)
+ return false;
+
+ // We have a collection of non-OR instructions. Figure out how wide each of
+ // the small loads should be based off of the number of potential loads we
+ // found.
+ const unsigned NarrowMemSizeInBits = WideMemSizeInBits / RegsToVisit->size();
+ if (NarrowMemSizeInBits % 8 != 0)
+ return false;
+
+ // Check if each register feeding into each OR is a load from the same
+ // base pointer + some arithmetic.
+ //
+ // e.g. a[0], a[1] << 8, a[2] << 16, etc.
+ //
+ // Also verify that each of these ends up putting a[i] into the same memory
+ // offset as a load into a wide type would.
+ SmallDenseMap<int64_t, int64_t, 8> MemOffset2Idx;
+ GZExtLoad *LowestIdxLoad, *LatestLoad;
+ int64_t LowestIdx;
+ auto MaybeLoadInfo = findLoadOffsetsForLoadOrCombine(
+ MemOffset2Idx, *RegsToVisit, NarrowMemSizeInBits);
+ if (!MaybeLoadInfo)
+ return false;
+ std::tie(LowestIdxLoad, LowestIdx, LatestLoad) = *MaybeLoadInfo;
+
+ // We have a bunch of loads being OR'd together. Using the addresses + offsets
+ // we found before, check if this corresponds to a big or little endian byte
+ // pattern. If it does, then we can represent it using a load + possibly a
+ // BSWAP.
+ bool IsBigEndianTarget = MF.getDataLayout().isBigEndian();
+ std::optional<bool> IsBigEndian = isBigEndian(MemOffset2Idx, LowestIdx);
+ if (!IsBigEndian)
+ return false;
+ bool NeedsBSwap = IsBigEndianTarget != *IsBigEndian;
+ if (NeedsBSwap && !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {Ty}}))
+ return false;
+
+ // Make sure that the load from the lowest index produces offset 0 in the
+ // final value.
+ //
+ // This ensures that we won't combine something like this:
+ //
+ // load x[i] -> byte 2
+ // load x[i+1] -> byte 0 ---> wide_load x[i]
+ // load x[i+2] -> byte 1
+ const unsigned NumLoadsInTy = WideMemSizeInBits / NarrowMemSizeInBits;
+ const unsigned ZeroByteOffset =
+ *IsBigEndian
+ ? bigEndianByteAt(NumLoadsInTy, 0)
+ : littleEndianByteAt(NumLoadsInTy, 0);
+ auto ZeroOffsetIdx = MemOffset2Idx.find(ZeroByteOffset);
+ if (ZeroOffsetIdx == MemOffset2Idx.end() ||
+ ZeroOffsetIdx->second != LowestIdx)
+ return false;
+
+ // We will reuse the pointer from the load which ends up at byte offset 0. It
+ // may not use index 0.
+ Register Ptr = LowestIdxLoad->getPointerReg();
+ const MachineMemOperand &MMO = LowestIdxLoad->getMMO();
+ LegalityQuery::MemDesc MMDesc(MMO);
+ MMDesc.MemoryTy = Ty;
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_LOAD, {Ty, MRI.getType(Ptr)}, {MMDesc}}))
+ return false;
+ auto PtrInfo = MMO.getPointerInfo();
+ auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, WideMemSizeInBits / 8);
+
+ // Load must be allowed and fast on the target.
+ LLVMContext &C = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ unsigned Fast = 0;
+ if (!getTargetLowering().allowsMemoryAccess(C, DL, Ty, *NewMMO, &Fast) ||
+ !Fast)
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &MIB) {
+ MIB.setInstrAndDebugLoc(*LatestLoad);
+ Register LoadDst = NeedsBSwap ? MRI.cloneVirtualRegister(Dst) : Dst;
+ MIB.buildLoad(LoadDst, Ptr, *NewMMO);
+ if (NeedsBSwap)
+ MIB.buildBSwap(Dst, LoadDst);
+ };
+ return true;
+}
+
+bool CombinerHelper::matchExtendThroughPhis(MachineInstr &MI,
+ MachineInstr *&ExtMI) {
+ assert(MI.getOpcode() == TargetOpcode::G_PHI);
+
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // TODO: Extending a vector may be expensive; don't do this until heuristics
+ // are better.
+ if (MRI.getType(DstReg).isVector())
+ return false;
+
+ // Try to match a phi, whose only use is an extend.
+ if (!MRI.hasOneNonDBGUse(DstReg))
+ return false;
+ ExtMI = &*MRI.use_instr_nodbg_begin(DstReg);
+ switch (ExtMI->getOpcode()) {
+ case TargetOpcode::G_ANYEXT:
+ return true; // G_ANYEXT is usually free.
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_SEXT:
+ break;
+ default:
+ return false;
+ }
+
+ // If the target is likely to fold this extend away, don't propagate.
+ if (Builder.getTII().isExtendLikelyToBeFolded(*ExtMI, MRI))
+ return false;
+
+ // We don't want to propagate the extends unless there's a good chance that
+ // they'll be optimized in some way.
+ // Collect the unique incoming values.
+ SmallPtrSet<MachineInstr *, 4> InSrcs;
+ for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
+ auto *DefMI = getDefIgnoringCopies(MI.getOperand(Idx).getReg(), MRI);
+ switch (DefMI->getOpcode()) {
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_CONSTANT:
+ InSrcs.insert(getDefIgnoringCopies(MI.getOperand(Idx).getReg(), MRI));
+ // Don't try to propagate if there are too many places to create new
+ // extends; chances are it'll increase code size.
+ if (InSrcs.size() > 2)
+ return false;
+ break;
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
+void CombinerHelper::applyExtendThroughPhis(MachineInstr &MI,
+ MachineInstr *&ExtMI) {
+ assert(MI.getOpcode() == TargetOpcode::G_PHI);
+ Register DstReg = ExtMI->getOperand(0).getReg();
+ LLT ExtTy = MRI.getType(DstReg);
+
+ // Propagate the extension into each incoming register's defining block.
+ // Use a SetVector here because PHIs can have duplicate edges, and we want
+ // deterministic iteration order.
+ SmallSetVector<MachineInstr *, 8> SrcMIs;
+ SmallDenseMap<MachineInstr *, MachineInstr *, 8> OldToNewSrcMap;
+ for (unsigned SrcIdx = 1; SrcIdx < MI.getNumOperands(); SrcIdx += 2) {
+ auto *SrcMI = MRI.getVRegDef(MI.getOperand(SrcIdx).getReg());
+ if (!SrcMIs.insert(SrcMI))
+ continue;
+
+ // Build an extend after each src inst.
+ auto *MBB = SrcMI->getParent();
+ MachineBasicBlock::iterator InsertPt = ++SrcMI->getIterator();
+ if (InsertPt != MBB->end() && InsertPt->isPHI())
+ InsertPt = MBB->getFirstNonPHI();
+
+ Builder.setInsertPt(*SrcMI->getParent(), InsertPt);
+ Builder.setDebugLoc(MI.getDebugLoc());
+ auto NewExt = Builder.buildExtOrTrunc(ExtMI->getOpcode(), ExtTy,
+ SrcMI->getOperand(0).getReg());
+ OldToNewSrcMap[SrcMI] = NewExt;
+ }
+
+ // Create a new phi with the extended inputs.
+ Builder.setInstrAndDebugLoc(MI);
+ auto NewPhi = Builder.buildInstrNoInsert(TargetOpcode::G_PHI);
+ NewPhi.addDef(DstReg);
+ for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
+ if (!MO.isReg()) {
+ NewPhi.addMBB(MO.getMBB());
+ continue;
+ }
+ auto *NewSrc = OldToNewSrcMap[MRI.getVRegDef(MO.getReg())];
+ NewPhi.addUse(NewSrc->getOperand(0).getReg());
+ }
+ Builder.insertInstr(NewPhi);
+ ExtMI->eraseFromParent();
+}
+
+bool CombinerHelper::matchExtractVecEltBuildVec(MachineInstr &MI,
+ Register &Reg) {
+ assert(MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT);
+ // If we have a constant index, look for a G_BUILD_VECTOR source
+ // and find the source register that the index maps to.
+ Register SrcVec = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcVec);
+
+ auto Cst = getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!Cst || Cst->Value.getZExtValue() >= SrcTy.getNumElements())
+ return false;
+
+ unsigned VecIdx = Cst->Value.getZExtValue();
+
+ // Check if we have a build_vector or build_vector_trunc with an optional
+ // trunc in front.
+ MachineInstr *SrcVecMI = MRI.getVRegDef(SrcVec);
+ if (SrcVecMI->getOpcode() == TargetOpcode::G_TRUNC) {
+ SrcVecMI = MRI.getVRegDef(SrcVecMI->getOperand(1).getReg());
+ }
+
+ if (SrcVecMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR &&
+ SrcVecMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR_TRUNC)
+ return false;
+
+ EVT Ty(getMVTForLLT(SrcTy));
+ if (!MRI.hasOneNonDBGUse(SrcVec) &&
+ !getTargetLowering().aggressivelyPreferBuildVectorSources(Ty))
+ return false;
+
+ Reg = SrcVecMI->getOperand(VecIdx + 1).getReg();
+ return true;
+}
+
+void CombinerHelper::applyExtractVecEltBuildVec(MachineInstr &MI,
+ Register &Reg) {
+ // Check the type of the register, since it may have come from a
+ // G_BUILD_VECTOR_TRUNC.
+ LLT ScalarTy = MRI.getType(Reg);
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ Builder.setInstrAndDebugLoc(MI);
+ if (ScalarTy != DstTy) {
+ assert(ScalarTy.getSizeInBits() > DstTy.getSizeInBits());
+ Builder.buildTrunc(DstReg, Reg);
+ MI.eraseFromParent();
+ return;
+ }
+ replaceSingleDefInstWithReg(MI, Reg);
+}
+
+bool CombinerHelper::matchExtractAllEltsFromBuildVector(
+ MachineInstr &MI,
+ SmallVectorImpl<std::pair<Register, MachineInstr *>> &SrcDstPairs) {
+ assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+ // This combine tries to find build_vector's which have every source element
+ // extracted using G_EXTRACT_VECTOR_ELT. This can happen when transforms like
+ // masked load scalarization are run late in the pipeline. There's already
+ // a combine for a similar pattern starting from the extract, but that
+ // doesn't attempt to do it if there are multiple uses of the build_vector,
+ // which in this case is true. Starting the combine from the build_vector
+ // feels more natural than trying to find sibling nodes of extracts.
+ // E.g.
+ // %vec(<4 x s32>) = G_BUILD_VECTOR %s1(s32), %s2, %s3, %s4
+ // %ext1 = G_EXTRACT_VECTOR_ELT %vec, 0
+ // %ext2 = G_EXTRACT_VECTOR_ELT %vec, 1
+ // %ext3 = G_EXTRACT_VECTOR_ELT %vec, 2
+ // %ext4 = G_EXTRACT_VECTOR_ELT %vec, 3
+ // ==>
+ // replace ext{1,2,3,4} with %s{1,2,3,4}
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ unsigned NumElts = DstTy.getNumElements();
+
+ SmallBitVector ExtractedElts(NumElts);
+ for (MachineInstr &II : MRI.use_nodbg_instructions(DstReg)) {
+ if (II.getOpcode() != TargetOpcode::G_EXTRACT_VECTOR_ELT)
+ return false;
+ auto Cst = getIConstantVRegVal(II.getOperand(2).getReg(), MRI);
+ if (!Cst)
+ return false;
+ unsigned Idx = Cst->getZExtValue();
+ if (Idx >= NumElts)
+ return false; // Out of range.
+ ExtractedElts.set(Idx);
+ SrcDstPairs.emplace_back(
+ std::make_pair(MI.getOperand(Idx + 1).getReg(), &II));
+ }
+ // Match if every element was extracted.
+ return ExtractedElts.all();
+}
+
+void CombinerHelper::applyExtractAllEltsFromBuildVector(
+ MachineInstr &MI,
+ SmallVectorImpl<std::pair<Register, MachineInstr *>> &SrcDstPairs) {
+ assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+ for (auto &Pair : SrcDstPairs) {
+ auto *ExtMI = Pair.second;
+ replaceRegWith(MRI, ExtMI->getOperand(0).getReg(), Pair.first);
+ ExtMI->eraseFromParent();
+ }
+ MI.eraseFromParent();
+}
+
+void CombinerHelper::applyBuildFn(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ Builder.setInstrAndDebugLoc(MI);
+ MatchInfo(Builder);
+ MI.eraseFromParent();
+}
+
+void CombinerHelper::applyBuildFnNoErase(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ Builder.setInstrAndDebugLoc(MI);
+ MatchInfo(Builder);
+}
+
+bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_OR);
+
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ unsigned BitWidth = Ty.getScalarSizeInBits();
+
+ Register ShlSrc, ShlAmt, LShrSrc, LShrAmt, Amt;
+ unsigned FshOpc = 0;
+
+ // Match (or (shl ...), (lshr ...)).
+ if (!mi_match(Dst, MRI,
+ // m_GOr() handles the commuted version as well.
+ m_GOr(m_GShl(m_Reg(ShlSrc), m_Reg(ShlAmt)),
+ m_GLShr(m_Reg(LShrSrc), m_Reg(LShrAmt)))))
+ return false;
+
+ // Given constants C0 and C1 such that C0 + C1 is bit-width:
+ // (or (shl x, C0), (lshr y, C1)) -> (fshl x, y, C0) or (fshr x, y, C1)
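+ // e.g. on s32, (or (shl x, 8), (lshr y, 24)) -> (fshr x, y, 24).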
+ int64_t CstShlAmt, CstLShrAmt;
+ if (mi_match(ShlAmt, MRI, m_ICstOrSplat(CstShlAmt)) &&
+ mi_match(LShrAmt, MRI, m_ICstOrSplat(CstLShrAmt)) &&
+ CstShlAmt + CstLShrAmt == BitWidth) {
+ FshOpc = TargetOpcode::G_FSHR;
+ Amt = LShrAmt;
+
+ } else if (mi_match(LShrAmt, MRI,
+ m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) &&
+ ShlAmt == Amt) {
+ // (or (shl x, amt), (lshr y, (sub bw, amt))) -> (fshl x, y, amt)
+ FshOpc = TargetOpcode::G_FSHL;
+
+ } else if (mi_match(ShlAmt, MRI,
+ m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) &&
+ LShrAmt == Amt) {
+ // (or (shl x, (sub bw, amt)), (lshr y, amt)) -> (fshr x, y, amt)
+ FshOpc = TargetOpcode::G_FSHR;
+
+ } else {
+ return false;
+ }
+
+ LLT AmtTy = MRI.getType(Amt);
+ if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}}))
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildInstr(FshOpc, {Dst}, {ShlSrc, LShrSrc, Amt});
+ };
+ return true;
+}
+
+/// Match an FSHL or FSHR that can be combined to a ROTR or ROTL rotate.
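+ /// i.e. (fshl x, x, amt) -> (rotl x, amt) and (fshr x, x, amt) -> (rotr x, amt).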
+bool CombinerHelper::matchFunnelShiftToRotate(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_FSHL || Opc == TargetOpcode::G_FSHR);
+ Register X = MI.getOperand(1).getReg();
+ Register Y = MI.getOperand(2).getReg();
+ if (X != Y)
+ return false;
+ unsigned RotateOpc =
+ Opc == TargetOpcode::G_FSHL ? TargetOpcode::G_ROTL : TargetOpcode::G_ROTR;
+ return isLegalOrBeforeLegalizer({RotateOpc, {MRI.getType(X), MRI.getType(Y)}});
+}
+
+void CombinerHelper::applyFunnelShiftToRotate(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_FSHL || Opc == TargetOpcode::G_FSHR);
+ bool IsFSHL = Opc == TargetOpcode::G_FSHL;
+ Observer.changingInstr(MI);
+ MI.setDesc(Builder.getTII().get(IsFSHL ? TargetOpcode::G_ROTL
+ : TargetOpcode::G_ROTR));
+ MI.removeOperand(2);
+ Observer.changedInstr(MI);
+}
+
+// Fold (rot x, c) -> (rot x, c % BitSize)
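+ // e.g. for a 32-bit rotate, (rotl %x, 40) becomes (rotl %x, (urem 40, 32)),
+ // which can then be constant-folded to a rotate by 8.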
+bool CombinerHelper::matchRotateOutOfRange(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_ROTL ||
+ MI.getOpcode() == TargetOpcode::G_ROTR);
+ unsigned Bitsize =
+ MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits();
+ Register AmtReg = MI.getOperand(2).getReg();
+ bool OutOfRange = false;
+ auto MatchOutOfRange = [Bitsize, &OutOfRange](const Constant *C) {
+ if (auto *CI = dyn_cast<ConstantInt>(C))
+ OutOfRange |= CI->getValue().uge(Bitsize);
+ return true;
+ };
+ return matchUnaryPredicate(MRI, AmtReg, MatchOutOfRange) && OutOfRange;
+}
+
+void CombinerHelper::applyRotateOutOfRange(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_ROTL ||
+ MI.getOpcode() == TargetOpcode::G_ROTR);
+ unsigned Bitsize =
+ MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits();
+ Builder.setInstrAndDebugLoc(MI);
+ Register Amt = MI.getOperand(2).getReg();
+ LLT AmtTy = MRI.getType(Amt);
+ auto Bits = Builder.buildConstant(AmtTy, Bitsize);
+ Amt = Builder.buildURem(AmtTy, MI.getOperand(2).getReg(), Bits).getReg(0);
+ Observer.changingInstr(MI);
+ MI.getOperand(2).setReg(Amt);
+ Observer.changedInstr(MI);
+}
+
+bool CombinerHelper::matchICmpToTrueFalseKnownBits(MachineInstr &MI,
+ int64_t &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ICMP);
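+ // If known bits fully decide the comparison, fold the G_ICMP to the target's
+ // "true" value or to 0; e.g. an eq compare is known false when some bit is
+ // known to differ between the two operands.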
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ auto KnownLHS = KB->getKnownBits(MI.getOperand(2).getReg());
+ auto KnownRHS = KB->getKnownBits(MI.getOperand(3).getReg());
+ std::optional<bool> KnownVal;
+ switch (Pred) {
+ default:
+ llvm_unreachable("Unexpected G_ICMP predicate?");
+ case CmpInst::ICMP_EQ:
+ KnownVal = KnownBits::eq(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_NE:
+ KnownVal = KnownBits::ne(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_SGE:
+ KnownVal = KnownBits::sge(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_SGT:
+ KnownVal = KnownBits::sgt(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_SLE:
+ KnownVal = KnownBits::sle(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_SLT:
+ KnownVal = KnownBits::slt(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_UGE:
+ KnownVal = KnownBits::uge(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_UGT:
+ KnownVal = KnownBits::ugt(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_ULE:
+ KnownVal = KnownBits::ule(KnownLHS, KnownRHS);
+ break;
+ case CmpInst::ICMP_ULT:
+ KnownVal = KnownBits::ult(KnownLHS, KnownRHS);
+ break;
+ }
+ if (!KnownVal)
+ return false;
+ MatchInfo =
+ *KnownVal
+ ? getICmpTrueVal(getTargetLowering(),
+ /*IsVector = */
+ MRI.getType(MI.getOperand(0).getReg()).isVector(),
+ /* IsFP = */ false)
+ : 0;
+ return true;
+}
+
+bool CombinerHelper::matchICmpToLHSKnownBits(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ICMP);
+ // Given:
+ //
+ // %x = G_WHATEVER (... x is known to be 0 or 1 ...)
+ // %cmp = G_ICMP ne %x, 0
+ //
+ // Or:
+ //
+ // %x = G_WHATEVER (... x is known to be 0 or 1 ...)
+ // %cmp = G_ICMP eq %x, 1
+ //
+ // We can replace %cmp with %x assuming true is 1 on the target.
+ auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ if (!CmpInst::isEquality(Pred))
+ return false;
+ Register Dst = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ if (getICmpTrueVal(getTargetLowering(), DstTy.isVector(),
+ /* IsFP = */ false) != 1)
+ return false;
+ int64_t OneOrZero = Pred == CmpInst::ICMP_EQ;
+ if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICst(OneOrZero)))
+ return false;
+ Register LHS = MI.getOperand(2).getReg();
+ auto KnownLHS = KB->getKnownBits(LHS);
+ if (KnownLHS.getMinValue() != 0 || KnownLHS.getMaxValue() != 1)
+ return false;
+ // Make sure replacing Dst with the LHS is a legal operation.
+ LLT LHSTy = MRI.getType(LHS);
+ unsigned LHSSize = LHSTy.getSizeInBits();
+ unsigned DstSize = DstTy.getSizeInBits();
+ unsigned Op = TargetOpcode::COPY;
+ if (DstSize != LHSSize)
+ Op = DstSize < LHSSize ? TargetOpcode::G_TRUNC : TargetOpcode::G_ZEXT;
+ if (!isLegalOrBeforeLegalizer({Op, {DstTy, LHSTy}}))
+ return false;
+ MatchInfo = [=](MachineIRBuilder &B) { B.buildInstr(Op, {Dst}, {LHS}); };
+ return true;
+}
+
+// Replace (and (or x, c1), c2) with (and x, c2) iff c1 & c2 == 0
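+ // e.g. (and (or x, 0xff00), 0x00ff) -> (and x, 0x00ff), since the OR cannot
+ // set any bit that the AND keeps.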
+bool CombinerHelper::matchAndOrDisjointMask(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+
+ // Ignore vector types to simplify matching the two constants.
+ // TODO: do this for vectors and scalars via a demanded bits analysis.
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if (Ty.isVector())
+ return false;
+
+ Register Src;
+ Register AndMaskReg;
+ int64_t AndMaskBits;
+ int64_t OrMaskBits;
+ if (!mi_match(MI, MRI,
+ m_GAnd(m_GOr(m_Reg(Src), m_ICst(OrMaskBits)),
+ m_all_of(m_ICst(AndMaskBits), m_Reg(AndMaskReg)))))
+ return false;
+
+  // The G_OR is redundant only if OrMask cannot turn on any bits that
+  // survive AndMask, i.e. the two masks must be disjoint.
+ if (AndMaskBits & OrMaskBits)
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Observer.changingInstr(MI);
+ // Canonicalize the result to have the constant on the RHS.
+ if (MI.getOperand(1).getReg() == AndMaskReg)
+ MI.getOperand(2).setReg(AndMaskReg);
+ MI.getOperand(1).setReg(Src);
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
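+
+// Worked example (constants chosen for illustration): with c1 = 0xF0 and
+// c2 = 0x0F the masks are disjoint (0xF0 & 0x0F == 0), so
+//
+//   %or  = G_OR  %x, 0xF0
+//   %and = G_AND %or, 0x0F
+//
+// is rewritten in place to %and = G_AND %x, 0x0F.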
+
+/// Form a G_SBFX from a G_SEXT_INREG fed by a right shift.
+bool CombinerHelper::matchBitfieldExtractFromSExtInReg(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(Src);
+ LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ if (!LI || !LI->isLegalOrCustom({TargetOpcode::G_SBFX, {Ty, ExtractTy}}))
+ return false;
+ int64_t Width = MI.getOperand(2).getImm();
+ Register ShiftSrc;
+ int64_t ShiftImm;
+ if (!mi_match(
+ Src, MRI,
+ m_OneNonDBGUse(m_any_of(m_GAShr(m_Reg(ShiftSrc), m_ICst(ShiftImm)),
+ m_GLShr(m_Reg(ShiftSrc), m_ICst(ShiftImm))))))
+ return false;
+ if (ShiftImm < 0 || ShiftImm + Width > Ty.getScalarSizeInBits())
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto Cst1 = B.buildConstant(ExtractTy, ShiftImm);
+ auto Cst2 = B.buildConstant(ExtractTy, Width);
+ B.buildSbfx(Dst, ShiftSrc, Cst1, Cst2);
+ };
+ return true;
+}
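+
+// Worked example (32-bit scalar assumed, and G_SBFX assumed legal for the
+// type): ShiftImm = 4 and Width = 8 satisfy ShiftImm + Width <= 32, so
+//
+//   %shr  = G_ASHR %x, 4
+//   %sext = G_SEXT_INREG %shr, 8
+//
+// becomes %sext = G_SBFX %x, 4 (LSB), 8 (width), i.e. a sign-extended
+// extract of bits [4, 12).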
+
+/// Form a G_UBFX from "(a srl b) & mask", where b and mask are constants.
+bool CombinerHelper::matchBitfieldExtractFromAnd(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ if (!getTargetLowering().isConstantUnsignedBitfieldExtractLegal(
+ TargetOpcode::G_UBFX, Ty, ExtractTy))
+ return false;
+
+ int64_t AndImm, LSBImm;
+ Register ShiftSrc;
+ const unsigned Size = Ty.getScalarSizeInBits();
+ if (!mi_match(MI.getOperand(0).getReg(), MRI,
+ m_GAnd(m_OneNonDBGUse(m_GLShr(m_Reg(ShiftSrc), m_ICst(LSBImm))),
+ m_ICst(AndImm))))
+ return false;
+
+ // The mask is a mask of the low bits iff imm & (imm+1) == 0.
+ auto MaybeMask = static_cast<uint64_t>(AndImm);
+ if (MaybeMask & (MaybeMask + 1))
+ return false;
+
+ // LSB must fit within the register.
+ if (static_cast<uint64_t>(LSBImm) >= Size)
+ return false;
+
+ uint64_t Width = APInt(Size, AndImm).countr_one();
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto WidthCst = B.buildConstant(ExtractTy, Width);
+ auto LSBCst = B.buildConstant(ExtractTy, LSBImm);
+ B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {ShiftSrc, LSBCst, WidthCst});
+ };
+ return true;
+}
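+
+// Worked example (32-bit scalar assumed, and constant G_UBFX assumed legal
+// for the type): AndImm = 0xFF is a mask of the low bits
+// (0xFF & (0xFF + 1) == 0) and LSBImm = 4 < 32, so
+//
+//   %shr = G_LSHR %x, 4
+//   %and = G_AND  %shr, 0xFF
+//
+// becomes %and = G_UBFX %x, 4 (LSB), 8 (width).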
+
+bool CombinerHelper::matchBitfieldExtractFromShr(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ const unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_ASHR || Opcode == TargetOpcode::G_LSHR);
+
+ const Register Dst = MI.getOperand(0).getReg();
+
+ const unsigned ExtrOpcode = Opcode == TargetOpcode::G_ASHR
+ ? TargetOpcode::G_SBFX
+ : TargetOpcode::G_UBFX;
+
+ // Check if the type we would use for the extract is legal
+ LLT Ty = MRI.getType(Dst);
+ LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ if (!LI || !LI->isLegalOrCustom({ExtrOpcode, {Ty, ExtractTy}}))
+ return false;
+
+ Register ShlSrc;
+ int64_t ShrAmt;
+ int64_t ShlAmt;
+ const unsigned Size = Ty.getScalarSizeInBits();
+
+ // Try to match shr (shl x, c1), c2
+ if (!mi_match(Dst, MRI,
+ m_BinOp(Opcode,
+ m_OneNonDBGUse(m_GShl(m_Reg(ShlSrc), m_ICst(ShlAmt))),
+ m_ICst(ShrAmt))))
+ return false;
+
+ // Make sure that the shift sizes can fit a bitfield extract
+ if (ShlAmt < 0 || ShlAmt > ShrAmt || ShrAmt >= Size)
+ return false;
+
+ // Skip this combine if the G_SEXT_INREG combine could handle it
+ if (Opcode == TargetOpcode::G_ASHR && ShlAmt == ShrAmt)
+ return false;
+
+ // Calculate start position and width of the extract
+ const int64_t Pos = ShrAmt - ShlAmt;
+ const int64_t Width = Size - ShrAmt;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto WidthCst = B.buildConstant(ExtractTy, Width);
+ auto PosCst = B.buildConstant(ExtractTy, Pos);
+ B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst});
+ };
+ return true;
+}
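+
+// Worked example (32-bit scalar assumed, extract opcode assumed legal): with
+// ShlAmt = 8 and ShrAmt = 20,
+//
+//   %shl = G_SHL  %x, 8
+//   %shr = G_LSHR %shl, 20
+//
+// extracts bits [12, 24) of %x, so it becomes
+// %shr = G_UBFX %x, 12 (Pos = 20 - 8), 12 (Width = 32 - 20).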
+
+bool CombinerHelper::matchBitfieldExtractFromShrAnd(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ const unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_LSHR || Opcode == TargetOpcode::G_ASHR);
+
+ const Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ if (!getTargetLowering().isConstantUnsignedBitfieldExtractLegal(
+ TargetOpcode::G_UBFX, Ty, ExtractTy))
+ return false;
+
+ // Try to match shr (and x, c1), c2
+ Register AndSrc;
+ int64_t ShrAmt;
+ int64_t SMask;
+ if (!mi_match(Dst, MRI,
+ m_BinOp(Opcode,
+ m_OneNonDBGUse(m_GAnd(m_Reg(AndSrc), m_ICst(SMask))),
+ m_ICst(ShrAmt))))
+ return false;
+
+ const unsigned Size = Ty.getScalarSizeInBits();
+ if (ShrAmt < 0 || ShrAmt >= Size)
+ return false;
+
+ // If the shift subsumes the mask, emit the 0 directly.
+ if (0 == (SMask >> ShrAmt)) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildConstant(Dst, 0);
+ };
+ return true;
+ }
+
+ // Check that ubfx can do the extraction, with no holes in the mask.
+ uint64_t UMask = SMask;
+ UMask |= maskTrailingOnes<uint64_t>(ShrAmt);
+ UMask &= maskTrailingOnes<uint64_t>(Size);
+ if (!isMask_64(UMask))
+ return false;
+
+ // Calculate start position and width of the extract.
+ const int64_t Pos = ShrAmt;
+ const int64_t Width = llvm::countr_one(UMask) - ShrAmt;
+
+ // It's preferable to keep the shift, rather than form G_SBFX.
+ // TODO: remove the G_AND via demanded bits analysis.
+ if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto WidthCst = B.buildConstant(ExtractTy, Width);
+ auto PosCst = B.buildConstant(ExtractTy, Pos);
+ B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
+ };
+ return true;
+}
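+
+// Worked example (32-bit scalar assumed, constant G_UBFX assumed legal): with
+// SMask = 0xFF0 and ShrAmt = 4, UMask becomes 0xFFF (a contiguous mask), so
+//
+//   %and = G_AND  %x, 0xFF0
+//   %shr = G_LSHR %and, 4
+//
+// becomes %shr = G_UBFX %x, 4 (Pos), 8 (Width = countr_one(0xFFF) - 4).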
+
+bool CombinerHelper::reassociationCanBreakAddressingModePattern(
+ MachineInstr &PtrAdd) {
+ assert(PtrAdd.getOpcode() == TargetOpcode::G_PTR_ADD);
+
+ Register Src1Reg = PtrAdd.getOperand(1).getReg();
+ MachineInstr *Src1Def = getOpcodeDef(TargetOpcode::G_PTR_ADD, Src1Reg, MRI);
+ if (!Src1Def)
+ return false;
+
+ Register Src2Reg = PtrAdd.getOperand(2).getReg();
+
+ if (MRI.hasOneNonDBGUse(Src1Reg))
+ return false;
+
+ auto C1 = getIConstantVRegVal(Src1Def->getOperand(2).getReg(), MRI);
+ if (!C1)
+ return false;
+ auto C2 = getIConstantVRegVal(Src2Reg, MRI);
+ if (!C2)
+ return false;
+
+ const APInt &C1APIntVal = *C1;
+ const APInt &C2APIntVal = *C2;
+ const int64_t CombinedValue = (C1APIntVal + C2APIntVal).getSExtValue();
+
+ for (auto &UseMI : MRI.use_nodbg_instructions(Src1Reg)) {
+ // This combine may end up running before ptrtoint/inttoptr combines
+ // manage to eliminate redundant conversions, so try to look through them.
+ MachineInstr *ConvUseMI = &UseMI;
+ unsigned ConvUseOpc = ConvUseMI->getOpcode();
+ while (ConvUseOpc == TargetOpcode::G_INTTOPTR ||
+ ConvUseOpc == TargetOpcode::G_PTRTOINT) {
+ Register DefReg = ConvUseMI->getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(DefReg))
+ break;
+ ConvUseMI = &*MRI.use_instr_nodbg_begin(DefReg);
+ ConvUseOpc = ConvUseMI->getOpcode();
+ }
+ auto LoadStore = ConvUseOpc == TargetOpcode::G_LOAD ||
+ ConvUseOpc == TargetOpcode::G_STORE;
+ if (!LoadStore)
+ continue;
+ // Is x[offset2] already not a legal addressing mode? If so then
+ // reassociating the constants breaks nothing (we test offset2 because
+ // that's the one we hope to fold into the load or store).
+ TargetLoweringBase::AddrMode AM;
+ AM.HasBaseReg = true;
+ AM.BaseOffs = C2APIntVal.getSExtValue();
+ unsigned AS =
+ MRI.getType(ConvUseMI->getOperand(1).getReg()).getAddressSpace();
+ Type *AccessTy =
+ getTypeForLLT(MRI.getType(ConvUseMI->getOperand(0).getReg()),
+ PtrAdd.getMF()->getFunction().getContext());
+ const auto &TLI = *PtrAdd.getMF()->getSubtarget().getTargetLowering();
+ if (!TLI.isLegalAddressingMode(PtrAdd.getMF()->getDataLayout(), AM,
+ AccessTy, AS))
+ continue;
+
+ // Would x[offset1+offset2] still be a legal addressing mode?
+ AM.BaseOffs = CombinedValue;
+ if (!TLI.isLegalAddressingMode(PtrAdd.getMF()->getDataLayout(), AM,
+ AccessTy, AS))
+ return true;
+ }
+
+ return false;
+}
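+
+// Numeric illustration (the offset range is a hypothetical target property,
+// used only for exposition): if immediate offsets in [0, 4095] are legal,
+// then with C1 = 4000 and C2 = 200 the existing offset 200 can fold into the
+// access, but the combined offset 4200 could not, so this helper returns
+// true and the reassociation is skipped.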
+
+bool CombinerHelper::matchReassocConstantInnerRHS(GPtrAdd &MI,
+ MachineInstr *RHS,
+ BuildFnTy &MatchInfo) {
+ // G_PTR_ADD(BASE, G_ADD(X, C)) -> G_PTR_ADD(G_PTR_ADD(BASE, X), C)
+ Register Src1Reg = MI.getOperand(1).getReg();
+ if (RHS->getOpcode() != TargetOpcode::G_ADD)
+ return false;
+ auto C2 = getIConstantVRegVal(RHS->getOperand(2).getReg(), MRI);
+ if (!C2)
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ LLT PtrTy = MRI.getType(MI.getOperand(0).getReg());
+
+ auto NewBase =
+ Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg());
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(NewBase.getReg(0));
+ MI.getOperand(2).setReg(RHS->getOperand(2).getReg());
+ Observer.changedInstr(MI);
+ };
+ return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI,
+ MachineInstr *LHS,
+ MachineInstr *RHS,
+ BuildFnTy &MatchInfo) {
+  // G_PTR_ADD(G_PTR_ADD(X, C), Y) -> G_PTR_ADD(G_PTR_ADD(X, Y), C)
+ // if and only if (G_PTR_ADD X, C) has one use.
+ Register LHSBase;
+ std::optional<ValueAndVReg> LHSCstOff;
+ if (!mi_match(MI.getBaseReg(), MRI,
+ m_OneNonDBGUse(m_GPtrAdd(m_Reg(LHSBase), m_GCst(LHSCstOff)))))
+ return false;
+
+ auto *LHSPtrAdd = cast<GPtrAdd>(LHS);
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    // When we change LHSPtrAdd's offset register we might cause it to use a
+    // register before its def. Sink LHSPtrAdd so it sits just before the
+    // outer PTR_ADD to ensure this doesn't happen.
+ LHSPtrAdd->moveBefore(&MI);
+ Register RHSReg = MI.getOffsetReg();
+    // Reusing the constant's VReg directly could cause a type mismatch if it
+    // comes from an extend/trunc, so build a fresh constant of the offset's
+    // type.
+ auto NewCst = B.buildConstant(MRI.getType(RHSReg), LHSCstOff->Value);
+ Observer.changingInstr(MI);
+ MI.getOperand(2).setReg(NewCst.getReg(0));
+ Observer.changedInstr(MI);
+ Observer.changingInstr(*LHSPtrAdd);
+ LHSPtrAdd->getOperand(2).setReg(RHSReg);
+ Observer.changedInstr(*LHSPtrAdd);
+ };
+ return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocFoldConstantsInSubTree(GPtrAdd &MI,
+ MachineInstr *LHS,
+ MachineInstr *RHS,
+ BuildFnTy &MatchInfo) {
+ // G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1+C2)
+ auto *LHSPtrAdd = dyn_cast<GPtrAdd>(LHS);
+ if (!LHSPtrAdd)
+ return false;
+
+ Register Src2Reg = MI.getOperand(2).getReg();
+ Register LHSSrc1 = LHSPtrAdd->getBaseReg();
+ Register LHSSrc2 = LHSPtrAdd->getOffsetReg();
+ auto C1 = getIConstantVRegVal(LHSSrc2, MRI);
+ if (!C1)
+ return false;
+ auto C2 = getIConstantVRegVal(Src2Reg, MRI);
+ if (!C2)
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(LHSSrc1);
+ MI.getOperand(2).setReg(NewCst.getReg(0));
+ Observer.changedInstr(MI);
+ };
+ return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocPtrAdd(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ auto &PtrAdd = cast<GPtrAdd>(MI);
+ // We're trying to match a few pointer computation patterns here for
+ // re-association opportunities.
+ // 1) Isolating a constant operand to be on the RHS, e.g.:
+ // G_PTR_ADD(BASE, G_ADD(X, C)) -> G_PTR_ADD(G_PTR_ADD(BASE, X), C)
+ //
+ // 2) Folding two constants in each sub-tree as long as such folding
+ // doesn't break a legal addressing mode.
+ // G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1+C2)
+ //
+ // 3) Move a constant from the LHS of an inner op to the RHS of the outer.
+  //   G_PTR_ADD(G_PTR_ADD(X, C), Y) -> G_PTR_ADD(G_PTR_ADD(X, Y), C)
+  //   iff (G_PTR_ADD X, C) has one use.
+ MachineInstr *LHS = MRI.getVRegDef(PtrAdd.getBaseReg());
+ MachineInstr *RHS = MRI.getVRegDef(PtrAdd.getOffsetReg());
+
+ // Try to match example 2.
+ if (matchReassocFoldConstantsInSubTree(PtrAdd, LHS, RHS, MatchInfo))
+ return true;
+
+ // Try to match example 3.
+ if (matchReassocConstantInnerLHS(PtrAdd, LHS, RHS, MatchInfo))
+ return true;
+
+ // Try to match example 1.
+ if (matchReassocConstantInnerRHS(PtrAdd, RHS, MatchInfo))
+ return true;
+
+ return false;
+}
+
+bool CombinerHelper::tryReassocBinOp(unsigned Opc, Register DstReg,
+ Register OpLHS, Register OpRHS,
+ BuildFnTy &MatchInfo) {
+ LLT OpRHSTy = MRI.getType(OpRHS);
+ MachineInstr *OpLHSDef = MRI.getVRegDef(OpLHS);
+
+ if (OpLHSDef->getOpcode() != Opc)
+ return false;
+
+ MachineInstr *OpRHSDef = MRI.getVRegDef(OpRHS);
+ Register OpLHSLHS = OpLHSDef->getOperand(1).getReg();
+ Register OpLHSRHS = OpLHSDef->getOperand(2).getReg();
+
+ // If the inner op is (X op C), pull the constant out so it can be folded with
+ // other constants in the expression tree. Folding is not guaranteed so we
+ // might have (C1 op C2). In that case do not pull a constant out because it
+ // won't help and can lead to infinite loops.
+ if (isConstantOrConstantSplatVector(*MRI.getVRegDef(OpLHSRHS), MRI) &&
+ !isConstantOrConstantSplatVector(*MRI.getVRegDef(OpLHSLHS), MRI)) {
+ if (isConstantOrConstantSplatVector(*OpRHSDef, MRI)) {
+ // (Opc (Opc X, C1), C2) -> (Opc X, (Opc C1, C2))
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto NewCst = B.buildInstr(Opc, {OpRHSTy}, {OpLHSRHS, OpRHS});
+ B.buildInstr(Opc, {DstReg}, {OpLHSLHS, NewCst});
+ };
+ return true;
+ }
+ if (getTargetLowering().isReassocProfitable(MRI, OpLHS, OpRHS)) {
+ // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
+ // iff (op x, c1) has one use
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto NewLHSLHS = B.buildInstr(Opc, {OpRHSTy}, {OpLHSLHS, OpRHS});
+ B.buildInstr(Opc, {DstReg}, {NewLHSLHS, OpLHSRHS});
+ };
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchReassocCommBinOp(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ // We don't check if the reassociation will break a legal addressing mode
+ // here since pointer arithmetic is handled by G_PTR_ADD.
+ unsigned Opc = MI.getOpcode();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register LHSReg = MI.getOperand(1).getReg();
+ Register RHSReg = MI.getOperand(2).getReg();
+
+ if (tryReassocBinOp(Opc, DstReg, LHSReg, RHSReg, MatchInfo))
+ return true;
+ if (tryReassocBinOp(Opc, DstReg, RHSReg, LHSReg, MatchInfo))
+ return true;
+ return false;
+}
+
+bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) {
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ auto MaybeCst = ConstantFoldBinOp(MI.getOpcode(), Op1, Op2, MRI);
+ if (!MaybeCst)
+ return false;
+ MatchInfo = *MaybeCst;
+ return true;
+}
+
+bool CombinerHelper::matchNarrowBinopFeedingAnd(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ // Look for a binop feeding into an AND with a mask:
+ //
+ // %add = G_ADD %lhs, %rhs
+ // %and = G_AND %add, 000...11111111
+ //
+ // Check if it's possible to perform the binop at a narrower width and zext
+ // back to the original width like so:
+ //
+ // %narrow_lhs = G_TRUNC %lhs
+ // %narrow_rhs = G_TRUNC %rhs
+ // %narrow_add = G_ADD %narrow_lhs, %narrow_rhs
+ // %new_add = G_ZEXT %narrow_add
+ // %and = G_AND %new_add, 000...11111111
+ //
+ // This can allow later combines to eliminate the G_AND if it turns out
+ // that the mask is irrelevant.
+ assert(MI.getOpcode() == TargetOpcode::G_AND);
+ Register Dst = MI.getOperand(0).getReg();
+ Register AndLHS = MI.getOperand(1).getReg();
+ Register AndRHS = MI.getOperand(2).getReg();
+ LLT WideTy = MRI.getType(Dst);
+
+ // If the potential binop has more than one use, then it's possible that one
+ // of those uses will need its full width.
+ if (!WideTy.isScalar() || !MRI.hasOneNonDBGUse(AndLHS))
+ return false;
+
+ // Check if the LHS feeding the AND is impacted by the high bits that we're
+ // masking out.
+ //
+ // e.g. for 64-bit x, y:
+ //
+ // add_64(x, y) & 65535 == zext(add_16(trunc(x), trunc(y))) & 65535
+ MachineInstr *LHSInst = getDefIgnoringCopies(AndLHS, MRI);
+ if (!LHSInst)
+ return false;
+ unsigned LHSOpc = LHSInst->getOpcode();
+ switch (LHSOpc) {
+ default:
+ return false;
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR:
+ break;
+ }
+
+ // Find the mask on the RHS.
+ auto Cst = getIConstantVRegValWithLookThrough(AndRHS, MRI);
+ if (!Cst)
+ return false;
+ auto Mask = Cst->Value;
+ if (!Mask.isMask())
+ return false;
+
+ // No point in combining if there's nothing to truncate.
+ unsigned NarrowWidth = Mask.countr_one();
+ if (NarrowWidth == WideTy.getSizeInBits())
+ return false;
+ LLT NarrowTy = LLT::scalar(NarrowWidth);
+
+ // Check if adding the zext + truncates could be harmful.
+ auto &MF = *MI.getMF();
+ const auto &TLI = getTargetLowering();
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ if (!TLI.isTruncateFree(WideTy, NarrowTy, DL, Ctx) ||
+ !TLI.isZExtFree(NarrowTy, WideTy, DL, Ctx))
+ return false;
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {NarrowTy, WideTy}}) ||
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {WideTy, NarrowTy}}))
+ return false;
+ Register BinOpLHS = LHSInst->getOperand(1).getReg();
+ Register BinOpRHS = LHSInst->getOperand(2).getReg();
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ auto NarrowLHS = Builder.buildTrunc(NarrowTy, BinOpLHS);
+ auto NarrowRHS = Builder.buildTrunc(NarrowTy, BinOpRHS);
+ auto NarrowBinOp =
+ Builder.buildInstr(LHSOpc, {NarrowTy}, {NarrowLHS, NarrowRHS});
+ auto Ext = Builder.buildZExt(WideTy, NarrowBinOp);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Ext.getReg(0));
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
+bool CombinerHelper::matchMulOBy2(MachineInstr &MI, BuildFnTy &MatchInfo) {
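+  // (G_*MULO x, 2) -> (G_*ADDO x, x)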
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_UMULO || Opc == TargetOpcode::G_SMULO);
+
+ if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(2)))
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Observer.changingInstr(MI);
+ unsigned NewOpc = Opc == TargetOpcode::G_UMULO ? TargetOpcode::G_UADDO
+ : TargetOpcode::G_SADDO;
+ MI.setDesc(Builder.getTII().get(NewOpc));
+ MI.getOperand(3).setReg(MI.getOperand(2).getReg());
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
+bool CombinerHelper::matchMulOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ // (G_*MULO x, 0) -> 0 + no carry out
+ assert(MI.getOpcode() == TargetOpcode::G_UMULO ||
+ MI.getOpcode() == TargetOpcode::G_SMULO);
+ if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
+ return false;
+ Register Dst = MI.getOperand(0).getReg();
+ Register Carry = MI.getOperand(1).getReg();
+ if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Dst)) ||
+ !isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
+ return false;
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildConstant(Dst, 0);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+}
+
+bool CombinerHelper::matchAddOBy0(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ // (G_*ADDO x, 0) -> x + no carry out
+ assert(MI.getOpcode() == TargetOpcode::G_UADDO ||
+ MI.getOpcode() == TargetOpcode::G_SADDO);
+ if (!mi_match(MI.getOperand(3).getReg(), MRI, m_SpecificICstOrSplat(0)))
+ return false;
+ Register Carry = MI.getOperand(1).getReg();
+ if (!isConstantLegalOrBeforeLegalizer(MRI.getType(Carry)))
+ return false;
+ Register Dst = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(2).getReg();
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildCopy(Dst, LHS);
+ B.buildConstant(Carry, 0);
+ };
+ return true;
+}
+
+bool CombinerHelper::matchAddEToAddO(MachineInstr &MI, BuildFnTy &MatchInfo) {
+ // (G_*ADDE x, y, 0) -> (G_*ADDO x, y)
+ // (G_*SUBE x, y, 0) -> (G_*SUBO x, y)
+ assert(MI.getOpcode() == TargetOpcode::G_UADDE ||
+ MI.getOpcode() == TargetOpcode::G_SADDE ||
+ MI.getOpcode() == TargetOpcode::G_USUBE ||
+ MI.getOpcode() == TargetOpcode::G_SSUBE);
+ if (!mi_match(MI.getOperand(4).getReg(), MRI, m_SpecificICstOrSplat(0)))
+ return false;
+ MatchInfo = [&](MachineIRBuilder &B) {
+ unsigned NewOpcode;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_UADDE:
+ NewOpcode = TargetOpcode::G_UADDO;
+ break;
+ case TargetOpcode::G_SADDE:
+ NewOpcode = TargetOpcode::G_SADDO;
+ break;
+ case TargetOpcode::G_USUBE:
+ NewOpcode = TargetOpcode::G_USUBO;
+ break;
+ case TargetOpcode::G_SSUBE:
+ NewOpcode = TargetOpcode::G_SSUBO;
+ break;
+ }
+ Observer.changingInstr(MI);
+ MI.setDesc(B.getTII().get(NewOpcode));
+ MI.removeOperand(4);
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
+bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SUB);
+ Register Dst = MI.getOperand(0).getReg();
+ // (x + y) - z -> x (if y == z)
+ // (x + y) - z -> y (if x == z)
+ Register X, Y, Z;
+ if (mi_match(Dst, MRI, m_GSub(m_GAdd(m_Reg(X), m_Reg(Y)), m_Reg(Z)))) {
+ Register ReplaceReg;
+ int64_t CstX, CstY;
+ if (Y == Z || (mi_match(Y, MRI, m_ICstOrSplat(CstY)) &&
+ mi_match(Z, MRI, m_SpecificICstOrSplat(CstY))))
+ ReplaceReg = X;
+ else if (X == Z || (mi_match(X, MRI, m_ICstOrSplat(CstX)) &&
+ mi_match(Z, MRI, m_SpecificICstOrSplat(CstX))))
+ ReplaceReg = Y;
+ if (ReplaceReg) {
+ MatchInfo = [=](MachineIRBuilder &B) { B.buildCopy(Dst, ReplaceReg); };
+ return true;
+ }
+ }
+
+ // x - (y + z) -> 0 - y (if x == z)
+ // x - (y + z) -> 0 - z (if x == y)
+ if (mi_match(Dst, MRI, m_GSub(m_Reg(X), m_GAdd(m_Reg(Y), m_Reg(Z))))) {
+ Register ReplaceReg;
+ int64_t CstX;
+ if (X == Z || (mi_match(X, MRI, m_ICstOrSplat(CstX)) &&
+ mi_match(Z, MRI, m_SpecificICstOrSplat(CstX))))
+ ReplaceReg = Y;
+ else if (X == Y || (mi_match(X, MRI, m_ICstOrSplat(CstX)) &&
+ mi_match(Y, MRI, m_SpecificICstOrSplat(CstX))))
+ ReplaceReg = Z;
+ if (ReplaceReg) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto Zero = B.buildConstant(MRI.getType(Dst), 0);
+ B.buildSub(Dst, Zero, ReplaceReg);
+ };
+ return true;
+ }
+ }
+ return false;
+}
+
+MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+ auto &UDiv = cast<GenericMachineInstr>(MI);
+ Register Dst = UDiv.getReg(0);
+ Register LHS = UDiv.getReg(1);
+ Register RHS = UDiv.getReg(2);
+ LLT Ty = MRI.getType(Dst);
+ LLT ScalarTy = Ty.getScalarType();
+ const unsigned EltBits = ScalarTy.getScalarSizeInBits();
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
+ auto &MIB = Builder;
+ MIB.setInstrAndDebugLoc(MI);
+
+ bool UseNPQ = false;
+ SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+
+ auto BuildUDIVPattern = [&](const Constant *C) {
+ auto *CI = cast<ConstantInt>(C);
+ const APInt &Divisor = CI->getValue();
+
+ bool SelNPQ = false;
+ APInt Magic(Divisor.getBitWidth(), 0);
+ unsigned PreShift = 0, PostShift = 0;
+
+    // The magic-number algorithm doesn't work for division by 1, so we need
+    // to emit a select at the end.
+    // TODO: Use undef values for a divisor of 1.
+ if (!Divisor.isOne()) {
+ UnsignedDivisionByConstantInfo magics =
+ UnsignedDivisionByConstantInfo::get(Divisor);
+
+ Magic = std::move(magics.Magic);
+
+ assert(magics.PreShift < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ assert(magics.PostShift < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ assert((!magics.IsAdd || magics.PreShift == 0) && "Unexpected pre-shift");
+ PreShift = magics.PreShift;
+ PostShift = magics.PostShift;
+ SelNPQ = magics.IsAdd;
+ }
+
+ PreShifts.push_back(
+ MIB.buildConstant(ScalarShiftAmtTy, PreShift).getReg(0));
+ MagicFactors.push_back(MIB.buildConstant(ScalarTy, Magic).getReg(0));
+ NPQFactors.push_back(
+ MIB.buildConstant(ScalarTy,
+ SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
+ : APInt::getZero(EltBits))
+ .getReg(0));
+ PostShifts.push_back(
+ MIB.buildConstant(ScalarShiftAmtTy, PostShift).getReg(0));
+ UseNPQ |= SelNPQ;
+ return true;
+ };
+
+ // Collect the shifts/magic values from each element.
+ bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
+ (void)Matched;
+ assert(Matched && "Expected unary predicate match to succeed");
+
+ Register PreShift, PostShift, MagicFactor, NPQFactor;
+ auto *RHSDef = getOpcodeDef<GBuildVector>(RHS, MRI);
+ if (RHSDef) {
+ PreShift = MIB.buildBuildVector(ShiftAmtTy, PreShifts).getReg(0);
+ MagicFactor = MIB.buildBuildVector(Ty, MagicFactors).getReg(0);
+ NPQFactor = MIB.buildBuildVector(Ty, NPQFactors).getReg(0);
+ PostShift = MIB.buildBuildVector(ShiftAmtTy, PostShifts).getReg(0);
+ } else {
+ assert(MRI.getType(RHS).isScalar() &&
+ "Non-build_vector operation should have been a scalar");
+ PreShift = PreShifts[0];
+ MagicFactor = MagicFactors[0];
+ PostShift = PostShifts[0];
+ }
+
+ Register Q = LHS;
+ Q = MIB.buildLShr(Ty, Q, PreShift).getReg(0);
+
+ // Multiply the numerator (operand 0) by the magic value.
+ Q = MIB.buildUMulH(Ty, Q, MagicFactor).getReg(0);
+
+ if (UseNPQ) {
+ Register NPQ = MIB.buildSub(Ty, LHS, Q).getReg(0);
+
+ // For vectors we might have a mix of non-NPQ/NPQ paths, so use
+ // G_UMULH to act as a SRL-by-1 for NPQ, else multiply by zero.
+ if (Ty.isVector())
+ NPQ = MIB.buildUMulH(Ty, NPQ, NPQFactor).getReg(0);
+ else
+ NPQ = MIB.buildLShr(Ty, NPQ, MIB.buildConstant(ShiftAmtTy, 1)).getReg(0);
+
+ Q = MIB.buildAdd(Ty, NPQ, Q).getReg(0);
+ }
+
+ Q = MIB.buildLShr(Ty, Q, PostShift).getReg(0);
+ auto One = MIB.buildConstant(Ty, 1);
+ auto IsOne = MIB.buildICmp(
+ CmpInst::Predicate::ICMP_EQ,
+ Ty.isScalar() ? LLT::scalar(1) : Ty.changeElementSize(1), RHS, One);
+ return MIB.buildSelect(Ty, IsOne, LHS, Q);
+}
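+
+// Worked example (32-bit scalar; the magic constant shown is the well-known
+// value UnsignedDivisionByConstantInfo produces for a divisor of 5, given
+// here for illustration):
+//
+//   %q = G_UDIV %x:_(s32), 5
+//
+// is expanded to
+//
+//   %m   = G_UMULH %x, 0xCCCCCCCD   ; PreShift = 0, no NPQ path
+//   %res = G_LSHR  %m, 2            ; PostShift = 2
+//
+// plus a G_LSHR by PreShift = 0 and a trailing select for the divisor == 1
+// case, both of which are expected to fold away for this constant divisor.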
+
+bool CombinerHelper::matchUDivByConst(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UDIV);
+ Register Dst = MI.getOperand(0).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ auto *RHSDef = MRI.getVRegDef(RHS);
+ if (!isConstantOrConstantVector(*RHSDef, MRI))
+ return false;
+
+ auto &MF = *MI.getMF();
+ AttributeList Attr = MF.getFunction().getAttributes();
+ const auto &TLI = getTargetLowering();
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, DL, Ctx), Attr))
+ return false;
+
+ // Don't do this for minsize because the instruction sequence is usually
+ // larger.
+ if (MF.getFunction().hasMinSize())
+ return false;
+
+ // Don't do this if the types are not going to be legal.
+ if (LI) {
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))
+ return false;
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_UMULH, {DstTy}}))
+ return false;
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_ICMP,
+ {DstTy.isVector() ? DstTy.changeElementSize(1) : LLT::scalar(1),
+ DstTy}}))
+ return false;
+ }
+
+ auto CheckEltValue = [&](const Constant *C) {
+ if (auto *CI = dyn_cast_or_null<ConstantInt>(C))
+ return !CI->isZero();
+ return false;
+ };
+ return matchUnaryPredicate(MRI, RHS, CheckEltValue);
+}
+
+void CombinerHelper::applyUDivByConst(MachineInstr &MI) {
+ auto *NewMI = buildUDivUsingMul(MI);
+ replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
+}
+
+bool CombinerHelper::matchSDivByConst(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+ Register Dst = MI.getOperand(0).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(Dst);
+
+ auto &MF = *MI.getMF();
+ AttributeList Attr = MF.getFunction().getAttributes();
+ const auto &TLI = getTargetLowering();
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ if (TLI.isIntDivCheap(getApproximateEVTForLLT(DstTy, DL, Ctx), Attr))
+ return false;
+
+ // Don't do this for minsize because the instruction sequence is usually
+ // larger.
+ if (MF.getFunction().hasMinSize())
+ return false;
+
+ // If the sdiv has an 'exact' flag we can use a simpler lowering.
+ if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+ return matchUnaryPredicate(
+ MRI, RHS, [](const Constant *C) { return C && !C->isZeroValue(); });
+ }
+
+ // Don't support the general case for now.
+ return false;
+}
+
+void CombinerHelper::applySDivByConst(MachineInstr &MI) {
+ auto *NewMI = buildSDivUsingMul(MI);
+ replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg());
+}
+
+MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV");
+ auto &SDiv = cast<GenericMachineInstr>(MI);
+ Register Dst = SDiv.getReg(0);
+ Register LHS = SDiv.getReg(1);
+ Register RHS = SDiv.getReg(2);
+ LLT Ty = MRI.getType(Dst);
+ LLT ScalarTy = Ty.getScalarType();
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ LLT ScalarShiftAmtTy = ShiftAmtTy.getScalarType();
+ auto &MIB = Builder;
+ MIB.setInstrAndDebugLoc(MI);
+
+ bool UseSRA = false;
+ SmallVector<Register, 16> Shifts, Factors;
+
+ auto *RHSDef = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
+ bool IsSplat = getIConstantSplatVal(*RHSDef, MRI).has_value();
+
+ auto BuildSDIVPattern = [&](const Constant *C) {
+ // Don't recompute inverses for each splat element.
+ if (IsSplat && !Factors.empty()) {
+ Shifts.push_back(Shifts[0]);
+ Factors.push_back(Factors[0]);
+ return true;
+ }
+
+ auto *CI = cast<ConstantInt>(C);
+ APInt Divisor = CI->getValue();
+ unsigned Shift = Divisor.countr_zero();
+ if (Shift) {
+ Divisor.ashrInPlace(Shift);
+ UseSRA = true;
+ }
+
+ // Calculate the multiplicative inverse modulo BW.
+ // 2^W requires W + 1 bits, so we have to extend and then truncate.
+ unsigned W = Divisor.getBitWidth();
+ APInt Factor = Divisor.zext(W + 1)
+ .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
+ .trunc(W);
+ Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
+ Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
+ return true;
+ };
+
+ // Collect all magic values from the build vector.
+ bool Matched = matchUnaryPredicate(MRI, RHS, BuildSDIVPattern);
+ (void)Matched;
+ assert(Matched && "Expected unary predicate match to succeed");
+
+ Register Shift, Factor;
+ if (Ty.isVector()) {
+ Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+ Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+ } else {
+ Shift = Shifts[0];
+ Factor = Factors[0];
+ }
+
+ Register Res = LHS;
+
+ if (UseSRA)
+ Res = MIB.buildAShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+ return MIB.buildMul(Ty, Res, Factor);
+}
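+
+// Worked example for the exact-division path (32-bit scalar; the factor is
+// the standard multiplicative inverse of 3 modulo 2^32, shown for
+// illustration):
+//
+//   %q = G_SDIV exact %x:_(s32), 6
+//
+// is expanded to
+//
+//   %s = G_ASHR exact %x, 1     ; strip the power-of-two factor
+//   %q = G_MUL %s, 0xAAAAAAAB   ; 3 * 0xAAAAAAAB == 1 (mod 2^32)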
+
+bool CombinerHelper::matchUMulHToLShr(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UMULH);
+ Register RHS = MI.getOperand(2).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ auto MatchPow2ExceptOne = [&](const Constant *C) {
+ if (auto *CI = dyn_cast<ConstantInt>(C))
+ return CI->getValue().isPowerOf2() && !CI->getValue().isOne();
+ return false;
+ };
+ if (!matchUnaryPredicate(MRI, RHS, MatchPow2ExceptOne, false))
+ return false;
+ return isLegalOrBeforeLegalizer({TargetOpcode::G_LSHR, {Ty, ShiftAmtTy}});
+}
+
+void CombinerHelper::applyUMulHToLShr(MachineInstr &MI) {
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ unsigned NumEltBits = Ty.getScalarSizeInBits();
+
+ Builder.setInstrAndDebugLoc(MI);
+ auto LogBase2 = buildLogBase2(RHS, Builder);
+ auto ShiftAmt =
+ Builder.buildSub(Ty, Builder.buildConstant(Ty, NumEltBits), LogBase2);
+ auto Trunc = Builder.buildZExtOrTrunc(ShiftAmtTy, ShiftAmt);
+ Builder.buildLShr(Dst, LHS, Trunc);
+ MI.eraseFromParent();
+}
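+
+// Worked example (32-bit scalar assumed): G_UMULH %x, 8 computes
+// (zext(%x) * 8) >> 32, which is just %x >> 29, so it is rewritten roughly as
+//
+//   %log2  = 3                 ; log2 of the power-of-two operand
+//   %shamt = G_SUB 32, %log2   ; then zext/trunc to the shift-amount type
+//   %res   = G_LSHR %x, %shamt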
+
+bool CombinerHelper::matchRedundantNegOperands(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ unsigned Opc = MI.getOpcode();
+ assert(Opc == TargetOpcode::G_FADD || Opc == TargetOpcode::G_FSUB ||
+ Opc == TargetOpcode::G_FMUL || Opc == TargetOpcode::G_FDIV ||
+ Opc == TargetOpcode::G_FMAD || Opc == TargetOpcode::G_FMA);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register X = MI.getOperand(1).getReg();
+ Register Y = MI.getOperand(2).getReg();
+ LLT Type = MRI.getType(Dst);
+
+ // fold (fadd x, fneg(y)) -> (fsub x, y)
+ // fold (fadd fneg(y), x) -> (fsub x, y)
+  // G_FADD is commutative, so both cases are matched by m_GFAdd.
+ if (mi_match(Dst, MRI, m_GFAdd(m_Reg(X), m_GFNeg(m_Reg(Y)))) &&
+ isLegalOrBeforeLegalizer({TargetOpcode::G_FSUB, {Type}})) {
+ Opc = TargetOpcode::G_FSUB;
+ }
+  // fold (fsub x, fneg(y)) -> (fadd x, y)
+ else if (mi_match(Dst, MRI, m_GFSub(m_Reg(X), m_GFNeg(m_Reg(Y)))) &&
+ isLegalOrBeforeLegalizer({TargetOpcode::G_FADD, {Type}})) {
+ Opc = TargetOpcode::G_FADD;
+ }
+ // fold (fmul fneg(x), fneg(y)) -> (fmul x, y)
+ // fold (fdiv fneg(x), fneg(y)) -> (fdiv x, y)
+ // fold (fmad fneg(x), fneg(y), z) -> (fmad x, y, z)
+ // fold (fma fneg(x), fneg(y), z) -> (fma x, y, z)
+ else if ((Opc == TargetOpcode::G_FMUL || Opc == TargetOpcode::G_FDIV ||
+ Opc == TargetOpcode::G_FMAD || Opc == TargetOpcode::G_FMA) &&
+ mi_match(X, MRI, m_GFNeg(m_Reg(X))) &&
+ mi_match(Y, MRI, m_GFNeg(m_Reg(Y)))) {
+ // no opcode change
+ } else
+ return false;
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Observer.changingInstr(MI);
+ MI.setDesc(B.getTII().get(Opc));
+ MI.getOperand(1).setReg(X);
+ MI.getOperand(2).setReg(Y);
+ Observer.changedInstr(MI);
+ };
+ return true;
+}
+
+bool CombinerHelper::matchFsubToFneg(MachineInstr &MI, Register &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FSUB);
+
+ Register LHS = MI.getOperand(1).getReg();
+ MatchInfo = MI.getOperand(2).getReg();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ const auto LHSCst = Ty.isVector()
+ ? getFConstantSplat(LHS, MRI, /* allowUndef */ true)
+ : getFConstantVRegValWithLookThrough(LHS, MRI);
+ if (!LHSCst)
+ return false;
+
+ // -0.0 is always allowed
+ if (LHSCst->Value.isNegZero())
+ return true;
+
+ // +0.0 is only allowed if nsz is set.
+ if (LHSCst->Value.isPosZero())
+ return MI.getFlag(MachineInstr::FmNsz);
+
+ return false;
+}
+
+void CombinerHelper::applyFsubToFneg(MachineInstr &MI, Register &MatchInfo) {
+ Builder.setInstrAndDebugLoc(MI);
+ Register Dst = MI.getOperand(0).getReg();
+ Builder.buildFNeg(
+ Dst, Builder.buildFCanonicalize(MRI.getType(Dst), MatchInfo).getReg(0));
+ eraseInst(MI);
+}
+
+/// Checks if \p MI is TargetOpcode::G_FMUL and contractable either
+/// due to global flags or MachineInstr flags.
+static bool isContractableFMul(MachineInstr &MI, bool AllowFusionGlobally) {
+ if (MI.getOpcode() != TargetOpcode::G_FMUL)
+ return false;
+ return AllowFusionGlobally || MI.getFlag(MachineInstr::MIFlag::FmContract);
+}
+
+static bool hasMoreUses(const MachineInstr &MI0, const MachineInstr &MI1,
+ const MachineRegisterInfo &MRI) {
+ return std::distance(MRI.use_instr_nodbg_begin(MI0.getOperand(0).getReg()),
+ MRI.use_instr_nodbg_end()) >
+ std::distance(MRI.use_instr_nodbg_begin(MI1.getOperand(0).getReg()),
+ MRI.use_instr_nodbg_end());
+}
+
+bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
+ bool &AllowFusionGlobally,
+ bool &HasFMAD, bool &Aggressive,
+ bool CanReassociate) {
+
+ auto *MF = MI.getMF();
+ const auto &TLI = *MF->getSubtarget().getTargetLowering();
+ const TargetOptions &Options = MF->getTarget().Options;
+ LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+
+ if (CanReassociate &&
+ !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc)))
+ return false;
+
+ // Floating-point multiply-add with intermediate rounding.
+ HasFMAD = (!isPreLegalize() && TLI.isFMADLegal(MI, DstType));
+ // Floating-point multiply-add without intermediate rounding.
+ bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(*MF, DstType) &&
+ isLegalOrBeforeLegalizer({TargetOpcode::G_FMA, {DstType}});
+ // No valid opcode, do not combine.
+ if (!HasFMAD && !HasFMA)
+ return false;
+
+ AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath || HasFMAD;
+ // If the addition is not contractable, do not combine.
+ if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract))
+ return false;
+
+ Aggressive = TLI.enableAggressiveFMAFusion(DstType);
+ return true;
+}
+
+bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FADD);
+
+ bool AllowFusionGlobally, HasFMAD, Aggressive;
+ if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
+ return false;
+
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
+ unsigned PreferredFusedOpcode =
+ HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+ // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
+ std::swap(LHS, RHS);
+ }
+
+ // fold (fadd (fmul x, y), z) -> (fma x, y, z)
+ if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), RHS.Reg});
+ };
+ return true;
+ }
+
+ // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
+ if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {RHS.MI->getOperand(1).getReg(),
+ RHS.MI->getOperand(2).getReg(), LHS.Reg});
+ };
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FADD);
+
+ bool AllowFusionGlobally, HasFMAD, Aggressive;
+ if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
+ return false;
+
+ const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
+ LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+
+ unsigned PreferredFusedOpcode =
+ HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+ // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
+ std::swap(LHS, RHS);
+ }
+
+ // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
+ MachineInstr *FpExtSrc;
+ if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
+ isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
+ MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg());
+ auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg());
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {FpExtX.getReg(0), FpExtY.getReg(0), RHS.Reg});
+ };
+ return true;
+ }
+
+ // fold (fadd z, (fpext (fmul x, y))) -> (fma (fpext x), (fpext y), z)
+ // Note: Commutes FADD operands.
+ if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
+ isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
+ MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg());
+ auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg());
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {FpExtX.getReg(0), FpExtY.getReg(0), LHS.Reg});
+ };
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FADD);
+
+ bool AllowFusionGlobally, HasFMAD, Aggressive;
+ if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive, true))
+ return false;
+
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ unsigned PreferredFusedOpcode =
+ HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+ // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
+ std::swap(LHS, RHS);
+ }
+
+ MachineInstr *FMA = nullptr;
+ Register Z;
+ // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
+ if (LHS.MI->getOpcode() == PreferredFusedOpcode &&
+ (MRI.getVRegDef(LHS.MI->getOperand(3).getReg())->getOpcode() ==
+ TargetOpcode::G_FMUL) &&
+ MRI.hasOneNonDBGUse(LHS.MI->getOperand(0).getReg()) &&
+ MRI.hasOneNonDBGUse(LHS.MI->getOperand(3).getReg())) {
+ FMA = LHS.MI;
+ Z = RHS.Reg;
+ }
+ // fold (fadd z, (fma x, y, (fmul u, v))) -> (fma x, y, (fma u, v, z))
+ else if (RHS.MI->getOpcode() == PreferredFusedOpcode &&
+ (MRI.getVRegDef(RHS.MI->getOperand(3).getReg())->getOpcode() ==
+ TargetOpcode::G_FMUL) &&
+ MRI.hasOneNonDBGUse(RHS.MI->getOperand(0).getReg()) &&
+ MRI.hasOneNonDBGUse(RHS.MI->getOperand(3).getReg())) {
+ Z = LHS.Reg;
+ FMA = RHS.MI;
+ }
+
+ if (FMA) {
+ MachineInstr *FMulMI = MRI.getVRegDef(FMA->getOperand(3).getReg());
+ Register X = FMA->getOperand(1).getReg();
+ Register Y = FMA->getOperand(2).getReg();
+ Register U = FMulMI->getOperand(1).getReg();
+ Register V = FMulMI->getOperand(2).getReg();
+
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Register InnerFMA = MRI.createGenericVirtualRegister(DstTy);
+ B.buildInstr(PreferredFusedOpcode, {InnerFMA}, {U, V, Z});
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {X, Y, InnerFMA});
+ };
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FADD);
+
+ bool AllowFusionGlobally, HasFMAD, Aggressive;
+ if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
+ return false;
+
+ if (!Aggressive)
+ return false;
+
+ const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
+ LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
+
+ unsigned PreferredFusedOpcode =
+ HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+ // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
+ std::swap(LHS, RHS);
+ }
+
+ // Builds: (fma x, y, (fma (fpext u), (fpext v), z))
+ auto buildMatchInfo = [=, &MI](Register U, Register V, Register Z, Register X,
+ Register Y, MachineIRBuilder &B) {
+ Register FpExtU = B.buildFPExt(DstType, U).getReg(0);
+ Register FpExtV = B.buildFPExt(DstType, V).getReg(0);
+ Register InnerFMA =
+ B.buildInstr(PreferredFusedOpcode, {DstType}, {FpExtU, FpExtV, Z})
+ .getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {X, Y, InnerFMA});
+ };
+
+ MachineInstr *FMulMI, *FMAMI;
+ // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
+ // -> (fma x, y, (fma (fpext u), (fpext v), z))
+ if (LHS.MI->getOpcode() == PreferredFusedOpcode &&
+ mi_match(LHS.MI->getOperand(3).getReg(), MRI,
+ m_GFPExt(m_MInstr(FMulMI))) &&
+ isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
+ MRI.getType(FMulMI->getOperand(0).getReg()))) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ buildMatchInfo(FMulMI->getOperand(1).getReg(),
+ FMulMI->getOperand(2).getReg(), RHS.Reg,
+ LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), B);
+ };
+ return true;
+ }
+
+ // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
+ // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) &&
+ FMAMI->getOpcode() == PreferredFusedOpcode) {
+ MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg());
+ if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
+ MRI.getType(FMAMI->getOperand(0).getReg()))) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ Register X = FMAMI->getOperand(1).getReg();
+ Register Y = FMAMI->getOperand(2).getReg();
+ X = B.buildFPExt(DstType, X).getReg(0);
+ Y = B.buildFPExt(DstType, Y).getReg(0);
+ buildMatchInfo(FMulMI->getOperand(1).getReg(),
+ FMulMI->getOperand(2).getReg(), RHS.Reg, X, Y, B);
+ };
+
+ return true;
+ }
+ }
+
+  // fold (fadd z, (fma x, y, (fpext (fmul u, v))))
+ // -> (fma x, y, (fma (fpext u), (fpext v), z))
+ if (RHS.MI->getOpcode() == PreferredFusedOpcode &&
+ mi_match(RHS.MI->getOperand(3).getReg(), MRI,
+ m_GFPExt(m_MInstr(FMulMI))) &&
+ isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
+ MRI.getType(FMulMI->getOperand(0).getReg()))) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ buildMatchInfo(FMulMI->getOperand(1).getReg(),
+ FMulMI->getOperand(2).getReg(), LHS.Reg,
+ RHS.MI->getOperand(1).getReg(),
+ RHS.MI->getOperand(2).getReg(), B);
+ };
+ return true;
+ }
+
+  // fold (fadd z, (fpext (fma x, y, (fmul u, v))))
+ // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) &&
+ FMAMI->getOpcode() == PreferredFusedOpcode) {
+ MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg());
+ if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
+ MRI.getType(FMAMI->getOperand(0).getReg()))) {
+ MatchInfo = [=](MachineIRBuilder &B) {
+ Register X = FMAMI->getOperand(1).getReg();
+ Register Y = FMAMI->getOperand(2).getReg();
+ X = B.buildFPExt(DstType, X).getReg(0);
+ Y = B.buildFPExt(DstType, Y).getReg(0);
+ buildMatchInfo(FMulMI->getOperand(1).getReg(),
+ FMulMI->getOperand(2).getReg(), LHS.Reg, X, Y, B);
+ };
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FSUB);
+
+ bool AllowFusionGlobally, HasFMAD, Aggressive;
+ if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
+ return false;
+
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+  // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+  bool FirstMulHasFewerUses = true;
+ if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ hasMoreUses(*LHS.MI, *RHS.MI, MRI))
+ FirstMulHasFewerUses = false;
+
+ unsigned PreferredFusedOpcode =
+ HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+ // fold (fsub (fmul x, y), z) -> (fma x, y, -z)
+ if (FirstMulHasFewerUses &&
+ (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg)))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), NegZ});
+ };
+ return true;
+ }
+ // fold (fsub x, (fmul y, z)) -> (fma -y, z, x)
+ else if ((isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg)))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Register NegY =
+ B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {NegY, RHS.MI->getOperand(2).getReg(), LHS.Reg});
+ };
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchCombineFSubFNegFMulToFMadOrFMA(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FSUB);
+
+ bool AllowFusionGlobally, HasFMAD, Aggressive;
+ if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
+ return false;
+
+ Register LHSReg = MI.getOperand(1).getReg();
+ Register RHSReg = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ unsigned PreferredFusedOpcode =
+ HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+ MachineInstr *FMulMI;
+ // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+ if (mi_match(LHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) &&
+ (Aggressive || (MRI.hasOneNonDBGUse(LHSReg) &&
+ MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) &&
+ isContractableFMul(*FMulMI, AllowFusionGlobally)) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Register NegX =
+ B.buildFNeg(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
+ Register NegZ = B.buildFNeg(DstTy, RHSReg).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {NegX, FMulMI->getOperand(2).getReg(), NegZ});
+ };
+ return true;
+ }
+
+  // fold (fsub x, (fneg (fmul y, z))) -> (fma y, z, x)
+ if (mi_match(RHSReg, MRI, m_GFNeg(m_MInstr(FMulMI))) &&
+ (Aggressive || (MRI.hasOneNonDBGUse(RHSReg) &&
+ MRI.hasOneNonDBGUse(FMulMI->getOperand(0).getReg()))) &&
+ isContractableFMul(*FMulMI, AllowFusionGlobally)) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {FMulMI->getOperand(1).getReg(),
+ FMulMI->getOperand(2).getReg(), LHSReg});
+ };
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchCombineFSubFpExtFMulToFMadOrFMA(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FSUB);
+
+ bool AllowFusionGlobally, HasFMAD, Aggressive;
+ if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
+ return false;
+
+ Register LHSReg = MI.getOperand(1).getReg();
+ Register RHSReg = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ unsigned PreferredFusedOpcode =
+ HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+ MachineInstr *FMulMI;
+ // fold (fsub (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), (fneg z))
+ if (mi_match(LHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) &&
+ isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(LHSReg))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Register FpExtX =
+ B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
+ Register FpExtY =
+ B.buildFPExt(DstTy, FMulMI->getOperand(2).getReg()).getReg(0);
+ Register NegZ = B.buildFNeg(DstTy, RHSReg).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {FpExtX, FpExtY, NegZ});
+ };
+ return true;
+ }
+
+ // fold (fsub x, (fpext (fmul y, z))) -> (fma (fneg (fpext y)), (fpext z), x)
+ if (mi_match(RHSReg, MRI, m_GFPExt(m_MInstr(FMulMI))) &&
+ isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(RHSReg))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Register FpExtY =
+ B.buildFPExt(DstTy, FMulMI->getOperand(1).getReg()).getReg(0);
+ Register NegY = B.buildFNeg(DstTy, FpExtY).getReg(0);
+ Register FpExtZ =
+ B.buildFPExt(DstTy, FMulMI->getOperand(2).getReg()).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {NegY, FpExtZ, LHSReg});
+ };
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchCombineFSubFpExtFNegFMulToFMadOrFMA(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_FSUB);
+
+ bool AllowFusionGlobally, HasFMAD, Aggressive;
+ if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
+ return false;
+
+ const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ Register LHSReg = MI.getOperand(1).getReg();
+ Register RHSReg = MI.getOperand(2).getReg();
+
+ unsigned PreferredFusedOpcode =
+ HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
+
+ auto buildMatchInfo = [=](Register Dst, Register X, Register Y, Register Z,
+ MachineIRBuilder &B) {
+ Register FpExtX = B.buildFPExt(DstTy, X).getReg(0);
+ Register FpExtY = B.buildFPExt(DstTy, Y).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {Dst}, {FpExtX, FpExtY, Z});
+ };
+
+ MachineInstr *FMulMI;
+ // fold (fsub (fpext (fneg (fmul x, y))), z) ->
+ // (fneg (fma (fpext x), (fpext y), z))
+ // fold (fsub (fneg (fpext (fmul x, y))), z) ->
+ // (fneg (fma (fpext x), (fpext y), z))
+ if ((mi_match(LHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) ||
+ mi_match(LHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) &&
+ isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
+ MRI.getType(FMulMI->getOperand(0).getReg()))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ Register FMAReg = MRI.createGenericVirtualRegister(DstTy);
+ buildMatchInfo(FMAReg, FMulMI->getOperand(1).getReg(),
+ FMulMI->getOperand(2).getReg(), RHSReg, B);
+ B.buildFNeg(MI.getOperand(0).getReg(), FMAReg);
+ };
+ return true;
+ }
+
+ // fold (fsub x, (fpext (fneg (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
+ // fold (fsub x, (fneg (fpext (fmul y, z)))) -> (fma (fpext y), (fpext z), x)
+ if ((mi_match(RHSReg, MRI, m_GFPExt(m_GFNeg(m_MInstr(FMulMI)))) ||
+ mi_match(RHSReg, MRI, m_GFNeg(m_GFPExt(m_MInstr(FMulMI))))) &&
+ isContractableFMul(*FMulMI, AllowFusionGlobally) &&
+ TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstTy,
+ MRI.getType(FMulMI->getOperand(0).getReg()))) {
+ MatchInfo = [=, &MI](MachineIRBuilder &B) {
+ buildMatchInfo(MI.getOperand(0).getReg(), FMulMI->getOperand(1).getReg(),
+ FMulMI->getOperand(2).getReg(), LHSReg, B);
+ };
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchSelectToLogical(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ GSelect &Sel = cast<GSelect>(MI);
+ Register DstReg = Sel.getReg(0);
+ Register Cond = Sel.getCondReg();
+ Register TrueReg = Sel.getTrueReg();
+ Register FalseReg = Sel.getFalseReg();
+
+ auto *TrueDef = getDefIgnoringCopies(TrueReg, MRI);
+ auto *FalseDef = getDefIgnoringCopies(FalseReg, MRI);
+
+ const LLT CondTy = MRI.getType(Cond);
+ const LLT OpTy = MRI.getType(TrueReg);
+ if (CondTy != OpTy || OpTy.getScalarSizeInBits() != 1)
+ return false;
+
+ // We have a boolean select.
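+ // For example (a sketch with s1 operands): select %c, %t, 0 becomes
+ // G_AND %c, %t, and select %c, 1, %f becomes G_OR %c, %f; the last two
+ // cases below first build a not of the condition.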
+
+ // select Cond, Cond, F --> or Cond, F
+ // select Cond, 1, F --> or Cond, F
+ auto MaybeCstTrue = isConstantOrConstantSplatVector(*TrueDef, MRI);
+ if (Cond == TrueReg || (MaybeCstTrue && MaybeCstTrue->isOne())) {
+ MatchInfo = [=](MachineIRBuilder &MIB) {
+ MIB.buildOr(DstReg, Cond, FalseReg);
+ };
+ return true;
+ }
+
+ // select Cond, T, Cond --> and Cond, T
+ // select Cond, T, 0 --> and Cond, T
+ auto MaybeCstFalse = isConstantOrConstantSplatVector(*FalseDef, MRI);
+ if (Cond == FalseReg || (MaybeCstFalse && MaybeCstFalse->isZero())) {
+ MatchInfo = [=](MachineIRBuilder &MIB) {
+ MIB.buildAnd(DstReg, Cond, TrueReg);
+ };
+ return true;
+ }
+
+ // select Cond, T, 1 --> or (not Cond), T
+ if (MaybeCstFalse && MaybeCstFalse->isOne()) {
+ MatchInfo = [=](MachineIRBuilder &MIB) {
+ MIB.buildOr(DstReg, MIB.buildNot(OpTy, Cond), TrueReg);
+ };
+ return true;
+ }
+
+ // select Cond, 0, F --> and (not Cond), F
+ if (MaybeCstTrue && MaybeCstTrue->isZero()) {
+ MatchInfo = [=](MachineIRBuilder &MIB) {
+ MIB.buildAnd(DstReg, MIB.buildNot(OpTy, Cond), FalseReg);
+ };
+ return true;
+ }
+ return false;
+}
+
+bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI,
+ unsigned &IdxToPropagate) {
+ bool PropagateNaN;
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM:
+ PropagateNaN = false;
+ break;
+ case TargetOpcode::G_FMINIMUM:
+ case TargetOpcode::G_FMAXIMUM:
+ PropagateNaN = true;
+ break;
+ }
+
+ auto MatchNaN = [&](unsigned Idx) {
+ Register MaybeNaNReg = MI.getOperand(Idx).getReg();
+ const ConstantFP *MaybeCst = getConstantFPVRegVal(MaybeNaNReg, MRI);
+ if (!MaybeCst || !MaybeCst->getValueAPF().isNaN())
+ return false;
+ IdxToPropagate = PropagateNaN ? Idx : (Idx == 1 ? 2 : 1);
+ return true;
+ };
+
+ return MatchNaN(1) || MatchNaN(2);
+}
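+
+// As a worked example for the combine above: given
+//   %r:_(s32) = G_FMINNUM %x, %nan
+// where %nan is a constant NaN, IdxToPropagate names %x, since fminnum/fmaxnum
+// return the non-NaN operand; for G_FMINIMUM/G_FMAXIMUM it names the NaN
+// operand instead, because those opcodes propagate NaNs.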
+
+bool CombinerHelper::matchAddSubSameReg(MachineInstr &MI, Register &Src) {
+ assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD");
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+
+ // Helper lambda to check for opportunities for
+ // A + (B - A) -> B
+ // (B - A) + A -> B
+ auto CheckFold = [&](Register MaybeSub, Register MaybeSameReg) {
+ Register Reg;
+ return mi_match(MaybeSub, MRI, m_GSub(m_Reg(Src), m_Reg(Reg))) &&
+ Reg == MaybeSameReg;
+ };
+ return CheckFold(LHS, RHS) || CheckFold(RHS, LHS);
+}
+
+bool CombinerHelper::matchBuildVectorIdentityFold(MachineInstr &MI,
+ Register &MatchInfo) {
+ // This combine folds the following patterns:
+ //
+ // G_BUILD_VECTOR_TRUNC (G_BITCAST(x), G_LSHR(G_BITCAST(x), k))
+ // G_BUILD_VECTOR(G_TRUNC(G_BITCAST(x)), G_TRUNC(G_LSHR(G_BITCAST(x), k)))
+ // into
+ // x
+ // if
+ // k == sizeof(VecEltTy)/2
+ // type(x) == type(dst)
+ //
+ // G_BUILD_VECTOR(G_TRUNC(G_BITCAST(x)), undef)
+ // into
+ // x
+ // if
+ // type(x) == type(dst)
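+ //
+ // For instance (a sketch assuming x:_(<2 x s16>) and a <2 x s16> dst):
+ //   %c:_(s32) = G_BITCAST %x
+ //   %h:_(s32) = G_LSHR %c, 16
+ //   %v:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %c, %h
+ // Here the shift amount equals the destination element size (16), so %v is
+ // just %x and can be replaced by it.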
+
+ LLT DstVecTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT DstEltTy = DstVecTy.getElementType();
+
+ Register Lo, Hi;
+
+ if (mi_match(
+ MI, MRI,
+ m_GBuildVector(m_GTrunc(m_GBitcast(m_Reg(Lo))), m_GImplicitDef()))) {
+ MatchInfo = Lo;
+ return MRI.getType(MatchInfo) == DstVecTy;
+ }
+
+ std::optional<ValueAndVReg> ShiftAmount;
+ const auto LoPattern = m_GBitcast(m_Reg(Lo));
+ const auto HiPattern = m_GLShr(m_GBitcast(m_Reg(Hi)), m_GCst(ShiftAmount));
+ if (mi_match(
+ MI, MRI,
+ m_any_of(m_GBuildVectorTrunc(LoPattern, HiPattern),
+ m_GBuildVector(m_GTrunc(LoPattern), m_GTrunc(HiPattern))))) {
+ if (Lo == Hi && ShiftAmount->Value == DstEltTy.getSizeInBits()) {
+ MatchInfo = Lo;
+ return MRI.getType(MatchInfo) == DstVecTy;
+ }
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchTruncBuildVectorFold(MachineInstr &MI,
+ Register &MatchInfo) {
+ // Replace (G_TRUNC (G_BITCAST (G_BUILD_VECTOR x, y))) with just x
+ // if type(x) == type(G_TRUNC)
+ if (!mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GBitcast(m_GBuildVector(m_Reg(MatchInfo), m_Reg()))))
+ return false;
+
+ return MRI.getType(MatchInfo) == MRI.getType(MI.getOperand(0).getReg());
+}
+
+bool CombinerHelper::matchTruncLshrBuildVectorFold(MachineInstr &MI,
+ Register &MatchInfo) {
+ // Replace (G_TRUNC (G_LSHR (G_BITCAST (G_BUILD_VECTOR x, y)), K)) with
+ // y if K == size of vector element type
+ std::optional<ValueAndVReg> ShiftAmt;
+ if (!mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GLShr(m_GBitcast(m_GBuildVector(m_Reg(), m_Reg(MatchInfo))),
+ m_GCst(ShiftAmt))))
+ return false;
+
+ LLT MatchTy = MRI.getType(MatchInfo);
+ return ShiftAmt->Value.getZExtValue() == MatchTy.getSizeInBits() &&
+ MatchTy == MRI.getType(MI.getOperand(0).getReg());
+}
+
+unsigned CombinerHelper::getFPMinMaxOpcForSelect(
+ CmpInst::Predicate Pred, LLT DstTy,
+ SelectPatternNaNBehaviour VsNaNRetVal) const {
+ assert(VsNaNRetVal != SelectPatternNaNBehaviour::NOT_APPLICABLE &&
+ "Expected a NaN behaviour?");
+ // Choose an opcode based on legality, or on the behaviour when one of the
+ // LHS/RHS may be NaN.
+ switch (Pred) {
+ default:
+ return 0;
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE:
+ if (VsNaNRetVal == SelectPatternNaNBehaviour::RETURNS_OTHER)
+ return TargetOpcode::G_FMAXNUM;
+ if (VsNaNRetVal == SelectPatternNaNBehaviour::RETURNS_NAN)
+ return TargetOpcode::G_FMAXIMUM;
+ if (isLegal({TargetOpcode::G_FMAXNUM, {DstTy}}))
+ return TargetOpcode::G_FMAXNUM;
+ if (isLegal({TargetOpcode::G_FMAXIMUM, {DstTy}}))
+ return TargetOpcode::G_FMAXIMUM;
+ return 0;
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_OLE:
+ if (VsNaNRetVal == SelectPatternNaNBehaviour::RETURNS_OTHER)
+ return TargetOpcode::G_FMINNUM;
+ if (VsNaNRetVal == SelectPatternNaNBehaviour::RETURNS_NAN)
+ return TargetOpcode::G_FMINIMUM;
+ if (isLegal({TargetOpcode::G_FMINNUM, {DstTy}}))
+ return TargetOpcode::G_FMINNUM;
+ if (!isLegal({TargetOpcode::G_FMINIMUM, {DstTy}}))
+ return 0;
+ return TargetOpcode::G_FMINIMUM;
+ }
+}
+
+CombinerHelper::SelectPatternNaNBehaviour
+CombinerHelper::computeRetValAgainstNaN(Register LHS, Register RHS,
+ bool IsOrderedComparison) const {
+ bool LHSSafe = isKnownNeverNaN(LHS, MRI);
+ bool RHSSafe = isKnownNeverNaN(RHS, MRI);
+ // Completely unsafe.
+ if (!LHSSafe && !RHSSafe)
+ return SelectPatternNaNBehaviour::NOT_APPLICABLE;
+ if (LHSSafe && RHSSafe)
+ return SelectPatternNaNBehaviour::RETURNS_ANY;
+ // An ordered comparison will return false when given a NaN, so it
+ // returns the RHS.
+ if (IsOrderedComparison)
+ return LHSSafe ? SelectPatternNaNBehaviour::RETURNS_NAN
+ : SelectPatternNaNBehaviour::RETURNS_OTHER;
+ // An unordered comparison will return true when given a NaN, so it
+ // returns the LHS.
+ return LHSSafe ? SelectPatternNaNBehaviour::RETURNS_OTHER
+ : SelectPatternNaNBehaviour::RETURNS_NAN;
+}
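+
+// A sketch of how this is used by matchFPSelectToMinMax below, i.e. for
+// select (fcmp pred %a, %b), %a, %b: if only %b may be NaN and the predicate
+// is ordered, a NaN %b makes the compare false and the select yields %b, the
+// possible NaN, hence RETURNS_NAN; with an unordered predicate the compare is
+// true and the never-NaN %a is returned, hence RETURNS_OTHER.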
+
+bool CombinerHelper::matchFPSelectToMinMax(Register Dst, Register Cond,
+ Register TrueVal, Register FalseVal,
+ BuildFnTy &MatchInfo) {
+ // Match: select (fcmp cond x, y) x, y
+ // select (fcmp cond x, y) y, x
+ // And turn it into fminnum/fmaxnum or fminimum/fmaximum based on the condition.
+ LLT DstTy = MRI.getType(Dst);
+ // Bail out early on pointers, since we'll never want to fold to a min/max.
+ if (DstTy.isPointer())
+ return false;
+ // Match a floating point compare with a less-than/greater-than predicate.
+ // TODO: Allow multiple users of the compare if they are all selects.
+ CmpInst::Predicate Pred;
+ Register CmpLHS, CmpRHS;
+ if (!mi_match(Cond, MRI,
+ m_OneNonDBGUse(
+ m_GFCmp(m_Pred(Pred), m_Reg(CmpLHS), m_Reg(CmpRHS)))) ||
+ CmpInst::isEquality(Pred))
+ return false;
+ SelectPatternNaNBehaviour ResWithKnownNaNInfo =
+ computeRetValAgainstNaN(CmpLHS, CmpRHS, CmpInst::isOrdered(Pred));
+ if (ResWithKnownNaNInfo == SelectPatternNaNBehaviour::NOT_APPLICABLE)
+ return false;
+ if (TrueVal == CmpRHS && FalseVal == CmpLHS) {
+ std::swap(CmpLHS, CmpRHS);
+ Pred = CmpInst::getSwappedPredicate(Pred);
+ if (ResWithKnownNaNInfo == SelectPatternNaNBehaviour::RETURNS_NAN)
+ ResWithKnownNaNInfo = SelectPatternNaNBehaviour::RETURNS_OTHER;
+ else if (ResWithKnownNaNInfo == SelectPatternNaNBehaviour::RETURNS_OTHER)
+ ResWithKnownNaNInfo = SelectPatternNaNBehaviour::RETURNS_NAN;
+ }
+ if (TrueVal != CmpLHS || FalseVal != CmpRHS)
+ return false;
+ // Decide what type of max/min this should be based off of the predicate.
+ unsigned Opc = getFPMinMaxOpcForSelect(Pred, DstTy, ResWithKnownNaNInfo);
+ if (!Opc || !isLegal({Opc, {DstTy}}))
+ return false;
+ // Comparisons between signed zero and zero may have different results...
+ // unless we have fmaximum/fminimum. In that case, we know -0 < 0.
+ if (Opc != TargetOpcode::G_FMAXIMUM && Opc != TargetOpcode::G_FMINIMUM) {
+ // We don't know if a comparison between two 0s will give us a consistent
+ // result. Be conservative and only proceed if at least one side is
+ // non-zero.
+ auto KnownNonZeroSide = getFConstantVRegValWithLookThrough(CmpLHS, MRI);
+ if (!KnownNonZeroSide || !KnownNonZeroSide->Value.isNonZero()) {
+ KnownNonZeroSide = getFConstantVRegValWithLookThrough(CmpRHS, MRI);
+ if (!KnownNonZeroSide || !KnownNonZeroSide->Value.isNonZero())
+ return false;
+ }
+ }
+ MatchInfo = [=](MachineIRBuilder &B) {
+ B.buildInstr(Opc, {Dst}, {CmpLHS, CmpRHS});
+ };
+ return true;
+}
+
+bool CombinerHelper::matchSimplifySelectToMinMax(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ // TODO: Handle integer cases.
+ assert(MI.getOpcode() == TargetOpcode::G_SELECT);
+ // Condition may be fed by a truncated compare.
+ Register Cond = MI.getOperand(1).getReg();
+ Register MaybeTrunc;
+ if (mi_match(Cond, MRI, m_OneNonDBGUse(m_GTrunc(m_Reg(MaybeTrunc)))))
+ Cond = MaybeTrunc;
+ Register Dst = MI.getOperand(0).getReg();
+ Register TrueVal = MI.getOperand(2).getReg();
+ Register FalseVal = MI.getOperand(3).getReg();
+ return matchFPSelectToMinMax(Dst, Cond, TrueVal, FalseVal, MatchInfo);
+}
+
+bool CombinerHelper::matchRedundantBinOpInEquality(MachineInstr &MI,
+ BuildFnTy &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_ICMP);
+ // (X + Y) == X --> Y == 0
+ // (X + Y) != X --> Y != 0
+ // (X - Y) == X --> Y == 0
+ // (X - Y) != X --> Y != 0
+ // (X ^ Y) == X --> Y == 0
+ // (X ^ Y) != X --> Y != 0
+ Register Dst = MI.getOperand(0).getReg();
+ CmpInst::Predicate Pred;
+ Register X, Y, OpLHS, OpRHS;
+ bool MatchedSub = mi_match(
+ Dst, MRI,
+ m_c_GICmp(m_Pred(Pred), m_Reg(X), m_GSub(m_Reg(OpLHS), m_Reg(Y))));
+ if (MatchedSub && X != OpLHS)
+ return false;
+ if (!MatchedSub) {
+ if (!mi_match(Dst, MRI,
+ m_c_GICmp(m_Pred(Pred), m_Reg(X),
+ m_any_of(m_GAdd(m_Reg(OpLHS), m_Reg(OpRHS)),
+ m_GXor(m_Reg(OpLHS), m_Reg(OpRHS))))))
+ return false;
+ Y = X == OpLHS ? OpRHS : X == OpRHS ? OpLHS : Register();
+ }
+ MatchInfo = [=](MachineIRBuilder &B) {
+ auto Zero = B.buildConstant(MRI.getType(Y), 0);
+ B.buildICmp(Pred, Dst, Y, Zero);
+ };
+ return CmpInst::isEquality(Pred) && Y.isValid();
+}
+
+bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) {
+ Register ShiftReg = MI.getOperand(2).getReg();
+ LLT ResTy = MRI.getType(MI.getOperand(0).getReg());
+ auto IsShiftTooBig = [&](const Constant *C) {
+ auto *CI = dyn_cast<ConstantInt>(C);
+ return CI && CI->uge(ResTy.getScalarSizeInBits());
+ };
+ return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig);
+}
+
+bool CombinerHelper::tryCombine(MachineInstr &MI) {
+ if (tryCombineCopy(MI))
+ return true;
+ if (tryCombineExtendingLoads(MI))
+ return true;
+ if (tryCombineIndexedLoadStore(MI))
+ return true;
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp
new file mode 100644
index 000000000000..d747cbf5aadc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GIMatchTableExecutor.cpp
@@ -0,0 +1,68 @@
+//===- llvm/CodeGen/GlobalISel/GIMatchTableExecutor.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the GIMatchTableExecutor class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "gi-match-table-executor"
+
+using namespace llvm;
+
+GIMatchTableExecutor::MatcherState::MatcherState(unsigned MaxRenderers)
+ : Renderers(MaxRenderers) {}
+
+GIMatchTableExecutor::GIMatchTableExecutor() = default;
+
+bool GIMatchTableExecutor::isOperandImmEqual(
+ const MachineOperand &MO, int64_t Value,
+ const MachineRegisterInfo &MRI) const {
+ if (MO.isReg() && MO.getReg())
+ if (auto VRegVal = getIConstantVRegValWithLookThrough(MO.getReg(), MRI))
+ return VRegVal->Value.getSExtValue() == Value;
+ return false;
+}
+
+bool GIMatchTableExecutor::isBaseWithConstantOffset(
+ const MachineOperand &Root, const MachineRegisterInfo &MRI) const {
+ if (!Root.isReg())
+ return false;
+
+ MachineInstr *RootI = MRI.getVRegDef(Root.getReg());
+ if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
+ return false;
+
+ MachineOperand &RHS = RootI->getOperand(2);
+ MachineInstr *RHSI = MRI.getVRegDef(RHS.getReg());
+ if (RHSI->getOpcode() != TargetOpcode::G_CONSTANT)
+ return false;
+
+ return true;
+}
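+
+// Illustrative shape of what the check above accepts (names are made up):
+//   %off:_(s64) = G_CONSTANT i64 16
+//   %addr:_(p0) = G_PTR_ADD %base, %off
+// A root operand whose register is defined by such a G_PTR_ADD counts as a
+// base with a constant offset; any other producer does not.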
+
+bool GIMatchTableExecutor::isObviouslySafeToFold(MachineInstr &MI,
+ MachineInstr &IntoMI) const {
+ // Immediate neighbours are already folded.
+ if (MI.getParent() == IntoMI.getParent() &&
+ std::next(MI.getIterator()) == IntoMI.getIterator())
+ return true;
+
+ // Convergent instructions cannot be moved in the CFG.
+ if (MI.isConvergent() && MI.getParent() != IntoMI.getParent())
+ return false;
+
+ return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() &&
+ !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp
new file mode 100644
index 000000000000..59f4d60a41d8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp
@@ -0,0 +1,48 @@
+//===-- lib/CodeGen/GlobalISel/GISelChangeObserver.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains common code to combine machine functions at the generic
+// level.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+void GISelChangeObserver::changingAllUsesOfReg(
+ const MachineRegisterInfo &MRI, Register Reg) {
+ for (auto &ChangingMI : MRI.use_instructions(Reg)) {
+ changingInstr(ChangingMI);
+ ChangingAllUsesOfReg.insert(&ChangingMI);
+ }
+}
+
+void GISelChangeObserver::finishedChangingAllUsesOfReg() {
+ for (auto *ChangedMI : ChangingAllUsesOfReg)
+ changedInstr(*ChangedMI);
+ ChangingAllUsesOfReg.clear();
+}
+
+RAIIDelegateInstaller::RAIIDelegateInstaller(MachineFunction &MF,
+ MachineFunction::Delegate *Del)
+ : MF(MF), Delegate(Del) {
+ // Register this as the delegate for handling insertions and deletions of
+ // instructions.
+ MF.setDelegate(Del);
+}
+
+RAIIDelegateInstaller::~RAIIDelegateInstaller() { MF.resetDelegate(Delegate); }
+
+RAIIMFObserverInstaller::RAIIMFObserverInstaller(MachineFunction &MF,
+ GISelChangeObserver &Observer)
+ : MF(MF) {
+ MF.setObserver(&Observer);
+}
+
+RAIIMFObserverInstaller::~RAIIMFObserverInstaller() { MF.setObserver(nullptr); }
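+
+// A usage sketch (Observer here stands for any GISelChangeObserver
+// implementation, and MF for the function being rewritten):
+//   {
+//     RAIIMFObserverInstaller Install(MF, Observer);
+//     // ... create or erase instructions in MF; Observer is notified ...
+//   } // leaving the scope resets the function's observer to nullptr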
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
new file mode 100644
index 000000000000..363ffbfa90b5
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -0,0 +1,771 @@
+//===- lib/CodeGen/GlobalISel/GISelKnownBits.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// Provides analysis for querying information about KnownBits during GISel
+/// passes.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "gisel-known-bits"
+
+using namespace llvm;
+
+char llvm::GISelKnownBitsAnalysis::ID = 0;
+
+INITIALIZE_PASS(GISelKnownBitsAnalysis, DEBUG_TYPE,
+ "Analysis for ComputingKnownBits", false, true)
+
+GISelKnownBits::GISelKnownBits(MachineFunction &MF, unsigned MaxDepth)
+ : MF(MF), MRI(MF.getRegInfo()), TL(*MF.getSubtarget().getTargetLowering()),
+ DL(MF.getFunction().getParent()->getDataLayout()), MaxDepth(MaxDepth) {}
+
+Align GISelKnownBits::computeKnownAlignment(Register R, unsigned Depth) {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case TargetOpcode::COPY:
+ return computeKnownAlignment(MI->getOperand(1).getReg(), Depth);
+ case TargetOpcode::G_ASSERT_ALIGN: {
+ // TODO: Min with source
+ return Align(MI->getOperand(2).getImm());
+ }
+ case TargetOpcode::G_FRAME_INDEX: {
+ int FrameIdx = MI->getOperand(1).getIndex();
+ return MF.getFrameInfo().getObjectAlign(FrameIdx);
+ }
+ case TargetOpcode::G_INTRINSIC:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ default:
+ return TL.computeKnownAlignForTargetInstr(*this, R, MRI, Depth + 1);
+ }
+}
+
+KnownBits GISelKnownBits::getKnownBits(MachineInstr &MI) {
+ assert(MI.getNumExplicitDefs() == 1 &&
+ "expected single return generic instruction");
+ return getKnownBits(MI.getOperand(0).getReg());
+}
+
+KnownBits GISelKnownBits::getKnownBits(Register R) {
+ const LLT Ty = MRI.getType(R);
+ APInt DemandedElts =
+ Ty.isVector() ? APInt::getAllOnes(Ty.getNumElements()) : APInt(1, 1);
+ return getKnownBits(R, DemandedElts);
+}
+
+KnownBits GISelKnownBits::getKnownBits(Register R, const APInt &DemandedElts,
+ unsigned Depth) {
+ // For now, we only maintain the cache during one request.
+ assert(ComputeKnownBitsCache.empty() && "Cache should have been cleared");
+
+ KnownBits Known;
+ computeKnownBitsImpl(R, Known, DemandedElts);
+ ComputeKnownBitsCache.clear();
+ return Known;
+}
+
+bool GISelKnownBits::signBitIsZero(Register R) {
+ LLT Ty = MRI.getType(R);
+ unsigned BitWidth = Ty.getScalarSizeInBits();
+ return maskedValueIsZero(R, APInt::getSignMask(BitWidth));
+}
+
+APInt GISelKnownBits::getKnownZeroes(Register R) {
+ return getKnownBits(R).Zero;
+}
+
+APInt GISelKnownBits::getKnownOnes(Register R) { return getKnownBits(R).One; }
+
+LLVM_ATTRIBUTE_UNUSED static void
+dumpResult(const MachineInstr &MI, const KnownBits &Known, unsigned Depth) {
+ dbgs() << "[" << Depth << "] Compute known bits: " << MI << "[" << Depth
+ << "] Computed for: " << MI << "[" << Depth << "] Known: 0x"
+ << toString(Known.Zero | Known.One, 16, false) << "\n"
+ << "[" << Depth << "] Zero: 0x" << toString(Known.Zero, 16, false)
+ << "\n"
+ << "[" << Depth << "] One: 0x" << toString(Known.One, 16, false)
+ << "\n";
+}
+
+/// Compute known bits for the intersection of \p Src0 and \p Src1
+void GISelKnownBits::computeKnownBitsMin(Register Src0, Register Src1,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ unsigned Depth) {
+ // Test src1 first, since we canonicalize simpler expressions to the RHS.
+ computeKnownBitsImpl(Src1, Known, DemandedElts, Depth);
+
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ return;
+
+ KnownBits Known2;
+ computeKnownBitsImpl(Src0, Known2, DemandedElts, Depth);
+
+ // Only known if known in both the LHS and RHS.
+ Known = Known.intersectWith(Known2);
+}
+
+// Bitfield extract is computed as (Src >> Offset) & Mask, where Mask is
+// created using Width. Use this function when the inputs are KnownBits
+// objects. TODO: Move this to KnownBits.h if it is usable in more cases.
+static KnownBits extractBits(unsigned BitWidth, const KnownBits &SrcOpKnown,
+ const KnownBits &OffsetKnown,
+ const KnownBits &WidthKnown) {
+ KnownBits Mask(BitWidth);
+ Mask.Zero = APInt::getBitsSetFrom(
+ BitWidth, WidthKnown.getMaxValue().getLimitedValue(BitWidth));
+ Mask.One = APInt::getLowBitsSet(
+ BitWidth, WidthKnown.getMinValue().getLimitedValue(BitWidth));
+ return KnownBits::lshr(SrcOpKnown, OffsetKnown) & Mask;
+}
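+
+// A worked example for the helper above (illustrative): with BitWidth = 32,
+// Offset known to be 8 and Width known to be 4, Mask gets bits [0,4) known
+// one and bits [4,32) known zero, so the result models (Src >> 8) & 0xF.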
+
+void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
+ const APInt &DemandedElts,
+ unsigned Depth) {
+ MachineInstr &MI = *MRI.getVRegDef(R);
+ unsigned Opcode = MI.getOpcode();
+ LLT DstTy = MRI.getType(R);
+
+ // Handle the case where this is called on a register that does not have a
+ // type constraint (i.e. it has a register class constraint instead). This is
+ // unlikely to occur except by looking through copies but it is possible for
+ // the initial register being queried to be in this state.
+ if (!DstTy.isValid()) {
+ Known = KnownBits();
+ return;
+ }
+
+ unsigned BitWidth = DstTy.getScalarSizeInBits();
+ auto CacheEntry = ComputeKnownBitsCache.find(R);
+ if (CacheEntry != ComputeKnownBitsCache.end()) {
+ Known = CacheEntry->second;
+ LLVM_DEBUG(dbgs() << "Cache hit at ");
+ LLVM_DEBUG(dumpResult(MI, Known, Depth));
+ assert(Known.getBitWidth() == BitWidth && "Cache entry size doesn't match");
+ return;
+ }
+ Known = KnownBits(BitWidth); // Don't know anything
+
+ // Depth may get bigger than max depth if it gets passed to a different
+ // GISelKnownBits object.
+ // This may happen when, say, a generic part uses a GISelKnownBits object
+ // with some max depth, but then we hit TL.computeKnownBitsForTargetInstr
+ // which creates a new GISelKnownBits object with a different and smaller
+ // depth. If we just check for equality, we would never exit if the depth
+ // that is passed down to the target specific GISelKnownBits object is
+ // already bigger than its max depth.
+ if (Depth >= getMaxDepth())
+ return;
+
+ if (!DemandedElts)
+ return; // No demanded elts, better to assume we don't know anything.
+
+ KnownBits Known2;
+
+ switch (Opcode) {
+ default:
+ TL.computeKnownBitsForTargetInstr(*this, R, Known, DemandedElts, MRI,
+ Depth);
+ break;
+ case TargetOpcode::G_BUILD_VECTOR: {
+ // Collect the known bits that are shared by every demanded vector element.
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ for (unsigned i = 0, e = MI.getNumOperands() - 1; i < e; ++i) {
+ if (!DemandedElts[i])
+ continue;
+
+ computeKnownBitsImpl(MI.getOperand(i + 1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+
+ // Known bits are the values that are shared by every demanded element.
+ Known = Known.intersectWith(Known2);
+
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::COPY:
+ case TargetOpcode::G_PHI:
+ case TargetOpcode::PHI: {
+ Known.One = APInt::getAllOnes(BitWidth);
+ Known.Zero = APInt::getAllOnes(BitWidth);
+ // Destination registers should not have subregisters at this
+ // point of the pipeline, otherwise the main live-range will be
+ // defined more than once, which is against SSA.
+ assert(MI.getOperand(0).getSubReg() == 0 && "Is this code in SSA?");
+ // Record in the cache that we know nothing for MI.
+ // This will get updated later and in the meantime, if we reach that
+ // phi again, because of a loop, we will cut the search thanks to this
+ // cache entry.
+ // We could actually build up more information on the phi by not cutting
+ // the search, but that additional information is more a side effect
+ // than an intended choice.
+ // Therefore, for now, save on compile time until we find a proper way to
+ // derive known bits for PHIs within loops.
+ ComputeKnownBitsCache[R] = KnownBits(BitWidth);
+ // A PHI's operands are a mix of registers and basic blocks, interleaved.
+ // We only care about the register ones.
+ for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
+ const MachineOperand &Src = MI.getOperand(Idx);
+ Register SrcReg = Src.getReg();
+ // Look through trivial copies and phis, but don't look through trivial
+ // copies or phis of the form `%1:(s32) = OP %0:gpr32`, since known-bits
+ // analysis is currently unable to determine the bit width of a
+ // register class.
+ //
+ // We can't use NoSubRegister by name as it's defined by each target but
+ // it's always defined to be 0 by tablegen.
+ if (SrcReg.isVirtual() && Src.getSubReg() == 0 /*NoSubRegister*/ &&
+ MRI.getType(SrcReg).isValid()) {
+ // For COPYs we don't do anything, don't increase the depth.
+ computeKnownBitsImpl(SrcReg, Known2, DemandedElts,
+ Depth + (Opcode != TargetOpcode::COPY));
+ Known = Known.intersectWith(Known2);
+ // If we reach a point where we don't know anything
+ // just stop looking through the operands.
+ if (Known.isUnknown())
+ break;
+ } else {
+ // We know nothing.
+ Known = KnownBits(BitWidth);
+ break;
+ }
+ }
+ break;
+ }
+ case TargetOpcode::G_CONSTANT: {
+ auto CstVal = getIConstantVRegVal(R, MRI);
+ if (!CstVal)
+ break;
+ Known = KnownBits::makeConstant(*CstVal);
+ break;
+ }
+ case TargetOpcode::G_FRAME_INDEX: {
+ int FrameIdx = MI.getOperand(1).getIndex();
+ TL.computeKnownBitsForFrameIndex(FrameIdx, Known, MF);
+ break;
+ }
+ case TargetOpcode::G_SUB: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::computeForAddSub(/*Add*/ false, /*NSW*/ false, Known,
+ Known2);
+ break;
+ }
+ case TargetOpcode::G_XOR: {
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+
+ Known ^= Known2;
+ break;
+ }
+ case TargetOpcode::G_PTR_ADD: {
+ if (DstTy.isVector())
+ break;
+ // G_PTR_ADD is like G_ADD. FIXME: Is this true for all targets?
+ LLT Ty = MRI.getType(MI.getOperand(1).getReg());
+ if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
+ break;
+ [[fallthrough]];
+ }
+ case TargetOpcode::G_ADD: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ Known =
+ KnownBits::computeForAddSub(/*Add*/ true, /*NSW*/ false, Known, Known2);
+ break;
+ }
+ case TargetOpcode::G_AND: {
+ // Bits that are zero in either the LHS or the RHS are zero in the result.
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+
+ Known &= Known2;
+ break;
+ }
+ case TargetOpcode::G_OR: {
+ // Bits that are one in either the LHS or the RHS are one in the result.
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+
+ Known |= Known2;
+ break;
+ }
+ case TargetOpcode::G_MUL: {
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::mul(Known, Known2);
+ break;
+ }
+ case TargetOpcode::G_SELECT: {
+ computeKnownBitsMin(MI.getOperand(2).getReg(), MI.getOperand(3).getReg(),
+ Known, DemandedElts, Depth + 1);
+ break;
+ }
+ case TargetOpcode::G_SMIN: {
+ // TODO: Handle clamp pattern with number of sign bits
+ KnownBits KnownRHS;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), KnownRHS, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::smin(Known, KnownRHS);
+ break;
+ }
+ case TargetOpcode::G_SMAX: {
+ // TODO: Handle clamp pattern with number of sign bits
+ KnownBits KnownRHS;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), KnownRHS, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::smax(Known, KnownRHS);
+ break;
+ }
+ case TargetOpcode::G_UMIN: {
+ KnownBits KnownRHS;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known,
+ DemandedElts, Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), KnownRHS,
+ DemandedElts, Depth + 1);
+ Known = KnownBits::umin(Known, KnownRHS);
+ break;
+ }
+ case TargetOpcode::G_UMAX: {
+ KnownBits KnownRHS;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known,
+ DemandedElts, Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), KnownRHS,
+ DemandedElts, Depth + 1);
+ Known = KnownBits::umax(Known, KnownRHS);
+ break;
+ }
+ case TargetOpcode::G_FCMP:
+ case TargetOpcode::G_ICMP: {
+ if (DstTy.isVector())
+ break;
+ if (TL.getBooleanContents(DstTy.isVector(),
+ Opcode == TargetOpcode::G_FCMP) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ case TargetOpcode::G_SEXT: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ // If the sign bit is known to be zero or one, then sext will extend
+ // it to the top bits, else it will just zext.
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ case TargetOpcode::G_ASSERT_SEXT:
+ case TargetOpcode::G_SEXT_INREG: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ Known = Known.sextInReg(MI.getOperand(2).getImm());
+ break;
+ }
+ case TargetOpcode::G_ANYEXT: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ Known = Known.anyext(BitWidth);
+ break;
+ }
+ case TargetOpcode::G_LOAD: {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ if (const MDNode *Ranges = MMO->getRanges()) {
+ computeKnownBitsFromRangeMetadata(*Ranges, Known);
+ }
+
+ break;
+ }
+ case TargetOpcode::G_ZEXTLOAD: {
+ if (DstTy.isVector())
+ break;
+ // Everything above the retrieved bits is zero
+ Known.Zero.setBitsFrom((*MI.memoperands_begin())->getSizeInBits());
+ break;
+ }
+ case TargetOpcode::G_ASHR: {
+ KnownBits LHSKnown, RHSKnown;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), LHSKnown, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), RHSKnown, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::ashr(LHSKnown, RHSKnown);
+ break;
+ }
+ case TargetOpcode::G_LSHR: {
+ KnownBits LHSKnown, RHSKnown;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), LHSKnown, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), RHSKnown, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::lshr(LHSKnown, RHSKnown);
+ break;
+ }
+ case TargetOpcode::G_SHL: {
+ KnownBits LHSKnown, RHSKnown;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), LHSKnown, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), RHSKnown, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::shl(LHSKnown, RHSKnown);
+ break;
+ }
+ case TargetOpcode::G_INTTOPTR:
+ case TargetOpcode::G_PTRTOINT:
+ if (DstTy.isVector())
+ break;
+ // Fall through and handle them the same as zext/trunc.
+ [[fallthrough]];
+ case TargetOpcode::G_ASSERT_ZEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_TRUNC: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SrcBitWidth;
+
+ // G_ASSERT_ZEXT stores the original bitwidth in the immediate operand.
+ if (Opcode == TargetOpcode::G_ASSERT_ZEXT)
+ SrcBitWidth = MI.getOperand(2).getImm();
+ else {
+ SrcBitWidth = SrcTy.isPointer()
+ ? DL.getIndexSizeInBits(SrcTy.getAddressSpace())
+ : SrcTy.getSizeInBits();
+ }
+ assert(SrcBitWidth && "SrcBitWidth can't be zero");
+ Known = Known.zextOrTrunc(SrcBitWidth);
+ computeKnownBitsImpl(SrcReg, Known, DemandedElts, Depth + 1);
+ Known = Known.zextOrTrunc(BitWidth);
+ if (BitWidth > SrcBitWidth)
+ Known.Zero.setBitsFrom(SrcBitWidth);
+ break;
+ }
+ case TargetOpcode::G_ASSERT_ALIGN: {
+ int64_t LogOfAlign = Log2_64(MI.getOperand(2).getImm());
+
+ // TODO: Should use maximum with source
+ // If a node is guaranteed to be aligned, set the corresponding low bits as
+ // known zero and clear them from the known-one set.
+ Known.Zero.setLowBits(LogOfAlign);
+ Known.One.clearLowBits(LogOfAlign);
+ break;
+ }
+ case TargetOpcode::G_MERGE_VALUES: {
+ unsigned NumOps = MI.getNumOperands();
+ unsigned OpSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+
+ for (unsigned I = 0; I != NumOps - 1; ++I) {
+ KnownBits SrcOpKnown;
+ computeKnownBitsImpl(MI.getOperand(I + 1).getReg(), SrcOpKnown,
+ DemandedElts, Depth + 1);
+ Known.insertBits(SrcOpKnown, I * OpSize);
+ }
+ break;
+ }
+ case TargetOpcode::G_UNMERGE_VALUES: {
+ if (DstTy.isVector())
+ break;
+ unsigned NumOps = MI.getNumOperands();
+ Register SrcReg = MI.getOperand(NumOps - 1).getReg();
+ if (MRI.getType(SrcReg).isVector())
+ return; // TODO: Handle vectors.
+
+ KnownBits SrcOpKnown;
+ computeKnownBitsImpl(SrcReg, SrcOpKnown, DemandedElts, Depth + 1);
+
+ // Figure out the result operand index
+ unsigned DstIdx = 0;
+ for (; DstIdx != NumOps - 1 && MI.getOperand(DstIdx).getReg() != R;
+ ++DstIdx)
+ ;
+
+ Known = SrcOpKnown.extractBits(BitWidth, BitWidth * DstIdx);
+ break;
+ }
+ case TargetOpcode::G_BSWAP: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ computeKnownBitsImpl(SrcReg, Known, DemandedElts, Depth + 1);
+ Known = Known.byteSwap();
+ break;
+ }
+ case TargetOpcode::G_BITREVERSE: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ computeKnownBitsImpl(SrcReg, Known, DemandedElts, Depth + 1);
+ Known = Known.reverseBits();
+ break;
+ }
+ case TargetOpcode::G_CTPOP: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ // We can bound the space the count needs. Also, bits known to be zero can't
+ // contribute to the population.
+ unsigned BitsPossiblySet = Known2.countMaxPopulation();
+ unsigned LowBits = llvm::bit_width(BitsPossiblySet);
+ Known.Zero.setBitsFrom(LowBits);
+ // TODO: we could bound Known.One using the lower bound on the number of
+ // bits known to be set in Known2, i.e. Known2.countMinPopulation().
+ break;
+ }
+ case TargetOpcode::G_UBFX: {
+ KnownBits SrcOpKnown, OffsetKnown, WidthKnown;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), SrcOpKnown, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), OffsetKnown, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(3).getReg(), WidthKnown, DemandedElts,
+ Depth + 1);
+ Known = extractBits(BitWidth, SrcOpKnown, OffsetKnown, WidthKnown);
+ break;
+ }
+ case TargetOpcode::G_SBFX: {
+ KnownBits SrcOpKnown, OffsetKnown, WidthKnown;
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), SrcOpKnown, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), OffsetKnown, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(3).getReg(), WidthKnown, DemandedElts,
+ Depth + 1);
+ Known = extractBits(BitWidth, SrcOpKnown, OffsetKnown, WidthKnown);
+ // Sign extend the extracted value using shift left and arithmetic shift
+ // right.
+ KnownBits ExtKnown = KnownBits::makeConstant(APInt(BitWidth, BitWidth));
+ KnownBits ShiftKnown = KnownBits::computeForAddSub(
+ /*Add*/ false, /*NSW*/ false, ExtKnown, WidthKnown);
+ Known = KnownBits::ashr(KnownBits::shl(Known, ShiftKnown), ShiftKnown);
+ break;
+ }
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_UADDE:
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SADDE:
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_USUBE:
+ case TargetOpcode::G_SSUBO:
+ case TargetOpcode::G_SSUBE:
+ case TargetOpcode::G_UMULO:
+ case TargetOpcode::G_SMULO: {
+ if (MI.getOperand(1).getReg() == R) {
+ // If we know the result of a compare has the top bits zero, use this
+ // info.
+ if (TL.getBooleanContents(DstTy.isVector(), false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ }
+ break;
+ }
+ }
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ LLVM_DEBUG(dumpResult(MI, Known, Depth));
+
+ // Update the cache.
+ ComputeKnownBitsCache[R] = Known;
+}
+
+/// Compute number of sign bits for the intersection of \p Src0 and \p Src1
+unsigned GISelKnownBits::computeNumSignBitsMin(Register Src0, Register Src1,
+ const APInt &DemandedElts,
+ unsigned Depth) {
+ // Test src1 first, since we canonicalize simpler expressions to the RHS.
+ unsigned Src1SignBits = computeNumSignBits(Src1, DemandedElts, Depth);
+ if (Src1SignBits == 1)
+ return 1;
+ return std::min(computeNumSignBits(Src0, DemandedElts, Depth), Src1SignBits);
+}
+
+unsigned GISelKnownBits::computeNumSignBits(Register R,
+ const APInt &DemandedElts,
+ unsigned Depth) {
+ MachineInstr &MI = *MRI.getVRegDef(R);
+ unsigned Opcode = MI.getOpcode();
+
+ if (Opcode == TargetOpcode::G_CONSTANT)
+ return MI.getOperand(1).getCImm()->getValue().getNumSignBits();
+
+ if (Depth == getMaxDepth())
+ return 1;
+
+ if (!DemandedElts)
+ return 1; // No demanded elts, better to assume we don't know anything.
+
+ LLT DstTy = MRI.getType(R);
+ const unsigned TyBits = DstTy.getScalarSizeInBits();
+
+ // Handle the case where this is called on a register that does not have a
+ // type constraint. This is unlikely to occur except by looking through copies
+ // but it is possible for the initial register being queried to be in this
+ // state.
+ if (!DstTy.isValid())
+ return 1;
+
+ unsigned FirstAnswer = 1;
+ switch (Opcode) {
+ case TargetOpcode::COPY: {
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.getReg().isVirtual() && Src.getSubReg() == 0 &&
+ MRI.getType(Src.getReg()).isValid()) {
+ // Don't increment Depth for this one since we didn't do any work.
+ return computeNumSignBits(Src.getReg(), DemandedElts, Depth);
+ }
+
+ return 1;
+ }
+ case TargetOpcode::G_SEXT: {
+ Register Src = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(Src);
+ unsigned Tmp = DstTy.getScalarSizeInBits() - SrcTy.getScalarSizeInBits();
+ return computeNumSignBits(Src, DemandedElts, Depth + 1) + Tmp;
+ }
+ case TargetOpcode::G_ASSERT_SEXT:
+ case TargetOpcode::G_SEXT_INREG: {
+ // Max of the input and what this extends.
+ Register Src = MI.getOperand(1).getReg();
+ unsigned SrcBits = MI.getOperand(2).getImm();
+ unsigned InRegBits = TyBits - SrcBits + 1;
+ return std::max(computeNumSignBits(Src, DemandedElts, Depth + 1), InRegBits);
+ }
+ case TargetOpcode::G_SEXTLOAD: {
+ // FIXME: We need an in-memory type representation.
+ if (DstTy.isVector())
+ return 1;
+
+ // e.g. i16->i32 = '17' bits known.
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return TyBits - MMO->getSizeInBits() + 1;
+ }
+ case TargetOpcode::G_ZEXTLOAD: {
+ // FIXME: We need an in-memory type representation.
+ if (DstTy.isVector())
+ return 1;
+
+ // e.g. i16->i32 = '16' bits known.
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return TyBits - MMO->getSizeInBits();
+ }
+ case TargetOpcode::G_TRUNC: {
+ Register Src = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(Src);
+
+ // Check if the sign bits of source go down as far as the truncated value.
+ unsigned DstTyBits = DstTy.getScalarSizeInBits();
+ unsigned NumSrcBits = SrcTy.getScalarSizeInBits();
+ unsigned NumSrcSignBits = computeNumSignBits(Src, DemandedElts, Depth + 1);
+ if (NumSrcSignBits > (NumSrcBits - DstTyBits))
+ return NumSrcSignBits - (NumSrcBits - DstTyBits);
+ break;
+ }
+ case TargetOpcode::G_SELECT: {
+ return computeNumSignBitsMin(MI.getOperand(2).getReg(),
+ MI.getOperand(3).getReg(), DemandedElts,
+ Depth + 1);
+ }
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SADDE:
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_UADDE:
+ case TargetOpcode::G_SSUBO:
+ case TargetOpcode::G_SSUBE:
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_USUBE:
+ case TargetOpcode::G_SMULO:
+ case TargetOpcode::G_UMULO: {
+ // If the compare returns 0/-1, all bits are sign bits.
+ // We know that we have an integer-based boolean since these operations
+ // are only available for integers.
+ if (MI.getOperand(1).getReg() == R) {
+ if (TL.getBooleanContents(DstTy.isVector(), false) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return TyBits;
+ }
+
+ break;
+ }
+ case TargetOpcode::G_FCMP:
+ case TargetOpcode::G_ICMP: {
+ bool IsFP = Opcode == TargetOpcode::G_FCMP;
+ if (TyBits == 1)
+ break;
+ auto BC = TL.getBooleanContents(DstTy.isVector(), IsFP);
+ if (BC == TargetLoweringBase::ZeroOrNegativeOneBooleanContent)
+ return TyBits; // All bits are sign bits.
+ if (BC == TargetLowering::ZeroOrOneBooleanContent)
+ return TyBits - 1; // Every always-zero bit is a sign bit.
+ break;
+ }
+ case TargetOpcode::G_INTRINSIC:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ default: {
+ unsigned NumBits =
+ TL.computeNumSignBitsForTargetInstr(*this, R, DemandedElts, MRI, Depth);
+ if (NumBits > 1)
+ FirstAnswer = std::max(FirstAnswer, NumBits);
+ break;
+ }
+ }
+
+ // Finally, if we can prove that the top bits of the result are 0's or 1's,
+ // use this information.
+ KnownBits Known = getKnownBits(R, DemandedElts, Depth);
+ APInt Mask;
+ if (Known.isNonNegative()) { // sign bit is 0
+ Mask = Known.Zero;
+ } else if (Known.isNegative()) { // sign bit is 1;
+ Mask = Known.One;
+ } else {
+ // Nothing known.
+ return FirstAnswer;
+ }
+
+ // Okay, we know that the sign bit in Mask is set. Use CLO to determine
+ // the number of identical bits in the top of the input value.
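+ // For example (sketch): for a 32-bit value whose top 12 bits are known to
+ // be zero, Mask.countl_one() is 12, so at least 12 sign bits are reported.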
+ Mask <<= Mask.getBitWidth() - TyBits;
+ return std::max(FirstAnswer, Mask.countl_one());
+}
+
+unsigned GISelKnownBits::computeNumSignBits(Register R, unsigned Depth) {
+ LLT Ty = MRI.getType(R);
+ APInt DemandedElts =
+ Ty.isVector() ? APInt::getAllOnes(Ty.getNumElements()) : APInt(1, 1);
+ return computeNumSignBits(R, DemandedElts, Depth);
+}
+
+void GISelKnownBitsAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool GISelKnownBitsAnalysis::runOnMachineFunction(MachineFunction &MF) {
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
new file mode 100644
index 000000000000..efcc40641ea8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
@@ -0,0 +1,24 @@
+//===-- llvm/CodeGen/GlobalISel/GlobalISel.cpp --- GlobalISel ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+// This file implements the common initialization routines for the
+// GlobalISel library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+void llvm::initializeGlobalISel(PassRegistry &Registry) {
+ initializeIRTranslatorPass(Registry);
+ initializeLegalizerPass(Registry);
+ initializeLoadStoreOptPass(Registry);
+ initializeLocalizerPass(Registry);
+ initializeRegBankSelectPass(Registry);
+ initializeInstructionSelectPass(Registry);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
new file mode 100644
index 000000000000..9a67a8d05a4d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -0,0 +1,3698 @@
+//===- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator ---*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the IRTranslator class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/SwitchLoweringUtils.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/MemoryOpRemark.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "irtranslator"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ EnableCSEInIRTranslator("enable-cse-in-irtranslator",
+ cl::desc("Should enable CSE in irtranslator"),
+ cl::Optional, cl::init(false));
+char IRTranslator::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(StackProtector)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",
+ false, false)
+
+static void reportTranslationError(MachineFunction &MF,
+ const TargetPassConfig &TPC,
+ OptimizationRemarkEmitter &ORE,
+ OptimizationRemarkMissed &R) {
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (TPC.isGlobalISelAbortEnabled())
+ report_fatal_error(Twine(R.getMsg()));
+ else
+ ORE.emit(R);
+}
+
+IRTranslator::IRTranslator(CodeGenOpt::Level optlevel)
+ : MachineFunctionPass(ID), OptLevel(optlevel) {}
+
+#ifndef NDEBUG
+namespace {
+/// Verify that every instruction created has the same DILocation as the
+/// instruction being translated.
+class DILocationVerifier : public GISelChangeObserver {
+ const Instruction *CurrInst = nullptr;
+
+public:
+ DILocationVerifier() = default;
+ ~DILocationVerifier() = default;
+
+ const Instruction *getCurrentInst() const { return CurrInst; }
+ void setCurrentInst(const Instruction *Inst) { CurrInst = Inst; }
+
+ void erasingInstr(MachineInstr &MI) override {}
+ void changingInstr(MachineInstr &MI) override {}
+ void changedInstr(MachineInstr &MI) override {}
+
+ void createdInstr(MachineInstr &MI) override {
+ assert(getCurrentInst() && "Inserted instruction without a current MI");
+
+ // Only print the check message if we're actually checking it.
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "Checking DILocation from " << *CurrInst
+ << " was copied to " << MI);
+#endif
+ // We allow insts in the entry block to have no debug loc because
+ // they could have originated from constants, and we don't want a jumpy
+ // debug experience.
+ assert((CurrInst->getDebugLoc() == MI.getDebugLoc() ||
+ (MI.getParent()->isEntryBlock() && !MI.getDebugLoc())) &&
+ "Line info was not transferred to all instructions");
+ }
+};
+} // namespace
+#endif // ifndef NDEBUG
+
+
+void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<StackProtector>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<GISelCSEAnalysisWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ if (OptLevel != CodeGenOpt::None) {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+IRTranslator::ValueToVRegInfo::VRegListT &
+IRTranslator::allocateVRegs(const Value &Val) {
+ auto VRegsIt = VMap.findVRegs(Val);
+ if (VRegsIt != VMap.vregs_end())
+ return *VRegsIt->second;
+ auto *Regs = VMap.getVRegs(Val);
+ auto *Offsets = VMap.getOffsets(Val);
+ SmallVector<LLT, 4> SplitTys;
+ computeValueLLTs(*DL, *Val.getType(), SplitTys,
+ Offsets->empty() ? Offsets : nullptr);
+ for (unsigned i = 0; i < SplitTys.size(); ++i)
+ Regs->push_back(0);
+ return *Regs;
+}
+
+ArrayRef<Register> IRTranslator::getOrCreateVRegs(const Value &Val) {
+ auto VRegsIt = VMap.findVRegs(Val);
+ if (VRegsIt != VMap.vregs_end())
+ return *VRegsIt->second;
+
+ if (Val.getType()->isVoidTy())
+ return *VMap.getVRegs(Val);
+
+ // Create entry for this type.
+ auto *VRegs = VMap.getVRegs(Val);
+ auto *Offsets = VMap.getOffsets(Val);
+
+ assert(Val.getType()->isSized() &&
+ "Don't know how to create an empty vreg");
+
+ SmallVector<LLT, 4> SplitTys;
+ computeValueLLTs(*DL, *Val.getType(), SplitTys,
+ Offsets->empty() ? Offsets : nullptr);
+
+ if (!isa<Constant>(Val)) {
+ for (auto Ty : SplitTys)
+ VRegs->push_back(MRI->createGenericVirtualRegister(Ty));
+ return *VRegs;
+ }
+
+ if (Val.getType()->isAggregateType()) {
+ // UndefValue, ConstantAggregateZero
+ auto &C = cast<Constant>(Val);
+ unsigned Idx = 0;
+ while (auto Elt = C.getAggregateElement(Idx++)) {
+ auto EltRegs = getOrCreateVRegs(*Elt);
+ llvm::copy(EltRegs, std::back_inserter(*VRegs));
+ }
+ } else {
+ assert(SplitTys.size() == 1 && "unexpectedly split LLT");
+ VRegs->push_back(MRI->createGenericVirtualRegister(SplitTys[0]));
+ bool Success = translate(cast<Constant>(Val), VRegs->front());
+ if (!Success) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ MF->getFunction().getSubprogram(),
+ &MF->getFunction().getEntryBlock());
+ R << "unable to translate constant: " << ore::NV("Type", Val.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return *VRegs;
+ }
+ }
+
+ return *VRegs;
+}
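+
+// As a sketch of the splitting above: an IR value of type {i64, i32} ends up
+// with two virtual registers, of LLTs s64 and s32, and computeValueLLTs
+// records the offset of each piece alongside them.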
+
+int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) {
+ auto MapEntry = FrameIndices.find(&AI);
+ if (MapEntry != FrameIndices.end())
+ return MapEntry->second;
+
+ uint64_t ElementSize = DL->getTypeAllocSize(AI.getAllocatedType());
+ uint64_t Size =
+ ElementSize * cast<ConstantInt>(AI.getArraySize())->getZExtValue();
+
+ // Always allocate at least one byte.
+ Size = std::max<uint64_t>(Size, 1u);
+
+ int &FI = FrameIndices[&AI];
+ FI = MF->getFrameInfo().CreateStackObject(Size, AI.getAlign(), false, &AI);
+ return FI;
+}
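+
+// For example (a sketch): `%a = alloca i32, i32 4` has ElementSize 4 and an
+// array size of 4, so a 16-byte stack object is created; a zero-sized alloca
+// is still given the minimum one byte.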
+
+Align IRTranslator::getMemOpAlign(const Instruction &I) {
+ if (const StoreInst *SI = dyn_cast<StoreInst>(&I))
+ return SI->getAlign();
+ if (const LoadInst *LI = dyn_cast<LoadInst>(&I))
+ return LI->getAlign();
+ if (const AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I))
+ return AI->getAlign();
+ if (const AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I))
+ return AI->getAlign();
+
+ OptimizationRemarkMissed R("gisel-irtranslator", "", &I);
+ R << "unable to translate memop: " << ore::NV("Opcode", &I);
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return Align(1);
+}
+
+MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) {
+ MachineBasicBlock *&MBB = BBToMBB[&BB];
+ assert(MBB && "BasicBlock was not encountered before");
+ return *MBB;
+}
+
+void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) {
+ assert(NewPred && "new predecessor must be a real MachineBasicBlock");
+ MachinePreds[Edge].push_back(NewPred);
+}
+
+bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // Get or create a virtual register for each value.
+ // Unless the value is a Constant => loadimm cst?
+ // or inline constant each time?
+ // Creation of a virtual register needs to have a size.
+ Register Op0 = getOrCreateVReg(*U.getOperand(0));
+ Register Op1 = getOrCreateVReg(*U.getOperand(1));
+ Register Res = getOrCreateVReg(U);
+ uint32_t Flags = 0;
+ if (isa<Instruction>(U)) {
+ const Instruction &I = cast<Instruction>(U);
+ Flags = MachineInstr::copyFlagsFromInstruction(I);
+ }
+
+ MIRBuilder.buildInstr(Opcode, {Res}, {Op0, Op1}, Flags);
+ return true;
+}
+
+bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ Register Op0 = getOrCreateVReg(*U.getOperand(0));
+ Register Res = getOrCreateVReg(U);
+ uint32_t Flags = 0;
+ if (isa<Instruction>(U)) {
+ const Instruction &I = cast<Instruction>(U);
+ Flags = MachineInstr::copyFlagsFromInstruction(I);
+ }
+ MIRBuilder.buildInstr(Opcode, {Res}, {Op0}, Flags);
+ return true;
+}
+
+bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) {
+ return translateUnaryOp(TargetOpcode::G_FNEG, U, MIRBuilder);
+}
+
+bool IRTranslator::translateCompare(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ auto *CI = dyn_cast<CmpInst>(&U);
+ Register Op0 = getOrCreateVReg(*U.getOperand(0));
+ Register Op1 = getOrCreateVReg(*U.getOperand(1));
+ Register Res = getOrCreateVReg(U);
+ CmpInst::Predicate Pred =
+ CI ? CI->getPredicate() : static_cast<CmpInst::Predicate>(
+ cast<ConstantExpr>(U).getPredicate());
+ if (CmpInst::isIntPredicate(Pred))
+ MIRBuilder.buildICmp(Pred, Res, Op0, Op1);
+ else if (Pred == CmpInst::FCMP_FALSE)
+ MIRBuilder.buildCopy(
+ Res, getOrCreateVReg(*Constant::getNullValue(U.getType())));
+ else if (Pred == CmpInst::FCMP_TRUE)
+ MIRBuilder.buildCopy(
+ Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType())));
+ else {
+ uint32_t Flags = 0;
+ if (CI)
+ Flags = MachineInstr::copyFlagsFromInstruction(*CI);
+ MIRBuilder.buildFCmp(Pred, Res, Op0, Op1, Flags);
+ }
+
+ return true;
+}
+
+bool IRTranslator::translateRet(const User &U, MachineIRBuilder &MIRBuilder) {
+ const ReturnInst &RI = cast<ReturnInst>(U);
+ const Value *Ret = RI.getReturnValue();
+ if (Ret && DL->getTypeStoreSize(Ret->getType()) == 0)
+ Ret = nullptr;
+
+ ArrayRef<Register> VRegs;
+ if (Ret)
+ VRegs = getOrCreateVRegs(*Ret);
+
+ Register SwiftErrorVReg = 0;
+ if (CLI->supportSwiftError() && SwiftError.getFunctionArg()) {
+ SwiftErrorVReg = SwiftError.getOrCreateVRegUseAt(
+ &RI, &MIRBuilder.getMBB(), SwiftError.getFunctionArg());
+ }
+
+  // The target may move the insertion point, but that does not matter
+  // because a return is the last instruction of the block anyway.
+ return CLI->lowerReturn(MIRBuilder, Ret, VRegs, FuncInfo, SwiftErrorVReg);
+}
+
+void IRTranslator::emitBranchForMergedCondition(
+ const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB,
+ BranchProbability TProb, BranchProbability FProb, bool InvertCond) {
+ // If the leaf of the tree is a comparison, merge the condition into
+ // the caseblock.
+ if (const CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
+ CmpInst::Predicate Condition;
+ if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
+ Condition = InvertCond ? IC->getInversePredicate() : IC->getPredicate();
+ } else {
+ const FCmpInst *FC = cast<FCmpInst>(Cond);
+ Condition = InvertCond ? FC->getInversePredicate() : FC->getPredicate();
+ }
+
+ SwitchCG::CaseBlock CB(Condition, false, BOp->getOperand(0),
+ BOp->getOperand(1), nullptr, TBB, FBB, CurBB,
+ CurBuilder->getDebugLoc(), TProb, FProb);
+ SL->SwitchCases.push_back(CB);
+ return;
+ }
+
+ // Create a CaseBlock record representing this branch.
+ CmpInst::Predicate Pred = InvertCond ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ SwitchCG::CaseBlock CB(
+ Pred, false, Cond, ConstantInt::getTrue(MF->getFunction().getContext()),
+ nullptr, TBB, FBB, CurBB, CurBuilder->getDebugLoc(), TProb, FProb);
+ SL->SwitchCases.push_back(CB);
+}
+
+static bool isValInBlock(const Value *V, const BasicBlock *BB) {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() == BB;
+ return true;
+}
+
+void IRTranslator::findMergedConditions(
+ const Value *Cond, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB, MachineBasicBlock *SwitchBB,
+ Instruction::BinaryOps Opc, BranchProbability TProb,
+ BranchProbability FProb, bool InvertCond) {
+ using namespace PatternMatch;
+ assert((Opc == Instruction::And || Opc == Instruction::Or) &&
+ "Expected Opc to be AND/OR");
+  // Skip over a 'not' (it is not part of the tree) and remember to invert the
+  // op and operands at the next level.
+ Value *NotCond;
+ if (match(Cond, m_OneUse(m_Not(m_Value(NotCond)))) &&
+ isValInBlock(NotCond, CurBB->getBasicBlock())) {
+ findMergedConditions(NotCond, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
+ !InvertCond);
+ return;
+ }
+
+ const Instruction *BOp = dyn_cast<Instruction>(Cond);
+ const Value *BOpOp0, *BOpOp1;
+ // Compute the effective opcode for Cond, taking into account whether it needs
+ // to be inverted, e.g.
+ // and (not (or A, B)), C
+ // gets lowered as
+ // and (and (not A, not B), C)
+ Instruction::BinaryOps BOpc = (Instruction::BinaryOps)0;
+ if (BOp) {
+ BOpc = match(BOp, m_LogicalAnd(m_Value(BOpOp0), m_Value(BOpOp1)))
+ ? Instruction::And
+ : (match(BOp, m_LogicalOr(m_Value(BOpOp0), m_Value(BOpOp1)))
+ ? Instruction::Or
+ : (Instruction::BinaryOps)0);
+ if (InvertCond) {
+ if (BOpc == Instruction::And)
+ BOpc = Instruction::Or;
+ else if (BOpc == Instruction::Or)
+ BOpc = Instruction::And;
+ }
+ }
+
+ // If this node is not part of the or/and tree, emit it as a branch.
+ // Note that all nodes in the tree should have same opcode.
+ bool BOpIsInOrAndTree = BOpc && BOpc == Opc && BOp->hasOneUse();
+ if (!BOpIsInOrAndTree || BOp->getParent() != CurBB->getBasicBlock() ||
+ !isValInBlock(BOpOp0, CurBB->getBasicBlock()) ||
+ !isValInBlock(BOpOp1, CurBB->getBasicBlock())) {
+ emitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB, TProb, FProb,
+ InvertCond);
+ return;
+ }
+
+ // Create TmpBB after CurBB.
+ MachineFunction::iterator BBI(CurBB);
+ MachineBasicBlock *TmpBB =
+ MF->CreateMachineBasicBlock(CurBB->getBasicBlock());
+ CurBB->getParent()->insert(++BBI, TmpBB);
+
+ if (Opc == Instruction::Or) {
+ // Codegen X | Y as:
+ // BB1:
+ // jmp_if_X TBB
+ // jmp TmpBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
+ // = TrueProb for original BB.
+ // Assuming the original probabilities are A and B, one choice is to set
+ // BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to
+ // A/(1+B) and 2B/(1+B). This choice assumes that
+ // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
+    // Another choice is to assume TrueProb for BB1 equals TrueProb for TmpBB,
+    // but the math is more complicated.
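+    //
+    // Illustrative example (hypothetical numbers): with A = 0.6 and B = 0.4,
+    // BB1 gets probabilities 0.3 and 0.7, TmpBB gets 3/7 and 4/7, and indeed
+    // 0.3 + 0.7 * 3/7 = 0.6 recovers TrueProb for the original BB.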
+
+ auto NewTrueProb = TProb / 2;
+ auto NewFalseProb = TProb / 2 + FProb;
+ // Emit the LHS condition.
+ findMergedConditions(BOpOp0, TBB, TmpBB, CurBB, SwitchBB, Opc, NewTrueProb,
+ NewFalseProb, InvertCond);
+
+ // Normalize A/2 and B to get A/(1+B) and 2B/(1+B).
+ SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};
+ BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
+ // Emit the RHS condition into TmpBB.
+ findMergedConditions(BOpOp1, TBB, FBB, TmpBB, SwitchBB, Opc, Probs[0],
+ Probs[1], InvertCond);
+ } else {
+ assert(Opc == Instruction::And && "Unknown merge op!");
+ // Codegen X & Y as:
+ // BB1:
+ // jmp_if_X TmpBB
+ // jmp FBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+ // This requires creation of TmpBB after CurBB.
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
+ // = FalseProb for original BB.
+ // Assuming the original probabilities are A and B, one choice is to set
+ // BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to
+ // 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 ==
+ // TrueProb for BB1 * FalseProb for TmpBB.
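+    //
+    // Illustrative example (hypothetical numbers): with A = 0.6 and B = 0.4,
+    // BB1 gets probabilities 0.8 and 0.2, TmpBB gets 0.75 and 0.25, and indeed
+    // 0.2 + 0.8 * 0.25 = 0.4 recovers FalseProb for the original BB.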
+
+ auto NewTrueProb = TProb + FProb / 2;
+ auto NewFalseProb = FProb / 2;
+ // Emit the LHS condition.
+ findMergedConditions(BOpOp0, TmpBB, FBB, CurBB, SwitchBB, Opc, NewTrueProb,
+ NewFalseProb, InvertCond);
+
+ // Normalize A and B/2 to get 2A/(1+A) and B/(1+A).
+ SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};
+ BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
+ // Emit the RHS condition into TmpBB.
+ findMergedConditions(BOpOp1, TBB, FBB, TmpBB, SwitchBB, Opc, Probs[0],
+ Probs[1], InvertCond);
+ }
+}
+
+bool IRTranslator::shouldEmitAsBranches(
+ const std::vector<SwitchCG::CaseBlock> &Cases) {
+ // For multiple cases, it's better to emit as branches.
+ if (Cases.size() != 2)
+ return true;
+
+ // If this is two comparisons of the same values or'd or and'd together, they
+ // will get folded into a single comparison, so don't emit two blocks.
+ if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
+ Cases[0].CmpRHS == Cases[1].CmpRHS) ||
+ (Cases[0].CmpRHS == Cases[1].CmpLHS &&
+ Cases[0].CmpLHS == Cases[1].CmpRHS)) {
+ return false;
+ }
+
+ // Handle: (X != null) | (Y != null) --> (X|Y) != 0
+ // Handle: (X == null) & (Y == null) --> (X|Y) == 0
+ if (Cases[0].CmpRHS == Cases[1].CmpRHS &&
+ Cases[0].PredInfo.Pred == Cases[1].PredInfo.Pred &&
+ isa<Constant>(Cases[0].CmpRHS) &&
+ cast<Constant>(Cases[0].CmpRHS)->isNullValue()) {
+ if (Cases[0].PredInfo.Pred == CmpInst::ICMP_EQ &&
+ Cases[0].TrueBB == Cases[1].ThisBB)
+ return false;
+ if (Cases[0].PredInfo.Pred == CmpInst::ICMP_NE &&
+ Cases[0].FalseBB == Cases[1].ThisBB)
+ return false;
+ }
+
+ return true;
+}
+
+bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
+ const BranchInst &BrInst = cast<BranchInst>(U);
+ auto &CurMBB = MIRBuilder.getMBB();
+ auto *Succ0MBB = &getMBB(*BrInst.getSuccessor(0));
+
+ if (BrInst.isUnconditional()) {
+    // If the unconditional target is the layout successor, fall through.
+ if (OptLevel == CodeGenOpt::None || !CurMBB.isLayoutSuccessor(Succ0MBB))
+ MIRBuilder.buildBr(*Succ0MBB);
+
+ // Link successors.
+ for (const BasicBlock *Succ : successors(&BrInst))
+ CurMBB.addSuccessor(&getMBB(*Succ));
+ return true;
+ }
+
+ // If this condition is one of the special cases we handle, do special stuff
+ // now.
+ const Value *CondVal = BrInst.getCondition();
+ MachineBasicBlock *Succ1MBB = &getMBB(*BrInst.getSuccessor(1));
+
+ const auto &TLI = *MF->getSubtarget().getTargetLowering();
+
+ // If this is a series of conditions that are or'd or and'd together, emit
+ // this as a sequence of branches instead of setcc's with and/or operations.
+ // As long as jumps are not expensive (exceptions for multi-use logic ops,
+ // unpredictable branches, and vector extracts because those jumps are likely
+ // expensive for any target), this should improve performance.
+ // For example, instead of something like:
+ // cmp A, B
+ // C = seteq
+ // cmp D, E
+ // F = setle
+ // or C, F
+ // jnz foo
+ // Emit:
+ // cmp A, B
+ // je foo
+ // cmp D, E
+ // jle foo
+ using namespace PatternMatch;
+ const Instruction *CondI = dyn_cast<Instruction>(CondVal);
+ if (!TLI.isJumpExpensive() && CondI && CondI->hasOneUse() &&
+ !BrInst.hasMetadata(LLVMContext::MD_unpredictable)) {
+ Instruction::BinaryOps Opcode = (Instruction::BinaryOps)0;
+ Value *Vec;
+ const Value *BOp0, *BOp1;
+ if (match(CondI, m_LogicalAnd(m_Value(BOp0), m_Value(BOp1))))
+ Opcode = Instruction::And;
+ else if (match(CondI, m_LogicalOr(m_Value(BOp0), m_Value(BOp1))))
+ Opcode = Instruction::Or;
+
+ if (Opcode && !(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
+ match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value())))) {
+ findMergedConditions(CondI, Succ0MBB, Succ1MBB, &CurMBB, &CurMBB, Opcode,
+ getEdgeProbability(&CurMBB, Succ0MBB),
+ getEdgeProbability(&CurMBB, Succ1MBB),
+ /*InvertCond=*/false);
+ assert(SL->SwitchCases[0].ThisBB == &CurMBB && "Unexpected lowering!");
+
+ // Allow some cases to be rejected.
+ if (shouldEmitAsBranches(SL->SwitchCases)) {
+ // Emit the branch for this block.
+ emitSwitchCase(SL->SwitchCases[0], &CurMBB, *CurBuilder);
+ SL->SwitchCases.erase(SL->SwitchCases.begin());
+ return true;
+ }
+
+      // Okay, we decided not to do this, remove any inserted MBBs and clear
+      // SwitchCases.
+ for (unsigned I = 1, E = SL->SwitchCases.size(); I != E; ++I)
+ MF->erase(SL->SwitchCases[I].ThisBB);
+
+ SL->SwitchCases.clear();
+ }
+ }
+
+ // Create a CaseBlock record representing this branch.
+ SwitchCG::CaseBlock CB(CmpInst::ICMP_EQ, false, CondVal,
+ ConstantInt::getTrue(MF->getFunction().getContext()),
+ nullptr, Succ0MBB, Succ1MBB, &CurMBB,
+ CurBuilder->getDebugLoc());
+
+ // Use emitSwitchCase to actually insert the fast branch sequence for this
+ // cond branch.
+ emitSwitchCase(CB, &CurMBB, *CurBuilder);
+ return true;
+}
+
+void IRTranslator::addSuccessorWithProb(MachineBasicBlock *Src,
+ MachineBasicBlock *Dst,
+ BranchProbability Prob) {
+ if (!FuncInfo.BPI) {
+ Src->addSuccessorWithoutProb(Dst);
+ return;
+ }
+ if (Prob.isUnknown())
+ Prob = getEdgeProbability(Src, Dst);
+ Src->addSuccessor(Dst, Prob);
+}
+
+BranchProbability
+IRTranslator::getEdgeProbability(const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) const {
+ const BasicBlock *SrcBB = Src->getBasicBlock();
+ const BasicBlock *DstBB = Dst->getBasicBlock();
+ if (!FuncInfo.BPI) {
+ // If BPI is not available, set the default probability as 1 / N, where N is
+ // the number of successors.
+ auto SuccSize = std::max<uint32_t>(succ_size(SrcBB), 1);
+ return BranchProbability(1, SuccSize);
+ }
+ return FuncInfo.BPI->getEdgeProbability(SrcBB, DstBB);
+}
+
+bool IRTranslator::translateSwitch(const User &U, MachineIRBuilder &MIB) {
+ using namespace SwitchCG;
+ // Extract cases from the switch.
+ const SwitchInst &SI = cast<SwitchInst>(U);
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ CaseClusterVector Clusters;
+ Clusters.reserve(SI.getNumCases());
+ for (const auto &I : SI.cases()) {
+ MachineBasicBlock *Succ = &getMBB(*I.getCaseSuccessor());
+ assert(Succ && "Could not find successor mbb in mapping");
+ const ConstantInt *CaseVal = I.getCaseValue();
+ BranchProbability Prob =
+ BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex())
+ : BranchProbability(1, SI.getNumCases() + 1);
+ Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob));
+ }
+
+ MachineBasicBlock *DefaultMBB = &getMBB(*SI.getDefaultDest());
+
+ // Cluster adjacent cases with the same destination. We do this at all
+ // optimization levels because it's cheap to do and will make codegen faster
+ // if there are many clusters.
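+  // For instance (illustrative), cases 0, 1 and 2 that all branch to the same
+  // block become a single range cluster [0, 2], while a case 5 with a
+  // different destination stays in its own cluster.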
+ sortAndRangeify(Clusters);
+
+ MachineBasicBlock *SwitchMBB = &getMBB(*SI.getParent());
+
+ // If there is only the default destination, jump there directly.
+ if (Clusters.empty()) {
+ SwitchMBB->addSuccessor(DefaultMBB);
+ if (DefaultMBB != SwitchMBB->getNextNode())
+ MIB.buildBr(*DefaultMBB);
+ return true;
+ }
+
+ SL->findJumpTables(Clusters, &SI, DefaultMBB, nullptr, nullptr);
+ SL->findBitTestClusters(Clusters, &SI);
+
+ LLVM_DEBUG({
+ dbgs() << "Case clusters: ";
+ for (const CaseCluster &C : Clusters) {
+ if (C.Kind == CC_JumpTable)
+ dbgs() << "JT:";
+ if (C.Kind == CC_BitTests)
+ dbgs() << "BT:";
+
+ C.Low->getValue().print(dbgs(), true);
+ if (C.Low != C.High) {
+ dbgs() << '-';
+ C.High->getValue().print(dbgs(), true);
+ }
+ dbgs() << ' ';
+ }
+ dbgs() << '\n';
+ });
+
+ assert(!Clusters.empty());
+ SwitchWorkList WorkList;
+ CaseClusterIt First = Clusters.begin();
+ CaseClusterIt Last = Clusters.end() - 1;
+ auto DefaultProb = getEdgeProbability(SwitchMBB, DefaultMBB);
+ WorkList.push_back({SwitchMBB, First, Last, nullptr, nullptr, DefaultProb});
+
+ // FIXME: At the moment we don't do any splitting optimizations here like
+ // SelectionDAG does, so this worklist only has one entry.
+ while (!WorkList.empty()) {
+ SwitchWorkListItem W = WorkList.pop_back_val();
+ if (!lowerSwitchWorkItem(W, SI.getCondition(), SwitchMBB, DefaultMBB, MIB))
+ return false;
+ }
+ return true;
+}
+
+void IRTranslator::emitJumpTable(SwitchCG::JumpTable &JT,
+ MachineBasicBlock *MBB) {
+ // Emit the code for the jump table
+ assert(JT.Reg != -1U && "Should lower JT Header first!");
+ MachineIRBuilder MIB(*MBB->getParent());
+ MIB.setMBB(*MBB);
+ MIB.setDebugLoc(CurBuilder->getDebugLoc());
+
+ Type *PtrIRTy = Type::getInt8PtrTy(MF->getFunction().getContext());
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+
+ auto Table = MIB.buildJumpTable(PtrTy, JT.JTI);
+ MIB.buildBrJT(Table.getReg(0), JT.JTI, JT.Reg);
+}
+
+bool IRTranslator::emitJumpTableHeader(SwitchCG::JumpTable &JT,
+ SwitchCG::JumpTableHeader &JTH,
+ MachineBasicBlock *HeaderBB) {
+ MachineIRBuilder MIB(*HeaderBB->getParent());
+ MIB.setMBB(*HeaderBB);
+ MIB.setDebugLoc(CurBuilder->getDebugLoc());
+
+ const Value &SValue = *JTH.SValue;
+ // Subtract the lowest switch case value from the value being switched on.
+ const LLT SwitchTy = getLLTForType(*SValue.getType(), *DL);
+ Register SwitchOpReg = getOrCreateVReg(SValue);
+ auto FirstCst = MIB.buildConstant(SwitchTy, JTH.First);
+ auto Sub = MIB.buildSub({SwitchTy}, SwitchOpReg, FirstCst);
+
+  // This value may be smaller or larger than the target's pointer type, and
+  // therefore may require extension or truncation.
+ Type *PtrIRTy = SValue.getType()->getPointerTo();
+ const LLT PtrScalarTy = LLT::scalar(DL->getTypeSizeInBits(PtrIRTy));
+ Sub = MIB.buildZExtOrTrunc(PtrScalarTy, Sub);
+
+ JT.Reg = Sub.getReg(0);
+
+ if (JTH.FallthroughUnreachable) {
+ if (JT.MBB != HeaderBB->getNextNode())
+ MIB.buildBr(*JT.MBB);
+ return true;
+ }
+
+ // Emit the range check for the jump table, and branch to the default block
+ // for the switch statement if the value being switched on exceeds the
+ // largest case in the switch.
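+  // E.g. (illustrative), for cases covering 10..15 the header computes
+  // Sub = SwitchVal - 10 and branches to the default block when Sub >u 5;
+  // otherwise it falls through to the block that indexes the jump table
+  // with Sub.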
+ auto Cst = getOrCreateVReg(
+ *ConstantInt::get(SValue.getType(), JTH.Last - JTH.First));
+ Cst = MIB.buildZExtOrTrunc(PtrScalarTy, Cst).getReg(0);
+ auto Cmp = MIB.buildICmp(CmpInst::ICMP_UGT, LLT::scalar(1), Sub, Cst);
+
+ auto BrCond = MIB.buildBrCond(Cmp.getReg(0), *JT.Default);
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (JT.MBB != HeaderBB->getNextNode())
+ BrCond = MIB.buildBr(*JT.MBB);
+ return true;
+}
+
+void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB,
+ MachineBasicBlock *SwitchBB,
+ MachineIRBuilder &MIB) {
+ Register CondLHS = getOrCreateVReg(*CB.CmpLHS);
+ Register Cond;
+ DebugLoc OldDbgLoc = MIB.getDebugLoc();
+ MIB.setDebugLoc(CB.DbgLoc);
+ MIB.setMBB(*CB.ThisBB);
+
+ if (CB.PredInfo.NoCmp) {
+ // Branch or fall through to TrueBB.
+ addSuccessorWithProb(CB.ThisBB, CB.TrueBB, CB.TrueProb);
+ addMachineCFGPred({SwitchBB->getBasicBlock(), CB.TrueBB->getBasicBlock()},
+ CB.ThisBB);
+ CB.ThisBB->normalizeSuccProbs();
+ if (CB.TrueBB != CB.ThisBB->getNextNode())
+ MIB.buildBr(*CB.TrueBB);
+ MIB.setDebugLoc(OldDbgLoc);
+ return;
+ }
+
+ const LLT i1Ty = LLT::scalar(1);
+ // Build the compare.
+ if (!CB.CmpMHS) {
+ const auto *CI = dyn_cast<ConstantInt>(CB.CmpRHS);
+ // For conditional branch lowering, we might try to do something silly like
+    // emit a G_ICMP to compare an existing G_ICMP i1 result with true. If so,
+ // just re-use the existing condition vreg.
+ if (MRI->getType(CondLHS).getSizeInBits() == 1 && CI && CI->isOne() &&
+ CB.PredInfo.Pred == CmpInst::ICMP_EQ) {
+ Cond = CondLHS;
+ } else {
+ Register CondRHS = getOrCreateVReg(*CB.CmpRHS);
+ if (CmpInst::isFPPredicate(CB.PredInfo.Pred))
+ Cond =
+ MIB.buildFCmp(CB.PredInfo.Pred, i1Ty, CondLHS, CondRHS).getReg(0);
+ else
+ Cond =
+ MIB.buildICmp(CB.PredInfo.Pred, i1Ty, CondLHS, CondRHS).getReg(0);
+ }
+ } else {
+ assert(CB.PredInfo.Pred == CmpInst::ICMP_SLE &&
+ "Can only handle SLE ranges");
+
+ const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
+ const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue();
+
+ Register CmpOpReg = getOrCreateVReg(*CB.CmpMHS);
+ if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
+ Register CondRHS = getOrCreateVReg(*CB.CmpRHS);
+ Cond =
+ MIB.buildICmp(CmpInst::ICMP_SLE, i1Ty, CmpOpReg, CondRHS).getReg(0);
+ } else {
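+      // Lower the signed range check Low <= Val <= High as a single unsigned
+      // comparison: (Val - Low) u<= (High - Low). For example (illustrative),
+      // for the range [5, 9] this becomes (Val - 5) u<= 4; values below Low
+      // wrap around to large unsigned numbers and fail the check.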
+ const LLT CmpTy = MRI->getType(CmpOpReg);
+ auto Sub = MIB.buildSub({CmpTy}, CmpOpReg, CondLHS);
+ auto Diff = MIB.buildConstant(CmpTy, High - Low);
+ Cond = MIB.buildICmp(CmpInst::ICMP_ULE, i1Ty, Sub, Diff).getReg(0);
+ }
+ }
+
+ // Update successor info
+ addSuccessorWithProb(CB.ThisBB, CB.TrueBB, CB.TrueProb);
+
+ addMachineCFGPred({SwitchBB->getBasicBlock(), CB.TrueBB->getBasicBlock()},
+ CB.ThisBB);
+
+ // TrueBB and FalseBB are always different unless the incoming IR is
+ // degenerate. This only happens when running llc on weird IR.
+ if (CB.TrueBB != CB.FalseBB)
+ addSuccessorWithProb(CB.ThisBB, CB.FalseBB, CB.FalseProb);
+ CB.ThisBB->normalizeSuccProbs();
+
+ addMachineCFGPred({SwitchBB->getBasicBlock(), CB.FalseBB->getBasicBlock()},
+ CB.ThisBB);
+
+ MIB.buildBrCond(Cond, *CB.TrueBB);
+ MIB.buildBr(*CB.FalseBB);
+ MIB.setDebugLoc(OldDbgLoc);
+}
+
+bool IRTranslator::lowerJumpTableWorkItem(SwitchCG::SwitchWorkListItem W,
+ MachineBasicBlock *SwitchMBB,
+ MachineBasicBlock *CurMBB,
+ MachineBasicBlock *DefaultMBB,
+ MachineIRBuilder &MIB,
+ MachineFunction::iterator BBI,
+ BranchProbability UnhandledProbs,
+ SwitchCG::CaseClusterIt I,
+ MachineBasicBlock *Fallthrough,
+ bool FallthroughUnreachable) {
+ using namespace SwitchCG;
+ MachineFunction *CurMF = SwitchMBB->getParent();
+ // FIXME: Optimize away range check based on pivot comparisons.
+ JumpTableHeader *JTH = &SL->JTCases[I->JTCasesIndex].first;
+ SwitchCG::JumpTable *JT = &SL->JTCases[I->JTCasesIndex].second;
+ BranchProbability DefaultProb = W.DefaultProb;
+
+ // The jump block hasn't been inserted yet; insert it here.
+ MachineBasicBlock *JumpMBB = JT->MBB;
+ CurMF->insert(BBI, JumpMBB);
+
+ // Since the jump table block is separate from the switch block, we need
+ // to keep track of it as a machine predecessor to the default block,
+ // otherwise we lose the phi edges.
+ addMachineCFGPred({SwitchMBB->getBasicBlock(), DefaultMBB->getBasicBlock()},
+ CurMBB);
+ addMachineCFGPred({SwitchMBB->getBasicBlock(), DefaultMBB->getBasicBlock()},
+ JumpMBB);
+
+ auto JumpProb = I->Prob;
+ auto FallthroughProb = UnhandledProbs;
+
+ // If the default statement is a target of the jump table, we evenly
+ // distribute the default probability to successors of CurMBB. Also
+ // update the probability on the edge from JumpMBB to Fallthrough.
+ for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(),
+ SE = JumpMBB->succ_end();
+ SI != SE; ++SI) {
+ if (*SI == DefaultMBB) {
+ JumpProb += DefaultProb / 2;
+ FallthroughProb -= DefaultProb / 2;
+ JumpMBB->setSuccProbability(SI, DefaultProb / 2);
+ JumpMBB->normalizeSuccProbs();
+ } else {
+      // Also record edges from the jump table block to its successors.
+ addMachineCFGPred({SwitchMBB->getBasicBlock(), (*SI)->getBasicBlock()},
+ JumpMBB);
+ }
+ }
+
+ if (FallthroughUnreachable)
+ JTH->FallthroughUnreachable = true;
+
+ if (!JTH->FallthroughUnreachable)
+ addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb);
+ addSuccessorWithProb(CurMBB, JumpMBB, JumpProb);
+ CurMBB->normalizeSuccProbs();
+
+  // The jump table header will be inserted into our current block; it will do
+  // the range check and fall through to our fallthrough block.
+ JTH->HeaderBB = CurMBB;
+ JT->Default = Fallthrough; // FIXME: Move Default to JumpTableHeader.
+
+ // If we're in the right place, emit the jump table header right now.
+ if (CurMBB == SwitchMBB) {
+ if (!emitJumpTableHeader(*JT, *JTH, CurMBB))
+ return false;
+ JTH->Emitted = true;
+ }
+ return true;
+}
+
+bool IRTranslator::lowerSwitchRangeWorkItem(SwitchCG::CaseClusterIt I,
+ Value *Cond,
+ MachineBasicBlock *Fallthrough,
+ bool FallthroughUnreachable,
+ BranchProbability UnhandledProbs,
+ MachineBasicBlock *CurMBB,
+ MachineIRBuilder &MIB,
+ MachineBasicBlock *SwitchMBB) {
+ using namespace SwitchCG;
+ const Value *RHS, *LHS, *MHS;
+ CmpInst::Predicate Pred;
+ if (I->Low == I->High) {
+ // Check Cond == I->Low.
+ Pred = CmpInst::ICMP_EQ;
+ LHS = Cond;
+ RHS = I->Low;
+ MHS = nullptr;
+ } else {
+ // Check I->Low <= Cond <= I->High.
+ Pred = CmpInst::ICMP_SLE;
+ LHS = I->Low;
+ MHS = Cond;
+ RHS = I->High;
+ }
+
+ // If Fallthrough is unreachable, fold away the comparison.
+ // The false probability is the sum of all unhandled cases.
+ CaseBlock CB(Pred, FallthroughUnreachable, LHS, RHS, MHS, I->MBB, Fallthrough,
+ CurMBB, MIB.getDebugLoc(), I->Prob, UnhandledProbs);
+
+ emitSwitchCase(CB, SwitchMBB, MIB);
+ return true;
+}
+
+void IRTranslator::emitBitTestHeader(SwitchCG::BitTestBlock &B,
+ MachineBasicBlock *SwitchBB) {
+ MachineIRBuilder &MIB = *CurBuilder;
+ MIB.setMBB(*SwitchBB);
+
+ // Subtract the minimum value.
+ Register SwitchOpReg = getOrCreateVReg(*B.SValue);
+
+ LLT SwitchOpTy = MRI->getType(SwitchOpReg);
+ Register MinValReg = MIB.buildConstant(SwitchOpTy, B.First).getReg(0);
+ auto RangeSub = MIB.buildSub(SwitchOpTy, SwitchOpReg, MinValReg);
+
+ Type *PtrIRTy = Type::getInt8PtrTy(MF->getFunction().getContext());
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+
+ LLT MaskTy = SwitchOpTy;
+ if (MaskTy.getSizeInBits() > PtrTy.getSizeInBits() ||
+ !llvm::has_single_bit<uint32_t>(MaskTy.getSizeInBits()))
+ MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ else {
+ // Ensure that the type will fit the mask value.
+ for (unsigned I = 0, E = B.Cases.size(); I != E; ++I) {
+ if (!isUIntN(SwitchOpTy.getSizeInBits(), B.Cases[I].Mask)) {
+        // Switch table case ranges are encoded into a series of masks.
+        // Just use the pointer type; it's guaranteed to fit.
+ MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ break;
+ }
+ }
+ }
+ Register SubReg = RangeSub.getReg(0);
+ if (SwitchOpTy != MaskTy)
+ SubReg = MIB.buildZExtOrTrunc(MaskTy, SubReg).getReg(0);
+
+ B.RegVT = getMVTForLLT(MaskTy);
+ B.Reg = SubReg;
+
+ MachineBasicBlock *MBB = B.Cases[0].ThisBB;
+
+ if (!B.FallthroughUnreachable)
+ addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
+ addSuccessorWithProb(SwitchBB, MBB, B.Prob);
+
+ SwitchBB->normalizeSuccProbs();
+
+ if (!B.FallthroughUnreachable) {
+ // Conditional branch to the default block.
+ auto RangeCst = MIB.buildConstant(SwitchOpTy, B.Range);
+ auto RangeCmp = MIB.buildICmp(CmpInst::Predicate::ICMP_UGT, LLT::scalar(1),
+ RangeSub, RangeCst);
+ MIB.buildBrCond(RangeCmp, *B.Default);
+ }
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (MBB != SwitchBB->getNextNode())
+ MIB.buildBr(*MBB);
+}
+
+void IRTranslator::emitBitTestCase(SwitchCG::BitTestBlock &BB,
+ MachineBasicBlock *NextMBB,
+ BranchProbability BranchProbToNext,
+ Register Reg, SwitchCG::BitTestCase &B,
+ MachineBasicBlock *SwitchBB) {
+ MachineIRBuilder &MIB = *CurBuilder;
+ MIB.setMBB(*SwitchBB);
+
+ LLT SwitchTy = getLLTForMVT(BB.RegVT);
+ Register Cmp;
+ unsigned PopCount = llvm::popcount(B.Mask);
+ if (PopCount == 1) {
+ // Testing for a single bit; just compare the shift count with what it
+ // would need to be to shift a 1 bit in that position.
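+    // For example (illustrative), a mask of 0b1000 has three trailing zeros,
+    // so the test reduces to checking Reg == 3.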
+ auto MaskTrailingZeros =
+ MIB.buildConstant(SwitchTy, llvm::countr_zero(B.Mask));
+ Cmp =
+ MIB.buildICmp(ICmpInst::ICMP_EQ, LLT::scalar(1), Reg, MaskTrailingZeros)
+ .getReg(0);
+ } else if (PopCount == BB.Range) {
+ // There is only one zero bit in the range, test for it directly.
+ auto MaskTrailingOnes =
+ MIB.buildConstant(SwitchTy, llvm::countr_one(B.Mask));
+ Cmp = MIB.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Reg, MaskTrailingOnes)
+ .getReg(0);
+ } else {
+ // Make desired shift.
+ auto CstOne = MIB.buildConstant(SwitchTy, 1);
+ auto SwitchVal = MIB.buildShl(SwitchTy, CstOne, Reg);
+
+ // Emit bit tests and jumps.
+ auto CstMask = MIB.buildConstant(SwitchTy, B.Mask);
+ auto AndOp = MIB.buildAnd(SwitchTy, SwitchVal, CstMask);
+ auto CstZero = MIB.buildConstant(SwitchTy, 0);
+ Cmp = MIB.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), AndOp, CstZero)
+ .getReg(0);
+ }
+
+ // The branch probability from SwitchBB to B.TargetBB is B.ExtraProb.
+ addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb);
+ // The branch probability from SwitchBB to NextMBB is BranchProbToNext.
+ addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext);
+  // The sum of B.ExtraProb and BranchProbToNext is not guaranteed to be one,
+  // as they are relative probabilities (and thus work more like weights), so
+  // we need to normalize them to make them sum to one.
+ SwitchBB->normalizeSuccProbs();
+
+ // Record the fact that the IR edge from the header to the bit test target
+  // will go through our new block. Needed for PHIs to have nodes added.
+ addMachineCFGPred({BB.Parent->getBasicBlock(), B.TargetBB->getBasicBlock()},
+ SwitchBB);
+
+ MIB.buildBrCond(Cmp, *B.TargetBB);
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (NextMBB != SwitchBB->getNextNode())
+ MIB.buildBr(*NextMBB);
+}
+
+bool IRTranslator::lowerBitTestWorkItem(
+ SwitchCG::SwitchWorkListItem W, MachineBasicBlock *SwitchMBB,
+ MachineBasicBlock *CurMBB, MachineBasicBlock *DefaultMBB,
+ MachineIRBuilder &MIB, MachineFunction::iterator BBI,
+ BranchProbability DefaultProb, BranchProbability UnhandledProbs,
+ SwitchCG::CaseClusterIt I, MachineBasicBlock *Fallthrough,
+ bool FallthroughUnreachable) {
+ using namespace SwitchCG;
+ MachineFunction *CurMF = SwitchMBB->getParent();
+ // FIXME: Optimize away range check based on pivot comparisons.
+ BitTestBlock *BTB = &SL->BitTestCases[I->BTCasesIndex];
+ // The bit test blocks haven't been inserted yet; insert them here.
+ for (BitTestCase &BTC : BTB->Cases)
+ CurMF->insert(BBI, BTC.ThisBB);
+
+ // Fill in fields of the BitTestBlock.
+ BTB->Parent = CurMBB;
+ BTB->Default = Fallthrough;
+
+ BTB->DefaultProb = UnhandledProbs;
+  // If the cases in the bit test don't form a contiguous range, we evenly
+  // distribute the probability of the edge to Fallthrough between the two
+  // successors of CurMBB.
+ if (!BTB->ContiguousRange) {
+ BTB->Prob += DefaultProb / 2;
+ BTB->DefaultProb -= DefaultProb / 2;
+ }
+
+ if (FallthroughUnreachable)
+ BTB->FallthroughUnreachable = true;
+
+ // If we're in the right place, emit the bit test header right now.
+ if (CurMBB == SwitchMBB) {
+ emitBitTestHeader(*BTB, SwitchMBB);
+ BTB->Emitted = true;
+ }
+ return true;
+}
+
+bool IRTranslator::lowerSwitchWorkItem(SwitchCG::SwitchWorkListItem W,
+ Value *Cond,
+ MachineBasicBlock *SwitchMBB,
+ MachineBasicBlock *DefaultMBB,
+ MachineIRBuilder &MIB) {
+ using namespace SwitchCG;
+ MachineFunction *CurMF = FuncInfo.MF;
+ MachineBasicBlock *NextMBB = nullptr;
+ MachineFunction::iterator BBI(W.MBB);
+ if (++BBI != FuncInfo.MF->end())
+ NextMBB = &*BBI;
+
+ if (EnableOpts) {
+ // Here, we order cases by probability so the most likely case will be
+ // checked first. However, two clusters can have the same probability in
+ // which case their relative ordering is non-deterministic. So we use Low
+ // as a tie-breaker as clusters are guaranteed to never overlap.
+ llvm::sort(W.FirstCluster, W.LastCluster + 1,
+ [](const CaseCluster &a, const CaseCluster &b) {
+ return a.Prob != b.Prob
+ ? a.Prob > b.Prob
+ : a.Low->getValue().slt(b.Low->getValue());
+ });
+
+ // Rearrange the case blocks so that the last one falls through if possible
+ // without changing the order of probabilities.
+ for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster;) {
+ --I;
+ if (I->Prob > W.LastCluster->Prob)
+ break;
+ if (I->Kind == CC_Range && I->MBB == NextMBB) {
+ std::swap(*I, *W.LastCluster);
+ break;
+ }
+ }
+ }
+
+ // Compute total probability.
+ BranchProbability DefaultProb = W.DefaultProb;
+ BranchProbability UnhandledProbs = DefaultProb;
+ for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I)
+ UnhandledProbs += I->Prob;
+
+ MachineBasicBlock *CurMBB = W.MBB;
+ for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) {
+ bool FallthroughUnreachable = false;
+ MachineBasicBlock *Fallthrough;
+ if (I == W.LastCluster) {
+ // For the last cluster, fall through to the default destination.
+ Fallthrough = DefaultMBB;
+ FallthroughUnreachable = isa<UnreachableInst>(
+ DefaultMBB->getBasicBlock()->getFirstNonPHIOrDbg());
+ } else {
+ Fallthrough = CurMF->CreateMachineBasicBlock(CurMBB->getBasicBlock());
+ CurMF->insert(BBI, Fallthrough);
+ }
+ UnhandledProbs -= I->Prob;
+
+ switch (I->Kind) {
+ case CC_BitTests: {
+ if (!lowerBitTestWorkItem(W, SwitchMBB, CurMBB, DefaultMBB, MIB, BBI,
+ DefaultProb, UnhandledProbs, I, Fallthrough,
+ FallthroughUnreachable)) {
+ LLVM_DEBUG(dbgs() << "Failed to lower bit test for switch");
+ return false;
+ }
+ break;
+ }
+
+ case CC_JumpTable: {
+ if (!lowerJumpTableWorkItem(W, SwitchMBB, CurMBB, DefaultMBB, MIB, BBI,
+ UnhandledProbs, I, Fallthrough,
+ FallthroughUnreachable)) {
+ LLVM_DEBUG(dbgs() << "Failed to lower jump table");
+ return false;
+ }
+ break;
+ }
+ case CC_Range: {
+ if (!lowerSwitchRangeWorkItem(I, Cond, Fallthrough,
+ FallthroughUnreachable, UnhandledProbs,
+ CurMBB, MIB, SwitchMBB)) {
+ LLVM_DEBUG(dbgs() << "Failed to lower switch range");
+ return false;
+ }
+ break;
+ }
+ }
+ CurMBB = Fallthrough;
+ }
+
+ return true;
+}
+
+bool IRTranslator::translateIndirectBr(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const IndirectBrInst &BrInst = cast<IndirectBrInst>(U);
+
+ const Register Tgt = getOrCreateVReg(*BrInst.getAddress());
+ MIRBuilder.buildBrIndirect(Tgt);
+
+ // Link successors.
+ SmallPtrSet<const BasicBlock *, 32> AddedSuccessors;
+ MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+ for (const BasicBlock *Succ : successors(&BrInst)) {
+ // It's legal for indirectbr instructions to have duplicate blocks in the
+ // destination list. We don't allow this in MIR. Skip anything that's
+ // already a successor.
+ if (!AddedSuccessors.insert(Succ).second)
+ continue;
+ CurBB.addSuccessor(&getMBB(*Succ));
+ }
+
+ return true;
+}
+
+static bool isSwiftError(const Value *V) {
+ if (auto Arg = dyn_cast<Argument>(V))
+ return Arg->hasSwiftErrorAttr();
+ if (auto AI = dyn_cast<AllocaInst>(V))
+ return AI->isSwiftError();
+ return false;
+}
+
+bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
+ const LoadInst &LI = cast<LoadInst>(U);
+
+ unsigned StoreSize = DL->getTypeStoreSize(LI.getType());
+ if (StoreSize == 0)
+ return true;
+
+ ArrayRef<Register> Regs = getOrCreateVRegs(LI);
+ ArrayRef<uint64_t> Offsets = *VMap.getOffsets(LI);
+ Register Base = getOrCreateVReg(*LI.getPointerOperand());
+ AAMDNodes AAInfo = LI.getAAMetadata();
+
+ const Value *Ptr = LI.getPointerOperand();
+ Type *OffsetIRTy = DL->getIndexType(Ptr->getType());
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+
+ if (CLI->supportSwiftError() && isSwiftError(Ptr)) {
+ assert(Regs.size() == 1 && "swifterror should be single pointer");
+ Register VReg =
+ SwiftError.getOrCreateVRegUseAt(&LI, &MIRBuilder.getMBB(), Ptr);
+ MIRBuilder.buildCopy(Regs[0], VReg);
+ return true;
+ }
+
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ MachineMemOperand::Flags Flags =
+ TLI.getLoadMemOperandFlags(LI, *DL, AC, LibInfo);
+ if (AA && !(Flags & MachineMemOperand::MOInvariant)) {
+ if (AA->pointsToConstantMemory(
+ MemoryLocation(Ptr, LocationSize::precise(StoreSize), AAInfo))) {
+ Flags |= MachineMemOperand::MOInvariant;
+ }
+ }
+
+ const MDNode *Ranges =
+ Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr;
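+  // Aggregate loads are split into one load per leaf value. The offsets from
+  // the value map are in bits, so e.g. (illustrative) a load of {i32, i32}
+  // produces two loads, at byte offsets 0 and 4 (bit offsets 0 and 32).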
+ for (unsigned i = 0; i < Regs.size(); ++i) {
+ Register Addr;
+ MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
+
+ MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8);
+ Align BaseAlign = getMemOpAlign(LI);
+ auto MMO = MF->getMachineMemOperand(
+ Ptr, Flags, MRI->getType(Regs[i]),
+ commonAlignment(BaseAlign, Offsets[i] / 8), AAInfo, Ranges,
+ LI.getSyncScopeID(), LI.getOrdering());
+ MIRBuilder.buildLoad(Regs[i], Addr, *MMO);
+ }
+
+ return true;
+}
+
+bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
+ const StoreInst &SI = cast<StoreInst>(U);
+ if (DL->getTypeStoreSize(SI.getValueOperand()->getType()) == 0)
+ return true;
+
+ ArrayRef<Register> Vals = getOrCreateVRegs(*SI.getValueOperand());
+ ArrayRef<uint64_t> Offsets = *VMap.getOffsets(*SI.getValueOperand());
+ Register Base = getOrCreateVReg(*SI.getPointerOperand());
+
+ Type *OffsetIRTy = DL->getIndexType(SI.getPointerOperandType());
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+
+ if (CLI->supportSwiftError() && isSwiftError(SI.getPointerOperand())) {
+ assert(Vals.size() == 1 && "swifterror should be single pointer");
+
+ Register VReg = SwiftError.getOrCreateVRegDefAt(&SI, &MIRBuilder.getMBB(),
+ SI.getPointerOperand());
+ MIRBuilder.buildCopy(VReg, Vals[0]);
+ return true;
+ }
+
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ MachineMemOperand::Flags Flags = TLI.getStoreMemOperandFlags(SI, *DL);
+
+ for (unsigned i = 0; i < Vals.size(); ++i) {
+ Register Addr;
+ MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
+
+ MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8);
+ Align BaseAlign = getMemOpAlign(SI);
+ auto MMO = MF->getMachineMemOperand(
+ Ptr, Flags, MRI->getType(Vals[i]),
+ commonAlignment(BaseAlign, Offsets[i] / 8), SI.getAAMetadata(), nullptr,
+ SI.getSyncScopeID(), SI.getOrdering());
+ MIRBuilder.buildStore(Vals[i], Addr, *MMO);
+ }
+ return true;
+}
+
+static uint64_t getOffsetFromIndices(const User &U, const DataLayout &DL) {
+ const Value *Src = U.getOperand(0);
+ Type *Int32Ty = Type::getInt32Ty(U.getContext());
+
+ // getIndexedOffsetInType is designed for GEPs, so the first index is the
+ // usual array element rather than looking into the actual aggregate.
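+  // For example (illustrative), extractvalue of field 1 from a {i32, i64}
+  // aggregate yields indices [0, 1]; with a typical data layout where i64 is
+  // 8-byte aligned, the byte offset is 8 and the returned bit offset is 64.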
+ SmallVector<Value *, 1> Indices;
+ Indices.push_back(ConstantInt::get(Int32Ty, 0));
+
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(&U)) {
+ for (auto Idx : EVI->indices())
+ Indices.push_back(ConstantInt::get(Int32Ty, Idx));
+ } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(&U)) {
+ for (auto Idx : IVI->indices())
+ Indices.push_back(ConstantInt::get(Int32Ty, Idx));
+ } else {
+ for (unsigned i = 1; i < U.getNumOperands(); ++i)
+ Indices.push_back(U.getOperand(i));
+ }
+
+ return 8 * static_cast<uint64_t>(
+ DL.getIndexedOffsetInType(Src->getType(), Indices));
+}
+
+bool IRTranslator::translateExtractValue(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const Value *Src = U.getOperand(0);
+ uint64_t Offset = getOffsetFromIndices(U, *DL);
+ ArrayRef<Register> SrcRegs = getOrCreateVRegs(*Src);
+ ArrayRef<uint64_t> Offsets = *VMap.getOffsets(*Src);
+ unsigned Idx = llvm::lower_bound(Offsets, Offset) - Offsets.begin();
+ auto &DstRegs = allocateVRegs(U);
+
+ for (unsigned i = 0; i < DstRegs.size(); ++i)
+ DstRegs[i] = SrcRegs[Idx++];
+
+ return true;
+}
+
+bool IRTranslator::translateInsertValue(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const Value *Src = U.getOperand(0);
+ uint64_t Offset = getOffsetFromIndices(U, *DL);
+ auto &DstRegs = allocateVRegs(U);
+ ArrayRef<uint64_t> DstOffsets = *VMap.getOffsets(U);
+ ArrayRef<Register> SrcRegs = getOrCreateVRegs(*Src);
+ ArrayRef<Register> InsertedRegs = getOrCreateVRegs(*U.getOperand(1));
+ auto *InsertedIt = InsertedRegs.begin();
+
+ for (unsigned i = 0; i < DstRegs.size(); ++i) {
+ if (DstOffsets[i] >= Offset && InsertedIt != InsertedRegs.end())
+ DstRegs[i] = *InsertedIt++;
+ else
+ DstRegs[i] = SrcRegs[i];
+ }
+
+ return true;
+}
+
+bool IRTranslator::translateSelect(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ Register Tst = getOrCreateVReg(*U.getOperand(0));
+ ArrayRef<Register> ResRegs = getOrCreateVRegs(U);
+ ArrayRef<Register> Op0Regs = getOrCreateVRegs(*U.getOperand(1));
+ ArrayRef<Register> Op1Regs = getOrCreateVRegs(*U.getOperand(2));
+
+ uint32_t Flags = 0;
+ if (const SelectInst *SI = dyn_cast<SelectInst>(&U))
+ Flags = MachineInstr::copyFlagsFromInstruction(*SI);
+
+ for (unsigned i = 0; i < ResRegs.size(); ++i) {
+ MIRBuilder.buildSelect(ResRegs[i], Tst, Op0Regs[i], Op1Regs[i], Flags);
+ }
+
+ return true;
+}
+
+bool IRTranslator::translateCopy(const User &U, const Value &V,
+ MachineIRBuilder &MIRBuilder) {
+ Register Src = getOrCreateVReg(V);
+ auto &Regs = *VMap.getVRegs(U);
+ if (Regs.empty()) {
+ Regs.push_back(Src);
+ VMap.getOffsets(U)->push_back(0);
+ } else {
+ // If we already assigned a vreg for this instruction, we can't change that.
+ // Emit a copy to satisfy the users we already emitted.
+ MIRBuilder.buildCopy(Regs[0], Src);
+ }
+ return true;
+}
+
+bool IRTranslator::translateBitCast(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // If we're bitcasting to the source type, we can reuse the source vreg.
+ if (getLLTForType(*U.getOperand(0)->getType(), *DL) ==
+ getLLTForType(*U.getType(), *DL)) {
+ // If the source is a ConstantInt then it was probably created by
+ // ConstantHoisting and we should leave it alone.
+ if (isa<ConstantInt>(U.getOperand(0)))
+ return translateCast(TargetOpcode::G_CONSTANT_FOLD_BARRIER, U,
+ MIRBuilder);
+ return translateCopy(U, *U.getOperand(0), MIRBuilder);
+ }
+
+ return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);
+}
+
+bool IRTranslator::translateCast(unsigned Opcode, const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ Register Op = getOrCreateVReg(*U.getOperand(0));
+ Register Res = getOrCreateVReg(U);
+ MIRBuilder.buildInstr(Opcode, {Res}, {Op});
+ return true;
+}
+
+bool IRTranslator::translateGetElementPtr(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ Value &Op0 = *U.getOperand(0);
+ Register BaseReg = getOrCreateVReg(Op0);
+ Type *PtrIRTy = Op0.getType();
+ LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ Type *OffsetIRTy = DL->getIndexType(PtrIRTy);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+
+  // Normalize a vector GEP: all scalar operands should be converted to
+  // splat vectors.
+ unsigned VectorWidth = 0;
+
+ // True if we should use a splat vector; using VectorWidth alone is not
+ // sufficient.
+ bool WantSplatVector = false;
+ if (auto *VT = dyn_cast<VectorType>(U.getType())) {
+ VectorWidth = cast<FixedVectorType>(VT)->getNumElements();
+ // We don't produce 1 x N vectors; those are treated as scalars.
+ WantSplatVector = VectorWidth > 1;
+ }
+
+ // We might need to splat the base pointer into a vector if the offsets
+ // are vectors.
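+  // E.g. (illustrative), a GEP of a scalar pointer with a <4 x i64> index
+  // produces a <4 x ptr> result, so the base is first splat to a vector of
+  // pointers before the per-lane address arithmetic.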
+ if (WantSplatVector && !PtrTy.isVector()) {
+ BaseReg =
+ MIRBuilder
+ .buildSplatVector(LLT::fixed_vector(VectorWidth, PtrTy), BaseReg)
+ .getReg(0);
+ PtrIRTy = FixedVectorType::get(PtrIRTy, VectorWidth);
+ PtrTy = getLLTForType(*PtrIRTy, *DL);
+ OffsetIRTy = DL->getIndexType(PtrIRTy);
+ OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ }
+
+ int64_t Offset = 0;
+ for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (StructType *StTy = GTI.getStructTypeOrNull()) {
+ unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
+ Offset += DL->getStructLayout(StTy)->getElementOffset(Field);
+ continue;
+ } else {
+ uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+
+ // If this is a scalar constant or a splat vector of constants,
+ // handle it quickly.
+ if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+ Offset += ElementSize * CI->getSExtValue();
+ continue;
+ }
+
+ if (Offset != 0) {
+ auto OffsetMIB = MIRBuilder.buildConstant({OffsetTy}, Offset);
+ BaseReg = MIRBuilder.buildPtrAdd(PtrTy, BaseReg, OffsetMIB.getReg(0))
+ .getReg(0);
+ Offset = 0;
+ }
+
+ Register IdxReg = getOrCreateVReg(*Idx);
+ LLT IdxTy = MRI->getType(IdxReg);
+ if (IdxTy != OffsetTy) {
+ if (!IdxTy.isVector() && WantSplatVector) {
+ IdxReg = MIRBuilder.buildSplatVector(
+ OffsetTy.changeElementType(IdxTy), IdxReg).getReg(0);
+ }
+
+ IdxReg = MIRBuilder.buildSExtOrTrunc(OffsetTy, IdxReg).getReg(0);
+ }
+
+ // N = N + Idx * ElementSize;
+ // Avoid doing it for ElementSize of 1.
+ Register GepOffsetReg;
+ if (ElementSize != 1) {
+ auto ElementSizeMIB = MIRBuilder.buildConstant(
+ getLLTForType(*OffsetIRTy, *DL), ElementSize);
+ GepOffsetReg =
+ MIRBuilder.buildMul(OffsetTy, IdxReg, ElementSizeMIB).getReg(0);
+ } else
+ GepOffsetReg = IdxReg;
+
+ BaseReg = MIRBuilder.buildPtrAdd(PtrTy, BaseReg, GepOffsetReg).getReg(0);
+ }
+ }
+
+ if (Offset != 0) {
+ auto OffsetMIB =
+ MIRBuilder.buildConstant(OffsetTy, Offset);
+ MIRBuilder.buildPtrAdd(getOrCreateVReg(U), BaseReg, OffsetMIB.getReg(0));
+ return true;
+ }
+
+ MIRBuilder.buildCopy(getOrCreateVReg(U), BaseReg);
+ return true;
+}
+
+bool IRTranslator::translateMemFunc(const CallInst &CI,
+ MachineIRBuilder &MIRBuilder,
+ unsigned Opcode) {
+ const Value *SrcPtr = CI.getArgOperand(1);
+ // If the source is undef, then just emit a nop.
+ if (isa<UndefValue>(SrcPtr))
+ return true;
+
+ SmallVector<Register, 3> SrcRegs;
+
+ unsigned MinPtrSize = UINT_MAX;
+ for (auto AI = CI.arg_begin(), AE = CI.arg_end(); std::next(AI) != AE; ++AI) {
+ Register SrcReg = getOrCreateVReg(**AI);
+ LLT SrcTy = MRI->getType(SrcReg);
+ if (SrcTy.isPointer())
+ MinPtrSize = std::min<unsigned>(SrcTy.getSizeInBits(), MinPtrSize);
+ SrcRegs.push_back(SrcReg);
+ }
+
+ LLT SizeTy = LLT::scalar(MinPtrSize);
+
+ // The size operand should be the minimum of the pointer sizes.
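+  // E.g. (illustrative), for a copy between a 64-bit and a 32-bit address
+  // space, the length operand is zero-extended or truncated to s32 here.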
+ Register &SizeOpReg = SrcRegs[SrcRegs.size() - 1];
+ if (MRI->getType(SizeOpReg) != SizeTy)
+ SizeOpReg = MIRBuilder.buildZExtOrTrunc(SizeTy, SizeOpReg).getReg(0);
+
+ auto ICall = MIRBuilder.buildInstr(Opcode);
+ for (Register SrcReg : SrcRegs)
+ ICall.addUse(SrcReg);
+
+ Align DstAlign;
+ Align SrcAlign;
+ unsigned IsVol =
+ cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1))->getZExtValue();
+
+ ConstantInt *CopySize = nullptr;
+
+ if (auto *MCI = dyn_cast<MemCpyInst>(&CI)) {
+ DstAlign = MCI->getDestAlign().valueOrOne();
+ SrcAlign = MCI->getSourceAlign().valueOrOne();
+ CopySize = dyn_cast<ConstantInt>(MCI->getArgOperand(2));
+ } else if (auto *MCI = dyn_cast<MemCpyInlineInst>(&CI)) {
+ DstAlign = MCI->getDestAlign().valueOrOne();
+ SrcAlign = MCI->getSourceAlign().valueOrOne();
+ CopySize = dyn_cast<ConstantInt>(MCI->getArgOperand(2));
+ } else if (auto *MMI = dyn_cast<MemMoveInst>(&CI)) {
+ DstAlign = MMI->getDestAlign().valueOrOne();
+ SrcAlign = MMI->getSourceAlign().valueOrOne();
+ CopySize = dyn_cast<ConstantInt>(MMI->getArgOperand(2));
+ } else {
+ auto *MSI = cast<MemSetInst>(&CI);
+ DstAlign = MSI->getDestAlign().valueOrOne();
+ }
+
+ if (Opcode != TargetOpcode::G_MEMCPY_INLINE) {
+ // We need to propagate the tail call flag from the IR inst as an argument.
+ // Otherwise, we have to pessimize and assume later that we cannot tail call
+ // any memory intrinsics.
+ ICall.addImm(CI.isTailCall() ? 1 : 0);
+ }
+
+ // Create mem operands to store the alignment and volatile info.
+ MachineMemOperand::Flags LoadFlags = MachineMemOperand::MOLoad;
+ MachineMemOperand::Flags StoreFlags = MachineMemOperand::MOStore;
+ if (IsVol) {
+ LoadFlags |= MachineMemOperand::MOVolatile;
+ StoreFlags |= MachineMemOperand::MOVolatile;
+ }
+
+ AAMDNodes AAInfo = CI.getAAMetadata();
+ if (AA && CopySize &&
+ AA->pointsToConstantMemory(MemoryLocation(
+ SrcPtr, LocationSize::precise(CopySize->getZExtValue()), AAInfo))) {
+ LoadFlags |= MachineMemOperand::MOInvariant;
+
+ // FIXME: pointsToConstantMemory probably does not imply dereferenceable,
+ // but the previous usage implied it did. Probably should check
+ // isDereferenceableAndAlignedPointer.
+ LoadFlags |= MachineMemOperand::MODereferenceable;
+ }
+
+ ICall.addMemOperand(
+ MF->getMachineMemOperand(MachinePointerInfo(CI.getArgOperand(0)),
+ StoreFlags, 1, DstAlign, AAInfo));
+ if (Opcode != TargetOpcode::G_MEMSET)
+ ICall.addMemOperand(MF->getMachineMemOperand(
+ MachinePointerInfo(SrcPtr), LoadFlags, 1, SrcAlign, AAInfo));
+
+ return true;
+}
+
+void IRTranslator::getStackGuard(Register DstReg,
+ MachineIRBuilder &MIRBuilder) {
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF));
+ auto MIB =
+ MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {});
+
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ Value *Global = TLI.getSDagStackGuard(*MF->getFunction().getParent());
+ if (!Global)
+ return;
+
+ unsigned AddrSpace = Global->getType()->getPointerAddressSpace();
+ LLT PtrTy = LLT::pointer(AddrSpace, DL->getPointerSizeInBits(AddrSpace));
+
+ MachinePointerInfo MPInfo(Global);
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable;
+ MachineMemOperand *MemRef = MF->getMachineMemOperand(
+ MPInfo, Flags, PtrTy, DL->getPointerABIAlignment(AddrSpace));
+ MIB.setMemRefs({MemRef});
+}
+
+bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
+ MachineIRBuilder &MIRBuilder) {
+ ArrayRef<Register> ResRegs = getOrCreateVRegs(CI);
+ MIRBuilder.buildInstr(
+ Op, {ResRegs[0], ResRegs[1]},
+ {getOrCreateVReg(*CI.getOperand(0)), getOrCreateVReg(*CI.getOperand(1))});
+
+ return true;
+}
+
+bool IRTranslator::translateFixedPointIntrinsic(unsigned Op, const CallInst &CI,
+ MachineIRBuilder &MIRBuilder) {
+ Register Dst = getOrCreateVReg(CI);
+ Register Src0 = getOrCreateVReg(*CI.getOperand(0));
+ Register Src1 = getOrCreateVReg(*CI.getOperand(1));
+ uint64_t Scale = cast<ConstantInt>(CI.getOperand(2))->getZExtValue();
+ MIRBuilder.buildInstr(Op, {Dst}, { Src0, Src1, Scale });
+ return true;
+}
+
+unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
+ switch (ID) {
+ default:
+ break;
+ case Intrinsic::bswap:
+ return TargetOpcode::G_BSWAP;
+ case Intrinsic::bitreverse:
+ return TargetOpcode::G_BITREVERSE;
+ case Intrinsic::fshl:
+ return TargetOpcode::G_FSHL;
+ case Intrinsic::fshr:
+ return TargetOpcode::G_FSHR;
+ case Intrinsic::ceil:
+ return TargetOpcode::G_FCEIL;
+ case Intrinsic::cos:
+ return TargetOpcode::G_FCOS;
+ case Intrinsic::ctpop:
+ return TargetOpcode::G_CTPOP;
+ case Intrinsic::exp:
+ return TargetOpcode::G_FEXP;
+ case Intrinsic::exp2:
+ return TargetOpcode::G_FEXP2;
+ case Intrinsic::fabs:
+ return TargetOpcode::G_FABS;
+ case Intrinsic::copysign:
+ return TargetOpcode::G_FCOPYSIGN;
+ case Intrinsic::minnum:
+ return TargetOpcode::G_FMINNUM;
+ case Intrinsic::maxnum:
+ return TargetOpcode::G_FMAXNUM;
+ case Intrinsic::minimum:
+ return TargetOpcode::G_FMINIMUM;
+ case Intrinsic::maximum:
+ return TargetOpcode::G_FMAXIMUM;
+ case Intrinsic::canonicalize:
+ return TargetOpcode::G_FCANONICALIZE;
+ case Intrinsic::floor:
+ return TargetOpcode::G_FFLOOR;
+ case Intrinsic::fma:
+ return TargetOpcode::G_FMA;
+ case Intrinsic::log:
+ return TargetOpcode::G_FLOG;
+ case Intrinsic::log2:
+ return TargetOpcode::G_FLOG2;
+ case Intrinsic::log10:
+ return TargetOpcode::G_FLOG10;
+ case Intrinsic::ldexp:
+ return TargetOpcode::G_FLDEXP;
+ case Intrinsic::nearbyint:
+ return TargetOpcode::G_FNEARBYINT;
+ case Intrinsic::pow:
+ return TargetOpcode::G_FPOW;
+ case Intrinsic::powi:
+ return TargetOpcode::G_FPOWI;
+ case Intrinsic::rint:
+ return TargetOpcode::G_FRINT;
+ case Intrinsic::round:
+ return TargetOpcode::G_INTRINSIC_ROUND;
+ case Intrinsic::roundeven:
+ return TargetOpcode::G_INTRINSIC_ROUNDEVEN;
+ case Intrinsic::sin:
+ return TargetOpcode::G_FSIN;
+ case Intrinsic::sqrt:
+ return TargetOpcode::G_FSQRT;
+ case Intrinsic::trunc:
+ return TargetOpcode::G_INTRINSIC_TRUNC;
+ case Intrinsic::readcyclecounter:
+ return TargetOpcode::G_READCYCLECOUNTER;
+ case Intrinsic::ptrmask:
+ return TargetOpcode::G_PTRMASK;
+ case Intrinsic::lrint:
+ return TargetOpcode::G_INTRINSIC_LRINT;
+ // FADD/FMUL require checking the FMF, so are handled elsewhere.
+ case Intrinsic::vector_reduce_fmin:
+ return TargetOpcode::G_VECREDUCE_FMIN;
+ case Intrinsic::vector_reduce_fmax:
+ return TargetOpcode::G_VECREDUCE_FMAX;
+ case Intrinsic::vector_reduce_add:
+ return TargetOpcode::G_VECREDUCE_ADD;
+ case Intrinsic::vector_reduce_mul:
+ return TargetOpcode::G_VECREDUCE_MUL;
+ case Intrinsic::vector_reduce_and:
+ return TargetOpcode::G_VECREDUCE_AND;
+ case Intrinsic::vector_reduce_or:
+ return TargetOpcode::G_VECREDUCE_OR;
+ case Intrinsic::vector_reduce_xor:
+ return TargetOpcode::G_VECREDUCE_XOR;
+ case Intrinsic::vector_reduce_smax:
+ return TargetOpcode::G_VECREDUCE_SMAX;
+ case Intrinsic::vector_reduce_smin:
+ return TargetOpcode::G_VECREDUCE_SMIN;
+ case Intrinsic::vector_reduce_umax:
+ return TargetOpcode::G_VECREDUCE_UMAX;
+ case Intrinsic::vector_reduce_umin:
+ return TargetOpcode::G_VECREDUCE_UMIN;
+ case Intrinsic::lround:
+ return TargetOpcode::G_LROUND;
+ case Intrinsic::llround:
+ return TargetOpcode::G_LLROUND;
+ }
+ return Intrinsic::not_intrinsic;
+}
+
+bool IRTranslator::translateSimpleIntrinsic(const CallInst &CI,
+ Intrinsic::ID ID,
+ MachineIRBuilder &MIRBuilder) {
+
+ unsigned Op = getSimpleIntrinsicOpcode(ID);
+
+ // Is this a simple intrinsic?
+ if (Op == Intrinsic::not_intrinsic)
+ return false;
+
+ // Yes. Let's translate it.
+ SmallVector<llvm::SrcOp, 4> VRegs;
+ for (const auto &Arg : CI.args())
+ VRegs.push_back(getOrCreateVReg(*Arg));
+
+ MIRBuilder.buildInstr(Op, {getOrCreateVReg(CI)}, VRegs,
+ MachineInstr::copyFlagsFromInstruction(CI));
+ return true;
+}
+
+// TODO: Include ConstrainedOps.def when all strict instructions are defined.
+static unsigned getConstrainedOpcode(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_constrained_fadd:
+ return TargetOpcode::G_STRICT_FADD;
+ case Intrinsic::experimental_constrained_fsub:
+ return TargetOpcode::G_STRICT_FSUB;
+ case Intrinsic::experimental_constrained_fmul:
+ return TargetOpcode::G_STRICT_FMUL;
+ case Intrinsic::experimental_constrained_fdiv:
+ return TargetOpcode::G_STRICT_FDIV;
+ case Intrinsic::experimental_constrained_frem:
+ return TargetOpcode::G_STRICT_FREM;
+ case Intrinsic::experimental_constrained_fma:
+ return TargetOpcode::G_STRICT_FMA;
+ case Intrinsic::experimental_constrained_sqrt:
+ return TargetOpcode::G_STRICT_FSQRT;
+ case Intrinsic::experimental_constrained_ldexp:
+ return TargetOpcode::G_STRICT_FLDEXP;
+ default:
+ return 0;
+ }
+}
+
+bool IRTranslator::translateConstrainedFPIntrinsic(
+ const ConstrainedFPIntrinsic &FPI, MachineIRBuilder &MIRBuilder) {
+ fp::ExceptionBehavior EB = *FPI.getExceptionBehavior();
+
+ unsigned Opcode = getConstrainedOpcode(FPI.getIntrinsicID());
+ if (!Opcode)
+ return false;
+
+ uint32_t Flags = MachineInstr::copyFlagsFromInstruction(FPI);
+ if (EB == fp::ExceptionBehavior::ebIgnore)
+ Flags |= MachineInstr::NoFPExcept;
+
+ SmallVector<llvm::SrcOp, 4> VRegs;
+ VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(0)));
+ if (!FPI.isUnaryOp())
+ VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(1)));
+ if (FPI.isTernaryOp())
+ VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(2)));
+
+ MIRBuilder.buildInstr(Opcode, {getOrCreateVReg(FPI)}, VRegs, Flags);
+ return true;
+}
+
+std::optional<MCRegister> IRTranslator::getArgPhysReg(Argument &Arg) {
+ auto VRegs = getOrCreateVRegs(Arg);
+ if (VRegs.size() != 1)
+ return std::nullopt;
+
+ // Arguments are lowered as a copy of a livein physical register.
+ auto *VRegDef = MF->getRegInfo().getVRegDef(VRegs[0]);
+ if (!VRegDef || !VRegDef->isCopy())
+ return std::nullopt;
+ return VRegDef->getOperand(1).getReg().asMCReg();
+}
+
+bool IRTranslator::translateIfEntryValueArgument(const DbgValueInst &DebugInst,
+ MachineIRBuilder &MIRBuilder) {
+ auto *Arg = dyn_cast<Argument>(DebugInst.getValue());
+ if (!Arg)
+ return false;
+
+ const DIExpression *Expr = DebugInst.getExpression();
+ if (!Expr->isEntryValue())
+ return false;
+
+ std::optional<MCRegister> PhysReg = getArgPhysReg(*Arg);
+ if (!PhysReg) {
+ LLVM_DEBUG(dbgs() << "Dropping dbg.value: expression is entry_value but "
+ "couldn't find a physical register\n"
+ << DebugInst << "\n");
+ return true;
+ }
+
+ MIRBuilder.buildDirectDbgValue(*PhysReg, DebugInst.getVariable(),
+ DebugInst.getExpression());
+ return true;
+}
+
+bool IRTranslator::translateIfEntryValueArgument(
+ const DbgDeclareInst &DebugInst) {
+ auto *Arg = dyn_cast<Argument>(DebugInst.getAddress());
+ if (!Arg)
+ return false;
+
+ const DIExpression *Expr = DebugInst.getExpression();
+ if (!Expr->isEntryValue())
+ return false;
+
+ std::optional<MCRegister> PhysReg = getArgPhysReg(*Arg);
+ if (!PhysReg)
+ return false;
+
+ MF->setVariableDbgInfo(DebugInst.getVariable(), Expr, *PhysReg,
+ DebugInst.getDebugLoc());
+ return true;
+}
+
+bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
+ MachineIRBuilder &MIRBuilder) {
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(&CI)) {
+ if (ORE->enabled()) {
+ if (MemoryOpRemark::canHandle(MI, *LibInfo)) {
+ MemoryOpRemark R(*ORE, "gisel-irtranslator-memsize", *DL, *LibInfo);
+ R.visit(MI);
+ }
+ }
+ }
+
+  // If this is a simple intrinsic (that is, we just need to add a def of a
+  // vreg and uses for each arg operand), then translate it.
+ if (translateSimpleIntrinsic(CI, ID, MIRBuilder))
+ return true;
+
+ switch (ID) {
+ default:
+ break;
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end: {
+ // No stack colouring in O0, discard region information.
+ if (MF->getTarget().getOptLevel() == CodeGenOpt::None)
+ return true;
+
+ unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START
+ : TargetOpcode::LIFETIME_END;
+
+ // Get the underlying objects for the location passed on the lifetime
+ // marker.
+ SmallVector<const Value *, 4> Allocas;
+ getUnderlyingObjects(CI.getArgOperand(1), Allocas);
+
+ // Iterate over each underlying object, creating lifetime markers for each
+ // static alloca. Quit if we find a non-static alloca.
+ for (const Value *V : Allocas) {
+ const AllocaInst *AI = dyn_cast<AllocaInst>(V);
+ if (!AI)
+ continue;
+
+ if (!AI->isStaticAlloca())
+ return true;
+
+ MIRBuilder.buildInstr(Op).addFrameIndex(getOrCreateFrameIndex(*AI));
+ }
+ return true;
+ }
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst &DI = cast<DbgDeclareInst>(CI);
+ assert(DI.getVariable() && "Missing variable");
+
+ const Value *Address = DI.getAddress();
+ if (!Address || isa<UndefValue>(Address)) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ return true;
+ }
+
+ assert(DI.getVariable()->isValidLocationForIntrinsic(
+ MIRBuilder.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ auto AI = dyn_cast<AllocaInst>(Address);
+ if (AI && AI->isStaticAlloca()) {
+ // Static allocas are tracked at the MF level, no need for DBG_VALUE
+ // instructions (in fact, they get ignored if they *do* exist).
+ MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(),
+ getOrCreateFrameIndex(*AI), DI.getDebugLoc());
+ return true;
+ }
+
+ if (translateIfEntryValueArgument(DI))
+ return true;
+
+ // A dbg.declare describes the address of a source variable, so lower it
+ // into an indirect DBG_VALUE.
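+ // For illustration: a dbg.declare of a non-static address such as
+ // 'call void @llvm.dbg.declare(metadata ptr %x.addr, metadata !var,
+ // metadata !DIExpression())' ends up as an indirect DBG_VALUE of the vreg
+ // holding %x.addr.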
+ MIRBuilder.buildIndirectDbgValue(getOrCreateVReg(*Address),
+ DI.getVariable(), DI.getExpression());
+ return true;
+ }
+ case Intrinsic::dbg_label: {
+ const DbgLabelInst &DI = cast<DbgLabelInst>(CI);
+ assert(DI.getLabel() && "Missing label");
+
+ assert(DI.getLabel()->isValidLocationForIntrinsic(
+ MIRBuilder.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+
+ MIRBuilder.buildDbgLabel(DI.getLabel());
+ return true;
+ }
+ case Intrinsic::vaend:
+ // No target I know of cares about va_end. Certainly no in-tree target
+ // does. Simplest intrinsic ever!
+ return true;
+ case Intrinsic::vastart: {
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ Value *Ptr = CI.getArgOperand(0);
+ unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8;
+
+ // FIXME: Get alignment
+ MIRBuilder.buildInstr(TargetOpcode::G_VASTART, {}, {getOrCreateVReg(*Ptr)})
+ .addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Ptr),
+ MachineMemOperand::MOStore,
+ ListSize, Align(1)));
+ return true;
+ }
+ case Intrinsic::dbg_value: {
+ // This form of DBG_VALUE is target-independent.
+ const DbgValueInst &DI = cast<DbgValueInst>(CI);
+ const Value *V = DI.getValue();
+ assert(DI.getVariable()->isValidLocationForIntrinsic(
+ MIRBuilder.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ if (!V || DI.hasArgList()) {
+ // DI cannot produce a valid DBG_VALUE, so produce an undef DBG_VALUE to
+ // terminate any prior location.
+ MIRBuilder.buildIndirectDbgValue(0, DI.getVariable(), DI.getExpression());
+ return true;
+ }
+ if (const auto *CI = dyn_cast<Constant>(V)) {
+ MIRBuilder.buildConstDbgValue(*CI, DI.getVariable(), DI.getExpression());
+ return true;
+ }
+ if (auto *AI = dyn_cast<AllocaInst>(V);
+ AI && AI->isStaticAlloca() && DI.getExpression()->startsWithDeref()) {
+ // If the value is an alloca and the expression starts with a
+ // dereference, track a stack slot instead of a register, as registers
+ // may be clobbered.
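+ // Roughly: dbg.value(%a, !var, !DIExpression(DW_OP_deref, ...)) where %a is
+ // a static alloca becomes a frame-index DBG_VALUE with the leading
+ // DW_OP_deref dropped from the expression.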
+ auto ExprOperands = DI.getExpression()->getElements();
+ auto *ExprDerefRemoved =
+ DIExpression::get(AI->getContext(), ExprOperands.drop_front());
+ MIRBuilder.buildFIDbgValue(getOrCreateFrameIndex(*AI), DI.getVariable(),
+ ExprDerefRemoved);
+ return true;
+ }
+ if (translateIfEntryValueArgument(DI, MIRBuilder))
+ return true;
+ for (Register Reg : getOrCreateVRegs(*V)) {
+ // FIXME: This does not handle register-indirect values at offset 0. The
+ // direct/indirect thing shouldn't really be handled by something as
+ // implicit as reg+noreg vs reg+imm in the first place, but it seems
+ // pretty baked in right now.
+ MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression());
+ }
+ return true;
+ }
+ case Intrinsic::uadd_with_overflow:
+ return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDO, MIRBuilder);
+ case Intrinsic::sadd_with_overflow:
+ return translateOverflowIntrinsic(CI, TargetOpcode::G_SADDO, MIRBuilder);
+ case Intrinsic::usub_with_overflow:
+ return translateOverflowIntrinsic(CI, TargetOpcode::G_USUBO, MIRBuilder);
+ case Intrinsic::ssub_with_overflow:
+ return translateOverflowIntrinsic(CI, TargetOpcode::G_SSUBO, MIRBuilder);
+ case Intrinsic::umul_with_overflow:
+ return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);
+ case Intrinsic::smul_with_overflow:
+ return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder);
+ case Intrinsic::uadd_sat:
+ return translateBinaryOp(TargetOpcode::G_UADDSAT, CI, MIRBuilder);
+ case Intrinsic::sadd_sat:
+ return translateBinaryOp(TargetOpcode::G_SADDSAT, CI, MIRBuilder);
+ case Intrinsic::usub_sat:
+ return translateBinaryOp(TargetOpcode::G_USUBSAT, CI, MIRBuilder);
+ case Intrinsic::ssub_sat:
+ return translateBinaryOp(TargetOpcode::G_SSUBSAT, CI, MIRBuilder);
+ case Intrinsic::ushl_sat:
+ return translateBinaryOp(TargetOpcode::G_USHLSAT, CI, MIRBuilder);
+ case Intrinsic::sshl_sat:
+ return translateBinaryOp(TargetOpcode::G_SSHLSAT, CI, MIRBuilder);
+ case Intrinsic::umin:
+ return translateBinaryOp(TargetOpcode::G_UMIN, CI, MIRBuilder);
+ case Intrinsic::umax:
+ return translateBinaryOp(TargetOpcode::G_UMAX, CI, MIRBuilder);
+ case Intrinsic::smin:
+ return translateBinaryOp(TargetOpcode::G_SMIN, CI, MIRBuilder);
+ case Intrinsic::smax:
+ return translateBinaryOp(TargetOpcode::G_SMAX, CI, MIRBuilder);
+ case Intrinsic::abs:
+ // TODO: Preserve "int min is poison" arg in GMIR?
+ return translateUnaryOp(TargetOpcode::G_ABS, CI, MIRBuilder);
+ case Intrinsic::smul_fix:
+ return translateFixedPointIntrinsic(TargetOpcode::G_SMULFIX, CI, MIRBuilder);
+ case Intrinsic::umul_fix:
+ return translateFixedPointIntrinsic(TargetOpcode::G_UMULFIX, CI, MIRBuilder);
+ case Intrinsic::smul_fix_sat:
+ return translateFixedPointIntrinsic(TargetOpcode::G_SMULFIXSAT, CI, MIRBuilder);
+ case Intrinsic::umul_fix_sat:
+ return translateFixedPointIntrinsic(TargetOpcode::G_UMULFIXSAT, CI, MIRBuilder);
+ case Intrinsic::sdiv_fix:
+ return translateFixedPointIntrinsic(TargetOpcode::G_SDIVFIX, CI, MIRBuilder);
+ case Intrinsic::udiv_fix:
+ return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIX, CI, MIRBuilder);
+ case Intrinsic::sdiv_fix_sat:
+ return translateFixedPointIntrinsic(TargetOpcode::G_SDIVFIXSAT, CI, MIRBuilder);
+ case Intrinsic::udiv_fix_sat:
+ return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIXSAT, CI, MIRBuilder);
+ case Intrinsic::fmuladd: {
+ const TargetMachine &TM = MF->getTarget();
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ Register Dst = getOrCreateVReg(CI);
+ Register Op0 = getOrCreateVReg(*CI.getArgOperand(0));
+ Register Op1 = getOrCreateVReg(*CI.getArgOperand(1));
+ Register Op2 = getOrCreateVReg(*CI.getArgOperand(2));
+ if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
+ TLI.isFMAFasterThanFMulAndFAdd(*MF,
+ TLI.getValueType(*DL, CI.getType()))) {
+ // TODO: Revisit this to see if we should move this part of the
+ // lowering to the combiner.
+ MIRBuilder.buildFMA(Dst, Op0, Op1, Op2,
+ MachineInstr::copyFlagsFromInstruction(CI));
+ } else {
+ LLT Ty = getLLTForType(*CI.getType(), *DL);
+ auto FMul = MIRBuilder.buildFMul(
+ Ty, Op0, Op1, MachineInstr::copyFlagsFromInstruction(CI));
+ MIRBuilder.buildFAdd(Dst, FMul, Op2,
+ MachineInstr::copyFlagsFromInstruction(CI));
+ }
+ return true;
+ }
+ case Intrinsic::convert_from_fp16:
+ // FIXME: This intrinsic should probably be removed from the IR.
+ MIRBuilder.buildFPExt(getOrCreateVReg(CI),
+ getOrCreateVReg(*CI.getArgOperand(0)),
+ MachineInstr::copyFlagsFromInstruction(CI));
+ return true;
+ case Intrinsic::convert_to_fp16:
+ // FIXME: This intrinsic should probably be removed from the IR.
+ MIRBuilder.buildFPTrunc(getOrCreateVReg(CI),
+ getOrCreateVReg(*CI.getArgOperand(0)),
+ MachineInstr::copyFlagsFromInstruction(CI));
+ return true;
+ case Intrinsic::frexp: {
+ ArrayRef<Register> VRegs = getOrCreateVRegs(CI);
+ MIRBuilder.buildFFrexp(VRegs[0], VRegs[1],
+ getOrCreateVReg(*CI.getArgOperand(0)),
+ MachineInstr::copyFlagsFromInstruction(CI));
+ return true;
+ }
+ case Intrinsic::memcpy_inline:
+ return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMCPY_INLINE);
+ case Intrinsic::memcpy:
+ return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMCPY);
+ case Intrinsic::memmove:
+ return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMMOVE);
+ case Intrinsic::memset:
+ return translateMemFunc(CI, MIRBuilder, TargetOpcode::G_MEMSET);
+ case Intrinsic::eh_typeid_for: {
+ GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0));
+ Register Reg = getOrCreateVReg(CI);
+ unsigned TypeID = MF->getTypeIDFor(GV);
+ MIRBuilder.buildConstant(Reg, TypeID);
+ return true;
+ }
+ case Intrinsic::objectsize:
+ llvm_unreachable("llvm.objectsize.* should have been lowered already");
+
+ case Intrinsic::is_constant:
+ llvm_unreachable("llvm.is.constant.* should have been lowered already");
+
+ case Intrinsic::stackguard:
+ getStackGuard(getOrCreateVReg(CI), MIRBuilder);
+ return true;
+ case Intrinsic::stackprotector: {
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
+ Register GuardVal;
+ if (TLI.useLoadStackGuardNode()) {
+ GuardVal = MRI->createGenericVirtualRegister(PtrTy);
+ getStackGuard(GuardVal, MIRBuilder);
+ } else
+ GuardVal = getOrCreateVReg(*CI.getArgOperand(0)); // The guard's value.
+
+ AllocaInst *Slot = cast<AllocaInst>(CI.getArgOperand(1));
+ int FI = getOrCreateFrameIndex(*Slot);
+ MF->getFrameInfo().setStackProtectorIndex(FI);
+
+ MIRBuilder.buildStore(
+ GuardVal, getOrCreateVReg(*Slot),
+ *MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile,
+ PtrTy, Align(8)));
+ return true;
+ }
+ case Intrinsic::stacksave: {
+ // Return the current stack pointer as the result of the intrinsic.
+ Register Reg = getOrCreateVReg(CI);
+ Register StackPtr = MF->getSubtarget()
+ .getTargetLowering()
+ ->getStackPointerRegisterToSaveRestore();
+
+ // If the target doesn't specify a stack pointer, then fall back.
+ if (!StackPtr)
+ return false;
+
+ MIRBuilder.buildCopy(Reg, StackPtr);
+ return true;
+ }
+ case Intrinsic::stackrestore: {
+ // Restore the stack pointer from the value provided to the intrinsic.
+ Register Reg = getOrCreateVReg(*CI.getArgOperand(0));
+ Register StackPtr = MF->getSubtarget()
+ .getTargetLowering()
+ ->getStackPointerRegisterToSaveRestore();
+
+ // If the target doesn't specify a stack pointer, then fall back.
+ if (!StackPtr)
+ return false;
+
+ MIRBuilder.buildCopy(StackPtr, Reg);
+ return true;
+ }
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz: {
+ ConstantInt *Cst = cast<ConstantInt>(CI.getArgOperand(1));
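+ // The second operand is the i1 'is zero poison' flag: a false flag selects
+ // the plain G_CTTZ/G_CTLZ form, a true flag selects the *_ZERO_UNDEF
+ // variant.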
+ bool isTrailing = ID == Intrinsic::cttz;
+ unsigned Opcode = isTrailing
+ ? Cst->isZero() ? TargetOpcode::G_CTTZ
+ : TargetOpcode::G_CTTZ_ZERO_UNDEF
+ : Cst->isZero() ? TargetOpcode::G_CTLZ
+ : TargetOpcode::G_CTLZ_ZERO_UNDEF;
+ MIRBuilder.buildInstr(Opcode, {getOrCreateVReg(CI)},
+ {getOrCreateVReg(*CI.getArgOperand(0))});
+ return true;
+ }
+ case Intrinsic::invariant_start: {
+ LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
+ Register Undef = MRI->createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildUndef(Undef);
+ return true;
+ }
+ case Intrinsic::invariant_end:
+ return true;
+ case Intrinsic::expect:
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group: {
+ // Drop the intrinsic, but forward the value.
+ MIRBuilder.buildCopy(getOrCreateVReg(CI),
+ getOrCreateVReg(*CI.getArgOperand(0)));
+ return true;
+ }
+ case Intrinsic::assume:
+ case Intrinsic::experimental_noalias_scope_decl:
+ case Intrinsic::var_annotation:
+ case Intrinsic::sideeffect:
+ // Discard annotate attributes, assumptions, and artificial side-effects.
+ return true;
+ case Intrinsic::read_volatile_register:
+ case Intrinsic::read_register: {
+ Value *Arg = CI.getArgOperand(0);
+ MIRBuilder
+ .buildInstr(TargetOpcode::G_READ_REGISTER, {getOrCreateVReg(CI)}, {})
+ .addMetadata(cast<MDNode>(cast<MetadataAsValue>(Arg)->getMetadata()));
+ return true;
+ }
+ case Intrinsic::write_register: {
+ Value *Arg = CI.getArgOperand(0);
+ MIRBuilder.buildInstr(TargetOpcode::G_WRITE_REGISTER)
+ .addMetadata(cast<MDNode>(cast<MetadataAsValue>(Arg)->getMetadata()))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(1)));
+ return true;
+ }
+ case Intrinsic::localescape: {
+ MachineBasicBlock &EntryMBB = MF->front();
+ StringRef EscapedName = GlobalValue::dropLLVMManglingEscape(MF->getName());
+
+ // Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
+ // is the same on all targets.
+ for (unsigned Idx = 0, E = CI.arg_size(); Idx < E; ++Idx) {
+ Value *Arg = CI.getArgOperand(Idx)->stripPointerCasts();
+ if (isa<ConstantPointerNull>(Arg))
+ continue; // Skip null pointers. They represent a hole in index space.
+
+ int FI = getOrCreateFrameIndex(*cast<AllocaInst>(Arg));
+ MCSymbol *FrameAllocSym =
+ MF->getMMI().getContext().getOrCreateFrameAllocSymbol(EscapedName,
+ Idx);
+
+ // This should be inserted at the start of the entry block.
+ auto LocalEscape =
+ MIRBuilder.buildInstrNoInsert(TargetOpcode::LOCAL_ESCAPE)
+ .addSym(FrameAllocSym)
+ .addFrameIndex(FI);
+
+ EntryMBB.insert(EntryMBB.begin(), LocalEscape);
+ }
+
+ return true;
+ }
+ case Intrinsic::vector_reduce_fadd:
+ case Intrinsic::vector_reduce_fmul: {
+ // Need to check for the reassoc flag to decide whether we want a
+ // sequential reduction opcode or not.
+ Register Dst = getOrCreateVReg(CI);
+ Register ScalarSrc = getOrCreateVReg(*CI.getArgOperand(0));
+ Register VecSrc = getOrCreateVReg(*CI.getArgOperand(1));
+ unsigned Opc = 0;
+ if (!CI.hasAllowReassoc()) {
+ // The sequential ordering case.
+ Opc = ID == Intrinsic::vector_reduce_fadd
+ ? TargetOpcode::G_VECREDUCE_SEQ_FADD
+ : TargetOpcode::G_VECREDUCE_SEQ_FMUL;
+ MIRBuilder.buildInstr(Opc, {Dst}, {ScalarSrc, VecSrc},
+ MachineInstr::copyFlagsFromInstruction(CI));
+ return true;
+ }
+ // We split the operation into a separate G_FADD/G_FMUL + the reduce,
+ // since the associativity doesn't matter.
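+ // e.g. with reassoc, a fadd reduction is emitted roughly as:
+ // %r = G_VECREDUCE_FADD %vec
+ // %dst = G_FADD %scalar, %r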
+ unsigned ScalarOpc;
+ if (ID == Intrinsic::vector_reduce_fadd) {
+ Opc = TargetOpcode::G_VECREDUCE_FADD;
+ ScalarOpc = TargetOpcode::G_FADD;
+ } else {
+ Opc = TargetOpcode::G_VECREDUCE_FMUL;
+ ScalarOpc = TargetOpcode::G_FMUL;
+ }
+ LLT DstTy = MRI->getType(Dst);
+ auto Rdx = MIRBuilder.buildInstr(
+ Opc, {DstTy}, {VecSrc}, MachineInstr::copyFlagsFromInstruction(CI));
+ MIRBuilder.buildInstr(ScalarOpc, {Dst}, {ScalarSrc, Rdx},
+ MachineInstr::copyFlagsFromInstruction(CI));
+
+ return true;
+ }
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ case Intrinsic::ubsantrap: {
+ StringRef TrapFuncName =
+ CI.getAttributes().getFnAttr("trap-func-name").getValueAsString();
+ if (TrapFuncName.empty())
+ break; // Use the default handling.
+ CallLowering::CallLoweringInfo Info;
+ if (ID == Intrinsic::ubsantrap) {
+ Info.OrigArgs.push_back({getOrCreateVRegs(*CI.getArgOperand(0)),
+ CI.getArgOperand(0)->getType(), 0});
+ }
+ Info.Callee = MachineOperand::CreateES(TrapFuncName.data());
+ Info.CB = &CI;
+ Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
+ return CLI->lowerCall(MIRBuilder, Info);
+ }
+ case Intrinsic::fptrunc_round: {
+ uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);
+
+ // Convert the metadata argument to a constant integer
+ Metadata *MD = cast<MetadataAsValue>(CI.getArgOperand(1))->getMetadata();
+ std::optional<RoundingMode> RoundMode =
+ convertStrToRoundingMode(cast<MDString>(MD)->getString());
+
+ // Add the Rounding mode as an integer
+ MIRBuilder
+ .buildInstr(TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND,
+ {getOrCreateVReg(CI)},
+ {getOrCreateVReg(*CI.getArgOperand(0))}, Flags)
+ .addImm((int)*RoundMode);
+
+ return true;
+ }
+ case Intrinsic::is_fpclass: {
+ Value *FpValue = CI.getOperand(0);
+ ConstantInt *TestMaskValue = cast<ConstantInt>(CI.getOperand(1));
+
+ MIRBuilder
+ .buildInstr(TargetOpcode::G_IS_FPCLASS, {getOrCreateVReg(CI)},
+ {getOrCreateVReg(*FpValue)})
+ .addImm(TestMaskValue->getZExtValue());
+
+ return true;
+ }
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
+ case Intrinsic::INTRINSIC:
+#include "llvm/IR/ConstrainedOps.def"
+ return translateConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(CI),
+ MIRBuilder);
+
+ }
+ return false;
+}
+
+bool IRTranslator::translateInlineAsm(const CallBase &CB,
+ MachineIRBuilder &MIRBuilder) {
+
+ const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering();
+
+ if (!ALI) {
+ LLVM_DEBUG(
+ dbgs() << "Inline asm lowering is not supported for this target yet\n");
+ return false;
+ }
+
+ return ALI->lowerInlineAsm(
+ MIRBuilder, CB, [&](const Value &Val) { return getOrCreateVRegs(Val); });
+}
+
+bool IRTranslator::translateCallBase(const CallBase &CB,
+ MachineIRBuilder &MIRBuilder) {
+ ArrayRef<Register> Res = getOrCreateVRegs(CB);
+
+ SmallVector<ArrayRef<Register>, 8> Args;
+ Register SwiftInVReg = 0;
+ Register SwiftErrorVReg = 0;
+ for (const auto &Arg : CB.args()) {
+ if (CLI->supportSwiftError() && isSwiftError(Arg)) {
+ assert(SwiftInVReg == 0 && "Expected only one swift error argument");
+ LLT Ty = getLLTForType(*Arg->getType(), *DL);
+ SwiftInVReg = MRI->createGenericVirtualRegister(Ty);
+ MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt(
+ &CB, &MIRBuilder.getMBB(), Arg));
+ Args.emplace_back(ArrayRef(SwiftInVReg));
+ SwiftErrorVReg =
+ SwiftError.getOrCreateVRegDefAt(&CB, &MIRBuilder.getMBB(), Arg);
+ continue;
+ }
+ Args.push_back(getOrCreateVRegs(*Arg));
+ }
+
+ if (auto *CI = dyn_cast<CallInst>(&CB)) {
+ if (ORE->enabled()) {
+ if (MemoryOpRemark::canHandle(CI, *LibInfo)) {
+ MemoryOpRemark R(*ORE, "gisel-irtranslator-memsize", *DL, *LibInfo);
+ R.visit(CI);
+ }
+ }
+ }
+
+ // We don't set HasCalls on MFI here yet because call lowering may decide to
+ // optimize into tail calls. Instead, we defer that to selection where a final
+ // scan is done to check if any instructions are calls.
+ bool Success =
+ CLI->lowerCall(MIRBuilder, CB, Res, Args, SwiftErrorVReg,
+ [&]() { return getOrCreateVReg(*CB.getCalledOperand()); });
+
+ // Check if we just inserted a tail call.
+ if (Success) {
+ assert(!HasTailCall && "Can't tail call return twice from block?");
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ HasTailCall = TII->isTailCall(*std::prev(MIRBuilder.getInsertPt()));
+ }
+
+ return Success;
+}
+
+bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
+ const CallInst &CI = cast<CallInst>(U);
+ auto TII = MF->getTarget().getIntrinsicInfo();
+ const Function *F = CI.getCalledFunction();
+
+ // FIXME: support Windows dllimport function calls.
+ if (F && (F->hasDLLImportStorageClass() ||
+ (MF->getTarget().getTargetTriple().isOSWindows() &&
+ F->hasExternalWeakLinkage())))
+ return false;
+
+ // FIXME: support control flow guard targets.
+ if (CI.countOperandBundlesOfType(LLVMContext::OB_cfguardtarget))
+ return false;
+
+ // FIXME: support statepoints and related.
+ if (isa<GCStatepointInst, GCRelocateInst, GCResultInst>(U))
+ return false;
+
+ if (CI.isInlineAsm())
+ return translateInlineAsm(CI, MIRBuilder);
+
+ diagnoseDontCall(CI);
+
+ Intrinsic::ID ID = Intrinsic::not_intrinsic;
+ if (F && F->isIntrinsic()) {
+ ID = F->getIntrinsicID();
+ if (TII && ID == Intrinsic::not_intrinsic)
+ ID = static_cast<Intrinsic::ID>(TII->getIntrinsicID(F));
+ }
+
+ if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic)
+ return translateCallBase(CI, MIRBuilder);
+
+ assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic");
+
+ if (translateKnownIntrinsic(CI, ID, MIRBuilder))
+ return true;
+
+ ArrayRef<Register> ResultRegs;
+ if (!CI.getType()->isVoidTy())
+ ResultRegs = getOrCreateVRegs(CI);
+
+ // Ignore the callsite attributes. Backend code is most likely not expecting
+ // an intrinsic to sometimes have side effects and sometimes not.
+ MachineInstrBuilder MIB =
+ MIRBuilder.buildIntrinsic(ID, ResultRegs, !F->doesNotAccessMemory());
+ if (isa<FPMathOperator>(CI))
+ MIB->copyIRFlags(CI);
+
+ for (const auto &Arg : enumerate(CI.args())) {
+ // If this is required to be an immediate, don't materialize it in a
+ // register.
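+ // For instance, operands marked ImmArg (such as the rw/locality arguments
+ // of llvm.prefetch) must be encoded as immediate operands rather than being
+ // put in vregs.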
+ if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) {
+ // imm arguments are more convenient than cimm (and realistically
+ // probably sufficient), so use them.
+ assert(CI->getBitWidth() <= 64 &&
+ "large intrinsic immediates not handled");
+ MIB.addImm(CI->getSExtValue());
+ } else {
+ MIB.addFPImm(cast<ConstantFP>(Arg.value()));
+ }
+ } else if (auto *MDVal = dyn_cast<MetadataAsValue>(Arg.value())) {
+ auto *MD = MDVal->getMetadata();
+ auto *MDN = dyn_cast<MDNode>(MD);
+ if (!MDN) {
+ if (auto *ConstMD = dyn_cast<ConstantAsMetadata>(MD))
+ MDN = MDNode::get(MF->getFunction().getContext(), ConstMD);
+ else // This was probably an MDString.
+ return false;
+ }
+ MIB.addMetadata(MDN);
+ } else {
+ ArrayRef<Register> VRegs = getOrCreateVRegs(*Arg.value());
+ if (VRegs.size() > 1)
+ return false;
+ MIB.addUse(VRegs[0]);
+ }
+ }
+
+ // Add a MachineMemOperand if it is a target mem intrinsic.
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ TargetLowering::IntrinsicInfo Info;
+ // TODO: Add a GlobalISel version of getTgtMemIntrinsic.
+ if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) {
+ Align Alignment = Info.align.value_or(
+ DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
+ LLT MemTy = Info.memVT.isSimple()
+ ? getLLTForMVT(Info.memVT.getSimpleVT())
+ : LLT::scalar(Info.memVT.getStoreSizeInBits());
+
+ // TODO: We currently just fall back to address space 0 if getTgtMemIntrinsic
+ // didn't yield anything useful.
+ MachinePointerInfo MPI;
+ if (Info.ptrVal)
+ MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
+ else if (Info.fallbackAddressSpace)
+ MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
+ MIB.addMemOperand(
+ MF->getMachineMemOperand(MPI, Info.flags, MemTy, Alignment, CI.getAAMetadata()));
+ }
+
+ return true;
+}
+
+bool IRTranslator::findUnwindDestinations(
+ const BasicBlock *EHPadBB,
+ BranchProbability Prob,
+ SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
+ &UnwindDests) {
+ EHPersonality Personality = classifyEHPersonality(
+ EHPadBB->getParent()->getFunction().getPersonalityFn());
+ bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX;
+ bool IsCoreCLR = Personality == EHPersonality::CoreCLR;
+ bool IsWasmCXX = Personality == EHPersonality::Wasm_CXX;
+ bool IsSEH = isAsynchronousEHPersonality(Personality);
+
+ if (IsWasmCXX) {
+ // Ignore this for now.
+ return false;
+ }
+
+ while (EHPadBB) {
+ const Instruction *Pad = EHPadBB->getFirstNonPHI();
+ BasicBlock *NewEHPadBB = nullptr;
+ if (isa<LandingPadInst>(Pad)) {
+ // Stop on landingpads. They are not funclets.
+ UnwindDests.emplace_back(&getMBB(*EHPadBB), Prob);
+ break;
+ }
+ if (isa<CleanupPadInst>(Pad)) {
+ // Stop on cleanup pads. Cleanups are always funclet entries for all known
+ // personalities.
+ UnwindDests.emplace_back(&getMBB(*EHPadBB), Prob);
+ UnwindDests.back().first->setIsEHScopeEntry();
+ UnwindDests.back().first->setIsEHFuncletEntry();
+ break;
+ }
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
+ // Add the catchpad handlers to the possible destinations.
+ for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
+ UnwindDests.emplace_back(&getMBB(*CatchPadBB), Prob);
+ // For MSVC++ and the CLR, catchblocks are funclets and need prologues.
+ if (IsMSVCCXX || IsCoreCLR)
+ UnwindDests.back().first->setIsEHFuncletEntry();
+ if (!IsSEH)
+ UnwindDests.back().first->setIsEHScopeEntry();
+ }
+ NewEHPadBB = CatchSwitch->getUnwindDest();
+ } else {
+ continue;
+ }
+
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ if (BPI && NewEHPadBB)
+ Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB);
+ EHPadBB = NewEHPadBB;
+ }
+ return true;
+}
+
+bool IRTranslator::translateInvoke(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const InvokeInst &I = cast<InvokeInst>(U);
+ MCContext &Context = MF->getContext();
+
+ const BasicBlock *ReturnBB = I.getSuccessor(0);
+ const BasicBlock *EHPadBB = I.getSuccessor(1);
+
+ const Function *Fn = I.getCalledFunction();
+
+ // FIXME: support invoking patchpoint and statepoint intrinsics.
+ if (Fn && Fn->isIntrinsic())
+ return false;
+
+ // FIXME: support whatever these are.
+ if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
+ return false;
+
+ // FIXME: support control flow guard targets.
+ if (I.countOperandBundlesOfType(LLVMContext::OB_cfguardtarget))
+ return false;
+
+ // FIXME: support Windows exception handling.
+ if (!isa<LandingPadInst>(EHPadBB->getFirstNonPHI()))
+ return false;
+
+ bool LowerInlineAsm = I.isInlineAsm();
+ bool NeedEHLabel = true;
+
+ // Emit the actual call, bracketed by EH_LABELs so that the MF knows about
+ // the region covered by the try.
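+ // Roughly: EH_LABEL <begin>, the lowered call, EH_LABEL <end>; the label
+ // pair is registered via MF->addInvoke() below.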
+ MCSymbol *BeginSymbol = nullptr;
+ if (NeedEHLabel) {
+ MIRBuilder.buildInstr(TargetOpcode::G_INVOKE_REGION_START);
+ BeginSymbol = Context.createTempSymbol();
+ MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
+ }
+
+ if (LowerInlineAsm) {
+ if (!translateInlineAsm(I, MIRBuilder))
+ return false;
+ } else if (!translateCallBase(I, MIRBuilder))
+ return false;
+
+ MCSymbol *EndSymbol = nullptr;
+ if (NeedEHLabel) {
+ EndSymbol = Context.createTempSymbol();
+ MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol);
+ }
+
+ SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ MachineBasicBlock *InvokeMBB = &MIRBuilder.getMBB();
+ BranchProbability EHPadBBProb =
+ BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB)
+ : BranchProbability::getZero();
+
+ if (!findUnwindDestinations(EHPadBB, EHPadBBProb, UnwindDests))
+ return false;
+
+ MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB),
+ &ReturnMBB = getMBB(*ReturnBB);
+ // Update successor info.
+ addSuccessorWithProb(InvokeMBB, &ReturnMBB);
+ for (auto &UnwindDest : UnwindDests) {
+ UnwindDest.first->setIsEHPad();
+ addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second);
+ }
+ InvokeMBB->normalizeSuccProbs();
+
+ if (NeedEHLabel) {
+ assert(BeginSymbol && "Expected a begin symbol!");
+ assert(EndSymbol && "Expected an end symbol!");
+ MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);
+ }
+
+ MIRBuilder.buildBr(ReturnMBB);
+ return true;
+}
+
+bool IRTranslator::translateCallBr(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // FIXME: Implement this.
+ return false;
+}
+
+bool IRTranslator::translateLandingPad(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const LandingPadInst &LP = cast<LandingPadInst>(U);
+
+ MachineBasicBlock &MBB = MIRBuilder.getMBB();
+
+ MBB.setIsEHPad();
+
+ // If there aren't registers to copy the values into (e.g., during SjLj
+ // exceptions), then don't bother.
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ const Constant *PersonalityFn = MF->getFunction().getPersonalityFn();
+ if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
+ TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
+ return true;
+
+ // If the landingpad's return type is token type, we don't create values for
+ // its exception pointer and selector. The extraction of exception pointer or
+ // selector value from token type landingpads is not currently supported.
+ if (LP.getType()->isTokenTy())
+ return true;
+
+ // Add a label to mark the beginning of the landing pad. Deletion of the
+ // landing pad can thus be detected via the MachineModuleInfo.
+ MIRBuilder.buildInstr(TargetOpcode::EH_LABEL)
+ .addSym(MF->addLandingPad(&MBB));
+
+ // If the unwinder does not preserve all registers, ensure that the
+ // function marks the clobbered registers as used.
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ if (auto *RegMask = TRI.getCustomEHPadPreservedMask(*MF))
+ MF->getRegInfo().addPhysRegsUsedFromRegMask(RegMask);
+
+ LLT Ty = getLLTForType(*LP.getType(), *DL);
+ Register Undef = MRI->createGenericVirtualRegister(Ty);
+ MIRBuilder.buildUndef(Undef);
+
+ SmallVector<LLT, 2> Tys;
+ for (Type *Ty : cast<StructType>(LP.getType())->elements())
+ Tys.push_back(getLLTForType(*Ty, *DL));
+ assert(Tys.size() == 2 && "Only two-valued landingpads are supported");
+
+ // Mark exception register as live in.
+ Register ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn);
+ if (!ExceptionReg)
+ return false;
+
+ MBB.addLiveIn(ExceptionReg);
+ ArrayRef<Register> ResRegs = getOrCreateVRegs(LP);
+ MIRBuilder.buildCopy(ResRegs[0], ExceptionReg);
+
+ Register SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn);
+ if (!SelectorReg)
+ return false;
+
+ MBB.addLiveIn(SelectorReg);
+ Register PtrVReg = MRI->createGenericVirtualRegister(Tys[0]);
+ MIRBuilder.buildCopy(PtrVReg, SelectorReg);
+ MIRBuilder.buildCast(ResRegs[1], PtrVReg);
+
+ return true;
+}
+
+bool IRTranslator::translateAlloca(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ auto &AI = cast<AllocaInst>(U);
+
+ if (AI.isSwiftError())
+ return true;
+
+ if (AI.isStaticAlloca()) {
+ Register Res = getOrCreateVReg(AI);
+ int FI = getOrCreateFrameIndex(AI);
+ MIRBuilder.buildFrameIndex(Res, FI);
+ return true;
+ }
+
+ // FIXME: support stack probing for Windows.
+ if (MF->getTarget().getTargetTriple().isOSWindows())
+ return false;
+
+ // Now we're in the harder dynamic case.
+ Register NumElts = getOrCreateVReg(*AI.getArraySize());
+ Type *IntPtrIRTy = DL->getIntPtrType(AI.getType());
+ LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL);
+ if (MRI->getType(NumElts) != IntPtrTy) {
+ Register ExtElts = MRI->createGenericVirtualRegister(IntPtrTy);
+ MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts);
+ NumElts = ExtElts;
+ }
+
+ Type *Ty = AI.getAllocatedType();
+
+ Register AllocSize = MRI->createGenericVirtualRegister(IntPtrTy);
+ Register TySize =
+ getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, DL->getTypeAllocSize(Ty)));
+ MIRBuilder.buildMul(AllocSize, NumElts, TySize);
+
+ // Round the size of the allocation up to the stack alignment size
+ // by adding SA-1 to the size. This doesn't overflow because we're computing
+ // an address inside an alloca.
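+ // e.g. with a 16-byte stack alignment:
+ // AlignedSize = (AllocSize + 15) & ~15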
+ Align StackAlign = MF->getSubtarget().getFrameLowering()->getStackAlign();
+ auto SAMinusOne = MIRBuilder.buildConstant(IntPtrTy, StackAlign.value() - 1);
+ auto AllocAdd = MIRBuilder.buildAdd(IntPtrTy, AllocSize, SAMinusOne,
+ MachineInstr::NoUWrap);
+ auto AlignCst =
+ MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign.value() - 1));
+ auto AlignedAlloc = MIRBuilder.buildAnd(IntPtrTy, AllocAdd, AlignCst);
+
+ Align Alignment = std::max(AI.getAlign(), DL->getPrefTypeAlign(Ty));
+ if (Alignment <= StackAlign)
+ Alignment = Align(1);
+ MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Alignment);
+
+ MF->getFrameInfo().CreateVariableSizedObject(Alignment, &AI);
+ assert(MF->getFrameInfo().hasVarSizedObjects());
+ return true;
+}
+
+bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) {
+ // FIXME: We may need more info about the type. Because of how LLT works,
+ // we're completely discarding the i64/double distinction here (amongst
+ // others). Fortunately the ABIs I know of where that matters don't use va_arg
+ // anyway but that's not guaranteed.
+ MIRBuilder.buildInstr(TargetOpcode::G_VAARG, {getOrCreateVReg(U)},
+ {getOrCreateVReg(*U.getOperand(0)),
+ DL->getABITypeAlign(U.getType()).value()});
+ return true;
+}
+
+bool IRTranslator::translateUnreachable(const User &U, MachineIRBuilder &MIRBuilder) {
+ if (!MF->getTarget().Options.TrapUnreachable)
+ return true;
+
+ auto &UI = cast<UnreachableInst>(U);
+ // We may be able to ignore unreachable behind a noreturn call.
+ if (MF->getTarget().Options.NoTrapAfterNoreturn) {
+ const BasicBlock &BB = *UI.getParent();
+ if (&UI != &BB.front()) {
+ BasicBlock::const_iterator PredI =
+ std::prev(BasicBlock::const_iterator(UI));
+ if (const CallInst *Call = dyn_cast<CallInst>(&*PredI)) {
+ if (Call->doesNotReturn())
+ return true;
+ }
+ }
+ }
+
+ MIRBuilder.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
+ return true;
+}
+
+bool IRTranslator::translateInsertElement(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // If it is a <1 x Ty> vector, use the scalar directly, since <1 x Ty>
+ // is not a legal vector type in LLT.
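+ // e.g. 'insertelement <1 x i32> %v, i32 %x, i32 0' is translated as a plain
+ // copy of the scalar %x.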
+ if (cast<FixedVectorType>(U.getType())->getNumElements() == 1)
+ return translateCopy(U, *U.getOperand(1), MIRBuilder);
+
+ Register Res = getOrCreateVReg(U);
+ Register Val = getOrCreateVReg(*U.getOperand(0));
+ Register Elt = getOrCreateVReg(*U.getOperand(1));
+ Register Idx = getOrCreateVReg(*U.getOperand(2));
+ MIRBuilder.buildInsertVectorElement(Res, Val, Elt, Idx);
+ return true;
+}
+
+bool IRTranslator::translateExtractElement(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // If it is a <1 x Ty> vector, use the scalar directly, since <1 x Ty>
+ // is not a legal vector type in LLT.
+ if (cast<FixedVectorType>(U.getOperand(0)->getType())->getNumElements() == 1)
+ return translateCopy(U, *U.getOperand(0), MIRBuilder);
+
+ Register Res = getOrCreateVReg(U);
+ Register Val = getOrCreateVReg(*U.getOperand(0));
+ const auto &TLI = *MF->getSubtarget().getTargetLowering();
+ unsigned PreferredVecIdxWidth = TLI.getVectorIdxTy(*DL).getSizeInBits();
+ Register Idx;
+ if (auto *CI = dyn_cast<ConstantInt>(U.getOperand(1))) {
+ if (CI->getBitWidth() != PreferredVecIdxWidth) {
+ APInt NewIdx = CI->getValue().zextOrTrunc(PreferredVecIdxWidth);
+ auto *NewIdxCI = ConstantInt::get(CI->getContext(), NewIdx);
+ Idx = getOrCreateVReg(*NewIdxCI);
+ }
+ }
+ if (!Idx)
+ Idx = getOrCreateVReg(*U.getOperand(1));
+ if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
+ const LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ Idx = MIRBuilder.buildZExtOrTrunc(VecIdxTy, Idx).getReg(0);
+ }
+ MIRBuilder.buildExtractVectorElement(Res, Val, Idx);
+ return true;
+}
+
+bool IRTranslator::translateShuffleVector(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ ArrayRef<int> Mask;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(&U))
+ Mask = SVI->getShuffleMask();
+ else
+ Mask = cast<ConstantExpr>(U).getShuffleMask();
+ ArrayRef<int> MaskAlloc = MF->allocateShuffleMask(Mask);
+ MIRBuilder
+ .buildInstr(TargetOpcode::G_SHUFFLE_VECTOR, {getOrCreateVReg(U)},
+ {getOrCreateVReg(*U.getOperand(0)),
+ getOrCreateVReg(*U.getOperand(1))})
+ .addShuffleMask(MaskAlloc);
+ return true;
+}
+
+bool IRTranslator::translatePHI(const User &U, MachineIRBuilder &MIRBuilder) {
+ const PHINode &PI = cast<PHINode>(U);
+
+ SmallVector<MachineInstr *, 4> Insts;
+ for (auto Reg : getOrCreateVRegs(PI)) {
+ auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI, {Reg}, {});
+ Insts.push_back(MIB.getInstr());
+ }
+
+ PendingPHIs.emplace_back(&PI, std::move(Insts));
+ return true;
+}
+
+bool IRTranslator::translateAtomicCmpXchg(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const AtomicCmpXchgInst &I = cast<AtomicCmpXchgInst>(U);
+
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ auto Flags = TLI.getAtomicMemOperandFlags(I, *DL);
+
+ auto Res = getOrCreateVRegs(I);
+ Register OldValRes = Res[0];
+ Register SuccessRes = Res[1];
+ Register Addr = getOrCreateVReg(*I.getPointerOperand());
+ Register Cmp = getOrCreateVReg(*I.getCompareOperand());
+ Register NewVal = getOrCreateVReg(*I.getNewValOperand());
+
+ MIRBuilder.buildAtomicCmpXchgWithSuccess(
+ OldValRes, SuccessRes, Addr, Cmp, NewVal,
+ *MF->getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MRI->getType(Cmp),
+ getMemOpAlign(I), I.getAAMetadata(), nullptr, I.getSyncScopeID(),
+ I.getSuccessOrdering(), I.getFailureOrdering()));
+ return true;
+}
+
+bool IRTranslator::translateAtomicRMW(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const AtomicRMWInst &I = cast<AtomicRMWInst>(U);
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ auto Flags = TLI.getAtomicMemOperandFlags(I, *DL);
+
+ Register Res = getOrCreateVReg(I);
+ Register Addr = getOrCreateVReg(*I.getPointerOperand());
+ Register Val = getOrCreateVReg(*I.getValOperand());
+
+ unsigned Opcode = 0;
+ switch (I.getOperation()) {
+ default:
+ return false;
+ case AtomicRMWInst::Xchg:
+ Opcode = TargetOpcode::G_ATOMICRMW_XCHG;
+ break;
+ case AtomicRMWInst::Add:
+ Opcode = TargetOpcode::G_ATOMICRMW_ADD;
+ break;
+ case AtomicRMWInst::Sub:
+ Opcode = TargetOpcode::G_ATOMICRMW_SUB;
+ break;
+ case AtomicRMWInst::And:
+ Opcode = TargetOpcode::G_ATOMICRMW_AND;
+ break;
+ case AtomicRMWInst::Nand:
+ Opcode = TargetOpcode::G_ATOMICRMW_NAND;
+ break;
+ case AtomicRMWInst::Or:
+ Opcode = TargetOpcode::G_ATOMICRMW_OR;
+ break;
+ case AtomicRMWInst::Xor:
+ Opcode = TargetOpcode::G_ATOMICRMW_XOR;
+ break;
+ case AtomicRMWInst::Max:
+ Opcode = TargetOpcode::G_ATOMICRMW_MAX;
+ break;
+ case AtomicRMWInst::Min:
+ Opcode = TargetOpcode::G_ATOMICRMW_MIN;
+ break;
+ case AtomicRMWInst::UMax:
+ Opcode = TargetOpcode::G_ATOMICRMW_UMAX;
+ break;
+ case AtomicRMWInst::UMin:
+ Opcode = TargetOpcode::G_ATOMICRMW_UMIN;
+ break;
+ case AtomicRMWInst::FAdd:
+ Opcode = TargetOpcode::G_ATOMICRMW_FADD;
+ break;
+ case AtomicRMWInst::FSub:
+ Opcode = TargetOpcode::G_ATOMICRMW_FSUB;
+ break;
+ case AtomicRMWInst::FMax:
+ Opcode = TargetOpcode::G_ATOMICRMW_FMAX;
+ break;
+ case AtomicRMWInst::FMin:
+ Opcode = TargetOpcode::G_ATOMICRMW_FMIN;
+ break;
+ case AtomicRMWInst::UIncWrap:
+ Opcode = TargetOpcode::G_ATOMICRMW_UINC_WRAP;
+ break;
+ case AtomicRMWInst::UDecWrap:
+ Opcode = TargetOpcode::G_ATOMICRMW_UDEC_WRAP;
+ break;
+ }
+
+ MIRBuilder.buildAtomicRMW(
+ Opcode, Res, Addr, Val,
+ *MF->getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
+ Flags, MRI->getType(Val), getMemOpAlign(I),
+ I.getAAMetadata(), nullptr, I.getSyncScopeID(),
+ I.getOrdering()));
+ return true;
+}
+
+bool IRTranslator::translateFence(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const FenceInst &Fence = cast<FenceInst>(U);
+ MIRBuilder.buildFence(static_cast<unsigned>(Fence.getOrdering()),
+ Fence.getSyncScopeID());
+ return true;
+}
+
+bool IRTranslator::translateFreeze(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const ArrayRef<Register> DstRegs = getOrCreateVRegs(U);
+ const ArrayRef<Register> SrcRegs = getOrCreateVRegs(*U.getOperand(0));
+
+ assert(DstRegs.size() == SrcRegs.size() &&
+ "Freeze with different source and destination type?");
+
+ for (unsigned I = 0; I < DstRegs.size(); ++I) {
+ MIRBuilder.buildFreeze(DstRegs[I], SrcRegs[I]);
+ }
+
+ return true;
+}
+
+void IRTranslator::finishPendingPhis() {
+#ifndef NDEBUG
+ DILocationVerifier Verifier;
+ GISelObserverWrapper WrapperObserver(&Verifier);
+ RAIIDelegateInstaller DelInstall(*MF, &WrapperObserver);
+#endif // ifndef NDEBUG
+ for (auto &Phi : PendingPHIs) {
+ const PHINode *PI = Phi.first;
+ ArrayRef<MachineInstr *> ComponentPHIs = Phi.second;
+ MachineBasicBlock *PhiMBB = ComponentPHIs[0]->getParent();
+ EntryBuilder->setDebugLoc(PI->getDebugLoc());
+#ifndef NDEBUG
+ Verifier.setCurrentInst(PI);
+#endif // ifndef NDEBUG
+
+ SmallSet<const MachineBasicBlock *, 16> SeenPreds;
+ for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) {
+ auto IRPred = PI->getIncomingBlock(i);
+ ArrayRef<Register> ValRegs = getOrCreateVRegs(*PI->getIncomingValue(i));
+ for (auto *Pred : getMachinePredBBs({IRPred, PI->getParent()})) {
+ if (SeenPreds.count(Pred) || !PhiMBB->isPredecessor(Pred))
+ continue;
+ SeenPreds.insert(Pred);
+ for (unsigned j = 0; j < ValRegs.size(); ++j) {
+ MachineInstrBuilder MIB(*MF, ComponentPHIs[j]);
+ MIB.addUse(ValRegs[j]);
+ MIB.addMBB(Pred);
+ }
+ }
+ }
+ }
+}
+
+bool IRTranslator::translate(const Instruction &Inst) {
+ CurBuilder->setDebugLoc(Inst.getDebugLoc());
+ CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections));
+
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ if (TLI.fallBackToDAGISel(Inst))
+ return false;
+
+ switch (Inst.getOpcode()) {
+#define HANDLE_INST(NUM, OPCODE, CLASS) \
+ case Instruction::OPCODE: \
+ return translate##OPCODE(Inst, *CurBuilder.get());
+#include "llvm/IR/Instruction.def"
+ default:
+ return false;
+ }
+}
+
+bool IRTranslator::translate(const Constant &C, Register Reg) {
+ // We only emit constants into the entry block from here. To prevent jumpy
+ // debug behaviour, drop the debug location.
+ if (auto CurrInstDL = CurBuilder->getDL())
+ EntryBuilder->setDebugLoc(DebugLoc());
+
+ if (auto CI = dyn_cast<ConstantInt>(&C))
+ EntryBuilder->buildConstant(Reg, *CI);
+ else if (auto CF = dyn_cast<ConstantFP>(&C))
+ EntryBuilder->buildFConstant(Reg, *CF);
+ else if (isa<UndefValue>(C))
+ EntryBuilder->buildUndef(Reg);
+ else if (isa<ConstantPointerNull>(C))
+ EntryBuilder->buildConstant(Reg, 0);
+ else if (auto GV = dyn_cast<GlobalValue>(&C))
+ EntryBuilder->buildGlobalValue(Reg, GV);
+ else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) {
+ if (!isa<FixedVectorType>(CAZ->getType()))
+ return false;
+ // Return the scalar if it is a <1 x Ty> vector.
+ unsigned NumElts = CAZ->getElementCount().getFixedValue();
+ if (NumElts == 1)
+ return translateCopy(C, *CAZ->getElementValue(0u), *EntryBuilder);
+ SmallVector<Register, 4> Ops;
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant &Elt = *CAZ->getElementValue(I);
+ Ops.push_back(getOrCreateVReg(Elt));
+ }
+ EntryBuilder->buildBuildVector(Reg, Ops);
+ } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) {
+ // Return the scalar if it is a <1 x Ty> vector.
+ if (CV->getNumElements() == 1)
+ return translateCopy(C, *CV->getElementAsConstant(0), *EntryBuilder);
+ SmallVector<Register, 4> Ops;
+ for (unsigned i = 0; i < CV->getNumElements(); ++i) {
+ Constant &Elt = *CV->getElementAsConstant(i);
+ Ops.push_back(getOrCreateVReg(Elt));
+ }
+ EntryBuilder->buildBuildVector(Reg, Ops);
+ } else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
+ switch(CE->getOpcode()) {
+#define HANDLE_INST(NUM, OPCODE, CLASS) \
+ case Instruction::OPCODE: \
+ return translate##OPCODE(*CE, *EntryBuilder.get());
+#include "llvm/IR/Instruction.def"
+ default:
+ return false;
+ }
+ } else if (auto CV = dyn_cast<ConstantVector>(&C)) {
+ if (CV->getNumOperands() == 1)
+ return translateCopy(C, *CV->getOperand(0), *EntryBuilder);
+ SmallVector<Register, 4> Ops;
+ for (unsigned i = 0; i < CV->getNumOperands(); ++i) {
+ Ops.push_back(getOrCreateVReg(*CV->getOperand(i)));
+ }
+ EntryBuilder->buildBuildVector(Reg, Ops);
+ } else if (auto *BA = dyn_cast<BlockAddress>(&C)) {
+ EntryBuilder->buildBlockAddress(Reg, BA);
+ } else
+ return false;
+
+ return true;
+}
+
+bool IRTranslator::finalizeBasicBlock(const BasicBlock &BB,
+ MachineBasicBlock &MBB) {
+ for (auto &BTB : SL->BitTestCases) {
+ // Emit header first, if it wasn't already emitted.
+ if (!BTB.Emitted)
+ emitBitTestHeader(BTB, BTB.Parent);
+
+ BranchProbability UnhandledProb = BTB.Prob;
+ for (unsigned j = 0, ej = BTB.Cases.size(); j != ej; ++j) {
+ UnhandledProb -= BTB.Cases[j].ExtraProb;
+ // Set the current basic block to the mbb we wish to insert the code into
+ MachineBasicBlock *MBB = BTB.Cases[j].ThisBB;
+ // If all cases cover a contiguous range, it is not necessary to jump to
+ // the default block after the last bit test fails. This is because the
+ // range check during bit test header creation has guaranteed that every
+ // case here doesn't go outside the range. In this case, there is no need
+ // to perform the last bit test, as it will always be true. Instead, make
+ // the second-to-last bit-test fall through to the target of the last bit
+ // test, and delete the last bit test.
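+ // For illustration: if the tests cover every value of the range already
+ // checked by the header (say values 0-3 against a range check of [0, 3]),
+ // the final test cannot fail, so its predecessor can branch straight to the
+ // final test's target.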
+
+ MachineBasicBlock *NextMBB;
+ if ((BTB.ContiguousRange || BTB.FallthroughUnreachable) && j + 2 == ej) {
+ // Second-to-last bit-test with contiguous range: fall through to the
+ // target of the final bit test.
+ NextMBB = BTB.Cases[j + 1].TargetBB;
+ } else if (j + 1 == ej) {
+ // For the last bit test, fall through to Default.
+ NextMBB = BTB.Default;
+ } else {
+ // Otherwise, fall through to the next bit test.
+ NextMBB = BTB.Cases[j + 1].ThisBB;
+ }
+
+ emitBitTestCase(BTB, NextMBB, UnhandledProb, BTB.Reg, BTB.Cases[j], MBB);
+
+ if ((BTB.ContiguousRange || BTB.FallthroughUnreachable) && j + 2 == ej) {
+ // We need to record the replacement phi edge here that normally
+ // happens in emitBitTestCase before we delete the case, otherwise the
+ // phi edge will be lost.
+ addMachineCFGPred({BTB.Parent->getBasicBlock(),
+ BTB.Cases[ej - 1].TargetBB->getBasicBlock()},
+ MBB);
+ // Since we're not going to use the final bit test, remove it.
+ BTB.Cases.pop_back();
+ break;
+ }
+ }
+ // This is "default" BB. We have two jumps to it. From "header" BB and from
+ // last "case" BB, unless the latter was skipped.
+ CFGEdge HeaderToDefaultEdge = {BTB.Parent->getBasicBlock(),
+ BTB.Default->getBasicBlock()};
+ addMachineCFGPred(HeaderToDefaultEdge, BTB.Parent);
+ if (!BTB.ContiguousRange) {
+ addMachineCFGPred(HeaderToDefaultEdge, BTB.Cases.back().ThisBB);
+ }
+ }
+ SL->BitTestCases.clear();
+
+ for (auto &JTCase : SL->JTCases) {
+ // Emit header first, if it wasn't already emitted.
+ if (!JTCase.first.Emitted)
+ emitJumpTableHeader(JTCase.second, JTCase.first, JTCase.first.HeaderBB);
+
+ emitJumpTable(JTCase.second, JTCase.second.MBB);
+ }
+ SL->JTCases.clear();
+
+ for (auto &SwCase : SL->SwitchCases)
+ emitSwitchCase(SwCase, &CurBuilder->getMBB(), *CurBuilder);
+ SL->SwitchCases.clear();
+
+ // Check if we need to generate stack-protector guard checks.
+ StackProtector &SP = getAnalysis<StackProtector>();
+ if (SP.shouldEmitSDCheck(BB)) {
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ bool FunctionBasedInstrumentation =
+ TLI.getSSPStackGuardCheck(*MF->getFunction().getParent());
+ SPDescriptor.initialize(&BB, &MBB, FunctionBasedInstrumentation);
+ }
+ // Handle stack protector.
+ if (SPDescriptor.shouldEmitFunctionBasedCheckStackProtector()) {
+ LLVM_DEBUG(dbgs() << "Unimplemented stack protector case\n");
+ return false;
+ } else if (SPDescriptor.shouldEmitStackProtector()) {
+ MachineBasicBlock *ParentMBB = SPDescriptor.getParentMBB();
+ MachineBasicBlock *SuccessMBB = SPDescriptor.getSuccessMBB();
+
+ // Find the split point to split the parent mbb. At the same time copy all
+ // physical registers used in the tail of parent mbb into virtual registers
+ // before the split point and back into physical registers after the split
+ // point. This saves us from having to deal with live-ins and many other
+ // register allocation issues caused by splitting the parent mbb. The
+ // register allocator will clean up said virtual copies later on.
+ MachineBasicBlock::iterator SplitPoint = findSplitPointForStackProtector(
+ ParentMBB, *MF->getSubtarget().getInstrInfo());
+
+ // Splice the terminator of ParentMBB into SuccessMBB.
+ SuccessMBB->splice(SuccessMBB->end(), ParentMBB, SplitPoint,
+ ParentMBB->end());
+
+ // Add compare/jump on neq/jump to the parent BB.
+ if (!emitSPDescriptorParent(SPDescriptor, ParentMBB))
+ return false;
+
+ // CodeGen Failure MBB if we have not codegened it yet.
+ MachineBasicBlock *FailureMBB = SPDescriptor.getFailureMBB();
+ if (FailureMBB->empty()) {
+ if (!emitSPDescriptorFailure(SPDescriptor, FailureMBB))
+ return false;
+ }
+
+ // Clear the Per-BB State.
+ SPDescriptor.resetPerBBState();
+ }
+ return true;
+}
+
+bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB) {
+ CurBuilder->setInsertPt(*ParentBB, ParentBB->end());
+ // First create the loads to the guard/stack slot for the comparison.
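+ // The emitted check is roughly: load the value stored in the protector
+ // slot, load the reference guard value, compare them with G_ICMP ne, and
+ // branch to the failure MBB if they differ, otherwise to the success MBB.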
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ Type *PtrIRTy = Type::getInt8PtrTy(MF->getFunction().getContext());
+ const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ LLT PtrMemTy = getLLTForMVT(TLI.getPointerMemTy(*DL));
+
+ MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
+ int FI = MFI.getStackProtectorIndex();
+
+ Register Guard;
+ Register StackSlotPtr = CurBuilder->buildFrameIndex(PtrTy, FI).getReg(0);
+ const Module &M = *ParentBB->getParent()->getFunction().getParent();
+ Align Align = DL->getPrefTypeAlign(Type::getInt8PtrTy(M.getContext()));
+
+ // Generate code to load the content of the guard slot.
+ Register GuardVal =
+ CurBuilder
+ ->buildLoad(PtrMemTy, StackSlotPtr,
+ MachinePointerInfo::getFixedStack(*MF, FI), Align,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile)
+ .getReg(0);
+
+ if (TLI.useStackGuardXorFP()) {
+ LLVM_DEBUG(dbgs() << "Stack protector xor'ing with FP not yet implemented");
+ return false;
+ }
+
+ // Retrieve guard check function, nullptr if instrumentation is inlined.
+ if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) {
+ // This path is currently untestable on GlobalISel, since the only platform
+ // that needs this seems to be Windows, and we fall back on that currently.
+ // The code still lives here in case that changes.
+ // Silence warning about unused variable until the code below that uses
+ // 'GuardCheckFn' is enabled.
+ (void)GuardCheckFn;
+ return false;
+#if 0
+ // The target provides a guard check function to validate the guard value.
+ // Generate a call to that function with the content of the guard slot as
+ // argument.
+ FunctionType *FnTy = GuardCheckFn->getFunctionType();
+ assert(FnTy->getNumParams() == 1 && "Invalid function signature");
+ ISD::ArgFlagsTy Flags;
+ if (GuardCheckFn->hasAttribute(1, Attribute::AttrKind::InReg))
+ Flags.setInReg();
+ CallLowering::ArgInfo GuardArgInfo(
+ {GuardVal, FnTy->getParamType(0), {Flags}});
+
+ CallLowering::CallLoweringInfo Info;
+ Info.OrigArgs.push_back(GuardArgInfo);
+ Info.CallConv = GuardCheckFn->getCallingConv();
+ Info.Callee = MachineOperand::CreateGA(GuardCheckFn, 0);
+ Info.OrigRet = {Register(), FnTy->getReturnType()};
+ if (!CLI->lowerCall(MIRBuilder, Info)) {
+ LLVM_DEBUG(dbgs() << "Failed to lower call to stack protector check\n");
+ return false;
+ }
+ return true;
+#endif
+ }
+
+ // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
+ // Otherwise, emit a volatile load to retrieve the stack guard value.
+ if (TLI.useLoadStackGuardNode()) {
+ Guard =
+ MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits()));
+ getStackGuard(Guard, *CurBuilder);
+ } else {
+ // TODO: test using android subtarget when we support @llvm.thread.pointer.
+ const Value *IRGuard = TLI.getSDagStackGuard(M);
+ Register GuardPtr = getOrCreateVReg(*IRGuard);
+
+ Guard = CurBuilder
+ ->buildLoad(PtrMemTy, GuardPtr,
+ MachinePointerInfo::getFixedStack(*MF, FI), Align,
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOVolatile)
+ .getReg(0);
+ }
+
+ // Perform the comparison.
+ auto Cmp =
+ CurBuilder->buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Guard, GuardVal);
+ // If the guard/stackslot do not equal, branch to failure MBB.
+ CurBuilder->buildBrCond(Cmp, *SPD.getFailureMBB());
+ // Otherwise branch to success MBB.
+ CurBuilder->buildBr(*SPD.getSuccessMBB());
+ return true;
+}
+
+bool IRTranslator::emitSPDescriptorFailure(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *FailureBB) {
+ CurBuilder->setInsertPt(*FailureBB, FailureBB->end());
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+
+ const RTLIB::Libcall Libcall = RTLIB::STACKPROTECTOR_CHECK_FAIL;
+ const char *Name = TLI.getLibcallName(Libcall);
+
+ CallLowering::CallLoweringInfo Info;
+ Info.CallConv = TLI.getLibcallCallingConv(Libcall);
+ Info.Callee = MachineOperand::CreateES(Name);
+ Info.OrigRet = {Register(), Type::getVoidTy(MF->getFunction().getContext()),
+ 0};
+ if (!CLI->lowerCall(*CurBuilder, Info)) {
+ LLVM_DEBUG(dbgs() << "Failed to lower call to stack protector fail\n");
+ return false;
+ }
+
+ // On PS4/PS5, the "return address" must still be within the calling
+ // function, even if it's at the very end, so emit an explicit TRAP here.
+ // WebAssembly needs an unreachable instruction after a non-returning call,
+ // because the function return type can be different from __stack_chk_fail's
+ // return type (void).
+ const TargetMachine &TM = MF->getTarget();
+ if (TM.getTargetTriple().isPS() || TM.getTargetTriple().isWasm()) {
+ LLVM_DEBUG(dbgs() << "Unhandled trap emission for stack protector fail\n");
+ return false;
+ }
+ return true;
+}
+
+void IRTranslator::finalizeFunction() {
+ // Release the memory used by the different maps we
+ // needed during the translation.
+ PendingPHIs.clear();
+ VMap.reset();
+ FrameIndices.clear();
+ MachinePreds.clear();
+ // MachineIRBuilder::DebugLoc can outlive the DILocation it holds. Clear it
+ // to avoid accessing freed memory (in runOnMachineFunction) and to avoid
+ // destroying it twice (in ~IRTranslator() and ~LLVMContext()).
+ EntryBuilder.reset();
+ CurBuilder.reset();
+ FuncInfo.clear();
+ SPDescriptor.resetPerFunctionState();
+}
+
+/// Returns true if a BasicBlock \p BB within a variadic function contains a
+/// variadic musttail call.
+static bool checkForMustTailInVarArgFn(bool IsVarArg, const BasicBlock &BB) {
+ if (!IsVarArg)
+ return false;
+
+ // Walk the block backwards, because tail calls usually only appear at the end
+ // of a block.
+ return llvm::any_of(llvm::reverse(BB), [](const Instruction &I) {
+ const auto *CI = dyn_cast<CallInst>(&I);
+ return CI && CI->isMustTailCall();
+ });
+}
+
+bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
+ MF = &CurMF;
+ const Function &F = MF->getFunction();
+ GISelCSEAnalysisWrapper &Wrapper =
+ getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+ // Set the CSEConfig and run the analysis.
+ GISelCSEInfo *CSEInfo = nullptr;
+ TPC = &getAnalysis<TargetPassConfig>();
+ bool EnableCSE = EnableCSEInIRTranslator.getNumOccurrences()
+ ? EnableCSEInIRTranslator
+ : TPC->isGISelCSEEnabled();
+
+ if (EnableCSE) {
+ EntryBuilder = std::make_unique<CSEMIRBuilder>(CurMF);
+ CSEInfo = &Wrapper.get(TPC->getCSEConfig());
+ EntryBuilder->setCSEInfo(CSEInfo);
+ CurBuilder = std::make_unique<CSEMIRBuilder>(CurMF);
+ CurBuilder->setCSEInfo(CSEInfo);
+ } else {
+ EntryBuilder = std::make_unique<MachineIRBuilder>();
+ CurBuilder = std::make_unique<MachineIRBuilder>();
+ }
+ CLI = MF->getSubtarget().getCallLowering();
+ CurBuilder->setMF(*MF);
+ EntryBuilder->setMF(*MF);
+ MRI = &MF->getRegInfo();
+ DL = &F.getParent()->getDataLayout();
+ ORE = std::make_unique<OptimizationRemarkEmitter>(&F);
+ const TargetMachine &TM = MF->getTarget();
+ TM.resetTargetOptions(F);
+ EnableOpts = OptLevel != CodeGenOpt::None && !skipFunction(F);
+ FuncInfo.MF = MF;
+ if (EnableOpts) {
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ FuncInfo.BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ } else {
+ AA = nullptr;
+ FuncInfo.BPI = nullptr;
+ }
+
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ MF->getFunction());
+ LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ FuncInfo.CanLowerReturn = CLI->checkReturnTypeForCallConv(*MF);
+
+ const auto &TLI = *MF->getSubtarget().getTargetLowering();
+
+ SL = std::make_unique<GISelSwitchLowering>(this, FuncInfo);
+ SL->init(TLI, TM, *DL);
+
+
+ assert(PendingPHIs.empty() && "stale PHIs");
+
+ // Targets that want to use big endian can enable it using
+ // enableBigEndian().
+ if (!DL->isLittleEndian() && !CLI->enableBigEndian()) {
+ // Currently we don't properly handle big endian code.
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ F.getSubprogram(), &F.getEntryBlock());
+ R << "unable to translate in big endian mode";
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ }
+
+ // Release the per-function state when we return, whether we succeeded or not.
+ auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); });
+
+ // Setup a separate basic-block for the arguments and constants
+ MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock();
+ MF->push_back(EntryBB);
+ EntryBuilder->setMBB(*EntryBB);
+
+ DebugLoc DbgLoc = F.getEntryBlock().getFirstNonPHI()->getDebugLoc();
+ SwiftError.setFunction(CurMF);
+ SwiftError.createEntriesInEntryBlock(DbgLoc);
+
+ bool IsVarArg = F.isVarArg();
+ bool HasMustTailInVarArgFn = false;
+
+ // Create all blocks, in IR order, to preserve the layout.
+ for (const BasicBlock &BB: F) {
+ auto *&MBB = BBToMBB[&BB];
+
+ MBB = MF->CreateMachineBasicBlock(&BB);
+ MF->push_back(MBB);
+
+ if (BB.hasAddressTaken())
+ MBB->setAddressTakenIRBlock(const_cast<BasicBlock *>(&BB));
+
+ if (!HasMustTailInVarArgFn)
+ HasMustTailInVarArgFn = checkForMustTailInVarArgFn(IsVarArg, BB);
+ }
+
+ MF->getFrameInfo().setHasMustTailInVarArgFunc(HasMustTailInVarArgFn);
+
+ // Make our arguments/constants entry block fallthrough to the IR entry block.
+ EntryBB->addSuccessor(&getMBB(F.front()));
+
+ if (CLI->fallBackToDAGISel(*MF)) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ F.getSubprogram(), &F.getEntryBlock());
+ R << "unable to lower function: " << ore::NV("Prototype", F.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
+ }
+
+ // Lower the actual args into this basic block.
+ SmallVector<ArrayRef<Register>, 8> VRegArgs;
+ for (const Argument &Arg: F.args()) {
+ if (DL->getTypeStoreSize(Arg.getType()).isZero())
+ continue; // Don't handle zero sized types.
+ ArrayRef<Register> VRegs = getOrCreateVRegs(Arg);
+ VRegArgs.push_back(VRegs);
+
+ if (Arg.hasSwiftErrorAttr()) {
+ assert(VRegs.size() == 1 && "Too many vregs for Swift error");
+ SwiftError.setCurrentVReg(EntryBB, SwiftError.getFunctionArg(), VRegs[0]);
+ }
+ }
+
+ if (!CLI->lowerFormalArguments(*EntryBuilder, F, VRegArgs, FuncInfo)) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ F.getSubprogram(), &F.getEntryBlock());
+ R << "unable to lower arguments: " << ore::NV("Prototype", F.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
+ }
+
+ // Need to visit defs before uses when translating instructions.
+ GISelObserverWrapper WrapperObserver;
+ if (EnableCSE && CSEInfo)
+ WrapperObserver.addObserver(CSEInfo);
+ {
+ ReversePostOrderTraversal<const Function *> RPOT(&F);
+#ifndef NDEBUG
+ DILocationVerifier Verifier;
+ WrapperObserver.addObserver(&Verifier);
+#endif // ifndef NDEBUG
+ RAIIDelegateInstaller DelInstall(*MF, &WrapperObserver);
+ RAIIMFObserverInstaller ObsInstall(*MF, WrapperObserver);
+ for (const BasicBlock *BB : RPOT) {
+ MachineBasicBlock &MBB = getMBB(*BB);
+ // Set the insertion point of all the following translations to
+ // the end of this basic block.
+ CurBuilder->setMBB(MBB);
+ HasTailCall = false;
+ for (const Instruction &Inst : *BB) {
+ // If we translated a tail call in the last step, then we know
+ // everything after the call is either a return, or something that is
+ // handled by the call itself. (E.g. a lifetime marker or assume
+ // intrinsic.) In this case, we should stop translating the block and
+ // move on.
+ if (HasTailCall)
+ break;
+#ifndef NDEBUG
+ Verifier.setCurrentInst(&Inst);
+#endif // ifndef NDEBUG
+ if (translate(Inst))
+ continue;
+
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ Inst.getDebugLoc(), BB);
+ R << "unable to translate instruction: " << ore::NV("Opcode", &Inst);
+
+ if (ORE->allowExtraAnalysis("gisel-irtranslator")) {
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << Inst;
+
+ R << ": '" << InstStr.str() << "'";
+ }
+
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
+ }
+
+ if (!finalizeBasicBlock(*BB, MBB)) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ BB->getTerminator()->getDebugLoc(), BB);
+ R << "unable to translate basic block";
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
+ }
+ }
+#ifndef NDEBUG
+ WrapperObserver.removeObserver(&Verifier);
+#endif
+ }
+
+ finishPendingPhis();
+
+ SwiftError.propagateVRegs();
+
+ // Merge the argument lowering and constants block with its single
+ // successor, the LLVM-IR entry block. We want the basic block to
+ // be maximal.
+ assert(EntryBB->succ_size() == 1 &&
+ "Custom BB used for lowering should have only one successor");
+ // Get the successor of the current entry block.
+ MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
+ assert(NewEntryBB.pred_size() == 1 &&
+ "LLVM-IR entry block has a predecessor!?");
+ // Move all the instructions from the current entry block to the
+ // new entry block.
+ NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
+ EntryBB->end());
+
+ // Update the live-in information for the new entry block.
+ for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
+ NewEntryBB.addLiveIn(LiveIn);
+ NewEntryBB.sortUniqueLiveIns();
+
+ // Get rid of the now empty basic block.
+ EntryBB->removeSuccessor(&NewEntryBB);
+ MF->remove(EntryBB);
+ MF->deleteMachineBasicBlock(EntryBB);
+
+ assert(&MF->front() == &NewEntryBB &&
+ "New entry wasn't next in the list of basic block!");
+
+ // Initialize stack protector information.
+ StackProtector &SP = getAnalysis<StackProtector>();
+ SP.copyToMachineFrameInfo(MF->getFrameInfo());
+
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
new file mode 100644
index 000000000000..3925611f1485
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -0,0 +1,687 @@
+//===-- lib/CodeGen/GlobalISel/InlineAsmLowering.cpp ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering from LLVM IR inline asm to MIR INLINEASM
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "inline-asm-lowering"
+
+using namespace llvm;
+
+void InlineAsmLowering::anchor() {}
+
+namespace {
+
+/// GISelAsmOperandInfo - This contains information for each constraint that we
+/// are lowering.
+class GISelAsmOperandInfo : public TargetLowering::AsmOperandInfo {
+public:
+ /// Regs - If this is a register or register class operand, this
+ /// contains the set of assigned registers corresponding to the operand.
+ SmallVector<Register, 1> Regs;
+
+ explicit GISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &Info)
+ : TargetLowering::AsmOperandInfo(Info) {}
+};
+
+using GISelAsmOperandInfoVector = SmallVector<GISelAsmOperandInfo, 16>;
+
+class ExtraFlags {
+ unsigned Flags = 0;
+
+public:
+ explicit ExtraFlags(const CallBase &CB) {
+ const InlineAsm *IA = cast<InlineAsm>(CB.getCalledOperand());
+ if (IA->hasSideEffects())
+ Flags |= InlineAsm::Extra_HasSideEffects;
+ if (IA->isAlignStack())
+ Flags |= InlineAsm::Extra_IsAlignStack;
+ if (CB.isConvergent())
+ Flags |= InlineAsm::Extra_IsConvergent;
+ Flags |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
+ }
+
+ void update(const TargetLowering::AsmOperandInfo &OpInfo) {
+ // Ideally, we would only check against memory constraints. However, the
+ // meaning of an Other constraint can be target-specific and we can't easily
+ // reason about it. Therefore, be conservative and set MayLoad/MayStore
+ // for Other constraints as well.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
+ if (OpInfo.Type == InlineAsm::isInput)
+ Flags |= InlineAsm::Extra_MayLoad;
+ else if (OpInfo.Type == InlineAsm::isOutput)
+ Flags |= InlineAsm::Extra_MayStore;
+ else if (OpInfo.Type == InlineAsm::isClobber)
+ Flags |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore);
+ }
+ }
+
+ unsigned get() const { return Flags; }
+};
+
+} // namespace
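+
+// A minimal usage sketch of ExtraFlags (operand names hypothetical): the
+// constructor seeds the flag word from the call site, update() conservatively
+// widens it per constraint, and get() yields the immediate that is attached
+// to the INLINEASM built below.
+//
+//   ExtraFlags EF(Call);       // e.g. Extra_HasSideEffects for "sideeffect" asm
+//   for (const auto &OpInfo : ConstraintOperands)
+//     EF.update(OpInfo);       // memory/"other" constraints add MayLoad/MayStore
+//   unsigned ExtraInfoImm = EF.get();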
+
+/// Assign virtual/physical registers for the specified register operand.
+static void getRegistersForValue(MachineFunction &MF,
+ MachineIRBuilder &MIRBuilder,
+ GISelAsmOperandInfo &OpInfo,
+ GISelAsmOperandInfo &RefOpInfo) {
+
+ const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ // No work to do for memory operations.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory)
+ return;
+
+ // If this is a constraint for a single physreg, or a constraint for a
+ // register class, find it.
+ Register AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = TLI.getRegForInlineAsmConstraint(
+ &TRI, RefOpInfo.ConstraintCode, RefOpInfo.ConstraintVT);
+ // RC is unset only on failure. Return immediately.
+ if (!RC)
+ return;
+
+ // No need to allocate a matching input constraint since the constraint it's
+ // matching to has already been allocated.
+ if (OpInfo.isMatchingInputConstraint())
+ return;
+
+ // Initialize NumRegs.
+ unsigned NumRegs = 1;
+ if (OpInfo.ConstraintVT != MVT::Other)
+ NumRegs =
+ TLI.getNumRegisters(MF.getFunction().getContext(), OpInfo.ConstraintVT);
+
+ // If this is a constraint for a specific physical register, but the type of
+ // the operand requires more than one register to be passed, we allocate the
+ // required amount of physical registers, starting from the selected physical
+ // register.
+ // For this, first retrieve a register iterator for the given register class
+ TargetRegisterClass::iterator I = RC->begin();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ // Advance the iterator to the assigned register (if set)
+ if (AssignedReg) {
+ for (; *I != AssignedReg; ++I)
+ assert(I != RC->end() && "AssignedReg should be a member of provided RC");
+ }
+
+ // Finally, assign the registers. If the AssignedReg isn't set, create virtual
+ // registers with the provided register class
+ for (; NumRegs; --NumRegs, ++I) {
+ assert(I != RC->end() && "Ran out of registers to allocate!");
+ Register R = AssignedReg ? Register(*I) : RegInfo.createVirtualRegister(RC);
+ OpInfo.Regs.push_back(R);
+ }
+}
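+
+// Worked example (register names hypothetical, assuming the class enumerates
+// W0, W1, ... in order): for a specific-physreg constraint such as "{w0}"
+// whose value needs two registers, the iterator is advanced to W0 and
+// OpInfo.Regs becomes {W0, W1}. For a plain class constraint such as "r",
+// AssignedReg stays unset and the same number of fresh virtual registers of
+// that class is created instead.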
+
+/// Return an integer indicating how general CT is.
+static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
+ switch (CT) {
+ case TargetLowering::C_Immediate:
+ case TargetLowering::C_Other:
+ case TargetLowering::C_Unknown:
+ return 0;
+ case TargetLowering::C_Register:
+ return 1;
+ case TargetLowering::C_RegisterClass:
+ return 2;
+ case TargetLowering::C_Memory:
+ case TargetLowering::C_Address:
+ return 3;
+ }
+ llvm_unreachable("Invalid constraint type");
+}
+
+static void chooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
+ const TargetLowering *TLI) {
+ assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
+ unsigned BestIdx = 0;
+ TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
+ int BestGenerality = -1;
+
+ // Loop over the options, keeping track of the most general one.
+ for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
+ TargetLowering::ConstraintType CType =
+ TLI->getConstraintType(OpInfo.Codes[i]);
+
+ // Indirect 'other' or 'immediate' constraints are not allowed.
+ if (OpInfo.isIndirect && !(CType == TargetLowering::C_Memory ||
+ CType == TargetLowering::C_Register ||
+ CType == TargetLowering::C_RegisterClass))
+ continue;
+
+ // If this is an 'other' or 'immediate' constraint, see if the operand is
+ // valid for it. For example, on X86 we might have an 'rI' constraint. If
+ // the operand is an integer in the range [0..31] we want to use I (saving a
+ // load of a register), otherwise we must use 'r'.
+ if (CType == TargetLowering::C_Other ||
+ CType == TargetLowering::C_Immediate) {
+ assert(OpInfo.Codes[i].size() == 1 &&
+ "Unhandled multi-letter 'other' constraint");
+ // FIXME: prefer immediate constraints if the target allows it
+ }
+
+ // Things with matching constraints can only be registers, per gcc
+ // documentation. This mainly affects "g" constraints.
+ if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput())
+ continue;
+
+ // If this constraint letter is more general than the previous one, use it.
+ int Generality = getConstraintGenerality(CType);
+ if (Generality > BestGenerality) {
+ BestType = CType;
+ BestIdx = i;
+ BestGenerality = Generality;
+ }
+ }
+
+ OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
+ OpInfo.ConstraintType = BestType;
+}
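+
+// Worked example: for a two-letter alternative such as "ri", 'r' maps to
+// C_RegisterClass (generality 2) while 'i' maps to C_Other/C_Immediate
+// (generality 0), so 'r' is chosen and the operand is forced into a register
+// even when it is a constant; that is the case the FIXME above refers to.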
+
+static void computeConstraintToUse(const TargetLowering *TLI,
+ TargetLowering::AsmOperandInfo &OpInfo) {
+ assert(!OpInfo.Codes.empty() && "Must have at least one constraint");
+
+ // Single-letter constraints ('r') are very common.
+ if (OpInfo.Codes.size() == 1) {
+ OpInfo.ConstraintCode = OpInfo.Codes[0];
+ OpInfo.ConstraintType = TLI->getConstraintType(OpInfo.ConstraintCode);
+ } else {
+ chooseConstraint(OpInfo, TLI);
+ }
+
+ // 'X' matches anything.
+ if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) {
+ // Labels and constants are handled elsewhere ('X' is the only thing
+ // that matches labels). For Functions, the type here is the type of
+ // the result, which is not what we want to look at; leave them alone.
+ Value *Val = OpInfo.CallOperandVal;
+ if (isa<BasicBlock>(Val) || isa<ConstantInt>(Val) || isa<Function>(Val))
+ return;
+
+ // Otherwise, try to resolve it to something we know about by looking at
+ // the actual operand type.
+ if (const char *Repl = TLI->LowerXConstraint(OpInfo.ConstraintVT)) {
+ OpInfo.ConstraintCode = Repl;
+ OpInfo.ConstraintType = TLI->getConstraintType(OpInfo.ConstraintCode);
+ }
+ }
+}
+
+static unsigned getNumOpRegs(const MachineInstr &I, unsigned OpIdx) {
+ unsigned Flag = I.getOperand(OpIdx).getImm();
+ return InlineAsm::getNumOperandRegisters(Flag);
+}
+
+static bool buildAnyextOrCopy(Register Dst, Register Src,
+ MachineIRBuilder &MIRBuilder) {
+ const TargetRegisterInfo *TRI =
+ MIRBuilder.getMF().getSubtarget().getRegisterInfo();
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+
+ auto SrcTy = MRI->getType(Src);
+ if (!SrcTy.isValid()) {
+ LLVM_DEBUG(dbgs() << "Source type for copy is not valid\n");
+ return false;
+ }
+ unsigned SrcSize = TRI->getRegSizeInBits(Src, *MRI);
+ unsigned DstSize = TRI->getRegSizeInBits(Dst, *MRI);
+
+ if (DstSize < SrcSize) {
+ LLVM_DEBUG(dbgs() << "Input can't fit in destination reg class\n");
+ return false;
+ }
+
+ // Attempt to anyext small scalar sources.
+ if (DstSize > SrcSize) {
+ if (!SrcTy.isScalar()) {
+ LLVM_DEBUG(dbgs() << "Can't extend non-scalar input to size of"
+ "destination register class\n");
+ return false;
+ }
+ Src = MIRBuilder.buildAnyExt(LLT::scalar(DstSize), Src).getReg(0);
+ }
+
+ MIRBuilder.buildCopy(Dst, Src);
+ return true;
+}
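+
+// For example (sizes illustrative): copying an s16 source into a destination
+// constrained to 32 bits builds a G_ANYEXT to s32 followed by a COPY; equal
+// sizes produce a plain COPY; a source wider than the destination, or a
+// non-scalar source that would need extension, makes the helper fail and the
+// inline asm lowering bails out.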
+
+bool InlineAsmLowering::lowerInlineAsm(
+ MachineIRBuilder &MIRBuilder, const CallBase &Call,
+ std::function<ArrayRef<Register>(const Value &Val)> GetOrCreateVRegs)
+ const {
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
+
+ /// ConstraintOperands - Information about all of the constraints.
+ GISelAsmOperandInfoVector ConstraintOperands;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI->ParseConstraints(DL, TRI, Call);
+
+ ExtraFlags ExtraInfo(Call);
+ unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
+ unsigned ResNo = 0; // ResNo - The result number of the next output.
+ for (auto &T : TargetConstraints) {
+ ConstraintOperands.push_back(GISelAsmOperandInfo(T));
+ GISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+ // Compute the value type for each operand.
+ if (OpInfo.hasArg()) {
+ OpInfo.CallOperandVal = const_cast<Value *>(Call.getArgOperand(ArgNo));
+
+ if (isa<BasicBlock>(OpInfo.CallOperandVal)) {
+ LLVM_DEBUG(dbgs() << "Basic block input operands not supported yet\n");
+ return false;
+ }
+
+ Type *OpTy = OpInfo.CallOperandVal->getType();
+
+ // If this is an indirect operand, the operand is a pointer to the
+ // accessed type.
+ if (OpInfo.isIndirect) {
+ OpTy = Call.getParamElementType(ArgNo);
+ assert(OpTy && "Indirect operand must have elementtype attribute");
+ }
+
+ // FIXME: Support aggregate input operands
+ if (!OpTy->isSingleValueType()) {
+ LLVM_DEBUG(
+ dbgs() << "Aggregate input operands are not supported yet\n");
+ return false;
+ }
+
+ OpInfo.ConstraintVT =
+ TLI->getAsmOperandValueType(DL, OpTy, true).getSimpleVT();
+ ++ArgNo;
+ } else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
+ assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
+ if (StructType *STy = dyn_cast<StructType>(Call.getType())) {
+ OpInfo.ConstraintVT =
+ TLI->getSimpleValueType(DL, STy->getElementType(ResNo));
+ } else {
+ assert(ResNo == 0 && "Asm only has one result!");
+ OpInfo.ConstraintVT =
+ TLI->getAsmOperandValueType(DL, Call.getType()).getSimpleVT();
+ }
+ ++ResNo;
+ } else {
+ assert(OpInfo.Type != InlineAsm::isLabel &&
+ "GlobalISel currently doesn't support callbr");
+ OpInfo.ConstraintVT = MVT::Other;
+ }
+
+ if (OpInfo.ConstraintVT == MVT::i64x8)
+ return false;
+
+ // Compute the constraint code and ConstraintType to use.
+ computeConstraintToUse(TLI, OpInfo);
+
+ // The selected constraint type might expose new side effects.
+ ExtraInfo.update(OpInfo);
+ }
+
+ // At this point, all operand types are decided.
+ // Create the MachineInstr, but don't insert it yet since input
+ // operands still need to insert instructions before this one
+ auto Inst = MIRBuilder.buildInstrNoInsert(TargetOpcode::INLINEASM)
+ .addExternalSymbol(IA->getAsmString().c_str())
+ .addImm(ExtraInfo.get());
+
+ // Starting from this operand, a flag followed by its register(s) is added to
+ // Inst for each constraint; StartIdx lets matching input constraints locate
+ // those operands later.
+ unsigned StartIdx = Inst->getNumOperands();
+
+ // Collects the output operands for later processing
+ GISelAsmOperandInfoVector OutputOperands;
+
+ for (auto &OpInfo : ConstraintOperands) {
+ GISelAsmOperandInfo &RefOpInfo =
+ OpInfo.isMatchingInputConstraint()
+ ? ConstraintOperands[OpInfo.getMatchedOperand()]
+ : OpInfo;
+
+ // Assign registers for register operands
+ getRegistersForValue(MF, MIRBuilder, OpInfo, RefOpInfo);
+
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+ unsigned ConstraintID =
+ TLI->getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ assert(ConstraintID != InlineAsm::Constraint_Unknown &&
+ "Failed to convert memory constraint code to constraint id.");
+
+ // Add information to the INLINEASM instruction to know about this
+ // output.
+ unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+ OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
+ Inst.addImm(OpFlags);
+ ArrayRef<Register> SourceRegs =
+ GetOrCreateVRegs(*OpInfo.CallOperandVal);
+ assert(
+ SourceRegs.size() == 1 &&
+ "Expected the memory output to fit into a single virtual register");
+ Inst.addReg(SourceRegs[0]);
+ } else {
+ // Otherwise, this outputs to a register (directly for C_Register,
+ // C_RegisterClass, or C_Other).
+ assert(OpInfo.ConstraintType == TargetLowering::C_Register ||
+ OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
+ OpInfo.ConstraintType == TargetLowering::C_Other);
+
+ // Find a register that we can use.
+ if (OpInfo.Regs.empty()) {
+ LLVM_DEBUG(dbgs()
+ << "Couldn't allocate output register for constraint\n");
+ return false;
+ }
+
+ // Add information to the INLINEASM instruction to know that this
+ // register is set.
+ unsigned Flag = InlineAsm::getFlagWord(
+ OpInfo.isEarlyClobber ? InlineAsm::Kind_RegDefEarlyClobber
+ : InlineAsm::Kind_RegDef,
+ OpInfo.Regs.size());
+ if (OpInfo.Regs.front().isVirtual()) {
+ // Put the register class of the virtual registers in the flag word.
+ // That way, later passes can recompute register class constraints for
+ // inline assembly as well as normal instructions. Don't do this for
+ // tied operands that can use the regclass information from the def.
+ const TargetRegisterClass *RC = MRI->getRegClass(OpInfo.Regs.front());
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
+ }
+
+ Inst.addImm(Flag);
+
+ for (Register Reg : OpInfo.Regs) {
+ Inst.addReg(Reg,
+ RegState::Define | getImplRegState(Reg.isPhysical()) |
+ (OpInfo.isEarlyClobber ? RegState::EarlyClobber : 0));
+ }
+
+ // Remember this output operand for later processing
+ OutputOperands.push_back(OpInfo);
+ }
+
+ break;
+ case InlineAsm::isInput:
+ case InlineAsm::isLabel: {
+ if (OpInfo.isMatchingInputConstraint()) {
+ unsigned DefIdx = OpInfo.getMatchedOperand();
+ // Find operand with register def that corresponds to DefIdx.
+ unsigned InstFlagIdx = StartIdx;
+ for (unsigned i = 0; i < DefIdx; ++i)
+ InstFlagIdx += getNumOpRegs(*Inst, InstFlagIdx) + 1;
+ assert(getNumOpRegs(*Inst, InstFlagIdx) == 1 && "Wrong flag");
+
+ unsigned MatchedOperandFlag = Inst->getOperand(InstFlagIdx).getImm();
+ if (InlineAsm::isMemKind(MatchedOperandFlag)) {
+ LLVM_DEBUG(dbgs() << "Matching input constraint to mem operand not "
+ "supported. This should be target specific.\n");
+ return false;
+ }
+ if (!InlineAsm::isRegDefKind(MatchedOperandFlag) &&
+ !InlineAsm::isRegDefEarlyClobberKind(MatchedOperandFlag)) {
+ LLVM_DEBUG(dbgs() << "Unknown matching constraint\n");
+ return false;
+ }
+
+ // We want to tie input to register in next operand.
+ unsigned DefRegIdx = InstFlagIdx + 1;
+ Register Def = Inst->getOperand(DefRegIdx).getReg();
+
+ ArrayRef<Register> SrcRegs = GetOrCreateVRegs(*OpInfo.CallOperandVal);
+ assert(SrcRegs.size() == 1 && "Single register is expected here");
+
+ // When Def is physreg: use given input.
+ Register In = SrcRegs[0];
+ // When Def is vreg: copy input to new vreg with same reg class as Def.
+ if (Def.isVirtual()) {
+ In = MRI->createVirtualRegister(MRI->getRegClass(Def));
+ if (!buildAnyextOrCopy(In, SrcRegs[0], MIRBuilder))
+ return false;
+ }
+
+ // Add Flag and input register operand (In) to Inst. Tie In to Def.
+ unsigned UseFlag = InlineAsm::getFlagWord(InlineAsm::Kind_RegUse, 1);
+ unsigned Flag = InlineAsm::getFlagWordForMatchingOp(UseFlag, DefIdx);
+ Inst.addImm(Flag);
+ Inst.addReg(In);
+ Inst->tieOperands(DefRegIdx, Inst->getNumOperands() - 1);
+ break;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Other &&
+ OpInfo.isIndirect) {
+ LLVM_DEBUG(dbgs() << "Indirect input operands with unknown constraint "
+ "not supported yet\n");
+ return false;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
+
+ std::vector<MachineOperand> Ops;
+ if (!lowerAsmOperandForConstraint(OpInfo.CallOperandVal,
+ OpInfo.ConstraintCode, Ops,
+ MIRBuilder)) {
+ LLVM_DEBUG(dbgs() << "Don't support constraint: "
+ << OpInfo.ConstraintCode << " yet\n");
+ return false;
+ }
+
+ assert(Ops.size() > 0 &&
+ "Expected constraint to be lowered to at least one operand");
+
+ // Add information to the INLINEASM node to know about this input.
+ unsigned OpFlags =
+ InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
+ Inst.addImm(OpFlags);
+ Inst.add(Ops);
+ break;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+
+ if (!OpInfo.isIndirect) {
+ LLVM_DEBUG(dbgs()
+ << "Cannot indirectify memory input operands yet\n");
+ return false;
+ }
+
+ assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
+
+ unsigned ConstraintID =
+ TLI->getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+ OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
+ Inst.addImm(OpFlags);
+ ArrayRef<Register> SourceRegs =
+ GetOrCreateVRegs(*OpInfo.CallOperandVal);
+ assert(
+ SourceRegs.size() == 1 &&
+ "Expected the memory input to fit into a single virtual register");
+ Inst.addReg(SourceRegs[0]);
+ break;
+ }
+
+ assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
+ OpInfo.ConstraintType == TargetLowering::C_Register) &&
+ "Unknown constraint type!");
+
+ if (OpInfo.isIndirect) {
+ LLVM_DEBUG(dbgs() << "Can't handle indirect register inputs yet "
+ "for constraint '"
+ << OpInfo.ConstraintCode << "'\n");
+ return false;
+ }
+
+ // Copy the input into the appropriate registers.
+ if (OpInfo.Regs.empty()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Couldn't allocate input register for register constraint\n");
+ return false;
+ }
+
+ unsigned NumRegs = OpInfo.Regs.size();
+ ArrayRef<Register> SourceRegs = GetOrCreateVRegs(*OpInfo.CallOperandVal);
+ assert(NumRegs == SourceRegs.size() &&
+ "Expected the number of input registers to match the number of "
+ "source registers");
+
+ if (NumRegs > 1) {
+ LLVM_DEBUG(dbgs() << "Input operands with multiple input registers are "
+ "not supported yet\n");
+ return false;
+ }
+
+ unsigned Flag = InlineAsm::getFlagWord(InlineAsm::Kind_RegUse, NumRegs);
+ if (OpInfo.Regs.front().isVirtual()) {
+ // Put the register class of the virtual registers in the flag word.
+ const TargetRegisterClass *RC = MRI->getRegClass(OpInfo.Regs.front());
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
+ }
+ Inst.addImm(Flag);
+ if (!buildAnyextOrCopy(OpInfo.Regs[0], SourceRegs[0], MIRBuilder))
+ return false;
+ Inst.addReg(OpInfo.Regs[0]);
+ break;
+ }
+
+ case InlineAsm::isClobber: {
+
+ unsigned NumRegs = OpInfo.Regs.size();
+ if (NumRegs > 0) {
+ unsigned Flag =
+ InlineAsm::getFlagWord(InlineAsm::Kind_Clobber, NumRegs);
+ Inst.addImm(Flag);
+
+ for (Register Reg : OpInfo.Regs) {
+ Inst.addReg(Reg, RegState::Define | RegState::EarlyClobber |
+ getImplRegState(Reg.isPhysical()));
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ if (const MDNode *SrcLoc = Call.getMetadata("srcloc"))
+ Inst.addMetadata(SrcLoc);
+
+ // All inputs are handled, insert the instruction now
+ MIRBuilder.insertInstr(Inst);
+
+ // Finally, copy the output operands into the output registers
+ ArrayRef<Register> ResRegs = GetOrCreateVRegs(Call);
+ if (ResRegs.size() != OutputOperands.size()) {
+ LLVM_DEBUG(dbgs() << "Expected the number of output registers to match the "
+ "number of destination registers\n");
+ return false;
+ }
+ for (unsigned int i = 0, e = ResRegs.size(); i < e; i++) {
+ GISelAsmOperandInfo &OpInfo = OutputOperands[i];
+
+ if (OpInfo.Regs.empty())
+ continue;
+
+ switch (OpInfo.ConstraintType) {
+ case TargetLowering::C_Register:
+ case TargetLowering::C_RegisterClass: {
+ if (OpInfo.Regs.size() > 1) {
+ LLVM_DEBUG(dbgs() << "Output operands with multiple defining "
+ "registers are not supported yet\n");
+ return false;
+ }
+
+ Register SrcReg = OpInfo.Regs[0];
+ unsigned SrcSize = TRI->getRegSizeInBits(SrcReg, *MRI);
+ LLT ResTy = MRI->getType(ResRegs[i]);
+ if (ResTy.isScalar() && ResTy.getSizeInBits() < SrcSize) {
+ // First copy the non-typed virtual register into a generic virtual
+ // register
+ Register Tmp1Reg =
+ MRI->createGenericVirtualRegister(LLT::scalar(SrcSize));
+ MIRBuilder.buildCopy(Tmp1Reg, SrcReg);
+ // Need to truncate the result of the register
+ MIRBuilder.buildTrunc(ResRegs[i], Tmp1Reg);
+ } else if (ResTy.getSizeInBits() == SrcSize) {
+ MIRBuilder.buildCopy(ResRegs[i], SrcReg);
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled output operand with "
+ "mismatched register size\n");
+ return false;
+ }
+
+ break;
+ }
+ case TargetLowering::C_Immediate:
+ case TargetLowering::C_Other:
+ LLVM_DEBUG(
+ dbgs() << "Cannot lower target specific output constraints yet\n");
+ return false;
+ case TargetLowering::C_Memory:
+ break; // Already handled.
+ case TargetLowering::C_Address:
+ break; // Silence warning.
+ case TargetLowering::C_Unknown:
+ LLVM_DEBUG(dbgs() << "Unexpected unknown constraint\n");
+ return false;
+ }
+ }
+
+ return true;
+}
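+
+// Schematic of the operands built above (not exact MIR syntax; vreg names
+// hypothetical):
+//
+//   INLINEASM &"<asm string>"
+//     <extra-info imm>                       ; from ExtraFlags
+//     <Kind_RegDef flag imm>, def %outvreg   ; one group per output constraint
+//     <Kind_RegUse flag imm>, use %invreg    ; one group per input constraint
+//     <srcloc metadata>                      ; if present on the call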
+
+bool InlineAsmLowering::lowerAsmOperandForConstraint(
+ Value *Val, StringRef Constraint, std::vector<MachineOperand> &Ops,
+ MachineIRBuilder &MIRBuilder) const {
+ if (Constraint.size() > 1)
+ return false;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default:
+ return false;
+ case 'i': // Simple Integer or Relocatable Constant
+ case 'n': // immediate integer with a known value.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ assert(CI->getBitWidth() <= 64 &&
+ "expected immediate to fit into 64-bits");
+ // Boolean constants should be zero-extended, others are sign-extended
+ bool IsBool = CI->getBitWidth() == 1;
+ int64_t ExtVal = IsBool ? CI->getZExtValue() : CI->getSExtValue();
+ Ops.push_back(MachineOperand::CreateImm(ExtVal));
+ return true;
+ }
+ return false;
+ }
+}
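+
+// Worked example: for constraint 'n' with an i32 constant 42 this pushes
+// MachineOperand::CreateImm(42); an i1 true becomes 1 (zero-extended). Any
+// non-ConstantInt value, or a multi-letter constraint, returns false and the
+// caller reports the constraint as unsupported.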
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
new file mode 100644
index 000000000000..9bbef11067ae
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -0,0 +1,330 @@
+//===- llvm/CodeGen/GlobalISel/InstructionSelect.cpp - InstructionSelect ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the InstructionSelect class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/config.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/CodeGenCoverage.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "instruction-select"
+
+using namespace llvm;
+
+#ifdef LLVM_GISEL_COV_PREFIX
+static cl::opt<std::string>
+ CoveragePrefix("gisel-coverage-prefix", cl::init(LLVM_GISEL_COV_PREFIX),
+ cl::desc("Record GlobalISel rule coverage files of this "
+ "prefix if instrumentation was generated"));
+#else
+static const std::string CoveragePrefix;
+#endif
+
+char InstructionSelect::ID = 0;
+INITIALIZE_PASS_BEGIN(InstructionSelect, DEBUG_TYPE,
+ "Select target instructions out of generic instructions",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_END(InstructionSelect, DEBUG_TYPE,
+ "Select target instructions out of generic instructions",
+ false, false)
+
+InstructionSelect::InstructionSelect(CodeGenOpt::Level OL)
+ : MachineFunctionPass(ID), OptLevel(OL) {}
+
+// In order not to crash when calling getAnalysis during testing with -run-pass
+// we use the default opt level here instead of None, so that the addRequired()
+// calls are made in getAnalysisUsage().
+InstructionSelect::InstructionSelect()
+ : MachineFunctionPass(ID), OptLevel(CodeGenOpt::Default) {}
+
+void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+
+ if (OptLevel != CodeGenOpt::None) {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ }
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
+ // If the ISel pipeline failed, do not bother running this pass.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Selecting function: " << MF.getName() << '\n');
+
+ const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector();
+
+ CodeGenOpt::Level OldOptLevel = OptLevel;
+ auto RestoreOptLevel = make_scope_exit([=]() { OptLevel = OldOptLevel; });
+ OptLevel = MF.getFunction().hasOptNone() ? CodeGenOpt::None
+ : MF.getTarget().getOptLevel();
+
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ if (OptLevel != CodeGenOpt::None) {
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ if (PSI && PSI->hasProfileSummary())
+ BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
+ }
+
+ CodeGenCoverage CoverageInfo;
+ assert(ISel && "Cannot work without InstructionSelector");
+ ISel->setupMF(MF, KB, &CoverageInfo, PSI, BFI);
+
+ // An optimization remark emitter. Used to report failures.
+ MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
+
+ // FIXME: There are many other MF/MFI fields we need to initialize.
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+#ifndef NDEBUG
+ // Check that our input is fully legal: we require the function to have the
+ // Legalized property, so it should be.
+ // FIXME: This should be in the MachineVerifier, as the RegBankSelected
+ // property check already is.
+ if (!DisableGISelLegalityCheck)
+ if (const MachineInstr *MI = machineFunctionIsIllegal(MF)) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "instruction is not legal", *MI);
+ return false;
+ }
+ // FIXME: We could introduce new blocks and will need to fix the outer loop.
+ // Until then, keep track of the number of blocks to assert that we don't.
+ const size_t NumBlocks = MF.size();
+#endif
+ // Keep track of selected blocks, so we can delete unreachable ones later.
+ DenseSet<MachineBasicBlock *> SelectedBlocks;
+
+ for (MachineBasicBlock *MBB : post_order(&MF)) {
+ ISel->CurMBB = MBB;
+ SelectedBlocks.insert(MBB);
+ if (MBB->empty())
+ continue;
+
+ // Select instructions in reverse order within the block. We permit
+ // erasing, so we manually iterate and recognize the begin (rend) case.
+ bool ReachedBegin = false;
+ for (auto MII = std::prev(MBB->end()), Begin = MBB->begin();
+ !ReachedBegin;) {
+#ifndef NDEBUG
+ // Keep track of the insertion range for debug printing.
+ const auto AfterIt = std::next(MII);
+#endif
+ // Select this instruction.
+ MachineInstr &MI = *MII;
+
+ // And have our iterator point to the next instruction, if there is one.
+ if (MII == Begin)
+ ReachedBegin = true;
+ else
+ --MII;
+
+ LLVM_DEBUG(dbgs() << "Selecting: \n " << MI);
+
+ // We could have folded this instruction away already, making it dead.
+ // If so, erase it.
+ if (isTriviallyDead(MI, MRI)) {
+ LLVM_DEBUG(dbgs() << "Is dead; erasing.\n");
+ salvageDebugInfo(MRI, MI);
+ MI.eraseFromParent();
+ continue;
+ }
+
+ // Eliminate hints or G_CONSTANT_FOLD_BARRIER.
+ if (isPreISelGenericOptimizationHint(MI.getOpcode()) ||
+ MI.getOpcode() == TargetOpcode::G_CONSTANT_FOLD_BARRIER) {
+ auto [DstReg, SrcReg] = MI.getFirst2Regs();
+
+ // At this point, the destination register class of the op may have
+ // been decided.
+ //
+ // Propagate that through to the source register.
+ const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg);
+ if (DstRC)
+ MRI.setRegClass(SrcReg, DstRC);
+ assert(canReplaceReg(DstReg, SrcReg, MRI) &&
+ "Must be able to replace dst with src!");
+ MI.eraseFromParent();
+ MRI.replaceRegWith(DstReg, SrcReg);
+ continue;
+ }
+
+ if (MI.getOpcode() == TargetOpcode::G_INVOKE_REGION_START) {
+ MI.eraseFromParent();
+ continue;
+ }
+
+ if (!ISel->select(MI)) {
+ // FIXME: It would be nice to dump all inserted instructions. It's
+ // not obvious how, esp. considering select() can insert after MI.
+ reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI);
+ return false;
+ }
+
+ // Dump the range of instructions that MI expanded into.
+ LLVM_DEBUG({
+ auto InsertedBegin = ReachedBegin ? MBB->begin() : std::next(MII);
+ dbgs() << "Into:\n";
+ for (auto &InsertedMI : make_range(InsertedBegin, AfterIt))
+ dbgs() << " " << InsertedMI;
+ dbgs() << '\n';
+ });
+ }
+ }
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+
+ if (!SelectedBlocks.contains(&MBB)) {
+ // This is an unreachable block and therefore hasn't been selected, since
+ // the main selection loop above uses a postorder block traversal.
+ // Delete all of its instructions.
+ MBB.clear();
+ // Don't delete the block in case the block has its address taken or is
+ // still being referenced by a phi somewhere.
+ continue;
+ }
+ // Try to find redundant copies between vregs of the same register class.
+ bool ReachedBegin = false;
+ for (auto MII = std::prev(MBB.end()), Begin = MBB.begin(); !ReachedBegin;) {
+ // Examine this instruction.
+ MachineInstr &MI = *MII;
+
+ // And have our iterator point to the next instruction, if there is one.
+ if (MII == Begin)
+ ReachedBegin = true;
+ else
+ --MII;
+ if (MI.getOpcode() != TargetOpcode::COPY)
+ continue;
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ if (SrcReg.isVirtual() && DstReg.isVirtual()) {
+ auto SrcRC = MRI.getRegClass(SrcReg);
+ auto DstRC = MRI.getRegClass(DstReg);
+ if (SrcRC == DstRC) {
+ MRI.replaceRegWith(DstReg, SrcReg);
+ MI.eraseFromParent();
+ }
+ }
+ }
+ }
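+
+ // For example (register class name hypothetical), a pair left over from
+ // selection such as
+ //   %1:gpr = COPY %0:gpr
+ // is erased by the loop above and every use of %1 is rewritten to %0.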
+
+#ifndef NDEBUG
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ // Now that selection is complete, there are no more generic vregs. Verify
+ // that the size of the now-constrained vreg is unchanged and that it has a
+ // register class.
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register VReg = Register::index2VirtReg(I);
+
+ MachineInstr *MI = nullptr;
+ if (!MRI.def_empty(VReg))
+ MI = &*MRI.def_instr_begin(VReg);
+ else if (!MRI.use_empty(VReg)) {
+ MI = &*MRI.use_instr_begin(VReg);
+ // Debug value instruction is permitted to use undefined vregs.
+ if (MI->isDebugValue())
+ continue;
+ }
+ if (!MI)
+ continue;
+
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(VReg);
+ if (!RC) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "VReg has no regclass after selection", *MI);
+ return false;
+ }
+
+ const LLT Ty = MRI.getType(VReg);
+ if (Ty.isValid() && Ty.getSizeInBits() > TRI.getRegSizeInBits(*RC)) {
+ reportGISelFailure(
+ MF, TPC, MORE, "gisel-select",
+ "VReg's low-level type and register class have different sizes", *MI);
+ return false;
+ }
+ }
+
+ if (MF.size() != NumBlocks) {
+ MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure",
+ MF.getFunction().getSubprogram(),
+ /*MBB=*/nullptr);
+ R << "inserting blocks is not supported yet";
+ reportGISelFailure(MF, TPC, MORE, R);
+ return false;
+ }
+#endif
+ // Determine if there are any calls in this machine function. Ported from
+ // SelectionDAG.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ for (const auto &MBB : MF) {
+ if (MFI.hasCalls() && MF.hasInlineAsm())
+ break;
+
+ for (const auto &MI : MBB) {
+ if ((MI.isCall() && !MI.isReturn()) || MI.isStackAligningInlineAsm())
+ MFI.setHasCalls(true);
+ if (MI.isInlineAsm())
+ MF.setHasInlineAsm(true);
+ }
+ }
+
+ // FIXME: FinalizeISel pass calls finalizeLowering, so it's called twice.
+ auto &TLI = *MF.getSubtarget().getTargetLowering();
+ TLI.finalizeLowering(MF);
+
+ LLVM_DEBUG({
+ dbgs() << "Rules covered by selecting function: " << MF.getName() << ":";
+ for (auto RuleID : CoverageInfo.covered())
+ dbgs() << " id" << RuleID;
+ dbgs() << "\n\n";
+ });
+ CoverageInfo.emit(CoveragePrefix,
+ TLI.getTargetMachine().getTarget().getBackendName());
+
+ // If we successfully selected the function nothing is going to use the vreg
+ // types after us (otherwise MIRPrinter would need them). Make sure the types
+ // disappear.
+ MRI.clearVirtRegTypes();
+
+ // FIXME: Should we accurately track changes?
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
new file mode 100644
index 000000000000..c48591cc2f02
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -0,0 +1,16 @@
+//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+
+namespace llvm {
+
+// vtable anchor
+InstructionSelector::~InstructionSelector() = default;
+
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp
new file mode 100644
index 000000000000..8cfb1b786c24
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp
@@ -0,0 +1,383 @@
+//===- lib/CodeGen/GlobalISel/LegacyLegalizerInfo.cpp - Legalizer ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement an interface to specify and query how an illegal operation on a
+// given type should be expanded.
+//
+// Issues to be resolved:
+// + Make it fast.
+// + Support weird types like i3, <7 x i3>, ...
+// + Operations with more than one type (ICMP, CMPXCHG, intrinsics, ...)
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LegacyLegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include <map>
+
+using namespace llvm;
+using namespace LegacyLegalizeActions;
+
+#define DEBUG_TYPE "legalizer-info"
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, LegacyLegalizeAction Action) {
+ switch (Action) {
+ case Legal:
+ OS << "Legal";
+ break;
+ case NarrowScalar:
+ OS << "NarrowScalar";
+ break;
+ case WidenScalar:
+ OS << "WidenScalar";
+ break;
+ case FewerElements:
+ OS << "FewerElements";
+ break;
+ case MoreElements:
+ OS << "MoreElements";
+ break;
+ case Bitcast:
+ OS << "Bitcast";
+ break;
+ case Lower:
+ OS << "Lower";
+ break;
+ case Libcall:
+ OS << "Libcall";
+ break;
+ case Custom:
+ OS << "Custom";
+ break;
+ case Unsupported:
+ OS << "Unsupported";
+ break;
+ case NotFound:
+ OS << "NotFound";
+ break;
+ }
+ return OS;
+}
+
+LegacyLegalizerInfo::LegacyLegalizerInfo() {
+ // Set defaults.
+ // FIXME: these two (G_ANYEXT and G_TRUNC?) can be legalized to the
+ // fundamental load/store Jakob proposed, once loads & stores are supported.
+ setScalarAction(TargetOpcode::G_ANYEXT, 1, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_ZEXT, 1, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_SEXT, 1, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_TRUNC, 0, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_TRUNC, 1, {{1, Legal}});
+
+ setScalarAction(TargetOpcode::G_INTRINSIC, 0, {{1, Legal}});
+ setScalarAction(TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS, 0, {{1, Legal}});
+
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_IMPLICIT_DEF, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_ADD, 0, widenToLargerTypesAndNarrowToLargest);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_OR, 0, widenToLargerTypesAndNarrowToLargest);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_LOAD, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_STORE, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_BRCOND, 0, widenToLargerTypesUnsupportedOtherwise);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_INSERT, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_EXTRACT, 0, narrowToSmallerAndUnsupportedIfTooSmall);
+ setLegalizeScalarToDifferentSizeStrategy(
+ TargetOpcode::G_EXTRACT, 1, narrowToSmallerAndUnsupportedIfTooSmall);
+ setScalarAction(TargetOpcode::G_FNEG, 0, {{1, Lower}});
+}
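+
+// Roughly, the strategies installed above mean the following (an informal
+// summary of the strategy names used above, not a specification):
+// widenToLargerTypesAndNarrowToLargest widens a scalar to the next explicitly
+// specified larger size and narrows anything beyond the largest specified
+// size, while narrowToSmallerAndUnsupportedIfTooSmall narrows a scalar to the
+// next explicitly specified smaller size and marks anything below the
+// smallest specified size Unsupported.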
+
+void LegacyLegalizerInfo::computeTables() {
+ assert(TablesInitialized == false);
+
+ for (unsigned OpcodeIdx = 0; OpcodeIdx <= LastOp - FirstOp; ++OpcodeIdx) {
+ const unsigned Opcode = FirstOp + OpcodeIdx;
+ for (unsigned TypeIdx = 0; TypeIdx != SpecifiedActions[OpcodeIdx].size();
+ ++TypeIdx) {
+ // 0. Collect information specified through the setAction API, i.e.
+ // for specific bit sizes.
+ // For scalar types:
+ SizeAndActionsVec ScalarSpecifiedActions;
+ // For pointer types:
+ std::map<uint16_t, SizeAndActionsVec> AddressSpace2SpecifiedActions;
+ // For vector types:
+ std::map<uint16_t, SizeAndActionsVec> ElemSize2SpecifiedActions;
+ for (auto LLT2Action : SpecifiedActions[OpcodeIdx][TypeIdx]) {
+ const LLT Type = LLT2Action.first;
+ const LegacyLegalizeAction Action = LLT2Action.second;
+
+ auto SizeAction = std::make_pair(Type.getSizeInBits(), Action);
+ if (Type.isPointer())
+ AddressSpace2SpecifiedActions[Type.getAddressSpace()].push_back(
+ SizeAction);
+ else if (Type.isVector())
+ ElemSize2SpecifiedActions[Type.getElementType().getSizeInBits()]
+ .push_back(SizeAction);
+ else
+ ScalarSpecifiedActions.push_back(SizeAction);
+ }
+
+ // 1. Handle scalar types
+ {
+ // Decide how to handle bit sizes for which no explicit specification
+ // was given.
+ SizeChangeStrategy S = &unsupportedForDifferentSizes;
+ if (TypeIdx < ScalarSizeChangeStrategies[OpcodeIdx].size() &&
+ ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx] != nullptr)
+ S = ScalarSizeChangeStrategies[OpcodeIdx][TypeIdx];
+ llvm::sort(ScalarSpecifiedActions);
+ checkPartialSizeAndActionsVector(ScalarSpecifiedActions);
+ setScalarAction(Opcode, TypeIdx, S(ScalarSpecifiedActions));
+ }
+
+ // 2. Handle pointer types
+ for (auto PointerSpecifiedActions : AddressSpace2SpecifiedActions) {
+ llvm::sort(PointerSpecifiedActions.second);
+ checkPartialSizeAndActionsVector(PointerSpecifiedActions.second);
+ // For pointer types, we assume that there isn't a meaningful way
+ // to change the number of bits used in the pointer.
+ setPointerAction(
+ Opcode, TypeIdx, PointerSpecifiedActions.first,
+ unsupportedForDifferentSizes(PointerSpecifiedActions.second));
+ }
+
+ // 3. Handle vector types
+ SizeAndActionsVec ElementSizesSeen;
+ for (auto VectorSpecifiedActions : ElemSize2SpecifiedActions) {
+ llvm::sort(VectorSpecifiedActions.second);
+ const uint16_t ElementSize = VectorSpecifiedActions.first;
+ ElementSizesSeen.push_back({ElementSize, Legal});
+ checkPartialSizeAndActionsVector(VectorSpecifiedActions.second);
+ // For vector types, we assume that the best way to adapt the number of
+ // elements is to widen to the next larger number of elements for which the
+ // vector type is legal, unless there is no such type. In that case,
+ // legalize towards a vector type with a smaller number of elements.
+ SizeAndActionsVec NumElementsActions;
+ for (SizeAndAction BitsizeAndAction : VectorSpecifiedActions.second) {
+ assert(BitsizeAndAction.first % ElementSize == 0);
+ const uint16_t NumElements = BitsizeAndAction.first / ElementSize;
+ NumElementsActions.push_back({NumElements, BitsizeAndAction.second});
+ }
+ setVectorNumElementAction(
+ Opcode, TypeIdx, ElementSize,
+ moreToWiderTypesAndLessToWidest(NumElementsActions));
+ }
+ llvm::sort(ElementSizesSeen);
+ SizeChangeStrategy VectorElementSizeChangeStrategy =
+ &unsupportedForDifferentSizes;
+ if (TypeIdx < VectorElementSizeChangeStrategies[OpcodeIdx].size() &&
+ VectorElementSizeChangeStrategies[OpcodeIdx][TypeIdx] != nullptr)
+ VectorElementSizeChangeStrategy =
+ VectorElementSizeChangeStrategies[OpcodeIdx][TypeIdx];
+ setScalarInVectorAction(
+ Opcode, TypeIdx, VectorElementSizeChangeStrategy(ElementSizesSeen));
+ }
+ }
+
+ TablesInitialized = true;
+}
+
+// FIXME: inefficient implementation for now. Without ComputeValueVTs we're
+// probably going to need specialized lookup structures for various types before
+// we have any hope of doing well with something like <13 x i3>. Even the common
+// cases should do better than what we have now.
+std::pair<LegacyLegalizeAction, LLT>
+LegacyLegalizerInfo::getAspectAction(const InstrAspect &Aspect) const {
+ assert(TablesInitialized && "backend forgot to call computeTables");
+ // These *have* to be implemented for now, they're the fundamental basis of
+ // how everything else is transformed.
+ if (Aspect.Type.isScalar() || Aspect.Type.isPointer())
+ return findScalarLegalAction(Aspect);
+ assert(Aspect.Type.isVector());
+ return findVectorLegalAction(Aspect);
+}
+
+LegacyLegalizerInfo::SizeAndActionsVec
+LegacyLegalizerInfo::increaseToLargerTypesAndDecreaseToLargest(
+ const SizeAndActionsVec &v, LegacyLegalizeAction IncreaseAction,
+ LegacyLegalizeAction DecreaseAction) {
+ SizeAndActionsVec result;
+ unsigned LargestSizeSoFar = 0;
+ if (v.size() >= 1 && v[0].first != 1)
+ result.push_back({1, IncreaseAction});
+ for (size_t i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ LargestSizeSoFar = v[i].first;
+ if (i + 1 < v.size() && v[i + 1].first != v[i].first + 1) {
+ result.push_back({LargestSizeSoFar + 1, IncreaseAction});
+ LargestSizeSoFar = v[i].first + 1;
+ }
+ }
+ result.push_back({LargestSizeSoFar + 1, DecreaseAction});
+ return result;
+}
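+
+// Worked example: with v = {{32, Legal}, {64, Legal}}, IncreaseAction =
+// WidenScalar and DecreaseAction = NarrowScalar, the result is
+//   {{1, WidenScalar}, {32, Legal}, {33, WidenScalar}, {64, Legal},
+//    {65, NarrowScalar}}
+// i.e. sizes below 32 widen, sizes 33..63 widen (resolved to 64 by
+// findAction below), and sizes above 64 narrow.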
+
+LegacyLegalizerInfo::SizeAndActionsVec
+LegacyLegalizerInfo::decreaseToSmallerTypesAndIncreaseToSmallest(
+ const SizeAndActionsVec &v, LegacyLegalizeAction DecreaseAction,
+ LegacyLegalizeAction IncreaseAction) {
+ SizeAndActionsVec result;
+ if (v.size() == 0 || v[0].first != 1)
+ result.push_back({1, IncreaseAction});
+ for (size_t i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ if (i + 1 == v.size() || v[i + 1].first != v[i].first + 1) {
+ result.push_back({v[i].first + 1, DecreaseAction});
+ }
+ }
+ return result;
+}
+
+LegacyLegalizerInfo::SizeAndAction
+LegacyLegalizerInfo::findAction(const SizeAndActionsVec &Vec, const uint32_t Size) {
+ assert(Size >= 1);
+ // Find the last element in Vec that has a bitsize equal to or smaller than
+ // the requested bit size.
+ // That is the element just before the first element that is bigger than Size.
+ auto It = partition_point(
+ Vec, [=](const SizeAndAction &A) { return A.first <= Size; });
+ assert(It != Vec.begin() && "Does Vec not start with size 1?");
+ int VecIdx = It - Vec.begin() - 1;
+
+ LegacyLegalizeAction Action = Vec[VecIdx].second;
+ switch (Action) {
+ case Legal:
+ case Bitcast:
+ case Lower:
+ case Libcall:
+ case Custom:
+ return {Size, Action};
+ case FewerElements:
+ // FIXME: is this special case still needed and correct?
+ // Special case for scalarization:
+ if (Vec == SizeAndActionsVec({{1, FewerElements}}))
+ return {1, FewerElements};
+ [[fallthrough]];
+ case NarrowScalar: {
+ // The following needs to be a loop, as for now, we do allow needing to
+ // go over "Unsupported" bit sizes before finding a legalizable bit size.
+ // e.g. (s8, WidenScalar), (s9, Unsupported), (s32, Legal). if Size==8,
+ // we need to iterate over s9, and then to s32 to return (s32, Legal).
+ // If we want to get rid of the below loop, we should have stronger asserts
+ // when building the SizeAndActionsVecs, probably not allowing
+ // "Unsupported" unless at the ends of the vector.
+ for (int i = VecIdx - 1; i >= 0; --i)
+ if (!needsLegalizingToDifferentSize(Vec[i].second) &&
+ Vec[i].second != Unsupported)
+ return {Vec[i].first, Action};
+ llvm_unreachable("");
+ }
+ case WidenScalar:
+ case MoreElements: {
+ // See above, the following needs to be a loop, at least for now.
+ for (std::size_t i = VecIdx + 1; i < Vec.size(); ++i)
+ if (!needsLegalizingToDifferentSize(Vec[i].second) &&
+ Vec[i].second != Unsupported)
+ return {Vec[i].first, Action};
+ llvm_unreachable("");
+ }
+ case Unsupported:
+ return {Size, Unsupported};
+ case NotFound:
+ llvm_unreachable("NotFound");
+ }
+ llvm_unreachable("Action has an unknown enum value");
+}
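+
+// Worked examples against the vector produced in the example above,
+// {{1, WidenScalar}, {32, Legal}, {33, WidenScalar}, {64, Legal},
+//  {65, NarrowScalar}}:
+//   findAction(Vec, 32)  == {32, Legal}
+//   findAction(Vec, 48)  == {64, WidenScalar}   // widen 48-bit scalars to 64
+//   findAction(Vec, 128) == {64, NarrowScalar}  // narrow anything past 64 to 64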
+
+std::pair<LegacyLegalizeAction, LLT>
+LegacyLegalizerInfo::findScalarLegalAction(const InstrAspect &Aspect) const {
+ assert(Aspect.Type.isScalar() || Aspect.Type.isPointer());
+ if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
+ return {NotFound, LLT()};
+ const unsigned OpcodeIdx = getOpcodeIdxForOpcode(Aspect.Opcode);
+ if (Aspect.Type.isPointer() &&
+ AddrSpace2PointerActions[OpcodeIdx].find(Aspect.Type.getAddressSpace()) ==
+ AddrSpace2PointerActions[OpcodeIdx].end()) {
+ return {NotFound, LLT()};
+ }
+ const SmallVector<SizeAndActionsVec, 1> &Actions =
+ Aspect.Type.isPointer()
+ ? AddrSpace2PointerActions[OpcodeIdx]
+ .find(Aspect.Type.getAddressSpace())
+ ->second
+ : ScalarActions[OpcodeIdx];
+ if (Aspect.Idx >= Actions.size())
+ return {NotFound, LLT()};
+ const SizeAndActionsVec &Vec = Actions[Aspect.Idx];
+ // FIXME: speed up this search, e.g. by using a results cache for repeated
+ // queries?
+ auto SizeAndAction = findAction(Vec, Aspect.Type.getSizeInBits());
+ return {SizeAndAction.second,
+ Aspect.Type.isScalar() ? LLT::scalar(SizeAndAction.first)
+ : LLT::pointer(Aspect.Type.getAddressSpace(),
+ SizeAndAction.first)};
+}
+
+std::pair<LegacyLegalizeAction, LLT>
+LegacyLegalizerInfo::findVectorLegalAction(const InstrAspect &Aspect) const {
+ assert(Aspect.Type.isVector());
+ // First legalize the vector element size, then legalize the number of
+ // lanes in the vector.
+ if (Aspect.Opcode < FirstOp || Aspect.Opcode > LastOp)
+ return {NotFound, Aspect.Type};
+ const unsigned OpcodeIdx = getOpcodeIdxForOpcode(Aspect.Opcode);
+ const unsigned TypeIdx = Aspect.Idx;
+ if (TypeIdx >= ScalarInVectorActions[OpcodeIdx].size())
+ return {NotFound, Aspect.Type};
+ const SizeAndActionsVec &ElemSizeVec =
+ ScalarInVectorActions[OpcodeIdx][TypeIdx];
+
+ LLT IntermediateType;
+ auto ElementSizeAndAction =
+ findAction(ElemSizeVec, Aspect.Type.getScalarSizeInBits());
+ IntermediateType = LLT::fixed_vector(Aspect.Type.getNumElements(),
+ ElementSizeAndAction.first);
+ if (ElementSizeAndAction.second != Legal)
+ return {ElementSizeAndAction.second, IntermediateType};
+
+ auto i = NumElements2Actions[OpcodeIdx].find(
+ IntermediateType.getScalarSizeInBits());
+ if (i == NumElements2Actions[OpcodeIdx].end()) {
+ return {NotFound, IntermediateType};
+ }
+ const SizeAndActionsVec &NumElementsVec = (*i).second[TypeIdx];
+ auto NumElementsAndAction =
+ findAction(NumElementsVec, IntermediateType.getNumElements());
+ return {NumElementsAndAction.second,
+ LLT::fixed_vector(NumElementsAndAction.first,
+ IntermediateType.getScalarSizeInBits())};
+}
+
+unsigned LegacyLegalizerInfo::getOpcodeIdxForOpcode(unsigned Opcode) const {
+ assert(Opcode >= FirstOp && Opcode <= LastOp && "Unsupported opcode");
+ return Opcode - FirstOp;
+}
+
+LegacyLegalizeActionStep
+LegacyLegalizerInfo::getAction(const LegalityQuery &Query) const {
+ for (unsigned i = 0; i < Query.Types.size(); ++i) {
+ auto Action = getAspectAction({Query.Opcode, i, Query.Types[i]});
+ if (Action.first != Legal) {
+ LLVM_DEBUG(dbgs() << ".. (legacy) Type " << i << " Action="
+ << Action.first << ", " << Action.second << "\n");
+ return {Action.first, i, Action.second};
+ } else
+ LLVM_DEBUG(dbgs() << ".. (legacy) Type " << i << " Legal\n");
+ }
+ LLVM_DEBUG(dbgs() << ".. (legacy) Legal\n");
+ return {Legal, 0, LLT{}};
+}
+
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
new file mode 100644
index 000000000000..2c77ed8b0600
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -0,0 +1,213 @@
+//===- lib/CodeGen/GlobalISel/LegalizerPredicates.cpp - Predicates --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A library of predicate factories to use for LegalityPredicate.
+//
+//===----------------------------------------------------------------------===//
+
+// Enable optimizations to work around MSVC debug mode bug in 32-bit:
+// https://developercommunity.visualstudio.com/content/problem/1179643/msvc-copies-overaligned-non-trivially-copyable-par.html
+// FIXME: Remove this when the issue is closed.
+#if defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)
+// We have to disable runtime checks in order to enable optimizations. This is
+// done for the entire file because the problem is actually observed in STL
+// template functions.
+#pragma runtime_checks("", off)
+#pragma optimize("gs", on)
+#endif
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+using namespace llvm;
+
+LegalityPredicate LegalityPredicates::typeIs(unsigned TypeIdx, LLT Type) {
+ return
+ [=](const LegalityQuery &Query) { return Query.Types[TypeIdx] == Type; };
+}
+
+LegalityPredicate
+LegalityPredicates::typeInSet(unsigned TypeIdx,
+ std::initializer_list<LLT> TypesInit) {
+ SmallVector<LLT, 4> Types = TypesInit;
+ return [=](const LegalityQuery &Query) {
+ return llvm::is_contained(Types, Query.Types[TypeIdx]);
+ };
+}
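+
+// A typical use from a target's rule definitions, sketched (rule-set API names
+// taken from LegalizerInfo; opcode and sizes illustrative):
+//   getActionDefinitionsBuilder(TargetOpcode::G_ADD)
+//       .legalIf(LegalityPredicates::typeInSet(0, {LLT::scalar(32),
+//                                                  LLT::scalar(64)}));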
+
+LegalityPredicate LegalityPredicates::typePairInSet(
+ unsigned TypeIdx0, unsigned TypeIdx1,
+ std::initializer_list<std::pair<LLT, LLT>> TypesInit) {
+ SmallVector<std::pair<LLT, LLT>, 4> Types = TypesInit;
+ return [=](const LegalityQuery &Query) {
+ std::pair<LLT, LLT> Match = {Query.Types[TypeIdx0], Query.Types[TypeIdx1]};
+ return llvm::is_contained(Types, Match);
+ };
+}
+
+LegalityPredicate LegalityPredicates::typePairAndMemDescInSet(
+ unsigned TypeIdx0, unsigned TypeIdx1, unsigned MMOIdx,
+ std::initializer_list<TypePairAndMemDesc> TypesAndMemDescInit) {
+ SmallVector<TypePairAndMemDesc, 4> TypesAndMemDesc = TypesAndMemDescInit;
+ return [=](const LegalityQuery &Query) {
+ TypePairAndMemDesc Match = {Query.Types[TypeIdx0], Query.Types[TypeIdx1],
+ Query.MMODescrs[MMOIdx].MemoryTy,
+ Query.MMODescrs[MMOIdx].AlignInBits};
+ return llvm::any_of(TypesAndMemDesc,
+ [=](const TypePairAndMemDesc &Entry) -> bool {
+ return Match.isCompatible(Entry);
+ });
+ };
+}
+
+LegalityPredicate LegalityPredicates::isScalar(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].isScalar();
+ };
+}
+
+LegalityPredicate LegalityPredicates::isVector(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].isVector();
+ };
+}
+
+LegalityPredicate LegalityPredicates::isPointer(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].isPointer();
+ };
+}
+
+LegalityPredicate LegalityPredicates::isPointer(unsigned TypeIdx,
+ unsigned AddrSpace) {
+ return [=](const LegalityQuery &Query) {
+ LLT Ty = Query.Types[TypeIdx];
+ return Ty.isPointer() && Ty.getAddressSpace() == AddrSpace;
+ };
+}
+
+LegalityPredicate LegalityPredicates::elementTypeIs(unsigned TypeIdx,
+ LLT EltTy) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && QueryTy.getElementType() == EltTy;
+ };
+}
+
+LegalityPredicate LegalityPredicates::scalarNarrowerThan(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isScalar() && QueryTy.getSizeInBits() < Size;
+ };
+}
+
+LegalityPredicate LegalityPredicates::scalarWiderThan(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isScalar() && QueryTy.getSizeInBits() > Size;
+ };
+}
+
+LegalityPredicate LegalityPredicates::smallerThan(unsigned TypeIdx0,
+ unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() <
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+}
+
+LegalityPredicate LegalityPredicates::largerThan(unsigned TypeIdx0,
+ unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() >
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+}
+
+LegalityPredicate LegalityPredicates::scalarOrEltNarrowerThan(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.getScalarSizeInBits() < Size;
+ };
+}
+
+LegalityPredicate LegalityPredicates::scalarOrEltWiderThan(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.getScalarSizeInBits() > Size;
+ };
+}
+
+LegalityPredicate LegalityPredicates::scalarOrEltSizeNotPow2(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return !isPowerOf2_32(QueryTy.getScalarSizeInBits());
+ };
+}
+
+LegalityPredicate LegalityPredicates::sizeNotMultipleOf(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isScalar() && QueryTy.getSizeInBits() % Size != 0;
+ };
+}
+
+LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isScalar() &&
+ !llvm::has_single_bit<uint32_t>(QueryTy.getSizeInBits());
+ };
+}
+
+LegalityPredicate LegalityPredicates::sizeIs(unsigned TypeIdx, unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].getSizeInBits() == Size;
+ };
+}
+
+LegalityPredicate LegalityPredicates::sameSize(unsigned TypeIdx0,
+ unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() ==
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+}
+
+LegalityPredicate LegalityPredicates::memSizeInBytesNotPow2(unsigned MMOIdx) {
+ return [=](const LegalityQuery &Query) {
+ return !llvm::has_single_bit<uint32_t>(
+ Query.MMODescrs[MMOIdx].MemoryTy.getSizeInBytes());
+ };
+}
+
+LegalityPredicate LegalityPredicates::memSizeNotByteSizePow2(unsigned MMOIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT MemTy = Query.MMODescrs[MMOIdx].MemoryTy;
+ return !MemTy.isByteSized() ||
+ !llvm::has_single_bit<uint32_t>(MemTy.getSizeInBytes());
+ };
+}
+
+LegalityPredicate LegalityPredicates::numElementsNotPow2(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && !isPowerOf2_32(QueryTy.getNumElements());
+ };
+}
+
+LegalityPredicate LegalityPredicates::atomicOrderingAtLeastOrStrongerThan(
+ unsigned MMOIdx, AtomicOrdering Ordering) {
+ return [=](const LegalityQuery &Query) {
+ return isAtLeastOrStrongerThan(Query.MMODescrs[MMOIdx].Ordering, Ordering);
+ };
+}
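
A short sketch (illustrative, not from this patch) of how a target's ruleset typically consumes these predicate factories. MyPredicateRules is a hypothetical class; the builder methods used (getActionDefinitionsBuilder, legalFor, legalIf, clampScalar) are the standard LegalizerInfo API these predicates plug into, and legalFor is itself built on typeInSet().

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;
using namespace LegalityPredicates;

// Hypothetical target ruleset; real code lives in a <Target>LegalizerInfo
// constructor.
struct MyPredicateRules : public LegalizerInfo {
  MyPredicateRules() {
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);
    const LLT V4S32 = LLT::fixed_vector(4, 32);
    getActionDefinitionsBuilder(TargetOpcode::G_AND)
        .legalFor({S32, S64, V4S32})          // built on typeInSet()
        .legalIf(all(isVector(0), elementTypeIs(0, S32)))
        .clampScalar(0, S32, S64);
    // Targets finish by computing the legacy tables.
    getLegacyLegalizerInfo().computeTables();
  }
};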
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
new file mode 100644
index 000000000000..25c1db91b05d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizeMutations.cpp
@@ -0,0 +1,112 @@
+//===- lib/CodeGen/GlobalISel/LegalizeMutations.cpp - Mutations -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A library of mutation factories to use for LegalizeMutation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+using namespace llvm;
+
+LegalizeMutation LegalizeMutations::changeTo(unsigned TypeIdx, LLT Ty) {
+ return
+ [=](const LegalityQuery &Query) { return std::make_pair(TypeIdx, Ty); };
+}
+
+LegalizeMutation LegalizeMutations::changeTo(unsigned TypeIdx,
+ unsigned FromTypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ return std::make_pair(TypeIdx, Query.Types[FromTypeIdx]);
+ };
+}
+
+LegalizeMutation LegalizeMutations::changeElementTo(unsigned TypeIdx,
+ unsigned FromTypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT OldTy = Query.Types[TypeIdx];
+ const LLT NewTy = Query.Types[FromTypeIdx];
+ return std::make_pair(TypeIdx, OldTy.changeElementType(NewTy));
+ };
+}
+
+LegalizeMutation LegalizeMutations::changeElementTo(unsigned TypeIdx,
+ LLT NewEltTy) {
+ return [=](const LegalityQuery &Query) {
+ const LLT OldTy = Query.Types[TypeIdx];
+ return std::make_pair(TypeIdx, OldTy.changeElementType(NewEltTy));
+ };
+}
+
+LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
+ unsigned FromTypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT OldTy = Query.Types[TypeIdx];
+ const LLT NewTy = Query.Types[FromTypeIdx];
+ ElementCount NewEltCount =
+ NewTy.isVector() ? NewTy.getElementCount() : ElementCount::getFixed(1);
+ return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount));
+ };
+}
+
+LegalizeMutation LegalizeMutations::changeElementCountTo(unsigned TypeIdx,
+ LLT NewEltTy) {
+ return [=](const LegalityQuery &Query) {
+ const LLT OldTy = Query.Types[TypeIdx];
+ ElementCount NewEltCount = NewEltTy.isVector() ? NewEltTy.getElementCount()
+ : ElementCount::getFixed(1);
+ return std::make_pair(TypeIdx, OldTy.changeElementCount(NewEltCount));
+ };
+}
+
+LegalizeMutation LegalizeMutations::changeElementSizeTo(unsigned TypeIdx,
+ unsigned FromTypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT OldTy = Query.Types[TypeIdx];
+ const LLT NewTy = Query.Types[FromTypeIdx];
+ const LLT NewEltTy = LLT::scalar(NewTy.getScalarSizeInBits());
+ return std::make_pair(TypeIdx, OldTy.changeElementType(NewEltTy));
+ };
+}
+
+LegalizeMutation LegalizeMutations::widenScalarOrEltToNextPow2(unsigned TypeIdx,
+ unsigned Min) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ unsigned NewEltSizeInBits =
+ std::max(1u << Log2_32_Ceil(Ty.getScalarSizeInBits()), Min);
+ return std::make_pair(TypeIdx, Ty.changeElementSize(NewEltSizeInBits));
+ };
+}
+
+LegalizeMutation
+LegalizeMutations::widenScalarOrEltToNextMultipleOf(unsigned TypeIdx,
+ unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ unsigned NewEltSizeInBits = alignTo(Ty.getScalarSizeInBits(), Size);
+ return std::make_pair(TypeIdx, Ty.changeElementSize(NewEltSizeInBits));
+ };
+}
+
+LegalizeMutation LegalizeMutations::moreElementsToNextPow2(unsigned TypeIdx,
+ unsigned Min) {
+ return [=](const LegalityQuery &Query) {
+ const LLT VecTy = Query.Types[TypeIdx];
+ unsigned NewNumElements =
+ std::max(1u << Log2_32_Ceil(VecTy.getNumElements()), Min);
+ return std::make_pair(
+ TypeIdx, LLT::fixed_vector(NewNumElements, VecTy.getElementType()));
+ };
+}
+
+LegalizeMutation LegalizeMutations::scalarize(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ return std::make_pair(TypeIdx, Query.Types[TypeIdx].getElementType());
+ };
+}
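
A companion sketch (illustrative, not from this patch): a mutation is always paired with a predicate to form a conditional rule, here through the standard widenScalarIf builder method. MyWideningRules is a hypothetical class standing in for a target's LegalizerInfo subclass.

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

// Hypothetical target ruleset; real code lives in a <Target>LegalizerInfo
// constructor.
struct MyWideningRules : public LegalizerInfo {
  MyWideningRules() {
    const LLT S32 = LLT::scalar(32);
    getActionDefinitionsBuilder(TargetOpcode::G_MUL)
        // Widen any scalar narrower than 32 bits to exactly 32 bits.
        .widenScalarIf(LegalityPredicates::scalarNarrowerThan(0, 32),
                       LegalizeMutations::changeTo(0, S32))
        // Then round odd-sized scalars up to the next power of two, but
        // never below 32 bits.
        .widenScalarIf(LegalityPredicates::sizeNotPow2(0),
                       LegalizeMutations::widenScalarOrEltToNextPow2(0, 32));
    getLegacyLegalizerInfo().computeTables();
  }
};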
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
new file mode 100644
index 000000000000..aecbe0b7604c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -0,0 +1,385 @@
+//===-- llvm/CodeGen/GlobalISel/Legalizer.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the Legalizer pass, which drives the
+/// LegalizerHelper to legalize individual instructions during the primary
+/// legalization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GISelWorkList.h"
+#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+
+#define DEBUG_TYPE "legalizer"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ EnableCSEInLegalizer("enable-cse-in-legalizer",
+ cl::desc("Should enable CSE in Legalizer"),
+ cl::Optional, cl::init(false));
+
+// This is a temporary hack; it should be removed soon.
+static cl::opt<bool> AllowGInsertAsArtifact(
+ "allow-ginsert-as-artifact",
+ cl::desc("Allow G_INSERT to be considered an artifact. Hack around AMDGPU "
+ "test infinite loops."),
+ cl::Optional, cl::init(true));
+
+enum class DebugLocVerifyLevel {
+ None,
+ Legalizations,
+ LegalizationsAndArtifactCombiners,
+};
+#ifndef NDEBUG
+static cl::opt<DebugLocVerifyLevel> VerifyDebugLocs(
+ "verify-legalizer-debug-locs",
+ cl::desc("Verify that debug locations are handled"),
+ cl::values(
+ clEnumValN(DebugLocVerifyLevel::None, "none", "No verification"),
+ clEnumValN(DebugLocVerifyLevel::Legalizations, "legalizations",
+ "Verify legalizations"),
+ clEnumValN(DebugLocVerifyLevel::LegalizationsAndArtifactCombiners,
+ "legalizations+artifactcombiners",
+ "Verify legalizations and artifact combines")),
+ cl::init(DebugLocVerifyLevel::Legalizations));
+#else
+// Always disable it for release builds by preventing the observer from being
+// installed.
+static const DebugLocVerifyLevel VerifyDebugLocs = DebugLocVerifyLevel::None;
+#endif
+
+char Legalizer::ID = 0;
+INITIALIZE_PASS_BEGIN(Legalizer, DEBUG_TYPE,
+                      "Legalize a function's Machine IR", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(Legalizer, DEBUG_TYPE,
+                    "Legalize a function's Machine IR", false,
+ false)
+
+Legalizer::Legalizer() : MachineFunctionPass(ID) { }
+
+void Legalizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<GISelCSEAnalysisWrapperPass>();
+ AU.addPreserved<GISelCSEAnalysisWrapperPass>();
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void Legalizer::init(MachineFunction &MF) {
+}
+
+static bool isArtifact(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_MERGE_VALUES:
+ case TargetOpcode::G_UNMERGE_VALUES:
+ case TargetOpcode::G_CONCAT_VECTORS:
+ case TargetOpcode::G_BUILD_VECTOR:
+ case TargetOpcode::G_EXTRACT:
+ return true;
+ case TargetOpcode::G_INSERT:
+ return AllowGInsertAsArtifact;
+ }
+}
+using InstListTy = GISelWorkList<256>;
+using ArtifactListTy = GISelWorkList<128>;
+
+namespace {
+class LegalizerWorkListManager : public GISelChangeObserver {
+ InstListTy &InstList;
+ ArtifactListTy &ArtifactList;
+#ifndef NDEBUG
+ SmallVector<MachineInstr *, 4> NewMIs;
+#endif
+
+public:
+ LegalizerWorkListManager(InstListTy &Insts, ArtifactListTy &Arts)
+ : InstList(Insts), ArtifactList(Arts) {}
+
+ void createdOrChangedInstr(MachineInstr &MI) {
+ // Only legalize pre-isel generic instructions.
+    // The legalization process can generate target-specific pseudo
+    // instructions with generic types. Don't record them.
+ if (isPreISelGenericOpcode(MI.getOpcode())) {
+ if (isArtifact(MI))
+ ArtifactList.insert(&MI);
+ else
+ InstList.insert(&MI);
+ }
+ }
+
+ void createdInstr(MachineInstr &MI) override {
+ LLVM_DEBUG(NewMIs.push_back(&MI));
+ createdOrChangedInstr(MI);
+ }
+
+ void printNewInstrs() {
+ LLVM_DEBUG({
+ for (const auto *MI : NewMIs)
+ dbgs() << ".. .. New MI: " << *MI;
+ NewMIs.clear();
+ });
+ }
+
+ void erasingInstr(MachineInstr &MI) override {
+ LLVM_DEBUG(dbgs() << ".. .. Erasing: " << MI);
+ InstList.remove(&MI);
+ ArtifactList.remove(&MI);
+ }
+
+ void changingInstr(MachineInstr &MI) override {
+ LLVM_DEBUG(dbgs() << ".. .. Changing MI: " << MI);
+ }
+
+ void changedInstr(MachineInstr &MI) override {
+ // When insts change, we want to revisit them to legalize them again.
+ // We'll consider them the same as created.
+ LLVM_DEBUG(dbgs() << ".. .. Changed MI: " << MI);
+ createdOrChangedInstr(MI);
+ }
+};
+} // namespace
+
+Legalizer::MFResult
+Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
+ ArrayRef<GISelChangeObserver *> AuxObservers,
+ LostDebugLocObserver &LocObserver,
+ MachineIRBuilder &MIRBuilder,
+ GISelKnownBits *KB) {
+ MIRBuilder.setMF(MF);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Populate worklists.
+ InstListTy InstList;
+ ArtifactListTy ArtifactList;
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ // Perform legalization bottom up so we can DCE as we legalize.
+ // Traverse BB in RPOT and within each basic block, add insts top down,
+ // so when we pop_back_val in the legalization process, we traverse bottom-up.
+ for (auto *MBB : RPOT) {
+ if (MBB->empty())
+ continue;
+ for (MachineInstr &MI : *MBB) {
+ // Only legalize pre-isel generic instructions: others don't have types
+ // and are assumed to be legal.
+ if (!isPreISelGenericOpcode(MI.getOpcode()))
+ continue;
+ if (isArtifact(MI))
+ ArtifactList.deferred_insert(&MI);
+ else
+ InstList.deferred_insert(&MI);
+ }
+ }
+ ArtifactList.finalize();
+ InstList.finalize();
+
+ // This observer keeps the worklists updated.
+ LegalizerWorkListManager WorkListObserver(InstList, ArtifactList);
+  // We want the WorkListObserver as well as all the auxiliary observers (e.g.
+  // CSEInfo) to observe all changes, so use the wrapper observer.
+ GISelObserverWrapper WrapperObserver(&WorkListObserver);
+ for (GISelChangeObserver *Observer : AuxObservers)
+ WrapperObserver.addObserver(Observer);
+
+ // Now install the observer as the delegate to MF.
+ // This will keep all the observers notified about new insertions/deletions.
+ RAIIMFObsDelInstaller Installer(MF, WrapperObserver);
+ LegalizerHelper Helper(MF, LI, WrapperObserver, MIRBuilder, KB);
+ LegalizationArtifactCombiner ArtCombiner(MIRBuilder, MRI, LI);
+ bool Changed = false;
+ SmallVector<MachineInstr *, 128> RetryList;
+ do {
+ LLVM_DEBUG(dbgs() << "=== New Iteration ===\n");
+ assert(RetryList.empty() && "Expected no instructions in RetryList");
+ unsigned NumArtifacts = ArtifactList.size();
+ while (!InstList.empty()) {
+ MachineInstr &MI = *InstList.pop_back_val();
+ assert(isPreISelGenericOpcode(MI.getOpcode()) &&
+ "Expecting generic opcode");
+ if (isTriviallyDead(MI, MRI)) {
+ salvageDebugInfo(MRI, MI);
+ eraseInstr(MI, MRI, &LocObserver);
+ continue;
+ }
+
+ // Do the legalization for this instruction.
+ auto Res = Helper.legalizeInstrStep(MI, LocObserver);
+ // Error out if we couldn't legalize this instruction. We may want to
+ // fall back to DAG ISel instead in the future.
+ if (Res == LegalizerHelper::UnableToLegalize) {
+ // Move illegal artifacts to RetryList instead of aborting because
+ // legalizing InstList may generate artifacts that allow
+        // ArtifactCombiner to combine them away.
+ if (isArtifact(MI)) {
+ LLVM_DEBUG(dbgs() << ".. Not legalized, moving to artifacts retry\n");
+ assert(NumArtifacts == 0 &&
+ "Artifacts are only expected in instruction list starting the "
+ "second iteration, but each iteration starting second must "
+ "start with an empty artifacts list");
+ (void)NumArtifacts;
+ RetryList.push_back(&MI);
+ continue;
+ }
+ Helper.MIRBuilder.stopObservingChanges();
+ return {Changed, &MI};
+ }
+ WorkListObserver.printNewInstrs();
+ LocObserver.checkpoint();
+ Changed |= Res == LegalizerHelper::Legalized;
+ }
+ // Try to combine the instructions in RetryList again if there
+ // are new artifacts. If not, stop legalizing.
+ if (!RetryList.empty()) {
+ if (!ArtifactList.empty()) {
+ while (!RetryList.empty())
+ ArtifactList.insert(RetryList.pop_back_val());
+ } else {
+ LLVM_DEBUG(dbgs() << "No new artifacts created, not retrying!\n");
+ Helper.MIRBuilder.stopObservingChanges();
+ return {Changed, RetryList.front()};
+ }
+ }
+ LocObserver.checkpoint();
+ while (!ArtifactList.empty()) {
+ MachineInstr &MI = *ArtifactList.pop_back_val();
+ assert(isPreISelGenericOpcode(MI.getOpcode()) &&
+ "Expecting generic opcode");
+ if (isTriviallyDead(MI, MRI)) {
+ salvageDebugInfo(MRI, MI);
+ eraseInstr(MI, MRI, &LocObserver);
+ continue;
+ }
+ SmallVector<MachineInstr *, 4> DeadInstructions;
+ LLVM_DEBUG(dbgs() << "Trying to combine: " << MI);
+ if (ArtCombiner.tryCombineInstruction(MI, DeadInstructions,
+ WrapperObserver)) {
+ WorkListObserver.printNewInstrs();
+ eraseInstrs(DeadInstructions, MRI, &LocObserver);
+ LocObserver.checkpoint(
+ VerifyDebugLocs ==
+ DebugLocVerifyLevel::LegalizationsAndArtifactCombiners);
+ Changed = true;
+ continue;
+ }
+ // If this was not an artifact (that could be combined away), this might
+ // need special handling. Add it to InstList, so when it's processed
+ // there, it has to be legal or specially handled.
+ else {
+ LLVM_DEBUG(dbgs() << ".. Not combined, moving to instructions list\n");
+ InstList.insert(&MI);
+ }
+ }
+ } while (!InstList.empty());
+
+ return {Changed, /*FailedOn*/ nullptr};
+}
+
+bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
+  // If the ISel pipeline failed, do not bother running this pass.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ LLVM_DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n');
+ init(MF);
+ const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ GISelCSEAnalysisWrapper &Wrapper =
+ getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+ MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
+
+ std::unique_ptr<MachineIRBuilder> MIRBuilder;
+ GISelCSEInfo *CSEInfo = nullptr;
+ bool EnableCSE = EnableCSEInLegalizer.getNumOccurrences()
+ ? EnableCSEInLegalizer
+ : TPC.isGISelCSEEnabled();
+ if (EnableCSE) {
+ MIRBuilder = std::make_unique<CSEMIRBuilder>();
+ CSEInfo = &Wrapper.get(TPC.getCSEConfig());
+ MIRBuilder->setCSEInfo(CSEInfo);
+ } else
+ MIRBuilder = std::make_unique<MachineIRBuilder>();
+
+ SmallVector<GISelChangeObserver *, 1> AuxObservers;
+ if (EnableCSE && CSEInfo) {
+ // We want CSEInfo in addition to WorkListObserver to observe all changes.
+ AuxObservers.push_back(CSEInfo);
+ }
+ assert(!CSEInfo || !errorToBool(CSEInfo->verify()));
+ LostDebugLocObserver LocObserver(DEBUG_TYPE);
+ if (VerifyDebugLocs > DebugLocVerifyLevel::None)
+ AuxObservers.push_back(&LocObserver);
+
+ // This allows Known Bits Analysis in the legalizer.
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+
+ const LegalizerInfo &LI = *MF.getSubtarget().getLegalizerInfo();
+ MFResult Result = legalizeMachineFunction(MF, LI, AuxObservers, LocObserver,
+ *MIRBuilder, KB);
+
+ if (Result.FailedOn) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
+ "unable to legalize instruction", *Result.FailedOn);
+ return false;
+ }
+
+ if (LocObserver.getNumLostDebugLocs()) {
+ MachineOptimizationRemarkMissed R("gisel-legalize", "LostDebugLoc",
+ MF.getFunction().getSubprogram(),
+ /*MBB=*/&*MF.begin());
+ R << "lost "
+ << ore::NV("NumLostDebugLocs", LocObserver.getNumLostDebugLocs())
+ << " debug locations during pass";
+ reportGISelWarning(MF, TPC, MORE, R);
+ // Example remark:
+ // --- !Missed
+ // Pass: gisel-legalize
+ // Name: GISelFailure
+ // DebugLoc: { File: '.../legalize-urem.mir', Line: 1, Column: 0 }
+ // Function: test_urem_s32
+ // Args:
+ // - String: 'lost '
+ // - NumLostDebugLocs: '1'
+ // - String: ' debug locations during pass'
+ // ...
+ }
+
+ // If for some reason CSE was not enabled, make sure that we invalidate the
+ // CSEInfo object (as we currently declare that the analysis is preserved).
+  // The next time get() is called on the wrapper, it will force recomputation
+  // of the analysis.
+ if (!EnableCSE)
+ Wrapper.setComputed(false);
+ return Result.Changed;
+}
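
For orientation (illustrative, not part of this patch): this pass is normally scheduled from a target's TargetPassConfig. MyTargetPassConfig below is a hypothetical subclass; addLegalizeMachineIR() is the standard GlobalISel hook it overrides. The cl::opt flags declared near the top of this file (-enable-cse-in-legalizer, -allow-ginsert-as-artifact, and, in asserts builds, -verify-legalizer-debug-locs) tune the pass from the command line.

#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/TargetPassConfig.h"

namespace {
class MyTargetPassConfig : public llvm::TargetPassConfig {
public:
  MyTargetPassConfig(llvm::LLVMTargetMachine &TM, llvm::PassManagerBase &PM)
      : llvm::TargetPassConfig(TM, PM) {}

  // Standard GlobalISel hook: insert the Legalizer between the IRTranslator
  // and RegBankSelect stages of the pipeline.
  bool addLegalizeMachineIR() override {
    addPass(new llvm::Legalizer());
    return false; // false means "no error"; the pipeline continues.
  }
};
} // namespace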
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
new file mode 100644
index 000000000000..f0da0d88140f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -0,0 +1,8119 @@
+//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the LegalizerHelper class, which legalizes
+/// individual instructions; the Legalizer wrapper pass that drives the
+/// primary legalization lives in Legalizer.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <numeric>
+#include <optional>
+
+#define DEBUG_TYPE "legalizer"
+
+using namespace llvm;
+using namespace LegalizeActions;
+using namespace MIPatternMatch;
+
+/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
+///
+/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
+/// with any leftover piece as type \p LeftoverTy
+///
+/// Returns -1 in the first element of the pair if the breakdown is not
+/// satisfiable.
+static std::pair<int, int>
+getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
+ assert(!LeftoverTy.isValid() && "this is an out argument");
+
+ unsigned Size = OrigTy.getSizeInBits();
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ unsigned NumParts = Size / NarrowSize;
+ unsigned LeftoverSize = Size - NumParts * NarrowSize;
+ assert(Size > NarrowSize);
+
+ if (LeftoverSize == 0)
+ return {NumParts, 0};
+
+ if (NarrowTy.isVector()) {
+ unsigned EltSize = OrigTy.getScalarSizeInBits();
+ if (LeftoverSize % EltSize != 0)
+ return {-1, -1};
+ LeftoverTy = LLT::scalarOrVector(
+ ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
+ } else {
+ LeftoverTy = LLT::scalar(LeftoverSize);
+ }
+
+ int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
+ return std::make_pair(NumParts, NumLeftover);
+}
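
A quick worked example of the breakdown above (illustrative, not from the patch): splitting s96 by s32 yields NumParts = 3 with no leftover, i.e. {3, 0}; splitting s96 by s64 yields one s64 part plus an s32 leftover, i.e. {1, 1} with LeftoverTy = s32.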
+
+static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
+  if (!Ty.isScalar())
+ return nullptr;
+
+ switch (Ty.getSizeInBits()) {
+ case 16:
+ return Type::getHalfTy(Ctx);
+ case 32:
+ return Type::getFloatTy(Ctx);
+ case 64:
+ return Type::getDoubleTy(Ctx);
+ case 80:
+ return Type::getX86_FP80Ty(Ctx);
+ case 128:
+ return Type::getFP128Ty(Ctx);
+ default:
+ return nullptr;
+ }
+}
+
+LegalizerHelper::LegalizerHelper(MachineFunction &MF,
+ GISelChangeObserver &Observer,
+ MachineIRBuilder &Builder)
+ : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
+ LI(*MF.getSubtarget().getLegalizerInfo()),
+ TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {}
+
+LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
+ GISelChangeObserver &Observer,
+ MachineIRBuilder &B, GISelKnownBits *KB)
+ : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
+ TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
+ LostDebugLocObserver &LocObserver) {
+ LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
+
+ MIRBuilder.setInstrAndDebugLoc(MI);
+
+ if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
+ MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
+ return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
+ auto Step = LI.getAction(MI, MRI);
+ switch (Step.Action) {
+ case Legal:
+ LLVM_DEBUG(dbgs() << ".. Already legal\n");
+ return AlreadyLegal;
+ case Libcall:
+ LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
+ return libcall(MI, LocObserver);
+ case NarrowScalar:
+ LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
+ return narrowScalar(MI, Step.TypeIdx, Step.NewType);
+ case WidenScalar:
+ LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
+ return widenScalar(MI, Step.TypeIdx, Step.NewType);
+ case Bitcast:
+ LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
+ return bitcast(MI, Step.TypeIdx, Step.NewType);
+ case Lower:
+ LLVM_DEBUG(dbgs() << ".. Lower\n");
+ return lower(MI, Step.TypeIdx, Step.NewType);
+ case FewerElements:
+ LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
+ return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
+ case MoreElements:
+ LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
+ return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
+ case Custom:
+ LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
+ return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
+ default:
+ LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
+ return UnableToLegalize;
+ }
+}
+
+void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
+ SmallVectorImpl<Register> &VRegs) {
+ for (int i = 0; i < NumParts; ++i)
+ VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
+ MIRBuilder.buildUnmerge(VRegs, Reg);
+}
+
+bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
+ LLT MainTy, LLT &LeftoverTy,
+ SmallVectorImpl<Register> &VRegs,
+ SmallVectorImpl<Register> &LeftoverRegs) {
+ assert(!LeftoverTy.isValid() && "this is an out argument");
+
+ unsigned RegSize = RegTy.getSizeInBits();
+ unsigned MainSize = MainTy.getSizeInBits();
+ unsigned NumParts = RegSize / MainSize;
+ unsigned LeftoverSize = RegSize - NumParts * MainSize;
+
+ // Use an unmerge when possible.
+ if (LeftoverSize == 0) {
+ for (unsigned I = 0; I < NumParts; ++I)
+ VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
+ MIRBuilder.buildUnmerge(VRegs, Reg);
+ return true;
+ }
+
+ // Perform irregular split. Leftover is last element of RegPieces.
+ if (MainTy.isVector()) {
+ SmallVector<Register, 8> RegPieces;
+ extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
+ for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
+ VRegs.push_back(RegPieces[i]);
+ LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
+ LeftoverTy = MRI.getType(LeftoverRegs[0]);
+ return true;
+ }
+
+ LeftoverTy = LLT::scalar(LeftoverSize);
+ // For irregular sizes, extract the individual parts.
+ for (unsigned I = 0; I != NumParts; ++I) {
+ Register NewReg = MRI.createGenericVirtualRegister(MainTy);
+ VRegs.push_back(NewReg);
+ MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
+ }
+
+ for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
+ Offset += LeftoverSize) {
+ Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
+ LeftoverRegs.push_back(NewReg);
+ MIRBuilder.buildExtract(NewReg, Reg, Offset);
+ }
+
+ return true;
+}
+
+void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
+ SmallVectorImpl<Register> &VRegs) {
+ LLT RegTy = MRI.getType(Reg);
+ assert(RegTy.isVector() && "Expected a vector type");
+
+ LLT EltTy = RegTy.getElementType();
+ LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
+ unsigned RegNumElts = RegTy.getNumElements();
+ unsigned LeftoverNumElts = RegNumElts % NumElts;
+ unsigned NumNarrowTyPieces = RegNumElts / NumElts;
+
+ // Perfect split without leftover
+ if (LeftoverNumElts == 0)
+ return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);
+
+  // Irregular split. Provide direct access to all elements for the artifact
+  // combiner by unmerging to individual elements, then build vectors with
+  // NumElts elements each; the remaining element(s) become the leftover.
+ SmallVector<Register, 8> Elts;
+ extractParts(Reg, EltTy, RegNumElts, Elts);
+
+ unsigned Offset = 0;
+ // Requested sub-vectors of NarrowTy.
+ for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
+ ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
+ VRegs.push_back(MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
+ }
+
+ // Leftover element(s).
+ if (LeftoverNumElts == 1) {
+ VRegs.push_back(Elts[Offset]);
+ } else {
+ LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
+ ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
+ VRegs.push_back(
+ MIRBuilder.buildMergeLikeInstr(LeftoverTy, Pieces).getReg(0));
+ }
+}
+
+void LegalizerHelper::insertParts(Register DstReg,
+ LLT ResultTy, LLT PartTy,
+ ArrayRef<Register> PartRegs,
+ LLT LeftoverTy,
+ ArrayRef<Register> LeftoverRegs) {
+ if (!LeftoverTy.isValid()) {
+ assert(LeftoverRegs.empty());
+
+ if (!ResultTy.isVector()) {
+ MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
+ return;
+ }
+
+ if (PartTy.isVector())
+ MIRBuilder.buildConcatVectors(DstReg, PartRegs);
+ else
+ MIRBuilder.buildBuildVector(DstReg, PartRegs);
+ return;
+ }
+
+ // Merge sub-vectors with different number of elements and insert into DstReg.
+ if (ResultTy.isVector()) {
+ assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
+ SmallVector<Register, 8> AllRegs;
+ for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
+ AllRegs.push_back(Reg);
+ return mergeMixedSubvectors(DstReg, AllRegs);
+ }
+
+ SmallVector<Register> GCDRegs;
+ LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
+ for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
+ extractGCDType(GCDRegs, GCDTy, PartReg);
+ LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
+ buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
+}
+
+void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
+ Register Reg) {
+ LLT Ty = MRI.getType(Reg);
+ SmallVector<Register, 8> RegElts;
+ extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
+ Elts.append(RegElts);
+}
+
+/// Merge \p PartRegs with different types into \p DstReg.
+void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
+ ArrayRef<Register> PartRegs) {
+ SmallVector<Register, 8> AllElts;
+ for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
+ appendVectorElts(AllElts, PartRegs[i]);
+
+ Register Leftover = PartRegs[PartRegs.size() - 1];
+ if (MRI.getType(Leftover).isScalar())
+ AllElts.push_back(Leftover);
+ else
+ appendVectorElts(AllElts, Leftover);
+
+ MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
+}
+
+/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
+static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
+ const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
+
+ const int StartIdx = Regs.size();
+ const int NumResults = MI.getNumOperands() - 1;
+ Regs.resize(Regs.size() + NumResults);
+ for (int I = 0; I != NumResults; ++I)
+ Regs[StartIdx + I] = MI.getOperand(I).getReg();
+}
+
+void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
+ LLT GCDTy, Register SrcReg) {
+ LLT SrcTy = MRI.getType(SrcReg);
+ if (SrcTy == GCDTy) {
+ // If the source already evenly divides the result type, we don't need to do
+ // anything.
+ Parts.push_back(SrcReg);
+ } else {
+ // Need to split into common type sized pieces.
+ auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
+ getUnmergeResults(Parts, *Unmerge);
+ }
+}
+
+LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
+ LLT NarrowTy, Register SrcReg) {
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
+ extractGCDType(Parts, GCDTy, SrcReg);
+ return GCDTy;
+}
+
+LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
+ SmallVectorImpl<Register> &VRegs,
+ unsigned PadStrategy) {
+ LLT LCMTy = getLCMType(DstTy, NarrowTy);
+
+ int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
+ int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
+ int NumOrigSrc = VRegs.size();
+
+ Register PadReg;
+
+ // Get a value we can use to pad the source value if the sources won't evenly
+ // cover the result type.
+ if (NumOrigSrc < NumParts * NumSubParts) {
+ if (PadStrategy == TargetOpcode::G_ZEXT)
+ PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
+ else if (PadStrategy == TargetOpcode::G_ANYEXT)
+ PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
+ else {
+ assert(PadStrategy == TargetOpcode::G_SEXT);
+
+ // Shift the sign bit of the low register through the high register.
+ auto ShiftAmt =
+ MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
+ PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
+ }
+ }
+
+ // Registers for the final merge to be produced.
+ SmallVector<Register, 4> Remerge(NumParts);
+
+ // Registers needed for intermediate merges, which will be merged into a
+ // source for Remerge.
+ SmallVector<Register, 4> SubMerge(NumSubParts);
+
+ // Once we've fully read off the end of the original source bits, we can reuse
+ // the same high bits for remaining padding elements.
+ Register AllPadReg;
+
+ // Build merges to the LCM type to cover the original result type.
+ for (int I = 0; I != NumParts; ++I) {
+ bool AllMergePartsArePadding = true;
+
+ // Build the requested merges to the requested type.
+ for (int J = 0; J != NumSubParts; ++J) {
+ int Idx = I * NumSubParts + J;
+ if (Idx >= NumOrigSrc) {
+ SubMerge[J] = PadReg;
+ continue;
+ }
+
+ SubMerge[J] = VRegs[Idx];
+
+ // There are meaningful bits here we can't reuse later.
+ AllMergePartsArePadding = false;
+ }
+
+ // If we've filled up a complete piece with padding bits, we can directly
+ // emit the natural sized constant if applicable, rather than a merge of
+ // smaller constants.
+ if (AllMergePartsArePadding && !AllPadReg) {
+ if (PadStrategy == TargetOpcode::G_ANYEXT)
+ AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
+ else if (PadStrategy == TargetOpcode::G_ZEXT)
+ AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
+
+ // If this is a sign extension, we can't materialize a trivial constant
+ // with the right type and have to produce a merge.
+ }
+
+ if (AllPadReg) {
+ // Avoid creating additional instructions if we're just adding additional
+ // copies of padding bits.
+ Remerge[I] = AllPadReg;
+ continue;
+ }
+
+ if (NumSubParts == 1)
+ Remerge[I] = SubMerge[0];
+ else
+ Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);
+
+ // In the sign extend padding case, re-use the first all-signbit merge.
+ if (AllMergePartsArePadding && !AllPadReg)
+ AllPadReg = Remerge[I];
+ }
+
+ VRegs = std::move(Remerge);
+ return LCMTy;
+}
+
+void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
+ ArrayRef<Register> RemergeRegs) {
+ LLT DstTy = MRI.getType(DstReg);
+
+ // Create the merge to the widened source, and extract the relevant bits into
+ // the result.
+
+ if (DstTy == LCMTy) {
+ MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
+ return;
+ }
+
+ auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
+ if (DstTy.isScalar() && LCMTy.isScalar()) {
+ MIRBuilder.buildTrunc(DstReg, Remerge);
+ return;
+ }
+
+ if (LCMTy.isVector()) {
+ unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
+ SmallVector<Register, 8> UnmergeDefs(NumDefs);
+ UnmergeDefs[0] = DstReg;
+ for (unsigned I = 1; I != NumDefs; ++I)
+ UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
+
+ MIRBuilder.buildUnmerge(UnmergeDefs,
+ MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
+ return;
+ }
+
+ llvm_unreachable("unhandled case");
+}
+
+static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
+#define RTLIBCASE_INT(LibcallPrefix) \
+ do { \
+ switch (Size) { \
+ case 32: \
+ return RTLIB::LibcallPrefix##32; \
+ case 64: \
+ return RTLIB::LibcallPrefix##64; \
+ case 128: \
+ return RTLIB::LibcallPrefix##128; \
+ default: \
+ llvm_unreachable("unexpected size"); \
+ } \
+ } while (0)
+
+#define RTLIBCASE(LibcallPrefix) \
+ do { \
+ switch (Size) { \
+ case 32: \
+ return RTLIB::LibcallPrefix##32; \
+ case 64: \
+ return RTLIB::LibcallPrefix##64; \
+ case 80: \
+ return RTLIB::LibcallPrefix##80; \
+ case 128: \
+ return RTLIB::LibcallPrefix##128; \
+ default: \
+ llvm_unreachable("unexpected size"); \
+ } \
+ } while (0)
+
+ switch (Opcode) {
+ case TargetOpcode::G_MUL:
+ RTLIBCASE_INT(MUL_I);
+ case TargetOpcode::G_SDIV:
+ RTLIBCASE_INT(SDIV_I);
+ case TargetOpcode::G_UDIV:
+ RTLIBCASE_INT(UDIV_I);
+ case TargetOpcode::G_SREM:
+ RTLIBCASE_INT(SREM_I);
+ case TargetOpcode::G_UREM:
+ RTLIBCASE_INT(UREM_I);
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+ RTLIBCASE_INT(CTLZ_I);
+ case TargetOpcode::G_FADD:
+ RTLIBCASE(ADD_F);
+ case TargetOpcode::G_FSUB:
+ RTLIBCASE(SUB_F);
+ case TargetOpcode::G_FMUL:
+ RTLIBCASE(MUL_F);
+ case TargetOpcode::G_FDIV:
+ RTLIBCASE(DIV_F);
+ case TargetOpcode::G_FEXP:
+ RTLIBCASE(EXP_F);
+ case TargetOpcode::G_FEXP2:
+ RTLIBCASE(EXP2_F);
+ case TargetOpcode::G_FREM:
+ RTLIBCASE(REM_F);
+ case TargetOpcode::G_FPOW:
+ RTLIBCASE(POW_F);
+ case TargetOpcode::G_FMA:
+ RTLIBCASE(FMA_F);
+ case TargetOpcode::G_FSIN:
+ RTLIBCASE(SIN_F);
+ case TargetOpcode::G_FCOS:
+ RTLIBCASE(COS_F);
+ case TargetOpcode::G_FLOG10:
+ RTLIBCASE(LOG10_F);
+ case TargetOpcode::G_FLOG:
+ RTLIBCASE(LOG_F);
+ case TargetOpcode::G_FLOG2:
+ RTLIBCASE(LOG2_F);
+ case TargetOpcode::G_FLDEXP:
+ RTLIBCASE(LDEXP_F);
+ case TargetOpcode::G_FCEIL:
+ RTLIBCASE(CEIL_F);
+ case TargetOpcode::G_FFLOOR:
+ RTLIBCASE(FLOOR_F);
+ case TargetOpcode::G_FMINNUM:
+ RTLIBCASE(FMIN_F);
+ case TargetOpcode::G_FMAXNUM:
+ RTLIBCASE(FMAX_F);
+ case TargetOpcode::G_FSQRT:
+ RTLIBCASE(SQRT_F);
+ case TargetOpcode::G_FRINT:
+ RTLIBCASE(RINT_F);
+ case TargetOpcode::G_FNEARBYINT:
+ RTLIBCASE(NEARBYINT_F);
+ case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
+ RTLIBCASE(ROUNDEVEN_F);
+ }
+ llvm_unreachable("Unknown libcall function");
+}
+
+/// True if an instruction is in tail position in its caller. Intended for
+/// legalizing libcalls as tail calls when possible.
+static bool isLibCallInTailPosition(MachineInstr &MI,
+ const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const Function &F = MBB.getParent()->getFunction();
+
+ // Conservatively require the attributes of the call to match those of
+ // the return. Ignore NoAlias and NonNull because they don't affect the
+ // call sequence.
+ AttributeList CallerAttrs = F.getAttributes();
+ if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
+ .removeAttribute(Attribute::NoAlias)
+ .removeAttribute(Attribute::NonNull)
+ .hasAttributes())
+ return false;
+
+ // It's not safe to eliminate the sign / zero extension of the return value.
+ if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
+ CallerAttrs.hasRetAttr(Attribute::SExt))
+ return false;
+
+ // Only tail call if the following instruction is a standard return or if we
+ // have a `thisreturn` callee, and a sequence like:
+ //
+ // G_MEMCPY %0, %1, %2
+ // $x0 = COPY %0
+ // RET_ReallyLR implicit $x0
+ auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
+ if (Next != MBB.instr_end() && Next->isCopy()) {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("unsupported opcode");
+ case TargetOpcode::G_BZERO:
+ return false;
+ case TargetOpcode::G_MEMCPY:
+ case TargetOpcode::G_MEMMOVE:
+ case TargetOpcode::G_MEMSET:
+ break;
+ }
+
+ Register VReg = MI.getOperand(0).getReg();
+ if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
+ return false;
+
+ Register PReg = Next->getOperand(0).getReg();
+ if (!PReg.isPhysical())
+ return false;
+
+ auto Ret = next_nodbg(Next, MBB.instr_end());
+ if (Ret == MBB.instr_end() || !Ret->isReturn())
+ return false;
+
+ if (Ret->getNumImplicitOperands() != 1)
+ return false;
+
+ if (PReg != Ret->getOperand(0).getReg())
+ return false;
+
+ // Skip over the COPY that we just validated.
+ Next = Ret;
+ }
+
+ if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
+ return false;
+
+ return true;
+}
+
+LegalizerHelper::LegalizeResult
+llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
+ const CallLowering::ArgInfo &Result,
+ ArrayRef<CallLowering::ArgInfo> Args,
+ const CallingConv::ID CC) {
+ auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
+
+ CallLowering::CallLoweringInfo Info;
+ Info.CallConv = CC;
+ Info.Callee = MachineOperand::CreateES(Name);
+ Info.OrigRet = Result;
+ std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
+ if (!CLI.lowerCall(MIRBuilder, Info))
+ return LegalizerHelper::UnableToLegalize;
+
+ return LegalizerHelper::Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
+ const CallLowering::ArgInfo &Result,
+ ArrayRef<CallLowering::ArgInfo> Args) {
+ auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+ const char *Name = TLI.getLibcallName(Libcall);
+ const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
+ return createLibcall(MIRBuilder, Name, Result, Args, CC);
+}
+
+// Useful for libcalls where all operands have the same type.
+static LegalizerHelper::LegalizeResult
+simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
+ Type *OpType) {
+ auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
+
+ // FIXME: What does the original arg index mean here?
+ SmallVector<CallLowering::ArgInfo, 3> Args;
+ for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
+ Args.push_back({MO.getReg(), OpType, 0});
+ return createLibcall(MIRBuilder, Libcall,
+ {MI.getOperand(0).getReg(), OpType, 0}, Args);
+}
+
+LegalizerHelper::LegalizeResult
+llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstr &MI, LostDebugLocObserver &LocObserver) {
+ auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
+
+ SmallVector<CallLowering::ArgInfo, 3> Args;
+ // Add all the args, except for the last which is an imm denoting 'tail'.
+ for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
+ Register Reg = MI.getOperand(i).getReg();
+
+    // Need to derive an IR type for call lowering.
+ LLT OpLLT = MRI.getType(Reg);
+ Type *OpTy = nullptr;
+ if (OpLLT.isPointer())
+ OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
+ else
+ OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
+ Args.push_back({Reg, OpTy, 0});
+ }
+
+ auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
+ auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+ RTLIB::Libcall RTLibcall;
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case TargetOpcode::G_BZERO:
+ RTLibcall = RTLIB::BZERO;
+ break;
+ case TargetOpcode::G_MEMCPY:
+ RTLibcall = RTLIB::MEMCPY;
+ Args[0].Flags[0].setReturned();
+ break;
+ case TargetOpcode::G_MEMMOVE:
+ RTLibcall = RTLIB::MEMMOVE;
+ Args[0].Flags[0].setReturned();
+ break;
+ case TargetOpcode::G_MEMSET:
+ RTLibcall = RTLIB::MEMSET;
+ Args[0].Flags[0].setReturned();
+ break;
+ default:
+ llvm_unreachable("unsupported opcode");
+ }
+ const char *Name = TLI.getLibcallName(RTLibcall);
+
+ // Unsupported libcall on the target.
+ if (!Name) {
+ LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
+ << MIRBuilder.getTII().getName(Opc) << "\n");
+ return LegalizerHelper::UnableToLegalize;
+ }
+
+ CallLowering::CallLoweringInfo Info;
+ Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
+ Info.Callee = MachineOperand::CreateES(Name);
+ Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
+ Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
+ isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);
+
+ std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
+ if (!CLI.lowerCall(MIRBuilder, Info))
+ return LegalizerHelper::UnableToLegalize;
+
+ if (Info.LoweredTailCall) {
+ assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
+
+ // Check debug locations before removing the return.
+ LocObserver.checkpoint(true);
+
+ // We must have a return following the call (or debug insts) to get past
+ // isLibCallInTailPosition.
+ do {
+ MachineInstr *Next = MI.getNextNode();
+ assert(Next &&
+ (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
+ "Expected instr following MI to be return or debug inst?");
+ // We lowered a tail call, so the call is now the return from the block.
+ // Delete the old return.
+ Next->eraseFromParent();
+ } while (MI.getNextNode());
+
+ // We expect to lose the debug location from the return.
+ LocObserver.checkpoint(false);
+ }
+
+ return LegalizerHelper::Legalized;
+}
+
+static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
+ Type *FromType) {
+ auto ToMVT = MVT::getVT(ToType);
+ auto FromMVT = MVT::getVT(FromType);
+
+ switch (Opcode) {
+ case TargetOpcode::G_FPEXT:
+ return RTLIB::getFPEXT(FromMVT, ToMVT);
+ case TargetOpcode::G_FPTRUNC:
+ return RTLIB::getFPROUND(FromMVT, ToMVT);
+ case TargetOpcode::G_FPTOSI:
+ return RTLIB::getFPTOSINT(FromMVT, ToMVT);
+ case TargetOpcode::G_FPTOUI:
+ return RTLIB::getFPTOUINT(FromMVT, ToMVT);
+ case TargetOpcode::G_SITOFP:
+ return RTLIB::getSINTTOFP(FromMVT, ToMVT);
+ case TargetOpcode::G_UITOFP:
+ return RTLIB::getUINTTOFP(FromMVT, ToMVT);
+ }
+ llvm_unreachable("Unsupported libcall function");
+}
+
+static LegalizerHelper::LegalizeResult
+conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
+ Type *FromType) {
+ RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
+ return createLibcall(MIRBuilder, Libcall,
+ {MI.getOperand(0).getReg(), ToType, 0},
+ {{MI.getOperand(1).getReg(), FromType, 0}});
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
+ LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
+ unsigned Size = LLTy.getSizeInBits();
+ auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
+
+ switch (MI.getOpcode()) {
+ default:
+ return UnableToLegalize;
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_UREM:
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
+ Type *HLTy = IntegerType::get(Ctx, Size);
+ auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FMA:
+ case TargetOpcode::G_FPOW:
+ case TargetOpcode::G_FREM:
+ case TargetOpcode::G_FCOS:
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FLOG10:
+ case TargetOpcode::G_FLOG:
+ case TargetOpcode::G_FLOG2:
+ case TargetOpcode::G_FLDEXP:
+ case TargetOpcode::G_FEXP:
+ case TargetOpcode::G_FEXP2:
+ case TargetOpcode::G_FCEIL:
+ case TargetOpcode::G_FFLOOR:
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FSQRT:
+ case TargetOpcode::G_FRINT:
+ case TargetOpcode::G_FNEARBYINT:
+ case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
+ Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
+ if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
+ LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
+ return UnableToLegalize;
+ }
+ auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPTRUNC: {
+ Type *FromTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
+ Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
+ if (!FromTy || !ToTy)
+ return UnableToLegalize;
+    LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI: {
+ // FIXME: Support other types
+ unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
+ return UnableToLegalize;
+ LegalizeResult Status = conversionLibcall(
+ MI, MIRBuilder,
+ ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
+ FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP: {
+ // FIXME: Support other types
+ unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
+ return UnableToLegalize;
+ LegalizeResult Status = conversionLibcall(
+ MI, MIRBuilder,
+ ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
+ FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_BZERO:
+ case TargetOpcode::G_MEMCPY:
+ case TargetOpcode::G_MEMMOVE:
+ case TargetOpcode::G_MEMSET: {
+ LegalizeResult Result =
+ createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
+ if (Result != Legalized)
+ return Result;
+ MI.eraseFromParent();
+ return Result;
+ }
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
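A final sketch (illustrative, not from this patch) of how an operation ends up on the libcall path above: a hypothetical target marks it Libcall through the standard libcallFor builder method, after which G_FREM on s32/s64 is expanded by simpleLibcall() via getRTLibDesc() (typically to fmodf/fmod).

#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

// Hypothetical target ruleset; real code lives in a <Target>LegalizerInfo
// constructor.
struct MyLibcallRules : public LegalizerInfo {
  MyLibcallRules() {
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);
    // The action returned for these types is Libcall, which routes the
    // instruction to LegalizerHelper::libcall() above.
    getActionDefinitionsBuilder(TargetOpcode::G_FREM).libcallFor({S32, S64});
    getLegacyLegalizerInfo().computeTables();
  }
};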
+LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
+ unsigned TypeIdx,
+ LLT NarrowTy) {
+ uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ uint64_t NarrowSize = NarrowTy.getSizeInBits();
+
+ switch (MI.getOpcode()) {
+ default:
+ return UnableToLegalize;
+ case TargetOpcode::G_IMPLICIT_DEF: {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ // If SizeOp0 is not an exact multiple of NarrowSize, emit
+ // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
+ // FIXME: Although this would also be legal for the general case, it causes
+ // a lot of regressions in the emitted code (superfluous COPYs, artifact
+ // combines not being hit). This seems to be a problem related to the
+ // artifact combiner.
+ if (SizeOp0 % NarrowSize != 0) {
+ LLT ImplicitTy = NarrowTy;
+ if (DstTy.isVector())
+ ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
+
+ Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
+ MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ int NumParts = SizeOp0 / NarrowSize;
+
+ SmallVector<Register, 2> DstRegs;
+ for (int i = 0; i < NumParts; ++i)
+ DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
+
+ if (DstTy.isVector())
+ MIRBuilder.buildBuildVector(DstReg, DstRegs);
+ else
+ MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_CONSTANT: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ const APInt &Val = MI.getOperand(1).getCImm()->getValue();
+ unsigned TotalSize = Ty.getSizeInBits();
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts = TotalSize / NarrowSize;
+
+ SmallVector<Register, 4> PartRegs;
+ for (int I = 0; I != NumParts; ++I) {
+ unsigned Offset = I * NarrowSize;
+ auto K = MIRBuilder.buildConstant(NarrowTy,
+ Val.lshr(Offset).trunc(NarrowSize));
+ PartRegs.push_back(K.getReg(0));
+ }
+
+ LLT LeftoverTy;
+ unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
+ SmallVector<Register, 1> LeftoverRegs;
+ if (LeftoverBits != 0) {
+ LeftoverTy = LLT::scalar(LeftoverBits);
+ auto K = MIRBuilder.buildConstant(
+ LeftoverTy,
+ Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
+ LeftoverRegs.push_back(K.getReg(0));
+ }
+
+ insertParts(MI.getOperand(0).getReg(),
+ Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ return narrowScalarExt(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_TRUNC: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
+ LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
+ return UnableToLegalize;
+ }
+
+ auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
+ MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ case TargetOpcode::G_FREEZE: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ // Should widen scalar first
+ if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
+ return UnableToLegalize;
+
+ auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
+ SmallVector<Register, 8> Parts;
+ for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
+ Parts.push_back(
+ MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
+ }
+
+ MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SSUBO:
+ case TargetOpcode::G_SADDE:
+ case TargetOpcode::G_SSUBE:
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_UADDE:
+ case TargetOpcode::G_USUBE:
+ return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_UMULH:
+ return narrowScalarMul(MI, NarrowTy);
+ case TargetOpcode::G_EXTRACT:
+ return narrowScalarExtract(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_INSERT:
+ return narrowScalarInsert(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_LOAD: {
+ auto &LoadMI = cast<GLoad>(MI);
+ Register DstReg = LoadMI.getDstReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.isVector())
+ return UnableToLegalize;
+
+ if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
+ Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
+ MIRBuilder.buildAnyExt(DstReg, TmpReg);
+ LoadMI.eraseFromParent();
+ return Legalized;
+ }
+
+ return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
+ }
+ case TargetOpcode::G_ZEXTLOAD:
+ case TargetOpcode::G_SEXTLOAD: {
+ auto &LoadMI = cast<GExtLoad>(MI);
+ Register DstReg = LoadMI.getDstReg();
+ Register PtrReg = LoadMI.getPointerReg();
+
+ Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
+ auto &MMO = LoadMI.getMMO();
+ unsigned MemSize = MMO.getSizeInBits();
+
+ if (MemSize == NarrowSize) {
+ MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
+ } else if (MemSize < NarrowSize) {
+ MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
+ } else if (MemSize > NarrowSize) {
+ // FIXME: Need to split the load.
+ return UnableToLegalize;
+ }
+
+ if (isa<GZExtLoad>(LoadMI))
+ MIRBuilder.buildZExt(DstReg, TmpReg);
+ else
+ MIRBuilder.buildSExt(DstReg, TmpReg);
+
+ LoadMI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_STORE: {
+ auto &StoreMI = cast<GStore>(MI);
+
+ Register SrcReg = StoreMI.getValueReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ if (SrcTy.isVector())
+ return UnableToLegalize;
+
+ int NumParts = SizeOp0 / NarrowSize;
+ unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
+ unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
+ if (SrcTy.isVector() && LeftoverBits != 0)
+ return UnableToLegalize;
+
+ if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
+ Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.buildTrunc(TmpReg, SrcReg);
+ MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
+ StoreMI.eraseFromParent();
+ return Legalized;
+ }
+
+ return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
+ }
+ case TargetOpcode::G_SELECT:
+ return narrowScalarSelect(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
+ // Legalize bitwise operation:
+ // A = BinOp<Ty> B, C
+ // into:
+ // B1, ..., BN = G_UNMERGE_VALUES B
+ // C1, ..., CN = G_UNMERGE_VALUES C
+ // A1 = BinOp<Ty/N> B1, C1
+ // ...
+ // AN = BinOp<Ty/N> BN, CN
+ // A = G_MERGE_VALUES A1, ..., AN
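+ //
+ // For example (illustrative), narrowing an s64 G_AND with NarrowTy = s32:
+ //   %b0:_(s32), %b1:_(s32) = G_UNMERGE_VALUES %b:_(s64)
+ //   %c0:_(s32), %c1:_(s32) = G_UNMERGE_VALUES %c:_(s64)
+ //   %a0:_(s32) = G_AND %b0, %c0
+ //   %a1:_(s32) = G_AND %b1, %c1
+ //   %a:_(s64) = G_MERGE_VALUES %a0, %a1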
+ return narrowScalarBasic(MI, TypeIdx, NarrowTy);
+ }
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
+ return narrowScalarShift(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_CTLZ:
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+ case TargetOpcode::G_CTTZ:
+ case TargetOpcode::G_CTTZ_ZERO_UNDEF:
+ case TargetOpcode::G_CTPOP:
+ if (TypeIdx == 1)
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_CTLZ:
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+ return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_CTTZ:
+ case TargetOpcode::G_CTTZ_ZERO_UNDEF:
+ return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_CTPOP:
+ return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
+ default:
+ return UnableToLegalize;
+ }
+
+ Observer.changingInstr(MI);
+ narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_INTTOPTR:
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ narrowScalarSrc(MI, NarrowTy, 1);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_PTRTOINT:
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_PHI: {
+ // FIXME: add support for when SizeOp0 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+
+ unsigned NumParts = SizeOp0 / NarrowSize;
+ SmallVector<Register, 2> DstRegs(NumParts);
+ SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
+ Observer.changingInstr(MI);
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
+ MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
+ extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
+ SrcRegs[i / 2]);
+ }
+ MachineBasicBlock &MBB = *MI.getParent();
+ MIRBuilder.setInsertPt(MBB, MI);
+ for (unsigned i = 0; i < NumParts; ++i) {
+ DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
+ MachineInstrBuilder MIB =
+ MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
+ for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
+ MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
+ }
+ MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
+ MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
+ Observer.changedInstr(MI);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ case TargetOpcode::G_INSERT_VECTOR_ELT: {
+ if (TypeIdx != 2)
+ return UnableToLegalize;
+
+ int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
+ Observer.changingInstr(MI);
+ narrowScalarSrc(MI, NarrowTy, OpIdx);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_ICMP: {
+ Register LHS = MI.getOperand(2).getReg();
+ LLT SrcTy = MRI.getType(LHS);
+ uint64_t SrcSize = SrcTy.getSizeInBits();
+ CmpInst::Predicate Pred =
+ static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+
+ // TODO: Handle the non-equality case for weird sizes.
+ if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
+ return UnableToLegalize;
+
+ LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
+ SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
+ if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
+ LHSLeftoverRegs))
+ return UnableToLegalize;
+
+ LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
+ SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
+ if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
+ RHSPartRegs, RHSLeftoverRegs))
+ return UnableToLegalize;
+
+ // We now have the LHS and RHS of the compare split into narrow-type
+ // registers, plus potentially some leftover type.
+ Register Dst = MI.getOperand(0).getReg();
+ LLT ResTy = MRI.getType(Dst);
+ if (ICmpInst::isEquality(Pred)) {
+ // For each part on the LHS and RHS, keep track of the result of XOR-ing
+ // them together. For each equal part, the result should be all 0s. For
+ // each non-equal part, we'll get at least one 1.
+ auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
+ SmallVector<Register, 4> Xors;
+ for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
+ auto LHS = std::get<0>(LHSAndRHS);
+ auto RHS = std::get<1>(LHSAndRHS);
+ auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
+ Xors.push_back(Xor);
+ }
+
+ // Build a G_XOR for each leftover register. Each G_XOR must be widened
+ // to the desired narrow type so that we can OR them together later.
+ SmallVector<Register, 4> WidenedXors;
+ for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
+ auto LHS = std::get<0>(LHSAndRHS);
+ auto RHS = std::get<1>(LHSAndRHS);
+ auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
+ LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
+ buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
+ /* PadStrategy = */ TargetOpcode::G_ZEXT);
+ Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
+ }
+
+ // Now, for each part we broke up, we know whether the parts are equal
+ // based on the G_XOR results. OR them all together and compare against
+ // 0 to get the final result.
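+ // For example (illustrative), an s128 equality compare split into s64
+ // parts:
+ //   %x0:_(s64) = G_XOR %lhs0, %rhs0
+ //   %x1:_(s64) = G_XOR %lhs1, %rhs1
+ //   %or:_(s64) = G_OR %x0, %x1
+ //   %dst = G_ICMP eq, %or, 0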
+ assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
+ auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
+ for (unsigned I = 2, E = Xors.size(); I < E; ++I)
+ Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
+ MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
+ } else {
+ // TODO: Handle non-power-of-two types.
+ assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
+ assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
+ Register LHSL = LHSPartRegs[0];
+ Register LHSH = LHSPartRegs[1];
+ Register RHSL = RHSPartRegs[0];
+ Register RHSH = RHSPartRegs[1];
+ MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
+ MachineInstrBuilder CmpHEQ =
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
+ MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
+ ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
+ MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
+ }
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_SEXT_INREG: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ int64_t SizeInBits = MI.getOperand(2).getImm();
+
+ // So long as the new type has more bits than the bits we're extending, we
+ // don't need to break it apart.
+ if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
+ Observer.changingInstr(MI);
+ // We don't lose any non-extension bits by truncating the src and
+ // sign-extending the dst.
+ MachineOperand &MO1 = MI.getOperand(1);
+ auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
+ MO1.setReg(TruncMIB.getReg(0));
+
+ MachineOperand &MO2 = MI.getOperand(0);
+ Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ MIRBuilder.buildSExt(MO2, DstExt);
+ MO2.setReg(DstExt);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ // Break it apart. Components below the extension point are unmodified. The
+ // component containing the extension point becomes a narrower SEXT_INREG.
+ // Components above it are ashr'd from the component containing the
+ // extension point.
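+ //
+ // For example (illustrative), narrowing an s96 G_SEXT_INREG of 40 bits
+ // with NarrowTy = s32:
+ //   %p0:_(s32), %p1:_(s32), %p2:_(s32) = G_UNMERGE_VALUES %src:_(s96)
+ //   %p1_sext:_(s32) = G_SEXT_INREG %p1, 8
+ //   %p2_sext:_(s32) = G_ASHR %p1_sext, 31
+ //   %dst:_(s96) = G_MERGE_VALUES %p0, %p1_sext, %p2_sext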
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp0 / NarrowSize;
+
+ // List the registers where the destination will be scattered.
+ SmallVector<Register, 2> DstRegs;
+ // List the registers where the source will be split.
+ SmallVector<Register, 2> SrcRegs;
+
+ // Create all the temporary registers.
+ for (int i = 0; i < NumParts; ++i) {
+ Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
+
+ SrcRegs.push_back(SrcReg);
+ }
+
+ // Explode the big arguments into smaller chunks.
+ MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
+
+ Register AshrCstReg =
+ MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
+ .getReg(0);
+ Register FullExtensionReg = 0;
+ Register PartialExtensionReg = 0;
+
+ // Do the operation on each small part.
+ for (int i = 0; i < NumParts; ++i) {
+ if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
+ DstRegs.push_back(SrcRegs[i]);
+ else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
+ assert(PartialExtensionReg &&
+ "Expected to visit partial extension before full");
+ if (FullExtensionReg) {
+ DstRegs.push_back(FullExtensionReg);
+ continue;
+ }
+ DstRegs.push_back(
+ MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
+ .getReg(0));
+ FullExtensionReg = DstRegs.back();
+ } else {
+ DstRegs.push_back(
+ MIRBuilder
+ .buildInstr(
+ TargetOpcode::G_SEXT_INREG, {NarrowTy},
+ {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
+ .getReg(0));
+ PartialExtensionReg = DstRegs.back();
+ }
+ }
+
+ // Gather the destination registers into the final destination.
+ Register DstReg = MI.getOperand(0).getReg();
+ MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_BSWAP:
+ case TargetOpcode::G_BITREVERSE: {
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ SmallVector<Register, 2> SrcRegs, DstRegs;
+ unsigned NumParts = SizeOp0 / NarrowSize;
+ extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
+ {SrcRegs[NumParts - 1 - i]});
+ DstRegs.push_back(DstPart.getReg(0));
+ }
+
+ MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
+
+ Observer.changedInstr(MI);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_PTR_ADD:
+ case TargetOpcode::G_PTRMASK: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ narrowScalarSrc(MI, NarrowTy, 2);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_FPTOUI:
+ case TargetOpcode::G_FPTOSI:
+ return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_FPEXT:
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_FLDEXP:
+ case TargetOpcode::G_STRICT_FLDEXP:
+ return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
+ }
+}
+
+Register LegalizerHelper::coerceToScalar(Register Val) {
+ LLT Ty = MRI.getType(Val);
+ if (Ty.isScalar())
+ return Val;
+
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ LLT NewTy = LLT::scalar(Ty.getSizeInBits());
+ if (Ty.isPointer()) {
+ if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
+ return Register();
+ return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
+ }
+
+ Register NewVal = Val;
+
+ assert(Ty.isVector());
+ LLT EltTy = Ty.getElementType();
+ if (EltTy.isPointer())
+ NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
+ return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
+}
+
+void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
+ unsigned OpIdx, unsigned ExtOpcode) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
+ MO.setReg(ExtB.getReg(0));
+}
+
+void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
+ unsigned OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
+ MO.setReg(ExtB.getReg(0));
+}
+
+void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
+ unsigned OpIdx, unsigned TruncOpcode) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
+ MO.setReg(DstExt);
+}
+
+void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
+ unsigned OpIdx, unsigned ExtOpcode) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
+ MO.setReg(DstTrunc);
+}
+
+void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
+ unsigned OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ Register Dst = MO.getReg();
+ Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+ MO.setReg(DstExt);
+ MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
+}
+
+void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
+ unsigned OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ SmallVector<Register, 8> Regs;
+ MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
+}
+
+void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
+ MachineOperand &Op = MI.getOperand(OpIdx);
+ Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
+}
+
+void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ Register CastDst = MRI.createGenericVirtualRegister(CastTy);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ MIRBuilder.buildBitcast(MO, CastDst);
+ MO.setReg(CastDst);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
+ if (DstTy.isVector())
+ return UnableToLegalize;
+
+ LLT SrcTy = MRI.getType(Src1Reg);
+ const int DstSize = DstTy.getSizeInBits();
+ const int SrcSize = SrcTy.getSizeInBits();
+ const int WideSize = WideTy.getSizeInBits();
+ const int NumMerge = (DstSize + WideSize - 1) / WideSize;
+
+ unsigned NumOps = MI.getNumOperands();
+ unsigned NumSrc = MI.getNumOperands() - 1;
+ unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
+
+ if (WideSize >= DstSize) {
+ // Directly pack the bits in the target type.
+ Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);
+
+ for (unsigned I = 2; I != NumOps; ++I) {
+ const unsigned Offset = (I - 1) * PartSize;
+
+ Register SrcReg = MI.getOperand(I).getReg();
+ assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
+
+ auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
+
+ Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
+ MRI.createGenericVirtualRegister(WideTy);
+
+ auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
+ auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
+ MIRBuilder.buildOr(NextResult, ResultReg, Shl);
+ ResultReg = NextResult;
+ }
+
+ if (WideSize > DstSize)
+ MIRBuilder.buildTrunc(DstReg, ResultReg);
+ else if (DstTy.isPointer())
+ MIRBuilder.buildIntToPtr(DstReg, ResultReg);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ // Unmerge the original values to the GCD type, and recombine to the next
+ // multiple greater than the original type.
+ //
+ // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
+ // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
+ // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
+ // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
+ // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
+ // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
+ // %12:_(s12) = G_MERGE_VALUES %10, %11
+ //
+ // Padding with undef if necessary:
+ //
+ // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
+ // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
+ // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
+ // %7:_(s2) = G_IMPLICIT_DEF
+ // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
+ // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
+ // %10:_(s12) = G_MERGE_VALUES %8, %9
+
+ const int GCD = std::gcd(SrcSize, WideSize);
+ LLT GCDTy = LLT::scalar(GCD);
+
+ SmallVector<Register, 8> Parts;
+ SmallVector<Register, 8> NewMergeRegs;
+ SmallVector<Register, 8> Unmerges;
+ LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
+
+ // Decompose the original operands if they don't evenly divide.
+ for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
+ Register SrcReg = MO.getReg();
+ if (GCD == SrcSize) {
+ Unmerges.push_back(SrcReg);
+ } else {
+ auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
+ for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
+ Unmerges.push_back(Unmerge.getReg(J));
+ }
+ }
+
+ // Pad with undef to the next size that is a multiple of the requested size.
+ if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
+ Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
+ for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
+ Unmerges.push_back(UndefReg);
+ }
+
+ const int PartsPerGCD = WideSize / GCD;
+
+ // Build merges of each piece.
+ ArrayRef<Register> Slicer(Unmerges);
+ for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
+ auto Merge =
+ MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
+ NewMergeRegs.push_back(Merge.getReg(0));
+ }
+
+ // A truncate may be necessary if the requested type doesn't evenly divide the
+ // original result type.
+ if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
+ MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
+ } else {
+ auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
+ MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy) {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ int NumDst = MI.getNumOperands() - 1;
+ Register SrcReg = MI.getOperand(NumDst).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ if (SrcTy.isVector())
+ return UnableToLegalize;
+
+ Register Dst0Reg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst0Reg);
+ if (!DstTy.isScalar())
+ return UnableToLegalize;
+
+ if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
+ if (SrcTy.isPointer()) {
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
+ LLVM_DEBUG(
+ dbgs() << "Not casting non-integral address space integer\n");
+ return UnableToLegalize;
+ }
+
+ SrcTy = LLT::scalar(SrcTy.getSizeInBits());
+ SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
+ }
+
+ // Widen SrcTy to WideTy. This does not affect the result, but since the
+ // user requested this size, it is probably better handled than SrcTy and
+ // should reduce the total number of legalization artifacts.
+ if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
+ SrcTy = WideTy;
+ SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
+ }
+
+ // There's no unmerge type to target. Directly extract the bits from the
+ // source type.
+ unsigned DstSize = DstTy.getSizeInBits();
+
+ MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
+ for (int I = 1; I != NumDst; ++I) {
+ auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
+ auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
+ MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ // Extend the source to a wider type.
+ LLT LCMTy = getLCMType(SrcTy, WideTy);
+
+ Register WideSrc = SrcReg;
+ if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
+ // TODO: If this is an integral address space, cast to integer and anyext.
+ if (SrcTy.isPointer()) {
+ LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
+ return UnableToLegalize;
+ }
+
+ WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
+ }
+
+ auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
+
+ // Create a sequence of unmerges and merges to the original results. Since we
+ // may have widened the source, we will need to pad the results with dead defs
+ // to cover the source register.
+ // e.g. widen s48 to s64:
+ // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
+ //
+ // =>
+ // %4:_(s192) = G_ANYEXT %0:_(s96)
+ // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
+ // ; unpack to GCD type, with extra dead defs
+ // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
+ // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
+ // dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
+ // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination
+ // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
+ const LLT GCDTy = getGCDType(WideTy, DstTy);
+ const int NumUnmerge = Unmerge->getNumOperands() - 1;
+ const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
+
+ // Directly unmerge to the destination without going through a GCD type
+ // if possible
+ if (PartsPerRemerge == 1) {
+ const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
+
+ for (int I = 0; I != NumUnmerge; ++I) {
+ auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
+
+ for (int J = 0; J != PartsPerUnmerge; ++J) {
+ int Idx = I * PartsPerUnmerge + J;
+ if (Idx < NumDst)
+ MIB.addDef(MI.getOperand(Idx).getReg());
+ else {
+ // Create dead def for excess components.
+ MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
+ }
+ }
+
+ MIB.addUse(Unmerge.getReg(I));
+ }
+ } else {
+ SmallVector<Register, 16> Parts;
+ for (int J = 0; J != NumUnmerge; ++J)
+ extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
+
+ SmallVector<Register, 8> RemergeParts;
+ for (int I = 0; I != NumDst; ++I) {
+ for (int J = 0; J < PartsPerRemerge; ++J) {
+ const int Idx = I * PartsPerRemerge + J;
+ RemergeParts.emplace_back(Parts[Idx]);
+ }
+
+ MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
+ RemergeParts.clear();
+ }
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy) {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ unsigned Offset = MI.getOperand(2).getImm();
+
+ if (TypeIdx == 0) {
+ if (SrcTy.isVector() || DstTy.isVector())
+ return UnableToLegalize;
+
+ SrcOp Src(SrcReg);
+ if (SrcTy.isPointer()) {
+ // Extracts from pointers can be handled only if they are really just
+ // simple integers.
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
+ return UnableToLegalize;
+
+ LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
+ Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
+ SrcTy = SrcAsIntTy;
+ }
+
+ if (DstTy.isPointer())
+ return UnableToLegalize;
+
+ if (Offset == 0) {
+ // Avoid a shift in the degenerate case.
+ MIRBuilder.buildTrunc(DstReg,
+ MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ // Do a shift in the source type.
+ LLT ShiftTy = SrcTy;
+ if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
+ Src = MIRBuilder.buildAnyExt(WideTy, Src);
+ ShiftTy = WideTy;
+ }
+
+ auto LShr = MIRBuilder.buildLShr(
+ ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
+ MIRBuilder.buildTrunc(DstReg, LShr);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (SrcTy.isScalar()) {
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ if (!SrcTy.isVector())
+ return UnableToLegalize;
+
+ if (DstTy != SrcTy.getElementType())
+ return UnableToLegalize;
+
+ if (Offset % SrcTy.getScalarSizeInBits() != 0)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+
+ MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
+ Offset);
+ widenScalarDst(MI, WideTy.getScalarType(), 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy) {
+ if (TypeIdx != 0 || WideTy.isVector())
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy) {
+ unsigned Opcode;
+ unsigned ExtOpcode;
+ std::optional<Register> CarryIn;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+ case TargetOpcode::G_SADDO:
+ Opcode = TargetOpcode::G_ADD;
+ ExtOpcode = TargetOpcode::G_SEXT;
+ break;
+ case TargetOpcode::G_SSUBO:
+ Opcode = TargetOpcode::G_SUB;
+ ExtOpcode = TargetOpcode::G_SEXT;
+ break;
+ case TargetOpcode::G_UADDO:
+ Opcode = TargetOpcode::G_ADD;
+ ExtOpcode = TargetOpcode::G_ZEXT;
+ break;
+ case TargetOpcode::G_USUBO:
+ Opcode = TargetOpcode::G_SUB;
+ ExtOpcode = TargetOpcode::G_ZEXT;
+ break;
+ case TargetOpcode::G_SADDE:
+ Opcode = TargetOpcode::G_UADDE;
+ ExtOpcode = TargetOpcode::G_SEXT;
+ CarryIn = MI.getOperand(4).getReg();
+ break;
+ case TargetOpcode::G_SSUBE:
+ Opcode = TargetOpcode::G_USUBE;
+ ExtOpcode = TargetOpcode::G_SEXT;
+ CarryIn = MI.getOperand(4).getReg();
+ break;
+ case TargetOpcode::G_UADDE:
+ Opcode = TargetOpcode::G_UADDE;
+ ExtOpcode = TargetOpcode::G_ZEXT;
+ CarryIn = MI.getOperand(4).getReg();
+ break;
+ case TargetOpcode::G_USUBE:
+ Opcode = TargetOpcode::G_USUBE;
+ ExtOpcode = TargetOpcode::G_ZEXT;
+ CarryIn = MI.getOperand(4).getReg();
+ break;
+ }
+
+ if (TypeIdx == 1) {
+ unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);
+
+ Observer.changingInstr(MI);
+ if (CarryIn)
+ widenScalarSrc(MI, WideTy, 4, BoolExtOp);
+ widenScalarDst(MI, WideTy, 1);
+
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
+ auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
+ // Do the arithmetic in the larger type.
+ Register NewOp;
+ if (CarryIn) {
+ LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
+ NewOp = MIRBuilder
+ .buildInstr(Opcode, {WideTy, CarryOutTy},
+ {LHSExt, RHSExt, *CarryIn})
+ .getReg(0);
+ } else {
+ NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
+ }
+ LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
+ auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
+ auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
+ // There is no overflow if the ExtOp is the same as NewOp.
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
+ // Now trunc the NewOp to the original result.
+ MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy) {
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
+ MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
+ MI.getOpcode() == TargetOpcode::G_SSHLSAT;
+ bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
+ MI.getOpcode() == TargetOpcode::G_USHLSAT;
+ // We can convert this to:
+ // 1. Any extend iN to iM
+ // 2. SHL by M-N
+ // 3. [US][ADD|SUB|SHL]SAT
+ // 4. L/ASHR by M-N
+ //
+ // It may be more efficient to lower this to a min and a max operation in
+ // the higher precision arithmetic if the promoted operation isn't legal,
+ // but this decision is up to the target's lowering request.
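+ //
+ // For example (illustrative), widening an s8 G_SADDSAT to s32:
+ //   %a:_(s32) = G_ANYEXT %lhs:_(s8)
+ //   %b:_(s32) = G_ANYEXT %rhs:_(s8)
+ //   %sa:_(s32) = G_SHL %a, 24
+ //   %sb:_(s32) = G_SHL %b, 24
+ //   %sat:_(s32) = G_SADDSAT %sa, %sb
+ //   %shr:_(s32) = G_ASHR %sat, 24
+ //   %dst:_(s8) = G_TRUNC %shr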
+ Register DstReg = MI.getOperand(0).getReg();
+
+ unsigned NewBits = WideTy.getScalarSizeInBits();
+ unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
+
+ // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
+ // must not left shift the RHS to preserve the shift amount.
+ auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
+ auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
+ : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
+ auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
+ auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
+ auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
+
+ auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
+ {ShiftL, ShiftR}, MI.getFlags());
+
+ // Use a shift that will preserve the number of sign bits when the trunc is
+ // folded away.
+ auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
+ : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
+
+ MIRBuilder.buildTrunc(DstReg, Result);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy) {
+ if (TypeIdx == 1) {
+ Observer.changingInstr(MI);
+ widenScalarDst(MI, WideTy, 1);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
+ auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
+ LLT SrcTy = MRI.getType(LHS);
+ LLT OverflowTy = MRI.getType(OriginalOverflow);
+ unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
+
+ // To determine if the result overflowed in the larger type, we extend the
+ // input to the larger type, do the multiply (checking if it overflows),
+ // then also check the high bits of the result to see if overflow happened
+ // there.
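+ // For example (illustrative), widening an s16 G_UMULO to s32: both inputs
+ // are zero-extended to s32 and multiplied there; overflow is then detected
+ // by checking that the high 16 bits of the s32 product are zero. The wide
+ // multiply's own overflow bit only needs to be checked when WideTy is
+ // narrower than twice the original width.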
+ unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+ auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
+ auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
+
+ auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
+ {LeftOperand, RightOperand});
+ auto Mul = Mulo->getOperand(0);
+ MIRBuilder.buildTrunc(Result, Mul);
+
+ MachineInstrBuilder ExtResult;
+ // Overflow occurred if it occurred in the larger type, or if the high part
+ // of the result does not zero/sign-extend the low part. Check this second
+ // possibility first.
+ if (IsSigned) {
+ // For signed, overflow occurred when the high part does not sign-extend
+ // the low part.
+ ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
+ } else {
+ // Unsigned overflow occurred when the high part does not zero-extend the
+ // low part.
+ ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
+ }
+
+ // The multiplication cannot overflow if WideTy is >= 2 * the original
+ // width, so we don't need to check the overflow result of the wider-type
+ // Mulo.
+ if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
+ auto Overflow =
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
+ // Finally check if the multiplication in the larger type itself overflowed.
+ MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
+ } else {
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
+ }
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
+ switch (MI.getOpcode()) {
+ default:
+ return UnableToLegalize;
+ case TargetOpcode::G_ATOMICRMW_XCHG:
+ case TargetOpcode::G_ATOMICRMW_ADD:
+ case TargetOpcode::G_ATOMICRMW_SUB:
+ case TargetOpcode::G_ATOMICRMW_AND:
+ case TargetOpcode::G_ATOMICRMW_OR:
+ case TargetOpcode::G_ATOMICRMW_XOR:
+ case TargetOpcode::G_ATOMICRMW_MIN:
+ case TargetOpcode::G_ATOMICRMW_MAX:
+ case TargetOpcode::G_ATOMICRMW_UMIN:
+ case TargetOpcode::G_ATOMICRMW_UMAX:
+ assert(TypeIdx == 0 && "atomicrmw with second scalar type");
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_ATOMIC_CMPXCHG:
+ assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
+ if (TypeIdx == 0) {
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
+ widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ assert(TypeIdx == 1 &&
+ "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
+ Observer.changingInstr(MI);
+ widenScalarDst(MI, WideTy, 1);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_EXTRACT:
+ return widenScalarExtract(MI, TypeIdx, WideTy);
+ case TargetOpcode::G_INSERT:
+ return widenScalarInsert(MI, TypeIdx, WideTy);
+ case TargetOpcode::G_MERGE_VALUES:
+ return widenScalarMergeValues(MI, TypeIdx, WideTy);
+ case TargetOpcode::G_UNMERGE_VALUES:
+ return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SSUBO:
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_SADDE:
+ case TargetOpcode::G_SSUBE:
+ case TargetOpcode::G_UADDE:
+ case TargetOpcode::G_USUBE:
+ return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
+ case TargetOpcode::G_UMULO:
+ case TargetOpcode::G_SMULO:
+ return widenScalarMulo(MI, TypeIdx, WideTy);
+ case TargetOpcode::G_SADDSAT:
+ case TargetOpcode::G_SSUBSAT:
+ case TargetOpcode::G_SSHLSAT:
+ case TargetOpcode::G_UADDSAT:
+ case TargetOpcode::G_USUBSAT:
+ case TargetOpcode::G_USHLSAT:
+ return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
+ case TargetOpcode::G_CTTZ:
+ case TargetOpcode::G_CTTZ_ZERO_UNDEF:
+ case TargetOpcode::G_CTLZ:
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+ case TargetOpcode::G_CTPOP: {
+ if (TypeIdx == 0) {
+ Observer.changingInstr(MI);
+ widenScalarDst(MI, WideTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ // First extend the input.
+ unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
+ MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
+ ? TargetOpcode::G_ANYEXT
+ : TargetOpcode::G_ZEXT;
+ auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
+ LLT CurTy = MRI.getType(SrcReg);
+ unsigned NewOpc = MI.getOpcode();
+ if (NewOpc == TargetOpcode::G_CTTZ) {
+ // The count is the same in the larger type except if the original
+ // value was zero. This can be handled by setting the bit just off
+ // the top of the original type.
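+ // For example (illustrative), widening an s8 G_CTTZ to s32: OR the widened
+ // source with 0x100 so that a zero input produces 8, matching the s8
+ // result, and then use G_CTTZ_ZERO_UNDEF.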
+ auto TopBit =
+ APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
+ MIBSrc = MIRBuilder.buildOr(
+ WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
+ // Now we know the operand is non-zero, use the more relaxed opcode.
+ NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
+ }
+
+ // Perform the operation at the larger size.
+ auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
+ // This is already the correct result for CTPOP and CTTZs
+ if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
+ MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
+ // The correct result is NewOp - (difference between WideTy and CurTy).
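+ // For example (illustrative), an s8 G_CTLZ widened to s32 must subtract
+ // 32 - 8 = 24 from the wide count.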
+ unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
+ MIBNewOp = MIRBuilder.buildSub(
+ WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
+ }
+
+ MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_BSWAP: {
+ Observer.changingInstr(MI);
+ Register DstReg = MI.getOperand(0).getReg();
+
+ Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
+ Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+ Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+
+ MI.getOperand(0).setReg(DstExt);
+
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+
+ LLT Ty = MRI.getType(DstReg);
+ unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
+ MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
+ MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
+
+ MIRBuilder.buildTrunc(DstReg, ShrReg);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_BITREVERSE: {
+ Observer.changingInstr(MI);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
+
+ Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ MI.getOperand(0).setReg(DstExt);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+
+ auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
+ auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
+ MIRBuilder.buildTrunc(DstReg, Shift);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_FREEZE:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_ABS:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR:
+ case TargetOpcode::G_SUB:
+ // Perform the operation at the larger width (any extension is fine here,
+ // high bits don't affect the result) and then truncate the result back to
+ // the original type.
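+ //
+ // For example (illustrative), widening an s8 G_ADD to s32:
+ //   %a:_(s32) = G_ANYEXT %lhs:_(s8)
+ //   %b:_(s32) = G_ANYEXT %rhs:_(s8)
+ //   %sum:_(s32) = G_ADD %a, %b
+ //   %dst:_(s8) = G_TRUNC %sum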
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_SBFX:
+ case TargetOpcode::G_UBFX:
+ Observer.changingInstr(MI);
+
+ if (TypeIdx == 0) {
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ } else {
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
+ }
+
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_SHL:
+ Observer.changingInstr(MI);
+
+ if (TypeIdx == 0) {
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ } else {
+ assert(TypeIdx == 1);
+ // The "number of bits to shift" operand must preserve its value as an
+ // unsigned integer:
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ }
+
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SMAX:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_SDIVREM:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
+ widenScalarDst(MI, WideTy);
+ widenScalarDst(MI, WideTy, 1);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ Observer.changingInstr(MI);
+
+ if (TypeIdx == 0) {
+ unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
+ TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+
+ widenScalarSrc(MI, WideTy, 1, CvtOp);
+ widenScalarDst(MI, WideTy);
+ } else {
+ assert(TypeIdx == 1);
+ // The "number of bits to shift" operand must preserve its value as an
+ // unsigned integer:
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ }
+
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_UREM:
+ case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UMAX:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_UDIVREM:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
+ widenScalarDst(MI, WideTy);
+ widenScalarDst(MI, WideTy, 1);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_SELECT:
+ Observer.changingInstr(MI);
+ if (TypeIdx == 0) {
+ // Perform operation at larger width (any extension is fine here, high
+ // bits don't affect the result) and then truncate the result back to the
+ // original type.
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ } else {
+ bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
+ // Explicit extension is required here since high bits affect the result.
+ widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
+ }
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI:
+ Observer.changingInstr(MI);
+
+ if (TypeIdx == 0)
+ widenScalarDst(MI, WideTy);
+ else
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
+
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_SITOFP:
+ Observer.changingInstr(MI);
+
+ if (TypeIdx == 0)
+ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
+ else
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
+
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_UITOFP:
+ Observer.changingInstr(MI);
+
+ if (TypeIdx == 0)
+ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
+ else
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
+
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ Observer.changingInstr(MI);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_STORE: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if (!Ty.isScalar())
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+
+ unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
+ TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
+ widenScalarSrc(MI, WideTy, 0, ExtType);
+
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_CONSTANT: {
+ MachineOperand &SrcMO = MI.getOperand(1);
+ LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
+ unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
+ MRI.getType(MI.getOperand(0).getReg()));
+ assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
+ ExtOpc == TargetOpcode::G_ANYEXT) &&
+ "Illegal Extend");
+ const APInt &SrcVal = SrcMO.getCImm()->getValue();
+ const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
+ ? SrcVal.sext(WideTy.getSizeInBits())
+ : SrcVal.zext(WideTy.getSizeInBits());
+ Observer.changingInstr(MI);
+ SrcMO.setCImm(ConstantInt::get(Ctx, Val));
+
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_FCONSTANT: {
+ // To avoid changing the bits of the constant due to extension to a larger
+ // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
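+ // For example (illustrative), an s16 G_FCONSTANT half 1.0 becomes an s16
+ // G_CONSTANT 0x3C00, whose result is then widened like any other integer
+ // constant.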
+ MachineOperand &SrcMO = MI.getOperand(1);
+ APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
+ MIRBuilder.setInstrAndDebugLoc(MI);
+ auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
+ widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_IMPLICIT_DEF: {
+ Observer.changingInstr(MI);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_BRCOND:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_FCMP:
+ Observer.changingInstr(MI);
+ if (TypeIdx == 0)
+ widenScalarDst(MI, WideTy);
+ else {
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
+ }
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_ICMP:
+ Observer.changingInstr(MI);
+ if (TypeIdx == 0)
+ widenScalarDst(MI, WideTy);
+ else {
+ unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
+ MI.getOperand(1).getPredicate()))
+ ? TargetOpcode::G_SEXT
+ : TargetOpcode::G_ZEXT;
+ widenScalarSrc(MI, WideTy, 2, ExtOpcode);
+ widenScalarSrc(MI, WideTy, 3, ExtOpcode);
+ }
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_PTR_ADD:
+ assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+
+ case TargetOpcode::G_PHI: {
+ assert(TypeIdx == 0 && "Expecting only Idx 0");
+
+ Observer.changingInstr(MI);
+ for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
+ MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
+ MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
+ widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
+ }
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+ if (TypeIdx == 0) {
+ Register VecReg = MI.getOperand(1).getReg();
+ LLT VecTy = MRI.getType(VecReg);
+ Observer.changingInstr(MI);
+
+ widenScalarSrc(
+ MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
+ TargetOpcode::G_ANYEXT);
+
+ widenScalarDst(MI, WideTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ if (TypeIdx != 2)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ // TODO: Probably should be zext
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_INSERT_VECTOR_ELT: {
+ if (TypeIdx == 1) {
+ Observer.changingInstr(MI);
+
+ Register VecReg = MI.getOperand(1).getReg();
+ LLT VecTy = MRI.getType(VecReg);
+ LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
+
+ widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideVecTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ if (TypeIdx == 2) {
+ Observer.changingInstr(MI);
+ // TODO: Probably should be zext
+ widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+ }
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMA:
+ case TargetOpcode::G_FMAD:
+ case TargetOpcode::G_FNEG:
+ case TargetOpcode::G_FABS:
+ case TargetOpcode::G_FCANONICALIZE:
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMAXNUM_IEEE:
+ case TargetOpcode::G_FMINIMUM:
+ case TargetOpcode::G_FMAXIMUM:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FREM:
+ case TargetOpcode::G_FCEIL:
+ case TargetOpcode::G_FFLOOR:
+ case TargetOpcode::G_FCOS:
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FLOG10:
+ case TargetOpcode::G_FLOG:
+ case TargetOpcode::G_FLOG2:
+ case TargetOpcode::G_FRINT:
+ case TargetOpcode::G_FNEARBYINT:
+ case TargetOpcode::G_FSQRT:
+ case TargetOpcode::G_FEXP:
+ case TargetOpcode::G_FEXP2:
+ case TargetOpcode::G_FPOW:
+ case TargetOpcode::G_INTRINSIC_TRUNC:
+ case TargetOpcode::G_INTRINSIC_ROUND:
+ case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
+ assert(TypeIdx == 0);
+ Observer.changingInstr(MI);
+
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
+ widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
+
+ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_FPOWI:
+ case TargetOpcode::G_FLDEXP:
+ case TargetOpcode::G_STRICT_FLDEXP: {
+ if (TypeIdx == 0) {
+ if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
+ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ if (TypeIdx == 1) {
+ // For some reason SelectionDAG tries to promote to a libcall without
+ // actually changing the integer type for promotion.
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+ }
+ case TargetOpcode::G_FFREXP: {
+ Observer.changingInstr(MI);
+
+ if (TypeIdx == 0) {
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
+ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
+ } else {
+ widenScalarDst(MI, WideTy, 1);
+ }
+
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_INTTOPTR:
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_PTRTOINT:
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ widenScalarDst(MI, WideTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_BUILD_VECTOR: {
+ Observer.changingInstr(MI);
+
+ const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
+ for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
+ widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
+
+ // Avoid changing the result vector type if the source element type was
+ // requested.
+ if (TypeIdx == 1) {
+ MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
+ } else {
+ widenScalarDst(MI, WideTy, 0);
+ }
+
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_SEXT_INREG:
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_PTRMASK: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ }
+}
+
+static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
+ MachineIRBuilder &B, Register Src, LLT Ty) {
+ auto Unmerge = B.buildUnmerge(Ty, Src);
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ Pieces.push_back(Unmerge.getReg(I));
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFConstant(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+
+ unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
+ LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
+ Align Alignment = Align(DL.getABITypeAlign(
+ getFloatTypeForLLT(MF.getFunction().getContext(), MRI.getType(Dst))));
+
+ auto Addr = MIRBuilder.buildConstantPool(
+ AddrPtrTy, MF.getConstantPool()->getConstantPoolIndex(
+ MI.getOperand(1).getFPImm(), Alignment));
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
+ MRI.getType(Dst), Alignment);
+
+ MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Addr, *MMO);
+ MI.eraseFromParent();
+
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerBitcast(MachineInstr &MI) {
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+ if (SrcTy.isVector()) {
+ LLT SrcEltTy = SrcTy.getElementType();
+ SmallVector<Register, 8> SrcRegs;
+
+ if (DstTy.isVector()) {
+ int NumDstElt = DstTy.getNumElements();
+ int NumSrcElt = SrcTy.getNumElements();
+
+ LLT DstEltTy = DstTy.getElementType();
+ LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
+ LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
+
+ // If there's an element size mismatch, insert intermediate casts to match
+ // the result element type.
+ if (NumSrcElt < NumDstElt) { // Source element type is larger.
+ // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
+ //
+ // =>
+ //
+ // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
+ // %4:_(<2 x s8>) = G_BITCAST %2
+ // %5:_(<2 x s8>) = G_BITCAST %3
+ // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
+ DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
+ SrcPartTy = SrcEltTy;
+ } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
+ //
+ // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
+ //
+ // =>
+ //
+ // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
+ // %4:_(s16) = G_BITCAST %2
+ // %5:_(s16) = G_BITCAST %3
+ // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
+ SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
+ DstCastTy = DstEltTy;
+ }
+
+ getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
+ for (Register &SrcReg : SrcRegs)
+ SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
+ } else
+ getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
+
+ MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (DstTy.isVector()) {
+ SmallVector<Register, 8> SrcRegs;
+ getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
+ MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+/// Figure out the bit offset into a register when coercing a vector index for
+/// the wide element type. This is only for the case when promoting a vector to
+/// one with larger elements.
+///
+/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
+/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
+static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
+ Register Idx,
+ unsigned NewEltSize,
+ unsigned OldEltSize) {
+ const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
+ LLT IdxTy = B.getMRI()->getType(Idx);
+
+ // Now figure out the amount we need to shift to get the target bits.
+ auto OffsetMask = B.buildConstant(
+ IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
+ auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
+ return B.buildShl(IdxTy, OffsetIdx,
+ B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
+}
+
+/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
+/// is casting to a vector with a smaller element size, perform multiple element
+/// extracts and merge the results. If this is coercing to a vector with larger
+/// elements, index the bitcasted vector and extract the target element with bit
+/// operations. This is intended to force the indexing in the native register
+/// size for architectures that can dynamically index the register file.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
+ LLT CastTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
+
+ LLT SrcEltTy = SrcVecTy.getElementType();
+ unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
+ unsigned OldNumElts = SrcVecTy.getNumElements();
+
+ LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
+ Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
+
+ const unsigned NewEltSize = NewEltTy.getSizeInBits();
+ const unsigned OldEltSize = SrcEltTy.getSizeInBits();
+ if (NewNumElts > OldNumElts) {
+ // Decreasing the vector element size
+ //
+ // e.g. i64 = extract_vector_elt x:v2i64, y:i32
+ // =>
+ // v4i32:castx = bitcast x:v2i64
+ //
+ // i64 = bitcast
+ // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
+ // (i32 (extract_vector_elt castx, (2 * y + 1))))
+ //
+ if (NewNumElts % OldNumElts != 0)
+ return UnableToLegalize;
+
+ // Type of the intermediate result vector.
+ const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
+ LLT MidTy =
+ LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
+
+ auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
+
+ SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
+ auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
+
+ for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
+ auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
+ auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
+ auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
+ NewOps[I] = Elt.getReg(0);
+ }
+
+ auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
+ MIRBuilder.buildBitcast(Dst, NewVec);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (NewNumElts < OldNumElts) {
+ if (NewEltSize % OldEltSize != 0)
+ return UnableToLegalize;
+
+ // This only depends on powers of 2 because we use bit tricks to figure out
+ // the bit offset we need to shift to get the target element. A general
+ // expansion could emit division/multiply.
+ if (!isPowerOf2_32(NewEltSize / OldEltSize))
+ return UnableToLegalize;
+
+ // Increasing the vector element size.
+ // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
+ //
+ // =>
+ //
+ // %cast = G_BITCAST %vec
+ // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
+ // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
+ // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
+ // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
+ // %elt_bits = G_LSHR %wide_elt, %offset_bits
+ // %elt = G_TRUNC %elt_bits
+
+ const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
+ auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
+
+ // Divide to get the index in the wider element type.
+ auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
+
+ Register WideElt = CastVec;
+ if (CastTy.isVector()) {
+ WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
+ ScaledIdx).getReg(0);
+ }
+
+ // Compute the bit offset into the register of the target element.
+ Register OffsetBits = getBitcastWiderVectorElementOffset(
+ MIRBuilder, Idx, NewEltSize, OldEltSize);
+
+ // Shift the wide element to get the target element.
+ auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
+ MIRBuilder.buildTrunc(Dst, ExtractedBits);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+/// Emit code to insert \p InsertReg into \p TargetReg at bit offset
+/// \p OffsetBits, while preserving the other bits in \p TargetReg:
+///
+/// (ZExt(InsertReg) << Offset) |
+///   (TargetReg & ~(LowBitsMask(InsertReg.size()) << Offset))
+static Register buildBitFieldInsert(MachineIRBuilder &B,
+ Register TargetReg, Register InsertReg,
+ Register OffsetBits) {
+ LLT TargetTy = B.getMRI()->getType(TargetReg);
+ LLT InsertTy = B.getMRI()->getType(InsertReg);
+ auto ZextVal = B.buildZExt(TargetTy, InsertReg);
+ auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
+
+ // Produce a bitmask of the value to insert
+ auto EltMask = B.buildConstant(
+ TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
+ InsertTy.getSizeInBits()));
+ // Shift it into position
+ auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
+ auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
+
+ // Clear out the bits in the wide element
+ auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
+
+ // The value to insert has all zeros already, so stick it into the masked
+ // wide element.
+ return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
+}
+
+/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
+/// is increasing the element size, perform the indexing in the target element
+/// type, and use bit operations to insert at the element position. This is
+/// intended for architectures that can dynamically index the register file and
+/// want to force indexing in the native register size.
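+///
+/// Illustrative sketch (the types are examples only): inserting an s8 element
+/// into <8 x s8> with CastTy = <2 x s32> bitcasts the vector to <2 x s32>,
+/// extracts the containing s32 element at index (idx / 4), replaces the byte
+/// at bit offset ((idx % 4) * 8) using shift/and/or, re-inserts the s32
+/// element, and bitcasts the result back to <8 x s8>.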
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
+ LLT CastTy) {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
+ MI.getFirst4RegLLTs();
+ LLT VecTy = DstTy;
+
+ LLT VecEltTy = VecTy.getElementType();
+ LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
+ const unsigned NewEltSize = NewEltTy.getSizeInBits();
+ const unsigned OldEltSize = VecEltTy.getSizeInBits();
+
+ unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
+ unsigned OldNumElts = VecTy.getNumElements();
+
+ Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
+ if (NewNumElts < OldNumElts) {
+ if (NewEltSize % OldEltSize != 0)
+ return UnableToLegalize;
+
+ // This only depends on powers of 2 because we use bit tricks to figure out
+ // the bit offset we need to shift to get the target element. A general
+ // expansion could emit division/multiply.
+ if (!isPowerOf2_32(NewEltSize / OldEltSize))
+ return UnableToLegalize;
+
+ const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
+ auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
+
+ // Divide to get the index in the wider element type.
+ auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
+
+ Register ExtractedElt = CastVec;
+ if (CastTy.isVector()) {
+ ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
+ ScaledIdx).getReg(0);
+ }
+
+ // Compute the bit offset into the register of the target element.
+ Register OffsetBits = getBitcastWiderVectorElementOffset(
+ MIRBuilder, Idx, NewEltSize, OldEltSize);
+
+ Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
+ Val, OffsetBits);
+ if (CastTy.isVector()) {
+ InsertedElt = MIRBuilder.buildInsertVectorElement(
+ CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
+ }
+
+ MIRBuilder.buildBitcast(Dst, InsertedElt);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
+ // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
+ Register DstReg = LoadMI.getDstReg();
+ Register PtrReg = LoadMI.getPointerReg();
+ LLT DstTy = MRI.getType(DstReg);
+ MachineMemOperand &MMO = LoadMI.getMMO();
+ LLT MemTy = MMO.getMemoryType();
+ MachineFunction &MF = MIRBuilder.getMF();
+
+ unsigned MemSizeInBits = MemTy.getSizeInBits();
+ unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
+
+ if (MemSizeInBits != MemStoreSizeInBits) {
+ if (MemTy.isVector())
+ return UnableToLegalize;
+
+ // Promote to a byte-sized load if not loading an integral number of
+ // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
+ LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
+ MachineMemOperand *NewMMO =
+ MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
+
+ Register LoadReg = DstReg;
+ LLT LoadTy = DstTy;
+
+ // If this wasn't already an extending load, we need to widen the result
+ // register to avoid creating a load with a narrower result than the source.
+ if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
+ LoadTy = WideMemTy;
+ LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
+ }
+
+ if (isa<GSExtLoad>(LoadMI)) {
+ auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
+ MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
+ } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
+ auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
+ // The extra bits are guaranteed to be zero, since we stored them that
+ // way. A zext load from Wide thus automatically gives zext from MemVT.
+ MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
+ } else {
+ MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
+ }
+
+ if (DstTy != LoadTy)
+ MIRBuilder.buildTrunc(DstReg, LoadReg);
+
+ LoadMI.eraseFromParent();
+ return Legalized;
+ }
+
+ // Big endian lowering not implemented.
+ if (MIRBuilder.getDataLayout().isBigEndian())
+ return UnableToLegalize;
+
+ // This load needs splitting into power of 2 sized loads.
+ //
+ // Our strategy here is to generate anyextending loads for the smaller
+ // types up to next power-2 result type, and then combine the two larger
+ // result values together, before truncating back down to the non-pow-2
+ // type.
+ // E.g. v1 = i24 load =>
+ // v2 = i32 zextload (2 byte)
+ // v3 = i32 load (1 byte)
+ // v4 = i32 shl v3, 16
+ // v5 = i32 or v4, v2
+ // v1 = i24 trunc v5
+ // By doing this we generate the correct truncate which should get
+ // combined away as an artifact with a matching extend.
+
+ uint64_t LargeSplitSize, SmallSplitSize;
+
+ if (!isPowerOf2_32(MemSizeInBits)) {
+ // This load needs splitting into power of 2 sized loads.
+ LargeSplitSize = llvm::bit_floor(MemSizeInBits);
+ SmallSplitSize = MemSizeInBits - LargeSplitSize;
+ } else {
+ // This is already a power of 2, but we still need to split this in half.
+ //
+ // Assume we're being asked to decompose an unaligned load.
+ // TODO: If this requires multiple splits, handle them all at once.
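+ // e.g. (illustrative) an s32 load with insufficient alignment is split
+ // below into two s16 halves that are recombined with a shift and an or.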
+ auto &Ctx = MF.getFunction().getContext();
+ if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
+ return UnableToLegalize;
+
+ SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
+ }
+
+ if (MemTy.isVector()) {
+ // TODO: Handle vector extloads
+ if (MemTy != DstTy)
+ return UnableToLegalize;
+
+ // TODO: We can do better than scalarizing the vector and at least split it
+ // in half.
+ return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
+ }
+
+ MachineMemOperand *LargeMMO =
+ MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+ MachineMemOperand *SmallMMO =
+ MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+
+ LLT PtrTy = MRI.getType(PtrReg);
+ unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
+ LLT AnyExtTy = LLT::scalar(AnyExtSize);
+ auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
+ PtrReg, *LargeMMO);
+
+ auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
+ LargeSplitSize / 8);
+ Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
+ auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
+ auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
+ SmallPtr, *SmallMMO);
+
+ auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
+ auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
+
+ if (AnyExtTy == DstTy)
+ MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
+ else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
+ auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
+ MIRBuilder.buildTrunc(DstReg, {Or});
+ } else {
+ assert(DstTy.isPointer() && "expected pointer");
+ auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
+
+ // FIXME: We currently consider this to be illegal for non-integral address
+ // spaces, but we still need a way to reinterpret the bits.
+ MIRBuilder.buildIntToPtr(DstReg, Or);
+ }
+
+ LoadMI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
+ // Lower a non-power of 2 store into multiple pow-2 stores.
+ // E.g. split an i24 store into an i16 store + i8 store.
+ // We do this by first extending the stored value to the next largest power
+ // of 2 type, and then using truncating stores to store the components.
+ // By doing this, as with G_LOAD, we generate an extend that can be
+ // artifact-combined away instead of leaving behind extracts.
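+ //
+ // e.g. (illustrative) a 3-byte store of %val:_(s24) to %ptr becomes roughly:
+ //   %ext:_(s32) = G_ANYEXT %val
+ //   G_STORE %ext, %ptr :: (store (s16))
+ //   %hi:_(s32) = G_LSHR %ext, 16
+ //   %ptr2:_(p0) = G_PTR_ADD %ptr, 2
+ //   G_STORE %hi, %ptr2 :: (store (s8))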
+ Register SrcReg = StoreMI.getValueReg();
+ Register PtrReg = StoreMI.getPointerReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineMemOperand &MMO = **StoreMI.memoperands_begin();
+ LLT MemTy = MMO.getMemoryType();
+
+ unsigned StoreWidth = MemTy.getSizeInBits();
+ unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
+
+ if (StoreWidth != StoreSizeInBits) {
+ if (SrcTy.isVector())
+ return UnableToLegalize;
+
+ // Promote to a byte-sized store with upper bits zero if not
+ // storing an integral number of bytes. For example, promote
+ // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
+ LLT WideTy = LLT::scalar(StoreSizeInBits);
+
+ if (StoreSizeInBits > SrcTy.getSizeInBits()) {
+ // Avoid creating a store with a narrower source than result.
+ SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
+ SrcTy = WideTy;
+ }
+
+ auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
+
+ MachineMemOperand *NewMMO =
+ MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
+ MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
+ StoreMI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (MemTy.isVector()) {
+ // TODO: Handle vector trunc stores
+ if (MemTy != SrcTy)
+ return UnableToLegalize;
+
+ // TODO: We can do better than scalarizing the vector and at least split it
+ // in half.
+ return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
+ }
+
+ unsigned MemSizeInBits = MemTy.getSizeInBits();
+ uint64_t LargeSplitSize, SmallSplitSize;
+
+ if (!isPowerOf2_32(MemSizeInBits)) {
+ LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
+ SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
+ } else {
+ auto &Ctx = MF.getFunction().getContext();
+ if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
+ return UnableToLegalize; // Don't know what we're being asked to do.
+
+ SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
+ }
+
+ // Extend to the next pow-2. If this store was itself the result of lowering,
+ // e.g. an s56 store being broken into s32 + s24, we might have a stored type
+ // that's wider than the stored size.
+ unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
+ const LLT NewSrcTy = LLT::scalar(AnyExtSize);
+
+ if (SrcTy.isPointer()) {
+ const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
+ SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
+ }
+
+ auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
+
+ // Obtain the smaller value by shifting away the larger value.
+ auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
+ auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
+
+ // Generate the PtrAdd and truncating stores.
+ LLT PtrTy = MRI.getType(PtrReg);
+ auto OffsetCst = MIRBuilder.buildConstant(
+ LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
+ auto SmallPtr =
+ MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
+
+ MachineMemOperand *LargeMMO =
+ MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+ MachineMemOperand *SmallMMO =
+ MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+ MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
+ MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
+ StoreMI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_LOAD: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+ MachineMemOperand &MMO = **MI.memoperands_begin();
+
+ // Not sure how to interpret a bitcast of an extending load.
+ if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ bitcastDst(MI, CastTy, 0);
+ MMO.setType(CastTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_STORE: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ MachineMemOperand &MMO = **MI.memoperands_begin();
+
+ // Not sure how to interpret a bitcast of a truncating store.
+ if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ bitcastSrc(MI, CastTy, 0);
+ MMO.setType(CastTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_SELECT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
+ LLVM_DEBUG(
+ dbgs() << "bitcast action not implemented for vector select\n");
+ return UnableToLegalize;
+ }
+
+ Observer.changingInstr(MI);
+ bitcastSrc(MI, CastTy, 2);
+ bitcastSrc(MI, CastTy, 3);
+ bitcastDst(MI, CastTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
+ Observer.changingInstr(MI);
+ bitcastSrc(MI, CastTy, 1);
+ bitcastSrc(MI, CastTy, 2);
+ bitcastDst(MI, CastTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
+ default:
+ return UnableToLegalize;
+ }
+}
+
+// Legalize an instruction by changing the opcode in place.
+void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
+ Observer.changingInstr(MI);
+ MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
+ Observer.changedInstr(MI);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
+ using namespace TargetOpcode;
+
+ switch(MI.getOpcode()) {
+ default:
+ return UnableToLegalize;
+ case TargetOpcode::G_FCONSTANT:
+ return lowerFConstant(MI);
+ case TargetOpcode::G_BITCAST:
+ return lowerBitcast(MI);
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_UREM: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
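+ // rem = x - (x / y) * y, using the signed or unsigned division as
+ // appropriate.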
+ auto Quot =
+ MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
+ {MI.getOperand(1), MI.getOperand(2)});
+
+ auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
+ MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SSUBO:
+ return lowerSADDO_SSUBO(MI);
+ case TargetOpcode::G_UMULH:
+ case TargetOpcode::G_SMULH:
+ return lowerSMULH_UMULH(MI);
+ case TargetOpcode::G_SMULO:
+ case TargetOpcode::G_UMULO: {
+ // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
+ // result.
+ auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
+ LLT Ty = MRI.getType(Res);
+
+ unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
+ ? TargetOpcode::G_SMULH
+ : TargetOpcode::G_UMULH;
+
+ Observer.changingInstr(MI);
+ const auto &TII = MIRBuilder.getTII();
+ MI.setDesc(TII.get(TargetOpcode::G_MUL));
+ MI.removeOperand(1);
+ Observer.changedInstr(MI);
+
+ auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
+ auto Zero = MIRBuilder.buildConstant(Ty, 0);
+
+ // Move insert point forward so we can use the Res register if needed.
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+
+ // For *signed* multiply, overflow is detected by checking:
+ // (hi != (lo >> bitwidth-1))
+ if (Opcode == TargetOpcode::G_SMULH) {
+ auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
+ auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
+ } else {
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
+ }
+ return Legalized;
+ }
+ case TargetOpcode::G_FNEG: {
+ auto [Res, SubByReg] = MI.getFirst2Regs();
+ LLT Ty = MRI.getType(Res);
+
+ // TODO: Handle vector types once we are able to
+ // represent them.
+ if (Ty.isVector())
+ return UnableToLegalize;
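+ // fneg(x) = x ^ sign-mask: flipping only the sign bit negates an IEEE
+ // float without otherwise touching the value.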
+ auto SignMask =
+ MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
+ MIRBuilder.buildXor(Res, SubByReg, SignMask);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_STRICT_FSUB: {
+ auto [Res, LHS, RHS] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Res);
+
+ // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
+ auto Neg = MIRBuilder.buildFNeg(Ty, RHS);
+
+ if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
+ MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
+ else
+ MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_FMAD:
+ return lowerFMad(MI);
+ case TargetOpcode::G_FFLOOR:
+ return lowerFFloor(MI);
+ case TargetOpcode::G_INTRINSIC_ROUND:
+ return lowerIntrinsicRound(MI);
+ case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
+ // Since round even is the assumed rounding mode for unconstrained FP
+ // operations, rint and roundeven are the same operation.
+ changeOpcode(MI, TargetOpcode::G_FRINT);
+ return Legalized;
+ }
+ case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
+ auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
+ MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
+ **MI.memoperands_begin());
+ MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_SEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ return lowerLoad(cast<GAnyLoad>(MI));
+ case TargetOpcode::G_STORE:
+ return lowerStore(cast<GStore>(MI));
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+ case TargetOpcode::G_CTTZ_ZERO_UNDEF:
+ case TargetOpcode::G_CTLZ:
+ case TargetOpcode::G_CTTZ:
+ case TargetOpcode::G_CTPOP:
+ return lowerBitCount(MI);
+ case G_UADDO: {
+ auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
+
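+ // res = lhs + rhs; the carry-out is set when the unsigned addition wraps,
+ // i.e. when res < rhs.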
+ MIRBuilder.buildAdd(Res, LHS, RHS);
+ MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case G_UADDE: {
+ auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
+ LLT Ty = MRI.getType(Res);
+
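+ // res = lhs + rhs + zext(carry-in); the carry-out here is computed as
+ // res < lhs (unsigned).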
+ auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
+ auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
+ MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
+ MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case G_USUBO: {
+ auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
+
+ MIRBuilder.buildSub(Res, LHS, RHS);
+ MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case G_USUBE: {
+ auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
+ const LLT CondTy = MRI.getType(BorrowOut);
+ const LLT Ty = MRI.getType(Res);
+
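+ // res = lhs - rhs - zext(borrow-in); the borrow-out is borrow-in when
+ // lhs == rhs, and otherwise lhs < rhs (unsigned).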
+ auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
+ auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
+ MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
+
+ auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
+ auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
+ MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case G_UITOFP:
+ return lowerUITOFP(MI);
+ case G_SITOFP:
+ return lowerSITOFP(MI);
+ case G_FPTOUI:
+ return lowerFPTOUI(MI);
+ case G_FPTOSI:
+ return lowerFPTOSI(MI);
+ case G_FPTRUNC:
+ return lowerFPTRUNC(MI);
+ case G_FPOWI:
+ return lowerFPOWI(MI);
+ case G_SMIN:
+ case G_SMAX:
+ case G_UMIN:
+ case G_UMAX:
+ return lowerMinMax(MI);
+ case G_FCOPYSIGN:
+ return lowerFCopySign(MI);
+ case G_FMINNUM:
+ case G_FMAXNUM:
+ return lowerFMinNumMaxNum(MI);
+ case G_MERGE_VALUES:
+ return lowerMergeValues(MI);
+ case G_UNMERGE_VALUES:
+ return lowerUnmergeValues(MI);
+ case TargetOpcode::G_SEXT_INREG: {
+ assert(MI.getOperand(2).isImm() && "Expected immediate");
+ int64_t SizeInBits = MI.getOperand(2).getImm();
+
+ auto [DstReg, SrcReg] = MI.getFirst2Regs();
+ LLT DstTy = MRI.getType(DstReg);
+ Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
+
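+ // Lower to a pair of shifts: shift left so that the sign bit of the
+ // SizeInBits-wide value lands in the MSB, then arithmetic-shift right by
+ // the same amount to replicate it.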
+ auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
+ MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
+ MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case G_EXTRACT_VECTOR_ELT:
+ case G_INSERT_VECTOR_ELT:
+ return lowerExtractInsertVectorElt(MI);
+ case G_SHUFFLE_VECTOR:
+ return lowerShuffleVector(MI);
+ case G_DYN_STACKALLOC:
+ return lowerDynStackAlloc(MI);
+ case G_EXTRACT:
+ return lowerExtract(MI);
+ case G_INSERT:
+ return lowerInsert(MI);
+ case G_BSWAP:
+ return lowerBswap(MI);
+ case G_BITREVERSE:
+ return lowerBitreverse(MI);
+ case G_READ_REGISTER:
+ case G_WRITE_REGISTER:
+ return lowerReadWriteRegister(MI);
+ case G_UADDSAT:
+ case G_USUBSAT: {
+ // Try to make a reasonable guess about which lowering strategy to use. The
+ // target can override this by requesting custom lowering and calling the
+ // implementation functions directly.
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if (LI.isLegalOrCustom({G_UMIN, Ty}))
+ return lowerAddSubSatToMinMax(MI);
+ return lowerAddSubSatToAddoSubo(MI);
+ }
+ case G_SADDSAT:
+ case G_SSUBSAT: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ // FIXME: It would probably make more sense to see if G_SADDO is preferred,
+ // since it's a shorter expansion. However, we would need to figure out the
+ // preferred boolean type for the carry out for the query.
+ if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
+ return lowerAddSubSatToMinMax(MI);
+ return lowerAddSubSatToAddoSubo(MI);
+ }
+ case G_SSHLSAT:
+ case G_USHLSAT:
+ return lowerShlSat(MI);
+ case G_ABS:
+ return lowerAbsToAddXor(MI);
+ case G_SELECT:
+ return lowerSelect(MI);
+ case G_IS_FPCLASS:
+ return lowerISFPCLASS(MI);
+ case G_SDIVREM:
+ case G_UDIVREM:
+ return lowerDIVREM(MI);
+ case G_FSHL:
+ case G_FSHR:
+ return lowerFunnelShift(MI);
+ case G_ROTL:
+ case G_ROTR:
+ return lowerRotate(MI);
+ case G_MEMSET:
+ case G_MEMCPY:
+ case G_MEMMOVE:
+ return lowerMemCpyFamily(MI);
+ case G_MEMCPY_INLINE:
+ return lowerMemcpyInline(MI);
+ GISEL_VECREDUCE_CASES_NONSEQ
+ return lowerVectorReduction(MI);
+ }
+}
+
+Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
+ Align MinAlign) const {
+ // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
+ // datalayout for the preferred alignment. Also there should be a target hook
+ // for this to allow targets to reduce the alignment and ignore the
+ // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
+ // the type.
+ return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
+}
+
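+/// Create a stack temporary of \p Bytes bytes with at least \p Alignment and
+/// return a G_FRAME_INDEX pointing at it. \p PtrInfo is set to the fixed-stack
+/// pointer info for the new frame index.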
+MachineInstrBuilder
+LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
+ MachinePointerInfo &PtrInfo) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
+
+ unsigned AddrSpace = DL.getAllocaAddrSpace();
+ LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
+
+ PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
+ return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
+}
+
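+/// Clamp a dynamic vector index into the valid range [0, NumElts - 1] so that
+/// the scaled address computed from it stays in bounds: a mask for a
+/// power-of-2 element count, otherwise an unsigned min.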
+static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
+ LLT VecTy) {
+ int64_t IdxVal;
+ if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
+ return IdxReg;
+
+ LLT IdxTy = B.getMRI()->getType(IdxReg);
+ unsigned NElts = VecTy.getNumElements();
+ if (isPowerOf2_32(NElts)) {
+ APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
+ return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
+ }
+
+ return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
+ .getReg(0);
+}
+
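+/// Compute the address of element \p Index of the in-memory vector of type
+/// \p VecTy starting at \p VecPtr: clamp the index, scale it by the element's
+/// size in bytes, and add the result to the base pointer.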
+Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
+ Register Index) {
+ LLT EltTy = VecTy.getElementType();
+
+ // Calculate the element offset and add it to the pointer.
+ unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
+ assert(EltSize * 8 == EltTy.getSizeInBits() &&
+ "Converting bits to bytes lost precision");
+
+ Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
+
+ LLT IdxTy = MRI.getType(Index);
+ auto Mul = MIRBuilder.buildMul(IdxTy, Index,
+ MIRBuilder.buildConstant(IdxTy, EltSize));
+
+ LLT PtrTy = MRI.getType(VecPtr);
+ return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
+}
+
+#ifndef NDEBUG
+/// Check that all vector operands have same number of elements. Other operands
+/// should be listed in NonVecOp.
+static bool hasSameNumEltsOnAllVectorOperands(
+ GenericMachineInstr &MI, MachineRegisterInfo &MRI,
+ std::initializer_list<unsigned> NonVecOpIndices) {
+ if (MI.getNumMemOperands() != 0)
+ return false;
+
+ LLT VecTy = MRI.getType(MI.getReg(0));
+ if (!VecTy.isVector())
+ return false;
+ unsigned NumElts = VecTy.getNumElements();
+
+ for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
+ MachineOperand &Op = MI.getOperand(OpIdx);
+ if (!Op.isReg()) {
+ if (!is_contained(NonVecOpIndices, OpIdx))
+ return false;
+ continue;
+ }
+
+ LLT Ty = MRI.getType(Op.getReg());
+ if (!Ty.isVector()) {
+ if (!is_contained(NonVecOpIndices, OpIdx))
+ return false;
+ continue;
+ }
+
+ if (Ty.getNumElements() != NumElts)
+ return false;
+ }
+
+ return true;
+}
+#endif
+
+/// Fill \p DstOps with DstOps that, combined, have the same number of elements
+/// as \p Ty. These DstOps are either scalars when \p NumElts = 1 or vectors
+/// with \p NumElts elements. When Ty.getNumElements() is not a multiple of
+/// \p NumElts, the last DstOp (leftover) has fewer than \p NumElts elements.
+static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
+ unsigned NumElts) {
+ LLT LeftoverTy;
+ assert(Ty.isVector() && "Expected vector type");
+ LLT EltTy = Ty.getElementType();
+ LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
+ int NumParts, NumLeftover;
+ std::tie(NumParts, NumLeftover) =
+ getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
+
+ assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
+ for (int i = 0; i < NumParts; ++i) {
+ DstOps.push_back(NarrowTy);
+ }
+
+ if (LeftoverTy.isValid()) {
+ assert(NumLeftover == 1 && "expected exactly one leftover");
+ DstOps.push_back(LeftoverTy);
+ }
+}
+
+/// Operand \p Op is used by \p N sub-instructions. Fill \p Ops with \p N SrcOps
+/// made from \p Op, depending on the operand type.
+static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
+ MachineOperand &Op) {
+ for (unsigned i = 0; i < N; ++i) {
+ if (Op.isReg())
+ Ops.push_back(Op.getReg());
+ else if (Op.isImm())
+ Ops.push_back(Op.getImm());
+ else if (Op.isPredicate())
+ Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
+ else
+ llvm_unreachable("Unsupported type");
+ }
+}
+
+// Handle splitting vector operations which need to have the same number of
+// elements in each type index, but each type index may have a different element
+// type.
+//
+// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
+// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
+// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
+//
+// Also handles some irregular breakdown cases, e.g.
+// <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
+// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
+// s64 = G_SHL s64, s32
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorMultiEltType(
+ GenericMachineInstr &MI, unsigned NumElts,
+ std::initializer_list<unsigned> NonVecOpIndices) {
+ assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
+ "Non-compatible opcode or not specified non-vector operands");
+ unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
+
+ unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
+ unsigned NumDefs = MI.getNumDefs();
+
+ // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
+ // Build instructions with DstOps so that an instruction found by CSE can be
+ // reused directly. CSE copies the found instruction into the given vreg when
+ // building with a vreg destination.
+ SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
+ // Output registers will be taken from created instructions.
+ SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
+ for (unsigned i = 0; i < NumDefs; ++i) {
+ makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
+ }
+
+ // Split vector input operands into sub-vectors with NumElts elts + Leftover.
+ // Operands listed in NonVecOpIndices will be used as is without splitting;
+ // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
+ // scalar condition (op 1), immediate in sext_inreg (op 2).
+ SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
+ for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
+ ++UseIdx, ++UseNo) {
+ if (is_contained(NonVecOpIndices, UseIdx)) {
+ broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
+ MI.getOperand(UseIdx));
+ } else {
+ SmallVector<Register, 8> SplitPieces;
+ extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
+ for (auto Reg : SplitPieces)
+ InputOpsPieces[UseNo].push_back(Reg);
+ }
+ }
+
+ unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
+
+ // Take i-th piece of each input operand split and build sub-vector/scalar
+ // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
+ for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
+ SmallVector<DstOp, 2> Defs;
+ for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
+ Defs.push_back(OutputOpsPieces[DstNo][i]);
+
+ SmallVector<SrcOp, 3> Uses;
+ for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
+ Uses.push_back(InputOpsPieces[InputNo][i]);
+
+ auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
+ for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
+ OutputRegs[DstNo].push_back(I.getReg(DstNo));
+ }
+
+ // Merge small outputs into MI's output for each def operand.
+ if (NumLeftovers) {
+ for (unsigned i = 0; i < NumDefs; ++i)
+ mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
+ } else {
+ for (unsigned i = 0; i < NumDefs; ++i)
+ MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
+ unsigned NumElts) {
+ unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
+
+ unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
+ unsigned NumDefs = MI.getNumDefs();
+
+ SmallVector<DstOp, 8> OutputOpsPieces;
+ SmallVector<Register, 8> OutputRegs;
+ makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
+
+ // Instructions that perform register split will be inserted in basic block
+ // where register is defined (basic block is in the next operand).
+ SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
+ for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
+ UseIdx += 2, ++UseNo) {
+ MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
+ MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
+ extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
+ }
+
+ // Build PHIs with fewer elements.
+ unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
+ MIRBuilder.setInsertPt(*MI.getParent(), MI);
+ for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
+ auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
+ Phi.addDef(
+ MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
+ OutputRegs.push_back(Phi.getReg(0));
+
+ for (unsigned j = 0; j < NumInputs / 2; ++j) {
+ Phi.addUse(InputOpsPieces[j][i]);
+ Phi.add(MI.getOperand(1 + j * 2 + 1));
+ }
+ }
+
+ // Merge small outputs into MI's def.
+ if (NumLeftovers) {
+ mergeMixedSubvectors(MI.getReg(0), OutputRegs);
+ } else {
+ MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
+ unsigned TypeIdx,
+ LLT NarrowTy) {
+ const int NumDst = MI.getNumOperands() - 1;
+ const Register SrcReg = MI.getOperand(NumDst).getReg();
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ if (TypeIdx != 1 || NarrowTy == DstTy)
+ return UnableToLegalize;
+
+ // Requires compatible types. Otherwise SrcReg should have been defined by a
+ // merge-like instruction that would get artifact-combined. Most likely the
+ // instruction that defines SrcReg has to perform more/fewer-elements
+ // legalization compatible with NarrowTy.
+ assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
+ assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
+
+ if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
+ (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
+ return UnableToLegalize;
+
+ // This is most likely DstTy (smaller than register size) packed in SrcTy
+ // (larger than register size), and since the unmerge was not combined it will
+ // be lowered to bit-sequence extracts from a register. Unpack SrcTy into
+ // NarrowTy (register size) pieces first, then unpack each NarrowTy piece into
+ // DstTy pieces.
+
+ // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
+ //
+ // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
+ // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
+ // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
+ auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
+ const int NumUnmerge = Unmerge->getNumOperands() - 1;
+ const int PartsPerUnmerge = NumDst / NumUnmerge;
+
+ for (int I = 0; I != NumUnmerge; ++I) {
+ auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
+
+ for (int J = 0; J != PartsPerUnmerge; ++J)
+ MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
+ MIB.addUse(Unmerge.getReg(I));
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ // Requires compatible types. Otherwise the user of DstReg did not perform an
+ // unmerge that should have been artifact-combined. Most likely the instruction
+ // that uses DstReg has to do more/fewer-elements legalization compatible with
+ // NarrowTy.
+ assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
+ assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
+ if (NarrowTy == SrcTy)
+ return UnableToLegalize;
+
+ // This attempts to lower part of an LCMTy merge/unmerge sequence. Its
+ // intended use is for old MIR tests. Since the changes to more/fewer-elements
+ // legalization, it should no longer be possible to generate MIR like this
+ // when starting from LLVM IR, because the LCMTy approach was replaced with
+ // merge/unmerge to vector elements.
+ if (TypeIdx == 1) {
+ assert(SrcTy.isVector() && "Expected vector types");
+ assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
+ if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
+ (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
+ return UnableToLegalize;
+ // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
+ //
+ // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
+ // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
+ // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
+ // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
+ // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
+ // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
+
+ SmallVector<Register, 8> Elts;
+ LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
+ for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
+ auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
+ for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
+ Elts.push_back(Unmerge.getReg(j));
+ }
+
+ SmallVector<Register, 8> NarrowTyElts;
+ unsigned NumNarrowTyElts = NarrowTy.getNumElements();
+ unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
+ for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
+ ++i, Offset += NumNarrowTyElts) {
+ ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
+ NarrowTyElts.push_back(
+ MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
+ }
+
+ MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ assert(TypeIdx == 0 && "Bad type index");
+ if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
+ (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
+ return UnableToLegalize;
+
+ // This is most likely SrcTy (smaller than register size) packed in DstTy
+ // (larger than register size), and since the merge was not combined it will
+ // be lowered to bit-sequence packing into a register. Merge SrcTy into
+ // NarrowTy (register size) pieces first, then merge each NarrowTy piece into
+ // DstTy.
+
+ // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
+ //
+ // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
+ // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
+ // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
+ SmallVector<Register, 8> NarrowTyElts;
+ unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
+ unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
+ unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
+ for (unsigned i = 0; i < NumParts; ++i) {
+ SmallVector<Register, 8> Sources;
+ for (unsigned j = 0; j < NumElts; ++j)
+ Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
+ NarrowTyElts.push_back(
+ MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0));
+ }
+
+ MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
+ unsigned TypeIdx,
+ LLT NarrowVecTy) {
+ auto [DstReg, SrcVec] = MI.getFirst2Regs();
+ Register InsertVal;
+ bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
+
+ assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
+ if (IsInsert)
+ InsertVal = MI.getOperand(2).getReg();
+
+ Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
+
+ // TODO: Handle total scalarization case.
+ if (!NarrowVecTy.isVector())
+ return UnableToLegalize;
+
+ LLT VecTy = MRI.getType(SrcVec);
+
+ // If the index is a constant, we can really break this down as you would
+ // expect, and index into the target size pieces.
+ int64_t IdxVal;
+ auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
+ if (MaybeCst) {
+ IdxVal = MaybeCst->Value.getSExtValue();
+ // Avoid out of bounds indexing the pieces.
+ if (IdxVal >= VecTy.getNumElements()) {
+ MIRBuilder.buildUndef(DstReg);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ SmallVector<Register, 8> VecParts;
+ LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
+
+ // Build a sequence of NarrowTy pieces in VecParts for this operand.
+ LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
+ TargetOpcode::G_ANYEXT);
+
+ unsigned NewNumElts = NarrowVecTy.getNumElements();
+
+ LLT IdxTy = MRI.getType(Idx);
+ int64_t PartIdx = IdxVal / NewNumElts;
+ auto NewIdx =
+ MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
+
+ if (IsInsert) {
+ LLT PartTy = MRI.getType(VecParts[PartIdx]);
+
+ // Use the adjusted index to insert into one of the subvectors.
+ auto InsertPart = MIRBuilder.buildInsertVectorElement(
+ PartTy, VecParts[PartIdx], InsertVal, NewIdx);
+ VecParts[PartIdx] = InsertPart.getReg(0);
+
+ // Recombine the inserted subvector with the others to reform the result
+ // vector.
+ buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
+ } else {
+ MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ // With a variable index, we can't perform the operation in a smaller type, so
+ // we're forced to expand this.
+ //
+ // TODO: We could emit a chain of compare/select to figure out which piece to
+ // index.
+ return lowerExtractInsertVectorElt(MI);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ // FIXME: Don't know how to handle secondary types yet.
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ // This implementation doesn't work for atomics. Give up instead of doing
+ // something invalid.
+ if (LdStMI.isAtomic())
+ return UnableToLegalize;
+
+ bool IsLoad = isa<GLoad>(LdStMI);
+ Register ValReg = LdStMI.getReg(0);
+ Register AddrReg = LdStMI.getPointerReg();
+ LLT ValTy = MRI.getType(ValReg);
+
+ // FIXME: Do we need a distinct NarrowMemory legalize action?
+ if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
+ LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
+ return UnableToLegalize;
+ }
+
+ int NumParts = -1;
+ int NumLeftover = -1;
+ LLT LeftoverTy;
+ SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
+ if (IsLoad) {
+ std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
+ } else {
+ if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
+ NarrowLeftoverRegs)) {
+ NumParts = NarrowRegs.size();
+ NumLeftover = NarrowLeftoverRegs.size();
+ }
+ }
+
+ if (NumParts == -1)
+ return UnableToLegalize;
+
+ LLT PtrTy = MRI.getType(AddrReg);
+ const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ unsigned TotalSize = ValTy.getSizeInBits();
+
+ // Split the load/store into PartTy sized pieces starting at Offset. If this
+ // is a load, return the new registers in ValRegs. For a store, each element
+ // of ValRegs should be PartTy. Returns the next offset that needs to be
+ // handled.
+ bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
+ auto MMO = LdStMI.getMMO();
+ auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
+ unsigned NumParts, unsigned Offset) -> unsigned {
+ MachineFunction &MF = MIRBuilder.getMF();
+ unsigned PartSize = PartTy.getSizeInBits();
+ for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
+ ++Idx) {
+ unsigned ByteOffset = Offset / 8;
+ Register NewAddrReg;
+
+ MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
+
+ MachineMemOperand *NewMMO =
+ MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
+
+ if (IsLoad) {
+ Register Dst = MRI.createGenericVirtualRegister(PartTy);
+ ValRegs.push_back(Dst);
+ MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
+ } else {
+ MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
+ }
+ Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
+ }
+
+ return Offset;
+ };
+
+ unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
+ unsigned HandledOffset =
+ splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
+
+ // Handle the rest of the register if this isn't an even type breakdown.
+ if (LeftoverTy.isValid())
+ splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
+
+ if (IsLoad) {
+ insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
+ LeftoverTy, NarrowLeftoverRegs);
+ }
+
+ LdStMI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ using namespace TargetOpcode;
+ GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
+ unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
+
+ switch (MI.getOpcode()) {
+ case G_IMPLICIT_DEF:
+ case G_TRUNC:
+ case G_AND:
+ case G_OR:
+ case G_XOR:
+ case G_ADD:
+ case G_SUB:
+ case G_MUL:
+ case G_PTR_ADD:
+ case G_SMULH:
+ case G_UMULH:
+ case G_FADD:
+ case G_FMUL:
+ case G_FSUB:
+ case G_FNEG:
+ case G_FABS:
+ case G_FCANONICALIZE:
+ case G_FDIV:
+ case G_FREM:
+ case G_FMA:
+ case G_FMAD:
+ case G_FPOW:
+ case G_FEXP:
+ case G_FEXP2:
+ case G_FLOG:
+ case G_FLOG2:
+ case G_FLOG10:
+ case G_FLDEXP:
+ case G_FNEARBYINT:
+ case G_FCEIL:
+ case G_FFLOOR:
+ case G_FRINT:
+ case G_INTRINSIC_ROUND:
+ case G_INTRINSIC_ROUNDEVEN:
+ case G_INTRINSIC_TRUNC:
+ case G_FCOS:
+ case G_FSIN:
+ case G_FSQRT:
+ case G_BSWAP:
+ case G_BITREVERSE:
+ case G_SDIV:
+ case G_UDIV:
+ case G_SREM:
+ case G_UREM:
+ case G_SDIVREM:
+ case G_UDIVREM:
+ case G_SMIN:
+ case G_SMAX:
+ case G_UMIN:
+ case G_UMAX:
+ case G_ABS:
+ case G_FMINNUM:
+ case G_FMAXNUM:
+ case G_FMINNUM_IEEE:
+ case G_FMAXNUM_IEEE:
+ case G_FMINIMUM:
+ case G_FMAXIMUM:
+ case G_FSHL:
+ case G_FSHR:
+ case G_ROTL:
+ case G_ROTR:
+ case G_FREEZE:
+ case G_SADDSAT:
+ case G_SSUBSAT:
+ case G_UADDSAT:
+ case G_USUBSAT:
+ case G_UMULO:
+ case G_SMULO:
+ case G_SHL:
+ case G_LSHR:
+ case G_ASHR:
+ case G_SSHLSAT:
+ case G_USHLSAT:
+ case G_CTLZ:
+ case G_CTLZ_ZERO_UNDEF:
+ case G_CTTZ:
+ case G_CTTZ_ZERO_UNDEF:
+ case G_CTPOP:
+ case G_FCOPYSIGN:
+ case G_ZEXT:
+ case G_SEXT:
+ case G_ANYEXT:
+ case G_FPEXT:
+ case G_FPTRUNC:
+ case G_SITOFP:
+ case G_UITOFP:
+ case G_FPTOSI:
+ case G_FPTOUI:
+ case G_INTTOPTR:
+ case G_PTRTOINT:
+ case G_ADDRSPACE_CAST:
+ case G_UADDO:
+ case G_USUBO:
+ case G_UADDE:
+ case G_USUBE:
+ case G_SADDO:
+ case G_SSUBO:
+ case G_SADDE:
+ case G_SSUBE:
+ case G_STRICT_FADD:
+ case G_STRICT_FSUB:
+ case G_STRICT_FMUL:
+ case G_STRICT_FMA:
+ case G_STRICT_FLDEXP:
+ case G_FFREXP:
+ return fewerElementsVectorMultiEltType(GMI, NumElts);
+ case G_ICMP:
+ case G_FCMP:
+ return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cmp predicate*/});
+ case G_IS_FPCLASS:
+ return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/});
+ case G_SELECT:
+ if (MRI.getType(MI.getOperand(1).getReg()).isVector())
+ return fewerElementsVectorMultiEltType(GMI, NumElts);
+ return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
+ case G_PHI:
+ return fewerElementsVectorPhi(GMI, NumElts);
+ case G_UNMERGE_VALUES:
+ return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
+ case G_BUILD_VECTOR:
+ assert(TypeIdx == 0 && "not a vector type index");
+ return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
+ case G_CONCAT_VECTORS:
+ if (TypeIdx != 1) // TODO: This probably does work as expected already.
+ return UnableToLegalize;
+ return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
+ case G_EXTRACT_VECTOR_ELT:
+ case G_INSERT_VECTOR_ELT:
+ return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
+ case G_LOAD:
+ case G_STORE:
+ return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
+ case G_SEXT_INREG:
+ return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
+ GISEL_VECREDUCE_CASES_NONSEQ
+ return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
+ case G_SHUFFLE_VECTOR:
+ return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
+ default:
+ return UnableToLegalize;
+ }
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
+ MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
+ MI.getFirst3RegLLTs();
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ // The shuffle should be canonicalized by now.
+ if (DstTy != Src1Ty)
+ return UnableToLegalize;
+ if (DstTy != Src2Ty)
+ return UnableToLegalize;
+
+ if (!isPowerOf2_32(DstTy.getNumElements()))
+ return UnableToLegalize;
+
+ // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
+ // Further legalization attempts will be needed to split it further.
+ NarrowTy =
+ DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
+ unsigned NewElts = NarrowTy.getNumElements();
+
+ SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
+ extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
+ extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
+ Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
+ SplitSrc2Regs[1]};
+
+ Register Hi, Lo;
+
+ // If Lo or Hi uses elements from at most two of the four input vectors, then
+ // express it as a vector shuffle of those two inputs. Otherwise extract the
+ // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
+ SmallVector<int, 16> Ops;
+ for (unsigned High = 0; High < 2; ++High) {
+ Register &Output = High ? Hi : Lo;
+
+ // Build a shuffle mask for the output, discovering on the fly which
+ // input vectors to use as shuffle operands (recorded in InputUsed).
+ // If building a suitable shuffle vector proves too hard, then bail
+ // out with useBuildVector set.
+ unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
+ unsigned FirstMaskIdx = High * NewElts;
+ bool UseBuildVector = false;
+ for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+ // The mask element. This indexes into the input.
+ int Idx = Mask[FirstMaskIdx + MaskOffset];
+
+ // The input vector this mask element indexes into.
+ unsigned Input = (unsigned)Idx / NewElts;
+
+ if (Input >= std::size(Inputs)) {
+ // The mask element does not index into any input vector.
+ Ops.push_back(-1);
+ continue;
+ }
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NewElts;
+
+ // Find or create a shuffle vector operand to hold this input.
+ unsigned OpNo;
+ for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
+ if (InputUsed[OpNo] == Input) {
+ // This input vector is already an operand.
+ break;
+ } else if (InputUsed[OpNo] == -1U) {
+ // Create a new operand for this input vector.
+ InputUsed[OpNo] = Input;
+ break;
+ }
+ }
+
+ if (OpNo >= std::size(InputUsed)) {
+ // More than two input vectors used! Give up on trying to create a
+ // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
+ UseBuildVector = true;
+ break;
+ }
+
+ // Add the mask index for the new shuffle vector.
+ Ops.push_back(Idx + OpNo * NewElts);
+ }
+
+ if (UseBuildVector) {
+ LLT EltTy = NarrowTy.getElementType();
+ SmallVector<Register, 16> SVOps;
+
+ // Extract the input elements by hand.
+ for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+ // The mask element. This indexes into the input.
+ int Idx = Mask[FirstMaskIdx + MaskOffset];
+
+ // The input vector this mask element indexes into.
+ unsigned Input = (unsigned)Idx / NewElts;
+
+ if (Input >= std::size(Inputs)) {
+ // The mask element is "undef" or indexes off the end of the input.
+ SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
+ continue;
+ }
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NewElts;
+
+ // Extract the vector element by hand.
+ SVOps.push_back(MIRBuilder
+ .buildExtractVectorElement(
+ EltTy, Inputs[Input],
+ MIRBuilder.buildConstant(LLT::scalar(32), Idx))
+ .getReg(0));
+ }
+
+ // Construct the Lo/Hi output using a G_BUILD_VECTOR.
+ Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
+ } else if (InputUsed[0] == -1U) {
+ // No input vectors were used! The result is undefined.
+ Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
+ } else {
+ Register Op0 = Inputs[InputUsed[0]];
+ // If only one input was used, use an undefined vector for the other.
+ Register Op1 = InputUsed[1] == -1U
+ ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
+ : Inputs[InputUsed[1]];
+ // At least one input vector was used. Create a new shuffle vector.
+ Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
+ }
+
+ Ops.clear();
+ }
+
+ MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+static unsigned getScalarOpcForReduction(unsigned Opc) {
+ unsigned ScalarOpc;
+ switch (Opc) {
+ case TargetOpcode::G_VECREDUCE_FADD:
+ ScalarOpc = TargetOpcode::G_FADD;
+ break;
+ case TargetOpcode::G_VECREDUCE_FMUL:
+ ScalarOpc = TargetOpcode::G_FMUL;
+ break;
+ case TargetOpcode::G_VECREDUCE_FMAX:
+ ScalarOpc = TargetOpcode::G_FMAXNUM;
+ break;
+ case TargetOpcode::G_VECREDUCE_FMIN:
+ ScalarOpc = TargetOpcode::G_FMINNUM;
+ break;
+ case TargetOpcode::G_VECREDUCE_ADD:
+ ScalarOpc = TargetOpcode::G_ADD;
+ break;
+ case TargetOpcode::G_VECREDUCE_MUL:
+ ScalarOpc = TargetOpcode::G_MUL;
+ break;
+ case TargetOpcode::G_VECREDUCE_AND:
+ ScalarOpc = TargetOpcode::G_AND;
+ break;
+ case TargetOpcode::G_VECREDUCE_OR:
+ ScalarOpc = TargetOpcode::G_OR;
+ break;
+ case TargetOpcode::G_VECREDUCE_XOR:
+ ScalarOpc = TargetOpcode::G_XOR;
+ break;
+ case TargetOpcode::G_VECREDUCE_SMAX:
+ ScalarOpc = TargetOpcode::G_SMAX;
+ break;
+ case TargetOpcode::G_VECREDUCE_SMIN:
+ ScalarOpc = TargetOpcode::G_SMIN;
+ break;
+ case TargetOpcode::G_VECREDUCE_UMAX:
+ ScalarOpc = TargetOpcode::G_UMAX;
+ break;
+ case TargetOpcode::G_VECREDUCE_UMIN:
+ ScalarOpc = TargetOpcode::G_UMIN;
+ break;
+ default:
+ llvm_unreachable("Unhandled reduction");
+ }
+ return ScalarOpc;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
+ MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
+ unsigned Opc = MI.getOpcode();
+ assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
+ Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
+ "Sequential reductions not expected");
+
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ // The semantics of the normal non-sequential reductions allow us to freely
+ // re-associate the operation.
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+
+ if (NarrowTy.isVector() &&
+ (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
+ return UnableToLegalize;
+
+ unsigned ScalarOpc = getScalarOpcForReduction(Opc);
+ SmallVector<Register> SplitSrcs;
+ // If NarrowTy is a scalar then we're being asked to scalarize.
+ const unsigned NumParts =
+ NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
+ : SrcTy.getNumElements();
+
+ extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
+ if (NarrowTy.isScalar()) {
+ if (DstTy != NarrowTy)
+ return UnableToLegalize; // FIXME: handle implicit extensions.
+
+ if (isPowerOf2_32(NumParts)) {
+ // Generate a tree of scalar operations to reduce the critical path.
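+ // E.g. with NumParts == 4 this builds (s0 op s1) op (s2 op s3) rather than
+ // the sequential ((s0 op s1) op s2) op s3, halving the dependency chain.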
+ SmallVector<Register> PartialResults;
+ unsigned NumPartsLeft = NumParts;
+ while (NumPartsLeft > 1) {
+ for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
+ PartialResults.emplace_back(
+ MIRBuilder
+ .buildInstr(ScalarOpc, {NarrowTy},
+ {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
+ .getReg(0));
+ }
+ SplitSrcs = PartialResults;
+ PartialResults.clear();
+ NumPartsLeft = SplitSrcs.size();
+ }
+ assert(SplitSrcs.size() == 1);
+ MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ // If we can't generate a tree, then just do sequential operations.
+ Register Acc = SplitSrcs[0];
+ for (unsigned Idx = 1; Idx < NumParts; ++Idx)
+ Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
+ .getReg(0);
+ MIRBuilder.buildCopy(DstReg, Acc);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ SmallVector<Register> PartialReductions;
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ PartialReductions.push_back(
+ MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
+ }
+
+ // If the types involved are powers of 2, we can generate intermediate vector
+ // ops before generating a final reduction operation.
+ if (isPowerOf2_32(SrcTy.getNumElements()) &&
+ isPowerOf2_32(NarrowTy.getNumElements())) {
+ return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
+ }
+
+ Register Acc = PartialReductions[0];
+ for (unsigned Part = 1; Part < NumParts; ++Part) {
+ if (Part == NumParts - 1) {
+ MIRBuilder.buildInstr(ScalarOpc, {DstReg},
+ {Acc, PartialReductions[Part]});
+ } else {
+ Acc = MIRBuilder
+ .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
+ .getReg(0);
+ }
+ }
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
+ LLT SrcTy, LLT NarrowTy,
+ unsigned ScalarOpc) {
+ SmallVector<Register> SplitSrcs;
+ // Split the sources into NarrowTy size pieces.
+ extractParts(SrcReg, NarrowTy,
+ SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
+ // We're going to do a tree reduction using vector operations until we have
+ // one NarrowTy size value left.
+ while (SplitSrcs.size() > 1) {
+ SmallVector<Register> PartialRdxs;
+ for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
+ Register LHS = SplitSrcs[Idx];
+ Register RHS = SplitSrcs[Idx + 1];
+ // Create the intermediate vector op.
+ Register Res =
+ MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
+ PartialRdxs.push_back(Res);
+ }
+ SplitSrcs = std::move(PartialRdxs);
+ }
+ // Finally generate the requested NarrowTy based reduction.
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(SplitSrcs[0]);
+ Observer.changedInstr(MI);
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
+ const LLT HalfTy, const LLT AmtTy) {
+
+ Register InL = MRI.createGenericVirtualRegister(HalfTy);
+ Register InH = MRI.createGenericVirtualRegister(HalfTy);
+ MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
+
+ if (Amt.isZero()) {
+ MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH});
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ LLT NVT = HalfTy;
+ unsigned NVTBits = HalfTy.getSizeInBits();
+ unsigned VTBits = 2 * NVTBits;
+
+ SrcOp Lo(Register(0)), Hi(Register(0));
+ if (MI.getOpcode() == TargetOpcode::G_SHL) {
+ if (Amt.ugt(VTBits)) {
+ Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = MIRBuilder.buildConstant(NVT, 0);
+ Hi = MIRBuilder.buildShl(NVT, InL,
+ MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
+ } else if (Amt == NVTBits) {
+ Lo = MIRBuilder.buildConstant(NVT, 0);
+ Hi = InL;
+ } else {
+ Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
+ auto OrLHS =
+ MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
+ auto OrRHS = MIRBuilder.buildLShr(
+ NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
+ Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
+ }
+ } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
+ if (Amt.ugt(VTBits)) {
+ Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = MIRBuilder.buildLShr(NVT, InH,
+ MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
+ Hi = MIRBuilder.buildConstant(NVT, 0);
+ } else if (Amt == NVTBits) {
+ Lo = InH;
+ Hi = MIRBuilder.buildConstant(NVT, 0);
+ } else {
+ auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
+
+ auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
+ auto OrRHS = MIRBuilder.buildShl(
+ NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
+
+ Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
+ Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
+ }
+ } else {
+ if (Amt.ugt(VTBits)) {
+ Hi = Lo = MIRBuilder.buildAShr(
+ NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = MIRBuilder.buildAShr(NVT, InH,
+ MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
+ Hi = MIRBuilder.buildAShr(NVT, InH,
+ MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
+ } else if (Amt == NVTBits) {
+ Lo = InH;
+ Hi = MIRBuilder.buildAShr(NVT, InH,
+ MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
+ } else {
+ auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
+
+ auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
+ auto OrRHS = MIRBuilder.buildShl(
+ NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
+
+ Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
+ Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
+ }
+ }
+
+ MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi});
+ MI.eraseFromParent();
+
+ return Legalized;
+}
+
+// TODO: Optimize if constant shift amount.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
+ LLT RequestedTy) {
+ if (TypeIdx == 1) {
+ Observer.changingInstr(MI);
+ narrowScalarSrc(MI, RequestedTy, 2);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.isVector())
+ return UnableToLegalize;
+
+ Register Amt = MI.getOperand(2).getReg();
+ LLT ShiftAmtTy = MRI.getType(Amt);
+ const unsigned DstEltSize = DstTy.getScalarSizeInBits();
+ if (DstEltSize % 2 != 0)
+ return UnableToLegalize;
+
+ // Ignore the input type. We can only go to exactly half the size of the
+ // input. If that isn't small enough, the resulting pieces will be further
+ // legalized.
+ const unsigned NewBitSize = DstEltSize / 2;
+ const LLT HalfTy = LLT::scalar(NewBitSize);
+ const LLT CondTy = LLT::scalar(1);
+
+ if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
+ return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
+ ShiftAmtTy);
+ }
+
+ // TODO: Expand with known bits.
+
+ // Handle the fully general expansion by an unknown amount.
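+ // For illustration, a 64-bit shift split into 32-bit halves: if Amt < 32
+ // (IsShort), a G_SHL computes Lo = InL << Amt and
+ // Hi = (InH << Amt) | (InL >> (32 - Amt)); otherwise Lo = 0 and
+ // Hi = InL << (Amt - 32). The extra IsZero select forwards InH unchanged when
+ // Amt == 0, where the (32 - Amt) shift amount would be out of range.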
+ auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
+
+ Register InL = MRI.createGenericVirtualRegister(HalfTy);
+ Register InH = MRI.createGenericVirtualRegister(HalfTy);
+ MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
+
+ auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
+ auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
+
+ auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
+ auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
+ auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
+
+ Register ResultRegs[2];
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_SHL: {
+ // Short: ShAmt < NewBitSize
+ auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
+
+ auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
+ auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
+ auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
+
+ // Long: ShAmt >= NewBitSize
+ auto LoL = MIRBuilder.buildConstant(HalfTy, 0); // Lo part is zero.
+ auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
+
+ auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
+ auto Hi = MIRBuilder.buildSelect(
+ HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
+
+ ResultRegs[0] = Lo.getReg(0);
+ ResultRegs[1] = Hi.getReg(0);
+ break;
+ }
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR: {
+ // Short: ShAmt < NewBitSize
+ auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
+
+ auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
+ auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
+ auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
+
+ // Long: ShAmt >= NewBitSize
+ MachineInstrBuilder HiL;
+ if (MI.getOpcode() == TargetOpcode::G_LSHR) {
+ HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero.
+ } else {
+ auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
+ HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part.
+ }
+ auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
+ {InH, AmtExcess}); // Lo from Hi part.
+
+ auto Lo = MIRBuilder.buildSelect(
+ HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
+
+ auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
+
+ ResultRegs[0] = Lo.getReg(0);
+ ResultRegs[1] = Hi.getReg(0);
+ break;
+ }
+ default:
+ llvm_unreachable("not a shift");
+ }
+
+ MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
+ LLT MoreTy) {
+ assert(TypeIdx == 0 && "Expecting only Idx 0");
+
+ Observer.changingInstr(MI);
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
+ MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
+ moreElementsVectorSrc(MI, MoreTy, I);
+ }
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
+ moreElementsVectorDst(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
+ LLT MoreTy) {
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_LOAD: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ moreElementsVectorDst(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_STORE:
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR:
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_UADDSAT:
+ case TargetOpcode::G_USUBSAT:
+ case TargetOpcode::G_SADDSAT:
+ case TargetOpcode::G_SSUBSAT:
+ case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SMAX:
+ case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UMAX:
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMAXNUM_IEEE:
+ case TargetOpcode::G_FMINIMUM:
+ case TargetOpcode::G_FMAXIMUM:
+ case TargetOpcode::G_STRICT_FADD:
+ case TargetOpcode::G_STRICT_FSUB:
+ case TargetOpcode::G_STRICT_FMUL: {
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, MoreTy, 1);
+ moreElementsVectorSrc(MI, MoreTy, 2);
+ moreElementsVectorDst(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_FMA:
+ case TargetOpcode::G_STRICT_FMA:
+ case TargetOpcode::G_FSHR:
+ case TargetOpcode::G_FSHL: {
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, MoreTy, 1);
+ moreElementsVectorSrc(MI, MoreTy, 2);
+ moreElementsVectorSrc(MI, MoreTy, 3);
+ moreElementsVectorDst(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+ case TargetOpcode::G_EXTRACT:
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, MoreTy, 1);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_INSERT:
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ case TargetOpcode::G_FREEZE:
+ case TargetOpcode::G_FNEG:
+ case TargetOpcode::G_FABS:
+ case TargetOpcode::G_BSWAP:
+ case TargetOpcode::G_FCANONICALIZE:
+ case TargetOpcode::G_SEXT_INREG:
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, MoreTy, 1);
+ moreElementsVectorDst(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ case TargetOpcode::G_SELECT: {
+ auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
+ if (TypeIdx == 1) {
+ if (!CondTy.isScalar() ||
+ DstTy.getElementCount() != MoreTy.getElementCount())
+ return UnableToLegalize;
+
+ // This is turning a scalar select of vectors into a vector
+ // select. Broadcast the select condition.
+ auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(ShufSplat.getReg(0));
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ if (CondTy.isVector())
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, MoreTy, 2);
+ moreElementsVectorSrc(MI, MoreTy, 3);
+ moreElementsVectorDst(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_UNMERGE_VALUES:
+ return UnableToLegalize;
+ case TargetOpcode::G_PHI:
+ return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
+ case TargetOpcode::G_BUILD_VECTOR: {
+ SmallVector<SrcOp, 8> Elts;
+ for (auto Op : MI.uses()) {
+ Elts.push_back(Op.getReg());
+ }
+
+ for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
+ Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
+ }
+
+ MIRBuilder.buildDeleteTrailingVectorElements(
+ MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_TRUNC: {
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, MoreTy, 1);
+ moreElementsVectorDst(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_FPTRUNC:
+ case TargetOpcode::G_FPEXT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ LLT SrcTy = LLT::fixed_vector(
+ MoreTy.getNumElements(),
+ MRI.getType(MI.getOperand(1).getReg()).getElementType());
+ moreElementsVectorSrc(MI, SrcTy, 1);
+ moreElementsVectorDst(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ default:
+ return UnableToLegalize;
+ }
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ unsigned MaskNumElts = Mask.size();
+ unsigned SrcNumElts = SrcTy.getNumElements();
+ LLT DestEltTy = DstTy.getElementType();
+
+ if (MaskNumElts == SrcNumElts)
+ return Legalized;
+
+ if (MaskNumElts < SrcNumElts) {
+ // Extend mask to match new destination vector size with
+ // undef values.
+ SmallVector<int, 16> NewMask(Mask);
+ for (unsigned I = MaskNumElts; I < SrcNumElts; ++I)
+ NewMask.push_back(-1);
+
+ moreElementsVectorDst(MI, SrcTy, 0);
+ MIRBuilder.setInstrAndDebugLoc(MI);
+ MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg(),
+ MI.getOperand(2).getReg(), NewMask);
+ MI.eraseFromParent();
+
+ return Legalized;
+ }
+
+ unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
+ unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
+ LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);
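+ // E.g. a 6-element mask over <4 x s32> sources pads to 8 elements: each
+ // source is concatenated with one undef <4 x s32> (NumConcat == 2), mask
+ // indices >= 4 are shifted up by 4, and the two extra result elements are
+ // discarded by the extract loop below.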
+
+ // Create new source vectors by concatenating the initial
+ // source vectors with undefined vectors of the same size.
+ auto Undef = MIRBuilder.buildUndef(SrcTy);
+ SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
+ SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
+ MOps1[0] = MI.getOperand(1).getReg();
+ MOps2[0] = MI.getOperand(2).getReg();
+
+ auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
+ auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);
+
+ // Readjust mask for new input vector length.
+ SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
+ for (unsigned I = 0; I != MaskNumElts; ++I) {
+ int Idx = Mask[I];
+ if (Idx >= static_cast<int>(SrcNumElts))
+ Idx += PaddedMaskNumElts - SrcNumElts;
+ MappedOps[I] = Idx;
+ }
+
+ // If we got more elements than required, extract subvector.
+ if (MaskNumElts != PaddedMaskNumElts) {
+ auto Shuffle =
+ MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);
+
+ SmallVector<Register, 16> Elts(MaskNumElts);
+ for (unsigned I = 0; I < MaskNumElts; ++I) {
+ Elts[I] =
+ MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
+ .getReg(0);
+ }
+ MIRBuilder.buildBuildVector(DstReg, Elts);
+ } else {
+ MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
+ }
+
+ MI.eraseFromParent();
+ return LegalizerHelper::LegalizeResult::Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
+ unsigned int TypeIdx, LLT MoreTy) {
+ auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ unsigned NumElts = DstTy.getNumElements();
+ unsigned WidenNumElts = MoreTy.getNumElements();
+
+ if (DstTy.isVector() && Src1Ty.isVector() &&
+ DstTy.getNumElements() != Src1Ty.getNumElements()) {
+ return equalizeVectorShuffleLengths(MI);
+ }
+
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ // Expect a canonicalized shuffle.
+ if (DstTy != Src1Ty || DstTy != Src2Ty)
+ return UnableToLegalize;
+
+ moreElementsVectorSrc(MI, MoreTy, 1);
+ moreElementsVectorSrc(MI, MoreTy, 2);
+
+ // Adjust mask based on new input vector length.
+ SmallVector<int, 16> NewMask;
+ for (unsigned I = 0; I != NumElts; ++I) {
+ int Idx = Mask[I];
+ if (Idx < static_cast<int>(NumElts))
+ NewMask.push_back(Idx);
+ else
+ NewMask.push_back(Idx - NumElts + WidenNumElts);
+ }
+ for (unsigned I = NumElts; I != WidenNumElts; ++I)
+ NewMask.push_back(-1);
+ moreElementsVectorDst(MI, MoreTy, 0);
+ MIRBuilder.setInstrAndDebugLoc(MI);
+ MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg(),
+ MI.getOperand(2).getReg(), NewMask);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
+ ArrayRef<Register> Src1Regs,
+ ArrayRef<Register> Src2Regs,
+ LLT NarrowTy) {
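+ // Schoolbook multiplication on NarrowTy-sized limbs. For example, with source
+ // parts {a1, a0} x {b1, b0} and two destination parts this emits
+ // Dst[0] = mul(a0, b0) and Dst[1] = mul(a1, b0) + mul(a0, b1) + umulh(a0, b0);
+ // carries out of the top part are dropped since no higher part is produced.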
+ MachineIRBuilder &B = MIRBuilder;
+ unsigned SrcParts = Src1Regs.size();
+ unsigned DstParts = DstRegs.size();
+
+ unsigned DstIdx = 0; // Low bits of the result.
+ Register FactorSum =
+ B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
+ DstRegs[DstIdx] = FactorSum;
+
+ unsigned CarrySumPrevDstIdx;
+ SmallVector<Register, 4> Factors;
+
+ for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
+ // Collect low parts of muls for DstIdx.
+ for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
+ i <= std::min(DstIdx, SrcParts - 1); ++i) {
+ MachineInstrBuilder Mul =
+ B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
+ Factors.push_back(Mul.getReg(0));
+ }
+ // Collect high parts of muls from previous DstIdx.
+ for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
+ i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
+ MachineInstrBuilder Umulh =
+ B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
+ Factors.push_back(Umulh.getReg(0));
+ }
+ // Add CarrySum from additions calculated for previous DstIdx.
+ if (DstIdx != 1) {
+ Factors.push_back(CarrySumPrevDstIdx);
+ }
+
+ Register CarrySum;
+ // Add all factors and accumulate all carries into CarrySum.
+ if (DstIdx != DstParts - 1) {
+ MachineInstrBuilder Uaddo =
+ B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
+ FactorSum = Uaddo.getReg(0);
+ CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
+ for (unsigned i = 2; i < Factors.size(); ++i) {
+ MachineInstrBuilder Uaddo =
+ B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
+ FactorSum = Uaddo.getReg(0);
+ MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
+ CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
+ }
+ } else {
+ // The value for the next index is not calculated, so neither is CarrySum.
+ FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
+ for (unsigned i = 2; i < Factors.size(); ++i)
+ FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
+ }
+
+ CarrySumPrevDstIdx = CarrySum;
+ DstRegs[DstIdx] = FactorSum;
+ Factors.clear();
+ }
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstType = MRI.getType(DstReg);
+ // FIXME: add support for vector types
+ if (DstType.isVector())
+ return UnableToLegalize;
+
+ unsigned Opcode = MI.getOpcode();
+ unsigned OpO, OpE, OpF;
+ switch (Opcode) {
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SADDE:
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_UADDE:
+ case TargetOpcode::G_ADD:
+ OpO = TargetOpcode::G_UADDO;
+ OpE = TargetOpcode::G_UADDE;
+ OpF = TargetOpcode::G_UADDE;
+ if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
+ OpF = TargetOpcode::G_SADDE;
+ break;
+ case TargetOpcode::G_SSUBO:
+ case TargetOpcode::G_SSUBE:
+ case TargetOpcode::G_USUBO:
+ case TargetOpcode::G_USUBE:
+ case TargetOpcode::G_SUB:
+ OpO = TargetOpcode::G_USUBO;
+ OpE = TargetOpcode::G_USUBE;
+ OpF = TargetOpcode::G_USUBE;
+ if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
+ OpF = TargetOpcode::G_SSUBE;
+ break;
+ default:
+ llvm_unreachable("Unexpected add/sub opcode!");
+ }
+
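+ // For example, narrowing a 128-bit G_ADD to 64-bit parts emits a G_UADDO for
+ // the low half and a G_UADDE (consuming the low half's carry-out) for the
+ // high half; the signed overflow variants only differ in using G_SADDE or
+ // G_SSUBE for the final part (OpF).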
+ // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
+ unsigned NumDefs = MI.getNumExplicitDefs();
+ Register Src1 = MI.getOperand(NumDefs).getReg();
+ Register Src2 = MI.getOperand(NumDefs + 1).getReg();
+ Register CarryDst, CarryIn;
+ if (NumDefs == 2)
+ CarryDst = MI.getOperand(1).getReg();
+ if (MI.getNumOperands() == NumDefs + 3)
+ CarryIn = MI.getOperand(NumDefs + 2).getReg();
+
+ LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT LeftoverTy, DummyTy;
+ SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
+ extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
+ extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
+
+ int NarrowParts = Src1Regs.size();
+ for (int I = 0, E = Src1Left.size(); I != E; ++I) {
+ Src1Regs.push_back(Src1Left[I]);
+ Src2Regs.push_back(Src2Left[I]);
+ }
+ DstRegs.reserve(Src1Regs.size());
+
+ for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
+ Register DstReg =
+ MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
+ Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
+ // Forward the final carry-out to the destination register
+ if (i == e - 1 && CarryDst)
+ CarryOut = CarryDst;
+
+ if (!CarryIn) {
+ MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
+ {Src1Regs[i], Src2Regs[i]});
+ } else if (i == e - 1) {
+ MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
+ {Src1Regs[i], Src2Regs[i], CarryIn});
+ } else {
+ MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
+ {Src1Regs[i], Src2Regs[i], CarryIn});
+ }
+
+ DstRegs.push_back(DstReg);
+ CarryIn = CarryOut;
+ }
+ insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
+ ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
+ ArrayRef(DstRegs).drop_front(NarrowParts));
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
+ auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
+
+ LLT Ty = MRI.getType(DstReg);
+ if (Ty.isVector())
+ return UnableToLegalize;
+
+ unsigned Size = Ty.getSizeInBits();
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ if (Size % NarrowSize != 0)
+ return UnableToLegalize;
+
+ unsigned NumParts = Size / NarrowSize;
+ bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
+ unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
+
+ SmallVector<Register, 2> Src1Parts, Src2Parts;
+ SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
+ extractParts(Src1, NarrowTy, NumParts, Src1Parts);
+ extractParts(Src2, NarrowTy, NumParts, Src2Parts);
+ multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
+
+ // Take only high half of registers if this is high mul.
+ ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
+ MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
+
+ Register Src = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(Src);
+
+ // If all finite floats fit into the narrowed integer type, we can just swap
+ // out the result type. This is practically only useful for conversions from
+ // half to at least 16-bits, so just handle the one case.
+ if (SrcTy.getScalarType() != LLT::scalar(16) ||
+ NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ narrowScalarDst(MI, NarrowTy, 0,
+ IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ uint64_t NarrowSize = NarrowTy.getSizeInBits();
+
+ int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ // FIXME: add support for when SizeOp1 isn't an exact multiple of
+ // NarrowSize.
+ if (SizeOp1 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp1 / NarrowSize;
+
+ SmallVector<Register, 2> SrcRegs, DstRegs;
+ SmallVector<uint64_t, 2> Indexes;
+ extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+
+ Register OpReg = MI.getOperand(0).getReg();
+ uint64_t OpStart = MI.getOperand(2).getImm();
+ uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned SrcStart = i * NarrowSize;
+
+ if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
+ // No part of the extract uses this subregister, ignore it.
+ continue;
+ } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
+ // The entire subregister is extracted, forward the value.
+ DstRegs.push_back(SrcRegs[i]);
+ continue;
+ }
+
+ // Compute how the extracted range overlaps this narrow part: ExtractOffset
+ // is the offset of the overlap within the part, and SegSize is the number of
+ // overlapping bits.
+ int64_t ExtractOffset;
+ uint64_t SegSize;
+ if (OpStart < SrcStart) {
+ ExtractOffset = 0;
+ SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
+ } else {
+ ExtractOffset = OpStart - SrcStart;
+ SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
+ }
+
+ Register SegReg = SrcRegs[i];
+ if (ExtractOffset != 0 || SegSize != NarrowSize) {
+ // A genuine extract is needed.
+ SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
+ MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
+ }
+
+ DstRegs.push_back(SegReg);
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ if (MRI.getType(DstReg).isVector())
+ MIRBuilder.buildBuildVector(DstReg, DstRegs);
+ else if (DstRegs.size() > 1)
+ MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
+ else
+ MIRBuilder.buildCopy(DstReg, DstRegs[0]);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ // FIXME: Don't know how to handle secondary types yet.
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
+ SmallVector<uint64_t, 2> Indexes;
+ LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
+ LLT LeftoverTy;
+ extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
+ LeftoverRegs);
+
+ for (Register Reg : LeftoverRegs)
+ SrcRegs.push_back(Reg);
+
+ uint64_t NarrowSize = NarrowTy.getSizeInBits();
+ Register OpReg = MI.getOperand(2).getReg();
+ uint64_t OpStart = MI.getOperand(3).getImm();
+ uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
+ for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
+ unsigned DstStart = I * NarrowSize;
+
+ if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
+ // The entire subregister is defined by this insert, forward the new
+ // value.
+ DstRegs.push_back(OpReg);
+ continue;
+ }
+
+ Register SrcReg = SrcRegs[I];
+ if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
+ // The leftover reg is smaller than NarrowTy, so we need to extend it.
+ SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
+ }
+
+ if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
+ // No part of the insert affects this subregister, forward the original.
+ DstRegs.push_back(SrcReg);
+ continue;
+ }
+
+ // Compute how the inserted value overlaps this narrow part: ExtractOffset is
+ // the offset within OpReg to take bits from, InsertOffset is where they land
+ // in this part, and SegSize is the number of overlapping bits.
+ int64_t ExtractOffset, InsertOffset;
+ uint64_t SegSize;
+ if (OpStart < DstStart) {
+ InsertOffset = 0;
+ ExtractOffset = DstStart - OpStart;
+ SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
+ } else {
+ InsertOffset = OpStart - DstStart;
+ ExtractOffset = 0;
+ SegSize =
+ std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
+ }
+
+ Register SegReg = OpReg;
+ if (ExtractOffset != 0 || SegSize != OpSize) {
+ // A genuine extract is needed.
+ SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
+ MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
+ }
+
+ Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
+ DstRegs.push_back(DstReg);
+ }
+
+ uint64_t WideSize = DstRegs.size() * NarrowSize;
+ Register DstReg = MI.getOperand(0).getReg();
+ if (WideSize > RegTy.getSizeInBits()) {
+ Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
+ MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
+ MIRBuilder.buildTrunc(DstReg, MergeReg);
+ } else
+ MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ assert(MI.getNumOperands() == 3 && TypeIdx == 0);
+
+ SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
+ SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
+ SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
+ LLT LeftoverTy;
+ if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
+ Src0Regs, Src0LeftoverRegs))
+ return UnableToLegalize;
+
+ LLT Unused;
+ if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
+ Src1Regs, Src1LeftoverRegs))
+ llvm_unreachable("inconsistent extractParts result");
+
+ for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
+ auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
+ {Src0Regs[I], Src1Regs[I]});
+ DstRegs.push_back(Inst.getReg(0));
+ }
+
+ for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
+ auto Inst = MIRBuilder.buildInstr(
+ MI.getOpcode(),
+ {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
+ DstLeftoverRegs.push_back(Inst.getReg(0));
+ }
+
+ insertParts(DstReg, DstTy, NarrowTy, DstRegs,
+ LeftoverTy, DstLeftoverRegs);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ auto [DstReg, SrcReg] = MI.getFirst2Regs();
+
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.isVector())
+ return UnableToLegalize;
+
+ SmallVector<Register, 8> Parts;
+ LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
+ LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
+ buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Register CondReg = MI.getOperand(1).getReg();
+ LLT CondTy = MRI.getType(CondReg);
+ if (CondTy.isVector()) // TODO: Handle vselect
+ return UnableToLegalize;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
+ SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
+ SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
+ LLT LeftoverTy;
+ if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
+ Src1Regs, Src1LeftoverRegs))
+ return UnableToLegalize;
+
+ LLT Unused;
+ if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
+ Src2Regs, Src2LeftoverRegs))
+ llvm_unreachable("inconsistent extractParts result");
+
+ for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
+ auto Select = MIRBuilder.buildSelect(NarrowTy,
+ CondReg, Src1Regs[I], Src2Regs[I]);
+ DstRegs.push_back(Select.getReg(0));
+ }
+
+ for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
+ auto Select = MIRBuilder.buildSelect(
+ LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
+ DstLeftoverRegs.push_back(Select.getReg(0));
+ }
+
+ insertParts(DstReg, DstTy, NarrowTy, DstRegs,
+ LeftoverTy, DstLeftoverRegs);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+
+ if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
+ const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
+
+ MachineIRBuilder &B = MIRBuilder;
+ auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
+ // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
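+ // E.g. for a 64-bit source split into 32-bit halves, Hi == 0 and Lo == 1
+ // gives 32 + ctlz(1) == 32 + 31 == 63, matching ctlz of the full value.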
+ auto C_0 = B.buildConstant(NarrowTy, 0);
+ auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
+ UnmergeSrc.getReg(1), C_0);
+ auto LoCTLZ = IsUndef ?
+ B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
+ B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
+ auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
+ auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
+ auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
+ B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+
+ if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
+ const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
+
+ MachineIRBuilder &B = MIRBuilder;
+ auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
+ // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
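+ // E.g. for a 64-bit source split into 32-bit halves, Lo == 0 and Hi == 1
+ // gives cttz(1) + 32 == 0 + 32 == 32, matching cttz of the full value.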
+ auto C_0 = B.buildConstant(NarrowTy, 0);
+ auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
+ UnmergeSrc.getReg(0), C_0);
+ auto HiCTTZ = IsUndef ?
+ B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
+ B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
+ auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
+ auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
+ auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
+ B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+
+ if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
+ auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
+
+ auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
+ auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
+ MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ MachineIRBuilder &B = MIRBuilder;
+ Register ExpReg = MI.getOperand(2).getReg();
+ LLT ExpTy = MRI.getType(ExpReg);
+
+ unsigned ClampSize = NarrowTy.getScalarSizeInBits();
+
+ // Clamp the exponent to the range of the target type.
+ auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
+ auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
+ auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
+ auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
+
+ auto Trunc = B.buildTrunc(NarrowTy, Clamp);
+ Observer.changingInstr(MI);
+ MI.getOperand(2).setReg(Trunc.getReg(0));
+ Observer.changedInstr(MI);
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerBitCount(MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ const auto &TII = MIRBuilder.getTII();
+ auto isSupported = [this](const LegalityQuery &Q) {
+ auto QAction = LI.getAction(Q).Action;
+ return QAction == Legal || QAction == Libcall || QAction == Custom;
+ };
+ switch (Opc) {
+ default:
+ return UnableToLegalize;
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
+ // This trivially expands to CTLZ.
+ Observer.changingInstr(MI);
+ MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_CTLZ: {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ unsigned Len = SrcTy.getSizeInBits();
+
+ if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
+ // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
+ auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
+ auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
+ auto ICmp = MIRBuilder.buildICmp(
+ CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
+ auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
+ MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ // for now, we do this:
+ // NewLen = NextPowerOf2(Len);
+ // x = x | (x >> 1);
+ // x = x | (x >> 2);
+ // ...
+ // x = x | (x >>16);
+ // x = x | (x >>32); // for 64-bit input
+ // Up to NewLen/2
+ // return Len - popcount(x);
+ //
+ // Ref: "Hacker's Delight" by Henry Warren
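+ // E.g. for a 16-bit x = 0x0100 the or-shift smearing yields 0x01FF,
+ // popcount(0x01FF) == 9, and 16 - 9 == 7 == ctlz(0x0100).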
+ Register Op = SrcReg;
+ unsigned NewLen = PowerOf2Ceil(Len);
+ for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
+ auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
+ auto MIBOp = MIRBuilder.buildOr(
+ SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
+ Op = MIBOp.getReg(0);
+ }
+ auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
+ MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
+ MIBPop);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
+ // This trivially expands to CTTZ.
+ Observer.changingInstr(MI);
+ MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_CTTZ: {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+
+ unsigned Len = SrcTy.getSizeInBits();
+ if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
+ // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
+ // zero.
+ auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
+ auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
+ auto ICmp = MIRBuilder.buildICmp(
+ CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
+ auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
+ MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ // for now, we use: { return popcount(~x & (x - 1)); }
+ // unless the target has ctlz but not ctpop, in which case we use:
+ // { return 32 - nlz(~x & (x-1)); }
+ // Ref: "Hacker's Delight" by Henry Warren
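+ // E.g. x = 0b01100: ~x & (x - 1) == 0b00011, and popcount(0b00011) == 2 ==
+ // cttz(x).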
+ auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
+ auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
+ auto MIBTmp = MIRBuilder.buildAnd(
+ SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
+ if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
+ isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
+ auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
+ MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
+ MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
+ MI.getOperand(1).setReg(MIBTmp.getReg(0));
+ return Legalized;
+ }
+ case TargetOpcode::G_CTPOP: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(SrcReg);
+ unsigned Size = Ty.getSizeInBits();
+ MachineIRBuilder &B = MIRBuilder;
+
+ // Count set bits in blocks of 2 bits. Default approach would be
+ // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
+ // We use the following formula instead:
+ // B2Count = val - { (val >> 1) & 0x55555555 }
+ // since it gives the same result in blocks of 2 with one instruction fewer.
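+ // E.g. for a single 2-bit block val == 0b11: val - ((val >> 1) & 0b01) ==
+ // 3 - 1 == 2, the number of set bits.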
+ auto C_1 = B.buildConstant(Ty, 1);
+ auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
+ APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
+ auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
+ auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
+ auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
+
+ // To get the count in blocks of 4, add the values from adjacent blocks of 2.
+ // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
+ auto C_2 = B.buildConstant(Ty, 2);
+ auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
+ APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
+ auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
+ auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
+ auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
+ auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
+
+ // For the count in blocks of 8 bits we don't have to mask the high 4 bits
+ // before the addition, since the count value sits in the range {0,...,8} and
+ // 4 bits are enough to hold it. After the addition the high 4 bits still hold
+ // a count of set bits from the high 4-bit block; set them to zero to get the
+ // 8-bit result.
+ // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
+ auto C_4 = B.buildConstant(Ty, 4);
+ auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
+ auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
+ APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
+ auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
+ auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
+
+ assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
+ // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
+ // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
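+ // E.g. for Size == 32 with per-byte counts c3..c0 in B8Count, multiplying by
+ // 0x01010101 leaves c0 + c1 + c2 + c3 in the top byte, which the final shift
+ // moves into the low bits.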
+ auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
+ auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
+
+ // Shift count result from 8 high bits to low bits.
+ auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
+ B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ }
+}
+
+// Check that (every element of) Reg is undef or not an exact multiple of BW.
+static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
+ Register Reg, unsigned BW) {
+ return matchUnaryPredicate(
+ MRI, Reg,
+ [=](const Constant *C) {
+ // Null constant here means an undef.
+ const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
+ return !CI || CI->getValue().urem(BW) != 0;
+ },
+ /*AllowUndefs*/ true);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
+ auto [Dst, X, Y, Z] = MI.getFirst4Regs();
+ LLT Ty = MRI.getType(Dst);
+ LLT ShTy = MRI.getType(Z);
+
+ unsigned BW = Ty.getScalarSizeInBits();
+
+ if (!isPowerOf2_32(BW))
+ return UnableToLegalize;
+
+ const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+ unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
+
+ if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
+ // fshl X, Y, Z -> fshr X, Y, -Z
+ // fshr X, Y, Z -> fshl X, Y, -Z
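+ // E.g. with an 8-bit type and Z == 3, fshl X, Y, 3 becomes fshr X, Y, 5,
+ // since -3 is congruent to 5 modulo the bit width.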
+ auto Zero = MIRBuilder.buildConstant(ShTy, 0);
+ Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
+ } else {
+ // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
+ // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
+ auto One = MIRBuilder.buildConstant(ShTy, 1);
+ if (IsFSHL) {
+ Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
+ X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
+ } else {
+ X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
+ Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
+ }
+
+ Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
+ }
+
+ MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
+ auto [Dst, X, Y, Z] = MI.getFirst4Regs();
+ LLT Ty = MRI.getType(Dst);
+ LLT ShTy = MRI.getType(Z);
+
+ const unsigned BW = Ty.getScalarSizeInBits();
+ const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+
+ Register ShX, ShY;
+ Register ShAmt, InvShAmt;
+
+ // FIXME: Emit optimized urem by constant instead of letting it expand later.
+ if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
+ // fshl: X << C | Y >> (BW - C)
+ // fshr: X << (BW - C) | Y >> C
+ // where C = Z % BW is not zero
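+ // E.g. for BW == 32 and C == 8, fshl X, Y, 8 == (X << 8) | (Y >> 24).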
+ auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
+ ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
+ InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
+ ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
+ ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
+ } else {
+ // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+ // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
+ auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
+ if (isPowerOf2_32(BW)) {
+ // Z % BW -> Z & (BW - 1)
+ ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
+ // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
+ auto NotZ = MIRBuilder.buildNot(ShTy, Z);
+ InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
+ } else {
+ auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
+ ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
+ InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
+ }
+
+ auto One = MIRBuilder.buildConstant(ShTy, 1);
+ if (IsFSHL) {
+ ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
+ auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
+ ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
+ } else {
+ auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
+ ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
+ ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
+ }
+ }
+
+ MIRBuilder.buildOr(Dst, ShX, ShY);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
+ // These operations approximately do the following (while avoiding undefined
+ // shifts by BW):
+ // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+ // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
+
+ bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+ unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
+
+ // TODO: Use smarter heuristic that accounts for vector legalization.
+ if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
+ return lowerFunnelShiftAsShifts(MI);
+
+ // This only works for powers of 2, fallback to shifts if it fails.
+ LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
+ if (Result == UnableToLegalize)
+ return lowerFunnelShiftAsShifts(MI);
+ return Result;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
+ auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
+ auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
+ bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
+ unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
+ auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
+ MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
+ auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
+
+ unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
+ bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
+
+ MIRBuilder.setInstrAndDebugLoc(MI);
+
+ // If a rotate in the other direction is supported, use it.
+ unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
+ if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
+ isPowerOf2_32(EltSizeInBits))
+ return lowerRotateWithReverseRotate(MI);
+
+ // If a funnel shift is supported, use it.
+ unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
+ unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
+ bool IsFShLegal = false;
+ if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
+ LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
+ auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
+ Register R3) {
+ MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
+ MI.eraseFromParent();
+ return Legalized;
+ };
+ // If a funnel shift in the other direction is supported, use it.
+ if (IsFShLegal) {
+ return buildFunnelShift(FShOpc, Dst, Src, Amt);
+ } else if (isPowerOf2_32(EltSizeInBits)) {
+ Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
+ return buildFunnelShift(RevFsh, Dst, Src, Amt);
+ }
+ }
+
+ auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
+ unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
+ unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
+ auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
+ Register ShVal;
+ Register RevShiftVal;
+ if (isPowerOf2_32(EltSizeInBits)) {
+ // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
+ // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
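+ // E.g. for w == 32 and c == 40, rotl x, 40 == (x << 8) | (x >> 24), since
+ // 40 & 31 == 8 and -40 & 31 == 24.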
+ auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
+ auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
+ ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
+ auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
+ RevShiftVal =
+ MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
+ } else {
+ // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
+ // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
+ auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
+ auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
+ ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
+ auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
+ auto One = MIRBuilder.buildConstant(AmtTy, 1);
+ auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
+ RevShiftVal =
+ MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
+ }
+ MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
+// representation.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
+ auto [Dst, Src] = MI.getFirst2Regs();
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S1 = LLT::scalar(1);
+
+ assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
+
+ // unsigned cul2f(ulong u) {
+ // uint lz = clz(u);
+ // uint e = (u != 0) ? 127U + 63U - lz : 0;
+ // u = (u << lz) & 0x7fffffffffffffffUL;
+ // ulong t = u & 0xffffffffffUL;
+ // uint v = (e << 23) | (uint)(u >> 40);
+ // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
+ // return as_float(v + r);
+ // }
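+  // Illustrative walk-through for u = 1: lz = 63, e = 127 + 63 - 63 = 127,
+  // (u << lz) & 0x7fff... is 0, so t = 0, v = 127 << 23 = 0x3f800000, r = 0,
+  // and the result is as_float(0x3f800000) = 1.0f as expected.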
+
+ auto Zero32 = MIRBuilder.buildConstant(S32, 0);
+ auto Zero64 = MIRBuilder.buildConstant(S64, 0);
+
+ auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
+
+ auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
+ auto Sub = MIRBuilder.buildSub(S32, K, LZ);
+
+ auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
+ auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
+
+ auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
+ auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
+
+ auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
+
+ auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
+ auto T = MIRBuilder.buildAnd(S64, U, Mask1);
+
+ auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
+ auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
+ auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
+
+ auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
+ auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
+ auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
+ auto One = MIRBuilder.buildConstant(S32, 1);
+
+ auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
+ auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
+ auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
+ MIRBuilder.buildAdd(Dst, V, R);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+
+ if (SrcTy == LLT::scalar(1)) {
+ auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
+ auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
+ MIRBuilder.buildSelect(Dst, Src, True, False);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (SrcTy != LLT::scalar(64))
+ return UnableToLegalize;
+
+ if (DstTy == LLT::scalar(32)) {
+ // TODO: SelectionDAG has several alternative expansions to port which may
+    // be more reasonable depending on the available instructions. If a target
+ // has sitofp, does not have CTLZ, or can efficiently use f64 as an
+ // intermediate type, this is probably worse.
+ return lowerU64ToF32BitOps(MI);
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S1 = LLT::scalar(1);
+
+ if (SrcTy == S1) {
+ auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
+ auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
+ MIRBuilder.buildSelect(Dst, Src, True, False);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (SrcTy != S64)
+ return UnableToLegalize;
+
+ if (DstTy == S32) {
+ // signed cl2f(long l) {
+ // long s = l >> 63;
+ // float r = cul2f((l + s) ^ s);
+ // return s ? -r : r;
+ // }
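+    // E.g. for l = -1: s = -1, (l + s) ^ s = (-2) ^ (-1) = 1, cul2f(1) = 1.0f,
+    // and since s != 0 the result is negated to -1.0f.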
+ Register L = Src;
+ auto SignBit = MIRBuilder.buildConstant(S64, 63);
+ auto S = MIRBuilder.buildAShr(S64, L, SignBit);
+
+ auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
+ auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
+ auto R = MIRBuilder.buildUITOFP(S32, Xor);
+
+ auto RNeg = MIRBuilder.buildFNeg(S32, R);
+ auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
+ MIRBuilder.buildConstant(S64, 0));
+ MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ if (SrcTy != S64 && SrcTy != S32)
+ return UnableToLegalize;
+ if (DstTy != S32 && DstTy != S64)
+ return UnableToLegalize;
+
+ // FPTOSI gives same result as FPTOUI for positive signed integers.
+  // FPTOUI needs to deal with fp values that convert to unsigned integers
+  // greater than or equal to 2^31 for float or 2^63 for double; call this
+  // threshold 2^Exp for brevity.
+
+ APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
+ APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
+ : APFloat::IEEEdouble(),
+ APInt::getZero(SrcTy.getSizeInBits()));
+ TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
+
+ MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
+
+ MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
+  // For fp values greater than or equal to the Threshold (2^Exp), we use
+  // FPTOSI on (Value - 2^Exp) and add 2^Exp back by setting the highest bit
+  // of the result to 1.
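+  // Rough example for f32 -> u32: for a value near 3.0e9, FPTOSI(Value - 2^31)
+  // yields roughly 852516352, and XOR-ing in 0x80000000 restores ~3.0e9, which
+  // is unrepresentable as a signed i32 but fine as an unsigned one.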
+ MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
+ MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
+ MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
+ MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
+
+ const LLT S1 = LLT::scalar(1);
+
+ MachineInstrBuilder FCMP =
+ MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
+ MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ // FIXME: Only f32 to i64 conversions are supported.
+ if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
+ return UnableToLegalize;
+
+ // Expand f32 -> i64 conversion
+ // This algorithm comes from compiler-rt's implementation of fixsfdi:
+ // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
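+  // The magic numbers below follow the IEEE-754 single layout: 1 sign bit,
+  // 8 exponent bits (mask 0x7F800000, bias 127) and 23 mantissa bits
+  // (mask 0x007FFFFF), with 0x00800000 re-adding the implicit leading one.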
+
+ unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
+
+ auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
+ auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
+
+ auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
+ auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
+
+ auto SignMask = MIRBuilder.buildConstant(SrcTy,
+ APInt::getSignMask(SrcEltBits));
+ auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
+ auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
+ auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
+ Sign = MIRBuilder.buildSExt(DstTy, Sign);
+
+ auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
+ auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
+ auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
+
+ auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
+ R = MIRBuilder.buildZExt(DstTy, R);
+
+ auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
+ auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
+ auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
+ auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
+
+ auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
+ auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
+
+ const LLT S1 = LLT::scalar(1);
+ auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
+ S1, Exponent, ExponentLoBit);
+
+ R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
+
+ auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
+ auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
+
+ auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
+
+ auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
+ S1, Exponent, ZeroSrcTy);
+
+ auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
+ MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+// f64 -> f16 conversion using round-to-nearest-even rounding mode.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+
+ auto [Dst, Src] = MI.getFirst2Regs();
+ assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
+ MRI.getType(Src).getScalarType() == LLT::scalar(64));
+
+ if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
+ return UnableToLegalize;
+
+ if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
+ unsigned Flags = MI.getFlags();
+ auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
+ MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ const unsigned ExpMask = 0x7ff;
+ const unsigned ExpBiasf64 = 1023;
+ const unsigned ExpBiasf16 = 15;
+
+ auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
+ Register U = Unmerge.getReg(0);
+ Register UH = Unmerge.getReg(1);
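+  // U holds the low 32 bits of the f64 value and UH the high 32 bits (sign,
+  // 11-bit exponent and the top 20 mantissa bits).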
+
+ auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
+ E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
+
+ // Subtract the fp64 exponent bias (1023) to get the real exponent and
+ // add the f16 bias (15) to get the biased exponent for the f16 format.
+ E = MIRBuilder.buildAdd(
+ S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
+
+ auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
+ M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
+
+ auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
+ MIRBuilder.buildConstant(S32, 0x1ff));
+ MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
+
+ auto Zero = MIRBuilder.buildConstant(S32, 0);
+ auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
+ auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
+ M = MIRBuilder.buildOr(S32, M, Lo40Set);
+
+ // (M != 0 ? 0x0200 : 0) | 0x7c00;
+ auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
+ auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
+ auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
+
+ auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
+ auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
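+  // I is the f16 pattern used when the input is Inf or NaN: 0x7c00 is the
+  // all-ones f16 exponent, and the extra 0x0200 bit keeps the mantissa nonzero
+  // so a NaN input stays a NaN rather than collapsing to infinity.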
+
+ // N = M | (E << 12);
+ auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
+ auto N = MIRBuilder.buildOr(S32, M, EShl12);
+
+ // B = clamp(1-E, 0, 13);
+ auto One = MIRBuilder.buildConstant(S32, 1);
+ auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
+ auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
+ B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
+
+ auto SigSetHigh = MIRBuilder.buildOr(S32, M,
+ MIRBuilder.buildConstant(S32, 0x1000));
+
+ auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
+ auto D0 = MIRBuilder.buildShl(S32, D, B);
+
+ auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
+ D0, SigSetHigh);
+ auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
+ D = MIRBuilder.buildOr(S32, D, D1);
+
+ auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
+ auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
+
+ auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
+ V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
+
+ auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
+ MIRBuilder.buildConstant(S32, 3));
+ auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
+
+ auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
+ MIRBuilder.buildConstant(S32, 5));
+ auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
+
+ V1 = MIRBuilder.buildOr(S32, V0, V1);
+ V = MIRBuilder.buildAdd(S32, V, V1);
+
+ auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
+ E, MIRBuilder.buildConstant(S32, 30));
+ V = MIRBuilder.buildSelect(S32, CmpEGt30,
+ MIRBuilder.buildConstant(S32, 0x7c00), V);
+
+ auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
+ E, MIRBuilder.buildConstant(S32, 1039));
+ V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
+
+ // Extract the sign bit.
+ auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
+ Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
+
+ // Insert the sign bit
+ V = MIRBuilder.buildOr(S32, Sign, V);
+
+ MIRBuilder.buildTrunc(Dst, V);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
+ auto [DstTy, SrcTy] = MI.getFirst2LLTs();
+ const LLT S64 = LLT::scalar(64);
+ const LLT S16 = LLT::scalar(16);
+
+ if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
+ return lowerFPTRUNC_F64_TO_F16(MI);
+
+ return UnableToLegalize;
+}
+
+// TODO: If RHS is a constant, SelectionDAGBuilder expands this into a
+// multiplication tree.
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
+ auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Dst);
+
+ auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
+ MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::G_SMIN:
+ return CmpInst::ICMP_SLT;
+ case TargetOpcode::G_SMAX:
+ return CmpInst::ICMP_SGT;
+ case TargetOpcode::G_UMIN:
+ return CmpInst::ICMP_ULT;
+ case TargetOpcode::G_UMAX:
+ return CmpInst::ICMP_UGT;
+ default:
+ llvm_unreachable("not in integer min/max");
+ }
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
+ auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+
+ const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
+ LLT CmpType = MRI.getType(Dst).changeElementSize(1);
+
+ auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
+ MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
+ auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
+ const int Src0Size = Src0Ty.getScalarSizeInBits();
+ const int Src1Size = Src1Ty.getScalarSizeInBits();
+
+ auto SignBitMask = MIRBuilder.buildConstant(
+ Src0Ty, APInt::getSignMask(Src0Size));
+
+ auto NotSignBitMask = MIRBuilder.buildConstant(
+ Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
+
+ Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
+ Register And1;
+ if (Src0Ty == Src1Ty) {
+ And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
+ } else if (Src0Size > Src1Size) {
+ auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
+ auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
+ auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
+ And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
+ } else {
+ auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
+ auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
+ auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
+ And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
+ }
+
+ // Be careful about setting nsz/nnan/ninf on every instruction, since the
+ // constants are a nan and -0.0, but the final result should preserve
+ // everything.
+ unsigned Flags = MI.getFlags();
+ MIRBuilder.buildOr(Dst, And0, And1, Flags);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
+ unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
+ TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
+
+ auto [Dst, Src0, Src1] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Dst);
+
+ if (!MI.getFlag(MachineInstr::FmNoNans)) {
+ // Insert canonicalizes if it's possible we need to quiet to get correct
+ // sNaN behavior.
+
+    // Note this must be done here, and not as an optimization combine, in the
+    // absence of a dedicated quiet-snan instruction, as we're using an
+    // omni-purpose G_FCANONICALIZE.
+ if (!isKnownNeverSNaN(Src0, MRI))
+ Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
+
+ if (!isKnownNeverSNaN(Src1, MRI))
+ Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
+ }
+
+ // If there are no nans, it's safe to simply replace this with the non-IEEE
+ // version.
+ MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
+ // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ unsigned Flags = MI.getFlags();
+
+ auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
+ Flags);
+ MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
+ auto [DstReg, X] = MI.getFirst2Regs();
+ const unsigned Flags = MI.getFlags();
+ const LLT Ty = MRI.getType(DstReg);
+ const LLT CondTy = Ty.changeElementSize(1);
+
+ // round(x) =>
+ // t = trunc(x);
+ // d = fabs(x - t);
+ // o = copysign(1.0f, x);
+ // return t + (d >= 0.5 ? o : 0.0);
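+  // E.g. x = 2.5: t = 2.0, d = 0.5, o = 1.0 => 3.0, and x = -2.5 gives -3.0,
+  // i.e. halfway cases round away from zero as round() requires.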
+
+ auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
+
+ auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
+ auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
+ auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
+ auto One = MIRBuilder.buildFConstant(Ty, 1.0);
+ auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
+ auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
+
+ auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
+ Flags);
+ auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
+
+ MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
+ auto [DstReg, SrcReg] = MI.getFirst2Regs();
+ unsigned Flags = MI.getFlags();
+ LLT Ty = MRI.getType(DstReg);
+ const LLT CondTy = Ty.changeElementSize(1);
+
+ // result = trunc(src);
+ // if (src < 0.0 && src != result)
+ // result += -1.0.
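+  // E.g. src = -1.25: trunc gives -1.0, and since src is negative and differs
+  // from the truncated value, adding -1.0 yields the floor -2.0.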
+
+ auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
+ auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
+
+ auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
+ SrcReg, Zero, Flags);
+ auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
+ SrcReg, Trunc, Flags);
+ auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
+ auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
+
+ MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
+ const unsigned NumOps = MI.getNumOperands();
+ auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
+ unsigned PartSize = Src0Ty.getSizeInBits();
+
+ LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
+ Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
+
+ for (unsigned I = 2; I != NumOps; ++I) {
+ const unsigned Offset = (I - 1) * PartSize;
+
+ Register SrcReg = MI.getOperand(I).getReg();
+ auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
+
+ Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
+ MRI.createGenericVirtualRegister(WideTy);
+
+ auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
+ auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
+ MIRBuilder.buildOr(NextResult, ResultReg, Shl);
+ ResultReg = NextResult;
+ }
+
+ if (DstTy.isPointer()) {
+ if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
+ DstTy.getAddressSpace())) {
+ LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
+ return UnableToLegalize;
+ }
+
+ MIRBuilder.buildIntToPtr(DstReg, ResultReg);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
+ const unsigned NumDst = MI.getNumOperands() - 1;
+ Register SrcReg = MI.getOperand(NumDst).getReg();
+ Register Dst0Reg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst0Reg);
+ if (DstTy.isPointer())
+ return UnableToLegalize; // TODO
+
+ SrcReg = coerceToScalar(SrcReg);
+ if (!SrcReg)
+ return UnableToLegalize;
+
+ // Expand scalarizing unmerge as bitcast to integer and shift.
+ LLT IntTy = MRI.getType(SrcReg);
+
+ MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
+
+ const unsigned DstSize = DstTy.getSizeInBits();
+ unsigned Offset = DstSize;
+ for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
+ auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
+ auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
+ MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+/// Lower a vector extract or insert by writing the vector to a stack temporary
+/// and reloading the element or vector.
+///
+/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
+/// =>
+/// %stack_temp = G_FRAME_INDEX
+/// G_STORE %vec, %stack_temp
+/// %idx = clamp(%idx, %vec.getNumElements())
+/// %element_ptr = G_PTR_ADD %stack_temp, %idx
+/// %dst = G_LOAD %element_ptr
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcVec = MI.getOperand(1).getReg();
+ Register InsertVal;
+ if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
+ InsertVal = MI.getOperand(2).getReg();
+
+ Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
+
+ LLT VecTy = MRI.getType(SrcVec);
+ LLT EltTy = VecTy.getElementType();
+ unsigned NumElts = VecTy.getNumElements();
+
+ int64_t IdxVal;
+ if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
+ SmallVector<Register, 8> SrcRegs;
+ extractParts(SrcVec, EltTy, NumElts, SrcRegs);
+
+ if (InsertVal) {
+ SrcRegs[IdxVal] = MI.getOperand(2).getReg();
+ MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
+ } else {
+ MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (!EltTy.isByteSized()) { // Not implemented.
+ LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
+ return UnableToLegalize;
+ }
+
+ unsigned EltBytes = EltTy.getSizeInBytes();
+ Align VecAlign = getStackTemporaryAlignment(VecTy);
+ Align EltAlign;
+
+ MachinePointerInfo PtrInfo;
+ auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
+ VecAlign, PtrInfo);
+ MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
+
+ // Get the pointer to the element, and be sure not to hit undefined behavior
+ // if the index is out of bounds.
+ Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
+
+ if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
+ int64_t Offset = IdxVal * EltBytes;
+ PtrInfo = PtrInfo.getWithOffset(Offset);
+ EltAlign = commonAlignment(VecAlign, Offset);
+ } else {
+ // We lose information with a variable offset.
+ EltAlign = getStackTemporaryAlignment(EltTy);
+ PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
+ }
+
+ if (InsertVal) {
+ // Write the inserted element
+ MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
+
+ // Reload the whole vector.
+ MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
+ } else {
+ MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
+ auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
+ MI.getFirst3RegLLTs();
+ LLT IdxTy = LLT::scalar(32);
+
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+
+ if (DstTy.isScalar()) {
+ if (Src0Ty.isVector())
+ return UnableToLegalize;
+
+ // This is just a SELECT.
+ assert(Mask.size() == 1 && "Expected a single mask element");
+ Register Val;
+ if (Mask[0] < 0 || Mask[0] > 1)
+ Val = MIRBuilder.buildUndef(DstTy).getReg(0);
+ else
+ Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
+ MIRBuilder.buildCopy(DstReg, Val);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ Register Undef;
+ SmallVector<Register, 32> BuildVec;
+ LLT EltTy = DstTy.getElementType();
+
+ for (int Idx : Mask) {
+ if (Idx < 0) {
+ if (!Undef.isValid())
+ Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
+ BuildVec.push_back(Undef);
+ continue;
+ }
+
+ if (Src0Ty.isScalar()) {
+ BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
+ } else {
+ int NumElts = Src0Ty.getNumElements();
+ Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
+ int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
+ auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
+ auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
+ BuildVec.push_back(Extract.getReg(0));
+ }
+ }
+
+ MIRBuilder.buildBuildVector(DstReg, BuildVec);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
+ const auto &MF = *MI.getMF();
+ const auto &TFI = *MF.getSubtarget().getFrameLowering();
+ if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
+ return UnableToLegalize;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register AllocSize = MI.getOperand(1).getReg();
+ Align Alignment = assumeAligned(MI.getOperand(2).getImm());
+
+ LLT PtrTy = MRI.getType(Dst);
+ LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
+ SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
+
+ // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
+ // have to generate an extra instruction to negate the alloc and then use
+ // G_PTR_ADD to add the negative offset.
+ auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
+ if (Alignment > Align(1)) {
+ APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
+ AlignMask.negate();
+ auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
+ Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
+ }
+
+ SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
+ MIRBuilder.buildCopy(SPReg, SPTmp);
+ MIRBuilder.buildCopy(Dst, SPTmp);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerExtract(MachineInstr &MI) {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ unsigned Offset = MI.getOperand(2).getImm();
+
+ // Extract sub-vector or one element
+ if (SrcTy.isVector()) {
+ unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
+ unsigned DstSize = DstTy.getSizeInBits();
+
+ if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
+ (Offset + DstSize <= SrcTy.getSizeInBits())) {
+ // Unmerge and allow access to each Src element for the artifact combiner.
+ auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);
+
+      // Take the element(s) we need to extract and copy them, merging if
+      // there is more than one.
+ SmallVector<Register, 8> SubVectorElts;
+ for (unsigned Idx = Offset / SrcEltSize;
+ Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
+ SubVectorElts.push_back(Unmerge.getReg(Idx));
+ }
+ if (SubVectorElts.size() == 1)
+ MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
+ else
+ MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ }
+
+ if (DstTy.isScalar() &&
+ (SrcTy.isScalar() ||
+ (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
+ LLT SrcIntTy = SrcTy;
+ if (!SrcTy.isScalar()) {
+ SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
+ SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
+ }
+
+ if (Offset == 0)
+ MIRBuilder.buildTrunc(DstReg, SrcReg);
+ else {
+ auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
+ auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
+ MIRBuilder.buildTrunc(DstReg, Shr);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
+ auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
+ uint64_t Offset = MI.getOperand(3).getImm();
+
+ LLT DstTy = MRI.getType(Src);
+ LLT InsertTy = MRI.getType(InsertSrc);
+
+ // Insert sub-vector or one element
+ if (DstTy.isVector() && !InsertTy.isPointer()) {
+ LLT EltTy = DstTy.getElementType();
+ unsigned EltSize = EltTy.getSizeInBits();
+ unsigned InsertSize = InsertTy.getSizeInBits();
+
+ if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
+ (Offset + InsertSize <= DstTy.getSizeInBits())) {
+ auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
+ SmallVector<Register, 8> DstElts;
+ unsigned Idx = 0;
+      // Elements from Src that come before the insert start Offset.
+ for (; Idx < Offset / EltSize; ++Idx) {
+ DstElts.push_back(UnmergeSrc.getReg(Idx));
+ }
+
+ // Replace elements in Src with elements from InsertSrc
+ if (InsertTy.getSizeInBits() > EltSize) {
+ auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
+ for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
+ ++Idx, ++i) {
+ DstElts.push_back(UnmergeInsertSrc.getReg(i));
+ }
+ } else {
+ DstElts.push_back(InsertSrc);
+ ++Idx;
+ }
+
+ // Remaining elements from Src after insert
+ for (; Idx < DstTy.getNumElements(); ++Idx) {
+ DstElts.push_back(UnmergeSrc.getReg(Idx));
+ }
+
+ MIRBuilder.buildMergeLikeInstr(Dst, DstElts);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ }
+
+ if (InsertTy.isVector() ||
+ (DstTy.isVector() && DstTy.getElementType() != InsertTy))
+ return UnableToLegalize;
+
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ if ((DstTy.isPointer() &&
+ DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
+ (InsertTy.isPointer() &&
+ DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
+ LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
+ return UnableToLegalize;
+ }
+
+ LLT IntDstTy = DstTy;
+
+ if (!DstTy.isScalar()) {
+ IntDstTy = LLT::scalar(DstTy.getSizeInBits());
+ Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
+ }
+
+ if (!InsertTy.isScalar()) {
+ const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
+ InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
+ }
+
+ Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
+ if (Offset != 0) {
+ auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
+ ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
+ }
+
+ APInt MaskVal = APInt::getBitsSetWithWrap(
+ DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
+
+ auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
+ auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
+ auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
+
+ MIRBuilder.buildCast(Dst, Or);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
+ auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
+ MI.getFirst4RegLLTs();
+ const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
+
+ LLT Ty = Dst0Ty;
+ LLT BoolTy = Dst1Ty;
+
+ if (IsAdd)
+ MIRBuilder.buildAdd(Dst0, LHS, RHS);
+ else
+ MIRBuilder.buildSub(Dst0, LHS, RHS);
+
+ // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
+
+ auto Zero = MIRBuilder.buildConstant(Ty, 0);
+
+ // For an addition, the result should be less than one of the operands (LHS)
+ // if and only if the other operand (RHS) is negative, otherwise there will
+ // be overflow.
+ // For a subtraction, the result should be less than one of the operands
+ // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
+ // otherwise there will be overflow.
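+  // E.g. for i8 addition 100 + 50: the result wraps to -106, which is less
+  // than LHS while RHS is non-negative, so the XOR of the two compares
+  // reports overflow; 10 + 50 = 60 trips neither compare and reports none.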
+ auto ResultLowerThanLHS =
+ MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
+ auto ConditionRHS = MIRBuilder.buildICmp(
+ IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
+
+ MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
+ auto [Res, LHS, RHS] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Res);
+ bool IsSigned;
+ bool IsAdd;
+ unsigned BaseOp;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected addsat/subsat opcode");
+ case TargetOpcode::G_UADDSAT:
+ IsSigned = false;
+ IsAdd = true;
+ BaseOp = TargetOpcode::G_ADD;
+ break;
+ case TargetOpcode::G_SADDSAT:
+ IsSigned = true;
+ IsAdd = true;
+ BaseOp = TargetOpcode::G_ADD;
+ break;
+ case TargetOpcode::G_USUBSAT:
+ IsSigned = false;
+ IsAdd = false;
+ BaseOp = TargetOpcode::G_SUB;
+ break;
+ case TargetOpcode::G_SSUBSAT:
+ IsSigned = true;
+ IsAdd = false;
+ BaseOp = TargetOpcode::G_SUB;
+ break;
+ }
+
+ if (IsSigned) {
+ // sadd.sat(a, b) ->
+ // hi = 0x7fffffff - smax(a, 0)
+ // lo = 0x80000000 - smin(a, 0)
+ // a + smin(smax(lo, b), hi)
+ // ssub.sat(a, b) ->
+ // lo = smax(a, -1) - 0x7fffffff
+ // hi = smin(a, -1) - 0x80000000
+ // a - smin(smax(lo, b), hi)
+ // TODO: AMDGPU can use a "median of 3" instruction here:
+ // a +/- med3(lo, b, hi)
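+    // E.g. i8 sadd.sat(100, 100): hi = 127 - 100 = 27, lo = -128 - 0 = -128,
+    // b clamps to 27, and 100 + 27 = 127, the saturated maximum.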
+ uint64_t NumBits = Ty.getScalarSizeInBits();
+ auto MaxVal =
+ MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
+ auto MinVal =
+ MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
+ MachineInstrBuilder Hi, Lo;
+ if (IsAdd) {
+ auto Zero = MIRBuilder.buildConstant(Ty, 0);
+ Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
+ Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
+ } else {
+ auto NegOne = MIRBuilder.buildConstant(Ty, -1);
+ Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
+ MaxVal);
+ Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
+ MinVal);
+ }
+ auto RHSClamped =
+ MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
+ MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
+ } else {
+ // uadd.sat(a, b) -> a + umin(~a, b)
+ // usub.sat(a, b) -> a - umin(a, b)
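+    // E.g. u8 uadd.sat(200, 100): ~200 = 55, umin(55, 100) = 55, and
+    // 200 + 55 = 255 saturates exactly at the type maximum.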
+ Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
+ auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
+ MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
+ auto [Res, LHS, RHS] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Res);
+ LLT BoolTy = Ty.changeElementSize(1);
+ bool IsSigned;
+ bool IsAdd;
+ unsigned OverflowOp;
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected addsat/subsat opcode");
+ case TargetOpcode::G_UADDSAT:
+ IsSigned = false;
+ IsAdd = true;
+ OverflowOp = TargetOpcode::G_UADDO;
+ break;
+ case TargetOpcode::G_SADDSAT:
+ IsSigned = true;
+ IsAdd = true;
+ OverflowOp = TargetOpcode::G_SADDO;
+ break;
+ case TargetOpcode::G_USUBSAT:
+ IsSigned = false;
+ IsAdd = false;
+ OverflowOp = TargetOpcode::G_USUBO;
+ break;
+ case TargetOpcode::G_SSUBSAT:
+ IsSigned = true;
+ IsAdd = false;
+ OverflowOp = TargetOpcode::G_SSUBO;
+ break;
+ }
+
+ auto OverflowRes =
+ MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
+ Register Tmp = OverflowRes.getReg(0);
+ Register Ov = OverflowRes.getReg(1);
+ MachineInstrBuilder Clamp;
+ if (IsSigned) {
+ // sadd.sat(a, b) ->
+ // {tmp, ov} = saddo(a, b)
+    //   ov ? (tmp >>s 31) + 0x80000000 : tmp
+ // ssub.sat(a, b) ->
+ // {tmp, ov} = ssubo(a, b)
+    //   ov ? (tmp >>s 31) + 0x80000000 : tmp
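+    // E.g. i32 sadd.sat(INT_MAX, 1): saddo wraps tmp to INT_MIN and sets ov,
+    // so the clamp (tmp >>s 31) + 0x80000000 = -1 + INT_MIN = INT_MAX is
+    // selected, which is the correctly saturated result.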
+ uint64_t NumBits = Ty.getScalarSizeInBits();
+ auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
+ auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
+ auto MinVal =
+ MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
+ Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
+ } else {
+ // uadd.sat(a, b) ->
+ // {tmp, ov} = uaddo(a, b)
+ // ov ? 0xffffffff : tmp
+ // usub.sat(a, b) ->
+ // {tmp, ov} = usubo(a, b)
+ // ov ? 0 : tmp
+ Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
+ }
+ MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerShlSat(MachineInstr &MI) {
+ assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
+ MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
+ "Expected shlsat opcode!");
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
+ auto [Res, LHS, RHS] = MI.getFirst3Regs();
+ LLT Ty = MRI.getType(Res);
+ LLT BoolTy = Ty.changeElementSize(1);
+
+ unsigned BW = Ty.getScalarSizeInBits();
+ auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
+ auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
+ : MIRBuilder.buildLShr(Ty, Result, RHS);
+
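+  // Shifting the result back down reproduces LHS exactly when no bits were
+  // lost; if it differs, the shift overflowed and the saturated value is
+  // selected below.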
+ MachineInstrBuilder SatVal;
+ if (IsSigned) {
+ auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
+ auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
+ auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
+ MIRBuilder.buildConstant(Ty, 0));
+ SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
+ } else {
+ SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
+ }
+ auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
+ MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
+ auto [Dst, Src] = MI.getFirst2Regs();
+ const LLT Ty = MRI.getType(Src);
+ unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
+ unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
+
+ // Swap most and least significant byte, set remaining bytes in Res to zero.
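+  // E.g. for an s32 0xAABBCCDD this first step produces 0xDD0000AA; the loop
+  // below then moves the 0xBB and 0xCC bytes into place to give 0xDDCCBBAA.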
+ auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
+ auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
+ auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
+ auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
+
+ // Set i-th high/low byte in Res to i-th low/high byte from Src.
+ for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
+ // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
+ APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
+ auto Mask = MIRBuilder.buildConstant(Ty, APMask);
+ auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
+ // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
+ auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
+ auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
+ Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
+ // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
+ auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
+ auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
+ Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
+ }
+ Res.getInstr()->getOperand(0).setReg(Dst);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+// { (Src & Mask) >> N } | { (Src << N) & Mask }
+static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
+ MachineInstrBuilder Src, APInt Mask) {
+ const LLT Ty = Dst.getLLTTy(*B.getMRI());
+ MachineInstrBuilder C_N = B.buildConstant(Ty, N);
+ MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
+ auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
+ auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
+ return B.buildOr(Dst, LHS, RHS);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
+ auto [Dst, Src] = MI.getFirst2Regs();
+ const LLT Ty = MRI.getType(Src);
+ unsigned Size = Ty.getSizeInBits();
+
+ MachineInstrBuilder BSWAP =
+ MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
+
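+  // G_BSWAP reverses the byte order; the three SwapN steps below then reverse
+  // the bit order within each byte, giving a full bit reversal overall.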
+ // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
+ // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
+ // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
+ MachineInstrBuilder Swap4 =
+ SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
+
+ // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
+  //    [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
+  // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
+ MachineInstrBuilder Swap2 =
+ SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
+
+ // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
+  //    [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
+  // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
+ SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
+ MachineFunction &MF = MIRBuilder.getMF();
+
+ bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
+ int NameOpIdx = IsRead ? 1 : 0;
+ int ValRegIndex = IsRead ? 0 : 1;
+
+ Register ValReg = MI.getOperand(ValRegIndex).getReg();
+ const LLT Ty = MRI.getType(ValReg);
+ const MDString *RegStr = cast<MDString>(
+ cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
+
+ Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
+ if (!PhysReg.isValid())
+ return UnableToLegalize;
+
+ if (IsRead)
+ MIRBuilder.buildCopy(ValReg, PhysReg);
+ else
+ MIRBuilder.buildCopy(PhysReg, ValReg);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
+ unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+ Register Result = MI.getOperand(0).getReg();
+ LLT OrigTy = MRI.getType(Result);
+ auto SizeInBits = OrigTy.getScalarSizeInBits();
+ LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
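+  // E.g. for s32 operands: sign- or zero-extend to s64, multiply, shift the
+  // product right by 32 and truncate to recover the high half of the result.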
+
+ auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
+ auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
+ auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
+ unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
+
+ auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
+ auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
+ MIRBuilder.buildTrunc(Result, Shifted);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
+ auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
+ FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());
+
+ if (Mask == fcNone) {
+ MIRBuilder.buildConstant(DstReg, 0);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ if (Mask == fcAllFlags) {
+ MIRBuilder.buildConstant(DstReg, 1);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
+ // version
+
+ unsigned BitSize = SrcTy.getScalarSizeInBits();
+ const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
+
+ LLT IntTy = LLT::scalar(BitSize);
+ if (SrcTy.isVector())
+ IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
+ auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);
+
+ // Various masks.
+ APInt SignBit = APInt::getSignMask(BitSize);
+ APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign.
+ APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
+ APInt ExpMask = Inf;
+ APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
+ APInt QNaNBitMask =
+ APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
+ APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());
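+  // For f32 these work out to: SignBit = 0x80000000, Inf = ExpMask =
+  // 0x7f800000, AllOneMantissa = 0x007fffff and QNaNBitMask = 0x00400000.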
+
+ auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
+ auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
+ auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
+ auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
+ auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);
+
+ auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
+ auto Sign =
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);
+
+ auto Res = MIRBuilder.buildConstant(DstTy, 0);
+ // Clang doesn't support capture of structured bindings:
+ LLT DstTyCopy = DstTy;
+ const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
+ Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
+ };
+
+ // Tests that involve more than one class should be processed first.
+ if ((Mask & fcFinite) == fcFinite) {
+ // finite(V) ==> abs(V) u< exp_mask
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
+ ExpMaskC));
+ Mask &= ~fcFinite;
+ } else if ((Mask & fcFinite) == fcPosFinite) {
+ // finite(V) && V > 0 ==> V u< exp_mask
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
+ ExpMaskC));
+ Mask &= ~fcPosFinite;
+ } else if ((Mask & fcFinite) == fcNegFinite) {
+ // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
+ auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
+ ExpMaskC);
+ auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
+ appendToRes(And);
+ Mask &= ~fcNegFinite;
+ }
+
+ if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
+ // fcZero | fcSubnormal => test all exponent bits are 0
+ // TODO: Handle sign bit specific cases
+ // TODO: Handle inverted case
+ if (PartialCheck == (fcZero | fcSubnormal)) {
+ auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ ExpBits, ZeroC));
+ Mask &= ~PartialCheck;
+ }
+ }
+
+ // Check for individual classes.
+ if (FPClassTest PartialCheck = Mask & fcZero) {
+ if (PartialCheck == fcPosZero)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, ZeroC));
+ else if (PartialCheck == fcZero)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
+ else // fcNegZero
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, SignBitC));
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcSubnormal) {
+ // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
+ // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
+ auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
+ auto OneC = MIRBuilder.buildConstant(IntTy, 1);
+ auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
+ auto SubnormalRes =
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
+ MIRBuilder.buildConstant(IntTy, AllOneMantissa));
+ if (PartialCheck == fcNegSubnormal)
+ SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
+ appendToRes(SubnormalRes);
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcInf) {
+ if (PartialCheck == fcPosInf)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, InfC));
+ else if (PartialCheck == fcInf)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
+ else { // fcNegInf
+ APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
+ auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
+ AsInt, NegInfC));
+ }
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcNan) {
+ auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
+ if (PartialCheck == fcNan) {
+ // isnan(V) ==> abs(V) u> int(inf)
+ appendToRes(
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
+ } else if (PartialCheck == fcQNan) {
+ // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
+ appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
+ InfWithQnanBitC));
+ } else { // fcSNan
+ // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
+ // abs(V) u< (unsigned(Inf) | quiet_bit)
+ auto IsNan =
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
+ auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
+ Abs, InfWithQnanBitC);
+ appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
+ }
+ }
+
+ if (FPClassTest PartialCheck = Mask & fcNormal) {
+ // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
+ // (max_exp-1))
+ APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
+ auto ExpMinusOne = MIRBuilder.buildSub(
+ IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
+ APInt MaxExpMinusOne = ExpMask - ExpLSB;
+ auto NormalRes =
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
+ MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
+ if (PartialCheck == fcNegNormal)
+ NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
+ else if (PartialCheck == fcPosNormal) {
+ auto PosSign = MIRBuilder.buildXor(
+ DstTy, Sign, MIRBuilder.buildConstant(DstTy, InvertionMask));
+ NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
+ }
+ appendToRes(NormalRes);
+ }
+
+ MIRBuilder.buildCopy(DstReg, Res);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
+ // Implement vector G_SELECT in terms of XOR, AND, OR.
+ auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
+ MI.getFirst4RegLLTs();
+ if (!DstTy.isVector())
+ return UnableToLegalize;
+
+ bool IsEltPtr = DstTy.getElementType().isPointer();
+ if (IsEltPtr) {
+ LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
+ LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
+ Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
+ Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
+ DstTy = NewTy;
+ }
+
+ if (MaskTy.isScalar()) {
+ // Turn the scalar condition into a vector condition mask.
+
+ Register MaskElt = MaskReg;
+
+ // The condition was potentially zero extended before, but we want a sign
+ // extended boolean.
+ if (MaskTy != LLT::scalar(1))
+ MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);
+
+ // Continue the sign extension (or truncate) to match the data type.
+ MaskElt = MIRBuilder.buildSExtOrTrunc(DstTy.getElementType(),
+ MaskElt).getReg(0);
+
+ // Generate a vector splat idiom.
+ auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
+ MaskReg = ShufSplat.getReg(0);
+ MaskTy = DstTy;
+ }
+
+ if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
+ return UnableToLegalize;
+ }
+
+ auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
+ auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
+ auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
+ if (IsEltPtr) {
+ auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2);
+ MIRBuilder.buildIntToPtr(DstReg, Or);
+ } else {
+ MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
+ }
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
+ // Split DIVREM into individual instructions.
+ unsigned Opcode = MI.getOpcode();
+
+ MIRBuilder.buildInstr(
+ Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
+ : TargetOpcode::G_UDIV,
+ {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
+ MIRBuilder.buildInstr(
+ Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
+ : TargetOpcode::G_UREM,
+ {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
+ // Expand %res = G_ABS %a into:
+ // %v1 = G_ASHR %a, scalar_size-1
+ // %v2 = G_ADD %a, %v1
+ // %res = G_XOR %v2, %v1
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ Register OpReg = MI.getOperand(1).getReg();
+ auto ShiftAmt =
+ MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
+ auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
+ auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
+ MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
+ // Expand %res = G_ABS %a into:
+ // %v1 = G_CONSTANT 0
+ // %v2 = G_SUB %v1, %a
+ // %res = G_SMAX %a, %v2
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(SrcReg);
+ auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
+ auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
+ MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ // The source could be a scalar if the IR type was <1 x sN>.
+ if (SrcTy.isScalar()) {
+ if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
+ return UnableToLegalize; // FIXME: handle extension.
+ // This can be just a plain copy.
+ Observer.changingInstr(MI);
+ MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ return UnableToLegalize;
+}
+
+static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
+ // On Darwin, -Os means optimize for size without hurting performance, so
+ // only really optimize for size when -Oz (MinSize) is used.
+ if (MF.getTarget().getTargetTriple().isOSDarwin())
+ return MF.getFunction().hasMinSize();
+ return MF.getFunction().hasOptSize();
+}
+
+// Returns a list of types to use for memory op lowering in MemOps. A partial
+// port of findOptimalMemOpLowering in TargetLowering.
+static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
+ unsigned Limit, const MemOp &Op,
+ unsigned DstAS, unsigned SrcAS,
+ const AttributeList &FuncAttributes,
+ const TargetLowering &TLI) {
+ if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
+ return false;
+
+ LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
+
+ if (Ty == LLT()) {
+ // Use the largest scalar type whose alignment constraints are satisfied.
+ // We only need to check DstAlign here as SrcAlign is always greater or
+ // equal to DstAlign (or zero).
+ Ty = LLT::scalar(64);
+ if (Op.isFixedDstAlign())
+ while (Op.getDstAlign() < Ty.getSizeInBytes() &&
+ !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
+ Ty = LLT::scalar(Ty.getSizeInBytes());
+ assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
+ // FIXME: check for the largest legal type we can load/store to.
+ }
+
+ unsigned NumMemOps = 0;
+ uint64_t Size = Op.size();
+ while (Size) {
+ unsigned TySize = Ty.getSizeInBytes();
+ while (TySize > Size) {
+      // For now, only use non-vector loads / stores for the left-over pieces.
+ LLT NewTy = Ty;
+ // FIXME: check for mem op safety and legality of the types. Not all of
+ // SDAGisms map cleanly to GISel concepts.
+ if (NewTy.isVector())
+ NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
+ NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
+ unsigned NewTySize = NewTy.getSizeInBytes();
+ assert(NewTySize > 0 && "Could not find appropriate type");
+
+ // If the new LLT cannot cover all of the remaining bits, then consider
+ // issuing a (or a pair of) unaligned and overlapping load / store.
+ unsigned Fast;
+ // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
+ MVT VT = getMVTForLLT(Ty);
+ if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
+ TLI.allowsMisalignedMemoryAccesses(
+ VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
+ MachineMemOperand::MONone, &Fast) &&
+ Fast)
+ TySize = Size;
+ else {
+ Ty = NewTy;
+ TySize = NewTySize;
+ }
+ }
+
+ if (++NumMemOps > Limit)
+ return false;
+
+ MemOps.push_back(Ty);
+ Size -= TySize;
+ }
+
+ return true;
+}
+
+static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
+ if (Ty.isVector())
+ return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
+ Ty.getNumElements());
+ return IntegerType::get(C, Ty.getSizeInBits());
+}
+
+// Get a vectorized representation of the memset value operand, GISel edition.
+static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ unsigned NumBits = Ty.getScalarSizeInBits();
+ auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
+ if (!Ty.isVector() && ValVRegAndVal) {
+ APInt Scalar = ValVRegAndVal->Value.trunc(8);
+ APInt SplatVal = APInt::getSplat(NumBits, Scalar);
+ return MIB.buildConstant(Ty, SplatVal).getReg(0);
+ }
+
+ // Extend the byte value to the larger type, and then multiply by a magic
+ // value 0x010101... in order to replicate it across every byte.
+ // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
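+  // E.g. for an s32 result, a byte value of 0xAB is zero-extended and
+  // multiplied by 0x01010101, giving 0xABABABAB.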
+ if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
+ return MIB.buildConstant(Ty, 0).getReg(0);
+ }
+
+ LLT ExtType = Ty.getScalarType();
+ auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
+ if (NumBits > 8) {
+ APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
+ auto MagicMI = MIB.buildConstant(ExtType, Magic);
+ Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
+ }
+
+ // For vector types create a G_BUILD_VECTOR.
+ if (Ty.isVector())
+ Val = MIB.buildSplatVector(Ty, Val).getReg(0);
+
+ return Val;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
+ uint64_t KnownLen, Align Alignment,
+ bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memset length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+
+ auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
+ bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
+
+ if (!findGISelOptimalMemOpLowering(MemOps, Limit,
+ MemOp::Set(KnownLen, DstAlignCanChange,
+ Alignment,
+ /*IsZeroMemset=*/IsZeroVal,
+ /*IsVolatile=*/IsVolatile),
+ DstPtrInfo.getAddrSpace(), ~0u,
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ MachineIRBuilder MIB(MI);
+ // Find the largest store and generate the bit pattern for it.
+ LLT LargestTy = MemOps[0];
+ for (unsigned i = 1; i < MemOps.size(); i++)
+ if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
+ LargestTy = MemOps[i];
+
+ // The memset stored value is always defined as an s8, so in order to make it
+ // work with larger store types we need to repeat the bit pattern across the
+ // wider type.
+ Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
+
+ if (!MemSetValue)
+ return UnableToLegalize;
+
+ // Generate the stores. For each store type in the list, we generate the
+ // matching store of that type to the destination address.
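+  // E.g. an 11-byte memset lowered to {s64, s16, s8} emits stores at offsets
+  // 0, 8 and 10; an overlapping tail type instead has its offset backed up so
+  // that it still ends at the last byte.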
+ LLT PtrTy = MRI.getType(Dst);
+ unsigned DstOff = 0;
+ unsigned Size = KnownLen;
+ for (unsigned I = 0; I < MemOps.size(); I++) {
+ LLT Ty = MemOps[I];
+ unsigned TySize = Ty.getSizeInBytes();
+ if (TySize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ assert(I == MemOps.size() - 1 && I != 0);
+ DstOff -= TySize - Size;
+ }
+
+    // If this store is smaller than the largest store, see whether we can get
+    // the smaller value for free with a truncate.
+ Register Value = MemSetValue;
+ if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
+ MVT VT = getMVTForLLT(Ty);
+ MVT LargestVT = getMVTForLLT(LargestTy);
+ if (!LargestTy.isVector() && !Ty.isVector() &&
+ TLI.isTruncateFree(LargestVT, VT))
+ Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
+ else
+ Value = getMemsetValue(Val, Ty, MIB);
+ if (!Value)
+ return UnableToLegalize;
+ }
+
+ auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
+
+ Register Ptr = Dst;
+ if (DstOff != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
+ Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
+ }
+
+ MIB.buildStore(Value, Ptr, *StoreMMO);
+ DstOff += Ty.getSizeInBytes();
+ Size -= TySize;
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
+
+ auto [Dst, Src, Len] = MI.getFirst3Regs();
+
+ const auto *MMOIt = MI.memoperands_begin();
+ const MachineMemOperand *MemOp = *MMOIt;
+ bool IsVolatile = MemOp->isVolatile();
+
+ // See if this is a constant length copy
+ auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
+ // FIXME: support dynamically sized G_MEMCPY_INLINE
+ assert(LenVRegAndVal &&
+ "inline memcpy with dynamic size is not yet supported");
+ uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
+ if (KnownLen == 0) {
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ Align DstAlign = DstMMO.getBaseAlign();
+ Align SrcAlign = SrcMMO.getBaseAlign();
+
+ return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
+ IsVolatile);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign,
+ Align SrcAlign, bool IsVolatile) {
+ assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
+ return lowerMemcpy(MI, Dst, Src, KnownLen,
+ std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
+ IsVolatile);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, uint64_t Limit, Align DstAlign,
+ Align SrcAlign, bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memcpy length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ Align Alignment = std::min(DstAlign, SrcAlign);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ // FIXME: infer better src pointer alignment like SelectionDAG does here.
+ // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
+ // if the memcpy is in a tail call position.
+
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+ MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
+
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit,
+ MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
+ IsVolatile),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign.previous();
+
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
+
+ MachineIRBuilder MIB(MI);
+  // Now we need to emit a load/store pair for each of the types we've
+  // collected. I.e. for each type, generate a load of that width from the
+  // source pointer, and then generate a corresponding store of the loaded
+  // value to the dest buffer. This can result in a sequence of loads and
+  // stores of mixed types, depending on what the target specifies as good
+  // types to use.
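+  // E.g. a 12-byte copy lowered to {s64, s32} becomes an s64 load/store at
+  // offset 0 followed by an s32 load/store at offset 8.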
+ unsigned CurrOffset = 0;
+ unsigned Size = KnownLen;
+ for (auto CopyTy : MemOps) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ if (CopyTy.getSizeInBytes() > Size)
+ CurrOffset -= CopyTy.getSizeInBytes() - Size;
+
+ // Construct MMOs for the accesses.
+ auto *LoadMMO =
+ MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ // Create the load.
+ Register LoadPtr = Src;
+ Register Offset;
+ if (CurrOffset != 0) {
+ LLT SrcTy = MRI.getType(Src);
+ Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
+ .getReg(0);
+ LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
+ }
+ auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
+
+ // Create the store.
+ Register StorePtr = Dst;
+ if (CurrOffset != 0) {
+ LLT DstTy = MRI.getType(Dst);
+ StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
+ }
+ MIB.buildStore(LdVal, StorePtr, *StoreMMO);
+ CurrOffset += CopyTy.getSizeInBytes();
+ Size -= CopyTy.getSizeInBytes();
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
+ uint64_t KnownLen, Align DstAlign, Align SrcAlign,
+ bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memmove length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ Align Alignment = std::min(DstAlign, SrcAlign);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+ MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
+
+ // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
+  // to a bug in its findOptimalMemOpLowering implementation. For now do the
+ // same thing here.
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit,
+ MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
+ /*IsVolatile*/ true),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes(), TLI))
+ return UnableToLegalize;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign.previous();
+
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
+
+ MachineIRBuilder MIB(MI);
+  // Memmove requires that we perform all the loads before issuing any of the
+  // stores, since the source and destination ranges may overlap.
+  // Apart from that, this loop is doing much the same thing as the memcpy
+  // codegen function.
+ unsigned CurrOffset = 0;
+ SmallVector<Register, 16> LoadVals;
+ for (auto CopyTy : MemOps) {
+ // Construct MMO for the load.
+ auto *LoadMMO =
+ MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ // Create the load.
+ Register LoadPtr = Src;
+ if (CurrOffset != 0) {
+ LLT SrcTy = MRI.getType(Src);
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
+ LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
+ }
+ LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
+ CurrOffset += CopyTy.getSizeInBytes();
+ }
+
+ CurrOffset = 0;
+ for (unsigned I = 0; I < MemOps.size(); ++I) {
+ LLT CopyTy = MemOps[I];
+ // Now store the values loaded.
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ Register StorePtr = Dst;
+ if (CurrOffset != 0) {
+ LLT DstTy = MRI.getType(Dst);
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
+ StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
+ }
+ MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
+ CurrOffset += CopyTy.getSizeInBytes();
+ }
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
+ const unsigned Opc = MI.getOpcode();
+ // This combine is fairly complex so it's not written with a separate
+ // matcher function.
+ assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
+ Opc == TargetOpcode::G_MEMSET) &&
+ "Expected memcpy like instruction");
+
+ auto MMOIt = MI.memoperands_begin();
+ const MachineMemOperand *MemOp = *MMOIt;
+
+ Align DstAlign = MemOp->getBaseAlign();
+ Align SrcAlign;
+ auto [Dst, Src, Len] = MI.getFirst3Regs();
+
+ if (Opc != TargetOpcode::G_MEMSET) {
+ assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
+ MemOp = *(++MMOIt);
+ SrcAlign = MemOp->getBaseAlign();
+ }
+
+ // See if this is a constant length copy
+ auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
+ if (!LenVRegAndVal)
+ return UnableToLegalize;
+ uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
+
+ if (KnownLen == 0) {
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ bool IsVolatile = MemOp->isVolatile();
+ if (Opc == TargetOpcode::G_MEMCPY_INLINE)
+ return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
+ IsVolatile);
+
+ // Don't try to optimize volatile.
+ if (IsVolatile)
+ return UnableToLegalize;
+
+ if (MaxLen && KnownLen > MaxLen)
+ return UnableToLegalize;
+
+ if (Opc == TargetOpcode::G_MEMCPY) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
+ return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
+ IsVolatile);
+ }
+ if (Opc == TargetOpcode::G_MEMMOVE)
+ return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
+ if (Opc == TargetOpcode::G_MEMSET)
+ return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
+ return UnableToLegalize;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
new file mode 100644
index 000000000000..1f2e481c63e0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -0,0 +1,435 @@
+//===- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement an interface to specify and query how an illegal operation on a
+// given type should be expanded.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+
+using namespace llvm;
+using namespace LegalizeActions;
+
+#define DEBUG_TYPE "legalizer-info"
+
+cl::opt<bool> llvm::DisableGISelLegalityCheck(
+ "disable-gisel-legality-check",
+ cl::desc("Don't verify that MIR is fully legal between GlobalISel passes"),
+ cl::Hidden);
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, LegalizeAction Action) {
+ switch (Action) {
+ case Legal:
+ OS << "Legal";
+ break;
+ case NarrowScalar:
+ OS << "NarrowScalar";
+ break;
+ case WidenScalar:
+ OS << "WidenScalar";
+ break;
+ case FewerElements:
+ OS << "FewerElements";
+ break;
+ case MoreElements:
+ OS << "MoreElements";
+ break;
+ case Bitcast:
+ OS << "Bitcast";
+ break;
+ case Lower:
+ OS << "Lower";
+ break;
+ case Libcall:
+ OS << "Libcall";
+ break;
+ case Custom:
+ OS << "Custom";
+ break;
+ case Unsupported:
+ OS << "Unsupported";
+ break;
+ case NotFound:
+ OS << "NotFound";
+ break;
+ case UseLegacyRules:
+ OS << "UseLegacyRules";
+ break;
+ }
+ return OS;
+}
+
+raw_ostream &LegalityQuery::print(raw_ostream &OS) const {
+ OS << Opcode << ", Tys={";
+ for (const auto &Type : Types) {
+ OS << Type << ", ";
+ }
+ OS << "}, Opcode=";
+
+ OS << Opcode << ", MMOs={";
+ for (const auto &MMODescr : MMODescrs) {
+ OS << MMODescr.MemoryTy << ", ";
+ }
+ OS << "}";
+
+ return OS;
+}
+
+#ifndef NDEBUG
+// Make sure the rule won't (trivially) loop forever.
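+// A rule that mutates a type index to the very same type it matched would
+// send the instruction through the legalizer again with an identical query,
+// so such mutations are rejected for actions that are expected to change the
+// type.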
+static bool hasNoSimpleLoops(const LegalizeRule &Rule, const LegalityQuery &Q,
+ const std::pair<unsigned, LLT> &Mutation) {
+ switch (Rule.getAction()) {
+ case Legal:
+ case Custom:
+ case Lower:
+ case MoreElements:
+ case FewerElements:
+ break;
+ default:
+ return Q.Types[Mutation.first] != Mutation.second;
+ }
+ return true;
+}
+
+// Make sure the returned mutation makes sense for the match type.
+static bool mutationIsSane(const LegalizeRule &Rule,
+ const LegalityQuery &Q,
+ std::pair<unsigned, LLT> Mutation) {
+ // If the user wants a custom mutation, then we can't really say much about
+ // it. Return true, and trust that they're doing the right thing.
+ if (Rule.getAction() == Custom || Rule.getAction() == Legal)
+ return true;
+
+ const unsigned TypeIdx = Mutation.first;
+ const LLT OldTy = Q.Types[TypeIdx];
+ const LLT NewTy = Mutation.second;
+
+ switch (Rule.getAction()) {
+ case FewerElements:
+ if (!OldTy.isVector())
+ return false;
+ [[fallthrough]];
+ case MoreElements: {
+ // MoreElements can go from scalar to vector.
+ const ElementCount OldElts = OldTy.isVector() ?
+ OldTy.getElementCount() : ElementCount::getFixed(1);
+ if (NewTy.isVector()) {
+ if (Rule.getAction() == FewerElements) {
+ // Make sure the element count really decreased.
+ if (ElementCount::isKnownGE(NewTy.getElementCount(), OldElts))
+ return false;
+ } else {
+ // Make sure the element count really increased.
+ if (ElementCount::isKnownLE(NewTy.getElementCount(), OldElts))
+ return false;
+ }
+ } else if (Rule.getAction() == MoreElements)
+ return false;
+
+ // Make sure the element type didn't change.
+ return NewTy.getScalarType() == OldTy.getScalarType();
+ }
+ case NarrowScalar:
+ case WidenScalar: {
+ if (OldTy.isVector()) {
+ // Number of elements should not change.
+ if (!NewTy.isVector() || OldTy.getNumElements() != NewTy.getNumElements())
+ return false;
+ } else {
+      // Both types must be scalars.
+ if (NewTy.isVector())
+ return false;
+ }
+
+ if (Rule.getAction() == NarrowScalar) {
+ // Make sure the size really decreased.
+ if (NewTy.getScalarSizeInBits() >= OldTy.getScalarSizeInBits())
+ return false;
+ } else {
+ // Make sure the size really increased.
+ if (NewTy.getScalarSizeInBits() <= OldTy.getScalarSizeInBits())
+ return false;
+ }
+
+ return true;
+ }
+ case Bitcast: {
+ return OldTy != NewTy && OldTy.getSizeInBits() == NewTy.getSizeInBits();
+ }
+ default:
+ return true;
+ }
+}
+#endif
+
+LegalizeActionStep LegalizeRuleSet::apply(const LegalityQuery &Query) const {
+ LLVM_DEBUG(dbgs() << "Applying legalizer ruleset to: "; Query.print(dbgs());
+ dbgs() << "\n");
+ if (Rules.empty()) {
+ LLVM_DEBUG(dbgs() << ".. fallback to legacy rules (no rules defined)\n");
+ return {LegalizeAction::UseLegacyRules, 0, LLT{}};
+ }
+ for (const LegalizeRule &Rule : Rules) {
+ if (Rule.match(Query)) {
+ LLVM_DEBUG(dbgs() << ".. match\n");
+ std::pair<unsigned, LLT> Mutation = Rule.determineMutation(Query);
+ LLVM_DEBUG(dbgs() << ".. .. " << Rule.getAction() << ", "
+ << Mutation.first << ", " << Mutation.second << "\n");
+ assert(mutationIsSane(Rule, Query, Mutation) &&
+ "legality mutation invalid for match");
+ assert(hasNoSimpleLoops(Rule, Query, Mutation) && "Simple loop detected");
+ return {Rule.getAction(), Mutation.first, Mutation.second};
+ } else
+ LLVM_DEBUG(dbgs() << ".. no match\n");
+ }
+ LLVM_DEBUG(dbgs() << ".. unsupported\n");
+ return {LegalizeAction::Unsupported, 0, LLT{}};
+}
+
+bool LegalizeRuleSet::verifyTypeIdxsCoverage(unsigned NumTypeIdxs) const {
+#ifndef NDEBUG
+ if (Rules.empty()) {
+ LLVM_DEBUG(
+ dbgs() << ".. type index coverage check SKIPPED: no rules defined\n");
+ return true;
+ }
+ const int64_t FirstUncovered = TypeIdxsCovered.find_first_unset();
+ if (FirstUncovered < 0) {
+ LLVM_DEBUG(dbgs() << ".. type index coverage check SKIPPED:"
+ " user-defined predicate detected\n");
+ return true;
+ }
+ const bool AllCovered = (FirstUncovered >= NumTypeIdxs);
+ if (NumTypeIdxs > 0)
+ LLVM_DEBUG(dbgs() << ".. the first uncovered type index: " << FirstUncovered
+ << ", " << (AllCovered ? "OK" : "FAIL") << "\n");
+ return AllCovered;
+#else
+ return true;
+#endif
+}
+
+bool LegalizeRuleSet::verifyImmIdxsCoverage(unsigned NumImmIdxs) const {
+#ifndef NDEBUG
+ if (Rules.empty()) {
+ LLVM_DEBUG(
+ dbgs() << ".. imm index coverage check SKIPPED: no rules defined\n");
+ return true;
+ }
+ const int64_t FirstUncovered = ImmIdxsCovered.find_first_unset();
+ if (FirstUncovered < 0) {
+ LLVM_DEBUG(dbgs() << ".. imm index coverage check SKIPPED:"
+ " user-defined predicate detected\n");
+ return true;
+ }
+ const bool AllCovered = (FirstUncovered >= NumImmIdxs);
+ LLVM_DEBUG(dbgs() << ".. the first uncovered imm index: " << FirstUncovered
+ << ", " << (AllCovered ? "OK" : "FAIL") << "\n");
+ return AllCovered;
+#else
+ return true;
+#endif
+}
+
+/// Helper function to get LLT for the given type index.
+static LLT getTypeFromTypeIdx(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI, unsigned OpIdx,
+ unsigned TypeIdx) {
+ assert(TypeIdx < MI.getNumOperands() && "Unexpected TypeIdx");
+  // G_UNMERGE_VALUES has a variable number of operands, but there is only
+ // one source type and one destination type as all destinations must be the
+ // same type. So, get the last operand if TypeIdx == 1.
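+  // E.g. for %a:s32, %b:s32 = G_UNMERGE_VALUES %x:s64, type index 1 describes
+  // %x, which is always the trailing operand.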
+ if (MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES && TypeIdx == 1)
+ return MRI.getType(MI.getOperand(MI.getNumOperands() - 1).getReg());
+ return MRI.getType(MI.getOperand(OpIdx).getReg());
+}
+
+unsigned LegalizerInfo::getOpcodeIdxForOpcode(unsigned Opcode) const {
+ assert(Opcode >= FirstOp && Opcode <= LastOp && "Unsupported opcode");
+ return Opcode - FirstOp;
+}
+
+unsigned LegalizerInfo::getActionDefinitionsIdx(unsigned Opcode) const {
+ unsigned OpcodeIdx = getOpcodeIdxForOpcode(Opcode);
+ if (unsigned Alias = RulesForOpcode[OpcodeIdx].getAlias()) {
+ LLVM_DEBUG(dbgs() << ".. opcode " << Opcode << " is aliased to " << Alias
+ << "\n");
+ OpcodeIdx = getOpcodeIdxForOpcode(Alias);
+ assert(RulesForOpcode[OpcodeIdx].getAlias() == 0 && "Cannot chain aliases");
+ }
+
+ return OpcodeIdx;
+}
+
+const LegalizeRuleSet &
+LegalizerInfo::getActionDefinitions(unsigned Opcode) const {
+ unsigned OpcodeIdx = getActionDefinitionsIdx(Opcode);
+ return RulesForOpcode[OpcodeIdx];
+}
+
+LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder(unsigned Opcode) {
+ unsigned OpcodeIdx = getActionDefinitionsIdx(Opcode);
+ auto &Result = RulesForOpcode[OpcodeIdx];
+ assert(!Result.isAliasedByAnother() && "Modifying this opcode will modify aliases");
+ return Result;
+}
+
+LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder(
+ std::initializer_list<unsigned> Opcodes) {
+ unsigned Representative = *Opcodes.begin();
+
+ assert(Opcodes.size() >= 2 &&
+ "Initializer list must have at least two opcodes");
+
+ for (unsigned Op : llvm::drop_begin(Opcodes))
+ aliasActionDefinitions(Representative, Op);
+
+ auto &Return = getActionDefinitionsBuilder(Representative);
+ Return.setIsAliasedByAnother();
+ return Return;
+}
+
+void LegalizerInfo::aliasActionDefinitions(unsigned OpcodeTo,
+ unsigned OpcodeFrom) {
+ assert(OpcodeTo != OpcodeFrom && "Cannot alias to self");
+ assert(OpcodeTo >= FirstOp && OpcodeTo <= LastOp && "Unsupported opcode");
+ const unsigned OpcodeFromIdx = getOpcodeIdxForOpcode(OpcodeFrom);
+ RulesForOpcode[OpcodeFromIdx].aliasTo(OpcodeTo);
+}
+
+LegalizeActionStep
+LegalizerInfo::getAction(const LegalityQuery &Query) const {
+ LegalizeActionStep Step = getActionDefinitions(Query.Opcode).apply(Query);
+ if (Step.Action != LegalizeAction::UseLegacyRules) {
+ return Step;
+ }
+
+ return getLegacyLegalizerInfo().getAction(Query);
+}
+
+LegalizeActionStep
+LegalizerInfo::getAction(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const {
+ SmallVector<LLT, 8> Types;
+ SmallBitVector SeenTypes(8);
+ ArrayRef<MCOperandInfo> OpInfo = MI.getDesc().operands();
+ // FIXME: probably we'll need to cache the results here somehow?
+ for (unsigned i = 0; i < MI.getDesc().getNumOperands(); ++i) {
+ if (!OpInfo[i].isGenericType())
+ continue;
+
+ // We must only record actions once for each TypeIdx; otherwise we'd
+ // try to legalize operands multiple times down the line.
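+    // E.g. G_ADD uses type index 0 for its def and both of its uses, so only
+    // the first occurrence contributes an entry to Types.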
+ unsigned TypeIdx = OpInfo[i].getGenericTypeIndex();
+ if (SeenTypes[TypeIdx])
+ continue;
+
+ SeenTypes.set(TypeIdx);
+
+ LLT Ty = getTypeFromTypeIdx(MI, MRI, i, TypeIdx);
+ Types.push_back(Ty);
+ }
+
+ SmallVector<LegalityQuery::MemDesc, 2> MemDescrs;
+ for (const auto &MMO : MI.memoperands())
+ MemDescrs.push_back({*MMO});
+
+ return getAction({MI.getOpcode(), Types, MemDescrs});
+}
+
+bool LegalizerInfo::isLegal(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const {
+ return getAction(MI, MRI).Action == Legal;
+}
+
+bool LegalizerInfo::isLegalOrCustom(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) const {
+ auto Action = getAction(MI, MRI).Action;
+ // If the action is custom, it may not necessarily modify the instruction,
+ // so we have to assume it's legal.
+ return Action == Legal || Action == Custom;
+}
+
+unsigned LegalizerInfo::getExtOpcodeForWideningConstant(LLT SmallTy) const {
+ return SmallTy.isByteSized() ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
+}
+
+/// \pre Type indices of every opcode form a dense set starting from 0.
+void LegalizerInfo::verify(const MCInstrInfo &MII) const {
+#ifndef NDEBUG
+ std::vector<unsigned> FailedOpcodes;
+ for (unsigned Opcode = FirstOp; Opcode <= LastOp; ++Opcode) {
+ const MCInstrDesc &MCID = MII.get(Opcode);
+ const unsigned NumTypeIdxs = std::accumulate(
+ MCID.operands().begin(), MCID.operands().end(), 0U,
+ [](unsigned Acc, const MCOperandInfo &OpInfo) {
+ return OpInfo.isGenericType()
+ ? std::max(OpInfo.getGenericTypeIndex() + 1U, Acc)
+ : Acc;
+ });
+ const unsigned NumImmIdxs = std::accumulate(
+ MCID.operands().begin(), MCID.operands().end(), 0U,
+ [](unsigned Acc, const MCOperandInfo &OpInfo) {
+ return OpInfo.isGenericImm()
+ ? std::max(OpInfo.getGenericImmIndex() + 1U, Acc)
+ : Acc;
+ });
+ LLVM_DEBUG(dbgs() << MII.getName(Opcode) << " (opcode " << Opcode
+ << "): " << NumTypeIdxs << " type ind"
+ << (NumTypeIdxs == 1 ? "ex" : "ices") << ", "
+ << NumImmIdxs << " imm ind"
+ << (NumImmIdxs == 1 ? "ex" : "ices") << "\n");
+ const LegalizeRuleSet &RuleSet = getActionDefinitions(Opcode);
+ if (!RuleSet.verifyTypeIdxsCoverage(NumTypeIdxs))
+ FailedOpcodes.push_back(Opcode);
+ else if (!RuleSet.verifyImmIdxsCoverage(NumImmIdxs))
+ FailedOpcodes.push_back(Opcode);
+ }
+ if (!FailedOpcodes.empty()) {
+ errs() << "The following opcodes have ill-defined legalization rules:";
+ for (unsigned Opcode : FailedOpcodes)
+ errs() << " " << MII.getName(Opcode);
+ errs() << "\n";
+
+ report_fatal_error("ill-defined LegalizerInfo"
+ ", try -debug-only=legalizer-info for details");
+ }
+#endif
+}
+
+#ifndef NDEBUG
+// FIXME: This should be in the MachineVerifier, but it can't use the
+// LegalizerInfo as it's currently in the separate GlobalISel library.
+// Note that RegBankSelected property already checked in the verifier
+// has the same layering problem, but we only use inline methods so
+// end up not needing to link against the GlobalISel library.
+const MachineInstr *llvm::machineFunctionIsIllegal(const MachineFunction &MF) {
+ if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MachineBasicBlock &MBB : MF)
+ for (const MachineInstr &MI : MBB)
+ if (isPreISelGenericOpcode(MI.getOpcode()) &&
+ !MLI->isLegalOrCustom(MI, MRI))
+ return &MI;
+ }
+ return nullptr;
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
new file mode 100644
index 000000000000..49f40495d6fc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -0,0 +1,971 @@
+//===- LoadStoreOpt.cpp ----------- Generic memory optimizations -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the LoadStoreOpt optimization pass.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LoadStoreOpt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+
+#define DEBUG_TYPE "loadstore-opt"
+
+using namespace llvm;
+using namespace ore;
+using namespace MIPatternMatch;
+
+STATISTIC(NumStoresMerged, "Number of stores merged");
+
+const unsigned MaxStoreSizeToForm = 128;
+
+char LoadStoreOpt::ID = 0;
+INITIALIZE_PASS_BEGIN(LoadStoreOpt, DEBUG_TYPE, "Generic memory optimizations",
+ false, false)
+INITIALIZE_PASS_END(LoadStoreOpt, DEBUG_TYPE, "Generic memory optimizations",
+ false, false)
+
+LoadStoreOpt::LoadStoreOpt(std::function<bool(const MachineFunction &)> F)
+ : MachineFunctionPass(ID), DoNotRunPass(F) {}
+
+LoadStoreOpt::LoadStoreOpt()
+ : LoadStoreOpt([](const MachineFunction &) { return false; }) {}
+
+void LoadStoreOpt::init(MachineFunction &MF) {
+ this->MF = &MF;
+ MRI = &MF.getRegInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ TLI = MF.getSubtarget().getTargetLowering();
+ LI = MF.getSubtarget().getLegalizerInfo();
+ Builder.setMF(MF);
+ IsPreLegalizer = !MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Legalized);
+ InstsToErase.clear();
+}
+
+void LoadStoreOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.setPreservesAll();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+BaseIndexOffset GISelAddressing::getPointerInfo(Register Ptr,
+ MachineRegisterInfo &MRI) {
+ BaseIndexOffset Info;
+ Register PtrAddRHS;
+ if (!mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(Info.BaseReg), m_Reg(PtrAddRHS)))) {
+ Info.BaseReg = Ptr;
+ Info.IndexReg = Register();
+ Info.IsIndexSignExt = false;
+ return Info;
+ }
+
+ auto RHSCst = getIConstantVRegValWithLookThrough(PtrAddRHS, MRI);
+ if (RHSCst)
+ Info.Offset = RHSCst->Value.getSExtValue();
+
+ // Just recognize a simple case for now. In future we'll need to match
+ // indexing patterns for base + index + constant.
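+  // E.g. for %p = G_PTR_ADD %base, %idx we record BaseReg = %base and
+  // IndexReg = %idx, with Offset filled in above when %idx is a constant.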
+ Info.IndexReg = PtrAddRHS;
+ Info.IsIndexSignExt = false;
+ return Info;
+}
+
+bool GISelAddressing::aliasIsKnownForLoadStore(const MachineInstr &MI1,
+ const MachineInstr &MI2,
+ bool &IsAlias,
+ MachineRegisterInfo &MRI) {
+ auto *LdSt1 = dyn_cast<GLoadStore>(&MI1);
+ auto *LdSt2 = dyn_cast<GLoadStore>(&MI2);
+ if (!LdSt1 || !LdSt2)
+ return false;
+
+ BaseIndexOffset BasePtr0 = getPointerInfo(LdSt1->getPointerReg(), MRI);
+ BaseIndexOffset BasePtr1 = getPointerInfo(LdSt2->getPointerReg(), MRI);
+
+ if (!BasePtr0.BaseReg.isValid() || !BasePtr1.BaseReg.isValid())
+ return false;
+
+ int64_t Size1 = LdSt1->getMemSize();
+ int64_t Size2 = LdSt2->getMemSize();
+
+ int64_t PtrDiff;
+ if (BasePtr0.BaseReg == BasePtr1.BaseReg) {
+ PtrDiff = BasePtr1.Offset - BasePtr0.Offset;
+ // If the size of memory access is unknown, do not use it to do analysis.
+ // One example of unknown size memory access is to load/store scalable
+ // vector objects on the stack.
+ // BasePtr1 is PtrDiff away from BasePtr0. They alias if none of the
+ // following situations arise:
+ if (PtrDiff >= 0 &&
+ Size1 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+ // [----BasePtr0----]
+ // [---BasePtr1--]
+ // ========PtrDiff========>
+ IsAlias = !(Size1 <= PtrDiff);
+ return true;
+ }
+ if (PtrDiff < 0 &&
+ Size2 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+ // [----BasePtr0----]
+ // [---BasePtr1--]
+ // =====(-PtrDiff)====>
+ IsAlias = !((PtrDiff + Size2) <= 0);
+ return true;
+ }
+ return false;
+ }
+
+ // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
+ // able to calculate their relative offset if at least one arises
+ // from an alloca. However, these allocas cannot overlap and we
+ // can infer there is no alias.
+ auto *Base0Def = getDefIgnoringCopies(BasePtr0.BaseReg, MRI);
+ auto *Base1Def = getDefIgnoringCopies(BasePtr1.BaseReg, MRI);
+ if (!Base0Def || !Base1Def)
+ return false; // Couldn't tell anything.
+
+ if (Base0Def->getOpcode() != Base1Def->getOpcode())
+ return false;
+
+ if (Base0Def->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ MachineFrameInfo &MFI = Base0Def->getMF()->getFrameInfo();
+    // If the bases are different frame indices and at least one of them is
+    // not a fixed object (i.e. it comes from an alloca), the stack objects
+    // cannot overlap, so there is no alias.
+ if (Base0Def != Base1Def &&
+ (!MFI.isFixedObjectIndex(Base0Def->getOperand(1).getIndex()) ||
+ !MFI.isFixedObjectIndex(Base1Def->getOperand(1).getIndex()))) {
+ IsAlias = false;
+ return true;
+ }
+ }
+
+ // This implementation is a lot more primitive than the SDAG one for now.
+ // FIXME: what about constant pools?
+ if (Base0Def->getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
+ auto GV0 = Base0Def->getOperand(1).getGlobal();
+ auto GV1 = Base1Def->getOperand(1).getGlobal();
+ if (GV0 != GV1) {
+ IsAlias = false;
+ return true;
+ }
+ }
+
+ // Can't tell anything about aliasing.
+ return false;
+}
+
+bool GISelAddressing::instMayAlias(const MachineInstr &MI,
+ const MachineInstr &Other,
+ MachineRegisterInfo &MRI,
+ AliasAnalysis *AA) {
+ struct MemUseCharacteristics {
+ bool IsVolatile;
+ bool IsAtomic;
+ Register BasePtr;
+ int64_t Offset;
+ uint64_t NumBytes;
+ MachineMemOperand *MMO;
+ };
+
+ auto getCharacteristics =
+ [&](const MachineInstr *MI) -> MemUseCharacteristics {
+ if (const auto *LS = dyn_cast<GLoadStore>(MI)) {
+ Register BaseReg;
+ int64_t Offset = 0;
+ // No pre/post-inc addressing modes are considered here, unlike in SDAG.
+ if (!mi_match(LS->getPointerReg(), MRI,
+ m_GPtrAdd(m_Reg(BaseReg), m_ICst(Offset)))) {
+ BaseReg = LS->getPointerReg();
+ Offset = 0;
+ }
+
+ uint64_t Size = MemoryLocation::getSizeOrUnknown(
+ LS->getMMO().getMemoryType().getSizeInBytes());
+ return {LS->isVolatile(), LS->isAtomic(), BaseReg,
+ Offset /*base offset*/, Size, &LS->getMMO()};
+ }
+ // FIXME: support recognizing lifetime instructions.
+ // Default.
+ return {false /*isvolatile*/,
+ /*isAtomic*/ false, Register(),
+ (int64_t)0 /*offset*/, 0 /*size*/,
+ (MachineMemOperand *)nullptr};
+ };
+ MemUseCharacteristics MUC0 = getCharacteristics(&MI),
+ MUC1 = getCharacteristics(&Other);
+
+ // If they are to the same address, then they must be aliases.
+ if (MUC0.BasePtr.isValid() && MUC0.BasePtr == MUC1.BasePtr &&
+ MUC0.Offset == MUC1.Offset)
+ return true;
+
+ // If they are both volatile then they cannot be reordered.
+ if (MUC0.IsVolatile && MUC1.IsVolatile)
+ return true;
+
+ // Be conservative about atomics for the moment
+ // TODO: This is way overconservative for unordered atomics (see D66309)
+ if (MUC0.IsAtomic && MUC1.IsAtomic)
+ return true;
+
+ // If one operation reads from invariant memory, and the other may store, they
+ // cannot alias.
+ if (MUC0.MMO && MUC1.MMO) {
+ if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
+ (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
+ return false;
+ }
+
+ // Try to prove that there is aliasing, or that there is no aliasing. Either
+ // way, we can return now. If nothing can be proved, proceed with more tests.
+ bool IsAlias;
+ if (GISelAddressing::aliasIsKnownForLoadStore(MI, Other, IsAlias, MRI))
+ return IsAlias;
+
+ // The following all rely on MMO0 and MMO1 being valid.
+ if (!MUC0.MMO || !MUC1.MMO)
+ return true;
+
+ // FIXME: port the alignment based alias analysis from SDAG's isAlias().
+ int64_t SrcValOffset0 = MUC0.MMO->getOffset();
+ int64_t SrcValOffset1 = MUC1.MMO->getOffset();
+ uint64_t Size0 = MUC0.NumBytes;
+ uint64_t Size1 = MUC1.NumBytes;
+ if (AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
+ Size0 != MemoryLocation::UnknownSize &&
+ Size1 != MemoryLocation::UnknownSize) {
+ // Use alias analysis information.
+ int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
+ int64_t Overlap0 = Size0 + SrcValOffset0 - MinOffset;
+ int64_t Overlap1 = Size1 + SrcValOffset1 - MinOffset;
+ if (AA->isNoAlias(MemoryLocation(MUC0.MMO->getValue(), Overlap0,
+ MUC0.MMO->getAAInfo()),
+ MemoryLocation(MUC1.MMO->getValue(), Overlap1,
+ MUC1.MMO->getAAInfo())))
+ return false;
+ }
+
+ // Otherwise we have to assume they alias.
+ return true;
+}
+
+/// Returns true if the instruction creates an unavoidable hazard that
+/// forces a boundary between store merge candidates.
+static bool isInstHardMergeHazard(MachineInstr &MI) {
+ return MI.hasUnmodeledSideEffects() || MI.hasOrderedMemoryRef();
+}
+
+bool LoadStoreOpt::mergeStores(SmallVectorImpl<GStore *> &StoresToMerge) {
+ // Try to merge all the stores in the vector, splitting into separate segments
+ // as necessary.
+ assert(StoresToMerge.size() > 1 && "Expected multiple stores to merge");
+ LLT OrigTy = MRI->getType(StoresToMerge[0]->getValueReg());
+ LLT PtrTy = MRI->getType(StoresToMerge[0]->getPointerReg());
+ unsigned AS = PtrTy.getAddressSpace();
+ // Ensure the legal store info is computed for this address space.
+ initializeStoreMergeTargetInfo(AS);
+ const auto &LegalSizes = LegalStoreSizes[AS];
+
+#ifndef NDEBUG
+ for (auto *StoreMI : StoresToMerge)
+ assert(MRI->getType(StoreMI->getValueReg()) == OrigTy);
+#endif
+
+ const auto &DL = MF->getFunction().getParent()->getDataLayout();
+ bool AnyMerged = false;
+ do {
+ unsigned NumPow2 = llvm::bit_floor(StoresToMerge.size());
+ unsigned MaxSizeBits = NumPow2 * OrigTy.getSizeInBits().getFixedValue();
+ // Compute the biggest store we can generate to handle the number of stores.
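+    // E.g. four adjacent s8 stores give MaxSizeBits = 32; if a 32-bit store
+    // is not legal or can't be merged to, we retry at 16 bits and merge only
+    // two of the stores in this round.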
+ unsigned MergeSizeBits;
+ for (MergeSizeBits = MaxSizeBits; MergeSizeBits > 1; MergeSizeBits /= 2) {
+ LLT StoreTy = LLT::scalar(MergeSizeBits);
+ EVT StoreEVT =
+ getApproximateEVTForLLT(StoreTy, DL, MF->getFunction().getContext());
+ if (LegalSizes.size() > MergeSizeBits && LegalSizes[MergeSizeBits] &&
+ TLI->canMergeStoresTo(AS, StoreEVT, *MF) &&
+ (TLI->isTypeLegal(StoreEVT)))
+ break; // We can generate a MergeSize bits store.
+ }
+ if (MergeSizeBits <= OrigTy.getSizeInBits())
+ return AnyMerged; // No greater merge.
+
+ unsigned NumStoresToMerge = MergeSizeBits / OrigTy.getSizeInBits();
+ // Perform the actual merging.
+ SmallVector<GStore *, 8> SingleMergeStores(
+ StoresToMerge.begin(), StoresToMerge.begin() + NumStoresToMerge);
+ AnyMerged |= doSingleStoreMerge(SingleMergeStores);
+ StoresToMerge.erase(StoresToMerge.begin(),
+ StoresToMerge.begin() + NumStoresToMerge);
+ } while (StoresToMerge.size() > 1);
+ return AnyMerged;
+}
+
+bool LoadStoreOpt::isLegalOrBeforeLegalizer(const LegalityQuery &Query,
+ MachineFunction &MF) const {
+ auto Action = LI->getAction(Query).Action;
+ // If the instruction is unsupported, it can't be legalized at all.
+ if (Action == LegalizeActions::Unsupported)
+ return false;
+ return IsPreLegalizer || Action == LegalizeAction::Legal;
+}
+
+bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) {
+ assert(Stores.size() > 1);
+ // We know that all the stores are consecutive and there are no aliasing
+ // operations in the range. However, the values that are being stored may be
+ // generated anywhere before each store. To ensure we have the values
+ // available, we materialize the wide value and new store at the place of the
+ // final store in the merge sequence.
+ GStore *FirstStore = Stores[0];
+ const unsigned NumStores = Stores.size();
+ LLT SmallTy = MRI->getType(FirstStore->getValueReg());
+ LLT WideValueTy =
+ LLT::scalar(NumStores * SmallTy.getSizeInBits().getFixedValue());
+
+ // For each store, compute pairwise merged debug locs.
+ DebugLoc MergedLoc = Stores.front()->getDebugLoc();
+ for (auto *Store : drop_begin(Stores))
+ MergedLoc = DILocation::getMergedLocation(MergedLoc, Store->getDebugLoc());
+
+ Builder.setInstr(*Stores.back());
+ Builder.setDebugLoc(MergedLoc);
+
+ // If all of the store values are constants, then create a wide constant
+ // directly. Otherwise, we need to generate some instructions to merge the
+ // existing values together into a wider type.
+ SmallVector<APInt, 8> ConstantVals;
+ for (auto *Store : Stores) {
+ auto MaybeCst =
+ getIConstantVRegValWithLookThrough(Store->getValueReg(), *MRI);
+ if (!MaybeCst) {
+ ConstantVals.clear();
+ break;
+ }
+ ConstantVals.emplace_back(MaybeCst->Value);
+ }
+
+ Register WideReg;
+ auto *WideMMO =
+ MF->getMachineMemOperand(&FirstStore->getMMO(), 0, WideValueTy);
+ if (ConstantVals.empty()) {
+ // Mimic the SDAG behaviour here and don't try to do anything for unknown
+ // values. In future, we should also support the cases of loads and
+ // extracted vector elements.
+ return false;
+ }
+
+ assert(ConstantVals.size() == NumStores);
+ // Check if our wide constant is legal.
+ if (!isLegalOrBeforeLegalizer({TargetOpcode::G_CONSTANT, {WideValueTy}}, *MF))
+ return false;
+ APInt WideConst(WideValueTy.getSizeInBits(), 0);
+ for (unsigned Idx = 0; Idx < ConstantVals.size(); ++Idx) {
+ // Insert the smaller constant into the corresponding position in the
+ // wider one.
+ WideConst.insertBits(ConstantVals[Idx], Idx * SmallTy.getSizeInBits());
+ }
+ WideReg = Builder.buildConstant(WideValueTy, WideConst).getReg(0);
+ auto NewStore =
+ Builder.buildStore(WideReg, FirstStore->getPointerReg(), *WideMMO);
+ (void) NewStore;
+ LLVM_DEBUG(dbgs() << "Merged " << Stores.size()
+ << " stores into merged store: " << *NewStore);
+ LLVM_DEBUG(for (auto *MI : Stores) dbgs() << " " << *MI;);
+ NumStoresMerged += Stores.size();
+
+ MachineOptimizationRemarkEmitter MORE(*MF, nullptr);
+ MORE.emit([&]() {
+ MachineOptimizationRemark R(DEBUG_TYPE, "MergedStore",
+ FirstStore->getDebugLoc(),
+ FirstStore->getParent());
+ R << "Merged " << NV("NumMerged", Stores.size()) << " stores of "
+ << NV("OrigWidth", SmallTy.getSizeInBytes())
+ << " bytes into a single store of "
+ << NV("NewWidth", WideValueTy.getSizeInBytes()) << " bytes";
+ return R;
+ });
+
+ for (auto *MI : Stores)
+ InstsToErase.insert(MI);
+ return true;
+}
+
+bool LoadStoreOpt::processMergeCandidate(StoreMergeCandidate &C) {
+ if (C.Stores.size() < 2) {
+ C.reset();
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Checking store merge candidate with " << C.Stores.size()
+ << " stores, starting with " << *C.Stores[0]);
+ // We know that the stores in the candidate are adjacent.
+ // Now we need to check if any potential aliasing instructions recorded
+ // during the search alias with load/stores added to the candidate after.
+ // For example, if we have the candidate:
+ // C.Stores = [ST1, ST2, ST3, ST4]
+ // and after seeing ST2 we saw a load LD1, which did not alias with ST1 or
+ // ST2, then we would have recorded it into the PotentialAliases structure
+ // with the associated index value of "1". Then we see ST3 and ST4 and add
+ // them to the candidate group. We know that LD1 does not alias with ST1 or
+ // ST2, since we already did that check. However we don't yet know if it
+ // may alias ST3 and ST4, so we perform those checks now.
+ SmallVector<GStore *> StoresToMerge;
+
+ auto DoesStoreAliasWithPotential = [&](unsigned Idx, GStore &CheckStore) {
+ for (auto AliasInfo : reverse(C.PotentialAliases)) {
+ MachineInstr *PotentialAliasOp = AliasInfo.first;
+ unsigned PreCheckedIdx = AliasInfo.second;
+ if (static_cast<unsigned>(Idx) < PreCheckedIdx) {
+ // Once our store index is lower than the index associated with the
+ // potential alias, we know that we've already checked for this alias
+ // and all of the earlier potential aliases too.
+ return false;
+ }
+ // Need to check this alias.
+ if (GISelAddressing::instMayAlias(CheckStore, *PotentialAliasOp, *MRI,
+ AA)) {
+ LLVM_DEBUG(dbgs() << "Potential alias " << *PotentialAliasOp
+ << " detected\n");
+ return true;
+ }
+ }
+ return false;
+ };
+ // Start from the last store in the group, and check if it aliases with any
+ // of the potential aliasing operations in the list.
+ for (int StoreIdx = C.Stores.size() - 1; StoreIdx >= 0; --StoreIdx) {
+ auto *CheckStore = C.Stores[StoreIdx];
+ if (DoesStoreAliasWithPotential(StoreIdx, *CheckStore))
+ continue;
+ StoresToMerge.emplace_back(CheckStore);
+ }
+
+ LLVM_DEBUG(dbgs() << StoresToMerge.size()
+ << " stores remaining after alias checks. Merging...\n");
+
+ // Now we've checked for aliasing hazards, merge any stores left.
+ C.reset();
+ if (StoresToMerge.size() < 2)
+ return false;
+ return mergeStores(StoresToMerge);
+}
+
+bool LoadStoreOpt::operationAliasesWithCandidate(MachineInstr &MI,
+ StoreMergeCandidate &C) {
+ if (C.Stores.empty())
+ return false;
+ return llvm::any_of(C.Stores, [&](MachineInstr *OtherMI) {
+ return instMayAlias(MI, *OtherMI, *MRI, AA);
+ });
+}
+
+void LoadStoreOpt::StoreMergeCandidate::addPotentialAlias(MachineInstr &MI) {
+ PotentialAliases.emplace_back(std::make_pair(&MI, Stores.size() - 1));
+}
+
+bool LoadStoreOpt::addStoreToCandidate(GStore &StoreMI,
+ StoreMergeCandidate &C) {
+  // Check if the given store writes to an address adjacent to the candidate
+  // and meets the other requirements for merging.
+ LLT ValueTy = MRI->getType(StoreMI.getValueReg());
+ LLT PtrTy = MRI->getType(StoreMI.getPointerReg());
+
+ // Only handle scalars.
+ if (!ValueTy.isScalar())
+ return false;
+
+ // Don't allow truncating stores for now.
+ if (StoreMI.getMemSizeInBits() != ValueTy.getSizeInBits())
+ return false;
+
+ // Avoid adding volatile or ordered stores to the candidate. We already have a
+  // check for this in instMayAlias(), but that only gets called later, when
+  // checking against potential aliasing hazards.
+ if (!StoreMI.isSimple())
+ return false;
+
+ Register StoreAddr = StoreMI.getPointerReg();
+ auto BIO = getPointerInfo(StoreAddr, *MRI);
+ Register StoreBase = BIO.BaseReg;
+ uint64_t StoreOffCst = BIO.Offset;
+ if (C.Stores.empty()) {
+ // This is the first store of the candidate.
+ // If the offset can't possibly allow for a lower addressed store with the
+ // same base, don't bother adding it.
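+    // The block is walked bottom-up, so a candidate only grows toward lower
+    // addresses; a first store less than one value-width above offset zero
+    // leaves no room for an adjacent lower store.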
+ if (StoreOffCst < ValueTy.getSizeInBytes())
+ return false;
+ C.BasePtr = StoreBase;
+ C.CurrentLowestOffset = StoreOffCst;
+ C.Stores.emplace_back(&StoreMI);
+ LLVM_DEBUG(dbgs() << "Starting a new merge candidate group with: "
+ << StoreMI);
+ return true;
+ }
+
+ // Check the store is the same size as the existing ones in the candidate.
+ if (MRI->getType(C.Stores[0]->getValueReg()).getSizeInBits() !=
+ ValueTy.getSizeInBits())
+ return false;
+
+ if (MRI->getType(C.Stores[0]->getPointerReg()).getAddressSpace() !=
+ PtrTy.getAddressSpace())
+ return false;
+
+  // There are other stores in the candidate. Check that this store writes to
+  // the next lowest adjacent address.
+ if (C.BasePtr != StoreBase)
+ return false;
+ if ((C.CurrentLowestOffset - ValueTy.getSizeInBytes()) !=
+ static_cast<uint64_t>(StoreOffCst))
+ return false;
+
+ // This writes to an adjacent address. Allow it.
+ C.Stores.emplace_back(&StoreMI);
+ C.CurrentLowestOffset = C.CurrentLowestOffset - ValueTy.getSizeInBytes();
+ LLVM_DEBUG(dbgs() << "Candidate added store: " << StoreMI);
+ return true;
+}
+
+bool LoadStoreOpt::mergeBlockStores(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ // Walk through the block bottom-up, looking for merging candidates.
+ StoreMergeCandidate Candidate;
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ if (InstsToErase.contains(&MI))
+ continue;
+
+ if (auto *StoreMI = dyn_cast<GStore>(&MI)) {
+ // We have a G_STORE. Add it to the candidate if it writes to an adjacent
+ // address.
+ if (!addStoreToCandidate(*StoreMI, Candidate)) {
+ // Store wasn't eligible to be added. May need to record it as a
+ // potential alias.
+ if (operationAliasesWithCandidate(*StoreMI, Candidate)) {
+ Changed |= processMergeCandidate(Candidate);
+ continue;
+ }
+ Candidate.addPotentialAlias(*StoreMI);
+ }
+ continue;
+ }
+
+ // If we don't have any stores yet, this instruction can't pose a problem.
+ if (Candidate.Stores.empty())
+ continue;
+
+ // We're dealing with some other kind of instruction.
+ if (isInstHardMergeHazard(MI)) {
+ Changed |= processMergeCandidate(Candidate);
+ Candidate.Stores.clear();
+ continue;
+ }
+
+ if (!MI.mayLoadOrStore())
+ continue;
+
+ if (operationAliasesWithCandidate(MI, Candidate)) {
+ // We have a potential alias, so process the current candidate if we can
+ // and then continue looking for a new candidate.
+ Changed |= processMergeCandidate(Candidate);
+ continue;
+ }
+
+ // Record this instruction as a potential alias for future stores that are
+ // added to the candidate.
+ Candidate.addPotentialAlias(MI);
+ }
+
+ // Process any candidate left after finishing searching the entire block.
+ Changed |= processMergeCandidate(Candidate);
+
+ // Erase instructions now that we're no longer iterating over the block.
+ for (auto *MI : InstsToErase)
+ MI->eraseFromParent();
+ InstsToErase.clear();
+ return Changed;
+}
+
+/// Check if the store \p Store is a truncstore that can be merged. That is,
+/// it's a store of a shifted value of \p SrcVal. If \p SrcVal is an empty
+/// Register then it does not need to match and SrcVal is set to the source
+/// value found.
+/// On match, returns the start byte offset of the \p SrcVal that is being
+/// stored.
+static std::optional<int64_t>
+getTruncStoreByteOffset(GStore &Store, Register &SrcVal,
+ MachineRegisterInfo &MRI) {
+ Register TruncVal;
+ if (!mi_match(Store.getValueReg(), MRI, m_GTrunc(m_Reg(TruncVal))))
+ return std::nullopt;
+
+ // The shift amount must be a constant multiple of the narrow type.
+ // It is translated to the offset address in the wide source value "y".
+ //
+ // x = G_LSHR y, ShiftAmtC
+ // s8 z = G_TRUNC x
+ // store z, ...
+ Register FoundSrcVal;
+ int64_t ShiftAmt;
+ if (!mi_match(TruncVal, MRI,
+ m_any_of(m_GLShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt)),
+ m_GAShr(m_Reg(FoundSrcVal), m_ICst(ShiftAmt))))) {
+ if (!SrcVal.isValid() || TruncVal == SrcVal) {
+ if (!SrcVal.isValid())
+ SrcVal = TruncVal;
+ return 0; // If it's the lowest index store.
+ }
+ return std::nullopt;
+ }
+
+ unsigned NarrowBits = Store.getMMO().getMemoryType().getScalarSizeInBits();
+ if (ShiftAmt % NarrowBits != 0)
+ return std::nullopt;
+ const unsigned Offset = ShiftAmt / NarrowBits;
+
+ if (SrcVal.isValid() && FoundSrcVal != SrcVal)
+ return std::nullopt;
+
+ if (!SrcVal.isValid())
+ SrcVal = FoundSrcVal;
+ else if (MRI.getType(SrcVal) != MRI.getType(FoundSrcVal))
+ return std::nullopt;
+ return Offset;
+}
+
+/// Match a pattern where a wide type scalar value is stored by several narrow
+/// stores. Fold it into a single store or a BSWAP and a store if the targets
+/// supports it.
+///
+/// Assuming little endian target:
+/// i8 *p = ...
+/// i32 val = ...
+/// p[0] = (val >> 0) & 0xFF;
+/// p[1] = (val >> 8) & 0xFF;
+/// p[2] = (val >> 16) & 0xFF;
+/// p[3] = (val >> 24) & 0xFF;
+/// =>
+/// *((i32)p) = val;
+///
+/// i8 *p = ...
+/// i32 val = ...
+/// p[0] = (val >> 24) & 0xFF;
+/// p[1] = (val >> 16) & 0xFF;
+/// p[2] = (val >> 8) & 0xFF;
+/// p[3] = (val >> 0) & 0xFF;
+/// =>
+/// *((i32)p) = BSWAP(val);
+bool LoadStoreOpt::mergeTruncStore(GStore &StoreMI,
+ SmallPtrSetImpl<GStore *> &DeletedStores) {
+ LLT MemTy = StoreMI.getMMO().getMemoryType();
+
+ // We only handle merging simple stores of 1-4 bytes.
+ if (!MemTy.isScalar())
+ return false;
+ switch (MemTy.getSizeInBits()) {
+ case 8:
+ case 16:
+ case 32:
+ break;
+ default:
+ return false;
+ }
+ if (!StoreMI.isSimple())
+ return false;
+
+ // We do a simple search for mergeable stores prior to this one.
+ // Any potential alias hazard along the way terminates the search.
+ SmallVector<GStore *> FoundStores;
+
+ // We're looking for:
+ // 1) a (store(trunc(...)))
+ // 2) of an LSHR/ASHR of a single wide value, by the appropriate shift to get
+ // the partial value stored.
+ // 3) where the offsets form either a little or big-endian sequence.
+
+ auto &LastStore = StoreMI;
+
+ // The single base pointer that all stores must use.
+ Register BaseReg;
+ int64_t LastOffset;
+ if (!mi_match(LastStore.getPointerReg(), *MRI,
+ m_GPtrAdd(m_Reg(BaseReg), m_ICst(LastOffset)))) {
+ BaseReg = LastStore.getPointerReg();
+ LastOffset = 0;
+ }
+
+ GStore *LowestIdxStore = &LastStore;
+ int64_t LowestIdxOffset = LastOffset;
+
+ Register WideSrcVal;
+ auto LowestShiftAmt = getTruncStoreByteOffset(LastStore, WideSrcVal, *MRI);
+ if (!LowestShiftAmt)
+ return false; // Didn't match a trunc.
+ assert(WideSrcVal.isValid());
+
+ LLT WideStoreTy = MRI->getType(WideSrcVal);
+ // The wide type might not be a multiple of the memory type, e.g. s48 and s32.
+ if (WideStoreTy.getSizeInBits() % MemTy.getSizeInBits() != 0)
+ return false;
+ const unsigned NumStoresRequired =
+ WideStoreTy.getSizeInBits() / MemTy.getSizeInBits();
+
+ SmallVector<int64_t, 8> OffsetMap(NumStoresRequired, INT64_MAX);
+ OffsetMap[*LowestShiftAmt] = LastOffset;
+ FoundStores.emplace_back(&LastStore);
+
+ const int MaxInstsToCheck = 10;
+ int NumInstsChecked = 0;
+ for (auto II = ++LastStore.getReverseIterator();
+ II != LastStore.getParent()->rend() && NumInstsChecked < MaxInstsToCheck;
+ ++II) {
+ NumInstsChecked++;
+ GStore *NewStore;
+ if ((NewStore = dyn_cast<GStore>(&*II))) {
+ if (NewStore->getMMO().getMemoryType() != MemTy || !NewStore->isSimple())
+ break;
+ } else if (II->isLoadFoldBarrier() || II->mayLoad()) {
+ break;
+ } else {
+ continue; // This is a safe instruction we can look past.
+ }
+
+ Register NewBaseReg;
+ int64_t MemOffset;
+ // Check we're storing to the same base + some offset.
+ if (!mi_match(NewStore->getPointerReg(), *MRI,
+ m_GPtrAdd(m_Reg(NewBaseReg), m_ICst(MemOffset)))) {
+ NewBaseReg = NewStore->getPointerReg();
+ MemOffset = 0;
+ }
+ if (BaseReg != NewBaseReg)
+ break;
+
+ auto ShiftByteOffset = getTruncStoreByteOffset(*NewStore, WideSrcVal, *MRI);
+ if (!ShiftByteOffset)
+ break;
+ if (MemOffset < LowestIdxOffset) {
+ LowestIdxOffset = MemOffset;
+ LowestIdxStore = NewStore;
+ }
+
+    // Record this store's memory offset against its byte offset in the
+    // combined value, and stop the search if that slot has already been filled.
+ if (*ShiftByteOffset < 0 || *ShiftByteOffset >= NumStoresRequired ||
+ OffsetMap[*ShiftByteOffset] != INT64_MAX)
+ break;
+ OffsetMap[*ShiftByteOffset] = MemOffset;
+
+ FoundStores.emplace_back(NewStore);
+ // Reset counter since we've found a matching inst.
+ NumInstsChecked = 0;
+ if (FoundStores.size() == NumStoresRequired)
+ break;
+ }
+
+ if (FoundStores.size() != NumStoresRequired) {
+ if (FoundStores.size() == 1)
+ return false;
+ // We didn't find enough stores to merge into the size of the original
+ // source value, but we may be able to generate a smaller store if we
+ // truncate the source value.
+ WideStoreTy = LLT::scalar(FoundStores.size() * MemTy.getScalarSizeInBits());
+ }
+
+ unsigned NumStoresFound = FoundStores.size();
+
+ const auto &DL = LastStore.getMF()->getDataLayout();
+ auto &C = LastStore.getMF()->getFunction().getContext();
+  // Check that a store of the wide type is both allowed and fast on the target.
+ unsigned Fast = 0;
+ bool Allowed = TLI->allowsMemoryAccess(
+ C, DL, WideStoreTy, LowestIdxStore->getMMO(), &Fast);
+ if (!Allowed || !Fast)
+ return false;
+
+ // Check if the pieces of the value are going to the expected places in memory
+ // to merge the stores.
+ unsigned NarrowBits = MemTy.getScalarSizeInBits();
+ auto checkOffsets = [&](bool MatchLittleEndian) {
+ if (MatchLittleEndian) {
+ for (unsigned i = 0; i != NumStoresFound; ++i)
+ if (OffsetMap[i] != i * (NarrowBits / 8) + LowestIdxOffset)
+ return false;
+ } else { // MatchBigEndian by reversing loop counter.
+ for (unsigned i = 0, j = NumStoresFound - 1; i != NumStoresFound;
+ ++i, --j)
+ if (OffsetMap[j] != i * (NarrowBits / 8) + LowestIdxOffset)
+ return false;
+ }
+ return true;
+ };
+
+ // Check if the offsets line up for the native data layout of this target.
+ bool NeedBswap = false;
+ bool NeedRotate = false;
+ if (!checkOffsets(DL.isLittleEndian())) {
+ // Special-case: check if byte offsets line up for the opposite endian.
+ if (NarrowBits == 8 && checkOffsets(DL.isBigEndian()))
+ NeedBswap = true;
+ else if (NumStoresFound == 2 && checkOffsets(DL.isBigEndian()))
+ NeedRotate = true;
+ else
+ return false;
+ }
+
+ if (NeedBswap &&
+ !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {WideStoreTy}}, *MF))
+ return false;
+ if (NeedRotate &&
+ !isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_ROTR, {WideStoreTy, WideStoreTy}}, *MF))
+ return false;
+
+ Builder.setInstrAndDebugLoc(StoreMI);
+
+ if (WideStoreTy != MRI->getType(WideSrcVal))
+ WideSrcVal = Builder.buildTrunc(WideStoreTy, WideSrcVal).getReg(0);
+
+ if (NeedBswap) {
+ WideSrcVal = Builder.buildBSwap(WideStoreTy, WideSrcVal).getReg(0);
+ } else if (NeedRotate) {
+ assert(WideStoreTy.getSizeInBits() % 2 == 0 &&
+ "Unexpected type for rotate");
+ auto RotAmt =
+ Builder.buildConstant(WideStoreTy, WideStoreTy.getSizeInBits() / 2);
+ WideSrcVal =
+ Builder.buildRotateRight(WideStoreTy, WideSrcVal, RotAmt).getReg(0);
+ }
+
+ Builder.buildStore(WideSrcVal, LowestIdxStore->getPointerReg(),
+ LowestIdxStore->getMMO().getPointerInfo(),
+ LowestIdxStore->getMMO().getAlign());
+
+ // Erase the old stores.
+ for (auto *ST : FoundStores) {
+ ST->eraseFromParent();
+ DeletedStores.insert(ST);
+ }
+ return true;
+}
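+
+// Illustrative sketch of the offset check above (made-up values; little-endian
+// target and a LowestIdxOffset of 0 are assumed): when an s32 value is stored
+// as four s8 pieces, OffsetMap maps each byte index of the wide value to the
+// memory offset it was stored at:
+//   OffsetMap = {0, 1, 2, 3}  -> native order, emit one plain s32 store.
+//   OffsetMap = {3, 2, 1, 0}  -> reversed order, emit G_BSWAP + s32 store.
+// Two s16 halves stored in swapped order ({2, 0} rather than {0, 2}) need no
+// byte swap; a G_ROTR by half the width (16 bits) reorders the halves instead.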
+
+bool LoadStoreOpt::mergeTruncStoresBlock(MachineBasicBlock &BB) {
+ bool Changed = false;
+ SmallVector<GStore *, 16> Stores;
+ SmallPtrSet<GStore *, 8> DeletedStores;
+  // Walk the block bottom-up so that the later stores, which can see all of
+  // the earlier candidates, are tried first.
+ for (MachineInstr &MI : llvm::reverse(BB))
+ if (auto *StoreMI = dyn_cast<GStore>(&MI))
+ Stores.emplace_back(StoreMI);
+
+ for (auto *StoreMI : Stores) {
+ if (DeletedStores.count(StoreMI))
+ continue;
+ if (mergeTruncStore(*StoreMI, DeletedStores))
+ Changed = true;
+ }
+ return Changed;
+}
+
+bool LoadStoreOpt::mergeFunctionStores(MachineFunction &MF) {
+ bool Changed = false;
+  for (auto &BB : MF) {
+ Changed |= mergeBlockStores(BB);
+ Changed |= mergeTruncStoresBlock(BB);
+ }
+
+ // Erase all dead instructions left over by the merging.
+ if (Changed) {
+ for (auto &BB : MF) {
+ for (auto &I : make_early_inc_range(make_range(BB.rbegin(), BB.rend()))) {
+ if (isTriviallyDead(I, *MRI))
+ I.eraseFromParent();
+ }
+ }
+ }
+
+ return Changed;
+}
+
+void LoadStoreOpt::initializeStoreMergeTargetInfo(unsigned AddrSpace) {
+ // Query the legalizer info to record what store types are legal.
+  // We record this because we don't want to bother trying to merge stores into
+  // illegal ones, which would just be split apart again by the legalizer.
+
+ if (LegalStoreSizes.count(AddrSpace)) {
+ assert(LegalStoreSizes[AddrSpace].any());
+ return; // Already cached sizes for this address space.
+ }
+
+ // Need to reserve at least MaxStoreSizeToForm + 1 bits.
+ BitVector LegalSizes(MaxStoreSizeToForm * 2);
+ const auto &LI = *MF->getSubtarget().getLegalizerInfo();
+ const auto &DL = MF->getFunction().getParent()->getDataLayout();
+ Type *IntPtrIRTy =
+ DL.getIntPtrType(MF->getFunction().getContext(), AddrSpace);
+ LLT PtrTy = getLLTForType(*IntPtrIRTy->getPointerTo(AddrSpace), DL);
+ // We assume that we're not going to be generating any stores wider than
+ // MaxStoreSizeToForm bits for now.
+ for (unsigned Size = 2; Size <= MaxStoreSizeToForm; Size *= 2) {
+ LLT Ty = LLT::scalar(Size);
+ SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
+ {{Ty, Ty.getSizeInBits(), AtomicOrdering::NotAtomic}});
+ SmallVector<LLT> StoreTys({Ty, PtrTy});
+ LegalityQuery Q(TargetOpcode::G_STORE, StoreTys, MemDescrs);
+ LegalizeActionStep ActionStep = LI.getAction(Q);
+ if (ActionStep.Action == LegalizeActions::Legal)
+ LegalSizes.set(Size);
+ }
+ assert(LegalSizes.any() && "Expected some store sizes to be legal!");
+ LegalStoreSizes[AddrSpace] = LegalSizes;
+}
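+
+// Illustrative sketch (assumed target, not a claim about any real subtarget):
+// if the legalizer reports G_STORE of s16, s32 and s64 as Legal in address
+// space 0, the loop above sets bits 16, 32 and 64 of the cached BitVector, so
+// a later LegalStoreSizes[0].test(64) allows the store-merging code to form a
+// single 64-bit store out of smaller adjacent ones.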
+
+bool LoadStoreOpt::runOnMachineFunction(MachineFunction &MF) {
+  // If the ISel pipeline failed, do not bother running this pass.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Begin memory optimizations for: " << MF.getName()
+ << '\n');
+
+ init(MF);
+ bool Changed = false;
+ Changed |= mergeFunctionStores(MF);
+
+ LegalStoreSizes.clear();
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
new file mode 100644
index 000000000000..55984423e5bc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -0,0 +1,220 @@
+//===- Localizer.cpp ---------------------- Localize some instrs -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the Localizer class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "localizer"
+
+using namespace llvm;
+
+char Localizer::ID = 0;
+INITIALIZE_PASS_BEGIN(Localizer, DEBUG_TYPE,
+ "Move/duplicate certain instructions close to their use",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(Localizer, DEBUG_TYPE,
+ "Move/duplicate certain instructions close to their use",
+ false, false)
+
+Localizer::Localizer(std::function<bool(const MachineFunction &)> F)
+ : MachineFunctionPass(ID), DoNotRunPass(F) {}
+
+Localizer::Localizer()
+ : Localizer([](const MachineFunction &) { return false; }) {}
+
+void Localizer::init(MachineFunction &MF) {
+ MRI = &MF.getRegInfo();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(MF.getFunction());
+}
+
+void Localizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
+ MachineBasicBlock *&InsertMBB) {
+ MachineInstr &MIUse = *MOUse.getParent();
+ InsertMBB = MIUse.getParent();
+ if (MIUse.isPHI())
+ InsertMBB = MIUse.getOperand(MOUse.getOperandNo() + 1).getMBB();
+ return InsertMBB == Def.getParent();
+}
+
+bool Localizer::isNonUniquePhiValue(MachineOperand &Op) const {
+ MachineInstr *MI = Op.getParent();
+ if (!MI->isPHI())
+ return false;
+
+ Register SrcReg = Op.getReg();
+ for (unsigned Idx = 1; Idx < MI->getNumOperands(); Idx += 2) {
+ auto &MO = MI->getOperand(Idx);
+ if (&MO != &Op && MO.isReg() && MO.getReg() == SrcReg)
+ return true;
+ }
+ return false;
+}
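+
+// Illustrative example (made-up MIR): for a PHI such as
+//   %d:_(s32) = G_PHI %v(s32), %bb.1, %v(s32), %bb.2, %w(s32), %bb.3
+// the operand carrying %v from %bb.1 is non-unique because %v also arrives
+// from %bb.2, so localizeInterBlock below skips localizing that use rather
+// than cloning the defining instruction into both predecessors.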
+
+bool Localizer::localizeInterBlock(MachineFunction &MF,
+ LocalizedSetVecT &LocalizedInstrs) {
+ bool Changed = false;
+ DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef;
+
+ // Since the IRTranslator only emits constants into the entry block, and the
+ // rest of the GISel pipeline generally emits constants close to their users,
+ // we only localize instructions in the entry block here. This might change if
+ // we start doing CSE across blocks.
+ auto &MBB = MF.front();
+ auto &TL = *MF.getSubtarget().getTargetLowering();
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ if (!TL.shouldLocalize(MI, TTI))
+ continue;
+ LLVM_DEBUG(dbgs() << "Should localize: " << MI);
+ assert(MI.getDesc().getNumDefs() == 1 &&
+ "More than one definition not supported yet");
+ Register Reg = MI.getOperand(0).getReg();
+ // Check if all the users of MI are local.
+    // We are going to invalidate the list of use operands, so we
+    // can't use a range-based iterator here.
+ for (MachineOperand &MOUse :
+ llvm::make_early_inc_range(MRI->use_operands(Reg))) {
+ // Check if the use is already local.
+ MachineBasicBlock *InsertMBB;
+ LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent();
+ dbgs() << "Checking use: " << MIUse
+ << " #Opd: " << MOUse.getOperandNo() << '\n');
+ if (isLocalUse(MOUse, MI, InsertMBB)) {
+ // Even if we're in the same block, if the block is very large we could
+ // still have many long live ranges. Try to do intra-block localization
+ // too.
+ LocalizedInstrs.insert(&MI);
+ continue;
+ }
+
+      // If the use is a PHI operand whose incoming value is repeated, don't
+      // try to localize it. Doing so would cause unnecessary instruction bloat
+      // by duplicating into each predecessor block, when the existing
+      // definition is sufficient and allows for easier optimization later.
+ if (isNonUniquePhiValue(MOUse))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
+ Changed = true;
+ auto MBBAndReg = std::make_pair(InsertMBB, Reg);
+ auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg);
+ if (NewVRegIt == MBBWithLocalDef.end()) {
+ // Create the localized instruction.
+ MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
+ LocalizedInstrs.insert(LocalizedMI);
+ MachineInstr &UseMI = *MOUse.getParent();
+ if (MRI->hasOneUse(Reg) && !UseMI.isPHI())
+ InsertMBB->insert(UseMI, LocalizedMI);
+ else
+ InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()),
+ LocalizedMI);
+
+ // Set a new register for the definition.
+ Register NewReg = MRI->cloneVirtualRegister(Reg);
+ LocalizedMI->getOperand(0).setReg(NewReg);
+ NewVRegIt =
+ MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
+ LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
+ }
+ LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second)
+ << '\n');
+ // Update the user reg.
+ MOUse.setReg(NewVRegIt->second);
+ }
+ }
+ return Changed;
+}
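+
+// Illustrative sketch of the inter-block step (register and block names are
+// made up): if the entry block defines %c:_(s64) = G_CONSTANT i64 42 and a use
+// sits in a later block %bb.3, the loop above clones the G_CONSTANT into
+// %bb.3, gives the clone a fresh vreg via cloneVirtualRegister, rewrites that
+// use to the new vreg, and caches the clone in MBBWithLocalDef so further uses
+// of %c within %bb.3 reuse it instead of triggering another clone.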
+
+bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
+ bool Changed = false;
+
+  // For each already-localized instruction which has multiple users, we scan
+  // the block top down from the current position until we hit one of them.
+
+ // FIXME: Consider doing inst duplication if live ranges are very long due to
+ // many users, but this case may be better served by regalloc improvements.
+
+ for (MachineInstr *MI : LocalizedInstrs) {
+ Register Reg = MI->getOperand(0).getReg();
+ MachineBasicBlock &MBB = *MI->getParent();
+ // All of the user MIs of this reg.
+ SmallPtrSet<MachineInstr *, 32> Users;
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
+ if (!UseMI.isPHI())
+ Users.insert(&UseMI);
+ }
+    // If all the users were PHIs then they're not going to be in our block,
+    // so don't try to move this instruction.
+ if (Users.empty())
+ continue;
+
+ MachineBasicBlock::iterator II(MI);
+ ++II;
+ while (II != MBB.end() && !Users.count(&*II))
+ ++II;
+
+ assert(II != MBB.end() && "Didn't find the user in the MBB");
+ LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
+ << '\n');
+
+ MI->removeFromParent();
+ MBB.insert(II, MI);
+ Changed = true;
+
+    // If the instruction (constant) being localized has a single user, we can
+    // propagate the debug location from that user.
+ if (Users.size() == 1) {
+ const auto &DefDL = MI->getDebugLoc();
+ const auto &UserDL = (*Users.begin())->getDebugLoc();
+
+ if ((!DefDL || DefDL.getLine() == 0) && UserDL && UserDL.getLine() != 0) {
+ MI->setDebugLoc(UserDL);
+ }
+ }
+ }
+ return Changed;
+}
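+
+// Illustrative sketch of the intra-block step (made-up MIR): if %c is defined
+// near the top of a block but first used much later,
+//   %c:_(s32) = G_CONSTANT i32 7
+//   ...                          ; many unrelated instructions
+//   %s:_(s32) = G_ADD %a, %c
+// the loop above unlinks the G_CONSTANT and reinserts it immediately before
+// the G_ADD, shrinking the live range of %c.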
+
+bool Localizer::runOnMachineFunction(MachineFunction &MF) {
+  // If the ISel pipeline failed, do not bother running this pass.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ // Don't run the pass if the target asked so.
+ if (DoNotRunPass(MF))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n');
+
+ init(MF);
+
+ // Keep track of the instructions we localized. We'll do a second pass of
+ // intra-block localization to further reduce live ranges.
+ LocalizedSetVecT LocalizedInstrs;
+
+ bool Changed = localizeInterBlock(MF, LocalizedInstrs);
+ Changed |= localizeIntraBlock(LocalizedInstrs);
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp
new file mode 100644
index 000000000000..6d606e5550f1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp
@@ -0,0 +1,113 @@
+//===----- llvm/CodeGen/GlobalISel/LostDebugLocObserver.cpp -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// Tracks DebugLocs between checkpoints and verifies that they are transferred.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
+
+using namespace llvm;
+
+#define LOC_DEBUG(X) DEBUG_WITH_TYPE(DebugType.str().c_str(), X)
+
+void LostDebugLocObserver::analyzeDebugLocations() {
+ if (LostDebugLocs.empty()) {
+ LOC_DEBUG(dbgs() << ".. No debug info was present\n");
+ return;
+ }
+ if (PotentialMIsForDebugLocs.empty()) {
+ LOC_DEBUG(
+ dbgs() << ".. No instructions to carry debug info (dead code?)\n");
+ return;
+ }
+
+ LOC_DEBUG(dbgs() << ".. Searching " << PotentialMIsForDebugLocs.size()
+ << " instrs for " << LostDebugLocs.size() << " locations\n");
+ SmallPtrSet<MachineInstr *, 4> FoundIn;
+ for (MachineInstr *MI : PotentialMIsForDebugLocs) {
+ if (!MI->getDebugLoc())
+ continue;
+ // Check this first in case there's a matching line-0 location on both input
+ // and output.
+ if (MI->getDebugLoc().getLine() == 0) {
+ LOC_DEBUG(
+ dbgs() << ".. Assuming line-0 location covers remainder (if any)\n");
+ return;
+ }
+ if (LostDebugLocs.erase(MI->getDebugLoc())) {
+ LOC_DEBUG(dbgs() << ".. .. found " << MI->getDebugLoc() << " in " << *MI);
+ FoundIn.insert(MI);
+ continue;
+ }
+ }
+ if (LostDebugLocs.empty())
+ return;
+
+ NumLostDebugLocs += LostDebugLocs.size();
+ LOC_DEBUG({
+ dbgs() << ".. Lost locations:\n";
+ for (const DebugLoc &Loc : LostDebugLocs) {
+ dbgs() << ".. .. ";
+ Loc.print(dbgs());
+ dbgs() << "\n";
+ }
+ dbgs() << ".. MIs with matched locations:\n";
+ for (MachineInstr *MI : FoundIn)
+ if (PotentialMIsForDebugLocs.erase(MI))
+ dbgs() << ".. .. " << *MI;
+ dbgs() << ".. Remaining MIs with unmatched/no locations:\n";
+ for (const MachineInstr *MI : PotentialMIsForDebugLocs)
+ dbgs() << ".. .. " << *MI;
+ });
+}
+
+void LostDebugLocObserver::checkpoint(bool CheckDebugLocs) {
+ if (CheckDebugLocs)
+ analyzeDebugLocations();
+ PotentialMIsForDebugLocs.clear();
+ LostDebugLocs.clear();
+}
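+
+// Typical use, as a sketch (the exact driver lives outside this file): a pass
+// that rewrites MIR installs this observer (e.g. via GISelObserverWrapper) and
+// calls checkpoint(true) after each transformation; any DebugLoc seen on erased
+// or changed instructions that does not reappear on a newly created instruction
+// is counted in NumLostDebugLocs and reported under this observer's debug type.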
+
+void LostDebugLocObserver::createdInstr(MachineInstr &MI) {
+ PotentialMIsForDebugLocs.insert(&MI);
+}
+
+static bool irTranslatorNeverAddsLocations(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return true;
+ }
+}
+
+void LostDebugLocObserver::erasingInstr(MachineInstr &MI) {
+ if (irTranslatorNeverAddsLocations(MI.getOpcode()))
+ return;
+
+ PotentialMIsForDebugLocs.erase(&MI);
+ if (MI.getDebugLoc())
+ LostDebugLocs.insert(MI.getDebugLoc());
+}
+
+void LostDebugLocObserver::changingInstr(MachineInstr &MI) {
+ if (irTranslatorNeverAddsLocations(MI.getOpcode()))
+ return;
+
+ PotentialMIsForDebugLocs.erase(&MI);
+ if (MI.getDebugLoc())
+ LostDebugLocs.insert(MI.getDebugLoc());
+}
+
+void LostDebugLocObserver::changedInstr(MachineInstr &MI) {
+ PotentialMIsForDebugLocs.insert(&MI);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
new file mode 100644
index 000000000000..962b54ec5d6b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -0,0 +1,1318 @@
+//===-- llvm/CodeGen/GlobalISel/MachineIRBuilder.cpp - MIBuilder--*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the MachineIRBuilder class.
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+
+using namespace llvm;
+
+void MachineIRBuilder::setMF(MachineFunction &MF) {
+ State.MF = &MF;
+ State.MBB = nullptr;
+ State.MRI = &MF.getRegInfo();
+ State.TII = MF.getSubtarget().getInstrInfo();
+ State.DL = DebugLoc();
+ State.PCSections = nullptr;
+ State.II = MachineBasicBlock::iterator();
+ State.Observer = nullptr;
+}
+
+//------------------------------------------------------------------------------
+// Build instruction variants.
+//------------------------------------------------------------------------------
+
+MachineInstrBuilder MachineIRBuilder::buildInstrNoInsert(unsigned Opcode) {
+ return BuildMI(getMF(), {getDL(), getPCSections()}, getTII().get(Opcode));
+}
+
+MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) {
+ getMBB().insert(getInsertPt(), MIB);
+ recordInsertion(MIB);
+ return MIB;
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildDirectDbgValue(Register Reg, const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(
+ cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
+ "Expected inlined-at fields to agree");
+ return insertInstr(BuildMI(getMF(), getDL(),
+ getTII().get(TargetOpcode::DBG_VALUE),
+ /*IsIndirect*/ false, Reg, Variable, Expr));
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildIndirectDbgValue(Register Reg, const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(
+ cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
+ "Expected inlined-at fields to agree");
+ return insertInstr(BuildMI(getMF(), getDL(),
+ getTII().get(TargetOpcode::DBG_VALUE),
+ /*IsIndirect*/ true, Reg, Variable, Expr));
+}
+
+MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
+ const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(
+ cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
+ "Expected inlined-at fields to agree");
+ return insertInstr(buildInstrNoInsert(TargetOpcode::DBG_VALUE)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMetadata(Variable)
+ .addMetadata(Expr));
+}
+
+MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
+ const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(
+ cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
+ "Expected inlined-at fields to agree");
+ auto MIB = buildInstrNoInsert(TargetOpcode::DBG_VALUE);
+
+  auto *NumericConstant = [&]() -> const Constant * {
+ if (const auto *CE = dyn_cast<ConstantExpr>(&C))
+ if (CE->getOpcode() == Instruction::IntToPtr)
+ return CE->getOperand(0);
+ return &C;
+ }();
+
+ if (auto *CI = dyn_cast<ConstantInt>(NumericConstant)) {
+ if (CI->getBitWidth() > 64)
+ MIB.addCImm(CI);
+ else
+ MIB.addImm(CI->getZExtValue());
+ } else if (auto *CFP = dyn_cast<ConstantFP>(NumericConstant)) {
+ MIB.addFPImm(CFP);
+ } else if (isa<ConstantPointerNull>(NumericConstant)) {
+ MIB.addImm(0);
+ } else {
+ // Insert $noreg if we didn't find a usable constant and had to drop it.
+ MIB.addReg(Register());
+ }
+
+ MIB.addImm(0).addMetadata(Variable).addMetadata(Expr);
+ return insertInstr(MIB);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) {
+ assert(isa<DILabel>(Label) && "not a label");
+ assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(State.DL) &&
+ "Expected inlined-at fields to agree");
+ auto MIB = buildInstr(TargetOpcode::DBG_LABEL);
+
+ return MIB.addMetadata(Label);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildDynStackAlloc(const DstOp &Res,
+ const SrcOp &Size,
+ Align Alignment) {
+ assert(Res.getLLTTy(*getMRI()).isPointer() && "expected ptr dst type");
+ auto MIB = buildInstr(TargetOpcode::G_DYN_STACKALLOC);
+ Res.addDefToMIB(*getMRI(), MIB);
+ Size.addSrcToMIB(MIB);
+ MIB.addImm(Alignment.value());
+ return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildFrameIndex(const DstOp &Res,
+ int Idx) {
+ assert(Res.getLLTTy(*getMRI()).isPointer() && "invalid operand type");
+ auto MIB = buildInstr(TargetOpcode::G_FRAME_INDEX);
+ Res.addDefToMIB(*getMRI(), MIB);
+ MIB.addFrameIndex(Idx);
+ return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildGlobalValue(const DstOp &Res,
+ const GlobalValue *GV) {
+ assert(Res.getLLTTy(*getMRI()).isPointer() && "invalid operand type");
+ assert(Res.getLLTTy(*getMRI()).getAddressSpace() ==
+ GV->getType()->getAddressSpace() &&
+ "address space mismatch");
+
+ auto MIB = buildInstr(TargetOpcode::G_GLOBAL_VALUE);
+ Res.addDefToMIB(*getMRI(), MIB);
+ MIB.addGlobalAddress(GV);
+ return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildConstantPool(const DstOp &Res,
+ unsigned Idx) {
+ assert(Res.getLLTTy(*getMRI()).isPointer() && "invalid operand type");
+ auto MIB = buildInstr(TargetOpcode::G_CONSTANT_POOL);
+ Res.addDefToMIB(*getMRI(), MIB);
+ MIB.addConstantPoolIndex(Idx);
+ return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildJumpTable(const LLT PtrTy,
+ unsigned JTI) {
+ return buildInstr(TargetOpcode::G_JUMP_TABLE, {PtrTy}, {})
+ .addJumpTableIndex(JTI);
+}
+
+void MachineIRBuilder::validateUnaryOp(const LLT Res, const LLT Op0) {
+ assert((Res.isScalar() || Res.isVector()) && "invalid operand type");
+ assert((Res == Op0) && "type mismatch");
+}
+
+void MachineIRBuilder::validateBinaryOp(const LLT Res, const LLT Op0,
+ const LLT Op1) {
+ assert((Res.isScalar() || Res.isVector()) && "invalid operand type");
+ assert((Res == Op0 && Res == Op1) && "type mismatch");
+}
+
+void MachineIRBuilder::validateShiftOp(const LLT Res, const LLT Op0,
+ const LLT Op1) {
+ assert((Res.isScalar() || Res.isVector()) && "invalid operand type");
+ assert((Res == Op0) && "type mismatch");
+}
+
+MachineInstrBuilder MachineIRBuilder::buildPtrAdd(const DstOp &Res,
+ const SrcOp &Op0,
+ const SrcOp &Op1) {
+ assert(Res.getLLTTy(*getMRI()).getScalarType().isPointer() &&
+ Res.getLLTTy(*getMRI()) == Op0.getLLTTy(*getMRI()) && "type mismatch");
+ assert(Op1.getLLTTy(*getMRI()).getScalarType().isScalar() && "invalid offset type");
+
+ return buildInstr(TargetOpcode::G_PTR_ADD, {Res}, {Op0, Op1});
+}
+
+std::optional<MachineInstrBuilder>
+MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
+ const LLT ValueTy, uint64_t Value) {
+ assert(Res == 0 && "Res is a result argument");
+ assert(ValueTy.isScalar() && "invalid offset type");
+
+ if (Value == 0) {
+ Res = Op0;
+ return std::nullopt;
+ }
+
+ Res = getMRI()->createGenericVirtualRegister(getMRI()->getType(Op0));
+ auto Cst = buildConstant(ValueTy, Value);
+ return buildPtrAdd(Res, Op0, Cst.getReg(0));
+}
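+
+// Usage sketch (hypothetical caller, shown only to illustrate the contract):
+//   Register NewAddr;
+//   if (auto PtrAdd = MIRBuilder.materializePtrAdd(NewAddr, BaseReg,
+//                                                  LLT::scalar(64), Offset))
+//     ... // A G_PTR_ADD was emitted; NewAddr holds its result.
+//   else
+//     ... // Offset was 0, so NewAddr simply aliases BaseReg.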
+
+MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res,
+ const SrcOp &Op0,
+ uint32_t NumBits) {
+ LLT PtrTy = Res.getLLTTy(*getMRI());
+ LLT MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ Register MaskReg = getMRI()->createGenericVirtualRegister(MaskTy);
+ buildConstant(MaskReg, maskTrailingZeros<uint64_t>(NumBits));
+ return buildPtrMask(Res, Op0, MaskReg);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildPadVectorWithUndefElements(const DstOp &Res,
+ const SrcOp &Op0) {
+ LLT ResTy = Res.getLLTTy(*getMRI());
+ LLT Op0Ty = Op0.getLLTTy(*getMRI());
+
+ assert(ResTy.isVector() && "Res non vector type");
+
+ SmallVector<Register, 8> Regs;
+ if (Op0Ty.isVector()) {
+ assert((ResTy.getElementType() == Op0Ty.getElementType()) &&
+ "Different vector element types");
+ assert((ResTy.getNumElements() > Op0Ty.getNumElements()) &&
+ "Op0 has more elements");
+ auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0);
+
+ for (auto Op : Unmerge.getInstr()->defs())
+ Regs.push_back(Op.getReg());
+ } else {
+ assert((ResTy.getSizeInBits() > Op0Ty.getSizeInBits()) &&
+           "Op0 is not narrower than Res");
+ Regs.push_back(Op0.getReg());
+ }
+ Register Undef =
+ buildUndef(Op0Ty.isVector() ? Op0Ty.getElementType() : Op0Ty).getReg(0);
+ unsigned NumberOfPadElts = ResTy.getNumElements() - Regs.size();
+ for (unsigned i = 0; i < NumberOfPadElts; ++i)
+ Regs.push_back(Undef);
+ return buildMergeLikeInstr(Res, Regs);
+}
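+
+// Illustrative sketch (assumed types): padding a <2 x s32> source out to a
+// <4 x s32> result unmerges the source into its two s32 elements, appends two
+// s32 G_IMPLICIT_DEF values, and rebuilds the result with G_BUILD_VECTOR. A
+// scalar s32 source instead contributes a single element plus three undefs.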
+
+MachineInstrBuilder
+MachineIRBuilder::buildDeleteTrailingVectorElements(const DstOp &Res,
+ const SrcOp &Op0) {
+ LLT ResTy = Res.getLLTTy(*getMRI());
+ LLT Op0Ty = Op0.getLLTTy(*getMRI());
+
+ assert((ResTy.isVector() && Op0Ty.isVector()) && "Non vector type");
+ assert((ResTy.getElementType() == Op0Ty.getElementType()) &&
+ "Different vector element types");
+ assert((ResTy.getNumElements() < Op0Ty.getNumElements()) &&
+ "Op0 has fewer elements");
+
+ SmallVector<Register, 8> Regs;
+ auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0);
+ for (unsigned i = 0; i < ResTy.getNumElements(); ++i)
+ Regs.push_back(Unmerge.getReg(i));
+ return buildMergeLikeInstr(Res, Regs);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
+ return buildInstr(TargetOpcode::G_BR).addMBB(&Dest);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildBrIndirect(Register Tgt) {
+ assert(getMRI()->getType(Tgt).isPointer() && "invalid branch destination");
+ return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildBrJT(Register TablePtr,
+ unsigned JTI,
+ Register IndexReg) {
+ assert(getMRI()->getType(TablePtr).isPointer() &&
+ "Table reg must be a pointer");
+ return buildInstr(TargetOpcode::G_BRJT)
+ .addUse(TablePtr)
+ .addJumpTableIndex(JTI)
+ .addUse(IndexReg);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildCopy(const DstOp &Res,
+ const SrcOp &Op) {
+ return buildInstr(TargetOpcode::COPY, Res, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res,
+ const ConstantInt &Val) {
+ LLT Ty = Res.getLLTTy(*getMRI());
+ LLT EltTy = Ty.getScalarType();
+ assert(EltTy.getScalarSizeInBits() == Val.getBitWidth() &&
+ "creating constant with the wrong size");
+
+ if (Ty.isVector()) {
+ auto Const = buildInstr(TargetOpcode::G_CONSTANT)
+ .addDef(getMRI()->createGenericVirtualRegister(EltTy))
+ .addCImm(&Val);
+ return buildSplatVector(Res, Const);
+ }
+
+ auto Const = buildInstr(TargetOpcode::G_CONSTANT);
+ Const->setDebugLoc(DebugLoc());
+ Res.addDefToMIB(*getMRI(), Const);
+ Const.addCImm(&Val);
+ return Const;
+}
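+
+// Illustrative sketch (assumed types): buildConstant with a <4 x s32> result
+// first materializes the value as a scalar s32 G_CONSTANT and then splats it
+// via buildSplatVector, i.e. a G_BUILD_VECTOR of four copies of that vreg; a
+// scalar destination gets a single G_CONSTANT directly.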
+
+MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res,
+ int64_t Val) {
+ auto IntN = IntegerType::get(getMF().getFunction().getContext(),
+ Res.getLLTTy(*getMRI()).getScalarSizeInBits());
+ ConstantInt *CI = ConstantInt::get(IntN, Val, true);
+ return buildConstant(Res, *CI);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildFConstant(const DstOp &Res,
+ const ConstantFP &Val) {
+ LLT Ty = Res.getLLTTy(*getMRI());
+ LLT EltTy = Ty.getScalarType();
+
+ assert(APFloat::getSizeInBits(Val.getValueAPF().getSemantics())
+ == EltTy.getSizeInBits() &&
+ "creating fconstant with the wrong size");
+
+ assert(!Ty.isPointer() && "invalid operand type");
+
+ if (Ty.isVector()) {
+ auto Const = buildInstr(TargetOpcode::G_FCONSTANT)
+ .addDef(getMRI()->createGenericVirtualRegister(EltTy))
+ .addFPImm(&Val);
+
+ return buildSplatVector(Res, Const);
+ }
+
+ auto Const = buildInstr(TargetOpcode::G_FCONSTANT);
+ Const->setDebugLoc(DebugLoc());
+ Res.addDefToMIB(*getMRI(), Const);
+ Const.addFPImm(&Val);
+ return Const;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res,
+ const APInt &Val) {
+ ConstantInt *CI = ConstantInt::get(getMF().getFunction().getContext(), Val);
+ return buildConstant(Res, *CI);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildFConstant(const DstOp &Res,
+ double Val) {
+ LLT DstTy = Res.getLLTTy(*getMRI());
+ auto &Ctx = getMF().getFunction().getContext();
+ auto *CFP =
+ ConstantFP::get(Ctx, getAPFloatFromSize(Val, DstTy.getScalarSizeInBits()));
+ return buildFConstant(Res, *CFP);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildFConstant(const DstOp &Res,
+ const APFloat &Val) {
+ auto &Ctx = getMF().getFunction().getContext();
+ auto *CFP = ConstantFP::get(Ctx, Val);
+ return buildFConstant(Res, *CFP);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildBrCond(const SrcOp &Tst,
+ MachineBasicBlock &Dest) {
+ assert(Tst.getLLTTy(*getMRI()).isScalar() && "invalid operand type");
+
+ auto MIB = buildInstr(TargetOpcode::G_BRCOND);
+ Tst.addSrcToMIB(MIB);
+ MIB.addMBB(&Dest);
+ return MIB;
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildLoad(const DstOp &Dst, const SrcOp &Addr,
+ MachinePointerInfo PtrInfo, Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ MMOFlags |= MachineMemOperand::MOLoad;
+ assert((MMOFlags & MachineMemOperand::MOStore) == 0);
+
+ LLT Ty = Dst.getLLTTy(*getMRI());
+ MachineMemOperand *MMO =
+ getMF().getMachineMemOperand(PtrInfo, MMOFlags, Ty, Alignment, AAInfo);
+ return buildLoad(Dst, Addr, *MMO);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildLoadInstr(unsigned Opcode,
+ const DstOp &Res,
+ const SrcOp &Addr,
+ MachineMemOperand &MMO) {
+ assert(Res.getLLTTy(*getMRI()).isValid() && "invalid operand type");
+ assert(Addr.getLLTTy(*getMRI()).isPointer() && "invalid operand type");
+
+ auto MIB = buildInstr(Opcode);
+ Res.addDefToMIB(*getMRI(), MIB);
+ Addr.addSrcToMIB(MIB);
+ MIB.addMemOperand(&MMO);
+ return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildLoadFromOffset(
+ const DstOp &Dst, const SrcOp &BasePtr,
+ MachineMemOperand &BaseMMO, int64_t Offset) {
+ LLT LoadTy = Dst.getLLTTy(*getMRI());
+ MachineMemOperand *OffsetMMO =
+ getMF().getMachineMemOperand(&BaseMMO, Offset, LoadTy);
+
+ if (Offset == 0) // This may be a size or type changing load.
+ return buildLoad(Dst, BasePtr, *OffsetMMO);
+
+ LLT PtrTy = BasePtr.getLLTTy(*getMRI());
+ LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
+ auto ConstOffset = buildConstant(OffsetTy, Offset);
+ auto Ptr = buildPtrAdd(PtrTy, BasePtr, ConstOffset);
+ return buildLoad(Dst, Ptr, *OffsetMMO);
+}
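+
+// Usage sketch (names and types are assumed, not taken from a real caller):
+//   // Load 32 bits from BasePtr + 4 bytes, deriving the narrower MMO from
+//   // the original operand's MMO.
+//   auto Part = MIRBuilder.buildLoadFromOffset(LLT::scalar(32), BasePtr,
+//                                              OrigMMO, /*Offset=*/4);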
+
+MachineInstrBuilder MachineIRBuilder::buildStore(const SrcOp &Val,
+ const SrcOp &Addr,
+ MachineMemOperand &MMO) {
+ assert(Val.getLLTTy(*getMRI()).isValid() && "invalid operand type");
+ assert(Addr.getLLTTy(*getMRI()).isPointer() && "invalid operand type");
+
+ auto MIB = buildInstr(TargetOpcode::G_STORE);
+ Val.addSrcToMIB(MIB);
+ Addr.addSrcToMIB(MIB);
+ MIB.addMemOperand(&MMO);
+ return MIB;
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildStore(const SrcOp &Val, const SrcOp &Addr,
+ MachinePointerInfo PtrInfo, Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ LLT Ty = Val.getLLTTy(*getMRI());
+ MachineMemOperand *MMO =
+ getMF().getMachineMemOperand(PtrInfo, MMOFlags, Ty, Alignment, AAInfo);
+ return buildStore(Val, Addr, *MMO);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildAnyExt(const DstOp &Res,
+ const SrcOp &Op) {
+ return buildInstr(TargetOpcode::G_ANYEXT, Res, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildSExt(const DstOp &Res,
+ const SrcOp &Op) {
+ return buildInstr(TargetOpcode::G_SEXT, Res, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildZExt(const DstOp &Res,
+ const SrcOp &Op) {
+ return buildInstr(TargetOpcode::G_ZEXT, Res, Op);
+}
+
+unsigned MachineIRBuilder::getBoolExtOp(bool IsVec, bool IsFP) const {
+ const auto *TLI = getMF().getSubtarget().getTargetLowering();
+ switch (TLI->getBooleanContents(IsVec, IsFP)) {
+ case TargetLoweringBase::ZeroOrNegativeOneBooleanContent:
+ return TargetOpcode::G_SEXT;
+ case TargetLoweringBase::ZeroOrOneBooleanContent:
+ return TargetOpcode::G_ZEXT;
+ default:
+ return TargetOpcode::G_ANYEXT;
+ }
+}
+
+MachineInstrBuilder MachineIRBuilder::buildBoolExt(const DstOp &Res,
+ const SrcOp &Op,
+ bool IsFP) {
+ unsigned ExtOp = getBoolExtOp(getMRI()->getType(Op.getReg()).isVector(), IsFP);
+ return buildInstr(ExtOp, Res, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildBoolExtInReg(const DstOp &Res,
+ const SrcOp &Op,
+ bool IsVector,
+ bool IsFP) {
+ const auto *TLI = getMF().getSubtarget().getTargetLowering();
+ switch (TLI->getBooleanContents(IsVector, IsFP)) {
+ case TargetLoweringBase::ZeroOrNegativeOneBooleanContent:
+ return buildSExtInReg(Res, Op, 1);
+ case TargetLoweringBase::ZeroOrOneBooleanContent:
+ return buildZExtInReg(Res, Op, 1);
+ case TargetLoweringBase::UndefinedBooleanContent:
+ return buildCopy(Res, Op);
+ }
+
+ llvm_unreachable("unexpected BooleanContent");
+}
+
+MachineInstrBuilder MachineIRBuilder::buildExtOrTrunc(unsigned ExtOpc,
+ const DstOp &Res,
+ const SrcOp &Op) {
+ assert((TargetOpcode::G_ANYEXT == ExtOpc || TargetOpcode::G_ZEXT == ExtOpc ||
+ TargetOpcode::G_SEXT == ExtOpc) &&
+ "Expecting Extending Opc");
+ assert(Res.getLLTTy(*getMRI()).isScalar() ||
+ Res.getLLTTy(*getMRI()).isVector());
+ assert(Res.getLLTTy(*getMRI()).isScalar() ==
+ Op.getLLTTy(*getMRI()).isScalar());
+
+ unsigned Opcode = TargetOpcode::COPY;
+ if (Res.getLLTTy(*getMRI()).getSizeInBits() >
+ Op.getLLTTy(*getMRI()).getSizeInBits())
+ Opcode = ExtOpc;
+ else if (Res.getLLTTy(*getMRI()).getSizeInBits() <
+ Op.getLLTTy(*getMRI()).getSizeInBits())
+ Opcode = TargetOpcode::G_TRUNC;
+ else
+ assert(Res.getLLTTy(*getMRI()) == Op.getLLTTy(*getMRI()));
+
+ return buildInstr(Opcode, Res, Op);
+}
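+
+// Illustrative behaviour sketch (assumed types): with an s64 destination,
+//   buildSExtOrTrunc(s64, s32 source)  -> emits G_SEXT  (widening)
+//   buildSExtOrTrunc(s64, s64 source)  -> emits COPY    (same size)
+//   buildSExtOrTrunc(s64, s128 source) -> emits G_TRUNC (narrowing)
+// The convenience wrappers just below only pick the extension opcode and leave
+// the size comparison to buildExtOrTrunc above.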
+
+MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(const DstOp &Res,
+ const SrcOp &Op) {
+ return buildExtOrTrunc(TargetOpcode::G_SEXT, Res, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(const DstOp &Res,
+ const SrcOp &Op) {
+ return buildExtOrTrunc(TargetOpcode::G_ZEXT, Res, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildAnyExtOrTrunc(const DstOp &Res,
+ const SrcOp &Op) {
+ return buildExtOrTrunc(TargetOpcode::G_ANYEXT, Res, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildZExtInReg(const DstOp &Res,
+ const SrcOp &Op,
+ int64_t ImmOp) {
+ LLT ResTy = Res.getLLTTy(*getMRI());
+ auto Mask = buildConstant(
+ ResTy, APInt::getLowBitsSet(ResTy.getScalarSizeInBits(), ImmOp));
+ return buildAnd(Res, Op, Mask);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildCast(const DstOp &Dst,
+ const SrcOp &Src) {
+ LLT SrcTy = Src.getLLTTy(*getMRI());
+ LLT DstTy = Dst.getLLTTy(*getMRI());
+ if (SrcTy == DstTy)
+ return buildCopy(Dst, Src);
+
+ unsigned Opcode;
+ if (SrcTy.isPointer() && DstTy.isScalar())
+ Opcode = TargetOpcode::G_PTRTOINT;
+ else if (DstTy.isPointer() && SrcTy.isScalar())
+ Opcode = TargetOpcode::G_INTTOPTR;
+ else {
+    assert(!SrcTy.isPointer() && !DstTy.isPointer() && "no G_ADDRCAST yet");
+ Opcode = TargetOpcode::G_BITCAST;
+ }
+
+ return buildInstr(Opcode, Dst, Src);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildExtract(const DstOp &Dst,
+ const SrcOp &Src,
+ uint64_t Index) {
+ LLT SrcTy = Src.getLLTTy(*getMRI());
+ LLT DstTy = Dst.getLLTTy(*getMRI());
+
+#ifndef NDEBUG
+ assert(SrcTy.isValid() && "invalid operand type");
+ assert(DstTy.isValid() && "invalid operand type");
+ assert(Index + DstTy.getSizeInBits() <= SrcTy.getSizeInBits() &&
+ "extracting off end of register");
+#endif
+
+ if (DstTy.getSizeInBits() == SrcTy.getSizeInBits()) {
+ assert(Index == 0 && "insertion past the end of a register");
+ return buildCast(Dst, Src);
+ }
+
+ auto Extract = buildInstr(TargetOpcode::G_EXTRACT);
+ Dst.addDefToMIB(*getMRI(), Extract);
+ Src.addSrcToMIB(Extract);
+ Extract.addImm(Index);
+ return Extract;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildUndef(const DstOp &Res) {
+ return buildInstr(TargetOpcode::G_IMPLICIT_DEF, {Res}, {});
+}
+
+MachineInstrBuilder MachineIRBuilder::buildMergeValues(const DstOp &Res,
+ ArrayRef<Register> Ops) {
+  // Unfortunately to convert from ArrayRef<Register> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the SrcOp objects. Here we use a
+ // sufficiently large SmallVector to not go through the heap.
+ SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+ assert(TmpVec.size() > 1);
+ return buildInstr(TargetOpcode::G_MERGE_VALUES, Res, TmpVec);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildMergeLikeInstr(const DstOp &Res,
+ ArrayRef<Register> Ops) {
+  // Unfortunately to convert from ArrayRef<Register> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the SrcOp objects. Here we use a
+ // sufficiently large SmallVector to not go through the heap.
+ SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+ assert(TmpVec.size() > 1);
+ return buildInstr(getOpcodeForMerge(Res, TmpVec), Res, TmpVec);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildMergeLikeInstr(const DstOp &Res,
+ std::initializer_list<SrcOp> Ops) {
+ assert(Ops.size() > 1);
+ return buildInstr(getOpcodeForMerge(Res, Ops), Res, Ops);
+}
+
+unsigned MachineIRBuilder::getOpcodeForMerge(const DstOp &DstOp,
+ ArrayRef<SrcOp> SrcOps) const {
+ if (DstOp.getLLTTy(*getMRI()).isVector()) {
+ if (SrcOps[0].getLLTTy(*getMRI()).isVector())
+ return TargetOpcode::G_CONCAT_VECTORS;
+ return TargetOpcode::G_BUILD_VECTOR;
+ }
+
+ return TargetOpcode::G_MERGE_VALUES;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<LLT> Res,
+ const SrcOp &Op) {
+ // Unfortunately to convert from ArrayRef<LLT> to ArrayRef<DstOp>,
+ // we need some temporary storage for the DstOp objects. Here we use a
+ // sufficiently large SmallVector to not go through the heap.
+ SmallVector<DstOp, 8> TmpVec(Res.begin(), Res.end());
+ assert(TmpVec.size() > 1);
+ return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildUnmerge(LLT Res,
+ const SrcOp &Op) {
+ unsigned NumReg = Op.getLLTTy(*getMRI()).getSizeInBits() / Res.getSizeInBits();
+ SmallVector<DstOp, 8> TmpVec(NumReg, Res);
+ return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<Register> Res,
+ const SrcOp &Op) {
+ // Unfortunately to convert from ArrayRef<Register> to ArrayRef<DstOp>,
+ // we need some temporary storage for the DstOp objects. Here we use a
+ // sufficiently large SmallVector to not go through the heap.
+ SmallVector<DstOp, 8> TmpVec(Res.begin(), Res.end());
+ assert(TmpVec.size() > 1);
+ return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildBuildVector(const DstOp &Res,
+ ArrayRef<Register> Ops) {
+ // Unfortunately to convert from ArrayRef<Register> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the SrcOp objects. Here we use a
+ // sufficiently large SmallVector to not go through the heap.
+ SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+ return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildBuildVectorConstant(const DstOp &Res,
+ ArrayRef<APInt> Ops) {
+ SmallVector<SrcOp> TmpVec;
+ TmpVec.reserve(Ops.size());
+ LLT EltTy = Res.getLLTTy(*getMRI()).getElementType();
+ for (const auto &Op : Ops)
+ TmpVec.push_back(buildConstant(EltTy, Op));
+ return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildSplatVector(const DstOp &Res,
+ const SrcOp &Src) {
+ SmallVector<SrcOp, 8> TmpVec(Res.getLLTTy(*getMRI()).getNumElements(), Src);
+ return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildBuildVectorTrunc(const DstOp &Res,
+ ArrayRef<Register> Ops) {
+ // Unfortunately to convert from ArrayRef<Register> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the SrcOp objects. Here we use a
+ // sufficiently large SmallVector to not go through the heap.
+ SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+ if (TmpVec[0].getLLTTy(*getMRI()).getSizeInBits() ==
+ Res.getLLTTy(*getMRI()).getElementType().getSizeInBits())
+ return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec);
+ return buildInstr(TargetOpcode::G_BUILD_VECTOR_TRUNC, Res, TmpVec);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildShuffleSplat(const DstOp &Res,
+ const SrcOp &Src) {
+ LLT DstTy = Res.getLLTTy(*getMRI());
+ assert(Src.getLLTTy(*getMRI()) == DstTy.getElementType() &&
+ "Expected Src to match Dst elt ty");
+ auto UndefVec = buildUndef(DstTy);
+ auto Zero = buildConstant(LLT::scalar(64), 0);
+ auto InsElt = buildInsertVectorElement(DstTy, UndefVec, Src, Zero);
+ SmallVector<int, 16> ZeroMask(DstTy.getNumElements());
+ return buildShuffleVector(DstTy, InsElt, UndefVec, ZeroMask);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildShuffleVector(const DstOp &Res,
+ const SrcOp &Src1,
+ const SrcOp &Src2,
+ ArrayRef<int> Mask) {
+ LLT DstTy = Res.getLLTTy(*getMRI());
+ LLT Src1Ty = Src1.getLLTTy(*getMRI());
+ LLT Src2Ty = Src2.getLLTTy(*getMRI());
+ assert((size_t)(Src1Ty.getNumElements() + Src2Ty.getNumElements()) >=
+ Mask.size());
+ assert(DstTy.getElementType() == Src1Ty.getElementType() &&
+ DstTy.getElementType() == Src2Ty.getElementType());
+ (void)DstTy;
+ (void)Src1Ty;
+ (void)Src2Ty;
+ ArrayRef<int> MaskAlloc = getMF().allocateShuffleMask(Mask);
+ return buildInstr(TargetOpcode::G_SHUFFLE_VECTOR, {Res}, {Src1, Src2})
+ .addShuffleMask(MaskAlloc);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildConcatVectors(const DstOp &Res, ArrayRef<Register> Ops) {
+ // Unfortunately to convert from ArrayRef<Register> to ArrayRef<SrcOp>,
+  // we need some temporary storage for the SrcOp objects. Here we use a
+ // sufficiently large SmallVector to not go through the heap.
+ SmallVector<SrcOp, 8> TmpVec(Ops.begin(), Ops.end());
+ return buildInstr(TargetOpcode::G_CONCAT_VECTORS, Res, TmpVec);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildInsert(const DstOp &Res,
+ const SrcOp &Src,
+ const SrcOp &Op,
+ unsigned Index) {
+ assert(Index + Op.getLLTTy(*getMRI()).getSizeInBits() <=
+ Res.getLLTTy(*getMRI()).getSizeInBits() &&
+ "insertion past the end of a register");
+
+ if (Res.getLLTTy(*getMRI()).getSizeInBits() ==
+ Op.getLLTTy(*getMRI()).getSizeInBits()) {
+ return buildCast(Res, Op);
+ }
+
+ return buildInstr(TargetOpcode::G_INSERT, Res, {Src, Op, uint64_t(Index)});
+}
+
+MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
+ ArrayRef<Register> ResultRegs,
+ bool HasSideEffects) {
+ auto MIB =
+ buildInstr(HasSideEffects ? TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS
+ : TargetOpcode::G_INTRINSIC);
+ for (unsigned ResultReg : ResultRegs)
+ MIB.addDef(ResultReg);
+ MIB.addIntrinsicID(ID);
+ return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
+ ArrayRef<DstOp> Results,
+ bool HasSideEffects) {
+ auto MIB =
+ buildInstr(HasSideEffects ? TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS
+ : TargetOpcode::G_INTRINSIC);
+ for (DstOp Result : Results)
+ Result.addDefToMIB(*getMRI(), MIB);
+ MIB.addIntrinsicID(ID);
+ return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildTrunc(const DstOp &Res,
+ const SrcOp &Op) {
+ return buildInstr(TargetOpcode::G_TRUNC, Res, Op);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildFPTrunc(const DstOp &Res, const SrcOp &Op,
+ std::optional<unsigned> Flags) {
+ return buildInstr(TargetOpcode::G_FPTRUNC, Res, Op, Flags);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildICmp(CmpInst::Predicate Pred,
+ const DstOp &Res,
+ const SrcOp &Op0,
+ const SrcOp &Op1) {
+ return buildInstr(TargetOpcode::G_ICMP, Res, {Pred, Op0, Op1});
+}
+
+MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred,
+ const DstOp &Res,
+ const SrcOp &Op0,
+ const SrcOp &Op1,
+ std::optional<unsigned> Flags) {
+
+ return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}, Flags);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildSelect(const DstOp &Res, const SrcOp &Tst,
+ const SrcOp &Op0, const SrcOp &Op1,
+ std::optional<unsigned> Flags) {
+
+ return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}, Flags);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildInsertVectorElement(const DstOp &Res, const SrcOp &Val,
+ const SrcOp &Elt, const SrcOp &Idx) {
+ return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT, Res, {Val, Elt, Idx});
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildExtractVectorElement(const DstOp &Res, const SrcOp &Val,
+ const SrcOp &Idx) {
+ return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT, Res, {Val, Idx});
+}
+
+MachineInstrBuilder MachineIRBuilder::buildAtomicCmpXchgWithSuccess(
+ Register OldValRes, Register SuccessRes, Register Addr, Register CmpVal,
+ Register NewVal, MachineMemOperand &MMO) {
+#ifndef NDEBUG
+ LLT OldValResTy = getMRI()->getType(OldValRes);
+ LLT SuccessResTy = getMRI()->getType(SuccessRes);
+ LLT AddrTy = getMRI()->getType(Addr);
+ LLT CmpValTy = getMRI()->getType(CmpVal);
+ LLT NewValTy = getMRI()->getType(NewVal);
+ assert(OldValResTy.isScalar() && "invalid operand type");
+ assert(SuccessResTy.isScalar() && "invalid operand type");
+ assert(AddrTy.isPointer() && "invalid operand type");
+ assert(CmpValTy.isValid() && "invalid operand type");
+ assert(NewValTy.isValid() && "invalid operand type");
+ assert(OldValResTy == CmpValTy && "type mismatch");
+ assert(OldValResTy == NewValTy && "type mismatch");
+#endif
+
+ return buildInstr(TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS)
+ .addDef(OldValRes)
+ .addDef(SuccessRes)
+ .addUse(Addr)
+ .addUse(CmpVal)
+ .addUse(NewVal)
+ .addMemOperand(&MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicCmpXchg(Register OldValRes, Register Addr,
+ Register CmpVal, Register NewVal,
+ MachineMemOperand &MMO) {
+#ifndef NDEBUG
+ LLT OldValResTy = getMRI()->getType(OldValRes);
+ LLT AddrTy = getMRI()->getType(Addr);
+ LLT CmpValTy = getMRI()->getType(CmpVal);
+ LLT NewValTy = getMRI()->getType(NewVal);
+ assert(OldValResTy.isScalar() && "invalid operand type");
+ assert(AddrTy.isPointer() && "invalid operand type");
+ assert(CmpValTy.isValid() && "invalid operand type");
+ assert(NewValTy.isValid() && "invalid operand type");
+ assert(OldValResTy == CmpValTy && "type mismatch");
+ assert(OldValResTy == NewValTy && "type mismatch");
+#endif
+
+ return buildInstr(TargetOpcode::G_ATOMIC_CMPXCHG)
+ .addDef(OldValRes)
+ .addUse(Addr)
+ .addUse(CmpVal)
+ .addUse(NewVal)
+ .addMemOperand(&MMO);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildAtomicRMW(
+ unsigned Opcode, const DstOp &OldValRes,
+ const SrcOp &Addr, const SrcOp &Val,
+ MachineMemOperand &MMO) {
+
+#ifndef NDEBUG
+ LLT OldValResTy = OldValRes.getLLTTy(*getMRI());
+ LLT AddrTy = Addr.getLLTTy(*getMRI());
+ LLT ValTy = Val.getLLTTy(*getMRI());
+ assert(OldValResTy.isScalar() && "invalid operand type");
+ assert(AddrTy.isPointer() && "invalid operand type");
+ assert(ValTy.isValid() && "invalid operand type");
+ assert(OldValResTy == ValTy && "type mismatch");
+ assert(MMO.isAtomic() && "not atomic mem operand");
+#endif
+
+ auto MIB = buildInstr(Opcode);
+ OldValRes.addDefToMIB(*getMRI(), MIB);
+ Addr.addSrcToMIB(MIB);
+ Val.addSrcToMIB(MIB);
+ MIB.addMemOperand(&MMO);
+ return MIB;
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWXchg(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_XCHG, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWAdd(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_ADD, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWSub(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_SUB, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWAnd(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_AND, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWNand(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_NAND, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder MachineIRBuilder::buildAtomicRMWOr(Register OldValRes,
+ Register Addr,
+ Register Val,
+ MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_OR, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWXor(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_XOR, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWMax(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_MAX, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWMin(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_MIN, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWUmax(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_UMAX, OldValRes, Addr, Val,
+ MMO);
+}
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWUmin(Register OldValRes, Register Addr,
+ Register Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_UMIN, OldValRes, Addr, Val,
+ MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWFAdd(
+ const DstOp &OldValRes, const SrcOp &Addr, const SrcOp &Val,
+ MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FADD, OldValRes, Addr, Val,
+ MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWFSub(const DstOp &OldValRes, const SrcOp &Addr, const SrcOp &Val,
+ MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FSUB, OldValRes, Addr, Val,
+ MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWFMax(const DstOp &OldValRes, const SrcOp &Addr,
+ const SrcOp &Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FMAX, OldValRes, Addr, Val,
+ MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWFMin(const DstOp &OldValRes, const SrcOp &Addr,
+ const SrcOp &Val, MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FMIN, OldValRes, Addr, Val,
+ MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildFence(unsigned Ordering, unsigned Scope) {
+ return buildInstr(TargetOpcode::G_FENCE)
+ .addImm(Ordering)
+ .addImm(Scope);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildBlockAddress(Register Res, const BlockAddress *BA) {
+#ifndef NDEBUG
+ assert(getMRI()->getType(Res).isPointer() && "invalid res type");
+#endif
+
+ return buildInstr(TargetOpcode::G_BLOCK_ADDR).addDef(Res).addBlockAddress(BA);
+}
+
+void MachineIRBuilder::validateTruncExt(const LLT DstTy, const LLT SrcTy,
+ bool IsExtend) {
+#ifndef NDEBUG
+ if (DstTy.isVector()) {
+ assert(SrcTy.isVector() && "mismatched cast between vector and non-vector");
+ assert(SrcTy.getNumElements() == DstTy.getNumElements() &&
+ "different number of elements in a trunc/ext");
+ } else
+ assert(DstTy.isScalar() && SrcTy.isScalar() && "invalid extend/trunc");
+
+ if (IsExtend)
+ assert(DstTy.getSizeInBits() > SrcTy.getSizeInBits() &&
+ "invalid narrowing extend");
+ else
+ assert(DstTy.getSizeInBits() < SrcTy.getSizeInBits() &&
+ "invalid widening trunc");
+#endif
+}
+
+void MachineIRBuilder::validateSelectOp(const LLT ResTy, const LLT TstTy,
+ const LLT Op0Ty, const LLT Op1Ty) {
+#ifndef NDEBUG
+ assert((ResTy.isScalar() || ResTy.isVector() || ResTy.isPointer()) &&
+ "invalid operand type");
+ assert((ResTy == Op0Ty && ResTy == Op1Ty) && "type mismatch");
+ if (ResTy.isScalar() || ResTy.isPointer())
+ assert(TstTy.isScalar() && "type mismatch");
+ else
+ assert((TstTy.isScalar() ||
+ (TstTy.isVector() &&
+ TstTy.getNumElements() == Op0Ty.getNumElements())) &&
+ "type mismatch");
+#endif
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildInstr(unsigned Opc, ArrayRef<DstOp> DstOps,
+ ArrayRef<SrcOp> SrcOps,
+ std::optional<unsigned> Flags) {
+ switch (Opc) {
+ default:
+ break;
+ case TargetOpcode::G_SELECT: {
+ assert(DstOps.size() == 1 && "Invalid select");
+ assert(SrcOps.size() == 3 && "Invalid select");
+ validateSelectOp(
+ DstOps[0].getLLTTy(*getMRI()), SrcOps[0].getLLTTy(*getMRI()),
+ SrcOps[1].getLLTTy(*getMRI()), SrcOps[2].getLLTTy(*getMRI()));
+ break;
+ }
+ case TargetOpcode::G_FNEG:
+ case TargetOpcode::G_ABS:
+ // All these are unary ops.
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ assert(SrcOps.size() == 1 && "Invalid Srcs");
+ validateUnaryOp(DstOps[0].getLLTTy(*getMRI()),
+ SrcOps[0].getLLTTy(*getMRI()));
+ break;
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_MUL:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_XOR:
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UREM:
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SMAX:
+ case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UMAX:
+ case TargetOpcode::G_UADDSAT:
+ case TargetOpcode::G_SADDSAT:
+ case TargetOpcode::G_USUBSAT:
+ case TargetOpcode::G_SSUBSAT: {
+ // All these are binary ops.
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ assert(SrcOps.size() == 2 && "Invalid Srcs");
+ validateBinaryOp(DstOps[0].getLLTTy(*getMRI()),
+ SrcOps[0].getLLTTy(*getMRI()),
+ SrcOps[1].getLLTTy(*getMRI()));
+ break;
+ }
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_USHLSAT:
+ case TargetOpcode::G_SSHLSAT: {
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ assert(SrcOps.size() == 2 && "Invalid Srcs");
+ validateShiftOp(DstOps[0].getLLTTy(*getMRI()),
+ SrcOps[0].getLLTTy(*getMRI()),
+ SrcOps[1].getLLTTy(*getMRI()));
+ break;
+ }
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ assert(SrcOps.size() == 1 && "Invalid Srcs");
+ validateTruncExt(DstOps[0].getLLTTy(*getMRI()),
+ SrcOps[0].getLLTTy(*getMRI()), true);
+ break;
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_FPTRUNC: {
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ assert(SrcOps.size() == 1 && "Invalid Srcs");
+ validateTruncExt(DstOps[0].getLLTTy(*getMRI()),
+ SrcOps[0].getLLTTy(*getMRI()), false);
+ break;
+ }
+ case TargetOpcode::G_BITCAST: {
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ assert(SrcOps.size() == 1 && "Invalid Srcs");
+ assert(DstOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+ SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() && "invalid bitcast");
+ break;
+ }
+ case TargetOpcode::COPY:
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ // If the caller wants to add a subreg source it has to be done separately
+ // so we may not have any SrcOps at this point yet.
+ break;
+ case TargetOpcode::G_FCMP:
+ case TargetOpcode::G_ICMP: {
+ assert(DstOps.size() == 1 && "Invalid Dst Operands");
+ assert(SrcOps.size() == 3 && "Invalid Src Operands");
+ // For F/ICMP, the first src operand is the predicate, followed by
+ // the two comparands.
+ assert(SrcOps[0].getSrcOpKind() == SrcOp::SrcType::Ty_Predicate &&
+ "Expecting predicate");
+ assert([&]() -> bool {
+ CmpInst::Predicate Pred = SrcOps[0].getPredicate();
+ return Opc == TargetOpcode::G_ICMP ? CmpInst::isIntPredicate(Pred)
+ : CmpInst::isFPPredicate(Pred);
+ }() && "Invalid predicate");
+ assert(SrcOps[1].getLLTTy(*getMRI()) == SrcOps[2].getLLTTy(*getMRI()) &&
+ "Type mismatch");
+ assert([&]() -> bool {
+ LLT Op0Ty = SrcOps[1].getLLTTy(*getMRI());
+ LLT DstTy = DstOps[0].getLLTTy(*getMRI());
+ if (Op0Ty.isScalar() || Op0Ty.isPointer())
+ return DstTy.isScalar();
+ else
+ return DstTy.isVector() &&
+ DstTy.getNumElements() == Op0Ty.getNumElements();
+ }() && "Type Mismatch");
+ break;
+ }
+ case TargetOpcode::G_UNMERGE_VALUES: {
+ assert(!DstOps.empty() && "Invalid trivial sequence");
+ assert(SrcOps.size() == 1 && "Invalid src for Unmerge");
+ assert(llvm::all_of(DstOps,
+ [&, this](const DstOp &Op) {
+ return Op.getLLTTy(*getMRI()) ==
+ DstOps[0].getLLTTy(*getMRI());
+ }) &&
+ "type mismatch in output list");
+ assert((TypeSize::ScalarTy)DstOps.size() *
+ DstOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+ SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() &&
+ "input operands do not cover output register");
+ break;
+ }
+ case TargetOpcode::G_MERGE_VALUES: {
+ assert(SrcOps.size() >= 2 && "invalid trivial sequence");
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ assert(llvm::all_of(SrcOps,
+ [&, this](const SrcOp &Op) {
+ return Op.getLLTTy(*getMRI()) ==
+ SrcOps[0].getLLTTy(*getMRI());
+ }) &&
+ "type mismatch in input list");
+ assert((TypeSize::ScalarTy)SrcOps.size() *
+ SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+ DstOps[0].getLLTTy(*getMRI()).getSizeInBits() &&
+ "input operands do not cover output register");
+ assert(!DstOps[0].getLLTTy(*getMRI()).isVector() &&
+ "vectors should be built with G_CONCAT_VECTORS or G_BUILD_VECTOR");
+ break;
+ }
+ case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+ assert(DstOps.size() == 1 && "Invalid Dst size");
+ assert(SrcOps.size() == 2 && "Invalid Src size");
+ assert(SrcOps[0].getLLTTy(*getMRI()).isVector() && "Invalid operand type");
+ assert((DstOps[0].getLLTTy(*getMRI()).isScalar() ||
+ DstOps[0].getLLTTy(*getMRI()).isPointer()) &&
+ "Invalid operand type");
+ assert(SrcOps[1].getLLTTy(*getMRI()).isScalar() && "Invalid operand type");
+ assert(SrcOps[0].getLLTTy(*getMRI()).getElementType() ==
+ DstOps[0].getLLTTy(*getMRI()) &&
+ "Type mismatch");
+ break;
+ }
+ case TargetOpcode::G_INSERT_VECTOR_ELT: {
+ assert(DstOps.size() == 1 && "Invalid dst size");
+ assert(SrcOps.size() == 3 && "Invalid src size");
+ assert(DstOps[0].getLLTTy(*getMRI()).isVector() &&
+ SrcOps[0].getLLTTy(*getMRI()).isVector() && "Invalid operand type");
+ assert(DstOps[0].getLLTTy(*getMRI()).getElementType() ==
+ SrcOps[1].getLLTTy(*getMRI()) &&
+ "Type mismatch");
+ assert(SrcOps[2].getLLTTy(*getMRI()).isScalar() && "Invalid index");
+ assert(DstOps[0].getLLTTy(*getMRI()).getNumElements() ==
+ SrcOps[0].getLLTTy(*getMRI()).getNumElements() &&
+ "Type mismatch");
+ break;
+ }
+ case TargetOpcode::G_BUILD_VECTOR: {
+ assert(SrcOps.size() >= 2 && "Must have at least 2 operands");
+ assert(DstOps.size() == 1 && "Invalid DstOps");
+ assert(DstOps[0].getLLTTy(*getMRI()).isVector() &&
+ "Res type must be a vector");
+ assert(llvm::all_of(SrcOps,
+ [&, this](const SrcOp &Op) {
+ return Op.getLLTTy(*getMRI()) ==
+ SrcOps[0].getLLTTy(*getMRI());
+ }) &&
+ "type mismatch in input list");
+ assert((TypeSize::ScalarTy)SrcOps.size() *
+ SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+ DstOps[0].getLLTTy(*getMRI()).getSizeInBits() &&
+ "input scalars do not exactly cover the output vector register");
+ break;
+ }
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC: {
+ assert(SrcOps.size() >= 2 && "Must have at least 2 operands");
+ assert(DstOps.size() == 1 && "Invalid DstOps");
+ assert(DstOps[0].getLLTTy(*getMRI()).isVector() &&
+ "Res type must be a vector");
+ assert(llvm::all_of(SrcOps,
+ [&, this](const SrcOp &Op) {
+ return Op.getLLTTy(*getMRI()) ==
+ SrcOps[0].getLLTTy(*getMRI());
+ }) &&
+ "type mismatch in input list");
+ break;
+ }
+ case TargetOpcode::G_CONCAT_VECTORS: {
+ assert(DstOps.size() == 1 && "Invalid DstOps");
+ assert(SrcOps.size() >= 2 && "Must have at least 2 operands");
+ assert(llvm::all_of(SrcOps,
+ [&, this](const SrcOp &Op) {
+ return (Op.getLLTTy(*getMRI()).isVector() &&
+ Op.getLLTTy(*getMRI()) ==
+ SrcOps[0].getLLTTy(*getMRI()));
+ }) &&
+ "type mismatch in input list");
+ assert((TypeSize::ScalarTy)SrcOps.size() *
+ SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+ DstOps[0].getLLTTy(*getMRI()).getSizeInBits() &&
+ "input vectors do not exactly cover the output vector register");
+ break;
+ }
+ case TargetOpcode::G_UADDE: {
+ assert(DstOps.size() == 2 && "Invalid no of dst operands");
+ assert(SrcOps.size() == 3 && "Invalid no of src operands");
+ assert(DstOps[0].getLLTTy(*getMRI()).isScalar() && "Invalid operand");
+ assert((DstOps[0].getLLTTy(*getMRI()) == SrcOps[0].getLLTTy(*getMRI())) &&
+ (DstOps[0].getLLTTy(*getMRI()) == SrcOps[1].getLLTTy(*getMRI())) &&
+ "Invalid operand");
+ assert(DstOps[1].getLLTTy(*getMRI()).isScalar() && "Invalid operand");
+ assert(DstOps[1].getLLTTy(*getMRI()) == SrcOps[2].getLLTTy(*getMRI()) &&
+ "type mismatch");
+ break;
+ }
+ }
+
+ auto MIB = buildInstr(Opc);
+ for (const DstOp &Op : DstOps)
+ Op.addDefToMIB(*getMRI(), MIB);
+ for (const SrcOp &Op : SrcOps)
+ Op.addSrcToMIB(MIB);
+ if (Flags)
+ MIB->setFlags(*Flags);
+ return MIB;
+}
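+
+// Minimal usage sketch for the validating buildInstr overload above
+// (illustrative; Op0 and Op1 stand for two already-created s32 virtual
+// registers in the current function):
+//   auto Sum = MIRBuilder.buildInstr(TargetOpcode::G_ADD, {LLT::scalar(32)},
+//                                    {Op0, Op1}, MachineInstr::NoSWrap);
+//   Register SumReg = Sum.getReg(0);
+// The DstOp/SrcOp lists are run through the per-opcode asserts above before
+// being appended to the freshly created instruction.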
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
new file mode 100644
index 000000000000..885a1056b2ea
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -0,0 +1,1110 @@
+//==- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect --*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegBankSelect class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBank.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+
+#define DEBUG_TYPE "regbankselect"
+
+using namespace llvm;
+
+static cl::opt<RegBankSelect::Mode> RegBankSelectMode(
+ cl::desc("Mode of the RegBankSelect pass"), cl::Hidden, cl::Optional,
+ cl::values(clEnumValN(RegBankSelect::Mode::Fast, "regbankselect-fast",
+ "Run the Fast mode (default mapping)"),
+ clEnumValN(RegBankSelect::Mode::Greedy, "regbankselect-greedy",
+ "Use the Greedy mode (best local mapping)")));
+
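+// As a usage sketch (assuming a GlobalISel-capable target such as AArch64),
+// either mode can be forced from the command line:
+//   llc -global-isel -regbankselect-greedy foo.ll -o foo.s
+// When neither flag is given, the mode passed to the constructor below wins.
+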
+char RegBankSelect::ID = 0;
+
+INITIALIZE_PASS_BEGIN(RegBankSelect, DEBUG_TYPE,
+ "Assign register bank of generic virtual registers",
+ false, false);
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE,
+ "Assign register bank of generic virtual registers", false,
+ false)
+
+RegBankSelect::RegBankSelect(char &PassID, Mode RunningMode)
+ : MachineFunctionPass(PassID), OptMode(RunningMode) {
+ if (RegBankSelectMode.getNumOccurrences() != 0) {
+ OptMode = RegBankSelectMode;
+ if (RegBankSelectMode != RunningMode)
+ LLVM_DEBUG(dbgs() << "RegBankSelect mode overridden by command line\n");
+ }
+}
+
+void RegBankSelect::init(MachineFunction &MF) {
+ RBI = MF.getSubtarget().getRegBankInfo();
+ assert(RBI && "Cannot work without RegisterBankInfo");
+ MRI = &MF.getRegInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TPC = &getAnalysis<TargetPassConfig>();
+ if (OptMode != Mode::Fast) {
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ } else {
+ MBFI = nullptr;
+ MBPI = nullptr;
+ }
+ MIRBuilder.setMF(MF);
+ MORE = std::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
+}
+
+void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
+ if (OptMode != Mode::Fast) {
+ // We could preserve the information from these two analyses but
+ // the APIs do not allow us to do so yet.
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ }
+ AU.addRequired<TargetPassConfig>();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool RegBankSelect::assignmentMatch(
+ Register Reg, const RegisterBankInfo::ValueMapping &ValMapping,
+ bool &OnlyAssign) const {
+ // By default we assume we will have to repair something.
+ OnlyAssign = false;
+ // Each part of a breakdown needs to end up in a different register.
+ // In other words, Reg's assignment does not match.
+ if (ValMapping.NumBreakDowns != 1)
+ return false;
+
+ const RegisterBank *CurRegBank = RBI->getRegBank(Reg, *MRI, *TRI);
+ const RegisterBank *DesiredRegBank = ValMapping.BreakDown[0].RegBank;
+ // Reg has no assignment yet; a simple assignment will make the
+ // register bank match.
+ OnlyAssign = CurRegBank == nullptr;
+ LLVM_DEBUG(dbgs() << "Does assignment already match: ";
+ if (CurRegBank) dbgs() << *CurRegBank; else dbgs() << "none";
+ dbgs() << " against ";
+ assert(DesiredRegBank && "The mapping must be valid");
+ dbgs() << *DesiredRegBank << '\n';);
+ return CurRegBank == DesiredRegBank;
+}
+
+bool RegBankSelect::repairReg(
+ MachineOperand &MO, const RegisterBankInfo::ValueMapping &ValMapping,
+ RegBankSelect::RepairingPlacement &RepairPt,
+ const iterator_range<SmallVectorImpl<Register>::const_iterator> &NewVRegs) {
+
+ assert(ValMapping.NumBreakDowns == (unsigned)size(NewVRegs) &&
+ "need new vreg for each breakdown");
+
+ // An empty range of new registers means no repairing.
+ assert(!NewVRegs.empty() && "We should not have to repair");
+
+ MachineInstr *MI;
+ if (ValMapping.NumBreakDowns == 1) {
+ // Assume we are repairing a use and thus, the original reg will be
+ // the source of the repairing.
+ Register Src = MO.getReg();
+ Register Dst = *NewVRegs.begin();
+
+ // If we repair a definition, swap the source and destination for
+ // the repairing.
+ if (MO.isDef())
+ std::swap(Src, Dst);
+
+ assert((RepairPt.getNumInsertPoints() == 1 || Dst.isPhysical()) &&
+ "We are about to create several defs for Dst");
+
+ // Build the instruction used to repair, then clone it at the right
+ // places. Avoiding buildCopy bypasses the check that Src and Dst have the
+ // same types because the type is a placeholder when this function is called.
+ MI = MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY)
+ .addDef(Dst)
+ .addUse(Src);
+ LLVM_DEBUG(dbgs() << "Copy: " << printReg(Src) << ':'
+ << printRegClassOrBank(Src, *MRI, TRI)
+ << " to: " << printReg(Dst) << ':'
+ << printRegClassOrBank(Dst, *MRI, TRI) << '\n');
+ } else {
+ // TODO: Support with G_IMPLICIT_DEF + G_INSERT sequence or G_EXTRACT
+ // sequence.
+ assert(ValMapping.partsAllUniform() && "irregular breakdowns not supported");
+
+ LLT RegTy = MRI->getType(MO.getReg());
+ if (MO.isDef()) {
+ unsigned MergeOp;
+ if (RegTy.isVector()) {
+ if (ValMapping.NumBreakDowns == RegTy.getNumElements())
+ MergeOp = TargetOpcode::G_BUILD_VECTOR;
+ else {
+ assert(
+ (ValMapping.BreakDown[0].Length * ValMapping.NumBreakDowns ==
+ RegTy.getSizeInBits()) &&
+ (ValMapping.BreakDown[0].Length % RegTy.getScalarSizeInBits() ==
+ 0) &&
+ "don't understand this value breakdown");
+
+ MergeOp = TargetOpcode::G_CONCAT_VECTORS;
+ }
+ } else
+ MergeOp = TargetOpcode::G_MERGE_VALUES;
+
+ auto MergeBuilder =
+ MIRBuilder.buildInstrNoInsert(MergeOp)
+ .addDef(MO.getReg());
+
+ for (Register SrcReg : NewVRegs)
+ MergeBuilder.addUse(SrcReg);
+
+ MI = MergeBuilder;
+ } else {
+ MachineInstrBuilder UnMergeBuilder =
+ MIRBuilder.buildInstrNoInsert(TargetOpcode::G_UNMERGE_VALUES);
+ for (Register DefReg : NewVRegs)
+ UnMergeBuilder.addDef(DefReg);
+
+ UnMergeBuilder.addUse(MO.getReg());
+ MI = UnMergeBuilder;
+ }
+ }
+
+ if (RepairPt.getNumInsertPoints() != 1)
+ report_fatal_error("need testcase to support multiple insertion points");
+
+ // TODO:
+ // Check if MI is legal. If not, we need to legalize all the
+ // instructions we are going to insert.
+ std::unique_ptr<MachineInstr *[]> NewInstrs(
+ new MachineInstr *[RepairPt.getNumInsertPoints()]);
+ bool IsFirst = true;
+ unsigned Idx = 0;
+ for (const std::unique_ptr<InsertPoint> &InsertPt : RepairPt) {
+ MachineInstr *CurMI;
+ if (IsFirst)
+ CurMI = MI;
+ else
+ CurMI = MIRBuilder.getMF().CloneMachineInstr(MI);
+ InsertPt->insert(*CurMI);
+ NewInstrs[Idx++] = CurMI;
+ IsFirst = false;
+ }
+ // TODO:
+ // Legalize NewInstrs if need be.
+ return true;
+}
+
+uint64_t RegBankSelect::getRepairCost(
+ const MachineOperand &MO,
+ const RegisterBankInfo::ValueMapping &ValMapping) const {
+ assert(MO.isReg() && "We should only repair register operand");
+ assert(ValMapping.NumBreakDowns && "Nothing to map??");
+
+ bool IsSameNumOfValues = ValMapping.NumBreakDowns == 1;
+ const RegisterBank *CurRegBank = RBI->getRegBank(MO.getReg(), *MRI, *TRI);
+ // If MO does not have a register bank, we should have just been
+ // able to set one unless we have to break the value down.
+ assert(CurRegBank || MO.isDef());
+
+ // Def: Val <- NewDefs
+ // Same number of values: copy
+ // Different number: Val = build_sequence Defs1, Defs2, ...
+ // Use: NewSources <- Val.
+ // Same number of values: copy.
+ // Different number: Src1, Src2, ... =
+ // extract_value Val, Src1Begin, Src1Len, Src2Begin, Src2Len, ...
+ // We should remember that this value is available somewhere else to
+ // coalesce the value.
+
+ if (ValMapping.NumBreakDowns != 1)
+ return RBI->getBreakDownCost(ValMapping, CurRegBank);
+
+ if (IsSameNumOfValues) {
+ const RegisterBank *DesiredRegBank = ValMapping.BreakDown[0].RegBank;
+ // If we repair a definition, swap the source and destination for
+ // the repairing.
+ if (MO.isDef())
+ std::swap(CurRegBank, DesiredRegBank);
+ // TODO: It may be possible to actually avoid the copy.
+ // If we repair something where the source is defined by a copy
+ // and the source of that copy is on the right bank, we can reuse
+ // it for free.
+ // E.g.,
+ // RegToRepair<BankA> = copy AlternativeSrc<BankB>
+ // = op RegToRepair<BankA>
+ // We can simply propagate AlternativeSrc instead of copying RegToRepair
+ // into a new virtual register.
+ // We would also need to propagate this information in the
+ // repairing placement.
+ unsigned Cost = RBI->copyCost(*DesiredRegBank, *CurRegBank,
+ RBI->getSizeInBits(MO.getReg(), *MRI, *TRI));
+ // TODO: use a dedicated constant for ImpossibleCost.
+ if (Cost != std::numeric_limits<unsigned>::max())
+ return Cost;
+ // Return the legalization cost of that repairing.
+ }
+ return std::numeric_limits<unsigned>::max();
+}
+
+const RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping(
+ MachineInstr &MI, RegisterBankInfo::InstructionMappings &PossibleMappings,
+ SmallVectorImpl<RepairingPlacement> &RepairPts) {
+ assert(!PossibleMappings.empty() &&
+ "Do not know how to map this instruction");
+
+ const RegisterBankInfo::InstructionMapping *BestMapping = nullptr;
+ MappingCost Cost = MappingCost::ImpossibleCost();
+ SmallVector<RepairingPlacement, 4> LocalRepairPts;
+ for (const RegisterBankInfo::InstructionMapping *CurMapping :
+ PossibleMappings) {
+ MappingCost CurCost =
+ computeMapping(MI, *CurMapping, LocalRepairPts, &Cost);
+ if (CurCost < Cost) {
+ LLVM_DEBUG(dbgs() << "New best: " << CurCost << '\n');
+ Cost = CurCost;
+ BestMapping = CurMapping;
+ RepairPts.clear();
+ for (RepairingPlacement &RepairPt : LocalRepairPts)
+ RepairPts.emplace_back(std::move(RepairPt));
+ }
+ }
+ if (!BestMapping && !TPC->isGlobalISelAbortEnabled()) {
+ // If none of the mappings worked, that means they are all impossible.
+ // Thus, pick the first one and set an impossible repairing point.
+ // It will trigger the failed isel mode.
+ BestMapping = *PossibleMappings.begin();
+ RepairPts.emplace_back(
+ RepairingPlacement(MI, 0, *TRI, *this, RepairingPlacement::Impossible));
+ } else
+ assert(BestMapping && "No suitable mapping for instruction");
+ return *BestMapping;
+}
+
+void RegBankSelect::tryAvoidingSplit(
+ RegBankSelect::RepairingPlacement &RepairPt, const MachineOperand &MO,
+ const RegisterBankInfo::ValueMapping &ValMapping) const {
+ const MachineInstr &MI = *MO.getParent();
+ assert(RepairPt.hasSplit() && "We should not have to adjust for split");
+ // Splitting should only occur for PHIs or between terminators,
+ // because we only do local repairing.
+ assert((MI.isPHI() || MI.isTerminator()) && "Why do we split?");
+
+ assert(&MI.getOperand(RepairPt.getOpIdx()) == &MO &&
+ "Repairing placement does not match operand");
+
+ // If we need splitting for phis, that means it is because we
+ // could not find an insertion point before the terminators of
+ // the predecessor block for this argument. In other words,
+ // the input value is defined by one of the terminators.
+ assert((!MI.isPHI() || !MO.isDef()) && "Need split for phi def?");
+
+ // We split to repair the use of a phi or a terminator.
+ if (!MO.isDef()) {
+ if (MI.isTerminator()) {
+ assert(&MI != &(*MI.getParent()->getFirstTerminator()) &&
+ "Need to split for the first terminator?!");
+ } else {
+ // For the PHI case, the split may not actually be required.
+ // In the copy case, a phi is already a copy on the incoming edge,
+ // therefore there is no need to split.
+ if (ValMapping.NumBreakDowns == 1)
+ // This is already a copy; there is nothing to do.
+ RepairPt.switchTo(RepairingPlacement::RepairingKind::Reassign);
+ }
+ return;
+ }
+
+ // At this point, we need to repair a definition of a terminator.
+
+ // Technically we need to fix the def of MI on all outgoing
+ // edges of MI to keep the repairing local. In other words, we
+ // will create several definitions of the same register. This
+ // does not work for SSA unless that definition is a physical
+ // register.
+ // However, there are other cases where we can get away with
+ // that while still keeping the repairing local.
+ assert(MI.isTerminator() && MO.isDef() &&
+ "This code is for the def of a terminator");
+
+ // Since we use RPO traversal, if we need to repair a definition
+ // this means this definition could be:
+ // 1. Used by PHIs (i.e., this VReg has been visited as part of the
+ // uses of a phi.), or
+ // 2. Part of a target specific instruction (i.e., the target applied
+ // some register class constraints when creating the instruction.)
+ // If the constraints come from #2, the target said that another mapping
+ // is supported so we may just drop them. Indeed, if we do not change
+ // the number of registers holding that value, the uses will get fixed
+ // when we get to them.
+ // Uses in PHIs may have already been processed, though.
+ // If the constraints come from #1, then those are weak constraints and
+ // no actual uses may rely on them. However, the problem remains mainly
+ // the same as for #2. If the value stays in one register, we could
+ // just switch the register bank of the definition, but we would need to
+ // account for a repairing cost for each phi we silently change.
+ //
+ // In any case, if the value needs to be broken down into several
+ // registers, the repairing is not local anymore as we need to patch
+ // every uses to rebuild the value in just one register.
+ //
+ // To summarize:
+ // - If the value is in a physical register, we can do the split and
+ // fix locally.
+ // Otherwise if the value is in a virtual register:
+ // - If the value remains in one register, we do not have to split;
+ // just switching the register bank would do, but we need to account
+ // in the repairing cost for all the phis we changed.
+ // - If the value spans several registers, then we cannot do a local
+ // repairing.
+
+ // Check if this is a physical or virtual register.
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ // We are going to split every outgoing edges.
+ // Check that this is possible.
+ // FIXME: The machine representation is currently broken
+ // since it also allows several terminators in one basic block.
+ // Because of that we would technically need a way to get
+ // the targets of just one terminator to know which edges
+ // we have to split.
+ // Assert that we do not hit the ill-formed representation.
+
+ // If there are other terminators before that one, some of
+ // the outgoing edges may not be dominated by this definition.
+ assert(&MI == &(*MI.getParent()->getFirstTerminator()) &&
+ "Do not know which outgoing edges are relevant");
+ const MachineInstr *Next = MI.getNextNode();
+ assert((!Next || Next->isUnconditionalBranch()) &&
+ "Do not know where each terminator ends up");
+ if (Next)
+ // If the next terminator uses Reg, this means we have
+ // to split right after MI and thus we need a way to ask
+ // which outgoing edges are affected.
+ assert(!Next->readsRegister(Reg) && "Need to split between terminators");
+ // We will split all the edges and repair there.
+ } else {
+ // This is a virtual register defined by a terminator.
+ if (ValMapping.NumBreakDowns == 1) {
+ // There is nothing to repair, but we may actually be lying about
+ // the repairing cost because of the PHIs already processed,
+ // as stated above.
+ // The code will still be correct, though.
+ assert(false && "Repairing cost may not be accurate");
+ } else {
+ // We need to do non-local repairing. Basically, patch all
+ // the uses (i.e., phis) that we already processed.
+ // For now, just say this mapping is not possible.
+ RepairPt.switchTo(RepairingPlacement::RepairingKind::Impossible);
+ }
+ }
+}
+
+RegBankSelect::MappingCost RegBankSelect::computeMapping(
+ MachineInstr &MI, const RegisterBankInfo::InstructionMapping &InstrMapping,
+ SmallVectorImpl<RepairingPlacement> &RepairPts,
+ const RegBankSelect::MappingCost *BestCost) {
+ assert((MBFI || !BestCost) && "Costs comparison require MBFI");
+
+ if (!InstrMapping.isValid())
+ return MappingCost::ImpossibleCost();
+
+ // If mapped with InstrMapping, MI will have the recorded cost.
+ MappingCost Cost(MBFI ? MBFI->getBlockFreq(MI.getParent()) : 1);
+ bool Saturated = Cost.addLocalCost(InstrMapping.getCost());
+ assert(!Saturated && "Possible mapping saturated the cost");
+ LLVM_DEBUG(dbgs() << "Evaluating mapping cost for: " << MI);
+ LLVM_DEBUG(dbgs() << "With: " << InstrMapping << '\n');
+ RepairPts.clear();
+ if (BestCost && Cost > *BestCost) {
+ LLVM_DEBUG(dbgs() << "Mapping is too expensive from the start\n");
+ return Cost;
+ }
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+
+ // Moreover, to realize this mapping, the register bank of each operand must
+ // match this mapping. In other words, we may need to locally reassign the
+ // register banks. Account for that repairing cost as well.
+ // In this context, local means in the surrounding of MI.
+ for (unsigned OpIdx = 0, EndOpIdx = InstrMapping.getNumOperands();
+ OpIdx != EndOpIdx; ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ LLT Ty = MRI.getType(Reg);
+ if (!Ty.isValid())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Opd" << OpIdx << '\n');
+ const RegisterBankInfo::ValueMapping &ValMapping =
+ InstrMapping.getOperandMapping(OpIdx);
+ // If Reg is already properly mapped, this is free.
+ bool Assign;
+ if (assignmentMatch(Reg, ValMapping, Assign)) {
+ LLVM_DEBUG(dbgs() << "=> is free (match).\n");
+ continue;
+ }
+ if (Assign) {
+ LLVM_DEBUG(dbgs() << "=> is free (simple assignment).\n");
+ RepairPts.emplace_back(RepairingPlacement(MI, OpIdx, *TRI, *this,
+ RepairingPlacement::Reassign));
+ continue;
+ }
+
+ // Find the insertion point for the repairing code.
+ RepairPts.emplace_back(
+ RepairingPlacement(MI, OpIdx, *TRI, *this, RepairingPlacement::Insert));
+ RepairingPlacement &RepairPt = RepairPts.back();
+
+ // If we need to split a basic block to materialize this insertion point,
+ // we may give a higher cost to this mapping.
+ // Nevertheless, we may get away with the split, so try that first.
+ if (RepairPt.hasSplit())
+ tryAvoidingSplit(RepairPt, MO, ValMapping);
+
+ // Check that the materialization of the repairing is possible.
+ if (!RepairPt.canMaterialize()) {
+ LLVM_DEBUG(dbgs() << "Mapping involves impossible repairing\n");
+ return MappingCost::ImpossibleCost();
+ }
+
+ // Account for the split cost and repair cost.
+ // Unless the cost is already saturated or we do not care about the cost.
+ if (!BestCost || Saturated)
+ continue;
+
+ // To get accurate information we need MBFI and MBPI.
+ // Thus, if we end up here this information should be here.
+ assert(MBFI && MBPI && "Cost computation requires MBFI and MBPI");
+
+ // FIXME: We will have to rework the repairing cost model.
+ // The repairing cost depends on the register bank that MO has.
+ // However, when we break down the value into different values,
+ // MO may not have a register bank while still needing repairing.
+ // For the fast mode, we don't compute the cost so that is fine,
+ // but still for the repairing code, we will have to make a choice.
+ // For the greedy mode, we should choose greedily what is the best
+ // choice based on the next use of MO.
+
+ // Sums up the repairing cost of MO at each insertion point.
+ uint64_t RepairCost = getRepairCost(MO, ValMapping);
+
+ // This is an impossible to repair cost.
+ if (RepairCost == std::numeric_limits<unsigned>::max())
+ return MappingCost::ImpossibleCost();
+
+ // Bias used for splitting: 5%.
+ const uint64_t PercentageForBias = 5;
+ uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100;
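+ // Worked example with illustrative numbers: for RepairCost == 30 the bias
+ // is (30 * 5 + 99) / 100 == 2, i.e. the 5% is rounded up so that any
+ // non-zero repair cost pays a non-zero penalty at a split point.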
+ // We should not need more than a couple of instructions to repair
+ // an assignment. In other words, the computation should not
+ // overflow because the repairing cost is free of basic block
+ // frequency.
+ assert(((RepairCost < RepairCost * PercentageForBias) &&
+ (RepairCost * PercentageForBias <
+ RepairCost * PercentageForBias + 99)) &&
+ "Repairing involves more than a billion instructions?!");
+ for (const std::unique_ptr<InsertPoint> &InsertPt : RepairPt) {
+ assert(InsertPt->canMaterialize() && "We should not have made it here");
+ // We will apply some basic block frequency scaling, and that uses uint64_t.
+ if (!InsertPt->isSplit())
+ Saturated = Cost.addLocalCost(RepairCost);
+ else {
+ uint64_t CostForInsertPt = RepairCost;
+ // Again, we shouldn't overflow here given that
+ // CostForInsertPt is frequency free at this point.
+ assert(CostForInsertPt + Bias > CostForInsertPt &&
+ "Repairing + split bias overflows");
+ CostForInsertPt += Bias;
+ uint64_t PtCost = InsertPt->frequency(*this) * CostForInsertPt;
+ // Check if we just overflowed.
+ if ((Saturated = PtCost < CostForInsertPt))
+ Cost.saturate();
+ else
+ Saturated = Cost.addNonLocalCost(PtCost);
+ }
+
+ // Stop looking into what it takes to repair, this is already
+ // too expensive.
+ if (BestCost && Cost > *BestCost) {
+ LLVM_DEBUG(dbgs() << "Mapping is too expensive, stop processing\n");
+ return Cost;
+ }
+
+ // No need to accumulate more cost information.
+ // We need to still gather the repairing information though.
+ if (Saturated)
+ break;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Total cost is: " << Cost << "\n");
+ return Cost;
+}
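+
+// Informal summary of the cost model above (a reading of the code, kept as a
+// sketch rather than a specification):
+//   cost ~= BlockFreq(MI) * (mapping cost + local repairing cost)
+//           + sum over split points of InsertPointFreq * (repair cost + 5% bias)
+// with saturation instead of overflow at every step.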
+
+bool RegBankSelect::applyMapping(
+ MachineInstr &MI, const RegisterBankInfo::InstructionMapping &InstrMapping,
+ SmallVectorImpl<RegBankSelect::RepairingPlacement> &RepairPts) {
+ // OpdMapper will hold all the information needed for the rewriting.
+ RegisterBankInfo::OperandsMapper OpdMapper(MI, InstrMapping, *MRI);
+
+ // First, place the repairing code.
+ for (RepairingPlacement &RepairPt : RepairPts) {
+ if (!RepairPt.canMaterialize() ||
+ RepairPt.getKind() == RepairingPlacement::Impossible)
+ return false;
+ assert(RepairPt.getKind() != RepairingPlacement::None &&
+ "This should not make its way in the list");
+ unsigned OpIdx = RepairPt.getOpIdx();
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ const RegisterBankInfo::ValueMapping &ValMapping =
+ InstrMapping.getOperandMapping(OpIdx);
+ Register Reg = MO.getReg();
+
+ switch (RepairPt.getKind()) {
+ case RepairingPlacement::Reassign:
+ assert(ValMapping.NumBreakDowns == 1 &&
+ "Reassignment should only be for simple mapping");
+ MRI->setRegBank(Reg, *ValMapping.BreakDown[0].RegBank);
+ break;
+ case RepairingPlacement::Insert:
+ // Don't insert additional instructions for debug instructions.
+ if (MI.isDebugInstr())
+ break;
+ OpdMapper.createVRegs(OpIdx);
+ if (!repairReg(MO, ValMapping, RepairPt, OpdMapper.getVRegs(OpIdx)))
+ return false;
+ break;
+ default:
+ llvm_unreachable("Other kind should not happen");
+ }
+ }
+
+ // Second, rewrite the instruction.
+ LLVM_DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n');
+ RBI->applyMapping(OpdMapper);
+
+ return true;
+}
+
+bool RegBankSelect::assignInstr(MachineInstr &MI) {
+ LLVM_DEBUG(dbgs() << "Assign: " << MI);
+
+ unsigned Opc = MI.getOpcode();
+ if (isPreISelGenericOptimizationHint(Opc)) {
+ assert((Opc == TargetOpcode::G_ASSERT_ZEXT ||
+ Opc == TargetOpcode::G_ASSERT_SEXT ||
+ Opc == TargetOpcode::G_ASSERT_ALIGN) &&
+ "Unexpected hint opcode!");
+ // The only correct mapping for these is to always use the source register
+ // bank.
+ const RegisterBank *RB =
+ RBI->getRegBank(MI.getOperand(1).getReg(), *MRI, *TRI);
+ // We can assume every instruction above this one has a selected register
+ // bank.
+ assert(RB && "Expected source register to have a register bank?");
+ LLVM_DEBUG(dbgs() << "... Hint always uses source's register bank.\n");
+ MRI->setRegBank(MI.getOperand(0).getReg(), *RB);
+ return true;
+ }
+
+ // Remember the repairing placement for all the operands.
+ SmallVector<RepairingPlacement, 4> RepairPts;
+
+ const RegisterBankInfo::InstructionMapping *BestMapping;
+ if (OptMode == RegBankSelect::Mode::Fast) {
+ BestMapping = &RBI->getInstrMapping(MI);
+ MappingCost DefaultCost = computeMapping(MI, *BestMapping, RepairPts);
+ (void)DefaultCost;
+ if (DefaultCost == MappingCost::ImpossibleCost())
+ return false;
+ } else {
+ RegisterBankInfo::InstructionMappings PossibleMappings =
+ RBI->getInstrPossibleMappings(MI);
+ if (PossibleMappings.empty())
+ return false;
+ BestMapping = &findBestMapping(MI, PossibleMappings, RepairPts);
+ }
+ // Make sure the mapping is valid for MI.
+ assert(BestMapping->verify(MI) && "Invalid instruction mapping");
+
+ LLVM_DEBUG(dbgs() << "Best Mapping: " << *BestMapping << '\n');
+
+ // After this call, MI may not be valid anymore.
+ // Do not use it.
+ return applyMapping(MI, *BestMapping, RepairPts);
+}
+
+bool RegBankSelect::assignRegisterBanks(MachineFunction &MF) {
+ // Walk the function and assign register banks to all operands.
+ // Use a RPOT to make sure all registers are assigned before we choose
+ // the best mapping of the current instruction.
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT) {
+ // Set a sensible insertion point so that subsequent calls to
+ // MIRBuilder insert into this basic block.
+ MIRBuilder.setMBB(*MBB);
+ SmallVector<MachineInstr *> WorkList(
+ make_pointer_range(reverse(MBB->instrs())));
+
+ while (!WorkList.empty()) {
+ MachineInstr &MI = *WorkList.pop_back_val();
+
+ // Ignore target-specific post-isel instructions: they should use proper
+ // regclasses.
+ if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode())
+ continue;
+
+ // Ignore inline asm instructions: they should use physical
+ // registers/regclasses
+ if (MI.isInlineAsm())
+ continue;
+
+ // Ignore IMPLICIT_DEF which must have a regclass.
+ if (MI.isImplicitDef())
+ continue;
+
+ if (!assignInstr(MI)) {
+ reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+ "unable to map instruction", MI);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+bool RegBankSelect::checkFunctionIsLegal(MachineFunction &MF) const {
+#ifndef NDEBUG
+ if (!DisableGISelLegalityCheck) {
+ if (const MachineInstr *MI = machineFunctionIsIllegal(MF)) {
+ reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+ "instruction is not legal", *MI);
+ return false;
+ }
+ }
+#endif
+ return true;
+}
+
+bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
+ // If the ISel pipeline failed, do not bother running that pass.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Assign register banks for: " << MF.getName() << '\n');
+ const Function &F = MF.getFunction();
+ Mode SaveOptMode = OptMode;
+ if (F.hasOptNone())
+ OptMode = Mode::Fast;
+ init(MF);
+
+#ifndef NDEBUG
+ if (!checkFunctionIsLegal(MF))
+ return false;
+#endif
+
+ assignRegisterBanks(MF);
+
+ OptMode = SaveOptMode;
+ return false;
+}
+
+//------------------------------------------------------------------------------
+// Helper Classes Implementation
+//------------------------------------------------------------------------------
+RegBankSelect::RepairingPlacement::RepairingPlacement(
+ MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo &TRI, Pass &P,
+ RepairingPlacement::RepairingKind Kind)
+ // Default is, we are going to insert code to repair OpIdx.
+ : Kind(Kind), OpIdx(OpIdx),
+ CanMaterialize(Kind != RepairingKind::Impossible), P(P) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isReg() && "Trying to repair a non-reg operand");
+
+ if (Kind != RepairingKind::Insert)
+ return;
+
+ // Repairings for definitions happen after MI, uses happen before.
+ bool Before = !MO.isDef();
+
+ // Check if we are done with MI.
+ if (!MI.isPHI() && !MI.isTerminator()) {
+ addInsertPoint(MI, Before);
+ // We are done with the initialization.
+ return;
+ }
+
+ // Now, look for the special cases.
+ if (MI.isPHI()) {
+ // - PHIs must be the first instructions:
+ // * Before, we have to split the related incoming edge.
+ // * After, move the insertion point past the last phi.
+ if (!Before) {
+ MachineBasicBlock::iterator It = MI.getParent()->getFirstNonPHI();
+ if (It != MI.getParent()->end())
+ addInsertPoint(*It, /*Before*/ true);
+ else
+ addInsertPoint(*(--It), /*Before*/ false);
+ return;
+ }
+ // We repair a use of a phi, we may need to split the related edge.
+ MachineBasicBlock &Pred = *MI.getOperand(OpIdx + 1).getMBB();
+ // Check if we can move the insertion point prior to the
+ // terminators of the predecessor.
+ Register Reg = MO.getReg();
+ MachineBasicBlock::iterator It = Pred.getLastNonDebugInstr();
+ for (auto Begin = Pred.begin(); It != Begin && It->isTerminator(); --It)
+ if (It->modifiesRegister(Reg, &TRI)) {
+ // We cannot hoist the repairing code in the predecessor.
+ // Split the edge.
+ addInsertPoint(Pred, *MI.getParent());
+ return;
+ }
+ // At this point, we can insert in Pred.
+
+ // - If It is invalid, Pred is empty and we can insert in Pred
+ // wherever we want.
+ // - If It is valid, It is the first non-terminator, insert after It.
+ if (It == Pred.end())
+ addInsertPoint(Pred, /*Beginning*/ false);
+ else
+ addInsertPoint(*It, /*Before*/ false);
+ } else {
+ // - Terminators must be the last instructions:
+ // * Before, move the insert point before the first terminator.
+ // * After, we have to split the outgoing edges.
+ if (Before) {
+ // Check whether Reg is defined by any terminator.
+ MachineBasicBlock::reverse_iterator It = MI;
+ auto REnd = MI.getParent()->rend();
+
+ for (; It != REnd && It->isTerminator(); ++It) {
+ assert(!It->modifiesRegister(MO.getReg(), &TRI) &&
+ "copy insertion in middle of terminators not handled");
+ }
+
+ if (It == REnd) {
+ addInsertPoint(*MI.getParent()->begin(), true);
+ return;
+ }
+
+ // We are sure to be right before the first terminator.
+ addInsertPoint(*It, /*Before*/ false);
+ return;
+ }
+ // Make sure Reg is not redefined by other terminators, otherwise
+ // we do not know how to split.
+ for (MachineBasicBlock::iterator It = MI, End = MI.getParent()->end();
+ ++It != End;)
+ // The machine verifier should reject this kind of code.
+ assert(!It->modifiesRegister(MO.getReg(), &TRI) &&
+ "Do not know where to split");
+ // Split each outgoing edge.
+ MachineBasicBlock &Src = *MI.getParent();
+ for (auto &Succ : Src.successors())
+ addInsertPoint(Src, Succ);
+ }
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineInstr &MI,
+ bool Before) {
+ addInsertPoint(*new InstrInsertPoint(MI, Before));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineBasicBlock &MBB,
+ bool Beginning) {
+ addInsertPoint(*new MBBInsertPoint(MBB, Beginning));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineBasicBlock &Src,
+ MachineBasicBlock &Dst) {
+ addInsertPoint(*new EdgeInsertPoint(Src, Dst, P));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(
+ RegBankSelect::InsertPoint &Point) {
+ CanMaterialize &= Point.canMaterialize();
+ HasSplit |= Point.isSplit();
+ InsertPoints.emplace_back(&Point);
+}
+
+RegBankSelect::InstrInsertPoint::InstrInsertPoint(MachineInstr &Instr,
+ bool Before)
+ : Instr(Instr), Before(Before) {
+ // Since we do not support splitting, we do not need to update
+ // liveness and such, so do not do anything with P.
+ assert((!Before || !Instr.isPHI()) &&
+ "Splitting before phis requires more points");
+ assert((!Before || !Instr.getNextNode() || !Instr.getNextNode()->isPHI()) &&
+ "Splitting between phis does not make sense");
+}
+
+void RegBankSelect::InstrInsertPoint::materialize() {
+ if (isSplit()) {
+ // Slice and return the beginning of the new block.
+ // If we need to split between the terminators, we theoretically
+ // need to know where the first and second set of terminators end
+ // to update the successors properly.
+ // Now, in practice, we should have a maximum of 2 branch
+ // instructions; one conditional and one unconditional. Therefore
+ // we know how to update the successor by looking at the target of
+ // the unconditional branch.
+ // If we end up splitting at some point, then, we should update
+ // the liveness information and such. I.e., we would need to
+ // access P here.
+ // The machine verifier should actually make sure such cases
+ // cannot happen.
+ llvm_unreachable("Not yet implemented");
+ }
+ // Otherwise the insertion point is just the current or next
+ // instruction depending on Before. I.e., there is nothing to do
+ // here.
+}
+
+bool RegBankSelect::InstrInsertPoint::isSplit() const {
+ // If the insertion point is after a terminator, we need to split.
+ if (!Before)
+ return Instr.isTerminator();
+ // If we insert before an instruction that is after a terminator,
+ // we are still after a terminator.
+ return Instr.getPrevNode() && Instr.getPrevNode()->isTerminator();
+}
+
+uint64_t RegBankSelect::InstrInsertPoint::frequency(const Pass &P) const {
+ // Even if we need to split, because we insert between terminators,
+ // this split has actually the same frequency as the instruction.
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ return MBFI->getBlockFreq(Instr.getParent()).getFrequency();
+}
+
+uint64_t RegBankSelect::MBBInsertPoint::frequency(const Pass &P) const {
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ return MBFI->getBlockFreq(&MBB).getFrequency();
+}
+
+void RegBankSelect::EdgeInsertPoint::materialize() {
+ // If we end up repairing twice at the same place before materializing the
+ // insertion point, we may think we have to split an edge twice.
+ // We should have a factory for the insert point such that identical points
+ // are the same instance.
+ assert(Src.isSuccessor(DstOrSplit) && DstOrSplit->isPredecessor(&Src) &&
+ "This point has already been split");
+ MachineBasicBlock *NewBB = Src.SplitCriticalEdge(DstOrSplit, P);
+ assert(NewBB && "Invalid call to materialize");
+ // We reuse the destination block to hold the information of the new block.
+ DstOrSplit = NewBB;
+}
+
+uint64_t RegBankSelect::EdgeInsertPoint::frequency(const Pass &P) const {
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ if (WasMaterialized)
+ return MBFI->getBlockFreq(DstOrSplit).getFrequency();
+
+ const MachineBranchProbabilityInfo *MBPI =
+ P.getAnalysisIfAvailable<MachineBranchProbabilityInfo>();
+ if (!MBPI)
+ return 1;
+ // The basic block will be on the edge.
+ return (MBFI->getBlockFreq(&Src) * MBPI->getEdgeProbability(&Src, DstOrSplit))
+ .getFrequency();
+}
+
+bool RegBankSelect::EdgeInsertPoint::canMaterialize() const {
+ // If this is not a critical edge, we should not have used this insert
+ // point. Indeed, either the successor or the predecessor would
+ // have done.
+ assert(Src.succ_size() > 1 && DstOrSplit->pred_size() > 1 &&
+ "Edge is not critical");
+ return Src.canSplitCriticalEdge(DstOrSplit);
+}
+
+RegBankSelect::MappingCost::MappingCost(const BlockFrequency &LocalFreq)
+ : LocalFreq(LocalFreq.getFrequency()) {}
+
+bool RegBankSelect::MappingCost::addLocalCost(uint64_t Cost) {
+ // Check if this overflows.
+ if (LocalCost + Cost < LocalCost) {
+ saturate();
+ return true;
+ }
+ LocalCost += Cost;
+ return isSaturated();
+}
+
+bool RegBankSelect::MappingCost::addNonLocalCost(uint64_t Cost) {
+ // Check if this overflows.
+ if (NonLocalCost + Cost < NonLocalCost) {
+ saturate();
+ return true;
+ }
+ NonLocalCost += Cost;
+ return isSaturated();
+}
+
+bool RegBankSelect::MappingCost::isSaturated() const {
+ return LocalCost == UINT64_MAX - 1 && NonLocalCost == UINT64_MAX &&
+ LocalFreq == UINT64_MAX;
+}
+
+void RegBankSelect::MappingCost::saturate() {
+ *this = ImpossibleCost();
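+ // The decrement below keeps a saturated cost distinguishable from (and
+ // cheaper than) ImpossibleCost(), matching the check in isSaturated().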
+ --LocalCost;
+}
+
+RegBankSelect::MappingCost RegBankSelect::MappingCost::ImpossibleCost() {
+ return MappingCost(UINT64_MAX, UINT64_MAX, UINT64_MAX);
+}
+
+bool RegBankSelect::MappingCost::operator<(const MappingCost &Cost) const {
+ // Sort out the easy cases.
+ if (*this == Cost)
+ return false;
+ // If one is impossible to realize the other is cheaper unless it is
+ // impossible as well.
+ if ((*this == ImpossibleCost()) || (Cost == ImpossibleCost()))
+ return (*this == ImpossibleCost()) < (Cost == ImpossibleCost());
+ // If one is saturated the other is cheaper, unless it is saturated
+ // as well.
+ if (isSaturated() || Cost.isSaturated())
+ return isSaturated() < Cost.isSaturated();
+ // At this point we know both costs hold sensible values.
+
+ // If the two values have different base frequencies, there is not much
+ // we can do but scale everything.
+ // However, if they have the same base frequency we can avoid making
+ // complicated computation.
+ uint64_t ThisLocalAdjust;
+ uint64_t OtherLocalAdjust;
+ if (LLVM_LIKELY(LocalFreq == Cost.LocalFreq)) {
+
+ // At this point, we know the local costs are comparable.
+ // Handle the case that does not involve potential overflow first.
+ if (NonLocalCost == Cost.NonLocalCost)
+ // Since the non-local costs do not discriminate on the result,
+ // just compare the local costs.
+ return LocalCost < Cost.LocalCost;
+
+ // The base costs are comparable so we may only keep the relative
+ // value to increase our chances of avoiding overflows.
+ ThisLocalAdjust = 0;
+ OtherLocalAdjust = 0;
+ if (LocalCost < Cost.LocalCost)
+ OtherLocalAdjust = Cost.LocalCost - LocalCost;
+ else
+ ThisLocalAdjust = LocalCost - Cost.LocalCost;
+ } else {
+ ThisLocalAdjust = LocalCost;
+ OtherLocalAdjust = Cost.LocalCost;
+ }
+
+ // The non-local costs are comparable, just keep the relative value.
+ uint64_t ThisNonLocalAdjust = 0;
+ uint64_t OtherNonLocalAdjust = 0;
+ if (NonLocalCost < Cost.NonLocalCost)
+ OtherNonLocalAdjust = Cost.NonLocalCost - NonLocalCost;
+ else
+ ThisNonLocalAdjust = NonLocalCost - Cost.NonLocalCost;
+ // Scale everything to make them comparable.
+ uint64_t ThisScaledCost = ThisLocalAdjust * LocalFreq;
+ // Check for overflow on that operation.
+ bool ThisOverflows = ThisLocalAdjust && (ThisScaledCost < ThisLocalAdjust ||
+ ThisScaledCost < LocalFreq);
+ uint64_t OtherScaledCost = OtherLocalAdjust * Cost.LocalFreq;
+ // Check for overflow on the last operation.
+ bool OtherOverflows =
+ OtherLocalAdjust &&
+ (OtherScaledCost < OtherLocalAdjust || OtherScaledCost < Cost.LocalFreq);
+ // Add the non-local costs.
+ ThisOverflows |= ThisNonLocalAdjust &&
+ ThisScaledCost + ThisNonLocalAdjust < ThisNonLocalAdjust;
+ ThisScaledCost += ThisNonLocalAdjust;
+ OtherOverflows |= OtherNonLocalAdjust &&
+ OtherScaledCost + OtherNonLocalAdjust < OtherNonLocalAdjust;
+ OtherScaledCost += OtherNonLocalAdjust;
+ // If both overflow, we cannot compare without additional
+ // precision, e.g., APInt. Just give up on that case.
+ if (ThisOverflows && OtherOverflows)
+ return false;
+ // If one overflows but not the other, we can still compare.
+ if (ThisOverflows || OtherOverflows)
+ return ThisOverflows < OtherOverflows;
+ // Otherwise, just compare the values.
+ return ThisScaledCost < OtherScaledCost;
+}
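+
+// Worked example with illustrative numbers: take two costs sharing
+// LocalFreq == 8, one with LocalCost == 3 and NonLocalCost == 0, the other
+// with LocalCost == 1 and NonLocalCost == 20. The local costs reduce to a
+// relative adjustment of 2 on the first cost, so the comparison boils down to
+// 2 * 8 == 16 < 20, i.e. the first mapping is considered cheaper.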
+
+bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const {
+ return LocalCost == Cost.LocalCost && NonLocalCost == Cost.NonLocalCost &&
+ LocalFreq == Cost.LocalFreq;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegBankSelect::MappingCost::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+#endif
+
+void RegBankSelect::MappingCost::print(raw_ostream &OS) const {
+ if (*this == ImpossibleCost()) {
+ OS << "impossible";
+ return;
+ }
+ if (isSaturated()) {
+ OS << "saturated";
+ return;
+ }
+ OS << LocalFreq << " * " << LocalCost << " + " << NonLocalCost;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp
new file mode 100644
index 000000000000..080600d3cc98
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -0,0 +1,1381 @@
+//===- llvm/CodeGen/GlobalISel/Utils.cpp -------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file This file implements the utility functions used by the GlobalISel
+/// pipeline.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <numeric>
+#include <optional>
+
+#define DEBUG_TYPE "globalisel-utils"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+Register llvm::constrainRegToClass(MachineRegisterInfo &MRI,
+ const TargetInstrInfo &TII,
+ const RegisterBankInfo &RBI, Register Reg,
+ const TargetRegisterClass &RegClass) {
+ if (!RBI.constrainGenericRegister(Reg, RegClass, MRI))
+ return MRI.createVirtualRegister(&RegClass);
+
+ return Reg;
+}
+
+Register llvm::constrainOperandRegClass(
+ const MachineFunction &MF, const TargetRegisterInfo &TRI,
+ MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
+ const RegisterBankInfo &RBI, MachineInstr &InsertPt,
+ const TargetRegisterClass &RegClass, MachineOperand &RegMO) {
+ Register Reg = RegMO.getReg();
+ // Assume physical registers are properly constrained.
+ assert(Reg.isVirtual() && "PhysReg not implemented");
+
+ // Save the old register class to check whether
+ // the change notifications will be required.
+ // TODO: A better approach would be to pass
+ // the observers to constrainRegToClass().
+ auto *OldRegClass = MRI.getRegClassOrNull(Reg);
+ Register ConstrainedReg = constrainRegToClass(MRI, TII, RBI, Reg, RegClass);
+ // If we created a new virtual register because the class is not compatible
+ // then create a copy between the new and the old register.
+ if (ConstrainedReg != Reg) {
+ MachineBasicBlock::iterator InsertIt(&InsertPt);
+ MachineBasicBlock &MBB = *InsertPt.getParent();
+ // FIXME: The copy needs to have the classes constrained for its operands.
+ // Use operand's regbank to get the class for old register (Reg).
+ if (RegMO.isUse()) {
+ BuildMI(MBB, InsertIt, InsertPt.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), ConstrainedReg)
+ .addReg(Reg);
+ } else {
+ assert(RegMO.isDef() && "Must be a definition");
+ BuildMI(MBB, std::next(InsertIt), InsertPt.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), Reg)
+ .addReg(ConstrainedReg);
+ }
+ if (GISelChangeObserver *Observer = MF.getObserver()) {
+ Observer->changingInstr(*RegMO.getParent());
+ }
+ RegMO.setReg(ConstrainedReg);
+ if (GISelChangeObserver *Observer = MF.getObserver()) {
+ Observer->changedInstr(*RegMO.getParent());
+ }
+ } else if (OldRegClass != MRI.getRegClassOrNull(Reg)) {
+ if (GISelChangeObserver *Observer = MF.getObserver()) {
+ if (!RegMO.isDef()) {
+ MachineInstr *RegDef = MRI.getVRegDef(Reg);
+ Observer->changedInstr(*RegDef);
+ }
+ Observer->changingAllUsesOfReg(MRI, Reg);
+ Observer->finishedChangingAllUsesOfReg();
+ }
+ }
+ return ConstrainedReg;
+}
+
+Register llvm::constrainOperandRegClass(
+ const MachineFunction &MF, const TargetRegisterInfo &TRI,
+ MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
+ const RegisterBankInfo &RBI, MachineInstr &InsertPt, const MCInstrDesc &II,
+ MachineOperand &RegMO, unsigned OpIdx) {
+ Register Reg = RegMO.getReg();
+ // Assume physical registers are properly constrained.
+ assert(Reg.isVirtual() && "PhysReg not implemented");
+
+ const TargetRegisterClass *OpRC = TII.getRegClass(II, OpIdx, &TRI, MF);
+ // Some of the target independent instructions, like COPY, may not impose any
+ // register class constraints on some of their operands: If it's a use, we can
+ // skip constraining as the instruction defining the register would constrain
+ // it.
+
+ if (OpRC) {
+ // Obtain the RC from incoming regbank if it is a proper sub-class. Operands
+ // can have multiple regbanks for a superclass that combine different
+ // register types (E.g., AMDGPU's VGPR and AGPR). The regbank ambiguity
+ // resolved by targets during regbankselect should not be overridden.
+ if (const auto *SubRC = TRI.getCommonSubClass(
+ OpRC, TRI.getConstrainedRegClassForOperand(RegMO, MRI)))
+ OpRC = SubRC;
+
+ OpRC = TRI.getAllocatableClass(OpRC);
+ }
+
+ if (!OpRC) {
+ assert((!isTargetSpecificOpcode(II.getOpcode()) || RegMO.isUse()) &&
+ "Register class constraint is required unless either the "
+ "instruction is target independent or the operand is a use");
+ // FIXME: Just bailing out like this here might not be enough, unless we
+ // expect the users of this function to do the right thing for PHIs and
+ // COPY:
+ // v1 = COPY v0
+ // v2 = COPY v1
+ // v1 here may end up not being constrained at all. Note that to reproduce
+ // the issue we likely need a destination pattern of a selection rule
+ // producing such extra copies, not just an input GMIR with them, since
+ // every existing target using selectImpl handles copies before calling it,
+ // so they never reach this function.
+ return Reg;
+ }
+ return constrainOperandRegClass(MF, TRI, MRI, TII, RBI, InsertPt, *OpRC,
+ RegMO);
+}
+
+bool llvm::constrainSelectedInstRegOperands(MachineInstr &I,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ assert(!isPreISelGenericOpcode(I.getOpcode()) &&
+ "A selected instruction is expected");
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (unsigned OpI = 0, OpE = I.getNumExplicitOperands(); OpI != OpE; ++OpI) {
+ MachineOperand &MO = I.getOperand(OpI);
+
+ // There's nothing to be done on non-register operands.
+ if (!MO.isReg())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Converting operand: " << MO << '\n');
+ assert(MO.isReg() && "Unsupported non-reg operand");
+
+ Register Reg = MO.getReg();
+ // Physical registers don't need to be constrained.
+ if (Reg.isPhysical())
+ continue;
+
+ // Register operands with a value of 0 (e.g. predicate operands) don't need
+ // to be constrained.
+ if (Reg == 0)
+ continue;
+
+ // If the operand is a vreg, we should constrain its regclass, and only
+ // insert COPYs if that's impossible.
+ // constrainOperandRegClass does that for us.
+ constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(), MO, OpI);
+
+ // Tie uses to defs as indicated in MCInstrDesc if this hasn't already been
+ // done.
+ if (MO.isUse()) {
+ int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO);
+ if (DefIdx != -1 && !I.isRegTiedToUseOperand(DefIdx))
+ I.tieOperands(DefIdx, OpI);
+ }
+ }
+ return true;
+}
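+
+// A minimal usage sketch (hypothetical, not taken from any particular
+// backend): a target's InstructionSelector typically mutates a generic
+// instruction into a target opcode and then calls the helper above to
+// constrain every register operand. The opcode name is only illustrative.
+//
+//   I.setDesc(TII.get(/* e.g. */ AArch64::ADDWrr));
+//   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);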
+
+bool llvm::canReplaceReg(Register DstReg, Register SrcReg,
+ MachineRegisterInfo &MRI) {
+ // Give up if either DstReg or SrcReg is a physical register.
+ if (DstReg.isPhysical() || SrcReg.isPhysical())
+ return false;
+ // Give up if the types don't match.
+ if (MRI.getType(DstReg) != MRI.getType(SrcReg))
+ return false;
+ // Replace if either DstReg has no constraints or the register
+ // constraints match.
+ return !MRI.getRegClassOrRegBank(DstReg) ||
+ MRI.getRegClassOrRegBank(DstReg) == MRI.getRegClassOrRegBank(SrcReg);
+}
+
+bool llvm::isTriviallyDead(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ // FIXME: This logic is mostly duplicated with
+ // DeadMachineInstructionElim::isDead. Why is LOCAL_ESCAPE not considered in
+ // MachineInstr::isLabel?
+
+ // Don't delete frame allocation labels.
+ if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE)
+ return false;
+ // LIFETIME markers should be preserved even if they seem dead.
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END)
+ return false;
+
+ // If we can move an instruction, we can remove it. Otherwise, it has
+ // a side-effect of some sort.
+ bool SawStore = false;
+ if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
+ return false;
+
+ // Instructions without side-effects are dead iff they only define dead vregs.
+ for (const auto &MO : MI.all_defs()) {
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical() || !MRI.use_nodbg_empty(Reg))
+ return false;
+ }
+ return true;
+}
+
+static void reportGISelDiagnostic(DiagnosticSeverity Severity,
+ MachineFunction &MF,
+ const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ MachineOptimizationRemarkMissed &R) {
+ bool IsFatal = Severity == DS_Error &&
+ TPC.isGlobalISelAbortEnabled();
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || IsFatal)
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (IsFatal)
+ report_fatal_error(Twine(R.getMsg()));
+ else
+ MORE.emit(R);
+}
+
+void llvm::reportGISelWarning(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ MachineOptimizationRemarkMissed &R) {
+ reportGISelDiagnostic(DS_Warning, MF, TPC, MORE, R);
+}
+
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ MachineOptimizationRemarkMissed &R) {
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+ reportGISelDiagnostic(DS_Error, MF, TPC, MORE, R);
+}
+
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ const char *PassName, StringRef Msg,
+ const MachineInstr &MI) {
+ MachineOptimizationRemarkMissed R(PassName, "GISelFailure: ",
+ MI.getDebugLoc(), MI.getParent());
+ R << Msg;
+ // Printing MI is expensive; only do it if expensive remarks are enabled.
+ if (TPC.isGlobalISelAbortEnabled() || MORE.allowExtraAnalysis(PassName))
+ R << ": " << ore::MNV("Inst", MI);
+ reportGISelFailure(MF, TPC, MORE, R);
+}
+
+std::optional<APInt> llvm::getIConstantVRegVal(Register VReg,
+ const MachineRegisterInfo &MRI) {
+ std::optional<ValueAndVReg> ValAndVReg = getIConstantVRegValWithLookThrough(
+ VReg, MRI, /*LookThroughInstrs*/ false);
+ assert((!ValAndVReg || ValAndVReg->VReg == VReg) &&
+ "Value found while looking through instrs");
+ if (!ValAndVReg)
+ return std::nullopt;
+ return ValAndVReg->Value;
+}
+
+std::optional<int64_t>
+llvm::getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI) {
+ std::optional<APInt> Val = getIConstantVRegVal(VReg, MRI);
+ if (Val && Val->getBitWidth() <= 64)
+ return Val->getSExtValue();
+ return std::nullopt;
+}
+
+namespace {
+
+typedef std::function<bool(const MachineInstr *)> IsOpcodeFn;
+typedef std::function<std::optional<APInt>(const MachineInstr *MI)> GetAPCstFn;
+
+std::optional<ValueAndVReg> getConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI, IsOpcodeFn IsConstantOpcode,
+ GetAPCstFn getAPCstValue, bool LookThroughInstrs = true,
+ bool LookThroughAnyExt = false) {
+ SmallVector<std::pair<unsigned, unsigned>, 4> SeenOpcodes;
+ MachineInstr *MI;
+
+ while ((MI = MRI.getVRegDef(VReg)) && !IsConstantOpcode(MI) &&
+ LookThroughInstrs) {
+ switch (MI->getOpcode()) {
+ case TargetOpcode::G_ANYEXT:
+ if (!LookThroughAnyExt)
+ return std::nullopt;
+ [[fallthrough]];
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ SeenOpcodes.push_back(std::make_pair(
+ MI->getOpcode(),
+ MRI.getType(MI->getOperand(0).getReg()).getSizeInBits()));
+ VReg = MI->getOperand(1).getReg();
+ break;
+ case TargetOpcode::COPY:
+ VReg = MI->getOperand(1).getReg();
+ if (VReg.isPhysical())
+ return std::nullopt;
+ break;
+ case TargetOpcode::G_INTTOPTR:
+ VReg = MI->getOperand(1).getReg();
+ break;
+ default:
+ return std::nullopt;
+ }
+ }
+ if (!MI || !IsConstantOpcode(MI))
+ return std::nullopt;
+
+ std::optional<APInt> MaybeVal = getAPCstValue(MI);
+ if (!MaybeVal)
+ return std::nullopt;
+ APInt &Val = *MaybeVal;
+ while (!SeenOpcodes.empty()) {
+ std::pair<unsigned, unsigned> OpcodeAndSize = SeenOpcodes.pop_back_val();
+ switch (OpcodeAndSize.first) {
+ case TargetOpcode::G_TRUNC:
+ Val = Val.trunc(OpcodeAndSize.second);
+ break;
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
+ Val = Val.sext(OpcodeAndSize.second);
+ break;
+ case TargetOpcode::G_ZEXT:
+ Val = Val.zext(OpcodeAndSize.second);
+ break;
+ }
+ }
+
+ return ValueAndVReg{Val, VReg};
+}
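+
+// A small worked example (illustrative MIR): given
+//   %c:_(s64) = G_CONSTANT i64 255
+//   %t:_(s32) = G_TRUNC %c
+// a look-through query on %t records the G_TRUNC, reaches the G_CONSTANT and
+// replays the truncation, yielding a 32-bit APInt with value 255 together
+// with the vreg of the G_CONSTANT (%c).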
+
+bool isIConstant(const MachineInstr *MI) {
+ if (!MI)
+ return false;
+ return MI->getOpcode() == TargetOpcode::G_CONSTANT;
+}
+
+bool isFConstant(const MachineInstr *MI) {
+ if (!MI)
+ return false;
+ return MI->getOpcode() == TargetOpcode::G_FCONSTANT;
+}
+
+bool isAnyConstant(const MachineInstr *MI) {
+ if (!MI)
+ return false;
+ unsigned Opc = MI->getOpcode();
+ return Opc == TargetOpcode::G_CONSTANT || Opc == TargetOpcode::G_FCONSTANT;
+}
+
+std::optional<APInt> getCImmAsAPInt(const MachineInstr *MI) {
+ const MachineOperand &CstVal = MI->getOperand(1);
+ if (CstVal.isCImm())
+ return CstVal.getCImm()->getValue();
+ return std::nullopt;
+}
+
+std::optional<APInt> getCImmOrFPImmAsAPInt(const MachineInstr *MI) {
+ const MachineOperand &CstVal = MI->getOperand(1);
+ if (CstVal.isCImm())
+ return CstVal.getCImm()->getValue();
+ if (CstVal.isFPImm())
+ return CstVal.getFPImm()->getValueAPF().bitcastToAPInt();
+ return std::nullopt;
+}
+
+} // end anonymous namespace
+
+std::optional<ValueAndVReg> llvm::getIConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs) {
+ return getConstantVRegValWithLookThrough(VReg, MRI, isIConstant,
+ getCImmAsAPInt, LookThroughInstrs);
+}
+
+std::optional<ValueAndVReg> llvm::getAnyConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs,
+ bool LookThroughAnyExt) {
+ return getConstantVRegValWithLookThrough(
+ VReg, MRI, isAnyConstant, getCImmOrFPImmAsAPInt, LookThroughInstrs,
+ LookThroughAnyExt);
+}
+
+std::optional<FPValueAndVReg> llvm::getFConstantVRegValWithLookThrough(
+ Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs) {
+ auto Reg = getConstantVRegValWithLookThrough(
+ VReg, MRI, isFConstant, getCImmOrFPImmAsAPInt, LookThroughInstrs);
+ if (!Reg)
+ return std::nullopt;
+ return FPValueAndVReg{getConstantFPVRegVal(Reg->VReg, MRI)->getValueAPF(),
+ Reg->VReg};
+}
+
+const ConstantFP *
+llvm::getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI) {
+ MachineInstr *MI = MRI.getVRegDef(VReg);
+ if (TargetOpcode::G_FCONSTANT != MI->getOpcode())
+ return nullptr;
+ return MI->getOperand(1).getFPImm();
+}
+
+std::optional<DefinitionAndSourceRegister>
+llvm::getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI) {
+ Register DefSrcReg = Reg;
+ auto *DefMI = MRI.getVRegDef(Reg);
+ auto DstTy = MRI.getType(DefMI->getOperand(0).getReg());
+ if (!DstTy.isValid())
+ return std::nullopt;
+ unsigned Opc = DefMI->getOpcode();
+ while (Opc == TargetOpcode::COPY || isPreISelGenericOptimizationHint(Opc)) {
+ Register SrcReg = DefMI->getOperand(1).getReg();
+ auto SrcTy = MRI.getType(SrcReg);
+ if (!SrcTy.isValid())
+ break;
+ DefMI = MRI.getVRegDef(SrcReg);
+ DefSrcReg = SrcReg;
+ Opc = DefMI->getOpcode();
+ }
+ return DefinitionAndSourceRegister{DefMI, DefSrcReg};
+}
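+
+// For example (illustrative MIR): with
+//   %a:_(s32) = G_ADD %x, %y
+//   %b:_(s32) = COPY %a
+//   %c:_(s32) = COPY %b
+// getDefSrcRegIgnoringCopies(%c, MRI) walks through both copies and returns
+// the G_ADD as the defining instruction and %a as the source register.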
+
+MachineInstr *llvm::getDefIgnoringCopies(Register Reg,
+ const MachineRegisterInfo &MRI) {
+ std::optional<DefinitionAndSourceRegister> DefSrcReg =
+ getDefSrcRegIgnoringCopies(Reg, MRI);
+ return DefSrcReg ? DefSrcReg->MI : nullptr;
+}
+
+Register llvm::getSrcRegIgnoringCopies(Register Reg,
+ const MachineRegisterInfo &MRI) {
+ std::optional<DefinitionAndSourceRegister> DefSrcReg =
+ getDefSrcRegIgnoringCopies(Reg, MRI);
+ return DefSrcReg ? DefSrcReg->Reg : Register();
+}
+
+MachineInstr *llvm::getOpcodeDef(unsigned Opcode, Register Reg,
+ const MachineRegisterInfo &MRI) {
+ MachineInstr *DefMI = getDefIgnoringCopies(Reg, MRI);
+ return DefMI && DefMI->getOpcode() == Opcode ? DefMI : nullptr;
+}
+
+APFloat llvm::getAPFloatFromSize(double Val, unsigned Size) {
+ if (Size == 32)
+ return APFloat(float(Val));
+ if (Size == 64)
+ return APFloat(Val);
+ if (Size != 16)
+ llvm_unreachable("Unsupported FPConstant size");
+ bool Ignored;
+ APFloat APF(Val);
+ APF.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored);
+ return APF;
+}
+
+std::optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode,
+ const Register Op1,
+ const Register Op2,
+ const MachineRegisterInfo &MRI) {
+ auto MaybeOp2Cst = getAnyConstantVRegValWithLookThrough(Op2, MRI, false);
+ if (!MaybeOp2Cst)
+ return std::nullopt;
+
+ auto MaybeOp1Cst = getAnyConstantVRegValWithLookThrough(Op1, MRI, false);
+ if (!MaybeOp1Cst)
+ return std::nullopt;
+
+ const APInt &C1 = MaybeOp1Cst->Value;
+ const APInt &C2 = MaybeOp2Cst->Value;
+ switch (Opcode) {
+ default:
+ break;
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_PTR_ADD:
+ return C1 + C2;
+ case TargetOpcode::G_AND:
+ return C1 & C2;
+ case TargetOpcode::G_ASHR:
+ return C1.ashr(C2);
+ case TargetOpcode::G_LSHR:
+ return C1.lshr(C2);
+ case TargetOpcode::G_MUL:
+ return C1 * C2;
+ case TargetOpcode::G_OR:
+ return C1 | C2;
+ case TargetOpcode::G_SHL:
+ return C1 << C2;
+ case TargetOpcode::G_SUB:
+ return C1 - C2;
+ case TargetOpcode::G_XOR:
+ return C1 ^ C2;
+ case TargetOpcode::G_UDIV:
+ if (!C2.getBoolValue())
+ break;
+ return C1.udiv(C2);
+ case TargetOpcode::G_SDIV:
+ if (!C2.getBoolValue())
+ break;
+ return C1.sdiv(C2);
+ case TargetOpcode::G_UREM:
+ if (!C2.getBoolValue())
+ break;
+ return C1.urem(C2);
+ case TargetOpcode::G_SREM:
+ if (!C2.getBoolValue())
+ break;
+ return C1.srem(C2);
+ case TargetOpcode::G_SMIN:
+ return APIntOps::smin(C1, C2);
+ case TargetOpcode::G_SMAX:
+ return APIntOps::smax(C1, C2);
+ case TargetOpcode::G_UMIN:
+ return APIntOps::umin(C1, C2);
+ case TargetOpcode::G_UMAX:
+ return APIntOps::umax(C1, C2);
+ }
+
+ return std::nullopt;
+}
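+
+// For instance (illustrative): if %a and %b are defined by G_CONSTANT i32 6
+// and G_CONSTANT i32 3, ConstantFoldBinOp(TargetOpcode::G_MUL, %a, %b, MRI)
+// yields an APInt holding 18, while the division and remainder opcodes give
+// std::nullopt when the right-hand constant is zero.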
+
+std::optional<APFloat>
+llvm::ConstantFoldFPBinOp(unsigned Opcode, const Register Op1,
+ const Register Op2, const MachineRegisterInfo &MRI) {
+ const ConstantFP *Op2Cst = getConstantFPVRegVal(Op2, MRI);
+ if (!Op2Cst)
+ return std::nullopt;
+
+ const ConstantFP *Op1Cst = getConstantFPVRegVal(Op1, MRI);
+ if (!Op1Cst)
+ return std::nullopt;
+
+ APFloat C1 = Op1Cst->getValueAPF();
+ const APFloat &C2 = Op2Cst->getValueAPF();
+ switch (Opcode) {
+ case TargetOpcode::G_FADD:
+ C1.add(C2, APFloat::rmNearestTiesToEven);
+ return C1;
+ case TargetOpcode::G_FSUB:
+ C1.subtract(C2, APFloat::rmNearestTiesToEven);
+ return C1;
+ case TargetOpcode::G_FMUL:
+ C1.multiply(C2, APFloat::rmNearestTiesToEven);
+ return C1;
+ case TargetOpcode::G_FDIV:
+ C1.divide(C2, APFloat::rmNearestTiesToEven);
+ return C1;
+ case TargetOpcode::G_FREM:
+ C1.mod(C2);
+ return C1;
+ case TargetOpcode::G_FCOPYSIGN:
+ C1.copySign(C2);
+ return C1;
+ case TargetOpcode::G_FMINNUM:
+ return minnum(C1, C2);
+ case TargetOpcode::G_FMAXNUM:
+ return maxnum(C1, C2);
+ case TargetOpcode::G_FMINIMUM:
+ return minimum(C1, C2);
+ case TargetOpcode::G_FMAXIMUM:
+ return maximum(C1, C2);
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMAXNUM_IEEE:
+ // FIXME: These operations were unfortunately named. fminnum/fmaxnum do not
+ // follow the IEEE behavior for signaling nans and follow libm's fmin/fmax,
+ // and currently there isn't a nice wrapper in APFloat for the version with
+ // correct snan handling.
+ break;
+ default:
+ break;
+ }
+
+ return std::nullopt;
+}
+
+SmallVector<APInt>
+llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
+ const Register Op2,
+ const MachineRegisterInfo &MRI) {
+ auto *SrcVec2 = getOpcodeDef<GBuildVector>(Op2, MRI);
+ if (!SrcVec2)
+ return SmallVector<APInt>();
+
+ auto *SrcVec1 = getOpcodeDef<GBuildVector>(Op1, MRI);
+ if (!SrcVec1)
+ return SmallVector<APInt>();
+
+ SmallVector<APInt> FoldedElements;
+ for (unsigned Idx = 0, E = SrcVec1->getNumSources(); Idx < E; ++Idx) {
+ auto MaybeCst = ConstantFoldBinOp(Opcode, SrcVec1->getSourceReg(Idx),
+ SrcVec2->getSourceReg(Idx), MRI);
+ if (!MaybeCst)
+ return SmallVector<APInt>();
+ FoldedElements.push_back(*MaybeCst);
+ }
+ return FoldedElements;
+}
+
+bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
+ bool SNaN) {
+ const MachineInstr *DefMI = MRI.getVRegDef(Val);
+ if (!DefMI)
+ return false;
+
+ const TargetMachine& TM = DefMI->getMF()->getTarget();
+ if (DefMI->getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath)
+ return true;
+
+ // If the value is a constant, we can obviously see if it is a NaN or not.
+ if (const ConstantFP *FPVal = getConstantFPVRegVal(Val, MRI)) {
+ return !FPVal->getValueAPF().isNaN() ||
+ (SNaN && !FPVal->getValueAPF().isSignaling());
+ }
+
+ if (DefMI->getOpcode() == TargetOpcode::G_BUILD_VECTOR) {
+ for (const auto &Op : DefMI->uses())
+ if (!isKnownNeverNaN(Op.getReg(), MRI, SNaN))
+ return false;
+ return true;
+ }
+
+ switch (DefMI->getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ case TargetOpcode::G_FREM:
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FCOS:
+ case TargetOpcode::G_FMA:
+ case TargetOpcode::G_FMAD:
+ if (SNaN)
+ return true;
+
+ // TODO: Need isKnownNeverInfinity
+ return false;
+ case TargetOpcode::G_FMINNUM_IEEE:
+ case TargetOpcode::G_FMAXNUM_IEEE: {
+ if (SNaN)
+ return true;
+ // This can return a NaN if either operand is an sNaN, or if both operands
+ // are NaN.
+ return (isKnownNeverNaN(DefMI->getOperand(1).getReg(), MRI) &&
+ isKnownNeverSNaN(DefMI->getOperand(2).getReg(), MRI)) ||
+ (isKnownNeverSNaN(DefMI->getOperand(1).getReg(), MRI) &&
+ isKnownNeverNaN(DefMI->getOperand(2).getReg(), MRI));
+ }
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM: {
+ // Only one needs to be known not-nan, since it will be returned if the
+ // other ends up being one.
+ return isKnownNeverNaN(DefMI->getOperand(1).getReg(), MRI, SNaN) ||
+ isKnownNeverNaN(DefMI->getOperand(2).getReg(), MRI, SNaN);
+ }
+ }
+
+ if (SNaN) {
+ // FP operations quiet signaling NaNs. For now, just handle the ones
+ // inserted during legalization.
+ switch (DefMI->getOpcode()) {
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPTRUNC:
+ case TargetOpcode::G_FCANONICALIZE:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ return false;
+}
+
+Align llvm::inferAlignFromPtrInfo(MachineFunction &MF,
+ const MachinePointerInfo &MPO) {
+ auto PSV = dyn_cast_if_present<const PseudoSourceValue *>(MPO.V);
+ if (auto FSPV = dyn_cast_or_null<FixedStackPseudoSourceValue>(PSV)) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ return commonAlignment(MFI.getObjectAlign(FSPV->getFrameIndex()),
+ MPO.Offset);
+ }
+
+ if (const Value *V = dyn_cast_if_present<const Value *>(MPO.V)) {
+ const Module *M = MF.getFunction().getParent();
+ return V->getPointerAlignment(M->getDataLayout());
+ }
+
+ return Align(1);
+}
+
+Register llvm::getFunctionLiveInPhysReg(MachineFunction &MF,
+ const TargetInstrInfo &TII,
+ MCRegister PhysReg,
+ const TargetRegisterClass &RC,
+ const DebugLoc &DL, LLT RegTy) {
+ MachineBasicBlock &EntryMBB = MF.front();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register LiveIn = MRI.getLiveInVirtReg(PhysReg);
+ if (LiveIn) {
+ MachineInstr *Def = MRI.getVRegDef(LiveIn);
+ if (Def) {
+ // FIXME: Should the verifier check this is in the entry block?
+ assert(Def->getParent() == &EntryMBB && "live-in copy not in entry block");
+ return LiveIn;
+ }
+
+ // It's possible the incoming argument register and copy was added during
+ // lowering, but later deleted due to being/becoming dead. If this happens,
+ // re-insert the copy.
+ } else {
+ // The live in register was not present, so add it.
+ LiveIn = MF.addLiveIn(PhysReg, &RC);
+ if (RegTy.isValid())
+ MRI.setType(LiveIn, RegTy);
+ }
+
+ BuildMI(EntryMBB, EntryMBB.begin(), DL, TII.get(TargetOpcode::COPY), LiveIn)
+ .addReg(PhysReg);
+ if (!EntryMBB.isLiveIn(PhysReg))
+ EntryMBB.addLiveIn(PhysReg);
+ return LiveIn;
+}
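+
+// Hypothetical call-lowering use (the AArch64 names below are only an
+// example, not something this helper prescribes): materialize an incoming
+// argument register as a typed vreg copied in the entry block.
+//
+//   Register VReg = getFunctionLiveInPhysReg(MF, TII, AArch64::X0,
+//                                            AArch64::GPR64RegClass, DL,
+//                                            LLT::scalar(64));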
+
+std::optional<APInt> llvm::ConstantFoldExtOp(unsigned Opcode,
+ const Register Op1, uint64_t Imm,
+ const MachineRegisterInfo &MRI) {
+ auto MaybeOp1Cst = getIConstantVRegVal(Op1, MRI);
+ if (MaybeOp1Cst) {
+ switch (Opcode) {
+ default:
+ break;
+ case TargetOpcode::G_SEXT_INREG: {
+ LLT Ty = MRI.getType(Op1);
+ return MaybeOp1Cst->trunc(Imm).sext(Ty.getScalarSizeInBits());
+ }
+ }
+ }
+ return std::nullopt;
+}
+
+std::optional<APFloat>
+llvm::ConstantFoldIntToFloat(unsigned Opcode, LLT DstTy, Register Src,
+ const MachineRegisterInfo &MRI) {
+ assert(Opcode == TargetOpcode::G_SITOFP || Opcode == TargetOpcode::G_UITOFP);
+ if (auto MaybeSrcVal = getIConstantVRegVal(Src, MRI)) {
+ APFloat DstVal(getFltSemanticForLLT(DstTy));
+ DstVal.convertFromAPInt(*MaybeSrcVal, Opcode == TargetOpcode::G_SITOFP,
+ APFloat::rmNearestTiesToEven);
+ return DstVal;
+ }
+ return std::nullopt;
+}
+
+std::optional<SmallVector<unsigned>>
+llvm::ConstantFoldCTLZ(Register Src, const MachineRegisterInfo &MRI) {
+ LLT Ty = MRI.getType(Src);
+ SmallVector<unsigned> FoldedCTLZs;
+ auto tryFoldScalar = [&](Register R) -> std::optional<unsigned> {
+ auto MaybeCst = getIConstantVRegVal(R, MRI);
+ if (!MaybeCst)
+ return std::nullopt;
+ return MaybeCst->countl_zero();
+ };
+ if (Ty.isVector()) {
+ // Try to constant fold each element.
+ auto *BV = getOpcodeDef<GBuildVector>(Src, MRI);
+ if (!BV)
+ return std::nullopt;
+ for (unsigned SrcIdx = 0; SrcIdx < BV->getNumSources(); ++SrcIdx) {
+ if (auto MaybeFold = tryFoldScalar(BV->getSourceReg(SrcIdx))) {
+ FoldedCTLZs.emplace_back(*MaybeFold);
+ continue;
+ }
+ return std::nullopt;
+ }
+ return FoldedCTLZs;
+ }
+ if (auto MaybeCst = tryFoldScalar(Src)) {
+ FoldedCTLZs.emplace_back(*MaybeCst);
+ return FoldedCTLZs;
+ }
+ return std::nullopt;
+}
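+
+// For example (illustrative): if Src is an s32 G_CONSTANT with value 1, the
+// result is the single-element vector {31}; for a vector source the fold
+// succeeds only when the value comes from a G_BUILD_VECTOR whose elements
+// are all defined by G_CONSTANT.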
+
+bool llvm::isKnownToBeAPowerOfTwo(Register Reg, const MachineRegisterInfo &MRI,
+ GISelKnownBits *KB) {
+ std::optional<DefinitionAndSourceRegister> DefSrcReg =
+ getDefSrcRegIgnoringCopies(Reg, MRI);
+ if (!DefSrcReg)
+ return false;
+
+ const MachineInstr &MI = *DefSrcReg->MI;
+ const LLT Ty = MRI.getType(Reg);
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_CONSTANT: {
+ unsigned BitWidth = Ty.getScalarSizeInBits();
+ const ConstantInt *CI = MI.getOperand(1).getCImm();
+ return CI->getValue().zextOrTrunc(BitWidth).isPowerOf2();
+ }
+ case TargetOpcode::G_SHL: {
+ // A left-shift of a constant one will have exactly one bit set because
+ // shifting the bit off the end is undefined.
+
+ // TODO: Constant splat
+ if (auto ConstLHS = getIConstantVRegVal(MI.getOperand(1).getReg(), MRI)) {
+ if (*ConstLHS == 1)
+ return true;
+ }
+
+ break;
+ }
+ case TargetOpcode::G_LSHR: {
+ if (auto ConstLHS = getIConstantVRegVal(MI.getOperand(1).getReg(), MRI)) {
+ if (ConstLHS->isSignMask())
+ return true;
+ }
+
+ break;
+ }
+ case TargetOpcode::G_BUILD_VECTOR: {
+ // TODO: Probably should have a recursion depth guard since you could have
+ // bitcasted vector elements.
+ for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
+ if (!isKnownToBeAPowerOfTwo(MO.getReg(), MRI, KB))
+ return false;
+
+ return true;
+ }
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC: {
+ // Only handle constants, since we would need to know whether the number of
+ // leading zeros is greater than the truncation amount.
+ const unsigned BitWidth = Ty.getScalarSizeInBits();
+ for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
+ auto Const = getIConstantVRegVal(MO.getReg(), MRI);
+ if (!Const || !Const->zextOrTrunc(BitWidth).isPowerOf2())
+ return false;
+ }
+
+ return true;
+ }
+ default:
+ break;
+ }
+
+ if (!KB)
+ return false;
+
+ // More could be done here, though the above checks are enough
+ // to handle some common cases.
+
+ // Fall back to computeKnownBits to catch other known cases.
+ KnownBits Known = KB->getKnownBits(Reg);
+ return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1);
+}
+
+void llvm::getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU) {
+ AU.addPreserved<StackProtector>();
+}
+
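+// A few illustrative results of the rule implemented below (p0 is assumed to
+// be a 64-bit pointer; these are examples, not an exhaustive contract):
+//   getLCMType(<3 x s32>, <2 x s32>) == <6 x s32>
+//   getLCMType(<2 x s32>, s32)       == <2 x s32>
+//   getLCMType(s32, s64)             == s64
+//   getLCMType(p0, s32)              == p0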
+LLT llvm::getLCMType(LLT OrigTy, LLT TargetTy) {
+ const unsigned OrigSize = OrigTy.getSizeInBits();
+ const unsigned TargetSize = TargetTy.getSizeInBits();
+
+ if (OrigSize == TargetSize)
+ return OrigTy;
+
+ if (OrigTy.isVector()) {
+ const LLT OrigElt = OrigTy.getElementType();
+
+ if (TargetTy.isVector()) {
+ const LLT TargetElt = TargetTy.getElementType();
+
+ if (OrigElt.getSizeInBits() == TargetElt.getSizeInBits()) {
+ int GCDElts =
+ std::gcd(OrigTy.getNumElements(), TargetTy.getNumElements());
+ // Prefer the original element type.
+ ElementCount Mul = OrigTy.getElementCount() * TargetTy.getNumElements();
+ return LLT::vector(Mul.divideCoefficientBy(GCDElts),
+ OrigTy.getElementType());
+ }
+ } else {
+ if (OrigElt.getSizeInBits() == TargetSize)
+ return OrigTy;
+ }
+
+ unsigned LCMSize = std::lcm(OrigSize, TargetSize);
+ return LLT::fixed_vector(LCMSize / OrigElt.getSizeInBits(), OrigElt);
+ }
+
+ if (TargetTy.isVector()) {
+ unsigned LCMSize = std::lcm(OrigSize, TargetSize);
+ return LLT::fixed_vector(LCMSize / OrigSize, OrigTy);
+ }
+
+ unsigned LCMSize = std::lcm(OrigSize, TargetSize);
+
+ // Preserve pointer types.
+ if (LCMSize == OrigSize)
+ return OrigTy;
+ if (LCMSize == TargetSize)
+ return TargetTy;
+
+ return LLT::scalar(LCMSize);
+}
+
+LLT llvm::getCoverTy(LLT OrigTy, LLT TargetTy) {
+ if (!OrigTy.isVector() || !TargetTy.isVector() || OrigTy == TargetTy ||
+ (OrigTy.getScalarSizeInBits() != TargetTy.getScalarSizeInBits()))
+ return getLCMType(OrigTy, TargetTy);
+
+ unsigned OrigTyNumElts = OrigTy.getNumElements();
+ unsigned TargetTyNumElts = TargetTy.getNumElements();
+ if (OrigTyNumElts % TargetTyNumElts == 0)
+ return OrigTy;
+
+ unsigned NumElts = alignTo(OrigTyNumElts, TargetTyNumElts);
+ return LLT::scalarOrVector(ElementCount::getFixed(NumElts),
+ OrigTy.getElementType());
+}
+
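+// Some illustrative results of the computation below (examples only):
+//   getGCDType(<4 x s32>, <2 x s32>) == <2 x s32>
+//   getGCDType(<3 x s16>, s32)       == s16
+//   getGCDType(s64, s32)             == s32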
+LLT llvm::getGCDType(LLT OrigTy, LLT TargetTy) {
+ const unsigned OrigSize = OrigTy.getSizeInBits();
+ const unsigned TargetSize = TargetTy.getSizeInBits();
+
+ if (OrigSize == TargetSize)
+ return OrigTy;
+
+ if (OrigTy.isVector()) {
+ LLT OrigElt = OrigTy.getElementType();
+ if (TargetTy.isVector()) {
+ LLT TargetElt = TargetTy.getElementType();
+ if (OrigElt.getSizeInBits() == TargetElt.getSizeInBits()) {
+ int GCD = std::gcd(OrigTy.getNumElements(), TargetTy.getNumElements());
+ return LLT::scalarOrVector(ElementCount::getFixed(GCD), OrigElt);
+ }
+ } else {
+ // If the source is a vector of pointers, return a pointer element.
+ if (OrigElt.getSizeInBits() == TargetSize)
+ return OrigElt;
+ }
+
+ unsigned GCD = std::gcd(OrigSize, TargetSize);
+ if (GCD == OrigElt.getSizeInBits())
+ return OrigElt;
+
+ // If we can't produce the original element type, we have to use a smaller
+ // scalar.
+ if (GCD < OrigElt.getSizeInBits())
+ return LLT::scalar(GCD);
+ return LLT::fixed_vector(GCD / OrigElt.getSizeInBits(), OrigElt);
+ }
+
+ if (TargetTy.isVector()) {
+ // Try to preserve the original element type.
+ LLT TargetElt = TargetTy.getElementType();
+ if (TargetElt.getSizeInBits() == OrigSize)
+ return OrigTy;
+ }
+
+ unsigned GCD = std::gcd(OrigSize, TargetSize);
+ return LLT::scalar(GCD);
+}
+
+std::optional<int> llvm::getSplatIndex(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+ "Only G_SHUFFLE_VECTOR can have a splat index!");
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });
+
+ // If all elements are undefined, this shuffle can be considered a splat.
+ // Return 0 to give callers a better chance to simplify.
+ if (FirstDefinedIdx == Mask.end())
+ return 0;
+
+ // Make sure all remaining elements are either undef or the same
+ // as the first non-undef value.
+ int SplatValue = *FirstDefinedIdx;
+ if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
+ [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
+ return std::nullopt;
+
+ return SplatValue;
+}
+
+static bool isBuildVectorOp(unsigned Opcode) {
+ return Opcode == TargetOpcode::G_BUILD_VECTOR ||
+ Opcode == TargetOpcode::G_BUILD_VECTOR_TRUNC;
+}
+
+namespace {
+
+std::optional<ValueAndVReg> getAnyConstantSplat(Register VReg,
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef) {
+ MachineInstr *MI = getDefIgnoringCopies(VReg, MRI);
+ if (!MI)
+ return std::nullopt;
+
+ bool isConcatVectorsOp = MI->getOpcode() == TargetOpcode::G_CONCAT_VECTORS;
+ if (!isBuildVectorOp(MI->getOpcode()) && !isConcatVectorsOp)
+ return std::nullopt;
+
+ std::optional<ValueAndVReg> SplatValAndReg;
+ for (MachineOperand &Op : MI->uses()) {
+ Register Element = Op.getReg();
+ // If we have a G_CONCAT_VECTORS, we recursively look into the
+ // vectors that we're concatenating to see if they're splats.
+ auto ElementValAndReg =
+ isConcatVectorsOp
+ ? getAnyConstantSplat(Element, MRI, AllowUndef)
+ : getAnyConstantVRegValWithLookThrough(Element, MRI, true, true);
+
+ // If AllowUndef, treat undef as a value that yields a constant splat.
+ if (!ElementValAndReg) {
+ if (AllowUndef && isa<GImplicitDef>(MRI.getVRegDef(Element)))
+ continue;
+ return std::nullopt;
+ }
+
+ // Record splat value
+ if (!SplatValAndReg)
+ SplatValAndReg = ElementValAndReg;
+
+ // Different constant than the one already recorded, not a constant splat.
+ if (SplatValAndReg->Value != ElementValAndReg->Value)
+ return std::nullopt;
+ }
+
+ return SplatValAndReg;
+}
+
+} // end anonymous namespace
+
+bool llvm::isBuildVectorConstantSplat(const Register Reg,
+ const MachineRegisterInfo &MRI,
+ int64_t SplatValue, bool AllowUndef) {
+ if (auto SplatValAndReg = getAnyConstantSplat(Reg, MRI, AllowUndef))
+ return mi_match(SplatValAndReg->VReg, MRI, m_SpecificICst(SplatValue));
+ return false;
+}
+
+bool llvm::isBuildVectorConstantSplat(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ int64_t SplatValue, bool AllowUndef) {
+ return isBuildVectorConstantSplat(MI.getOperand(0).getReg(), MRI, SplatValue,
+ AllowUndef);
+}
+
+std::optional<APInt>
+llvm::getIConstantSplatVal(const Register Reg, const MachineRegisterInfo &MRI) {
+ if (auto SplatValAndReg =
+ getAnyConstantSplat(Reg, MRI, /* AllowUndef */ false)) {
+ std::optional<ValueAndVReg> ValAndVReg =
+ getIConstantVRegValWithLookThrough(SplatValAndReg->VReg, MRI);
+ return ValAndVReg->Value;
+ }
+
+ return std::nullopt;
+}
+
+std::optional<APInt>
+llvm::getIConstantSplatVal(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ return getIConstantSplatVal(MI.getOperand(0).getReg(), MRI);
+}
+
+std::optional<int64_t>
+llvm::getIConstantSplatSExtVal(const Register Reg,
+ const MachineRegisterInfo &MRI) {
+ if (auto SplatValAndReg =
+ getAnyConstantSplat(Reg, MRI, /* AllowUndef */ false))
+ return getIConstantVRegSExtVal(SplatValAndReg->VReg, MRI);
+ return std::nullopt;
+}
+
+std::optional<int64_t>
+llvm::getIConstantSplatSExtVal(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ return getIConstantSplatSExtVal(MI.getOperand(0).getReg(), MRI);
+}
+
+std::optional<FPValueAndVReg>
+llvm::getFConstantSplat(Register VReg, const MachineRegisterInfo &MRI,
+ bool AllowUndef) {
+ if (auto SplatValAndReg = getAnyConstantSplat(VReg, MRI, AllowUndef))
+ return getFConstantVRegValWithLookThrough(SplatValAndReg->VReg, MRI);
+ return std::nullopt;
+}
+
+bool llvm::isBuildVectorAllZeros(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef) {
+ return isBuildVectorConstantSplat(MI, MRI, 0, AllowUndef);
+}
+
+bool llvm::isBuildVectorAllOnes(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ bool AllowUndef) {
+ return isBuildVectorConstantSplat(MI, MRI, -1, AllowUndef);
+}
+
+std::optional<RegOrConstant>
+llvm::getVectorSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI) {
+ unsigned Opc = MI.getOpcode();
+ if (!isBuildVectorOp(Opc))
+ return std::nullopt;
+ if (auto Splat = getIConstantSplatSExtVal(MI, MRI))
+ return RegOrConstant(*Splat);
+ auto Reg = MI.getOperand(1).getReg();
+ if (any_of(make_range(MI.operands_begin() + 2, MI.operands_end()),
+ [&Reg](const MachineOperand &Op) { return Op.getReg() != Reg; }))
+ return std::nullopt;
+ return RegOrConstant(Reg);
+}
+
+static bool isConstantScalar(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ bool AllowFP = true,
+ bool AllowOpaqueConstants = true) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_IMPLICIT_DEF:
+ return true;
+ case TargetOpcode::G_FCONSTANT:
+ return AllowFP;
+ case TargetOpcode::G_GLOBAL_VALUE:
+ case TargetOpcode::G_FRAME_INDEX:
+ case TargetOpcode::G_BLOCK_ADDR:
+ case TargetOpcode::G_JUMP_TABLE:
+ return AllowOpaqueConstants;
+ default:
+ return false;
+ }
+}
+
+bool llvm::isConstantOrConstantVector(MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ Register Def = MI.getOperand(0).getReg();
+ if (auto C = getIConstantVRegValWithLookThrough(Def, MRI))
+ return true;
+ GBuildVector *BV = dyn_cast<GBuildVector>(&MI);
+ if (!BV)
+ return false;
+ for (unsigned SrcIdx = 0; SrcIdx < BV->getNumSources(); ++SrcIdx) {
+ if (getIConstantVRegValWithLookThrough(BV->getSourceReg(SrcIdx), MRI) ||
+ getOpcodeDef<GImplicitDef>(BV->getSourceReg(SrcIdx), MRI))
+ continue;
+ return false;
+ }
+ return true;
+}
+
+bool llvm::isConstantOrConstantVector(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ bool AllowFP, bool AllowOpaqueConstants) {
+ if (isConstantScalar(MI, MRI, AllowFP, AllowOpaqueConstants))
+ return true;
+
+ if (!isBuildVectorOp(MI.getOpcode()))
+ return false;
+
+ const unsigned NumOps = MI.getNumOperands();
+ for (unsigned I = 1; I != NumOps; ++I) {
+ const MachineInstr *ElementDef = MRI.getVRegDef(MI.getOperand(I).getReg());
+ if (!isConstantScalar(*ElementDef, MRI, AllowFP, AllowOpaqueConstants))
+ return false;
+ }
+
+ return true;
+}
+
+std::optional<APInt>
+llvm::isConstantOrConstantSplatVector(MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ Register Def = MI.getOperand(0).getReg();
+ if (auto C = getIConstantVRegValWithLookThrough(Def, MRI))
+ return C->Value;
+ auto MaybeCst = getIConstantSplatSExtVal(MI, MRI);
+ if (!MaybeCst)
+ return std::nullopt;
+ const unsigned ScalarSize = MRI.getType(Def).getScalarSizeInBits();
+ return APInt(ScalarSize, *MaybeCst, true);
+}
+
+bool llvm::isNullOrNullSplat(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI, bool AllowUndefs) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_IMPLICIT_DEF:
+ return AllowUndefs;
+ case TargetOpcode::G_CONSTANT:
+ return MI.getOperand(1).getCImm()->isNullValue();
+ case TargetOpcode::G_FCONSTANT: {
+ const ConstantFP *FPImm = MI.getOperand(1).getFPImm();
+ return FPImm->isZero() && !FPImm->isNegative();
+ }
+ default:
+ if (!AllowUndefs) // TODO: isBuildVectorAllZeros assumes undef is OK already
+ return false;
+ return isBuildVectorAllZeros(MI, MRI);
+ }
+}
+
+bool llvm::isAllOnesOrAllOnesSplat(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI,
+ bool AllowUndefs) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_IMPLICIT_DEF:
+ return AllowUndefs;
+ case TargetOpcode::G_CONSTANT:
+ return MI.getOperand(1).getCImm()->isAllOnesValue();
+ default:
+ if (!AllowUndefs) // TODO: isBuildVectorAllOnes assumes undef is OK already
+ return false;
+ return isBuildVectorAllOnes(MI, MRI);
+ }
+}
+
+bool llvm::matchUnaryPredicate(
+ const MachineRegisterInfo &MRI, Register Reg,
+ std::function<bool(const Constant *ConstVal)> Match, bool AllowUndefs) {
+
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (AllowUndefs && Def->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+ return Match(nullptr);
+
+ // TODO: Also handle fconstant
+ if (Def->getOpcode() == TargetOpcode::G_CONSTANT)
+ return Match(Def->getOperand(1).getCImm());
+
+ if (Def->getOpcode() != TargetOpcode::G_BUILD_VECTOR)
+ return false;
+
+ for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
+ Register SrcElt = Def->getOperand(I).getReg();
+ const MachineInstr *SrcDef = getDefIgnoringCopies(SrcElt, MRI);
+ if (AllowUndefs && SrcDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF) {
+ if (!Match(nullptr))
+ return false;
+ continue;
+ }
+
+ if (SrcDef->getOpcode() != TargetOpcode::G_CONSTANT ||
+ !Match(SrcDef->getOperand(1).getCImm()))
+ return false;
+ }
+
+ return true;
+}
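+
+// An illustrative usage sketch (caller code is hypothetical): a combine can
+// check that every lane of a G_BUILD_VECTOR (or a plain G_CONSTANT) is a
+// non-zero constant, rejecting undef lanes:
+//
+//   bool AllNonZero = matchUnaryPredicate(
+//       MRI, Reg,
+//       [](const Constant *C) {
+//         auto *CI = dyn_cast_or_null<ConstantInt>(C);
+//         return CI && !CI->isZero();
+//       },
+//       /*AllowUndefs=*/false);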
+
+bool llvm::isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,
+ bool IsFP) {
+ switch (TLI.getBooleanContents(IsVector, IsFP)) {
+ case TargetLowering::UndefinedBooleanContent:
+ return Val & 0x1;
+ case TargetLowering::ZeroOrOneBooleanContent:
+ return Val == 1;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return Val == -1;
+ }
+ llvm_unreachable("Invalid boolean contents");
+}
+
+bool llvm::isConstFalseVal(const TargetLowering &TLI, int64_t Val,
+ bool IsVector, bool IsFP) {
+ switch (TLI.getBooleanContents(IsVector, IsFP)) {
+ case TargetLowering::UndefinedBooleanContent:
+ return ~Val & 0x1;
+ case TargetLowering::ZeroOrOneBooleanContent:
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return Val == 0;
+ }
+ llvm_unreachable("Invalid boolean contents");
+}
+
+int64_t llvm::getICmpTrueVal(const TargetLowering &TLI, bool IsVector,
+ bool IsFP) {
+ switch (TLI.getBooleanContents(IsVector, IsFP)) {
+ case TargetLowering::UndefinedBooleanContent:
+ case TargetLowering::ZeroOrOneBooleanContent:
+ return 1;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return -1;
+ }
+ llvm_unreachable("Invalid boolean contents");
+}
+
+bool llvm::shouldOptForSize(const MachineBasicBlock &MBB,
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+ const auto &F = MBB.getParent()->getFunction();
+ return F.hasOptSize() || F.hasMinSize() ||
+ llvm::shouldOptimizeForSize(MBB.getBasicBlock(), PSI, BFI);
+}
+
+void llvm::saveUsesAndErase(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver,
+ SmallInstListTy &DeadInstChain) {
+ for (MachineOperand &Op : MI.uses()) {
+ if (Op.isReg() && Op.getReg().isVirtual())
+ DeadInstChain.insert(MRI.getVRegDef(Op.getReg()));
+ }
+ LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n");
+ DeadInstChain.remove(&MI);
+ MI.eraseFromParent();
+ if (LocObserver)
+ LocObserver->checkpoint(false);
+}
+
+void llvm::eraseInstrs(ArrayRef<MachineInstr *> DeadInstrs,
+ MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver) {
+ SmallInstListTy DeadInstChain;
+ for (MachineInstr *MI : DeadInstrs)
+ saveUsesAndErase(*MI, MRI, LocObserver, DeadInstChain);
+
+ while (!DeadInstChain.empty()) {
+ MachineInstr *Inst = DeadInstChain.pop_back_val();
+ if (!isTriviallyDead(*Inst, MRI))
+ continue;
+ saveUsesAndErase(*Inst, MRI, LocObserver, DeadInstChain);
+ }
+}
+
+void llvm::eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LostDebugLocObserver *LocObserver) {
+ return eraseInstrs({&MI}, MRI, LocObserver);
+}
+
+void llvm::salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI) {
+ for (auto &Def : MI.defs()) {
+ assert(Def.isReg() && "Must be a reg");
+
+ SmallVector<MachineOperand *, 16> DbgUsers;
+ for (auto &MOUse : MRI.use_operands(Def.getReg())) {
+ MachineInstr *DbgValue = MOUse.getParent();
+ // Ignore partially formed DBG_VALUEs.
+ if (DbgValue->isNonListDebugValue() && DbgValue->getNumOperands() == 4) {
+ DbgUsers.push_back(&MOUse);
+ }
+ }
+
+ if (!DbgUsers.empty()) {
+ salvageDebugInfoForDbgValue(MRI, MI, DbgUsers);
+ }
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp
new file mode 100644
index 000000000000..f259cbc1d788
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -0,0 +1,706 @@
+//===- GlobalMerge.cpp - Internal globals merging -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass merges globals with internal linkage into one. This way all the
+// globals which were merged into the biggest one can be addressed using
+// offsets from the same base pointer (there is no need for a separate base
+// pointer for each of the globals). Such a transformation can significantly
+// reduce the register pressure when many globals are involved.
+//
+// For example, consider the code which touches several global variables at
+// once:
+//
+// static int foo[N], bar[N], baz[N];
+//
+// for (i = 0; i < N; ++i) {
+// foo[i] = bar[i] * baz[i];
+// }
+//
+// On ARM the addresses of the 3 arrays have to be kept in registers, thus
+// this code has quite high register pressure (loop body):
+//
+// ldr r1, [r5], #4
+// ldr r2, [r6], #4
+// mul r1, r2, r1
+// str r1, [r0], #4
+//
+// The pass converts the code to something like:
+//
+// static struct {
+// int foo[N];
+// int bar[N];
+// int baz[N];
+// } merged;
+//
+// for (i = 0; i < N; ++i) {
+// merged.foo[i] = merged.bar[i] * merged.baz[i];
+// }
+//
+// and in ARM code this becomes:
+//
+// ldr r0, [r5, #40]
+// ldr r1, [r5, #80]
+// mul r0, r1, r0
+// str r0, [r5], #4
+//
+// Note that we saved 2 registers here almost "for free".
+//
+// However, merging globals can have tradeoffs:
+// - it confuses debuggers, tools, and users
+// - it makes linker optimizations less useful (order files, LOHs, ...)
+// - it forces usage of indexed addressing (which isn't necessarily "free")
+// - it can increase register pressure when the uses are disparate enough.
+//
+// We use heuristics to discover the best global grouping we can (cf cl::opts).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "global-merge"
+
+// FIXME: This is only useful as a last-resort way to disable the pass.
+static cl::opt<bool>
+EnableGlobalMerge("enable-global-merge", cl::Hidden,
+ cl::desc("Enable the global merge pass"),
+ cl::init(true));
+
+static cl::opt<unsigned>
+GlobalMergeMaxOffset("global-merge-max-offset", cl::Hidden,
+ cl::desc("Set maximum offset for global merge pass"),
+ cl::init(0));
+
+static cl::opt<bool> GlobalMergeGroupByUse(
+ "global-merge-group-by-use", cl::Hidden,
+ cl::desc("Improve global merge pass to look at uses"), cl::init(true));
+
+static cl::opt<bool> GlobalMergeIgnoreSingleUse(
+ "global-merge-ignore-single-use", cl::Hidden,
+ cl::desc("Improve global merge pass to ignore globals only used alone"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
+ cl::desc("Enable global merge pass on constants"),
+ cl::init(false));
+
+// FIXME: This could be a transitional option; we probably need to remove it
+// once we are sure this optimization always benefits all targets.
+static cl::opt<cl::boolOrDefault>
+EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden,
+ cl::desc("Enable global merge pass on external linkage"));
+
+STATISTIC(NumMerged, "Number of globals merged");
+
+namespace {
+
+ class GlobalMerge : public FunctionPass {
+ const TargetMachine *TM = nullptr;
+
+ // FIXME: Infer the maximum possible offset depending on the actual users
+ // (these max offsets are different for the users inside Thumb or ARM
+ // functions), see the code that passes in the offset in the ARM backend
+ // for more information.
+ unsigned MaxOffset;
+
+ /// Whether we should try to optimize for size only.
+ /// Currently, this applies a dead simple heuristic: only consider globals
+ /// used in minsize functions for merging.
+ /// FIXME: This could learn about optsize, and be used in the cost model.
+ bool OnlyOptimizeForSize = false;
+
+ /// Whether we should merge global variables that have external linkage.
+ bool MergeExternalGlobals = false;
+
+ bool IsMachO = false;
+
+ bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+ Module &M, bool isConst, unsigned AddrSpace) const;
+
+ /// Merge everything in \p Globals for which the corresponding bit
+ /// in \p GlobalSet is set.
+ bool doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
+ const BitVector &GlobalSet, Module &M, bool isConst,
+ unsigned AddrSpace) const;
+
+ /// Check whether the given variable has been identified as one that must
+ /// be kept.
+ /// \pre setMustKeepGlobalVariables must have been called on the Module
+ /// that contains GV.
+ bool isMustKeepGlobalVariable(const GlobalVariable *GV) const {
+ return MustKeepGlobalVariables.count(GV);
+ }
+
+ /// Collect every variable marked as "used" or used in a landing pad
+ /// instruction for this Module.
+ void setMustKeepGlobalVariables(Module &M);
+
+ /// Collect every variable marked as "used".
+ void collectUsedGlobalVariables(Module &M, StringRef Name);
+
+ /// Keep track of the GlobalVariables that must not be merged away.
+ SmallSetVector<const GlobalVariable *, 16> MustKeepGlobalVariables;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ explicit GlobalMerge()
+ : FunctionPass(ID), MaxOffset(GlobalMergeMaxOffset) {
+ initializeGlobalMergePass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit GlobalMerge(const TargetMachine *TM, unsigned MaximalOffset,
+ bool OnlyOptimizeForSize, bool MergeExternalGlobals)
+ : FunctionPass(ID), TM(TM), MaxOffset(MaximalOffset),
+ OnlyOptimizeForSize(OnlyOptimizeForSize),
+ MergeExternalGlobals(MergeExternalGlobals) {
+ initializeGlobalMergePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ bool doFinalization(Module &M) override;
+
+ StringRef getPassName() const override { return "Merge internal globals"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+ };
+
+} // end anonymous namespace
+
+char GlobalMerge::ID = 0;
+
+INITIALIZE_PASS(GlobalMerge, DEBUG_TYPE, "Merge global variables", false, false)
+
+bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+ Module &M, bool isConst, unsigned AddrSpace) const {
+ auto &DL = M.getDataLayout();
+ // FIXME: Find better heuristics
+ llvm::stable_sort(
+ Globals, [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) {
+ // We don't support scalable global variables.
+ return DL.getTypeAllocSize(GV1->getValueType()).getFixedValue() <
+ DL.getTypeAllocSize(GV2->getValueType()).getFixedValue();
+ });
+
+ // If we want to just blindly group all globals together, do so.
+ if (!GlobalMergeGroupByUse) {
+ BitVector AllGlobals(Globals.size());
+ AllGlobals.set();
+ return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
+ }
+
+ // If we want to be smarter, look at all uses of each global, to try to
+ // discover all sets of globals used together, and how many times each of
+ // these sets occurred.
+ //
+ // Keep this reasonably efficient, by having an append-only list of all sets
+ // discovered so far (UsedGlobalSet), and mapping each "together-ness" unit of
+ // code (currently, a Function) to the set of globals seen so far that are
+ // used together in that unit (GlobalUsesByFunction).
+ //
+ // When we look at the Nth global, we know that any new set is either:
+ // - the singleton set {N}, containing this global only, or
+ // - the union of {N} and a previously-discovered set, containing some
+ // combination of the previous N-1 globals.
+ // Using that knowledge, when looking at the Nth global, we can keep:
+ // - a reference to the singleton set {N} (CurGVOnlySetIdx)
+ // - a list mapping each previous set to its union with {N} (EncounteredUGS),
+ // if it actually occurs.
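+ //
+ // As a tiny illustrative example: with globals {a, b, c}, a function that
+ // uses only a and b ends up mapped to the set {a, b}, while one that uses
+ // all three ends up mapped to {a, b, c}. Each set's UsageCount is a rough
+ // measure of how often that exact combination was seen used together.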
+
+ // We keep track of the sets of globals used together "close enough".
+ struct UsedGlobalSet {
+ BitVector Globals;
+ unsigned UsageCount = 1;
+
+ UsedGlobalSet(size_t Size) : Globals(Size) {}
+ };
+
+ // Each set is unique in UsedGlobalSets.
+ std::vector<UsedGlobalSet> UsedGlobalSets;
+
+ // Avoid repeating the create-global-set pattern.
+ auto CreateGlobalSet = [&]() -> UsedGlobalSet & {
+ UsedGlobalSets.emplace_back(Globals.size());
+ return UsedGlobalSets.back();
+ };
+
+ // The first set is the empty set.
+ CreateGlobalSet().UsageCount = 0;
+
+ // We define "close enough" to be "in the same function".
+ // FIXME: Grouping uses by function is way too aggressive, so we should have
+ // a better metric for distance between uses.
+ // The obvious alternative would be to group by BasicBlock, but that's in
+ // turn too conservative.
+ // Anything in between wouldn't be trivial to compute, so just stick with
+ // per-function grouping.
+
+ // The value type is an index into UsedGlobalSets.
+ // The default (0) conveniently points to the empty set.
+ DenseMap<Function *, size_t /*UsedGlobalSetIdx*/> GlobalUsesByFunction;
+
+ // Now, look at each merge-eligible global in turn.
+
+ // Keep track of the sets we already encountered to which we added the
+ // current global.
+ // Each element matches the same-index element in UsedGlobalSets.
+ // This lets us efficiently tell whether a set has already been expanded to
+ // include the current global.
+ std::vector<size_t> EncounteredUGS;
+
+ for (size_t GI = 0, GE = Globals.size(); GI != GE; ++GI) {
+ GlobalVariable *GV = Globals[GI];
+
+ // Reset the encountered sets for this global...
+ std::fill(EncounteredUGS.begin(), EncounteredUGS.end(), 0);
+ // ...and grow it in case we created new sets for the previous global.
+ EncounteredUGS.resize(UsedGlobalSets.size());
+
+ // We might need to create a set that only consists of the current global.
+ // Keep track of its index into UsedGlobalSets.
+ size_t CurGVOnlySetIdx = 0;
+
+ // For each global, look at all its Uses.
+ for (auto &U : GV->uses()) {
+ // This Use might be a ConstantExpr. We're interested in Instruction
+ // users, so look through ConstantExpr...
+ Use *UI, *UE;
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
+ if (CE->use_empty())
+ continue;
+ UI = &*CE->use_begin();
+ UE = nullptr;
+ } else if (isa<Instruction>(U.getUser())) {
+ UI = &U;
+ UE = UI->getNext();
+ } else {
+ continue;
+ }
+
+ // ...to iterate on all the instruction users of the global.
+ // Note that we iterate on Uses and not on Users to be able to getNext().
+ for (; UI != UE; UI = UI->getNext()) {
+ Instruction *I = dyn_cast<Instruction>(UI->getUser());
+ if (!I)
+ continue;
+
+ Function *ParentFn = I->getParent()->getParent();
+
+ // If we're only optimizing for size, ignore non-minsize functions.
+ if (OnlyOptimizeForSize && !ParentFn->hasMinSize())
+ continue;
+
+ size_t UGSIdx = GlobalUsesByFunction[ParentFn];
+
+ // If this is the first global this function uses, map it to the set
+ // consisting of this global only.
+ if (!UGSIdx) {
+ // If that set doesn't exist yet, create it.
+ if (!CurGVOnlySetIdx) {
+ CurGVOnlySetIdx = UsedGlobalSets.size();
+ CreateGlobalSet().Globals.set(GI);
+ } else {
+ ++UsedGlobalSets[CurGVOnlySetIdx].UsageCount;
+ }
+
+ GlobalUsesByFunction[ParentFn] = CurGVOnlySetIdx;
+ continue;
+ }
+
+ // If the function's current set already includes this global, just
+ // increment the counter.
+ if (UsedGlobalSets[UGSIdx].Globals.test(GI)) {
+ ++UsedGlobalSets[UGSIdx].UsageCount;
+ continue;
+ }
+
+ // If not, the previous set wasn't actually used in this function.
+ --UsedGlobalSets[UGSIdx].UsageCount;
+
+ // If we already expanded the previous set to include this global, just
+ // reuse that expanded set.
+ if (size_t ExpandedIdx = EncounteredUGS[UGSIdx]) {
+ ++UsedGlobalSets[ExpandedIdx].UsageCount;
+ GlobalUsesByFunction[ParentFn] = ExpandedIdx;
+ continue;
+ }
+
+ // If not, create a new set consisting of the union of the previous set
+ // and this global. Mark it as encountered, so we can reuse it later.
+ GlobalUsesByFunction[ParentFn] = EncounteredUGS[UGSIdx] =
+ UsedGlobalSets.size();
+
+ UsedGlobalSet &NewUGS = CreateGlobalSet();
+ NewUGS.Globals.set(GI);
+ NewUGS.Globals |= UsedGlobalSets[UGSIdx].Globals;
+ }
+ }
+ }
+
+ // Now we have found a bunch of sets of globals used together. We
+ // accumulated the number of times we encountered each set (i.e., how often
+ // that exact set of globals was seen used together within one function).
+ //
+ // Multiply that by the size of the set to give us a crude profitability
+ // metric.
+ llvm::stable_sort(UsedGlobalSets,
+ [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) {
+ return UGS1.Globals.count() * UGS1.UsageCount <
+ UGS2.Globals.count() * UGS2.UsageCount;
+ });
+
+ // We can choose to merge all globals together, but ignore globals never used
+ // with another global. This catches the obviously non-profitable cases of
+ // having a single global, but is aggressive enough for any other case.
+ if (GlobalMergeIgnoreSingleUse) {
+ BitVector AllGlobals(Globals.size());
+ for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) {
+ if (UGS.UsageCount == 0)
+ continue;
+ if (UGS.Globals.count() > 1)
+ AllGlobals |= UGS.Globals;
+ }
+ return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
+ }
+
+ // Starting from the sets with the best (=biggest) profitability, find a
+ // good combination.
+ // The ideal (and expensive) solution can only be found by trying all
+ // combinations, looking for the one with the best profitability.
+ // Don't be smart about it, and just pick the first compatible combination,
+ // starting with the sets with the best profitability.
+ BitVector PickedGlobals(Globals.size());
+ bool Changed = false;
+
+ for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) {
+ if (UGS.UsageCount == 0)
+ continue;
+ if (PickedGlobals.anyCommon(UGS.Globals))
+ continue;
+ PickedGlobals |= UGS.Globals;
+ // If the set only contains one global, there's no point in merging.
+ // Ignore the global for inclusion in other sets though, so keep it in
+ // PickedGlobals.
+ if (UGS.Globals.count() < 2)
+ continue;
+ Changed |= doMerge(Globals, UGS.Globals, M, isConst, AddrSpace);
+ }
+
+ return Changed;
+}
+
+bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
+ const BitVector &GlobalSet, Module &M, bool isConst,
+ unsigned AddrSpace) const {
+ assert(Globals.size() > 1);
+
+ Type *Int32Ty = Type::getInt32Ty(M.getContext());
+ Type *Int8Ty = Type::getInt8Ty(M.getContext());
+ auto &DL = M.getDataLayout();
+
+ LLVM_DEBUG(dbgs() << " Trying to merge set, starts with #"
+ << GlobalSet.find_first() << "\n");
+
+ bool Changed = false;
+ ssize_t i = GlobalSet.find_first();
+ while (i != -1) {
+ ssize_t j = 0;
+ uint64_t MergedSize = 0;
+ std::vector<Type*> Tys;
+ std::vector<Constant*> Inits;
+ std::vector<unsigned> StructIdxs;
+
+ bool HasExternal = false;
+ StringRef FirstExternalName;
+ Align MaxAlign;
+ unsigned CurIdx = 0;
+ for (j = i; j != -1; j = GlobalSet.find_next(j)) {
+ Type *Ty = Globals[j]->getValueType();
+
+ // Make sure we use the same alignment AsmPrinter would use.
+ Align Alignment = DL.getPreferredAlign(Globals[j]);
+ unsigned Padding = alignTo(MergedSize, Alignment) - MergedSize;
+ MergedSize += Padding;
+ MergedSize += DL.getTypeAllocSize(Ty);
+ if (MergedSize > MaxOffset) {
+ break;
+ }
+ if (Padding) {
+ Tys.push_back(ArrayType::get(Int8Ty, Padding));
+ Inits.push_back(ConstantAggregateZero::get(Tys.back()));
+ ++CurIdx;
+ }
+ Tys.push_back(Ty);
+ Inits.push_back(Globals[j]->getInitializer());
+ StructIdxs.push_back(CurIdx++);
+
+ MaxAlign = std::max(MaxAlign, Alignment);
+
+ if (Globals[j]->hasExternalLinkage() && !HasExternal) {
+ HasExternal = true;
+ FirstExternalName = Globals[j]->getName();
+ }
+ }
+
+ // Exit early if there is only one global to merge.
+ if (Tys.size() < 2) {
+ i = j;
+ continue;
+ }
+
+ // If the merged variables don't have external linkage, we don't need to
+ // expose the symbol after merging.
+ GlobalValue::LinkageTypes Linkage = HasExternal
+ ? GlobalValue::ExternalLinkage
+ : GlobalValue::InternalLinkage;
+ // Use a packed struct so we can control alignment.
+ StructType *MergedTy = StructType::get(M.getContext(), Tys, true);
+ Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
+
+ // On Darwin external linkage needs to be preserved, otherwise
+ // dsymutil cannot preserve the debug info for the merged
+ // variables. If they have external linkage, use the symbol name
+ // of the first variable merged as the suffix of the global symbol
+ // name. This avoids a link-time naming conflict for the
+ // _MergedGlobals symbols.
+ Twine MergedName =
+ (IsMachO && HasExternal)
+ ? "_MergedGlobals_" + FirstExternalName
+ : "_MergedGlobals";
+ auto MergedLinkage = IsMachO ? Linkage : GlobalValue::PrivateLinkage;
+ auto *MergedGV = new GlobalVariable(
+ M, MergedTy, isConst, MergedLinkage, MergedInit, MergedName, nullptr,
+ GlobalVariable::NotThreadLocal, AddrSpace);
+
+ MergedGV->setAlignment(MaxAlign);
+ MergedGV->setSection(Globals[i]->getSection());
+
+ const StructLayout *MergedLayout = DL.getStructLayout(MergedTy);
+ for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) {
+ GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage();
+ std::string Name(Globals[k]->getName());
+ GlobalValue::VisibilityTypes Visibility = Globals[k]->getVisibility();
+ GlobalValue::DLLStorageClassTypes DLLStorage =
+ Globals[k]->getDLLStorageClass();
+
+ // Copy metadata while adjusting any debug info metadata by the original
+ // global's offset within the merged global.
+ MergedGV->copyMetadata(Globals[k],
+ MergedLayout->getElementOffset(StructIdxs[idx]));
+
+ Constant *Idx[2] = {
+ ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, StructIdxs[idx]),
+ };
+ Constant *GEP =
+ ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, Idx);
+ Globals[k]->replaceAllUsesWith(GEP);
+ Globals[k]->eraseFromParent();
+
+ // When the linkage is not internal we must emit an alias for the original
+ // variable name as it may be accessed from another object. On non-Mach-O
+ // we can also emit an alias for internal linkage as it's safe to do so.
+ // It's not safe on Mach-O as the alias (and thus the portion of the
+ // MergedGlobals variable) may be dead stripped at link time.
+ if (Linkage != GlobalValue::InternalLinkage || !IsMachO) {
+ GlobalAlias *GA = GlobalAlias::create(Tys[StructIdxs[idx]], AddrSpace,
+ Linkage, Name, GEP, &M);
+ GA->setVisibility(Visibility);
+ GA->setDLLStorageClass(DLLStorage);
+ }
+
+ NumMerged++;
+ }
+ Changed = true;
+ i = j;
+ }
+
+ return Changed;
+}
+
+void GlobalMerge::collectUsedGlobalVariables(Module &M, StringRef Name) {
+ // Extract global variables from llvm.used array
+ const GlobalVariable *GV = M.getGlobalVariable(Name);
+ if (!GV || !GV->hasInitializer()) return;
+
+ // Should be an array of 'i8*'.
+ const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer());
+
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+ if (const GlobalVariable *G =
+ dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts()))
+ MustKeepGlobalVariables.insert(G);
+}
+
+void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
+ collectUsedGlobalVariables(M, "llvm.used");
+ collectUsedGlobalVariables(M, "llvm.compiler.used");
+
+ for (Function &F : M) {
+ for (BasicBlock &BB : F) {
+ Instruction *Pad = BB.getFirstNonPHI();
+ if (!Pad->isEHPad())
+ continue;
+
+ // Keep globals used by landingpads and catchpads.
+ for (const Use &U : Pad->operands()) {
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(U->stripPointerCasts()))
+ MustKeepGlobalVariables.insert(GV);
+ else if (const ConstantArray *CA = dyn_cast<ConstantArray>(U->stripPointerCasts())) {
+ for (const Use &Elt : CA->operands()) {
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(Elt->stripPointerCasts()))
+ MustKeepGlobalVariables.insert(GV);
+ }
+ }
+ }
+ }
+ }
+}
+
+bool GlobalMerge::doInitialization(Module &M) {
+ if (!EnableGlobalMerge)
+ return false;
+
+ IsMachO = Triple(M.getTargetTriple()).isOSBinFormatMachO();
+
+ auto &DL = M.getDataLayout();
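+  // Candidate globals are bucketed by (address space, section) so that only
+  // compatible globals end up merged together.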
+ DenseMap<std::pair<unsigned, StringRef>, SmallVector<GlobalVariable *, 16>>
+ Globals, ConstGlobals, BSSGlobals;
+ bool Changed = false;
+ setMustKeepGlobalVariables(M);
+
+ LLVM_DEBUG({
+ dbgs() << "Number of GV that must be kept: " <<
+ MustKeepGlobalVariables.size() << "\n";
+ for (const GlobalVariable *KeptGV : MustKeepGlobalVariables)
+ dbgs() << "Kept: " << *KeptGV << "\n";
+ });
+  // Collect all candidate globals for merging.
+ for (auto &GV : M.globals()) {
+ // Merge is safe for "normal" internal or external globals only
+ if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasImplicitSection())
+ continue;
+
+ // It's not safe to merge globals that may be preempted
+ if (TM && !TM->shouldAssumeDSOLocal(M, &GV))
+ continue;
+
+ if (!(MergeExternalGlobals && GV.hasExternalLinkage()) &&
+ !GV.hasInternalLinkage())
+ continue;
+
+ PointerType *PT = dyn_cast<PointerType>(GV.getType());
+ assert(PT && "Global variable is not a pointer!");
+
+ unsigned AddressSpace = PT->getAddressSpace();
+ StringRef Section = GV.getSection();
+
+ // Ignore all 'special' globals.
+ if (GV.getName().startswith("llvm.") ||
+ GV.getName().startswith(".llvm."))
+ continue;
+
+ // Ignore all "required" globals:
+ if (isMustKeepGlobalVariable(&GV))
+ continue;
+
+ // Don't merge tagged globals, as each global should have its own unique
+ // memory tag at runtime. TODO(hctim): This can be relaxed: constant globals
+ // with compatible alignment and the same contents may be merged as long as
+ // the globals occupy the same number of tag granules (i.e. `size_a / 16 ==
+ // size_b / 16`).
+ if (GV.isTagged())
+ continue;
+
+ Type *Ty = GV.getValueType();
+ if (DL.getTypeAllocSize(Ty) < MaxOffset) {
+ if (TM &&
+ TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSS())
+ BSSGlobals[{AddressSpace, Section}].push_back(&GV);
+ else if (GV.isConstant())
+ ConstGlobals[{AddressSpace, Section}].push_back(&GV);
+ else
+ Globals[{AddressSpace, Section}].push_back(&GV);
+ }
+ }
+
+ for (auto &P : Globals)
+ if (P.second.size() > 1)
+ Changed |= doMerge(P.second, M, false, P.first.first);
+
+ for (auto &P : BSSGlobals)
+ if (P.second.size() > 1)
+ Changed |= doMerge(P.second, M, false, P.first.first);
+
+ if (EnableGlobalMergeOnConst)
+ for (auto &P : ConstGlobals)
+ if (P.second.size() > 1)
+ Changed |= doMerge(P.second, M, true, P.first.first);
+
+ return Changed;
+}
+
+bool GlobalMerge::runOnFunction(Function &F) {
+ return false;
+}
+
+bool GlobalMerge::doFinalization(Module &M) {
+ MustKeepGlobalVariables.clear();
+ return false;
+}
+
+Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset,
+ bool OnlyOptimizeForSize,
+ bool MergeExternalByDefault) {
+ bool MergeExternal = (EnableGlobalMergeOnExternal == cl::BOU_UNSET) ?
+ MergeExternalByDefault : (EnableGlobalMergeOnExternal == cl::BOU_TRUE);
+ return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/HardwareLoops.cpp b/contrib/llvm-project/llvm/lib/CodeGen/HardwareLoops.cpp
new file mode 100644
index 000000000000..e7b14d700a44
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -0,0 +1,606 @@
+//===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Insert hardware loop intrinsics into loops which are deemed profitable by
+/// the target, by querying TargetTransformInfo. A hardware loop comprises
+/// two intrinsics: one, outside the loop, to set the loop iteration count and
+/// another, in the exit block, to decrement the counter. The decremented value
+/// can either be carried through the loop via a phi or handled in some opaque
+/// way by the target.
+///
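+/// As a rough, illustrative sketch (not taken from any particular test), a
+/// converted counted loop ends up looking something like:
+///
+///   entry:
+///     call void @llvm.set.loop.iterations.i32(i32 %n)
+///     br label %loop
+///   loop:
+///     ...
+///     %cont = call i1 @llvm.loop.decrement.i32(i32 1)
+///     br i1 %cont, label %loop, label %exit
+///
+/// with the phi-carried llvm.start.loop.iterations / llvm.loop.decrement.reg
+/// forms used instead when the target wants the counter kept in a register.
+///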
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/HardwareLoops.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+#define DEBUG_TYPE "hardware-loops"
+
+#define HW_LOOPS_NAME "Hardware Loop Insertion"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
+                   cl::desc("Force hardware loop intrinsics to be inserted"));
+
+static cl::opt<bool>
+ForceHardwareLoopPHI(
+ "force-hardware-loop-phi", cl::Hidden, cl::init(false),
+ cl::desc("Force hardware loop counter to be updated through a phi"));
+
+static cl::opt<bool>
+ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
+ cl::desc("Force allowance of nested hardware loops"));
+
+static cl::opt<unsigned>
+LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
+ cl::desc("Set the loop decrement value"));
+
+static cl::opt<unsigned>
+CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
+ cl::desc("Set the loop counter bitwidth"));
+
+static cl::opt<bool>
+ForceGuardLoopEntry(
+ "force-hardware-loop-guard", cl::Hidden, cl::init(false),
+ cl::desc("Force generation of loop guard intrinsic"));
+
+STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
+
+#ifndef NDEBUG
+static void debugHWLoopFailure(const StringRef DebugMsg,
+ Instruction *I) {
+ dbgs() << "HWLoops: " << DebugMsg;
+ if (I)
+ dbgs() << ' ' << *I;
+ else
+ dbgs() << '.';
+ dbgs() << '\n';
+}
+#endif
+
+static OptimizationRemarkAnalysis
+createHWLoopAnalysis(StringRef RemarkName, Loop *L, Instruction *I) {
+ Value *CodeRegion = L->getHeader();
+ DebugLoc DL = L->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+    // If there is no debug location attached to the instruction, fall back to
+    // using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion);
+ R << "hardware-loop not created: ";
+ return R;
+}
+
+namespace {
+
+ void reportHWLoopFailure(const StringRef Msg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) {
+ LLVM_DEBUG(debugHWLoopFailure(Msg, I));
+ ORE->emit(createHWLoopAnalysis(ORETag, TheLoop, I) << Msg);
+ }
+
+ using TTI = TargetTransformInfo;
+
+ class HardwareLoopsLegacy : public FunctionPass {
+ public:
+ static char ID;
+
+ HardwareLoopsLegacy() : FunctionPass(ID) {
+ initializeHardwareLoopsLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<BranchProbabilityInfoWrapperPass>();
+ }
+ };
+
+ class HardwareLoopsImpl {
+ public:
+ HardwareLoopsImpl(ScalarEvolution &SE, LoopInfo &LI, bool PreserveLCSSA,
+ DominatorTree &DT, const DataLayout &DL,
+ const TargetTransformInfo &TTI, TargetLibraryInfo *TLI,
+ AssumptionCache &AC, OptimizationRemarkEmitter *ORE,
+ HardwareLoopOptions &Opts)
+ : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT), DL(DL), TTI(TTI),
+ TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { }
+
+ bool run(Function &F);
+
+ private:
+ // Try to convert the given Loop into a hardware loop.
+ bool TryConvertLoop(Loop *L, LLVMContext &Ctx);
+
+ // Given that the target believes the loop to be profitable, try to
+ // convert it.
+ bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo);
+
+ ScalarEvolution &SE;
+ LoopInfo &LI;
+ bool PreserveLCSSA;
+ DominatorTree &DT;
+ const DataLayout &DL;
+ const TargetTransformInfo &TTI;
+ TargetLibraryInfo *TLI = nullptr;
+ AssumptionCache &AC;
+ OptimizationRemarkEmitter *ORE;
+ HardwareLoopOptions &Opts;
+ bool MadeChange = false;
+ };
+
+ class HardwareLoop {
+ // Expand the trip count scev into a value that we can use.
+ Value *InitLoopCount();
+
+ // Insert the set_loop_iteration intrinsic.
+ Value *InsertIterationSetup(Value *LoopCountInit);
+
+ // Insert the loop_decrement intrinsic.
+ void InsertLoopDec();
+
+ // Insert the loop_decrement_reg intrinsic.
+ Instruction *InsertLoopRegDec(Value *EltsRem);
+
+ // If the target requires the counter value to be updated in the loop,
+ // insert a phi to hold the value. The intended purpose is for use by
+ // loop_decrement_reg.
+ PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
+
+ // Create a new cmp, that checks the returned value of loop_decrement*,
+ // and update the exit branch to use it.
+ void UpdateBranch(Value *EltsRem);
+
+ public:
+ HardwareLoop(HardwareLoopInfo &Info, ScalarEvolution &SE,
+ const DataLayout &DL,
+ OptimizationRemarkEmitter *ORE,
+ HardwareLoopOptions &Opts) :
+ SE(SE), DL(DL), ORE(ORE), Opts(Opts), L(Info.L), M(L->getHeader()->getModule()),
+ ExitCount(Info.ExitCount),
+ CountType(Info.CountType),
+ ExitBranch(Info.ExitBranch),
+ LoopDecrement(Info.LoopDecrement),
+ UsePHICounter(Info.CounterInReg),
+ UseLoopGuard(Info.PerformEntryTest) { }
+
+ void Create();
+
+ private:
+ ScalarEvolution &SE;
+ const DataLayout &DL;
+ OptimizationRemarkEmitter *ORE = nullptr;
+ HardwareLoopOptions &Opts;
+ Loop *L = nullptr;
+ Module *M = nullptr;
+ const SCEV *ExitCount = nullptr;
+ Type *CountType = nullptr;
+ BranchInst *ExitBranch = nullptr;
+ Value *LoopDecrement = nullptr;
+ bool UsePHICounter = false;
+ bool UseLoopGuard = false;
+ BasicBlock *BeginBB = nullptr;
+ };
+}
+
+char HardwareLoopsLegacy::ID = 0;
+
+bool HardwareLoopsLegacy::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
+
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DL = F.getParent()->getDataLayout();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ HardwareLoopOptions Opts;
+ if (ForceHardwareLoops.getNumOccurrences())
+ Opts.setForce(ForceHardwareLoops);
+ if (ForceHardwareLoopPHI.getNumOccurrences())
+ Opts.setForcePhi(ForceHardwareLoopPHI);
+ if (ForceNestedLoop.getNumOccurrences())
+ Opts.setForceNested(ForceNestedLoop);
+ if (ForceGuardLoopEntry.getNumOccurrences())
+ Opts.setForceGuard(ForceGuardLoopEntry);
+ if (LoopDecrement.getNumOccurrences())
+ Opts.setDecrement(LoopDecrement);
+ if (CounterBitWidth.getNumOccurrences())
+ Opts.setCounterBitwidth(CounterBitWidth);
+
+ HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT, DL, TTI, TLI, AC, ORE,
+ Opts);
+ return Impl.run(F);
+}
+
+PreservedAnalyses HardwareLoopsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto &DL = F.getParent()->getDataLayout();
+
+ HardwareLoopsImpl Impl(SE, LI, true, DT, DL, TTI, TLI, AC, ORE, Opts);
+ bool Changed = Impl.run(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<BranchProbabilityAnalysis>();
+ return PA;
+}
+
+bool HardwareLoopsImpl::run(Function &F) {
+ LLVMContext &Ctx = F.getParent()->getContext();
+ for (Loop *L : LI)
+ if (L->isOutermost())
+ TryConvertLoop(L, Ctx);
+ return MadeChange;
+}
+
+// Return true if the search should stop, which will be when an inner loop is
+// converted and the parent loop doesn't support containing a hardware loop.
+bool HardwareLoopsImpl::TryConvertLoop(Loop *L, LLVMContext &Ctx) {
+ // Process nested loops first.
+ bool AnyChanged = false;
+ for (Loop *SL : *L)
+ AnyChanged |= TryConvertLoop(SL, Ctx);
+ if (AnyChanged) {
+ reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
+ ORE, L);
+ return true; // Stop search.
+ }
+
+ LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n");
+
+ HardwareLoopInfo HWLoopInfo(L);
+ if (!HWLoopInfo.canAnalyze(LI)) {
+ reportHWLoopFailure("cannot analyze loop, irreducible control flow",
+ "HWLoopCannotAnalyze", ORE, L);
+ return false;
+ }
+
+ if (!Opts.Force &&
+ !TTI.isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
+ reportHWLoopFailure("it's not profitable to create a hardware-loop",
+ "HWLoopNotProfitable", ORE, L);
+ return false;
+ }
+
+ // Allow overriding of the counter width and loop decrement value.
+ if (Opts.Bitwidth.has_value()) {
+ HWLoopInfo.CountType = IntegerType::get(Ctx, Opts.Bitwidth.value());
+ }
+
+ if (Opts.Decrement.has_value())
+ HWLoopInfo.LoopDecrement =
+ ConstantInt::get(HWLoopInfo.CountType, Opts.Decrement.value());
+
+ MadeChange |= TryConvertLoop(HWLoopInfo);
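+  // Stop the search (return true) if a hardware loop has been created and the
+  // target does not allow it to be nested inside another hardware loop.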
+ return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.ForceNested);
+}
+
+bool HardwareLoopsImpl::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) {
+
+ Loop *L = HWLoopInfo.L;
+ LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
+
+ if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT, Opts.getForceNested(),
+ Opts.getForcePhi())) {
+ // TODO: there can be many reasons a loop is not considered a
+ // candidate, so we should let isHardwareLoopCandidate fill in the
+ // reason and then report a better message here.
+ reportHWLoopFailure("loop is not a candidate", "HWLoopNoCandidate", ORE, L);
+ return false;
+ }
+
+ assert(
+ (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) &&
+ "Hardware Loop must have set exit info.");
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // If we don't have a preheader, then insert one.
+ if (!Preheader)
+ Preheader = InsertPreheaderForLoop(L, &DT, &LI, nullptr, PreserveLCSSA);
+ if (!Preheader)
+ return false;
+
+ HardwareLoop HWLoop(HWLoopInfo, SE, DL, ORE, Opts);
+ HWLoop.Create();
+ ++NumHWLoops;
+ return true;
+}
+
+void HardwareLoop::Create() {
+  LLVM_DEBUG(dbgs() << "HWLoops: Converting loop...\n");
+
+ Value *LoopCountInit = InitLoopCount();
+ if (!LoopCountInit) {
+ reportHWLoopFailure("could not safely create a loop count expression",
+ "HWLoopNotSafe", ORE, L);
+ return;
+ }
+
+ Value *Setup = InsertIterationSetup(LoopCountInit);
+
+ if (UsePHICounter || Opts.ForcePhi) {
+ Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
+ Value *EltsRem = InsertPHICounter(Setup, LoopDec);
+ LoopDec->setOperand(0, EltsRem);
+ UpdateBranch(LoopDec);
+ } else
+ InsertLoopDec();
+
+ // Run through the basic blocks of the loop and see if any of them have dead
+ // PHIs that can be removed.
+ for (auto *I : L->blocks())
+ DeleteDeadPHIs(I);
+}
+
+static bool CanGenerateTest(Loop *L, Value *Count) {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader->getSinglePredecessor())
+ return false;
+
+ BasicBlock *Pred = Preheader->getSinglePredecessor();
+ if (!isa<BranchInst>(Pred->getTerminator()))
+ return false;
+
+ auto *BI = cast<BranchInst>(Pred->getTerminator());
+ if (BI->isUnconditional() || !isa<ICmpInst>(BI->getCondition()))
+ return false;
+
+ // Check that the icmp is checking for equality of Count and zero and that
+ // a non-zero value results in entering the loop.
+ auto ICmp = cast<ICmpInst>(BI->getCondition());
+ LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n");
+ if (!ICmp->isEquality())
+ return false;
+
+ auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) {
+ if (auto *Const = dyn_cast<ConstantInt>(ICmp->getOperand(OpIdx)))
+ return Const->isZero() && ICmp->getOperand(OpIdx ^ 1) == Count;
+ return false;
+ };
+
+ // Check if Count is a zext.
+ Value *CountBefZext =
+ isa<ZExtInst>(Count) ? cast<ZExtInst>(Count)->getOperand(0) : nullptr;
+
+ if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) &&
+ !IsCompareZero(ICmp, CountBefZext, 0) &&
+ !IsCompareZero(ICmp, CountBefZext, 1))
+ return false;
+
+ unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1;
+ if (BI->getSuccessor(SuccIdx) != Preheader)
+ return false;
+
+ return true;
+}
+
+Value *HardwareLoop::InitLoopCount() {
+ LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n");
+ // Can we replace a conditional branch with an intrinsic that sets the
+  // loop counter and tests that it is not zero?
+
+ SCEVExpander SCEVE(SE, DL, "loopcnt");
+ if (!ExitCount->getType()->isPointerTy() &&
+ ExitCount->getType() != CountType)
+ ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
+
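+  // ExitCount is the number of times the backedge is taken, so add one to get
+  // the trip count that the hardware counter must run for.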
+ ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
+
+ // If we're trying to use the 'test and set' form of the intrinsic, we need
+ // to replace a conditional branch that is controlling entry to the loop. It
+ // is likely (guaranteed?) that the preheader has an unconditional branch to
+ // the loop header, so also check if it has a single predecessor.
+ if (SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, ExitCount,
+ SE.getZero(ExitCount->getType()))) {
+ LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n");
+ if (Opts.ForceGuard)
+ UseLoopGuard = true;
+ } else
+ UseLoopGuard = false;
+
+ BasicBlock *BB = L->getLoopPreheader();
+ if (UseLoopGuard && BB->getSinglePredecessor() &&
+ cast<BranchInst>(BB->getTerminator())->isUnconditional()) {
+ BasicBlock *Predecessor = BB->getSinglePredecessor();
+    // If it's not safe to create a while loop then don't force it and create
+    // a do-while loop instead.
+ if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->getTerminator()))
+ UseLoopGuard = false;
+ else
+ BB = Predecessor;
+ }
+
+ if (!SCEVE.isSafeToExpandAt(ExitCount, BB->getTerminator())) {
+ LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
+ << *ExitCount << "\n");
+ return nullptr;
+ }
+
+ Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
+ BB->getTerminator());
+
+  // FIXME: We've expanded Count where we hope to insert the counter setting
+  // intrinsic. But, in the case of the 'test and set' form, we may fall back
+  // to just the 'set' form, in which case the insertion block is most likely
+  // different. That means there will be instruction(s) in a block that
+  // possibly aren't needed. The isLoopEntryGuardedByCond check is trying to
+  // avoid this issue, but it doesn't appear to work in all cases.
+
+ UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count);
+ BeginBB = UseLoopGuard ? BB : L->getLoopPreheader();
+ LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n"
+ << " - Expanded Count in " << BB->getName() << "\n"
+ << " - Will insert set counter intrinsic into: "
+ << BeginBB->getName() << "\n");
+ return Count;
+}
+
+Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) {
+ IRBuilder<> Builder(BeginBB->getTerminator());
+ Type *Ty = LoopCountInit->getType();
+ bool UsePhi = UsePHICounter || Opts.ForcePhi;
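+  // Pick the 'test' variants when we are also guarding entry to the loop, and
+  // the 'start' variants when the counter value is carried through a phi.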
+ Intrinsic::ID ID = UseLoopGuard
+ ? (UsePhi ? Intrinsic::test_start_loop_iterations
+ : Intrinsic::test_set_loop_iterations)
+ : (UsePhi ? Intrinsic::start_loop_iterations
+ : Intrinsic::set_loop_iterations);
+ Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty);
+ Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit);
+
+ // Use the return value of the intrinsic to control the entry of the loop.
+ if (UseLoopGuard) {
+ assert((isa<BranchInst>(BeginBB->getTerminator()) &&
+ cast<BranchInst>(BeginBB->getTerminator())->isConditional()) &&
+ "Expected conditional branch");
+
+ Value *SetCount =
+ UsePhi ? Builder.CreateExtractValue(LoopSetup, 1) : LoopSetup;
+ auto *LoopGuard = cast<BranchInst>(BeginBB->getTerminator());
+ LoopGuard->setCondition(SetCount);
+ if (LoopGuard->getSuccessor(0) != L->getLoopPreheader())
+ LoopGuard->swapSuccessors();
+ }
+ LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup
+ << "\n");
+ if (UsePhi && UseLoopGuard)
+ LoopSetup = Builder.CreateExtractValue(LoopSetup, 0);
+ return !UsePhi ? LoopCountInit : LoopSetup;
+}
+
+void HardwareLoop::InsertLoopDec() {
+ IRBuilder<> CondBuilder(ExitBranch);
+
+ Function *DecFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
+ LoopDecrement->getType());
+ Value *Ops[] = { LoopDecrement };
+ Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
+ Value *OldCond = ExitBranch->getCondition();
+ ExitBranch->setCondition(NewCond);
+
+ // The false branch must exit the loop.
+ if (!L->contains(ExitBranch->getSuccessor(0)))
+ ExitBranch->swapSuccessors();
+
+ // The old condition may be dead now, and may have even created a dead PHI
+ // (the original induction variable).
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+
+ LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
+}
+
+Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
+ IRBuilder<> CondBuilder(ExitBranch);
+
+ Function *DecFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
+ { EltsRem->getType() });
+ Value *Ops[] = { EltsRem, LoopDecrement };
+ Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
+
+ LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
+ return cast<Instruction>(Call);
+}
+
+PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = ExitBranch->getParent();
+ IRBuilder<> Builder(Header->getFirstNonPHI());
+ PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
+ Index->addIncoming(NumElts, Preheader);
+ Index->addIncoming(EltsRem, Latch);
+ LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
+ return Index;
+}
+
+void HardwareLoop::UpdateBranch(Value *EltsRem) {
+ IRBuilder<> CondBuilder(ExitBranch);
+ Value *NewCond =
+ CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
+ Value *OldCond = ExitBranch->getCondition();
+ ExitBranch->setCondition(NewCond);
+
+ // The false branch must exit the loop.
+ if (!L->contains(ExitBranch->getSuccessor(0)))
+ ExitBranch->swapSuccessors();
+
+ // The old condition may be dead now, and may have even created a dead PHI
+ // (the original induction variable).
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+}
+
+INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
+
+FunctionPass *llvm::createHardwareLoopsLegacyPass() { return new HardwareLoopsLegacy(); }
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm-project/llvm/lib/CodeGen/IfConversion.cpp
new file mode 100644
index 000000000000..2ad5820bd9fb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/IfConversion.cpp
@@ -0,0 +1,2360 @@
+//===- IfConversion.cpp - Machine code if conversion pass -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level if-conversion pass, which
+// tries to convert conditional branches into predicated instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BranchFolding.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "if-converter"
+
+// Hidden options for help debugging.
+static cl::opt<int> IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden);
+static cl::opt<int> IfCvtFnStop("ifcvt-fn-stop", cl::init(-1), cl::Hidden);
+static cl::opt<int> IfCvtLimit("ifcvt-limit", cl::init(-1), cl::Hidden);
+static cl::opt<bool> DisableSimple("disable-ifcvt-simple",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableSimpleF("disable-ifcvt-simple-false",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangle("disable-ifcvt-triangle",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleR("disable-ifcvt-triangle-rev",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleF("disable-ifcvt-triangle-false",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableDiamond("disable-ifcvt-diamond",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableForkedDiamond("disable-ifcvt-forked-diamond",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> IfCvtBranchFold("ifcvt-branch-fold",
+ cl::init(true), cl::Hidden);
+
+STATISTIC(NumSimple, "Number of simple if-conversions performed");
+STATISTIC(NumSimpleFalse, "Number of simple (F) if-conversions performed");
+STATISTIC(NumTriangle, "Number of triangle if-conversions performed");
+STATISTIC(NumTriangleRev, "Number of triangle (R) if-conversions performed");
+STATISTIC(NumTriangleFalse,"Number of triangle (F) if-conversions performed");
+STATISTIC(NumTriangleFRev, "Number of triangle (F/R) if-conversions performed");
+STATISTIC(NumDiamonds, "Number of diamond if-conversions performed");
+STATISTIC(NumForkedDiamonds, "Number of forked-diamond if-conversions performed");
+STATISTIC(NumIfConvBBs, "Number of if-converted blocks");
+STATISTIC(NumDupBBs, "Number of duplicated blocks");
+STATISTIC(NumUnpred, "Number of true blocks of diamonds unpredicated");
+
+namespace {
+
+ class IfConverter : public MachineFunctionPass {
+ enum IfcvtKind {
+ ICNotClassfied, // BB data valid, but not classified.
+ ICSimpleFalse, // Same as ICSimple, but on the false path.
+      ICSimple,        // BB is entry of a one-split, no-rejoin sub-CFG.
+ ICTriangleFRev, // Same as ICTriangleFalse, but false path rev condition.
+ ICTriangleRev, // Same as ICTriangle, but true path rev condition.
+ ICTriangleFalse, // Same as ICTriangle, but on the false path.
+ ICTriangle, // BB is entry of a triangle sub-CFG.
+ ICDiamond, // BB is entry of a diamond sub-CFG.
+ ICForkedDiamond // BB is entry of an almost diamond sub-CFG, with a
+ // common tail that can be shared.
+ };
+
+    /// One per MachineBasicBlock, this is used to cache the result of
+    /// if-conversion feasibility analysis. This includes results from
+ /// TargetInstrInfo::analyzeBranch() (i.e. TBB, FBB, and Cond), and its
+ /// classification, and common tail block of its successors (if it's a
+ /// diamond shape), its size, whether it's predicable, and whether any
+ /// instruction can clobber the 'would-be' predicate.
+ ///
+ /// IsDone - True if BB is not to be considered for ifcvt.
+ /// IsBeingAnalyzed - True if BB is currently being analyzed.
+ /// IsAnalyzed - True if BB has been analyzed (info is still valid).
+ /// IsEnqueued - True if BB has been enqueued to be ifcvt'ed.
+    ///   IsBrAnalyzable  - True if analyzeBranch() returns false (success).
+ /// HasFallThrough - True if BB may fallthrough to the following BB.
+ /// IsUnpredicable - True if BB is known to be unpredicable.
+ /// ClobbersPred - True if BB could modify predicates (e.g. has
+ /// cmp, call, etc.)
+ /// NonPredSize - Number of non-predicated instructions.
+ /// ExtraCost - Extra cost for multi-cycle instructions.
+ /// ExtraCost2 - Some instructions are slower when predicated
+ /// BB - Corresponding MachineBasicBlock.
+ /// TrueBB / FalseBB- See analyzeBranch().
+ /// BrCond - Conditions for end of block conditional branches.
+ /// Predicate - Predicate used in the BB.
+ struct BBInfo {
+ bool IsDone : 1;
+ bool IsBeingAnalyzed : 1;
+ bool IsAnalyzed : 1;
+ bool IsEnqueued : 1;
+ bool IsBrAnalyzable : 1;
+ bool IsBrReversible : 1;
+ bool HasFallThrough : 1;
+ bool IsUnpredicable : 1;
+ bool CannotBeCopied : 1;
+ bool ClobbersPred : 1;
+ unsigned NonPredSize = 0;
+ unsigned ExtraCost = 0;
+ unsigned ExtraCost2 = 0;
+ MachineBasicBlock *BB = nullptr;
+ MachineBasicBlock *TrueBB = nullptr;
+ MachineBasicBlock *FalseBB = nullptr;
+ SmallVector<MachineOperand, 4> BrCond;
+ SmallVector<MachineOperand, 4> Predicate;
+
+ BBInfo() : IsDone(false), IsBeingAnalyzed(false),
+ IsAnalyzed(false), IsEnqueued(false), IsBrAnalyzable(false),
+ IsBrReversible(false), HasFallThrough(false),
+ IsUnpredicable(false), CannotBeCopied(false),
+ ClobbersPred(false) {}
+ };
+
+ /// Record information about pending if-conversions to attempt:
+ /// BBI - Corresponding BBInfo.
+ /// Kind - Type of block. See IfcvtKind.
+ /// NeedSubsumption - True if the to-be-predicated BB has already been
+ /// predicated.
+ /// NumDups - Number of instructions that would be duplicated due
+ /// to this if-conversion. (For diamonds, the number of
+ /// identical instructions at the beginnings of both
+ /// paths).
+ /// NumDups2 - For diamonds, the number of identical instructions
+ /// at the ends of both paths.
+ struct IfcvtToken {
+ BBInfo &BBI;
+ IfcvtKind Kind;
+ unsigned NumDups;
+ unsigned NumDups2;
+ bool NeedSubsumption : 1;
+ bool TClobbersPred : 1;
+ bool FClobbersPred : 1;
+
+ IfcvtToken(BBInfo &b, IfcvtKind k, bool s, unsigned d, unsigned d2 = 0,
+ bool tc = false, bool fc = false)
+ : BBI(b), Kind(k), NumDups(d), NumDups2(d2), NeedSubsumption(s),
+ TClobbersPred(tc), FClobbersPred(fc) {}
+ };
+
+ /// Results of if-conversion feasibility analysis indexed by basic block
+ /// number.
+ std::vector<BBInfo> BBAnalysis;
+ TargetSchedModel SchedModel;
+
+ const TargetLoweringBase *TLI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const MachineBranchProbabilityInfo *MBPI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+ LivePhysRegs Redefs;
+
+ bool PreRegAlloc = true;
+ bool MadeChange = false;
+ int FnNum = -1;
+ std::function<bool(const MachineFunction &)> PredicateFtor;
+
+ public:
+ static char ID;
+
+ IfConverter(std::function<bool(const MachineFunction &)> Ftor = nullptr)
+ : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+ initializeIfConverterPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ private:
+ bool reverseBranchCondition(BBInfo &BBI) const;
+ bool ValidSimple(BBInfo &TrueBBI, unsigned &Dups,
+ BranchProbability Prediction) const;
+ bool ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ bool FalseBranch, unsigned &Dups,
+ BranchProbability Prediction) const;
+ bool CountDuplicatedInstructions(
+ MachineBasicBlock::iterator &TIB, MachineBasicBlock::iterator &FIB,
+ MachineBasicBlock::iterator &TIE, MachineBasicBlock::iterator &FIE,
+ unsigned &Dups1, unsigned &Dups2,
+ MachineBasicBlock &TBB, MachineBasicBlock &FBB,
+ bool SkipUnconditionalBranches) const;
+ bool ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned &Dups1, unsigned &Dups2,
+ BBInfo &TrueBBICalc, BBInfo &FalseBBICalc) const;
+ bool ValidForkedDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned &Dups1, unsigned &Dups2,
+ BBInfo &TrueBBICalc, BBInfo &FalseBBICalc) const;
+ void AnalyzeBranches(BBInfo &BBI);
+ void ScanInstructions(BBInfo &BBI,
+ MachineBasicBlock::iterator &Begin,
+ MachineBasicBlock::iterator &End,
+ bool BranchUnpredicable = false) const;
+ bool RescanInstructions(
+ MachineBasicBlock::iterator &TIB, MachineBasicBlock::iterator &FIB,
+ MachineBasicBlock::iterator &TIE, MachineBasicBlock::iterator &FIE,
+ BBInfo &TrueBBI, BBInfo &FalseBBI) const;
+ void AnalyzeBlock(MachineBasicBlock &MBB,
+ std::vector<std::unique_ptr<IfcvtToken>> &Tokens);
+ bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl<MachineOperand> &Pred,
+ bool isTriangle = false, bool RevBranch = false,
+ bool hasCommonTail = false);
+ void AnalyzeBlocks(MachineFunction &MF,
+ std::vector<std::unique_ptr<IfcvtToken>> &Tokens);
+ void InvalidatePreds(MachineBasicBlock &MBB);
+ bool IfConvertSimple(BBInfo &BBI, IfcvtKind Kind);
+ bool IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind);
+ bool IfConvertDiamondCommon(BBInfo &BBI, BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned NumDups1, unsigned NumDups2,
+ bool TClobbersPred, bool FClobbersPred,
+ bool RemoveBranch, bool MergeAddEdges);
+ bool IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
+ unsigned NumDups1, unsigned NumDups2,
+ bool TClobbers, bool FClobbers);
+ bool IfConvertForkedDiamond(BBInfo &BBI, IfcvtKind Kind,
+ unsigned NumDups1, unsigned NumDups2,
+ bool TClobbers, bool FClobbers);
+ void PredicateBlock(BBInfo &BBI,
+ MachineBasicBlock::iterator E,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallSet<MCPhysReg, 4> *LaterRedefs = nullptr);
+ void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool IgnoreBr = false);
+ void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);
+
+ bool MeetIfcvtSizeLimit(MachineBasicBlock &BB,
+ unsigned Cycle, unsigned Extra,
+ BranchProbability Prediction) const {
+ return Cycle > 0 && TII->isProfitableToIfCvt(BB, Cycle, Extra,
+ Prediction);
+ }
+
+ bool MeetIfcvtSizeLimit(BBInfo &TBBInfo, BBInfo &FBBInfo,
+ MachineBasicBlock &CommBB, unsigned Dups,
+ BranchProbability Prediction, bool Forked) const {
+ const MachineFunction &MF = *TBBInfo.BB->getParent();
+ if (MF.getFunction().hasMinSize()) {
+ MachineBasicBlock::iterator TIB = TBBInfo.BB->begin();
+ MachineBasicBlock::iterator FIB = FBBInfo.BB->begin();
+ MachineBasicBlock::iterator TIE = TBBInfo.BB->end();
+ MachineBasicBlock::iterator FIE = FBBInfo.BB->end();
+
+ unsigned Dups1 = 0, Dups2 = 0;
+ if (!CountDuplicatedInstructions(TIB, FIB, TIE, FIE, Dups1, Dups2,
+ *TBBInfo.BB, *FBBInfo.BB,
+ /*SkipUnconditionalBranches*/ true))
+ llvm_unreachable("should already have been checked by ValidDiamond");
+
+ unsigned BranchBytes = 0;
+ unsigned CommonBytes = 0;
+
+ // Count common instructions at the start of the true and false blocks.
+ for (auto &I : make_range(TBBInfo.BB->begin(), TIB)) {
+ LLVM_DEBUG(dbgs() << "Common inst: " << I);
+ CommonBytes += TII->getInstSizeInBytes(I);
+ }
+ for (auto &I : make_range(FBBInfo.BB->begin(), FIB)) {
+ LLVM_DEBUG(dbgs() << "Common inst: " << I);
+ CommonBytes += TII->getInstSizeInBytes(I);
+ }
+
+ // Count instructions at the end of the true and false blocks, after
+ // the ones we plan to predicate. Analyzable branches will be removed
+ // (unless this is a forked diamond), and all other instructions are
+ // common between the two blocks.
+ for (auto &I : make_range(TIE, TBBInfo.BB->end())) {
+ if (I.isBranch() && TBBInfo.IsBrAnalyzable && !Forked) {
+ LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+ BranchBytes += TII->predictBranchSizeForIfCvt(I);
+ } else {
+ LLVM_DEBUG(dbgs() << "Common inst: " << I);
+ CommonBytes += TII->getInstSizeInBytes(I);
+ }
+ }
+ for (auto &I : make_range(FIE, FBBInfo.BB->end())) {
+ if (I.isBranch() && FBBInfo.IsBrAnalyzable && !Forked) {
+ LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+ BranchBytes += TII->predictBranchSizeForIfCvt(I);
+ } else {
+ LLVM_DEBUG(dbgs() << "Common inst: " << I);
+ CommonBytes += TII->getInstSizeInBytes(I);
+ }
+ }
+ for (auto &I : CommBB.terminators()) {
+ if (I.isBranch()) {
+ LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+ BranchBytes += TII->predictBranchSizeForIfCvt(I);
+ }
+ }
+
+ // The common instructions in one branch will be eliminated, halving
+ // their code size.
+ CommonBytes /= 2;
+
+ // Count the instructions which we need to predicate.
+ unsigned NumPredicatedInstructions = 0;
+ for (auto &I : make_range(TIB, TIE)) {
+ if (!I.isDebugInstr()) {
+ LLVM_DEBUG(dbgs() << "Predicating: " << I);
+ NumPredicatedInstructions++;
+ }
+ }
+ for (auto &I : make_range(FIB, FIE)) {
+ if (!I.isDebugInstr()) {
+ LLVM_DEBUG(dbgs() << "Predicating: " << I);
+ NumPredicatedInstructions++;
+ }
+ }
+
+ // Even though we're optimising for size at the expense of performance,
+ // avoid creating really long predicated blocks.
+ if (NumPredicatedInstructions > 15)
+ return false;
+
+ // Some targets (e.g. Thumb2) need to insert extra instructions to
+ // start predicated blocks.
+ unsigned ExtraPredicateBytes = TII->extraSizeToPredicateInstructions(
+ MF, NumPredicatedInstructions);
+
+ LLVM_DEBUG(dbgs() << "MeetIfcvtSizeLimit(BranchBytes=" << BranchBytes
+ << ", CommonBytes=" << CommonBytes
+ << ", NumPredicatedInstructions="
+ << NumPredicatedInstructions
+ << ", ExtraPredicateBytes=" << ExtraPredicateBytes
+ << ")\n");
+ return (BranchBytes + CommonBytes) > ExtraPredicateBytes;
+ } else {
+ unsigned TCycle = TBBInfo.NonPredSize + TBBInfo.ExtraCost - Dups;
+ unsigned FCycle = FBBInfo.NonPredSize + FBBInfo.ExtraCost - Dups;
+ bool Res = TCycle > 0 && FCycle > 0 &&
+ TII->isProfitableToIfCvt(
+ *TBBInfo.BB, TCycle, TBBInfo.ExtraCost2, *FBBInfo.BB,
+ FCycle, FBBInfo.ExtraCost2, Prediction);
+ LLVM_DEBUG(dbgs() << "MeetIfcvtSizeLimit(TCycle=" << TCycle
+ << ", FCycle=" << FCycle
+ << ", TExtra=" << TBBInfo.ExtraCost2 << ", FExtra="
+ << FBBInfo.ExtraCost2 << ") = " << Res << "\n");
+ return Res;
+ }
+ }
+
+ /// Returns true if Block ends without a terminator.
+ bool blockAlwaysFallThrough(BBInfo &BBI) const {
+ return BBI.IsBrAnalyzable && BBI.TrueBB == nullptr;
+ }
+
+ /// Used to sort if-conversion candidates.
+ static bool IfcvtTokenCmp(const std::unique_ptr<IfcvtToken> &C1,
+ const std::unique_ptr<IfcvtToken> &C2) {
+ int Incr1 = (C1->Kind == ICDiamond)
+ ? -(int)(C1->NumDups + C1->NumDups2) : (int)C1->NumDups;
+ int Incr2 = (C2->Kind == ICDiamond)
+ ? -(int)(C2->NumDups + C2->NumDups2) : (int)C2->NumDups;
+ if (Incr1 > Incr2)
+ return true;
+ else if (Incr1 == Incr2) {
+ // Favors subsumption.
+ if (!C1->NeedSubsumption && C2->NeedSubsumption)
+ return true;
+ else if (C1->NeedSubsumption == C2->NeedSubsumption) {
+ // Favors diamond over triangle, etc.
+ if ((unsigned)C1->Kind < (unsigned)C2->Kind)
+ return true;
+ else if (C1->Kind == C2->Kind)
+ return C1->BBI.BB->getNumber() < C2->BBI.BB->getNumber();
+ }
+ }
+ return false;
+ }
+ };
+
+} // end anonymous namespace
+
+char IfConverter::ID = 0;
+
+char &llvm::IfConverterID = IfConverter::ID;
+
+INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false)
+
+bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()) || (PredicateFtor && !PredicateFtor(MF)))
+ return false;
+
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ TLI = ST.getTargetLowering();
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>());
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ MRI = &MF.getRegInfo();
+ SchedModel.init(&ST);
+
+ if (!TII) return false;
+
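+  // The machine function is still in SSA form only before register
+  // allocation, so use that to detect whether we are running pre-RA.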
+ PreRegAlloc = MRI->isSSA();
+
+ bool BFChange = false;
+ if (!PreRegAlloc) {
+    // Tail merging tends to expose more if-conversion opportunities.
+ BranchFolder BF(true, false, MBFI, *MBPI, PSI);
+ BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo());
+ }
+
+ LLVM_DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
+ << MF.getName() << "\'");
+
+ if (FnNum < IfCvtFnStart || (IfCvtFnStop != -1 && FnNum > IfCvtFnStop)) {
+ LLVM_DEBUG(dbgs() << " skipped\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ MF.RenumberBlocks();
+ BBAnalysis.resize(MF.getNumBlockIDs());
+
+ std::vector<std::unique_ptr<IfcvtToken>> Tokens;
+ MadeChange = false;
+ unsigned NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle +
+ NumTriangleRev + NumTriangleFalse + NumTriangleFRev + NumDiamonds;
+ while (IfCvtLimit == -1 || (int)NumIfCvts < IfCvtLimit) {
+ // Do an initial analysis for each basic block and find all the potential
+ // candidates to perform if-conversion.
+ bool Change = false;
+ AnalyzeBlocks(MF, Tokens);
+ while (!Tokens.empty()) {
+ std::unique_ptr<IfcvtToken> Token = std::move(Tokens.back());
+ Tokens.pop_back();
+ BBInfo &BBI = Token->BBI;
+ IfcvtKind Kind = Token->Kind;
+ unsigned NumDups = Token->NumDups;
+ unsigned NumDups2 = Token->NumDups2;
+
+ // If the block has been evicted out of the queue or it has already been
+ // marked dead (due to it being predicated), then skip it.
+ if (BBI.IsDone)
+ BBI.IsEnqueued = false;
+ if (!BBI.IsEnqueued)
+ continue;
+
+ BBI.IsEnqueued = false;
+
+ bool RetVal = false;
+ switch (Kind) {
+ default: llvm_unreachable("Unexpected!");
+ case ICSimple:
+ case ICSimpleFalse: {
+ bool isFalse = Kind == ICSimpleFalse;
+ if ((isFalse && DisableSimpleF) || (!isFalse && DisableSimple)) break;
+ LLVM_DEBUG(dbgs() << "Ifcvt (Simple"
+ << (Kind == ICSimpleFalse ? " false" : "")
+ << "): " << printMBBReference(*BBI.BB) << " ("
+ << ((Kind == ICSimpleFalse) ? BBI.FalseBB->getNumber()
+ : BBI.TrueBB->getNumber())
+ << ") ");
+ RetVal = IfConvertSimple(BBI, Kind);
+ LLVM_DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ if (RetVal) {
+ if (isFalse) ++NumSimpleFalse;
+ else ++NumSimple;
+ }
+ break;
+ }
+ case ICTriangle:
+ case ICTriangleRev:
+ case ICTriangleFalse:
+ case ICTriangleFRev: {
+ bool isFalse = Kind == ICTriangleFalse;
+ bool isRev = (Kind == ICTriangleRev || Kind == ICTriangleFRev);
+ if (DisableTriangle && !isFalse && !isRev) break;
+ if (DisableTriangleR && !isFalse && isRev) break;
+ if (DisableTriangleF && isFalse && !isRev) break;
+ LLVM_DEBUG(dbgs() << "Ifcvt (Triangle");
+ if (isFalse)
+ LLVM_DEBUG(dbgs() << " false");
+ if (isRev)
+ LLVM_DEBUG(dbgs() << " rev");
+ LLVM_DEBUG(dbgs() << "): " << printMBBReference(*BBI.BB)
+ << " (T:" << BBI.TrueBB->getNumber()
+ << ",F:" << BBI.FalseBB->getNumber() << ") ");
+ RetVal = IfConvertTriangle(BBI, Kind);
+ LLVM_DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ if (RetVal) {
+ if (isFalse) {
+ if (isRev) ++NumTriangleFRev;
+ else ++NumTriangleFalse;
+ } else {
+ if (isRev) ++NumTriangleRev;
+ else ++NumTriangle;
+ }
+ }
+ break;
+ }
+ case ICDiamond:
+ if (DisableDiamond) break;
+ LLVM_DEBUG(dbgs() << "Ifcvt (Diamond): " << printMBBReference(*BBI.BB)
+ << " (T:" << BBI.TrueBB->getNumber()
+ << ",F:" << BBI.FalseBB->getNumber() << ") ");
+ RetVal = IfConvertDiamond(BBI, Kind, NumDups, NumDups2,
+ Token->TClobbersPred,
+ Token->FClobbersPred);
+ LLVM_DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ if (RetVal) ++NumDiamonds;
+ break;
+ case ICForkedDiamond:
+ if (DisableForkedDiamond) break;
+ LLVM_DEBUG(dbgs() << "Ifcvt (Forked Diamond): "
+ << printMBBReference(*BBI.BB)
+ << " (T:" << BBI.TrueBB->getNumber()
+ << ",F:" << BBI.FalseBB->getNumber() << ") ");
+ RetVal = IfConvertForkedDiamond(BBI, Kind, NumDups, NumDups2,
+ Token->TClobbersPred,
+ Token->FClobbersPred);
+ LLVM_DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ if (RetVal) ++NumForkedDiamonds;
+ break;
+ }
+
+ if (RetVal && MRI->tracksLiveness())
+ recomputeLivenessFlags(*BBI.BB);
+
+ Change |= RetVal;
+
+ NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle + NumTriangleRev +
+ NumTriangleFalse + NumTriangleFRev + NumDiamonds;
+ if (IfCvtLimit != -1 && (int)NumIfCvts >= IfCvtLimit)
+ break;
+ }
+
+ if (!Change)
+ break;
+ MadeChange |= Change;
+ }
+
+ Tokens.clear();
+ BBAnalysis.clear();
+
+ if (MadeChange && IfCvtBranchFold) {
+ BranchFolder BF(false, false, MBFI, *MBPI, PSI);
+ BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo());
+ }
+
+ MadeChange |= BFChange;
+ return MadeChange;
+}
+
+/// BB has a fallthrough. Find its 'false' successor given its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *TrueBB) {
+ for (MachineBasicBlock *SuccBB : BB->successors()) {
+ if (SuccBB != TrueBB)
+ return SuccBB;
+ }
+ return nullptr;
+}
+
+/// Reverse the condition of the end of the block branch. Swap block's 'true'
+/// and 'false' successors.
+bool IfConverter::reverseBranchCondition(BBInfo &BBI) const {
+ DebugLoc dl; // FIXME: this is nowhere
+ if (!TII->reverseBranchCondition(BBI.BrCond)) {
+ TII->removeBranch(*BBI.BB);
+ TII->insertBranch(*BBI.BB, BBI.FalseBB, BBI.TrueBB, BBI.BrCond, dl);
+ std::swap(BBI.TrueBB, BBI.FalseBB);
+ return true;
+ }
+ return false;
+}
+
+/// Returns the next block in the function blocks ordering. If it is the end,
+/// returns NULL.
+static inline MachineBasicBlock *getNextBlock(MachineBasicBlock &MBB) {
+ MachineFunction::iterator I = MBB.getIterator();
+ MachineFunction::iterator E = MBB.getParent()->end();
+ if (++I == E)
+ return nullptr;
+ return &*I;
+}
+
+/// Returns true if the 'true' block (along with its predecessor) forms a valid
+/// simple shape for ifcvt. It also returns, in Dups, the number of
+/// instructions that the ifcvt would need to duplicate if performed.
+bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups,
+ BranchProbability Prediction) const {
+ Dups = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
+ return false;
+
+ if (TrueBBI.IsBrAnalyzable)
+ return false;
+
+ if (TrueBBI.BB->pred_size() > 1) {
+ if (TrueBBI.CannotBeCopied ||
+ !TII->isProfitableToDupForIfCvt(*TrueBBI.BB, TrueBBI.NonPredSize,
+ Prediction))
+ return false;
+ Dups = TrueBBI.NonPredSize;
+ }
+
+ return true;
+}
+
+/// Returns true if the 'true' and 'false' blocks (along with their common
+/// predecessor) form a valid triangle shape for ifcvt. If 'FalseBranch' is
+/// true, it checks whether the 'true' block's false branch branches to the
+/// 'false' block rather than the other way around. It also returns, in 'Dups',
+/// the number of instructions that the ifcvt would need to duplicate if
+/// performed.
+bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ bool FalseBranch, unsigned &Dups,
+ BranchProbability Prediction) const {
+ Dups = 0;
+ if (TrueBBI.BB == FalseBBI.BB)
+ return false;
+
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
+ return false;
+
+ if (TrueBBI.BB->pred_size() > 1) {
+ if (TrueBBI.CannotBeCopied)
+ return false;
+
+ unsigned Size = TrueBBI.NonPredSize;
+ if (TrueBBI.IsBrAnalyzable) {
+ if (TrueBBI.TrueBB && TrueBBI.BrCond.empty())
+ // Ends with an unconditional branch. It will be removed.
+ --Size;
+ else {
+ MachineBasicBlock *FExit = FalseBranch
+ ? TrueBBI.TrueBB : TrueBBI.FalseBB;
+ if (FExit)
+ // Require a conditional branch
+ ++Size;
+ }
+ }
+ if (!TII->isProfitableToDupForIfCvt(*TrueBBI.BB, Size, Prediction))
+ return false;
+ Dups = Size;
+ }
+
+ MachineBasicBlock *TExit = FalseBranch ? TrueBBI.FalseBB : TrueBBI.TrueBB;
+ if (!TExit && blockAlwaysFallThrough(TrueBBI)) {
+ MachineFunction::iterator I = TrueBBI.BB->getIterator();
+ if (++I == TrueBBI.BB->getParent()->end())
+ return false;
+ TExit = &*I;
+ }
+ return TExit && TExit == FalseBBI.BB;
+}
+
+/// Count duplicated instructions and move the iterators to show where they
+/// are.
+/// @param TIB True Iterator Begin
+/// @param FIB False Iterator Begin
+/// These two iterators initially point to the first instruction of the two
+/// blocks, and finally point to the first non-shared instruction.
+/// @param TIE True Iterator End
+/// @param FIE False Iterator End
+/// These two iterators initially point to End() for the two blocks, and
+/// finally point to the first shared instruction in the tail.
+/// Upon return [TIB, TIE), and [FIB, FIE) mark the un-duplicated portions of
+/// two blocks.
+/// @param Dups1 count of duplicated instructions at the beginning of the 2
+/// blocks.
+/// @param Dups2 count of duplicated instructions at the end of the 2 blocks.
+/// @param SkipUnconditionalBranches if true, don't require the unconditional
+/// branches at the end of the blocks to be identical. True is passed when the
+/// blocks are analyzable, to allow fallthrough to be handled.
+/// @return false if the shared portion prevents if conversion.
+bool IfConverter::CountDuplicatedInstructions(
+ MachineBasicBlock::iterator &TIB,
+ MachineBasicBlock::iterator &FIB,
+ MachineBasicBlock::iterator &TIE,
+ MachineBasicBlock::iterator &FIE,
+ unsigned &Dups1, unsigned &Dups2,
+ MachineBasicBlock &TBB, MachineBasicBlock &FBB,
+ bool SkipUnconditionalBranches) const {
+ while (TIB != TIE && FIB != FIE) {
+ // Skip dbg_value instructions. These do not count.
+ TIB = skipDebugInstructionsForward(TIB, TIE, false);
+ FIB = skipDebugInstructionsForward(FIB, FIE, false);
+ if (TIB == TIE || FIB == FIE)
+ break;
+ if (!TIB->isIdenticalTo(*FIB))
+ break;
+ // A pred-clobbering instruction in the shared portion prevents
+ // if-conversion.
+ std::vector<MachineOperand> PredDefs;
+ if (TII->ClobbersPredicate(*TIB, PredDefs, false))
+ return false;
+ // If we get all the way to the branch instructions, don't count them.
+ if (!TIB->isBranch())
+ ++Dups1;
+ ++TIB;
+ ++FIB;
+ }
+
+ // Check for already containing all of the block.
+ if (TIB == TIE || FIB == FIE)
+ return true;
+ // Now, in preparation for counting duplicate instructions at the ends of the
+ // blocks, switch to reverse_iterators. Note that getReverse() returns an
+ // iterator that points to the same instruction, unlike std::reverse_iterator.
+ // We have to do our own shifting so that we get the same range.
+ MachineBasicBlock::reverse_iterator RTIE = std::next(TIE.getReverse());
+ MachineBasicBlock::reverse_iterator RFIE = std::next(FIE.getReverse());
+ const MachineBasicBlock::reverse_iterator RTIB = std::next(TIB.getReverse());
+ const MachineBasicBlock::reverse_iterator RFIB = std::next(FIB.getReverse());
+
+ if (!TBB.succ_empty() || !FBB.succ_empty()) {
+ if (SkipUnconditionalBranches) {
+ while (RTIE != RTIB && RTIE->isUnconditionalBranch())
+ ++RTIE;
+ while (RFIE != RFIB && RFIE->isUnconditionalBranch())
+ ++RFIE;
+ }
+ }
+
+ // Count duplicate instructions at the ends of the blocks.
+ while (RTIE != RTIB && RFIE != RFIB) {
+ // Skip dbg_value instructions. These do not count.
+ // Note that these are reverse iterators going forward.
+ RTIE = skipDebugInstructionsForward(RTIE, RTIB, false);
+ RFIE = skipDebugInstructionsForward(RFIE, RFIB, false);
+ if (RTIE == RTIB || RFIE == RFIB)
+ break;
+ if (!RTIE->isIdenticalTo(*RFIE))
+ break;
+ // We have to verify that any branch instructions are the same, and then we
+ // don't count them toward the # of duplicate instructions.
+ if (!RTIE->isBranch())
+ ++Dups2;
+ ++RTIE;
+ ++RFIE;
+ }
+ TIE = std::next(RTIE.getReverse());
+ FIE = std::next(RFIE.getReverse());
+ return true;
+}
+
+/// RescanInstructions - Run ScanInstructions on a pair of blocks.
+/// @param TIB - True Iterator Begin, points to first non-shared instruction
+/// @param FIB - False Iterator Begin, points to first non-shared instruction
+/// @param TIE - True Iterator End, points past last non-shared instruction
+/// @param FIE - False Iterator End, points past last non-shared instruction
+/// @param TrueBBI - BBInfo to update for the true block.
+/// @param FalseBBI - BBInfo to update for the false block.
+/// @returns - false if either block cannot be predicated or if both blocks end
+/// with a predicate-clobbering instruction.
+bool IfConverter::RescanInstructions(
+ MachineBasicBlock::iterator &TIB, MachineBasicBlock::iterator &FIB,
+ MachineBasicBlock::iterator &TIE, MachineBasicBlock::iterator &FIE,
+ BBInfo &TrueBBI, BBInfo &FalseBBI) const {
+ bool BranchUnpredicable = true;
+ TrueBBI.IsUnpredicable = FalseBBI.IsUnpredicable = false;
+ ScanInstructions(TrueBBI, TIB, TIE, BranchUnpredicable);
+ if (TrueBBI.IsUnpredicable)
+ return false;
+ ScanInstructions(FalseBBI, FIB, FIE, BranchUnpredicable);
+ if (FalseBBI.IsUnpredicable)
+ return false;
+ if (TrueBBI.ClobbersPred && FalseBBI.ClobbersPred)
+ return false;
+ return true;
+}
+
+#ifndef NDEBUG
+static void verifySameBranchInstructions(
+ MachineBasicBlock *MBB1,
+ MachineBasicBlock *MBB2) {
+ const MachineBasicBlock::reverse_iterator B1 = MBB1->rend();
+ const MachineBasicBlock::reverse_iterator B2 = MBB2->rend();
+ MachineBasicBlock::reverse_iterator E1 = MBB1->rbegin();
+ MachineBasicBlock::reverse_iterator E2 = MBB2->rbegin();
+ while (E1 != B1 && E2 != B2) {
+ skipDebugInstructionsForward(E1, B1, false);
+ skipDebugInstructionsForward(E2, B2, false);
+ if (E1 == B1 && E2 == B2)
+ break;
+
+ if (E1 == B1) {
+ assert(!E2->isBranch() && "Branch mis-match, one block is empty.");
+ break;
+ }
+ if (E2 == B2) {
+ assert(!E1->isBranch() && "Branch mis-match, one block is empty.");
+ break;
+ }
+
+ if (E1->isBranch() || E2->isBranch())
+ assert(E1->isIdenticalTo(*E2) &&
+ "Branch mis-match, branch instructions don't match.");
+ else
+ break;
+ ++E1;
+ ++E2;
+ }
+}
+#endif
+
+/// ValidForkedDiamond - Returns true if the 'true' and 'false' blocks (along
+/// with their common predecessor) form a diamond if a common tail block is
+/// extracted.
+/// While not strictly a diamond, this pattern would form a diamond if
+/// tail-merging had merged the shared tails.
+/// EBB
+/// _/ \_
+/// | |
+/// TBB FBB
+/// / \ / \
+/// FalseBB TrueBB FalseBB
+/// Currently only handles analyzable branches.
+/// Specifically excludes actual diamonds to avoid overlap.
+bool IfConverter::ValidForkedDiamond(
+ BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned &Dups1, unsigned &Dups2,
+ BBInfo &TrueBBICalc, BBInfo &FalseBBICalc) const {
+ Dups1 = Dups2 = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone ||
+ FalseBBI.IsBeingAnalyzed || FalseBBI.IsDone)
+ return false;
+
+ if (!TrueBBI.IsBrAnalyzable || !FalseBBI.IsBrAnalyzable)
+ return false;
+ // Don't IfConvert blocks that can't be folded into their predecessor.
+ if (TrueBBI.BB->pred_size() > 1 || FalseBBI.BB->pred_size() > 1)
+ return false;
+
+ // This function is specifically looking for conditional tails, as
+ // unconditional tails are already handled by the standard diamond case.
+ if (TrueBBI.BrCond.size() == 0 ||
+ FalseBBI.BrCond.size() == 0)
+ return false;
+
+ MachineBasicBlock *TT = TrueBBI.TrueBB;
+ MachineBasicBlock *TF = TrueBBI.FalseBB;
+ MachineBasicBlock *FT = FalseBBI.TrueBB;
+ MachineBasicBlock *FF = FalseBBI.FalseBB;
+
+ if (!TT)
+ TT = getNextBlock(*TrueBBI.BB);
+ if (!TF)
+ TF = getNextBlock(*TrueBBI.BB);
+ if (!FT)
+ FT = getNextBlock(*FalseBBI.BB);
+ if (!FF)
+ FF = getNextBlock(*FalseBBI.BB);
+
+ if (!TT || !TF)
+ return false;
+
+ // Check successors. If they don't match, bail.
+ if (!((TT == FT && TF == FF) || (TF == FT && TT == FF)))
+ return false;
+
+ bool FalseReversed = false;
+ if (TF == FT && TT == FF) {
+ // If the branches are opposing, but we can't reverse, don't do it.
+ if (!FalseBBI.IsBrReversible)
+ return false;
+ FalseReversed = true;
+ reverseBranchCondition(FalseBBI);
+ }
+ auto UnReverseOnExit = make_scope_exit([&]() {
+ if (FalseReversed)
+ reverseBranchCondition(FalseBBI);
+ });
+
+ // Count duplicate instructions at the beginning of the true and false blocks.
+ MachineBasicBlock::iterator TIB = TrueBBI.BB->begin();
+ MachineBasicBlock::iterator FIB = FalseBBI.BB->begin();
+ MachineBasicBlock::iterator TIE = TrueBBI.BB->end();
+ MachineBasicBlock::iterator FIE = FalseBBI.BB->end();
+  if (!CountDuplicatedInstructions(TIB, FIB, TIE, FIE, Dups1, Dups2,
+                                   *TrueBBI.BB, *FalseBBI.BB,
+                                   /* SkipUnconditionalBranches */ true))
+ return false;
+
+ TrueBBICalc.BB = TrueBBI.BB;
+ FalseBBICalc.BB = FalseBBI.BB;
+ TrueBBICalc.IsBrAnalyzable = TrueBBI.IsBrAnalyzable;
+ FalseBBICalc.IsBrAnalyzable = FalseBBI.IsBrAnalyzable;
+ if (!RescanInstructions(TIB, FIB, TIE, FIE, TrueBBICalc, FalseBBICalc))
+ return false;
+
+ // The size is used to decide whether to if-convert, and the shared portions
+ // are subtracted off. Because of the subtraction, we just use the size that
+ // was calculated by the original ScanInstructions, as it is correct.
+ TrueBBICalc.NonPredSize = TrueBBI.NonPredSize;
+ FalseBBICalc.NonPredSize = FalseBBI.NonPredSize;
+ return true;
+}
+
+/// ValidDiamond - Returns true if the 'true' and 'false' blocks (along
+/// with their common predecessor) forms a valid diamond shape for ifcvt.
+bool IfConverter::ValidDiamond(
+ BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned &Dups1, unsigned &Dups2,
+ BBInfo &TrueBBICalc, BBInfo &FalseBBICalc) const {
+ Dups1 = Dups2 = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone ||
+ FalseBBI.IsBeingAnalyzed || FalseBBI.IsDone)
+ return false;
+
+ // If the True and False BBs are equal we're dealing with a degenerate case
+ // that we don't treat as a diamond.
+ if (TrueBBI.BB == FalseBBI.BB)
+ return false;
+
+ MachineBasicBlock *TT = TrueBBI.TrueBB;
+ MachineBasicBlock *FT = FalseBBI.TrueBB;
+
+ if (!TT && blockAlwaysFallThrough(TrueBBI))
+ TT = getNextBlock(*TrueBBI.BB);
+ if (!FT && blockAlwaysFallThrough(FalseBBI))
+ FT = getNextBlock(*FalseBBI.BB);
+ if (TT != FT)
+ return false;
+ if (!TT && (TrueBBI.IsBrAnalyzable || FalseBBI.IsBrAnalyzable))
+ return false;
+ if (TrueBBI.BB->pred_size() > 1 || FalseBBI.BB->pred_size() > 1)
+ return false;
+
+ // FIXME: Allow true block to have an early exit?
+ if (TrueBBI.FalseBB || FalseBBI.FalseBB)
+ return false;
+
+ // Count duplicate instructions at the beginning and end of the true and
+ // false blocks.
+ // Skip unconditional branches only if we are considering an analyzable
+ // diamond. Otherwise the branches must be the same.
+ bool SkipUnconditionalBranches =
+ TrueBBI.IsBrAnalyzable && FalseBBI.IsBrAnalyzable;
+ MachineBasicBlock::iterator TIB = TrueBBI.BB->begin();
+ MachineBasicBlock::iterator FIB = FalseBBI.BB->begin();
+ MachineBasicBlock::iterator TIE = TrueBBI.BB->end();
+ MachineBasicBlock::iterator FIE = FalseBBI.BB->end();
+  if (!CountDuplicatedInstructions(TIB, FIB, TIE, FIE, Dups1, Dups2,
+                                   *TrueBBI.BB, *FalseBBI.BB,
+                                   SkipUnconditionalBranches))
+ return false;
+
+ TrueBBICalc.BB = TrueBBI.BB;
+ FalseBBICalc.BB = FalseBBI.BB;
+ TrueBBICalc.IsBrAnalyzable = TrueBBI.IsBrAnalyzable;
+ FalseBBICalc.IsBrAnalyzable = FalseBBI.IsBrAnalyzable;
+ if (!RescanInstructions(TIB, FIB, TIE, FIE, TrueBBICalc, FalseBBICalc))
+ return false;
+ // The size is used to decide whether to if-convert, and the shared portions
+ // are subtracted off. Because of the subtraction, we just use the size that
+ // was calculated by the original ScanInstructions, as it is correct.
+ TrueBBICalc.NonPredSize = TrueBBI.NonPredSize;
+ FalseBBICalc.NonPredSize = FalseBBI.NonPredSize;
+ return true;
+}
+
+/// AnalyzeBranches - Look at the branches at the end of a block to determine if
+/// the block is predicable.
+void IfConverter::AnalyzeBranches(BBInfo &BBI) {
+ if (BBI.IsDone)
+ return;
+
+ BBI.TrueBB = BBI.FalseBB = nullptr;
+ BBI.BrCond.clear();
+ BBI.IsBrAnalyzable =
+ !TII->analyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond);
+ if (!BBI.IsBrAnalyzable) {
+ BBI.TrueBB = nullptr;
+ BBI.FalseBB = nullptr;
+ BBI.BrCond.clear();
+ }
+
+ SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+ BBI.IsBrReversible = (RevCond.size() == 0) ||
+ !TII->reverseBranchCondition(RevCond);
+ BBI.HasFallThrough = BBI.IsBrAnalyzable && BBI.FalseBB == nullptr;
+
+ if (BBI.BrCond.size()) {
+ // No false branch. This BB must end with a conditional branch and a
+ // fallthrough.
+ if (!BBI.FalseBB)
+ BBI.FalseBB = findFalseBlock(BBI.BB, BBI.TrueBB);
+ if (!BBI.FalseBB) {
+ // Malformed bcc? True and false blocks are the same?
+ BBI.IsUnpredicable = true;
+ }
+ }
+}
+
+/// ScanInstructions - Scan all the instructions in the block to determine if
+/// the block is predicable. In most cases, that means all the instructions
+/// in the block are isPredicable(). Also checks if the block contains any
+/// instruction which can clobber a predicate (e.g. condition code register).
+/// If so, the block is not predicable unless that instruction is the last one
+/// in the block.
+void IfConverter::ScanInstructions(BBInfo &BBI,
+ MachineBasicBlock::iterator &Begin,
+ MachineBasicBlock::iterator &End,
+ bool BranchUnpredicable) const {
+ if (BBI.IsDone || BBI.IsUnpredicable)
+ return;
+
+ bool AlreadyPredicated = !BBI.Predicate.empty();
+
+ BBI.NonPredSize = 0;
+ BBI.ExtraCost = 0;
+ BBI.ExtraCost2 = 0;
+ BBI.ClobbersPred = false;
+ for (MachineInstr &MI : make_range(Begin, End)) {
+ if (MI.isDebugInstr())
+ continue;
+
+ // It's unsafe to duplicate convergent instructions in this context, so set
+ // BBI.CannotBeCopied to true if MI is convergent. To see why, consider the
+ // following CFG, which is subject to our "simple" transformation.
+ //
+ // BB0 // if (c1) goto BB1; else goto BB2;
+ // / \
+ // BB1 |
+ // | BB2 // if (c2) goto TBB; else goto FBB;
+ // | / |
+ // | / |
+ // TBB |
+ // | |
+ // | FBB
+ // |
+ // exit
+ //
+ // Suppose we want to move TBB's contents up into BB1 and BB2 (in BB1 they'd
+ // be unconditional, and in BB2, they'd be predicated upon c2), and suppose
+ // TBB contains a convergent instruction. This is safe iff doing so does
+ // not add a control-flow dependency to the convergent instruction -- i.e.,
+ // it's safe iff the set of control flows that leads us to the convergent
+ // instruction does not get smaller after the transformation.
+ //
+ // Originally we executed TBB if c1 || c2. After the transformation, there
+ // are two copies of TBB's instructions. We get to the first if c1, and we
+ // get to the second if !c1 && c2.
+ //
+ // There are clearly fewer ways to satisfy the condition "c1" than
+ // "c1 || c2". Since we've shrunk the set of control flows which lead to
+ // our convergent instruction, the transformation is unsafe.
+ if (MI.isNotDuplicable() || MI.isConvergent())
+ BBI.CannotBeCopied = true;
+
+ bool isPredicated = TII->isPredicated(MI);
+ bool isCondBr = BBI.IsBrAnalyzable && MI.isConditionalBranch();
+
+ if (BranchUnpredicable && MI.isBranch()) {
+ BBI.IsUnpredicable = true;
+ return;
+ }
+
+ // A conditional branch is not predicable, but it may be eliminated.
+ if (isCondBr)
+ continue;
+
+ if (!isPredicated) {
+ BBI.NonPredSize++;
+ unsigned ExtraPredCost = TII->getPredicationCost(MI);
+ unsigned NumCycles = SchedModel.computeInstrLatency(&MI, false);
+ if (NumCycles > 1)
+ BBI.ExtraCost += NumCycles-1;
+ BBI.ExtraCost2 += ExtraPredCost;
+ } else if (!AlreadyPredicated) {
+ // FIXME: This instruction is already predicated before the
+ // if-conversion pass. It's probably something like a conditional move.
+ // Mark this block unpredicable for now.
+ BBI.IsUnpredicable = true;
+ return;
+ }
+
+ if (BBI.ClobbersPred && !isPredicated) {
+ // Predicate modification instruction should end the block (except for
+ // already predicated instructions and end of block branches).
+ // Predicate may have been modified, the subsequent (currently)
+ // unpredicated instructions cannot be correctly predicated.
+ BBI.IsUnpredicable = true;
+ return;
+ }
+
+ // FIXME: Make use of PredDefs? e.g. ADDC, SUBC sets predicates but are
+ // still potentially predicable.
+ std::vector<MachineOperand> PredDefs;
+ if (TII->ClobbersPredicate(MI, PredDefs, true))
+ BBI.ClobbersPred = true;
+
+ if (!TII->isPredicable(MI)) {
+ BBI.IsUnpredicable = true;
+ return;
+ }
+ }
+}
+
+/// Determine if the block is a suitable candidate to be predicated by the
+/// specified predicate.
+/// @param BBI BBInfo for the block to check
+/// @param Pred Predicate array for the branch that leads to BBI
+/// @param isTriangle true if the analysis is for a triangle
+/// @param RevBranch true if Reverse(Pred) leads to BBI (e.g. BBI is the false
+/// case).
+/// @param hasCommonTail true if BBI shares a tail with a sibling block that
+/// contains any instruction that would make the block unpredicable.
+bool IfConverter::FeasibilityAnalysis(BBInfo &BBI,
+ SmallVectorImpl<MachineOperand> &Pred,
+ bool isTriangle, bool RevBranch,
+ bool hasCommonTail) {
+ // If the block is dead or unpredicable, then it cannot be predicated.
+ // Two blocks may share a common unpredicable tail, but this doesn't prevent
+ // them from being if-converted. The non-shared portion is assumed to have
+  // been checked.
+ if (BBI.IsDone || (BBI.IsUnpredicable && !hasCommonTail))
+ return false;
+
+ // If it is already predicated but we couldn't analyze its terminator, the
+  // latter might fall through, but we can't determine where to.
+ // Conservatively avoid if-converting again.
+ if (BBI.Predicate.size() && !BBI.IsBrAnalyzable)
+ return false;
+
+ // If it is already predicated, check if the new predicate subsumes
+ // its predicate.
+ if (BBI.Predicate.size() && !TII->SubsumesPredicate(Pred, BBI.Predicate))
+ return false;
+
+ if (!hasCommonTail && BBI.BrCond.size()) {
+ if (!isTriangle)
+ return false;
+
+ // Test predicate subsumption.
+ SmallVector<MachineOperand, 4> RevPred(Pred.begin(), Pred.end());
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (RevBranch) {
+ if (TII->reverseBranchCondition(Cond))
+ return false;
+ }
+ if (TII->reverseBranchCondition(RevPred) ||
+ !TII->SubsumesPredicate(Cond, RevPred))
+ return false;
+ }
+
+ return true;
+}
+
+/// Analyze the structure of the sub-CFG starting from the specified block.
+/// Record its successors and whether it looks like an if-conversion candidate.
+void IfConverter::AnalyzeBlock(
+ MachineBasicBlock &MBB, std::vector<std::unique_ptr<IfcvtToken>> &Tokens) {
+ struct BBState {
+ BBState(MachineBasicBlock &MBB) : MBB(&MBB) {}
+ MachineBasicBlock *MBB;
+
+ /// This flag is true if MBB's successors have been analyzed.
+ bool SuccsAnalyzed = false;
+ };
+
+ // Push MBB to the stack.
+ SmallVector<BBState, 16> BBStack(1, MBB);
+
+ while (!BBStack.empty()) {
+ BBState &State = BBStack.back();
+ MachineBasicBlock *BB = State.MBB;
+ BBInfo &BBI = BBAnalysis[BB->getNumber()];
+
+ if (!State.SuccsAnalyzed) {
+ if (BBI.IsAnalyzed || BBI.IsBeingAnalyzed) {
+ BBStack.pop_back();
+ continue;
+ }
+
+ BBI.BB = BB;
+ BBI.IsBeingAnalyzed = true;
+
+ AnalyzeBranches(BBI);
+ MachineBasicBlock::iterator Begin = BBI.BB->begin();
+ MachineBasicBlock::iterator End = BBI.BB->end();
+ ScanInstructions(BBI, Begin, End);
+
+      // Unanalyzable, or ends with a fallthrough or an unconditional branch,
+      // or is no longer considered for ifcvt.
+ if (!BBI.IsBrAnalyzable || BBI.BrCond.empty() || BBI.IsDone) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ continue;
+ }
+
+ // Do not ifcvt if either path is a back edge to the entry block.
+ if (BBI.TrueBB == BB || BBI.FalseBB == BB) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ continue;
+ }
+
+ // Do not ifcvt if true and false fallthrough blocks are the same.
+ if (!BBI.FalseBB) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ continue;
+ }
+
+ // Push the False and True blocks to the stack.
+ State.SuccsAnalyzed = true;
+ BBStack.push_back(*BBI.FalseBB);
+ BBStack.push_back(*BBI.TrueBB);
+ continue;
+ }
+
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+
+ if (TrueBBI.IsDone && FalseBBI.IsDone) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ continue;
+ }
+
+ SmallVector<MachineOperand, 4>
+ RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+ bool CanRevCond = !TII->reverseBranchCondition(RevCond);
+
+ unsigned Dups = 0;
+ unsigned Dups2 = 0;
+ bool TNeedSub = !TrueBBI.Predicate.empty();
+ bool FNeedSub = !FalseBBI.Predicate.empty();
+ bool Enqueued = false;
+
+ BranchProbability Prediction = MBPI->getEdgeProbability(BB, TrueBBI.BB);
+
+ if (CanRevCond) {
+ BBInfo TrueBBICalc, FalseBBICalc;
+ auto feasibleDiamond = [&](bool Forked) {
+ bool MeetsSize = MeetIfcvtSizeLimit(TrueBBICalc, FalseBBICalc, *BB,
+ Dups + Dups2, Prediction, Forked);
+ bool TrueFeasible = FeasibilityAnalysis(TrueBBI, BBI.BrCond,
+ /* IsTriangle */ false, /* RevCond */ false,
+ /* hasCommonTail */ true);
+ bool FalseFeasible = FeasibilityAnalysis(FalseBBI, RevCond,
+ /* IsTriangle */ false, /* RevCond */ false,
+ /* hasCommonTail */ true);
+ return MeetsSize && TrueFeasible && FalseFeasible;
+ };
+
+ if (ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2,
+ TrueBBICalc, FalseBBICalc)) {
+ if (feasibleDiamond(false)) {
+ // Diamond:
+ // EBB
+ // / \_
+ // | |
+ // TBB FBB
+ // \ /
+ // TailBB
+ // Note TailBB can be empty.
+ Tokens.push_back(std::make_unique<IfcvtToken>(
+ BBI, ICDiamond, TNeedSub | FNeedSub, Dups, Dups2,
+ (bool) TrueBBICalc.ClobbersPred, (bool) FalseBBICalc.ClobbersPred));
+ Enqueued = true;
+ }
+ } else if (ValidForkedDiamond(TrueBBI, FalseBBI, Dups, Dups2,
+ TrueBBICalc, FalseBBICalc)) {
+ if (feasibleDiamond(true)) {
+ // ForkedDiamond:
+ // if TBB and FBB have a common tail that includes their conditional
+ // branch instructions, then we can If Convert this pattern.
+ // EBB
+ // _/ \_
+ // | |
+ // TBB FBB
+ // / \ / \
+ // FalseBB TrueBB FalseBB
+ //
+ Tokens.push_back(std::make_unique<IfcvtToken>(
+ BBI, ICForkedDiamond, TNeedSub | FNeedSub, Dups, Dups2,
+ (bool) TrueBBICalc.ClobbersPred, (bool) FalseBBICalc.ClobbersPred));
+ Enqueued = true;
+ }
+ }
+ }
+
+ if (ValidTriangle(TrueBBI, FalseBBI, false, Dups, Prediction) &&
+ MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+ TrueBBI.ExtraCost2, Prediction) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond, true)) {
+ // Triangle:
+ // EBB
+ // | \_
+ // | |
+ // | TBB
+ // | /
+ // FBB
+ Tokens.push_back(
+ std::make_unique<IfcvtToken>(BBI, ICTriangle, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidTriangle(TrueBBI, FalseBBI, true, Dups, Prediction) &&
+ MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+ TrueBBI.ExtraCost2, Prediction) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) {
+ Tokens.push_back(
+ std::make_unique<IfcvtToken>(BBI, ICTriangleRev, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidSimple(TrueBBI, Dups, Prediction) &&
+ MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+ TrueBBI.ExtraCost2, Prediction) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond)) {
+ // Simple (split, no rejoin):
+ // EBB
+ // | \_
+ // | |
+ // | TBB---> exit
+ // |
+ // FBB
+ Tokens.push_back(
+ std::make_unique<IfcvtToken>(BBI, ICSimple, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (CanRevCond) {
+ // Try the other path...
+ if (ValidTriangle(FalseBBI, TrueBBI, false, Dups,
+ Prediction.getCompl()) &&
+ MeetIfcvtSizeLimit(*FalseBBI.BB,
+ FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+ FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+ FeasibilityAnalysis(FalseBBI, RevCond, true)) {
+ Tokens.push_back(std::make_unique<IfcvtToken>(BBI, ICTriangleFalse,
+ FNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidTriangle(FalseBBI, TrueBBI, true, Dups,
+ Prediction.getCompl()) &&
+ MeetIfcvtSizeLimit(*FalseBBI.BB,
+ FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+ FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+ FeasibilityAnalysis(FalseBBI, RevCond, true, true)) {
+ Tokens.push_back(
+ std::make_unique<IfcvtToken>(BBI, ICTriangleFRev, FNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidSimple(FalseBBI, Dups, Prediction.getCompl()) &&
+ MeetIfcvtSizeLimit(*FalseBBI.BB,
+ FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+ FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+ FeasibilityAnalysis(FalseBBI, RevCond)) {
+ Tokens.push_back(
+ std::make_unique<IfcvtToken>(BBI, ICSimpleFalse, FNeedSub, Dups));
+ Enqueued = true;
+ }
+ }
+
+ BBI.IsEnqueued = Enqueued;
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ }
+}
+
+/// Analyze all blocks and find entries for all if-conversion candidates.
+void IfConverter::AnalyzeBlocks(
+ MachineFunction &MF, std::vector<std::unique_ptr<IfcvtToken>> &Tokens) {
+ for (MachineBasicBlock &MBB : MF)
+ AnalyzeBlock(MBB, Tokens);
+
+ // Sort to favor more complex ifcvt scheme.
+ llvm::stable_sort(Tokens, IfcvtTokenCmp);
+}
+
+/// Returns true if MBB can fall through to ToMBB: any intervening blocks must
+/// be empty, and each block along the way (including MBB) must have the next
+/// one as a CFG successor.
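+/// For example, with the layout MBB, E1, E2, ToMBB this returns true when E1
+/// and E2 are empty and MBB->E1, E1->E2 and E2->ToMBB are all CFG edges.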
+static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) {
+ MachineFunction::iterator PI = MBB.getIterator();
+ MachineFunction::iterator I = std::next(PI);
+ MachineFunction::iterator TI = ToMBB.getIterator();
+ MachineFunction::iterator E = MBB.getParent()->end();
+ while (I != TI) {
+    // Check isSuccessor to avoid the case where the next block is empty, but
+    // it's not a successor.
+ if (I == E || !I->empty() || !PI->isSuccessor(&*I))
+ return false;
+ PI = I++;
+ }
+ // Finally see if the last I is indeed a successor to PI.
+ return PI->isSuccessor(&*I);
+}
+
+/// Invalidate predecessor BB info so it would be re-analyzed to determine if it
+/// can be if-converted. If predecessor is already enqueued, dequeue it!
+void IfConverter::InvalidatePreds(MachineBasicBlock &MBB) {
+ for (const MachineBasicBlock *Predecessor : MBB.predecessors()) {
+ BBInfo &PBBI = BBAnalysis[Predecessor->getNumber()];
+ if (PBBI.IsDone || PBBI.BB == &MBB)
+ continue;
+ PBBI.IsAnalyzed = false;
+ PBBI.IsEnqueued = false;
+ }
+}
+
+/// Inserts an unconditional branch from \p MBB to \p ToMBB.
+static void InsertUncondBranch(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB,
+ const TargetInstrInfo *TII) {
+ DebugLoc dl; // FIXME: this is nowhere
+ SmallVector<MachineOperand, 0> NoCond;
+ TII->insertBranch(MBB, &ToMBB, nullptr, NoCond, dl);
+}
+
+/// Behaves like LiveRegUnits::StepForward() but also adds implicit uses to all
+/// values defined in MI which are also live/used by MI.
+static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) {
+ const TargetRegisterInfo *TRI = MI.getMF()->getSubtarget().getRegisterInfo();
+
+ // Before stepping forward past MI, remember which regs were live
+ // before MI. This is needed to set the Undef flag only when reg is
+ // dead.
+ SparseSet<MCPhysReg, identity<MCPhysReg>> LiveBeforeMI;
+ LiveBeforeMI.setUniverse(TRI->getNumRegs());
+ for (unsigned Reg : Redefs)
+ LiveBeforeMI.insert(Reg);
+
+ SmallVector<std::pair<MCPhysReg, const MachineOperand*>, 4> Clobbers;
+ Redefs.stepForward(MI, Clobbers);
+
+ // Now add the implicit uses for each of the clobbered values.
+ for (auto Clobber : Clobbers) {
+ // FIXME: Const cast here is nasty, but better than making StepForward
+ // take a mutable instruction instead of const.
+ unsigned Reg = Clobber.first;
+ MachineOperand &Op = const_cast<MachineOperand&>(*Clobber.second);
+ MachineInstr *OpMI = Op.getParent();
+ MachineInstrBuilder MIB(*OpMI->getMF(), OpMI);
+ if (Op.isRegMask()) {
+ // First handle regmasks. They clobber any entries in the mask which
+ // means that we need a def for those registers.
+ if (LiveBeforeMI.count(Reg))
+ MIB.addReg(Reg, RegState::Implicit);
+
+ // We also need to add an implicit def of this register for the later
+ // use to read from.
+ // For the register allocator to have allocated a register clobbered
+ // by the call which is used later, it must be the case that
+ // the call doesn't return.
+ MIB.addReg(Reg, RegState::Implicit | RegState::Define);
+ continue;
+ }
+ if (any_of(TRI->subregs_inclusive(Reg),
+ [&](MCPhysReg S) { return LiveBeforeMI.count(S); }))
+ MIB.addReg(Reg, RegState::Implicit);
+ }
+}
+
+/// If convert a simple (split, no rejoin) sub-CFG.
+bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ BBInfo *CvtBBI = &TrueBBI;
+ BBInfo *NextBBI = &FalseBBI;
+
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (Kind == ICSimpleFalse)
+ std::swap(CvtBBI, NextBBI);
+
+ MachineBasicBlock &CvtMBB = *CvtBBI->BB;
+ MachineBasicBlock &NextMBB = *NextBBI->BB;
+ if (CvtBBI->IsDone ||
+ (CvtBBI->CannotBeCopied && CvtMBB.pred_size() > 1)) {
+ // Something has changed. It's no longer safe to predicate this block.
+ BBI.IsAnalyzed = false;
+ CvtBBI->IsAnalyzed = false;
+ return false;
+ }
+
+ if (CvtMBB.hasAddressTaken())
+ // Conservatively abort if-conversion if BB's address is taken.
+ return false;
+
+ if (Kind == ICSimpleFalse)
+ if (TII->reverseBranchCondition(Cond))
+ llvm_unreachable("Unable to reverse branch condition!");
+
+ Redefs.init(*TRI);
+
+ if (MRI->tracksLiveness()) {
+ // Initialize liveins to the first BB. These are potentially redefined by
+ // predicated instructions.
+ Redefs.addLiveInsNoPristines(CvtMBB);
+ Redefs.addLiveInsNoPristines(NextMBB);
+ }
+
+ // Remove the branches from the entry so we can add the contents of the true
+ // block to it.
+ BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+
+ if (CvtMBB.pred_size() > 1) {
+ // Copy instructions in the true block, predicate them, and add them to
+ // the entry block.
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond);
+
+ // Keep the CFG updated.
+ BBI.BB->removeSuccessor(&CvtMBB, true);
+ } else {
+ // Predicate the instructions in the true block.
+ PredicateBlock(*CvtBBI, CvtMBB.end(), Cond);
+
+ // Merge converted block into entry block. The BB to Cvt edge is removed
+ // by MergeBlocks.
+ MergeBlocks(BBI, *CvtBBI);
+ }
+
+ bool IterIfcvt = true;
+ if (!canFallThroughTo(*BBI.BB, NextMBB)) {
+ InsertUncondBranch(*BBI.BB, NextMBB, TII);
+ BBI.HasFallThrough = false;
+ // Now ifcvt'd block will look like this:
+ // BB:
+ // ...
+ // t, f = cmp
+ // if t op
+ // b BBf
+ //
+ // We cannot further ifcvt this block because the unconditional branch
+ // will have to be predicated on the new condition, that will not be
+ // available if cmp executes.
+ IterIfcvt = false;
+ }
+
+ // Update block info. BB can be iteratively if-converted.
+ if (!IterIfcvt)
+ BBI.IsDone = true;
+ InvalidatePreds(*BBI.BB);
+ CvtBBI->IsDone = true;
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+/// If convert a triangle sub-CFG.
+bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ BBInfo *CvtBBI = &TrueBBI;
+ BBInfo *NextBBI = &FalseBBI;
+ DebugLoc dl; // FIXME: this is nowhere
+
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
+ std::swap(CvtBBI, NextBBI);
+
+ MachineBasicBlock &CvtMBB = *CvtBBI->BB;
+ MachineBasicBlock &NextMBB = *NextBBI->BB;
+ if (CvtBBI->IsDone ||
+ (CvtBBI->CannotBeCopied && CvtMBB.pred_size() > 1)) {
+ // Something has changed. It's no longer safe to predicate this block.
+ BBI.IsAnalyzed = false;
+ CvtBBI->IsAnalyzed = false;
+ return false;
+ }
+
+ if (CvtMBB.hasAddressTaken())
+ // Conservatively abort if-conversion if BB's address is taken.
+ return false;
+
+ if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
+ if (TII->reverseBranchCondition(Cond))
+ llvm_unreachable("Unable to reverse branch condition!");
+
+ if (Kind == ICTriangleRev || Kind == ICTriangleFRev) {
+ if (reverseBranchCondition(*CvtBBI)) {
+ // BB has been changed, modify its predecessors (except for this
+ // one) so they don't get ifcvt'ed based on bad intel.
+ for (MachineBasicBlock *PBB : CvtMBB.predecessors()) {
+ if (PBB == BBI.BB)
+ continue;
+ BBInfo &PBBI = BBAnalysis[PBB->getNumber()];
+ if (PBBI.IsEnqueued) {
+ PBBI.IsAnalyzed = false;
+ PBBI.IsEnqueued = false;
+ }
+ }
+ }
+ }
+
+ // Initialize liveins to the first BB. These are potentially redefined by
+ // predicated instructions.
+ Redefs.init(*TRI);
+ if (MRI->tracksLiveness()) {
+ Redefs.addLiveInsNoPristines(CvtMBB);
+ Redefs.addLiveInsNoPristines(NextMBB);
+ }
+
+ bool HasEarlyExit = CvtBBI->FalseBB != nullptr;
+ BranchProbability CvtNext, CvtFalse, BBNext, BBCvt;
+
+ if (HasEarlyExit) {
+ // Get probabilities before modifying CvtMBB and BBI.BB.
+ CvtNext = MBPI->getEdgeProbability(&CvtMBB, &NextMBB);
+ CvtFalse = MBPI->getEdgeProbability(&CvtMBB, CvtBBI->FalseBB);
+ BBNext = MBPI->getEdgeProbability(BBI.BB, &NextMBB);
+ BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB);
+ }
+
+ // Remove the branches from the entry so we can add the contents of the true
+ // block to it.
+ BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+
+ if (CvtMBB.pred_size() > 1) {
+ // Copy instructions in the true block, predicate them, and add them to
+ // the entry block.
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
+ } else {
+ // Predicate the 'true' block after removing its branch.
+ CvtBBI->NonPredSize -= TII->removeBranch(CvtMBB);
+ PredicateBlock(*CvtBBI, CvtMBB.end(), Cond);
+
+ // Now merge the entry of the triangle with the true block.
+ MergeBlocks(BBI, *CvtBBI, false);
+ }
+
+ // Keep the CFG updated.
+ BBI.BB->removeSuccessor(&CvtMBB, true);
+
+ // If 'true' block has a 'false' successor, add an exit branch to it.
+ if (HasEarlyExit) {
+ SmallVector<MachineOperand, 4> RevCond(CvtBBI->BrCond.begin(),
+ CvtBBI->BrCond.end());
+ if (TII->reverseBranchCondition(RevCond))
+ llvm_unreachable("Unable to reverse branch condition!");
+
+ // Update the edge probability for both CvtBBI->FalseBB and NextBBI.
+ // NewNext = New_Prob(BBI.BB, NextMBB) =
+ // Prob(BBI.BB, NextMBB) +
+ // Prob(BBI.BB, CvtMBB) * Prob(CvtMBB, NextMBB)
+ // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) =
+ // Prob(BBI.BB, CvtMBB) * Prob(CvtMBB, CvtBBI->FalseBB)
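+    // For example, if Prob(BBI.BB, NextMBB) = 0.3, Prob(BBI.BB, CvtMBB) = 0.7,
+    // Prob(CvtMBB, NextMBB) = 0.6 and Prob(CvtMBB, CvtBBI->FalseBB) = 0.4,
+    // then NewNext = 0.3 + 0.7 * 0.6 = 0.72 and NewFalse = 0.7 * 0.4 = 0.28,
+    // which still sum to one.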
+ auto NewTrueBB = getNextBlock(*BBI.BB);
+ auto NewNext = BBNext + BBCvt * CvtNext;
+ auto NewTrueBBIter = find(BBI.BB->successors(), NewTrueBB);
+ if (NewTrueBBIter != BBI.BB->succ_end())
+ BBI.BB->setSuccProbability(NewTrueBBIter, NewNext);
+
+ auto NewFalse = BBCvt * CvtFalse;
+ TII->insertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl);
+ BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse);
+ }
+
+ // Merge in the 'false' block if the 'false' block has no other
+ // predecessors. Otherwise, add an unconditional branch to 'false'.
+ bool FalseBBDead = false;
+ bool IterIfcvt = true;
+ bool isFallThrough = canFallThroughTo(*BBI.BB, NextMBB);
+ if (!isFallThrough) {
+ // Only merge them if the true block does not fallthrough to the false
+ // block. By not merging them, we make it possible to iteratively
+ // ifcvt the blocks.
+ if (!HasEarlyExit &&
+ NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough &&
+ !NextMBB.hasAddressTaken()) {
+ MergeBlocks(BBI, *NextBBI);
+ FalseBBDead = true;
+ } else {
+ InsertUncondBranch(*BBI.BB, NextMBB, TII);
+ BBI.HasFallThrough = false;
+ }
+ // Mixed predicated and unpredicated code. This cannot be iteratively
+ // predicated.
+ IterIfcvt = false;
+ }
+
+ // Update block info. BB can be iteratively if-converted.
+ if (!IterIfcvt)
+ BBI.IsDone = true;
+ InvalidatePreds(*BBI.BB);
+ CvtBBI->IsDone = true;
+ if (FalseBBDead)
+ NextBBI->IsDone = true;
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+/// Common code shared between diamond conversions.
+/// \p BBI, \p TrueBBI, and \p FalseBBI form the diamond shape.
+/// \p NumDups1 - number of shared instructions at the beginning of \p TrueBBI
+/// and FalseBBI
+/// \p NumDups2 - number of shared instructions at the end of \p TrueBBI
+/// and \p FalseBBI
+/// \p RemoveBranch - Remove the common branch of the two blocks before
+/// predicating. Only false for unanalyzable fallthrough
+/// cases. The caller will replace the branch if necessary.
+/// \p MergeAddEdges - Add successor edges when merging blocks. Only false for
+/// unanalyzable fallthrough
+bool IfConverter::IfConvertDiamondCommon(
+ BBInfo &BBI, BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned NumDups1, unsigned NumDups2,
+ bool TClobbersPred, bool FClobbersPred,
+ bool RemoveBranch, bool MergeAddEdges) {
+
+ if (TrueBBI.IsDone || FalseBBI.IsDone ||
+ TrueBBI.BB->pred_size() > 1 || FalseBBI.BB->pred_size() > 1) {
+ // Something has changed. It's no longer safe to predicate these blocks.
+ BBI.IsAnalyzed = false;
+ TrueBBI.IsAnalyzed = false;
+ FalseBBI.IsAnalyzed = false;
+ return false;
+ }
+
+ if (TrueBBI.BB->hasAddressTaken() || FalseBBI.BB->hasAddressTaken())
+ // Conservatively abort if-conversion if either BB has its address taken.
+ return false;
+
+ // Put the predicated instructions from the 'true' block before the
+ // instructions from the 'false' block, unless the true block would clobber
+ // the predicate, in which case, do the opposite.
+ BBInfo *BBI1 = &TrueBBI;
+ BBInfo *BBI2 = &FalseBBI;
+ SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (TII->reverseBranchCondition(RevCond))
+ llvm_unreachable("Unable to reverse branch condition!");
+ SmallVector<MachineOperand, 4> *Cond1 = &BBI.BrCond;
+ SmallVector<MachineOperand, 4> *Cond2 = &RevCond;
+
+ // Figure out the more profitable ordering.
+ bool DoSwap = false;
+ if (TClobbersPred && !FClobbersPred)
+ DoSwap = true;
+ else if (!TClobbersPred && !FClobbersPred) {
+ if (TrueBBI.NonPredSize > FalseBBI.NonPredSize)
+ DoSwap = true;
+ } else if (TClobbersPred && FClobbersPred)
+ llvm_unreachable("Predicate info cannot be clobbered by both sides.");
+ if (DoSwap) {
+ std::swap(BBI1, BBI2);
+ std::swap(Cond1, Cond2);
+ }
+
+ // Remove the conditional branch from entry to the blocks.
+ BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+
+ MachineBasicBlock &MBB1 = *BBI1->BB;
+ MachineBasicBlock &MBB2 = *BBI2->BB;
+
+ // Initialize the Redefs:
+ // - BB2 live-in regs need implicit uses before being redefined by BB1
+ // instructions.
+ // - BB1 live-out regs need implicit uses before being redefined by BB2
+ // instructions. We start with BB1 live-ins so we have the live-out regs
+ // after tracking the BB1 instructions.
+ Redefs.init(*TRI);
+ if (MRI->tracksLiveness()) {
+ Redefs.addLiveInsNoPristines(MBB1);
+ Redefs.addLiveInsNoPristines(MBB2);
+ }
+
+ // Remove the duplicated instructions at the beginnings of both paths.
+ // Skip dbg_value instructions.
+ MachineBasicBlock::iterator DI1 = MBB1.getFirstNonDebugInstr(false);
+ MachineBasicBlock::iterator DI2 = MBB2.getFirstNonDebugInstr(false);
+ BBI1->NonPredSize -= NumDups1;
+ BBI2->NonPredSize -= NumDups1;
+
+ // Skip past the dups on each side separately since there may be
+ // differing dbg_value entries. NumDups1 can include a "return"
+ // instruction, if it's not marked as "branch".
+ for (unsigned i = 0; i < NumDups1; ++DI1) {
+ if (DI1 == MBB1.end())
+ break;
+ if (!DI1->isDebugInstr())
+ ++i;
+ }
+ while (NumDups1 != 0) {
+ // Since this instruction is going to be deleted, update call
+ // site info state if the instruction is call instruction.
+ if (DI2->shouldUpdateCallSiteInfo())
+ MBB2.getParent()->eraseCallSiteInfo(&*DI2);
+
+ ++DI2;
+ if (DI2 == MBB2.end())
+ break;
+ if (!DI2->isDebugInstr())
+ --NumDups1;
+ }
+
+ if (MRI->tracksLiveness()) {
+ for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) {
+ SmallVector<std::pair<MCPhysReg, const MachineOperand*>, 4> Dummy;
+ Redefs.stepForward(MI, Dummy);
+ }
+ }
+
+ BBI.BB->splice(BBI.BB->end(), &MBB1, MBB1.begin(), DI1);
+ MBB2.erase(MBB2.begin(), DI2);
+
+ // The branches have been checked to match, so it is safe to remove the
+ // branch in BB1 and rely on the copy in BB2. The complication is that
+ // the blocks may end with a return instruction, which may or may not
+ // be marked as "branch". If it's not, then it could be included in
+ // "dups1", leaving the blocks potentially empty after moving the common
+ // duplicates.
+#ifndef NDEBUG
+ // Unanalyzable branches must match exactly. Check that now.
+ if (!BBI1->IsBrAnalyzable)
+ verifySameBranchInstructions(&MBB1, &MBB2);
+#endif
+ // Remove duplicated instructions from the tail of MBB1: any branch
+ // instructions, and the common instructions counted by NumDups2.
+ DI1 = MBB1.end();
+ while (DI1 != MBB1.begin()) {
+ MachineBasicBlock::iterator Prev = std::prev(DI1);
+ if (!Prev->isBranch() && !Prev->isDebugInstr())
+ break;
+ DI1 = Prev;
+ }
+ for (unsigned i = 0; i != NumDups2; ) {
+ // NumDups2 only counted non-dbg_value instructions, so this won't
+ // run off the head of the list.
+ assert(DI1 != MBB1.begin());
+
+ --DI1;
+
+ // Since this instruction is going to be deleted, update call
+ // site info state if the instruction is call instruction.
+ if (DI1->shouldUpdateCallSiteInfo())
+ MBB1.getParent()->eraseCallSiteInfo(&*DI1);
+
+ // skip dbg_value instructions
+ if (!DI1->isDebugInstr())
+ ++i;
+ }
+ MBB1.erase(DI1, MBB1.end());
+
+ DI2 = BBI2->BB->end();
+ // The branches have been checked to match. Skip over the branch in the false
+ // block so that we don't try to predicate it.
+ if (RemoveBranch)
+ BBI2->NonPredSize -= TII->removeBranch(*BBI2->BB);
+ else {
+ // Make DI2 point to the end of the range where the common "tail"
+ // instructions could be found.
+ while (DI2 != MBB2.begin()) {
+ MachineBasicBlock::iterator Prev = std::prev(DI2);
+ if (!Prev->isBranch() && !Prev->isDebugInstr())
+ break;
+ DI2 = Prev;
+ }
+ }
+ while (NumDups2 != 0) {
+ // NumDups2 only counted non-dbg_value instructions, so this won't
+ // run off the head of the list.
+ assert(DI2 != MBB2.begin());
+ --DI2;
+ // skip dbg_value instructions
+ if (!DI2->isDebugInstr())
+ --NumDups2;
+ }
+
+ // Remember which registers would later be defined by the false block.
+ // This allows us not to predicate instructions in the true block that would
+ // later be re-defined. That is, rather than
+ // subeq r0, r1, #1
+ // addne r0, r1, #1
+ // generate:
+ // sub r0, r1, #1
+ // addne r0, r1, #1
+ SmallSet<MCPhysReg, 4> RedefsByFalse;
+ SmallSet<MCPhysReg, 4> ExtUses;
+ if (TII->isProfitableToUnpredicate(MBB1, MBB2)) {
+ for (const MachineInstr &FI : make_range(MBB2.begin(), DI2)) {
+ if (FI.isDebugInstr())
+ continue;
+ SmallVector<MCPhysReg, 4> Defs;
+ for (const MachineOperand &MO : FI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef()) {
+ Defs.push_back(Reg);
+ } else if (!RedefsByFalse.count(Reg)) {
+          // These are defined before control flow reaches the 'false'
+          // instructions. They cannot be modified by the 'true' instructions.
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
+ ExtUses.insert(SubReg);
+ }
+ }
+
+ for (MCPhysReg Reg : Defs) {
+ if (!ExtUses.count(Reg)) {
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
+ RedefsByFalse.insert(SubReg);
+ }
+ }
+ }
+ }
+
+ // Predicate the 'true' block.
+ PredicateBlock(*BBI1, MBB1.end(), *Cond1, &RedefsByFalse);
+
+ // After predicating BBI1, if there is a predicated terminator in BBI1 and
+ // a non-predicated in BBI2, then we don't want to predicate the one from
+ // BBI2. The reason is that if we merged these blocks, we would end up with
+ // two predicated terminators in the same block.
+ // Also, if the branches in MBB1 and MBB2 were non-analyzable, then don't
+ // predicate them either. They were checked to be identical, and so the
+ // same branch would happen regardless of which path was taken.
+ if (!MBB2.empty() && (DI2 == MBB2.end())) {
+ MachineBasicBlock::iterator BBI1T = MBB1.getFirstTerminator();
+ MachineBasicBlock::iterator BBI2T = MBB2.getFirstTerminator();
+ bool BB1Predicated = BBI1T != MBB1.end() && TII->isPredicated(*BBI1T);
+ bool BB2NonPredicated = BBI2T != MBB2.end() && !TII->isPredicated(*BBI2T);
+ if (BB2NonPredicated && (BB1Predicated || !BBI2->IsBrAnalyzable))
+ --DI2;
+ }
+
+ // Predicate the 'false' block.
+ PredicateBlock(*BBI2, DI2, *Cond2);
+
+ // Merge the true block into the entry of the diamond.
+ MergeBlocks(BBI, *BBI1, MergeAddEdges);
+ MergeBlocks(BBI, *BBI2, MergeAddEdges);
+ return true;
+}
+
+/// If convert an almost-diamond sub-CFG where the true
+/// and false blocks share a common tail.
+bool IfConverter::IfConvertForkedDiamond(
+ BBInfo &BBI, IfcvtKind Kind,
+ unsigned NumDups1, unsigned NumDups2,
+ bool TClobbersPred, bool FClobbersPred) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+
+ // Save the debug location for later.
+ DebugLoc dl;
+ MachineBasicBlock::iterator TIE = TrueBBI.BB->getFirstTerminator();
+ if (TIE != TrueBBI.BB->end())
+ dl = TIE->getDebugLoc();
+ // Removing branches from both blocks is safe, because we have already
+ // determined that both blocks have the same branch instructions. The branch
+ // will be added back at the end, unpredicated.
+ if (!IfConvertDiamondCommon(
+ BBI, TrueBBI, FalseBBI,
+ NumDups1, NumDups2,
+ TClobbersPred, FClobbersPred,
+ /* RemoveBranch */ true, /* MergeAddEdges */ true))
+ return false;
+
+ // Add back the branch.
+ // Debug location saved above when removing the branch from BBI2
+ TII->insertBranch(*BBI.BB, TrueBBI.TrueBB, TrueBBI.FalseBB,
+ TrueBBI.BrCond, dl);
+
+ // Update block info.
+ BBI.IsDone = TrueBBI.IsDone = FalseBBI.IsDone = true;
+ InvalidatePreds(*BBI.BB);
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+/// If convert a diamond sub-CFG.
+bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
+ unsigned NumDups1, unsigned NumDups2,
+ bool TClobbersPred, bool FClobbersPred) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ MachineBasicBlock *TailBB = TrueBBI.TrueBB;
+
+ // True block must fall through or end with an unanalyzable terminator.
+ if (!TailBB) {
+ if (blockAlwaysFallThrough(TrueBBI))
+ TailBB = FalseBBI.TrueBB;
+ assert((TailBB || !TrueBBI.IsBrAnalyzable) && "Unexpected!");
+ }
+
+ if (!IfConvertDiamondCommon(
+ BBI, TrueBBI, FalseBBI,
+ NumDups1, NumDups2,
+ TClobbersPred, FClobbersPred,
+ /* RemoveBranch */ TrueBBI.IsBrAnalyzable,
+ /* MergeAddEdges */ TailBB == nullptr))
+ return false;
+
+ // If the if-converted block falls through or unconditionally branches into
+ // the tail block, and the tail block does not have other predecessors, then
+ // fold the tail block in as well. Otherwise, unless it falls through to the
+  // tail, add an unconditional branch to it.
+ if (TailBB) {
+ // We need to remove the edges to the true and false blocks manually since
+ // we didn't let IfConvertDiamondCommon update the CFG.
+ BBI.BB->removeSuccessor(TrueBBI.BB);
+ BBI.BB->removeSuccessor(FalseBBI.BB, true);
+
+ BBInfo &TailBBI = BBAnalysis[TailBB->getNumber()];
+ bool CanMergeTail = !TailBBI.HasFallThrough &&
+ !TailBBI.BB->hasAddressTaken();
+ // The if-converted block can still have a predicated terminator
+ // (e.g. a predicated return). If that is the case, we cannot merge
+ // it with the tail block.
+ MachineBasicBlock::const_iterator TI = BBI.BB->getFirstTerminator();
+ if (TI != BBI.BB->end() && TII->isPredicated(*TI))
+ CanMergeTail = false;
+ // There may still be a fall-through edge from BBI1 or BBI2 to TailBB;
+ // check if there are any other predecessors besides those.
+ unsigned NumPreds = TailBB->pred_size();
+ if (NumPreds > 1)
+ CanMergeTail = false;
+ else if (NumPreds == 1 && CanMergeTail) {
+ MachineBasicBlock::pred_iterator PI = TailBB->pred_begin();
+ if (*PI != TrueBBI.BB && *PI != FalseBBI.BB)
+ CanMergeTail = false;
+ }
+ if (CanMergeTail) {
+ MergeBlocks(BBI, TailBBI);
+ TailBBI.IsDone = true;
+ } else {
+ BBI.BB->addSuccessor(TailBB, BranchProbability::getOne());
+ InsertUncondBranch(*BBI.BB, *TailBB, TII);
+ BBI.HasFallThrough = false;
+ }
+ }
+
+ // Update block info.
+ BBI.IsDone = TrueBBI.IsDone = FalseBBI.IsDone = true;
+ InvalidatePreds(*BBI.BB);
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+static bool MaySpeculate(const MachineInstr &MI,
+ SmallSet<MCPhysReg, 4> &LaterRedefs) {
+ bool SawStore = true;
+ if (!MI.isSafeToMove(nullptr, SawStore))
+ return false;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef() && !LaterRedefs.count(Reg))
+ return false;
+ }
+
+ return true;
+}
+
+/// Predicate instructions from the start of the block to the specified end with
+/// the specified condition.
+void IfConverter::PredicateBlock(BBInfo &BBI,
+ MachineBasicBlock::iterator E,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallSet<MCPhysReg, 4> *LaterRedefs) {
+ bool AnyUnpred = false;
+ bool MaySpec = LaterRedefs != nullptr;
+ for (MachineInstr &I : make_range(BBI.BB->begin(), E)) {
+ if (I.isDebugInstr() || TII->isPredicated(I))
+ continue;
+ // It may be possible not to predicate an instruction if it's the 'true'
+ // side of a diamond and the 'false' side may re-define the instruction's
+ // defs.
+ if (MaySpec && MaySpeculate(I, *LaterRedefs)) {
+ AnyUnpred = true;
+ continue;
+ }
+ // If any instruction is predicated, then every instruction after it must
+ // be predicated.
+ MaySpec = false;
+ if (!TII->PredicateInstruction(I, Cond)) {
+#ifndef NDEBUG
+ dbgs() << "Unable to predicate " << I << "!\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+
+ // If the predicated instruction now redefines a register as the result of
+ // if-conversion, add an implicit kill.
+ UpdatePredRedefs(I, Redefs);
+ }
+
+ BBI.Predicate.append(Cond.begin(), Cond.end());
+
+ BBI.IsAnalyzed = false;
+ BBI.NonPredSize = 0;
+
+ ++NumIfConvBBs;
+ if (AnyUnpred)
+ ++NumUnpred;
+}
+
+/// Copy and predicate instructions from source BB to the destination block.
+/// Skip end of block branches if IgnoreBr is true.
+void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool IgnoreBr) {
+ MachineFunction &MF = *ToBBI.BB->getParent();
+
+ MachineBasicBlock &FromMBB = *FromBBI.BB;
+ for (MachineInstr &I : FromMBB) {
+ // Do not copy the end of the block branches.
+ if (IgnoreBr && I.isBranch())
+ break;
+
+ MachineInstr *MI = MF.CloneMachineInstr(&I);
+ // Make a copy of the call site info.
+ if (I.isCandidateForCallSiteEntry())
+ MF.copyCallSiteInfo(&I, MI);
+
+ ToBBI.BB->insert(ToBBI.BB->end(), MI);
+ ToBBI.NonPredSize++;
+ unsigned ExtraPredCost = TII->getPredicationCost(I);
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
+ if (NumCycles > 1)
+ ToBBI.ExtraCost += NumCycles-1;
+ ToBBI.ExtraCost2 += ExtraPredCost;
+
+ if (!TII->isPredicated(I) && !MI->isDebugInstr()) {
+ if (!TII->PredicateInstruction(*MI, Cond)) {
+#ifndef NDEBUG
+ dbgs() << "Unable to predicate " << I << "!\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+
+ // If the predicated instruction now redefines a register as the result of
+ // if-conversion, add an implicit kill.
+ UpdatePredRedefs(*MI, Redefs);
+ }
+
+ if (!IgnoreBr) {
+ std::vector<MachineBasicBlock *> Succs(FromMBB.succ_begin(),
+ FromMBB.succ_end());
+ MachineBasicBlock *NBB = getNextBlock(FromMBB);
+ MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr;
+
+ for (MachineBasicBlock *Succ : Succs) {
+ // Fallthrough edge can't be transferred.
+ if (Succ == FallThrough)
+ continue;
+ ToBBI.BB->addSuccessor(Succ);
+ }
+ }
+
+ ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end());
+ ToBBI.Predicate.append(Cond.begin(), Cond.end());
+
+ ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
+ ToBBI.IsAnalyzed = false;
+
+ ++NumDupBBs;
+}
+
+/// Move all instructions from FromBB to the end of ToBB. This will leave
+/// FromBB as an empty block, so remove all of its successor edges and move it
+/// to the end of the function. If AddEdges is true, i.e., when FromBBI's
+/// branch is being moved, add those successor edges to ToBBI and remove the old
+/// edge from ToBBI to FromBBI.
+void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
+ MachineBasicBlock &FromMBB = *FromBBI.BB;
+ assert(!FromMBB.hasAddressTaken() &&
+ "Removing a BB whose address is taken!");
+
+ // If we're about to splice an INLINEASM_BR from FromBBI, we need to update
+ // ToBBI's successor list accordingly.
+ if (FromMBB.mayHaveInlineAsmBr())
+ for (MachineInstr &MI : FromMBB)
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ for (MachineOperand &MO : MI.operands())
+ if (MO.isMBB() && !ToBBI.BB->isSuccessor(MO.getMBB()))
+ ToBBI.BB->addSuccessor(MO.getMBB(), BranchProbability::getZero());
+
+ // In case FromMBB contains terminators (e.g. return instruction),
+ // first move the non-terminator instructions, then the terminators.
+ MachineBasicBlock::iterator FromTI = FromMBB.getFirstTerminator();
+ MachineBasicBlock::iterator ToTI = ToBBI.BB->getFirstTerminator();
+ ToBBI.BB->splice(ToTI, &FromMBB, FromMBB.begin(), FromTI);
+
+  // If FromBB has a non-predicated terminator we should copy it at the end.
+ if (FromTI != FromMBB.end() && !TII->isPredicated(*FromTI))
+ ToTI = ToBBI.BB->end();
+ ToBBI.BB->splice(ToTI, &FromMBB, FromTI, FromMBB.end());
+
+ // Force normalizing the successors' probabilities of ToBBI.BB to convert all
+ // unknown probabilities into known ones.
+ // FIXME: This usage is too tricky and in the future we would like to
+ // eliminate all unknown probabilities in MBB.
+ if (ToBBI.IsBrAnalyzable)
+ ToBBI.BB->normalizeSuccProbs();
+
+ SmallVector<MachineBasicBlock *, 4> FromSuccs(FromMBB.successors());
+ MachineBasicBlock *NBB = getNextBlock(FromMBB);
+ MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr;
+ // The edge probability from ToBBI.BB to FromMBB, which is only needed when
+ // AddEdges is true and FromMBB is a successor of ToBBI.BB.
+ auto To2FromProb = BranchProbability::getZero();
+ if (AddEdges && ToBBI.BB->isSuccessor(&FromMBB)) {
+ // Remove the old edge but remember the edge probability so we can calculate
+ // the correct weights on the new edges being added further down.
+ To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, &FromMBB);
+ ToBBI.BB->removeSuccessor(&FromMBB);
+ }
+
+ for (MachineBasicBlock *Succ : FromSuccs) {
+ // Fallthrough edge can't be transferred.
+ if (Succ == FallThrough) {
+ FromMBB.removeSuccessor(Succ);
+ continue;
+ }
+
+ auto NewProb = BranchProbability::getZero();
+ if (AddEdges) {
+ // Calculate the edge probability for the edge from ToBBI.BB to Succ,
+ // which is a portion of the edge probability from FromMBB to Succ. The
+ // portion ratio is the edge probability from ToBBI.BB to FromMBB (if
+ // FromBBI is a successor of ToBBI.BB. See comment below for exception).
+ NewProb = MBPI->getEdgeProbability(&FromMBB, Succ);
+
+ // To2FromProb is 0 when FromMBB is not a successor of ToBBI.BB. This
+ // only happens when if-converting a diamond CFG and FromMBB is the
+ // tail BB. In this case FromMBB post-dominates ToBBI.BB and hence we
+ // could just use the probabilities on FromMBB's out-edges when adding
+ // new successors.
+ if (!To2FromProb.isZero())
+ NewProb *= To2FromProb;
+ }
+
+ FromMBB.removeSuccessor(Succ);
+
+ if (AddEdges) {
+ // If the edge from ToBBI.BB to Succ already exists, update the
+ // probability of this edge by adding NewProb to it. An example is shown
+ // below, in which A is ToBBI.BB and B is FromMBB. In this case we
+ // don't have to set C as A's successor as it already is. We only need to
+ // update the edge probability on A->C. Note that B will not be
+ // immediately removed from A's successors. It is possible that B->D is
+ // not removed either if D is a fallthrough of B. Later the edge A->D
+ // (generated here) and B->D will be combined into one edge. To maintain
+ // correct edge probability of this combined edge, we need to set the edge
+ // probability of A->B to zero, which is already done above. The edge
+ // probability on A->D is calculated by scaling the original probability
+ // on A->B by the probability of B->D.
+ //
+ // Before ifcvt: After ifcvt (assume B->D is kept):
+ //
+ // A A
+ // /| /|\
+ // / B / B|
+ // | /| | ||
+ // |/ | | |/
+ // C D C D
+ //
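+      // For example, if Prob(A->B) = 0.4 and Prob(B->D) = 0.5, the new edge
+      // A->D gets probability 0.4 * 0.5 = 0.2, matching the original chance
+      // of reaching D through B.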
+ if (ToBBI.BB->isSuccessor(Succ))
+ ToBBI.BB->setSuccProbability(
+ find(ToBBI.BB->successors(), Succ),
+ MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb);
+ else
+ ToBBI.BB->addSuccessor(Succ, NewProb);
+ }
+ }
+
+ // Move the now empty FromMBB out of the way to the end of the function so
+ // it doesn't interfere with fallthrough checks done by canFallThroughTo().
+ MachineBasicBlock *Last = &*FromMBB.getParent()->rbegin();
+ if (Last != &FromMBB)
+ FromMBB.moveAfter(Last);
+
+ // Normalize the probabilities of ToBBI.BB's successors with all adjustment
+ // we've done above.
+ if (ToBBI.IsBrAnalyzable && FromBBI.IsBrAnalyzable)
+ ToBBI.BB->normalizeSuccProbs();
+
+ ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end());
+ FromBBI.Predicate.clear();
+
+ ToBBI.NonPredSize += FromBBI.NonPredSize;
+ ToBBI.ExtraCost += FromBBI.ExtraCost;
+ ToBBI.ExtraCost2 += FromBBI.ExtraCost2;
+ FromBBI.NonPredSize = 0;
+ FromBBI.ExtraCost = 0;
+ FromBBI.ExtraCost2 = 0;
+
+ ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
+ ToBBI.HasFallThrough = FromBBI.HasFallThrough;
+ ToBBI.IsAnalyzed = false;
+ FromBBI.IsAnalyzed = false;
+}
+
+FunctionPass *
+llvm::createIfConverter(std::function<bool(const MachineFunction &)> Ftor) {
+ return new IfConverter(std::move(Ftor));
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ImplicitNullChecks.cpp
new file mode 100644
index 000000000000..b2a7aad73411
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ImplicitNullChecks.cpp
@@ -0,0 +1,818 @@
+//===- ImplicitNullChecks.cpp - Fold null checks into memory accesses -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns explicit null checks of the form
+//
+// test %r10, %r10
+// je throw_npe
+// movl (%r10), %esi
+// ...
+//
+// to
+//
+// faulting_load_op("movl (%r10), %esi", throw_npe)
+// ...
+//
+// With the help of a runtime that understands the .fault_maps section,
+// faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs
+// a page fault.
+// Store and LoadStore are also supported.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/FaultMaps.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+
+static cl::opt<int> PageSize("imp-null-check-page-size",
+ cl::desc("The page size of the target in bytes"),
+ cl::init(4096), cl::Hidden);
+
+static cl::opt<unsigned> MaxInstsToConsider(
+ "imp-null-max-insts-to-consider",
+ cl::desc("The max number of instructions to consider hoisting loads over "
+ "(the algorithm is quadratic over this number)"),
+ cl::Hidden, cl::init(8));
+
+#define DEBUG_TYPE "implicit-null-checks"
+
+STATISTIC(NumImplicitNullChecks,
+ "Number of explicit null checks made implicit");
+
+namespace {
+
+class ImplicitNullChecks : public MachineFunctionPass {
+ /// Return true if \c computeDependence can process \p MI.
+ static bool canHandle(const MachineInstr *MI);
+
+ /// Helper function for \c computeDependence. Return true if \p A
+ /// and \p B do not have any dependences between them, and can be
+ /// re-ordered without changing program semantics.
+ bool canReorder(const MachineInstr *A, const MachineInstr *B);
+
+ /// A data type for representing the result computed by \c
+ /// computeDependence. States whether it is okay to reorder the
+ /// instruction passed to \c computeDependence with at most one
+ /// dependency.
+ struct DependenceResult {
+ /// Can we actually re-order \p MI with \p Insts (see \c
+ /// computeDependence).
+ bool CanReorder;
+
+ /// If not std::nullopt, an instruction in \p Insts that must also be
+ /// hoisted.
+ std::optional<ArrayRef<MachineInstr *>::iterator> PotentialDependence;
+
+ /*implicit*/ DependenceResult(
+ bool CanReorder,
+ std::optional<ArrayRef<MachineInstr *>::iterator> PotentialDependence)
+ : CanReorder(CanReorder), PotentialDependence(PotentialDependence) {
+ assert((!PotentialDependence || CanReorder) &&
+ "!CanReorder && PotentialDependence.hasValue() not allowed!");
+ }
+ };
+
+ /// Compute a result for the following question: can \p MI be
+ /// re-ordered from after \p Insts to before them.
+ ///
+ /// \c canHandle should return true for all instructions in \p
+ /// Insts.
+ DependenceResult computeDependence(const MachineInstr *MI,
+ ArrayRef<MachineInstr *> Block);
+
+ /// Represents one null check that can be made implicit.
+ class NullCheck {
+ // The memory operation the null check can be folded into.
+ MachineInstr *MemOperation;
+
+ // The instruction actually doing the null check (Ptr != 0).
+ MachineInstr *CheckOperation;
+
+ // The block the check resides in.
+ MachineBasicBlock *CheckBlock;
+
+ // The block branched to if the pointer is non-null.
+ MachineBasicBlock *NotNullSucc;
+
+ // The block branched to if the pointer is null.
+ MachineBasicBlock *NullSucc;
+
+ // If this is non-null, then MemOperation has a dependency on this
+ // instruction, which needs to be hoisted to execute before MemOperation.
+ MachineInstr *OnlyDependency;
+
+ public:
+ explicit NullCheck(MachineInstr *memOperation, MachineInstr *checkOperation,
+ MachineBasicBlock *checkBlock,
+ MachineBasicBlock *notNullSucc,
+ MachineBasicBlock *nullSucc,
+ MachineInstr *onlyDependency)
+ : MemOperation(memOperation), CheckOperation(checkOperation),
+ CheckBlock(checkBlock), NotNullSucc(notNullSucc), NullSucc(nullSucc),
+ OnlyDependency(onlyDependency) {}
+
+ MachineInstr *getMemOperation() const { return MemOperation; }
+
+ MachineInstr *getCheckOperation() const { return CheckOperation; }
+
+ MachineBasicBlock *getCheckBlock() const { return CheckBlock; }
+
+ MachineBasicBlock *getNotNullSucc() const { return NotNullSucc; }
+
+ MachineBasicBlock *getNullSucc() const { return NullSucc; }
+
+ MachineInstr *getOnlyDependency() const { return OnlyDependency; }
+ };
+
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ AliasAnalysis *AA = nullptr;
+ MachineFrameInfo *MFI = nullptr;
+
+ bool analyzeBlockForNullChecks(MachineBasicBlock &MBB,
+ SmallVectorImpl<NullCheck> &NullCheckList);
+ MachineInstr *insertFaultingInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *HandlerMBB);
+ void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList);
+
+ enum AliasResult {
+ AR_NoAlias,
+ AR_MayAlias,
+ AR_WillAliasEverything
+ };
+
+ /// Returns AR_NoAlias if \p MI memory operation does not alias with
+ /// \p PrevMI, AR_MayAlias if they may alias and AR_WillAliasEverything if
+ /// they may alias and any further memory operation may alias with \p PrevMI.
+ AliasResult areMemoryOpsAliased(const MachineInstr &MI,
+ const MachineInstr *PrevMI) const;
+
+ enum SuitabilityResult {
+ SR_Suitable,
+ SR_Unsuitable,
+ SR_Impossible
+ };
+
+ /// Return SR_Suitable if \p MI is a memory operation that can be used to
+ /// implicitly null check the value in \p PointerReg, SR_Unsuitable if
+ /// \p MI cannot be used for the null check, and SR_Impossible if there is
+ /// no point in continuing the search because no later instruction will be
+ /// usable either. \p PrevInsts is the set of instructions seen since
+ /// the explicit null check on \p PointerReg.
+ SuitabilityResult isSuitableMemoryOp(const MachineInstr &MI,
+ unsigned PointerReg,
+ ArrayRef<MachineInstr *> PrevInsts);
+
+ /// Returns true if \p DependenceMI would clobber any live-ins of the NullSucc
+ /// block if it were hoisted to the null-check block. This is used by the
+ /// caller, canHoistInst, to decide whether DependenceMI can be hoisted safely.
+ bool canDependenceHoistingClobberLiveIns(MachineInstr *DependenceMI,
+ MachineBasicBlock *NullSucc);
+
+ /// Return true if \p FaultingMI can be hoisted from after the
+ /// instructions in \p InstsSeenSoFar to before them. Set \p Dependence to a
+ /// non-null value if we also need to (and legally can) hoist a dependency.
+ bool canHoistInst(MachineInstr *FaultingMI,
+ ArrayRef<MachineInstr *> InstsSeenSoFar,
+ MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
+
+public:
+ static char ID;
+
+ ImplicitNullChecks() : MachineFunctionPass(ID) {
+ initializeImplicitNullChecksPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+};
+
+} // end anonymous namespace
+
+bool ImplicitNullChecks::canHandle(const MachineInstr *MI) {
+ if (MI->isCall() || MI->mayRaiseFPException() ||
+ MI->hasUnmodeledSideEffects())
+ return false;
+ auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); };
+ (void)IsRegMask;
+
+ assert(llvm::none_of(MI->operands(), IsRegMask) &&
+ "Calls were filtered out above!");
+
+ auto IsUnordered = [](MachineMemOperand *MMO) { return MMO->isUnordered(); };
+ return llvm::all_of(MI->memoperands(), IsUnordered);
+}
+
+ImplicitNullChecks::DependenceResult
+ImplicitNullChecks::computeDependence(const MachineInstr *MI,
+ ArrayRef<MachineInstr *> Block) {
+ assert(llvm::all_of(Block, canHandle) && "Check this first!");
+ assert(!is_contained(Block, MI) && "Block must be exclusive of MI!");
+
+ std::optional<ArrayRef<MachineInstr *>::iterator> Dep;
+
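+ // Scan the whole block: MI can be reordered past it only if it conflicts
+ // with at most one instruction, which is then recorded as the potential
+ // dependence.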
+ for (auto I = Block.begin(), E = Block.end(); I != E; ++I) {
+ if (canReorder(*I, MI))
+ continue;
+
+ if (Dep == std::nullopt) {
+ // Found one possible dependency, keep track of it.
+ Dep = I;
+ } else {
+ // We found two dependencies, so bail out.
+ return {false, std::nullopt};
+ }
+ }
+
+ return {true, Dep};
+}
+
+bool ImplicitNullChecks::canReorder(const MachineInstr *A,
+ const MachineInstr *B) {
+ assert(canHandle(A) && canHandle(B) && "Precondition!");
+
+ // canHandle makes sure that we _can_ correctly analyze the dependencies
+ // between A and B here -- for instance, we should not be dealing with heap
+ // load-store dependencies here.
+
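+ // Two instructions can be swapped as long as no register written by one
+ // overlaps a register read or written by the other.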
+ for (const auto &MOA : A->operands()) {
+ if (!(MOA.isReg() && MOA.getReg()))
+ continue;
+
+ Register RegA = MOA.getReg();
+ for (const auto &MOB : B->operands()) {
+ if (!(MOB.isReg() && MOB.getReg()))
+ continue;
+
+ Register RegB = MOB.getReg();
+
+ if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef()))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getRegInfo().getTargetRegisterInfo();
+ MFI = &MF.getFrameInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ SmallVector<NullCheck, 16> NullCheckList;
+
+ for (auto &MBB : MF)
+ analyzeBlockForNullChecks(MBB, NullCheckList);
+
+ if (!NullCheckList.empty())
+ rewriteNullChecks(NullCheckList);
+
+ return !NullCheckList.empty();
+}
+
+// Return true if any register aliasing \p Reg is live-in to \p MBB.
+static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI,
+ MachineBasicBlock *MBB, unsigned Reg) {
+ for (MCRegAliasIterator AR(Reg, TRI, /*IncludeSelf*/ true); AR.isValid();
+ ++AR)
+ if (MBB->isLiveIn(*AR))
+ return true;
+ return false;
+}
+
+ImplicitNullChecks::AliasResult
+ImplicitNullChecks::areMemoryOpsAliased(const MachineInstr &MI,
+ const MachineInstr *PrevMI) const {
+ // If it is not memory access, skip the check.
+ if (!(PrevMI->mayStore() || PrevMI->mayLoad()))
+ return AR_NoAlias;
+ // Load-Load may alias
+ if (!(MI.mayStore() || PrevMI->mayStore()))
+ return AR_NoAlias;
+ // We lost the memory operand info, so conservatively assume aliasing. If the
+ // access was a store, then there is no point in continuing, because we won't
+ // be able to check against it further.
+ if (MI.memoperands_empty())
+ return MI.mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+ if (PrevMI->memoperands_empty())
+ return PrevMI->mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+
+ for (MachineMemOperand *MMO1 : MI.memoperands()) {
+ // MMO1 should have a value because it comes from the operation we'd like to
+ // use as the implicit null check.
+ assert(MMO1->getValue() && "MMO1 should have a Value!");
+ for (MachineMemOperand *MMO2 : PrevMI->memoperands()) {
+ if (const PseudoSourceValue *PSV = MMO2->getPseudoValue()) {
+ if (PSV->mayAlias(MFI))
+ return AR_MayAlias;
+ continue;
+ }
+ if (!AA->isNoAlias(
+ MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
+ MemoryLocation::getAfter(MMO2->getValue(), MMO2->getAAInfo())))
+ return AR_MayAlias;
+ }
+ }
+ return AR_NoAlias;
+}
+
+ImplicitNullChecks::SuitabilityResult
+ImplicitNullChecks::isSuitableMemoryOp(const MachineInstr &MI,
+ unsigned PointerReg,
+ ArrayRef<MachineInstr *> PrevInsts) {
+ // Implementation restriction for faulting_op insertion
+ // TODO: This could be relaxed if we find a test case which warrants it.
+ if (MI.getDesc().getNumDefs() > 1)
+ return SR_Unsuitable;
+
+ if (!MI.mayLoadOrStore() || MI.isPredicable())
+ return SR_Unsuitable;
+ auto AM = TII->getAddrModeFromMemoryOp(MI, TRI);
+ if (!AM)
+ return SR_Unsuitable;
+ auto AddrMode = *AM;
+ const Register BaseReg = AddrMode.BaseReg, ScaledReg = AddrMode.ScaledReg;
+ int64_t Displacement = AddrMode.Displacement;
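+ // From here on the address is reasoned about in the canonical form
+ // BaseReg + Scale * ScaledReg + Displacement.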
+
+ // We need the base (or scaled index) of the memory instruction to be the
+ // same as the register on which the null check is performed (i.e. PointerReg).
+ if (BaseReg != PointerReg && ScaledReg != PointerReg)
+ return SR_Unsuitable;
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ unsigned PointerRegSizeInBits = TRI->getRegSizeInBits(PointerReg, MRI);
+ // Bail out if the sizes of BaseReg, ScaledReg and PointerReg are not the
+ // same.
+ if ((BaseReg &&
+ TRI->getRegSizeInBits(BaseReg, MRI) != PointerRegSizeInBits) ||
+ (ScaledReg &&
+ TRI->getRegSizeInBits(ScaledReg, MRI) != PointerRegSizeInBits))
+ return SR_Unsuitable;
+
+ // Returns true if RegUsedInAddr holds a constant that can be folded into the
+ // displacement for this addressing mode; if so, Displacement is updated.
+ auto CalculateDisplacementFromAddrMode = [&](Register RegUsedInAddr,
+ int64_t Multiplier) {
+ // The register can be NoRegister, which is defined as zero for all targets.
+ // Consider instruction of interest as `movq 8(,%rdi,8), %rax`. Here the
+ // ScaledReg is %rdi, while there is no BaseReg.
+ if (!RegUsedInAddr)
+ return false;
+ assert(Multiplier && "expected to be non-zero!");
+ MachineInstr *ModifyingMI = nullptr;
+ for (auto It = std::next(MachineBasicBlock::const_reverse_iterator(&MI));
+ It != MI.getParent()->rend(); It++) {
+ const MachineInstr *CurrMI = &*It;
+ if (CurrMI->modifiesRegister(RegUsedInAddr, TRI)) {
+ ModifyingMI = const_cast<MachineInstr *>(CurrMI);
+ break;
+ }
+ }
+ if (!ModifyingMI)
+ return false;
+ // Check for the constant value defined in the register by ModifyingMI. This
+ // means all previous values for that register have been invalidated.
+ int64_t ImmVal;
+ if (!TII->getConstValDefinedInReg(*ModifyingMI, RegUsedInAddr, ImmVal))
+ return false;
+ // Calculate the reg size in bits, since this is needed for bailing out in
+ // case of overflow.
+ int32_t RegSizeInBits = TRI->getRegSizeInBits(RegUsedInAddr, MRI);
+ APInt ImmValC(RegSizeInBits, ImmVal, true /*IsSigned*/);
+ APInt MultiplierC(RegSizeInBits, Multiplier);
+ assert(MultiplierC.isStrictlyPositive() &&
+ "expected to be a positive value!");
+ bool IsOverflow;
+ // Sign of the product depends on the sign of the ImmVal, since Multiplier
+ // is always positive.
+ APInt Product = ImmValC.smul_ov(MultiplierC, IsOverflow);
+ if (IsOverflow)
+ return false;
+ APInt DisplacementC(64, Displacement, true /*isSigned*/);
+ DisplacementC = Product.sadd_ov(DisplacementC, IsOverflow);
+ if (IsOverflow)
+ return false;
+
+ // We only handle displacements up to 64 bits wide.
+ if (DisplacementC.getActiveBits() > 64)
+ return false;
+ Displacement = DisplacementC.getSExtValue();
+ return true;
+ };
+
+ // If a register used in the address is constant, fold its effect into the
+ // displacement for ease of analysis.
+ bool BaseRegIsConstVal = false, ScaledRegIsConstVal = false;
+ if (CalculateDisplacementFromAddrMode(BaseReg, 1))
+ BaseRegIsConstVal = true;
+ if (CalculateDisplacementFromAddrMode(ScaledReg, AddrMode.Scale))
+ ScaledRegIsConstVal = true;
+
+ // Any register in the address other than the null-checked one must have been
+ // folded into the Displacement above; otherwise we do not know whether the
+ // displacement contains symbolic values.
+ // This matters because we do not want to incorrectly assume that the load
+ // falls in the zeroth faulting page in the "sane offset check" below.
+ if ((BaseReg && BaseReg != PointerReg && !BaseRegIsConstVal) ||
+ (ScaledReg && ScaledReg != PointerReg && !ScaledRegIsConstVal))
+ return SR_Unsuitable;
+
+ // We want the mem access to be issued at a sane offset from PointerReg,
+ // so that if PointerReg is null then the access reliably page faults.
+ if (!(-PageSize < Displacement && Displacement < PageSize))
+ return SR_Unsuitable;
+
+ // Finally, check whether the current memory access aliases with previous ones.
+ for (auto *PrevMI : PrevInsts) {
+ AliasResult AR = areMemoryOpsAliased(MI, PrevMI);
+ if (AR == AR_WillAliasEverything)
+ return SR_Impossible;
+ if (AR == AR_MayAlias)
+ return SR_Unsuitable;
+ }
+ return SR_Suitable;
+}
+
+bool ImplicitNullChecks::canDependenceHoistingClobberLiveIns(
+ MachineInstr *DependenceMI, MachineBasicBlock *NullSucc) {
+ for (const auto &DependenceMO : DependenceMI->operands()) {
+ if (!(DependenceMO.isReg() && DependenceMO.getReg()))
+ continue;
+
+ // Make sure that we won't clobber any live ins to the sibling block by
+ // hoisting Dependency. For instance, we can't hoist INST to before the
+ // null check (even if it safe, and does not violate any dependencies in
+ // the non_null_block) if %rdx is live in to _null_block.
+ //
+ // test %rcx, %rcx
+ // je _null_block
+ // _non_null_block:
+ // %rdx = INST
+ // ...
+ //
+ // This restriction does not apply to the faulting load itself: if the
+ // pointer being loaded from is in the null page, the load does not
+ // semantically execute and so does not affect machine state. That is, if the
+ // load was loading into %rax and it faults, the value of %rax should stay the
+ // same as it would have been had the load not executed and we'd have
+ // branched to NullSucc directly.
+ if (AnyAliasLiveIn(TRI, NullSucc, DependenceMO.getReg()))
+ return true;
+
+ }
+
+ // The dependence does not clobber live-ins in NullSucc block.
+ return false;
+}
+
+bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI,
+ ArrayRef<MachineInstr *> InstsSeenSoFar,
+ MachineBasicBlock *NullSucc,
+ MachineInstr *&Dependence) {
+ auto DepResult = computeDependence(FaultingMI, InstsSeenSoFar);
+ if (!DepResult.CanReorder)
+ return false;
+
+ if (!DepResult.PotentialDependence) {
+ Dependence = nullptr;
+ return true;
+ }
+
+ auto DependenceItr = *DepResult.PotentialDependence;
+ auto *DependenceMI = *DependenceItr;
+
+ // We don't want to reason about speculating loads. Note -- at this point
+ // we should have already filtered out all of the other non-speculatable
+ // things, like calls and stores.
+ // We also do not want to hoist stores, because doing so could change memory
+ // even though the FaultingMI may still fault.
+ assert(canHandle(DependenceMI) && "Should never have reached here!");
+ if (DependenceMI->mayLoadOrStore())
+ return false;
+
+ if (canDependenceHoistingClobberLiveIns(DependenceMI, NullSucc))
+ return false;
+
+ auto DepDepResult =
+ computeDependence(DependenceMI, {InstsSeenSoFar.begin(), DependenceItr});
+
+ if (!DepDepResult.CanReorder || DepDepResult.PotentialDependence)
+ return false;
+
+ Dependence = DependenceMI;
+ return true;
+}
+
+/// Analyze MBB to check if its terminating branch can be turned into an
+/// implicit null check. If yes, append a description of the said null check to
+/// NullCheckList and return true, else return false.
+bool ImplicitNullChecks::analyzeBlockForNullChecks(
+ MachineBasicBlock &MBB, SmallVectorImpl<NullCheck> &NullCheckList) {
+ using MachineBranchPredicate = TargetInstrInfo::MachineBranchPredicate;
+
+ MDNode *BranchMD = nullptr;
+ if (auto *BB = MBB.getBasicBlock())
+ BranchMD = BB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit);
+
+ if (!BranchMD)
+ return false;
+
+ MachineBranchPredicate MBP;
+
+ if (TII->analyzeBranchPredicate(MBB, MBP, true))
+ return false;
+
+ // Is the predicate comparing an integer to zero?
+ if (!(MBP.LHS.isReg() && MBP.RHS.isImm() && MBP.RHS.getImm() == 0 &&
+ (MBP.Predicate == MachineBranchPredicate::PRED_NE ||
+ MBP.Predicate == MachineBranchPredicate::PRED_EQ)))
+ return false;
+
+ // If there is a separate condition-generating instruction, we choose not to
+ // transform unless we can remove both the condition and the consuming branch.
+ if (MBP.ConditionDef && !MBP.SingleUseCondition)
+ return false;
+
+ MachineBasicBlock *NotNullSucc, *NullSucc;
+
+ if (MBP.Predicate == MachineBranchPredicate::PRED_NE) {
+ NotNullSucc = MBP.TrueDest;
+ NullSucc = MBP.FalseDest;
+ } else {
+ NotNullSucc = MBP.FalseDest;
+ NullSucc = MBP.TrueDest;
+ }
+
+ // We handle the simplest case for now. We can potentially do better by using
+ // the machine dominator tree.
+ if (NotNullSucc->pred_size() != 1)
+ return false;
+
+ const Register PointerReg = MBP.LHS.getReg();
+
+ if (MBP.ConditionDef) {
+ // To prevent the invalid transformation of the following code:
+ //
+ // mov %rax, %rcx
+ // test %rax, %rax
+ // %rax = ...
+ // je throw_npe
+ // mov(%rcx), %r9
+ // mov(%rax), %r10
+ //
+ // into:
+ //
+ // mov %rax, %rcx
+ // %rax = ....
+ // faulting_load_op("movl (%rax), %r10", throw_npe)
+ // mov(%rcx), %r9
+ //
+ // we must ensure that there are no instructions between the 'test' and
+ // conditional jump that modify %rax.
+ assert(MBP.ConditionDef->getParent() == &MBB &&
+ "Should be in basic block");
+
+ for (auto I = MBB.rbegin(); MBP.ConditionDef != &*I; ++I)
+ if (I->modifiesRegister(PointerReg, TRI))
+ return false;
+ }
+ // Starting with a code fragment like:
+ //
+ // test %rax, %rax
+ // jne LblNotNull
+ //
+ // LblNull:
+ // callq throw_NullPointerException
+ //
+ // LblNotNull:
+ // Inst0
+ // Inst1
+ // ...
+ // Def = Load (%rax + <offset>)
+ // ...
+ //
+ //
+ // we want to end up with
+ //
+ // Def = FaultingLoad (%rax + <offset>), LblNull
+ // jmp LblNotNull ;; explicit or fallthrough
+ //
+ // LblNotNull:
+ // Inst0
+ // Inst1
+ // ...
+ //
+ // LblNull:
+ // callq throw_NullPointerException
+ //
+ //
+ // To see why this is legal, consider the two possibilities:
+ //
+ // 1. %rax is null: since we constrain <offset> to be less than PageSize, the
+ // load instruction dereferences the null page, causing a segmentation
+ // fault.
+ //
+ // 2. %rax is not null: in this case we know that the load cannot fault, as
+ // otherwise the load would've faulted in the original program too and the
+ // original program would've been undefined.
+ //
+ // This reasoning cannot be extended to justify hoisting through arbitrary
+ // control flow. For instance, in the example below (in pseudo-C)
+ //
+ // if (ptr == null) { throw_npe(); unreachable; }
+ // if (some_cond) { return 42; }
+ // v = ptr->field; // LD
+ // ...
+ //
+ // we cannot (without code duplication) use the load marked "LD" to null check
+ // ptr -- clause (2) above does not apply in this case. In the above program
+ // the safety of ptr->field can be dependent on some_cond; and, for instance,
+ // ptr could be some non-null invalid reference that never gets loaded from
+ // because some_cond is always true.
+
+ SmallVector<MachineInstr *, 8> InstsSeenSoFar;
+
+ for (auto &MI : *NotNullSucc) {
+ if (!canHandle(&MI) || InstsSeenSoFar.size() >= MaxInstsToConsider)
+ return false;
+
+ MachineInstr *Dependence;
+ SuitabilityResult SR = isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar);
+ if (SR == SR_Impossible)
+ return false;
+ if (SR == SR_Suitable &&
+ canHoistInst(&MI, InstsSeenSoFar, NullSucc, Dependence)) {
+ NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,
+ NullSucc, Dependence);
+ return true;
+ }
+
+ // If MI re-defines the PointerReg in a way that changes the value of
+ // PointerReg if it was null, then we cannot move further.
+ if (!TII->preservesZeroValueInReg(&MI, PointerReg, TRI))
+ return false;
+ InstsSeenSoFar.push_back(&MI);
+ }
+
+ return false;
+}
+
+/// Wrap a machine instruction, MI, into a FAULTING machine instruction.
+/// The FAULTING instruction does the same load/store as MI
+/// (defining the same register), and branches to HandlerMBB if the mem access
+/// faults. The FAULTING instruction is inserted at the end of MBB.
+MachineInstr *ImplicitNullChecks::insertFaultingInstr(
+ MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *HandlerMBB) {
+ const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for
+ // all targets.
+
+ DebugLoc DL;
+ unsigned NumDefs = MI->getDesc().getNumDefs();
+ assert(NumDefs <= 1 && "other cases unhandled!");
+
+ unsigned DefReg = NoRegister;
+ if (NumDefs != 0) {
+ DefReg = MI->getOperand(0).getReg();
+ assert(NumDefs == 1 && "expected exactly one def!");
+ }
+
+ FaultMaps::FaultKind FK;
+ if (MI->mayLoad())
+ FK =
+ MI->mayStore() ? FaultMaps::FaultingLoadStore : FaultMaps::FaultingLoad;
+ else
+ FK = FaultMaps::FaultingStore;
+
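+ // The FAULTING_OP pseudo carries the fault kind, the handler block and the
+ // original opcode as immediates; the original instruction's remaining
+ // operands (everything except its explicit def) are appended below with
+ // their kill and dead flags cleared.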
+ auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_OP), DefReg)
+ .addImm(FK)
+ .addMBB(HandlerMBB)
+ .addImm(MI->getOpcode());
+
+ for (auto &MO : MI->uses()) {
+ if (MO.isReg()) {
+ MachineOperand NewMO = MO;
+ if (MO.isUse()) {
+ NewMO.setIsKill(false);
+ } else {
+ assert(MO.isDef() && "Expected def or use");
+ NewMO.setIsDead(false);
+ }
+ MIB.add(NewMO);
+ } else {
+ MIB.add(MO);
+ }
+ }
+
+ MIB.setMemRefs(MI->memoperands());
+
+ return MIB;
+}
+
+/// Rewrite the null checks in NullCheckList into implicit null checks.
+void ImplicitNullChecks::rewriteNullChecks(
+ ArrayRef<ImplicitNullChecks::NullCheck> NullCheckList) {
+ DebugLoc DL;
+
+ for (const auto &NC : NullCheckList) {
+ // Remove the conditional branch dependent on the null check.
+ unsigned BranchesRemoved = TII->removeBranch(*NC.getCheckBlock());
+ (void)BranchesRemoved;
+ assert(BranchesRemoved > 0 && "expected at least one branch!");
+
+ if (auto *DepMI = NC.getOnlyDependency()) {
+ DepMI->removeFromParent();
+ NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI);
+ }
+
+ // Insert a faulting instruction where the conditional branch was
+ // originally. The check done earlier ensures that this bit of code motion
+ // is legal. We do not touch the successor list of any basic block
+ // since we haven't changed control flow; we've just made it implicit.
+ MachineInstr *FaultingInstr = insertFaultingInstr(
+ NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc());
+ // The values defined by MemOperation, if any, are now defined in the check
+ // block and are therefore live-in to MemOperation's original block. The
+ // original operation may define implicit-defs alongside the main value.
+ MachineBasicBlock *MBB = NC.getMemOperation()->getParent();
+ for (const MachineOperand &MO : FaultingInstr->all_defs()) {
+ Register Reg = MO.getReg();
+ if (!Reg || MBB->isLiveIn(Reg))
+ continue;
+ MBB->addLiveIn(Reg);
+ }
+
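+ // If a dependency was hoisted into the check block, its defs are now live
+ // across the edge into the not-null successor, so record them as live-ins
+ // there.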
+ if (auto *DepMI = NC.getOnlyDependency()) {
+ for (auto &MO : DepMI->all_defs()) {
+ if (!MO.getReg() || MO.isDead())
+ continue;
+ if (!NC.getNotNullSucc()->isLiveIn(MO.getReg()))
+ NC.getNotNullSucc()->addLiveIn(MO.getReg());
+ }
+ }
+
+ NC.getMemOperation()->eraseFromParent();
+ if (auto *CheckOp = NC.getCheckOperation())
+ CheckOp->eraseFromParent();
+
+ // Insert an *unconditional* branch to the not-null successor; we expect
+ // block placement to remove fallthroughs later.
+ TII->insertBranch(*NC.getCheckBlock(), NC.getNotNullSucc(), nullptr,
+ /*Cond=*/std::nullopt, DL);
+
+ NumImplicitNullChecks++;
+ }
+}
+
+char ImplicitNullChecks::ID = 0;
+
+char &llvm::ImplicitNullChecksID = ImplicitNullChecks::ID;
+
+INITIALIZE_PASS_BEGIN(ImplicitNullChecks, DEBUG_TYPE,
+ "Implicit null checks", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(ImplicitNullChecks, DEBUG_TYPE,
+ "Implicit null checks", false, false)
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
new file mode 100644
index 000000000000..012892166ae7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/IndirectBrExpandPass.cpp
@@ -0,0 +1,270 @@
+//===- IndirectBrExpandPass.cpp - Expand indirectbr to switch -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Implements an expansion pass to turn `indirectbr` instructions in the IR
+/// into `switch` instructions. This works by enumerating the basic blocks in
+/// a dense range of integers, replacing each `blockaddr` constant with the
+/// corresponding integer constant, and then building a switch that maps from
+/// the integers to the actual blocks. All of the indirectbr instructions in the
+/// function are redirected to this common switch.
+///
+/// While this is generically useful if a target is unable to codegen
+/// `indirectbr` natively, it is primarily useful when there is some desire to
+/// get the builtin non-jump-table lowering of a switch even when the input
+/// source contained an explicit indirect branch construct.
+///
+/// Note that it doesn't make any sense to enable this pass unless a target also
+/// disables jump-table lowering of switches. Doing that is likely to pessimize
+/// the code.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <optional>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "indirectbr-expand"
+
+namespace {
+
+class IndirectBrExpandPass : public FunctionPass {
+ const TargetLowering *TLI = nullptr;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ IndirectBrExpandPass() : FunctionPass(ID) {
+ initializeIndirectBrExpandPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char IndirectBrExpandPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IndirectBrExpandPass, DEBUG_TYPE,
+ "Expand indirectbr instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(IndirectBrExpandPass, DEBUG_TYPE,
+ "Expand indirectbr instructions", false, false)
+
+FunctionPass *llvm::createIndirectBrExpandPass() {
+ return new IndirectBrExpandPass();
+}
+
+bool IndirectBrExpandPass::runOnFunction(Function &F) {
+ auto &DL = F.getParent()->getDataLayout();
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<TargetMachine>();
+ auto &STI = *TM.getSubtargetImpl(F);
+ if (!STI.enableIndirectBrExpand())
+ return false;
+ TLI = STI.getTargetLowering();
+
+ std::optional<DomTreeUpdater> DTU;
+ if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DTU.emplace(DTWP->getDomTree(), DomTreeUpdater::UpdateStrategy::Lazy);
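+ // CFG edge insertions and deletions are queued through the lazy updater and
+ // applied in batches via applyUpdates once the new edges are known.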
+
+ SmallVector<IndirectBrInst *, 1> IndirectBrs;
+
+ // Set of all potential successors for indirectbr instructions.
+ SmallPtrSet<BasicBlock *, 4> IndirectBrSuccs;
+
+ // Build a list of indirectbrs that we want to rewrite.
+ for (BasicBlock &BB : F)
+ if (auto *IBr = dyn_cast<IndirectBrInst>(BB.getTerminator())) {
+ // Handle the degenerate case of an indirectbr with no successors by
+ // replacing it with unreachable.
+ if (IBr->getNumSuccessors() == 0) {
+ (void)new UnreachableInst(F.getContext(), IBr);
+ IBr->eraseFromParent();
+ continue;
+ }
+
+ IndirectBrs.push_back(IBr);
+ for (BasicBlock *SuccBB : IBr->successors())
+ IndirectBrSuccs.insert(SuccBB);
+ }
+
+ if (IndirectBrs.empty())
+ return false;
+
+ // If we need to replace any indirectbrs we need to establish integer
+ // constants that will correspond to each of the basic blocks in the function
+ // whose address escapes. We do that here and rewrite all the blockaddress
+ // constants to just be those integer constants cast to a pointer type.
+ SmallVector<BasicBlock *, 4> BBs;
+
+ for (BasicBlock &BB : F) {
+ // Skip blocks that aren't successors to an indirectbr we're going to
+ // rewrite.
+ if (!IndirectBrSuccs.count(&BB))
+ continue;
+
+ auto IsBlockAddressUse = [&](const Use &U) {
+ return isa<BlockAddress>(U.getUser());
+ };
+ auto BlockAddressUseIt = llvm::find_if(BB.uses(), IsBlockAddressUse);
+ if (BlockAddressUseIt == BB.use_end())
+ continue;
+
+ assert(std::find_if(std::next(BlockAddressUseIt), BB.use_end(),
+ IsBlockAddressUse) == BB.use_end() &&
+ "There should only ever be a single blockaddress use because it is "
+ "a constant and should be uniqued.");
+
+ auto *BA = cast<BlockAddress>(BlockAddressUseIt->getUser());
+
+ // Skip if the constant was formed but ended up not being used (due to DCE
+ // or whatever).
+ if (!BA->isConstantUsed())
+ continue;
+
+ // Compute the index we want to use for this basic block. We can't use zero
+ // because null can be compared with block addresses.
+ int BBIndex = BBs.size() + 1;
+ BBs.push_back(&BB);
+
+ auto *ITy = cast<IntegerType>(DL.getIntPtrType(BA->getType()));
+ ConstantInt *BBIndexC = ConstantInt::get(ITy, BBIndex);
+
+ // Now rewrite the blockaddress to an integer constant based on the index.
+ // FIXME: This part doesn't properly recognize other uses of blockaddress
+ // expressions, for instance, where they are used to pass labels to
+ // asm-goto. This part of the pass needs a rework.
+ BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(BBIndexC, BA->getType()));
+ }
+
+ if (BBs.empty()) {
+ // There are no blocks whose address is taken, so no indirectbr instruction
+ // can get a valid input and we can replace them all with unreachable.
+ SmallVector<DominatorTree::UpdateType, 8> Updates;
+ if (DTU)
+ Updates.reserve(IndirectBrSuccs.size());
+ for (auto *IBr : IndirectBrs) {
+ if (DTU) {
+ for (BasicBlock *SuccBB : IBr->successors())
+ Updates.push_back({DominatorTree::Delete, IBr->getParent(), SuccBB});
+ }
+ (void)new UnreachableInst(F.getContext(), IBr);
+ IBr->eraseFromParent();
+ }
+ if (DTU) {
+ assert(Updates.size() == IndirectBrSuccs.size() &&
+ "Got unexpected update count.");
+ DTU->applyUpdates(Updates);
+ }
+ return true;
+ }
+
+ BasicBlock *SwitchBB;
+ Value *SwitchValue;
+
+ // Compute a common integer type across all the indirectbr instructions.
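+ // The widest such type is used so that none of the pointer casts created by
+ // GetSwitchValue below loses bits.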
+ IntegerType *CommonITy = nullptr;
+ for (auto *IBr : IndirectBrs) {
+ auto *ITy =
+ cast<IntegerType>(DL.getIntPtrType(IBr->getAddress()->getType()));
+ if (!CommonITy || ITy->getBitWidth() > CommonITy->getBitWidth())
+ CommonITy = ITy;
+ }
+
+ auto GetSwitchValue = [CommonITy](IndirectBrInst *IBr) {
+ return CastInst::CreatePointerCast(
+ IBr->getAddress(), CommonITy,
+ Twine(IBr->getAddress()->getName()) + ".switch_cast", IBr);
+ };
+
+ SmallVector<DominatorTree::UpdateType, 8> Updates;
+
+ if (IndirectBrs.size() == 1) {
+ // If we only have one indirectbr, we can just directly replace it within
+ // its block.
+ IndirectBrInst *IBr = IndirectBrs[0];
+ SwitchBB = IBr->getParent();
+ SwitchValue = GetSwitchValue(IBr);
+ if (DTU) {
+ Updates.reserve(IndirectBrSuccs.size());
+ for (BasicBlock *SuccBB : IBr->successors())
+ Updates.push_back({DominatorTree::Delete, IBr->getParent(), SuccBB});
+ assert(Updates.size() == IndirectBrSuccs.size() &&
+ "Got unexpected update count.");
+ }
+ IBr->eraseFromParent();
+ } else {
+ // Otherwise we need to create a new block to hold the switch across BBs,
+ // jump to that block instead of each indirectbr, and phi together the
+ // values for the switch.
+ SwitchBB = BasicBlock::Create(F.getContext(), "switch_bb", &F);
+ auto *SwitchPN = PHINode::Create(CommonITy, IndirectBrs.size(),
+ "switch_value_phi", SwitchBB);
+ SwitchValue = SwitchPN;
+
+ // Now replace the indirectbr instructions with direct branches to the
+ // switch block and fill out the PHI operands.
+ if (DTU)
+ Updates.reserve(IndirectBrs.size() + 2 * IndirectBrSuccs.size());
+ for (auto *IBr : IndirectBrs) {
+ SwitchPN->addIncoming(GetSwitchValue(IBr), IBr->getParent());
+ BranchInst::Create(SwitchBB, IBr);
+ if (DTU) {
+ Updates.push_back({DominatorTree::Insert, IBr->getParent(), SwitchBB});
+ for (BasicBlock *SuccBB : IBr->successors())
+ Updates.push_back({DominatorTree::Delete, IBr->getParent(), SuccBB});
+ }
+ IBr->eraseFromParent();
+ }
+ }
+
+ // Now build the switch in the block. The block does not have a terminator
+ // yet.
+ auto *SI = SwitchInst::Create(SwitchValue, BBs[0], BBs.size(), SwitchBB);
+
+ // Add a case for each block.
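+ // BBs[0] is the switch's default destination and needs no explicit case;
+ // every other block BBs[i] was assigned the index i + 1 when its
+ // blockaddress was rewritten above.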
+ for (int i : llvm::seq<int>(1, BBs.size()))
+ SI->addCase(ConstantInt::get(CommonITy, i + 1), BBs[i]);
+
+ if (DTU) {
+ // If there were multiple indirectbr's, they may have common successors,
+ // but in the dominator tree, we only track unique edges.
+ SmallPtrSet<BasicBlock *, 8> UniqueSuccessors;
+ Updates.reserve(Updates.size() + BBs.size());
+ for (BasicBlock *BB : BBs) {
+ if (UniqueSuccessors.insert(BB).second)
+ Updates.push_back({DominatorTree::Insert, SwitchBB, BB});
+ }
+ DTU->applyUpdates(Updates);
+ }
+
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp
new file mode 100644
index 000000000000..277c6be418c5
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -0,0 +1,1691 @@
+//===- InlineSpiller.cpp - Insert spills and restores inline --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The inline spiller modifies the machine function directly instead of
+// inserting spills and restores in VirtRegMap.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SplitKit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumSpilledRanges, "Number of spilled live ranges");
+STATISTIC(NumSnippets, "Number of spilled snippets");
+STATISTIC(NumSpills, "Number of spills inserted");
+STATISTIC(NumSpillsRemoved, "Number of spills removed");
+STATISTIC(NumReloads, "Number of reloads inserted");
+STATISTIC(NumReloadsRemoved, "Number of reloads removed");
+STATISTIC(NumFolded, "Number of folded stack accesses");
+STATISTIC(NumFoldedLoads, "Number of folded loads");
+STATISTIC(NumRemats, "Number of rematerialized defs for spilling");
+
+static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
+ cl::desc("Disable inline spill hoisting"));
+static cl::opt<bool>
+RestrictStatepointRemat("restrict-statepoint-remat",
+ cl::init(false), cl::Hidden,
+ cl::desc("Restrict remat for statepoint operands"));
+
+namespace {
+
+class HoistSpillHelper : private LiveRangeEdit::Delegate {
+ MachineFunction &MF;
+ LiveIntervals &LIS;
+ LiveStacks &LSS;
+ MachineDominatorTree &MDT;
+ MachineLoopInfo &Loops;
+ VirtRegMap &VRM;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const MachineBlockFrequencyInfo &MBFI;
+
+ InsertPointAnalysis IPA;
+
+ // Map from StackSlot to the LiveInterval of the original register.
+ // Note the LiveInterval of the original register may have been deleted
+ // after it is spilled. We keep a copy here to track the range where
+ // spills can be moved.
+ DenseMap<int, std::unique_ptr<LiveInterval>> StackSlotToOrigLI;
+
+ // Map from pair of (StackSlot and Original VNI) to a set of spills which
+ // have the same stackslot and have equal values defined by Original VNI.
+ // These spills are mergeable and are hoist candidates.
+ using MergeableSpillsMap =
+ MapVector<std::pair<int, VNInfo *>, SmallPtrSet<MachineInstr *, 16>>;
+ MergeableSpillsMap MergeableSpills;
+
+ /// This is the map from original register to a set containing all its
+ /// siblings. To hoist a spill to another BB, we need to find a live
+ /// sibling there and use it as the source of the new spill.
+ DenseMap<Register, SmallSetVector<Register, 16>> Virt2SiblingsMap;
+
+ bool isSpillCandBB(LiveInterval &OrigLI, VNInfo &OrigVNI,
+ MachineBasicBlock &BB, Register &LiveReg);
+
+ void rmRedundantSpills(
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+
+ void getVisitOrders(
+ MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineDomTreeNode *> &Orders,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKeep,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+
+ void runHoistSpills(LiveInterval &OrigLI, VNInfo &OrigVNI,
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns);
+
+public:
+ HoistSpillHelper(MachineFunctionPass &pass, MachineFunction &mf,
+ VirtRegMap &vrm)
+ : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
+ LSS(pass.getAnalysis<LiveStacks>()),
+ MDT(pass.getAnalysis<MachineDominatorTree>()),
+ Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+ MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
+ TRI(*mf.getSubtarget().getRegisterInfo()),
+ MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
+ IPA(LIS, mf.getNumBlockIDs()) {}
+
+ void addToMergeableSpills(MachineInstr &Spill, int StackSlot,
+ unsigned Original);
+ bool rmFromMergeableSpills(MachineInstr &Spill, int StackSlot);
+ void hoistAllSpills();
+ void LRE_DidCloneVirtReg(Register, Register) override;
+};
+
+class InlineSpiller : public Spiller {
+ MachineFunction &MF;
+ LiveIntervals &LIS;
+ LiveStacks &LSS;
+ MachineDominatorTree &MDT;
+ MachineLoopInfo &Loops;
+ VirtRegMap &VRM;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const MachineBlockFrequencyInfo &MBFI;
+
+ // Variables that are valid during spill(), but used by multiple methods.
+ LiveRangeEdit *Edit = nullptr;
+ LiveInterval *StackInt = nullptr;
+ int StackSlot;
+ Register Original;
+
+ // All registers to spill to StackSlot, including the main register.
+ SmallVector<Register, 8> RegsToSpill;
+
+ // All COPY instructions to/from snippets.
+ // They are ignored since both operands refer to the same stack slot.
+ // For bundled copies, this will only include the first header copy.
+ SmallPtrSet<MachineInstr*, 8> SnippetCopies;
+
+ // Values that failed to remat at some point.
+ SmallPtrSet<VNInfo*, 8> UsedValues;
+
+ // Dead defs generated during spilling.
+ SmallVector<MachineInstr*, 8> DeadDefs;
+
+ // Object that records spill information and performs the hoisting.
+ HoistSpillHelper HSpiller;
+
+ // Live range weight calculator.
+ VirtRegAuxInfo &VRAI;
+
+ ~InlineSpiller() override = default;
+
+public:
+ InlineSpiller(MachineFunctionPass &Pass, MachineFunction &MF, VirtRegMap &VRM,
+ VirtRegAuxInfo &VRAI)
+ : MF(MF), LIS(Pass.getAnalysis<LiveIntervals>()),
+ LSS(Pass.getAnalysis<LiveStacks>()),
+ MDT(Pass.getAnalysis<MachineDominatorTree>()),
+ Loops(Pass.getAnalysis<MachineLoopInfo>()), VRM(VRM),
+ MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
+ TRI(*MF.getSubtarget().getRegisterInfo()),
+ MBFI(Pass.getAnalysis<MachineBlockFrequencyInfo>()),
+ HSpiller(Pass, MF, VRM), VRAI(VRAI) {}
+
+ void spill(LiveRangeEdit &) override;
+ void postOptimization() override;
+
+private:
+ bool isSnippet(const LiveInterval &SnipLI);
+ void collectRegsToSpill();
+
+ bool isRegToSpill(Register Reg) { return is_contained(RegsToSpill, Reg); }
+
+ bool isSibling(Register Reg);
+ bool hoistSpillInsideBB(LiveInterval &SpillLI, MachineInstr &CopyMI);
+ void eliminateRedundantSpills(LiveInterval &LI, VNInfo *VNI);
+
+ void markValueUsed(LiveInterval*, VNInfo*);
+ bool canGuaranteeAssignmentAfterRemat(Register VReg, MachineInstr &MI);
+ bool reMaterializeFor(LiveInterval &, MachineInstr &MI);
+ void reMaterializeAll();
+
+ bool coalesceStackAccess(MachineInstr *MI, Register Reg);
+ bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>>,
+ MachineInstr *LoadMI = nullptr);
+ void insertReload(Register VReg, SlotIndex, MachineBasicBlock::iterator MI);
+ void insertSpill(Register VReg, bool isKill, MachineBasicBlock::iterator MI);
+
+ void spillAroundUses(Register Reg);
+ void spillAll();
+};
+
+} // end anonymous namespace
+
+Spiller::~Spiller() = default;
+
+void Spiller::anchor() {}
+
+Spiller *llvm::createInlineSpiller(MachineFunctionPass &Pass,
+ MachineFunction &MF, VirtRegMap &VRM,
+ VirtRegAuxInfo &VRAI) {
+ return new InlineSpiller(Pass, MF, VRM, VRAI);
+}
+
+//===----------------------------------------------------------------------===//
+// Snippets
+//===----------------------------------------------------------------------===//
+
+// When spilling a virtual register, we also spill any snippets it is connected
+// to. The snippets are small live ranges that only have a single real use,
+// leftovers from live range splitting. Spilling them enables memory operand
+// folding or tightens the live range around the single use.
+//
+// This minimizes register pressure and maximizes the store-to-load distance for
+// spill slots which can be important in tight loops.
+
+/// If MI is a COPY to or from Reg, return the other register, otherwise return
+/// 0.
+static Register isCopyOf(const MachineInstr &MI, Register Reg) {
+ assert(!MI.isBundled());
+ if (!MI.isCopy())
+ return Register();
+
+ const MachineOperand &DstOp = MI.getOperand(0);
+ const MachineOperand &SrcOp = MI.getOperand(1);
+
+ // TODO: Probably only worth allowing subreg copies with undef dests.
+ if (DstOp.getSubReg() != SrcOp.getSubReg())
+ return Register();
+ if (DstOp.getReg() == Reg)
+ return SrcOp.getReg();
+ if (SrcOp.getReg() == Reg)
+ return DstOp.getReg();
+ return Register();
+}
+
+/// Check for a copy bundle as formed by SplitKit.
+static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg) {
+ if (!FirstMI.isBundled())
+ return isCopyOf(FirstMI, Reg);
+
+ assert(!FirstMI.isBundledWithPred() && FirstMI.isBundledWithSucc() &&
+ "expected to see first instruction in bundle");
+
+ Register SnipReg;
+ MachineBasicBlock::const_instr_iterator I = FirstMI.getIterator();
+ while (I->isBundledWithSucc()) {
+ const MachineInstr &MI = *I;
+ if (!MI.isCopy())
+ return Register();
+
+ const MachineOperand &DstOp = MI.getOperand(0);
+ const MachineOperand &SrcOp = MI.getOperand(1);
+ if (DstOp.getReg() == Reg) {
+ if (!SnipReg)
+ SnipReg = SrcOp.getReg();
+ else if (SnipReg != SrcOp.getReg())
+ return Register();
+ } else if (SrcOp.getReg() == Reg) {
+ if (!SnipReg)
+ SnipReg = DstOp.getReg();
+ else if (SnipReg != DstOp.getReg())
+ return Register();
+ }
+
+ ++I;
+ }
+
+ return Register();
+}
+
+static void getVDefInterval(const MachineInstr &MI, LiveIntervals &LIS) {
+ for (const MachineOperand &MO : MI.all_defs())
+ if (MO.getReg().isVirtual())
+ LIS.getInterval(MO.getReg());
+}
+
+/// isSnippet - Identify if a live interval is a snippet that should be spilled.
+/// It is assumed that SnipLI is a virtual register with the same original as
+/// Edit->getReg().
+bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
+ Register Reg = Edit->getReg();
+
+ // A snippet is a tiny live range with only a single instruction using it
+ // besides copies to/from Reg or spills/fills.
+ // An exception is made for statepoint instructions, which will fold fills
+ // into their operands.
+ // We accept:
+ //
+ // %snip = COPY %Reg / FILL fi#
+ // %snip = USE %snip
+ // %snip = STATEPOINT %snip in var arg area
+ // %Reg = COPY %snip / SPILL %snip, fi#
+ //
+ if (!LIS.intervalIsInOneMBB(SnipLI))
+ return false;
+
+ // The number of defs should not exceed 2, not counting defs coming from
+ // statepoint instructions.
+ unsigned NumValNums = SnipLI.getNumValNums();
+ for (auto *VNI : SnipLI.vnis()) {
+ MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+ if (MI->getOpcode() == TargetOpcode::STATEPOINT)
+ --NumValNums;
+ }
+ if (NumValNums > 2)
+ return false;
+
+ MachineInstr *UseMI = nullptr;
+
+ // Check that all uses satisfy our criteria.
+ for (MachineRegisterInfo::reg_bundle_nodbg_iterator
+ RI = MRI.reg_bundle_nodbg_begin(SnipLI.reg()),
+ E = MRI.reg_bundle_nodbg_end();
+ RI != E;) {
+ MachineInstr &MI = *RI++;
+
+ // Allow copies to/from Reg.
+ if (isCopyOfBundle(MI, Reg))
+ continue;
+
+ // Allow stack slot loads.
+ int FI;
+ if (SnipLI.reg() == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot)
+ continue;
+
+ // Allow stack slot stores.
+ if (SnipLI.reg() == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot)
+ continue;
+
+ if (StatepointOpers::isFoldableReg(&MI, SnipLI.reg()))
+ continue;
+
+ // Allow a single additional instruction.
+ if (UseMI && &MI != UseMI)
+ return false;
+ UseMI = &MI;
+ }
+ return true;
+}
+
+/// collectRegsToSpill - Collect live range snippets that only have a single
+/// real use.
+void InlineSpiller::collectRegsToSpill() {
+ Register Reg = Edit->getReg();
+
+ // Main register always spills.
+ RegsToSpill.assign(1, Reg);
+ SnippetCopies.clear();
+
+ // Snippets all have the same original, so there can't be any for an original
+ // register.
+ if (Original == Reg)
+ return;
+
+ for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) {
+ Register SnipReg = isCopyOfBundle(MI, Reg);
+ if (!isSibling(SnipReg))
+ continue;
+ LiveInterval &SnipLI = LIS.getInterval(SnipReg);
+ if (!isSnippet(SnipLI))
+ continue;
+ SnippetCopies.insert(&MI);
+ if (isRegToSpill(SnipReg))
+ continue;
+ RegsToSpill.push_back(SnipReg);
+ LLVM_DEBUG(dbgs() << "\talso spill snippet " << SnipLI << '\n');
+ ++NumSnippets;
+ }
+}
+
+bool InlineSpiller::isSibling(Register Reg) {
+ return Reg.isVirtual() && VRM.getOriginal(Reg) == Original;
+}
+
+/// It is beneficial to spill to an earlier place in the same BB in cases such
+/// as the following:
+/// There is an alternative def earlier in the same MBB.
+/// Hoist the spill as far as possible in SpillMBB. This can ease
+/// register pressure:
+///
+/// x = def
+/// y = use x
+/// s = copy x
+///
+/// Hoisting the spill of s to immediately after the def removes the
+/// interference between x and y:
+///
+/// x = def
+/// spill x
+/// y = use killed x
+///
+/// This hoist only helps when the copy kills its source.
+///
+bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
+ MachineInstr &CopyMI) {
+ SlotIndex Idx = LIS.getInstructionIndex(CopyMI);
+#ifndef NDEBUG
+ VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getRegSlot());
+ assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy");
+#endif
+
+ Register SrcReg = CopyMI.getOperand(1).getReg();
+ LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+ VNInfo *SrcVNI = SrcLI.getVNInfoAt(Idx);
+ LiveQueryResult SrcQ = SrcLI.Query(Idx);
+ MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(SrcVNI->def);
+ if (DefMBB != CopyMI.getParent() || !SrcQ.isKill())
+ return false;
+
+ // Conservatively extend the stack slot range to the range of the original
+ // value. We may be able to do better with stack slot coloring by being more
+ // careful here.
+ assert(StackInt && "No stack slot assigned yet.");
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+ StackInt->MergeValueInAsValue(OrigLI, OrigVNI, StackInt->getValNumInfo(0));
+ LLVM_DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": "
+ << *StackInt << '\n');
+
+ // We are going to spill SrcVNI immediately after its def, so clear out
+ // any later spills of the same value.
+ eliminateRedundantSpills(SrcLI, SrcVNI);
+
+ MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def);
+ MachineBasicBlock::iterator MII;
+ if (SrcVNI->isPHIDef())
+ MII = MBB->SkipPHIsLabelsAndDebug(MBB->begin());
+ else {
+ MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def);
+ assert(DefMI && "Defining instruction disappeared");
+ MII = DefMI;
+ ++MII;
+ }
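+ // MIS brackets the instructions inserted by storeRegToStackSlot so they can
+ // be registered with LiveIntervals and scanned for new virtual register defs
+ // below.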
+ MachineInstrSpan MIS(MII, MBB);
+ // Insert spill without kill flag immediately after def.
+ TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot,
+ MRI.getRegClass(SrcReg), &TRI, Register());
+ LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII);
+ for (const MachineInstr &MI : make_range(MIS.begin(), MII))
+ getVDefInterval(MI, LIS);
+ --MII; // Point to store instruction.
+ LLVM_DEBUG(dbgs() << "\thoisted: " << SrcVNI->def << '\t' << *MII);
+
+ // If only one store instruction is required for the spill, add it to the
+ // mergeable list. In X86 AMX, two instructions are required to store, so
+ // we disable the merge for this case.
+ if (MIS.begin() == MII)
+ HSpiller.addToMergeableSpills(*MII, StackSlot, Original);
+ ++NumSpills;
+ return true;
+}
+
+/// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any
+/// redundant spills of this value in SLI.reg and sibling copies.
+void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
+ assert(VNI && "Missing value");
+ SmallVector<std::pair<LiveInterval*, VNInfo*>, 8> WorkList;
+ WorkList.push_back(std::make_pair(&SLI, VNI));
+ assert(StackInt && "No stack slot assigned yet.");
+
+ do {
+ LiveInterval *LI;
+ std::tie(LI, VNI) = WorkList.pop_back_val();
+ Register Reg = LI->reg();
+ LLVM_DEBUG(dbgs() << "Checking redundant spills for " << VNI->id << '@'
+ << VNI->def << " in " << *LI << '\n');
+
+ // Regs to spill are taken care of.
+ if (isRegToSpill(Reg))
+ continue;
+
+ // Add all of VNI's live range to StackInt.
+ StackInt->MergeValueInAsValue(*LI, VNI, StackInt->getValNumInfo(0));
+ LLVM_DEBUG(dbgs() << "Merged to stack int: " << *StackInt << '\n');
+
+ // Find all spills and copies of VNI.
+ for (MachineInstr &MI :
+ llvm::make_early_inc_range(MRI.use_nodbg_bundles(Reg))) {
+ if (!MI.isCopy() && !MI.mayStore())
+ continue;
+ SlotIndex Idx = LIS.getInstructionIndex(MI);
+ if (LI->getVNInfoAt(Idx) != VNI)
+ continue;
+
+ // Follow sibling copies down the dominator tree.
+ if (Register DstReg = isCopyOfBundle(MI, Reg)) {
+ if (isSibling(DstReg)) {
+ LiveInterval &DstLI = LIS.getInterval(DstReg);
+ VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
+ assert(DstVNI && "Missing defined value");
+ assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot");
+
+ WorkList.push_back(std::make_pair(&DstLI, DstVNI));
+ }
+ continue;
+ }
+
+ // Erase spills.
+ int FI;
+ if (Reg == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot) {
+ LLVM_DEBUG(dbgs() << "Redundant spill " << Idx << '\t' << MI);
+ // eliminateDeadDefs won't normally remove stores, so switch opcode.
+ MI.setDesc(TII.get(TargetOpcode::KILL));
+ DeadDefs.push_back(&MI);
+ ++NumSpillsRemoved;
+ if (HSpiller.rmFromMergeableSpills(MI, StackSlot))
+ --NumSpills;
+ }
+ }
+ } while (!WorkList.empty());
+}
+
+//===----------------------------------------------------------------------===//
+// Rematerialization
+//===----------------------------------------------------------------------===//
+
+/// markValueUsed - Remember that VNI failed to rematerialize, so its defining
+/// instruction cannot be eliminated. Sees through snippet copies.
+void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) {
+ SmallVector<std::pair<LiveInterval*, VNInfo*>, 8> WorkList;
+ WorkList.push_back(std::make_pair(LI, VNI));
+ do {
+ std::tie(LI, VNI) = WorkList.pop_back_val();
+ if (!UsedValues.insert(VNI).second)
+ continue;
+
+ if (VNI->isPHIDef()) {
+ MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def);
+ for (MachineBasicBlock *P : MBB->predecessors()) {
+ VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(P));
+ if (PVNI)
+ WorkList.push_back(std::make_pair(LI, PVNI));
+ }
+ continue;
+ }
+
+ // Follow snippet copies.
+ MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+ if (!SnippetCopies.count(MI))
+ continue;
+ LiveInterval &SnipLI = LIS.getInterval(MI->getOperand(1).getReg());
+ assert(isRegToSpill(SnipLI.reg()) && "Unexpected register in copy");
+ VNInfo *SnipVNI = SnipLI.getVNInfoAt(VNI->def.getRegSlot(true));
+ assert(SnipVNI && "Snippet undefined before copy");
+ WorkList.push_back(std::make_pair(&SnipLI, SnipVNI));
+ } while (!WorkList.empty());
+}
+
+bool InlineSpiller::canGuaranteeAssignmentAfterRemat(Register VReg,
+ MachineInstr &MI) {
+ if (!RestrictStatepointRemat)
+ return true;
+ // Here's a quick explanation of the problem we're trying to handle here:
+ // * There are some pseudo instructions with more vreg uses than there are
+ // physical registers on the machine.
+ // * This is normally handled by spilling the vreg, and folding the reload
+ // into the user instruction. (Thus decreasing the number of used vregs
+ // until the remainder can be assigned to physregs.)
+ // * However, since we may try to spill vregs in any order, we can end up
+ // trying to spill each operand to the instruction, and then rematting it
+ // instead. When that happens, the new live intervals (for the remats) are
+ // expected to be trivially assignable (i.e. RS_Done). However, since we
+ // may have more remats than physregs, we're guaranteed to fail to assign
+ // one.
+ // At the moment, we only handle this for STATEPOINTs since they're the only
+ // pseudo op where we've seen this. If we start seeing other instructions
+ // with the same problem, we need to revisit this.
+ if (MI.getOpcode() != TargetOpcode::STATEPOINT)
+ return true;
+  // For STATEPOINTs we allow re-materialization for fixed arguments only,
+  // hoping that the number of physical registers is enough to cover all fixed
+  // arguments. If that is not the case, we need to revisit this.
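+  // Scan the variable (non-fixed) operands of the statepoint; if VReg is used
+  // there, rematerialization is not allowed.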
+ for (unsigned Idx = StatepointOpers(&MI).getVarIdx(),
+ EndIdx = MI.getNumOperands();
+ Idx < EndIdx; ++Idx) {
+ MachineOperand &MO = MI.getOperand(Idx);
+ if (MO.isReg() && MO.getReg() == VReg)
+ return false;
+ }
+ return true;
+}
+
+/// reMaterializeFor - Attempt to rematerialize before MI instead of reloading.
+bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
+ // Analyze instruction
+ SmallVector<std::pair<MachineInstr *, unsigned>, 8> Ops;
+ VirtRegInfo RI = AnalyzeVirtRegInBundle(MI, VirtReg.reg(), &Ops);
+
+ if (!RI.Reads)
+ return false;
+
+ SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true);
+ VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx.getBaseIndex());
+
+ if (!ParentVNI) {
+ LLVM_DEBUG(dbgs() << "\tadding <undef> flags: ");
+ for (MachineOperand &MO : MI.all_uses())
+ if (MO.getReg() == VirtReg.reg())
+ MO.setIsUndef();
+ LLVM_DEBUG(dbgs() << UseIdx << '\t' << MI);
+ return true;
+ }
+
+ if (SnippetCopies.count(&MI))
+ return false;
+
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
+ LiveRangeEdit::Remat RM(ParentVNI);
+ RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
+ if (!Edit->canRematerializeAt(RM, OrigVNI, UseIdx, false)) {
+ markValueUsed(&VirtReg, ParentVNI);
+ LLVM_DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI);
+ return false;
+ }
+
+ // If the instruction also writes VirtReg.reg, it had better not require the
+ // same register for uses and defs.
+ if (RI.Tied) {
+ markValueUsed(&VirtReg, ParentVNI);
+ LLVM_DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << MI);
+ return false;
+ }
+
+ // Before rematerializing into a register for a single instruction, try to
+ // fold a load into the instruction. That avoids allocating a new register.
+ if (RM.OrigMI->canFoldAsLoad() &&
+ foldMemoryOperand(Ops, RM.OrigMI)) {
+ Edit->markRematerialized(RM.ParentVNI);
+ ++NumFoldedLoads;
+ return true;
+ }
+
+ // If we can't guarantee that we'll be able to actually assign the new vreg,
+ // we can't remat.
+ if (!canGuaranteeAssignmentAfterRemat(VirtReg.reg(), MI)) {
+ markValueUsed(&VirtReg, ParentVNI);
+ LLVM_DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI);
+ return false;
+ }
+
+ // Allocate a new register for the remat.
+ Register NewVReg = Edit->createFrom(Original);
+
+ // Finally we can rematerialize OrigMI before MI.
+ SlotIndex DefIdx =
+ Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI);
+
+ // We take the DebugLoc from MI, since OrigMI may be attributed to a
+ // different source location.
+ auto *NewMI = LIS.getInstructionFromIndex(DefIdx);
+ NewMI->setDebugLoc(MI.getDebugLoc());
+
+ (void)DefIdx;
+ LLVM_DEBUG(dbgs() << "\tremat: " << DefIdx << '\t'
+ << *LIS.getInstructionFromIndex(DefIdx));
+
+ // Replace operands
+ for (const auto &OpPair : Ops) {
+ MachineOperand &MO = OpPair.first->getOperand(OpPair.second);
+ if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg()) {
+ MO.setReg(NewVReg);
+ MO.setIsKill();
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\t " << UseIdx << '\t' << MI << '\n');
+
+ ++NumRemats;
+ return true;
+}
+
+/// reMaterializeAll - Try to rematerialize as many uses as possible,
+/// and trim the live ranges after.
+void InlineSpiller::reMaterializeAll() {
+ if (!Edit->anyRematerializable())
+ return;
+
+ UsedValues.clear();
+
+ // Try to remat before all uses of snippets.
+ bool anyRemat = false;
+ for (Register Reg : RegsToSpill) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) {
+ // Debug values are not allowed to affect codegen.
+ if (MI.isDebugValue())
+ continue;
+
+ assert(!MI.isDebugInstr() && "Did not expect to find a use in debug "
+ "instruction that isn't a DBG_VALUE");
+
+ anyRemat |= reMaterializeFor(LI, MI);
+ }
+ }
+ if (!anyRemat)
+ return;
+
+ // Remove any values that were completely rematted.
+ for (Register Reg : RegsToSpill) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ for (VNInfo *VNI : LI.vnis()) {
+ if (VNI->isUnused() || VNI->isPHIDef() || UsedValues.count(VNI))
+ continue;
+ MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+ MI->addRegisterDead(Reg, &TRI);
+ if (!MI->allDefsAreDead())
+ continue;
+ LLVM_DEBUG(dbgs() << "All defs dead: " << *MI);
+ DeadDefs.push_back(MI);
+ }
+ }
+
+ // Eliminate dead code after remat. Note that some snippet copies may be
+ // deleted here.
+ if (DeadDefs.empty())
+ return;
+ LLVM_DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n");
+ Edit->eliminateDeadDefs(DeadDefs, RegsToSpill);
+
+  // LiveRangeEdit::eliminateDeadDef is used to remove dead defining
+  // instructions after rematerialization. To remove a VNI for a vreg from its
+  // LiveInterval, LiveIntervals::removeVRegDefAt is used. However, after all
+  // non-PHI VNIs are removed, PHI VNIs are still left in the LiveInterval.
+  // So to get rid of an unused register, we need to check whether it has any
+  // non-dbg references rather than whether it has a non-empty interval.
+ unsigned ResultPos = 0;
+ for (Register Reg : RegsToSpill) {
+ if (MRI.reg_nodbg_empty(Reg)) {
+ Edit->eraseVirtReg(Reg);
+ continue;
+ }
+
+ assert(LIS.hasInterval(Reg) &&
+ (!LIS.getInterval(Reg).empty() || !MRI.reg_nodbg_empty(Reg)) &&
+ "Empty and not used live-range?!");
+
+ RegsToSpill[ResultPos++] = Reg;
+ }
+ RegsToSpill.erase(RegsToSpill.begin() + ResultPos, RegsToSpill.end());
+ LLVM_DEBUG(dbgs() << RegsToSpill.size()
+ << " registers to spill after remat.\n");
+}
+
+//===----------------------------------------------------------------------===//
+// Spilling
+//===----------------------------------------------------------------------===//
+
+/// If MI is a load or store of StackSlot, it can be removed.
+bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, Register Reg) {
+ int FI = 0;
+ Register InstrReg = TII.isLoadFromStackSlot(*MI, FI);
+ bool IsLoad = InstrReg;
+ if (!IsLoad)
+ InstrReg = TII.isStoreToStackSlot(*MI, FI);
+
+ // We have a stack access. Is it the right register and slot?
+ if (InstrReg != Reg || FI != StackSlot)
+ return false;
+
+ if (!IsLoad)
+ HSpiller.rmFromMergeableSpills(*MI, StackSlot);
+
+ LLVM_DEBUG(dbgs() << "Coalescing stack access: " << *MI);
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+
+ if (IsLoad) {
+ ++NumReloadsRemoved;
+ --NumReloads;
+ } else {
+ ++NumSpillsRemoved;
+ --NumSpills;
+ }
+
+ return true;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+// Dump the range of instructions from B to E with their slot indexes.
+static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
+ MachineBasicBlock::iterator E,
+ LiveIntervals const &LIS,
+ const char *const header,
+ Register VReg = Register()) {
+ char NextLine = '\n';
+ char SlotIndent = '\t';
+
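+  // When dumping a single instruction, keep the header and the instruction on
+  // one line.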
+ if (std::next(B) == E) {
+ NextLine = ' ';
+ SlotIndent = ' ';
+ }
+
+ dbgs() << '\t' << header << ": " << NextLine;
+
+ for (MachineBasicBlock::iterator I = B; I != E; ++I) {
+ SlotIndex Idx = LIS.getInstructionIndex(*I).getRegSlot();
+
+ // If a register was passed in and this instruction has it as a
+ // destination that is marked as an early clobber, print the
+ // early-clobber slot index.
+ if (VReg) {
+ MachineOperand *MO = I->findRegisterDefOperand(VReg);
+ if (MO && MO->isEarlyClobber())
+ Idx = Idx.getRegSlot(true);
+ }
+
+ dbgs() << SlotIndent << Idx << '\t' << *I;
+ }
+}
+#endif
+
+/// foldMemoryOperand - Try folding stack slot references in Ops into their
+/// instructions.
+///
+/// @param Ops Operand indices from AnalyzeVirtRegInBundle().
+/// @param LoadMI Load instruction to use instead of stack slot when non-null.
+/// @return True on success.
+bool InlineSpiller::
+foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
+ MachineInstr *LoadMI) {
+ if (Ops.empty())
+ return false;
+ // Don't attempt folding in bundles.
+ MachineInstr *MI = Ops.front().first;
+ if (Ops.back().first != MI || MI->isBundled())
+ return false;
+
+ bool WasCopy = MI->isCopy();
+ Register ImpReg;
+
+ // TII::foldMemoryOperand will do what we need here for statepoint
+ // (fold load into use and remove corresponding def). We will replace
+ // uses of removed def with loads (spillAroundUses).
+ // For that to work we need to untie def and use to pass it through
+ // foldMemoryOperand and signal foldPatchpoint that it is allowed to
+ // fold them.
+ bool UntieRegs = MI->getOpcode() == TargetOpcode::STATEPOINT;
+
+ // Spill subregs if the target allows it.
+ // We always want to spill subregs for stackmap/patchpoint pseudos.
+ bool SpillSubRegs = TII.isSubregFoldable() ||
+ MI->getOpcode() == TargetOpcode::STATEPOINT ||
+ MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MI->getOpcode() == TargetOpcode::STACKMAP;
+
+ // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
+ // operands.
+ SmallVector<unsigned, 8> FoldOps;
+ for (const auto &OpPair : Ops) {
+ unsigned Idx = OpPair.second;
+ assert(MI == OpPair.first && "Instruction conflict during operand folding");
+ MachineOperand &MO = MI->getOperand(Idx);
+
+ // No point restoring an undef read, and we'll produce an invalid live
+ // interval.
+ // TODO: Is this really the correct way to handle undef tied uses?
+ if (MO.isUse() && !MO.readsReg() && !MO.isTied())
+ continue;
+
+ if (MO.isImplicit()) {
+ ImpReg = MO.getReg();
+ continue;
+ }
+
+ if (!SpillSubRegs && MO.getSubReg())
+ return false;
+ // We cannot fold a load instruction into a def.
+ if (LoadMI && MO.isDef())
+ return false;
+ // Tied use operands should not be passed to foldMemoryOperand.
+ if (UntieRegs || !MI->isRegTiedToDefOperand(Idx))
+ FoldOps.push_back(Idx);
+ }
+
+ // If we only have implicit uses, we won't be able to fold that.
+ // Moreover, TargetInstrInfo::foldMemoryOperand will assert if we try!
+ if (FoldOps.empty())
+ return false;
+
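+  // Track the span of instructions around MI so that any instructions created
+  // by the fold can be added to the slot index maps afterwards.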
+ MachineInstrSpan MIS(MI, MI->getParent());
+
+ SmallVector<std::pair<unsigned, unsigned> > TiedOps;
+ if (UntieRegs)
+ for (unsigned Idx : FoldOps) {
+ MachineOperand &MO = MI->getOperand(Idx);
+ if (!MO.isTied())
+ continue;
+ unsigned Tied = MI->findTiedOperandIdx(Idx);
+ if (MO.isUse())
+ TiedOps.emplace_back(Tied, Idx);
+ else {
+ assert(MO.isDef() && "Tied to not use and def?");
+ TiedOps.emplace_back(Idx, Tied);
+ }
+ MI->untieRegOperand(Idx);
+ }
+
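+  // Try the actual fold: against the load instruction if one was provided,
+  // otherwise against the stack slot.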
+ MachineInstr *FoldMI =
+ LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, &LIS)
+ : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS, &VRM);
+ if (!FoldMI) {
+ // Re-tie operands.
+ for (auto Tied : TiedOps)
+ MI->tieOperands(Tied.first, Tied.second);
+ return false;
+ }
+
+ // Remove LIS for any dead defs in the original MI not in FoldMI.
+ for (MIBundleOperands MO(*MI); MO.isValid(); ++MO) {
+ if (!MO->isReg())
+ continue;
+ Register Reg = MO->getReg();
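+    // Only physical, non-reserved registers are of interest here.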
+ if (!Reg || Reg.isVirtual() || MRI.isReserved(Reg)) {
+ continue;
+ }
+ // Skip non-Defs, including undef uses and internal reads.
+ if (MO->isUse())
+ continue;
+ PhysRegInfo RI = AnalyzePhysRegInBundle(*FoldMI, Reg, &TRI);
+ if (RI.FullyDefined)
+ continue;
+ // FoldMI does not define this physreg. Remove the LI segment.
+ assert(MO->isDead() && "Cannot fold physreg def");
+ SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
+ LIS.removePhysRegDefAt(Reg.asMCReg(), Idx);
+ }
+
+ int FI;
+ if (TII.isStoreToStackSlot(*MI, FI) &&
+ HSpiller.rmFromMergeableSpills(*MI, FI))
+ --NumSpills;
+ LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
+ // Update the call site info.
+ if (MI->isCandidateForCallSiteEntry())
+ MI->getMF()->moveCallSiteInfo(MI, FoldMI);
+
+ // If we've folded a store into an instruction labelled with debug-info,
+ // record a substitution from the old operand to the memory operand. Handle
+ // the simple common case where operand 0 is the one being folded, plus when
+ // the destination operand is also a tied def. More values could be
+ // substituted / preserved with more analysis.
+ if (MI->peekDebugInstrNum() && Ops[0].second == 0) {
+ // Helper lambda.
+ auto MakeSubstitution = [this,FoldMI,MI,&Ops]() {
+      // Substitute old operand zero with the new instruction's memory operand.
+ unsigned OldOperandNum = Ops[0].second;
+ unsigned NewNum = FoldMI->getDebugInstrNum();
+ unsigned OldNum = MI->getDebugInstrNum();
+ MF.makeDebugValueSubstitution({OldNum, OldOperandNum},
+ {NewNum, MachineFunction::DebugOperandMemNumber});
+ };
+
+ const MachineOperand &Op0 = MI->getOperand(Ops[0].second);
+ if (Ops.size() == 1 && Op0.isDef()) {
+ MakeSubstitution();
+ } else if (Ops.size() == 2 && Op0.isDef() && MI->getOperand(1).isTied() &&
+ Op0.getReg() == MI->getOperand(1).getReg()) {
+ MakeSubstitution();
+ }
+ } else if (MI->peekDebugInstrNum()) {
+ // This is a debug-labelled instruction, but the operand being folded isn't
+ // at operand zero. Most likely this means it's a load being folded in.
+ // Substitute any register defs from operand zero up to the one being
+ // folded -- past that point, we don't know what the new operand indexes
+ // will be.
+ MF.substituteDebugValuesForInst(*MI, *FoldMI, Ops[0].second);
+ }
+
+ MI->eraseFromParent();
+
+ // Insert any new instructions other than FoldMI into the LIS maps.
+ assert(!MIS.empty() && "Unexpected empty span of instructions!");
+ for (MachineInstr &MI : MIS)
+ if (&MI != FoldMI)
+ LIS.InsertMachineInstrInMaps(MI);
+
+ // TII.foldMemoryOperand may have left some implicit operands on the
+ // instruction. Strip them.
+ if (ImpReg)
+ for (unsigned i = FoldMI->getNumOperands(); i; --i) {
+ MachineOperand &MO = FoldMI->getOperand(i - 1);
+ if (!MO.isReg() || !MO.isImplicit())
+ break;
+ if (MO.getReg() == ImpReg)
+ FoldMI->removeOperand(i - 1);
+ }
+
+ LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MIS.end(), LIS,
+ "folded"));
+
+ if (!WasCopy)
+ ++NumFolded;
+ else if (Ops.front().second == 0) {
+ ++NumSpills;
+    // If only one store instruction is required for the spill, add it to the
+    // mergeable list. On X86 AMX, two instructions are required to store, so
+    // we disable merging for that case.
+ if (std::distance(MIS.begin(), MIS.end()) <= 1)
+ HSpiller.addToMergeableSpills(*FoldMI, StackSlot, Original);
+ } else
+ ++NumReloads;
+ return true;
+}
+
+void InlineSpiller::insertReload(Register NewVReg,
+ SlotIndex Idx,
+ MachineBasicBlock::iterator MI) {
+ MachineBasicBlock &MBB = *MI->getParent();
+
+ MachineInstrSpan MIS(MI, &MBB);
+ TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
+ MRI.getRegClass(NewVReg), &TRI, Register());
+
+ LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
+
+ LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MI, LIS, "reload",
+ NewVReg));
+ ++NumReloads;
+}
+
+/// Check if \p Def fully defines a VReg with an undefined value.
+/// If that's the case, that means the value of VReg is actually
+/// not relevant.
+static bool isRealSpill(const MachineInstr &Def) {
+ if (!Def.isImplicitDef())
+ return true;
+ assert(Def.getNumOperands() == 1 &&
+ "Implicit def with more than one definition");
+ // We can say that the VReg defined by Def is undef, only if it is
+ // fully defined by Def. Otherwise, some of the lanes may not be
+ // undef and the value of the VReg matters.
+ return Def.getOperand(0).getSubReg();
+}
+
+/// insertSpill - Insert a spill of NewVReg after MI.
+void InlineSpiller::insertSpill(Register NewVReg, bool isKill,
+ MachineBasicBlock::iterator MI) {
+  // Spills are not terminators, so inserting a spill after a terminator would
+  // violate invariants in the MachineVerifier.
+ assert(!MI->isTerminator() && "Inserting a spill after a terminator");
+ MachineBasicBlock &MBB = *MI->getParent();
+
+ MachineInstrSpan MIS(MI, &MBB);
+ MachineBasicBlock::iterator SpillBefore = std::next(MI);
+ bool IsRealSpill = isRealSpill(*MI);
+
+ if (IsRealSpill)
+ TII.storeRegToStackSlot(MBB, SpillBefore, NewVReg, isKill, StackSlot,
+ MRI.getRegClass(NewVReg), &TRI, Register());
+ else
+ // Don't spill undef value.
+ // Anything works for undef, in particular keeping the memory
+ // uninitialized is a viable option and it saves code size and
+ // run time.
+ BuildMI(MBB, SpillBefore, MI->getDebugLoc(), TII.get(TargetOpcode::KILL))
+ .addReg(NewVReg, getKillRegState(isKill));
+
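+  // Register the newly inserted instruction(s) with the slot index maps and
+  // compute live intervals for any virtual registers they define.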
+ MachineBasicBlock::iterator Spill = std::next(MI);
+ LIS.InsertMachineInstrRangeInMaps(Spill, MIS.end());
+ for (const MachineInstr &MI : make_range(Spill, MIS.end()))
+ getVDefInterval(MI, LIS);
+
+ LLVM_DEBUG(
+ dumpMachineInstrRangeWithSlotIndex(Spill, MIS.end(), LIS, "spill"));
+ ++NumSpills;
+  // If only one store instruction is required for the spill, add it to the
+  // mergeable list. On X86 AMX, two instructions are required to store, so
+  // we disable merging for that case.
+ if (IsRealSpill && std::distance(Spill, MIS.end()) <= 1)
+ HSpiller.addToMergeableSpills(*Spill, StackSlot, Original);
+}
+
+/// spillAroundUses - insert spill code around each use of Reg.
+void InlineSpiller::spillAroundUses(Register Reg) {
+ LLVM_DEBUG(dbgs() << "spillAroundUses " << printReg(Reg) << '\n');
+ LiveInterval &OldLI = LIS.getInterval(Reg);
+
+ // Iterate over instructions using Reg.
+ for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) {
+ // Debug values are not allowed to affect codegen.
+ if (MI.isDebugValue()) {
+ // Modify DBG_VALUE now that the value is in a spill slot.
+ MachineBasicBlock *MBB = MI.getParent();
+ LLVM_DEBUG(dbgs() << "Modifying debug info due to spill:\t" << MI);
+ buildDbgValueForSpill(*MBB, &MI, MI, StackSlot, Reg);
+ MBB->erase(MI);
+ continue;
+ }
+
+ assert(!MI.isDebugInstr() && "Did not expect to find a use in debug "
+ "instruction that isn't a DBG_VALUE");
+
+ // Ignore copies to/from snippets. We'll delete them.
+ if (SnippetCopies.count(&MI))
+ continue;
+
+ // Stack slot accesses may coalesce away.
+ if (coalesceStackAccess(&MI, Reg))
+ continue;
+
+ // Analyze instruction.
+ SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
+ VirtRegInfo RI = AnalyzeVirtRegInBundle(MI, Reg, &Ops);
+
+ // Find the slot index where this instruction reads and writes OldLI.
+ // This is usually the def slot, except for tied early clobbers.
+ SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+ if (VNInfo *VNI = OldLI.getVNInfoAt(Idx.getRegSlot(true)))
+ if (SlotIndex::isSameInstr(Idx, VNI->def))
+ Idx = VNI->def;
+
+ // Check for a sibling copy.
+ Register SibReg = isCopyOfBundle(MI, Reg);
+ if (SibReg && isSibling(SibReg)) {
+ // This may actually be a copy between snippets.
+ if (isRegToSpill(SibReg)) {
+ LLVM_DEBUG(dbgs() << "Found new snippet copy: " << MI);
+ SnippetCopies.insert(&MI);
+ continue;
+ }
+ if (RI.Writes) {
+ if (hoistSpillInsideBB(OldLI, MI)) {
+ // This COPY is now dead, the value is already in the stack slot.
+ MI.getOperand(0).setIsDead();
+ DeadDefs.push_back(&MI);
+ continue;
+ }
+ } else {
+ // This is a reload for a sib-reg copy. Drop spills downstream.
+ LiveInterval &SibLI = LIS.getInterval(SibReg);
+ eliminateRedundantSpills(SibLI, SibLI.getVNInfoAt(Idx));
+ // The COPY will fold to a reload below.
+ }
+ }
+
+ // Attempt to fold memory ops.
+ if (foldMemoryOperand(Ops))
+ continue;
+
+ // Create a new virtual register for spill/fill.
+ // FIXME: Infer regclass from instruction alone.
+ Register NewVReg = Edit->createFrom(Reg);
+
+ if (RI.Reads)
+ insertReload(NewVReg, Idx, &MI);
+
+ // Rewrite instruction operands.
+ bool hasLiveDef = false;
+ for (const auto &OpPair : Ops) {
+ MachineOperand &MO = OpPair.first->getOperand(OpPair.second);
+ MO.setReg(NewVReg);
+ if (MO.isUse()) {
+ if (!OpPair.first->isRegTiedToDefOperand(OpPair.second))
+ MO.setIsKill();
+ } else {
+ if (!MO.isDead())
+ hasLiveDef = true;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << MI << '\n');
+
+ // FIXME: Use a second vreg if instruction has no tied ops.
+ if (RI.Writes)
+ if (hasLiveDef)
+ insertSpill(NewVReg, true, &MI);
+ }
+}
+
+/// spillAll - Spill all registers remaining after rematerialization.
+void InlineSpiller::spillAll() {
+ // Update LiveStacks now that we are committed to spilling.
+ if (StackSlot == VirtRegMap::NO_STACK_SLOT) {
+ StackSlot = VRM.assignVirt2StackSlot(Original);
+ StackInt = &LSS.getOrCreateInterval(StackSlot, MRI.getRegClass(Original));
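+    // The new stack interval gets a single value number; the live ranges of
+    // all spilled registers are merged into that value below.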
+ StackInt->getNextValue(SlotIndex(), LSS.getVNInfoAllocator());
+ } else
+ StackInt = &LSS.getInterval(StackSlot);
+
+ if (Original != Edit->getReg())
+ VRM.assignVirt2StackSlot(Edit->getReg(), StackSlot);
+
+ assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
+ for (Register Reg : RegsToSpill)
+ StackInt->MergeSegmentsInAsValue(LIS.getInterval(Reg),
+ StackInt->getValNumInfo(0));
+ LLVM_DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
+
+ // Spill around uses of all RegsToSpill.
+ for (Register Reg : RegsToSpill)
+ spillAroundUses(Reg);
+
+ // Hoisted spills may cause dead code.
+ if (!DeadDefs.empty()) {
+ LLVM_DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n");
+ Edit->eliminateDeadDefs(DeadDefs, RegsToSpill);
+ }
+
+ // Finally delete the SnippetCopies.
+ for (Register Reg : RegsToSpill) {
+ for (MachineInstr &MI :
+ llvm::make_early_inc_range(MRI.reg_instructions(Reg))) {
+ assert(SnippetCopies.count(&MI) && "Remaining use wasn't a snippet copy");
+ // FIXME: Do this with a LiveRangeEdit callback.
+ LIS.getSlotIndexes()->removeSingleMachineInstrFromMaps(MI);
+ MI.eraseFromBundle();
+ }
+ }
+
+ // Delete all spilled registers.
+ for (Register Reg : RegsToSpill)
+ Edit->eraseVirtReg(Reg);
+}
+
+void InlineSpiller::spill(LiveRangeEdit &edit) {
+ ++NumSpilledRanges;
+ Edit = &edit;
+ assert(!Register::isStackSlot(edit.getReg()) &&
+ "Trying to spill a stack slot.");
+ // Share a stack slot among all descendants of Original.
+ Original = VRM.getOriginal(edit.getReg());
+ StackSlot = VRM.getStackSlot(Original);
+ StackInt = nullptr;
+
+ LLVM_DEBUG(dbgs() << "Inline spilling "
+ << TRI.getRegClassName(MRI.getRegClass(edit.getReg()))
+ << ':' << edit.getParent() << "\nFrom original "
+ << printReg(Original) << '\n');
+ assert(edit.getParent().isSpillable() &&
+ "Attempting to spill already spilled value.");
+ assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
+
+ collectRegsToSpill();
+ reMaterializeAll();
+
+ // Remat may handle everything.
+ if (!RegsToSpill.empty())
+ spillAll();
+
+ Edit->calculateRegClassAndHint(MF, VRAI);
+}
+
+/// Optimizations after all the reg selections and spills are done.
+void InlineSpiller::postOptimization() { HSpiller.hoistAllSpills(); }
+
+/// When a spill is inserted, add the spill to MergeableSpills map.
+void HoistSpillHelper::addToMergeableSpills(MachineInstr &Spill, int StackSlot,
+ unsigned Original) {
+ BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+  // Save a copy of the LiveInterval in StackSlotToOrigLI because the original
+  // LiveInterval may be cleared after all its references are spilled.
+ if (!StackSlotToOrigLI.contains(StackSlot)) {
+ auto LI = std::make_unique<LiveInterval>(OrigLI.reg(), OrigLI.weight());
+ LI->assign(OrigLI, Allocator);
+ StackSlotToOrigLI[StackSlot] = std::move(LI);
+ }
+ SlotIndex Idx = LIS.getInstructionIndex(Spill);
+ VNInfo *OrigVNI = StackSlotToOrigLI[StackSlot]->getVNInfoAt(Idx.getRegSlot());
+ std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+ MergeableSpills[MIdx].insert(&Spill);
+}
+
+/// When a spill is removed, remove the spill from MergeableSpills map.
+/// Return true if the spill is removed successfully.
+bool HoistSpillHelper::rmFromMergeableSpills(MachineInstr &Spill,
+ int StackSlot) {
+ auto It = StackSlotToOrigLI.find(StackSlot);
+ if (It == StackSlotToOrigLI.end())
+ return false;
+ SlotIndex Idx = LIS.getInstructionIndex(Spill);
+ VNInfo *OrigVNI = It->second->getVNInfoAt(Idx.getRegSlot());
+ std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+ return MergeableSpills[MIdx].erase(&Spill);
+}
+
/// Check BB to see if it is a possible target BB to place a hoisted spill,
/// i.e., there should be a live sibling of OrigReg at the insert point.
+bool HoistSpillHelper::isSpillCandBB(LiveInterval &OrigLI, VNInfo &OrigVNI,
+ MachineBasicBlock &BB, Register &LiveReg) {
+ SlotIndex Idx = IPA.getLastInsertPoint(OrigLI, BB);
+  // The original def could be after the last insert point in the root block;
+  // in that case we can't hoist to here.
+ if (Idx < OrigVNI.def) {
+    // TODO: We could do better here. If LI is not live in the landing pad,
+    // we could hoist the spill after the LIP.
+ LLVM_DEBUG(dbgs() << "can't spill in root block - def after LIP\n");
+ return false;
+ }
+ Register OrigReg = OrigLI.reg();
+ SmallSetVector<Register, 16> &Siblings = Virt2SiblingsMap[OrigReg];
+ assert(OrigLI.getVNInfoAt(Idx) == &OrigVNI && "Unexpected VNI");
+
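+  // Any sibling of OrigReg that is live at the insert point can provide the
+  // value to be stored by the hoisted spill.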
+ for (const Register &SibReg : Siblings) {
+ LiveInterval &LI = LIS.getInterval(SibReg);
+ VNInfo *VNI = LI.getVNInfoAt(Idx);
+ if (VNI) {
+ LiveReg = SibReg;
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Remove redundant spills in the same BB. Save those redundant spills in
+/// SpillsToRm, and save the spill to keep and its BB in SpillBBToSpill map.
+void HoistSpillHelper::rmRedundantSpills(
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+  // For each spill seen, check SpillBBToSpill[] to see if its BB already
+  // contains another spill. If a BB contains more than one spill, only keep
+  // the earlier spill, i.e. the one with the smaller SlotIndex.
+ for (auto *const CurrentSpill : Spills) {
+ MachineBasicBlock *Block = CurrentSpill->getParent();
+ MachineDomTreeNode *Node = MDT.getBase().getNode(Block);
+ MachineInstr *PrevSpill = SpillBBToSpill[Node];
+ if (PrevSpill) {
+ SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
+ SlotIndex CIdx = LIS.getInstructionIndex(*CurrentSpill);
+ MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
+ MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
+ SpillsToRm.push_back(SpillToRm);
+ SpillBBToSpill[MDT.getBase().getNode(Block)] = SpillToKeep;
+ } else {
+ SpillBBToSpill[MDT.getBase().getNode(Block)] = CurrentSpill;
+ }
+ }
+ for (auto *const SpillToRm : SpillsToRm)
+ Spills.erase(SpillToRm);
+}
+
+/// Starting from \p Root find a top-down traversal order of the dominator
+/// tree to visit all basic blocks containing the elements of \p Spills.
+/// Redundant spills will be found and put into \p SpillsToRm at the same
+/// time. \p SpillBBToSpill will be populated as part of the process and
+/// maps a basic block to the first store occurring in the basic block.
+/// \post SpillsToRm.union(Spills\@post) == Spills\@pre
+void HoistSpillHelper::getVisitOrders(
+ MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineDomTreeNode *> &Orders,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKeep,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+ // The set contains all the possible BB nodes to which we may hoist
+ // original spills.
+ SmallPtrSet<MachineDomTreeNode *, 8> WorkSet;
+ // Save the BB nodes on the path from the first BB node containing
+ // non-redundant spill to the Root node.
+ SmallPtrSet<MachineDomTreeNode *, 8> NodesOnPath;
+  // All the spills to be hoisted must originate from a single def instruction
+  // of OrigReg, which means the def instruction dominates all the spills to be
+  // hoisted. We choose the BB where the def instruction is located as the
+  // Root.
+ MachineDomTreeNode *RootIDomNode = MDT[Root]->getIDom();
+  // For every dominator tree node containing a spill, walk up the dominator
+  // tree towards the Root node until it is reached. If another node containing
+  // a spill is found along the path, the spill seen earlier is redundant and
+  // the node containing it will be removed. All the nodes on the path from the
+  // first node with a non-redundant spill to the Root node are added to the
+  // WorkSet, which after the loop below contains all the possible locations
+  // where spills may be hoisted to.
+ for (auto *const Spill : Spills) {
+ MachineBasicBlock *Block = Spill->getParent();
+ MachineDomTreeNode *Node = MDT[Block];
+ MachineInstr *SpillToRm = nullptr;
+ while (Node != RootIDomNode) {
+ // If Node dominates Block, and it already contains a spill, the spill in
+ // Block will be redundant.
+ if (Node != MDT[Block] && SpillBBToSpill[Node]) {
+ SpillToRm = SpillBBToSpill[MDT[Block]];
+ break;
+        // If the Node is already in the WorkSet, the path from it to the Root
+        // node has already been traversed by another spill, so there is no
+        // need to repeat the walk.
+ } else if (WorkSet.count(Node)) {
+ break;
+ } else {
+ NodesOnPath.insert(Node);
+ }
+ Node = Node->getIDom();
+ }
+ if (SpillToRm) {
+ SpillsToRm.push_back(SpillToRm);
+ } else {
+      // Add the BB containing the original spill to SpillsToKeep, i.e., set
+      // the initial status before hoisting starts. The value for BBs
+      // containing original spills is set to 0, in order to distinguish them
+      // from BBs containing hoisted spills, which will be inserted into
+      // SpillsToKeep later during hoisting.
+ SpillsToKeep[MDT[Block]] = 0;
+ WorkSet.insert(NodesOnPath.begin(), NodesOnPath.end());
+ }
+ NodesOnPath.clear();
+ }
+
+ // Sort the nodes in WorkSet in top-down order and save the nodes
+ // in Orders. Orders will be used for hoisting in runHoistSpills.
+ unsigned idx = 0;
+ Orders.push_back(MDT.getBase().getNode(Root));
+ do {
+ MachineDomTreeNode *Node = Orders[idx++];
+ for (MachineDomTreeNode *Child : Node->children()) {
+ if (WorkSet.count(Child))
+ Orders.push_back(Child);
+ }
+ } while (idx != Orders.size());
+  assert(Orders.size() == WorkSet.size() &&
+         "Orders and WorkSet have different sizes");
+
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "Orders size is " << Orders.size() << "\n");
+ SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+ for (; RIt != Orders.rend(); RIt++)
+ LLVM_DEBUG(dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",");
+ LLVM_DEBUG(dbgs() << "\n");
+#endif
+}
+
+/// Try to hoist spills according to BB hotness. The spills to be removed will
+/// be saved in \p SpillsToRm. The spills to be inserted will be saved in
+/// \p SpillsToIns.
+void HoistSpillHelper::runHoistSpills(
+ LiveInterval &OrigLI, VNInfo &OrigVNI,
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns) {
+ // Visit order of dominator tree nodes.
+ SmallVector<MachineDomTreeNode *, 32> Orders;
+ // SpillsToKeep contains all the nodes where spills are to be inserted
+ // during hoisting. If the spill to be inserted is an original spill
+ // (not a hoisted one), the value of the map entry is 0. If the spill
+ // is a hoisted spill, the value of the map entry is the VReg to be used
+ // as the source of the spill.
+ DenseMap<MachineDomTreeNode *, unsigned> SpillsToKeep;
+ // Map from BB to the first spill inside of it.
+ DenseMap<MachineDomTreeNode *, MachineInstr *> SpillBBToSpill;
+
+ rmRedundantSpills(Spills, SpillsToRm, SpillBBToSpill);
+
+ MachineBasicBlock *Root = LIS.getMBBFromIndex(OrigVNI.def);
+ getVisitOrders(Root, Spills, Orders, SpillsToRm, SpillsToKeep,
+ SpillBBToSpill);
+
+  // SpillsInSubTreeMap maps a dom tree node to a pair of a node set and the
+  // total cost of all the spills inside those nodes. The node set holds the
+  // locations where spills are to be inserted within the subtree of the
+  // current node.
+ using NodesCostPair =
+ std::pair<SmallPtrSet<MachineDomTreeNode *, 16>, BlockFrequency>;
+ DenseMap<MachineDomTreeNode *, NodesCostPair> SpillsInSubTreeMap;
+
+ // Iterate Orders set in reverse order, which will be a bottom-up order
+ // in the dominator tree. Once we visit a dom tree node, we know its
+ // children have already been visited and the spill locations in the
+ // subtrees of all the children have been determined.
+ SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+ for (; RIt != Orders.rend(); RIt++) {
+ MachineBasicBlock *Block = (*RIt)->getBlock();
+
+ // If Block contains an original spill, simply continue.
+ if (SpillsToKeep.contains(*RIt) && !SpillsToKeep[*RIt]) {
+ SpillsInSubTreeMap[*RIt].first.insert(*RIt);
+ // SpillsInSubTreeMap[*RIt].second contains the cost of spill.
+ SpillsInSubTreeMap[*RIt].second = MBFI.getBlockFreq(Block);
+ continue;
+ }
+
+ // Collect spills in subtree of current node (*RIt) to
+ // SpillsInSubTreeMap[*RIt].first.
+ for (MachineDomTreeNode *Child : (*RIt)->children()) {
+ if (!SpillsInSubTreeMap.contains(Child))
+ continue;
+      // The statement "SpillsInSubTree = SpillsInSubTreeMap[*RIt].first" below
+      // must come before the begin and end iterators of
+      // SpillsInSubTreeMap[Child].first are obtained; otherwise those
+      // iterators may be invalidated when SpillsInSubTreeMap[*RIt] is accessed
+      // for the first time, since the map may grow and move its original
+      // buckets.
+ SmallPtrSet<MachineDomTreeNode *, 16> &SpillsInSubTree =
+ SpillsInSubTreeMap[*RIt].first;
+ BlockFrequency &SubTreeCost = SpillsInSubTreeMap[*RIt].second;
+ SubTreeCost += SpillsInSubTreeMap[Child].second;
+ auto BI = SpillsInSubTreeMap[Child].first.begin();
+ auto EI = SpillsInSubTreeMap[Child].first.end();
+ SpillsInSubTree.insert(BI, EI);
+ SpillsInSubTreeMap.erase(Child);
+ }
+
+ SmallPtrSet<MachineDomTreeNode *, 16> &SpillsInSubTree =
+ SpillsInSubTreeMap[*RIt].first;
+ BlockFrequency &SubTreeCost = SpillsInSubTreeMap[*RIt].second;
+ // No spills in subtree, simply continue.
+ if (SpillsInSubTree.empty())
+ continue;
+
+ // Check whether Block is a possible candidate to insert spill.
+ Register LiveReg;
+ if (!isSpillCandBB(OrigLI, OrigVNI, *Block, LiveReg))
+ continue;
+
+ // If there are multiple spills that could be merged, bias a little
+ // to hoist the spill.
+ BranchProbability MarginProb = (SpillsInSubTree.size() > 1)
+ ? BranchProbability(9, 10)
+ : BranchProbability(1, 1);
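+    // Hoist only if the combined frequency of the spills in the subtree
+    // exceeds the (possibly discounted) frequency of a single spill in this
+    // block.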
+ if (SubTreeCost > MBFI.getBlockFreq(Block) * MarginProb) {
+ // Hoist: Move spills to current Block.
+ for (auto *const SpillBB : SpillsInSubTree) {
+        // If SpillBB is a BB containing an original spill, add that spill
+        // to SpillsToRm.
+ if (SpillsToKeep.contains(SpillBB) && !SpillsToKeep[SpillBB]) {
+ MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
+ SpillsToRm.push_back(SpillToRm);
+ }
+        // SpillBB no longer contains a spill; remove it from SpillsToKeep.
+ SpillsToKeep.erase(SpillBB);
+ }
+ // Current Block is the BB containing the new hoisted spill. Add it to
+ // SpillsToKeep. LiveReg is the source of the new spill.
+ SpillsToKeep[*RIt] = LiveReg;
+ LLVM_DEBUG({
+ dbgs() << "spills in BB: ";
+ for (const auto Rspill : SpillsInSubTree)
+ dbgs() << Rspill->getBlock()->getNumber() << " ";
+ dbgs() << "were promoted to BB" << (*RIt)->getBlock()->getNumber()
+ << "\n";
+ });
+ SpillsInSubTree.clear();
+ SpillsInSubTree.insert(*RIt);
+ SubTreeCost = MBFI.getBlockFreq(Block);
+ }
+ }
+ // For spills in SpillsToKeep with LiveReg set (i.e., not original spill),
+ // save them to SpillsToIns.
+ for (const auto &Ent : SpillsToKeep) {
+ if (Ent.second)
+ SpillsToIns[Ent.first->getBlock()] = Ent.second;
+ }
+}
+
+/// For spills with equal values, remove redundant spills and hoist the
+/// remaining ones to colder spots.
+///
+/// Spills with equal values are collected into the same set in MergeableSpills
+/// when each spill is inserted. These equal spills originate from the same
+/// defining instruction and are dominated by it. Before hoisting all the equal
+/// spills, redundant spills inside the same BB are first marked for deletion.
+/// Then, starting from the remaining spills, walk up the dominator tree
+/// towards the Root node where the defining instruction is located, mark the
+/// dominated spills for deletion along the way, and collect the BB nodes on
+/// the path from the non-dominated spills to the defining instruction into a
+/// WorkSet. The nodes in the WorkSet are the candidate places where we
+/// consider hoisting the spills. We iterate the WorkSet in bottom-up order and
+/// for each node decide whether to hoist the spills inside its subtree to that
+/// node. This way we can get a local benefit even when hoisting all the equal
+/// spills to one cold place is impossible.
+void HoistSpillHelper::hoistAllSpills() {
+ SmallVector<Register, 4> NewVRegs;
+ LiveRangeEdit Edit(nullptr, NewVRegs, MF, LIS, &VRM, this);
+
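+  // Build the sibling map: virtual registers split from the same original
+  // register and still having definitions are grouped together, so sibling
+  // registers sharing a stack slot can be found quickly.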
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ Register Original = VRM.getPreSplitReg(Reg);
+ if (!MRI.def_empty(Reg))
+ Virt2SiblingsMap[Original].insert(Reg);
+ }
+
+ // Each entry in MergeableSpills contains a spill set with equal values.
+ for (auto &Ent : MergeableSpills) {
+ int Slot = Ent.first.first;
+ LiveInterval &OrigLI = *StackSlotToOrigLI[Slot];
+ VNInfo *OrigVNI = Ent.first.second;
+ SmallPtrSet<MachineInstr *, 16> &EqValSpills = Ent.second;
+ if (Ent.second.empty())
+ continue;
+
+ LLVM_DEBUG({
+ dbgs() << "\nFor Slot" << Slot << " and VN" << OrigVNI->id << ":\n"
+ << "Equal spills in BB: ";
+ for (const auto spill : EqValSpills)
+ dbgs() << spill->getParent()->getNumber() << " ";
+ dbgs() << "\n";
+ });
+
+ // SpillsToRm is the spill set to be removed from EqValSpills.
+ SmallVector<MachineInstr *, 16> SpillsToRm;
+ // SpillsToIns is the spill set to be newly inserted after hoisting.
+ DenseMap<MachineBasicBlock *, unsigned> SpillsToIns;
+
+ runHoistSpills(OrigLI, *OrigVNI, EqValSpills, SpillsToRm, SpillsToIns);
+
+ LLVM_DEBUG({
+ dbgs() << "Finally inserted spills in BB: ";
+ for (const auto &Ispill : SpillsToIns)
+ dbgs() << Ispill.first->getNumber() << " ";
+ dbgs() << "\nFinally removed spills in BB: ";
+ for (const auto Rspill : SpillsToRm)
+ dbgs() << Rspill->getParent()->getNumber() << " ";
+ dbgs() << "\n";
+ });
+
+ // Stack live range update.
+ LiveInterval &StackIntvl = LSS.getInterval(Slot);
+ if (!SpillsToIns.empty() || !SpillsToRm.empty())
+ StackIntvl.MergeValueInAsValue(OrigLI, OrigVNI,
+ StackIntvl.getValNumInfo(0));
+
+ // Insert hoisted spills.
+ for (auto const &Insert : SpillsToIns) {
+ MachineBasicBlock *BB = Insert.first;
+ Register LiveReg = Insert.second;
+ MachineBasicBlock::iterator MII = IPA.getLastInsertPointIter(OrigLI, *BB);
+ MachineInstrSpan MIS(MII, BB);
+ TII.storeRegToStackSlot(*BB, MII, LiveReg, false, Slot,
+ MRI.getRegClass(LiveReg), &TRI, Register());
+ LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MII);
+ for (const MachineInstr &MI : make_range(MIS.begin(), MII))
+ getVDefInterval(MI, LIS);
+ ++NumSpills;
+ }
+
+ // Remove redundant spills or change them to dead instructions.
+ NumSpills -= SpillsToRm.size();
+ for (auto *const RMEnt : SpillsToRm) {
+ RMEnt->setDesc(TII.get(TargetOpcode::KILL));
+ for (unsigned i = RMEnt->getNumOperands(); i; --i) {
+ MachineOperand &MO = RMEnt->getOperand(i - 1);
+ if (MO.isReg() && MO.isImplicit() && MO.isDef() && !MO.isDead())
+ RMEnt->removeOperand(i - 1);
+ }
+ }
+ Edit.eliminateDeadDefs(SpillsToRm, std::nullopt);
+ }
+}
+
+/// When a virtual register is cloned, the \p New register should get the same
+/// physreg or stack slot as the \p Old register.
+void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) {
+ if (VRM.hasPhys(Old))
+ VRM.assignVirt2Phys(New, VRM.getPhys(Old));
+ else if (VRM.getStackSlot(Old) != VirtRegMap::NO_STACK_SLOT)
+ VRM.assignVirt2StackSlot(New, VRM.getStackSlot(Old));
+ else
+ llvm_unreachable("VReg should be assigned either physreg or stackslot");
+ if (VRM.hasShape(Old))
+ VRM.assignVirt2Shape(New, VRM.getShape(Old));
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/InterferenceCache.cpp b/contrib/llvm-project/llvm/lib/CodeGen/InterferenceCache.cpp
new file mode 100644
index 000000000000..ae197ee5553a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/InterferenceCache.cpp
@@ -0,0 +1,258 @@
+//===- InterferenceCache.cpp - Caching per-block interference -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// InterferenceCache remembers per-block interference in LiveIntervalUnions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InterferenceCache.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+// Static member used for null interference cursors.
+const InterferenceCache::BlockInterference
+ InterferenceCache::Cursor::NoInterference;
+
+// Initializes PhysRegEntries (instead of a SmallVector, PhysRegEntries is a
+// buffer of size NumPhysRegs to speed up alloc/clear for targets with large
+// reg files). Calloced memory is used for good form, and it quiets tools like
+// Valgrind too, but zero-initialized memory is not required by the algorithm:
+// this is because PhysRegEntries works like a SparseSet and its entries are
+// only valid when there is a corresponding CacheEntries assignment. There is
+// also support for when pass managers are reused for targets with different
+// numbers of PhysRegs: in this case PhysRegEntries is freed and reinitialized.
+void InterferenceCache::reinitPhysRegEntries() {
+ if (PhysRegEntriesCount == TRI->getNumRegs()) return;
+ free(PhysRegEntries);
+ PhysRegEntriesCount = TRI->getNumRegs();
+ PhysRegEntries = static_cast<unsigned char*>(
+ safe_calloc(PhysRegEntriesCount, sizeof(unsigned char)));
+}
+
+void InterferenceCache::init(MachineFunction *mf,
+ LiveIntervalUnion *liuarray,
+ SlotIndexes *indexes,
+ LiveIntervals *lis,
+ const TargetRegisterInfo *tri) {
+ MF = mf;
+ LIUArray = liuarray;
+ TRI = tri;
+ reinitPhysRegEntries();
+ for (Entry &E : Entries)
+ E.clear(mf, indexes, lis);
+}
+
+InterferenceCache::Entry *InterferenceCache::get(MCRegister PhysReg) {
+ unsigned char E = PhysRegEntries[PhysReg.id()];
+ if (E < CacheEntries && Entries[E].getPhysReg() == PhysReg) {
+ if (!Entries[E].valid(LIUArray, TRI))
+ Entries[E].revalidate(LIUArray, TRI);
+ return &Entries[E];
+ }
+ // No valid entry exists, pick the next round-robin entry.
+ E = RoundRobin;
+ if (++RoundRobin == CacheEntries)
+ RoundRobin = 0;
+ for (unsigned i = 0; i != CacheEntries; ++i) {
+ // Skip entries that are in use.
+ if (Entries[E].hasRefs()) {
+ if (++E == CacheEntries)
+ E = 0;
+ continue;
+ }
+ Entries[E].reset(PhysReg, LIUArray, TRI, MF);
+ PhysRegEntries[PhysReg] = E;
+ return &Entries[E];
+ }
+ llvm_unreachable("Ran out of interference cache entries.");
+}
+
+/// revalidate - LIU contents have changed, update tags.
+void InterferenceCache::Entry::revalidate(LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI) {
+ // Invalidate all block entries.
+ ++Tag;
+ // Invalidate all iterators.
+ PrevPos = SlotIndex();
+ unsigned i = 0;
+ for (MCRegUnit Unit : TRI->regunits(PhysReg))
+ RegUnits[i++].VirtTag = LIUArray[Unit].getTag();
+}
+
+void InterferenceCache::Entry::reset(MCRegister physReg,
+ LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction *MF) {
+ assert(!hasRefs() && "Cannot reset cache entry with references");
+ // LIU's changed, invalidate cache.
+ ++Tag;
+ PhysReg = physReg;
+ Blocks.resize(MF->getNumBlockIDs());
+
+ // Reset iterators.
+ PrevPos = SlotIndex();
+ RegUnits.clear();
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ RegUnits.push_back(LIUArray[Unit]);
+ RegUnits.back().Fixed = &LIS->getRegUnit(Unit);
+ }
+}
+
+bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI) {
+ unsigned i = 0, e = RegUnits.size();
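+  // The entry is valid only if it still covers exactly PhysReg's reg units and
+  // none of the underlying unions have changed since the tags were recorded.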
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ if (i == e)
+ return false;
+ if (LIUArray[Unit].changedSince(RegUnits[i].VirtTag))
+ return false;
+ ++i;
+ }
+ return i == e;
+}
+
+void InterferenceCache::Entry::update(unsigned MBBNum) {
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = Indexes->getMBBRange(MBBNum);
+
+ // Use advanceTo only when possible.
+ if (PrevPos != Start) {
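+    // Moving backwards (or starting fresh) requires a full find(); moving
+    // forwards can reuse the iterators via the cheaper advanceTo().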
+ if (!PrevPos.isValid() || Start < PrevPos) {
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ RegUnitInfo &RUI = RegUnits[i];
+ RUI.VirtI.find(Start);
+ RUI.FixedI = RUI.Fixed->find(Start);
+ }
+ } else {
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ RegUnitInfo &RUI = RegUnits[i];
+ RUI.VirtI.advanceTo(Start);
+ if (RUI.FixedI != RUI.Fixed->end())
+ RUI.FixedI = RUI.Fixed->advanceTo(RUI.FixedI, Start);
+ }
+ }
+ PrevPos = Start;
+ }
+
+ MachineFunction::const_iterator MFI =
+ MF->getBlockNumbered(MBBNum)->getIterator();
+ BlockInterference *BI = &Blocks[MBBNum];
+ ArrayRef<SlotIndex> RegMaskSlots;
+ ArrayRef<const uint32_t*> RegMaskBits;
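+  // Scan forward one block at a time until interference is found; blocks that
+  // turn out to be interference-free along the way are cached as well.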
+ while (true) {
+ BI->Tag = Tag;
+ BI->First = BI->Last = SlotIndex();
+
+ // Check for first interference from virtregs.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI;
+ if (!I.valid())
+ continue;
+ SlotIndex StartI = I.start();
+ if (StartI >= Stop)
+ continue;
+ if (!BI->First.isValid() || StartI < BI->First)
+ BI->First = StartI;
+ }
+
+ // Same thing for fixed interference.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveInterval::const_iterator I = RegUnits[i].FixedI;
+ LiveInterval::const_iterator E = RegUnits[i].Fixed->end();
+ if (I == E)
+ continue;
+ SlotIndex StartI = I->start;
+ if (StartI >= Stop)
+ continue;
+ if (!BI->First.isValid() || StartI < BI->First)
+ BI->First = StartI;
+ }
+
+ // Also check for register mask interference.
+ RegMaskSlots = LIS->getRegMaskSlotsInBlock(MBBNum);
+ RegMaskBits = LIS->getRegMaskBitsInBlock(MBBNum);
+ SlotIndex Limit = BI->First.isValid() ? BI->First : Stop;
+ for (unsigned i = 0, e = RegMaskSlots.size();
+ i != e && RegMaskSlots[i] < Limit; ++i)
+ if (MachineOperand::clobbersPhysReg(RegMaskBits[i], PhysReg)) {
+ // Register mask i clobbers PhysReg before the LIU interference.
+ BI->First = RegMaskSlots[i];
+ break;
+ }
+
+ PrevPos = Stop;
+ if (BI->First.isValid())
+ break;
+
+ // No interference in this block? Go ahead and precompute the next block.
+ if (++MFI == MF->end())
+ return;
+ MBBNum = MFI->getNumber();
+ BI = &Blocks[MBBNum];
+ if (BI->Tag == Tag)
+ return;
+ std::tie(Start, Stop) = Indexes->getMBBRange(MBBNum);
+ }
+
+ // Check for last interference in block.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI;
+ if (!I.valid() || I.start() >= Stop)
+ continue;
+ I.advanceTo(Stop);
+ bool Backup = !I.valid() || I.start() >= Stop;
+ if (Backup)
+ --I;
+ SlotIndex StopI = I.stop();
+ if (!BI->Last.isValid() || StopI > BI->Last)
+ BI->Last = StopI;
+ if (Backup)
+ ++I;
+ }
+
+ // Fixed interference.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveInterval::iterator &I = RegUnits[i].FixedI;
+ LiveRange *LR = RegUnits[i].Fixed;
+ if (I == LR->end() || I->start >= Stop)
+ continue;
+ I = LR->advanceTo(I, Stop);
+ bool Backup = I == LR->end() || I->start >= Stop;
+ if (Backup)
+ --I;
+ SlotIndex StopI = I->end;
+ if (!BI->Last.isValid() || StopI > BI->Last)
+ BI->Last = StopI;
+ if (Backup)
+ ++I;
+ }
+
+ // Also check for register mask interference.
+ SlotIndex Limit = BI->Last.isValid() ? BI->Last : Start;
+ for (unsigned i = RegMaskSlots.size();
+ i && RegMaskSlots[i-1].getDeadSlot() > Limit; --i)
+ if (MachineOperand::clobbersPhysReg(RegMaskBits[i-1], PhysReg)) {
+ // Register mask i-1 clobbers PhysReg after the LIU interference.
+ // Model the regmask clobber as a dead def.
+ BI->Last = RegMaskSlots[i-1].getDeadSlot();
+ break;
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/InterferenceCache.h b/contrib/llvm-project/llvm/lib/CodeGen/InterferenceCache.h
new file mode 100644
index 000000000000..2a176b4f2cf7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/InterferenceCache.h
@@ -0,0 +1,243 @@
+//===- InterferenceCache.h - Caching per-block interference ----*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// InterferenceCache remembers per-block interference from LiveIntervalUnions,
+// fixed RegUnit interference, and register masks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_INTERFERENCECACHE_H
+#define LLVM_LIB_CODEGEN_INTERFERENCECACHE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+
+namespace llvm {
+
+class LiveIntervals;
+class MachineFunction;
+class TargetRegisterInfo;
+
+class LLVM_LIBRARY_VISIBILITY InterferenceCache {
+ /// BlockInterference - information about the interference in a single basic
+ /// block.
+ struct BlockInterference {
+ unsigned Tag = 0;
+ SlotIndex First;
+ SlotIndex Last;
+
+ BlockInterference() = default;
+ };
+
+ /// Entry - A cache entry containing interference information for all aliases
+ /// of PhysReg in all basic blocks.
+ class Entry {
+ /// PhysReg - The register currently represented.
+ MCRegister PhysReg = 0;
+
+ /// Tag - Cache tag is changed when any of the underlying LiveIntervalUnions
+ /// change.
+ unsigned Tag = 0;
+
+ /// RefCount - The total number of Cursor instances referring to this Entry.
+ unsigned RefCount = 0;
+
+ /// MF - The current function.
+ MachineFunction *MF = nullptr;
+
+ /// Indexes - Mapping block numbers to SlotIndex ranges.
+ SlotIndexes *Indexes = nullptr;
+
+ /// LIS - Used for accessing register mask interference maps.
+ LiveIntervals *LIS = nullptr;
+
+ /// PrevPos - The previous position the iterators were moved to.
+ SlotIndex PrevPos;
+
+ /// RegUnitInfo - Information tracked about each RegUnit in PhysReg.
+ /// When PrevPos is set, the iterators are valid as if advanceTo(PrevPos)
+ /// had just been called.
+ struct RegUnitInfo {
+ /// Iterator pointing into the LiveIntervalUnion containing virtual
+ /// register interference.
+ LiveIntervalUnion::SegmentIter VirtI;
+
+ /// Tag of the LIU last time we looked.
+ unsigned VirtTag;
+
+ /// Fixed interference in RegUnit.
+ LiveRange *Fixed = nullptr;
+
+ /// Iterator pointing into the fixed RegUnit interference.
+ LiveInterval::iterator FixedI;
+
+ RegUnitInfo(LiveIntervalUnion &LIU) : VirtTag(LIU.getTag()) {
+ VirtI.setMap(LIU.getMap());
+ }
+ };
+
+    /// Info for each RegUnit in PhysReg. It is very rare for a PhysReg to have
+ /// more than 4 RegUnits.
+ SmallVector<RegUnitInfo, 4> RegUnits;
+
+ /// Blocks - Interference for each block in the function.
+ SmallVector<BlockInterference, 8> Blocks;
+
+ /// update - Recompute Blocks[MBBNum]
+ void update(unsigned MBBNum);
+
+ public:
+ Entry() = default;
+
+ void clear(MachineFunction *mf, SlotIndexes *indexes, LiveIntervals *lis) {
+ assert(!hasRefs() && "Cannot clear cache entry with references");
+ PhysReg = MCRegister::NoRegister;
+ MF = mf;
+ Indexes = indexes;
+ LIS = lis;
+ }
+
+ MCRegister getPhysReg() const { return PhysReg; }
+
+ void addRef(int Delta) { RefCount += Delta; }
+
+ bool hasRefs() const { return RefCount > 0; }
+
+ void revalidate(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
+
+ /// valid - Return true if this is a valid entry for physReg.
+ bool valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
+
+ /// reset - Initialize entry to represent physReg's aliases.
+ void reset(MCRegister physReg, LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI, const MachineFunction *MF);
+
+ /// get - Return an up to date BlockInterference.
+ BlockInterference *get(unsigned MBBNum) {
+ if (Blocks[MBBNum].Tag != Tag)
+ update(MBBNum);
+ return &Blocks[MBBNum];
+ }
+ };
+
+  // We don't keep a cache entry for every physical register; that would use
+  // too much memory. Instead, a fixed number of cache entries are used in a
+  // round-robin manner.
+ enum { CacheEntries = 32 };
+
+ const TargetRegisterInfo *TRI = nullptr;
+ LiveIntervalUnion *LIUArray = nullptr;
+ MachineFunction *MF = nullptr;
+
+ // Point to an entry for each physreg. The entry pointed to may not be up to
+ // date, and it may have been reused for a different physreg.
+ unsigned char* PhysRegEntries = nullptr;
+ size_t PhysRegEntriesCount = 0;
+
+ // Next round-robin entry to be picked.
+ unsigned RoundRobin = 0;
+
+ // The actual cache entries.
+ Entry Entries[CacheEntries];
+
+ // get - Get a valid entry for PhysReg.
+ Entry *get(MCRegister PhysReg);
+
+public:
+ InterferenceCache() = default;
+ InterferenceCache &operator=(const InterferenceCache &other) = delete;
+ InterferenceCache(const InterferenceCache &other) = delete;
+ ~InterferenceCache() {
+ free(PhysRegEntries);
+ }
+
+ void reinitPhysRegEntries();
+
+ /// init - Prepare cache for a new function.
+ void init(MachineFunction *mf, LiveIntervalUnion *liuarray,
+ SlotIndexes *indexes, LiveIntervals *lis,
+ const TargetRegisterInfo *tri);
+
+ /// getMaxCursors - Return the maximum number of concurrent cursors that can
+ /// be supported.
+ unsigned getMaxCursors() const { return CacheEntries; }
+
+ /// Cursor - The primary query interface for the block interference cache.
+ class Cursor {
+ Entry *CacheEntry = nullptr;
+ const BlockInterference *Current = nullptr;
+ static const BlockInterference NoInterference;
+
+ void setEntry(Entry *E) {
+ Current = nullptr;
+ // Update reference counts. Nothing happens when RefCount reaches 0, so
+ // we don't have to check for E == CacheEntry etc.
+ if (CacheEntry)
+ CacheEntry->addRef(-1);
+ CacheEntry = E;
+ if (CacheEntry)
+ CacheEntry->addRef(+1);
+ }
+
+ public:
+ /// Cursor - Create a dangling cursor.
+ Cursor() = default;
+
+ Cursor(const Cursor &O) {
+ setEntry(O.CacheEntry);
+ }
+
+ Cursor &operator=(const Cursor &O) {
+ setEntry(O.CacheEntry);
+ return *this;
+ }
+
+ ~Cursor() { setEntry(nullptr); }
+
+ /// setPhysReg - Point this cursor to PhysReg's interference.
+ void setPhysReg(InterferenceCache &Cache, MCRegister PhysReg) {
+ // Release reference before getting a new one. That guarantees we can
+ // actually have CacheEntries live cursors.
+ setEntry(nullptr);
+ if (PhysReg.isValid())
+ setEntry(Cache.get(PhysReg));
+ }
+
+    /// moveToBlock - Move the cursor to basic block MBBNum.
+ void moveToBlock(unsigned MBBNum) {
+ Current = CacheEntry ? CacheEntry->get(MBBNum) : &NoInterference;
+ }
+
+ /// hasInterference - Return true if the current block has any interference.
+ bool hasInterference() {
+ return Current->First.isValid();
+ }
+
+ /// first - Return the starting index of the first interfering range in the
+ /// current block.
+ SlotIndex first() {
+ return Current->First;
+ }
+
+ /// last - Return the ending index of the last interfering range in the
+ /// current block.
+ SlotIndex last() {
+ return Current->Last;
+ }
+ };
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_INTERFERENCECACHE_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/InterleavedAccessPass.cpp
new file mode 100644
index 000000000000..6b3848531569
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -0,0 +1,538 @@
+//===- InterleavedAccessPass.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Interleaved Access pass, which identifies
+// interleaved memory accesses and transforms them into target specific
+// intrinsics.
+//
+// An interleaved load reads data from memory into several vectors,
+// DE-interleaving the data by a factor. An interleaved store writes several
+// vectors to memory, RE-interleaving the data by a factor.
+//
+// As interleaved accesses are difficult to identified in CodeGen (mainly
+// because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
+// IR), we identify and transform them to intrinsics in this pass so the
+// intrinsics can be easily matched into target specific instructions later in
+// CodeGen.
+//
+// E.g. An interleaved load (Factor = 2):
+// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
+// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
+//
+// It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
+// intrinsic in ARM backend.
+//
+// In X86, this can be further optimized into a set of target
+// specific loads followed by an optimized sequence of shuffles.
+//
+// E.g. An interleaved store (Factor = 3):
+// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+// store <12 x i32> %i.vec, <12 x i32>* %ptr
+//
+// It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
+// intrinsic in ARM backend.
+//
+// Similarly, a set of interleaved stores can be transformed into an optimized
+// sequence of shuffles followed by a set of target specific stores for X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "interleaved-access"
+
+static cl::opt<bool> LowerInterleavedAccesses(
+ "lower-interleaved-accesses",
+ cl::desc("Enable lowering interleaved accesses to intrinsics"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class InterleavedAccess : public FunctionPass {
+public:
+ static char ID;
+
+ InterleavedAccess() : FunctionPass(ID) {
+ initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "Interleaved Access Pass"; }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+private:
+ DominatorTree *DT = nullptr;
+ const TargetLowering *TLI = nullptr;
+
+ /// The maximum supported interleave factor.
+ unsigned MaxFactor = 0u;
+
+ /// Transform an interleaved load into target specific intrinsics.
+ bool lowerInterleavedLoad(LoadInst *LI,
+ SmallVector<Instruction *, 32> &DeadInsts);
+
+ /// Transform an interleaved store into target specific intrinsics.
+ bool lowerInterleavedStore(StoreInst *SI,
+ SmallVector<Instruction *, 32> &DeadInsts);
+
+ /// Transform a load and a deinterleave intrinsic into target specific
+ /// instructions.
+ bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
+ SmallVector<Instruction *, 32> &DeadInsts);
+
+ /// Transform an interleave intrinsic and a store into target specific
+ /// instructions.
+ bool lowerInterleaveIntrinsic(IntrinsicInst *II,
+ SmallVector<Instruction *, 32> &DeadInsts);
+
+ /// Returns true if the uses of an interleaved load by the
+ /// extractelement instructions in \p Extracts can be replaced by uses of the
+ /// shufflevector instructions in \p Shuffles instead. If so, the necessary
+ /// replacements are also performed.
+ bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
+ ArrayRef<ShuffleVectorInst *> Shuffles);
+
+ /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
+ /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
+ /// interleaving load. Any newly created shuffles that operate on \p LI will
+ /// be added to \p Shuffles. Returns true if any changes to the IR have been
+ /// made.
+ bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
+ LoadInst *LI);
+};
+
+} // end anonymous namespace.
+
+char InterleavedAccess::ID = 0;
+
+INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
+ "Lower interleaved memory accesses to target specific intrinsics", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
+ "Lower interleaved memory accesses to target specific intrinsics", false,
+ false)
+
+FunctionPass *llvm::createInterleavedAccessPass() {
+ return new InterleavedAccess();
+}
+
+/// Check if the mask is a DE-interleave mask of the given factor
+/// \p Factor like:
+/// <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
+static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
+ unsigned &Index) {
+ // Check all potential start indices from 0 to (Factor - 1).
+ for (Index = 0; Index < Factor; Index++) {
+ unsigned i = 0;
+
+ // Check that elements are in ascending order by Factor. Ignore undef
+ // elements.
+ for (; i < Mask.size(); i++)
+ if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
+ break;
+
+ if (i == Mask.size())
+ return true;
+ }
+
+ return false;
+}
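+
+// For illustration: with Factor = 2, the mask <1, 3, 5, 7> matches the pattern
+// <Index, Index+Factor, ...> for Index = 1, so the function returns true and
+// sets Index to 1, while a mask such as <0, 3, 4, 6> matches no start index
+// and is rejected.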
+
+/// Check if the mask is a DE-interleave mask for an interleaved load.
+///
+/// E.g. DE-interleave masks (Factor = 2) could be:
+/// <0, 2, 4, 6> (mask of index 0 to extract even elements)
+/// <1, 3, 5, 7> (mask of index 1 to extract odd elements)
+static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
+ unsigned &Index, unsigned MaxFactor,
+ unsigned NumLoadElements) {
+ if (Mask.size() < 2)
+ return false;
+
+ // Check potential Factors.
+ for (Factor = 2; Factor <= MaxFactor; Factor++) {
+ // Make sure we don't produce a load wider than the input load.
+ if (Mask.size() * Factor > NumLoadElements)
+ return false;
+ if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
+ return true;
+ }
+
+ return false;
+}
+
+/// Check if the mask can be used in an interleaved store.
+///
+/// It checks for a more general pattern than the RE-interleave mask.
+/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
+/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
+/// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
+/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
+///
+/// The particular case of an RE-interleave mask is:
+/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
+/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
+static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
+ unsigned MaxFactor) {
+ unsigned NumElts = SVI->getShuffleMask().size();
+ if (NumElts < 4)
+ return false;
+
+ // Check potential Factors.
+ for (Factor = 2; Factor <= MaxFactor; Factor++) {
+ if (SVI->isInterleave(Factor))
+ return true;
+ }
+
+ return false;
+}
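+
+// For illustration, the canonical RE-interleave mask for Factor = 2 and
+// LaneLen = 4 is <0, 4, 1, 5, 2, 6, 3, 7>; ShuffleVectorInst::isInterleave(2)
+// accepts it, and the loop above simply probes each factor from 2 up to the
+// target's MaxFactor until one matches.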
+
+bool InterleavedAccess::lowerInterleavedLoad(
+ LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
+ if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
+ return false;
+
+ // Check if all users of this load are shufflevectors. If we encounter any
+ // users that are extractelement instructions or binary operators, we save
+ // them to later check if they can be modified to extract from one of the
+ // shufflevectors instead of the load.
+
+ SmallVector<ShuffleVectorInst *, 4> Shuffles;
+ SmallVector<ExtractElementInst *, 4> Extracts;
+ // BinOpShuffles need to be handled only once in case both operands of the
+ // binop are the same load.
+ SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
+
+ for (auto *User : LI->users()) {
+ auto *Extract = dyn_cast<ExtractElementInst>(User);
+ if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
+ Extracts.push_back(Extract);
+ continue;
+ }
+ if (auto *BI = dyn_cast<BinaryOperator>(User)) {
+ if (all_of(BI->users(), [](auto *U) {
+ auto *SVI = dyn_cast<ShuffleVectorInst>(U);
+ return SVI && isa<UndefValue>(SVI->getOperand(1));
+ })) {
+ for (auto *SVI : BI->users())
+ BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
+ continue;
+ }
+ }
+ auto *SVI = dyn_cast<ShuffleVectorInst>(User);
+ if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
+ return false;
+
+ Shuffles.push_back(SVI);
+ }
+
+ if (Shuffles.empty() && BinOpShuffles.empty())
+ return false;
+
+ unsigned Factor, Index;
+
+ unsigned NumLoadElements =
+ cast<FixedVectorType>(LI->getType())->getNumElements();
+ auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
+ // Check if the first shufflevector is a DE-interleave shuffle.
+ if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
+ NumLoadElements))
+ return false;
+
+ // Holds the corresponding index for each DE-interleave shuffle.
+ SmallVector<unsigned, 4> Indices;
+
+ Type *VecTy = FirstSVI->getType();
+
+ // Check if the other shufflevectors are also DE-interleave shuffles of the
+ // same type and factor as the first shufflevector.
+ for (auto *Shuffle : Shuffles) {
+ if (Shuffle->getType() != VecTy)
+ return false;
+ if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
+ Index))
+ return false;
+
+ assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
+ Indices.push_back(Index);
+ }
+ for (auto *Shuffle : BinOpShuffles) {
+ if (Shuffle->getType() != VecTy)
+ return false;
+ if (!isDeInterleaveMaskOfFactor(Shuffle->getShuffleMask(), Factor,
+ Index))
+ return false;
+
+ assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
+
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+ Indices.push_back(Index);
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+ Indices.push_back(Index);
+ }
+
+ // Try and modify users of the load that are extractelement instructions to
+ // use the shufflevector instructions instead of the load.
+ if (!tryReplaceExtracts(Extracts, Shuffles))
+ return false;
+
+ bool BinOpShuffleChanged =
+ replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI);
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
+
+ // Try to create target specific intrinsics to replace the load and shuffles.
+ if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) {
+ // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+ return !Extracts.empty() || BinOpShuffleChanged;
+ }
+
+ append_range(DeadInsts, Shuffles);
+
+ DeadInsts.push_back(LI);
+ return true;
+}
+
+bool InterleavedAccess::replaceBinOpShuffles(
+ ArrayRef<ShuffleVectorInst *> BinOpShuffles,
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+ for (auto *SVI : BinOpShuffles) {
+ BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
+ Type *BIOp0Ty = BI->getOperand(0)->getType();
+ ArrayRef<int> Mask = SVI->getShuffleMask();
+ assert(all_of(Mask, [&](int Idx) {
+ return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
+ }));
+
+ auto *NewSVI1 =
+ new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
+ Mask, SVI->getName(), SVI);
+ auto *NewSVI2 = new ShuffleVectorInst(
+ BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
+ SVI->getName(), SVI);
+ BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags(
+ BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), SVI);
+ SVI->replaceAllUsesWith(NewBI);
+ LLVM_DEBUG(dbgs() << " Replaced: " << *BI << "\n And : " << *SVI
+ << "\n With : " << *NewSVI1 << "\n And : "
+ << *NewSVI2 << "\n And : " << *NewBI << "\n");
+ RecursivelyDeleteTriviallyDeadInstructions(SVI);
+ if (NewSVI1->getOperand(0) == LI)
+ Shuffles.push_back(NewSVI1);
+ if (NewSVI2->getOperand(0) == LI)
+ Shuffles.push_back(NewSVI2);
+ }
+
+ return !BinOpShuffles.empty();
+}
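+
+// A small IR sketch of the rewrite performed above (values %l, %x and the
+// types are made up for illustration):
+//
+//   %b  = add <8 x i32> %l, %x
+//   %s0 = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+//
+// becomes
+//
+//   %l0 = shufflevector <8 x i32> %l, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+//   %x0 = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+//   %s0 = add <4 x i32> %l0, %x0
+//
+// so that %l is now used directly by DE-interleave shuffles, which the load
+// lowering above can match.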
+
+bool InterleavedAccess::tryReplaceExtracts(
+ ArrayRef<ExtractElementInst *> Extracts,
+ ArrayRef<ShuffleVectorInst *> Shuffles) {
+ // If there aren't any extractelement instructions to modify, there's nothing
+ // to do.
+ if (Extracts.empty())
+ return true;
+
+ // Maps extractelement instructions to vector-index pairs. The extractelement
+ // instructions will be modified to use the new vector and index operands.
+ DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
+
+ for (auto *Extract : Extracts) {
+ // The vector index that is extracted.
+ auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
+ auto Index = IndexOperand->getSExtValue();
+
+ // Look for a suitable shufflevector instruction. The goal is to modify the
+ // extractelement instruction (which uses an interleaved load) to use one
+ // of the shufflevector instructions instead of the load.
+ for (auto *Shuffle : Shuffles) {
+ // If the shufflevector instruction doesn't dominate the extract, we
+ // can't create a use of it.
+ if (!DT->dominates(Shuffle, Extract))
+ continue;
+
+ // Inspect the indices of the shufflevector instruction. If the shuffle
+ // selects the same index that is extracted, we can modify the
+ // extractelement instruction.
+ SmallVector<int, 4> Indices;
+ Shuffle->getShuffleMask(Indices);
+ for (unsigned I = 0; I < Indices.size(); ++I)
+ if (Indices[I] == Index) {
+ assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
+ "Vector operations do not match");
+ ReplacementMap[Extract] = std::make_pair(Shuffle, I);
+ break;
+ }
+
+ // If we found a suitable shufflevector instruction, stop looking.
+ if (ReplacementMap.count(Extract))
+ break;
+ }
+
+ // If we did not find a suitable shufflevector instruction, the
+ // extractelement instruction cannot be modified, so we must give up.
+ if (!ReplacementMap.count(Extract))
+ return false;
+ }
+
+ // Finally, perform the replacements.
+ IRBuilder<> Builder(Extracts[0]->getContext());
+ for (auto &Replacement : ReplacementMap) {
+ auto *Extract = Replacement.first;
+ auto *Vector = Replacement.second.first;
+ auto Index = Replacement.second.second;
+ Builder.SetInsertPoint(Extract);
+ Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
+ Extract->eraseFromParent();
+ }
+
+ return true;
+}
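+
+// A small IR sketch of the replacement above (names are illustrative):
+//
+//   %wide = load <8 x i32>, ptr %p
+//   %v0   = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+//   %e    = extractelement <8 x i32> %wide, i64 2
+//
+// Index 2 of %wide is selected by lane 1 of %v0's mask, so %e is rewritten to
+//
+//   %e    = extractelement <4 x i32> %v0, i64 1
+//
+// and the wide load no longer has extractelement users.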
+
+bool InterleavedAccess::lowerInterleavedStore(
+ StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
+ if (!SI->isSimple())
+ return false;
+
+ auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+ if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
+ return false;
+
+ // Check if the shufflevector is a RE-interleave shuffle.
+ unsigned Factor;
+ if (!isReInterleaveMask(SVI, Factor, MaxFactor))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+
+ // Try to create target specific intrinsics to replace the store and shuffle.
+ if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
+ return false;
+
+ // Already have a new target specific interleaved store. Erase the old store.
+ DeadInsts.push_back(SI);
+ DeadInsts.push_back(SVI);
+ return true;
+}
+
+bool InterleavedAccess::lowerDeinterleaveIntrinsic(
+ IntrinsicInst *DI, SmallVector<Instruction *, 32> &DeadInsts) {
+ LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
+
+ if (!LI || !LI->hasOneUse() || !LI->isSimple())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+
+ // Try and match this with target specific intrinsics.
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ return false;
+
+ // We now have a target-specific load, so delete the old one.
+ DeadInsts.push_back(DI);
+ DeadInsts.push_back(LI);
+ return true;
+}
+
+bool InterleavedAccess::lowerInterleaveIntrinsic(
+ IntrinsicInst *II, SmallVector<Instruction *, 32> &DeadInsts) {
+ if (!II->hasOneUse())
+ return false;
+
+ StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));
+
+ if (!SI || !SI->isSimple())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+
+ // Try and match this with target specific intrinsics.
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ return false;
+
+ // We now have a target-specific store, so delete the old one.
+ DeadInsts.push_back(SI);
+ DeadInsts.push_back(II);
+ return true;
+}
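+
+// The two intrinsic-based patterns handled above look roughly like this in IR
+// (types and names are illustrative):
+//
+//   %l  = load <8 x i32>, ptr %p
+//   %dv = call { <4 x i32>, <4 x i32> }
+//            @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %l)
+//
+//   %iv = call <8 x i32>
+//            @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+//   store <8 x i32> %iv, ptr %p
+//
+// When the target hooks succeed, the load/store plus intrinsic pair is replaced
+// by a target structured-memory instruction (e.g. ld2/st2 on AArch64).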
+
+bool InterleavedAccess::runOnFunction(Function &F) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC || !LowerInterleavedAccesses)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &TM = TPC->getTM<TargetMachine>();
+ TLI = TM.getSubtargetImpl(F)->getTargetLowering();
+ MaxFactor = TLI->getMaxSupportedInterleaveFactor();
+
+ // Holds dead instructions that will be erased later.
+ SmallVector<Instruction *, 32> DeadInsts;
+ bool Changed = false;
+
+ for (auto &I : instructions(F)) {
+ if (auto *LI = dyn_cast<LoadInst>(&I))
+ Changed |= lowerInterleavedLoad(LI, DeadInsts);
+
+ if (auto *SI = dyn_cast<StoreInst>(&I))
+ Changed |= lowerInterleavedStore(SI, DeadInsts);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ // At present, we only have intrinsics to represent (de)interleaving
+ // with a factor of 2.
+ if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
+ Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
+ if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+ Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
+ }
+ }
+
+ for (auto *I : DeadInsts)
+ I->eraseFromParent();
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
new file mode 100644
index 000000000000..d0ad6e45b4d3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -0,0 +1,1363 @@
+//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+//
+// This file defines the interleaved-load-combine pass. The pass searches for
+// ShuffleVectorInst instructions that perform interleaving loads. If a matching
+// pattern is found, it adds a combined load and further instructions in a
+// pattern that is detectable by InterleavedAccessPass. The old instructions are
+// left dead to be removed later. The pass is specifically designed to be
+// executed just before InterleavedAccessPass to find any left-over instances
+// that were not detected by earlier passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <algorithm>
+#include <cassert>
+#include <list>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "interleaved-load-combine"
+
+namespace {
+
+/// Statistic counter
+STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
+
+/// Option to disable the pass
+static cl::opt<bool> DisableInterleavedLoadCombine(
+ "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
+ cl::desc("Disable combining of interleaved loads"));
+
+struct VectorInfo;
+
+struct InterleavedLoadCombineImpl {
+public:
+ InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
+ TargetMachine &TM)
+ : F(F), DT(DT), MSSA(MSSA),
+ TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
+ TTI(TM.getTargetTransformInfo(F)) {}
+
+ /// Scan the function for interleaved load candidates and execute the
+ /// replacement if applicable.
+ bool run();
+
+private:
+ /// Function this pass is working on
+ Function &F;
+
+ /// Dominator Tree Analysis
+ DominatorTree &DT;
+
+ /// Memory Alias Analyses
+ MemorySSA &MSSA;
+
+ /// Target Lowering Information
+ const TargetLowering &TLI;
+
+ /// Target Transform Information
+ const TargetTransformInfo TTI;
+
+ /// Find the instruction in set LIs that dominates all others; return nullptr
+ /// if there is none.
+ LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);
+
+ /// Replace interleaved load candidates. It does additional
+ /// analyses if this makes sense. Returns true on success and false
+ /// if nothing has been changed.
+ bool combine(std::list<VectorInfo> &InterleavedLoad,
+ OptimizationRemarkEmitter &ORE);
+
+ /// Given a set of VectorInfo containing candidates for a given interleave
+ /// factor, find a set that represents a 'factor' interleaved load.
+ bool findPattern(std::list<VectorInfo> &Candidates,
+ std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
+ const DataLayout &DL);
+}; // InterleavedLoadCombine
+
+/// First Order Polynomial on an n-Bit Integer Value
+///
+/// Polynomial(Value) = Value * B + A + E*2^(n-e)
+///
+/// A and B are the coefficients. E*2^(n-e) is an error within the 'e' most
+/// significant bits. It is introduced if an exact computation cannot be proven
+/// (e.g. division by 2).
+///
+/// As part of this optimization multiple loads will be combined. It is
+/// necessary to prove that loads are at a known relative offset to each other.
+/// This class is used to prove relative offsets of values loaded from memory.
+///
+/// Representing an integer in this form is sound since addition in two's
+/// complement is associative (trivial) and multiplication distributes over the
+/// addition (see Proof(1) in Polynomial::mul). Further, both operations
+/// commute.
+//
+// Example:
+// declare @fn(i64 %IDX, <4 x float>* %PTR) {
+// %Pa1 = add i64 %IDX, 2
+// %Pa2 = lshr i64 %Pa1, 1
+// %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
+// %Va = load <4 x float>, <4 x float>* %Pa3
+//
+// %Pb1 = add i64 %IDX, 4
+// %Pb2 = lshr i64 %Pb1, 1
+// %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
+// %Vb = load <4 x float>, <4 x float>* %Pb3
+// ... }
+//
+// The goal is to prove that two loads load consecutive addresses.
+//
+// In this case the polynomials are constructed by the following
+// steps.
+//
+// The number tag #e specifies the error bits.
+//
+// Pa_0 = %IDX #0
+// Pa_1 = %IDX + 2 #0 | add 2
+// Pa_2 = %IDX/2 + 1 #1 | lshr 1
+// Pa_3 = %IDX/2 + 1 #1 | GEP, step signext to i64
+// Pa_4 = (%IDX/2)*16 + 16 #0 | GEP, multiply index by sizeof(4) for floats
+// Pa_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components
+//
+// Pb_0 = %IDX #0
+// Pb_1 = %IDX + 4 #0 | add 4
+// Pb_2 = %IDX/2 + 2 #1 | lshr 1
+// Pb_3 = %IDX/2 + 2 #1 | GEP, step signext to i64
+// Pb_4 = (%IDX/2)*16 + 32 #0 | GEP, multiply index by sizeof(4) for floats
+// Pb_5 = (%IDX/2)*16 + 32 #0 | GEP, add offset of leading components
+//
+// Pb_5 - Pa_5 = 16 #0 | subtract to get the offset
+//
+// Remark: %PTR is not maintained within this class. So in this instance the
+// offset of 16 can only be assumed if the pointers are equal.
+//
+class Polynomial {
+ /// Operations on B
+ enum BOps {
+ LShr,
+ Mul,
+ SExt,
+ Trunc,
+ };
+
+ /// Number of Error Bits e
+ unsigned ErrorMSBs = (unsigned)-1;
+
+ /// Value
+ Value *V = nullptr;
+
+ /// Coefficient B
+ SmallVector<std::pair<BOps, APInt>, 4> B;
+
+ /// Coefficient A
+ APInt A;
+
+public:
+ Polynomial(Value *V) : V(V) {
+ IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
+ if (Ty) {
+ ErrorMSBs = 0;
+ this->V = V;
+ A = APInt(Ty->getBitWidth(), 0);
+ }
+ }
+
+ Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
+ : ErrorMSBs(ErrorMSBs), A(A) {}
+
+ Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
+ : ErrorMSBs(ErrorMSBs), A(BitWidth, A) {}
+
+ Polynomial() = default;
+
+ /// Increment and clamp the number of undefined bits.
+ void incErrorMSBs(unsigned amt) {
+ if (ErrorMSBs == (unsigned)-1)
+ return;
+
+ ErrorMSBs += amt;
+ if (ErrorMSBs > A.getBitWidth())
+ ErrorMSBs = A.getBitWidth();
+ }
+
+ /// Decrement and clamp the number of undefined bits.
+ void decErrorMSBs(unsigned amt) {
+ if (ErrorMSBs == (unsigned)-1)
+ return;
+
+ if (ErrorMSBs > amt)
+ ErrorMSBs -= amt;
+ else
+ ErrorMSBs = 0;
+ }
+
+ /// Apply an add on the polynomial
+ Polynomial &add(const APInt &C) {
+ // Note: Addition is associative in two's complement even in case of
+ // signed overflow.
+ //
+ // Error bits can only propagate into higher significant bits. As these are
+ // already regarded as undefined, there is no change.
+ //
+ // Theorem: Adding a constant to a polynomial does not change the error
+ // term.
+ //
+ // Proof:
+ //
+ // Since the addition is associative and commutes:
+ //
+ // (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e)
+ // [qed]
+
+ if (C.getBitWidth() != A.getBitWidth()) {
+ ErrorMSBs = (unsigned)-1;
+ return *this;
+ }
+
+ A += C;
+ return *this;
+ }
+
+ /// Apply a multiplication onto the polynomial.
+ Polynomial &mul(const APInt &C) {
+ // Note: Multiplication distributes over the addition
+ //
+ // Theorem: Multiplication distributes over the addition
+ //
+ // Proof(1):
+ //
+ // (B+A)*C =
+ // = (B + A) + (B + A) + .. {C Times}
+ // addition is associative and commutes, hence
+ // = B + B + .. {C Times} .. + A + A + .. {C times}
+ // = B*C + A*C
+ // (see (function add) for signed values and overflows)
+ // [qed]
+ //
+ // Theorem: If C has c trailing zeros, error bits in A or B are shifted out
+ // to the left.
+ //
+ // Proof(2):
+ //
+ // Let B' and A' be the n-Bit inputs with some unknown errors EA,
+ // EB at e leading bits. B' and A' can be written down as:
+ //
+ // B' = B + 2^(n-e)*EB
+ // A' = A + 2^(n-e)*EA
+ //
+ // Let C' be an input with c trailing zero bits. C' can be written as
+ //
+ // C' = C*2^c
+ //
+ // Therefore we can compute the result by using distributivity and
+ // commutativity.
+ //
+ // (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' =
+ // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
+ // = (B'+A') * C' =
+ // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
+ // = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' =
+ // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' =
+ // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c =
+ // = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c =
+ //
+ // Let EC be the final error with EC = C*(EB + EA)
+ //
+ // = (B + A)*C' + EC*2^(n-e)*2^c =
+ // = (B + A)*C' + EC*2^(n-(e-c))
+ //
+ // Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
+ // less error bits than the input. c bits are shifted out to the left.
+ // [qed]
+
+ if (C.getBitWidth() != A.getBitWidth()) {
+ ErrorMSBs = (unsigned)-1;
+ return *this;
+ }
+
+ // Multiplying by one is a no-op.
+ if (C.isOne()) {
+ return *this;
+ }
+
+ // Multiplying by zero removes the coefficient B and defines all bits.
+ if (C.isZero()) {
+ ErrorMSBs = 0;
+ deleteB();
+ }
+
+ // See Proof(2): Trailing zero bits indicate a left shift. This removes
+ // leading bits from the result even if they are undefined.
+ decErrorMSBs(C.countr_zero());
+
+ A *= C;
+ pushBOperation(Mul, C);
+ return *this;
+ }
+
+ /// Apply a logical shift right on the polynomial
+ Polynomial &lshr(const APInt &C) {
+ // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e')
+ // where
+ // e' = e + 1,
+ // E is a e-bit number,
+ // E' is a e'-bit number,
+ // holds under the following precondition:
+ // pre(1): A % 2 = 0
+ // pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
+ // where >> expresses a logical shift to the right, with adding zeros.
+ //
+ // We need to show that for every E there is an E'
+ //
+ // B = b_h * 2^(n-1) + b_m * 2 + b_l
+ // A = a_h * 2^(n-1) + a_m * 2 (pre(1))
+ //
+ // where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
+ //
+ // Let X = (B + A + E*2^(n-e)) >> 1
+ // Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
+ //
+ // X = [B + A + E*2^(n-e)] >> 1 =
+ // = [ b_h * 2^(n-1) + b_m * 2 + b_l +
+ // + a_h * 2^(n-1) + a_m * 2 +
+ // + E * 2^(n-e) ] >> 1 =
+ //
+ // The sum is built by putting the overflow of [a_m + b_m] into the term
+ // 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
+ // this bit is discarded. This is expressed by % 2.
+ //
+ // The bit in position 0 cannot overflow into the term (b_m + a_m).
+ //
+ // = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
+ // + ((b_m + a_m) % 2^(n-2)) * 2 +
+ // + b_l + E * 2^(n-e) ] >> 1 =
+ //
+ // The shift is computed by dividing the terms by 2 and by cutting off
+ // b_l.
+ //
+ // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+ // + ((b_m + a_m) % 2^(n-2)) +
+ // + E * 2^(n-(e+1)) =
+ //
+ // by the definition in the Theorem e+1 = e'
+ //
+ // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+ // + ((b_m + a_m) % 2^(n-2)) +
+ // + E * 2^(n-e') =
+ //
+ // Compute Y by applying distributivity first
+ //
+ // Y = (B >> 1) + (A >> 1) + E*2^(n-e') =
+ // = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
+ // + (a_h * 2^(n-1) + a_m * 2) >> 1 +
+ // + E * 2^(n-e) >> 1 =
+ //
+ // Again, the shift is computed by dividing the terms by 2 and by cutting
+ // off b_l.
+ //
+ // = b_h * 2^(n-2) + b_m +
+ // + a_h * 2^(n-2) + a_m +
+ // + E * 2^(n-(e+1)) =
+ //
+ // Again, the sum is built by putting the overflow of [a_m + b_m] into
+ // the term 2^(n-1). But this time there is room for a second bit in the
+ // term 2^(n-2) we add this bit to a new term and denote it o_h in a
+ // second step.
+ //
+ // = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
+ // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+ // + ((b_m + a_m) % 2^(n-2)) +
+ // + E * 2^(n-(e+1)) =
+ //
+ // Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
+ // Further replace e+1 by e'.
+ //
+ // = o_h * 2^(n-1) +
+ // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+ // + ((b_m + a_m) % 2^(n-2)) +
+ // + E * 2^(n-e') =
+ //
+ // Move o_h into the error term and construct E'. To ensure that there is
+ // no 2^x with negative x, this step requires pre(2) (e < n).
+ //
+ // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+ // + ((b_m + a_m) % 2^(n-2)) +
+ // + o_h * 2^(e'-1) * 2^(n-e') + | pre(2), move 2^(e'-1)
+ // | out of the old exponent
+ // + E * 2^(n-e') =
+ // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+ // + ((b_m + a_m) % 2^(n-2)) +
+ // + [o_h * 2^(e'-1) + E] * 2^(n-e') + | move 2^(e'-1) out of
+ // | the old exponent
+ //
+ // Let E' = o_h * 2^(e'-1) + E
+ //
+ // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
+ // + ((b_m + a_m) % 2^(n-2)) +
+ // + E' * 2^(n-e')
+ //
+ // Because X and Y are distinct only in their error terms and E' can be
+ // constructed as shown, the theorem holds.
+ // [qed]
+ //
+ // For completeness, in the case e=n it is also required to show that
+ // distributivity can be applied.
+ //
+ // In this case Theorem(1) transforms to (the pre-condition on A can also be
+ // dropped)
+ //
+ // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
+ // where
+ // A, B, E, E' are two's complement numbers with the same bit
+ // width
+ //
+ // Let A + B + E = X
+ // Let (B >> 1) + (A >> 1) = Y
+ //
+ // Therefore we need to show that for every X and Y there is an E' which
+ // makes the equation
+ //
+ // X = Y + E'
+ //
+ // hold. This is trivially the case for E' = X - Y.
+ //
+ // [qed]
+ //
+ // Remark: Distributing lshr by an arbitrary shift amount n can be expressed
+ // as ((((B + A) lshr 1) lshr 1) ... ) {n times}.
+ // This construction induces n additional error bits at the left.
+
+ if (C.getBitWidth() != A.getBitWidth()) {
+ ErrorMSBs = (unsigned)-1;
+ return *this;
+ }
+
+ if (C.isZero())
+ return *this;
+
+ // Test if the result will be zero
+ unsigned shiftAmt = C.getZExtValue();
+ if (shiftAmt >= C.getBitWidth())
+ return mul(APInt(C.getBitWidth(), 0));
+
+ // The proof that shiftAmt LSBs are zero for at least one summand is only
+ // possible for the constant summand A.
+ //
+ // If this can be proven add shiftAmt to the error counter
+ // `ErrorMSBs`. Otherwise set all bits as undefined.
+ if (A.countr_zero() < shiftAmt)
+ ErrorMSBs = A.getBitWidth();
+ else
+ incErrorMSBs(shiftAmt);
+
+ // Apply the operation.
+ pushBOperation(LShr, C);
+ A = A.lshr(shiftAmt);
+
+ return *this;
+ }
+
+ /// Apply a sign-extend or truncate operation on the polynomial.
+ Polynomial &sextOrTrunc(unsigned n) {
+ if (n < A.getBitWidth()) {
+ // Truncate: Clearly undefined Bits on the MSB side are removed
+ // if there are any.
+ decErrorMSBs(A.getBitWidth() - n);
+ A = A.trunc(n);
+ pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
+ }
+ if (n > A.getBitWidth()) {
+ // Extend: Clearly extending first and adding later is different
+ // to adding first and extending later in all extended bits.
+ incErrorMSBs(n - A.getBitWidth());
+ A = A.sext(n);
+ pushBOperation(SExt, APInt(sizeof(n) * 8, n));
+ }
+
+ return *this;
+ }
+
+ /// Test if there is a coefficient B.
+ bool isFirstOrder() const { return V != nullptr; }
+
+ /// Test whether the coefficients B of two Polynomials are equal.
+ bool isCompatibleTo(const Polynomial &o) const {
+ // The polynomials use different bit widths.
+ if (A.getBitWidth() != o.A.getBitWidth())
+ return false;
+
+ // If neither Polynomial has the Coefficient B.
+ if (!isFirstOrder() && !o.isFirstOrder())
+ return true;
+
+ // The index variable is different.
+ if (V != o.V)
+ return false;
+
+ // Check the operations.
+ if (B.size() != o.B.size())
+ return false;
+
+ auto *ob = o.B.begin();
+ for (const auto &b : B) {
+ if (b != *ob)
+ return false;
+ ob++;
+ }
+
+ return true;
+ }
+
+ /// Subtract two polynomials, return an undefined polynomial if
+ /// subtraction is not possible.
+ Polynomial operator-(const Polynomial &o) const {
+ // Return an undefined polynomial if incompatible.
+ if (!isCompatibleTo(o))
+ return Polynomial();
+
+ // If the polynomials are compatible (meaning they have the same
+ // coefficient on B), B is eliminated. Thus a polynomial solely
+ // containing A is returned
+ return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
+ }
+
+ /// Subtract a constant from a polynomial.
+ Polynomial operator-(uint64_t C) const {
+ Polynomial Result(*this);
+ Result.A -= C;
+ return Result;
+ }
+
+ /// Add a constant to a polynomial.
+ Polynomial operator+(uint64_t C) const {
+ Polynomial Result(*this);
+ Result.A += C;
+ return Result;
+ }
+
+ /// Returns true if it can be proven that two Polynomials are equal.
+ bool isProvenEqualTo(const Polynomial &o) {
+ // Subtract both polynomials and test if it is fully defined and zero.
+ Polynomial r = *this - o;
+ return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
+ }
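+
+ // Worked example (using the Pa/Pb polynomials from the class comment above):
+ // Pa_5 = (%IDX/2)*16 + 16 and Pb_5 = (%IDX/2)*16 + 32 share the same
+ // coefficient B, so (Pb_5 - Pa_5) folds to the constant 16 with zero error
+ // bits, and Pb_5.isProvenEqualTo(Pa_5 + 16) returns true.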
+
+ /// Print the polynomial into a stream.
+ void print(raw_ostream &OS) const {
+ OS << "[{#ErrBits:" << ErrorMSBs << "} ";
+
+ if (V) {
+ for (auto b : B)
+ OS << "(";
+ OS << "(" << *V << ") ";
+
+ for (auto b : B) {
+ switch (b.first) {
+ case LShr:
+ OS << "LShr ";
+ break;
+ case Mul:
+ OS << "Mul ";
+ break;
+ case SExt:
+ OS << "SExt ";
+ break;
+ case Trunc:
+ OS << "Trunc ";
+ break;
+ }
+
+ OS << b.second << ") ";
+ }
+ }
+
+ OS << "+ " << A << "]";
+ }
+
+private:
+ void deleteB() {
+ V = nullptr;
+ B.clear();
+ }
+
+ void pushBOperation(const BOps Op, const APInt &C) {
+ if (isFirstOrder()) {
+ B.push_back(std::make_pair(Op, C));
+ return;
+ }
+ }
+};
+
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
+ S.print(OS);
+ return OS;
+}
+#endif
+
+/// VectorInfo abstracts and stores the following information for each vector
+/// element:
+///
+/// 1) the memory address loaded into the element as a Polynomial,
+/// 2) a set of load instructions necessary to construct the vector,
+/// 3) a set of all other instructions that are necessary to create the vector and
+/// 4) a pointer value that can be used as a relative base for all elements.
+struct VectorInfo {
+private:
+ VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
+ llvm_unreachable(
+ "Copying VectorInfo is neither implemented nor necessary,");
+ }
+
+public:
+ /// Information of a Vector Element
+ struct ElementInfo {
+ /// Offset Polynomial.
+ Polynomial Ofs;
+
+ /// The Load Instruction used to load the entry. LI is null if the pointer
+ /// of the load instruction does not point to the entry.
+ LoadInst *LI;
+
+ ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
+ : Ofs(Offset), LI(LI) {}
+ };
+
+ /// Basic-block the load instructions are within
+ BasicBlock *BB = nullptr;
+
+ /// Pointer value of all participating load instructions
+ Value *PV = nullptr;
+
+ /// Participating load instructions
+ std::set<LoadInst *> LIs;
+
+ /// Participating instructions
+ std::set<Instruction *> Is;
+
+ /// Final shuffle-vector instruction
+ ShuffleVectorInst *SVI = nullptr;
+
+ /// Information of the offset for each vector element
+ ElementInfo *EI;
+
+ /// Vector Type
+ FixedVectorType *const VTy;
+
+ VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
+ EI = new ElementInfo[VTy->getNumElements()];
+ }
+
+ VectorInfo &operator=(const VectorInfo &other) = delete;
+
+ virtual ~VectorInfo() { delete[] EI; }
+
+ unsigned getDimension() const { return VTy->getNumElements(); }
+
+ /// Test if the VectorInfo can be part of an interleaved load with the
+ /// specified factor.
+ ///
+ /// \param Factor of the interleave
+ /// \param DL The target's DataLayout
+ ///
+ /// \returns true if this is possible and false if not
+ bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
+ unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
+ for (unsigned i = 1; i < getDimension(); i++) {
+ if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
+ return false;
+ }
+ }
+ return true;
+ }
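+
+ // For illustration: with float elements (Size = 4) and Factor = 3,
+ // isInterleaved returns true only if EI[1] is proven to sit 12 bytes after
+ // EI[0], EI[2] 24 bytes after EI[0], and EI[3] 36 bytes after EI[0], i.e.
+ // the vector gathers every third float starting at EI[0].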
+
+ /// Recursively computes the vector information stored in V.
+ ///
+ /// This function delegates the work to specialized implementations
+ ///
+ /// \param V Value to operate on
+ /// \param Result Result of the computation
+ ///
+ /// \returns false if no sensible information can be gathered.
+ static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
+ ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
+ if (SVI)
+ return computeFromSVI(SVI, Result, DL);
+ LoadInst *LI = dyn_cast<LoadInst>(V);
+ if (LI)
+ return computeFromLI(LI, Result, DL);
+ BitCastInst *BCI = dyn_cast<BitCastInst>(V);
+ if (BCI)
+ return computeFromBCI(BCI, Result, DL);
+ return false;
+ }
+
+ /// BitCastInst specialization to compute the vector information.
+ ///
+ /// \param BCI BitCastInst to operate on
+ /// \param Result Result of the computation
+ ///
+ /// \returns false if no sensible information can be gathered.
+ static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
+ const DataLayout &DL) {
+ Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
+
+ if (!Op)
+ return false;
+
+ FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
+ if (!VTy)
+ return false;
+
+ // We can only handle casts from vectors with fewer, larger elements to
+ // vectors with more, smaller elements.
+ if (Result.VTy->getNumElements() % VTy->getNumElements())
+ return false;
+
+ unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
+ unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
+ unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());
+
+ if (NewSize * Factor != OldSize)
+ return false;
+
+ VectorInfo Old(VTy);
+ if (!compute(Op, Old, DL))
+ return false;
+
+ for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
+ for (unsigned j = 0; j < Factor; j++) {
+ Result.EI[i + j] =
+ ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
+ j == 0 ? Old.EI[i / Factor].LI : nullptr);
+ }
+ }
+
+ Result.BB = Old.BB;
+ Result.PV = Old.PV;
+ Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
+ Result.Is.insert(Old.Is.begin(), Old.Is.end());
+ Result.Is.insert(BCI);
+ Result.SVI = nullptr;
+
+ return true;
+ }
+
+ /// ShuffleVectorInst specialization to compute vector information.
+ ///
+ /// \param SVI ShuffleVectorInst to operate on
+ /// \param Result Result of the computation
+ ///
+ /// Compute the left and the right side vector information and merge them by
+ /// applying the shuffle operation. This function also ensures that the left
+ /// and right side have compatible loads. This means that all loads are
+ /// within the same basic block and are based on the same pointer.
+ ///
+ /// \returns false if no sensible information can be gathered.
+ static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
+ const DataLayout &DL) {
+ FixedVectorType *ArgTy =
+ cast<FixedVectorType>(SVI->getOperand(0)->getType());
+
+ // Compute the left hand vector information.
+ VectorInfo LHS(ArgTy);
+ if (!compute(SVI->getOperand(0), LHS, DL))
+ LHS.BB = nullptr;
+
+ // Compute the right hand vector information.
+ VectorInfo RHS(ArgTy);
+ if (!compute(SVI->getOperand(1), RHS, DL))
+ RHS.BB = nullptr;
+
+ // Neither operand produced sensible results?
+ if (!LHS.BB && !RHS.BB)
+ return false;
+ // Only RHS produced sensible results?
+ else if (!LHS.BB) {
+ Result.BB = RHS.BB;
+ Result.PV = RHS.PV;
+ }
+ // Only LHS produced sensible results?
+ else if (!RHS.BB) {
+ Result.BB = LHS.BB;
+ Result.PV = LHS.PV;
+ }
+ // Both operands produced sensible results?
+ else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
+ Result.BB = LHS.BB;
+ Result.PV = LHS.PV;
+ }
+ // Both operands produced sensible results but they are incompatible.
+ else {
+ return false;
+ }
+
+ // Merge and apply the operation on the offset information.
+ if (LHS.BB) {
+ Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
+ Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
+ }
+ if (RHS.BB) {
+ Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
+ Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
+ }
+ Result.Is.insert(SVI);
+ Result.SVI = SVI;
+
+ int j = 0;
+ for (int i : SVI->getShuffleMask()) {
+ assert((i < 2 * (signed)ArgTy->getNumElements()) &&
+ "Invalid ShuffleVectorInst (index out of bounds)");
+
+ if (i < 0)
+ Result.EI[j] = ElementInfo();
+ else if (i < (signed)ArgTy->getNumElements()) {
+ if (LHS.BB)
+ Result.EI[j] = LHS.EI[i];
+ else
+ Result.EI[j] = ElementInfo();
+ } else {
+ if (RHS.BB)
+ Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
+ else
+ Result.EI[j] = ElementInfo();
+ }
+ j++;
+ }
+
+ return true;
+ }
+
+ /// LoadInst specialization to compute vector information.
+ ///
+ /// This function also acts as abort condition to the recursion.
+ ///
+ /// \param LI LoadInst to operate on
+ /// \param Result Result of the computation
+ ///
+ /// \returns false if no sensible information can be gathered.
+ static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
+ const DataLayout &DL) {
+ Value *BasePtr;
+ Polynomial Offset;
+
+ if (LI->isVolatile())
+ return false;
+
+ if (LI->isAtomic())
+ return false;
+
+ // Get the base polynomial
+ computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
+
+ Result.BB = LI->getParent();
+ Result.PV = BasePtr;
+ Result.LIs.insert(LI);
+ Result.Is.insert(LI);
+
+ for (unsigned i = 0; i < Result.getDimension(); i++) {
+ Value *Idx[2] = {
+ ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
+ ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
+ };
+ int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, ArrayRef(Idx, 2));
+ Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
+ }
+
+ return true;
+ }
+
+ /// Recursively compute polynomial of a value.
+ ///
+ /// \param BO Input binary operation
+ /// \param Result Result polynomial
+ static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
+ Value *LHS = BO.getOperand(0);
+ Value *RHS = BO.getOperand(1);
+
+ // Find the RHS Constant if any
+ ConstantInt *C = dyn_cast<ConstantInt>(RHS);
+ if ((!C) && BO.isCommutative()) {
+ C = dyn_cast<ConstantInt>(LHS);
+ if (C)
+ std::swap(LHS, RHS);
+ }
+
+ switch (BO.getOpcode()) {
+ case Instruction::Add:
+ if (!C)
+ break;
+
+ computePolynomial(*LHS, Result);
+ Result.add(C->getValue());
+ return;
+
+ case Instruction::LShr:
+ if (!C)
+ break;
+
+ computePolynomial(*LHS, Result);
+ Result.lshr(C->getValue());
+ return;
+
+ default:
+ break;
+ }
+
+ Result = Polynomial(&BO);
+ }
+
+ /// Recursively compute polynomial of a value
+ ///
+ /// \param V input value
+ /// \param Result result polynomial
+ static void computePolynomial(Value &V, Polynomial &Result) {
+ if (auto *BO = dyn_cast<BinaryOperator>(&V))
+ computePolynomialBinOp(*BO, Result);
+ else
+ Result = Polynomial(&V);
+ }
+
+ /// Compute the Polynomial representation of a Pointer type.
+ ///
+ /// \param Ptr input pointer value
+ /// \param Result result polynomial
+ /// \param BasePtr pointer the polynomial is based on
+ /// \param DL Datalayout of the target machine
+ static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
+ Value *&BasePtr,
+ const DataLayout &DL) {
+ // Not a pointer type? Return an undefined polynomial
+ PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
+ if (!PtrTy) {
+ Result = Polynomial();
+ BasePtr = nullptr;
+ return;
+ }
+ unsigned PointerBits =
+ DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
+
+ // Look through bitcasts; for any other cast, use Ptr itself with a zero polynomial.
+ if (isa<CastInst>(&Ptr)) {
+ CastInst &CI = *cast<CastInst>(&Ptr);
+ switch (CI.getOpcode()) {
+ case Instruction::BitCast:
+ computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
+ break;
+ default:
+ BasePtr = &Ptr;
+ Result = Polynomial(PointerBits, 0);
+ break;
+ }
+ }
+ // Resolve GetElementPtrInst.
+ else if (isa<GetElementPtrInst>(&Ptr)) {
+ GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
+
+ APInt BaseOffset(PointerBits, 0);
+
+ // Check if we can compute the Offset with accumulateConstantOffset
+ if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
+ Result = Polynomial(BaseOffset);
+ BasePtr = GEP.getPointerOperand();
+ return;
+ } else {
+ // Otherwise we allow that the last index operand of the GEP is
+ // non-constant.
+ unsigned idxOperand, e;
+ SmallVector<Value *, 4> Indices;
+ for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
+ idxOperand++) {
+ ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
+ if (!IDX)
+ break;
+ Indices.push_back(IDX);
+ }
+
+ // It must also be the last operand.
+ if (idxOperand + 1 != e) {
+ Result = Polynomial();
+ BasePtr = nullptr;
+ return;
+ }
+
+ // Compute the polynomial of the index operand.
+ computePolynomial(*GEP.getOperand(idxOperand), Result);
+
+ // Compute base offset from zero based index, excluding the last
+ // variable operand.
+ BaseOffset =
+ DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
+
+ // Apply the operations of GEP to the polynomial.
+ unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
+ Result.sextOrTrunc(PointerBits);
+ Result.mul(APInt(PointerBits, ResultSize));
+ Result.add(BaseOffset);
+ BasePtr = GEP.getPointerOperand();
+ }
+ }
+ // All other instructions are handled by using the value as base pointer and
+ // a zero polynomial.
+ else {
+ BasePtr = &Ptr;
+ Result = Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
+ }
+ }
+
+#ifndef NDEBUG
+ void print(raw_ostream &OS) const {
+ if (PV)
+ OS << *PV;
+ else
+ OS << "(none)";
+ OS << " + ";
+ for (unsigned i = 0; i < getDimension(); i++)
+ OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
+ OS << "]";
+ }
+#endif
+};
+
+} // anonymous namespace
+
+bool InterleavedLoadCombineImpl::findPattern(
+ std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
+ unsigned Factor, const DataLayout &DL) {
+ for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
+ unsigned i;
+ // Try to find an interleaved load using the current candidate C0 as its first line.
+ unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());
+
+ // List containing iterators pointing to the VectorInfos of the candidates
+ std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
+
+ for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
+ if (C->VTy != C0->VTy)
+ continue;
+ if (C->BB != C0->BB)
+ continue;
+ if (C->PV != C0->PV)
+ continue;
+
+ // Check whether the current candidate matches any of the Factor - 1 remaining lines.
+ for (i = 1; i < Factor; i++) {
+ if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
+ Res[i] = C;
+ }
+ }
+
+ for (i = 1; i < Factor; i++) {
+ if (Res[i] == Candidates.end())
+ break;
+ }
+ if (i == Factor) {
+ Res[0] = C0;
+ break;
+ }
+ }
+
+ if (Res[0] != Candidates.end()) {
+ // Move the result into the output
+ for (unsigned i = 0; i < Factor; i++) {
+ InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
+ }
+
+ return true;
+ }
+ }
+ return false;
+}
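+
+// For illustration, with Factor = 3 and 4-byte elements (Size = 4) the loop
+// above looks for two further candidates whose first-element offsets are
+// proven to be C0->EI[0].Ofs + 4 and C0->EI[0].Ofs + 8; together with C0 they
+// form the three "lines" of one interleaved load and are spliced into
+// InterleavedLoad in offset order (Res[0], Res[1], Res[2]).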
+
+LoadInst *
+InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
+ assert(!LIs.empty() && "No load instructions given.");
+
+ // All LIs are within the same BB. Select the first for a reference.
+ BasicBlock *BB = (*LIs.begin())->getParent();
+ BasicBlock::iterator FLI = llvm::find_if(
+ *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); });
+ assert(FLI != BB->end());
+
+ return cast<LoadInst>(FLI);
+}
+
+bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
+ OptimizationRemarkEmitter &ORE) {
+ LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
+
+ // The insertion point is the LoadInst which loads the first values. The
+ // following tests are used to prove that the combined load can be inserted
+ // just before InsertionPoint.
+ LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
+
+ // Test if the offset is computed
+ if (!InsertionPoint)
+ return false;
+
+ std::set<LoadInst *> LIs;
+ std::set<Instruction *> Is;
+ std::set<Instruction *> SVIs;
+
+ InstructionCost InterleavedCost;
+ InstructionCost InstructionCost = 0;
+ const TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency;
+
+ // Get the interleave factor
+ unsigned Factor = InterleavedLoad.size();
+
+ // Merge all input sets used in analysis
+ for (auto &VI : InterleavedLoad) {
+ // Generate a set of all load instructions to be combined
+ LIs.insert(VI.LIs.begin(), VI.LIs.end());
+
+ // Generate a set of all instructions taking part in the interleaved
+ // load. This set excludes the instructions necessary for the
+ // polynomial construction.
+ Is.insert(VI.Is.begin(), VI.Is.end());
+
+ // Generate the set of the final ShuffleVectorInst.
+ SVIs.insert(VI.SVI);
+ }
+
+ // There is nothing to combine.
+ if (LIs.size() < 2)
+ return false;
+
+ // Test if all participating instructions will be dead after the
+ // transformation. If intermediate results are used, no performance gain can
+ // be expected. Also sum the cost of the instructions being left dead.
+ for (const auto &I : Is) {
+ // Compute the old cost
+ InstructionCost += TTI.getInstructionCost(I, CostKind);
+
+ // The final SVIs are allowed not to be dead, all uses will be replaced
+ if (SVIs.find(I) != SVIs.end())
+ continue;
+
+ // If there are users outside the set to be eliminated, we abort the
+ // transformation. No gain can be expected.
+ for (auto *U : I->users()) {
+ if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
+ return false;
+ }
+ }
+
+ // We need to have a valid cost in order to proceed.
+ if (!InstructionCost.isValid())
+ return false;
+
+ // We know that all LoadInst are within the same BB. This guarantees that
+ // either everything or nothing is loaded.
+ LoadInst *First = findFirstLoad(LIs);
+
+ // To be sure that the loads can be combined, iterate over all loads and test
+ // that the corresponding defining access dominates the first LI. This
+ // guarantees that there are no aliasing stores in between the loads.
+ auto FMA = MSSA.getMemoryAccess(First);
+ for (auto *LI : LIs) {
+ auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
+ if (!MSSA.dominates(MADef, FMA))
+ return false;
+ }
+ assert(!LIs.empty() && "There are no LoadInst to combine");
+
+ // It is necessary that insertion point dominates all final ShuffleVectorInst.
+ for (auto &VI : InterleavedLoad) {
+ if (!DT.dominates(InsertionPoint, VI.SVI))
+ return false;
+ }
+
+ // All checks are done. Add instructions detectable by InterleavedAccessPass.
+ // The old instructions are left dead.
+ IRBuilder<> Builder(InsertionPoint);
+ Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
+ unsigned ElementsPerSVI =
+ cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
+ ->getNumElements();
+ FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);
+
+ auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
+ InterleavedCost = TTI.getInterleavedMemoryOpCost(
+ Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
+ InsertionPoint->getPointerAddressSpace(), CostKind);
+
+ if (InterleavedCost >= InstructionCost) {
+ return false;
+ }
+
+ // Create a pointer cast for the wide load.
+ auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0),
+ ILTy->getPointerTo(),
+ "interleaved.wide.ptrcast");
+
+ // Create the wide load and update the MemorySSA.
+ auto LI = Builder.CreateAlignedLoad(ILTy, CI, InsertionPoint->getAlign(),
+ "interleaved.wide.load");
+ auto MSSAU = MemorySSAUpdater(&MSSA);
+ MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
+ LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
+ MSSAU.insertUse(MSSALoad, /*RenameUses=*/ true);
+
+ // Create the final SVIs and replace all uses.
+ int i = 0;
+ for (auto &VI : InterleavedLoad) {
+ SmallVector<int, 4> Mask;
+ for (unsigned j = 0; j < ElementsPerSVI; j++)
+ Mask.push_back(i + j * Factor);
+
+ Builder.SetInsertPoint(VI.SVI);
+ auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");
+ VI.SVI->replaceAllUsesWith(SVI);
+ i++;
+ }
+
+ NumInterleavedLoadCombine++;
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
+ << "Load interleaved combined with factor "
+ << ore::NV("Factor", Factor);
+ });
+
+ return true;
+}
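+
+// A rough sketch of the IR emitted above for Factor = 2 with <4 x i32> results
+// (the pointer %base, the element type and the alignment are illustrative):
+//
+//   %interleaved.wide.load = load <8 x i32>, ptr %base, align 4
+//   %interleaved.shuffle   = shufflevector <8 x i32> %interleaved.wide.load,
+//                              <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+//   %interleaved.shuffle1  = shufflevector <8 x i32> %interleaved.wide.load,
+//                              <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+//
+// which is exactly the DE-interleave pattern that InterleavedAccessPass
+// recognizes and lowers to a target intrinsic.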
+
+bool InterleavedLoadCombineImpl::run() {
+ OptimizationRemarkEmitter ORE(&F);
+ bool changed = false;
+ unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
+
+ auto &DL = F.getParent()->getDataLayout();
+
+ // Start with the highest factor to avoid combining and recombining.
+ for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
+ std::list<VectorInfo> Candidates;
+
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
+ // We don't support scalable vectors in this pass.
+ if (isa<ScalableVectorType>(SVI->getType()))
+ continue;
+
+ Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));
+
+ if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
+ Candidates.pop_back();
+ continue;
+ }
+
+ if (!Candidates.back().isInterleaved(Factor, DL)) {
+ Candidates.pop_back();
+ }
+ }
+ }
+ }
+
+ std::list<VectorInfo> InterleavedLoad;
+ while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
+ if (combine(InterleavedLoad, ORE)) {
+ changed = true;
+ } else {
+ // Remove the first element of the Interleaved Load but put the others
+ // back on the list and continue searching
+ Candidates.splice(Candidates.begin(), InterleavedLoad,
+ std::next(InterleavedLoad.begin()),
+ InterleavedLoad.end());
+ }
+ InterleavedLoad.clear();
+ }
+ }
+
+ return changed;
+}
+
+namespace {
+/// This pass combines interleaved loads into a pattern detectable by
+/// InterleavedAccessPass.
+struct InterleavedLoadCombine : public FunctionPass {
+ static char ID;
+
+ InterleavedLoadCombine() : FunctionPass(ID) {
+ initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Interleaved Load Combine Pass";
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (DisableInterleavedLoadCombine)
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
+ << "\n");
+
+ return InterleavedLoadCombineImpl(
+ F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ TPC->getTM<TargetMachine>())
+ .run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+};
+} // anonymous namespace
+
+char InterleavedLoadCombine::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+ InterleavedLoadCombine, DEBUG_TYPE,
+ "Combine interleaved loads into wide loads and shufflevector instructions",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(
+ InterleavedLoadCombine, DEBUG_TYPE,
+ "Combine interleaved loads into wide loads and shufflevector instructions",
+ false, false)
+
+FunctionPass *llvm::createInterleavedLoadCombinePass() {
+  return new InterleavedLoadCombine();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/IntrinsicLowering.cpp
new file mode 100644
index 000000000000..61920a0e04ab
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -0,0 +1,474 @@
+//===-- IntrinsicLowering.cpp - Intrinsic Lowering default implementation -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IntrinsicLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// This function is used when we want to lower an intrinsic call to a call of
+/// an external function. This handles hard cases such as when there was already
+/// a prototype for the external function, but that prototype doesn't match the
+/// arguments we expect to pass in.
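+/// The new call takes over the original call's name and all of its uses; the
+/// caller is still responsible for erasing the original instruction.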
+template <class ArgIt>
+static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI,
+ ArgIt ArgBegin, ArgIt ArgEnd,
+ Type *RetTy) {
+  Module *M = CI->getModule();
+  // Check whether the program already contains a function with this name and
+  // get or insert the declaration now.
+ std::vector<Type *> ParamTys;
+ for (ArgIt I = ArgBegin; I != ArgEnd; ++I)
+ ParamTys.push_back((*I)->getType());
+ FunctionCallee FCache =
+ M->getOrInsertFunction(NewFn, FunctionType::get(RetTy, ParamTys, false));
+
+ IRBuilder<> Builder(CI->getParent(), CI->getIterator());
+ SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+ CallInst *NewCI = Builder.CreateCall(FCache, Args);
+ NewCI->setName(CI->getName());
+ if (!CI->use_empty())
+ CI->replaceAllUsesWith(NewCI);
+ return NewCI;
+}
+
+/// Emit the code to lower bswap of V before the specified instruction IP.
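+/// For example, bswap of the i32 value 0x11223344 yields 0x44332211; the
+/// expansion below builds the result from shift, mask, and or operations.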
+static Value *LowerBSWAP(LLVMContext &Context, Value *V, Instruction *IP) {
+ assert(V->getType()->isIntOrIntVectorTy() && "Can't bswap a non-integer type!");
+
+ unsigned BitSize = V->getType()->getScalarSizeInBits();
+
+ IRBuilder<> Builder(IP);
+
+  switch (BitSize) {
+ default: llvm_unreachable("Unhandled type size of value to byteswap!");
+ case 16: {
+ Value *Tmp1 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.2");
+ Value *Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.1");
+ V = Builder.CreateOr(Tmp1, Tmp2, "bswap.i16");
+ break;
+ }
+ case 32: {
+ Value *Tmp4 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 24),
+ "bswap.4");
+ Value *Tmp3 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.3");
+ Value *Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.2");
+    Value *Tmp1 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 24),
+                                     "bswap.1");
+ Tmp3 = Builder.CreateAnd(Tmp3,
+ ConstantInt::get(V->getType(), 0xFF0000),
+ "bswap.and3");
+ Tmp2 = Builder.CreateAnd(Tmp2,
+ ConstantInt::get(V->getType(), 0xFF00),
+ "bswap.and2");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp3, "bswap.or1");
+ Tmp2 = Builder.CreateOr(Tmp2, Tmp1, "bswap.or2");
+ V = Builder.CreateOr(Tmp4, Tmp2, "bswap.i32");
+ break;
+ }
+ case 64: {
+ Value *Tmp8 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 56),
+ "bswap.8");
+ Value *Tmp7 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 40),
+ "bswap.7");
+ Value *Tmp6 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 24),
+ "bswap.6");
+ Value *Tmp5 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.5");
+ Value* Tmp4 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.4");
+ Value* Tmp3 = Builder.CreateLShr(V,
+ ConstantInt::get(V->getType(), 24),
+ "bswap.3");
+ Value* Tmp2 = Builder.CreateLShr(V,
+ ConstantInt::get(V->getType(), 40),
+ "bswap.2");
+ Value* Tmp1 = Builder.CreateLShr(V,
+ ConstantInt::get(V->getType(), 56),
+ "bswap.1");
+ Tmp7 = Builder.CreateAnd(Tmp7,
+ ConstantInt::get(V->getType(),
+ 0xFF000000000000ULL),
+ "bswap.and7");
+ Tmp6 = Builder.CreateAnd(Tmp6,
+ ConstantInt::get(V->getType(),
+ 0xFF0000000000ULL),
+ "bswap.and6");
+ Tmp5 = Builder.CreateAnd(Tmp5,
+ ConstantInt::get(V->getType(),
+ 0xFF00000000ULL),
+ "bswap.and5");
+ Tmp4 = Builder.CreateAnd(Tmp4,
+ ConstantInt::get(V->getType(),
+ 0xFF000000ULL),
+ "bswap.and4");
+ Tmp3 = Builder.CreateAnd(Tmp3,
+ ConstantInt::get(V->getType(),
+ 0xFF0000ULL),
+ "bswap.and3");
+ Tmp2 = Builder.CreateAnd(Tmp2,
+ ConstantInt::get(V->getType(),
+ 0xFF00ULL),
+ "bswap.and2");
+ Tmp8 = Builder.CreateOr(Tmp8, Tmp7, "bswap.or1");
+ Tmp6 = Builder.CreateOr(Tmp6, Tmp5, "bswap.or2");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp3, "bswap.or3");
+ Tmp2 = Builder.CreateOr(Tmp2, Tmp1, "bswap.or4");
+ Tmp8 = Builder.CreateOr(Tmp8, Tmp6, "bswap.or5");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp2, "bswap.or6");
+ V = Builder.CreateOr(Tmp8, Tmp4, "bswap.i64");
+ break;
+ }
+ }
+ return V;
+}
+
+/// Emit the code to lower ctpop of V before the specified instruction IP.
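+/// The expansion uses the classic parallel (SWAR) popcount: adjacent bit pairs
+/// are summed under the 0x55... mask, then nibbles under 0x33..., and so on,
+/// handling at most 64 bits per iteration of the outer loop below.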
+static Value *LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP) {
+ assert(V->getType()->isIntegerTy() && "Can't ctpop a non-integer type!");
+
+ static const uint64_t MaskValues[6] = {
+ 0x5555555555555555ULL, 0x3333333333333333ULL,
+ 0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
+ 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
+ };
+
+ IRBuilder<> Builder(IP);
+
+ unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+ unsigned WordSize = (BitSize + 63) / 64;
+ Value *Count = ConstantInt::get(V->getType(), 0);
+
+ for (unsigned n = 0; n < WordSize; ++n) {
+ Value *PartValue = V;
+ for (unsigned i = 1, ct = 0; i < (BitSize>64 ? 64 : BitSize);
+ i <<= 1, ++ct) {
+ Value *MaskCst = ConstantInt::get(V->getType(), MaskValues[ct]);
+      Value *LHS = Builder.CreateAnd(PartValue, MaskCst, "ctpop.and1");
+ Value *VShift = Builder.CreateLShr(PartValue,
+ ConstantInt::get(V->getType(), i),
+ "ctpop.sh");
+      Value *RHS = Builder.CreateAnd(VShift, MaskCst, "ctpop.and2");
+ PartValue = Builder.CreateAdd(LHS, RHS, "ctpop.step");
+ }
+ Count = Builder.CreateAdd(PartValue, Count, "ctpop.part");
+ if (BitSize > 64) {
+ V = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 64),
+ "ctpop.part.sh");
+ BitSize -= 64;
+ }
+ }
+
+ return Count;
+}
+
+/// Emit the code to lower ctlz of V before the specified instruction IP.
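+/// The shifts below smear the highest set bit into every lower position, so
+/// the bitwise-not has ones exactly in the leading-zero positions and its
+/// population count equals the ctlz; e.g. ctlz(i32 0x00F00000) == 8.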
+static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) {
+
+ IRBuilder<> Builder(IP);
+
+ unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+ for (unsigned i = 1; i < BitSize; i <<= 1) {
+ Value *ShVal = ConstantInt::get(V->getType(), i);
+ ShVal = Builder.CreateLShr(V, ShVal, "ctlz.sh");
+ V = Builder.CreateOr(V, ShVal, "ctlz.step");
+ }
+
+ V = Builder.CreateNot(V);
+ return LowerCTPOP(Context, V, IP);
+}
+
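+/// Replace a floating-point intrinsic call with a call to the corresponding
+/// libm function, selected by the type of the first operand; e.g. an f32
+/// llvm.sqrt call is lowered to a call to sqrtf.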
+static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
+ const char *Dname,
+ const char *LDname) {
+ switch (CI->getArgOperand(0)->getType()->getTypeID()) {
+ default: llvm_unreachable("Invalid type in intrinsic");
+ case Type::FloatTyID:
+ ReplaceCallWith(Fname, CI, CI->arg_begin(), CI->arg_end(),
+ Type::getFloatTy(CI->getContext()));
+ break;
+ case Type::DoubleTyID:
+ ReplaceCallWith(Dname, CI, CI->arg_begin(), CI->arg_end(),
+ Type::getDoubleTy(CI->getContext()));
+ break;
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ ReplaceCallWith(LDname, CI, CI->arg_begin(), CI->arg_end(),
+ CI->getArgOperand(0)->getType());
+ break;
+ }
+}
+
+void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
+ IRBuilder<> Builder(CI);
+ LLVMContext &Context = CI->getContext();
+
+ const Function *Callee = CI->getCalledFunction();
+ assert(Callee && "Cannot lower an indirect call!");
+
+ switch (Callee->getIntrinsicID()) {
+ case Intrinsic::not_intrinsic:
+ report_fatal_error("Cannot lower a call to a non-intrinsic function '"+
+ Callee->getName() + "'!");
+ default:
+ report_fatal_error("Code generator does not support intrinsic function '"+
+ Callee->getName()+"'!");
+
+ case Intrinsic::expect: {
+ // Just replace __builtin_expect(exp, c) with EXP.
+ Value *V = CI->getArgOperand(0);
+ CI->replaceAllUsesWith(V);
+ break;
+ }
+
+ case Intrinsic::ctpop:
+ CI->replaceAllUsesWith(LowerCTPOP(Context, CI->getArgOperand(0), CI));
+ break;
+
+ case Intrinsic::bswap:
+ CI->replaceAllUsesWith(LowerBSWAP(Context, CI->getArgOperand(0), CI));
+ break;
+
+ case Intrinsic::ctlz:
+ CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getArgOperand(0), CI));
+ break;
+
+ case Intrinsic::cttz: {
+ // cttz(x) -> ctpop(~X & (X-1))
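+    // For example, for X == 0b01001000, ~X & (X - 1) == 0b00000111, whose
+    // population count is 3 == cttz(X).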
+ Value *Src = CI->getArgOperand(0);
+ Value *NotSrc = Builder.CreateNot(Src);
+ NotSrc->setName(Src->getName() + ".not");
+ Value *SrcM1 = ConstantInt::get(Src->getType(), 1);
+ SrcM1 = Builder.CreateSub(Src, SrcM1);
+ Src = LowerCTPOP(Context, Builder.CreateAnd(NotSrc, SrcM1), CI);
+ CI->replaceAllUsesWith(Src);
+ break;
+ }
+
+ case Intrinsic::stacksave:
+ case Intrinsic::stackrestore: {
+ if (!Warned)
+ errs() << "WARNING: this target does not support the llvm.stack"
+ << (Callee->getIntrinsicID() == Intrinsic::stacksave ?
+ "save" : "restore") << " intrinsic.\n";
+ Warned = true;
+ if (Callee->getIntrinsicID() == Intrinsic::stacksave)
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ break;
+ }
+
+ case Intrinsic::get_dynamic_area_offset:
+ errs() << "WARNING: this target does not support the custom llvm.get."
+ "dynamic.area.offset. It is being lowered to a constant 0\n";
+ // Just lower it to a constant 0 because for most targets
+ // @llvm.get.dynamic.area.offset is lowered to zero.
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0));
+ break;
+ case Intrinsic::returnaddress:
+ case Intrinsic::frameaddress:
+ errs() << "WARNING: this target does not support the llvm."
+ << (Callee->getIntrinsicID() == Intrinsic::returnaddress ?
+ "return" : "frame") << "address intrinsic.\n";
+ CI->replaceAllUsesWith(
+ ConstantPointerNull::get(cast<PointerType>(CI->getType())));
+ break;
+ case Intrinsic::addressofreturnaddress:
+ errs() << "WARNING: this target does not support the "
+ "llvm.addressofreturnaddress intrinsic.\n";
+ CI->replaceAllUsesWith(
+ ConstantPointerNull::get(cast<PointerType>(CI->getType())));
+ break;
+
+ case Intrinsic::prefetch:
+ break; // Simply strip out prefetches on unsupported architectures
+
+ case Intrinsic::pcmarker:
+ break; // Simply strip out pcmarker on unsupported architectures
+ case Intrinsic::readcyclecounter: {
+ errs() << "WARNING: this target does not support the llvm.readcyclecoun"
+ << "ter intrinsic. It is being lowered to a constant 0\n";
+ CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0));
+ break;
+ }
+
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_label:
+ break; // Simply strip out debugging intrinsics
+
+ case Intrinsic::eh_typeid_for:
+ // Return something different to eh_selector.
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
+ break;
+
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ // Just drop the annotation, but forward the value
+ CI->replaceAllUsesWith(CI->getOperand(0));
+ break;
+
+ case Intrinsic::assume:
+ case Intrinsic::experimental_noalias_scope_decl:
+ case Intrinsic::var_annotation:
+ break; // Strip out these intrinsics
+
+ case Intrinsic::memcpy: {
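+    // Lower to a plain libc memcpy call; the length operand is canonicalized
+    // to the target's pointer-sized integer type and the volatile flag is
+    // dropped.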
+ Type *IntPtr = DL.getIntPtrType(Context);
+ Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = CI->getArgOperand(0);
+ Ops[1] = CI->getArgOperand(1);
+ Ops[2] = Size;
+ ReplaceCallWith("memcpy", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+ break;
+ }
+ case Intrinsic::memmove: {
+ Type *IntPtr = DL.getIntPtrType(Context);
+ Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = CI->getArgOperand(0);
+ Ops[1] = CI->getArgOperand(1);
+ Ops[2] = Size;
+ ReplaceCallWith("memmove", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+ break;
+ }
+ case Intrinsic::memset: {
+ Value *Op0 = CI->getArgOperand(0);
+ Type *IntPtr = DL.getIntPtrType(Op0->getType());
+ Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = Op0;
+    // Extend the i8 value operand to i32 to match libc memset's int parameter.
+ Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1),
+ Type::getInt32Ty(Context),
+ /* isSigned */ false);
+ Ops[2] = Size;
+ ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+ break;
+ }
+ case Intrinsic::sqrt: {
+ ReplaceFPIntrinsicWithCall(CI, "sqrtf", "sqrt", "sqrtl");
+ break;
+ }
+ case Intrinsic::log: {
+ ReplaceFPIntrinsicWithCall(CI, "logf", "log", "logl");
+ break;
+ }
+ case Intrinsic::log2: {
+ ReplaceFPIntrinsicWithCall(CI, "log2f", "log2", "log2l");
+ break;
+ }
+ case Intrinsic::log10: {
+ ReplaceFPIntrinsicWithCall(CI, "log10f", "log10", "log10l");
+ break;
+ }
+ case Intrinsic::exp: {
+ ReplaceFPIntrinsicWithCall(CI, "expf", "exp", "expl");
+ break;
+ }
+ case Intrinsic::exp2: {
+ ReplaceFPIntrinsicWithCall(CI, "exp2f", "exp2", "exp2l");
+ break;
+ }
+ case Intrinsic::pow: {
+ ReplaceFPIntrinsicWithCall(CI, "powf", "pow", "powl");
+ break;
+ }
+ case Intrinsic::sin: {
+ ReplaceFPIntrinsicWithCall(CI, "sinf", "sin", "sinl");
+ break;
+ }
+ case Intrinsic::cos: {
+ ReplaceFPIntrinsicWithCall(CI, "cosf", "cos", "cosl");
+ break;
+ }
+ case Intrinsic::floor: {
+ ReplaceFPIntrinsicWithCall(CI, "floorf", "floor", "floorl");
+ break;
+ }
+ case Intrinsic::ceil: {
+ ReplaceFPIntrinsicWithCall(CI, "ceilf", "ceil", "ceill");
+ break;
+ }
+ case Intrinsic::trunc: {
+ ReplaceFPIntrinsicWithCall(CI, "truncf", "trunc", "truncl");
+ break;
+ }
+ case Intrinsic::round: {
+ ReplaceFPIntrinsicWithCall(CI, "roundf", "round", "roundl");
+ break;
+ }
+ case Intrinsic::roundeven: {
+ ReplaceFPIntrinsicWithCall(CI, "roundevenf", "roundeven", "roundevenl");
+ break;
+ }
+ case Intrinsic::copysign: {
+ ReplaceFPIntrinsicWithCall(CI, "copysignf", "copysign", "copysignl");
+ break;
+ }
+ case Intrinsic::get_rounding:
+ // Lower to "round to the nearest"
+ if (!CI->getType()->isVoidTy())
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
+ break;
+ case Intrinsic::invariant_start:
+ case Intrinsic::lifetime_start:
+ // Discard region information.
+ CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+ break;
+ case Intrinsic::invariant_end:
+ case Intrinsic::lifetime_end:
+ // Discard region information.
+ break;
+ }
+
+ assert(CI->use_empty() &&
+ "Lowering should have eliminated any uses of the intrinsic call!");
+ CI->eraseFromParent();
+}
+
+bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) {
+ // Verify this is a simple bswap.
+ if (CI->arg_size() != 1 || CI->getType() != CI->getArgOperand(0)->getType() ||
+ !CI->getType()->isIntegerTy())
+ return false;
+
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty)
+ return false;
+
+ // Okay, we can do this xform, do so now.
+ Module *M = CI->getModule();
+ Function *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty);
+
+ Value *Op = CI->getArgOperand(0);
+ Op = CallInst::Create(Int, Op, CI->getName(), CI);
+
+ CI->replaceAllUsesWith(Op);
+ CI->eraseFromParent();
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/JMCInstrumenter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/JMCInstrumenter.cpp
new file mode 100644
index 000000000000..f1953c363b59
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/JMCInstrumenter.cpp
@@ -0,0 +1,233 @@
+//===- JMCInstrumenter.cpp - JMC Instrumentation --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// JMCInstrumenter pass:
+// - instrument each function with a call to __CheckForDebuggerJustMyCode. The
+//   sole argument is a per-file flag defined in the .msvcjmc section; each
+//   flag is 1 byte, initialized to 1.
+// - create the dummy COMDAT function __JustMyCode_Default to prevent a link
+//   error if __CheckForDebuggerJustMyCode is not available.
+// - For MSVC:
+// add "/alternatename:__CheckForDebuggerJustMyCode=__JustMyCode_Default" to
+// "llvm.linker.options"
+// For ELF:
+// Rename __JustMyCode_Default to __CheckForDebuggerJustMyCode and mark it as
+// weak symbol.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/DJB.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "jmc-instrument"
+
+namespace {
+struct JMCInstrumenter : public ModulePass {
+ static char ID;
+ JMCInstrumenter() : ModulePass(ID) {
+ initializeJMCInstrumenterPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override;
+};
+char JMCInstrumenter::ID = 0;
+} // namespace
+
+INITIALIZE_PASS(
+ JMCInstrumenter, DEBUG_TYPE,
+ "Instrument function entry with call to __CheckForDebuggerJustMyCode",
+ false, false)
+
+ModulePass *llvm::createJMCInstrumenterPass() { return new JMCInstrumenter(); }
+
+namespace {
+const char CheckFunctionName[] = "__CheckForDebuggerJustMyCode";
+
+std::string getFlagName(DISubprogram &SP, bool UseX86FastCall) {
+ // absolute windows path: windows_backslash
+ // relative windows backslash path: windows_backslash
+ // relative windows slash path: posix
+ // absolute posix path: posix
+ // relative posix path: posix
+ sys::path::Style PathStyle =
+ has_root_name(SP.getDirectory(), sys::path::Style::windows_backslash) ||
+ SP.getDirectory().contains("\\") ||
+ SP.getFilename().contains("\\")
+ ? sys::path::Style::windows_backslash
+ : sys::path::Style::posix;
+  // Best effort path normalization. This is to guarantee a unique flag symbol
+ // is produced for the same directory. Some builds may want to use relative
+ // paths, or paths with a specific prefix (see the -fdebug-compilation-dir
+ // flag), so only hash paths in debuginfo. Don't expand them to absolute
+ // paths.
+ SmallString<256> FilePath(SP.getDirectory());
+ sys::path::append(FilePath, PathStyle, SP.getFilename());
+ sys::path::native(FilePath, PathStyle);
+ sys::path::remove_dots(FilePath, /*remove_dot_dot=*/true, PathStyle);
+
+  // The naming convention for the flag name is __<hash>_<file name>, with '.'
+  // in <file name> replaced by '@'. For example, C:\file.any.c would get the
+  // flag __D032E919_file@any@c. The naming convention matches MSVC's format,
+  // but the match is not required for JMC to work. The hashing function used
+  // here is different from MSVC's.
+
+ std::string Suffix;
+ for (auto C : sys::path::filename(FilePath, PathStyle))
+ Suffix.push_back(C == '.' ? '@' : C);
+
+ sys::path::remove_filename(FilePath, PathStyle);
+ return (UseX86FastCall ? "_" : "__") +
+ utohexstr(djbHash(FilePath), /*LowerCase=*/false,
+ /*Width=*/8) +
+ "_" + Suffix;
+}
+
+void attachDebugInfo(GlobalVariable &GV, DISubprogram &SP) {
+ Module &M = *GV.getParent();
+ DICompileUnit *CU = SP.getUnit();
+ assert(CU);
+ DIBuilder DB(M, false, CU);
+
+ auto *DType =
+ DB.createBasicType("unsigned char", 8, dwarf::DW_ATE_unsigned_char,
+ llvm::DINode::FlagArtificial);
+
+ auto *DGVE = DB.createGlobalVariableExpression(
+ CU, GV.getName(), /*LinkageName=*/StringRef(), SP.getFile(),
+ /*LineNo=*/0, DType, /*IsLocalToUnit=*/true, /*IsDefined=*/true);
+ GV.addMetadata(LLVMContext::MD_dbg, *DGVE);
+ DB.finalize();
+}
+
+FunctionType *getCheckFunctionType(LLVMContext &Ctx) {
+ Type *VoidTy = Type::getVoidTy(Ctx);
+ PointerType *VoidPtrTy = Type::getInt8PtrTy(Ctx);
+ return FunctionType::get(VoidTy, VoidPtrTy, false);
+}
+
+Function *createDefaultCheckFunction(Module &M, bool UseX86FastCall) {
+ LLVMContext &Ctx = M.getContext();
+ const char *DefaultCheckFunctionName =
+ UseX86FastCall ? "_JustMyCode_Default" : "__JustMyCode_Default";
+ // Create the function.
+ Function *DefaultCheckFunc =
+ Function::Create(getCheckFunctionType(Ctx), GlobalValue::ExternalLinkage,
+ DefaultCheckFunctionName, &M);
+ DefaultCheckFunc->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ DefaultCheckFunc->addParamAttr(0, Attribute::NoUndef);
+ if (UseX86FastCall)
+ DefaultCheckFunc->addParamAttr(0, Attribute::InReg);
+
+ BasicBlock *EntryBB = BasicBlock::Create(Ctx, "", DefaultCheckFunc);
+ ReturnInst::Create(Ctx, EntryBB);
+ return DefaultCheckFunc;
+}
+} // namespace
+
+bool JMCInstrumenter::runOnModule(Module &M) {
+ bool Changed = false;
+ LLVMContext &Ctx = M.getContext();
+ Triple ModuleTriple(M.getTargetTriple());
+ bool IsMSVC = ModuleTriple.isKnownWindowsMSVCEnvironment();
+ bool IsELF = ModuleTriple.isOSBinFormatELF();
+ assert((IsELF || IsMSVC) && "Unsupported triple for JMC");
+ bool UseX86FastCall = IsMSVC && ModuleTriple.getArch() == Triple::x86;
+ const char *const FlagSymbolSection = IsELF ? ".data.just.my.code" : ".msvcjmc";
+
+ GlobalValue *CheckFunction = nullptr;
+ DenseMap<DISubprogram *, Constant *> SavedFlags(8);
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ auto *SP = F.getSubprogram();
+ if (!SP)
+ continue;
+
+ Constant *&Flag = SavedFlags[SP];
+ if (!Flag) {
+ std::string FlagName = getFlagName(*SP, UseX86FastCall);
+ IntegerType *FlagTy = Type::getInt8Ty(Ctx);
+ Flag = M.getOrInsertGlobal(FlagName, FlagTy, [&] {
+ // FIXME: Put the GV in comdat and have linkonce_odr linkage to save
+ // .msvcjmc section space? maybe not worth it.
+ GlobalVariable *GV = new GlobalVariable(
+ M, FlagTy, /*isConstant=*/false, GlobalValue::InternalLinkage,
+ ConstantInt::get(FlagTy, 1), FlagName);
+ GV->setSection(FlagSymbolSection);
+ GV->setAlignment(Align(1));
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ attachDebugInfo(*GV, *SP);
+ return GV;
+ });
+ }
+
+ if (!CheckFunction) {
+ Function *DefaultCheckFunc =
+ createDefaultCheckFunction(M, UseX86FastCall);
+ if (IsELF) {
+ DefaultCheckFunc->setName(CheckFunctionName);
+ DefaultCheckFunc->setLinkage(GlobalValue::WeakAnyLinkage);
+ CheckFunction = DefaultCheckFunc;
+ } else {
+ assert(!M.getFunction(CheckFunctionName) &&
+ "JMC instrument more than once?");
+ auto *CheckFunc = cast<Function>(
+ M.getOrInsertFunction(CheckFunctionName, getCheckFunctionType(Ctx))
+ .getCallee());
+ CheckFunc->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ CheckFunc->addParamAttr(0, Attribute::NoUndef);
+ if (UseX86FastCall) {
+ CheckFunc->setCallingConv(CallingConv::X86_FastCall);
+ CheckFunc->addParamAttr(0, Attribute::InReg);
+ }
+ CheckFunction = CheckFunc;
+
+ StringRef DefaultCheckFunctionName = DefaultCheckFunc->getName();
+ appendToUsed(M, {DefaultCheckFunc});
+ Comdat *C = M.getOrInsertComdat(DefaultCheckFunctionName);
+ C->setSelectionKind(Comdat::Any);
+ DefaultCheckFunc->setComdat(C);
+ // Add a linker option /alternatename to set the default implementation
+ // for the check function.
+ // https://devblogs.microsoft.com/oldnewthing/20200731-00/?p=104024
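+        // For example, for x86-64 MSVC this appends the operand
+        //   !{!"/alternatename:__CheckForDebuggerJustMyCode=__JustMyCode_Default"}
+        // to the !llvm.linker.options named metadata.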
+ std::string AltOption = std::string("/alternatename:") +
+ CheckFunctionName + "=" +
+ DefaultCheckFunctionName.str();
+ llvm::Metadata *Ops[] = {llvm::MDString::get(Ctx, AltOption)};
+ MDTuple *N = MDNode::get(Ctx, Ops);
+ M.getOrInsertNamedMetadata("llvm.linker.options")->addOperand(N);
+ }
+ }
+    // FIXME: it would be nice to make CI a scheduling boundary, although in
+ // practice it does not matter much.
+ auto *CI = CallInst::Create(getCheckFunctionType(Ctx), CheckFunction,
+ {Flag}, "", &*F.begin()->getFirstInsertionPt());
+ CI->addParamAttr(0, Attribute::NoUndef);
+ if (UseX86FastCall) {
+ CI->setCallingConv(CallingConv::X86_FastCall);
+ CI->addParamAttr(0, Attribute::InReg);
+ }
+
+ Changed = true;
+ }
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/KCFI.cpp b/contrib/llvm-project/llvm/lib/CodeGen/KCFI.cpp
new file mode 100644
index 000000000000..bffa02ca8afd
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/KCFI.cpp
@@ -0,0 +1,111 @@
+//===---- KCFI.cpp - Implements Kernel Control-Flow Integrity (KCFI) ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements Kernel Control-Flow Integrity (KCFI) indirect call
+// check lowering. For each call instruction with a cfi-type attribute, it
+// emits an arch-specific check before the call, and bundles the check and
+// the call to prevent unintentional modifications.
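+// The emitted check loads the type hash stored in front of the call target's
+// entry point, compares it against the expected hash recorded on the call
+// site, and traps on a mismatch.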
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "kcfi"
+#define KCFI_PASS_NAME "Insert KCFI indirect call checks"
+
+STATISTIC(NumKCFIChecksAdded, "Number of indirect call checks added");
+
+namespace {
+class KCFI : public MachineFunctionPass {
+public:
+ static char ID;
+
+ KCFI() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return KCFI_PASS_NAME; }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// Machine instruction info used throughout the class.
+ const TargetInstrInfo *TII = nullptr;
+
+ /// Target lowering for arch-specific parts.
+ const TargetLowering *TLI = nullptr;
+
+ /// Emits a KCFI check before an indirect call.
+ /// \returns true if the check was added and false otherwise.
+ bool emitCheck(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator I) const;
+};
+
+char KCFI::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS(KCFI, DEBUG_TYPE, KCFI_PASS_NAME, false, false)
+
+FunctionPass *llvm::createKCFIPass() { return new KCFI(); }
+
+bool KCFI::emitCheck(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator MBBI) const {
+ assert(TII && "Target instruction info was not initialized");
+ assert(TLI && "Target lowering was not initialized");
+
+ // If the call instruction is bundled, we can only emit a check safely if
+ // it's the first instruction in the bundle.
+ if (MBBI->isBundled() && !std::prev(MBBI)->isBundle())
+ report_fatal_error("Cannot emit a KCFI check for a bundled call");
+
+ // Emit a KCFI check for the call instruction at MBBI. The implementation
+ // must unfold memory operands if applicable.
+ MachineInstr *Check = TLI->EmitKCFICheck(MBB, MBBI, TII);
+
+ // Clear the original call's CFI type.
+ assert(MBBI->isCall() && "Unexpected instruction type");
+ MBBI->setCFIType(*MBB.getParent(), 0);
+
+ // If not already bundled, bundle the check and the call to prevent
+ // further changes.
+ if (!MBBI->isBundled())
+ finalizeBundle(MBB, Check->getIterator(), std::next(MBBI->getIterator()));
+
+ ++NumKCFIChecksAdded;
+ return true;
+}
+
+bool KCFI::runOnMachineFunction(MachineFunction &MF) {
+ const Module *M = MF.getMMI().getModule();
+ if (!M->getModuleFlag("kcfi"))
+ return false;
+
+ const auto &SubTarget = MF.getSubtarget();
+ TII = SubTarget.getInstrInfo();
+ TLI = SubTarget.getTargetLowering();
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ // Use instr_iterator because we don't want to skip bundles.
+ for (MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),
+ MIE = MBB.instr_end();
+ MII != MIE; ++MII) {
+ if (MII->isCall() && MII->getCFIType())
+ Changed |= emitCheck(MBB, MII);
+ }
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LLVMTargetMachine.cpp
new file mode 100644
index 000000000000..d02ec1db1165
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -0,0 +1,301 @@
+//===-- LLVMTargetMachine.cpp - Implement the LLVMTargetMachine class -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVMTargetMachine class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::opt<bool>
+ EnableTrapUnreachable("trap-unreachable", cl::Hidden,
+ cl::desc("Enable generating trap for unreachable"));
+
+void LLVMTargetMachine::initAsmInfo() {
+ MRI.reset(TheTarget.createMCRegInfo(getTargetTriple().str()));
+ assert(MRI && "Unable to create reg info");
+ MII.reset(TheTarget.createMCInstrInfo());
+ assert(MII && "Unable to create instruction info");
+ // FIXME: Having an MCSubtargetInfo on the target machine is a hack due
+ // to some backends having subtarget feature dependent module level
+ // code generation. This is similar to the hack in the AsmPrinter for
+ // module level assembly etc.
+ STI.reset(TheTarget.createMCSubtargetInfo(
+ getTargetTriple().str(), getTargetCPU(), getTargetFeatureString()));
+ assert(STI && "Unable to create subtarget info");
+
+ MCAsmInfo *TmpAsmInfo = TheTarget.createMCAsmInfo(
+ *MRI, getTargetTriple().str(), Options.MCOptions);
+ // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
+ // and if the old one gets included then MCAsmInfo will be NULL and
+ // we'll crash later.
+ // Provide the user with a useful error message about what's wrong.
+ assert(TmpAsmInfo && "MCAsmInfo not initialized. "
+ "Make sure you include the correct TargetSelect.h"
+ "and that InitializeAllTargetMCs() is being invoked!");
+
+ if (Options.BinutilsVersion.first > 0)
+ TmpAsmInfo->setBinutilsVersion(Options.BinutilsVersion);
+
+ if (Options.DisableIntegratedAS) {
+ TmpAsmInfo->setUseIntegratedAssembler(false);
+    // If the integrated assembler is explicitly disabled, we can't use it
+    // for inline asm either.
+ TmpAsmInfo->setParseInlineAsmUsingAsmParser(false);
+ }
+
+ TmpAsmInfo->setPreserveAsmComments(Options.MCOptions.PreserveAsmComments);
+
+ TmpAsmInfo->setCompressDebugSections(Options.CompressDebugSections);
+
+ TmpAsmInfo->setRelaxELFRelocations(Options.RelaxELFRelocations);
+
+ if (Options.ExceptionModel != ExceptionHandling::None)
+ TmpAsmInfo->setExceptionsType(Options.ExceptionModel);
+
+ AsmInfo.reset(TmpAsmInfo);
+}
+
+LLVMTargetMachine::LLVMTargetMachine(const Target &T,
+ StringRef DataLayoutString,
+ const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) {
+ this->RM = RM;
+ this->CMModel = CM;
+ this->OptLevel = OL;
+
+ if (EnableTrapUnreachable)
+ this->Options.TrapUnreachable = true;
+}
+
+TargetTransformInfo
+LLVMTargetMachine::getTargetTransformInfo(const Function &F) const {
+ return TargetTransformInfo(BasicTTIImpl(this, F));
+}
+
+/// addPassesToX helper drives creation and initialization of TargetPassConfig.
+static TargetPassConfig *
+addPassesToGenerateCode(LLVMTargetMachine &TM, PassManagerBase &PM,
+ bool DisableVerify,
+ MachineModuleInfoWrapperPass &MMIWP) {
+ // Targets may override createPassConfig to provide a target-specific
+ // subclass.
+ TargetPassConfig *PassConfig = TM.createPassConfig(PM);
+ // Set PassConfig options provided by TargetMachine.
+ PassConfig->setDisableVerify(DisableVerify);
+ PM.add(PassConfig);
+ PM.add(&MMIWP);
+
+ if (PassConfig->addISelPasses())
+ return nullptr;
+ PassConfig->addMachinePasses();
+ PassConfig->setInitialized();
+ return PassConfig;
+}
+
+bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
+ raw_pwrite_stream &Out,
+ raw_pwrite_stream *DwoOut,
+ CodeGenFileType FileType,
+ MCContext &Context) {
+ Expected<std::unique_ptr<MCStreamer>> MCStreamerOrErr =
+ createMCStreamer(Out, DwoOut, FileType, Context);
+ if (auto Err = MCStreamerOrErr.takeError())
+ return true;
+
+ // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
+ FunctionPass *Printer =
+ getTarget().createAsmPrinter(*this, std::move(*MCStreamerOrErr));
+ if (!Printer)
+ return true;
+
+ PM.add(Printer);
+ return false;
+}
+
+Expected<std::unique_ptr<MCStreamer>> LLVMTargetMachine::createMCStreamer(
+ raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, CodeGenFileType FileType,
+ MCContext &Context) {
+ if (Options.MCOptions.MCSaveTempLabels)
+ Context.setAllowTemporaryLabels(false);
+
+ const MCSubtargetInfo &STI = *getMCSubtargetInfo();
+ const MCAsmInfo &MAI = *getMCAsmInfo();
+ const MCRegisterInfo &MRI = *getMCRegisterInfo();
+ const MCInstrInfo &MII = *getMCInstrInfo();
+
+ std::unique_ptr<MCStreamer> AsmStreamer;
+
+ switch (FileType) {
+ case CGFT_AssemblyFile: {
+ MCInstPrinter *InstPrinter = getTarget().createMCInstPrinter(
+ getTargetTriple(), MAI.getAssemblerDialect(), MAI, MII, MRI);
+
+ // Create a code emitter if asked to show the encoding.
+ std::unique_ptr<MCCodeEmitter> MCE;
+ if (Options.MCOptions.ShowMCEncoding)
+ MCE.reset(getTarget().createMCCodeEmitter(MII, Context));
+
+ bool UseDwarfDirectory = false;
+ switch (Options.MCOptions.MCUseDwarfDirectory) {
+ case MCTargetOptions::DisableDwarfDirectory:
+ UseDwarfDirectory = false;
+ break;
+ case MCTargetOptions::EnableDwarfDirectory:
+ UseDwarfDirectory = true;
+ break;
+ case MCTargetOptions::DefaultDwarfDirectory:
+ UseDwarfDirectory = MAI.enableDwarfFileDirectoryDefault();
+ break;
+ }
+
+ std::unique_ptr<MCAsmBackend> MAB(
+ getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions));
+ auto FOut = std::make_unique<formatted_raw_ostream>(Out);
+ MCStreamer *S = getTarget().createAsmStreamer(
+ Context, std::move(FOut), Options.MCOptions.AsmVerbose,
+ UseDwarfDirectory, InstPrinter, std::move(MCE), std::move(MAB),
+ Options.MCOptions.ShowMCInst);
+ AsmStreamer.reset(S);
+ break;
+ }
+ case CGFT_ObjectFile: {
+ // Create the code emitter for the target if it exists. If not, .o file
+ // emission fails.
+ MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, Context);
+ if (!MCE)
+ return make_error<StringError>("createMCCodeEmitter failed",
+ inconvertibleErrorCode());
+ MCAsmBackend *MAB =
+ getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions);
+ if (!MAB)
+ return make_error<StringError>("createMCAsmBackend failed",
+ inconvertibleErrorCode());
+
+ Triple T(getTargetTriple().str());
+ AsmStreamer.reset(getTarget().createMCObjectStreamer(
+ T, Context, std::unique_ptr<MCAsmBackend>(MAB),
+ DwoOut ? MAB->createDwoObjectWriter(Out, *DwoOut)
+ : MAB->createObjectWriter(Out),
+ std::unique_ptr<MCCodeEmitter>(MCE), STI, Options.MCOptions.MCRelaxAll,
+ Options.MCOptions.MCIncrementalLinkerCompatible,
+ /*DWARFMustBeAtTheEnd*/ true));
+ break;
+ }
+ case CGFT_Null:
+    // The Null output is intended for performance analysis and testing, not
+    // for real users.
+ AsmStreamer.reset(getTarget().createNullStreamer(Context));
+ break;
+ }
+
+ return std::move(AsmStreamer);
+}
+
+bool LLVMTargetMachine::addPassesToEmitFile(
+ PassManagerBase &PM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+ CodeGenFileType FileType, bool DisableVerify,
+ MachineModuleInfoWrapperPass *MMIWP) {
+ // Add common CodeGen passes.
+ if (!MMIWP)
+ MMIWP = new MachineModuleInfoWrapperPass(this);
+ TargetPassConfig *PassConfig =
+ addPassesToGenerateCode(*this, PM, DisableVerify, *MMIWP);
+ if (!PassConfig)
+ return true;
+
+ if (TargetPassConfig::willCompleteCodeGenPipeline()) {
+ if (addAsmPrinter(PM, Out, DwoOut, FileType, MMIWP->getMMI().getContext()))
+ return true;
+ } else {
+ // MIR printing is redundant with -filetype=null.
+ if (FileType != CGFT_Null)
+ PM.add(createPrintMIRPass(Out));
+ }
+
+ PM.add(createFreeMachineFunctionPass());
+ return false;
+}
+
+/// addPassesToEmitMC - Add passes to the specified pass manager to get
+/// machine code emitted with the MCJIT. This method returns true if machine
+/// code is not supported. It fills the MCContext Ctx pointer, which can be
+/// used to build a custom MCStreamer.
+///
+bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
+ raw_pwrite_stream &Out,
+ bool DisableVerify) {
+ // Add common CodeGen passes.
+ MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(this);
+ TargetPassConfig *PassConfig =
+ addPassesToGenerateCode(*this, PM, DisableVerify, *MMIWP);
+ if (!PassConfig)
+ return true;
+ assert(TargetPassConfig::willCompleteCodeGenPipeline() &&
+ "Cannot emit MC with limited codegen pipeline");
+
+ Ctx = &MMIWP->getMMI().getContext();
+ // libunwind is unable to load compact unwind dynamically, so we must generate
+ // DWARF unwind info for the JIT.
+ Options.MCOptions.EmitDwarfUnwind = EmitDwarfUnwindType::Always;
+ if (Options.MCOptions.MCSaveTempLabels)
+ Ctx->setAllowTemporaryLabels(false);
+
+ // Create the code emitter for the target if it exists. If not, .o file
+ // emission fails.
+ const MCSubtargetInfo &STI = *getMCSubtargetInfo();
+ const MCRegisterInfo &MRI = *getMCRegisterInfo();
+ std::unique_ptr<MCCodeEmitter> MCE(
+ getTarget().createMCCodeEmitter(*getMCInstrInfo(), *Ctx));
+ std::unique_ptr<MCAsmBackend> MAB(
+ getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions));
+ if (!MCE || !MAB)
+ return true;
+
+ const Triple &T = getTargetTriple();
+ std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer(
+ T, *Ctx, std::move(MAB), MAB->createObjectWriter(Out), std::move(MCE),
+ STI, Options.MCOptions.MCRelaxAll,
+ Options.MCOptions.MCIncrementalLinkerCompatible,
+ /*DWARFMustBeAtTheEnd*/ true));
+
+ // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
+ FunctionPass *Printer =
+ getTarget().createAsmPrinter(*this, std::move(AsmStreamer));
+ if (!Printer)
+ return true;
+
+ PM.add(Printer);
+ PM.add(createFreeMachineFunctionPass());
+
+ return false; // success!
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
new file mode 100644
index 000000000000..fab6b8d10a33
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
@@ -0,0 +1,147 @@
+//===---- LatencyPriorityQueue.cpp - A latency-oriented priority queue ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LatencyPriorityQueue class, which is a
+// SchedulingPriorityQueue that schedules using latency information to
+// reduce the length of the critical path through the basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "scheduler"
+
+bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+ // The isScheduleHigh flag allows nodes with wraparound dependencies that
+ // cannot easily be modeled as edges with latencies to be scheduled as
+ // soon as possible in a top-down schedule.
+ if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+ return false;
+ if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+ return true;
+
+ unsigned LHSNum = LHS->NodeNum;
+ unsigned RHSNum = RHS->NodeNum;
+
+ // The most important heuristic is scheduling the critical path.
+ unsigned LHSLatency = PQ->getLatency(LHSNum);
+ unsigned RHSLatency = PQ->getLatency(RHSNum);
+ if (LHSLatency < RHSLatency) return true;
+ if (LHSLatency > RHSLatency) return false;
+
+ // After that, if two nodes have identical latencies, look to see if one will
+ // unblock more other nodes than the other.
+ unsigned LHSBlocked = PQ->getNumSolelyBlockNodes(LHSNum);
+ unsigned RHSBlocked = PQ->getNumSolelyBlockNodes(RHSNum);
+ if (LHSBlocked < RHSBlocked) return true;
+ if (LHSBlocked > RHSBlocked) return false;
+
+ // Finally, just to provide a stable ordering, use the node number as a
+ // deciding factor.
+ return RHSNum < LHSNum;
+}
+
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+SUnit *LatencyPriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
+ SUnit *OnlyAvailablePred = nullptr;
+ for (const SDep &P : SU->Preds) {
+ SUnit &Pred = *P.getSUnit();
+ if (!Pred.isScheduled) {
+ // We found an available, but not scheduled, predecessor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+ return nullptr;
+ OnlyAvailablePred = &Pred;
+ }
+ }
+
+ return OnlyAvailablePred;
+}
+
+void LatencyPriorityQueue::push(SUnit *SU) {
+ // Look at all of the successors of this node. Count the number of nodes that
+ // this node is the sole unscheduled node for.
+ unsigned NumNodesBlocking = 0;
+ for (const SDep &Succ : SU->Succs)
+ if (getSingleUnscheduledPred(Succ.getSUnit()) == SU)
+ ++NumNodesBlocking;
+ NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
+
+ Queue.push_back(SU);
+}
+
+
+// scheduledNode - As nodes are scheduled, we look to see if there are any
+// successor nodes that have a single unscheduled predecessor. If so, that
+// single predecessor has a higher priority, since scheduling it will make
+// the node available.
+void LatencyPriorityQueue::scheduledNode(SUnit *SU) {
+ for (const SDep &Succ : SU->Succs)
+ AdjustPriorityOfUnscheduledPreds(Succ.getSUnit());
+}
+
+/// AdjustPriorityOfUnscheduledPreds - One of the predecessors of SU was just
+/// scheduled. If SU is not itself available, then there is at least one
+/// predecessor node that has not been scheduled yet. If SU has exactly ONE
+/// unscheduled predecessor, we want to increase its priority: it getting
+/// scheduled will make this node available, so it is better than some other
+/// node of the same priority that will not make a node available.
+void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) {
+ if (SU->isAvailable) return; // All preds scheduled.
+
+ SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU);
+ if (!OnlyAvailablePred || !OnlyAvailablePred->isAvailable) return;
+
+ // Okay, we found a single predecessor that is available, but not scheduled.
+ // Since it is available, it must be in the priority queue. First remove it.
+ remove(OnlyAvailablePred);
+
+ // Reinsert the node into the priority queue, which recomputes its
+ // NumNodesSolelyBlocking value.
+ push(OnlyAvailablePred);
+}
+
+SUnit *LatencyPriorityQueue::pop() {
+ if (empty()) return nullptr;
+ std::vector<SUnit *>::iterator Best = Queue.begin();
+ for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()),
+ E = Queue.end(); I != E; ++I)
+ if (Picker(*Best, *I))
+ Best = I;
+ SUnit *V = *Best;
+ if (Best != std::prev(Queue.end()))
+ std::swap(*Best, Queue.back());
+ Queue.pop_back();
+ return V;
+}
+
+void LatencyPriorityQueue::remove(SUnit *SU) {
+ assert(!Queue.empty() && "Queue is empty!");
+ std::vector<SUnit *>::iterator I = find(Queue, SU);
+ assert(I != Queue.end() && "Queue doesn't contain the SU being removed!");
+ if (I != std::prev(Queue.end()))
+ std::swap(*I, Queue.back());
+ Queue.pop_back();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LatencyPriorityQueue::dump(ScheduleDAG *DAG) const {
+ dbgs() << "Latency Priority Queue\n";
+ dbgs() << " Number of Queue Entries: " << Queue.size() << "\n";
+ for (const SUnit *SU : Queue) {
+ dbgs() << " ";
+ DAG->dumpNode(*SU);
+ }
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
new file mode 100644
index 000000000000..39b44b917d9e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
@@ -0,0 +1,98 @@
+///===- LazyMachineBlockFrequencyInfo.cpp - Lazy Machine Block Frequency --===//
+///
+/// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+/// See https://llvm.org/LICENSE.txt for license information.
+/// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+///
+///===---------------------------------------------------------------------===//
+/// \file
+/// This is an alternative analysis pass to MachineBlockFrequencyInfo. The
+/// difference is that with this pass the block frequencies are not computed
+/// when the analysis pass is executed but rather when the BFI result is
+/// explicitly requested by the analysis client.
+///
+///===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lazy-machine-block-freq"
+
+INITIALIZE_PASS_BEGIN(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE,
+ "Lazy Machine Block Frequency Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE,
+ "Lazy Machine Block Frequency Analysis", true, true)
+
+char LazyMachineBlockFrequencyInfoPass::ID = 0;
+
+LazyMachineBlockFrequencyInfoPass::LazyMachineBlockFrequencyInfoPass()
+ : MachineFunctionPass(ID) {
+ initializeLazyMachineBlockFrequencyInfoPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+void LazyMachineBlockFrequencyInfoPass::print(raw_ostream &OS,
+ const Module *M) const {
+ getBFI().print(OS, M);
+}
+
+void LazyMachineBlockFrequencyInfoPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LazyMachineBlockFrequencyInfoPass::releaseMemory() {
+ OwnedMBFI.reset();
+ OwnedMLI.reset();
+ OwnedMDT.reset();
+}
+
+MachineBlockFrequencyInfo &
+LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const {
+ auto *MBFI = getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (MBFI) {
+ LLVM_DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n");
+ return *MBFI;
+ }
+
+ auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>();
+ auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ LLVM_DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n");
+ LLVM_DEBUG(if (MLI) dbgs() << "LoopInfo is available\n");
+
+ if (!MLI) {
+ LLVM_DEBUG(dbgs() << "Building LoopInfo on the fly\n");
+ // First create a dominator tree.
+ LLVM_DEBUG(if (MDT) dbgs() << "DominatorTree is available\n");
+
+ if (!MDT) {
+ LLVM_DEBUG(dbgs() << "Building DominatorTree on the fly\n");
+ OwnedMDT = std::make_unique<MachineDominatorTree>();
+ OwnedMDT->getBase().recalculate(*MF);
+ MDT = OwnedMDT.get();
+ }
+
+ // Generate LoopInfo from it.
+ OwnedMLI = std::make_unique<MachineLoopInfo>();
+ OwnedMLI->getBase().analyze(MDT->getBase());
+ MLI = OwnedMLI.get();
+ }
+
+ OwnedMBFI = std::make_unique<MachineBlockFrequencyInfo>();
+ OwnedMBFI->calculate(*MF, MBPI, *MLI);
+ return *OwnedMBFI;
+}
+
+bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction(
+ MachineFunction &F) {
+ MF = &F;
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LexicalScopes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LexicalScopes.cpp
new file mode 100644
index 000000000000..47c19c3d8ec4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LexicalScopes.cpp
@@ -0,0 +1,347 @@
+//===- LexicalScopes.cpp - Collecting lexical scope info ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements LexicalScopes analysis.
+//
+// This pass collects lexical scope information and maps machine instructions
+// to respective lexical scopes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <string>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lexicalscopes"
+
+/// reset - Reset the instance so that it's prepared for another function.
+void LexicalScopes::reset() {
+ MF = nullptr;
+ CurrentFnLexicalScope = nullptr;
+ LexicalScopeMap.clear();
+ AbstractScopeMap.clear();
+ InlinedLexicalScopeMap.clear();
+ AbstractScopesList.clear();
+ DominatedBlocks.clear();
+}
+
+/// initialize - Scan machine function and construct lexical scope nest.
+void LexicalScopes::initialize(const MachineFunction &Fn) {
+ reset();
+ // Don't attempt any lexical scope creation for a NoDebug compile unit.
+ if (Fn.getFunction().getSubprogram()->getUnit()->getEmissionKind() ==
+ DICompileUnit::NoDebug)
+ return;
+ MF = &Fn;
+ SmallVector<InsnRange, 4> MIRanges;
+ DenseMap<const MachineInstr *, LexicalScope *> MI2ScopeMap;
+ extractLexicalScopes(MIRanges, MI2ScopeMap);
+ if (CurrentFnLexicalScope) {
+ constructScopeNest(CurrentFnLexicalScope);
+ assignInstructionRanges(MIRanges, MI2ScopeMap);
+ }
+}
+
+/// extractLexicalScopes - Extract instruction ranges for each lexical scope
+/// for the given machine function.
+void LexicalScopes::extractLexicalScopes(
+ SmallVectorImpl<InsnRange> &MIRanges,
+ DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
+ // Scan each instruction and create scopes. First build working set of scopes.
+ for (const auto &MBB : *MF) {
+ const MachineInstr *RangeBeginMI = nullptr;
+ const MachineInstr *PrevMI = nullptr;
+ const DILocation *PrevDL = nullptr;
+ for (const auto &MInsn : MBB) {
+      // Ignore DBG_VALUE and similar instructions that do not contribute to any
+ // instruction in the output.
+ if (MInsn.isMetaInstruction())
+ continue;
+
+ // Check if instruction has valid location information.
+ const DILocation *MIDL = MInsn.getDebugLoc();
+ if (!MIDL) {
+ PrevMI = &MInsn;
+ continue;
+ }
+
+ // If scope has not changed then skip this instruction.
+ if (MIDL == PrevDL) {
+ PrevMI = &MInsn;
+ continue;
+ }
+
+ if (RangeBeginMI) {
+        // We have already seen the beginning of an instruction range, and the
+        // current instruction's scope does not match the scope of the first
+        // instruction in that range, so close off the range here before
+        // starting a new one.
+ InsnRange R(RangeBeginMI, PrevMI);
+ MI2ScopeMap[RangeBeginMI] = getOrCreateLexicalScope(PrevDL);
+ MIRanges.push_back(R);
+ }
+
+ // This is a beginning of a new instruction range.
+ RangeBeginMI = &MInsn;
+
+ // Reset previous markers.
+ PrevMI = &MInsn;
+ PrevDL = MIDL;
+ }
+
+ // Create last instruction range.
+ if (RangeBeginMI && PrevMI && PrevDL) {
+ InsnRange R(RangeBeginMI, PrevMI);
+ MIRanges.push_back(R);
+ MI2ScopeMap[RangeBeginMI] = getOrCreateLexicalScope(PrevDL);
+ }
+ }
+}
+
+/// findLexicalScope - Find lexical scope, either regular or inlined, for the
+/// given DebugLoc. Return NULL if not found.
+LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) {
+ DILocalScope *Scope = DL->getScope();
+ if (!Scope)
+ return nullptr;
+
+ // The scope that we were created with could have an extra file - which
+ // isn't what we care about in this case.
+ Scope = Scope->getNonLexicalBlockFileScope();
+
+ if (auto *IA = DL->getInlinedAt()) {
+ auto I = InlinedLexicalScopeMap.find(std::make_pair(Scope, IA));
+ return I != InlinedLexicalScopeMap.end() ? &I->second : nullptr;
+ }
+ return findLexicalScope(Scope);
+}
+
+/// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If
+/// not available then create new lexical scope.
+LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,
+ const DILocation *IA) {
+ if (IA) {
+ // Skip scopes inlined from a NoDebug compile unit.
+ if (Scope->getSubprogram()->getUnit()->getEmissionKind() ==
+ DICompileUnit::NoDebug)
+ return getOrCreateLexicalScope(IA);
+ // Create an abstract scope for inlined function.
+ getOrCreateAbstractScope(Scope);
+ // Create an inlined scope for inlined function.
+ return getOrCreateInlinedScope(Scope, IA);
+ }
+
+ return getOrCreateRegularScope(Scope);
+}
+
+/// getOrCreateRegularScope - Find or create a regular lexical scope.
+LexicalScope *
+LexicalScopes::getOrCreateRegularScope(const DILocalScope *Scope) {
+ assert(Scope && "Invalid Scope encoding!");
+ Scope = Scope->getNonLexicalBlockFileScope();
+
+ auto I = LexicalScopeMap.find(Scope);
+ if (I != LexicalScopeMap.end())
+ return &I->second;
+
+ // FIXME: Should the following dyn_cast be DILexicalBlock?
+ LexicalScope *Parent = nullptr;
+ if (auto *Block = dyn_cast<DILexicalBlockBase>(Scope))
+ Parent = getOrCreateLexicalScope(Block->getScope());
+ I = LexicalScopeMap.emplace(std::piecewise_construct,
+ std::forward_as_tuple(Scope),
+ std::forward_as_tuple(Parent, Scope, nullptr,
+ false)).first;
+
+ if (!Parent) {
+ assert(cast<DISubprogram>(Scope)->describes(&MF->getFunction()));
+ assert(!CurrentFnLexicalScope);
+ CurrentFnLexicalScope = &I->second;
+ }
+
+ return &I->second;
+}
+
+/// getOrCreateInlinedScope - Find or create an inlined lexical scope.
+LexicalScope *
+LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope,
+ const DILocation *InlinedAt) {
+ assert(Scope && "Invalid Scope encoding!");
+ Scope = Scope->getNonLexicalBlockFileScope();
+ std::pair<const DILocalScope *, const DILocation *> P(Scope, InlinedAt);
+ auto I = InlinedLexicalScopeMap.find(P);
+ if (I != InlinedLexicalScopeMap.end())
+ return &I->second;
+
+ LexicalScope *Parent;
+ if (auto *Block = dyn_cast<DILexicalBlockBase>(Scope))
+ Parent = getOrCreateInlinedScope(Block->getScope(), InlinedAt);
+ else
+ Parent = getOrCreateLexicalScope(InlinedAt);
+
+ I = InlinedLexicalScopeMap
+ .emplace(std::piecewise_construct, std::forward_as_tuple(P),
+ std::forward_as_tuple(Parent, Scope, InlinedAt, false))
+ .first;
+ return &I->second;
+}
+
+/// getOrCreateAbstractScope - Find or create an abstract lexical scope.
+LexicalScope *
+LexicalScopes::getOrCreateAbstractScope(const DILocalScope *Scope) {
+ assert(Scope && "Invalid Scope encoding!");
+ Scope = Scope->getNonLexicalBlockFileScope();
+ auto I = AbstractScopeMap.find(Scope);
+ if (I != AbstractScopeMap.end())
+ return &I->second;
+
+ // FIXME: Should the following dyn_cast be DILexicalBlock?
+ LexicalScope *Parent = nullptr;
+ if (auto *Block = dyn_cast<DILexicalBlockBase>(Scope))
+ Parent = getOrCreateAbstractScope(Block->getScope());
+
+ I = AbstractScopeMap.emplace(std::piecewise_construct,
+ std::forward_as_tuple(Scope),
+ std::forward_as_tuple(Parent, Scope,
+ nullptr, true)).first;
+ if (isa<DISubprogram>(Scope))
+ AbstractScopesList.push_back(&I->second);
+ return &I->second;
+}
+
+/// constructScopeNest - Traverse the Scope tree depth-first, storing
+/// traversal state in WorkStack and recording the depth-first
+/// numbering (setDFSIn, setDFSOut) for edge classification.
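+/// For example (illustrative only): for a function scope R with child scopes
+/// A and B, where C is a child of A, the numbering assigned here is
+/// A: [1, 4], C: [2, 3], B: [5, 6]; one scope dominates another when its
+/// interval encloses the other's (see LexicalScope::dominates).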
+void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
+ assert(Scope && "Unable to calculate scope dominance graph!");
+ SmallVector<std::pair<LexicalScope *, size_t>, 4> WorkStack;
+ WorkStack.push_back(std::make_pair(Scope, 0));
+ unsigned Counter = 0;
+ while (!WorkStack.empty()) {
+ auto &ScopePosition = WorkStack.back();
+ LexicalScope *WS = ScopePosition.first;
+ size_t ChildNum = ScopePosition.second++;
+ const SmallVectorImpl<LexicalScope *> &Children = WS->getChildren();
+ if (ChildNum < Children.size()) {
+ auto &ChildScope = Children[ChildNum];
+ WorkStack.push_back(std::make_pair(ChildScope, 0));
+ ChildScope->setDFSIn(++Counter);
+ } else {
+ WorkStack.pop_back();
+ WS->setDFSOut(++Counter);
+ }
+ }
+}
+
+/// assignInstructionRanges - Find ranges of instructions covered by each
+/// lexical scope.
+void LexicalScopes::assignInstructionRanges(
+ SmallVectorImpl<InsnRange> &MIRanges,
+ DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
+ LexicalScope *PrevLexicalScope = nullptr;
+ for (const auto &R : MIRanges) {
+ LexicalScope *S = MI2ScopeMap.lookup(R.first);
+ assert(S && "Lost LexicalScope for a machine instruction!");
+ if (PrevLexicalScope && !PrevLexicalScope->dominates(S))
+ PrevLexicalScope->closeInsnRange(S);
+ S->openInsnRange(R.first);
+ S->extendInsnRange(R.second);
+ PrevLexicalScope = S;
+ }
+
+ if (PrevLexicalScope)
+ PrevLexicalScope->closeInsnRange();
+}
+
+/// getMachineBasicBlocks - Populate the given set with the machine basic
+/// blocks that contain machine instructions belonging to the lexical scope
+/// identified by DebugLoc.
+void LexicalScopes::getMachineBasicBlocks(
+ const DILocation *DL, SmallPtrSetImpl<const MachineBasicBlock *> &MBBs) {
+ assert(MF && "Method called on a uninitialized LexicalScopes object!");
+ MBBs.clear();
+
+ LexicalScope *Scope = getOrCreateLexicalScope(DL);
+ if (!Scope)
+ return;
+
+ if (Scope == CurrentFnLexicalScope) {
+ for (const auto &MBB : *MF)
+ MBBs.insert(&MBB);
+ return;
+ }
+
+ // The scope ranges can cover multiple basic blocks in each span. Iterate over
+ // all blocks (in the order they are in the function) until we reach the one
+ // containing the end of the span.
+ SmallVectorImpl<InsnRange> &InsnRanges = Scope->getRanges();
+ for (auto &R : InsnRanges)
+ for (auto CurMBBIt = R.first->getParent()->getIterator(),
+ EndBBIt = std::next(R.second->getParent()->getIterator());
+ CurMBBIt != EndBBIt; CurMBBIt++)
+ MBBs.insert(&*CurMBBIt);
+}
+
+bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) {
+ assert(MF && "Unexpected uninitialized LexicalScopes object!");
+ LexicalScope *Scope = getOrCreateLexicalScope(DL);
+ if (!Scope)
+ return false;
+
+ // Current function scope covers all basic blocks in the function.
+ if (Scope == CurrentFnLexicalScope && MBB->getParent() == MF)
+ return true;
+
+ // Fetch all the blocks in DL's scope. Because the range / block list also
+ // contains any subscopes, any instruction that DL dominates can be found in
+ // the block set.
+ //
+ // Cache the set of fetched blocks to avoid repeatedly recomputing the set in
+ // the LiveDebugValues pass.
+ std::unique_ptr<BlockSetT> &Set = DominatedBlocks[DL];
+ if (!Set) {
+ Set = std::make_unique<BlockSetT>();
+ getMachineBasicBlocks(DL, *Set);
+ }
+ return Set->contains(MBB);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LexicalScope::dump(unsigned Indent) const {
+ raw_ostream &err = dbgs();
+ err.indent(Indent);
+ err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n";
+ const MDNode *N = Desc;
+ err.indent(Indent);
+ N->dump();
+ if (AbstractScope)
+ err << std::string(Indent, ' ') << "Abstract Scope\n";
+
+ if (!Children.empty())
+ err << std::string(Indent + 2, ' ') << "Children ...\n";
+ for (unsigned i = 0, e = Children.size(); i != e; ++i)
+ if (Children[i] != this)
+ Children[i]->dump(Indent + 2);
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
new file mode 100644
index 000000000000..57df9b67fd02
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -0,0 +1,4230 @@
+//===- InstrRefBasedImpl.cpp - Tracking Debug Value MIs -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file InstrRefBasedImpl.cpp
+///
+/// This is a separate implementation of LiveDebugValues, see
+/// LiveDebugValues.cpp and VarLocBasedImpl.cpp for more information.
+///
+/// This pass propagates variable locations between basic blocks, resolving
+/// control flow conflicts between them. The problem is SSA construction, where
+/// each debug instruction assigns the *value* that a variable has, and every
+/// instruction where the variable is in scope uses that variable. The resulting
+/// map of instruction-to-value is then translated into a register (or spill)
+/// location for each variable over each instruction.
+///
+/// The primary difference from normal SSA construction is that we cannot
+/// _create_ PHI values that contain variable values. CodeGen has already
+/// completed, and we can't alter it just to make debug-info complete. Thus:
+/// we can identify function positions where we would like a PHI value for a
+/// variable, but must search the MachineFunction to see whether such a PHI is
+/// available. If no such PHI exists, the variable location must be dropped.
+///
+/// To achieve this, we perform two kinds of analysis. First, we identify
+/// every value defined by every instruction (ignoring those that only move
+/// another value), then re-compute an SSA-form representation of the
+/// MachineFunction, using value propagation to eliminate any unnecessary
+/// PHI values. This gives us a map of every value computed in the function,
+/// and its location within the register file / stack.
+///
+/// Secondly, for each variable we perform the same analysis, where each debug
+/// instruction is considered a def and every instruction where the variable is
+/// in lexical scope is considered a use. Value propagation is used again to
+/// eliminate any unnecessary PHIs. This gives us a map of each variable to the
+/// value it should have in a block.
+///
+/// Once both are complete, we have two maps for each block:
+/// * Variables to the values they should have,
+/// * Values to the register / spill slot they are located in.
+/// After which we can marry-up variable values with a location, and emit
+/// DBG_VALUE instructions specifying those locations. Variable locations may
+/// be dropped in this process due to the desired variable value not being
+/// resident in any machine location, or because there is no PHI value in any
+/// location that accurately represents the desired value. The building of
+/// location lists for each block is left to DbgEntityHistoryCalculator.
+///
+/// This pass is kept efficient because the size of the first SSA problem
+/// is proportional to the working-set size of the function, which the compiler
+/// tries to keep small. (It's also proportional to the number of blocks).
+/// Additionally, we repeatedly perform the second SSA problem analysis with
+/// only the variables and blocks in a single lexical scope, exploiting their
+/// locality.
+///
+/// ### Terminology
+///
+/// A machine location is a register or spill slot, a value is something that's
+/// defined by an instruction or PHI node, while a variable value is the value
+/// assigned to a variable. A variable location is a machine location that must
+/// contain the appropriate variable value. A value that is a PHI node is
+/// occasionally called an mphi.
+///
+/// The first SSA problem is the "machine value location" problem,
+/// because we're determining which machine locations contain which values.
+/// The "locations" are constant: what's unknown is what value they contain.
+///
+/// The second SSA problem (the one for variables) is the "variable value
+/// problem", because it's determining what values a variable has, rather than
+/// what location those values are placed in.
+///
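+/// For example (purely illustrative, with made-up register and opcode names):
+/// given
+///
+///   bb.0:
+///     $r0 = SOME_DEF            ; defines a value V
+///     DBG_VALUE $r0, "x"        ; variable "x" is assigned value V
+///     $r1 = COPY $r0            ; V now also resides in $r1
+///     $r0 = SOME_OTHER_DEF      ; $r0 clobbered; V survives only in $r1
+///
+/// the machine value location analysis records that V resides in $r1 at the
+/// end of the block, the variable value analysis records that "x" should
+/// still have value V, and marrying the two lets us emit a DBG_VALUE of $r1
+/// for "x" after the clobber of $r0.
+///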
+/// TODO:
+/// Overlapping fragments
+/// Entry values
+/// Add back DEBUG statements for debugging this
+/// Collect statistics
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GenericIteratedDominanceFrontier.h"
+#include "llvm/Support/TypeSize.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <functional>
+#include <queue>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "InstrRefBasedImpl.h"
+#include "LiveDebugValues.h"
+#include <optional>
+
+using namespace llvm;
+using namespace LiveDebugValues;
+
+// SSAUpdaterImpl sets DEBUG_TYPE; change it back.
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "livedebugvalues"
+
+// Act more like the VarLoc implementation, by propagating some locations too
+// far and ignoring some transfers.
+static cl::opt<bool> EmulateOldLDV("emulate-old-livedebugvalues", cl::Hidden,
+ cl::desc("Act like old LiveDebugValues did"),
+ cl::init(false));
+
+// Limit for the maximum number of stack slots we should track, past which we
+// will ignore any spills. InstrRefBasedLDV gathers detailed information on all
+// stack slots which leads to high memory consumption, and in some scenarios
+// (such as asan with very many locals) the working set of the function can be
+// very large, causing many spills. In these scenarios, it is very unlikely that
+// the developer has hundreds of variables live at the same time that they're
+// carefully thinking about -- instead, they probably autogenerated the code.
+// When this happens, gracefully stop tracking excess spill slots, rather than
+// consuming all the developer's memory.
+static cl::opt<unsigned>
+ StackWorkingSetLimit("livedebugvalues-max-stack-slots", cl::Hidden,
+ cl::desc("livedebugvalues-stack-ws-limit"),
+ cl::init(250));
+
+DbgOpID DbgOpID::UndefID = DbgOpID(0xffffffff);
+
+/// Tracker for converting machine value locations and variable values into
+/// variable locations (the output of LiveDebugValues), recorded as DBG_VALUEs
+/// specifying block live-in locations and transfers within blocks.
+///
+/// Operating on a per-block basis, this class takes a (pre-loaded) MLocTracker
+/// and must be initialized with the set of variable values that are live-in to
+/// the block. The caller then repeatedly calls process(). TransferTracker picks
+/// out variable locations for the live-in variable values (if there _is_ a
+/// location) and creates the corresponding DBG_VALUEs. Then, as the block is
+/// stepped through, transfers of values between machine locations are
+/// identified and if profitable, a DBG_VALUE created.
+///
+/// This is where debug use-before-defs would be resolved: a variable with an
+/// unavailable value could materialize in the middle of a block, when the
+/// value becomes available. Or, we could detect clobbers and re-specify the
+/// variable in a backup location. (XXX these are unimplemented).
+class TransferTracker {
+public:
+ const TargetInstrInfo *TII;
+ const TargetLowering *TLI;
+ /// This machine location tracker is assumed to always contain the up-to-date
+ /// value mapping for all machine locations. TransferTracker only reads
+ /// information from it. (XXX make it const?)
+ MLocTracker *MTracker;
+ MachineFunction &MF;
+ bool ShouldEmitDebugEntryValues;
+
+ /// Record of all changes in variable locations at a block position. Awkwardly
+ /// we allow inserting either before or after the point: MBB != nullptr
+ /// indicates it's before, otherwise after.
+ struct Transfer {
+ MachineBasicBlock::instr_iterator Pos; /// Position to insert DBG_VALUEs
+ MachineBasicBlock *MBB; /// non-null if we should insert before Pos, not after.
+ SmallVector<MachineInstr *, 4> Insts; /// Vector of DBG_VALUEs to insert.
+ };
+
+ /// Stores the resolved operands (machine locations and constants) and
+ /// qualifying meta-information needed to construct a concrete DBG_VALUE-like
+ /// instruction.
+ struct ResolvedDbgValue {
+ SmallVector<ResolvedDbgOp> Ops;
+ DbgValueProperties Properties;
+
+ ResolvedDbgValue(SmallVectorImpl<ResolvedDbgOp> &Ops,
+ DbgValueProperties Properties)
+ : Ops(Ops.begin(), Ops.end()), Properties(Properties) {}
+
+ /// Returns all the LocIdx values used in this struct, in the order in which
+ /// they appear as operands in the debug value; may contain duplicates.
+ auto loc_indices() const {
+ return map_range(
+ make_filter_range(
+ Ops, [](const ResolvedDbgOp &Op) { return !Op.IsConst; }),
+ [](const ResolvedDbgOp &Op) { return Op.Loc; });
+ }
+ };
+
+ /// Collection of transfers (DBG_VALUEs) to be inserted.
+ SmallVector<Transfer, 32> Transfers;
+
+ /// Local cache of what-value-is-in-what-LocIdx. Used to identify differences
+ /// between TransferTracker's view of variable locations and MLocTracker's. For
+ /// example, MLocTracker observes all clobbers, but TransferTracker lazily
+ /// does not.
+ SmallVector<ValueIDNum, 32> VarLocs;
+
+ /// Map from LocIdxes to the DebugVariables based in that location.
+ /// Maintained while stepping through the block. Not accurate if
+ /// VarLocs[Idx] != MTracker->LocIdxToIDNum[Idx].
+ DenseMap<LocIdx, SmallSet<DebugVariable, 4>> ActiveMLocs;
+
+ /// Map from DebugVariable to its current location and qualifying meta
+ /// information. To be used in conjunction with ActiveMLocs to construct
+ /// enough information for the DBG_VALUEs for a particular LocIdx.
+ DenseMap<DebugVariable, ResolvedDbgValue> ActiveVLocs;
+
+ /// Temporary cache of DBG_VALUEs to be entered into the Transfers collection.
+ SmallVector<MachineInstr *, 4> PendingDbgValues;
+
+ /// Record of a use-before-def: created when a value that's live-in to the
+ /// current block isn't available in any machine location, but it will be
+ /// defined in this block.
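+ /// For example (illustrative only): if the live-in DbgValue for a variable
+ /// names a value defined by instruction 5 of this block, no DBG_VALUE is
+ /// emitted at the block entry; instead, checkInstForNewValues creates one
+ /// immediately after instruction 5, unless the variable has been
+ /// re-specified before that point.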
+ struct UseBeforeDef {
+ /// Value of this variable, def'd in block.
+ SmallVector<DbgOp> Values;
+ /// Identity of this variable.
+ DebugVariable Var;
+ /// Additional variable properties.
+ DbgValueProperties Properties;
+ UseBeforeDef(ArrayRef<DbgOp> Values, const DebugVariable &Var,
+ const DbgValueProperties &Properties)
+ : Values(Values.begin(), Values.end()), Var(Var),
+ Properties(Properties) {}
+ };
+
+ /// Map from instruction index (within the block) to the set of UseBeforeDefs
+ /// that become defined at that instruction.
+ DenseMap<unsigned, SmallVector<UseBeforeDef, 1>> UseBeforeDefs;
+
+ /// The set of variables that are in UseBeforeDefs and can become a location
+ /// once the relevant value is defined. An element being erased from this
+ /// collection prevents the use-before-def materializing.
+ DenseSet<DebugVariable> UseBeforeDefVariables;
+
+ const TargetRegisterInfo &TRI;
+ const BitVector &CalleeSavedRegs;
+
+ TransferTracker(const TargetInstrInfo *TII, MLocTracker *MTracker,
+ MachineFunction &MF, const TargetRegisterInfo &TRI,
+ const BitVector &CalleeSavedRegs, const TargetPassConfig &TPC)
+ : TII(TII), MTracker(MTracker), MF(MF), TRI(TRI),
+ CalleeSavedRegs(CalleeSavedRegs) {
+ TLI = MF.getSubtarget().getTargetLowering();
+ auto &TM = TPC.getTM<TargetMachine>();
+ ShouldEmitDebugEntryValues = TM.Options.ShouldEmitDebugEntryValues();
+ }
+
+ bool isCalleeSaved(LocIdx L) const {
+ unsigned Reg = MTracker->LocIdxToLocID[L];
+ if (Reg >= MTracker->NumRegs)
+ return false;
+ for (MCRegAliasIterator RAI(Reg, &TRI, true); RAI.isValid(); ++RAI)
+ if (CalleeSavedRegs.test(*RAI))
+ return true;
+ return false;
+ };
+
+ // An estimate of the expected lifespan of values at a machine location, with
+ // a greater value corresponding to a longer expected lifespan, i.e. spill
+ // slots generally live longer than callee-saved registers which generally
+ // live longer than non-callee-saved registers. The minimum value of 0
+ // corresponds to an illegal location that cannot have a "lifespan" at all.
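+ // For example (illustrative only): if a sought value is resident in both a
+ // volatile register and a spill slot on entry to a block, loadInlocs picks
+ // the spill slot; a callee-saved register would in turn be preferred over a
+ // volatile one.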
+ enum class LocationQuality : unsigned char {
+ Illegal = 0,
+ Register,
+ CalleeSavedRegister,
+ SpillSlot,
+ Best = SpillSlot
+ };
+
+ class LocationAndQuality {
+ unsigned Location : 24;
+ unsigned Quality : 8;
+
+ public:
+ LocationAndQuality() : Location(0), Quality(0) {}
+ LocationAndQuality(LocIdx L, LocationQuality Q)
+ : Location(L.asU64()), Quality(static_cast<unsigned>(Q)) {}
+ LocIdx getLoc() const {
+ if (!Quality)
+ return LocIdx::MakeIllegalLoc();
+ return LocIdx(Location);
+ }
+ LocationQuality getQuality() const { return LocationQuality(Quality); }
+ bool isIllegal() const { return !Quality; }
+ bool isBest() const { return getQuality() == LocationQuality::Best; }
+ };
+
+ // Returns the LocationQuality for the location L iff the quality of L is
+ // strictly greater than the provided minimum quality.
+ std::optional<LocationQuality>
+ getLocQualityIfBetter(LocIdx L, LocationQuality Min) const {
+ if (L.isIllegal())
+ return std::nullopt;
+ if (Min >= LocationQuality::SpillSlot)
+ return std::nullopt;
+ if (MTracker->isSpill(L))
+ return LocationQuality::SpillSlot;
+ if (Min >= LocationQuality::CalleeSavedRegister)
+ return std::nullopt;
+ if (isCalleeSaved(L))
+ return LocationQuality::CalleeSavedRegister;
+ if (Min >= LocationQuality::Register)
+ return std::nullopt;
+ return LocationQuality::Register;
+ }
+
+ /// For a variable \p Var with the live-in value \p Value, attempts to resolve
+ /// the DbgValue to a concrete DBG_VALUE, emitting that value and loading the
+ /// tracking information to track Var throughout the block.
+ /// \p ValueToLoc is a map containing the best known location for every
+ /// ValueIDNum that Value may use.
+ /// \p MBB is the basic block that we are loading the live-in value for.
+ /// \p DbgOpStore is the map containing the DbgOpID->DbgOp mapping needed to
+ /// determine the values used by Value.
+ void loadVarInloc(MachineBasicBlock &MBB, DbgOpIDMap &DbgOpStore,
+ const DenseMap<ValueIDNum, LocationAndQuality> &ValueToLoc,
+ DebugVariable Var, DbgValue Value) {
+ SmallVector<DbgOp> DbgOps;
+ SmallVector<ResolvedDbgOp> ResolvedDbgOps;
+ bool IsValueValid = true;
+ unsigned LastUseBeforeDef = 0;
+
+ // If every value used by the incoming DbgValue is available at block
+ // entry, ResolvedDbgOps will contain the machine locations/constants for
+ // those values and will be used to emit a debug location.
+ // If one or more values are not yet available, but will all be defined in
+ // this block, then LastUseBeforeDef will track the instruction index in
+ // this BB at which the last of those values is defined, and DbgOps will
+ // contain the values that we will emit when we reach that instruction.
+ // If one or more values are undef or not available throughout this block,
+ // and we can't recover as an entry value, we set IsValueValid=false and
+ // skip this variable.
+ for (DbgOpID ID : Value.getDbgOpIDs()) {
+ DbgOp Op = DbgOpStore.find(ID);
+ DbgOps.push_back(Op);
+ if (ID.isUndef()) {
+ IsValueValid = false;
+ break;
+ }
+ if (ID.isConst()) {
+ ResolvedDbgOps.push_back(Op.MO);
+ continue;
+ }
+
+ // If the value has no location, we can't make a variable location.
+ const ValueIDNum &Num = Op.ID;
+ auto ValuesPreferredLoc = ValueToLoc.find(Num);
+ if (ValuesPreferredLoc->second.isIllegal()) {
+ // If it's a def that occurs in this block, register it as a
+ // use-before-def to be resolved as we step through the block.
+ // Continue processing values so that we add any other UseBeforeDef
+ // entries needed for later.
+ if (Num.getBlock() == (unsigned)MBB.getNumber() && !Num.isPHI()) {
+ LastUseBeforeDef = std::max(LastUseBeforeDef,
+ static_cast<unsigned>(Num.getInst()));
+ continue;
+ }
+ recoverAsEntryValue(Var, Value.Properties, Num);
+ IsValueValid = false;
+ break;
+ }
+
+ // Defer modifying ActiveVLocs until after we've confirmed we have a
+ // live range.
+ LocIdx M = ValuesPreferredLoc->second.getLoc();
+ ResolvedDbgOps.push_back(M);
+ }
+
+ // If we cannot produce a valid value for the LiveIn value within this
+ // block, skip this variable.
+ if (!IsValueValid)
+ return;
+
+ // Add UseBeforeDef entry for the last value to be defined in this block.
+ if (LastUseBeforeDef) {
+ addUseBeforeDef(Var, Value.Properties, DbgOps,
+ LastUseBeforeDef);
+ return;
+ }
+
+ // The LiveIn value is available at block entry; begin tracking and record
+ // the transfer.
+ for (const ResolvedDbgOp &Op : ResolvedDbgOps)
+ if (!Op.IsConst)
+ ActiveMLocs[Op.Loc].insert(Var);
+ auto NewValue = ResolvedDbgValue{ResolvedDbgOps, Value.Properties};
+ auto Result = ActiveVLocs.insert(std::make_pair(Var, NewValue));
+ if (!Result.second)
+ Result.first->second = NewValue;
+ PendingDbgValues.push_back(
+ MTracker->emitLoc(ResolvedDbgOps, Var, Value.Properties));
+ }
+
+ /// Load object with live-in variable values. \p MLocs contains the live-in
+ /// values in each machine location, while \p VLocs contains the live-in variable
+ /// values. This method picks variable locations for the live-in variables,
+ /// creates DBG_VALUEs and puts them in #Transfers, then prepares the other
+ /// object fields to track variable locations as we step through the block.
+ /// FIXME: could just examine mloctracker instead of passing in \p mlocs?
+ void
+ loadInlocs(MachineBasicBlock &MBB, ValueTable &MLocs, DbgOpIDMap &DbgOpStore,
+ const SmallVectorImpl<std::pair<DebugVariable, DbgValue>> &VLocs,
+ unsigned NumLocs) {
+ ActiveMLocs.clear();
+ ActiveVLocs.clear();
+ VarLocs.clear();
+ VarLocs.reserve(NumLocs);
+ UseBeforeDefs.clear();
+ UseBeforeDefVariables.clear();
+
+ // Map of the preferred location for each value.
+ DenseMap<ValueIDNum, LocationAndQuality> ValueToLoc;
+
+ // Initialize the preferred-location map with illegal locations, to be
+ // filled in later.
+ for (const auto &VLoc : VLocs)
+ if (VLoc.second.Kind == DbgValue::Def)
+ for (DbgOpID OpID : VLoc.second.getDbgOpIDs())
+ if (!OpID.ID.IsConst)
+ ValueToLoc.insert({DbgOpStore.find(OpID).ID, LocationAndQuality()});
+
+ ActiveMLocs.reserve(VLocs.size());
+ ActiveVLocs.reserve(VLocs.size());
+
+ // Produce a map of value numbers to the current machine locs they live
+ // in. When emulating VarLocBasedImpl, there should only be one
+ // location; when not, we get to pick.
+ for (auto Location : MTracker->locations()) {
+ LocIdx Idx = Location.Idx;
+ ValueIDNum &VNum = MLocs[Idx.asU64()];
+ if (VNum == ValueIDNum::EmptyValue)
+ continue;
+ VarLocs.push_back(VNum);
+
+ // Is there a variable that wants a location for this value? If not, skip.
+ auto VIt = ValueToLoc.find(VNum);
+ if (VIt == ValueToLoc.end())
+ continue;
+
+ auto &Previous = VIt->second;
+ // If this is the first location with that value, pick it. Otherwise,
+ // consider whether it's a "longer term" location.
+ std::optional<LocationQuality> ReplacementQuality =
+ getLocQualityIfBetter(Idx, Previous.getQuality());
+ if (ReplacementQuality)
+ Previous = LocationAndQuality(Idx, *ReplacementQuality);
+ }
+
+ // Now map variables to their picked LocIdxes.
+ for (const auto &Var : VLocs) {
+ loadVarInloc(MBB, DbgOpStore, ValueToLoc, Var.first, Var.second);
+ }
+ flushDbgValues(MBB.begin(), &MBB);
+ }
+
+ /// Record that \p Var has value \p ID, a value that becomes available
+ /// later in the function.
+ void addUseBeforeDef(const DebugVariable &Var,
+ const DbgValueProperties &Properties,
+ const SmallVectorImpl<DbgOp> &DbgOps, unsigned Inst) {
+ UseBeforeDefs[Inst].emplace_back(DbgOps, Var, Properties);
+ UseBeforeDefVariables.insert(Var);
+ }
+
+ /// After the instruction at index \p Inst and position \p pos has been
+ /// processed, check whether it defines a variable value in a use-before-def.
+ /// If so, and the variable value hasn't changed since the start of the
+ /// block, create a DBG_VALUE.
+ void checkInstForNewValues(unsigned Inst, MachineBasicBlock::iterator pos) {
+ auto MIt = UseBeforeDefs.find(Inst);
+ if (MIt == UseBeforeDefs.end())
+ return;
+
+ // Map of values to the locations that store them for every value used by
+ // the variables that may have become available.
+ SmallDenseMap<ValueIDNum, LocationAndQuality> ValueToLoc;
+
+ // Populate ValueToLoc with illegal default mappings for every value used by
+ // any UseBeforeDef variables for this instruction.
+ for (auto &Use : MIt->second) {
+ if (!UseBeforeDefVariables.count(Use.Var))
+ continue;
+
+ for (DbgOp &Op : Use.Values) {
+ assert(!Op.isUndef() && "UseBeforeDef erroneously created for a "
+ "DbgValue with undef values.");
+ if (Op.IsConst)
+ continue;
+
+ ValueToLoc.insert({Op.ID, LocationAndQuality()});
+ }
+ }
+
+ // Exit early if we have no DbgValues to produce.
+ if (ValueToLoc.empty())
+ return;
+
+ // Determine the best location for each desired value.
+ for (auto Location : MTracker->locations()) {
+ LocIdx Idx = Location.Idx;
+ ValueIDNum &LocValueID = Location.Value;
+
+ // Is there a variable that wants a location for this value? If not, skip.
+ auto VIt = ValueToLoc.find(LocValueID);
+ if (VIt == ValueToLoc.end())
+ continue;
+
+ auto &Previous = VIt->second;
+ // If this is the first location with that value, pick it. Otherwise,
+ // consider whether it's a "longer term" location.
+ std::optional<LocationQuality> ReplacementQuality =
+ getLocQualityIfBetter(Idx, Previous.getQuality());
+ if (ReplacementQuality)
+ Previous = LocationAndQuality(Idx, *ReplacementQuality);
+ }
+
+ // Using the map of values to locations, produce a final set of values for
+ // this variable.
+ for (auto &Use : MIt->second) {
+ if (!UseBeforeDefVariables.count(Use.Var))
+ continue;
+
+ SmallVector<ResolvedDbgOp> DbgOps;
+
+ for (DbgOp &Op : Use.Values) {
+ if (Op.IsConst) {
+ DbgOps.push_back(Op.MO);
+ continue;
+ }
+ LocIdx NewLoc = ValueToLoc.find(Op.ID)->second.getLoc();
+ if (NewLoc.isIllegal())
+ break;
+ DbgOps.push_back(NewLoc);
+ }
+
+ // If at least one value used by this debug value is no longer available,
+ // i.e. one of the values was killed before we finished defining all of
+ // the values used by this variable, discard.
+ if (DbgOps.size() != Use.Values.size())
+ continue;
+
+ // Otherwise, we're good to go.
+ PendingDbgValues.push_back(
+ MTracker->emitLoc(DbgOps, Use.Var, Use.Properties));
+ }
+ flushDbgValues(pos, nullptr);
+ }
+
+ /// Helper to move created DBG_VALUEs into Transfers collection.
+ void flushDbgValues(MachineBasicBlock::iterator Pos, MachineBasicBlock *MBB) {
+ if (PendingDbgValues.size() == 0)
+ return;
+
+ // Pick out the instruction start position.
+ MachineBasicBlock::instr_iterator BundleStart;
+ if (MBB && Pos == MBB->begin())
+ BundleStart = MBB->instr_begin();
+ else
+ BundleStart = getBundleStart(Pos->getIterator());
+
+ Transfers.push_back({BundleStart, MBB, PendingDbgValues});
+ PendingDbgValues.clear();
+ }
+
+ bool isEntryValueVariable(const DebugVariable &Var,
+ const DIExpression *Expr) const {
+ if (!Var.getVariable()->isParameter())
+ return false;
+
+ if (Var.getInlinedAt())
+ return false;
+
+ if (Expr->getNumElements() > 0 && !Expr->isDeref())
+ return false;
+
+ return true;
+ }
+
+ bool isEntryValueValue(const ValueIDNum &Val) const {
+ // Must be in entry block (block number zero), and be a PHI / live-in value.
+ if (Val.getBlock() || !Val.isPHI())
+ return false;
+
+ // Entry values must enter in a register.
+ if (MTracker->isSpill(Val.getLoc()))
+ return false;
+
+ Register SP = TLI->getStackPointerRegisterToSaveRestore();
+ Register FP = TRI.getFrameRegister(MF);
+ Register Reg = MTracker->LocIdxToLocID[Val.getLoc()];
+ return Reg != SP && Reg != FP;
+ }
+
+ bool recoverAsEntryValue(const DebugVariable &Var,
+ const DbgValueProperties &Prop,
+ const ValueIDNum &Num) {
+ // Is this variable location a candidate to be an entry value? First,
+ // should we be trying this at all?
+ if (!ShouldEmitDebugEntryValues)
+ return false;
+
+ const DIExpression *DIExpr = Prop.DIExpr;
+
+ // We don't currently emit entry values for DBG_VALUE_LISTs.
+ if (Prop.IsVariadic) {
+ // If this debug value can be converted to be non-variadic, then do so;
+ // otherwise give up.
+ auto NonVariadicExpression =
+ DIExpression::convertToNonVariadicExpression(DIExpr);
+ if (!NonVariadicExpression)
+ return false;
+ DIExpr = *NonVariadicExpression;
+ }
+
+ // Is the variable appropriate for entry values (i.e., is it a parameter)?
+ if (!isEntryValueVariable(Var, DIExpr))
+ return false;
+
+ // Is the value assigned to this variable still the entry value?
+ if (!isEntryValueValue(Num))
+ return false;
+
+ // Emit a variable location using an entry value expression.
+ DIExpression *NewExpr =
+ DIExpression::prepend(DIExpr, DIExpression::EntryValue);
+ Register Reg = MTracker->LocIdxToLocID[Num.getLoc()];
+ MachineOperand MO = MachineOperand::CreateReg(Reg, false);
+
+ PendingDbgValues.push_back(
+ emitMOLoc(MO, Var, {NewExpr, Prop.Indirect, false}));
+ return true;
+ }
+
+ /// Change a variable value after encountering a DBG_VALUE inside a block.
+ void redefVar(const MachineInstr &MI) {
+ DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ DbgValueProperties Properties(MI);
+
+ // Ignore non-register locations; we don't transfer those.
+ if (MI.isUndefDebugValue() ||
+ all_of(MI.debug_operands(),
+ [](const MachineOperand &MO) { return !MO.isReg(); })) {
+ auto It = ActiveVLocs.find(Var);
+ if (It != ActiveVLocs.end()) {
+ for (LocIdx Loc : It->second.loc_indices())
+ ActiveMLocs[Loc].erase(Var);
+ ActiveVLocs.erase(It);
+ }
+ // Any use-before-defs no longer apply.
+ UseBeforeDefVariables.erase(Var);
+ return;
+ }
+
+ SmallVector<ResolvedDbgOp> NewLocs;
+ for (const MachineOperand &MO : MI.debug_operands()) {
+ if (MO.isReg()) {
+ // Any undef regs have already been filtered out above.
+ Register Reg = MO.getReg();
+ LocIdx NewLoc = MTracker->getRegMLoc(Reg);
+ NewLocs.push_back(NewLoc);
+ } else {
+ NewLocs.push_back(MO);
+ }
+ }
+
+ redefVar(MI, Properties, NewLocs);
+ }
+
+ /// Handle a change in variable location within a block. Terminate the
+ /// variable's current location, and record the value it now refers to, so
+ /// that we can detect location transfers later on.
+ void redefVar(const MachineInstr &MI, const DbgValueProperties &Properties,
+ SmallVectorImpl<ResolvedDbgOp> &NewLocs) {
+ DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ // Any use-before-defs no longer apply.
+ UseBeforeDefVariables.erase(Var);
+
+ // Erase any previous location.
+ auto It = ActiveVLocs.find(Var);
+ if (It != ActiveVLocs.end()) {
+ for (LocIdx Loc : It->second.loc_indices())
+ ActiveMLocs[Loc].erase(Var);
+ }
+
+ // If there _is_ no new location, all we had to do was erase.
+ if (NewLocs.empty()) {
+ if (It != ActiveVLocs.end())
+ ActiveVLocs.erase(It);
+ return;
+ }
+
+ SmallVector<std::pair<LocIdx, DebugVariable>> LostMLocs;
+ for (ResolvedDbgOp &Op : NewLocs) {
+ if (Op.IsConst)
+ continue;
+
+ LocIdx NewLoc = Op.Loc;
+
+ // Check whether our local copy of values-by-location in #VarLocs is out
+ // of date. Wipe old tracking data for the location if it's been clobbered
+ // in the meantime.
+ if (MTracker->readMLoc(NewLoc) != VarLocs[NewLoc.asU64()]) {
+ for (const auto &P : ActiveMLocs[NewLoc]) {
+ auto LostVLocIt = ActiveVLocs.find(P);
+ if (LostVLocIt != ActiveVLocs.end()) {
+ for (LocIdx Loc : LostVLocIt->second.loc_indices()) {
+ // Every active variable mapping for NewLoc will be cleared, no
+ // need to track individual variables.
+ if (Loc == NewLoc)
+ continue;
+ LostMLocs.emplace_back(Loc, P);
+ }
+ }
+ ActiveVLocs.erase(P);
+ }
+ for (const auto &LostMLoc : LostMLocs)
+ ActiveMLocs[LostMLoc.first].erase(LostMLoc.second);
+ LostMLocs.clear();
+ It = ActiveVLocs.find(Var);
+ ActiveMLocs[NewLoc.asU64()].clear();
+ VarLocs[NewLoc.asU64()] = MTracker->readMLoc(NewLoc);
+ }
+
+ ActiveMLocs[NewLoc].insert(Var);
+ }
+
+ if (It == ActiveVLocs.end()) {
+ ActiveVLocs.insert(
+ std::make_pair(Var, ResolvedDbgValue(NewLocs, Properties)));
+ } else {
+ It->second.Ops.assign(NewLocs);
+ It->second.Properties = Properties;
+ }
+ }
+
+ /// Account for a location \p MLoc being clobbered. Examine the variable
+ /// locations that will be terminated, and try to recover them by using
+ /// another location. Optionally, given \p MakeUndef, emit a DBG_VALUE to
+ /// explicitly terminate a location if it can't be recovered.
+ void clobberMloc(LocIdx MLoc, MachineBasicBlock::iterator Pos,
+ bool MakeUndef = true) {
+ auto ActiveMLocIt = ActiveMLocs.find(MLoc);
+ if (ActiveMLocIt == ActiveMLocs.end())
+ return;
+
+ // What value did this location previously contain?
+ ValueIDNum OldValue = VarLocs[MLoc.asU64()];
+ clobberMloc(MLoc, OldValue, Pos, MakeUndef);
+ }
+ /// Overload that takes an explicit value \p OldValue for when the value in
+ /// \p MLoc has changed and the TransferTracker's locations have not been
+ /// updated yet.
+ void clobberMloc(LocIdx MLoc, ValueIDNum OldValue,
+ MachineBasicBlock::iterator Pos, bool MakeUndef = true) {
+ auto ActiveMLocIt = ActiveMLocs.find(MLoc);
+ if (ActiveMLocIt == ActiveMLocs.end())
+ return;
+
+ VarLocs[MLoc.asU64()] = ValueIDNum::EmptyValue;
+
+ // Examine the remaining variable locations: if we can find the same value
+ // again, we can recover the location.
+ std::optional<LocIdx> NewLoc;
+ for (auto Loc : MTracker->locations())
+ if (Loc.Value == OldValue)
+ NewLoc = Loc.Idx;
+
+ // If there is no location, and we weren't asked to make the variable
+ // explicitly undef, then stop here.
+ if (!NewLoc && !MakeUndef) {
+ // Try and recover a few more locations with entry values.
+ for (const auto &Var : ActiveMLocIt->second) {
+ auto &Prop = ActiveVLocs.find(Var)->second.Properties;
+ recoverAsEntryValue(Var, Prop, OldValue);
+ }
+ flushDbgValues(Pos, nullptr);
+ return;
+ }
+
+ // Examine all the variables based on this location.
+ DenseSet<DebugVariable> NewMLocs;
+ // If no new location has been found, every variable that depends on this
+ // MLoc is dead, so end their existing MLoc->Var mappings as well.
+ SmallVector<std::pair<LocIdx, DebugVariable>> LostMLocs;
+ for (const auto &Var : ActiveMLocIt->second) {
+ auto ActiveVLocIt = ActiveVLocs.find(Var);
+ // Re-state the variable location: if there's no replacement then NewLoc
+ // is std::nullopt and a $noreg DBG_VALUE will be created. Otherwise, a
+ // DBG_VALUE identifying the alternative location will be emitted.
+ const DbgValueProperties &Properties = ActiveVLocIt->second.Properties;
+
+ // Produce the new list of debug ops - an empty list if no new location
+ // was found, or the existing list with the substitution MLoc -> NewLoc
+ // otherwise.
+ SmallVector<ResolvedDbgOp> DbgOps;
+ if (NewLoc) {
+ ResolvedDbgOp OldOp(MLoc);
+ ResolvedDbgOp NewOp(*NewLoc);
+ // Insert illegal ops to overwrite afterwards.
+ DbgOps.insert(DbgOps.begin(), ActiveVLocIt->second.Ops.size(),
+ ResolvedDbgOp(LocIdx::MakeIllegalLoc()));
+ replace_copy(ActiveVLocIt->second.Ops, DbgOps.begin(), OldOp, NewOp);
+ }
+
+ PendingDbgValues.push_back(MTracker->emitLoc(DbgOps, Var, Properties));
+
+ // Update machine locations <=> variable locations maps. Defer updating
+ // ActiveMLocs to avoid invalidating the ActiveMLocIt iterator.
+ if (!NewLoc) {
+ for (LocIdx Loc : ActiveVLocIt->second.loc_indices()) {
+ if (Loc != MLoc)
+ LostMLocs.emplace_back(Loc, Var);
+ }
+ ActiveVLocs.erase(ActiveVLocIt);
+ } else {
+ ActiveVLocIt->second.Ops = DbgOps;
+ NewMLocs.insert(Var);
+ }
+ }
+
+ // Remove variables from ActiveMLocs if they no longer use any other MLocs
+ // due to being killed by this clobber.
+ for (auto &LocVarIt : LostMLocs) {
+ auto LostMLocIt = ActiveMLocs.find(LocVarIt.first);
+ assert(LostMLocIt != ActiveMLocs.end() &&
+ "Variable was using this MLoc, but ActiveMLocs[MLoc] has no "
+ "entries?");
+ LostMLocIt->second.erase(LocVarIt.second);
+ }
+
+ // We lazily track what locations have which values; if we've found a new
+ // location for the clobbered value, remember it.
+ if (NewLoc)
+ VarLocs[NewLoc->asU64()] = OldValue;
+
+ flushDbgValues(Pos, nullptr);
+
+ // Commit ActiveMLoc changes.
+ ActiveMLocIt->second.clear();
+ if (!NewMLocs.empty())
+ for (auto &Var : NewMLocs)
+ ActiveMLocs[*NewLoc].insert(Var);
+ }
+
+ /// Transfer variables based on \p Src to be based on \p Dst. This handles
+ /// both register copies as well as spills and restores. Creates DBG_VALUEs
+ /// describing the movement.
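+ /// For example (illustrative only): when a register holding the values of
+ /// some variables is spilt, those variables become based on the stack slot
+ /// instead, and a DBG_VALUE referring to the slot is created for each of
+ /// them at the transfer point.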
+ void transferMlocs(LocIdx Src, LocIdx Dst, MachineBasicBlock::iterator Pos) {
+ // Does Src still contain the value num we expect? If not, it's been
+ // clobbered in the meantime, and our variable locations are stale.
+ if (VarLocs[Src.asU64()] != MTracker->readMLoc(Src))
+ return;
+
+ // assert(ActiveMLocs[Dst].size() == 0);
+ //^^^ Legitimate scenario on account of un-clobbered slot being assigned to?
+
+ // Move set of active variables from one location to another.
+ auto MovingVars = ActiveMLocs[Src];
+ ActiveMLocs[Dst].insert(MovingVars.begin(), MovingVars.end());
+ VarLocs[Dst.asU64()] = VarLocs[Src.asU64()];
+
+ // For each variable based on Src; create a location at Dst.
+ ResolvedDbgOp SrcOp(Src);
+ ResolvedDbgOp DstOp(Dst);
+ for (const auto &Var : MovingVars) {
+ auto ActiveVLocIt = ActiveVLocs.find(Var);
+ assert(ActiveVLocIt != ActiveVLocs.end());
+
+ // Update all instances of Src in the variable's tracked values to Dst.
+ std::replace(ActiveVLocIt->second.Ops.begin(),
+ ActiveVLocIt->second.Ops.end(), SrcOp, DstOp);
+
+ MachineInstr *MI = MTracker->emitLoc(ActiveVLocIt->second.Ops, Var,
+ ActiveVLocIt->second.Properties);
+ PendingDbgValues.push_back(MI);
+ }
+ ActiveMLocs[Src].clear();
+ flushDbgValues(Pos, nullptr);
+
+ // XXX XXX XXX "pretend to be old LDV" means dropping all tracking data
+ // about the old location.
+ if (EmulateOldLDV)
+ VarLocs[Src.asU64()] = ValueIDNum::EmptyValue;
+ }
+
+ MachineInstrBuilder emitMOLoc(const MachineOperand &MO,
+ const DebugVariable &Var,
+ const DbgValueProperties &Properties) {
+ DebugLoc DL = DILocation::get(Var.getVariable()->getContext(), 0, 0,
+ Var.getVariable()->getScope(),
+ const_cast<DILocation *>(Var.getInlinedAt()));
+ auto MIB = BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE));
+ MIB.add(MO);
+ if (Properties.Indirect)
+ MIB.addImm(0);
+ else
+ MIB.addReg(0);
+ MIB.addMetadata(Var.getVariable());
+ MIB.addMetadata(Properties.DIExpr);
+ return MIB;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+ValueIDNum ValueIDNum::EmptyValue = {UINT_MAX, UINT_MAX, UINT_MAX};
+ValueIDNum ValueIDNum::TombstoneValue = {UINT_MAX, UINT_MAX, UINT_MAX - 1};
+
+#ifndef NDEBUG
+void ResolvedDbgOp::dump(const MLocTracker *MTrack) const {
+ if (IsConst) {
+ dbgs() << MO;
+ } else {
+ dbgs() << MTrack->LocIdxToName(Loc);
+ }
+}
+void DbgOp::dump(const MLocTracker *MTrack) const {
+ if (IsConst) {
+ dbgs() << MO;
+ } else if (!isUndef()) {
+ dbgs() << MTrack->IDAsString(ID);
+ }
+}
+void DbgOpID::dump(const MLocTracker *MTrack, const DbgOpIDMap *OpStore) const {
+ if (!OpStore) {
+ dbgs() << "ID(" << asU32() << ")";
+ } else {
+ OpStore->find(*this).dump(MTrack);
+ }
+}
+void DbgValue::dump(const MLocTracker *MTrack,
+ const DbgOpIDMap *OpStore) const {
+ if (Kind == NoVal) {
+ dbgs() << "NoVal(" << BlockNo << ")";
+ } else if (Kind == VPHI || Kind == Def) {
+ if (Kind == VPHI)
+ dbgs() << "VPHI(" << BlockNo << ",";
+ else
+ dbgs() << "Def(";
+ for (unsigned Idx = 0; Idx < getDbgOpIDs().size(); ++Idx) {
+ // Print a separator before every operand after the first.
+ if (Idx != 0)
+ dbgs() << ",";
+ getDbgOpID(Idx).dump(MTrack, OpStore);
+ }
+ dbgs() << ")";
+ }
+ if (Properties.Indirect)
+ dbgs() << " indir";
+ if (Properties.DIExpr)
+ dbgs() << " " << *Properties.DIExpr;
+}
+#endif
+
+MLocTracker::MLocTracker(MachineFunction &MF, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const TargetLowering &TLI)
+ : MF(MF), TII(TII), TRI(TRI), TLI(TLI),
+ LocIdxToIDNum(ValueIDNum::EmptyValue), LocIdxToLocID(0) {
+ NumRegs = TRI.getNumRegs();
+ reset();
+ LocIDToLocIdx.resize(NumRegs, LocIdx::MakeIllegalLoc());
+ assert(NumRegs < (1u << NUM_LOC_BITS)); // Detect bit packing failure
+
+ // Always track SP. This prevents the implicit clobbering caused by regmasks
+ // from affecting its values. (LiveDebugValues disbelieves calls and
+ // regmasks that claim to clobber SP).
+ Register SP = TLI.getStackPointerRegisterToSaveRestore();
+ if (SP) {
+ unsigned ID = getLocID(SP);
+ (void)lookupOrTrackRegister(ID);
+
+ for (MCRegAliasIterator RAI(SP, &TRI, true); RAI.isValid(); ++RAI)
+ SPAliases.insert(*RAI);
+ }
+
+ // Build some common stack positions -- full registers being spilt to the
+ // stack.
+ StackSlotIdxes.insert({{8, 0}, 0});
+ StackSlotIdxes.insert({{16, 0}, 1});
+ StackSlotIdxes.insert({{32, 0}, 2});
+ StackSlotIdxes.insert({{64, 0}, 3});
+ StackSlotIdxes.insert({{128, 0}, 4});
+ StackSlotIdxes.insert({{256, 0}, 5});
+ StackSlotIdxes.insert({{512, 0}, 6});
+
+ // Traverse all the subregister idxes, and ensure there's an index for them.
+ // Duplicates are no problem: we're interested in their position in the
+ // stack slot; we don't want to type the slot.
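+ // For example (illustrative only): on a target with a 16-bit subregister at
+ // offset 16 within its super-register, the pair {16, 16} receives its own
+ // index here, so a spill or restore of just that portion of a register can
+ // later be tracked as a distinct location within the slot.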
+ for (unsigned int I = 1; I < TRI.getNumSubRegIndices(); ++I) {
+ unsigned Size = TRI.getSubRegIdxSize(I);
+ unsigned Offs = TRI.getSubRegIdxOffset(I);
+ unsigned Idx = StackSlotIdxes.size();
+
+ // Some subregs have -1, -2 and so forth fed into their fields, to mean
+ // special backend things. Ignore those.
+ if (Size > 60000 || Offs > 60000)
+ continue;
+
+ StackSlotIdxes.insert({{Size, Offs}, Idx});
+ }
+
+ // There may also be strange register class sizes (think x86 fp80s).
+ for (const TargetRegisterClass *RC : TRI.regclasses()) {
+ unsigned Size = TRI.getRegSizeInBits(*RC);
+
+ // We might see special reserved values as sizes, and classes for other
+ // stuff the machine tries to model. If it's more than 512 bits, then it
+ // is very unlikely to be a register that can be spilt.
+ if (Size > 512)
+ continue;
+
+ unsigned Idx = StackSlotIdxes.size();
+ StackSlotIdxes.insert({{Size, 0}, Idx});
+ }
+
+ for (auto &Idx : StackSlotIdxes)
+ StackIdxesToPos[Idx.second] = Idx.first;
+
+ NumSlotIdxes = StackSlotIdxes.size();
+}
+
+LocIdx MLocTracker::trackRegister(unsigned ID) {
+ assert(ID != 0);
+ LocIdx NewIdx = LocIdx(LocIdxToIDNum.size());
+ LocIdxToIDNum.grow(NewIdx);
+ LocIdxToLocID.grow(NewIdx);
+
+ // Default: it's an mphi.
+ ValueIDNum ValNum = {CurBB, 0, NewIdx};
+ // Was this reg ever touched by a regmask?
+ for (const auto &MaskPair : reverse(Masks)) {
+ if (MaskPair.first->clobbersPhysReg(ID)) {
+ // There was an earlier def we skipped.
+ ValNum = {CurBB, MaskPair.second, NewIdx};
+ break;
+ }
+ }
+
+ LocIdxToIDNum[NewIdx] = ValNum;
+ LocIdxToLocID[NewIdx] = ID;
+ return NewIdx;
+}
+
+void MLocTracker::writeRegMask(const MachineOperand *MO, unsigned CurBB,
+ unsigned InstID) {
+ // Def any register we track that isn't preserved. The regmask
+ // terminates the liveness of a register, meaning its value can't be
+ // relied upon -- we represent this by giving it a new value.
+ for (auto Location : locations()) {
+ unsigned ID = LocIdxToLocID[Location.Idx];
+ // Don't clobber SP, even if the mask says it's clobbered.
+ if (ID < NumRegs && !SPAliases.count(ID) && MO->clobbersPhysReg(ID))
+ defReg(ID, CurBB, InstID);
+ }
+ Masks.push_back(std::make_pair(MO, InstID));
+}
+
+std::optional<SpillLocationNo> MLocTracker::getOrTrackSpillLoc(SpillLoc L) {
+ SpillLocationNo SpillID(SpillLocs.idFor(L));
+
+ if (SpillID.id() == 0) {
+ // If there is no location, and we have reached the limit of how many stack
+ // slots to track, then don't track this one.
+ if (SpillLocs.size() >= StackWorkingSetLimit)
+ return std::nullopt;
+
+ // Spill location is untracked: create a record for this one, and all
+ // subregister slots too.
+ SpillID = SpillLocationNo(SpillLocs.insert(L));
+ for (unsigned StackIdx = 0; StackIdx < NumSlotIdxes; ++StackIdx) {
+ unsigned L = getSpillIDWithIdx(SpillID, StackIdx);
+ LocIdx Idx = LocIdx(LocIdxToIDNum.size()); // New idx
+ LocIdxToIDNum.grow(Idx);
+ LocIdxToLocID.grow(Idx);
+ LocIDToLocIdx.push_back(Idx);
+ LocIdxToLocID[Idx] = L;
+ // Initialize to PHI value; corresponds to the location's live-in value
+ // during transfer function construction.
+ LocIdxToIDNum[Idx] = ValueIDNum(CurBB, 0, Idx);
+ }
+ }
+ return SpillID;
+}
+
+std::string MLocTracker::LocIdxToName(LocIdx Idx) const {
+ unsigned ID = LocIdxToLocID[Idx];
+ if (ID >= NumRegs) {
+ StackSlotPos Pos = locIDToSpillIdx(ID);
+ ID -= NumRegs;
+ unsigned Slot = ID / NumSlotIdxes;
+ return Twine("slot ")
+ .concat(Twine(Slot).concat(Twine(" sz ").concat(Twine(Pos.first)
+ .concat(Twine(" offs ").concat(Twine(Pos.second))))))
+ .str();
+ } else {
+ return TRI.getRegAsmName(ID).str();
+ }
+}
+
+std::string MLocTracker::IDAsString(const ValueIDNum &Num) const {
+ std::string DefName = LocIdxToName(Num.getLoc());
+ return Num.asString(DefName);
+}
+
+#ifndef NDEBUG
+LLVM_DUMP_METHOD void MLocTracker::dump() {
+ for (auto Location : locations()) {
+ std::string MLocName = LocIdxToName(Location.Value.getLoc());
+ std::string DefName = Location.Value.asString(MLocName);
+ dbgs() << LocIdxToName(Location.Idx) << " --> " << DefName << "\n";
+ }
+}
+
+LLVM_DUMP_METHOD void MLocTracker::dump_mloc_map() {
+ for (auto Location : locations()) {
+ std::string foo = LocIdxToName(Location.Idx);
+ dbgs() << "Idx " << Location.Idx.asU64() << " " << foo << "\n";
+ }
+}
+#endif
+
+MachineInstrBuilder
+MLocTracker::emitLoc(const SmallVectorImpl<ResolvedDbgOp> &DbgOps,
+ const DebugVariable &Var,
+ const DbgValueProperties &Properties) {
+ DebugLoc DL = DILocation::get(Var.getVariable()->getContext(), 0, 0,
+ Var.getVariable()->getScope(),
+ const_cast<DILocation *>(Var.getInlinedAt()));
+
+ const MCInstrDesc &Desc = Properties.IsVariadic
+ ? TII.get(TargetOpcode::DBG_VALUE_LIST)
+ : TII.get(TargetOpcode::DBG_VALUE);
+
+#ifdef EXPENSIVE_CHECKS
+ assert(all_of(DbgOps,
+ [](const ResolvedDbgOp &Op) {
+ return Op.IsConst || !Op.Loc.isIllegal();
+ }) &&
+ "Did not expect illegal ops in DbgOps.");
+ assert((DbgOps.size() == 0 ||
+ DbgOps.size() == Properties.getLocationOpCount()) &&
+ "Expected to have either one DbgOp per MI LocationOp, or none.");
+#endif
+
+ auto GetRegOp = [](unsigned Reg) -> MachineOperand {
+ return MachineOperand::CreateReg(
+ /* Reg */ Reg, /* isDef */ false, /* isImp */ false,
+ /* isKill */ false, /* isDead */ false,
+ /* isUndef */ false, /* isEarlyClobber */ false,
+ /* SubReg */ 0, /* isDebug */ true);
+ };
+
+ SmallVector<MachineOperand> MOs;
+
+ auto EmitUndef = [&]() {
+ MOs.clear();
+ MOs.assign(Properties.getLocationOpCount(), GetRegOp(0));
+ return BuildMI(MF, DL, Desc, false, MOs, Var.getVariable(),
+ Properties.DIExpr);
+ };
+
+ // Don't bother passing any real operands to BuildMI if any of them would be
+ // $noreg.
+ if (DbgOps.empty())
+ return EmitUndef();
+
+ bool Indirect = Properties.Indirect;
+
+ const DIExpression *Expr = Properties.DIExpr;
+
+ assert(DbgOps.size() == Properties.getLocationOpCount());
+
+ // If all locations are valid, accumulate them into our list of
+ // MachineOperands. For any spilled locations, either update the indirectness
+ // register or apply the appropriate transformations in the DIExpression.
+ for (size_t Idx = 0; Idx < Properties.getLocationOpCount(); ++Idx) {
+ const ResolvedDbgOp &Op = DbgOps[Idx];
+
+ if (Op.IsConst) {
+ MOs.push_back(Op.MO);
+ continue;
+ }
+
+ LocIdx MLoc = Op.Loc;
+ unsigned LocID = LocIdxToLocID[MLoc];
+ if (LocID >= NumRegs) {
+ SpillLocationNo SpillID = locIDToSpill(LocID);
+ StackSlotPos StackIdx = locIDToSpillIdx(LocID);
+ unsigned short Offset = StackIdx.second;
+
+ // TODO: support variables that are located in spill slots, with non-zero
+ // offsets from the start of the spill slot. It would require some more
+ // complex DIExpression calculations. This doesn't seem to be produced by
+ // LLVM right now, so don't try and support it.
+ // Accept no-subregister slots and subregisters where the offset is zero.
+ // The consumer should already have type information to work out how large
+ // the variable is.
+ if (Offset == 0) {
+ const SpillLoc &Spill = SpillLocs[SpillID.id()];
+ unsigned Base = Spill.SpillBase;
+
+ // There are several ways we can dereference things, and several inputs
+ // to consider:
+ // * NRVO variables will appear with IsIndirect set, but should have
+ // nothing else in their DIExpressions,
+ // * Variables with DW_OP_stack_value in their expr already need an
+ // explicit dereference of the stack location,
+ // * Values that don't match the variable size need DW_OP_deref_size,
+ // * Everything else can just become a simple location expression.
+
+ // We need to use deref_size whenever there's a mismatch between the
+ // size of value and the size of variable portion being read.
+ // Additionally, we should use it whenever dealing with stack_value
+ // fragments, to avoid the consumer having to determine the deref size
+ // from DW_OP_piece.
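+ // For example (illustrative only): a 32-bit value spilt to the stack but
+ // describing a 64-bit variable is emitted with DW_OP_deref_size 4 and
+ // DW_OP_stack_value appended, rather than as a simple indirect location.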
+ bool UseDerefSize = false;
+ unsigned ValueSizeInBits = getLocSizeInBits(MLoc);
+ unsigned DerefSizeInBytes = ValueSizeInBits / 8;
+ if (auto Fragment = Var.getFragment()) {
+ unsigned VariableSizeInBits = Fragment->SizeInBits;
+ if (VariableSizeInBits != ValueSizeInBits || Expr->isComplex())
+ UseDerefSize = true;
+ } else if (auto Size = Var.getVariable()->getSizeInBits()) {
+ if (*Size != ValueSizeInBits) {
+ UseDerefSize = true;
+ }
+ }
+
+ SmallVector<uint64_t, 5> OffsetOps;
+ TRI.getOffsetOpcodes(Spill.SpillOffset, OffsetOps);
+ bool StackValue = false;
+
+ if (Properties.Indirect) {
+ // This is something like an NRVO variable, where the pointer has been
+ // spilt to the stack. It should end up being a memory location, with
+ // the pointer to the variable loaded off the stack with a deref:
+ assert(!Expr->isImplicit());
+ OffsetOps.push_back(dwarf::DW_OP_deref);
+ } else if (UseDerefSize && Expr->isSingleLocationExpression()) {
+ // TODO: Figure out how to handle deref size issues for variadic
+ // values.
+ // We're loading a value off the stack that's not the same size as the
+ // variable. Add / subtract stack offset, explicitly deref with a
+ // size, and add DW_OP_stack_value if not already present.
+ OffsetOps.push_back(dwarf::DW_OP_deref_size);
+ OffsetOps.push_back(DerefSizeInBytes);
+ StackValue = true;
+ } else if (Expr->isComplex() || Properties.IsVariadic) {
+ // A variable with no size ambiguity, but with extra elements in its
+ // expression. Manually dereference the stack location.
+ OffsetOps.push_back(dwarf::DW_OP_deref);
+ } else {
+ // A plain value that has been spilt to the stack, with no further
+ // context. Request a location expression, marking the DBG_VALUE as
+ // IsIndirect.
+ Indirect = true;
+ }
+
+ Expr = DIExpression::appendOpsToArg(Expr, OffsetOps, Idx, StackValue);
+ MOs.push_back(GetRegOp(Base));
+ } else {
+ // This is a stack location with a weird subregister offset: emit an
+ // undef DBG_VALUE instead.
+ return EmitUndef();
+ }
+ } else {
+ // Non-empty, non-stack slot, must be a plain register.
+ MOs.push_back(GetRegOp(LocID));
+ }
+ }
+
+ return BuildMI(MF, DL, Desc, Indirect, MOs, Var.getVariable(), Expr);
+}
+
+/// Default construct and initialize the pass.
+InstrRefBasedLDV::InstrRefBasedLDV() = default;
+
+bool InstrRefBasedLDV::isCalleeSaved(LocIdx L) const {
+ unsigned Reg = MTracker->LocIdxToLocID[L];
+ return isCalleeSavedReg(Reg);
+}
+bool InstrRefBasedLDV::isCalleeSavedReg(Register R) const {
+ for (MCRegAliasIterator RAI(R, TRI, true); RAI.isValid(); ++RAI)
+ if (CalleeSavedRegs.test(*RAI))
+ return true;
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Debug Range Extension Implementation
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+// Something to restore in the future.
+// void InstrRefBasedLDV::printVarLocInMBB(..)
+#endif
+
+std::optional<SpillLocationNo>
+InstrRefBasedLDV::extractSpillBaseRegAndOffset(const MachineInstr &MI) {
+ assert(MI.hasOneMemOperand() &&
+ "Spill instruction does not have exactly one memory operand?");
+ auto MMOI = MI.memoperands_begin();
+ const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue();
+ assert(PVal->kind() == PseudoSourceValue::FixedStack &&
+ "Inconsistent memory operand in spill instruction");
+ int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
+ const MachineBasicBlock *MBB = MI.getParent();
+ Register Reg;
+ StackOffset Offset = TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg);
+ return MTracker->getOrTrackSpillLoc({Reg, Offset});
+}
+
+std::optional<LocIdx>
+InstrRefBasedLDV::findLocationForMemOperand(const MachineInstr &MI) {
+ std::optional<SpillLocationNo> SpillLoc = extractSpillBaseRegAndOffset(MI);
+ if (!SpillLoc)
+ return std::nullopt;
+
+ // Where in the stack slot is this value defined -- i.e., what size of value
+ // is this? An important question, because it could be loaded into a register
+ // from the stack at some point. Happily the memory operand will tell us
+ // the size written to the stack.
+ auto *MemOperand = *MI.memoperands_begin();
+ unsigned SizeInBits = MemOperand->getSizeInBits();
+
+ // Find that position in the stack indexes we're tracking.
+ auto IdxIt = MTracker->StackSlotIdxes.find({SizeInBits, 0});
+ if (IdxIt == MTracker->StackSlotIdxes.end())
+    // That index is not tracked. This is surprising, and unlikely to ever
+ // occur, but the safe action is to indicate the variable is optimised out.
+ return std::nullopt;
+
+ unsigned SpillID = MTracker->getSpillIDWithIdx(*SpillLoc, IdxIt->second);
+ return MTracker->getSpillMLoc(SpillID);
+}
+
+/// End all previous ranges related to @MI and start a new range from @MI
+/// if it is a DBG_VALUE instr.
+bool InstrRefBasedLDV::transferDebugValue(const MachineInstr &MI) {
+ if (!MI.isDebugValue())
+ return false;
+
+ const DILocalVariable *Var = MI.getDebugVariable();
+ const DIExpression *Expr = MI.getDebugExpression();
+ const DILocation *DebugLoc = MI.getDebugLoc();
+ const DILocation *InlinedAt = DebugLoc->getInlinedAt();
+ assert(Var->isValidLocationForIntrinsic(DebugLoc) &&
+ "Expected inlined-at fields to agree");
+
+ DebugVariable V(Var, Expr, InlinedAt);
+ DbgValueProperties Properties(MI);
+
+  // If there are no instructions in this lexical scope, do no location
+  // tracking at all: this variable shouldn't get a legitimate location range.
+ auto *Scope = LS.findLexicalScope(MI.getDebugLoc().get());
+ if (Scope == nullptr)
+    return true; // Handled by doing nothing: this variable is never in scope.
+
+ // MLocTracker needs to know that this register is read, even if it's only
+ // read by a debug inst.
+ for (const MachineOperand &MO : MI.debug_operands())
+ if (MO.isReg() && MO.getReg() != 0)
+ (void)MTracker->readReg(MO.getReg());
+
+  // If we're preparing for the second analysis (variables), the machine value
+  // locations are already solved, and we report this DBG_VALUE and the value
+  // it refers to, to VLocTracker.
+ if (VTracker) {
+ SmallVector<DbgOpID> DebugOps;
+ // Feed defVar the new variable location, or if this is a DBG_VALUE $noreg,
+ // feed defVar None.
+ if (!MI.isUndefDebugValue()) {
+ for (const MachineOperand &MO : MI.debug_operands()) {
+ // There should be no undef registers here, as we've screened for undef
+ // debug values.
+ if (MO.isReg()) {
+ DebugOps.push_back(DbgOpStore.insert(MTracker->readReg(MO.getReg())));
+ } else if (MO.isImm() || MO.isFPImm() || MO.isCImm()) {
+ DebugOps.push_back(DbgOpStore.insert(MO));
+ } else {
+ llvm_unreachable("Unexpected debug operand type.");
+ }
+ }
+ }
+ VTracker->defVar(MI, Properties, DebugOps);
+ }
+
+ // If performing final tracking of transfers, report this variable definition
+ // to the TransferTracker too.
+ if (TTracker)
+ TTracker->redefVar(MI);
+ return true;
+}
+
+std::optional<ValueIDNum> InstrRefBasedLDV::getValueForInstrRef(
+ unsigned InstNo, unsigned OpNo, MachineInstr &MI,
+ const ValueTable *MLiveOuts, const ValueTable *MLiveIns) {
+ // Various optimizations may have happened to the value during codegen,
+ // recorded in the value substitution table. Apply any substitutions to
+ // the instruction / operand number in this DBG_INSTR_REF, and collect
+ // any subregister extractions performed during optimization.
+ const MachineFunction &MF = *MI.getParent()->getParent();
+
+ // Create dummy substitution with Src set, for lookup.
+ auto SoughtSub =
+ MachineFunction::DebugSubstitution({InstNo, OpNo}, {0, 0}, 0);
+
+ SmallVector<unsigned, 4> SeenSubregs;
+ auto LowerBoundIt = llvm::lower_bound(MF.DebugValueSubstitutions, SoughtSub);
+ while (LowerBoundIt != MF.DebugValueSubstitutions.end() &&
+ LowerBoundIt->Src == SoughtSub.Src) {
+ std::tie(InstNo, OpNo) = LowerBoundIt->Dest;
+ SoughtSub.Src = LowerBoundIt->Dest;
+ if (unsigned Subreg = LowerBoundIt->Subreg)
+ SeenSubregs.push_back(Subreg);
+ LowerBoundIt = llvm::lower_bound(MF.DebugValueSubstitutions, SoughtSub);
+ }
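+  // As a hypothetical illustration: a substitution table containing
+  // {(4, 0) -> (2, 0), sub_32bit} and {(2, 0) -> (1, 0), 0} rewrites a
+  // reference to instruction 4, operand 0 into instruction 1, operand 0,
+  // with sub_32bit collected in SeenSubregs for the extraction step below.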
+
+ // Default machine value number is <None> -- if no instruction defines
+ // the corresponding value, it must have been optimized out.
+ std::optional<ValueIDNum> NewID;
+
+ // Try to lookup the instruction number, and find the machine value number
+ // that it defines. It could be an instruction, or a PHI.
+ auto InstrIt = DebugInstrNumToInstr.find(InstNo);
+ auto PHIIt = llvm::lower_bound(DebugPHINumToValue, InstNo);
+ if (InstrIt != DebugInstrNumToInstr.end()) {
+ const MachineInstr &TargetInstr = *InstrIt->second.first;
+ uint64_t BlockNo = TargetInstr.getParent()->getNumber();
+
+ // Pick out the designated operand. It might be a memory reference, if
+ // a register def was folded into a stack store.
+ if (OpNo == MachineFunction::DebugOperandMemNumber &&
+ TargetInstr.hasOneMemOperand()) {
+ std::optional<LocIdx> L = findLocationForMemOperand(TargetInstr);
+ if (L)
+ NewID = ValueIDNum(BlockNo, InstrIt->second.second, *L);
+ } else if (OpNo != MachineFunction::DebugOperandMemNumber) {
+      // Permit the debug-info to be completely wrong: identifying a
+      // nonexistent operand, or one that is not a register definition, means
+      // something unexpected happened during optimisation. Broken debug-info,
+      // however, shouldn't crash the compiler -- instead leave the variable
+      // value as None, which will make it appear "optimised out".
+ if (OpNo < TargetInstr.getNumOperands()) {
+ const MachineOperand &MO = TargetInstr.getOperand(OpNo);
+
+ if (MO.isReg() && MO.isDef() && MO.getReg()) {
+ unsigned LocID = MTracker->getLocID(MO.getReg());
+ LocIdx L = MTracker->LocIDToLocIdx[LocID];
+ NewID = ValueIDNum(BlockNo, InstrIt->second.second, L);
+ }
+ }
+
+ if (!NewID) {
+ LLVM_DEBUG(
+ { dbgs() << "Seen instruction reference to illegal operand\n"; });
+ }
+ }
+ // else: NewID is left as None.
+ } else if (PHIIt != DebugPHINumToValue.end() && PHIIt->InstrNum == InstNo) {
+ // It's actually a PHI value. Which value it is might not be obvious, use
+ // the resolver helper to find out.
+ NewID = resolveDbgPHIs(*MI.getParent()->getParent(), MLiveOuts, MLiveIns,
+ MI, InstNo);
+ }
+
+ // Apply any subregister extractions, in reverse. We might have seen code
+ // like this:
+ // CALL64 @foo, implicit-def $rax
+ // %0:gr64 = COPY $rax
+ // %1:gr32 = COPY %0.sub_32bit
+ // %2:gr16 = COPY %1.sub_16bit
+ // %3:gr8 = COPY %2.sub_8bit
+ // In which case each copy would have been recorded as a substitution with
+ // a subregister qualifier. Apply those qualifiers now.
+ if (NewID && !SeenSubregs.empty()) {
+ unsigned Offset = 0;
+ unsigned Size = 0;
+
+ // Look at each subregister that we passed through, and progressively
+ // narrow in, accumulating any offsets that occur. Substitutions should
+ // only ever be the same or narrower width than what they read from;
+ // iterate in reverse order so that we go from wide to small.
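+    // For example (hypothetical x86 chain): narrowing through sub_16bit
+    // (offset 0, size 16 bits) and then sub_8bit_hi (offset 8, size 8)
+    // accumulates Offset = 8 and Size = 8, i.e. bits [8, 16) of the
+    // register that the instruction defined.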
+ for (unsigned Subreg : reverse(SeenSubregs)) {
+ unsigned ThisSize = TRI->getSubRegIdxSize(Subreg);
+ unsigned ThisOffset = TRI->getSubRegIdxOffset(Subreg);
+ Offset += ThisOffset;
+ Size = (Size == 0) ? ThisSize : std::min(Size, ThisSize);
+ }
+
+    // If that worked, look for an appropriate subregister within the register
+    // where the define happens. Don't look at values that were defined during
+ // a stack write: we can't currently express register locations within
+ // spills.
+ LocIdx L = NewID->getLoc();
+ if (NewID && !MTracker->isSpill(L)) {
+ // Find the register class for the register where this def happened.
+ // FIXME: no index for this?
+ Register Reg = MTracker->LocIdxToLocID[L];
+ const TargetRegisterClass *TRC = nullptr;
+ for (const auto *TRCI : TRI->regclasses())
+ if (TRCI->contains(Reg))
+ TRC = TRCI;
+ assert(TRC && "Couldn't find target register class?");
+
+      // If the register we have isn't the right size or in the right place,
+      // try to find a subregister inside it.
+ unsigned MainRegSize = TRI->getRegSizeInBits(*TRC);
+ if (Size != MainRegSize || Offset) {
+ // Enumerate all subregisters, searching.
+ Register NewReg = 0;
+ for (MCPhysReg SR : TRI->subregs(Reg)) {
+ unsigned Subreg = TRI->getSubRegIndex(Reg, SR);
+ unsigned SubregSize = TRI->getSubRegIdxSize(Subreg);
+ unsigned SubregOffset = TRI->getSubRegIdxOffset(Subreg);
+ if (SubregSize == Size && SubregOffset == Offset) {
+ NewReg = SR;
+ break;
+ }
+ }
+
+ // If we didn't find anything: there's no way to express our value.
+ if (!NewReg) {
+ NewID = std::nullopt;
+ } else {
+ // Re-state the value as being defined within the subregister
+ // that we found.
+ LocIdx NewLoc = MTracker->lookupOrTrackRegister(NewReg);
+ NewID = ValueIDNum(NewID->getBlock(), NewID->getInst(), NewLoc);
+ }
+ }
+ } else {
+ // If we can't handle subregisters, unset the new value.
+ NewID = std::nullopt;
+ }
+ }
+
+ return NewID;
+}
+
+bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI,
+ const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns) {
+ if (!MI.isDebugRef())
+ return false;
+
+ // Only handle this instruction when we are building the variable value
+ // transfer function.
+ if (!VTracker && !TTracker)
+ return false;
+
+ const DILocalVariable *Var = MI.getDebugVariable();
+ const DIExpression *Expr = MI.getDebugExpression();
+ const DILocation *DebugLoc = MI.getDebugLoc();
+ const DILocation *InlinedAt = DebugLoc->getInlinedAt();
+ assert(Var->isValidLocationForIntrinsic(DebugLoc) &&
+ "Expected inlined-at fields to agree");
+
+ DebugVariable V(Var, Expr, InlinedAt);
+
+ auto *Scope = LS.findLexicalScope(MI.getDebugLoc().get());
+ if (Scope == nullptr)
+ return true; // Handled by doing nothing. This variable is never in scope.
+
+ SmallVector<DbgOpID> DbgOpIDs;
+ for (const MachineOperand &MO : MI.debug_operands()) {
+ if (!MO.isDbgInstrRef()) {
+ assert(!MO.isReg() && "DBG_INSTR_REF should not contain registers");
+ DbgOpID ConstOpID = DbgOpStore.insert(DbgOp(MO));
+ DbgOpIDs.push_back(ConstOpID);
+ continue;
+ }
+
+ unsigned InstNo = MO.getInstrRefInstrIndex();
+ unsigned OpNo = MO.getInstrRefOpIndex();
+
+ // Default machine value number is <None> -- if no instruction defines
+ // the corresponding value, it must have been optimized out.
+ std::optional<ValueIDNum> NewID =
+ getValueForInstrRef(InstNo, OpNo, MI, MLiveOuts, MLiveIns);
+ // We have a value number or std::nullopt. If the latter, then kill the
+ // entire debug value.
+ if (NewID) {
+ DbgOpIDs.push_back(DbgOpStore.insert(*NewID));
+ } else {
+ DbgOpIDs.clear();
+ break;
+ }
+ }
+
+ // We have a DbgOpID for every value or for none. Tell the variable value
+ // tracker about it. The rest of this LiveDebugValues implementation acts
+ // exactly the same for DBG_INSTR_REFs as DBG_VALUEs (just, the former can
+ // refer to values that aren't immediately available).
+ DbgValueProperties Properties(Expr, false, true);
+ if (VTracker)
+ VTracker->defVar(MI, Properties, DbgOpIDs);
+
+ // If we're on the final pass through the function, decompose this INSTR_REF
+ // into a plain DBG_VALUE.
+ if (!TTracker)
+ return true;
+
+ // Fetch the concrete DbgOps now, as we will need them later.
+ SmallVector<DbgOp> DbgOps;
+ for (DbgOpID OpID : DbgOpIDs) {
+ DbgOps.push_back(DbgOpStore.find(OpID));
+ }
+
+ // Pick a location for the machine value number, if such a location exists.
+ // (This information could be stored in TransferTracker to make it faster).
+ SmallDenseMap<ValueIDNum, TransferTracker::LocationAndQuality> FoundLocs;
+ SmallVector<ValueIDNum> ValuesToFind;
+  // Initialize the preferred-location map with illegal locations, to be
+  // filled in later.
+ for (const DbgOp &Op : DbgOps) {
+ if (!Op.IsConst)
+ if (FoundLocs.insert({Op.ID, TransferTracker::LocationAndQuality()})
+ .second)
+ ValuesToFind.push_back(Op.ID);
+ }
+
+ for (auto Location : MTracker->locations()) {
+ LocIdx CurL = Location.Idx;
+ ValueIDNum ID = MTracker->readMLoc(CurL);
+ auto ValueToFindIt = find(ValuesToFind, ID);
+ if (ValueToFindIt == ValuesToFind.end())
+ continue;
+ auto &Previous = FoundLocs.find(ID)->second;
+ // If this is the first location with that value, pick it. Otherwise,
+ // consider whether it's a "longer term" location.
+ std::optional<TransferTracker::LocationQuality> ReplacementQuality =
+ TTracker->getLocQualityIfBetter(CurL, Previous.getQuality());
+ if (ReplacementQuality) {
+ Previous = TransferTracker::LocationAndQuality(CurL, *ReplacementQuality);
+ if (Previous.isBest()) {
+ ValuesToFind.erase(ValueToFindIt);
+ if (ValuesToFind.empty())
+ break;
+ }
+ }
+ }
+
+ SmallVector<ResolvedDbgOp> NewLocs;
+ for (const DbgOp &DbgOp : DbgOps) {
+ if (DbgOp.IsConst) {
+ NewLocs.push_back(DbgOp.MO);
+ continue;
+ }
+ LocIdx FoundLoc = FoundLocs.find(DbgOp.ID)->second.getLoc();
+ if (FoundLoc.isIllegal()) {
+ NewLocs.clear();
+ break;
+ }
+ NewLocs.push_back(FoundLoc);
+ }
+ // Tell transfer tracker that the variable value has changed.
+ TTracker->redefVar(MI, Properties, NewLocs);
+
+ // If there were values with no location, but all such values are defined in
+ // later instructions in this block, this is a block-local use-before-def.
+ if (!DbgOps.empty() && NewLocs.empty()) {
+ bool IsValidUseBeforeDef = true;
+ uint64_t LastUseBeforeDef = 0;
+ for (auto ValueLoc : FoundLocs) {
+ ValueIDNum NewID = ValueLoc.first;
+ LocIdx FoundLoc = ValueLoc.second.getLoc();
+ if (!FoundLoc.isIllegal())
+ continue;
+      // If a value with no location is not defined later in this block, then
+      // it has no location here at all, leaving this value undefined.
+ if (NewID.getBlock() != CurBB || NewID.getInst() <= CurInst) {
+ IsValidUseBeforeDef = false;
+ break;
+ }
+ LastUseBeforeDef = std::max(LastUseBeforeDef, NewID.getInst());
+ }
+ if (IsValidUseBeforeDef) {
+ TTracker->addUseBeforeDef(V, {MI.getDebugExpression(), false, true},
+ DbgOps, LastUseBeforeDef);
+ }
+ }
+
+ // Produce a DBG_VALUE representing what this DBG_INSTR_REF meant.
+ // This DBG_VALUE is potentially a $noreg / undefined location, if
+ // FoundLoc is illegal.
+ // (XXX -- could morph the DBG_INSTR_REF in the future).
+ MachineInstr *DbgMI = MTracker->emitLoc(NewLocs, V, Properties);
+
+ TTracker->PendingDbgValues.push_back(DbgMI);
+ TTracker->flushDbgValues(MI.getIterator(), nullptr);
+ return true;
+}
+
+bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) {
+ if (!MI.isDebugPHI())
+ return false;
+
+ // Analyse these only when solving the machine value location problem.
+ if (VTracker || TTracker)
+ return true;
+
+ // First operand is the value location, either a stack slot or register.
+ // Second is the debug instruction number of the original PHI.
+ const MachineOperand &MO = MI.getOperand(0);
+ unsigned InstrNum = MI.getOperand(1).getImm();
+
+ auto EmitBadPHI = [this, &MI, InstrNum]() -> bool {
+ // Helper lambda to do any accounting when we fail to find a location for
+ // a DBG_PHI. This can happen if DBG_PHIs are malformed, or refer to a
+ // dead stack slot, for example.
+ // Record a DebugPHIRecord with an empty value + location.
+ DebugPHINumToValue.push_back(
+ {InstrNum, MI.getParent(), std::nullopt, std::nullopt});
+ return true;
+ };
+
+ if (MO.isReg() && MO.getReg()) {
+ // The value is whatever's currently in the register. Read and record it,
+ // to be analysed later.
+ Register Reg = MO.getReg();
+ ValueIDNum Num = MTracker->readReg(Reg);
+ auto PHIRec = DebugPHIRecord(
+ {InstrNum, MI.getParent(), Num, MTracker->lookupOrTrackRegister(Reg)});
+ DebugPHINumToValue.push_back(PHIRec);
+
+ // Ensure this register is tracked.
+ for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
+ MTracker->lookupOrTrackRegister(*RAI);
+ } else if (MO.isFI()) {
+ // The value is whatever's in this stack slot.
+ unsigned FI = MO.getIndex();
+
+ // If the stack slot is dead, then this was optimized away.
+ // FIXME: stack slot colouring should account for slots that get merged.
+ if (MFI->isDeadObjectIndex(FI))
+ return EmitBadPHI();
+
+ // Identify this spill slot, ensure it's tracked.
+ Register Base;
+ StackOffset Offs = TFI->getFrameIndexReference(*MI.getMF(), FI, Base);
+ SpillLoc SL = {Base, Offs};
+ std::optional<SpillLocationNo> SpillNo = MTracker->getOrTrackSpillLoc(SL);
+
+ // We might be able to find a value, but have chosen not to, to avoid
+ // tracking too much stack information.
+ if (!SpillNo)
+ return EmitBadPHI();
+
+    // Any stack location DBG_PHI should have an associated bit-size.
+ assert(MI.getNumOperands() == 3 && "Stack DBG_PHI with no size?");
+ unsigned slotBitSize = MI.getOperand(2).getImm();
+
+ unsigned SpillID = MTracker->getLocID(*SpillNo, {slotBitSize, 0});
+ LocIdx SpillLoc = MTracker->getSpillMLoc(SpillID);
+ ValueIDNum Result = MTracker->readMLoc(SpillLoc);
+
+ // Record this DBG_PHI for later analysis.
+ auto DbgPHI = DebugPHIRecord({InstrNum, MI.getParent(), Result, SpillLoc});
+ DebugPHINumToValue.push_back(DbgPHI);
+ } else {
+    // Else: if the operand is neither a legal register nor a stack slot, then
+ // we're being fed illegal debug-info. Record an empty PHI, so that any
+ // debug users trying to read this number will be put off trying to
+ // interpret the value.
+ LLVM_DEBUG(
+ { dbgs() << "Seen DBG_PHI with unrecognised operand format\n"; });
+ return EmitBadPHI();
+ }
+
+ return true;
+}
+
+void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) {
+ // Meta Instructions do not affect the debug liveness of any register they
+ // define.
+ if (MI.isImplicitDef()) {
+    // Except when there's an implicit def, and the location it's defining has
+    // no value number. The whole point of an implicit def is to announce that
+    // the register is live, without being specific about its value. So define
+    // a value if there isn't one already.
+ ValueIDNum Num = MTracker->readReg(MI.getOperand(0).getReg());
+ // Has a legitimate value -> ignore the implicit def.
+ if (Num.getLoc() != 0)
+ return;
+ // Otherwise, def it here.
+ } else if (MI.isMetaInstruction())
+ return;
+
+ // We always ignore SP defines on call instructions, they don't actually
+ // change the value of the stack pointer... except for win32's _chkstk. This
+ // is rare: filter quickly for the common case (no stack adjustments, not a
+ // call, etc). If it is a call that modifies SP, recognise the SP register
+ // defs.
+ bool CallChangesSP = false;
+ if (AdjustsStackInCalls && MI.isCall() && MI.getOperand(0).isSymbol() &&
+ !strcmp(MI.getOperand(0).getSymbolName(), StackProbeSymbolName.data()))
+ CallChangesSP = true;
+
+ // Test whether we should ignore a def of this register due to it being part
+ // of the stack pointer.
+ auto IgnoreSPAlias = [this, &MI, CallChangesSP](Register R) -> bool {
+ if (CallChangesSP)
+ return false;
+ return MI.isCall() && MTracker->SPAliases.count(R);
+ };
+
+ // Find the regs killed by MI, and find regmasks of preserved regs.
+ // Max out the number of statically allocated elements in `DeadRegs`, as this
+ // prevents fallback to std::set::count() operations.
+ SmallSet<uint32_t, 32> DeadRegs;
+ SmallVector<const uint32_t *, 4> RegMasks;
+ SmallVector<const MachineOperand *, 4> RegMaskPtrs;
+ for (const MachineOperand &MO : MI.operands()) {
+ // Determine whether the operand is a register def.
+ if (MO.isReg() && MO.isDef() && MO.getReg() && MO.getReg().isPhysical() &&
+ !IgnoreSPAlias(MO.getReg())) {
+ // Remove ranges of all aliased registers.
+ for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
+ // FIXME: Can we break out of this loop early if no insertion occurs?
+ DeadRegs.insert(*RAI);
+ } else if (MO.isRegMask()) {
+ RegMasks.push_back(MO.getRegMask());
+ RegMaskPtrs.push_back(&MO);
+ }
+ }
+
+ // Tell MLocTracker about all definitions, of regmasks and otherwise.
+ for (uint32_t DeadReg : DeadRegs)
+ MTracker->defReg(DeadReg, CurBB, CurInst);
+
+ for (const auto *MO : RegMaskPtrs)
+ MTracker->writeRegMask(MO, CurBB, CurInst);
+
+ // If this instruction writes to a spill slot, def that slot.
+ if (hasFoldedStackStore(MI)) {
+ if (std::optional<SpillLocationNo> SpillNo =
+ extractSpillBaseRegAndOffset(MI)) {
+ for (unsigned int I = 0; I < MTracker->NumSlotIdxes; ++I) {
+ unsigned SpillID = MTracker->getSpillIDWithIdx(*SpillNo, I);
+ LocIdx L = MTracker->getSpillMLoc(SpillID);
+ MTracker->setMLoc(L, ValueIDNum(CurBB, CurInst, L));
+ }
+ }
+ }
+
+ if (!TTracker)
+ return;
+
+ // When committing variable values to locations: tell transfer tracker that
+ // we've clobbered things. It may be able to recover the variable from a
+ // different location.
+
+ // Inform TTracker about any direct clobbers.
+ for (uint32_t DeadReg : DeadRegs) {
+ LocIdx Loc = MTracker->lookupOrTrackRegister(DeadReg);
+ TTracker->clobberMloc(Loc, MI.getIterator(), false);
+ }
+
+ // Look for any clobbers performed by a register mask. Only test locations
+ // that are actually being tracked.
+ if (!RegMaskPtrs.empty()) {
+ for (auto L : MTracker->locations()) {
+ // Stack locations can't be clobbered by regmasks.
+ if (MTracker->isSpill(L.Idx))
+ continue;
+
+ Register Reg = MTracker->LocIdxToLocID[L.Idx];
+ if (IgnoreSPAlias(Reg))
+ continue;
+
+ for (const auto *MO : RegMaskPtrs)
+ if (MO->clobbersPhysReg(Reg))
+ TTracker->clobberMloc(L.Idx, MI.getIterator(), false);
+ }
+ }
+
+ // Tell TTracker about any folded stack store.
+ if (hasFoldedStackStore(MI)) {
+ if (std::optional<SpillLocationNo> SpillNo =
+ extractSpillBaseRegAndOffset(MI)) {
+ for (unsigned int I = 0; I < MTracker->NumSlotIdxes; ++I) {
+ unsigned SpillID = MTracker->getSpillIDWithIdx(*SpillNo, I);
+ LocIdx L = MTracker->getSpillMLoc(SpillID);
+ TTracker->clobberMloc(L, MI.getIterator(), true);
+ }
+ }
+ }
+}
+
+void InstrRefBasedLDV::performCopy(Register SrcRegNum, Register DstRegNum) {
+ // In all circumstances, re-def all aliases. It's definitely a new value now.
+ for (MCRegAliasIterator RAI(DstRegNum, TRI, true); RAI.isValid(); ++RAI)
+ MTracker->defReg(*RAI, CurBB, CurInst);
+
+ ValueIDNum SrcValue = MTracker->readReg(SrcRegNum);
+ MTracker->setReg(DstRegNum, SrcValue);
+
+ // Copy subregisters from one location to another.
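+  // As an illustration (x86 registers, purely as an example): a copy of $rax
+  // into $rbx also transfers the values tracked in $eax, $ax and $al into
+  // $ebx, $bx and $bl respectively.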
+ for (MCSubRegIndexIterator SRI(SrcRegNum, TRI); SRI.isValid(); ++SRI) {
+ unsigned SrcSubReg = SRI.getSubReg();
+ unsigned SubRegIdx = SRI.getSubRegIndex();
+ unsigned DstSubReg = TRI->getSubReg(DstRegNum, SubRegIdx);
+ if (!DstSubReg)
+ continue;
+
+    // Do the copy. There are two matching subregisters; the source value
+    // should have been def'd when the super-reg was, though the latter might
+    // not be tracked yet.
+ // This will force SrcSubReg to be tracked, if it isn't yet. Will read
+ // mphi values if it wasn't tracked.
+ LocIdx SrcL = MTracker->lookupOrTrackRegister(SrcSubReg);
+ LocIdx DstL = MTracker->lookupOrTrackRegister(DstSubReg);
+ (void)SrcL;
+ (void)DstL;
+ ValueIDNum CpyValue = MTracker->readReg(SrcSubReg);
+
+ MTracker->setReg(DstSubReg, CpyValue);
+ }
+}
+
+std::optional<SpillLocationNo>
+InstrRefBasedLDV::isSpillInstruction(const MachineInstr &MI,
+ MachineFunction *MF) {
+ // TODO: Handle multiple stores folded into one.
+ if (!MI.hasOneMemOperand())
+ return std::nullopt;
+
+ // Reject any memory operand that's aliased -- we can't guarantee its value.
+ auto MMOI = MI.memoperands_begin();
+ const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue();
+ if (PVal->isAliased(MFI))
+ return std::nullopt;
+
+ if (!MI.getSpillSize(TII) && !MI.getFoldedSpillSize(TII))
+ return std::nullopt; // This is not a spill instruction, since no valid size
+ // was returned from either function.
+
+ return extractSpillBaseRegAndOffset(MI);
+}
+
+bool InstrRefBasedLDV::isLocationSpill(const MachineInstr &MI,
+ MachineFunction *MF, unsigned &Reg) {
+ if (!isSpillInstruction(MI, MF))
+ return false;
+
+ int FI;
+ Reg = TII->isStoreToStackSlotPostFE(MI, FI);
+ return Reg != 0;
+}
+
+std::optional<SpillLocationNo>
+InstrRefBasedLDV::isRestoreInstruction(const MachineInstr &MI,
+ MachineFunction *MF, unsigned &Reg) {
+ if (!MI.hasOneMemOperand())
+ return std::nullopt;
+
+ // FIXME: Handle folded restore instructions with more than one memory
+ // operand.
+ if (MI.getRestoreSize(TII)) {
+ Reg = MI.getOperand(0).getReg();
+ return extractSpillBaseRegAndOffset(MI);
+ }
+ return std::nullopt;
+}
+
+bool InstrRefBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI) {
+ // XXX -- it's too difficult to implement VarLocBasedImpl's stack location
+ // limitations under the new model. Therefore, when comparing them, compare
+ // versions that don't attempt spills or restores at all.
+ if (EmulateOldLDV)
+ return false;
+
+ // Strictly limit ourselves to plain loads and stores, not all instructions
+ // that can access the stack.
+ int DummyFI = -1;
+ if (!TII->isStoreToStackSlotPostFE(MI, DummyFI) &&
+ !TII->isLoadFromStackSlotPostFE(MI, DummyFI))
+ return false;
+
+ MachineFunction *MF = MI.getMF();
+ unsigned Reg;
+
+ LLVM_DEBUG(dbgs() << "Examining instruction: "; MI.dump(););
+
+ // First, if there are any DBG_VALUEs pointing at a spill slot that is
+ // written to, terminate that variable location. The value in memory
+ // will have changed. DbgEntityHistoryCalculator doesn't try to detect this.
+ if (std::optional<SpillLocationNo> Loc = isSpillInstruction(MI, MF)) {
+ // Un-set this location and clobber, so that earlier locations don't
+ // continue past this store.
+ for (unsigned SlotIdx = 0; SlotIdx < MTracker->NumSlotIdxes; ++SlotIdx) {
+ unsigned SpillID = MTracker->getSpillIDWithIdx(*Loc, SlotIdx);
+ std::optional<LocIdx> MLoc = MTracker->getSpillMLoc(SpillID);
+ if (!MLoc)
+ continue;
+
+ // We need to over-write the stack slot with something (here, a def at
+ // this instruction) to ensure no values are preserved in this stack slot
+ // after the spill. It also prevents TTracker from trying to recover the
+ // location and re-installing it in the same place.
+ ValueIDNum Def(CurBB, CurInst, *MLoc);
+ MTracker->setMLoc(*MLoc, Def);
+ if (TTracker)
+ TTracker->clobberMloc(*MLoc, MI.getIterator());
+ }
+ }
+
+ // Try to recognise spill and restore instructions that may transfer a value.
+ if (isLocationSpill(MI, MF, Reg)) {
+ // isLocationSpill returning true should guarantee we can extract a
+ // location.
+ SpillLocationNo Loc = *extractSpillBaseRegAndOffset(MI);
+
+ auto DoTransfer = [&](Register SrcReg, unsigned SpillID) {
+ auto ReadValue = MTracker->readReg(SrcReg);
+ LocIdx DstLoc = MTracker->getSpillMLoc(SpillID);
+ MTracker->setMLoc(DstLoc, ReadValue);
+
+ if (TTracker) {
+ LocIdx SrcLoc = MTracker->getRegMLoc(SrcReg);
+ TTracker->transferMlocs(SrcLoc, DstLoc, MI.getIterator());
+ }
+ };
+
+ // Then, transfer subreg bits.
+ for (MCPhysReg SR : TRI->subregs(Reg)) {
+      // Ensure this reg is tracked.
+ (void)MTracker->lookupOrTrackRegister(SR);
+ unsigned SubregIdx = TRI->getSubRegIndex(Reg, SR);
+ unsigned SpillID = MTracker->getLocID(Loc, SubregIdx);
+ DoTransfer(SR, SpillID);
+ }
+
+    // Directly look up the size of the main source reg, and transfer.
+ unsigned Size = TRI->getRegSizeInBits(Reg, *MRI);
+ unsigned SpillID = MTracker->getLocID(Loc, {Size, 0});
+ DoTransfer(Reg, SpillID);
+ } else {
+ std::optional<SpillLocationNo> Loc = isRestoreInstruction(MI, MF, Reg);
+ if (!Loc)
+ return false;
+
+ // Assumption: we're reading from the base of the stack slot, not some
+ // offset into it. It seems very unlikely LLVM would ever generate
+ // restores where this wasn't true. This then becomes a question of what
+ // subregisters in the destination register line up with positions in the
+ // stack slot.
+
+ // Def all registers that alias the destination.
+ for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
+ MTracker->defReg(*RAI, CurBB, CurInst);
+
+ // Now find subregisters within the destination register, and load values
+ // from stack slot positions.
+ auto DoTransfer = [&](Register DestReg, unsigned SpillID) {
+ LocIdx SrcIdx = MTracker->getSpillMLoc(SpillID);
+ auto ReadValue = MTracker->readMLoc(SrcIdx);
+ MTracker->setReg(DestReg, ReadValue);
+ };
+
+ for (MCPhysReg SR : TRI->subregs(Reg)) {
+ unsigned Subreg = TRI->getSubRegIndex(Reg, SR);
+ unsigned SpillID = MTracker->getLocID(*Loc, Subreg);
+ DoTransfer(SR, SpillID);
+ }
+
+    // Directly look up this register's slot idx by size, and transfer.
+ unsigned Size = TRI->getRegSizeInBits(Reg, *MRI);
+ unsigned SpillID = MTracker->getLocID(*Loc, {Size, 0});
+ DoTransfer(Reg, SpillID);
+ }
+ return true;
+}
+
+bool InstrRefBasedLDV::transferRegisterCopy(MachineInstr &MI) {
+ auto DestSrc = TII->isCopyInstr(MI);
+ if (!DestSrc)
+ return false;
+
+ const MachineOperand *DestRegOp = DestSrc->Destination;
+ const MachineOperand *SrcRegOp = DestSrc->Source;
+
+ Register SrcReg = SrcRegOp->getReg();
+ Register DestReg = DestRegOp->getReg();
+
+ // Ignore identity copies. Yep, these make it as far as LiveDebugValues.
+ if (SrcReg == DestReg)
+ return true;
+
+  // For emulating VarLocBasedImpl:
+  // We want to recognize instructions where the destination register is a
+  // callee-saved register. If a register that could be clobbered by the call
+  // were chosen instead, there's a good chance it would be clobbered soon.
+  // The callee-saved destination is more likely to stay unclobbered for
+  // longer, even if the source register is killed.
+  //
+  // For InstrRefBasedImpl, we can track multiple locations per value, so
+  // ignore this condition.
+ if (EmulateOldLDV && !isCalleeSavedReg(DestReg))
+ return false;
+
+  // VarLocBasedImpl only followed killing copies; emulate that here.
+ if (EmulateOldLDV && !SrcRegOp->isKill())
+ return false;
+
+ // Before we update MTracker, remember which values were present in each of
+ // the locations about to be overwritten, so that we can recover any
+ // potentially clobbered variables.
+ DenseMap<LocIdx, ValueIDNum> ClobberedLocs;
+ if (TTracker) {
+ for (MCRegAliasIterator RAI(DestReg, TRI, true); RAI.isValid(); ++RAI) {
+ LocIdx ClobberedLoc = MTracker->getRegMLoc(*RAI);
+ auto MLocIt = TTracker->ActiveMLocs.find(ClobberedLoc);
+ // If ActiveMLocs isn't tracking this location or there are no variables
+ // using it, don't bother remembering.
+ if (MLocIt == TTracker->ActiveMLocs.end() || MLocIt->second.empty())
+ continue;
+ ValueIDNum Value = MTracker->readReg(*RAI);
+ ClobberedLocs[ClobberedLoc] = Value;
+ }
+ }
+
+ // Copy MTracker info, including subregs if available.
+ InstrRefBasedLDV::performCopy(SrcReg, DestReg);
+
+ // The copy might have clobbered variables based on the destination register.
+ // Tell TTracker about it, passing the old ValueIDNum to search for
+ // alternative locations (or else terminating those variables).
+ if (TTracker) {
+ for (auto LocVal : ClobberedLocs) {
+      TTracker->clobberMloc(LocVal.first, LocVal.second, MI.getIterator(),
+                            false);
+ }
+ }
+
+ // Only produce a transfer of DBG_VALUE within a block where old LDV
+ // would have. We might make use of the additional value tracking in some
+ // other way, later.
+ if (TTracker && isCalleeSavedReg(DestReg) && SrcRegOp->isKill())
+ TTracker->transferMlocs(MTracker->getRegMLoc(SrcReg),
+ MTracker->getRegMLoc(DestReg), MI.getIterator());
+
+ // VarLocBasedImpl would quit tracking the old location after copying.
+ if (EmulateOldLDV && SrcReg != DestReg)
+ MTracker->defReg(SrcReg, CurBB, CurInst);
+
+ return true;
+}
+
+/// Accumulate a mapping between each DILocalVariable fragment and other
+/// fragments of that DILocalVariable which overlap. This reduces work during
+/// the data-flow stage from "Find any overlapping fragments" to "Check if the
+/// known-to-overlap fragments are present".
+/// \param MI A previously unprocessed debug instruction to analyze for
+/// fragment usage.
+void InstrRefBasedLDV::accumulateFragmentMap(MachineInstr &MI) {
+ assert(MI.isDebugValueLike());
+ DebugVariable MIVar(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ FragmentInfo ThisFragment = MIVar.getFragmentOrDefault();
+
+ // If this is the first sighting of this variable, then we are guaranteed
+ // there are currently no overlapping fragments either. Initialize the set
+ // of seen fragments, record no overlaps for the current one, and return.
+ auto SeenIt = SeenFragments.find(MIVar.getVariable());
+ if (SeenIt == SeenFragments.end()) {
+ SmallSet<FragmentInfo, 4> OneFragment;
+ OneFragment.insert(ThisFragment);
+ SeenFragments.insert({MIVar.getVariable(), OneFragment});
+
+ OverlapFragments.insert({{MIVar.getVariable(), ThisFragment}, {}});
+ return;
+ }
+
+ // If this particular Variable/Fragment pair already exists in the overlap
+ // map, it has already been accounted for.
+ auto IsInOLapMap =
+ OverlapFragments.insert({{MIVar.getVariable(), ThisFragment}, {}});
+ if (!IsInOLapMap.second)
+ return;
+
+ auto &ThisFragmentsOverlaps = IsInOLapMap.first->second;
+ auto &AllSeenFragments = SeenIt->second;
+
+ // Otherwise, examine all other seen fragments for this variable, with "this"
+ // fragment being a previously unseen fragment. Record any pair of
+ // overlapping fragments.
+ for (const auto &ASeenFragment : AllSeenFragments) {
+ // Does this previously seen fragment overlap?
+ if (DIExpression::fragmentsOverlap(ThisFragment, ASeenFragment)) {
+ // Yes: Mark the current fragment as being overlapped.
+ ThisFragmentsOverlaps.push_back(ASeenFragment);
+ // Mark the previously seen fragment as being overlapped by the current
+ // one.
+ auto ASeenFragmentsOverlaps =
+ OverlapFragments.find({MIVar.getVariable(), ASeenFragment});
+ assert(ASeenFragmentsOverlaps != OverlapFragments.end() &&
+ "Previously seen var fragment has no vector of overlaps");
+ ASeenFragmentsOverlaps->second.push_back(ThisFragment);
+ }
+ }
+
+ AllSeenFragments.insert(ThisFragment);
+}
+
+void InstrRefBasedLDV::process(MachineInstr &MI, const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns) {
+  // Try to interpret an MI as a debug or transfer instruction. Only if it's
+  // none of these should we interpret its register defs as new value
+  // definitions.
+ if (transferDebugValue(MI))
+ return;
+ if (transferDebugInstrRef(MI, MLiveOuts, MLiveIns))
+ return;
+ if (transferDebugPHI(MI))
+ return;
+ if (transferRegisterCopy(MI))
+ return;
+ if (transferSpillOrRestoreInst(MI))
+ return;
+ transferRegisterDef(MI);
+}
+
+void InstrRefBasedLDV::produceMLocTransferFunction(
+ MachineFunction &MF, SmallVectorImpl<MLocTransferMap> &MLocTransfer,
+ unsigned MaxNumBlocks) {
+  // Because we try to optimize around register mask operands by ignoring regs
+  // that aren't currently tracked, we set up something ugly for later: RegMask
+  // operands that are seen earlier than the first use of a register still
+  // need to clobber that register in the transfer function. But this
+  // information isn't actively recorded. Instead, we track each RegMask used
+  // in each block, and accumulate the clobbered but untracked registers in
+  // each block into the following bitvector. Later, if new values are
+  // tracked, we can add appropriate clobbers.
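+  // (Hypothetical scenario, for illustration: a call in block 3 clobbers
+  // $xmm0 via its regmask before any instruction has caused $xmm0 to be
+  // tracked. The bit kept in BlockMasks[3] lets us add that clobber to the
+  // transfer function later, if $xmm0 ever becomes tracked.)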
+ SmallVector<BitVector, 32> BlockMasks;
+ BlockMasks.resize(MaxNumBlocks);
+
+ // Reserve one bit per register for the masks described above.
+ unsigned BVWords = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ for (auto &BV : BlockMasks)
+ BV.resize(TRI->getNumRegs(), true);
+
+ // Step through all instructions and inhale the transfer function.
+ for (auto &MBB : MF) {
+ // Object fields that are read by trackers to know where we are in the
+ // function.
+ CurBB = MBB.getNumber();
+ CurInst = 1;
+
+ // Set all machine locations to a PHI value. For transfer function
+ // production only, this signifies the live-in value to the block.
+ MTracker->reset();
+ MTracker->setMPhis(CurBB);
+
+ // Step through each instruction in this block.
+ for (auto &MI : MBB) {
+      // Pass null value tables in when accumulating the machine transfer
+      // function; they aren't needed during this first pass.
+ process(MI, nullptr, nullptr);
+
+ // Also accumulate fragment map.
+ if (MI.isDebugValueLike())
+ accumulateFragmentMap(MI);
+
+ // Create a map from the instruction number (if present) to the
+ // MachineInstr and its position.
+ if (uint64_t InstrNo = MI.peekDebugInstrNum()) {
+ auto InstrAndPos = std::make_pair(&MI, CurInst);
+ auto InsertResult =
+ DebugInstrNumToInstr.insert(std::make_pair(InstrNo, InstrAndPos));
+
+ // There should never be duplicate instruction numbers.
+ assert(InsertResult.second);
+ (void)InsertResult;
+ }
+
+ ++CurInst;
+ }
+
+ // Produce the transfer function, a map of machine location to new value. If
+ // any machine location has the live-in phi value from the start of the
+ // block, it's live-through and doesn't need recording in the transfer
+ // function.
+ for (auto Location : MTracker->locations()) {
+ LocIdx Idx = Location.Idx;
+ ValueIDNum &P = Location.Value;
+ if (P.isPHI() && P.getLoc() == Idx.asU64())
+ continue;
+
+ // Insert-or-update.
+ auto &TransferMap = MLocTransfer[CurBB];
+ auto Result = TransferMap.insert(std::make_pair(Idx.asU64(), P));
+ if (!Result.second)
+ Result.first->second = P;
+ }
+
+ // Accumulate any bitmask operands into the clobbered reg mask for this
+ // block.
+ for (auto &P : MTracker->Masks) {
+ BlockMasks[CurBB].clearBitsNotInMask(P.first->getRegMask(), BVWords);
+ }
+ }
+
+  // Compute a bitvector of all the registers that are tracked across the
+  // whole function.
+ BitVector UsedRegs(TRI->getNumRegs());
+ for (auto Location : MTracker->locations()) {
+ unsigned ID = MTracker->LocIdxToLocID[Location.Idx];
+ // Ignore stack slots, and aliases of the stack pointer.
+ if (ID >= TRI->getNumRegs() || MTracker->SPAliases.count(ID))
+ continue;
+ UsedRegs.set(ID);
+ }
+
+  // Check that any regmask-clobber of a register that gets tracked is not
+  // live-through in the transfer function. It needs to be clobbered at the
+  // very least.
+ for (unsigned int I = 0; I < MaxNumBlocks; ++I) {
+ BitVector &BV = BlockMasks[I];
+ BV.flip();
+ BV &= UsedRegs;
+ // This produces all the bits that we clobber, but also use. Check that
+ // they're all clobbered or at least set in the designated transfer
+ // elem.
+ for (unsigned Bit : BV.set_bits()) {
+ unsigned ID = MTracker->getLocID(Bit);
+ LocIdx Idx = MTracker->LocIDToLocIdx[ID];
+ auto &TransferMap = MLocTransfer[I];
+
+ // Install a value representing the fact that this location is effectively
+ // written to in this block. As there's no reserved value, instead use
+ // a value number that is never generated. Pick the value number for the
+ // first instruction in the block, def'ing this location, which we know
+ // this block never used anyway.
+ ValueIDNum NotGeneratedNum = ValueIDNum(I, 1, Idx);
+ auto Result =
+ TransferMap.insert(std::make_pair(Idx.asU64(), NotGeneratedNum));
+ if (!Result.second) {
+ ValueIDNum &ValueID = Result.first->second;
+ if (ValueID.getBlock() == I && ValueID.isPHI())
+ // It was left as live-through. Set it to clobbered.
+ ValueID = NotGeneratedNum;
+ }
+ }
+ }
+}
+
+bool InstrRefBasedLDV::mlocJoin(
+ MachineBasicBlock &MBB, SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
+ FuncValueTable &OutLocs, ValueTable &InLocs) {
+ LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
+ bool Changed = false;
+
+ // Handle value-propagation when control flow merges on entry to a block. For
+ // any location without a PHI already placed, the location has the same value
+ // as its predecessors. If a PHI is placed, test to see whether it's now a
+ // redundant PHI that we can eliminate.
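+  // For example (hypothetical numbering): if every predecessor's live-out for
+  // a location is the value defined by instruction 4 of block 1, then a PHI
+  // placed here is redundant, and the live-in becomes that value directly.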
+
+ SmallVector<const MachineBasicBlock *, 8> BlockOrders;
+ for (auto *Pred : MBB.predecessors())
+ BlockOrders.push_back(Pred);
+
+ // Visit predecessors in RPOT order.
+ auto Cmp = [&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
+ return BBToOrder.find(A)->second < BBToOrder.find(B)->second;
+ };
+ llvm::sort(BlockOrders, Cmp);
+
+ // Skip entry block.
+ if (BlockOrders.size() == 0)
+ return false;
+
+ // Step through all machine locations, look at each predecessor and test
+ // whether we can eliminate redundant PHIs.
+ for (auto Location : MTracker->locations()) {
+ LocIdx Idx = Location.Idx;
+
+    // Pick out the first predecessor's live-out value for this location. It's
+ // guaranteed to not be a backedge, as we order by RPO.
+ ValueIDNum FirstVal = OutLocs[BlockOrders[0]->getNumber()][Idx.asU64()];
+
+    // If we've already eliminated a PHI here, do no further checking; just
+    // propagate the first predecessor's live-out value into this block.
+ if (InLocs[Idx.asU64()] != ValueIDNum(MBB.getNumber(), 0, Idx)) {
+ if (InLocs[Idx.asU64()] != FirstVal) {
+ InLocs[Idx.asU64()] = FirstVal;
+ Changed |= true;
+ }
+ continue;
+ }
+
+    // We're now examining a PHI to see whether it's unnecessary. Loop around
+ // the other live-in values and test whether they're all the same.
+ bool Disagree = false;
+ for (unsigned int I = 1; I < BlockOrders.size(); ++I) {
+ const MachineBasicBlock *PredMBB = BlockOrders[I];
+ const ValueIDNum &PredLiveOut =
+ OutLocs[PredMBB->getNumber()][Idx.asU64()];
+
+ // Incoming values agree, continue trying to eliminate this PHI.
+ if (FirstVal == PredLiveOut)
+ continue;
+
+ // We can also accept a PHI value that feeds back into itself.
+ if (PredLiveOut == ValueIDNum(MBB.getNumber(), 0, Idx))
+ continue;
+
+ // Live-out of a predecessor disagrees with the first predecessor.
+ Disagree = true;
+ }
+
+ // No disagreement? No PHI. Otherwise, leave the PHI in live-ins.
+ if (!Disagree) {
+ InLocs[Idx.asU64()] = FirstVal;
+ Changed |= true;
+ }
+ }
+
+ // TODO: Reimplement NumInserted and NumRemoved.
+ return Changed;
+}
+
+void InstrRefBasedLDV::findStackIndexInterference(
+ SmallVectorImpl<unsigned> &Slots) {
+ // We could spend a bit of time finding the exact, minimal, set of stack
+ // indexes that interfere with each other, much like reg units. Or, we can
+ // rely on the fact that:
+ // * The smallest / lowest index will interfere with everything at zero
+ // offset, which will be the largest set of registers,
+ // * Most indexes with non-zero offset will end up being interference units
+ // anyway.
+ // So just pick those out and return them.
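+  // As a hypothetical example: the {8, 0} index below interferes with every
+  // index at offset zero, while a {32, 32} index (32 bits at a 32-bit offset)
+  // is returned as its own interference unit by the loop that follows.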
+
+ // We can rely on a single-byte stack index existing already, because we
+ // initialize them in MLocTracker.
+ auto It = MTracker->StackSlotIdxes.find({8, 0});
+ assert(It != MTracker->StackSlotIdxes.end());
+ Slots.push_back(It->second);
+
+ // Find anything that has a non-zero offset and add that too.
+ for (auto &Pair : MTracker->StackSlotIdxes) {
+ // Is offset zero? If so, ignore.
+ if (!Pair.first.second)
+ continue;
+ Slots.push_back(Pair.second);
+ }
+}
+
+void InstrRefBasedLDV::placeMLocPHIs(
+ MachineFunction &MF, SmallPtrSetImpl<MachineBasicBlock *> &AllBlocks,
+ FuncValueTable &MInLocs, SmallVectorImpl<MLocTransferMap> &MLocTransfer) {
+ SmallVector<unsigned, 4> StackUnits;
+ findStackIndexInterference(StackUnits);
+
+ // To avoid repeatedly running the PHI placement algorithm, leverage the
+  // fact that a def of a register MUST also def its register units. Find the
+ // units for registers, place PHIs for them, and then replicate them for
+ // aliasing registers. Some inputs that are never def'd (DBG_PHIs of
+ // arguments) don't lead to register units being tracked, just place PHIs for
+ // those registers directly. Stack slots have their own form of "unit",
+ // store them to one side.
+ SmallSet<Register, 32> RegUnitsToPHIUp;
+ SmallSet<LocIdx, 32> NormalLocsToPHI;
+ SmallSet<SpillLocationNo, 32> StackSlots;
+ for (auto Location : MTracker->locations()) {
+ LocIdx L = Location.Idx;
+ if (MTracker->isSpill(L)) {
+ StackSlots.insert(MTracker->locIDToSpill(MTracker->LocIdxToLocID[L]));
+ continue;
+ }
+
+ Register R = MTracker->LocIdxToLocID[L];
+ SmallSet<Register, 8> FoundRegUnits;
+ bool AnyIllegal = false;
+ for (MCRegUnit Unit : TRI->regunits(R.asMCReg())) {
+ for (MCRegUnitRootIterator URoot(Unit, TRI); URoot.isValid(); ++URoot) {
+ if (!MTracker->isRegisterTracked(*URoot)) {
+          // Not all roots were loaded into the tracking map: this register
+          // isn't actually def'd anywhere; we only read from it. Generate
+          // PHIs for this reg, but don't iterate units.
+ AnyIllegal = true;
+ } else {
+ FoundRegUnits.insert(*URoot);
+ }
+ }
+ }
+
+ if (AnyIllegal) {
+ NormalLocsToPHI.insert(L);
+ continue;
+ }
+
+ RegUnitsToPHIUp.insert(FoundRegUnits.begin(), FoundRegUnits.end());
+ }
+
+ // Lambda to fetch PHIs for a given location, and write into the PHIBlocks
+ // collection.
+ SmallVector<MachineBasicBlock *, 32> PHIBlocks;
+ auto CollectPHIsForLoc = [&](LocIdx L) {
+ // Collect the set of defs.
+ SmallPtrSet<MachineBasicBlock *, 32> DefBlocks;
+ for (unsigned int I = 0; I < OrderToBB.size(); ++I) {
+ MachineBasicBlock *MBB = OrderToBB[I];
+ const auto &TransferFunc = MLocTransfer[MBB->getNumber()];
+ if (TransferFunc.find(L) != TransferFunc.end())
+ DefBlocks.insert(MBB);
+ }
+
+ // The entry block defs the location too: it's the live-in / argument value.
+ // Only insert if there are other defs though; everything is trivially live
+ // through otherwise.
+ if (!DefBlocks.empty())
+ DefBlocks.insert(&*MF.begin());
+
+ // Ask the SSA construction algorithm where we should put PHIs. Clear
+ // anything that might have been hanging around from earlier.
+ PHIBlocks.clear();
+ BlockPHIPlacement(AllBlocks, DefBlocks, PHIBlocks);
+ };
+
+ auto InstallPHIsAtLoc = [&PHIBlocks, &MInLocs](LocIdx L) {
+ for (const MachineBasicBlock *MBB : PHIBlocks)
+ MInLocs[MBB->getNumber()][L.asU64()] = ValueIDNum(MBB->getNumber(), 0, L);
+ };
+
+ // For locations with no reg units, just place PHIs.
+ for (LocIdx L : NormalLocsToPHI) {
+ CollectPHIsForLoc(L);
+ // Install those PHI values into the live-in value array.
+ InstallPHIsAtLoc(L);
+ }
+
+ // For stack slots, calculate PHIs for the equivalent of the units, then
+ // install for each index.
+ for (SpillLocationNo Slot : StackSlots) {
+ for (unsigned Idx : StackUnits) {
+ unsigned SpillID = MTracker->getSpillIDWithIdx(Slot, Idx);
+ LocIdx L = MTracker->getSpillMLoc(SpillID);
+ CollectPHIsForLoc(L);
+ InstallPHIsAtLoc(L);
+
+ // Find anything that aliases this stack index, install PHIs for it too.
+ unsigned Size, Offset;
+ std::tie(Size, Offset) = MTracker->StackIdxesToPos[Idx];
+ for (auto &Pair : MTracker->StackSlotIdxes) {
+ unsigned ThisSize, ThisOffset;
+ std::tie(ThisSize, ThisOffset) = Pair.first;
+ if (ThisSize + ThisOffset <= Offset || Size + Offset <= ThisOffset)
+ continue;
+
+ unsigned ThisID = MTracker->getSpillIDWithIdx(Slot, Pair.second);
+ LocIdx ThisL = MTracker->getSpillMLoc(ThisID);
+ InstallPHIsAtLoc(ThisL);
+ }
+ }
+ }
+
+ // For reg units, place PHIs, and then place them for any aliasing registers.
+ for (Register R : RegUnitsToPHIUp) {
+ LocIdx L = MTracker->lookupOrTrackRegister(R);
+ CollectPHIsForLoc(L);
+
+ // Install those PHI values into the live-in value array.
+ InstallPHIsAtLoc(L);
+
+ // Now find aliases and install PHIs for those.
+ for (MCRegAliasIterator RAI(R, TRI, true); RAI.isValid(); ++RAI) {
+ // Super-registers that are "above" the largest register read/written by
+ // the function will alias, but will not be tracked.
+ if (!MTracker->isRegisterTracked(*RAI))
+ continue;
+
+ LocIdx AliasLoc = MTracker->lookupOrTrackRegister(*RAI);
+ InstallPHIsAtLoc(AliasLoc);
+ }
+ }
+}
+
+void InstrRefBasedLDV::buildMLocValueMap(
+ MachineFunction &MF, FuncValueTable &MInLocs, FuncValueTable &MOutLocs,
+ SmallVectorImpl<MLocTransferMap> &MLocTransfer) {
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Worklist, Pending;
+
+ // We track what is on the current and pending worklist to avoid inserting
+ // the same thing twice. We could avoid this with a custom priority queue,
+ // but this is probably not worth it.
+ SmallPtrSet<MachineBasicBlock *, 16> OnPending, OnWorklist;
+
+ // Initialize worklist with every block to be visited. Also produce list of
+ // all blocks.
+ SmallPtrSet<MachineBasicBlock *, 32> AllBlocks;
+ for (unsigned int I = 0; I < BBToOrder.size(); ++I) {
+ Worklist.push(I);
+ OnWorklist.insert(OrderToBB[I]);
+ AllBlocks.insert(OrderToBB[I]);
+ }
+
+ // Initialize entry block to PHIs. These represent arguments.
+ for (auto Location : MTracker->locations())
+ MInLocs[0][Location.Idx.asU64()] = ValueIDNum(0, 0, Location.Idx);
+
+ MTracker->reset();
+
+  // Start by placing PHIs, using the usual SSA construction algorithm.
+  // Consider any machine-location that isn't live-through a block to be def'd
+  // in that block.
+ placeMLocPHIs(MF, AllBlocks, MInLocs, MLocTransfer);
+
+ // Propagate values to eliminate redundant PHIs. At the same time, this
+ // produces the table of Block x Location => Value for the entry to each
+ // block.
+ // The kind of PHIs we can eliminate are, for example, where one path in a
+ // conditional spills and restores a register, and the register still has
+  // the same value once control flow joins, unbeknownst to the PHI placement
+  // code. Propagating values allows us to identify such unnecessary PHIs and
+ // remove them.
+ SmallPtrSet<const MachineBasicBlock *, 16> Visited;
+ while (!Worklist.empty() || !Pending.empty()) {
+ // Vector for storing the evaluated block transfer function.
+ SmallVector<std::pair<LocIdx, ValueIDNum>, 32> ToRemap;
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
+ CurBB = MBB->getNumber();
+ Worklist.pop();
+
+ // Join the values in all predecessor blocks.
+ bool InLocsChanged;
+ InLocsChanged = mlocJoin(*MBB, Visited, MOutLocs, MInLocs[CurBB]);
+ InLocsChanged |= Visited.insert(MBB).second;
+
+      // Don't examine the transfer function if we've visited this block at
+      // least once, and the live-ins haven't changed.
+ if (!InLocsChanged)
+ continue;
+
+ // Load the current set of live-ins into MLocTracker.
+ MTracker->loadFromArray(MInLocs[CurBB], CurBB);
+
+ // Each element of the transfer function can be a new def, or a read of
+ // a live-in value. Evaluate each element, and store to "ToRemap".
+ ToRemap.clear();
+ for (auto &P : MLocTransfer[CurBB]) {
+ if (P.second.getBlock() == CurBB && P.second.isPHI()) {
+ // This is a movement of whatever was live in. Read it.
+ ValueIDNum NewID = MTracker->readMLoc(P.second.getLoc());
+ ToRemap.push_back(std::make_pair(P.first, NewID));
+ } else {
+ // It's a def. Just set it.
+ assert(P.second.getBlock() == CurBB);
+ ToRemap.push_back(std::make_pair(P.first, P.second));
+ }
+ }
+
+ // Commit the transfer function changes into mloc tracker, which
+ // transforms the contents of the MLocTracker into the live-outs.
+ for (auto &P : ToRemap)
+ MTracker->setMLoc(P.first, P.second);
+
+ // Now copy out-locs from mloc tracker into out-loc vector, checking
+ // whether changes have occurred. These changes can have come from both
+ // the transfer function, and mlocJoin.
+ bool OLChanged = false;
+ for (auto Location : MTracker->locations()) {
+ OLChanged |= MOutLocs[CurBB][Location.Idx.asU64()] != Location.Value;
+ MOutLocs[CurBB][Location.Idx.asU64()] = Location.Value;
+ }
+
+ MTracker->reset();
+
+ // No need to examine successors again if out-locs didn't change.
+ if (!OLChanged)
+ continue;
+
+ // All successors should be visited: put any back-edges on the pending
+ // list for the next pass-through, and any other successors to be
+ // visited this pass, if they're not going to be already.
+ for (auto *s : MBB->successors()) {
+ // Does branching to this successor represent a back-edge?
+ if (BBToOrder[s] > BBToOrder[MBB]) {
+ // No: visit it during this dataflow iteration.
+ if (OnWorklist.insert(s).second)
+ Worklist.push(BBToOrder[s]);
+ } else {
+ // Yes: visit it on the next iteration.
+ if (OnPending.insert(s).second)
+ Pending.push(BBToOrder[s]);
+ }
+ }
+ }
+
+ Worklist.swap(Pending);
+ std::swap(OnPending, OnWorklist);
+ OnPending.clear();
+    // At this point, pending must be empty, since it was just the empty
+    // worklist.
+ assert(Pending.empty() && "Pending should be empty");
+ }
+
+  // Once the live-ins no longer change on a pass of mlocJoin(), we've
+  // eliminated all redundant PHIs.
+}
+
+void InstrRefBasedLDV::BlockPHIPlacement(
+ const SmallPtrSetImpl<MachineBasicBlock *> &AllBlocks,
+ const SmallPtrSetImpl<MachineBasicBlock *> &DefBlocks,
+ SmallVectorImpl<MachineBasicBlock *> &PHIBlocks) {
+ // Apply IDF calculator to the designated set of location defs, storing
+ // required PHIs into PHIBlocks. Uses the dominator tree stored in the
+ // InstrRefBasedLDV object.
+ IDFCalculatorBase<MachineBasicBlock, false> IDF(DomTree->getBase());
+
+ IDF.setLiveInBlocks(AllBlocks);
+ IDF.setDefiningBlocks(DefBlocks);
+ IDF.calculate(PHIBlocks);
+}
+
+bool InstrRefBasedLDV::pickVPHILoc(
+ SmallVectorImpl<DbgOpID> &OutValues, const MachineBasicBlock &MBB,
+ const LiveIdxT &LiveOuts, FuncValueTable &MOutLocs,
+ const SmallVectorImpl<const MachineBasicBlock *> &BlockOrders) {
+
+ // No predecessors means no PHIs.
+ if (BlockOrders.empty())
+ return false;
+
+  // All the location operands that do not already agree need to be joined;
+  // track the indices of each such location operand here.
+ SmallDenseSet<unsigned> LocOpsToJoin;
+
+ auto FirstValueIt = LiveOuts.find(BlockOrders[0]);
+ if (FirstValueIt == LiveOuts.end())
+ return false;
+ const DbgValue &FirstValue = *FirstValueIt->second;
+
+ for (const auto p : BlockOrders) {
+ auto OutValIt = LiveOuts.find(p);
+ if (OutValIt == LiveOuts.end())
+ // If we have a predecessor not in scope, we'll never find a PHI position.
+ return false;
+ const DbgValue &OutVal = *OutValIt->second;
+
+ // No-values cannot have locations we can join on.
+ if (OutVal.Kind == DbgValue::NoVal)
+ return false;
+
+ // For unjoined VPHIs where we don't know the location, we definitely
+ // can't find a join loc unless the VPHI is a backedge.
+ if (OutVal.isUnjoinedPHI() && OutVal.BlockNo != MBB.getNumber())
+ return false;
+
+ if (!FirstValue.Properties.isJoinable(OutVal.Properties))
+ return false;
+
+ for (unsigned Idx = 0; Idx < FirstValue.getLocationOpCount(); ++Idx) {
+ // An unjoined PHI has no defined locations, and so a shared location must
+ // be found for every operand.
+ if (OutVal.isUnjoinedPHI()) {
+ LocOpsToJoin.insert(Idx);
+ continue;
+ }
+ DbgOpID FirstValOp = FirstValue.getDbgOpID(Idx);
+ DbgOpID OutValOp = OutVal.getDbgOpID(Idx);
+ if (FirstValOp != OutValOp) {
+ // We can never join constant ops - the ops must either both be equal
+ // constant ops or non-const ops.
+ if (FirstValOp.isConst() || OutValOp.isConst())
+ return false;
+ else
+ LocOpsToJoin.insert(Idx);
+ }
+ }
+ }
+
+ SmallVector<DbgOpID> NewDbgOps;
+
+ for (unsigned Idx = 0; Idx < FirstValue.getLocationOpCount(); ++Idx) {
+ // If this op doesn't need to be joined because the values agree, use that
+ // already-agreed value.
+ if (!LocOpsToJoin.contains(Idx)) {
+ NewDbgOps.push_back(FirstValue.getDbgOpID(Idx));
+ continue;
+ }
+
+ std::optional<ValueIDNum> JoinedOpLoc =
+ pickOperandPHILoc(Idx, MBB, LiveOuts, MOutLocs, BlockOrders);
+
+ if (!JoinedOpLoc)
+ return false;
+
+ NewDbgOps.push_back(DbgOpStore.insert(*JoinedOpLoc));
+ }
+
+ OutValues.append(NewDbgOps);
+ return true;
+}
+
+std::optional<ValueIDNum> InstrRefBasedLDV::pickOperandPHILoc(
+ unsigned DbgOpIdx, const MachineBasicBlock &MBB, const LiveIdxT &LiveOuts,
+ FuncValueTable &MOutLocs,
+ const SmallVectorImpl<const MachineBasicBlock *> &BlockOrders) {
+
+  // Collect, for each predecessor, the set of locations where its live-out
+  // value can be found.
+ SmallVector<SmallVector<LocIdx, 4>, 8> Locs;
+ unsigned NumLocs = MTracker->getNumLocs();
+
+ for (const auto p : BlockOrders) {
+ unsigned ThisBBNum = p->getNumber();
+ auto OutValIt = LiveOuts.find(p);
+ assert(OutValIt != LiveOuts.end());
+ const DbgValue &OutVal = *OutValIt->second;
+ DbgOpID OutValOpID = OutVal.getDbgOpID(DbgOpIdx);
+ DbgOp OutValOp = DbgOpStore.find(OutValOpID);
+ assert(!OutValOp.IsConst);
+
+ // Create new empty vector of locations.
+ Locs.resize(Locs.size() + 1);
+
+ // If the live-in value is a def, find the locations where that value is
+ // present. Do the same for VPHIs where we know the VPHI value.
+ if (OutVal.Kind == DbgValue::Def ||
+ (OutVal.Kind == DbgValue::VPHI && OutVal.BlockNo != MBB.getNumber() &&
+ !OutValOp.isUndef())) {
+ ValueIDNum ValToLookFor = OutValOp.ID;
+ // Search the live-outs of the predecessor for the specified value.
+ for (unsigned int I = 0; I < NumLocs; ++I) {
+ if (MOutLocs[ThisBBNum][I] == ValToLookFor)
+ Locs.back().push_back(LocIdx(I));
+ }
+ } else {
+ assert(OutVal.Kind == DbgValue::VPHI);
+ // Otherwise: this is a VPHI on a backedge feeding back into itself, i.e.
+ // a value that's live-through the whole loop. (It has to be a backedge:
+ // the only way this block's own VPHI can appear as a predecessor's
+ // live-out is if the value flowed around the loop back to that
+ // predecessor.) We can accept as a PHI location any location where the
+ // other predecessors agree, _and_ the machine locations feed back into
+ // themselves. Therefore, add all self-looping machine-value PHI locations.
+ for (unsigned int I = 0; I < NumLocs; ++I) {
+ ValueIDNum MPHI(MBB.getNumber(), 0, LocIdx(I));
+ if (MOutLocs[ThisBBNum][I] == MPHI)
+ Locs.back().push_back(LocIdx(I));
+ }
+ }
+ }
+ // We should have found locations for all predecessors, or returned.
+ assert(Locs.size() == BlockOrders.size());
+
+ // Starting with the first set of locations, take the intersection with
+ // subsequent sets.
+ SmallVector<LocIdx, 4> CandidateLocs = Locs[0];
+ for (unsigned int I = 1; I < Locs.size(); ++I) {
+ auto &LocVec = Locs[I];
+ SmallVector<LocIdx, 4> NewCandidates;
+ std::set_intersection(CandidateLocs.begin(), CandidateLocs.end(),
+ LocVec.begin(), LocVec.end(), std::inserter(NewCandidates, NewCandidates.begin()));
+ CandidateLocs = NewCandidates;
+ }
+ if (CandidateLocs.empty())
+ return std::nullopt;
+
+ // We now have a set of LocIdxes that contain the right output value in
+ // each of the predecessors. Pick the lowest; if there's a register loc,
+ // that'll be it.
+ LocIdx L = *CandidateLocs.begin();
+
+ // Return a PHI-value-number for the found location.
+ ValueIDNum PHIVal = {(unsigned)MBB.getNumber(), 0, L};
+ return PHIVal;
+}
+
+bool InstrRefBasedLDV::vlocJoin(
+ MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs,
+ SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore,
+ DbgValue &LiveIn) {
+ LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
+ bool Changed = false;
+
+ // Order predecessors by RPOT order, for exploring them in that order.
+ SmallVector<MachineBasicBlock *, 8> BlockOrders(MBB.predecessors());
+
+ auto Cmp = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
+ return BBToOrder[A] < BBToOrder[B];
+ };
+
+ llvm::sort(BlockOrders, Cmp);
+
+ unsigned CurBlockRPONum = BBToOrder[&MBB];
+
+ // Collect all the incoming DbgValues for this variable, from predecessor
+ // live-out values.
+ SmallVector<InValueT, 8> Values;
+ bool Bail = false;
+ int BackEdgesStart = 0;
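+ // Predecessors with an RPO number below ours reach us via forward edges;
+ // because BlockOrders is sorted by RPO, they occupy the front of Values,
+ // so BackEdgesStart ends up indexing the first backedge predecessor.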
+ for (auto *p : BlockOrders) {
+ // If the predecessor isn't in scope / to be explored, we'll never be
+ // able to join any locations.
+ if (!BlocksToExplore.contains(p)) {
+ Bail = true;
+ break;
+ }
+
+ // All Live-outs will have been initialized.
+ DbgValue &OutLoc = *VLOCOutLocs.find(p)->second;
+
+ // Keep track of where back-edges begin in the Values vector. Relies on
+ // BlockOrders being sorted by RPO.
+ unsigned ThisBBRPONum = BBToOrder[p];
+ if (ThisBBRPONum < CurBlockRPONum)
+ ++BackEdgesStart;
+
+ Values.push_back(std::make_pair(p, &OutLoc));
+ }
+
+ // If there were no values, or one of the predecessors couldn't have a
+ // value, then give up immediately. It's not safe to produce a live-in
+ // value. Leave it as whatever it was before.
+ if (Bail || Values.size() == 0)
+ return false;
+
+ // All (non-entry) blocks have at least one non-backedge predecessor.
+ // Pick the variable value from the first of these, to compare against
+ // all others.
+ const DbgValue &FirstVal = *Values[0].second;
+
+ // If the old live-in value is not a PHI then either a) no PHI is needed
+ // here, or b) we eliminated the PHI that was here. Either way, we can just
+ // propagate in the first parent's incoming value.
+ if (LiveIn.Kind != DbgValue::VPHI || LiveIn.BlockNo != MBB.getNumber()) {
+ Changed = LiveIn != FirstVal;
+ if (Changed)
+ LiveIn = FirstVal;
+ return Changed;
+ }
+
+ // Scan for variable values that can never be resolved: if they have
+ // different DIExpressions, different indirectness, or are mixed constants /
+ // non-constants.
+ for (const auto &V : Values) {
+ if (!V.second->Properties.isJoinable(FirstVal.Properties))
+ return false;
+ if (V.second->Kind == DbgValue::NoVal)
+ return false;
+ if (!V.second->hasJoinableLocOps(FirstVal))
+ return false;
+ }
+
+ // Try to eliminate this PHI. Do the incoming values all agree?
+ bool Disagree = false;
+ for (auto &V : Values) {
+ if (*V.second == FirstVal)
+ continue; // No disagreement.
+
+ // If the two values are not equal but have equal non-empty IDs then they refer
+ // to the same value from different sources (e.g. one is VPHI and the other
+ // is Def), which does not cause disagreement.
+ if (V.second->hasIdenticalValidLocOps(FirstVal))
+ continue;
+
+ // Eliminate if a backedge feeds a VPHI back into itself.
+ if (V.second->Kind == DbgValue::VPHI &&
+ V.second->BlockNo == MBB.getNumber() &&
+ // Is this a backedge?
+ std::distance(Values.begin(), &V) >= BackEdgesStart)
+ continue;
+
+ Disagree = true;
+ }
+
+ // No disagreement -> live-through value.
+ if (!Disagree) {
+ Changed = LiveIn != FirstVal;
+ if (Changed)
+ LiveIn = FirstVal;
+ return Changed;
+ } else {
+ // Otherwise use a VPHI.
+ DbgValue VPHI(MBB.getNumber(), FirstVal.Properties, DbgValue::VPHI);
+ Changed = LiveIn != VPHI;
+ if (Changed)
+ LiveIn = VPHI;
+ return Changed;
+ }
+}
+
+void InstrRefBasedLDV::getBlocksForScope(
+ const DILocation *DILoc,
+ SmallPtrSetImpl<const MachineBasicBlock *> &BlocksToExplore,
+ const SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks) {
+ // Get the set of "normal" in-lexical-scope blocks.
+ LS.getMachineBasicBlocks(DILoc, BlocksToExplore);
+
+ // VarLoc LiveDebugValues tracks variable locations that are defined in
+ // blocks not in scope. This is something we could legitimately ignore, but
+ // let's allow it for now for the sake of coverage.
+ BlocksToExplore.insert(AssignBlocks.begin(), AssignBlocks.end());
+
+ // Storage for artificial blocks we intend to add to BlocksToExplore.
+ DenseSet<const MachineBasicBlock *> ToAdd;
+
+ // To avoid needlessly dropping large volumes of variable locations, propagate
+ // variables through artificial blocks, i.e. those that don't have any
+ // instructions in scope at all. To accurately replicate VarLoc
+ // LiveDebugValues, this means exploring all artificial successors too.
+ // Perform a depth-first-search to enumerate those blocks.
+ for (const auto *MBB : BlocksToExplore) {
+ // Depth-first-search state: each node is a block and which successor
+ // we're currently exploring.
+ SmallVector<std::pair<const MachineBasicBlock *,
+ MachineBasicBlock::const_succ_iterator>,
+ 8>
+ DFS;
+
+ // Find any artificial successors not already tracked.
+ for (auto *succ : MBB->successors()) {
+ if (BlocksToExplore.count(succ))
+ continue;
+ if (!ArtificialBlocks.count(succ))
+ continue;
+ ToAdd.insert(succ);
+ DFS.push_back({succ, succ->succ_begin()});
+ }
+
+ // Search all those blocks, depth first.
+ while (!DFS.empty()) {
+ const MachineBasicBlock *CurBB = DFS.back().first;
+ MachineBasicBlock::const_succ_iterator &CurSucc = DFS.back().second;
+ // Walk back if we've explored this block's successors to the end.
+ if (CurSucc == CurBB->succ_end()) {
+ DFS.pop_back();
+ continue;
+ }
+
+ // If the current successor is artificial and unexplored, descend into
+ // it.
+ if (!ToAdd.count(*CurSucc) && ArtificialBlocks.count(*CurSucc)) {
+ ToAdd.insert(*CurSucc);
+ DFS.push_back({*CurSucc, (*CurSucc)->succ_begin()});
+ continue;
+ }
+
+ ++CurSucc;
+ }
+ }
+
+ BlocksToExplore.insert(ToAdd.begin(), ToAdd.end());
+}
+
+void InstrRefBasedLDV::buildVLocValueMap(
+ const DILocation *DILoc, const SmallSet<DebugVariable, 4> &VarsWeCareAbout,
+ SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks, LiveInsT &Output,
+ FuncValueTable &MOutLocs, FuncValueTable &MInLocs,
+ SmallVectorImpl<VLocTracker> &AllTheVLocs) {
+ // This method is much like buildMLocValueMap, but focuses on a single
+ // LexicalScope at a time. Pick out a set of blocks and variables that are
+ // to have their value assignments solved, then run our dataflow algorithm
+ // until a fixed point is reached.
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Worklist, Pending;
+ SmallPtrSet<MachineBasicBlock *, 16> OnWorklist, OnPending;
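+ // Worklist holds the RPO numbers of blocks to visit during the current
+ // dataflow iteration; successors reached along backedges are parked on
+ // Pending and swapped in once the current Worklist drains.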
+
+ // The set of blocks we'll be examining.
+ SmallPtrSet<const MachineBasicBlock *, 8> BlocksToExplore;
+
+ // The order in which to examine them (RPO).
+ SmallVector<MachineBasicBlock *, 8> BlockOrders;
+
+ // RPO ordering function.
+ auto Cmp = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
+ return BBToOrder[A] < BBToOrder[B];
+ };
+
+ getBlocksForScope(DILoc, BlocksToExplore, AssignBlocks);
+
+ // Single block scope: not interesting! No propagation at all. Note that
+ // this could probably go above ArtificialBlocks without damage, but
+ // that then produces output differences from original-live-debug-values,
+ // which propagates from a single block into many artificial ones.
+ if (BlocksToExplore.size() == 1)
+ return;
+
+ // Convert a const set to a non-const set. LexicalScopes
+ // getMachineBasicBlocks returns const MBB pointers, IDF wants mutable ones.
+ // (Neither of them mutates anything.)
+ SmallPtrSet<MachineBasicBlock *, 8> MutBlocksToExplore;
+ for (const auto *MBB : BlocksToExplore)
+ MutBlocksToExplore.insert(const_cast<MachineBasicBlock *>(MBB));
+
+ // Pick out the relevant blocks and sort them into RPO order.
+ for (const auto *MBB : BlocksToExplore)
+ BlockOrders.push_back(const_cast<MachineBasicBlock *>(MBB));
+
+ llvm::sort(BlockOrders, Cmp);
+ unsigned NumBlocks = BlockOrders.size();
+
+ // Allocate some vectors for storing the live-ins and live-outs; they can be large.
+ SmallVector<DbgValue, 32> LiveIns, LiveOuts;
+ LiveIns.reserve(NumBlocks);
+ LiveOuts.reserve(NumBlocks);
+
+ // Initialize all values to start as NoVals. This signifies "it's live
+ // through, but we don't know what it is".
+ DbgValueProperties EmptyProperties(EmptyExpr, false, false);
+ for (unsigned int I = 0; I < NumBlocks; ++I) {
+ DbgValue EmptyDbgValue(I, EmptyProperties, DbgValue::NoVal);
+ LiveIns.push_back(EmptyDbgValue);
+ LiveOuts.push_back(EmptyDbgValue);
+ }
+
+ // Produce by-MBB indexes of live-in/live-outs, to ease lookup within
+ // vlocJoin.
+ LiveIdxT LiveOutIdx, LiveInIdx;
+ LiveOutIdx.reserve(NumBlocks);
+ LiveInIdx.reserve(NumBlocks);
+ for (unsigned I = 0; I < NumBlocks; ++I) {
+ LiveOutIdx[BlockOrders[I]] = &LiveOuts[I];
+ LiveInIdx[BlockOrders[I]] = &LiveIns[I];
+ }
+
+ // Loop over each variable and place PHIs for it, then propagate values
+ // between blocks. This keeps the locality of working on one lexical scope at
+ // a time, but avoids re-processing variable values because some other
+ // variable has been assigned.
+ for (const auto &Var : VarsWeCareAbout) {
+ // Re-initialize live-ins and live-outs, to clear the remains of previous
+ // variables' live-ins / live-outs.
+ for (unsigned int I = 0; I < NumBlocks; ++I) {
+ DbgValue EmptyDbgValue(I, EmptyProperties, DbgValue::NoVal);
+ LiveIns[I] = EmptyDbgValue;
+ LiveOuts[I] = EmptyDbgValue;
+ }
+
+ // Place PHIs for variable values, using the LLVM IDF calculator.
+ // Collect the set of blocks where this variable is def'd.
+ SmallPtrSet<MachineBasicBlock *, 32> DefBlocks;
+ for (const MachineBasicBlock *ExpMBB : BlocksToExplore) {
+ auto &TransferFunc = AllTheVLocs[ExpMBB->getNumber()].Vars;
+ if (TransferFunc.contains(Var))
+ DefBlocks.insert(const_cast<MachineBasicBlock *>(ExpMBB));
+ }
+
+ SmallVector<MachineBasicBlock *, 32> PHIBlocks;
+
+ // Request the set of PHIs we should insert for this variable. If there's
+ // only one value definition, things are very simple.
+ if (DefBlocks.size() == 1) {
+ placePHIsForSingleVarDefinition(MutBlocksToExplore, *DefBlocks.begin(),
+ AllTheVLocs, Var, Output);
+ continue;
+ }
+
+ // Otherwise: we need to place PHIs through SSA and propagate values.
+ BlockPHIPlacement(MutBlocksToExplore, DefBlocks, PHIBlocks);
+
+ // Insert PHIs into the per-block live-in tables for this variable.
+ for (MachineBasicBlock *PHIMBB : PHIBlocks) {
+ unsigned BlockNo = PHIMBB->getNumber();
+ DbgValue *LiveIn = LiveInIdx[PHIMBB];
+ *LiveIn = DbgValue(BlockNo, EmptyProperties, DbgValue::VPHI);
+ }
+
+ for (auto *MBB : BlockOrders) {
+ Worklist.push(BBToOrder[MBB]);
+ OnWorklist.insert(MBB);
+ }
+
+ // Iterate over all the blocks we selected, propagating the variable's value.
+ // This loop does two things:
+ // * Eliminates unnecessary VPHIs in vlocJoin,
+ // * Evaluates the block's transfer function (i.e. variable assignments) and
+ // stores the result to the block's live-outs.
+ // Always evaluate the transfer function on the first iteration, and when
+ // the live-ins change thereafter.
+ bool FirstTrip = true;
+ while (!Worklist.empty() || !Pending.empty()) {
+ while (!Worklist.empty()) {
+ auto *MBB = OrderToBB[Worklist.top()];
+ CurBB = MBB->getNumber();
+ Worklist.pop();
+
+ auto LiveInsIt = LiveInIdx.find(MBB);
+ assert(LiveInsIt != LiveInIdx.end());
+ DbgValue *LiveIn = LiveInsIt->second;
+
+ // Join values from predecessors: the joined live-in value is written
+ // back into *LiveIn, and the return value says whether it changed.
+ bool InLocsChanged =
+ vlocJoin(*MBB, LiveOutIdx, BlocksToExplore, *LiveIn);
+
+ SmallVector<const MachineBasicBlock *, 8> Preds;
+ for (const auto *Pred : MBB->predecessors())
+ Preds.push_back(Pred);
+
+ // If this block's live-in value is a VPHI, try to pick a machine-value
+ // for it. This makes the machine-value available and propagated
+ // through all blocks by the time value propagation finishes. We can't
+ // do this any earlier as it needs to read the block live-outs.
+ if (LiveIn->Kind == DbgValue::VPHI && LiveIn->BlockNo == (int)CurBB) {
+ // There's a small possibility that on a preceding path, a VPHI is
+ // eliminated and transitions from VPHI-with-location to
+ // live-through-value. As a result, the selected location of any VPHI
+ // might change, so we need to re-compute it on each iteration.
+ SmallVector<DbgOpID> JoinedOps;
+
+ if (pickVPHILoc(JoinedOps, *MBB, LiveOutIdx, MOutLocs, Preds)) {
+ bool NewLocPicked = !equal(LiveIn->getDbgOpIDs(), JoinedOps);
+ InLocsChanged |= NewLocPicked;
+ if (NewLocPicked)
+ LiveIn->setDbgOpIDs(JoinedOps);
+ }
+ }
+
+ if (!InLocsChanged && !FirstTrip)
+ continue;
+
+ DbgValue *LiveOut = LiveOutIdx[MBB];
+ bool OLChanged = false;
+
+ // Do transfer function.
+ auto &VTracker = AllTheVLocs[MBB->getNumber()];
+ auto TransferIt = VTracker.Vars.find(Var);
+ if (TransferIt != VTracker.Vars.end()) {
+ // Erase on empty transfer (DBG_VALUE $noreg).
+ if (TransferIt->second.Kind == DbgValue::Undef) {
+ DbgValue NewVal(MBB->getNumber(), EmptyProperties, DbgValue::NoVal);
+ if (*LiveOut != NewVal) {
+ *LiveOut = NewVal;
+ OLChanged = true;
+ }
+ } else {
+ // Insert new variable value; or overwrite.
+ if (*LiveOut != TransferIt->second) {
+ *LiveOut = TransferIt->second;
+ OLChanged = true;
+ }
+ }
+ } else {
+ // Just copy live-ins to live-outs, for anything not transferred.
+ if (*LiveOut != *LiveIn) {
+ *LiveOut = *LiveIn;
+ OLChanged = true;
+ }
+ }
+
+ // If no live-out value changed, there's no need to explore further.
+ if (!OLChanged)
+ continue;
+
+ // We should visit all successors. Ensure we'll visit any non-backedge
+ // successors during this dataflow iteration; book backedge successors
+ // to be visited next time around.
+ for (auto *s : MBB->successors()) {
+ // Ignore out of scope / not-to-be-explored successors.
+ if (!LiveInIdx.contains(s))
+ continue;
+
+ if (BBToOrder[s] > BBToOrder[MBB]) {
+ if (OnWorklist.insert(s).second)
+ Worklist.push(BBToOrder[s]);
+ } else if (OnPending.insert(s).second && (FirstTrip || OLChanged)) {
+ Pending.push(BBToOrder[s]);
+ }
+ }
+ }
+ Worklist.swap(Pending);
+ std::swap(OnWorklist, OnPending);
+ OnPending.clear();
+ assert(Pending.empty());
+ FirstTrip = false;
+ }
+
+ // Save live-ins to output vector. Ignore any that are still marked as being
+ // VPHIs with no location -- those are variables that we know the value of,
+ // but are not actually available in the register file.
+ for (auto *MBB : BlockOrders) {
+ DbgValue *BlockLiveIn = LiveInIdx[MBB];
+ if (BlockLiveIn->Kind == DbgValue::NoVal)
+ continue;
+ if (BlockLiveIn->isUnjoinedPHI())
+ continue;
+ if (BlockLiveIn->Kind == DbgValue::VPHI)
+ BlockLiveIn->Kind = DbgValue::Def;
+ assert(BlockLiveIn->Properties.DIExpr->getFragmentInfo() ==
+ Var.getFragment() && "Fragment info missing during value prop");
+ Output[MBB->getNumber()].push_back(std::make_pair(Var, *BlockLiveIn));
+ }
+ } // Per-variable loop.
+
+ BlockOrders.clear();
+ BlocksToExplore.clear();
+}
+
+void InstrRefBasedLDV::placePHIsForSingleVarDefinition(
+ const SmallPtrSetImpl<MachineBasicBlock *> &InScopeBlocks,
+ MachineBasicBlock *AssignMBB, SmallVectorImpl<VLocTracker> &AllTheVLocs,
+ const DebugVariable &Var, LiveInsT &Output) {
+ // If there is a single definition of the variable, then working out its
+ // value everywhere is very simple: the value holds in every block dominated
+ // by the definition. At the dominance frontier, the usual algorithm would:
+ // * Place PHIs,
+ // * Propagate values into them,
+ // * Find there's no incoming variable value from the other incoming branches
+ // of the dominance frontier,
+ // * Specify there's no variable value in blocks past the frontier.
+ // This is a common case, hence it's worth special-casing it.
+
+ // Pick out the variable's value from the block transfer function.
+ VLocTracker &VLocs = AllTheVLocs[AssignMBB->getNumber()];
+ auto ValueIt = VLocs.Vars.find(Var);
+ const DbgValue &Value = ValueIt->second;
+
+ // If it's an explicit assignment of "undef", that means there is no location
+ // anyway, anywhere.
+ if (Value.Kind == DbgValue::Undef)
+ return;
+
+ // Assign the variable value on entry to each dominated block that's in scope.
+ // Skip the definition block -- it's assigned the variable value in the middle
+ // of the block somewhere.
+ for (auto *ScopeBlock : InScopeBlocks) {
+ if (!DomTree->properlyDominates(AssignMBB, ScopeBlock))
+ continue;
+
+ Output[ScopeBlock->getNumber()].push_back({Var, Value});
+ }
+
+ // All blocks that aren't dominated have no live-in value, thus no variable
+ // value will be given to them.
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void InstrRefBasedLDV::dump_mloc_transfer(
+ const MLocTransferMap &mloc_transfer) const {
+ for (const auto &P : mloc_transfer) {
+ std::string LocName = MTracker->LocIdxToName(P.first);
+ std::string ValueName = MTracker->IDAsString(P.second);
+ dbgs() << "Loc " << LocName << " --> " << ValueName << "\n";
+ }
+}
+#endif
+
+void InstrRefBasedLDV::initialSetup(MachineFunction &MF) {
+ // Build some useful data structures.
+
+ LLVMContext &Context = MF.getFunction().getContext();
+ EmptyExpr = DIExpression::get(Context, {});
+
+ auto hasNonArtificialLocation = [](const MachineInstr &MI) -> bool {
+ if (const DebugLoc &DL = MI.getDebugLoc())
+ return DL.getLine() != 0;
+ return false;
+ };
+ // Collect a set of all the artificial blocks.
+ for (auto &MBB : MF)
+ if (none_of(MBB.instrs(), hasNonArtificialLocation))
+ ArtificialBlocks.insert(&MBB);
+
+ // Compute mappings of block <=> RPO order.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ unsigned int RPONumber = 0;
+ auto processMBB = [&](MachineBasicBlock *MBB) {
+ OrderToBB[RPONumber] = MBB;
+ BBToOrder[MBB] = RPONumber;
+ BBNumToRPO[MBB->getNumber()] = RPONumber;
+ ++RPONumber;
+ };
+ for (MachineBasicBlock *MBB : RPOT)
+ processMBB(MBB);
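+ // Blocks that the RPO traversal didn't reach are unreachable from the
+ // entry block; give them ordering numbers after all reachable blocks so
+ // that every block has an entry in these maps.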
+ for (MachineBasicBlock &MBB : MF)
+ if (!BBToOrder.contains(&MBB))
+ processMBB(&MBB);
+
+ // Order value substitutions by their "source" operand pair, for quick lookup.
+ llvm::sort(MF.DebugValueSubstitutions);
+
+#ifdef EXPENSIVE_CHECKS
+ // As an expensive check, test whether there are any duplicate substitution
+ // sources in the collection.
+ if (MF.DebugValueSubstitutions.size() > 2) {
+ for (auto It = MF.DebugValueSubstitutions.begin();
+ It != std::prev(MF.DebugValueSubstitutions.end()); ++It) {
+ assert(It->Src != std::next(It)->Src && "Duplicate variable location "
+ "substitution seen");
+ }
+ }
+#endif
+}
+
+// Produce an "ejection map" for blocks, i.e., what's the highest-numbered
+// lexical scope it's used in. When exploring in DFS order and we pass that
+// scope, the block can be processed and any tracking information freed.
+void InstrRefBasedLDV::makeDepthFirstEjectionMap(
+ SmallVectorImpl<unsigned> &EjectionMap,
+ const ScopeToDILocT &ScopeToDILocation,
+ ScopeToAssignBlocksT &ScopeToAssignBlocks) {
+ SmallPtrSet<const MachineBasicBlock *, 8> BlocksToExplore;
+ SmallVector<std::pair<LexicalScope *, ssize_t>, 4> WorkStack;
+ auto *TopScope = LS.getCurrentFunctionScope();
+
+ // Unlike lexical scope explorers, we explore in reverse order, to find the
+ // "last" lexical scope used for each block early.
+ WorkStack.push_back({TopScope, TopScope->getChildren().size() - 1});
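+ // Each stack entry pairs a scope with the index of the next child to
+ // visit; the index counts down, so children are visited last-to-first, and
+ // a negative index means the scope itself is ready to be processed.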
+
+ while (!WorkStack.empty()) {
+ auto &ScopePosition = WorkStack.back();
+ LexicalScope *WS = ScopePosition.first;
+ ssize_t ChildNum = ScopePosition.second--;
+
+ const SmallVectorImpl<LexicalScope *> &Children = WS->getChildren();
+ if (ChildNum >= 0) {
+ // If ChildNum is non-negative, there are remaining children to explore.
+ // Push the child and its children-count onto the stack.
+ auto &ChildScope = Children[ChildNum];
+ WorkStack.push_back(
+ std::make_pair(ChildScope, ChildScope->getChildren().size() - 1));
+ } else {
+ WorkStack.pop_back();
+
+ // We've explored all children and any later blocks: examine all blocks
+ // in our scope. If they haven't yet had an ejection number set, then
+ // this scope will be the last to use that block.
+ auto DILocationIt = ScopeToDILocation.find(WS);
+ if (DILocationIt != ScopeToDILocation.end()) {
+ getBlocksForScope(DILocationIt->second, BlocksToExplore,
+ ScopeToAssignBlocks.find(WS)->second);
+ for (const auto *MBB : BlocksToExplore) {
+ unsigned BBNum = MBB->getNumber();
+ if (EjectionMap[BBNum] == 0)
+ EjectionMap[BBNum] = WS->getDFSOut();
+ }
+
+ BlocksToExplore.clear();
+ }
+ }
+ }
+}
+
+bool InstrRefBasedLDV::depthFirstVLocAndEmit(
+ unsigned MaxNumBlocks, const ScopeToDILocT &ScopeToDILocation,
+ const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToAssignBlocks,
+ LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs,
+ SmallVectorImpl<VLocTracker> &AllTheVLocs, MachineFunction &MF,
+ DenseMap<DebugVariable, unsigned> &AllVarsNumbering,
+ const TargetPassConfig &TPC) {
+ TTracker = new TransferTracker(TII, MTracker, MF, *TRI, CalleeSavedRegs, TPC);
+ unsigned NumLocs = MTracker->getNumLocs();
+ VTracker = nullptr;
+
+ // No scopes? No variable locations.
+ if (!LS.getCurrentFunctionScope())
+ return false;
+
+ // Build map from block number to the last scope that uses the block.
+ SmallVector<unsigned, 16> EjectionMap;
+ EjectionMap.resize(MaxNumBlocks, 0);
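+ // A zero entry in the ejection map means no scope was found to use that
+ // block; the scope walk below only ejects blocks whose recorded scope it
+ // is leaving, and any leftover artificial blocks are swept up at the end
+ // of this function.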
+ makeDepthFirstEjectionMap(EjectionMap, ScopeToDILocation,
+ ScopeToAssignBlocks);
+
+ // Helper lambda for ejecting a block -- if nothing is going to use the block,
+ // we can translate the variable location information into DBG_VALUEs and then
+ // free all of InstrRefBasedLDV's data structures.
+ auto EjectBlock = [&](MachineBasicBlock &MBB) -> void {
+ unsigned BBNum = MBB.getNumber();
+ AllTheVLocs[BBNum].clear();
+
+ // Prime the transfer-tracker, and then step through all the block
+ // instructions, installing transfers.
+ MTracker->reset();
+ MTracker->loadFromArray(MInLocs[BBNum], BBNum);
+ TTracker->loadInlocs(MBB, MInLocs[BBNum], DbgOpStore, Output[BBNum],
+ NumLocs);
+
+ CurBB = BBNum;
+ CurInst = 1;
+ for (auto &MI : MBB) {
+ process(MI, MOutLocs.get(), MInLocs.get());
+ TTracker->checkInstForNewValues(CurInst, MI.getIterator());
+ ++CurInst;
+ }
+
+ // Free machine-location tables for this block.
+ MInLocs[BBNum].reset();
+ MOutLocs[BBNum].reset();
+ // We don't need live-in variable values for this block either.
+ Output[BBNum].clear();
+ AllTheVLocs[BBNum].clear();
+ };
+
+ SmallPtrSet<const MachineBasicBlock *, 8> BlocksToExplore;
+ SmallVector<std::pair<LexicalScope *, ssize_t>, 4> WorkStack;
+ WorkStack.push_back({LS.getCurrentFunctionScope(), 0});
+ unsigned HighestDFSIn = 0;
+
+ // Proceed to explore in depth first order.
+ while (!WorkStack.empty()) {
+ auto &ScopePosition = WorkStack.back();
+ LexicalScope *WS = ScopePosition.first;
+ ssize_t ChildNum = ScopePosition.second++;
+
+ // We observe scopes with children twice here, once descending in, once
+ // ascending out of the scope nest. Use HighestDFSIn as a ratchet to ensure
+ // we don't process a scope twice. Additionally, ignore scopes that don't
+ // have a DILocation -- by proxy, this means we never tracked any variable
+ // assignments in that scope.
+ auto DILocIt = ScopeToDILocation.find(WS);
+ if (HighestDFSIn <= WS->getDFSIn() && DILocIt != ScopeToDILocation.end()) {
+ const DILocation *DILoc = DILocIt->second;
+ auto &VarsWeCareAbout = ScopeToVars.find(WS)->second;
+ auto &BlocksInScope = ScopeToAssignBlocks.find(WS)->second;
+
+ buildVLocValueMap(DILoc, VarsWeCareAbout, BlocksInScope, Output, MOutLocs,
+ MInLocs, AllTheVLocs);
+ }
+
+ HighestDFSIn = std::max(HighestDFSIn, WS->getDFSIn());
+
+ // Descend into any scope nests.
+ const SmallVectorImpl<LexicalScope *> &Children = WS->getChildren();
+ if (ChildNum < (ssize_t)Children.size()) {
+ // There are children to explore -- push onto stack and continue.
+ auto &ChildScope = Children[ChildNum];
+ WorkStack.push_back(std::make_pair(ChildScope, 0));
+ } else {
+ WorkStack.pop_back();
+
+ // We've explored a leaf, or have explored all the children of a scope.
+ // Try to eject any blocks where this is the last scope it's relevant to.
+ auto DILocationIt = ScopeToDILocation.find(WS);
+ if (DILocationIt == ScopeToDILocation.end())
+ continue;
+
+ getBlocksForScope(DILocationIt->second, BlocksToExplore,
+ ScopeToAssignBlocks.find(WS)->second);
+ for (const auto *MBB : BlocksToExplore)
+ if (WS->getDFSOut() == EjectionMap[MBB->getNumber()])
+ EjectBlock(const_cast<MachineBasicBlock &>(*MBB));
+
+ BlocksToExplore.clear();
+ }
+ }
+
+ // Some artificial blocks may not have been ejected, meaning they're not
+ // connected to an actual legitimate scope. This can technically happen
+ // with things like the entry block. In theory, we shouldn't need to do
+ // anything for such out-of-scope blocks, but for the sake of being similar
+ // to VarLocBasedLDV, eject these too.
+ for (auto *MBB : ArtificialBlocks)
+ if (MOutLocs[MBB->getNumber()])
+ EjectBlock(*MBB);
+
+ return emitTransfers(AllVarsNumbering);
+}
+
+bool InstrRefBasedLDV::emitTransfers(
+ DenseMap<DebugVariable, unsigned> &AllVarsNumbering) {
+ // Go through all the transfers recorded in the TransferTracker -- this is
+ // both the live-ins to a block, and any movements of values that happen
+ // in the middle.
+ for (const auto &P : TTracker->Transfers) {
+ // We have to insert DBG_VALUEs in a consistent order; otherwise they
+ // appear in DWARF in different orders. Use the order that they appear
+ // when walking through each block / each instruction, stored in
+ // AllVarsNumbering.
+ SmallVector<std::pair<unsigned, MachineInstr *>> Insts;
+ for (MachineInstr *MI : P.Insts) {
+ DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(),
+ MI->getDebugLoc()->getInlinedAt());
+ Insts.emplace_back(AllVarsNumbering.find(Var)->second, MI);
+ }
+ llvm::sort(Insts, llvm::less_first());
+
+ // Insert either before or after the designated point...
+ if (P.MBB) {
+ MachineBasicBlock &MBB = *P.MBB;
+ for (const auto &Pair : Insts)
+ MBB.insert(P.Pos, Pair.second);
+ } else {
+ // Terminators, like tail calls, can clobber things. Don't try to place
+ // transfers after them.
+ if (P.Pos->isTerminator())
+ continue;
+
+ MachineBasicBlock &MBB = *P.Pos->getParent();
+ for (const auto &Pair : Insts)
+ MBB.insertAfterBundle(P.Pos, Pair.second);
+ }
+ }
+
+ return TTracker->Transfers.size() != 0;
+}
+
+/// Calculate the liveness information for the given machine function and
+/// extend ranges across basic blocks.
+bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF,
+ MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC,
+ unsigned InputBBLimit,
+ unsigned InputDbgValLimit) {
+ // No subprogram means this function contains no debuginfo.
+ if (!MF.getFunction().getSubprogram())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "\nDebug Range Extension\n");
+ this->TPC = TPC;
+
+ this->DomTree = DomTree;
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ TFI = MF.getSubtarget().getFrameLowering();
+ TFI->getCalleeSaves(MF, CalleeSavedRegs);
+ MFI = &MF.getFrameInfo();
+ LS.initialize(MF);
+
+ const auto &STI = MF.getSubtarget();
+ AdjustsStackInCalls = MFI->adjustsStack() &&
+ STI.getFrameLowering()->stackProbeFunctionModifiesSP();
+ if (AdjustsStackInCalls)
+ StackProbeSymbolName = STI.getTargetLowering()->getStackProbeSymbolName(MF);
+
+ MTracker =
+ new MLocTracker(MF, *TII, *TRI, *MF.getSubtarget().getTargetLowering());
+ VTracker = nullptr;
+ TTracker = nullptr;
+
+ SmallVector<MLocTransferMap, 32> MLocTransfer;
+ SmallVector<VLocTracker, 8> vlocs;
+ LiveInsT SavedLiveIns;
+
+ int MaxNumBlocks = -1;
+ for (auto &MBB : MF)
+ MaxNumBlocks = std::max(MBB.getNumber(), MaxNumBlocks);
+ assert(MaxNumBlocks >= 0);
+ ++MaxNumBlocks;
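+ // MaxNumBlocks is now one past the highest block number, i.e. a size
+ // suitable for the per-block tables allocated below.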
+
+ initialSetup(MF);
+
+ MLocTransfer.resize(MaxNumBlocks);
+ vlocs.resize(MaxNumBlocks, VLocTracker(OverlapFragments, EmptyExpr));
+ SavedLiveIns.resize(MaxNumBlocks);
+
+ produceMLocTransferFunction(MF, MLocTransfer, MaxNumBlocks);
+
+ // Allocate and initialize two array-of-arrays for the live-in and live-out
+ // machine values. The outer dimension is the block number, while the inner
+ // dimension is a LocIdx from MLocTracker.
+ FuncValueTable MOutLocs = std::make_unique<ValueTable[]>(MaxNumBlocks);
+ FuncValueTable MInLocs = std::make_unique<ValueTable[]>(MaxNumBlocks);
+ unsigned NumLocs = MTracker->getNumLocs();
+ for (int i = 0; i < MaxNumBlocks; ++i) {
+ // These all auto-initialize to ValueIDNum::EmptyValue
+ MOutLocs[i] = std::make_unique<ValueIDNum[]>(NumLocs);
+ MInLocs[i] = std::make_unique<ValueIDNum[]>(NumLocs);
+ }
+
+ // Solve the machine value dataflow problem using the MLocTransfer function,
+ // storing the computed live-ins / live-outs into the array-of-arrays. We use
+ // both live-ins and live-outs for decision making in the variable value
+ // dataflow problem.
+ buildMLocValueMap(MF, MInLocs, MOutLocs, MLocTransfer);
+
+ // Patch up debug phi numbers, turning unknown block-live-in values into
+ // either live-through machine values, or PHIs.
+ for (auto &DBG_PHI : DebugPHINumToValue) {
+ // Identify unresolved block-live-ins.
+ if (!DBG_PHI.ValueRead)
+ continue;
+
+ ValueIDNum &Num = *DBG_PHI.ValueRead;
+ if (!Num.isPHI())
+ continue;
+
+ unsigned BlockNo = Num.getBlock();
+ LocIdx LocNo = Num.getLoc();
+ ValueIDNum ResolvedValue = MInLocs[BlockNo][LocNo.asU64()];
+ // If there is no resolved value for this live-in then it is not directly
+ // reachable from the entry block -- model it as a PHI on entry to this
+ // block, which means we leave the ValueIDNum unchanged.
+ if (ResolvedValue != ValueIDNum::EmptyValue)
+ Num = ResolvedValue;
+ }
+ // Later, we'll be looking up ranges of instruction numbers.
+ llvm::sort(DebugPHINumToValue);
+
+ // Walk back through each block / instruction, collecting DBG_VALUE
+ // instructions and recording what machine value their operands refer to.
+ for (auto &OrderPair : OrderToBB) {
+ MachineBasicBlock &MBB = *OrderPair.second;
+ CurBB = MBB.getNumber();
+ VTracker = &vlocs[CurBB];
+ VTracker->MBB = &MBB;
+ MTracker->loadFromArray(MInLocs[CurBB], CurBB);
+ CurInst = 1;
+ for (auto &MI : MBB) {
+ process(MI, MOutLocs.get(), MInLocs.get());
+ ++CurInst;
+ }
+ MTracker->reset();
+ }
+
+ // Number all variables in the order that they appear, to be used as a stable
+ // insertion order later.
+ DenseMap<DebugVariable, unsigned> AllVarsNumbering;
+
+ // Map from one LexicalScope to all the variables in that scope.
+ ScopeToVarsT ScopeToVars;
+
+ // Map from one lexical scope to all blocks where assignments happen for
+ // that scope.
+ ScopeToAssignBlocksT ScopeToAssignBlocks;
+
+ // Store a map from each scope to the DILocation that describes it.
+ ScopeToDILocT ScopeToDILocation;
+
+ // To mirror old LiveDebugValues, enumerate variables in RPOT order. Otherwise
+ // the order is unimportant, it just has to be stable.
+ unsigned VarAssignCount = 0;
+ for (unsigned int I = 0; I < OrderToBB.size(); ++I) {
+ auto *MBB = OrderToBB[I];
+ auto *VTracker = &vlocs[MBB->getNumber()];
+ // Collect each variable with a DBG_VALUE in this block.
+ for (auto &idx : VTracker->Vars) {
+ const auto &Var = idx.first;
+ const DILocation *ScopeLoc = VTracker->Scopes[Var];
+ assert(ScopeLoc != nullptr);
+ auto *Scope = LS.findLexicalScope(ScopeLoc);
+
+ // No insts in scope -> shouldn't have been recorded.
+ assert(Scope != nullptr);
+
+ AllVarsNumbering.insert(std::make_pair(Var, AllVarsNumbering.size()));
+ ScopeToVars[Scope].insert(Var);
+ ScopeToAssignBlocks[Scope].insert(VTracker->MBB);
+ ScopeToDILocation[Scope] = ScopeLoc;
+ ++VarAssignCount;
+ }
+ }
+
+ bool Changed = false;
+
+ // If we have an extremely large number of variable assignments and blocks,
+ // bail out at this point. We've already burnt some time doing analysis,
+ // but we should cut our losses.
+ if ((unsigned)MaxNumBlocks > InputBBLimit &&
+ VarAssignCount > InputDbgValLimit) {
+ LLVM_DEBUG(dbgs() << "Disabling InstrRefBasedLDV: " << MF.getName()
+ << " has " << MaxNumBlocks << " basic blocks and "
+ << VarAssignCount
+ << " variable assignments, exceeding limits.\n");
+ } else {
+ // Solve the variable value problem and emit locations to blocks, using a
+ // depth-first search over the lexical scope tree.
+ Changed = depthFirstVLocAndEmit(
+ MaxNumBlocks, ScopeToDILocation, ScopeToVars, ScopeToAssignBlocks,
+ SavedLiveIns, MOutLocs, MInLocs, vlocs, MF, AllVarsNumbering, *TPC);
+ }
+
+ delete MTracker;
+ delete TTracker;
+ MTracker = nullptr;
+ VTracker = nullptr;
+ TTracker = nullptr;
+
+ ArtificialBlocks.clear();
+ OrderToBB.clear();
+ BBToOrder.clear();
+ BBNumToRPO.clear();
+ DebugInstrNumToInstr.clear();
+ DebugPHINumToValue.clear();
+ OverlapFragments.clear();
+ SeenFragments.clear();
+ SeenDbgPHIs.clear();
+ DbgOpStore.clear();
+
+ return Changed;
+}
+
+LDVImpl *llvm::makeInstrRefBasedLiveDebugValues() {
+ return new InstrRefBasedLDV();
+}
+
+namespace {
+class LDVSSABlock;
+class LDVSSAUpdater;
+
+// Pick a type to identify incoming block values as we construct SSA. We
+// can't use anything more robust than an integer unfortunately, as SSAUpdater
+// expects to zero-initialize the type.
+typedef uint64_t BlockValueNum;
+
+/// Represents an SSA PHI node for the SSA updater class. Contains the block
+/// this PHI is in, the value number it would have, and the expected incoming
+/// values from parent blocks.
+class LDVSSAPhi {
+public:
+ SmallVector<std::pair<LDVSSABlock *, BlockValueNum>, 4> IncomingValues;
+ LDVSSABlock *ParentBlock;
+ BlockValueNum PHIValNum;
+ LDVSSAPhi(BlockValueNum PHIValNum, LDVSSABlock *ParentBlock)
+ : ParentBlock(ParentBlock), PHIValNum(PHIValNum) {}
+
+ LDVSSABlock *getParent() { return ParentBlock; }
+};
+
+/// Thin wrapper around a block predecessor iterator. The only difference from
+/// a normal block iterator is that it dereferences to an LDVSSABlock.
+class LDVSSABlockIterator {
+public:
+ MachineBasicBlock::pred_iterator PredIt;
+ LDVSSAUpdater &Updater;
+
+ LDVSSABlockIterator(MachineBasicBlock::pred_iterator PredIt,
+ LDVSSAUpdater &Updater)
+ : PredIt(PredIt), Updater(Updater) {}
+
+ bool operator!=(const LDVSSABlockIterator &OtherIt) const {
+ return OtherIt.PredIt != PredIt;
+ }
+
+ LDVSSABlockIterator &operator++() {
+ ++PredIt;
+ return *this;
+ }
+
+ LDVSSABlock *operator*();
+};
+
+/// Thin wrapper around a block for the SSA Updater interface. Necessary
+/// because we need to track the PHI value(s) that the updater may have
+/// required in this block.
+class LDVSSABlock {
+public:
+ MachineBasicBlock &BB;
+ LDVSSAUpdater &Updater;
+ using PHIListT = SmallVector<LDVSSAPhi, 1>;
+ /// List of PHIs in this block. There should only ever be one.
+ PHIListT PHIList;
+
+ LDVSSABlock(MachineBasicBlock &BB, LDVSSAUpdater &Updater)
+ : BB(BB), Updater(Updater) {}
+
+ LDVSSABlockIterator succ_begin() {
+ return LDVSSABlockIterator(BB.succ_begin(), Updater);
+ }
+
+ LDVSSABlockIterator succ_end() {
+ return LDVSSABlockIterator(BB.succ_end(), Updater);
+ }
+
+ /// SSAUpdater has requested a PHI: create that within this block record.
+ LDVSSAPhi *newPHI(BlockValueNum Value) {
+ PHIList.emplace_back(Value, this);
+ return &PHIList.back();
+ }
+
+ /// SSAUpdater wishes to know what PHIs already exist in this block.
+ PHIListT &phis() { return PHIList; }
+};
+
+/// Utility class for the SSAUpdater interface: tracks blocks, PHIs and values
+/// while SSAUpdater is exploring the CFG. It's passed as a handle / baton to
+/// SSAUpdaterTraits<LDVSSAUpdater>.
+class LDVSSAUpdater {
+public:
+ /// Map of value numbers to PHI records.
+ DenseMap<BlockValueNum, LDVSSAPhi *> PHIs;
+ /// Map of which blocks generate Undef values -- blocks that are not
+ /// dominated by any Def.
+ DenseMap<MachineBasicBlock *, BlockValueNum> UndefMap;
+ /// Map of machine blocks to our own records of them.
+ DenseMap<MachineBasicBlock *, LDVSSABlock *> BlockMap;
+ /// Machine location where any PHI must occur.
+ LocIdx Loc;
+ /// Table of live-in machine value numbers for blocks / locations.
+ const ValueTable *MLiveIns;
+
+ LDVSSAUpdater(LocIdx L, const ValueTable *MLiveIns)
+ : Loc(L), MLiveIns(MLiveIns) {}
+
+ void reset() {
+ for (auto &Block : BlockMap)
+ delete Block.second;
+
+ PHIs.clear();
+ UndefMap.clear();
+ BlockMap.clear();
+ }
+
+ ~LDVSSAUpdater() { reset(); }
+
+ /// For a given MBB, create a wrapper block for it. Stores it in the
+ /// LDVSSAUpdater block map.
+ LDVSSABlock *getSSALDVBlock(MachineBasicBlock *BB) {
+ auto it = BlockMap.find(BB);
+ if (it == BlockMap.end()) {
+ BlockMap[BB] = new LDVSSABlock(*BB, *this);
+ it = BlockMap.find(BB);
+ }
+ return it->second;
+ }
+
+ /// Find the live-in value number for the given block. Looks up the value at
+ /// the PHI location on entry.
+ BlockValueNum getValue(LDVSSABlock *LDVBB) {
+ return MLiveIns[LDVBB->BB.getNumber()][Loc.asU64()].asU64();
+ }
+};
+
+LDVSSABlock *LDVSSABlockIterator::operator*() {
+ return Updater.getSSALDVBlock(*PredIt);
+}
+
+#ifndef NDEBUG
+
+raw_ostream &operator<<(raw_ostream &out, const LDVSSAPhi &PHI) {
+ out << "SSALDVPHI " << PHI.PHIValNum;
+ return out;
+}
+
+#endif
+
+} // namespace
+
+namespace llvm {
+
+/// Template specialization to give SSAUpdater access to CFG and value
+/// information. SSAUpdater calls methods in these traits, passing in the
+/// LDVSSAUpdater object, to learn about blocks and the values they define.
+/// It also provides methods to create PHI nodes and track them.
+template <> class SSAUpdaterTraits<LDVSSAUpdater> {
+public:
+ using BlkT = LDVSSABlock;
+ using ValT = BlockValueNum;
+ using PhiT = LDVSSAPhi;
+ using BlkSucc_iterator = LDVSSABlockIterator;
+
+ // Methods to access block successors -- dereferencing to our wrapper class.
+ static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); }
+ static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); }
+
+ /// Iterator for PHI operands.
+ class PHI_iterator {
+ private:
+ LDVSSAPhi *PHI;
+ unsigned Idx;
+
+ public:
+ explicit PHI_iterator(LDVSSAPhi *P) // begin iterator
+ : PHI(P), Idx(0) {}
+ PHI_iterator(LDVSSAPhi *P, bool) // end iterator
+ : PHI(P), Idx(PHI->IncomingValues.size()) {}
+
+ PHI_iterator &operator++() {
+ Idx++;
+ return *this;
+ }
+ bool operator==(const PHI_iterator &X) const { return Idx == X.Idx; }
+ bool operator!=(const PHI_iterator &X) const { return !operator==(X); }
+
+ BlockValueNum getIncomingValue() { return PHI->IncomingValues[Idx].second; }
+
+ LDVSSABlock *getIncomingBlock() { return PHI->IncomingValues[Idx].first; }
+ };
+
+ static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+
+ static inline PHI_iterator PHI_end(PhiT *PHI) {
+ return PHI_iterator(PHI, true);
+ }
+
+ /// FindPredecessorBlocks - Put the predecessors of BB into the Preds
+ /// vector.
+ static void FindPredecessorBlocks(LDVSSABlock *BB,
+ SmallVectorImpl<LDVSSABlock *> *Preds) {
+ for (MachineBasicBlock *Pred : BB->BB.predecessors())
+ Preds->push_back(BB->Updater.getSSALDVBlock(Pred));
+ }
+
+ /// GetUndefVal - Normally creates an IMPLICIT_DEF instruction with a new
+ /// register. For LiveDebugValues, represents a block identified as not having
+ /// any DBG_PHI predecessors.
+ static BlockValueNum GetUndefVal(LDVSSABlock *BB, LDVSSAUpdater *Updater) {
+ // Create a value number for this block -- it needs to be unique and in the
+ // "undef" collection, so that we know it's not real. Use a number
+ // representing a PHI into this block.
+ BlockValueNum Num = ValueIDNum(BB->BB.getNumber(), 0, Updater->Loc).asU64();
+ Updater->UndefMap[&BB->BB] = Num;
+ return Num;
+ }
+
+ /// CreateEmptyPHI - Create a (representation of a) PHI in the given block.
+ /// SSAUpdater will populate it with information about incoming values. The
+ /// value number of this PHI is whatever the machine value number problem
+ /// solution determined it to be. This includes non-phi values if SSAUpdater
+ /// tries to create a PHI where the incoming values are identical.
+ static BlockValueNum CreateEmptyPHI(LDVSSABlock *BB, unsigned NumPreds,
+ LDVSSAUpdater *Updater) {
+ BlockValueNum PHIValNum = Updater->getValue(BB);
+ LDVSSAPhi *PHI = BB->newPHI(PHIValNum);
+ Updater->PHIs[PHIValNum] = PHI;
+ return PHIValNum;
+ }
+
+ /// AddPHIOperand - Add the specified value as an operand of the PHI for
+ /// the specified predecessor block.
+ static void AddPHIOperand(LDVSSAPhi *PHI, BlockValueNum Val, LDVSSABlock *Pred) {
+ PHI->IncomingValues.push_back(std::make_pair(Pred, Val));
+ }
+
+ /// ValueIsPHI - Check if the instruction that defines the specified value
+ /// is a PHI instruction.
+ static LDVSSAPhi *ValueIsPHI(BlockValueNum Val, LDVSSAUpdater *Updater) {
+ return Updater->PHIs.lookup(Val);
+ }
+
+ /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+ /// operands, i.e., it was just added.
+ static LDVSSAPhi *ValueIsNewPHI(BlockValueNum Val, LDVSSAUpdater *Updater) {
+ LDVSSAPhi *PHI = ValueIsPHI(Val, Updater);
+ if (PHI && PHI->IncomingValues.size() == 0)
+ return PHI;
+ return nullptr;
+ }
+
+ /// GetPHIValue - For the specified PHI instruction, return the value
+ /// that it defines.
+ static BlockValueNum GetPHIValue(LDVSSAPhi *PHI) { return PHI->PHIValNum; }
+};
+
+} // end namespace llvm
+
+std::optional<ValueIDNum> InstrRefBasedLDV::resolveDbgPHIs(
+ MachineFunction &MF, const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns, MachineInstr &Here, uint64_t InstrNum) {
+ assert(MLiveOuts && MLiveIns &&
+ "Tried to resolve DBG_PHI before location "
+ "tables allocated?");
+
+ // This function will be called twice per DBG_INSTR_REF, and might end up
+ // computing lots of SSA information: memoize it.
+ auto SeenDbgPHIIt = SeenDbgPHIs.find(std::make_pair(&Here, InstrNum));
+ if (SeenDbgPHIIt != SeenDbgPHIs.end())
+ return SeenDbgPHIIt->second;
+
+ std::optional<ValueIDNum> Result =
+ resolveDbgPHIsImpl(MF, MLiveOuts, MLiveIns, Here, InstrNum);
+ SeenDbgPHIs.insert({std::make_pair(&Here, InstrNum), Result});
+ return Result;
+}
+
+std::optional<ValueIDNum> InstrRefBasedLDV::resolveDbgPHIsImpl(
+ MachineFunction &MF, const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns, MachineInstr &Here, uint64_t InstrNum) {
+ // Pick out records of DBG_PHI instructions that have been observed. If there
+ // are none, then we cannot compute a value number.
+ auto RangePair = std::equal_range(DebugPHINumToValue.begin(),
+ DebugPHINumToValue.end(), InstrNum);
+ auto LowerIt = RangePair.first;
+ auto UpperIt = RangePair.second;
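+ // DebugPHINumToValue was sorted by instruction number back in ExtendRanges,
+ // so this range covers every DBG_PHI record observed for InstrNum.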
+
+ // No DBG_PHI means there can be no location.
+ if (LowerIt == UpperIt)
+ return std::nullopt;
+
+ // If any DBG_PHIs referred to a location we didn't understand, don't try to
+ // compute a value. There might be scenarios where we could recover a value
+ // for some range of DBG_INSTR_REFs, but at this point we can have high
+ // confidence that we've seen a bug.
+ auto DBGPHIRange = make_range(LowerIt, UpperIt);
+ for (const DebugPHIRecord &DBG_PHI : DBGPHIRange)
+ if (!DBG_PHI.ValueRead)
+ return std::nullopt;
+
+ // If there's only one DBG_PHI, then that is our value number.
+ if (std::distance(LowerIt, UpperIt) == 1)
+ return *LowerIt->ValueRead;
+
+ // Pick out the location (physreg, slot) where any PHIs must occur. It's
+ // technically possible for us to merge values in different registers in each
+ // block, but highly unlikely that LLVM will generate such code after register
+ // allocation.
+ LocIdx Loc = *LowerIt->ReadLoc;
+
+ // We have several DBG_PHIs, and a use position (the Here inst). All a
+ // DBG_PHI does is identify a value at a program position. We can treat each
+ // DBG_PHI like it's a Def of a value, and the use position is a Use of a
+ // value, just like SSA. We use the standard LLVM SSA updater class to
+ // determine which Def is used at the Use, and any PHIs that happen along
+ // the way.
+ // Adapted LLVM SSA Updater:
+ LDVSSAUpdater Updater(Loc, MLiveIns);
+ // Map of which Def or PHI is the current value in each block.
+ DenseMap<LDVSSABlock *, BlockValueNum> AvailableValues;
+ // Set of PHIs that we have created along the way.
+ SmallVector<LDVSSAPhi *, 8> CreatedPHIs;
+
+ // Each existing DBG_PHI is a Def'd value under this model. Record these Defs
+ // for the SSAUpdater.
+ for (const auto &DBG_PHI : DBGPHIRange) {
+ LDVSSABlock *Block = Updater.getSSALDVBlock(DBG_PHI.MBB);
+ const ValueIDNum &Num = *DBG_PHI.ValueRead;
+ AvailableValues.insert(std::make_pair(Block, Num.asU64()));
+ }
+
+ LDVSSABlock *HereBlock = Updater.getSSALDVBlock(Here.getParent());
+ const auto &AvailIt = AvailableValues.find(HereBlock);
+ if (AvailIt != AvailableValues.end()) {
+ // Actually, we already know what the value is -- the Use is in the same
+ // block as the Def.
+ return ValueIDNum::fromU64(AvailIt->second);
+ }
+
+ // Otherwise, we must use the SSA Updater. It will identify the value number
+ // that we are to use, and the PHIs that must happen along the way.
+ SSAUpdaterImpl<LDVSSAUpdater> Impl(&Updater, &AvailableValues, &CreatedPHIs);
+ BlockValueNum ResultInt = Impl.GetValue(Updater.getSSALDVBlock(Here.getParent()));
+ ValueIDNum Result = ValueIDNum::fromU64(ResultInt);
+
+ // We have the number for a PHI, or possibly live-through value, to be used
+ // at this Use. There are a number of things we have to check about it though:
+ // * Does any PHI use an 'Undef' (like an IMPLICIT_DEF) value? If so, this
+ // Use was not completely dominated by DBG_PHIs and we should abort.
+ // * Are the Defs or PHIs clobbered in a block? SSAUpdater isn't aware that
+ // we've left SSA form. Validate that the inputs to each PHI are the
+ // expected values.
+ // * Is a PHI we've created actually a merging of values, or are all the
+ // predecessor values the same, leading to a non-PHI machine value number?
+ // (SSAUpdater doesn't know that either). Remap validated PHIs into the
+ // ValidatedValues collection below to sort this out.
+ DenseMap<LDVSSABlock *, ValueIDNum> ValidatedValues;
+
+ // Define all the input DBG_PHI values in ValidatedValues.
+ for (const auto &DBG_PHI : DBGPHIRange) {
+ LDVSSABlock *Block = Updater.getSSALDVBlock(DBG_PHI.MBB);
+ const ValueIDNum &Num = *DBG_PHI.ValueRead;
+ ValidatedValues.insert(std::make_pair(Block, Num));
+ }
+
+ // Sort PHIs to validate into RPO-order.
+ SmallVector<LDVSSAPhi *, 8> SortedPHIs;
+ for (auto &PHI : CreatedPHIs)
+ SortedPHIs.push_back(PHI);
+
+ llvm::sort(SortedPHIs, [&](LDVSSAPhi *A, LDVSSAPhi *B) {
+ return BBToOrder[&A->getParent()->BB] < BBToOrder[&B->getParent()->BB];
+ });
+
+ for (auto &PHI : SortedPHIs) {
+ ValueIDNum ThisBlockValueNum =
+ MLiveIns[PHI->ParentBlock->BB.getNumber()][Loc.asU64()];
+
+ // Are all these things actually defined?
+ for (auto &PHIIt : PHI->IncomingValues) {
+ // Any undef input means DBG_PHIs didn't dominate the use point.
+ if (Updater.UndefMap.contains(&PHIIt.first->BB))
+ return std::nullopt;
+
+ ValueIDNum ValueToCheck;
+ const ValueTable &BlockLiveOuts = MLiveOuts[PHIIt.first->BB.getNumber()];
+
+ auto VVal = ValidatedValues.find(PHIIt.first);
+ if (VVal == ValidatedValues.end()) {
+ // We cross a loop, and this is a backedge. LLVM's tail duplication
+ // happens so late that DBG_PHI instructions should not be able to
+ // migrate into loops -- meaning we can only be live-through this
+ // loop.
+ ValueToCheck = ThisBlockValueNum;
+ } else {
+ // Does the block have as a live-out, in the location we're examining,
+ // the value that we expect? If not, it's been moved or clobbered.
+ ValueToCheck = VVal->second;
+ }
+
+ if (BlockLiveOuts[Loc.asU64()] != ValueToCheck)
+ return std::nullopt;
+ }
+
+ // Record this value as validated.
+ ValidatedValues.insert({PHI->ParentBlock, ThisBlockValueNum});
+ }
+
+ // All the PHIs are valid: we can return what the SSAUpdater said our value
+ // number was.
+ return Result;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
new file mode 100644
index 000000000000..30de18e53c4f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
@@ -0,0 +1,1441 @@
+//===- InstrRefBasedImpl.h - Tracking Debug Value MIs ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_INSTRREFBASEDLDV_H
+#define LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_INSTRREFBASEDLDV_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include <optional>
+
+#include "LiveDebugValues.h"
+
+class TransferTracker;
+
+// Forward declaration of the unit test class, so that we can peer into the LDV object.
+class InstrRefLDVTest;
+
+namespace LiveDebugValues {
+
+class MLocTracker;
+class DbgOpIDMap;
+
+using namespace llvm;
+
+/// Handle-class for a particular "location". This value-type uniquely
+/// symbolises a register or stack location, allowing manipulation of locations
+/// without concern for where that location is. Practically, this allows us to
+/// treat the state of the machine at a particular point as an array of values,
+/// rather than a map of values.
+class LocIdx {
+ unsigned Location;
+
+ // Default constructor is private, initializing to an illegal location number.
+ // Use only for "not an entry" elements in IndexedMaps.
+ LocIdx() : Location(UINT_MAX) {}
+
+public:
+#define NUM_LOC_BITS 24
+ LocIdx(unsigned L) : Location(L) {
+ assert(L < (1 << NUM_LOC_BITS) && "Machine locations must fit in 24 bits");
+ }
+
+ static LocIdx MakeIllegalLoc() { return LocIdx(); }
+ static LocIdx MakeTombstoneLoc() {
+ LocIdx L = LocIdx();
+ --L.Location;
+ return L;
+ }
+
+ bool isIllegal() const { return Location == UINT_MAX; }
+
+ uint64_t asU64() const { return Location; }
+
+ bool operator==(unsigned L) const { return Location == L; }
+
+ bool operator==(const LocIdx &L) const { return Location == L.Location; }
+
+ bool operator!=(unsigned L) const { return !(*this == L); }
+
+ bool operator!=(const LocIdx &L) const { return !(*this == L); }
+
+ bool operator<(const LocIdx &Other) const {
+ return Location < Other.Location;
+ }
+};
+
+// The location at which a spilled value resides. It consists of a register and
+// an offset.
+struct SpillLoc {
+ unsigned SpillBase;
+ StackOffset SpillOffset;
+ bool operator==(const SpillLoc &Other) const {
+ return std::make_pair(SpillBase, SpillOffset) ==
+ std::make_pair(Other.SpillBase, Other.SpillOffset);
+ }
+ bool operator<(const SpillLoc &Other) const {
+ return std::make_tuple(SpillBase, SpillOffset.getFixed(),
+ SpillOffset.getScalable()) <
+ std::make_tuple(Other.SpillBase, Other.SpillOffset.getFixed(),
+ Other.SpillOffset.getScalable());
+ }
+};
+
+/// Unique identifier for a value defined by an instruction, as a value type.
+/// Casts back and forth to a uint64_t. Probably replaceable with something less
+/// bit-constrained. Each value identifies the instruction and machine location
+/// where the value is defined, although there may be no corresponding machine
+/// operand for it (e.g. regmasks clobbering values). The instructions are
+/// one-based, and definitions that are PHIs have instruction number zero.
+///
+/// The obvious limits of a 1M block function or 1M instruction blocks are
+/// problematic; but by that point we should probably have bailed out of
+/// trying to analyse the function.
+class ValueIDNum {
+ union {
+ struct {
+ uint64_t BlockNo : 20; /// The block where the def happens.
+ uint64_t InstNo : 20; /// The Instruction where the def happens.
+ /// One based, is distance from start of block.
+ uint64_t LocNo
+ : NUM_LOC_BITS; /// The machine location where the def happens.
+ } s;
+ uint64_t Value;
+ } u;
+
+ static_assert(sizeof(u) == 8, "Badly packed ValueIDNum?");
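+ // For example, the value defined by the third instruction of block five in
+ // machine location seven is encoded as {BlockNo=5, InstNo=3, LocNo=7}, and
+ // round-trips through asU64() / fromU64() unchanged.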
+
+public:
+ // Default-initialize to EmptyValue. This is necessary to make IndexedMaps
+ // of values work.
+ ValueIDNum() { u.Value = EmptyValue.asU64(); }
+
+ ValueIDNum(uint64_t Block, uint64_t Inst, uint64_t Loc) {
+ u.s = {Block, Inst, Loc};
+ }
+
+ ValueIDNum(uint64_t Block, uint64_t Inst, LocIdx Loc) {
+ u.s = {Block, Inst, Loc.asU64()};
+ }
+
+ uint64_t getBlock() const { return u.s.BlockNo; }
+ uint64_t getInst() const { return u.s.InstNo; }
+ uint64_t getLoc() const { return u.s.LocNo; }
+ bool isPHI() const { return u.s.InstNo == 0; }
+
+ uint64_t asU64() const { return u.Value; }
+
+ static ValueIDNum fromU64(uint64_t v) {
+ ValueIDNum Val;
+ Val.u.Value = v;
+ return Val;
+ }
+
+ bool operator<(const ValueIDNum &Other) const {
+ return asU64() < Other.asU64();
+ }
+
+ bool operator==(const ValueIDNum &Other) const {
+ return u.Value == Other.u.Value;
+ }
+
+ bool operator!=(const ValueIDNum &Other) const { return !(*this == Other); }
+
+ std::string asString(const std::string &mlocname) const {
+ return Twine("Value{bb: ")
+ .concat(Twine(u.s.BlockNo)
+ .concat(Twine(", inst: ")
+ .concat((u.s.InstNo ? Twine(u.s.InstNo)
+ : Twine("live-in"))
+ .concat(Twine(", loc: ").concat(
+ Twine(mlocname)))
+ .concat(Twine("}")))))
+ .str();
+ }
+
+ static ValueIDNum EmptyValue;
+ static ValueIDNum TombstoneValue;
+};
+
+} // End namespace LiveDebugValues
+
+namespace llvm {
+using namespace LiveDebugValues;
+
+template <> struct DenseMapInfo<LocIdx> {
+ static inline LocIdx getEmptyKey() { return LocIdx::MakeIllegalLoc(); }
+ static inline LocIdx getTombstoneKey() { return LocIdx::MakeTombstoneLoc(); }
+
+ static unsigned getHashValue(const LocIdx &Loc) { return Loc.asU64(); }
+
+ static bool isEqual(const LocIdx &A, const LocIdx &B) { return A == B; }
+};
+
+template <> struct DenseMapInfo<ValueIDNum> {
+ static inline ValueIDNum getEmptyKey() { return ValueIDNum::EmptyValue; }
+ static inline ValueIDNum getTombstoneKey() {
+ return ValueIDNum::TombstoneValue;
+ }
+
+ static unsigned getHashValue(const ValueIDNum &Val) {
+ return hash_value(Val.asU64());
+ }
+
+ static bool isEqual(const ValueIDNum &A, const ValueIDNum &B) {
+ return A == B;
+ }
+};
+
+} // end namespace llvm
+
+namespace LiveDebugValues {
+using namespace llvm;
+
+/// Type for a table of values in a block.
+using ValueTable = std::unique_ptr<ValueIDNum[]>;
+
+/// Type for a table-of-table-of-values, i.e., the collection of either
+/// live-in or live-out values for each block in the function.
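+/// Indexed first by block number and then by machine location index, e.g.
+/// MInLocs[MBB.getNumber()][Loc.asU64()].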
+using FuncValueTable = std::unique_ptr<ValueTable[]>;
+
+/// Thin wrapper around an integer -- designed to give more type safety to
+/// spill location numbers.
+class SpillLocationNo {
+public:
+ explicit SpillLocationNo(unsigned SpillNo) : SpillNo(SpillNo) {}
+ unsigned SpillNo;
+ unsigned id() const { return SpillNo; }
+
+ bool operator<(const SpillLocationNo &Other) const {
+ return SpillNo < Other.SpillNo;
+ }
+
+ bool operator==(const SpillLocationNo &Other) const {
+ return SpillNo == Other.SpillNo;
+ }
+ bool operator!=(const SpillLocationNo &Other) const {
+ return !(*this == Other);
+ }
+};
+
+/// Meta qualifiers for a value. Pair of whatever expression is used to qualify
+/// the value, and Boolean of whether or not it's indirect.
+class DbgValueProperties {
+public:
+ DbgValueProperties(const DIExpression *DIExpr, bool Indirect, bool IsVariadic)
+ : DIExpr(DIExpr), Indirect(Indirect), IsVariadic(IsVariadic) {}
+
+ /// Extract properties from an existing DBG_VALUE instruction.
+ DbgValueProperties(const MachineInstr &MI) {
+ assert(MI.isDebugValue());
+ assert(MI.getDebugExpression()->getNumLocationOperands() == 0 ||
+ MI.isDebugValueList() || MI.isUndefDebugValue());
+ IsVariadic = MI.isDebugValueList();
+ DIExpr = MI.getDebugExpression();
+ Indirect = MI.isDebugOffsetImm();
+ }
+
+ bool isJoinable(const DbgValueProperties &Other) const {
+ return DIExpression::isEqualExpression(DIExpr, Indirect, Other.DIExpr,
+ Other.Indirect);
+ }
+
+ bool operator==(const DbgValueProperties &Other) const {
+ return std::tie(DIExpr, Indirect, IsVariadic) ==
+ std::tie(Other.DIExpr, Other.Indirect, Other.IsVariadic);
+ }
+
+ bool operator!=(const DbgValueProperties &Other) const {
+ return !(*this == Other);
+ }
+
+ unsigned getLocationOpCount() const {
+ return IsVariadic ? DIExpr->getNumLocationOperands() : 1;
+ }
+
+ const DIExpression *DIExpr;
+ bool Indirect;
+ bool IsVariadic;
+};
+
+/// TODO: Might pack better if we changed this to a Struct of Arrays, since
+/// MachineOperand is width 32, making this struct width 33. We could also
+/// potentially avoid storing the whole MachineOperand (sizeof=32), instead
+/// choosing to store just the contents portion (sizeof=8) and a Kind enum,
+/// since we already know it is some type of immediate value.
+/// Stores a single debug operand, which can either be a MachineOperand for
+/// directly storing immediate values, or a ValueIDNum representing some value
+/// computed at some point in the program. IsConst is used as a discriminator.
+struct DbgOp {
+ union {
+ ValueIDNum ID;
+ MachineOperand MO;
+ };
+ bool IsConst;
+
+ DbgOp() : ID(ValueIDNum::EmptyValue), IsConst(false) {}
+ DbgOp(ValueIDNum ID) : ID(ID), IsConst(false) {}
+ DbgOp(MachineOperand MO) : MO(MO), IsConst(true) {}
+
+ bool isUndef() const { return !IsConst && ID == ValueIDNum::EmptyValue; }
+
+#ifndef NDEBUG
+ void dump(const MLocTracker *MTrack) const;
+#endif
+};
+
+/// A DbgOp whose ID (if any) has resolved to an actual location, LocIdx. Used
+/// when working with concrete debug values, i.e. when joining MLocs and VLocs
+/// in the TransferTracker or emitting DBG_VALUE/DBG_VALUE_LIST instructions in
+/// the MLocTracker.
+struct ResolvedDbgOp {
+ union {
+ LocIdx Loc;
+ MachineOperand MO;
+ };
+ bool IsConst;
+
+ ResolvedDbgOp(LocIdx Loc) : Loc(Loc), IsConst(false) {}
+ ResolvedDbgOp(MachineOperand MO) : MO(MO), IsConst(true) {}
+
+ bool operator==(const ResolvedDbgOp &Other) const {
+ if (IsConst != Other.IsConst)
+ return false;
+ if (IsConst)
+ return MO.isIdenticalTo(Other.MO);
+ return Loc == Other.Loc;
+ }
+
+#ifndef NDEBUG
+ void dump(const MLocTracker *MTrack) const;
+#endif
+};
+
+/// An ID used in the DbgOpIDMap (below) to lookup a stored DbgOp. This is used
+/// in place of actual DbgOps inside of a DbgValue to reduce its size, as
+/// DbgValue is very frequently used and passed around, and the actual DbgOp is
+/// over 8x larger than this class, due to storing a MachineOperand. This ID
+/// should be equal for all equal DbgOps, and also encodes whether the mapped
+/// DbgOp is a constant, meaning that for simple equality or const-ness checks
+/// it is not necessary to lookup this ID.
+struct DbgOpID {
+ struct IsConstIndexPair {
+ uint32_t IsConst : 1;
+ uint32_t Index : 31;
+ };
+
+ union {
+ struct IsConstIndexPair ID;
+ uint32_t RawID;
+ };
+
+ DbgOpID() : RawID(UndefID.RawID) {
+ static_assert(sizeof(DbgOpID) == 4, "DbgOpID should fit within 4 bytes.");
+ }
+ DbgOpID(uint32_t RawID) : RawID(RawID) {}
+ DbgOpID(bool IsConst, uint32_t Index) : ID({IsConst, Index}) {}
+
+ static DbgOpID UndefID;
+
+ bool operator==(const DbgOpID &Other) const { return RawID == Other.RawID; }
+ bool operator!=(const DbgOpID &Other) const { return !(*this == Other); }
+
+ uint32_t asU32() const { return RawID; }
+
+ bool isUndef() const { return *this == UndefID; }
+ bool isConst() const { return ID.IsConst && !isUndef(); }
+ uint32_t getIndex() const { return ID.Index; }
+
+#ifndef NDEBUG
+ void dump(const MLocTracker *MTrack, const DbgOpIDMap *OpStore) const;
+#endif
+};
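+
+// Illustrative sketch, not part of the interface: the IsConst/Index bitfield
+// occupies the same 4 bytes as RawID, so an ID can be built from either view.
+// For example:
+//
+//   DbgOpID A(/*IsConst=*/false, /*Index=*/12); // refers to value-op #12
+//   assert(!A.isConst() && A.getIndex() == 12 && sizeof(A) == 4);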
+
+/// Class storing the complete set of values that are observed by DbgValues
+/// within the current function. Allows 2-way lookup, with `find` returning the
+/// Op for a given ID and `insert` returning the ID for a given Op (creating one
+/// if none exists).
+class DbgOpIDMap {
+
+ SmallVector<ValueIDNum, 0> ValueOps;
+ SmallVector<MachineOperand, 0> ConstOps;
+
+ DenseMap<ValueIDNum, DbgOpID> ValueOpToID;
+ DenseMap<MachineOperand, DbgOpID> ConstOpToID;
+
+public:
+ /// If \p Op does not already exist in this map, it is inserted and the
+ /// corresponding DbgOpID is returned. If Op already exists in this map, then
+ /// no change is made and the existing ID for Op is returned.
+ /// Calling this with the undef DbgOp will always return DbgOpID::UndefID.
+ DbgOpID insert(DbgOp Op) {
+ if (Op.isUndef())
+ return DbgOpID::UndefID;
+ if (Op.IsConst)
+ return insertConstOp(Op.MO);
+ return insertValueOp(Op.ID);
+ }
+ /// Returns the DbgOp associated with \p ID. Should only be used for IDs
+ /// returned from calling `insert` from this map or DbgOpID::UndefID.
+ DbgOp find(DbgOpID ID) const {
+ if (ID == DbgOpID::UndefID)
+ return DbgOp();
+ if (ID.isConst())
+ return DbgOp(ConstOps[ID.getIndex()]);
+ return DbgOp(ValueOps[ID.getIndex()]);
+ }
+
+ void clear() {
+ ValueOps.clear();
+ ConstOps.clear();
+ ValueOpToID.clear();
+ ConstOpToID.clear();
+ }
+
+private:
+ DbgOpID insertConstOp(MachineOperand &MO) {
+ auto ExistingIt = ConstOpToID.find(MO);
+ if (ExistingIt != ConstOpToID.end())
+ return ExistingIt->second;
+ DbgOpID ID(true, ConstOps.size());
+ ConstOpToID.insert(std::make_pair(MO, ID));
+ ConstOps.push_back(MO);
+ return ID;
+ }
+ DbgOpID insertValueOp(ValueIDNum VID) {
+ auto ExistingIt = ValueOpToID.find(VID);
+ if (ExistingIt != ValueOpToID.end())
+ return ExistingIt->second;
+ DbgOpID ID(false, ValueOps.size());
+ ValueOpToID.insert(std::make_pair(VID, ID));
+ ValueOps.push_back(VID);
+ return ID;
+ }
+};
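+
+// Illustrative sketch; SomeValue stands in for a real ValueIDNum. Inserting
+// the same operand twice yields the same ID, and find() recovers the operand:
+//
+//   DbgOpIDMap Map;
+//   DbgOpID ID1 = Map.insert(DbgOp(SomeValue));
+//   DbgOpID ID2 = Map.insert(DbgOp(SomeValue));
+//   assert(ID1 == ID2 && Map.find(ID1).ID == SomeValue);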
+
+// We set the maximum number of operands that we will handle to keep DbgValue
+// within a reasonable size (64 bytes), as we store and pass a lot of them
+// around.
+#define MAX_DBG_OPS 8
+
+/// Class recording the (high level) _value_ of a variable. Identifies the value
+/// of the variable as a list of ValueIDNums and constant MachineOperands, or as
+/// an empty list for undef debug values or VPHI values which we have not found
+/// valid locations for.
+/// This class also stores meta-information about how the value is qualified.
+/// Used to reason about variable values when performing the second
+/// (DebugVariable specific) dataflow analysis.
+class DbgValue {
+private:
+ /// If Kind is Def or VPHI, the set of IDs corresponding to the DbgOps that
+ /// are used. VPHIs set every ID to EmptyID when we have not found a valid
+  /// machine-value for every operand, and set them to the corresponding
+ /// machine-values when we have found all of them.
+ DbgOpID DbgOps[MAX_DBG_OPS];
+ unsigned OpCount;
+
+public:
+ /// For a NoVal or VPHI DbgValue, which block it was generated in.
+ int BlockNo;
+
+ /// Qualifiers for the ValueIDNum above.
+ DbgValueProperties Properties;
+
+ typedef enum {
+ Undef, // Represents a DBG_VALUE $noreg in the transfer function only.
+ Def, // This value is defined by some combination of constants,
+ // instructions, or PHI values.
+ VPHI, // Incoming values to BlockNo differ, those values must be joined by
+ // a PHI in this block.
+ NoVal, // Empty DbgValue indicating an unknown value. Used as initializer,
+ // before dominating blocks values are propagated in.
+ } KindT;
+  /// Discriminator for the kind of value this DbgValue represents; see KindT.
+ KindT Kind;
+
+ DbgValue(ArrayRef<DbgOpID> DbgOps, const DbgValueProperties &Prop)
+ : OpCount(DbgOps.size()), BlockNo(0), Properties(Prop), Kind(Def) {
+ static_assert(sizeof(DbgValue) <= 64,
+ "DbgValue should fit within 64 bytes.");
+ assert(DbgOps.size() == Prop.getLocationOpCount());
+ if (DbgOps.size() > MAX_DBG_OPS ||
+ any_of(DbgOps, [](DbgOpID ID) { return ID.isUndef(); })) {
+ Kind = Undef;
+ OpCount = 0;
+#define DEBUG_TYPE "LiveDebugValues"
+ if (DbgOps.size() > MAX_DBG_OPS) {
+ LLVM_DEBUG(dbgs() << "Found DbgValue with more than maximum allowed "
+ "operands.\n");
+ }
+#undef DEBUG_TYPE
+ } else {
+ for (unsigned Idx = 0; Idx < DbgOps.size(); ++Idx)
+ this->DbgOps[Idx] = DbgOps[Idx];
+ }
+ }
+
+ DbgValue(unsigned BlockNo, const DbgValueProperties &Prop, KindT Kind)
+ : OpCount(0), BlockNo(BlockNo), Properties(Prop), Kind(Kind) {
+ assert(Kind == NoVal || Kind == VPHI);
+ }
+
+ DbgValue(const DbgValueProperties &Prop, KindT Kind)
+ : OpCount(0), BlockNo(0), Properties(Prop), Kind(Kind) {
+ assert(Kind == Undef &&
+ "Empty DbgValue constructor must pass in Undef kind");
+ }
+
+#ifndef NDEBUG
+ void dump(const MLocTracker *MTrack = nullptr,
+ const DbgOpIDMap *OpStore = nullptr) const;
+#endif
+
+ bool operator==(const DbgValue &Other) const {
+ if (std::tie(Kind, Properties) != std::tie(Other.Kind, Other.Properties))
+ return false;
+ else if (Kind == Def && !equal(getDbgOpIDs(), Other.getDbgOpIDs()))
+ return false;
+ else if (Kind == NoVal && BlockNo != Other.BlockNo)
+ return false;
+ else if (Kind == VPHI && BlockNo != Other.BlockNo)
+ return false;
+ else if (Kind == VPHI && !equal(getDbgOpIDs(), Other.getDbgOpIDs()))
+ return false;
+
+ return true;
+ }
+
+ bool operator!=(const DbgValue &Other) const { return !(*this == Other); }
+
+ // Returns an array of all the machine values used to calculate this variable
+ // value, or an empty list for an Undef or unjoined VPHI.
+ ArrayRef<DbgOpID> getDbgOpIDs() const { return {DbgOps, OpCount}; }
+
+ // Returns either DbgOps[Index] if this DbgValue has Debug Operands, or
+ // the ID for ValueIDNum::EmptyValue otherwise (i.e. if this is an Undef,
+ // NoVal, or an unjoined VPHI).
+ DbgOpID getDbgOpID(unsigned Index) const {
+ if (!OpCount)
+ return DbgOpID::UndefID;
+ assert(Index < OpCount);
+ return DbgOps[Index];
+ }
+ // Replaces this DbgValue's existing DbgOpIDs (if any) with the contents of
+ // \p NewIDs. The number of DbgOpIDs passed must be equal to the number of
+ // arguments expected by this DbgValue's properties (the return value of
+ // `getLocationOpCount()`).
+ void setDbgOpIDs(ArrayRef<DbgOpID> NewIDs) {
+ // We can go from no ops to some ops, but not from some ops to no ops.
+ assert(NewIDs.size() == getLocationOpCount() &&
+ "Incorrect number of Debug Operands for this DbgValue.");
+ OpCount = NewIDs.size();
+ for (unsigned Idx = 0; Idx < NewIDs.size(); ++Idx)
+ DbgOps[Idx] = NewIDs[Idx];
+ }
+
+ // The number of debug operands expected by this DbgValue's expression.
+ // getDbgOpIDs() should return an array of this length, unless this is an
+ // Undef or an unjoined VPHI.
+ unsigned getLocationOpCount() const {
+ return Properties.getLocationOpCount();
+ }
+
+  // Returns true if the Loc Ops of this and Other are joinable: that is, if
+  // either is an unjoined PHI (and so has no defined Loc Ops), or if each
+  // corresponding pair of Loc Ops has the same constness.
+ bool hasJoinableLocOps(const DbgValue &Other) const {
+ if (isUnjoinedPHI() || Other.isUnjoinedPHI())
+ return true;
+ for (unsigned Idx = 0; Idx < getLocationOpCount(); ++Idx) {
+ if (getDbgOpID(Idx).isConst() != Other.getDbgOpID(Idx).isConst())
+ return false;
+ }
+ return true;
+ }
+
+ bool isUnjoinedPHI() const { return Kind == VPHI && OpCount == 0; }
+
+ bool hasIdenticalValidLocOps(const DbgValue &Other) const {
+ if (!OpCount)
+ return false;
+ return equal(getDbgOpIDs(), Other.getDbgOpIDs());
+ }
+};
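+
+// Illustrative sketch; Expr and SomeOpID are placeholders for a DIExpression
+// and a non-undef DbgOpID. A concrete (Kind == Def) DbgValue is built from
+// its operand IDs plus the qualifying properties:
+//
+//   DbgValueProperties Props(Expr, /*Indirect=*/false, /*IsVariadic=*/false);
+//   DbgValue DV({SomeOpID}, Props); // one operand, as Props expects
+//   assert(DV.Kind == DbgValue::Def && DV.getLocationOpCount() == 1);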
+
+class LocIdxToIndexFunctor {
+public:
+ using argument_type = LocIdx;
+ unsigned operator()(const LocIdx &L) const { return L.asU64(); }
+};
+
+/// Tracker for what values are in machine locations. Listens to the Things
+/// being Done by various instructions, and maintains a table of what machine
+/// locations have what values (as defined by a ValueIDNum).
+///
+/// There are potentially a much larger number of machine locations on the
+/// target machine than the actual working-set size of the function. On x86 for
+/// example, we're extremely unlikely to want to track values through control
+/// or debug registers. To avoid doing so, MLocTracker has several layers of
+/// indirection going on, described below, to avoid unnecessarily tracking
+/// any location.
+///
+/// Here's a sort of diagram of the indexes, read from the bottom up:
+///
+///           Size on stack   Offset on stack
+///                       \   /
+///          Stack Idx (Where in slot is this?)
+///                        /
+///                       /
+/// Slot Num (%stack.0)  /
+/// FrameIdx => SpillNum /
+///              \      /
+///            SpillID (int)   Register number (int)
+///                       \      /
+///                       LocationID => LocIdx
+///                              |
+///                       LocIdx => ValueIDNum
+///
+/// The aim here is that the LocIdx => ValueIDNum vector is just an array of
+/// values in numbered locations, so that later analyses can ignore whether the
+/// location is a register or otherwise. To map a register / spill location to
+/// a LocIdx, you have to use the (sparse) LocationID => LocIdx map. And to
+/// build a LocationID for a stack slot, you need to combine identifiers for
+/// which stack slot it is and where within that slot is being described.
+///
+/// Register mask operands cause trouble by technically defining every register;
+/// various hacks are used to avoid tracking registers that are never read and
+/// only written by regmasks.
+class MLocTracker {
+public:
+ MachineFunction &MF;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const TargetLowering &TLI;
+
+ /// IndexedMap type, mapping from LocIdx to ValueIDNum.
+ using LocToValueType = IndexedMap<ValueIDNum, LocIdxToIndexFunctor>;
+
+ /// Map of LocIdxes to the ValueIDNums that they store. This is tightly
+ /// packed, entries only exist for locations that are being tracked.
+ LocToValueType LocIdxToIDNum;
+
+ /// "Map" of machine location IDs (i.e., raw register or spill number) to the
+ /// LocIdx key / number for that location. There are always at least as many
+ /// as the number of registers on the target -- if the value in the register
+ /// is not being tracked, then the LocIdx value will be zero. New entries are
+ /// appended if a new spill slot begins being tracked.
+  /// This map and the corresponding reverse map persist for the analysis of
+  /// the whole function, and are necessary for decoding various vectors of
+  /// values.
+ std::vector<LocIdx> LocIDToLocIdx;
+
+ /// Inverse map of LocIDToLocIdx.
+ IndexedMap<unsigned, LocIdxToIndexFunctor> LocIdxToLocID;
+
+  /// When clobbering register masks, we choose not to believe the machine model
+  /// and don't clobber SP. Do the same for SP aliases, and for efficiency,
+ /// keep a set of them here.
+ SmallSet<Register, 8> SPAliases;
+
+  /// Unique-ification of spill locations. Used to number them -- their LocID
+  /// number is the index in SpillLocs minus one plus NumRegs.
+ UniqueVector<SpillLoc> SpillLocs;
+
+ // If we discover a new machine location, assign it an mphi with this
+ // block number.
+ unsigned CurBB = -1;
+
+ /// Cached local copy of the number of registers the target has.
+ unsigned NumRegs;
+
+ /// Number of slot indexes the target has -- distinct segments of a stack
+ /// slot that can take on the value of a subregister, when a super-register
+ /// is written to the stack.
+ unsigned NumSlotIdxes;
+
+ /// Collection of register mask operands that have been observed. Second part
+ /// of pair indicates the instruction that they happened in. Used to
+ /// reconstruct where defs happened if we start tracking a location later
+ /// on.
+ SmallVector<std::pair<const MachineOperand *, unsigned>, 32> Masks;
+
+ /// Pair for describing a position within a stack slot -- first the size in
+ /// bits, then the offset.
+ typedef std::pair<unsigned short, unsigned short> StackSlotPos;
+
+ /// Map from a size/offset pair describing a position in a stack slot, to a
+ /// numeric identifier for that position. Allows easier identification of
+ /// individual positions.
+ DenseMap<StackSlotPos, unsigned> StackSlotIdxes;
+
+ /// Inverse of StackSlotIdxes.
+ DenseMap<unsigned, StackSlotPos> StackIdxesToPos;
+
+ /// Iterator for locations and the values they contain. Dereferencing
+ /// produces a struct/pair containing the LocIdx key for this location,
+ /// and a reference to the value currently stored. Simplifies the process
+ /// of seeking a particular location.
+ class MLocIterator {
+ LocToValueType &ValueMap;
+ LocIdx Idx;
+
+ public:
+ class value_type {
+ public:
+ value_type(LocIdx Idx, ValueIDNum &Value) : Idx(Idx), Value(Value) {}
+ const LocIdx Idx; /// Read-only index of this location.
+ ValueIDNum &Value; /// Reference to the stored value at this location.
+ };
+
+ MLocIterator(LocToValueType &ValueMap, LocIdx Idx)
+ : ValueMap(ValueMap), Idx(Idx) {}
+
+ bool operator==(const MLocIterator &Other) const {
+ assert(&ValueMap == &Other.ValueMap);
+ return Idx == Other.Idx;
+ }
+
+ bool operator!=(const MLocIterator &Other) const {
+ return !(*this == Other);
+ }
+
+ void operator++() { Idx = LocIdx(Idx.asU64() + 1); }
+
+ value_type operator*() { return value_type(Idx, ValueMap[LocIdx(Idx)]); }
+ };
+
+ MLocTracker(MachineFunction &MF, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI, const TargetLowering &TLI);
+
+ /// Produce location ID number for a Register. Provides some small amount of
+ /// type safety.
+ /// \param Reg The register we're looking up.
+ unsigned getLocID(Register Reg) { return Reg.id(); }
+
+ /// Produce location ID number for a spill position.
+ /// \param Spill The number of the spill we're fetching the location for.
+ /// \param SpillSubReg Subregister within the spill we're addressing.
+ unsigned getLocID(SpillLocationNo Spill, unsigned SpillSubReg) {
+ unsigned short Size = TRI.getSubRegIdxSize(SpillSubReg);
+ unsigned short Offs = TRI.getSubRegIdxOffset(SpillSubReg);
+ return getLocID(Spill, {Size, Offs});
+ }
+
+ /// Produce location ID number for a spill position.
+ /// \param Spill The number of the spill we're fetching the location for.
+  /// \param Idx Size/offset within the spill slot to be addressed.
+ unsigned getLocID(SpillLocationNo Spill, StackSlotPos Idx) {
+ unsigned SlotNo = Spill.id() - 1;
+ SlotNo *= NumSlotIdxes;
+ assert(StackSlotIdxes.contains(Idx));
+ SlotNo += StackSlotIdxes[Idx];
+ SlotNo += NumRegs;
+ return SlotNo;
+ }
+
+ /// Given a spill number, and a slot within the spill, calculate the ID number
+ /// for that location.
+ unsigned getSpillIDWithIdx(SpillLocationNo Spill, unsigned Idx) {
+ unsigned SlotNo = Spill.id() - 1;
+ SlotNo *= NumSlotIdxes;
+ SlotNo += Idx;
+ SlotNo += NumRegs;
+ return SlotNo;
+ }
+
+ /// Return the spill number that a location ID corresponds to.
+ SpillLocationNo locIDToSpill(unsigned ID) const {
+ assert(ID >= NumRegs);
+ ID -= NumRegs;
+ // Truncate away the index part, leaving only the spill number.
+ ID /= NumSlotIdxes;
+ return SpillLocationNo(ID + 1); // The UniqueVector is one-based.
+ }
+
+ /// Returns the spill-slot size/offs that a location ID corresponds to.
+ StackSlotPos locIDToSpillIdx(unsigned ID) const {
+ assert(ID >= NumRegs);
+ ID -= NumRegs;
+ unsigned Idx = ID % NumSlotIdxes;
+ return StackIdxesToPos.find(Idx)->second;
+ }
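+
+  // Worked example with illustrative numbers: with NumRegs == 100 and
+  // NumSlotIdxes == 4, spill number 2 at slot index 3 yields
+  //   getSpillIDWithIdx(SpillLocationNo(2), 3) == (2 - 1) * 4 + 3 + 100 == 107,
+  // and the inverses recover the pieces:
+  //   locIDToSpill(107) == SpillLocationNo(2), since (107 - 100) / 4 + 1 == 2,
+  //   locIDToSpillIdx(107) looks up slot index (107 - 100) % 4 == 3.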
+
+ unsigned getNumLocs() const { return LocIdxToIDNum.size(); }
+
+ /// Reset all locations to contain a PHI value at the designated block. Used
+  /// sometimes for actual PHI values, at other times to indicate the block entry
+ /// value (before any more information is known).
+ void setMPhis(unsigned NewCurBB) {
+ CurBB = NewCurBB;
+ for (auto Location : locations())
+ Location.Value = {CurBB, 0, Location.Idx};
+ }
+
+ /// Load values for each location from array of ValueIDNums. Take current
+ /// bbnum just in case we read a value from a hitherto untouched register.
+ void loadFromArray(ValueTable &Locs, unsigned NewCurBB) {
+ CurBB = NewCurBB;
+ // Iterate over all tracked locations, and load each locations live-in
+ // value into our local index.
+ for (auto Location : locations())
+ Location.Value = Locs[Location.Idx.asU64()];
+ }
+
+  /// Wipe any unnecessary location records after traversing a block.
+ void reset() {
+ // We could reset all the location values too; however either loadFromArray
+ // or setMPhis should be called before this object is re-used. Just
+ // clear Masks, they're definitely not needed.
+ Masks.clear();
+ }
+
+ /// Clear all data. Destroys the LocID <=> LocIdx map, which makes most of
+ /// the information in this pass uninterpretable.
+ void clear() {
+ reset();
+ LocIDToLocIdx.clear();
+ LocIdxToLocID.clear();
+ LocIdxToIDNum.clear();
+ // SpillLocs.reset(); XXX UniqueVector::reset assumes a SpillLoc casts from
+ // 0
+ SpillLocs = decltype(SpillLocs)();
+ StackSlotIdxes.clear();
+ StackIdxesToPos.clear();
+
+ LocIDToLocIdx.resize(NumRegs, LocIdx::MakeIllegalLoc());
+ }
+
+  /// Set a location to a certain value.
+ void setMLoc(LocIdx L, ValueIDNum Num) {
+ assert(L.asU64() < LocIdxToIDNum.size());
+ LocIdxToIDNum[L] = Num;
+ }
+
+ /// Read the value of a particular location
+ ValueIDNum readMLoc(LocIdx L) {
+ assert(L.asU64() < LocIdxToIDNum.size());
+ return LocIdxToIDNum[L];
+ }
+
+ /// Create a LocIdx for an untracked register ID. Initialize it to either an
+ /// mphi value representing a live-in, or a recent register mask clobber.
+ LocIdx trackRegister(unsigned ID);
+
+ LocIdx lookupOrTrackRegister(unsigned ID) {
+ LocIdx &Index = LocIDToLocIdx[ID];
+ if (Index.isIllegal())
+ Index = trackRegister(ID);
+ return Index;
+ }
+
+ /// Is register R currently tracked by MLocTracker?
+ bool isRegisterTracked(Register R) {
+ LocIdx &Index = LocIDToLocIdx[R];
+ return !Index.isIllegal();
+ }
+
+ /// Record a definition of the specified register at the given block / inst.
+ /// This doesn't take a ValueIDNum, because the definition and its location
+ /// are synonymous.
+ void defReg(Register R, unsigned BB, unsigned Inst) {
+ unsigned ID = getLocID(R);
+ LocIdx Idx = lookupOrTrackRegister(ID);
+ ValueIDNum ValueID = {BB, Inst, Idx};
+ LocIdxToIDNum[Idx] = ValueID;
+ }
+
+ /// Set a register to a value number. To be used if the value number is
+ /// known in advance.
+ void setReg(Register R, ValueIDNum ValueID) {
+ unsigned ID = getLocID(R);
+ LocIdx Idx = lookupOrTrackRegister(ID);
+ LocIdxToIDNum[Idx] = ValueID;
+ }
+
+ ValueIDNum readReg(Register R) {
+ unsigned ID = getLocID(R);
+ LocIdx Idx = lookupOrTrackRegister(ID);
+ return LocIdxToIDNum[Idx];
+ }
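+
+  // Illustrative sketch; SomeReg is a placeholder Register. After a def has
+  // been recorded, reading the register returns the value defined there:
+  //
+  //   defReg(SomeReg, /*BB=*/2, /*Inst=*/5);
+  //   assert(readReg(SomeReg) == ValueIDNum(2, 5, getRegMLoc(SomeReg)));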
+
+ /// Reset a register value to zero / empty. Needed to replicate the
+ /// VarLoc implementation where a copy to/from a register effectively
+ /// clears the contents of the source register. (Values can only have one
+ /// machine location in VarLocBasedImpl).
+ void wipeRegister(Register R) {
+ unsigned ID = getLocID(R);
+ LocIdx Idx = LocIDToLocIdx[ID];
+ LocIdxToIDNum[Idx] = ValueIDNum::EmptyValue;
+ }
+
+ /// Determine the LocIdx of an existing register.
+ LocIdx getRegMLoc(Register R) {
+ unsigned ID = getLocID(R);
+ assert(ID < LocIDToLocIdx.size());
+    assert(LocIDToLocIdx[ID] != UINT_MAX); // Sentinel for IndexedMap.
+ return LocIDToLocIdx[ID];
+ }
+
+ /// Record a RegMask operand being executed. Defs any register we currently
+ /// track, stores a pointer to the mask in case we have to account for it
+ /// later.
+ void writeRegMask(const MachineOperand *MO, unsigned CurBB, unsigned InstID);
+
+ /// Find LocIdx for SpillLoc \p L, creating a new one if it's not tracked.
+ /// Returns std::nullopt when in scenarios where a spill slot could be
+ /// tracked, but we would likely run into resource limitations.
+ std::optional<SpillLocationNo> getOrTrackSpillLoc(SpillLoc L);
+
+ // Get LocIdx of a spill ID.
+ LocIdx getSpillMLoc(unsigned SpillID) {
+    assert(LocIDToLocIdx[SpillID] != UINT_MAX); // Sentinel for IndexedMap.
+ return LocIDToLocIdx[SpillID];
+ }
+
+ /// Return true if Idx is a spill machine location.
+ bool isSpill(LocIdx Idx) const { return LocIdxToLocID[Idx] >= NumRegs; }
+
+ /// How large is this location (aka, how wide is a value defined there?).
+ unsigned getLocSizeInBits(LocIdx L) const {
+ unsigned ID = LocIdxToLocID[L];
+ if (!isSpill(L)) {
+ return TRI.getRegSizeInBits(Register(ID), MF.getRegInfo());
+ } else {
+ // The slot location on the stack is uninteresting, we care about the
+ // position of the value within the slot (which comes with a size).
+ StackSlotPos Pos = locIDToSpillIdx(ID);
+ return Pos.first;
+ }
+ }
+
+ MLocIterator begin() { return MLocIterator(LocIdxToIDNum, 0); }
+
+ MLocIterator end() {
+ return MLocIterator(LocIdxToIDNum, LocIdxToIDNum.size());
+ }
+
+ /// Return a range over all locations currently tracked.
+ iterator_range<MLocIterator> locations() {
+ return llvm::make_range(begin(), end());
+ }
+
+ std::string LocIdxToName(LocIdx Idx) const;
+
+ std::string IDAsString(const ValueIDNum &Num) const;
+
+#ifndef NDEBUG
+ LLVM_DUMP_METHOD void dump();
+
+ LLVM_DUMP_METHOD void dump_mloc_map();
+#endif
+
+ /// Create a DBG_VALUE based on debug operands \p DbgOps. Qualify it with the
+  /// information in \p Properties, for variable \p Var. Don't insert it
+  /// anywhere, just return the builder for it.
+ MachineInstrBuilder emitLoc(const SmallVectorImpl<ResolvedDbgOp> &DbgOps,
+ const DebugVariable &Var,
+ const DbgValueProperties &Properties);
+};
+
+/// Types for recording sets of variable fragments that overlap. For a given
+/// local variable, we record all other fragments of that variable that could
+/// overlap it, to reduce search time.
+using FragmentOfVar =
+ std::pair<const DILocalVariable *, DIExpression::FragmentInfo>;
+using OverlapMap =
+ DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>;
+
+/// Collection of DBG_VALUEs observed when traversing a block. Records each
+/// variable and the value the DBG_VALUE refers to. Requires the machine value
+/// location dataflow algorithm to have run already, so that values can be
+/// identified.
+class VLocTracker {
+public:
+ /// Map DebugVariable to the latest Value it's defined to have.
+ /// Needs to be a MapVector because we determine order-in-the-input-MIR from
+ /// the order in this container.
+ /// We only retain the last DbgValue in each block for each variable, to
+  /// determine the block's live-out variable value. The Vars container forms the
+ /// transfer function for this block, as part of the dataflow analysis. The
+ /// movement of values between locations inside of a block is handled at a
+ /// much later stage, in the TransferTracker class.
+ MapVector<DebugVariable, DbgValue> Vars;
+ SmallDenseMap<DebugVariable, const DILocation *, 8> Scopes;
+ MachineBasicBlock *MBB = nullptr;
+ const OverlapMap &OverlappingFragments;
+ DbgValueProperties EmptyProperties;
+
+public:
+ VLocTracker(const OverlapMap &O, const DIExpression *EmptyExpr)
+ : OverlappingFragments(O), EmptyProperties(EmptyExpr, false, false) {}
+
+ void defVar(const MachineInstr &MI, const DbgValueProperties &Properties,
+ const SmallVectorImpl<DbgOpID> &DebugOps) {
+ assert(MI.isDebugValueLike());
+ DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ DbgValue Rec = (DebugOps.size() > 0)
+ ? DbgValue(DebugOps, Properties)
+ : DbgValue(Properties, DbgValue::Undef);
+
+ // Attempt insertion; overwrite if it's already mapped.
+ auto Result = Vars.insert(std::make_pair(Var, Rec));
+ if (!Result.second)
+ Result.first->second = Rec;
+ Scopes[Var] = MI.getDebugLoc().get();
+
+ considerOverlaps(Var, MI.getDebugLoc().get());
+ }
+
+ void considerOverlaps(const DebugVariable &Var, const DILocation *Loc) {
+ auto Overlaps = OverlappingFragments.find(
+ {Var.getVariable(), Var.getFragmentOrDefault()});
+ if (Overlaps == OverlappingFragments.end())
+ return;
+
+ // Otherwise: terminate any overlapped variable locations.
+ for (auto FragmentInfo : Overlaps->second) {
+      // The "empty" fragment is stored as DebugVariable::DefaultFragment, so
+      // that it overlaps with everything; however, its canonical
+      // representation in a DebugVariable is "None".
+ std::optional<DIExpression::FragmentInfo> OptFragmentInfo = FragmentInfo;
+ if (DebugVariable::isDefaultFragment(FragmentInfo))
+ OptFragmentInfo = std::nullopt;
+
+ DebugVariable Overlapped(Var.getVariable(), OptFragmentInfo,
+ Var.getInlinedAt());
+ DbgValue Rec = DbgValue(EmptyProperties, DbgValue::Undef);
+
+ // Attempt insertion; overwrite if it's already mapped.
+ auto Result = Vars.insert(std::make_pair(Overlapped, Rec));
+ if (!Result.second)
+ Result.first->second = Rec;
+ Scopes[Overlapped] = Loc;
+ }
+ }
+
+ void clear() {
+ Vars.clear();
+ Scopes.clear();
+ }
+};
+
+/// The instruction-reference-based LiveDebugValues implementation. Tracks the
+/// machine values defined by instructions through machine locations, then
+/// computes which value each variable refers to in each block and where that
+/// value can be found, finally emitting DBG_VALUEs to describe it.
+class InstrRefBasedLDV : public LDVImpl {
+public:
+ friend class ::InstrRefLDVTest;
+
+ using FragmentInfo = DIExpression::FragmentInfo;
+ using OptFragmentInfo = std::optional<DIExpression::FragmentInfo>;
+
+ // Helper while building OverlapMap, a map of all fragments seen for a given
+ // DILocalVariable.
+ using VarToFragments =
+ DenseMap<const DILocalVariable *, SmallSet<FragmentInfo, 4>>;
+
+ /// Machine location/value transfer function, a mapping of which locations
+ /// are assigned which new values.
+ using MLocTransferMap = SmallDenseMap<LocIdx, ValueIDNum>;
+
+ /// Live in/out structure for the variable values: a per-block map of
+ /// variables to their values.
+ using LiveIdxT = DenseMap<const MachineBasicBlock *, DbgValue *>;
+
+ using VarAndLoc = std::pair<DebugVariable, DbgValue>;
+
+ /// Type for a live-in value: the predecessor block, and its value.
+ using InValueT = std::pair<MachineBasicBlock *, DbgValue *>;
+
+ /// Vector (per block) of a collection (inner smallvector) of live-ins.
+ /// Used as the result type for the variable value dataflow problem.
+ using LiveInsT = SmallVector<SmallVector<VarAndLoc, 8>, 8>;
+
+ /// Mapping from lexical scopes to a DILocation in that scope.
+ using ScopeToDILocT = DenseMap<const LexicalScope *, const DILocation *>;
+
+ /// Mapping from lexical scopes to variables in that scope.
+ using ScopeToVarsT = DenseMap<const LexicalScope *, SmallSet<DebugVariable, 4>>;
+
+ /// Mapping from lexical scopes to blocks where variables in that scope are
+  /// assigned. Such blocks aren't necessarily "in" the lexical scope; they're
+  /// just blocks where an assignment happens.
+ using ScopeToAssignBlocksT = DenseMap<const LexicalScope *, SmallPtrSet<MachineBasicBlock *, 4>>;
+
+private:
+ MachineDominatorTree *DomTree;
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ const TargetFrameLowering *TFI;
+ const MachineFrameInfo *MFI;
+ BitVector CalleeSavedRegs;
+ LexicalScopes LS;
+ TargetPassConfig *TPC;
+
+  // An empty DIExpression. Used for default / placeholder DbgValueProperties
+  // objects, as we can't have null expressions.
+ const DIExpression *EmptyExpr;
+
+ /// Object to track machine locations as we step through a block. Could
+ /// probably be a field rather than a pointer, as it's always used.
+ MLocTracker *MTracker = nullptr;
+
+ /// Number of the current block LiveDebugValues is stepping through.
+ unsigned CurBB = -1;
+
+ /// Number of the current instruction LiveDebugValues is evaluating.
+ unsigned CurInst;
+
+ /// Variable tracker -- listens to DBG_VALUEs occurring as InstrRefBasedImpl
+ /// steps through a block. Reads the values at each location from the
+ /// MLocTracker object.
+ VLocTracker *VTracker = nullptr;
+
+ /// Tracker for transfers, listens to DBG_VALUEs and transfers of values
+ /// between locations during stepping, creates new DBG_VALUEs when values move
+ /// location.
+ TransferTracker *TTracker = nullptr;
+
+ /// Blocks which are artificial, i.e. blocks which exclusively contain
+ /// instructions without DebugLocs, or with line 0 locations.
+ SmallPtrSet<MachineBasicBlock *, 16> ArtificialBlocks;
+
+ // Mapping of blocks to and from their RPOT order.
+ DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
+ DenseMap<const MachineBasicBlock *, unsigned int> BBToOrder;
+ DenseMap<unsigned, unsigned> BBNumToRPO;
+
+ /// Pair of MachineInstr, and its 1-based offset into the containing block.
+ using InstAndNum = std::pair<const MachineInstr *, unsigned>;
+ /// Map from debug instruction number to the MachineInstr labelled with that
+ /// number, and its location within the function. Used to transform
+ /// instruction numbers in DBG_INSTR_REFs into machine value numbers.
+ std::map<uint64_t, InstAndNum> DebugInstrNumToInstr;
+
+ /// Record of where we observed a DBG_PHI instruction.
+ class DebugPHIRecord {
+ public:
+ /// Instruction number of this DBG_PHI.
+ uint64_t InstrNum;
+ /// Block where DBG_PHI occurred.
+ MachineBasicBlock *MBB;
+ /// The value number read by the DBG_PHI -- or std::nullopt if it didn't
+ /// refer to a value.
+ std::optional<ValueIDNum> ValueRead;
+ /// Register/Stack location the DBG_PHI reads -- or std::nullopt if it
+ /// referred to something unexpected.
+ std::optional<LocIdx> ReadLoc;
+
+ operator unsigned() const { return InstrNum; }
+ };
+
+ /// Map from instruction numbers defined by DBG_PHIs to a record of what that
+ /// DBG_PHI read and where. Populated and edited during the machine value
+  /// location problem -- we use LLVM's SSA Updater to fix changes made by
+ /// optimizations that destroy PHI instructions.
+ SmallVector<DebugPHIRecord, 32> DebugPHINumToValue;
+
+ // Map of overlapping variable fragments.
+ OverlapMap OverlapFragments;
+ VarToFragments SeenFragments;
+
+ /// Mapping of DBG_INSTR_REF instructions to their values, for those
+ /// DBG_INSTR_REFs that call resolveDbgPHIs. These variable references solve
+  /// a mini SSA problem caused by DBG_PHIs being cloned; this collection
+  /// caches the result.
+ DenseMap<std::pair<MachineInstr *, unsigned>, std::optional<ValueIDNum>>
+ SeenDbgPHIs;
+
+ DbgOpIDMap DbgOpStore;
+
+ /// True if we need to examine call instructions for stack clobbers. We
+ /// normally assume that they don't clobber SP, but stack probes on Windows
+ /// do.
+ bool AdjustsStackInCalls = false;
+
+ /// If AdjustsStackInCalls is true, this holds the name of the target's stack
+ /// probe function, which is the function we expect will alter the stack
+ /// pointer.
+ StringRef StackProbeSymbolName;
+
+ /// Tests whether this instruction is a spill to a stack slot.
+ std::optional<SpillLocationNo> isSpillInstruction(const MachineInstr &MI,
+ MachineFunction *MF);
+
+ /// Decide if @MI is a spill instruction and return true if it is. We use 2
+ /// criteria to make this decision:
+ /// - Is this instruction a store to a spill slot?
+ /// - Is there a register operand that is both used and killed?
+ /// TODO: Store optimization can fold spills into other stores (including
+ /// other spills). We do not handle this yet (more than one memory operand).
+ bool isLocationSpill(const MachineInstr &MI, MachineFunction *MF,
+ unsigned &Reg);
+
+  /// If a given instruction is identified as a restore from a stack slot,
+  /// return the spill location it reads from and set \p Reg to the restored
+  /// register.
+ std::optional<SpillLocationNo> isRestoreInstruction(const MachineInstr &MI,
+ MachineFunction *MF,
+ unsigned &Reg);
+
+ /// Given a spill instruction, extract the spill slot information, ensure it's
+ /// tracked, and return the spill number.
+ std::optional<SpillLocationNo>
+ extractSpillBaseRegAndOffset(const MachineInstr &MI);
+
+ /// For an instruction reference given by \p InstNo and \p OpNo in instruction
+ /// \p MI returns the Value pointed to by that instruction reference if any
+ /// exists, otherwise returns std::nullopt.
+ std::optional<ValueIDNum> getValueForInstrRef(unsigned InstNo, unsigned OpNo,
+ MachineInstr &MI,
+ const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns);
+
+ /// Observe a single instruction while stepping through a block.
+ void process(MachineInstr &MI, const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns);
+
+ /// Examines whether \p MI is a DBG_VALUE and notifies trackers.
+ /// \returns true if MI was recognized and processed.
+ bool transferDebugValue(const MachineInstr &MI);
+
+ /// Examines whether \p MI is a DBG_INSTR_REF and notifies trackers.
+ /// \returns true if MI was recognized and processed.
+ bool transferDebugInstrRef(MachineInstr &MI, const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns);
+
+ /// Stores value-information about where this PHI occurred, and what
+ /// instruction number is associated with it.
+ /// \returns true if MI was recognized and processed.
+ bool transferDebugPHI(MachineInstr &MI);
+
+ /// Examines whether \p MI is copy instruction, and notifies trackers.
+ /// \returns true if MI was recognized and processed.
+ bool transferRegisterCopy(MachineInstr &MI);
+
+ /// Examines whether \p MI is stack spill or restore instruction, and
+ /// notifies trackers. \returns true if MI was recognized and processed.
+ bool transferSpillOrRestoreInst(MachineInstr &MI);
+
+ /// Examines \p MI for any registers that it defines, and notifies trackers.
+ void transferRegisterDef(MachineInstr &MI);
+
+ /// Copy one location to the other, accounting for movement of subregisters
+ /// too.
+ void performCopy(Register Src, Register Dst);
+
+ void accumulateFragmentMap(MachineInstr &MI);
+
+ /// Determine the machine value number referred to by (potentially several)
+ /// DBG_PHI instructions. Block duplication and tail folding can duplicate
+ /// DBG_PHIs, shifting the position where values in registers merge, and
+ /// forming another mini-ssa problem to solve.
+ /// \p Here the position of a DBG_INSTR_REF seeking a machine value number
+ /// \p InstrNum Debug instruction number defined by DBG_PHI instructions.
+ /// \returns The machine value number at position Here, or std::nullopt.
+ std::optional<ValueIDNum> resolveDbgPHIs(MachineFunction &MF,
+ const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns,
+ MachineInstr &Here,
+ uint64_t InstrNum);
+
+ std::optional<ValueIDNum> resolveDbgPHIsImpl(MachineFunction &MF,
+ const ValueTable *MLiveOuts,
+ const ValueTable *MLiveIns,
+ MachineInstr &Here,
+ uint64_t InstrNum);
+
+ /// Step through the function, recording register definitions and movements
+ /// in an MLocTracker. Convert the observations into a per-block transfer
+ /// function in \p MLocTransfer, suitable for using with the machine value
+ /// location dataflow problem.
+ void
+ produceMLocTransferFunction(MachineFunction &MF,
+ SmallVectorImpl<MLocTransferMap> &MLocTransfer,
+ unsigned MaxNumBlocks);
+
+ /// Solve the machine value location dataflow problem. Takes as input the
+ /// transfer functions in \p MLocTransfer. Writes the output live-in and
+ /// live-out arrays to the (initialized to zero) multidimensional arrays in
+ /// \p MInLocs and \p MOutLocs. The outer dimension is indexed by block
+ /// number, the inner by LocIdx.
+ void buildMLocValueMap(MachineFunction &MF, FuncValueTable &MInLocs,
+ FuncValueTable &MOutLocs,
+ SmallVectorImpl<MLocTransferMap> &MLocTransfer);
+
+ /// Examine the stack indexes (i.e. offsets within the stack) to find the
+ /// basic units of interference -- like reg units, but for the stack.
+ void findStackIndexInterference(SmallVectorImpl<unsigned> &Slots);
+
+ /// Install PHI values into the live-in array for each block, according to
+ /// the IDF of each register.
+ void placeMLocPHIs(MachineFunction &MF,
+ SmallPtrSetImpl<MachineBasicBlock *> &AllBlocks,
+ FuncValueTable &MInLocs,
+ SmallVectorImpl<MLocTransferMap> &MLocTransfer);
+
+ /// Propagate variable values to blocks in the common case where there's
+ /// only one value assigned to the variable. This function has better
+ /// performance as it doesn't have to find the dominance frontier between
+ /// different assignments.
+ void placePHIsForSingleVarDefinition(
+ const SmallPtrSetImpl<MachineBasicBlock *> &InScopeBlocks,
+ MachineBasicBlock *MBB, SmallVectorImpl<VLocTracker> &AllTheVLocs,
+ const DebugVariable &Var, LiveInsT &Output);
+
+ /// Calculate the iterated-dominance-frontier for a set of defs, using the
+ /// existing LLVM facilities for this. Works for a single "value" or
+ /// machine/variable location.
+ /// \p AllBlocks Set of blocks where we might consume the value.
+ /// \p DefBlocks Set of blocks where the value/location is defined.
+ /// \p PHIBlocks Output set of blocks where PHIs must be placed.
+ void BlockPHIPlacement(const SmallPtrSetImpl<MachineBasicBlock *> &AllBlocks,
+ const SmallPtrSetImpl<MachineBasicBlock *> &DefBlocks,
+ SmallVectorImpl<MachineBasicBlock *> &PHIBlocks);
+
+ /// Perform a control flow join (lattice value meet) of the values in machine
+ /// locations at \p MBB. Follows the algorithm described in the file-comment,
+ /// reading live-outs of predecessors from \p OutLocs, the current live ins
+ /// from \p InLocs, and assigning the newly computed live ins back into
+ /// \p InLocs. \returns two bools -- the first indicates whether a change
+ /// was made, the second whether a lattice downgrade occurred. If the latter
+ /// is true, revisiting this block is necessary.
+ bool mlocJoin(MachineBasicBlock &MBB,
+ SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
+ FuncValueTable &OutLocs, ValueTable &InLocs);
+
+ /// Produce a set of blocks that are in the current lexical scope. This means
+ /// those blocks that contain instructions "in" the scope, blocks where
+ /// assignments to variables in scope occur, and artificial blocks that are
+ /// successors to any of the earlier blocks. See https://llvm.org/PR48091 for
+  /// more commentary on what "in scope" means.
+ /// \p DILoc A location in the scope that we're fetching blocks for.
+ /// \p Output Set to put in-scope-blocks into.
+ /// \p AssignBlocks Blocks known to contain assignments of variables in scope.
+ void
+ getBlocksForScope(const DILocation *DILoc,
+ SmallPtrSetImpl<const MachineBasicBlock *> &Output,
+ const SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks);
+
+ /// Solve the variable value dataflow problem, for a single lexical scope.
+ /// Uses the algorithm from the file comment to resolve control flow joins
+ /// using PHI placement and value propagation. Reads the locations of machine
+ /// values from the \p MInLocs and \p MOutLocs arrays (see buildMLocValueMap)
+ /// and reads the variable values transfer function from \p AllTheVlocs.
+ /// Live-in and Live-out variable values are stored locally, with the live-ins
+ /// permanently stored to \p Output once a fixedpoint is reached.
+ /// \p VarsWeCareAbout contains a collection of the variables in \p Scope
+ /// that we should be tracking.
+ /// \p AssignBlocks contains the set of blocks that aren't in \p DILoc's
+ /// scope, but which do contain DBG_VALUEs, which VarLocBasedImpl tracks
+ /// locations through.
+ void buildVLocValueMap(const DILocation *DILoc,
+ const SmallSet<DebugVariable, 4> &VarsWeCareAbout,
+ SmallPtrSetImpl<MachineBasicBlock *> &AssignBlocks,
+ LiveInsT &Output, FuncValueTable &MOutLocs,
+ FuncValueTable &MInLocs,
+ SmallVectorImpl<VLocTracker> &AllTheVLocs);
+
+  /// Attempt to eliminate unnecessary PHIs on entry to a block. Examines the
+  /// live-in values coming from predecessors' live-outs, and replaces any PHIs
+  /// already present in this block's live-ins with a live-through value if the
+ /// PHI isn't needed.
+ /// \p LiveIn Old live-in value, overwritten with new one if live-in changes.
+ /// \returns true if any live-ins change value, either from value propagation
+ /// or PHI elimination.
+ bool vlocJoin(MachineBasicBlock &MBB, LiveIdxT &VLOCOutLocs,
+ SmallPtrSet<const MachineBasicBlock *, 8> &BlocksToExplore,
+ DbgValue &LiveIn);
+
+ /// For the given block and live-outs feeding into it, try to find
+ /// machine locations for each debug operand where all the values feeding
+ /// into that operand join together.
+ /// \returns true if a joined location was found for every value that needed
+ /// to be joined.
+ bool
+ pickVPHILoc(SmallVectorImpl<DbgOpID> &OutValues, const MachineBasicBlock &MBB,
+ const LiveIdxT &LiveOuts, FuncValueTable &MOutLocs,
+ const SmallVectorImpl<const MachineBasicBlock *> &BlockOrders);
+
+ std::optional<ValueIDNum> pickOperandPHILoc(
+ unsigned DbgOpIdx, const MachineBasicBlock &MBB, const LiveIdxT &LiveOuts,
+ FuncValueTable &MOutLocs,
+ const SmallVectorImpl<const MachineBasicBlock *> &BlockOrders);
+
+ /// Take collections of DBG_VALUE instructions stored in TTracker, and
+ /// install them into their output blocks. Preserves a stable order of
+ /// DBG_VALUEs produced (which would otherwise cause nondeterminism) through
+ /// the AllVarsNumbering order.
+ bool emitTransfers(DenseMap<DebugVariable, unsigned> &AllVarsNumbering);
+
+  /// Boilerplate computation of some initial sets, artificial blocks and
+ /// RPOT block ordering.
+ void initialSetup(MachineFunction &MF);
+
+  /// Produce a map of the last lexical scope that uses a block, using the
+  /// scope's DFSOut number. Mapping is block-number to DFSOut.
+  /// \p EjectionMap Pre-allocated vector in which to install the built map.
+ /// \p ScopeToDILocation Mapping of LexicalScopes to their DILocations.
+ /// \p AssignBlocks Map of blocks where assignments happen for a scope.
+ void makeDepthFirstEjectionMap(SmallVectorImpl<unsigned> &EjectionMap,
+ const ScopeToDILocT &ScopeToDILocation,
+ ScopeToAssignBlocksT &AssignBlocks);
+
+ /// When determining per-block variable values and emitting to DBG_VALUEs,
+ /// this function explores by lexical scope depth. Doing so means that per
+ /// block information can be fully computed before exploration finishes,
+ /// allowing us to emit it and free data structures earlier than otherwise.
+ /// It's also good for locality.
+ bool depthFirstVLocAndEmit(
+ unsigned MaxNumBlocks, const ScopeToDILocT &ScopeToDILocation,
+ const ScopeToVarsT &ScopeToVars, ScopeToAssignBlocksT &ScopeToBlocks,
+ LiveInsT &Output, FuncValueTable &MOutLocs, FuncValueTable &MInLocs,
+ SmallVectorImpl<VLocTracker> &AllTheVLocs, MachineFunction &MF,
+ DenseMap<DebugVariable, unsigned> &AllVarsNumbering,
+ const TargetPassConfig &TPC);
+
+ bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC, unsigned InputBBLimit,
+ unsigned InputDbgValLimit) override;
+
+public:
+ /// Default construct and initialize the pass.
+ InstrRefBasedLDV();
+
+ LLVM_DUMP_METHOD
+ void dump_mloc_transfer(const MLocTransferMap &mloc_transfer) const;
+
+ bool isCalleeSaved(LocIdx L) const;
+ bool isCalleeSavedReg(Register R) const;
+
+ bool hasFoldedStackStore(const MachineInstr &MI) {
+ // Instruction must have a memory operand that's a stack slot, and isn't
+ // aliased, meaning it's a spill from regalloc instead of a variable.
+ // If it's aliased, we can't guarantee its value.
+ if (!MI.hasOneMemOperand())
+ return false;
+ auto *MemOperand = *MI.memoperands_begin();
+ return MemOperand->isStore() &&
+ MemOperand->getPseudoValue() &&
+ MemOperand->getPseudoValue()->kind() == PseudoSourceValue::FixedStack
+ && !MemOperand->getPseudoValue()->isAliased(MFI);
+ }
+
+ std::optional<LocIdx> findLocationForMemOperand(const MachineInstr &MI);
+};
+
+} // namespace LiveDebugValues
+
+#endif /* LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_INSTRREFBASEDLDV_H */
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
new file mode 100644
index 000000000000..0c0a4e13c7c9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
@@ -0,0 +1,139 @@
+//===- LiveDebugValues.cpp - Tracking Debug Value MIs ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "LiveDebugValues.h"
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+
+/// \file LiveDebugValues.cpp
+///
+/// The LiveDebugValues pass extends the range of variable locations
+/// (specified by DBG_VALUE instructions) from single blocks to successors
+/// and any other code locations where the variable location is valid.
+/// There are currently two implementations: the "VarLoc" implementation
+/// explicitly tracks the location of a variable, while the "InstrRef"
+/// implementation tracks the values defined by instructions through locations.
+///
+/// This file implements neither; it merely registers the pass and allows the
+/// user to pick which implementation will be used to propagate variable
+/// locations.
+
+#define DEBUG_TYPE "livedebugvalues"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ ForceInstrRefLDV("force-instr-ref-livedebugvalues", cl::Hidden,
+ cl::desc("Use instruction-ref based LiveDebugValues with "
+ "normal DBG_VALUE inputs"),
+ cl::init(false));
+
+static cl::opt<cl::boolOrDefault> ValueTrackingVariableLocations(
+ "experimental-debug-variable-locations",
+ cl::desc("Use experimental new value-tracking variable locations"));
+
+// Options to prevent pathological compile-time behavior. If InputBBLimit and
+// InputDbgValueLimit are both exceeded, range extension is disabled.
+static cl::opt<unsigned> InputBBLimit(
+ "livedebugvalues-input-bb-limit",
+ cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"),
+ cl::init(10000), cl::Hidden);
+static cl::opt<unsigned> InputDbgValueLimit(
+ "livedebugvalues-input-dbg-value-limit",
+ cl::desc(
+ "Maximum input DBG_VALUE insts supported by debug range extension"),
+ cl::init(50000), cl::Hidden);
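+
+// For illustration only: as cl::opts, these knobs can be set on an LLVM tool's
+// command line (or via clang's -mllvm), e.g. a hypothetical invocation:
+//   llc -force-instr-ref-livedebugvalues -livedebugvalues-input-bb-limit=5000 foo.ll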
+
+namespace {
+/// Generic LiveDebugValues pass. Calls through to VarLocBasedLDV or
+/// InstrRefBasedLDV to perform location propagation, via the LDVImpl
+/// base class.
+class LiveDebugValues : public MachineFunctionPass {
+public:
+ static char ID;
+
+ LiveDebugValues();
+ ~LiveDebugValues() = default;
+
+ /// Calculate the liveness information for the given machine function.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ std::unique_ptr<LDVImpl> InstrRefImpl;
+ std::unique_ptr<LDVImpl> VarLocImpl;
+ TargetPassConfig *TPC = nullptr;
+ MachineDominatorTree MDT;
+};
+} // namespace
+
+char LiveDebugValues::ID = 0;
+
+char &llvm::LiveDebugValuesID = LiveDebugValues::ID;
+
+INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", false,
+ false)
+
+/// Default construct and initialize the pass.
+LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
+ initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry());
+ InstrRefImpl =
+ std::unique_ptr<LDVImpl>(llvm::makeInstrRefBasedLiveDebugValues());
+ VarLocImpl = std::unique_ptr<LDVImpl>(llvm::makeVarLocBasedLiveDebugValues());
+}
+
+bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
+  // Except for Wasm, all targets should be using only physical registers at
+  // this point. Wasm only uses virtual registers throughout its pipeline, but
+  // its virtual registers don't participate in this LiveDebugValues analysis;
+  // only its target indices do.
+ assert(MF.getTarget().getTargetTriple().isWasm() ||
+ MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoVRegs));
+
+ bool InstrRefBased = MF.useDebugInstrRef();
+ // Allow the user to force selection of InstrRef LDV.
+ InstrRefBased |= ForceInstrRefLDV;
+
+ TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ LDVImpl *TheImpl = &*VarLocImpl;
+
+ MachineDominatorTree *DomTree = nullptr;
+ if (InstrRefBased) {
+ DomTree = &MDT;
+ MDT.calculate(MF);
+ TheImpl = &*InstrRefImpl;
+ }
+
+ return TheImpl->ExtendRanges(MF, DomTree, TPC, InputBBLimit,
+ InputDbgValueLimit);
+}
+
+bool llvm::debuginfoShouldUseDebugInstrRef(const Triple &T) {
+ // Enable by default on x86_64, disable if explicitly turned off on cmdline.
+ if (T.getArch() == llvm::Triple::x86_64 &&
+ ValueTrackingVariableLocations != cl::boolOrDefault::BOU_FALSE)
+ return true;
+
+ // Enable if explicitly requested on command line.
+ return ValueTrackingVariableLocations == cl::boolOrDefault::BOU_TRUE;
+}
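+
+// Illustrative note: per the logic above, a triple such as
+// x86_64-unknown-linux-gnu gets instruction referencing by default (unless the
+// flag is explicitly set to false), while other architectures use it only when
+// the flag is explicitly set to true.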
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
new file mode 100644
index 000000000000..6cc1685c0022
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
@@ -0,0 +1,43 @@
+//===- LiveDebugValues.cpp - Tracking Debug Value MIs ---------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H
+#define LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H
+
+namespace llvm {
+class MachineDominatorTree;
+class MachineFunction;
+class TargetPassConfig;
+class Triple;
+
+// Inline namespace for types / symbols shared between different
+// LiveDebugValues implementations.
+inline namespace SharedLiveDebugValues {
+
+// Expose a base class for LiveDebugValues interfaces to inherit from. This
+// allows the generic LiveDebugValues pass to call into the chosen
+// implementation.
+class LDVImpl {
+public:
+ virtual bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC, unsigned InputBBLimit,
+ unsigned InputDbgValLimit) = 0;
+ virtual ~LDVImpl() = default;
+};
+
+} // namespace SharedLiveDebugValues
+
+// Factory functions for LiveDebugValues implementations.
+extern LDVImpl *makeVarLocBasedLiveDebugValues();
+extern LDVImpl *makeInstrRefBasedLiveDebugValues();
+
+extern bool debuginfoShouldUseDebugInstrRef(const Triple &T);
+
+} // namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
new file mode 100644
index 000000000000..116c6b7e2d19
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
@@ -0,0 +1,2405 @@
+//===- VarLocBasedImpl.cpp - Tracking Debug Value MIs with VarLoc class----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file VarLocBasedImpl.cpp
+///
+/// LiveDebugValues is an optimistic "available expressions" dataflow
+/// algorithm. The set of expressions is the set of machine locations
+/// (registers, spill slots, constants, and target indices) that a variable
+/// fragment might be located in, qualified by a DIExpression and indirect-ness
+/// flag, while each variable is identified by a DebugVariable object. The
+/// availability of an expression begins when a DBG_VALUE instruction specifies
+/// the location of a DebugVariable, and continues until that location is
+/// clobbered or re-specified by a different DBG_VALUE for the same
+/// DebugVariable.
+///
+/// The output of LiveDebugValues is additional DBG_VALUE instructions,
+/// placed to extend variable locations as far as they're available. This file
+/// and the VarLocBasedLDV class are an implementation that explicitly tracks
+/// locations, using the VarLoc class.
+///
+/// The canonical "available expressions" problem doesn't have expression
+/// clobbering, instead when a variable is re-assigned, any expressions using
+/// that variable get invalidated. LiveDebugValues can map onto "available
+/// expressions" by having every register represented by a variable, which is
+/// used in an expression that becomes available at a DBG_VALUE instruction.
+/// When the register is clobbered, its variable is effectively reassigned, and
+/// expressions computed from it become unavailable. A similar construct is
+/// needed when a DebugVariable has its location re-specified, to invalidate
+/// all other locations for that DebugVariable.
+///
+/// Using the dataflow analysis to compute the available expressions, we create
+/// a DBG_VALUE at the beginning of each block where the expression is
+/// live-in. This propagates variable locations into every basic block where
+/// the location can be determined, rather than only having DBG_VALUEs in blocks
+/// where locations are specified due to an assignment or some optimization.
+/// Movements of values between registers and spill slots are annotated with
+/// DBG_VALUEs as well, to track variable values between locations. All this
+/// allows
+/// DbgEntityHistoryCalculator to focus on only the locations within individual
+/// blocks, facilitating testing and improving modularity.
+///
+/// We follow an optimistic dataflow approach, with this lattice:
+///
+/// \verbatim
+/// ┬ "Unknown"
+/// |
+/// v
+/// True
+/// |
+/// v
+/// ⊥ False
+/// \endverbatim Here "True" signifies that the expression is available (and
+/// thus that a DebugVariable's location is the corresponding register), while
+/// "False" signifies that the expression is unavailable. "Unknown"s never
+/// survive to the end of the analysis (see below).
+///
+/// Formally, all DebugVariable locations that are live-out of a block are
+/// initialized to \top. A block's live-in values take the meet of all its
+/// predecessors' live-out values, except for the entry block, where
+/// all live-ins are \bot. The usual dataflow propagation occurs: the transfer
+/// function for a block assigns an expression for a DebugVariable to be "True"
+/// if a DBG_VALUE in the block specifies it; "False" if the location is
+/// clobbered; or the live-in value if it is unaffected by the block. We
+/// visit each block in reverse post-order until a fixed point is reached. The
+/// solution produced is maximal.
+///
+/// Intuitively, we start by assuming that every expression / variable location
+/// is at least "True", and then propagate "False" from the entry block and any
+/// clobbers until there are no more changes to make. This gives us an accurate
+/// solution because all incorrect locations will have a "False" propagated into
+/// them. It also gives us a solution that copes well with loops by assuming
+/// that variable locations are live-through every loop, and then using the
+/// dataflow analysis to remove those that are not.
+///
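+/// As a small worked example (schematic; the register and block names here
+/// are hypothetical): suppose the entry block states that variable "x" lives
+/// in $rdi and then branches to bb.1 and bb.2, which both join at bb.3. If
+/// neither branch clobbers $rdi, the expression {x -> $rdi} is "True" on both
+/// edges into bb.3, the meet is "True", and a DBG_VALUE placing x in $rdi is
+/// emitted at the start of bb.1, bb.2 and bb.3. If instead bb.2 redefines
+/// $rdi, its live-out for {x -> $rdi} becomes "False", the meet at bb.3 is
+/// "False", and no DBG_VALUE is emitted there.
+///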
+/// Within LiveDebugValues: each variable location is represented by a
+/// VarLoc object that identifies the source variable, the set of
+/// machine-locations that currently describe it (a single location for
+/// DBG_VALUE or multiple for DBG_VALUE_LIST), and the DBG_VALUE inst that
+/// specifies the location. Each VarLoc is indexed in the (function-scope) \p
+/// VarLocMap, giving each VarLoc a set of unique indexes, each of which
+/// corresponds to one of the VarLoc's machine-locations and can be used to
+/// lookup the VarLoc in the VarLocMap. Rather than operate directly on machine
+/// locations, the dataflow analysis in this pass identifies locations by their
+/// indices in the VarLocMap, meaning all the variable locations in a block can
+/// be described by a sparse vector of VarLocMap indices.
+///
+/// All the storage for the dataflow analysis is local to the ExtendRanges
+/// method and passed down to helper methods. "OutLocs" and "InLocs" record the
+/// in and out lattice values for each block. "OpenRanges" maintains a list of
+/// variable locations and, with the "process" method, evaluates the transfer
+/// function of each block. "flushPendingLocs" installs debug value instructions
+/// for each live-in location at the start of blocks, while "Transfers" records
+/// transfers of values between machine-locations.
+///
+/// We avoid explicitly representing the "Unknown" (\top) lattice value in the
+/// implementation. Instead, unvisited blocks implicitly have all lattice
+/// values set as "Unknown". After being visited, there will be a path back to
+/// the entry block where the lattice value is "False", and as the transfer
+/// function cannot make new "Unknown" locations, there are no scenarios where
+/// a block can have an "Unknown" location after being visited. Similarly, we
+/// don't enumerate all possible variable locations before exploring the
+/// function: when a new location is discovered, all blocks previously explored
+/// were implicitly "False" but unrecorded, and become explicitly "False" when
+/// a new VarLoc is created with its bit not set in predecessor InLocs or
+/// OutLocs.
+///
+//===----------------------------------------------------------------------===//
+
+#include "LiveDebugValues.h"
+
+#include "llvm/ADT/CoalescingBitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/TypeSize.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <optional>
+#include <queue>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "livedebugvalues"
+
+STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
+
+/// Return true if \p Op is a register other than the stack pointer or the
+/// frame pointer, otherwise return false. This is used to avoid basing the
+/// debug entry values on the registers, since we do not support it at the
+/// moment.
+static bool isRegOtherThanSPAndFP(const MachineOperand &Op,
+ const MachineInstr &MI,
+ const TargetRegisterInfo *TRI) {
+ if (!Op.isReg())
+ return false;
+
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ Register SP = TLI->getStackPointerRegisterToSaveRestore();
+ Register FP = TRI->getFrameRegister(*MF);
+ Register Reg = Op.getReg();
+
+ return Reg && Reg != SP && Reg != FP;
+}
+
+namespace {
+
+// Max out the number of statically allocated elements in DefinedRegsSet, as
+// this prevents fallback to std::set::count() operations.
+using DefinedRegsSet = SmallSet<Register, 32>;
+
+// The IDs in this set correspond to MachineLocs in VarLocs, as well as VarLocs
+// that represent Entry Values; every VarLoc in the set will also appear
+// exactly once at Location=0.
+// As a result, each VarLoc may appear more than once in this "set", but each
+// range corresponding to a Reg, SpillLoc, or EntryValue type will still be a
+// "true" set (i.e. each VarLoc may appear only once), and the range Location=0
+// is the set of all VarLocs.
+using VarLocSet = CoalescingBitVector<uint64_t>;
+
+/// A type-checked pair of {Register Location (or 0), Index}, used to index
+/// into a \ref VarLocMap. This can be efficiently converted to a 64-bit int
+/// for insertion into a \ref VarLocSet, and efficiently converted back. The
+/// type-checker helps ensure that the conversions aren't lossy.
+///
+/// Why encode a location /into/ the VarLocMap index? This makes it possible
+/// to find the open VarLocs killed by a register def very quickly. This is a
+/// performance-critical operation for LiveDebugValues.
+struct LocIndex {
+ using u32_location_t = uint32_t;
+ using u32_index_t = uint32_t;
+
+ u32_location_t Location; // Physical registers live in the range [1;2^30) (see
+ // \ref MCRegister), so we have plenty of range left
+ // here to encode non-register locations.
+ u32_index_t Index;
+
+ /// The location that has an entry for every VarLoc in the map.
+ static constexpr u32_location_t kUniversalLocation = 0;
+
+ /// The first location that is reserved for VarLocs with locations of kind
+ /// RegisterKind.
+ static constexpr u32_location_t kFirstRegLocation = 1;
+
+ /// The first location greater than 0 that is not reserved for VarLocs with
+ /// locations of kind RegisterKind.
+ static constexpr u32_location_t kFirstInvalidRegLocation = 1 << 30;
+
+ /// A special location reserved for VarLocs with locations of kind
+ /// SpillLocKind.
+ static constexpr u32_location_t kSpillLocation = kFirstInvalidRegLocation;
+
+ /// A special location reserved for VarLocs of kind EntryValueBackupKind and
+ /// EntryValueCopyBackupKind.
+ static constexpr u32_location_t kEntryValueBackupLocation =
+ kFirstInvalidRegLocation + 1;
+
+ /// A special location reserved for VarLocs with locations of kind
+ /// WasmLocKind.
+ /// TODO Placing all Wasm target index locations in this single kWasmLocation
+ /// may cause slowdown in compilation time in very large functions. Consider
+  /// giving each target index/offset pair its own u32_location_t if this
+ /// becomes a problem.
+ static constexpr u32_location_t kWasmLocation = kFirstInvalidRegLocation + 2;
+
+ LocIndex(u32_location_t Location, u32_index_t Index)
+ : Location(Location), Index(Index) {}
+
+ uint64_t getAsRawInteger() const {
+ return (static_cast<uint64_t>(Location) << 32) | Index;
+ }
+
+ template<typename IntT> static LocIndex fromRawInteger(IntT ID) {
+ static_assert(std::is_unsigned_v<IntT> && sizeof(ID) == sizeof(uint64_t),
+ "Cannot convert raw integer to LocIndex");
+ return {static_cast<u32_location_t>(ID >> 32),
+ static_cast<u32_index_t>(ID)};
+ }
+
+ /// Get the start of the interval reserved for VarLocs of kind RegisterKind
+ /// which reside in \p Reg. The end is at rawIndexForReg(Reg+1)-1.
+ static uint64_t rawIndexForReg(Register Reg) {
+ return LocIndex(Reg, 0).getAsRawInteger();
+ }
+
+ /// Return a range covering all set indices in the interval reserved for
+ /// \p Location in \p Set.
+ static auto indexRangeForLocation(const VarLocSet &Set,
+ u32_location_t Location) {
+ uint64_t Start = LocIndex(Location, 0).getAsRawInteger();
+ uint64_t End = LocIndex(Location + 1, 0).getAsRawInteger();
+ return Set.half_open_range(Start, End);
+ }
+};
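+
+// Worked example of the encoding above (register and index values chosen
+// arbitrarily for illustration): a VarLoc stored at index 7 of the bucket for
+// physical register 5 is identified by LocIndex(5, 7), whose raw form is
+//   (uint64_t(5) << 32) | 7 == 0x0000000500000007,
+// and LocIndex::fromRawInteger(0x0000000500000007) recovers
+// {Location = 5, Index = 7}. Because the location occupies the high 32 bits,
+// every raw ID for register 5 falls in the half-open interval
+// [rawIndexForReg(5), rawIndexForReg(6)), which is exactly what
+// indexRangeForLocation() iterates over.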
+
+// Simple Set for storing all the VarLoc Indices at a Location bucket.
+using VarLocsInRange = SmallSet<LocIndex::u32_index_t, 32>;
+// Vector of all `LocIndex`s for a given VarLoc; the same Location should not
+// appear in any two of these, as each VarLoc appears at most once in any
+// Location bucket.
+using LocIndices = SmallVector<LocIndex, 2>;
+
+class VarLocBasedLDV : public LDVImpl {
+private:
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ const TargetFrameLowering *TFI;
+ TargetPassConfig *TPC;
+ BitVector CalleeSavedRegs;
+ LexicalScopes LS;
+ VarLocSet::Allocator Alloc;
+
+ const MachineInstr *LastNonDbgMI;
+
+ enum struct TransferKind { TransferCopy, TransferSpill, TransferRestore };
+
+ using FragmentInfo = DIExpression::FragmentInfo;
+ using OptFragmentInfo = std::optional<DIExpression::FragmentInfo>;
+
+ /// A pair of debug variable and value location.
+ struct VarLoc {
+ // The location at which a spilled variable resides. It consists of a
+ // register and an offset.
+ struct SpillLoc {
+ unsigned SpillBase;
+ StackOffset SpillOffset;
+ bool operator==(const SpillLoc &Other) const {
+ return SpillBase == Other.SpillBase && SpillOffset == Other.SpillOffset;
+ }
+ bool operator!=(const SpillLoc &Other) const {
+ return !(*this == Other);
+ }
+ };
+
+ // Target indices used for wasm-specific locations.
+ struct WasmLoc {
+ // One of TargetIndex values defined in WebAssembly.h. We deal with
+ // local-related TargetIndex in this analysis (TI_LOCAL and
+ // TI_LOCAL_INDIRECT). Stack operands (TI_OPERAND_STACK) will be handled
+ // separately WebAssemblyDebugFixup pass, and we don't associate debug
+ // info with values in global operands (TI_GLOBAL_RELOC) at the moment.
+ int Index;
+ int64_t Offset;
+ bool operator==(const WasmLoc &Other) const {
+ return Index == Other.Index && Offset == Other.Offset;
+ }
+ bool operator!=(const WasmLoc &Other) const { return !(*this == Other); }
+ };
+
+ /// Identity of the variable at this location.
+ const DebugVariable Var;
+
+ /// The expression applied to this location.
+ const DIExpression *Expr;
+
+ /// DBG_VALUE to clone var/expr information from if this location
+ /// is moved.
+ const MachineInstr &MI;
+
+ enum class MachineLocKind {
+ InvalidKind = 0,
+ RegisterKind,
+ SpillLocKind,
+ ImmediateKind,
+ WasmLocKind
+ };
+
+ enum class EntryValueLocKind {
+ NonEntryValueKind = 0,
+ EntryValueKind,
+ EntryValueBackupKind,
+ EntryValueCopyBackupKind
+ } EVKind = EntryValueLocKind::NonEntryValueKind;
+
+ /// The value location. Stored separately to avoid repeatedly
+ /// extracting it from MI.
+ union MachineLocValue {
+ uint64_t RegNo;
+ SpillLoc SpillLocation;
+ uint64_t Hash;
+ int64_t Immediate;
+ const ConstantFP *FPImm;
+ const ConstantInt *CImm;
+ WasmLoc WasmLocation;
+ MachineLocValue() : Hash(0) {}
+ };
+
+ /// A single machine location; its Kind is either a register, spill
+ /// location, or immediate value.
+    /// If the VarLoc is an entry value kind (i.e. not NonEntryValueKind), it
+    /// will use only a single MachineLoc of RegisterKind.
+ struct MachineLoc {
+ MachineLocKind Kind;
+ MachineLocValue Value;
+ bool operator==(const MachineLoc &Other) const {
+ if (Kind != Other.Kind)
+ return false;
+ switch (Kind) {
+ case MachineLocKind::SpillLocKind:
+ return Value.SpillLocation == Other.Value.SpillLocation;
+ case MachineLocKind::WasmLocKind:
+ return Value.WasmLocation == Other.Value.WasmLocation;
+ case MachineLocKind::RegisterKind:
+ case MachineLocKind::ImmediateKind:
+ return Value.Hash == Other.Value.Hash;
+ default:
+ llvm_unreachable("Invalid kind");
+ }
+ }
+ bool operator<(const MachineLoc &Other) const {
+ switch (Kind) {
+ case MachineLocKind::SpillLocKind:
+ return std::make_tuple(
+ Kind, Value.SpillLocation.SpillBase,
+ Value.SpillLocation.SpillOffset.getFixed(),
+ Value.SpillLocation.SpillOffset.getScalable()) <
+ std::make_tuple(
+ Other.Kind, Other.Value.SpillLocation.SpillBase,
+ Other.Value.SpillLocation.SpillOffset.getFixed(),
+ Other.Value.SpillLocation.SpillOffset.getScalable());
+ case MachineLocKind::WasmLocKind:
+ return std::make_tuple(Kind, Value.WasmLocation.Index,
+ Value.WasmLocation.Offset) <
+ std::make_tuple(Other.Kind, Other.Value.WasmLocation.Index,
+ Other.Value.WasmLocation.Offset);
+ case MachineLocKind::RegisterKind:
+ case MachineLocKind::ImmediateKind:
+ return std::tie(Kind, Value.Hash) <
+ std::tie(Other.Kind, Other.Value.Hash);
+ default:
+ llvm_unreachable("Invalid kind");
+ }
+ }
+ };
+
+ /// The set of machine locations used to determine the variable's value, in
+ /// conjunction with Expr. Initially populated with MI's debug operands,
+ /// but may be transformed independently afterwards.
+ SmallVector<MachineLoc, 8> Locs;
+ /// Used to map the index of each location in Locs back to the index of its
+ /// original debug operand in MI. Used when multiple location operands are
+ /// coalesced and the original MI's operands need to be accessed while
+ /// emitting a debug value.
+ SmallVector<unsigned, 8> OrigLocMap;
+
+ VarLoc(const MachineInstr &MI)
+ : Var(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt()),
+ Expr(MI.getDebugExpression()), MI(MI) {
+ assert(MI.isDebugValue() && "not a DBG_VALUE");
+ assert((MI.isDebugValueList() || MI.getNumOperands() == 4) &&
+ "malformed DBG_VALUE");
+ for (const MachineOperand &Op : MI.debug_operands()) {
+ MachineLoc ML = GetLocForOp(Op);
+ auto It = find(Locs, ML);
+ if (It == Locs.end()) {
+ Locs.push_back(ML);
+ OrigLocMap.push_back(MI.getDebugOperandIndex(&Op));
+ } else {
+ // ML duplicates an element in Locs; replace references to Op
+ // with references to the duplicating element.
+ unsigned OpIdx = Locs.size();
+ unsigned DuplicatingIdx = std::distance(Locs.begin(), It);
+ Expr = DIExpression::replaceArg(Expr, OpIdx, DuplicatingIdx);
+ }
+ }
+
+ // We create the debug entry values from the factory functions rather
+ // than from this ctor.
+ assert(EVKind != EntryValueLocKind::EntryValueKind &&
+ !isEntryBackupLoc());
+ }
+
+ static MachineLoc GetLocForOp(const MachineOperand &Op) {
+ MachineLocKind Kind;
+ MachineLocValue Loc;
+ if (Op.isReg()) {
+ Kind = MachineLocKind::RegisterKind;
+ Loc.RegNo = Op.getReg();
+ } else if (Op.isImm()) {
+ Kind = MachineLocKind::ImmediateKind;
+ Loc.Immediate = Op.getImm();
+ } else if (Op.isFPImm()) {
+ Kind = MachineLocKind::ImmediateKind;
+ Loc.FPImm = Op.getFPImm();
+ } else if (Op.isCImm()) {
+ Kind = MachineLocKind::ImmediateKind;
+ Loc.CImm = Op.getCImm();
+ } else if (Op.isTargetIndex()) {
+ Kind = MachineLocKind::WasmLocKind;
+ Loc.WasmLocation = {Op.getIndex(), Op.getOffset()};
+ } else
+ llvm_unreachable("Invalid Op kind for MachineLoc.");
+ return {Kind, Loc};
+ }
+
+ /// Take the variable and machine-location in DBG_VALUE MI, and build an
+ /// entry location using the given expression.
+ static VarLoc CreateEntryLoc(const MachineInstr &MI,
+ const DIExpression *EntryExpr, Register Reg) {
+ VarLoc VL(MI);
+ assert(VL.Locs.size() == 1 &&
+ VL.Locs[0].Kind == MachineLocKind::RegisterKind);
+ VL.EVKind = EntryValueLocKind::EntryValueKind;
+ VL.Expr = EntryExpr;
+ VL.Locs[0].Value.RegNo = Reg;
+ return VL;
+ }
+
+ /// Take the variable and machine-location from the DBG_VALUE (from the
+ /// function entry), and build an entry value backup location. The backup
+ /// location will turn into the normal location if the backup is valid at
+ /// the time of the primary location clobbering.
+ static VarLoc CreateEntryBackupLoc(const MachineInstr &MI,
+ const DIExpression *EntryExpr) {
+ VarLoc VL(MI);
+ assert(VL.Locs.size() == 1 &&
+ VL.Locs[0].Kind == MachineLocKind::RegisterKind);
+ VL.EVKind = EntryValueLocKind::EntryValueBackupKind;
+ VL.Expr = EntryExpr;
+ return VL;
+ }
+
+ /// Take the variable and machine-location from the DBG_VALUE (from the
+ /// function entry), and build a copy of an entry value backup location by
+ /// setting the register location to NewReg.
+ static VarLoc CreateEntryCopyBackupLoc(const MachineInstr &MI,
+ const DIExpression *EntryExpr,
+ Register NewReg) {
+ VarLoc VL(MI);
+ assert(VL.Locs.size() == 1 &&
+ VL.Locs[0].Kind == MachineLocKind::RegisterKind);
+ VL.EVKind = EntryValueLocKind::EntryValueCopyBackupKind;
+ VL.Expr = EntryExpr;
+ VL.Locs[0].Value.RegNo = NewReg;
+ return VL;
+ }
+
+ /// Copy the register location in DBG_VALUE MI, updating the register to
+ /// be NewReg.
+ static VarLoc CreateCopyLoc(const VarLoc &OldVL, const MachineLoc &OldML,
+ Register NewReg) {
+ VarLoc VL = OldVL;
+ for (MachineLoc &ML : VL.Locs)
+ if (ML == OldML) {
+ ML.Kind = MachineLocKind::RegisterKind;
+ ML.Value.RegNo = NewReg;
+ return VL;
+ }
+ llvm_unreachable("Should have found OldML in new VarLoc.");
+ }
+
+ /// Take the variable described by DBG_VALUE* MI, and create a VarLoc
+ /// locating it in the specified spill location.
+ static VarLoc CreateSpillLoc(const VarLoc &OldVL, const MachineLoc &OldML,
+ unsigned SpillBase, StackOffset SpillOffset) {
+ VarLoc VL = OldVL;
+ for (MachineLoc &ML : VL.Locs)
+ if (ML == OldML) {
+ ML.Kind = MachineLocKind::SpillLocKind;
+ ML.Value.SpillLocation = {SpillBase, SpillOffset};
+ return VL;
+ }
+ llvm_unreachable("Should have found OldML in new VarLoc.");
+ }
+
+ /// Create a DBG_VALUE representing this VarLoc in the given function.
+ /// Copies variable-specific information such as DILocalVariable and
+ /// inlining information from the original DBG_VALUE instruction, which may
+ /// have been several transfers ago.
+ MachineInstr *BuildDbgValue(MachineFunction &MF) const {
+ assert(!isEntryBackupLoc() &&
+ "Tried to produce DBG_VALUE for backup VarLoc");
+ const DebugLoc &DbgLoc = MI.getDebugLoc();
+ bool Indirect = MI.isIndirectDebugValue();
+ const auto &IID = MI.getDesc();
+ const DILocalVariable *Var = MI.getDebugVariable();
+ NumInserted++;
+
+ const DIExpression *DIExpr = Expr;
+ SmallVector<MachineOperand, 8> MOs;
+ for (unsigned I = 0, E = Locs.size(); I < E; ++I) {
+ MachineLocKind LocKind = Locs[I].Kind;
+ MachineLocValue Loc = Locs[I].Value;
+ const MachineOperand &Orig = MI.getDebugOperand(OrigLocMap[I]);
+ switch (LocKind) {
+ case MachineLocKind::RegisterKind:
+ // An entry value is a register location -- but with an updated
+ // expression. The register location of such DBG_VALUE is always the
+        // one from the entry DBG_VALUE; it does not matter if the entry value
+        // was copied into another register due to some optimizations.
+ // Non-entry value register locations are like the source
+ // DBG_VALUE, but with the register number from this VarLoc.
+ MOs.push_back(MachineOperand::CreateReg(
+ EVKind == EntryValueLocKind::EntryValueKind ? Orig.getReg()
+ : Register(Loc.RegNo),
+ false));
+ break;
+ case MachineLocKind::SpillLocKind: {
+ // Spills are indirect DBG_VALUEs, with a base register and offset.
+        // Use the original DBG_VALUE's expression to build the spill location
+ // on top of. FIXME: spill locations created before this pass runs
+ // are not recognized, and not handled here.
+ unsigned Base = Loc.SpillLocation.SpillBase;
+ auto *TRI = MF.getSubtarget().getRegisterInfo();
+ if (MI.isNonListDebugValue()) {
+ auto Deref = Indirect ? DIExpression::DerefAfter : 0;
+ DIExpr = TRI->prependOffsetExpression(
+ DIExpr, DIExpression::ApplyOffset | Deref,
+ Loc.SpillLocation.SpillOffset);
+ Indirect = true;
+ } else {
+ SmallVector<uint64_t, 4> Ops;
+ TRI->getOffsetOpcodes(Loc.SpillLocation.SpillOffset, Ops);
+ Ops.push_back(dwarf::DW_OP_deref);
+ DIExpr = DIExpression::appendOpsToArg(DIExpr, Ops, I);
+ }
+ MOs.push_back(MachineOperand::CreateReg(Base, false));
+ break;
+ }
+ case MachineLocKind::ImmediateKind: {
+ MOs.push_back(Orig);
+ break;
+ }
+ case MachineLocKind::WasmLocKind: {
+ MOs.push_back(Orig);
+ break;
+ }
+ case MachineLocKind::InvalidKind:
+ llvm_unreachable("Tried to produce DBG_VALUE for invalid VarLoc");
+ }
+ }
+ return BuildMI(MF, DbgLoc, IID, Indirect, MOs, Var, DIExpr);
+ }
+
+ /// Is the Loc field a constant or constant object?
+ bool isConstant(MachineLocKind Kind) const {
+ return Kind == MachineLocKind::ImmediateKind;
+ }
+
+ /// Check if the Loc field is an entry backup location.
+ bool isEntryBackupLoc() const {
+ return EVKind == EntryValueLocKind::EntryValueBackupKind ||
+ EVKind == EntryValueLocKind::EntryValueCopyBackupKind;
+ }
+
+ /// If this variable is described by register \p Reg holding the entry
+ /// value, return true.
+ bool isEntryValueBackupReg(Register Reg) const {
+ return EVKind == EntryValueLocKind::EntryValueBackupKind && usesReg(Reg);
+ }
+
+ /// If this variable is described by register \p Reg holding a copy of the
+ /// entry value, return true.
+ bool isEntryValueCopyBackupReg(Register Reg) const {
+ return EVKind == EntryValueLocKind::EntryValueCopyBackupKind &&
+ usesReg(Reg);
+ }
+
+ /// If this variable is described in whole or part by \p Reg, return true.
+ bool usesReg(Register Reg) const {
+ MachineLoc RegML;
+ RegML.Kind = MachineLocKind::RegisterKind;
+ RegML.Value.RegNo = Reg;
+ return is_contained(Locs, RegML);
+ }
+
+    /// Return the index of the MachineLoc in Locs that uses register \p Reg.
+ unsigned getRegIdx(Register Reg) const {
+ for (unsigned Idx = 0; Idx < Locs.size(); ++Idx)
+ if (Locs[Idx].Kind == MachineLocKind::RegisterKind &&
+ Register{static_cast<unsigned>(Locs[Idx].Value.RegNo)} == Reg)
+ return Idx;
+ llvm_unreachable("Could not find given Reg in Locs");
+ }
+
+ /// If this variable is described in whole or part by 1 or more registers,
+ /// add each of them to \p Regs and return true.
+ bool getDescribingRegs(SmallVectorImpl<uint32_t> &Regs) const {
+ bool AnyRegs = false;
+ for (const auto &Loc : Locs)
+ if (Loc.Kind == MachineLocKind::RegisterKind) {
+ Regs.push_back(Loc.Value.RegNo);
+ AnyRegs = true;
+ }
+ return AnyRegs;
+ }
+
+ bool containsSpillLocs() const {
+ return any_of(Locs, [](VarLoc::MachineLoc ML) {
+ return ML.Kind == VarLoc::MachineLocKind::SpillLocKind;
+ });
+ }
+
+ /// If this variable is described in whole or part by \p SpillLocation,
+ /// return true.
+ bool usesSpillLoc(SpillLoc SpillLocation) const {
+ MachineLoc SpillML;
+ SpillML.Kind = MachineLocKind::SpillLocKind;
+ SpillML.Value.SpillLocation = SpillLocation;
+ return is_contained(Locs, SpillML);
+ }
+
+ /// If this variable is described in whole or part by \p SpillLocation,
+    /// return the index of the corresponding MachineLoc in Locs.
+ unsigned getSpillLocIdx(SpillLoc SpillLocation) const {
+ for (unsigned Idx = 0; Idx < Locs.size(); ++Idx)
+ if (Locs[Idx].Kind == MachineLocKind::SpillLocKind &&
+ Locs[Idx].Value.SpillLocation == SpillLocation)
+ return Idx;
+ llvm_unreachable("Could not find given SpillLoc in Locs");
+ }
+
+ bool containsWasmLocs() const {
+ return any_of(Locs, [](VarLoc::MachineLoc ML) {
+ return ML.Kind == VarLoc::MachineLocKind::WasmLocKind;
+ });
+ }
+
+ /// If this variable is described in whole or part by \p WasmLocation,
+ /// return true.
+ bool usesWasmLoc(WasmLoc WasmLocation) const {
+ MachineLoc WasmML;
+ WasmML.Kind = MachineLocKind::WasmLocKind;
+ WasmML.Value.WasmLocation = WasmLocation;
+ return is_contained(Locs, WasmML);
+ }
+
+ /// Determine whether the lexical scope of this value's debug location
+ /// dominates MBB.
+ bool dominates(LexicalScopes &LS, MachineBasicBlock &MBB) const {
+ return LS.dominates(MI.getDebugLoc().get(), &MBB);
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ // TRI and TII can be null.
+ void dump(const TargetRegisterInfo *TRI, const TargetInstrInfo *TII,
+ raw_ostream &Out = dbgs()) const {
+ Out << "VarLoc(";
+ for (const MachineLoc &MLoc : Locs) {
+ if (Locs.begin() != &MLoc)
+ Out << ", ";
+ switch (MLoc.Kind) {
+ case MachineLocKind::RegisterKind:
+ Out << printReg(MLoc.Value.RegNo, TRI);
+ break;
+ case MachineLocKind::SpillLocKind:
+ Out << printReg(MLoc.Value.SpillLocation.SpillBase, TRI);
+ Out << "[" << MLoc.Value.SpillLocation.SpillOffset.getFixed() << " + "
+ << MLoc.Value.SpillLocation.SpillOffset.getScalable()
+ << "x vscale"
+ << "]";
+ break;
+ case MachineLocKind::ImmediateKind:
+ Out << MLoc.Value.Immediate;
+ break;
+ case MachineLocKind::WasmLocKind: {
+ if (TII) {
+ auto Indices = TII->getSerializableTargetIndices();
+ auto Found =
+ find_if(Indices, [&](const std::pair<int, const char *> &I) {
+ return I.first == MLoc.Value.WasmLocation.Index;
+ });
+ assert(Found != Indices.end());
+ Out << Found->second;
+ if (MLoc.Value.WasmLocation.Offset > 0)
+ Out << " + " << MLoc.Value.WasmLocation.Offset;
+ } else {
+ Out << "WasmLoc";
+ }
+ break;
+ }
+ case MachineLocKind::InvalidKind:
+ llvm_unreachable("Invalid VarLoc in dump method");
+ }
+ }
+
+ Out << ", \"" << Var.getVariable()->getName() << "\", " << *Expr << ", ";
+ if (Var.getInlinedAt())
+ Out << "!" << Var.getInlinedAt()->getMetadataID() << ")\n";
+ else
+ Out << "(null))";
+
+ if (isEntryBackupLoc())
+ Out << " (backup loc)\n";
+ else
+ Out << "\n";
+ }
+#endif
+
+ bool operator==(const VarLoc &Other) const {
+ return std::tie(EVKind, Var, Expr, Locs) ==
+ std::tie(Other.EVKind, Other.Var, Other.Expr, Other.Locs);
+ }
+
+ /// This operator guarantees that VarLocs are sorted by Variable first.
+ bool operator<(const VarLoc &Other) const {
+ return std::tie(Var, EVKind, Locs, Expr) <
+ std::tie(Other.Var, Other.EVKind, Other.Locs, Other.Expr);
+ }
+ };
+
+#ifndef NDEBUG
+ using VarVec = SmallVector<VarLoc, 32>;
+#endif
+
+ /// VarLocMap is used for two things:
+ /// 1) Assigning LocIndices to a VarLoc. The LocIndices can be used to
+ /// virtually insert a VarLoc into a VarLocSet.
+ /// 2) Given a LocIndex, look up the unique associated VarLoc.
+ class VarLocMap {
+ /// Map a VarLoc to an index within the vector reserved for its location
+ /// within Loc2Vars.
+ std::map<VarLoc, LocIndices> Var2Indices;
+
+ /// Map a location to a vector which holds VarLocs which live in that
+ /// location.
+ SmallDenseMap<LocIndex::u32_location_t, std::vector<VarLoc>> Loc2Vars;
+
+ public:
+ /// Retrieve LocIndices for \p VL.
+ LocIndices insert(const VarLoc &VL) {
+ LocIndices &Indices = Var2Indices[VL];
+ // If Indices is not empty, VL is already in the map.
+ if (!Indices.empty())
+ return Indices;
+ SmallVector<LocIndex::u32_location_t, 4> Locations;
+ // LocIndices are determined by EVKind and MLs; each Register has a
+ // unique location, while all SpillLocs use a single bucket, and any EV
+ // VarLocs use only the Backup bucket or none at all (except the
+ // compulsory entry at the universal location index). LocIndices will
+ // always have an index at the universal location index as the last index.
+ if (VL.EVKind == VarLoc::EntryValueLocKind::NonEntryValueKind) {
+ VL.getDescribingRegs(Locations);
+ assert(all_of(Locations,
+ [](auto RegNo) {
+ return RegNo < LocIndex::kFirstInvalidRegLocation;
+ }) &&
+ "Physreg out of range?");
+ if (VL.containsSpillLocs())
+ Locations.push_back(LocIndex::kSpillLocation);
+ if (VL.containsWasmLocs())
+ Locations.push_back(LocIndex::kWasmLocation);
+ } else if (VL.EVKind != VarLoc::EntryValueLocKind::EntryValueKind) {
+ LocIndex::u32_location_t Loc = LocIndex::kEntryValueBackupLocation;
+ Locations.push_back(Loc);
+ }
+ Locations.push_back(LocIndex::kUniversalLocation);
+ for (LocIndex::u32_location_t Location : Locations) {
+ auto &Vars = Loc2Vars[Location];
+ Indices.push_back(
+ {Location, static_cast<LocIndex::u32_index_t>(Vars.size())});
+ Vars.push_back(VL);
+ }
+ return Indices;
+ }
+
+ LocIndices getAllIndices(const VarLoc &VL) const {
+ auto IndIt = Var2Indices.find(VL);
+ assert(IndIt != Var2Indices.end() && "VarLoc not tracked");
+ return IndIt->second;
+ }
+
+ /// Retrieve the unique VarLoc associated with \p ID.
+ const VarLoc &operator[](LocIndex ID) const {
+ auto LocIt = Loc2Vars.find(ID.Location);
+ assert(LocIt != Loc2Vars.end() && "Location not tracked");
+ return LocIt->second[ID.Index];
+ }
+ };
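+
+  // Worked example of the bucketing performed by insert() above (the register
+  // number is hypothetical): a non-entry-value VarLoc whose Locs contain
+  // register 5 and one spill slot receives three LocIndices: one in the
+  // bucket for register 5, one in the shared kSpillLocation bucket, and one
+  // in the kUniversalLocation bucket. The universal entry is always pushed
+  // last, so getAllIndices(VL).back() is the single ID to use when a caller
+  // needs exactly one index per VarLoc.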
+
+ using VarLocInMBB =
+ SmallDenseMap<const MachineBasicBlock *, std::unique_ptr<VarLocSet>>;
+ struct TransferDebugPair {
+ MachineInstr *TransferInst; ///< Instruction where this transfer occurs.
+ LocIndex LocationID; ///< Location number for the transfer dest.
+ };
+ using TransferMap = SmallVector<TransferDebugPair, 4>;
+ // Types for recording Entry Var Locations emitted by a single MachineInstr,
+  // as well as recording the MachineInstr which last defined a register.
+ using InstToEntryLocMap = std::multimap<const MachineInstr *, LocIndex>;
+ using RegDefToInstMap = DenseMap<Register, MachineInstr *>;
+
+ // Types for recording sets of variable fragments that overlap. For a given
+ // local variable, we record all other fragments of that variable that could
+ // overlap it, to reduce search time.
+ using FragmentOfVar =
+ std::pair<const DILocalVariable *, DIExpression::FragmentInfo>;
+ using OverlapMap =
+ DenseMap<FragmentOfVar, SmallVector<DIExpression::FragmentInfo, 1>>;
+
+ // Helper while building OverlapMap, a map of all fragments seen for a given
+ // DILocalVariable.
+ using VarToFragments =
+ DenseMap<const DILocalVariable *, SmallSet<FragmentInfo, 4>>;
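+
+  // For illustration (hypothetical fragments): if one DBG_VALUE describes
+  // variable "x" with !DIExpression(DW_OP_LLVM_fragment, 0, 32) and another
+  // with !DIExpression(DW_OP_LLVM_fragment, 16, 32), the fragments cover bits
+  // [0, 32) and [16, 48) and so overlap in [16, 32). OverlapMap records that
+  // overlap, so that ending the open range of one fragment can also end the
+  // open ranges of the fragments it overlaps.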
+
+ /// Collects all VarLocs from \p CollectFrom. Each unique VarLoc is added
+ /// to \p Collected once, in order of insertion into \p VarLocIDs.
+ static void collectAllVarLocs(SmallVectorImpl<VarLoc> &Collected,
+ const VarLocSet &CollectFrom,
+ const VarLocMap &VarLocIDs);
+
+ /// Get the registers which are used by VarLocs of kind RegisterKind tracked
+ /// by \p CollectFrom.
+ void getUsedRegs(const VarLocSet &CollectFrom,
+ SmallVectorImpl<Register> &UsedRegs) const;
+
+ /// This holds the working set of currently open ranges. For fast
+ /// access, this is done both as a set of VarLocIDs, and a map of
+ /// DebugVariable to recent VarLocID. Note that a DBG_VALUE ends all
+ /// previous open ranges for the same variable. In addition, we keep
+ /// two different maps (Vars/EntryValuesBackupVars), so erase/insert
+  /// methods act differently depending on whether a VarLoc is a primary
+  /// location or a backup one. If the VarLoc is a backup location, we
+  /// erase/insert from the EntryValuesBackupVars map; otherwise we perform
+  /// the operation on Vars.
+ class OpenRangesSet {
+ VarLocSet::Allocator &Alloc;
+ VarLocSet VarLocs;
+ // Map the DebugVariable to recent primary location ID.
+ SmallDenseMap<DebugVariable, LocIndices, 8> Vars;
+ // Map the DebugVariable to recent backup location ID.
+ SmallDenseMap<DebugVariable, LocIndices, 8> EntryValuesBackupVars;
+ OverlapMap &OverlappingFragments;
+
+ public:
+ OpenRangesSet(VarLocSet::Allocator &Alloc, OverlapMap &_OLapMap)
+ : Alloc(Alloc), VarLocs(Alloc), OverlappingFragments(_OLapMap) {}
+
+ const VarLocSet &getVarLocs() const { return VarLocs; }
+
+ // Fetches all VarLocs in \p VarLocIDs and inserts them into \p Collected.
+ // This method is needed to get every VarLoc once, as each VarLoc may have
+ // multiple indices in a VarLocMap (corresponding to each applicable
+ // location), but all VarLocs appear exactly once at the universal location
+ // index.
+ void getUniqueVarLocs(SmallVectorImpl<VarLoc> &Collected,
+ const VarLocMap &VarLocIDs) const {
+ collectAllVarLocs(Collected, VarLocs, VarLocIDs);
+ }
+
+ /// Terminate all open ranges for VL.Var by removing it from the set.
+ void erase(const VarLoc &VL);
+
+ /// Terminate all open ranges listed as indices in \c KillSet with
+ /// \c Location by removing them from the set.
+ void erase(const VarLocsInRange &KillSet, const VarLocMap &VarLocIDs,
+ LocIndex::u32_location_t Location);
+
+ /// Insert a new range into the set.
+ void insert(LocIndices VarLocIDs, const VarLoc &VL);
+
+ /// Insert a set of ranges.
+ void insertFromLocSet(const VarLocSet &ToLoad, const VarLocMap &Map);
+
+ std::optional<LocIndices> getEntryValueBackup(DebugVariable Var);
+
+ /// Empty the set.
+ void clear() {
+ VarLocs.clear();
+ Vars.clear();
+ EntryValuesBackupVars.clear();
+ }
+
+ /// Return whether the set is empty or not.
+ bool empty() const {
+ assert(Vars.empty() == EntryValuesBackupVars.empty() &&
+ Vars.empty() == VarLocs.empty() &&
+ "open ranges are inconsistent");
+ return VarLocs.empty();
+ }
+
+ /// Get an empty range of VarLoc IDs.
+ auto getEmptyVarLocRange() const {
+ return iterator_range<VarLocSet::const_iterator>(getVarLocs().end(),
+ getVarLocs().end());
+ }
+
+ /// Get all set IDs for VarLocs with MLs of kind RegisterKind in \p Reg.
+ auto getRegisterVarLocs(Register Reg) const {
+ return LocIndex::indexRangeForLocation(getVarLocs(), Reg);
+ }
+
+ /// Get all set IDs for VarLocs with MLs of kind SpillLocKind.
+ auto getSpillVarLocs() const {
+ return LocIndex::indexRangeForLocation(getVarLocs(),
+ LocIndex::kSpillLocation);
+ }
+
+ /// Get all set IDs for VarLocs of EVKind EntryValueBackupKind or
+ /// EntryValueCopyBackupKind.
+ auto getEntryValueBackupVarLocs() const {
+ return LocIndex::indexRangeForLocation(
+ getVarLocs(), LocIndex::kEntryValueBackupLocation);
+ }
+
+ /// Get all set IDs for VarLocs with MLs of kind WasmLocKind.
+ auto getWasmVarLocs() const {
+ return LocIndex::indexRangeForLocation(getVarLocs(),
+ LocIndex::kWasmLocation);
+ }
+ };
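+
+  // Sketch of the intended use of these queries, under the assumption that
+  // callers follow this pattern: when an instruction clobbers the registers
+  // in a DefinedRegsSet Regs, the affected open ranges can be gathered and
+  // closed with
+  //   VarLocsInRange KillSet;
+  //   collectIDsForRegs(KillSet, Regs, OpenRanges.getVarLocs(), VarLocIDs);
+  //   OpenRanges.erase(KillSet, VarLocIDs, LocIndex::kUniversalLocation);
+  // The per-location buckets keep the lookup cheap, while the erase is keyed
+  // on the universal-location indices that collectIDsForRegs collects.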
+
+ /// Collect all VarLoc IDs from \p CollectFrom for VarLocs with MLs of kind
+ /// RegisterKind which are located in any reg in \p Regs. The IDs for each
+ /// VarLoc correspond to entries in the universal location bucket, which every
+ /// VarLoc has exactly 1 entry for. Insert collected IDs into \p Collected.
+ static void collectIDsForRegs(VarLocsInRange &Collected,
+ const DefinedRegsSet &Regs,
+ const VarLocSet &CollectFrom,
+ const VarLocMap &VarLocIDs);
+
+ VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB, VarLocInMBB &Locs) {
+ std::unique_ptr<VarLocSet> &VLS = Locs[MBB];
+ if (!VLS)
+ VLS = std::make_unique<VarLocSet>(Alloc);
+ return *VLS;
+ }
+
+ const VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB,
+ const VarLocInMBB &Locs) const {
+ auto It = Locs.find(MBB);
+ assert(It != Locs.end() && "MBB not in map");
+ return *It->second;
+ }
+
+ /// Tests whether this instruction is a spill to a stack location.
+ bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF);
+
+ /// Decide if @MI is a spill instruction and return true if it is. We use 2
+ /// criteria to make this decision:
+ /// - Is this instruction a store to a spill slot?
+ /// - Is there a register operand that is both used and killed?
+ /// TODO: Store optimization can fold spills into other stores (including
+ /// other spills). We do not handle this yet (more than one memory operand).
+ bool isLocationSpill(const MachineInstr &MI, MachineFunction *MF,
+ Register &Reg);
+
+ /// Returns true if the given machine instruction is a debug value which we
+ /// can emit entry values for.
+ ///
+ /// Currently, we generate debug entry values only for parameters that are
+ /// unmodified throughout the function and located in a register.
+ bool isEntryValueCandidate(const MachineInstr &MI,
+ const DefinedRegsSet &Regs) const;
+
+  /// If a given instruction is identified as a restore from a spill slot,
+  /// return the spill location and set \p Reg to the restored register.
+ std::optional<VarLoc::SpillLoc> isRestoreInstruction(const MachineInstr &MI,
+ MachineFunction *MF,
+ Register &Reg);
+ /// Given a spill instruction, extract the register and offset used to
+ /// address the spill location in a target independent way.
+ VarLoc::SpillLoc extractSpillBaseRegAndOffset(const MachineInstr &MI);
+ void insertTransferDebugPair(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ TransferMap &Transfers, VarLocMap &VarLocIDs,
+ LocIndex OldVarID, TransferKind Kind,
+ const VarLoc::MachineLoc &OldLoc,
+ Register NewReg = Register());
+
+ void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs);
+ void transferSpillOrRestoreInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, TransferMap &Transfers);
+ void cleanupEntryValueTransfers(const MachineInstr *MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, const VarLoc &EntryVL,
+ InstToEntryLocMap &EntryValTransfers);
+ void removeEntryValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, const VarLoc &EntryVL,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs);
+ void emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ VarLocsInRange &KillSet);
+ void recordEntryValue(const MachineInstr &MI,
+ const DefinedRegsSet &DefinedRegs,
+ OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs);
+ void transferRegisterCopy(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, TransferMap &Transfers);
+ void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs);
+ void transferWasmDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs);
+ bool transferTerminator(MachineBasicBlock *MBB, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
+
+ void process(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, TransferMap &Transfers,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs);
+
+ void accumulateFragmentMap(MachineInstr &MI, VarToFragments &SeenFragments,
+ OverlapMap &OLapMap);
+
+ bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
+ const VarLocMap &VarLocIDs,
+ SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
+ SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks);
+
+ /// Create DBG_VALUE insts for inlocs that have been propagated but
+ /// had their instruction creation deferred.
+ void flushPendingLocs(VarLocInMBB &PendingInLocs, VarLocMap &VarLocIDs);
+
+ bool ExtendRanges(MachineFunction &MF, MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC, unsigned InputBBLimit,
+ unsigned InputDbgValLimit) override;
+
+public:
+ /// Default construct and initialize the pass.
+ VarLocBasedLDV();
+
+ ~VarLocBasedLDV();
+
+ /// Print to ostream with a message.
+ void printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V,
+ const VarLocMap &VarLocIDs, const char *msg,
+ raw_ostream &Out) const;
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+VarLocBasedLDV::VarLocBasedLDV() = default;
+
+VarLocBasedLDV::~VarLocBasedLDV() = default;
+
+/// Erase a variable from the set of open ranges, and additionally erase any
+/// fragments that may overlap it. If the VarLoc is a backup location, erase
+/// the variable from the EntryValuesBackupVars set, indicating we should stop
+/// tracking its backup entry location. Otherwise, if the VarLoc is a primary
+/// location, erase the variable from the Vars set.
+void VarLocBasedLDV::OpenRangesSet::erase(const VarLoc &VL) {
+ // Erasure helper.
+ auto DoErase = [&VL, this](DebugVariable VarToErase) {
+ auto *EraseFrom = VL.isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars;
+ auto It = EraseFrom->find(VarToErase);
+ if (It != EraseFrom->end()) {
+ LocIndices IDs = It->second;
+ for (LocIndex ID : IDs)
+ VarLocs.reset(ID.getAsRawInteger());
+ EraseFrom->erase(It);
+ }
+ };
+
+ DebugVariable Var = VL.Var;
+
+ // Erase the variable/fragment that ends here.
+ DoErase(Var);
+
+ // Extract the fragment. Interpret an empty fragment as one that covers all
+ // possible bits.
+ FragmentInfo ThisFragment = Var.getFragmentOrDefault();
+
+ // There may be fragments that overlap the designated fragment. Look them up
+ // in the pre-computed overlap map, and erase them too.
+ auto MapIt = OverlappingFragments.find({Var.getVariable(), ThisFragment});
+ if (MapIt != OverlappingFragments.end()) {
+ for (auto Fragment : MapIt->second) {
+ VarLocBasedLDV::OptFragmentInfo FragmentHolder;
+ if (!DebugVariable::isDefaultFragment(Fragment))
+ FragmentHolder = VarLocBasedLDV::OptFragmentInfo(Fragment);
+ DoErase({Var.getVariable(), FragmentHolder, Var.getInlinedAt()});
+ }
+ }
+}
+
+void VarLocBasedLDV::OpenRangesSet::erase(const VarLocsInRange &KillSet,
+ const VarLocMap &VarLocIDs,
+ LocIndex::u32_location_t Location) {
+ VarLocSet RemoveSet(Alloc);
+ for (LocIndex::u32_index_t ID : KillSet) {
+ const VarLoc &VL = VarLocIDs[LocIndex(Location, ID)];
+ auto *EraseFrom = VL.isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars;
+ EraseFrom->erase(VL.Var);
+ LocIndices VLI = VarLocIDs.getAllIndices(VL);
+ for (LocIndex ID : VLI)
+ RemoveSet.set(ID.getAsRawInteger());
+ }
+ VarLocs.intersectWithComplement(RemoveSet);
+}
+
+void VarLocBasedLDV::OpenRangesSet::insertFromLocSet(const VarLocSet &ToLoad,
+ const VarLocMap &Map) {
+ VarLocsInRange UniqueVarLocIDs;
+ DefinedRegsSet Regs;
+ Regs.insert(LocIndex::kUniversalLocation);
+ collectIDsForRegs(UniqueVarLocIDs, Regs, ToLoad, Map);
+ for (uint64_t ID : UniqueVarLocIDs) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VarL = Map[Idx];
+ const LocIndices Indices = Map.getAllIndices(VarL);
+ insert(Indices, VarL);
+ }
+}
+
+void VarLocBasedLDV::OpenRangesSet::insert(LocIndices VarLocIDs,
+ const VarLoc &VL) {
+ auto *InsertInto = VL.isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars;
+ for (LocIndex ID : VarLocIDs)
+ VarLocs.set(ID.getAsRawInteger());
+ InsertInto->insert({VL.Var, VarLocIDs});
+}
+
+/// Return the Loc ID of an entry value backup location, if it exists for the
+/// variable.
+std::optional<LocIndices>
+VarLocBasedLDV::OpenRangesSet::getEntryValueBackup(DebugVariable Var) {
+ auto It = EntryValuesBackupVars.find(Var);
+ if (It != EntryValuesBackupVars.end())
+ return It->second;
+
+ return std::nullopt;
+}
+
+void VarLocBasedLDV::collectIDsForRegs(VarLocsInRange &Collected,
+ const DefinedRegsSet &Regs,
+ const VarLocSet &CollectFrom,
+ const VarLocMap &VarLocIDs) {
+ assert(!Regs.empty() && "Nothing to collect");
+ SmallVector<Register, 32> SortedRegs;
+ append_range(SortedRegs, Regs);
+ array_pod_sort(SortedRegs.begin(), SortedRegs.end());
+ auto It = CollectFrom.find(LocIndex::rawIndexForReg(SortedRegs.front()));
+ auto End = CollectFrom.end();
+ for (Register Reg : SortedRegs) {
+ // The half-open interval [FirstIndexForReg, FirstInvalidIndex) contains
+ // all possible VarLoc IDs for VarLocs with MLs of kind RegisterKind which
+ // live in Reg.
+ uint64_t FirstIndexForReg = LocIndex::rawIndexForReg(Reg);
+ uint64_t FirstInvalidIndex = LocIndex::rawIndexForReg(Reg + 1);
+ It.advanceToLowerBound(FirstIndexForReg);
+
+ // Iterate through that half-open interval and collect all the set IDs.
+ for (; It != End && *It < FirstInvalidIndex; ++It) {
+ LocIndex ItIdx = LocIndex::fromRawInteger(*It);
+ const VarLoc &VL = VarLocIDs[ItIdx];
+ LocIndices LI = VarLocIDs.getAllIndices(VL);
+ // For now, the back index is always the universal location index.
+ assert(LI.back().Location == LocIndex::kUniversalLocation &&
+ "Unexpected order of LocIndices for VarLoc; was it inserted into "
+ "the VarLocMap correctly?");
+ Collected.insert(LI.back().Index);
+ }
+
+ if (It == End)
+ return;
+ }
+}
+
+void VarLocBasedLDV::getUsedRegs(const VarLocSet &CollectFrom,
+ SmallVectorImpl<Register> &UsedRegs) const {
+ // All register-based VarLocs are assigned indices greater than or equal to
+ // FirstRegIndex.
+ uint64_t FirstRegIndex =
+ LocIndex::rawIndexForReg(LocIndex::kFirstRegLocation);
+ uint64_t FirstInvalidIndex =
+ LocIndex::rawIndexForReg(LocIndex::kFirstInvalidRegLocation);
+ for (auto It = CollectFrom.find(FirstRegIndex),
+ End = CollectFrom.find(FirstInvalidIndex);
+ It != End;) {
+ // We found a VarLoc ID for a VarLoc that lives in a register. Figure out
+ // which register and add it to UsedRegs.
+ uint32_t FoundReg = LocIndex::fromRawInteger(*It).Location;
+ assert((UsedRegs.empty() || FoundReg != UsedRegs.back()) &&
+ "Duplicate used reg");
+ UsedRegs.push_back(FoundReg);
+
+ // Skip to the next /set/ register. Note that this finds a lower bound, so
+ // even if there aren't any VarLocs living in `FoundReg+1`, we're still
+ // guaranteed to move on to the next register (or to end()).
+ uint64_t NextRegIndex = LocIndex::rawIndexForReg(FoundReg + 1);
+ It.advanceToLowerBound(NextRegIndex);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Debug Range Extension Implementation
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+void VarLocBasedLDV::printVarLocInMBB(const MachineFunction &MF,
+ const VarLocInMBB &V,
+ const VarLocMap &VarLocIDs,
+ const char *msg,
+ raw_ostream &Out) const {
+ Out << '\n' << msg << '\n';
+ for (const MachineBasicBlock &BB : MF) {
+ if (!V.count(&BB))
+ continue;
+ const VarLocSet &L = getVarLocsInMBB(&BB, V);
+ if (L.empty())
+ continue;
+ SmallVector<VarLoc, 32> VarLocs;
+ collectAllVarLocs(VarLocs, L, VarLocIDs);
+ Out << "MBB: " << BB.getNumber() << ":\n";
+ for (const VarLoc &VL : VarLocs) {
+ Out << " Var: " << VL.Var.getVariable()->getName();
+ Out << " MI: ";
+ VL.dump(TRI, TII, Out);
+ }
+ }
+ Out << "\n";
+}
+#endif
+
+VarLocBasedLDV::VarLoc::SpillLoc
+VarLocBasedLDV::extractSpillBaseRegAndOffset(const MachineInstr &MI) {
+ assert(MI.hasOneMemOperand() &&
+ "Spill instruction does not have exactly one memory operand?");
+ auto MMOI = MI.memoperands_begin();
+ const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue();
+ assert(PVal->kind() == PseudoSourceValue::FixedStack &&
+ "Inconsistent memory operand in spill instruction");
+ int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
+ const MachineBasicBlock *MBB = MI.getParent();
+ Register Reg;
+ StackOffset Offset = TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg);
+ return {Reg, Offset};
+}
+
+/// Clean up \p EntryValTransfers for \p TRInst by removing the transfer that
+/// uses the to-be-deleted \p EntryVL.
+void VarLocBasedLDV::cleanupEntryValueTransfers(
+ const MachineInstr *TRInst, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs,
+ const VarLoc &EntryVL, InstToEntryLocMap &EntryValTransfers) {
+ if (EntryValTransfers.empty() || TRInst == nullptr)
+ return;
+
+ auto TransRange = EntryValTransfers.equal_range(TRInst);
+ for (auto &TDPair : llvm::make_range(TransRange.first, TransRange.second)) {
+ const VarLoc &EmittedEV = VarLocIDs[TDPair.second];
+ if (std::tie(EntryVL.Var, EntryVL.Locs[0].Value.RegNo, EntryVL.Expr) ==
+ std::tie(EmittedEV.Var, EmittedEV.Locs[0].Value.RegNo,
+ EmittedEV.Expr)) {
+ OpenRanges.erase(EmittedEV);
+ EntryValTransfers.erase(TRInst);
+ break;
+ }
+ }
+}
+
+/// Try to salvage the debug entry value if we encounter a new debug value
+/// describing the same parameter; otherwise stop tracking the value and clean
+/// up any Entry Value Transfers that were already emitted for it.
+void VarLocBasedLDV::removeEntryValue(const MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ const VarLoc &EntryVL,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs) {
+ // Skip the DBG_VALUE which is the debug entry value itself.
+ if (&MI == &EntryVL.MI)
+ return;
+
+  // If the parameter's location is not a register location, we can no longer
+  // track the entry value. There is no TransferInst defining the register, so
+  // no Entry Value Transfers have been emitted yet.
+ if (!MI.getDebugOperand(0).isReg())
+ return;
+
+ // Try to get non-debug instruction responsible for the DBG_VALUE.
+ const MachineInstr *TransferInst = nullptr;
+ Register Reg = MI.getDebugOperand(0).getReg();
+ if (Reg.isValid() && RegSetInstrs.contains(Reg))
+ TransferInst = RegSetInstrs.find(Reg)->second;
+
+ // Case of the parameter's DBG_VALUE at the start of entry MBB.
+ if (!TransferInst && !LastNonDbgMI && MI.getParent()->isEntryBlock())
+ return;
+
+ // If the debug expression from the DBG_VALUE is not empty, we can assume the
+ // parameter's value has changed indicating that we should stop tracking its
+ // entry value as well.
+ if (MI.getDebugExpression()->getNumElements() == 0 && TransferInst) {
+ // If the DBG_VALUE comes from a copy instruction that copies the entry
+ // value, it means the parameter's value has not changed and we should be
+ // able to use its entry value.
+    // TODO: Try to keep tracking an entry value if we encounter a propagated
+ // DBG_VALUE describing the copy of the entry value. (Propagated entry value
+ // does not indicate the parameter modification.)
+ auto DestSrc = TII->isCopyInstr(*TransferInst);
+ if (DestSrc) {
+ const MachineOperand *SrcRegOp, *DestRegOp;
+ SrcRegOp = DestSrc->Source;
+ DestRegOp = DestSrc->Destination;
+ if (Reg == DestRegOp->getReg()) {
+ for (uint64_t ID : OpenRanges.getEntryValueBackupVarLocs()) {
+ const VarLoc &VL = VarLocIDs[LocIndex::fromRawInteger(ID)];
+ if (VL.isEntryValueCopyBackupReg(Reg) &&
+ // Entry Values should not be variadic.
+ VL.MI.getDebugOperand(0).getReg() == SrcRegOp->getReg())
+ return;
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Deleting a DBG entry value because of: ";
+ MI.print(dbgs(), /*IsStandalone*/ false,
+ /*SkipOpers*/ false, /*SkipDebugLoc*/ false,
+ /*AddNewLine*/ true, TII));
+ cleanupEntryValueTransfers(TransferInst, OpenRanges, VarLocIDs, EntryVL,
+ EntryValTransfers);
+ OpenRanges.erase(EntryVL);
+}
+
+/// End all previous ranges related to @MI and start a new range from @MI
+/// if it is a DBG_VALUE instr.
+void VarLocBasedLDV::transferDebugValue(const MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs) {
+ if (!MI.isDebugValue())
+ return;
+ const DILocalVariable *Var = MI.getDebugVariable();
+ const DIExpression *Expr = MI.getDebugExpression();
+ const DILocation *DebugLoc = MI.getDebugLoc();
+ const DILocation *InlinedAt = DebugLoc->getInlinedAt();
+ assert(Var->isValidLocationForIntrinsic(DebugLoc) &&
+ "Expected inlined-at fields to agree");
+
+ DebugVariable V(Var, Expr, InlinedAt);
+
+ // Check if this DBG_VALUE indicates a parameter's value changing.
+ // If that is the case, we should stop tracking its entry value.
+ auto EntryValBackupID = OpenRanges.getEntryValueBackup(V);
+ if (Var->isParameter() && EntryValBackupID) {
+ const VarLoc &EntryVL = VarLocIDs[EntryValBackupID->back()];
+ removeEntryValue(MI, OpenRanges, VarLocIDs, EntryVL, EntryValTransfers,
+ RegSetInstrs);
+ }
+
+ if (all_of(MI.debug_operands(), [](const MachineOperand &MO) {
+ return (MO.isReg() && MO.getReg()) || MO.isImm() || MO.isFPImm() ||
+ MO.isCImm() || MO.isTargetIndex();
+ })) {
+ // Use normal VarLoc constructor for registers and immediates.
+ VarLoc VL(MI);
+ // End all previous ranges of VL.Var.
+ OpenRanges.erase(VL);
+
+ LocIndices IDs = VarLocIDs.insert(VL);
+ // Add the VarLoc to OpenRanges from this DBG_VALUE.
+ OpenRanges.insert(IDs, VL);
+ } else if (MI.memoperands().size() > 0) {
+ llvm_unreachable("DBG_VALUE with mem operand encountered after regalloc?");
+ } else {
+ // This must be an undefined location. If it has an open range, erase it.
+ assert(MI.isUndefDebugValue() &&
+ "Unexpected non-undef DBG_VALUE encountered");
+ VarLoc VL(MI);
+ OpenRanges.erase(VL);
+ }
+}
+
+// This should be removed later; it doesn't fit the new design.
+void VarLocBasedLDV::collectAllVarLocs(SmallVectorImpl<VarLoc> &Collected,
+ const VarLocSet &CollectFrom,
+ const VarLocMap &VarLocIDs) {
+  // The half-open interval [FirstIndex, FirstInvalidIndex) covers the
+  // universal location bucket, which contains every VarLoc exactly once.
+ uint64_t FirstIndex = LocIndex::rawIndexForReg(LocIndex::kUniversalLocation);
+ uint64_t FirstInvalidIndex =
+ LocIndex::rawIndexForReg(LocIndex::kUniversalLocation + 1);
+ // Iterate through that half-open interval and collect all the set IDs.
+ for (auto It = CollectFrom.find(FirstIndex), End = CollectFrom.end();
+ It != End && *It < FirstInvalidIndex; ++It) {
+ LocIndex RegIdx = LocIndex::fromRawInteger(*It);
+ Collected.push_back(VarLocIDs[RegIdx]);
+ }
+}
+
+/// Turn the entry value backup locations into primary locations.
+void VarLocBasedLDV::emitEntryValues(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ VarLocsInRange &KillSet) {
+ // Do not insert entry value locations after a terminator.
+ if (MI.isTerminator())
+ return;
+
+ for (uint32_t ID : KillSet) {
+ // The KillSet IDs are indices for the universal location bucket.
+ LocIndex Idx = LocIndex(LocIndex::kUniversalLocation, ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ if (!VL.Var.getVariable()->isParameter())
+ continue;
+
+ auto DebugVar = VL.Var;
+ std::optional<LocIndices> EntryValBackupIDs =
+ OpenRanges.getEntryValueBackup(DebugVar);
+
+ // If the parameter has the entry value backup, it means we should
+ // be able to use its entry value.
+ if (!EntryValBackupIDs)
+ continue;
+
+ const VarLoc &EntryVL = VarLocIDs[EntryValBackupIDs->back()];
+ VarLoc EntryLoc = VarLoc::CreateEntryLoc(EntryVL.MI, EntryVL.Expr,
+ EntryVL.Locs[0].Value.RegNo);
+ LocIndices EntryValueIDs = VarLocIDs.insert(EntryLoc);
+ assert(EntryValueIDs.size() == 1 &&
+ "EntryValue loc should not be variadic");
+ EntryValTransfers.insert({&MI, EntryValueIDs.back()});
+ OpenRanges.insert(EntryValueIDs, EntryLoc);
+ }
+}
+
+/// Create a new TransferDebugPair and insert it in \p Transfers. The VarLoc
+/// with \p OldVarID is deleted from \p OpenRanges and replaced with the new
+/// VarLoc. If \p NewReg is different from the default zero value, the new
+/// location is a register location created by a copy-like instruction;
+/// otherwise it is the variable's location on the stack.
+void VarLocBasedLDV::insertTransferDebugPair(
+ MachineInstr &MI, OpenRangesSet &OpenRanges, TransferMap &Transfers,
+ VarLocMap &VarLocIDs, LocIndex OldVarID, TransferKind Kind,
+ const VarLoc::MachineLoc &OldLoc, Register NewReg) {
+ const VarLoc &OldVarLoc = VarLocIDs[OldVarID];
+
+ auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &VarLocIDs](VarLoc &VL) {
+ LocIndices LocIds = VarLocIDs.insert(VL);
+
+ // Close this variable's previous location range.
+ OpenRanges.erase(VL);
+
+ // Record the new location as an open range, and a postponed transfer
+ // inserting a DBG_VALUE for this location.
+ OpenRanges.insert(LocIds, VL);
+ assert(!MI.isTerminator() && "Cannot insert DBG_VALUE after terminator");
+ TransferDebugPair MIP = {&MI, LocIds.back()};
+ Transfers.push_back(MIP);
+ };
+
+ // End all previous ranges of VL.Var.
+ OpenRanges.erase(VarLocIDs[OldVarID]);
+ switch (Kind) {
+ case TransferKind::TransferCopy: {
+ assert(NewReg &&
+ "No register supplied when handling a copy of a debug value");
+ // Create a DBG_VALUE instruction to describe the Var in its new
+ // register location.
+ VarLoc VL = VarLoc::CreateCopyLoc(OldVarLoc, OldLoc, NewReg);
+ ProcessVarLoc(VL);
+ LLVM_DEBUG({
+ dbgs() << "Creating VarLoc for register copy:";
+ VL.dump(TRI, TII);
+ });
+ return;
+ }
+ case TransferKind::TransferSpill: {
+ // Create a DBG_VALUE instruction to describe the Var in its spilled
+ // location.
+ VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI);
+ VarLoc VL = VarLoc::CreateSpillLoc(
+ OldVarLoc, OldLoc, SpillLocation.SpillBase, SpillLocation.SpillOffset);
+ ProcessVarLoc(VL);
+ LLVM_DEBUG({
+ dbgs() << "Creating VarLoc for spill:";
+ VL.dump(TRI, TII);
+ });
+ return;
+ }
+ case TransferKind::TransferRestore: {
+ assert(NewReg &&
+ "No register supplied when handling a restore of a debug value");
+ // DebugInstr refers to the pre-spill location, therefore we can reuse
+ // its expression.
+ VarLoc VL = VarLoc::CreateCopyLoc(OldVarLoc, OldLoc, NewReg);
+ ProcessVarLoc(VL);
+ LLVM_DEBUG({
+ dbgs() << "Creating VarLoc for restore:";
+ VL.dump(TRI, TII);
+ });
+ return;
+ }
+ }
+ llvm_unreachable("Invalid transfer kind");
+}
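+// Illustrative aside: transfers are recorded rather than materialized right
+// away, because inserting instructions while walking a block would invalidate
+// the traversal; the DBG_VALUEs are built from the recorded pairs afterwards.
+// A self-contained sketch of that deferred-emission pattern (hypothetical
+// names, not the LLVM API):
+//
+//   #include <utility>
+//   #include <vector>
+//
+//   struct Instr {};
+//   struct NewLoc {};
+//   std::vector<std::pair<Instr *, NewLoc>> PendingTransfers;
+//
+//   // During the scan: only record what to emit, never mutate the block.
+//   void recordTransfer(Instr *After, NewLoc L) {
+//     PendingTransfers.emplace_back(After, L);
+//   }
+//
+//   // After the scan: materialize everything in one pass.
+//   template <typename EmitFn> void flushTransfers(EmitFn Emit) {
+//     for (auto &P : PendingTransfers)
+//       Emit(P.first, P.second); // e.g. insert a DBG_VALUE right after P.first
+//     PendingTransfers.clear();
+//   }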
+
+/// A definition of a register may mark the end of a range.
+void VarLocBasedLDV::transferRegisterDef(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs) {
+
+ // Meta Instructions do not affect the debug liveness of any register they
+ // define.
+ if (MI.isMetaInstruction())
+ return;
+
+ MachineFunction *MF = MI.getMF();
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ Register SP = TLI->getStackPointerRegisterToSaveRestore();
+
+ // Find the regs killed by MI, and find regmasks of preserved regs.
+ DefinedRegsSet DeadRegs;
+ SmallVector<const uint32_t *, 4> RegMasks;
+ for (const MachineOperand &MO : MI.operands()) {
+ // Determine whether the operand is a register def.
+ if (MO.isReg() && MO.isDef() && MO.getReg() && MO.getReg().isPhysical() &&
+ !(MI.isCall() && MO.getReg() == SP)) {
+ // Remove ranges of all aliased registers.
+ for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
+ // FIXME: Can we break out of this loop early if no insertion occurs?
+ DeadRegs.insert(*RAI);
+ RegSetInstrs.erase(MO.getReg());
+ RegSetInstrs.insert({MO.getReg(), &MI});
+ } else if (MO.isRegMask()) {
+ RegMasks.push_back(MO.getRegMask());
+ }
+ }
+
+ // Erase VarLocs which reside in one of the dead registers. For performance
+ // reasons, it's critical to not iterate over the full set of open VarLocs.
+ // Iterate over the set of dying/used regs instead.
+ if (!RegMasks.empty()) {
+ SmallVector<Register, 32> UsedRegs;
+ getUsedRegs(OpenRanges.getVarLocs(), UsedRegs);
+ for (Register Reg : UsedRegs) {
+ // Remove ranges of all clobbered registers. Register masks don't usually
+ // list SP as preserved. Assume that call instructions never clobber SP,
+ // because some backends (e.g., AArch64) never list SP in the regmask.
+ // While the debug info may be off for an instruction or two around
+ // callee-cleanup calls, transferring the DEBUG_VALUE across the call is
+ // still a better user experience.
+ if (Reg == SP)
+ continue;
+ bool AnyRegMaskKillsReg =
+ any_of(RegMasks, [Reg](const uint32_t *RegMask) {
+ return MachineOperand::clobbersPhysReg(RegMask, Reg);
+ });
+ if (AnyRegMaskKillsReg) {
+ DeadRegs.insert(Reg);
+ RegSetInstrs.erase(Reg);
+ RegSetInstrs.insert({Reg, &MI});
+ }
+ }
+ }
+
+ if (DeadRegs.empty())
+ return;
+
+ VarLocsInRange KillSet;
+ collectIDsForRegs(KillSet, DeadRegs, OpenRanges.getVarLocs(), VarLocIDs);
+ OpenRanges.erase(KillSet, VarLocIDs, LocIndex::kUniversalLocation);
+
+ if (TPC) {
+ auto &TM = TPC->getTM<TargetMachine>();
+ if (TM.Options.ShouldEmitDebugEntryValues())
+ emitEntryValues(MI, OpenRanges, VarLocIDs, EntryValTransfers, KillSet);
+ }
+}
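+// Illustrative aside: the regmask test above relies on the convention that a
+// register mask is a bit vector with one bit per physical register, where a
+// set bit marks the register as preserved across the instruction. Under that
+// assumption, the clobber check reduces to (sketch, not the LLVM source):
+//
+//   #include <cstdint>
+//
+//   // True if PhysReg is clobbered by the call described by RegMask
+//   // (bit set => preserved, bit clear => clobbered).
+//   bool clobbersPhysRegSketch(const uint32_t *RegMask, unsigned PhysReg) {
+//     return !(RegMask[PhysReg / 32] & (1u << (PhysReg % 32)));
+//   }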
+
+void VarLocBasedLDV::transferWasmDef(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs) {
+ // If this is not a Wasm local.set or local.tee, which sets local values,
+ // return.
+ int Index;
+ int64_t Offset;
+ if (!TII->isExplicitTargetIndexDef(MI, Index, Offset))
+ return;
+
+ // Find the target indices killed by MI, and delete those variable locations
+ // from the open range.
+ VarLocsInRange KillSet;
+ VarLoc::WasmLoc Loc{Index, Offset};
+ for (uint64_t ID : OpenRanges.getWasmVarLocs()) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ assert(VL.containsWasmLocs() && "Broken VarLocSet?");
+ if (VL.usesWasmLoc(Loc))
+ KillSet.insert(ID);
+ }
+ OpenRanges.erase(KillSet, VarLocIDs, LocIndex::kWasmLocation);
+}
+
+bool VarLocBasedLDV::isSpillInstruction(const MachineInstr &MI,
+ MachineFunction *MF) {
+ // TODO: Handle multiple stores folded into one.
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ if (!MI.getSpillSize(TII) && !MI.getFoldedSpillSize(TII))
+ return false; // This is not a spill instruction, since no valid size was
+ // returned from either function.
+
+ return true;
+}
+
+bool VarLocBasedLDV::isLocationSpill(const MachineInstr &MI,
+ MachineFunction *MF, Register &Reg) {
+ if (!isSpillInstruction(MI, MF))
+ return false;
+
+ auto isKilledReg = [&](const MachineOperand &MO, Register &Reg) {
+ if (!MO.isReg() || !MO.isUse()) {
+ Reg = 0;
+ return false;
+ }
+ Reg = MO.getReg();
+ return MO.isKill();
+ };
+
+ for (const MachineOperand &MO : MI.operands()) {
+ // In a spill instruction generated by the InlineSpiller the spilled
+ // register has its kill flag set.
+ if (isKilledReg(MO, Reg))
+ return true;
+ if (Reg != 0) {
+ // Check whether next instruction kills the spilled register.
+ // FIXME: Current solution does not cover search for killed register in
+ // bundles and instructions further down the chain.
+ auto NextI = std::next(MI.getIterator());
+ // Skip next instruction that points to basic block end iterator.
+ if (MI.getParent()->end() == NextI)
+ continue;
+ Register RegNext;
+ for (const MachineOperand &MONext : NextI->operands()) {
+ // Return true if we came across the register from the
+ // previous spill instruction that is killed in NextI.
+ if (isKilledReg(MONext, RegNext) && RegNext == Reg)
+ return true;
+ }
+ }
+ }
+ // Return false if we didn't find the spilled register.
+ return false;
+}
+
+std::optional<VarLocBasedLDV::VarLoc::SpillLoc>
+VarLocBasedLDV::isRestoreInstruction(const MachineInstr &MI,
+ MachineFunction *MF, Register &Reg) {
+ if (!MI.hasOneMemOperand())
+ return std::nullopt;
+
+ // FIXME: Handle folded restore instructions with more than one memory
+ // operand.
+ if (MI.getRestoreSize(TII)) {
+ Reg = MI.getOperand(0).getReg();
+ return extractSpillBaseRegAndOffset(MI);
+ }
+ return std::nullopt;
+}
+
+/// A spilled register may indicate that we have to end the current range of
+/// a variable and create a new one for the spill location.
+/// A restored register may indicate the reverse situation.
+/// We don't want to insert any instructions in process(), so we just create
+/// the DBG_VALUE without inserting it and keep track of it in \p Transfers.
+/// It will be inserted into the BB when we're done iterating over the
+/// instructions.
+void VarLocBasedLDV::transferSpillOrRestoreInst(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ TransferMap &Transfers) {
+ MachineFunction *MF = MI.getMF();
+ TransferKind TKind;
+ Register Reg;
+ std::optional<VarLoc::SpillLoc> Loc;
+
+ LLVM_DEBUG(dbgs() << "Examining instruction: "; MI.dump(););
+
+ // First, if there are any DBG_VALUEs pointing at a spill slot that is
+ // written to, then close the variable location. The value in memory
+ // will have changed.
+ VarLocsInRange KillSet;
+ if (isSpillInstruction(MI, MF)) {
+ Loc = extractSpillBaseRegAndOffset(MI);
+ for (uint64_t ID : OpenRanges.getSpillVarLocs()) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ assert(VL.containsSpillLocs() && "Broken VarLocSet?");
+ if (VL.usesSpillLoc(*Loc)) {
+ // This location is overwritten by the current instruction -- terminate
+ // the open range, and insert an explicit DBG_VALUE $noreg.
+ //
+ // Doing this at a later stage would require re-interpreting all
+ // DBG_VALUEs and DIExpressions to identify whether they point at
+ // memory, and then analysing all memory writes to see if they
+ // overwrite that memory, which is expensive.
+ //
+ // At this stage, we already know which DBG_VALUEs are for spills and
+ // where they are located; it's best to handle overwrites now.
+ KillSet.insert(ID);
+ unsigned SpillLocIdx = VL.getSpillLocIdx(*Loc);
+ VarLoc::MachineLoc OldLoc = VL.Locs[SpillLocIdx];
+ VarLoc UndefVL = VarLoc::CreateCopyLoc(VL, OldLoc, 0);
+ LocIndices UndefLocIDs = VarLocIDs.insert(UndefVL);
+ Transfers.push_back({&MI, UndefLocIDs.back()});
+ }
+ }
+ OpenRanges.erase(KillSet, VarLocIDs, LocIndex::kSpillLocation);
+ }
+
+ // Try to recognise spill and restore instructions that may create a new
+ // variable location.
+ if (isLocationSpill(MI, MF, Reg)) {
+ TKind = TransferKind::TransferSpill;
+ LLVM_DEBUG(dbgs() << "Recognized as spill: "; MI.dump(););
+ LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI)
+ << "\n");
+ } else {
+ if (!(Loc = isRestoreInstruction(MI, MF, Reg)))
+ return;
+ TKind = TransferKind::TransferRestore;
+ LLVM_DEBUG(dbgs() << "Recognized as restore: "; MI.dump(););
+ LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI)
+ << "\n");
+ }
+ // Check if the register or spill location is the location of a debug value.
+ auto TransferCandidates = OpenRanges.getEmptyVarLocRange();
+ if (TKind == TransferKind::TransferSpill)
+ TransferCandidates = OpenRanges.getRegisterVarLocs(Reg);
+ else if (TKind == TransferKind::TransferRestore)
+ TransferCandidates = OpenRanges.getSpillVarLocs();
+ for (uint64_t ID : TransferCandidates) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ unsigned LocIdx;
+ if (TKind == TransferKind::TransferSpill) {
+ assert(VL.usesReg(Reg) && "Broken VarLocSet?");
+ LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
+ << VL.Var.getVariable()->getName() << ")\n");
+ LocIdx = VL.getRegIdx(Reg);
+ } else {
+ assert(TKind == TransferKind::TransferRestore && VL.containsSpillLocs() &&
+ "Broken VarLocSet?");
+ if (!VL.usesSpillLoc(*Loc))
+ // The spill location is not the location of a debug value.
+ continue;
+ LLVM_DEBUG(dbgs() << "Restoring Register " << printReg(Reg, TRI) << '('
+ << VL.Var.getVariable()->getName() << ")\n");
+ LocIdx = VL.getSpillLocIdx(*Loc);
+ }
+ VarLoc::MachineLoc MLoc = VL.Locs[LocIdx];
+ insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, Idx, TKind,
+ MLoc, Reg);
+ // FIXME: A comment should explain why it's correct to return early here,
+ // if that is in fact correct.
+ return;
+ }
+}
+
+/// If \p MI is a register copy instruction that copies a previously tracked
+/// value from one register to another register that is callee saved, we
+/// create a new DBG_VALUE instruction describing the copy destination
+/// register.
+void VarLocBasedLDV::transferRegisterCopy(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ TransferMap &Transfers) {
+ auto DestSrc = TII->isCopyInstr(MI);
+ if (!DestSrc)
+ return;
+
+ const MachineOperand *DestRegOp = DestSrc->Destination;
+ const MachineOperand *SrcRegOp = DestSrc->Source;
+
+ if (!DestRegOp->isDef())
+ return;
+
+ auto isCalleeSavedReg = [&](Register Reg) {
+ for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
+ if (CalleeSavedRegs.test(*RAI))
+ return true;
+ return false;
+ };
+
+ Register SrcReg = SrcRegOp->getReg();
+ Register DestReg = DestRegOp->getReg();
+
+ // We want to recognize instructions where the destination register is a
+ // callee-saved register. If a register that could be clobbered by the call
+ // were used instead, there would be a good chance that it is clobbered soon.
+ // It is more likely that the previous register location, which is callee
+ // saved, stays unclobbered longer, even if it is killed.
+ if (!isCalleeSavedReg(DestReg))
+ return;
+
+ // Remember an entry value movement. If we encounter a new debug value of
+ // a parameter that only moves the value around, rather than modifying it,
+ // we are still able to use the entry value if needed.
+ if (isRegOtherThanSPAndFP(*DestRegOp, MI, TRI)) {
+ for (uint64_t ID : OpenRanges.getEntryValueBackupVarLocs()) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ if (VL.isEntryValueBackupReg(SrcReg)) {
+ LLVM_DEBUG(dbgs() << "Copy of the entry value: "; MI.dump(););
+ VarLoc EntryValLocCopyBackup =
+ VarLoc::CreateEntryCopyBackupLoc(VL.MI, VL.Expr, DestReg);
+ // Stop tracking the original entry value.
+ OpenRanges.erase(VL);
+
+ // Start tracking the entry value copy.
+ LocIndices EntryValCopyLocIDs = VarLocIDs.insert(EntryValLocCopyBackup);
+ OpenRanges.insert(EntryValCopyLocIDs, EntryValLocCopyBackup);
+ break;
+ }
+ }
+ }
+
+ if (!SrcRegOp->isKill())
+ return;
+
+ for (uint64_t ID : OpenRanges.getRegisterVarLocs(SrcReg)) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ assert(VarLocIDs[Idx].usesReg(SrcReg) && "Broken VarLocSet?");
+ VarLoc::MachineLocValue Loc;
+ Loc.RegNo = SrcReg;
+ VarLoc::MachineLoc MLoc{VarLoc::MachineLocKind::RegisterKind, Loc};
+ insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, Idx,
+ TransferKind::TransferCopy, MLoc, DestReg);
+ // FIXME: A comment should explain why it's correct to return early here,
+ // if that is in fact correct.
+ return;
+ }
+}
+
+/// Terminate all open ranges at the end of the current basic block.
+bool VarLocBasedLDV::transferTerminator(MachineBasicBlock *CurMBB,
+ OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs,
+ const VarLocMap &VarLocIDs) {
+ bool Changed = false;
+ LLVM_DEBUG({
+ VarVec VarLocs;
+ OpenRanges.getUniqueVarLocs(VarLocs, VarLocIDs);
+ for (VarLoc &VL : VarLocs) {
+ // Copy OpenRanges to OutLocs, if not already present.
+ dbgs() << "Add to OutLocs in MBB #" << CurMBB->getNumber() << ": ";
+ VL.dump(TRI, TII);
+ }
+ });
+ VarLocSet &VLS = getVarLocsInMBB(CurMBB, OutLocs);
+ Changed = VLS != OpenRanges.getVarLocs();
+ // New OutLocs set may be different due to spill, restore or register
+ // copy instruction processing.
+ if (Changed)
+ VLS = OpenRanges.getVarLocs();
+ OpenRanges.clear();
+ return Changed;
+}
+
+/// Accumulate a mapping between each DILocalVariable fragment and other
+/// fragments of that DILocalVariable which overlap. This reduces work during
+/// the data-flow stage from "Find any overlapping fragments" to "Check if the
+/// known-to-overlap fragments are present".
+/// \param MI A previously unprocessed DEBUG_VALUE instruction to analyze for
+/// fragment usage.
+/// \param SeenFragments Map from DILocalVariable to all fragments of that
+/// Variable which are known to exist.
+/// \param OverlappingFragments The overlap map being constructed, from one
+/// Var/Fragment pair to a vector of fragments known to overlap.
+void VarLocBasedLDV::accumulateFragmentMap(MachineInstr &MI,
+ VarToFragments &SeenFragments,
+ OverlapMap &OverlappingFragments) {
+ DebugVariable MIVar(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ FragmentInfo ThisFragment = MIVar.getFragmentOrDefault();
+
+ // If this is the first sighting of this variable, then we are guaranteed
+ // there are currently no overlapping fragments either. Initialize the set
+ // of seen fragments, record no overlaps for the current one, and return.
+ auto SeenIt = SeenFragments.find(MIVar.getVariable());
+ if (SeenIt == SeenFragments.end()) {
+ SmallSet<FragmentInfo, 4> OneFragment;
+ OneFragment.insert(ThisFragment);
+ SeenFragments.insert({MIVar.getVariable(), OneFragment});
+
+ OverlappingFragments.insert({{MIVar.getVariable(), ThisFragment}, {}});
+ return;
+ }
+
+ // If this particular Variable/Fragment pair already exists in the overlap
+ // map, it has already been accounted for.
+ auto IsInOLapMap =
+ OverlappingFragments.insert({{MIVar.getVariable(), ThisFragment}, {}});
+ if (!IsInOLapMap.second)
+ return;
+
+ auto &ThisFragmentsOverlaps = IsInOLapMap.first->second;
+ auto &AllSeenFragments = SeenIt->second;
+
+ // Otherwise, examine all other seen fragments for this variable, with "this"
+ // fragment being a previously unseen fragment. Record any pair of
+ // overlapping fragments.
+ for (const auto &ASeenFragment : AllSeenFragments) {
+ // Does this previously seen fragment overlap?
+ if (DIExpression::fragmentsOverlap(ThisFragment, ASeenFragment)) {
+ // Yes: Mark the current fragment as being overlapped.
+ ThisFragmentsOverlaps.push_back(ASeenFragment);
+ // Mark the previously seen fragment as being overlapped by the current
+ // one.
+ auto ASeenFragmentsOverlaps =
+ OverlappingFragments.find({MIVar.getVariable(), ASeenFragment});
+ assert(ASeenFragmentsOverlaps != OverlappingFragments.end() &&
+ "Previously seen var fragment has no vector of overlaps");
+ ASeenFragmentsOverlaps->second.push_back(ThisFragment);
+ }
+ }
+
+ AllSeenFragments.insert(ThisFragment);
+}
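+// Illustrative aside: two fragments of a variable overlap exactly when their
+// half-open bit ranges [OffsetInBits, OffsetInBits + SizeInBits) intersect;
+// the map built above just caches the answers to that interval test. Sketch
+// of the underlying check (plain C++, hypothetical struct):
+//
+//   #include <cstdint>
+//
+//   struct Frag {
+//     uint64_t OffsetInBits;
+//     uint64_t SizeInBits;
+//   };
+//
+//   // Half-open intervals overlap iff each one starts before the other ends.
+//   bool fragmentsOverlapSketch(const Frag &A, const Frag &B) {
+//     return A.OffsetInBits < B.OffsetInBits + B.SizeInBits &&
+//            B.OffsetInBits < A.OffsetInBits + A.SizeInBits;
+//   }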
+
+/// Update OpenRanges for a single instruction by applying the transfer
+/// functions for debug values, register defs, copies, spills and restores.
+void VarLocBasedLDV::process(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, TransferMap &Transfers,
+ InstToEntryLocMap &EntryValTransfers,
+ RegDefToInstMap &RegSetInstrs) {
+ if (!MI.isDebugInstr())
+ LastNonDbgMI = &MI;
+ transferDebugValue(MI, OpenRanges, VarLocIDs, EntryValTransfers,
+ RegSetInstrs);
+ transferRegisterDef(MI, OpenRanges, VarLocIDs, EntryValTransfers,
+ RegSetInstrs);
+ transferWasmDef(MI, OpenRanges, VarLocIDs);
+ transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers);
+ transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers);
+}
+
+/// This routine joins the analysis results of all incoming edges in \p MBB by
+/// inserting a new DBG_VALUE instruction at the start of \p MBB, if the same
+/// source variable resides in the same location in all predecessors of \p MBB.
+bool VarLocBasedLDV::join(
+ MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
+ const VarLocMap &VarLocIDs,
+ SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
+ SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks) {
+ LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
+
+ VarLocSet InLocsT(Alloc); // Temporary incoming locations.
+
+ // For all predecessors of this MBB, find the set of VarLocs that
+ // can be joined.
+ int NumVisited = 0;
+ for (auto *p : MBB.predecessors()) {
+ // Ignore backedges if we have not visited the predecessor yet. As the
+ // predecessor hasn't yet had locations propagated into it, most locations
+ // will not yet be valid, so treat them as all being uninitialized and
+ // potentially valid. If a location guessed to be correct here is
+ // invalidated later, we will remove it when we revisit this block.
+ if (!Visited.count(p)) {
+ LLVM_DEBUG(dbgs() << " ignoring unvisited pred MBB: " << p->getNumber()
+ << "\n");
+ continue;
+ }
+ auto OL = OutLocs.find(p);
+ // Join is null in case of empty OutLocs from any of the pred.
+ if (OL == OutLocs.end())
+ return false;
+
+ // Just copy over the Out locs to incoming locs for the first visited
+ // predecessor, and for all other predecessors join the Out locs.
+ VarLocSet &OutLocVLS = *OL->second;
+ if (!NumVisited)
+ InLocsT = OutLocVLS;
+ else
+ InLocsT &= OutLocVLS;
+
+ LLVM_DEBUG({
+ if (!InLocsT.empty()) {
+ VarVec VarLocs;
+ collectAllVarLocs(VarLocs, InLocsT, VarLocIDs);
+ for (const VarLoc &VL : VarLocs)
+ dbgs() << " gathered candidate incoming var: "
+ << VL.Var.getVariable()->getName() << "\n";
+ }
+ });
+
+ NumVisited++;
+ }
+
+ // Filter out DBG_VALUEs that are out of scope.
+ VarLocSet KillSet(Alloc);
+ bool IsArtificial = ArtificialBlocks.count(&MBB);
+ if (!IsArtificial) {
+ for (uint64_t ID : InLocsT) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ if (!VarLocIDs[Idx].dominates(LS, MBB)) {
+ KillSet.set(ID);
+ LLVM_DEBUG({
+ auto Name = VarLocIDs[Idx].Var.getVariable()->getName();
+ dbgs() << " killing " << Name << ", it doesn't dominate MBB\n";
+ });
+ }
+ }
+ }
+ InLocsT.intersectWithComplement(KillSet);
+
+ // As we are processing blocks in reverse post-order we
+ // should have processed at least one predecessor, unless it
+ // is the entry block which has no predecessor.
+ assert((NumVisited || MBB.pred_empty()) &&
+ "Should have processed at least one predecessor");
+
+ VarLocSet &ILS = getVarLocsInMBB(&MBB, InLocs);
+ bool Changed = false;
+ if (ILS != InLocsT) {
+ ILS = InLocsT;
+ Changed = true;
+ }
+
+ return Changed;
+}
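+// Illustrative aside: the join above is the standard "must" confluence of a
+// forward dataflow problem: copy the first visited predecessor's OUT set,
+// then intersect each remaining one. With ordinary sets instead of the sparse
+// bitvector used here, the same operation looks like (sketch, hypothetical
+// IDs):
+//
+//   #include <algorithm>
+//   #include <cstdint>
+//   #include <iterator>
+//   #include <set>
+//   #include <vector>
+//
+//   std::set<uint64_t>
+//   joinPredecessors(const std::vector<std::set<uint64_t>> &PredOuts) {
+//     if (PredOuts.empty())
+//       return {};
+//     std::set<uint64_t> In = PredOuts.front(); // first predecessor: copy
+//     for (size_t I = 1, E = PredOuts.size(); I != E; ++I) {
+//       std::set<uint64_t> Tmp;
+//       std::set_intersection(In.begin(), In.end(), PredOuts[I].begin(),
+//                             PredOuts[I].end(),
+//                             std::inserter(Tmp, Tmp.end()));
+//       In.swap(Tmp);
+//     }
+//     return In;
+//   }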
+
+void VarLocBasedLDV::flushPendingLocs(VarLocInMBB &PendingInLocs,
+ VarLocMap &VarLocIDs) {
+ // PendingInLocs records all locations propagated into blocks, which have
+ // not had DBG_VALUE insts created. Go through and create those insts now.
+ for (auto &Iter : PendingInLocs) {
+ // Map is keyed on a constant pointer, unwrap it so we can insert insts.
+ auto &MBB = const_cast<MachineBasicBlock &>(*Iter.first);
+ VarLocSet &Pending = *Iter.second;
+
+ SmallVector<VarLoc, 32> VarLocs;
+ collectAllVarLocs(VarLocs, Pending, VarLocIDs);
+
+ for (VarLoc DiffIt : VarLocs) {
+ // The ID location is live-in to MBB -- work out what kind of machine
+ // location it is and create a DBG_VALUE.
+ if (DiffIt.isEntryBackupLoc())
+ continue;
+ MachineInstr *MI = DiffIt.BuildDbgValue(*MBB.getParent());
+ MBB.insert(MBB.instr_begin(), MI);
+
+ (void)MI;
+ LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump(););
+ }
+ }
+}
+
+bool VarLocBasedLDV::isEntryValueCandidate(
+ const MachineInstr &MI, const DefinedRegsSet &DefinedRegs) const {
+ assert(MI.isDebugValue() && "This must be DBG_VALUE.");
+
+ // TODO: Add support for local variables that are expressed in terms of
+ // parameters' entry values.
+ // TODO: Add support for modified arguments that can be expressed
+ // by using their entry values.
+ auto *DIVar = MI.getDebugVariable();
+ if (!DIVar->isParameter())
+ return false;
+
+ // Do not consider parameters that belong to an inlined function.
+ if (MI.getDebugLoc()->getInlinedAt())
+ return false;
+
+ // Only consider parameters that are described using registers. Parameters
+ // that are passed on the stack are not yet supported, so ignore debug
+ // values that are described by the frame or stack pointer.
+ if (!isRegOtherThanSPAndFP(MI.getDebugOperand(0), MI, TRI))
+ return false;
+
+ // If a parameter's value has been propagated from the caller, then the
+ // parameter's DBG_VALUE may be described using a register defined by some
+ // instruction in the entry block, in which case we shouldn't create an
+ // entry value.
+ if (DefinedRegs.count(MI.getDebugOperand(0).getReg()))
+ return false;
+
+ // TODO: Add support for parameters that have a pre-existing debug expression
+ // (e.g. fragments).
+ // A simple deref expression is equivalent to an indirect debug value.
+ const DIExpression *Expr = MI.getDebugExpression();
+ if (Expr->getNumElements() > 0 && !Expr->isDeref())
+ return false;
+
+ return true;
+}
+
+/// Collect all register defines (including aliases) for the given instruction.
+static void collectRegDefs(const MachineInstr &MI, DefinedRegsSet &Regs,
+ const TargetRegisterInfo *TRI) {
+ for (const MachineOperand &MO : MI.all_defs()) {
+ if (MO.getReg() && MO.getReg().isPhysical()) {
+ Regs.insert(MO.getReg());
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
+ Regs.insert(*AI);
+ }
+ }
+}
+
+/// This routine records the entry values of function parameters. The values
+/// could be used as backup values. If we lose track of some unmodified
+/// parameters, the backup values will be used as their primary locations.
+void VarLocBasedLDV::recordEntryValue(const MachineInstr &MI,
+ const DefinedRegsSet &DefinedRegs,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs) {
+ if (TPC) {
+ auto &TM = TPC->getTM<TargetMachine>();
+ if (!TM.Options.ShouldEmitDebugEntryValues())
+ return;
+ }
+
+ DebugVariable V(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+
+ if (!isEntryValueCandidate(MI, DefinedRegs) ||
+ OpenRanges.getEntryValueBackup(V))
+ return;
+
+ LLVM_DEBUG(dbgs() << "Creating the backup entry location: "; MI.dump(););
+
+ // Create the entry value and use it as a backup location. It remains
+ // valid until the parameter is modified.
+ DIExpression *NewExpr =
+ DIExpression::prepend(MI.getDebugExpression(), DIExpression::EntryValue);
+ VarLoc EntryValLocAsBackup = VarLoc::CreateEntryBackupLoc(MI, NewExpr);
+ LocIndices EntryValLocIDs = VarLocIDs.insert(EntryValLocAsBackup);
+ OpenRanges.insert(EntryValLocIDs, EntryValLocAsBackup);
+}
+
+/// Calculate the liveness information for the given machine function and
+/// extend ranges across basic blocks.
+bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF,
+ MachineDominatorTree *DomTree,
+ TargetPassConfig *TPC, unsigned InputBBLimit,
+ unsigned InputDbgValLimit) {
+ (void)DomTree;
+ LLVM_DEBUG(dbgs() << "\nDebug Range Extension: " << MF.getName() << "\n");
+
+ if (!MF.getFunction().getSubprogram())
+ // VarLocBaseLDV will already have removed all DBG_VALUEs.
+ return false;
+
+ // Skip functions from NoDebug compilation units.
+ if (MF.getFunction().getSubprogram()->getUnit()->getEmissionKind() ==
+ DICompileUnit::NoDebug)
+ return false;
+
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ TFI = MF.getSubtarget().getFrameLowering();
+ TFI->getCalleeSaves(MF, CalleeSavedRegs);
+ this->TPC = TPC;
+ LS.initialize(MF);
+
+ bool Changed = false;
+ bool OLChanged = false;
+ bool MBBJoined = false;
+
+ VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
+ OverlapMap OverlapFragments; // Map of overlapping variable fragments.
+ OpenRangesSet OpenRanges(Alloc, OverlapFragments);
+ // Ranges that are open until end of bb.
+ VarLocInMBB OutLocs; // Ranges that exist beyond bb.
+ VarLocInMBB InLocs; // Ranges that are incoming after joining.
+ TransferMap Transfers; // DBG_VALUEs associated with transfers (such as
+ // spills, copies and restores).
+ // Map responsible MI to attached Transfer emitted from Backup Entry Value.
+ InstToEntryLocMap EntryValTransfers;
+ // Map a Register to the last MI which clobbered it.
+ RegDefToInstMap RegSetInstrs;
+
+ VarToFragments SeenFragments;
+
+ // Blocks which are artificial, i.e. blocks which exclusively contain
+ // instructions without locations, or with line 0 locations.
+ SmallPtrSet<const MachineBasicBlock *, 16> ArtificialBlocks;
+
+ DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
+ DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Worklist;
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Pending;
+
+ // Set of register defines that are seen when traversing the entry block
+ // looking for debug entry value candidates.
+ DefinedRegsSet DefinedRegs;
+
+ // Only in the case of entry MBB collect DBG_VALUEs representing
+ // function parameters in order to generate debug entry values for them.
+ MachineBasicBlock &First_MBB = *(MF.begin());
+ for (auto &MI : First_MBB) {
+ collectRegDefs(MI, DefinedRegs, TRI);
+ if (MI.isDebugValue())
+ recordEntryValue(MI, DefinedRegs, OpenRanges, VarLocIDs);
+ }
+
+ // Initialize per-block structures and scan for fragment overlaps.
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (MI.isDebugValue())
+ accumulateFragmentMap(MI, SeenFragments, OverlapFragments);
+
+ auto hasNonArtificialLocation = [](const MachineInstr &MI) -> bool {
+ if (const DebugLoc &DL = MI.getDebugLoc())
+ return DL.getLine() != 0;
+ return false;
+ };
+ for (auto &MBB : MF)
+ if (none_of(MBB.instrs(), hasNonArtificialLocation))
+ ArtificialBlocks.insert(&MBB);
+
+ LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
+ "OutLocs after initialization", dbgs()));
+
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ unsigned int RPONumber = 0;
+ for (MachineBasicBlock *MBB : RPOT) {
+ OrderToBB[RPONumber] = MBB;
+ BBToOrder[MBB] = RPONumber;
+ Worklist.push(RPONumber);
+ ++RPONumber;
+ }
+
+ if (RPONumber > InputBBLimit) {
+ unsigned NumInputDbgValues = 0;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (MI.isDebugValue())
+ ++NumInputDbgValues;
+ if (NumInputDbgValues > InputDbgValLimit) {
+ LLVM_DEBUG(dbgs() << "Disabling VarLocBasedLDV: " << MF.getName()
+ << " has " << RPONumber << " basic blocks and "
+ << NumInputDbgValues
+ << " input DBG_VALUEs, exceeding limits.\n");
+ return false;
+ }
+ }
+
+ // This is a standard "union of predecessor outs" dataflow problem.
+ // To solve it, we perform join() and process() using the two worklist method
+ // until the ranges converge.
+ // Ranges have converged when both worklists are empty.
+ SmallPtrSet<const MachineBasicBlock *, 16> Visited;
+ while (!Worklist.empty() || !Pending.empty()) {
+ // We track what is on the pending worklist to avoid inserting the same
+ // thing twice. We could avoid this with a custom priority queue, but this
+ // is probably not worth it.
+ SmallPtrSet<MachineBasicBlock *, 16> OnPending;
+ LLVM_DEBUG(dbgs() << "Processing Worklist\n");
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
+ Worklist.pop();
+ MBBJoined = join(*MBB, OutLocs, InLocs, VarLocIDs, Visited,
+ ArtificialBlocks);
+ MBBJoined |= Visited.insert(MBB).second;
+ if (MBBJoined) {
+ MBBJoined = false;
+ Changed = true;
+ // Now that we have started to extend ranges across BBs we need to
+ // examine spill, copy and restore instructions to see whether they
+ // operate with registers that correspond to user variables.
+ // First load any pending inlocs.
+ OpenRanges.insertFromLocSet(getVarLocsInMBB(MBB, InLocs), VarLocIDs);
+ LastNonDbgMI = nullptr;
+ RegSetInstrs.clear();
+ for (auto &MI : *MBB)
+ process(MI, OpenRanges, VarLocIDs, Transfers, EntryValTransfers,
+ RegSetInstrs);
+ OLChanged |= transferTerminator(MBB, OpenRanges, OutLocs, VarLocIDs);
+
+ LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
+ "OutLocs after propagating", dbgs()));
+ LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs,
+ "InLocs after propagating", dbgs()));
+
+ if (OLChanged) {
+ OLChanged = false;
+ for (auto *s : MBB->successors())
+ if (OnPending.insert(s).second) {
+ Pending.push(BBToOrder[s]);
+ }
+ }
+ }
+ }
+ Worklist.swap(Pending);
+ // At this point, Pending must be empty, since it was just swapped with the
+ // drained worklist.
+ assert(Pending.empty() && "Pending should be empty");
+ }
+
+ // Add any DBG_VALUE instructions created by location transfers.
+ for (auto &TR : Transfers) {
+ assert(!TR.TransferInst->isTerminator() &&
+ "Cannot insert DBG_VALUE after terminator");
+ MachineBasicBlock *MBB = TR.TransferInst->getParent();
+ const VarLoc &VL = VarLocIDs[TR.LocationID];
+ MachineInstr *MI = VL.BuildDbgValue(MF);
+ MBB->insertAfterBundle(TR.TransferInst->getIterator(), MI);
+ }
+ Transfers.clear();
+
+ // Add DBG_VALUEs created using Backup Entry Value location.
+ for (auto &TR : EntryValTransfers) {
+ MachineInstr *TRInst = const_cast<MachineInstr *>(TR.first);
+ assert(!TRInst->isTerminator() &&
+ "Cannot insert DBG_VALUE after terminator");
+ MachineBasicBlock *MBB = TRInst->getParent();
+ const VarLoc &VL = VarLocIDs[TR.second];
+ MachineInstr *MI = VL.BuildDbgValue(MF);
+ MBB->insertAfterBundle(TRInst->getIterator(), MI);
+ }
+ EntryValTransfers.clear();
+
+ // Deferred inlocs will not have had any DBG_VALUE insts created; do
+ // that now.
+ flushPendingLocs(InLocs, VarLocIDs);
+
+ LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
+ LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
+ return Changed;
+}
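+// Illustrative aside: the driver above is the classic two-worklist fixed-point
+// scheme over reverse post-order: drain the worklist, and whenever a block's
+// OUT set changes, queue its successors for the next round. Condensed sketch
+// of just that control structure (hypothetical Block/Transfer, not the LLVM
+// types):
+//
+//   #include <functional>
+//   #include <queue>
+//   #include <set>
+//   #include <vector>
+//
+//   struct Block { std::vector<int> Succs; };
+//
+//   // Blocks are identified by RPO number; Transfer returns true if the
+//   // block's OUT set changed.
+//   void solve(const std::vector<Block> &Blocks,
+//              const std::function<bool(int)> &Transfer) {
+//     std::priority_queue<int, std::vector<int>, std::greater<int>> Worklist,
+//         Pending;
+//     for (int I = 0, E = (int)Blocks.size(); I != E; ++I)
+//       Worklist.push(I);
+//     while (!Worklist.empty() || !Pending.empty()) {
+//       std::set<int> OnPending;
+//       while (!Worklist.empty()) {
+//         int BB = Worklist.top();
+//         Worklist.pop();
+//         if (Transfer(BB))                 // OUT changed: revisit successors
+//           for (int S : Blocks[BB].Succs)
+//             if (OnPending.insert(S).second)
+//               Pending.push(S);
+//       }
+//       std::swap(Worklist, Pending);       // next round
+//     }
+//   }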
+
+LDVImpl *
+llvm::makeVarLocBasedLiveDebugValues()
+{
+ return new VarLocBasedLDV();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp
new file mode 100644
index 000000000000..9603c1f01e08
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -0,0 +1,1970 @@
+//===- LiveDebugVariables.cpp - Tracking debug info variables -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveDebugVariables analysis.
+//
+// Remove all DBG_VALUE instructions referencing virtual registers and replace
+// them with a data structure tracking where live user variables are kept - in a
+// virtual register or in a stack slot.
+//
+// Allow the data structure to be updated during register allocation when values
+// are moved between registers and stack slots. Finally emit new DBG_VALUE
+// instructions after register allocation is complete.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LiveDebugVariables.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <optional>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "livedebugvars"
+
+static cl::opt<bool>
+EnableLDV("live-debug-variables", cl::init(true),
+ cl::desc("Enable the live debug variables pass"), cl::Hidden);
+
+STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted");
+STATISTIC(NumInsertedDebugLabels, "Number of DBG_LABELs inserted");
+
+char LiveDebugVariables::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE,
+ "Debug Variable Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE,
+ "Debug Variable Analysis", false, false)
+
+void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequiredTransitive<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+LiveDebugVariables::LiveDebugVariables() : MachineFunctionPass(ID) {
+ initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
+}
+
+enum : unsigned { UndefLocNo = ~0U };
+
+namespace {
+/// Describes a debug variable value by location number and expression along
+/// with some flags about the original usage of the location.
+class DbgVariableValue {
+public:
+ DbgVariableValue(ArrayRef<unsigned> NewLocs, bool WasIndirect, bool WasList,
+ const DIExpression &Expr)
+ : WasIndirect(WasIndirect), WasList(WasList), Expression(&Expr) {
+ assert(!(WasIndirect && WasList) &&
+ "DBG_VALUE_LISTs should not be indirect.");
+ SmallVector<unsigned> LocNoVec;
+ for (unsigned LocNo : NewLocs) {
+ auto It = find(LocNoVec, LocNo);
+ if (It == LocNoVec.end())
+ LocNoVec.push_back(LocNo);
+ else {
+ // LocNo duplicates an element in LocNoVec; rewrite the expression so that
+ // references to this operand's index refer to the duplicating element
+ // instead.
+ unsigned OpIdx = LocNoVec.size();
+ unsigned DuplicatingIdx = std::distance(LocNoVec.begin(), It);
+ Expression =
+ DIExpression::replaceArg(Expression, OpIdx, DuplicatingIdx);
+ }
+ }
+ // FIXME: Debug values referencing 64+ unique machine locations are rare and
+ // currently unsupported for performance reasons. If we can verify that
+ // performance is acceptable for such debug values, we can increase the
+ // bit-width of LocNoCount to 14 to enable up to 16384 unique machine
+ // locations. We will also need to verify that this does not cause issues
+ // with LiveDebugVariables' use of IntervalMap.
+ if (LocNoVec.size() < 64) {
+ LocNoCount = LocNoVec.size();
+ if (LocNoCount > 0) {
+ LocNos = std::make_unique<unsigned[]>(LocNoCount);
+ std::copy(LocNoVec.begin(), LocNoVec.end(), loc_nos_begin());
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << "Found debug value with 64+ unique machine "
+ "locations, dropping...\n");
+ LocNoCount = 1;
+ // Turn this into an undef debug value list; right now, the simplest form
+ // of this is an expression with one arg, and an undef debug operand.
+ Expression =
+ DIExpression::get(Expr.getContext(), {dwarf::DW_OP_LLVM_arg, 0});
+ if (auto FragmentInfoOpt = Expr.getFragmentInfo())
+ Expression = *DIExpression::createFragmentExpression(
+ Expression, FragmentInfoOpt->OffsetInBits,
+ FragmentInfoOpt->SizeInBits);
+ LocNos = std::make_unique<unsigned[]>(LocNoCount);
+ LocNos[0] = UndefLocNo;
+ }
+ }
+
+ DbgVariableValue() : LocNoCount(0), WasIndirect(false), WasList(false) {}
+ DbgVariableValue(const DbgVariableValue &Other)
+ : LocNoCount(Other.LocNoCount), WasIndirect(Other.getWasIndirect()),
+ WasList(Other.getWasList()), Expression(Other.getExpression()) {
+ if (Other.getLocNoCount()) {
+ LocNos.reset(new unsigned[Other.getLocNoCount()]);
+ std::copy(Other.loc_nos_begin(), Other.loc_nos_end(), loc_nos_begin());
+ }
+ }
+
+ DbgVariableValue &operator=(const DbgVariableValue &Other) {
+ if (this == &Other)
+ return *this;
+ if (Other.getLocNoCount()) {
+ LocNos.reset(new unsigned[Other.getLocNoCount()]);
+ std::copy(Other.loc_nos_begin(), Other.loc_nos_end(), loc_nos_begin());
+ } else {
+ LocNos.release();
+ }
+ LocNoCount = Other.getLocNoCount();
+ WasIndirect = Other.getWasIndirect();
+ WasList = Other.getWasList();
+ Expression = Other.getExpression();
+ return *this;
+ }
+
+ const DIExpression *getExpression() const { return Expression; }
+ uint8_t getLocNoCount() const { return LocNoCount; }
+ bool containsLocNo(unsigned LocNo) const {
+ return is_contained(loc_nos(), LocNo);
+ }
+ bool getWasIndirect() const { return WasIndirect; }
+ bool getWasList() const { return WasList; }
+ bool isUndef() const { return LocNoCount == 0 || containsLocNo(UndefLocNo); }
+
+ DbgVariableValue decrementLocNosAfterPivot(unsigned Pivot) const {
+ SmallVector<unsigned, 4> NewLocNos;
+ for (unsigned LocNo : loc_nos())
+ NewLocNos.push_back(LocNo != UndefLocNo && LocNo > Pivot ? LocNo - 1
+ : LocNo);
+ return DbgVariableValue(NewLocNos, WasIndirect, WasList, *Expression);
+ }
+
+ DbgVariableValue remapLocNos(ArrayRef<unsigned> LocNoMap) const {
+ SmallVector<unsigned> NewLocNos;
+ for (unsigned LocNo : loc_nos())
+ // Undef values don't exist in locations (and thus not in LocNoMap
+ // either) so skip over them. See getLocationNo().
+ NewLocNos.push_back(LocNo == UndefLocNo ? UndefLocNo : LocNoMap[LocNo]);
+ return DbgVariableValue(NewLocNos, WasIndirect, WasList, *Expression);
+ }
+
+ DbgVariableValue changeLocNo(unsigned OldLocNo, unsigned NewLocNo) const {
+ SmallVector<unsigned> NewLocNos;
+ NewLocNos.assign(loc_nos_begin(), loc_nos_end());
+ auto OldLocIt = find(NewLocNos, OldLocNo);
+ assert(OldLocIt != NewLocNos.end() && "Old location must be present.");
+ *OldLocIt = NewLocNo;
+ return DbgVariableValue(NewLocNos, WasIndirect, WasList, *Expression);
+ }
+
+ bool hasLocNoGreaterThan(unsigned LocNo) const {
+ return any_of(loc_nos(),
+ [LocNo](unsigned ThisLocNo) { return ThisLocNo > LocNo; });
+ }
+
+ void printLocNos(llvm::raw_ostream &OS) const {
+ for (const unsigned &Loc : loc_nos())
+ OS << (&Loc == loc_nos_begin() ? " " : ", ") << Loc;
+ }
+
+ friend inline bool operator==(const DbgVariableValue &LHS,
+ const DbgVariableValue &RHS) {
+ if (std::tie(LHS.LocNoCount, LHS.WasIndirect, LHS.WasList,
+ LHS.Expression) !=
+ std::tie(RHS.LocNoCount, RHS.WasIndirect, RHS.WasList, RHS.Expression))
+ return false;
+ return std::equal(LHS.loc_nos_begin(), LHS.loc_nos_end(),
+ RHS.loc_nos_begin());
+ }
+
+ friend inline bool operator!=(const DbgVariableValue &LHS,
+ const DbgVariableValue &RHS) {
+ return !(LHS == RHS);
+ }
+
+ unsigned *loc_nos_begin() { return LocNos.get(); }
+ const unsigned *loc_nos_begin() const { return LocNos.get(); }
+ unsigned *loc_nos_end() { return LocNos.get() + LocNoCount; }
+ const unsigned *loc_nos_end() const { return LocNos.get() + LocNoCount; }
+ ArrayRef<unsigned> loc_nos() const {
+ return ArrayRef<unsigned>(LocNos.get(), LocNoCount);
+ }
+
+private:
+ // IntervalMap requires the value object to be very small, to the extent
+ // that we do not have enough room for an std::vector. Using a C-style array
+ // (with a unique_ptr wrapper for convenience) allows us to optimize for this
+ // specific case by packing the array size into only 6 bits (it is highly
+ // unlikely that any debug value will need 64+ locations).
+ std::unique_ptr<unsigned[]> LocNos;
+ uint8_t LocNoCount : 6;
+ bool WasIndirect : 1;
+ bool WasList : 1;
+ const DIExpression *Expression = nullptr;
+};
+} // namespace
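+// Illustrative aside: the bit-fields above exist purely to keep the value type
+// small enough for IntervalMap's inline storage, so the count and the two
+// flags share one byte next to the pointer members. A standalone sketch of
+// the same packing, with a size check (hypothetical struct):
+//
+//   #include <cstdint>
+//   #include <memory>
+//
+//   struct PackedValueSketch {
+//     std::unique_ptr<unsigned[]> LocNos; // out-of-line array of loc numbers
+//     uint8_t LocNoCount : 6;             // up to 63 locations
+//     bool WasIndirect : 1;
+//     bool WasList : 1;
+//     const void *Expression = nullptr;
+//   };
+//
+//   // The three bit-fields share a single byte, so (modulo padding) the
+//   // struct stays within three pointers and remains cheap to store by value.
+//   static_assert(sizeof(PackedValueSketch) <= 3 * sizeof(void *),
+//                 "value type must stay small");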
+
+/// Map of where a user value is live to that value.
+using LocMap = IntervalMap<SlotIndex, DbgVariableValue, 4>;
+
+/// Map of stack slot offsets for spilled locations.
+/// Non-spilled locations are not added to the map.
+using SpillOffsetMap = DenseMap<unsigned, unsigned>;
+
+/// Cache of the position to use as the starting point when calling
+/// MachineBasicBlock::SkipPHIsLabelsAndDebug. This prevents
+/// MachineBasicBlock::SkipPHIsLabelsAndDebug from repeatedly searching the
+/// same set of PHIs/Labels/Debug instructions if it is called many times for
+/// the same block.
+using BlockSkipInstsMap =
+ DenseMap<MachineBasicBlock *, MachineBasicBlock::iterator>;
+
+namespace {
+
+class LDVImpl;
+
+/// A user value is a part of a debug info user variable.
+///
+/// A DBG_VALUE instruction notes that (a sub-register of) a virtual register
+/// holds part of a user variable. The part is identified by a byte offset.
+///
+/// UserValues are grouped into equivalence classes for easier searching. Two
+/// user values are related if they are held by the same virtual register. The
+/// equivalence class is the transitive closure of that relation.
+class UserValue {
+ const DILocalVariable *Variable; ///< The debug info variable we are part of.
+ /// The part of the variable we describe.
+ const std::optional<DIExpression::FragmentInfo> Fragment;
+ DebugLoc dl; ///< The debug location for the variable. This is
+ ///< used by dwarf writer to find lexical scope.
+ UserValue *leader; ///< Equivalence class leader.
+ UserValue *next = nullptr; ///< Next value in equivalence class, or null.
+
+ /// Numbered locations referenced by locmap.
+ SmallVector<MachineOperand, 4> locations;
+
+ /// Map of slot indices where this value is live.
+ LocMap locInts;
+
+ /// Set of interval start indexes that have been trimmed to the
+ /// lexical scope.
+ SmallSet<SlotIndex, 2> trimmedDefs;
+
+ /// Insert a DBG_VALUE into MBB at Idx for DbgValue.
+ void insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
+ SlotIndex StopIdx, DbgVariableValue DbgValue,
+ ArrayRef<bool> LocSpills,
+ ArrayRef<unsigned> SpillOffsets, LiveIntervals &LIS,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ BlockSkipInstsMap &BBSkipInstsMap);
+
+ /// Replace OldLocNo ranges with NewRegs ranges where NewRegs
+ /// is live. Returns true if any changes were made.
+ bool splitLocation(unsigned OldLocNo, ArrayRef<Register> NewRegs,
+ LiveIntervals &LIS);
+
+public:
+ /// Create a new UserValue.
+ UserValue(const DILocalVariable *var,
+ std::optional<DIExpression::FragmentInfo> Fragment, DebugLoc L,
+ LocMap::Allocator &alloc)
+ : Variable(var), Fragment(Fragment), dl(std::move(L)), leader(this),
+ locInts(alloc) {}
+
+ /// Get the leader of this value's equivalence class.
+ UserValue *getLeader() {
+ UserValue *l = leader;
+ while (l != l->leader)
+ l = l->leader;
+ return leader = l;
+ }
+
+ /// Return the next UserValue in the equivalence class.
+ UserValue *getNext() const { return next; }
+
+ /// Merge equivalence classes.
+ static UserValue *merge(UserValue *L1, UserValue *L2) {
+ L2 = L2->getLeader();
+ if (!L1)
+ return L2;
+ L1 = L1->getLeader();
+ if (L1 == L2)
+ return L1;
+ // Splice L2 before L1's members.
+ UserValue *End = L2;
+ while (End->next) {
+ End->leader = L1;
+ End = End->next;
+ }
+ End->leader = L1;
+ End->next = L1->next;
+ L1->next = L2;
+ return L1;
+ }
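+ // Illustrative aside: getLeader/merge implement a small union-find: each
+ // value points at a leader, chains are flattened on lookup, and merge
+ // splices one class into the other. The same structure over plain indices
+ // (self-contained sketch, not the LLVM code):
+ //
+ //   #include <numeric>
+ //   #include <vector>
+ //
+ //   struct UnionFindSketch {
+ //     std::vector<int> Leader;
+ //     explicit UnionFindSketch(int N) : Leader(N) {
+ //       std::iota(Leader.begin(), Leader.end(), 0); // each leads itself
+ //     }
+ //     int find(int X) {
+ //       while (Leader[X] != X)
+ //         X = Leader[X] = Leader[Leader[X]];        // path halving
+ //       return X;
+ //     }
+ //     void merge(int A, int B) { Leader[find(A)] = find(B); }
+ //   };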
+
+ /// Return the location number that matches Loc.
+ ///
+ /// For undef values we always return location number UndefLocNo without
+ /// inserting anything in locations. Since locations is a vector and the
+ /// location number is the position in the vector and UndefLocNo is ~0,
+ /// we would need a very big vector to put the value at the right position.
+ unsigned getLocationNo(const MachineOperand &LocMO) {
+ if (LocMO.isReg()) {
+ if (LocMO.getReg() == 0)
+ return UndefLocNo;
+ // For register locations we don't care about use/def and other flags.
+ for (unsigned i = 0, e = locations.size(); i != e; ++i)
+ if (locations[i].isReg() &&
+ locations[i].getReg() == LocMO.getReg() &&
+ locations[i].getSubReg() == LocMO.getSubReg())
+ return i;
+ } else
+ for (unsigned i = 0, e = locations.size(); i != e; ++i)
+ if (LocMO.isIdenticalTo(locations[i]))
+ return i;
+ locations.push_back(LocMO);
+ // We are storing a MachineOperand outside a MachineInstr.
+ locations.back().clearParent();
+ // Don't store def operands.
+ if (locations.back().isReg()) {
+ if (locations.back().isDef())
+ locations.back().setIsDead(false);
+ locations.back().setIsUse();
+ }
+ return locations.size() - 1;
+ }
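+ // Illustrative aside: getLocationNo is a simple interning scheme: return the
+ // index of an existing identical operand, otherwise append it and return the
+ // new index, with UndefLocNo reserved so undef operands never occupy a slot.
+ // Generic sketch of the pattern (hypothetical element type, not this class):
+ //
+ //   #include <string>
+ //   #include <vector>
+ //
+ //   constexpr unsigned UndefNo = ~0u;
+ //
+ //   // Return a stable small index for Value, reusing an existing slot when
+ //   // the same value was seen before; "" stands in for an undef operand.
+ //   unsigned internSketch(std::vector<std::string> &Table,
+ //                         const std::string &Value) {
+ //     if (Value.empty())
+ //       return UndefNo;                 // never stored in the table
+ //     for (unsigned I = 0, E = Table.size(); I != E; ++I)
+ //       if (Table[I] == Value)
+ //         return I;
+ //     Table.push_back(Value);
+ //     return Table.size() - 1;
+ //   }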
+
+ /// Remove (recycle) a location number. If \p LocNo is still used by
+ /// locInts, nothing is done.
+ void removeLocationIfUnused(unsigned LocNo) {
+ // Bail out if LocNo still is used.
+ for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I) {
+ const DbgVariableValue &DbgValue = I.value();
+ if (DbgValue.containsLocNo(LocNo))
+ return;
+ }
+ // Remove the entry in the locations vector, and adjust all references to
+ // location numbers above the removed entry.
+ locations.erase(locations.begin() + LocNo);
+ for (LocMap::iterator I = locInts.begin(); I.valid(); ++I) {
+ const DbgVariableValue &DbgValue = I.value();
+ if (DbgValue.hasLocNoGreaterThan(LocNo))
+ I.setValueUnchecked(DbgValue.decrementLocNosAfterPivot(LocNo));
+ }
+ }
+
+ /// Ensure that all virtual register locations are mapped.
+ void mapVirtRegs(LDVImpl *LDV);
+
+ /// Add a definition point to this user value.
+ void addDef(SlotIndex Idx, ArrayRef<MachineOperand> LocMOs, bool IsIndirect,
+ bool IsList, const DIExpression &Expr) {
+ SmallVector<unsigned> Locs;
+ for (const MachineOperand &Op : LocMOs)
+ Locs.push_back(getLocationNo(Op));
+ DbgVariableValue DbgValue(Locs, IsIndirect, IsList, Expr);
+ // Add a singular (Idx,Idx) -> value mapping.
+ LocMap::iterator I = locInts.find(Idx);
+ if (!I.valid() || I.start() != Idx)
+ I.insert(Idx, Idx.getNextSlot(), std::move(DbgValue));
+ else
+ // A later DBG_VALUE at the same SlotIndex overrides the old location.
+ I.setValue(std::move(DbgValue));
+ }
+
+ /// Extend the current definition as far as possible down.
+ ///
+ /// Stop when meeting an existing def or when leaving the live
+ /// range of VNI. End points where VNI is no longer live are added to Kills.
+ ///
+ /// We only propagate DBG_VALUES locally here. LiveDebugValues performs a
+ /// data-flow analysis to propagate them beyond basic block boundaries.
+ ///
+ /// \param Idx Starting point for the definition.
+ /// \param DbgValue value to propagate.
+ /// \param LiveIntervalInfo For each location number key in this map,
+ /// restricts liveness to where the LiveRange has the value equal to the
+ /// VNInfo.
+ /// \param [out] Kills Append end points of VNI's live range to Kills.
+ /// \param LIS Live intervals analysis.
+ void
+ extendDef(SlotIndex Idx, DbgVariableValue DbgValue,
+ SmallDenseMap<unsigned, std::pair<LiveRange *, const VNInfo *>>
+ &LiveIntervalInfo,
+ std::optional<std::pair<SlotIndex, SmallVector<unsigned>>> &Kills,
+ LiveIntervals &LIS);
+
+ /// The value in LI may be copied to other registers. Determine if
+ /// any of the copies are available at the kill points, and add defs if
+ /// possible.
+ ///
+ /// \param DbgValue Location number of LI->reg, and DIExpression.
+ /// \param LocIntervals Scan for copies of the value for each location in the
+ /// corresponding LiveInterval->reg.
+ /// \param KilledAt The point where the range of DbgValue could be extended.
+ /// \param [in,out] NewDefs Append (Idx, DbgValue) of inserted defs here.
+ void addDefsFromCopies(
+ DbgVariableValue DbgValue,
+ SmallVectorImpl<std::pair<unsigned, LiveInterval *>> &LocIntervals,
+ SlotIndex KilledAt,
+ SmallVectorImpl<std::pair<SlotIndex, DbgVariableValue>> &NewDefs,
+ MachineRegisterInfo &MRI, LiveIntervals &LIS);
+
+ /// Compute the live intervals of all locations after collecting all their
+ /// def points.
+ void computeIntervals(MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ LiveIntervals &LIS, LexicalScopes &LS);
+
+ /// Replace OldReg ranges with NewRegs ranges where NewRegs is
+ /// live. Returns true if any changes were made.
+ bool splitRegister(Register OldReg, ArrayRef<Register> NewRegs,
+ LiveIntervals &LIS);
+
+ /// Rewrite virtual register locations according to the provided virtual
+ /// register map. Record the stack slot offsets for the locations that
+ /// were spilled.
+ void rewriteLocations(VirtRegMap &VRM, const MachineFunction &MF,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ SpillOffsetMap &SpillOffsets);
+
+ /// Recreate DBG_VALUE instruction from data structures.
+ void emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const SpillOffsetMap &SpillOffsets,
+ BlockSkipInstsMap &BBSkipInstsMap);
+
+ /// Return DebugLoc of this UserValue.
+ const DebugLoc &getDebugLoc() { return dl; }
+
+ void print(raw_ostream &, const TargetRegisterInfo *);
+};
+
+/// A user label is a part of a debug info user label.
+class UserLabel {
+ const DILabel *Label; ///< The debug info label we are part of.
+ DebugLoc dl; ///< The debug location for the label. This is
+ ///< used by dwarf writer to find lexical scope.
+ SlotIndex loc; ///< Slot used by the debug label.
+
+ /// Insert a DBG_LABEL into MBB at Idx.
+ void insertDebugLabel(MachineBasicBlock *MBB, SlotIndex Idx,
+ LiveIntervals &LIS, const TargetInstrInfo &TII,
+ BlockSkipInstsMap &BBSkipInstsMap);
+
+public:
+ /// Create a new UserLabel.
+ UserLabel(const DILabel *label, DebugLoc L, SlotIndex Idx)
+ : Label(label), dl(std::move(L)), loc(Idx) {}
+
+ /// Does this UserLabel match the parameters?
+ bool matches(const DILabel *L, const DILocation *IA,
+ const SlotIndex Index) const {
+ return Label == L && dl->getInlinedAt() == IA && loc == Index;
+ }
+
+ /// Recreate DBG_LABEL instruction from data structures.
+ void emitDebugLabel(LiveIntervals &LIS, const TargetInstrInfo &TII,
+ BlockSkipInstsMap &BBSkipInstsMap);
+
+ /// Return DebugLoc of this UserLabel.
+ const DebugLoc &getDebugLoc() { return dl; }
+
+ void print(raw_ostream &, const TargetRegisterInfo *);
+};
+
+/// Implementation of the LiveDebugVariables pass.
+class LDVImpl {
+ LiveDebugVariables &pass;
+ LocMap::Allocator allocator;
+ MachineFunction *MF = nullptr;
+ LiveIntervals *LIS;
+ const TargetRegisterInfo *TRI;
+
+ /// Position and VReg of a PHI instruction during register allocation.
+ struct PHIValPos {
+ SlotIndex SI; /// Slot where this PHI occurs.
+ Register Reg; /// VReg this PHI occurs in.
+ unsigned SubReg; /// Qualifying subregister for Reg.
+ };
+
+ /// Map from debug instruction number to PHI position during allocation.
+ std::map<unsigned, PHIValPos> PHIValToPos;
+ /// For each VReg, an index of which debug instruction numbers and
+ /// corresponding PHIs are sensitive to splitting. Each VReg may have
+ /// multiple PHI defs, at different positions.
+ DenseMap<Register, std::vector<unsigned>> RegToPHIIdx;
+
+ /// Record for any debug instructions unlinked from their blocks during
+ /// regalloc. Stores the instr and its location, so that they can be
+ /// re-inserted after regalloc is over.
+ struct InstrPos {
+ MachineInstr *MI; ///< Debug instruction, unlinked from its block.
+ SlotIndex Idx; ///< Slot position where MI should be re-inserted.
+ MachineBasicBlock *MBB; ///< Block that MI was in.
+ };
+
+ /// Collection of stored debug instructions, preserved until after regalloc.
+ SmallVector<InstrPos, 32> StashedDebugInstrs;
+
+ /// Whether emitDebugValues is called.
+ bool EmitDone = false;
+
+ /// Whether the machine function is modified during the pass.
+ bool ModifiedMF = false;
+
+ /// All allocated UserValue instances.
+ SmallVector<std::unique_ptr<UserValue>, 8> userValues;
+
+ /// All allocated UserLabel instances.
+ SmallVector<std::unique_ptr<UserLabel>, 2> userLabels;
+
+ /// Map virtual register to eq class leader.
+ using VRMap = DenseMap<unsigned, UserValue *>;
+ VRMap virtRegToEqClass;
+
+ /// Map to find existing UserValue instances.
+ using UVMap = DenseMap<DebugVariable, UserValue *>;
+ UVMap userVarMap;
+
+ /// Find or create a UserValue.
+ UserValue *getUserValue(const DILocalVariable *Var,
+ std::optional<DIExpression::FragmentInfo> Fragment,
+ const DebugLoc &DL);
+
+ /// Find the EC leader for VirtReg or null.
+ UserValue *lookupVirtReg(Register VirtReg);
+
+ /// Add DBG_VALUE instruction to our maps.
+ ///
+ /// \param MI DBG_VALUE instruction
+ /// \param Idx Last valid SlotIndex before instruction.
+ ///
+ /// \returns True if the DBG_VALUE instruction should be deleted.
+ bool handleDebugValue(MachineInstr &MI, SlotIndex Idx);
+
+ /// Track variable location debug instructions while using the instruction
+ /// referencing implementation. Such debug instructions do not need to be
+ /// updated during regalloc because they identify instructions rather than
+ /// register locations. However, they need to be removed from the
+ /// MachineFunction during regalloc, then re-inserted later, to avoid
+ /// disrupting the allocator.
+ ///
+ /// \param MI Any DBG_VALUE / DBG_INSTR_REF / DBG_PHI instruction
+ /// \param Idx Last valid SlotIndex before instruction
+ ///
+ /// \returns Iterator to continue processing from after unlinking.
+ MachineBasicBlock::iterator handleDebugInstr(MachineInstr &MI, SlotIndex Idx);
+
+ /// Add DBG_LABEL instruction to UserLabel.
+ ///
+ /// \param MI DBG_LABEL instruction
+ /// \param Idx Last valid SlotIndex before instruction.
+ ///
+ /// \returns True if the DBG_LABEL instruction should be deleted.
+ bool handleDebugLabel(MachineInstr &MI, SlotIndex Idx);
+
+ /// Collect and erase all DBG_VALUE instructions, adding a UserValue def
+ /// for each instruction.
+ ///
+ /// \param mf MachineFunction to be scanned.
+ /// \param InstrRef Whether to operate in instruction referencing mode. If
+ /// true, most of LiveDebugVariables doesn't run.
+ ///
+ /// \returns True if any debug values were found.
+ bool collectDebugValues(MachineFunction &mf, bool InstrRef);
+
+ /// Compute the live intervals of all user values after collecting all
+ /// their def points.
+ void computeIntervals();
+
+public:
+ LDVImpl(LiveDebugVariables *ps) : pass(*ps) {}
+
+ bool runOnMachineFunction(MachineFunction &mf, bool InstrRef);
+
+ /// Release all memory.
+ void clear() {
+ MF = nullptr;
+ PHIValToPos.clear();
+ RegToPHIIdx.clear();
+ StashedDebugInstrs.clear();
+ userValues.clear();
+ userLabels.clear();
+ virtRegToEqClass.clear();
+ userVarMap.clear();
+ // Make sure we call emitDebugValues if the machine function was modified.
+ assert((!ModifiedMF || EmitDone) &&
+ "Dbg values are not emitted in LDV");
+ EmitDone = false;
+ ModifiedMF = false;
+ }
+
+ /// Map virtual register to an equivalence class.
+ void mapVirtReg(Register VirtReg, UserValue *EC);
+
+ /// Replace any PHI referring to OldReg with its corresponding NewReg, if
+ /// present.
+ void splitPHIRegister(Register OldReg, ArrayRef<Register> NewRegs);
+
+ /// Replace all references to OldReg with NewRegs.
+ void splitRegister(Register OldReg, ArrayRef<Register> NewRegs);
+
+ /// Recreate DBG_VALUE instructions from data structures.
+ void emitDebugValues(VirtRegMap *VRM);
+
+ void print(raw_ostream&);
+};
+
+} // end anonymous namespace
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+static void printDebugLoc(const DebugLoc &DL, raw_ostream &CommentOS,
+ const LLVMContext &Ctx) {
+ if (!DL)
+ return;
+
+ auto *Scope = cast<DIScope>(DL.getScope());
+ // Omit the directory, because it's likely to be long and uninteresting.
+ CommentOS << Scope->getFilename();
+ CommentOS << ':' << DL.getLine();
+ if (DL.getCol() != 0)
+ CommentOS << ':' << DL.getCol();
+
+ DebugLoc InlinedAtDL = DL.getInlinedAt();
+ if (!InlinedAtDL)
+ return;
+
+ CommentOS << " @[ ";
+ printDebugLoc(InlinedAtDL, CommentOS, Ctx);
+ CommentOS << " ]";
+}
+
+static void printExtendedName(raw_ostream &OS, const DINode *Node,
+ const DILocation *DL) {
+ const LLVMContext &Ctx = Node->getContext();
+ StringRef Res;
+ unsigned Line = 0;
+ if (const auto *V = dyn_cast<const DILocalVariable>(Node)) {
+ Res = V->getName();
+ Line = V->getLine();
+ } else if (const auto *L = dyn_cast<const DILabel>(Node)) {
+ Res = L->getName();
+ Line = L->getLine();
+ }
+
+ if (!Res.empty())
+ OS << Res << "," << Line;
+ auto *InlinedAt = DL ? DL->getInlinedAt() : nullptr;
+ if (InlinedAt) {
+ if (DebugLoc InlinedAtDL = InlinedAt) {
+ OS << " @[";
+ printDebugLoc(InlinedAtDL, OS, Ctx);
+ OS << "]";
+ }
+ }
+}
+
+void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+ OS << "!\"";
+ printExtendedName(OS, Variable, dl);
+
+ OS << "\"\t";
+ for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I) {
+ OS << " [" << I.start() << ';' << I.stop() << "):";
+ if (I.value().isUndef())
+ OS << " undef";
+ else {
+ I.value().printLocNos(OS);
+ if (I.value().getWasIndirect())
+ OS << " ind";
+ else if (I.value().getWasList())
+ OS << " list";
+ }
+ }
+ for (unsigned i = 0, e = locations.size(); i != e; ++i) {
+ OS << " Loc" << i << '=';
+ locations[i].print(OS, TRI);
+ }
+ OS << '\n';
+}
+
+void UserLabel::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+ OS << "!\"";
+ printExtendedName(OS, Label, dl);
+
+ OS << "\"\t";
+ OS << loc;
+ OS << '\n';
+}
+
+void LDVImpl::print(raw_ostream &OS) {
+ OS << "********** DEBUG VARIABLES **********\n";
+ for (auto &userValue : userValues)
+ userValue->print(OS, TRI);
+ OS << "********** DEBUG LABELS **********\n";
+ for (auto &userLabel : userLabels)
+ userLabel->print(OS, TRI);
+}
+#endif
+
+void UserValue::mapVirtRegs(LDVImpl *LDV) {
+ for (unsigned i = 0, e = locations.size(); i != e; ++i)
+ if (locations[i].isReg() && locations[i].getReg().isVirtual())
+ LDV->mapVirtReg(locations[i].getReg(), this);
+}
+
+UserValue *
+LDVImpl::getUserValue(const DILocalVariable *Var,
+ std::optional<DIExpression::FragmentInfo> Fragment,
+ const DebugLoc &DL) {
+ // FIXME: Handle partially overlapping fragments. See
+ // https://reviews.llvm.org/D70121#1849741.
+ DebugVariable ID(Var, Fragment, DL->getInlinedAt());
+ UserValue *&UV = userVarMap[ID];
+ if (!UV) {
+ userValues.push_back(
+ std::make_unique<UserValue>(Var, Fragment, DL, allocator));
+ UV = userValues.back().get();
+ }
+ return UV;
+}
+
+void LDVImpl::mapVirtReg(Register VirtReg, UserValue *EC) {
+ assert(VirtReg.isVirtual() && "Only map VirtRegs");
+ UserValue *&Leader = virtRegToEqClass[VirtReg];
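+ // Union the class already recorded for VirtReg (if any) with EC, so every
+ // UserValue sharing a virtual register ends up under one leader.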
+ Leader = UserValue::merge(Leader, EC);
+}
+
+UserValue *LDVImpl::lookupVirtReg(Register VirtReg) {
+ if (UserValue *UV = virtRegToEqClass.lookup(VirtReg))
+ return UV->getLeader();
+ return nullptr;
+}
+
+bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
+ // DBG_VALUE loc, offset, variable, expr
+ // DBG_VALUE_LIST variable, expr, locs...
+ if (!MI.isDebugValue()) {
+ LLVM_DEBUG(dbgs() << "Can't handle non-DBG_VALUE*: " << MI);
+ return false;
+ }
+ if (!MI.getDebugVariableOp().isMetadata()) {
+ LLVM_DEBUG(dbgs() << "Can't handle DBG_VALUE* with invalid variable: "
+ << MI);
+ return false;
+ }
+ if (MI.isNonListDebugValue() &&
+ (MI.getNumOperands() != 4 ||
+ !(MI.getDebugOffset().isImm() || MI.getDebugOffset().isReg()))) {
+ LLVM_DEBUG(dbgs() << "Can't handle malformed DBG_VALUE: " << MI);
+ return false;
+ }
+
+ // Detect invalid DBG_VALUE instructions, with a debug-use of a virtual
+ // register that hasn't been defined yet. If we do not remove those here, then
+ // the re-insertion of the DBG_VALUE instruction after register allocation
+ // will be incorrect.
+ bool Discard = false;
+ for (const MachineOperand &Op : MI.debug_operands()) {
+ if (Op.isReg() && Op.getReg().isVirtual()) {
+ const Register Reg = Op.getReg();
+ if (!LIS->hasInterval(Reg)) {
+ // The DBG_VALUE is described by a virtual register that does not have a
+ // live interval. Discard the DBG_VALUE.
+ Discard = true;
+ LLVM_DEBUG(dbgs() << "Discarding debug info (no LIS interval): " << Idx
+ << " " << MI);
+ } else {
+ // The DBG_VALUE is only valid if either Reg is live out from Idx, or
+ // Reg is defined dead at Idx (where Idx is the slot index for the
+ // instruction preceding the DBG_VALUE).
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ LiveQueryResult LRQ = LI.Query(Idx);
+ if (!LRQ.valueOutOrDead()) {
+ // We have found a DBG_VALUE with the value in a virtual register that
+ // is not live. Discard the DBG_VALUE.
+ Discard = true;
+ LLVM_DEBUG(dbgs() << "Discarding debug info (reg not live): " << Idx
+ << " " << MI);
+ }
+ }
+ }
+ }
+
+ // Get or create the UserValue for (variable, fragment) here.
+ bool IsIndirect = MI.isDebugOffsetImm();
+ if (IsIndirect)
+ assert(MI.getDebugOffset().getImm() == 0 &&
+ "DBG_VALUE with nonzero offset");
+ bool IsList = MI.isDebugValueList();
+ const DILocalVariable *Var = MI.getDebugVariable();
+ const DIExpression *Expr = MI.getDebugExpression();
+ UserValue *UV = getUserValue(Var, Expr->getFragmentInfo(), MI.getDebugLoc());
+ if (!Discard)
+ UV->addDef(Idx,
+ ArrayRef<MachineOperand>(MI.debug_operands().begin(),
+ MI.debug_operands().end()),
+ IsIndirect, IsList, *Expr);
+ else {
+ MachineOperand MO = MachineOperand::CreateReg(0U, false);
+ MO.setIsDebug();
+ // We should still pass a list the same size as MI.debug_operands() even if
+ // all MOs are undef, so that DbgVariableValue can correctly adjust the
+ // expression while removing the duplicated undefs.
+ SmallVector<MachineOperand, 4> UndefMOs(MI.getNumDebugOperands(), MO);
+ UV->addDef(Idx, UndefMOs, false, IsList, *Expr);
+ }
+ return true;
+}
+
+MachineBasicBlock::iterator LDVImpl::handleDebugInstr(MachineInstr &MI,
+ SlotIndex Idx) {
+ assert(MI.isDebugValueLike() || MI.isDebugPHI());
+
+ // In instruction referencing mode, there should be no DBG_VALUE instructions
+ // that refer to virtual registers. They might still refer to constants.
+ if (MI.isDebugValueLike())
+ assert(none_of(MI.debug_operands(),
+ [](const MachineOperand &MO) {
+ return MO.isReg() && MO.getReg().isVirtual();
+ }) &&
+ "MIs should not refer to Virtual Registers in InstrRef mode.");
+
+ // Unlink the instruction, store it in the debug instructions collection.
+ auto NextInst = std::next(MI.getIterator());
+ auto *MBB = MI.getParent();
+ MI.removeFromParent();
+ StashedDebugInstrs.push_back({&MI, Idx, MBB});
+ return NextInst;
+}
+
+bool LDVImpl::handleDebugLabel(MachineInstr &MI, SlotIndex Idx) {
+ // DBG_LABEL label
+ if (MI.getNumOperands() != 1 || !MI.getOperand(0).isMetadata()) {
+ LLVM_DEBUG(dbgs() << "Can't handle " << MI);
+ return false;
+ }
+
+ // Get or create the UserLabel for label here.
+ const DILabel *Label = MI.getDebugLabel();
+ const DebugLoc &DL = MI.getDebugLoc();
+ bool Found = false;
+ for (auto const &L : userLabels) {
+ if (L->matches(Label, DL->getInlinedAt(), Idx)) {
+ Found = true;
+ break;
+ }
+ }
+ if (!Found)
+ userLabels.push_back(std::make_unique<UserLabel>(Label, DL, Idx));
+
+ return true;
+}
+
+bool LDVImpl::collectDebugValues(MachineFunction &mf, bool InstrRef) {
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : mf) {
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), MBBE = MBB.end();
+ MBBI != MBBE;) {
+ // Use the first debug instruction in the sequence to get a SlotIndex
+ // for following consecutive debug instructions.
+ if (!MBBI->isDebugOrPseudoInstr()) {
+ ++MBBI;
+ continue;
+ }
+ // Debug instructions have no slot index. Use the previous
+ // non-debug instruction's SlotIndex as their SlotIndex.
+ SlotIndex Idx =
+ MBBI == MBB.begin()
+ ? LIS->getMBBStartIdx(&MBB)
+ : LIS->getInstructionIndex(*std::prev(MBBI)).getRegSlot();
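+ // E.g. DBG_VALUEs at the top of a block share the block's start index,
+ // while a DBG_VALUE following a normal instruction gets that
+ // instruction's register slot.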
+ // Handle consecutive debug instructions with the same slot index.
+ do {
+ // In instruction referencing mode, pass each instr to handleDebugInstr
+ // to be unlinked. Ignore DBG_VALUE_LISTs -- they refer to vregs, and
+ // need to go through the normal live interval splitting process.
+ if (InstrRef && (MBBI->isNonListDebugValue() || MBBI->isDebugPHI() ||
+ MBBI->isDebugRef())) {
+ MBBI = handleDebugInstr(*MBBI, Idx);
+ Changed = true;
+ // In normal debug mode, use the dedicated DBG_VALUE / DBG_LABEL handler
+ // to track things through register allocation, and erase the instr.
+ } else if ((MBBI->isDebugValue() && handleDebugValue(*MBBI, Idx)) ||
+ (MBBI->isDebugLabel() && handleDebugLabel(*MBBI, Idx))) {
+ MBBI = MBB.erase(MBBI);
+ Changed = true;
+ } else
+ ++MBBI;
+ } while (MBBI != MBBE && MBBI->isDebugOrPseudoInstr());
+ }
+ }
+ return Changed;
+}
+
+void UserValue::extendDef(
+ SlotIndex Idx, DbgVariableValue DbgValue,
+ SmallDenseMap<unsigned, std::pair<LiveRange *, const VNInfo *>>
+ &LiveIntervalInfo,
+ std::optional<std::pair<SlotIndex, SmallVector<unsigned>>> &Kills,
+ LiveIntervals &LIS) {
+ SlotIndex Start = Idx;
+ MachineBasicBlock *MBB = LIS.getMBBFromIndex(Start);
+ SlotIndex Stop = LIS.getMBBEndIdx(MBB);
+ LocMap::iterator I = locInts.find(Start);
+
+ // Limit to the intersection of the VNIs' live ranges.
+ for (auto &LII : LiveIntervalInfo) {
+ LiveRange *LR = LII.second.first;
+ assert(LR && LII.second.second && "Missing range info for Idx.");
+ LiveInterval::Segment *Segment = LR->getSegmentContaining(Start);
+ assert(Segment && Segment->valno == LII.second.second &&
+ "Invalid VNInfo for Idx given?");
+ if (Segment->end < Stop) {
+ Stop = Segment->end;
+ Kills = {Stop, {LII.first}};
+ } else if (Segment->end == Stop && Kills) {
+ // If multiple locations end at the same place, track all of them in
+ // Kills.
+ Kills->second.push_back(LII.first);
+ }
+ }
+
+ // There could already be a short def at Start.
+ if (I.valid() && I.start() <= Start) {
+ // Stop when meeting a different location or an already extended interval.
+ Start = Start.getNextSlot();
+ if (I.value() != DbgValue || I.stop() != Start) {
+ // Clear `Kills`, as we have a new def available.
+ Kills = std::nullopt;
+ return;
+ }
+ // This is a one-slot placeholder. Just skip it.
+ ++I;
+ }
+
+ // Limited by the next def.
+ if (I.valid() && I.start() < Stop) {
+ Stop = I.start();
+ // Clear `Kills`, as we have a new def available.
+ Kills = std::nullopt;
+ }
+
+ if (Start < Stop) {
+ DbgVariableValue ExtDbgValue(DbgValue);
+ I.insert(Start, Stop, std::move(ExtDbgValue));
+ }
+}
+
+void UserValue::addDefsFromCopies(
+ DbgVariableValue DbgValue,
+ SmallVectorImpl<std::pair<unsigned, LiveInterval *>> &LocIntervals,
+ SlotIndex KilledAt,
+ SmallVectorImpl<std::pair<SlotIndex, DbgVariableValue>> &NewDefs,
+ MachineRegisterInfo &MRI, LiveIntervals &LIS) {
+ // Don't track copies from physregs, there are too many uses.
+ if (any_of(LocIntervals,
+ [](auto LocI) { return !LocI.second->reg().isVirtual(); }))
+ return;
+
+ // Collect all the (vreg, valno) pairs that are copies of LI.
+ SmallDenseMap<unsigned,
+ SmallVector<std::pair<LiveInterval *, const VNInfo *>, 4>>
+ CopyValues;
+ for (auto &LocInterval : LocIntervals) {
+ unsigned LocNo = LocInterval.first;
+ LiveInterval *LI = LocInterval.second;
+ for (MachineOperand &MO : MRI.use_nodbg_operands(LI->reg())) {
+ MachineInstr *MI = MO.getParent();
+ // Copies of the full value.
+ if (MO.getSubReg() || !MI->isCopy())
+ continue;
+ Register DstReg = MI->getOperand(0).getReg();
+
+ // Don't follow copies to physregs. These are usually setting up call
+ // arguments, and the argument registers are always call clobbered. We are
+ // better off in the source register which could be a callee-saved
+ // register, or it could be spilled.
+ if (!DstReg.isVirtual())
+ continue;
+
+ // Is the value extended to reach this copy? If not, another def may be
+ // blocking it, or we are looking at the wrong value of LI.
+ SlotIndex Idx = LIS.getInstructionIndex(*MI);
+ LocMap::iterator I = locInts.find(Idx.getRegSlot(true));
+ if (!I.valid() || I.value() != DbgValue)
+ continue;
+
+ if (!LIS.hasInterval(DstReg))
+ continue;
+ LiveInterval *DstLI = &LIS.getInterval(DstReg);
+ const VNInfo *DstVNI = DstLI->getVNInfoAt(Idx.getRegSlot());
+ assert(DstVNI && DstVNI->def == Idx.getRegSlot() && "Bad copy value");
+ CopyValues[LocNo].push_back(std::make_pair(DstLI, DstVNI));
+ }
+ }
+
+ if (CopyValues.empty())
+ return;
+
+#if !defined(NDEBUG)
+ for (auto &LocInterval : LocIntervals)
+ LLVM_DEBUG(dbgs() << "Got " << CopyValues[LocInterval.first].size()
+ << " copies of " << *LocInterval.second << '\n');
+#endif
+
+ // Try to add defs of the copied values for the kill point. Check that there
+ // isn't already a def at Idx.
+ LocMap::iterator I = locInts.find(KilledAt);
+ if (I.valid() && I.start() <= KilledAt)
+ return;
+ DbgVariableValue NewValue(DbgValue);
+ for (auto &LocInterval : LocIntervals) {
+ unsigned LocNo = LocInterval.first;
+ bool FoundCopy = false;
+ for (auto &LIAndVNI : CopyValues[LocNo]) {
+ LiveInterval *DstLI = LIAndVNI.first;
+ const VNInfo *DstVNI = LIAndVNI.second;
+ if (DstLI->getVNInfoAt(KilledAt) != DstVNI)
+ continue;
+ LLVM_DEBUG(dbgs() << "Kill at " << KilledAt << " covered by valno #"
+ << DstVNI->id << " in " << *DstLI << '\n');
+ MachineInstr *CopyMI = LIS.getInstructionFromIndex(DstVNI->def);
+ assert(CopyMI && CopyMI->isCopy() && "Bad copy value");
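+ // Redirect this def's location to the copy's destination register.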
+ unsigned NewLocNo = getLocationNo(CopyMI->getOperand(0));
+ NewValue = NewValue.changeLocNo(LocNo, NewLocNo);
+ FoundCopy = true;
+ break;
+ }
+ // If there are any killed locations we can't find a copy for, we can't
+ // extend the variable value.
+ if (!FoundCopy)
+ return;
+ }
+ I.insert(KilledAt, KilledAt.getNextSlot(), NewValue);
+ NewDefs.push_back(std::make_pair(KilledAt, NewValue));
+}
+
+void UserValue::computeIntervals(MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ LiveIntervals &LIS, LexicalScopes &LS) {
+ SmallVector<std::pair<SlotIndex, DbgVariableValue>, 16> Defs;
+
+ // Collect all defs to be extended (skipping undefs).
+ for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I)
+ if (!I.value().isUndef())
+ Defs.push_back(std::make_pair(I.start(), I.value()));
+
+ // Extend all defs, and possibly add new ones along the way.
+ for (unsigned i = 0; i != Defs.size(); ++i) {
+ SlotIndex Idx = Defs[i].first;
+ DbgVariableValue DbgValue = Defs[i].second;
+ SmallDenseMap<unsigned, std::pair<LiveRange *, const VNInfo *>> LIs;
+ SmallVector<const VNInfo *, 4> VNIs;
+ bool ShouldExtendDef = false;
+ for (unsigned LocNo : DbgValue.loc_nos()) {
+ const MachineOperand &LocMO = locations[LocNo];
+ if (!LocMO.isReg() || !LocMO.getReg().isVirtual()) {
+ ShouldExtendDef |= !LocMO.isReg();
+ continue;
+ }
+ ShouldExtendDef = true;
+ LiveInterval *LI = nullptr;
+ const VNInfo *VNI = nullptr;
+ if (LIS.hasInterval(LocMO.getReg())) {
+ LI = &LIS.getInterval(LocMO.getReg());
+ VNI = LI->getVNInfoAt(Idx);
+ }
+ if (LI && VNI)
+ LIs[LocNo] = {LI, VNI};
+ }
+ if (ShouldExtendDef) {
+ std::optional<std::pair<SlotIndex, SmallVector<unsigned>>> Kills;
+ extendDef(Idx, DbgValue, LIs, Kills, LIS);
+
+ if (Kills) {
+ SmallVector<std::pair<unsigned, LiveInterval *>, 2> KilledLocIntervals;
+ bool AnySubreg = false;
+ for (unsigned LocNo : Kills->second) {
+ const MachineOperand &LocMO = this->locations[LocNo];
+ if (LocMO.getSubReg()) {
+ AnySubreg = true;
+ break;
+ }
+ LiveInterval *LI = &LIS.getInterval(LocMO.getReg());
+ KilledLocIntervals.push_back({LocNo, LI});
+ }
+
+ // FIXME: Handle sub-registers in addDefsFromCopies. The problem is that
+ // if the original location is, for example, %vreg0:sub_hi, and we find a
+ // full register copy in addDefsFromCopies (at the moment it only
+ // handles full register copies), then we must add the sub_hi sub-register
+ // index to the new location. However, that is only possible if the new
+ // virtual register is of the same regclass (or if there is an
+ // equivalent sub-register in that regclass). For now, simply skip
+ // handling copies if a sub-register is involved.
+ if (!AnySubreg)
+ addDefsFromCopies(DbgValue, KilledLocIntervals, Kills->first, Defs,
+ MRI, LIS);
+ }
+ }
+
+ // For physregs, we only mark the start slot idx. DwarfDebug will see it
+ // as if the DBG_VALUE is valid up until the end of the basic block, or
+ // the next def of the physical register. So we do not need to extend the
+ // range. It might actually happen that the DBG_VALUE is the last use of
+ // the physical register (e.g. if this is an unused input argument to a
+ // function).
+ }
+
+ // The computed intervals may extend beyond the range of the debug
+ // location's lexical scope. In this case, splitting of an interval
+ // can result in an interval outside of the scope being created,
+ // causing extra unnecessary DBG_VALUEs to be emitted. To prevent
+ // this, trim the intervals to the lexical scope in the case of inlined
+ // variables, since heavy inlining may otherwise produce a dramatically
+ // large number of DBG_VALUEs.
+ if (!dl.getInlinedAt())
+ return;
+
+ LexicalScope *Scope = LS.findLexicalScope(dl);
+ if (!Scope)
+ return;
+
+ SlotIndex PrevEnd;
+ LocMap::iterator I = locInts.begin();
+
+ // Iterate over the lexical scope ranges. Each time round the loop
+ // we check the intervals for overlap with the end of the previous
+ // range and the start of the next. The first range is handled as
+ // a special case where there is no PrevEnd.
+ for (const InsnRange &Range : Scope->getRanges()) {
+ SlotIndex RStart = LIS.getInstructionIndex(*Range.first);
+ SlotIndex REnd = LIS.getInstructionIndex(*Range.second);
+
+ // Variable locations at the first instruction of a block should be
+ // based on the block's SlotIndex, not the first instruction's index.
+ if (Range.first == Range.first->getParent()->begin())
+ RStart = LIS.getSlotIndexes()->getIndexBefore(*Range.first);
+
+ // At the start of each iteration I has been advanced so that
+ // I.stop() >= PrevEnd. Check for overlap.
+ if (PrevEnd && I.start() < PrevEnd) {
+ SlotIndex IStop = I.stop();
+ DbgVariableValue DbgValue = I.value();
+
+ // Stop overlaps previous end - trim the end of the interval to
+ // the scope range.
+ I.setStopUnchecked(PrevEnd);
+ ++I;
+
+ // If the interval also overlaps the start of the "next" (i.e.
+ // current) range create a new interval for the remainder (which
+ // may be further trimmed).
+ if (RStart < IStop)
+ I.insert(RStart, IStop, DbgValue);
+ }
+
+ // Advance I so that I.stop() >= RStart, and check for overlap.
+ I.advanceTo(RStart);
+ if (!I.valid())
+ return;
+
+ if (I.start() < RStart) {
+ // Interval start overlaps range - trim to the scope range.
+ I.setStartUnchecked(RStart);
+ // Remember that this interval was trimmed.
+ trimmedDefs.insert(RStart);
+ }
+
+ // The end of a lexical scope range is the last instruction in the
+ // range. To convert to an interval we need the index of the
+ // instruction after it.
+ REnd = REnd.getNextIndex();
+
+ // Advance I to first interval outside current range.
+ I.advanceTo(REnd);
+ if (!I.valid())
+ return;
+
+ PrevEnd = REnd;
+ }
+
+ // Check for overlap with end of final range.
+ if (PrevEnd && I.start() < PrevEnd)
+ I.setStopUnchecked(PrevEnd);
+}
+
+void LDVImpl::computeIntervals() {
+ LexicalScopes LS;
+ LS.initialize(*MF);
+
+ for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
+ userValues[i]->computeIntervals(MF->getRegInfo(), *TRI, *LIS, LS);
+ userValues[i]->mapVirtRegs(this);
+ }
+}
+
+bool LDVImpl::runOnMachineFunction(MachineFunction &mf, bool InstrRef) {
+ clear();
+ MF = &mf;
+ LIS = &pass.getAnalysis<LiveIntervals>();
+ TRI = mf.getSubtarget().getRegisterInfo();
+ LLVM_DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
+ << mf.getName() << " **********\n");
+
+ bool Changed = collectDebugValues(mf, InstrRef);
+ computeIntervals();
+ LLVM_DEBUG(print(dbgs()));
+
+ // Collect the set of VReg / SlotIndex pairs where PHIs occur; index the
+ // sensitive VRegs too, for when we're notified of a range split.
+ SlotIndexes *Slots = LIS->getSlotIndexes();
+ for (const auto &PHIIt : MF->DebugPHIPositions) {
+ const MachineFunction::DebugPHIRegallocPos &Position = PHIIt.second;
+ MachineBasicBlock *MBB = Position.MBB;
+ Register Reg = Position.Reg;
+ unsigned SubReg = Position.SubReg;
+ SlotIndex SI = Slots->getMBBStartIdx(MBB);
+ PHIValPos VP = {SI, Reg, SubReg};
+ PHIValToPos.insert(std::make_pair(PHIIt.first, VP));
+ RegToPHIIdx[Reg].push_back(PHIIt.first);
+ }
+
+ ModifiedMF = Changed;
+ return Changed;
+}
+
+static void removeDebugInstrs(MachineFunction &mf) {
+ for (MachineBasicBlock &MBB : mf) {
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
+ if (MI.isDebugInstr())
+ MBB.erase(&MI);
+ }
+}
+
+bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) {
+ if (!EnableLDV)
+ return false;
+ if (!mf.getFunction().getSubprogram()) {
+ removeDebugInstrs(mf);
+ return false;
+ }
+
+ // Have we been asked to track variable locations using instruction
+ // referencing?
+ bool InstrRef = mf.useDebugInstrRef();
+
+ if (!pImpl)
+ pImpl = new LDVImpl(this);
+ return static_cast<LDVImpl *>(pImpl)->runOnMachineFunction(mf, InstrRef);
+}
+
+void LiveDebugVariables::releaseMemory() {
+ if (pImpl)
+ static_cast<LDVImpl*>(pImpl)->clear();
+}
+
+LiveDebugVariables::~LiveDebugVariables() {
+ if (pImpl)
+ delete static_cast<LDVImpl*>(pImpl);
+}
+
+//===----------------------------------------------------------------------===//
+// Live Range Splitting
+//===----------------------------------------------------------------------===//
+
+bool
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<Register> NewRegs,
+ LiveIntervals& LIS) {
+ LLVM_DEBUG({
+ dbgs() << "Splitting Loc" << OldLocNo << '\t';
+ print(dbgs(), nullptr);
+ });
+ bool DidChange = false;
+ LocMap::iterator LocMapI;
+ LocMapI.setMap(locInts);
+ for (Register NewReg : NewRegs) {
+ LiveInterval *LI = &LIS.getInterval(NewReg);
+ if (LI->empty())
+ continue;
+
+ // Don't allocate the new LocNo until it is needed.
+ unsigned NewLocNo = UndefLocNo;
+
+ // Iterate over the overlaps between locInts and LI.
+ LocMapI.find(LI->beginIndex());
+ if (!LocMapI.valid())
+ continue;
+ LiveInterval::iterator LII = LI->advanceTo(LI->begin(), LocMapI.start());
+ LiveInterval::iterator LIE = LI->end();
+ while (LocMapI.valid() && LII != LIE) {
+ // At this point, we know that LocMapI.stop() > LII->start.
+ LII = LI->advanceTo(LII, LocMapI.start());
+ if (LII == LIE)
+ break;
+
+ // Now LII->end > LocMapI.start(). Do we have an overlap?
+ if (LocMapI.value().containsLocNo(OldLocNo) &&
+ LII->start < LocMapI.stop()) {
+ // Overlapping correct location. Allocate NewLocNo now.
+ if (NewLocNo == UndefLocNo) {
+ MachineOperand MO = MachineOperand::CreateReg(LI->reg(), false);
+ MO.setSubReg(locations[OldLocNo].getSubReg());
+ NewLocNo = getLocationNo(MO);
+ DidChange = true;
+ }
+
+ SlotIndex LStart = LocMapI.start();
+ SlotIndex LStop = LocMapI.stop();
+ DbgVariableValue OldDbgValue = LocMapI.value();
+
+ // Trim LocMapI down to the LII overlap.
+ if (LStart < LII->start)
+ LocMapI.setStartUnchecked(LII->start);
+ if (LStop > LII->end)
+ LocMapI.setStopUnchecked(LII->end);
+
+ // Change the value in the overlap. This may trigger coalescing.
+ LocMapI.setValue(OldDbgValue.changeLocNo(OldLocNo, NewLocNo));
+
+ // Re-insert any removed OldDbgValue ranges.
+ if (LStart < LocMapI.start()) {
+ LocMapI.insert(LStart, LocMapI.start(), OldDbgValue);
+ ++LocMapI;
+ assert(LocMapI.valid() && "Unexpected coalescing");
+ }
+ if (LStop > LocMapI.stop()) {
+ ++LocMapI;
+ LocMapI.insert(LII->end, LStop, OldDbgValue);
+ --LocMapI;
+ }
+ }
+
+ // Advance to the next overlap.
+ if (LII->end < LocMapI.stop()) {
+ if (++LII == LIE)
+ break;
+ LocMapI.advanceTo(LII->start);
+ } else {
+ ++LocMapI;
+ if (!LocMapI.valid())
+ break;
+ LII = LI->advanceTo(LII, LocMapI.start());
+ }
+ }
+ }
+
+ // Finally, remove OldLocNo unless it is still used by some interval in the
+ // locInts map. One case where OldLocNo is still in use is when the register
+ // has been spilled. In such situations, the spilled register is kept as a
+ // location until rewriteLocations is called (VirtRegMap is mapping the old
+ // register to the spill slot). So for a while we can have locations that map
+ // to virtual registers that have been removed from both the MachineFunction
+ // and from LiveIntervals.
+ //
+ // We may also just be using the location for a value with a different
+ // expression.
+ removeLocationIfUnused(OldLocNo);
+
+ LLVM_DEBUG({
+ dbgs() << "Split result: \t";
+ print(dbgs(), nullptr);
+ });
+ return DidChange;
+}
+
+bool
+UserValue::splitRegister(Register OldReg, ArrayRef<Register> NewRegs,
+ LiveIntervals &LIS) {
+ bool DidChange = false;
+ // Split locations referring to OldReg. Iterate backwards so splitLocation can
+ // safely erase unused locations.
+ for (unsigned i = locations.size(); i ; --i) {
+ unsigned LocNo = i-1;
+ const MachineOperand *Loc = &locations[LocNo];
+ if (!Loc->isReg() || Loc->getReg() != OldReg)
+ continue;
+ DidChange |= splitLocation(LocNo, NewRegs, LIS);
+ }
+ return DidChange;
+}
+
+void LDVImpl::splitPHIRegister(Register OldReg, ArrayRef<Register> NewRegs) {
+ auto RegIt = RegToPHIIdx.find(OldReg);
+ if (RegIt == RegToPHIIdx.end())
+ return;
+
+ std::vector<std::pair<Register, unsigned>> NewRegIdxes;
+ // Iterate over all the debug instruction numbers affected by this split.
+ for (unsigned InstrID : RegIt->second) {
+ auto PHIIt = PHIValToPos.find(InstrID);
+ assert(PHIIt != PHIValToPos.end());
+ const SlotIndex &Slot = PHIIt->second.SI;
+ assert(OldReg == PHIIt->second.Reg);
+
+ // Find the new register that covers this position.
+ for (auto NewReg : NewRegs) {
+ const LiveInterval &LI = LIS->getInterval(NewReg);
+ auto LII = LI.find(Slot);
+ if (LII != LI.end() && LII->start <= Slot) {
+ // This new register covers this PHI position, record this for indexing.
+ NewRegIdxes.push_back(std::make_pair(NewReg, InstrID));
+ // Record that this value lives in a different VReg now.
+ PHIIt->second.Reg = NewReg;
+ break;
+ }
+ }
+
+ // If we do not find a new register covering this PHI, then register
+ // allocation has dropped its location, for example because it's not live.
+ // The old VReg will not be mapped to a physreg, and the instruction
+ // number will have been optimized out.
+ }
+
+ // Re-create register index using the new register numbers.
+ RegToPHIIdx.erase(RegIt);
+ for (auto &RegAndInstr : NewRegIdxes)
+ RegToPHIIdx[RegAndInstr.first].push_back(RegAndInstr.second);
+}
+
+void LDVImpl::splitRegister(Register OldReg, ArrayRef<Register> NewRegs) {
+ // Consider whether this split range affects any PHI locations.
+ splitPHIRegister(OldReg, NewRegs);
+
+ // Check whether any intervals mapped by a DBG_VALUE were split and need
+ // updating.
+ bool DidChange = false;
+ for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
+ DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS);
+
+ if (!DidChange)
+ return;
+
+ // Map all of the new virtual registers.
+ UserValue *UV = lookupVirtReg(OldReg);
+ for (Register NewReg : NewRegs)
+ mapVirtReg(NewReg, UV);
+}
+
+void LiveDebugVariables::
+splitRegister(Register OldReg, ArrayRef<Register> NewRegs, LiveIntervals &LIS) {
+ if (pImpl)
+ static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
+}
+
+void UserValue::rewriteLocations(VirtRegMap &VRM, const MachineFunction &MF,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ SpillOffsetMap &SpillOffsets) {
+ // Build a set of new locations with new numbers so we can coalesce our
+ // IntervalMap if two vreg intervals collapse to the same physical location.
+ // Use MapVector instead of SetVector because MapVector::insert returns the
+ // position of the previously or newly inserted element. The boolean value
+ // tracks if the location was produced by a spill.
+ // FIXME: This will be problematic if we ever support direct and indirect
+ // frame index locations, i.e. expressing both variables in memory and
+ // 'int x, *px = &x'. The "spilled" bit must become part of the location.
+ MapVector<MachineOperand, std::pair<bool, unsigned>> NewLocations;
+ SmallVector<unsigned, 4> LocNoMap(locations.size());
+ for (unsigned I = 0, E = locations.size(); I != E; ++I) {
+ bool Spilled = false;
+ unsigned SpillOffset = 0;
+ MachineOperand Loc = locations[I];
+ // Only virtual registers are rewritten.
+ if (Loc.isReg() && Loc.getReg() && Loc.getReg().isVirtual()) {
+ Register VirtReg = Loc.getReg();
+ if (VRM.isAssignedReg(VirtReg) &&
+ Register::isPhysicalRegister(VRM.getPhys(VirtReg))) {
+ // This can create a %noreg operand in rare cases when the sub-register
+ // index is no longer available. That means the user value is in a
+ // non-existent sub-register, and %noreg is exactly what we want.
+ Loc.substPhysReg(VRM.getPhys(VirtReg), TRI);
+ } else if (VRM.getStackSlot(VirtReg) != VirtRegMap::NO_STACK_SLOT) {
+ // Retrieve the stack slot offset.
+ unsigned SpillSize;
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *TRC = MRI.getRegClass(VirtReg);
+ bool Success = TII.getStackSlotRange(TRC, Loc.getSubReg(), SpillSize,
+ SpillOffset, MF);
+
+ // FIXME: Invalidate the location if the offset couldn't be calculated.
+ (void)Success;
+
+ Loc = MachineOperand::CreateFI(VRM.getStackSlot(VirtReg));
+ Spilled = true;
+ } else {
+ Loc.setReg(0);
+ Loc.setSubReg(0);
+ }
+ }
+
+ // Insert this location if it doesn't already exist and record a mapping
+ // from the old number to the new number.
+ auto InsertResult = NewLocations.insert({Loc, {Spilled, SpillOffset}});
+ unsigned NewLocNo = std::distance(NewLocations.begin(), InsertResult.first);
+ LocNoMap[I] = NewLocNo;
+ }
+
+ // Rewrite the locations and record the stack slot offsets for spills.
+ locations.clear();
+ SpillOffsets.clear();
+ for (auto &Pair : NewLocations) {
+ bool Spilled;
+ unsigned SpillOffset;
+ std::tie(Spilled, SpillOffset) = Pair.second;
+ locations.push_back(Pair.first);
+ if (Spilled) {
+ unsigned NewLocNo = std::distance(&*NewLocations.begin(), &Pair);
+ SpillOffsets[NewLocNo] = SpillOffset;
+ }
+ }
+
+ // Update the interval map, but only coalesce left, since intervals to the
+ // right use the old location numbers. This should merge two contiguous
+ // DBG_VALUE intervals with different vregs that were allocated to the same
+ // physical register.
+ for (LocMap::iterator I = locInts.begin(); I.valid(); ++I) {
+ I.setValueUnchecked(I.value().remapLocNos(LocNoMap));
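+ // Re-setting the start to its current value prompts the IntervalMap to
+ // coalesce this interval with the already-remapped interval on its left.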
+ I.setStart(I.start());
+ }
+}
+
+/// Find an iterator for inserting a DBG_VALUE instruction.
+static MachineBasicBlock::iterator
+findInsertLocation(MachineBasicBlock *MBB, SlotIndex Idx, LiveIntervals &LIS,
+ BlockSkipInstsMap &BBSkipInstsMap) {
+ SlotIndex Start = LIS.getMBBStartIdx(MBB);
+ Idx = Idx.getBaseIndex();
+
+ // Try to find an insert location by going backwards from Idx.
+ MachineInstr *MI;
+ while (!(MI = LIS.getInstructionFromIndex(Idx))) {
+ // We've reached the beginning of MBB.
+ if (Idx == Start) {
+ // Retrieve the last PHI/Label/Debug location found when calling
+ // SkipPHIsLabelsAndDebug last time. Start searching from there.
+ //
+ // Note that the iterator kept in BBSkipInstsMap is one step back from
+ // the iterator returned by SkipPHIsLabelsAndDebug last time.
+ // One exception: when SkipPHIsLabelsAndDebug returns MBB->begin(),
+ // BBSkipInstsMap won't save it. This handles the case where new
+ // instructions are inserted at the beginning of MBB after the
+ // last call of SkipPHIsLabelsAndDebug. If we saved MBB->begin() in
+ // BBSkipInstsMap, then after new non-phi/non-label/non-debug instructions
+ // are inserted at the beginning of the MBB, the iterator in
+ // BBSkipInstsMap would no longer point to the beginning of the MBB.
+ // The next search in SkipPHIsLabelsAndDebug would then skip those
+ // newly added instructions, which is unwanted.
+ MachineBasicBlock::iterator BeginIt;
+ auto MapIt = BBSkipInstsMap.find(MBB);
+ if (MapIt == BBSkipInstsMap.end())
+ BeginIt = MBB->begin();
+ else
+ BeginIt = std::next(MapIt->second);
+ auto I = MBB->SkipPHIsLabelsAndDebug(BeginIt);
+ if (I != BeginIt)
+ BBSkipInstsMap[MBB] = std::prev(I);
+ return I;
+ }
+ Idx = Idx.getPrevIndex();
+ }
+
+ // Don't insert anything after the first terminator, though.
+ return MI->isTerminator() ? MBB->getFirstTerminator() :
+ std::next(MachineBasicBlock::iterator(MI));
+}
+
+/// Find an iterator for inserting the next DBG_VALUE instruction
+/// (or end if no more insert locations found).
+static MachineBasicBlock::iterator
+findNextInsertLocation(MachineBasicBlock *MBB, MachineBasicBlock::iterator I,
+ SlotIndex StopIdx, ArrayRef<MachineOperand> LocMOs,
+ LiveIntervals &LIS, const TargetRegisterInfo &TRI) {
+ SmallVector<Register, 4> Regs;
+ for (const MachineOperand &LocMO : LocMOs)
+ if (LocMO.isReg())
+ Regs.push_back(LocMO.getReg());
+ if (Regs.empty())
+ return MBB->instr_end();
+
+ // Find the next instruction in the MBB that defines one of the registers
+ // in Regs.
+ while (I != MBB->end() && !I->isTerminator()) {
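+ // Stop at StopIdx: past it the debug value no longer applies, so no
+ // further DBG_VALUE is inserted.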
+ if (!LIS.isNotInMIMap(*I) &&
+ SlotIndex::isEarlierEqualInstr(StopIdx, LIS.getInstructionIndex(*I)))
+ break;
+ if (any_of(Regs, [&I, &TRI](Register &Reg) {
+ return I->definesRegister(Reg, &TRI);
+ }))
+ // The insert location is directly after the instruction/bundle.
+ return std::next(I);
+ ++I;
+ }
+ return MBB->end();
+}
+
+void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
+ SlotIndex StopIdx, DbgVariableValue DbgValue,
+ ArrayRef<bool> LocSpills,
+ ArrayRef<unsigned> SpillOffsets,
+ LiveIntervals &LIS, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ BlockSkipInstsMap &BBSkipInstsMap) {
+ SlotIndex MBBEndIdx = LIS.getMBBEndIdx(&*MBB);
+ // Only search within the current MBB.
+ StopIdx = (MBBEndIdx < StopIdx) ? MBBEndIdx : StopIdx;
+ MachineBasicBlock::iterator I =
+ findInsertLocation(MBB, StartIdx, LIS, BBSkipInstsMap);
+ // Undef values don't exist in locations, so create new "noreg" register MOs
+ // for them. See getLocationNo().
+ SmallVector<MachineOperand, 8> MOs;
+ if (DbgValue.isUndef()) {
+ MOs.assign(DbgValue.loc_nos().size(),
+ MachineOperand::CreateReg(
+ /* Reg */ 0, /* isDef */ false, /* isImp */ false,
+ /* isKill */ false, /* isDead */ false,
+ /* isUndef */ false, /* isEarlyClobber */ false,
+ /* SubReg */ 0, /* isDebug */ true));
+ } else {
+ for (unsigned LocNo : DbgValue.loc_nos())
+ MOs.push_back(locations[LocNo]);
+ }
+
+ ++NumInsertedDebugValues;
+
+ assert(cast<DILocalVariable>(Variable)
+ ->isValidLocationForIntrinsic(getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+
+ // If the location was spilled, the new DBG_VALUE will be indirect. If the
+ // original DBG_VALUE was indirect, we need to add DW_OP_deref to indicate
+ // that the original virtual register was a pointer. Also, add the stack slot
+ // offset for the spilled register to the expression.
+ const DIExpression *Expr = DbgValue.getExpression();
+ bool IsIndirect = DbgValue.getWasIndirect();
+ bool IsList = DbgValue.getWasList();
+ for (unsigned I = 0, E = LocSpills.size(); I != E; ++I) {
+ if (LocSpills[I]) {
+ if (!IsList) {
+ uint8_t DIExprFlags = DIExpression::ApplyOffset;
+ if (IsIndirect)
+ DIExprFlags |= DIExpression::DerefAfter;
+ Expr = DIExpression::prepend(Expr, DIExprFlags, SpillOffsets[I]);
+ IsIndirect = true;
+ } else {
+ SmallVector<uint64_t, 4> Ops;
+ DIExpression::appendOffset(Ops, SpillOffsets[I]);
+ Ops.push_back(dwarf::DW_OP_deref);
+ Expr = DIExpression::appendOpsToArg(Expr, Ops, I);
+ }
+ }
+
+ assert((!LocSpills[I] || MOs[I].isFI()) &&
+ "a spilled location must be a frame index");
+ }
+
+ unsigned DbgValueOpcode =
+ IsList ? TargetOpcode::DBG_VALUE_LIST : TargetOpcode::DBG_VALUE;
+ do {
+ BuildMI(*MBB, I, getDebugLoc(), TII.get(DbgValueOpcode), IsIndirect, MOs,
+ Variable, Expr);
+
+ // Continue and insert DBG_VALUEs after every redefinition of a register
+ // associated with the debug value within the range.
+ I = findNextInsertLocation(MBB, I, StopIdx, MOs, LIS, TRI);
+ } while (I != MBB->end());
+}
+
+void UserLabel::insertDebugLabel(MachineBasicBlock *MBB, SlotIndex Idx,
+ LiveIntervals &LIS, const TargetInstrInfo &TII,
+ BlockSkipInstsMap &BBSkipInstsMap) {
+ MachineBasicBlock::iterator I =
+ findInsertLocation(MBB, Idx, LIS, BBSkipInstsMap);
+ ++NumInsertedDebugLabels;
+ BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_LABEL))
+ .addMetadata(Label);
+}
+
+void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const SpillOffsetMap &SpillOffsets,
+ BlockSkipInstsMap &BBSkipInstsMap) {
+ MachineFunction::iterator MFEnd = VRM->getMachineFunction().end();
+
+ for (LocMap::const_iterator I = locInts.begin(); I.valid();) {
+ SlotIndex Start = I.start();
+ SlotIndex Stop = I.stop();
+ DbgVariableValue DbgValue = I.value();
+
+ SmallVector<bool> SpilledLocs;
+ SmallVector<unsigned> LocSpillOffsets;
+ for (unsigned LocNo : DbgValue.loc_nos()) {
+ auto SpillIt =
+ !DbgValue.isUndef() ? SpillOffsets.find(LocNo) : SpillOffsets.end();
+ bool Spilled = SpillIt != SpillOffsets.end();
+ SpilledLocs.push_back(Spilled);
+ LocSpillOffsets.push_back(Spilled ? SpillIt->second : 0);
+ }
+
+ // If the interval start was trimmed to the lexical scope, insert the
+ // DBG_VALUE at the previous index (otherwise it would appear after the
+ // first instruction in the range).
+ if (trimmedDefs.count(Start))
+ Start = Start.getPrevIndex();
+
+ LLVM_DEBUG(auto &dbg = dbgs(); dbg << "\t[" << Start << ';' << Stop << "):";
+ DbgValue.printLocNos(dbg));
+ MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator();
+ SlotIndex MBBEnd = LIS.getMBBEndIdx(&*MBB);
+
+ LLVM_DEBUG(dbgs() << ' ' << printMBBReference(*MBB) << '-' << MBBEnd);
+ insertDebugValue(&*MBB, Start, Stop, DbgValue, SpilledLocs, LocSpillOffsets,
+ LIS, TII, TRI, BBSkipInstsMap);
+ // This interval may span multiple basic blocks.
+ // Insert a DBG_VALUE into each one.
+ while (Stop > MBBEnd) {
+ // Move to the next block.
+ Start = MBBEnd;
+ if (++MBB == MFEnd)
+ break;
+ MBBEnd = LIS.getMBBEndIdx(&*MBB);
+ LLVM_DEBUG(dbgs() << ' ' << printMBBReference(*MBB) << '-' << MBBEnd);
+ insertDebugValue(&*MBB, Start, Stop, DbgValue, SpilledLocs,
+ LocSpillOffsets, LIS, TII, TRI, BBSkipInstsMap);
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+ if (MBB == MFEnd)
+ break;
+
+ ++I;
+ }
+}
+
+void UserLabel::emitDebugLabel(LiveIntervals &LIS, const TargetInstrInfo &TII,
+ BlockSkipInstsMap &BBSkipInstsMap) {
+ LLVM_DEBUG(dbgs() << "\t" << loc);
+ MachineFunction::iterator MBB = LIS.getMBBFromIndex(loc)->getIterator();
+
+ LLVM_DEBUG(dbgs() << ' ' << printMBBReference(*MBB));
+ insertDebugLabel(&*MBB, loc, LIS, TII, BBSkipInstsMap);
+
+ LLVM_DEBUG(dbgs() << '\n');
+}
+
+void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
+ LLVM_DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
+ if (!MF)
+ return;
+
+ BlockSkipInstsMap BBSkipInstsMap;
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ SpillOffsetMap SpillOffsets;
+ for (auto &userValue : userValues) {
+ LLVM_DEBUG(userValue->print(dbgs(), TRI));
+ userValue->rewriteLocations(*VRM, *MF, *TII, *TRI, SpillOffsets);
+ userValue->emitDebugValues(VRM, *LIS, *TII, *TRI, SpillOffsets,
+ BBSkipInstsMap);
+ }
+ LLVM_DEBUG(dbgs() << "********** EMITTING LIVE DEBUG LABELS **********\n");
+ for (auto &userLabel : userLabels) {
+ LLVM_DEBUG(userLabel->print(dbgs(), TRI));
+ userLabel->emitDebugLabel(*LIS, *TII, BBSkipInstsMap);
+ }
+
+ LLVM_DEBUG(dbgs() << "********** EMITTING DEBUG PHIS **********\n");
+
+ auto Slots = LIS->getSlotIndexes();
+ for (auto &It : PHIValToPos) {
+ // For each ex-PHI, identify its physreg location or stack slot, and emit
+ // a DBG_PHI for it.
+ unsigned InstNum = It.first;
+ auto Slot = It.second.SI;
+ Register Reg = It.second.Reg;
+ unsigned SubReg = It.second.SubReg;
+
+ MachineBasicBlock *OrigMBB = Slots->getMBBFromIndex(Slot);
+ if (VRM->isAssignedReg(Reg) &&
+ Register::isPhysicalRegister(VRM->getPhys(Reg))) {
+ unsigned PhysReg = VRM->getPhys(Reg);
+ if (SubReg != 0)
+ PhysReg = TRI->getSubReg(PhysReg, SubReg);
+
+ auto Builder = BuildMI(*OrigMBB, OrigMBB->begin(), DebugLoc(),
+ TII->get(TargetOpcode::DBG_PHI));
+ Builder.addReg(PhysReg);
+ Builder.addImm(InstNum);
+ } else if (VRM->getStackSlot(Reg) != VirtRegMap::NO_STACK_SLOT) {
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *TRC = MRI.getRegClass(Reg);
+ unsigned SpillSize, SpillOffset;
+
+ unsigned regSizeInBits = TRI->getRegSizeInBits(*TRC);
+ if (SubReg)
+ regSizeInBits = TRI->getSubRegIdxSize(SubReg);
+
+ // Test whether this location is legal with the given subreg. If the
+ // subregister has a nonzero offset, drop this location, it's too complex
+ // to describe. (TODO: future work).
+ bool Success =
+ TII->getStackSlotRange(TRC, SubReg, SpillSize, SpillOffset, *MF);
+
+ if (Success && SpillOffset == 0) {
+ auto Builder = BuildMI(*OrigMBB, OrigMBB->begin(), DebugLoc(),
+ TII->get(TargetOpcode::DBG_PHI));
+ Builder.addFrameIndex(VRM->getStackSlot(Reg));
+ Builder.addImm(InstNum);
+ // Record how large the original value is. The stack slot might be
+ // merged and altered during optimization, but we will want to know how
+ // large the value is at this DBG_PHI.
+ Builder.addImm(regSizeInBits);
+ }
+
+ LLVM_DEBUG(
+ if (SpillOffset != 0) {
+ dbgs() << "DBG_PHI for Vreg " << Reg << " subreg " << SubReg <<
+ " has nonzero offset\n";
+ }
+ );
+ }
+ // If there was no mapping for a value ID, it has been optimized out. Create
+ // no DBG_PHI, and any variables using this value will be reported as
+ // optimized out.
+ }
+ MF->DebugPHIPositions.clear();
+
+ LLVM_DEBUG(dbgs() << "********** EMITTING INSTR REFERENCES **********\n");
+
+ // Re-insert any debug instrs back in the positions they were in. We must
+ // re-insert in the same order to ensure that debug instructions don't swap,
+ // which could re-order assignments. Do so in a batch -- once we find the
+ // insert position, insert all instructions at the same SlotIdx. They are
+ // guaranteed to appear in-sequence in StashedDebugInstrs because we insert
+ // them in order.
+ for (auto *StashIt = StashedDebugInstrs.begin();
+ StashIt != StashedDebugInstrs.end(); ++StashIt) {
+ SlotIndex Idx = StashIt->Idx;
+ MachineBasicBlock *MBB = StashIt->MBB;
+ MachineInstr *MI = StashIt->MI;
+
+ auto EmitInstsHere = [this, &StashIt, MBB, Idx,
+ MI](MachineBasicBlock::iterator InsertPos) {
+ // Insert this debug instruction.
+ MBB->insert(InsertPos, MI);
+
+ // Look at subsequent stashed debug instructions: if they're at the same
+ // index, insert those too.
+ auto NextItem = std::next(StashIt);
+ while (NextItem != StashedDebugInstrs.end() && NextItem->Idx == Idx) {
+ assert(NextItem->MBB == MBB && "Instrs with same slot index should be "
+ "in the same block");
+ MBB->insert(InsertPos, NextItem->MI);
+ StashIt = NextItem;
+ NextItem = std::next(StashIt);
+ }
+ };
+
+ // Start block index: find the first non-debug instr in the block, and
+ // insert before it.
+ if (Idx == Slots->getMBBStartIdx(MBB)) {
+ MachineBasicBlock::iterator InsertPos =
+ findInsertLocation(MBB, Idx, *LIS, BBSkipInstsMap);
+ EmitInstsHere(InsertPos);
+ continue;
+ }
+
+ if (MachineInstr *Pos = Slots->getInstructionFromIndex(Idx)) {
+ // Insert at the end of any debug instructions.
+ auto PostDebug = std::next(Pos->getIterator());
+ PostDebug = skipDebugInstructionsForward(PostDebug, MBB->instr_end());
+ EmitInstsHere(PostDebug);
+ } else {
+ // Insert position disappeared; walk forwards through slots until we
+ // find a new one.
+ SlotIndex End = Slots->getMBBEndIdx(MBB);
+ for (; Idx < End; Idx = Slots->getNextNonNullIndex(Idx)) {
+ Pos = Slots->getInstructionFromIndex(Idx);
+ if (Pos) {
+ EmitInstsHere(Pos->getIterator());
+ break;
+ }
+ }
+
+ // We have reached the end of the block and didn't find anywhere to
+ // insert! It's not safe to discard any debug instructions; place them
+ // in front of the first terminator, or in front of end().
+ if (Idx >= End) {
+ auto TermIt = MBB->getFirstTerminator();
+ EmitInstsHere(TermIt);
+ }
+ }
+ }
+
+ EmitDone = true;
+ BBSkipInstsMap.clear();
+}
+
+void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) {
+ if (pImpl)
+ static_cast<LDVImpl*>(pImpl)->emitDebugValues(VRM);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveDebugVariables::dump() const {
+ if (pImpl)
+ static_cast<LDVImpl*>(pImpl)->print(dbgs());
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.h b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.h
new file mode 100644
index 000000000000..9998ce9e8dad
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveDebugVariables.h
@@ -0,0 +1,68 @@
+//===- LiveDebugVariables.h - Tracking debug info variables -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface to the LiveDebugVariables analysis.
+//
+// The analysis removes DBG_VALUE instructions for virtual registers and tracks
+// live user variables in a data structure that can be updated during register
+// allocation.
+//
+// After register allocation new DBG_VALUE instructions are emitted to reflect
+// the new locations of user variables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
+#define LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+template <typename T> class ArrayRef;
+class LiveIntervals;
+class VirtRegMap;
+
+class LLVM_LIBRARY_VISIBILITY LiveDebugVariables : public MachineFunctionPass {
+ void *pImpl = nullptr;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ LiveDebugVariables();
+ ~LiveDebugVariables() override;
+
+ /// splitRegister - Move any user variables in OldReg to the live ranges in
+ /// NewRegs where they are live. Mark the values as unavailable where no new
+ /// register is live.
+ void splitRegister(Register OldReg, ArrayRef<Register> NewRegs,
+ LiveIntervals &LIS);
+
+ /// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
+ /// that happened during register allocation.
+ /// @param VRM Rename virtual registers according to map.
+ void emitDebugValues(VirtRegMap *VRM);
+
+ /// dump - Print data structures to dbgs().
+ void dump() const;
+
+private:
+ bool runOnMachineFunction(MachineFunction &) override;
+ void releaseMemory() override;
+ void getAnalysisUsage(AnalysisUsage &) const override;
+
+ MachineFunctionProperties getSetProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::TracksDebugUserValues);
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveInterval.cpp
new file mode 100644
index 000000000000..1cf354349c56
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveInterval.cpp
@@ -0,0 +1,1409 @@
+//===- LiveInterval.cpp - Live Interval Representation --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveRange and LiveInterval classes. Given some
+ // numbering of each of the machine instructions, an interval [i, j) is said
+ // to be a
+// live range for register v if there is no instruction with number j' >= j
+// such that v is live at j' and there is no instruction with number i' < i such
+// that v is live at i'. In this implementation ranges can have holes,
+// i.e. a range might look like [1,20), [50,65), [1000,1001). Each
+// individual segment is represented as an instance of LiveRange::Segment,
+// and the whole range is represented as an instance of LiveRange.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "LiveRangeUtils.h"
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Implementation of various methods necessary for calculation of live ranges.
+// The implementation of the methods abstracts from the concrete type of the
+// segment collection.
+//
+// Implementation of the class follows the Template design pattern. The base
+// class contains generic algorithms that call collection-specific methods,
+// which are provided in concrete subclasses. In order to avoid virtual calls
+// these methods are provided by means of C++ template instantiation.
+// The base class calls the methods of the subclass through method impl(),
+// which casts 'this' pointer to the type of the subclass.
+//
+//===----------------------------------------------------------------------===//
+
+template <typename ImplT, typename IteratorT, typename CollectionT>
+class CalcLiveRangeUtilBase {
+protected:
+ LiveRange *LR;
+
+protected:
+ CalcLiveRangeUtilBase(LiveRange *LR) : LR(LR) {}
+
+public:
+ using Segment = LiveRange::Segment;
+ using iterator = IteratorT;
+
+ /// A counterpart of LiveRange::createDeadDef: Make sure the range has a
+ /// value defined at @p Def.
+ /// If @p ForVNI is null, and there is no value defined at @p Def, a new
+ /// value will be allocated using @p VNInfoAllocator.
+ /// If @p ForVNI is null, the return value is the value defined at @p Def,
+ /// either a pre-existing one, or the one newly created.
+ /// If @p ForVNI is not null, then @p Def should be the location where
+ /// @p ForVNI is defined. If the range does not have a value defined at
+ /// @p Def, the value @p ForVNI will be used instead of allocating a new
+ /// one. If the range already has a value defined at @p Def, it must be
+ /// same as @p ForVNI. In either case, @p ForVNI will be the return value.
+ VNInfo *createDeadDef(SlotIndex Def, VNInfo::Allocator *VNInfoAllocator,
+ VNInfo *ForVNI) {
+ assert(!Def.isDead() && "Cannot define a value at the dead slot");
+ assert((!ForVNI || ForVNI->def == Def) &&
+ "If ForVNI is specified, it must match Def");
+ iterator I = impl().find(Def);
+ if (I == segments().end()) {
+ VNInfo *VNI = ForVNI ? ForVNI : LR->getNextValue(Def, *VNInfoAllocator);
+ impl().insertAtEnd(Segment(Def, Def.getDeadSlot(), VNI));
+ return VNI;
+ }
+
+ Segment *S = segmentAt(I);
+ if (SlotIndex::isSameInstr(Def, S->start)) {
+ assert((!ForVNI || ForVNI == S->valno) && "Value number mismatch");
+ assert(S->valno->def == S->start && "Inconsistent existing value def");
+
+ // It is possible to have both normal and early-clobber defs of the same
+ // register on an instruction. It doesn't make a lot of sense, but it is
+ // possible to specify in inline assembly.
+ //
+ // Just convert everything to early-clobber.
+ Def = std::min(Def, S->start);
+ if (Def != S->start)
+ S->start = S->valno->def = Def;
+ return S->valno;
+ }
+ assert(SlotIndex::isEarlierInstr(Def, S->start) && "Already live at def");
+ VNInfo *VNI = ForVNI ? ForVNI : LR->getNextValue(Def, *VNInfoAllocator);
+ segments().insert(I, Segment(Def, Def.getDeadSlot(), VNI));
+ return VNI;
+ }
+
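+ /// If the range has a segment that starts before @p Use and extends past
+ /// @p StartIdx, extend it to reach @p Use and return its value number;
+ /// otherwise return nullptr.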
+ VNInfo *extendInBlock(SlotIndex StartIdx, SlotIndex Use) {
+ if (segments().empty())
+ return nullptr;
+ iterator I =
+ impl().findInsertPos(Segment(Use.getPrevSlot(), Use, nullptr));
+ if (I == segments().begin())
+ return nullptr;
+ --I;
+ if (I->end <= StartIdx)
+ return nullptr;
+ if (I->end < Use)
+ extendSegmentEndTo(I, Use);
+ return I->valno;
+ }
+
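+ /// As above, but additionally report whether an undef point from @p Undefs
+ /// lies in the gap being extended over; if so, the segment is not extended.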
+ std::pair<VNInfo*,bool> extendInBlock(ArrayRef<SlotIndex> Undefs,
+ SlotIndex StartIdx, SlotIndex Use) {
+ if (segments().empty())
+ return std::make_pair(nullptr, false);
+ SlotIndex BeforeUse = Use.getPrevSlot();
+ iterator I = impl().findInsertPos(Segment(BeforeUse, Use, nullptr));
+ if (I == segments().begin())
+ return std::make_pair(nullptr, LR->isUndefIn(Undefs, StartIdx, BeforeUse));
+ --I;
+ if (I->end <= StartIdx)
+ return std::make_pair(nullptr, LR->isUndefIn(Undefs, StartIdx, BeforeUse));
+ if (I->end < Use) {
+ if (LR->isUndefIn(Undefs, I->end, BeforeUse))
+ return std::make_pair(nullptr, true);
+ extendSegmentEndTo(I, Use);
+ }
+ return std::make_pair(I->valno, false);
+ }
+
+ /// This method is used when we want to extend the segment specified
+ /// by I to end at the specified endpoint. To do this, we should
+ /// merge and eliminate all segments that this will overlap
+ /// with. The iterator is not invalidated.
+ void extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
+ assert(I != segments().end() && "Not a valid segment!");
+ Segment *S = segmentAt(I);
+ VNInfo *ValNo = I->valno;
+
+ // Search for the first segment that we can't merge with.
+ iterator MergeTo = std::next(I);
+ for (; MergeTo != segments().end() && NewEnd >= MergeTo->end; ++MergeTo)
+ assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+
+ // If NewEnd was in the middle of a segment, make sure to get its endpoint.
+ S->end = std::max(NewEnd, std::prev(MergeTo)->end);
+
+ // If the newly formed segment now touches the segment after it and if they
+ // have the same value number, merge the two segments into one segment.
+ if (MergeTo != segments().end() && MergeTo->start <= I->end &&
+ MergeTo->valno == ValNo) {
+ S->end = MergeTo->end;
+ ++MergeTo;
+ }
+
+ // Erase any dead segments.
+ segments().erase(std::next(I), MergeTo);
+ }
+
+ /// This method is used when we want to extend the segment specified
+ /// by I to start at the specified endpoint. To do this, we should
+ /// merge and eliminate all segments that this will overlap with.
+ iterator extendSegmentStartTo(iterator I, SlotIndex NewStart) {
+ assert(I != segments().end() && "Not a valid segment!");
+ Segment *S = segmentAt(I);
+ VNInfo *ValNo = I->valno;
+
+ // Search for the first segment that we can't merge with.
+ iterator MergeTo = I;
+ do {
+ if (MergeTo == segments().begin()) {
+ S->start = NewStart;
+ segments().erase(MergeTo, I);
+ return I;
+ }
+ assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+ --MergeTo;
+ } while (NewStart <= MergeTo->start);
+
+ // If we start in the middle of another segment, just delete a range and
+ // extend that segment.
+ if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
+ segmentAt(MergeTo)->end = S->end;
+ } else {
+ // Otherwise, extend the segment right after.
+ ++MergeTo;
+ Segment *MergeToSeg = segmentAt(MergeTo);
+ MergeToSeg->start = NewStart;
+ MergeToSeg->end = S->end;
+ }
+
+ segments().erase(std::next(MergeTo), std::next(I));
+ return MergeTo;
+ }
+
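+ /// Insert the segment S into this range, coalescing it with neighboring or
+ /// overlapping segments that carry the same value number. Illustrative
+ /// example (hypothetical segments): adding [10,14:0) to [4,10:0)[16,20:1)
+ /// extends the first segment and yields [4,14:0)[16,20:1).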
+ iterator addSegment(Segment S) {
+ SlotIndex Start = S.start, End = S.end;
+ iterator I = impl().findInsertPos(S);
+
+ // If the inserted segment starts in the middle of, or right at the end of,
+ // another segment, just extend that segment to cover S as well.
+ if (I != segments().begin()) {
+ iterator B = std::prev(I);
+ if (S.valno == B->valno) {
+ if (B->start <= Start && B->end >= Start) {
+ extendSegmentEndTo(B, End);
+ return B;
+ }
+ } else {
+ // Check to make sure that we are not overlapping two live segments with
+ // different valno's.
+ assert(B->end <= Start &&
+ "Cannot overlap two segments with differing ValID's"
+ " (did you def the same reg twice in a MachineInstr?)");
+ }
+ }
+
+ // Otherwise, if this segment ends in the middle of, or right next
+ // to, another segment, merge it into that segment.
+ if (I != segments().end()) {
+ if (S.valno == I->valno) {
+ if (I->start <= End) {
+ I = extendSegmentStartTo(I, Start);
+
+ // If S is a complete superset of a segment, we may need to grow its
+ // endpoint as well.
+ if (End > I->end)
+ extendSegmentEndTo(I, End);
+ return I;
+ }
+ } else {
+ // Check to make sure that we are not overlapping two live segments with
+ // different valno's.
+ assert(I->start >= End &&
+ "Cannot overlap two segments with differing ValID's");
+ }
+ }
+
+ // Otherwise, this is just a new segment that doesn't interact with
+ // anything. Insert it.
+ return segments().insert(I, S);
+ }
+
+private:
+ ImplT &impl() { return *static_cast<ImplT *>(this); }
+
+ CollectionT &segments() { return impl().segmentsColl(); }
+
+ Segment *segmentAt(iterator I) { return const_cast<Segment *>(&(*I)); }
+};
+
+//===----------------------------------------------------------------------===//
+// Instantiation of the methods for calculation of live ranges
+// based on a segment vector.
+//===----------------------------------------------------------------------===//
+
+class CalcLiveRangeUtilVector;
+using CalcLiveRangeUtilVectorBase =
+ CalcLiveRangeUtilBase<CalcLiveRangeUtilVector, LiveRange::iterator,
+ LiveRange::Segments>;
+
+class CalcLiveRangeUtilVector : public CalcLiveRangeUtilVectorBase {
+public:
+ CalcLiveRangeUtilVector(LiveRange *LR) : CalcLiveRangeUtilVectorBase(LR) {}
+
+private:
+ friend CalcLiveRangeUtilVectorBase;
+
+ LiveRange::Segments &segmentsColl() { return LR->segments; }
+
+ void insertAtEnd(const Segment &S) { LR->segments.push_back(S); }
+
+ iterator find(SlotIndex Pos) { return LR->find(Pos); }
+
+ iterator findInsertPos(Segment S) { return llvm::upper_bound(*LR, S.start); }
+};
+
+//===----------------------------------------------------------------------===//
+// Instantiation of the methods for calculation of live ranges
+// based on a segment set.
+//===----------------------------------------------------------------------===//
+
+class CalcLiveRangeUtilSet;
+using CalcLiveRangeUtilSetBase =
+ CalcLiveRangeUtilBase<CalcLiveRangeUtilSet, LiveRange::SegmentSet::iterator,
+ LiveRange::SegmentSet>;
+
+class CalcLiveRangeUtilSet : public CalcLiveRangeUtilSetBase {
+public:
+ CalcLiveRangeUtilSet(LiveRange *LR) : CalcLiveRangeUtilSetBase(LR) {}
+
+private:
+ friend CalcLiveRangeUtilSetBase;
+
+ LiveRange::SegmentSet &segmentsColl() { return *LR->segmentSet; }
+
+ void insertAtEnd(const Segment &S) {
+ LR->segmentSet->insert(LR->segmentSet->end(), S);
+ }
+
+ iterator find(SlotIndex Pos) {
+ iterator I =
+ LR->segmentSet->upper_bound(Segment(Pos, Pos.getNextSlot(), nullptr));
+ if (I == LR->segmentSet->begin())
+ return I;
+ iterator PrevI = std::prev(I);
+ if (Pos < (*PrevI).end)
+ return PrevI;
+ return I;
+ }
+
+ iterator findInsertPos(Segment S) {
+ iterator I = LR->segmentSet->upper_bound(S);
+ if (I != LR->segmentSet->end() && !(S.start < *I))
+ ++I;
+ return I;
+ }
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// LiveRange methods
+//===----------------------------------------------------------------------===//
+
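+// find - Return an iterator to the first segment whose end is after Pos: the
+// segment containing Pos if one exists, otherwise the first segment starting
+// after Pos (or end() if there is none).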
+LiveRange::iterator LiveRange::find(SlotIndex Pos) {
+ return llvm::partition_point(*this,
+ [&](const Segment &X) { return X.end <= Pos; });
+}
+
+VNInfo *LiveRange::createDeadDef(SlotIndex Def, VNInfo::Allocator &VNIAlloc) {
+ // Use the segment set, if it is available.
+ if (segmentSet != nullptr)
+ return CalcLiveRangeUtilSet(this).createDeadDef(Def, &VNIAlloc, nullptr);
+ // Otherwise use the segment vector.
+ return CalcLiveRangeUtilVector(this).createDeadDef(Def, &VNIAlloc, nullptr);
+}
+
+VNInfo *LiveRange::createDeadDef(VNInfo *VNI) {
+ // Use the segment set, if it is available.
+ if (segmentSet != nullptr)
+ return CalcLiveRangeUtilSet(this).createDeadDef(VNI->def, nullptr, VNI);
+ // Otherwise use the segment vector.
+ return CalcLiveRangeUtilVector(this).createDeadDef(VNI->def, nullptr, VNI);
+}
+
+// overlaps - Return true if the intersection of the two live ranges is
+// not empty.
+//
+// An example for overlaps():
+//
+// 0: A = ...
+// 4: B = ...
+// 8: C = A + B ;; last use of A
+//
+// The live ranges should look like:
+//
+// A = [3, 11)
+// B = [7, x)
+// C = [11, y)
+//
+// A->overlaps(C) should return false since we want to be able to join
+// A and C.
+//
+bool LiveRange::overlapsFrom(const LiveRange& other,
+ const_iterator StartPos) const {
+ assert(!empty() && "empty range");
+ const_iterator i = begin();
+ const_iterator ie = end();
+ const_iterator j = StartPos;
+ const_iterator je = other.end();
+
+ assert((StartPos->start <= i->start || StartPos == other.begin()) &&
+ StartPos != other.end() && "Bogus start position hint!");
+
+ if (i->start < j->start) {
+ i = std::upper_bound(i, ie, j->start);
+ if (i != begin()) --i;
+ } else if (j->start < i->start) {
+ ++StartPos;
+ if (StartPos != other.end() && StartPos->start <= i->start) {
+ assert(StartPos < other.end() && i < end());
+ j = std::upper_bound(j, je, i->start);
+ if (j != other.begin()) --j;
+ }
+ } else {
+ return true;
+ }
+
+ if (j == je) return false;
+
+ while (i != ie) {
+ if (i->start > j->start) {
+ std::swap(i, j);
+ std::swap(ie, je);
+ }
+
+ if (i->end > j->start)
+ return true;
+ ++i;
+ }
+
+ return false;
+}
+
+bool LiveRange::overlaps(const LiveRange &Other, const CoalescerPair &CP,
+ const SlotIndexes &Indexes) const {
+ assert(!empty() && "empty range");
+ if (Other.empty())
+ return false;
+
+ // Use binary searches to find initial positions.
+ const_iterator I = find(Other.beginIndex());
+ const_iterator IE = end();
+ if (I == IE)
+ return false;
+ const_iterator J = Other.find(I->start);
+ const_iterator JE = Other.end();
+ if (J == JE)
+ return false;
+
+ while (true) {
+ // J has just been advanced to satisfy:
+ assert(J->end > I->start);
+ // Check for an overlap.
+ if (J->start < I->end) {
+ // I and J are overlapping. Find the later start.
+ SlotIndex Def = std::max(I->start, J->start);
+ // Allow the overlap if Def is a coalescable copy.
+ if (Def.isBlock() ||
+ !CP.isCoalescable(Indexes.getInstructionFromIndex(Def)))
+ return true;
+ }
+ // Advance the iterator that ends first to check for more overlaps.
+ if (J->end > I->end) {
+ std::swap(I, J);
+ std::swap(IE, JE);
+ }
+ // Advance J until J->end > I->start.
+ do
+ if (++J == JE)
+ return false;
+ while (J->end <= I->start);
+ }
+}
+
+/// overlaps - Return true if the live range overlaps an interval specified
+/// by [Start, End).
+bool LiveRange::overlaps(SlotIndex Start, SlotIndex End) const {
+ assert(Start < End && "Invalid range");
+ const_iterator I = lower_bound(*this, End);
+ return I != begin() && (--I)->end > Start;
+}
+
+bool LiveRange::covers(const LiveRange &Other) const {
+ if (empty())
+ return Other.empty();
+
+ const_iterator I = begin();
+ for (const Segment &O : Other.segments) {
+ I = advanceTo(I, O.start);
+ if (I == end() || I->start > O.start)
+ return false;
+
+ // Check adjacent live segments and see if we can get behind O.end.
+ while (I->end < O.end) {
+ const_iterator Last = I;
+ // Get next segment and abort if it was not adjacent.
+ ++I;
+ if (I == end() || Last->end != I->start)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// ValNo is dead; remove it. If it is the largest value number, just nuke it
+/// (and any other deleted values neighboring it); otherwise mark it as ~1U so
+/// it can be nuked later.
+void LiveRange::markValNoForDeletion(VNInfo *ValNo) {
+ if (ValNo->id == getNumValNums()-1) {
+ do {
+ valnos.pop_back();
+ } while (!valnos.empty() && valnos.back()->isUnused());
+ } else {
+ ValNo->markUnused();
+ }
+}
+
+/// RenumberValues - Renumber all values in order of appearance and delete the
+/// remaining unused values.
+void LiveRange::RenumberValues() {
+ SmallPtrSet<VNInfo*, 8> Seen;
+ valnos.clear();
+ for (const Segment &S : segments) {
+ VNInfo *VNI = S.valno;
+ if (!Seen.insert(VNI).second)
+ continue;
+ assert(!VNI->isUnused() && "Unused valno used by live segment");
+ VNI->id = (unsigned)valnos.size();
+ valnos.push_back(VNI);
+ }
+}
+
+void LiveRange::addSegmentToSet(Segment S) {
+ CalcLiveRangeUtilSet(this).addSegment(S);
+}
+
+LiveRange::iterator LiveRange::addSegment(Segment S) {
+ // Use the segment set, if it is available.
+ if (segmentSet != nullptr) {
+ addSegmentToSet(S);
+ return end();
+ }
+ // Otherwise use the segment vector.
+ return CalcLiveRangeUtilVector(this).addSegment(S);
+}
+
+void LiveRange::append(const Segment S) {
+ // Check that the segment belongs to the back of the list.
+ assert(segments.empty() || segments.back().end <= S.start);
+ segments.push_back(S);
+}
+
+std::pair<VNInfo*,bool> LiveRange::extendInBlock(ArrayRef<SlotIndex> Undefs,
+ SlotIndex StartIdx, SlotIndex Kill) {
+ // Use the segment set, if it is available.
+ if (segmentSet != nullptr)
+ return CalcLiveRangeUtilSet(this).extendInBlock(Undefs, StartIdx, Kill);
+ // Otherwise use the segment vector.
+ return CalcLiveRangeUtilVector(this).extendInBlock(Undefs, StartIdx, Kill);
+}
+
+VNInfo *LiveRange::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
+ // Use the segment set, if it is available.
+ if (segmentSet != nullptr)
+ return CalcLiveRangeUtilSet(this).extendInBlock(StartIdx, Kill);
+ // Otherwise use the segment vector.
+ return CalcLiveRangeUtilVector(this).extendInBlock(StartIdx, Kill);
+}
+
+/// Remove the specified span from this range. Note that the span must lie
+/// entirely within a single Segment.
+void LiveRange::removeSegment(SlotIndex Start, SlotIndex End,
+ bool RemoveDeadValNo) {
+ // Find the Segment containing this span.
+ iterator I = find(Start);
+ assert(I != end() && "Segment is not in range!");
+ assert(I->containsInterval(Start, End)
+ && "Segment is not entirely in range!");
+
+ // If the span we are removing is at the start of the Segment, adjust it.
+ VNInfo *ValNo = I->valno;
+ if (I->start == Start) {
+ if (I->end == End) {
+ segments.erase(I); // Removed the whole Segment.
+
+ if (RemoveDeadValNo)
+ removeValNoIfDead(ValNo);
+ } else
+ I->start = End;
+ return;
+ }
+
+ // Otherwise if the span we are removing is at the end of the Segment,
+ // adjust the other way.
+ if (I->end == End) {
+ I->end = Start;
+ return;
+ }
+
+ // Otherwise, we are splitting the Segment into two pieces.
+ SlotIndex OldEnd = I->end;
+ I->end = Start; // Trim the old segment.
+
+ // Insert the new one.
+ segments.insert(std::next(I), Segment(End, OldEnd, ValNo));
+}
+
+LiveRange::iterator LiveRange::removeSegment(iterator I, bool RemoveDeadValNo) {
+ VNInfo *ValNo = I->valno;
+ I = segments.erase(I);
+ if (RemoveDeadValNo)
+ removeValNoIfDead(ValNo);
+ return I;
+}
+
+void LiveRange::removeValNoIfDead(VNInfo *ValNo) {
+ if (none_of(*this, [=](const Segment &S) { return S.valno == ValNo; }))
+ markValNoForDeletion(ValNo);
+}
+
+/// removeValNo - Remove all the segments defined by the specified value#.
+/// Also remove the value# from the value# list.
+void LiveRange::removeValNo(VNInfo *ValNo) {
+ if (empty()) return;
+ llvm::erase_if(segments,
+ [ValNo](const Segment &S) { return S.valno == ValNo; });
+ // Now that ValNo is dead, remove it.
+ markValNoForDeletion(ValNo);
+}
+
+void LiveRange::join(LiveRange &Other,
+ const int *LHSValNoAssignments,
+ const int *RHSValNoAssignments,
+ SmallVectorImpl<VNInfo *> &NewVNInfo) {
+ verify();
+
+ // Determine if any of our values are mapped. This is uncommon, so we want
+ // to avoid the range scan if not.
+ bool MustMapCurValNos = false;
+ unsigned NumVals = getNumValNums();
+ unsigned NumNewVals = NewVNInfo.size();
+ for (unsigned i = 0; i != NumVals; ++i) {
+ unsigned LHSValID = LHSValNoAssignments[i];
+ if (i != LHSValID ||
+ (NewVNInfo[LHSValID] && NewVNInfo[LHSValID] != getValNumInfo(i))) {
+ MustMapCurValNos = true;
+ break;
+ }
+ }
+
+ // If we have to apply a mapping to our base range assignment, rewrite it now.
+ if (MustMapCurValNos && !empty()) {
+ // Map the first live range.
+
+ iterator OutIt = begin();
+ OutIt->valno = NewVNInfo[LHSValNoAssignments[OutIt->valno->id]];
+ for (iterator I = std::next(OutIt), E = end(); I != E; ++I) {
+ VNInfo* nextValNo = NewVNInfo[LHSValNoAssignments[I->valno->id]];
+ assert(nextValNo && "Huh?");
+
+ // If this live range has the same value # as its immediate predecessor,
+ // and if they are neighbors, remove one Segment. This happens when we
+ // have [0,4:0)[4,7:1) and map 0/1 onto the same value #.
+ if (OutIt->valno == nextValNo && OutIt->end == I->start) {
+ OutIt->end = I->end;
+ } else {
+ // Didn't merge. Move OutIt to the next segment.
+ ++OutIt;
+ OutIt->valno = nextValNo;
+ if (OutIt != I) {
+ OutIt->start = I->start;
+ OutIt->end = I->end;
+ }
+ }
+ }
+ // If we merge some segments, chop off the end.
+ ++OutIt;
+ segments.erase(OutIt, end());
+ }
+
+ // Rewrite Other values before changing the VNInfo ids.
+ // This can leave Other in an invalid state because we're not coalescing
+ // touching segments that now have identical values. That's OK since Other is
+// not supposed to be valid after calling join().
+ for (Segment &S : Other.segments)
+ S.valno = NewVNInfo[RHSValNoAssignments[S.valno->id]];
+
+ // Update val# info. Renumber them and make sure they all belong to this
+ // LiveRange now. Also remove dead val#'s.
+ unsigned NumValNos = 0;
+ for (unsigned i = 0; i < NumNewVals; ++i) {
+ VNInfo *VNI = NewVNInfo[i];
+ if (VNI) {
+ if (NumValNos >= NumVals)
+ valnos.push_back(VNI);
+ else
+ valnos[NumValNos] = VNI;
+ VNI->id = NumValNos++; // Renumber val#.
+ }
+ }
+ if (NumNewVals < NumVals)
+ valnos.resize(NumNewVals); // shrinkify
+
+ // Okay, now insert the RHS live segments into the LHS.
+ LiveRangeUpdater Updater(this);
+ for (Segment &S : Other.segments)
+ Updater.add(S);
+}
+
+/// Merge all of the segments in RHS into this live range as the specified
+/// value number. The segments in RHS are allowed to overlap with segments in
+/// the current range, but only if the overlapping segments have the
+/// specified value number.
+void LiveRange::MergeSegmentsInAsValue(const LiveRange &RHS,
+ VNInfo *LHSValNo) {
+ LiveRangeUpdater Updater(this);
+ for (const Segment &S : RHS.segments)
+ Updater.add(S.start, S.end, LHSValNo);
+}
+
+/// MergeValueInAsValue - Merge all of the live segments of a specific val#
+/// in RHS into this live range as the specified value number.
+/// The segments in RHS are allowed to overlap with segments in the
+/// current range; the value numbers of the overlapped segments are
+/// replaced with the specified value number.
+void LiveRange::MergeValueInAsValue(const LiveRange &RHS,
+ const VNInfo *RHSValNo,
+ VNInfo *LHSValNo) {
+ LiveRangeUpdater Updater(this);
+ for (const Segment &S : RHS.segments)
+ if (S.valno == RHSValNo)
+ Updater.add(S.start, S.end, LHSValNo);
+}
+
+/// MergeValueNumberInto - This method is called when two value numbers
+/// are found to be equivalent. It eliminates V1, replacing all
+/// segments carrying the V1 value number with the V2 value number. This can
+/// cause merging of V1/V2 value numbers and compaction of the value space.
+VNInfo *LiveRange::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
+ assert(V1 != V2 && "Identical value#'s are always equivalent!");
+
+ // This code actually merges the (numerically) larger value number into the
+ // smaller value number, which is likely to allow us to compactify the value
+ // space. The only thing we have to be careful of is to preserve the
+ // instruction that defines the result value.
+
+ // Make sure V2 is smaller than V1.
+ if (V1->id < V2->id) {
+ V1->copyFrom(*V2);
+ std::swap(V1, V2);
+ }
+
+ // Merge V1 segments into V2.
+ for (iterator I = begin(); I != end(); ) {
+ iterator S = I++;
+ if (S->valno != V1) continue; // Not a V1 Segment.
+
+ // Okay, we found a V1 live range. If it had a previous, touching, V2 live
+ // range, extend it.
+ if (S != begin()) {
+ iterator Prev = S-1;
+ if (Prev->valno == V2 && Prev->end == S->start) {
+ Prev->end = S->end;
+
+ // Erase this live-range.
+ segments.erase(S);
+ I = Prev+1;
+ S = Prev;
+ }
+ }
+
+ // Okay, now we have a V1 or V2 live range that is maximally merged forward.
+ // Ensure that it is a V2 live-range.
+ S->valno = V2;
+
+ // If we can merge it into later V2 segments, do so now. We ignore any
+ // following V1 segments, as they will be merged in subsequent iterations
+ // of the loop.
+ if (I != end()) {
+ if (I->start == S->end && I->valno == V2) {
+ S->end = I->end;
+ segments.erase(I);
+ I = S+1;
+ }
+ }
+ }
+
+ // Now that V1 is dead, remove it.
+ markValNoForDeletion(V1);
+
+ return V2;
+}
+
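+// Move the segments accumulated in the temporary segment set into the segment
+// vector and drop the set; from this point on the range uses the vector
+// representation only.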
+void LiveRange::flushSegmentSet() {
+ assert(segmentSet != nullptr && "segment set must have been created");
+ assert(
+ segments.empty() &&
+ "segment set can be used only initially before switching to the array");
+ segments.append(segmentSet->begin(), segmentSet->end());
+ segmentSet = nullptr;
+ verify();
+}
+
+bool LiveRange::isLiveAtIndexes(ArrayRef<SlotIndex> Slots) const {
+ ArrayRef<SlotIndex>::iterator SlotI = Slots.begin();
+ ArrayRef<SlotIndex>::iterator SlotE = Slots.end();
+
+ // If there are no regmask slots, we have nothing to search.
+ if (SlotI == SlotE)
+ return false;
+
+ // Start our search at the first segment that ends after the first slot.
+ const_iterator SegmentI = find(*SlotI);
+ const_iterator SegmentE = end();
+
+ // If there are no segments that end after the first slot, we're done.
+ if (SegmentI == SegmentE)
+ return false;
+
+ // Look for each slot in the live range.
+ for ( ; SlotI != SlotE; ++SlotI) {
+ // Go to the next segment that ends after the current slot.
+ // The slot may be within a hole in the range.
+ SegmentI = advanceTo(SegmentI, *SlotI);
+ if (SegmentI == SegmentE)
+ return false;
+
+ // If this segment contains the slot, we're done.
+ if (SegmentI->contains(*SlotI))
+ return true;
+ // Otherwise, look for the next slot.
+ }
+
+ // We didn't find a segment containing any of the slots.
+ return false;
+}
+
+void LiveInterval::freeSubRange(SubRange *S) {
+ S->~SubRange();
+ // Memory was allocated with BumpPtr allocator and is not freed here.
+}
+
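+// Walk the subrange list, unlinking and freeing every subrange that has
+// become empty.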
+void LiveInterval::removeEmptySubRanges() {
+ SubRange **NextPtr = &SubRanges;
+ SubRange *I = *NextPtr;
+ while (I != nullptr) {
+ if (!I->empty()) {
+ NextPtr = &I->Next;
+ I = *NextPtr;
+ continue;
+ }
+ // Skip empty subranges until we find the first nonempty one.
+ do {
+ SubRange *Next = I->Next;
+ freeSubRange(I);
+ I = Next;
+ } while (I != nullptr && I->empty());
+ *NextPtr = I;
+ }
+}
+
+void LiveInterval::clearSubRanges() {
+ for (SubRange *I = SubRanges, *Next; I != nullptr; I = Next) {
+ Next = I->Next;
+ freeSubRange(I);
+ }
+ SubRanges = nullptr;
+}
+
+/// For each VNI in \p SR, check whether or not that value defines part
+/// of the mask described by \p LaneMask and, if not, remove that value
+/// from \p SR.
+static void stripValuesNotDefiningMask(unsigned Reg, LiveInterval::SubRange &SR,
+ LaneBitmask LaneMask,
+ const SlotIndexes &Indexes,
+ const TargetRegisterInfo &TRI,
+ unsigned ComposeSubRegIdx) {
+ // Phys reg should not be tracked at subreg level.
+ // Same for noreg (Reg == 0).
+ if (!Register::isVirtualRegister(Reg) || !Reg)
+ return;
+ // Remove the values that don't define those lanes.
+ SmallVector<VNInfo *, 8> ToBeRemoved;
+ for (VNInfo *VNI : SR.valnos) {
+ if (VNI->isUnused())
+ continue;
+ // PHI definitions don't have MI attached, so there is nothing
+ // we can use to strip the VNI.
+ if (VNI->isPHIDef())
+ continue;
+ const MachineInstr *MI = Indexes.getInstructionFromIndex(VNI->def);
+ assert(MI && "Cannot find the definition of a value");
+ bool hasDef = false;
+ for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef())
+ continue;
+ if (MOI->getReg() != Reg)
+ continue;
+ LaneBitmask OrigMask = TRI.getSubRegIndexLaneMask(MOI->getSubReg());
+ LaneBitmask ExpectedDefMask =
+ ComposeSubRegIdx
+ ? TRI.composeSubRegIndexLaneMask(ComposeSubRegIdx, OrigMask)
+ : OrigMask;
+ if ((ExpectedDefMask & LaneMask).none())
+ continue;
+ hasDef = true;
+ break;
+ }
+
+ if (!hasDef)
+ ToBeRemoved.push_back(VNI);
+ }
+ for (VNInfo *VNI : ToBeRemoved)
+ SR.removeValNo(VNI);
+
+ // If the subrange is empty at this point, the MIR is invalid. Do not assert
+ // and let the verifier catch this case.
+}
+
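+// Make sure the lanes in LaneMask are covered by subranges whose lane masks
+// lie entirely within LaneMask: existing subranges that only partially
+// overlap LaneMask are split, and a new subrange is created for any lanes
+// that remain uncovered. Apply is invoked on every subrange that ends up
+// covering part of LaneMask.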
+void LiveInterval::refineSubRanges(
+ BumpPtrAllocator &Allocator, LaneBitmask LaneMask,
+ std::function<void(LiveInterval::SubRange &)> Apply,
+ const SlotIndexes &Indexes, const TargetRegisterInfo &TRI,
+ unsigned ComposeSubRegIdx) {
+ LaneBitmask ToApply = LaneMask;
+ for (SubRange &SR : subranges()) {
+ LaneBitmask SRMask = SR.LaneMask;
+ LaneBitmask Matching = SRMask & LaneMask;
+ if (Matching.none())
+ continue;
+
+ SubRange *MatchingRange;
+ if (SRMask == Matching) {
+ // The subrange fits (it does not cover bits outside \p LaneMask).
+ MatchingRange = &SR;
+ } else {
+ // We have to split the subrange into a matching and non-matching part.
+ // Reduce the lane mask of the existing subrange to the non-matching part.
+ SR.LaneMask = SRMask & ~Matching;
+ // Create a new subrange for the matching part
+ MatchingRange = createSubRangeFrom(Allocator, Matching, SR);
+ // Now that the subrange is split in two, make sure each half only
+ // keeps the VNIs that actually define part of its lane mask.
+ stripValuesNotDefiningMask(reg(), *MatchingRange, Matching, Indexes, TRI,
+ ComposeSubRegIdx);
+ stripValuesNotDefiningMask(reg(), SR, SR.LaneMask, Indexes, TRI,
+ ComposeSubRegIdx);
+ }
+ Apply(*MatchingRange);
+ ToApply &= ~Matching;
+ }
+ // Create a new subrange if there are uncovered bits left.
+ if (ToApply.any()) {
+ SubRange *NewRange = createSubRange(Allocator, ToApply);
+ Apply(*NewRange);
+ }
+}
+
+unsigned LiveInterval::getSize() const {
+ unsigned Sum = 0;
+ for (const Segment &S : segments)
+ Sum += S.start.distance(S.end);
+ return Sum;
+}
+
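+// Collect the slot indexes of subregister defs carrying the <undef> flag that
+// leave some lane in LaneMask undefined. Callers such as
+// LiveIntervalCalc::extendToUses use these positions as "known undef" markers
+// when extending subrange liveness.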
+void LiveInterval::computeSubRangeUndefs(SmallVectorImpl<SlotIndex> &Undefs,
+ LaneBitmask LaneMask,
+ const MachineRegisterInfo &MRI,
+ const SlotIndexes &Indexes) const {
+ assert(reg().isVirtual());
+ LaneBitmask VRegMask = MRI.getMaxLaneMaskForVReg(reg());
+ assert((VRegMask & LaneMask).any());
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ for (const MachineOperand &MO : MRI.def_operands(reg())) {
+ if (!MO.isUndef())
+ continue;
+ unsigned SubReg = MO.getSubReg();
+ assert(SubReg != 0 && "Undef should only be set on subreg defs");
+ LaneBitmask DefMask = TRI.getSubRegIndexLaneMask(SubReg);
+ LaneBitmask UndefMask = VRegMask & ~DefMask;
+ if ((UndefMask & LaneMask).any()) {
+ const MachineInstr &MI = *MO.getParent();
+ bool EarlyClobber = MO.isEarlyClobber();
+ SlotIndex Pos = Indexes.getInstructionIndex(MI).getRegSlot(EarlyClobber);
+ Undefs.push_back(Pos);
+ }
+ }
+}
+
+raw_ostream& llvm::operator<<(raw_ostream& OS, const LiveRange::Segment &S) {
+ return OS << '[' << S.start << ',' << S.end << ':' << S.valno->id << ')';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveRange::Segment::dump() const {
+ dbgs() << *this << '\n';
+}
+#endif
+
+void LiveRange::print(raw_ostream &OS) const {
+ if (empty())
+ OS << "EMPTY";
+ else {
+ for (const Segment &S : segments) {
+ OS << S;
+ assert(S.valno == getValNumInfo(S.valno->id) && "Bad VNInfo");
+ }
+ }
+
+ // Print value number info.
+ if (getNumValNums()) {
+ OS << ' ';
+ unsigned vnum = 0;
+ for (const_vni_iterator i = vni_begin(), e = vni_end(); i != e;
+ ++i, ++vnum) {
+ const VNInfo *vni = *i;
+ if (vnum) OS << ' ';
+ OS << vnum << '@';
+ if (vni->isUnused()) {
+ OS << 'x';
+ } else {
+ OS << vni->def;
+ if (vni->isPHIDef())
+ OS << "-phi";
+ }
+ }
+ }
+}
+
+void LiveInterval::SubRange::print(raw_ostream &OS) const {
+ OS << " L" << PrintLaneMask(LaneMask) << ' '
+ << static_cast<const LiveRange &>(*this);
+}
+
+void LiveInterval::print(raw_ostream &OS) const {
+ OS << printReg(reg()) << ' ';
+ super::print(OS);
+ // Print subranges
+ for (const SubRange &SR : subranges())
+ OS << SR;
+ OS << " weight:" << Weight;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveRange::dump() const {
+ dbgs() << *this << '\n';
+}
+
+LLVM_DUMP_METHOD void LiveInterval::SubRange::dump() const {
+ dbgs() << *this << '\n';
+}
+
+LLVM_DUMP_METHOD void LiveInterval::dump() const {
+ dbgs() << *this << '\n';
+}
+#endif
+
+#ifndef NDEBUG
+void LiveRange::verify() const {
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ assert(I->start.isValid());
+ assert(I->end.isValid());
+ assert(I->start < I->end);
+ assert(I->valno != nullptr);
+ assert(I->valno->id < valnos.size());
+ assert(I->valno == valnos[I->valno->id]);
+ if (std::next(I) != E) {
+ assert(I->end <= std::next(I)->start);
+ if (I->end == std::next(I)->start)
+ assert(I->valno != std::next(I)->valno);
+ }
+ }
+}
+
+void LiveInterval::verify(const MachineRegisterInfo *MRI) const {
+ super::verify();
+
+ // Make sure SubRanges are fine and LaneMasks are disjoint.
+ LaneBitmask Mask;
+ LaneBitmask MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg())
+ : LaneBitmask::getAll();
+ for (const SubRange &SR : subranges()) {
+ // Subrange lanemasks should be disjoint from any previous subrange masks.
+ assert((Mask & SR.LaneMask).none());
+ Mask |= SR.LaneMask;
+
+ // Subrange masks should be contained in the maximum lane mask for the vreg.
+ assert((Mask & ~MaxMask).none());
+ // empty subranges must be removed.
+ assert(!SR.empty());
+
+ SR.verify();
+ // Main liverange should cover subrange.
+ assert(covers(SR));
+ }
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// LiveRangeUpdater class
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeUpdater class always maintains these invariants:
+//
+// - When LastStart is invalid, Spills is empty and the iterators are invalid.
+// This is the initial state, and the state created by flush().
+// In this state, isDirty() returns false.
+//
+// Otherwise, segments are kept in three separate areas:
+//
+// 1. [begin; WriteI) at the front of LR.
+// 2. [ReadI; end) at the back of LR.
+// 3. Spills.
+//
+// - LR.begin() <= WriteI <= ReadI <= LR.end().
+// - Segments in all three areas are fully ordered and coalesced.
+// - Segments in area 1 precede and can't coalesce with segments in area 2.
+// - Segments in Spills precede and can't coalesce with segments in area 2.
+// - No coalescing is possible between segments in Spills and segments in area
+// 1, and there are no overlapping segments.
+//
+// The segments in Spills are not ordered with respect to the segments in area
+// 1. They need to be merged.
+//
+// When they exist, Spills.back().start <= LastStart,
+// and WriteI[-1].start <= LastStart.
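+//
+// Typical usage (see LiveRange::join above): construct a LiveRangeUpdater for
+// the destination range, feed it segments with roughly increasing start
+// indexes via add(), and flush() the merged result back into the range when
+// done; join() relies on the updater flushing when it goes out of scope.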
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LiveRangeUpdater::print(raw_ostream &OS) const {
+ if (!isDirty()) {
+ if (LR)
+ OS << "Clean updater: " << *LR << '\n';
+ else
+ OS << "Null updater.\n";
+ return;
+ }
+ assert(LR && "Can't have null LR in dirty updater.");
+ OS << " updater with gap = " << (ReadI - WriteI)
+ << ", last start = " << LastStart
+ << ":\n Area 1:";
+ for (const auto &S : make_range(LR->begin(), WriteI))
+ OS << ' ' << S;
+ OS << "\n Spills:";
+ for (unsigned I = 0, E = Spills.size(); I != E; ++I)
+ OS << ' ' << Spills[I];
+ OS << "\n Area 2:";
+ for (const auto &S : make_range(ReadI, LR->end()))
+ OS << ' ' << S;
+ OS << '\n';
+}
+
+LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const {
+ print(errs());
+}
+#endif
+
+// Determine if A and B should be coalesced.
+static inline bool coalescable(const LiveRange::Segment &A,
+ const LiveRange::Segment &B) {
+ assert(A.start <= B.start && "Unordered live segments.");
+ if (A.end == B.start)
+ return A.valno == B.valno;
+ if (A.end < B.start)
+ return false;
+ assert(A.valno == B.valno && "Cannot overlap different values");
+ return true;
+}
+
+void LiveRangeUpdater::add(LiveRange::Segment Seg) {
+ assert(LR && "Cannot add to a null destination");
+
+ // Fall back to the regular add method if the live range
+ // is using the segment set instead of the segment vector.
+ if (LR->segmentSet != nullptr) {
+ LR->addSegmentToSet(Seg);
+ return;
+ }
+
+ // Flush the state if Start moves backwards.
+ if (!LastStart.isValid() || LastStart > Seg.start) {
+ if (isDirty())
+ flush();
+ // This brings us to an uninitialized state. Reinitialize.
+ assert(Spills.empty() && "Leftover spilled segments");
+ WriteI = ReadI = LR->begin();
+ }
+
+ // Remember start for next time.
+ LastStart = Seg.start;
+
+ // Advance ReadI until it ends after Seg.start.
+ LiveRange::iterator E = LR->end();
+ if (ReadI != E && ReadI->end <= Seg.start) {
+ // First try to close the gap between WriteI and ReadI with spills.
+ if (ReadI != WriteI)
+ mergeSpills();
+ // Then advance ReadI.
+ if (ReadI == WriteI)
+ ReadI = WriteI = LR->find(Seg.start);
+ else
+ while (ReadI != E && ReadI->end <= Seg.start)
+ *WriteI++ = *ReadI++;
+ }
+
+ assert(ReadI == E || ReadI->end > Seg.start);
+
+ // Check if the ReadI segment begins early.
+ if (ReadI != E && ReadI->start <= Seg.start) {
+ assert(ReadI->valno == Seg.valno && "Cannot overlap different values");
+ // Bail if Seg is completely contained in ReadI.
+ if (ReadI->end >= Seg.end)
+ return;
+ // Coalesce into Seg.
+ Seg.start = ReadI->start;
+ ++ReadI;
+ }
+
+ // Coalesce as much as possible from ReadI into Seg.
+ while (ReadI != E && coalescable(Seg, *ReadI)) {
+ Seg.end = std::max(Seg.end, ReadI->end);
+ ++ReadI;
+ }
+
+ // Try coalescing Spills.back() into Seg.
+ if (!Spills.empty() && coalescable(Spills.back(), Seg)) {
+ Seg.start = Spills.back().start;
+ Seg.end = std::max(Spills.back().end, Seg.end);
+ Spills.pop_back();
+ }
+
+ // Try coalescing Seg into WriteI[-1].
+ if (WriteI != LR->begin() && coalescable(WriteI[-1], Seg)) {
+ WriteI[-1].end = std::max(WriteI[-1].end, Seg.end);
+ return;
+ }
+
+ // Seg doesn't coalesce with anything, and needs to be inserted somewhere.
+ if (WriteI != ReadI) {
+ *WriteI++ = Seg;
+ return;
+ }
+
+ // Finally, append to LR or Spills.
+ if (WriteI == E) {
+ LR->segments.push_back(Seg);
+ WriteI = ReadI = LR->end();
+ } else
+ Spills.push_back(Seg);
+}
+
+// Merge as many spilled segments as possible into the gap between WriteI
+// and ReadI. Advance WriteI to reflect the inserted segments.
+void LiveRangeUpdater::mergeSpills() {
+ // Perform a backwards merge of Spills and the area 1 segments in [begin;WriteI).
+ size_t GapSize = ReadI - WriteI;
+ size_t NumMoved = std::min(Spills.size(), GapSize);
+ LiveRange::iterator Src = WriteI;
+ LiveRange::iterator Dst = Src + NumMoved;
+ LiveRange::iterator SpillSrc = Spills.end();
+ LiveRange::iterator B = LR->begin();
+
+ // This is the new WriteI position after merging spills.
+ WriteI = Dst;
+
+ // Now merge Src and Spills backwards.
+ while (Src != Dst) {
+ if (Src != B && Src[-1].start > SpillSrc[-1].start)
+ *--Dst = *--Src;
+ else
+ *--Dst = *--SpillSrc;
+ }
+ assert(NumMoved == size_t(Spills.end() - SpillSrc));
+ Spills.erase(SpillSrc, Spills.end());
+}
+
+void LiveRangeUpdater::flush() {
+ if (!isDirty())
+ return;
+ // Clear the dirty state.
+ LastStart = SlotIndex();
+
+ assert(LR && "Cannot add to a null destination");
+
+ // Nothing to merge?
+ if (Spills.empty()) {
+ LR->segments.erase(WriteI, ReadI);
+ LR->verify();
+ return;
+ }
+
+ // Resize the WriteI - ReadI gap to match Spills.
+ size_t GapSize = ReadI - WriteI;
+ if (GapSize < Spills.size()) {
+ // The gap is too small. Make some room.
+ size_t WritePos = WriteI - LR->begin();
+ LR->segments.insert(ReadI, Spills.size() - GapSize, LiveRange::Segment());
+ // This also invalidated ReadI, but it is recomputed below.
+ WriteI = LR->begin() + WritePos;
+ } else {
+ // Shrink the gap if necessary.
+ LR->segments.erase(WriteI + Spills.size(), ReadI);
+ }
+ ReadI = WriteI + Spills.size();
+ mergeSpills();
+ LR->verify();
+}
+
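+// Compute the connected components of the values in LR. Two values end up in
+// the same class when liveness can flow between them: a PHI-def joins the
+// values live out of its predecessor blocks, and a non-PHI value joins the
+// value live immediately before its def (e.g. a two-address redefinition).
+// Unused values are grouped together and folded into the class of a used
+// value when one exists. Returns the number of classes.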
+unsigned ConnectedVNInfoEqClasses::Classify(const LiveRange &LR) {
+ // Create initial equivalence classes.
+ EqClass.clear();
+ EqClass.grow(LR.getNumValNums());
+
+ const VNInfo *used = nullptr, *unused = nullptr;
+
+ // Determine connections.
+ for (const VNInfo *VNI : LR.valnos) {
+ // Group all unused values into one class.
+ if (VNI->isUnused()) {
+ if (unused)
+ EqClass.join(unused->id, VNI->id);
+ unused = VNI;
+ continue;
+ }
+ used = VNI;
+ if (VNI->isPHIDef()) {
+ const MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def);
+ assert(MBB && "Phi-def has no defining MBB");
+ // Connect to values live out of predecessors.
+ for (MachineBasicBlock *Pred : MBB->predecessors())
+ if (const VNInfo *PVNI = LR.getVNInfoBefore(LIS.getMBBEndIdx(Pred)))
+ EqClass.join(VNI->id, PVNI->id);
+ } else {
+ // Normal value defined by an instruction. Check for two-addr redef.
+ // FIXME: This could be coincidental. Should we really check for a tied
+ // operand constraint?
+ // Note that VNI->def may be a use slot for an early clobber def.
+ if (const VNInfo *UVNI = LR.getVNInfoBefore(VNI->def))
+ EqClass.join(VNI->id, UVNI->id);
+ }
+ }
+
+ // Lump all the unused values in with the last used value.
+ if (used && unused)
+ EqClass.join(used->id, unused->id);
+
+ EqClass.compress();
+ return EqClass.getNumClasses();
+}
+
+void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[],
+ MachineRegisterInfo &MRI) {
+ // Rewrite instructions.
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI.reg_operands(LI.reg()))) {
+ MachineInstr *MI = MO.getParent();
+ const VNInfo *VNI;
+ if (MI->isDebugValue()) {
+ // DBG_VALUE instructions don't have slot indexes, so get the index of
+ // the instruction before them. The value is defined there too.
+ SlotIndex Idx = LIS.getSlotIndexes()->getIndexBefore(*MI);
+ VNI = LI.Query(Idx).valueOut();
+ } else {
+ SlotIndex Idx = LIS.getInstructionIndex(*MI);
+ LiveQueryResult LRQ = LI.Query(Idx);
+ VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined();
+ }
+ // In the case of an <undef> use that isn't tied to any def, VNI will be
+ // NULL. If the use is tied to a def, VNI will be the defined value.
+ if (!VNI)
+ continue;
+ if (unsigned EqClass = getEqClass(VNI))
+ MO.setReg(LIV[EqClass - 1]->reg());
+ }
+
+ // Distribute subregister liveranges.
+ if (LI.hasSubRanges()) {
+ unsigned NumComponents = EqClass.getNumClasses();
+ SmallVector<unsigned, 8> VNIMapping;
+ SmallVector<LiveInterval::SubRange*, 8> SubRanges;
+ BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ // Create new subranges in the split intervals and construct a mapping
+ // for the VNInfos in the subrange.
+ unsigned NumValNos = SR.valnos.size();
+ VNIMapping.clear();
+ VNIMapping.reserve(NumValNos);
+ SubRanges.clear();
+ SubRanges.resize(NumComponents-1, nullptr);
+ for (unsigned I = 0; I < NumValNos; ++I) {
+ const VNInfo &VNI = *SR.valnos[I];
+ unsigned ComponentNum;
+ if (VNI.isUnused()) {
+ ComponentNum = 0;
+ } else {
+ const VNInfo *MainRangeVNI = LI.getVNInfoAt(VNI.def);
+ assert(MainRangeVNI != nullptr
+ && "SubRange def must have corresponding main range def");
+ ComponentNum = getEqClass(MainRangeVNI);
+ if (ComponentNum > 0 && SubRanges[ComponentNum-1] == nullptr) {
+ SubRanges[ComponentNum-1]
+ = LIV[ComponentNum-1]->createSubRange(Allocator, SR.LaneMask);
+ }
+ }
+ VNIMapping.push_back(ComponentNum);
+ }
+ DistributeRange(SR, SubRanges.data(), VNIMapping);
+ }
+ LI.removeEmptySubRanges();
+ }
+
+ // Distribute main liverange.
+ DistributeRange(LI, LIV, EqClass);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervalCalc.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervalCalc.cpp
new file mode 100644
index 000000000000..ccc5ae98086e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervalCalc.cpp
@@ -0,0 +1,196 @@
+//===- LiveIntervalCalc.cpp - Calculate live interval --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the LiveIntervalCalc class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveIntervalCalc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+// Reserve an address that indicates a value that is known to be "undef".
+static VNInfo UndefVNI(0xbad, SlotIndex());
+
+static void createDeadDef(SlotIndexes &Indexes, VNInfo::Allocator &Alloc,
+ LiveRange &LR, const MachineOperand &MO) {
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex DefIdx =
+ Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber());
+
+ // Create the def in LR. This may find an existing def.
+ LR.createDeadDef(DefIdx, Alloc);
+}
+
+void LiveIntervalCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
+ const MachineRegisterInfo *MRI = getRegInfo();
+ SlotIndexes *Indexes = getIndexes();
+ VNInfo::Allocator *Alloc = getVNAlloc();
+
+ assert(MRI && Indexes && "call reset() first");
+
+ // Step 1: Create minimal live segments for every definition of Reg.
+ // Visit all def operands. If the same instruction has multiple defs of Reg,
+ // createDeadDef() will deduplicate.
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ Register Reg = LI.reg();
+ for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (LI.hasSubRanges() || (SubReg != 0 && TrackSubRegs)) {
+ LaneBitmask SubMask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ // If this is the first time we see a subregister def, initialize
+ // subranges by creating a copy of the main range.
+ if (!LI.hasSubRanges() && !LI.empty()) {
+ LaneBitmask ClassMask = MRI->getMaxLaneMaskForVReg(Reg);
+ LI.createSubRangeFrom(*Alloc, ClassMask, LI);
+ }
+
+ LI.refineSubRanges(
+ *Alloc, SubMask,
+ [&MO, Indexes, Alloc](LiveInterval::SubRange &SR) {
+ if (MO.isDef())
+ createDeadDef(*Indexes, *Alloc, SR, MO);
+ },
+ *Indexes, TRI);
+ }
+
+ // Create the def in the main liverange. We do not have to do this if
+ // subranges are tracked as we recreate the main range later in this case.
+ if (MO.isDef() && !LI.hasSubRanges())
+ createDeadDef(*Indexes, *Alloc, LI, MO);
+ }
+
+ // We may have created empty live ranges for partially undefined uses; we
+ // can't keep them because we won't find defs in them later.
+ LI.removeEmptySubRanges();
+
+ const MachineFunction *MF = getMachineFunction();
+ MachineDominatorTree *DomTree = getDomTree();
+ // Step 2: Extend live segments to all uses, constructing SSA form as
+ // necessary.
+ if (LI.hasSubRanges()) {
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ LiveIntervalCalc SubLIC;
+ SubLIC.reset(MF, Indexes, DomTree, Alloc);
+ SubLIC.extendToUses(S, Reg, S.LaneMask, &LI);
+ }
+ LI.clear();
+ constructMainRangeFromSubranges(LI);
+ } else {
+ resetLiveOutMap();
+ extendToUses(LI, Reg, LaneBitmask::getAll());
+ }
+}
+
+void LiveIntervalCalc::constructMainRangeFromSubranges(LiveInterval &LI) {
+ // First create dead defs at all defs found in subranges.
+ LiveRange &MainRange = LI;
+ assert(MainRange.segments.empty() && MainRange.valnos.empty() &&
+ "Expect empty main liverange");
+
+ VNInfo::Allocator *Alloc = getVNAlloc();
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ for (const VNInfo *VNI : SR.valnos) {
+ if (!VNI->isUnused() && !VNI->isPHIDef())
+ MainRange.createDeadDef(VNI->def, *Alloc);
+ }
+ }
+ resetLiveOutMap();
+ extendToUses(MainRange, LI.reg(), LaneBitmask::getAll(), &LI);
+}
+
+void LiveIntervalCalc::createDeadDefs(LiveRange &LR, Register Reg) {
+ const MachineRegisterInfo *MRI = getRegInfo();
+ SlotIndexes *Indexes = getIndexes();
+ VNInfo::Allocator *Alloc = getVNAlloc();
+ assert(MRI && Indexes && "call reset() first");
+
+ // Visit all def operands. If the same instruction has multiple defs of Reg,
+ // LR.createDeadDef() will deduplicate.
+ for (MachineOperand &MO : MRI->def_operands(Reg))
+ createDeadDef(*Indexes, *Alloc, LR, MO);
+}
+
+void LiveIntervalCalc::extendToUses(LiveRange &LR, Register Reg,
+ LaneBitmask Mask, LiveInterval *LI) {
+ const MachineRegisterInfo *MRI = getRegInfo();
+ SlotIndexes *Indexes = getIndexes();
+ SmallVector<SlotIndex, 4> Undefs;
+ if (LI != nullptr)
+ LI->computeSubRangeUndefs(Undefs, Mask, *MRI, *Indexes);
+
+ // Visit all operands that read Reg. This may include partial defs.
+ bool IsSubRange = !Mask.all();
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ // Clear all kill flags. They will be reinserted after register allocation
+ // by LiveIntervals::addKillFlags().
+ if (MO.isUse())
+ MO.setIsKill(false);
+ // MO::readsReg returns "true" for subregister defs. This is for keeping
+ // liveness of the entire register (i.e. for the main range of the live
+ // interval). For subranges, definitions of non-overlapping subregisters
+ // do not count as uses.
+ if (!MO.readsReg() || (IsSubRange && MO.isDef()))
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ LaneBitmask SLM = TRI.getSubRegIndexLaneMask(SubReg);
+ if (MO.isDef())
+ SLM = ~SLM;
+ // Ignore uses not reading the current (sub)range.
+ if ((SLM & Mask).none())
+ continue;
+ }
+
+ // Determine the actual place of the use.
+ const MachineInstr *MI = MO.getParent();
+ unsigned OpNo = (&MO - &MI->getOperand(0));
+ SlotIndex UseIdx;
+ if (MI->isPHI()) {
+ assert(!MO.isDef() && "Cannot handle PHI def of partial register.");
+ // The actual place where a phi operand is used is the end of the pred
+ // MBB. PHI operands are paired: (Reg, PredMBB).
+ UseIdx = Indexes->getMBBEndIdx(MI->getOperand(OpNo + 1).getMBB());
+ } else {
+ // Check for early-clobber redefs.
+ bool isEarlyClobber = false;
+ unsigned DefIdx;
+ if (MO.isDef())
+ isEarlyClobber = MO.isEarlyClobber();
+ else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) {
+ // FIXME: This would be a lot easier if tied early-clobber uses also
+ // had an early-clobber flag.
+ isEarlyClobber = MI->getOperand(DefIdx).isEarlyClobber();
+ }
+ UseIdx = Indexes->getInstructionIndex(*MI).getRegSlot(isEarlyClobber);
+ }
+
+ // MI is reading Reg. We may have visited MI before if it happens to be
+ // reading Reg multiple times. That is OK, extend() is idempotent.
+ extend(LR, UseIdx, Reg, Undefs);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervalUnion.cpp
new file mode 100644
index 000000000000..11a4ecf0bef9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -0,0 +1,215 @@
+//===- LiveIntervalUnion.cpp - Live interval union data structure ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// LiveIntervalUnion represents a coalesced set of live intervals. This may be
+// used during coalescing to represent a congruence class, or during register
+// allocation to model liveness of a physical register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdlib>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+// Merge a LiveInterval's segments. Guarantee no overlaps.
+void LiveIntervalUnion::unify(const LiveInterval &VirtReg,
+ const LiveRange &Range) {
+ if (Range.empty())
+ return;
+ ++Tag;
+
+ // Insert each of the virtual register's live segments into the map.
+ LiveRange::const_iterator RegPos = Range.begin();
+ LiveRange::const_iterator RegEnd = Range.end();
+ SegmentIter SegPos = Segments.find(RegPos->start);
+
+ while (SegPos.valid()) {
+ SegPos.insert(RegPos->start, RegPos->end, &VirtReg);
+ if (++RegPos == RegEnd)
+ return;
+ SegPos.advanceTo(RegPos->start);
+ }
+
+ // We have reached the end of Segments, so it is no longer necessary to search
+ // for the insertion position.
+ // It is faster to insert the end first.
+ --RegEnd;
+ SegPos.insert(RegEnd->start, RegEnd->end, &VirtReg);
+ for (; RegPos != RegEnd; ++RegPos, ++SegPos)
+ SegPos.insert(RegPos->start, RegPos->end, &VirtReg);
+}
+
+// Remove a live virtual register's segments from this union.
+void LiveIntervalUnion::extract(const LiveInterval &VirtReg,
+ const LiveRange &Range) {
+ if (Range.empty())
+ return;
+ ++Tag;
+
+ // Remove each of the virtual register's live segments from the map.
+ LiveRange::const_iterator RegPos = Range.begin();
+ LiveRange::const_iterator RegEnd = Range.end();
+ SegmentIter SegPos = Segments.find(RegPos->start);
+
+ while (true) {
+ assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval");
+ SegPos.erase();
+ if (!SegPos.valid())
+ return;
+
+ // Skip all segments that may have been coalesced.
+ RegPos = Range.advanceTo(RegPos, SegPos.start());
+ if (RegPos == RegEnd)
+ return;
+
+ SegPos.advanceTo(RegPos->start);
+ }
+}
+
+void
+LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
+ if (empty()) {
+ OS << " empty\n";
+ return;
+ }
+ for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) {
+ OS << " [" << SI.start() << ' ' << SI.stop()
+ << "):" << printReg(SI.value()->reg(), TRI);
+ }
+ OS << '\n';
+}
+
+#ifndef NDEBUG
+// Verify the live intervals in this union and add them to the visited set.
+void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
+ for (SegmentIter SI = Segments.begin(); SI.valid(); ++SI)
+ VisitedVRegs.set(SI.value()->reg());
+}
+#endif //!NDEBUG
+
+const LiveInterval *LiveIntervalUnion::getOneVReg() const {
+ if (empty())
+ return nullptr;
+ for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) {
+ // return the first valid live interval
+ return SI.value();
+ }
+ return nullptr;
+}
+
+// Scan the vector of interfering virtual registers in this union. Assume it's
+// quite small.
+bool LiveIntervalUnion::Query::isSeenInterference(
+ const LiveInterval *VirtReg) const {
+ return is_contained(InterferingVRegs, VirtReg);
+}
+
+// Collect virtual registers in this union that interfere with this
+// query's live virtual register.
+//
+// The query state is one of:
+//
+// 1. CheckedFirstInterference == false: Iterators are uninitialized.
+// 2. SeenAllInterferences == true: InterferingVRegs complete, iterators unused.
+// 3. Iterators left at the last seen intersection.
+//
+unsigned
+LiveIntervalUnion::Query::collectInterferingVRegs(unsigned MaxInterferingRegs) {
+ // Fast path return if we already have the desired information.
+ if (SeenAllInterferences || InterferingVRegs.size() >= MaxInterferingRegs)
+ return InterferingVRegs.size();
+
+ // Set up iterators on the first call.
+ if (!CheckedFirstInterference) {
+ CheckedFirstInterference = true;
+
+ // Quickly skip interference check for empty sets.
+ if (LR->empty() || LiveUnion->empty()) {
+ SeenAllInterferences = true;
+ return 0;
+ }
+
+ // In most cases, the union will start before LR.
+ LRI = LR->begin();
+ LiveUnionI.setMap(LiveUnion->getMap());
+ LiveUnionI.find(LRI->start);
+ }
+
+ LiveRange::const_iterator LREnd = LR->end();
+ const LiveInterval *RecentReg = nullptr;
+ while (LiveUnionI.valid()) {
+ assert(LRI != LREnd && "Reached end of LR");
+
+ // Check for overlapping interference.
+ while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) {
+ // This is an overlap, record the interfering register.
+ const LiveInterval *VReg = LiveUnionI.value();
+ if (VReg != RecentReg && !isSeenInterference(VReg)) {
+ RecentReg = VReg;
+ InterferingVRegs.push_back(VReg);
+ if (InterferingVRegs.size() >= MaxInterferingRegs)
+ return InterferingVRegs.size();
+ }
+ // This LiveUnion segment is no longer interesting.
+ if (!(++LiveUnionI).valid()) {
+ SeenAllInterferences = true;
+ return InterferingVRegs.size();
+ }
+ }
+
+ // The iterators are now not overlapping; LiveUnionI has been advanced
+ // beyond LRI.
+ assert(LRI->end <= LiveUnionI.start() && "Expected non-overlap");
+
+ // Advance the iterator that ends first.
+ LRI = LR->advanceTo(LRI, LiveUnionI.start());
+ if (LRI == LREnd)
+ break;
+
+ // Detect an overlap; it is handled at the top of the loop.
+ if (LRI->start < LiveUnionI.stop())
+ continue;
+
+ // Still not overlapping. Catch up LiveUnionI.
+ LiveUnionI.advanceTo(LRI->start);
+ }
+ SeenAllInterferences = true;
+ return InterferingVRegs.size();
+}
+
+void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc,
+ unsigned NSize) {
+ // Reuse existing allocation.
+ if (NSize == Size)
+ return;
+ clear();
+ Size = NSize;
+ LIUs = static_cast<LiveIntervalUnion*>(
+ safe_malloc(sizeof(LiveIntervalUnion)*NSize));
+ for (unsigned i = 0; i != Size; ++i)
+ new(LIUs + i) LiveIntervalUnion(Alloc);
+}
+
+void LiveIntervalUnion::Array::clear() {
+ if (!LIUs)
+ return;
+ for (unsigned i = 0; i != Size; ++i)
+ LIUs[i].~LiveIntervalUnion();
+ free(LIUs);
+ Size = 0;
+ LIUs = nullptr;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervals.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervals.cpp
new file mode 100644
index 000000000000..da55e7f7284b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -0,0 +1,1748 @@
+//===- LiveIntervals.cpp - Live Interval Analysis -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the LiveInterval analysis pass which is used
+/// by the Linear Scan Register allocator. This pass linearizes the
+/// basic blocks of the function in DFS order and computes live intervals for
+/// each virtual and physical register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalCalc.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+char LiveIntervals::ID = 0;
+char &llvm::LiveIntervalsID = LiveIntervals::ID;
+INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(LiveIntervals, "liveintervals",
+ "Live Interval Analysis", false, false)
+
+#ifndef NDEBUG
+static cl::opt<bool> EnablePrecomputePhysRegs(
+ "precompute-phys-liveness", cl::Hidden,
+ cl::desc("Eagerly compute live intervals for all physreg units."));
+#else
+static bool EnablePrecomputePhysRegs = false;
+#endif // NDEBUG
+
+namespace llvm {
+
+cl::opt<bool> UseSegmentSetForPhysRegs(
+ "use-segment-set-for-physregs", cl::Hidden, cl::init(true),
+ cl::desc(
+ "Use segment set for the computation of the live ranges of physregs."));
+
+} // end namespace llvm
+
+void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addRequiredTransitiveID(MachineDominatorsID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequiredTransitive<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+LiveIntervals::LiveIntervals() : MachineFunctionPass(ID) {
+ initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+}
+
+LiveIntervals::~LiveIntervals() { delete LICalc; }
+
+void LiveIntervals::releaseMemory() {
+ // Free the live intervals themselves.
+ for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i)
+ delete VirtRegIntervals[Register::index2VirtReg(i)];
+ VirtRegIntervals.clear();
+ RegMaskSlots.clear();
+ RegMaskBits.clear();
+ RegMaskBlocks.clear();
+
+ for (LiveRange *LR : RegUnitRanges)
+ delete LR;
+ RegUnitRanges.clear();
+
+ // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
+ VNInfoAllocator.Reset();
+}
+
+bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
+ MF = &fn;
+ MRI = &MF->getRegInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ Indexes = &getAnalysis<SlotIndexes>();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+
+ if (!LICalc)
+ LICalc = new LiveIntervalCalc();
+
+ // Allocate space for all virtual registers.
+ VirtRegIntervals.resize(MRI->getNumVirtRegs());
+
+ computeVirtRegs();
+ computeRegMasks();
+ computeLiveInRegUnits();
+
+ if (EnablePrecomputePhysRegs) {
+ // For stress testing, precompute live ranges of all physical register
+ // units, including reserved registers.
+ for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
+ getRegUnit(i);
+ }
+ LLVM_DEBUG(dump());
+ return false;
+}
+
+void LiveIntervals::print(raw_ostream &OS, const Module *) const {
+ OS << "********** INTERVALS **********\n";
+
+ // Dump the regunits.
+ for (unsigned Unit = 0, UnitE = RegUnitRanges.size(); Unit != UnitE; ++Unit)
+ if (LiveRange *LR = RegUnitRanges[Unit])
+ OS << printRegUnit(Unit, TRI) << ' ' << *LR << '\n';
+
+ // Dump the virtregs.
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ if (hasInterval(Reg))
+ OS << getInterval(Reg) << '\n';
+ }
+
+ OS << "RegMasks:";
+ for (SlotIndex Idx : RegMaskSlots)
+ OS << ' ' << Idx;
+ OS << '\n';
+
+ printInstrs(OS);
+}
+
+void LiveIntervals::printInstrs(raw_ostream &OS) const {
+ OS << "********** MACHINEINSTRS **********\n";
+ MF->print(OS, Indexes);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const {
+ printInstrs(dbgs());
+}
+#endif
+
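+// Physical registers get an effectively infinite spill weight (huge_valf) so
+// they are never chosen for spilling; virtual registers start with weight
+// zero and get a meaningful weight computed later during allocation.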
+LiveInterval *LiveIntervals::createInterval(Register reg) {
+ float Weight = reg.isPhysical() ? huge_valf : 0.0F;
+ return new LiveInterval(reg, Weight);
+}
+
+/// Compute the live interval of a virtual register, based on defs and uses.
+bool LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
+ assert(LICalc && "LICalc not initialized.");
+ assert(LI.empty() && "Should only compute empty intervals.");
+ LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ LICalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg()));
+ return computeDeadValues(LI, nullptr);
+}
+
+void LiveIntervals::computeVirtRegs() {
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ LiveInterval &LI = createEmptyInterval(Reg);
+ bool NeedSplit = computeVirtRegInterval(LI);
+ if (NeedSplit) {
+ SmallVector<LiveInterval*, 8> SplitLIs;
+ splitSeparateComponents(LI, SplitLIs);
+ }
+ }
+}
+
+void LiveIntervals::computeRegMasks() {
+ RegMaskBlocks.resize(MF->getNumBlockIDs());
+
+ // Find all instructions with regmask operands.
+ for (const MachineBasicBlock &MBB : *MF) {
+ std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()];
+ RMB.first = RegMaskSlots.size();
+
+ // Some block starts, such as EH funclets, create masks.
+ if (const uint32_t *Mask = MBB.getBeginClobberMask(TRI)) {
+ RegMaskSlots.push_back(Indexes->getMBBStartIdx(&MBB));
+ RegMaskBits.push_back(Mask);
+ }
+
+ // Unwinders may clobber additional registers.
+ // FIXME: This functionality can possibly be merged into
+ // MachineBasicBlock::getBeginClobberMask().
+ if (MBB.isEHPad())
+ if (auto *Mask = TRI->getCustomEHPadPreservedMask(*MBB.getParent())) {
+ RegMaskSlots.push_back(Indexes->getMBBStartIdx(&MBB));
+ RegMaskBits.push_back(Mask);
+ }
+
+ for (const MachineInstr &MI : MBB) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isRegMask())
+ continue;
+ RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot());
+ RegMaskBits.push_back(MO.getRegMask());
+ }
+ }
+
+ // Some block ends, such as funclet returns, create masks. Put the mask on
+ // the last instruction of the block, because MBB slot index intervals are
+ // half-open.
+ if (const uint32_t *Mask = MBB.getEndClobberMask(TRI)) {
+ assert(!MBB.empty() && "empty return block?");
+ RegMaskSlots.push_back(
+ Indexes->getInstructionIndex(MBB.back()).getRegSlot());
+ RegMaskBits.push_back(Mask);
+ }
+
+ // Compute the number of register mask instructions in this block.
+ RMB.second = RegMaskSlots.size() - RMB.first;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Register Unit Liveness
+//===----------------------------------------------------------------------===//
+//
+// Fixed interference typically comes from ABI boundaries: Function arguments
+// and return values are passed in fixed registers, and so are exception
+// pointers entering landing pads. Certain instructions require values to be
+// present in specific registers. That is also represented through fixed
+// interference.
+//
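+// As an illustration, a value returned by a call is produced in a fixed
+// register chosen by the calling convention and is usually copied into a
+// virtual register immediately afterwards; the short live segment on that
+// physical register's units is what the allocator sees as fixed interference.
+//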
+
+/// Compute the live range of a register unit, based on the uses and defs of
+/// aliasing registers. The range should be empty, or contain only dead
+/// phi-defs from ABI blocks.
+void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
+ assert(LICalc && "LICalc not initialized.");
+ LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+
+ // The physregs aliasing Unit are the roots and their super-registers.
+ // Create all values as dead defs before extending to uses. Note that roots
+ // may share super-registers. That's OK because createDeadDefs() is
+ // idempotent. It is very rare for a register unit to have multiple roots, so
+ // uniquing super-registers is probably not worthwhile.
+ bool IsReserved = false;
+ for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
+ bool IsRootReserved = true;
+ for (MCPhysReg Reg : TRI->superregs_inclusive(*Root)) {
+ if (!MRI->reg_empty(Reg))
+ LICalc->createDeadDefs(LR, Reg);
+ // A register unit is considered reserved if all its roots and all their
+ // super registers are reserved.
+ if (!MRI->isReserved(Reg))
+ IsRootReserved = false;
+ }
+ IsReserved |= IsRootReserved;
+ }
+ assert(IsReserved == MRI->isReservedRegUnit(Unit) &&
+ "reserved computation mismatch");
+
+ // Now extend LR to reach all uses.
+ // Ignore uses of reserved registers. We only track defs of those.
+ if (!IsReserved) {
+ for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
+ for (MCPhysReg Reg : TRI->superregs_inclusive(*Root)) {
+ if (!MRI->reg_empty(Reg))
+ LICalc->extendToUses(LR, Reg);
+ }
+ }
+ }
+
+ // Flush the segment set to the segment vector.
+ if (UseSegmentSetForPhysRegs)
+ LR.flushSegmentSet();
+}
+
+/// Precompute the live ranges of any register units that are live-in to an ABI
+/// block somewhere. Register values can appear without a corresponding def when
+/// entering the entry block or a landing pad.
+void LiveIntervals::computeLiveInRegUnits() {
+ RegUnitRanges.resize(TRI->getNumRegUnits());
+ LLVM_DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
+
+ // Keep track of the live range sets allocated.
+ SmallVector<unsigned, 8> NewRanges;
+
+ // Check all basic blocks for live-ins.
+ for (const MachineBasicBlock &MBB : *MF) {
+ // We only care about ABI blocks: Entry + landing pads.
+ if ((&MBB != &MF->front() && !MBB.isEHPad()) || MBB.livein_empty())
+ continue;
+
+ // Create phi-defs at Begin for all live-in registers.
+ SlotIndex Begin = Indexes->getMBBStartIdx(&MBB);
+ LLVM_DEBUG(dbgs() << Begin << "\t" << printMBBReference(MBB));
+ for (const auto &LI : MBB.liveins()) {
+ for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) {
+ LiveRange *LR = RegUnitRanges[Unit];
+ if (!LR) {
+ // Use a segment set to speed up the initial computation of the live range.
+ LR = RegUnitRanges[Unit] = new LiveRange(UseSegmentSetForPhysRegs);
+ NewRanges.push_back(Unit);
+ }
+ VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator());
+ (void)VNI;
+ LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << '#' << VNI->id);
+ }
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+ }
+ LLVM_DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
+
+ // Compute the 'normal' part of the ranges.
+ for (unsigned Unit : NewRanges)
+ computeRegUnitRange(*RegUnitRanges[Unit], Unit);
+}
+
+static void createSegmentsForValues(LiveRange &LR,
+ iterator_range<LiveInterval::vni_iterator> VNIs) {
+ for (VNInfo *VNI : VNIs) {
+ if (VNI->isUnused())
+ continue;
+ SlotIndex Def = VNI->def;
+ LR.addSegment(LiveRange::Segment(Def, Def.getDeadSlot(), VNI));
+ }
+}
+
+void LiveIntervals::extendSegmentsToUses(LiveRange &Segments,
+ ShrinkToUsesWorkList &WorkList,
+ Register Reg, LaneBitmask LaneMask) {
+ // Keep track of the PHIs that are in use.
+ SmallPtrSet<VNInfo*, 8> UsedPHIs;
+ // Blocks that have already been added to WorkList as live-out.
+ SmallPtrSet<const MachineBasicBlock*, 16> LiveOut;
+
+ auto getSubRange = [](const LiveInterval &I, LaneBitmask M)
+ -> const LiveRange& {
+ if (M.none())
+ return I;
+ for (const LiveInterval::SubRange &SR : I.subranges()) {
+ if ((SR.LaneMask & M).any()) {
+ assert(SR.LaneMask == M && "Expecting lane masks to match exactly");
+ return SR;
+ }
+ }
+ llvm_unreachable("Subrange for mask not found");
+ };
+
+ const LiveInterval &LI = getInterval(Reg);
+ const LiveRange &OldRange = getSubRange(LI, LaneMask);
+
+ // Extend intervals to reach all uses in WorkList.
+ while (!WorkList.empty()) {
+ SlotIndex Idx = WorkList.back().first;
+ VNInfo *VNI = WorkList.back().second;
+ WorkList.pop_back();
+ const MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Idx.getPrevSlot());
+ SlotIndex BlockStart = Indexes->getMBBStartIdx(MBB);
+
+ // Extend the live range for VNI to be live at Idx.
+ if (VNInfo *ExtVNI = Segments.extendInBlock(BlockStart, Idx)) {
+ assert(ExtVNI == VNI && "Unexpected existing value number");
+ (void)ExtVNI;
+ // Is this a PHIDef we haven't seen before?
+ if (!VNI->isPHIDef() || VNI->def != BlockStart ||
+ !UsedPHIs.insert(VNI).second)
+ continue;
+ // The PHI is live, make sure the predecessors are live-out.
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (!LiveOut.insert(Pred).second)
+ continue;
+ SlotIndex Stop = Indexes->getMBBEndIdx(Pred);
+ // A predecessor is not required to have a live-out value for a PHI.
+ if (VNInfo *PVNI = OldRange.getVNInfoBefore(Stop))
+ WorkList.push_back(std::make_pair(Stop, PVNI));
+ }
+ continue;
+ }
+
+ // VNI is live-in to MBB.
+ LLVM_DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
+ Segments.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
+
+ // Make sure VNI is live-out from the predecessors.
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (!LiveOut.insert(Pred).second)
+ continue;
+ SlotIndex Stop = Indexes->getMBBEndIdx(Pred);
+ if (VNInfo *OldVNI = OldRange.getVNInfoBefore(Stop)) {
+ assert(OldVNI == VNI && "Wrong value out of predecessor");
+ (void)OldVNI;
+ WorkList.push_back(std::make_pair(Stop, VNI));
+ } else {
+#ifndef NDEBUG
+ // There was no old VNI. Verify that Stop is jointly dominated
+ // by <undef>s for this live range.
+ assert(LaneMask.any() &&
+ "Missing value out of predecessor for main range");
+ SmallVector<SlotIndex,8> Undefs;
+ LI.computeSubRangeUndefs(Undefs, LaneMask, *MRI, *Indexes);
+ assert(LiveRangeCalc::isJointlyDominated(Pred, Undefs, *Indexes) &&
+ "Missing value out of predecessor for subrange");
+#endif
+ }
+ }
+ }
+}
+
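+// Shrinking rebuilds a virtual register's interval from its remaining uses:
+// every value starts out as a minimal dead-def segment, the segments are then
+// extended to reach each use that is still present, and values that end up
+// dead are reported through computeDeadValues() (and the optional 'dead'
+// instruction list).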
+bool LiveIntervals::shrinkToUses(LiveInterval *li,
+ SmallVectorImpl<MachineInstr*> *dead) {
+ LLVM_DEBUG(dbgs() << "Shrink: " << *li << '\n');
+ assert(li->reg().isVirtual() && "Can only shrink virtual registers");
+
+ // Shrink subregister live ranges.
+ bool NeedsCleanup = false;
+ for (LiveInterval::SubRange &S : li->subranges()) {
+ shrinkToUses(S, li->reg());
+ if (S.empty())
+ NeedsCleanup = true;
+ }
+ if (NeedsCleanup)
+ li->removeEmptySubRanges();
+
+ // Find all the values used, including PHI kills.
+ ShrinkToUsesWorkList WorkList;
+
+ // Visit all instructions reading li->reg().
+ Register Reg = li->reg();
+ for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) {
+ if (UseMI.isDebugInstr() || !UseMI.readsVirtualRegister(Reg))
+ continue;
+ SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
+ LiveQueryResult LRQ = li->Query(Idx);
+ VNInfo *VNI = LRQ.valueIn();
+ if (!VNI) {
+ // This shouldn't happen: readsVirtualRegister returns true, but there is
+ // no live value. It is likely caused by a target getting <undef> flags
+ // wrong.
+ LLVM_DEBUG(
+ dbgs() << Idx << '\t' << UseMI
+ << "Warning: Instr claims to read non-existent value in "
+ << *li << '\n');
+ continue;
+ }
+ // Special case: An early-clobber tied operand reads and writes the
+ // register one slot early.
+ if (VNInfo *DefVNI = LRQ.valueDefined())
+ Idx = DefVNI->def;
+
+ WorkList.push_back(std::make_pair(Idx, VNI));
+ }
+
+ // Create new live ranges with only minimal live segments per def.
+ LiveRange NewLR;
+ createSegmentsForValues(NewLR, li->vnis());
+ extendSegmentsToUses(NewLR, WorkList, Reg, LaneBitmask::getNone());
+
+ // Move the trimmed segments back.
+ li->segments.swap(NewLR.segments);
+
+ // Handle dead values.
+ bool CanSeparate = computeDeadValues(*li, dead);
+ LLVM_DEBUG(dbgs() << "Shrunk: " << *li << '\n');
+ return CanSeparate;
+}
+
+bool LiveIntervals::computeDeadValues(LiveInterval &LI,
+ SmallVectorImpl<MachineInstr*> *dead) {
+ bool MayHaveSplitComponents = false;
+
+ for (VNInfo *VNI : LI.valnos) {
+ if (VNI->isUnused())
+ continue;
+ SlotIndex Def = VNI->def;
+ LiveRange::iterator I = LI.FindSegmentContaining(Def);
+ assert(I != LI.end() && "Missing segment for VNI");
+
+ // Is the register live before? Otherwise we may have to add a read-undef
+ // flag for subregister defs.
+ Register VReg = LI.reg();
+ if (MRI->shouldTrackSubRegLiveness(VReg)) {
+ if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) {
+ MachineInstr *MI = getInstructionFromIndex(Def);
+ MI->setRegisterDefReadUndef(VReg);
+ }
+ }
+
+ if (I->end != Def.getDeadSlot())
+ continue;
+ if (VNI->isPHIDef()) {
+ // This is a dead PHI. Remove it.
+ VNI->markUnused();
+ LI.removeSegment(I);
+ LLVM_DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n");
+ } else {
+ // This is a dead def. Make sure the instruction knows.
+ MachineInstr *MI = getInstructionFromIndex(Def);
+ assert(MI && "No instruction defining live value");
+ MI->addRegisterDead(LI.reg(), TRI);
+
+ if (dead && MI->allDefsAreDead()) {
+ LLVM_DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI);
+ dead->push_back(MI);
+ }
+ }
+ MayHaveSplitComponents = true;
+ }
+ return MayHaveSplitComponents;
+}
+
+void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, Register Reg) {
+ LLVM_DEBUG(dbgs() << "Shrink: " << SR << '\n');
+ assert(Reg.isVirtual() && "Can only shrink virtual registers");
+ // Find all the values used, including PHI kills.
+ ShrinkToUsesWorkList WorkList;
+
+ // Visit all instructions reading Reg.
+ SlotIndex LastIdx;
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ // Skip "undef" uses.
+ if (!MO.readsReg())
+ continue;
+ // Maybe the operand is for a subregister we don't care about.
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg);
+ if ((LaneMask & SR.LaneMask).none())
+ continue;
+ }
+ // We only need to visit each instruction once.
+ MachineInstr *UseMI = MO.getParent();
+ SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot();
+ if (Idx == LastIdx)
+ continue;
+ LastIdx = Idx;
+
+ LiveQueryResult LRQ = SR.Query(Idx);
+ VNInfo *VNI = LRQ.valueIn();
+ // For subranges it is possible that only undef values are left in that
+ // part of the subregister, so there is no real live range at the use.
+ if (!VNI)
+ continue;
+
+ // Special case: An early-clobber tied operand reads and writes the
+ // register one slot early.
+ if (VNInfo *DefVNI = LRQ.valueDefined())
+ Idx = DefVNI->def;
+
+ WorkList.push_back(std::make_pair(Idx, VNI));
+ }
+
+ // Create a new live range with only minimal live segments per def.
+ LiveRange NewLR;
+ createSegmentsForValues(NewLR, SR.vnis());
+ extendSegmentsToUses(NewLR, WorkList, Reg, SR.LaneMask);
+
+ // Move the trimmed ranges back.
+ SR.segments.swap(NewLR.segments);
+
+ // Remove dead PHI value numbers
+ for (VNInfo *VNI : SR.valnos) {
+ if (VNI->isUnused())
+ continue;
+ const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def);
+ assert(Segment != nullptr && "Missing segment for VNI");
+ if (Segment->end != VNI->def.getDeadSlot())
+ continue;
+ if (VNI->isPHIDef()) {
+ // This is a dead PHI. Remove it.
+ LLVM_DEBUG(dbgs() << "Dead PHI at " << VNI->def
+ << " may separate interval\n");
+ VNI->markUnused();
+ SR.removeSegment(*Segment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Shrunk: " << SR << '\n');
+}
+
+void LiveIntervals::extendToIndices(LiveRange &LR,
+ ArrayRef<SlotIndex> Indices,
+ ArrayRef<SlotIndex> Undefs) {
+ assert(LICalc && "LICalc not initialized.");
+ LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ for (SlotIndex Idx : Indices)
+ LICalc->extend(LR, Idx, /*PhysReg=*/0, Undefs);
+}
+
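+// pruneValue() cuts the liveness of the value live at Kill from Kill onwards,
+// following the CFG into every block that the value reaches. The slot indexes
+// where liveness was truncated are collected in EndPoints so a caller can
+// re-extend the range later, e.g. with extendToIndices().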
+void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
+ SmallVectorImpl<SlotIndex> *EndPoints) {
+ LiveQueryResult LRQ = LR.Query(Kill);
+ VNInfo *VNI = LRQ.valueOutOrDead();
+ if (!VNI)
+ return;
+
+ MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill);
+ SlotIndex MBBEnd = Indexes->getMBBEndIdx(KillMBB);
+
+ // If VNI isn't live out from KillMBB, the value is trivially pruned.
+ if (LRQ.endPoint() < MBBEnd) {
+ LR.removeSegment(Kill, LRQ.endPoint());
+ if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+ return;
+ }
+
+ // VNI is live out of KillMBB.
+ LR.removeSegment(Kill, MBBEnd);
+ if (EndPoints) EndPoints->push_back(MBBEnd);
+
+ // Find all blocks that are reachable from KillMBB without leaving VNI's live
+ // range. It is possible that KillMBB itself is reachable, so start a DFS
+ // from each successor.
+ using VisitedTy = df_iterator_default_set<MachineBasicBlock*,9>;
+ VisitedTy Visited;
+ for (MachineBasicBlock *Succ : KillMBB->successors()) {
+ for (df_ext_iterator<MachineBasicBlock*, VisitedTy>
+ I = df_ext_begin(Succ, Visited), E = df_ext_end(Succ, Visited);
+ I != E;) {
+ MachineBasicBlock *MBB = *I;
+
+ // Check if VNI is live in to MBB.
+ SlotIndex MBBStart, MBBEnd;
+ std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
+ LiveQueryResult LRQ = LR.Query(MBBStart);
+ if (LRQ.valueIn() != VNI) {
+ // This block isn't part of the VNI segment. Prune the search.
+ I.skipChildren();
+ continue;
+ }
+
+ // Prune the search if VNI is killed in MBB.
+ if (LRQ.endPoint() < MBBEnd) {
+ LR.removeSegment(MBBStart, LRQ.endPoint());
+ if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+ I.skipChildren();
+ continue;
+ }
+
+ // VNI is live through MBB.
+ LR.removeSegment(MBBStart, MBBEnd);
+ if (EndPoints) EndPoints->push_back(MBBEnd);
+ ++I;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Register allocator hooks.
+//
+
+void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
+ // Keep track of regunit ranges.
+ SmallVector<std::pair<const LiveRange*, LiveRange::const_iterator>, 8> RU;
+
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ const LiveInterval &LI = getInterval(Reg);
+ if (LI.empty())
+ continue;
+
+ // The target may not have allocated this yet.
+ Register PhysReg = VRM->getPhys(Reg);
+ if (!PhysReg)
+ continue;
+
+ // Find the regunit intervals for the assigned register. They may overlap
+ // the virtual register live range, cancelling any kills.
+ RU.clear();
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ const LiveRange &RURange = getRegUnit(Unit);
+ if (RURange.empty())
+ continue;
+ RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end)));
+ }
+ // Every instruction that kills Reg corresponds to a segment range end
+ // point.
+ for (LiveInterval::const_iterator RI = LI.begin(), RE = LI.end(); RI != RE;
+ ++RI) {
+ // A block index indicates an MBB edge.
+ if (RI->end.isBlock())
+ continue;
+ MachineInstr *MI = getInstructionFromIndex(RI->end);
+ if (!MI)
+ continue;
+
+ // Check if any of the regunits are live beyond the end of RI. That could
+ // happen when a physreg is defined as a copy of a virtreg:
+ //
+ // %eax = COPY %5
+ // FOO %5 <--- MI, cancel kill because %eax is live.
+ // BAR killed %eax
+ //
+ // There should be no kill flag on FOO when %5 is rewritten as %eax.
+ for (auto &RUP : RU) {
+ const LiveRange &RURange = *RUP.first;
+ LiveRange::const_iterator &I = RUP.second;
+ if (I == RURange.end())
+ continue;
+ I = RURange.advanceTo(I, RI->end);
+ if (I == RURange.end() || I->start >= RI->end)
+ continue;
+ // I is overlapping RI.
+ goto CancelKill;
+ }
+
+ if (MRI->subRegLivenessEnabled()) {
+ // When reading a partial undefined value we must not add a kill flag.
+ // The regalloc might have used the undef lane for something else.
+ // Example:
+ // %1 = ... ; R32: %1
+ // %2:high16 = ... ; R64: %2
+ // = read killed %2 ; R64: %2
+ // = read %1 ; R32: %1
+ // The <kill> flag is correct for %2, but the register allocator may
+ // assign R0L to %1, and R0 to %2 because the low 32bits of R0
+ // are actually never written by %2. After assignment the <kill>
+ // flag at the read instruction is invalid.
+ LaneBitmask DefinedLanesMask;
+ if (LI.hasSubRanges()) {
+ // Compute a mask of lanes that are defined.
+ DefinedLanesMask = LaneBitmask::getNone();
+ for (const LiveInterval::SubRange &SR : LI.subranges())
+ for (const LiveRange::Segment &Segment : SR.segments) {
+ if (Segment.start >= RI->end)
+ break;
+ if (Segment.end == RI->end) {
+ DefinedLanesMask |= SR.LaneMask;
+ break;
+ }
+ }
+ } else
+ DefinedLanesMask = LaneBitmask::getAll();
+
+ bool IsFullWrite = false;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+ if (MO.isUse()) {
+ // Reading any undefined lanes?
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask UseMask = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ if ((UseMask & ~DefinedLanesMask).any())
+ goto CancelKill;
+ } else if (MO.getSubReg() == 0) {
+ // Writing to the full register?
+ assert(MO.isDef());
+ IsFullWrite = true;
+ }
+ }
+
+ // If an instruction writes to a subregister, a new segment starts in
+ // the LiveInterval. But as this only overrides part of the register,
+ // adding kill flags is not correct here after registers have been
+ // assigned.
+ if (!IsFullWrite) {
+ // Next segment has to be adjacent in the subregister write case.
+ LiveRange::const_iterator N = std::next(RI);
+ if (N != LI.end() && N->start == RI->end)
+ goto CancelKill;
+ }
+ }
+
+ MI->addRegisterKilled(Reg, nullptr);
+ continue;
+CancelKill:
+ MI->clearRegisterKills(Reg, nullptr);
+ }
+ }
+}
+
+MachineBasicBlock*
+LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const {
+ assert(!LI.empty() && "LiveInterval is empty.");
+
+ // A local live range must be fully contained inside the block, meaning it is
+ // defined and killed at instructions, not at block boundaries. It is not
+ // live in or out of any block.
+ //
+ // It is technically possible to have a PHI-defined live range identical to a
+ // single block, but we are going to return nullptr in that case.
+
+ SlotIndex Start = LI.beginIndex();
+ if (Start.isBlock())
+ return nullptr;
+
+ SlotIndex Stop = LI.endIndex();
+ if (Stop.isBlock())
+ return nullptr;
+
+ // getMBBFromIndex doesn't need to search the MBB table when both indexes
+ // belong to proper instructions.
+ MachineBasicBlock *MBB1 = Indexes->getMBBFromIndex(Start);
+ MachineBasicBlock *MBB2 = Indexes->getMBBFromIndex(Stop);
+ return MBB1 == MBB2 ? MBB1 : nullptr;
+}
+
+bool
+LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
+ for (const VNInfo *PHI : LI.valnos) {
+ if (PHI->isUnused() || !PHI->isPHIDef())
+ continue;
+ const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def);
+ // Conservatively return true instead of scanning huge predecessor lists.
+ if (PHIMBB->pred_size() > 100)
+ return true;
+ for (const MachineBasicBlock *Pred : PHIMBB->predecessors())
+ if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(Pred)))
+ return true;
+ }
+ return false;
+}
+
+float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
+ const MachineBlockFrequencyInfo *MBFI,
+ const MachineInstr &MI) {
+ return getSpillWeight(isDef, isUse, MBFI, MI.getParent());
+}
+
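+// The weight of a single def or use is the block's execution frequency
+// relative to the entry block; an instruction that both reads and writes the
+// register therefore contributes twice as much as one that only reads or only
+// writes it.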
+float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
+ const MachineBlockFrequencyInfo *MBFI,
+ const MachineBasicBlock *MBB) {
+ return (isDef + isUse) * MBFI->getBlockFreqRelativeToEntryBlock(MBB);
+}
+
+LiveRange::Segment
+LiveIntervals::addSegmentToEndOfBlock(Register Reg, MachineInstr &startInst) {
+ LiveInterval &Interval = createEmptyInterval(Reg);
+ VNInfo *VN = Interval.getNextValue(
+ SlotIndex(getInstructionIndex(startInst).getRegSlot()),
+ getVNInfoAllocator());
+ LiveRange::Segment S(SlotIndex(getInstructionIndex(startInst).getRegSlot()),
+ getMBBEndIdx(startInst.getParent()), VN);
+ Interval.addSegment(S);
+
+ return S;
+}
+
+//===----------------------------------------------------------------------===//
+// Register mask functions
+//===----------------------------------------------------------------------===//
+/// Check whether a use of Reg in MI is live-through. Live-through means that
+/// the value is live on exit from the machine instruction. An example of such
+/// a use is a deopt value in a statepoint instruction.
+static bool hasLiveThroughUse(const MachineInstr *MI, Register Reg) {
+ if (MI->getOpcode() != TargetOpcode::STATEPOINT)
+ return false;
+ StatepointOpers SO(MI);
+ if (SO.getFlags() & (uint64_t)StatepointFlags::DeoptLiveIn)
+ return false;
+ for (unsigned Idx = SO.getNumDeoptArgsIdx(), E = SO.getNumGCPtrIdx(); Idx < E;
+ ++Idx) {
+ const MachineOperand &MO = MI->getOperand(Idx);
+ if (MO.isReg() && MO.getReg() == Reg)
+ return true;
+ }
+ return false;
+}
+
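+// Returns true if LI overlaps at least one regmask slot (typically a call).
+// In that case UsableRegs is resized to the number of physical registers and
+// narrowed down to the registers preserved by every overlapping mask, so the
+// allocator can avoid assigning LI a register that any of those masks
+// clobbers.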
+bool LiveIntervals::checkRegMaskInterference(const LiveInterval &LI,
+ BitVector &UsableRegs) {
+ if (LI.empty())
+ return false;
+ LiveInterval::const_iterator LiveI = LI.begin(), LiveE = LI.end();
+
+ // Use smaller arrays for local live ranges.
+ ArrayRef<SlotIndex> Slots;
+ ArrayRef<const uint32_t*> Bits;
+ if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) {
+ Slots = getRegMaskSlotsInBlock(MBB->getNumber());
+ Bits = getRegMaskBitsInBlock(MBB->getNumber());
+ } else {
+ Slots = getRegMaskSlots();
+ Bits = getRegMaskBits();
+ }
+
+ // We are going to enumerate all the register mask slots contained in LI.
+ // Start with a binary search of RegMaskSlots to find a starting point.
+ ArrayRef<SlotIndex>::iterator SlotI = llvm::lower_bound(Slots, LiveI->start);
+ ArrayRef<SlotIndex>::iterator SlotE = Slots.end();
+
+ // No slots in range, LI begins after the last call.
+ if (SlotI == SlotE)
+ return false;
+
+ bool Found = false;
+ // Utility to union regmasks.
+ auto unionBitMask = [&](unsigned Idx) {
+ if (!Found) {
+ // This is the first overlap. Initialize UsableRegs to all ones.
+ UsableRegs.clear();
+ UsableRegs.resize(TRI->getNumRegs(), true);
+ Found = true;
+ }
+ // Remove usable registers clobbered by this mask.
+ UsableRegs.clearBitsNotInMask(Bits[Idx]);
+ };
+ while (true) {
+ assert(*SlotI >= LiveI->start);
+ // Loop over all slots overlapping this segment.
+ while (*SlotI < LiveI->end) {
+ // *SlotI overlaps LI. Collect mask bits.
+ unionBitMask(SlotI - Slots.begin());
+ if (++SlotI == SlotE)
+ return Found;
+ }
+ // If the segment ends with a live-through use, we need to collect its
+ // regmask.
+ if (*SlotI == LiveI->end)
+ if (MachineInstr *MI = getInstructionFromIndex(*SlotI))
+ if (hasLiveThroughUse(MI, LI.reg()))
+ unionBitMask(SlotI++ - Slots.begin());
+ // *SlotI is beyond the current LI segment.
+ // Special advance implementation to not miss next LiveI->end.
+ if (++LiveI == LiveE || SlotI == SlotE || *SlotI > LI.endIndex())
+ return Found;
+ while (LiveI->end < *SlotI)
+ ++LiveI;
+ // Advance SlotI until it overlaps.
+ while (*SlotI < LiveI->start)
+ if (++SlotI == SlotE)
+ return Found;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// IntervalUpdate class.
+//===----------------------------------------------------------------------===//
+
+/// Toolkit used by handleMove to trim or extend live intervals.
+class LiveIntervals::HMEditor {
+private:
+ LiveIntervals& LIS;
+ const MachineRegisterInfo& MRI;
+ const TargetRegisterInfo& TRI;
+ SlotIndex OldIdx;
+ SlotIndex NewIdx;
+ SmallPtrSet<LiveRange*, 8> Updated;
+ bool UpdateFlags;
+
+public:
+ HMEditor(LiveIntervals& LIS, const MachineRegisterInfo& MRI,
+ const TargetRegisterInfo& TRI,
+ SlotIndex OldIdx, SlotIndex NewIdx, bool UpdateFlags)
+ : LIS(LIS), MRI(MRI), TRI(TRI), OldIdx(OldIdx), NewIdx(NewIdx),
+ UpdateFlags(UpdateFlags) {}
+
+ // FIXME: UpdateFlags is a workaround that creates live intervals for all
+ // physregs, even those that aren't needed for regalloc, in order to update
+ // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill
+ // flags, and postRA passes will use a live register utility instead.
+ LiveRange *getRegUnitLI(unsigned Unit) {
+ if (UpdateFlags && !MRI.isReservedRegUnit(Unit))
+ return &LIS.getRegUnit(Unit);
+ return LIS.getCachedRegUnit(Unit);
+ }
+
+ /// Update all live ranges touched by MI, assuming a move from OldIdx to
+ /// NewIdx.
+ void updateAllRanges(MachineInstr *MI) {
+ LLVM_DEBUG(dbgs() << "handleMove " << OldIdx << " -> " << NewIdx << ": "
+ << *MI);
+ bool hasRegMask = false;
+ for (MachineOperand &MO : MI->operands()) {
+ if (MO.isRegMask())
+ hasRegMask = true;
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse()) {
+ if (!MO.readsReg())
+ continue;
+ // Aggressively clear all kill flags.
+ // They are reinserted by VirtRegRewriter.
+ MO.setIsKill(false);
+ }
+
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (Reg.isVirtual()) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask LaneMask = SubReg ? TRI.getSubRegIndexLaneMask(SubReg)
+ : MRI.getMaxLaneMaskForVReg(Reg);
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & LaneMask).none())
+ continue;
+ updateRange(S, Reg, S.LaneMask);
+ }
+ }
+ updateRange(LI, Reg, LaneBitmask::getNone());
+ // If the main range has a hole and we are moving a subrange use across
+ // the hole, updateRange() cannot properly handle it since it only
+ // gets the LiveRange and not the whole LiveInterval. As a result
+ // we may end up with a main range not covering all subranges.
+ // This is an extremely rare case, so let's check and reconstruct the
+ // main range.
+ if (LI.hasSubRanges()) {
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask LaneMask = SubReg ? TRI.getSubRegIndexLaneMask(SubReg)
+ : MRI.getMaxLaneMaskForVReg(Reg);
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & LaneMask).none() || LI.covers(S))
+ continue;
+ LI.clear();
+ LIS.constructMainRangeFromSubranges(LI);
+ break;
+ }
+ }
+
+ continue;
+ }
+
+ // For physregs, only update the regunits that actually have a
+ // precomputed live range.
+ for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg()))
+ if (LiveRange *LR = getRegUnitLI(Unit))
+ updateRange(*LR, Unit, LaneBitmask::getNone());
+ }
+ if (hasRegMask)
+ updateRegMaskSlots();
+ }
+
+private:
+ /// Update a single live range, assuming an instruction has been moved from
+ /// OldIdx to NewIdx.
+ void updateRange(LiveRange &LR, Register Reg, LaneBitmask LaneMask) {
+ if (!Updated.insert(&LR).second)
+ return;
+ LLVM_DEBUG({
+ dbgs() << " ";
+ if (Reg.isVirtual()) {
+ dbgs() << printReg(Reg);
+ if (LaneMask.any())
+ dbgs() << " L" << PrintLaneMask(LaneMask);
+ } else {
+ dbgs() << printRegUnit(Reg, &TRI);
+ }
+ dbgs() << ":\t" << LR << '\n';
+ });
+ if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
+ handleMoveDown(LR);
+ else
+ handleMoveUp(LR, Reg, LaneMask);
+ LLVM_DEBUG(dbgs() << " -->\t" << LR << '\n');
+ LR.verify();
+ }
+
+ /// Update LR to reflect an instruction has been moved downwards from OldIdx
+ /// to NewIdx (OldIdx < NewIdx).
+ void handleMoveDown(LiveRange &LR) {
+ LiveRange::iterator E = LR.end();
+ // Segment going into OldIdx.
+ LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex());
+
+ // No value live before or after OldIdx? Nothing to do.
+ if (OldIdxIn == E || SlotIndex::isEarlierInstr(OldIdx, OldIdxIn->start))
+ return;
+
+ LiveRange::iterator OldIdxOut;
+ // Do we have a value live-in to OldIdx?
+ if (SlotIndex::isEarlierInstr(OldIdxIn->start, OldIdx)) {
+ // If the live-in value already extends to NewIdx, there is nothing to do.
+ if (SlotIndex::isEarlierEqualInstr(NewIdx, OldIdxIn->end))
+ return;
+ // Aggressively remove all kill flags from the old kill point.
+ // Kill flags shouldn't be used while live intervals exist, they will be
+ // reinserted by VirtRegRewriter.
+ if (MachineInstr *KillMI = LIS.getInstructionFromIndex(OldIdxIn->end))
+ for (MachineOperand &MOP : mi_bundle_ops(*KillMI))
+ if (MOP.isReg() && MOP.isUse())
+ MOP.setIsKill(false);
+
+ // Is there a def before NewIdx which is not OldIdx?
+ LiveRange::iterator Next = std::next(OldIdxIn);
+ if (Next != E && !SlotIndex::isSameInstr(OldIdx, Next->start) &&
+ SlotIndex::isEarlierInstr(Next->start, NewIdx)) {
+ // If we are here then OldIdx was just a use but not a def. We only have
+ // to ensure liveness extends to NewIdx.
+ LiveRange::iterator NewIdxIn =
+ LR.advanceTo(Next, NewIdx.getBaseIndex());
+ // Extend the segment before NewIdx if necessary.
+ if (NewIdxIn == E ||
+ !SlotIndex::isEarlierInstr(NewIdxIn->start, NewIdx)) {
+ LiveRange::iterator Prev = std::prev(NewIdxIn);
+ Prev->end = NewIdx.getRegSlot();
+ }
+ // Extend OldIdxIn.
+ OldIdxIn->end = Next->start;
+ return;
+ }
+
+ // Adjust OldIdxIn->end to reach NewIdx. This may temporarily make LR
+ // invalid by overlapping ranges.
+ bool isKill = SlotIndex::isSameInstr(OldIdx, OldIdxIn->end);
+ OldIdxIn->end = NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber());
+ // If this was not a kill, then there was no def and we're done.
+ if (!isKill)
+ return;
+
+ // Did we have a Def at OldIdx?
+ OldIdxOut = Next;
+ if (OldIdxOut == E || !SlotIndex::isSameInstr(OldIdx, OldIdxOut->start))
+ return;
+ } else {
+ OldIdxOut = OldIdxIn;
+ }
+
+ // If we are here then there is a Definition at OldIdx. OldIdxOut points
+ // to the segment starting there.
+ assert(OldIdxOut != E && SlotIndex::isSameInstr(OldIdx, OldIdxOut->start) &&
+ "No def?");
+ VNInfo *OldIdxVNI = OldIdxOut->valno;
+ assert(OldIdxVNI->def == OldIdxOut->start && "Inconsistent def");
+
+ // If the defined value extends beyond NewIdx, just move the beginning
+ // of the segment to NewIdx.
+ SlotIndex NewIdxDef = NewIdx.getRegSlot(OldIdxOut->start.isEarlyClobber());
+ if (SlotIndex::isEarlierInstr(NewIdxDef, OldIdxOut->end)) {
+ OldIdxVNI->def = NewIdxDef;
+ OldIdxOut->start = OldIdxVNI->def;
+ return;
+ }
+
+ // If we are here then we have a Definition at OldIdx which ends before
+ // NewIdx.
+
+ // Is there an existing Def at NewIdx?
+ LiveRange::iterator AfterNewIdx
+ = LR.advanceTo(OldIdxOut, NewIdx.getRegSlot());
+ bool OldIdxDefIsDead = OldIdxOut->end.isDead();
+ if (!OldIdxDefIsDead &&
+ SlotIndex::isEarlierInstr(OldIdxOut->end, NewIdxDef)) {
+ // OldIdx is not a dead def, and NewIdxDef is inside a new interval.
+ VNInfo *DefVNI;
+ if (OldIdxOut != LR.begin() &&
+ !SlotIndex::isEarlierInstr(std::prev(OldIdxOut)->end,
+ OldIdxOut->start)) {
+ // There is no gap between OldIdxOut and its predecessor anymore;
+ // merge them.
+ LiveRange::iterator IPrev = std::prev(OldIdxOut);
+ DefVNI = OldIdxVNI;
+ IPrev->end = OldIdxOut->end;
+ } else {
+ // The value is live in to OldIdx
+ LiveRange::iterator INext = std::next(OldIdxOut);
+ assert(INext != E && "Must have following segment");
+ // We merge OldIdxOut and its successor. As we're dealing with subreg
+ // reordering, there is always a successor to OldIdxOut in the same BB.
+ // We don't need INext->valno anymore and will reuse it for the new
+ // segment we create later.
+ DefVNI = OldIdxVNI;
+ INext->start = OldIdxOut->end;
+ INext->valno->def = INext->start;
+ }
+ // If NewIdx is behind the last segment, extend that and append a new one.
+ if (AfterNewIdx == E) {
+ // OldIdxOut is undef at this point; slide (OldIdxOut;AfterNewIdx] up
+ // one position.
+ // |- ?/OldIdxOut -| |- X0 -| ... |- Xn -| end
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- undef/NewS -| end
+ std::copy(std::next(OldIdxOut), E, OldIdxOut);
+ // The last segment is undefined now, reuse it for a dead def.
+ LiveRange::iterator NewSegment = std::prev(E);
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ DefVNI);
+ DefVNI->def = NewIdxDef;
+
+ LiveRange::iterator Prev = std::prev(NewSegment);
+ Prev->end = NewIdxDef;
+ } else {
+ // OldIdxOut is undef at this point; slide (OldIdxOut;AfterNewIdx] up
+ // one position.
+ // |- ?/OldIdxOut -| |- X0 -| ... |- Xn/AfterNewIdx -| |- Next -|
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- Xn/AfterNewIdx -| |- Next -|
+ std::copy(std::next(OldIdxOut), std::next(AfterNewIdx), OldIdxOut);
+ LiveRange::iterator Prev = std::prev(AfterNewIdx);
+ // We have two cases:
+ if (SlotIndex::isEarlierInstr(Prev->start, NewIdxDef)) {
+ // Case 1: NewIdx is inside a liverange. Split this liverange at
+ // NewIdxDef into the segment "Prev" followed by "NewSegment".
+ LiveRange::iterator NewSegment = AfterNewIdx;
+ *NewSegment = LiveRange::Segment(NewIdxDef, Prev->end, Prev->valno);
+ Prev->valno->def = NewIdxDef;
+
+ *Prev = LiveRange::Segment(Prev->start, NewIdxDef, DefVNI);
+ DefVNI->def = Prev->start;
+ } else {
+ // Case 2: NewIdx is in a lifetime hole. Keep AfterNewIdx as is and
+ // turn Prev into a segment from NewIdx to AfterNewIdx->start.
+ *Prev = LiveRange::Segment(NewIdxDef, AfterNewIdx->start, DefVNI);
+ DefVNI->def = NewIdxDef;
+ assert(DefVNI != AfterNewIdx->valno);
+ }
+ }
+ return;
+ }
+
+ if (AfterNewIdx != E &&
+ SlotIndex::isSameInstr(AfterNewIdx->start, NewIdxDef)) {
+ // There is an existing def at NewIdx. The def at OldIdx is coalesced into
+ // that value.
+ assert(AfterNewIdx->valno != OldIdxVNI && "Multiple defs of value?");
+ LR.removeValNo(OldIdxVNI);
+ } else {
+ // There was no existing def at NewIdx. We need to create a dead def
+ // at NewIdx. Shift segments over the old OldIdxOut segment, this frees
+ // a new segment at the place where we want to construct the dead def.
+ // |- OldIdxOut -| |- X0 -| ... |- Xn -| |- AfterNewIdx -|
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- undef/NewS. -| |- AfterNewIdx -|
+ assert(AfterNewIdx != OldIdxOut && "Inconsistent iterators");
+ std::copy(std::next(OldIdxOut), AfterNewIdx, OldIdxOut);
+ // We can reuse OldIdxVNI now.
+ LiveRange::iterator NewSegment = std::prev(AfterNewIdx);
+ VNInfo *NewSegmentVNI = OldIdxVNI;
+ NewSegmentVNI->def = NewIdxDef;
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ NewSegmentVNI);
+ }
+ }
+
+ /// Update LR to reflect an instruction has been moved upwards from OldIdx
+ /// to NewIdx (NewIdx < OldIdx).
+ void handleMoveUp(LiveRange &LR, Register Reg, LaneBitmask LaneMask) {
+ LiveRange::iterator E = LR.end();
+ // Segment going into OldIdx.
+ LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex());
+
+ // No value live before or after OldIdx? Nothing to do.
+ if (OldIdxIn == E || SlotIndex::isEarlierInstr(OldIdx, OldIdxIn->start))
+ return;
+
+ LiveRange::iterator OldIdxOut;
+ // Do we have a value live-in to OldIdx?
+ if (SlotIndex::isEarlierInstr(OldIdxIn->start, OldIdx)) {
+ // If the live-in value isn't killed here, then we have no Def at
+ // OldIdx; moreover, the value must be live at NewIdx, so there is
+ // nothing to do.
+ bool isKill = SlotIndex::isSameInstr(OldIdx, OldIdxIn->end);
+ if (!isKill)
+ return;
+
+ // At this point we have to move OldIdxIn->end back to the nearest
+ // previous use or (dead-)def but no further than NewIdx.
+ SlotIndex DefBeforeOldIdx
+ = std::max(OldIdxIn->start.getDeadSlot(),
+ NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber()));
+ OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask);
+
+ // Did we have a Def at OldIdx? If not we are done now.
+ OldIdxOut = std::next(OldIdxIn);
+ if (OldIdxOut == E || !SlotIndex::isSameInstr(OldIdx, OldIdxOut->start))
+ return;
+ } else {
+ OldIdxOut = OldIdxIn;
+ OldIdxIn = OldIdxOut != LR.begin() ? std::prev(OldIdxOut) : E;
+ }
+
+ // If we are here then there is a Definition at OldIdx. OldIdxOut points
+ // to the segment starting there.
+ assert(OldIdxOut != E && SlotIndex::isSameInstr(OldIdx, OldIdxOut->start) &&
+ "No def?");
+ VNInfo *OldIdxVNI = OldIdxOut->valno;
+ assert(OldIdxVNI->def == OldIdxOut->start && "Inconsistent def");
+ bool OldIdxDefIsDead = OldIdxOut->end.isDead();
+
+ // Is there an existing def at NewIdx?
+ SlotIndex NewIdxDef = NewIdx.getRegSlot(OldIdxOut->start.isEarlyClobber());
+ LiveRange::iterator NewIdxOut = LR.find(NewIdx.getRegSlot());
+ if (SlotIndex::isSameInstr(NewIdxOut->start, NewIdx)) {
+ assert(NewIdxOut->valno != OldIdxVNI &&
+ "Same value defined more than once?");
+ // If OldIdx was a dead def remove it.
+ if (!OldIdxDefIsDead) {
+ // Remove segment starting at NewIdx and move begin of OldIdxOut to
+ // NewIdx so it can take its place.
+ OldIdxVNI->def = NewIdxDef;
+ OldIdxOut->start = NewIdxDef;
+ LR.removeValNo(NewIdxOut->valno);
+ } else {
+ // Simply remove the dead def at OldIdx.
+ LR.removeValNo(OldIdxVNI);
+ }
+ } else {
+ // Previously nothing was live after NewIdx, so all we have to do now is
+ // move the begin of OldIdxOut to NewIdx.
+ if (!OldIdxDefIsDead) {
+ // Do we have any intermediate Defs between OldIdx and NewIdx?
+ if (OldIdxIn != E &&
+ SlotIndex::isEarlierInstr(NewIdxDef, OldIdxIn->start)) {
+ // OldIdx is not a dead def and NewIdx is before predecessor start.
+ LiveRange::iterator NewIdxIn = NewIdxOut;
+ assert(NewIdxIn == LR.find(NewIdx.getBaseIndex()));
+ const SlotIndex SplitPos = NewIdxDef;
+ OldIdxVNI = OldIdxIn->valno;
+
+ SlotIndex NewDefEndPoint = std::next(NewIdxIn)->end;
+ LiveRange::iterator Prev = std::prev(OldIdxIn);
+ if (OldIdxIn != LR.begin() &&
+ SlotIndex::isEarlierInstr(NewIdx, Prev->end)) {
+ // If the segment before OldIdx read a value defined earlier than
+ // NewIdx, the moved instruction also reads and forwards that
+ // value. Extend the lifetime of the new def point.
+
+ // Extend to where the previous range started, unless there is
+ // another redef first.
+ NewDefEndPoint = std::min(OldIdxIn->start,
+ std::next(NewIdxOut)->start);
+ }
+
+ // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut.
+ OldIdxOut->valno->def = OldIdxIn->start;
+ *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end,
+ OldIdxOut->valno);
+ // OldIdxIn and OldIdxVNI are now undef and can be overridden.
+ // We slide [NewIdxIn, OldIdxIn) down one position.
+ // |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -|
+ // => |- undef/NewIdxIn -| |- X0 -| ... |- Xn-1 -| |- Xn/OldIdxOut -|
+ std::copy_backward(NewIdxIn, OldIdxIn, OldIdxOut);
+ // NewIdxIn is now considered undef so we can reuse it for the moved
+ // value.
+ LiveRange::iterator NewSegment = NewIdxIn;
+ LiveRange::iterator Next = std::next(NewSegment);
+ if (SlotIndex::isEarlierInstr(Next->start, NewIdx)) {
+ // There is no gap between NewSegment and its predecessor.
+ *NewSegment = LiveRange::Segment(Next->start, SplitPos,
+ Next->valno);
+
+ *Next = LiveRange::Segment(SplitPos, NewDefEndPoint, OldIdxVNI);
+ Next->valno->def = SplitPos;
+ } else {
+ // There is a gap between NewSegment and its predecessor;
+ // the value becomes live-in.
+ *NewSegment = LiveRange::Segment(SplitPos, Next->start, OldIdxVNI);
+ NewSegment->valno->def = SplitPos;
+ }
+ } else {
+ // Leave the end point of a live def.
+ OldIdxOut->start = NewIdxDef;
+ OldIdxVNI->def = NewIdxDef;
+ if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdx, OldIdxIn->end))
+ OldIdxIn->end = NewIdxDef;
+ }
+ } else if (OldIdxIn != E
+ && SlotIndex::isEarlierInstr(NewIdxOut->start, NewIdx)
+ && SlotIndex::isEarlierInstr(NewIdx, NewIdxOut->end)) {
+ // OldIdxVNI is a dead def that has been moved into the middle of
+ // another value in LR. That can happen when LR is a whole register,
+ // but the dead def is a write to a subreg that is dead at NewIdx.
+ // The dead def may have been moved across other values
+ // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut)
+ // down one position.
+ // |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next - |
+ // => |- X0/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -|
+ std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut));
+ // Modify the segment at NewIdxOut and the following segment to meet at
+ // the point of the dead def, with the following segment getting
+ // OldIdxVNI as its value number.
+ *NewIdxOut = LiveRange::Segment(
+ NewIdxOut->start, NewIdxDef.getRegSlot(), NewIdxOut->valno);
+ *(NewIdxOut + 1) = LiveRange::Segment(
+ NewIdxDef.getRegSlot(), (NewIdxOut + 1)->end, OldIdxVNI);
+ OldIdxVNI->def = NewIdxDef;
+ // Modify subsequent segments to be defined by the moved def OldIdxVNI.
+ for (auto *Idx = NewIdxOut + 2; Idx <= OldIdxOut; ++Idx)
+ Idx->valno = OldIdxVNI;
+ // Aggressively remove all dead flags from the former dead definition.
+ // Kill/dead flags shouldn't be used while live intervals exist; they
+ // will be reinserted by VirtRegRewriter.
+ if (MachineInstr *KillMI = LIS.getInstructionFromIndex(NewIdx))
+ for (MIBundleOperands MO(*KillMI); MO.isValid(); ++MO)
+ if (MO->isReg() && !MO->isUse())
+ MO->setIsDead(false);
+ } else {
+ // OldIdxVNI is a dead def. It may have been moved across other values
+ // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut)
+ // down one position.
+ // |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next - |
+ // => |- undef/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -|
+ std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut));
+ // OldIdxVNI can be reused now to build a new dead def segment.
+ LiveRange::iterator NewSegment = NewIdxOut;
+ VNInfo *NewSegmentVNI = OldIdxVNI;
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ NewSegmentVNI);
+ NewSegmentVNI->def = NewIdxDef;
+ }
+ }
+ }
+
+ void updateRegMaskSlots() {
+ SmallVectorImpl<SlotIndex>::iterator RI =
+ llvm::lower_bound(LIS.RegMaskSlots, OldIdx);
+ assert(RI != LIS.RegMaskSlots.end() && *RI == OldIdx.getRegSlot() &&
+ "No RegMask at OldIdx.");
+ *RI = NewIdx.getRegSlot();
+ assert((RI == LIS.RegMaskSlots.begin() ||
+ SlotIndex::isEarlierInstr(*std::prev(RI), *RI)) &&
+ "Cannot move regmask instruction above another call");
+ assert((std::next(RI) == LIS.RegMaskSlots.end() ||
+ SlotIndex::isEarlierInstr(*RI, *std::next(RI))) &&
+ "Cannot move regmask instruction below another call");
+ }
+
+ // Return the last use of reg between NewIdx and OldIdx.
+ SlotIndex findLastUseBefore(SlotIndex Before, Register Reg,
+ LaneBitmask LaneMask) {
+ if (Reg.isVirtual()) {
+ SlotIndex LastUse = Before;
+ for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ if (MO.isUndef())
+ continue;
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0 && LaneMask.any()
+ && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask).none())
+ continue;
+
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
+ if (InstSlot > LastUse && InstSlot < OldIdx)
+ LastUse = InstSlot.getRegSlot();
+ }
+ return LastUse;
+ }
+
+ // This is a regunit interval, so scanning the use list could be very
+ // expensive. Scan upwards from OldIdx instead.
+ assert(Before < OldIdx && "Expected upwards move");
+ SlotIndexes *Indexes = LIS.getSlotIndexes();
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Before);
+
+ // OldIdx may not correspond to an instruction any longer, so set MII to
+ // point to the next instruction after OldIdx, or MBB->end().
+ MachineBasicBlock::iterator MII = MBB->end();
+ if (MachineInstr *MI = Indexes->getInstructionFromIndex(
+ Indexes->getNextNonNullIndex(OldIdx)))
+ if (MI->getParent() == MBB)
+ MII = MI;
+
+ MachineBasicBlock::iterator Begin = MBB->begin();
+ while (MII != Begin) {
+ if ((--MII)->isDebugOrPseudoInstr())
+ continue;
+ SlotIndex Idx = Indexes->getInstructionIndex(*MII);
+
+ // Stop searching when Before is reached.
+ if (!SlotIndex::isEarlierInstr(Before, Idx))
+ return Before;
+
+ // Check if MII uses Reg.
+ for (MIBundleOperands MO(*MII); MO.isValid(); ++MO)
+ if (MO->isReg() && !MO->isUndef() && MO->getReg().isPhysical() &&
+ TRI.hasRegUnit(MO->getReg(), Reg))
+ return Idx.getRegSlot();
+ }
+ // Didn't reach Before. It must be the first instruction in the block.
+ return Before;
+ }
+};
+
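+// handleMove() is called after an instruction has already been moved to its
+// new position within the same basic block (for example by the machine
+// scheduler): it reassigns the instruction's slot index and lets HMEditor
+// trim or extend every live range the instruction touches.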
+void LiveIntervals::handleMove(MachineInstr &MI, bool UpdateFlags) {
+ // It is fine to move a bundle as a whole, but not an individual instruction
+ // inside it.
+ assert((!MI.isBundled() || MI.getOpcode() == TargetOpcode::BUNDLE) &&
+ "Cannot move instruction in bundle");
+ SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
+ Indexes->removeMachineInstrFromMaps(MI);
+ SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(MI);
+ assert(getMBBStartIdx(MI.getParent()) <= OldIndex &&
+ OldIndex < getMBBEndIdx(MI.getParent()) &&
+ "Cannot handle moves across basic block boundaries.");
+
+ HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+ HME.updateAllRanges(&MI);
+}
+
+void LiveIntervals::handleMoveIntoNewBundle(MachineInstr &BundleStart,
+ bool UpdateFlags) {
+ assert((BundleStart.getOpcode() == TargetOpcode::BUNDLE) &&
+ "Bundle start is not a bundle");
+ SmallVector<SlotIndex, 16> ToProcess;
+ const SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(BundleStart);
+ auto BundleEnd = getBundleEnd(BundleStart.getIterator());
+
+ // Collect the old slot indexes of the instructions folded into the bundle
+ // and remove them from the maps; instructions that were never indexed are
+ // skipped.
+ for (auto I = std::next(BundleStart.getIterator()); I != BundleEnd; ++I) {
+ if (!Indexes->hasIndex(*I))
+ continue;
+ SlotIndex OldIndex = Indexes->getInstructionIndex(*I, true);
+ ToProcess.push_back(OldIndex);
+ Indexes->removeMachineInstrFromMaps(*I, true);
+ }
+ for (SlotIndex OldIndex : ToProcess) {
+ HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+ HME.updateAllRanges(&BundleStart);
+ }
+
+ // Fix up dead defs
+ const SlotIndex Index = getInstructionIndex(BundleStart);
+ for (unsigned Idx = 0, E = BundleStart.getNumOperands(); Idx != E; ++Idx) {
+ MachineOperand &MO = BundleStart.getOperand(Idx);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual() && hasInterval(Reg) && !MO.isUndef()) {
+ LiveInterval &LI = getInterval(Reg);
+ LiveQueryResult LRQ = LI.Query(Index);
+ if (LRQ.isDeadDef())
+ MO.setIsDead();
+ }
+ }
+}
+
+void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
+ const MachineBasicBlock::iterator End,
+ const SlotIndex EndIdx, LiveRange &LR,
+ const Register Reg,
+ LaneBitmask LaneMask) {
+ LiveInterval::iterator LII = LR.find(EndIdx);
+ SlotIndex lastUseIdx;
+ if (LII != LR.end() && LII->start < EndIdx) {
+ lastUseIdx = LII->end;
+ } else if (LII == LR.begin()) {
+ // We may not have a live range at all if this is a subregister untouched
+ // between \p Begin and \p End.
+ } else {
+ --LII;
+ }
+
+ for (MachineBasicBlock::iterator I = End; I != Begin;) {
+ --I;
+ MachineInstr &MI = *I;
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+
+ SlotIndex instrIdx = getInstructionIndex(MI);
+ bool isStartValid = getInstructionFromIndex(LII->start);
+ bool isEndValid = getInstructionFromIndex(LII->end);
+
+ // FIXME: This doesn't currently handle early-clobber or multiple removed
+ // defs inside of the region to repair.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg);
+ if ((Mask & LaneMask).none())
+ continue;
+
+ if (MO.isDef()) {
+ if (!isStartValid) {
+ if (LII->end.isDead()) {
+ LII = LR.removeSegment(LII, true);
+ if (LII != LR.begin())
+ --LII;
+ } else {
+ LII->start = instrIdx.getRegSlot();
+ LII->valno->def = instrIdx.getRegSlot();
+ if (MO.getSubReg() && !MO.isUndef())
+ lastUseIdx = instrIdx.getRegSlot();
+ else
+ lastUseIdx = SlotIndex();
+ continue;
+ }
+ }
+
+ if (!lastUseIdx.isValid()) {
+ VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator);
+ LiveRange::Segment S(instrIdx.getRegSlot(),
+ instrIdx.getDeadSlot(), VNI);
+ LII = LR.addSegment(S);
+ } else if (LII->start != instrIdx.getRegSlot()) {
+ VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator);
+ LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
+ LII = LR.addSegment(S);
+ }
+
+ if (MO.getSubReg() && !MO.isUndef())
+ lastUseIdx = instrIdx.getRegSlot();
+ else
+ lastUseIdx = SlotIndex();
+ } else if (MO.isUse()) {
+ // FIXME: This should probably be handled outside of this branch,
+ // either as part of the def case (for defs inside of the region) or
+ // after the loop over the region.
+ if (!isEndValid && !LII->end.isBlock())
+ LII->end = instrIdx.getRegSlot();
+ if (!lastUseIdx.isValid())
+ lastUseIdx = instrIdx.getRegSlot();
+ }
+ }
+ }
+
+ bool isStartValid = getInstructionFromIndex(LII->start);
+ if (!isStartValid && LII->end.isDead())
+ LR.removeSegment(*LII, true);
+}
+
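+// Intended for passes that insert or remove instructions between Begin and
+// End without keeping the analyses up to date: the range is first widened to
+// instructions that already have slot indexes, the indexes in between are
+// repaired, and then the live intervals of OrigRegs are recomputed or
+// repaired in place.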
+void
+LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ ArrayRef<Register> OrigRegs) {
+ // Find anchor points, which are at the beginning/end of blocks or at
+ // instructions that already have indexes.
+ while (Begin != MBB->begin() && !Indexes->hasIndex(*std::prev(Begin)))
+ --Begin;
+ while (End != MBB->end() && !Indexes->hasIndex(*End))
+ ++End;
+
+ SlotIndex EndIdx;
+ if (End == MBB->end())
+ EndIdx = getMBBEndIdx(MBB).getPrevSlot();
+ else
+ EndIdx = getInstructionIndex(*End);
+
+ Indexes->repairIndexesInRange(MBB, Begin, End);
+
+ // Make sure a live interval exists for all register operands in the range.
+ SmallVector<Register> RegsToRepair(OrigRegs.begin(), OrigRegs.end());
+ for (MachineBasicBlock::iterator I = End; I != Begin;) {
+ --I;
+ MachineInstr &MI = *I;
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ Register Reg = MO.getReg();
+ // If the new instructions refer to subregs but the old instructions did
+ // not, throw away any old live interval so it will be recomputed with
+ // subranges.
+ if (MO.getSubReg() && hasInterval(Reg) &&
+ !getInterval(Reg).hasSubRanges() &&
+ MRI->shouldTrackSubRegLiveness(Reg))
+ removeInterval(Reg);
+ if (!hasInterval(Reg)) {
+ createAndComputeVirtRegInterval(Reg);
+ // Don't bother to repair a freshly calculated live interval.
+ erase_value(RegsToRepair, Reg);
+ }
+ }
+ }
+ }
+
+ for (Register Reg : RegsToRepair) {
+ if (!Reg.isVirtual())
+ continue;
+
+ LiveInterval &LI = getInterval(Reg);
+ // FIXME: Should we support undefs that gain defs?
+ if (!LI.hasAtLeastOneValue())
+ continue;
+
+ for (LiveInterval::SubRange &S : LI.subranges())
+ repairOldRegInRange(Begin, End, EndIdx, S, Reg, S.LaneMask);
+ LI.removeEmptySubRanges();
+
+ repairOldRegInRange(Begin, End, EndIdx, LI, Reg);
+ }
+}
+
+void LiveIntervals::removePhysRegDefAt(MCRegister Reg, SlotIndex Pos) {
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ if (LiveRange *LR = getCachedRegUnit(Unit))
+ if (VNInfo *VNI = LR->getVNInfoAt(Pos))
+ LR->removeValNo(VNI);
+ }
+}
+
+void LiveIntervals::removeVRegDefAt(LiveInterval &LI, SlotIndex Pos) {
+ // LI may not have the main range computed yet, but its subranges may
+ // be present.
+ VNInfo *VNI = LI.getVNInfoAt(Pos);
+ if (VNI != nullptr) {
+ assert(VNI->def.getBaseIndex() == Pos.getBaseIndex());
+ LI.removeValNo(VNI);
+ }
+
+ // Also remove the value defined in subranges.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if (VNInfo *SVNI = S.getVNInfoAt(Pos))
+ if (SVNI->def.getBaseIndex() == Pos.getBaseIndex())
+ S.removeValNo(SVNI);
+ }
+ LI.removeEmptySubRanges();
+}
+
+void LiveIntervals::splitSeparateComponents(LiveInterval &LI,
+ SmallVectorImpl<LiveInterval*> &SplitLIs) {
+ ConnectedVNInfoEqClasses ConEQ(*this);
+ unsigned NumComp = ConEQ.Classify(LI);
+ if (NumComp <= 1)
+ return;
+ LLVM_DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n');
+ Register Reg = LI.reg();
+ for (unsigned I = 1; I < NumComp; ++I) {
+ Register NewVReg = MRI->cloneVirtualRegister(Reg);
+ LiveInterval &NewLI = createEmptyInterval(NewVReg);
+ SplitLIs.push_back(&NewLI);
+ }
+ ConEQ.Distribute(LI, SplitLIs.data(), *MRI);
+}
+
+void LiveIntervals::constructMainRangeFromSubranges(LiveInterval &LI) {
+ assert(LICalc && "LICalc not initialized.");
+ LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ LICalc->constructMainRangeFromSubranges(LI);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LivePhysRegs.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LivePhysRegs.cpp
new file mode 100644
index 000000000000..96380d408482
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LivePhysRegs.cpp
@@ -0,0 +1,340 @@
+//===--- LivePhysRegs.cpp - Live Physical Register Set --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LivePhysRegs utility for tracking liveness of
+// physical registers across machine instructions in forward or backward order.
+// A more detailed description can be found in the corresponding header file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+
+/// Remove all registers from the set that get clobbered by the register
+/// mask.
+/// The clobbers set will be the list of live registers clobbered
+/// by the regmask.
+void LivePhysRegs::removeRegsInMask(const MachineOperand &MO,
+ SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> *Clobbers) {
+ RegisterSet::iterator LRI = LiveRegs.begin();
+ while (LRI != LiveRegs.end()) {
+ if (MO.clobbersPhysReg(*LRI)) {
+ if (Clobbers)
+ Clobbers->push_back(std::make_pair(*LRI, &MO));
+ LRI = LiveRegs.erase(LRI);
+ } else
+ ++LRI;
+ }
+}
+
+/// Remove defined registers and regmask kills from the set.
+void LivePhysRegs::removeDefs(const MachineInstr &MI) {
+ for (const MachineOperand &MOP : phys_regs_and_masks(MI)) {
+ if (MOP.isRegMask()) {
+ removeRegsInMask(MOP);
+ continue;
+ }
+
+ if (MOP.isDef())
+ removeReg(MOP.getReg());
+ }
+}
+
+/// Add uses to the set.
+void LivePhysRegs::addUses(const MachineInstr &MI) {
+ for (const MachineOperand &MOP : phys_regs_and_masks(MI)) {
+ if (!MOP.isReg() || !MOP.readsReg())
+ continue;
+ addReg(MOP.getReg());
+ }
+}
+
+/// Simulates liveness when stepping backwards over an instruction (bundle):
+/// remove defs, add uses. This is the recommended way of calculating liveness.
+void LivePhysRegs::stepBackward(const MachineInstr &MI) {
+ // Remove defined registers and regmask kills from the set.
+ removeDefs(MI);
+
+ // Add uses to the set.
+ addUses(MI);
+}
+
+/// Simulates liveness when stepping forward over an instruction (bundle):
+/// remove killed uses, add defs. This is not the recommended way, because it
+/// depends on accurate kill flags. If possible, use stepBackward() instead of
+/// this function.
+void LivePhysRegs::stepForward(const MachineInstr &MI,
+ SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> &Clobbers) {
+ // Remove killed registers from the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ if (O->isDebug())
+ continue;
+ Register Reg = O->getReg();
+ if (!Reg.isPhysical())
+ continue;
+ if (O->isDef()) {
+ // Note, dead defs are still recorded. The caller should decide how to
+ // handle them.
+ Clobbers.push_back(std::make_pair(Reg, &*O));
+ } else {
+ assert(O->isUse());
+ if (O->isKill())
+ removeReg(Reg);
+ }
+ } else if (O->isRegMask()) {
+ removeRegsInMask(*O, &Clobbers);
+ }
+ }
+
+ // Add defs to the set.
+ for (auto Reg : Clobbers) {
+ // Skip dead defs and registers clobbered by regmasks. They shouldn't
+ // be added to the set.
+ if (Reg.second->isReg() && Reg.second->isDead())
+ continue;
+ if (Reg.second->isRegMask() &&
+ MachineOperand::clobbersPhysReg(Reg.second->getRegMask(), Reg.first))
+ continue;
+ addReg(Reg.first);
+ }
+}
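+
+// Usage sketch (illustrative; TRI and MBB assumed in scope): the backward walk
+// is the preferred pattern because it does not depend on kill flags:
+//
+//   LivePhysRegs LiveRegs(*TRI);
+//   LiveRegs.addLiveOuts(MBB);                  // seed with MBB's live-outs
+//   for (const MachineInstr &MI : llvm::reverse(MBB))
+//     LiveRegs.stepBackward(MI);                // LiveRegs now holds MBB's live-ins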
+
+/// Print the currently live registers to OS.
+void LivePhysRegs::print(raw_ostream &OS) const {
+ OS << "Live Registers:";
+ if (!TRI) {
+ OS << " (uninitialized)\n";
+ return;
+ }
+
+ if (empty()) {
+ OS << " (empty)\n";
+ return;
+ }
+
+ for (MCPhysReg R : *this)
+ OS << " " << printReg(R, TRI);
+ OS << "\n";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
+ dbgs() << " " << *this;
+}
+#endif
+
+bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
+ MCPhysReg Reg) const {
+ if (LiveRegs.count(Reg))
+ return false;
+ if (MRI.isReserved(Reg))
+ return false;
+ for (MCRegAliasIterator R(Reg, TRI, false); R.isValid(); ++R) {
+ if (LiveRegs.count(*R))
+ return false;
+ }
+ return true;
+}
+
+/// Add live-in registers of basic block \p MBB to \p LiveRegs.
+void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) {
+ for (const auto &LI : MBB.liveins()) {
+ MCPhysReg Reg = LI.PhysReg;
+ LaneBitmask Mask = LI.LaneMask;
+ MCSubRegIndexIterator S(Reg, TRI);
+ assert(Mask.any() && "Invalid livein mask");
+ if (Mask.all() || !S.isValid()) {
+ addReg(Reg);
+ continue;
+ }
+ for (; S.isValid(); ++S) {
+ unsigned SI = S.getSubRegIndex();
+ if ((Mask & TRI->getSubRegIndexLaneMask(SI)).any())
+ addReg(S.getSubReg());
+ }
+ }
+}
+
+/// Adds all callee saved registers to \p LiveRegs.
+static void addCalleeSavedRegs(LivePhysRegs &LiveRegs,
+ const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR)
+ LiveRegs.addReg(*CSR);
+}
+
+void LivePhysRegs::addPristines(const MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid())
+ return;
+  /// This function will usually be called on an empty object; handle this
+  /// as a special case.
+ if (empty()) {
+ /// Add all callee saved regs, then remove the ones that are saved and
+ /// restored.
+ addCalleeSavedRegs(*this, MF);
+ /// Remove the ones that are not saved/restored; they are pristine.
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ removeReg(Info.getReg());
+ return;
+ }
+ /// If a callee-saved register that is not pristine is already present
+ /// in the set, we should make sure that it stays in it. Precompute the
+ /// set of pristine registers in a separate object.
+ /// Add all callee saved regs, then remove the ones that are saved+restored.
+ LivePhysRegs Pristine(*TRI);
+ addCalleeSavedRegs(Pristine, MF);
+ /// Remove the ones that are not saved/restored; they are pristine.
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ Pristine.removeReg(Info.getReg());
+ for (MCPhysReg R : Pristine)
+ addReg(R);
+}
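+
+// Worked example (hypothetical target state): if the callee-saved set is
+// {R4, R5, R6} but the function only saves and restores R4, then R5 and R6 are
+// pristine; addPristines() adds them to the set, while R4 is handled by the
+// explicit save/restore code.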
+
+void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ addBlockLiveIns(*Succ);
+ if (MBB.isReturnBlock()) {
+ // Return blocks are a special case because we currently don't mark up
+ // return instructions completely: specifically, there is no explicit
+ // use for callee-saved registers. So we add all callee saved registers
+ // that are saved and restored (somewhere). This does not include
+ // callee saved registers that are unused and hence not saved and
+ // restored; they are called pristine.
+ // FIXME: PEI should add explicit markings to return instructions
+ // instead of implicitly handling them here.
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid()) {
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ if (Info.isRestored())
+ addReg(Info.getReg());
+ }
+ }
+}
+
+void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ addPristines(MF);
+ addLiveOutsNoPristines(MBB);
+}
+
+void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ addPristines(MF);
+ addBlockLiveIns(MBB);
+}
+
+void LivePhysRegs::addLiveInsNoPristines(const MachineBasicBlock &MBB) {
+ addBlockLiveIns(MBB);
+}
+
+void llvm::computeLiveIns(LivePhysRegs &LiveRegs,
+ const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveOutsNoPristines(MBB);
+ for (const MachineInstr &MI : llvm::reverse(MBB))
+ LiveRegs.stepBackward(MI);
+}
+
+void llvm::addLiveIns(MachineBasicBlock &MBB, const LivePhysRegs &LiveRegs) {
+ assert(MBB.livein_empty() && "Expected empty live-in list");
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ for (MCPhysReg Reg : LiveRegs) {
+ if (MRI.isReserved(Reg))
+ continue;
+ // Skip the register if we are about to add one of its super registers.
+ if (any_of(TRI.superregs(Reg), [&](MCPhysReg SReg) {
+ return LiveRegs.contains(SReg) && !MRI.isReserved(SReg);
+ }))
+ continue;
+ MBB.addLiveIn(Reg);
+ }
+}
+
+void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // We walk through the block backwards and start with the live outs.
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveOutsNoPristines(MBB);
+
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ // Recompute dead flags.
+ for (MIBundleOperands MO(MI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || !MO->isDef() || MO->isDebug())
+ continue;
+
+ Register Reg = MO->getReg();
+ if (Reg == 0)
+ continue;
+ assert(Reg.isPhysical());
+
+ bool IsNotLive = LiveRegs.available(MRI, Reg);
+
+ // Special-case return instructions for cases when a return is not
+ // the last instruction in the block.
+ if (MI.isReturn() && MFI.isCalleeSavedInfoValid()) {
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) {
+ if (Info.getReg() == Reg) {
+ IsNotLive = !Info.isRestored();
+ break;
+ }
+ }
+ }
+
+ MO->setIsDead(IsNotLive);
+ }
+
+ // Step backward over defs.
+ LiveRegs.removeDefs(MI);
+
+ // Recompute kill flags.
+ for (MIBundleOperands MO(MI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || !MO->readsReg() || MO->isDebug())
+ continue;
+
+ Register Reg = MO->getReg();
+ if (Reg == 0)
+ continue;
+ assert(Reg.isPhysical());
+
+ bool IsNotLive = LiveRegs.available(MRI, Reg);
+ MO->setIsKill(IsNotLive);
+ }
+
+    // Complete the backward step.
+ LiveRegs.addUses(MI);
+ }
+}
+
+void llvm::computeAndAddLiveIns(LivePhysRegs &LiveRegs,
+ MachineBasicBlock &MBB) {
+ computeLiveIns(LiveRegs, MBB);
+ addLiveIns(MBB, LiveRegs);
+}
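+
+// Usage sketch (illustrative; NewMBB assumed to be a freshly created block
+// with an empty live-in list, e.g. after splitting an edge):
+//
+//   LivePhysRegs LiveRegs;
+//   computeAndAddLiveIns(LiveRegs, *NewMBB);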
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeCalc.cpp
new file mode 100644
index 000000000000..26f6e1ede1ad
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeCalc.cpp
@@ -0,0 +1,451 @@
+//===- LiveRangeCalc.cpp - Calculate live ranges -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the LiveRangeCalc class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRangeCalc.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+// Reserve an address that indicates a value that is known to be "undef".
+static VNInfo UndefVNI(0xbad, SlotIndex());
+
+void LiveRangeCalc::resetLiveOutMap() {
+ unsigned NumBlocks = MF->getNumBlockIDs();
+ Seen.clear();
+ Seen.resize(NumBlocks);
+ EntryInfos.clear();
+ Map.resize(NumBlocks);
+}
+
+void LiveRangeCalc::reset(const MachineFunction *mf,
+ SlotIndexes *SI,
+ MachineDominatorTree *MDT,
+ VNInfo::Allocator *VNIA) {
+ MF = mf;
+ MRI = &MF->getRegInfo();
+ Indexes = SI;
+ DomTree = MDT;
+ Alloc = VNIA;
+ resetLiveOutMap();
+ LiveIn.clear();
+}
+
+void LiveRangeCalc::updateFromLiveIns() {
+ LiveRangeUpdater Updater;
+ for (const LiveInBlock &I : LiveIn) {
+ if (!I.DomNode)
+ continue;
+ MachineBasicBlock *MBB = I.DomNode->getBlock();
+ assert(I.Value && "No live-in value found");
+ SlotIndex Start, End;
+ std::tie(Start, End) = Indexes->getMBBRange(MBB);
+
+ if (I.Kill.isValid())
+ // Value is killed inside this block.
+ End = I.Kill;
+ else {
+ // The value is live-through, update LiveOut as well.
+ // Defer the Domtree lookup until it is needed.
+ assert(Seen.test(MBB->getNumber()));
+ Map[MBB] = LiveOutPair(I.Value, nullptr);
+ }
+ Updater.setDest(&I.LR);
+ Updater.add(Start, End, I.Value);
+ }
+ LiveIn.clear();
+}
+
+void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg,
+ ArrayRef<SlotIndex> Undefs) {
+ assert(Use.isValid() && "Invalid SlotIndex");
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+
+ MachineBasicBlock *UseMBB = Indexes->getMBBFromIndex(Use.getPrevSlot());
+ assert(UseMBB && "No MBB at Use");
+
+ // Is there a def in the same MBB we can extend?
+ auto EP = LR.extendInBlock(Undefs, Indexes->getMBBStartIdx(UseMBB), Use);
+ if (EP.first != nullptr || EP.second)
+ return;
+
+ // Find the single reaching def, or determine if Use is jointly dominated by
+ // multiple values, and we may need to create even more phi-defs to preserve
+ // VNInfo SSA form. Perform a search for all predecessor blocks where we
+ // know the dominating VNInfo.
+ if (findReachingDefs(LR, *UseMBB, Use, PhysReg, Undefs))
+ return;
+
+ // When there were multiple different values, we may need new PHIs.
+ calculateValues();
+}
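+
+// Usage sketch (illustrative; LRCalc, MF, Indexes, DomTree and VNIAlloc
+// assumed in scope), mirroring how LiveIntervals::extendToIndices drives this
+// API:
+//
+//   LRCalc.reset(MF, Indexes, DomTree, &VNIAlloc);
+//   LRCalc.extend(LR, UseIdx, /*PhysReg=*/0, /*Undefs=*/{});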
+
+// This function is called by a client after using the low-level API to add
+// live-out and live-in blocks. The unique value optimization is not
+// available; SplitEditor::transferValues handles that case directly anyway.
+void LiveRangeCalc::calculateValues() {
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+ updateSSA();
+ updateFromLiveIns();
+}
+
+bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs,
+ MachineBasicBlock &MBB, BitVector &DefOnEntry,
+ BitVector &UndefOnEntry) {
+ unsigned BN = MBB.getNumber();
+ if (DefOnEntry[BN])
+ return true;
+ if (UndefOnEntry[BN])
+ return false;
+
+ auto MarkDefined = [BN, &DefOnEntry](MachineBasicBlock &B) -> bool {
+ for (MachineBasicBlock *S : B.successors())
+ DefOnEntry[S->getNumber()] = true;
+ DefOnEntry[BN] = true;
+ return true;
+ };
+
+ SetVector<unsigned> WorkList;
+ // Checking if the entry of MBB is reached by some def: add all predecessors
+ // that are potentially defined-on-exit to the work list.
+ for (MachineBasicBlock *P : MBB.predecessors())
+ WorkList.insert(P->getNumber());
+
+ for (unsigned i = 0; i != WorkList.size(); ++i) {
+ // Determine if the exit from the block is reached by some def.
+ unsigned N = WorkList[i];
+ MachineBasicBlock &B = *MF->getBlockNumbered(N);
+ if (Seen[N]) {
+ const LiveOutPair &LOB = Map[&B];
+ if (LOB.first != nullptr && LOB.first != &UndefVNI)
+ return MarkDefined(B);
+ }
+ SlotIndex Begin, End;
+ std::tie(Begin, End) = Indexes->getMBBRange(&B);
+ // Treat End as not belonging to B.
+ // If LR has a segment S that starts at the next block, i.e. [End, ...),
+ // std::upper_bound will return the segment following S. Instead,
+ // S should be treated as the first segment that does not overlap B.
+ LiveRange::iterator UB = upper_bound(LR, End.getPrevSlot());
+ if (UB != LR.begin()) {
+ LiveRange::Segment &Seg = *std::prev(UB);
+ if (Seg.end > Begin) {
+ // There is a segment that overlaps B. If the range is not explicitly
+ // undefined between the end of the segment and the end of the block,
+ // treat the block as defined on exit. If it is, go to the next block
+ // on the work list.
+ if (LR.isUndefIn(Undefs, Seg.end, End))
+ continue;
+ return MarkDefined(B);
+ }
+ }
+
+ // No segment overlaps with this block. If this block is not defined on
+ // entry, or it undefines the range, do not process its predecessors.
+ if (UndefOnEntry[N] || LR.isUndefIn(Undefs, Begin, End)) {
+ UndefOnEntry[N] = true;
+ continue;
+ }
+ if (DefOnEntry[N])
+ return MarkDefined(B);
+
+ // Still don't know: add all predecessors to the work list.
+ for (MachineBasicBlock *P : B.predecessors())
+ WorkList.insert(P->getNumber());
+ }
+
+ UndefOnEntry[BN] = true;
+ return false;
+}
+
+bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
+ SlotIndex Use, unsigned PhysReg,
+ ArrayRef<SlotIndex> Undefs) {
+ unsigned UseMBBNum = UseMBB.getNumber();
+
+ // Block numbers where LR should be live-in.
+ SmallVector<unsigned, 16> WorkList(1, UseMBBNum);
+
+ // Remember if we have seen more than one value.
+ bool UniqueVNI = true;
+ VNInfo *TheVNI = nullptr;
+
+ bool FoundUndef = false;
+
+ // Using Seen as a visited set, perform a BFS for all reaching defs.
+ for (unsigned i = 0; i != WorkList.size(); ++i) {
+ MachineBasicBlock *MBB = MF->getBlockNumbered(WorkList[i]);
+
+#ifndef NDEBUG
+ if (MBB->pred_empty()) {
+ MBB->getParent()->verify();
+ errs() << "Use of " << printReg(PhysReg, MRI->getTargetRegisterInfo())
+ << " does not have a corresponding definition on every path:\n";
+ const MachineInstr *MI = Indexes->getInstructionFromIndex(Use);
+ if (MI != nullptr)
+ errs() << Use << " " << *MI;
+ report_fatal_error("Use not jointly dominated by defs.");
+ }
+
+ if (Register::isPhysicalRegister(PhysReg) && !MBB->isLiveIn(PhysReg)) {
+ MBB->getParent()->verify();
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ errs() << "The register " << printReg(PhysReg, TRI)
+ << " needs to be live in to " << printMBBReference(*MBB)
+ << ", but is missing from the live-in list.\n";
+ report_fatal_error("Invalid global physical register");
+ }
+#endif
+ FoundUndef |= MBB->pred_empty();
+
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ // Is this a known live-out block?
+ if (Seen.test(Pred->getNumber())) {
+ if (VNInfo *VNI = Map[Pred].first) {
+ if (TheVNI && TheVNI != VNI)
+ UniqueVNI = false;
+ TheVNI = VNI;
+ }
+ continue;
+ }
+
+ SlotIndex Start, End;
+ std::tie(Start, End) = Indexes->getMBBRange(Pred);
+
+ // First time we see Pred. Try to determine the live-out value, but set
+ // it as null if Pred is live-through with an unknown value.
+ auto EP = LR.extendInBlock(Undefs, Start, End);
+ VNInfo *VNI = EP.first;
+ FoundUndef |= EP.second;
+ setLiveOutValue(Pred, EP.second ? &UndefVNI : VNI);
+ if (VNI) {
+ if (TheVNI && TheVNI != VNI)
+ UniqueVNI = false;
+ TheVNI = VNI;
+ }
+ if (VNI || EP.second)
+ continue;
+
+ // No, we need a live-in value for Pred as well
+ if (Pred != &UseMBB)
+ WorkList.push_back(Pred->getNumber());
+ else
+ // Loopback to UseMBB, so value is really live through.
+ Use = SlotIndex();
+ }
+ }
+
+ LiveIn.clear();
+ FoundUndef |= (TheVNI == nullptr || TheVNI == &UndefVNI);
+ if (!Undefs.empty() && FoundUndef)
+ UniqueVNI = false;
+
+ // Both updateSSA() and LiveRangeUpdater benefit from ordered blocks, but
+  // neither requires it. Skip the sorting overhead for small updates.
+ if (WorkList.size() > 4)
+ array_pod_sort(WorkList.begin(), WorkList.end());
+
+ // If a unique reaching def was found, blit in the live ranges immediately.
+ if (UniqueVNI) {
+ assert(TheVNI != nullptr && TheVNI != &UndefVNI);
+ LiveRangeUpdater Updater(&LR);
+ for (unsigned BN : WorkList) {
+ SlotIndex Start, End;
+ std::tie(Start, End) = Indexes->getMBBRange(BN);
+ // Trim the live range in UseMBB.
+ if (BN == UseMBBNum && Use.isValid())
+ End = Use;
+ else
+ Map[MF->getBlockNumbered(BN)] = LiveOutPair(TheVNI, nullptr);
+ Updater.add(Start, End, TheVNI);
+ }
+ return true;
+ }
+
+ // Prepare the defined/undefined bit vectors.
+ EntryInfoMap::iterator Entry;
+ bool DidInsert;
+ std::tie(Entry, DidInsert) = EntryInfos.insert(
+ std::make_pair(&LR, std::make_pair(BitVector(), BitVector())));
+ if (DidInsert) {
+ // Initialize newly inserted entries.
+ unsigned N = MF->getNumBlockIDs();
+ Entry->second.first.resize(N);
+ Entry->second.second.resize(N);
+ }
+ BitVector &DefOnEntry = Entry->second.first;
+ BitVector &UndefOnEntry = Entry->second.second;
+
+ // Multiple values were found, so transfer the work list to the LiveIn array
+  // where updateSSA() will use it as a work list.
+ LiveIn.reserve(WorkList.size());
+ for (unsigned BN : WorkList) {
+ MachineBasicBlock *MBB = MF->getBlockNumbered(BN);
+ if (!Undefs.empty() &&
+ !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry))
+ continue;
+ addLiveInBlock(LR, DomTree->getNode(MBB));
+ if (MBB == &UseMBB)
+ LiveIn.back().Kill = Use;
+ }
+
+ return false;
+}
+
+// This is essentially the same iterative algorithm that SSAUpdater uses,
+// except we already have a dominator tree, so we don't have to recompute it.
+void LiveRangeCalc::updateSSA() {
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+
+  // Iterate until convergence.
+ bool Changed;
+ do {
+ Changed = false;
+ // Propagate live-out values down the dominator tree, inserting phi-defs
+ // when necessary.
+ for (LiveInBlock &I : LiveIn) {
+ MachineDomTreeNode *Node = I.DomNode;
+ // Skip block if the live-in value has already been determined.
+ if (!Node)
+ continue;
+ MachineBasicBlock *MBB = Node->getBlock();
+ MachineDomTreeNode *IDom = Node->getIDom();
+ LiveOutPair IDomValue;
+
+ // We need a live-in value to a block with no immediate dominator?
+ // This is probably an unreachable block that has survived somehow.
+ bool needPHI = !IDom || !Seen.test(IDom->getBlock()->getNumber());
+
+ // IDom dominates all of our predecessors, but it may not be their
+ // immediate dominator. Check if any of them have live-out values that are
+ // properly dominated by IDom. If so, we need a phi-def here.
+ if (!needPHI) {
+ IDomValue = Map[IDom->getBlock()];
+
+ // Cache the DomTree node that defined the value.
+ if (IDomValue.first && IDomValue.first != &UndefVNI &&
+ !IDomValue.second) {
+ Map[IDom->getBlock()].second = IDomValue.second =
+ DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def));
+ }
+
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ LiveOutPair &Value = Map[Pred];
+ if (!Value.first || Value.first == IDomValue.first)
+ continue;
+ if (Value.first == &UndefVNI) {
+ needPHI = true;
+ break;
+ }
+
+ // Cache the DomTree node that defined the value.
+ if (!Value.second)
+ Value.second =
+ DomTree->getNode(Indexes->getMBBFromIndex(Value.first->def));
+
+ // This predecessor is carrying something other than IDomValue.
+ // It could be because IDomValue hasn't propagated yet, or it could be
+ // because MBB is in the dominance frontier of that value.
+ if (DomTree->dominates(IDom, Value.second)) {
+ needPHI = true;
+ break;
+ }
+ }
+ }
+
+ // The value may be live-through even if Kill is set, as can happen when
+ // we are called from extendRange. In that case LiveOutSeen is true, and
+ // LiveOut indicates a foreign or missing value.
+ LiveOutPair &LOP = Map[MBB];
+
+ // Create a phi-def if required.
+ if (needPHI) {
+ Changed = true;
+ assert(Alloc && "Need VNInfo allocator to create PHI-defs");
+ SlotIndex Start, End;
+ std::tie(Start, End) = Indexes->getMBBRange(MBB);
+ LiveRange &LR = I.LR;
+ VNInfo *VNI = LR.getNextValue(Start, *Alloc);
+ I.Value = VNI;
+ // This block is done, we know the final value.
+ I.DomNode = nullptr;
+
+ // Add liveness since updateFromLiveIns now skips this node.
+ if (I.Kill.isValid()) {
+ if (VNI)
+ LR.addSegment(LiveInterval::Segment(Start, I.Kill, VNI));
+ } else {
+ if (VNI)
+ LR.addSegment(LiveInterval::Segment(Start, End, VNI));
+ LOP = LiveOutPair(VNI, Node);
+ }
+ } else if (IDomValue.first && IDomValue.first != &UndefVNI) {
+ // No phi-def here. Remember incoming value.
+ I.Value = IDomValue.first;
+
+ // If the IDomValue is killed in the block, don't propagate through.
+ if (I.Kill.isValid())
+ continue;
+
+ // Propagate IDomValue if it isn't killed:
+ // MBB is live-out and doesn't define its own value.
+ if (LOP.first == IDomValue.first)
+ continue;
+ Changed = true;
+ LOP = IDomValue;
+ }
+ }
+ } while (Changed);
+}
+
+bool LiveRangeCalc::isJointlyDominated(const MachineBasicBlock *MBB,
+ ArrayRef<SlotIndex> Defs,
+ const SlotIndexes &Indexes) {
+ const MachineFunction &MF = *MBB->getParent();
+ BitVector DefBlocks(MF.getNumBlockIDs());
+ for (SlotIndex I : Defs)
+ DefBlocks.set(Indexes.getMBBFromIndex(I)->getNumber());
+
+ SetVector<unsigned> PredQueue;
+ PredQueue.insert(MBB->getNumber());
+ for (unsigned i = 0; i != PredQueue.size(); ++i) {
+ unsigned BN = PredQueue[i];
+ if (DefBlocks[BN])
+ return true;
+ const MachineBasicBlock *B = MF.getBlockNumbered(BN);
+ for (const MachineBasicBlock *P : B->predecessors())
+ PredQueue.insert(P->getNumber());
+ }
+ return false;
+}
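+
+// Usage sketch (illustrative; UseMBB, DefIdxs and Indexes assumed in scope):
+// this predicate is typically used for verification-style checks, e.g.
+//
+//   assert(LiveRangeCalc::isJointlyDominated(UseMBB, DefIdxs, *Indexes) &&
+//          "Use not jointly dominated by defs");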
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp
new file mode 100644
index 000000000000..c3477cd8ce34
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -0,0 +1,508 @@
+//===-- LiveRangeEdit.cpp - Basic tools for editing a register live range -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeEdit class represents changes done to a virtual register when it
+// is spilled or split.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumDCEDeleted, "Number of instructions deleted by DCE");
+STATISTIC(NumDCEFoldedLoads, "Number of single use loads folded after DCE");
+STATISTIC(NumFracRanges, "Number of live ranges fractured by DCE");
+STATISTIC(NumReMaterialization, "Number of instructions rematerialized");
+
+void LiveRangeEdit::Delegate::anchor() { }
+
+LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(Register OldReg,
+ bool createSubRanges) {
+ Register VReg = MRI.cloneVirtualRegister(OldReg);
+ if (VRM)
+ VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
+
+ LiveInterval &LI = LIS.createEmptyInterval(VReg);
+ if (Parent && !Parent->isSpillable())
+ LI.markNotSpillable();
+ if (createSubRanges) {
+    // Create empty subranges if OldReg's interval has them. Do not create
+    // the main range here; it will be constructed later, after the subranges
+    // have been finalized.
+ LiveInterval &OldLI = LIS.getInterval(OldReg);
+ VNInfo::Allocator &Alloc = LIS.getVNInfoAllocator();
+ for (LiveInterval::SubRange &S : OldLI.subranges())
+ LI.createSubRange(Alloc, S.LaneMask);
+ }
+ return LI;
+}
+
+Register LiveRangeEdit::createFrom(Register OldReg) {
+ Register VReg = MRI.cloneVirtualRegister(OldReg);
+ if (VRM) {
+ VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
+ }
+ // FIXME: Getting the interval here actually computes it.
+ // In theory, this may not be what we want, but in practice
+ // the createEmptyIntervalFrom API is used when this is not
+ // the case. Generally speaking we just want to annotate the
+ // LiveInterval when it gets created but we cannot do that at
+ // the moment.
+ if (Parent && !Parent->isSpillable())
+ LIS.getInterval(VReg).markNotSpillable();
+ return VReg;
+}
+
+bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
+ const MachineInstr *DefMI) {
+ assert(DefMI && "Missing instruction");
+ ScannedRemattable = true;
+ if (!TII.isTriviallyReMaterializable(*DefMI))
+ return false;
+ Remattable.insert(VNI);
+ return true;
+}
+
+void LiveRangeEdit::scanRemattable() {
+ for (VNInfo *VNI : getParent().valnos) {
+ if (VNI->isUnused())
+ continue;
+ Register Original = VRM->getOriginal(getReg());
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
+ if (!OrigVNI)
+ continue;
+ MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def);
+ if (!DefMI)
+ continue;
+ checkRematerializable(OrigVNI, DefMI);
+ }
+ ScannedRemattable = true;
+}
+
+bool LiveRangeEdit::anyRematerializable() {
+ if (!ScannedRemattable)
+ scanRemattable();
+ return !Remattable.empty();
+}
+
+/// allUsesAvailableAt - Return true if all registers used by OrigMI at
+/// OrigIdx are also available with the same value at UseIdx.
+bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
+ SlotIndex OrigIdx,
+ SlotIndex UseIdx) const {
+ OrigIdx = OrigIdx.getRegSlot(true);
+ UseIdx = std::max(UseIdx, UseIdx.getRegSlot(true));
+ for (const MachineOperand &MO : OrigMI->operands()) {
+ if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
+ continue;
+
+    // We can't remat physreg uses, unless it is a constant or the target
+    // wants to ignore this use.
+ if (MO.getReg().isPhysical()) {
+ if (MRI.isConstantPhysReg(MO.getReg()) || TII.isIgnorableUse(MO))
+ continue;
+ return false;
+ }
+
+ LiveInterval &li = LIS.getInterval(MO.getReg());
+ const VNInfo *OVNI = li.getVNInfoAt(OrigIdx);
+ if (!OVNI)
+ continue;
+
+ // Don't allow rematerialization immediately after the original def.
+ // It would be incorrect if OrigMI redefines the register.
+ // See PR14098.
+ if (SlotIndex::isSameInstr(OrigIdx, UseIdx))
+ return false;
+
+ if (OVNI != li.getVNInfoAt(UseIdx))
+ return false;
+
+ // Check that subrange is live at UseIdx.
+ if (li.hasSubRanges()) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+ : MRI.getMaxLaneMaskForVReg(MO.getReg());
+ for (LiveInterval::SubRange &SR : li.subranges()) {
+ if ((SR.LaneMask & LM).none())
+ continue;
+ if (!SR.liveAt(UseIdx))
+ return false;
+ // Early exit if all used lanes are checked. No need to continue.
+ LM &= ~SR.LaneMask;
+ if (LM.none())
+ break;
+ }
+ }
+ }
+ return true;
+}
+
+bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI,
+ SlotIndex UseIdx, bool cheapAsAMove) {
+ assert(ScannedRemattable && "Call anyRematerializable first");
+
+ // Use scanRemattable info.
+ if (!Remattable.count(OrigVNI))
+ return false;
+
+ // No defining instruction provided.
+ SlotIndex DefIdx;
+ assert(RM.OrigMI && "No defining instruction for remattable value");
+ DefIdx = LIS.getInstructionIndex(*RM.OrigMI);
+
+ // If only cheap remats were requested, bail out early.
+ if (cheapAsAMove && !TII.isAsCheapAsAMove(*RM.OrigMI))
+ return false;
+
+ // Verify that all used registers are available with the same values.
+ if (!allUsesAvailableAt(RM.OrigMI, DefIdx, UseIdx))
+ return false;
+
+ return true;
+}
+
+SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register DestReg, const Remat &RM,
+ const TargetRegisterInfo &tri,
+ bool Late, unsigned SubIdx,
+ MachineInstr *ReplaceIndexMI) {
+ assert(RM.OrigMI && "Invalid remat");
+ TII.reMaterialize(MBB, MI, DestReg, SubIdx, *RM.OrigMI, tri);
+ // DestReg of the cloned instruction cannot be Dead. Set isDead of DestReg
+ // to false anyway in case the isDead flag of RM.OrigMI's dest register
+ // is true.
+ (*--MI).getOperand(0).setIsDead(false);
+ Rematted.insert(RM.ParentVNI);
+ ++NumReMaterialization;
+
+ if (ReplaceIndexMI)
+ return LIS.ReplaceMachineInstrInMaps(*ReplaceIndexMI, *MI).getRegSlot();
+ return LIS.getSlotIndexes()->insertMachineInstrInMaps(*MI, Late).getRegSlot();
+}
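+
+// Usage sketch (illustrative; Edit, ParentVNI, OrigVNI, DefMI, UseMI, UseIdx,
+// NewVReg and TRI assumed in scope): the typical query/insert flow a spiller
+// follows with this API:
+//
+//   if (Edit.anyRematerializable()) {
+//     LiveRangeEdit::Remat RM(ParentVNI);
+//     RM.OrigMI = DefMI;                                 // defining instruction
+//     if (Edit.canRematerializeAt(RM, OrigVNI, UseIdx, /*cheapAsAMove=*/false))
+//       Edit.rematerializeAt(*UseMI->getParent(), UseMI->getIterator(), NewVReg,
+//                            RM, *TRI);
+//   }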
+
+void LiveRangeEdit::eraseVirtReg(Register Reg) {
+ if (TheDelegate && TheDelegate->LRE_CanEraseVirtReg(Reg))
+ LIS.removeInterval(Reg);
+}
+
+bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
+ SmallVectorImpl<MachineInstr*> &Dead) {
+ MachineInstr *DefMI = nullptr, *UseMI = nullptr;
+
+ // Check that there is a single def and a single use.
+ for (MachineOperand &MO : MRI.reg_nodbg_operands(LI->reg())) {
+ MachineInstr *MI = MO.getParent();
+ if (MO.isDef()) {
+ if (DefMI && DefMI != MI)
+ return false;
+ if (!MI->canFoldAsLoad())
+ return false;
+ DefMI = MI;
+ } else if (!MO.isUndef()) {
+ if (UseMI && UseMI != MI)
+ return false;
+ // FIXME: Targets don't know how to fold subreg uses.
+ if (MO.getSubReg())
+ return false;
+ UseMI = MI;
+ }
+ }
+ if (!DefMI || !UseMI)
+ return false;
+
+ // Since we're moving the DefMI load, make sure we're not extending any live
+ // ranges.
+ if (!allUsesAvailableAt(DefMI, LIS.getInstructionIndex(*DefMI),
+ LIS.getInstructionIndex(*UseMI)))
+ return false;
+
+ // We also need to make sure it is safe to move the load.
+ // Assume there are stores between DefMI and UseMI.
+ bool SawStore = true;
+ if (!DefMI->isSafeToMove(nullptr, SawStore))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Try to fold single def: " << *DefMI
+ << " into single use: " << *UseMI);
+
+ SmallVector<unsigned, 8> Ops;
+ if (UseMI->readsWritesVirtualRegister(LI->reg(), &Ops).second)
+ return false;
+
+ MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS);
+ if (!FoldMI)
+ return false;
+ LLVM_DEBUG(dbgs() << " folded: " << *FoldMI);
+ LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
+ // Update the call site info.
+ if (UseMI->shouldUpdateCallSiteInfo())
+ UseMI->getMF()->moveCallSiteInfo(UseMI, FoldMI);
+ UseMI->eraseFromParent();
+ DefMI->addRegisterDead(LI->reg(), nullptr);
+ Dead.push_back(DefMI);
+ ++NumDCEFoldedLoads;
+ return true;
+}
+
+bool LiveRangeEdit::useIsKill(const LiveInterval &LI,
+ const MachineOperand &MO) const {
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+ if (LI.Query(Idx).isKill())
+ return true;
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubReg);
+ for (const LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & LaneMask).any() && S.Query(Idx).isKill())
+ return true;
+ }
+ return false;
+}
+
+/// Find all live intervals that need to shrink, then remove the instruction.
+void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
+ assert(MI->allDefsAreDead() && "Def isn't really dead");
+ SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
+
+ // Never delete a bundled instruction.
+ if (MI->isBundled()) {
+ // TODO: Handle deleting copy bundles
+ LLVM_DEBUG(dbgs() << "Won't delete dead bundled inst: " << Idx << '\t'
+ << *MI);
+ return;
+ }
+
+ // Never delete inline asm.
+ if (MI->isInlineAsm()) {
+ LLVM_DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI);
+ return;
+ }
+
+ // Use the same criteria as DeadMachineInstructionElim.
+ bool SawStore = false;
+ if (!MI->isSafeToMove(nullptr, SawStore)) {
+ LLVM_DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI);
+
+ // Collect virtual registers to be erased after MI is gone.
+ SmallVector<Register, 8> RegsToErase;
+ bool ReadsPhysRegs = false;
+ bool isOrigDef = false;
+ Register Dest;
+ unsigned DestSubReg;
+  // Only optimize the rematerialization case when the instruction has one def,
+  // since otherwise we could leave some dead defs in the code. This case is
+  // extremely rare.
+ if (VRM && MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
+ MI->getDesc().getNumDefs() == 1) {
+ Dest = MI->getOperand(0).getReg();
+ DestSubReg = MI->getOperand(0).getSubReg();
+ Register Original = VRM->getOriginal(Dest);
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+ // The original live-range may have been shrunk to
+ // an empty live-range. It happens when it is dead, but
+ // we still keep it around to be able to rematerialize
+ // other values that depend on it.
+ if (OrigVNI)
+ isOrigDef = SlotIndex::isSameInstr(OrigVNI->def, Idx);
+ }
+
+ bool HasLiveVRegUses = false;
+
+ // Check for live intervals that may shrink
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual()) {
+ // Check if MI reads any unreserved physregs.
+ if (Reg && MO.readsReg() && !MRI.isReserved(Reg))
+ ReadsPhysRegs = true;
+ else if (MO.isDef())
+ LIS.removePhysRegDefAt(Reg.asMCReg(), Idx);
+ continue;
+ }
+ LiveInterval &LI = LIS.getInterval(Reg);
+
+ // Shrink read registers, unless it is likely to be expensive and
+ // unlikely to change anything. We typically don't want to shrink the
+ // PIC base register that has lots of uses everywhere.
+ // Always shrink COPY uses that probably come from live range splitting.
+ if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MO.isDef())) ||
+ (MO.readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, MO))))
+ ToShrink.insert(&LI);
+ else if (MO.readsReg())
+ HasLiveVRegUses = true;
+
+ // Remove defined value.
+ if (MO.isDef()) {
+ if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
+ TheDelegate->LRE_WillShrinkVirtReg(LI.reg());
+ LIS.removeVRegDefAt(LI, Idx);
+ if (LI.empty())
+ RegsToErase.push_back(Reg);
+ }
+ }
+
+ // Currently, we don't support DCE of physreg live ranges. If MI reads
+ // any unreserved physregs, don't erase the instruction, but turn it into
+ // a KILL instead. This way, the physreg live ranges don't end up
+ // dangling.
+ // FIXME: It would be better to have something like shrinkToUses() for
+ // physregs. That could potentially enable more DCE and it would free up
+ // the physreg. It would not happen often, though.
+ if (ReadsPhysRegs) {
+ MI->setDesc(TII.get(TargetOpcode::KILL));
+ // Remove all operands that aren't physregs.
+ for (unsigned i = MI->getNumOperands(); i; --i) {
+ const MachineOperand &MO = MI->getOperand(i-1);
+ if (MO.isReg() && MO.getReg().isPhysical())
+ continue;
+ MI->removeOperand(i-1);
+ }
+ LLVM_DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
+ } else {
+ // If the dest of MI is an original reg and MI is reMaterializable,
+ // don't delete the inst. Replace the dest with a new reg, and keep
+ // the inst for remat of other siblings. The inst is saved in
+ // LiveRangeEdit::DeadRemats and will be deleted after all the
+ // allocations of the func are done.
+ // However, immediately delete instructions which have unshrunk virtual
+ // register uses. That may provoke RA to split an interval at the KILL
+ // and later result in an invalid live segment end.
+ if (isOrigDef && DeadRemats && !HasLiveVRegUses &&
+ TII.isTriviallyReMaterializable(*MI)) {
+ LiveInterval &NewLI = createEmptyIntervalFrom(Dest, false);
+ VNInfo::Allocator &Alloc = LIS.getVNInfoAllocator();
+ VNInfo *VNI = NewLI.getNextValue(Idx, Alloc);
+ NewLI.addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(), VNI));
+
+ if (DestSubReg) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ auto *SR = NewLI.createSubRange(
+ Alloc, TRI->getSubRegIndexLaneMask(DestSubReg));
+ SR->addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(),
+ SR->getNextValue(Idx, Alloc)));
+ }
+
+ pop_back();
+ DeadRemats->insert(MI);
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ MI->substituteRegister(Dest, NewLI.reg(), 0, TRI);
+ MI->getOperand(0).setIsDead(true);
+ } else {
+ if (TheDelegate)
+ TheDelegate->LRE_WillEraseInstruction(MI);
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ ++NumDCEDeleted;
+ }
+ }
+
+ // Erase any virtregs that are now empty and unused. There may be <undef>
+ // uses around. Keep the empty live range in that case.
+ for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) {
+ Register Reg = RegsToErase[i];
+ if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) {
+ ToShrink.remove(&LIS.getInterval(Reg));
+ eraseVirtReg(Reg);
+ }
+ }
+}
+
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+ ArrayRef<Register> RegsBeingSpilled) {
+ ToShrinkSet ToShrink;
+
+ for (;;) {
+ // Erase all dead defs.
+ while (!Dead.empty())
+ eliminateDeadDef(Dead.pop_back_val(), ToShrink);
+
+ if (ToShrink.empty())
+ break;
+
+ // Shrink just one live interval. Then delete new dead defs.
+ LiveInterval *LI = ToShrink.pop_back_val();
+ if (foldAsLoad(LI, Dead))
+ continue;
+ Register VReg = LI->reg();
+ if (TheDelegate)
+ TheDelegate->LRE_WillShrinkVirtReg(VReg);
+ if (!LIS.shrinkToUses(LI, &Dead))
+ continue;
+
+ // Don't create new intervals for a register being spilled.
+    // The new intervals would have to be spilled anyway, so it's not worth it.
+    // Also, they currently aren't spilled, so creating them and not spilling
+    // them results in incorrect code.
+ if (llvm::is_contained(RegsBeingSpilled, VReg))
+ continue;
+
+ // LI may have been separated, create new intervals.
+ LI->RenumberValues();
+ SmallVector<LiveInterval*, 8> SplitLIs;
+ LIS.splitSeparateComponents(*LI, SplitLIs);
+ if (!SplitLIs.empty())
+ ++NumFracRanges;
+
+ Register Original = VRM ? VRM->getOriginal(VReg) : Register();
+ for (const LiveInterval *SplitLI : SplitLIs) {
+ // If LI is an original interval that hasn't been split yet, make the new
+ // intervals their own originals instead of referring to LI. The original
+ // interval must contain all the split products, and LI doesn't.
+ if (Original != VReg && Original != 0)
+ VRM->setIsSplitFromReg(SplitLI->reg(), Original);
+ if (TheDelegate)
+ TheDelegate->LRE_DidCloneVirtReg(SplitLI->reg(), VReg);
+ }
+ }
+}
+
+// Keep track of new virtual registers created via
+// MachineRegisterInfo::createVirtualRegister.
+void
+LiveRangeEdit::MRI_NoteNewVirtualRegister(Register VReg) {
+ if (VRM)
+ VRM->grow();
+
+ NewRegs.push_back(VReg);
+}
+
+void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
+ VirtRegAuxInfo &VRAI) {
+ for (unsigned I = 0, Size = size(); I < Size; ++I) {
+ LiveInterval &LI = LIS.getInterval(get(I));
+ if (MRI.recomputeRegClass(LI.reg()))
+ LLVM_DEBUG({
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ dbgs() << "Inflated " << printReg(LI.reg()) << " to "
+ << TRI->getRegClassName(MRI.getRegClass(LI.reg())) << '\n';
+ });
+ VRAI.calculateSpillWeightAndHint(LI);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeShrink.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeShrink.cpp
new file mode 100644
index 000000000000..93f5314539cd
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -0,0 +1,245 @@
+//===- LiveRangeShrink.cpp - Move instructions to shrink live range -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+///===---------------------------------------------------------------------===//
+///
+/// \file
+/// This pass moves instructions close to the definitions of their operands to
+/// shrink the live range of the def instruction. The code motion is limited to
+/// the basic block. The moved instruction should have a single def and more
+/// than one use, each of which is the only use of its def.
+///
+///===---------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lrshrink"
+
+STATISTIC(NumInstrsHoistedToShrinkLiveRange,
+          "Number of instructions hoisted to shrink live range.");
+
+namespace {
+
+class LiveRangeShrink : public MachineFunctionPass {
+public:
+ static char ID;
+
+ LiveRangeShrink() : MachineFunctionPass(ID) {
+ initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Live Range Shrink"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char LiveRangeShrink::ID = 0;
+
+char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
+
+INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
+ false)
+
+using InstOrderMap = DenseMap<MachineInstr *, unsigned>;
+
+/// Returns \p New if it is dominated by \p Old; otherwise returns \p Old.
+/// \p M maps each instruction to a dominating-order number such that
+/// M[A] > M[B] implies that A is dominated by B.
+/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return
+/// \p New.
+static MachineInstr *FindDominatedInstruction(MachineInstr &New,
+ MachineInstr *Old,
+ const InstOrderMap &M) {
+ auto NewIter = M.find(&New);
+ if (NewIter == M.end())
+ return Old;
+ if (Old == nullptr)
+ return &New;
+ unsigned OrderOld = M.find(Old)->second;
+ unsigned OrderNew = NewIter->second;
+ if (OrderOld != OrderNew)
+ return OrderOld < OrderNew ? &New : Old;
+  // OrderOld == OrderNew, so we need to iterate down from Old to see if it
+  // can reach New; if it can, New is dominated by Old.
+ for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
+ I = I->getNextNode())
+ if (I == &New)
+ return &New;
+ return Old;
+}
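+
+// Worked example (assumed map contents): with M = {A: 0, B: 1} and A preceding
+// B in the block, FindDominatedInstruction(B, A, M) returns B, because
+// M[A] < M[B] means B is dominated by A; when the orders are equal, the
+// forward scan from Old decides.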
+
+/// Builds Instruction to its dominating order number map \p M by traversing
+/// from instruction \p Start.
+static void BuildInstOrderMap(MachineBasicBlock::iterator Start,
+ InstOrderMap &M) {
+ M.clear();
+ unsigned i = 0;
+ for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
+ M[&I] = i++;
+}
+
+bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+ InstOrderMap IOM;
+  // Map from register to the instruction order (value of IOM) at which the
+  // register is last used. When moving an instruction up, we need to make
+  // sure that none of its defs (including dead defs) cross the last use of
+  // the corresponding register.
+ DenseMap<unsigned, std::pair<unsigned, MachineInstr *>> UseMap;
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+ bool SawStore = false;
+ BuildInstOrderMap(MBB.begin(), IOM);
+ UseMap.clear();
+
+ for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+ MachineInstr &MI = *Next;
+ ++Next;
+ if (MI.isPHI() || MI.isDebugOrPseudoInstr())
+ continue;
+ if (MI.mayStore())
+ SawStore = true;
+
+ unsigned CurrentOrder = IOM[&MI];
+ unsigned Barrier = 0;
+ MachineInstr *BarrierMI = nullptr;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDebug())
+ continue;
+ if (MO.isUse())
+ UseMap[MO.getReg()] = std::make_pair(CurrentOrder, &MI);
+ else if (MO.isDead() && UseMap.count(MO.getReg()))
+          // Barrier is the last instruction where MO gets used. MI should not
+ // be moved above Barrier.
+ if (Barrier < UseMap[MO.getReg()].first) {
+ Barrier = UseMap[MO.getReg()].first;
+ BarrierMI = UseMap[MO.getReg()].second;
+ }
+ }
+
+ if (!MI.isSafeToMove(nullptr, SawStore)) {
+ // If MI has side effects, it should become a barrier for code motion.
+        // IOM is rebuilt from the next instruction to prevent later
+ // instructions from being moved before this MI.
+ if (MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe() &&
+ Next != MBB.end()) {
+ BuildInstOrderMap(Next, IOM);
+ SawStore = false;
+ }
+ continue;
+ }
+
+ const MachineOperand *DefMO = nullptr;
+ MachineInstr *Insert = nullptr;
+
+      // Number of live-ranges that will be shortened. We do not count
+      // live-ranges that are defined by a COPY, as those could be coalesced
+      // later.
+ unsigned NumEligibleUse = 0;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDead() || MO.isDebug())
+ continue;
+ Register Reg = MO.getReg();
+        // Do not move the instruction if it defines or uses a physical
+        // register, unless it is a constant physical register or NoRegister.
+ if (!Reg.isVirtual()) {
+ if (!Reg || MRI.isConstantPhysReg(Reg))
+ continue;
+ Insert = nullptr;
+ break;
+ }
+ if (MO.isDef()) {
+ // Do not move if there is more than one def.
+ if (DefMO) {
+ Insert = nullptr;
+ break;
+ }
+ DefMO = &MO;
+ } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg) && DefMO &&
+ MRI.getRegClass(DefMO->getReg()) ==
+ MRI.getRegClass(MO.getReg())) {
+ // The heuristic does not handle different register classes yet
+ // (registers of different sizes, looser/tighter constraints). This
+          // is because it needs a more accurate model to handle register
+ // pressure correctly.
+ MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
+ if (!DefInstr.isCopy())
+ NumEligibleUse++;
+ Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
+ } else {
+ Insert = nullptr;
+ break;
+ }
+ }
+
+      // If Barrier equals IOM[I], traverse forward to find whether BarrierMI
+      // is after Insert; if so, we should not hoist.
+ for (MachineInstr *I = Insert; I && IOM[I] == Barrier;
+ I = I->getNextNode())
+ if (I == BarrierMI) {
+ Insert = nullptr;
+ break;
+ }
+ // Move the instruction when # of shrunk live range > 1.
+ if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
+ MachineBasicBlock::iterator I = std::next(Insert->getIterator());
+ // Skip all the PHI and debug instructions.
+ while (I != MBB.end() && (I->isPHI() || I->isDebugOrPseudoInstr()))
+ I = std::next(I);
+ if (I == MI.getIterator())
+ continue;
+
+ // Update the dominator order to be the same as the insertion point.
+        // We do this to maintain a non-decreasing order without needing to update
+ // all instruction orders after the insertion point.
+ unsigned NewOrder = IOM[&*I];
+ IOM[&MI] = NewOrder;
+ NumInstrsHoistedToShrinkLiveRange++;
+
+ // Find MI's debug value following MI.
+ MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
+ if (MI.getOperand(0).isReg())
+ for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
+ EndIter->hasDebugOperandForReg(MI.getOperand(0).getReg());
+ ++EndIter, ++Next)
+ IOM[&*EndIter] = NewOrder;
+ MBB.splice(I, &MBB, MI.getIterator(), EndIter);
+ }
+ }
+ }
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h
new file mode 100644
index 000000000000..ada5c5be484a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRangeUtils.h
@@ -0,0 +1,61 @@
+//===-- LiveRangeUtils.h - Live Range modification utilities ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file contains helper functions to modify live ranges.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVERANGEUTILS_H
+#define LLVM_LIB_CODEGEN_LIVERANGEUTILS_H
+
+#include "llvm/CodeGen/LiveInterval.h"
+
+namespace llvm {
+
+/// Helper function that distributes live range value numbers and the
+/// corresponding segments of a primary live range \p LR to a list of newly
+/// created live ranges \p SplitLRs. \p VNIClasses maps each value number in \p
+/// LR to 0 meaning it should stay or to 1..N meaning it should go to a specific
+/// live range in the \p SplitLRs array.
+template<typename LiveRangeT, typename EqClassesT>
+static void DistributeRange(LiveRangeT &LR, LiveRangeT *SplitLRs[],
+ EqClassesT VNIClasses) {
+ // Move segments to new intervals.
+ typename LiveRangeT::iterator J = LR.begin(), E = LR.end();
+ while (J != E && VNIClasses[J->valno->id] == 0)
+ ++J;
+ for (typename LiveRangeT::iterator I = J; I != E; ++I) {
+ if (unsigned eq = VNIClasses[I->valno->id]) {
+ assert((SplitLRs[eq-1]->empty() || SplitLRs[eq-1]->expiredAt(I->start)) &&
+ "New intervals should be empty");
+ SplitLRs[eq-1]->segments.push_back(*I);
+ } else
+ *J++ = *I;
+ }
+ LR.segments.erase(J, E);
+
+ // Transfer VNInfos to their new owners and renumber them.
+ unsigned j = 0, e = LR.getNumValNums();
+ while (j != e && VNIClasses[j] == 0)
+ ++j;
+ for (unsigned i = j; i != e; ++i) {
+ VNInfo *VNI = LR.getValNumInfo(i);
+ if (unsigned eq = VNIClasses[i]) {
+ VNI->id = SplitLRs[eq-1]->getNumValNums();
+ SplitLRs[eq-1]->valnos.push_back(VNI);
+ } else {
+ VNI->id = j;
+ LR.valnos[j++] = VNI;
+ }
+ }
+ LR.valnos.resize(j);
+}
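+
+// Worked example (hypothetical input): with VNIClasses = {0, 1, 1, 2}, value
+// numbers 1 and 2 (and their segments) move to SplitLRs[0], value number 3
+// moves to SplitLRs[1], and value number 0 stays in LR; ids are then
+// renumbered densely within each resulting range.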
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveRegMatrix.cpp
new file mode 100644
index 000000000000..6df7e5c10862
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -0,0 +1,248 @@
+//===- LiveRegMatrix.cpp - Track register interference --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the LiveRegMatrix analysis pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumAssigned , "Number of registers assigned");
+STATISTIC(NumUnassigned , "Number of registers unassigned");
+
+char LiveRegMatrix::ID = 0;
+INITIALIZE_PASS_BEGIN(LiveRegMatrix, "liveregmatrix",
+ "Live Register Matrix", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",
+ "Live Register Matrix", false, false)
+
+LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID) {}
+
+void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<LiveIntervals>();
+ AU.addRequiredTransitive<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) {
+ TRI = MF.getSubtarget().getRegisterInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ VRM = &getAnalysis<VirtRegMap>();
+
+ unsigned NumRegUnits = TRI->getNumRegUnits();
+ if (NumRegUnits != Matrix.size())
+ Queries.reset(new LiveIntervalUnion::Query[NumRegUnits]);
+ Matrix.init(LIUAlloc, NumRegUnits);
+
+ // Make sure no stale queries get reused.
+ invalidateVirtRegs();
+ return false;
+}
+
+void LiveRegMatrix::releaseMemory() {
+ for (unsigned i = 0, e = Matrix.size(); i != e; ++i) {
+ Matrix[i].clear();
+    // No need to clear Queries here, since LiveIntervalUnion::Query doesn't
+    // have anything important to clear and LiveRegMatrix's
+    // runOnMachineFunction() does a std::unique_ptr::reset anyway.
+ }
+}
+
+template <typename Callable>
+static bool foreachUnit(const TargetRegisterInfo *TRI,
+ const LiveInterval &VRegInterval, MCRegister PhysReg,
+ Callable Func) {
+ if (VRegInterval.hasSubRanges()) {
+ for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ unsigned Unit = (*Units).first;
+ LaneBitmask Mask = (*Units).second;
+ for (const LiveInterval::SubRange &S : VRegInterval.subranges()) {
+ if ((S.LaneMask & Mask).any()) {
+ if (Func(Unit, S))
+ return true;
+ break;
+ }
+ }
+ }
+ } else {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ if (Func(Unit, VRegInterval))
+ return true;
+ }
+ }
+ return false;
+}
+
+void LiveRegMatrix::assign(const LiveInterval &VirtReg, MCRegister PhysReg) {
+ LLVM_DEBUG(dbgs() << "assigning " << printReg(VirtReg.reg(), TRI) << " to "
+ << printReg(PhysReg, TRI) << ':');
+ assert(!VRM->hasPhys(VirtReg.reg()) && "Duplicate VirtReg assignment");
+ VRM->assignVirt2Phys(VirtReg.reg(), PhysReg);
+
+ foreachUnit(
+ TRI, VirtReg, PhysReg, [&](unsigned Unit, const LiveRange &Range) {
+ LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI) << ' ' << Range);
+ Matrix[Unit].unify(VirtReg, Range);
+ return false;
+ });
+
+ ++NumAssigned;
+ LLVM_DEBUG(dbgs() << '\n');
+}
+
+void LiveRegMatrix::unassign(const LiveInterval &VirtReg) {
+ Register PhysReg = VRM->getPhys(VirtReg.reg());
+ LLVM_DEBUG(dbgs() << "unassigning " << printReg(VirtReg.reg(), TRI)
+ << " from " << printReg(PhysReg, TRI) << ':');
+ VRM->clearVirt(VirtReg.reg());
+
+ foreachUnit(TRI, VirtReg, PhysReg,
+ [&](unsigned Unit, const LiveRange &Range) {
+ LLVM_DEBUG(dbgs() << ' ' << printRegUnit(Unit, TRI));
+ Matrix[Unit].extract(VirtReg, Range);
+ return false;
+ });
+
+ ++NumUnassigned;
+ LLVM_DEBUG(dbgs() << '\n');
+}
+
+bool LiveRegMatrix::isPhysRegUsed(MCRegister PhysReg) const {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ if (!Matrix[Unit].empty())
+ return true;
+ }
+ return false;
+}
+
+bool LiveRegMatrix::checkRegMaskInterference(const LiveInterval &VirtReg,
+ MCRegister PhysReg) {
+ // Check if the cached information is valid.
+ // The same BitVector can be reused for all PhysRegs.
+ // We could cache multiple VirtRegs if it becomes necessary.
+ if (RegMaskVirtReg != VirtReg.reg() || RegMaskTag != UserTag) {
+ RegMaskVirtReg = VirtReg.reg();
+ RegMaskTag = UserTag;
+ RegMaskUsable.clear();
+ LIS->checkRegMaskInterference(VirtReg, RegMaskUsable);
+ }
+
+ // The BitVector is indexed by PhysReg, not register unit.
+ // Regmask interference is more fine grained than regunits.
+ // For example, a Win64 call can clobber %ymm8 yet preserve %xmm8.
+ return !RegMaskUsable.empty() && (!PhysReg || !RegMaskUsable.test(PhysReg));
+}
+
+bool LiveRegMatrix::checkRegUnitInterference(const LiveInterval &VirtReg,
+ MCRegister PhysReg) {
+ if (VirtReg.empty())
+ return false;
+ CoalescerPair CP(VirtReg.reg(), PhysReg, *TRI);
+
+ bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+ const LiveRange &Range) {
+ const LiveRange &UnitRange = LIS->getRegUnit(Unit);
+ return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes());
+ });
+ return Result;
+}
+
+LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR,
+ MCRegister RegUnit) {
+ LiveIntervalUnion::Query &Q = Queries[RegUnit];
+ Q.init(UserTag, LR, Matrix[RegUnit]);
+ return Q;
+}
+
+LiveRegMatrix::InterferenceKind
+LiveRegMatrix::checkInterference(const LiveInterval &VirtReg,
+ MCRegister PhysReg) {
+ if (VirtReg.empty())
+ return IK_Free;
+
+ // Regmask interference is the fastest check.
+ if (checkRegMaskInterference(VirtReg, PhysReg))
+ return IK_RegMask;
+
+ // Check for fixed interference.
+ if (checkRegUnitInterference(VirtReg, PhysReg))
+ return IK_RegUnit;
+
+ // Check the matrix for virtual register interference.
+ bool Interference = foreachUnit(TRI, VirtReg, PhysReg,
+ [&](MCRegister Unit, const LiveRange &LR) {
+ return query(LR, Unit).checkInterference();
+ });
+ if (Interference)
+ return IK_VirtReg;
+
+ return IK_Free;
+}
+
+bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
+ MCRegister PhysReg) {
+ // Construct artificial live range containing only one segment [Start, End).
+ VNInfo valno(0, Start);
+ LiveRange::Segment Seg(Start, End, &valno);
+ LiveRange LR;
+ LR.addSegment(Seg);
+
+ // Check for interference with that segment
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ // LR is stack-allocated. LiveRegMatrix caches queries by a key that
+ // includes the address of the live range. If (for the same reg unit) this
+ // checkInterference overload is called twice, without any other query()
+ // calls in between (on heap-allocated LiveRanges) - which would invalidate
+ // the cached query - the LR address seen the second time may well be the
+ // same as that seen the first time, while the Start/End/valno may not - yet
+ // the same cached result would be fetched. To avoid that, we don't cache
+ // this query.
+ //
+ // FIXME: the usability of the Query API needs to be improved to avoid
+ // subtle bugs due to query identity. Avoiding caching, for example, would
+ // greatly simplify things.
+ LiveIntervalUnion::Query Q;
+ Q.reset(UserTag, LR, Matrix[Unit]);
+ if (Q.checkInterference())
+ return true;
+ }
+ return false;
+}
+
+Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const {
+ const LiveInterval *VRegInterval = nullptr;
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ if ((VRegInterval = Matrix[Unit].getOneVReg()))
+ return VRegInterval->reg();
+ }
+
+ return MCRegister::NoRegister;
+}
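The LiveRegMatrix interface above is normally consulted by a register allocator before it commits an assignment. The following sketch is not part of the patch; it only shows the intended call pattern using the entry points defined above, and tryAssignHint is an illustrative name.

#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
using namespace llvm;

// Hypothetical helper: probe the matrix for a hinted physical register and
// commit the assignment only when the register is completely free.
static bool tryAssignHint(LiveRegMatrix &Matrix, const LiveInterval &VirtReg,
                          MCRegister Hint) {
  switch (Matrix.checkInterference(VirtReg, Hint)) {
  case LiveRegMatrix::IK_Free:
    Matrix.assign(VirtReg, Hint); // also records the mapping in VirtRegMap
    return true;
  case LiveRegMatrix::IK_VirtReg:
    // Only virtual registers are in the way; a real allocator could consider
    // evicting them (unassign() above undoes an assignment).
    return false;
  default:
    // IK_RegUnit or IK_RegMask: fixed interference, nothing to evict.
    return false;
  }
}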
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveRegUnits.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveRegUnits.cpp
new file mode 100644
index 000000000000..34de09dd2944
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveRegUnits.cpp
@@ -0,0 +1,159 @@
+//===- LiveRegUnits.cpp - Register Unit Set -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the LiveRegUnits set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) {
+ for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+ for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+ if (MachineOperand::clobbersPhysReg(RegMask, *RootReg)) {
+ Units.reset(U);
+ break;
+ }
+ }
+ }
+}
+
+void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) {
+ for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+ for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+ if (MachineOperand::clobbersPhysReg(RegMask, *RootReg)) {
+ Units.set(U);
+ break;
+ }
+ }
+ }
+}
+
+void LiveRegUnits::stepBackward(const MachineInstr &MI) {
+ // Remove defined registers and regmask kills from the set.
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (MOP.isReg()) {
+ if (MOP.isDef() && MOP.getReg().isPhysical())
+ removeReg(MOP.getReg());
+ continue;
+ }
+
+ if (MOP.isRegMask()) {
+ removeRegsNotPreserved(MOP.getRegMask());
+ continue;
+ }
+ }
+
+ // Add uses to the set.
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (!MOP.isReg() || !MOP.readsReg())
+ continue;
+
+ if (MOP.getReg().isPhysical())
+ addReg(MOP.getReg());
+ }
+}
+
+void LiveRegUnits::accumulate(const MachineInstr &MI) {
+ // Add defs, uses and regmask clobbers to the set.
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (MOP.isReg()) {
+ if (!MOP.getReg().isPhysical())
+ continue;
+ if (MOP.isDef() || MOP.readsReg())
+ addReg(MOP.getReg());
+ continue;
+ }
+
+ if (MOP.isRegMask()) {
+ addRegsInMask(MOP.getRegMask());
+ continue;
+ }
+ }
+}
+
+/// Add live-in registers of basic block \p MBB to \p LiveUnits.
+static void addBlockLiveIns(LiveRegUnits &LiveUnits,
+ const MachineBasicBlock &MBB) {
+ for (const auto &LI : MBB.liveins())
+ LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask);
+}
+
+/// Adds all callee saved registers to \p LiveUnits.
+static void addCalleeSavedRegs(LiveRegUnits &LiveUnits,
+ const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR) {
+ const unsigned N = *CSR;
+
+ const auto &CSI = MFI.getCalleeSavedInfo();
+ auto Info =
+ llvm::find_if(CSI, [N](auto Info) { return Info.getReg() == N; });
+    // If we have no info for this callee-saved register, assume it is live-out.
+ if (Info == CSI.end() || Info->isRestored())
+ LiveUnits.addReg(N);
+ }
+}
+
+void LiveRegUnits::addPristines(const MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid())
+ return;
+ /// This function will usually be called on an empty object, handle this
+ /// as a special case.
+ if (empty()) {
+ /// Add all callee saved regs, then remove the ones that are saved and
+ /// restored.
+ addCalleeSavedRegs(*this, MF);
+ /// Remove the ones that are not saved/restored; they are pristine.
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ removeReg(Info.getReg());
+ return;
+ }
+ /// If a callee-saved register that is not pristine is already present
+ /// in the set, we should make sure that it stays in it. Precompute the
+ /// set of pristine registers in a separate object.
+ /// Add all callee saved regs, then remove the ones that are saved+restored.
+ LiveRegUnits Pristine(*TRI);
+ addCalleeSavedRegs(Pristine, MF);
+ /// Remove the ones that are not saved/restored; they are pristine.
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ Pristine.removeReg(Info.getReg());
+ addUnits(Pristine.getBitVector());
+}
+
+void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+
+ addPristines(MF);
+
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ addBlockLiveIns(*this, *Succ);
+
+ // For the return block: Add all callee saved registers.
+ if (MBB.isReturnBlock()) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid())
+ addCalleeSavedRegs(*this, MF);
+ }
+}
+
+void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ addPristines(MF);
+ addBlockLiveIns(*this, MBB);
+}
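LiveRegUnits is built for the kind of bottom-up scan that stepBackward() above supports. The sketch below is not part of the patch and assumes the declarations in LiveRegUnits.h, in particular LiveRegUnits::available(), which is not visible in this file; isPhysRegFreeBefore is an illustrative name.

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Hypothetical helper: is PhysReg free immediately before MI?
static bool isPhysRegFreeBefore(const MachineInstr &MI, MCRegister PhysReg,
                                const TargetRegisterInfo &TRI) {
  const MachineBasicBlock &MBB = *MI.getParent();
  LiveRegUnits Units(TRI);
  Units.addLiveOuts(MBB);          // seed with live-outs and pristine CSRs
  for (const MachineInstr &I : reverse(MBB)) {
    if (&I == &MI)
      break;
    Units.stepBackward(I);         // remove defs, add uses, apply regmasks
  }
  return Units.available(PhysReg); // assumed header API: no unit of PhysReg set
}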
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveStacks.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveStacks.cpp
new file mode 100644
index 000000000000..8fc5a929d77b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveStacks.cpp
@@ -0,0 +1,85 @@
+//===-- LiveStacks.cpp - Live Stack Slot Analysis -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the live stack slot analysis pass. It is analogous to
+// live interval analysis except it's analyzing liveness of stack slots rather
+// than registers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "livestacks"
+
+char LiveStacks::ID = 0;
+INITIALIZE_PASS_BEGIN(LiveStacks, DEBUG_TYPE,
+ "Live Stack Slot Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(LiveStacks, DEBUG_TYPE,
+ "Live Stack Slot Analysis", false, false)
+
+char &llvm::LiveStacksID = LiveStacks::ID;
+
+void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequiredTransitive<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LiveStacks::releaseMemory() {
+ // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
+ VNInfoAllocator.Reset();
+ S2IMap.clear();
+ S2RCMap.clear();
+}
+
+bool LiveStacks::runOnMachineFunction(MachineFunction &MF) {
+ TRI = MF.getSubtarget().getRegisterInfo();
+ // FIXME: No analysis is being done right now. We are relying on the
+ // register allocators to provide the information.
+ return false;
+}
+
+LiveInterval &
+LiveStacks::getOrCreateInterval(int Slot, const TargetRegisterClass *RC) {
+  assert(Slot >= 0 && "Spill slot index must be >= 0");
+ SS2IntervalMap::iterator I = S2IMap.find(Slot);
+ if (I == S2IMap.end()) {
+ I = S2IMap
+ .emplace(
+ std::piecewise_construct, std::forward_as_tuple(Slot),
+ std::forward_as_tuple(Register::index2StackSlot(Slot), 0.0F))
+ .first;
+ S2RCMap.insert(std::make_pair(Slot, RC));
+ } else {
+ // Use the largest common subclass register class.
+ const TargetRegisterClass *OldRC = S2RCMap[Slot];
+ S2RCMap[Slot] = TRI->getCommonSubClass(OldRC, RC);
+ }
+ return I->second;
+}
+
+/// print - Implement the dump method.
+void LiveStacks::print(raw_ostream &OS, const Module*) const {
+
+ OS << "********** INTERVALS **********\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ I->second.print(OS);
+ int Slot = I->first;
+ const TargetRegisterClass *RC = getIntervalRegClass(Slot);
+ if (RC)
+ OS << " [" << TRI->getRegClassName(RC) << "]\n";
+ else
+ OS << " [Unknown]\n";
+ }
+}
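A short sketch of how a spiller-side client might drive the interface above; this is not part of the patch, and recordStackInterval is an illustrative name.

#include "llvm/CodeGen/LiveStacks.h"
using namespace llvm;

// Hypothetical helper: ensure a stack interval exists for Slot and record the
// register class of the values spilled into it. Calling this repeatedly with
// different classes narrows the stored class to the common subclass, exactly
// as getOrCreateInterval() above does.
static LiveInterval &recordStackInterval(LiveStacks &LSS, int Slot,
                                         const TargetRegisterClass *RC) {
  LiveInterval &SI = LSS.getOrCreateInterval(Slot, RC);
  // The caller (e.g. a spiller) would now add segments covering the slot's
  // live range; the pass itself computes nothing, per the FIXME above.
  return SI;
}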
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp
new file mode 100644
index 000000000000..9cd74689ba10
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LiveVariables.cpp
@@ -0,0 +1,888 @@
+//===-- LiveVariables.cpp - Live Variable Analysis for Machine Code -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveVariable analysis pass. For each machine
+// instruction in the function, this pass calculates the set of registers that
+// are immediately dead after the instruction (i.e., the instruction calculates
+// the value, but it is never used) and the set of registers that are used by
+// the instruction, but are never used after the instruction (i.e., they are
+// killed).
+//
+// This class computes live variables using a sparse implementation based on
+// the machine code SSA form. This class computes live variable information for
+// each virtual and _register allocatable_ physical register in a function. It
+// uses the dominance properties of SSA form to efficiently compute live
+// variables for virtual registers, and assumes that physical registers are only
+// live within a single basic block (allowing it to do a single local analysis
+// to resolve physical register lifetimes in each basic block). If a physical
+// register is not register allocatable, it is not tracked. This is useful for
+// things like the stack pointer and condition codes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+char LiveVariables::ID = 0;
+char &llvm::LiveVariablesID = LiveVariables::ID;
+INITIALIZE_PASS_BEGIN(LiveVariables, "livevars",
+ "Live Variable Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(UnreachableMachineBlockElim)
+INITIALIZE_PASS_END(LiveVariables, "livevars",
+ "Live Variable Analysis", false, false)
+
+
+void LiveVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(UnreachableMachineBlockElimID);
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *
+LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
+ for (MachineInstr *MI : Kills)
+ if (MI->getParent() == MBB)
+ return MI;
+ return nullptr;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
+ dbgs() << " Alive in blocks: ";
+ for (unsigned AB : AliveBlocks)
+ dbgs() << AB << ", ";
+ dbgs() << "\n Killed by:";
+ if (Kills.empty())
+ dbgs() << " No instructions.\n";
+ else {
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+ dbgs() << "\n #" << i << ": " << *Kills[i];
+ dbgs() << "\n";
+ }
+}
+#endif
+
+/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
+LiveVariables::VarInfo &LiveVariables::getVarInfo(Register Reg) {
+ assert(Reg.isVirtual() && "getVarInfo: not a virtual register!");
+ VirtRegInfo.grow(Reg);
+ return VirtRegInfo[Reg];
+}
+
+void LiveVariables::MarkVirtRegAliveInBlock(
+ VarInfo &VRInfo, MachineBasicBlock *DefBlock, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineBasicBlock *> &WorkList) {
+ unsigned BBNum = MBB->getNumber();
+
+ // Check to see if this basic block is one of the killing blocks. If so,
+ // remove it.
+ for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+ if (VRInfo.Kills[i]->getParent() == MBB) {
+ VRInfo.Kills.erase(VRInfo.Kills.begin()+i); // Erase entry
+ break;
+ }
+
+ if (MBB == DefBlock) return; // Terminate recursion
+
+ if (VRInfo.AliveBlocks.test(BBNum))
+ return; // We already know the block is live
+
+ // Mark the variable known alive in this bb
+ VRInfo.AliveBlocks.set(BBNum);
+
+ assert(MBB != &MF->front() && "Can't find reaching def for virtreg");
+ WorkList.insert(WorkList.end(), MBB->pred_rbegin(), MBB->pred_rend());
+}
+
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo &VRInfo,
+ MachineBasicBlock *DefBlock,
+ MachineBasicBlock *MBB) {
+ SmallVector<MachineBasicBlock *, 16> WorkList;
+ MarkVirtRegAliveInBlock(VRInfo, DefBlock, MBB, WorkList);
+
+ while (!WorkList.empty()) {
+ MachineBasicBlock *Pred = WorkList.pop_back_val();
+ MarkVirtRegAliveInBlock(VRInfo, DefBlock, Pred, WorkList);
+ }
+}
+
+void LiveVariables::HandleVirtRegUse(Register Reg, MachineBasicBlock *MBB,
+ MachineInstr &MI) {
+ assert(MRI->getVRegDef(Reg) && "Register use before def!");
+
+ unsigned BBNum = MBB->getNumber();
+
+ VarInfo &VRInfo = getVarInfo(Reg);
+
+ // Check to see if this basic block is already a kill block.
+ if (!VRInfo.Kills.empty() && VRInfo.Kills.back()->getParent() == MBB) {
+ // Yes, this register is killed in this basic block already. Increase the
+ // live range by updating the kill instruction.
+ VRInfo.Kills.back() = &MI;
+ return;
+ }
+
+#ifndef NDEBUG
+ for (MachineInstr *Kill : VRInfo.Kills)
+ assert(Kill->getParent() != MBB && "entry should be at end!");
+#endif
+
+ // This situation can occur:
+ //
+ // ,------.
+ // | |
+ // | v
+ // | t2 = phi ... t1 ...
+ // | |
+ // | v
+ // | t1 = ...
+ // | ... = ... t1 ...
+ // | |
+ // `------'
+ //
+ // where there is a use in a PHI node that's a predecessor to the defining
+ // block. We don't want to mark all predecessors as having the value "alive"
+ // in this case.
+ if (MBB == MRI->getVRegDef(Reg)->getParent())
+ return;
+
+ // Add a new kill entry for this basic block. If this virtual register is
+ // already marked as alive in this basic block, that means it is alive in at
+ // least one of the successor blocks, it's not a kill.
+ if (!VRInfo.AliveBlocks.test(BBNum))
+ VRInfo.Kills.push_back(&MI);
+
+ // Update all dominating blocks to mark them as "known live".
+ for (MachineBasicBlock *Pred : MBB->predecessors())
+ MarkVirtRegAliveInBlock(VRInfo, MRI->getVRegDef(Reg)->getParent(), Pred);
+}
+
+void LiveVariables::HandleVirtRegDef(Register Reg, MachineInstr &MI) {
+ VarInfo &VRInfo = getVarInfo(Reg);
+
+ if (VRInfo.AliveBlocks.empty())
+    // If vr is not alive in any block, then it defaults to dead.
+ VRInfo.Kills.push_back(&MI);
+}
+
+/// FindLastPartialDef - Return the last partial def of the specified register.
+/// Also returns the sub-registers that are defined by the instruction.
+MachineInstr *
+LiveVariables::FindLastPartialDef(Register Reg,
+ SmallSet<unsigned, 4> &PartDefRegs) {
+ unsigned LastDefReg = 0;
+ unsigned LastDefDist = 0;
+ MachineInstr *LastDef = nullptr;
+ for (MCPhysReg SubReg : TRI->subregs(Reg)) {
+ MachineInstr *Def = PhysRegDef[SubReg];
+ if (!Def)
+ continue;
+ unsigned Dist = DistanceMap[Def];
+ if (Dist > LastDefDist) {
+ LastDefReg = SubReg;
+ LastDef = Def;
+ LastDefDist = Dist;
+ }
+ }
+
+ if (!LastDef)
+ return nullptr;
+
+ PartDefRegs.insert(LastDefReg);
+ for (MachineOperand &MO : LastDef->all_defs()) {
+ if (MO.getReg() == 0)
+ continue;
+ Register DefReg = MO.getReg();
+ if (TRI->isSubRegister(Reg, DefReg)) {
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(DefReg))
+ PartDefRegs.insert(SubReg);
+ }
+ }
+ return LastDef;
+}
+
+/// HandlePhysRegUse - Turn previous partial def's into read/mod/writes. Add
+/// implicit defs to a machine instruction if there was an earlier def of its
+/// super-register.
+void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) {
+ MachineInstr *LastDef = PhysRegDef[Reg];
+ // If there was a previous use or a "full" def all is well.
+ if (!LastDef && !PhysRegUse[Reg]) {
+ // Otherwise, the last sub-register def implicitly defines this register.
+ // e.g.
+ // AH =
+ // AL = ... implicit-def EAX, implicit killed AH
+ // = AH
+ // ...
+ // = EAX
+ // All of the sub-registers must have been defined before the use of Reg!
+ SmallSet<unsigned, 4> PartDefRegs;
+ MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefRegs);
+ // If LastPartialDef is NULL, it must be using a livein register.
+ if (LastPartialDef) {
+ LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
+ true/*IsImp*/));
+ PhysRegDef[Reg] = LastPartialDef;
+ SmallSet<unsigned, 8> Processed;
+ for (MCPhysReg SubReg : TRI->subregs(Reg)) {
+ if (Processed.count(SubReg))
+ continue;
+ if (PartDefRegs.count(SubReg))
+ continue;
+ // This part of Reg was defined before the last partial def. It's killed
+ // here.
+ LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg,
+ false/*IsDef*/,
+ true/*IsImp*/));
+ PhysRegDef[SubReg] = LastPartialDef;
+ for (MCPhysReg SS : TRI->subregs(SubReg))
+ Processed.insert(SS);
+ }
+ }
+ } else if (LastDef && !PhysRegUse[Reg] &&
+ !LastDef->findRegisterDefOperand(Reg))
+ // Last def defines the super register, add an implicit def of reg.
+ LastDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
+ true/*IsImp*/));
+
+ // Remember this use.
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
+ PhysRegUse[SubReg] = &MI;
+}
+
+/// FindLastRefOrPartRef - Return the last reference or partial reference of
+/// the specified register.
+MachineInstr *LiveVariables::FindLastRefOrPartRef(Register Reg) {
+ MachineInstr *LastDef = PhysRegDef[Reg];
+ MachineInstr *LastUse = PhysRegUse[Reg];
+ if (!LastDef && !LastUse)
+ return nullptr;
+
+ MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
+ unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+ unsigned LastPartDefDist = 0;
+ for (MCPhysReg SubReg : TRI->subregs(Reg)) {
+ MachineInstr *Def = PhysRegDef[SubReg];
+ if (Def && Def != LastDef) {
+ // There was a def of this sub-register in between. This is a partial
+ // def, keep track of the last one.
+ unsigned Dist = DistanceMap[Def];
+ if (Dist > LastPartDefDist)
+ LastPartDefDist = Dist;
+ } else if (MachineInstr *Use = PhysRegUse[SubReg]) {
+ unsigned Dist = DistanceMap[Use];
+ if (Dist > LastRefOrPartRefDist) {
+ LastRefOrPartRefDist = Dist;
+ LastRefOrPartRef = Use;
+ }
+ }
+ }
+
+ return LastRefOrPartRef;
+}
+
+bool LiveVariables::HandlePhysRegKill(Register Reg, MachineInstr *MI) {
+ MachineInstr *LastDef = PhysRegDef[Reg];
+ MachineInstr *LastUse = PhysRegUse[Reg];
+ if (!LastDef && !LastUse)
+ return false;
+
+ MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
+ unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+ // The whole register is used.
+ // AL =
+ // AH =
+ //
+ // = AX
+ // = AL, implicit killed AX
+ // AX =
+ //
+ // Or whole register is defined, but not used at all.
+ // dead AX =
+ // ...
+ // AX =
+ //
+ // Or whole register is defined, but only partly used.
+ // dead AX = implicit-def AL
+ // = killed AL
+ // AX =
+ MachineInstr *LastPartDef = nullptr;
+ unsigned LastPartDefDist = 0;
+ SmallSet<unsigned, 8> PartUses;
+ for (MCPhysReg SubReg : TRI->subregs(Reg)) {
+ MachineInstr *Def = PhysRegDef[SubReg];
+ if (Def && Def != LastDef) {
+ // There was a def of this sub-register in between. This is a partial
+ // def, keep track of the last one.
+ unsigned Dist = DistanceMap[Def];
+ if (Dist > LastPartDefDist) {
+ LastPartDefDist = Dist;
+ LastPartDef = Def;
+ }
+ continue;
+ }
+ if (MachineInstr *Use = PhysRegUse[SubReg]) {
+ for (MCPhysReg SS : TRI->subregs_inclusive(SubReg))
+ PartUses.insert(SS);
+ unsigned Dist = DistanceMap[Use];
+ if (Dist > LastRefOrPartRefDist) {
+ LastRefOrPartRefDist = Dist;
+ LastRefOrPartRef = Use;
+ }
+ }
+ }
+
+ if (!PhysRegUse[Reg]) {
+ // Partial uses. Mark register def dead and add implicit def of
+ // sub-registers which are used.
+ // dead EAX = op implicit-def AL
+    // That is, EAX def is dead but AL def extends past it.
+ PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true);
+ for (MCPhysReg SubReg : TRI->subregs(Reg)) {
+ if (!PartUses.count(SubReg))
+ continue;
+ bool NeedDef = true;
+ if (PhysRegDef[Reg] == PhysRegDef[SubReg]) {
+ MachineOperand *MO = PhysRegDef[Reg]->findRegisterDefOperand(SubReg);
+ if (MO) {
+ NeedDef = false;
+ assert(!MO->isDead());
+ }
+ }
+ if (NeedDef)
+ PhysRegDef[Reg]->addOperand(MachineOperand::CreateReg(SubReg,
+ true/*IsDef*/, true/*IsImp*/));
+ MachineInstr *LastSubRef = FindLastRefOrPartRef(SubReg);
+ if (LastSubRef)
+ LastSubRef->addRegisterKilled(SubReg, TRI, true);
+ else {
+ LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true);
+ for (MCPhysReg SS : TRI->subregs_inclusive(SubReg))
+ PhysRegUse[SS] = LastRefOrPartRef;
+ }
+ for (MCPhysReg SS : TRI->subregs(SubReg))
+ PartUses.erase(SS);
+ }
+ } else if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI) {
+ if (LastPartDef)
+ // The last partial def kills the register.
+ LastPartDef->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
+ true/*IsImp*/, true/*IsKill*/));
+ else {
+ MachineOperand *MO =
+ LastRefOrPartRef->findRegisterDefOperand(Reg, false, false, TRI);
+ bool NeedEC = MO->isEarlyClobber() && MO->getReg() != Reg;
+ // If the last reference is the last def, then it's not used at all.
+ // That is, unless we are currently processing the last reference itself.
+ LastRefOrPartRef->addRegisterDead(Reg, TRI, true);
+ if (NeedEC) {
+ // If we are adding a subreg def and the superreg def is marked early
+ // clobber, add an early clobber marker to the subreg def.
+ MO = LastRefOrPartRef->findRegisterDefOperand(Reg);
+ if (MO)
+ MO->setIsEarlyClobber();
+ }
+ }
+ } else
+ LastRefOrPartRef->addRegisterKilled(Reg, TRI, true);
+ return true;
+}
+
+void LiveVariables::HandleRegMask(const MachineOperand &MO) {
+ // Call HandlePhysRegKill() for all live registers clobbered by Mask.
+  // Clobbered registers are always dead, so there is no need to use
+ // HandlePhysRegDef().
+ for (unsigned Reg = 1, NumRegs = TRI->getNumRegs(); Reg != NumRegs; ++Reg) {
+ // Skip dead regs.
+ if (!PhysRegDef[Reg] && !PhysRegUse[Reg])
+ continue;
+ // Skip mask-preserved regs.
+ if (!MO.clobbersPhysReg(Reg))
+ continue;
+ // Kill the largest clobbered super-register.
+ // This avoids needless implicit operands.
+ unsigned Super = Reg;
+ for (MCPhysReg SR : TRI->superregs(Reg))
+ if ((PhysRegDef[SR] || PhysRegUse[SR]) && MO.clobbersPhysReg(SR))
+ Super = SR;
+ HandlePhysRegKill(Super, nullptr);
+ }
+}
+
+void LiveVariables::HandlePhysRegDef(Register Reg, MachineInstr *MI,
+ SmallVectorImpl<unsigned> &Defs) {
+ // What parts of the register are previously defined?
+ SmallSet<unsigned, 32> Live;
+ if (PhysRegDef[Reg] || PhysRegUse[Reg]) {
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg))
+ Live.insert(SubReg);
+ } else {
+ for (MCPhysReg SubReg : TRI->subregs(Reg)) {
+      // If a register isn't itself defined, but all of the parts that make it
+      // up are defined, then consider it defined as well.
+ // e.g.
+ // AL =
+ // AH =
+ // = AX
+ if (Live.count(SubReg))
+ continue;
+ if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) {
+ for (MCPhysReg SS : TRI->subregs_inclusive(SubReg))
+ Live.insert(SS);
+ }
+ }
+ }
+
+ // Start from the largest piece, find the last time any part of the register
+ // is referenced.
+ HandlePhysRegKill(Reg, MI);
+ // Only some of the sub-registers are used.
+ for (MCPhysReg SubReg : TRI->subregs(Reg)) {
+ if (!Live.count(SubReg))
+ // Skip if this sub-register isn't defined.
+ continue;
+ HandlePhysRegKill(SubReg, MI);
+ }
+
+ if (MI)
+ Defs.push_back(Reg); // Remember this def.
+}
+
+void LiveVariables::UpdatePhysRegDefs(MachineInstr &MI,
+ SmallVectorImpl<unsigned> &Defs) {
+ while (!Defs.empty()) {
+ Register Reg = Defs.pop_back_val();
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) {
+ PhysRegDef[SubReg] = &MI;
+ PhysRegUse[SubReg] = nullptr;
+ }
+ }
+}
+
+void LiveVariables::runOnInstr(MachineInstr &MI,
+ SmallVectorImpl<unsigned> &Defs) {
+ assert(!MI.isDebugOrPseudoInstr());
+ // Process all of the operands of the instruction...
+ unsigned NumOperandsToProcess = MI.getNumOperands();
+
+ // Unless it is a PHI node. In this case, ONLY process the DEF, not any
+ // of the uses. They will be handled in other basic blocks.
+ if (MI.isPHI())
+ NumOperandsToProcess = 1;
+
+ // Clear kill and dead markers. LV will recompute them.
+ SmallVector<unsigned, 4> UseRegs;
+ SmallVector<unsigned, 4> DefRegs;
+ SmallVector<unsigned, 1> RegMasks;
+ for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isRegMask()) {
+ RegMasks.push_back(i);
+ continue;
+ }
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ Register MOReg = MO.getReg();
+ if (MO.isUse()) {
+ if (!(MOReg.isPhysical() && MRI->isReserved(MOReg)))
+ MO.setIsKill(false);
+ if (MO.readsReg())
+ UseRegs.push_back(MOReg);
+ } else {
+ assert(MO.isDef());
+ // FIXME: We should not remove any dead flags. However the MIPS RDDSP
+ // instruction needs it at the moment: http://llvm.org/PR27116.
+ if (MOReg.isPhysical() && !MRI->isReserved(MOReg))
+ MO.setIsDead(false);
+ DefRegs.push_back(MOReg);
+ }
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ // Process all uses.
+ for (unsigned MOReg : UseRegs) {
+ if (Register::isVirtualRegister(MOReg))
+ HandleVirtRegUse(MOReg, MBB, MI);
+ else if (!MRI->isReserved(MOReg))
+ HandlePhysRegUse(MOReg, MI);
+ }
+
+ // Process all masked registers. (Call clobbers).
+ for (unsigned Mask : RegMasks)
+ HandleRegMask(MI.getOperand(Mask));
+
+ // Process all defs.
+ for (unsigned MOReg : DefRegs) {
+ if (Register::isVirtualRegister(MOReg))
+ HandleVirtRegDef(MOReg, MI);
+ else if (!MRI->isReserved(MOReg))
+ HandlePhysRegDef(MOReg, &MI, Defs);
+ }
+ UpdatePhysRegDefs(MI, Defs);
+}
+
+void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) {
+ // Mark live-in registers as live-in.
+ SmallVector<unsigned, 4> Defs;
+ for (const auto &LI : MBB->liveins()) {
+ assert(Register::isPhysicalRegister(LI.PhysReg) &&
+ "Cannot have a live-in virtual register!");
+ HandlePhysRegDef(LI.PhysReg, nullptr, Defs);
+ }
+
+ // Loop over all of the instructions, processing them.
+ DistanceMap.clear();
+ unsigned Dist = 0;
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+ DistanceMap.insert(std::make_pair(&MI, Dist++));
+
+ runOnInstr(MI, Defs);
+ }
+
+ // Handle any virtual assignments from PHI nodes which might be at the
+ // bottom of this basic block. We check all of our successor blocks to see
+ // if they have PHI nodes, and if so, we simulate an assignment at the end
+ // of the current block.
+ if (!PHIVarInfo[MBB->getNumber()].empty()) {
+ SmallVectorImpl<unsigned> &VarInfoVec = PHIVarInfo[MBB->getNumber()];
+
+ for (unsigned I : VarInfoVec)
+ // Mark it alive only in the block we are representing.
+ MarkVirtRegAliveInBlock(getVarInfo(I), MRI->getVRegDef(I)->getParent(),
+ MBB);
+ }
+
+ // MachineCSE may CSE instructions which write to non-allocatable physical
+ // registers across MBBs. Remember if any reserved register is liveout.
+ SmallSet<unsigned, 4> LiveOuts;
+ for (const MachineBasicBlock *SuccMBB : MBB->successors()) {
+ if (SuccMBB->isEHPad())
+ continue;
+ for (const auto &LI : SuccMBB->liveins()) {
+ if (!TRI->isInAllocatableClass(LI.PhysReg))
+ // Ignore other live-ins, e.g. those that are live into landing pads.
+ LiveOuts.insert(LI.PhysReg);
+ }
+ }
+
+ // Loop over PhysRegDef / PhysRegUse, killing any registers that are
+ // available at the end of the basic block.
+ for (unsigned i = 0; i != NumRegs; ++i)
+ if ((PhysRegDef[i] || PhysRegUse[i]) && !LiveOuts.count(i))
+ HandlePhysRegDef(i, nullptr, Defs);
+}
+
+bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+
+ const unsigned NumRegs = TRI->getNumRegs();
+ PhysRegDef.assign(NumRegs, nullptr);
+ PhysRegUse.assign(NumRegs, nullptr);
+ PHIVarInfo.resize(MF->getNumBlockIDs());
+ PHIJoins.clear();
+
+ // FIXME: LiveIntervals will be updated to remove its dependence on
+ // LiveVariables to improve compilation time and eliminate bizarre pass
+ // dependencies. Until then, we can't change much in -O0.
+ if (!MRI->isSSA())
+ report_fatal_error("regalloc=... not currently supported with -O0");
+
+ analyzePHINodes(mf);
+
+ // Calculate live variable information in depth first order on the CFG of the
+ // function. This guarantees that we will see the definition of a virtual
+ // register before its uses due to dominance properties of SSA (except for PHI
+ // nodes, which are treated as a special case).
+ MachineBasicBlock *Entry = &MF->front();
+ df_iterator_default_set<MachineBasicBlock*,16> Visited;
+
+ for (MachineBasicBlock *MBB : depth_first_ext(Entry, Visited)) {
+ runOnBlock(MBB, NumRegs);
+
+ PhysRegDef.assign(NumRegs, nullptr);
+ PhysRegUse.assign(NumRegs, nullptr);
+ }
+
+ // Convert and transfer the dead / killed information we have gathered into
+ // VirtRegInfo onto MI's.
+ for (unsigned i = 0, e1 = VirtRegInfo.size(); i != e1; ++i) {
+ const Register Reg = Register::index2VirtReg(i);
+ for (unsigned j = 0, e2 = VirtRegInfo[Reg].Kills.size(); j != e2; ++j)
+ if (VirtRegInfo[Reg].Kills[j] == MRI->getVRegDef(Reg))
+ VirtRegInfo[Reg].Kills[j]->addRegisterDead(Reg, TRI);
+ else
+ VirtRegInfo[Reg].Kills[j]->addRegisterKilled(Reg, TRI);
+ }
+
+ // Check to make sure there are no unreachable blocks in the MC CFG for the
+ // function. If so, it is due to a bug in the instruction selector or some
+ // other part of the code generator if this happens.
+#ifndef NDEBUG
+ for (const MachineBasicBlock &MBB : *MF)
+ assert(Visited.contains(&MBB) && "unreachable basic block found");
+#endif
+
+ PhysRegDef.clear();
+ PhysRegUse.clear();
+ PHIVarInfo.clear();
+
+ return false;
+}
+
+void LiveVariables::recomputeForSingleDefVirtReg(Register Reg) {
+ assert(Reg.isVirtual());
+
+ VarInfo &VI = getVarInfo(Reg);
+ VI.AliveBlocks.clear();
+ VI.Kills.clear();
+
+ MachineInstr &DefMI = *MRI->getUniqueVRegDef(Reg);
+ MachineBasicBlock &DefBB = *DefMI.getParent();
+
+ // Handle the case where all uses have been removed.
+ if (MRI->use_nodbg_empty(Reg)) {
+ VI.Kills.push_back(&DefMI);
+ DefMI.addRegisterDead(Reg, nullptr);
+ return;
+ }
+ DefMI.clearRegisterDeads(Reg);
+
+ // Initialize a worklist of BBs that Reg is live-to-end of. (Here
+ // "live-to-end" means Reg is live at the end of a block even if it is only
+ // live because of phi uses in a successor. This is different from isLiveOut()
+ // which does not consider phi uses.)
+ SmallVector<MachineBasicBlock *> LiveToEndBlocks;
+ SparseBitVector<> UseBlocks;
+ for (auto &UseMO : MRI->use_nodbg_operands(Reg)) {
+ UseMO.setIsKill(false);
+ MachineInstr &UseMI = *UseMO.getParent();
+ MachineBasicBlock &UseBB = *UseMI.getParent();
+ UseBlocks.set(UseBB.getNumber());
+ if (UseMI.isPHI()) {
+ // If Reg is used in a phi then it is live-to-end of the corresponding
+ // predecessor.
+ unsigned Idx = UseMO.getOperandNo();
+ LiveToEndBlocks.push_back(UseMI.getOperand(Idx + 1).getMBB());
+ } else if (&UseBB == &DefBB) {
+ // A non-phi use in the same BB as the single def must come after the def.
+ } else {
+ // Otherwise Reg must be live-to-end of all predecessors.
+ LiveToEndBlocks.append(UseBB.pred_begin(), UseBB.pred_end());
+ }
+ }
+
+ // Iterate over the worklist adding blocks to AliveBlocks.
+ bool LiveToEndOfDefBB = false;
+ while (!LiveToEndBlocks.empty()) {
+ MachineBasicBlock &BB = *LiveToEndBlocks.pop_back_val();
+ if (&BB == &DefBB) {
+ LiveToEndOfDefBB = true;
+ continue;
+ }
+ if (VI.AliveBlocks.test(BB.getNumber()))
+ continue;
+ VI.AliveBlocks.set(BB.getNumber());
+ LiveToEndBlocks.append(BB.pred_begin(), BB.pred_end());
+ }
+
+ // Recompute kill flags. For each block in which Reg is used but is not
+ // live-through, find the last instruction that uses Reg. Ignore phi nodes
+ // because they should not be included in Kills.
+ for (unsigned UseBBNum : UseBlocks) {
+ if (VI.AliveBlocks.test(UseBBNum))
+ continue;
+ MachineBasicBlock &UseBB = *MF->getBlockNumbered(UseBBNum);
+ if (&UseBB == &DefBB && LiveToEndOfDefBB)
+ continue;
+ for (auto &MI : reverse(UseBB)) {
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+ if (MI.isPHI())
+ break;
+ if (MI.readsRegister(Reg)) {
+ assert(!MI.killsRegister(Reg));
+ MI.addRegisterKilled(Reg, nullptr);
+ VI.Kills.push_back(&MI);
+ break;
+ }
+ }
+ }
+}
+
+/// replaceKillInstruction - Update register kill info by replacing a kill
+/// instruction with a new one.
+void LiveVariables::replaceKillInstruction(Register Reg, MachineInstr &OldMI,
+ MachineInstr &NewMI) {
+ VarInfo &VI = getVarInfo(Reg);
+ std::replace(VI.Kills.begin(), VI.Kills.end(), &OldMI, &NewMI);
+}
+
+/// removeVirtualRegistersKilled - Remove all killed info for the specified
+/// instruction.
+void LiveVariables::removeVirtualRegistersKilled(MachineInstr &MI) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isKill()) {
+ MO.setIsKill(false);
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual()) {
+ bool removed = getVarInfo(Reg).removeKill(MI);
+ assert(removed && "kill not in register's VarInfo?");
+ (void)removed;
+ }
+ }
+ }
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes in here. In
+/// particular, we want to map the variable information of a virtual register
+/// which is used in a PHI node. We map that to the BB the vreg is coming from.
+///
+void LiveVariables::analyzePHINodes(const MachineFunction& Fn) {
+ for (const auto &MBB : Fn)
+ for (const auto &BBI : MBB) {
+ if (!BBI.isPHI())
+ break;
+ for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2)
+ if (BBI.getOperand(i).readsReg())
+ PHIVarInfo[BBI.getOperand(i + 1).getMBB()->getNumber()]
+ .push_back(BBI.getOperand(i).getReg());
+ }
+}
+
+bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB,
+ Register Reg, MachineRegisterInfo &MRI) {
+ unsigned Num = MBB.getNumber();
+
+ // Reg is live-through.
+ if (AliveBlocks.test(Num))
+ return true;
+
+ // Registers defined in MBB cannot be live in.
+ const MachineInstr *Def = MRI.getVRegDef(Reg);
+ if (Def && Def->getParent() == &MBB)
+ return false;
+
+ // Reg was not defined in MBB, was it killed here?
+ return findKill(&MBB);
+}
+
+bool LiveVariables::isLiveOut(Register Reg, const MachineBasicBlock &MBB) {
+ LiveVariables::VarInfo &VI = getVarInfo(Reg);
+
+ SmallPtrSet<const MachineBasicBlock *, 8> Kills;
+ for (MachineInstr *MI : VI.Kills)
+ Kills.insert(MI->getParent());
+
+ // Loop over all of the successors of the basic block, checking to see if
+ // the value is either live in the block, or if it is killed in the block.
+ for (const MachineBasicBlock *SuccMBB : MBB.successors()) {
+ // Is it alive in this successor?
+ unsigned SuccIdx = SuccMBB->getNumber();
+ if (VI.AliveBlocks.test(SuccIdx))
+ return true;
+ // Or is it live because there is a use in a successor that kills it?
+ if (Kills.count(SuccMBB))
+ return true;
+ }
+
+ return false;
+}
+
+/// addNewBlock - Add a new basic block BB as an empty successor to DomBB. All
+/// variables that are live out of DomBB will be marked as passing live through
+/// BB.
+void LiveVariables::addNewBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *DomBB,
+ MachineBasicBlock *SuccBB) {
+ const unsigned NumNew = BB->getNumber();
+
+ DenseSet<unsigned> Defs, Kills;
+
+ MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
+ for (; BBI != BBE && BBI->isPHI(); ++BBI) {
+ // Record the def of the PHI node.
+ Defs.insert(BBI->getOperand(0).getReg());
+
+ // All registers used by PHI nodes in SuccBB must be live through BB.
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+ if (BBI->getOperand(i+1).getMBB() == BB)
+ getVarInfo(BBI->getOperand(i).getReg()).AliveBlocks.set(NumNew);
+ }
+
+ // Record all vreg defs and kills of all instructions in SuccBB.
+ for (; BBI != BBE; ++BBI) {
+ for (const MachineOperand &Op : BBI->operands()) {
+ if (Op.isReg() && Op.getReg().isVirtual()) {
+ if (Op.isDef())
+ Defs.insert(Op.getReg());
+ else if (Op.isKill())
+ Kills.insert(Op.getReg());
+ }
+ }
+ }
+
+ // Update info for all live variables
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+
+    // If the register is defined in the successor, it can't be live into BB.
+ if (Defs.count(Reg))
+ continue;
+
+ // If the register is either killed in or live through SuccBB it's also live
+ // through BB.
+ VarInfo &VI = getVarInfo(Reg);
+ if (Kills.count(Reg) || VI.AliveBlocks.test(SuccBB->getNumber()))
+ VI.AliveBlocks.set(NumNew);
+ }
+}
+
+/// addNewBlock - Add a new basic block BB as an empty successor to DomBB. All
+/// variables that are live out of DomBB will be marked as passing live through
+/// BB. LiveInSets[BB] is *not* updated (because it is not needed during
+/// PHIElimination).
+void LiveVariables::addNewBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *DomBB,
+ MachineBasicBlock *SuccBB,
+ std::vector<SparseBitVector<>> &LiveInSets) {
+ const unsigned NumNew = BB->getNumber();
+
+ SparseBitVector<> &BV = LiveInSets[SuccBB->getNumber()];
+ for (unsigned R : BV) {
+ Register VirtReg = Register::index2VirtReg(R);
+ LiveVariables::VarInfo &VI = getVarInfo(VirtReg);
+ VI.AliveBlocks.set(NumNew);
+ }
+ // All registers used by PHI nodes in SuccBB must be live through BB.
+ for (MachineBasicBlock::iterator BBI = SuccBB->begin(),
+ BBE = SuccBB->end();
+ BBI != BBE && BBI->isPHI(); ++BBI) {
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+ if (BBI->getOperand(i + 1).getMBB() == BB &&
+ BBI->getOperand(i).readsReg())
+ getVarInfo(BBI->getOperand(i).getReg())
+ .AliveBlocks.set(NumNew);
+ }
+}
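The queries a client typically makes against this analysis are getVarInfo(), isLiveOut() and VarInfo::findKill(), all defined above. A small sketch of that usage follows; it is not part of the patch, and printVRegLiveness is an illustrative name.

#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Hypothetical helper: report how a virtual register relates to one block.
static void printVRegLiveness(LiveVariables &LV, Register VReg,
                              const MachineBasicBlock &MBB) {
  LiveVariables::VarInfo &VI = LV.getVarInfo(VReg); // virtual registers only
  if (LV.isLiveOut(VReg, MBB))
    errs() << printReg(VReg) << " is live-out of bb." << MBB.getNumber() << '\n';
  else if (MachineInstr *Kill = VI.findKill(&MBB))
    errs() << printReg(VReg) << " is killed in the block by: " << *Kill;
  else
    errs() << printReg(VReg) << " has no kill in and is not live-out of bb."
           << MBB.getNumber() << '\n';
}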
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
new file mode 100644
index 000000000000..e491ed12034d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -0,0 +1,442 @@
+//===- LocalStackSlotAllocation.cpp - Pre-allocate locals to stack slots --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass assigns local frame indices to stack slots relative to one another
+// and allocates additional base registers to access them when the target
+// estimates they are likely to be out of range of stack pointer and frame
+// pointer relative addressing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "localstackalloc"
+
+STATISTIC(NumAllocations, "Number of frame indices allocated into local block");
+STATISTIC(NumBaseRegisters, "Number of virtual frame base registers allocated");
+STATISTIC(NumReplacements, "Number of frame indices references replaced");
+
+namespace {
+
+ class FrameRef {
+ MachineBasicBlock::iterator MI; // Instr referencing the frame
+ int64_t LocalOffset; // Local offset of the frame idx referenced
+ int FrameIdx; // The frame index
+
+    // Order in which the reference instruction appears in the program. Used
+    // to ensure a deterministic order when multiple instructions reference
+    // the same location.
+ unsigned Order;
+
+ public:
+ FrameRef(MachineInstr *I, int64_t Offset, int Idx, unsigned Ord) :
+ MI(I), LocalOffset(Offset), FrameIdx(Idx), Order(Ord) {}
+
+ bool operator<(const FrameRef &RHS) const {
+ return std::tie(LocalOffset, FrameIdx, Order) <
+ std::tie(RHS.LocalOffset, RHS.FrameIdx, RHS.Order);
+ }
+
+ MachineBasicBlock::iterator getMachineInstr() const { return MI; }
+ int64_t getLocalOffset() const { return LocalOffset; }
+ int getFrameIndex() const { return FrameIdx; }
+ };
+
+ class LocalStackSlotPass: public MachineFunctionPass {
+ SmallVector<int64_t, 16> LocalOffsets;
+
+ /// StackObjSet - A set of stack object indexes
+ using StackObjSet = SmallSetVector<int, 8>;
+
+ void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx, int64_t &Offset,
+ bool StackGrowsDown, Align &MaxAlign);
+ void AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
+ SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo &MFI, bool StackGrowsDown,
+ int64_t &Offset, Align &MaxAlign);
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+ bool insertFrameReferenceRegisters(MachineFunction &Fn);
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ explicit LocalStackSlotPass() : MachineFunctionPass(ID) {
+ initializeLocalStackSlotPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+
+} // end anonymous namespace
+
+char LocalStackSlotPass::ID = 0;
+
+char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID;
+INITIALIZE_PASS(LocalStackSlotPass, DEBUG_TYPE,
+ "Local Stack Slot Allocation", false, false)
+
+bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ unsigned LocalObjectCount = MFI.getObjectIndexEnd();
+
+ // If the target doesn't want/need this pass, or if there are no locals
+ // to consider, early exit.
+ if (LocalObjectCount == 0 || !TRI->requiresVirtualBaseRegisters(MF))
+ return false;
+
+ // Make sure we have enough space to store the local offsets.
+ LocalOffsets.resize(MFI.getObjectIndexEnd());
+
+ // Lay out the local blob.
+ calculateFrameObjectOffsets(MF);
+
+ // Insert virtual base registers to resolve frame index references.
+ bool UsedBaseRegs = insertFrameReferenceRegisters(MF);
+
+ // Tell MFI whether any base registers were allocated. PEI will only
+ // want to use the local block allocations from this pass if there were any.
+ // Otherwise, PEI can do a bit better job of getting the alignment right
+ // without a hole at the start since it knows the alignment of the stack
+ // at the start of local allocation, and this pass doesn't.
+ MFI.setUseLocalStackAllocationBlock(UsedBaseRegs);
+
+ return true;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+ int64_t &Offset, bool StackGrowsDown,
+ Align &MaxAlign) {
+ // If the stack grows down, add the object size to find the lowest address.
+ if (StackGrowsDown)
+ Offset += MFI.getObjectSize(FrameIdx);
+
+ Align Alignment = MFI.getObjectAlign(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Alignment);
+
+ // Adjust to alignment boundary.
+ Offset = alignTo(Offset, Alignment);
+
+ int64_t LocalOffset = StackGrowsDown ? -Offset : Offset;
+ LLVM_DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset "
+ << LocalOffset << "\n");
+ // Keep the offset available for base register allocation
+ LocalOffsets[FrameIdx] = LocalOffset;
+ // And tell MFI about it for PEI to use later
+ MFI.mapLocalFrameObject(FrameIdx, LocalOffset);
+
+ if (!StackGrowsDown)
+ Offset += MFI.getObjectSize(FrameIdx);
+
+ ++NumAllocations;
+}
+
+/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
+/// those required to be close to the Stack Protector) to stack offsets.
+void LocalStackSlotPass::AssignProtectedObjSet(
+ const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo &MFI, bool StackGrowsDown, int64_t &Offset,
+ Align &MaxAlign) {
+ for (int i : UnassignedObjs) {
+ AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
+ ProtectedObjs.insert(i);
+ }
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo &MFI = Fn.getFrameInfo();
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+ int64_t Offset = 0;
+ Align MaxAlign;
+
+ // Make sure that the stack protector comes before the local variables on the
+ // stack.
+ SmallSet<int, 16> ProtectedObjs;
+ if (MFI.hasStackProtectorIndex()) {
+ int StackProtectorFI = MFI.getStackProtectorIndex();
+
+ // We need to make sure we didn't pre-allocate the stack protector when
+ // doing this.
+ // If we already have a stack protector, this will re-assign it to a slot
+ // that is **not** covering the protected objects.
+ assert(!MFI.isObjectPreAllocated(StackProtectorFI) &&
+ "Stack protector pre-allocated in LocalStackSlotAllocation");
+
+ StackObjSet LargeArrayObjs;
+ StackObjSet SmallArrayObjs;
+ StackObjSet AddrOfObjs;
+
+ // Only place the stack protector in the local stack area if the target
+ // allows it.
+ if (TFI.isStackIdSafeForLocalArea(MFI.getStackID(StackProtectorFI)))
+ AdjustStackOffset(MFI, StackProtectorFI, Offset, StackGrowsDown,
+ MaxAlign);
+
+ // Assign large stack objects first.
+ for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+ if (MFI.isDeadObjectIndex(i))
+ continue;
+ if (StackProtectorFI == (int)i)
+ continue;
+ if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i)))
+ continue;
+
+ switch (MFI.getObjectSSPLayout(i)) {
+ case MachineFrameInfo::SSPLK_None:
+ continue;
+ case MachineFrameInfo::SSPLK_SmallArray:
+ SmallArrayObjs.insert(i);
+ continue;
+ case MachineFrameInfo::SSPLK_AddrOf:
+ AddrOfObjs.insert(i);
+ continue;
+ case MachineFrameInfo::SSPLK_LargeArray:
+ LargeArrayObjs.insert(i);
+ continue;
+ }
+ llvm_unreachable("Unexpected SSPLayoutKind.");
+ }
+
+ AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ }
+
+ // Then assign frame offsets to stack objects that are not used to spill
+ // callee saved registers.
+ for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+ if (MFI.isDeadObjectIndex(i))
+ continue;
+ if (MFI.getStackProtectorIndex() == (int)i)
+ continue;
+ if (ProtectedObjs.count(i))
+ continue;
+ if (!TFI.isStackIdSafeForLocalArea(MFI.getStackID(i)))
+ continue;
+
+ AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
+ }
+
+ // Remember how big this blob of stack space is
+ MFI.setLocalFrameSize(Offset);
+ MFI.setLocalFrameMaxAlign(MaxAlign);
+}
+
+static inline bool
+lookupCandidateBaseReg(unsigned BaseReg,
+ int64_t BaseOffset,
+ int64_t FrameSizeAdjust,
+ int64_t LocalFrameOffset,
+ const MachineInstr &MI,
+ const TargetRegisterInfo *TRI) {
+  // Check if the relative offset from where the base register points to the
+  // target address is in range for the instruction.
+ int64_t Offset = FrameSizeAdjust + LocalFrameOffset - BaseOffset;
+ return TRI->isFrameOffsetLegal(&MI, BaseReg, Offset);
+}
+
+bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
+ // Scan the function's instructions looking for frame index references.
+ // For each, ask the target if it wants a virtual base register for it
+ // based on what we can tell it about where the local will end up in the
+ // stack frame. If it wants one, re-use a suitable one we've previously
+ // allocated, or if there isn't one that fits the bill, allocate a new one
+ // and ask the target to create a defining instruction for it.
+
+ MachineFrameInfo &MFI = Fn.getFrameInfo();
+ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Collect all of the instructions in the block that reference
+ // a frame index. Also store the frame index referenced to ease later
+ // lookup. (For any insn that has more than one FI reference, we arbitrarily
+ // choose the first one).
+ SmallVector<FrameRef, 64> FrameReferenceInsns;
+
+ unsigned Order = 0;
+
+ for (MachineBasicBlock &BB : Fn) {
+ for (MachineInstr &MI : BB) {
+ // Debug value, stackmap and patchpoint instructions can't be out of
+ // range, so they don't need any updates.
+ if (MI.isDebugInstr() || MI.getOpcode() == TargetOpcode::STATEPOINT ||
+ MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT)
+ continue;
+
+ // For now, allocate the base register(s) within the basic block
+ // where they're used, and don't try to keep them around outside
+ // of that. It may be beneficial to try sharing them more broadly
+ // than that, but the increased register pressure makes that a
+ // tricky thing to balance. Investigate if re-materializing these
+ // becomes an issue.
+ for (const MachineOperand &MO : MI.operands()) {
+ // Consider replacing all frame index operands that reference
+ // an object allocated in the local block.
+ if (MO.isFI()) {
+ // Don't try this with values not in the local block.
+ if (!MFI.isObjectPreAllocated(MO.getIndex()))
+ break;
+ int Idx = MO.getIndex();
+ int64_t LocalOffset = LocalOffsets[Idx];
+ if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
+ break;
+ FrameReferenceInsns.push_back(FrameRef(&MI, LocalOffset, Idx, Order++));
+ break;
+ }
+ }
+ }
+ }
+
+ // Sort the frame references by local offset.
+ // Use frame index as a tie-breaker in case MI's have the same offset.
+ llvm::sort(FrameReferenceInsns);
+
+ MachineBasicBlock *Entry = &Fn.front();
+
+ Register BaseReg;
+ int64_t BaseOffset = 0;
+
+ // Loop through the frame references and allocate for them as necessary.
+ for (int ref = 0, e = FrameReferenceInsns.size(); ref < e ; ++ref) {
+ FrameRef &FR = FrameReferenceInsns[ref];
+ MachineInstr &MI = *FR.getMachineInstr();
+ int64_t LocalOffset = FR.getLocalOffset();
+ int FrameIdx = FR.getFrameIndex();
+ assert(MFI.isObjectPreAllocated(FrameIdx) &&
+ "Only pre-allocated locals expected!");
+
+ // We need to keep the references to the stack protector slot through frame
+ // index operands so that it gets resolved by PEI rather than this pass.
+ // This avoids accesses to the stack protector through virtual base
+ // registers, and forces PEI to address it using fp/sp/bp.
+ if (MFI.hasStackProtectorIndex() &&
+ FrameIdx == MFI.getStackProtectorIndex())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Considering: " << MI);
+
+ unsigned idx = 0;
+ for (unsigned f = MI.getNumOperands(); idx != f; ++idx) {
+ if (!MI.getOperand(idx).isFI())
+ continue;
+
+ if (FrameIdx == MI.getOperand(idx).getIndex())
+ break;
+ }
+
+ assert(idx < MI.getNumOperands() && "Cannot find FI operand");
+
+ int64_t Offset = 0;
+ int64_t FrameSizeAdjust = StackGrowsDown ? MFI.getLocalFrameSize() : 0;
+
+ LLVM_DEBUG(dbgs() << " Replacing FI in: " << MI);
+
+ // If we have a suitable base register available, use it; otherwise
+ // create a new one. Note that any offset encoded in the
+ // instruction itself will be taken into account by the target,
+ // so we don't have to adjust for it here when reusing a base
+ // register.
+ if (BaseReg.isValid() &&
+ lookupCandidateBaseReg(BaseReg, BaseOffset, FrameSizeAdjust,
+ LocalOffset, MI, TRI)) {
+ LLVM_DEBUG(dbgs() << " Reusing base register " << BaseReg << "\n");
+ // We found a register to reuse.
+ Offset = FrameSizeAdjust + LocalOffset - BaseOffset;
+ } else {
+ // No previously defined register was in range, so create a new one.
+ int64_t InstrOffset = TRI->getFrameIndexInstrOffset(&MI, idx);
+
+ int64_t CandBaseOffset = FrameSizeAdjust + LocalOffset + InstrOffset;
+
+ // We'd like to avoid creating single-use virtual base registers.
+ // Because the FrameRefs are in sorted order, and we've already
+ // processed all FrameRefs before this one, just check whether or not
+ // the next FrameRef will be able to reuse this new register. If not,
+ // then don't bother creating it.
+ if (ref + 1 >= e ||
+ !lookupCandidateBaseReg(
+ BaseReg, CandBaseOffset, FrameSizeAdjust,
+ FrameReferenceInsns[ref + 1].getLocalOffset(),
+ *FrameReferenceInsns[ref + 1].getMachineInstr(), TRI))
+ continue;
+
+ // Save the base offset.
+ BaseOffset = CandBaseOffset;
+
+ // Tell the target to insert the instruction to initialize
+ // the base register.
+ // MachineBasicBlock::iterator InsertionPt = Entry->begin();
+ BaseReg = TRI->materializeFrameBaseRegister(Entry, FrameIdx, InstrOffset);
+
+ LLVM_DEBUG(dbgs() << " Materialized base register at frame local offset "
+ << LocalOffset + InstrOffset
+ << " into " << printReg(BaseReg, TRI) << '\n');
+
+ // The base register already includes any offset specified
+ // by the instruction, so account for that so it doesn't get
+ // applied twice.
+ Offset = -InstrOffset;
+
+ ++NumBaseRegisters;
+ }
+ assert(BaseReg && "Unable to allocate virtual base register!");
+
+ // Modify the instruction to use the new base register rather
+ // than the frame index operand.
+ TRI->resolveFrameIndex(MI, BaseReg, Offset);
+ LLVM_DEBUG(dbgs() << "Resolved: " << MI);
+
+ ++NumReplacements;
+ }
+
+ return BaseReg.isValid();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LoopTraversal.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LoopTraversal.cpp
new file mode 100644
index 000000000000..0d400253c652
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LoopTraversal.cpp
@@ -0,0 +1,75 @@
+//===- LoopTraversal.cpp - Optimal basic block traversal order --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LoopTraversal.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+using namespace llvm;
+
+bool LoopTraversal::isBlockDone(MachineBasicBlock *MBB) {
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBInfos.size() && "Unexpected basic block number.");
+ return MBBInfos[MBBNumber].PrimaryCompleted &&
+ MBBInfos[MBBNumber].IncomingCompleted ==
+ MBBInfos[MBBNumber].PrimaryIncoming &&
+ MBBInfos[MBBNumber].IncomingProcessed == MBB->pred_size();
+}
+
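+// Visit the blocks in reverse post-order. Each block gets one "primary"
+// visit; successors for which isBlockDone() becomes true afterwards are
+// re-queued for an additional non-primary visit, so blocks inside loops can
+// be revisited once information from all of their predecessors is available.
+// (This is a summary of the code below; the traversal scheme is documented in
+// detail in llvm/CodeGen/LoopTraversal.h.)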
+LoopTraversal::TraversalOrder LoopTraversal::traverse(MachineFunction &MF) {
+ // Initialize the MBBInfos.
+ MBBInfos.assign(MF.getNumBlockIDs(), MBBInfo());
+
+ MachineBasicBlock *Entry = &*MF.begin();
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry);
+ SmallVector<MachineBasicBlock *, 4> Workqueue;
+ SmallVector<TraversedMBBInfo, 4> MBBTraversalOrder;
+ for (MachineBasicBlock *MBB : RPOT) {
+ // N.B: IncomingProcessed and IncomingCompleted were already updated while
+ // processing this block's predecessors.
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBInfos.size() && "Unexpected basic block number.");
+ MBBInfos[MBBNumber].PrimaryCompleted = true;
+ MBBInfos[MBBNumber].PrimaryIncoming = MBBInfos[MBBNumber].IncomingProcessed;
+ bool Primary = true;
+ Workqueue.push_back(MBB);
+ while (!Workqueue.empty()) {
+ MachineBasicBlock *ActiveMBB = Workqueue.pop_back_val();
+ bool Done = isBlockDone(ActiveMBB);
+ MBBTraversalOrder.push_back(TraversedMBBInfo(ActiveMBB, Primary, Done));
+ for (MachineBasicBlock *Succ : ActiveMBB->successors()) {
+ unsigned SuccNumber = Succ->getNumber();
+ assert(SuccNumber < MBBInfos.size() &&
+ "Unexpected basic block number.");
+ if (!isBlockDone(Succ)) {
+ if (Primary)
+ MBBInfos[SuccNumber].IncomingProcessed++;
+ if (Done)
+ MBBInfos[SuccNumber].IncomingCompleted++;
+ if (isBlockDone(Succ))
+ Workqueue.push_back(Succ);
+ }
+ }
+ Primary = false;
+ }
+ }
+
+ // We need to go through again and finalize any blocks that are not done yet.
+ // This is possible if blocks have dead predecessors, so we didn't visit them
+ // above.
+ for (MachineBasicBlock *MBB : RPOT) {
+ if (!isBlockDone(MBB))
+ MBBTraversalOrder.push_back(TraversedMBBInfo(MBB, false, true));
+ // Don't update successors here. We'll get to them anyway through this
+ // loop.
+ }
+
+ MBBInfos.clear();
+
+ return MBBTraversalOrder;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LowLevelType.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LowLevelType.cpp
new file mode 100644
index 000000000000..24c30b756737
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LowLevelType.cpp
@@ -0,0 +1,66 @@
+//===-- llvm/CodeGen/LowLevelType.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the more header-heavy bits of the LLT class to
+/// avoid polluting users' namespaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+LLT::LLT(MVT VT) {
+ if (VT.isVector()) {
+ bool asVector = VT.getVectorMinNumElements() > 1;
+ init(/*IsPointer=*/false, asVector, /*IsScalar=*/!asVector,
+ VT.getVectorElementCount(), VT.getVectorElementType().getSizeInBits(),
+ /*AddressSpace=*/0);
+ } else if (VT.isValid() && !VT.isScalableTargetExtVT()) {
+ // Aggregates are no different from real scalars as far as GlobalISel is
+ // concerned.
+ init(/*IsPointer=*/false, /*IsVector=*/false, /*IsScalar=*/true,
+ ElementCount::getFixed(0), VT.getSizeInBits(), /*AddressSpace=*/0);
+ } else {
+ IsScalar = false;
+ IsPointer = false;
+ IsVector = false;
+ RawData = 0;
+ }
+}
+
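+// Example textual forms produced by print(): "s32" for a 32-bit scalar,
+// "p0" for a pointer in address space 0, and "<4 x s32>" for a fixed vector
+// of four 32-bit scalars.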
+void LLT::print(raw_ostream &OS) const {
+ if (isVector()) {
+ OS << "<";
+ OS << getElementCount() << " x " << getElementType() << ">";
+ } else if (isPointer())
+ OS << "p" << getAddressSpace();
+ else if (isValid()) {
+ assert(isScalar() && "unexpected type");
+ OS << "s" << getScalarSizeInBits();
+ } else
+ OS << "LLT_invalid";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LLT::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+#endif
+
+const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::VectorScalableFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::VectorSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerVectorElementsFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerVectorScalableFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerVectorSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerVectorAddressSpaceFieldInfo;
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LowLevelTypeUtils.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
new file mode 100644
index 000000000000..bc2ea3f05b6d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LowLevelTypeUtils.cpp
@@ -0,0 +1,85 @@
+//===-- llvm/CodeGen/LowLevelTypeUtils.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the more header-heavy LLT/MVT/EVT conversion
+/// utilities to avoid polluting users' namespaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LowLevelTypeUtils.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
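+// Illustrative mappings (assuming a DataLayout with 64-bit pointers in
+// address space 0): i32 -> s32, ptr -> p0, <4 x float> -> <4 x s32>, and
+// unsized or scalable target extension types map to the invalid LLT().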
+LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
+ if (auto VTy = dyn_cast<VectorType>(&Ty)) {
+ auto EC = VTy->getElementCount();
+ LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL);
+ if (EC.isScalar())
+ return ScalarTy;
+ return LLT::vector(EC, ScalarTy);
+ }
+
+ if (auto PTy = dyn_cast<PointerType>(&Ty)) {
+ unsigned AddrSpace = PTy->getAddressSpace();
+ return LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
+ }
+
+ if (Ty.isSized() && !Ty.isScalableTargetExtTy()) {
+ // Aggregates are no different from real scalars as far as GlobalISel is
+ // concerned.
+ auto SizeInBits = DL.getTypeSizeInBits(&Ty);
+ assert(SizeInBits != 0 && "invalid zero-sized type");
+ return LLT::scalar(SizeInBits);
+ }
+
+ return LLT();
+}
+
+MVT llvm::getMVTForLLT(LLT Ty) {
+ if (!Ty.isVector())
+ return MVT::getIntegerVT(Ty.getSizeInBits());
+
+ return MVT::getVectorVT(
+ MVT::getIntegerVT(Ty.getElementType().getSizeInBits()),
+ Ty.getNumElements());
+}
+
+EVT llvm::getApproximateEVTForLLT(LLT Ty, const DataLayout &DL,
+ LLVMContext &Ctx) {
+ if (Ty.isVector()) {
+ EVT EltVT = getApproximateEVTForLLT(Ty.getElementType(), DL, Ctx);
+ return EVT::getVectorVT(Ctx, EltVT, Ty.getElementCount());
+ }
+
+ return EVT::getIntegerVT(Ctx, Ty.getSizeInBits());
+}
+
+LLT llvm::getLLTForMVT(MVT Ty) {
+ if (!Ty.isVector())
+ return LLT::scalar(Ty.getSizeInBits());
+
+ return LLT::scalarOrVector(Ty.getVectorElementCount(),
+ Ty.getVectorElementType().getSizeInBits());
+}
+
+const llvm::fltSemantics &llvm::getFltSemanticForLLT(LLT Ty) {
+ assert(Ty.isScalar() && "Expected a scalar type.");
+ switch (Ty.getSizeInBits()) {
+ case 16:
+ return APFloat::IEEEhalf();
+ case 32:
+ return APFloat::IEEEsingle();
+ case 64:
+ return APFloat::IEEEdouble();
+ case 128:
+ return APFloat::IEEEquad();
+ }
+ llvm_unreachable("Invalid FP type size.");
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/LowerEmuTLS.cpp b/contrib/llvm-project/llvm/lib/CodeGen/LowerEmuTLS.cpp
new file mode 100644
index 000000000000..a517ee3794ca
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/LowerEmuTLS.cpp
@@ -0,0 +1,158 @@
+//===- LowerEmuTLS.cpp - Add __emutls_[vt].* variables --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is required for targets depending on libgcc style
+// emulated thread local storage variables. For every defined TLS variable xyz,
+// an __emutls_v.xyz is generated. If there is a non-zero initial value,
+// an __emutls_t.xyz is also generated.
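+//
+// For example (illustrative only): given
+//   @xyz = thread_local global i32 7, align 4
+// the pass creates a control variable __emutls_v.xyz = { 4, 4, null,
+// @__emutls_t.xyz } and, because the initializer is non-zero, a constant
+// template @__emutls_t.xyz holding the value 7.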
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loweremutls"
+
+namespace {
+
+class LowerEmuTLS : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ LowerEmuTLS() : ModulePass(ID) {
+ initializeLowerEmuTLSPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+private:
+ bool addEmuTlsVar(Module &M, const GlobalVariable *GV);
+ static void copyLinkageVisibility(Module &M,
+ const GlobalVariable *from,
+ GlobalVariable *to) {
+ to->setLinkage(from->getLinkage());
+ to->setVisibility(from->getVisibility());
+ to->setDSOLocal(from->isDSOLocal());
+ if (from->hasComdat()) {
+ to->setComdat(M.getOrInsertComdat(to->getName()));
+ to->getComdat()->setSelectionKind(from->getComdat()->getSelectionKind());
+ }
+ }
+};
+}
+
+char LowerEmuTLS::ID = 0;
+
+INITIALIZE_PASS(LowerEmuTLS, DEBUG_TYPE,
+ "Add __emutls_[vt]. variables for emulated TLS model", false,
+ false)
+
+ModulePass *llvm::createLowerEmuTLSPass() { return new LowerEmuTLS(); }
+
+bool LowerEmuTLS::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<TargetMachine>();
+ if (!TM.useEmulatedTLS())
+ return false;
+
+ bool Changed = false;
+ SmallVector<const GlobalVariable*, 8> TlsVars;
+ for (const auto &G : M.globals()) {
+ if (G.isThreadLocal())
+ TlsVars.append({&G});
+ }
+ for (const auto *const G : TlsVars)
+ Changed |= addEmuTlsVar(M, G);
+ return Changed;
+}
+
+bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) {
+ LLVMContext &C = M.getContext();
+ PointerType *VoidPtrType = Type::getInt8PtrTy(C);
+
+ std::string EmuTlsVarName = ("__emutls_v." + GV->getName()).str();
+ GlobalVariable *EmuTlsVar = M.getNamedGlobal(EmuTlsVarName);
+ if (EmuTlsVar)
+ return false; // It has been added before.
+
+ const DataLayout &DL = M.getDataLayout();
+ Constant *NullPtr = ConstantPointerNull::get(VoidPtrType);
+
+ // Get non-zero initializer from GV's initializer.
+ const Constant *InitValue = nullptr;
+ if (GV->hasInitializer()) {
+ InitValue = GV->getInitializer();
+ const ConstantInt *InitIntValue = dyn_cast<ConstantInt>(InitValue);
+ // When GV's init value is all 0, omit the EmuTlsTmplVar and let
+ // the emutls library function reset newly allocated TLS variables.
+ if (isa<ConstantAggregateZero>(InitValue) ||
+ (InitIntValue && InitIntValue->isZero()))
+ InitValue = nullptr;
+ }
+
+ // Create the __emutls_v. symbol, whose type has 4 fields:
+ // word size; // size of GV in bytes
+ // word align; // alignment of GV
+ // void *ptr; // initialized to 0; set at run time per thread.
+ // void *templ; // 0 or point to __emutls_t.*
+ // sizeof(word) should be the same as sizeof(void*) on target.
+ IntegerType *WordType = DL.getIntPtrType(C);
+ PointerType *InitPtrType = InitValue ?
+ PointerType::getUnqual(InitValue->getType()) : VoidPtrType;
+ Type *ElementTypes[4] = {WordType, WordType, VoidPtrType, InitPtrType};
+ ArrayRef<Type*> ElementTypeArray(ElementTypes, 4);
+ StructType *EmuTlsVarType = StructType::create(ElementTypeArray);
+ EmuTlsVar = cast<GlobalVariable>(
+ M.getOrInsertGlobal(EmuTlsVarName, EmuTlsVarType));
+ copyLinkageVisibility(M, GV, EmuTlsVar);
+
+ // Define "__emutls_t.*" and "__emutls_v.*" only if GV is defined.
+ if (!GV->hasInitializer())
+ return true;
+
+ Type *GVType = GV->getValueType();
+ Align GVAlignment = DL.getValueOrABITypeAlignment(GV->getAlign(), GVType);
+
+ // Define "__emutls_t.*" if there is InitValue
+ GlobalVariable *EmuTlsTmplVar = nullptr;
+ if (InitValue) {
+ std::string EmuTlsTmplName = ("__emutls_t." + GV->getName()).str();
+ EmuTlsTmplVar = dyn_cast_or_null<GlobalVariable>(
+ M.getOrInsertGlobal(EmuTlsTmplName, GVType));
+ assert(EmuTlsTmplVar && "Failed to create emulated TLS initializer");
+ EmuTlsTmplVar->setConstant(true);
+ EmuTlsTmplVar->setInitializer(const_cast<Constant*>(InitValue));
+ EmuTlsTmplVar->setAlignment(GVAlignment);
+ copyLinkageVisibility(M, GV, EmuTlsTmplVar);
+ }
+
+ // Define "__emutls_v.*" with initializer and alignment.
+ Constant *ElementValues[4] = {
+ ConstantInt::get(WordType, DL.getTypeStoreSize(GVType)),
+ ConstantInt::get(WordType, GVAlignment.value()), NullPtr,
+ EmuTlsTmplVar ? EmuTlsTmplVar : NullPtr};
+ ArrayRef<Constant*> ElementValueArray(ElementValues, 4);
+ EmuTlsVar->setInitializer(
+ ConstantStruct::get(EmuTlsVarType, ElementValueArray));
+ Align MaxAlignment =
+ std::max(DL.getABITypeAlign(WordType), DL.getABITypeAlign(VoidPtrType));
+ EmuTlsVar->setAlignment(MaxAlignment);
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MBFIWrapper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MBFIWrapper.cpp
new file mode 100644
index 000000000000..5b388be27839
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MBFIWrapper.cpp
@@ -0,0 +1,62 @@
+//===- MBFIWrapper.cpp - MachineBlockFrequencyInfo wrapper ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class keeps track of branch frequencies of newly created blocks and
+// tail-merged blocks. Used by TailDuplication and MachineBlockPlacement.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
+#include <optional>
+
+using namespace llvm;
+
+BlockFrequency MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
+ auto I = MergedBBFreq.find(MBB);
+
+ if (I != MergedBBFreq.end())
+ return I->second;
+
+ return MBFI.getBlockFreq(MBB);
+}
+
+void MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
+ BlockFrequency F) {
+ MergedBBFreq[MBB] = F;
+}
+
+std::optional<uint64_t>
+MBFIWrapper::getBlockProfileCount(const MachineBasicBlock *MBB) const {
+ auto I = MergedBBFreq.find(MBB);
+
+ // Modified block frequency also impacts profile count. So we should compute
+ // profile count from new block frequency if it has been changed.
+ if (I != MergedBBFreq.end())
+ return MBFI.getProfileCountFromFreq(I->second.getFrequency());
+
+ return MBFI.getBlockProfileCount(MBB);
+}
+
+raw_ostream & MBFIWrapper::printBlockFreq(raw_ostream &OS,
+ const MachineBasicBlock *MBB) const {
+ return MBFI.printBlockFreq(OS, getBlockFreq(MBB));
+}
+
+raw_ostream & MBFIWrapper::printBlockFreq(raw_ostream &OS,
+ const BlockFrequency Freq) const {
+ return MBFI.printBlockFreq(OS, Freq);
+}
+
+void MBFIWrapper::view(const Twine &Name, bool isSimple) {
+ MBFI.view(Name, isSimple);
+}
+
+uint64_t MBFIWrapper::getEntryFreq() const {
+ return MBFI.getEntryFreq();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
new file mode 100644
index 000000000000..21b849244d9b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -0,0 +1,423 @@
+//===-------------- MIRCanonicalizer.cpp - MIR Canonicalizer --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to employ a canonical code transformation so
+// that code compiled with slightly different IR passes can be diffed more
+// effectively than otherwise. This is done by renaming vregs in a given
+// LiveRange in a canonical way. This pass also does a pseudo-scheduling to
+// move defs closer to their uses in order to reduce diffs caused by slightly
+// different schedules.
+//
+// Basic Usage:
+//
+// llc -o - -run-pass mir-canonicalizer example.mir
+//
+// Reorders instructions canonically.
+// Renames virtual register operands canonically.
+// Strips certain MIR artifacts (optionally).
+//
+//===----------------------------------------------------------------------===//
+
+#include "MIRVRegNamerUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mir-canonicalizer"
+
+static cl::opt<unsigned>
+ CanonicalizeFunctionNumber("canon-nth-function", cl::Hidden, cl::init(~0u),
+ cl::value_desc("N"),
+ cl::desc("Function number to canonicalize."));
+
+namespace {
+
+class MIRCanonicalizer : public MachineFunctionPass {
+public:
+ static char ID;
+ MIRCanonicalizer() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Rename register operands in a canonical ordering.";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char MIRCanonicalizer::ID;
+
+char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID;
+
+INITIALIZE_PASS_BEGIN(MIRCanonicalizer, "mir-canonicalizer",
+ "Rename Register Operands Canonically", false, false)
+
+INITIALIZE_PASS_END(MIRCanonicalizer, "mir-canonicalizer",
+ "Rename Register Operands Canonically", false, false)
+
+static std::vector<MachineBasicBlock *> GetRPOList(MachineFunction &MF) {
+ if (MF.empty())
+ return {};
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+ std::vector<MachineBasicBlock *> RPOList;
+ append_range(RPOList, RPOT);
+
+ return RPOList;
+}
+
+static bool
+rescheduleLexographically(std::vector<MachineInstr *> instructions,
+ MachineBasicBlock *MBB,
+ std::function<MachineBasicBlock::iterator()> getPos) {
+
+ bool Changed = false;
+ using StringInstrPair = std::pair<std::string, MachineInstr *>;
+ std::vector<StringInstrPair> StringInstrMap;
+
+ for (auto *II : instructions) {
+ std::string S;
+ raw_string_ostream OS(S);
+ II->print(OS);
+ OS.flush();
+
+ // Trim the assignment, or start from the beginning in the case of a store.
+ const size_t i = S.find('=');
+ StringInstrMap.push_back({(i == std::string::npos) ? S : S.substr(i), II});
+ }
+
+ llvm::sort(StringInstrMap, llvm::less_first());
+
+ for (auto &II : StringInstrMap) {
+
+ LLVM_DEBUG({
+ dbgs() << "Splicing ";
+ II.second->dump();
+ dbgs() << " right before: ";
+ getPos()->dump();
+ });
+
+ Changed = true;
+ MBB->splice(getPos(), MBB, II.second);
+ }
+
+ return Changed;
+}
+
+static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount,
+ MachineBasicBlock *MBB) {
+
+ bool Changed = false;
+
+ // Calculates the distance of MI from the beginning of its parent BB.
+ auto getInstrIdx = [](const MachineInstr &MI) {
+ unsigned i = 0;
+ for (const auto &CurMI : *MI.getParent()) {
+ if (&CurMI == &MI)
+ return i;
+ i++;
+ }
+ return ~0U;
+ };
+
+ // Pre-Populate vector of instructions to reschedule so that we don't
+ // clobber the iterator.
+ std::vector<MachineInstr *> Instructions;
+ for (auto &MI : *MBB) {
+ Instructions.push_back(&MI);
+ }
+
+ std::map<MachineInstr *, std::vector<MachineInstr *>> MultiUsers;
+ std::map<unsigned, MachineInstr *> MultiUserLookup;
+ unsigned UseToBringDefCloserToCount = 0;
+ std::vector<MachineInstr *> PseudoIdempotentInstructions;
+ std::vector<unsigned> PhysRegDefs;
+ for (auto *II : Instructions) {
+ for (unsigned i = 1; i < II->getNumOperands(); i++) {
+ MachineOperand &MO = II->getOperand(i);
+ if (!MO.isReg())
+ continue;
+
+ if (MO.getReg().isVirtual())
+ continue;
+
+ if (!MO.isDef())
+ continue;
+
+ PhysRegDefs.push_back(MO.getReg());
+ }
+ }
+
+ for (auto *II : Instructions) {
+ if (II->getNumOperands() == 0)
+ continue;
+ if (II->mayLoadOrStore())
+ continue;
+
+ MachineOperand &MO = II->getOperand(0);
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+ if (!MO.isDef())
+ continue;
+
+ bool IsPseudoIdempotent = true;
+ for (unsigned i = 1; i < II->getNumOperands(); i++) {
+
+ if (II->getOperand(i).isImm()) {
+ continue;
+ }
+
+ if (II->getOperand(i).isReg()) {
+ if (!II->getOperand(i).getReg().isVirtual())
+ if (!llvm::is_contained(PhysRegDefs, II->getOperand(i).getReg())) {
+ continue;
+ }
+ }
+
+ IsPseudoIdempotent = false;
+ break;
+ }
+
+ if (IsPseudoIdempotent) {
+ PseudoIdempotentInstructions.push_back(II);
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Operand " << 0 << " of "; II->dump(); MO.dump(););
+
+ MachineInstr *Def = II;
+ unsigned Distance = ~0U;
+ MachineInstr *UseToBringDefCloserTo = nullptr;
+ MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+ for (auto &UO : MRI->use_nodbg_operands(MO.getReg())) {
+ MachineInstr *UseInst = UO.getParent();
+
+ const unsigned DefLoc = getInstrIdx(*Def);
+ const unsigned UseLoc = getInstrIdx(*UseInst);
+ const unsigned Delta = (UseLoc - DefLoc);
+
+ if (UseInst->getParent() != Def->getParent())
+ continue;
+ if (DefLoc >= UseLoc)
+ continue;
+
+ if (Delta < Distance) {
+ Distance = Delta;
+ UseToBringDefCloserTo = UseInst;
+ MultiUserLookup[UseToBringDefCloserToCount++] = UseToBringDefCloserTo;
+ }
+ }
+
+ const auto BBE = MBB->instr_end();
+ MachineBasicBlock::iterator DefI = BBE;
+ MachineBasicBlock::iterator UseI = BBE;
+
+ for (auto BBI = MBB->instr_begin(); BBI != BBE; ++BBI) {
+
+ if (DefI != BBE && UseI != BBE)
+ break;
+
+ if (&*BBI == Def) {
+ DefI = BBI;
+ continue;
+ }
+
+ if (&*BBI == UseToBringDefCloserTo) {
+ UseI = BBI;
+ continue;
+ }
+ }
+
+ if (DefI == BBE || UseI == BBE)
+ continue;
+
+ LLVM_DEBUG({
+ dbgs() << "Splicing ";
+ DefI->dump();
+ dbgs() << " right before: ";
+ UseI->dump();
+ });
+
+ MultiUsers[UseToBringDefCloserTo].push_back(Def);
+ Changed = true;
+ MBB->splice(UseI, MBB, DefI);
+ }
+
+ // Sort the defs for users of multiple defs lexicographically.
+ for (const auto &E : MultiUserLookup) {
+
+ auto UseI = llvm::find_if(MBB->instrs(), [&](MachineInstr &MI) -> bool {
+ return &MI == E.second;
+ });
+
+ if (UseI == MBB->instr_end())
+ continue;
+
+ LLVM_DEBUG(
+ dbgs() << "Rescheduling Multi-Use Instructions Lexographically.";);
+ Changed |= rescheduleLexographically(
+ MultiUsers[E.second], MBB,
+ [&]() -> MachineBasicBlock::iterator { return UseI; });
+ }
+
+ PseudoIdempotentInstCount = PseudoIdempotentInstructions.size();
+ LLVM_DEBUG(
+ dbgs() << "Rescheduling Idempotent Instructions Lexographically.";);
+ Changed |= rescheduleLexographically(
+ PseudoIdempotentInstructions, MBB,
+ [&]() -> MachineBasicBlock::iterator { return MBB->begin(); });
+
+ return Changed;
+}
+
+static bool propagateLocalCopies(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ std::vector<MachineInstr *> Copies;
+ for (MachineInstr &MI : MBB->instrs()) {
+ if (MI.isCopy())
+ Copies.push_back(&MI);
+ }
+
+ for (MachineInstr *MI : Copies) {
+
+ if (!MI->getOperand(0).isReg())
+ continue;
+ if (!MI->getOperand(1).isReg())
+ continue;
+
+ const Register Dst = MI->getOperand(0).getReg();
+ const Register Src = MI->getOperand(1).getReg();
+
+ if (!Dst.isVirtual())
+ continue;
+ if (!Src.isVirtual())
+ continue;
+ // Not folding COPY instructions if regbankselect has not set the RCs.
+ // Why are we only considering Register Classes? Because the verifier
+ // sometimes gets upset if the register classes don't match even if the
+ // types do. A future patch might add COPY folding for matching types in
+ // pre-registerbankselect code.
+ if (!MRI.getRegClassOrNull(Dst))
+ continue;
+ if (MRI.getRegClass(Dst) != MRI.getRegClass(Src))
+ continue;
+
+ std::vector<MachineOperand *> Uses;
+ for (MachineOperand &MO : MRI.use_operands(Dst))
+ Uses.push_back(&MO);
+ for (auto *MO : Uses)
+ MO->setReg(Src);
+
+ Changed = true;
+ MI->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+static bool doDefKillClear(MachineBasicBlock *MBB) {
+ bool Changed = false;
+
+ for (auto &MI : *MBB) {
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.isDef() && MO.isKill()) {
+ Changed = true;
+ MO.setIsKill(false);
+ }
+
+ if (MO.isDef() && MO.isDead()) {
+ Changed = true;
+ MO.setIsDead(false);
+ }
+ }
+ }
+
+ return Changed;
+}
+
+static bool runOnBasicBlock(MachineBasicBlock *MBB,
+ unsigned BasicBlockNum, VRegRenamer &Renamer) {
+ LLVM_DEBUG({
+ dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << " \n\n";
+ dbgs() << "\n\n================================================\n\n";
+ });
+
+ bool Changed = false;
+
+ LLVM_DEBUG(dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << "\n\n";);
+
+ LLVM_DEBUG(dbgs() << "MBB Before Canonical Copy Propagation:\n";
+ MBB->dump(););
+ Changed |= propagateLocalCopies(MBB);
+ LLVM_DEBUG(dbgs() << "MBB After Canonical Copy Propagation:\n"; MBB->dump(););
+
+ LLVM_DEBUG(dbgs() << "MBB Before Scheduling:\n"; MBB->dump(););
+ unsigned IdempotentInstCount = 0;
+ Changed |= rescheduleCanonically(IdempotentInstCount, MBB);
+ LLVM_DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump(););
+
+ Changed |= Renamer.renameVRegs(MBB, BasicBlockNum);
+
+ // TODO: Consider dropping this. Dropping kill defs is probably not
+ // semantically sound.
+ Changed |= doDefKillClear(MBB);
+
+ LLVM_DEBUG(dbgs() << "Updated MachineBasicBlock:\n"; MBB->dump();
+ dbgs() << "\n";);
+ LLVM_DEBUG(
+ dbgs() << "\n\n================================================\n\n");
+ return Changed;
+}
+
+bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
+
+ static unsigned functionNum = 0;
+ if (CanonicalizeFunctionNumber != ~0U) {
+ if (CanonicalizeFunctionNumber != functionNum++)
+ return false;
+ LLVM_DEBUG(dbgs() << "\n Canonicalizing Function " << MF.getName()
+ << "\n";);
+ }
+
+ // We need a valid vreg to create a vreg type for skipping all those
+ // stray vreg numbers, so that we reach aligned/canonical vreg values.
+ std::vector<MachineBasicBlock *> RPOList = GetRPOList(MF);
+
+ LLVM_DEBUG(
+ dbgs() << "\n\n NEW MACHINE FUNCTION: " << MF.getName() << " \n\n";
+ dbgs() << "\n\n================================================\n\n";
+ dbgs() << "Total Basic Blocks: " << RPOList.size() << "\n";
+ for (auto MBB
+ : RPOList) { dbgs() << MBB->getName() << "\n"; } dbgs()
+ << "\n\n================================================\n\n";);
+
+ unsigned BBNum = 0;
+ bool Changed = false;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ VRegRenamer Renamer(MRI);
+ for (auto *MBB : RPOList)
+ Changed |= runOnBasicBlock(MBB, BBNum++, Renamer);
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
new file mode 100644
index 000000000000..8d17cceeb3cd
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
@@ -0,0 +1,202 @@
+//===-------- MIRFSDiscriminator.cpp: Flow Sensitive Discriminator --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the implementation of a machine pass that adds the flow
+// sensitive discriminator to the instruction debug information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRFSDiscriminator.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PseudoProbe.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/xxhash.h"
+#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
+
+using namespace llvm;
+using namespace sampleprof;
+using namespace sampleprofutil;
+
+#define DEBUG_TYPE "mirfs-discriminators"
+
+// TODO(xur): Remove this option and related code once we make true as the
+// default.
+cl::opt<bool> ImprovedFSDiscriminator(
+ "improved-fs-discriminator", cl::Hidden, cl::init(false),
+ cl::desc("New FS discriminators encoding (incompatible with the original "
+ "encoding)"));
+
+char MIRAddFSDiscriminators::ID = 0;
+
+INITIALIZE_PASS(MIRAddFSDiscriminators, DEBUG_TYPE,
+ "Add MIR Flow Sensitive Discriminators",
+ /* cfg = */ false, /* is_analysis = */ false)
+
+char &llvm::MIRAddFSDiscriminatorsID = MIRAddFSDiscriminators::ID;
+
+FunctionPass *llvm::createMIRAddFSDiscriminatorsPass(FSDiscriminatorPass P) {
+ return new MIRAddFSDiscriminators(P);
+}
+
+// TODO(xur): Remove this once we switch to ImprovedFSDiscriminator.
+// Compute a hash value using debug line number, and the line numbers from the
+// inline stack.
+static uint64_t getCallStackHashV0(const MachineBasicBlock &BB,
+ const MachineInstr &MI,
+ const DILocation *DIL) {
+ auto updateHash = [](const StringRef &Str) -> uint64_t {
+ if (Str.empty())
+ return 0;
+ return MD5Hash(Str);
+ };
+ uint64_t Ret = updateHash(std::to_string(DIL->getLine()));
+ Ret ^= updateHash(BB.getName());
+ Ret ^= updateHash(DIL->getScope()->getSubprogram()->getLinkageName());
+ for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
+ Ret ^= updateHash(std::to_string(DIL->getLine()));
+ Ret ^= updateHash(DIL->getScope()->getSubprogram()->getLinkageName());
+ }
+ return Ret;
+}
+
+static uint64_t getCallStackHash(const DILocation *DIL) {
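+ // Combine hashes in the usual boost::hash_combine style: XOR the seed with
+ // the value's hash plus the golden-ratio constant 0x9e3779b9 and shifted
+ // copies of the seed, folded over the inlined-at chain below.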
+ auto hashCombine = [](const uint64_t Seed, const uint64_t Val) {
+ std::hash<uint64_t> Hasher;
+ return Seed ^ (Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2));
+ };
+ uint64_t Ret = 0;
+ for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
+ Ret = hashCombine(Ret, xxh3_64bits(ArrayRef<uint8_t>(DIL->getLine())));
+ Ret = hashCombine(Ret, xxh3_64bits(DIL->getSubprogramLinkageName()));
+ }
+ return Ret;
+}
+
+// Traverse the CFG and assign FS discriminators. If two instructions
+// have the same lineno and discriminator but reside in different BBs,
+// the latter instruction will get a new discriminator value. The new
+// discriminator keeps the existing discriminator value but sets new bits
+// between LowBit and HighBit.
+bool MIRAddFSDiscriminators::runOnMachineFunction(MachineFunction &MF) {
+ if (!EnableFSDiscriminator)
+ return false;
+
+ bool HasPseudoProbe = MF.getFunction().getParent()->getNamedMetadata(
+ PseudoProbeDescMetadataName);
+
+ if (!HasPseudoProbe && !MF.getFunction().shouldEmitDebugInfoForProfiling())
+ return false;
+
+ bool Changed = false;
+ using LocationDiscriminator =
+ std::tuple<StringRef, unsigned, unsigned, uint64_t>;
+ using BBSet = DenseSet<const MachineBasicBlock *>;
+ using LocationDiscriminatorBBMap = DenseMap<LocationDiscriminator, BBSet>;
+ using LocationDiscriminatorCurrPassMap =
+ DenseMap<LocationDiscriminator, unsigned>;
+
+ LocationDiscriminatorBBMap LDBM;
+ LocationDiscriminatorCurrPassMap LDCM;
+
+ // Mask of discriminators before this pass.
+ // TODO(xur): simplify this once we switch to ImprovedFSDiscriminator.
+ unsigned LowBitTemp = LowBit;
+ assert(LowBit > 0 && "LowBit in FSDiscriminator cannot be 0");
+ if (ImprovedFSDiscriminator)
+ LowBitTemp -= 1;
+ unsigned BitMaskBefore = getN1Bits(LowBitTemp);
+ // Mask of discriminators including this pass.
+ unsigned BitMaskNow = getN1Bits(HighBit);
+ // Mask of discriminators for bits specific to this pass.
+ unsigned BitMaskThisPass = BitMaskNow ^ BitMaskBefore;
+ unsigned NumNewD = 0;
+
+ LLVM_DEBUG(dbgs() << "MIRAddFSDiscriminators working on Func: "
+ << MF.getFunction().getName() << " Highbit=" << HighBit
+ << "\n");
+
+ for (MachineBasicBlock &BB : MF) {
+ for (MachineInstr &I : BB) {
+ if (HasPseudoProbe) {
+ // Only assign discriminators to pseudo probe instructions. Call
+ // instructions are excluded since their dwarf discriminators are used
+ // for other purposes, i.e., storing probe ids.
+ if (!I.isPseudoProbe())
+ continue;
+ } else if (ImprovedFSDiscriminator && I.isMetaInstruction()) {
+ continue;
+ }
+ const DILocation *DIL = I.getDebugLoc().get();
+ if (!DIL)
+ continue;
+
+ // Use the id of pseudo probe to compute the discriminator.
+ unsigned LineNo =
+ I.isPseudoProbe() ? I.getOperand(1).getImm() : DIL->getLine();
+ if (LineNo == 0)
+ continue;
+ unsigned Discriminator = DIL->getDiscriminator();
+ // Clean up discriminators for pseudo probes at the first FS discriminator
+ // pass as their discriminators should not ever be used.
+ if ((Pass == FSDiscriminatorPass::Pass1) && I.isPseudoProbe()) {
+ Discriminator = 0;
+ I.setDebugLoc(DIL->cloneWithDiscriminator(0));
+ }
+ uint64_t CallStackHashVal = 0;
+ if (ImprovedFSDiscriminator)
+ CallStackHashVal = getCallStackHash(DIL);
+
+ LocationDiscriminator LD{DIL->getFilename(), LineNo, Discriminator,
+ CallStackHashVal};
+ auto &BBMap = LDBM[LD];
+ auto R = BBMap.insert(&BB);
+ if (BBMap.size() == 1)
+ continue;
+
+ unsigned DiscriminatorCurrPass;
+ DiscriminatorCurrPass = R.second ? ++LDCM[LD] : LDCM[LD];
+ DiscriminatorCurrPass = DiscriminatorCurrPass << LowBit;
+ if (!ImprovedFSDiscriminator)
+ DiscriminatorCurrPass += getCallStackHashV0(BB, I, DIL);
+ DiscriminatorCurrPass &= BitMaskThisPass;
+ unsigned NewD = Discriminator | DiscriminatorCurrPass;
+ const auto *const NewDIL = DIL->cloneWithDiscriminator(NewD);
+ if (!NewDIL) {
+ LLVM_DEBUG(dbgs() << "Could not encode discriminator: "
+ << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ":" << Discriminator << " "
+ << I << "\n");
+ continue;
+ }
+
+ I.setDebugLoc(NewDIL);
+ NumNewD++;
+ LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ": add FS discriminator, from "
+ << Discriminator << " -> " << NewD << "\n");
+ Changed = true;
+ }
+ }
+
+ if (Changed) {
+ createFSDiscriminatorVariable(MF.getFunction().getParent());
+ LLVM_DEBUG(dbgs() << "Num of FS Discriminators: " << NumNewD << "\n");
+ (void) NumNewD;
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRNamerPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRNamerPass.cpp
new file mode 100644
index 000000000000..bc65700aba06
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRNamerPass.cpp
@@ -0,0 +1,75 @@
+//===----------------------- MIRNamer.cpp - MIR Namer ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to rename virtual register operands with the goal
+// of making it easier to author easy-to-read tests for MIR. This pass reuses
+// the vreg renamer used by MIRCanonicalizerPass.
+//
+// Basic Usage:
+//
+// llc -o - -run-pass mir-namer example.mir
+//
+//===----------------------------------------------------------------------===//
+
+#include "MIRVRegNamerUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+namespace llvm {
+extern char &MIRNamerID;
+} // namespace llvm
+
+#define DEBUG_TYPE "mir-namer"
+
+namespace {
+
+class MIRNamer : public MachineFunctionPass {
+public:
+ static char ID;
+ MIRNamer() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Rename virtual register operands";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ bool Changed = false;
+
+ if (MF.empty())
+ return Changed;
+
+ VRegRenamer Renamer(MF.getRegInfo());
+
+ unsigned BBIndex = 0;
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+ for (auto &MBB : RPOT)
+ Changed |= Renamer.renameVRegs(MBB, BBIndex++);
+
+ return Changed;
+ }
+};
+
+} // end anonymous namespace
+
+char MIRNamer::ID;
+
+char &llvm::MIRNamerID = MIRNamer::ID;
+
+INITIALIZE_PASS_BEGIN(MIRNamer, "mir-namer", "Rename Register Operands", false,
+ false)
+
+INITIALIZE_PASS_END(MIRNamer, "mir-namer", "Rename Register Operands", false,
+ false)
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MILexer.cpp
new file mode 100644
index 000000000000..a4c1ba340e46
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -0,0 +1,767 @@
+//===- MILexer.cpp - Machine instructions lexer implementation ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lexing of machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MILexer.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include <cassert>
+#include <cctype>
+#include <string>
+
+using namespace llvm;
+
+namespace {
+
+using ErrorCallbackType =
+ function_ref<void(StringRef::iterator Loc, const Twine &)>;
+
+/// This class provides a way to iterate and get characters from the source
+/// string.
+class Cursor {
+ const char *Ptr = nullptr;
+ const char *End = nullptr;
+
+public:
+ Cursor(std::nullopt_t) {}
+
+ explicit Cursor(StringRef Str) {
+ Ptr = Str.data();
+ End = Ptr + Str.size();
+ }
+
+ bool isEOF() const { return Ptr == End; }
+
+ char peek(int I = 0) const { return End - Ptr <= I ? 0 : Ptr[I]; }
+
+ void advance(unsigned I = 1) { Ptr += I; }
+
+ StringRef remaining() const { return StringRef(Ptr, End - Ptr); }
+
+ StringRef upto(Cursor C) const {
+ assert(C.Ptr >= Ptr && C.Ptr <= End);
+ return StringRef(Ptr, C.Ptr - Ptr);
+ }
+
+ StringRef::iterator location() const { return Ptr; }
+
+ operator bool() const { return Ptr != nullptr; }
+};
+
+} // end anonymous namespace
+
+MIToken &MIToken::reset(TokenKind Kind, StringRef Range) {
+ this->Kind = Kind;
+ this->Range = Range;
+ return *this;
+}
+
+MIToken &MIToken::setStringValue(StringRef StrVal) {
+ StringValue = StrVal;
+ return *this;
+}
+
+MIToken &MIToken::setOwnedStringValue(std::string StrVal) {
+ StringValueStorage = std::move(StrVal);
+ StringValue = StringValueStorage;
+ return *this;
+}
+
+MIToken &MIToken::setIntegerValue(APSInt IntVal) {
+ this->IntVal = std::move(IntVal);
+ return *this;
+}
+
+/// Skip the leading whitespace characters and return the updated cursor.
+static Cursor skipWhitespace(Cursor C) {
+ while (isblank(C.peek()))
+ C.advance();
+ return C;
+}
+
+static bool isNewlineChar(char C) { return C == '\n' || C == '\r'; }
+
+/// Skip a line comment and return the updated cursor.
+static Cursor skipComment(Cursor C) {
+ if (C.peek() != ';')
+ return C;
+ while (!isNewlineChar(C.peek()) && !C.isEOF())
+ C.advance();
+ return C;
+}
+
+/// Machine operands can have comments, enclosed between /* and */.
+/// This eats up all tokens, including /* and */.
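+/// For example (illustrative operand text), given "$eax /* clobbered */" the
+/// lexer consumes the "/* clobbered */" portion here.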
+static Cursor skipMachineOperandComment(Cursor C) {
+ if (C.peek() != '/' || C.peek(1) != '*')
+ return C;
+
+ while (C.peek() != '*' || C.peek(1) != '/')
+ C.advance();
+
+ C.advance();
+ C.advance();
+ return C;
+}
+
+/// Return true if the given character satisfies the following regular
+/// expression: [-a-zA-Z$._0-9]
+static bool isIdentifierChar(char C) {
+ return isalpha(C) || isdigit(C) || C == '_' || C == '-' || C == '.' ||
+ C == '$';
+}
+
+/// Unescapes the given string value.
+///
+/// Expects the string value to be quoted.
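+///
+/// For example (illustrative), the quoted value "a\\b" unescapes to a\b, and
+/// "\41" decodes the two hex digits to the single character 'A'.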
+static std::string unescapeQuotedString(StringRef Value) {
+ assert(Value.front() == '"' && Value.back() == '"');
+ Cursor C = Cursor(Value.substr(1, Value.size() - 2));
+
+ std::string Str;
+ Str.reserve(C.remaining().size());
+ while (!C.isEOF()) {
+ char Char = C.peek();
+ if (Char == '\\') {
+ if (C.peek(1) == '\\') {
+ // Two '\' become one
+ Str += '\\';
+ C.advance(2);
+ continue;
+ }
+ if (isxdigit(C.peek(1)) && isxdigit(C.peek(2))) {
+ Str += hexDigitValue(C.peek(1)) * 16 + hexDigitValue(C.peek(2));
+ C.advance(3);
+ continue;
+ }
+ }
+ Str += Char;
+ C.advance();
+ }
+ return Str;
+}
+
+/// Lex a string constant using the following regular expression: \"[^\"]*\"
+static Cursor lexStringConstant(Cursor C, ErrorCallbackType ErrorCallback) {
+ assert(C.peek() == '"');
+ for (C.advance(); C.peek() != '"'; C.advance()) {
+ if (C.isEOF() || isNewlineChar(C.peek())) {
+ ErrorCallback(
+ C.location(),
+ "end of machine instruction reached before the closing '\"'");
+ return std::nullopt;
+ }
+ }
+ C.advance();
+ return C;
+}
+
+static Cursor lexName(Cursor C, MIToken &Token, MIToken::TokenKind Type,
+ unsigned PrefixLength, ErrorCallbackType ErrorCallback) {
+ auto Range = C;
+ C.advance(PrefixLength);
+ if (C.peek() == '"') {
+ if (Cursor R = lexStringConstant(C, ErrorCallback)) {
+ StringRef String = Range.upto(R);
+ Token.reset(Type, String)
+ .setOwnedStringValue(
+ unescapeQuotedString(String.drop_front(PrefixLength)));
+ return R;
+ }
+ Token.reset(MIToken::Error, Range.remaining());
+ return Range;
+ }
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ Token.reset(Type, Range.upto(C))
+ .setStringValue(Range.upto(C).drop_front(PrefixLength));
+ return C;
+}
+
+static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
+ return StringSwitch<MIToken::TokenKind>(Identifier)
+ .Case("_", MIToken::underscore)
+ .Case("implicit", MIToken::kw_implicit)
+ .Case("implicit-def", MIToken::kw_implicit_define)
+ .Case("def", MIToken::kw_def)
+ .Case("dead", MIToken::kw_dead)
+ .Case("killed", MIToken::kw_killed)
+ .Case("undef", MIToken::kw_undef)
+ .Case("internal", MIToken::kw_internal)
+ .Case("early-clobber", MIToken::kw_early_clobber)
+ .Case("debug-use", MIToken::kw_debug_use)
+ .Case("renamable", MIToken::kw_renamable)
+ .Case("tied-def", MIToken::kw_tied_def)
+ .Case("frame-setup", MIToken::kw_frame_setup)
+ .Case("frame-destroy", MIToken::kw_frame_destroy)
+ .Case("nnan", MIToken::kw_nnan)
+ .Case("ninf", MIToken::kw_ninf)
+ .Case("nsz", MIToken::kw_nsz)
+ .Case("arcp", MIToken::kw_arcp)
+ .Case("contract", MIToken::kw_contract)
+ .Case("afn", MIToken::kw_afn)
+ .Case("reassoc", MIToken::kw_reassoc)
+ .Case("nuw", MIToken::kw_nuw)
+ .Case("nsw", MIToken::kw_nsw)
+ .Case("exact", MIToken::kw_exact)
+ .Case("nofpexcept", MIToken::kw_nofpexcept)
+ .Case("unpredictable", MIToken::kw_unpredictable)
+ .Case("debug-location", MIToken::kw_debug_location)
+ .Case("debug-instr-number", MIToken::kw_debug_instr_number)
+ .Case("dbg-instr-ref", MIToken::kw_dbg_instr_ref)
+ .Case("same_value", MIToken::kw_cfi_same_value)
+ .Case("offset", MIToken::kw_cfi_offset)
+ .Case("rel_offset", MIToken::kw_cfi_rel_offset)
+ .Case("def_cfa_register", MIToken::kw_cfi_def_cfa_register)
+ .Case("def_cfa_offset", MIToken::kw_cfi_def_cfa_offset)
+ .Case("adjust_cfa_offset", MIToken::kw_cfi_adjust_cfa_offset)
+ .Case("escape", MIToken::kw_cfi_escape)
+ .Case("def_cfa", MIToken::kw_cfi_def_cfa)
+ .Case("llvm_def_aspace_cfa", MIToken::kw_cfi_llvm_def_aspace_cfa)
+ .Case("remember_state", MIToken::kw_cfi_remember_state)
+ .Case("restore", MIToken::kw_cfi_restore)
+ .Case("restore_state", MIToken::kw_cfi_restore_state)
+ .Case("undefined", MIToken::kw_cfi_undefined)
+ .Case("register", MIToken::kw_cfi_register)
+ .Case("window_save", MIToken::kw_cfi_window_save)
+ .Case("negate_ra_sign_state",
+ MIToken::kw_cfi_aarch64_negate_ra_sign_state)
+ .Case("blockaddress", MIToken::kw_blockaddress)
+ .Case("intrinsic", MIToken::kw_intrinsic)
+ .Case("target-index", MIToken::kw_target_index)
+ .Case("half", MIToken::kw_half)
+ .Case("float", MIToken::kw_float)
+ .Case("double", MIToken::kw_double)
+ .Case("x86_fp80", MIToken::kw_x86_fp80)
+ .Case("fp128", MIToken::kw_fp128)
+ .Case("ppc_fp128", MIToken::kw_ppc_fp128)
+ .Case("target-flags", MIToken::kw_target_flags)
+ .Case("volatile", MIToken::kw_volatile)
+ .Case("non-temporal", MIToken::kw_non_temporal)
+ .Case("dereferenceable", MIToken::kw_dereferenceable)
+ .Case("invariant", MIToken::kw_invariant)
+ .Case("align", MIToken::kw_align)
+ .Case("basealign", MIToken::kw_basealign)
+ .Case("addrspace", MIToken::kw_addrspace)
+ .Case("stack", MIToken::kw_stack)
+ .Case("got", MIToken::kw_got)
+ .Case("jump-table", MIToken::kw_jump_table)
+ .Case("constant-pool", MIToken::kw_constant_pool)
+ .Case("call-entry", MIToken::kw_call_entry)
+ .Case("custom", MIToken::kw_custom)
+ .Case("liveout", MIToken::kw_liveout)
+ .Case("landing-pad", MIToken::kw_landing_pad)
+ .Case("inlineasm-br-indirect-target",
+ MIToken::kw_inlineasm_br_indirect_target)
+ .Case("ehfunclet-entry", MIToken::kw_ehfunclet_entry)
+ .Case("liveins", MIToken::kw_liveins)
+ .Case("successors", MIToken::kw_successors)
+ .Case("floatpred", MIToken::kw_floatpred)
+ .Case("intpred", MIToken::kw_intpred)
+ .Case("shufflemask", MIToken::kw_shufflemask)
+ .Case("pre-instr-symbol", MIToken::kw_pre_instr_symbol)
+ .Case("post-instr-symbol", MIToken::kw_post_instr_symbol)
+ .Case("heap-alloc-marker", MIToken::kw_heap_alloc_marker)
+ .Case("pcsections", MIToken::kw_pcsections)
+ .Case("cfi-type", MIToken::kw_cfi_type)
+ .Case("bbsections", MIToken::kw_bbsections)
+ .Case("bb_id", MIToken::kw_bb_id)
+ .Case("unknown-size", MIToken::kw_unknown_size)
+ .Case("unknown-address", MIToken::kw_unknown_address)
+ .Case("distinct", MIToken::kw_distinct)
+ .Case("ir-block-address-taken", MIToken::kw_ir_block_address_taken)
+ .Case("machine-block-address-taken",
+ MIToken::kw_machine_block_address_taken)
+ .Default(MIToken::Identifier);
+}
+
+static Cursor maybeLexIdentifier(Cursor C, MIToken &Token) {
+ if (!isalpha(C.peek()) && C.peek() != '_')
+ return std::nullopt;
+ auto Range = C;
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ auto Identifier = Range.upto(C);
+ Token.reset(getIdentifierKind(Identifier), Identifier)
+ .setStringValue(Identifier);
+ return C;
+}
+
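+/// Lex either a machine basic block reference such as "%bb.4" or a basic
+/// block label such as "bb.4.entry" (the numeric id is required; the
+/// ".<irname>" suffix is optional).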
+static Cursor maybeLexMachineBasicBlock(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ bool IsReference = C.remaining().startswith("%bb.");
+ if (!IsReference && !C.remaining().startswith("bb."))
+ return std::nullopt;
+ auto Range = C;
+ unsigned PrefixLength = IsReference ? 4 : 3;
+ C.advance(PrefixLength); // Skip '%bb.' or 'bb.'
+ if (!isdigit(C.peek())) {
+ Token.reset(MIToken::Error, C.remaining());
+ ErrorCallback(C.location(), "expected a number after '%bb.'");
+ return C;
+ }
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ StringRef Number = NumberRange.upto(C);
+ unsigned StringOffset = PrefixLength + Number.size(); // Drop '%bb.<id>'
+ // TODO: The format bb.<id>.<irname> is supported only when it's not a
+ // reference. Once we deprecate the format where the irname shows up, we
+ // should only lex forward if it is a reference.
+ if (C.peek() == '.') {
+ C.advance(); // Skip '.'
+ ++StringOffset;
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ }
+ Token.reset(IsReference ? MIToken::MachineBasicBlock
+ : MIToken::MachineBasicBlockLabel,
+ Range.upto(C))
+ .setIntegerValue(APSInt(Number))
+ .setStringValue(Range.upto(C).drop_front(StringOffset));
+ return C;
+}
+
+static Cursor maybeLexIndex(Cursor C, MIToken &Token, StringRef Rule,
+ MIToken::TokenKind Kind) {
+ if (!C.remaining().startswith(Rule) || !isdigit(C.peek(Rule.size())))
+ return std::nullopt;
+ auto Range = C;
+ C.advance(Rule.size());
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ Token.reset(Kind, Range.upto(C)).setIntegerValue(APSInt(NumberRange.upto(C)));
+ return C;
+}
+
+static Cursor maybeLexIndexAndName(Cursor C, MIToken &Token, StringRef Rule,
+ MIToken::TokenKind Kind) {
+ if (!C.remaining().startswith(Rule) || !isdigit(C.peek(Rule.size())))
+ return std::nullopt;
+ auto Range = C;
+ C.advance(Rule.size());
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ StringRef Number = NumberRange.upto(C);
+ unsigned StringOffset = Rule.size() + Number.size();
+ if (C.peek() == '.') {
+ C.advance();
+ ++StringOffset;
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ }
+ Token.reset(Kind, Range.upto(C))
+ .setIntegerValue(APSInt(Number))
+ .setStringValue(Range.upto(C).drop_front(StringOffset));
+ return C;
+}
+
+static Cursor maybeLexJumpTableIndex(Cursor C, MIToken &Token) {
+ return maybeLexIndex(C, Token, "%jump-table.", MIToken::JumpTableIndex);
+}
+
+static Cursor maybeLexStackObject(Cursor C, MIToken &Token) {
+ return maybeLexIndexAndName(C, Token, "%stack.", MIToken::StackObject);
+}
+
+static Cursor maybeLexFixedStackObject(Cursor C, MIToken &Token) {
+ return maybeLexIndex(C, Token, "%fixed-stack.", MIToken::FixedStackObject);
+}
+
+static Cursor maybeLexConstantPoolItem(Cursor C, MIToken &Token) {
+ return maybeLexIndex(C, Token, "%const.", MIToken::ConstantPoolItem);
+}
+
+static Cursor maybeLexSubRegisterIndex(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ const StringRef Rule = "%subreg.";
+ if (!C.remaining().startswith(Rule))
+ return std::nullopt;
+ return lexName(C, Token, MIToken::SubRegisterIndex, Rule.size(),
+ ErrorCallback);
+}
+
+static Cursor maybeLexIRBlock(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ const StringRef Rule = "%ir-block.";
+ if (!C.remaining().startswith(Rule))
+ return std::nullopt;
+ if (isdigit(C.peek(Rule.size())))
+ return maybeLexIndex(C, Token, Rule, MIToken::IRBlock);
+ return lexName(C, Token, MIToken::NamedIRBlock, Rule.size(), ErrorCallback);
+}
+
+static Cursor maybeLexIRValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ const StringRef Rule = "%ir.";
+ if (!C.remaining().startswith(Rule))
+ return std::nullopt;
+ if (isdigit(C.peek(Rule.size())))
+ return maybeLexIndex(C, Token, Rule, MIToken::IRValue);
+ return lexName(C, Token, MIToken::NamedIRValue, Rule.size(), ErrorCallback);
+}
+
+static Cursor maybeLexStringConstant(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '"')
+ return std::nullopt;
+ return lexName(C, Token, MIToken::StringConstant, /*PrefixLength=*/0,
+ ErrorCallback);
+}
+
+static Cursor lexVirtualRegister(Cursor C, MIToken &Token) {
+ auto Range = C;
+ C.advance(); // Skip '%'
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ Token.reset(MIToken::VirtualRegister, Range.upto(C))
+ .setIntegerValue(APSInt(NumberRange.upto(C)));
+ return C;
+}
+
+/// Returns true for a character allowed in a register name.
+static bool isRegisterChar(char C) {
+ return isIdentifierChar(C) && C != '.';
+}
+
+static Cursor lexNamedVirtualRegister(Cursor C, MIToken &Token) {
+ Cursor Range = C;
+ C.advance(); // Skip '%'
+ while (isRegisterChar(C.peek()))
+ C.advance();
+ Token.reset(MIToken::NamedVirtualRegister, Range.upto(C))
+ .setStringValue(Range.upto(C).drop_front(1)); // Drop the '%'
+ return C;
+}
+
+static Cursor maybeLexRegister(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '%' && C.peek() != '$')
+ return std::nullopt;
+
+ if (C.peek() == '%') {
+ if (isdigit(C.peek(1)))
+ return lexVirtualRegister(C, Token);
+
+ if (isRegisterChar(C.peek(1)))
+ return lexNamedVirtualRegister(C, Token);
+
+ return std::nullopt;
+ }
+
+ assert(C.peek() == '$');
+ auto Range = C;
+ C.advance(); // Skip '$'
+ while (isRegisterChar(C.peek()))
+ C.advance();
+ Token.reset(MIToken::NamedRegister, Range.upto(C))
+ .setStringValue(Range.upto(C).drop_front(1)); // Drop the '$'
+ return C;
+}
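+// For illustration, the register lexers above map an input such as "%0" to
+// VirtualRegister, "%mydef" to NamedVirtualRegister, and "$eax" to
+// NamedRegister; the sample names are arbitrary and only show the prefixes.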
+
+static Cursor maybeLexGlobalValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '@')
+ return std::nullopt;
+ if (!isdigit(C.peek(1)))
+ return lexName(C, Token, MIToken::NamedGlobalValue, /*PrefixLength=*/1,
+ ErrorCallback);
+ auto Range = C;
+ C.advance(1); // Skip the '@'
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ Token.reset(MIToken::GlobalValue, Range.upto(C))
+ .setIntegerValue(APSInt(NumberRange.upto(C)));
+ return C;
+}
+
+static Cursor maybeLexExternalSymbol(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '&')
+ return std::nullopt;
+ return lexName(C, Token, MIToken::ExternalSymbol, /*PrefixLength=*/1,
+ ErrorCallback);
+}
+
+static Cursor maybeLexMCSymbol(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ const StringRef Rule = "<mcsymbol ";
+ if (!C.remaining().startswith(Rule))
+ return std::nullopt;
+ auto Start = C;
+ C.advance(Rule.size());
+
+ // Try a simple unquoted name.
+ if (C.peek() != '"') {
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ StringRef String = Start.upto(C).drop_front(Rule.size());
+ if (C.peek() != '>') {
+ ErrorCallback(C.location(),
+ "expected the '<mcsymbol ...' to be closed by a '>'");
+ Token.reset(MIToken::Error, Start.remaining());
+ return Start;
+ }
+ C.advance();
+
+ Token.reset(MIToken::MCSymbol, Start.upto(C)).setStringValue(String);
+ return C;
+ }
+
+ // Otherwise lex out a quoted name.
+ Cursor R = lexStringConstant(C, ErrorCallback);
+ if (!R) {
+ ErrorCallback(C.location(),
+ "unable to parse quoted string from opening quote");
+ Token.reset(MIToken::Error, Start.remaining());
+ return Start;
+ }
+ StringRef String = Start.upto(R).drop_front(Rule.size());
+ if (R.peek() != '>') {
+ ErrorCallback(R.location(),
+ "expected the '<mcsymbol ...' to be closed by a '>'");
+ Token.reset(MIToken::Error, Start.remaining());
+ return Start;
+ }
+ R.advance();
+
+ Token.reset(MIToken::MCSymbol, Start.upto(R))
+ .setOwnedStringValue(unescapeQuotedString(String));
+ return R;
+}
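+// For illustration, maybeLexMCSymbol accepts both an unquoted form such as
+// "<mcsymbol foo>" and a quoted form such as <mcsymbol "a b">, where the
+// quoted string is unescaped before being stored; the sample names are
+// arbitrary.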
+
+static bool isValidHexFloatingPointPrefix(char C) {
+ return C == 'H' || C == 'K' || C == 'L' || C == 'M' || C == 'R';
+}
+
+static Cursor lexFloatingPointLiteral(Cursor Range, Cursor C, MIToken &Token) {
+ C.advance();
+ // Skip over [0-9]*([eE][-+]?[0-9]+)?
+ while (isdigit(C.peek()))
+ C.advance();
+ if ((C.peek() == 'e' || C.peek() == 'E') &&
+ (isdigit(C.peek(1)) ||
+ ((C.peek(1) == '-' || C.peek(1) == '+') && isdigit(C.peek(2))))) {
+ C.advance(2);
+ while (isdigit(C.peek()))
+ C.advance();
+ }
+ Token.reset(MIToken::FloatingPointLiteral, Range.upto(C));
+ return C;
+}
+
+static Cursor maybeLexHexadecimalLiteral(Cursor C, MIToken &Token) {
+ if (C.peek() != '0' || (C.peek(1) != 'x' && C.peek(1) != 'X'))
+ return std::nullopt;
+ Cursor Range = C;
+ C.advance(2);
+ unsigned PrefLen = 2;
+ if (isValidHexFloatingPointPrefix(C.peek())) {
+ C.advance();
+ PrefLen++;
+ }
+ while (isxdigit(C.peek()))
+ C.advance();
+ StringRef StrVal = Range.upto(C);
+ if (StrVal.size() <= PrefLen)
+ return std::nullopt;
+ if (PrefLen == 2)
+ Token.reset(MIToken::HexLiteral, Range.upto(C));
+ else // It must be 3, which means that there was a floating-point prefix.
+ Token.reset(MIToken::FloatingPointLiteral, Range.upto(C));
+ return C;
+}
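+// For illustration: "0x1f" lexes as a HexLiteral, while a literal with one of
+// the prefixes above, e.g. "0xH3C00" (assuming the LLVM IR convention for
+// half-precision hex constants), lexes as a FloatingPointLiteral.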
+
+static Cursor maybeLexNumericalLiteral(Cursor C, MIToken &Token) {
+ if (!isdigit(C.peek()) && (C.peek() != '-' || !isdigit(C.peek(1))))
+ return std::nullopt;
+ auto Range = C;
+ C.advance();
+ while (isdigit(C.peek()))
+ C.advance();
+ if (C.peek() == '.')
+ return lexFloatingPointLiteral(Range, C, Token);
+ StringRef StrVal = Range.upto(C);
+ Token.reset(MIToken::IntegerLiteral, StrVal).setIntegerValue(APSInt(StrVal));
+ return C;
+}
+
+static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) {
+ return StringSwitch<MIToken::TokenKind>(Identifier)
+ .Case("!tbaa", MIToken::md_tbaa)
+ .Case("!alias.scope", MIToken::md_alias_scope)
+ .Case("!noalias", MIToken::md_noalias)
+ .Case("!range", MIToken::md_range)
+ .Case("!DIExpression", MIToken::md_diexpr)
+ .Case("!DILocation", MIToken::md_dilocation)
+ .Default(MIToken::Error);
+}
+
+static Cursor maybeLexExclaim(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '!')
+ return std::nullopt;
+ auto Range = C;
+ C.advance(1);
+ if (isdigit(C.peek()) || !isIdentifierChar(C.peek())) {
+ Token.reset(MIToken::exclaim, Range.upto(C));
+ return C;
+ }
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ StringRef StrVal = Range.upto(C);
+ Token.reset(getMetadataKeywordKind(StrVal), StrVal);
+ if (Token.isError())
+ ErrorCallback(Token.location(),
+ "use of unknown metadata keyword '" + StrVal + "'");
+ return C;
+}
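+// For illustration: "!0" lexes as a bare exclaim token (the digits are lexed
+// separately), "!tbaa" lexes as the md_tbaa keyword, and an unknown name such
+// as "!foo" is reported as an error; the examples are arbitrary.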
+
+static MIToken::TokenKind symbolToken(char C) {
+ switch (C) {
+ case ',':
+ return MIToken::comma;
+ case '.':
+ return MIToken::dot;
+ case '=':
+ return MIToken::equal;
+ case ':':
+ return MIToken::colon;
+ case '(':
+ return MIToken::lparen;
+ case ')':
+ return MIToken::rparen;
+ case '{':
+ return MIToken::lbrace;
+ case '}':
+ return MIToken::rbrace;
+ case '+':
+ return MIToken::plus;
+ case '-':
+ return MIToken::minus;
+ case '<':
+ return MIToken::less;
+ case '>':
+ return MIToken::greater;
+ default:
+ return MIToken::Error;
+ }
+}
+
+static Cursor maybeLexSymbol(Cursor C, MIToken &Token) {
+ MIToken::TokenKind Kind;
+ unsigned Length = 1;
+ if (C.peek() == ':' && C.peek(1) == ':') {
+ Kind = MIToken::coloncolon;
+ Length = 2;
+ } else
+ Kind = symbolToken(C.peek());
+ if (Kind == MIToken::Error)
+ return std::nullopt;
+ auto Range = C;
+ C.advance(Length);
+ Token.reset(Kind, Range.upto(C));
+ return C;
+}
+
+static Cursor maybeLexNewline(Cursor C, MIToken &Token) {
+ if (!isNewlineChar(C.peek()))
+ return std::nullopt;
+ auto Range = C;
+ C.advance();
+ Token.reset(MIToken::Newline, Range.upto(C));
+ return C;
+}
+
+static Cursor maybeLexEscapedIRValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '`')
+ return std::nullopt;
+ auto Range = C;
+ C.advance();
+ auto StrRange = C;
+ while (C.peek() != '`') {
+ if (C.isEOF() || isNewlineChar(C.peek())) {
+ ErrorCallback(
+ C.location(),
+ "end of machine instruction reached before the closing '`'");
+ Token.reset(MIToken::Error, Range.remaining());
+ return C;
+ }
+ C.advance();
+ }
+ StringRef Value = StrRange.upto(C);
+ C.advance();
+ Token.reset(MIToken::QuotedIRValue, Range.upto(C)).setStringValue(Value);
+ return C;
+}
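+// For illustration, a backtick-quoted operand such as `i64 0` (an arbitrary
+// example) lexes as a QuotedIRValue whose string value is the text between
+// the backticks.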
+
+StringRef llvm::lexMIToken(StringRef Source, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ auto C = skipComment(skipWhitespace(Cursor(Source)));
+ if (C.isEOF()) {
+ Token.reset(MIToken::Eof, C.remaining());
+ return C.remaining();
+ }
+
+ C = skipMachineOperandComment(C);
+
+ if (Cursor R = maybeLexMachineBasicBlock(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexIdentifier(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexJumpTableIndex(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexStackObject(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexFixedStackObject(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexConstantPoolItem(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexSubRegisterIndex(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexIRBlock(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexIRValue(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexRegister(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexGlobalValue(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexExternalSymbol(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexMCSymbol(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexHexadecimalLiteral(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexNumericalLiteral(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexExclaim(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexSymbol(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexNewline(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexEscapedIRValue(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexStringConstant(C, Token, ErrorCallback))
+ return R.remaining();
+
+ Token.reset(MIToken::Error, C.remaining());
+ ErrorCallback(C.location(),
+ Twine("unexpected character '") + Twine(C.peek()) + "'");
+ return C.remaining();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MILexer.h b/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MILexer.h
new file mode 100644
index 000000000000..7149c29d6ba7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -0,0 +1,253 @@
+//===- MILexer.h - Lexer for machine instructions ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function that lexes the machine instruction source
+// string.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
+#define LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
+
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/StringRef.h"
+#include <string>
+
+namespace llvm {
+
+class Twine;
+
+/// A token produced by the machine instruction lexer.
+struct MIToken {
+ enum TokenKind {
+ // Markers
+ Eof,
+ Error,
+ Newline,
+
+ // Tokens with no info.
+ comma,
+ equal,
+ underscore,
+ colon,
+ coloncolon,
+ dot,
+ exclaim,
+ lparen,
+ rparen,
+ lbrace,
+ rbrace,
+ plus,
+ minus,
+ less,
+ greater,
+
+ // Keywords
+ kw_implicit,
+ kw_implicit_define,
+ kw_def,
+ kw_dead,
+ kw_dereferenceable,
+ kw_killed,
+ kw_undef,
+ kw_internal,
+ kw_early_clobber,
+ kw_debug_use,
+ kw_renamable,
+ kw_tied_def,
+ kw_frame_setup,
+ kw_frame_destroy,
+ kw_nnan,
+ kw_ninf,
+ kw_nsz,
+ kw_arcp,
+ kw_contract,
+ kw_afn,
+ kw_reassoc,
+ kw_nuw,
+ kw_nsw,
+ kw_exact,
+ kw_nofpexcept,
+ kw_unpredictable,
+ kw_debug_location,
+ kw_debug_instr_number,
+ kw_dbg_instr_ref,
+ kw_cfi_same_value,
+ kw_cfi_offset,
+ kw_cfi_rel_offset,
+ kw_cfi_def_cfa_register,
+ kw_cfi_def_cfa_offset,
+ kw_cfi_adjust_cfa_offset,
+ kw_cfi_escape,
+ kw_cfi_def_cfa,
+ kw_cfi_llvm_def_aspace_cfa,
+ kw_cfi_register,
+ kw_cfi_remember_state,
+ kw_cfi_restore,
+ kw_cfi_restore_state,
+ kw_cfi_undefined,
+ kw_cfi_window_save,
+ kw_cfi_aarch64_negate_ra_sign_state,
+ kw_blockaddress,
+ kw_intrinsic,
+ kw_target_index,
+ kw_half,
+ kw_float,
+ kw_double,
+ kw_x86_fp80,
+ kw_fp128,
+ kw_ppc_fp128,
+ kw_target_flags,
+ kw_volatile,
+ kw_non_temporal,
+ kw_invariant,
+ kw_align,
+ kw_basealign,
+ kw_addrspace,
+ kw_stack,
+ kw_got,
+ kw_jump_table,
+ kw_constant_pool,
+ kw_call_entry,
+ kw_custom,
+ kw_liveout,
+ kw_landing_pad,
+ kw_inlineasm_br_indirect_target,
+ kw_ehfunclet_entry,
+ kw_liveins,
+ kw_successors,
+ kw_floatpred,
+ kw_intpred,
+ kw_shufflemask,
+ kw_pre_instr_symbol,
+ kw_post_instr_symbol,
+ kw_heap_alloc_marker,
+ kw_pcsections,
+ kw_cfi_type,
+ kw_bbsections,
+ kw_bb_id,
+ kw_unknown_size,
+ kw_unknown_address,
+ kw_ir_block_address_taken,
+ kw_machine_block_address_taken,
+
+ // Metadata types.
+ kw_distinct,
+
+ // Named metadata keywords
+ md_tbaa,
+ md_alias_scope,
+ md_noalias,
+ md_range,
+ md_diexpr,
+ md_dilocation,
+
+ // Identifier tokens
+ Identifier,
+ NamedRegister,
+ NamedVirtualRegister,
+ MachineBasicBlockLabel,
+ MachineBasicBlock,
+ StackObject,
+ FixedStackObject,
+ NamedGlobalValue,
+ GlobalValue,
+ ExternalSymbol,
+ MCSymbol,
+
+ // Other tokens
+ IntegerLiteral,
+ FloatingPointLiteral,
+ HexLiteral,
+ VectorLiteral,
+ VirtualRegister,
+ ConstantPoolItem,
+ JumpTableIndex,
+ NamedIRBlock,
+ IRBlock,
+ NamedIRValue,
+ IRValue,
+ QuotedIRValue, // `<constant value>`
+ SubRegisterIndex,
+ StringConstant
+ };
+
+private:
+ TokenKind Kind = Error;
+ StringRef Range;
+ StringRef StringValue;
+ std::string StringValueStorage;
+ APSInt IntVal;
+
+public:
+ MIToken() = default;
+
+ MIToken &reset(TokenKind Kind, StringRef Range);
+
+ MIToken &setStringValue(StringRef StrVal);
+ MIToken &setOwnedStringValue(std::string StrVal);
+ MIToken &setIntegerValue(APSInt IntVal);
+
+ TokenKind kind() const { return Kind; }
+
+ bool isError() const { return Kind == Error; }
+
+ bool isNewlineOrEOF() const { return Kind == Newline || Kind == Eof; }
+
+ bool isErrorOrEOF() const { return Kind == Error || Kind == Eof; }
+
+ bool isRegister() const {
+ return Kind == NamedRegister || Kind == underscore ||
+ Kind == NamedVirtualRegister || Kind == VirtualRegister;
+ }
+
+ bool isRegisterFlag() const {
+ return Kind == kw_implicit || Kind == kw_implicit_define ||
+ Kind == kw_def || Kind == kw_dead || Kind == kw_killed ||
+ Kind == kw_undef || Kind == kw_internal ||
+ Kind == kw_early_clobber || Kind == kw_debug_use ||
+ Kind == kw_renamable;
+ }
+
+ bool isMemoryOperandFlag() const {
+ return Kind == kw_volatile || Kind == kw_non_temporal ||
+ Kind == kw_dereferenceable || Kind == kw_invariant ||
+ Kind == StringConstant;
+ }
+
+ bool is(TokenKind K) const { return Kind == K; }
+
+ bool isNot(TokenKind K) const { return Kind != K; }
+
+ StringRef::iterator location() const { return Range.begin(); }
+
+ StringRef range() const { return Range; }
+
+ /// Return the token's string value.
+ StringRef stringValue() const { return StringValue; }
+
+ const APSInt &integerValue() const { return IntVal; }
+
+ bool hasIntegerValue() const {
+ return Kind == IntegerLiteral || Kind == MachineBasicBlock ||
+ Kind == MachineBasicBlockLabel || Kind == StackObject ||
+ Kind == FixedStackObject || Kind == GlobalValue ||
+ Kind == VirtualRegister || Kind == ConstantPoolItem ||
+ Kind == JumpTableIndex || Kind == IRBlock || Kind == IRValue;
+ }
+};
+
+/// Consume a single machine instruction token in the given source and return
+/// the remaining source string.
+StringRef lexMIToken(
+ StringRef Source, MIToken &Token,
+ function_ref<void(StringRef::iterator, const Twine &)> ErrorCallback);
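+
+// A minimal usage sketch (an illustration, not a prescribed pattern): callers
+// typically feed the returned remainder back in until Eof or Error is hit:
+//
+//   MIToken Tok;
+//   StringRef Rest = Source; // 'Source' stands for any MI string being read
+//   do {
+//     Rest = lexMIToken(Rest, Tok,
+//                       [](StringRef::iterator Loc, const Twine &Msg) {
+//                         // error handling is left to the caller
+//                       });
+//   } while (!Tok.isErrorOrEOF());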
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MIParser.cpp
new file mode 100644
index 000000000000..bfd9286ff59c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -0,0 +1,3620 @@
+//===- MIParser.cpp - Machine instructions parser implementation ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the parsing of machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "MILexer.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/AsmParser/SlotMapping.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MIRFormatter.h"
+#include "llvm/CodeGen/MIRPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBank.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+void PerTargetMIParsingState::setTarget(
+ const TargetSubtargetInfo &NewSubtarget) {
+
+  // If the subtarget changed, conservatively assume everything is invalid.
+ if (&Subtarget == &NewSubtarget)
+ return;
+
+ Names2InstrOpCodes.clear();
+ Names2Regs.clear();
+ Names2RegMasks.clear();
+ Names2SubRegIndices.clear();
+ Names2TargetIndices.clear();
+ Names2DirectTargetFlags.clear();
+ Names2BitmaskTargetFlags.clear();
+ Names2MMOTargetFlags.clear();
+
+ initNames2RegClasses();
+ initNames2RegBanks();
+}
+
+void PerTargetMIParsingState::initNames2Regs() {
+ if (!Names2Regs.empty())
+ return;
+
+  // The '%noreg' register is register 0.
+ Names2Regs.insert(std::make_pair("noreg", 0));
+ const auto *TRI = Subtarget.getRegisterInfo();
+ assert(TRI && "Expected target register info");
+
+ for (unsigned I = 0, E = TRI->getNumRegs(); I < E; ++I) {
+ bool WasInserted =
+ Names2Regs.insert(std::make_pair(StringRef(TRI->getName(I)).lower(), I))
+ .second;
+ (void)WasInserted;
+ assert(WasInserted && "Expected registers to be unique case-insensitively");
+ }
+}
+
+bool PerTargetMIParsingState::getRegisterByName(StringRef RegName,
+ Register &Reg) {
+ initNames2Regs();
+ auto RegInfo = Names2Regs.find(RegName);
+ if (RegInfo == Names2Regs.end())
+ return true;
+ Reg = RegInfo->getValue();
+ return false;
+}
+
+void PerTargetMIParsingState::initNames2InstrOpCodes() {
+ if (!Names2InstrOpCodes.empty())
+ return;
+ const auto *TII = Subtarget.getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ for (unsigned I = 0, E = TII->getNumOpcodes(); I < E; ++I)
+ Names2InstrOpCodes.insert(std::make_pair(StringRef(TII->getName(I)), I));
+}
+
+bool PerTargetMIParsingState::parseInstrName(StringRef InstrName,
+ unsigned &OpCode) {
+ initNames2InstrOpCodes();
+ auto InstrInfo = Names2InstrOpCodes.find(InstrName);
+ if (InstrInfo == Names2InstrOpCodes.end())
+ return true;
+ OpCode = InstrInfo->getValue();
+ return false;
+}
+
+void PerTargetMIParsingState::initNames2RegMasks() {
+ if (!Names2RegMasks.empty())
+ return;
+ const auto *TRI = Subtarget.getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ ArrayRef<const uint32_t *> RegMasks = TRI->getRegMasks();
+ ArrayRef<const char *> RegMaskNames = TRI->getRegMaskNames();
+ assert(RegMasks.size() == RegMaskNames.size());
+ for (size_t I = 0, E = RegMasks.size(); I < E; ++I)
+ Names2RegMasks.insert(
+ std::make_pair(StringRef(RegMaskNames[I]).lower(), RegMasks[I]));
+}
+
+const uint32_t *PerTargetMIParsingState::getRegMask(StringRef Identifier) {
+ initNames2RegMasks();
+ auto RegMaskInfo = Names2RegMasks.find(Identifier);
+ if (RegMaskInfo == Names2RegMasks.end())
+ return nullptr;
+ return RegMaskInfo->getValue();
+}
+
+void PerTargetMIParsingState::initNames2SubRegIndices() {
+ if (!Names2SubRegIndices.empty())
+ return;
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ for (unsigned I = 1, E = TRI->getNumSubRegIndices(); I < E; ++I)
+ Names2SubRegIndices.insert(
+ std::make_pair(TRI->getSubRegIndexName(I), I));
+}
+
+unsigned PerTargetMIParsingState::getSubRegIndex(StringRef Name) {
+ initNames2SubRegIndices();
+ auto SubRegInfo = Names2SubRegIndices.find(Name);
+ if (SubRegInfo == Names2SubRegIndices.end())
+ return 0;
+ return SubRegInfo->getValue();
+}
+
+void PerTargetMIParsingState::initNames2TargetIndices() {
+ if (!Names2TargetIndices.empty())
+ return;
+ const auto *TII = Subtarget.getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ auto Indices = TII->getSerializableTargetIndices();
+ for (const auto &I : Indices)
+ Names2TargetIndices.insert(std::make_pair(StringRef(I.second), I.first));
+}
+
+bool PerTargetMIParsingState::getTargetIndex(StringRef Name, int &Index) {
+ initNames2TargetIndices();
+ auto IndexInfo = Names2TargetIndices.find(Name);
+ if (IndexInfo == Names2TargetIndices.end())
+ return true;
+ Index = IndexInfo->second;
+ return false;
+}
+
+void PerTargetMIParsingState::initNames2DirectTargetFlags() {
+ if (!Names2DirectTargetFlags.empty())
+ return;
+
+ const auto *TII = Subtarget.getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ auto Flags = TII->getSerializableDirectMachineOperandTargetFlags();
+ for (const auto &I : Flags)
+ Names2DirectTargetFlags.insert(
+ std::make_pair(StringRef(I.second), I.first));
+}
+
+bool PerTargetMIParsingState::getDirectTargetFlag(StringRef Name,
+ unsigned &Flag) {
+ initNames2DirectTargetFlags();
+ auto FlagInfo = Names2DirectTargetFlags.find(Name);
+ if (FlagInfo == Names2DirectTargetFlags.end())
+ return true;
+ Flag = FlagInfo->second;
+ return false;
+}
+
+void PerTargetMIParsingState::initNames2BitmaskTargetFlags() {
+ if (!Names2BitmaskTargetFlags.empty())
+ return;
+
+ const auto *TII = Subtarget.getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ auto Flags = TII->getSerializableBitmaskMachineOperandTargetFlags();
+ for (const auto &I : Flags)
+ Names2BitmaskTargetFlags.insert(
+ std::make_pair(StringRef(I.second), I.first));
+}
+
+bool PerTargetMIParsingState::getBitmaskTargetFlag(StringRef Name,
+ unsigned &Flag) {
+ initNames2BitmaskTargetFlags();
+ auto FlagInfo = Names2BitmaskTargetFlags.find(Name);
+ if (FlagInfo == Names2BitmaskTargetFlags.end())
+ return true;
+ Flag = FlagInfo->second;
+ return false;
+}
+
+void PerTargetMIParsingState::initNames2MMOTargetFlags() {
+ if (!Names2MMOTargetFlags.empty())
+ return;
+
+ const auto *TII = Subtarget.getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ auto Flags = TII->getSerializableMachineMemOperandTargetFlags();
+ for (const auto &I : Flags)
+ Names2MMOTargetFlags.insert(std::make_pair(StringRef(I.second), I.first));
+}
+
+bool PerTargetMIParsingState::getMMOTargetFlag(StringRef Name,
+ MachineMemOperand::Flags &Flag) {
+ initNames2MMOTargetFlags();
+ auto FlagInfo = Names2MMOTargetFlags.find(Name);
+ if (FlagInfo == Names2MMOTargetFlags.end())
+ return true;
+ Flag = FlagInfo->second;
+ return false;
+}
+
+void PerTargetMIParsingState::initNames2RegClasses() {
+ if (!Names2RegClasses.empty())
+ return;
+
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; ++I) {
+ const auto *RC = TRI->getRegClass(I);
+ Names2RegClasses.insert(
+ std::make_pair(StringRef(TRI->getRegClassName(RC)).lower(), RC));
+ }
+}
+
+void PerTargetMIParsingState::initNames2RegBanks() {
+ if (!Names2RegBanks.empty())
+ return;
+
+ const RegisterBankInfo *RBI = Subtarget.getRegBankInfo();
+ // If the target does not support GlobalISel, we may not have a
+ // register bank info.
+ if (!RBI)
+ return;
+
+ for (unsigned I = 0, E = RBI->getNumRegBanks(); I < E; ++I) {
+ const auto &RegBank = RBI->getRegBank(I);
+ Names2RegBanks.insert(
+ std::make_pair(StringRef(RegBank.getName()).lower(), &RegBank));
+ }
+}
+
+const TargetRegisterClass *
+PerTargetMIParsingState::getRegClass(StringRef Name) {
+ auto RegClassInfo = Names2RegClasses.find(Name);
+ if (RegClassInfo == Names2RegClasses.end())
+ return nullptr;
+ return RegClassInfo->getValue();
+}
+
+const RegisterBank *PerTargetMIParsingState::getRegBank(StringRef Name) {
+ auto RegBankInfo = Names2RegBanks.find(Name);
+ if (RegBankInfo == Names2RegBanks.end())
+ return nullptr;
+ return RegBankInfo->getValue();
+}
+
+PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF,
+ SourceMgr &SM, const SlotMapping &IRSlots, PerTargetMIParsingState &T)
+ : MF(MF), SM(&SM), IRSlots(IRSlots), Target(T) {
+}
+
+VRegInfo &PerFunctionMIParsingState::getVRegInfo(Register Num) {
+ auto I = VRegInfos.insert(std::make_pair(Num, nullptr));
+ if (I.second) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ VRegInfo *Info = new (Allocator) VRegInfo;
+ Info->VReg = MRI.createIncompleteVirtualRegister();
+ I.first->second = Info;
+ }
+ return *I.first->second;
+}
+
+VRegInfo &PerFunctionMIParsingState::getVRegInfoNamed(StringRef RegName) {
+ assert(RegName != "" && "Expected named reg.");
+
+ auto I = VRegInfosNamed.insert(std::make_pair(RegName.str(), nullptr));
+ if (I.second) {
+ VRegInfo *Info = new (Allocator) VRegInfo;
+ Info->VReg = MF.getRegInfo().createIncompleteVirtualRegister(RegName);
+ I.first->second = Info;
+ }
+ return *I.first->second;
+}
+
+static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST,
+ DenseMap<unsigned, const Value *> &Slots2Values) {
+ int Slot = MST.getLocalSlot(V);
+ if (Slot == -1)
+ return;
+ Slots2Values.insert(std::make_pair(unsigned(Slot), V));
+}
+
+/// Creates the mapping from slot numbers to the function's unnamed IR values.
+static void initSlots2Values(const Function &F,
+ DenseMap<unsigned, const Value *> &Slots2Values) {
+ ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false);
+ MST.incorporateFunction(F);
+ for (const auto &Arg : F.args())
+ mapValueToSlot(&Arg, MST, Slots2Values);
+ for (const auto &BB : F) {
+ mapValueToSlot(&BB, MST, Slots2Values);
+ for (const auto &I : BB)
+ mapValueToSlot(&I, MST, Slots2Values);
+ }
+}
+
+const Value* PerFunctionMIParsingState::getIRValue(unsigned Slot) {
+ if (Slots2Values.empty())
+ initSlots2Values(MF.getFunction(), Slots2Values);
+ return Slots2Values.lookup(Slot);
+}
+
+namespace {
+
+/// A wrapper struct around the 'MachineOperand' struct that includes a source
+/// range and other attributes.
+struct ParsedMachineOperand {
+ MachineOperand Operand;
+ StringRef::iterator Begin;
+ StringRef::iterator End;
+ std::optional<unsigned> TiedDefIdx;
+
+ ParsedMachineOperand(const MachineOperand &Operand, StringRef::iterator Begin,
+ StringRef::iterator End,
+ std::optional<unsigned> &TiedDefIdx)
+ : Operand(Operand), Begin(Begin), End(End), TiedDefIdx(TiedDefIdx) {
+ if (TiedDefIdx)
+ assert(Operand.isReg() && Operand.isUse() &&
+ "Only used register operands can be tied");
+ }
+};
+
+class MIParser {
+ MachineFunction &MF;
+ SMDiagnostic &Error;
+ StringRef Source, CurrentSource;
+ SMRange SourceRange;
+ MIToken Token;
+ PerFunctionMIParsingState &PFS;
+  /// Maps from slot numbers to the function's unnamed basic blocks.
+ DenseMap<unsigned, const BasicBlock *> Slots2BasicBlocks;
+
+public:
+ MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
+ StringRef Source);
+ MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
+ StringRef Source, SMRange SourceRange);
+
+ /// \p SkipChar gives the number of characters to skip before looking
+ /// for the next token.
+ void lex(unsigned SkipChar = 0);
+
+ /// Report an error at the current location with the given message.
+ ///
+  /// This function always returns true.
+ bool error(const Twine &Msg);
+
+ /// Report an error at the given location with the given message.
+ ///
+  /// This function always returns true.
+ bool error(StringRef::iterator Loc, const Twine &Msg);
+
+ bool
+ parseBasicBlockDefinitions(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots);
+ bool parseBasicBlocks();
+ bool parse(MachineInstr *&MI);
+ bool parseStandaloneMBB(MachineBasicBlock *&MBB);
+ bool parseStandaloneNamedRegister(Register &Reg);
+ bool parseStandaloneVirtualRegister(VRegInfo *&Info);
+ bool parseStandaloneRegister(Register &Reg);
+ bool parseStandaloneStackObject(int &FI);
+ bool parseStandaloneMDNode(MDNode *&Node);
+ bool parseMachineMetadata();
+ bool parseMDTuple(MDNode *&MD, bool IsDistinct);
+ bool parseMDNodeVector(SmallVectorImpl<Metadata *> &Elts);
+ bool parseMetadata(Metadata *&MD);
+
+ bool
+ parseBasicBlockDefinition(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots);
+ bool parseBasicBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock *&AddFalthroughFrom);
+ bool parseBasicBlockLiveins(MachineBasicBlock &MBB);
+ bool parseBasicBlockSuccessors(MachineBasicBlock &MBB);
+
+ bool parseNamedRegister(Register &Reg);
+ bool parseVirtualRegister(VRegInfo *&Info);
+ bool parseNamedVirtualRegister(VRegInfo *&Info);
+ bool parseRegister(Register &Reg, VRegInfo *&VRegInfo);
+ bool parseRegisterFlag(unsigned &Flags);
+ bool parseRegisterClassOrBank(VRegInfo &RegInfo);
+ bool parseSubRegisterIndex(unsigned &SubReg);
+ bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx);
+ bool parseRegisterOperand(MachineOperand &Dest,
+ std::optional<unsigned> &TiedDefIdx,
+ bool IsDef = false);
+ bool parseImmediateOperand(MachineOperand &Dest);
+ bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue,
+ const Constant *&C);
+ bool parseIRConstant(StringRef::iterator Loc, const Constant *&C);
+ bool parseLowLevelType(StringRef::iterator Loc, LLT &Ty);
+ bool parseTypedImmediateOperand(MachineOperand &Dest);
+ bool parseFPImmediateOperand(MachineOperand &Dest);
+ bool parseMBBReference(MachineBasicBlock *&MBB);
+ bool parseMBBOperand(MachineOperand &Dest);
+ bool parseStackFrameIndex(int &FI);
+ bool parseStackObjectOperand(MachineOperand &Dest);
+ bool parseFixedStackFrameIndex(int &FI);
+ bool parseFixedStackObjectOperand(MachineOperand &Dest);
+ bool parseGlobalValue(GlobalValue *&GV);
+ bool parseGlobalAddressOperand(MachineOperand &Dest);
+ bool parseConstantPoolIndexOperand(MachineOperand &Dest);
+ bool parseSubRegisterIndexOperand(MachineOperand &Dest);
+ bool parseJumpTableIndexOperand(MachineOperand &Dest);
+ bool parseExternalSymbolOperand(MachineOperand &Dest);
+ bool parseMCSymbolOperand(MachineOperand &Dest);
+ [[nodiscard]] bool parseMDNode(MDNode *&Node);
+ bool parseDIExpression(MDNode *&Expr);
+ bool parseDILocation(MDNode *&Expr);
+ bool parseMetadataOperand(MachineOperand &Dest);
+ bool parseCFIOffset(int &Offset);
+ bool parseCFIRegister(Register &Reg);
+ bool parseCFIAddressSpace(unsigned &AddressSpace);
+ bool parseCFIEscapeValues(std::string& Values);
+ bool parseCFIOperand(MachineOperand &Dest);
+ bool parseIRBlock(BasicBlock *&BB, const Function &F);
+ bool parseBlockAddressOperand(MachineOperand &Dest);
+ bool parseIntrinsicOperand(MachineOperand &Dest);
+ bool parsePredicateOperand(MachineOperand &Dest);
+ bool parseShuffleMaskOperand(MachineOperand &Dest);
+ bool parseTargetIndexOperand(MachineOperand &Dest);
+ bool parseDbgInstrRefOperand(MachineOperand &Dest);
+ bool parseCustomRegisterMaskOperand(MachineOperand &Dest);
+ bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest);
+ bool parseMachineOperand(const unsigned OpCode, const unsigned OpIdx,
+ MachineOperand &Dest,
+ std::optional<unsigned> &TiedDefIdx);
+ bool parseMachineOperandAndTargetFlags(const unsigned OpCode,
+ const unsigned OpIdx,
+ MachineOperand &Dest,
+ std::optional<unsigned> &TiedDefIdx);
+ bool parseOffset(int64_t &Offset);
+ bool parseIRBlockAddressTaken(BasicBlock *&BB);
+ bool parseAlignment(uint64_t &Alignment);
+ bool parseAddrspace(unsigned &Addrspace);
+ bool parseSectionID(std::optional<MBBSectionID> &SID);
+ bool parseBBID(std::optional<unsigned> &BBID);
+ bool parseOperandsOffset(MachineOperand &Op);
+ bool parseIRValue(const Value *&V);
+ bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
+ bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV);
+ bool parseMachinePointerInfo(MachinePointerInfo &Dest);
+ bool parseOptionalScope(LLVMContext &Context, SyncScope::ID &SSID);
+ bool parseOptionalAtomicOrdering(AtomicOrdering &Order);
+ bool parseMachineMemoryOperand(MachineMemOperand *&Dest);
+ bool parsePreOrPostInstrSymbol(MCSymbol *&Symbol);
+ bool parseHeapAllocMarker(MDNode *&Node);
+ bool parsePCSections(MDNode *&Node);
+
+ bool parseTargetImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
+ MachineOperand &Dest, const MIRFormatter &MF);
+
+private:
+ /// Convert the integer literal in the current token into an unsigned integer.
+ ///
+ /// Return true if an error occurred.
+ bool getUnsigned(unsigned &Result);
+
+  /// Convert the integer literal in the current token into a uint64.
+ ///
+ /// Return true if an error occurred.
+ bool getUint64(uint64_t &Result);
+
+ /// Convert the hexadecimal literal in the current token into an unsigned
+ /// APInt with a minimum bitwidth required to represent the value.
+ ///
+ /// Return true if the literal does not represent an integer value.
+ bool getHexUint(APInt &Result);
+
+ /// If the current token is of the given kind, consume it and return false.
+ /// Otherwise report an error and return true.
+ bool expectAndConsume(MIToken::TokenKind TokenKind);
+
+ /// If the current token is of the given kind, consume it and return true.
+ /// Otherwise return false.
+ bool consumeIfPresent(MIToken::TokenKind TokenKind);
+
+ bool parseInstruction(unsigned &OpCode, unsigned &Flags);
+
+ bool assignRegisterTies(MachineInstr &MI,
+ ArrayRef<ParsedMachineOperand> Operands);
+
+ bool verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands,
+ const MCInstrDesc &MCID);
+
+ const BasicBlock *getIRBlock(unsigned Slot);
+ const BasicBlock *getIRBlock(unsigned Slot, const Function &F);
+
+ /// Get or create an MCSymbol for a given name.
+ MCSymbol *getOrCreateMCSymbol(StringRef Name);
+
+ /// parseStringConstant
+ /// ::= StringConstant
+ bool parseStringConstant(std::string &Result);
+
+ /// Map the location in the MI string to the corresponding location specified
+ /// in `SourceRange`.
+ SMLoc mapSMLoc(StringRef::iterator Loc);
+};
+
+} // end anonymous namespace
+
+MIParser::MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
+ StringRef Source)
+ : MF(PFS.MF), Error(Error), Source(Source), CurrentSource(Source), PFS(PFS)
+{}
+
+MIParser::MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
+ StringRef Source, SMRange SourceRange)
+ : MF(PFS.MF), Error(Error), Source(Source), CurrentSource(Source),
+ SourceRange(SourceRange), PFS(PFS) {}
+
+void MIParser::lex(unsigned SkipChar) {
+ CurrentSource = lexMIToken(
+ CurrentSource.slice(SkipChar, StringRef::npos), Token,
+ [this](StringRef::iterator Loc, const Twine &Msg) { error(Loc, Msg); });
+}
+
+bool MIParser::error(const Twine &Msg) { return error(Token.location(), Msg); }
+
+bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) {
+ const SourceMgr &SM = *PFS.SM;
+ assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size()));
+ const MemoryBuffer &Buffer = *SM.getMemoryBuffer(SM.getMainFileID());
+ if (Loc >= Buffer.getBufferStart() && Loc <= Buffer.getBufferEnd()) {
+ // Create an ordinary diagnostic when the source manager's buffer is the
+ // source string.
+ Error = SM.GetMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Error, Msg);
+ return true;
+ }
+ // Create a diagnostic for a YAML string literal.
+ Error = SMDiagnostic(SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
+ Loc - Source.data(), SourceMgr::DK_Error, Msg.str(),
+ Source, std::nullopt, std::nullopt);
+ return true;
+}
+
+SMLoc MIParser::mapSMLoc(StringRef::iterator Loc) {
+ assert(SourceRange.isValid() && "Invalid source range");
+ assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size()));
+ return SMLoc::getFromPointer(SourceRange.Start.getPointer() +
+ (Loc - Source.data()));
+}
+
+typedef function_ref<bool(StringRef::iterator Loc, const Twine &)>
+ ErrorCallbackType;
+
+static const char *toString(MIToken::TokenKind TokenKind) {
+ switch (TokenKind) {
+ case MIToken::comma:
+ return "','";
+ case MIToken::equal:
+ return "'='";
+ case MIToken::colon:
+ return "':'";
+ case MIToken::lparen:
+ return "'('";
+ case MIToken::rparen:
+ return "')'";
+ default:
+ return "<unknown token>";
+ }
+}
+
+bool MIParser::expectAndConsume(MIToken::TokenKind TokenKind) {
+ if (Token.isNot(TokenKind))
+ return error(Twine("expected ") + toString(TokenKind));
+ lex();
+ return false;
+}
+
+bool MIParser::consumeIfPresent(MIToken::TokenKind TokenKind) {
+ if (Token.isNot(TokenKind))
+ return false;
+ lex();
+ return true;
+}
+
+// Parse Machine Basic Block Section ID.
+bool MIParser::parseSectionID(std::optional<MBBSectionID> &SID) {
+ assert(Token.is(MIToken::kw_bbsections));
+ lex();
+ if (Token.is(MIToken::IntegerLiteral)) {
+ unsigned Value = 0;
+ if (getUnsigned(Value))
+ return error("Unknown Section ID");
+ SID = MBBSectionID{Value};
+ } else {
+ const StringRef &S = Token.stringValue();
+ if (S == "Exception")
+ SID = MBBSectionID::ExceptionSectionID;
+ else if (S == "Cold")
+ SID = MBBSectionID::ColdSectionID;
+ else
+ return error("Unknown Section ID");
+ }
+ lex();
+ return false;
+}
+
+// Parse Machine Basic Block ID.
+bool MIParser::parseBBID(std::optional<unsigned> &BBID) {
+ assert(Token.is(MIToken::kw_bb_id));
+ lex();
+ unsigned Value = 0;
+ if (getUnsigned(Value))
+ return error("Unknown BB ID");
+ BBID = Value;
+ lex();
+ return false;
+}
+
+bool MIParser::parseBasicBlockDefinition(
+ DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) {
+ assert(Token.is(MIToken::MachineBasicBlockLabel));
+ unsigned ID = 0;
+ if (getUnsigned(ID))
+ return true;
+ auto Loc = Token.location();
+ auto Name = Token.stringValue();
+ lex();
+ bool MachineBlockAddressTaken = false;
+ BasicBlock *AddressTakenIRBlock = nullptr;
+ bool IsLandingPad = false;
+ bool IsInlineAsmBrIndirectTarget = false;
+ bool IsEHFuncletEntry = false;
+ std::optional<MBBSectionID> SectionID;
+ uint64_t Alignment = 0;
+ std::optional<unsigned> BBID;
+ BasicBlock *BB = nullptr;
+ if (consumeIfPresent(MIToken::lparen)) {
+ do {
+      // TODO: Report an error when duplicate attributes are specified.
+ switch (Token.kind()) {
+ case MIToken::kw_machine_block_address_taken:
+ MachineBlockAddressTaken = true;
+ lex();
+ break;
+ case MIToken::kw_ir_block_address_taken:
+ if (parseIRBlockAddressTaken(AddressTakenIRBlock))
+ return true;
+ break;
+ case MIToken::kw_landing_pad:
+ IsLandingPad = true;
+ lex();
+ break;
+ case MIToken::kw_inlineasm_br_indirect_target:
+ IsInlineAsmBrIndirectTarget = true;
+ lex();
+ break;
+ case MIToken::kw_ehfunclet_entry:
+ IsEHFuncletEntry = true;
+ lex();
+ break;
+ case MIToken::kw_align:
+ if (parseAlignment(Alignment))
+ return true;
+ break;
+ case MIToken::IRBlock:
+ case MIToken::NamedIRBlock:
+      // TODO: Report an error when both a name and an IR block are given.
+ if (parseIRBlock(BB, MF.getFunction()))
+ return true;
+ lex();
+ break;
+ case MIToken::kw_bbsections:
+ if (parseSectionID(SectionID))
+ return true;
+ break;
+ case MIToken::kw_bb_id:
+ if (parseBBID(BBID))
+ return true;
+ break;
+ default:
+ break;
+ }
+ } while (consumeIfPresent(MIToken::comma));
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ }
+ if (expectAndConsume(MIToken::colon))
+ return true;
+
+ if (!Name.empty()) {
+ BB = dyn_cast_or_null<BasicBlock>(
+ MF.getFunction().getValueSymbolTable()->lookup(Name));
+ if (!BB)
+ return error(Loc, Twine("basic block '") + Name +
+ "' is not defined in the function '" +
+ MF.getName() + "'");
+ }
+ auto *MBB = MF.CreateMachineBasicBlock(BB);
+ MF.insert(MF.end(), MBB);
+ bool WasInserted = MBBSlots.insert(std::make_pair(ID, MBB)).second;
+ if (!WasInserted)
+ return error(Loc, Twine("redefinition of machine basic block with id #") +
+ Twine(ID));
+ if (Alignment)
+ MBB->setAlignment(Align(Alignment));
+ if (MachineBlockAddressTaken)
+ MBB->setMachineBlockAddressTaken();
+ if (AddressTakenIRBlock)
+ MBB->setAddressTakenIRBlock(AddressTakenIRBlock);
+ MBB->setIsEHPad(IsLandingPad);
+ MBB->setIsInlineAsmBrIndirectTarget(IsInlineAsmBrIndirectTarget);
+ MBB->setIsEHFuncletEntry(IsEHFuncletEntry);
+ if (SectionID) {
+ MBB->setSectionID(*SectionID);
+ MF.setBBSectionsType(BasicBlockSection::List);
+ }
+ if (BBID.has_value()) {
+    // BBSectionsType is set to `List` if any basic block has a `SectionID`.
+ // Here, we set it to `Labels` if it hasn't been set above.
+ if (!MF.hasBBSections())
+ MF.setBBSectionsType(BasicBlockSection::Labels);
+ MBB->setBBID(BBID.value());
+ }
+ return false;
+}
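+// For illustration, parseBasicBlockDefinition accepts definitions such as
+//   bb.1.if.then (landing-pad, align 8):
+// where the optional parenthesized list carries block attributes and
+// "if.then" names the corresponding IR block; the sample label is arbitrary.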
+
+bool MIParser::parseBasicBlockDefinitions(
+ DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) {
+ lex();
+ // Skip until the first machine basic block.
+ while (Token.is(MIToken::Newline))
+ lex();
+ if (Token.isErrorOrEOF())
+ return Token.isError();
+ if (Token.isNot(MIToken::MachineBasicBlockLabel))
+ return error("expected a basic block definition before instructions");
+ unsigned BraceDepth = 0;
+ do {
+ if (parseBasicBlockDefinition(MBBSlots))
+ return true;
+ bool IsAfterNewline = false;
+ // Skip until the next machine basic block.
+ while (true) {
+ if ((Token.is(MIToken::MachineBasicBlockLabel) && IsAfterNewline) ||
+ Token.isErrorOrEOF())
+ break;
+ else if (Token.is(MIToken::MachineBasicBlockLabel))
+ return error("basic block definition should be located at the start of "
+ "the line");
+ else if (consumeIfPresent(MIToken::Newline)) {
+ IsAfterNewline = true;
+ continue;
+ }
+ IsAfterNewline = false;
+ if (Token.is(MIToken::lbrace))
+ ++BraceDepth;
+ if (Token.is(MIToken::rbrace)) {
+ if (!BraceDepth)
+ return error("extraneous closing brace ('}')");
+ --BraceDepth;
+ }
+ lex();
+ }
+ // Verify that we closed all of the '{' at the end of a file or a block.
+ if (!Token.isError() && BraceDepth)
+ return error("expected '}'"); // FIXME: Report a note that shows '{'.
+ } while (!Token.isErrorOrEOF());
+ return Token.isError();
+}
+
+bool MIParser::parseBasicBlockLiveins(MachineBasicBlock &MBB) {
+ assert(Token.is(MIToken::kw_liveins));
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (Token.isNewlineOrEOF()) // Allow an empty list of liveins.
+ return false;
+ do {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ Register Reg;
+ if (parseNamedRegister(Reg))
+ return true;
+ lex();
+ LaneBitmask Mask = LaneBitmask::getAll();
+ if (consumeIfPresent(MIToken::colon)) {
+ // Parse lane mask.
+ if (Token.isNot(MIToken::IntegerLiteral) &&
+ Token.isNot(MIToken::HexLiteral))
+ return error("expected a lane mask");
+ static_assert(sizeof(LaneBitmask::Type) == sizeof(uint64_t),
+ "Use correct get-function for lane mask");
+ LaneBitmask::Type V;
+ if (getUint64(V))
+ return error("invalid lane mask value");
+ Mask = LaneBitmask(V);
+ lex();
+ }
+ MBB.addLiveIn(Reg, Mask);
+ } while (consumeIfPresent(MIToken::comma));
+ return false;
+}
+
+bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) {
+ assert(Token.is(MIToken::kw_successors));
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (Token.isNewlineOrEOF()) // Allow an empty list of successors.
+ return false;
+ do {
+ if (Token.isNot(MIToken::MachineBasicBlock))
+ return error("expected a machine basic block reference");
+ MachineBasicBlock *SuccMBB = nullptr;
+ if (parseMBBReference(SuccMBB))
+ return true;
+ lex();
+ unsigned Weight = 0;
+ if (consumeIfPresent(MIToken::lparen)) {
+ if (Token.isNot(MIToken::IntegerLiteral) &&
+ Token.isNot(MIToken::HexLiteral))
+ return error("expected an integer literal after '('");
+ if (getUnsigned(Weight))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ }
+ MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight));
+ } while (consumeIfPresent(MIToken::comma));
+ MBB.normalizeSuccProbs();
+ return false;
+}
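+// For illustration, parseBasicBlockSuccessors handles lines such as
+//   successors: %bb.2(0x30000000), %bb.3(0x50000000)
+// where the optional parenthesized value is a raw branch probability; the
+// numbers here are arbitrary.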
+
+bool MIParser::parseBasicBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock *&AddFalthroughFrom) {
+ // Skip the definition.
+ assert(Token.is(MIToken::MachineBasicBlockLabel));
+ lex();
+ if (consumeIfPresent(MIToken::lparen)) {
+ while (Token.isNot(MIToken::rparen) && !Token.isErrorOrEOF())
+ lex();
+ consumeIfPresent(MIToken::rparen);
+ }
+ consumeIfPresent(MIToken::colon);
+
+ // Parse the liveins and successors.
+  // N.B.: Multiple lists of successors and liveins are allowed and they're
+ // merged into one.
+ // Example:
+ // liveins: $edi
+ // liveins: $esi
+ //
+ // is equivalent to
+ // liveins: $edi, $esi
+ bool ExplicitSuccessors = false;
+ while (true) {
+ if (Token.is(MIToken::kw_successors)) {
+ if (parseBasicBlockSuccessors(MBB))
+ return true;
+ ExplicitSuccessors = true;
+ } else if (Token.is(MIToken::kw_liveins)) {
+ if (parseBasicBlockLiveins(MBB))
+ return true;
+ } else if (consumeIfPresent(MIToken::Newline)) {
+ continue;
+ } else
+ break;
+ if (!Token.isNewlineOrEOF())
+ return error("expected line break at the end of a list");
+ lex();
+ }
+
+ // Parse the instructions.
+ bool IsInBundle = false;
+ MachineInstr *PrevMI = nullptr;
+ while (!Token.is(MIToken::MachineBasicBlockLabel) &&
+ !Token.is(MIToken::Eof)) {
+ if (consumeIfPresent(MIToken::Newline))
+ continue;
+ if (consumeIfPresent(MIToken::rbrace)) {
+ // The first parsing pass should verify that all closing '}' have an
+ // opening '{'.
+ assert(IsInBundle);
+ IsInBundle = false;
+ continue;
+ }
+ MachineInstr *MI = nullptr;
+ if (parse(MI))
+ return true;
+ MBB.insert(MBB.end(), MI);
+ if (IsInBundle) {
+ PrevMI->setFlag(MachineInstr::BundledSucc);
+ MI->setFlag(MachineInstr::BundledPred);
+ }
+ PrevMI = MI;
+ if (Token.is(MIToken::lbrace)) {
+ if (IsInBundle)
+ return error("nested instruction bundles are not allowed");
+ lex();
+ // This instruction is the start of the bundle.
+ MI->setFlag(MachineInstr::BundledSucc);
+ IsInBundle = true;
+ if (!Token.is(MIToken::Newline))
+ // The next instruction can be on the same line.
+ continue;
+ }
+ assert(Token.isNewlineOrEOF() && "MI is not fully parsed");
+ lex();
+ }
+
+ // Construct successor list by searching for basic block machine operands.
+ if (!ExplicitSuccessors) {
+ SmallVector<MachineBasicBlock*,4> Successors;
+ bool IsFallthrough;
+ guessSuccessors(MBB, Successors, IsFallthrough);
+ for (MachineBasicBlock *Succ : Successors)
+ MBB.addSuccessor(Succ);
+
+ if (IsFallthrough) {
+ AddFalthroughFrom = &MBB;
+ } else {
+ MBB.normalizeSuccProbs();
+ }
+ }
+
+ return false;
+}
+
+bool MIParser::parseBasicBlocks() {
+ lex();
+ // Skip until the first machine basic block.
+ while (Token.is(MIToken::Newline))
+ lex();
+ if (Token.isErrorOrEOF())
+ return Token.isError();
+  // The first parsing pass should have verified that this token is an MBB
+  // label in the 'parseBasicBlockDefinitions' method.
+ assert(Token.is(MIToken::MachineBasicBlockLabel));
+ MachineBasicBlock *AddFalthroughFrom = nullptr;
+ do {
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(MBB))
+ return true;
+ if (AddFalthroughFrom) {
+ if (!AddFalthroughFrom->isSuccessor(MBB))
+ AddFalthroughFrom->addSuccessor(MBB);
+ AddFalthroughFrom->normalizeSuccProbs();
+ AddFalthroughFrom = nullptr;
+ }
+ if (parseBasicBlock(*MBB, AddFalthroughFrom))
+ return true;
+ // The method 'parseBasicBlock' should parse the whole block until the next
+ // block or the end of file.
+ assert(Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof));
+ } while (Token.isNot(MIToken::Eof));
+ return false;
+}
+
+bool MIParser::parse(MachineInstr *&MI) {
+ // Parse any register operands before '='
+ MachineOperand MO = MachineOperand::CreateImm(0);
+ SmallVector<ParsedMachineOperand, 8> Operands;
+ while (Token.isRegister() || Token.isRegisterFlag()) {
+ auto Loc = Token.location();
+ std::optional<unsigned> TiedDefIdx;
+ if (parseRegisterOperand(MO, TiedDefIdx, /*IsDef=*/true))
+ return true;
+ Operands.push_back(
+ ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx));
+ if (Token.isNot(MIToken::comma))
+ break;
+ lex();
+ }
+ if (!Operands.empty() && expectAndConsume(MIToken::equal))
+ return true;
+
+ unsigned OpCode, Flags = 0;
+ if (Token.isError() || parseInstruction(OpCode, Flags))
+ return true;
+
+ // Parse the remaining machine operands.
+ while (!Token.isNewlineOrEOF() && Token.isNot(MIToken::kw_pre_instr_symbol) &&
+ Token.isNot(MIToken::kw_post_instr_symbol) &&
+ Token.isNot(MIToken::kw_heap_alloc_marker) &&
+ Token.isNot(MIToken::kw_pcsections) &&
+ Token.isNot(MIToken::kw_cfi_type) &&
+ Token.isNot(MIToken::kw_debug_location) &&
+ Token.isNot(MIToken::kw_debug_instr_number) &&
+ Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) {
+ auto Loc = Token.location();
+ std::optional<unsigned> TiedDefIdx;
+ if (parseMachineOperandAndTargetFlags(OpCode, Operands.size(), MO, TiedDefIdx))
+ return true;
+ Operands.push_back(
+ ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx));
+ if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) ||
+ Token.is(MIToken::lbrace))
+ break;
+ if (Token.isNot(MIToken::comma))
+ return error("expected ',' before the next machine operand");
+ lex();
+ }
+
+ MCSymbol *PreInstrSymbol = nullptr;
+ if (Token.is(MIToken::kw_pre_instr_symbol))
+ if (parsePreOrPostInstrSymbol(PreInstrSymbol))
+ return true;
+ MCSymbol *PostInstrSymbol = nullptr;
+ if (Token.is(MIToken::kw_post_instr_symbol))
+ if (parsePreOrPostInstrSymbol(PostInstrSymbol))
+ return true;
+ MDNode *HeapAllocMarker = nullptr;
+ if (Token.is(MIToken::kw_heap_alloc_marker))
+ if (parseHeapAllocMarker(HeapAllocMarker))
+ return true;
+ MDNode *PCSections = nullptr;
+ if (Token.is(MIToken::kw_pcsections))
+ if (parsePCSections(PCSections))
+ return true;
+
+ unsigned CFIType = 0;
+ if (Token.is(MIToken::kw_cfi_type)) {
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after 'cfi-type'");
+ // getUnsigned is sufficient for 32-bit integers.
+ if (getUnsigned(CFIType))
+ return true;
+ lex();
+ // Lex past trailing comma if present.
+ if (Token.is(MIToken::comma))
+ lex();
+ }
+
+ unsigned InstrNum = 0;
+ if (Token.is(MIToken::kw_debug_instr_number)) {
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after 'debug-instr-number'");
+ if (getUnsigned(InstrNum))
+ return true;
+ lex();
+ // Lex past trailing comma if present.
+ if (Token.is(MIToken::comma))
+ lex();
+ }
+
+ DebugLoc DebugLocation;
+ if (Token.is(MIToken::kw_debug_location)) {
+ lex();
+ MDNode *Node = nullptr;
+ if (Token.is(MIToken::exclaim)) {
+ if (parseMDNode(Node))
+ return true;
+ } else if (Token.is(MIToken::md_dilocation)) {
+ if (parseDILocation(Node))
+ return true;
+ } else
+ return error("expected a metadata node after 'debug-location'");
+ if (!isa<DILocation>(Node))
+ return error("referenced metadata is not a DILocation");
+ DebugLocation = DebugLoc(Node);
+ }
+
+ // Parse the machine memory operands.
+ SmallVector<MachineMemOperand *, 2> MemOperands;
+ if (Token.is(MIToken::coloncolon)) {
+ lex();
+ while (!Token.isNewlineOrEOF()) {
+ MachineMemOperand *MemOp = nullptr;
+ if (parseMachineMemoryOperand(MemOp))
+ return true;
+ MemOperands.push_back(MemOp);
+ if (Token.isNewlineOrEOF())
+ break;
+ if (Token.isNot(MIToken::comma))
+ return error("expected ',' before the next machine memory operand");
+ lex();
+ }
+ }
+
+ const auto &MCID = MF.getSubtarget().getInstrInfo()->get(OpCode);
+ if (!MCID.isVariadic()) {
+ // FIXME: Move the implicit operand verification to the machine verifier.
+ if (verifyImplicitOperands(Operands, MCID))
+ return true;
+ }
+
+ MI = MF.CreateMachineInstr(MCID, DebugLocation, /*NoImplicit=*/true);
+ MI->setFlags(Flags);
+
+ unsigned NumExplicitOps = 0;
+ for (const auto &Operand : Operands) {
+ bool IsImplicitOp = Operand.Operand.isReg() && Operand.Operand.isImplicit();
+ if (!IsImplicitOp) {
+ if (!MCID.isVariadic() && NumExplicitOps >= MCID.getNumOperands() &&
+ !Operand.Operand.isValidExcessOperand())
+ return error(Operand.Begin, "too many operands for instruction");
+
+ ++NumExplicitOps;
+ }
+
+ MI->addOperand(MF, Operand.Operand);
+ }
+
+ if (assignRegisterTies(*MI, Operands))
+ return true;
+ if (PreInstrSymbol)
+ MI->setPreInstrSymbol(MF, PreInstrSymbol);
+ if (PostInstrSymbol)
+ MI->setPostInstrSymbol(MF, PostInstrSymbol);
+ if (HeapAllocMarker)
+ MI->setHeapAllocMarker(MF, HeapAllocMarker);
+ if (PCSections)
+ MI->setPCSections(MF, PCSections);
+ if (CFIType)
+ MI->setCFIType(MF, CFIType);
+ if (!MemOperands.empty())
+ MI->setMemRefs(MF, MemOperands);
+ if (InstrNum)
+ MI->setDebugInstrNum(InstrNum);
+ return false;
+}
+
+bool MIParser::parseStandaloneMBB(MachineBasicBlock *&MBB) {
+ lex();
+ if (Token.isNot(MIToken::MachineBasicBlock))
+ return error("expected a machine basic block reference");
+ if (parseMBBReference(MBB))
+ return true;
+ lex();
+ if (Token.isNot(MIToken::Eof))
+ return error(
+ "expected end of string after the machine basic block reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneNamedRegister(Register &Reg) {
+ lex();
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ if (parseNamedRegister(Reg))
+ return true;
+ lex();
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the register reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneVirtualRegister(VRegInfo *&Info) {
+ lex();
+ if (Token.isNot(MIToken::VirtualRegister))
+ return error("expected a virtual register");
+ if (parseVirtualRegister(Info))
+ return true;
+ lex();
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the register reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneRegister(Register &Reg) {
+ lex();
+ if (Token.isNot(MIToken::NamedRegister) &&
+ Token.isNot(MIToken::VirtualRegister))
+ return error("expected either a named or virtual register");
+
+ VRegInfo *Info;
+ if (parseRegister(Reg, Info))
+ return true;
+
+ lex();
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the register reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneStackObject(int &FI) {
+ lex();
+ if (Token.isNot(MIToken::StackObject))
+ return error("expected a stack object");
+ if (parseStackFrameIndex(FI))
+ return true;
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the stack object reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneMDNode(MDNode *&Node) {
+ lex();
+ if (Token.is(MIToken::exclaim)) {
+ if (parseMDNode(Node))
+ return true;
+ } else if (Token.is(MIToken::md_diexpr)) {
+ if (parseDIExpression(Node))
+ return true;
+ } else if (Token.is(MIToken::md_dilocation)) {
+ if (parseDILocation(Node))
+ return true;
+ } else
+ return error("expected a metadata node");
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the metadata node");
+ return false;
+}
+
+bool MIParser::parseMachineMetadata() {
+ lex();
+ if (Token.isNot(MIToken::exclaim))
+ return error("expected a metadata node");
+
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected metadata id after '!'");
+ unsigned ID = 0;
+ if (getUnsigned(ID))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::equal))
+ return true;
+ bool IsDistinct = Token.is(MIToken::kw_distinct);
+ if (IsDistinct)
+ lex();
+ if (Token.isNot(MIToken::exclaim))
+ return error("expected a metadata node");
+ lex();
+
+ MDNode *MD;
+ if (parseMDTuple(MD, IsDistinct))
+ return true;
+
+ auto FI = PFS.MachineForwardRefMDNodes.find(ID);
+ if (FI != PFS.MachineForwardRefMDNodes.end()) {
+ FI->second.first->replaceAllUsesWith(MD);
+ PFS.MachineForwardRefMDNodes.erase(FI);
+
+ assert(PFS.MachineMetadataNodes[ID] == MD && "Tracking VH didn't work");
+ } else {
+ if (PFS.MachineMetadataNodes.count(ID))
+ return error("Metadata id is already used");
+ PFS.MachineMetadataNodes[ID].reset(MD);
+ }
+
+ return false;
+}
+
+bool MIParser::parseMDTuple(MDNode *&MD, bool IsDistinct) {
+ SmallVector<Metadata *, 16> Elts;
+ if (parseMDNodeVector(Elts))
+ return true;
+ MD = (IsDistinct ? MDTuple::getDistinct
+ : MDTuple::get)(MF.getFunction().getContext(), Elts);
+ return false;
+}
+
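+// ::= '{' (<metadata> (',' <metadata>)*)? '}'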
+bool MIParser::parseMDNodeVector(SmallVectorImpl<Metadata *> &Elts) {
+ if (Token.isNot(MIToken::lbrace))
+ return error("expected '{' here");
+ lex();
+
+ if (Token.is(MIToken::rbrace)) {
+ lex();
+ return false;
+ }
+
+ do {
+ Metadata *MD;
+ if (parseMetadata(MD))
+ return true;
+
+ Elts.push_back(MD);
+
+ if (Token.isNot(MIToken::comma))
+ break;
+ lex();
+ } while (true);
+
+ if (Token.isNot(MIToken::rbrace))
+ return error("expected end of metadata node");
+ lex();
+
+ return false;
+}
+
+// ::= !42
+// ::= !"string"
+bool MIParser::parseMetadata(Metadata *&MD) {
+ if (Token.isNot(MIToken::exclaim))
+ return error("expected '!' here");
+ lex();
+
+ if (Token.is(MIToken::StringConstant)) {
+ std::string Str;
+ if (parseStringConstant(Str))
+ return true;
+ MD = MDString::get(MF.getFunction().getContext(), Str);
+ return false;
+ }
+
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected metadata id after '!'");
+
+ SMLoc Loc = mapSMLoc(Token.location());
+
+ unsigned ID = 0;
+ if (getUnsigned(ID))
+ return true;
+ lex();
+
+ auto NodeInfo = PFS.IRSlots.MetadataNodes.find(ID);
+ if (NodeInfo != PFS.IRSlots.MetadataNodes.end()) {
+ MD = NodeInfo->second.get();
+ return false;
+ }
+ // Check machine metadata.
+ NodeInfo = PFS.MachineMetadataNodes.find(ID);
+ if (NodeInfo != PFS.MachineMetadataNodes.end()) {
+ MD = NodeInfo->second.get();
+ return false;
+ }
+ // Forward reference.
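+  // Create a temporary node for now; parseMachineMetadata replaces all of its
+  // uses with the real node once the definition has been parsed.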
+ auto &FwdRef = PFS.MachineForwardRefMDNodes[ID];
+ FwdRef = std::make_pair(
+ MDTuple::getTemporary(MF.getFunction().getContext(), std::nullopt), Loc);
+ PFS.MachineMetadataNodes[ID].reset(FwdRef.first.get());
+ MD = FwdRef.first.get();
+
+ return false;
+}
+
+static const char *printImplicitRegisterFlag(const MachineOperand &MO) {
+ assert(MO.isImplicit());
+ return MO.isDef() ? "implicit-def" : "implicit";
+}
+
+static std::string getRegisterName(const TargetRegisterInfo *TRI,
+ Register Reg) {
+ assert(Reg.isPhysical() && "expected phys reg");
+ return StringRef(TRI->getName(Reg)).lower();
+}
+
+/// Return true if the parsed machine operands contain a given machine operand.
+static bool isImplicitOperandIn(const MachineOperand &ImplicitOperand,
+ ArrayRef<ParsedMachineOperand> Operands) {
+ for (const auto &I : Operands) {
+ if (ImplicitOperand.isIdenticalTo(I.Operand))
+ return true;
+ }
+ return false;
+}
+
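+/// Check that the parsed operands contain every implicit def and use required
+/// by the instruction's MCInstrDesc; call instructions are not verified.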
+bool MIParser::verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands,
+ const MCInstrDesc &MCID) {
+ if (MCID.isCall())
+ // We can't verify call instructions as they can contain arbitrary implicit
+ // register and register mask operands.
+ return false;
+
+ // Gather all the expected implicit operands.
+ SmallVector<MachineOperand, 4> ImplicitOperands;
+ for (MCPhysReg ImpDef : MCID.implicit_defs())
+ ImplicitOperands.push_back(MachineOperand::CreateReg(ImpDef, true, true));
+ for (MCPhysReg ImpUse : MCID.implicit_uses())
+ ImplicitOperands.push_back(MachineOperand::CreateReg(ImpUse, false, true));
+
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ for (const auto &I : ImplicitOperands) {
+ if (isImplicitOperandIn(I, Operands))
+ continue;
+ return error(Operands.empty() ? Token.location() : Operands.back().End,
+ Twine("missing implicit register operand '") +
+ printImplicitRegisterFlag(I) + " $" +
+ getRegisterName(TRI, I.getReg()) + "'");
+ }
+ return false;
+}
+
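+/// Parse the optional machine instruction flags followed by the instruction
+/// name, resolving the name to a target opcode.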
+bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) {
+ // Allow frame and fast math flags for OPCODE
+ while (Token.is(MIToken::kw_frame_setup) ||
+ Token.is(MIToken::kw_frame_destroy) ||
+ Token.is(MIToken::kw_nnan) ||
+ Token.is(MIToken::kw_ninf) ||
+ Token.is(MIToken::kw_nsz) ||
+ Token.is(MIToken::kw_arcp) ||
+ Token.is(MIToken::kw_contract) ||
+ Token.is(MIToken::kw_afn) ||
+ Token.is(MIToken::kw_reassoc) ||
+ Token.is(MIToken::kw_nuw) ||
+ Token.is(MIToken::kw_nsw) ||
+ Token.is(MIToken::kw_exact) ||
+ Token.is(MIToken::kw_nofpexcept) ||
+ Token.is(MIToken::kw_unpredictable)) {
+ // Mine frame and fast math flags
+ if (Token.is(MIToken::kw_frame_setup))
+ Flags |= MachineInstr::FrameSetup;
+ if (Token.is(MIToken::kw_frame_destroy))
+ Flags |= MachineInstr::FrameDestroy;
+ if (Token.is(MIToken::kw_nnan))
+ Flags |= MachineInstr::FmNoNans;
+ if (Token.is(MIToken::kw_ninf))
+ Flags |= MachineInstr::FmNoInfs;
+ if (Token.is(MIToken::kw_nsz))
+ Flags |= MachineInstr::FmNsz;
+ if (Token.is(MIToken::kw_arcp))
+ Flags |= MachineInstr::FmArcp;
+ if (Token.is(MIToken::kw_contract))
+ Flags |= MachineInstr::FmContract;
+ if (Token.is(MIToken::kw_afn))
+ Flags |= MachineInstr::FmAfn;
+ if (Token.is(MIToken::kw_reassoc))
+ Flags |= MachineInstr::FmReassoc;
+ if (Token.is(MIToken::kw_nuw))
+ Flags |= MachineInstr::NoUWrap;
+ if (Token.is(MIToken::kw_nsw))
+ Flags |= MachineInstr::NoSWrap;
+ if (Token.is(MIToken::kw_exact))
+ Flags |= MachineInstr::IsExact;
+ if (Token.is(MIToken::kw_nofpexcept))
+ Flags |= MachineInstr::NoFPExcept;
+ if (Token.is(MIToken::kw_unpredictable))
+ Flags |= MachineInstr::Unpredictable;
+
+ lex();
+ }
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected a machine instruction");
+ StringRef InstrName = Token.stringValue();
+ if (PFS.Target.parseInstrName(InstrName, OpCode))
+ return error(Twine("unknown machine instruction name '") + InstrName + "'");
+ lex();
+ return false;
+}
+
+bool MIParser::parseNamedRegister(Register &Reg) {
+ assert(Token.is(MIToken::NamedRegister) && "Needs NamedRegister token");
+ StringRef Name = Token.stringValue();
+ if (PFS.Target.getRegisterByName(Name, Reg))
+ return error(Twine("unknown register name '") + Name + "'");
+ return false;
+}
+
+bool MIParser::parseNamedVirtualRegister(VRegInfo *&Info) {
+ assert(Token.is(MIToken::NamedVirtualRegister) && "Expected NamedVReg token");
+ StringRef Name = Token.stringValue();
+ // TODO: Check that the VReg name is not the same as a physical register name.
+ // If it is, then print a warning (when warnings are implemented).
+ Info = &PFS.getVRegInfoNamed(Name);
+ return false;
+}
+
+bool MIParser::parseVirtualRegister(VRegInfo *&Info) {
+ if (Token.is(MIToken::NamedVirtualRegister))
+ return parseNamedVirtualRegister(Info);
+ assert(Token.is(MIToken::VirtualRegister) && "Needs VirtualRegister token");
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ Info = &PFS.getVRegInfo(ID);
+ return false;
+}
+
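+// Parse a register reference: '_' (no register), a named physical register,
+// or a (named) virtual register.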
+bool MIParser::parseRegister(Register &Reg, VRegInfo *&Info) {
+ switch (Token.kind()) {
+ case MIToken::underscore:
+ Reg = 0;
+ return false;
+ case MIToken::NamedRegister:
+ return parseNamedRegister(Reg);
+ case MIToken::NamedVirtualRegister:
+ case MIToken::VirtualRegister:
+ if (parseVirtualRegister(Info))
+ return true;
+ Reg = Info->VReg;
+ return false;
+ // TODO: Parse other register kinds.
+ default:
+ llvm_unreachable("The current token should be a register");
+ }
+}
+
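+/// Parse a register class or register bank specification for a virtual
+/// register, diagnosing conflicts with any previously recorded class or bank.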
+bool MIParser::parseRegisterClassOrBank(VRegInfo &RegInfo) {
+ if (Token.isNot(MIToken::Identifier) && Token.isNot(MIToken::underscore))
+ return error("expected '_', register class, or register bank name");
+ StringRef::iterator Loc = Token.location();
+ StringRef Name = Token.stringValue();
+
+ // Was it a register class?
+ const TargetRegisterClass *RC = PFS.Target.getRegClass(Name);
+ if (RC) {
+ lex();
+
+ switch (RegInfo.Kind) {
+ case VRegInfo::UNKNOWN:
+ case VRegInfo::NORMAL:
+ RegInfo.Kind = VRegInfo::NORMAL;
+ if (RegInfo.Explicit && RegInfo.D.RC != RC) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ return error(Loc, Twine("conflicting register classes, previously: ") +
+ Twine(TRI.getRegClassName(RegInfo.D.RC)));
+ }
+ RegInfo.D.RC = RC;
+ RegInfo.Explicit = true;
+ return false;
+
+ case VRegInfo::GENERIC:
+ case VRegInfo::REGBANK:
+ return error(Loc, "register class specification on generic register");
+ }
+ llvm_unreachable("Unexpected register kind");
+ }
+
+ // Should be a register bank or a generic register.
+ const RegisterBank *RegBank = nullptr;
+ if (Name != "_") {
+ RegBank = PFS.Target.getRegBank(Name);
+ if (!RegBank)
+ return error(Loc, "expected '_', register class, or register bank name");
+ }
+
+ lex();
+
+ switch (RegInfo.Kind) {
+ case VRegInfo::UNKNOWN:
+ case VRegInfo::GENERIC:
+ case VRegInfo::REGBANK:
+ RegInfo.Kind = RegBank ? VRegInfo::REGBANK : VRegInfo::GENERIC;
+ if (RegInfo.Explicit && RegInfo.D.RegBank != RegBank)
+ return error(Loc, "conflicting generic register banks");
+ RegInfo.D.RegBank = RegBank;
+ RegInfo.Explicit = true;
+ return false;
+
+ case VRegInfo::NORMAL:
+ return error(Loc, "register bank specification on normal register");
+ }
+ llvm_unreachable("Unexpected register kind");
+}
+
+bool MIParser::parseRegisterFlag(unsigned &Flags) {
+ const unsigned OldFlags = Flags;
+ switch (Token.kind()) {
+ case MIToken::kw_implicit:
+ Flags |= RegState::Implicit;
+ break;
+ case MIToken::kw_implicit_define:
+ Flags |= RegState::ImplicitDefine;
+ break;
+ case MIToken::kw_def:
+ Flags |= RegState::Define;
+ break;
+ case MIToken::kw_dead:
+ Flags |= RegState::Dead;
+ break;
+ case MIToken::kw_killed:
+ Flags |= RegState::Kill;
+ break;
+ case MIToken::kw_undef:
+ Flags |= RegState::Undef;
+ break;
+ case MIToken::kw_internal:
+ Flags |= RegState::InternalRead;
+ break;
+ case MIToken::kw_early_clobber:
+ Flags |= RegState::EarlyClobber;
+ break;
+ case MIToken::kw_debug_use:
+ Flags |= RegState::Debug;
+ break;
+ case MIToken::kw_renamable:
+ Flags |= RegState::Renamable;
+ break;
+ default:
+ llvm_unreachable("The current token should be a register flag");
+ }
+ if (OldFlags == Flags)
+    // If the flags weren't modified, the same flag was specified more than
+    // once.
+ return error("duplicate '" + Token.stringValue() + "' register flag");
+ lex();
+ return false;
+}
+
+bool MIParser::parseSubRegisterIndex(unsigned &SubReg) {
+ assert(Token.is(MIToken::dot));
+ lex();
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected a subregister index after '.'");
+ auto Name = Token.stringValue();
+ SubReg = PFS.Target.getSubRegIndex(Name);
+ if (!SubReg)
+ return error(Twine("use of unknown subregister index '") + Name + "'");
+ lex();
+ return false;
+}
+
+bool MIParser::parseRegisterTiedDefIndex(unsigned &TiedDefIdx) {
+ if (!consumeIfPresent(MIToken::kw_tied_def))
+ return true;
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after 'tied-def'");
+ if (getUnsigned(TiedDefIdx))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ return false;
+}
+
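+/// Tie each use operand that carried a 'tied-def' index to the corresponding
+/// def operand, rejecting out-of-range, non-def, or already-tied targets.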
+bool MIParser::assignRegisterTies(MachineInstr &MI,
+ ArrayRef<ParsedMachineOperand> Operands) {
+ SmallVector<std::pair<unsigned, unsigned>, 4> TiedRegisterPairs;
+ for (unsigned I = 0, E = Operands.size(); I != E; ++I) {
+ if (!Operands[I].TiedDefIdx)
+ continue;
+ // The parser ensures that this operand is a register use, so we just have
+ // to check the tied-def operand.
+ unsigned DefIdx = *Operands[I].TiedDefIdx;
+ if (DefIdx >= E)
+ return error(Operands[I].Begin,
+ Twine("use of invalid tied-def operand index '" +
+ Twine(DefIdx) + "'; instruction has only ") +
+ Twine(E) + " operands");
+ const auto &DefOperand = Operands[DefIdx].Operand;
+ if (!DefOperand.isReg() || !DefOperand.isDef())
+ // FIXME: add note with the def operand.
+ return error(Operands[I].Begin,
+ Twine("use of invalid tied-def operand index '") +
+ Twine(DefIdx) + "'; the operand #" + Twine(DefIdx) +
+ " isn't a defined register");
+ // Check that the tied-def operand wasn't tied elsewhere.
+ for (const auto &TiedPair : TiedRegisterPairs) {
+ if (TiedPair.first == DefIdx)
+ return error(Operands[I].Begin,
+ Twine("the tied-def operand #") + Twine(DefIdx) +
+ " is already tied with another register operand");
+ }
+ TiedRegisterPairs.push_back(std::make_pair(DefIdx, I));
+ }
+ // FIXME: Verify that for non INLINEASM instructions, the def and use tied
+ // indices must be less than tied max.
+ for (const auto &TiedPair : TiedRegisterPairs)
+ MI.tieOperands(TiedPair.first, TiedPair.second);
+ return false;
+}
+
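+/// Parse a register operand: optional register flags, the register itself,
+/// then an optional subregister index ('.'), register class or bank (':'),
+/// and a parenthesized tied-def index or low-level type.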
+bool MIParser::parseRegisterOperand(MachineOperand &Dest,
+ std::optional<unsigned> &TiedDefIdx,
+ bool IsDef) {
+ unsigned Flags = IsDef ? RegState::Define : 0;
+ while (Token.isRegisterFlag()) {
+ if (parseRegisterFlag(Flags))
+ return true;
+ }
+ if (!Token.isRegister())
+ return error("expected a register after register flags");
+ Register Reg;
+ VRegInfo *RegInfo;
+ if (parseRegister(Reg, RegInfo))
+ return true;
+ lex();
+ unsigned SubReg = 0;
+ if (Token.is(MIToken::dot)) {
+ if (parseSubRegisterIndex(SubReg))
+ return true;
+ if (!Reg.isVirtual())
+ return error("subregister index expects a virtual register");
+ }
+ if (Token.is(MIToken::colon)) {
+ if (!Reg.isVirtual())
+ return error("register class specification expects a virtual register");
+ lex();
+ if (parseRegisterClassOrBank(*RegInfo))
+ return true;
+ }
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if ((Flags & RegState::Define) == 0) {
+ if (consumeIfPresent(MIToken::lparen)) {
+ unsigned Idx;
+ if (!parseRegisterTiedDefIndex(Idx))
+ TiedDefIdx = Idx;
+ else {
+ // Try a redundant low-level type.
+ LLT Ty;
+ if (parseLowLevelType(Token.location(), Ty))
+ return error("expected tied-def or low-level type after '('");
+
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+
+ if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
+ return error("inconsistent type for generic virtual register");
+
+ MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
+ MRI.setType(Reg, Ty);
+ }
+ }
+ } else if (consumeIfPresent(MIToken::lparen)) {
+    // Virtual registers may have a type with GlobalISel.
+ if (!Reg.isVirtual())
+ return error("unexpected type on physical register");
+
+ LLT Ty;
+ if (parseLowLevelType(Token.location(), Ty))
+ return true;
+
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+
+ if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
+ return error("inconsistent type for generic virtual register");
+
+ MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
+ MRI.setType(Reg, Ty);
+ } else if (Reg.isVirtual()) {
+ // Generic virtual registers must have a type.
+ // If we end up here this means the type hasn't been specified and
+ // this is bad!
+ if (RegInfo->Kind == VRegInfo::GENERIC ||
+ RegInfo->Kind == VRegInfo::REGBANK)
+ return error("generic virtual registers must have a type");
+ }
+
+ if (Flags & RegState::Define) {
+ if (Flags & RegState::Kill)
+ return error("cannot have a killed def operand");
+ } else {
+ if (Flags & RegState::Dead)
+ return error("cannot have a dead use operand");
+ }
+
+ Dest = MachineOperand::CreateReg(
+ Reg, Flags & RegState::Define, Flags & RegState::Implicit,
+ Flags & RegState::Kill, Flags & RegState::Dead, Flags & RegState::Undef,
+ Flags & RegState::EarlyClobber, SubReg, Flags & RegState::Debug,
+ Flags & RegState::InternalRead, Flags & RegState::Renamable);
+
+ return false;
+}
+
+bool MIParser::parseImmediateOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::IntegerLiteral));
+ const APSInt &Int = Token.integerValue();
+ if (auto SImm = Int.trySExtValue(); Int.isSigned() && SImm.has_value())
+ Dest = MachineOperand::CreateImm(*SImm);
+ else if (auto UImm = Int.tryZExtValue(); !Int.isSigned() && UImm.has_value())
+ Dest = MachineOperand::CreateImm(*UImm);
+ else
+ return error("integer literal is too large to be an immediate operand");
+ lex();
+ return false;
+}
+
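+/// Parse a target-specific immediate mnemonic that starts with '.', using the
+/// target's MIRFormatter to decode it into an immediate operand.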
+bool MIParser::parseTargetImmMnemonic(const unsigned OpCode,
+ const unsigned OpIdx,
+ MachineOperand &Dest,
+ const MIRFormatter &MF) {
+ assert(Token.is(MIToken::dot));
+ auto Loc = Token.location(); // record start position
+ size_t Len = 1; // for "."
+ lex();
+
+  // Handle the case where the mnemonic starts with a number.
+ if (Token.is(MIToken::IntegerLiteral)) {
+ Len += Token.range().size();
+ lex();
+ }
+
+ StringRef Src;
+ if (Token.is(MIToken::comma))
+ Src = StringRef(Loc, Len);
+ else {
+ assert(Token.is(MIToken::Identifier));
+ Src = StringRef(Loc, Len + Token.stringValue().size());
+ }
+ int64_t Val;
+ if (MF.parseImmMnemonic(OpCode, OpIdx, Src, Val,
+ [this](StringRef::iterator Loc, const Twine &Msg)
+ -> bool { return error(Loc, Msg); }))
+ return true;
+
+ Dest = MachineOperand::CreateImm(Val);
+ if (!Token.is(MIToken::comma))
+ lex();
+ return false;
+}
+
+static bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue,
+ PerFunctionMIParsingState &PFS, const Constant *&C,
+ ErrorCallbackType ErrCB) {
+ auto Source = StringValue.str(); // The source has to be null terminated.
+ SMDiagnostic Err;
+ C = parseConstantValue(Source, Err, *PFS.MF.getFunction().getParent(),
+ &PFS.IRSlots);
+ if (!C)
+ return ErrCB(Loc + Err.getColumnNo(), Err.getMessage());
+ return false;
+}
+
+bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue,
+ const Constant *&C) {
+ return ::parseIRConstant(
+ Loc, StringValue, PFS, C,
+ [this](StringRef::iterator Loc, const Twine &Msg) -> bool {
+ return error(Loc, Msg);
+ });
+}
+
+bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) {
+ if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C))
+ return true;
+ lex();
+ return false;
+}
+
+// See LLT implementation for bit size limits.
+static bool verifyScalarSize(uint64_t Size) {
+ return Size != 0 && isUInt<16>(Size);
+}
+
+static bool verifyVectorElementCount(uint64_t NumElts) {
+ return NumElts != 0 && isUInt<16>(NumElts);
+}
+
+static bool verifyAddrSpace(uint64_t AddrSpace) {
+ return isUInt<24>(AddrSpace);
+}
+
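+// Parse a GlobalISel low-level type.
+// ::= sN | pA | '<' M 'x' sN '>' | '<' M 'x' pA '>'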
+bool MIParser::parseLowLevelType(StringRef::iterator Loc, LLT &Ty) {
+ if (Token.range().front() == 's' || Token.range().front() == 'p') {
+ StringRef SizeStr = Token.range().drop_front();
+ if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
+ return error("expected integers after 's'/'p' type character");
+ }
+
+ if (Token.range().front() == 's') {
+ auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!verifyScalarSize(ScalarSize))
+ return error("invalid size for scalar type");
+
+ Ty = LLT::scalar(ScalarSize);
+ lex();
+ return false;
+ } else if (Token.range().front() == 'p') {
+ const DataLayout &DL = MF.getDataLayout();
+ uint64_t AS = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!verifyAddrSpace(AS))
+ return error("invalid address space number");
+
+ Ty = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
+ lex();
+ return false;
+ }
+
+ // Now we're looking for a vector.
+ if (Token.isNot(MIToken::less))
+ return error(Loc,
+ "expected sN, pA, <M x sN>, or <M x pA> for GlobalISel type");
+ lex();
+
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
+ uint64_t NumElements = Token.integerValue().getZExtValue();
+ if (!verifyVectorElementCount(NumElements))
+ return error("invalid number of vector elements");
+
+ lex();
+
+ if (Token.isNot(MIToken::Identifier) || Token.stringValue() != "x")
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
+ lex();
+
+ if (Token.range().front() != 's' && Token.range().front() != 'p')
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
+ StringRef SizeStr = Token.range().drop_front();
+ if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
+ return error("expected integers after 's'/'p' type character");
+
+ if (Token.range().front() == 's') {
+ auto ScalarSize = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!verifyScalarSize(ScalarSize))
+ return error("invalid size for scalar type");
+ Ty = LLT::scalar(ScalarSize);
+ } else if (Token.range().front() == 'p') {
+ const DataLayout &DL = MF.getDataLayout();
+ uint64_t AS = APSInt(Token.range().drop_front()).getZExtValue();
+ if (!verifyAddrSpace(AS))
+ return error("invalid address space number");
+
+ Ty = LLT::pointer(AS, DL.getPointerSizeInBits(AS));
+ } else
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
+ lex();
+
+ if (Token.isNot(MIToken::greater))
+ return error(Loc, "expected <M x sN> or <M x pA> for vector type");
+ lex();
+
+ Ty = LLT::fixed_vector(NumElements, Ty);
+ return false;
+}
+
+bool MIParser::parseTypedImmediateOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::Identifier));
+ StringRef TypeStr = Token.range();
+ if (TypeStr.front() != 'i' && TypeStr.front() != 's' &&
+ TypeStr.front() != 'p')
+ return error(
+ "a typed immediate operand should start with one of 'i', 's', or 'p'");
+ StringRef SizeStr = Token.range().drop_front();
+ if (SizeStr.size() == 0 || !llvm::all_of(SizeStr, isdigit))
+ return error("expected integers after 'i'/'s'/'p' type character");
+
+ auto Loc = Token.location();
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral)) {
+ if (Token.isNot(MIToken::Identifier) ||
+ !(Token.range() == "true" || Token.range() == "false"))
+ return error("expected an integer literal");
+ }
+ const Constant *C = nullptr;
+ if (parseIRConstant(Loc, C))
+ return true;
+ Dest = MachineOperand::CreateCImm(cast<ConstantInt>(C));
+ return false;
+}
+
+bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) {
+ auto Loc = Token.location();
+ lex();
+ if (Token.isNot(MIToken::FloatingPointLiteral) &&
+ Token.isNot(MIToken::HexLiteral))
+ return error("expected a floating point literal");
+ const Constant *C = nullptr;
+ if (parseIRConstant(Loc, C))
+ return true;
+ Dest = MachineOperand::CreateFPImm(cast<ConstantFP>(C));
+ return false;
+}
+
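+// Convert a hexadecimal literal token into an APInt. Returns true if the
+// literal is not a plain hex integer, e.g. a floating point literal with a
+// special prefix.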
+static bool getHexUint(const MIToken &Token, APInt &Result) {
+ assert(Token.is(MIToken::HexLiteral));
+ StringRef S = Token.range();
+ assert(S[0] == '0' && tolower(S[1]) == 'x');
+ // This could be a floating point literal with a special prefix.
+ if (!isxdigit(S[2]))
+ return true;
+ StringRef V = S.substr(2);
+ APInt A(V.size()*4, V, 16);
+
+ // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make
+ // sure it isn't the case before constructing result.
+ unsigned NumBits = (A == 0) ? 32 : A.getActiveBits();
+ Result = APInt(NumBits, ArrayRef<uint64_t>(A.getRawData(), A.getNumWords()));
+ return false;
+}
+
+static bool getUnsigned(const MIToken &Token, unsigned &Result,
+ ErrorCallbackType ErrCB) {
+ if (Token.hasIntegerValue()) {
+ const uint64_t Limit = uint64_t(std::numeric_limits<unsigned>::max()) + 1;
+ uint64_t Val64 = Token.integerValue().getLimitedValue(Limit);
+ if (Val64 == Limit)
+ return ErrCB(Token.location(), "expected 32-bit integer (too large)");
+ Result = Val64;
+ return false;
+ }
+ if (Token.is(MIToken::HexLiteral)) {
+ APInt A;
+ if (getHexUint(Token, A))
+ return true;
+ if (A.getBitWidth() > 32)
+ return ErrCB(Token.location(), "expected 32-bit integer (too large)");
+ Result = A.getZExtValue();
+ return false;
+ }
+ return true;
+}
+
+bool MIParser::getUnsigned(unsigned &Result) {
+ return ::getUnsigned(
+ Token, Result, [this](StringRef::iterator Loc, const Twine &Msg) -> bool {
+ return error(Loc, Msg);
+ });
+}
+
+bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) {
+ assert(Token.is(MIToken::MachineBasicBlock) ||
+ Token.is(MIToken::MachineBasicBlockLabel));
+ unsigned Number;
+ if (getUnsigned(Number))
+ return true;
+ auto MBBInfo = PFS.MBBSlots.find(Number);
+ if (MBBInfo == PFS.MBBSlots.end())
+ return error(Twine("use of undefined machine basic block #") +
+ Twine(Number));
+ MBB = MBBInfo->second;
+ // TODO: Only parse the name if it's a MachineBasicBlockLabel. Deprecate once
+ // we drop the <irname> from the bb.<id>.<irname> format.
+ if (!Token.stringValue().empty() && Token.stringValue() != MBB->getName())
+ return error(Twine("the name of machine basic block #") + Twine(Number) +
+ " isn't '" + Token.stringValue() + "'");
+ return false;
+}
+
+bool MIParser::parseMBBOperand(MachineOperand &Dest) {
+ MachineBasicBlock *MBB;
+ if (parseMBBReference(MBB))
+ return true;
+ Dest = MachineOperand::CreateMBB(MBB);
+ lex();
+ return false;
+}
+
+bool MIParser::parseStackFrameIndex(int &FI) {
+ assert(Token.is(MIToken::StackObject));
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto ObjectInfo = PFS.StackObjectSlots.find(ID);
+ if (ObjectInfo == PFS.StackObjectSlots.end())
+ return error(Twine("use of undefined stack object '%stack.") + Twine(ID) +
+ "'");
+ StringRef Name;
+ if (const auto *Alloca =
+ MF.getFrameInfo().getObjectAllocation(ObjectInfo->second))
+ Name = Alloca->getName();
+ if (!Token.stringValue().empty() && Token.stringValue() != Name)
+ return error(Twine("the name of the stack object '%stack.") + Twine(ID) +
+ "' isn't '" + Token.stringValue() + "'");
+ lex();
+ FI = ObjectInfo->second;
+ return false;
+}
+
+bool MIParser::parseStackObjectOperand(MachineOperand &Dest) {
+ int FI;
+ if (parseStackFrameIndex(FI))
+ return true;
+ Dest = MachineOperand::CreateFI(FI);
+ return false;
+}
+
+bool MIParser::parseFixedStackFrameIndex(int &FI) {
+ assert(Token.is(MIToken::FixedStackObject));
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto ObjectInfo = PFS.FixedStackObjectSlots.find(ID);
+ if (ObjectInfo == PFS.FixedStackObjectSlots.end())
+ return error(Twine("use of undefined fixed stack object '%fixed-stack.") +
+ Twine(ID) + "'");
+ lex();
+ FI = ObjectInfo->second;
+ return false;
+}
+
+bool MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) {
+ int FI;
+ if (parseFixedStackFrameIndex(FI))
+ return true;
+ Dest = MachineOperand::CreateFI(FI);
+ return false;
+}
+
+static bool parseGlobalValue(const MIToken &Token,
+ PerFunctionMIParsingState &PFS, GlobalValue *&GV,
+ ErrorCallbackType ErrCB) {
+ switch (Token.kind()) {
+ case MIToken::NamedGlobalValue: {
+ const Module *M = PFS.MF.getFunction().getParent();
+ GV = M->getNamedValue(Token.stringValue());
+ if (!GV)
+ return ErrCB(Token.location(), Twine("use of undefined global value '") +
+ Token.range() + "'");
+ break;
+ }
+ case MIToken::GlobalValue: {
+ unsigned GVIdx;
+ if (getUnsigned(Token, GVIdx, ErrCB))
+ return true;
+ if (GVIdx >= PFS.IRSlots.GlobalValues.size())
+ return ErrCB(Token.location(), Twine("use of undefined global value '@") +
+ Twine(GVIdx) + "'");
+ GV = PFS.IRSlots.GlobalValues[GVIdx];
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be a global value");
+ }
+ return false;
+}
+
+bool MIParser::parseGlobalValue(GlobalValue *&GV) {
+ return ::parseGlobalValue(
+ Token, PFS, GV,
+ [this](StringRef::iterator Loc, const Twine &Msg) -> bool {
+ return error(Loc, Msg);
+ });
+}
+
+bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) {
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ lex();
+ Dest = MachineOperand::CreateGA(GV, /*Offset=*/0);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseConstantPoolIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::ConstantPoolItem));
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto ConstantInfo = PFS.ConstantPoolSlots.find(ID);
+ if (ConstantInfo == PFS.ConstantPoolSlots.end())
+ return error("use of undefined constant '%const." + Twine(ID) + "'");
+ lex();
+ Dest = MachineOperand::CreateCPI(ID, /*Offset=*/0);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseJumpTableIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::JumpTableIndex));
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto JumpTableEntryInfo = PFS.JumpTableSlots.find(ID);
+ if (JumpTableEntryInfo == PFS.JumpTableSlots.end())
+ return error("use of undefined jump table '%jump-table." + Twine(ID) + "'");
+ lex();
+ Dest = MachineOperand::CreateJTI(JumpTableEntryInfo->second);
+ return false;
+}
+
+bool MIParser::parseExternalSymbolOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::ExternalSymbol));
+ const char *Symbol = MF.createExternalSymbolName(Token.stringValue());
+ lex();
+ Dest = MachineOperand::CreateES(Symbol);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseMCSymbolOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::MCSymbol));
+ MCSymbol *Symbol = getOrCreateMCSymbol(Token.stringValue());
+ lex();
+ Dest = MachineOperand::CreateMCSymbol(Symbol);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseSubRegisterIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::SubRegisterIndex));
+ StringRef Name = Token.stringValue();
+ unsigned SubRegIndex = PFS.Target.getSubRegIndex(Token.stringValue());
+ if (SubRegIndex == 0)
+ return error(Twine("unknown subregister index '") + Name + "'");
+ lex();
+ Dest = MachineOperand::CreateImm(SubRegIndex);
+ return false;
+}
+
+bool MIParser::parseMDNode(MDNode *&Node) {
+ assert(Token.is(MIToken::exclaim));
+
+ auto Loc = Token.location();
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected metadata id after '!'");
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto NodeInfo = PFS.IRSlots.MetadataNodes.find(ID);
+ if (NodeInfo == PFS.IRSlots.MetadataNodes.end()) {
+ NodeInfo = PFS.MachineMetadataNodes.find(ID);
+ if (NodeInfo == PFS.MachineMetadataNodes.end())
+ return error(Loc, "use of undefined metadata '!" + Twine(ID) + "'");
+ }
+ lex();
+ Node = NodeInfo->second.get();
+ return false;
+}
+
+bool MIParser::parseDIExpression(MDNode *&Expr) {
+ assert(Token.is(MIToken::md_diexpr));
+ lex();
+
+ // FIXME: Share this parsing with the IL parser.
+ SmallVector<uint64_t, 8> Elements;
+
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+
+ if (Token.isNot(MIToken::rparen)) {
+ do {
+ if (Token.is(MIToken::Identifier)) {
+ if (unsigned Op = dwarf::getOperationEncoding(Token.stringValue())) {
+ lex();
+ Elements.push_back(Op);
+ continue;
+ }
+ if (unsigned Enc = dwarf::getAttributeEncoding(Token.stringValue())) {
+ lex();
+ Elements.push_back(Enc);
+ continue;
+ }
+ return error(Twine("invalid DWARF op '") + Token.stringValue() + "'");
+ }
+
+ if (Token.isNot(MIToken::IntegerLiteral) ||
+ Token.integerValue().isSigned())
+ return error("expected unsigned integer");
+
+ auto &U = Token.integerValue();
+ if (U.ugt(UINT64_MAX))
+ return error("element too large, limit is " + Twine(UINT64_MAX));
+ Elements.push_back(U.getZExtValue());
+ lex();
+
+ } while (consumeIfPresent(MIToken::comma));
+ }
+
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+
+ Expr = DIExpression::get(MF.getFunction().getContext(), Elements);
+ return false;
+}
+
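+// Parse an inline '!DILocation(...)' node; 'line' and 'scope' are required,
+// while 'column', 'inlinedAt', and 'isImplicitCode' are optional.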
+bool MIParser::parseDILocation(MDNode *&Loc) {
+ assert(Token.is(MIToken::md_dilocation));
+ lex();
+
+ bool HaveLine = false;
+ unsigned Line = 0;
+ unsigned Column = 0;
+ MDNode *Scope = nullptr;
+ MDNode *InlinedAt = nullptr;
+ bool ImplicitCode = false;
+
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+
+ if (Token.isNot(MIToken::rparen)) {
+ do {
+ if (Token.is(MIToken::Identifier)) {
+ if (Token.stringValue() == "line") {
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (Token.isNot(MIToken::IntegerLiteral) ||
+ Token.integerValue().isSigned())
+ return error("expected unsigned integer");
+ Line = Token.integerValue().getZExtValue();
+ HaveLine = true;
+ lex();
+ continue;
+ }
+ if (Token.stringValue() == "column") {
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (Token.isNot(MIToken::IntegerLiteral) ||
+ Token.integerValue().isSigned())
+ return error("expected unsigned integer");
+ Column = Token.integerValue().getZExtValue();
+ lex();
+ continue;
+ }
+ if (Token.stringValue() == "scope") {
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (parseMDNode(Scope))
+ return error("expected metadata node");
+ if (!isa<DIScope>(Scope))
+ return error("expected DIScope node");
+ continue;
+ }
+ if (Token.stringValue() == "inlinedAt") {
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (Token.is(MIToken::exclaim)) {
+ if (parseMDNode(InlinedAt))
+ return true;
+ } else if (Token.is(MIToken::md_dilocation)) {
+ if (parseDILocation(InlinedAt))
+ return true;
+ } else
+ return error("expected metadata node");
+ if (!isa<DILocation>(InlinedAt))
+ return error("expected DILocation node");
+ continue;
+ }
+ if (Token.stringValue() == "isImplicitCode") {
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (!Token.is(MIToken::Identifier))
+ return error("expected true/false");
+ // As far as I can see, we don't have any existing need for parsing
+ // true/false in MIR yet. Do it ad-hoc until there's something else
+ // that needs it.
+ if (Token.stringValue() == "true")
+ ImplicitCode = true;
+ else if (Token.stringValue() == "false")
+ ImplicitCode = false;
+ else
+ return error("expected true/false");
+ lex();
+ continue;
+ }
+ }
+ return error(Twine("invalid DILocation argument '") +
+ Token.stringValue() + "'");
+ } while (consumeIfPresent(MIToken::comma));
+ }
+
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+
+ if (!HaveLine)
+ return error("DILocation requires line number");
+ if (!Scope)
+ return error("DILocation requires a scope");
+
+ Loc = DILocation::get(MF.getFunction().getContext(), Line, Column, Scope,
+ InlinedAt, ImplicitCode);
+ return false;
+}
+
+bool MIParser::parseMetadataOperand(MachineOperand &Dest) {
+ MDNode *Node = nullptr;
+ if (Token.is(MIToken::exclaim)) {
+ if (parseMDNode(Node))
+ return true;
+ } else if (Token.is(MIToken::md_diexpr)) {
+ if (parseDIExpression(Node))
+ return true;
+ }
+ Dest = MachineOperand::CreateMetadata(Node);
+ return false;
+}
+
+bool MIParser::parseCFIOffset(int &Offset) {
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected a cfi offset");
+ if (Token.integerValue().getSignificantBits() > 32)
+ return error("expected a 32 bit integer (the cfi offset is too large)");
+ Offset = (int)Token.integerValue().getExtValue();
+ lex();
+ return false;
+}
+
+bool MIParser::parseCFIRegister(Register &Reg) {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a cfi register");
+ Register LLVMReg;
+ if (parseNamedRegister(LLVMReg))
+ return true;
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ int DwarfReg = TRI->getDwarfRegNum(LLVMReg, true);
+ if (DwarfReg < 0)
+ return error("invalid DWARF register");
+ Reg = (unsigned)DwarfReg;
+ lex();
+ return false;
+}
+
+bool MIParser::parseCFIAddressSpace(unsigned &AddressSpace) {
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected a cfi address space literal");
+ if (Token.integerValue().isSigned())
+ return error("expected an unsigned integer (cfi address space)");
+ AddressSpace = Token.integerValue().getZExtValue();
+ lex();
+ return false;
+}
+
+bool MIParser::parseCFIEscapeValues(std::string &Values) {
+ do {
+ if (Token.isNot(MIToken::HexLiteral))
+ return error("expected a hexadecimal literal");
+ unsigned Value;
+ if (getUnsigned(Value))
+ return true;
+ if (Value > UINT8_MAX)
+ return error("expected a 8-bit integer (too large)");
+ Values.push_back(static_cast<uint8_t>(Value));
+ lex();
+ } while (consumeIfPresent(MIToken::comma));
+ return false;
+}
+
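+/// Parse a CFI operand, registering the corresponding MCCFIInstruction with
+/// the machine function and producing a CFI index operand.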
+bool MIParser::parseCFIOperand(MachineOperand &Dest) {
+ auto Kind = Token.kind();
+ lex();
+ int Offset;
+ Register Reg;
+ unsigned AddressSpace;
+ unsigned CFIIndex;
+ switch (Kind) {
+ case MIToken::kw_cfi_same_value:
+ if (parseCFIRegister(Reg))
+ return true;
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, Reg));
+ break;
+ case MIToken::kw_cfi_offset:
+ if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
+ parseCFIOffset(Offset))
+ return true;
+ CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, Reg, Offset));
+ break;
+ case MIToken::kw_cfi_rel_offset:
+ if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
+ parseCFIOffset(Offset))
+ return true;
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createRelOffset(nullptr, Reg, Offset));
+ break;
+ case MIToken::kw_cfi_def_cfa_register:
+ if (parseCFIRegister(Reg))
+ return true;
+ CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+ break;
+ case MIToken::kw_cfi_def_cfa_offset:
+ if (parseCFIOffset(Offset))
+ return true;
+ CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Offset));
+ break;
+ case MIToken::kw_cfi_adjust_cfa_offset:
+ if (parseCFIOffset(Offset))
+ return true;
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, Offset));
+ break;
+ case MIToken::kw_cfi_def_cfa:
+ if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
+ parseCFIOffset(Offset))
+ return true;
+ CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, Offset));
+ break;
+ case MIToken::kw_cfi_llvm_def_aspace_cfa:
+ if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
+ parseCFIOffset(Offset) || expectAndConsume(MIToken::comma) ||
+ parseCFIAddressSpace(AddressSpace))
+ return true;
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createLLVMDefAspaceCfa(
+ nullptr, Reg, Offset, AddressSpace, SMLoc()));
+ break;
+ case MIToken::kw_cfi_remember_state:
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createRememberState(nullptr));
+ break;
+ case MIToken::kw_cfi_restore:
+ if (parseCFIRegister(Reg))
+ return true;
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, Reg));
+ break;
+ case MIToken::kw_cfi_restore_state:
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestoreState(nullptr));
+ break;
+ case MIToken::kw_cfi_undefined:
+ if (parseCFIRegister(Reg))
+ return true;
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createUndefined(nullptr, Reg));
+ break;
+ case MIToken::kw_cfi_register: {
+ Register Reg2;
+ if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
+ parseCFIRegister(Reg2))
+ return true;
+
+ CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createRegister(nullptr, Reg, Reg2));
+ break;
+ }
+ case MIToken::kw_cfi_window_save:
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
+ break;
+ case MIToken::kw_cfi_aarch64_negate_ra_sign_state:
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ break;
+ case MIToken::kw_cfi_escape: {
+ std::string Values;
+ if (parseCFIEscapeValues(Values))
+ return true;
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(nullptr, Values));
+ break;
+ }
+ default:
+ // TODO: Parse the other CFI operands.
+ llvm_unreachable("The current token should be a cfi operand");
+ }
+ Dest = MachineOperand::CreateCFIIndex(CFIIndex);
+ return false;
+}
+
+bool MIParser::parseIRBlock(BasicBlock *&BB, const Function &F) {
+ switch (Token.kind()) {
+ case MIToken::NamedIRBlock: {
+ BB = dyn_cast_or_null<BasicBlock>(
+ F.getValueSymbolTable()->lookup(Token.stringValue()));
+ if (!BB)
+ return error(Twine("use of undefined IR block '") + Token.range() + "'");
+ break;
+ }
+ case MIToken::IRBlock: {
+ unsigned SlotNumber = 0;
+ if (getUnsigned(SlotNumber))
+ return true;
+ BB = const_cast<BasicBlock *>(getIRBlock(SlotNumber, F));
+ if (!BB)
+ return error(Twine("use of undefined IR block '%ir-block.") +
+ Twine(SlotNumber) + "'");
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be an IR block reference");
+ }
+ return false;
+}
+
+bool MIParser::parseBlockAddressOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_blockaddress));
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ if (Token.isNot(MIToken::GlobalValue) &&
+ Token.isNot(MIToken::NamedGlobalValue))
+ return error("expected a global value");
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ auto *F = dyn_cast<Function>(GV);
+ if (!F)
+ return error("expected an IR function reference");
+ lex();
+ if (expectAndConsume(MIToken::comma))
+ return true;
+ BasicBlock *BB = nullptr;
+ if (Token.isNot(MIToken::IRBlock) && Token.isNot(MIToken::NamedIRBlock))
+ return error("expected an IR block reference");
+ if (parseIRBlock(BB, *F))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateBA(BlockAddress::get(F, BB), /*Offset=*/0);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseIntrinsicOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_intrinsic));
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return error("expected syntax intrinsic(@llvm.whatever)");
+
+ if (Token.isNot(MIToken::NamedGlobalValue))
+ return error("expected syntax intrinsic(@llvm.whatever)");
+
+ std::string Name = std::string(Token.stringValue());
+ lex();
+
+ if (expectAndConsume(MIToken::rparen))
+ return error("expected ')' to terminate intrinsic name");
+
+ // Find out what intrinsic we're dealing with, first try the global namespace
+ // and then the target's private intrinsics if that fails.
+ const TargetIntrinsicInfo *TII = MF.getTarget().getIntrinsicInfo();
+ Intrinsic::ID ID = Function::lookupIntrinsicID(Name);
+ if (ID == Intrinsic::not_intrinsic && TII)
+ ID = static_cast<Intrinsic::ID>(TII->lookupName(Name));
+
+ if (ID == Intrinsic::not_intrinsic)
+ return error("unknown intrinsic name");
+ Dest = MachineOperand::CreateIntrinsicID(ID);
+
+ return false;
+}
+
+bool MIParser::parsePredicateOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_intpred) || Token.is(MIToken::kw_floatpred));
+ bool IsFloat = Token.is(MIToken::kw_floatpred);
+ lex();
+
+ if (expectAndConsume(MIToken::lparen))
+ return error("expected syntax intpred(whatever) or floatpred(whatever");
+
+ if (Token.isNot(MIToken::Identifier))
+ return error("whatever");
+
+ CmpInst::Predicate Pred;
+ if (IsFloat) {
+ Pred = StringSwitch<CmpInst::Predicate>(Token.stringValue())
+ .Case("false", CmpInst::FCMP_FALSE)
+ .Case("oeq", CmpInst::FCMP_OEQ)
+ .Case("ogt", CmpInst::FCMP_OGT)
+ .Case("oge", CmpInst::FCMP_OGE)
+ .Case("olt", CmpInst::FCMP_OLT)
+ .Case("ole", CmpInst::FCMP_OLE)
+ .Case("one", CmpInst::FCMP_ONE)
+ .Case("ord", CmpInst::FCMP_ORD)
+ .Case("uno", CmpInst::FCMP_UNO)
+ .Case("ueq", CmpInst::FCMP_UEQ)
+ .Case("ugt", CmpInst::FCMP_UGT)
+ .Case("uge", CmpInst::FCMP_UGE)
+ .Case("ult", CmpInst::FCMP_ULT)
+ .Case("ule", CmpInst::FCMP_ULE)
+ .Case("une", CmpInst::FCMP_UNE)
+ .Case("true", CmpInst::FCMP_TRUE)
+ .Default(CmpInst::BAD_FCMP_PREDICATE);
+ if (!CmpInst::isFPPredicate(Pred))
+ return error("invalid floating-point predicate");
+ } else {
+ Pred = StringSwitch<CmpInst::Predicate>(Token.stringValue())
+ .Case("eq", CmpInst::ICMP_EQ)
+ .Case("ne", CmpInst::ICMP_NE)
+ .Case("sgt", CmpInst::ICMP_SGT)
+ .Case("sge", CmpInst::ICMP_SGE)
+ .Case("slt", CmpInst::ICMP_SLT)
+ .Case("sle", CmpInst::ICMP_SLE)
+ .Case("ugt", CmpInst::ICMP_UGT)
+ .Case("uge", CmpInst::ICMP_UGE)
+ .Case("ult", CmpInst::ICMP_ULT)
+ .Case("ule", CmpInst::ICMP_ULE)
+ .Default(CmpInst::BAD_ICMP_PREDICATE);
+ if (!CmpInst::isIntPredicate(Pred))
+ return error("invalid integer predicate");
+ }
+
+ lex();
+ Dest = MachineOperand::CreatePredicate(Pred);
+ if (expectAndConsume(MIToken::rparen))
+ return error("predicate should be terminated by ')'.");
+
+ return false;
+}
+
+bool MIParser::parseShuffleMaskOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_shufflemask));
+
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return error("expected syntax shufflemask(<integer or undef>, ...)");
+
+ SmallVector<int, 32> ShufMask;
+ do {
+ if (Token.is(MIToken::kw_undef)) {
+ ShufMask.push_back(-1);
+ } else if (Token.is(MIToken::IntegerLiteral)) {
+ const APSInt &Int = Token.integerValue();
+ ShufMask.push_back(Int.getExtValue());
+ } else
+ return error("expected integer constant");
+
+ lex();
+ } while (consumeIfPresent(MIToken::comma));
+
+ if (expectAndConsume(MIToken::rparen))
+ return error("shufflemask should be terminated by ')'.");
+
+ ArrayRef<int> MaskAlloc = MF.allocateShuffleMask(ShufMask);
+ Dest = MachineOperand::CreateShuffleMask(MaskAlloc);
+ return false;
+}
+
+bool MIParser::parseDbgInstrRefOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_dbg_instr_ref));
+
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return error("expected syntax dbg-instr-ref(<unsigned>, <unsigned>)");
+
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isNegative())
+ return error("expected unsigned integer for instruction index");
+ uint64_t InstrIdx = Token.integerValue().getZExtValue();
+ assert(InstrIdx <= std::numeric_limits<unsigned>::max() &&
+ "Instruction reference's instruction index is too large");
+ lex();
+
+ if (expectAndConsume(MIToken::comma))
+ return error("expected syntax dbg-instr-ref(<unsigned>, <unsigned>)");
+
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isNegative())
+ return error("expected unsigned integer for operand index");
+ uint64_t OpIdx = Token.integerValue().getZExtValue();
+ assert(OpIdx <= std::numeric_limits<unsigned>::max() &&
+ "Instruction reference's operand index is too large");
+ lex();
+
+ if (expectAndConsume(MIToken::rparen))
+ return error("expected syntax dbg-instr-ref(<unsigned>, <unsigned>)");
+
+ Dest = MachineOperand::CreateDbgInstrRef(InstrIdx, OpIdx);
+ return false;
+}
+
+bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_target_index));
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected the name of the target index");
+ int Index = 0;
+ if (PFS.Target.getTargetIndex(Token.stringValue(), Index))
+ return error("use of undefined target index '" + Token.stringValue() + "'");
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateTargetIndex(unsigned(Index), /*Offset=*/0);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) {
+ assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask");
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+
+ uint32_t *Mask = MF.allocateRegMask();
+ do {
+ if (Token.isNot(MIToken::rparen)) {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ Register Reg;
+ if (parseNamedRegister(Reg))
+ return true;
+ lex();
+ Mask[Reg / 32] |= 1U << (Reg % 32);
+ }
+
+ // TODO: Report an error if the same register is used more than once.
+ } while (consumeIfPresent(MIToken::comma));
+
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateRegMask(Mask);
+ return false;
+}
+
+bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_liveout));
+ uint32_t *Mask = MF.allocateRegMask();
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ while (true) {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ Register Reg;
+ if (parseNamedRegister(Reg))
+ return true;
+ lex();
+ Mask[Reg / 32] |= 1U << (Reg % 32);
+ // TODO: Report an error if the same register is used more than once.
+ if (Token.isNot(MIToken::comma))
+ break;
+ lex();
+ }
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateRegLiveOut(Mask);
+ return false;
+}
+
+bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx,
+ MachineOperand &Dest,
+ std::optional<unsigned> &TiedDefIdx) {
+ switch (Token.kind()) {
+ case MIToken::kw_implicit:
+ case MIToken::kw_implicit_define:
+ case MIToken::kw_def:
+ case MIToken::kw_dead:
+ case MIToken::kw_killed:
+ case MIToken::kw_undef:
+ case MIToken::kw_internal:
+ case MIToken::kw_early_clobber:
+ case MIToken::kw_debug_use:
+ case MIToken::kw_renamable:
+ case MIToken::underscore:
+ case MIToken::NamedRegister:
+ case MIToken::VirtualRegister:
+ case MIToken::NamedVirtualRegister:
+ return parseRegisterOperand(Dest, TiedDefIdx);
+ case MIToken::IntegerLiteral:
+ return parseImmediateOperand(Dest);
+ case MIToken::kw_half:
+ case MIToken::kw_float:
+ case MIToken::kw_double:
+ case MIToken::kw_x86_fp80:
+ case MIToken::kw_fp128:
+ case MIToken::kw_ppc_fp128:
+ return parseFPImmediateOperand(Dest);
+ case MIToken::MachineBasicBlock:
+ return parseMBBOperand(Dest);
+ case MIToken::StackObject:
+ return parseStackObjectOperand(Dest);
+ case MIToken::FixedStackObject:
+ return parseFixedStackObjectOperand(Dest);
+ case MIToken::GlobalValue:
+ case MIToken::NamedGlobalValue:
+ return parseGlobalAddressOperand(Dest);
+ case MIToken::ConstantPoolItem:
+ return parseConstantPoolIndexOperand(Dest);
+ case MIToken::JumpTableIndex:
+ return parseJumpTableIndexOperand(Dest);
+ case MIToken::ExternalSymbol:
+ return parseExternalSymbolOperand(Dest);
+ case MIToken::MCSymbol:
+ return parseMCSymbolOperand(Dest);
+ case MIToken::SubRegisterIndex:
+ return parseSubRegisterIndexOperand(Dest);
+ case MIToken::md_diexpr:
+ case MIToken::exclaim:
+ return parseMetadataOperand(Dest);
+ case MIToken::kw_cfi_same_value:
+ case MIToken::kw_cfi_offset:
+ case MIToken::kw_cfi_rel_offset:
+ case MIToken::kw_cfi_def_cfa_register:
+ case MIToken::kw_cfi_def_cfa_offset:
+ case MIToken::kw_cfi_adjust_cfa_offset:
+ case MIToken::kw_cfi_escape:
+ case MIToken::kw_cfi_def_cfa:
+ case MIToken::kw_cfi_llvm_def_aspace_cfa:
+ case MIToken::kw_cfi_register:
+ case MIToken::kw_cfi_remember_state:
+ case MIToken::kw_cfi_restore:
+ case MIToken::kw_cfi_restore_state:
+ case MIToken::kw_cfi_undefined:
+ case MIToken::kw_cfi_window_save:
+ case MIToken::kw_cfi_aarch64_negate_ra_sign_state:
+ return parseCFIOperand(Dest);
+ case MIToken::kw_blockaddress:
+ return parseBlockAddressOperand(Dest);
+ case MIToken::kw_intrinsic:
+ return parseIntrinsicOperand(Dest);
+ case MIToken::kw_target_index:
+ return parseTargetIndexOperand(Dest);
+ case MIToken::kw_liveout:
+ return parseLiveoutRegisterMaskOperand(Dest);
+ case MIToken::kw_floatpred:
+ case MIToken::kw_intpred:
+ return parsePredicateOperand(Dest);
+ case MIToken::kw_shufflemask:
+ return parseShuffleMaskOperand(Dest);
+ case MIToken::kw_dbg_instr_ref:
+ return parseDbgInstrRefOperand(Dest);
+ case MIToken::Error:
+ return true;
+ case MIToken::Identifier:
+ if (const auto *RegMask = PFS.Target.getRegMask(Token.stringValue())) {
+ Dest = MachineOperand::CreateRegMask(RegMask);
+ lex();
+ break;
+ } else if (Token.stringValue() == "CustomRegMask") {
+ return parseCustomRegisterMaskOperand(Dest);
+ } else
+ return parseTypedImmediateOperand(Dest);
+ case MIToken::dot: {
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ if (const auto *Formatter = TII->getMIRFormatter()) {
+ return parseTargetImmMnemonic(OpCode, OpIdx, Dest, *Formatter);
+ }
+ [[fallthrough]];
+ }
+ default:
+ // FIXME: Parse the MCSymbol machine operand.
+ return error("expected a machine operand");
+ }
+ return false;
+}
+
+bool MIParser::parseMachineOperandAndTargetFlags(
+ const unsigned OpCode, const unsigned OpIdx, MachineOperand &Dest,
+ std::optional<unsigned> &TiedDefIdx) {
+ unsigned TF = 0;
+ bool HasTargetFlags = false;
+ if (Token.is(MIToken::kw_target_flags)) {
+ HasTargetFlags = true;
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected the name of the target flag");
+ if (PFS.Target.getDirectTargetFlag(Token.stringValue(), TF)) {
+ if (PFS.Target.getBitmaskTargetFlag(Token.stringValue(), TF))
+ return error("use of undefined target flag '" + Token.stringValue() +
+ "'");
+ }
+ lex();
+ while (Token.is(MIToken::comma)) {
+ lex();
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected the name of the target flag");
+ unsigned BitFlag = 0;
+ if (PFS.Target.getBitmaskTargetFlag(Token.stringValue(), BitFlag))
+ return error("use of undefined target flag '" + Token.stringValue() +
+ "'");
+ // TODO: Report an error when using a duplicate bit target flag.
+ TF |= BitFlag;
+ lex();
+ }
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ }
+ auto Loc = Token.location();
+ if (parseMachineOperand(OpCode, OpIdx, Dest, TiedDefIdx))
+ return true;
+ if (!HasTargetFlags)
+ return false;
+ if (Dest.isReg())
+ return error(Loc, "register operands can't have target flags");
+ Dest.setTargetFlags(TF);
+ return false;
+}
+
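+// Parse an optional offset of the form '+ <imm>' or '- <imm>'.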
+bool MIParser::parseOffset(int64_t &Offset) {
+ if (Token.isNot(MIToken::plus) && Token.isNot(MIToken::minus))
+ return false;
+ StringRef Sign = Token.range();
+ bool IsNegative = Token.is(MIToken::minus);
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after '" + Sign + "'");
+ if (Token.integerValue().getSignificantBits() > 64)
+ return error("expected 64-bit integer (too large)");
+ Offset = Token.integerValue().getExtValue();
+ if (IsNegative)
+ Offset = -Offset;
+ lex();
+ return false;
+}
+
+bool MIParser::parseIRBlockAddressTaken(BasicBlock *&BB) {
+ assert(Token.is(MIToken::kw_ir_block_address_taken));
+ lex();
+ if (Token.isNot(MIToken::IRBlock) && Token.isNot(MIToken::NamedIRBlock))
+ return error("expected basic block after 'ir_block_address_taken'");
+
+ if (parseIRBlock(BB, MF.getFunction()))
+ return true;
+
+ lex();
+ return false;
+}
+
+bool MIParser::parseAlignment(uint64_t &Alignment) {
+ assert(Token.is(MIToken::kw_align) || Token.is(MIToken::kw_basealign));
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected an integer literal after 'align'");
+ if (getUint64(Alignment))
+ return true;
+ lex();
+
+ if (!isPowerOf2_64(Alignment))
+ return error("expected a power-of-2 literal after 'align'");
+
+ return false;
+}
+
+bool MIParser::parseAddrspace(unsigned &Addrspace) {
+ assert(Token.is(MIToken::kw_addrspace));
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected an integer literal after 'addrspace'");
+ if (getUnsigned(Addrspace))
+ return true;
+ lex();
+ return false;
+}
+
+bool MIParser::parseOperandsOffset(MachineOperand &Op) {
+ int64_t Offset = 0;
+ if (parseOffset(Offset))
+ return true;
+ Op.setOffset(Offset);
+ return false;
+}
+
+static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS,
+ const Value *&V, ErrorCallbackType ErrCB) {
+ switch (Token.kind()) {
+ case MIToken::NamedIRValue: {
+ V = PFS.MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue());
+ break;
+ }
+ case MIToken::IRValue: {
+ unsigned SlotNumber = 0;
+ if (getUnsigned(Token, SlotNumber, ErrCB))
+ return true;
+ V = PFS.getIRValue(SlotNumber);
+ break;
+ }
+ case MIToken::NamedGlobalValue:
+ case MIToken::GlobalValue: {
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(Token, PFS, GV, ErrCB))
+ return true;
+ V = GV;
+ break;
+ }
+ case MIToken::QuotedIRValue: {
+ const Constant *C = nullptr;
+ if (parseIRConstant(Token.location(), Token.stringValue(), PFS, C, ErrCB))
+ return true;
+ V = C;
+ break;
+ }
+ case MIToken::kw_unknown_address:
+ V = nullptr;
+ return false;
+ default:
+ llvm_unreachable("The current token should be an IR block reference");
+ }
+ if (!V)
+    return ErrCB(Token.location(),
+                 Twine("use of undefined IR value '") + Token.range() + "'");
+ return false;
+}
+
+bool MIParser::parseIRValue(const Value *&V) {
+ return ::parseIRValue(
+ Token, PFS, V, [this](StringRef::iterator Loc, const Twine &Msg) -> bool {
+ return error(Loc, Msg);
+ });
+}
+
+bool MIParser::getUint64(uint64_t &Result) {
+ if (Token.hasIntegerValue()) {
+ if (Token.integerValue().getActiveBits() > 64)
+ return error("expected 64-bit integer (too large)");
+ Result = Token.integerValue().getZExtValue();
+ return false;
+ }
+ if (Token.is(MIToken::HexLiteral)) {
+ APInt A;
+ if (getHexUint(A))
+ return true;
+ if (A.getBitWidth() > 64)
+ return error("expected 64-bit integer (too large)");
+ Result = A.getZExtValue();
+ return false;
+ }
+ return true;
+}
+
+bool MIParser::getHexUint(APInt &Result) {
+ return ::getHexUint(Token, Result);
+}
+
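+/// Parse a single memory operand flag, including target-specific MMO flags
+/// given as string constants, rejecting duplicate flags.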
+bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) {
+ const auto OldFlags = Flags;
+ switch (Token.kind()) {
+ case MIToken::kw_volatile:
+ Flags |= MachineMemOperand::MOVolatile;
+ break;
+ case MIToken::kw_non_temporal:
+ Flags |= MachineMemOperand::MONonTemporal;
+ break;
+ case MIToken::kw_dereferenceable:
+ Flags |= MachineMemOperand::MODereferenceable;
+ break;
+ case MIToken::kw_invariant:
+ Flags |= MachineMemOperand::MOInvariant;
+ break;
+ case MIToken::StringConstant: {
+ MachineMemOperand::Flags TF;
+ if (PFS.Target.getMMOTargetFlag(Token.stringValue(), TF))
+ return error("use of undefined target MMO flag '" + Token.stringValue() +
+ "'");
+ Flags |= TF;
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be a memory operand flag");
+ }
+ if (OldFlags == Flags)
+ // We know that the same flag is specified more than once when the flags
+ // weren't modified.
+ return error("duplicate '" + Token.stringValue() + "' memory operand flag");
+ lex();
+ return false;
+}
+
+bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) {
+ switch (Token.kind()) {
+ case MIToken::kw_stack:
+ PSV = MF.getPSVManager().getStack();
+ break;
+ case MIToken::kw_got:
+ PSV = MF.getPSVManager().getGOT();
+ break;
+ case MIToken::kw_jump_table:
+ PSV = MF.getPSVManager().getJumpTable();
+ break;
+ case MIToken::kw_constant_pool:
+ PSV = MF.getPSVManager().getConstantPool();
+ break;
+ case MIToken::FixedStackObject: {
+ int FI;
+ if (parseFixedStackFrameIndex(FI))
+ return true;
+ PSV = MF.getPSVManager().getFixedStack(FI);
+ // The token was already consumed, so use return here instead of break.
+ return false;
+ }
+ case MIToken::StackObject: {
+ int FI;
+ if (parseStackFrameIndex(FI))
+ return true;
+ PSV = MF.getPSVManager().getFixedStack(FI);
+ // The token was already consumed, so use return here instead of break.
+ return false;
+ }
+ case MIToken::kw_call_entry:
+ lex();
+ switch (Token.kind()) {
+ case MIToken::GlobalValue:
+ case MIToken::NamedGlobalValue: {
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ PSV = MF.getPSVManager().getGlobalValueCallEntry(GV);
+ break;
+ }
+ case MIToken::ExternalSymbol:
+ PSV = MF.getPSVManager().getExternalSymbolCallEntry(
+ MF.createExternalSymbolName(Token.stringValue()));
+ break;
+ default:
+ return error(
+ "expected a global value or an external symbol after 'call-entry'");
+ }
+ break;
+ case MIToken::kw_custom: {
+ lex();
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ if (const auto *Formatter = TII->getMIRFormatter()) {
+ if (Formatter->parseCustomPseudoSourceValue(
+ Token.stringValue(), MF, PFS, PSV,
+ [this](StringRef::iterator Loc, const Twine &Msg) -> bool {
+ return error(Loc, Msg);
+ }))
+ return true;
+ } else
+ return error("unable to parse target custom pseudo source value");
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be pseudo source value");
+ }
+ lex();
+ return false;
+}
+
+bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) {
+ if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) ||
+ Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) ||
+ Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) ||
+ Token.is(MIToken::kw_call_entry) || Token.is(MIToken::kw_custom)) {
+ const PseudoSourceValue *PSV = nullptr;
+ if (parseMemoryPseudoSourceValue(PSV))
+ return true;
+ int64_t Offset = 0;
+ if (parseOffset(Offset))
+ return true;
+ Dest = MachinePointerInfo(PSV, Offset);
+ return false;
+ }
+ if (Token.isNot(MIToken::NamedIRValue) && Token.isNot(MIToken::IRValue) &&
+ Token.isNot(MIToken::GlobalValue) &&
+ Token.isNot(MIToken::NamedGlobalValue) &&
+ Token.isNot(MIToken::QuotedIRValue) &&
+ Token.isNot(MIToken::kw_unknown_address))
+ return error("expected an IR value reference");
+ const Value *V = nullptr;
+ if (parseIRValue(V))
+ return true;
+ if (V && !V->getType()->isPointerTy())
+ return error("expected a pointer IR value");
+ lex();
+ int64_t Offset = 0;
+ if (parseOffset(Offset))
+ return true;
+ Dest = MachinePointerInfo(V, Offset);
+ return false;
+}
+
+bool MIParser::parseOptionalScope(LLVMContext &Context,
+ SyncScope::ID &SSID) {
+ SSID = SyncScope::System;
+ if (Token.is(MIToken::Identifier) && Token.stringValue() == "syncscope") {
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return error("expected '(' in syncscope");
+
+ std::string SSN;
+ if (parseStringConstant(SSN))
+ return true;
+
+ SSID = Context.getOrInsertSyncScopeID(SSN);
+ if (expectAndConsume(MIToken::rparen))
+ return error("expected ')' in syncscope");
+ }
+
+ return false;
+}
+
+bool MIParser::parseOptionalAtomicOrdering(AtomicOrdering &Order) {
+ Order = AtomicOrdering::NotAtomic;
+ if (Token.isNot(MIToken::Identifier))
+ return false;
+
+ Order = StringSwitch<AtomicOrdering>(Token.stringValue())
+ .Case("unordered", AtomicOrdering::Unordered)
+ .Case("monotonic", AtomicOrdering::Monotonic)
+ .Case("acquire", AtomicOrdering::Acquire)
+ .Case("release", AtomicOrdering::Release)
+ .Case("acq_rel", AtomicOrdering::AcquireRelease)
+ .Case("seq_cst", AtomicOrdering::SequentiallyConsistent)
+ .Default(AtomicOrdering::NotAtomic);
+
+ if (Order != AtomicOrdering::NotAtomic) {
+ lex();
+ return false;
+ }
+
+ return error("expected an atomic scope, ordering or a size specification");
+}
+
+bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
+ while (Token.isMemoryOperandFlag()) {
+ if (parseMemoryOperandFlag(Flags))
+ return true;
+ }
+ if (Token.isNot(MIToken::Identifier) ||
+ (Token.stringValue() != "load" && Token.stringValue() != "store"))
+ return error("expected 'load' or 'store' memory operation");
+ if (Token.stringValue() == "load")
+ Flags |= MachineMemOperand::MOLoad;
+ else
+ Flags |= MachineMemOperand::MOStore;
+ lex();
+
+ // Optional 'store' for operands that both load and store.
+ if (Token.is(MIToken::Identifier) && Token.stringValue() == "store") {
+ Flags |= MachineMemOperand::MOStore;
+ lex();
+ }
+
+ // Optional synchronization scope.
+ SyncScope::ID SSID;
+ if (parseOptionalScope(MF.getFunction().getContext(), SSID))
+ return true;
+
+ // Up to two atomic orderings (cmpxchg provides guarantees on failure).
+ AtomicOrdering Order, FailureOrder;
+ if (parseOptionalAtomicOrdering(Order))
+ return true;
+
+ if (parseOptionalAtomicOrdering(FailureOrder))
+ return true;
+
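+ // The memory size is given either as a byte count, as 'unknown-size', or as
+ // a low-level type wrapped in parentheses, e.g. '(s64)'.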
+ LLT MemoryType;
+ if (Token.isNot(MIToken::IntegerLiteral) &&
+ Token.isNot(MIToken::kw_unknown_size) &&
+ Token.isNot(MIToken::lparen))
+ return error("expected memory LLT, the size integer literal or 'unknown-size' after "
+ "memory operation");
+
+ uint64_t Size = MemoryLocation::UnknownSize;
+ if (Token.is(MIToken::IntegerLiteral)) {
+ if (getUint64(Size))
+ return true;
+
+ // Convert from bytes to bits for storage.
+ MemoryType = LLT::scalar(8 * Size);
+ lex();
+ } else if (Token.is(MIToken::kw_unknown_size)) {
+ Size = MemoryLocation::UnknownSize;
+ lex();
+ } else {
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ if (parseLowLevelType(Token.location(), MemoryType))
+ return true;
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+
+ Size = MemoryType.getSizeInBytes();
+ }
+
+ MachinePointerInfo Ptr = MachinePointerInfo();
+ if (Token.is(MIToken::Identifier)) {
+ const char *Word =
+ ((Flags & MachineMemOperand::MOLoad) &&
+ (Flags & MachineMemOperand::MOStore))
+ ? "on"
+ : Flags & MachineMemOperand::MOLoad ? "from" : "into";
+ if (Token.stringValue() != Word)
+ return error(Twine("expected '") + Word + "'");
+ lex();
+
+ if (parseMachinePointerInfo(Ptr))
+ return true;
+ }
+ uint64_t BaseAlignment =
+ (Size != MemoryLocation::UnknownSize ? PowerOf2Ceil(Size) : 1);
+ AAMDNodes AAInfo;
+ MDNode *Range = nullptr;
+ while (consumeIfPresent(MIToken::comma)) {
+ switch (Token.kind()) {
+ case MIToken::kw_align: {
+ // align is printed if it is different than size.
+ uint64_t Alignment;
+ if (parseAlignment(Alignment))
+ return true;
+ if (Ptr.Offset & (Alignment - 1)) {
+ // MachineMemOperand::getAlign never returns a value greater than the
+ // alignment of offset, so this just guards against hand-written MIR
+ // that specifies a large "align" value when it should probably use
+ // "basealign" instead.
+ return error("specified alignment is more aligned than offset");
+ }
+ BaseAlignment = Alignment;
+ break;
+ }
+ case MIToken::kw_basealign:
+ // basealign is printed if it is different than align.
+ if (parseAlignment(BaseAlignment))
+ return true;
+ break;
+ case MIToken::kw_addrspace:
+ if (parseAddrspace(Ptr.AddrSpace))
+ return true;
+ break;
+ case MIToken::md_tbaa:
+ lex();
+ if (parseMDNode(AAInfo.TBAA))
+ return true;
+ break;
+ case MIToken::md_alias_scope:
+ lex();
+ if (parseMDNode(AAInfo.Scope))
+ return true;
+ break;
+ case MIToken::md_noalias:
+ lex();
+ if (parseMDNode(AAInfo.NoAlias))
+ return true;
+ break;
+ case MIToken::md_range:
+ lex();
+ if (parseMDNode(Range))
+ return true;
+ break;
+ // TODO: Report an error on duplicate metadata nodes.
+ default:
+ return error("expected 'align' or '!tbaa' or '!alias.scope' or "
+ "'!noalias' or '!range'");
+ }
+ }
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MF.getMachineMemOperand(Ptr, Flags, MemoryType, Align(BaseAlignment),
+ AAInfo, Range, SSID, Order, FailureOrder);
+ return false;
+}
+
+bool MIParser::parsePreOrPostInstrSymbol(MCSymbol *&Symbol) {
+ assert((Token.is(MIToken::kw_pre_instr_symbol) ||
+ Token.is(MIToken::kw_post_instr_symbol)) &&
+ "Invalid token for a pre- post-instruction symbol!");
+ lex();
+ if (Token.isNot(MIToken::MCSymbol))
+ return error("expected a symbol after 'pre-instr-symbol'");
+ Symbol = getOrCreateMCSymbol(Token.stringValue());
+ lex();
+ if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) ||
+ Token.is(MIToken::lbrace))
+ return false;
+ if (Token.isNot(MIToken::comma))
+ return error("expected ',' before the next machine operand");
+ lex();
+ return false;
+}
+
+bool MIParser::parseHeapAllocMarker(MDNode *&Node) {
+ assert(Token.is(MIToken::kw_heap_alloc_marker) &&
+ "Invalid token for a heap alloc marker!");
+ lex();
+ if (parseMDNode(Node))
+ return true;
+ if (!Node)
+ return error("expected a MDNode after 'heap-alloc-marker'");
+ if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) ||
+ Token.is(MIToken::lbrace))
+ return false;
+ if (Token.isNot(MIToken::comma))
+ return error("expected ',' before the next machine operand");
+ lex();
+ return false;
+}
+
+bool MIParser::parsePCSections(MDNode *&Node) {
+ assert(Token.is(MIToken::kw_pcsections) &&
+ "Invalid token for a PC sections!");
+ lex();
+ if (parseMDNode(Node))
+ return true;
+ if (!Node)
+ return error("expected a MDNode after 'pcsections'");
+ if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) ||
+ Token.is(MIToken::lbrace))
+ return false;
+ if (Token.isNot(MIToken::comma))
+ return error("expected ',' before the next machine operand");
+ lex();
+ return false;
+}
+
+static void initSlots2BasicBlocks(
+ const Function &F,
+ DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) {
+ ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false);
+ MST.incorporateFunction(F);
+ for (const auto &BB : F) {
+ if (BB.hasName())
+ continue;
+ int Slot = MST.getLocalSlot(&BB);
+ if (Slot == -1)
+ continue;
+ Slots2BasicBlocks.insert(std::make_pair(unsigned(Slot), &BB));
+ }
+}
+
+static const BasicBlock *getIRBlockFromSlot(
+ unsigned Slot,
+ const DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) {
+ return Slots2BasicBlocks.lookup(Slot);
+}
+
+const BasicBlock *MIParser::getIRBlock(unsigned Slot) {
+ if (Slots2BasicBlocks.empty())
+ initSlots2BasicBlocks(MF.getFunction(), Slots2BasicBlocks);
+ return getIRBlockFromSlot(Slot, Slots2BasicBlocks);
+}
+
+const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) {
+ if (&F == &MF.getFunction())
+ return getIRBlock(Slot);
+ DenseMap<unsigned, const BasicBlock *> CustomSlots2BasicBlocks;
+ initSlots2BasicBlocks(F, CustomSlots2BasicBlocks);
+ return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks);
+}
+
+MCSymbol *MIParser::getOrCreateMCSymbol(StringRef Name) {
+ // FIXME: Currently we can't recognize temporary or local symbols and call all
+ // of the appropriate forms to create them. However, this handles basic cases
+ // well, as most of the special aspects are recognized by a prefix on their
+ // name, and the input names should already be unique. For test cases, keeping
+ // the symbol name out of the symbol table isn't terribly important.
+ return MF.getContext().getOrCreateSymbol(Name);
+}
+
+bool MIParser::parseStringConstant(std::string &Result) {
+ if (Token.isNot(MIToken::StringConstant))
+ return error("expected string constant");
+ Result = std::string(Token.stringValue());
+ lex();
+ return false;
+}
+
+bool llvm::parseMachineBasicBlockDefinitions(PerFunctionMIParsingState &PFS,
+ StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseBasicBlockDefinitions(PFS.MBBSlots);
+}
+
+bool llvm::parseMachineInstructions(PerFunctionMIParsingState &PFS,
+ StringRef Src, SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseBasicBlocks();
+}
+
+bool llvm::parseMBBReference(PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneMBB(MBB);
+}
+
+bool llvm::parseRegisterReference(PerFunctionMIParsingState &PFS,
+ Register &Reg, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneRegister(Reg);
+}
+
+bool llvm::parseNamedRegisterReference(PerFunctionMIParsingState &PFS,
+ Register &Reg, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneNamedRegister(Reg);
+}
+
+bool llvm::parseVirtualRegisterReference(PerFunctionMIParsingState &PFS,
+ VRegInfo *&Info, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneVirtualRegister(Info);
+}
+
+bool llvm::parseStackObjectReference(PerFunctionMIParsingState &PFS,
+ int &FI, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneStackObject(FI);
+}
+
+bool llvm::parseMDNode(PerFunctionMIParsingState &PFS,
+ MDNode *&Node, StringRef Src, SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node);
+}
+
+bool llvm::parseMachineMetadata(PerFunctionMIParsingState &PFS, StringRef Src,
+ SMRange SrcRange, SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src, SrcRange).parseMachineMetadata();
+}
+
+bool MIRFormatter::parseIRValue(StringRef Src, MachineFunction &MF,
+ PerFunctionMIParsingState &PFS, const Value *&V,
+ ErrorCallbackType ErrorCallback) {
+ MIToken Token;
+ Src = lexMIToken(Src, Token, [&](StringRef::iterator Loc, const Twine &Msg) {
+ ErrorCallback(Loc, Msg);
+ });
+ V = nullptr;
+
+ return ::parseIRValue(Token, PFS, V, ErrorCallback);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
new file mode 100644
index 000000000000..b2e570c5e67e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -0,0 +1,1133 @@
+//===- MIRParser.cpp - MIR serialization format parser implementation -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the class that parses the optional LLVM IR and machine
+// functions that are stored in MIR files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/AsmParser/SlotMapping.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
+
+using namespace llvm;
+
+namespace llvm {
+class MDNode;
+class RegisterBank;
+
+/// This class implements the parsing of LLVM IR that's embedded inside a MIR
+/// file.
+class MIRParserImpl {
+ SourceMgr SM;
+ LLVMContext &Context;
+ yaml::Input In;
+ StringRef Filename;
+ SlotMapping IRSlots;
+ std::unique_ptr<PerTargetMIParsingState> Target;
+
+ /// True when the MIR file doesn't have LLVM IR. Dummy IR functions are
+ /// created and inserted into the given module when this is true.
+ bool NoLLVMIR = false;
+ /// True when a well-formed MIR file does not contain any MIR/machine function
+ /// parts.
+ bool NoMIRDocuments = false;
+
+ std::function<void(Function &)> ProcessIRFunction;
+
+public:
+ MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename,
+ LLVMContext &Context,
+ std::function<void(Function &)> ProcessIRFunction);
+
+ void reportDiagnostic(const SMDiagnostic &Diag);
+
+ /// Report an error with the given message at unknown location.
+ ///
+ /// Always returns true.
+ bool error(const Twine &Message);
+
+ /// Report an error with the given message at the given location.
+ ///
+ /// Always returns true.
+ bool error(SMLoc Loc, const Twine &Message);
+
+ /// Report a given error with the location translated from the location in an
+ /// embedded string literal to a location in the MIR file.
+ ///
+ /// Always returns true.
+ bool error(const SMDiagnostic &Error, SMRange SourceRange);
+
+ /// Try to parse the optional LLVM module and the machine functions in the MIR
+ /// file.
+ ///
+ /// Return null if an error occurred.
+ std::unique_ptr<Module>
+ parseIRModule(DataLayoutCallbackTy DataLayoutCallback);
+
+ /// Create an empty function with the given name.
+ Function *createDummyFunction(StringRef Name, Module &M);
+
+ bool parseMachineFunctions(Module &M, MachineModuleInfo &MMI);
+
+ /// Parse the machine function in the current YAML document.
+ ///
+ /// Return true if an error occurred.
+ bool parseMachineFunction(Module &M, MachineModuleInfo &MMI);
+
+ /// Initialize the machine function to the state that's described in the MIR
+ /// file.
+ ///
+ /// Return true if an error occurred.
+ bool initializeMachineFunction(const yaml::MachineFunction &YamlMF,
+ MachineFunction &MF);
+
+ bool parseRegisterInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
+
+ bool setupRegisterInfo(const PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
+
+ bool initializeFrameInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
+
+ bool initializeCallSiteInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
+
+ bool parseCalleeSavedRegister(PerFunctionMIParsingState &PFS,
+ std::vector<CalleeSavedInfo> &CSIInfo,
+ const yaml::StringValue &RegisterSource,
+ bool IsRestored, int FrameIdx);
+
+ struct VarExprLoc {
+ DILocalVariable *DIVar = nullptr;
+ DIExpression *DIExpr = nullptr;
+ DILocation *DILoc = nullptr;
+ };
+
+ std::optional<VarExprLoc> parseVarExprLoc(PerFunctionMIParsingState &PFS,
+ const yaml::StringValue &VarStr,
+ const yaml::StringValue &ExprStr,
+ const yaml::StringValue &LocStr);
+ template <typename T>
+ bool parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
+ const T &Object,
+ int FrameIdx);
+
+ bool initializeConstantPool(PerFunctionMIParsingState &PFS,
+ MachineConstantPool &ConstantPool,
+ const yaml::MachineFunction &YamlMF);
+
+ bool initializeJumpTableInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineJumpTable &YamlJTI);
+
+ bool parseMachineMetadataNodes(PerFunctionMIParsingState &PFS,
+ MachineFunction &MF,
+ const yaml::MachineFunction &YMF);
+
+private:
+ bool parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node,
+ const yaml::StringValue &Source);
+
+ bool parseMBBReference(PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB,
+ const yaml::StringValue &Source);
+
+ bool parseMachineMetadata(PerFunctionMIParsingState &PFS,
+ const yaml::StringValue &Source);
+
+ /// Return a MIR diagnostic converted from an MI string diagnostic.
+ SMDiagnostic diagFromMIStringDiag(const SMDiagnostic &Error,
+ SMRange SourceRange);
+
+ /// Return a MIR diagnostic converted from a diagnostic located in a YAML
+ /// block scalar string.
+ SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error,
+ SMRange SourceRange);
+
+ void computeFunctionProperties(MachineFunction &MF);
+
+ void setupDebugValueTracking(MachineFunction &MF,
+ PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF);
+};
+
+} // end namespace llvm
+
+static void handleYAMLDiag(const SMDiagnostic &Diag, void *Context) {
+ reinterpret_cast<MIRParserImpl *>(Context)->reportDiagnostic(Diag);
+}
+
+MIRParserImpl::MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents,
+ StringRef Filename, LLVMContext &Context,
+ std::function<void(Function &)> Callback)
+ : Context(Context),
+ In(SM.getMemoryBuffer(SM.AddNewSourceBuffer(std::move(Contents), SMLoc()))
+ ->getBuffer(),
+ nullptr, handleYAMLDiag, this),
+ Filename(Filename), ProcessIRFunction(Callback) {
+ In.setContext(&In);
+}
+
+bool MIRParserImpl::error(const Twine &Message) {
+ Context.diagnose(DiagnosticInfoMIRParser(
+ DS_Error, SMDiagnostic(Filename, SourceMgr::DK_Error, Message.str())));
+ return true;
+}
+
+bool MIRParserImpl::error(SMLoc Loc, const Twine &Message) {
+ Context.diagnose(DiagnosticInfoMIRParser(
+ DS_Error, SM.GetMessage(Loc, SourceMgr::DK_Error, Message)));
+ return true;
+}
+
+bool MIRParserImpl::error(const SMDiagnostic &Error, SMRange SourceRange) {
+ assert(Error.getKind() == SourceMgr::DK_Error && "Expected an error");
+ reportDiagnostic(diagFromMIStringDiag(Error, SourceRange));
+ return true;
+}
+
+void MIRParserImpl::reportDiagnostic(const SMDiagnostic &Diag) {
+ DiagnosticSeverity Kind;
+ switch (Diag.getKind()) {
+ case SourceMgr::DK_Error:
+ Kind = DS_Error;
+ break;
+ case SourceMgr::DK_Warning:
+ Kind = DS_Warning;
+ break;
+ case SourceMgr::DK_Note:
+ Kind = DS_Note;
+ break;
+ case SourceMgr::DK_Remark:
+ llvm_unreachable("remark unexpected");
+ break;
+ }
+ Context.diagnose(DiagnosticInfoMIRParser(Kind, Diag));
+}
+
+std::unique_ptr<Module>
+MIRParserImpl::parseIRModule(DataLayoutCallbackTy DataLayoutCallback) {
+ if (!In.setCurrentDocument()) {
+ if (In.error())
+ return nullptr;
+ // Create an empty module when the MIR file is empty.
+ NoMIRDocuments = true;
+ auto M = std::make_unique<Module>(Filename, Context);
+ if (auto LayoutOverride =
+ DataLayoutCallback(M->getTargetTriple(), M->getDataLayoutStr()))
+ M->setDataLayout(*LayoutOverride);
+ return M;
+ }
+
+ std::unique_ptr<Module> M;
+ // Parse the block scalar manually so that we can return a unique pointer
+ // without having to go through YAML traits.
+ if (const auto *BSN =
+ dyn_cast_or_null<yaml::BlockScalarNode>(In.getCurrentNode())) {
+ SMDiagnostic Error;
+ M = parseAssembly(MemoryBufferRef(BSN->getValue(), Filename), Error,
+ Context, &IRSlots, DataLayoutCallback);
+ if (!M) {
+ reportDiagnostic(diagFromBlockStringDiag(Error, BSN->getSourceRange()));
+ return nullptr;
+ }
+ In.nextDocument();
+ if (!In.setCurrentDocument())
+ NoMIRDocuments = true;
+ } else {
+ // Create a new, empty module.
+ M = std::make_unique<Module>(Filename, Context);
+ if (auto LayoutOverride =
+ DataLayoutCallback(M->getTargetTriple(), M->getDataLayoutStr()))
+ M->setDataLayout(*LayoutOverride);
+ NoLLVMIR = true;
+ }
+ return M;
+}
+
+bool MIRParserImpl::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) {
+ if (NoMIRDocuments)
+ return false;
+
+ // Parse the machine functions.
+ do {
+ if (parseMachineFunction(M, MMI))
+ return true;
+ In.nextDocument();
+ } while (In.setCurrentDocument());
+
+ return false;
+}
+
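+// Build a trivial 'void()' IR function containing only an 'unreachable' so
+// that machine functions parsed from an IR-less MIR file still have an IR
+// anchor to hang off of.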
+Function *MIRParserImpl::createDummyFunction(StringRef Name, Module &M) {
+ auto &Context = M.getContext();
+ Function *F =
+ Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
+ Function::ExternalLinkage, Name, M);
+ BasicBlock *BB = BasicBlock::Create(Context, "entry", F);
+ new UnreachableInst(Context, BB);
+
+ if (ProcessIRFunction)
+ ProcessIRFunction(*F);
+
+ return F;
+}
+
+bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI) {
+ // Parse the yaml.
+ yaml::MachineFunction YamlMF;
+ yaml::EmptyContext Ctx;
+
+ const LLVMTargetMachine &TM = MMI.getTarget();
+ YamlMF.MachineFuncInfo = std::unique_ptr<yaml::MachineFunctionInfo>(
+ TM.createDefaultFuncInfoYAML());
+
+ yaml::yamlize(In, YamlMF, false, Ctx);
+ if (In.error())
+ return true;
+
+ // Search for the corresponding IR function.
+ StringRef FunctionName = YamlMF.Name;
+ Function *F = M.getFunction(FunctionName);
+ if (!F) {
+ if (NoLLVMIR) {
+ F = createDummyFunction(FunctionName, M);
+ } else {
+ return error(Twine("function '") + FunctionName +
+ "' isn't defined in the provided LLVM IR");
+ }
+ }
+ if (MMI.getMachineFunction(*F) != nullptr)
+ return error(Twine("redefinition of machine function '") + FunctionName +
+ "'");
+
+ // Create the MachineFunction.
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+ if (initializeMachineFunction(YamlMF, MF))
+ return true;
+
+ return false;
+}
+
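+/// Return true if the machine function is still in SSA form, i.e. every
+/// virtual register has at most one definition and no subregister
+/// definitions.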
+static bool isSSA(const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (!MRI.hasOneDef(Reg) && !MRI.def_empty(Reg))
+ return false;
+
+ // Subregister defs are invalid in SSA.
+ const MachineOperand *RegDef = MRI.getOneDef(Reg);
+ if (RegDef && RegDef->getSubReg() != 0)
+ return false;
+ }
+ return true;
+}
+
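+// Infer the function properties (NoPHIs, TiedOpsRewritten, IsSSA, NoVRegs)
+// from the parsed machine code rather than requiring them to be spelled out
+// in the MIR file.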
+void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) {
+ MachineFunctionProperties &Properties = MF.getProperties();
+
+ bool HasPHI = false;
+ bool HasInlineAsm = false;
+ bool AllTiedOpsRewritten = true, HasTiedOps = false;
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ if (MI.isPHI())
+ HasPHI = true;
+ if (MI.isInlineAsm())
+ HasInlineAsm = true;
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ unsigned DefIdx;
+ if (MO.isUse() && MI.isRegTiedToDefOperand(I, &DefIdx)) {
+ HasTiedOps = true;
+ if (MO.getReg() != MI.getOperand(DefIdx).getReg())
+ AllTiedOpsRewritten = false;
+ }
+ }
+ }
+ }
+ if (!HasPHI)
+ Properties.set(MachineFunctionProperties::Property::NoPHIs);
+ MF.setHasInlineAsm(HasInlineAsm);
+
+ if (HasTiedOps && AllTiedOpsRewritten)
+ Properties.set(MachineFunctionProperties::Property::TiedOpsRewritten);
+
+ if (isSSA(MF))
+ Properties.set(MachineFunctionProperties::Property::IsSSA);
+ else
+ Properties.reset(MachineFunctionProperties::Property::IsSSA);
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MRI.getNumVirtRegs() == 0)
+ Properties.set(MachineFunctionProperties::Property::NoVRegs);
+}
+
+bool MIRParserImpl::initializeCallSiteInfo(
+ PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF) {
+ MachineFunction &MF = PFS.MF;
+ SMDiagnostic Error;
+ const LLVMTargetMachine &TM = MF.getTarget();
+ for (auto &YamlCSInfo : YamlMF.CallSitesInfo) {
+ yaml::CallSiteInfo::MachineInstrLoc MILoc = YamlCSInfo.CallLocation;
+ if (MILoc.BlockNum >= MF.size())
+ return error(Twine(MF.getName()) +
+ Twine(" call instruction block out of range.") +
+ " Unable to reference bb:" + Twine(MILoc.BlockNum));
+ auto CallB = std::next(MF.begin(), MILoc.BlockNum);
+ if (MILoc.Offset >= CallB->size())
+ return error(Twine(MF.getName()) +
+ Twine(" call instruction offset out of range.") +
+ " Unable to reference instruction at bb: " +
+ Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset));
+ auto CallI = std::next(CallB->instr_begin(), MILoc.Offset);
+ if (!CallI->isCall(MachineInstr::IgnoreBundle))
+ return error(Twine(MF.getName()) +
+ Twine(" call site info should reference call "
+ "instruction. Instruction at bb:") +
+ Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset) +
+ " is not a call instruction");
+ MachineFunction::CallSiteInfo CSInfo;
+ for (auto ArgRegPair : YamlCSInfo.ArgForwardingRegs) {
+ Register Reg;
+ if (parseNamedRegisterReference(PFS, Reg, ArgRegPair.Reg.Value, Error))
+ return error(Error, ArgRegPair.Reg.SourceRange);
+ CSInfo.emplace_back(Reg, ArgRegPair.ArgNo);
+ }
+
+ if (TM.Options.EmitCallSiteInfo)
+ MF.addCallArgsForwardingRegs(&*CallI, std::move(CSInfo));
+ }
+
+ if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo)
+ return error(Twine("Call site info provided but not used"));
+ return false;
+}
+
+void MIRParserImpl::setupDebugValueTracking(
+ MachineFunction &MF, PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ // Compute the value of the "next instruction number" field.
+ unsigned MaxInstrNum = 0;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ MaxInstrNum = std::max((unsigned)MI.peekDebugInstrNum(), MaxInstrNum);
+ MF.setDebugInstrNumberingCount(MaxInstrNum);
+
+ // Load any substitutions.
+ for (const auto &Sub : YamlMF.DebugValueSubstitutions) {
+ MF.makeDebugValueSubstitution({Sub.SrcInst, Sub.SrcOp},
+ {Sub.DstInst, Sub.DstOp}, Sub.Subreg);
+ }
+
+ // Flag for whether we're supposed to be using DBG_INSTR_REF.
+ MF.setUseDebugInstrRef(YamlMF.UseDebugInstrRef);
+}
+
+bool
+MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
+ MachineFunction &MF) {
+ // TODO: Recreate the machine function.
+ if (Target) {
+ // Avoid clearing state if we're using the same subtarget again.
+ Target->setTarget(MF.getSubtarget());
+ } else {
+ Target.reset(new PerTargetMIParsingState(MF.getSubtarget()));
+ }
+
+ MF.setAlignment(YamlMF.Alignment.valueOrOne());
+ MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
+ MF.setHasWinCFI(YamlMF.HasWinCFI);
+
+ MF.setCallsEHReturn(YamlMF.CallsEHReturn);
+ MF.setCallsUnwindInit(YamlMF.CallsUnwindInit);
+ MF.setHasEHCatchret(YamlMF.HasEHCatchret);
+ MF.setHasEHScopes(YamlMF.HasEHScopes);
+ MF.setHasEHFunclets(YamlMF.HasEHFunclets);
+ MF.setIsOutlined(YamlMF.IsOutlined);
+
+ if (YamlMF.Legalized)
+ MF.getProperties().set(MachineFunctionProperties::Property::Legalized);
+ if (YamlMF.RegBankSelected)
+ MF.getProperties().set(
+ MachineFunctionProperties::Property::RegBankSelected);
+ if (YamlMF.Selected)
+ MF.getProperties().set(MachineFunctionProperties::Property::Selected);
+ if (YamlMF.FailedISel)
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+ if (YamlMF.FailsVerification)
+ MF.getProperties().set(
+ MachineFunctionProperties::Property::FailsVerification);
+ if (YamlMF.TracksDebugUserValues)
+ MF.getProperties().set(
+ MachineFunctionProperties::Property::TracksDebugUserValues);
+
+ PerFunctionMIParsingState PFS(MF, SM, IRSlots, *Target);
+ if (parseRegisterInfo(PFS, YamlMF))
+ return true;
+ if (!YamlMF.Constants.empty()) {
+ auto *ConstantPool = MF.getConstantPool();
+ assert(ConstantPool && "Constant pool must be created");
+ if (initializeConstantPool(PFS, *ConstantPool, YamlMF))
+ return true;
+ }
+ if (!YamlMF.MachineMetadataNodes.empty() &&
+ parseMachineMetadataNodes(PFS, MF, YamlMF))
+ return true;
+
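+ // The function body is parsed in two passes over the same block string:
+ // first the basic block definitions (so later MBB references resolve), and
+ // then, further below, the machine instructions themselves.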
+ StringRef BlockStr = YamlMF.Body.Value.Value;
+ SMDiagnostic Error;
+ SourceMgr BlockSM;
+ BlockSM.AddNewSourceBuffer(
+ MemoryBuffer::getMemBuffer(BlockStr, "", /*RequiresNullTerminator=*/false),
+ SMLoc());
+ PFS.SM = &BlockSM;
+ if (parseMachineBasicBlockDefinitions(PFS, BlockStr, Error)) {
+ reportDiagnostic(
+ diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange));
+ return true;
+ }
+ // Check Basic Block Section Flags.
+ if (MF.getTarget().getBBSectionsType() == BasicBlockSection::Labels) {
+ MF.setBBSectionsType(BasicBlockSection::Labels);
+ } else if (MF.hasBBSections()) {
+ MF.assignBeginEndSections();
+ }
+ PFS.SM = &SM;
+
+ // Initialize the frame information after creating all the MBBs so that the
+ // MBB references in the frame information can be resolved.
+ if (initializeFrameInfo(PFS, YamlMF))
+ return true;
+ // Initialize the jump table after creating all the MBBs so that the MBB
+ // references can be resolved.
+ if (!YamlMF.JumpTableInfo.Entries.empty() &&
+ initializeJumpTableInfo(PFS, YamlMF.JumpTableInfo))
+ return true;
+ // Parse the machine instructions after creating all of the MBBs so that the
+ // parser can resolve the MBB references.
+ StringRef InsnStr = YamlMF.Body.Value.Value;
+ SourceMgr InsnSM;
+ InsnSM.AddNewSourceBuffer(
+ MemoryBuffer::getMemBuffer(InsnStr, "", /*RequiresNullTerminator=*/false),
+ SMLoc());
+ PFS.SM = &InsnSM;
+ if (parseMachineInstructions(PFS, InsnStr, Error)) {
+ reportDiagnostic(
+ diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange));
+ return true;
+ }
+ PFS.SM = &SM;
+
+ if (setupRegisterInfo(PFS, YamlMF))
+ return true;
+
+ if (YamlMF.MachineFuncInfo) {
+ const LLVMTargetMachine &TM = MF.getTarget();
+ // Note this is called after the initial constructor of the
+ // MachineFunctionInfo based on the MachineFunction, which may depend on the
+ // IR.
+
+ SMRange SrcRange;
+ if (TM.parseMachineFunctionInfo(*YamlMF.MachineFuncInfo, PFS, Error,
+ SrcRange)) {
+ return error(Error, SrcRange);
+ }
+ }
+
+ // Set the reserved registers after parsing MachineFuncInfo. The target may
+ // have been recording information used to select the reserved registers
+ // there.
+ // FIXME: This is a temporary workaround until the reserved registers can be
+ // serialized.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.freezeReservedRegs(MF);
+
+ computeFunctionProperties(MF);
+
+ if (initializeCallSiteInfo(PFS, YamlMF))
+ return true;
+
+ setupDebugValueTracking(MF, PFS, YamlMF);
+
+ MF.getSubtarget().mirFileLoaded(MF);
+
+ MF.verify();
+ return false;
+}
+
+bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ MachineFunction &MF = PFS.MF;
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ assert(RegInfo.tracksLiveness());
+ if (!YamlMF.TracksRegLiveness)
+ RegInfo.invalidateLiveness();
+
+ SMDiagnostic Error;
+ // Parse the virtual register information.
+ for (const auto &VReg : YamlMF.VirtualRegisters) {
+ VRegInfo &Info = PFS.getVRegInfo(VReg.ID.Value);
+ if (Info.Explicit)
+ return error(VReg.ID.SourceRange.Start,
+ Twine("redefinition of virtual register '%") +
+ Twine(VReg.ID.Value) + "'");
+ Info.Explicit = true;
+
+ if (StringRef(VReg.Class.Value).equals("_")) {
+ Info.Kind = VRegInfo::GENERIC;
+ Info.D.RegBank = nullptr;
+ } else {
+ const auto *RC = Target->getRegClass(VReg.Class.Value);
+ if (RC) {
+ Info.Kind = VRegInfo::NORMAL;
+ Info.D.RC = RC;
+ } else {
+ const RegisterBank *RegBank = Target->getRegBank(VReg.Class.Value);
+ if (!RegBank)
+ return error(
+ VReg.Class.SourceRange.Start,
+ Twine("use of undefined register class or register bank '") +
+ VReg.Class.Value + "'");
+ Info.Kind = VRegInfo::REGBANK;
+ Info.D.RegBank = RegBank;
+ }
+ }
+
+ if (!VReg.PreferredRegister.Value.empty()) {
+ if (Info.Kind != VRegInfo::NORMAL)
+ return error(VReg.Class.SourceRange.Start,
+ Twine("preferred register can only be set for normal vregs"));
+
+ if (parseRegisterReference(PFS, Info.PreferredReg,
+ VReg.PreferredRegister.Value, Error))
+ return error(Error, VReg.PreferredRegister.SourceRange);
+ }
+ }
+
+ // Parse the liveins.
+ for (const auto &LiveIn : YamlMF.LiveIns) {
+ Register Reg;
+ if (parseNamedRegisterReference(PFS, Reg, LiveIn.Register.Value, Error))
+ return error(Error, LiveIn.Register.SourceRange);
+ Register VReg;
+ if (!LiveIn.VirtualRegister.Value.empty()) {
+ VRegInfo *Info;
+ if (parseVirtualRegisterReference(PFS, Info, LiveIn.VirtualRegister.Value,
+ Error))
+ return error(Error, LiveIn.VirtualRegister.SourceRange);
+ VReg = Info->VReg;
+ }
+ RegInfo.addLiveIn(Reg, VReg);
+ }
+
+ // Parse the callee saved registers (Registers that will
+ // be saved for the caller).
+ if (YamlMF.CalleeSavedRegisters) {
+ SmallVector<MCPhysReg, 16> CalleeSavedRegisters;
+ for (const auto &RegSource : *YamlMF.CalleeSavedRegisters) {
+ Register Reg;
+ if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
+ return error(Error, RegSource.SourceRange);
+ CalleeSavedRegisters.push_back(Reg);
+ }
+ RegInfo.setCalleeSavedRegs(CalleeSavedRegisters);
+ }
+
+ return false;
+}
+
+bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ MachineFunction &MF = PFS.MF;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ bool Error = false;
+ // Create VRegs
+ auto populateVRegInfo = [&](const VRegInfo &Info, Twine Name) {
+ Register Reg = Info.VReg;
+ switch (Info.Kind) {
+ case VRegInfo::UNKNOWN:
+ error(Twine("Cannot determine class/bank of virtual register ") +
+ Name + " in function '" + MF.getName() + "'");
+ Error = true;
+ break;
+ case VRegInfo::NORMAL:
+ if (!Info.D.RC->isAllocatable()) {
+ error(Twine("Cannot use non-allocatable class '") +
+ TRI->getRegClassName(Info.D.RC) + "' for virtual register " +
+ Name + " in function '" + MF.getName() + "'");
+ Error = true;
+ break;
+ }
+
+ MRI.setRegClass(Reg, Info.D.RC);
+ if (Info.PreferredReg != 0)
+ MRI.setSimpleHint(Reg, Info.PreferredReg);
+ break;
+ case VRegInfo::GENERIC:
+ break;
+ case VRegInfo::REGBANK:
+ MRI.setRegBank(Reg, *Info.D.RegBank);
+ break;
+ }
+ };
+
+ for (const auto &P : PFS.VRegInfosNamed) {
+ const VRegInfo &Info = *P.second;
+ populateVRegInfo(Info, Twine(P.first()));
+ }
+
+ for (auto P : PFS.VRegInfos) {
+ const VRegInfo &Info = *P.second;
+ populateVRegInfo(Info, Twine(P.first));
+ }
+
+ // Compute MachineRegisterInfo::UsedPhysRegMask
+ for (const MachineBasicBlock &MBB : MF) {
+ // Make sure MRI knows about registers clobbered by unwinder.
+ if (MBB.isEHPad())
+ if (auto *RegMask = TRI->getCustomEHPadPreservedMask(MF))
+ MRI.addPhysRegsUsedFromRegMask(RegMask);
+
+ for (const MachineInstr &MI : MBB) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isRegMask())
+ continue;
+ MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
+ }
+ }
+ }
+
+ return Error;
+}
+
+bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ MachineFunction &MF = PFS.MF;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const Function &F = MF.getFunction();
+ const yaml::MachineFrameInfo &YamlMFI = YamlMF.FrameInfo;
+ MFI.setFrameAddressIsTaken(YamlMFI.IsFrameAddressTaken);
+ MFI.setReturnAddressIsTaken(YamlMFI.IsReturnAddressTaken);
+ MFI.setHasStackMap(YamlMFI.HasStackMap);
+ MFI.setHasPatchPoint(YamlMFI.HasPatchPoint);
+ MFI.setStackSize(YamlMFI.StackSize);
+ MFI.setOffsetAdjustment(YamlMFI.OffsetAdjustment);
+ if (YamlMFI.MaxAlignment)
+ MFI.ensureMaxAlignment(Align(YamlMFI.MaxAlignment));
+ MFI.setAdjustsStack(YamlMFI.AdjustsStack);
+ MFI.setHasCalls(YamlMFI.HasCalls);
+ if (YamlMFI.MaxCallFrameSize != ~0u)
+ MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize);
+ MFI.setCVBytesOfCalleeSavedRegisters(YamlMFI.CVBytesOfCalleeSavedRegisters);
+ MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment);
+ MFI.setHasVAStart(YamlMFI.HasVAStart);
+ MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc);
+ MFI.setHasTailCall(YamlMFI.HasTailCall);
+ MFI.setLocalFrameSize(YamlMFI.LocalFrameSize);
+ if (!YamlMFI.SavePoint.Value.empty()) {
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(PFS, MBB, YamlMFI.SavePoint))
+ return true;
+ MFI.setSavePoint(MBB);
+ }
+ if (!YamlMFI.RestorePoint.Value.empty()) {
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(PFS, MBB, YamlMFI.RestorePoint))
+ return true;
+ MFI.setRestorePoint(MBB);
+ }
+
+ std::vector<CalleeSavedInfo> CSIInfo;
+ // Initialize the fixed frame objects.
+ for (const auto &Object : YamlMF.FixedStackObjects) {
+ int ObjectIdx;
+ if (Object.Type != yaml::FixedMachineStackObject::SpillSlot)
+ ObjectIdx = MFI.CreateFixedObject(Object.Size, Object.Offset,
+ Object.IsImmutable, Object.IsAliased);
+ else
+ ObjectIdx = MFI.CreateFixedSpillStackObject(Object.Size, Object.Offset);
+
+ if (!TFI->isSupportedStackID(Object.StackID))
+ return error(Object.ID.SourceRange.Start,
+ Twine("StackID is not supported by target"));
+ MFI.setStackID(ObjectIdx, Object.StackID);
+ MFI.setObjectAlignment(ObjectIdx, Object.Alignment.valueOrOne());
+ if (!PFS.FixedStackObjectSlots.insert(std::make_pair(Object.ID.Value,
+ ObjectIdx))
+ .second)
+ return error(Object.ID.SourceRange.Start,
+ Twine("redefinition of fixed stack object '%fixed-stack.") +
+ Twine(Object.ID.Value) + "'");
+ if (parseCalleeSavedRegister(PFS, CSIInfo, Object.CalleeSavedRegister,
+ Object.CalleeSavedRestored, ObjectIdx))
+ return true;
+ if (parseStackObjectsDebugInfo(PFS, Object, ObjectIdx))
+ return true;
+ }
+
+ for (const auto &Object : YamlMF.EntryValueObjects) {
+ SMDiagnostic Error;
+ Register Reg;
+ if (parseNamedRegisterReference(PFS, Reg, Object.EntryValueRegister.Value,
+ Error))
+ return error(Error, Object.EntryValueRegister.SourceRange);
+ if (!Reg.isPhysical())
+ return error(Object.EntryValueRegister.SourceRange.Start,
+ "Expected physical register for entry value field");
+ std::optional<VarExprLoc> MaybeInfo = parseVarExprLoc(
+ PFS, Object.DebugVar, Object.DebugExpr, Object.DebugLoc);
+ if (!MaybeInfo)
+ return true;
+ if (MaybeInfo->DIVar || MaybeInfo->DIExpr || MaybeInfo->DILoc)
+ PFS.MF.setVariableDbgInfo(MaybeInfo->DIVar, MaybeInfo->DIExpr,
+ Reg.asMCReg(), MaybeInfo->DILoc);
+ }
+
+ // Initialize the ordinary frame objects.
+ for (const auto &Object : YamlMF.StackObjects) {
+ int ObjectIdx;
+ const AllocaInst *Alloca = nullptr;
+ const yaml::StringValue &Name = Object.Name;
+ if (!Name.Value.empty()) {
+ Alloca = dyn_cast_or_null<AllocaInst>(
+ F.getValueSymbolTable()->lookup(Name.Value));
+ if (!Alloca)
+ return error(Name.SourceRange.Start,
+ "alloca instruction named '" + Name.Value +
+ "' isn't defined in the function '" + F.getName() +
+ "'");
+ }
+ if (!TFI->isSupportedStackID(Object.StackID))
+ return error(Object.ID.SourceRange.Start,
+ Twine("StackID is not supported by target"));
+ if (Object.Type == yaml::MachineStackObject::VariableSized)
+ ObjectIdx =
+ MFI.CreateVariableSizedObject(Object.Alignment.valueOrOne(), Alloca);
+ else
+ ObjectIdx = MFI.CreateStackObject(
+ Object.Size, Object.Alignment.valueOrOne(),
+ Object.Type == yaml::MachineStackObject::SpillSlot, Alloca,
+ Object.StackID);
+ MFI.setObjectOffset(ObjectIdx, Object.Offset);
+
+ if (!PFS.StackObjectSlots.insert(std::make_pair(Object.ID.Value, ObjectIdx))
+ .second)
+ return error(Object.ID.SourceRange.Start,
+ Twine("redefinition of stack object '%stack.") +
+ Twine(Object.ID.Value) + "'");
+ if (parseCalleeSavedRegister(PFS, CSIInfo, Object.CalleeSavedRegister,
+ Object.CalleeSavedRestored, ObjectIdx))
+ return true;
+ if (Object.LocalOffset)
+ MFI.mapLocalFrameObject(ObjectIdx, *Object.LocalOffset);
+ if (parseStackObjectsDebugInfo(PFS, Object, ObjectIdx))
+ return true;
+ }
+ MFI.setCalleeSavedInfo(CSIInfo);
+ if (!CSIInfo.empty())
+ MFI.setCalleeSavedInfoValid(true);
+
+ // Initialize the various stack object references after initializing the
+ // stack objects.
+ if (!YamlMFI.StackProtector.Value.empty()) {
+ SMDiagnostic Error;
+ int FI;
+ if (parseStackObjectReference(PFS, FI, YamlMFI.StackProtector.Value, Error))
+ return error(Error, YamlMFI.StackProtector.SourceRange);
+ MFI.setStackProtectorIndex(FI);
+ }
+
+ if (!YamlMFI.FunctionContext.Value.empty()) {
+ SMDiagnostic Error;
+ int FI;
+ if (parseStackObjectReference(PFS, FI, YamlMFI.FunctionContext.Value, Error))
+ return error(Error, YamlMFI.FunctionContext.SourceRange);
+ MFI.setFunctionContextIndex(FI);
+ }
+
+ return false;
+}
+
+bool MIRParserImpl::parseCalleeSavedRegister(PerFunctionMIParsingState &PFS,
+ std::vector<CalleeSavedInfo> &CSIInfo,
+ const yaml::StringValue &RegisterSource, bool IsRestored, int FrameIdx) {
+ if (RegisterSource.Value.empty())
+ return false;
+ Register Reg;
+ SMDiagnostic Error;
+ if (parseNamedRegisterReference(PFS, Reg, RegisterSource.Value, Error))
+ return error(Error, RegisterSource.SourceRange);
+ CalleeSavedInfo CSI(Reg, FrameIdx);
+ CSI.setRestored(IsRestored);
+ CSIInfo.push_back(CSI);
+ return false;
+}
+
+/// Verify that given node is of a certain type. Return true on error.
+template <typename T>
+static bool typecheckMDNode(T *&Result, MDNode *Node,
+ const yaml::StringValue &Source,
+ StringRef TypeString, MIRParserImpl &Parser) {
+ if (!Node)
+ return false;
+ Result = dyn_cast<T>(Node);
+ if (!Result)
+ return Parser.error(Source.SourceRange.Start,
+ "expected a reference to a '" + TypeString +
+ "' metadata node");
+ return false;
+}
+
+std::optional<MIRParserImpl::VarExprLoc> MIRParserImpl::parseVarExprLoc(
+ PerFunctionMIParsingState &PFS, const yaml::StringValue &VarStr,
+ const yaml::StringValue &ExprStr, const yaml::StringValue &LocStr) {
+ MDNode *Var = nullptr;
+ MDNode *Expr = nullptr;
+ MDNode *Loc = nullptr;
+ if (parseMDNode(PFS, Var, VarStr) || parseMDNode(PFS, Expr, ExprStr) ||
+ parseMDNode(PFS, Loc, LocStr))
+ return std::nullopt;
+ DILocalVariable *DIVar = nullptr;
+ DIExpression *DIExpr = nullptr;
+ DILocation *DILoc = nullptr;
+ if (typecheckMDNode(DIVar, Var, VarStr, "DILocalVariable", *this) ||
+ typecheckMDNode(DIExpr, Expr, ExprStr, "DIExpression", *this) ||
+ typecheckMDNode(DILoc, Loc, LocStr, "DILocation", *this))
+ return std::nullopt;
+ return VarExprLoc{DIVar, DIExpr, DILoc};
+}
+
+template <typename T>
+bool MIRParserImpl::parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
+ const T &Object, int FrameIdx) {
+ std::optional<VarExprLoc> MaybeInfo =
+ parseVarExprLoc(PFS, Object.DebugVar, Object.DebugExpr, Object.DebugLoc);
+ if (!MaybeInfo)
+ return true;
+ // Debug information can only be attached to stack objects; Fixed stack
+ // objects aren't supported.
+ if (MaybeInfo->DIVar || MaybeInfo->DIExpr || MaybeInfo->DILoc)
+ PFS.MF.setVariableDbgInfo(MaybeInfo->DIVar, MaybeInfo->DIExpr, FrameIdx,
+ MaybeInfo->DILoc);
+ return false;
+}
+
+bool MIRParserImpl::parseMDNode(PerFunctionMIParsingState &PFS,
+ MDNode *&Node, const yaml::StringValue &Source) {
+ if (Source.Value.empty())
+ return false;
+ SMDiagnostic Error;
+ if (llvm::parseMDNode(PFS, Node, Source.Value, Error))
+ return error(Error, Source.SourceRange);
+ return false;
+}
+
+bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS,
+ MachineConstantPool &ConstantPool, const yaml::MachineFunction &YamlMF) {
+ DenseMap<unsigned, unsigned> &ConstantPoolSlots = PFS.ConstantPoolSlots;
+ const MachineFunction &MF = PFS.MF;
+ const auto &M = *MF.getFunction().getParent();
+ SMDiagnostic Error;
+ for (const auto &YamlConstant : YamlMF.Constants) {
+ if (YamlConstant.IsTargetSpecific)
+ // FIXME: Support target-specific constant pools
+ return error(YamlConstant.Value.SourceRange.Start,
+ "Can't parse target-specific constant pool entries yet");
+ const Constant *Value = dyn_cast_or_null<Constant>(
+ parseConstantValue(YamlConstant.Value.Value, Error, M));
+ if (!Value)
+ return error(Error, YamlConstant.Value.SourceRange);
+ const Align PrefTypeAlign =
+ M.getDataLayout().getPrefTypeAlign(Value->getType());
+ const Align Alignment = YamlConstant.Alignment.value_or(PrefTypeAlign);
+ unsigned Index = ConstantPool.getConstantPoolIndex(Value, Alignment);
+ if (!ConstantPoolSlots.insert(std::make_pair(YamlConstant.ID.Value, Index))
+ .second)
+ return error(YamlConstant.ID.SourceRange.Start,
+ Twine("redefinition of constant pool item '%const.") +
+ Twine(YamlConstant.ID.Value) + "'");
+ }
+ return false;
+}
+
+bool MIRParserImpl::initializeJumpTableInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineJumpTable &YamlJTI) {
+ MachineJumpTableInfo *JTI = PFS.MF.getOrCreateJumpTableInfo(YamlJTI.Kind);
+ for (const auto &Entry : YamlJTI.Entries) {
+ std::vector<MachineBasicBlock *> Blocks;
+ for (const auto &MBBSource : Entry.Blocks) {
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(PFS, MBB, MBBSource.Value))
+ return true;
+ Blocks.push_back(MBB);
+ }
+ unsigned Index = JTI->createJumpTableIndex(Blocks);
+ if (!PFS.JumpTableSlots.insert(std::make_pair(Entry.ID.Value, Index))
+ .second)
+ return error(Entry.ID.SourceRange.Start,
+ Twine("redefinition of jump table entry '%jump-table.") +
+ Twine(Entry.ID.Value) + "'");
+ }
+ return false;
+}
+
+bool MIRParserImpl::parseMBBReference(PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB,
+ const yaml::StringValue &Source) {
+ SMDiagnostic Error;
+ if (llvm::parseMBBReference(PFS, MBB, Source.Value, Error))
+ return error(Error, Source.SourceRange);
+ return false;
+}
+
+bool MIRParserImpl::parseMachineMetadata(PerFunctionMIParsingState &PFS,
+ const yaml::StringValue &Source) {
+ SMDiagnostic Error;
+ if (llvm::parseMachineMetadata(PFS, Source.Value, Source.SourceRange, Error))
+ return error(Error, Source.SourceRange);
+ return false;
+}
+
+bool MIRParserImpl::parseMachineMetadataNodes(
+ PerFunctionMIParsingState &PFS, MachineFunction &MF,
+ const yaml::MachineFunction &YMF) {
+ for (const auto &MDS : YMF.MachineMetadataNodes) {
+ if (parseMachineMetadata(PFS, MDS))
+ return true;
+ }
+ // Report missing definitions from forward referenced nodes.
+ if (!PFS.MachineForwardRefMDNodes.empty())
+ return error(PFS.MachineForwardRefMDNodes.begin()->second.second,
+ "use of undefined metadata '!" +
+ Twine(PFS.MachineForwardRefMDNodes.begin()->first) + "'");
+ return false;
+}
+
+SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error,
+ SMRange SourceRange) {
+ assert(SourceRange.isValid() && "Invalid source range");
+ SMLoc Loc = SourceRange.Start;
+ bool HasQuote = Loc.getPointer() < SourceRange.End.getPointer() &&
+ *Loc.getPointer() == '\'';
+ // Translate the location of the error from the location in the MI string to
+ // the corresponding location in the MIR file.
+ Loc = Loc.getFromPointer(Loc.getPointer() + Error.getColumnNo() +
+ (HasQuote ? 1 : 0));
+
+ // TODO: Translate any source ranges as well.
+ return SM.GetMessage(Loc, Error.getKind(), Error.getMessage(), std::nullopt,
+ Error.getFixIts());
+}
+
+SMDiagnostic MIRParserImpl::diagFromBlockStringDiag(const SMDiagnostic &Error,
+ SMRange SourceRange) {
+ assert(SourceRange.isValid());
+
+ // Translate the location of the error from the location in the LLVM IR string
+ // to the corresponding location in the MIR file.
+ auto LineAndColumn = SM.getLineAndColumn(SourceRange.Start);
+ unsigned Line = LineAndColumn.first + Error.getLineNo() - 1;
+ unsigned Column = Error.getColumnNo();
+ StringRef LineStr = Error.getLineContents();
+ SMLoc Loc = Error.getLoc();
+
+ // Get the full line and adjust the column number by taking the indentation of
+ // LLVM IR into account.
+ for (line_iterator L(*SM.getMemoryBuffer(SM.getMainFileID()), false), E;
+ L != E; ++L) {
+ if (L.line_number() == Line) {
+ LineStr = *L;
+ Loc = SMLoc::getFromPointer(LineStr.data());
+ auto Indent = LineStr.find(Error.getLineContents());
+ if (Indent != StringRef::npos)
+ Column += Indent;
+ break;
+ }
+ }
+
+ return SMDiagnostic(SM, Loc, Filename, Line, Column, Error.getKind(),
+ Error.getMessage(), LineStr, Error.getRanges(),
+ Error.getFixIts());
+}
+
+MIRParser::MIRParser(std::unique_ptr<MIRParserImpl> Impl)
+ : Impl(std::move(Impl)) {}
+
+MIRParser::~MIRParser() = default;
+
+std::unique_ptr<Module>
+MIRParser::parseIRModule(DataLayoutCallbackTy DataLayoutCallback) {
+ return Impl->parseIRModule(DataLayoutCallback);
+}
+
+bool MIRParser::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) {
+ return Impl->parseMachineFunctions(M, MMI);
+}
+
+std::unique_ptr<MIRParser> llvm::createMIRParserFromFile(
+ StringRef Filename, SMDiagnostic &Error, LLVMContext &Context,
+ std::function<void(Function &)> ProcessIRFunction) {
+ auto FileOrErr = MemoryBuffer::getFileOrSTDIN(Filename, /*IsText=*/true);
+ if (std::error_code EC = FileOrErr.getError()) {
+ Error = SMDiagnostic(Filename, SourceMgr::DK_Error,
+ "Could not open input file: " + EC.message());
+ return nullptr;
+ }
+ return createMIRParser(std::move(FileOrErr.get()), Context,
+ ProcessIRFunction);
+}
+
+std::unique_ptr<MIRParser>
+llvm::createMIRParser(std::unique_ptr<MemoryBuffer> Contents,
+ LLVMContext &Context,
+ std::function<void(Function &)> ProcessIRFunction) {
+ auto Filename = Contents->getBufferIdentifier();
+ if (Context.shouldDiscardValueNames()) {
+ Context.diagnose(DiagnosticInfoMIRParser(
+ DS_Error,
+ SMDiagnostic(
+ Filename, SourceMgr::DK_Error,
+ "Can't read MIR with a Context that discards named Values")));
+ return nullptr;
+ }
+ return std::make_unique<MIRParser>(std::make_unique<MIRParserImpl>(
+ std::move(Contents), Filename, Context, ProcessIRFunction));
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRPrinter.cpp
new file mode 100644
index 000000000000..b91d9c4727fc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -0,0 +1,989 @@
+//===- MIRPrinter.cpp - MIR serialization format printer ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the class that prints out the LLVM IR and machine
+// functions using the MIR serialization format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRPrinter.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleSlotTracker.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<bool> SimplifyMIR(
+ "simplify-mir", cl::Hidden,
+ cl::desc("Leave out unnecessary information when printing MIR"));
+
+static cl::opt<bool> PrintLocations("mir-debug-loc", cl::Hidden, cl::init(true),
+ cl::desc("Print MIR debug-locations"));
+
+namespace {
+
+/// This structure describes how to print out stack object references.
+struct FrameIndexOperand {
+ std::string Name;
+ unsigned ID;
+ bool IsFixed;
+
+ FrameIndexOperand(StringRef Name, unsigned ID, bool IsFixed)
+ : Name(Name.str()), ID(ID), IsFixed(IsFixed) {}
+
+ /// Return an ordinary stack object reference.
+ static FrameIndexOperand create(StringRef Name, unsigned ID) {
+ return FrameIndexOperand(Name, ID, /*IsFixed=*/false);
+ }
+
+ /// Return a fixed stack object reference.
+ static FrameIndexOperand createFixed(unsigned ID) {
+ return FrameIndexOperand("", ID, /*IsFixed=*/true);
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+/// This class prints out the machine functions using the MIR serialization
+/// format.
+class MIRPrinter {
+ raw_ostream &OS;
+ DenseMap<const uint32_t *, unsigned> RegisterMaskIds;
+ /// Maps from stack object indices to operand indices which will be used when
+ /// printing frame index machine operands.
+ DenseMap<int, FrameIndexOperand> StackObjectOperandMapping;
+
+public:
+ MIRPrinter(raw_ostream &OS) : OS(OS) {}
+
+ void print(const MachineFunction &MF);
+
+ void convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI);
+ void convert(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI,
+ const MachineFrameInfo &MFI);
+ void convert(yaml::MachineFunction &MF,
+ const MachineConstantPool &ConstantPool);
+ void convert(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI,
+ const MachineJumpTableInfo &JTI);
+ void convertStackObjects(yaml::MachineFunction &YMF,
+ const MachineFunction &MF, ModuleSlotTracker &MST);
+ void convertEntryValueObjects(yaml::MachineFunction &YMF,
+ const MachineFunction &MF,
+ ModuleSlotTracker &MST);
+ void convertCallSiteObjects(yaml::MachineFunction &YMF,
+ const MachineFunction &MF,
+ ModuleSlotTracker &MST);
+ void convertMachineMetadataNodes(yaml::MachineFunction &YMF,
+ const MachineFunction &MF,
+ MachineModuleSlotTracker &MST);
+
+private:
+ void initRegisterMaskIds(const MachineFunction &MF);
+};
+
+/// This class prints out the machine instructions using the MIR serialization
+/// format.
+class MIPrinter {
+ raw_ostream &OS;
+ ModuleSlotTracker &MST;
+ const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds;
+ const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping;
+ /// Synchronization scope names registered with LLVMContext.
+ SmallVector<StringRef, 8> SSNs;
+
+ bool canPredictBranchProbabilities(const MachineBasicBlock &MBB) const;
+ bool canPredictSuccessors(const MachineBasicBlock &MBB) const;
+
+public:
+ MIPrinter(raw_ostream &OS, ModuleSlotTracker &MST,
+ const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds,
+ const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping)
+ : OS(OS), MST(MST), RegisterMaskIds(RegisterMaskIds),
+ StackObjectOperandMapping(StackObjectOperandMapping) {}
+
+ void print(const MachineBasicBlock &MBB);
+
+ void print(const MachineInstr &MI);
+ void printStackObjectReference(int FrameIndex);
+ void print(const MachineInstr &MI, unsigned OpIdx,
+ const TargetRegisterInfo *TRI, const TargetInstrInfo *TII,
+ bool ShouldPrintRegisterTies, LLT TypeToPrint,
+ bool PrintDef = true);
+};
+
+} // end namespace llvm
+
+namespace llvm {
+namespace yaml {
+
+/// This struct serializes the LLVM IR module.
+template <> struct BlockScalarTraits<Module> {
+ static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) {
+ Mod.print(OS, nullptr);
+ }
+
+ static StringRef input(StringRef Str, void *Ctxt, Module &Mod) {
+ llvm_unreachable("LLVM Module is supposed to be parsed separately");
+ return "";
+ }
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+static void printRegMIR(unsigned Reg, yaml::StringValue &Dest,
+ const TargetRegisterInfo *TRI) {
+ raw_string_ostream OS(Dest.Value);
+ OS << printReg(Reg, TRI);
+}
+
+void MIRPrinter::print(const MachineFunction &MF) {
+ initRegisterMaskIds(MF);
+
+ yaml::MachineFunction YamlMF;
+ YamlMF.Name = MF.getName();
+ YamlMF.Alignment = MF.getAlignment();
+ YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
+ YamlMF.HasWinCFI = MF.hasWinCFI();
+
+ YamlMF.CallsEHReturn = MF.callsEHReturn();
+ YamlMF.CallsUnwindInit = MF.callsUnwindInit();
+ YamlMF.HasEHCatchret = MF.hasEHCatchret();
+ YamlMF.HasEHScopes = MF.hasEHScopes();
+ YamlMF.HasEHFunclets = MF.hasEHFunclets();
+ YamlMF.IsOutlined = MF.isOutlined();
+ YamlMF.UseDebugInstrRef = MF.useDebugInstrRef();
+
+ YamlMF.Legalized = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Legalized);
+ YamlMF.RegBankSelected = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::RegBankSelected);
+ YamlMF.Selected = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected);
+ YamlMF.FailedISel = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel);
+ YamlMF.FailsVerification = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailsVerification);
+ YamlMF.TracksDebugUserValues = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::TracksDebugUserValues);
+
+ convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo());
+ MachineModuleSlotTracker MST(&MF);
+ MST.incorporateFunction(MF.getFunction());
+ convert(MST, YamlMF.FrameInfo, MF.getFrameInfo());
+ convertStackObjects(YamlMF, MF, MST);
+ convertEntryValueObjects(YamlMF, MF, MST);
+ convertCallSiteObjects(YamlMF, MF, MST);
+ for (const auto &Sub : MF.DebugValueSubstitutions) {
+ const auto &SubSrc = Sub.Src;
+ const auto &SubDest = Sub.Dest;
+ YamlMF.DebugValueSubstitutions.push_back({SubSrc.first, SubSrc.second,
+ SubDest.first,
+ SubDest.second,
+ Sub.Subreg});
+ }
+ if (const auto *ConstantPool = MF.getConstantPool())
+ convert(YamlMF, *ConstantPool);
+ if (const auto *JumpTableInfo = MF.getJumpTableInfo())
+ convert(MST, YamlMF.JumpTableInfo, *JumpTableInfo);
+
+ const TargetMachine &TM = MF.getTarget();
+ YamlMF.MachineFuncInfo =
+ std::unique_ptr<yaml::MachineFunctionInfo>(TM.convertFuncInfoToYAML(MF));
+
+ raw_string_ostream StrOS(YamlMF.Body.Value.Value);
+ bool IsNewlineNeeded = false;
+ for (const auto &MBB : MF) {
+ if (IsNewlineNeeded)
+ StrOS << "\n";
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .print(MBB);
+ IsNewlineNeeded = true;
+ }
+ StrOS.flush();
+  // Convert the machine metadata collected while printing the machine
+  // function.
+ convertMachineMetadataNodes(YamlMF, MF, MST);
+
+ yaml::Output Out(OS);
+ if (!SimplifyMIR)
+ Out.setWriteDefaultValues(true);
+ Out << YamlMF;
+}
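+
+// For orientation (an illustrative sketch, not normative output), the YAML
+// document emitted above has roughly this shape; the exact set of fields
+// depends on the function and on -simplify-mir:
+//   ---
+//   name:            foo
+//   tracksRegLiveness: true
+//   body:             |
+//     bb.0.entry:
+//       ...
+//   ...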
+
+static void printCustomRegMask(const uint32_t *RegMask, raw_ostream &OS,
+ const TargetRegisterInfo *TRI) {
+ assert(RegMask && "Can't print an empty register mask");
+ OS << StringRef("CustomRegMask(");
+
+ bool IsRegInRegMaskFound = false;
+ for (int I = 0, E = TRI->getNumRegs(); I < E; I++) {
+    // Check whether the register is set in the regmask.
+ if (RegMask[I / 32] & (1u << (I % 32))) {
+ if (IsRegInRegMaskFound)
+ OS << ',';
+ OS << printReg(I, TRI);
+ IsRegInRegMaskFound = true;
+ }
+ }
+
+ OS << ')';
+}
+
+static void printRegClassOrBank(unsigned Reg, yaml::StringValue &Dest,
+ const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI) {
+ raw_string_ostream OS(Dest.Value);
+ OS << printRegClassOrBank(Reg, RegInfo, TRI);
+}
+
+template <typename T>
+static void
+printStackObjectDbgInfo(const MachineFunction::VariableDbgInfo &DebugVar,
+ T &Object, ModuleSlotTracker &MST) {
+ std::array<std::string *, 3> Outputs{{&Object.DebugVar.Value,
+ &Object.DebugExpr.Value,
+ &Object.DebugLoc.Value}};
+ std::array<const Metadata *, 3> Metas{{DebugVar.Var,
+ DebugVar.Expr,
+ DebugVar.Loc}};
+ for (unsigned i = 0; i < 3; ++i) {
+ raw_string_ostream StrOS(*Outputs[i]);
+ Metas[i]->printAsOperand(StrOS, MST);
+ }
+}
+
+void MIRPrinter::convert(yaml::MachineFunction &MF,
+ const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI) {
+ MF.TracksRegLiveness = RegInfo.tracksLiveness();
+
+ // Print the virtual register definitions.
+ for (unsigned I = 0, E = RegInfo.getNumVirtRegs(); I < E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ yaml::VirtualRegisterDefinition VReg;
+ VReg.ID = I;
+ if (RegInfo.getVRegName(Reg) != "")
+ continue;
+ ::printRegClassOrBank(Reg, VReg.Class, RegInfo, TRI);
+ Register PreferredReg = RegInfo.getSimpleHint(Reg);
+ if (PreferredReg)
+ printRegMIR(PreferredReg, VReg.PreferredRegister, TRI);
+ MF.VirtualRegisters.push_back(VReg);
+ }
+
+ // Print the live ins.
+ for (std::pair<unsigned, unsigned> LI : RegInfo.liveins()) {
+ yaml::MachineFunctionLiveIn LiveIn;
+ printRegMIR(LI.first, LiveIn.Register, TRI);
+ if (LI.second)
+ printRegMIR(LI.second, LiveIn.VirtualRegister, TRI);
+ MF.LiveIns.push_back(LiveIn);
+ }
+
+  // Print the callee saved registers.
+ if (RegInfo.isUpdatedCSRsInitialized()) {
+ const MCPhysReg *CalleeSavedRegs = RegInfo.getCalleeSavedRegs();
+ std::vector<yaml::FlowStringValue> CalleeSavedRegisters;
+ for (const MCPhysReg *I = CalleeSavedRegs; *I; ++I) {
+ yaml::FlowStringValue Reg;
+ printRegMIR(*I, Reg, TRI);
+ CalleeSavedRegisters.push_back(Reg);
+ }
+ MF.CalleeSavedRegisters = CalleeSavedRegisters;
+ }
+}
+
+void MIRPrinter::convert(ModuleSlotTracker &MST,
+ yaml::MachineFrameInfo &YamlMFI,
+ const MachineFrameInfo &MFI) {
+ YamlMFI.IsFrameAddressTaken = MFI.isFrameAddressTaken();
+ YamlMFI.IsReturnAddressTaken = MFI.isReturnAddressTaken();
+ YamlMFI.HasStackMap = MFI.hasStackMap();
+ YamlMFI.HasPatchPoint = MFI.hasPatchPoint();
+ YamlMFI.StackSize = MFI.getStackSize();
+ YamlMFI.OffsetAdjustment = MFI.getOffsetAdjustment();
+ YamlMFI.MaxAlignment = MFI.getMaxAlign().value();
+ YamlMFI.AdjustsStack = MFI.adjustsStack();
+ YamlMFI.HasCalls = MFI.hasCalls();
+ YamlMFI.MaxCallFrameSize = MFI.isMaxCallFrameSizeComputed()
+ ? MFI.getMaxCallFrameSize() : ~0u;
+ YamlMFI.CVBytesOfCalleeSavedRegisters =
+ MFI.getCVBytesOfCalleeSavedRegisters();
+ YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment();
+ YamlMFI.HasVAStart = MFI.hasVAStart();
+ YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc();
+ YamlMFI.HasTailCall = MFI.hasTailCall();
+ YamlMFI.LocalFrameSize = MFI.getLocalFrameSize();
+ if (MFI.getSavePoint()) {
+ raw_string_ostream StrOS(YamlMFI.SavePoint.Value);
+ StrOS << printMBBReference(*MFI.getSavePoint());
+ }
+ if (MFI.getRestorePoint()) {
+ raw_string_ostream StrOS(YamlMFI.RestorePoint.Value);
+ StrOS << printMBBReference(*MFI.getRestorePoint());
+ }
+}
+
+void MIRPrinter::convertEntryValueObjects(yaml::MachineFunction &YMF,
+ const MachineFunction &MF,
+ ModuleSlotTracker &MST) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ for (const MachineFunction::VariableDbgInfo &DebugVar :
+ MF.getEntryValueVariableDbgInfo()) {
+ yaml::EntryValueObject &Obj = YMF.EntryValueObjects.emplace_back();
+ printStackObjectDbgInfo(DebugVar, Obj, MST);
+ MCRegister EntryValReg = DebugVar.getEntryValueRegister();
+ printRegMIR(EntryValReg, Obj.EntryValueRegister, TRI);
+ }
+}
+
+void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF,
+ const MachineFunction &MF,
+ ModuleSlotTracker &MST) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ // Process fixed stack objects.
+ assert(YMF.FixedStackObjects.empty());
+ SmallVector<int, 32> FixedStackObjectsIdx;
+ const int BeginIdx = MFI.getObjectIndexBegin();
+ if (BeginIdx < 0)
+ FixedStackObjectsIdx.reserve(-BeginIdx);
+
+ unsigned ID = 0;
+ for (int I = BeginIdx; I < 0; ++I, ++ID) {
+    FixedStackObjectsIdx.push_back(-1); // Fill index for a possibly dead object.
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ yaml::FixedMachineStackObject YamlObject;
+ YamlObject.ID = ID;
+ YamlObject.Type = MFI.isSpillSlotObjectIndex(I)
+ ? yaml::FixedMachineStackObject::SpillSlot
+ : yaml::FixedMachineStackObject::DefaultType;
+ YamlObject.Offset = MFI.getObjectOffset(I);
+ YamlObject.Size = MFI.getObjectSize(I);
+ YamlObject.Alignment = MFI.getObjectAlign(I);
+ YamlObject.StackID = (TargetStackID::Value)MFI.getStackID(I);
+ YamlObject.IsImmutable = MFI.isImmutableObjectIndex(I);
+ YamlObject.IsAliased = MFI.isAliasedObjectIndex(I);
+    // Save the ID's position in the FixedStackObjects storage vector.
+ FixedStackObjectsIdx[ID] = YMF.FixedStackObjects.size();
+ YMF.FixedStackObjects.push_back(YamlObject);
+ StackObjectOperandMapping.insert(
+ std::make_pair(I, FrameIndexOperand::createFixed(ID)));
+ }
+
+ // Process ordinary stack objects.
+ assert(YMF.StackObjects.empty());
+ SmallVector<unsigned, 32> StackObjectsIdx;
+ const int EndIdx = MFI.getObjectIndexEnd();
+ if (EndIdx > 0)
+ StackObjectsIdx.reserve(EndIdx);
+ ID = 0;
+ for (int I = 0; I < EndIdx; ++I, ++ID) {
+    StackObjectsIdx.push_back(-1); // Fill index for a possibly dead object.
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ yaml::MachineStackObject YamlObject;
+ YamlObject.ID = ID;
+ if (const auto *Alloca = MFI.getObjectAllocation(I))
+ YamlObject.Name.Value = std::string(
+ Alloca->hasName() ? Alloca->getName() : "");
+ YamlObject.Type = MFI.isSpillSlotObjectIndex(I)
+ ? yaml::MachineStackObject::SpillSlot
+ : MFI.isVariableSizedObjectIndex(I)
+ ? yaml::MachineStackObject::VariableSized
+ : yaml::MachineStackObject::DefaultType;
+ YamlObject.Offset = MFI.getObjectOffset(I);
+ YamlObject.Size = MFI.getObjectSize(I);
+ YamlObject.Alignment = MFI.getObjectAlign(I);
+ YamlObject.StackID = (TargetStackID::Value)MFI.getStackID(I);
+
+    // Save the ID's position in the StackObjects storage vector.
+ StackObjectsIdx[ID] = YMF.StackObjects.size();
+ YMF.StackObjects.push_back(YamlObject);
+ StackObjectOperandMapping.insert(std::make_pair(
+ I, FrameIndexOperand::create(YamlObject.Name.Value, ID)));
+ }
+
+ for (const auto &CSInfo : MFI.getCalleeSavedInfo()) {
+ const int FrameIdx = CSInfo.getFrameIdx();
+ if (!CSInfo.isSpilledToReg() && MFI.isDeadObjectIndex(FrameIdx))
+ continue;
+
+ yaml::StringValue Reg;
+ printRegMIR(CSInfo.getReg(), Reg, TRI);
+ if (!CSInfo.isSpilledToReg()) {
+ assert(FrameIdx >= MFI.getObjectIndexBegin() &&
+ FrameIdx < MFI.getObjectIndexEnd() &&
+ "Invalid stack object index");
+ if (FrameIdx < 0) { // Negative index means fixed objects.
+ auto &Object =
+ YMF.FixedStackObjects
+ [FixedStackObjectsIdx[FrameIdx + MFI.getNumFixedObjects()]];
+ Object.CalleeSavedRegister = Reg;
+ Object.CalleeSavedRestored = CSInfo.isRestored();
+ } else {
+ auto &Object = YMF.StackObjects[StackObjectsIdx[FrameIdx]];
+ Object.CalleeSavedRegister = Reg;
+ Object.CalleeSavedRestored = CSInfo.isRestored();
+ }
+ }
+ }
+ for (unsigned I = 0, E = MFI.getLocalFrameObjectCount(); I < E; ++I) {
+ auto LocalObject = MFI.getLocalFrameObjectMap(I);
+ assert(LocalObject.first >= 0 && "Expected a locally mapped stack object");
+ YMF.StackObjects[StackObjectsIdx[LocalObject.first]].LocalOffset =
+ LocalObject.second;
+ }
+
+ // Print the stack object references in the frame information class after
+ // converting the stack objects.
+ if (MFI.hasStackProtectorIndex()) {
+ raw_string_ostream StrOS(YMF.FrameInfo.StackProtector.Value);
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .printStackObjectReference(MFI.getStackProtectorIndex());
+ }
+
+ if (MFI.hasFunctionContextIndex()) {
+ raw_string_ostream StrOS(YMF.FrameInfo.FunctionContext.Value);
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .printStackObjectReference(MFI.getFunctionContextIndex());
+ }
+
+ // Print the debug variable information.
+ for (const MachineFunction::VariableDbgInfo &DebugVar :
+ MF.getInStackSlotVariableDbgInfo()) {
+ int Idx = DebugVar.getStackSlot();
+ assert(Idx >= MFI.getObjectIndexBegin() && Idx < MFI.getObjectIndexEnd() &&
+ "Invalid stack object index");
+ if (Idx < 0) { // Negative index means fixed objects.
+ auto &Object =
+ YMF.FixedStackObjects[FixedStackObjectsIdx[Idx +
+ MFI.getNumFixedObjects()]];
+ printStackObjectDbgInfo(DebugVar, Object, MST);
+ } else {
+ auto &Object = YMF.StackObjects[StackObjectsIdx[Idx]];
+ printStackObjectDbgInfo(DebugVar, Object, MST);
+ }
+ }
+}
+
+void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF,
+ const MachineFunction &MF,
+ ModuleSlotTracker &MST) {
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ for (auto CSInfo : MF.getCallSitesInfo()) {
+ yaml::CallSiteInfo YmlCS;
+ yaml::CallSiteInfo::MachineInstrLoc CallLocation;
+
+ // Prepare instruction position.
+ MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator();
+ CallLocation.BlockNum = CallI->getParent()->getNumber();
+ // Get call instruction offset from the beginning of block.
+ CallLocation.Offset =
+ std::distance(CallI->getParent()->instr_begin(), CallI);
+ YmlCS.CallLocation = CallLocation;
+    // Construct call arguments and their forwarding register info.
+ for (auto ArgReg : CSInfo.second) {
+ yaml::CallSiteInfo::ArgRegPair YmlArgReg;
+ YmlArgReg.ArgNo = ArgReg.ArgNo;
+ printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI);
+ YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg);
+ }
+ YMF.CallSitesInfo.push_back(YmlCS);
+ }
+
+ // Sort call info by position of call instructions.
+ llvm::sort(YMF.CallSitesInfo.begin(), YMF.CallSitesInfo.end(),
+ [](yaml::CallSiteInfo A, yaml::CallSiteInfo B) {
+ if (A.CallLocation.BlockNum == B.CallLocation.BlockNum)
+ return A.CallLocation.Offset < B.CallLocation.Offset;
+ return A.CallLocation.BlockNum < B.CallLocation.BlockNum;
+ });
+}
+
+void MIRPrinter::convertMachineMetadataNodes(yaml::MachineFunction &YMF,
+ const MachineFunction &MF,
+ MachineModuleSlotTracker &MST) {
+ MachineModuleSlotTracker::MachineMDNodeListType MDList;
+ MST.collectMachineMDNodes(MDList);
+ for (auto &MD : MDList) {
+ std::string NS;
+ raw_string_ostream StrOS(NS);
+ MD.second->print(StrOS, MST, MF.getFunction().getParent());
+ YMF.MachineMetadataNodes.push_back(StrOS.str());
+ }
+}
+
+void MIRPrinter::convert(yaml::MachineFunction &MF,
+ const MachineConstantPool &ConstantPool) {
+ unsigned ID = 0;
+ for (const MachineConstantPoolEntry &Constant : ConstantPool.getConstants()) {
+ std::string Str;
+ raw_string_ostream StrOS(Str);
+ if (Constant.isMachineConstantPoolEntry()) {
+ Constant.Val.MachineCPVal->print(StrOS);
+ } else {
+ Constant.Val.ConstVal->printAsOperand(StrOS);
+ }
+
+ yaml::MachineConstantPoolValue YamlConstant;
+ YamlConstant.ID = ID++;
+ YamlConstant.Value = StrOS.str();
+ YamlConstant.Alignment = Constant.getAlign();
+ YamlConstant.IsTargetSpecific = Constant.isMachineConstantPoolEntry();
+
+ MF.Constants.push_back(YamlConstant);
+ }
+}
+
+void MIRPrinter::convert(ModuleSlotTracker &MST,
+ yaml::MachineJumpTable &YamlJTI,
+ const MachineJumpTableInfo &JTI) {
+ YamlJTI.Kind = JTI.getEntryKind();
+ unsigned ID = 0;
+ for (const auto &Table : JTI.getJumpTables()) {
+ std::string Str;
+ yaml::MachineJumpTable::Entry Entry;
+ Entry.ID = ID++;
+ for (const auto *MBB : Table.MBBs) {
+ raw_string_ostream StrOS(Str);
+ StrOS << printMBBReference(*MBB);
+ Entry.Blocks.push_back(StrOS.str());
+ Str.clear();
+ }
+ YamlJTI.Entries.push_back(Entry);
+ }
+}
+
+void MIRPrinter::initRegisterMaskIds(const MachineFunction &MF) {
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ unsigned I = 0;
+ for (const uint32_t *Mask : TRI->getRegMasks())
+ RegisterMaskIds.insert(std::make_pair(Mask, I++));
+}
+
+void llvm::guessSuccessors(const MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineBasicBlock*> &Result,
+ bool &IsFallthrough) {
+ SmallPtrSet<MachineBasicBlock*,8> Seen;
+
+ for (const MachineInstr &MI : MBB) {
+ if (MI.isPHI())
+ continue;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isMBB())
+ continue;
+ MachineBasicBlock *Succ = MO.getMBB();
+ auto RP = Seen.insert(Succ);
+ if (RP.second)
+ Result.push_back(Succ);
+ }
+ }
+ MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
+ IsFallthrough = I == MBB.end() || !I->isBarrier();
+}
+
+bool
+MIPrinter::canPredictBranchProbabilities(const MachineBasicBlock &MBB) const {
+ if (MBB.succ_size() <= 1)
+ return true;
+ if (!MBB.hasSuccessorProbabilities())
+ return true;
+
+ SmallVector<BranchProbability,8> Normalized(MBB.Probs.begin(),
+ MBB.Probs.end());
+ BranchProbability::normalizeProbabilities(Normalized.begin(),
+ Normalized.end());
+ SmallVector<BranchProbability,8> Equal(Normalized.size());
+ BranchProbability::normalizeProbabilities(Equal.begin(), Equal.end());
+
+ return std::equal(Normalized.begin(), Normalized.end(), Equal.begin());
+}
+
+bool MIPrinter::canPredictSuccessors(const MachineBasicBlock &MBB) const {
+ SmallVector<MachineBasicBlock*,8> GuessedSuccs;
+ bool GuessedFallthrough;
+ guessSuccessors(MBB, GuessedSuccs, GuessedFallthrough);
+ if (GuessedFallthrough) {
+ const MachineFunction &MF = *MBB.getParent();
+ MachineFunction::const_iterator NextI = std::next(MBB.getIterator());
+ if (NextI != MF.end()) {
+ MachineBasicBlock *Next = const_cast<MachineBasicBlock*>(&*NextI);
+ if (!is_contained(GuessedSuccs, Next))
+ GuessedSuccs.push_back(Next);
+ }
+ }
+ if (GuessedSuccs.size() != MBB.succ_size())
+ return false;
+ return std::equal(MBB.succ_begin(), MBB.succ_end(), GuessedSuccs.begin());
+}
+
+void MIPrinter::print(const MachineBasicBlock &MBB) {
+ assert(MBB.getNumber() >= 0 && "Invalid MBB number");
+ MBB.printName(OS,
+ MachineBasicBlock::PrintNameIr |
+ MachineBasicBlock::PrintNameAttributes,
+ &MST);
+ OS << ":\n";
+
+ bool HasLineAttributes = false;
+ // Print the successors
+ bool canPredictProbs = canPredictBranchProbabilities(MBB);
+ // Even if the list of successors is empty, if we cannot guess it,
+ // we need to print it to tell the parser that the list is empty.
+  // This is needed because MIR models unreachable code as empty blocks
+  // with an empty successor list. If the parser saw such a block without
+  // an explicit successor list, it would guess that the code falls
+  // through.
+ if ((!MBB.succ_empty() && !SimplifyMIR) || !canPredictProbs ||
+ !canPredictSuccessors(MBB)) {
+ OS.indent(2) << "successors: ";
+ for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) {
+ if (I != MBB.succ_begin())
+ OS << ", ";
+ OS << printMBBReference(**I);
+ if (!SimplifyMIR || !canPredictProbs)
+ OS << '('
+ << format("0x%08" PRIx32, MBB.getSuccProbability(I).getNumerator())
+ << ')';
+ }
+ OS << "\n";
+ HasLineAttributes = true;
+ }
+
+ // Print the live in registers.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ if (!MBB.livein_empty()) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ OS.indent(2) << "liveins: ";
+ bool First = true;
+ for (const auto &LI : MBB.liveins_dbg()) {
+ if (!First)
+ OS << ", ";
+ First = false;
+ OS << printReg(LI.PhysReg, &TRI);
+ if (!LI.LaneMask.all())
+ OS << ":0x" << PrintLaneMask(LI.LaneMask);
+ }
+ OS << "\n";
+ HasLineAttributes = true;
+ }
+
+ if (HasLineAttributes)
+ OS << "\n";
+ bool IsInBundle = false;
+ for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; ++I) {
+ const MachineInstr &MI = *I;
+ if (IsInBundle && !MI.isInsideBundle()) {
+ OS.indent(2) << "}\n";
+ IsInBundle = false;
+ }
+ OS.indent(IsInBundle ? 4 : 2);
+ print(MI);
+ if (!IsInBundle && MI.getFlag(MachineInstr::BundledSucc)) {
+ OS << " {";
+ IsInBundle = true;
+ }
+ OS << "\n";
+ }
+ if (IsInBundle)
+ OS.indent(2) << "}\n";
+}
+
+void MIPrinter::print(const MachineInstr &MI) {
+ const auto *MF = MI.getMF();
+ const auto &MRI = MF->getRegInfo();
+ const auto &SubTarget = MF->getSubtarget();
+ const auto *TRI = SubTarget.getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ const auto *TII = SubTarget.getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ if (MI.isCFIInstruction())
+ assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction");
+
+ SmallBitVector PrintedTypes(8);
+ bool ShouldPrintRegisterTies = MI.hasComplexRegisterTies();
+ unsigned I = 0, E = MI.getNumOperands();
+ for (; I < E && MI.getOperand(I).isReg() && MI.getOperand(I).isDef() &&
+ !MI.getOperand(I).isImplicit();
+ ++I) {
+ if (I)
+ OS << ", ";
+ print(MI, I, TRI, TII, ShouldPrintRegisterTies,
+ MI.getTypeToPrint(I, PrintedTypes, MRI),
+ /*PrintDef=*/false);
+ }
+
+ if (I)
+ OS << " = ";
+ if (MI.getFlag(MachineInstr::FrameSetup))
+ OS << "frame-setup ";
+ if (MI.getFlag(MachineInstr::FrameDestroy))
+ OS << "frame-destroy ";
+ if (MI.getFlag(MachineInstr::FmNoNans))
+ OS << "nnan ";
+ if (MI.getFlag(MachineInstr::FmNoInfs))
+ OS << "ninf ";
+ if (MI.getFlag(MachineInstr::FmNsz))
+ OS << "nsz ";
+ if (MI.getFlag(MachineInstr::FmArcp))
+ OS << "arcp ";
+ if (MI.getFlag(MachineInstr::FmContract))
+ OS << "contract ";
+ if (MI.getFlag(MachineInstr::FmAfn))
+ OS << "afn ";
+ if (MI.getFlag(MachineInstr::FmReassoc))
+ OS << "reassoc ";
+ if (MI.getFlag(MachineInstr::NoUWrap))
+ OS << "nuw ";
+ if (MI.getFlag(MachineInstr::NoSWrap))
+ OS << "nsw ";
+ if (MI.getFlag(MachineInstr::IsExact))
+ OS << "exact ";
+ if (MI.getFlag(MachineInstr::NoFPExcept))
+ OS << "nofpexcept ";
+ if (MI.getFlag(MachineInstr::NoMerge))
+ OS << "nomerge ";
+ if (MI.getFlag(MachineInstr::Unpredictable))
+ OS << "unpredictable ";
+
+ OS << TII->getName(MI.getOpcode());
+ if (I < E)
+ OS << ' ';
+
+ bool NeedComma = false;
+ for (; I < E; ++I) {
+ if (NeedComma)
+ OS << ", ";
+ print(MI, I, TRI, TII, ShouldPrintRegisterTies,
+ MI.getTypeToPrint(I, PrintedTypes, MRI));
+ NeedComma = true;
+ }
+
+  // Print any optional symbols attached to this instruction as if they were
+ // operands.
+ if (MCSymbol *PreInstrSymbol = MI.getPreInstrSymbol()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " pre-instr-symbol ";
+ MachineOperand::printSymbol(OS, *PreInstrSymbol);
+ NeedComma = true;
+ }
+ if (MCSymbol *PostInstrSymbol = MI.getPostInstrSymbol()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " post-instr-symbol ";
+ MachineOperand::printSymbol(OS, *PostInstrSymbol);
+ NeedComma = true;
+ }
+ if (MDNode *HeapAllocMarker = MI.getHeapAllocMarker()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " heap-alloc-marker ";
+ HeapAllocMarker->printAsOperand(OS, MST);
+ NeedComma = true;
+ }
+ if (MDNode *PCSections = MI.getPCSections()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " pcsections ";
+ PCSections->printAsOperand(OS, MST);
+ NeedComma = true;
+ }
+ if (uint32_t CFIType = MI.getCFIType()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " cfi-type " << CFIType;
+ NeedComma = true;
+ }
+
+ if (auto Num = MI.peekDebugInstrNum()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " debug-instr-number " << Num;
+ NeedComma = true;
+ }
+
+ if (PrintLocations) {
+ if (const DebugLoc &DL = MI.getDebugLoc()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " debug-location ";
+ DL->printAsOperand(OS, MST);
+ }
+ }
+
+ if (!MI.memoperands_empty()) {
+ OS << " :: ";
+ const LLVMContext &Context = MF->getFunction().getContext();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ bool NeedComma = false;
+ for (const auto *Op : MI.memoperands()) {
+ if (NeedComma)
+ OS << ", ";
+ Op->print(OS, MST, SSNs, Context, &MFI, TII);
+ NeedComma = true;
+ }
+ }
+}
+
+void MIPrinter::printStackObjectReference(int FrameIndex) {
+ auto ObjectInfo = StackObjectOperandMapping.find(FrameIndex);
+ assert(ObjectInfo != StackObjectOperandMapping.end() &&
+ "Invalid frame index");
+ const FrameIndexOperand &Operand = ObjectInfo->second;
+ MachineOperand::printStackObjectReference(OS, Operand.ID, Operand.IsFixed,
+ Operand.Name);
+}
+
+static std::string formatOperandComment(std::string Comment) {
+ if (Comment.empty())
+ return Comment;
+ return std::string(" /* " + Comment + " */");
+}
+
+void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
+ const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII,
+ bool ShouldPrintRegisterTies, LLT TypeToPrint,
+ bool PrintDef) {
+ const MachineOperand &Op = MI.getOperand(OpIdx);
+ std::string MOComment = TII->createMIROperandComment(MI, Op, OpIdx, TRI);
+
+ switch (Op.getType()) {
+ case MachineOperand::MO_Immediate:
+ if (MI.isOperandSubregIdx(OpIdx)) {
+ MachineOperand::printTargetFlags(OS, Op);
+ MachineOperand::printSubRegIdx(OS, Op.getImm(), TRI);
+ break;
+ }
+ [[fallthrough]];
+ case MachineOperand::MO_Register:
+ case MachineOperand::MO_CImmediate:
+ case MachineOperand::MO_FPImmediate:
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_RegisterLiveOut:
+ case MachineOperand::MO_Metadata:
+ case MachineOperand::MO_MCSymbol:
+ case MachineOperand::MO_CFIIndex:
+ case MachineOperand::MO_IntrinsicID:
+ case MachineOperand::MO_Predicate:
+ case MachineOperand::MO_BlockAddress:
+ case MachineOperand::MO_DbgInstrRef:
+ case MachineOperand::MO_ShuffleMask: {
+ unsigned TiedOperandIdx = 0;
+ if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef())
+ TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx);
+ const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo();
+ Op.print(OS, MST, TypeToPrint, OpIdx, PrintDef, /*IsStandalone=*/false,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII);
+ OS << formatOperandComment(MOComment);
+ break;
+ }
+ case MachineOperand::MO_FrameIndex:
+ printStackObjectReference(Op.getIndex());
+ break;
+ case MachineOperand::MO_RegisterMask: {
+ auto RegMaskInfo = RegisterMaskIds.find(Op.getRegMask());
+ if (RegMaskInfo != RegisterMaskIds.end())
+ OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower();
+ else
+ printCustomRegMask(Op.getRegMask(), OS, TRI);
+ break;
+ }
+ }
+}
+
+void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V,
+ ModuleSlotTracker &MST) {
+ if (isa<GlobalValue>(V)) {
+ V.printAsOperand(OS, /*PrintType=*/false, MST);
+ return;
+ }
+ if (isa<Constant>(V)) {
+ // Machine memory operands can load/store to/from constant value pointers.
+ OS << '`';
+ V.printAsOperand(OS, /*PrintType=*/true, MST);
+ OS << '`';
+ return;
+ }
+ OS << "%ir.";
+ if (V.hasName()) {
+ printLLVMNameWithoutPrefix(OS, V.getName());
+ return;
+ }
+ int Slot = MST.getCurrentFunction() ? MST.getLocalSlot(&V) : -1;
+ MachineOperand::printIRSlotNumber(OS, Slot);
+}
+
+void llvm::printMIR(raw_ostream &OS, const Module &M) {
+ yaml::Output Out(OS);
+ Out << const_cast<Module &>(M);
+}
+
+void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) {
+ MIRPrinter Printer(OS);
+ Printer.print(MF);
+}
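+
+// Usage sketch (illustrative; dumpFunctionMIR is a hypothetical helper, not
+// part of this file): a pass that already has a MachineFunction can emit its
+// textual MIR directly, while printMIR(OS, M) above prints only the embedded
+// LLVM IR module:
+//
+//   static void dumpFunctionMIR(const MachineFunction &MF) {
+//     printMIR(llvm::errs(), MF);
+//   }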
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRPrintingPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRPrintingPass.cpp
new file mode 100644
index 000000000000..1b5a9ade0871
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRPrintingPass.cpp
@@ -0,0 +1,70 @@
+//===- MIRPrintingPass.cpp - Pass that prints out using the MIR format ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that prints out the LLVM module using the MIR
+// serialization format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// This pass prints out the LLVM IR module and its machine functions to an
+/// output stream using the MIR serialization format.
+struct MIRPrintingPass : public MachineFunctionPass {
+ static char ID;
+ raw_ostream &OS;
+ std::string MachineFunctions;
+
+ MIRPrintingPass() : MachineFunctionPass(ID), OS(dbgs()) {}
+ MIRPrintingPass(raw_ostream &OS) : MachineFunctionPass(ID), OS(OS) {}
+
+ StringRef getPassName() const override { return "MIR Printing Pass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ std::string Str;
+ raw_string_ostream StrOS(Str);
+ printMIR(StrOS, MF);
+ MachineFunctions.append(StrOS.str());
+ return false;
+ }
+
+ bool doFinalization(Module &M) override {
+ printMIR(OS, M);
+ OS << MachineFunctions;
+ return false;
+ }
+};
+
+char MIRPrintingPass::ID = 0;
+
+} // end anonymous namespace
+
+char &llvm::MIRPrintingPassID = MIRPrintingPass::ID;
+INITIALIZE_PASS(MIRPrintingPass, "mir-printer", "MIR Printer", false, false)
+
+namespace llvm {
+
+MachineFunctionPass *createPrintMIRPass(raw_ostream &OS) {
+ return new MIRPrintingPass(OS);
+}
+
+} // end namespace llvm
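+
+// Note (an informal sketch, not upstream documentation): this pass produces a
+// complete MIR document: doFinalization() prints the embedded IR module first,
+// followed by the machine functions accumulated by runOnMachineFunction().
+// A driver holding a raw_ostream OS could schedule it with something like:
+//   PM.add(createPrintMIRPass(OS)); // PM: a legacy pass manager running codegen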
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp
new file mode 100644
index 000000000000..96f8589e682d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRSampleProfile.cpp
@@ -0,0 +1,406 @@
+//===-------- MIRSampleProfile.cpp: MIRSampleFDO (For FSAFDO) -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the implementation of the MIRSampleProfile loader, mainly
+// for flow sensitive SampleFDO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRSampleProfile.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PseudoProbe.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"
+#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
+#include <optional>
+
+using namespace llvm;
+using namespace sampleprof;
+using namespace llvm::sampleprofutil;
+using ProfileCount = Function::ProfileCount;
+
+#define DEBUG_TYPE "fs-profile-loader"
+
+static cl::opt<bool> ShowFSBranchProb(
+ "show-fs-branchprob", cl::Hidden, cl::init(false),
+ cl::desc("Print setting flow sensitive branch probabilities"));
+static cl::opt<unsigned> FSProfileDebugProbDiffThreshold(
+ "fs-profile-debug-prob-diff-threshold", cl::init(10),
+    cl::desc("Only show debug message if the branch probability is greater than "
+ "this value (in percentage)."));
+
+static cl::opt<unsigned> FSProfileDebugBWThreshold(
+    "fs-profile-debug-bw-threshold", cl::init(10000),
+    cl::desc("Only show debug message if the source branch weight is greater "
+             "than this value."));
+
+static cl::opt<bool> ViewBFIBefore("fs-viewbfi-before", cl::Hidden,
+ cl::init(false),
+ cl::desc("View BFI before MIR loader"));
+static cl::opt<bool> ViewBFIAfter("fs-viewbfi-after", cl::Hidden,
+ cl::init(false),
+ cl::desc("View BFI after MIR loader"));
+
+extern cl::opt<bool> ImprovedFSDiscriminator;
+char MIRProfileLoaderPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MIRProfileLoaderPass, DEBUG_TYPE,
+ "Load MIR Sample Profile",
+ /* cfg = */ false, /* is_analysis = */ false)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
+INITIALIZE_PASS_END(MIRProfileLoaderPass, DEBUG_TYPE, "Load MIR Sample Profile",
+ /* cfg = */ false, /* is_analysis = */ false)
+
+char &llvm::MIRProfileLoaderPassID = MIRProfileLoaderPass::ID;
+
+FunctionPass *
+llvm::createMIRProfileLoaderPass(std::string File, std::string RemappingFile,
+ FSDiscriminatorPass P,
+ IntrusiveRefCntPtr<vfs::FileSystem> FS) {
+ return new MIRProfileLoaderPass(File, RemappingFile, P, std::move(FS));
+}
+
+namespace llvm {
+
+// Internal option used to control BFI display only after MBP pass.
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi={none | fraction | integer | count}
+extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
+std::optional<PseudoProbe> extractProbe(const MachineInstr &MI) {
+ if (MI.isPseudoProbe()) {
+ PseudoProbe Probe;
+ Probe.Id = MI.getOperand(1).getImm();
+ Probe.Type = MI.getOperand(2).getImm();
+ Probe.Attr = MI.getOperand(3).getImm();
+ Probe.Factor = 1;
+ DILocation *DebugLoc = MI.getDebugLoc();
+ Probe.Discriminator = DebugLoc ? DebugLoc->getDiscriminator() : 0;
+ return Probe;
+ }
+
+ // Ignore callsite probes since they do not have FS discriminators.
+ return std::nullopt;
+}
+
+namespace afdo_detail {
+template <> struct IRTraits<MachineBasicBlock> {
+ using InstructionT = MachineInstr;
+ using BasicBlockT = MachineBasicBlock;
+ using FunctionT = MachineFunction;
+ using BlockFrequencyInfoT = MachineBlockFrequencyInfo;
+ using LoopT = MachineLoop;
+ using LoopInfoPtrT = MachineLoopInfo *;
+ using DominatorTreePtrT = MachineDominatorTree *;
+ using PostDominatorTreePtrT = MachinePostDominatorTree *;
+ using PostDominatorTreeT = MachinePostDominatorTree;
+ using OptRemarkEmitterT = MachineOptimizationRemarkEmitter;
+ using OptRemarkAnalysisT = MachineOptimizationRemarkAnalysis;
+ using PredRangeT = iterator_range<std::vector<MachineBasicBlock *>::iterator>;
+ using SuccRangeT = iterator_range<std::vector<MachineBasicBlock *>::iterator>;
+ static Function &getFunction(MachineFunction &F) { return F.getFunction(); }
+ static const MachineBasicBlock *getEntryBB(const MachineFunction *F) {
+ return GraphTraits<const MachineFunction *>::getEntryNode(F);
+ }
+ static PredRangeT getPredecessors(MachineBasicBlock *BB) {
+ return BB->predecessors();
+ }
+ static SuccRangeT getSuccessors(MachineBasicBlock *BB) {
+ return BB->successors();
+ }
+};
+} // namespace afdo_detail
+
+class MIRProfileLoader final
+ : public SampleProfileLoaderBaseImpl<MachineFunction> {
+public:
+ void setInitVals(MachineDominatorTree *MDT, MachinePostDominatorTree *MPDT,
+ MachineLoopInfo *MLI, MachineBlockFrequencyInfo *MBFI,
+ MachineOptimizationRemarkEmitter *MORE) {
+ DT = MDT;
+ PDT = MPDT;
+ LI = MLI;
+ BFI = MBFI;
+ ORE = MORE;
+ }
+ void setFSPass(FSDiscriminatorPass Pass) {
+ P = Pass;
+ LowBit = getFSPassBitBegin(P);
+ HighBit = getFSPassBitEnd(P);
+ assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit");
+ }
+
+ MIRProfileLoader(StringRef Name, StringRef RemapName,
+ IntrusiveRefCntPtr<vfs::FileSystem> FS)
+ : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName),
+ std::move(FS)) {}
+
+ void setBranchProbs(MachineFunction &F);
+ bool runOnFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool isValid() const { return ProfileIsValid; }
+
+protected:
+ friend class SampleCoverageTracker;
+
+  /// Holds the basic block frequency information.
+ MachineBlockFrequencyInfo *BFI;
+
+  /// The sequence number with which this pass is invoked, starting from 1.
+ FSDiscriminatorPass P;
+
+ // LowBit in the FS discriminator used by this instance. Note the number is
+  // 0-based. The base discriminator uses bit 0 to bit 11.
+ unsigned LowBit;
+  // HighBit in the FS discriminator used by this instance. Note the number
+ // is 0-based.
+ unsigned HighBit;
+
+ bool ProfileIsValid = true;
+ ErrorOr<uint64_t> getInstWeight(const MachineInstr &MI) override {
+ if (FunctionSamples::ProfileIsProbeBased)
+ return getProbeWeight(MI);
+ if (ImprovedFSDiscriminator && MI.isMetaInstruction())
+ return std::error_code();
+ return getInstWeightImpl(MI);
+ }
+};
+
+template <>
+void SampleProfileLoaderBaseImpl<MachineFunction>::computeDominanceAndLoopInfo(
+ MachineFunction &F) {}
+
+void MIRProfileLoader::setBranchProbs(MachineFunction &F) {
+ LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch probs\n");
+ for (auto &BI : F) {
+ MachineBasicBlock *BB = &BI;
+ if (BB->succ_size() < 2)
+ continue;
+ const MachineBasicBlock *EC = EquivalenceClass[BB];
+ uint64_t BBWeight = BlockWeights[EC];
+ uint64_t SumEdgeWeight = 0;
+ for (MachineBasicBlock *Succ : BB->successors()) {
+ Edge E = std::make_pair(BB, Succ);
+ SumEdgeWeight += EdgeWeights[E];
+ }
+
+ if (BBWeight != SumEdgeWeight) {
+      LLVM_DEBUG(dbgs() << "BBWeight is not equal to SumEdgeWeight: BBWeight="
+ << BBWeight << " SumEdgeWeight= " << SumEdgeWeight
+ << "\n");
+ BBWeight = SumEdgeWeight;
+ }
+ if (BBWeight == 0) {
+ LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+ continue;
+ }
+
+#ifndef NDEBUG
+ uint64_t BBWeightOrig = BBWeight;
+#endif
+ uint32_t MaxWeight = std::numeric_limits<uint32_t>::max();
+ uint32_t Factor = 1;
+ if (BBWeight > MaxWeight) {
+ Factor = BBWeight / MaxWeight + 1;
+ BBWeight /= Factor;
+ LLVM_DEBUG(dbgs() << "Scaling weights by " << Factor << "\n");
+ }
+
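+    // For example (illustrative numbers): a BBWeight of 6,000,000,000 exceeds
+    // UINT32_MAX, giving Factor = 2; BBWeight becomes 3,000,000,000 and every
+    // EdgeWeight in the loop below is halved as well, approximately preserving
+    // the EdgeWeight/BBWeight ratios that feed BranchProbability(EdgeWeight,
+    // BBWeight).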
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end();
+ SI != SE; ++SI) {
+ MachineBasicBlock *Succ = *SI;
+ Edge E = std::make_pair(BB, Succ);
+ uint64_t EdgeWeight = EdgeWeights[E];
+ EdgeWeight /= Factor;
+
+      assert(BBWeight >= EdgeWeight &&
+             "EdgeWeight is larger than BBWeight -- should not happen.\n");
+
+ BranchProbability OldProb = BFI->getMBPI()->getEdgeProbability(BB, SI);
+ BranchProbability NewProb(EdgeWeight, BBWeight);
+ if (OldProb == NewProb)
+ continue;
+ BB->setSuccProbability(SI, NewProb);
+#ifndef NDEBUG
+ if (!ShowFSBranchProb)
+ continue;
+ bool Show = false;
+ BranchProbability Diff;
+ if (OldProb > NewProb)
+ Diff = OldProb - NewProb;
+ else
+ Diff = NewProb - OldProb;
+ Show = (Diff >= BranchProbability(FSProfileDebugProbDiffThreshold, 100));
+ Show &= (BBWeightOrig >= FSProfileDebugBWThreshold);
+
+ auto DIL = BB->findBranchDebugLoc();
+ auto SuccDIL = Succ->findBranchDebugLoc();
+ if (Show) {
+ dbgs() << "Set branch fs prob: MBB (" << BB->getNumber() << " -> "
+ << Succ->getNumber() << "): ";
+ if (DIL)
+ dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn();
+ if (SuccDIL)
+ dbgs() << "-->" << SuccDIL->getFilename() << ":" << SuccDIL->getLine()
+ << ":" << SuccDIL->getColumn();
+ dbgs() << " W=" << BBWeightOrig << " " << OldProb << " --> " << NewProb
+ << "\n";
+ }
+#endif
+ }
+ }
+}
+
+bool MIRProfileLoader::doInitialization(Module &M) {
+ auto &Ctx = M.getContext();
+
+ auto ReaderOrErr = sampleprof::SampleProfileReader::create(
+ Filename, Ctx, *FS, P, RemappingFilename);
+ if (std::error_code EC = ReaderOrErr.getError()) {
+ std::string Msg = "Could not open profile: " + EC.message();
+ Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+ return false;
+ }
+
+ Reader = std::move(ReaderOrErr.get());
+ Reader->setModule(&M);
+ ProfileIsValid = (Reader->read() == sampleprof_error::success);
+
+ // Load pseudo probe descriptors for probe-based function samples.
+ if (Reader->profileIsProbeBased()) {
+ ProbeManager = std::make_unique<PseudoProbeManager>(M);
+ if (!ProbeManager->moduleIsProbed(M)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool MIRProfileLoader::runOnFunction(MachineFunction &MF) {
+ // Do not load non-FS profiles. A line or probe can get a zero-valued
+  // discriminator in a certain pass, which could result in accidentally loading
+ // the corresponding base counter in the non-FS profile, while a non-zero
+ // discriminator would end up getting zero samples. This could in turn undo
+ // the sample distribution effort done by previous BFI maintenance and the
+ // probe distribution factor work for pseudo probes.
+ if (!Reader->profileIsFS())
+ return false;
+
+ Function &Func = MF.getFunction();
+ clearFunctionData(false);
+ Samples = Reader->getSamplesFor(Func);
+ if (!Samples || Samples->empty())
+ return false;
+
+ if (FunctionSamples::ProfileIsProbeBased) {
+ if (!ProbeManager->profileIsValid(MF.getFunction(), *Samples))
+ return false;
+ } else {
+ if (getFunctionLoc(MF) == 0)
+ return false;
+ }
+
+ DenseSet<GlobalValue::GUID> InlinedGUIDs;
+ bool Changed = computeAndPropagateWeights(MF, InlinedGUIDs);
+
+ // Set the new BPI, BFI.
+ setBranchProbs(MF);
+
+ return Changed;
+}
+
+} // namespace llvm
+
+MIRProfileLoaderPass::MIRProfileLoaderPass(
+ std::string FileName, std::string RemappingFileName, FSDiscriminatorPass P,
+ IntrusiveRefCntPtr<vfs::FileSystem> FS)
+ : MachineFunctionPass(ID), ProfileFileName(FileName), P(P) {
+ LowBit = getFSPassBitBegin(P);
+ HighBit = getFSPassBitEnd(P);
+
+ auto VFS = FS ? std::move(FS) : vfs::getRealFileSystem();
+ MIRSampleLoader = std::make_unique<MIRProfileLoader>(
+ FileName, RemappingFileName, std::move(VFS));
+ assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit");
+}
+
+bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) {
+ if (!MIRSampleLoader->isValid())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "MIRProfileLoader pass working on Func: "
+ << MF.getFunction().getName() << "\n");
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MIRSampleLoader->setInitVals(
+ &getAnalysis<MachineDominatorTree>(),
+ &getAnalysis<MachinePostDominatorTree>(), &getAnalysis<MachineLoopInfo>(),
+ MBFI, &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE());
+
+ MF.RenumberBlocks();
+ if (ViewBFIBefore && ViewBlockLayoutWithBFI != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ MF.getFunction().getName().equals(ViewBlockFreqFuncName))) {
+ MBFI->view("MIR_Prof_loader_b." + MF.getName(), false);
+ }
+
+ bool Changed = MIRSampleLoader->runOnFunction(MF);
+ if (Changed)
+ MBFI->calculate(MF, *MBFI->getMBPI(), *&getAnalysis<MachineLoopInfo>());
+
+ if (ViewBFIAfter && ViewBlockLayoutWithBFI != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ MF.getFunction().getName().equals(ViewBlockFreqFuncName))) {
+ MBFI->view("MIR_prof_loader_a." + MF.getName(), false);
+ }
+
+ return Changed;
+}
+
+bool MIRProfileLoaderPass::doInitialization(Module &M) {
+ LLVM_DEBUG(dbgs() << "MIRProfileLoader pass working on Module " << M.getName()
+ << "\n");
+
+ MIRSampleLoader->setFSPass(P);
+ return MIRSampleLoader->doInitialization(M);
+}
+
+void MIRProfileLoaderPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequiredTransitive<MachineLoopInfo>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
new file mode 100644
index 000000000000..812d57984e6c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
@@ -0,0 +1,174 @@
+//===---------- MIRVRegNamerUtils.cpp - MIR VReg Renaming Utilities -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MIRVRegNamerUtils.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineStableHash.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mir-vregnamer-utils"
+
+static cl::opt<bool>
+ UseStableNamerHash("mir-vreg-namer-use-stable-hash", cl::init(false),
+ cl::Hidden,
+ cl::desc("Use Stable Hashing for MIR VReg Renaming"));
+
+using VRegRenameMap = std::map<unsigned, unsigned>;
+
+bool VRegRenamer::doVRegRenaming(const VRegRenameMap &VRM) {
+ bool Changed = false;
+
+ for (const auto &E : VRM) {
+ Changed = Changed || !MRI.reg_empty(E.first);
+ MRI.replaceRegWith(E.first, E.second);
+ }
+
+ return Changed;
+}
+
+VRegRenameMap
+VRegRenamer::getVRegRenameMap(const std::vector<NamedVReg> &VRegs) {
+
+ StringMap<unsigned> VRegNameCollisionMap;
+
+ auto GetUniqueVRegName = [&VRegNameCollisionMap](const NamedVReg &Reg) {
+ if (!VRegNameCollisionMap.contains(Reg.getName()))
+ VRegNameCollisionMap[Reg.getName()] = 0;
+ const unsigned Counter = ++VRegNameCollisionMap[Reg.getName()];
+ return Reg.getName() + "__" + std::to_string(Counter);
+ };
+
+ VRegRenameMap VRM;
+ for (const auto &VReg : VRegs) {
+ const unsigned Reg = VReg.getReg();
+ VRM[Reg] = createVirtualRegisterWithLowerName(Reg, GetUniqueVRegName(VReg));
+ }
+ return VRM;
+}
+
+std::string VRegRenamer::getInstructionOpcodeHash(MachineInstr &MI) {
+ std::string S;
+ raw_string_ostream OS(S);
+
+ if (UseStableNamerHash) {
+ auto Hash = stableHashValue(MI, /* HashVRegs */ true,
+ /* HashConstantPoolIndices */ true,
+ /* HashMemOperands */ true);
+ assert(Hash && "Expected non-zero Hash");
+ OS << format_hex_no_prefix(Hash, 16, true);
+ return OS.str();
+ }
+
+  // Get a hashable artifact from a given MachineOperand (i.e., an unsigned).
+ auto GetHashableMO = [this](const MachineOperand &MO) -> unsigned {
+ switch (MO.getType()) {
+ case MachineOperand::MO_CImmediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(),
+ MO.getCImm()->getZExtValue());
+ case MachineOperand::MO_FPImmediate:
+ return hash_combine(
+ MO.getType(), MO.getTargetFlags(),
+ MO.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
+ case MachineOperand::MO_Register:
+ if (MO.getReg().isVirtual())
+ return MRI.getVRegDef(MO.getReg())->getOpcode();
+ return MO.getReg();
+ case MachineOperand::MO_Immediate:
+ return MO.getImm();
+ case MachineOperand::MO_TargetIndex:
+ return MO.getOffset() | (MO.getTargetFlags() << 16);
+ case MachineOperand::MO_FrameIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ return llvm::hash_value(MO);
+
+    // We could explicitly handle all the types of the MachineOperand here,
+    // but we can just return a common number until we find a compelling
+    // test case where this is bad. The only side effect here is contributing
+    // to a hash collision, but there's enough information (opcodes, other
+    // registers, etc.) that this will likely not be a problem.
+
+ // TODO: Handle the following Index/ID/Predicate cases. They can
+ // be hashed on in a stable manner.
+ case MachineOperand::MO_CFIIndex:
+ case MachineOperand::MO_IntrinsicID:
+ case MachineOperand::MO_Predicate:
+
+    // In the cases below we haven't found a way to produce an artifact that
+    // will result in a stable hash, in most cases because they are pointers.
+    // We want stable hashes because we want the hash to be the same run to
+    // run.
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_BlockAddress:
+ case MachineOperand::MO_RegisterMask:
+ case MachineOperand::MO_RegisterLiveOut:
+ case MachineOperand::MO_Metadata:
+ case MachineOperand::MO_MCSymbol:
+ case MachineOperand::MO_ShuffleMask:
+ case MachineOperand::MO_DbgInstrRef:
+ return 0;
+ }
+ llvm_unreachable("Unexpected MachineOperandType.");
+ };
+
+ SmallVector<unsigned, 16> MIOperands = {MI.getOpcode(), MI.getFlags()};
+ llvm::transform(MI.uses(), std::back_inserter(MIOperands), GetHashableMO);
+
+ for (const auto *Op : MI.memoperands()) {
+ MIOperands.push_back((unsigned)Op->getSize());
+ MIOperands.push_back((unsigned)Op->getFlags());
+ MIOperands.push_back((unsigned)Op->getOffset());
+ MIOperands.push_back((unsigned)Op->getSuccessOrdering());
+ MIOperands.push_back((unsigned)Op->getAddrSpace());
+ MIOperands.push_back((unsigned)Op->getSyncScopeID());
+ MIOperands.push_back((unsigned)Op->getBaseAlign().value());
+ MIOperands.push_back((unsigned)Op->getFailureOrdering());
+ }
+
+ auto HashMI = hash_combine_range(MIOperands.begin(), MIOperands.end());
+ OS << format_hex_no_prefix(HashMI, 16, true);
+ return OS.str();
+}
+
+unsigned VRegRenamer::createVirtualRegister(unsigned VReg) {
+ assert(Register::isVirtualRegister(VReg) && "Expected Virtual Registers");
+ std::string Name = getInstructionOpcodeHash(*MRI.getVRegDef(VReg));
+ return createVirtualRegisterWithLowerName(VReg, Name);
+}
+
+bool VRegRenamer::renameInstsInMBB(MachineBasicBlock *MBB) {
+ std::vector<NamedVReg> VRegs;
+ std::string Prefix = "bb" + std::to_string(CurrentBBNumber) + "_";
+ for (MachineInstr &Candidate : *MBB) {
+ // Don't rename stores/branches.
+ if (Candidate.mayStore() || Candidate.isBranch())
+ continue;
+ if (!Candidate.getNumOperands())
+ continue;
+ // Look for instructions that define VRegs in operand 0.
+ MachineOperand &MO = Candidate.getOperand(0);
+    // Skip non-register operands and instructions that define physical regs.
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+ VRegs.push_back(
+ NamedVReg(MO.getReg(), Prefix + getInstructionOpcodeHash(Candidate)));
+ }
+
+ return VRegs.size() ? doVRegRenaming(getVRegRenameMap(VRegs)) : false;
+}
+
+unsigned VRegRenamer::createVirtualRegisterWithLowerName(unsigned VReg,
+ StringRef Name) {
+ std::string LowerName = Name.lower();
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(VReg);
+ return RC ? MRI.createVirtualRegister(RC, LowerName)
+ : MRI.createGenericVirtualRegister(MRI.getType(VReg), LowerName);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRVRegNamerUtils.h b/contrib/llvm-project/llvm/lib/CodeGen/MIRVRegNamerUtils.h
new file mode 100644
index 000000000000..a059bc5333c6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRVRegNamerUtils.h
@@ -0,0 +1,97 @@
+
+//===------------ MIRVRegNamerUtils.h - MIR VReg Renaming Utilities -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of these utilities is to abstract out parts of the MIRCanon pass
+// that are responsible for renaming virtual registers with the purpose of
+// sharing code with a MIRVRegNamer pass that could be the analog of the
+// opt -instnamer pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_MIRVREGNAMERUTILS_H
+#define LLVM_LIB_CODEGEN_MIRVREGNAMERUTILS_H
+
+#include "llvm/CodeGen/Register.h"
+#include <map>
+#include <vector>
+#include <string>
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineInstr;
+class MachineRegisterInfo;
+class StringRef;
+
+/// VRegRenamer - This class is used for renaming vregs in a machine basic
+/// block according to the semantics of the instruction.
+class VRegRenamer {
+ class NamedVReg {
+ Register Reg;
+ std::string Name;
+
+ public:
+ NamedVReg(Register Reg, std::string Name = "") : Reg(Reg), Name(Name) {}
+ NamedVReg(std::string Name = "") : Reg(~0U), Name(Name) {}
+
+ const std::string &getName() const { return Name; }
+
+ Register getReg() const { return Reg; }
+ };
+
+ MachineRegisterInfo &MRI;
+
+ unsigned CurrentBBNumber = 0;
+
+  /// Given an instruction, construct a hash of its operands
+  /// along with its opcode.
+ /// When dealing with virtual registers, just hash the opcode of
+ /// the instruction defining that vreg.
+ /// Handle immediates, registers (physical and virtual) explicitly,
+ /// and return a common value for the other cases.
+  /// Instructions will be named in the following scheme:
+  /// bb<block_no>_hash_<collision_count>.
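+  ///
+  /// As a concrete illustration (derived from this implementation, not a
+  /// normative format): an instruction in basic block 1 whose operand hash
+  /// prints as 202a55ad32905572 produces the candidate name
+  /// "bb1_202a55ad32905572"; getVRegRenameMap() then appends "__<N>" to
+  /// disambiguate collisions, so the final vreg is named e.g.
+  /// %bb1_202a55ad32905572__1.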
+ std::string getInstructionOpcodeHash(MachineInstr &MI);
+
+ /// For all the VRegs that are candidates for renaming,
+ /// return a mapping from old vregs to new vregs with names.
+ std::map<unsigned, unsigned>
+ getVRegRenameMap(const std::vector<NamedVReg> &VRegs);
+
+ /// Perform replacing of registers based on the <old,new> vreg map.
+ bool doVRegRenaming(const std::map<unsigned, unsigned> &VRegRenameMap);
+
+ /// createVirtualRegister - Given an existing vreg, create a named vreg to
+ /// take its place. The name is determined by calling
+ /// getInstructionOpcodeHash.
+ unsigned createVirtualRegister(unsigned VReg);
+
+ /// Create a vreg with name and return it.
+ unsigned createVirtualRegisterWithLowerName(unsigned VReg, StringRef Name);
+
+ /// Linearly traverse the MachineBasicBlock and rename each instruction's
+ /// vreg definition based on the semantics of the instruction.
+  /// Names are of the form bb<BBNum>_hash_[0-9]+.
+ bool renameInstsInMBB(MachineBasicBlock *MBB);
+
+public:
+ VRegRenamer() = delete;
+ VRegRenamer(MachineRegisterInfo &MRI) : MRI(MRI) {}
+
+  /// Rename the vreg definitions in \p MBB, using \p BBNum (taken from the
+  /// BB traversal order) as the prefix for the new vreg names.
+ bool renameVRegs(MachineBasicBlock *MBB, unsigned BBNum) {
+ CurrentBBNumber = BBNum;
+ return renameInstsInMBB(MBB);
+ }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MIRYamlMapping.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MIRYamlMapping.cpp
new file mode 100644
index 000000000000..b1a538cad8a0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MIRYamlMapping.cpp
@@ -0,0 +1,43 @@
+//===- MIRYamlMapping.cpp - Describe mapping between MIR and YAML ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the mapping between various MIR data structures and
+// their corresponding YAML representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace llvm;
+using namespace llvm::yaml;
+
+FrameIndex::FrameIndex(int FI, const llvm::MachineFrameInfo &MFI) {
+ IsFixed = MFI.isFixedObjectIndex(FI);
+ if (IsFixed)
+ FI -= MFI.getObjectIndexBegin();
+ this->FI = FI;
+}
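+
+// Round-trip sketch (hypothetical frame layout): with two fixed objects,
+// getObjectIndexBegin() is -2, so fixed frame index -2 is encoded with
+// IsFixed == true and FI == 0; getFI() below adds getObjectIndexBegin() back
+// to recover -2. Non-fixed indices round-trip unchanged, subject to the
+// bounds checks.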
+
+// Returns the frame index, converting it back from its YAML encoding and
+// validating that it refers to an existing (fixed or non-fixed) object.
+Expected<int> FrameIndex::getFI(const llvm::MachineFrameInfo &MFI) const {
+ int FI = this->FI;
+ if (IsFixed) {
+ if (unsigned(FI) >= MFI.getNumFixedObjects())
+ return make_error<StringError>(
+ formatv("invalid fixed frame index {0}", FI).str(),
+ inconvertibleErrorCode());
+ FI += MFI.getObjectIndexBegin();
+ }
+ if (unsigned(FI + MFI.getNumFixedObjects()) >= MFI.getNumObjects())
+ return make_error<StringError>(formatv("invalid frame index {0}", FI).str(),
+ inconvertibleErrorCode());
+ return FI;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp
new file mode 100644
index 000000000000..7b3746fde503
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp
@@ -0,0 +1,1164 @@
+//===- MLRegAllocEvictAdvisor.cpp - ML eviction advisor -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the ML eviction advisor and reward injection pass
+//
+//===----------------------------------------------------------------------===//
+
+#include "AllocationOrder.h"
+#include "RegAllocEvictionAdvisor.h"
+#include "RegAllocGreedy.h"
+#include "llvm/Analysis/InteractiveModelRunner.h"
+#include "llvm/Analysis/MLModelRunner.h"
+#include "llvm/Analysis/TensorSpec.h"
+#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) || defined(LLVM_HAVE_TFLITE)
+#include "llvm/Analysis/ModelUnderTrainingRunner.h"
+#include "llvm/Analysis/NoInferenceModelRunner.h"
+#include "llvm/Analysis/Utils/TrainingLogger.h"
+#endif
+#include "MLRegallocEvictAdvisor.h"
+#include "llvm/Analysis/ReleaseModeModelRunner.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include <array>
+#include <bitset>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ml-regalloc"
+
+// Generated header in release (AOT) mode
+#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL)
+#include "RegallocEvictModel.h"
+using CompiledModelType = RegallocEvictModel;
+#else
+using CompiledModelType = NoopSavedModelImpl;
+#endif
+
+static cl::opt<std::string> InteractiveChannelBaseName(
+ "regalloc-evict-interactive-channel-base", cl::Hidden,
+ cl::desc(
+ "Base file path for the interactive mode. The incoming filename should "
+ "have the name <regalloc-evict-interactive-channel-base>.in, while the "
+ "outgoing name should be "
+ "<regalloc-evict-interactive-channel-base>.out"));
+
+// Options that only make sense in development mode
+#ifdef LLVM_HAVE_TFLITE
+#include "RegAllocScore.h"
+#include "llvm/Analysis/Utils/TFUtils.h"
+
+static cl::opt<std::string> TrainingLog(
+ "regalloc-training-log", cl::Hidden,
+ cl::desc("Training log for the register allocator eviction model"));
+
+static cl::opt<std::string> ModelUnderTraining(
+ "regalloc-model", cl::Hidden,
+ cl::desc("The model being trained for register allocation eviction"));
+
+static cl::opt<bool> EnableDevelopmentFeatures(
+ "regalloc-enable-development-features", cl::Hidden,
+ cl::desc("Whether or not to enable features under development for the ML "
+ "regalloc advisor"));
+
+#else
+static const bool EnableDevelopmentFeatures = false;
+#endif // #ifdef LLVM_HAVE_TFLITE
+
+/// The score injection pass.
+/// This pass calculates the score for a function and inserts it in the log, but
+/// this happens only in development mode. It's a no-op otherwise.
+namespace llvm {
+extern cl::opt<unsigned> EvictInterferenceCutoff;
+
+class RegAllocScoring : public MachineFunctionPass {
+public:
+ static char ID;
+
+ RegAllocScoring() : MachineFunctionPass(ID) {
+ initializeRegAllocScoringPass(*PassRegistry::getPassRegistry());
+ }
+
+ ~RegAllocScoring() override = default;
+
+ StringRef getPassName() const override {
+ return "Register Allocation Pass Scoring";
+ }
+
+ /// RegAllocReward analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<RegAllocEvictionAdvisorAnalysis>();
+ AU.addRequired<RegAllocPriorityAdvisorAnalysis>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// Performs this pass
+ bool runOnMachineFunction(MachineFunction &) override;
+};
+
+char RegAllocScoring::ID = 0;
+FunctionPass *createRegAllocScoringPass() { return new RegAllocScoring(); }
+
+} // namespace llvm
+
+INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass",
+ "Register Allocation Scoring Pass", false, false)
+
+// ===================================
+// Common ML Advisor declarations
+// ===================================
+namespace {
+// The model can only accept a specified number of opcodes and will error if
+// fed an opcode it hasn't seen before. This constant sets the current cutoff.
+static const int OpcodeValueCutoff = 17716;
+
+// Most features are laid out per live range (see the features table below),
+// so we'll reuse this shape vector when defining them.
+static const std::vector<int64_t> PerLiveRangeShape{1, NumberOfInterferences};
+
+// --------------
+// Features table
+// --------------
+// For each interfering live range (incl. the candidate) we collect a number of
+// features. However, because the features are of different types (and because
+// of ML best practices), we organize the tensors per feature, not per
+// candidate. Each such tensor has a scalar value corresponding to the
+// interfering live range at that position, in the order given by
+// AllocationOrder. The last position corresponds to the virt reg seeking
+// allocation. The exception to all that is the progression feature, which is
+// just a scalar (see its documentation for details).
+// Note on naming: the "_by_max" features are normalized using the largest
+// value of that tensor, as observed in the current decision making stage
+// (i.e. for the current call to the advisor's tryFindEvictionCandidate).
+//
+// The feature list format: type, name, shape, documentation.
+// Note: we can really just use int64 and float, hence the modeling of some
+// bools as int64 values.
+#define RA_EVICT_FEATURES_LIST(M) \
+ M(int64_t, mask, PerLiveRangeShape, \
+ "boolean values, 0 for unavailable candidates (i.e. if a position is 0, " \
+ "it " \
+ "can't be evicted)") \
+ M(int64_t, is_free, PerLiveRangeShape, \
+ "boolean values, 1 if this phys reg is actually free (no interferences)") \
+ M(float, nr_urgent, PerLiveRangeShape, \
+ "number of 'urgent' intervals, normalized. Urgent are those that are OK " \
+ "to break cascades") \
+ M(float, nr_broken_hints, PerLiveRangeShape, \
+ "if this position were evicted, how many broken hints would there be") \
+ M(int64_t, is_hint, PerLiveRangeShape, \
+ "is this a preferred phys reg for the candidate") \
+ M(int64_t, is_local, PerLiveRangeShape, \
+ "is this live range local to a basic block") \
+ M(float, nr_rematerializable, PerLiveRangeShape, \
+ "nr rematerializable ranges") \
+ M(float, nr_defs_and_uses, PerLiveRangeShape, \
+ "bb freq - weighed nr defs and uses") \
+ M(float, weighed_reads_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of reads, normalized") \
+ M(float, weighed_writes_by_max, PerLiveRangeShape, \
+ "bb feq - weighed nr of writes, normalized") \
+ M(float, weighed_read_writes_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are both read and writes, normalized") \
+ M(float, weighed_indvars_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are indvars, normalized") \
+ M(float, hint_weights_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are hints, normalized") \
+ M(float, start_bb_freq_by_max, PerLiveRangeShape, \
+ "the freq in the start block, normalized") \
+ M(float, end_bb_freq_by_max, PerLiveRangeShape, \
+ "freq of end block, normalized") \
+ M(float, hottest_bb_freq_by_max, PerLiveRangeShape, \
+ "hottest BB freq, normalized") \
+ M(float, liverange_size, PerLiveRangeShape, \
+ "size (instr index diff) of the LR") \
+ M(float, use_def_density, PerLiveRangeShape, \
+ "the max weight, as computed by the manual heuristic") \
+ M(int64_t, max_stage, PerLiveRangeShape, \
+ "largest stage of an interval in this LR") \
+ M(int64_t, min_stage, PerLiveRangeShape, \
+ "lowest stage of an interval in this LR") \
+ M(float, progress, {1}, "ratio of current queue size to initial size")
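+
+// For illustration, RA_EVICT_FEATURES_LIST(_DECL_FEATURES) (see below) turns
+// each M(...) entry into a TensorSpec, e.g. the first entry becomes
+// TensorSpec::createSpec<int64_t>("mask", PerLiveRangeShape); the same list
+// also drives the FeatureIDs enum and resetInputs().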
+
+#ifdef LLVM_HAVE_TFLITE
+#define RA_EVICT_FIRST_DEVELOPMENT_FEATURE(M) \
+ M(int64_t, instructions, InstructionsShape, \
+ "Opcodes of the instructions covered by the eviction problem")
+
+#define RA_EVICT_REST_DEVELOPMENT_FEATURES(M) \
+ M(int64_t, instructions_mapping, InstructionsMappingShape, \
+ "A binary matrix mapping LRs to instruction opcodes") \
+ M(float, mbb_frequencies, MBBFrequencyShape, \
+ "A vector of machine basic block frequencies") \
+ M(int64_t, mbb_mapping, InstructionsShape, \
+ "A vector of indicies mapping instructions to MBBs")
+#else
+#define RA_EVICT_FIRST_DEVELOPMENT_FEATURE(M)
+#define RA_EVICT_REST_DEVELOPMENT_FEATURES(M)
+#endif
+
+// The model learns to pick one of the mask == 1 interferences. This is the
+// name of the output tensor. The contract with the model is that the output
+// is guaranteed to be a mask == 1 position. Using a macro here to avoid
+// 'not used' warnings (and to keep conditional compilation to a minimum).
+#define DecisionName "index_to_evict"
+static const TensorSpec DecisionSpec =
+ TensorSpec::createSpec<int64_t>(DecisionName, {1});
+
+// Named features index.
+enum FeatureIDs {
+#define _FEATURE_IDX_SIMPLE(_, name, __, ___) name
+#define _FEATURE_IDX(A, B, C, D) _FEATURE_IDX_SIMPLE(A, B, C, D),
+ RA_EVICT_FEATURES_LIST(_FEATURE_IDX) FeatureCount,
+#ifdef LLVM_HAVE_TFLITE
+ RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_FEATURE_IDX_SIMPLE) = FeatureCount,
+#else
+ RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_FEATURE_IDX)
+#endif // #ifdef LLVM_HAVE_TFLITE
+ RA_EVICT_REST_DEVELOPMENT_FEATURES(_FEATURE_IDX) FeaturesWithDevelopmentCount
+#undef _FEATURE_IDX
+#undef _FEATURE_IDX_SIMPLE
+};
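+
+// Sketch of the expansion above: _FEATURE_IDX yields "mask, is_free, ...,
+// progress,", so FeatureIDs::mask == 0 and FeatureCount is the number of base
+// features; the development-only features, when compiled in, are appended
+// starting at FeatureCount.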
+
+// The ML advisor will typically have a sparse input to the evaluator, because
+// various phys regs won't be available. It's easier (maintenance-wise) to
+// bulk-reset the state of the evaluator each time we are about to use it
+// again.
+template <typename T> size_t getTotalSize(const std::vector<int64_t> &Shape) {
+ size_t Ret = sizeof(T);
+ for (const auto V : Shape)
+ Ret *= V;
+ return Ret;
+}
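+
+// Example (assuming the typical sizeof(float) == 4):
+// getTotalSize<float>(PerLiveRangeShape) is 4 * 1 * NumberOfInterferences,
+// i.e. 4 * 33 = 132 bytes per per-live-range tensor, which is what
+// resetInputs() below memsets to zero.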
+
+void resetInputs(MLModelRunner &Runner) {
+#define _RESET(TYPE, NAME, SHAPE, __) \
+ std::memset(Runner.getTensorUntyped(FeatureIDs::NAME), 0, \
+ getTotalSize<TYPE>(SHAPE));
+ RA_EVICT_FEATURES_LIST(_RESET)
+ if (EnableDevelopmentFeatures) {
+ RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_RESET)
+ RA_EVICT_REST_DEVELOPMENT_FEATURES(_RESET)
+#undef _RESET
+ }
+}
+
+// Per-live interval components that get aggregated into the feature values
+// that will be passed to the evaluator.
+struct LIFeatureComponents {
+ double R = 0;
+ double W = 0;
+ double RW = 0;
+ double IndVarUpdates = 0;
+ double HintWeights = 0.0;
+ int64_t NrDefsAndUses = 0;
+ float HottestBlockFreq = 0.0;
+ bool IsRemat = false;
+};
+
+using CandidateRegList =
+ std::array<std::pair<MCRegister, bool>, NumberOfInterferences>;
+using FeaturesListNormalizer =
+ llvm::SmallVector<float, FeatureIDs::FeatureCount>;
+
+/// The ML evictor (commonalities between release and development mode)
+class MLEvictAdvisor : public RegAllocEvictionAdvisor {
+public:
+ MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ MLModelRunner *Runner, const MachineBlockFrequencyInfo &MBFI,
+ const MachineLoopInfo &Loops);
+
+protected:
+ const RegAllocEvictionAdvisor &getDefaultAdvisor() const {
+ return static_cast<const RegAllocEvictionAdvisor &>(DefaultAdvisor);
+ }
+
+ // The assumption is that if the Runner could not be constructed, we already
+ // emitted an error, and we shouldn't be asking for it here.
+ const MLModelRunner &getRunner() const { return *Runner; }
+
+ /// This just calls Evaluate on the Runner, but in the development mode
+ /// case, if we're just capturing the log of the default advisor, it needs
+ /// to call the latter instead, so we need to pass all the necessary
+ /// parameters for it. In the development case, it will also log.
+ virtual int64_t
+ tryFindEvictionCandidatePosition(const LiveInterval &VirtReg,
+ const AllocationOrder &Order,
+ unsigned OrderLimit, uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) const;
+
+ /// Load the features of the given VirtReg (allocated or not) at column Pos,
+ /// returning false instead if the interference at that position can't be
+ /// evicted.
+ bool
+ loadInterferenceFeatures(const LiveInterval &VirtReg, MCRegister PhysReg,
+ bool IsHint, const SmallVirtRegSet &FixedRegisters,
+ llvm::SmallVectorImpl<float> &Largest, size_t Pos,
+ SmallVectorImpl<LRStartEndInfo> &LRPosInfo) const;
+
+private:
+ static float getInitialQueueSize(const MachineFunction &MF);
+
+ MCRegister tryFindEvictionCandidate(
+ const LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) const override;
+
+ void extractFeatures(const SmallVectorImpl<const LiveInterval *> &Intervals,
+ llvm::SmallVectorImpl<float> &Largest, size_t Pos,
+ int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent,
+ SmallVectorImpl<LRStartEndInfo> &LRPosInfo) const;
+
+ // Point-in-time: we didn't learn this, so we always delegate to the
+ // default.
+ bool canEvictHintInterference(
+ const LiveInterval &VirtReg, MCRegister PhysReg,
+ const SmallVirtRegSet &FixedRegisters) const override {
+ return getDefaultAdvisor().canEvictHintInterference(VirtReg, PhysReg,
+ FixedRegisters);
+ }
+
+ const LIFeatureComponents &
+ getLIFeatureComponents(const LiveInterval &LI) const;
+
+ // Hold on to a default advisor for:
+ // 1) the implementation of canEvictHintInterference, because we didn't
+ // learn that nuance yet; 2) for bootstrapping (logging) in the development
+ // mode case.
+ const DefaultEvictionAdvisor DefaultAdvisor;
+ MLModelRunner *const Runner;
+ const MachineBlockFrequencyInfo &MBFI;
+ const MachineLoopInfo &Loops;
+
+ // Indices of those features we don't want to normalize.
+ // This could be static and shared, but its initialization is non-trivial.
+ std::bitset<FeatureIDs::FeatureCount> DoNotNormalize;
+ const float InitialQSize;
+
+ using RegID = unsigned;
+ mutable DenseMap<RegID, LIFeatureComponents> CachedFeatures;
+};
+
+#define _DECL_FEATURES(type, name, shape, _) \
+ TensorSpec::createSpec<type>(#name, shape),
+
+// ===================================
+// Release (AOT) - specifics
+// ===================================
+class ReleaseModeEvictionAdvisorAnalysis final
+ : public RegAllocEvictionAdvisorAnalysis {
+public:
+ ReleaseModeEvictionAdvisorAnalysis()
+ : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) {
+ if (EnableDevelopmentFeatures) {
+ InputFeatures = {RA_EVICT_FEATURES_LIST(
+ _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES)
+ RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_FEATURES)};
+ } else {
+ InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)};
+ }
+ }
+ // support for isa<> and dyn_cast.
+ static bool classof(const RegAllocEvictionAdvisorAnalysis *R) {
+ return R->getAdvisorMode() == AdvisorMode::Release;
+ }
+
+private:
+ std::vector<TensorSpec> InputFeatures;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU);
+ }
+
+ std::unique_ptr<RegAllocEvictionAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+ if (!Runner) {
+ if (InteractiveChannelBaseName.empty())
+ Runner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>(
+ MF.getFunction().getContext(), InputFeatures, DecisionName);
+ else
+ Runner = std::make_unique<InteractiveModelRunner>(
+ MF.getFunction().getContext(), InputFeatures, DecisionSpec,
+ InteractiveChannelBaseName + ".out",
+ InteractiveChannelBaseName + ".in");
+ }
+ return std::make_unique<MLEvictAdvisor>(
+ MF, RA, Runner.get(), getAnalysis<MachineBlockFrequencyInfo>(),
+ getAnalysis<MachineLoopInfo>());
+ }
+ std::unique_ptr<MLModelRunner> Runner;
+};
+
+// ===================================
+// Development mode-specifics
+// ===================================
+//
+// Features we log
+#ifdef LLVM_HAVE_TFLITE
+static const TensorSpec Reward = TensorSpec::createSpec<float>("reward", {1});
+
+// Features we bind to the model. The tensor names have a prefix, and we also
+// need to include some tensors that the training algo expects to be present.
+// TODO: can we just get rid of these?
+#define _DECL_TRAIN_FEATURES(type, name, shape, _) \
+ TensorSpec::createSpec<type>(std::string("action_") + #name, shape),
+
+class DevelopmentModeEvictAdvisor : public MLEvictAdvisor {
+public:
+ DevelopmentModeEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ MLModelRunner *Runner,
+ const MachineBlockFrequencyInfo &MBFI,
+ const MachineLoopInfo &Loops, Logger *Log)
+ : MLEvictAdvisor(MF, RA, Runner, MBFI, Loops), Log(Log) {}
+
+private:
+ int64_t tryFindEvictionCandidatePosition(
+ const LiveInterval &VirtReg, const AllocationOrder &Order,
+ unsigned OrderLimit, uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) const override;
+
+ Logger *const Log;
+};
+
+class DevelopmentModeEvictionAdvisorAnalysis final
+ : public RegAllocEvictionAdvisorAnalysis {
+public:
+ DevelopmentModeEvictionAdvisorAnalysis()
+ : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) {
+ if (EnableDevelopmentFeatures) {
+ InputFeatures = {RA_EVICT_FEATURES_LIST(
+ _DECL_FEATURES) RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_FEATURES)
+ RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_FEATURES)};
+ TrainingInputFeatures = {
+ RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES)
+ RA_EVICT_FIRST_DEVELOPMENT_FEATURE(_DECL_TRAIN_FEATURES)
+ RA_EVICT_REST_DEVELOPMENT_FEATURES(_DECL_TRAIN_FEATURES)
+ TensorSpec::createSpec<float>("action_discount", {1}),
+ TensorSpec::createSpec<int32_t>("action_step_type", {1}),
+ TensorSpec::createSpec<float>("action_reward", {1})};
+ } else {
+ InputFeatures = {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)};
+ TrainingInputFeatures = {
+ RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES)
+ TensorSpec::createSpec<float>("action_discount", {1}),
+ TensorSpec::createSpec<int32_t>("action_step_type", {1}),
+ TensorSpec::createSpec<float>("action_reward", {1})};
+ }
+ }
+ // support for isa<> and dyn_cast.
+ static bool classof(const RegAllocEvictionAdvisorAnalysis *R) {
+ return R->getAdvisorMode() == AdvisorMode::Development;
+ }
+
+ void logRewardIfNeeded(const MachineFunction &MF,
+ llvm::function_ref<float()> GetReward) override {
+ if (!Log || !Log->hasAnyObservationForContext(MF.getName()))
+ return;
+ // The function pass manager would run all the function passes for a
+ // function, so we assume the last context belongs to this function. If
+ // this invariant ever changes, we can implement context switching at that
+ // time. At this point, it'd be an error.
+ if (Log->currentContext() != MF.getName()) {
+ MF.getFunction().getContext().emitError(
+ "The training log context shouldn't have had changed.");
+ }
+ if (Log->hasObservationInProgress())
+ Log->logReward<float>(GetReward());
+ }
+
+private:
+ std::vector<TensorSpec> InputFeatures;
+ std::vector<TensorSpec> TrainingInputFeatures;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU);
+ }
+
+ bool doInitialization(Module &M) override {
+ LLVMContext &Ctx = M.getContext();
+ if (ModelUnderTraining.empty() && TrainingLog.empty()) {
+ Ctx.emitError("Regalloc development mode should be requested with at "
+ "least logging enabled and/or a training model");
+ return false;
+ }
+ if (ModelUnderTraining.empty())
+ Runner = std::make_unique<NoInferenceModelRunner>(Ctx, InputFeatures);
+ else
+ Runner = ModelUnderTrainingRunner::createAndEnsureValid(
+ Ctx, ModelUnderTraining, DecisionName, TrainingInputFeatures);
+ if (!Runner) {
+ Ctx.emitError("Regalloc: could not set up the model runner");
+ return false;
+ }
+ if (TrainingLog.empty())
+ return false;
+ std::error_code EC;
+ auto OS = std::make_unique<raw_fd_ostream>(TrainingLog, EC);
+ if (EC) {
+ M.getContext().emitError(EC.message() + ":" + TrainingLog);
+ return false;
+ }
+ std::vector<TensorSpec> LFS = InputFeatures;
+ if (auto *MUTR = dyn_cast<ModelUnderTrainingRunner>(Runner.get()))
+ append_range(LFS, MUTR->extraOutputsForLoggingSpecs());
+ // We always log the output; in particular, if we're not evaluating, we
+ // don't have an output spec json file. That's why we handle the
+ // 'normal' output separately.
+ LFS.push_back(DecisionSpec);
+
+ Log = std::make_unique<Logger>(std::move(OS), LFS, Reward,
+ /*IncludeReward*/ true);
+ return false;
+ }
+
+ std::unique_ptr<RegAllocEvictionAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+ if (!Runner)
+ return nullptr;
+ if (Log)
+ Log->switchContext(MF.getName());
+ return std::make_unique<DevelopmentModeEvictAdvisor>(
+ MF, RA, Runner.get(), getAnalysis<MachineBlockFrequencyInfo>(),
+ getAnalysis<MachineLoopInfo>(), Log.get());
+ }
+
+ std::unique_ptr<MLModelRunner> Runner;
+ std::unique_ptr<Logger> Log;
+};
+
+#endif // #ifdef LLVM_HAVE_TFLITE
+} // namespace
+
+float MLEvictAdvisor::getInitialQueueSize(const MachineFunction &MF) {
+ auto &MRI = MF.getRegInfo();
+ float Ret = 0.0;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ ++Ret;
+ }
+ return Ret;
+}
+
+MLEvictAdvisor::MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ MLModelRunner *Runner,
+ const MachineBlockFrequencyInfo &MBFI,
+ const MachineLoopInfo &Loops)
+ : RegAllocEvictionAdvisor(MF, RA), DefaultAdvisor(MF, RA),
+ Runner(std::move(Runner)), MBFI(MBFI), Loops(Loops),
+ InitialQSize(MLEvictAdvisor::getInitialQueueSize(MF)) {
+ assert(this->Runner);
+ Runner->switchContext(MF.getName());
+ DoNotNormalize.set(FeatureIDs::mask);
+ DoNotNormalize.set(FeatureIDs::is_free);
+ DoNotNormalize.set(FeatureIDs::is_hint);
+ DoNotNormalize.set(FeatureIDs::is_local);
+ DoNotNormalize.set(FeatureIDs::min_stage);
+ DoNotNormalize.set(FeatureIDs::max_stage);
+ DoNotNormalize.set(FeatureIDs::progress);
+}
+
+int64_t MLEvictAdvisor::tryFindEvictionCandidatePosition(
+ const LiveInterval &, const AllocationOrder &, unsigned, uint8_t,
+ const SmallVirtRegSet &) const {
+ int64_t Ret = Runner->evaluate<int64_t>();
+ assert(Ret >= 0);
+ assert(Ret <= CandidateVirtRegPos);
+ return Ret;
+}
+
+bool MLEvictAdvisor::loadInterferenceFeatures(
+ const LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint,
+ const SmallVirtRegSet &FixedRegisters,
+ llvm::SmallVectorImpl<float> &Largest, size_t Pos,
+ llvm::SmallVectorImpl<LRStartEndInfo> &LRPosInfo) const {
+ // It is only possible to evict virtual register interference.
+ if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) {
+ // leave unavailable
+ return false;
+ }
+
+ const bool IsLocal = LIS->intervalIsInOneMBB(VirtReg);
+ int64_t LocalIntfs = 0;
+ float NrUrgent = 0.0f;
+
+ // The cascade tracking is the same as in the default advisor
+ unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg());
+
+ SmallVector<const LiveInterval *, MaxInterferences> InterferingIntervals;
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit);
+ // Different from the default heuristic, we don't make any assumptions
+ // about what exceeding the interference cutoff in the query may mean.
+ const auto &IFIntervals = Q.interferingVRegs(EvictInterferenceCutoff);
+ if (IFIntervals.empty() && InterferingIntervals.empty())
+ continue;
+ if (IFIntervals.size() >= EvictInterferenceCutoff)
+ return false;
+ InterferingIntervals.append(IFIntervals.begin(), IFIntervals.end());
+ for (const LiveInterval *Intf : reverse(IFIntervals)) {
+ assert(Intf->reg().isVirtual() &&
+ "Only expecting virtual register interference from query");
+ // This is the same set of legality checks as in the default case: don't
+ // try to evict fixed regs or 'done' ones. Also don't break cascades,
+ // except in the urgent case, with the same nuances used in the default
+ // heuristic.
+ // We could try sharing this between the advisors, but it may end up
+ // more complex than it is right now.
+ if (FixedRegisters.count(Intf->reg()))
+ return false;
+ if (RA.getExtraInfo().getStage(*Intf) == RS_Done)
+ return false;
+ bool Urgent =
+ !VirtReg.isSpillable() &&
+ (Intf->isSpillable() ||
+ RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg())) <
+ RegClassInfo.getNumAllocatableRegs(
+ MRI->getRegClass(Intf->reg())));
+ // Only evict older cascades or live ranges without a cascade.
+ unsigned IntfCascade = RA.getExtraInfo().getCascade(Intf->reg());
+ if (Cascade <= IntfCascade) {
+ if (!Urgent)
+ return false;
+ ++NrUrgent;
+ }
+
+ LocalIntfs += (IsLocal && LIS->intervalIsInOneMBB(*Intf) &&
+ (!EnableLocalReassign || !canReassign(*Intf, PhysReg)));
+ }
+ }
+ // OK, so if we made it this far, this LR is an eviction candidate, load its
+ // features.
+ extractFeatures(InterferingIntervals, Largest, Pos, IsHint, LocalIntfs,
+ NrUrgent, LRPosInfo);
+ return true;
+}
+
+MCRegister MLEvictAdvisor::tryFindEvictionCandidate(
+ const LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const {
+ auto MaybeOrderLimit = getOrderLimit(VirtReg, Order, CostPerUseLimit);
+ if (!MaybeOrderLimit)
+ return MCRegister::NoRegister;
+ unsigned OrderLimit = *MaybeOrderLimit;
+
+ // The heuristic sets initial costs such that, if CostPerUseLimit is
+ // max<uint8_t>, then any of the costs of the legally-evictable intervals
+ // would be lower. When that happens, one of those will be selected.
+ // Therefore, we allow the candidate to be selected, unless the candidate is
+ // unspillable, in which case it would be incorrect not to find a register
+ // for it.
+ const bool MustFindEviction =
+ (!VirtReg.isSpillable() && CostPerUseLimit == static_cast<uint8_t>(~0u));
+ // Number of available candidates - if 0, no need to continue.
+ size_t Available = 0;
+ // Make sure we don't have leftover partial state from an attempt where we
+ // had no available candidates and bailed out early.
+ resetInputs(*Runner);
+
+ // Track the index->register mapping because AllocationOrder doesn't do that
+ // and we'd have to scan it.
+ // Also track their mask, to write asserts/debug.
+ CandidateRegList Regs;
+ Regs.fill({0, false});
+
+ // Track the largest value of features seen during this eviction session. We
+ // only normalize (some of) the float features, but it's just simpler to
+ // dimension 'Largest' to all the features, especially since we have the
+ // 'DoNotNormalize' list.
+ FeaturesListNormalizer Largest(FeatureIDs::FeatureCount, 0.0);
+
+ // Same overall idea as in the default eviction policy - we visit the values
+ // of AllocationOrder one at a time. If a value is not legally available, we
+ // mask off the corresponding feature column (== do nothing, because we
+ // already reset all the features to 0). Use Pos to capture the column we
+ // load features at - in AllocationOrder order.
+ size_t Pos = 0;
+ SmallVector<LRStartEndInfo, NumberOfInterferences> LRPosInfo;
+ for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); I != E;
+ ++I, ++Pos) {
+ MCRegister PhysReg = *I;
+ assert(!Regs[Pos].second);
+ assert(PhysReg);
+ if (!canAllocatePhysReg(CostPerUseLimit, PhysReg)) {
+ continue;
+ }
+ if (loadInterferenceFeatures(VirtReg, PhysReg, I.isHint(), FixedRegisters,
+ Largest, Pos, LRPosInfo)) {
+ ++Available;
+ Regs[Pos] = std::make_pair(PhysReg, true);
+ }
+ }
+ if (Available == 0) {
+ // Nothing to decide, nothing to learn.
+ assert(!MustFindEviction);
+ return MCRegister::NoRegister;
+ }
+ const size_t ValidPosLimit = Pos;
+ // If we must find eviction, the candidate should be masked out of the
+ // decision making process.
+ Regs[CandidateVirtRegPos].second = !MustFindEviction;
+ if (!MustFindEviction)
+ extractFeatures(SmallVector<const LiveInterval *, 1>(1, &VirtReg), Largest,
+ CandidateVirtRegPos, /*IsHint*/ 0,
+ /*LocalIntfsCount*/ 0,
+ /*NrUrgent*/ 0.0, LRPosInfo);
+ assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had "
+ "nothing to allocate initially.");
+#ifdef LLVM_HAVE_TFLITE
+ if (EnableDevelopmentFeatures) {
+ extractInstructionFeatures(
+ LRPosInfo, Runner,
+ [this](SlotIndex InputIndex) -> int {
+ auto *CurrentMachineInstruction =
+ LIS->getInstructionFromIndex(InputIndex);
+ if (!CurrentMachineInstruction) {
+ return -1;
+ }
+ return CurrentMachineInstruction->getOpcode();
+ },
+ [this](SlotIndex InputIndex) -> float {
+ auto *CurrentMachineInstruction =
+ LIS->getInstructionFromIndex(InputIndex);
+ return MBFI.getBlockFreqRelativeToEntryBlock(
+ CurrentMachineInstruction->getParent());
+ },
+ [this](SlotIndex InputIndex) -> MachineBasicBlock * {
+ auto *CurrentMachineInstruction =
+ LIS->getInstructionFromIndex(InputIndex);
+ return CurrentMachineInstruction->getParent();
+ },
+ FeatureIDs::instructions, FeatureIDs::instructions_mapping,
+ FeatureIDs::mbb_frequencies, FeatureIDs::mbb_mapping,
+ LIS->getSlotIndexes()->getLastIndex());
+ }
+#endif // #ifdef LLVM_HAVE_TFLITE
+ // Normalize the features.
+ for (auto &V : Largest)
+ V = V ? V : 1.0;
+ for (size_t FeatureIndex = 0; FeatureIndex < FeatureIDs::FeatureCount;
+ ++FeatureIndex) {
+ if (DoNotNormalize.test(FeatureIndex))
+ continue;
+ for (size_t Pos = 0; Pos < NumberOfInterferences; ++Pos) {
+ Runner->getTensor<float>(FeatureIndex)[Pos] /= Largest[FeatureIndex];
+ }
+ }
+ *Runner->getTensor<float>(FeatureIDs::progress) =
+ static_cast<float>(RA.getQueueSize()) / InitialQSize;
+
+ // Get a decision.
+ size_t CandidatePos = tryFindEvictionCandidatePosition(
+ VirtReg, Order, OrderLimit, CostPerUseLimit, FixedRegisters);
+ // The contract with the ML side is that CandidatePos is mask == 1 (i.e.
+ // Regs[CandidatePos].second)
+ assert(Regs[CandidatePos].second);
+ if (CandidatePos == CandidateVirtRegPos) {
+ assert(!MustFindEviction);
+ return MCRegister::NoRegister;
+ }
+ assert(CandidatePos < ValidPosLimit);
+ (void)ValidPosLimit;
+ return Regs[CandidatePos].first;
+}
+
+const LIFeatureComponents &
+MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const {
+ RegID ID = LI.reg().id();
+ LIFeatureComponents Empty;
+ auto I = CachedFeatures.insert(std::make_pair(ID, Empty));
+ LIFeatureComponents &Ret = I.first->getSecond();
+ if (!I.second)
+ return Ret;
+
+ SmallPtrSet<MachineInstr *, 8> Visited;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ for (MachineRegisterInfo::reg_instr_nodbg_iterator
+ I = MRI->reg_instr_nodbg_begin(LI.reg()),
+ E = MRI->reg_instr_nodbg_end();
+ I != E;) {
+ MachineInstr *MI = &*(I++);
+
+ ++Ret.NrDefsAndUses;
+ if (!Visited.insert(MI).second)
+ continue;
+
+ if (MI->isIdentityCopy() || MI->isImplicitDef())
+ continue;
+
+ bool Reads, Writes;
+ std::tie(Reads, Writes) = MI->readsWritesVirtualRegister(LI.reg());
+
+ float Freq = MBFI.getBlockFreqRelativeToEntryBlock(MI->getParent());
+ Ret.HottestBlockFreq = std::max(Freq, Ret.HottestBlockFreq);
+
+ Ret.R += (Reads && !Writes) * Freq;
+ Ret.W += (!Reads && Writes) * Freq;
+ Ret.RW += (Reads && Writes) * Freq;
+
+ auto *MBB = MI->getParent();
+ auto *Loop = Loops.getLoopFor(MBB);
+ bool IsExiting = Loop ? Loop->isLoopExiting(MBB) : false;
+
+ if (Writes && IsExiting && LIS->isLiveOutOfMBB(LI, MBB))
+ Ret.IndVarUpdates += Freq;
+
+ if (MI->isCopy() && VirtRegAuxInfo::copyHint(MI, LI.reg(), TRI, *MRI))
+ Ret.HintWeights += Freq;
+ }
+ Ret.IsRemat = VirtRegAuxInfo::isRematerializable(
+ LI, *LIS, *VRM, *MF.getSubtarget().getInstrInfo());
+ return Ret;
+}
+
+// Overall, this currently mimics what we do for weight calculation, but instead
+// of accumulating the various features, we keep them separate.
+void MLEvictAdvisor::extractFeatures(
+ const SmallVectorImpl<const LiveInterval *> &Intervals,
+ llvm::SmallVectorImpl<float> &Largest, size_t Pos, int64_t IsHint,
+ int64_t LocalIntfsCount, float NrUrgent,
+ SmallVectorImpl<LRStartEndInfo> &LRPosInfo) const {
+ int64_t NrDefsAndUses = 0;
+ int64_t NrBrokenHints = 0;
+ double R = 0.0;
+ double W = 0.0;
+ double RW = 0.0;
+ double IndVarUpdates = 0.0;
+ double HintWeights = 0.0;
+ float StartBBFreq = 0.0;
+ float EndBBFreq = 0.0;
+ float HottestBlockFreq = 0.0;
+ int32_t NrRematerializable = 0;
+ float TotalWeight = 0.0;
+
+ SlotIndex EndSI = LIS->getSlotIndexes()->getZeroIndex();
+ SlotIndex StartSI = LIS->getSlotIndexes()->getLastIndex();
+ int64_t MaxStage = 0;
+ int64_t MinStage =
+ Intervals.empty() ? 0 : std::numeric_limits<int64_t>::max();
+
+ for (const auto *L : Intervals) {
+ const LiveInterval &LI = *L;
+ MaxStage = std::max<int64_t>(
+ MaxStage, static_cast<int64_t>(RA.getExtraInfo().getStage(LI)));
+ MinStage = std::min<int64_t>(
+ MinStage, static_cast<int64_t>(RA.getExtraInfo().getStage(LI)));
+
+ TotalWeight = std::max(TotalWeight, LI.weight());
+
+ if (LI.beginIndex() < StartSI)
+ StartSI = LI.beginIndex();
+
+ if (LI.endIndex() > EndSI)
+ EndSI = LI.endIndex();
+ const LIFeatureComponents &LIFC = getLIFeatureComponents(LI);
+ NrBrokenHints += VRM->hasPreferredPhys(LI.reg());
+
+ NrDefsAndUses += LIFC.NrDefsAndUses;
+ HottestBlockFreq = std::max(HottestBlockFreq, LIFC.HottestBlockFreq);
+ R += LIFC.R;
+ W += LIFC.W;
+ RW += LIFC.RW;
+
+ IndVarUpdates += LIFC.IndVarUpdates;
+
+ HintWeights += LIFC.HintWeights;
+ NrRematerializable += LIFC.IsRemat;
+
+ if (EnableDevelopmentFeatures) {
+ for (auto CurrentSegment : LI) {
+ LRPosInfo.push_back(
+ LRStartEndInfo{CurrentSegment.start, CurrentSegment.end, Pos});
+ }
+ }
+ }
+ size_t Size = 0;
+ if (!Intervals.empty()) {
+ StartBBFreq =
+ MBFI.getBlockFreqRelativeToEntryBlock(LIS->getMBBFromIndex(StartSI));
+ if (EndSI >= LIS->getSlotIndexes()->getLastIndex())
+ EndSI = LIS->getSlotIndexes()->getLastIndex().getPrevIndex();
+ EndBBFreq =
+ MBFI.getBlockFreqRelativeToEntryBlock(LIS->getMBBFromIndex(EndSI));
+ Size = StartSI.distance(EndSI);
+ }
+ // Set the features at the column 'Pos'.
+#define SET(ID, TYPE, VAL) \
+ do { \
+ Runner->getTensor<TYPE>(FeatureIDs::ID)[Pos] = static_cast<TYPE>(VAL); \
+ if (!DoNotNormalize.test(FeatureIDs::ID)) \
+ Largest[FeatureIDs::ID] = \
+ std::max(Largest[FeatureIDs::ID], static_cast<float>(VAL)); \
+ } while (false)
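+ // For instance, SET(mask, int64_t, 1) writes 1 into column Pos of the
+ // 'mask' tensor and, because 'mask' is in DoNotNormalize, leaves Largest
+ // untouched; SET(nr_urgent, float, NrUrgent) also records the running max
+ // in Largest[nr_urgent] for later normalization.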
+ SET(mask, int64_t, 1);
+ SET(is_free, int64_t, Intervals.empty());
+ SET(nr_urgent, float, NrUrgent);
+ SET(nr_broken_hints, float, NrBrokenHints);
+ SET(is_hint, int64_t, IsHint);
+ SET(is_local, int64_t, LocalIntfsCount);
+ SET(nr_rematerializable, float, NrRematerializable);
+ SET(nr_defs_and_uses, float, NrDefsAndUses);
+ SET(weighed_reads_by_max, float, R);
+ SET(weighed_writes_by_max, float, W);
+ SET(weighed_read_writes_by_max, float, RW);
+ SET(weighed_indvars_by_max, float, IndVarUpdates);
+ SET(hint_weights_by_max, float, HintWeights);
+ SET(start_bb_freq_by_max, float, StartBBFreq);
+ SET(end_bb_freq_by_max, float, EndBBFreq);
+ SET(hottest_bb_freq_by_max, float, HottestBlockFreq);
+ SET(liverange_size, float, Size);
+ SET(use_def_density, float, TotalWeight);
+ SET(max_stage, int64_t, MaxStage);
+ SET(min_stage, int64_t, MinStage);
+#undef SET
+}
+
+void extractInstructionFeatures(
+ SmallVectorImpl<LRStartEndInfo> &LRPosInfo, MLModelRunner *RegallocRunner,
+ function_ref<int(SlotIndex)> GetOpcode,
+ function_ref<float(SlotIndex)> GetMBBFreq,
+ function_ref<MachineBasicBlock *(SlotIndex)> GetMBBReference,
+ const int InstructionsIndex, const int InstructionsMappingIndex,
+ const int MBBFreqIndex, const int MBBMappingIndex,
+ const SlotIndex LastIndex) {
+ // This function extracts instruction-based features relevant to the
+ // eviction problem currently being solved. It ends up extracting four
+ // tensors.
+ // 1 - A vector of size max instruction count. It contains the opcodes of
+ // the instructions spanned by all the intervals in the current instance of
+ // the eviction problem.
+ // 2 - A binary mapping matrix of size (LR count * max instruction count)
+ // which maps where the LRs are live to the actual opcodes for which they
+ // are live.
+ // 3 - A vector of size max supported MBB count storing MBB frequencies,
+ // encompassing all of the MBBs covered by the eviction problem.
+ // 4 - A vector of size max instruction count of indices to members of the
+ // MBB frequency vector, mapping each instruction to its associated MBB.
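+ // As a small illustration (hypothetical indices): if the LR at Pos 2 is
+ // live over instructions 5..7 of the problem, entries
+ // [2 * ModelMaxSupportedInstructionCount + 5 .. + 7] of the mapping matrix
+ // are set to 1, and instructions[5..7] hold those instructions' opcodes
+ // (or 0 if an opcode exceeds OpcodeValueCutoff).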
+
+ // Start off by sorting the segments based on the beginning slot index.
+ std::sort(
+ LRPosInfo.begin(), LRPosInfo.end(),
+ [](LRStartEndInfo A, LRStartEndInfo B) { return A.Begin < B.Begin; });
+ size_t InstructionIndex = 0;
+ size_t CurrentSegmentIndex = 0;
+ SlotIndex CurrentIndex = LRPosInfo[0].Begin;
+ std::map<MachineBasicBlock *, size_t> VisitedMBBs;
+ size_t CurrentMBBIndex = 0;
+ // This loop processes all the segments sequentially by starting at the
+ // beginning slot index of the first segment, iterating through all the slot
+ // indices before the end slot index of that segment (while checking for
+ // overlaps with segments that start at greater slot indices). After hitting
+ // that end index, the current segment being processed gets bumped; this
+ // continues until all segments are processed or the max instruction count
+ // is hit, at which point everything is just truncated.
+ while (true) {
+ // If the index that we are currently at is within the current segment and
+ // we haven't hit the max instruction count, continue processing the current
+ // segment.
+ while (CurrentIndex <= LRPosInfo[CurrentSegmentIndex].End &&
+ InstructionIndex < ModelMaxSupportedInstructionCount) {
+ int CurrentOpcode = GetOpcode(CurrentIndex);
+ // If the current machine instruction is null, skip it
+ if (CurrentOpcode == -1) {
+ // If we're currently at the last index in the SlotIndex analysis,
+ // we can't go any further, so return from the function
+ if (CurrentIndex >= LastIndex) {
+ return;
+ }
+ CurrentIndex = CurrentIndex.getNextIndex();
+ continue;
+ }
+ MachineBasicBlock *CurrentMBBReference = GetMBBReference(CurrentIndex);
+ if (VisitedMBBs.count(CurrentMBBReference) == 0) {
+ VisitedMBBs[CurrentMBBReference] = CurrentMBBIndex;
+ ++CurrentMBBIndex;
+ }
+ extractMBBFrequency(CurrentIndex, InstructionIndex, VisitedMBBs,
+ GetMBBFreq, CurrentMBBReference, RegallocRunner,
+ MBBFreqIndex, MBBMappingIndex);
+ // Current code assumes we're not going to get any disjointed segments
+ assert(LRPosInfo[CurrentSegmentIndex].Begin <= CurrentIndex);
+ RegallocRunner->getTensor<int64_t>(InstructionsIndex)[InstructionIndex] =
+ CurrentOpcode < OpcodeValueCutoff ? CurrentOpcode : 0;
+ // set value in the binary mapping matrix for the current instruction
+ auto CurrentSegmentPosition = LRPosInfo[CurrentSegmentIndex].Pos;
+ RegallocRunner->getTensor<int64_t>(
+ InstructionsMappingIndex)[CurrentSegmentPosition *
+ ModelMaxSupportedInstructionCount +
+ InstructionIndex] = 1;
+ // All of the segments are sorted based on the beginning slot index, but
+ // this doesn't mean that the beginning slot index of the next segment is
+ // after the end segment of the one being currently processed. This while
+ // loop checks for overlapping segments and modifies the portion of the
+ // column in the mapping matrix for the currently processed instruction
+ // for the LR it is checking. Also make sure that the beginning of the
+ // current segment we're checking for overlap in is less than the current
+ // index, otherwise we're done checking overlaps.
+ size_t OverlapCheckCurrentSegment = CurrentSegmentIndex + 1;
+ while (OverlapCheckCurrentSegment < LRPosInfo.size() &&
+ LRPosInfo[OverlapCheckCurrentSegment].Begin <= CurrentIndex) {
+ auto OverlapCurrentSegmentPosition =
+ LRPosInfo[OverlapCheckCurrentSegment].Pos;
+ if (LRPosInfo[OverlapCheckCurrentSegment].End >= CurrentIndex) {
+ RegallocRunner->getTensor<int64_t>(
+ InstructionsMappingIndex)[OverlapCurrentSegmentPosition *
+ ModelMaxSupportedInstructionCount +
+ InstructionIndex] = 1;
+ }
+ ++OverlapCheckCurrentSegment;
+ }
+ ++InstructionIndex;
+ if (CurrentIndex >= LastIndex) {
+ return;
+ }
+ CurrentIndex = CurrentIndex.getNextIndex();
+ }
+ // if we've just finished processing through the last segment or if we've
+ // hit the maximum number of instructions, break out of the loop.
+ if (CurrentSegmentIndex == LRPosInfo.size() - 1 ||
+ InstructionIndex >= ModelMaxSupportedInstructionCount) {
+ break;
+ }
+ // If the segments are not overlapping, we need to move to the beginning
+ // index of the next segment to avoid having instructions not attached to
+ // any register.
+ if (LRPosInfo[CurrentSegmentIndex + 1].Begin >
+ LRPosInfo[CurrentSegmentIndex].End) {
+ CurrentIndex = LRPosInfo[CurrentSegmentIndex + 1].Begin;
+ }
+ ++CurrentSegmentIndex;
+ }
+}
+
+void extractMBBFrequency(const SlotIndex CurrentIndex,
+ const size_t CurrentInstructionIndex,
+ std::map<MachineBasicBlock *, size_t> &VisitedMBBs,
+ function_ref<float(SlotIndex)> GetMBBFreq,
+ MachineBasicBlock *CurrentMBBReference,
+ MLModelRunner *RegallocRunner, const int MBBFreqIndex,
+ const int MBBMappingIndex) {
+ size_t CurrentMBBIndex = VisitedMBBs[CurrentMBBReference];
+ float CurrentMBBFreq = GetMBBFreq(CurrentIndex);
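+ // E.g. (hypothetical): if this MBB is the 4th distinct block encountered
+ // (index 3) and fits within ModelMaxSupportedMBBCount, its frequency is
+ // written to mbb_frequencies[3] and mbb_mapping[CurrentInstructionIndex] is
+ // set to 3.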
+ if (CurrentMBBIndex < ModelMaxSupportedMBBCount) {
+ RegallocRunner->getTensor<float>(MBBFreqIndex)[CurrentMBBIndex] =
+ CurrentMBBFreq;
+ RegallocRunner->getTensor<int64_t>(
+ MBBMappingIndex)[CurrentInstructionIndex] = CurrentMBBIndex;
+ }
+}
+
+// Development mode-specific implementations
+#ifdef LLVM_HAVE_TFLITE
+
+RegAllocEvictionAdvisorAnalysis *llvm::createDevelopmentModeAdvisor() {
+ return new DevelopmentModeEvictionAdvisorAnalysis();
+}
+
+int64_t DevelopmentModeEvictAdvisor::tryFindEvictionCandidatePosition(
+ const LiveInterval &VirtReg, const AllocationOrder &Order,
+ unsigned OrderLimit, uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) const {
+ int64_t Ret = 0;
+ if (isa<ModelUnderTrainingRunner>(getRunner())) {
+ Ret = MLEvictAdvisor::tryFindEvictionCandidatePosition(
+ VirtReg, Order, OrderLimit, CostPerUseLimit, FixedRegisters);
+ } else {
+ MCRegister PhysReg = getDefaultAdvisor().tryFindEvictionCandidate(
+ VirtReg, Order, CostPerUseLimit, FixedRegisters);
+ // Find the index of the selected PhysReg. We need it for logging;
+ // otherwise this is wasted cycles (but so would be starting development
+ // mode without a model or logging).
+ if (!PhysReg)
+ Ret = CandidateVirtRegPos;
+ else
+ for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit);
+ I != E; ++I, ++Ret)
+ if (*I == PhysReg)
+ break;
+ }
+ if (TrainingLog.empty())
+ return Ret;
+ // TODO(mtrofin): when we support optional rewards, this can go away. In the
+ // meantime, we log the "pretend" reward (0) for the previous observation
+ // before starting a new one.
+ if (Log->hasObservationInProgress())
+ Log->logReward<float>(0.0);
+
+ Log->startObservation();
+ size_t CurrentFeature = 0;
+ size_t FeatureCount = EnableDevelopmentFeatures
+ ? FeatureIDs::FeaturesWithDevelopmentCount
+ : FeatureIDs::FeatureCount;
+ for (; CurrentFeature < FeatureCount; ++CurrentFeature) {
+ Log->logTensorValue(CurrentFeature,
+ reinterpret_cast<const char *>(
+ getRunner().getTensorUntyped(CurrentFeature)));
+ }
+ if (auto *MUTR = dyn_cast<ModelUnderTrainingRunner>(&getRunner()))
+ for (size_t I = 0; I < MUTR->extraOutputsForLoggingSpecs().size();
+ ++I, ++CurrentFeature)
+ Log->logTensorValue(
+ CurrentFeature,
+ reinterpret_cast<const char *>(MUTR->getUntypedExtraOutputValue(I)));
+ // The output is right after the features and the extra outputs
+ Log->logTensorValue(CurrentFeature, reinterpret_cast<const char *>(&Ret));
+ Log->endObservation();
+ return Ret;
+}
+
+bool RegAllocScoring::runOnMachineFunction(MachineFunction &MF) {
+ std::optional<float> CachedReward;
+ auto GetReward = [&]() {
+ if (!CachedReward)
+ CachedReward = static_cast<float>(
+ calculateRegAllocScore(MF, getAnalysis<MachineBlockFrequencyInfo>())
+ .getScore());
+ return *CachedReward;
+ };
+
+ getAnalysis<RegAllocEvictionAdvisorAnalysis>().logRewardIfNeeded(MF,
+ GetReward);
+ getAnalysis<RegAllocPriorityAdvisorAnalysis>().logRewardIfNeeded(MF,
+ GetReward);
+ return false;
+}
+#endif // #ifdef LLVM_HAVE_TFLITE
+
+RegAllocEvictionAdvisorAnalysis *llvm::createReleaseModeAdvisor() {
+ return llvm::isEmbeddedModelEvaluatorValid<CompiledModelType>() ||
+ !InteractiveChannelBaseName.empty()
+ ? new ReleaseModeEvictionAdvisorAnalysis()
+ : nullptr;
+}
+
+// In all cases except development mode, we don't need scoring.
+#if !defined(LLVM_HAVE_TFLITE)
+bool RegAllocScoring::runOnMachineFunction(MachineFunction &) { return false; }
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocEvictAdvisor.h b/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocEvictAdvisor.h
new file mode 100644
index 000000000000..e36a41154096
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocEvictAdvisor.h
@@ -0,0 +1,93 @@
+//===- MLRegAllocEvictAdvisor.cpp - ML eviction advisor -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Function declarations of utilities related to feature extraction for unit
+// testing.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MLREGALLOCEVICTIONADVISOR_H
+#define LLVM_CODEGEN_MLREGALLOCEVICTIONADVISOR_H
+
+#include "llvm/Analysis/MLModelRunner.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+using namespace llvm;
+
+// LRStartEndInfo contains the start and end of a specific live range as
+// slot indices, as well as the index of the physical register it is
+// assigned to (or 1 above the phys reg count if it's the candidate).
+// Used when extracting per-instruction features in the context of a
+// specific eviction problem.
+struct LRStartEndInfo {
+ SlotIndex Begin;
+ SlotIndex End;
+ size_t Pos = 0;
+};
+
+void extractInstructionFeatures(
+ llvm::SmallVectorImpl<LRStartEndInfo> &LRPosInfo,
+ MLModelRunner *RegallocRunner, function_ref<int(SlotIndex)> GetOpcode,
+ function_ref<float(SlotIndex)> GetMBBFreq,
+ function_ref<MachineBasicBlock *(SlotIndex)> GetMBBReference,
+ const int InstructionsIndex, const int InstructionsMappingIndex,
+ const int MBBFreqIndex, const int MBBMappingIndex,
+ const SlotIndex LastIndex);
+
+void extractMBBFrequency(const SlotIndex CurrentIndex,
+ const size_t CurrentInstructionIndex,
+ std::map<MachineBasicBlock *, size_t> &VisitedMBBs,
+ function_ref<float(SlotIndex)> GetMBBFreq,
+ MachineBasicBlock *CurrentMBBReference,
+ MLModelRunner *RegallocRunner, const int MBBFreqIndex,
+ const int MBBMappingIndex);
+
+// This is the maximum number of interfering ranges. That's the number of
+// distinct AllocationOrder values, which comes from MCRegisterClass::RegsSize.
+// For X86, that's 32.
+// TODO: find a way to get this, statically, in a programmatic way.
+static const int64_t MaxInterferences = 32;
+
+// Logically, we can think of the feature set given to the evaluator as a 2D
+// matrix. The rows are the features (see next). The columns correspond to the
+// interferences. We treat the candidate virt reg as an 'interference', too, as
+// its feature set is the same as that of the interfering ranges. So we'll have
+// MaxInterferences + 1 columns and by convention, we will use the last column
+// for the virt reg seeking allocation.
+static const int64_t CandidateVirtRegPos = MaxInterferences;
+static const int64_t NumberOfInterferences = CandidateVirtRegPos + 1;
+
+// The number of instructions that a specific live range might have is variable,
+// but we're passing in a single matrix of instructions, and TensorFlow saved
+// models only support a fixed input size, so we have to cap the number of
+// instructions that can be passed along. The specific value was derived from
+// experimentation such that the majority of eviction problems would be
+// completely covered.
+static const int ModelMaxSupportedInstructionCount = 300;
+
+// When extracting per-instruction features, the advisor will currently create
+// a vector of size ModelMaxSupportedInstructionCount to hold the opcodes of the
+// instructions relevant to the eviction problem, and a NumberOfInterferences *
+// ModelMaxSupportedInstructionCount matrix that maps LRs to the instructions
+// that they span.
+static const std::vector<int64_t> InstructionsShape{
+ 1, ModelMaxSupportedInstructionCount};
+static const std::vector<int64_t> InstructionsMappingShape{
+ 1, NumberOfInterferences, ModelMaxSupportedInstructionCount};
+
+// When extracting mappings between MBBs and individual instructions, we create
+// a vector of MBB frequencies, currently of size 100, which was a value
+// determined through experimentation to encompass the vast majority of eviction
+// problems. The actual mapping is the same shape as the instruction opcodes
+// vector.
+static const int64_t ModelMaxSupportedMBBCount = 100;
+static const std::vector<int64_t> MBBFrequencyShape{1,
+ ModelMaxSupportedMBBCount};
+
+#endif // LLVM_CODEGEN_MLREGALLOCEVICTIONADVISOR_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocPriorityAdvisor.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocPriorityAdvisor.cpp
new file mode 100644
index 000000000000..422781593a9c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MLRegallocPriorityAdvisor.cpp
@@ -0,0 +1,357 @@
+//===- MLRegAllocPriorityAdvisor.cpp - ML priority advisor-----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the ML priority advisor and reward injection pass
+//
+//===----------------------------------------------------------------------===//
+
+#include "AllocationOrder.h"
+#include "RegAllocGreedy.h"
+#include "RegAllocPriorityAdvisor.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/InteractiveModelRunner.h"
+#include "llvm/Analysis/MLModelRunner.h"
+#include "llvm/Analysis/ReleaseModeModelRunner.h"
+#include "llvm/Analysis/TensorSpec.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+
+#if defined(LLVM_HAVE_TFLITE)
+#include "llvm/Analysis/ModelUnderTrainingRunner.h"
+#include "llvm/Analysis/NoInferenceModelRunner.h"
+#include "llvm/Analysis/Utils/TrainingLogger.h"
+#endif
+
+using namespace llvm;
+
+static cl::opt<std::string> InteractiveChannelBaseName(
+ "regalloc-priority-interactive-channel-base", cl::Hidden,
+ cl::desc(
+ "Base file path for the interactive mode. The incoming filename should "
+ "have the name <regalloc-priority-interactive-channel-base>.in, while "
+ "the outgoing name should be "
+ "<regalloc-priority-interactive-channel-base>.out"));
+
+using CompiledModelType = NoopSavedModelImpl;
+
+// Options that only make sense in development mode
+#ifdef LLVM_HAVE_TFLITE
+#include "RegAllocScore.h"
+#include "llvm/Analysis/Utils/TFUtils.h"
+
+static cl::opt<std::string> TrainingLog(
+ "regalloc-priority-training-log", cl::Hidden,
+ cl::desc("Training log for the register allocator priority model"));
+
+static cl::opt<std::string> ModelUnderTraining(
+ "regalloc-priority-model", cl::Hidden,
+ cl::desc("The model being trained for register allocation priority"));
+
+#endif // #ifdef LLVM_HAVE_TFLITE
+
+namespace llvm {
+
+static const std::vector<int64_t> PerLiveRangeShape{1};
+
+#define RA_PRIORITY_FEATURES_LIST(M) \
+ M(int64_t, li_size, PerLiveRangeShape, "size") \
+ M(int64_t, stage, PerLiveRangeShape, "stage") \
+ M(float, weight, PerLiveRangeShape, "weight")
+
+#define DecisionName "priority"
+static const TensorSpec DecisionSpec =
+ TensorSpec::createSpec<float>(DecisionName, {1});
+
+// Named features index.
+enum FeatureIDs {
+#define _FEATURE_IDX(_, name, __, ___) name,
+ RA_PRIORITY_FEATURES_LIST(_FEATURE_IDX)
+#undef _FEATURE_IDX
+ FeatureCount
+};
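+
+// With the expansion above, li_size == 0, stage == 1, weight == 2, and
+// FeatureCount == 3; the same list also produces InputFeatures below via
+// _DECL_FEATURES.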
+
+class MLPriorityAdvisor : public RegAllocPriorityAdvisor {
+public:
+ MLPriorityAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ SlotIndexes *const Indexes, MLModelRunner *Runner);
+
+protected:
+ const RegAllocPriorityAdvisor &getDefaultAdvisor() const {
+ return static_cast<const RegAllocPriorityAdvisor &>(DefaultAdvisor);
+ }
+
+ // The assumption is that if the Runner could not be constructed, we already
+ // emitted an error, and we shouldn't be asking for it here.
+ const MLModelRunner &getRunner() const { return *Runner; }
+ float getPriorityImpl(const LiveInterval &LI) const;
+ unsigned getPriority(const LiveInterval &LI) const override;
+
+private:
+ const DefaultPriorityAdvisor DefaultAdvisor;
+ MLModelRunner *const Runner;
+};
+
+#define _DECL_FEATURES(type, name, shape, _) \
+ TensorSpec::createSpec<type>(#name, shape),
+
+static const std::vector<TensorSpec> InputFeatures{
+ {RA_PRIORITY_FEATURES_LIST(_DECL_FEATURES)},
+};
+#undef _DECL_FEATURES
+
+// ===================================
+// Release (AOT) - specifics
+// ===================================
+class ReleaseModePriorityAdvisorAnalysis final
+ : public RegAllocPriorityAdvisorAnalysis {
+public:
+ ReleaseModePriorityAdvisorAnalysis()
+ : RegAllocPriorityAdvisorAnalysis(AdvisorMode::Release) {}
+ // support for isa<> and dyn_cast.
+ static bool classof(const RegAllocPriorityAdvisorAnalysis *R) {
+ return R->getAdvisorMode() == AdvisorMode::Release;
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<SlotIndexes>();
+ RegAllocPriorityAdvisorAnalysis::getAnalysisUsage(AU);
+ }
+
+ std::unique_ptr<RegAllocPriorityAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
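+ // The model runner is created lazily on the first request and then reused
+ // for every subsequent function handled by this pass instance.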
+ if (!Runner) {
+ if (InteractiveChannelBaseName.empty())
+ Runner = std::make_unique<ReleaseModeModelRunner<CompiledModelType>>(
+ MF.getFunction().getContext(), InputFeatures, DecisionName);
+ else
+ Runner = std::make_unique<InteractiveModelRunner>(
+ MF.getFunction().getContext(), InputFeatures, DecisionSpec,
+ InteractiveChannelBaseName + ".out",
+ InteractiveChannelBaseName + ".in");
+ }
+ return std::make_unique<MLPriorityAdvisor>(
+ MF, RA, &getAnalysis<SlotIndexes>(), Runner.get());
+ }
+ std::unique_ptr<MLModelRunner> Runner;
+};
+
+// ===================================
+// Development mode-specifics
+// ===================================
+//
+// Features we log
+#ifdef LLVM_HAVE_TFLITE
+static const TensorSpec Reward = TensorSpec::createSpec<float>("reward", {1});
+
+#define _DECL_TRAIN_FEATURES(type, name, shape, _) \
+ TensorSpec::createSpec<type>(std::string("action_") + #name, shape),
+
+static const std::vector<TensorSpec> TrainingInputFeatures{
+ {RA_PRIORITY_FEATURES_LIST(_DECL_TRAIN_FEATURES)
+ TensorSpec::createSpec<float>("action_discount", {1}),
+ TensorSpec::createSpec<int32_t>("action_step_type", {1}),
+ TensorSpec::createSpec<float>("action_reward", {1})}};
+#undef _DECL_TRAIN_FEATURES
+
+class DevelopmentModePriorityAdvisor : public MLPriorityAdvisor {
+public:
+ DevelopmentModePriorityAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ SlotIndexes *const Indexes,
+ MLModelRunner *Runner, Logger *Log)
+ : MLPriorityAdvisor(MF, RA, Indexes, Runner), Log(Log) {}
+
+private:
+ unsigned getPriority(const LiveInterval &LI) const override;
+ Logger *const Log;
+};
+
+class DevelopmentModePriorityAdvisorAnalysis final
+ : public RegAllocPriorityAdvisorAnalysis {
+public:
+ DevelopmentModePriorityAdvisorAnalysis()
+ : RegAllocPriorityAdvisorAnalysis(AdvisorMode::Development) {}
+ // support for isa<> and dyn_cast.
+ static bool classof(const RegAllocPriorityAdvisorAnalysis *R) {
+ return R->getAdvisorMode() == AdvisorMode::Development;
+ }
+
+ void logRewardIfNeeded(const MachineFunction &MF,
+ llvm::function_ref<float()> GetReward) override {
+ if (!Log || !Log->hasAnyObservationForContext(MF.getName()))
+ return;
+ // The function pass manager runs all the function passes for a function
+ // before moving on, so we assume the last context belongs to this function.
+ // If this invariant ever changes, we can implement context switching at
+ // that time; until then, a changed context is an error.
+ if (Log->currentContext() != MF.getName()) {
+ MF.getFunction().getContext().emitError(
+ "The training log context shouldn't have had changed.");
+ }
+ if (Log->hasObservationInProgress())
+ Log->logReward<float>(GetReward());
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<SlotIndexes>();
+ RegAllocPriorityAdvisorAnalysis::getAnalysisUsage(AU);
+ }
+
+ // Save all the logs (when requested).
+ bool doInitialization(Module &M) override {
+ LLVMContext &Ctx = M.getContext();
+ if (ModelUnderTraining.empty() && TrainingLog.empty()) {
+ Ctx.emitError("Regalloc development mode should be requested with at "
+ "least logging enabled and/or a training model");
+ return false;
+ }
+ if (ModelUnderTraining.empty())
+ Runner = std::make_unique<NoInferenceModelRunner>(Ctx, InputFeatures);
+ else
+ Runner = ModelUnderTrainingRunner::createAndEnsureValid(
+ Ctx, ModelUnderTraining, DecisionName, TrainingInputFeatures);
+ if (!Runner) {
+ Ctx.emitError("Regalloc: could not set up the model runner");
+ return false;
+ }
+ if (TrainingLog.empty())
+ return false;
+ std::error_code EC;
+ auto OS = std::make_unique<raw_fd_ostream>(TrainingLog, EC);
+ if (EC) {
+ M.getContext().emitError(EC.message() + ":" + TrainingLog);
+ return false;
+ }
+ std::vector<TensorSpec> LFS = InputFeatures;
+ if (auto *MUTR = dyn_cast<ModelUnderTrainingRunner>(Runner.get()))
+ append_range(LFS, MUTR->extraOutputsForLoggingSpecs());
+ // We always log the output; in particular, if we're not evaluating, we
+ // don't have an output spec json file. That's why we handle the
+ // 'normal' output separately.
+ LFS.push_back(DecisionSpec);
+
+ Log = std::make_unique<Logger>(std::move(OS), LFS, Reward,
+ /*IncludeReward*/ true);
+ return false;
+ }
+
+ std::unique_ptr<RegAllocPriorityAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+ if (!Runner)
+ return nullptr;
+ if (Log) {
+ Log->switchContext(MF.getName());
+ }
+
+ return std::make_unique<DevelopmentModePriorityAdvisor>(
+ MF, RA, &getAnalysis<SlotIndexes>(), Runner.get(), Log.get());
+ }
+
+ std::unique_ptr<MLModelRunner> Runner;
+ std::unique_ptr<Logger> Log;
+};
+#endif //#ifdef LLVM_HAVE_TFLITE
+
+} // namespace llvm
+
+RegAllocPriorityAdvisorAnalysis *llvm::createReleaseModePriorityAdvisor() {
+ return llvm::isEmbeddedModelEvaluatorValid<CompiledModelType>() ||
+ !InteractiveChannelBaseName.empty()
+ ? new ReleaseModePriorityAdvisorAnalysis()
+ : nullptr;
+}
+
+MLPriorityAdvisor::MLPriorityAdvisor(const MachineFunction &MF,
+ const RAGreedy &RA,
+ SlotIndexes *const Indexes,
+ MLModelRunner *Runner)
+ : RegAllocPriorityAdvisor(MF, RA, Indexes), DefaultAdvisor(MF, RA, Indexes),
+ Runner(std::move(Runner)) {
+ assert(this->Runner);
+ Runner->switchContext(MF.getName());
+}
+
+float MLPriorityAdvisor::getPriorityImpl(const LiveInterval &LI) const {
+ const unsigned Size = LI.getSize();
+ LiveRangeStage Stage = RA.getExtraInfo().getStage(LI);
+
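+ // Tensor slots 0, 1 and 2 line up with FeatureIDs::li_size, stage and
+ // weight declared from RA_PRIORITY_FEATURES_LIST above.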
+ *Runner->getTensor<int64_t>(0) = static_cast<int64_t>(Size);
+ *Runner->getTensor<int64_t>(1) = static_cast<int64_t>(Stage);
+ *Runner->getTensor<float>(2) = static_cast<float>(LI.weight());
+
+ return Runner->evaluate<float>();
+}
+
+unsigned MLPriorityAdvisor::getPriority(const LiveInterval &LI) const {
+ return static_cast<unsigned>(getPriorityImpl(LI));
+}
+
+#ifdef LLVM_HAVE_TFLITE
+RegAllocPriorityAdvisorAnalysis *llvm::createDevelopmentModePriorityAdvisor() {
+ return new DevelopmentModePriorityAdvisorAnalysis();
+}
+
+unsigned
+DevelopmentModePriorityAdvisor::getPriority(const LiveInterval &LI) const {
+ double Prio = 0;
+
+ if (isa<ModelUnderTrainingRunner>(getRunner())) {
+ Prio = MLPriorityAdvisor::getPriorityImpl(LI);
+ } else {
+ Prio = getDefaultAdvisor().getPriority(LI);
+ }
+
+ if (TrainingLog.empty())
+ return Prio;
+
+ // TODO(mtrofin): when we support optional rewards, this can go away. In the
+ // meantime, we log the "pretend" reward (0) for the previous observation
+ // before starting a new one.
+ if (Log->hasObservationInProgress())
+ Log->logReward<float>(0.0);
+
+ Log->startObservation();
+ size_t CurrentFeature = 0;
+ for (; CurrentFeature < InputFeatures.size(); ++CurrentFeature) {
+ Log->logTensorValue(CurrentFeature,
+ reinterpret_cast<const char *>(
+ getRunner().getTensorUntyped(CurrentFeature)));
+ }
+
+ if (auto *MUTR = dyn_cast<ModelUnderTrainingRunner>(&getRunner())) {
+ for (size_t I = 0; I < MUTR->extraOutputsForLoggingSpecs().size();
+ ++I, ++CurrentFeature)
+ Log->logTensorValue(
+ CurrentFeature,
+ reinterpret_cast<const char *>(MUTR->getUntypedExtraOutputValue(I)));
+ }
+
+ float Ret = static_cast<float>(Prio);
+ Log->logTensorValue(CurrentFeature, reinterpret_cast<const char *>(&Ret));
+ Log->endObservation();
+
+ return static_cast<unsigned>(Prio);
+}
+
+#endif // #ifdef LLVM_HAVE_TFLITE
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineBasicBlock.cpp
new file mode 100644
index 000000000000..231544494c32
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -0,0 +1,1740 @@
+//===-- llvm/CodeGen/MachineBasicBlock.cpp ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect the sequence of machine instructions for a basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+#define DEBUG_TYPE "codegen"
+
+static cl::opt<bool> PrintSlotIndexes(
+ "print-slotindexes",
+ cl::desc("When printing machine IR, annotate instructions and blocks with "
+ "SlotIndexes when available"),
+ cl::init(true), cl::Hidden);
+
+MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B)
+ : BB(B), Number(-1), xParent(&MF) {
+ Insts.Parent = this;
+ if (B)
+ IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight();
+}
+
+MachineBasicBlock::~MachineBasicBlock() = default;
+
+/// Return the MCSymbol for this basic block.
+MCSymbol *MachineBasicBlock::getSymbol() const {
+ if (!CachedMCSymbol) {
+ const MachineFunction *MF = getParent();
+ MCContext &Ctx = MF->getContext();
+
+ // We emit a non-temporary symbol -- with a descriptive name -- if it begins
+ // a section (with basic block sections). Otherwise we fall back to using a
+ // temporary label.
+ if (MF->hasBBSections() && isBeginSection()) {
+ SmallString<5> Suffix;
+ if (SectionID == MBBSectionID::ColdSectionID) {
+ Suffix += ".cold";
+ } else if (SectionID == MBBSectionID::ExceptionSectionID) {
+ Suffix += ".eh";
+ } else {
+ // For symbols that represent basic block sections, we add ".__part." to
+ // allow tools like symbolizers to know that this represents a part of
+ // the original function.
+ Suffix = (Suffix + Twine(".__part.") + Twine(SectionID.Number)).str();
+ }
+ CachedMCSymbol = Ctx.getOrCreateSymbol(MF->getName() + Suffix);
+ } else {
+ const StringRef Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix();
+ CachedMCSymbol = Ctx.getOrCreateSymbol(Twine(Prefix) + "BB" +
+ Twine(MF->getFunctionNumber()) +
+ "_" + Twine(getNumber()));
+ }
+ }
+ return CachedMCSymbol;
+}
+
+MCSymbol *MachineBasicBlock::getEHCatchretSymbol() const {
+ if (!CachedEHCatchretMCSymbol) {
+ const MachineFunction *MF = getParent();
+ SmallString<128> SymbolName;
+ raw_svector_ostream(SymbolName)
+ << "$ehgcr_" << MF->getFunctionNumber() << '_' << getNumber();
+ CachedEHCatchretMCSymbol = MF->getContext().getOrCreateSymbol(SymbolName);
+ }
+ return CachedEHCatchretMCSymbol;
+}
+
+MCSymbol *MachineBasicBlock::getEndSymbol() const {
+ if (!CachedEndMCSymbol) {
+ const MachineFunction *MF = getParent();
+ MCContext &Ctx = MF->getContext();
+ auto Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix();
+ CachedEndMCSymbol = Ctx.getOrCreateSymbol(Twine(Prefix) + "BB_END" +
+ Twine(MF->getFunctionNumber()) +
+ "_" + Twine(getNumber()));
+ }
+ return CachedEndMCSymbol;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineBasicBlock &MBB) {
+ MBB.print(OS);
+ return OS;
+}
+
+Printable llvm::printMBBReference(const MachineBasicBlock &MBB) {
+ return Printable([&MBB](raw_ostream &OS) { return MBB.printAsOperand(OS); });
+}
+
+/// When an MBB is added to an MF, we need to update the parent pointer of the
+/// MBB, the MBB numbering, and any instructions in the MBB to be on the right
+/// operand list for registers.
+///
+/// MBBs start out as #-1. When an MBB is added to a MachineFunction, it
+/// gets the next available unique MBB number. If it is removed from a
+/// MachineFunction, it goes back to being #-1.
+void ilist_callback_traits<MachineBasicBlock>::addNodeToList(
+ MachineBasicBlock *N) {
+ MachineFunction &MF = *N->getParent();
+ N->Number = MF.addToMBBNumbering(N);
+
+ // Make sure the instructions have their operands in the reginfo lists.
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ for (MachineInstr &MI : N->instrs())
+ MI.addRegOperandsToUseLists(RegInfo);
+}
+
+void ilist_callback_traits<MachineBasicBlock>::removeNodeFromList(
+ MachineBasicBlock *N) {
+ N->getParent()->removeFromMBBNumbering(N->Number);
+ N->Number = -1;
+}
+
+/// When we add an instruction to a basic block list, we update its parent
+/// pointer and add its operands from reg use/def lists if appropriate.
+void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) {
+ assert(!N->getParent() && "machine instruction already in a basic block");
+ N->setParent(Parent);
+
+ // Add the instruction's register operands to their corresponding
+ // use/def lists.
+ MachineFunction *MF = Parent->getParent();
+ N->addRegOperandsToUseLists(MF->getRegInfo());
+ MF->handleInsertion(*N);
+}
+
+/// When we remove an instruction from a basic block list, we update its parent
+/// pointer and remove its operands from reg use/def lists if appropriate.
+void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) {
+ assert(N->getParent() && "machine instruction not in a basic block");
+
+ // Remove from the use/def lists.
+ if (MachineFunction *MF = N->getMF()) {
+ MF->handleRemoval(*N);
+ N->removeRegOperandsFromUseLists(MF->getRegInfo());
+ }
+
+ N->setParent(nullptr);
+}
+
+/// When moving a range of instructions from one MBB list to another, we need to
+/// update the parent pointers and the use/def lists.
+void ilist_traits<MachineInstr>::transferNodesFromList(ilist_traits &FromList,
+ instr_iterator First,
+ instr_iterator Last) {
+ assert(Parent->getParent() == FromList.Parent->getParent() &&
+ "cannot transfer MachineInstrs between MachineFunctions");
+
+ // If it's within the same BB, there's nothing to do.
+ if (this == &FromList)
+ return;
+
+ assert(Parent != FromList.Parent && "Two lists have the same parent?");
+
+ // If splicing between two blocks within the same function, just update the
+ // parent pointers.
+ for (; First != Last; ++First)
+ First->setParent(Parent);
+}
+
+void ilist_traits<MachineInstr>::deleteNode(MachineInstr *MI) {
+ assert(!MI->getParent() && "MI is still in a block!");
+ Parent->getParent()->deleteMachineInstr(MI);
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {
+ instr_iterator I = instr_begin(), E = instr_end();
+ while (I != E && I->isPHI())
+ ++I;
+ assert((I == E || !I->isInsideBundle()) &&
+ "First non-phi MI cannot be inside a bundle!");
+ return I;
+}
+
+MachineBasicBlock::iterator
+MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+
+ iterator E = end();
+ while (I != E && (I->isPHI() || I->isPosition() ||
+ TII->isBasicBlockPrologue(*I)))
+ ++I;
+ // FIXME: This needs to change if we wish to bundle labels
+ // inside the bundle.
+ assert((I == E || !I->isInsideBundle()) &&
+ "First non-phi / non-label instruction is inside a bundle!");
+ return I;
+}
+
+MachineBasicBlock::iterator
+MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I,
+ bool SkipPseudoOp) {
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+
+ iterator E = end();
+ while (I != E && (I->isPHI() || I->isPosition() || I->isDebugInstr() ||
+ (SkipPseudoOp && I->isPseudoProbe()) ||
+ TII->isBasicBlockPrologue(*I)))
+ ++I;
+ // FIXME: This needs to change if we wish to bundle labels / dbg_values
+ // inside the bundle.
+ assert((I == E || !I->isInsideBundle()) &&
+ "First non-phi / non-label / non-debug "
+ "instruction is inside a bundle!");
+ return I;
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() {
+ iterator B = begin(), E = end(), I = E;
+ while (I != B && ((--I)->isTerminator() || I->isDebugInstr()))
+ ; /*noop */
+ while (I != E && !I->isTerminator())
+ ++I;
+ return I;
+}
+
+MachineBasicBlock::instr_iterator MachineBasicBlock::getFirstInstrTerminator() {
+ instr_iterator B = instr_begin(), E = instr_end(), I = E;
+ while (I != B && ((--I)->isTerminator() || I->isDebugInstr()))
+ ; /*noop */
+ while (I != E && !I->isTerminator())
+ ++I;
+ return I;
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminatorForward() {
+ return find_if(instrs(), [](auto &II) { return II.isTerminator(); });
+}
+
+MachineBasicBlock::iterator
+MachineBasicBlock::getFirstNonDebugInstr(bool SkipPseudoOp) {
+ // Skip over begin-of-block dbg_value instructions.
+ return skipDebugInstructionsForward(begin(), end(), SkipPseudoOp);
+}
+
+MachineBasicBlock::iterator
+MachineBasicBlock::getLastNonDebugInstr(bool SkipPseudoOp) {
+ // Skip over end-of-block dbg_value instructions.
+ instr_iterator B = instr_begin(), I = instr_end();
+ while (I != B) {
+ --I;
+ // Return instruction that starts a bundle.
+ if (I->isDebugInstr() || I->isInsideBundle())
+ continue;
+ if (SkipPseudoOp && I->isPseudoProbe())
+ continue;
+ return I;
+ }
+ // The block is all debug values.
+ return end();
+}
+
+bool MachineBasicBlock::hasEHPadSuccessor() const {
+ for (const MachineBasicBlock *Succ : successors())
+ if (Succ->isEHPad())
+ return true;
+ return false;
+}
+
+bool MachineBasicBlock::isEntryBlock() const {
+ return getParent()->begin() == getIterator();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineBasicBlock::dump() const {
+ print(dbgs());
+}
+#endif
+
+bool MachineBasicBlock::mayHaveInlineAsmBr() const {
+ for (const MachineBasicBlock *Succ : successors()) {
+ if (Succ->isInlineAsmBrIndirectTarget())
+ return true;
+ }
+ return false;
+}
+
+bool MachineBasicBlock::isLegalToHoistInto() const {
+ if (isReturnBlock() || hasEHPadSuccessor() || mayHaveInlineAsmBr())
+ return false;
+ return true;
+}
+
+StringRef MachineBasicBlock::getName() const {
+ if (const BasicBlock *LBB = getBasicBlock())
+ return LBB->getName();
+ else
+ return StringRef("", 0);
+}
+
+/// Return a hopefully unique identifier for this block.
+std::string MachineBasicBlock::getFullName() const {
+ std::string Name;
+ if (getParent())
+ Name = (getParent()->getName() + ":").str();
+ if (getBasicBlock())
+ Name += getBasicBlock()->getName();
+ else
+ Name += ("BB" + Twine(getNumber())).str();
+ return Name;
+}
+
+void MachineBasicBlock::print(raw_ostream &OS, const SlotIndexes *Indexes,
+ bool IsStandalone) const {
+ const MachineFunction *MF = getParent();
+ if (!MF) {
+ OS << "Can't print out MachineBasicBlock because parent MachineFunction"
+ << " is null\n";
+ return;
+ }
+ const Function &F = MF->getFunction();
+ const Module *M = F.getParent();
+ ModuleSlotTracker MST(M);
+ MST.incorporateFunction(F);
+ print(OS, MST, Indexes, IsStandalone);
+}
+
+void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
+ const SlotIndexes *Indexes,
+ bool IsStandalone) const {
+ const MachineFunction *MF = getParent();
+ if (!MF) {
+ OS << "Can't print out MachineBasicBlock because parent MachineFunction"
+ << " is null\n";
+ return;
+ }
+
+ if (Indexes && PrintSlotIndexes)
+ OS << Indexes->getMBBStartIdx(this) << '\t';
+
+ printName(OS, PrintNameIr | PrintNameAttributes, &MST);
+ OS << ":\n";
+
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetInstrInfo &TII = *getParent()->getSubtarget().getInstrInfo();
+ bool HasLineAttributes = false;
+
+ // Print the preds of this block according to the CFG.
+ if (!pred_empty() && IsStandalone) {
+ if (Indexes) OS << '\t';
+ // Don't indent(2), align with previous line attributes.
+ OS << "; predecessors: ";
+ ListSeparator LS;
+ for (auto *Pred : predecessors())
+ OS << LS << printMBBReference(*Pred);
+ OS << '\n';
+ HasLineAttributes = true;
+ }
+
+ if (!succ_empty()) {
+ if (Indexes) OS << '\t';
+ // Print the successors
+ OS.indent(2) << "successors: ";
+ ListSeparator LS;
+ for (auto I = succ_begin(), E = succ_end(); I != E; ++I) {
+ OS << LS << printMBBReference(**I);
+ if (!Probs.empty())
+ OS << '('
+ << format("0x%08" PRIx32, getSuccProbability(I).getNumerator())
+ << ')';
+ }
+ if (!Probs.empty() && IsStandalone) {
+ // Print human readable probabilities as comments.
+ OS << "; ";
+ ListSeparator LS;
+ for (auto I = succ_begin(), E = succ_end(); I != E; ++I) {
+ const BranchProbability &BP = getSuccProbability(I);
+ OS << LS << printMBBReference(**I) << '('
+ << format("%.2f%%",
+ rint(((double)BP.getNumerator() / BP.getDenominator()) *
+ 100.0 * 100.0) /
+ 100.0)
+ << ')';
+ }
+ }
+
+ OS << '\n';
+ HasLineAttributes = true;
+ }
+
+ if (!livein_empty() && MRI.tracksLiveness()) {
+ if (Indexes) OS << '\t';
+ OS.indent(2) << "liveins: ";
+
+ ListSeparator LS;
+ for (const auto &LI : liveins()) {
+ OS << LS << printReg(LI.PhysReg, TRI);
+ if (!LI.LaneMask.all())
+ OS << ":0x" << PrintLaneMask(LI.LaneMask);
+ }
+ HasLineAttributes = true;
+ }
+
+ if (HasLineAttributes)
+ OS << '\n';
+
+ bool IsInBundle = false;
+ for (const MachineInstr &MI : instrs()) {
+ if (Indexes && PrintSlotIndexes) {
+ if (Indexes->hasIndex(MI))
+ OS << Indexes->getInstructionIndex(MI);
+ OS << '\t';
+ }
+
+ if (IsInBundle && !MI.isInsideBundle()) {
+ OS.indent(2) << "}\n";
+ IsInBundle = false;
+ }
+
+ OS.indent(IsInBundle ? 4 : 2);
+ MI.print(OS, MST, IsStandalone, /*SkipOpers=*/false, /*SkipDebugLoc=*/false,
+ /*AddNewLine=*/false, &TII);
+
+ if (!IsInBundle && MI.getFlag(MachineInstr::BundledSucc)) {
+ OS << " {";
+ IsInBundle = true;
+ }
+ OS << '\n';
+ }
+
+ if (IsInBundle)
+ OS.indent(2) << "}\n";
+
+ if (IrrLoopHeaderWeight && IsStandalone) {
+ if (Indexes) OS << '\t';
+ OS.indent(2) << "; Irreducible loop header weight: " << *IrrLoopHeaderWeight
+ << '\n';
+ }
+}
+
+/// Print the basic block's name as:
+///
+/// bb.{number}[.{ir-name}] [(attributes...)]
+///
+/// The {ir-name} is only printed when the \ref PrintNameIr flag is passed
+/// (which is the default). If the IR block has no name, it is identified
+/// numerically using the attribute syntax as "(%ir-block.{ir-slot})".
+///
+/// When the \ref PrintNameAttributes flag is passed, additional attributes
+/// of the block are printed when set.
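+/// For example (illustrative): a landing-pad block numbered 4 whose IR block
+/// is named "cleanup" would print as "bb.4.cleanup (landing-pad)".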
+///
+/// \param printNameFlags Combination of \ref PrintNameFlag flags indicating
+/// the parts to print.
+/// \param moduleSlotTracker Optional ModuleSlotTracker. This method will
+/// incorporate its own tracker when necessary to
+/// determine the block's IR name.
+void MachineBasicBlock::printName(raw_ostream &os, unsigned printNameFlags,
+ ModuleSlotTracker *moduleSlotTracker) const {
+ os << "bb." << getNumber();
+ bool hasAttributes = false;
+
+ auto PrintBBRef = [&](const BasicBlock *bb) {
+ os << "%ir-block.";
+ if (bb->hasName()) {
+ os << bb->getName();
+ } else {
+ int slot = -1;
+
+ if (moduleSlotTracker) {
+ slot = moduleSlotTracker->getLocalSlot(bb);
+ } else if (bb->getParent()) {
+ ModuleSlotTracker tmpTracker(bb->getModule(), false);
+ tmpTracker.incorporateFunction(*bb->getParent());
+ slot = tmpTracker.getLocalSlot(bb);
+ }
+
+ if (slot == -1)
+ os << "<ir-block badref>";
+ else
+ os << slot;
+ }
+ };
+
+ if (printNameFlags & PrintNameIr) {
+ if (const auto *bb = getBasicBlock()) {
+ if (bb->hasName()) {
+ os << '.' << bb->getName();
+ } else {
+ hasAttributes = true;
+ os << " (";
+ PrintBBRef(bb);
+ }
+ }
+ }
+
+ if (printNameFlags & PrintNameAttributes) {
+ if (isMachineBlockAddressTaken()) {
+ os << (hasAttributes ? ", " : " (");
+ os << "machine-block-address-taken";
+ hasAttributes = true;
+ }
+ if (isIRBlockAddressTaken()) {
+ os << (hasAttributes ? ", " : " (");
+ os << "ir-block-address-taken ";
+ PrintBBRef(getAddressTakenIRBlock());
+ hasAttributes = true;
+ }
+ if (isEHPad()) {
+ os << (hasAttributes ? ", " : " (");
+ os << "landing-pad";
+ hasAttributes = true;
+ }
+ if (isInlineAsmBrIndirectTarget()) {
+ os << (hasAttributes ? ", " : " (");
+ os << "inlineasm-br-indirect-target";
+ hasAttributes = true;
+ }
+ if (isEHFuncletEntry()) {
+ os << (hasAttributes ? ", " : " (");
+ os << "ehfunclet-entry";
+ hasAttributes = true;
+ }
+ if (getAlignment() != Align(1)) {
+ os << (hasAttributes ? ", " : " (");
+ os << "align " << getAlignment().value();
+ hasAttributes = true;
+ }
+ if (getSectionID() != MBBSectionID(0)) {
+ os << (hasAttributes ? ", " : " (");
+ os << "bbsections ";
+ switch (getSectionID().Type) {
+ case MBBSectionID::SectionType::Exception:
+ os << "Exception";
+ break;
+ case MBBSectionID::SectionType::Cold:
+ os << "Cold";
+ break;
+ default:
+ os << getSectionID().Number;
+ }
+ hasAttributes = true;
+ }
+ if (getBBID().has_value()) {
+ os << (hasAttributes ? ", " : " (");
+ os << "bb_id " << *getBBID();
+ hasAttributes = true;
+ }
+ }
+
+ if (hasAttributes)
+ os << ')';
+}
+
+void MachineBasicBlock::printAsOperand(raw_ostream &OS,
+ bool /*PrintType*/) const {
+ OS << '%';
+ printName(OS, 0);
+}
+
+void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) {
+ LiveInVector::iterator I = find_if(
+ LiveIns, [Reg](const RegisterMaskPair &LI) { return LI.PhysReg == Reg; });
+ if (I == LiveIns.end())
+ return;
+
+ I->LaneMask &= ~LaneMask;
+ if (I->LaneMask.none())
+ LiveIns.erase(I);
+}
+
+MachineBasicBlock::livein_iterator
+MachineBasicBlock::removeLiveIn(MachineBasicBlock::livein_iterator I) {
+ // Get non-const version of iterator.
+ LiveInVector::iterator LI = LiveIns.begin() + (I - LiveIns.begin());
+ return LiveIns.erase(LI);
+}
+
+bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const {
+ livein_iterator I = find_if(
+ LiveIns, [Reg](const RegisterMaskPair &LI) { return LI.PhysReg == Reg; });
+ return I != livein_end() && (I->LaneMask & LaneMask).any();
+}
+
+void MachineBasicBlock::sortUniqueLiveIns() {
+ llvm::sort(LiveIns,
+ [](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) {
+ return LI0.PhysReg < LI1.PhysReg;
+ });
+ // Liveins are sorted by physreg now, so we can merge their lanemasks.
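+ // For example (illustrative): two entries for the same physreg with lane
+ // masks 0x1 and 0x2 collapse into a single entry with lane mask 0x3.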
+ LiveInVector::const_iterator I = LiveIns.begin();
+ LiveInVector::const_iterator J;
+ LiveInVector::iterator Out = LiveIns.begin();
+ for (; I != LiveIns.end(); ++Out, I = J) {
+ MCRegister PhysReg = I->PhysReg;
+ LaneBitmask LaneMask = I->LaneMask;
+ for (J = std::next(I); J != LiveIns.end() && J->PhysReg == PhysReg; ++J)
+ LaneMask |= J->LaneMask;
+ Out->PhysReg = PhysReg;
+ Out->LaneMask = LaneMask;
+ }
+ LiveIns.erase(Out, LiveIns.end());
+}
+
+Register
+MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC) {
+ assert(getParent() && "MBB must be inserted in function");
+ assert(Register::isPhysicalRegister(PhysReg) && "Expected physreg");
+ assert(RC && "Register class is required");
+ assert((isEHPad() || this == &getParent()->front()) &&
+ "Only the entry block and landing pads can have physreg live ins");
+
+ bool LiveIn = isLiveIn(PhysReg);
+ iterator I = SkipPHIsAndLabels(begin()), E = end();
+ MachineRegisterInfo &MRI = getParent()->getRegInfo();
+ const TargetInstrInfo &TII = *getParent()->getSubtarget().getInstrInfo();
+
+ // Look for an existing copy.
+ if (LiveIn)
+ for (;I != E && I->isCopy(); ++I)
+ if (I->getOperand(1).getReg() == PhysReg) {
+ Register VirtReg = I->getOperand(0).getReg();
+ if (!MRI.constrainRegClass(VirtReg, RC))
+ llvm_unreachable("Incompatible live-in register class.");
+ return VirtReg;
+ }
+
+ // No luck, create a virtual register.
+ Register VirtReg = MRI.createVirtualRegister(RC);
+ BuildMI(*this, I, DebugLoc(), TII.get(TargetOpcode::COPY), VirtReg)
+ .addReg(PhysReg, RegState::Kill);
+ if (!LiveIn)
+ addLiveIn(PhysReg);
+ return VirtReg;
+}
+
+void MachineBasicBlock::moveBefore(MachineBasicBlock *NewAfter) {
+ getParent()->splice(NewAfter->getIterator(), getIterator());
+}
+
+void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
+ getParent()->splice(++NewBefore->getIterator(), getIterator());
+}
+
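+// Return the jump table index used by the block's first terminator, or -1 if
+// the terminator is not an indirect branch through a jump table (or the block
+// has no terminator at all).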
+static int findJumpTableIndex(const MachineBasicBlock &MBB) {
+ MachineBasicBlock::const_iterator TerminatorI = MBB.getFirstTerminator();
+ if (TerminatorI == MBB.end())
+ return -1;
+ const MachineInstr &Terminator = *TerminatorI;
+ const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
+ return TII->getJumpTableIndex(Terminator);
+}
+
+void MachineBasicBlock::updateTerminator(
+ MachineBasicBlock *PreviousLayoutSuccessor) {
+ LLVM_DEBUG(dbgs() << "Updating terminators on " << printMBBReference(*this)
+ << "\n");
+
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+ // A block with no successors has no concerns with fall-through edges.
+ if (this->succ_empty())
+ return;
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ DebugLoc DL = findBranchDebugLoc();
+ bool B = TII->analyzeBranch(*this, TBB, FBB, Cond);
+ (void) B;
+ assert(!B && "UpdateTerminators requires analyzable predecessors!");
+ if (Cond.empty()) {
+ if (TBB) {
+ // The block has an unconditional branch. If its successor is now its
+ // layout successor, delete the branch.
+ if (isLayoutSuccessor(TBB))
+ TII->removeBranch(*this);
+ } else {
+ // The block has an unconditional fallthrough, or the end of the block is
+ // unreachable.
+
+ // Unfortunately, whether the end of the block is unreachable is not
+ // immediately obvious; we must fall back to checking the successor list,
+ // and assuming that if the passed-in block is in the successor list and
+ // not an EHPad, it must be the intended target.
+ if (!PreviousLayoutSuccessor || !isSuccessor(PreviousLayoutSuccessor) ||
+ PreviousLayoutSuccessor->isEHPad())
+ return;
+
+ // If the unconditional successor block is not the current layout
+ // successor, insert a branch to jump to it.
+ if (!isLayoutSuccessor(PreviousLayoutSuccessor))
+ TII->insertBranch(*this, PreviousLayoutSuccessor, nullptr, Cond, DL);
+ }
+ return;
+ }
+
+ if (FBB) {
+ // The block has a non-fallthrough conditional branch. If one of its
+ // successors is its layout successor, rewrite it to a fallthrough
+ // conditional branch.
+ if (isLayoutSuccessor(TBB)) {
+ if (TII->reverseBranchCondition(Cond))
+ return;
+ TII->removeBranch(*this);
+ TII->insertBranch(*this, FBB, nullptr, Cond, DL);
+ } else if (isLayoutSuccessor(FBB)) {
+ TII->removeBranch(*this);
+ TII->insertBranch(*this, TBB, nullptr, Cond, DL);
+ }
+ return;
+ }
+
+ // We now know we're going to fallthrough to PreviousLayoutSuccessor.
+ assert(PreviousLayoutSuccessor);
+ assert(!PreviousLayoutSuccessor->isEHPad());
+ assert(isSuccessor(PreviousLayoutSuccessor));
+
+ if (PreviousLayoutSuccessor == TBB) {
+ // We had a fallthrough to the same basic block as the conditional jump
+ // targets. Remove the conditional jump, leaving an unconditional
+ // fallthrough or an unconditional jump.
+ TII->removeBranch(*this);
+ if (!isLayoutSuccessor(TBB)) {
+ Cond.clear();
+ TII->insertBranch(*this, TBB, nullptr, Cond, DL);
+ }
+ return;
+ }
+
+ // The block has a fallthrough conditional branch.
+ if (isLayoutSuccessor(TBB)) {
+ if (TII->reverseBranchCondition(Cond)) {
+ // We can't reverse the condition, add an unconditional branch.
+ Cond.clear();
+ TII->insertBranch(*this, PreviousLayoutSuccessor, nullptr, Cond, DL);
+ return;
+ }
+ TII->removeBranch(*this);
+ TII->insertBranch(*this, PreviousLayoutSuccessor, nullptr, Cond, DL);
+ } else if (!isLayoutSuccessor(PreviousLayoutSuccessor)) {
+ TII->removeBranch(*this);
+ TII->insertBranch(*this, TBB, PreviousLayoutSuccessor, Cond, DL);
+ }
+}
+
+void MachineBasicBlock::validateSuccProbs() const {
+#ifndef NDEBUG
+ int64_t Sum = 0;
+ for (auto Prob : Probs)
+ Sum += Prob.getNumerator();
+ // Due to precision issues, we assume that the sum of probabilities is one if
+ // the difference between the sum of their numerators and the denominator is
+ // no greater than the number of successors.
+ assert((uint64_t)std::abs(Sum - BranchProbability::getDenominator()) <=
+ Probs.size() &&
+ "The sum of successors's probabilities exceeds one.");
+#endif // NDEBUG
+}
+
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ,
+ BranchProbability Prob) {
+ // Probability list is either empty (if successor list isn't empty, this means
+ // disabled optimization) or has the same size as successor list.
+ if (!(Probs.empty() && !Successors.empty()))
+ Probs.push_back(Prob);
+ Successors.push_back(Succ);
+ Succ->addPredecessor(this);
+}
+
+void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) {
+ // We need to make sure probability list is either empty or has the same size
+ // of successor list. When this function is called, we can safely delete all
+ // probability in the list.
+ Probs.clear();
+ Successors.push_back(Succ);
+ Succ->addPredecessor(this);
+}
+
+void MachineBasicBlock::splitSuccessor(MachineBasicBlock *Old,
+ MachineBasicBlock *New,
+ bool NormalizeSuccProbs) {
+ succ_iterator OldI = llvm::find(successors(), Old);
+ assert(OldI != succ_end() && "Old is not a successor of this block!");
+ assert(!llvm::is_contained(successors(), New) &&
+ "New is already a successor of this block!");
+
+ // Add a new successor with equal probability as the original one. Note
+ // that we directly copy the probability using the iterator rather than
+ // getting a potentially synthetic probability computed when unknown. This
+ // preserves the probabilities as-is and then we can renormalize them and
+ // query them effectively afterward.
+ addSuccessor(New, Probs.empty() ? BranchProbability::getUnknown()
+ : *getProbabilityIterator(OldI));
+ if (NormalizeSuccProbs)
+ normalizeSuccProbs();
+}
+
+void MachineBasicBlock::removeSuccessor(MachineBasicBlock *Succ,
+ bool NormalizeSuccProbs) {
+ succ_iterator I = find(Successors, Succ);
+ removeSuccessor(I, NormalizeSuccProbs);
+}
+
+MachineBasicBlock::succ_iterator
+MachineBasicBlock::removeSuccessor(succ_iterator I, bool NormalizeSuccProbs) {
+ assert(I != Successors.end() && "Not a current successor!");
+
+ // If probability list is empty it means we don't use it (disabled
+ // optimization).
+ if (!Probs.empty()) {
+ probability_iterator WI = getProbabilityIterator(I);
+ Probs.erase(WI);
+ if (NormalizeSuccProbs)
+ normalizeSuccProbs();
+ }
+
+ (*I)->removePredecessor(this);
+ return Successors.erase(I);
+}
+
+void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ if (Old == New)
+ return;
+
+ succ_iterator E = succ_end();
+ succ_iterator NewI = E;
+ succ_iterator OldI = E;
+ for (succ_iterator I = succ_begin(); I != E; ++I) {
+ if (*I == Old) {
+ OldI = I;
+ if (NewI != E)
+ break;
+ }
+ if (*I == New) {
+ NewI = I;
+ if (OldI != E)
+ break;
+ }
+ }
+ assert(OldI != E && "Old is not a successor of this block");
+
+ // If New isn't already a successor, let it take Old's place.
+ if (NewI == E) {
+ Old->removePredecessor(this);
+ New->addPredecessor(this);
+ *OldI = New;
+ return;
+ }
+
+ // New is already a successor.
+ // Update its probability instead of adding a duplicate edge.
+ if (!Probs.empty()) {
+ auto ProbIter = getProbabilityIterator(NewI);
+ if (!ProbIter->isUnknown())
+ *ProbIter += *getProbabilityIterator(OldI);
+ }
+ removeSuccessor(OldI);
+}
+
+void MachineBasicBlock::copySuccessor(MachineBasicBlock *Orig,
+ succ_iterator I) {
+ if (!Orig->Probs.empty())
+ addSuccessor(*I, Orig->getSuccProbability(I));
+ else
+ addSuccessorWithoutProb(*I);
+}
+
+void MachineBasicBlock::addPredecessor(MachineBasicBlock *Pred) {
+ Predecessors.push_back(Pred);
+}
+
+void MachineBasicBlock::removePredecessor(MachineBasicBlock *Pred) {
+ pred_iterator I = find(Predecessors, Pred);
+ assert(I != Predecessors.end() && "Pred is not a predecessor of this block!");
+ Predecessors.erase(I);
+}
+
+void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) {
+ if (this == FromMBB)
+ return;
+
+ while (!FromMBB->succ_empty()) {
+ MachineBasicBlock *Succ = *FromMBB->succ_begin();
+
+ // If probability list is empty it means we don't use it (disabled
+ // optimization).
+ if (!FromMBB->Probs.empty()) {
+ auto Prob = *FromMBB->Probs.begin();
+ addSuccessor(Succ, Prob);
+ } else
+ addSuccessorWithoutProb(Succ);
+
+ FromMBB->removeSuccessor(Succ);
+ }
+}
+
+void
+MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) {
+ if (this == FromMBB)
+ return;
+
+ while (!FromMBB->succ_empty()) {
+ MachineBasicBlock *Succ = *FromMBB->succ_begin();
+ if (!FromMBB->Probs.empty()) {
+ auto Prob = *FromMBB->Probs.begin();
+ addSuccessor(Succ, Prob);
+ } else
+ addSuccessorWithoutProb(Succ);
+ FromMBB->removeSuccessor(Succ);
+
+ // Fix up any PHI nodes in the successor.
+ Succ->replacePhiUsesWith(FromMBB, this);
+ }
+ normalizeSuccProbs();
+}
+
+bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const {
+ return is_contained(predecessors(), MBB);
+}
+
+bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
+ return is_contained(successors(), MBB);
+}
+
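+// MBB is this block's layout successor iff it immediately follows this block
+// in the MachineFunction's block list, independent of the CFG successor list.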
+bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
+ MachineFunction::const_iterator I(this);
+ return std::next(I) == MachineFunction::const_iterator(MBB);
+}
+
+const MachineBasicBlock *MachineBasicBlock::getSingleSuccessor() const {
+ return Successors.size() == 1 ? Successors[0] : nullptr;
+}
+
+MachineBasicBlock *MachineBasicBlock::getFallThrough(bool JumpToFallThrough) {
+ MachineFunction::iterator Fallthrough = getIterator();
+ ++Fallthrough;
+ // If FallthroughBlock is off the end of the function, it can't fall through.
+ if (Fallthrough == getParent()->end())
+ return nullptr;
+
+ // If FallthroughBlock isn't a successor, no fallthrough is possible.
+ if (!isSuccessor(&*Fallthrough))
+ return nullptr;
+
+ // Analyze the branches, if any, at the end of the block.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+ if (TII->analyzeBranch(*this, TBB, FBB, Cond)) {
+ // If we couldn't analyze the branch, examine the last instruction.
+ // If the block doesn't end in a known control barrier, assume fallthrough
+ // is possible. The isPredicated check is needed because this code can be
+ // called during IfConversion, where an instruction which is normally a
+ // Barrier is predicated and thus no longer an actual control barrier.
+ return (empty() || !back().isBarrier() || TII->isPredicated(back()))
+ ? &*Fallthrough
+ : nullptr;
+ }
+
+ // If there is no branch, control always falls through.
+ if (!TBB) return &*Fallthrough;
+
+ // If there is some explicit branch to the fallthrough block, it can obviously
+ // reach, even though the branch should get folded to fall through implicitly.
+ if (JumpToFallThrough && (MachineFunction::iterator(TBB) == Fallthrough ||
+ MachineFunction::iterator(FBB) == Fallthrough))
+ return &*Fallthrough;
+
+ // If it's an unconditional branch to some block not the fall through, it
+ // doesn't fall through.
+ if (Cond.empty()) return nullptr;
+
+ // Otherwise, if it is conditional and has no explicit false block, it falls
+ // through.
+ return (FBB == nullptr) ? &*Fallthrough : nullptr;
+}
+
+bool MachineBasicBlock::canFallThrough() {
+ return getFallThrough() != nullptr;
+}
+
+MachineBasicBlock *MachineBasicBlock::splitAt(MachineInstr &MI,
+ bool UpdateLiveIns,
+ LiveIntervals *LIS) {
+ MachineBasicBlock::iterator SplitPoint(&MI);
+ ++SplitPoint;
+
+ if (SplitPoint == end()) {
+ // Don't bother with a new block.
+ return this;
+ }
+
+ MachineFunction *MF = getParent();
+
+ LivePhysRegs LiveRegs;
+ if (UpdateLiveIns) {
+ // Make sure we add any physregs we define in the block as liveins to the
+ // new block.
+ MachineBasicBlock::iterator Prev(&MI);
+ LiveRegs.init(*MF->getSubtarget().getRegisterInfo());
+ LiveRegs.addLiveOuts(*this);
+ for (auto I = rbegin(), E = Prev.getReverse(); I != E; ++I)
+ LiveRegs.stepBackward(*I);
+ }
+
+ MachineBasicBlock *SplitBB = MF->CreateMachineBasicBlock(getBasicBlock());
+
+ MF->insert(++MachineFunction::iterator(this), SplitBB);
+ SplitBB->splice(SplitBB->begin(), this, SplitPoint, end());
+
+ SplitBB->transferSuccessorsAndUpdatePHIs(this);
+ addSuccessor(SplitBB);
+
+ if (UpdateLiveIns)
+ addLiveIns(*SplitBB, LiveRegs);
+
+ if (LIS)
+ LIS->insertMBBInMaps(SplitBB);
+
+ return SplitBB;
+}
+
+// Returns `true` if there are possibly other users of the jump table at
+// `JumpTableIndex` except for the ones in `IgnoreMBB`.
+static bool jumpTableHasOtherUses(const MachineFunction &MF,
+ const MachineBasicBlock &IgnoreMBB,
+ int JumpTableIndex) {
+ assert(JumpTableIndex >= 0 && "need valid index");
+ const MachineJumpTableInfo &MJTI = *MF.getJumpTableInfo();
+ const MachineJumpTableEntry &MJTE = MJTI.getJumpTables()[JumpTableIndex];
+ // Take any basic block from the table; every user of the jump table must
+ // show up in the predecessor list.
+ const MachineBasicBlock *MBB = nullptr;
+ for (MachineBasicBlock *B : MJTE.MBBs) {
+ if (B != nullptr) {
+ MBB = B;
+ break;
+ }
+ }
+ if (MBB == nullptr)
+ return true; // can't rule out other users if there isn't any block.
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ SmallVector<MachineOperand, 4> Cond;
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (Pred == &IgnoreMBB)
+ continue;
+ MachineBasicBlock *DummyT = nullptr;
+ MachineBasicBlock *DummyF = nullptr;
+ Cond.clear();
+ if (!TII.analyzeBranch(*Pred, DummyT, DummyF, Cond,
+ /*AllowModify=*/false)) {
+ // analyzable direct jump
+ continue;
+ }
+ int PredJTI = findJumpTableIndex(*Pred);
+ if (PredJTI >= 0) {
+ if (PredJTI == JumpTableIndex)
+ return true;
+ continue;
+ }
+ // Be conservative for unanalyzable jumps.
+ return true;
+ }
+ return false;
+}
+
+MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
+ MachineBasicBlock *Succ, Pass &P,
+ std::vector<SparseBitVector<>> *LiveInSets) {
+ if (!canSplitCriticalEdge(Succ))
+ return nullptr;
+
+ MachineFunction *MF = getParent();
+ MachineBasicBlock *PrevFallthrough = getNextNode();
+ DebugLoc DL; // FIXME: this is nowhere
+
+ MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
+
+ // Is there an indirect jump with jump table?
+ bool ChangedIndirectJump = false;
+ int JTI = findJumpTableIndex(*this);
+ if (JTI >= 0) {
+ MachineJumpTableInfo &MJTI = *MF->getJumpTableInfo();
+ MJTI.ReplaceMBBInJumpTable(JTI, Succ, NMBB);
+ ChangedIndirectJump = true;
+ }
+
+ MF->insert(std::next(MachineFunction::iterator(this)), NMBB);
+ LLVM_DEBUG(dbgs() << "Splitting critical edge: " << printMBBReference(*this)
+ << " -- " << printMBBReference(*NMBB) << " -- "
+ << printMBBReference(*Succ) << '\n');
+
+ LiveIntervals *LIS = P.getAnalysisIfAvailable<LiveIntervals>();
+ SlotIndexes *Indexes = P.getAnalysisIfAvailable<SlotIndexes>();
+ if (LIS)
+ LIS->insertMBBInMaps(NMBB);
+ else if (Indexes)
+ Indexes->insertMBBInMaps(NMBB);
+
+ // On some targets like Mips, branches may kill virtual registers. Make sure
+ // that LiveVariables is properly updated after updateTerminator replaces the
+ // terminators.
+ LiveVariables *LV = P.getAnalysisIfAvailable<LiveVariables>();
+
+ // Collect a list of virtual registers killed by the terminators.
+ SmallVector<Register, 4> KilledRegs;
+ if (LV)
+ for (MachineInstr &MI :
+ llvm::make_range(getFirstInstrTerminator(), instr_end())) {
+ for (MachineOperand &MO : MI.all_uses()) {
+ if (MO.getReg() == 0 || !MO.isKill() || MO.isUndef())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical() || LV->getVarInfo(Reg).removeKill(MI)) {
+ KilledRegs.push_back(Reg);
+ LLVM_DEBUG(dbgs() << "Removing terminator kill: " << MI);
+ MO.setIsKill(false);
+ }
+ }
+ }
+
+ SmallVector<Register, 4> UsedRegs;
+ if (LIS) {
+ for (MachineInstr &MI :
+ llvm::make_range(getFirstInstrTerminator(), instr_end())) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+
+ Register Reg = MO.getReg();
+ if (!is_contained(UsedRegs, Reg))
+ UsedRegs.push_back(Reg);
+ }
+ }
+ }
+
+ ReplaceUsesOfBlockWith(Succ, NMBB);
+
+ // If updateTerminator() removes instructions, we need to remove them from
+ // SlotIndexes.
+ SmallVector<MachineInstr*, 4> Terminators;
+ if (Indexes) {
+ for (MachineInstr &MI :
+ llvm::make_range(getFirstInstrTerminator(), instr_end()))
+ Terminators.push_back(&MI);
+ }
+
+ // Since we replaced all uses of Succ with NMBB, that should also be treated
+ // as the fallthrough successor
+ if (Succ == PrevFallthrough)
+ PrevFallthrough = NMBB;
+
+ if (!ChangedIndirectJump)
+ updateTerminator(PrevFallthrough);
+
+ if (Indexes) {
+ SmallVector<MachineInstr*, 4> NewTerminators;
+ for (MachineInstr &MI :
+ llvm::make_range(getFirstInstrTerminator(), instr_end()))
+ NewTerminators.push_back(&MI);
+
+ for (MachineInstr *Terminator : Terminators) {
+ if (!is_contained(NewTerminators, Terminator))
+ Indexes->removeMachineInstrFromMaps(*Terminator);
+ }
+ }
+
+ // Insert unconditional "jump Succ" instruction in NMBB if necessary.
+ NMBB->addSuccessor(Succ);
+ if (!NMBB->isLayoutSuccessor(Succ)) {
+ SmallVector<MachineOperand, 4> Cond;
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+ TII->insertBranch(*NMBB, Succ, nullptr, Cond, DL);
+
+ if (Indexes) {
+ for (MachineInstr &MI : NMBB->instrs()) {
+ // Some instructions may have been moved to NMBB by updateTerminator(),
+ // so we first remove any instruction that already has an index.
+ if (Indexes->hasIndex(MI))
+ Indexes->removeMachineInstrFromMaps(MI);
+ Indexes->insertMachineInstrInMaps(MI);
+ }
+ }
+ }
+
+ // Fix PHI nodes in Succ so they refer to NMBB instead of this.
+ Succ->replacePhiUsesWith(this, NMBB);
+
+ // Inherit live-ins from the successor
+ for (const auto &LI : Succ->liveins())
+ NMBB->addLiveIn(LI);
+
+ // Update LiveVariables.
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ if (LV) {
+ // Restore kills of virtual registers that were killed by the terminators.
+ while (!KilledRegs.empty()) {
+ Register Reg = KilledRegs.pop_back_val();
+ for (instr_iterator I = instr_end(), E = instr_begin(); I != E;) {
+ if (!(--I)->addRegisterKilled(Reg, TRI, /* AddIfNotFound= */ false))
+ continue;
+ if (Reg.isVirtual())
+ LV->getVarInfo(Reg).Kills.push_back(&*I);
+ LLVM_DEBUG(dbgs() << "Restored terminator kill: " << *I);
+ break;
+ }
+ }
+ // Update relevant live-through information.
+ if (LiveInSets != nullptr)
+ LV->addNewBlock(NMBB, this, Succ, *LiveInSets);
+ else
+ LV->addNewBlock(NMBB, this, Succ);
+ }
+
+ if (LIS) {
+ // After splitting the edge and updating SlotIndexes, live intervals may be
+ // in one of two situations, depending on whether this block was the last in
+ // the function. If the original block was the last in the function, all
+ // live intervals will end prior to the beginning of the new split block. If
+ // the original block was not at the end of the function, all live intervals
+ // will extend to the end of the new split block.
+
+ bool isLastMBB =
+ std::next(MachineFunction::iterator(NMBB)) == getParent()->end();
+
+ SlotIndex StartIndex = Indexes->getMBBEndIdx(this);
+ SlotIndex PrevIndex = StartIndex.getPrevSlot();
+ SlotIndex EndIndex = Indexes->getMBBEndIdx(NMBB);
+
+ // Find the registers used from NMBB in PHIs in Succ.
+ SmallSet<Register, 8> PHISrcRegs;
+ for (MachineBasicBlock::instr_iterator
+ I = Succ->instr_begin(), E = Succ->instr_end();
+ I != E && I->isPHI(); ++I) {
+ for (unsigned ni = 1, ne = I->getNumOperands(); ni != ne; ni += 2) {
+ if (I->getOperand(ni+1).getMBB() == NMBB) {
+ MachineOperand &MO = I->getOperand(ni);
+ Register Reg = MO.getReg();
+ PHISrcRegs.insert(Reg);
+ if (MO.isUndef())
+ continue;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+ VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
+ assert(VNI &&
+ "PHI sources should be live out of their predecessors.");
+ LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
+ }
+ }
+ }
+
+ MachineRegisterInfo *MRI = &getParent()->getRegInfo();
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ if (PHISrcRegs.count(Reg) || !LIS->hasInterval(Reg))
+ continue;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (!LI.liveAt(PrevIndex))
+ continue;
+
+ bool isLiveOut = LI.liveAt(LIS->getMBBStartIdx(Succ));
+ if (isLiveOut && isLastMBB) {
+ VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
+ assert(VNI && "LiveInterval should have VNInfo where it is live.");
+ LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
+ } else if (!isLiveOut && !isLastMBB) {
+ LI.removeSegment(StartIndex, EndIndex);
+ }
+ }
+
+ // Update all intervals for registers whose uses may have been modified by
+ // updateTerminator().
+ LIS->repairIntervalsInRange(this, getFirstTerminator(), end(), UsedRegs);
+ }
+
+ if (MachineDominatorTree *MDT =
+ P.getAnalysisIfAvailable<MachineDominatorTree>())
+ MDT->recordSplitCriticalEdge(this, Succ, NMBB);
+
+ if (MachineLoopInfo *MLI = P.getAnalysisIfAvailable<MachineLoopInfo>())
+ if (MachineLoop *TIL = MLI->getLoopFor(this)) {
+ // If one or the other blocks were not in a loop, the new block is not
+ // either, and thus LI doesn't need to be updated.
+ if (MachineLoop *DestLoop = MLI->getLoopFor(Succ)) {
+ if (TIL == DestLoop) {
+ // Both in the same loop, the NMBB joins loop.
+ DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else if (TIL->contains(DestLoop)) {
+ // Edge from an outer loop to an inner loop. Add to the outer loop.
+ TIL->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else if (DestLoop->contains(TIL)) {
+ // Edge from an inner loop to an outer loop. Add to the outer loop.
+ DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else {
+ // Edge from two loops with no containment relation. Because these
+ // are natural loops, we know that the destination block must be the
+ // header of its loop (adding a branch into a loop elsewhere would
+ // create an irreducible loop).
+ assert(DestLoop->getHeader() == Succ &&
+ "Should not create irreducible loops!");
+ if (MachineLoop *P = DestLoop->getParentLoop())
+ P->addBasicBlockToLoop(NMBB, MLI->getBase());
+ }
+ }
+ }
+
+ return NMBB;
+}
+
+bool MachineBasicBlock::canSplitCriticalEdge(
+ const MachineBasicBlock *Succ) const {
+ // Splitting the critical edge to a landing pad block is non-trivial. Don't do
+ // it in this generic function.
+ if (Succ->isEHPad())
+ return false;
+
+ // Splitting the critical edge to a callbr's indirect block isn't advised.
+ // Don't do it in this generic function.
+ if (Succ->isInlineAsmBrIndirectTarget())
+ return false;
+
+ const MachineFunction *MF = getParent();
+ // Performance might be harmed on HW that implements branching using exec mask
+ // where both sides of the branches are always executed.
+ if (MF->getTarget().requiresStructuredCFG())
+ return false;
+
+ // Do we have an Indirect jump with a jumptable that we can rewrite?
+ int JTI = findJumpTableIndex(*this);
+ if (JTI >= 0 && !jumpTableHasOtherUses(*MF, *this, JTI))
+ return true;
+
+ // We may need to update this's terminator, but we can't do that if
+ // analyzeBranch fails.
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ // analyzeBranch should not modify this block, since we did not allow
+ // modification.
+ if (TII->analyzeBranch(*const_cast<MachineBasicBlock *>(this), TBB, FBB, Cond,
+ /*AllowModify*/ false))
+ return false;
+
+ // Avoid bugpoint weirdness: A block may end with a conditional branch but
+ // jump to the same MBB in either case. We have duplicate CFG edges in that
+ // case that we can't handle. Since this never happens in properly optimized
+ // code, just skip those edges.
+ if (TBB && TBB == FBB) {
+ LLVM_DEBUG(dbgs() << "Won't split critical edge after degenerate "
+ << printMBBReference(*this) << '\n');
+ return false;
+ }
+ return true;
+}
+
+/// Prepare MI to be removed from its bundle. This fixes bundle flags on MI's
+/// neighboring instructions so the bundle won't be broken by removing MI.
+static void unbundleSingleMI(MachineInstr *MI) {
+ // Removing the first instruction in a bundle.
+ if (MI->isBundledWithSucc() && !MI->isBundledWithPred())
+ MI->unbundleFromSucc();
+ // Removing the last instruction in a bundle.
+ if (MI->isBundledWithPred() && !MI->isBundledWithSucc())
+ MI->unbundleFromPred();
+ // If MI is not bundled, or if it is internal to a bundle, the neighbor flags
+ // are already fine.
+}
+
+MachineBasicBlock::instr_iterator
+MachineBasicBlock::erase(MachineBasicBlock::instr_iterator I) {
+ unbundleSingleMI(&*I);
+ return Insts.erase(I);
+}
+
+MachineInstr *MachineBasicBlock::remove_instr(MachineInstr *MI) {
+ unbundleSingleMI(MI);
+ MI->clearFlag(MachineInstr::BundledPred);
+ MI->clearFlag(MachineInstr::BundledSucc);
+ return Insts.remove(MI);
+}
+
+MachineBasicBlock::instr_iterator
+MachineBasicBlock::insert(instr_iterator I, MachineInstr *MI) {
+ assert(!MI->isBundledWithPred() && !MI->isBundledWithSucc() &&
+ "Cannot insert instruction with bundle flags");
+ // Set the bundle flags when inserting inside a bundle.
+ if (I != instr_end() && I->isBundledWithPred()) {
+ MI->setFlag(MachineInstr::BundledPred);
+ MI->setFlag(MachineInstr::BundledSucc);
+ }
+ return Insts.insert(I, MI);
+}
+
+/// This method unlinks 'this' from the containing function, and returns it, but
+/// does not delete it.
+MachineBasicBlock *MachineBasicBlock::removeFromParent() {
+ assert(getParent() && "Not embedded in a function!");
+ getParent()->remove(this);
+ return this;
+}
+
+/// This method unlinks 'this' from the containing function, and deletes it.
+void MachineBasicBlock::eraseFromParent() {
+ assert(getParent() && "Not embedded in a function!");
+ getParent()->erase(this);
+}
+
+/// Given a machine basic block that branched to 'Old', change the code and CFG
+/// so that it branches to 'New' instead.
+void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ assert(Old != New && "Cannot replace self with self!");
+
+ MachineBasicBlock::instr_iterator I = instr_end();
+ while (I != instr_begin()) {
+ --I;
+ if (!I->isTerminator()) break;
+
+ // Scan the operands of this machine instruction, replacing any uses of Old
+ // with New.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (I->getOperand(i).isMBB() &&
+ I->getOperand(i).getMBB() == Old)
+ I->getOperand(i).setMBB(New);
+ }
+
+ // Update the successor information.
+ replaceSuccessor(Old, New);
+}
+
+void MachineBasicBlock::replacePhiUsesWith(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ for (MachineInstr &MI : phis())
+ for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.getMBB() == Old)
+ MO.setMBB(New);
+ }
+}
+
+/// Find the next valid DebugLoc starting at MBBI, skipping any debug
+/// instructions. Return UnknownLoc if there is none.
+DebugLoc
+MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
+ // Skip debug declarations, we don't want a DebugLoc from them.
+ MBBI = skipDebugInstructionsForward(MBBI, instr_end());
+ if (MBBI != instr_end())
+ return MBBI->getDebugLoc();
+ return {};
+}
+
+DebugLoc MachineBasicBlock::rfindDebugLoc(reverse_instr_iterator MBBI) {
+ if (MBBI == instr_rend())
+ return findDebugLoc(instr_begin());
+ // Skip debug declarations, we don't want a DebugLoc from them.
+ MBBI = skipDebugInstructionsBackward(MBBI, instr_rbegin());
+ if (!MBBI->isDebugInstr())
+ return MBBI->getDebugLoc();
+ return {};
+}
+
+/// Find the previous valid DebugLoc preceding MBBI, skipping any debug
+/// instructions. Return UnknownLoc if there is none.
+DebugLoc MachineBasicBlock::findPrevDebugLoc(instr_iterator MBBI) {
+ if (MBBI == instr_begin())
+ return {};
+ // Skip debug instructions, we don't want a DebugLoc from them.
+ MBBI = prev_nodbg(MBBI, instr_begin());
+ if (!MBBI->isDebugInstr())
+ return MBBI->getDebugLoc();
+ return {};
+}
+
+DebugLoc MachineBasicBlock::rfindPrevDebugLoc(reverse_instr_iterator MBBI) {
+ if (MBBI == instr_rend())
+ return {};
+ // Skip debug declarations, we don't want a DebugLoc from them.
+ MBBI = next_nodbg(MBBI, instr_rend());
+ if (MBBI != instr_rend())
+ return MBBI->getDebugLoc();
+ return {};
+}
+
+/// Find and return the merged DebugLoc of the branch instructions of the block.
+/// Return UnknownLoc if there is none.
+DebugLoc
+MachineBasicBlock::findBranchDebugLoc() {
+ DebugLoc DL;
+ auto TI = getFirstTerminator();
+ while (TI != end() && !TI->isBranch())
+ ++TI;
+
+ if (TI != end()) {
+ DL = TI->getDebugLoc();
+ for (++TI ; TI != end() ; ++TI)
+ if (TI->isBranch())
+ DL = DILocation::getMergedLocation(DL, TI->getDebugLoc());
+ }
+ return DL;
+}
+
+/// Return probability of the edge from this block to MBB.
+BranchProbability
+MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
+ if (Probs.empty())
+ return BranchProbability(1, succ_size());
+
+ const auto &Prob = *getProbabilityIterator(Succ);
+ if (Prob.isUnknown()) {
+ // For unknown probabilities, collect the sum of all known ones, and evenly
+ // distribute the complement of the sum to each unknown probability.
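+ // For illustration (numbers are hypothetical): with four successors where
+ // two probabilities are known, 1/2 and 1/4, the known sum is 3/4, so each of
+ // the two unknown edges is reported as (1 - 3/4) / 2 = 1/8.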
+ unsigned KnownProbNum = 0;
+ auto Sum = BranchProbability::getZero();
+ for (const auto &P : Probs) {
+ if (!P.isUnknown()) {
+ Sum += P;
+ KnownProbNum++;
+ }
+ }
+ return Sum.getCompl() / (Probs.size() - KnownProbNum);
+ } else
+ return Prob;
+}
+
+/// Set successor probability of a given iterator.
+void MachineBasicBlock::setSuccProbability(succ_iterator I,
+ BranchProbability Prob) {
+ assert(!Prob.isUnknown());
+ if (Probs.empty())
+ return;
+ *getProbabilityIterator(I) = Prob;
+}
+
+/// Return probability iterator corresponding to the I successor iterator.
+MachineBasicBlock::const_probability_iterator
+MachineBasicBlock::getProbabilityIterator(
+ MachineBasicBlock::const_succ_iterator I) const {
+ assert(Probs.size() == Successors.size() && "Async probability list!");
+ const size_t index = std::distance(Successors.begin(), I);
+ assert(index < Probs.size() && "Not a current successor!");
+ return Probs.begin() + index;
+}
+
+/// Return probability iterator corresponding to the I successor iterator.
+MachineBasicBlock::probability_iterator
+MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
+ assert(Probs.size() == Successors.size() && "Async probability list!");
+ const size_t index = std::distance(Successors.begin(), I);
+ assert(index < Probs.size() && "Not a current successor!");
+ return Probs.begin() + index;
+}
+
+/// Return whether (physical) register "Reg" has been <def>ined and not <kill>ed
+/// as of just before "MI".
+///
+/// Search is localised to a neighborhood of Neighborhood instructions before
+/// (searching for defs or kills) and Neighborhood instructions after
+/// (searching just for defs) MI.
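+///
+/// A minimal usage sketch (the register and iterator names below are
+/// illustrative, not defined in this file):
+///   MachineBasicBlock::LivenessQueryResult LQR =
+///       MBB.computeRegisterLiveness(TRI, SomePhysReg, InsertPt, /*Neighborhood=*/10);
+///   if (LQR == MachineBasicBlock::LQR_Dead) {
+///     // SomePhysReg may be clobbered just before InsertPt.
+///   }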
+MachineBasicBlock::LivenessQueryResult
+MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
+ MCRegister Reg, const_iterator Before,
+ unsigned Neighborhood) const {
+ unsigned N = Neighborhood;
+
+ // Try searching forwards from Before, looking for reads or defs.
+ const_iterator I(Before);
+ for (; I != end() && N > 0; ++I) {
+ if (I->isDebugOrPseudoInstr())
+ continue;
+
+ --N;
+
+ PhysRegInfo Info = AnalyzePhysRegInBundle(*I, Reg, TRI);
+
+ // Register is live when we read it here.
+ if (Info.Read)
+ return LQR_Live;
+ // Register is dead if we can fully overwrite or clobber it here.
+ if (Info.FullyDefined || Info.Clobbered)
+ return LQR_Dead;
+ }
+
+ // If we reached the end, it is safe to clobber Reg at the end of the block
+ // if no successor has it live in.
+ if (I == end()) {
+ for (MachineBasicBlock *S : successors()) {
+ for (const MachineBasicBlock::RegisterMaskPair &LI : S->liveins()) {
+ if (TRI->regsOverlap(LI.PhysReg, Reg))
+ return LQR_Live;
+ }
+ }
+
+ return LQR_Dead;
+ }
+
+
+ N = Neighborhood;
+
+ // Now search backwards from Before, looking for kills, reads or defs.
+ I = const_iterator(Before);
+ // If this is the first insn in the block, don't search backwards.
+ if (I != begin()) {
+ do {
+ --I;
+
+ if (I->isDebugOrPseudoInstr())
+ continue;
+
+ --N;
+
+ PhysRegInfo Info = AnalyzePhysRegInBundle(*I, Reg, TRI);
+
+ // Defs happen after uses so they take precedence if both are present.
+
+ // Register is dead after a dead def of the full register.
+ if (Info.DeadDef)
+ return LQR_Dead;
+ // Register is (at least partially) live after a def.
+ if (Info.Defined) {
+ if (!Info.PartialDeadDef)
+ return LQR_Live;
+ // As soon as we see a partial definition (dead or not),
+ // we cannot tell if the value is partially live without
+ // tracking the lanemasks. We are not going to do this,
+ // so fall back on the rest of the analysis.
+ break;
+ }
+ // Register is dead after a full kill or clobber and no def.
+ if (Info.Killed || Info.Clobbered)
+ return LQR_Dead;
+ // Register must be live if we read it.
+ if (Info.Read)
+ return LQR_Live;
+
+ } while (I != begin() && N > 0);
+ }
+
+ // If all the instructions before this in the block are debug instructions,
+ // skip over them.
+ while (I != begin() && std::prev(I)->isDebugOrPseudoInstr())
+ --I;
+
+ // Did we get to the start of the block?
+ if (I == begin()) {
+ // If so, the register's state is definitely defined by the live-in state.
+ for (const MachineBasicBlock::RegisterMaskPair &LI : liveins())
+ if (TRI->regsOverlap(LI.PhysReg, Reg))
+ return LQR_Live;
+
+ return LQR_Dead;
+ }
+
+ // At this point we have no idea of the liveness of the register.
+ return LQR_Unknown;
+}
+
+const uint32_t *
+MachineBasicBlock::getBeginClobberMask(const TargetRegisterInfo *TRI) const {
+ // EH funclet entry does not preserve any registers.
+ return isEHFuncletEntry() ? TRI->getNoPreservedMask() : nullptr;
+}
+
+const uint32_t *
+MachineBasicBlock::getEndClobberMask(const TargetRegisterInfo *TRI) const {
+ // If we see a return block with successors, this must be a funclet return,
+ // which does not preserve any registers. If there are no successors, we don't
+ // care what kind of return it is; putting a mask after it is a no-op.
+ return isReturnBlock() && !succ_empty() ? TRI->getNoPreservedMask() : nullptr;
+}
+
+void MachineBasicBlock::clearLiveIns() {
+ LiveIns.clear();
+}
+
+MachineBasicBlock::livein_iterator MachineBasicBlock::livein_begin() const {
+ assert(getParent()->getProperties().hasProperty(
+ MachineFunctionProperties::Property::TracksLiveness) &&
+ "Liveness information is accurate");
+ return LiveIns.begin();
+}
+
+MachineBasicBlock::liveout_iterator MachineBasicBlock::liveout_begin() const {
+ const MachineFunction &MF = *getParent();
+ assert(MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::TracksLiveness) &&
+ "Liveness information is accurate");
+
+ const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+ MCPhysReg ExceptionPointer = 0, ExceptionSelector = 0;
+ if (MF.getFunction().hasPersonalityFn()) {
+ auto PersonalityFn = MF.getFunction().getPersonalityFn();
+ ExceptionPointer = TLI.getExceptionPointerRegister(PersonalityFn);
+ ExceptionSelector = TLI.getExceptionSelectorRegister(PersonalityFn);
+ }
+
+ return liveout_iterator(*this, ExceptionPointer, ExceptionSelector, false);
+}
+
+bool MachineBasicBlock::sizeWithoutDebugLargerThan(unsigned Limit) const {
+ unsigned Cntr = 0;
+ auto R = instructionsWithoutDebug(begin(), end());
+ for (auto I = R.begin(), E = R.end(); I != E; ++I) {
+ if (++Cntr > Limit)
+ return true;
+ }
+ return false;
+}
+
+unsigned MachineBasicBlock::getBBIDOrNumber() const {
+ uint8_t BBAddrMapVersion = getParent()->getContext().getBBAddrMapVersion();
+ return BBAddrMapVersion < 2 ? getNumber() : *getBBID();
+}
+
+const MBBSectionID MBBSectionID::ColdSectionID(MBBSectionID::SectionType::Cold);
+const MBBSectionID
+ MBBSectionID::ExceptionSectionID(MBBSectionID::SectionType::Exception);
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
new file mode 100644
index 000000000000..b1cbe525d7e6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -0,0 +1,291 @@
+//===- MachineBlockFrequencyInfo.cpp - MBB Frequency Analysis -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/GraphWriter.h"
+#include <optional>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-block-freq"
+
+namespace llvm {
+static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
+ "view-machine-block-freq-propagation-dags", cl::Hidden,
+ cl::desc("Pop up a window to show a dag displaying how machine block "
+ "frequencies propagate through the CFG."),
+ cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."),
+ clEnumValN(GVDT_Fraction, "fraction",
+ "display a graph using the "
+ "fractional block frequency representation."),
+ clEnumValN(GVDT_Integer, "integer",
+ "display a graph using the raw "
+ "integer fractional block frequency representation."),
+ clEnumValN(GVDT_Count, "count", "display a graph using the real "
+ "profile count if available.")));
+
+// Similar to the option above, but used to control BFI display only after the MBP pass.
+cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
+ "view-block-layout-with-bfi", cl::Hidden,
+ cl::desc(
+ "Pop up a window to show a dag displaying MBP layout and associated "
+ "block frequencies of the CFG."),
+ cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."),
+ clEnumValN(GVDT_Fraction, "fraction",
+ "display a graph using the "
+ "fractional block frequency representation."),
+ clEnumValN(GVDT_Integer, "integer",
+ "display a graph using the raw "
+ "integer fractional block frequency representation."),
+ clEnumValN(GVDT_Count, "count",
+ "display a graph using the real "
+ "profile count if available.")));
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
+// Command line option to specify hot frequency threshold.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc=
+extern cl::opt<unsigned> ViewHotFreqPercent;
+
+static cl::opt<bool> PrintMachineBlockFreq(
+ "print-machine-bfi", cl::init(false), cl::Hidden,
+ cl::desc("Print the machine block frequency info."));
+
+// Command line option to specify the name of the function for block frequency
+// dump. Defined in Analysis/BlockFrequencyInfo.cpp.
+extern cl::opt<std::string> PrintBlockFreqFuncName;
+} // namespace llvm
+
+static GVDAGType getGVDT() {
+ if (ViewBlockLayoutWithBFI != GVDT_None)
+ return ViewBlockLayoutWithBFI;
+
+ return ViewMachineBlockFreqPropagationDAG;
+}
+
+namespace llvm {
+
+template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
+ using NodeRef = const MachineBasicBlock *;
+ using ChildIteratorType = MachineBasicBlock::const_succ_iterator;
+ using nodes_iterator = pointer_iterator<MachineFunction::const_iterator>;
+
+ static NodeRef getEntryNode(const MachineBlockFrequencyInfo *G) {
+ return &G->getFunction()->front();
+ }
+
+ static ChildIteratorType child_begin(const NodeRef N) {
+ return N->succ_begin();
+ }
+
+ static ChildIteratorType child_end(const NodeRef N) { return N->succ_end(); }
+
+ static nodes_iterator nodes_begin(const MachineBlockFrequencyInfo *G) {
+ return nodes_iterator(G->getFunction()->begin());
+ }
+
+ static nodes_iterator nodes_end(const MachineBlockFrequencyInfo *G) {
+ return nodes_iterator(G->getFunction()->end());
+ }
+};
+
+using MBFIDOTGraphTraitsBase =
+ BFIDOTGraphTraitsBase<MachineBlockFrequencyInfo,
+ MachineBranchProbabilityInfo>;
+
+template <>
+struct DOTGraphTraits<MachineBlockFrequencyInfo *>
+ : public MBFIDOTGraphTraitsBase {
+ const MachineFunction *CurFunc = nullptr;
+ DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
+
+ explicit DOTGraphTraits(bool isSimple = false)
+ : MBFIDOTGraphTraitsBase(isSimple) {}
+
+ std::string getNodeLabel(const MachineBasicBlock *Node,
+ const MachineBlockFrequencyInfo *Graph) {
+ int layout_order = -1;
+ // Attach additional ordering information if 'isSimple' is false.
+ if (!isSimple()) {
+ const MachineFunction *F = Node->getParent();
+ if (!CurFunc || F != CurFunc) {
+ if (CurFunc)
+ LayoutOrderMap.clear();
+
+ CurFunc = F;
+ int O = 0;
+ for (auto MBI = F->begin(); MBI != F->end(); ++MBI, ++O) {
+ LayoutOrderMap[&*MBI] = O;
+ }
+ }
+ layout_order = LayoutOrderMap[Node];
+ }
+ return MBFIDOTGraphTraitsBase::getNodeLabel(Node, Graph, getGVDT(),
+ layout_order);
+ }
+
+ std::string getNodeAttributes(const MachineBasicBlock *Node,
+ const MachineBlockFrequencyInfo *Graph) {
+ return MBFIDOTGraphTraitsBase::getNodeAttributes(Node, Graph,
+ ViewHotFreqPercent);
+ }
+
+ std::string getEdgeAttributes(const MachineBasicBlock *Node, EdgeIter EI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ return MBFIDOTGraphTraitsBase::getEdgeAttributes(
+ Node, EI, MBFI, MBFI->getMBPI(), ViewHotFreqPercent);
+ }
+};
+
+} // end namespace llvm
+
+INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, DEBUG_TYPE,
+ "Machine Block Frequency Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(MachineBlockFrequencyInfo, DEBUG_TYPE,
+ "Machine Block Frequency Analysis", true, true)
+
+char MachineBlockFrequencyInfo::ID = 0;
+
+MachineBlockFrequencyInfo::MachineBlockFrequencyInfo()
+ : MachineFunctionPass(ID) {
+ initializeMachineBlockFrequencyInfoPass(*PassRegistry::getPassRegistry());
+}
+
+MachineBlockFrequencyInfo::MachineBlockFrequencyInfo(
+ MachineFunction &F,
+ MachineBranchProbabilityInfo &MBPI,
+ MachineLoopInfo &MLI) : MachineFunctionPass(ID) {
+ calculate(F, MBPI, MLI);
+}
+
+MachineBlockFrequencyInfo::~MachineBlockFrequencyInfo() = default;
+
+void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void MachineBlockFrequencyInfo::calculate(
+ const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI,
+ const MachineLoopInfo &MLI) {
+ if (!MBFI)
+ MBFI.reset(new ImplType);
+ MBFI->calculate(F, MBPI, MLI);
+ if (ViewMachineBlockFreqPropagationDAG != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ view("MachineBlockFrequencyDAGS." + F.getName());
+ }
+ if (PrintMachineBlockFreq &&
+ (PrintBlockFreqFuncName.empty() ||
+ F.getName().equals(PrintBlockFreqFuncName))) {
+ MBFI->print(dbgs());
+ }
+}
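+
+// Illustrative invocation (the function and file names are hypothetical):
+//   llc -view-machine-block-freq-propagation-dags=fraction -view-bfi-func-name=foo foo.ll
+// pops up the DAG view for 'foo'; -print-machine-bfi dumps the textual form.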
+
+bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
+ MachineBranchProbabilityInfo &MBPI =
+ getAnalysis<MachineBranchProbabilityInfo>();
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ calculate(F, MBPI, MLI);
+ return false;
+}
+
+void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); }
+
+/// Pop up a ghostview window with the current block frequency propagation
+/// rendered using dot.
+void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const {
+ // This code is only for debugging.
+ ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), Name, isSimple);
+}
+
+BlockFrequency
+MachineBlockFrequencyInfo::getBlockFreq(const MachineBasicBlock *MBB) const {
+ return MBFI ? MBFI->getBlockFreq(MBB) : 0;
+}
+
+std::optional<uint64_t> MachineBlockFrequencyInfo::getBlockProfileCount(
+ const MachineBasicBlock *MBB) const {
+ if (!MBFI)
+ return std::nullopt;
+
+ const Function &F = MBFI->getFunction()->getFunction();
+ return MBFI->getBlockProfileCount(F, MBB);
+}
+
+std::optional<uint64_t>
+MachineBlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const {
+ if (!MBFI)
+ return std::nullopt;
+
+ const Function &F = MBFI->getFunction()->getFunction();
+ return MBFI->getProfileCountFromFreq(F, Freq);
+}
+
+bool MachineBlockFrequencyInfo::isIrrLoopHeader(
+ const MachineBasicBlock *MBB) const {
+ assert(MBFI && "Expected analysis to be available");
+ return MBFI->isIrrLoopHeader(MBB);
+}
+
+void MachineBlockFrequencyInfo::onEdgeSplit(
+ const MachineBasicBlock &NewPredecessor,
+ const MachineBasicBlock &NewSuccessor,
+ const MachineBranchProbabilityInfo &MBPI) {
+ assert(MBFI && "Expected analysis to be available");
+ auto NewSuccFreq = MBFI->getBlockFreq(&NewPredecessor) *
+ MBPI.getEdgeProbability(&NewPredecessor, &NewSuccessor);
+
+ MBFI->setBlockFreq(&NewSuccessor, NewSuccFreq.getFrequency());
+}
+
+const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
+ return MBFI ? MBFI->getFunction() : nullptr;
+}
+
+const MachineBranchProbabilityInfo *MachineBlockFrequencyInfo::getMBPI() const {
+ return MBFI ? &MBFI->getBPI() : nullptr;
+}
+
+raw_ostream &
+MachineBlockFrequencyInfo::printBlockFreq(raw_ostream &OS,
+ const BlockFrequency Freq) const {
+ return MBFI ? MBFI->printBlockFreq(OS, Freq) : OS;
+}
+
+raw_ostream &
+MachineBlockFrequencyInfo::printBlockFreq(raw_ostream &OS,
+ const MachineBasicBlock *MBB) const {
+ return MBFI ? MBFI->printBlockFreq(OS, MBB) : OS;
+}
+
+uint64_t MachineBlockFrequencyInfo::getEntryFreq() const {
+ return MBFI ? MBFI->getEntryFreq() : 0;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineBlockPlacement.cpp
new file mode 100644
index 000000000000..912e9ec993e3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -0,0 +1,3701 @@
+//===- MachineBlockPlacement.cpp - Basic Block Code Layout optimization ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements basic block placement transformations using the CFG
+// structure and branch probability estimates.
+//
+// The pass strives to preserve the structure of the CFG (that is, retain
+// a topological ordering of basic blocks) in the absence of a *strong* signal
+// to the contrary from probabilities. However, within the CFG structure, it
+// attempts to choose an ordering which favors placing more likely sequences of
+// blocks adjacent to each other.
+//
+// The algorithm works from the inner-most loop within a function outward, and
+// at each stage walks through the basic blocks, trying to coalesce them into
+// sequential chains where allowed by the CFG (or demanded by heavy
+// probabilities). Finally, it walks the blocks in topological order, and the
+// first time it reaches a chain of basic blocks, it schedules them in the
+// function in-order.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BranchFolding.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PrintPasses.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "block-placement"
+
+STATISTIC(NumCondBranches, "Number of conditional branches");
+STATISTIC(NumUncondBranches, "Number of unconditional branches");
+STATISTIC(CondBranchTakenFreq,
+ "Potential frequency of taking conditional branches");
+STATISTIC(UncondBranchTakenFreq,
+ "Potential frequency of taking unconditional branches");
+
+static cl::opt<unsigned> AlignAllBlock(
+ "align-all-blocks",
+ cl::desc("Force the alignment of all blocks in the function in log2 format "
+ "(e.g 4 means align on 16B boundaries)."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> AlignAllNonFallThruBlocks(
+ "align-all-nofallthru-blocks",
+ cl::desc("Force the alignment of all blocks that have no fall-through "
+ "predecessors (i.e. don't add nops that are executed). In log2 "
+ "format (e.g 4 means align on 16B boundaries)."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> MaxBytesForAlignmentOverride(
+ "max-bytes-for-alignment",
+ cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
+ "alignment"),
+ cl::init(0), cl::Hidden);
+
+// FIXME: Find a good default for this flag and remove the flag.
+static cl::opt<unsigned> ExitBlockBias(
+ "block-placement-exit-block-bias",
+ cl::desc("Block frequency percentage a loop exit block needs "
+ "over the original exit to be considered the new exit."),
+ cl::init(0), cl::Hidden);
+
+// Definition:
+// - Outlining: placement of a basic block outside the chain or hot path.
+
+static cl::opt<unsigned> LoopToColdBlockRatio(
+ "loop-to-cold-block-ratio",
+ cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
+ "(frequency of block) is greater than this ratio"),
+ cl::init(5), cl::Hidden);
+
+static cl::opt<bool> ForceLoopColdBlock(
+ "force-loop-cold-block",
+ cl::desc("Force outlining cold blocks from loops."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+ PreciseRotationCost("precise-rotation-cost",
+ cl::desc("Model the cost of loop rotation more "
+ "precisely by using profile data."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+ ForcePreciseRotationCost("force-precise-rotation-cost",
+ cl::desc("Force the use of precise cost "
+ "loop rotation strategy."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> MisfetchCost(
+ "misfetch-cost",
+ cl::desc("Cost that models the probabilistic risk of an instruction "
+ "misfetch due to a jump comparing to falling through, whose cost "
+ "is zero."),
+ cl::init(1), cl::Hidden);
+
+static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
+ cl::desc("Cost of jump instructions."),
+ cl::init(1), cl::Hidden);
+static cl::opt<bool>
+TailDupPlacement("tail-dup-placement",
+ cl::desc("Perform tail duplication during placement. "
+ "Creates more fallthrough opportunites in "
+ "outline branches."),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+BranchFoldPlacement("branch-fold-placement",
+ cl::desc("Perform branch folding during placement. "
+ "Reduces code size."),
+ cl::init(true), cl::Hidden);
+
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDupPlacementThreshold(
+ "tail-dup-placement-threshold",
+ cl::desc("Instruction cutoff for tail duplication during layout. "
+ "Tail merging during layout is forced to have a threshold "
+ "that won't conflict."), cl::init(2),
+ cl::Hidden);
+
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+ "tail-dup-placement-aggressive-threshold",
+ cl::desc("Instruction cutoff for aggressive tail duplication during "
+ "layout. Used at -O3. Tail merging during layout is forced to "
+ "have a threshold that won't conflict."), cl::init(4),
+ cl::Hidden);
+
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDupPlacementPenalty(
+ "tail-dup-placement-penalty",
+ cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
+ "Copying can increase fallthrough, but it also increases icache "
+ "pressure. This parameter controls the penalty to account for that. "
+ "Percent as integer."),
+ cl::init(2),
+ cl::Hidden);
+
+// Heuristic for tail duplication if profile count is used in cost model.
+static cl::opt<unsigned> TailDupProfilePercentThreshold(
+ "tail-dup-profile-percent-threshold",
+ cl::desc("If profile count information is used in tail duplication cost "
+ "model, the gained fall through number from tail duplication "
+ "should be at least this percent of hot count."),
+ cl::init(50), cl::Hidden);
+
+// Heuristic for triangle chains.
+static cl::opt<unsigned> TriangleChainCount(
+ "triangle-chain-count",
+ cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
+ "triangle tail duplication heuristic to kick in. 0 to disable."),
+ cl::init(2),
+ cl::Hidden);
+
+// Use case: When block layout is visualized after MBP pass, the basic blocks
+// are labeled in layout order; meanwhile blocks could be numbered in a
+// different order. It's hard to map between the graph and pass output.
+// With this option on, the basic blocks are renumbered in function layout
+// order. For debugging only.
+static cl::opt<bool> RenumberBlocksBeforeView(
+ "renumber-blocks-before-view",
+ cl::desc(
+ "If true, basic blocks are re-numbered before MBP layout is printed "
+ "into a dot graph. Only used when a function is being printed."),
+ cl::init(false), cl::Hidden);
+
+namespace llvm {
+extern cl::opt<bool> EnableExtTspBlockPlacement;
+extern cl::opt<bool> ApplyExtTspWithoutProfile;
+extern cl::opt<unsigned> StaticLikelyProb;
+extern cl::opt<unsigned> ProfileLikelyProb;
+
+// Internal option used to control BFI display only after MBP pass.
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi=
+extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+} // namespace llvm
+
+namespace {
+
+class BlockChain;
+
+/// Type for our function-wide basic block -> block chain mapping.
+using BlockToChainMapType = DenseMap<const MachineBasicBlock *, BlockChain *>;
+
+/// A chain of blocks which will be laid out contiguously.
+///
+/// This is the datastructure representing a chain of consecutive blocks that
+/// are profitable to layout together in order to maximize fallthrough
+/// probabilities and code locality. We also can use a block chain to represent
+/// a sequence of basic blocks which have some external (correctness)
+/// requirement for sequential layout.
+///
+/// Chains can be built around a single basic block and can be merged to grow
+/// them. They participate in a block-to-chain mapping, which is updated
+/// automatically as chains are merged together.
+class BlockChain {
+ /// The sequence of blocks belonging to this chain.
+ ///
+ /// This is the sequence of blocks for a particular chain. These will be laid
+ /// out in-order within the function.
+ SmallVector<MachineBasicBlock *, 4> Blocks;
+
+ /// A handle to the function-wide basic block to block chain mapping.
+ ///
+ /// This is retained in each block chain to simplify the computation of child
+ /// block chains for SCC-formation and iteration. We store the edges to child
+ /// basic blocks, and map them back to their associated chains using this
+ /// structure.
+ BlockToChainMapType &BlockToChain;
+
+public:
+ /// Construct a new BlockChain.
+ ///
+ /// This builds a new block chain representing a single basic block in the
+ /// function. It also registers itself as the chain that block participates
+ /// in with the BlockToChain mapping.
+ BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
+ : Blocks(1, BB), BlockToChain(BlockToChain) {
+ assert(BB && "Cannot create a chain with a null basic block");
+ BlockToChain[BB] = this;
+ }
+
+ /// Iterator over blocks within the chain.
+ using iterator = SmallVectorImpl<MachineBasicBlock *>::iterator;
+ using const_iterator = SmallVectorImpl<MachineBasicBlock *>::const_iterator;
+
+ /// Beginning of blocks within the chain.
+ iterator begin() { return Blocks.begin(); }
+ const_iterator begin() const { return Blocks.begin(); }
+
+ /// End of blocks within the chain.
+ iterator end() { return Blocks.end(); }
+ const_iterator end() const { return Blocks.end(); }
+
+ bool remove(MachineBasicBlock* BB) {
+ for(iterator i = begin(); i != end(); ++i) {
+ if (*i == BB) {
+ Blocks.erase(i);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /// Merge a block chain into this one.
+ ///
+ /// This routine merges a block chain into this one. It takes care of forming
+ /// a contiguous sequence of basic blocks, updating the edge list, and
+ /// updating the block -> chain mapping. It does not free or tear down the
+ /// old chain, but the old chain's block list is no longer valid.
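+ ///
+ /// For example (schematic): merging a chain [B, C] into a chain [A] yields
+ /// [A, B, C], and BlockToChain[B] and BlockToChain[C] are repointed at the
+ /// surviving chain; the old chain object is left behind but must no longer
+ /// be used.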
+ void merge(MachineBasicBlock *BB, BlockChain *Chain) {
+ assert(BB && "Can't merge a null block.");
+ assert(!Blocks.empty() && "Can't merge into an empty chain.");
+
+ // Fast path in case we don't have a chain already.
+ if (!Chain) {
+ assert(!BlockToChain[BB] &&
+ "Passed chain is null, but BB has entry in BlockToChain.");
+ Blocks.push_back(BB);
+ BlockToChain[BB] = this;
+ return;
+ }
+
+ assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
+ assert(Chain->begin() != Chain->end());
+
+ // Update the incoming blocks to point to this chain, and add them to the
+ // chain structure.
+ for (MachineBasicBlock *ChainBB : *Chain) {
+ Blocks.push_back(ChainBB);
+ assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
+ BlockToChain[ChainBB] = this;
+ }
+ }
+
+#ifndef NDEBUG
+ /// Dump the blocks in this chain.
+ LLVM_DUMP_METHOD void dump() {
+ for (MachineBasicBlock *MBB : *this)
+ MBB->dump();
+ }
+#endif // NDEBUG
+
+ /// Count of predecessors of any block within the chain which have not
+ /// yet been scheduled. In general, we will delay scheduling this chain
+ /// until those predecessors are scheduled (or we find a sufficiently good
+ /// reason to override this heuristic.) Note that when forming loop chains,
+ /// blocks outside the loop are ignored and treated as if they were already
+ /// scheduled.
+ ///
+ /// Note: This field is reinitialized multiple times - once for each loop,
+ /// and then once for the function as a whole.
+ unsigned UnscheduledPredecessors = 0;
+};
+
+class MachineBlockPlacement : public MachineFunctionPass {
+ /// A type for a block filter set.
+ using BlockFilterSet = SmallSetVector<const MachineBasicBlock *, 16>;
+
+ /// Pair struct containing basic block and taildup profitability
+ struct BlockAndTailDupResult {
+ MachineBasicBlock *BB = nullptr;
+ bool ShouldTailDup;
+ };
+
+ /// Triple struct containing edge weight and the edge.
+ struct WeightedEdge {
+ BlockFrequency Weight;
+ MachineBasicBlock *Src = nullptr;
+ MachineBasicBlock *Dest = nullptr;
+ };
+
+ /// Work lists of blocks that are ready to be laid out.
+ SmallVector<MachineBasicBlock *, 16> BlockWorkList;
+ SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
+
+ /// Edges that have already been computed as optimal.
+ DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
+
+ /// Machine Function
+ MachineFunction *F = nullptr;
+
+ /// A handle to the branch probability pass.
+ const MachineBranchProbabilityInfo *MBPI = nullptr;
+
+ /// A handle to the function-wide block frequency pass.
+ std::unique_ptr<MBFIWrapper> MBFI;
+
+ /// A handle to the loop info.
+ MachineLoopInfo *MLI = nullptr;
+
+ /// Preferred loop exit.
+ /// Member variable for convenience. It may be removed by duplication deep
+ /// in the call stack.
+ MachineBasicBlock *PreferredLoopExit = nullptr;
+
+ /// A handle to the target's instruction info.
+ const TargetInstrInfo *TII = nullptr;
+
+ /// A handle to the target's lowering info.
+ const TargetLoweringBase *TLI = nullptr;
+
+ /// A handle to the post dominator tree.
+ MachinePostDominatorTree *MPDT = nullptr;
+
+ ProfileSummaryInfo *PSI = nullptr;
+
+ /// Duplicator used to duplicate tails during placement.
+ ///
+ /// Placement decisions can open up new tail duplication opportunities, but
+ /// since tail duplication affects placement decisions of later blocks, it
+ /// must be done inline.
+ TailDuplicator TailDup;
+
+ /// Partial tail duplication threshold.
+ BlockFrequency DupThreshold;
+
+ /// True: use block profile count to compute tail duplication cost.
+ /// False: use block frequency to compute tail duplication cost.
+ bool UseProfileCount = false;
+
+ /// Allocator and owner of BlockChain structures.
+ ///
+ /// We build BlockChains lazily while processing the loop structure of
+ /// a function. To reduce malloc traffic, we allocate them using this
+ /// slab-like allocator, and destroy them after the pass completes. An
+ /// important guarantee is that this allocator produces stable pointers to
+ /// the chains.
+ SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
+
+ /// Function wide BasicBlock to BlockChain mapping.
+ ///
+ /// This mapping allows efficiently moving from any given basic block to the
+ /// BlockChain it participates in, if any. We use it to, among other things,
+ /// allow implicitly defining edges between chains as the existing edges
+ /// between basic blocks.
+ DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
+
+#ifndef NDEBUG
+ /// The set of basic blocks that have terminators that cannot be fully
+ /// analyzed. These basic blocks cannot be re-ordered safely by
+ /// MachineBlockPlacement, and we must preserve physical layout of these
+ /// blocks and their successors through the pass.
+ SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
+#endif
+
+ /// Get block profile count or frequency according to UseProfileCount.
+ /// The return value is used to model tail duplication cost.
+ BlockFrequency getBlockCountOrFrequency(const MachineBasicBlock *BB) {
+ if (UseProfileCount) {
+ auto Count = MBFI->getBlockProfileCount(BB);
+ if (Count)
+ return *Count;
+ else
+ return 0;
+ } else
+ return MBFI->getBlockFreq(BB);
+ }
+
+ /// Scale the DupThreshold according to basic block size.
+ BlockFrequency scaleThreshold(MachineBasicBlock *BB);
+ void initDupThreshold();
+
+ /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
+ /// if the count goes to 0, add them to the appropriate work list.
+ void markChainSuccessors(
+ const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter = nullptr);
+
+ /// Decrease the UnscheduledPredecessors count for a single block, and
+ /// if the count goes to 0, add them to the appropriate work list.
+ void markBlockSuccessors(
+ const BlockChain &Chain, const MachineBasicBlock *BB,
+ const MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter = nullptr);
+
+ BranchProbability
+ collectViableSuccessors(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors);
+ bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
+ BlockFilterSet *BlockFilter);
+ void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
+ MachineBasicBlock *BB,
+ BlockFilterSet *BlockFilter);
+ bool repeatedlyTailDuplicateBlock(
+ MachineBasicBlock *BB, MachineBasicBlock *&LPred,
+ const MachineBasicBlock *LoopHeaderBB,
+ BlockChain &Chain, BlockFilterSet *BlockFilter,
+ MachineFunction::iterator &PrevUnplacedBlockIt);
+ bool maybeTailDuplicateBlock(
+ MachineBasicBlock *BB, MachineBasicBlock *LPred,
+ BlockChain &Chain, BlockFilterSet *BlockFilter,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ bool &DuplicatedToLPred);
+ bool hasBetterLayoutPredecessor(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ const BlockChain &SuccChain, BranchProbability SuccProb,
+ BranchProbability RealSuccProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ BlockAndTailDupResult selectBestSuccessor(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ MachineBasicBlock *selectBestCandidateBlock(
+ const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
+ MachineBasicBlock *getFirstUnplacedBlock(
+ const BlockChain &PlacedChain,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ const BlockFilterSet *BlockFilter);
+
+ /// Add a basic block to the work list if it is appropriate.
+ ///
+ /// If the optional parameter BlockFilter is provided, only MBB
+ /// present in the set will be added to the worklist. If nullptr
+ /// is provided, no filtering occurs.
+ void fillWorkLists(const MachineBasicBlock *MBB,
+ SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
+ const BlockFilterSet *BlockFilter);
+
+ void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
+ BlockFilterSet *BlockFilter = nullptr);
+ bool canMoveBottomBlockToTop(const MachineBasicBlock *BottomBlock,
+ const MachineBasicBlock *OldTop);
+ bool hasViableTopFallthrough(const MachineBasicBlock *Top,
+ const BlockFilterSet &LoopBlockSet);
+ BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
+ const BlockFilterSet &LoopBlockSet);
+ BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
+ const MachineBasicBlock *OldTop,
+ const MachineBasicBlock *ExitBB,
+ const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
+ const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopTop(
+ const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopExit(
+ const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
+ BlockFrequency &ExitFreq);
+ BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
+ void buildLoopChains(const MachineLoop &L);
+ void rotateLoop(
+ BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
+ BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
+ void rotateLoopWithProfile(
+ BlockChain &LoopChain, const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
+ void buildCFGChains();
+ void optimizeBranches();
+ void alignBlocks();
+ /// Returns true if a block should be tail-duplicated to increase fallthrough
+ /// opportunities.
+ bool shouldTailDuplicate(MachineBasicBlock *BB);
+ /// Check the edge frequencies to see if tail duplication will increase
+ /// fallthroughs.
+ bool isProfitableToTailDup(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ BranchProbability QProb,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+
+ /// Check for a trellis layout.
+ bool isTrellis(const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+
+ /// Get the best successor given a trellis layout.
+ BlockAndTailDupResult getBestTrellisSuccessor(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ BranchProbability AdjustedSumProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+
+ /// Get the best pair of non-conflicting edges.
+ static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
+ const MachineBasicBlock *BB,
+ MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges);
+
+ /// Returns true if a block can tail duplicate into all unplaced
+ /// predecessors. Filters based on loop.
+ bool canTailDuplicateUnplacedPreds(
+ const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+
+ /// Find chains of triangles to tail-duplicate where a global analysis works,
+ /// but a local analysis would not find them.
+ void precomputeTriangleChains();
+
+ /// Apply a post-processing step optimizing block placement.
+ void applyExtTsp();
+
+ /// Modify the existing block placement in the function and adjust all jumps.
+ void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder);
+
+ /// Create a single CFG chain from the current block order.
+ void createCFGChainExtTsp();
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ MachineBlockPlacement() : MachineFunctionPass(ID) {
+ initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ bool allowTailDupPlacement() const {
+ assert(F);
+ return TailDupPlacement && !F->getTarget().requiresStructuredCFG();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ if (TailDupPlacement)
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char MachineBlockPlacement::ID = 0;
+
+char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
+
+INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
+ "Branch Probability Basic Block Placement", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
+ "Branch Probability Basic Block Placement", false, false)
+
+#ifndef NDEBUG
+/// Helper to print the name of a MBB.
+///
+/// Only used by debug logging.
+static std::string getBlockName(const MachineBasicBlock *BB) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+ OS << printMBBReference(*BB);
+ OS << " ('" << BB->getName() << "')";
+ OS.flush();
+ return Result;
+}
+#endif
+
+/// Mark a chain's successors as having one fewer preds.
+///
+/// When a chain is being merged into the "placed" chain, this routine will
+/// quickly walk the successors of each block in the chain and mark them as
+/// having one fewer active predecessor. It also adds any successors of this
+/// chain which reach the zero-predecessor state to the appropriate worklist.
+void MachineBlockPlacement::markChainSuccessors(
+ const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter) {
+ // Walk all the blocks in this chain, marking their successors as having
+ // a predecessor placed.
+ for (MachineBasicBlock *MBB : Chain) {
+ markBlockSuccessors(Chain, MBB, LoopHeaderBB, BlockFilter);
+ }
+}
+
+/// Mark a single block's successors as having one fewer preds.
+///
+/// Under normal circumstances, this is only called by markChainSuccessors,
+/// but if a block that was to be placed is completely tail-duplicated away,
+/// and was duplicated into the chain end, we need to redo markBlockSuccessors
+/// for just that block.
+void MachineBlockPlacement::markBlockSuccessors(
+ const BlockChain &Chain, const MachineBasicBlock *MBB,
+ const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
+ // Add any successors for which this is the only un-placed in-loop
+ // predecessor to the worklist as a viable candidate for CFG-neutral
+ // placement. No subsequent placement of this block will violate the CFG
+ // shape, so we get to use heuristics to choose a favorable placement.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (BlockFilter && !BlockFilter->count(Succ))
+ continue;
+ BlockChain &SuccChain = *BlockToChain[Succ];
+ // Disregard edges within a fixed chain, or edges to the loop header.
+ if (&Chain == &SuccChain || Succ == LoopHeaderBB)
+ continue;
+
+ // This is a cross-chain edge that is within the loop, so decrement the
+ // loop predecessor count of the destination chain.
+ if (SuccChain.UnscheduledPredecessors == 0 ||
+ --SuccChain.UnscheduledPredecessors > 0)
+ continue;
+
+ auto *NewBB = *SuccChain.begin();
+ if (NewBB->isEHPad())
+ EHPadWorkList.push_back(NewBB);
+ else
+ BlockWorkList.push_back(NewBB);
+ }
+}
+
+/// This helper function collects the set of successors of block
+/// \p BB that are allowed to be its layout successors, and return
+/// the total branch probability of edges from \p BB to those
+/// blocks.
+BranchProbability MachineBlockPlacement::collectViableSuccessors(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors) {
+ // Adjust edge probabilities by excluding edges pointing to blocks that are
+ // either not in BlockFilter or are already in the current chain. Consider the
+ // following CFG:
+ //
+ // --->A
+ // | / \
+ // | B C
+ // | \ / \
+ // ----D E
+ //
+ // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
+ // A->C is chosen as a fall-through, D won't be selected as a successor of C
+ // due to CFG constraint (the probability of C->D is not greater than
+ // HotProb to break topo-order). If we exclude E that is not in BlockFilter
+ // when calculating the probability of C->D, D will be selected and we
+ // will get A C D B as the layout of this loop.
+ auto AdjustedSumProb = BranchProbability::getOne();
+ for (MachineBasicBlock *Succ : BB->successors()) {
+ bool SkipSucc = false;
+ if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
+ SkipSucc = true;
+ } else {
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (SuccChain == &Chain) {
+ SkipSucc = true;
+ } else if (Succ != *SuccChain->begin()) {
+ LLVM_DEBUG(dbgs() << " " << getBlockName(Succ)
+ << " -> Mid chain!\n");
+ continue;
+ }
+ }
+ if (SkipSucc)
+ AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
+ else
+ Successors.push_back(Succ);
+ }
+
+ return AdjustedSumProb;
+}
+
+/// The helper function returns the branch probability that is adjusted
+/// or normalized over the new total \p AdjustedSumProb.
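+/// For example (illustrative numbers): an original edge probability of 3/10
+/// renormalized over an adjusted sum of 6/10 becomes 3/6 = 1/2.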
+static BranchProbability
+getAdjustedProbability(BranchProbability OrigProb,
+ BranchProbability AdjustedSumProb) {
+ BranchProbability SuccProb;
+ uint32_t SuccProbN = OrigProb.getNumerator();
+ uint32_t SuccProbD = AdjustedSumProb.getNumerator();
+ if (SuccProbN >= SuccProbD)
+ SuccProb = BranchProbability::getOne();
+ else
+ SuccProb = BranchProbability(SuccProbN, SuccProbD);
+
+ return SuccProb;
+}
+
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool
+hasSameSuccessors(MachineBasicBlock &BB,
+ SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
+ if (BB.succ_size() != Successors.size())
+ return false;
+ // We don't want to count self-loops
+ if (Successors.count(&BB))
+ return false;
+ for (MachineBasicBlock *Succ : BB.successors())
+ if (!Successors.count(Succ))
+ return false;
+ return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
+/// \p BB Block to check.
+bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
+ // Blocks with single successors don't create additional fallthrough
+ // opportunities. Don't duplicate them. TODO: When conditional exits are
+ // analyzable, allow them to be duplicated.
+ bool IsSimple = TailDup.isSimpleBB(BB);
+
+ if (BB->succ_size() == 1)
+ return false;
+ return TailDup.shouldTailDuplicate(IsSimple, *BB);
+}
+
+/// Compare 2 BlockFrequency's with a small penalty for \p A.
+/// In order to be conservative, we apply an X% penalty to account for
+/// increased icache pressure and static heuristics. For small frequencies
+/// we use only the numerators to improve accuracy. For simplicity, we assume
+/// the penalty is less than 100%.
+/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
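+/// As a made-up illustration with the default 2% penalty: the gain A - B must
+/// satisfy (A - B) / (2/100) >= EntryFreq, i.e. A has to beat B by at least
+/// 2% of the entry frequency.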
+static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
+ uint64_t EntryFreq) {
+ BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+ BlockFrequency Gain = A - B;
+ return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
+}
+
+/// Check the edge frequencies to see if tail duplication will increase
+/// fallthroughs. It only makes sense to call this function when
+/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
+/// always locally profitable if we would have picked \p Succ without
+/// considering duplication.
+bool MachineBlockPlacement::isProfitableToTailDup(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ BranchProbability QProb,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ // We need to do a probability calculation to make sure this is profitable.
+ // First: does succ have a successor that post-dominates? This affects the
+ // calculation. The 2 relevant cases are:
+ // BB BB
+ // | \Qout | \Qout
+ // P| C |P C
+ // = C' = C'
+ // | /Qin | /Qin
+ // | / | /
+ // Succ Succ
+ // / \ | \ V
+ // U/ =V |U \
+ // / \ = D
+ // D E | /
+ // | /
+ // |/
+ // PDom
+ // '=' : Branch taken for that CFG edge
+ // In the second case, placing Succ while duplicating it into C prevents the
+ // fallthrough of Succ into either D or PDom, because they now have C as an
+ // unplaced predecessor.
+
+ // Start by figuring out which case we fall into
+ MachineBasicBlock *PDom = nullptr;
+ SmallVector<MachineBasicBlock *, 4> SuccSuccs;
+ // Only scan the relevant successors
+ auto AdjustedSuccSumProb =
+ collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
+ BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
+ auto BBFreq = MBFI->getBlockFreq(BB);
+ auto SuccFreq = MBFI->getBlockFreq(Succ);
+ BlockFrequency P = BBFreq * PProb;
+ BlockFrequency Qout = BBFreq * QProb;
+ uint64_t EntryFreq = MBFI->getEntryFreq();
+ // If there are no more successors, it is profitable to copy, as it strictly
+ // increases fallthrough.
+ if (SuccSuccs.size() == 0)
+ return greaterWithBias(P, Qout, EntryFreq);
+
+ auto BestSuccSucc = BranchProbability::getZero();
+ // Find the PDom or the best Succ if no PDom exists.
+ for (MachineBasicBlock *SuccSucc : SuccSuccs) {
+ auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
+ if (Prob > BestSuccSucc)
+ BestSuccSucc = Prob;
+ if (PDom == nullptr)
+ if (MPDT->dominates(SuccSucc, Succ)) {
+ PDom = SuccSucc;
+ break;
+ }
+ }
+ // For the comparisons, we need to know Succ's best incoming edge that isn't
+ // from BB.
+ auto SuccBestPred = BlockFrequency(0);
+ for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+ if (SuccPred == Succ || SuccPred == BB
+ || BlockToChain[SuccPred] == &Chain
+ || (BlockFilter && !BlockFilter->count(SuccPred)))
+ continue;
+ auto Freq = MBFI->getBlockFreq(SuccPred)
+ * MBPI->getEdgeProbability(SuccPred, Succ);
+ if (Freq > SuccBestPred)
+ SuccBestPred = Freq;
+ }
+ // Qin is Succ's best unplaced incoming edge that isn't BB
+ BlockFrequency Qin = SuccBestPred;
+ // If it doesn't have a post-dominating successor, here is the calculation:
+ // BB BB
+ // | \Qout | \
+ // P| C | =
+ // = C' | C
+ // | /Qin | |
+ // | / | C' (+Succ)
+ // Succ Succ /|
+ // / \ | \/ |
+ // U/ =V | == |
+ // / \ | / \|
+ // D E D E
+ // '=' : Branch taken for that CFG edge
+ // Cost in the first case is: P + V
+ // For this calculation, we always assume P > Qout. If Qout > P
+ // The result of this function will be ignored at the caller.
+ // Let F = SuccFreq - Qin
+ // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
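+ // A worked example with made-up numbers: P = 90, Qout = 10, Qin = 30,
+ // SuccFreq = 100 (so F = 70), U = 0.6, V = 0.4. The first layout costs
+ // P + V = 90 + 40 = 130 taken branches; duplicating costs
+ // Qout + min(Qin, F) * U + max(Qin, F) * V = 10 + 18 + 28 = 56, so here
+ // duplication wins if the difference also clears greaterWithBias.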
+
+ if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
+ BranchProbability UProb = BestSuccSucc;
+ BranchProbability VProb = AdjustedSuccSumProb - UProb;
+ BlockFrequency F = SuccFreq - Qin;
+ BlockFrequency V = SuccFreq * VProb;
+ BlockFrequency QinU = std::min(Qin, F) * UProb;
+ BlockFrequency BaseCost = P + V;
+ BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
+ return greaterWithBias(BaseCost, DupCost, EntryFreq);
+ }
+ BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
+ BranchProbability VProb = AdjustedSuccSumProb - UProb;
+ BlockFrequency U = SuccFreq * UProb;
+ BlockFrequency V = SuccFreq * VProb;
+ BlockFrequency F = SuccFreq - Qin;
+ // If there is a post-dominating successor, here is the calculation:
+ // BB BB BB BB
+ // | \Qout | \ | \Qout | \
+ // |P C | = |P C | =
+ // = C' |P C = C' |P C
+ // | /Qin | | | /Qin | |
+ // | / | C' (+Succ) | / | C' (+Succ)
+ // Succ Succ /| Succ Succ /|
+ // | \ V | \/ | | \ V | \/ |
+ // |U \ |U /\ =? |U = |U /\ |
+ // = D = = =?| | D | = =|
+ // | / |/ D | / |/ D
+ // | / | / | = | /
+ // |/ | / |/ | =
+ // Dom Dom Dom Dom
+ // '=' : Branch taken for that CFG edge
+ // The cost for taken branches in the first case is P + U
+ // Let F = SuccFreq - Qin
+ // The cost in the second case (assuming independence), given the layout:
+ // BB, Succ, (C+Succ), D, Dom or the layout:
+ // BB, Succ, D, Dom, (C+Succ)
+ // is Qout + max(F, Qin) * U + min(F, Qin)
+ // compare P + U vs Qout + P * U + Qin.
+ //
+ // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
+ //
+ // For the 3rd case, the cost is P + 2 * V
+ // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
+ // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
+ if (UProb > AdjustedSuccSumProb / 2 &&
+ !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
+ Chain, BlockFilter))
+ // Cases 3 & 4
+ return greaterWithBias(
+ (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
+ EntryFreq);
+ // Cases 1 & 2
+ return greaterWithBias((P + U),
+ (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
+ std::max(Qin, F) * UProb),
+ EntryFreq);
+}
+
+/// Check for a trellis layout. \p BB is the upper part of a trellis if its
+/// successors form the lower part of a trellis. A successor set S forms the
+/// lower part of a trellis if all of the predecessors of S are either in S or
+/// have all of S as successors. We ignore trellises where BB doesn't have 2
+/// successors because for fewer than 2, it's trivial, and for 3 or greater they
+/// are very uncommon and complex to compute optimally. Allowing edges within S
+/// is not strictly a trellis, but the same algorithm works, so we allow it.
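+/// As an illustrative shape: if BB branches to S1 and S2, and some other block
+/// P outside the current chain also branches to exactly {S1, S2}, then
+/// {S1, S2} form the lower part of a trellis rooted at BB and P.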
+bool MachineBlockPlacement::isTrellis(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ // Technically BB could form a trellis with branching factor higher than 2.
+ // But that's extremely uncommon.
+ if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
+ return false;
+
+ SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
+ BB->succ_end());
+ // To avoid reviewing the same predecessors twice.
+ SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
+
+ for (MachineBasicBlock *Succ : ViableSuccs) {
+ int PredCount = 0;
+ for (auto *SuccPred : Succ->predecessors()) {
+ // Allow triangle successors, but don't count them.
+ if (Successors.count(SuccPred)) {
+ // Make sure that it is actually a triangle.
+ for (MachineBasicBlock *CheckSucc : SuccPred->successors())
+ if (!Successors.count(CheckSucc))
+ return false;
+ continue;
+ }
+ const BlockChain *PredChain = BlockToChain[SuccPred];
+ if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
+ PredChain == &Chain || PredChain == BlockToChain[Succ])
+ continue;
+ ++PredCount;
+ // Perform the successor check only once.
+ if (!SeenPreds.insert(SuccPred).second)
+ continue;
+ if (!hasSameSuccessors(*SuccPred, Successors))
+ return false;
+ }
+ // If one of the successors has only BB as a predecessor, it is not a
+ // trellis.
+ if (PredCount < 1)
+ return false;
+ }
+ return true;
+}
+
+/// Pick the highest total weight pair of edges that can both be laid out.
+/// The edges in \p Edges[0] are assumed to have a different destination than
+/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
+/// the individual highest weight edges to the 2 different destinations, or in
+/// case of a conflict, one of them should be replaced with a 2nd best edge.
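+ ///
+ /// As a small worked example (hypothetical weights): if A->Succ1 = 90,
+ /// A->Succ2 = 80, B->Succ1 = 70 and B->Succ2 = 20, the two individually best
+ /// edges (A->Succ1 and A->Succ2) conflict on A, so we compare 90 + 20 = 110
+ /// against 80 + 70 = 150 and keep the pair {A->Succ2, B->Succ1}.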
+std::pair<MachineBlockPlacement::WeightedEdge,
+ MachineBlockPlacement::WeightedEdge>
+MachineBlockPlacement::getBestNonConflictingEdges(
+ const MachineBasicBlock *BB,
+ MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>>
+ Edges) {
+ // Sort the edges, and then for each successor, find the best incoming
+ // predecessor. If the best incoming predecessors aren't the same,
+ // then that is clearly the best layout. If there is a conflict, one of the
+ // successors will have to fallthrough from the second best predecessor. We
+ // compare which combination is better overall.
+
+ // Sort for highest frequency.
+ auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
+
+ llvm::stable_sort(Edges[0], Cmp);
+ llvm::stable_sort(Edges[1], Cmp);
+ auto BestA = Edges[0].begin();
+ auto BestB = Edges[1].begin();
+ // Arrange for the correct answer to be in BestA and BestB
+ // If the 2 best edges don't conflict, the answer is already there.
+ if (BestA->Src == BestB->Src) {
+ // Compare the total fallthrough of (Best + Second Best) for both pairs
+ auto SecondBestA = std::next(BestA);
+ auto SecondBestB = std::next(BestB);
+ BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
+ BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
+ if (BestAScore < BestBScore)
+ BestA = SecondBestA;
+ else
+ BestB = SecondBestB;
+ }
+ // Arrange for the BB edge to be in BestA if it exists.
+ if (BestB->Src == BB)
+ std::swap(BestA, BestB);
+ return std::make_pair(*BestA, *BestB);
+}
+
+/// Get the best successor from \p BB based on \p BB being part of a trellis.
+/// We only handle trellises with 2 successors, so the algorithm is
+/// straightforward: Find the best pair of edges that don't conflict. We find
+/// the best incoming edge for each successor in the trellis. If those conflict,
+/// we consider which of them should be replaced with the second best.
+ /// The two best edges are computed via getBestNonConflictingEdges; if the
+ /// winning edge out of \p BB exists, its destination is returned as the
+ /// selected successor.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::getBestTrellisSuccessor(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ BranchProbability AdjustedSumProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+
+ BlockAndTailDupResult Result = {nullptr, false};
+ SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+ BB->succ_end());
+
+ // We assume size 2 because it's common. For general n, we would have to do
+ // the Hungarian algorithm, but it's not worth the complexity because more
+ // than 2 successors is fairly uncommon, and a trellis even more so.
+ if (Successors.size() != 2 || ViableSuccs.size() != 2)
+ return Result;
+
+ // Collect the edge frequencies of all edges that form the trellis.
+ SmallVector<WeightedEdge, 8> Edges[2];
+ int SuccIndex = 0;
+ for (auto *Succ : ViableSuccs) {
+ for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+ // Skip any placed predecessors that are not BB
+ if (SuccPred != BB)
+ if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
+ BlockToChain[SuccPred] == &Chain ||
+ BlockToChain[SuccPred] == BlockToChain[Succ])
+ continue;
+ BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
+ MBPI->getEdgeProbability(SuccPred, Succ);
+ Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
+ }
+ ++SuccIndex;
+ }
+
+ // Pick the best combination of 2 edges from all the edges in the trellis.
+ WeightedEdge BestA, BestB;
+ std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
+
+ if (BestA.Src != BB) {
+ // If we have a trellis, and BB doesn't have the best fallthrough edges,
+ // we shouldn't choose any successor. We've already looked and there's a
+ // better fallthrough edge for all the successors.
+ LLVM_DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
+ return Result;
+ }
+
+ // Did we pick the triangle edge? If tail-duplication is profitable, do
+ // that instead. Otherwise merge the triangle edge now while we know it is
+ // optimal.
+ if (BestA.Dest == BestB.Src) {
+ // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
+ // would be better.
+ MachineBasicBlock *Succ1 = BestA.Dest;
+ MachineBasicBlock *Succ2 = BestB.Dest;
+ // Check to see if tail-duplication would be profitable.
+ if (allowTailDupPlacement() && shouldTailDuplicate(Succ2) &&
+ canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
+ isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
+ Chain, BlockFilter)) {
+ LLVM_DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(Succ2)
+ << ", probability: " << Succ2Prob
+ << " (Tail Duplicate)\n");
+ Result.BB = Succ2;
+ Result.ShouldTailDup = true;
+ return Result;
+ }
+ }
+ // We have already computed the optimal edge for the other side of the
+ // trellis.
+ ComputedEdges[BestB.Src] = { BestB.Dest, false };
+
+ auto TrellisSucc = BestA.Dest;
+ LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(TrellisSucc)
+ << ", probability: " << SuccProb << " (Trellis)\n");
+ Result.BB = TrellisSucc;
+ return Result;
+}
+
+/// When the option allowTailDupPlacement() is on, this method checks if the
+/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
+ /// into all of its unplaced, unfiltered predecessors that are not BB.
+bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
+ const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ if (!shouldTailDuplicate(Succ))
+ return false;
+
+ // The result of canTailDuplicate.
+ bool Duplicate = true;
+ // Number of possible duplications.
+ unsigned int NumDup = 0;
+
+ // For CFG checking.
+ SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+ BB->succ_end());
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ // Make sure all unplaced and unfiltered predecessors can be
+ // tail-duplicated into.
+ // Skip any blocks that are already placed or not in this loop.
+ if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+ || (BlockToChain[Pred] == &Chain && !Succ->succ_empty()))
+ continue;
+ if (!TailDup.canTailDuplicate(Succ, Pred)) {
+ if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
+ // This will result in a trellis after tail duplication, so we don't
+ // need to copy Succ into this predecessor. In the presence
+ // of a trellis tail duplication can continue to be profitable.
+ // For example:
+ // A A
+ // |\ |\
+ // | \ | \
+ // | C | C+BB
+ // | / | |
+ // |/ | |
+ // BB => BB |
+ // |\ |\/|
+ // | \ |/\|
+ // | D | D
+ // | / | /
+ // |/ |/
+ // Succ Succ
+ //
+ // After BB was duplicated into C, the layout looks like the one on the
+ // right. BB and C now have the same successors. When considering
+ // whether Succ can be duplicated into all its unplaced predecessors, we
+ // ignore C.
+ // We can do this because C already has a profitable fallthrough, namely
+ // D. TODO(iteratee): ignore sufficiently cold predecessors for
+ // duplication and for this test.
+ //
+ // This allows trellises to be laid out in 2 separate chains
+ // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
+ // because it allows the creation of 2 fallthrough paths with links
+ // between them, and we correctly identify the best layout for these
+ // CFGs. We want to extend trellises that the user created in addition
+ // to trellises created by tail-duplication, so we just look for the
+ // CFG.
+ continue;
+ Duplicate = false;
+ continue;
+ }
+ NumDup++;
+ }
+
+ // No possible duplication in current filter set.
+ if (NumDup == 0)
+ return false;
+
+ // If profile information is available, findDuplicateCandidates can do more
+ // precise benefit analysis.
+ if (F->getFunction().hasProfileData())
+ return true;
+
+ // This is mainly for function exit BB.
+ // The integrated tail duplication is really designed for increasing
+ // fallthrough from predecessors of Succ to its successors. We may need
+ // another mechanism to handle different cases.
+ if (Succ->succ_empty())
+ return true;
+
+ // Plus the already placed predecessor.
+ NumDup++;
+
+ // If the duplication candidate has more unplaced predecessors than
+ // successors, the extra duplication can't bring more fallthrough.
+ //
+ // Pred1 Pred2 Pred3
+ // \ | /
+ // \ | /
+ // \ | /
+ // Dup
+ // / \
+ // / \
+ // Succ1 Succ2
+ //
+ // In this example Dup has 2 successors and 3 predecessors, duplication of Dup
+ // can increase the fallthrough from Pred1 to Succ1 and from Pred2 to Succ2,
+ // but the duplication into Pred3 can't increase fallthrough.
+ //
+ // A small amount of extra duplication may not hurt too much, but we need a
+ // better heuristic to handle it.
+ if ((NumDup > Succ->succ_size()) || !Duplicate)
+ return false;
+
+ return true;
+}
+
+/// Find chains of triangles where we believe it would be profitable to
+/// tail-duplicate them all, but a local analysis would not find them.
+/// There are 3 ways this can be profitable:
+/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
+/// longer chains)
+/// 2) The chains are statically correlated. Branch probabilities have a very
+/// U-shaped distribution.
+/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
+/// If the branches in a chain are likely to be from the same side of the
+/// distribution as their predecessor, but are independent at runtime, this
+/// transformation is profitable. (Because the cost of being wrong is a small
+/// fixed cost, unlike the standard triangle layout where the cost of being
+/// wrong scales with the # of triangles.)
+ /// 3) The chains are dynamically correlated. If the probability that a previous
+ /// branch was taken positively influences whether the next branch will be
+ /// taken, this transformation is likewise profitable.
+/// We believe that 2 and 3 are common enough to justify the small margin in 1.
+void MachineBlockPlacement::precomputeTriangleChains() {
+ struct TriangleChain {
+ std::vector<MachineBasicBlock *> Edges;
+
+ TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
+ : Edges({src, dst}) {}
+
+ void append(MachineBasicBlock *dst) {
+ assert(getKey()->isSuccessor(dst) &&
+ "Attempting to append a block that is not a successor.");
+ Edges.push_back(dst);
+ }
+
+ unsigned count() const { return Edges.size() - 1; }
+
+ MachineBasicBlock *getKey() const {
+ return Edges.back();
+ }
+ };
+
+ if (TriangleChainCount == 0)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Pre-computing triangle chains.\n");
+ // Map from last block to the chain that contains it. This allows us to extend
+ // chains as we find new triangles.
+ DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
+ for (MachineBasicBlock &BB : *F) {
+ // If BB doesn't have 2 successors, it doesn't start a triangle.
+ if (BB.succ_size() != 2)
+ continue;
+ MachineBasicBlock *PDom = nullptr;
+ for (MachineBasicBlock *Succ : BB.successors()) {
+ if (!MPDT->dominates(Succ, &BB))
+ continue;
+ PDom = Succ;
+ break;
+ }
+ // If BB doesn't have a post-dominating successor, it doesn't form a
+ // triangle.
+ if (PDom == nullptr)
+ continue;
+ // If PDom has a hint that it is low probability, skip this triangle.
+ if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
+ continue;
+ // If PDom isn't eligible for duplication, this isn't the kind of triangle
+ // we're looking for.
+ if (!shouldTailDuplicate(PDom))
+ continue;
+ bool CanTailDuplicate = true;
+ // If PDom can't tail-duplicate into its non-BB predecessors, then this
+ // isn't the kind of triangle we're looking for.
+ for (MachineBasicBlock* Pred : PDom->predecessors()) {
+ if (Pred == &BB)
+ continue;
+ if (!TailDup.canTailDuplicate(PDom, Pred)) {
+ CanTailDuplicate = false;
+ break;
+ }
+ }
+ // If we can't tail-duplicate PDom to its predecessors, then skip this
+ // triangle.
+ if (!CanTailDuplicate)
+ continue;
+
+ // Now we have an interesting triangle. Insert it if it's not part of an
+ // existing chain.
+ // Note: This cannot be replaced with a call to insert() or emplace() because
+ // the find key is BB, but the insert/emplace key is PDom.
+ auto Found = TriangleChainMap.find(&BB);
+ // If it is, remove the chain from the map, grow it, and put it back in the
+ // map with the end as the new key.
+ if (Found != TriangleChainMap.end()) {
+ TriangleChain Chain = std::move(Found->second);
+ TriangleChainMap.erase(Found);
+ Chain.append(PDom);
+ TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
+ } else {
+ auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
+ assert(InsertResult.second && "Block seen twice.");
+ (void)InsertResult;
+ }
+ }
+
+ // Iterating over a DenseMap is safe here, because the only thing in the body
+ // of the loop is inserting into another DenseMap (ComputedEdges).
+ // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
+ for (auto &ChainPair : TriangleChainMap) {
+ TriangleChain &Chain = ChainPair.second;
+ // Benchmarking has shown that due to branch correlation duplicating 2 or
+ // more triangles is profitable, despite the calculations assuming
+ // independence.
+ if (Chain.count() < TriangleChainCount)
+ continue;
+ MachineBasicBlock *dst = Chain.Edges.back();
+ Chain.Edges.pop_back();
+ for (MachineBasicBlock *src : reverse(Chain.Edges)) {
+ LLVM_DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->"
+ << getBlockName(dst)
+ << " as pre-computed based on triangles.\n");
+
+ auto InsertResult = ComputedEdges.insert({src, {dst, true}});
+ assert(InsertResult.second && "Block seen twice.");
+ (void)InsertResult;
+
+ dst = src;
+ }
+ }
+}
+
+// When profile is not present, return the StaticLikelyProb.
+// When profile is available, we need to handle the triangle-shape CFG.
+static BranchProbability getLayoutSuccessorProbThreshold(
+ const MachineBasicBlock *BB) {
+ if (!BB->getParent()->getFunction().hasProfileData())
+ return BranchProbability(StaticLikelyProb, 100);
+ if (BB->succ_size() == 2) {
+ const MachineBasicBlock *Succ1 = *BB->succ_begin();
+ const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
+ if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
+ /* See case 1 below for the cost analysis. For BB->Succ to
+ * be taken with smaller cost, the following needs to hold:
+ * Prob(BB->Succ) > 2 * Prob(BB->Pred)
+ * So the threshold T in the calculation below
+ * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
+ * So T / (1 - T) = 2, Yielding T = 2/3
+ * Also adding user specified branch bias, we have
+ * T = (2/3)*(ProfileLikelyProb/50)
+ * = (2*ProfileLikelyProb)/150
+ */
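+ // For instance, a purely illustrative ProfileLikelyProb value of 60 would
+ // yield T = (2 * 60) / 150 = 80%, i.e. BranchProbability(120, 150).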
+ return BranchProbability(2 * ProfileLikelyProb, 150);
+ }
+ }
+ return BranchProbability(ProfileLikelyProb, 100);
+}
+
+/// Checks to see if the layout candidate block \p Succ has a better layout
+/// predecessor than \c BB. If yes, returns true.
+/// \p SuccProb: The probability adjusted for only remaining blocks.
+/// Only used for logging
+/// \p RealSuccProb: The un-adjusted probability.
+/// \p Chain: The chain that BB belongs to and Succ is being considered for.
+/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
+/// considered
+bool MachineBlockPlacement::hasBetterLayoutPredecessor(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ const BlockChain &SuccChain, BranchProbability SuccProb,
+ BranchProbability RealSuccProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+
+ // There isn't a better layout when there are no unscheduled predecessors.
+ if (SuccChain.UnscheduledPredecessors == 0)
+ return false;
+
+ // There are two basic scenarios here:
+ // -------------------------------------
+ // Case 1: triangular shape CFG (if-then):
+ // BB
+ // | \
+ // | \
+ // | Pred
+ // | /
+ // Succ
+ // In this case, we are evaluating whether to select edge -> Succ, e.g.
+ // set Succ as the layout successor of BB. Picking Succ as BB's
+ // successor breaks the CFG constraints (FIXME: define these constraints).
+ // With this layout, the Pred block is forced to be outlined, so the overall
+ // cost will be the cost of the taken branch from BB to Pred, plus the cost
+ // of the backward taken branch from Pred to Succ, as well as the additional
+ // cost associated with the needed unconditional jump instruction from Pred
+ // to Succ.
+
+ // The cost of the topological order layout is the taken branch cost
+ // from BB to Succ, so to make BB->Succ a viable candidate, the following
+ // must hold:
+ // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
+ // < freq(BB->Succ) * taken_branch_cost.
+ // Ignoring unconditional jump cost, we get
+ // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
+ // prob(BB->Succ) > 2 * prob(BB->Pred)
+ //
+ // When real profile data is available, we can precisely compute the
+ // probability threshold that is needed for edge BB->Succ to be considered.
+ // Without profile data, the heuristic requires the branch bias to be
+ // a lot larger to make sure the signal is very strong (e.g. 80% default).
+ // -----------------------------------------------------------------
+ // Case 2: diamond like CFG (if-then-else):
+ // S
+ // / \
+ // | \
+ // BB Pred
+ // \ /
+ // Succ
+ // ..
+ //
+ // The current block is BB and edge BB->Succ is now being evaluated.
+ // Note that edge S->BB was previously already selected because
+ // prob(S->BB) > prob(S->Pred).
+ // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
+ // choose Pred, we will have a topological ordering as shown on the left
+ // in the picture below. If we choose Succ, we have the solution as shown
+ // on the right:
+ //
+ // topo-order:
+ //
+ // S----- ---S
+ // | | | |
+ // ---BB | | BB
+ // | | | |
+ // | Pred-- | Succ--
+ // | | | |
+ // ---Succ ---Pred--
+ //
+ // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
+ // = freq(S->Pred) + freq(S->BB)
+ //
+ // If we have profile data (i.e, branch probabilities can be trusted), the
+ // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
+ // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
+ // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
+ // means the cost of topological order is greater.
+ // When profile data is not available, however, we need to be more
+ // conservative. If the branch prediction is wrong, breaking the topo-order
+ // will actually yield a layout with large cost. For this reason, we need
+ // a strongly biased branch at block S with Prob(S->BB) in order to select
+ // BB->Succ. This is equivalent to looking at the CFG backward with a backward
+ // edge: Prob(Succ->BB) needs to be >= HotProb in order to be selected (without
+ // profile data).
+ // --------------------------------------------------------------------------
+ // Case 3: forked diamond
+ // S
+ // / \
+ // / \
+ // BB Pred
+ // | \ / |
+ // | \ / |
+ // | X |
+ // | / \ |
+ // | / \ |
+ // S1 S2
+ //
+ // The current block is BB and edge BB->S1 is now being evaluated.
+ // As above S->BB was already selected because
+ // prob(S->BB) > prob(S->Pred). Assume that prob(BB->S1) >= prob(BB->S2).
+ //
+ // topo-order:
+ //
+ // S-------| ---S
+ // | | | |
+ // ---BB | | BB
+ // | | | |
+ // | Pred----| | S1----
+ // | | | |
+ // --(S1 or S2) ---Pred--
+ // |
+ // S2
+ //
+ // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
+ // + min(freq(Pred->S1), freq(Pred->S2))
+ // Non-topo-order cost:
+ // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
+ // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
+ // is 0. Then the non topo layout is better when
+ // freq(S->Pred) < freq(BB->S1).
+ // This is exactly what is checked below.
+ // Note there are other shapes that apply (Pred may not be a single block,
+ // but they all fit this general pattern.)
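+ // As a hypothetical numeric check of case 3: with freq(S->Pred) = 10,
+ // freq(BB->S1) = 30 and freq(BB->S2) = 20 (and the conservative assumption
+ // min(freq(Pred->S1), freq(Pred->S2)) = 0), the topo-order cost is
+ // 10 + 30 + 20 + 0 = 60 while the non-topo cost is 2 * 10 + 20 = 40, so the
+ // non-topo layout wins, consistent with freq(S->Pred) < freq(BB->S1).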
+ BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
+
+ // Make sure that a hot successor doesn't have a globally more
+ // important predecessor.
+ BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
+ bool BadCFGConflict = false;
+
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (Pred == Succ || PredChain == &SuccChain ||
+ (BlockFilter && !BlockFilter->count(Pred)) ||
+ PredChain == &Chain || Pred != *std::prev(PredChain->end()) ||
+ // This check is redundant except for look ahead. This function is
+ // called for lookahead by isProfitableToTailDup when BB hasn't been
+ // placed yet.
+ (Pred == BB))
+ continue;
+ // Do backward checking.
+ // For all cases above, we need backward checking to filter out edges that
+ // are not 'strongly' biased.
+ // BB Pred
+ // \ /
+ // Succ
+ // We select edge BB->Succ if
+ // freq(BB->Succ) > freq(Succ) * HotProb
+ // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
+ // HotProb
+ // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
+ // Case 1 is covered too, because the first equation reduces to:
+ // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
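+ // For example, with a hypothetical HotProb of 80%: if freq(BB->Succ) = 90
+ // and freq(Pred->Succ) = 20, then 20 * 0.8 = 16 < 90 * 0.2 = 18 and this
+ // Pred is not a conflict; if freq(Pred->Succ) were 30, then 30 * 0.8 = 24 >=
+ // 18 and Succ would be rejected as having a better layout predecessor.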
+ BlockFrequency PredEdgeFreq =
+ MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
+ if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
+ BadCFGConflict = true;
+ break;
+ }
+ }
+
+ if (BadCFGConflict) {
+ LLVM_DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> "
+ << SuccProb << " (prob) (non-cold CFG conflict)\n");
+ return true;
+ }
+
+ return false;
+}
+
+/// Select the best successor for a block.
+///
+/// This looks across all successors of a particular block and attempts to
+/// select the "best" one to be the layout successor. It only considers direct
+/// successors which also pass the block filter. It will attempt to avoid
+/// breaking CFG structure, but cave and break such structures in the case of
+/// very hot successor edges.
+///
+/// \returns The best successor block found, or null if none are viable, along
+/// with a boolean indicating if tail duplication is necessary.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::selectBestSuccessor(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+ const BranchProbability HotProb(StaticLikelyProb, 100);
+
+ BlockAndTailDupResult BestSucc = { nullptr, false };
+ auto BestProb = BranchProbability::getZero();
+
+ SmallVector<MachineBasicBlock *, 4> Successors;
+ auto AdjustedSumProb =
+ collectViableSuccessors(BB, Chain, BlockFilter, Successors);
+
+ LLVM_DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
+ << "\n");
+
+ // If we already precomputed the best successor for BB, return it if still
+ // applicable.
+ auto FoundEdge = ComputedEdges.find(BB);
+ if (FoundEdge != ComputedEdges.end()) {
+ // Copy the cached result before erasing it; erase() invalidates FoundEdge.
+ BlockAndTailDupResult Cached = FoundEdge->second;
+ MachineBasicBlock *Succ = Cached.BB;
+ ComputedEdges.erase(FoundEdge);
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
+ SuccChain != &Chain && Succ == *SuccChain->begin())
+ return Cached;
+ }
+
+ // If BB is part of a trellis, use the trellis to determine the optimal
+ // fallthrough edges.
+ if (isTrellis(BB, Successors, Chain, BlockFilter))
+ return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
+ BlockFilter);
+
+ // For blocks with CFG violations, we may be able to lay them out anyway with
+ // tail-duplication. We keep this vector so we can perform the probability
+ // calculations the minimum number of times.
+ SmallVector<std::pair<BranchProbability, MachineBasicBlock *>, 4>
+ DupCandidates;
+ for (MachineBasicBlock *Succ : Successors) {
+ auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
+ BranchProbability SuccProb =
+ getAdjustedProbability(RealSuccProb, AdjustedSumProb);
+
+ BlockChain &SuccChain = *BlockToChain[Succ];
+ // Skip the edge \c BB->Succ if block \c Succ has a better layout
+ // predecessor that yields lower global cost.
+ if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
+ Chain, BlockFilter)) {
+ // If tail duplication would make Succ profitable, place it.
+ if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
+ DupCandidates.emplace_back(SuccProb, Succ);
+ continue;
+ }
+
+ LLVM_DEBUG(
+ dbgs() << " Candidate: " << getBlockName(Succ)
+ << ", probability: " << SuccProb
+ << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
+ << "\n");
+
+ if (BestSucc.BB && BestProb >= SuccProb) {
+ LLVM_DEBUG(dbgs() << " Not the best candidate, continuing\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " Setting it as best candidate\n");
+ BestSucc.BB = Succ;
+ BestProb = SuccProb;
+ }
+ // Handle the tail duplication candidates in order of decreasing probability.
+ // Stop at the first one that is profitable. Also stop if they are less
+ // profitable than BestSucc. Position is important because we preserve it and
+ // prefer first best match. Here we aren't comparing in order, so we capture
+ // the position instead.
+ llvm::stable_sort(DupCandidates,
+ [](std::tuple<BranchProbability, MachineBasicBlock *> L,
+ std::tuple<BranchProbability, MachineBasicBlock *> R) {
+ return std::get<0>(L) > std::get<0>(R);
+ });
+ for (auto &Tup : DupCandidates) {
+ BranchProbability DupProb;
+ MachineBasicBlock *Succ;
+ std::tie(DupProb, Succ) = Tup;
+ if (DupProb < BestProb)
+ break;
+ if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
+ && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
+ LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ)
+ << ", probability: " << DupProb
+ << " (Tail Duplicate)\n");
+ BestSucc.BB = Succ;
+ BestSucc.ShouldTailDup = true;
+ break;
+ }
+ }
+
+ if (BestSucc.BB)
+ LLVM_DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
+
+ return BestSucc;
+}
+
+/// Select the best block from a worklist.
+///
+/// This looks through the provided worklist as a list of candidate basic
+/// blocks and select the most profitable one to place. The definition of
+/// profitable only really makes sense in the context of a loop. This returns
+/// the most frequently visited block in the worklist, which in the case of
+/// a loop, is the one most desirable to be physically close to the rest of the
+/// loop body in order to improve i-cache behavior.
+///
+/// \returns The best block found, or null if none are viable.
+MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
+ const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
+ // Once we need to walk the worklist looking for a candidate, cleanup the
+ // worklist of already placed entries.
+ // FIXME: If this shows up on profiles, it could be folded (at the cost of
+ // some code complexity) into the loop below.
+ llvm::erase_if(WorkList, [&](MachineBasicBlock *BB) {
+ return BlockToChain.lookup(BB) == &Chain;
+ });
+
+ if (WorkList.empty())
+ return nullptr;
+
+ bool IsEHPad = WorkList[0]->isEHPad();
+
+ MachineBasicBlock *BestBlock = nullptr;
+ BlockFrequency BestFreq;
+ for (MachineBasicBlock *MBB : WorkList) {
+ assert(MBB->isEHPad() == IsEHPad &&
+ "EHPad mismatch between block and work list.");
+
+ BlockChain &SuccChain = *BlockToChain[MBB];
+ if (&SuccChain == &Chain)
+ continue;
+
+ assert(SuccChain.UnscheduledPredecessors == 0 &&
+ "Found CFG-violating block");
+
+ BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
+ LLVM_DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
+ MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
+
+ // For EH pads, we lay out the least probable first so as to avoid jumping
+ // back from less probable landing pads to more probable ones.
+ //
+ // FIXME: Using probability is probably (!) not the best way to achieve
+ // this. We should probably have a more principled approach to layout
+ // cleanup code.
+ //
+ // The goal is to get:
+ //
+ // +--------------------------+
+ // | V
+ // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
+ //
+ // Rather than:
+ //
+ // +-------------------------------------+
+ // V |
+ // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
+ if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
+ continue;
+
+ BestBlock = MBB;
+ BestFreq = CandidateFreq;
+ }
+
+ return BestBlock;
+}
+
+/// Retrieve the first unplaced basic block.
+///
+/// This routine is called when we are unable to use the CFG to walk through
+/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
+ /// We walk through the function's blocks in order, starting from
+ /// PrevUnplacedBlockIt. We update this iterator on each call to avoid
+/// re-scanning the entire sequence on repeated calls to this routine.
+MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
+ const BlockChain &PlacedChain,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ const BlockFilterSet *BlockFilter) {
+ for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
+ ++I) {
+ if (BlockFilter && !BlockFilter->count(&*I))
+ continue;
+ if (BlockToChain[&*I] != &PlacedChain) {
+ PrevUnplacedBlockIt = I;
+ // Now select the head of the chain to which the unplaced block belongs
+ // as the block to place. This will force the entire chain to be placed,
+ // and satisfies the requirements of merging chains.
+ return *BlockToChain[&*I]->begin();
+ }
+ }
+ return nullptr;
+}
+
+void MachineBlockPlacement::fillWorkLists(
+ const MachineBasicBlock *MBB,
+ SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
+ const BlockFilterSet *BlockFilter = nullptr) {
+ BlockChain &Chain = *BlockToChain[MBB];
+ if (!UpdatedPreds.insert(&Chain).second)
+ return;
+
+ assert(
+ Chain.UnscheduledPredecessors == 0 &&
+ "Attempting to place block with unscheduled predecessors in worklist.");
+ for (MachineBasicBlock *ChainBB : Chain) {
+ assert(BlockToChain[ChainBB] == &Chain &&
+ "Block in chain doesn't match BlockToChain map.");
+ for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
+ if (BlockFilter && !BlockFilter->count(Pred))
+ continue;
+ if (BlockToChain[Pred] == &Chain)
+ continue;
+ ++Chain.UnscheduledPredecessors;
+ }
+ }
+
+ if (Chain.UnscheduledPredecessors != 0)
+ return;
+
+ MachineBasicBlock *BB = *Chain.begin();
+ if (BB->isEHPad())
+ EHPadWorkList.push_back(BB);
+ else
+ BlockWorkList.push_back(BB);
+}
+
+void MachineBlockPlacement::buildChain(
+ const MachineBasicBlock *HeadBB, BlockChain &Chain,
+ BlockFilterSet *BlockFilter) {
+ assert(HeadBB && "BB must not be null.\n");
+ assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
+ MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
+
+ const MachineBasicBlock *LoopHeaderBB = HeadBB;
+ markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
+ MachineBasicBlock *BB = *std::prev(Chain.end());
+ while (true) {
+ assert(BB && "null block found at end of chain in loop.");
+ assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
+ assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
+
+ // Look for the best viable successor if there is one to place immediately
+ // after this block.
+ auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
+ MachineBasicBlock* BestSucc = Result.BB;
+ bool ShouldTailDup = Result.ShouldTailDup;
+ if (allowTailDupPlacement())
+ ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc,
+ Chain,
+ BlockFilter));
+
+ // If an immediate successor isn't available, look for the best viable
+ // block among those we've identified as not violating the loop's CFG at
+ // this point. This won't be a fallthrough, but it will increase locality.
+ if (!BestSucc)
+ BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
+ if (!BestSucc)
+ BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
+
+ if (!BestSucc) {
+ BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
+ if (!BestSucc)
+ break;
+
+ LLVM_DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
+ "layout successor until the CFG reduces\n");
+ }
+
+ // Placement may have changed tail duplication opportunities.
+ // Check for that now.
+ if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
+ repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
+ BlockFilter, PrevUnplacedBlockIt);
+ // If the chosen successor was duplicated into BB, don't bother laying
+ // it out, just go round the loop again with BB as the chain end.
+ if (!BB->isSuccessor(BestSucc))
+ continue;
+ }
+
+ // Place this block, updating the datastructures to reflect its placement.
+ BlockChain &SuccChain = *BlockToChain[BestSucc];
+ // Zero out UnscheduledPredecessors for the successor we're about to merge
+ // in case we selected a successor that didn't fit naturally into the CFG.
+ SuccChain.UnscheduledPredecessors = 0;
+ LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
+ << getBlockName(BestSucc) << "\n");
+ markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
+ Chain.merge(BestSucc, &SuccChain);
+ BB = *std::prev(Chain.end());
+ }
+
+ LLVM_DEBUG(dbgs() << "Finished forming chain for header block "
+ << getBlockName(*Chain.begin()) << "\n");
+}
+
+ // If the bottom block BB has only one successor OldTop, in most cases it is
+ // profitable to move it before OldTop, except in the following case:
+//
+// -->OldTop<-
+// | . |
+// | . |
+// | . |
+// ---Pred |
+// | |
+// BB-----
+//
+ // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't
+ // lay out the other successor below it, so it can't reduce the number of
+ // taken branches. In this case we keep the original layout.
+bool
+MachineBlockPlacement::canMoveBottomBlockToTop(
+ const MachineBasicBlock *BottomBlock,
+ const MachineBasicBlock *OldTop) {
+ if (BottomBlock->pred_size() != 1)
+ return true;
+ MachineBasicBlock *Pred = *BottomBlock->pred_begin();
+ if (Pred->succ_size() != 2)
+ return true;
+
+ MachineBasicBlock *OtherBB = *Pred->succ_begin();
+ if (OtherBB == BottomBlock)
+ OtherBB = *Pred->succ_rbegin();
+ if (OtherBB == OldTop)
+ return false;
+
+ return true;
+}
+
+ // Find the possible fallthrough frequency to the top of a loop.
+BlockFrequency
+MachineBlockPlacement::TopFallThroughFreq(
+ const MachineBasicBlock *Top,
+ const BlockFilterSet &LoopBlockSet) {
+ BlockFrequency MaxFreq = 0;
+ for (MachineBasicBlock *Pred : Top->predecessors()) {
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (!LoopBlockSet.count(Pred) &&
+ (!PredChain || Pred == *std::prev(PredChain->end()))) {
+ // Found a Pred block that can be placed before Top.
+ // Check if Top is the best successor of Pred.
+ auto TopProb = MBPI->getEdgeProbability(Pred, Top);
+ bool TopOK = true;
+ for (MachineBasicBlock *Succ : Pred->successors()) {
+ auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
+ BlockChain *SuccChain = BlockToChain[Succ];
+ // Check if Succ can be placed after Pred.
+ // Succ should not be in any chain, or it is the head of some chain.
+ if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
+ (!SuccChain || Succ == *SuccChain->begin())) {
+ TopOK = false;
+ break;
+ }
+ }
+ if (TopOK) {
+ BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
+ MBPI->getEdgeProbability(Pred, Top);
+ if (EdgeFreq > MaxFreq)
+ MaxFreq = EdgeFreq;
+ }
+ }
+ }
+ return MaxFreq;
+}
+
+ // Compute the fallthrough gains when moving NewTop before OldTop.
+//
+// In following diagram, edges marked as "-" are reduced fallthrough, edges
+// marked as "+" are increased fallthrough, this function computes
+//
+// SUM(increased fallthrough) - SUM(decreased fallthrough)
+//
+// |
+// | -
+// V
+// --->OldTop
+// | .
+// | .
+// +| . +
+// | Pred --->
+// | |-
+// | V
+// --- NewTop <---
+// |-
+// V
+//
+BlockFrequency
+MachineBlockPlacement::FallThroughGains(
+ const MachineBasicBlock *NewTop,
+ const MachineBasicBlock *OldTop,
+ const MachineBasicBlock *ExitBB,
+ const BlockFilterSet &LoopBlockSet) {
+ BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
+ BlockFrequency FallThrough2Exit = 0;
+ if (ExitBB)
+ FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
+ MBPI->getEdgeProbability(NewTop, ExitBB);
+ BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
+ MBPI->getEdgeProbability(NewTop, OldTop);
+
+ // Find the best Pred of NewTop.
+ MachineBasicBlock *BestPred = nullptr;
+ BlockFrequency FallThroughFromPred = 0;
+ for (MachineBasicBlock *Pred : NewTop->predecessors()) {
+ if (!LoopBlockSet.count(Pred))
+ continue;
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (!PredChain || Pred == *std::prev(PredChain->end())) {
+ BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
+ MBPI->getEdgeProbability(Pred, NewTop);
+ if (EdgeFreq > FallThroughFromPred) {
+ FallThroughFromPred = EdgeFreq;
+ BestPred = Pred;
+ }
+ }
+ }
+
+ // If NewTop is not placed after Pred, another successor can be placed
+ // after Pred.
+ BlockFrequency NewFreq = 0;
+ if (BestPred) {
+ for (MachineBasicBlock *Succ : BestPred->successors()) {
+ if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
+ continue;
+ if (ComputedEdges.contains(Succ))
+ continue;
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if ((SuccChain && (Succ != *SuccChain->begin())) ||
+ (SuccChain == BlockToChain[BestPred]))
+ continue;
+ BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
+ MBPI->getEdgeProbability(BestPred, Succ);
+ if (EdgeFreq > NewFreq)
+ NewFreq = EdgeFreq;
+ }
+ BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
+ MBPI->getEdgeProbability(BestPred, NewTop);
+ if (NewFreq > OrigEdgeFreq) {
+ // If NewTop is not the best successor of Pred, then Pred doesn't fall
+ // through to NewTop, so there is no FallThroughFromPred and no NewFreq
+ // gain.
+ NewFreq = 0;
+ FallThroughFromPred = 0;
+ }
+ }
+
+ BlockFrequency Result = 0;
+ BlockFrequency Gains = BackEdgeFreq + NewFreq;
+ BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
+ FallThroughFromPred;
+ if (Gains > Lost)
+ Result = Gains - Lost;
+ return Result;
+}
+
+/// Helper function of findBestLoopTop. Find the best loop top block
+/// from predecessors of old top.
+///
+/// Look for a block which is strictly better than the old top for laying
+/// out before the old top of the loop. This looks for only two patterns:
+///
+/// 1. a block has only one successor, the old loop top
+///
+/// Because such a block will always result in an unconditional jump,
+/// rotating it in front of the old top is always profitable.
+///
+ /// 2. a block has two successors, one is the old top, the other is the loop
+ /// exit, and it has more than one predecessor
+ ///
+ /// If it is below one of its predecessors P, only P can fall through to
+ /// it; all other predecessors need a jump to it, plus another conditional
+ /// jump to the loop header. If it is moved before the loop header, all its
+ /// predecessors jump to it and then fall through to the loop header. So all
+ /// its predecessors except P save one taken branch.
+ /// At the same time, moving it before the old top increases the taken
+ /// branch to the loop exit block, so the saved taken branches are weighed
+ /// against the increased taken branch to the loop exit block.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopTopHelper(
+ MachineBasicBlock *OldTop,
+ const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet) {
+ // Check that the header hasn't been fused with a preheader block due to
+ // crazy branches. If it has, we need to start with the header at the top to
+ // prevent pulling the preheader into the loop body.
+ BlockChain &HeaderChain = *BlockToChain[OldTop];
+ if (!LoopBlockSet.count(*HeaderChain.begin()))
+ return OldTop;
+ if (OldTop != *HeaderChain.begin())
+ return OldTop;
+
+ LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
+ << "\n");
+
+ BlockFrequency BestGains = 0;
+ MachineBasicBlock *BestPred = nullptr;
+ for (MachineBasicBlock *Pred : OldTop->predecessors()) {
+ if (!LoopBlockSet.count(Pred))
+ continue;
+ if (Pred == L.getHeader())
+ continue;
+ LLVM_DEBUG(dbgs() << " old top pred: " << getBlockName(Pred) << ", has "
+ << Pred->succ_size() << " successors, ";
+ MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
+ if (Pred->succ_size() > 2)
+ continue;
+
+ MachineBasicBlock *OtherBB = nullptr;
+ if (Pred->succ_size() == 2) {
+ OtherBB = *Pred->succ_begin();
+ if (OtherBB == OldTop)
+ OtherBB = *Pred->succ_rbegin();
+ }
+
+ if (!canMoveBottomBlockToTop(Pred, OldTop))
+ continue;
+
+ BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
+ LoopBlockSet);
+ if ((Gains > 0) && (Gains > BestGains ||
+ ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
+ BestPred = Pred;
+ BestGains = Gains;
+ }
+ }
+
+ // If no direct predecessor is fine, just use the loop header.
+ if (!BestPred) {
+ LLVM_DEBUG(dbgs() << " final top unchanged\n");
+ return OldTop;
+ }
+
+ // Walk backwards through any straight line of predecessors.
+ while (BestPred->pred_size() == 1 &&
+ (*BestPred->pred_begin())->succ_size() == 1 &&
+ *BestPred->pred_begin() != L.getHeader())
+ BestPred = *BestPred->pred_begin();
+
+ LLVM_DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
+ return BestPred;
+}
+
+/// Find the best loop top block for layout.
+///
+/// This function iteratively calls findBestLoopTopHelper, until no new better
+/// BB can be found.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet) {
+ // Placing the latch block before the header may introduce an extra branch
+ // that skips this block the first time the loop is executed, which we want
+ // to avoid when optimising for size.
+ // FIXME: in theory there is a case that does not introduce a new branch,
+ // i.e. when the layout predecessor does not fallthrough to the loop header.
+ // In practice this never happens though: there always seems to be a preheader
+ // that can fallthrough and that is also placed before the header.
+ bool OptForSize = F->getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
+ if (OptForSize)
+ return L.getHeader();
+
+ MachineBasicBlock *OldTop = nullptr;
+ MachineBasicBlock *NewTop = L.getHeader();
+ while (NewTop != OldTop) {
+ OldTop = NewTop;
+ NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
+ if (NewTop != OldTop)
+ ComputedEdges[NewTop] = { OldTop, false };
+ }
+ return NewTop;
+}
+
+/// Find the best loop exiting block for layout.
+///
+ /// This routine implements the logic to analyze the loop looking for the best
+ /// block to use as the exiting block when the loop chain is rotated. Typically
+ /// this is done to maximize fallthrough opportunities out of the loop.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet,
+ BlockFrequency &ExitFreq) {
+ // We don't want to layout the loop linearly in all cases. If the loop header
+ // is just a normal basic block in the loop, we want to look for what block
+ // within the loop is the best one to layout at the top. However, if the loop
+ // header has been pre-merged into a chain due to predecessors not having
+ // analyzable branches, *and* the predecessor it is merged with is *not* part
+ // of the loop, rotating the header into the middle of the loop will create
+ // a non-contiguous range of blocks which is Very Bad. So start with the
+ // header and only rotate if safe.
+ BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
+ if (!LoopBlockSet.count(*HeaderChain.begin()))
+ return nullptr;
+
+ BlockFrequency BestExitEdgeFreq;
+ unsigned BestExitLoopDepth = 0;
+ MachineBasicBlock *ExitingBB = nullptr;
+ // If there are exits to outer loops, loop rotation can severely limit
+ // fallthrough opportunities unless it selects such an exit. Keep a set of
+ // blocks where rotating to exit with that block will reach an outer loop.
+ SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
+
+ LLVM_DEBUG(dbgs() << "Finding best loop exit for: "
+ << getBlockName(L.getHeader()) << "\n");
+ for (MachineBasicBlock *MBB : L.getBlocks()) {
+ BlockChain &Chain = *BlockToChain[MBB];
+ // Ensure that this block is at the end of a chain; otherwise it could be
+ // mid-way through an inner loop or a successor of an unanalyzable branch.
+ if (MBB != *std::prev(Chain.end()))
+ continue;
+
+ // Now walk the successors. We need to establish whether this has a viable
+ // exiting successor and whether it has a viable non-exiting successor.
+ // We store the old exiting state and restore it if a viable looping
+ // successor isn't found.
+ MachineBasicBlock *OldExitingBB = ExitingBB;
+ BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
+ bool HasLoopingSucc = false;
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ->isEHPad())
+ continue;
+ if (Succ == MBB)
+ continue;
+ BlockChain &SuccChain = *BlockToChain[Succ];
+ // Don't split chains, either this chain or the successor's chain.
+ if (&Chain == &SuccChain) {
+ LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " (chain conflict)\n");
+ continue;
+ }
+
+ auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
+ if (LoopBlockSet.count(Succ)) {
+ LLVM_DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " (" << SuccProb << ")\n");
+ HasLoopingSucc = true;
+ continue;
+ }
+
+ unsigned SuccLoopDepth = 0;
+ if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
+ SuccLoopDepth = ExitLoop->getLoopDepth();
+ if (ExitLoop->contains(&L))
+ BlocksExitingToOuterLoop.insert(MBB);
+ }
+
+ BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
+ LLVM_DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " [L:" << SuccLoopDepth
+ << "] (";
+ MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
+ // Note that we bias this toward an existing layout successor to retain
+ // incoming order in the absence of better information. The exit must have
+ // a frequency higher than the current exit before we consider breaking
+ // the layout.
+ BranchProbability Bias(100 - ExitBlockBias, 100);
+ if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
+ ExitEdgeFreq > BestExitEdgeFreq ||
+ (MBB->isLayoutSuccessor(Succ) &&
+ !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
+ BestExitEdgeFreq = ExitEdgeFreq;
+ ExitingBB = MBB;
+ }
+ }
+
+ if (!HasLoopingSucc) {
+ // Restore the old exiting state, no viable looping successor was found.
+ ExitingBB = OldExitingBB;
+ BestExitEdgeFreq = OldBestExitEdgeFreq;
+ }
+ }
+ // Without a candidate exiting block or with only a single block in the
+ // loop, just use the loop header to layout the loop.
+ if (!ExitingBB) {
+ LLVM_DEBUG(
+ dbgs() << " No other candidate exit blocks, using loop header\n");
+ return nullptr;
+ }
+ if (L.getNumBlocks() == 1) {
+ LLVM_DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n");
+ return nullptr;
+ }
+
+ // Also, if we have exit blocks which lead to outer loops but didn't select
+ // one of them as the exiting block we are rotating toward, disable loop
+ // rotation altogether.
+ if (!BlocksExitingToOuterLoop.empty() &&
+ !BlocksExitingToOuterLoop.count(ExitingBB))
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB)
+ << "\n");
+ ExitFreq = BestExitEdgeFreq;
+ return ExitingBB;
+}
+
+/// Check if there is a fallthrough to loop header Top.
+///
+ /// 1. Look for a Pred that can be laid out before Top.
+ /// 2. Check if Top is the most probable successor of Pred.
+bool
+MachineBlockPlacement::hasViableTopFallthrough(
+ const MachineBasicBlock *Top,
+ const BlockFilterSet &LoopBlockSet) {
+ for (MachineBasicBlock *Pred : Top->predecessors()) {
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (!LoopBlockSet.count(Pred) &&
+ (!PredChain || Pred == *std::prev(PredChain->end()))) {
+ // Found a Pred block that can be placed before Top.
+ // Check if Top is the best successor of Pred.
+ auto TopProb = MBPI->getEdgeProbability(Pred, Top);
+ bool TopOK = true;
+ for (MachineBasicBlock *Succ : Pred->successors()) {
+ auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
+ BlockChain *SuccChain = BlockToChain[Succ];
+ // Check if Succ can be placed after Pred.
+ // Succ should not be in any chain, or it is the head of some chain.
+ if ((!SuccChain || Succ == *SuccChain->begin()) && SuccProb > TopProb) {
+ TopOK = false;
+ break;
+ }
+ }
+ if (TopOK)
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Attempt to rotate an exiting block to the bottom of the loop.
+///
+/// Once we have built a chain, try to rotate it to line up the hot exit block
+/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
+/// branches. For example, if the loop has fallthrough into its header and out
+/// of its bottom already, don't rotate it.
+void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
+ const MachineBasicBlock *ExitingBB,
+ BlockFrequency ExitFreq,
+ const BlockFilterSet &LoopBlockSet) {
+ if (!ExitingBB)
+ return;
+
+ MachineBasicBlock *Top = *LoopChain.begin();
+ MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
+
+ // If ExitingBB is already the last one in a chain then nothing to do.
+ if (Bottom == ExitingBB)
+ return;
+
+ // The entry block should always be the first BB in a function.
+ if (Top->isEntryBlock())
+ return;
+
+ bool ViableTopFallthrough = hasViableTopFallthrough(Top, LoopBlockSet);
+
+ // If the header has viable fallthrough, check whether the current loop
+ // bottom is a viable exiting block. If so, bail out as rotating will
+ // introduce an unnecessary branch.
+ if (ViableTopFallthrough) {
+ for (MachineBasicBlock *Succ : Bottom->successors()) {
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (!LoopBlockSet.count(Succ) &&
+ (!SuccChain || Succ == *SuccChain->begin()))
+ return;
+ }
+
+ // Rotate will destroy the top fallthrough, we need to ensure the new exit
+ // frequency is larger than top fallthrough.
+ BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
+ if (FallThrough2Top >= ExitFreq)
+ return;
+ }
+
+ BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
+ if (ExitIt == LoopChain.end())
+ return;
+
+ // Rotating a loop exit to the bottom when there is a fallthrough to top
+ // trades the entry fallthrough for an exit fallthrough.
+ // If there is no bottom->top edge, but the chosen exit block does have
+ // a fallthrough, we break that fallthrough for nothing in return.
+
+ // Let's consider an example. We have a built chain of basic blocks
+ // B1, B2, ..., Bn, where Bk is a ExitingBB - chosen exit block.
+ // By doing a rotation we get
+ // Bk+1, ..., Bn, B1, ..., Bk
+ // Break of fallthrough to B1 is compensated by a fallthrough from Bk.
+ // If we had a fallthrough Bk -> Bk+1 it is broken now.
+ // It might be compensated by fallthrough Bn -> B1.
+ // So we have a condition to avoid creation of extra branch by loop rotation.
+ // All below must be true to avoid loop rotation:
+ // If there is a fallthrough to top (B1)
+ // There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
+ // There is no fallthrough from bottom (Bn) to top (B1).
+ // Please note that there is no exit fallthrough from Bn because we checked it
+ // above.
+ if (ViableTopFallthrough) {
+ assert(std::next(ExitIt) != LoopChain.end() &&
+ "Exit should not be last BB");
+ MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
+ if (ExitingBB->isSuccessor(NextBlockInChain))
+ if (!Bottom->isSuccessor(Top))
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
+ << " at bottom\n");
+ std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
+}
+
+/// Attempt to rotate a loop based on profile data to reduce branch cost.
+///
+/// With profile data, we can determine the cost in terms of missed fall through
+/// opportunities when rotating a loop chain and select the best rotation.
+/// Basically, there are three kinds of cost to consider for each rotation:
+/// 1. The possibly missed fall through edge (if it exists) from BB out of
+/// the loop to the loop header.
+/// 2. The possibly missed fall through edges (if they exist) from the loop
+/// exits to BB out of the loop.
+/// 3. The missed fall through edge (if it exists) from the last BB to the
+/// first BB in the loop chain.
+/// Therefore, the cost for a given rotation is the sum of costs listed above.
+/// We select the best rotation with the smallest cost.
+void MachineBlockPlacement::rotateLoopWithProfile(
+ BlockChain &LoopChain, const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet) {
+ auto RotationPos = LoopChain.end();
+ MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
+
+ // The entry block should always be the first BB in a function.
+ if (ChainHeaderBB->isEntryBlock())
+ return;
+
+ BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
+
+ // A utility lambda that scales up a block frequency by dividing it by a
+ // branch probability which is the reciprocal of the scale.
+ auto ScaleBlockFrequency = [](BlockFrequency Freq,
+ unsigned Scale) -> BlockFrequency {
+ if (Scale == 0)
+ return 0;
+ // Use operator / between BlockFrequency and BranchProbability to implement
+ // saturating multiplication.
+ return Freq / BranchProbability(1, Scale);
+ };
+
+ // Compute the cost of the missed fall-through edge to the loop header if the
+ // chain head is not the loop header. As we only consider natural loops with
+ // single header, this computation can be done only once.
+ BlockFrequency HeaderFallThroughCost(0);
+ for (auto *Pred : ChainHeaderBB->predecessors()) {
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (!LoopBlockSet.count(Pred) &&
+ (!PredChain || Pred == *std::prev(PredChain->end()))) {
+ auto EdgeFreq = MBFI->getBlockFreq(Pred) *
+ MBPI->getEdgeProbability(Pred, ChainHeaderBB);
+ auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
+ // If the predecessor has only an unconditional jump to the header, we
+ // need to consider the cost of this jump.
+ if (Pred->succ_size() == 1)
+ FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
+ HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
+ }
+ }
+
+ // Here we collect all exit blocks in the loop, and for each exit we find out
+ // its hottest exit edge. For each loop rotation, we define the loop exit cost
+ // as the sum of frequencies of exit edges we collect here, excluding the exit
+ // edge from the tail of the loop chain.
+ SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
+ for (auto *BB : LoopChain) {
+ auto LargestExitEdgeProb = BranchProbability::getZero();
+ for (auto *Succ : BB->successors()) {
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (!LoopBlockSet.count(Succ) &&
+ (!SuccChain || Succ == *SuccChain->begin())) {
+ auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
+ LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
+ }
+ }
+ if (LargestExitEdgeProb > BranchProbability::getZero()) {
+ auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
+ ExitsWithFreq.emplace_back(BB, ExitFreq);
+ }
+ }
+
+ // In this loop we iterate every block in the loop chain and calculate the
+ // cost assuming the block is the head of the loop chain. When the loop ends,
+ // we should have found the best candidate as the loop chain's head.
+ for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
+ EndIter = LoopChain.end();
+ Iter != EndIter; Iter++, TailIter++) {
+ // TailIter is used to track the tail of the loop chain if the block we are
+ // checking (pointed by Iter) is the head of the chain.
+ if (TailIter == LoopChain.end())
+ TailIter = LoopChain.begin();
+
+ auto TailBB = *TailIter;
+
+ // Calculate the cost by putting this BB to the top.
+ BlockFrequency Cost = 0;
+
+ // If the current BB is the loop header, we need to take into account the
+ // cost of the missed fall through edge from outside of the loop to the
+ // header.
+ if (Iter != LoopChain.begin())
+ Cost += HeaderFallThroughCost;
+
+ // Collect the loop exit cost by summing up frequencies of all exit edges
+ // except the one from the chain tail.
+ for (auto &ExitWithFreq : ExitsWithFreq)
+ if (TailBB != ExitWithFreq.first)
+ Cost += ExitWithFreq.second;
+
+ // The cost of breaking the formerly fall-through edge from the tail to the
+ // top of the loop chain. Here we need to consider three cases:
+ // 1. If the tail node has only one successor, then we will get an
+ // additional jmp instruction. So the cost here is (MisfetchCost +
+ // JumpInstCost) * tail node frequency.
+ // 2. If the tail node has two successors, then we may still get an
+ // additional jmp instruction if the layout successor after the loop
+ // chain is not its CFG successor. Note that the more frequently executed
+ // jmp instruction will be put ahead of the other one. Assume the
+ // frequencies of those two branches are x and y, where x is the frequency
+ // of the edge to the chain head; then the cost will be
+ // (x * MisfetchCost + min(x, y) * JumpInstCost) * tail node frequency.
+ // 3. If the tail node has more than two successors (this rarely happens),
+ // we won't consider any additional cost.
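+ // Worked example for case 2 (hypothetical numbers): with a tail frequency
+ // of 100 and a 0.75 probability edge from the tail to the candidate head,
+ // TailToHeadFreq is 75 and the colder edge frequency is 25, giving an added
+ // cost of 75 * MisfetchCost + 25 * JumpInstCost.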
+ if (TailBB->isSuccessor(*Iter)) {
+ auto TailBBFreq = MBFI->getBlockFreq(TailBB);
+ if (TailBB->succ_size() == 1)
+ Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
+ MisfetchCost + JumpInstCost);
+ else if (TailBB->succ_size() == 2) {
+ auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
+ auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
+ auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
+ ? TailBBFreq * TailToHeadProb.getCompl()
+ : TailToHeadFreq;
+ Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
+ ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "The cost of loop rotation by making "
+ << getBlockName(*Iter)
+ << " to the top: " << Cost.getFrequency() << "\n");
+
+ if (Cost < SmallestRotationCost) {
+ SmallestRotationCost = Cost;
+ RotationPos = Iter;
+ }
+ }
+
+ if (RotationPos != LoopChain.end()) {
+ LLVM_DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
+ << " to the top\n");
+ std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
+ }
+}
+
+/// Collect blocks in the given loop that are to be placed.
+///
+/// When profile data is available, exclude cold blocks from the returned set;
+/// otherwise, collect all blocks in the loop.
+MachineBlockPlacement::BlockFilterSet
+MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
+ BlockFilterSet LoopBlockSet;
+
+ // Filter cold blocks off from LoopBlockSet when profile data is available.
+ // Collect the sum of frequencies of incoming edges to the loop header from
+ // outside. If we treat the loop as a super block, this is the frequency of
+ // the loop. Then for each block in the loop, we calculate the ratio between
+ // its frequency and the loop frequency. When the ratio is too small, we
+ // don't add the block to the loop chain. If there are outer loops, this block
+ // will be merged into the first outer loop chain for which this block is not
+ // cold anymore. This needs precise profile data and we only do this when
+ // profile data is available.
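+ // For example (hypothetical numbers): with an incoming loop frequency of
+ // 1000 and LoopToColdBlockRatio == 5, blocks whose frequency is below
+ // roughly 200 are considered cold and left for an enclosing loop's chain.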
+ if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
+ BlockFrequency LoopFreq(0);
+ for (auto *LoopPred : L.getHeader()->predecessors())
+ if (!L.contains(LoopPred))
+ LoopFreq += MBFI->getBlockFreq(LoopPred) *
+ MBPI->getEdgeProbability(LoopPred, L.getHeader());
+
+ for (MachineBasicBlock *LoopBB : L.getBlocks()) {
+ if (LoopBlockSet.count(LoopBB))
+ continue;
+ auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
+ if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
+ continue;
+ BlockChain *Chain = BlockToChain[LoopBB];
+ for (MachineBasicBlock *ChainBB : *Chain)
+ LoopBlockSet.insert(ChainBB);
+ }
+ } else
+ LoopBlockSet.insert(L.block_begin(), L.block_end());
+
+ return LoopBlockSet;
+}
+
+/// Forms basic block chains from the natural loop structures.
+///
+/// These chains are designed to preserve the existing *structure* of the code
+/// as much as possible. We can then stitch the chains together in a way which
+/// both preserves the topological structure and minimizes taken conditional
+/// branches.
+void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
+ // First recurse through any nested loops, building chains for those inner
+ // loops.
+ for (const MachineLoop *InnerLoop : L)
+ buildLoopChains(*InnerLoop);
+
+ assert(BlockWorkList.empty() &&
+ "BlockWorkList not empty when starting to build loop chains.");
+ assert(EHPadWorkList.empty() &&
+ "EHPadWorkList not empty when starting to build loop chains.");
+ BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
+
+ // Check if we have profile data for this function. If yes, we will rotate
+ // this loop by modeling costs more precisely which requires the profile data
+ // for better layout.
+ bool RotateLoopWithProfile =
+ ForcePreciseRotationCost ||
+ (PreciseRotationCost && F->getFunction().hasProfileData());
+
+ // First check to see if there is an obviously preferable top block for the
+ // loop. This will default to the header, but may end up as one of the
+ // predecessors to the header if there is one which will result in strictly
+ // fewer branches in the loop body.
+ MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
+
+ // If we selected just the header for the loop top, look for a potentially
+ // profitable exit block in the event that rotating the loop can eliminate
+ // branches by placing an exit edge at the bottom.
+ //
+ // Loops are processed innermost to outermost, so make sure we clear
+ // PreferredLoopExit before processing a new loop.
+ PreferredLoopExit = nullptr;
+ BlockFrequency ExitFreq;
+ if (!RotateLoopWithProfile && LoopTop == L.getHeader())
+ PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
+
+ BlockChain &LoopChain = *BlockToChain[LoopTop];
+
+ // FIXME: This is a really lame way of walking the chains in the loop: we
+ // walk the blocks, and use a set to prevent visiting a particular chain
+ // twice.
+ SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+ assert(LoopChain.UnscheduledPredecessors == 0 &&
+ "LoopChain should not have unscheduled predecessors.");
+ UpdatedPreds.insert(&LoopChain);
+
+ for (const MachineBasicBlock *LoopBB : LoopBlockSet)
+ fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
+
+ buildChain(LoopTop, LoopChain, &LoopBlockSet);
+
+ if (RotateLoopWithProfile)
+ rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
+ else
+ rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
+
+ LLVM_DEBUG({
+ // Crash at the end so we get all of the debugging output first.
+ bool BadLoop = false;
+ if (LoopChain.UnscheduledPredecessors) {
+ BadLoop = true;
+ dbgs() << "Loop chain contains a block without its preds placed!\n"
+ << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
+ << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
+ }
+ for (MachineBasicBlock *ChainBB : LoopChain) {
+ dbgs() << " ... " << getBlockName(ChainBB) << "\n";
+ if (!LoopBlockSet.remove(ChainBB)) {
+ // We don't mark the loop as bad here because there are real situations
+ // where this can occur. For example, with an unanalyzable fallthrough
+ // from a loop block to a non-loop block or vice versa.
+ dbgs() << "Loop chain contains a block not contained by the loop!\n"
+ << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
+ << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
+ << " Bad block: " << getBlockName(ChainBB) << "\n";
+ }
+ }
+
+ if (!LoopBlockSet.empty()) {
+ BadLoop = true;
+ for (const MachineBasicBlock *LoopBB : LoopBlockSet)
+ dbgs() << "Loop contains blocks never placed into a chain!\n"
+ << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
+ << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
+ << " Bad block: " << getBlockName(LoopBB) << "\n";
+ }
+ assert(!BadLoop && "Detected problems with the placement of this loop.");
+ });
+
+ BlockWorkList.clear();
+ EHPadWorkList.clear();
+}
+
+void MachineBlockPlacement::buildCFGChains() {
+ // Ensure that every BB in the function has an associated chain to simplify
+ // the assumptions of the remaining algorithm.
+ SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+ for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
+ ++FI) {
+ MachineBasicBlock *BB = &*FI;
+ BlockChain *Chain =
+ new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
+ // Also, merge any blocks which we cannot reason about and must preserve
+ // the exact fallthrough behavior for.
+ while (true) {
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+ if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
+ break;
+
+ MachineFunction::iterator NextFI = std::next(FI);
+ MachineBasicBlock *NextBB = &*NextFI;
+ // Ensure that the layout successor is a viable block, as we know that
+ // fallthrough is a possibility.
+ assert(NextFI != FE && "Can't fallthrough past the last block.");
+ LLVM_DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
+ << getBlockName(BB) << " -> " << getBlockName(NextBB)
+ << "\n");
+ Chain->merge(NextBB, nullptr);
+#ifndef NDEBUG
+ BlocksWithUnanalyzableExits.insert(&*BB);
+#endif
+ FI = NextFI;
+ BB = NextBB;
+ }
+ }
+
+ // Build any loop-based chains.
+ PreferredLoopExit = nullptr;
+ for (MachineLoop *L : *MLI)
+ buildLoopChains(*L);
+
+ assert(BlockWorkList.empty() &&
+ "BlockWorkList should be empty before building final chain.");
+ assert(EHPadWorkList.empty() &&
+ "EHPadWorkList should be empty before building final chain.");
+
+ SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+ for (MachineBasicBlock &MBB : *F)
+ fillWorkLists(&MBB, UpdatedPreds);
+
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
+ buildChain(&F->front(), FunctionChain);
+
+#ifndef NDEBUG
+ using FunctionBlockSetType = SmallPtrSet<MachineBasicBlock *, 16>;
+#endif
+ LLVM_DEBUG({
+ // Crash at the end so we get all of the debugging output first.
+ bool BadFunc = false;
+ FunctionBlockSetType FunctionBlockSet;
+ for (MachineBasicBlock &MBB : *F)
+ FunctionBlockSet.insert(&MBB);
+
+ for (MachineBasicBlock *ChainBB : FunctionChain)
+ if (!FunctionBlockSet.erase(ChainBB)) {
+ BadFunc = true;
+ dbgs() << "Function chain contains a block not in the function!\n"
+ << " Bad block: " << getBlockName(ChainBB) << "\n";
+ }
+
+ if (!FunctionBlockSet.empty()) {
+ BadFunc = true;
+ for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
+ dbgs() << "Function contains blocks never placed into a chain!\n"
+ << " Bad block: " << getBlockName(RemainingBB) << "\n";
+ }
+ assert(!BadFunc && "Detected problems with the block placement.");
+ });
+
+ // Remember original layout ordering, so we can update terminators after
+ // reordering to point to the original layout successor.
+ SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
+ F->getNumBlockIDs());
+ {
+ MachineBasicBlock *LastMBB = nullptr;
+ for (auto &MBB : *F) {
+ if (LastMBB != nullptr)
+ OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
+ LastMBB = &MBB;
+ }
+ OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
+ }
+
+ // Splice the blocks into place.
+ MachineFunction::iterator InsertPos = F->begin();
+ LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
+ for (MachineBasicBlock *ChainBB : FunctionChain) {
+ LLVM_DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
+ : " ... ")
+ << getBlockName(ChainBB) << "\n");
+ if (InsertPos != MachineFunction::iterator(ChainBB))
+ F->splice(InsertPos, ChainBB);
+ else
+ ++InsertPos;
+
+ // Update the terminator of the previous block.
+ if (ChainBB == *FunctionChain.begin())
+ continue;
+ MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
+
+ // FIXME: It would be awesome if updateTerminator would just return rather
+ // than assert when the branch cannot be analyzed in order to remove this
+ // boilerplate.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+
+#ifndef NDEBUG
+ if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
+ // Given the exact block placement we chose, we may actually not _need_ to
+ // be able to edit PrevBB's terminator sequence, but not being _able_ to
+ // do that at this point is a bug.
+ assert((!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) ||
+ !PrevBB->canFallThrough()) &&
+ "Unexpected block with un-analyzable fallthrough!");
+ Cond.clear();
+ TBB = FBB = nullptr;
+ }
+#endif
+
+ // The "PrevBB" is not yet updated to reflect the current code layout, so
+ // o. it may fall through to a block without an explicit branch before
+ // layout and no longer fall through to it after layout; or
+ // o. just the opposite.
+ //
+ // analyzeBranch() may return an erroneous value for FBB when these two
+ // situations take place. In the first scenario FBB is mistakenly set to
+ // null; in the second scenario FBB, which is expected to be null, is
+ // mistakenly pointing to "*BI".
+ // Thus, if a future change needs to use FBB before the layout is finalized,
+ // it has to correct FBB first with code similar to the following:
+ //
+ // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
+ // PrevBB->updateTerminator();
+ // Cond.clear();
+ // TBB = FBB = nullptr;
+ // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+ // // FIXME: This should never take place.
+ // TBB = FBB = nullptr;
+ // }
+ // }
+ if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+ PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
+ }
+ }
+
+ // Fixup the last block.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+ if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
+ MachineBasicBlock *PrevBB = &F->back();
+ PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
+ }
+
+ BlockWorkList.clear();
+ EHPadWorkList.clear();
+}
+
+void MachineBlockPlacement::optimizeBranches() {
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
+ SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+
+ // Now that all the basic blocks in the chain have the proper layout,
+ // make a final call to analyzeBranch with AllowModify set.
+ // Indeed, the target may be able to optimize the branches in a way we
+ // cannot because all branches may not be analyzable.
+ // E.g., the target may be able to remove an unconditional branch to
+ // a fallthrough when it occurs after predicated terminators.
+ for (MachineBasicBlock *ChainBB : FunctionChain) {
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+ if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
+ // If ChainBB has a two-way branch, try to re-order the branches
+ // such that we branch to the successor with the higher probability first.
+ if (TBB && !Cond.empty() && FBB &&
+ MBPI->getEdgeProbability(ChainBB, FBB) >
+ MBPI->getEdgeProbability(ChainBB, TBB) &&
+ !TII->reverseBranchCondition(Cond)) {
+ LLVM_DEBUG(dbgs() << "Reverse order of the two branches: "
+ << getBlockName(ChainBB) << "\n");
+ LLVM_DEBUG(dbgs() << " Edge probability: "
+ << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
+ << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
+ DebugLoc dl; // FIXME: this is nowhere
+ TII->removeBranch(*ChainBB);
+ TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
+ }
+ }
+ }
+}
+
+void MachineBlockPlacement::alignBlocks() {
+ // Walk through the backedges of the function now that we have fully laid out
+ // the basic blocks and align the destination of each backedge. We don't rely
+ // exclusively on the loop info here so that we can align backedges in
+ // unnatural CFGs and backedges that were introduced purely because of the
+ // loop rotations done during this layout pass.
+ if (F->getFunction().hasMinSize() ||
+ (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize()))
+ return;
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
+ if (FunctionChain.begin() == FunctionChain.end())
+ return; // Empty chain.
+
+ const BranchProbability ColdProb(1, 5); // 20%
+ BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
+ BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
+ for (MachineBasicBlock *ChainBB : FunctionChain) {
+ if (ChainBB == *FunctionChain.begin())
+ continue;
+
+ // Don't align non-looping basic blocks. These are unlikely to execute
+ // enough times to matter in practice. Note that we'll still handle
+ // unnatural CFGs inside of a natural outer loop (the common case) and
+ // rotated loops.
+ MachineLoop *L = MLI->getLoopFor(ChainBB);
+ if (!L)
+ continue;
+
+ const Align Align = TLI->getPrefLoopAlignment(L);
+ if (Align == 1)
+ continue; // Don't care about loop alignment.
+
+ // If the block is cold relative to the function entry don't waste space
+ // aligning it.
+ BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
+ if (Freq < WeightedEntryFreq)
+ continue;
+
+ // If the block is cold relative to its loop header, don't align it
+ // regardless of what edges into the block exist.
+ MachineBasicBlock *LoopHeader = L->getHeader();
+ BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
+ if (Freq < (LoopHeaderFreq * ColdProb))
+ continue;
+
+ // If the global profile indicates so, don't align it.
+ if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
+ !TLI->alignLoopsWithOptSize())
+ continue;
+
+ // Check for the existence of a non-layout predecessor which would benefit
+ // from aligning this block.
+ MachineBasicBlock *LayoutPred =
+ &*std::prev(MachineFunction::iterator(ChainBB));
+
+ auto DetermineMaxAlignmentPadding = [&]() {
+ // Set the maximum bytes allowed to be emitted for alignment.
+ unsigned MaxBytes;
+ if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
+ MaxBytes = MaxBytesForAlignmentOverride;
+ else
+ MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
+ ChainBB->setMaxBytesForAlignment(MaxBytes);
+ };
+
+ // Force alignment if all the predecessors are jumps. We already checked
+ // that the block isn't cold above.
+ if (!LayoutPred->isSuccessor(ChainBB)) {
+ ChainBB->setAlignment(Align);
+ DetermineMaxAlignmentPadding();
+ continue;
+ }
+
+ // Align this block if the layout predecessor's edge into this block is
+ // cold relative to the block. When this is true, other predecessors make up
+ // all of the hot entries into the block and thus alignment is likely to be
+ // important.
+ BranchProbability LayoutProb =
+ MBPI->getEdgeProbability(LayoutPred, ChainBB);
+ BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
+ if (LayoutEdgeFreq <= (Freq * ColdProb)) {
+ ChainBB->setAlignment(Align);
+ DetermineMaxAlignmentPadding();
+ }
+ }
+}
+
+/// Tail duplicate \p BB into (some) predecessors if profitable, repeating if
+/// it was duplicated into its chain predecessor and removed.
+/// \p BB - Basic block that may be duplicated.
+///
+/// \p LPred - Chosen layout predecessor of \p BB.
+/// Updated to be the chain end if LPred is removed.
+/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
+/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
+/// Used to identify which blocks to update predecessor
+/// counts.
+/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
+/// chosen in the given order due to an unnatural CFG;
+/// only needed if \p BB is removed and
+/// \p PrevUnplacedBlockIt pointed to \p BB.
+/// @return true if \p BB was removed.
+bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
+ MachineBasicBlock *BB, MachineBasicBlock *&LPred,
+ const MachineBasicBlock *LoopHeaderBB,
+ BlockChain &Chain, BlockFilterSet *BlockFilter,
+ MachineFunction::iterator &PrevUnplacedBlockIt) {
+ bool Removed, DuplicatedToLPred;
+ bool DuplicatedToOriginalLPred;
+ Removed = maybeTailDuplicateBlock(BB, LPred, Chain, BlockFilter,
+ PrevUnplacedBlockIt,
+ DuplicatedToLPred);
+ if (!Removed)
+ return false;
+ DuplicatedToOriginalLPred = DuplicatedToLPred;
+ // Iteratively try to duplicate again. It can happen that the block that was
+ // duplicated into is still small enough to be duplicated again.
+ // No need to call markBlockSuccessors in this case, as the blocks being
+ // duplicated from here on are already scheduled.
+ while (DuplicatedToLPred && Removed) {
+ MachineBasicBlock *DupBB, *DupPred;
+ // The removal callback causes Chain.end() to be updated when a block is
+ // removed. On the first pass through the loop, the chain end should be the
+ // same as it was on function entry. On subsequent passes, because we are
+ // duplicating the block at the end of the chain, if it is removed the
+ // chain will have shrunk by one block.
+ BlockChain::iterator ChainEnd = Chain.end();
+ DupBB = *(--ChainEnd);
+ // Now try to duplicate again.
+ if (ChainEnd == Chain.begin())
+ break;
+ DupPred = *std::prev(ChainEnd);
+ Removed = maybeTailDuplicateBlock(DupBB, DupPred, Chain, BlockFilter,
+ PrevUnplacedBlockIt,
+ DuplicatedToLPred);
+ }
+ // If BB was duplicated into LPred, it is now scheduled. But because it was
+ // removed, markChainSuccessors won't be called for its chain. Instead we
+ // call markBlockSuccessors for LPred to achieve the same effect. This must go
+ // at the end because repeating the tail duplication can increase the number
+ // of unscheduled predecessors.
+ LPred = *std::prev(Chain.end());
+ if (DuplicatedToOriginalLPred)
+ markBlockSuccessors(Chain, LPred, LoopHeaderBB, BlockFilter);
+ return true;
+}
+
+/// Tail duplicate \p BB into (some) predecessors if profitable.
+/// \p BB - Basic block that may be duplicated
+/// \p LPred - Chosen layout predecessor of \p BB
+/// \p Chain - Chain to which \p LPred belongs, and \p BB will belong.
+/// \p BlockFilter - Set of blocks that belong to the loop being laid out.
+/// Used to identify which blocks to update predecessor
+/// counts.
+/// \p PrevUnplacedBlockIt - Iterator pointing to the last block that was
+/// chosen in the given order due to an unnatural CFG;
+/// only needed if \p BB is removed and
+/// \p PrevUnplacedBlockIt pointed to \p BB.
+/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
+/// \return - True if the block was duplicated into all preds and removed.
+bool MachineBlockPlacement::maybeTailDuplicateBlock(
+ MachineBasicBlock *BB, MachineBasicBlock *LPred,
+ BlockChain &Chain, BlockFilterSet *BlockFilter,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ bool &DuplicatedToLPred) {
+ DuplicatedToLPred = false;
+ if (!shouldTailDuplicate(BB))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Redoing tail duplication for Succ#" << BB->getNumber()
+ << "\n");
+
+ // This has to be a callback because none of it can be done after
+ // BB is deleted.
+ bool Removed = false;
+ auto RemovalCallback =
+ [&](MachineBasicBlock *RemBB) {
+ // Signal to outer function
+ Removed = true;
+
+ // Conservative default.
+ bool InWorkList = true;
+ // Remove from the Chain and Chain Map
+ if (BlockToChain.count(RemBB)) {
+ BlockChain *Chain = BlockToChain[RemBB];
+ InWorkList = Chain->UnscheduledPredecessors == 0;
+ Chain->remove(RemBB);
+ BlockToChain.erase(RemBB);
+ }
+
+ // Handle the unplaced block iterator
+ if (&(*PrevUnplacedBlockIt) == RemBB) {
+ PrevUnplacedBlockIt++;
+ }
+
+ // Handle the Work Lists
+ if (InWorkList) {
+ SmallVectorImpl<MachineBasicBlock *> &RemoveList = BlockWorkList;
+ if (RemBB->isEHPad())
+ RemoveList = EHPadWorkList;
+ llvm::erase_value(RemoveList, RemBB);
+ }
+
+ // Handle the filter set
+ if (BlockFilter) {
+ BlockFilter->remove(RemBB);
+ }
+
+ // Remove the block from loop info.
+ MLI->removeBlock(RemBB);
+ if (RemBB == PreferredLoopExit)
+ PreferredLoopExit = nullptr;
+
+ LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: "
+ << getBlockName(RemBB) << "\n");
+ };
+ auto RemovalCallbackRef =
+ function_ref<void(MachineBasicBlock*)>(RemovalCallback);
+
+ SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+ bool IsSimple = TailDup.isSimpleBB(BB);
+ SmallVector<MachineBasicBlock *, 8> CandidatePreds;
+ SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
+ if (F->getFunction().hasProfileData()) {
+ // We can do partial duplication with precise profile information.
+ findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
+ if (CandidatePreds.size() == 0)
+ return false;
+ if (CandidatePreds.size() < BB->pred_size())
+ CandidatePtr = &CandidatePreds;
+ }
+ TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
+ &RemovalCallbackRef, CandidatePtr);
+
+ // Update UnscheduledPredecessors to reflect tail-duplication.
+ DuplicatedToLPred = false;
+ for (MachineBasicBlock *Pred : DuplicatedPreds) {
+ // We're only looking for unscheduled predecessors that match the filter.
+ BlockChain* PredChain = BlockToChain[Pred];
+ if (Pred == LPred)
+ DuplicatedToLPred = true;
+ if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred))
+ || PredChain == &Chain)
+ continue;
+ for (MachineBasicBlock *NewSucc : Pred->successors()) {
+ if (BlockFilter && !BlockFilter->count(NewSucc))
+ continue;
+ BlockChain *NewChain = BlockToChain[NewSucc];
+ if (NewChain != &Chain && NewChain != PredChain)
+ NewChain->UnscheduledPredecessors++;
+ }
+ }
+ return Removed;
+}
+
+// Count the number of actual machine instructions.
+static uint64_t countMBBInstruction(MachineBasicBlock *MBB) {
+ uint64_t InstrCount = 0;
+ for (MachineInstr &MI : *MBB) {
+ if (!MI.isPHI() && !MI.isMetaInstruction())
+ InstrCount += 1;
+ }
+ return InstrCount;
+}
+
+// The size cost of duplication is the instruction size of the duplicated block.
+// So we should scale the threshold accordingly. But the instruction size is not
+// available on all targets, so we use the number of instructions instead.
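+// For example, a block containing 10 real (non-PHI, non-meta) instructions
+// gets an effective duplication threshold of 10 * DupThreshold.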
+BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
+ return DupThreshold.getFrequency() * countMBBInstruction(BB);
+}
+
+// Returns true if BB is Pred's best successor.
+bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
+ MachineBasicBlock *Pred,
+ BlockFilterSet *BlockFilter) {
+ if (BB == Pred)
+ return false;
+ if (BlockFilter && !BlockFilter->count(Pred))
+ return false;
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (PredChain && (Pred != *std::prev(PredChain->end())))
+ return false;
+
+ // Find the successor with largest probability excluding BB.
+ BranchProbability BestProb = BranchProbability::getZero();
+ for (MachineBasicBlock *Succ : Pred->successors())
+ if (Succ != BB) {
+ if (BlockFilter && !BlockFilter->count(Succ))
+ continue;
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (SuccChain && (Succ != *SuccChain->begin()))
+ continue;
+ BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
+ if (SuccProb > BestProb)
+ BestProb = SuccProb;
+ }
+
+ BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
+ if (BBProb <= BestProb)
+ return false;
+
+ // Compute the number of reduced taken branches if Pred falls through to BB
+ // instead of another successor. Then compare it with threshold.
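+ // Illustrative (hypothetical numbers): with PredFreq == 100, an edge
+ // probability of 0.6 to BB and 0.3 to the next-best successor, the gain is
+ // 100 * (0.6 - 0.3) == 30 fewer taken branches, compared against the
+ // scaled threshold.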
+ BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
+ BlockFrequency Gain = PredFreq * (BBProb - BestProb);
+ return Gain > scaleThreshold(BB);
+}
+
+// Find the predecessors of BB into which BB can be beneficially duplicated.
+void MachineBlockPlacement::findDuplicateCandidates(
+ SmallVectorImpl<MachineBasicBlock *> &Candidates,
+ MachineBasicBlock *BB,
+ BlockFilterSet *BlockFilter) {
+ MachineBasicBlock *Fallthrough = nullptr;
+ BranchProbability DefaultBranchProb = BranchProbability::getZero();
+ BlockFrequency BBDupThreshold(scaleThreshold(BB));
+ SmallVector<MachineBasicBlock *, 8> Preds(BB->predecessors());
+ SmallVector<MachineBasicBlock *, 8> Succs(BB->successors());
+
+ // Sort for highest frequency.
+ auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
+ return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
+ };
+ auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
+ return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
+ };
+ llvm::stable_sort(Succs, CmpSucc);
+ llvm::stable_sort(Preds, CmpPred);
+
+ auto SuccIt = Succs.begin();
+ if (SuccIt != Succs.end()) {
+ DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
+ }
+
+ // For each predecessor of BB, compute the benefit of duplicating BB;
+ // if it is larger than the threshold, add the predecessor to Candidates.
+ //
+ // Suppose we have the following control flow:
+ //
+ // PB1 PB2 PB3 PB4
+ // \ | / /\
+ // \ | / / \
+ // \ |/ / \
+ // BB----/ OB
+ // /\
+ // / \
+ // SB1 SB2
+ //
+ // And it can be partially duplicated as
+ //
+ // PB2+BB
+ // | PB1 PB3 PB4
+ // | | / /\
+ // | | / / \
+ // | |/ / \
+ // | BB----/ OB
+ // |\ /|
+ // | X |
+ // |/ \|
+ // SB2 SB1
+ //
+ // The benefit of duplicating into a predecessor is defined as
+ // Orig_taken_branch - Duplicated_taken_branch
+ //
+ // Orig_taken_branch is computed with the assumption that the predecessor
+ // jumps to BB and the most probable successor is laid out after BB.
+ //
+ // Duplicated_taken_branch is computed with the assumption that BB is
+ // duplicated into PB, and one successor is laid out after it (SB1 for PB1 and
+ // SB2 for PB2 in our case). If there is no available successor, the combined
+ // block jumps to all of BB's successors, like PB3 in this example.
+ //
+ // If a predecessor has multiple successors, BB can't be duplicated into it,
+ // but the predecessor may beneficially fall through to BB while BB is
+ // duplicated into the other predecessors.
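+ // Worked example (hypothetical numbers): if PredFreq == 100 and the hottest
+ // successor of BB has probability 0.6 (so DefaultBranchProb == 0.4), then
+ // OrigCost == 100 + 100 * 0.4 == 140. Duplicating BB into Pred with that
+ // successor laid out after the combined block gives DupCost == 100 - 100 *
+ // 0.6 == 40, so the benefit is 140 - 40 == 100, which is then compared
+ // against BBDupThreshold.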
+ for (MachineBasicBlock *Pred : Preds) {
+ BlockFrequency PredFreq = getBlockCountOrFrequency(Pred);
+
+ if (!TailDup.canTailDuplicate(BB, Pred)) {
+ // BB can't be duplicated into Pred, but it may be laid out
+ // below Pred.
+ if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
+ Fallthrough = Pred;
+ if (SuccIt != Succs.end())
+ SuccIt++;
+ }
+ continue;
+ }
+
+ BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
+ BlockFrequency DupCost;
+ if (SuccIt == Succs.end()) {
+ // Jump to all successors;
+ if (Succs.size() > 0)
+ DupCost += PredFreq;
+ } else {
+ // Fallthrough to *SuccIt, jump to all other successors;
+ DupCost += PredFreq;
+ DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
+ }
+
+ assert(OrigCost >= DupCost);
+ OrigCost -= DupCost;
+ if (OrigCost > BBDupThreshold) {
+ Candidates.push_back(Pred);
+ if (SuccIt != Succs.end())
+ SuccIt++;
+ }
+ }
+
+ // No predecessor can optimally fall through to BB,
+ // so we can change one duplication into a fallthrough.
+ if (!Fallthrough) {
+ if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
+ Candidates[0] = Candidates.back();
+ Candidates.pop_back();
+ }
+ }
+}
+
+void MachineBlockPlacement::initDupThreshold() {
+ DupThreshold = 0;
+ if (!F->getFunction().hasProfileData())
+ return;
+
+ // We prefer to use the profile count.
+ uint64_t HotThreshold = PSI->getOrCompHotCountThreshold();
+ if (HotThreshold != UINT64_MAX) {
+ UseProfileCount = true;
+ DupThreshold = HotThreshold * TailDupProfilePercentThreshold / 100;
+ return;
+ }
+
+ // The profile count is not available, so we use block frequency instead.
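+ // For instance, a TailDupPlacementPenalty of 2 puts the threshold at 2% of
+ // the hottest block frequency in the function.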
+ BlockFrequency MaxFreq = 0;
+ for (MachineBasicBlock &MBB : *F) {
+ BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
+ if (Freq > MaxFreq)
+ MaxFreq = Freq;
+ }
+
+ BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+ DupThreshold = MaxFreq * ThresholdProb;
+ UseProfileCount = false;
+}
+
+bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ // Check for single-block functions and skip them.
+ if (std::next(MF.begin()) == MF.end())
+ return false;
+
+ F = &MF;
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ MBFI = std::make_unique<MBFIWrapper>(
+ getAnalysis<MachineBlockFrequencyInfo>());
+ MLI = &getAnalysis<MachineLoopInfo>();
+ TII = MF.getSubtarget().getInstrInfo();
+ TLI = MF.getSubtarget().getTargetLowering();
+ MPDT = nullptr;
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ initDupThreshold();
+
+ // Initialize PreferredLoopExit to nullptr here since it may never be set if
+ // there are no MachineLoops.
+ PreferredLoopExit = nullptr;
+
+ assert(BlockToChain.empty() &&
+ "BlockToChain map should be empty before starting placement.");
+ assert(ComputedEdges.empty() &&
+ "Computed Edge map should be empty before starting placement.");
+
+ unsigned TailDupSize = TailDupPlacementThreshold;
+ // If only the aggressive threshold is explicitly set, use it.
+ if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+ TailDupPlacementThreshold.getNumOccurrences() == 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+ // For aggressive optimization, we can adjust some thresholds to be less
+ // conservative.
+ if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+ // At O3 we should be more willing to copy blocks for tail duplication. This
+ // increases size pressure, so we only do it at O3.
+ // Do this unless only the regular threshold is explicitly set.
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+ TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+ }
+
+ // If there's no threshold provided through options, query the target
+ // information for a threshold instead.
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 &&
+ (PassConfig->getOptLevel() < CodeGenOpt::Aggressive ||
+ TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0))
+ TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel());
+
+ if (allowTailDupPlacement()) {
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ bool OptForSize = MF.getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI());
+ if (OptForSize)
+ TailDupSize = 1;
+ bool PreRegAlloc = false;
+ TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
+ /* LayoutMode */ true, TailDupSize);
+ precomputeTriangleChains();
+ }
+
+ buildCFGChains();
+
+ // Changing the layout can create new tail merging opportunities.
+ // TailMerge can create jump into if branches that make CFG irreducible for
+ // HW that requires structured CFG.
+ bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
+ PassConfig->getEnableTailMerge() &&
+ BranchFoldPlacement;
+ // No tail merging opportunities if the number of blocks is less than four.
+ if (MF.size() > 3 && EnableTailMerge) {
+ unsigned TailMergeSize = TailDupSize + 1;
+ BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false,
+ *MBFI, *MBPI, PSI, TailMergeSize);
+
+ if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
+ /*AfterPlacement=*/true)) {
+ // Redo the layout if tail merging creates/removes/moves blocks.
+ BlockToChain.clear();
+ ComputedEdges.clear();
+ // Must redo the post-dominator tree if blocks were changed.
+ if (MPDT)
+ MPDT->runOnMachineFunction(MF);
+ ChainAllocator.DestroyAll();
+ buildCFGChains();
+ }
+ }
+
+ // Apply a post-processing optimizing block placement.
+ if (MF.size() >= 3 && EnableExtTspBlockPlacement &&
+ (ApplyExtTspWithoutProfile || MF.getFunction().hasProfileData())) {
+ // Find a new placement and modify the layout of the blocks in the function.
+ applyExtTsp();
+
+ // Re-create CFG chain so that we can optimizeBranches and alignBlocks.
+ createCFGChainExtTsp();
+ }
+
+ optimizeBranches();
+ alignBlocks();
+
+ BlockToChain.clear();
+ ComputedEdges.clear();
+ ChainAllocator.DestroyAll();
+
+ bool HasMaxBytesOverride =
+ MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
+
+ if (AlignAllBlock)
+ // Align all of the blocks in the function to a specific alignment.
+ for (MachineBasicBlock &MBB : MF) {
+ if (HasMaxBytesOverride)
+ MBB.setAlignment(Align(1ULL << AlignAllBlock),
+ MaxBytesForAlignmentOverride);
+ else
+ MBB.setAlignment(Align(1ULL << AlignAllBlock));
+ }
+ else if (AlignAllNonFallThruBlocks) {
+ // Align all of the blocks that have no fall-through predecessors to a
+ // specific alignment.
+ for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
+ auto LayoutPred = std::prev(MBI);
+ if (!LayoutPred->isSuccessor(&*MBI)) {
+ if (HasMaxBytesOverride)
+ MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
+ MaxBytesForAlignmentOverride);
+ else
+ MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+ }
+ }
+ }
+ if (ViewBlockLayoutWithBFI != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F->getFunction().getName().equals(ViewBlockFreqFuncName))) {
+ if (RenumberBlocksBeforeView)
+ MF.RenumberBlocks();
+ MBFI->view("MBP." + MF.getName(), false);
+ }
+
+ // We always return true as we have no way to track whether the final order
+ // differs from the original order.
+ return true;
+}
+
+void MachineBlockPlacement::applyExtTsp() {
+ // Prepare data; blocks are indexed by their index in the current ordering.
+ DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex;
+ BlockIndex.reserve(F->size());
+ std::vector<const MachineBasicBlock *> CurrentBlockOrder;
+ CurrentBlockOrder.reserve(F->size());
+ size_t NumBlocks = 0;
+ for (const MachineBasicBlock &MBB : *F) {
+ BlockIndex[&MBB] = NumBlocks++;
+ CurrentBlockOrder.push_back(&MBB);
+ }
+
+ auto BlockSizes = std::vector<uint64_t>(F->size());
+ auto BlockCounts = std::vector<uint64_t>(F->size());
+ std::vector<EdgeCountT> JumpCounts;
+ for (MachineBasicBlock &MBB : *F) {
+ // Getting the block frequency.
+ BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
+ BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency();
+ // Getting the block size:
+ // - approximate the size of an instruction by 4 bytes, and
+ // - ignore debug instructions.
+ // Note: getting the exact size of each block is target-dependent and can be
+ // done by extending the interface of MCCodeEmitter. Experimentally we do
+ // not see a perf improvement with the exact block sizes.
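+ // For example, a block with 7 non-debug instructions is modeled as 28 bytes.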
+ auto NonDbgInsts =
+ instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
+ int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
+ BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts;
+ // Getting jump frequencies.
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ auto EP = MBPI->getEdgeProbability(&MBB, Succ);
+ BlockFrequency JumpFreq = BlockFreq * EP;
+ auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
+ JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency()));
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size()
+ << " with profile = " << F->getFunction().hasProfileData()
+ << " (" << F->getName().str() << ")"
+ << "\n");
+ LLVM_DEBUG(
+ dbgs() << format(" original layout score: %0.2f\n",
+ calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
+
+ // Run the layout algorithm.
+ auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+ std::vector<const MachineBasicBlock *> NewBlockOrder;
+ NewBlockOrder.reserve(F->size());
+ for (uint64_t Node : NewOrder) {
+ NewBlockOrder.push_back(CurrentBlockOrder[Node]);
+ }
+ LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n",
+ calcExtTspScore(NewOrder, BlockSizes, BlockCounts,
+ JumpCounts)));
+
+ // Assign new block order.
+ assignBlockOrder(NewBlockOrder);
+}
+
+void MachineBlockPlacement::assignBlockOrder(
+ const std::vector<const MachineBasicBlock *> &NewBlockOrder) {
+ assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order");
+ F->RenumberBlocks();
+
+ bool HasChanges = false;
+ for (size_t I = 0; I < NewBlockOrder.size(); I++) {
+ if (NewBlockOrder[I] != F->getBlockNumbered(I)) {
+ HasChanges = true;
+ break;
+ }
+ }
+ // Stop early if the new block order is identical to the existing one.
+ if (!HasChanges)
+ return;
+
+ SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs());
+ for (auto &MBB : *F) {
+ PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
+ }
+
+ // Sort basic blocks in the function according to the computed order.
+ DenseMap<const MachineBasicBlock *, size_t> NewIndex;
+ for (const MachineBasicBlock *MBB : NewBlockOrder) {
+ NewIndex[MBB] = NewIndex.size();
+ }
+ F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) {
+ return NewIndex[&L] < NewIndex[&R];
+ });
+
+ // Update basic block branches by inserting explicit fallthrough branches
+ // when required and re-optimize branches when possible.
+ const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
+ SmallVector<MachineOperand, 4> Cond;
+ for (auto &MBB : *F) {
+ MachineFunction::iterator NextMBB = std::next(MBB.getIterator());
+ MachineFunction::iterator EndIt = MBB.getParent()->end();
+ auto *FTMBB = PrevFallThroughs[MBB.getNumber()];
+ // If this block had a fallthrough before, we need an explicit unconditional
+ // branch to that block when the fallthrough block is not adjacent to this
+ // block in the new order.
+ if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) {
+ TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
+ }
+
+ // It might be possible to optimize branches by flipping the condition.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
+ continue;
+ MBB.updateTerminator(FTMBB);
+ }
+
+#ifndef NDEBUG
+ // Make sure we correctly constructed all branches.
+ F->verify(this, "After optimized block reordering");
+#endif
+}
+
+void MachineBlockPlacement::createCFGChainExtTsp() {
+ BlockToChain.clear();
+ ComputedEdges.clear();
+ ChainAllocator.DestroyAll();
+
+ MachineBasicBlock *HeadBB = &F->front();
+ BlockChain *FunctionChain =
+ new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB);
+
+ for (MachineBasicBlock &MBB : *F) {
+ if (HeadBB == &MBB)
+ continue; // Ignore head of the chain
+ FunctionChain->merge(&MBB, nullptr);
+ }
+}
+
+namespace {
+
+/// A pass to compute block placement statistics.
+///
+/// A separate pass to compute interesting statistics for evaluating block
+/// placement. This is separate from the actual placement pass so that they can
+/// be computed in the absence of any placement transformations or when using
+/// alternative placement strategies.
+class MachineBlockPlacementStats : public MachineFunctionPass {
+ /// A handle to the branch probability pass.
+ const MachineBranchProbabilityInfo *MBPI;
+
+ /// A handle to the function-wide block frequency pass.
+ const MachineBlockFrequencyInfo *MBFI;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ MachineBlockPlacementStats() : MachineFunctionPass(ID) {
+ initializeMachineBlockPlacementStatsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char MachineBlockPlacementStats::ID = 0;
+
+char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
+
+INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
+ "Basic Block Placement Stats", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
+ "Basic Block Placement Stats", false, false)
+
+bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
+ // Check for single-block functions and skip them.
+ if (std::next(F.begin()) == F.end())
+ return false;
+
+ if (!isFunctionInPrintList(F.getName()))
+ return false;
+
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+
+ for (MachineBasicBlock &MBB : F) {
+ BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
+ Statistic &NumBranches =
+ (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
+ Statistic &BranchTakenFreq =
+ (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ // Skip if this successor is a fallthrough.
+ if (MBB.isLayoutSuccessor(Succ))
+ continue;
+
+ BlockFrequency EdgeFreq =
+ BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
+ ++NumBranches;
+ BranchTakenFreq += EdgeFreq.getFrequency();
+ }
+ }
+
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
new file mode 100644
index 000000000000..a84377d70855
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -0,0 +1,79 @@
+//===- MachineBranchProbabilityInfo.cpp - Machine Branch Probability Info -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This analysis uses probability info stored in Machine Basic Blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfo, "machine-branch-prob",
+ "Machine Branch Probability Analysis", false, true)
+INITIALIZE_PASS_END(MachineBranchProbabilityInfo, "machine-branch-prob",
+ "Machine Branch Probability Analysis", false, true)
+
+namespace llvm {
+cl::opt<unsigned>
+ StaticLikelyProb("static-likely-prob",
+ cl::desc("branch probability threshold in percentage"
+ " to be considered very likely"),
+ cl::init(80), cl::Hidden);
+
+cl::opt<unsigned> ProfileLikelyProb(
+ "profile-likely-prob",
+ cl::desc("branch probability threshold in percentage to be considered"
+ " very likely when profile is available"),
+ cl::init(51), cl::Hidden);
+} // namespace llvm
+
+char MachineBranchProbabilityInfo::ID = 0;
+
+MachineBranchProbabilityInfo::MachineBranchProbabilityInfo()
+ : ImmutablePass(ID) {
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeMachineBranchProbabilityInfoPass(Registry);
+}
+
+void MachineBranchProbabilityInfo::anchor() {}
+
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+ const MachineBasicBlock *Src,
+ MachineBasicBlock::const_succ_iterator Dst) const {
+ return Src->getSuccProbability(Dst);
+}
+
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+ const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+ // This is a linear search. Try to use the const_succ_iterator version when
+ // possible.
+ return getEdgeProbability(Src, find(Src->successors(), Dst));
+}
+
+bool MachineBranchProbabilityInfo::isEdgeHot(
+ const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+ BranchProbability HotProb(StaticLikelyProb, 100);
+ return getEdgeProbability(Src, Dst) > HotProb;
+}
+
+raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability(
+ raw_ostream &OS, const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) const {
+
+ const BranchProbability Prob = getEdgeProbability(Src, Dst);
+ OS << "edge " << printMBBReference(*Src) << " -> " << printMBBReference(*Dst)
+ << " probability is " << Prob
+ << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n");
+
+ return OS;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineCFGPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineCFGPrinter.cpp
new file mode 100644
index 000000000000..7bfb81771380
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineCFGPrinter.cpp
@@ -0,0 +1,95 @@
+//===- MachineCFGPrinter.cpp - DOT Printer for Machine Functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the `-dot-machine-cfg` analysis pass, which emits the
+// Machine Function in DOT format to a file titled `<prefix>.<function-name>.dot`.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineCFGPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/GraphWriter.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dot-machine-cfg"
+
+static cl::opt<std::string>
+ MCFGFuncName("mcfg-func-name", cl::Hidden,
+ cl::desc("The name of a function (or its substring)"
+ " whose CFG is viewed/printed."));
+
+static cl::opt<std::string> MCFGDotFilenamePrefix(
+ "mcfg-dot-filename-prefix", cl::Hidden,
+ cl::desc("The prefix used for the Machine CFG dot file names."));
+
+static cl::opt<bool>
+ CFGOnly("dot-mcfg-only", cl::init(false), cl::Hidden,
+ cl::desc("Print only the CFG without block bodies"));
+
+static void writeMCFGToDotFile(MachineFunction &MF) {
+ std::string Filename =
+ (MCFGDotFilenamePrefix + "." + MF.getName() + ".dot").str();
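+ // E.g. with -mcfg-dot-filename-prefix=mcfg and a function named "main",
+ // this produces "mcfg.main.dot".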
+ errs() << "Writing '" << Filename << "'...";
+
+ std::error_code EC;
+ raw_fd_ostream File(Filename, EC, sys::fs::OF_Text);
+
+ DOTMachineFuncInfo MCFGInfo(&MF);
+
+ if (!EC)
+ WriteGraph(File, &MCFGInfo, CFGOnly);
+ else
+ errs() << " error opening file for writing!";
+ errs() << '\n';
+}
+
+namespace {
+
+class MachineCFGPrinter : public MachineFunctionPass {
+public:
+ static char ID;
+
+ MachineCFGPrinter();
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // namespace
+
+char MachineCFGPrinter::ID = 0;
+
+char &llvm::MachineCFGPrinterID = MachineCFGPrinter::ID;
+
+INITIALIZE_PASS(MachineCFGPrinter, DEBUG_TYPE, "Machine CFG Printer Pass",
+ false, true)
+
+/// Default construct and initialize the pass.
+MachineCFGPrinter::MachineCFGPrinter() : MachineFunctionPass(ID) {
+ initializeMachineCFGPrinterPass(*PassRegistry::getPassRegistry());
+}
+
+bool MachineCFGPrinter::runOnMachineFunction(MachineFunction &MF) {
+ if (!MCFGFuncName.empty() && !MF.getName().contains(MCFGFuncName))
+ return false;
+ errs() << "Writing Machine CFG for function ";
+ errs().write_escaped(MF.getName()) << '\n';
+
+ writeMCFGToDotFile(MF);
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineCSE.cpp
new file mode 100644
index 000000000000..f879c5fcf20c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineCSE.cpp
@@ -0,0 +1,947 @@
+//===- MachineCSE.cpp - Machine Common Subexpression Elimination Pass -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global common subexpression elimination on machine
+// instructions using a scoped hash table based value numbering scheme. It
+// must be run while the machine function is still in SSA form.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegister.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-cse"
+
+STATISTIC(NumCoalesces, "Number of copies coalesced");
+STATISTIC(NumCSEs, "Number of common subexpression eliminated");
+STATISTIC(NumPREs, "Number of partially redundant expressions"
+ " transformed to fully redundant");
+STATISTIC(NumPhysCSEs,
+ "Number of physreg referencing common subexpr eliminated");
+STATISTIC(NumCrossBBCSEs,
+ "Number of cross-MBB physreg referencing CS eliminated");
+STATISTIC(NumCommutes, "Number of copies coalesced after commuting");
+
+// Threshold to avoid excessive cost to compute isProfitableToCSE.
+static cl::opt<int>
+ CSUsesThreshold("csuses-threshold", cl::Hidden, cl::init(1024),
+ cl::desc("Threshold for the size of CSUses"));
+
+namespace {
+
+ class MachineCSE : public MachineFunctionPass {
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ AliasAnalysis *AA = nullptr;
+ MachineDominatorTree *DT = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ MachineBlockFrequencyInfo *MBFI = nullptr;
+
+ public:
+ static char ID; // Pass identification
+
+ MachineCSE() : MachineFunctionPass(ID) {
+ initializeMachineCSEPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
+
+ void releaseMemory() override {
+ ScopeMap.clear();
+ PREMap.clear();
+ Exps.clear();
+ }
+
+ private:
+ using AllocatorTy = RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MachineInstr *, unsigned>>;
+ using ScopedHTType =
+ ScopedHashTable<MachineInstr *, unsigned, MachineInstrExpressionTrait,
+ AllocatorTy>;
+ using ScopeType = ScopedHTType::ScopeTy;
+ using PhysDefVector = SmallVector<std::pair<unsigned, unsigned>, 2>;
+
+ unsigned LookAheadLimit = 0;
+ DenseMap<MachineBasicBlock *, ScopeType *> ScopeMap;
+ DenseMap<MachineInstr *, MachineBasicBlock *, MachineInstrExpressionTrait>
+ PREMap;
+ ScopedHTType VNT;
+ SmallVector<MachineInstr *, 64> Exps;
+ unsigned CurrVN = 0;
+
+ bool PerformTrivialCopyPropagation(MachineInstr *MI,
+ MachineBasicBlock *MBB);
+ bool isPhysDefTriviallyDead(MCRegister Reg,
+ MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator E) const;
+ bool hasLivePhysRegDefUses(const MachineInstr *MI,
+ const MachineBasicBlock *MBB,
+ SmallSet<MCRegister, 8> &PhysRefs,
+ PhysDefVector &PhysDefs, bool &PhysUseDef) const;
+ bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
+ SmallSet<MCRegister, 8> &PhysRefs,
+ PhysDefVector &PhysDefs, bool &NonLocal) const;
+ bool isCSECandidate(MachineInstr *MI);
+ bool isProfitableToCSE(Register CSReg, Register Reg,
+ MachineBasicBlock *CSBB, MachineInstr *MI);
+ void EnterScope(MachineBasicBlock *MBB);
+ void ExitScope(MachineBasicBlock *MBB);
+ bool ProcessBlockCSE(MachineBasicBlock *MBB);
+ void ExitScopeIfDone(MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
+ bool PerformCSE(MachineDomTreeNode *Node);
+
+ bool isPRECandidate(MachineInstr *MI, SmallSet<MCRegister, 8> &PhysRefs);
+ bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
+ bool PerformSimplePRE(MachineDominatorTree *DT);
+ /// Heuristics to see if it's profitable to move common computations of MBB
+ /// and MBB1 to CandidateBB.
+ bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *MBB1);
+ };
+
+} // end anonymous namespace
+
+char MachineCSE::ID = 0;
+
+char &llvm::MachineCSEID = MachineCSE::ID;
+
+INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE,
+ "Machine Common Subexpression Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE,
+ "Machine Common Subexpression Elimination", false, false)
+
+/// The source register of a COPY machine instruction can be propagated to all
+/// its users, and this propagation could increase the probability of finding
+/// common subexpressions. If the COPY has only one user, the COPY itself can
+/// be removed.
+bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+ bool Changed = false;
+ for (MachineOperand &MO : MI->all_uses()) {
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+ bool OnlyOneUse = MRI->hasOneNonDBGUse(Reg);
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI->isCopy())
+ continue;
+ Register SrcReg = DefMI->getOperand(1).getReg();
+ if (!SrcReg.isVirtual())
+ continue;
+ if (DefMI->getOperand(0).getSubReg())
+ continue;
+ // FIXME: We should trivially coalesce subregister copies to expose CSE
+ // opportunities on instructions with truncated operands (see
+ // cse-add-with-overflow.ll). This can be done here as follows:
+ // if (SrcSubReg)
+ // RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC,
+ // SrcSubReg);
+ // MO.substVirtReg(SrcReg, SrcSubReg, *TRI);
+ //
+ // The 2-addr pass has been updated to handle coalesced subregs. However,
+ // some machine-specific code still can't handle it.
+ // To handle it properly we also need a way find a constrained subregister
+ // class given a super-reg class and subreg index.
+ if (DefMI->getOperand(1).getSubReg())
+ continue;
+ if (!MRI->constrainRegAttrs(SrcReg, Reg))
+ continue;
+ LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI);
+ LLVM_DEBUG(dbgs() << "*** to: " << *MI);
+
+ // Propagate SrcReg of copies to MI.
+ MO.setReg(SrcReg);
+ MRI->clearKillFlags(SrcReg);
+ // Coalesce single use copies.
+ if (OnlyOneUse) {
+ // If (and only if) we've eliminated all uses of the copy, also
+ // copy-propagate to any debug-users of MI, or they'll be left using
+ // an undefined value.
+ DefMI->changeDebugValuesDefReg(SrcReg);
+
+ DefMI->eraseFromParent();
+ ++NumCoalesces;
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool MachineCSE::isPhysDefTriviallyDead(
+ MCRegister Reg, MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator E) const {
+ unsigned LookAheadLeft = LookAheadLimit;
+ while (LookAheadLeft) {
+ // Skip over dbg_value's.
+ I = skipDebugInstructionsForward(I, E);
+
+ if (I == E)
+ // Reached the end of the block; we don't know if the register is dead or not.
+ return false;
+
+ bool SeenDef = false;
+ for (const MachineOperand &MO : I->operands()) {
+ if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
+ SeenDef = true;
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (!TRI->regsOverlap(MO.getReg(), Reg))
+ continue;
+ if (MO.isUse())
+ // Found a use!
+ return false;
+ SeenDef = true;
+ }
+ if (SeenDef)
+ // Saw a def of Reg (or an alias) before encountering any use; it's
+ // trivially dead.
+ return true;
+
+ --LookAheadLeft;
+ ++I;
+ }
+ return false;
+}
+
+static bool isCallerPreservedOrConstPhysReg(MCRegister Reg,
+ const MachineOperand &MO,
+ const MachineFunction &MF,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII) {
+ // MachineRegisterInfo::isConstantPhysReg directly called by
+ // MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the
+ // reserved registers to be frozen. That doesn't cause a problem post-ISel as
+ // most (if not all) targets freeze reserved registers right after ISel.
+ //
+ // It does cause issues mid-GlobalISel, however, hence the additional
+ // reservedRegsFrozen check.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return TRI.isCallerPreservedPhysReg(Reg, MF) || TII.isIgnorableUse(MO) ||
+ (MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(Reg));
+}
+
+/// hasLivePhysRegDefUses - Return true if the specified instruction reads or
+/// writes physical registers (except for dead defs of physical registers).
+/// Live physical register defs are returned by reference in PhysDefs, and
+/// PhysUseDef is set when the instruction defines a physical register that it
+/// (or an alias) also uses.
+bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
+ const MachineBasicBlock *MBB,
+ SmallSet<MCRegister, 8> &PhysRefs,
+ PhysDefVector &PhysDefs,
+ bool &PhysUseDef) const {
+ // First, add all uses to PhysRefs.
+ for (const MachineOperand &MO : MI->all_uses()) {
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (Reg.isVirtual())
+ continue;
+ // Reading either caller preserved or constant physregs is ok.
+ if (!isCallerPreservedOrConstPhysReg(Reg.asMCReg(), MO, *MI->getMF(), *TRI,
+ *TII))
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ PhysRefs.insert(*AI);
+ }
+
+ // Next, collect all defs into PhysDefs. If any is already in PhysRefs
+ // (which currently contains only uses), set the PhysUseDef flag.
+ PhysUseDef = false;
+ MachineBasicBlock::const_iterator I = MI; I = std::next(I);
+ for (const auto &MOP : llvm::enumerate(MI->operands())) {
+ const MachineOperand &MO = MOP.value();
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (Reg.isVirtual())
+ continue;
+ // Check against PhysRefs even if the def is "dead".
+ if (PhysRefs.count(Reg.asMCReg()))
+ PhysUseDef = true;
+ // If the def is dead, it's ok. But the def may not be marked "dead". That's
+ // common since this pass is run before livevariables. We can scan
+ // forward a few instructions and check if it is obviously dead.
+ if (!MO.isDead() && !isPhysDefTriviallyDead(Reg.asMCReg(), I, MBB->end()))
+ PhysDefs.push_back(std::make_pair(MOP.index(), Reg));
+ }
+
+ // Finally, add all defs to PhysRefs as well.
+ for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i)
+ for (MCRegAliasIterator AI(PhysDefs[i].second, TRI, true); AI.isValid();
+ ++AI)
+ PhysRefs.insert(*AI);
+
+ return !PhysRefs.empty();
+}
+
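+// Return true if none of the physical registers in PhysRefs is clobbered
+// between CSMI (the earlier, equivalent instruction) and MI, scanning forward
+// at most LookAheadLimit instructions. NonLocal is set when the scan has to
+// cross from CSMI's block into MI's block.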
+bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
+ SmallSet<MCRegister, 8> &PhysRefs,
+ PhysDefVector &PhysDefs,
+ bool &NonLocal) const {
+ // For now conservatively returns false if the common subexpression is
+ // not in the same basic block as the given instruction. The only exception
+ // is if the common subexpression is in the sole predecessor block.
+ const MachineBasicBlock *MBB = MI->getParent();
+ const MachineBasicBlock *CSMBB = CSMI->getParent();
+
+ bool CrossMBB = false;
+ if (CSMBB != MBB) {
+ if (MBB->pred_size() != 1 || *MBB->pred_begin() != CSMBB)
+ return false;
+
+ for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i) {
+ if (MRI->isAllocatable(PhysDefs[i].second) ||
+ MRI->isReserved(PhysDefs[i].second))
+ // Avoid extending the live range of physical registers if they are
+ // allocatable or reserved.
+ return false;
+ }
+ CrossMBB = true;
+ }
+ MachineBasicBlock::const_iterator I = CSMI; I = std::next(I);
+ MachineBasicBlock::const_iterator E = MI;
+ MachineBasicBlock::const_iterator EE = CSMBB->end();
+ unsigned LookAheadLeft = LookAheadLimit;
+ while (LookAheadLeft) {
+ // Skip over dbg_value's.
+ while (I != E && I != EE && I->isDebugInstr())
+ ++I;
+
+ if (I == EE) {
+ assert(CrossMBB && "Reaching end-of-MBB without finding MI?");
+ (void)CrossMBB;
+ CrossMBB = false;
+ NonLocal = true;
+ I = MBB->begin();
+ EE = MBB->end();
+ continue;
+ }
+
+ if (I == E)
+ return true;
+
+ for (const MachineOperand &MO : I->operands()) {
+ // RegMasks go on instructions like calls that clobber lots of physregs.
+ // Don't attempt to CSE across such an instruction.
+ if (MO.isRegMask())
+ return false;
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register MOReg = MO.getReg();
+ if (MOReg.isVirtual())
+ continue;
+ if (PhysRefs.count(MOReg.asMCReg()))
+ return false;
+ }
+
+ --LookAheadLeft;
+ ++I;
+ }
+
+ return false;
+}
+
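+// Filter out instructions that can never be CSE'd: labels, PHIs, copies,
+// inline asm, debug instructions, stores, calls, terminators, anything with
+// unmodeled side effects, and loads that are not invariant and
+// dereferenceable.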
+bool MachineCSE::isCSECandidate(MachineInstr *MI) {
+ if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() ||
+ MI->isInlineAsm() || MI->isDebugInstr())
+ return false;
+
+ // Ignore copies.
+ if (MI->isCopyLike())
+ return false;
+
+ // Ignore stuff that we obviously can't move.
+ if (MI->mayStore() || MI->isCall() || MI->isTerminator() ||
+ MI->mayRaiseFPException() || MI->hasUnmodeledSideEffects())
+ return false;
+
+ if (MI->mayLoad()) {
+ // Okay, this instruction does a load. As a refinement, we allow the target
+ // to decide whether the loaded value is actually a constant. If so, we can
+ // CSE it just like any other instruction.
+ if (!MI->isDereferenceableInvariantLoad())
+ // FIXME: we should be able to hoist loads with no other side effects if
+ // there are no other instructions which can change memory in this loop.
+ // This is a trivial form of alias analysis.
+ return false;
+ }
+
+ // Ignore stack guard loads; otherwise the register holding the CSEed value may
+ // be spilled and get loaded back with corrupted data.
+ if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD)
+ return false;
+
+ return true;
+}
+
+/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
+/// common expression that defines Reg. CSBB is the basic block where CSReg is
+/// defined.
+bool MachineCSE::isProfitableToCSE(Register CSReg, Register Reg,
+ MachineBasicBlock *CSBB, MachineInstr *MI) {
+ // FIXME: Heuristics that work around the lack of live range splitting.
+
+ // If CSReg is used at all uses of Reg, CSE should not increase register
+ // pressure of CSReg.
+ bool MayIncreasePressure = true;
+ if (CSReg.isVirtual() && Reg.isVirtual()) {
+ MayIncreasePressure = false;
+ SmallPtrSet<MachineInstr*, 8> CSUses;
+ int NumOfUses = 0;
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) {
+ CSUses.insert(&MI);
+ // Too costly to compute if NumOfUses is very large. Conservatively assume
+ // MayIncreasePressure to avoid spending too much time here.
+ if (++NumOfUses > CSUsesThreshold) {
+ MayIncreasePressure = true;
+ break;
+ }
+ }
+ if (!MayIncreasePressure)
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
+ if (!CSUses.count(&MI)) {
+ MayIncreasePressure = true;
+ break;
+ }
+ }
+ }
+ if (!MayIncreasePressure) return true;
+
+ // Heuristic #1: Don't CSE a "cheap" computation if the def is not local or
+ // in an immediate predecessor. We don't want to increase register pressure
+ // and end up causing other computations to be spilled.
+ if (TII->isAsCheapAsAMove(*MI)) {
+ MachineBasicBlock *BB = MI->getParent();
+ if (CSBB != BB && !CSBB->isSuccessor(BB))
+ return false;
+ }
+
+ // Heuristic #2: If the expression doesn't use a virtual register and the
+ // only uses of the redundant computation are copies, do not CSE.
+ bool HasVRegUse = false;
+ for (const MachineOperand &MO : MI->all_uses()) {
+ if (MO.getReg().isVirtual()) {
+ HasVRegUse = true;
+ break;
+ }
+ }
+ if (!HasVRegUse) {
+ bool HasNonCopyUse = false;
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
+ // Ignore copies.
+ if (!MI.isCopyLike()) {
+ HasNonCopyUse = true;
+ break;
+ }
+ }
+ if (!HasNonCopyUse)
+ return false;
+ }
+
+ // Heuristic #3: If the common subexpression is used by PHIs, do not reuse
+ // it unless the defined value is already used in the BB of the new use.
+ bool HasPHI = false;
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(CSReg)) {
+ HasPHI |= UseMI.isPHI();
+ if (UseMI.getParent() == MI->getParent())
+ return true;
+ }
+
+ return !HasPHI;
+}
+
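+// Open a new value-number scope for MBB. Entries added to VNT while this
+// scope is active are discarded when ExitScope() deletes it, so they are only
+// visible while processing MBB and the blocks it dominates.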
+void MachineCSE::EnterScope(MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
+ ScopeType *Scope = new ScopeType(VNT);
+ ScopeMap[MBB] = Scope;
+}
+
+void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
+ DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(MBB);
+ assert(SI != ScopeMap.end());
+ delete SI->second;
+ ScopeMap.erase(SI);
+}
+
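+// Value-number every CSE candidate in MBB. When an equivalent instruction has
+// already been seen in the current scope chain, try to rewrite the defs of MI
+// to reuse the earlier defs and erase MI.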
+bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
+ bool Changed = false;
+
+ SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
+ SmallVector<unsigned, 2> ImplicitDefsToUpdate;
+ SmallVector<unsigned, 2> ImplicitDefs;
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
+ if (!isCSECandidate(&MI))
+ continue;
+
+ bool FoundCSE = VNT.count(&MI);
+ if (!FoundCSE) {
+ // Using trivial copy propagation to find more CSE opportunities.
+ if (PerformTrivialCopyPropagation(&MI, MBB)) {
+ Changed = true;
+
+ // After coalescing MI itself may become a copy.
+ if (MI.isCopyLike())
+ continue;
+
+ // Try again to see if CSE is possible.
+ FoundCSE = VNT.count(&MI);
+ }
+ }
+
+ // Commute commutable instructions.
+ bool Commuted = false;
+ if (!FoundCSE && MI.isCommutable()) {
+ if (MachineInstr *NewMI = TII->commuteInstruction(MI)) {
+ Commuted = true;
+ FoundCSE = VNT.count(NewMI);
+ if (NewMI != &MI) {
+ // New instruction. It doesn't need to be kept.
+ NewMI->eraseFromParent();
+ Changed = true;
+ } else if (!FoundCSE)
+ // MI was changed but it didn't help, commute it back!
+ (void)TII->commuteInstruction(MI);
+ }
+ }
+
+ // If the instruction defines physical registers and the values *may* be
+ // used, then it's not safe to replace it with a common subexpression.
+ // It's also not safe if the instruction uses physical registers.
+ bool CrossMBBPhysDef = false;
+ SmallSet<MCRegister, 8> PhysRefs;
+ PhysDefVector PhysDefs;
+ bool PhysUseDef = false;
+ if (FoundCSE &&
+ hasLivePhysRegDefUses(&MI, MBB, PhysRefs, PhysDefs, PhysUseDef)) {
+ FoundCSE = false;
+
+ // ... Unless the CS is local or is in the sole predecessor block,
+ // and none of the physical registers it defines or uses is clobbered
+ // between the CS and MI.
+ // This can never be the case if the instruction both uses and
+ // defines the same physical register, which was detected above.
+ if (!PhysUseDef) {
+ unsigned CSVN = VNT.lookup(&MI);
+ MachineInstr *CSMI = Exps[CSVN];
+ if (PhysRegDefsReach(CSMI, &MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
+ FoundCSE = true;
+ }
+ }
+
+ if (!FoundCSE) {
+ VNT.insert(&MI, CurrVN++);
+ Exps.push_back(&MI);
+ continue;
+ }
+
+ // Found a common subexpression, eliminate it.
+ unsigned CSVN = VNT.lookup(&MI);
+ MachineInstr *CSMI = Exps[CSVN];
+ LLVM_DEBUG(dbgs() << "Examining: " << MI);
+ LLVM_DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);
+
+ // Prevent CSE-ing non-local convergent instructions.
+ // LLVM's current definition of `isConvergent` does not necessarily prove
+ // that non-local CSE is illegal. The following check extends the definition
+ // of `isConvergent` to assume a convergent instruction is dependent not
+ // only on additional conditions, but also on fewer conditions. LLVM does
+ // not have a MachineInstr attribute which expresses this extended
+ // definition, so it's necessary to use `isConvergent` to prevent illegally
+ // CSE-ing the subset of `isConvergent` instructions which do fall into this
+ // extended definition.
+ if (MI.isConvergent() && MI.getParent() != CSMI->getParent()) {
+ LLVM_DEBUG(dbgs() << "*** Convergent MI and subexpression exist in "
+ "different BBs, avoid CSE!\n");
+ VNT.insert(&MI, CurrVN++);
+ Exps.push_back(&MI);
+ continue;
+ }
+
+ // Check if it's profitable to perform this CSE.
+ bool DoCSE = true;
+ unsigned NumDefs = MI.getNumDefs();
+
+ for (unsigned i = 0, e = MI.getNumOperands(); NumDefs && i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register OldReg = MO.getReg();
+ Register NewReg = CSMI->getOperand(i).getReg();
+
+ // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
+ // we should make sure it is not dead at CSMI.
+ if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
+ ImplicitDefsToUpdate.push_back(i);
+
+ // Keep track of implicit defs of CSMI and MI, to clear possibly
+ // made-redundant kill flags.
+ if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg)
+ ImplicitDefs.push_back(OldReg);
+
+ if (OldReg == NewReg) {
+ --NumDefs;
+ continue;
+ }
+
+ assert(OldReg.isVirtual() && NewReg.isVirtual() &&
+ "Do not CSE physical register defs!");
+
+ if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), &MI)) {
+ LLVM_DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
+ DoCSE = false;
+ break;
+ }
+
+ // Don't perform CSE if the result of the new instruction cannot exist
+ // within the constraints (register class, bank, or low-level type) of
+ // the old instruction.
+ if (!MRI->constrainRegAttrs(NewReg, OldReg)) {
+ LLVM_DEBUG(
+ dbgs() << "*** Not the same register constraints, avoid CSE!\n");
+ DoCSE = false;
+ break;
+ }
+
+ CSEPairs.push_back(std::make_pair(OldReg, NewReg));
+ --NumDefs;
+ }
+
+ // Actually perform the elimination.
+ if (DoCSE) {
+ for (const std::pair<unsigned, unsigned> &CSEPair : CSEPairs) {
+ unsigned OldReg = CSEPair.first;
+ unsigned NewReg = CSEPair.second;
+ // OldReg may have been unused but is used now, clear the Dead flag
+ MachineInstr *Def = MRI->getUniqueVRegDef(NewReg);
+ assert(Def != nullptr && "CSEd register has no unique definition?");
+ Def->clearRegisterDeads(NewReg);
+ // Replace with NewReg and clear kill flags which may be wrong now.
+ MRI->replaceRegWith(OldReg, NewReg);
+ MRI->clearKillFlags(NewReg);
+ }
+
+ // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
+ // we should make sure it is not dead at CSMI.
+ for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate)
+ CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false);
+ for (const auto &PhysDef : PhysDefs)
+ if (!MI.getOperand(PhysDef.first).isDead())
+ CSMI->getOperand(PhysDef.first).setIsDead(false);
+
+ // Go through implicit defs of CSMI and MI, and clear the kill flags on
+ // their uses in all the instructions between CSMI and MI.
+ // We might have made some of the kill flags redundant, consider:
+ // subs ... implicit-def %nzcv <- CSMI
+ // csinc ... implicit killed %nzcv <- this kill flag isn't valid anymore
+ // subs ... implicit-def %nzcv <- MI, to be eliminated
+ // csinc ... implicit killed %nzcv
+ // Since we eliminated MI, and reused a register imp-def'd by CSMI
+ // (here %nzcv), that register, if it was killed before MI, should have
+ // that kill flag removed, because its lifetime was extended.
+ if (CSMI->getParent() == MI.getParent()) {
+ for (MachineBasicBlock::iterator II = CSMI, IE = &MI; II != IE; ++II)
+ for (auto ImplicitDef : ImplicitDefs)
+ if (MachineOperand *MO = II->findRegisterUseOperand(
+ ImplicitDef, /*isKill=*/true, TRI))
+ MO->setIsKill(false);
+ } else {
+ // If the instructions aren't in the same BB, bail out and clear the
+ // kill flag on all uses of the imp-def'd register.
+ for (auto ImplicitDef : ImplicitDefs)
+ MRI->clearKillFlags(ImplicitDef);
+ }
+
+ if (CrossMBBPhysDef) {
+ // Add physical register defs now coming in from a predecessor to MBB
+ // livein list.
+ while (!PhysDefs.empty()) {
+ auto LiveIn = PhysDefs.pop_back_val();
+ if (!MBB->isLiveIn(LiveIn.second))
+ MBB->addLiveIn(LiveIn.second);
+ }
+ ++NumCrossBBCSEs;
+ }
+
+ MI.eraseFromParent();
+ ++NumCSEs;
+ if (!PhysRefs.empty())
+ ++NumPhysCSEs;
+ if (Commuted)
+ ++NumCommutes;
+ Changed = true;
+ } else {
+ VNT.insert(&MI, CurrVN++);
+ Exps.push_back(&MI);
+ }
+ CSEPairs.clear();
+ ImplicitDefsToUpdate.clear();
+ ImplicitDefs.clear();
+ }
+
+ return Changed;
+}
+
+/// ExitScopeIfDone - Destroy the scope for the MBB that corresponds to the
+/// given dominator tree node if it is a leaf or all of its children are done.
+/// Walk up the dominator tree to destroy ancestors which are now done.
+void
+MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) {
+ if (OpenChildren[Node])
+ return;
+
+ // Pop scope.
+ ExitScope(Node->getBlock());
+
+ // Now traverse upwards to pop ancestors whose offspring are all done.
+ while (MachineDomTreeNode *Parent = Node->getIDom()) {
+ unsigned Left = --OpenChildren[Parent];
+ if (Left != 0)
+ break;
+ ExitScope(Parent->getBlock());
+ Node = Parent;
+ }
+}
+
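+// Walk the dominator tree rooted at Node in DFS order, entering a scope per
+// block so that expressions from dominating blocks are visible when their
+// dominated blocks are processed.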
+bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
+ SmallVector<MachineDomTreeNode*, 32> Scopes;
+ SmallVector<MachineDomTreeNode*, 8> WorkList;
+ DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;
+
+ CurrVN = 0;
+
+ // Perform a DFS walk to determine the order of visit.
+ WorkList.push_back(Node);
+ do {
+ Node = WorkList.pop_back_val();
+ Scopes.push_back(Node);
+ OpenChildren[Node] = Node->getNumChildren();
+ append_range(WorkList, Node->children());
+ } while (!WorkList.empty());
+
+ // Now perform CSE.
+ bool Changed = false;
+ for (MachineDomTreeNode *Node : Scopes) {
+ MachineBasicBlock *MBB = Node->getBlock();
+ EnterScope(MBB);
+ Changed |= ProcessBlockCSE(MBB);
+ // If it's a leaf node, it's done. Traverse upwards to pop ancestors.
+ ExitScopeIfDone(Node, OpenChildren);
+ }
+
+ return Changed;
+}
+
+// We use stronger checks for PRE candidates than for CSE ones, covering the
+// checks inside ProcessBlockCSE() as well as those inside isCSECandidate().
+// This helps to exclude instrs created by PRE that won't be CSEed later.
+bool MachineCSE::isPRECandidate(MachineInstr *MI,
+ SmallSet<MCRegister, 8> &PhysRefs) {
+ if (!isCSECandidate(MI) ||
+ MI->isNotDuplicable() ||
+ MI->mayLoad() ||
+ TII->isAsCheapAsAMove(*MI) ||
+ MI->getNumDefs() != 1 ||
+ MI->getNumExplicitDefs() != 1)
+ return false;
+
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isReg() && !MO.getReg().isVirtual()) {
+ if (MO.isDef())
+ return false;
+ else
+ PhysRefs.insert(MO.getReg());
+ }
+ }
+
+ return true;
+}
+
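+// For each PRE candidate in MBB that has already been seen in another block,
+// duplicate it into the nearest common dominator (when legal and profitable)
+// so that the subsequent CSE walk can remove both occurrences.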
+bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
+ MachineBasicBlock *MBB) {
+ bool Changed = false;
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
+ SmallSet<MCRegister, 8> PhysRefs;
+ if (!isPRECandidate(&MI, PhysRefs))
+ continue;
+
+ if (!PREMap.count(&MI)) {
+ PREMap[&MI] = MBB;
+ continue;
+ }
+
+ auto MBB1 = PREMap[&MI];
+ assert(
+ !DT->properlyDominates(MBB, MBB1) &&
+ "MBB cannot properly dominate MBB1 while DFS through dominators tree!");
+ auto CMBB = DT->findNearestCommonDominator(MBB, MBB1);
+ if (!CMBB->isLegalToHoistInto())
+ continue;
+
+ if (!isProfitableToHoistInto(CMBB, MBB, MBB1))
+ continue;
+
+ // Two instrs are partially redundant if their basic blocks are reachable
+ // from one another but neither dominates the other.
+ if (CMBB != MBB1) {
+ auto BB = MBB->getBasicBlock(), BB1 = MBB1->getBasicBlock();
+ if (BB != nullptr && BB1 != nullptr &&
+ (isPotentiallyReachable(BB1, BB) ||
+ isPotentiallyReachable(BB, BB1))) {
+ // The following check extends the definition of `isConvergent` to
+ // assume a convergent instruction is dependent not only on additional
+ // conditions, but also on fewer conditions. LLVM does not have a
+ // MachineInstr attribute which expresses this extended definition, so
+ // it's necessary to use `isConvergent` to prevent illegally PRE-ing the
+ // subset of `isConvergent` instructions which do fall into this
+ // extended definition.
+ if (MI.isConvergent() && CMBB != MBB)
+ continue;
+
+ // If this instruction uses physical registers then we can only do PRE
+ // if it's using the value that is live at the place we're hoisting to.
+ bool NonLocal;
+ PhysDefVector PhysDefs;
+ if (!PhysRefs.empty() &&
+ !PhysRegDefsReach(&*(CMBB->getFirstTerminator()), &MI, PhysRefs,
+ PhysDefs, NonLocal))
+ continue;
+
+ assert(MI.getOperand(0).isDef() &&
+ "First operand of instr with one explicit def must be this def");
+ Register VReg = MI.getOperand(0).getReg();
+ Register NewReg = MRI->cloneVirtualRegister(VReg);
+ if (!isProfitableToCSE(NewReg, VReg, CMBB, &MI))
+ continue;
+ MachineInstr &NewMI =
+ TII->duplicate(*CMBB, CMBB->getFirstTerminator(), MI);
+
+ // When hoisting, make sure we don't carry the debug location of
+ // the original instruction, as that's not correct and can cause
+ // unexpected jumps when debugging optimized code.
+ auto EmptyDL = DebugLoc();
+ NewMI.setDebugLoc(EmptyDL);
+
+ NewMI.getOperand(0).setReg(NewReg);
+
+ PREMap[&MI] = CMBB;
+ ++NumPREs;
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+// This simple PRE (partial redundancy elimination) pass doesn't actually
+// eliminate partial redundancy but transforms it into full redundancy,
+// anticipating that the next CSE step will eliminate this created redundancy.
+// If CSE doesn't eliminate it, the created instruction will remain dead and be
+// eliminated later by the Remove Dead Machine Instructions pass.
+bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
+ SmallVector<MachineDomTreeNode *, 32> BBs;
+
+ PREMap.clear();
+ bool Changed = false;
+ BBs.push_back(DT->getRootNode());
+ do {
+ auto Node = BBs.pop_back_val();
+ append_range(BBs, Node->children());
+
+ MachineBasicBlock *MBB = Node->getBlock();
+ Changed |= ProcessBlockPRE(DT, MBB);
+
+ } while (!BBs.empty());
+
+ return Changed;
+}
+
+bool MachineCSE::isProfitableToHoistInto(MachineBasicBlock *CandidateBB,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *MBB1) {
+ if (CandidateBB->getParent()->getFunction().hasMinSize())
+ return true;
+ assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB");
+ assert(DT->dominates(CandidateBB, MBB1) &&
+ "CandidateBB should dominate MBB1");
+ return MBFI->getBlockFreq(CandidateBB) <=
+ MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB1);
+}
+
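+// Pass entry point: run the simple PRE step first to turn partial
+// redundancies into full ones, then run the dominator-tree based CSE.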
+bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DT = &getAnalysis<MachineDominatorTree>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ LookAheadLimit = TII->getMachineCSELookAheadLimit();
+ bool ChangedPRE, ChangedCSE;
+ ChangedPRE = PerformSimplePRE(DT);
+ ChangedCSE = PerformCSE(DT->getRootNode());
+ return ChangedPRE || ChangedCSE;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineCheckDebugify.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineCheckDebugify.cpp
new file mode 100644
index 000000000000..874f726d2947
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineCheckDebugify.cpp
@@ -0,0 +1,127 @@
+//===- MachineCheckDebugify.cpp - Check debug info ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This checks debug info after mir-debugify (+ pass-to-test). Currently
+/// it simply checks the integrity of line info in DILocation and
+/// DILocalVariable which mir-debugify generated before.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "mir-check-debugify"
+
+using namespace llvm;
+
+namespace {
+
+struct CheckDebugMachineModule : public ModulePass {
+ bool runOnModule(Module &M) override {
+ NamedMDNode *NMD = M.getNamedMetadata("llvm.mir.debugify");
+ if (!NMD) {
+ errs() << "WARNING: Please run mir-debugify to generate "
+ "llvm.mir.debugify metadata first.\n";
+ return false;
+ }
+
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+ auto getDebugifyOperand = [&](unsigned Idx) -> unsigned {
+ return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0))
+ ->getZExtValue();
+ };
+ assert(NMD->getNumOperands() == 2 &&
+ "llvm.mir.debugify should have exactly 2 operands!");
+ unsigned NumLines = getDebugifyOperand(0);
+ unsigned NumVars = getDebugifyOperand(1);
+ BitVector MissingLines{NumLines, true};
+ BitVector MissingVars{NumVars, true};
+
+ for (Function &F : M.functions()) {
+ MachineFunction *MF = MMI.getMachineFunction(F);
+ if (!MF)
+ continue;
+ for (MachineBasicBlock &MBB : *MF) {
+ // Find missing lines.
+ // TODO: Avoid meta instructions other than dbg_val.
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
+ const DebugLoc DL = MI.getDebugLoc();
+ if (DL && DL.getLine() != 0) {
+ MissingLines.reset(DL.getLine() - 1);
+ continue;
+ }
+
+ if (!DL) {
+ errs() << "WARNING: Instruction with empty DebugLoc in function ";
+ errs() << F.getName() << " --";
+ MI.print(errs());
+ }
+ }
+
+ // Find missing variables.
+ // TODO: Handle DBG_INSTR_REF which is under an experimental option now.
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isDebugValue())
+ continue;
+ const DILocalVariable *LocalVar = MI.getDebugVariable();
+ unsigned Var = ~0U;
+
+ (void)to_integer(LocalVar->getName(), Var, 10);
+ assert(Var <= NumVars && "Unexpected name for DILocalVariable");
+ MissingVars.reset(Var - 1);
+ }
+ }
+ }
+
+ bool Fail = false;
+ for (unsigned Idx : MissingLines.set_bits()) {
+ errs() << "WARNING: Missing line " << Idx + 1 << "\n";
+ Fail = true;
+ }
+
+ for (unsigned Idx : MissingVars.set_bits()) {
+ errs() << "WARNING: Missing variable " << Idx + 1 << "\n";
+ Fail = true;
+ }
+ errs() << "Machine IR debug info check: ";
+ errs() << (Fail ? "FAIL" : "PASS") << "\n";
+
+ return false;
+ }
+
+ CheckDebugMachineModule() : ModulePass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+};
+char CheckDebugMachineModule::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(CheckDebugMachineModule, DEBUG_TYPE,
+ "Machine Check Debug Module", false, false)
+INITIALIZE_PASS_END(CheckDebugMachineModule, DEBUG_TYPE,
+ "Machine Check Debug Module", false, false)
+
+ModulePass *llvm::createCheckDebugMachineModulePass() {
+ return new CheckDebugMachineModule();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp
new file mode 100644
index 000000000000..c65937935ed8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -0,0 +1,769 @@
+//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The machine combiner pass uses machine trace metrics to ensure the combined
+// instructions do not lengthen the critical path or the resource depth.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-combiner"
+
+STATISTIC(NumInstCombined, "Number of machine instructions combined");
+
+static cl::opt<unsigned>
+inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
+ cl::desc("Incremental depth computation will be used for basic "
+ "blocks with more instructions than this threshold."), cl::init(500));
+
+static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
+ cl::desc("Dump all substituted intrs"),
+ cl::init(false));
+
+#ifdef EXPENSIVE_CHECKS
+static cl::opt<bool> VerifyPatternOrder(
+ "machine-combiner-verify-pattern-order", cl::Hidden,
+ cl::desc(
+ "Verify that the generated patterns are ordered by increasing latency"),
+ cl::init(true));
+#else
+static cl::opt<bool> VerifyPatternOrder(
+ "machine-combiner-verify-pattern-order", cl::Hidden,
+ cl::desc(
+ "Verify that the generated patterns are ordered by increasing latency"),
+ cl::init(false));
+#endif
+
+namespace {
+class MachineCombiner : public MachineFunctionPass {
+ const TargetSubtargetInfo *STI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ MCSchedModel SchedModel;
+ MachineRegisterInfo *MRI = nullptr;
+ MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
+ MachineTraceMetrics *Traces = nullptr;
+ MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
+ MachineBlockFrequencyInfo *MBFI = nullptr;
+ ProfileSummaryInfo *PSI = nullptr;
+ RegisterClassInfo RegClassInfo;
+
+ TargetSchedModel TSchedModel;
+
+ /// True if optimizing for code size.
+ bool OptSize = false;
+
+public:
+ static char ID;
+ MachineCombiner() : MachineFunctionPass(ID) {
+ initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return "Machine InstCombiner"; }
+
+private:
+ bool combineInstructions(MachineBasicBlock *);
+ MachineInstr *getOperandDef(const MachineOperand &MO);
+ bool isTransientMI(const MachineInstr *MI);
+ unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ MachineTraceMetrics::Trace BlockTrace,
+ const MachineBasicBlock &MBB);
+ unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
+ MachineTraceMetrics::Trace BlockTrace);
+ bool
+ improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
+ MachineTraceMetrics::Trace BlockTrace,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ MachineCombinerPattern Pattern, bool SlackIsAccurate);
+ bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ MachineCombinerPattern Pattern);
+ bool preservesResourceLen(MachineBasicBlock *MBB,
+ MachineTraceMetrics::Trace BlockTrace,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs);
+ void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
+ SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
+ std::pair<unsigned, unsigned>
+ getLatenciesForInstrSequences(MachineInstr &MI,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ MachineTraceMetrics::Trace BlockTrace);
+
+ void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
+ SmallVector<MachineCombinerPattern, 16> &Patterns);
+};
+}
+
+char MachineCombiner::ID = 0;
+char &llvm::MachineCombinerID = MachineCombiner::ID;
+
+INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
+ "Machine InstCombiner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
+ false, false)
+
+void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *
+MachineCombiner::getOperandDef(const MachineOperand &MO) {
+ MachineInstr *DefInstr = nullptr;
+ // We need a virtual register definition.
+ if (MO.isReg() && MO.getReg().isVirtual())
+ DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+ // PHI's have no depth etc.
+ if (DefInstr && DefInstr->isPHI())
+ DefInstr = nullptr;
+ return DefInstr;
+}
+
+/// Return true if MI is unlikely to generate an actual target instruction.
+bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
+ if (!MI->isCopy())
+ return MI->isTransient();
+
+ // If MI is a COPY, check if its src and dst registers can be coalesced.
+ Register Dst = MI->getOperand(0).getReg();
+ Register Src = MI->getOperand(1).getReg();
+
+ if (!MI->isFullCopy()) {
+ // If src RC contains super registers of dst RC, it can also be coalesced.
+ if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
+ return false;
+
+ auto SrcSub = MI->getOperand(1).getSubReg();
+ auto SrcRC = MRI->getRegClass(Src);
+ auto DstRC = MRI->getRegClass(Dst);
+ return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
+ }
+
+ if (Src.isPhysical() && Dst.isPhysical())
+ return Src == Dst;
+
+ if (Src.isVirtual() && Dst.isVirtual()) {
+ auto SrcRC = MRI->getRegClass(Src);
+ auto DstRC = MRI->getRegClass(Dst);
+ return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
+ }
+
+ if (Src.isVirtual())
+ std::swap(Src, Dst);
+
+ // Now Src is physical register, Dst is virtual register.
+ auto DstRC = MRI->getRegClass(Dst);
+ return DstRC->contains(Src);
+}
+
+/// Computes the depth of instructions in the vector \p InsInstrs.
+///
+/// \param InsInstrs is a vector of machine instructions
+/// \param InstrIdxForVirtReg is a dense map of virtual register to index
+/// of defining machine instruction in \p InsInstrs
+/// \param BlockTrace is a trace of machine instructions
+///
+/// \returns Depth of the last instruction in \p InsInstrs ("NewRoot")
+unsigned
+MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ MachineTraceMetrics::Trace BlockTrace,
+ const MachineBasicBlock &MBB) {
+ SmallVector<unsigned, 16> InstrDepth;
+ // For each instruction in the new sequence compute the depth based on the
+ // operands. Use the trace information when possible. For new operands which
+ // are tracked in the InstrIdxForVirtReg map, the depth is looked up in InstrDepth.
+ for (auto *InstrPtr : InsInstrs) { // for each Use
+ unsigned IDepth = 0;
+ for (const MachineOperand &MO : InstrPtr->all_uses()) {
+ // Check for virtual register operand.
+ if (!MO.getReg().isVirtual())
+ continue;
+ unsigned DepthOp = 0;
+ unsigned LatencyOp = 0;
+ DenseMap<unsigned, unsigned>::iterator II =
+ InstrIdxForVirtReg.find(MO.getReg());
+ if (II != InstrIdxForVirtReg.end()) {
+ // Operand is new virtual register not in trace
+ assert(II->second < InstrDepth.size() && "Bad Index");
+ MachineInstr *DefInstr = InsInstrs[II->second];
+ assert(DefInstr &&
+ "There must be a definition for a new virtual register");
+ DepthOp = InstrDepth[II->second];
+ int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
+ int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
+ LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
+ InstrPtr, UseIdx);
+ } else {
+ MachineInstr *DefInstr = getOperandDef(MO);
+ if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
+ MachineTraceStrategy::TS_Local ||
+ DefInstr->getParent() == &MBB)) {
+ DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
+ if (!isTransientMI(DefInstr))
+ LatencyOp = TSchedModel.computeOperandLatency(
+ DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+ InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+ }
+ }
+ IDepth = std::max(IDepth, DepthOp + LatencyOp);
+ }
+ InstrDepth.push_back(IDepth);
+ }
+ unsigned NewRootIdx = InsInstrs.size() - 1;
+ return InstrDepth[NewRootIdx];
+}
+
+/// Computes instruction latency as max of latency of defined operands.
+///
+/// \param Root is a machine instruction that could be replaced by NewRoot.
+/// It is used to compute a more accurate latency information for NewRoot in
+/// case there is a dependent instruction in the same trace (\p BlockTrace)
+/// \param NewRoot is the instruction for which the latency is computed
+/// \param BlockTrace is a trace of machine instructions
+///
+/// \returns Latency of \p NewRoot
+unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
+ MachineTraceMetrics::Trace BlockTrace) {
+ // Check each definition in NewRoot and compute the latency
+ unsigned NewRootLatency = 0;
+
+ for (const MachineOperand &MO : NewRoot->all_defs()) {
+ // Check for virtual register operand.
+ if (!MO.getReg().isVirtual())
+ continue;
+ // Get the first instruction that uses MO
+ MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
+ RI++;
+ if (RI == MRI->reg_end())
+ continue;
+ MachineInstr *UseMO = RI->getParent();
+ unsigned LatencyOp = 0;
+ if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
+ LatencyOp = TSchedModel.computeOperandLatency(
+ NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
+ UseMO->findRegisterUseOperandIdx(MO.getReg()));
+ } else {
+ LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
+ }
+ NewRootLatency = std::max(NewRootLatency, LatencyOp);
+ }
+ return NewRootLatency;
+}
+
+/// The combiner's goal may differ based on which pattern it is attempting
+/// to optimize.
+enum class CombinerObjective {
+ MustReduceDepth, // The data dependency chain must be improved.
+ MustReduceRegisterPressure, // The register pressure must be reduced.
+ Default // The critical path must not be lengthened.
+};
+
+static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
+ // TODO: If C++ ever gets a real enum class, make this part of the
+ // MachineCombinerPattern class.
+ switch (P) {
+ case MachineCombinerPattern::REASSOC_AX_BY:
+ case MachineCombinerPattern::REASSOC_AX_YB:
+ case MachineCombinerPattern::REASSOC_XA_BY:
+ case MachineCombinerPattern::REASSOC_XA_YB:
+ case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
+ case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
+ case MachineCombinerPattern::SUBADD_OP1:
+ case MachineCombinerPattern::SUBADD_OP2:
+ case MachineCombinerPattern::FMADD_AX:
+ case MachineCombinerPattern::FMADD_XA:
+ case MachineCombinerPattern::FMSUB:
+ case MachineCombinerPattern::FNMSUB:
+ return CombinerObjective::MustReduceDepth;
+ case MachineCombinerPattern::REASSOC_XY_BCA:
+ case MachineCombinerPattern::REASSOC_XY_BAC:
+ return CombinerObjective::MustReduceRegisterPressure;
+ default:
+ return CombinerObjective::Default;
+ }
+}
+
+/// Estimate the latency of the new and original instruction sequences by summing
+/// up the latencies of the inserted and deleted instructions. This assumes
+/// that the inserted and deleted instructions are dependent instruction chains,
+/// which might not hold in all cases.
+std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
+ MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ MachineTraceMetrics::Trace BlockTrace) {
+ assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
+ unsigned NewRootLatency = 0;
+ // NewRoot is the last instruction in the \p InsInstrs vector.
+ MachineInstr *NewRoot = InsInstrs.back();
+ for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
+ NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
+ NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
+
+ unsigned RootLatency = 0;
+ for (auto *I : DelInstrs)
+ RootLatency += TSchedModel.computeInstrLatency(I);
+
+ return {NewRootLatency, RootLatency};
+}
+
+bool MachineCombiner::reduceRegisterPressure(
+ MachineInstr &Root, MachineBasicBlock *MBB,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ MachineCombinerPattern Pattern) {
+ // FIXME: for now, we don't do any check for the register pressure patterns.
+ // We treat them as always profitable. But we can do better if we make
+ // RegPressureTracker class be aware of TIE attribute. Then we can get an
+ // accurate compare of register pressure with DelInstrs or InsInstrs.
+ return true;
+}
+
+/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
+/// The new code sequence ends in MI NewRoot. A necessary condition for the new
+/// sequence to replace the old sequence is that it cannot lengthen the critical
+/// path. The definition of "improve" may be restricted by specifying that the
+/// new path improves the data dependency chain (MustReduceDepth).
+bool MachineCombiner::improvesCriticalPathLen(
+ MachineBasicBlock *MBB, MachineInstr *Root,
+ MachineTraceMetrics::Trace BlockTrace,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ MachineCombinerPattern Pattern,
+ bool SlackIsAccurate) {
+ // Get depth and latency of NewRoot and Root.
+ unsigned NewRootDepth =
+ getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
+ unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
+
+ LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
+ << NewRootDepth << "\tRootDepth: " << RootDepth);
+
+ // For a transform such as reassociation, the cost equation is
+ // conservatively calculated so that we must improve the depth (data
+ // dependency cycles) in the critical path to proceed with the transform.
+ // Being conservative also protects against inaccuracies in the underlying
+ // machine trace metrics and CPU models.
+ if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
+ LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
+ LLVM_DEBUG(NewRootDepth < RootDepth
+ ? dbgs() << "\t and it does it\n"
+ : dbgs() << "\t but it does NOT do it\n");
+ return NewRootDepth < RootDepth;
+ }
+
+ // A more flexible cost calculation for the critical path includes the slack
+ // of the original code sequence. This may allow the transform to proceed
+ // even if the instruction depths (data dependency cycles) become worse.
+
+ // Account for the latency of the inserted and deleted instructions.
+ unsigned NewRootLatency, RootLatency;
+ if (TII->accumulateInstrSeqToRootLatency(*Root)) {
+ std::tie(NewRootLatency, RootLatency) =
+ getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+ } else {
+ NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
+ RootLatency = TSchedModel.computeInstrLatency(Root);
+ }
+
+ unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
+ unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+ unsigned OldCycleCount =
+ RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
+ LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
+ << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
+ << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
+ << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
+ << "\n\tRootDepth + RootLatency + RootSlack = "
+ << OldCycleCount;);
+ LLVM_DEBUG(NewCycleCount <= OldCycleCount
+ ? dbgs() << "\n\t It IMPROVES PathLen because"
+ : dbgs() << "\n\t It DOES NOT improve PathLen because");
+ LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
+ << ", OldCycleCount = " << OldCycleCount << "\n");
+
+ return NewCycleCount <= OldCycleCount;
+}
+
+/// Helper routine to convert instructions into their scheduling classes (SC).
+void MachineCombiner::instr2instrSC(
+ SmallVectorImpl<MachineInstr *> &Instrs,
+ SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
+ for (auto *InstrPtr : Instrs) {
+ unsigned Opc = InstrPtr->getOpcode();
+ unsigned Idx = TII->get(Opc).getSchedClass();
+ const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
+ InstrsSC.push_back(SC);
+ }
+}
+
+/// True when the new instructions do not increase resource length
+bool MachineCombiner::preservesResourceLen(
+ MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs) {
+ if (!TSchedModel.hasInstrSchedModel())
+ return true;
+
+ // Compute current resource length
+
+ //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
+ SmallVector <const MachineBasicBlock *, 1> MBBarr;
+ MBBarr.push_back(MBB);
+ unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
+
+ // Deal with SC rather than Instructions.
+ SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
+ SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
+
+ instr2instrSC(InsInstrs, InsInstrsSC);
+ instr2instrSC(DelInstrs, DelInstrsSC);
+
+ ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
+ ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};
+
+ // Compute new resource length.
+ unsigned ResLenAfterCombine =
+ BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
+
+ LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
+ << ResLenBeforeCombine
+ << " and after: " << ResLenAfterCombine << "\n";);
+ LLVM_DEBUG(
+ ResLenAfterCombine <=
+ ResLenBeforeCombine + TII->getExtendResourceLenLimit()
+ ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
+ : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
+ "Length\n");
+
+ return ResLenAfterCombine <=
+ ResLenBeforeCombine + TII->getExtendResourceLenLimit();
+}
+
+/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
+/// depths if requested.
+///
+/// \param MBB basic block to insert instructions in
+/// \param MI current machine instruction
+/// \param InsInstrs new instructions to insert in \p MBB
+/// \param DelInstrs instruction to delete from \p MBB
+/// \param TraceEnsemble is a pointer to the machine trace information
+/// \param RegUnits set of live registers, needed to compute instruction depths
+/// \param TII is target instruction info, used to call target hook
+/// \param Pattern is used to call target hook finalizeInsInstrs
+/// \param IncrementalUpdate if true, compute instruction depths incrementally,
+/// otherwise invalidate the trace
+static void insertDeleteInstructions(
+ MachineBasicBlock *MBB, MachineInstr &MI,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ MachineTraceMetrics::Ensemble *TraceEnsemble,
+ SparseSet<LiveRegUnit> &RegUnits, const TargetInstrInfo *TII,
+ MachineCombinerPattern Pattern, bool IncrementalUpdate) {
+ // If we want to fix up some placeholder for some target, do it now.
+ // We need this because in genAlternativeCodeSequence, we have not yet decided
+ // whether InsInstrs or DelInstrs is the better choice, so we don't want to
+ // introduce side effects into the function. For example, we need to delay
+ // constant pool entry creation until after InsInstrs is selected as the
+ // better pattern; otherwise the constant pool entry created for InsInstrs
+ // will not be deleted even if InsInstrs is not the better pattern.
+ TII->finalizeInsInstrs(MI, Pattern, InsInstrs);
+
+ for (auto *InstrPtr : InsInstrs)
+ MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
+
+ for (auto *InstrPtr : DelInstrs) {
+ InstrPtr->eraseFromParent();
+ // Erase all LiveRegs defined by the removed instruction
+ for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
+ if (I->MI == InstrPtr)
+ I = RegUnits.erase(I);
+ else
+ I++;
+ }
+ }
+
+ if (IncrementalUpdate)
+ for (auto *InstrPtr : InsInstrs)
+ TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
+ else
+ TraceEnsemble->invalidate(MBB);
+
+ NumInstCombined++;
+}
+
+// Check that the difference between original and new latency is decreasing for
+// later patterns. This helps to discover sub-optimal pattern orderings.
+void MachineCombiner::verifyPatternOrder(
+ MachineBasicBlock *MBB, MachineInstr &Root,
+ SmallVector<MachineCombinerPattern, 16> &Patterns) {
+ long PrevLatencyDiff = std::numeric_limits<long>::max();
+ (void)PrevLatencyDiff; // Variable is used in assert only.
+ for (auto P : Patterns) {
+ SmallVector<MachineInstr *, 16> InsInstrs;
+ SmallVector<MachineInstr *, 16> DelInstrs;
+ DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+ TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
+ InstrIdxForVirtReg);
+ // Found pattern, but did not generate alternative sequence.
+ // This can happen e.g. when an immediate could not be materialized
+ // in a single instruction.
+ if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
+ continue;
+
+ unsigned NewRootLatency, RootLatency;
+ std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
+ Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
+ long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
+ assert(CurrentLatencyDiff <= PrevLatencyDiff &&
+ "Current pattern is better than previous pattern.");
+ PrevLatencyDiff = CurrentLatencyDiff;
+ }
+}
+
+/// Substitute a slow code sequence with a faster one by
+/// evaluating instruction combining pattern.
+/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
+/// combining based on machine trace metrics. Only combine a sequence of
+/// instructions when this neither lengthens the critical path nor increases
+/// resource pressure. When optimizing for code size, always combine when the
+/// new sequence is shorter.
+bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
+
+ bool IncrementalUpdate = false;
+ auto BlockIter = MBB->begin();
+ decltype(BlockIter) LastUpdate;
+ // Check if the block is in a loop.
+ const MachineLoop *ML = MLI->getLoopFor(MBB);
+ if (!TraceEnsemble)
+ TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());
+
+ SparseSet<LiveRegUnit> RegUnits;
+ RegUnits.setUniverse(TRI->getNumRegUnits());
+
+ bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+
+ bool DoRegPressureReduce =
+ TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
+
+ while (BlockIter != MBB->end()) {
+ auto &MI = *BlockIter++;
+ SmallVector<MachineCombinerPattern, 16> Patterns;
+ // The motivating example is:
+ //
+ // MUL Other MUL_op1 MUL_op2 Other
+ // \ / \ | /
+ // ADD/SUB => MADD/MSUB
+ // (=Root) (=NewRoot)
+
+ // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
+ // usually beneficial for code size it unfortunately can hurt performance
+ // when the ADD is on the critical path, but the MUL is not. With the
+ // substitution the MUL becomes part of the critical path (in form of the
+ // MADD) and can lengthen it on architectures where the MADD latency is
+ // longer than the ADD latency.
+ //
+ // For each instruction we check if it can be the root of a combiner
+ // pattern. Then for each pattern the new code sequence in form of MI is
+ // generated and evaluated. When the efficiency criteria (don't lengthen
+ // critical path, don't use more resources) is met the new sequence gets
+ // hooked up into the basic block before the old sequence is removed.
+ //
+ // The algorithm does not try to evaluate all patterns and pick the best.
+ // This is only an artificial restriction though. In practice there is
+ // mostly one pattern, and getMachineCombinerPatterns() can order patterns
+ // based on an internal cost heuristic. If
+ // machine-combiner-verify-pattern-order is enabled, all patterns are
+ // checked to ensure later patterns do not provide better latency savings.
+
+ if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
+ continue;
+
+ if (VerifyPatternOrder)
+ verifyPatternOrder(MBB, MI, Patterns);
+
+ for (const auto P : Patterns) {
+ SmallVector<MachineInstr *, 16> InsInstrs;
+ SmallVector<MachineInstr *, 16> DelInstrs;
+ DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+ TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
+ InstrIdxForVirtReg);
+ // Found pattern, but did not generate alternative sequence.
+ // This can happen e.g. when an immediate could not be materialized
+ // in a single instruction.
+ if (InsInstrs.empty())
+ continue;
+
+ LLVM_DEBUG(if (dump_intrs) {
+ dbgs() << "\tFor the Pattern (" << (int)P
+ << ") these instructions could be removed\n";
+ for (auto const *InstrPtr : DelInstrs)
+ InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
+ /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
+ dbgs() << "\tThese instructions could replace the removed ones\n";
+ for (auto const *InstrPtr : InsInstrs)
+ InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
+ /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
+ });
+
+ if (IncrementalUpdate && LastUpdate != BlockIter) {
+ // Update depths since the last incremental update.
+ TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
+ LastUpdate = BlockIter;
+ }
+
+ if (DoRegPressureReduce &&
+ getCombinerObjective(P) ==
+ CombinerObjective::MustReduceRegisterPressure) {
+ if (MBB->size() > inc_threshold) {
+ // Use incremental depth updates for basic blocks above threshold
+ IncrementalUpdate = true;
+ LastUpdate = BlockIter;
+ }
+ if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
+ // Replace DelInstrs with InsInstrs.
+ insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
+ RegUnits, TII, P, IncrementalUpdate);
+ Changed |= true;
+
+ // Go back to previous instruction as it may have ILP reassociation
+ // opportunity.
+ BlockIter--;
+ break;
+ }
+ }
+
+ if (ML && TII->isThroughputPattern(P)) {
+ LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
+ insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
+ RegUnits, TII, P, IncrementalUpdate);
+ // Eagerly stop after the first pattern fires.
+ Changed = true;
+ break;
+ } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
+ LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
+ << InsInstrs.size() << " < "
+ << DelInstrs.size() << ")\n");
+ insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
+ RegUnits, TII, P, IncrementalUpdate);
+ // Eagerly stop after the first pattern fires.
+ Changed = true;
+ break;
+ } else {
+ // For big basic blocks, we only compute the full trace the first time
+ // we hit this. We do not invalidate the trace, but instead update the
+ // instruction depths incrementally.
+ // NOTE: Only the instruction depths up to MI are accurate. All other
+ // trace information is not updated.
+ MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
+ Traces->verifyAnalysis();
+ if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
+ InstrIdxForVirtReg, P,
+ !IncrementalUpdate) &&
+ preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
+ if (MBB->size() > inc_threshold) {
+            // Use incremental depth updates for basic blocks above threshold
+ IncrementalUpdate = true;
+ LastUpdate = BlockIter;
+ }
+
+ insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
+ RegUnits, TII, P, IncrementalUpdate);
+
+ // Eagerly stop after the first pattern fires.
+ Changed = true;
+ break;
+ }
+ // Cleanup instructions of the alternative code sequence. There is no
+ // use for them.
+ MachineFunction *MF = MBB->getParent();
+ for (auto *InstrPtr : InsInstrs)
+ MF->deleteMachineInstr(InstrPtr);
+ }
+ InstrIdxForVirtReg.clear();
+ }
+ }
+
+ if (Changed && IncrementalUpdate)
+ Traces->invalidate(MBB);
+ return Changed;
+}
+
+bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
+ STI = &MF.getSubtarget();
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ SchedModel = STI->getSchedModel();
+ TSchedModel.init(STI);
+ MRI = &MF.getRegInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+ TraceEnsemble = nullptr;
+ OptSize = MF.getFunction().hasOptSize();
+ RegClassInfo.runOnMachineFunction(MF);
+
+ LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
+ if (!TII->useMachineCombiner()) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Skipping pass: Target does not support machine combiner\n");
+ return false;
+ }
+
+ bool Changed = false;
+
+ // Try to combine instructions.
+ for (auto &MBB : MF)
+ Changed |= combineInstructions(&MBB);
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineCopyPropagation.cpp
new file mode 100644
index 000000000000..3453e6c0b8be
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -0,0 +1,1424 @@
+//===- MachineCopyPropagation.cpp - Machine Copy Propagation Pass ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an extremely simple MachineInstr-level copy propagation pass.
+//
+// This pass forwards the source of COPYs to the users of their destinations
+// when doing so is legal. For example:
+//
+// %reg1 = COPY %reg0
+// ...
+// ... = OP %reg1
+//
+// If
+// - %reg0 has not been clobbered by the time of the use of %reg1
+// - the register class constraints are satisfied
+// - the COPY def is the only value that reaches OP
+// then this pass replaces the above with:
+//
+// %reg1 = COPY %reg0
+// ...
+// ... = OP %reg0
+//
+// This pass also removes some redundant COPYs. For example:
+//
+// %R1 = COPY %R0
+// ... // No clobber of %R1
+// %R0 = COPY %R1 <<< Removed
+//
+// or
+//
+// %R1 = COPY %R0
+// ... // No clobber of %R0
+// %R1 = COPY %R0 <<< Removed
+//
+// or
+//
+// $R0 = OP ...
+// ... // No read/clobber of $R0 and $R1
+// $R1 = COPY $R0 // $R0 is killed
+// Replace $R0 with $R1 and remove the COPY
+// $R1 = OP ...
+// ...
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-cp"
+
+STATISTIC(NumDeletes, "Number of dead copies deleted");
+STATISTIC(NumCopyForwards, "Number of copy uses forwarded");
+STATISTIC(NumCopyBackwardPropagated, "Number of copy defs backward propagated");
+STATISTIC(SpillageChainsLength, "Length of spillage chains");
+STATISTIC(NumSpillageChains, "Number of spillage chains");
+DEBUG_COUNTER(FwdCounter, "machine-cp-fwd",
+ "Controls which register COPYs are forwarded");
+
+static cl::opt<bool> MCPUseCopyInstr("mcp-use-is-copy-instr", cl::init(false),
+ cl::Hidden);
+static cl::opt<cl::boolOrDefault>
+ EnableSpillageCopyElimination("enable-spill-copy-elim", cl::Hidden);
+
+namespace {
+
+static std::optional<DestSourcePair> isCopyInstr(const MachineInstr &MI,
+ const TargetInstrInfo &TII,
+ bool UseCopyInstr) {
+ if (UseCopyInstr)
+ return TII.isCopyInstr(MI);
+
+ if (MI.isCopy())
+ return std::optional<DestSourcePair>(
+ DestSourcePair{MI.getOperand(0), MI.getOperand(1)});
+
+ return std::nullopt;
+}
+
+class CopyTracker {
+ struct CopyInfo {
+ MachineInstr *MI, *LastSeenUseInCopy;
+ SmallVector<MCRegister, 4> DefRegs;
+ bool Avail;
+ };
+
+ DenseMap<MCRegister, CopyInfo> Copies;
+
+public:
+ /// Mark all of the given registers and their subregisters as unavailable for
+ /// copying.
+ void markRegsUnavailable(ArrayRef<MCRegister> Regs,
+ const TargetRegisterInfo &TRI) {
+ for (MCRegister Reg : Regs) {
+ // Source of copy is no longer available for propagation.
+ for (MCRegUnit Unit : TRI.regunits(Reg)) {
+ auto CI = Copies.find(Unit);
+ if (CI != Copies.end())
+ CI->second.Avail = false;
+ }
+ }
+ }
+
+ /// Remove register from copy maps.
+ void invalidateRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII, bool UseCopyInstr) {
+    // Since Reg might be a subreg of some registers, only invalidating Reg is
+    // not enough. We have to find the COPYs that define Reg or the registers
+    // defined by Reg and invalidate all of them.
+ SmallSet<MCRegister, 8> RegsToInvalidate;
+ RegsToInvalidate.insert(Reg);
+ for (MCRegUnit Unit : TRI.regunits(Reg)) {
+ auto I = Copies.find(Unit);
+ if (I != Copies.end()) {
+ if (MachineInstr *MI = I->second.MI) {
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*MI, TII, UseCopyInstr);
+ assert(CopyOperands && "Expect copy");
+
+ RegsToInvalidate.insert(
+ CopyOperands->Destination->getReg().asMCReg());
+ RegsToInvalidate.insert(CopyOperands->Source->getReg().asMCReg());
+ }
+ RegsToInvalidate.insert(I->second.DefRegs.begin(),
+ I->second.DefRegs.end());
+ }
+ }
+ for (MCRegister InvalidReg : RegsToInvalidate)
+ for (MCRegUnit Unit : TRI.regunits(InvalidReg))
+ Copies.erase(Unit);
+ }
+
+ /// Clobber a single register, removing it from the tracker's copy maps.
+ void clobberRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII, bool UseCopyInstr) {
+ for (MCRegUnit Unit : TRI.regunits(Reg)) {
+ auto I = Copies.find(Unit);
+ if (I != Copies.end()) {
+ // When we clobber the source of a copy, we need to clobber everything
+ // it defined.
+ markRegsUnavailable(I->second.DefRegs, TRI);
+ // When we clobber the destination of a copy, we need to clobber the
+ // whole register it defined.
+ if (MachineInstr *MI = I->second.MI) {
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*MI, TII, UseCopyInstr);
+ markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()},
+ TRI);
+ }
+ // Now we can erase the copy.
+ Copies.erase(I);
+ }
+ }
+ }
+
+ /// Add this copy's registers into the tracker's copy maps.
+ void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII, bool UseCopyInstr) {
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*MI, TII, UseCopyInstr);
+ assert(CopyOperands && "Tracking non-copy?");
+
+ MCRegister Src = CopyOperands->Source->getReg().asMCReg();
+ MCRegister Def = CopyOperands->Destination->getReg().asMCReg();
+
+ // Remember Def is defined by the copy.
+ for (MCRegUnit Unit : TRI.regunits(Def))
+ Copies[Unit] = {MI, nullptr, {}, true};
+
+    // Remember the source that's copied to Def. Once it's clobbered, it's no
+    // longer available for copy propagation.
+ for (MCRegUnit Unit : TRI.regunits(Src)) {
+ auto I = Copies.insert({Unit, {nullptr, nullptr, {}, false}});
+ auto &Copy = I.first->second;
+ if (!is_contained(Copy.DefRegs, Def))
+ Copy.DefRegs.push_back(Def);
+ Copy.LastSeenUseInCopy = MI;
+ }
+ }
+
+ bool hasAnyCopies() {
+ return !Copies.empty();
+ }
+
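+  /// Return the copy tracked for the given register unit, if any. If
+  /// MustBeAvailable is set, only return a copy that is still marked as
+  /// available, i.e. one that has not been invalidated by an intervening
+  /// clobber.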
+ MachineInstr *findCopyForUnit(MCRegister RegUnit,
+ const TargetRegisterInfo &TRI,
+ bool MustBeAvailable = false) {
+ auto CI = Copies.find(RegUnit);
+ if (CI == Copies.end())
+ return nullptr;
+ if (MustBeAvailable && !CI->second.Avail)
+ return nullptr;
+ return CI->second.MI;
+ }
+
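+  /// Given a register unit used as a copy source, if exactly one destination
+  /// register was copied from it, return the available COPY that defines that
+  /// destination; otherwise return nullptr.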
+ MachineInstr *findCopyDefViaUnit(MCRegister RegUnit,
+ const TargetRegisterInfo &TRI) {
+ auto CI = Copies.find(RegUnit);
+ if (CI == Copies.end())
+ return nullptr;
+ if (CI->second.DefRegs.size() != 1)
+ return nullptr;
+ MCRegUnit RU = *TRI.regunits(CI->second.DefRegs[0]).begin();
+ return findCopyForUnit(RU, TRI, true);
+ }
+
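+  /// Find a tracked copy whose source covers \p Reg and that is still valid
+  /// at instruction \p I, i.e. no regmask between the copy and \p I clobbers
+  /// the copy's source or destination.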
+ MachineInstr *findAvailBackwardCopy(MachineInstr &I, MCRegister Reg,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII,
+ bool UseCopyInstr) {
+ MCRegUnit RU = *TRI.regunits(Reg).begin();
+ MachineInstr *AvailCopy = findCopyDefViaUnit(RU, TRI);
+
+ if (!AvailCopy)
+ return nullptr;
+
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*AvailCopy, TII, UseCopyInstr);
+ Register AvailSrc = CopyOperands->Source->getReg();
+ Register AvailDef = CopyOperands->Destination->getReg();
+ if (!TRI.isSubRegisterEq(AvailSrc, Reg))
+ return nullptr;
+
+ for (const MachineInstr &MI :
+ make_range(AvailCopy->getReverseIterator(), I.getReverseIterator()))
+ for (const MachineOperand &MO : MI.operands())
+ if (MO.isRegMask())
+ // FIXME: Shall we simultaneously invalidate AvailSrc or AvailDef?
+ if (MO.clobbersPhysReg(AvailSrc) || MO.clobbersPhysReg(AvailDef))
+ return nullptr;
+
+ return AvailCopy;
+ }
+
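+  /// Find an available copy whose destination covers \p Reg and that can
+  /// still be forwarded at \p DestCopy, i.e. no regmask between the copy and
+  /// \p DestCopy clobbers the copy's source or destination.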
+ MachineInstr *findAvailCopy(MachineInstr &DestCopy, MCRegister Reg,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII, bool UseCopyInstr) {
+ // We check the first RegUnit here, since we'll only be interested in the
+ // copy if it copies the entire register anyway.
+ MCRegUnit RU = *TRI.regunits(Reg).begin();
+ MachineInstr *AvailCopy =
+ findCopyForUnit(RU, TRI, /*MustBeAvailable=*/true);
+
+ if (!AvailCopy)
+ return nullptr;
+
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*AvailCopy, TII, UseCopyInstr);
+ Register AvailSrc = CopyOperands->Source->getReg();
+ Register AvailDef = CopyOperands->Destination->getReg();
+ if (!TRI.isSubRegisterEq(AvailDef, Reg))
+ return nullptr;
+
+ // Check that the available copy isn't clobbered by any regmasks between
+ // itself and the destination.
+ for (const MachineInstr &MI :
+ make_range(AvailCopy->getIterator(), DestCopy.getIterator()))
+ for (const MachineOperand &MO : MI.operands())
+ if (MO.isRegMask())
+ if (MO.clobbersPhysReg(AvailSrc) || MO.clobbersPhysReg(AvailDef))
+ return nullptr;
+
+ return AvailCopy;
+ }
+
+ // Find last COPY that defines Reg before Current MachineInstr.
+ MachineInstr *findLastSeenDefInCopy(const MachineInstr &Current,
+ MCRegister Reg,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII,
+ bool UseCopyInstr) {
+ MCRegUnit RU = *TRI.regunits(Reg).begin();
+ auto CI = Copies.find(RU);
+ if (CI == Copies.end() || !CI->second.Avail)
+ return nullptr;
+
+ MachineInstr *DefCopy = CI->second.MI;
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*DefCopy, TII, UseCopyInstr);
+ Register Def = CopyOperands->Destination->getReg();
+ if (!TRI.isSubRegisterEq(Def, Reg))
+ return nullptr;
+
+ for (const MachineInstr &MI :
+ make_range(static_cast<const MachineInstr *>(DefCopy)->getIterator(),
+ Current.getIterator()))
+ for (const MachineOperand &MO : MI.operands())
+ if (MO.isRegMask())
+ if (MO.clobbersPhysReg(Def)) {
+ LLVM_DEBUG(dbgs() << "MCP: Removed tracking of "
+ << printReg(Def, &TRI) << "\n");
+ return nullptr;
+ }
+
+ return DefCopy;
+ }
+
+ // Find last COPY that uses Reg.
+ MachineInstr *findLastSeenUseInCopy(MCRegister Reg,
+ const TargetRegisterInfo &TRI) {
+ MCRegUnit RU = *TRI.regunits(Reg).begin();
+ auto CI = Copies.find(RU);
+ if (CI == Copies.end())
+ return nullptr;
+ return CI->second.LastSeenUseInCopy;
+ }
+
+ void clear() {
+ Copies.clear();
+ }
+};
+
+class MachineCopyPropagation : public MachineFunctionPass {
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+
+  // If true, use TargetInstrInfo::isCopyInstr to recognize copy-like
+  // instructions; otherwise only plain COPY instructions are handled.
+ bool UseCopyInstr;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ MachineCopyPropagation(bool CopyInstr = false)
+ : MachineFunctionPass(ID), UseCopyInstr(CopyInstr || MCPUseCopyInstr) {
+ initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ typedef enum { DebugUse = false, RegularUse = true } DebugType;
+
+ void ReadRegister(MCRegister Reg, MachineInstr &Reader, DebugType DT);
+ void ForwardCopyPropagateBlock(MachineBasicBlock &MBB);
+ void BackwardCopyPropagateBlock(MachineBasicBlock &MBB);
+ void EliminateSpillageCopies(MachineBasicBlock &MBB);
+ bool eraseIfRedundant(MachineInstr &Copy, MCRegister Src, MCRegister Def);
+ void forwardUses(MachineInstr &MI);
+ void propagateDefs(MachineInstr &MI);
+ bool isForwardableRegClassCopy(const MachineInstr &Copy,
+ const MachineInstr &UseI, unsigned UseIdx);
+ bool isBackwardPropagatableRegClassCopy(const MachineInstr &Copy,
+ const MachineInstr &UseI,
+ unsigned UseIdx);
+ bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use);
+ bool hasOverlappingMultipleDef(const MachineInstr &MI,
+ const MachineOperand &MODef, Register Def);
+
+ /// Candidates for deletion.
+ SmallSetVector<MachineInstr *, 8> MaybeDeadCopies;
+
+ /// Multimap tracking debug users in current BB
+ DenseMap<MachineInstr *, SmallSet<MachineInstr *, 2>> CopyDbgUsers;
+
+ CopyTracker Tracker;
+
+ bool Changed = false;
+};
+
+} // end anonymous namespace
+
+char MachineCopyPropagation::ID = 0;
+
+char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;
+
+INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE,
+ "Machine Copy Propagation Pass", false, false)
+
+void MachineCopyPropagation::ReadRegister(MCRegister Reg, MachineInstr &Reader,
+ DebugType DT) {
+ // If 'Reg' is defined by a copy, the copy is no longer a candidate
+ // for elimination. If a copy is "read" by a debug user, record the user
+ // for propagation.
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ if (MachineInstr *Copy = Tracker.findCopyForUnit(Unit, *TRI)) {
+ if (DT == RegularUse) {
+ LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: "; Copy->dump());
+ MaybeDeadCopies.remove(Copy);
+ } else {
+ CopyDbgUsers[Copy].insert(&Reader);
+ }
+ }
+ }
+}
+
+/// Return true if \p PreviousCopy did copy register \p Src to register \p Def.
+/// This fact may have been obscured by sub register usage or may not be true at
+/// all even though Src and Def are subregisters of the registers used in
+/// PreviousCopy. e.g.
+/// isNopCopy("ecx = COPY eax", AX, CX) == true
+/// isNopCopy("ecx = COPY eax", AH, CL) == false
+static bool isNopCopy(const MachineInstr &PreviousCopy, MCRegister Src,
+ MCRegister Def, const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII, bool UseCopyInstr) {
+
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(PreviousCopy, *TII, UseCopyInstr);
+ MCRegister PreviousSrc = CopyOperands->Source->getReg().asMCReg();
+ MCRegister PreviousDef = CopyOperands->Destination->getReg().asMCReg();
+ if (Src == PreviousSrc && Def == PreviousDef)
+ return true;
+ if (!TRI->isSubRegister(PreviousSrc, Src))
+ return false;
+ unsigned SubIdx = TRI->getSubRegIndex(PreviousSrc, Src);
+ return SubIdx == TRI->getSubRegIndex(PreviousDef, Def);
+}
+
+/// Remove instruction \p Copy if there exists a previous copy that copies the
+/// register \p Src to the register \p Def. This may happen indirectly by
+/// copying the super registers.
+bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy,
+ MCRegister Src, MCRegister Def) {
+  // Avoid eliminating a copy from/to a reserved register as we cannot predict
+ // the value (Example: The sparc zero register is writable but stays zero).
+ if (MRI->isReserved(Src) || MRI->isReserved(Def))
+ return false;
+
+ // Search for an existing copy.
+ MachineInstr *PrevCopy =
+ Tracker.findAvailCopy(Copy, Def, *TRI, *TII, UseCopyInstr);
+ if (!PrevCopy)
+ return false;
+
+ auto PrevCopyOperands = isCopyInstr(*PrevCopy, *TII, UseCopyInstr);
+ // Check that the existing copy uses the correct sub registers.
+ if (PrevCopyOperands->Destination->isDead())
+ return false;
+ if (!isNopCopy(*PrevCopy, Src, Def, TRI, TII, UseCopyInstr))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; Copy.dump());
+
+ // Copy was redundantly redefining either Src or Def. Remove earlier kill
+ // flags between Copy and PrevCopy because the value will be reused now.
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(Copy, *TII, UseCopyInstr);
+ assert(CopyOperands);
+
+ Register CopyDef = CopyOperands->Destination->getReg();
+ assert(CopyDef == Src || CopyDef == Def);
+ for (MachineInstr &MI :
+ make_range(PrevCopy->getIterator(), Copy.getIterator()))
+ MI.clearRegisterKills(CopyDef, TRI);
+
+ // Clear undef flag from remaining copy if needed.
+ if (!CopyOperands->Source->isUndef()) {
+ PrevCopy->getOperand(PrevCopyOperands->Source->getOperandNo())
+ .setIsUndef(false);
+ }
+
+ Copy.eraseFromParent();
+ Changed = true;
+ ++NumDeletes;
+ return true;
+}
+
+bool MachineCopyPropagation::isBackwardPropagatableRegClassCopy(
+ const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx) {
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(Copy, *TII, UseCopyInstr);
+ Register Def = CopyOperands->Destination->getReg();
+
+ if (const TargetRegisterClass *URC =
+ UseI.getRegClassConstraint(UseIdx, TII, TRI))
+ return URC->contains(Def);
+
+ // We don't process further if UseI is a COPY, since forward copy propagation
+ // should handle that.
+ return false;
+}
+
+/// Decide whether we should forward the source of \param Copy to its use in
+/// \param UseI, based on the physical register class constraints of the
+/// opcode and on avoiding the introduction of more cross-class COPYs.
+bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy,
+ const MachineInstr &UseI,
+ unsigned UseIdx) {
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(Copy, *TII, UseCopyInstr);
+ Register CopySrcReg = CopyOperands->Source->getReg();
+
+ // If the new register meets the opcode register constraints, then allow
+ // forwarding.
+ if (const TargetRegisterClass *URC =
+ UseI.getRegClassConstraint(UseIdx, TII, TRI))
+ return URC->contains(CopySrcReg);
+
+ auto UseICopyOperands = isCopyInstr(UseI, *TII, UseCopyInstr);
+ if (!UseICopyOperands)
+ return false;
+
+ /// COPYs don't have register class constraints, so if the user instruction
+ /// is a COPY, we just try to avoid introducing additional cross-class
+ /// COPYs. For example:
+ ///
+ /// RegClassA = COPY RegClassB // Copy parameter
+ /// ...
+ /// RegClassB = COPY RegClassA // UseI parameter
+ ///
+ /// which after forwarding becomes
+ ///
+ /// RegClassA = COPY RegClassB
+ /// ...
+ /// RegClassB = COPY RegClassB
+ ///
+ /// so we have reduced the number of cross-class COPYs and potentially
+ /// introduced a nop COPY that can be removed.
+
+ // Allow forwarding if src and dst belong to any common class, so long as they
+ // don't belong to any (possibly smaller) common class that requires copies to
+ // go via a different class.
+ Register UseDstReg = UseICopyOperands->Destination->getReg();
+ bool Found = false;
+ bool IsCrossClass = false;
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
+ if (RC->contains(CopySrcReg) && RC->contains(UseDstReg)) {
+ Found = true;
+ if (TRI->getCrossCopyRegClass(RC) != RC) {
+ IsCrossClass = true;
+ break;
+ }
+ }
+ }
+ if (!Found)
+ return false;
+ if (!IsCrossClass)
+ return true;
+ // The forwarded copy would be cross-class. Only do this if the original copy
+ // was also cross-class.
+ Register CopyDstReg = CopyOperands->Destination->getReg();
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
+ if (RC->contains(CopySrcReg) && RC->contains(CopyDstReg) &&
+ TRI->getCrossCopyRegClass(RC) != RC)
+ return true;
+ }
+ return false;
+}
+
+/// Check that \p MI does not have implicit uses that overlap with its \p Use
+/// operand (the register being replaced), since these can sometimes be
+/// implicitly tied to other operands. For example, on AMDGPU:
+///
+/// V_MOVRELS_B32_e32 %VGPR2, %M0<imp-use>, %EXEC<imp-use>, %VGPR2_VGPR3_VGPR4_VGPR5<imp-use>
+///
+/// the %VGPR2 is implicitly tied to the larger reg operand, but we have no
+/// way of knowing we need to update the latter when updating the former.
+bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI,
+ const MachineOperand &Use) {
+ for (const MachineOperand &MIUse : MI.uses())
+ if (&MIUse != &Use && MIUse.isReg() && MIUse.isImplicit() &&
+ MIUse.isUse() && TRI->regsOverlap(Use.getReg(), MIUse.getReg()))
+ return true;
+
+ return false;
+}
+
+/// For an MI that has multiple definitions, check whether \p MI has
+/// a definition that overlaps with another of its definitions.
+/// For example, on ARM: umull r9, r9, lr, r0
+/// The umull instruction is unpredictable unless RdHi and RdLo are different.
+bool MachineCopyPropagation::hasOverlappingMultipleDef(
+ const MachineInstr &MI, const MachineOperand &MODef, Register Def) {
+ for (const MachineOperand &MIDef : MI.defs()) {
+ if ((&MIDef != &MODef) && MIDef.isReg() &&
+ TRI->regsOverlap(Def, MIDef.getReg()))
+ return true;
+ }
+
+ return false;
+}
+
+/// Look for available copies whose destination register is used by \p MI and
+/// replace the use in \p MI with the copy's source register.
+void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
+ if (!Tracker.hasAnyCopies())
+ return;
+
+ // Look for non-tied explicit vreg uses that have an active COPY
+ // instruction that defines the physical register allocated to them.
+ // Replace the vreg with the source of the active COPY.
+ for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx < OpEnd;
+ ++OpIdx) {
+ MachineOperand &MOUse = MI.getOperand(OpIdx);
+    // Don't forward into undef use operands since doing so can cause problems
+    // with the machine verifier: it doesn't treat undef reads as reads, so we
+    // could end up with a live range that ends on an undef read, leading to
+    // an error that the live range doesn't end on a read of the live range
+    // register.
+ if (!MOUse.isReg() || MOUse.isTied() || MOUse.isUndef() || MOUse.isDef() ||
+ MOUse.isImplicit())
+ continue;
+
+ if (!MOUse.getReg())
+ continue;
+
+ // Check that the register is marked 'renamable' so we know it is safe to
+ // rename it without violating any constraints that aren't expressed in the
+ // IR (e.g. ABI or opcode requirements).
+ if (!MOUse.isRenamable())
+ continue;
+
+ MachineInstr *Copy = Tracker.findAvailCopy(MI, MOUse.getReg().asMCReg(),
+ *TRI, *TII, UseCopyInstr);
+ if (!Copy)
+ continue;
+
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*Copy, *TII, UseCopyInstr);
+ Register CopyDstReg = CopyOperands->Destination->getReg();
+ const MachineOperand &CopySrc = *CopyOperands->Source;
+ Register CopySrcReg = CopySrc.getReg();
+
+ Register ForwardedReg = CopySrcReg;
+ // MI might use a sub-register of the Copy destination, in which case the
+ // forwarded register is the matching sub-register of the Copy source.
+ if (MOUse.getReg() != CopyDstReg) {
+ unsigned SubRegIdx = TRI->getSubRegIndex(CopyDstReg, MOUse.getReg());
+ assert(SubRegIdx &&
+ "MI source is not a sub-register of Copy destination");
+ ForwardedReg = TRI->getSubReg(CopySrcReg, SubRegIdx);
+ if (!ForwardedReg) {
+ LLVM_DEBUG(dbgs() << "MCP: Copy source does not have sub-register "
+ << TRI->getSubRegIndexName(SubRegIdx) << '\n');
+ continue;
+ }
+ }
+
+ // Don't forward COPYs of reserved regs unless they are constant.
+ if (MRI->isReserved(CopySrcReg) && !MRI->isConstantPhysReg(CopySrcReg))
+ continue;
+
+ if (!isForwardableRegClassCopy(*Copy, MI, OpIdx))
+ continue;
+
+ if (hasImplicitOverlap(MI, MOUse))
+ continue;
+
+ // Check that the instruction is not a copy that partially overwrites the
+ // original copy source that we are about to use. The tracker mechanism
+ // cannot cope with that.
+ if (isCopyInstr(MI, *TII, UseCopyInstr) &&
+ MI.modifiesRegister(CopySrcReg, TRI) &&
+ !MI.definesRegister(CopySrcReg)) {
+ LLVM_DEBUG(dbgs() << "MCP: Copy source overlap with dest in " << MI);
+ continue;
+ }
+
+ if (!DebugCounter::shouldExecute(FwdCounter)) {
+ LLVM_DEBUG(dbgs() << "MCP: Skipping forwarding due to debug counter:\n "
+ << MI);
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MOUse.getReg(), TRI)
+ << "\n with " << printReg(ForwardedReg, TRI)
+ << "\n in " << MI << " from " << *Copy);
+
+ MOUse.setReg(ForwardedReg);
+
+ if (!CopySrc.isRenamable())
+ MOUse.setIsRenamable(false);
+ MOUse.setIsUndef(CopySrc.isUndef());
+
+ LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
+
+ // Clear kill markers that may have been invalidated.
+ for (MachineInstr &KMI :
+ make_range(Copy->getIterator(), std::next(MI.getIterator())))
+ KMI.clearRegisterKills(CopySrcReg, TRI);
+
+ ++NumCopyForwards;
+ Changed = true;
+ }
+}
+
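+/// Forward-propagate copies within \p MBB: rewrite uses of copy destinations
+/// to use the copy sources instead, and erase copies that become provably
+/// redundant or dead.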
+void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) {
+ LLVM_DEBUG(dbgs() << "MCP: ForwardCopyPropagateBlock " << MBB.getName()
+ << "\n");
+
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ // Analyze copies (which don't overlap themselves).
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(MI, *TII, UseCopyInstr);
+ if (CopyOperands) {
+
+ Register RegSrc = CopyOperands->Source->getReg();
+ Register RegDef = CopyOperands->Destination->getReg();
+
+ if (!TRI->regsOverlap(RegDef, RegSrc)) {
+ assert(RegDef.isPhysical() && RegSrc.isPhysical() &&
+ "MachineCopyPropagation should be run after register allocation!");
+
+ MCRegister Def = RegDef.asMCReg();
+ MCRegister Src = RegSrc.asMCReg();
+
+ // The two copies cancel out and the source of the first copy
+ // hasn't been overridden, eliminate the second one. e.g.
+ // %ecx = COPY %eax
+ // ... nothing clobbered eax.
+ // %eax = COPY %ecx
+ // =>
+ // %ecx = COPY %eax
+ //
+ // or
+ //
+ // %ecx = COPY %eax
+ // ... nothing clobbered eax.
+ // %ecx = COPY %eax
+ // =>
+ // %ecx = COPY %eax
+ if (eraseIfRedundant(MI, Def, Src) || eraseIfRedundant(MI, Src, Def))
+ continue;
+
+ forwardUses(MI);
+
+ // Src may have been changed by forwardUses()
+ CopyOperands = isCopyInstr(MI, *TII, UseCopyInstr);
+ Src = CopyOperands->Source->getReg().asMCReg();
+
+ // If Src is defined by a previous copy, the previous copy cannot be
+ // eliminated.
+ ReadRegister(Src, MI, RegularUse);
+ for (const MachineOperand &MO : MI.implicit_operands()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+ MCRegister Reg = MO.getReg().asMCReg();
+ if (!Reg)
+ continue;
+ ReadRegister(Reg, MI, RegularUse);
+ }
+
+ LLVM_DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI.dump());
+
+ // Copy is now a candidate for deletion.
+ if (!MRI->isReserved(Def))
+ MaybeDeadCopies.insert(&MI);
+
+        // If 'Def' was previously the source of another copy, then that
+        // earlier copy's source is no longer available. e.g.
+ // %xmm9 = copy %xmm2
+ // ...
+ // %xmm2 = copy %xmm0
+ // ...
+ // %xmm2 = copy %xmm9
+ Tracker.clobberRegister(Def, *TRI, *TII, UseCopyInstr);
+ for (const MachineOperand &MO : MI.implicit_operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ MCRegister Reg = MO.getReg().asMCReg();
+ if (!Reg)
+ continue;
+ Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
+ }
+
+ Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
+
+ continue;
+ }
+ }
+
+ // Clobber any earlyclobber regs first.
+ for (const MachineOperand &MO : MI.operands())
+ if (MO.isReg() && MO.isEarlyClobber()) {
+ MCRegister Reg = MO.getReg().asMCReg();
+ // If we have a tied earlyclobber, that means it is also read by this
+ // instruction, so we need to make sure we don't remove it as dead
+ // later.
+ if (MO.isTied())
+ ReadRegister(Reg, MI, RegularUse);
+ Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
+ }
+
+ forwardUses(MI);
+
+ // Not a copy.
+ SmallVector<Register, 2> Defs;
+ const MachineOperand *RegMask = nullptr;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask())
+ RegMask = &MO;
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ assert(!Reg.isVirtual() &&
+ "MachineCopyPropagation should be run after register allocation!");
+
+ if (MO.isDef() && !MO.isEarlyClobber()) {
+ Defs.push_back(Reg.asMCReg());
+ continue;
+ } else if (MO.readsReg())
+ ReadRegister(Reg.asMCReg(), MI, MO.isDebug() ? DebugUse : RegularUse);
+ }
+
+ // The instruction has a register mask operand which means that it clobbers
+ // a large set of registers. Treat clobbered registers the same way as
+ // defined registers.
+ if (RegMask) {
+ // Erase any MaybeDeadCopies whose destination register is clobbered.
+ for (SmallSetVector<MachineInstr *, 8>::iterator DI =
+ MaybeDeadCopies.begin();
+ DI != MaybeDeadCopies.end();) {
+ MachineInstr *MaybeDead = *DI;
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*MaybeDead, *TII, UseCopyInstr);
+ MCRegister Reg = CopyOperands->Destination->getReg().asMCReg();
+ assert(!MRI->isReserved(Reg));
+
+ if (!RegMask->clobbersPhysReg(Reg)) {
+ ++DI;
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: ";
+ MaybeDead->dump());
+
+ // Make sure we invalidate any entries in the copy maps before erasing
+ // the instruction.
+ Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
+
+ // erase() will return the next valid iterator pointing to the next
+ // element after the erased one.
+ DI = MaybeDeadCopies.erase(DI);
+ MaybeDead->eraseFromParent();
+ Changed = true;
+ ++NumDeletes;
+ }
+ }
+
+ // Any previous copy definition or reading the Defs is no longer available.
+ for (MCRegister Reg : Defs)
+ Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
+ }
+
+ // If MBB doesn't have successors, delete the copies whose defs are not used.
+  // If MBB does have successors, then conservatively assume the defs are
+  // live-out since we don't want to trust live-in lists.
+ if (MBB.succ_empty()) {
+ for (MachineInstr *MaybeDead : MaybeDeadCopies) {
+ LLVM_DEBUG(dbgs() << "MCP: Removing copy due to no live-out succ: ";
+ MaybeDead->dump());
+
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*MaybeDead, *TII, UseCopyInstr);
+ assert(CopyOperands);
+
+ Register SrcReg = CopyOperands->Source->getReg();
+ Register DestReg = CopyOperands->Destination->getReg();
+ assert(!MRI->isReserved(DestReg));
+
+ // Update matching debug values, if any.
+ SmallVector<MachineInstr *> MaybeDeadDbgUsers(
+ CopyDbgUsers[MaybeDead].begin(), CopyDbgUsers[MaybeDead].end());
+ MRI->updateDbgUsersToReg(DestReg.asMCReg(), SrcReg.asMCReg(),
+ MaybeDeadDbgUsers);
+
+ MaybeDead->eraseFromParent();
+ Changed = true;
+ ++NumDeletes;
+ }
+ }
+
+ MaybeDeadCopies.clear();
+ CopyDbgUsers.clear();
+ Tracker.clear();
+}
+
+static bool isBackwardPropagatableCopy(const DestSourcePair &CopyOperands,
+ const MachineRegisterInfo &MRI,
+ const TargetInstrInfo &TII) {
+ Register Def = CopyOperands.Destination->getReg();
+ Register Src = CopyOperands.Source->getReg();
+
+ if (!Def || !Src)
+ return false;
+
+ if (MRI.isReserved(Def) || MRI.isReserved(Src))
+ return false;
+
+ return CopyOperands.Source->isRenamable() && CopyOperands.Source->isKill();
+}
+
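+/// For each register defined by \p MI that is later copied away by a tracked
+/// COPY, rewrite \p MI to define the COPY's destination directly, so that the
+/// COPY becomes a candidate for deletion.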
+void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
+ if (!Tracker.hasAnyCopies())
+ return;
+
+ for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd;
+ ++OpIdx) {
+ MachineOperand &MODef = MI.getOperand(OpIdx);
+
+ if (!MODef.isReg() || MODef.isUse())
+ continue;
+
+ // Ignore non-trivial cases.
+ if (MODef.isTied() || MODef.isUndef() || MODef.isImplicit())
+ continue;
+
+ if (!MODef.getReg())
+ continue;
+
+    // We only handle the case where the register comes from a vreg.
+ if (!MODef.isRenamable())
+ continue;
+
+ MachineInstr *Copy = Tracker.findAvailBackwardCopy(
+ MI, MODef.getReg().asMCReg(), *TRI, *TII, UseCopyInstr);
+ if (!Copy)
+ continue;
+
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*Copy, *TII, UseCopyInstr);
+ Register Def = CopyOperands->Destination->getReg();
+ Register Src = CopyOperands->Source->getReg();
+
+ if (MODef.getReg() != Src)
+ continue;
+
+ if (!isBackwardPropagatableRegClassCopy(*Copy, MI, OpIdx))
+ continue;
+
+ if (hasImplicitOverlap(MI, MODef))
+ continue;
+
+ if (hasOverlappingMultipleDef(MI, MODef, Def))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI)
+ << "\n with " << printReg(Def, TRI) << "\n in "
+ << MI << " from " << *Copy);
+
+ MODef.setReg(Def);
+ MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
+
+ LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
+ MaybeDeadCopies.insert(Copy);
+ Changed = true;
+ ++NumCopyBackwardPropagated;
+ }
+}
+
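+/// Walk \p MBB bottom-up, tracking backward-propagatable copies and invoking
+/// propagateDefs() on the remaining instructions; copies made redundant this
+/// way are erased once the whole block has been processed.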
+void MachineCopyPropagation::BackwardCopyPropagateBlock(
+ MachineBasicBlock &MBB) {
+ LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName()
+ << "\n");
+
+ for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
+ // Ignore non-trivial COPYs.
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(MI, *TII, UseCopyInstr);
+ if (CopyOperands && MI.getNumOperands() == 2) {
+ Register DefReg = CopyOperands->Destination->getReg();
+ Register SrcReg = CopyOperands->Source->getReg();
+
+ if (!TRI->regsOverlap(DefReg, SrcReg)) {
+ // Unlike forward cp, we don't invoke propagateDefs here,
+ // just let forward cp do COPY-to-COPY propagation.
+ if (isBackwardPropagatableCopy(*CopyOperands, *MRI, *TII)) {
+ Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII,
+ UseCopyInstr);
+ Tracker.invalidateRegister(DefReg.asMCReg(), *TRI, *TII,
+ UseCopyInstr);
+ Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
+ continue;
+ }
+ }
+ }
+
+ // Invalidate any earlyclobber regs first.
+ for (const MachineOperand &MO : MI.operands())
+ if (MO.isReg() && MO.isEarlyClobber()) {
+ MCRegister Reg = MO.getReg().asMCReg();
+ if (!Reg)
+ continue;
+ Tracker.invalidateRegister(Reg, *TRI, *TII, UseCopyInstr);
+ }
+
+ propagateDefs(MI);
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ if (!MO.getReg())
+ continue;
+
+ if (MO.isDef())
+ Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
+ UseCopyInstr);
+
+ if (MO.readsReg()) {
+ if (MO.isDebug()) {
+ // Check if the register in the debug instruction is utilized
+ // in a copy instruction, so we can update the debug info if the
+ // register is changed.
+ for (MCRegUnit Unit : TRI->regunits(MO.getReg().asMCReg())) {
+ if (auto *Copy = Tracker.findCopyDefViaUnit(Unit, *TRI)) {
+ CopyDbgUsers[Copy].insert(&MI);
+ }
+ }
+ } else {
+ Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
+ UseCopyInstr);
+ }
+ }
+ }
+ }
+
+ for (auto *Copy : MaybeDeadCopies) {
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(*Copy, *TII, UseCopyInstr);
+ Register Src = CopyOperands->Source->getReg();
+ Register Def = CopyOperands->Destination->getReg();
+ SmallVector<MachineInstr *> MaybeDeadDbgUsers(CopyDbgUsers[Copy].begin(),
+ CopyDbgUsers[Copy].end());
+
+ MRI->updateDbgUsersToReg(Src.asMCReg(), Def.asMCReg(), MaybeDeadDbgUsers);
+ Copy->eraseFromParent();
+ ++NumDeletes;
+ }
+
+ MaybeDeadCopies.clear();
+ CopyDbgUsers.clear();
+ Tracker.clear();
+}
+
+static void LLVM_ATTRIBUTE_UNUSED printSpillReloadChain(
+ DenseMap<MachineInstr *, SmallVector<MachineInstr *>> &SpillChain,
+ DenseMap<MachineInstr *, SmallVector<MachineInstr *>> &ReloadChain,
+ MachineInstr *Leader) {
+ auto &SC = SpillChain[Leader];
+ auto &RC = ReloadChain[Leader];
+ for (auto I = SC.rbegin(), E = SC.rend(); I != E; ++I)
+ (*I)->dump();
+ for (MachineInstr *MI : RC)
+ MI->dump();
+}
+
+// Remove spill-reload like copy chains. For example
+// r0 = COPY r1
+// r1 = COPY r2
+// r2 = COPY r3
+// r3 = COPY r4
+// <def-use r4>
+// r4 = COPY r3
+// r3 = COPY r2
+// r2 = COPY r1
+// r1 = COPY r0
+// will be folded into
+// r0 = COPY r1
+// r1 = COPY r4
+// <def-use r4>
+// r4 = COPY r1
+// r1 = COPY r0
+// TODO: Currently we don't track usage of r0 outside the chain, so we
+// conservatively keep its value as it was before the rewrite.
+//
+// The algorithm is trying to keep
+// property#1: No Def of a spill COPY in the chain is used or defined until
+// the paired reload COPY in the chain uses the Def.
+//
+// property#2: No Source of a COPY in the chain is used or defined until the
+// next COPY in the chain defines the Source, except for the innermost
+// spill-reload pair.
+//
+// The algorithm checks every COPY inside the MBB, assumes it is a reload
+// COPY, and then tries to find the paired spill COPY by searching backward
+// for the COPY that defines the Src of the reload COPY. If such a pair is
+// found, it is added either to an existing chain or to a new chain, depending
+// on whether the last available COPY uses the Def of the reload COPY.
+// Implementation note: we use CopyTracker::findLastSeenDefInCopy(Reg, ...) to
+// find the last COPY that defines Reg, and
+// CopyTracker::findLastSeenUseInCopy(Reg, ...) to find the last COPY that
+// uses Reg. When we encounter a non-COPY instruction, we check the registers
+// in its operands. If such a Reg is defined by a COPY, we untrack it via
+// CopyTracker::clobberRegister(Reg, ...).
+void MachineCopyPropagation::EliminateSpillageCopies(MachineBasicBlock &MBB) {
+  // ChainLeader maps MI inside a spill-reload chain to its innermost reload
+  // COPY. Thus we can track if a MI belongs to an existing spill-reload chain.
+ DenseMap<MachineInstr *, MachineInstr *> ChainLeader;
+ // SpillChain maps innermost reload COPY of a spill-reload chain to a sequence
+ // of COPYs that forms spills of a spill-reload chain.
+ // ReloadChain maps innermost reload COPY of a spill-reload chain to a
+ // sequence of COPYs that forms reloads of a spill-reload chain.
+ DenseMap<MachineInstr *, SmallVector<MachineInstr *>> SpillChain, ReloadChain;
+  // If a COPY's Source has a use or def before the next COPY defines the
+  // Source, we put the COPY in this set to keep property#2.
+ DenseSet<const MachineInstr *> CopySourceInvalid;
+
+ auto TryFoldSpillageCopies =
+ [&, this](const SmallVectorImpl<MachineInstr *> &SC,
+ const SmallVectorImpl<MachineInstr *> &RC) {
+ assert(SC.size() == RC.size() && "Spill-reload should be paired");
+
+        // We need at least 3 pairs of copies for the transformation to apply,
+        // because the outermost pair cannot be removed since we don't recolor
+        // outside of the chain, and we need at least one temporary spill slot
+        // to shorten the chain. If we only have a chain of two pairs, we
+        // already have the shortest sequence this code can handle: the
+        // outermost pair for the temporary spill slot, and the pair that uses
+        // that temporary spill slot for the other end of the chain.
+        // TODO: We might be able to simplify to one spill-reload pair by
+        // collecting more information about the outermost COPY.
+ if (SC.size() <= 2)
+ return;
+
+        // If property#2 is violated, we don't fold the chain.
+ for (const MachineInstr *Spill : make_range(SC.begin() + 1, SC.end()))
+ if (CopySourceInvalid.count(Spill))
+ return;
+
+ for (const MachineInstr *Reload : make_range(RC.begin(), RC.end() - 1))
+ if (CopySourceInvalid.count(Reload))
+ return;
+
+ auto CheckCopyConstraint = [this](Register Def, Register Src) {
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
+ if (RC->contains(Def) && RC->contains(Src))
+ return true;
+ }
+ return false;
+ };
+
+ auto UpdateReg = [](MachineInstr *MI, const MachineOperand *Old,
+ const MachineOperand *New) {
+ for (MachineOperand &MO : MI->operands()) {
+ if (&MO == Old)
+ MO.setReg(New->getReg());
+ }
+ };
+
+ std::optional<DestSourcePair> InnerMostSpillCopy =
+ isCopyInstr(*SC[0], *TII, UseCopyInstr);
+ std::optional<DestSourcePair> OuterMostSpillCopy =
+ isCopyInstr(*SC.back(), *TII, UseCopyInstr);
+ std::optional<DestSourcePair> InnerMostReloadCopy =
+ isCopyInstr(*RC[0], *TII, UseCopyInstr);
+ std::optional<DestSourcePair> OuterMostReloadCopy =
+ isCopyInstr(*RC.back(), *TII, UseCopyInstr);
+ if (!CheckCopyConstraint(OuterMostSpillCopy->Source->getReg(),
+ InnerMostSpillCopy->Source->getReg()) ||
+ !CheckCopyConstraint(InnerMostReloadCopy->Destination->getReg(),
+ OuterMostReloadCopy->Destination->getReg()))
+ return;
+
+ SpillageChainsLength += SC.size() + RC.size();
+ NumSpillageChains += 1;
+ UpdateReg(SC[0], InnerMostSpillCopy->Destination,
+ OuterMostSpillCopy->Source);
+ UpdateReg(RC[0], InnerMostReloadCopy->Source,
+ OuterMostReloadCopy->Destination);
+
+ for (size_t I = 1; I < SC.size() - 1; ++I) {
+ SC[I]->eraseFromParent();
+ RC[I]->eraseFromParent();
+ NumDeletes += 2;
+ }
+ };
+
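+  // A COPY takes part in chain folding only if it has no implicit operands,
+  // its source and destination registers do not overlap, and both operands
+  // are renamable.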
+ auto IsFoldableCopy = [this](const MachineInstr &MaybeCopy) {
+ if (MaybeCopy.getNumImplicitOperands() > 0)
+ return false;
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(MaybeCopy, *TII, UseCopyInstr);
+ if (!CopyOperands)
+ return false;
+ Register Src = CopyOperands->Source->getReg();
+ Register Def = CopyOperands->Destination->getReg();
+ return Src && Def && !TRI->regsOverlap(Src, Def) &&
+ CopyOperands->Source->isRenamable() &&
+ CopyOperands->Destination->isRenamable();
+ };
+
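+  // Spill and Reload form a pair when the reload copies back exactly what the
+  // spill copied away, i.e. Spill is "b = COPY a" and Reload is "a = COPY b".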
+ auto IsSpillReloadPair = [&, this](const MachineInstr &Spill,
+ const MachineInstr &Reload) {
+ if (!IsFoldableCopy(Spill) || !IsFoldableCopy(Reload))
+ return false;
+ std::optional<DestSourcePair> SpillCopy =
+ isCopyInstr(Spill, *TII, UseCopyInstr);
+ std::optional<DestSourcePair> ReloadCopy =
+ isCopyInstr(Reload, *TII, UseCopyInstr);
+ if (!SpillCopy || !ReloadCopy)
+ return false;
+ return SpillCopy->Source->getReg() == ReloadCopy->Destination->getReg() &&
+ SpillCopy->Destination->getReg() == ReloadCopy->Source->getReg();
+ };
+
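+  // Current extends the chain ending at Prev when Prev's source register is
+  // Current's destination register.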
+ auto IsChainedCopy = [&, this](const MachineInstr &Prev,
+ const MachineInstr &Current) {
+ if (!IsFoldableCopy(Prev) || !IsFoldableCopy(Current))
+ return false;
+ std::optional<DestSourcePair> PrevCopy =
+ isCopyInstr(Prev, *TII, UseCopyInstr);
+ std::optional<DestSourcePair> CurrentCopy =
+ isCopyInstr(Current, *TII, UseCopyInstr);
+ if (!PrevCopy || !CurrentCopy)
+ return false;
+ return PrevCopy->Source->getReg() == CurrentCopy->Destination->getReg();
+ };
+
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ std::optional<DestSourcePair> CopyOperands =
+ isCopyInstr(MI, *TII, UseCopyInstr);
+
+ // Update track information via non-copy instruction.
+ SmallSet<Register, 8> RegsToClobber;
+ if (!CopyOperands) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ MachineInstr *LastUseCopy =
+ Tracker.findLastSeenUseInCopy(Reg.asMCReg(), *TRI);
+ if (LastUseCopy) {
+ LLVM_DEBUG(dbgs() << "MCP: Copy source of\n");
+ LLVM_DEBUG(LastUseCopy->dump());
+ LLVM_DEBUG(dbgs() << "might be invalidated by\n");
+ LLVM_DEBUG(MI.dump());
+ CopySourceInvalid.insert(LastUseCopy);
+ }
+        // Note that Tracker.clobberRegister(Reg, ...) removes the tracking of
+        // Reg, i.e., the COPY that defines Reg is removed from the mapping
+        // and COPYs that use Reg are marked unavailable.
+        // We don't invoke CopyTracker::clobberRegister(Reg, ...) if Reg is
+        // not defined by a previous COPY, since we don't want to make COPYs
+        // that use Reg unavailable.
+ if (Tracker.findLastSeenDefInCopy(MI, Reg.asMCReg(), *TRI, *TII,
+ UseCopyInstr))
+ // Thus we can keep the property#1.
+ RegsToClobber.insert(Reg);
+ }
+ for (Register Reg : RegsToClobber) {
+ Tracker.clobberRegister(Reg, *TRI, *TII, UseCopyInstr);
+ LLVM_DEBUG(dbgs() << "MCP: Removed tracking of " << printReg(Reg, TRI)
+ << "\n");
+ }
+ continue;
+ }
+
+ Register Src = CopyOperands->Source->getReg();
+ Register Def = CopyOperands->Destination->getReg();
+ // Check if we can find a pair spill-reload copy.
+ LLVM_DEBUG(dbgs() << "MCP: Searching paired spill for reload: ");
+ LLVM_DEBUG(MI.dump());
+ MachineInstr *MaybeSpill =
+        Tracker.findLastSeenDefInCopy(MI, Src.asMCReg(), *TRI, *TII,
+                                      UseCopyInstr);
+ bool MaybeSpillIsChained = ChainLeader.count(MaybeSpill);
+ if (!MaybeSpillIsChained && MaybeSpill &&
+ IsSpillReloadPair(*MaybeSpill, MI)) {
+ // Check if we already have an existing chain. Now we have a
+ // spill-reload pair.
+ // L2: r2 = COPY r3
+ // L5: r3 = COPY r2
+ // Looking for a valid COPY before L5 which uses r3.
+      // There can be several cases.
+      // Case #1:
+      // No COPY is found, which can happen if r3 is defined or used between
+      // (L2, L5); we create a new chain for L2 and L5.
+ // Case #2:
+ // L2: r2 = COPY r3
+ // L5: r3 = COPY r2
+      // Such a COPY is found and it is L2; we create a new chain for L2 and
+      // L5.
+ // Case #3:
+ // L2: r2 = COPY r3
+ // L3: r1 = COPY r3
+ // L5: r3 = COPY r2
+ // we create a new chain for L2 and L5.
+ // Case #4:
+ // L2: r2 = COPY r3
+ // L3: r1 = COPY r3
+ // L4: r3 = COPY r1
+ // L5: r3 = COPY r2
+      // Such a COPY won't be found since L4 defines r3. We create a new chain
+      // for L2 and L5.
+ // Case #5:
+ // L2: r2 = COPY r3
+ // L3: r3 = COPY r1
+ // L4: r1 = COPY r3
+ // L5: r3 = COPY r2
+      // The COPY found is L4, which belongs to an existing chain; we add L2
+      // and L5 to this chain.
+ LLVM_DEBUG(dbgs() << "MCP: Found spill: ");
+ LLVM_DEBUG(MaybeSpill->dump());
+ MachineInstr *MaybePrevReload =
+ Tracker.findLastSeenUseInCopy(Def.asMCReg(), *TRI);
+ auto Leader = ChainLeader.find(MaybePrevReload);
+ MachineInstr *L = nullptr;
+ if (Leader == ChainLeader.end() ||
+ (MaybePrevReload && !IsChainedCopy(*MaybePrevReload, MI))) {
+ L = &MI;
+ assert(!SpillChain.count(L) &&
+ "SpillChain should not have contained newly found chain");
+ } else {
+ assert(MaybePrevReload &&
+               "Finding a valid leader through nullptr should not happen");
+ L = Leader->second;
+ assert(SpillChain[L].size() > 0 &&
+ "Existing chain's length should be larger than zero");
+ }
+ assert(!ChainLeader.count(&MI) && !ChainLeader.count(MaybeSpill) &&
+ "Newly found paired spill-reload should not belong to any chain "
+ "at this point");
+ ChainLeader.insert({MaybeSpill, L});
+ ChainLeader.insert({&MI, L});
+ SpillChain[L].push_back(MaybeSpill);
+ ReloadChain[L].push_back(&MI);
+ LLVM_DEBUG(dbgs() << "MCP: Chain " << L << " now is:\n");
+ LLVM_DEBUG(printSpillReloadChain(SpillChain, ReloadChain, L));
+ } else if (MaybeSpill && !MaybeSpillIsChained) {
+      // MaybeSpill is unable to pair with MI. That is to say, adding MI
+      // makes the chain invalid.
+      // The COPY that defines Src is no longer considered a candidate for a
+      // valid chain, since we expect the Def of a spill copy not to be used
+      // by any COPY instruction until a reload copy. For example:
+ // L1: r1 = COPY r2
+ // L2: r3 = COPY r1
+ // If we later have
+ // L1: r1 = COPY r2
+ // L2: r3 = COPY r1
+ // L3: r2 = COPY r1
+ // L1 and L3 can't be a valid spill-reload pair.
+ // Thus we keep the property#1.
+ LLVM_DEBUG(dbgs() << "MCP: Not paired spill-reload:\n");
+ LLVM_DEBUG(MaybeSpill->dump());
+ LLVM_DEBUG(MI.dump());
+ Tracker.clobberRegister(Src.asMCReg(), *TRI, *TII, UseCopyInstr);
+ LLVM_DEBUG(dbgs() << "MCP: Removed tracking of " << printReg(Src, TRI)
+ << "\n");
+ }
+ Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
+ }
+
+ for (auto I = SpillChain.begin(), E = SpillChain.end(); I != E; ++I) {
+ auto &SC = I->second;
+ assert(ReloadChain.count(I->first) &&
+ "Reload chain of the same leader should exist");
+ auto &RC = ReloadChain[I->first];
+ TryFoldSpillageCopies(SC, RC);
+ }
+
+ MaybeDeadCopies.clear();
+ CopyDbgUsers.clear();
+ Tracker.clear();
+}
+
+bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ bool isSpillageCopyElimEnabled = false;
+ switch (EnableSpillageCopyElimination) {
+ case cl::BOU_UNSET:
+ isSpillageCopyElimEnabled =
+ MF.getSubtarget().enableSpillageCopyElimination();
+ break;
+ case cl::BOU_TRUE:
+ isSpillageCopyElimEnabled = true;
+ break;
+ case cl::BOU_FALSE:
+ isSpillageCopyElimEnabled = false;
+ break;
+ }
+
+ Changed = false;
+
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ MRI = &MF.getRegInfo();
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (isSpillageCopyElimEnabled)
+ EliminateSpillageCopies(MBB);
+ BackwardCopyPropagateBlock(MBB);
+ ForwardCopyPropagateBlock(MBB);
+ }
+
+ return Changed;
+}
+
+MachineFunctionPass *
+llvm::createMachineCopyPropagationPass(bool UseCopyInstr = false) {
+ return new MachineCopyPropagation(UseCopyInstr);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineCycleAnalysis.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineCycleAnalysis.cpp
new file mode 100644
index 000000000000..57f7a098ac17
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineCycleAnalysis.cpp
@@ -0,0 +1,151 @@
+//===- MachineCycleAnalysis.cpp - Compute CycleInfo for Machine IR --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/ADT/GenericCycleImpl.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAContext.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+template class llvm::GenericCycleInfo<llvm::MachineSSAContext>;
+template class llvm::GenericCycle<llvm::MachineSSAContext>;
+
+char MachineCycleInfoWrapperPass::ID = 0;
+
+MachineCycleInfoWrapperPass::MachineCycleInfoWrapperPass()
+ : MachineFunctionPass(ID) {
+ initializeMachineCycleInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(MachineCycleInfoWrapperPass, "machine-cycles",
+ "Machine Cycle Info Analysis", true, true)
+INITIALIZE_PASS_END(MachineCycleInfoWrapperPass, "machine-cycles",
+ "Machine Cycle Info Analysis", true, true)
+
+void MachineCycleInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineCycleInfoWrapperPass::runOnMachineFunction(MachineFunction &Func) {
+ CI.clear();
+
+ F = &Func;
+ CI.compute(Func);
+ return false;
+}
+
+void MachineCycleInfoWrapperPass::print(raw_ostream &OS, const Module *) const {
+ OS << "MachineCycleInfo for function: " << F->getName() << "\n";
+ CI.print(OS);
+}
+
+void MachineCycleInfoWrapperPass::releaseMemory() {
+ CI.clear();
+ F = nullptr;
+}
+
+namespace {
+class MachineCycleInfoPrinterPass : public MachineFunctionPass {
+public:
+ static char ID;
+
+ MachineCycleInfoPrinterPass();
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+} // namespace
+
+char MachineCycleInfoPrinterPass::ID = 0;
+
+MachineCycleInfoPrinterPass::MachineCycleInfoPrinterPass()
+ : MachineFunctionPass(ID) {
+ initializeMachineCycleInfoPrinterPassPass(*PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(MachineCycleInfoPrinterPass, "print-machine-cycles",
+ "Print Machine Cycle Info Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
+INITIALIZE_PASS_END(MachineCycleInfoPrinterPass, "print-machine-cycles",
+ "Print Machine Cycle Info Analysis", true, true)
+
+void MachineCycleInfoPrinterPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineCycleInfoWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineCycleInfoPrinterPass::runOnMachineFunction(MachineFunction &F) {
+ auto &CI = getAnalysis<MachineCycleInfoWrapperPass>();
+ CI.print(errs());
+ return false;
+}
+
+bool llvm::isCycleInvariant(const MachineCycle *Cycle, MachineInstr &I) {
+ MachineFunction *MF = I.getParent()->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const TargetSubtargetInfo &ST = MF->getSubtarget();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
+
+ // The instruction is cycle invariant if all of its operands are.
+ for (const MachineOperand &MO : I.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ // An instruction that uses or defines a physical register can't e.g. be
+ // hoisted, so mark this as not invariant.
+ if (Reg.isPhysical()) {
+ if (MO.isUse()) {
+ // If the physreg has no defs anywhere, it's just an ambient register
+ // and we can freely move its uses. Alternatively, if it's allocatable,
+ // it could get allocated to something with a def during allocation.
+ // However, if the physreg is known to always be caller saved/restored
+ // then this use is safe to hoist.
+ if (!MRI->isConstantPhysReg(Reg) &&
+ !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) &&
+ !TII->isIgnorableUse(MO))
+ return false;
+ // Otherwise it's safe to move.
+ continue;
+ } else if (!MO.isDead()) {
+ // A def that isn't dead can't be moved.
+ return false;
+ } else if (any_of(Cycle->getEntries(),
+ [&](const MachineBasicBlock *Block) {
+ return Block->isLiveIn(Reg);
+ })) {
+ // If the reg is live into any header of the cycle we can't hoist an
+ // instruction which would clobber it.
+ return false;
+ }
+ }
+
+ if (!MO.isUse())
+ continue;
+
+ assert(MRI->getVRegDef(Reg) && "Machine instr not mapped for this vreg?!");
+
+ // If the cycle contains the definition of an operand, then the instruction
+ // isn't cycle invariant.
+ if (Cycle->contains(MRI->getVRegDef(Reg)->getParent()))
+ return false;
+ }
+
+ // If we got this far, the instruction is cycle invariant!
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineDebugify.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineDebugify.cpp
new file mode 100644
index 000000000000..c264e199cf47
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineDebugify.cpp
@@ -0,0 +1,207 @@
+//===- MachineDebugify.cpp - Attach synthetic debug info to everything ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This pass attaches synthetic debug info to everything. It can be used
+/// to create targeted tests for debug info preservation, or test for CodeGen
+/// differences with vs. without debug info.
+///
+/// This isn't intended to have feature parity with Debugify.
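+///
+/// A rough way to exercise the pass on MIR input (assuming an llc binary with
+/// the usual -run-pass plumbing) is, for example:
+///   llc -run-pass=mir-debugify foo.mir -o -
+/// The companion mir-check-debugify pass can then verify the synthetic debug
+/// info attached here.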
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Debugify.h"
+
+#define DEBUG_TYPE "mir-debugify"
+
+using namespace llvm;
+
+namespace {
+bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI,
+ DIBuilder &DIB, Function &F) {
+ MachineFunction *MaybeMF = MMI.getMachineFunction(F);
+ if (!MaybeMF)
+ return false;
+ MachineFunction &MF = *MaybeMF;
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+ DISubprogram *SP = F.getSubprogram();
+ assert(SP && "IR Debugify just created it?");
+
+ Module &M = *F.getParent();
+ LLVMContext &Ctx = M.getContext();
+
+ unsigned NextLine = SP->getLine();
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // This will likely emit line numbers beyond the end of the imagined
+ // source function and into subsequent ones. We don't do anything about
+ // that as it doesn't really matter to the compiler where the line is in
+ // the imaginary source code.
+ MI.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP));
+ }
+ }
+
+ // Find local variables defined by debugify. No attempt is made to match up
+ // MIR-level regs to the 'correct' IR-level variables: there isn't a simple
+ // way to do that, and it isn't necessary to find interesting CodeGen bugs.
+ // Instead, simply keep track of one variable per line. Later, we can insert
+ // DBG_VALUE insts that point to these local variables. Emitting DBG_VALUEs
+ // which cover a wide range of lines can help stress the debug info passes:
+ // if we can't do that, fall back to using the local variable which precedes
+ // all the others.
+ Function *DbgValF = M.getFunction("llvm.dbg.value");
+ DbgValueInst *EarliestDVI = nullptr;
+ DenseMap<unsigned, DILocalVariable *> Line2Var;
+ DIExpression *Expr = nullptr;
+ if (DbgValF) {
+ for (const Use &U : DbgValF->uses()) {
+ auto *DVI = dyn_cast<DbgValueInst>(U.getUser());
+ if (!DVI || DVI->getFunction() != &F)
+ continue;
+ unsigned Line = DVI->getDebugLoc().getLine();
+ assert(Line != 0 && "debugify should not insert line 0 locations");
+ Line2Var[Line] = DVI->getVariable();
+ if (!EarliestDVI || Line < EarliestDVI->getDebugLoc().getLine())
+ EarliestDVI = DVI;
+ Expr = DVI->getExpression();
+ }
+ }
+ if (Line2Var.empty())
+ return true;
+
+ // Now, try to insert a DBG_VALUE instruction after each real instruction.
+ // Do this by introducing debug uses of each register definition. If that is
+ // not possible (e.g. we have a phi or a meta instruction), emit a constant.
+ uint64_t NextImm = 0;
+ SmallSet<DILocalVariable *, 16> VarSet;
+ const MCInstrDesc &DbgValDesc = TII.get(TargetOpcode::DBG_VALUE);
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator FirstNonPHIIt = MBB.getFirstNonPHI();
+ for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
+ MachineInstr &MI = *I;
+ ++I;
+
+ // `I` may point to a DBG_VALUE created in the previous loop iteration.
+ if (MI.isDebugInstr())
+ continue;
+
+ // It's not allowed to insert DBG_VALUEs after a terminator.
+ if (MI.isTerminator())
+ continue;
+
+ // Find a suitable insertion point for the DBG_VALUE.
+ auto InsertBeforeIt = MI.isPHI() ? FirstNonPHIIt : I;
+
+ // Find a suitable local variable for the DBG_VALUE.
+ unsigned Line = MI.getDebugLoc().getLine();
+ if (!Line2Var.count(Line))
+ Line = EarliestDVI->getDebugLoc().getLine();
+ DILocalVariable *LocalVar = Line2Var[Line];
+ assert(LocalVar && "No variable for current line?");
+ VarSet.insert(LocalVar);
+
+ // Emit DBG_VALUEs for register definitions.
+ SmallVector<MachineOperand *, 4> RegDefs;
+ for (MachineOperand &MO : MI.all_defs())
+ if (MO.getReg())
+ RegDefs.push_back(&MO);
+ for (MachineOperand *MO : RegDefs)
+ BuildMI(MBB, InsertBeforeIt, MI.getDebugLoc(), DbgValDesc,
+ /*IsIndirect=*/false, *MO, LocalVar, Expr);
+
+ // If no register definitions were found, fall back to emitting a constant
+ // DBG_VALUE instead.
+ if (RegDefs.empty()) {
+ auto ImmOp = MachineOperand::CreateImm(NextImm++);
+ BuildMI(MBB, InsertBeforeIt, MI.getDebugLoc(), DbgValDesc,
+ /*IsIndirect=*/false, ImmOp, LocalVar, Expr);
+ }
+ }
+ }
+
+ // Save the number of lines and variables into the "llvm.mir.debugify" named
+ // metadata; it is consumed by mir-check-debugify.
+ NamedMDNode *NMD = M.getNamedMetadata("llvm.mir.debugify");
+ IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
+ if (!NMD) {
+ NMD = M.getOrInsertNamedMetadata("llvm.mir.debugify");
+ auto addDebugifyOperand = [&](unsigned N) {
+ NMD->addOperand(MDNode::get(
+ Ctx, ValueAsMetadata::getConstant(ConstantInt::get(Int32Ty, N))));
+ };
+ // Add number of lines.
+ addDebugifyOperand(NextLine - 1);
+ // Add number of variables.
+ addDebugifyOperand(VarSet.size());
+ } else {
+ assert(NMD->getNumOperands() == 2 &&
+ "llvm.mir.debugify should have exactly 2 operands!");
+ auto setDebugifyOperand = [&](unsigned Idx, unsigned N) {
+ NMD->setOperand(Idx, MDNode::get(Ctx, ValueAsMetadata::getConstant(
+ ConstantInt::get(Int32Ty, N))));
+ };
+ auto getDebugifyOperand = [&](unsigned Idx) {
+ return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0))
+ ->getZExtValue();
+ };
+ // Set number of lines.
+ setDebugifyOperand(0, NextLine - 1);
+ // Set number of variables.
+ auto OldNumVars = getDebugifyOperand(1);
+ setDebugifyOperand(1, OldNumVars + VarSet.size());
+ }
+
+ return true;
+}
+
+/// ModulePass for attaching synthetic debug info to everything, used with the
+/// legacy module pass manager.
+struct DebugifyMachineModule : public ModulePass {
+ bool runOnModule(Module &M) override {
+ // We will insert new debugify metadata, so any old metadata must already
+ // have been stripped.
+ assert(!M.getNamedMetadata("llvm.mir.debugify") &&
+ "llvm.mir.debugify metadata already exists! Strip it first");
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return applyDebugifyMetadata(
+ M, M.functions(),
+ "ModuleDebugify: ", [&](DIBuilder &DIB, Function &F) -> bool {
+ return applyDebugifyMetadataToMachineFunction(MMI, DIB, F);
+ });
+ }
+
+ DebugifyMachineModule() : ModulePass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ static char ID; // Pass identification.
+};
+char DebugifyMachineModule::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(DebugifyMachineModule, DEBUG_TYPE,
+ "Machine Debugify Module", false, false)
+INITIALIZE_PASS_END(DebugifyMachineModule, DEBUG_TYPE,
+ "Machine Debugify Module", false, false)
+
+ModulePass *llvm::createDebugifyMachineModulePass() {
+ return new DebugifyMachineModule();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
new file mode 100644
index 000000000000..346cfedde390
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
@@ -0,0 +1,53 @@
+//===- MachineDominanceFrontier.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+namespace llvm {
+template class DominanceFrontierBase<MachineBasicBlock, false>;
+template class DominanceFrontierBase<MachineBasicBlock, true>;
+template class ForwardDominanceFrontierBase<MachineBasicBlock>;
+} // namespace llvm
+
+char MachineDominanceFrontier::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MachineDominanceFrontier, "machine-domfrontier",
+ "Machine Dominance Frontier Construction", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MachineDominanceFrontier, "machine-domfrontier",
+ "Machine Dominance Frontier Construction", true, true)
+
+MachineDominanceFrontier::MachineDominanceFrontier() : MachineFunctionPass(ID) {
+ initializeMachineDominanceFrontierPass(*PassRegistry::getPassRegistry());
+}
+
+char &llvm::MachineDominanceFrontierID = MachineDominanceFrontier::ID;
+
+bool MachineDominanceFrontier::runOnMachineFunction(MachineFunction &) {
+ releaseMemory();
+ Base.analyze(getAnalysis<MachineDominatorTree>().getBase());
+ return false;
+}
+
+void MachineDominanceFrontier::releaseMemory() {
+ Base.releaseMemory();
+}
+
+void MachineDominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineDominators.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineDominators.cpp
new file mode 100644
index 000000000000..0632cde9c6f4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineDominators.cpp
@@ -0,0 +1,152 @@
+//===- MachineDominators.cpp - Machine Dominator Calculation --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// forward dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+namespace llvm {
+// Always verify dominfo if expensive checking is enabled.
+#ifdef EXPENSIVE_CHECKS
+bool VerifyMachineDomInfo = true;
+#else
+bool VerifyMachineDomInfo = false;
+#endif
+} // namespace llvm
+
+static cl::opt<bool, true> VerifyMachineDomInfoX(
+ "verify-machine-dom-info", cl::location(VerifyMachineDomInfo), cl::Hidden,
+ cl::desc("Verify machine dominator info (time consuming)"));
+
+namespace llvm {
+template class DomTreeNodeBase<MachineBasicBlock>;
+template class DominatorTreeBase<MachineBasicBlock, false>; // DomTreeBase
+} // namespace llvm
+
+char MachineDominatorTree::ID = 0;
+
+INITIALIZE_PASS(MachineDominatorTree, "machinedomtree",
+ "MachineDominator Tree Construction", true, true)
+
+char &llvm::MachineDominatorsID = MachineDominatorTree::ID;
+
+void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
+ calculate(F);
+ return false;
+}
+
+void MachineDominatorTree::calculate(MachineFunction &F) {
+ CriticalEdgesToSplit.clear();
+ NewBBs.clear();
+ DT.reset(new DomTreeBase<MachineBasicBlock>());
+ DT->recalculate(F);
+}
+
+MachineDominatorTree::MachineDominatorTree()
+ : MachineFunctionPass(ID) {
+ initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
+}
+
+void MachineDominatorTree::releaseMemory() {
+ CriticalEdgesToSplit.clear();
+ DT.reset(nullptr);
+}
+
+void MachineDominatorTree::verifyAnalysis() const {
+ if (DT && VerifyMachineDomInfo)
+ if (!DT->verify(MachineDomTree::VerificationLevel::Basic)) {
+ errs() << "MachineDominatorTree verification failed\n";
+ abort();
+ }
+}
+
+void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
+ if (DT)
+ DT->print(OS);
+}
+
+void MachineDominatorTree::applySplitCriticalEdges() const {
+ // Bail out early if there is nothing to do.
+ if (CriticalEdgesToSplit.empty())
+ return;
+
+ // For each element in CriticalEdgesToSplit, remember whether or not that
+ // element is the new immediate dominator of its successor. The mapping is done
+ // by index, i.e., the information for the ith element of CriticalEdgesToSplit
+ // is the ith element of IsNewIDom.
+ SmallBitVector IsNewIDom(CriticalEdgesToSplit.size(), true);
+ size_t Idx = 0;
+
+ // Collect all the dominance properties info, before invalidating
+ // the underlying DT.
+ for (CriticalEdge &Edge : CriticalEdgesToSplit) {
+ // Update dominator information.
+ MachineBasicBlock *Succ = Edge.ToBB;
+ MachineDomTreeNode *SuccDTNode = DT->getNode(Succ);
+
+ for (MachineBasicBlock *PredBB : Succ->predecessors()) {
+ if (PredBB == Edge.NewBB)
+ continue;
+ // If we are in this situation:
+ // FromBB1 FromBB2
+ // + +
+ // + + + +
+ // + + + +
+ // ... Split1 Split2 ...
+ // + +
+ // + +
+ // +
+ // Succ
+ // Instead of checking the dominance property against Split2, we check it
+ // against FromBB2, since Split2 is not yet known to the underlying DT
+ // structure.
+ if (NewBBs.count(PredBB)) {
+ assert(PredBB->pred_size() == 1 && "A basic block resulting from a "
+ "critical edge split has more "
+ "than one predecessor!");
+ PredBB = *PredBB->pred_begin();
+ }
+ if (!DT->dominates(SuccDTNode, DT->getNode(PredBB))) {
+ IsNewIDom[Idx] = false;
+ break;
+ }
+ }
+ ++Idx;
+ }
+
+ // Now, update DT with the collected dominance properties info.
+ Idx = 0;
+ for (CriticalEdge &Edge : CriticalEdgesToSplit) {
+ // We know FromBB dominates NewBB.
+ MachineDomTreeNode *NewDTNode = DT->addNewBlock(Edge.NewBB, Edge.FromBB);
+
+ // If all the other predecessors of "Succ" are dominated by "Succ" itself
+ // then the new block is the new immediate dominator of "Succ". Otherwise,
+ // the new block doesn't dominate anything.
+ if (IsNewIDom[Idx])
+ DT->changeImmediateDominator(DT->getNode(Edge.ToBB), NewDTNode);
+ ++Idx;
+ }
+ NewBBs.clear();
+ CriticalEdgesToSplit.clear();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineFrameInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineFrameInfo.cpp
new file mode 100644
index 000000000000..280d3a6a41ed
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -0,0 +1,256 @@
+//===-- MachineFrameInfo.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements MachineFrameInfo that manages the stack frame.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+#define DEBUG_TYPE "codegen"
+
+using namespace llvm;
+
+void MachineFrameInfo::ensureMaxAlignment(Align Alignment) {
+ if (!StackRealignable)
+ assert(Alignment <= StackAlignment &&
+ "For targets without stack realignment, Alignment is out of limit!");
+ if (MaxAlignment < Alignment)
+ MaxAlignment = Alignment;
+}
+
+/// Clamp the alignment if requested and emit a warning.
+static inline Align clampStackAlignment(bool ShouldClamp, Align Alignment,
+ Align StackAlignment) {
+ if (!ShouldClamp || Alignment <= StackAlignment)
+ return Alignment;
+ LLVM_DEBUG(dbgs() << "Warning: requested alignment " << DebugStr(Alignment)
+ << " exceeds the stack alignment "
+ << DebugStr(StackAlignment)
+ << " when stack realignment is off" << '\n');
+ return StackAlignment;
+}
+
+int MachineFrameInfo::CreateStackObject(uint64_t Size, Align Alignment,
+ bool IsSpillSlot,
+ const AllocaInst *Alloca,
+ uint8_t StackID) {
+ assert(Size != 0 && "Cannot allocate zero size stack objects!");
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.push_back(StackObject(Size, Alignment, 0, false, IsSpillSlot, Alloca,
+ !IsSpillSlot, StackID));
+ int Index = (int)Objects.size() - NumFixedObjects - 1;
+ assert(Index >= 0 && "Bad frame index!");
+ if (contributesToMaxAlignment(StackID))
+ ensureMaxAlignment(Alignment);
+ return Index;
+}
+
+int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, Align Alignment) {
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ CreateStackObject(Size, Alignment, true);
+ int Index = (int)Objects.size() - NumFixedObjects - 1;
+ ensureMaxAlignment(Alignment);
+ return Index;
+}
+
+int MachineFrameInfo::CreateVariableSizedObject(Align Alignment,
+ const AllocaInst *Alloca) {
+ HasVarSizedObjects = true;
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true));
+ ensureMaxAlignment(Alignment);
+ return (int)Objects.size()-NumFixedObjects-1;
+}
+
+int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
+ bool IsImmutable, bool IsAliased) {
+ assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
+ // The alignment of the frame index can be determined from its offset from
+ // the incoming frame position. If the frame object is at offset 32 and
+ // the stack is guaranteed to be 16-byte aligned, then we know that the
+ // object is 16-byte aligned. Note that unlike the non-fixed case, if the
+ // stack needs realignment, we can't assume that the stack will in fact be
+ // aligned.
+ Align Alignment =
+ commonAlignment(ForcedRealign ? Align(1) : StackAlignment, SPOffset);
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.insert(Objects.begin(),
+ StackObject(Size, Alignment, SPOffset, IsImmutable,
+ /*IsSpillSlot=*/false, /*Alloca=*/nullptr,
+ IsAliased));
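+ // Fixed objects are addressed with negative frame indices: the Nth fixed
+ // object created gets index -N.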
+ return -++NumFixedObjects;
+}
+
+int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
+ int64_t SPOffset,
+ bool IsImmutable) {
+ Align Alignment =
+ commonAlignment(ForcedRealign ? Align(1) : StackAlignment, SPOffset);
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.insert(Objects.begin(),
+ StackObject(Size, Alignment, SPOffset, IsImmutable,
+ /*IsSpillSlot=*/true, /*Alloca=*/nullptr,
+ /*IsAliased=*/false));
+ return -++NumFixedObjects;
+}
+
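+/// Return the registers that are "pristine" in \p MF: callee-saved registers
+/// that this function neither saves nor restores, and which must therefore be
+/// left untouched so they keep their values from function entry.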
+BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ BitVector BV(TRI->getNumRegs());
+
+ // Before CSI is calculated, no registers are considered pristine. They can be
+ // freely used and PEI will make sure they are saved.
+ if (!isCalleeSavedInfoValid())
+ return BV;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR;
+ ++CSR)
+ BV.set(*CSR);
+
+ // Saved CSRs are not pristine.
+ for (const auto &I : getCalleeSavedInfo())
+ for (MCPhysReg S : TRI->subregs_inclusive(I.getReg()))
+ BV.reset(S);
+
+ return BV;
+}
+
+uint64_t MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ Align MaxAlign = getMaxAlign();
+ int64_t Offset = 0;
+
+ // This code is very, very similar to PEI::calculateFrameObjectOffsets().
+ // It really should be refactored to share code. Until then, changes
+ // should keep in mind that there's tight coupling between the two.
+
+ for (int i = getObjectIndexBegin(); i != 0; ++i) {
+ // Only estimate stack size of default stack.
+ if (getStackID(i) != TargetStackID::Default)
+ continue;
+ int64_t FixedOff = -getObjectOffset(i);
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+ for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) {
+ // Only estimate stack size of live objects on default stack.
+ if (isDeadObjectIndex(i) || getStackID(i) != TargetStackID::Default)
+ continue;
+ Offset += getObjectSize(i);
+ Align Alignment = getObjectAlign(i);
+ // Adjust to alignment boundary
+ Offset = alignTo(Offset, Alignment);
+
+ MaxAlign = std::max(Alignment, MaxAlign);
+ }
+
+ if (adjustsStack() && TFI->hasReservedCallFrame(MF))
+ Offset += getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or allocas, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ Align StackAlign;
+ if (adjustsStack() || hasVarSizedObjects() ||
+ (RegInfo->hasStackRealignment(MF) && getObjectIndexEnd() != 0))
+ StackAlign = TFI->getStackAlign();
+ else
+ StackAlign = TFI->getTransientStackAlign();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ return alignTo(Offset, StackAlign);
+}
+
+void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) {
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+ assert(FrameSetupOpcode != ~0u && FrameDestroyOpcode != ~0u &&
+ "Can only compute MaxCallFrameSize if Setup/Destroy opcode are known");
+
+ MaxCallFrameSize = 0;
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) {
+ unsigned Size = TII.getFrameSize(MI);
+ MaxCallFrameSize = std::max(MaxCallFrameSize, Size);
+ AdjustsStack = true;
+ } else if (MI.isInlineAsm()) {
+ // Some inline asm needs a stack frame, as indicated by operand 1.
+ unsigned ExtraInfo = MI.getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ AdjustsStack = true;
+ }
+ }
+ }
+}
+
+void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
+ if (Objects.empty()) return;
+
+ const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering();
+ int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
+
+ OS << "Frame Objects:\n";
+
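+ // Each frame object is printed on its own line; for example, a fixed 8-byte,
+ // 8-aligned slot at SP+8 would come out roughly as (illustrative):
+ //   fi#-1: size=8, align=8, fixed, at location [SP+8]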
+ for (unsigned i = 0, e = Objects.size(); i != e; ++i) {
+ const StackObject &SO = Objects[i];
+ OS << " fi#" << (int)(i-NumFixedObjects) << ": ";
+
+ if (SO.StackID != 0)
+ OS << "id=" << static_cast<unsigned>(SO.StackID) << ' ';
+
+ if (SO.Size == ~0ULL) {
+ OS << "dead\n";
+ continue;
+ }
+ if (SO.Size == 0)
+ OS << "variable sized";
+ else
+ OS << "size=" << SO.Size;
+ OS << ", align=" << SO.Alignment.value();
+
+ if (i < NumFixedObjects)
+ OS << ", fixed";
+ if (i < NumFixedObjects || SO.SPOffset != -1) {
+ int64_t Off = SO.SPOffset - ValOffset;
+ OS << ", at location [SP";
+ if (Off > 0)
+ OS << "+" << Off;
+ else if (Off < 0)
+ OS << Off;
+ OS << "]";
+ }
+ OS << "\n";
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineFrameInfo::dump(const MachineFunction &MF) const {
+ print(MF, dbgs());
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp
new file mode 100644
index 000000000000..88939e96e07f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunction.cpp
@@ -0,0 +1,1521 @@
+//===- MachineFunction.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect native machine code information for a function. This allows
+// target-specific information about the generated code to be stored with each
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "LiveDebugValues/LiveDebugValues.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "codegen"
+
+static cl::opt<unsigned> AlignAllFunctions(
+ "align-all-functions",
+ cl::desc("Force the alignment of all functions in log2 format (e.g. 4 "
+ "means align on 16B boundaries)."),
+ cl::init(0), cl::Hidden);
+
+static const char *getPropertyName(MachineFunctionProperties::Property Prop) {
+ using P = MachineFunctionProperties::Property;
+
+ // clang-format off
+ switch(Prop) {
+ case P::FailedISel: return "FailedISel";
+ case P::IsSSA: return "IsSSA";
+ case P::Legalized: return "Legalized";
+ case P::NoPHIs: return "NoPHIs";
+ case P::NoVRegs: return "NoVRegs";
+ case P::RegBankSelected: return "RegBankSelected";
+ case P::Selected: return "Selected";
+ case P::TracksLiveness: return "TracksLiveness";
+ case P::TiedOpsRewritten: return "TiedOpsRewritten";
+ case P::FailsVerification: return "FailsVerification";
+ case P::TracksDebugUserValues: return "TracksDebugUserValues";
+ }
+ // clang-format on
+ llvm_unreachable("Invalid machine function property");
+}
+
+void setUnsafeStackSize(const Function &F, MachineFrameInfo &FrameInfo) {
+ if (!F.hasFnAttribute(Attribute::SafeStack))
+ return;
+
+ auto *Existing =
+ dyn_cast_or_null<MDTuple>(F.getMetadata(LLVMContext::MD_annotation));
+
+ if (!Existing || Existing->getNumOperands() != 2)
+ return;
+
+ auto *MetadataName = "unsafe-stack-size";
+ if (auto &N = Existing->getOperand(0)) {
+ if (N.equalsStr(MetadataName)) {
+ if (auto &Op = Existing->getOperand(1)) {
+ auto Val = mdconst::extract<ConstantInt>(Op)->getZExtValue();
+ FrameInfo.setUnsafeStackSize(Val);
+ }
+ }
+ }
+}
+
+// Pin the vtable to this file.
+void MachineFunction::Delegate::anchor() {}
+
+void MachineFunctionProperties::print(raw_ostream &OS) const {
+ const char *Separator = "";
+ for (BitVector::size_type I = 0; I < Properties.size(); ++I) {
+ if (!Properties[I])
+ continue;
+ OS << Separator << getPropertyName(static_cast<Property>(I));
+ Separator = ", ";
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MachineFunction implementation
+//===----------------------------------------------------------------------===//
+
+// Out-of-line virtual method.
+MachineFunctionInfo::~MachineFunctionInfo() = default;
+
+void ilist_alloc_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
+ MBB->getParent()->deleteMachineBasicBlock(MBB);
+}
+
+static inline Align getFnStackAlignment(const TargetSubtargetInfo *STI,
+ const Function &F) {
+ if (auto MA = F.getFnStackAlign())
+ return *MA;
+ return STI->getFrameLowering()->getStackAlign();
+}
+
+MachineFunction::MachineFunction(Function &F, const LLVMTargetMachine &Target,
+ const TargetSubtargetInfo &STI,
+ unsigned FunctionNum, MachineModuleInfo &mmi)
+ : F(F), Target(Target), STI(&STI), Ctx(mmi.getContext()), MMI(mmi) {
+ FunctionNumber = FunctionNum;
+ init();
+}
+
+void MachineFunction::handleInsertion(MachineInstr &MI) {
+ if (TheDelegate)
+ TheDelegate->MF_HandleInsertion(MI);
+}
+
+void MachineFunction::handleRemoval(MachineInstr &MI) {
+ if (TheDelegate)
+ TheDelegate->MF_HandleRemoval(MI);
+}
+
+void MachineFunction::init() {
+ // Assume the function starts in SSA form with correct liveness.
+ Properties.set(MachineFunctionProperties::Property::IsSSA);
+ Properties.set(MachineFunctionProperties::Property::TracksLiveness);
+ if (STI->getRegisterInfo())
+ RegInfo = new (Allocator) MachineRegisterInfo(this);
+ else
+ RegInfo = nullptr;
+
+ MFInfo = nullptr;
+
+ // We can realign the stack if the target supports it and the user hasn't
+ // explicitly asked us not to.
+ bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() &&
+ !F.hasFnAttribute("no-realign-stack");
+ FrameInfo = new (Allocator) MachineFrameInfo(
+ getFnStackAlignment(STI, F), /*StackRealignable=*/CanRealignSP,
+ /*ForcedRealign=*/CanRealignSP &&
+ F.hasFnAttribute(Attribute::StackAlignment));
+
+ setUnsafeStackSize(F, *FrameInfo);
+
+ if (F.hasFnAttribute(Attribute::StackAlignment))
+ FrameInfo->ensureMaxAlignment(*F.getFnStackAlign());
+
+ ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
+ Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
+
+ // FIXME: Shouldn't use pref alignment if explicit alignment is set on F.
+ // FIXME: Use Function::hasOptSize().
+ if (!F.hasFnAttribute(Attribute::OptimizeForSize))
+ Alignment = std::max(Alignment,
+ STI->getTargetLowering()->getPrefFunctionAlignment());
+
+ // -fsanitize=function and -fsanitize=kcfi instrument indirect function calls
+ // to load a type hash before the function label. Ensure functions are aligned
+ // by at least 4 to avoid unaligned access, which is especially important for
+ // -mno-unaligned-access.
+ if (F.hasMetadata(LLVMContext::MD_func_sanitize) ||
+ F.getMetadata(LLVMContext::MD_kcfi_type))
+ Alignment = std::max(Alignment, Align(4));
+
+ if (AlignAllFunctions)
+ Alignment = Align(1ULL << AlignAllFunctions);
+
+ JumpTableInfo = nullptr;
+
+ if (isFuncletEHPersonality(classifyEHPersonality(
+ F.hasPersonalityFn() ? F.getPersonalityFn() : nullptr))) {
+ WinEHInfo = new (Allocator) WinEHFuncInfo();
+ }
+
+ if (isScopedEHPersonality(classifyEHPersonality(
+ F.hasPersonalityFn() ? F.getPersonalityFn() : nullptr))) {
+ WasmEHInfo = new (Allocator) WasmEHFuncInfo();
+ }
+
+ assert(Target.isCompatibleDataLayout(getDataLayout()) &&
+ "Can't create a MachineFunction using a Module with a "
+ "Target-incompatible DataLayout attached\n");
+
+ PSVManager = std::make_unique<PseudoSourceValueManager>(getTarget());
+}
+
+void MachineFunction::initTargetMachineFunctionInfo(
+ const TargetSubtargetInfo &STI) {
+ assert(!MFInfo && "MachineFunctionInfo already set");
+ MFInfo = Target.createMachineFunctionInfo(Allocator, F, &STI);
+}
+
+MachineFunction::~MachineFunction() {
+ clear();
+}
+
+void MachineFunction::clear() {
+ Properties.reset();
+ // Don't call destructors on MachineInstr and MachineOperand. All of their
+ // memory comes from the BumpPtrAllocator which is about to be purged.
+ //
+ // Do call MachineBasicBlock destructors; they contain std::vectors.
+ for (iterator I = begin(), E = end(); I != E; I = BasicBlocks.erase(I))
+ I->Insts.clearAndLeakNodesUnsafely();
+ MBBNumbering.clear();
+
+ InstructionRecycler.clear(Allocator);
+ OperandRecycler.clear(Allocator);
+ BasicBlockRecycler.clear(Allocator);
+ CodeViewAnnotations.clear();
+ VariableDbgInfos.clear();
+ if (RegInfo) {
+ RegInfo->~MachineRegisterInfo();
+ Allocator.Deallocate(RegInfo);
+ }
+ if (MFInfo) {
+ MFInfo->~MachineFunctionInfo();
+ Allocator.Deallocate(MFInfo);
+ }
+
+ FrameInfo->~MachineFrameInfo();
+ Allocator.Deallocate(FrameInfo);
+
+ ConstantPool->~MachineConstantPool();
+ Allocator.Deallocate(ConstantPool);
+
+ if (JumpTableInfo) {
+ JumpTableInfo->~MachineJumpTableInfo();
+ Allocator.Deallocate(JumpTableInfo);
+ }
+
+ if (WinEHInfo) {
+ WinEHInfo->~WinEHFuncInfo();
+ Allocator.Deallocate(WinEHInfo);
+ }
+
+ if (WasmEHInfo) {
+ WasmEHInfo->~WasmEHFuncInfo();
+ Allocator.Deallocate(WasmEHInfo);
+ }
+}
+
+const DataLayout &MachineFunction::getDataLayout() const {
+ return F.getParent()->getDataLayout();
+}
+
+/// Get the JumpTableInfo for this function.
+/// If it does not already exist, allocate one.
+MachineJumpTableInfo *MachineFunction::
+getOrCreateJumpTableInfo(unsigned EntryKind) {
+ if (JumpTableInfo) return JumpTableInfo;
+
+ JumpTableInfo = new (Allocator)
+ MachineJumpTableInfo((MachineJumpTableInfo::JTEntryKind)EntryKind);
+ return JumpTableInfo;
+}
+
+DenormalMode MachineFunction::getDenormalMode(const fltSemantics &FPType) const {
+ return F.getDenormalMode(FPType);
+}
+
+/// Should we be emitting segmented-stack code for this function?
+bool MachineFunction::shouldSplitStack() const {
+ return getFunction().hasFnAttribute("split-stack");
+}
+
+[[nodiscard]] unsigned
+MachineFunction::addFrameInst(const MCCFIInstruction &Inst) {
+ FrameInstructions.push_back(Inst);
+ return FrameInstructions.size() - 1;
+}
+
+/// This discards all of the MachineBasicBlock numbers and recomputes them.
+/// This guarantees that the MBB numbers are sequential, dense, and match the
+/// ordering of the blocks within the function. If a specific MachineBasicBlock
+/// is specified, only that block and those after it are renumbered.
+void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
+ if (empty()) { MBBNumbering.clear(); return; }
+ MachineFunction::iterator MBBI, E = end();
+ if (MBB == nullptr)
+ MBBI = begin();
+ else
+ MBBI = MBB->getIterator();
+
+ // Figure out the block number this should have.
+ unsigned BlockNo = 0;
+ if (MBBI != begin())
+ BlockNo = std::prev(MBBI)->getNumber() + 1;
+
+ for (; MBBI != E; ++MBBI, ++BlockNo) {
+ if (MBBI->getNumber() != (int)BlockNo) {
+ // Remove use of the old number.
+ if (MBBI->getNumber() != -1) {
+ assert(MBBNumbering[MBBI->getNumber()] == &*MBBI &&
+ "MBB number mismatch!");
+ MBBNumbering[MBBI->getNumber()] = nullptr;
+ }
+
+ // If BlockNo is already taken, set that block's number to -1.
+ if (MBBNumbering[BlockNo])
+ MBBNumbering[BlockNo]->setNumber(-1);
+
+ MBBNumbering[BlockNo] = &*MBBI;
+ MBBI->setNumber(BlockNo);
+ }
+ }
+
+ // Okay, all the blocks are renumbered. If we have compactified the block
+ // numbering, shrink MBBNumbering now.
+ assert(BlockNo <= MBBNumbering.size() && "Mismatch!");
+ MBBNumbering.resize(BlockNo);
+}
+
+/// This method iterates over the basic blocks and assigns their IsBeginSection
+/// and IsEndSection fields. This must be called after MBB layout is finalized
+/// and the SectionID's are assigned to MBBs.
+void MachineFunction::assignBeginEndSections() {
+ front().setIsBeginSection();
+ auto CurrentSectionID = front().getSectionID();
+ for (auto MBBI = std::next(begin()), E = end(); MBBI != E; ++MBBI) {
+ if (MBBI->getSectionID() == CurrentSectionID)
+ continue;
+ MBBI->setIsBeginSection();
+ std::prev(MBBI)->setIsEndSection();
+ CurrentSectionID = MBBI->getSectionID();
+ }
+ back().setIsEndSection();
+}
+
+/// Allocate a new MachineInstr. Use this instead of `new MachineInstr'.
+MachineInstr *MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID,
+ DebugLoc DL,
+ bool NoImplicit) {
+ return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+ MachineInstr(*this, MCID, std::move(DL), NoImplicit);
+}
+
+/// Create a new MachineInstr which is a copy of the 'Orig' instruction,
+/// identical in all ways except the instruction has no parent, prev, or next.
+MachineInstr *
+MachineFunction::CloneMachineInstr(const MachineInstr *Orig) {
+ return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+ MachineInstr(*this, *Orig);
+}
+
+MachineInstr &MachineFunction::cloneMachineInstrBundle(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ const MachineInstr &Orig) {
+ MachineInstr *FirstClone = nullptr;
+ MachineBasicBlock::const_instr_iterator I = Orig.getIterator();
+ while (true) {
+ MachineInstr *Cloned = CloneMachineInstr(&*I);
+ MBB.insert(InsertBefore, Cloned);
+ if (FirstClone == nullptr) {
+ FirstClone = Cloned;
+ } else {
+ Cloned->bundleWithPred();
+ }
+
+ if (!I->isBundledWithSucc())
+ break;
+ ++I;
+ }
+ // Copy over call site info to the cloned instruction if needed. If Orig is in
+ // a bundle, copyCallSiteInfo takes care of finding the call instruction in
+ // the bundle.
+ if (Orig.shouldUpdateCallSiteInfo())
+ copyCallSiteInfo(&Orig, FirstClone);
+ return *FirstClone;
+}
+
+/// Delete the given MachineInstr.
+///
+/// This function also serves as the MachineInstr destructor - the real
+/// ~MachineInstr() destructor must be empty.
+void MachineFunction::deleteMachineInstr(MachineInstr *MI) {
+ // Verify that the call site info is in a valid state. This assertion may
+ // fire while implementing call site info support for a new architecture; if
+ // it does, the backtrace will show where a call to updateCallSiteInfo()
+ // needs to be inserted.
+ assert((!MI->isCandidateForCallSiteEntry() || !CallSitesInfo.contains(MI)) &&
+ "Call site info was not updated!");
+ // Strip it for parts. The operand array and the MI object itself are
+ // independently recyclable.
+ if (MI->Operands)
+ deallocateOperandArray(MI->CapOperands, MI->Operands);
+ // Don't call ~MachineInstr() which must be trivial anyway because
+ // ~MachineFunction drops whole lists of MachineInstrs without calling their
+ // destructors.
+ InstructionRecycler.Deallocate(Allocator, MI);
+}
+
+/// Allocate a new MachineBasicBlock. Use this instead of
+/// `new MachineBasicBlock'.
+MachineBasicBlock *
+MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) {
+ MachineBasicBlock *MBB =
+ new (BasicBlockRecycler.Allocate<MachineBasicBlock>(Allocator))
+ MachineBasicBlock(*this, bb);
+ // Set BBID for `-basic-block-sections=labels` and
+ // `-basic-block-sections=list` to allow robust mapping of profiles to basic
+ // blocks.
+ if (Target.getBBSectionsType() == BasicBlockSection::Labels ||
+ Target.getBBSectionsType() == BasicBlockSection::List)
+ MBB->setBBID(NextBBID++);
+ return MBB;
+}
+
+/// Delete the given MachineBasicBlock.
+void MachineFunction::deleteMachineBasicBlock(MachineBasicBlock *MBB) {
+ assert(MBB->getParent() == this && "MBB parent mismatch!");
+ // Clean up any references to MBB in jump tables before deleting it.
+ if (JumpTableInfo)
+ JumpTableInfo->RemoveMBBFromJumpTables(MBB);
+ MBB->~MachineBasicBlock();
+ BasicBlockRecycler.Deallocate(Allocator, MBB);
+}
+
+MachineMemOperand *MachineFunction::getMachineMemOperand(
+ MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
+ Align base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
+ SyncScope::ID SSID, AtomicOrdering Ordering,
+ AtomicOrdering FailureOrdering) {
+ return new (Allocator)
+ MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges,
+ SSID, Ordering, FailureOrdering);
+}
+
+MachineMemOperand *MachineFunction::getMachineMemOperand(
+ MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy,
+ Align base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
+ SyncScope::ID SSID, AtomicOrdering Ordering,
+ AtomicOrdering FailureOrdering) {
+ return new (Allocator)
+ MachineMemOperand(PtrInfo, f, MemTy, base_alignment, AAInfo, Ranges, SSID,
+ Ordering, FailureOrdering);
+}
+
+MachineMemOperand *MachineFunction::getMachineMemOperand(
+ const MachineMemOperand *MMO, const MachinePointerInfo &PtrInfo, uint64_t Size) {
+ return new (Allocator)
+ MachineMemOperand(PtrInfo, MMO->getFlags(), Size, MMO->getBaseAlign(),
+ AAMDNodes(), nullptr, MMO->getSyncScopeID(),
+ MMO->getSuccessOrdering(), MMO->getFailureOrdering());
+}
+
+MachineMemOperand *MachineFunction::getMachineMemOperand(
+ const MachineMemOperand *MMO, const MachinePointerInfo &PtrInfo, LLT Ty) {
+ return new (Allocator)
+ MachineMemOperand(PtrInfo, MMO->getFlags(), Ty, MMO->getBaseAlign(),
+ AAMDNodes(), nullptr, MMO->getSyncScopeID(),
+ MMO->getSuccessOrdering(), MMO->getFailureOrdering());
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+ int64_t Offset, LLT Ty) {
+ const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
+
+ // If there is no pointer value, the offset isn't tracked so we need to adjust
+ // the base alignment.
+ Align Alignment = PtrInfo.V.isNull()
+ ? commonAlignment(MMO->getBaseAlign(), Offset)
+ : MMO->getBaseAlign();
+
+ // Do not preserve ranges, since we don't necessarily know what the high bits
+ // are anymore.
+ return new (Allocator) MachineMemOperand(
+ PtrInfo.getWithOffset(Offset), MMO->getFlags(), Ty, Alignment,
+ MMO->getAAInfo(), nullptr, MMO->getSyncScopeID(),
+ MMO->getSuccessOrdering(), MMO->getFailureOrdering());
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+ const AAMDNodes &AAInfo) {
+ MachinePointerInfo MPI = MMO->getValue() ?
+ MachinePointerInfo(MMO->getValue(), MMO->getOffset()) :
+ MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset());
+
+ return new (Allocator) MachineMemOperand(
+ MPI, MMO->getFlags(), MMO->getSize(), MMO->getBaseAlign(), AAInfo,
+ MMO->getRanges(), MMO->getSyncScopeID(), MMO->getSuccessOrdering(),
+ MMO->getFailureOrdering());
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+ MachineMemOperand::Flags Flags) {
+ return new (Allocator) MachineMemOperand(
+ MMO->getPointerInfo(), Flags, MMO->getSize(), MMO->getBaseAlign(),
+ MMO->getAAInfo(), MMO->getRanges(), MMO->getSyncScopeID(),
+ MMO->getSuccessOrdering(), MMO->getFailureOrdering());
+}
+
+MachineInstr::ExtraInfo *MachineFunction::createMIExtraInfo(
+ ArrayRef<MachineMemOperand *> MMOs, MCSymbol *PreInstrSymbol,
+ MCSymbol *PostInstrSymbol, MDNode *HeapAllocMarker, MDNode *PCSections,
+ uint32_t CFIType) {
+ return MachineInstr::ExtraInfo::create(Allocator, MMOs, PreInstrSymbol,
+ PostInstrSymbol, HeapAllocMarker,
+ PCSections, CFIType);
+}
+
+const char *MachineFunction::createExternalSymbolName(StringRef Name) {
+ char *Dest = Allocator.Allocate<char>(Name.size() + 1);
+ llvm::copy(Name, Dest);
+ Dest[Name.size()] = 0;
+ return Dest;
+}
+
+uint32_t *MachineFunction::allocateRegMask() {
+ unsigned NumRegs = getSubtarget().getRegisterInfo()->getNumRegs();
+ unsigned Size = MachineOperand::getRegMaskSize(NumRegs);
+ uint32_t *Mask = Allocator.Allocate<uint32_t>(Size);
+ memset(Mask, 0, Size * sizeof(Mask[0]));
+ return Mask;
+}
+
+ArrayRef<int> MachineFunction::allocateShuffleMask(ArrayRef<int> Mask) {
+ int* AllocMask = Allocator.Allocate<int>(Mask.size());
+ copy(Mask, AllocMask);
+ return {AllocMask, Mask.size()};
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineFunction::dump() const {
+ print(dbgs());
+}
+#endif
+
+StringRef MachineFunction::getName() const {
+ return getFunction().getName();
+}
+
+void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
+ OS << "# Machine code for function " << getName() << ": ";
+ getProperties().print(OS);
+ OS << '\n';
+
+ // Print Frame Information
+ FrameInfo->print(*this, OS);
+
+ // Print JumpTable Information
+ if (JumpTableInfo)
+ JumpTableInfo->print(OS);
+
+ // Print Constant Pool
+ ConstantPool->print(OS);
+
+ const TargetRegisterInfo *TRI = getSubtarget().getRegisterInfo();
+
+ if (RegInfo && !RegInfo->livein_empty()) {
+ OS << "Function Live Ins: ";
+ for (MachineRegisterInfo::livein_iterator
+ I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) {
+ OS << printReg(I->first, TRI);
+ if (I->second)
+ OS << " in " << printReg(I->second, TRI);
+ if (std::next(I) != E)
+ OS << ", ";
+ }
+ OS << '\n';
+ }
+
+ ModuleSlotTracker MST(getFunction().getParent());
+ MST.incorporateFunction(getFunction());
+ for (const auto &BB : *this) {
+ OS << '\n';
+ // If we print the whole function, print it at its most verbose level.
+ BB.print(OS, MST, Indexes, /*IsStandalone=*/true);
+ }
+
+ OS << "\n# End machine code for function " << getName() << ".\n\n";
+}
+
+/// True if this function needs frame moves for debug or exceptions.
+bool MachineFunction::needsFrameMoves() const {
+ return getMMI().hasDebugInfo() ||
+ getTarget().Options.ForceDwarfFrameSection ||
+ F.needsUnwindTableEntry();
+}
+
+namespace llvm {
+
+ template<>
+ struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const MachineFunction *F) {
+ return ("CFG for '" + F->getName() + "' function").str();
+ }
+
+ std::string getNodeLabel(const MachineBasicBlock *Node,
+ const MachineFunction *Graph) {
+ std::string OutStr;
+ {
+ raw_string_ostream OSS(OutStr);
+
+ if (isSimple()) {
+ OSS << printMBBReference(*Node);
+ if (const BasicBlock *BB = Node->getBasicBlock())
+ OSS << ": " << BB->getName();
+ } else
+ Node->print(OSS);
+ }
+
+ if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
+
+ // Process string output to make it nicer...
+ for (unsigned i = 0; i != OutStr.length(); ++i)
+ if (OutStr[i] == '\n') { // Left justify
+ OutStr[i] = '\\';
+ OutStr.insert(OutStr.begin()+i+1, 'l');
+ }
+ return OutStr;
+ }
+ };
+
+} // end namespace llvm
+
+void MachineFunction::viewCFG() const
+{
+#ifndef NDEBUG
+ ViewGraph(this, "mf" + getName());
+#else
+ errs() << "MachineFunction::viewCFG is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+void MachineFunction::viewCFGOnly() const
+{
+#ifndef NDEBUG
+ ViewGraph(this, "mf" + getName(), true);
+#else
+ errs() << "MachineFunction::viewCFGOnly is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// Add the specified physical register as a live-in value and
+/// create a corresponding virtual register for it.
+Register MachineFunction::addLiveIn(MCRegister PReg,
+ const TargetRegisterClass *RC) {
+ MachineRegisterInfo &MRI = getRegInfo();
+ Register VReg = MRI.getLiveInVirtReg(PReg);
+ if (VReg) {
+ const TargetRegisterClass *VRegRC = MRI.getRegClass(VReg);
+ (void)VRegRC;
+ // A physical register can be added several times.
+ // Between two calls, the register class of the related virtual register
+ // may have been constrained to match some operation constraints.
+ // In that case, check that the current register class includes the
+ // physical register and is a sub class of the specified RC.
+ assert((VRegRC == RC || (VRegRC->contains(PReg) &&
+ RC->hasSubClassEq(VRegRC))) &&
+ "Register class mismatch!");
+ return VReg;
+ }
+ VReg = MRI.createVirtualRegister(RC);
+ MRI.addLiveIn(PReg, VReg);
+ return VReg;
+}
+
+/// Return the MCSymbol for the specified non-empty jump table.
+/// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a
+/// normal 'L' label is returned.
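+/// For example, with the 'L' prefix this yields names like "LJTI3_0" for jump
+/// table 0 of function number 3.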
+MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
+ bool isLinkerPrivate) const {
+ const DataLayout &DL = getDataLayout();
+ assert(JumpTableInfo && "No jump tables");
+ assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
+
+ StringRef Prefix = isLinkerPrivate ? DL.getLinkerPrivateGlobalPrefix()
+ : DL.getPrivateGlobalPrefix();
+ SmallString<60> Name;
+ raw_svector_ostream(Name)
+ << Prefix << "JTI" << getFunctionNumber() << '_' << JTI;
+ return Ctx.getOrCreateSymbol(Name);
+}
+
+/// Return a function-local symbol to represent the PIC base.
+MCSymbol *MachineFunction::getPICBaseSymbol() const {
+ const DataLayout &DL = getDataLayout();
+ return Ctx.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ Twine(getFunctionNumber()) + "$pb");
+}
+
+/// \name Exception Handling
+/// \{
+
+LandingPadInfo &
+MachineFunction::getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad) {
+ unsigned N = LandingPads.size();
+ for (unsigned i = 0; i < N; ++i) {
+ LandingPadInfo &LP = LandingPads[i];
+ if (LP.LandingPadBlock == LandingPad)
+ return LP;
+ }
+
+ LandingPads.push_back(LandingPadInfo(LandingPad));
+ return LandingPads[N];
+}
+
+void MachineFunction::addInvoke(MachineBasicBlock *LandingPad,
+ MCSymbol *BeginLabel, MCSymbol *EndLabel) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.BeginLabels.push_back(BeginLabel);
+ LP.EndLabels.push_back(EndLabel);
+}
+
+MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) {
+ MCSymbol *LandingPadLabel = Ctx.createTempSymbol();
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.LandingPadLabel = LandingPadLabel;
+
+ const Instruction *FirstI = LandingPad->getBasicBlock()->getFirstNonPHI();
+ if (const auto *LPI = dyn_cast<LandingPadInst>(FirstI)) {
+ // If there's no typeid list specified, then "cleanup" is implicit.
+ // Otherwise, id 0 is reserved for the cleanup action.
+ if (LPI->isCleanup() && LPI->getNumClauses() != 0)
+ LP.TypeIds.push_back(0);
+
+ // FIXME: New EH - Add the clauses in reverse order. This isn't 100%
+ // correct, but we need to do it this way because of how the DWARF EH
+ // emitter processes the clauses.
+ for (unsigned I = LPI->getNumClauses(); I != 0; --I) {
+ Value *Val = LPI->getClause(I - 1);
+ if (LPI->isCatch(I - 1)) {
+ LP.TypeIds.push_back(
+ getTypeIDFor(dyn_cast<GlobalValue>(Val->stripPointerCasts())));
+ } else {
+ // Add filters in a list.
+ auto *CVal = cast<Constant>(Val);
+ SmallVector<unsigned, 4> FilterList;
+ for (const Use &U : CVal->operands())
+ FilterList.push_back(
+ getTypeIDFor(cast<GlobalValue>(U->stripPointerCasts())));
+
+ LP.TypeIds.push_back(getFilterIDFor(FilterList));
+ }
+ }
+
+ } else if (const auto *CPI = dyn_cast<CatchPadInst>(FirstI)) {
+ for (unsigned I = CPI->arg_size(); I != 0; --I) {
+ auto *TypeInfo =
+ dyn_cast<GlobalValue>(CPI->getArgOperand(I - 1)->stripPointerCasts());
+ LP.TypeIds.push_back(getTypeIDFor(TypeInfo));
+ }
+
+ } else {
+ assert(isa<CleanupPadInst>(FirstI) && "Invalid landingpad!");
+ }
+
+ return LandingPadLabel;
+}
+
+void MachineFunction::setCallSiteLandingPad(MCSymbol *Sym,
+ ArrayRef<unsigned> Sites) {
+ LPadToCallSiteMap[Sym].append(Sites.begin(), Sites.end());
+}
+
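+/// Return the 1-based type id for the given typeinfo, adding it to the list if
+/// it is not already present; id 0 is reserved for the cleanup action.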
+unsigned MachineFunction::getTypeIDFor(const GlobalValue *TI) {
+ for (unsigned i = 0, N = TypeInfos.size(); i != N; ++i)
+ if (TypeInfos[i] == TI) return i + 1;
+
+ TypeInfos.push_back(TI);
+ return TypeInfos.size();
+}
+
+int MachineFunction::getFilterIDFor(ArrayRef<unsigned> TyIds) {
+ // If the new filter coincides with the tail of an existing filter, then
+ // re-use the existing filter. Folding filters more than this requires
+ // re-ordering filters and/or their elements - probably not worth it.
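+ // For example, if FilterIds already holds {7, 5, 0} (the filter {7, 5} plus
+ // its 0 terminator), a new filter {5} matches the existing tail starting at
+ // index 1 and gets the existing ID -(1 + 1) = -2 instead of being appended.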
+ for (unsigned i : FilterEnds) {
+ unsigned j = TyIds.size();
+
+ while (i && j)
+ if (FilterIds[--i] != TyIds[--j])
+ goto try_next;
+
+ if (!j)
+ // The new filter coincides with range [i, end) of the existing filter.
+ return -(1 + i);
+
+try_next:;
+ }
+
+ // Add the new filter.
+ int FilterID = -(1 + FilterIds.size());
+ FilterIds.reserve(FilterIds.size() + TyIds.size() + 1);
+ llvm::append_range(FilterIds, TyIds);
+ FilterEnds.push_back(FilterIds.size());
+ FilterIds.push_back(0); // terminator
+ return FilterID;
+}
+
+MachineFunction::CallSiteInfoMap::iterator
+MachineFunction::getCallSiteInfo(const MachineInstr *MI) {
+ assert(MI->isCandidateForCallSiteEntry() &&
+ "Call site info refers only to call (MI) candidates");
+
+ if (!Target.Options.EmitCallSiteInfo)
+ return CallSitesInfo.end();
+ return CallSitesInfo.find(MI);
+}
+
+/// Return the call machine instruction or find a call within bundle.
+static const MachineInstr *getCallInstr(const MachineInstr *MI) {
+ if (!MI->isBundle())
+ return MI;
+
+ for (const auto &BMI : make_range(getBundleStart(MI->getIterator()),
+ getBundleEnd(MI->getIterator())))
+ if (BMI.isCandidateForCallSiteEntry())
+ return &BMI;
+
+ llvm_unreachable("Unexpected bundle without a call site candidate");
+}
+
+void MachineFunction::eraseCallSiteInfo(const MachineInstr *MI) {
+ assert(MI->shouldUpdateCallSiteInfo() &&
+ "Call site info refers only to call (MI) candidates or "
+ "candidates inside bundles");
+
+ const MachineInstr *CallMI = getCallInstr(MI);
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(CallMI);
+ if (CSIt == CallSitesInfo.end())
+ return;
+ CallSitesInfo.erase(CSIt);
+}
+
+void MachineFunction::copyCallSiteInfo(const MachineInstr *Old,
+ const MachineInstr *New) {
+ assert(Old->shouldUpdateCallSiteInfo() &&
+ "Call site info refers only to call (MI) candidates or "
+ "candidates inside bundles");
+
+ if (!New->isCandidateForCallSiteEntry())
+ return eraseCallSiteInfo(Old);
+
+ const MachineInstr *OldCallMI = getCallInstr(Old);
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(OldCallMI);
+ if (CSIt == CallSitesInfo.end())
+ return;
+
+ CallSiteInfo CSInfo = CSIt->second;
+ CallSitesInfo[New] = CSInfo;
+}
+
+void MachineFunction::moveCallSiteInfo(const MachineInstr *Old,
+ const MachineInstr *New) {
+ assert(Old->shouldUpdateCallSiteInfo() &&
+ "Call site info refers only to call (MI) candidates or "
+ "candidates inside bundles");
+
+ if (!New->isCandidateForCallSiteEntry())
+ return eraseCallSiteInfo(Old);
+
+ const MachineInstr *OldCallMI = getCallInstr(Old);
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(OldCallMI);
+ if (CSIt == CallSitesInfo.end())
+ return;
+
+ CallSiteInfo CSInfo = std::move(CSIt->second);
+ CallSitesInfo.erase(CSIt);
+ CallSitesInfo[New] = CSInfo;
+}
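
As an aside (not part of the diff): the three helpers above differ only in what happens to the old map entry: erase drops it, copy leaves the old key mapped alongside the new one, and move rekeys the entry. A toy sketch of those semantics over std::map, with hypothetical Instr/Payload stand-ins for MachineInstr* and CallSiteInfo:

// Toy sketch of erase/copy/move call-site-info semantics over a std::map.
// "Instr" and "Payload" are stand-ins; the real map is keyed by MachineInstr*.
#include <cassert>
#include <map>
#include <string>

using Instr = int;
using Payload = std::string;
static std::map<Instr, Payload> CallSitesInfo;

static void copyInfo(Instr Old, Instr New) {
  auto It = CallSitesInfo.find(Old);
  if (It != CallSitesInfo.end())
    CallSitesInfo[New] = It->second;      // old key stays mapped
}

static void moveInfo(Instr Old, Instr New) {
  auto It = CallSitesInfo.find(Old);
  if (It == CallSitesInfo.end())
    return;
  Payload P = std::move(It->second);
  CallSitesInfo.erase(It);                // old key is dropped
  CallSitesInfo[New] = std::move(P);
}

int main() {
  CallSitesInfo[1] = "args-in-regs";
  copyInfo(1, 2);
  assert(CallSitesInfo.count(1) && CallSitesInfo.count(2));
  moveInfo(2, 3);
  assert(!CallSitesInfo.count(2) && CallSitesInfo[3] == "args-in-regs");
}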
+
+void MachineFunction::setDebugInstrNumberingCount(unsigned Num) {
+ DebugInstrNumberingCount = Num;
+}
+
+void MachineFunction::makeDebugValueSubstitution(DebugInstrOperandPair A,
+ DebugInstrOperandPair B,
+ unsigned Subreg) {
+ // Catch any accidental self-loops.
+ assert(A.first != B.first);
+ // Don't allow any substitutions _from_ the memory operand number.
+ assert(A.second != DebugOperandMemNumber);
+
+ DebugValueSubstitutions.push_back({A, B, Subreg});
+}
+
+void MachineFunction::substituteDebugValuesForInst(const MachineInstr &Old,
+ MachineInstr &New,
+ unsigned MaxOperand) {
+ // If the Old instruction wasn't tracked at all, there is no work to do.
+ unsigned OldInstrNum = Old.peekDebugInstrNum();
+ if (!OldInstrNum)
+ return;
+
+  // Iterate over all operands looking for defs to create substitutions for.
+  // Avoid creating new instr numbers unless we actually create a substitution;
+  // unnecessary numbers have no functional effect, but they risk confusing
+  // anyone reading the MIR output.
+  // Examine all the operands, or the first N specified by the caller.
+ MaxOperand = std::min(MaxOperand, Old.getNumOperands());
+ for (unsigned int I = 0; I < MaxOperand; ++I) {
+ const auto &OldMO = Old.getOperand(I);
+ auto &NewMO = New.getOperand(I);
+ (void)NewMO;
+
+ if (!OldMO.isReg() || !OldMO.isDef())
+ continue;
+ assert(NewMO.isDef());
+
+ unsigned NewInstrNum = New.getDebugInstrNum();
+ makeDebugValueSubstitution(std::make_pair(OldInstrNum, I),
+ std::make_pair(NewInstrNum, I));
+ }
+}
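
As an illustration (not part of the diff): each substitution simply records that debug users of (old instruction number, operand index) should now consult (new instruction number, operand index). A standalone sketch of building that table for the def operands of a replaced instruction; the types here are toy stand-ins, and the real entries also carry an optional subregister qualifier:

// Standalone sketch of the (instr number, operand index) substitution table.
// Toy types only; real entries also carry an optional subregister qualifier.
#include <cassert>
#include <utility>
#include <vector>

using InstrOperandPair = std::pair<unsigned, unsigned>;

struct Subst { InstrOperandPair From, To; unsigned Subreg; };
static std::vector<Subst> DebugValueSubstitutions;

// When instruction #OldNum is replaced by #NewNum, redirect every def
// operand index that debug users might reference.
static void substituteDefs(unsigned OldNum, unsigned NewNum,
                           const std::vector<unsigned> &DefOperandIdxs) {
  for (unsigned Idx : DefOperandIdxs)
    DebugValueSubstitutions.push_back({{OldNum, Idx}, {NewNum, Idx}, 0});
}

int main() {
  substituteDefs(/*OldNum=*/5, /*NewNum=*/9, /*DefOperandIdxs=*/{0, 2});
  assert(DebugValueSubstitutions.size() == 2);
  assert(DebugValueSubstitutions[0].From == InstrOperandPair(5, 0));
  assert(DebugValueSubstitutions[0].To == InstrOperandPair(9, 0));
}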
+
+auto MachineFunction::salvageCopySSA(
+ MachineInstr &MI, DenseMap<Register, DebugInstrOperandPair> &DbgPHICache)
+ -> DebugInstrOperandPair {
+ const TargetInstrInfo &TII = *getSubtarget().getInstrInfo();
+
+ // Check whether this copy-like instruction has already been salvaged into
+ // an operand pair.
+ Register Dest;
+ if (auto CopyDstSrc = TII.isCopyInstr(MI)) {
+ Dest = CopyDstSrc->Destination->getReg();
+ } else {
+ assert(MI.isSubregToReg());
+ Dest = MI.getOperand(0).getReg();
+ }
+
+ auto CacheIt = DbgPHICache.find(Dest);
+ if (CacheIt != DbgPHICache.end())
+ return CacheIt->second;
+
+ // Calculate the instruction number to use, or install a DBG_PHI.
+ auto OperandPair = salvageCopySSAImpl(MI);
+ DbgPHICache.insert({Dest, OperandPair});
+ return OperandPair;
+}
+
+auto MachineFunction::salvageCopySSAImpl(MachineInstr &MI)
+ -> DebugInstrOperandPair {
+ MachineRegisterInfo &MRI = getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ const TargetInstrInfo &TII = *getSubtarget().getInstrInfo();
+
+ // Chase the value read by a copy-like instruction back to the instruction
+ // that ultimately _defines_ that value. This may pass:
+ // * Through multiple intermediate copies, including subregister moves /
+ // copies,
+ // * Copies from physical registers that must then be traced back to the
+ // defining instruction,
+ // * Or, physical registers may be live-in to (only) the entry block, which
+ // requires a DBG_PHI to be created.
+ // We can pursue this problem in that order: trace back through copies,
+ // optionally through a physical register, to a defining instruction. We
+ // should never move from physreg to vreg. As we're still in SSA form, no need
+ // to worry about partial definitions of registers.
+
+ // Helper lambda to interpret a copy-like instruction. Takes instruction,
+ // returns the register read and any subregister identifying which part is
+ // read.
+ auto GetRegAndSubreg =
+ [&](const MachineInstr &Cpy) -> std::pair<Register, unsigned> {
+ Register NewReg, OldReg;
+ unsigned SubReg;
+ if (Cpy.isCopy()) {
+ OldReg = Cpy.getOperand(0).getReg();
+ NewReg = Cpy.getOperand(1).getReg();
+ SubReg = Cpy.getOperand(1).getSubReg();
+ } else if (Cpy.isSubregToReg()) {
+ OldReg = Cpy.getOperand(0).getReg();
+ NewReg = Cpy.getOperand(2).getReg();
+ SubReg = Cpy.getOperand(3).getImm();
+ } else {
+ auto CopyDetails = *TII.isCopyInstr(Cpy);
+ const MachineOperand &Src = *CopyDetails.Source;
+ const MachineOperand &Dest = *CopyDetails.Destination;
+ OldReg = Dest.getReg();
+ NewReg = Src.getReg();
+ SubReg = Src.getSubReg();
+ }
+
+ return {NewReg, SubReg};
+ };
+
+ // First seek either the defining instruction, or a copy from a physreg.
+ // During search, the current state is the current copy instruction, and which
+ // register we've read. Accumulate qualifying subregisters into SubregsSeen;
+ // deal with those later.
+ auto State = GetRegAndSubreg(MI);
+ auto CurInst = MI.getIterator();
+ SmallVector<unsigned, 4> SubregsSeen;
+ while (true) {
+ // If we've found a copy from a physreg, first portion of search is over.
+ if (!State.first.isVirtual())
+ break;
+
+ // Record any subregister qualifier.
+ if (State.second)
+ SubregsSeen.push_back(State.second);
+
+ assert(MRI.hasOneDef(State.first));
+ MachineInstr &Inst = *MRI.def_begin(State.first)->getParent();
+ CurInst = Inst.getIterator();
+
+ // Any non-copy instruction is the defining instruction we're seeking.
+ if (!Inst.isCopyLike() && !TII.isCopyInstr(Inst))
+ break;
+ State = GetRegAndSubreg(Inst);
+  }
+
+ // Helper lambda to apply additional subregister substitutions to a known
+ // instruction/operand pair. Adds new (fake) substitutions so that we can
+ // record the subregister. FIXME: this isn't very space efficient if multiple
+ // values are tracked back through the same copies; cache something later.
+ auto ApplySubregisters =
+ [&](DebugInstrOperandPair P) -> DebugInstrOperandPair {
+ for (unsigned Subreg : reverse(SubregsSeen)) {
+ // Fetch a new instruction number, not attached to an actual instruction.
+ unsigned NewInstrNumber = getNewDebugInstrNum();
+ // Add a substitution from the "new" number to the known one, with a
+ // qualifying subreg.
+ makeDebugValueSubstitution({NewInstrNumber, 0}, P, Subreg);
+ // Return the new number; to find the underlying value, consumers need to
+ // deal with the qualifying subreg.
+ P = {NewInstrNumber, 0};
+ }
+ return P;
+ };
+
+ // If we managed to find the defining instruction after COPYs, return an
+ // instruction / operand pair after adding subregister qualifiers.
+ if (State.first.isVirtual()) {
+ // Virtual register def -- we can just look up where this happens.
+ MachineInstr *Inst = MRI.def_begin(State.first)->getParent();
+ for (auto &MO : Inst->all_defs()) {
+ if (MO.getReg() != State.first)
+ continue;
+ return ApplySubregisters({Inst->getDebugInstrNum(), MO.getOperandNo()});
+ }
+
+ llvm_unreachable("Vreg def with no corresponding operand?");
+ }
+
+ // Our search ended in a copy from a physreg: walk back up the function
+ // looking for whatever defines the physreg.
+ assert(CurInst->isCopyLike() || TII.isCopyInstr(*CurInst));
+ State = GetRegAndSubreg(*CurInst);
+ Register RegToSeek = State.first;
+
+ auto RMII = CurInst->getReverseIterator();
+ auto PrevInstrs = make_range(RMII, CurInst->getParent()->instr_rend());
+ for (auto &ToExamine : PrevInstrs) {
+ for (auto &MO : ToExamine.all_defs()) {
+ // Test for operand that defines something aliasing RegToSeek.
+ if (!TRI.regsOverlap(RegToSeek, MO.getReg()))
+ continue;
+
+ return ApplySubregisters(
+ {ToExamine.getDebugInstrNum(), MO.getOperandNo()});
+ }
+ }
+
+ MachineBasicBlock &InsertBB = *CurInst->getParent();
+
+ // We reached the start of the block before finding a defining instruction.
+ // There are numerous scenarios where this can happen:
+ // * Constant physical registers,
+  // * Several intrinsics that allow LLVM-IR to read arbitrary registers,
+ // * Arguments in the entry block,
+ // * Exception handling landing pads.
+ // Validating all of them is too difficult, so just insert a DBG_PHI reading
+ // the variable value at this position, rather than checking it makes sense.
+
+ // Create DBG_PHI for specified physreg.
+ auto Builder = BuildMI(InsertBB, InsertBB.getFirstNonPHI(), DebugLoc(),
+ TII.get(TargetOpcode::DBG_PHI));
+ Builder.addReg(State.first);
+ unsigned NewNum = getNewDebugInstrNum();
+ Builder.addImm(NewNum);
+ return ApplySubregisters({NewNum, 0u});
+}
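
As an illustration of the walk above (not part of the diff): follow the copy chain through virtual registers, stop at the first non-copy def, and only then worry about physregs and the DBG_PHI fallback. A heavily simplified standalone sketch of the vreg-chasing portion, assuming every vreg has exactly one def and modeling each copy as a (defining instruction, source vreg) pair:

// Simplified sketch of chasing a value through SSA copies.
// Toy model: each vreg has one defining "instruction"; copies record a source.
#include <cassert>
#include <map>

struct ToyDef {
  unsigned InstrNum;   // debug instruction number of the defining instr
  bool IsCopy;
  unsigned CopySrc;    // valid only when IsCopy
};

// Returns the instruction number that ultimately defines Reg.
static unsigned chaseCopies(const std::map<unsigned, ToyDef> &Defs,
                            unsigned Reg) {
  for (;;) {
    const ToyDef &D = Defs.at(Reg);   // SSA: exactly one def per vreg
    if (!D.IsCopy)
      return D.InstrNum;              // found the real defining instruction
    Reg = D.CopySrc;                  // keep walking up the copy chain
  }
}

int main() {
  std::map<unsigned, ToyDef> Defs = {
      {/*vreg*/ 3, {/*instr*/ 10, /*IsCopy=*/false, 0}}, // %3 = real def
      {4, {11, true, 3}},                                // %4 = COPY %3
      {5, {12, true, 4}},                                // %5 = COPY %4
  };
  assert(chaseCopies(Defs, 5) == 10);
}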
+
+void MachineFunction::finalizeDebugInstrRefs() {
+ auto *TII = getSubtarget().getInstrInfo();
+
+ auto MakeUndefDbgValue = [&](MachineInstr &MI) {
+ const MCInstrDesc &RefII = TII->get(TargetOpcode::DBG_VALUE_LIST);
+ MI.setDesc(RefII);
+ MI.setDebugValueUndef();
+ };
+
+ DenseMap<Register, DebugInstrOperandPair> ArgDbgPHIs;
+ for (auto &MBB : *this) {
+ for (auto &MI : MBB) {
+ if (!MI.isDebugRef())
+ continue;
+
+ bool IsValidRef = true;
+
+ for (MachineOperand &MO : MI.debug_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+
+ // Some vregs can be deleted as redundant in the meantime. Mark those
+ // as DBG_VALUE $noreg. Additionally, some normal instructions are
+ // quickly deleted, leaving dangling references to vregs with no def.
+ if (Reg == 0 || !RegInfo->hasOneDef(Reg)) {
+ IsValidRef = false;
+ break;
+ }
+
+ assert(Reg.isVirtual());
+ MachineInstr &DefMI = *RegInfo->def_instr_begin(Reg);
+
+ // If we've found a copy-like instruction, follow it back to the
+ // instruction that defines the source value, see salvageCopySSA docs
+ // for why this is important.
+ if (DefMI.isCopyLike() || TII->isCopyInstr(DefMI)) {
+ auto Result = salvageCopySSA(DefMI, ArgDbgPHIs);
+ MO.ChangeToDbgInstrRef(Result.first, Result.second);
+ } else {
+ // Otherwise, identify the operand number that the VReg refers to.
+ unsigned OperandIdx = 0;
+ for (const auto &DefMO : DefMI.operands()) {
+ if (DefMO.isReg() && DefMO.isDef() && DefMO.getReg() == Reg)
+ break;
+ ++OperandIdx;
+ }
+ assert(OperandIdx < DefMI.getNumOperands());
+
+ // Morph this instr ref to point at the given instruction and operand.
+ unsigned ID = DefMI.getDebugInstrNum();
+ MO.ChangeToDbgInstrRef(ID, OperandIdx);
+ }
+ }
+
+ if (!IsValidRef)
+ MakeUndefDbgValue(MI);
+ }
+ }
+}
+
+bool MachineFunction::shouldUseDebugInstrRef() const {
+ // Disable instr-ref at -O0: it's very slow (in compile time). We can still
+ // have optimized code inlined into this unoptimized code, however with
+ // fewer and less aggressive optimizations happening, coverage and accuracy
+ // should not suffer.
+ if (getTarget().getOptLevel() == CodeGenOpt::None)
+ return false;
+
+ // Don't use instr-ref if this function is marked optnone.
+ if (F.hasFnAttribute(Attribute::OptimizeNone))
+ return false;
+
+ if (llvm::debuginfoShouldUseDebugInstrRef(getTarget().getTargetTriple()))
+ return true;
+
+ return false;
+}
+
+bool MachineFunction::useDebugInstrRef() const {
+ return UseDebugInstrRef;
+}
+
+void MachineFunction::setUseDebugInstrRef(bool Use) {
+ UseDebugInstrRef = Use;
+}
+
+// Use one million as a high / reserved number.
+const unsigned MachineFunction::DebugOperandMemNumber = 1000000;
+
+/// \}
+
+//===----------------------------------------------------------------------===//
+// MachineJumpTableInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// Return the size of each entry in the jump table.
+unsigned MachineJumpTableInfo::getEntrySize(const DataLayout &TD) const {
+ // The size of a jump table entry is 4 bytes unless the entry is just the
+ // address of a block, in which case it is the pointer size.
+ switch (getEntryKind()) {
+ case MachineJumpTableInfo::EK_BlockAddress:
+ return TD.getPointerSize();
+ case MachineJumpTableInfo::EK_GPRel64BlockAddress:
+ return 8;
+ case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+ case MachineJumpTableInfo::EK_LabelDifference32:
+ case MachineJumpTableInfo::EK_Custom32:
+ return 4;
+ case MachineJumpTableInfo::EK_Inline:
+ return 0;
+ }
+ llvm_unreachable("Unknown jump table encoding!");
+}
+
+/// Return the alignment of each entry in the jump table.
+unsigned MachineJumpTableInfo::getEntryAlignment(const DataLayout &TD) const {
+ // The alignment of a jump table entry is the alignment of int32 unless the
+ // entry is just the address of a block, in which case it is the pointer
+ // alignment.
+ switch (getEntryKind()) {
+ case MachineJumpTableInfo::EK_BlockAddress:
+ return TD.getPointerABIAlignment(0).value();
+ case MachineJumpTableInfo::EK_GPRel64BlockAddress:
+ return TD.getABIIntegerTypeAlignment(64).value();
+ case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+ case MachineJumpTableInfo::EK_LabelDifference32:
+ case MachineJumpTableInfo::EK_Custom32:
+ return TD.getABIIntegerTypeAlignment(32).value();
+ case MachineJumpTableInfo::EK_Inline:
+ return 1;
+ }
+ llvm_unreachable("Unknown jump table encoding!");
+}
+
+/// Create a new jump table entry in the jump table info.
+unsigned MachineJumpTableInfo::createJumpTableIndex(
+ const std::vector<MachineBasicBlock*> &DestBBs) {
+ assert(!DestBBs.empty() && "Cannot create an empty jump table!");
+ JumpTables.push_back(MachineJumpTableEntry(DestBBs));
+ return JumpTables.size()-1;
+}
+
+/// If Old is the target of any jump tables, update the jump tables to branch
+/// to New instead.
+bool MachineJumpTableInfo::ReplaceMBBInJumpTables(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ assert(Old != New && "Not making a change?");
+ bool MadeChange = false;
+ for (size_t i = 0, e = JumpTables.size(); i != e; ++i)
+    MadeChange |= ReplaceMBBInJumpTable(i, Old, New);
+ return MadeChange;
+}
+
+/// If MBB is present in any jump tables, remove it.
+bool MachineJumpTableInfo::RemoveMBBFromJumpTables(MachineBasicBlock *MBB) {
+ bool MadeChange = false;
+ for (MachineJumpTableEntry &JTE : JumpTables) {
+ auto removeBeginItr = std::remove(JTE.MBBs.begin(), JTE.MBBs.end(), MBB);
+ MadeChange |= (removeBeginItr != JTE.MBBs.end());
+ JTE.MBBs.erase(removeBeginItr, JTE.MBBs.end());
+ }
+ return MadeChange;
+}
+
+/// If Old is a target of the jump tables, update the jump table to branch to
+/// New instead.
+bool MachineJumpTableInfo::ReplaceMBBInJumpTable(unsigned Idx,
+ MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ assert(Old != New && "Not making a change?");
+ bool MadeChange = false;
+ MachineJumpTableEntry &JTE = JumpTables[Idx];
+ for (MachineBasicBlock *&MBB : JTE.MBBs)
+ if (MBB == Old) {
+ MBB = New;
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+void MachineJumpTableInfo::print(raw_ostream &OS) const {
+ if (JumpTables.empty()) return;
+
+ OS << "Jump Tables:\n";
+
+ for (unsigned i = 0, e = JumpTables.size(); i != e; ++i) {
+ OS << printJumpTableEntryReference(i) << ':';
+ for (const MachineBasicBlock *MBB : JumpTables[i].MBBs)
+ OS << ' ' << printMBBReference(*MBB);
+    if (i != e - 1)
+ OS << '\n';
+ }
+
+ OS << '\n';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineJumpTableInfo::dump() const { print(dbgs()); }
+#endif
+
+Printable llvm::printJumpTableEntryReference(unsigned Idx) {
+ return Printable([Idx](raw_ostream &OS) { OS << "%jump-table." << Idx; });
+}
+
+//===----------------------------------------------------------------------===//
+// MachineConstantPool implementation
+//===----------------------------------------------------------------------===//
+
+void MachineConstantPoolValue::anchor() {}
+
+unsigned MachineConstantPoolValue::getSizeInBytes(const DataLayout &DL) const {
+ return DL.getTypeAllocSize(Ty);
+}
+
+unsigned MachineConstantPoolEntry::getSizeInBytes(const DataLayout &DL) const {
+ if (isMachineConstantPoolEntry())
+ return Val.MachineCPVal->getSizeInBytes(DL);
+ return DL.getTypeAllocSize(Val.ConstVal->getType());
+}
+
+bool MachineConstantPoolEntry::needsRelocation() const {
+ if (isMachineConstantPoolEntry())
+ return true;
+ return Val.ConstVal->needsDynamicRelocation();
+}
+
+SectionKind
+MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const {
+ if (needsRelocation())
+ return SectionKind::getReadOnlyWithRel();
+ switch (getSizeInBytes(*DL)) {
+ case 4:
+ return SectionKind::getMergeableConst4();
+ case 8:
+ return SectionKind::getMergeableConst8();
+ case 16:
+ return SectionKind::getMergeableConst16();
+ case 32:
+ return SectionKind::getMergeableConst32();
+ default:
+ return SectionKind::getReadOnly();
+ }
+}
+
+MachineConstantPool::~MachineConstantPool() {
+ // A constant may be a member of both Constants and MachineCPVsSharingEntries,
+ // so keep track of which we've deleted to avoid double deletions.
+ DenseSet<MachineConstantPoolValue*> Deleted;
+ for (const MachineConstantPoolEntry &C : Constants)
+ if (C.isMachineConstantPoolEntry()) {
+ Deleted.insert(C.Val.MachineCPVal);
+ delete C.Val.MachineCPVal;
+ }
+ for (MachineConstantPoolValue *CPV : MachineCPVsSharingEntries) {
+ if (Deleted.count(CPV) == 0)
+ delete CPV;
+ }
+}
+
+/// Test whether the two given constants can be allocated to the same constant
+/// pool entry, i.e. the entry referenced by \p A.
+static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
+ const DataLayout &DL) {
+ // Handle the trivial case quickly.
+ if (A == B) return true;
+
+ // If they have the same type but weren't the same constant, quickly
+ // reject them.
+ if (A->getType() == B->getType()) return false;
+
+ // We can't handle structs or arrays.
+ if (isa<StructType>(A->getType()) || isa<ArrayType>(A->getType()) ||
+ isa<StructType>(B->getType()) || isa<ArrayType>(B->getType()))
+ return false;
+
+ // For now, only support constants with the same size.
+ uint64_t StoreSize = DL.getTypeStoreSize(A->getType());
+ if (StoreSize != DL.getTypeStoreSize(B->getType()) || StoreSize > 128)
+ return false;
+
+ bool ContainsUndefOrPoisonA = A->containsUndefOrPoisonElement();
+
+ Type *IntTy = IntegerType::get(A->getContext(), StoreSize*8);
+
+ // Try constant folding a bitcast of both instructions to an integer. If we
+ // get two identical ConstantInt's, then we are good to share them. We use
+ // the constant folding APIs to do this so that we get the benefit of
+ // DataLayout.
+ if (isa<PointerType>(A->getType()))
+ A = ConstantFoldCastOperand(Instruction::PtrToInt,
+ const_cast<Constant *>(A), IntTy, DL);
+ else if (A->getType() != IntTy)
+ A = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(A),
+ IntTy, DL);
+ if (isa<PointerType>(B->getType()))
+ B = ConstantFoldCastOperand(Instruction::PtrToInt,
+ const_cast<Constant *>(B), IntTy, DL);
+ else if (B->getType() != IntTy)
+ B = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(B),
+ IntTy, DL);
+
+ if (A != B)
+ return false;
+
+ // Constants only safely match if A doesn't contain undef/poison.
+  // As we'll be reusing A, it doesn't matter if B contains undef/poison.
+ // TODO: Handle cases where A and B have the same undef/poison elements.
+ // TODO: Merge A and B with mismatching undef/poison elements.
+ return !ContainsUndefOrPoisonA;
+}
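
As an illustration (not part of the diff): the net effect of the folded-to-integer comparison is that two constants of different types can share one pool slot when they lower to the same bytes, e.g. float 1.0 and i32 0x3F800000. A rough standalone stand-in for that "same store size, same bits" test, comparing host representations with memcmp rather than constant-folding through DataLayout:

// Rough standalone stand-in for the "same size, same bits" sharing test.
// The real code uses constant folding through DataLayout; this sketch just
// compares the in-memory representation of two host values.
#include <cassert>
#include <cstdint>
#include <cstring>

template <typename A, typename B>
static bool canShareBits(const A &X, const B &Y) {
  if (sizeof(A) != sizeof(B))
    return false;                       // only identical store sizes can share
  return std::memcmp(&X, &Y, sizeof(A)) == 0;
}

int main() {
  float F = 1.0f;
  std::uint32_t I = 0x3F800000u;        // IEEE-754 encoding of 1.0f
  assert(canShareBits(F, I));           // same 4 bytes -> shareable
  std::uint32_t J = 0x3F800001u;
  assert(!canShareBits(F, J));          // any differing bit -> separate entries
}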
+
+/// Create a new entry in the constant pool or return an existing one.
+/// The caller must specify the minimum required alignment for the object.
+unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
+ Align Alignment) {
+ if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+ // Check to see if we already have this constant.
+ //
+ // FIXME, this could be made much more efficient for large constant pools.
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+ if (!Constants[i].isMachineConstantPoolEntry() &&
+ CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, DL)) {
+ if (Constants[i].getAlign() < Alignment)
+ Constants[i].Alignment = Alignment;
+ return i;
+ }
+
+ Constants.push_back(MachineConstantPoolEntry(C, Alignment));
+ return Constants.size()-1;
+}
+
+unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V,
+ Align Alignment) {
+ if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+ // Check to see if we already have this constant.
+ //
+ // FIXME, this could be made much more efficient for large constant pools.
+ int Idx = V->getExistingMachineCPValue(this, Alignment);
+ if (Idx != -1) {
+ MachineCPVsSharingEntries.insert(V);
+ return (unsigned)Idx;
+ }
+
+ Constants.push_back(MachineConstantPoolEntry(V, Alignment));
+ return Constants.size()-1;
+}
+
+void MachineConstantPool::print(raw_ostream &OS) const {
+ if (Constants.empty()) return;
+
+ OS << "Constant Pool:\n";
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ OS << " cp#" << i << ": ";
+ if (Constants[i].isMachineConstantPoolEntry())
+ Constants[i].Val.MachineCPVal->print(OS);
+ else
+ Constants[i].Val.ConstVal->printAsOperand(OS, /*PrintType=*/false);
+ OS << ", align=" << Constants[i].getAlign().value();
+ OS << "\n";
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Template specialization for MachineFunction implementation of
+// ProfileSummaryInfo::getEntryCount().
+//===----------------------------------------------------------------------===//
+template <>
+std::optional<Function::ProfileCount>
+ProfileSummaryInfo::getEntryCount<llvm::MachineFunction>(
+ const llvm::MachineFunction *F) const {
+ return F->getFunction().getEntryCount();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineConstantPool::dump() const { print(dbgs()); }
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionPass.cpp
new file mode 100644
index 000000000000..3a1e1720be9c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionPass.cpp
@@ -0,0 +1,188 @@
+//===-- MachineFunctionPass.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MachineFunctionPass members.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PrintPasses.h"
+
+using namespace llvm;
+using namespace ore;
+
+Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O,
+ const std::string &Banner) const {
+ return createMachineFunctionPrinterPass(O, Banner);
+}
+
+bool MachineFunctionPass::runOnFunction(Function &F) {
+ // Do not codegen any 'available_externally' functions at all, they have
+ // definitions outside the translation unit.
+ if (F.hasAvailableExternallyLinkage())
+ return false;
+
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
+
+ MachineFunctionProperties &MFProps = MF.getProperties();
+
+#ifndef NDEBUG
+ if (!MFProps.verifyRequiredProperties(RequiredProperties)) {
+ errs() << "MachineFunctionProperties required by " << getPassName()
+ << " pass are not met by function " << F.getName() << ".\n"
+ << "Required properties: ";
+ RequiredProperties.print(errs());
+ errs() << "\nCurrent properties: ";
+ MFProps.print(errs());
+ errs() << "\n";
+ llvm_unreachable("MachineFunctionProperties check failed");
+ }
+#endif
+  // MI counts of the function before and after the pass, used for size
+  // remarks.
+  unsigned CountBefore, CountAfter;
+
+ // Check if the user asked for size remarks.
+ bool ShouldEmitSizeRemarks =
+ F.getParent()->shouldEmitInstrCountChangedRemark();
+
+ // If we want size remarks, collect the number of MachineInstrs in our
+ // MachineFunction before the pass runs.
+ if (ShouldEmitSizeRemarks)
+ CountBefore = MF.getInstructionCount();
+
+ // For --print-changed, if the function name is a candidate, save the
+ // serialized MF to be compared later.
+ SmallString<0> BeforeStr, AfterStr;
+ StringRef PassID;
+ if (PrintChanged != ChangePrinter::None) {
+ if (const PassInfo *PI = Pass::lookupPassInfo(getPassID()))
+ PassID = PI->getPassArgument();
+ }
+ const bool IsInterestingPass = isPassInPrintList(PassID);
+ const bool ShouldPrintChanged = PrintChanged != ChangePrinter::None &&
+ IsInterestingPass &&
+ isFunctionInPrintList(MF.getName());
+ if (ShouldPrintChanged) {
+ raw_svector_ostream OS(BeforeStr);
+ MF.print(OS);
+ }
+
+ bool RV = runOnMachineFunction(MF);
+
+ if (ShouldEmitSizeRemarks) {
+ // We wanted size remarks. Check if there was a change to the number of
+ // MachineInstrs in the module. Emit a remark if there was a change.
+ CountAfter = MF.getInstructionCount();
+ if (CountBefore != CountAfter) {
+ MachineOptimizationRemarkEmitter MORE(MF, nullptr);
+ MORE.emit([&]() {
+ int64_t Delta = static_cast<int64_t>(CountAfter) -
+ static_cast<int64_t>(CountBefore);
+ MachineOptimizationRemarkAnalysis R("size-info", "FunctionMISizeChange",
+ MF.getFunction().getSubprogram(),
+ &MF.front());
+ R << NV("Pass", getPassName())
+ << ": Function: " << NV("Function", F.getName()) << ": "
+ << "MI Instruction count changed from "
+ << NV("MIInstrsBefore", CountBefore) << " to "
+ << NV("MIInstrsAfter", CountAfter)
+ << "; Delta: " << NV("Delta", Delta);
+ return R;
+ });
+ }
+ }
+
+ MFProps.set(SetProperties);
+ MFProps.reset(ClearedProperties);
+
+ // For --print-changed, print if the serialized MF has changed. Modes other
+ // than quiet/verbose are unimplemented and treated the same as 'quiet'.
+ if (ShouldPrintChanged || !IsInterestingPass) {
+ if (ShouldPrintChanged) {
+ raw_svector_ostream OS(AfterStr);
+ MF.print(OS);
+ }
+ if (IsInterestingPass && BeforeStr != AfterStr) {
+ errs() << ("*** IR Dump After " + getPassName() + " (" + PassID +
+ ") on " + MF.getName() + " ***\n");
+ switch (PrintChanged) {
+ case ChangePrinter::None:
+ llvm_unreachable("");
+ case ChangePrinter::Quiet:
+ case ChangePrinter::Verbose:
+ case ChangePrinter::DotCfgQuiet: // unimplemented
+ case ChangePrinter::DotCfgVerbose: // unimplemented
+ errs() << AfterStr;
+ break;
+ case ChangePrinter::DiffQuiet:
+ case ChangePrinter::DiffVerbose:
+ case ChangePrinter::ColourDiffQuiet:
+ case ChangePrinter::ColourDiffVerbose: {
+ bool Color = llvm::is_contained(
+ {ChangePrinter::ColourDiffQuiet, ChangePrinter::ColourDiffVerbose},
+ PrintChanged.getValue());
+ StringRef Removed = Color ? "\033[31m-%l\033[0m\n" : "-%l\n";
+ StringRef Added = Color ? "\033[32m+%l\033[0m\n" : "+%l\n";
+ StringRef NoChange = " %l\n";
+ errs() << doSystemDiff(BeforeStr, AfterStr, Removed, Added, NoChange);
+ break;
+ }
+ }
+ } else if (llvm::is_contained({ChangePrinter::Verbose,
+ ChangePrinter::DiffVerbose,
+ ChangePrinter::ColourDiffVerbose},
+ PrintChanged.getValue())) {
+ const char *Reason =
+ IsInterestingPass ? " omitted because no change" : " filtered out";
+ errs() << "*** IR Dump After " << getPassName();
+ if (!PassID.empty())
+ errs() << " (" << PassID << ")";
+ errs() << " on " << MF.getName() + Reason + " ***\n";
+ }
+ }
+ return RV;
+}
+
+void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+
+ // MachineFunctionPass preserves all LLVM IR passes, but there's no
+ // high-level way to express this. Instead, just list a bunch of
+ // passes explicitly. This does not include setPreservesCFG,
+ // because CodeGen overloads that to mean preserving the MachineBasicBlock
+ // CFG in addition to the LLVM IR CFG.
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<DominanceFrontierWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<IVUsersWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+
+ FunctionPass::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
new file mode 100644
index 000000000000..c31c065b1976
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -0,0 +1,71 @@
+//===-- MachineFunctionPrinterPass.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineFunctionPrinterPass implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/PrintPasses.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+/// MachineFunctionPrinterPass - This is a pass to dump the IR of a
+/// MachineFunction.
+///
+struct MachineFunctionPrinterPass : public MachineFunctionPass {
+ static char ID;
+
+ raw_ostream &OS;
+ const std::string Banner;
+
+ MachineFunctionPrinterPass() : MachineFunctionPass(ID), OS(dbgs()) { }
+ MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner)
+ : MachineFunctionPass(ID), OS(os), Banner(banner) {}
+
+ StringRef getPassName() const override { return "MachineFunction Printer"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addUsedIfAvailable<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (!isFunctionInPrintList(MF.getName()))
+ return false;
+ OS << "# " << Banner << ":\n";
+ MF.print(OS, getAnalysisIfAvailable<SlotIndexes>());
+ return false;
+ }
+};
+
+char MachineFunctionPrinterPass::ID = 0;
+}
+
+char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
+INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
+ "Machine Function Printer", false, false)
+
+namespace llvm {
+/// Returns a newly-created MachineFunction Printer pass. The
+/// default banner is empty.
+///
+MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
+ const std::string &Banner){
+ return new MachineFunctionPrinterPass(OS, Banner);
+}
+
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
new file mode 100644
index 000000000000..fbc071536d22
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
@@ -0,0 +1,224 @@
+//===-- MachineFunctionSplitter.cpp - Split machine functions -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// Uses profile information to split out cold blocks.
+//
+// This pass splits out cold machine basic blocks from the parent function. This
+// implementation leverages the basic block section framework. Blocks marked
+// cold by this pass are grouped together in a separate section prefixed with
+// ".text.unlikely.*". The linker can then group these together as a cold
+// section. The split part of the function is a contiguous region identified by
+// the symbol "foo.cold". Grouping all cold blocks across functions together
+// decreases fragmentation and improves icache and itlb utilization. Note that
+// the overall changes to the binary size are negligible; only a small number of
+// additional jump instructions may be introduced.
+//
+// For the original RFC of this pass please see
+// https://groups.google.com/d/msg/llvm-dev/RUegaMg-iqc/wFAVxa6fCgAJ
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/EHUtils.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/BasicBlockSectionUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include <optional>
+
+using namespace llvm;
+
+// FIXME: This cutoff value is CPU dependent and should be moved to
+// TargetTransformInfo once we consider enabling this on other platforms.
+// The value is expressed as a ProfileSummaryInfo integer percentile cutoff.
+// Defaults to 999950, i.e. all blocks colder than 99.995 percentile are split.
+// The default was empirically determined to be optimal when considering cutoff
+// values between 99%-ile to 100%-ile with respect to iTLB and icache metrics on
+// Intel CPUs.
+static cl::opt<unsigned>
+ PercentileCutoff("mfs-psi-cutoff",
+ cl::desc("Percentile profile summary cutoff used to "
+ "determine cold blocks. Unused if set to zero."),
+ cl::init(999950), cl::Hidden);
+
+static cl::opt<unsigned> ColdCountThreshold(
+ "mfs-count-threshold",
+ cl::desc(
+        "Minimum execution count for a block to be kept in the hot section."),
+ cl::init(1), cl::Hidden);
+
+static cl::opt<bool> SplitAllEHCode(
+ "mfs-split-ehcode",
+    cl::desc("Splits all EH code and its descendants by default."),
+ cl::init(false), cl::Hidden);
+
+namespace {
+
+class MachineFunctionSplitter : public MachineFunctionPass {
+public:
+ static char ID;
+ MachineFunctionSplitter() : MachineFunctionPass(ID) {
+ initializeMachineFunctionSplitterPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Machine Function Splitter Transformation";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+};
+} // end anonymous namespace
+
+/// setDescendantEHBlocksCold - Mark all EH pads, and blocks reachable only
+/// from EH pads, as cold. This helps treat EH pads as statically cold instead
+/// of relying on profile data.
+static void setDescendantEHBlocksCold(MachineFunction &MF) {
+ DenseSet<MachineBasicBlock *> EHBlocks;
+ computeEHOnlyBlocks(MF, EHBlocks);
+ for (auto Block : EHBlocks) {
+ Block->setSectionID(MBBSectionID::ColdSectionID);
+ }
+}
+
+static void finishAdjustingBasicBlocksAndLandingPads(MachineFunction &MF) {
+ auto Comparator = [](const MachineBasicBlock &X, const MachineBasicBlock &Y) {
+ return X.getSectionID().Type < Y.getSectionID().Type;
+ };
+ llvm::sortBasicBlocksAndUpdateBranches(MF, Comparator);
+ llvm::avoidZeroOffsetLandingPad(MF);
+}
+
+static bool isColdBlock(const MachineBasicBlock &MBB,
+ const MachineBlockFrequencyInfo *MBFI,
+ ProfileSummaryInfo *PSI) {
+ std::optional<uint64_t> Count = MBFI->getBlockProfileCount(&MBB);
+ // For instrumentation profiles and sample profiles, we use different ways
+ // to judge whether a block is cold and should be split.
+ if (PSI->hasInstrumentationProfile() || PSI->hasCSInstrumentationProfile()) {
+    // If using an instrumentation profile, which is deemed "accurate", no
+    // count means cold.
+ if (!Count)
+ return true;
+ if (PercentileCutoff > 0)
+ return PSI->isColdCountNthPercentile(PercentileCutoff, *Count);
+ // Fallthrough to end of function.
+ } else if (PSI->hasSampleProfile()) {
+    // For a sample profile, no count means "do not judge coldness".
+ if (!Count)
+ return false;
+ }
+
+ return (*Count < ColdCountThreshold);
+}
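
Summarizing the decision above (illustration only, not part of the diff): with an instrumentation or CS-instrumentation profile a missing count means cold and the percentile cutoff is consulted when non-zero; with a sample profile a missing count means "don't judge"; otherwise the raw count is compared against the count threshold. A standalone sketch of that decision table, with the profile kind and thresholds passed in explicitly instead of read from ProfileSummaryInfo and the mfs-* options:

// Standalone sketch of the coldness decision in isColdBlock.
// Profile kind and thresholds are parameters here; in the pass they come from
// ProfileSummaryInfo and the mfs-psi-cutoff / mfs-count-threshold options.
#include <cassert>
#include <cstdint>
#include <functional>
#include <optional>

enum class ProfileKind { Instrumentation, Sample };

static bool isColdBlock(std::optional<uint64_t> Count, ProfileKind Kind,
                        unsigned PercentileCutoff, uint64_t ColdCountThreshold,
                        const std::function<bool(uint64_t)> &IsColdAtCutoff) {
  if (Kind == ProfileKind::Instrumentation) {
    if (!Count)
      return true;                      // accurate profile: no count == cold
    if (PercentileCutoff > 0)
      return IsColdAtCutoff(*Count);    // defer to the percentile summary
  } else {                              // sample profile
    if (!Count)
      return false;                     // no count: don't judge coldness
  }
  return *Count < ColdCountThreshold;   // fallback: raw execution count
}

int main() {
  auto NeverColdByCutoff = [](uint64_t) { return false; };
  assert(isColdBlock(std::nullopt, ProfileKind::Instrumentation, 0, 1,
                     NeverColdByCutoff));
  assert(!isColdBlock(std::nullopt, ProfileKind::Sample, 0, 1,
                      NeverColdByCutoff));
  assert(isColdBlock(0, ProfileKind::Sample, 0, 1, NeverColdByCutoff));
}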
+
+bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
+ // We target functions with profile data. Static information in the form
+ // of exception handling code may be split to cold if user passes the
+ // mfs-split-ehcode flag.
+ bool UseProfileData = MF.getFunction().hasProfileData();
+ if (!UseProfileData && !SplitAllEHCode)
+ return false;
+
+ // TODO: We don't split functions where a section attribute has been set
+ // since the split part may not be placed in a contiguous region. It may also
+ // be more beneficial to augment the linker to ensure contiguous layout of
+ // split functions within the same section as specified by the attribute.
+ if (MF.getFunction().hasSection() ||
+ MF.getFunction().hasFnAttribute("implicit-section-name"))
+ return false;
+
+ // We don't want to proceed further for cold functions
+ // or functions of unknown hotness. Lukewarm functions have no prefix.
+ std::optional<StringRef> SectionPrefix = MF.getFunction().getSectionPrefix();
+ if (SectionPrefix &&
+ (*SectionPrefix == "unlikely" || *SectionPrefix == "unknown")) {
+ return false;
+ }
+
+ // Renumbering blocks here preserves the order of the blocks as
+ // sortBasicBlocksAndUpdateBranches uses the numeric identifier to sort
+ // blocks. Preserving the order of blocks is essential to retaining decisions
+ // made by prior passes such as MachineBlockPlacement.
+ MF.RenumberBlocks();
+ MF.setBBSectionsType(BasicBlockSection::Preset);
+
+ MachineBlockFrequencyInfo *MBFI = nullptr;
+ ProfileSummaryInfo *PSI = nullptr;
+ if (UseProfileData) {
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+    // If we don't have a high-quality profile (a sample profile is not deemed
+    // high quality) and the function is not hot, return early: with a
+    // low-quality profile we can only trust hot functions.
+ if (PSI->hasSampleProfile() && !PSI->isFunctionHotInCallGraph(&MF, *MBFI)) {
+      // Split all EH code and its descendants statically by default.
+ if (SplitAllEHCode)
+ setDescendantEHBlocksCold(MF);
+ finishAdjustingBasicBlocksAndLandingPads(MF);
+ return true;
+ }
+ }
+
+ SmallVector<MachineBasicBlock *, 2> LandingPads;
+ for (auto &MBB : MF) {
+ if (MBB.isEntryBlock())
+ continue;
+
+ if (MBB.isEHPad())
+ LandingPads.push_back(&MBB);
+ else if (UseProfileData && isColdBlock(MBB, MBFI, PSI) && !SplitAllEHCode)
+ MBB.setSectionID(MBBSectionID::ColdSectionID);
+ }
+
+  // Split all EH code and its descendants statically by default.
+ if (SplitAllEHCode)
+ setDescendantEHBlocksCold(MF);
+  // We only split out EH pads if all of them are cold.
+ else {
+ // Here we have UseProfileData == true.
+ bool HasHotLandingPads = false;
+ for (const MachineBasicBlock *LP : LandingPads) {
+ if (!isColdBlock(*LP, MBFI, PSI))
+ HasHotLandingPads = true;
+ }
+ if (!HasHotLandingPads) {
+ for (MachineBasicBlock *LP : LandingPads)
+ LP->setSectionID(MBBSectionID::ColdSectionID);
+ }
+ }
+
+ finishAdjustingBasicBlocksAndLandingPads(MF);
+ return true;
+}
+
+void MachineFunctionSplitter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+}
+
+char MachineFunctionSplitter::ID = 0;
+INITIALIZE_PASS(MachineFunctionSplitter, "machine-function-splitter",
+ "Split machine functions using profile information", false,
+ false)
+
+MachineFunctionPass *llvm::createMachineFunctionSplitterPass() {
+ return new MachineFunctionSplitter();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp
new file mode 100644
index 000000000000..a9309487a7a7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineInstr.cpp
@@ -0,0 +1,2462 @@
+//===- lib/CodeGen/MachineInstr.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Methods common to all machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+
+using namespace llvm;
+
+static const MachineFunction *getMFIfAvailable(const MachineInstr &MI) {
+ if (const MachineBasicBlock *MBB = MI.getParent())
+ if (const MachineFunction *MF = MBB->getParent())
+ return MF;
+ return nullptr;
+}
+
+// Try to crawl up to the machine function and get TRI and IntrinsicInfo from
+// it.
+static void tryToGetTargetInfo(const MachineInstr &MI,
+ const TargetRegisterInfo *&TRI,
+ const MachineRegisterInfo *&MRI,
+ const TargetIntrinsicInfo *&IntrinsicInfo,
+ const TargetInstrInfo *&TII) {
+
+ if (const MachineFunction *MF = getMFIfAvailable(MI)) {
+ TRI = MF->getSubtarget().getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ IntrinsicInfo = MF->getTarget().getIntrinsicInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ }
+}
+
+void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) {
+ for (MCPhysReg ImpDef : MCID->implicit_defs())
+ addOperand(MF, MachineOperand::CreateReg(ImpDef, true, true));
+ for (MCPhysReg ImpUse : MCID->implicit_uses())
+ addOperand(MF, MachineOperand::CreateReg(ImpUse, false, true));
+}
+
+/// MachineInstr ctor - This constructor creates a MachineInstr and adds the
+/// implicit operands. It reserves space for the number of operands specified by
+/// the MCInstrDesc.
+MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &TID,
+ DebugLoc DL, bool NoImp)
+ : MCID(&TID), NumOperands(0), Flags(0), AsmPrinterFlags(0),
+ DbgLoc(std::move(DL)), DebugInstrNum(0) {
+ assert(DbgLoc.hasTrivialDestructor() && "Expected trivial destructor");
+
+ // Reserve space for the expected number of operands.
+ if (unsigned NumOps = MCID->getNumOperands() + MCID->implicit_defs().size() +
+ MCID->implicit_uses().size()) {
+ CapOperands = OperandCapacity::get(NumOps);
+ Operands = MF.allocateOperandArray(CapOperands);
+ }
+
+ if (!NoImp)
+ addImplicitDefUseOperands(MF);
+}
+
+/// MachineInstr ctor - Copies MachineInstr arg exactly.
+/// Does not copy the number from debug instruction numbering, to preserve
+/// uniqueness.
+MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
+ : MCID(&MI.getDesc()), NumOperands(0), Flags(0), AsmPrinterFlags(0),
+ Info(MI.Info), DbgLoc(MI.getDebugLoc()), DebugInstrNum(0) {
+ assert(DbgLoc.hasTrivialDestructor() && "Expected trivial destructor");
+
+ CapOperands = OperandCapacity::get(MI.getNumOperands());
+ Operands = MF.allocateOperandArray(CapOperands);
+
+ // Copy operands.
+ for (const MachineOperand &MO : MI.operands())
+ addOperand(MF, MO);
+
+ // Replicate ties between the operands, which addOperand was not
+ // able to do reliably.
+ for (unsigned i = 0, e = getNumOperands(); i < e; ++i) {
+ MachineOperand &NewMO = getOperand(i);
+ const MachineOperand &OrigMO = MI.getOperand(i);
+ NewMO.TiedTo = OrigMO.TiedTo;
+ }
+
+ // Copy all the sensible flags.
+ setFlags(MI.Flags);
+}
+
+void MachineInstr::moveBefore(MachineInstr *MovePos) {
+ MovePos->getParent()->splice(MovePos, getParent(), getIterator());
+}
+
+/// getRegInfo - If this instruction is embedded into a MachineFunction,
+/// return the MachineRegisterInfo object for the current function, otherwise
+/// return null.
+MachineRegisterInfo *MachineInstr::getRegInfo() {
+ if (MachineBasicBlock *MBB = getParent())
+ return &MBB->getParent()->getRegInfo();
+ return nullptr;
+}
+
+const MachineRegisterInfo *MachineInstr::getRegInfo() const {
+ if (const MachineBasicBlock *MBB = getParent())
+ return &MBB->getParent()->getRegInfo();
+ return nullptr;
+}
+
+void MachineInstr::removeRegOperandsFromUseLists(MachineRegisterInfo &MRI) {
+ for (MachineOperand &MO : operands())
+ if (MO.isReg())
+ MRI.removeRegOperandFromUseList(&MO);
+}
+
+void MachineInstr::addRegOperandsToUseLists(MachineRegisterInfo &MRI) {
+ for (MachineOperand &MO : operands())
+ if (MO.isReg())
+ MRI.addRegOperandToUseList(&MO);
+}
+
+void MachineInstr::addOperand(const MachineOperand &Op) {
+ MachineBasicBlock *MBB = getParent();
+ assert(MBB && "Use MachineInstrBuilder to add operands to dangling instrs");
+ MachineFunction *MF = MBB->getParent();
+ assert(MF && "Use MachineInstrBuilder to add operands to dangling instrs");
+ addOperand(*MF, Op);
+}
+
+/// Move NumOps MachineOperands from Src to Dst, with support for overlapping
+/// ranges. If MRI is non-null also update use-def chains.
+static void moveOperands(MachineOperand *Dst, MachineOperand *Src,
+ unsigned NumOps, MachineRegisterInfo *MRI) {
+ if (MRI)
+ return MRI->moveOperands(Dst, Src, NumOps);
+ // MachineOperand is a trivially copyable type so we can just use memmove.
+ assert(Dst && Src && "Unknown operands");
+ std::memmove(Dst, Src, NumOps * sizeof(MachineOperand));
+}
+
+/// addOperand - Add the specified operand to the instruction. If it is an
+/// implicit operand, it is added to the end of the operand list. If it is
+/// an explicit operand it is added at the end of the explicit operand list
+/// (before the first implicit operand).
+void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
+ assert(isUInt<LLVM_MI_NUMOPERANDS_BITS>(NumOperands + 1) &&
+ "Cannot add more operands.");
+ assert(MCID && "Cannot add operands before providing an instr descriptor");
+
+ // Check if we're adding one of our existing operands.
+ if (&Op >= Operands && &Op < Operands + NumOperands) {
+ // This is unusual: MI->addOperand(MI->getOperand(i)).
+ // If adding Op requires reallocating or moving existing operands around,
+ // the Op reference could go stale. Support it by copying Op.
+ MachineOperand CopyOp(Op);
+ return addOperand(MF, CopyOp);
+ }
+
+ // Find the insert location for the new operand. Implicit registers go at
+ // the end, everything else goes before the implicit regs.
+ //
+ // FIXME: Allow mixed explicit and implicit operands on inline asm.
+ // InstrEmitter::EmitSpecialNode() is marking inline asm clobbers as
+ // implicit-defs, but they must not be moved around. See the FIXME in
+ // InstrEmitter.cpp.
+ unsigned OpNo = getNumOperands();
+ bool isImpReg = Op.isReg() && Op.isImplicit();
+ if (!isImpReg && !isInlineAsm()) {
+ while (OpNo && Operands[OpNo-1].isReg() && Operands[OpNo-1].isImplicit()) {
+ --OpNo;
+ assert(!Operands[OpNo].isTied() && "Cannot move tied operands");
+ }
+ }
+
+  // OpNo now points to the desired insertion point. Unless this is a variadic
+ // instruction, only implicit regs are allowed beyond MCID->getNumOperands().
+ // RegMask operands go between the explicit and implicit operands.
+ assert((MCID->isVariadic() || OpNo < MCID->getNumOperands() ||
+ Op.isValidExcessOperand()) &&
+ "Trying to add an operand to a machine instr that is already done!");
+
+ MachineRegisterInfo *MRI = getRegInfo();
+
+ // Determine if the Operands array needs to be reallocated.
+ // Save the old capacity and operand array.
+ OperandCapacity OldCap = CapOperands;
+ MachineOperand *OldOperands = Operands;
+ if (!OldOperands || OldCap.getSize() == getNumOperands()) {
+ CapOperands = OldOperands ? OldCap.getNext() : OldCap.get(1);
+ Operands = MF.allocateOperandArray(CapOperands);
+ // Move the operands before the insertion point.
+ if (OpNo)
+ moveOperands(Operands, OldOperands, OpNo, MRI);
+ }
+
+ // Move the operands following the insertion point.
+ if (OpNo != NumOperands)
+ moveOperands(Operands + OpNo + 1, OldOperands + OpNo, NumOperands - OpNo,
+ MRI);
+ ++NumOperands;
+
+ // Deallocate the old operand array.
+ if (OldOperands != Operands && OldOperands)
+ MF.deallocateOperandArray(OldCap, OldOperands);
+
+ // Copy Op into place. It still needs to be inserted into the MRI use lists.
+ MachineOperand *NewMO = new (Operands + OpNo) MachineOperand(Op);
+ NewMO->ParentMI = this;
+
+ // When adding a register operand, tell MRI about it.
+ if (NewMO->isReg()) {
+ // Ensure isOnRegUseList() returns false, regardless of Op's status.
+ NewMO->Contents.Reg.Prev = nullptr;
+ // Ignore existing ties. This is not a property that can be copied.
+ NewMO->TiedTo = 0;
+ // Add the new operand to MRI, but only for instructions in an MBB.
+ if (MRI)
+ MRI->addRegOperandToUseList(NewMO);
+ // The MCID operand information isn't accurate until we start adding
+ // explicit operands. The implicit operands are added first, then the
+ // explicits are inserted before them.
+ if (!isImpReg) {
+ // Tie uses to defs as indicated in MCInstrDesc.
+ if (NewMO->isUse()) {
+ int DefIdx = MCID->getOperandConstraint(OpNo, MCOI::TIED_TO);
+ if (DefIdx != -1)
+ tieOperands(DefIdx, OpNo);
+ }
+      // If the register operand is flagged as early-clobber, mark it as such.
+ if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
+ NewMO->setIsEarlyClobber(true);
+ }
+ // Ensure debug instructions set debug flag on register uses.
+ if (NewMO->isUse() && isDebugInstr())
+ NewMO->setIsDebug();
+ }
+}
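
As an illustration (not part of the diff): the insertion-point logic keeps explicit operands ahead of the trailing implicit register operands by walking OpNo backwards past implicit regs, except for inline asm, where operand order must not change. A toy sketch of just that placement rule over a plain vector; the real code additionally grows the operand array and maintains MRI use lists:

// Toy sketch of the "explicit operands before trailing implicit regs" rule.
// Operands are modeled as a flag only; the real code also moves MRI use lists.
#include <cassert>
#include <vector>

struct ToyOperand { bool IsImplicitReg; };

static void addOperand(std::vector<ToyOperand> &Ops, ToyOperand Op,
                       bool IsInlineAsm) {
  unsigned OpNo = Ops.size();
  // Explicit operands are inserted before the trailing implicit registers,
  // except on inline asm, where operands must not be reordered.
  if (!Op.IsImplicitReg && !IsInlineAsm)
    while (OpNo && Ops[OpNo - 1].IsImplicitReg)
      --OpNo;
  Ops.insert(Ops.begin() + OpNo, Op);
}

int main() {
  std::vector<ToyOperand> Ops;
  addOperand(Ops, {/*IsImplicitReg=*/false}, /*IsInlineAsm=*/false); // explicit
  addOperand(Ops, {true}, false);   // implicit-def, goes to the end
  addOperand(Ops, {false}, false);  // explicit, lands before the implicit reg
  assert(!Ops[0].IsImplicitReg && !Ops[1].IsImplicitReg && Ops[2].IsImplicitReg);
}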
+
+void MachineInstr::removeOperand(unsigned OpNo) {
+ assert(OpNo < getNumOperands() && "Invalid operand number");
+ untieRegOperand(OpNo);
+
+#ifndef NDEBUG
+ // Moving tied operands would break the ties.
+ for (unsigned i = OpNo + 1, e = getNumOperands(); i != e; ++i)
+ if (Operands[i].isReg())
+ assert(!Operands[i].isTied() && "Cannot move tied operands");
+#endif
+
+ MachineRegisterInfo *MRI = getRegInfo();
+ if (MRI && Operands[OpNo].isReg())
+ MRI->removeRegOperandFromUseList(Operands + OpNo);
+
+ // Don't call the MachineOperand destructor. A lot of this code depends on
+ // MachineOperand having a trivial destructor anyway, and adding a call here
+ // wouldn't make it 'destructor-correct'.
+
+ if (unsigned N = NumOperands - 1 - OpNo)
+ moveOperands(Operands + OpNo, Operands + OpNo + 1, N, MRI);
+ --NumOperands;
+}
+
+void MachineInstr::setExtraInfo(MachineFunction &MF,
+ ArrayRef<MachineMemOperand *> MMOs,
+ MCSymbol *PreInstrSymbol,
+ MCSymbol *PostInstrSymbol,
+ MDNode *HeapAllocMarker, MDNode *PCSections,
+ uint32_t CFIType) {
+ bool HasPreInstrSymbol = PreInstrSymbol != nullptr;
+ bool HasPostInstrSymbol = PostInstrSymbol != nullptr;
+ bool HasHeapAllocMarker = HeapAllocMarker != nullptr;
+ bool HasPCSections = PCSections != nullptr;
+ bool HasCFIType = CFIType != 0;
+ int NumPointers = MMOs.size() + HasPreInstrSymbol + HasPostInstrSymbol +
+ HasHeapAllocMarker + HasPCSections + HasCFIType;
+
+ // Drop all extra info if there is none.
+ if (NumPointers <= 0) {
+ Info.clear();
+ return;
+ }
+
+ // If more than one pointer, then store out of line. Store heap alloc markers
+ // out of line because PointerSumType cannot hold more than 4 tag types with
+ // 32-bit pointers.
+ // FIXME: Maybe we should make the symbols in the extra info mutable?
+ else if (NumPointers > 1 || HasHeapAllocMarker || HasPCSections ||
+ HasCFIType) {
+ Info.set<EIIK_OutOfLine>(
+ MF.createMIExtraInfo(MMOs, PreInstrSymbol, PostInstrSymbol,
+ HeapAllocMarker, PCSections, CFIType));
+ return;
+ }
+
+ // Otherwise store the single pointer inline.
+ if (HasPreInstrSymbol)
+ Info.set<EIIK_PreInstrSymbol>(PreInstrSymbol);
+ else if (HasPostInstrSymbol)
+ Info.set<EIIK_PostInstrSymbol>(PostInstrSymbol);
+ else
+ Info.set<EIIK_MMO>(MMOs[0]);
+}
+
+void MachineInstr::dropMemRefs(MachineFunction &MF) {
+ if (memoperands_empty())
+ return;
+
+ setExtraInfo(MF, {}, getPreInstrSymbol(), getPostInstrSymbol(),
+ getHeapAllocMarker(), getPCSections(), getCFIType());
+}
+
+void MachineInstr::setMemRefs(MachineFunction &MF,
+ ArrayRef<MachineMemOperand *> MMOs) {
+ if (MMOs.empty()) {
+ dropMemRefs(MF);
+ return;
+ }
+
+ setExtraInfo(MF, MMOs, getPreInstrSymbol(), getPostInstrSymbol(),
+ getHeapAllocMarker(), getPCSections(), getCFIType());
+}
+
+void MachineInstr::addMemOperand(MachineFunction &MF,
+ MachineMemOperand *MO) {
+ SmallVector<MachineMemOperand *, 2> MMOs;
+ MMOs.append(memoperands_begin(), memoperands_end());
+ MMOs.push_back(MO);
+ setMemRefs(MF, MMOs);
+}
+
+void MachineInstr::cloneMemRefs(MachineFunction &MF, const MachineInstr &MI) {
+ if (this == &MI)
+ // Nothing to do for a self-clone!
+ return;
+
+ assert(&MF == MI.getMF() &&
+         "Invalid machine functions when cloning memory references!");
+ // See if we can just steal the extra info already allocated for the
+ // instruction. We can do this whenever the pre- and post-instruction symbols
+ // are the same (including null).
+ if (getPreInstrSymbol() == MI.getPreInstrSymbol() &&
+ getPostInstrSymbol() == MI.getPostInstrSymbol() &&
+ getHeapAllocMarker() == MI.getHeapAllocMarker() &&
+ getPCSections() == MI.getPCSections()) {
+ Info = MI.Info;
+ return;
+ }
+
+ // Otherwise, fall back on a copy-based clone.
+ setMemRefs(MF, MI.memoperands());
+}
+
+/// Check to see if the MMOs pointed to by the two MemRefs arrays are
+/// identical.
+static bool hasIdenticalMMOs(ArrayRef<MachineMemOperand *> LHS,
+ ArrayRef<MachineMemOperand *> RHS) {
+ if (LHS.size() != RHS.size())
+ return false;
+
+ auto LHSPointees = make_pointee_range(LHS);
+ auto RHSPointees = make_pointee_range(RHS);
+ return std::equal(LHSPointees.begin(), LHSPointees.end(),
+ RHSPointees.begin());
+}
+
+void MachineInstr::cloneMergedMemRefs(MachineFunction &MF,
+ ArrayRef<const MachineInstr *> MIs) {
+ // Try handling easy numbers of MIs with simpler mechanisms.
+ if (MIs.empty()) {
+ dropMemRefs(MF);
+ return;
+ }
+ if (MIs.size() == 1) {
+ cloneMemRefs(MF, *MIs[0]);
+ return;
+ }
+ // Because an empty memoperands list provides *no* information and must be
+ // handled conservatively (assuming the instruction can do anything), the only
+ // way to merge with it is to drop all other memoperands.
+ if (MIs[0]->memoperands_empty()) {
+ dropMemRefs(MF);
+ return;
+ }
+
+ // Handle the general case.
+ SmallVector<MachineMemOperand *, 2> MergedMMOs;
+ // Start with the first instruction.
+ assert(&MF == MIs[0]->getMF() &&
+ "Invalid machine functions when cloning memory references!");
+ MergedMMOs.append(MIs[0]->memoperands_begin(), MIs[0]->memoperands_end());
+ // Now walk all the other instructions and accumulate any different MMOs.
+ for (const MachineInstr &MI : make_pointee_range(MIs.slice(1))) {
+ assert(&MF == MI.getMF() &&
+ "Invalid machine functions when cloning memory references!");
+
+ // Skip MIs with identical operands to the first. This is a somewhat
+ // arbitrary hack but will catch common cases without being quadratic.
+ // TODO: We could fully implement merge semantics here if needed.
+ if (hasIdenticalMMOs(MIs[0]->memoperands(), MI.memoperands()))
+ continue;
+
+ // Because an empty memoperands list provides *no* information and must be
+ // handled conservatively (assuming the instruction can do anything), the
+ // only way to merge with it is to drop all other memoperands.
+ if (MI.memoperands_empty()) {
+ dropMemRefs(MF);
+ return;
+ }
+
+ // Otherwise accumulate these into our temporary buffer of the merged state.
+ MergedMMOs.append(MI.memoperands_begin(), MI.memoperands_end());
+ }
+
+ setMemRefs(MF, MergedMMOs);
+}
+
+void MachineInstr::setPreInstrSymbol(MachineFunction &MF, MCSymbol *Symbol) {
+ // Do nothing if old and new symbols are the same.
+ if (Symbol == getPreInstrSymbol())
+ return;
+
+ // If there was only one symbol and we're removing it, just clear info.
+ if (!Symbol && Info.is<EIIK_PreInstrSymbol>()) {
+ Info.clear();
+ return;
+ }
+
+ setExtraInfo(MF, memoperands(), Symbol, getPostInstrSymbol(),
+ getHeapAllocMarker(), getPCSections(), getCFIType());
+}
+
+void MachineInstr::setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol) {
+ // Do nothing if old and new symbols are the same.
+ if (Symbol == getPostInstrSymbol())
+ return;
+
+ // If there was only one symbol and we're removing it, just clear info.
+ if (!Symbol && Info.is<EIIK_PostInstrSymbol>()) {
+ Info.clear();
+ return;
+ }
+
+ setExtraInfo(MF, memoperands(), getPreInstrSymbol(), Symbol,
+ getHeapAllocMarker(), getPCSections(), getCFIType());
+}
+
+void MachineInstr::setHeapAllocMarker(MachineFunction &MF, MDNode *Marker) {
+ // Do nothing if old and new markers are the same.
+ if (Marker == getHeapAllocMarker())
+ return;
+
+ setExtraInfo(MF, memoperands(), getPreInstrSymbol(), getPostInstrSymbol(),
+ Marker, getPCSections(), getCFIType());
+}
+
+void MachineInstr::setPCSections(MachineFunction &MF, MDNode *PCSections) {
+ // Do nothing if old and new metadata are the same.
+ if (PCSections == getPCSections())
+ return;
+
+ setExtraInfo(MF, memoperands(), getPreInstrSymbol(), getPostInstrSymbol(),
+ getHeapAllocMarker(), PCSections, getCFIType());
+}
+
+void MachineInstr::setCFIType(MachineFunction &MF, uint32_t Type) {
+ // Do nothing if old and new types are the same.
+ if (Type == getCFIType())
+ return;
+
+ setExtraInfo(MF, memoperands(), getPreInstrSymbol(), getPostInstrSymbol(),
+ getHeapAllocMarker(), getPCSections(), Type);
+}
+
+void MachineInstr::cloneInstrSymbols(MachineFunction &MF,
+ const MachineInstr &MI) {
+ if (this == &MI)
+ // Nothing to do for a self-clone!
+ return;
+
+ assert(&MF == MI.getMF() &&
+ "Invalid machine functions when cloning instruction symbols!");
+
+ setPreInstrSymbol(MF, MI.getPreInstrSymbol());
+ setPostInstrSymbol(MF, MI.getPostInstrSymbol());
+ setHeapAllocMarker(MF, MI.getHeapAllocMarker());
+ setPCSections(MF, MI.getPCSections());
+}
+
+uint32_t MachineInstr::mergeFlagsWith(const MachineInstr &Other) const {
+ // For now, just return the union of the flags. If the flags get more
+ // complicated over time, we might need more logic here.
+ return getFlags() | Other.getFlags();
+}
+
+uint32_t MachineInstr::copyFlagsFromInstruction(const Instruction &I) {
+ uint32_t MIFlags = 0;
+ // Copy the wrapping flags.
+ if (const OverflowingBinaryOperator *OB =
+ dyn_cast<OverflowingBinaryOperator>(&I)) {
+ if (OB->hasNoSignedWrap())
+ MIFlags |= MachineInstr::MIFlag::NoSWrap;
+ if (OB->hasNoUnsignedWrap())
+ MIFlags |= MachineInstr::MIFlag::NoUWrap;
+ }
+
+ // Copy the exact flag.
+ if (const PossiblyExactOperator *PE = dyn_cast<PossiblyExactOperator>(&I))
+ if (PE->isExact())
+ MIFlags |= MachineInstr::MIFlag::IsExact;
+
+ // Copy the fast-math flags.
+ if (const FPMathOperator *FP = dyn_cast<FPMathOperator>(&I)) {
+ const FastMathFlags Flags = FP->getFastMathFlags();
+ if (Flags.noNaNs())
+ MIFlags |= MachineInstr::MIFlag::FmNoNans;
+ if (Flags.noInfs())
+ MIFlags |= MachineInstr::MIFlag::FmNoInfs;
+ if (Flags.noSignedZeros())
+ MIFlags |= MachineInstr::MIFlag::FmNsz;
+ if (Flags.allowReciprocal())
+ MIFlags |= MachineInstr::MIFlag::FmArcp;
+ if (Flags.allowContract())
+ MIFlags |= MachineInstr::MIFlag::FmContract;
+ if (Flags.approxFunc())
+ MIFlags |= MachineInstr::MIFlag::FmAfn;
+ if (Flags.allowReassoc())
+ MIFlags |= MachineInstr::MIFlag::FmReassoc;
+ }
+
+ if (I.getMetadata(LLVMContext::MD_unpredictable))
+ MIFlags |= MachineInstr::MIFlag::Unpredictable;
+
+ return MIFlags;
+}
+
+void MachineInstr::copyIRFlags(const Instruction &I) {
+ Flags = copyFlagsFromInstruction(I);
+}
+
+bool MachineInstr::hasPropertyInBundle(uint64_t Mask, QueryType Type) const {
+ assert(!isBundledWithPred() && "Must be called on bundle header");
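+ // Walk every instruction in the bundle, starting at the bundle header, and
+ // combine each MCInstrDesc's flags according to the requested query type.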
+ for (MachineBasicBlock::const_instr_iterator MII = getIterator();; ++MII) {
+ if (MII->getDesc().getFlags() & Mask) {
+ if (Type == AnyInBundle)
+ return true;
+ } else {
+ if (Type == AllInBundle && !MII->isBundle())
+ return false;
+ }
+ // This was the last instruction in the bundle.
+ if (!MII->isBundledWithSucc())
+ return Type == AllInBundle;
+ }
+}
+
+bool MachineInstr::isIdenticalTo(const MachineInstr &Other,
+ MICheckType Check) const {
+ // If opcodes or number of operands are not the same then the two
+ // instructions are obviously not identical.
+ if (Other.getOpcode() != getOpcode() ||
+ Other.getNumOperands() != getNumOperands())
+ return false;
+
+ if (isBundle()) {
+ // We have passed the test above that both instructions have the same
+ // opcode, so we know that both instructions are bundles here. Let's compare
+ // MIs inside the bundle.
+ assert(Other.isBundle() && "Expected that both instructions are bundles.");
+ MachineBasicBlock::const_instr_iterator I1 = getIterator();
+ MachineBasicBlock::const_instr_iterator I2 = Other.getIterator();
+ // Loop until we have analysed the last instruction inside at least one of
+ // the bundles.
+ while (I1->isBundledWithSucc() && I2->isBundledWithSucc()) {
+ ++I1;
+ ++I2;
+ if (!I1->isIdenticalTo(*I2, Check))
+ return false;
+ }
+ // If we've reached the end of just one of the two bundles, but not both,
+ // the instructions are not identical.
+ if (I1->isBundledWithSucc() || I2->isBundledWithSucc())
+ return false;
+ }
+
+ // Check operands to make sure they match.
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ const MachineOperand &OMO = Other.getOperand(i);
+ if (!MO.isReg()) {
+ if (!MO.isIdenticalTo(OMO))
+ return false;
+ continue;
+ }
+
+ // Clients may or may not want to ignore defs when testing for equality.
+ // For example, machine CSE pass only cares about finding common
+ // subexpressions, so it's safe to ignore virtual register defs.
+ if (MO.isDef()) {
+ if (Check == IgnoreDefs)
+ continue;
+ else if (Check == IgnoreVRegDefs) {
+ if (!MO.getReg().isVirtual() || !OMO.getReg().isVirtual())
+ if (!MO.isIdenticalTo(OMO))
+ return false;
+ } else {
+ if (!MO.isIdenticalTo(OMO))
+ return false;
+ if (Check == CheckKillDead && MO.isDead() != OMO.isDead())
+ return false;
+ }
+ } else {
+ if (!MO.isIdenticalTo(OMO))
+ return false;
+ if (Check == CheckKillDead && MO.isKill() != OMO.isKill())
+ return false;
+ }
+ }
+ // If DebugLoc does not match then two debug instructions are not identical.
+ if (isDebugInstr())
+ if (getDebugLoc() && Other.getDebugLoc() &&
+ getDebugLoc() != Other.getDebugLoc())
+ return false;
+ // If pre- or post-instruction symbols do not match then the two instructions
+ // are not identical.
+ if (getPreInstrSymbol() != Other.getPreInstrSymbol() ||
+ getPostInstrSymbol() != Other.getPostInstrSymbol())
+ return false;
+ // Call instructions with different CFI types are not identical.
+ if (isCall() && getCFIType() != Other.getCFIType())
+ return false;
+
+ return true;
+}
+
+bool MachineInstr::isEquivalentDbgInstr(const MachineInstr &Other) const {
+ if (!isDebugValueLike() || !Other.isDebugValueLike())
+ return false;
+ if (getDebugLoc() != Other.getDebugLoc())
+ return false;
+ if (getDebugVariable() != Other.getDebugVariable())
+ return false;
+ if (getNumDebugOperands() != Other.getNumDebugOperands())
+ return false;
+ for (unsigned OpIdx = 0; OpIdx < getNumDebugOperands(); ++OpIdx)
+ if (!getDebugOperand(OpIdx).isIdenticalTo(Other.getDebugOperand(OpIdx)))
+ return false;
+ if (!DIExpression::isEqualExpression(
+ getDebugExpression(), isIndirectDebugValue(),
+ Other.getDebugExpression(), Other.isIndirectDebugValue()))
+ return false;
+ return true;
+}
+
+const MachineFunction *MachineInstr::getMF() const {
+ return getParent()->getParent();
+}
+
+MachineInstr *MachineInstr::removeFromParent() {
+ assert(getParent() && "Not embedded in a basic block!");
+ return getParent()->remove(this);
+}
+
+MachineInstr *MachineInstr::removeFromBundle() {
+ assert(getParent() && "Not embedded in a basic block!");
+ return getParent()->remove_instr(this);
+}
+
+void MachineInstr::eraseFromParent() {
+ assert(getParent() && "Not embedded in a basic block!");
+ getParent()->erase(this);
+}
+
+void MachineInstr::eraseFromBundle() {
+ assert(getParent() && "Not embedded in a basic block!");
+ getParent()->erase_instr(this);
+}
+
+bool MachineInstr::isCandidateForCallSiteEntry(QueryType Type) const {
+ if (!isCall(Type))
+ return false;
+ switch (getOpcode()) {
+ case TargetOpcode::PATCHPOINT:
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::STATEPOINT:
+ case TargetOpcode::FENTRY_CALL:
+ return false;
+ }
+ return true;
+}
+
+bool MachineInstr::shouldUpdateCallSiteInfo() const {
+ if (isBundle())
+ return isCandidateForCallSiteEntry(MachineInstr::AnyInBundle);
+ return isCandidateForCallSiteEntry();
+}
+
+unsigned MachineInstr::getNumExplicitOperands() const {
+ unsigned NumOperands = MCID->getNumOperands();
+ if (!MCID->isVariadic())
+ return NumOperands;
+
+ for (unsigned I = NumOperands, E = getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = getOperand(I);
+ // The operands must always be in the following order:
+ // - explicit reg defs,
+ // - other explicit operands (reg uses, immediates, etc.),
+ // - implicit reg defs,
+ // - implicit reg uses.
+ if (MO.isReg() && MO.isImplicit())
+ break;
+ ++NumOperands;
+ }
+ return NumOperands;
+}
+
+unsigned MachineInstr::getNumExplicitDefs() const {
+ unsigned NumDefs = MCID->getNumDefs();
+ if (!MCID->isVariadic())
+ return NumDefs;
+
+ for (unsigned I = NumDefs, E = getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = getOperand(I);
+ if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
+ break;
+ ++NumDefs;
+ }
+ return NumDefs;
+}
+
+void MachineInstr::bundleWithPred() {
+ assert(!isBundledWithPred() && "MI is already bundled with its predecessor");
+ setFlag(BundledPred);
+ MachineBasicBlock::instr_iterator Pred = getIterator();
+ --Pred;
+ assert(!Pred->isBundledWithSucc() && "Inconsistent bundle flags");
+ Pred->setFlag(BundledSucc);
+}
+
+void MachineInstr::bundleWithSucc() {
+ assert(!isBundledWithSucc() && "MI is already bundled with its successor");
+ setFlag(BundledSucc);
+ MachineBasicBlock::instr_iterator Succ = getIterator();
+ ++Succ;
+ assert(!Succ->isBundledWithPred() && "Inconsistent bundle flags");
+ Succ->setFlag(BundledPred);
+}
+
+void MachineInstr::unbundleFromPred() {
+ assert(isBundledWithPred() && "MI isn't bundled with its predecessor");
+ clearFlag(BundledPred);
+ MachineBasicBlock::instr_iterator Pred = getIterator();
+ --Pred;
+ assert(Pred->isBundledWithSucc() && "Inconsistent bundle flags");
+ Pred->clearFlag(BundledSucc);
+}
+
+void MachineInstr::unbundleFromSucc() {
+ assert(isBundledWithSucc() && "MI isn't bundled with its successor");
+ clearFlag(BundledSucc);
+ MachineBasicBlock::instr_iterator Succ = getIterator();
+ ++Succ;
+ assert(Succ->isBundledWithPred() && "Inconsistent bundle flags");
+ Succ->clearFlag(BundledPred);
+}
+
+bool MachineInstr::isStackAligningInlineAsm() const {
+ if (isInlineAsm()) {
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ return true;
+ }
+ return false;
+}
+
+InlineAsm::AsmDialect MachineInstr::getInlineAsmDialect() const {
+ assert(isInlineAsm() && "getInlineAsmDialect() only works for inline asms!");
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ return InlineAsm::AsmDialect((ExtraInfo & InlineAsm::Extra_AsmDialect) != 0);
+}
+
+int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
+ unsigned *GroupNo) const {
+ assert(isInlineAsm() && "Expected an inline asm instruction");
+ assert(OpIdx < getNumOperands() && "OpIdx out of range");
+
+ // Ignore queries about the initial operands.
+ if (OpIdx < InlineAsm::MIOp_FirstOperand)
+ return -1;
+
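+ // Inline asm operands are laid out as the asm string, the extra-info
+ // immediate, and then a sequence of operand groups; each group starts with
+ // an immediate flag word encoding its kind and the number of operands that
+ // follow it. Trailing implicit register operands end the sequence.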
+ unsigned Group = 0;
+ unsigned NumOps;
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+ i += NumOps) {
+ const MachineOperand &FlagMO = getOperand(i);
+ // If we reach the implicit register operands, stop looking.
+ if (!FlagMO.isImm())
+ return -1;
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+ if (i + NumOps > OpIdx) {
+ if (GroupNo)
+ *GroupNo = Group;
+ return i;
+ }
+ ++Group;
+ }
+ return -1;
+}
+
+const DILabel *MachineInstr::getDebugLabel() const {
+ assert(isDebugLabel() && "not a DBG_LABEL");
+ return cast<DILabel>(getOperand(0).getMetadata());
+}
+
+const MachineOperand &MachineInstr::getDebugVariableOp() const {
+ assert((isDebugValueLike()) && "not a DBG_VALUE*");
+ unsigned VariableOp = isNonListDebugValue() ? 2 : 0;
+ return getOperand(VariableOp);
+}
+
+MachineOperand &MachineInstr::getDebugVariableOp() {
+ assert((isDebugValueLike()) && "not a DBG_VALUE*");
+ unsigned VariableOp = isNonListDebugValue() ? 2 : 0;
+ return getOperand(VariableOp);
+}
+
+const DILocalVariable *MachineInstr::getDebugVariable() const {
+ return cast<DILocalVariable>(getDebugVariableOp().getMetadata());
+}
+
+const MachineOperand &MachineInstr::getDebugExpressionOp() const {
+ assert((isDebugValueLike()) && "not a DBG_VALUE*");
+ unsigned ExpressionOp = isNonListDebugValue() ? 3 : 1;
+ return getOperand(ExpressionOp);
+}
+
+MachineOperand &MachineInstr::getDebugExpressionOp() {
+ assert((isDebugValueLike()) && "not a DBG_VALUE*");
+ unsigned ExpressionOp = isNonListDebugValue() ? 3 : 1;
+ return getOperand(ExpressionOp);
+}
+
+const DIExpression *MachineInstr::getDebugExpression() const {
+ return cast<DIExpression>(getDebugExpressionOp().getMetadata());
+}
+
+bool MachineInstr::isDebugEntryValue() const {
+ return isDebugValue() && getDebugExpression()->isEntryValue();
+}
+
+const TargetRegisterClass*
+MachineInstr::getRegClassConstraint(unsigned OpIdx,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) const {
+ assert(getParent() && "Can't have an MBB reference here!");
+ assert(getMF() && "Can't have an MF reference here!");
+ const MachineFunction &MF = *getMF();
+
+ // Most opcodes have fixed constraints in their MCInstrDesc.
+ if (!isInlineAsm())
+ return TII->getRegClass(getDesc(), OpIdx, TRI, MF);
+
+ if (!getOperand(OpIdx).isReg())
+ return nullptr;
+
+ // For tied uses on inline asm, get the constraint from the def.
+ unsigned DefIdx;
+ if (getOperand(OpIdx).isUse() && isRegTiedToDefOperand(OpIdx, &DefIdx))
+ OpIdx = DefIdx;
+
+ // Inline asm stores register class constraints in the flag word.
+ int FlagIdx = findInlineAsmFlagIdx(OpIdx);
+ if (FlagIdx < 0)
+ return nullptr;
+
+ unsigned Flag = getOperand(FlagIdx).getImm();
+ unsigned RCID;
+ if ((InlineAsm::getKind(Flag) == InlineAsm::Kind_RegUse ||
+ InlineAsm::getKind(Flag) == InlineAsm::Kind_RegDef ||
+ InlineAsm::getKind(Flag) == InlineAsm::Kind_RegDefEarlyClobber) &&
+ InlineAsm::hasRegClassConstraint(Flag, RCID))
+ return TRI->getRegClass(RCID);
+
+ // Assume that all registers in a memory operand are pointers.
+ if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem)
+ return TRI->getPointerRegClass(MF);
+
+ return nullptr;
+}
+
+const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg(
+ Register Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI, bool ExploreBundle) const {
+ // Check every operand inside the bundle if we have been asked to.
+ if (ExploreBundle)
+ for (ConstMIBundleOperands OpndIt(*this); OpndIt.isValid() && CurRC;
+ ++OpndIt)
+ CurRC = OpndIt->getParent()->getRegClassConstraintEffectForVRegImpl(
+ OpndIt.getOperandNo(), Reg, CurRC, TII, TRI);
+ else
+ // Otherwise, just check the current operands.
+ for (unsigned i = 0, e = NumOperands; i < e && CurRC; ++i)
+ CurRC = getRegClassConstraintEffectForVRegImpl(i, Reg, CurRC, TII, TRI);
+ return CurRC;
+}
+
+const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVRegImpl(
+ unsigned OpIdx, Register Reg, const TargetRegisterClass *CurRC,
+ const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const {
+ assert(CurRC && "Invalid initial register class");
+ // Check if Reg is constrained by some of its use/def from MI.
+ const MachineOperand &MO = getOperand(OpIdx);
+ if (!MO.isReg() || MO.getReg() != Reg)
+ return CurRC;
+ // If yes, accumulate the constraints through the operand.
+ return getRegClassConstraintEffect(OpIdx, CurRC, TII, TRI);
+}
+
+const TargetRegisterClass *MachineInstr::getRegClassConstraintEffect(
+ unsigned OpIdx, const TargetRegisterClass *CurRC,
+ const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const {
+ const TargetRegisterClass *OpRC = getRegClassConstraint(OpIdx, TII, TRI);
+ const MachineOperand &MO = getOperand(OpIdx);
+ assert(MO.isReg() &&
+ "Cannot get register constraints for non-register operand");
+ assert(CurRC && "Invalid initial register class");
+ if (unsigned SubIdx = MO.getSubReg()) {
+ if (OpRC)
+ CurRC = TRI->getMatchingSuperRegClass(CurRC, OpRC, SubIdx);
+ else
+ CurRC = TRI->getSubClassWithSubReg(CurRC, SubIdx);
+ } else if (OpRC)
+ CurRC = TRI->getCommonSubClass(CurRC, OpRC);
+ return CurRC;
+}
+
+/// Return the number of instructions inside the MI bundle, not counting the
+/// header instruction.
+unsigned MachineInstr::getBundleSize() const {
+ MachineBasicBlock::const_instr_iterator I = getIterator();
+ unsigned Size = 0;
+ while (I->isBundledWithSucc()) {
+ ++Size;
+ ++I;
+ }
+ return Size;
+}
+
+/// Returns true if the MachineInstr has an implicit-use operand of exactly
+/// the given register (not considering sub/super-registers).
+bool MachineInstr::hasRegisterImplicitUseOperand(Register Reg) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
+ return true;
+ }
+ return false;
+}
+
+/// findRegisterUseOperandIdx() - Returns the operand index that is a use of
+/// the specified register, or -1 if it is not found. It further tightens the
+/// search criteria to a use that kills the register if isKill is true.
+int MachineInstr::findRegisterUseOperandIdx(
+ Register Reg, bool isKill, const TargetRegisterInfo *TRI) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ Register MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MOReg == Reg || (TRI && Reg && MOReg && TRI->regsOverlap(MOReg, Reg)))
+ if (!isKill || MO.isKill())
+ return i;
+ }
+ return -1;
+}
+
+/// readsWritesVirtualRegister - Return a pair of bools (reads, writes)
+/// indicating if this instruction reads or writes Reg. This also considers
+/// partial defines.
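+/// For example, a sub-register def that is not marked <undef> (a partial
+/// redefine) counts as both a read and a write of Reg, since the lanes it
+/// does not touch are preserved.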
+std::pair<bool,bool>
+MachineInstr::readsWritesVirtualRegister(Register Reg,
+ SmallVectorImpl<unsigned> *Ops) const {
+ bool PartDef = false; // Partial redefine.
+ bool FullDef = false; // Full define.
+ bool Use = false;
+
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+ if (Ops)
+ Ops->push_back(i);
+ if (MO.isUse())
+ Use |= !MO.isUndef();
+ else if (MO.getSubReg() && !MO.isUndef())
+ // A partial def undef doesn't count as reading the register.
+ PartDef = true;
+ else
+ FullDef = true;
+ }
+ // A partial redefine uses Reg unless there is also a full define.
+ return std::make_pair(Use || (PartDef && !FullDef), PartDef || FullDef);
+}
+
+/// findRegisterDefOperandIdx() - Returns the operand index that is a def of
+/// the specified register or -1 if it is not found. If isDead is true, defs
+/// that are not dead are skipped. If TargetRegisterInfo is non-null, then it
+/// also checks if there is a def of a super-register.
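+/// For example, when TargetRegisterInfo is provided, a query for a
+/// sub-register will also match an operand that defines one of its
+/// super-registers.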
+int
+MachineInstr::findRegisterDefOperandIdx(Register Reg, bool isDead, bool Overlap,
+ const TargetRegisterInfo *TRI) const {
+ bool isPhys = Reg.isPhysical();
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ // Accept regmask operands when Overlap is set.
+ // Ignore them when looking for a specific def operand (Overlap == false).
+ if (isPhys && Overlap && MO.isRegMask() && MO.clobbersPhysReg(Reg))
+ return i;
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register MOReg = MO.getReg();
+ bool Found = (MOReg == Reg);
+ if (!Found && TRI && isPhys && MOReg.isPhysical()) {
+ if (Overlap)
+ Found = TRI->regsOverlap(MOReg, Reg);
+ else
+ Found = TRI->isSubRegister(MOReg, Reg);
+ }
+ if (Found && (!isDead || MO.isDead()))
+ return i;
+ }
+ return -1;
+}
+
+/// findFirstPredOperandIdx() - Find the index of the first operand in the
+/// operand list that is used to represent the predicate. It returns -1 if
+/// none is found.
+int MachineInstr::findFirstPredOperandIdx() const {
+ // Don't call MCID.findFirstPredOperandIdx() because this variant
+ // is sometimes called on an instruction that's not yet complete, and
+ // so the number of operands is less than the MCID indicates. In
+ // particular, the PTX target does this.
+ const MCInstrDesc &MCID = getDesc();
+ if (MCID.isPredicable()) {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (MCID.operands()[i].isPredicate())
+ return i;
+ }
+
+ return -1;
+}
+
+// MachineOperand::TiedTo is 4 bits wide.
+const unsigned TiedMax = 15;
+
+/// tieOperands - Mark operands at DefIdx and UseIdx as tied to each other.
+///
+/// Use and def operands can be tied together, indicated by a non-zero TiedTo
+/// field. TiedTo can have these values:
+///
+/// 0: Operand is not tied to anything.
+/// 1 to TiedMax-1: Tied to getOperand(TiedTo-1).
+/// TiedMax: Tied to an operand >= TiedMax-1.
+///
+/// The tied def must be one of the first TiedMax operands on a normal
+/// instruction. INLINEASM instructions allow more tied defs.
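+/// As an illustrative example, tieOperands(0, 1) on a two-address instruction
+/// leaves getOperand(1).TiedTo == 1 (tied to operand 0) and
+/// getOperand(0).TiedTo == 2 (tied to operand 1).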
+///
+void MachineInstr::tieOperands(unsigned DefIdx, unsigned UseIdx) {
+ MachineOperand &DefMO = getOperand(DefIdx);
+ MachineOperand &UseMO = getOperand(UseIdx);
+ assert(DefMO.isDef() && "DefIdx must be a def operand");
+ assert(UseMO.isUse() && "UseIdx must be a use operand");
+ assert(!DefMO.isTied() && "Def is already tied to another use");
+ assert(!UseMO.isTied() && "Use is already tied to another def");
+
+ if (DefIdx < TiedMax)
+ UseMO.TiedTo = DefIdx + 1;
+ else {
+ // Inline asm can use the group descriptors to find tied operands,
+ // statepoint tied operands are trivial to match (1-1 reg def with reg use),
+ // but on a normal instruction, the tied def must be within the first TiedMax
+ // operands.
+ assert((isInlineAsm() || getOpcode() == TargetOpcode::STATEPOINT) &&
+ "DefIdx out of range");
+ UseMO.TiedTo = TiedMax;
+ }
+
+ // UseIdx can be out of range; we'll search for it in findTiedOperandIdx().
+ DefMO.TiedTo = std::min(UseIdx + 1, TiedMax);
+}
+
+/// Given the index of a tied register operand, find the operand it is tied to.
+/// Defs are tied to uses and vice versa. Returns the index of the tied operand
+/// which must exist.
+unsigned MachineInstr::findTiedOperandIdx(unsigned OpIdx) const {
+ const MachineOperand &MO = getOperand(OpIdx);
+ assert(MO.isTied() && "Operand isn't tied");
+
+ // Normally TiedTo is in range.
+ if (MO.TiedTo < TiedMax)
+ return MO.TiedTo - 1;
+
+ // Uses on normal instructions can be out of range.
+ if (!isInlineAsm() && getOpcode() != TargetOpcode::STATEPOINT) {
+ // Normal tied defs must be in the 0..TiedMax-1 range.
+ if (MO.isUse())
+ return TiedMax - 1;
+ // MO is a def. Search for the tied use.
+ for (unsigned i = TiedMax - 1, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &UseMO = getOperand(i);
+ if (UseMO.isReg() && UseMO.isUse() && UseMO.TiedTo == OpIdx + 1)
+ return i;
+ }
+ llvm_unreachable("Can't find tied use");
+ }
+
+ if (getOpcode() == TargetOpcode::STATEPOINT) {
+ // In a STATEPOINT, defs correspond 1-1 to the GC pointer operands passed
+ // in registers.
+ StatepointOpers SO(this);
+ unsigned CurUseIdx = SO.getFirstGCPtrIdx();
+ assert(CurUseIdx != -1U && "only gc pointer statepoint operands can be tied");
+ unsigned NumDefs = getNumDefs();
+ for (unsigned CurDefIdx = 0; CurDefIdx < NumDefs; ++CurDefIdx) {
+ while (!getOperand(CurUseIdx).isReg())
+ CurUseIdx = StackMaps::getNextMetaArgIdx(this, CurUseIdx);
+ if (OpIdx == CurDefIdx)
+ return CurUseIdx;
+ if (OpIdx == CurUseIdx)
+ return CurDefIdx;
+ CurUseIdx = StackMaps::getNextMetaArgIdx(this, CurUseIdx);
+ }
+ llvm_unreachable("Can't find tied use");
+ }
+
+ // Now deal with inline asm by parsing the operand group descriptor flags.
+ // Find the beginning of each operand group.
+ SmallVector<unsigned, 8> GroupIdx;
+ unsigned OpIdxGroup = ~0u;
+ unsigned NumOps;
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+ i += NumOps) {
+ const MachineOperand &FlagMO = getOperand(i);
+ assert(FlagMO.isImm() && "Invalid tied operand on inline asm");
+ unsigned CurGroup = GroupIdx.size();
+ GroupIdx.push_back(i);
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+ // OpIdx belongs to this operand group.
+ if (OpIdx > i && OpIdx < i + NumOps)
+ OpIdxGroup = CurGroup;
+ unsigned TiedGroup;
+ if (!InlineAsm::isUseOperandTiedToDef(FlagMO.getImm(), TiedGroup))
+ continue;
+ // Operands in this group are tied to operands in TiedGroup which must be
+ // earlier. Find the number of operands between the two groups.
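+ // E.g., if the tied group begins three operands before this group, Delta is
+ // 3 and the tied operand sits exactly three slots away from OpIdx.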
+ unsigned Delta = i - GroupIdx[TiedGroup];
+
+ // OpIdx is a use tied to TiedGroup.
+ if (OpIdxGroup == CurGroup)
+ return OpIdx - Delta;
+
+ // OpIdx is a def tied to this use group.
+ if (OpIdxGroup == TiedGroup)
+ return OpIdx + Delta;
+ }
+ llvm_unreachable("Invalid tied operand on inline asm");
+}
+
+/// clearKillInfo - Clears kill flags on all operands.
+///
+void MachineInstr::clearKillInfo() {
+ for (MachineOperand &MO : operands()) {
+ if (MO.isReg() && MO.isUse())
+ MO.setIsKill(false);
+ }
+}
+
+void MachineInstr::substituteRegister(Register FromReg, Register ToReg,
+ unsigned SubIdx,
+ const TargetRegisterInfo &RegInfo) {
+ if (ToReg.isPhysical()) {
+ if (SubIdx)
+ ToReg = RegInfo.getSubReg(ToReg, SubIdx);
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || MO.getReg() != FromReg)
+ continue;
+ MO.substPhysReg(ToReg, RegInfo);
+ }
+ } else {
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || MO.getReg() != FromReg)
+ continue;
+ MO.substVirtReg(ToReg, SubIdx, RegInfo);
+ }
+ }
+}
+
+/// isSafeToMove - Return true if it is safe to move this instruction. If
+/// SawStore is set to true, it means that there is a store (or call) between
+/// the instruction's location and its intended destination.
+bool MachineInstr::isSafeToMove(AAResults *AA, bool &SawStore) const {
+ // Ignore stuff that we obviously can't move.
+ //
+ // Treat volatile loads as stores. This is not strictly necessary for
+ // volatiles, but it is required for atomic loads. It is not allowed to move
+ // a load across an atomic load with Ordering > Monotonic.
+ if (mayStore() || isCall() || isPHI() ||
+ (mayLoad() && hasOrderedMemoryRef())) {
+ SawStore = true;
+ return false;
+ }
+
+ if (isPosition() || isDebugInstr() || isTerminator() ||
+ mayRaiseFPException() || hasUnmodeledSideEffects())
+ return false;
+
+ // See if this instruction does a load. If so, we have to guarantee that the
+ // loaded value doesn't change between the load and its intended
+ // destination. The check for isInvariantLoad gives the target the chance to
+ // classify the load as always returning a constant, e.g. a constant pool
+ // load.
+ if (mayLoad() && !isDereferenceableInvariantLoad())
+ // Otherwise, this is a real load. If there is a store between the load and
+ // end of block, we can't move it.
+ return !SawStore;
+
+ return true;
+}
+
+static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, AAResults *AA,
+ bool UseTBAA, const MachineMemOperand *MMOa,
+ const MachineMemOperand *MMOb) {
+ // The following interface to AA is fashioned after DAGCombiner::isAlias and
+ // operates with MachineMemOperand offset with some important assumptions:
+ // - LLVM fundamentally assumes flat address spaces.
+ // - MachineOperand offset can *only* result from legalization and cannot
+ // affect queries other than the trivial case of overlap checking.
+ // - These offsets never wrap and never step outside of allocated objects.
+ // - There should never be any negative offsets here.
+ //
+ // FIXME: Modify API to hide this math from "user"
+ // Even before we go to AA we can reason locally about some memory objects. It
+ // can save compile time, and possibly catch some corner cases not currently
+ // covered.
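+ // As a concrete example, two operands on the same underlying value with
+ // (Offset, Width) of (0, 4) and (4, 4) do not overlap: the lower access ends
+ // exactly where the higher one begins.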
+
+ int64_t OffsetA = MMOa->getOffset();
+ int64_t OffsetB = MMOb->getOffset();
+ int64_t MinOffset = std::min(OffsetA, OffsetB);
+
+ uint64_t WidthA = MMOa->getSize();
+ uint64_t WidthB = MMOb->getSize();
+ bool KnownWidthA = WidthA != MemoryLocation::UnknownSize;
+ bool KnownWidthB = WidthB != MemoryLocation::UnknownSize;
+
+ const Value *ValA = MMOa->getValue();
+ const Value *ValB = MMOb->getValue();
+ bool SameVal = (ValA && ValB && (ValA == ValB));
+ if (!SameVal) {
+ const PseudoSourceValue *PSVa = MMOa->getPseudoValue();
+ const PseudoSourceValue *PSVb = MMOb->getPseudoValue();
+ if (PSVa && ValB && !PSVa->mayAlias(&MFI))
+ return false;
+ if (PSVb && ValA && !PSVb->mayAlias(&MFI))
+ return false;
+ if (PSVa && PSVb && (PSVa == PSVb))
+ SameVal = true;
+ }
+
+ if (SameVal) {
+ if (!KnownWidthA || !KnownWidthB)
+ return true;
+ int64_t MaxOffset = std::max(OffsetA, OffsetB);
+ int64_t LowWidth = (MinOffset == OffsetA) ? WidthA : WidthB;
+ return (MinOffset + LowWidth > MaxOffset);
+ }
+
+ if (!AA)
+ return true;
+
+ if (!ValA || !ValB)
+ return true;
+
+ assert((OffsetA >= 0) && "Negative MachineMemOperand offset");
+ assert((OffsetB >= 0) && "Negative MachineMemOperand offset");
+
+ int64_t OverlapA =
+ KnownWidthA ? WidthA + OffsetA - MinOffset : MemoryLocation::UnknownSize;
+ int64_t OverlapB =
+ KnownWidthB ? WidthB + OffsetB - MinOffset : MemoryLocation::UnknownSize;
+
+ return !AA->isNoAlias(
+ MemoryLocation(ValA, OverlapA, UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
+ MemoryLocation(ValB, OverlapB,
+ UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
+}
+
+bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other,
+ bool UseTBAA) const {
+ const MachineFunction *MF = getMF();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+
+ // Exclude call instruction which may alter the memory but can not be handled
+ // by this function.
+ if (isCall() || Other.isCall())
+ return true;
+
+ // If neither instruction stores to memory, they can't alias in any
+ // meaningful way, even if they read from the same address.
+ if (!mayStore() && !Other.mayStore())
+ return false;
+
+ // Both instructions must be memory operations to be able to alias.
+ if (!mayLoadOrStore() || !Other.mayLoadOrStore())
+ return false;
+
+ // Let the target decide if memory accesses cannot possibly overlap.
+ if (TII->areMemAccessesTriviallyDisjoint(*this, Other))
+ return false;
+
+ // Memory operations without memory operands may access anything. Be
+ // conservative and assume `MayAlias`.
+ if (memoperands_empty() || Other.memoperands_empty())
+ return true;
+
+ // Skip if there are too many memory operands.
+ auto NumChecks = getNumMemOperands() * Other.getNumMemOperands();
+ if (NumChecks > TII->getMemOperandAACheckLimit())
+ return true;
+
+ // Check each pair of memory operands from both instructions. The two
+ // instructions are known not to alias only if every such pair is known not
+ // to alias.
+ for (auto *MMOa : memoperands())
+ for (auto *MMOb : Other.memoperands())
+ if (MemOperandsHaveAlias(MFI, AA, UseTBAA, MMOa, MMOb))
+ return true;
+
+ return false;
+}
+
+/// hasOrderedMemoryRef - Return true if this instruction may have an ordered
+/// or volatile memory reference, or if the information describing the memory
+/// reference is not available. Return false if it is known to have no ordered
+/// memory references.
+bool MachineInstr::hasOrderedMemoryRef() const {
+ // An instruction known never to access memory won't have a volatile access.
+ if (!mayStore() &&
+ !mayLoad() &&
+ !isCall() &&
+ !hasUnmodeledSideEffects())
+ return false;
+
+ // Otherwise, if the instruction has no memory reference information,
+ // conservatively assume it wasn't preserved.
+ if (memoperands_empty())
+ return true;
+
+ // Check if any of our memory operands are ordered.
+ return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) {
+ return !MMO->isUnordered();
+ });
+}
+
+/// isDereferenceableInvariantLoad - Return true if this instruction will never
+/// trap and is loading from a location whose value is invariant across a run of
+/// this function.
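+/// For example, a constant-pool load qualifies, while a volatile or otherwise
+/// ordered load does not.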
+bool MachineInstr::isDereferenceableInvariantLoad() const {
+ // If the instruction doesn't load at all, it isn't an invariant load.
+ if (!mayLoad())
+ return false;
+
+ // If the instruction has lost its memoperands, conservatively assume that
+ // it may not be an invariant load.
+ if (memoperands_empty())
+ return false;
+
+ const MachineFrameInfo &MFI = getParent()->getParent()->getFrameInfo();
+
+ for (MachineMemOperand *MMO : memoperands()) {
+ if (!MMO->isUnordered())
+ // If the memory operand has ordering side effects, we can't move the
+ // instruction. Such an instruction is technically an invariant load,
+ // but the caller code would need to be updated to expect that.
+ return false;
+ if (MMO->isStore()) return false;
+ if (MMO->isInvariant() && MMO->isDereferenceable())
+ continue;
+
+ // A load from a constant PseudoSourceValue is invariant.
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
+ if (PSV->isConstant(&MFI))
+ continue;
+ }
+
+ // Otherwise assume conservatively.
+ return false;
+ }
+
+ // Everything checks out.
+ return true;
+}
+
+/// isConstantValuePHI - If the specified instruction is a PHI that always
+/// merges together the same virtual register, return the register, otherwise
+/// return 0.
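+/// For example, "%x = PHI %a, %bb.1, %a, %bb.2" always produces the value of
+/// %a, so %a is returned; if the incoming registers differ, 0 is returned.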
+unsigned MachineInstr::isConstantValuePHI() const {
+ if (!isPHI())
+ return 0;
+ assert(getNumOperands() >= 3 &&
+ "It's illegal to have a PHI without source operands");
+
+ Register Reg = getOperand(1).getReg();
+ for (unsigned i = 3, e = getNumOperands(); i < e; i += 2)
+ if (getOperand(i).getReg() != Reg)
+ return 0;
+ return Reg;
+}
+
+bool MachineInstr::hasUnmodeledSideEffects() const {
+ if (hasProperty(MCID::UnmodeledSideEffects))
+ return true;
+ if (isInlineAsm()) {
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
+ return true;
+ }
+
+ return false;
+}
+
+bool MachineInstr::isLoadFoldBarrier() const {
+ return mayStore() || isCall() ||
+ (hasUnmodeledSideEffects() && !isPseudoProbe());
+}
+
+/// allDefsAreDead - Return true if all the defs of this instruction are dead.
+///
+bool MachineInstr::allDefsAreDead() const {
+ for (const MachineOperand &MO : operands()) {
+ if (!MO.isReg() || MO.isUse())
+ continue;
+ if (!MO.isDead())
+ return false;
+ }
+ return true;
+}
+
+/// copyImplicitOps - Copy implicit register operands from specified
+/// instruction to this instruction.
+void MachineInstr::copyImplicitOps(MachineFunction &MF,
+ const MachineInstr &MI) {
+ for (const MachineOperand &MO :
+ llvm::drop_begin(MI.operands(), MI.getDesc().getNumOperands()))
+ if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
+ addOperand(MF, MO);
+}
+
+bool MachineInstr::hasComplexRegisterTies() const {
+ const MCInstrDesc &MCID = getDesc();
+ if (MCID.Opcode == TargetOpcode::STATEPOINT)
+ return true;
+ for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
+ const auto &Operand = getOperand(I);
+ if (!Operand.isReg() || Operand.isDef())
+ // Ignore the defined registers as MCID marks only the uses as tied.
+ continue;
+ int ExpectedTiedIdx = MCID.getOperandConstraint(I, MCOI::TIED_TO);
+ int TiedIdx = Operand.isTied() ? int(findTiedOperandIdx(I)) : -1;
+ if (ExpectedTiedIdx != TiedIdx)
+ return true;
+ }
+ return false;
+}
+
+LLT MachineInstr::getTypeToPrint(unsigned OpIdx, SmallBitVector &PrintedTypes,
+ const MachineRegisterInfo &MRI) const {
+ const MachineOperand &Op = getOperand(OpIdx);
+ if (!Op.isReg())
+ return LLT{};
+
+ if (isVariadic() || OpIdx >= getNumExplicitOperands())
+ return MRI.getType(Op.getReg());
+
+ auto &OpInfo = getDesc().operands()[OpIdx];
+ if (!OpInfo.isGenericType())
+ return MRI.getType(Op.getReg());
+
+ if (PrintedTypes[OpInfo.getGenericTypeIndex()])
+ return LLT{};
+
+ LLT TypeToPrint = MRI.getType(Op.getReg());
+ // Don't mark the type index printed if it wasn't actually printed: maybe
+ // another operand with the same type index has an actual type attached:
+ if (TypeToPrint.isValid())
+ PrintedTypes.set(OpInfo.getGenericTypeIndex());
+ return TypeToPrint;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineInstr::dump() const {
+ dbgs() << " ";
+ print(dbgs());
+}
+
+LLVM_DUMP_METHOD void MachineInstr::dumprImpl(
+ const MachineRegisterInfo &MRI, unsigned Depth, unsigned MaxDepth,
+ SmallPtrSetImpl<const MachineInstr *> &AlreadySeenInstrs) const {
+ if (Depth >= MaxDepth)
+ return;
+ if (!AlreadySeenInstrs.insert(this).second)
+ return;
+ // PadToColumn always inserts at least one space.
+ // Don't mess up the alignment if we don't want any space.
+ if (Depth)
+ fdbgs().PadToColumn(Depth * 2);
+ print(fdbgs());
+ for (const MachineOperand &MO : operands()) {
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical())
+ continue;
+ const MachineInstr *NewMI = MRI.getUniqueVRegDef(Reg);
+ if (NewMI == nullptr)
+ continue;
+ NewMI->dumprImpl(MRI, Depth + 1, MaxDepth, AlreadySeenInstrs);
+ }
+}
+
+LLVM_DUMP_METHOD void MachineInstr::dumpr(const MachineRegisterInfo &MRI,
+ unsigned MaxDepth) const {
+ SmallPtrSet<const MachineInstr *, 16> AlreadySeenInstrs;
+ dumprImpl(MRI, 0, MaxDepth, AlreadySeenInstrs);
+}
+#endif
+
+void MachineInstr::print(raw_ostream &OS, bool IsStandalone, bool SkipOpers,
+ bool SkipDebugLoc, bool AddNewLine,
+ const TargetInstrInfo *TII) const {
+ const Module *M = nullptr;
+ const Function *F = nullptr;
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ F = &MF->getFunction();
+ M = F->getParent();
+ if (!TII)
+ TII = MF->getSubtarget().getInstrInfo();
+ }
+
+ ModuleSlotTracker MST(M);
+ if (F)
+ MST.incorporateFunction(*F);
+ print(OS, MST, IsStandalone, SkipOpers, SkipDebugLoc, AddNewLine, TII);
+}
+
+void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
+ bool IsStandalone, bool SkipOpers, bool SkipDebugLoc,
+ bool AddNewLine, const TargetInstrInfo *TII) const {
+ // We can be a bit tidier if we know the MachineFunction.
+ const TargetRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+ const TargetIntrinsicInfo *IntrinsicInfo = nullptr;
+ tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII);
+
+ if (isCFIInstruction())
+ assert(getNumOperands() == 1 && "Expected 1 operand in CFI instruction");
+
+ SmallBitVector PrintedTypes(8);
+ bool ShouldPrintRegisterTies = IsStandalone || hasComplexRegisterTies();
+ auto getTiedOperandIdx = [&](unsigned OpIdx) {
+ if (!ShouldPrintRegisterTies)
+ return 0U;
+ const MachineOperand &MO = getOperand(OpIdx);
+ if (MO.isReg() && MO.isTied() && !MO.isDef())
+ return findTiedOperandIdx(OpIdx);
+ return 0U;
+ };
+ unsigned StartOp = 0;
+ unsigned e = getNumOperands();
+
+ // Print explicitly defined operands on the left of an assignment syntax.
+ while (StartOp < e) {
+ const MachineOperand &MO = getOperand(StartOp);
+ if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
+ break;
+
+ if (StartOp != 0)
+ OS << ", ";
+
+ LLT TypeToPrint = MRI ? getTypeToPrint(StartOp, PrintedTypes, *MRI) : LLT{};
+ unsigned TiedOperandIdx = getTiedOperandIdx(StartOp);
+ MO.print(OS, MST, TypeToPrint, StartOp, /*PrintDef=*/false, IsStandalone,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
+ ++StartOp;
+ }
+
+ if (StartOp != 0)
+ OS << " = ";
+
+ if (getFlag(MachineInstr::FrameSetup))
+ OS << "frame-setup ";
+ if (getFlag(MachineInstr::FrameDestroy))
+ OS << "frame-destroy ";
+ if (getFlag(MachineInstr::FmNoNans))
+ OS << "nnan ";
+ if (getFlag(MachineInstr::FmNoInfs))
+ OS << "ninf ";
+ if (getFlag(MachineInstr::FmNsz))
+ OS << "nsz ";
+ if (getFlag(MachineInstr::FmArcp))
+ OS << "arcp ";
+ if (getFlag(MachineInstr::FmContract))
+ OS << "contract ";
+ if (getFlag(MachineInstr::FmAfn))
+ OS << "afn ";
+ if (getFlag(MachineInstr::FmReassoc))
+ OS << "reassoc ";
+ if (getFlag(MachineInstr::NoUWrap))
+ OS << "nuw ";
+ if (getFlag(MachineInstr::NoSWrap))
+ OS << "nsw ";
+ if (getFlag(MachineInstr::IsExact))
+ OS << "exact ";
+ if (getFlag(MachineInstr::NoFPExcept))
+ OS << "nofpexcept ";
+ if (getFlag(MachineInstr::NoMerge))
+ OS << "nomerge ";
+
+ // Print the opcode name.
+ if (TII)
+ OS << TII->getName(getOpcode());
+ else
+ OS << "UNKNOWN";
+
+ if (SkipOpers)
+ return;
+
+ // Print the rest of the operands.
+ bool FirstOp = true;
+ unsigned AsmDescOp = ~0u;
+ unsigned AsmOpCount = 0;
+
+ if (isInlineAsm() && e >= InlineAsm::MIOp_FirstOperand) {
+ // Print asm string.
+ OS << " ";
+ const unsigned OpIdx = InlineAsm::MIOp_AsmString;
+ LLT TypeToPrint = MRI ? getTypeToPrint(OpIdx, PrintedTypes, *MRI) : LLT{};
+ unsigned TiedOperandIdx = getTiedOperandIdx(OpIdx);
+ getOperand(OpIdx).print(OS, MST, TypeToPrint, OpIdx, /*PrintDef=*/true, IsStandalone,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI,
+ IntrinsicInfo);
+
+ // Print HasSideEffects, MayLoad, MayStore, IsAlignStack
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
+ OS << " [sideeffect]";
+ if (ExtraInfo & InlineAsm::Extra_MayLoad)
+ OS << " [mayload]";
+ if (ExtraInfo & InlineAsm::Extra_MayStore)
+ OS << " [maystore]";
+ if (ExtraInfo & InlineAsm::Extra_IsConvergent)
+ OS << " [isconvergent]";
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ OS << " [alignstack]";
+ if (getInlineAsmDialect() == InlineAsm::AD_ATT)
+ OS << " [attdialect]";
+ if (getInlineAsmDialect() == InlineAsm::AD_Intel)
+ OS << " [inteldialect]";
+
+ StartOp = AsmDescOp = InlineAsm::MIOp_FirstOperand;
+ FirstOp = false;
+ }
+
+ for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+
+ if (FirstOp) FirstOp = false; else OS << ",";
+ OS << " ";
+
+ if (isDebugValueLike() && MO.isMetadata()) {
+ // Pretty print DBG_VALUE* instructions.
+ auto *DIV = dyn_cast<DILocalVariable>(MO.getMetadata());
+ if (DIV && !DIV->getName().empty())
+ OS << "!\"" << DIV->getName() << '\"';
+ else {
+ LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{};
+ unsigned TiedOperandIdx = getTiedOperandIdx(i);
+ MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
+ }
+ } else if (isDebugLabel() && MO.isMetadata()) {
+ // Pretty print DBG_LABEL instructions.
+ auto *DIL = dyn_cast<DILabel>(MO.getMetadata());
+ if (DIL && !DIL->getName().empty())
+ OS << "\"" << DIL->getName() << '\"';
+ else {
+ LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{};
+ unsigned TiedOperandIdx = getTiedOperandIdx(i);
+ MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
+ }
+ } else if (i == AsmDescOp && MO.isImm()) {
+ // Pretty print the inline asm operand descriptor.
+ OS << '$' << AsmOpCount++;
+ unsigned Flag = MO.getImm();
+ OS << ":[";
+ OS << InlineAsm::getKindName(InlineAsm::getKind(Flag));
+
+ unsigned RCID = 0;
+ if (!InlineAsm::isImmKind(Flag) && !InlineAsm::isMemKind(Flag) &&
+ InlineAsm::hasRegClassConstraint(Flag, RCID)) {
+ if (TRI) {
+ OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID));
+ } else
+ OS << ":RC" << RCID;
+ }
+
+ if (InlineAsm::isMemKind(Flag)) {
+ unsigned MCID = InlineAsm::getMemoryConstraintID(Flag);
+ OS << ":" << InlineAsm::getMemConstraintName(MCID);
+ }
+
+ unsigned TiedTo = 0;
+ if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo))
+ OS << " tiedto:$" << TiedTo;
+
+ OS << ']';
+
+ // Compute the index of the next operand descriptor.
+ AsmDescOp += 1 + InlineAsm::getNumOperandRegisters(Flag);
+ } else {
+ LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{};
+ unsigned TiedOperandIdx = getTiedOperandIdx(i);
+ if (MO.isImm() && isOperandSubregIdx(i))
+ MachineOperand::printSubRegIdx(OS, MO.getImm(), TRI);
+ else
+ MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone,
+ ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
+ }
+ }
+
+ // Print any optional symbols attached to this instruction as-if they were
+ // operands.
+ if (MCSymbol *PreInstrSymbol = getPreInstrSymbol()) {
+ if (!FirstOp) {
+ FirstOp = false;
+ OS << ',';
+ }
+ OS << " pre-instr-symbol ";
+ MachineOperand::printSymbol(OS, *PreInstrSymbol);
+ }
+ if (MCSymbol *PostInstrSymbol = getPostInstrSymbol()) {
+ if (!FirstOp) {
+ FirstOp = false;
+ OS << ',';
+ }
+ OS << " post-instr-symbol ";
+ MachineOperand::printSymbol(OS, *PostInstrSymbol);
+ }
+ if (MDNode *HeapAllocMarker = getHeapAllocMarker()) {
+ if (!FirstOp) {
+ FirstOp = false;
+ OS << ',';
+ }
+ OS << " heap-alloc-marker ";
+ HeapAllocMarker->printAsOperand(OS, MST);
+ }
+ if (MDNode *PCSections = getPCSections()) {
+ if (!FirstOp) {
+ FirstOp = false;
+ OS << ',';
+ }
+ OS << " pcsections ";
+ PCSections->printAsOperand(OS, MST);
+ }
+ if (uint32_t CFIType = getCFIType()) {
+ if (!FirstOp)
+ OS << ',';
+ OS << " cfi-type " << CFIType;
+ }
+
+ if (DebugInstrNum) {
+ if (!FirstOp)
+ OS << ",";
+ OS << " debug-instr-number " << DebugInstrNum;
+ }
+
+ if (!SkipDebugLoc) {
+ if (const DebugLoc &DL = getDebugLoc()) {
+ if (!FirstOp)
+ OS << ',';
+ OS << " debug-location ";
+ DL->printAsOperand(OS, MST);
+ }
+ }
+
+ if (!memoperands_empty()) {
+ SmallVector<StringRef, 0> SSNs;
+ const LLVMContext *Context = nullptr;
+ std::unique_ptr<LLVMContext> CtxPtr;
+ const MachineFrameInfo *MFI = nullptr;
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ MFI = &MF->getFrameInfo();
+ Context = &MF->getFunction().getContext();
+ } else {
+ CtxPtr = std::make_unique<LLVMContext>();
+ Context = CtxPtr.get();
+ }
+
+ OS << " :: ";
+ bool NeedComma = false;
+ for (const MachineMemOperand *Op : memoperands()) {
+ if (NeedComma)
+ OS << ", ";
+ Op->print(OS, MST, SSNs, *Context, MFI, TII);
+ NeedComma = true;
+ }
+ }
+
+ if (SkipDebugLoc)
+ return;
+
+ bool HaveSemi = false;
+
+ // Print debug location information.
+ if (const DebugLoc &DL = getDebugLoc()) {
+ if (!HaveSemi) {
+ OS << ';';
+ HaveSemi = true;
+ }
+ OS << ' ';
+ DL.print(OS);
+ }
+
+ // Print extra comments for DEBUG_VALUE.
+ if (isDebugValueLike() && getDebugVariableOp().isMetadata()) {
+ if (!HaveSemi) {
+ OS << ";";
+ HaveSemi = true;
+ }
+ auto *DV = getDebugVariable();
+ OS << " line no:" << DV->getLine();
+ if (isIndirectDebugValue())
+ OS << " indirect";
+ }
+ // TODO: DBG_LABEL
+
+ if (AddNewLine)
+ OS << '\n';
+}
+
+bool MachineInstr::addRegisterKilled(Register IncomingReg,
+ const TargetRegisterInfo *RegInfo,
+ bool AddIfNotFound) {
+ bool isPhysReg = IncomingReg.isPhysical();
+ bool hasAliases = isPhysReg &&
+ MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
+ bool Found = false;
+ SmallVector<unsigned,4> DeadOps;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || MO.isUndef())
+ continue;
+
+ // DEBUG_VALUE nodes do not contribute to code generation and should
+ // always be ignored. Failure to do so may result in trying to modify
+ // KILL flags on DEBUG_VALUE nodes.
+ if (MO.isDebug())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ if (Reg == IncomingReg) {
+ if (!Found) {
+ if (MO.isKill())
+ // The register is already marked kill.
+ return true;
+ if (isPhysReg && isRegTiedToDefOperand(i))
+ // Two-address uses of physregs must not be marked kill.
+ return true;
+ MO.setIsKill();
+ Found = true;
+ }
+ } else if (hasAliases && MO.isKill() && Reg.isPhysical()) {
+ // A super-register kill already exists.
+ if (RegInfo->isSuperRegister(IncomingReg, Reg))
+ return true;
+ if (RegInfo->isSubRegister(IncomingReg, Reg))
+ DeadOps.push_back(i);
+ }
+ }
+
+ // Trim unneeded kill operands.
+ while (!DeadOps.empty()) {
+ unsigned OpIdx = DeadOps.back();
+ if (getOperand(OpIdx).isImplicit() &&
+ (!isInlineAsm() || findInlineAsmFlagIdx(OpIdx) < 0))
+ removeOperand(OpIdx);
+ else
+ getOperand(OpIdx).setIsKill(false);
+ DeadOps.pop_back();
+ }
+
+ // If not found, this means an alias of one of the operands is killed. Add a
+ // new implicit operand if required.
+ if (!Found && AddIfNotFound) {
+ addOperand(MachineOperand::CreateReg(IncomingReg,
+ false /*IsDef*/,
+ true /*IsImp*/,
+ true /*IsKill*/));
+ return true;
+ }
+ return Found;
+}
+
+void MachineInstr::clearRegisterKills(Register Reg,
+ const TargetRegisterInfo *RegInfo) {
+ if (!Reg.isPhysical())
+ RegInfo = nullptr;
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || !MO.isUse() || !MO.isKill())
+ continue;
+ Register OpReg = MO.getReg();
+ if ((RegInfo && RegInfo->regsOverlap(Reg, OpReg)) || Reg == OpReg)
+ MO.setIsKill(false);
+ }
+}
+
+bool MachineInstr::addRegisterDead(Register Reg,
+ const TargetRegisterInfo *RegInfo,
+ bool AddIfNotFound) {
+ bool isPhysReg = Reg.isPhysical();
+ bool hasAliases = isPhysReg &&
+ MCRegAliasIterator(Reg, RegInfo, false).isValid();
+ bool Found = false;
+ SmallVector<unsigned,4> DeadOps;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+
+ if (MOReg == Reg) {
+ MO.setIsDead();
+ Found = true;
+ } else if (hasAliases && MO.isDead() && MOReg.isPhysical()) {
+ // There exists a super-register that's marked dead.
+ if (RegInfo->isSuperRegister(Reg, MOReg))
+ return true;
+ if (RegInfo->isSubRegister(Reg, MOReg))
+ DeadOps.push_back(i);
+ }
+ }
+
+ // Trim unneeded dead operands.
+ while (!DeadOps.empty()) {
+ unsigned OpIdx = DeadOps.back();
+ if (getOperand(OpIdx).isImplicit() &&
+ (!isInlineAsm() || findInlineAsmFlagIdx(OpIdx) < 0))
+ removeOperand(OpIdx);
+ else
+ getOperand(OpIdx).setIsDead(false);
+ DeadOps.pop_back();
+ }
+
+ // If not found, this means an alias of one of the operands is dead. Add a
+ // new implicit operand if required.
+ if (Found || !AddIfNotFound)
+ return Found;
+
+ addOperand(MachineOperand::CreateReg(Reg,
+ true /*IsDef*/,
+ true /*IsImp*/,
+ false /*IsKill*/,
+ true /*IsDead*/));
+ return true;
+}
+
+void MachineInstr::clearRegisterDeads(Register Reg) {
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
+ continue;
+ MO.setIsDead(false);
+ }
+}
+
+void MachineInstr::setRegisterDefReadUndef(Register Reg, bool IsUndef) {
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0)
+ continue;
+ MO.setIsUndef(IsUndef);
+ }
+}
+
+void MachineInstr::addRegisterDefined(Register Reg,
+ const TargetRegisterInfo *RegInfo) {
+ if (Reg.isPhysical()) {
+ MachineOperand *MO = findRegisterDefOperand(Reg, false, false, RegInfo);
+ if (MO)
+ return;
+ } else {
+ for (const MachineOperand &MO : operands()) {
+ if (MO.isReg() && MO.getReg() == Reg && MO.isDef() &&
+ MO.getSubReg() == 0)
+ return;
+ }
+ }
+ addOperand(MachineOperand::CreateReg(Reg,
+ true /*IsDef*/,
+ true /*IsImp*/));
+}
+
+void MachineInstr::setPhysRegsDeadExcept(ArrayRef<Register> UsedRegs,
+ const TargetRegisterInfo &TRI) {
+ bool HasRegMask = false;
+ for (MachineOperand &MO : operands()) {
+ if (MO.isRegMask()) {
+ HasRegMask = true;
+ continue;
+ }
+ if (!MO.isReg() || !MO.isDef()) continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isPhysical())
+ continue;
+ // If there are no uses, including partial uses, the def is dead.
+ if (llvm::none_of(UsedRegs,
+ [&](MCRegister Use) { return TRI.regsOverlap(Use, Reg); }))
+ MO.setIsDead();
+ }
+
+ // This is a call with a register mask operand.
+ // Mask clobbers are always dead, so add defs for the non-dead defines.
+ if (HasRegMask)
+ for (const Register &UsedReg : UsedRegs)
+ addRegisterDefined(UsedReg, &TRI);
+}
+
+unsigned
+MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) {
+ // Build up a buffer of hash code components.
+ SmallVector<size_t, 16> HashComponents;
+ HashComponents.reserve(MI->getNumOperands() + 1);
+ HashComponents.push_back(MI->getOpcode());
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg().isVirtual())
+ continue; // Skip virtual register defs.
+
+ HashComponents.push_back(hash_value(MO));
+ }
+ return hash_combine_range(HashComponents.begin(), HashComponents.end());
+}
+
+void MachineInstr::emitError(StringRef Msg) const {
+ // Find the source location cookie.
+ uint64_t LocCookie = 0;
+ const MDNode *LocMD = nullptr;
+ for (unsigned i = getNumOperands(); i != 0; --i) {
+ if (getOperand(i-1).isMetadata() &&
+ (LocMD = getOperand(i-1).getMetadata()) &&
+ LocMD->getNumOperands() != 0) {
+ if (const ConstantInt *CI =
+ mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
+ LocCookie = CI->getZExtValue();
+ break;
+ }
+ }
+ }
+
+ if (const MachineBasicBlock *MBB = getParent())
+ if (const MachineFunction *MF = MBB->getParent())
+ return MF->getMMI().getModule()->getContext().emitError(LocCookie, Msg);
+ report_fatal_error(Msg);
+}
+
+MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
+ const MCInstrDesc &MCID, bool IsIndirect,
+ Register Reg, const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ auto MIB = BuildMI(MF, DL, MCID).addReg(Reg);
+ if (IsIndirect)
+ MIB.addImm(0U);
+ else
+ MIB.addReg(0U);
+ return MIB.addMetadata(Variable).addMetadata(Expr);
+}
+
+MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
+ const MCInstrDesc &MCID, bool IsIndirect,
+ ArrayRef<MachineOperand> DebugOps,
+ const MDNode *Variable, const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ if (MCID.Opcode == TargetOpcode::DBG_VALUE) {
+ assert(DebugOps.size() == 1 &&
+ "DBG_VALUE must contain exactly one debug operand");
+ MachineOperand DebugOp = DebugOps[0];
+ if (DebugOp.isReg())
+ return BuildMI(MF, DL, MCID, IsIndirect, DebugOp.getReg(), Variable,
+ Expr);
+
+ auto MIB = BuildMI(MF, DL, MCID).add(DebugOp);
+ if (IsIndirect)
+ MIB.addImm(0U);
+ else
+ MIB.addReg(0U);
+ return MIB.addMetadata(Variable).addMetadata(Expr);
+ }
+
+ auto MIB = BuildMI(MF, DL, MCID);
+ MIB.addMetadata(Variable).addMetadata(Expr);
+ for (const MachineOperand &DebugOp : DebugOps)
+ if (DebugOp.isReg())
+ MIB.addReg(DebugOp.getReg());
+ else
+ MIB.add(DebugOp);
+ return MIB;
+}
+
+MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, const MCInstrDesc &MCID,
+ bool IsIndirect, Register Reg,
+ const MDNode *Variable, const MDNode *Expr) {
+ MachineFunction &MF = *BB.getParent();
+ MachineInstr *MI = BuildMI(MF, DL, MCID, IsIndirect, Reg, Variable, Expr);
+ BB.insert(I, MI);
+ return MachineInstrBuilder(MF, MI);
+}
+
+MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, const MCInstrDesc &MCID,
+ bool IsIndirect,
+ ArrayRef<MachineOperand> DebugOps,
+ const MDNode *Variable, const MDNode *Expr) {
+ MachineFunction &MF = *BB.getParent();
+ MachineInstr *MI =
+ BuildMI(MF, DL, MCID, IsIndirect, DebugOps, Variable, Expr);
+ BB.insert(I, MI);
+ return MachineInstrBuilder(MF, *MI);
+}
+
+/// Compute the new DIExpression to use with a DBG_VALUE for a spill slot.
+/// This prepends DW_OP_deref when spilling an indirect DBG_VALUE.
+static const DIExpression *
+computeExprForSpill(const MachineInstr &MI,
+ SmallVectorImpl<const MachineOperand *> &SpilledOperands) {
+ assert(MI.getDebugVariable()->isValidLocationForIntrinsic(MI.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+
+ const DIExpression *Expr = MI.getDebugExpression();
+ if (MI.isIndirectDebugValue()) {
+ assert(MI.getDebugOffset().getImm() == 0 &&
+ "DBG_VALUE with nonzero offset");
+ Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
+ } else if (MI.isDebugValueList()) {
+ // We will replace the spilled register with a frame index, so
+ // immediately deref all references to the spilled register.
+ std::array<uint64_t, 1> Ops{{dwarf::DW_OP_deref}};
+ for (const MachineOperand *Op : SpilledOperands) {
+ unsigned OpIdx = MI.getDebugOperandIndex(Op);
+ Expr = DIExpression::appendOpsToArg(Expr, Ops, OpIdx);
+ }
+ }
+ return Expr;
+}
+static const DIExpression *computeExprForSpill(const MachineInstr &MI,
+ Register SpillReg) {
+ assert(MI.hasDebugOperandForReg(SpillReg) && "Spill Reg is not used in MI.");
+ SmallVector<const MachineOperand *> SpillOperands;
+ for (const MachineOperand &Op : MI.getDebugOperandsForReg(SpillReg))
+ SpillOperands.push_back(&Op);
+ return computeExprForSpill(MI, SpillOperands);
+}
+
+MachineInstr *llvm::buildDbgValueForSpill(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ const MachineInstr &Orig,
+ int FrameIndex, Register SpillReg) {
+ assert(!Orig.isDebugRef() &&
+ "DBG_INSTR_REF should not reference a virtual register.");
+ const DIExpression *Expr = computeExprForSpill(Orig, SpillReg);
+ MachineInstrBuilder NewMI =
+ BuildMI(BB, I, Orig.getDebugLoc(), Orig.getDesc());
+ // Non-Variadic Operands: Location, Offset, Variable, Expression
+ // Variadic Operands: Variable, Expression, Locations...
+ if (Orig.isNonListDebugValue())
+ NewMI.addFrameIndex(FrameIndex).addImm(0U);
+ NewMI.addMetadata(Orig.getDebugVariable()).addMetadata(Expr);
+ if (Orig.isDebugValueList()) {
+ for (const MachineOperand &Op : Orig.debug_operands())
+ if (Op.isReg() && Op.getReg() == SpillReg)
+ NewMI.addFrameIndex(FrameIndex);
+ else
+ NewMI.add(MachineOperand(Op));
+ }
+ return NewMI;
+}
+MachineInstr *llvm::buildDbgValueForSpill(
+ MachineBasicBlock &BB, MachineBasicBlock::iterator I,
+ const MachineInstr &Orig, int FrameIndex,
+ SmallVectorImpl<const MachineOperand *> &SpilledOperands) {
+ const DIExpression *Expr = computeExprForSpill(Orig, SpilledOperands);
+ MachineInstrBuilder NewMI =
+ BuildMI(BB, I, Orig.getDebugLoc(), Orig.getDesc());
+ // Non-Variadic Operands: Location, Offset, Variable, Expression
+ // Variadic Operands: Variable, Expression, Locations...
+ if (Orig.isNonListDebugValue())
+ NewMI.addFrameIndex(FrameIndex).addImm(0U);
+ NewMI.addMetadata(Orig.getDebugVariable()).addMetadata(Expr);
+ if (Orig.isDebugValueList()) {
+ for (const MachineOperand &Op : Orig.debug_operands())
+ if (is_contained(SpilledOperands, &Op))
+ NewMI.addFrameIndex(FrameIndex);
+ else
+ NewMI.add(MachineOperand(Op));
+ }
+ return NewMI;
+}
+
+void llvm::updateDbgValueForSpill(MachineInstr &Orig, int FrameIndex,
+ Register Reg) {
+ const DIExpression *Expr = computeExprForSpill(Orig, Reg);
+ if (Orig.isNonListDebugValue())
+ Orig.getDebugOffset().ChangeToImmediate(0U);
+ for (MachineOperand &Op : Orig.getDebugOperandsForReg(Reg))
+ Op.ChangeToFrameIndex(FrameIndex);
+ Orig.getDebugExpressionOp().setMetadata(Expr);
+}
+
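+/// Collect the DBG_VALUE instructions that immediately follow this instruction
+/// and refer to the register defined by its first operand, stopping at the
+/// first non-DBG_VALUE instruction.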
+void MachineInstr::collectDebugValues(
+ SmallVectorImpl<MachineInstr *> &DbgValues) {
+ MachineInstr &MI = *this;
+ if (!MI.getOperand(0).isReg())
+ return;
+
+ MachineBasicBlock::iterator DI = MI; ++DI;
+ for (MachineBasicBlock::iterator DE = MI.getParent()->end();
+ DI != DE; ++DI) {
+ if (!DI->isDebugValue())
+ return;
+ if (DI->hasDebugOperandForReg(MI.getOperand(0).getReg()))
+ DbgValues.push_back(&*DI);
+ }
+}
+
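+/// Rewrite every DBG_VALUE that refers to the register defined by this
+/// instruction's first operand so that it uses \p Reg instead.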
+void MachineInstr::changeDebugValuesDefReg(Register Reg) {
+ // Collect matching debug values.
+ SmallVector<MachineInstr *, 2> DbgValues;
+
+ if (!getOperand(0).isReg())
+ return;
+
+ Register DefReg = getOperand(0).getReg();
+ auto *MRI = getRegInfo();
+ for (auto &MO : MRI->use_operands(DefReg)) {
+ auto *DI = MO.getParent();
+ if (!DI->isDebugValue())
+ continue;
+ if (DI->hasDebugOperandForReg(DefReg)) {
+ DbgValues.push_back(DI);
+ }
+ }
+
+ // Propagate Reg to debug value instructions.
+ for (auto *DBI : DbgValues)
+ for (MachineOperand &Op : DBI->getDebugOperandsForReg(DefReg))
+ Op.setReg(Reg);
+}
+
+using MMOList = SmallVector<const MachineMemOperand *, 2>;
+
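+/// Sum the sizes of the memory accesses in \p Accesses that target spill-slot
+/// stack objects.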
+static unsigned getSpillSlotSize(const MMOList &Accesses,
+ const MachineFrameInfo &MFI) {
+ unsigned Size = 0;
+ for (const auto *A : Accesses)
+ if (MFI.isSpillSlotObjectIndex(
+ cast<FixedStackPseudoSourceValue>(A->getPseudoValue())
+ ->getFrameIndex()))
+ Size += A->getSize();
+ return Size;
+}
+
+std::optional<unsigned>
+MachineInstr::getSpillSize(const TargetInstrInfo *TII) const {
+ int FI;
+ if (TII->isStoreToStackSlotPostFE(*this, FI)) {
+ const MachineFrameInfo &MFI = getMF()->getFrameInfo();
+ if (MFI.isSpillSlotObjectIndex(FI))
+ return (*memoperands_begin())->getSize();
+ }
+ return std::nullopt;
+}
+
+std::optional<unsigned>
+MachineInstr::getFoldedSpillSize(const TargetInstrInfo *TII) const {
+ MMOList Accesses;
+ if (TII->hasStoreToStackSlot(*this, Accesses))
+ return getSpillSlotSize(Accesses, getMF()->getFrameInfo());
+ return std::nullopt;
+}
+
+std::optional<unsigned>
+MachineInstr::getRestoreSize(const TargetInstrInfo *TII) const {
+ int FI;
+ if (TII->isLoadFromStackSlotPostFE(*this, FI)) {
+ const MachineFrameInfo &MFI = getMF()->getFrameInfo();
+ if (MFI.isSpillSlotObjectIndex(FI))
+ return (*memoperands_begin())->getSize();
+ }
+ return std::nullopt;
+}
+
+std::optional<unsigned>
+MachineInstr::getFoldedRestoreSize(const TargetInstrInfo *TII) const {
+ MMOList Accesses;
+ if (TII->hasLoadFromStackSlot(*this, Accesses))
+ return getSpillSlotSize(Accesses, getMF()->getFrameInfo());
+ return std::nullopt;
+}
+
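+// Lazily assign a function-unique debug instruction number the first time one
+// is requested for this instruction.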
+unsigned MachineInstr::getDebugInstrNum() {
+ if (DebugInstrNum == 0)
+ DebugInstrNum = getParent()->getParent()->getNewDebugInstrNum();
+ return DebugInstrNum;
+}
+
+unsigned MachineInstr::getDebugInstrNum(MachineFunction &MF) {
+ if (DebugInstrNum == 0)
+ DebugInstrNum = MF.getNewDebugInstrNum();
+ return DebugInstrNum;
+}
+
+std::tuple<LLT, LLT> MachineInstr::getFirst2LLTs() const {
+ return std::tuple(getRegInfo()->getType(getOperand(0).getReg()),
+ getRegInfo()->getType(getOperand(1).getReg()));
+}
+
+std::tuple<LLT, LLT, LLT> MachineInstr::getFirst3LLTs() const {
+ return std::tuple(getRegInfo()->getType(getOperand(0).getReg()),
+ getRegInfo()->getType(getOperand(1).getReg()),
+ getRegInfo()->getType(getOperand(2).getReg()));
+}
+
+std::tuple<LLT, LLT, LLT, LLT> MachineInstr::getFirst4LLTs() const {
+ return std::tuple(getRegInfo()->getType(getOperand(0).getReg()),
+ getRegInfo()->getType(getOperand(1).getReg()),
+ getRegInfo()->getType(getOperand(2).getReg()),
+ getRegInfo()->getType(getOperand(3).getReg()));
+}
+
+std::tuple<LLT, LLT, LLT, LLT, LLT> MachineInstr::getFirst5LLTs() const {
+ return std::tuple(getRegInfo()->getType(getOperand(0).getReg()),
+ getRegInfo()->getType(getOperand(1).getReg()),
+ getRegInfo()->getType(getOperand(2).getReg()),
+ getRegInfo()->getType(getOperand(3).getReg()),
+ getRegInfo()->getType(getOperand(4).getReg()));
+}
+
+std::tuple<Register, LLT, Register, LLT>
+MachineInstr::getFirst2RegLLTs() const {
+ Register Reg0 = getOperand(0).getReg();
+ Register Reg1 = getOperand(1).getReg();
+ return std::tuple(Reg0, getRegInfo()->getType(Reg0), Reg1,
+ getRegInfo()->getType(Reg1));
+}
+
+std::tuple<Register, LLT, Register, LLT, Register, LLT>
+MachineInstr::getFirst3RegLLTs() const {
+ Register Reg0 = getOperand(0).getReg();
+ Register Reg1 = getOperand(1).getReg();
+ Register Reg2 = getOperand(2).getReg();
+ return std::tuple(Reg0, getRegInfo()->getType(Reg0), Reg1,
+ getRegInfo()->getType(Reg1), Reg2,
+ getRegInfo()->getType(Reg2));
+}
+
+std::tuple<Register, LLT, Register, LLT, Register, LLT, Register, LLT>
+MachineInstr::getFirst4RegLLTs() const {
+ Register Reg0 = getOperand(0).getReg();
+ Register Reg1 = getOperand(1).getReg();
+ Register Reg2 = getOperand(2).getReg();
+ Register Reg3 = getOperand(3).getReg();
+ return std::tuple(
+ Reg0, getRegInfo()->getType(Reg0), Reg1, getRegInfo()->getType(Reg1),
+ Reg2, getRegInfo()->getType(Reg2), Reg3, getRegInfo()->getType(Reg3));
+}
+
+std::tuple<Register, LLT, Register, LLT, Register, LLT, Register, LLT, Register,
+ LLT>
+MachineInstr::getFirst5RegLLTs() const {
+ Register Reg0 = getOperand(0).getReg();
+ Register Reg1 = getOperand(1).getReg();
+ Register Reg2 = getOperand(2).getReg();
+ Register Reg3 = getOperand(3).getReg();
+ Register Reg4 = getOperand(4).getReg();
+ return std::tuple(
+ Reg0, getRegInfo()->getType(Reg0), Reg1, getRegInfo()->getType(Reg1),
+ Reg2, getRegInfo()->getType(Reg2), Reg3, getRegInfo()->getType(Reg3),
+ Reg4, getRegInfo()->getType(Reg4));
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineInstrBundle.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineInstrBundle.cpp
new file mode 100644
index 000000000000..b9db34f7be95
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -0,0 +1,387 @@
+//===-- lib/CodeGen/MachineInstrBundle.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include <utility>
+using namespace llvm;
+
+namespace {
+ class UnpackMachineBundles : public MachineFunctionPass {
+ public:
+ static char ID; // Pass identification
+ UnpackMachineBundles(
+ std::function<bool(const MachineFunction &)> Ftor = nullptr)
+ : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+ initializeUnpackMachineBundlesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ std::function<bool(const MachineFunction &)> PredicateFtor;
+ };
+} // end anonymous namespace
+
+char UnpackMachineBundles::ID = 0;
+char &llvm::UnpackMachineBundlesID = UnpackMachineBundles::ID;
+INITIALIZE_PASS(UnpackMachineBundles, "unpack-mi-bundles",
+ "Unpack machine instruction bundles", false, false)
+
+bool UnpackMachineBundles::runOnMachineFunction(MachineFunction &MF) {
+ if (PredicateFtor && !PredicateFtor(MF))
+ return false;
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),
+ MIE = MBB.instr_end(); MII != MIE; ) {
+ MachineInstr *MI = &*MII;
+
+ // Remove BUNDLE instruction and the InsideBundle flags from bundled
+ // instructions.
+ if (MI->isBundle()) {
+ while (++MII != MIE && MII->isBundledWithPred()) {
+ MII->unbundleFromPred();
+ for (MachineOperand &MO : MII->operands()) {
+ if (MO.isReg() && MO.isInternalRead())
+ MO.setIsInternalRead(false);
+ }
+ }
+ MI->eraseFromParent();
+
+ Changed = true;
+ continue;
+ }
+
+ ++MII;
+ }
+ }
+
+ return Changed;
+}
+
+FunctionPass *
+llvm::createUnpackMachineBundles(
+ std::function<bool(const MachineFunction &)> Ftor) {
+ return new UnpackMachineBundles(std::move(Ftor));
+}
+
+namespace {
+ class FinalizeMachineBundles : public MachineFunctionPass {
+ public:
+ static char ID; // Pass identification
+ FinalizeMachineBundles() : MachineFunctionPass(ID) {
+ initializeFinalizeMachineBundlesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ };
+} // end anonymous namespace
+
+char FinalizeMachineBundles::ID = 0;
+char &llvm::FinalizeMachineBundlesID = FinalizeMachineBundles::ID;
+INITIALIZE_PASS(FinalizeMachineBundles, "finalize-mi-bundles",
+ "Finalize machine instruction bundles", false, false)
+
+bool FinalizeMachineBundles::runOnMachineFunction(MachineFunction &MF) {
+ return llvm::finalizeBundles(MF);
+}
+
+/// Return the first found DebugLoc that has a DILocation, given a range of
+/// instructions. The search range is from FirstMI to LastMI (exclusive). If no
+/// DILocation is found, then an empty location is returned.
+static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI,
+ MachineBasicBlock::instr_iterator LastMI) {
+ for (auto MII = FirstMI; MII != LastMI; ++MII)
+ if (MII->getDebugLoc())
+ return MII->getDebugLoc();
+ return DebugLoc();
+}
+
+/// finalizeBundle - Finalize a machine instruction bundle which includes
+/// a sequence of instructions starting from FirstMI to LastMI (exclusive).
+/// This routine adds a BUNDLE instruction to represent the bundle, adds
+/// IsInternalRead markers to MachineOperands that are defined inside the
+/// bundle, and copies externally visible defs and uses to the BUNDLE
+/// instruction.
+void llvm::finalizeBundle(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator FirstMI,
+ MachineBasicBlock::instr_iterator LastMI) {
+ assert(FirstMI != LastMI && "Empty bundle?");
+ MIBundleBuilder Bundle(MBB, FirstMI, LastMI);
+
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, getDebugLoc(FirstMI, LastMI), TII->get(TargetOpcode::BUNDLE));
+ Bundle.prepend(MIB);
+
+ SmallVector<Register, 32> LocalDefs;
+ SmallSet<Register, 32> LocalDefSet;
+ SmallSet<Register, 8> DeadDefSet;
+ SmallSet<Register, 16> KilledDefSet;
+ SmallVector<Register, 8> ExternUses;
+ SmallSet<Register, 8> ExternUseSet;
+ SmallSet<Register, 8> KilledUseSet;
+ SmallSet<Register, 8> UndefUseSet;
+ SmallVector<MachineOperand*, 4> Defs;
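+ // For each instruction, uses are classified first and its defs are only
+ // recorded afterwards, so a use is matched against defs from earlier
+ // instructions in the bundle.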
+ for (auto MII = FirstMI; MII != LastMI; ++MII) {
+ // Debug instructions have no effects to track.
+ if (MII->isDebugInstr())
+ continue;
+
+ for (MachineOperand &MO : MII->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef()) {
+ Defs.push_back(&MO);
+ continue;
+ }
+
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ if (LocalDefSet.count(Reg)) {
+ MO.setIsInternalRead();
+ if (MO.isKill())
+ // Internal def is now killed.
+ KilledDefSet.insert(Reg);
+ } else {
+ if (ExternUseSet.insert(Reg).second) {
+ ExternUses.push_back(Reg);
+ if (MO.isUndef())
+ UndefUseSet.insert(Reg);
+ }
+ if (MO.isKill())
+ // The externally defined value is killed by this use.
+ KilledUseSet.insert(Reg);
+ }
+ }
+
+ for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+ MachineOperand &MO = *Defs[i];
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ if (LocalDefSet.insert(Reg).second) {
+ LocalDefs.push_back(Reg);
+ if (MO.isDead()) {
+ DeadDefSet.insert(Reg);
+ }
+ } else {
+ // Re-defined inside the bundle, it's no longer killed.
+ KilledDefSet.erase(Reg);
+ if (!MO.isDead())
+ // Previously defined but dead.
+ DeadDefSet.erase(Reg);
+ }
+
+ if (!MO.isDead() && Reg.isPhysical()) {
+ for (MCPhysReg SubReg : TRI->subregs(Reg)) {
+ if (LocalDefSet.insert(SubReg).second)
+ LocalDefs.push_back(SubReg);
+ }
+ }
+ }
+
+ Defs.clear();
+ }
+
+ SmallSet<Register, 32> Added;
+ for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+ Register Reg = LocalDefs[i];
+ if (Added.insert(Reg).second) {
+ // If it's not live beyond end of the bundle, mark it dead.
+ bool isDead = DeadDefSet.count(Reg) || KilledDefSet.count(Reg);
+ MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) |
+ getImplRegState(true));
+ }
+ }
+
+ for (unsigned i = 0, e = ExternUses.size(); i != e; ++i) {
+ Register Reg = ExternUses[i];
+ bool isKill = KilledUseSet.count(Reg);
+ bool isUndef = UndefUseSet.count(Reg);
+ MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) |
+ getImplRegState(true));
+ }
+
+ // Set FrameSetup/FrameDestroy for the bundle. If any of the bundled
+ // instructions has the flag set, set it on the BUNDLE instruction as well.
+ for (auto MII = FirstMI; MII != LastMI; ++MII) {
+ if (MII->getFlag(MachineInstr::FrameSetup))
+ MIB.setMIFlag(MachineInstr::FrameSetup);
+ if (MII->getFlag(MachineInstr::FrameDestroy))
+ MIB.setMIFlag(MachineInstr::FrameDestroy);
+ }
+}
+
+/// finalizeBundle - Same functionality as the previous finalizeBundle except
+/// the last instruction in the bundle is not provided as an input. This is
+/// used in cases where bundles are pre-determined by marking instructions
+/// with 'InsideBundle' marker. It returns the MBB instruction iterator that
+/// points to the end of the bundle.
+MachineBasicBlock::instr_iterator
+llvm::finalizeBundle(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator FirstMI) {
+ MachineBasicBlock::instr_iterator E = MBB.instr_end();
+ MachineBasicBlock::instr_iterator LastMI = std::next(FirstMI);
+ while (LastMI != E && LastMI->isInsideBundle())
+ ++LastMI;
+ finalizeBundle(MBB, FirstMI, LastMI);
+ return LastMI;
+}
+
+/// finalizeBundles - Finalize instruction bundles in the specified
+/// MachineFunction. Return true if any bundles are finalized.
+bool llvm::finalizeBundles(MachineFunction &MF) {
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::instr_iterator MII = MBB.instr_begin();
+ MachineBasicBlock::instr_iterator MIE = MBB.instr_end();
+ if (MII == MIE)
+ continue;
+ assert(!MII->isInsideBundle() &&
+ "First instr cannot be inside bundle before finalization!");
+
+ for (++MII; MII != MIE; ) {
+ if (!MII->isInsideBundle())
+ ++MII;
+ else {
+ MII = finalizeBundle(MBB, std::prev(MII));
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+VirtRegInfo llvm::AnalyzeVirtRegInBundle(
+ MachineInstr &MI, Register Reg,
+ SmallVectorImpl<std::pair<MachineInstr *, unsigned>> *Ops) {
+ VirtRegInfo RI = {false, false, false};
+ for (MIBundleOperands O(MI); O.isValid(); ++O) {
+ MachineOperand &MO = *O;
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+
+ // Remember each (MI, OpNo) that refers to Reg.
+ if (Ops)
+ Ops->push_back(std::make_pair(MO.getParent(), O.getOperandNo()));
+
+ // Both defs and uses can read virtual registers.
+ if (MO.readsReg()) {
+ RI.Reads = true;
+ if (MO.isDef())
+ RI.Tied = true;
+ }
+
+ // Only defs can write.
+ if (MO.isDef())
+ RI.Writes = true;
+ else if (!RI.Tied &&
+ MO.getParent()->isRegTiedToDefOperand(O.getOperandNo()))
+ RI.Tied = true;
+ }
+ return RI;
+}
+
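+/// Return the lane masks of \p Reg that are read (first element) and written
+/// (second element) by the operands of the bundle starting at \p MI.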
+std::pair<LaneBitmask, LaneBitmask>
+llvm::AnalyzeVirtRegLanesInBundle(const MachineInstr &MI, Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
+
+ LaneBitmask UseMask, DefMask;
+
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ const MachineOperand &MO = *O;
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg == 0 && MO.isUse() && !MO.isUndef())
+ UseMask |= MRI.getMaxLaneMaskForVReg(Reg);
+
+ LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
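+ // A subregister def that is not marked 'undef' implicitly reads the lanes it
+ // does not define; a non-undef use reads the lanes of its subregister index.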
+ if (MO.isDef()) {
+ if (!MO.isUndef())
+ UseMask |= ~SubRegMask;
+ DefMask |= SubRegMask;
+ } else if (!MO.isUndef())
+ UseMask |= SubRegMask;
+ }
+
+ return {UseMask, DefMask};
+}
+
+PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, Register Reg,
+ const TargetRegisterInfo *TRI) {
+ bool AllDefsDead = true;
+ PhysRegInfo PRI = {false, false, false, false, false, false, false, false};
+
+ assert(Reg.isPhysical() && "analyzePhysReg not given a physical register!");
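+ // Walk every operand in the bundle and record how it reads, defines, or
+ // clobbers Reg or any register that overlaps it.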
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ const MachineOperand &MO = *O;
+
+ if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) {
+ PRI.Clobbered = true;
+ continue;
+ }
+
+ if (!MO.isReg())
+ continue;
+
+ Register MOReg = MO.getReg();
+ if (!MOReg || !MOReg.isPhysical())
+ continue;
+
+ if (!TRI->regsOverlap(MOReg, Reg))
+ continue;
+
+ bool Covered = TRI->isSuperRegisterEq(Reg, MOReg);
+ if (MO.readsReg()) {
+ PRI.Read = true;
+ if (Covered) {
+ PRI.FullyRead = true;
+ if (MO.isKill())
+ PRI.Killed = true;
+ }
+ } else if (MO.isDef()) {
+ PRI.Defined = true;
+ if (Covered)
+ PRI.FullyDefined = true;
+ if (!MO.isDead())
+ AllDefsDead = false;
+ }
+ }
+
+ if (AllDefsDead) {
+ if (PRI.FullyDefined || PRI.Clobbered)
+ PRI.DeadDef = true;
+ else if (PRI.Defined)
+ PRI.PartialDeadDef = true;
+ }
+
+ return PRI;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineLICM.cpp
new file mode 100644
index 000000000000..523e077fd9a2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineLICM.cpp
@@ -0,0 +1,1522 @@
+//===- MachineLICM.cpp - Machine Loop Invariant Code Motion Pass ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion on machine instructions. We
+// attempt to remove as much code from the body of a loop as possible.
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for the LLVM-IR-level LICM pass. It is only designed to hoist simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegister.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machinelicm"
+
+static cl::opt<bool>
+AvoidSpeculation("avoid-speculation",
+ cl::desc("MachineLICM should avoid speculation"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+HoistCheapInsts("hoist-cheap-insts",
+ cl::desc("MachineLICM should hoist even cheap instructions"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+HoistConstStores("hoist-const-stores",
+ cl::desc("Hoist invariant stores"),
+ cl::init(true), cl::Hidden);
+// The default threshold of 100 (i.e. if target block is 100 times hotter)
+// is based on empirical data on a single target and is subject to tuning.
+static cl::opt<unsigned>
+BlockFrequencyRatioThreshold("block-freq-ratio-threshold",
+ cl::desc("Do not hoist instructions if the target "
+ "block is N times hotter than the source."),
+ cl::init(100), cl::Hidden);
+
+enum class UseBFI { None, PGO, All };
+
+static cl::opt<UseBFI>
+DisableHoistingToHotterBlocks("disable-hoisting-to-hotter-blocks",
+ cl::desc("Disable hoisting instructions to"
+ " hotter blocks"),
+ cl::init(UseBFI::PGO), cl::Hidden,
+ cl::values(clEnumValN(UseBFI::None, "none",
+ "disable the feature"),
+ clEnumValN(UseBFI::PGO, "pgo",
+ "enable the feature when using profile data"),
+ clEnumValN(UseBFI::All, "all",
+ "enable the feature with or without profile data")));
+
+STATISTIC(NumHoisted,
+ "Number of machine instructions hoisted out of loops");
+STATISTIC(NumLowRP,
+ "Number of instructions hoisted in low reg pressure situation");
+STATISTIC(NumHighLatency,
+ "Number of high latency instructions hoisted");
+STATISTIC(NumCSEed,
+ "Number of hoisted machine instructions CSEed");
+STATISTIC(NumPostRAHoisted,
+ "Number of machine instructions hoisted out of loops post regalloc");
+STATISTIC(NumStoreConst,
+ "Number of stores of const phys reg hoisted out of loops");
+STATISTIC(NumNotHoistedDueToHotness,
+ "Number of instructions not hoisted due to block frequency");
+
+namespace {
+
+ class MachineLICMBase : public MachineFunctionPass {
+ const TargetInstrInfo *TII = nullptr;
+ const TargetLoweringBase *TLI = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const MachineFrameInfo *MFI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ TargetSchedModel SchedModel;
+ bool PreRegAlloc = false;
+ bool HasProfileData = false;
+
+ // Various analyses that we use...
+ AliasAnalysis *AA = nullptr; // Alias analysis info.
+ MachineBlockFrequencyInfo *MBFI = nullptr; // Machine block frequency info
+ MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
+ MachineDominatorTree *DT = nullptr; // Machine dominator tree for the current loop
+
+ // State that is updated as we process loops
+ bool Changed = false; // True if a loop is changed.
+ bool FirstInLoop = false; // True if it's the first LICM in the loop.
+ MachineLoop *CurLoop = nullptr; // The current loop we are working on.
+ MachineBasicBlock *CurPreheader = nullptr; // The preheader for CurLoop.
+
+ // Exit blocks for CurLoop.
+ SmallVector<MachineBasicBlock *, 8> ExitBlocks;
+
+ bool isExitBlock(const MachineBasicBlock *MBB) const {
+ return is_contained(ExitBlocks, MBB);
+ }
+
+ // Track 'estimated' register pressure.
+ SmallSet<Register, 32> RegSeen;
+ SmallVector<unsigned, 8> RegPressure;
+
+ // Register pressure "limit" per register pressure set. If the pressure
+ // is higher than the limit, then it's considered high.
+ SmallVector<unsigned, 8> RegLimit;
+
+ // Register pressure on path leading from loop preheader to current BB.
+ SmallVector<SmallVector<unsigned, 8>, 16> BackTrace;
+
+ // For each opcode, keep a list of potential CSE instructions.
+ DenseMap<unsigned, std::vector<MachineInstr *>> CSEMap;
+
+ enum {
+ SpeculateFalse = 0,
+ SpeculateTrue = 1,
+ SpeculateUnknown = 2
+ };
+
+ // If an MBB does not dominate all loop-exiting blocks, then it may not be
+ // safe to hoist loads from this block.
+ // Tri-state: 0 - false, 1 - true, 2 - unknown
+ unsigned SpeculationState = SpeculateUnknown;
+
+ public:
+ MachineLICMBase(char &PassID, bool PreRegAlloc)
+ : MachineFunctionPass(PassID), PreRegAlloc(PreRegAlloc) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ if (DisableHoistingToHotterBlocks != UseBFI::None)
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ void releaseMemory() override {
+ RegSeen.clear();
+ RegPressure.clear();
+ RegLimit.clear();
+ BackTrace.clear();
+ CSEMap.clear();
+ }
+
+ private:
+ /// Keep track of information about hoisting candidates.
+ struct CandidateInfo {
+ MachineInstr *MI;
+ unsigned Def;
+ int FI;
+
+ CandidateInfo(MachineInstr *mi, unsigned def, int fi)
+ : MI(mi), Def(def), FI(fi) {}
+ };
+
+ void HoistRegionPostRA();
+
+ void HoistPostRA(MachineInstr *MI, unsigned Def);
+
+ void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
+ BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
+ SmallVectorImpl<CandidateInfo> &Candidates);
+
+ void AddToLiveIns(MCRegister Reg);
+
+ bool IsLICMCandidate(MachineInstr &I);
+
+ bool IsLoopInvariantInst(MachineInstr &I);
+
+ bool HasLoopPHIUse(const MachineInstr *MI) const;
+
+ bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx,
+ Register Reg) const;
+
+ bool IsCheapInstruction(MachineInstr &MI) const;
+
+ bool CanCauseHighRegPressure(const DenseMap<unsigned, int> &Cost,
+ bool Cheap);
+
+ void UpdateBackTraceRegPressure(const MachineInstr *MI);
+
+ bool IsProfitableToHoist(MachineInstr &MI);
+
+ bool IsGuaranteedToExecute(MachineBasicBlock *BB);
+
+ bool isTriviallyReMaterializable(const MachineInstr &MI) const;
+
+ void EnterScope(MachineBasicBlock *MBB);
+
+ void ExitScope(MachineBasicBlock *MBB);
+
+ void ExitScopeIfDone(
+ MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren,
+ const DenseMap<MachineDomTreeNode *, MachineDomTreeNode *> &ParentMap);
+
+ void HoistOutOfLoop(MachineDomTreeNode *HeaderN);
+
+ void InitRegPressure(MachineBasicBlock *BB);
+
+ DenseMap<unsigned, int> calcRegisterCost(const MachineInstr *MI,
+ bool ConsiderSeen,
+ bool ConsiderUnseenAsDef);
+
+ void UpdateRegPressure(const MachineInstr *MI,
+ bool ConsiderUnseenAsDef = false);
+
+ MachineInstr *ExtractHoistableLoad(MachineInstr *MI);
+
+ MachineInstr *LookForDuplicate(const MachineInstr *MI,
+ std::vector<MachineInstr *> &PrevMIs);
+
+ bool
+ EliminateCSE(MachineInstr *MI,
+ DenseMap<unsigned, std::vector<MachineInstr *>>::iterator &CI);
+
+ bool MayCSE(MachineInstr *MI);
+
+ bool Hoist(MachineInstr *MI, MachineBasicBlock *Preheader);
+
+ void InitCSEMap(MachineBasicBlock *BB);
+
+ bool isTgtHotterThanSrc(MachineBasicBlock *SrcBlock,
+ MachineBasicBlock *TgtBlock);
+ MachineBasicBlock *getCurPreheader();
+ };
+
+ class MachineLICM : public MachineLICMBase {
+ public:
+ static char ID;
+ MachineLICM() : MachineLICMBase(ID, false) {
+ initializeMachineLICMPass(*PassRegistry::getPassRegistry());
+ }
+ };
+
+ class EarlyMachineLICM : public MachineLICMBase {
+ public:
+ static char ID;
+ EarlyMachineLICM() : MachineLICMBase(ID, true) {
+ initializeEarlyMachineLICMPass(*PassRegistry::getPassRegistry());
+ }
+ };
+
+} // end anonymous namespace
+
+char MachineLICM::ID;
+char EarlyMachineLICM::ID;
+
+char &llvm::MachineLICMID = MachineLICM::ID;
+char &llvm::EarlyMachineLICMID = EarlyMachineLICM::ID;
+
+INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE,
+ "Machine Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE,
+ "Machine Loop Invariant Code Motion", false, false)
+
+INITIALIZE_PASS_BEGIN(EarlyMachineLICM, "early-machinelicm",
+ "Early Machine Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(EarlyMachineLICM, "early-machinelicm",
+ "Early Machine Loop Invariant Code Motion", false, false)
+
+/// Test if the given loop is the outer-most loop that has a unique predecessor.
+static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) {
+ // Check whether this loop even has a unique predecessor.
+ if (!CurLoop->getLoopPredecessor())
+ return false;
+ // Ok, now check to see if any of its outer loops do.
+ for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop())
+ if (L->getLoopPredecessor())
+ return false;
+ // None of them did, so this is the outermost with a unique predecessor.
+ return true;
+}
+
+bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ Changed = FirstInLoop = false;
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ TII = ST.getInstrInfo();
+ TLI = ST.getTargetLowering();
+ TRI = ST.getRegisterInfo();
+ MFI = &MF.getFrameInfo();
+ MRI = &MF.getRegInfo();
+ SchedModel.init(&ST);
+
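+ // Before register allocation the function is still in SSA form, which is how
+ // the pre-RA and post-RA variants of the pass are told apart here.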
+ PreRegAlloc = MRI->isSSA();
+ HasProfileData = MF.getFunction().hasProfileData();
+
+ if (PreRegAlloc)
+ LLVM_DEBUG(dbgs() << "******** Pre-regalloc Machine LICM: ");
+ else
+ LLVM_DEBUG(dbgs() << "******** Post-regalloc Machine LICM: ");
+ LLVM_DEBUG(dbgs() << MF.getName() << " ********\n");
+
+ if (PreRegAlloc) {
+ // Estimate register pressure during pre-regalloc pass.
+ unsigned NumRPS = TRI->getNumRegPressureSets();
+ RegPressure.resize(NumRPS);
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+ RegLimit.resize(NumRPS);
+ for (unsigned i = 0, e = NumRPS; i != e; ++i)
+ RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ }
+
+ // Get our Loop information...
+ if (DisableHoistingToHotterBlocks != UseBFI::None)
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ DT = &getAnalysis<MachineDominatorTree>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ SmallVector<MachineLoop *, 8> Worklist(MLI->begin(), MLI->end());
+ while (!Worklist.empty()) {
+ CurLoop = Worklist.pop_back_val();
+ CurPreheader = nullptr;
+ ExitBlocks.clear();
+
+ // If this is done before regalloc, only visit outer-most preheader-sporting
+ // loops.
+ if (PreRegAlloc && !LoopIsOuterMostWithPredecessor(CurLoop)) {
+ Worklist.append(CurLoop->begin(), CurLoop->end());
+ continue;
+ }
+
+ CurLoop->getExitBlocks(ExitBlocks);
+
+ if (!PreRegAlloc)
+ HoistRegionPostRA();
+ else {
+ // CSEMap is initialized for loop header when the first instruction is
+ // being hoisted.
+ MachineDomTreeNode *N = DT->getNode(CurLoop->getHeader());
+ FirstInLoop = true;
+ HoistOutOfLoop(N);
+ CSEMap.clear();
+ }
+ }
+
+ return Changed;
+}
+
+/// Return true if the instruction stores to the specified frame index.
+static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
+ // Check mayStore before looking at the memory operands so that instructions
+ // with no memory operands (e.g. DBG_VALUEs) do not fall through to the
+ // conservative "stores to every slot" answer below.
+ if (!MI->mayStore())
+ return false;
+ // If we lost memory operands, conservatively assume that the instruction
+ // writes to all slots.
+ if (MI->memoperands_empty())
+ return true;
+ for (const MachineMemOperand *MemOp : MI->memoperands()) {
+ if (!MemOp->isStore() || !MemOp->getPseudoValue())
+ continue;
+ if (const FixedStackPseudoSourceValue *Value =
+ dyn_cast<FixedStackPseudoSourceValue>(MemOp->getPseudoValue())) {
+ if (Value->getFrameIndex() == FI)
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Examine the instruction as a potential LICM candidate. Also gather
+/// register def and frame object update information.
+void MachineLICMBase::ProcessMI(MachineInstr *MI,
+ BitVector &PhysRegDefs,
+ BitVector &PhysRegClobbers,
+ SmallSet<int, 32> &StoredFIs,
+ SmallVectorImpl<CandidateInfo> &Candidates) {
+ bool RuledOut = false;
+ bool HasNonInvariantUse = false;
+ unsigned Def = 0;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isFI()) {
+ // Remember if the instruction stores to the frame index.
+ int FI = MO.getIndex();
+ if (!StoredFIs.count(FI) &&
+ MFI->isSpillSlotObjectIndex(FI) &&
+ InstructionStoresToFI(MI, FI))
+ StoredFIs.insert(FI);
+ HasNonInvariantUse = true;
+ continue;
+ }
+
+ // We can't hoist an instruction defining a physreg that is clobbered in
+ // the loop.
+ if (MO.isRegMask()) {
+ PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+ continue;
+ }
+
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ assert(Reg.isPhysical() && "Not expecting virtual register!");
+
+ if (!MO.isDef()) {
+ if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
+ // If it's using a non-loop-invariant register, then it's obviously not
+ // safe to hoist.
+ HasNonInvariantUse = true;
+ continue;
+ }
+
+ if (MO.isImplicit()) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ PhysRegClobbers.set(*AI);
+ if (!MO.isDead())
+ // Non-dead implicit def? This cannot be hoisted.
+ RuledOut = true;
+ // No need to check if a dead implicit def is also defined by
+ // another instruction.
+ continue;
+ }
+
+ // FIXME: For now, avoid instructions with multiple defs, unless
+ // it's a dead implicit def.
+ if (Def)
+ RuledOut = true;
+ else
+ Def = Reg;
+
+ // If we have already seen another instruction that defines the same
+ // register, then this is not safe. Two defs are indicated by setting a
+ // PhysRegClobbers bit.
+ for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
+ if (PhysRegDefs.test(*AS))
+ PhysRegClobbers.set(*AS);
+ }
+ // Need a second loop because MCRegAliasIterator can visit the same
+ // register twice.
+ for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
+ PhysRegDefs.set(*AS);
+
+ if (PhysRegClobbers.test(Reg))
+ // The register defined by MI is also defined by another instruction in
+ // the loop, so MI cannot be a LICM candidate.
+ RuledOut = true;
+ }
+
+ // Only consider reloads for now, plus remats which do not have register
+ // operands. FIXME: Consider unfolding load-folding instructions.
+ if (Def && !RuledOut) {
+ int FI = std::numeric_limits<int>::min();
+ if ((!HasNonInvariantUse && IsLICMCandidate(*MI)) ||
+ (TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI)))
+ Candidates.push_back(CandidateInfo(MI, Def, FI));
+ }
+}
+
+/// Walk the specified region of the CFG and hoist loop invariants out to the
+/// preheader.
+void MachineLICMBase::HoistRegionPostRA() {
+ MachineBasicBlock *Preheader = getCurPreheader();
+ if (!Preheader)
+ return;
+
+ unsigned NumRegs = TRI->getNumRegs();
+ BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
+ BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+
+ SmallVector<CandidateInfo, 32> Candidates;
+ SmallSet<int, 32> StoredFIs;
+
+ // Walk the entire region, count number of defs for each register, and
+ // collect potential LICM candidates.
+ for (MachineBasicBlock *BB : CurLoop->getBlocks()) {
+ // If the header of the loop containing this basic block is a landing pad,
+ // then don't try to hoist instructions out of this loop.
+ const MachineLoop *ML = MLI->getLoopFor(BB);
+ if (ML && ML->getHeader()->isEHPad()) continue;
+
+ // Conservatively treat live-ins as external defs.
+ // FIXME: That means a reload that is reused in successor block(s) will not
+ // be LICM'ed.
+ for (const auto &LI : BB->liveins()) {
+ for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
+ PhysRegDefs.set(*AI);
+ }
+
+ // Funclet entry blocks will clobber all registers
+ if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
+ PhysRegClobbers.setBitsNotInMask(Mask);
+
+ SpeculationState = SpeculateUnknown;
+ for (MachineInstr &MI : *BB)
+ ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates);
+ }
+
+ // Gather the registers read / clobbered by the terminator.
+ BitVector TermRegs(NumRegs);
+ MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
+ if (TI != Preheader->end()) {
+ for (const MachineOperand &MO : TI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ TermRegs.set(*AI);
+ }
+ }
+
+ // Now evaluate whether the potential candidates qualify.
+ // 1. Check if the candidate defined register is defined by another
+ // instruction in the loop.
+ // 2. If the candidate is a load from stack slot (always true for now),
+ // check if the slot is stored anywhere in the loop.
+ // 3. Make sure the candidate's def does not clobber registers read by the
+ //    terminator, and that the def itself is not clobbered by the terminator.
+ for (CandidateInfo &Candidate : Candidates) {
+ if (Candidate.FI != std::numeric_limits<int>::min() &&
+ StoredFIs.count(Candidate.FI))
+ continue;
+
+ unsigned Def = Candidate.Def;
+ if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
+ bool Safe = true;
+ MachineInstr *MI = Candidate.MI;
+ for (const MachineOperand &MO : MI->all_uses()) {
+ if (!MO.getReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (PhysRegDefs.test(Reg) ||
+ PhysRegClobbers.test(Reg)) {
+ // If it's using a non-loop-invariant register, then it's obviously
+ // not safe to hoist.
+ Safe = false;
+ break;
+ }
+ }
+ if (Safe)
+ HoistPostRA(MI, Candidate.Def);
+ }
+ }
+}
+
+/// Add register 'Reg' to the livein sets of BBs in the current loop, and make
+/// sure it is not killed by any instructions in the loop.
+void MachineLICMBase::AddToLiveIns(MCRegister Reg) {
+ for (MachineBasicBlock *BB : CurLoop->getBlocks()) {
+ if (!BB->isLiveIn(Reg))
+ BB->addLiveIn(Reg);
+ for (MachineInstr &MI : *BB) {
+ for (MachineOperand &MO : MI.all_uses()) {
+ if (!MO.getReg())
+ continue;
+ if (MO.getReg() == Reg || TRI->isSuperRegister(Reg, MO.getReg()))
+ MO.setIsKill(false);
+ }
+ }
+ }
+}
+
+/// When an instruction that uses only loop-invariant operands is found to be
+/// safe to hoist, this function is called to do the dirty work.
+void MachineLICMBase::HoistPostRA(MachineInstr *MI, unsigned Def) {
+ MachineBasicBlock *Preheader = getCurPreheader();
+
+ // Now move the instructions to the predecessor, inserting it before any
+ // terminator instructions.
+ LLVM_DEBUG(dbgs() << "Hoisting to " << printMBBReference(*Preheader)
+ << " from " << printMBBReference(*MI->getParent()) << ": "
+ << *MI);
+
+ // Splice the instruction to the preheader.
+ MachineBasicBlock *MBB = MI->getParent();
+ Preheader->splice(Preheader->getFirstTerminator(), MBB, MI);
+
+ // Since we are moving the instruction out of its basic block, we do not
+ // retain its debug location. Doing so would degrade the debugging
+ // experience and adversely affect the accuracy of profiling information.
+ assert(!MI->isDebugInstr() && "Should not hoist debug inst");
+ MI->setDebugLoc(DebugLoc());
+
+ // Add the register to the livein list of all the BBs in the current loop,
+ // since a loop invariant must be kept live throughout the whole loop. This is
+ // important to ensure later passes do not scavenge the def register.
+ AddToLiveIns(Def);
+
+ ++NumPostRAHoisted;
+ Changed = true;
+}
+
+/// Check if this mbb is guaranteed to execute. If not then a load from this mbb
+/// may not be safe to hoist.
+bool MachineLICMBase::IsGuaranteedToExecute(MachineBasicBlock *BB) {
+ if (SpeculationState != SpeculateUnknown)
+ return SpeculationState == SpeculateFalse;
+
+ if (BB != CurLoop->getHeader()) {
+ // Check loop exiting blocks.
+ SmallVector<MachineBasicBlock*, 8> CurrentLoopExitingBlocks;
+ CurLoop->getExitingBlocks(CurrentLoopExitingBlocks);
+ for (MachineBasicBlock *CurrentLoopExitingBlock : CurrentLoopExitingBlocks)
+ if (!DT->dominates(BB, CurrentLoopExitingBlock)) {
+ SpeculationState = SpeculateTrue;
+ return false;
+ }
+ }
+
+ SpeculationState = SpeculateFalse;
+ return true;
+}
+
+/// Check if \p MI is trivially rematerializable and has no virtual register
+/// uses. Even if an instruction is rematerializable, RA might not actually
+/// rematerialize it in this scenario; in that case we do not want to hoist
+/// such an instruction out of the loop in the belief that RA will sink it
+/// back if needed.
+bool MachineLICMBase::isTriviallyReMaterializable(
+ const MachineInstr &MI) const {
+ if (!TII->isTriviallyReMaterializable(MI))
+ return false;
+
+ for (const MachineOperand &MO : MI.all_uses()) {
+ if (MO.getReg().isVirtual())
+ return false;
+ }
+
+ return true;
+}
+
+void MachineLICMBase::EnterScope(MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "Entering " << printMBBReference(*MBB) << '\n');
+
+ // Remember livein register pressure.
+ BackTrace.push_back(RegPressure);
+}
+
+void MachineLICMBase::ExitScope(MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "Exiting " << printMBBReference(*MBB) << '\n');
+ BackTrace.pop_back();
+}
+
+/// Destroy the scope for the MBB that corresponds to the given dominator tree
+/// node if it is a leaf or all of its children are done. Walk up the dominator
+/// tree to destroy ancestors which are now done.
+void MachineLICMBase::ExitScopeIfDone(MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
+ const DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) {
+ if (OpenChildren[Node])
+ return;
+
+ for(;;) {
+ ExitScope(Node->getBlock());
+ // Now traverse upwards to pop ancestors whose children are all done.
+ MachineDomTreeNode *Parent = ParentMap.lookup(Node);
+ if (!Parent || --OpenChildren[Parent] != 0)
+ break;
+ Node = Parent;
+ }
+}
+
+/// Walk the specified loop in the CFG (defined by all blocks dominated by the
+/// specified header block, and that are in the current loop) in depth-first
+/// order w.r.t. the DominatorTree. This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
+void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
+ MachineBasicBlock *Preheader = getCurPreheader();
+ if (!Preheader)
+ return;
+
+ SmallVector<MachineDomTreeNode*, 32> Scopes;
+ SmallVector<MachineDomTreeNode*, 8> WorkList;
+ DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap;
+ DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;
+
+ // Perform a DFS walk to determine the order of visit.
+ WorkList.push_back(HeaderN);
+ while (!WorkList.empty()) {
+ MachineDomTreeNode *Node = WorkList.pop_back_val();
+ assert(Node && "Null dominator tree node?");
+ MachineBasicBlock *BB = Node->getBlock();
+
+ // If the header of the loop containing this basic block is a landing pad,
+ // then don't try to hoist instructions out of this loop.
+ const MachineLoop *ML = MLI->getLoopFor(BB);
+ if (ML && ML->getHeader()->isEHPad())
+ continue;
+
+ // If this subregion is not in the top level loop at all, exit.
+ if (!CurLoop->contains(BB))
+ continue;
+
+ Scopes.push_back(Node);
+ unsigned NumChildren = Node->getNumChildren();
+
+ // Don't hoist things out of a large switch statement. This often causes
+ // code to be hoisted that wasn't going to be executed, and increases
+ // register pressure in a situation where it's likely to matter.
+ if (BB->succ_size() >= 25)
+ NumChildren = 0;
+
+ OpenChildren[Node] = NumChildren;
+ if (NumChildren) {
+ // Add children in reverse order as then the next popped worklist node is
+ // the first child of this node. This means we ultimately traverse the
+ // DOM tree in exactly the same order as if we'd recursed.
+ for (MachineDomTreeNode *Child : reverse(Node->children())) {
+ ParentMap[Child] = Node;
+ WorkList.push_back(Child);
+ }
+ }
+ }
+
+ if (Scopes.size() == 0)
+ return;
+
+ // Compute registers which are livein into the loop headers.
+ RegSeen.clear();
+ BackTrace.clear();
+ InitRegPressure(Preheader);
+
+ // Now perform LICM.
+ for (MachineDomTreeNode *Node : Scopes) {
+ MachineBasicBlock *MBB = Node->getBlock();
+
+ EnterScope(MBB);
+
+ // Process the block
+ SpeculationState = SpeculateUnknown;
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
+ if (!Hoist(&MI, Preheader))
+ UpdateRegPressure(&MI);
+ // If we have hoisted an instruction that may store, it can only be a
+ // constant store.
+ }
+
+ // If it's a leaf node, it's done. Traverse upwards to pop ancestors.
+ ExitScopeIfDone(Node, OpenChildren, ParentMap);
+ }
+}
+
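+/// Treat an operand as a kill either when it is explicitly marked as killed or
+/// when it is the only non-debug use of its register.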
+static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) {
+ return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg());
+}
+
+/// Find all virtual register references that are live out of the preheader to
+/// initialize the starting "register pressure". Note this does not count
+/// live-through (live-in but not used) registers.
+void MachineLICMBase::InitRegPressure(MachineBasicBlock *BB) {
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+
+ // If the preheader has only a single predecessor and it ends with a
+ // fallthrough or an unconditional branch, then scan its predecessor for live
+ // defs as well. This happens whenever the preheader is created by splitting
+ // the critical edge from the loop predecessor to the loop header.
+ if (BB->pred_size() == 1) {
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->analyzeBranch(*BB, TBB, FBB, Cond, false) && Cond.empty())
+ InitRegPressure(*BB->pred_begin());
+ }
+
+ for (const MachineInstr &MI : *BB)
+ UpdateRegPressure(&MI, /*ConsiderUnseenAsDef=*/true);
+}
+
+/// Update estimate of register pressure after the specified instruction.
+void MachineLICMBase::UpdateRegPressure(const MachineInstr *MI,
+ bool ConsiderUnseenAsDef) {
+ auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef);
+ for (const auto &RPIdAndCost : Cost) {
+ unsigned Class = RPIdAndCost.first;
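+ // A negative cost (the last use of a register was killed) can only lower the
+ // tracked pressure to zero, never below it.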
+ if (static_cast<int>(RegPressure[Class]) < -RPIdAndCost.second)
+ RegPressure[Class] = 0;
+ else
+ RegPressure[Class] += RPIdAndCost.second;
+ }
+}
+
+/// Calculate the additional register pressure that the registers used in MI
+/// cause.
+///
+/// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to
+/// figure out which usages are live-ins.
+/// FIXME: Figure out a way to consider 'RegSeen' from all code paths.
+DenseMap<unsigned, int>
+MachineLICMBase::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen,
+ bool ConsiderUnseenAsDef) {
+ DenseMap<unsigned, int> Cost;
+ if (MI->isImplicitDef())
+ return Cost;
+ for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.isImplicit())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+
+ // FIXME: It seems bad to use RegSeen only for some of these calculations.
+ bool isNew = ConsiderSeen ? RegSeen.insert(Reg).second : false;
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+ RegClassWeight W = TRI->getRegClassWeight(RC);
+ int RCCost = 0;
+ if (MO.isDef())
+ RCCost = W.RegWeight;
+ else {
+ bool isKill = isOperandKill(MO, MRI);
+ if (isNew && !isKill && ConsiderUnseenAsDef)
+ // Haven't seen this, it must be a livein.
+ RCCost = W.RegWeight;
+ else if (!isNew && isKill)
+ RCCost = -W.RegWeight;
+ }
+ if (RCCost == 0)
+ continue;
+ const int *PS = TRI->getRegClassPressureSets(RC);
+ for (; *PS != -1; ++PS) {
+ if (!Cost.contains(*PS))
+ Cost[*PS] = RCCost;
+ else
+ Cost[*PS] += RCCost;
+ }
+ }
+ return Cost;
+}
+
+/// Return true if this machine instruction loads from the global offset table
+/// or a constant pool.
+static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) {
+ assert(MI.mayLoad() && "Expected MI that loads!");
+
+ // If we lost the memory operands, conservatively assume that the instruction
+ // reads from everything.
+ if (MI.memoperands_empty())
+ return true;
+
+ for (MachineMemOperand *MemOp : MI.memoperands())
+ if (const PseudoSourceValue *PSV = MemOp->getPseudoValue())
+ if (PSV->isGOT() || PSV->isConstantPool())
+ return true;
+
+ return false;
+}
+
+// This function iterates through all the operands of the input store MI and
+// checks that each register operand satisfies isCallerPreservedPhysReg.
+// This means that the value being stored and the address at which it is being
+// stored are constant throughout the body of the function (not including the
+// prologue and epilogue). When called with an MI that isn't a store, it
+// returns false. A future improvement would be to check that the store
+// registers are constant throughout the loop rather than the whole function.
+static bool isInvariantStore(const MachineInstr &MI,
+ const TargetRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI) {
+
+ bool FoundCallerPresReg = false;
+ if (!MI.mayStore() || MI.hasUnmodeledSideEffects() ||
+ (MI.getNumOperands() == 0))
+ return false;
+
+ // Check that all register operands are caller-preserved physical registers.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ // If operand is a virtual register, check if it comes from a copy of a
+ // physical register.
+ if (Reg.isVirtual())
+ Reg = TRI->lookThruCopyLike(MO.getReg(), MRI);
+ if (Reg.isVirtual())
+ return false;
+ if (!TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *MI.getMF()))
+ return false;
+ else
+ FoundCallerPresReg = true;
+ } else if (!MO.isImm()) {
+ return false;
+ }
+ }
+ return FoundCallerPresReg;
+}
+
+// Return true if the input MI is a copy instruction that feeds an invariant
+// store instruction. This means that the source of the copy has to satisfy
+// isCallerPreservedPhysReg and at least one of its users has to satisfy
+// isInvariantStore.
+static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI) {
+
+ // FIXME: If targets would like to look through instructions that aren't
+ // pure copies, this can be updated to a query.
+ if (!MI.isCopy())
+ return false;
+
+ const MachineFunction *MF = MI.getMF();
+ // Check that we are copying a constant physical register.
+ Register CopySrcReg = MI.getOperand(1).getReg();
+ if (CopySrcReg.isVirtual())
+ return false;
+
+ if (!TRI->isCallerPreservedPhysReg(CopySrcReg.asMCReg(), *MF))
+ return false;
+
+ Register CopyDstReg = MI.getOperand(0).getReg();
+ // Check if any of the uses of the copy are invariant stores.
+ assert(CopyDstReg.isVirtual() && "copy dst is not a virtual reg");
+
+ for (MachineInstr &UseMI : MRI->use_instructions(CopyDstReg)) {
+ if (UseMI.mayStore() && isInvariantStore(UseMI, TRI, MRI))
+ return true;
+ }
+ return false;
+}
+
+/// Returns true if the instruction may be a suitable candidate for LICM.
+/// e.g. If the instruction is a call, then it's obviously not safe to hoist it.
+bool MachineLICMBase::IsLICMCandidate(MachineInstr &I) {
+ // Check if it's safe to move the instruction.
+ bool DontMoveAcrossStore = true;
+ if ((!I.isSafeToMove(AA, DontMoveAcrossStore)) &&
+ !(HoistConstStores && isInvariantStore(I, TRI, MRI))) {
+ LLVM_DEBUG(dbgs() << "LICM: Instruction not safe to move.\n");
+ return false;
+ }
+
+ // If it is a load then check if it is guaranteed to execute by making sure
+ // that it dominates all exiting blocks. If it doesn't, then there is a path
+ // out of the loop which does not execute this load, so we can't hoist it.
+ // Loads from constant memory are safe to speculate, for example indexed load
+ // from a jump table.
+ // Stores and side effects are already checked by isSafeToMove.
+ if (I.mayLoad() && !mayLoadFromGOTOrConstantPool(I) &&
+ !IsGuaranteedToExecute(I.getParent())) {
+ LLVM_DEBUG(dbgs() << "LICM: Load not guaranteed to execute.\n");
+ return false;
+ }
+
+ // The convergent attribute is used on operations that involve inter-thread
+ // communication and whose results are implicitly affected by the enclosing
+ // control flow. It is not safe to hoist or sink such operations across
+ // control flow.
+ if (I.isConvergent())
+ return false;
+
+ if (!TII->shouldHoist(I, CurLoop))
+ return false;
+
+ return true;
+}
+
+/// Returns true if the instruction is loop invariant.
+bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I) {
+ if (!IsLICMCandidate(I)) {
+ LLVM_DEBUG(dbgs() << "LICM: Instruction not a LICM candidate\n");
+ return false;
+ }
+ return CurLoop->isLoopInvariant(I);
+}
+
+/// Return true if the specified instruction is used by a phi node and hoisting
+/// it could cause a copy to be inserted.
+bool MachineLICMBase::HasLoopPHIUse(const MachineInstr *MI) const {
+ SmallVector<const MachineInstr*, 8> Work(1, MI);
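+ // Worklist search: follow in-loop copies of MI's defs and look for PHI users
+ // either inside the loop or in one of its exit blocks.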
+ do {
+ MI = Work.pop_back_val();
+ for (const MachineOperand &MO : MI->all_defs()) {
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+ for (MachineInstr &UseMI : MRI->use_instructions(Reg)) {
+ // A PHI may cause a copy to be inserted.
+ if (UseMI.isPHI()) {
+ // A PHI inside the loop causes a copy because the live range of Reg is
+ // extended across the PHI.
+ if (CurLoop->contains(&UseMI))
+ return true;
+ // A PHI in an exit block can cause a copy to be inserted if the PHI
+ // has multiple predecessors in the loop with different values.
+ // For now, approximate by rejecting all exit blocks.
+ if (isExitBlock(UseMI.getParent()))
+ return true;
+ continue;
+ }
+ // Look past copies as well.
+ if (UseMI.isCopy() && CurLoop->contains(&UseMI))
+ Work.push_back(&UseMI);
+ }
+ }
+ } while (!Work.empty());
+ return false;
+}
+
+/// Compute the operand latency between a def of 'Reg' and a use in the current
+/// loop; return true if the target considers it high.
+bool MachineLICMBase::HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx,
+ Register Reg) const {
+ if (MRI->use_nodbg_empty(Reg))
+ return false;
+
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
+ if (UseMI.isCopyLike())
+ continue;
+ if (!CurLoop->contains(UseMI.getParent()))
+ continue;
+ for (unsigned i = 0, e = UseMI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = UseMI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ Register MOReg = MO.getReg();
+ if (MOReg != Reg)
+ continue;
+
+ if (TII->hasHighOperandLatency(SchedModel, MRI, MI, DefIdx, UseMI, i))
+ return true;
+ }
+
+ // Only look at the first in-loop use.
+ break;
+ }
+
+ return false;
+}
+
+/// Return true if the instruction is marked "cheap" or the operand latency
+/// between its def and a use is one or less.
+bool MachineLICMBase::IsCheapInstruction(MachineInstr &MI) const {
+ if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike())
+ return true;
+
+ bool isCheap = false;
+ unsigned NumDefs = MI.getDesc().getNumDefs();
+ for (unsigned i = 0, e = MI.getNumOperands(); NumDefs && i != e; ++i) {
+ MachineOperand &DefMO = MI.getOperand(i);
+ if (!DefMO.isReg() || !DefMO.isDef())
+ continue;
+ --NumDefs;
+ Register Reg = DefMO.getReg();
+ if (Reg.isPhysical())
+ continue;
+
+ if (!TII->hasLowDefLatency(SchedModel, MI, i))
+ return false;
+ isCheap = true;
+ }
+
+ return isCheap;
+}
+
+/// Visit BBs from the loop header to the current BB and check whether hoisting
+/// an instruction with the given register-pressure cost can cause high
+/// register pressure.
+bool
+MachineLICMBase::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost,
+ bool CheapInstr) {
+ for (const auto &RPIdAndCost : Cost) {
+ if (RPIdAndCost.second <= 0)
+ continue;
+
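+ // Class here is a register pressure set id; Limit is the pressure limit
+ // recorded for that set when the pass was initialized.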
+ unsigned Class = RPIdAndCost.first;
+ int Limit = RegLimit[Class];
+
+ // Don't hoist cheap instructions if they would increase register pressure,
+ // even if we're under the limit.
+ if (CheapInstr && !HoistCheapInsts)
+ return true;
+
+ for (const auto &RP : BackTrace)
+ if (static_cast<int>(RP[Class]) + RPIdAndCost.second >= Limit)
+ return true;
+ }
+
+ return false;
+}
+
+/// Traverse the back trace from header to the current block and update their
+/// register pressures to reflect the effect of hoisting MI from the current
+/// block to the preheader.
+void MachineLICMBase::UpdateBackTraceRegPressure(const MachineInstr *MI) {
+ // First compute the 'cost' of the instruction, i.e. its contribution
+ // to register pressure.
+ auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/false,
+ /*ConsiderUnseenAsDef=*/false);
+
+ // Update register pressure of blocks from loop header to current block.
+ for (auto &RP : BackTrace)
+ for (const auto &RPIdAndCost : Cost)
+ RP[RPIdAndCost.first] += RPIdAndCost.second;
+}
+
+/// Return true if it is potentially profitable to hoist the given loop
+/// invariant.
+bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) {
+ if (MI.isImplicitDef())
+ return true;
+
+ // Besides removing computation from the loop, hoisting an instruction has
+ // these effects:
+ //
+ // - The value defined by the instruction becomes live across the entire
+ // loop. This increases register pressure in the loop.
+ //
+ // - If the value is used by a PHI in the loop, a copy will be required for
+ // lowering the PHI after extending the live range.
+ //
+ // - When hoisting the last use of a value in the loop, that value no longer
+ // needs to be live in the loop. This lowers register pressure in the loop.
+
+ if (HoistConstStores && isCopyFeedingInvariantStore(MI, MRI, TRI))
+ return true;
+
+ bool CheapInstr = IsCheapInstruction(MI);
+ bool CreatesCopy = HasLoopPHIUse(&MI);
+
+ // Don't hoist a cheap instruction if it would create a copy in the loop.
+ if (CheapInstr && CreatesCopy) {
+ LLVM_DEBUG(dbgs() << "Won't hoist cheap instr with loop PHI use: " << MI);
+ return false;
+ }
+
+ // Rematerializable instructions should always be hoisted, provided that the
+ // register allocator can just pull them down again when needed.
+ if (isTriviallyReMaterializable(MI))
+ return true;
+
+ // FIXME: If there are long latency loop-invariant instructions inside the
+ // loop at this point, why didn't the optimizer's LICM hoist them?
+ for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isImplicit())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+ if (MO.isDef() && HasHighOperandLatency(MI, i, Reg)) {
+ LLVM_DEBUG(dbgs() << "Hoist High Latency: " << MI);
+ ++NumHighLatency;
+ return true;
+ }
+ }
+
+ // Estimate register pressure to determine whether to LICM the instruction.
+ // In low register pressure situations, we can be more aggressive about
+ // hoisting. Also, favor hoisting long-latency instructions even in
+ // moderately high register pressure situations.
+ // Cheap instructions will only be hoisted if they don't increase register
+ // pressure at all.
+ auto Cost = calcRegisterCost(&MI, /*ConsiderSeen=*/false,
+ /*ConsiderUnseenAsDef=*/false);
+
+ // Visit BBs from header to current BB, if hoisting this doesn't cause
+ // high register pressure, then it's safe to proceed.
+ if (!CanCauseHighRegPressure(Cost, CheapInstr)) {
+ LLVM_DEBUG(dbgs() << "Hoist non-reg-pressure: " << MI);
+ ++NumLowRP;
+ return true;
+ }
+
+ // Don't risk increasing register pressure if it would create copies.
+ if (CreatesCopy) {
+ LLVM_DEBUG(dbgs() << "Won't hoist instr with loop PHI use: " << MI);
+ return false;
+ }
+
+ // Do not "speculate" in high register pressure situation. If an
+ // instruction is not guaranteed to be executed in the loop, it's best to be
+ // conservative.
+ if (AvoidSpeculation &&
+ (!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI))) {
+ LLVM_DEBUG(dbgs() << "Won't speculate: " << MI);
+ return false;
+ }
+
+ // In a high register pressure situation, only hoist if the instruction is
+ // going to be rematerialized.
+ if (!isTriviallyReMaterializable(MI) &&
+ !MI.isDereferenceableInvariantLoad()) {
+ LLVM_DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
+ return false;
+ }
+
+ return true;
+}
+
+/// Unfold a load from the given MachineInstr if the load itself could be
+/// hoisted. Return the unfolded and hoistable load, or null if the load
+/// couldn't be unfolded or if it wouldn't be hoistable.
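+/// For example (illustrative, target-specific): on X86 an ADD32rm whose memory
+/// operand is loop invariant may be unfolded into a MOV32rm load, which can be
+/// hoisted, plus an ADD32rr that remains in the loop.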
+MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI) {
+ // Don't unfold simple loads.
+ if (MI->canFoldAsLoad())
+ return nullptr;
+
+ // If not, we may be able to unfold a load and hoist that.
+ // First test whether the instruction is loading from an amenable
+ // memory location.
+ if (!MI->isDereferenceableInvariantLoad())
+ return nullptr;
+
+ // Next determine the register class for a temporary register.
+ unsigned LoadRegIndex;
+ unsigned NewOpc =
+ TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(),
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false,
+ &LoadRegIndex);
+ if (NewOpc == 0) return nullptr;
+ const MCInstrDesc &MID = TII->get(NewOpc);
+ MachineFunction &MF = *MI->getMF();
+ const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);
+ // Ok, we're unfolding. Create a temporary register and do the unfold.
+ Register Reg = MRI->createVirtualRegister(RC);
+
+ SmallVector<MachineInstr *, 2> NewMIs;
+ bool Success = TII->unfoldMemoryOperand(MF, *MI, Reg,
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false, NewMIs);
+ (void)Success;
+ assert(Success &&
+ "unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold "
+ "succeeded!");
+ assert(NewMIs.size() == 2 &&
+ "Unfolded a load into multiple instructions!");
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::iterator Pos = MI;
+ MBB->insert(Pos, NewMIs[0]);
+ MBB->insert(Pos, NewMIs[1]);
+ // If unfolding produced a load that wasn't loop-invariant or profitable to
+ // hoist, discard the new instructions and bail.
+ if (!IsLoopInvariantInst(*NewMIs[0]) || !IsProfitableToHoist(*NewMIs[0])) {
+ NewMIs[0]->eraseFromParent();
+ NewMIs[1]->eraseFromParent();
+ return nullptr;
+ }
+
+ // Update register pressure for the unfolded instruction.
+ UpdateRegPressure(NewMIs[1]);
+
+ // Otherwise we successfully unfolded a load that we can hoist.
+
+ // Update the call site info.
+ if (MI->shouldUpdateCallSiteInfo())
+ MF.eraseCallSiteInfo(MI);
+
+ MI->eraseFromParent();
+ return NewMIs[0];
+}
+
+/// Initialize the CSE map with instructions that are in the current loop
+/// preheader that may become duplicates of instructions that are hoisted
+/// out of the loop.
+void MachineLICMBase::InitCSEMap(MachineBasicBlock *BB) {
+ for (MachineInstr &MI : *BB)
+ CSEMap[MI.getOpcode()].push_back(&MI);
+}
+
+/// Find an instruction among PrevMIs that is a duplicate of MI.
+/// Return that instruction if it is found.
+MachineInstr *
+MachineLICMBase::LookForDuplicate(const MachineInstr *MI,
+ std::vector<MachineInstr *> &PrevMIs) {
+ for (MachineInstr *PrevMI : PrevMIs)
+ if (TII->produceSameValue(*MI, *PrevMI, (PreRegAlloc ? MRI : nullptr)))
+ return PrevMI;
+
+ return nullptr;
+}
+
+/// Given a LICM'ed instruction, look for an instruction in the preheader that
+/// computes the same value. If one is found, replace all uses of the
+/// instruction's definitions with the corresponding definitions of the
+/// existing instruction rather than hoisting the instruction to the
+/// preheader.
+bool MachineLICMBase::EliminateCSE(
+ MachineInstr *MI,
+ DenseMap<unsigned, std::vector<MachineInstr *>>::iterator &CI) {
+ // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
+ // the undef property onto uses.
+ if (CI == CSEMap.end() || MI->isImplicitDef())
+ return false;
+
+ if (MachineInstr *Dup = LookForDuplicate(MI, CI->second)) {
+ LLVM_DEBUG(dbgs() << "CSEing " << *MI << " with " << *Dup);
+
+ // Replace virtual registers defined by MI by their counterparts defined
+ // by Dup.
+ SmallVector<unsigned, 2> Defs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ // Physical registers may not differ here.
+ assert((!MO.isReg() || MO.getReg() == 0 || !MO.getReg().isPhysical() ||
+ MO.getReg() == Dup->getOperand(i).getReg()) &&
+ "Instructions with different phys regs are not identical!");
+
+ if (MO.isReg() && MO.isDef() && !MO.getReg().isPhysical())
+ Defs.push_back(i);
+ }
+
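+ // Try to constrain each register defined by Dup to the register class of
+ // the corresponding def of MI; if any constraint fails, restore the
+ // original classes and give up on CSE.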
+ SmallVector<const TargetRegisterClass*, 2> OrigRCs;
+ for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+ unsigned Idx = Defs[i];
+ Register Reg = MI->getOperand(Idx).getReg();
+ Register DupReg = Dup->getOperand(Idx).getReg();
+ OrigRCs.push_back(MRI->getRegClass(DupReg));
+
+ if (!MRI->constrainRegClass(DupReg, MRI->getRegClass(Reg))) {
+ // Restore the old register classes if there is more than one def.
+ for (unsigned j = 0; j != i; ++j)
+ MRI->setRegClass(Dup->getOperand(Defs[j]).getReg(), OrigRCs[j]);
+ return false;
+ }
+ }
+
+ for (unsigned Idx : Defs) {
+ Register Reg = MI->getOperand(Idx).getReg();
+ Register DupReg = Dup->getOperand(Idx).getReg();
+ MRI->replaceRegWith(Reg, DupReg);
+ MRI->clearKillFlags(DupReg);
+ // Clear Dup's dead flag, if any, since its def is now reused for Reg.
+ if (!MRI->use_nodbg_empty(DupReg))
+ Dup->getOperand(Idx).setIsDead(false);
+ }
+
+ MI->eraseFromParent();
+ ++NumCSEed;
+ return true;
+ }
+ return false;
+}
+
+/// Return true if the given instruction will be CSE'd if it's hoisted out of
+/// the loop.
+bool MachineLICMBase::MayCSE(MachineInstr *MI) {
+ unsigned Opcode = MI->getOpcode();
+ DenseMap<unsigned, std::vector<MachineInstr *>>::iterator CI =
+ CSEMap.find(Opcode);
+ // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
+ // the undef property onto uses.
+ if (CI == CSEMap.end() || MI->isImplicitDef())
+ return false;
+
+ return LookForDuplicate(MI, CI->second) != nullptr;
+}
+
+/// When an instruction is found to use only loop invariant operands
+/// that are safe to hoist, this function is called to do the dirty work.
+/// It returns true if the instruction is hoisted.
+bool MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
+ MachineBasicBlock *SrcBlock = MI->getParent();
+
+ // Do not hoist the instruction if the target (preheader) block is hotter
+ // than the source block.
+ if ((DisableHoistingToHotterBlocks == UseBFI::All ||
+ (DisableHoistingToHotterBlocks == UseBFI::PGO && HasProfileData)) &&
+ isTgtHotterThanSrc(SrcBlock, Preheader)) {
+ ++NumNotHoistedDueToHotness;
+ return false;
+ }
+ // First check whether we should hoist this instruction.
+ if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) {
+ // If not, try unfolding a hoistable load.
+ MI = ExtractHoistableLoad(MI);
+ if (!MI) return false;
+ }
+
+ // If we are hoisting an instruction that may store, it can only be a
+ // constant store.
+ if (MI->mayStore())
+ NumStoreConst++;
+
+ // Now move the instruction to the preheader, inserting it before any
+ // terminator instructions.
+ LLVM_DEBUG({
+ dbgs() << "Hoisting " << *MI;
+ if (MI->getParent()->getBasicBlock())
+ dbgs() << " from " << printMBBReference(*MI->getParent());
+ if (Preheader->getBasicBlock())
+ dbgs() << " to " << printMBBReference(*Preheader);
+ dbgs() << "\n";
+ });
+
+ // If this is the first instruction being hoisted to the preheader,
+ // initialize the CSE map with potential common expressions.
+ if (FirstInLoop) {
+ InitCSEMap(Preheader);
+ FirstInLoop = false;
+ }
+
+ // Look for opportunity to CSE the hoisted instruction.
+ unsigned Opcode = MI->getOpcode();
+ DenseMap<unsigned, std::vector<MachineInstr *>>::iterator CI =
+ CSEMap.find(Opcode);
+ if (!EliminateCSE(MI, CI)) {
+ // Otherwise, splice the instruction to the preheader.
+ Preheader->splice(Preheader->getFirstTerminator(), MI->getParent(), MI);
+
+ // Since we are moving the instruction out of its basic block, we do not
+ // retain its debug location. Doing so would degrade the debugging
+ // experience and adversely affect the accuracy of profiling information.
+ assert(!MI->isDebugInstr() && "Should not hoist debug inst");
+ MI->setDebugLoc(DebugLoc());
+
+ // Update register pressure for BBs from header to this block.
+ UpdateBackTraceRegPressure(MI);
+
+ // Clear the kill flags of any register this instruction defines,
+ // since they may need to be live throughout the entire loop
+ // rather than just live for part of it.
+ for (MachineOperand &MO : MI->all_defs())
+ if (!MO.isDead())
+ MRI->clearKillFlags(MO.getReg());
+
+ // Add to the CSE map.
+ if (CI != CSEMap.end())
+ CI->second.push_back(MI);
+ else
+ CSEMap[Opcode].push_back(MI);
+ }
+
+ ++NumHoisted;
+ Changed = true;
+
+ return true;
+}
+
+/// Get the preheader for the current loop, splitting a critical edge if needed.
+MachineBasicBlock *MachineLICMBase::getCurPreheader() {
+ // Determine the block to which to hoist instructions. If we can't find a
+ // suitable loop predecessor, we can't do any hoisting.
+
+ // If we've tried to get a preheader and failed, don't try again.
+ if (CurPreheader == reinterpret_cast<MachineBasicBlock *>(-1))
+ return nullptr;
+
+ if (!CurPreheader) {
+ CurPreheader = CurLoop->getLoopPreheader();
+ if (!CurPreheader) {
+ MachineBasicBlock *Pred = CurLoop->getLoopPredecessor();
+ if (!Pred) {
+ CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
+ return nullptr;
+ }
+
+ CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), *this);
+ if (!CurPreheader) {
+ CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
+ return nullptr;
+ }
+ }
+ }
+ return CurPreheader;
+}
+
+/// Return true if the target basic block is at least
+/// "BlockFrequencyRatioThreshold" times hotter than the source basic block.
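+/// For example, if the threshold is 100, a preheader whose block frequency is
+/// more than 100x that of the instruction's current block is considered too
+/// hot to hoist into.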
+bool MachineLICMBase::isTgtHotterThanSrc(MachineBasicBlock *SrcBlock,
+ MachineBasicBlock *TgtBlock) {
+ // Get the source and target basic block frequencies from MBFI.
+ uint64_t SrcBF = MBFI->getBlockFreq(SrcBlock).getFrequency();
+ uint64_t DstBF = MBFI->getBlockFreq(TgtBlock).getFrequency();
+
+ // Disable the hoisting if source block frequency is zero
+ if (!SrcBF)
+ return true;
+
+ double Ratio = (double)DstBF / SrcBF;
+
+ // Compare the block frequency ratio with the threshold
+ return Ratio > BlockFrequencyRatioThreshold;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
new file mode 100644
index 000000000000..c44b968b317d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
@@ -0,0 +1,249 @@
+//==--- MachineLateInstrsCleanup.cpp - Late Instructions Cleanup Pass -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This simple pass removes any identical and redundant immediate or address
+// loads to the same register. The immediate loads removed are typically the
+// result of rematerialization, while the redundant address loads are frame
+// addressing anchor points created during frame index elimination.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-latecleanup"
+
+STATISTIC(NumRemoved, "Number of redundant instructions removed.");
+
+namespace {
+
+class MachineLateInstrsCleanup : public MachineFunctionPass {
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+
+ // Data structures to map regs to their definitions and kills per MBB.
+ struct Reg2MIMap : public SmallDenseMap<Register, MachineInstr *> {
+ bool hasIdentical(Register Reg, MachineInstr *ArgMI) {
+ MachineInstr *MI = lookup(Reg);
+ return MI && MI->isIdenticalTo(*ArgMI);
+ }
+ };
+
+ std::vector<Reg2MIMap> RegDefs;
+ std::vector<Reg2MIMap> RegKills;
+
+ // Walk through the instructions in MBB and remove any redundant
+ // instructions.
+ bool processBlock(MachineBasicBlock *MBB);
+
+ void removeRedundantDef(MachineInstr *MI);
+ void clearKillsForDef(Register Reg, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ BitVector &VisitedPreds);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ MachineLateInstrsCleanup() : MachineFunctionPass(ID) {
+ initializeMachineLateInstrsCleanupPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+};
+
+} // end anonymous namespace
+
+char MachineLateInstrsCleanup::ID = 0;
+
+char &llvm::MachineLateInstrsCleanupID = MachineLateInstrsCleanup::ID;
+
+INITIALIZE_PASS(MachineLateInstrsCleanup, DEBUG_TYPE,
+ "Machine Late Instructions Cleanup Pass", false, false)
+
+bool MachineLateInstrsCleanup::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ RegDefs.clear();
+ RegDefs.resize(MF.getNumBlockIDs());
+ RegKills.clear();
+ RegKills.resize(MF.getNumBlockIDs());
+
+ // Visit all MBBs in an order that maximises the reuse from predecessors.
+ bool Changed = false;
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT)
+ Changed |= processBlock(MBB);
+
+ return Changed;
+}
+
+// Clear any previous kill flag on Reg found before I in MBB. Clear the kill
+// recorded for this block if there is one; otherwise, if Reg has no local
+// def, mark it live-in and continue the search in any not-yet-visited
+// predecessors.
+void MachineLateInstrsCleanup::
+clearKillsForDef(Register Reg, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ BitVector &VisitedPreds) {
+ VisitedPreds.set(MBB->getNumber());
+
+ // Kill flag in MBB
+ if (MachineInstr *KillMI = RegKills[MBB->getNumber()].lookup(Reg)) {
+ KillMI->clearRegisterKills(Reg, TRI);
+ return;
+ }
+
+ // Def in MBB (missing kill flag)
+ if (MachineInstr *DefMI = RegDefs[MBB->getNumber()].lookup(Reg))
+ if (DefMI->getParent() == MBB)
+ return;
+
+ // If an earlier def is not in MBB, continue in predecessors.
+ if (!MBB->isLiveIn(Reg))
+ MBB->addLiveIn(Reg);
+ assert(!MBB->pred_empty() && "Predecessor def not found!");
+ for (MachineBasicBlock *Pred : MBB->predecessors())
+ if (!VisitedPreds.test(Pred->getNumber()))
+ clearKillsForDef(Reg, Pred, Pred->end(), VisitedPreds);
+}
+
+void MachineLateInstrsCleanup::removeRedundantDef(MachineInstr *MI) {
+ Register Reg = MI->getOperand(0).getReg();
+ BitVector VisitedPreds(MI->getMF()->getNumBlockIDs());
+ clearKillsForDef(Reg, MI->getParent(), MI->getIterator(), VisitedPreds);
+ MI->eraseFromParent();
+ ++NumRemoved;
+}
+
+// Return true if MI is a potential candidate for reuse/removal and, if so,
+// also return the register it defines in DefedReg. A candidate is a simple
+// instruction that does not touch memory, has only one register definition
+// and the only reg it may use is FrameReg. Typically this is an immediate
+// load or a load-address instruction.
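+// Illustrative candidates (pseudo-MIR; opcode names are placeholders):
+//   $r1 = LOAD_IMMEDIATE 8              ; rematerialized immediate
+//   $r1 = LOAD_ADDRESS $framereg, 4096  ; frame-addressing anchor point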
+static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
+ Register FrameReg) {
+ DefedReg = MCRegister::NoRegister;
+ bool SawStore = true;
+ if (!MI->isSafeToMove(nullptr, SawStore) || MI->isImplicitDef() ||
+ MI->isInlineAsm())
+ return false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg()) {
+ if (MO.isDef()) {
+ if (i == 0 && !MO.isImplicit() && !MO.isDead())
+ DefedReg = MO.getReg();
+ else
+ return false;
+ } else if (MO.getReg() && MO.getReg() != FrameReg)
+ return false;
+ } else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() ||
+ MO.isGlobal() || MO.isSymbol()))
+ return false;
+ }
+ return DefedReg.isValid();
+}
+
+bool MachineLateInstrsCleanup::processBlock(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ Reg2MIMap &MBBDefs = RegDefs[MBB->getNumber()];
+ Reg2MIMap &MBBKills = RegKills[MBB->getNumber()];
+
+ // Find reusable definitions in the predecessor(s).
+ if (!MBB->pred_empty() && !MBB->isEHPad() &&
+ !MBB->isInlineAsmBrIndirectTarget()) {
+ MachineBasicBlock *FirstPred = *MBB->pred_begin();
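+ // A definition recorded for the first predecessor is reusable in MBB only
+ // if every other predecessor has recorded an identical definition of the
+ // same register.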
+ for (auto [Reg, DefMI] : RegDefs[FirstPred->getNumber()])
+ if (llvm::all_of(
+ drop_begin(MBB->predecessors()),
+ [&, &Reg = Reg, &DefMI = DefMI](const MachineBasicBlock *Pred) {
+ return RegDefs[Pred->getNumber()].hasIdentical(Reg, DefMI);
+ })) {
+ MBBDefs[Reg] = DefMI;
+ LLVM_DEBUG(dbgs() << "Reusable instruction from pred(s): in "
+ << printMBBReference(*MBB) << ": " << *DefMI;);
+ }
+ }
+
+ // Process MBB.
+ MachineFunction *MF = MBB->getParent();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ Register FrameReg = TRI->getFrameRegister(*MF);
+ for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
+ // If FrameReg is modified, no previous load-address instructions (using
+ // it) are valid.
+ if (MI.modifiesRegister(FrameReg, TRI)) {
+ MBBDefs.clear();
+ MBBKills.clear();
+ continue;
+ }
+
+ Register DefedReg;
+ bool IsCandidate = isCandidate(&MI, DefedReg, FrameReg);
+
+ // Check for an earlier identical and reusable instruction.
+ if (IsCandidate && MBBDefs.hasIdentical(DefedReg, &MI)) {
+ LLVM_DEBUG(dbgs() << "Removing redundant instruction in "
+ << printMBBReference(*MBB) << ": " << MI;);
+ removeRedundantDef(&MI);
+ Changed = true;
+ continue;
+ }
+
+ // Clear any entries in map that MI clobbers.
+ for (auto DefI : llvm::make_early_inc_range(MBBDefs)) {
+ Register Reg = DefI.first;
+ if (MI.modifiesRegister(Reg, TRI)) {
+ MBBDefs.erase(Reg);
+ MBBKills.erase(Reg);
+ } else if (MI.findRegisterUseOperandIdx(Reg, true /*isKill*/, TRI) != -1)
+ // Keep track of register kills.
+ MBBKills[Reg] = &MI;
+ }
+
+ // Record this MI for potential later reuse.
+ if (IsCandidate) {
+ LLVM_DEBUG(dbgs() << "Found interesting instruction in "
+ << printMBBReference(*MBB) << ": " << MI;);
+ MBBDefs[DefedReg] = &MI;
+ assert(!MBBKills.count(DefedReg) && "Should already have been removed.");
+ }
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineLoopInfo.cpp
new file mode 100644
index 000000000000..37a0ff3d71c8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -0,0 +1,214 @@
+//===- MachineLoopInfo.cpp - Natural Loop Calculator ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachineLoopInfo class that is used to identify natural
+// loops and determine the loop depth of various nodes of the CFG. Note that
+// the loops identified may actually be several natural loops that share the
+// same header node... not just a single natural loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/GenericLoopInfoImpl.h"
+
+using namespace llvm;
+
+// Explicitly instantiate methods in LoopInfoImpl.h for MI-level Loops.
+template class llvm::LoopBase<MachineBasicBlock, MachineLoop>;
+template class llvm::LoopInfoBase<MachineBasicBlock, MachineLoop>;
+
+char MachineLoopInfo::ID = 0;
+MachineLoopInfo::MachineLoopInfo() : MachineFunctionPass(ID) {
+ initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
+}
+INITIALIZE_PASS_BEGIN(MachineLoopInfo, "machine-loops",
+ "Machine Natural Loop Construction", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MachineLoopInfo, "machine-loops",
+ "Machine Natural Loop Construction", true, true)
+
+char &llvm::MachineLoopInfoID = MachineLoopInfo::ID;
+
+bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) {
+ calculate(getAnalysis<MachineDominatorTree>());
+ return false;
+}
+
+void MachineLoopInfo::calculate(MachineDominatorTree &MDT) {
+ releaseMemory();
+ LI.analyze(MDT.getBase());
+}
+
+void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
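+// Return the "top" block of the loop: starting from the header, walk
+// backwards over layout-adjacent blocks for as long as they are contained in
+// the loop.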
+MachineBasicBlock *MachineLoop::getTopBlock() {
+ MachineBasicBlock *TopMBB = getHeader();
+ MachineFunction::iterator Begin = TopMBB->getParent()->begin();
+ if (TopMBB->getIterator() != Begin) {
+ MachineBasicBlock *PriorMBB = &*std::prev(TopMBB->getIterator());
+ while (contains(PriorMBB)) {
+ TopMBB = PriorMBB;
+ if (TopMBB->getIterator() == Begin)
+ break;
+ PriorMBB = &*std::prev(TopMBB->getIterator());
+ }
+ }
+ return TopMBB;
+}
+
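+// Return the "bottom" block of the loop: starting from the header, walk
+// forwards over layout-adjacent blocks for as long as they are contained in
+// the loop.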
+MachineBasicBlock *MachineLoop::getBottomBlock() {
+ MachineBasicBlock *BotMBB = getHeader();
+ MachineFunction::iterator End = BotMBB->getParent()->end();
+ if (BotMBB->getIterator() != std::prev(End)) {
+ MachineBasicBlock *NextMBB = &*std::next(BotMBB->getIterator());
+ while (contains(NextMBB)) {
+ BotMBB = NextMBB;
+ if (BotMBB == &*std::prev(End))
+ break;
+ NextMBB = &*std::next(BotMBB->getIterator());
+ }
+ }
+ return BotMBB;
+}
+
+MachineBasicBlock *MachineLoop::findLoopControlBlock() {
+ if (MachineBasicBlock *Latch = getLoopLatch()) {
+ if (isLoopExiting(Latch))
+ return Latch;
+ else
+ return getExitingBlock();
+ }
+ return nullptr;
+}
+
+DebugLoc MachineLoop::getStartLoc() const {
+ // Try the pre-header first.
+ if (MachineBasicBlock *PHeadMBB = getLoopPreheader())
+ if (const BasicBlock *PHeadBB = PHeadMBB->getBasicBlock())
+ if (DebugLoc DL = PHeadBB->getTerminator()->getDebugLoc())
+ return DL;
+
+ // If we have no pre-header or there are no instructions with debug
+ // info in it, try the header.
+ if (MachineBasicBlock *HeadMBB = getHeader())
+ if (const BasicBlock *HeadBB = HeadMBB->getBasicBlock())
+ return HeadBB->getTerminator()->getDebugLoc();
+
+ return DebugLoc();
+}
+
+MachineBasicBlock *
+MachineLoopInfo::findLoopPreheader(MachineLoop *L, bool SpeculativePreheader,
+ bool FindMultiLoopPreheader) const {
+ if (MachineBasicBlock *PB = L->getLoopPreheader())
+ return PB;
+
+ if (!SpeculativePreheader)
+ return nullptr;
+
+ MachineBasicBlock *HB = L->getHeader(), *LB = L->getLoopLatch();
+ if (HB->pred_size() != 2 || HB->hasAddressTaken())
+ return nullptr;
+ // Find the predecessor of the header that is not the latch block.
+ MachineBasicBlock *Preheader = nullptr;
+ for (MachineBasicBlock *P : HB->predecessors()) {
+ if (P == LB)
+ continue;
+ // Sanity check: there can only be one non-latch predecessor.
+ if (Preheader)
+ return nullptr;
+ Preheader = P;
+ }
+
+ // Check if the preheader candidate is a successor of any other loop's
+ // header. We want to avoid having two loop setups in the same block.
+ if (!FindMultiLoopPreheader) {
+ for (MachineBasicBlock *S : Preheader->successors()) {
+ if (S == HB)
+ continue;
+ MachineLoop *T = getLoopFor(S);
+ if (T && T->getHeader() == S)
+ return nullptr;
+ }
+ }
+ return Preheader;
+}
+
+bool MachineLoop::isLoopInvariant(MachineInstr &I) const {
+ MachineFunction *MF = I.getParent()->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ const TargetSubtargetInfo &ST = MF->getSubtarget();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetInstrInfo *TII = ST.getInstrInfo();
+
+ // The instruction is loop invariant if all of its operands are.
+ for (const MachineOperand &MO : I.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ // An instruction that uses or defines a physical register can't e.g. be
+ // hoisted, so mark this as not invariant.
+ if (Reg.isPhysical()) {
+ if (MO.isUse()) {
+ // If the physreg has no defs anywhere, it's just an ambient register
+ // and we can freely move its uses. Alternatively, if it's allocatable,
+ // it could get allocated to something with a def during allocation.
+ // However, if the physreg is known to always be caller saved/restored
+ // then this use is safe to hoist.
+ if (!MRI->isConstantPhysReg(Reg) &&
+ !(TRI->isCallerPreservedPhysReg(Reg.asMCReg(), *I.getMF())) &&
+ !TII->isIgnorableUse(MO))
+ return false;
+ // Otherwise it's safe to move.
+ continue;
+ } else if (!MO.isDead()) {
+ // A def that isn't dead can't be moved.
+ return false;
+ } else if (getHeader()->isLiveIn(Reg)) {
+ // If the reg is live into the loop, we can't hoist an instruction
+ // which would clobber it.
+ return false;
+ }
+ }
+
+ if (!MO.isUse())
+ continue;
+
+ assert(MRI->getVRegDef(Reg) &&
+ "Machine instr not mapped for this vreg?!");
+
+ // If the loop contains the definition of an operand, then the instruction
+ // isn't loop invariant.
+ if (contains(MRI->getVRegDef(Reg)))
+ return false;
+ }
+
+ // If we got this far, the instruction is loop invariant!
+ return true;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineLoop::dump() const {
+ print(dbgs());
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineLoopUtils.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineLoopUtils.cpp
new file mode 100644
index 000000000000..0e8335d4974d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineLoopUtils.cpp
@@ -0,0 +1,134 @@
+//=- MachineLoopUtils.cpp - Functions for manipulating loops ----------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+using namespace llvm;
+
+namespace {
+// MI's parent and BB are clones of each other. Find the equivalent copy of MI
+// in BB.
+MachineInstr &findEquivalentInstruction(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ MachineBasicBlock *PB = MI.getParent();
+ unsigned Offset = std::distance(PB->instr_begin(), MachineBasicBlock::instr_iterator(MI));
+ return *std::next(BB->instr_begin(), Offset);
+}
+} // namespace
+
+MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction,
+ MachineBasicBlock *Loop,
+ MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII) {
+ MachineFunction &MF = *Loop->getParent();
+ MachineBasicBlock *Preheader = *Loop->pred_begin();
+ if (Preheader == Loop)
+ Preheader = *std::next(Loop->pred_begin());
+ MachineBasicBlock *Exit = *Loop->succ_begin();
+ if (Exit == Loop)
+ Exit = *std::next(Loop->succ_begin());
+
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(Loop->getBasicBlock());
+ if (Direction == LPD_Front)
+ MF.insert(Loop->getIterator(), NewBB);
+ else
+ MF.insert(std::next(Loop->getIterator()), NewBB);
+
+ DenseMap<Register, Register> Remaps;
+ auto InsertPt = NewBB->end();
+ for (MachineInstr &MI : *Loop) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
+ NewBB->insert(InsertPt, NewMI);
+ for (MachineOperand &MO : NewMI->defs()) {
+ Register OrigR = MO.getReg();
+ if (OrigR.isPhysical())
+ continue;
+ Register &R = Remaps[OrigR];
+ R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
+ MO.setReg(R);
+
+ if (Direction == LPD_Back) {
+ // Replace all uses outside the original loop with the new register.
+ // FIXME: is the use_iterator stable enough to mutate register uses
+ // while iterating?
+ SmallVector<MachineOperand *, 4> Uses;
+ for (auto &Use : MRI.use_operands(OrigR))
+ if (Use.getParent()->getParent() != Loop)
+ Uses.push_back(&Use);
+ for (auto *Use : Uses) {
+ const TargetRegisterClass *ConstrainRegClass =
+ MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg()));
+ assert(ConstrainRegClass &&
+ "Expected a valid constrained register class!");
+ (void)ConstrainRegClass;
+ Use->setReg(R);
+ }
+ }
+ }
+ }
+
+ for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I)
+ for (MachineOperand &MO : I->uses())
+ if (MO.isReg() && Remaps.count(MO.getReg()))
+ MO.setReg(Remaps[MO.getReg()]);
+
+ for (auto I = NewBB->begin(); I->isPHI(); ++I) {
+ MachineInstr &MI = *I;
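+ // A PHI in a single-block loop has exactly two incoming (value, block)
+ // operand pairs, in operands 1-2 and 3-4. Assume the preheader value is in
+ // operands 1-2 and the loop-carried value in 3-4; swap the indices if
+ // operand 2's block is not the preheader.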
+ unsigned LoopRegIdx = 3, InitRegIdx = 1;
+ if (MI.getOperand(2).getMBB() != Preheader)
+ std::swap(LoopRegIdx, InitRegIdx);
+ MachineInstr &OrigPhi = findEquivalentInstruction(MI, Loop);
+ assert(OrigPhi.isPHI());
+ if (Direction == LPD_Front) {
+ // When peeling front, we are only left with the initial value from the
+ // preheader.
+ Register R = MI.getOperand(LoopRegIdx).getReg();
+ if (Remaps.count(R))
+ R = Remaps[R];
+ OrigPhi.getOperand(InitRegIdx).setReg(R);
+ MI.removeOperand(LoopRegIdx + 1);
+ MI.removeOperand(LoopRegIdx + 0);
+ } else {
+ // When peeling back, the initial value is the loop-carried value from
+ // the original loop.
+ Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg();
+ MI.getOperand(LoopRegIdx).setReg(LoopReg);
+ MI.removeOperand(InitRegIdx + 1);
+ MI.removeOperand(InitRegIdx + 0);
+ }
+ }
+
+ DebugLoc DL;
+ if (Direction == LPD_Front) {
+ Preheader->ReplaceUsesOfBlockWith(Loop, NewBB);
+ NewBB->addSuccessor(Loop);
+ Loop->replacePhiUsesWith(Preheader, NewBB);
+ Preheader->updateTerminator(Loop);
+ TII->removeBranch(*NewBB);
+ TII->insertBranch(*NewBB, Loop, nullptr, {}, DL);
+ } else {
+ Loop->replaceSuccessor(Exit, NewBB);
+ Exit->replacePhiUsesWith(Loop, NewBB);
+ NewBB->addSuccessor(Exit);
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool CanAnalyzeBr = !TII->analyzeBranch(*Loop, TBB, FBB, Cond);
+ (void)CanAnalyzeBr;
+ assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
+ TII->removeBranch(*Loop);
+ TII->insertBranch(*Loop, TBB == Exit ? NewBB : TBB,
+ FBB == Exit ? NewBB : FBB, Cond, DL);
+ if (TII->removeBranch(*NewBB) > 0)
+ TII->insertBranch(*NewBB, Exit, nullptr, {}, DL);
+ }
+
+ return NewBB;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleInfo.cpp
new file mode 100644
index 000000000000..921feb253d64
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -0,0 +1,247 @@
+//===-- llvm/CodeGen/MachineModuleInfo.cpp ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::dwarf;
+
+static cl::opt<bool>
+ DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden,
+ cl::desc("Disable debug info printing"));
+
+// Out of line virtual method.
+MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
+
+void MachineModuleInfo::initialize() {
+ ObjFileMMI = nullptr;
+ CurCallSite = 0;
+ NextFnNum = 0;
+ UsesMSVCFloatingPoint = false;
+ DbgInfoAvailable = false;
+}
+
+void MachineModuleInfo::finalize() {
+ Context.reset();
+ // We don't clear the ExternalContext.
+
+ delete ObjFileMMI;
+ ObjFileMMI = nullptr;
+}
+
+MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI)
+ : TM(std::move(MMI.TM)),
+ Context(TM.getTargetTriple(), TM.getMCAsmInfo(), TM.getMCRegisterInfo(),
+ TM.getMCSubtargetInfo(), nullptr, &TM.Options.MCOptions, false),
+ MachineFunctions(std::move(MMI.MachineFunctions)) {
+ Context.setObjectFileInfo(TM.getObjFileLowering());
+ ObjFileMMI = MMI.ObjFileMMI;
+ CurCallSite = MMI.CurCallSite;
+ ExternalContext = MMI.ExternalContext;
+ TheModule = MMI.TheModule;
+}
+
+MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM)
+ : TM(*TM), Context(TM->getTargetTriple(), TM->getMCAsmInfo(),
+ TM->getMCRegisterInfo(), TM->getMCSubtargetInfo(),
+ nullptr, &TM->Options.MCOptions, false) {
+ Context.setObjectFileInfo(TM->getObjFileLowering());
+ initialize();
+}
+
+MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM,
+ MCContext *ExtContext)
+ : TM(*TM), Context(TM->getTargetTriple(), TM->getMCAsmInfo(),
+ TM->getMCRegisterInfo(), TM->getMCSubtargetInfo(),
+ nullptr, &TM->Options.MCOptions, false),
+ ExternalContext(ExtContext) {
+ Context.setObjectFileInfo(TM->getObjFileLowering());
+ initialize();
+}
+
+MachineModuleInfo::~MachineModuleInfo() { finalize(); }
+
+MachineFunction *
+MachineModuleInfo::getMachineFunction(const Function &F) const {
+ auto I = MachineFunctions.find(&F);
+ return I != MachineFunctions.end() ? I->second.get() : nullptr;
+}
+
+MachineFunction &MachineModuleInfo::getOrCreateMachineFunction(Function &F) {
+ // Shortcut for the common case where a sequence of MachineFunctionPasses
+ // all query for the same Function.
+ if (LastRequest == &F)
+ return *LastResult;
+
+ auto I = MachineFunctions.insert(
+ std::make_pair(&F, std::unique_ptr<MachineFunction>()));
+ MachineFunction *MF;
+ if (I.second) {
+ // No pre-existing machine function, create a new one.
+ const TargetSubtargetInfo &STI = *TM.getSubtargetImpl(F);
+ MF = new MachineFunction(F, TM, STI, NextFnNum++, *this);
+ MF->initTargetMachineFunctionInfo(STI);
+
+ // MRI callback for target specific initializations.
+ TM.registerMachineRegisterInfoCallback(*MF);
+
+ // Update the map entry.
+ I.first->second.reset(MF);
+ } else {
+ MF = I.first->second.get();
+ }
+
+ LastRequest = &F;
+ LastResult = MF;
+ return *MF;
+}
+
+void MachineModuleInfo::deleteMachineFunctionFor(Function &F) {
+ MachineFunctions.erase(&F);
+ LastRequest = nullptr;
+ LastResult = nullptr;
+}
+
+void MachineModuleInfo::insertFunction(const Function &F,
+ std::unique_ptr<MachineFunction> &&MF) {
+ auto I = MachineFunctions.insert(std::make_pair(&F, std::move(MF)));
+ assert(I.second && "machine function already mapped");
+ (void)I;
+}
+
+namespace {
+
+/// This pass frees the MachineFunction object associated with a Function.
+class FreeMachineFunction : public FunctionPass {
+public:
+ static char ID;
+
+ FreeMachineFunction() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ MMI.deleteMachineFunctionFor(F);
+ return true;
+ }
+
+ StringRef getPassName() const override {
+ return "Free MachineFunction";
+ }
+};
+
+} // end anonymous namespace
+
+char FreeMachineFunction::ID;
+
+FunctionPass *llvm::createFreeMachineFunctionPass() {
+ return new FreeMachineFunction();
+}
+
+MachineModuleInfoWrapperPass::MachineModuleInfoWrapperPass(
+ const LLVMTargetMachine *TM)
+ : ImmutablePass(ID), MMI(TM) {
+ initializeMachineModuleInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+MachineModuleInfoWrapperPass::MachineModuleInfoWrapperPass(
+ const LLVMTargetMachine *TM, MCContext *ExtContext)
+ : ImmutablePass(ID), MMI(TM, ExtContext) {
+ initializeMachineModuleInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+// Handle the pass registration stuff necessary to use this pass.
+INITIALIZE_PASS(MachineModuleInfoWrapperPass, "machinemoduleinfo",
+ "Machine Module Information", false, false)
+char MachineModuleInfoWrapperPass::ID = 0;
+
+static unsigned getLocCookie(const SMDiagnostic &SMD, const SourceMgr &SrcMgr,
+ std::vector<const MDNode *> &LocInfos) {
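+ // LocInfos holds one metadata node per inline-asm source buffer; the node's
+ // operands are location cookies indexed by asm line.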
+ // Look up a LocInfo for the buffer this diagnostic is coming from.
+ unsigned BufNum = SrcMgr.FindBufferContainingLoc(SMD.getLoc());
+ const MDNode *LocInfo = nullptr;
+ if (BufNum > 0 && BufNum <= LocInfos.size())
+ LocInfo = LocInfos[BufNum - 1];
+
+ // If the inline asm had metadata associated with it, pull out a location
+ // cookie corresponding to which line the error occurred on.
+ unsigned LocCookie = 0;
+ if (LocInfo) {
+ unsigned ErrorLine = SMD.getLineNo() - 1;
+ if (ErrorLine >= LocInfo->getNumOperands())
+ ErrorLine = 0;
+
+ if (LocInfo->getNumOperands() != 0)
+ if (const ConstantInt *CI =
+ mdconst::dyn_extract<ConstantInt>(LocInfo->getOperand(ErrorLine)))
+ LocCookie = CI->getZExtValue();
+ }
+
+ return LocCookie;
+}
+
+bool MachineModuleInfoWrapperPass::doInitialization(Module &M) {
+ MMI.initialize();
+ MMI.TheModule = &M;
+ // FIXME: Do this for new pass manager.
+ LLVMContext &Ctx = M.getContext();
+ MMI.getContext().setDiagnosticHandler(
+ [&Ctx, &M](const SMDiagnostic &SMD, bool IsInlineAsm,
+ const SourceMgr &SrcMgr,
+ std::vector<const MDNode *> &LocInfos) {
+ unsigned LocCookie = 0;
+ if (IsInlineAsm)
+ LocCookie = getLocCookie(SMD, SrcMgr, LocInfos);
+ Ctx.diagnose(
+ DiagnosticInfoSrcMgr(SMD, M.getName(), IsInlineAsm, LocCookie));
+ });
+ MMI.DbgInfoAvailable = !DisableDebugInfoPrinting &&
+ !M.debug_compile_units().empty();
+ return false;
+}
+
+bool MachineModuleInfoWrapperPass::doFinalization(Module &M) {
+ MMI.finalize();
+ return false;
+}
+
+AnalysisKey MachineModuleAnalysis::Key;
+
+MachineModuleInfo MachineModuleAnalysis::run(Module &M,
+ ModuleAnalysisManager &) {
+ MachineModuleInfo MMI(TM);
+ MMI.TheModule = &M;
+ MMI.DbgInfoAvailable = !DisableDebugInfoPrinting &&
+ !M.debug_compile_units().empty();
+ return MMI;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp
new file mode 100644
index 000000000000..9c3b31935f6d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp
@@ -0,0 +1,43 @@
+//===- llvm/CodeGen/MachineModuleInfoImpls.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements object-file format specific implementations of
+// MachineModuleInfoImpl.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSymbol.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineModuleInfoMachO
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+void MachineModuleInfoMachO::anchor() {}
+void MachineModuleInfoELF::anchor() {}
+void MachineModuleInfoCOFF::anchor() {}
+void MachineModuleInfoWasm::anchor() {}
+
+using PairTy = std::pair<MCSymbol *, MachineModuleInfoImpl::StubValueTy>;
+static int SortSymbolPair(const PairTy *LHS, const PairTy *RHS) {
+ return LHS->first->getName().compare(RHS->first->getName());
+}
+
+MachineModuleInfoImpl::SymbolListTy MachineModuleInfoImpl::getSortedStubs(
+ DenseMap<MCSymbol *, MachineModuleInfoImpl::StubValueTy> &Map) {
+ MachineModuleInfoImpl::SymbolListTy List(Map.begin(), Map.end());
+
+ array_pod_sort(List.begin(), List.end(), SortSymbolPair);
+
+ Map.clear();
+ return List;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleSlotTracker.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleSlotTracker.cpp
new file mode 100644
index 000000000000..aa63411df965
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineModuleSlotTracker.cpp
@@ -0,0 +1,80 @@
+//===-- llvm/CodeGen/MachineModuleSlotTracker.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleSlotTracker.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+using namespace llvm;
+
+void MachineModuleSlotTracker::processMachineFunctionMetadata(
+ AbstractSlotTrackerStorage *AST, const MachineFunction &MF) {
+ // Create metadata slots for metadata created within the backend.
+ for (const MachineBasicBlock &MBB : MF)
+ for (const MachineInstr &MI : MBB.instrs())
+ for (const MachineMemOperand *MMO : MI.memoperands()) {
+ AAMDNodes AAInfo = MMO->getAAInfo();
+ if (AAInfo.TBAA)
+ AST->createMetadataSlot(AAInfo.TBAA);
+ if (AAInfo.TBAAStruct)
+ AST->createMetadataSlot(AAInfo.TBAAStruct);
+ if (AAInfo.Scope)
+ AST->createMetadataSlot(AAInfo.Scope);
+ if (AAInfo.NoAlias)
+ AST->createMetadataSlot(AAInfo.NoAlias);
+ }
+}
+
+void MachineModuleSlotTracker::processMachineModule(
+ AbstractSlotTrackerStorage *AST, const Module *M,
+ bool ShouldInitializeAllMetadata) {
+ if (ShouldInitializeAllMetadata) {
+ for (const Function &F : *M) {
+ if (&F != &TheFunction)
+ continue;
+ MDNStartSlot = AST->getNextMetadataSlot();
+ if (auto *MF = TheMMI.getMachineFunction(F))
+ processMachineFunctionMetadata(AST, *MF);
+ MDNEndSlot = AST->getNextMetadataSlot();
+ break;
+ }
+ }
+}
+
+void MachineModuleSlotTracker::processMachineFunction(
+ AbstractSlotTrackerStorage *AST, const Function *F,
+ bool ShouldInitializeAllMetadata) {
+ if (!ShouldInitializeAllMetadata && F == &TheFunction) {
+ MDNStartSlot = AST->getNextMetadataSlot();
+ if (auto *MF = TheMMI.getMachineFunction(*F))
+ processMachineFunctionMetadata(AST, *MF);
+ MDNEndSlot = AST->getNextMetadataSlot();
+ }
+}
+
+void MachineModuleSlotTracker::collectMachineMDNodes(
+ MachineMDNodeListType &L) const {
+ collectMDNodes(L, MDNStartSlot, MDNEndSlot);
+}
+
+MachineModuleSlotTracker::MachineModuleSlotTracker(
+ const MachineFunction *MF, bool ShouldInitializeAllMetadata)
+ : ModuleSlotTracker(MF->getFunction().getParent(),
+ ShouldInitializeAllMetadata),
+ TheFunction(MF->getFunction()), TheMMI(MF->getMMI()) {
+ setProcessHook([this](AbstractSlotTrackerStorage *AST, const Module *M,
+ bool ShouldInitializeAllMetadata) {
+ this->processMachineModule(AST, M, ShouldInitializeAllMetadata);
+ });
+ setProcessHook([this](AbstractSlotTrackerStorage *AST, const Function *F,
+ bool ShouldInitializeAllMetadata) {
+ this->processMachineFunction(AST, F, ShouldInitializeAllMetadata);
+ });
+}
+
+MachineModuleSlotTracker::~MachineModuleSlotTracker() = default;
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp
new file mode 100644
index 000000000000..788c134b6ee8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineOperand.cpp
@@ -0,0 +1,1256 @@
+//===- lib/CodeGen/MachineOperand.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Methods common to all machine operands.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/CodeGen/MIRFormatter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StableHashing.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <optional>
+
+using namespace llvm;
+
+static cl::opt<int>
+ PrintRegMaskNumRegs("print-regmask-num-regs",
+ cl::desc("Number of registers to limit to when "
+ "printing regmask operands in IR dumps. "
+ "unlimited = -1"),
+ cl::init(32), cl::Hidden);
+
+static const MachineFunction *getMFIfAvailable(const MachineOperand &MO) {
+ if (const MachineInstr *MI = MO.getParent())
+ if (const MachineBasicBlock *MBB = MI->getParent())
+ if (const MachineFunction *MF = MBB->getParent())
+ return MF;
+ return nullptr;
+}
+
+static MachineFunction *getMFIfAvailable(MachineOperand &MO) {
+ return const_cast<MachineFunction *>(
+ getMFIfAvailable(const_cast<const MachineOperand &>(MO)));
+}
+
+unsigned MachineOperand::getOperandNo() const {
+ assert(getParent() && "Operand does not belong to any instruction!");
+ return getParent()->getOperandNo(this);
+}
+
+void MachineOperand::setReg(Register Reg) {
+ if (getReg() == Reg)
+ return; // No change.
+
+ // Clear the IsRenamable bit to keep it conservatively correct.
+ IsRenamable = false;
+
+ // Otherwise, we have to change the register. If this operand is embedded
+ // into a machine function, we need to update the old and new register's
+ // use/def lists.
+ if (MachineFunction *MF = getMFIfAvailable(*this)) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.removeRegOperandFromUseList(this);
+ SmallContents.RegNo = Reg;
+ MRI.addRegOperandToUseList(this);
+ return;
+ }
+
+ // Otherwise, just change the register, no problem. :)
+ SmallContents.RegNo = Reg;
+}
+
+void MachineOperand::substVirtReg(Register Reg, unsigned SubIdx,
+ const TargetRegisterInfo &TRI) {
+ assert(Reg.isVirtual());
+ if (SubIdx && getSubReg())
+ SubIdx = TRI.composeSubRegIndices(SubIdx, getSubReg());
+ setReg(Reg);
+ if (SubIdx)
+ setSubReg(SubIdx);
+}
+
+void MachineOperand::substPhysReg(MCRegister Reg, const TargetRegisterInfo &TRI) {
+ assert(Register::isPhysicalRegister(Reg));
+ if (getSubReg()) {
+ Reg = TRI.getSubReg(Reg, getSubReg());
+ // Note that getSubReg() may return 0 if the sub-register doesn't exist.
+ // That won't happen in legal code.
+ setSubReg(0);
+ if (isDef())
+ setIsUndef(false);
+ }
+ setReg(Reg);
+}
+
+/// Change a def to a use, or a use to a def.
+void MachineOperand::setIsDef(bool Val) {
+ assert(isReg() && "Wrong MachineOperand accessor");
+ assert((!Val || !isDebug()) && "Marking a debug operation as def");
+ if (IsDef == Val)
+ return;
+ assert(!IsDeadOrKill && "Changing def/use with dead/kill set not supported");
+ // MRI may keep uses and defs in different list positions.
+ if (MachineFunction *MF = getMFIfAvailable(*this)) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.removeRegOperandFromUseList(this);
+ IsDef = Val;
+ MRI.addRegOperandToUseList(this);
+ return;
+ }
+ IsDef = Val;
+}
+
+bool MachineOperand::isRenamable() const {
+ assert(isReg() && "Wrong MachineOperand accessor");
+ assert(getReg().isPhysical() &&
+ "isRenamable should only be checked on physical registers");
+ if (!IsRenamable)
+ return false;
+
+ const MachineInstr *MI = getParent();
+ if (!MI)
+ return true;
+
+ if (isDef())
+ return !MI->hasExtraDefRegAllocReq(MachineInstr::IgnoreBundle);
+
+ assert(isUse() && "Reg is not def or use");
+ return !MI->hasExtraSrcRegAllocReq(MachineInstr::IgnoreBundle);
+}
+
+void MachineOperand::setIsRenamable(bool Val) {
+ assert(isReg() && "Wrong MachineOperand accessor");
+ assert(getReg().isPhysical() &&
+ "setIsRenamable should only be called on physical registers");
+ IsRenamable = Val;
+}
+
+// If this operand is currently a register operand, and if this is in a
+// function, deregister the operand from the register's use/def list.
+void MachineOperand::removeRegFromUses() {
+ if (!isReg() || !isOnRegUseList())
+ return;
+
+ if (MachineFunction *MF = getMFIfAvailable(*this))
+ MF->getRegInfo().removeRegOperandFromUseList(this);
+}
+
+/// ChangeToImmediate - Replace this operand with a new immediate operand of
+/// the specified value. If an operand is known to be an immediate already,
+/// the setImm method should be used.
+void MachineOperand::ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags) {
+ assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
+
+ removeRegFromUses();
+
+ OpKind = MO_Immediate;
+ Contents.ImmVal = ImmVal;
+ setTargetFlags(TargetFlags);
+}
+
+void MachineOperand::ChangeToFPImmediate(const ConstantFP *FPImm,
+ unsigned TargetFlags) {
+ assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
+
+ removeRegFromUses();
+
+ OpKind = MO_FPImmediate;
+ Contents.CFP = FPImm;
+ setTargetFlags(TargetFlags);
+}
+
+void MachineOperand::ChangeToES(const char *SymName,
+ unsigned TargetFlags) {
+ assert((!isReg() || !isTied()) &&
+ "Cannot change a tied operand into an external symbol");
+
+ removeRegFromUses();
+
+ OpKind = MO_ExternalSymbol;
+ Contents.OffsetedInfo.Val.SymbolName = SymName;
+ setOffset(0); // Offset is always 0.
+ setTargetFlags(TargetFlags);
+}
+
+void MachineOperand::ChangeToGA(const GlobalValue *GV, int64_t Offset,
+ unsigned TargetFlags) {
+ assert((!isReg() || !isTied()) &&
+ "Cannot change a tied operand into a global address");
+
+ removeRegFromUses();
+
+ OpKind = MO_GlobalAddress;
+ Contents.OffsetedInfo.Val.GV = GV;
+ setOffset(Offset);
+ setTargetFlags(TargetFlags);
+}
+
+void MachineOperand::ChangeToMCSymbol(MCSymbol *Sym, unsigned TargetFlags) {
+ assert((!isReg() || !isTied()) &&
+ "Cannot change a tied operand into an MCSymbol");
+
+ removeRegFromUses();
+
+ OpKind = MO_MCSymbol;
+ Contents.Sym = Sym;
+ setTargetFlags(TargetFlags);
+}
+
+void MachineOperand::ChangeToFrameIndex(int Idx, unsigned TargetFlags) {
+ assert((!isReg() || !isTied()) &&
+ "Cannot change a tied operand into a FrameIndex");
+
+ removeRegFromUses();
+
+ OpKind = MO_FrameIndex;
+ setIndex(Idx);
+ setTargetFlags(TargetFlags);
+}
+
+void MachineOperand::ChangeToTargetIndex(unsigned Idx, int64_t Offset,
+ unsigned TargetFlags) {
+ assert((!isReg() || !isTied()) &&
+ "Cannot change a tied operand into a FrameIndex");
+
+ removeRegFromUses();
+
+ OpKind = MO_TargetIndex;
+ setIndex(Idx);
+ setOffset(Offset);
+ setTargetFlags(TargetFlags);
+}
+
+void MachineOperand::ChangeToDbgInstrRef(unsigned InstrIdx, unsigned OpIdx,
+ unsigned TargetFlags) {
+ assert((!isReg() || !isTied()) &&
+ "Cannot change a tied operand into a DbgInstrRef");
+
+ removeRegFromUses();
+
+ OpKind = MO_DbgInstrRef;
+ setInstrRefInstrIndex(InstrIdx);
+ setInstrRefOpIndex(OpIdx);
+ setTargetFlags(TargetFlags);
+}
+
+/// ChangeToRegister - Replace this operand with a new register operand of
+/// the specified value. If an operand is known to be a register already,
+/// the setReg method should be used.
+void MachineOperand::ChangeToRegister(Register Reg, bool isDef, bool isImp,
+ bool isKill, bool isDead, bool isUndef,
+ bool isDebug) {
+ MachineRegisterInfo *RegInfo = nullptr;
+ if (MachineFunction *MF = getMFIfAvailable(*this))
+ RegInfo = &MF->getRegInfo();
+ // If this operand is already a register operand, remove it from the
+ // register's use/def lists.
+ bool WasReg = isReg();
+ if (RegInfo && WasReg)
+ RegInfo->removeRegOperandFromUseList(this);
+
+ // Ensure debug instructions set debug flag on register uses.
+ const MachineInstr *MI = getParent();
+ if (!isDef && MI && MI->isDebugInstr())
+ isDebug = true;
+
+ // Change this to a register and set the reg#.
+ assert(!(isDead && !isDef) && "Dead flag on non-def");
+ assert(!(isKill && isDef) && "Kill flag on def");
+ OpKind = MO_Register;
+ SmallContents.RegNo = Reg;
+ SubReg_TargetFlags = 0;
+ IsDef = isDef;
+ IsImp = isImp;
+ IsDeadOrKill = isKill | isDead;
+ IsRenamable = false;
+ IsUndef = isUndef;
+ IsInternalRead = false;
+ IsEarlyClobber = false;
+ IsDebug = isDebug;
+ // Ensure isOnRegUseList() returns false.
+ Contents.Reg.Prev = nullptr;
+ // Preserve the tie when the operand was already a register.
+ if (!WasReg)
+ TiedTo = 0;
+
+ // If this operand is embedded in a function, add the operand to the
+ // register's use/def list.
+ if (RegInfo)
+ RegInfo->addRegOperandToUseList(this);
+}
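+
+// Example (illustrative): rewriting an existing register operand in place.
+// MO.ChangeToRegister(NewReg, /*isDef=*/false, /*isImp=*/false,
+// /*isKill=*/true);
+// Because the operand was already a register, any tie to a def is preserved;
+// the remaining status flags are reset as above.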
+
+/// isIdenticalTo - Return true if this operand is identical to the specified
+/// operand. Note that this should stay in sync with the hash_value overload
+/// below.
+bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
+ if (getType() != Other.getType() ||
+ getTargetFlags() != Other.getTargetFlags())
+ return false;
+
+ switch (getType()) {
+ case MachineOperand::MO_Register:
+ return getReg() == Other.getReg() && isDef() == Other.isDef() &&
+ getSubReg() == Other.getSubReg();
+ case MachineOperand::MO_Immediate:
+ return getImm() == Other.getImm();
+ case MachineOperand::MO_CImmediate:
+ return getCImm() == Other.getCImm();
+ case MachineOperand::MO_FPImmediate:
+ return getFPImm() == Other.getFPImm();
+ case MachineOperand::MO_MachineBasicBlock:
+ return getMBB() == Other.getMBB();
+ case MachineOperand::MO_FrameIndex:
+ return getIndex() == Other.getIndex();
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
+ return getIndex() == Other.getIndex() && getOffset() == Other.getOffset();
+ case MachineOperand::MO_JumpTableIndex:
+ return getIndex() == Other.getIndex();
+ case MachineOperand::MO_GlobalAddress:
+ return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset();
+ case MachineOperand::MO_ExternalSymbol:
+ return strcmp(getSymbolName(), Other.getSymbolName()) == 0 &&
+ getOffset() == Other.getOffset();
+ case MachineOperand::MO_BlockAddress:
+ return getBlockAddress() == Other.getBlockAddress() &&
+ getOffset() == Other.getOffset();
+ case MachineOperand::MO_RegisterMask:
+ case MachineOperand::MO_RegisterLiveOut: {
+ // Shallow compare of the two RegMasks
+ const uint32_t *RegMask = getRegMask();
+ const uint32_t *OtherRegMask = Other.getRegMask();
+ if (RegMask == OtherRegMask)
+ return true;
+
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ // Deep compare of the two RegMasks
+ return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask);
+ }
+ // We don't know the size of the RegMask, so we can't deep compare the two
+ // reg masks.
+ return false;
+ }
+ case MachineOperand::MO_MCSymbol:
+ return getMCSymbol() == Other.getMCSymbol();
+ case MachineOperand::MO_DbgInstrRef:
+ return getInstrRefInstrIndex() == Other.getInstrRefInstrIndex() &&
+ getInstrRefOpIndex() == Other.getInstrRefOpIndex();
+ case MachineOperand::MO_CFIIndex:
+ return getCFIIndex() == Other.getCFIIndex();
+ case MachineOperand::MO_Metadata:
+ return getMetadata() == Other.getMetadata();
+ case MachineOperand::MO_IntrinsicID:
+ return getIntrinsicID() == Other.getIntrinsicID();
+ case MachineOperand::MO_Predicate:
+ return getPredicate() == Other.getPredicate();
+ case MachineOperand::MO_ShuffleMask:
+ return getShuffleMask() == Other.getShuffleMask();
+ }
+ llvm_unreachable("Invalid machine operand type");
+}
+
+// Note: this must stay exactly in sync with isIdenticalTo above.
+hash_code llvm::hash_value(const MachineOperand &MO) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ // Register operands don't have target flags.
+ return hash_combine(MO.getType(), (unsigned)MO.getReg(), MO.getSubReg(), MO.isDef());
+ case MachineOperand::MO_Immediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm());
+ case MachineOperand::MO_CImmediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCImm());
+ case MachineOperand::MO_FPImmediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getFPImm());
+ case MachineOperand::MO_MachineBasicBlock:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMBB());
+ case MachineOperand::MO_FrameIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex(),
+ MO.getOffset());
+ case MachineOperand::MO_JumpTableIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
+ case MachineOperand::MO_ExternalSymbol:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(),
+ StringRef(MO.getSymbolName()));
+ case MachineOperand::MO_GlobalAddress:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getGlobal(),
+ MO.getOffset());
+ case MachineOperand::MO_BlockAddress:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getBlockAddress(),
+ MO.getOffset());
+ case MachineOperand::MO_RegisterMask:
+ case MachineOperand::MO_RegisterLiveOut: {
+ if (const MachineFunction *MF = getMFIfAvailable(MO)) {
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ const uint32_t *RegMask = MO.getRegMask();
+ std::vector<stable_hash> RegMaskHashes(RegMask, RegMask + RegMaskSize);
+ return hash_combine(MO.getType(), MO.getTargetFlags(),
+ stable_hash_combine_array(RegMaskHashes.data(),
+ RegMaskHashes.size()));
+ }
+
+ assert(0 && "MachineOperand not associated with any MachineFunction");
+ return hash_combine(MO.getType(), MO.getTargetFlags());
+ }
+ case MachineOperand::MO_Metadata:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata());
+ case MachineOperand::MO_MCSymbol:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol());
+ case MachineOperand::MO_DbgInstrRef:
+ return hash_combine(MO.getType(), MO.getTargetFlags(),
+ MO.getInstrRefInstrIndex(), MO.getInstrRefOpIndex());
+ case MachineOperand::MO_CFIIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCFIIndex());
+ case MachineOperand::MO_IntrinsicID:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID());
+ case MachineOperand::MO_Predicate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate());
+ case MachineOperand::MO_ShuffleMask:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getShuffleMask());
+ }
+ llvm_unreachable("Invalid machine operand type");
+}
+
+// Try to crawl up to the machine function and get TRI and IntrinsicInfo from
+// it.
+static void tryToGetTargetInfo(const MachineOperand &MO,
+ const TargetRegisterInfo *&TRI,
+ const TargetIntrinsicInfo *&IntrinsicInfo) {
+ if (const MachineFunction *MF = getMFIfAvailable(MO)) {
+ TRI = MF->getSubtarget().getRegisterInfo();
+ IntrinsicInfo = MF->getTarget().getIntrinsicInfo();
+ }
+}
+
+static const char *getTargetIndexName(const MachineFunction &MF, int Index) {
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ assert(TII && "expected instruction info");
+ auto Indices = TII->getSerializableTargetIndices();
+ auto Found = find_if(Indices, [&](const std::pair<int, const char *> &I) {
+ return I.first == Index;
+ });
+ if (Found != Indices.end())
+ return Found->second;
+ return nullptr;
+}
+
+const char *MachineOperand::getTargetIndexName() const {
+ const MachineFunction *MF = getMFIfAvailable(*this);
+ return MF ? ::getTargetIndexName(*MF, this->getIndex()) : nullptr;
+}
+
+static const char *getTargetFlagName(const TargetInstrInfo *TII, unsigned TF) {
+ auto Flags = TII->getSerializableDirectMachineOperandTargetFlags();
+ for (const auto &I : Flags) {
+ if (I.first == TF) {
+ return I.second;
+ }
+ }
+ return nullptr;
+}
+
+static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS,
+ const TargetRegisterInfo *TRI) {
+ if (!TRI) {
+ OS << "%dwarfreg." << DwarfReg;
+ return;
+ }
+
+ if (std::optional<unsigned> Reg = TRI->getLLVMRegNum(DwarfReg, true))
+ OS << printReg(*Reg, TRI);
+ else
+ OS << "<badreg>";
+}
+
+static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB,
+ ModuleSlotTracker &MST) {
+ OS << "%ir-block.";
+ if (BB.hasName()) {
+ printLLVMNameWithoutPrefix(OS, BB.getName());
+ return;
+ }
+ std::optional<int> Slot;
+ if (const Function *F = BB.getParent()) {
+ if (F == MST.getCurrentFunction()) {
+ Slot = MST.getLocalSlot(&BB);
+ } else if (const Module *M = F->getParent()) {
+ ModuleSlotTracker CustomMST(M, /*ShouldInitializeAllMetadata=*/false);
+ CustomMST.incorporateFunction(*F);
+ Slot = CustomMST.getLocalSlot(&BB);
+ }
+ }
+ if (Slot)
+ MachineOperand::printIRSlotNumber(OS, *Slot);
+ else
+ OS << "<unknown>";
+}
+
+static void printSyncScope(raw_ostream &OS, const LLVMContext &Context,
+ SyncScope::ID SSID,
+ SmallVectorImpl<StringRef> &SSNs) {
+ switch (SSID) {
+ case SyncScope::System:
+ break;
+ default:
+ if (SSNs.empty())
+ Context.getSyncScopeNames(SSNs);
+
+ OS << "syncscope(\"";
+ printEscapedString(SSNs[SSID], OS);
+ OS << "\") ";
+ break;
+ }
+}
+
+static const char *getTargetMMOFlagName(const TargetInstrInfo &TII,
+ unsigned TMMOFlag) {
+ auto Flags = TII.getSerializableMachineMemOperandTargetFlags();
+ for (const auto &I : Flags) {
+ if (I.first == TMMOFlag) {
+ return I.second;
+ }
+ }
+ return nullptr;
+}
+
+static void printFrameIndex(raw_ostream &OS, int FrameIndex, bool IsFixed,
+ const MachineFrameInfo *MFI) {
+ StringRef Name;
+ if (MFI) {
+ IsFixed = MFI->isFixedObjectIndex(FrameIndex);
+ if (const AllocaInst *Alloca = MFI->getObjectAllocation(FrameIndex))
+ if (Alloca->hasName())
+ Name = Alloca->getName();
+ if (IsFixed)
+ FrameIndex -= MFI->getObjectIndexBegin();
+ }
+ MachineOperand::printStackObjectReference(OS, FrameIndex, IsFixed, Name);
+}
+
+void MachineOperand::printSubRegIdx(raw_ostream &OS, uint64_t Index,
+ const TargetRegisterInfo *TRI) {
+ OS << "%subreg.";
+ if (TRI && Index != 0 && Index < TRI->getNumSubRegIndices())
+ OS << TRI->getSubRegIndexName(Index);
+ else
+ OS << Index;
+}
+
+void MachineOperand::printTargetFlags(raw_ostream &OS,
+ const MachineOperand &Op) {
+ if (!Op.getTargetFlags())
+ return;
+ const MachineFunction *MF = getMFIfAvailable(Op);
+ if (!MF)
+ return;
+
+ const auto *TII = MF->getSubtarget().getInstrInfo();
+ assert(TII && "expected instruction info");
+ auto Flags = TII->decomposeMachineOperandsTargetFlags(Op.getTargetFlags());
+ OS << "target-flags(";
+ const bool HasDirectFlags = Flags.first;
+ const bool HasBitmaskFlags = Flags.second;
+ if (!HasDirectFlags && !HasBitmaskFlags) {
+ OS << "<unknown>) ";
+ return;
+ }
+ if (HasDirectFlags) {
+ if (const auto *Name = getTargetFlagName(TII, Flags.first))
+ OS << Name;
+ else
+ OS << "<unknown target flag>";
+ }
+ if (!HasBitmaskFlags) {
+ OS << ") ";
+ return;
+ }
+ bool IsCommaNeeded = HasDirectFlags;
+ unsigned BitMask = Flags.second;
+ auto BitMasks = TII->getSerializableBitmaskMachineOperandTargetFlags();
+ for (const auto &Mask : BitMasks) {
+ // Check if the flag's bitmask has the bits of the current mask set.
+ if ((BitMask & Mask.first) == Mask.first) {
+ if (IsCommaNeeded)
+ OS << ", ";
+ IsCommaNeeded = true;
+ OS << Mask.second;
+ // Clear the bits which were serialized from the flag's bitmask.
+ BitMask &= ~(Mask.first);
+ }
+ }
+ if (BitMask) {
+ // When the resulting flag's bitmask isn't zero, we know that we didn't
+ // serialize all of the bit flags.
+ if (IsCommaNeeded)
+ OS << ", ";
+ OS << "<unknown bitmask target flag>";
+ }
+ OS << ") ";
+}
+
+void MachineOperand::printSymbol(raw_ostream &OS, MCSymbol &Sym) {
+ OS << "<mcsymbol " << Sym << ">";
+}
+
+void MachineOperand::printStackObjectReference(raw_ostream &OS,
+ unsigned FrameIndex,
+ bool IsFixed, StringRef Name) {
+ if (IsFixed) {
+ OS << "%fixed-stack." << FrameIndex;
+ return;
+ }
+
+ OS << "%stack." << FrameIndex;
+ if (!Name.empty())
+ OS << '.' << Name;
+}
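+
+// Typical renderings of the two forms above (the alloca name is illustrative):
+// %fixed-stack.0 for a fixed frame index
+// %stack.1.myalloca for an ordinary frame index named after its alloca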
+
+void MachineOperand::printOperandOffset(raw_ostream &OS, int64_t Offset) {
+ if (Offset == 0)
+ return;
+ if (Offset < 0) {
+ OS << " - " << -Offset;
+ return;
+ }
+ OS << " + " << Offset;
+}
+
+void MachineOperand::printIRSlotNumber(raw_ostream &OS, int Slot) {
+ if (Slot == -1)
+ OS << "<badref>";
+ else
+ OS << Slot;
+}
+
+static void printCFI(raw_ostream &OS, const MCCFIInstruction &CFI,
+ const TargetRegisterInfo *TRI) {
+ switch (CFI.getOperation()) {
+ case MCCFIInstruction::OpSameValue:
+ OS << "same_value ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpRememberState:
+ OS << "remember_state ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ break;
+ case MCCFIInstruction::OpRestoreState:
+ OS << "restore_state ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ break;
+ case MCCFIInstruction::OpOffset:
+ OS << "offset ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfaRegister:
+ OS << "def_cfa_register ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpDefCfaOffset:
+ OS << "def_cfa_offset ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ OS << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfa:
+ OS << "def_cfa ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpLLVMDefAspaceCfa:
+ OS << "llvm_def_aspace_cfa ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ OS << ", " << CFI.getAddressSpace();
+ break;
+ case MCCFIInstruction::OpRelOffset:
+ OS << "rel_offset ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpAdjustCfaOffset:
+ OS << "adjust_cfa_offset ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ OS << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpRestore:
+ OS << "restore ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpEscape: {
+ OS << "escape ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ if (!CFI.getValues().empty()) {
+ size_t e = CFI.getValues().size() - 1;
+ for (size_t i = 0; i < e; ++i)
+ OS << format("0x%02x", uint8_t(CFI.getValues()[i])) << ", ";
+ OS << format("0x%02x", uint8_t(CFI.getValues()[e]));
+ }
+ break;
+ }
+ case MCCFIInstruction::OpUndefined:
+ OS << "undefined ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpRegister:
+ OS << "register ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", ";
+ printCFIRegister(CFI.getRegister2(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpWindowSave:
+ OS << "window_save ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ break;
+ case MCCFIInstruction::OpNegateRAState:
+ OS << "negate_ra_sign_state ";
+ if (MCSymbol *Label = CFI.getLabel())
+ MachineOperand::printSymbol(OS, *Label);
+ break;
+ default:
+ // TODO: Print the other CFI Operations.
+ OS << "<unserializable cfi directive>";
+ break;
+ }
+}
+
+void MachineOperand::print(raw_ostream &OS, const TargetRegisterInfo *TRI,
+ const TargetIntrinsicInfo *IntrinsicInfo) const {
+ print(OS, LLT{}, TRI, IntrinsicInfo);
+}
+
+void MachineOperand::print(raw_ostream &OS, LLT TypeToPrint,
+ const TargetRegisterInfo *TRI,
+ const TargetIntrinsicInfo *IntrinsicInfo) const {
+ tryToGetTargetInfo(*this, TRI, IntrinsicInfo);
+ ModuleSlotTracker DummyMST(nullptr);
+ print(OS, DummyMST, TypeToPrint, std::nullopt, /*PrintDef=*/false,
+ /*IsStandalone=*/true,
+ /*ShouldPrintRegisterTies=*/true,
+ /*TiedOperandIdx=*/0, TRI, IntrinsicInfo);
+}
+
+void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
+ LLT TypeToPrint, std::optional<unsigned> OpIdx,
+ bool PrintDef, bool IsStandalone,
+ bool ShouldPrintRegisterTies,
+ unsigned TiedOperandIdx,
+ const TargetRegisterInfo *TRI,
+ const TargetIntrinsicInfo *IntrinsicInfo) const {
+ printTargetFlags(OS, *this);
+ switch (getType()) {
+ case MachineOperand::MO_Register: {
+ Register Reg = getReg();
+ if (isImplicit())
+ OS << (isDef() ? "implicit-def " : "implicit ");
+ else if (PrintDef && isDef())
+ // Print the 'def' flag only when the operand is defined after '='.
+ OS << "def ";
+ if (isInternalRead())
+ OS << "internal ";
+ if (isDead())
+ OS << "dead ";
+ if (isKill())
+ OS << "killed ";
+ if (isUndef())
+ OS << "undef ";
+ if (isEarlyClobber())
+ OS << "early-clobber ";
+ if (getReg().isPhysical() && isRenamable())
+ OS << "renamable ";
+ // isDebug() is exactly true for register operands of a DBG_VALUE. So we
+ // simply infer it when parsing and do not need to print it.
+
+ const MachineRegisterInfo *MRI = nullptr;
+ if (Reg.isVirtual()) {
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ MRI = &MF->getRegInfo();
+ }
+ }
+
+ OS << printReg(Reg, TRI, 0, MRI);
+ // Print the sub register.
+ if (unsigned SubReg = getSubReg()) {
+ if (TRI)
+ OS << '.' << TRI->getSubRegIndexName(SubReg);
+ else
+ OS << ".subreg" << SubReg;
+ }
+ // Print the register class / bank.
+ if (Reg.isVirtual()) {
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ if (IsStandalone || !PrintDef || MRI.def_empty(Reg)) {
+ OS << ':';
+ OS << printRegClassOrBank(Reg, MRI, TRI);
+ }
+ }
+ }
+ // Print ties.
+ if (ShouldPrintRegisterTies && isTied() && !isDef())
+ OS << "(tied-def " << TiedOperandIdx << ")";
+ // Print types.
+ if (TypeToPrint.isValid())
+ OS << '(' << TypeToPrint << ')';
+ break;
+ }
+ case MachineOperand::MO_Immediate: {
+ const MIRFormatter *Formatter = nullptr;
+ if (const MachineFunction *MF = getMFIfAvailable(*this)) {
+ const auto *TII = MF->getSubtarget().getInstrInfo();
+ assert(TII && "expected instruction info");
+ Formatter = TII->getMIRFormatter();
+ }
+ if (Formatter)
+ Formatter->printImm(OS, *getParent(), OpIdx, getImm());
+ else
+ OS << getImm();
+ break;
+ }
+ case MachineOperand::MO_CImmediate:
+ getCImm()->printAsOperand(OS, /*PrintType=*/true, MST);
+ break;
+ case MachineOperand::MO_FPImmediate:
+ getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST);
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ OS << printMBBReference(*getMBB());
+ break;
+ case MachineOperand::MO_FrameIndex: {
+ int FrameIndex = getIndex();
+ bool IsFixed = false;
+ const MachineFrameInfo *MFI = nullptr;
+ if (const MachineFunction *MF = getMFIfAvailable(*this))
+ MFI = &MF->getFrameInfo();
+ printFrameIndex(OS, FrameIndex, IsFixed, MFI);
+ break;
+ }
+ case MachineOperand::MO_ConstantPoolIndex:
+ OS << "%const." << getIndex();
+ printOperandOffset(OS, getOffset());
+ break;
+ case MachineOperand::MO_TargetIndex: {
+ OS << "target-index(";
+ const char *Name = "<unknown>";
+ if (const MachineFunction *MF = getMFIfAvailable(*this))
+ if (const auto *TargetIndexName = ::getTargetIndexName(*MF, getIndex()))
+ Name = TargetIndexName;
+ OS << Name << ')';
+ printOperandOffset(OS, getOffset());
+ break;
+ }
+ case MachineOperand::MO_JumpTableIndex:
+ OS << printJumpTableEntryReference(getIndex());
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST);
+ printOperandOffset(OS, getOffset());
+ break;
+ case MachineOperand::MO_ExternalSymbol: {
+ StringRef Name = getSymbolName();
+ OS << '&';
+ if (Name.empty()) {
+ OS << "\"\"";
+ } else {
+ printLLVMNameWithoutPrefix(OS, Name);
+ }
+ printOperandOffset(OS, getOffset());
+ break;
+ }
+ case MachineOperand::MO_BlockAddress: {
+ OS << "blockaddress(";
+ getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false,
+ MST);
+ OS << ", ";
+ printIRBlockReference(OS, *getBlockAddress()->getBasicBlock(), MST);
+ OS << ')';
+ MachineOperand::printOperandOffset(OS, getOffset());
+ break;
+ }
+ case MachineOperand::MO_RegisterMask: {
+ OS << "<regmask";
+ if (TRI) {
+ unsigned NumRegsInMask = 0;
+ unsigned NumRegsEmitted = 0;
+ for (unsigned i = 0; i < TRI->getNumRegs(); ++i) {
+ unsigned MaskWord = i / 32;
+ unsigned MaskBit = i % 32;
+ if (getRegMask()[MaskWord] & (1 << MaskBit)) {
+ if (PrintRegMaskNumRegs < 0 ||
+ NumRegsEmitted <= static_cast<unsigned>(PrintRegMaskNumRegs)) {
+ OS << " " << printReg(i, TRI);
+ NumRegsEmitted++;
+ }
+ NumRegsInMask++;
+ }
+ }
+ if (NumRegsEmitted != NumRegsInMask)
+ OS << " and " << (NumRegsInMask - NumRegsEmitted) << " more...";
+ } else {
+ OS << " ...";
+ }
+ OS << ">";
+ break;
+ }
+ case MachineOperand::MO_RegisterLiveOut: {
+ const uint32_t *RegMask = getRegLiveOut();
+ OS << "liveout(";
+ if (!TRI) {
+ OS << "<unknown>";
+ } else {
+ bool IsCommaNeeded = false;
+ for (unsigned Reg = 0, E = TRI->getNumRegs(); Reg < E; ++Reg) {
+ if (RegMask[Reg / 32] & (1U << (Reg % 32))) {
+ if (IsCommaNeeded)
+ OS << ", ";
+ OS << printReg(Reg, TRI);
+ IsCommaNeeded = true;
+ }
+ }
+ }
+ OS << ")";
+ break;
+ }
+ case MachineOperand::MO_Metadata:
+ getMetadata()->printAsOperand(OS, MST);
+ break;
+ case MachineOperand::MO_MCSymbol:
+ printSymbol(OS, *getMCSymbol());
+ break;
+ case MachineOperand::MO_DbgInstrRef: {
+ OS << "dbg-instr-ref(" << getInstrRefInstrIndex() << ", "
+ << getInstrRefOpIndex() << ')';
+ break;
+ }
+ case MachineOperand::MO_CFIIndex: {
+ if (const MachineFunction *MF = getMFIfAvailable(*this))
+ printCFI(OS, MF->getFrameInstructions()[getCFIIndex()], TRI);
+ else
+ OS << "<cfi directive>";
+ break;
+ }
+ case MachineOperand::MO_IntrinsicID: {
+ Intrinsic::ID ID = getIntrinsicID();
+ if (ID < Intrinsic::num_intrinsics)
+ OS << "intrinsic(@" << Intrinsic::getBaseName(ID) << ')';
+ else if (IntrinsicInfo)
+ OS << "intrinsic(@" << IntrinsicInfo->getName(ID) << ')';
+ else
+ OS << "intrinsic(" << ID << ')';
+ break;
+ }
+ case MachineOperand::MO_Predicate: {
+ auto Pred = static_cast<CmpInst::Predicate>(getPredicate());
+ OS << (CmpInst::isIntPredicate(Pred) ? "int" : "float") << "pred("
+ << Pred << ')';
+ break;
+ }
+ case MachineOperand::MO_ShuffleMask:
+ OS << "shufflemask(";
+ ArrayRef<int> Mask = getShuffleMask();
+ StringRef Separator;
+ for (int Elt : Mask) {
+ if (Elt == -1)
+ OS << Separator << "undef";
+ else
+ OS << Separator << Elt;
+ Separator = ", ";
+ }
+
+ OS << ')';
+ break;
+ }
+}
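+
+// A few sample renderings produced by the switch above (the physical register
+// name is target-specific and purely illustrative):
+// def renamable $r0 a renamable physical-register def
+// %const.0 + 8 a constant-pool index with a positive offset
+// <regmask ...> a register mask printed without a TargetRegisterInfo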
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineOperand::dump() const { dbgs() << *this << '\n'; }
+#endif
+
+//===----------------------------------------------------------------------===//
+// MachineMemOperand Implementation
+//===----------------------------------------------------------------------===//
+
+/// getAddrSpace - Return the LLVM IR address space number that this pointer
+/// points into.
+unsigned MachinePointerInfo::getAddrSpace() const { return AddrSpace; }
+
+/// isDereferenceable - Return true if V is always dereferenceable for
+/// Offset + Size bytes.
+bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
+ const DataLayout &DL) const {
+ if (!isa<const Value *>(V))
+ return false;
+
+ const Value *BasePtr = cast<const Value *>(V);
+ if (BasePtr == nullptr)
+ return false;
+
+ return isDereferenceableAndAlignedPointer(
+ BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL);
+}
+
+/// getConstantPool - Return a MachinePointerInfo record that refers to the
+/// constant pool.
+MachinePointerInfo MachinePointerInfo::getConstantPool(MachineFunction &MF) {
+ return MachinePointerInfo(MF.getPSVManager().getConstantPool());
+}
+
+/// getFixedStack - Return a MachinePointerInfo record that refers to the
+/// specified FrameIndex.
+MachinePointerInfo MachinePointerInfo::getFixedStack(MachineFunction &MF,
+ int FI, int64_t Offset) {
+ return MachinePointerInfo(MF.getPSVManager().getFixedStack(FI), Offset);
+}
+
+MachinePointerInfo MachinePointerInfo::getJumpTable(MachineFunction &MF) {
+ return MachinePointerInfo(MF.getPSVManager().getJumpTable());
+}
+
+MachinePointerInfo MachinePointerInfo::getGOT(MachineFunction &MF) {
+ return MachinePointerInfo(MF.getPSVManager().getGOT());
+}
+
+MachinePointerInfo MachinePointerInfo::getStack(MachineFunction &MF,
+ int64_t Offset, uint8_t ID) {
+ return MachinePointerInfo(MF.getPSVManager().getStack(), Offset, ID);
+}
+
+MachinePointerInfo MachinePointerInfo::getUnknownStack(MachineFunction &MF) {
+ return MachinePointerInfo(MF.getDataLayout().getAllocaAddrSpace());
+}
+
+MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
+ LLT type, Align a, const AAMDNodes &AAInfo,
+ const MDNode *Ranges, SyncScope::ID SSID,
+ AtomicOrdering Ordering,
+ AtomicOrdering FailureOrdering)
+ : PtrInfo(ptrinfo), MemoryType(type), FlagVals(f), BaseAlign(a),
+ AAInfo(AAInfo), Ranges(Ranges) {
+ assert((PtrInfo.V.isNull() || isa<const PseudoSourceValue *>(PtrInfo.V) ||
+ isa<PointerType>(cast<const Value *>(PtrInfo.V)->getType())) &&
+ "invalid pointer value");
+ assert((isLoad() || isStore()) && "Not a load/store!");
+
+ AtomicInfo.SSID = static_cast<unsigned>(SSID);
+ assert(getSyncScopeID() == SSID && "Value truncated");
+ AtomicInfo.Ordering = static_cast<unsigned>(Ordering);
+ assert(getSuccessOrdering() == Ordering && "Value truncated");
+ AtomicInfo.FailureOrdering = static_cast<unsigned>(FailureOrdering);
+ assert(getFailureOrdering() == FailureOrdering && "Value truncated");
+}
+
+MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
+ uint64_t s, Align a,
+ const AAMDNodes &AAInfo,
+ const MDNode *Ranges, SyncScope::ID SSID,
+ AtomicOrdering Ordering,
+ AtomicOrdering FailureOrdering)
+ : MachineMemOperand(ptrinfo, f,
+ s == ~UINT64_C(0) ? LLT() : LLT::scalar(8 * s), a,
+ AAInfo, Ranges, SSID, Ordering, FailureOrdering) {}
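+
+// The size-based constructor above simply forwards to the LLT-based one: an
+// unknown size (~UINT64_C(0)) becomes an invalid LLT, while e.g. s == 4 maps
+// to LLT::scalar(32), i.e. a 4-byte scalar memory type.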
+
+void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
+ // The Value and Offset may differ due to CSE. But the flags and size
+ // should be the same.
+ assert(MMO->getFlags() == getFlags() && "Flags mismatch!");
+ assert((MMO->getSize() == ~UINT64_C(0) || getSize() == ~UINT64_C(0) ||
+ MMO->getSize() == getSize()) &&
+ "Size mismatch!");
+
+ if (MMO->getBaseAlign() >= getBaseAlign()) {
+ // Update the alignment value.
+ BaseAlign = MMO->getBaseAlign();
+ // Also update the base and offset, because the new alignment may
+ // not be applicable with the old ones.
+ PtrInfo = MMO->PtrInfo;
+ }
+}
+
+/// getAlign - Return the minimum known alignment in bytes of the
+/// actual memory reference.
+Align MachineMemOperand::getAlign() const {
+ return commonAlignment(getBaseAlign(), getOffset());
+}
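+
+// Example: with a base alignment of 16 and an offset of 4,
+// commonAlignment(Align(16), 4) yields Align(4), i.e. the alignment that is
+// actually guaranteed at the referenced address.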
+
+void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
+ SmallVectorImpl<StringRef> &SSNs,
+ const LLVMContext &Context,
+ const MachineFrameInfo *MFI,
+ const TargetInstrInfo *TII) const {
+ OS << '(';
+ if (isVolatile())
+ OS << "volatile ";
+ if (isNonTemporal())
+ OS << "non-temporal ";
+ if (isDereferenceable())
+ OS << "dereferenceable ";
+ if (isInvariant())
+ OS << "invariant ";
+ if (TII) {
+ if (getFlags() & MachineMemOperand::MOTargetFlag1)
+ OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag1)
+ << "\" ";
+ if (getFlags() & MachineMemOperand::MOTargetFlag2)
+ OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag2)
+ << "\" ";
+ if (getFlags() & MachineMemOperand::MOTargetFlag3)
+ OS << '"' << getTargetMMOFlagName(*TII, MachineMemOperand::MOTargetFlag3)
+ << "\" ";
+ } else {
+ if (getFlags() & MachineMemOperand::MOTargetFlag1)
+ OS << "\"MOTargetFlag1\" ";
+ if (getFlags() & MachineMemOperand::MOTargetFlag2)
+ OS << "\"MOTargetFlag2\" ";
+ if (getFlags() & MachineMemOperand::MOTargetFlag3)
+ OS << "\"MOTargetFlag3\" ";
+ }
+
+ assert((isLoad() || isStore()) &&
+ "machine memory operand must be a load or store (or both)");
+ if (isLoad())
+ OS << "load ";
+ if (isStore())
+ OS << "store ";
+
+ printSyncScope(OS, Context, getSyncScopeID(), SSNs);
+
+ if (getSuccessOrdering() != AtomicOrdering::NotAtomic)
+ OS << toIRString(getSuccessOrdering()) << ' ';
+ if (getFailureOrdering() != AtomicOrdering::NotAtomic)
+ OS << toIRString(getFailureOrdering()) << ' ';
+
+ if (getMemoryType().isValid())
+ OS << '(' << getMemoryType() << ')';
+ else
+ OS << "unknown-size";
+
+ if (const Value *Val = getValue()) {
+ OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into ");
+ MIRFormatter::printIRValue(OS, *Val, MST);
+ } else if (const PseudoSourceValue *PVal = getPseudoValue()) {
+ OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into ");
+ assert(PVal && "Expected a pseudo source value");
+ switch (PVal->kind()) {
+ case PseudoSourceValue::Stack:
+ OS << "stack";
+ break;
+ case PseudoSourceValue::GOT:
+ OS << "got";
+ break;
+ case PseudoSourceValue::JumpTable:
+ OS << "jump-table";
+ break;
+ case PseudoSourceValue::ConstantPool:
+ OS << "constant-pool";
+ break;
+ case PseudoSourceValue::FixedStack: {
+ int FrameIndex = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
+ bool IsFixed = true;
+ printFrameIndex(OS, FrameIndex, IsFixed, MFI);
+ break;
+ }
+ case PseudoSourceValue::GlobalValueCallEntry:
+ OS << "call-entry ";
+ cast<GlobalValuePseudoSourceValue>(PVal)->getValue()->printAsOperand(
+ OS, /*PrintType=*/false, MST);
+ break;
+ case PseudoSourceValue::ExternalSymbolCallEntry:
+ OS << "call-entry &";
+ printLLVMNameWithoutPrefix(
+ OS, cast<ExternalSymbolPseudoSourceValue>(PVal)->getSymbol());
+ break;
+ default: {
+ const MIRFormatter *Formatter = TII->getMIRFormatter();
+ // FIXME: This is not necessarily the correct MIR serialization format for
+ // a custom pseudo source value, but at least it allows
+ // MIR printing to work on a target with custom pseudo source
+ // values.
+ OS << "custom \"";
+ Formatter->printCustomPseudoSourceValue(OS, MST, *PVal);
+ OS << '\"';
+ break;
+ }
+ }
+ } else if (getOpaqueValue() == nullptr && getOffset() != 0) {
+ OS << ((isLoad() && isStore()) ? " on "
+ : isLoad() ? " from "
+ : " into ")
+ << "unknown-address";
+ }
+ MachineOperand::printOperandOffset(OS, getOffset());
+ if (getSize() > 0 && getAlign() != getSize())
+ OS << ", align " << getAlign().value();
+ if (getAlign() != getBaseAlign())
+ OS << ", basealign " << getBaseAlign().value();
+ auto AAInfo = getAAInfo();
+ if (AAInfo.TBAA) {
+ OS << ", !tbaa ";
+ AAInfo.TBAA->printAsOperand(OS, MST);
+ }
+ if (AAInfo.Scope) {
+ OS << ", !alias.scope ";
+ AAInfo.Scope->printAsOperand(OS, MST);
+ }
+ if (AAInfo.NoAlias) {
+ OS << ", !noalias ";
+ AAInfo.NoAlias->printAsOperand(OS, MST);
+ }
+ if (getRanges()) {
+ OS << ", !range ";
+ getRanges()->printAsOperand(OS, MST);
+ }
+ // FIXME: Implement addrspace printing/parsing in MIR.
+ // For now, print this even though parsing it is not available in MIR.
+ if (unsigned AS = getAddrSpace())
+ OS << ", addrspace " << AS;
+
+ OS << ')';
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
new file mode 100644
index 000000000000..1c31eba909e7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
@@ -0,0 +1,97 @@
+///===- MachineOptimizationRemarkEmitter.cpp - Opt Diagnostic -*- C++ -*---===//
+///
+/// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+/// See https://llvm.org/LICENSE.txt for license information.
+/// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+///
+///===---------------------------------------------------------------------===//
+/// \file
+/// Optimization diagnostic interfaces for machine passes. It's packaged as an
+/// analysis pass so that passes using this service also become dependent on
+/// MBFI. MBFI is used to compute the "hotness" of the diagnostic message.
+///
+///===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/InitializePasses.h"
+#include <optional>
+
+using namespace llvm;
+
+DiagnosticInfoMIROptimization::MachineArgument::MachineArgument(
+ StringRef MKey, const MachineInstr &MI) {
+ Key = std::string(MKey);
+
+ raw_string_ostream OS(Val);
+ MI.print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
+ /*SkipDebugLoc=*/true);
+}
+
+std::optional<uint64_t>
+MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) {
+ if (!MBFI)
+ return std::nullopt;
+
+ return MBFI->getBlockProfileCount(&MBB);
+}
+
+void MachineOptimizationRemarkEmitter::computeHotness(
+ DiagnosticInfoMIROptimization &Remark) {
+ const MachineBasicBlock *MBB = Remark.getBlock();
+ if (MBB)
+ Remark.setHotness(computeHotness(*MBB));
+}
+
+void MachineOptimizationRemarkEmitter::emit(
+ DiagnosticInfoOptimizationBase &OptDiagCommon) {
+ auto &OptDiag = cast<DiagnosticInfoMIROptimization>(OptDiagCommon);
+ computeHotness(OptDiag);
+
+ LLVMContext &Ctx = MF.getFunction().getContext();
+
+ // Only emit it if its hotness meets the threshold.
+ if (OptDiag.getHotness().value_or(0) < Ctx.getDiagnosticsHotnessThreshold())
+ return;
+
+ Ctx.diagnose(OptDiag);
+}
+
+MachineOptimizationRemarkEmitterPass::MachineOptimizationRemarkEmitterPass()
+ : MachineFunctionPass(ID) {
+ initializeMachineOptimizationRemarkEmitterPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ MachineBlockFrequencyInfo *MBFI;
+
+ if (MF.getFunction().getContext().getDiagnosticsHotnessRequested())
+ MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
+ else
+ MBFI = nullptr;
+
+ ORE = std::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
+ return false;
+}
+
+void MachineOptimizationRemarkEmitterPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+char MachineOptimizationRemarkEmitterPass::ID = 0;
+static const char ore_name[] = "Machine Optimization Remark Emitter";
+#define ORE_NAME "machine-opt-remark-emitter"
+
+INITIALIZE_PASS_BEGIN(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
+ true, true)
+INITIALIZE_PASS_DEPENDENCY(LazyMachineBlockFrequencyInfoPass)
+INITIALIZE_PASS_END(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
+ true, true)
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp
new file mode 100644
index 000000000000..a0769105c929
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -0,0 +1,1213 @@
+//===---- MachineOutliner.cpp - Outline instructions -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Replaces repeated sequences of instructions with function calls.
+///
+/// This works by placing every instruction from every basic block in a
+/// suffix tree, and repeatedly querying that tree for repeated sequences of
+/// instructions. If a sequence of instructions appears often, then it ought
+/// to be beneficial to pull it out into a function.
+///
+/// The MachineOutliner communicates with a given target using hooks defined in
+/// TargetInstrInfo.h. The target supplies the outliner with information on how
+/// a specific sequence of instructions should be outlined. This information
+/// is used to deduce the number of instructions necessary to
+///
+/// * Create an outlined function
+/// * Call that outlined function
+///
+/// Targets must implement
+/// * getOutliningCandidateInfo
+/// * buildOutlinedFrame
+/// * insertOutlinedCall
+/// * isFunctionSafeToOutlineFrom
+///
+/// in order to make use of the MachineOutliner.
+///
+/// This was originally presented at the 2016 LLVM Developers' Meeting in the
+/// talk "Reducing Code Size Using Outlining". For a high-level overview of
+/// how this pass works, the talk is available on YouTube at
+///
+/// https://www.youtube.com/watch?v=yorld-WSOeU
+///
+/// The slides for the talk are available at
+///
+/// http://www.llvm.org/devmtg/2016-11/Slides/Paquette-Outliner.pdf
+///
+/// The talk provides an overview of how the outliner finds candidates and
+/// ultimately outlines them. It describes how the main data structure for this
+/// pass, the suffix tree, is queried and purged for candidates. It also gives
+/// a simplified suffix tree construction algorithm for suffix trees based off
+/// of the algorithm actually used here, Ukkonen's algorithm.
+///
+/// For the original RFC for this pass, please see
+///
+/// http://lists.llvm.org/pipermail/llvm-dev/2016-August/104170.html
+///
+/// For more information on the suffix tree data structure, please see
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineOutliner.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SuffixTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include <functional>
+#include <tuple>
+#include <vector>
+
+#define DEBUG_TYPE "machine-outliner"
+
+using namespace llvm;
+using namespace ore;
+using namespace outliner;
+
+// Statistics for outlined functions.
+STATISTIC(NumOutlined, "Number of candidates outlined");
+STATISTIC(FunctionsCreated, "Number of functions created");
+
+// Statistics for instruction mapping.
+STATISTIC(NumLegalInUnsignedVec, "Outlinable instructions mapped");
+STATISTIC(NumIllegalInUnsignedVec,
+ "Unoutlinable instructions mapped + number of sentinel values");
+STATISTIC(NumSentinels, "Sentinel values inserted during mapping");
+STATISTIC(NumInvisible,
+ "Invisible instructions skipped during mapping");
+STATISTIC(UnsignedVecSize,
+ "Total number of instructions mapped and saved to mapping vector");
+
+// Set to true if the user wants the outliner to run on linkonceodr linkage
+// functions. This is false by default because the linker can dedupe
+// linkonceodr functions across modules, while the outliner is confined to a
+// single module (modulo LTO). It should, however, be the default behaviour
+// in LTO.
+static cl::opt<bool> EnableLinkOnceODROutlining(
+ "enable-linkonceodr-outlining", cl::Hidden,
+ cl::desc("Enable the machine outliner on linkonceodr functions"),
+ cl::init(false));
+
+/// Number of times to re-run the outliner after the initial run, so the total
+/// number of runs is 1 + OutlinerReruns. The default of 0 means the outliner
+/// runs once and is never rerun.
+static cl::opt<unsigned> OutlinerReruns(
+ "machine-outliner-reruns", cl::init(0), cl::Hidden,
+ cl::desc(
+ "Number of times to rerun the outliner after the initial outline"));
+
+static cl::opt<unsigned> OutlinerBenefitThreshold(
+ "outliner-benefit-threshold", cl::init(1), cl::Hidden,
+ cl::desc(
+ "The minimum size in bytes before an outlining candidate is accepted"));
+
+namespace {
+
+/// Maps \p MachineInstrs to unsigned integers and stores the mappings.
+struct InstructionMapper {
+
+ /// The next available integer to assign to a \p MachineInstr that
+ /// cannot be outlined.
+ ///
+ /// Set to -3 for compatibility with \p DenseMapInfo<unsigned>.
+ unsigned IllegalInstrNumber = -3;
+
+ /// The next available integer to assign to a \p MachineInstr that can
+ /// be outlined.
+ unsigned LegalInstrNumber = 0;
+
+ /// Correspondence from \p MachineInstrs to unsigned integers.
+ DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>
+ InstructionIntegerMap;
+
+ /// Correspondence between \p MachineBasicBlocks and target-defined flags.
+ DenseMap<MachineBasicBlock *, unsigned> MBBFlagsMap;
+
+ /// The vector of unsigned integers that the module is mapped to.
+ SmallVector<unsigned> UnsignedVec;
+
+ /// Stores the location of the instruction associated with the integer
+ /// at index i in \p UnsignedVec for each index i.
+ SmallVector<MachineBasicBlock::iterator> InstrList;
+
+ /// Set if we added an illegal number in the previous step.
+ /// Since each illegal number is unique, we only need one of them between
+ /// each range of legal numbers. This lets us make sure we don't add more
+ /// than one illegal number per range.
+ bool AddedIllegalLastTime = false;
+
+ /// Maps \p *It to a legal integer.
+ ///
+ /// Updates \p CanOutlineWithPrevInstr, \p HaveLegalRange, \p InstrListForMBB,
+ /// \p UnsignedVecForMBB, \p InstructionIntegerMap, and \p LegalInstrNumber.
+ ///
+ /// \returns The integer that \p *It was mapped to.
+ unsigned mapToLegalUnsigned(
+ MachineBasicBlock::iterator &It, bool &CanOutlineWithPrevInstr,
+ bool &HaveLegalRange, unsigned &NumLegalInBlock,
+ SmallVector<unsigned> &UnsignedVecForMBB,
+ SmallVector<MachineBasicBlock::iterator> &InstrListForMBB) {
+ // We added something legal, so we should unset the AddedIllegalLastTime
+ // flag.
+ AddedIllegalLastTime = false;
+
+ // If we have at least two adjacent legal instructions (which may have
+ // invisible instructions in between), remember that.
+ if (CanOutlineWithPrevInstr)
+ HaveLegalRange = true;
+ CanOutlineWithPrevInstr = true;
+
+ // Keep track of the number of legal instructions we insert.
+ NumLegalInBlock++;
+
+ // Get the integer for this instruction or give it the current
+ // LegalInstrNumber.
+ InstrListForMBB.push_back(It);
+ MachineInstr &MI = *It;
+ bool WasInserted;
+ DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator
+ ResultIt;
+ std::tie(ResultIt, WasInserted) =
+ InstructionIntegerMap.insert(std::make_pair(&MI, LegalInstrNumber));
+ unsigned MINumber = ResultIt->second;
+
+ // There was an insertion.
+ if (WasInserted)
+ LegalInstrNumber++;
+
+ UnsignedVecForMBB.push_back(MINumber);
+
+ // Make sure we don't overflow or use any integers reserved by the DenseMap.
+ if (LegalInstrNumber >= IllegalInstrNumber)
+ report_fatal_error("Instruction mapping overflow!");
+
+ assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
+ "Tried to assign DenseMap tombstone or empty key to instruction.");
+ assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
+ "Tried to assign DenseMap tombstone or empty key to instruction.");
+
+ // Statistics.
+ ++NumLegalInUnsignedVec;
+ return MINumber;
+ }
+
+ /// Maps \p *It to an illegal integer.
+ ///
+ /// Updates \p InstrListForMBB, \p UnsignedVecForMBB, and \p
+ /// IllegalInstrNumber.
+ ///
+ /// \returns The integer that \p *It was mapped to.
+ unsigned mapToIllegalUnsigned(
+ MachineBasicBlock::iterator &It, bool &CanOutlineWithPrevInstr,
+ SmallVector<unsigned> &UnsignedVecForMBB,
+ SmallVector<MachineBasicBlock::iterator> &InstrListForMBB) {
+ // Can't outline an illegal instruction. Set the flag.
+ CanOutlineWithPrevInstr = false;
+
+ // Only add one illegal number per range of legal numbers.
+ if (AddedIllegalLastTime)
+ return IllegalInstrNumber;
+
+ // Remember that we added an illegal number last time.
+ AddedIllegalLastTime = true;
+ unsigned MINumber = IllegalInstrNumber;
+
+ InstrListForMBB.push_back(It);
+ UnsignedVecForMBB.push_back(IllegalInstrNumber);
+ IllegalInstrNumber--;
+ // Statistics.
+ ++NumIllegalInUnsignedVec;
+
+ assert(LegalInstrNumber < IllegalInstrNumber &&
+ "Instruction mapping overflow!");
+
+ assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey() &&
+ "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+ assert(IllegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey() &&
+ "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+ return MINumber;
+ }
+
+ /// Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds
+ /// and appends it to \p UnsignedVec and \p InstrList.
+ ///
+ /// Two instructions are assigned the same integer if they are identical.
+ /// If an instruction is deemed unsafe to outline, then it will be assigned a
+ /// unique integer. The resulting mapping is placed into a suffix tree and
+ /// queried for candidates.
+ ///
+ /// \param MBB The \p MachineBasicBlock to be translated into integers.
+ /// \param TII \p TargetInstrInfo for the function.
+ void convertToUnsignedVec(MachineBasicBlock &MBB,
+ const TargetInstrInfo &TII) {
+ LLVM_DEBUG(dbgs() << "*** Converting MBB '" << MBB.getName()
+ << "' to unsigned vector ***\n");
+ unsigned Flags = 0;
+
+ // Don't even map in this case.
+ if (!TII.isMBBSafeToOutlineFrom(MBB, Flags))
+ return;
+
+ auto OutlinableRanges = TII.getOutlinableRanges(MBB, Flags);
+ LLVM_DEBUG(dbgs() << MBB.getName() << ": " << OutlinableRanges.size()
+ << " outlinable range(s)\n");
+ if (OutlinableRanges.empty())
+ return;
+
+ // Store info for the MBB for later outlining.
+ MBBFlagsMap[&MBB] = Flags;
+
+ MachineBasicBlock::iterator It = MBB.begin();
+
+ // The number of instructions in this block that will be considered for
+ // outlining.
+ unsigned NumLegalInBlock = 0;
+
+ // True if we have at least two legal instructions which aren't separated
+ // by an illegal instruction.
+ bool HaveLegalRange = false;
+
+ // True if we can perform outlining given the last mapped (non-invisible)
+ // instruction. This lets us know if we have a legal range.
+ bool CanOutlineWithPrevInstr = false;
+
+ // FIXME: Should this all just be handled in the target, rather than using
+ // repeated calls to getOutliningType?
+ SmallVector<unsigned> UnsignedVecForMBB;
+ SmallVector<MachineBasicBlock::iterator> InstrListForMBB;
+
+ LLVM_DEBUG(dbgs() << "*** Mapping outlinable ranges ***\n");
+ for (auto &OutlinableRange : OutlinableRanges) {
+ auto OutlinableRangeBegin = OutlinableRange.first;
+ auto OutlinableRangeEnd = OutlinableRange.second;
+#ifndef NDEBUG
+ LLVM_DEBUG(
+ dbgs() << "Mapping "
+ << std::distance(OutlinableRangeBegin, OutlinableRangeEnd)
+ << " instruction range\n");
+ // Everything outside of an outlinable range is illegal.
+ unsigned NumSkippedInRange = 0;
+#endif
+ for (; It != OutlinableRangeBegin; ++It) {
+#ifndef NDEBUG
+ ++NumSkippedInRange;
+#endif
+ mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
+ InstrListForMBB);
+ }
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "Skipped " << NumSkippedInRange
+ << " instructions outside outlinable range\n");
+#endif
+ assert(It != MBB.end() && "Should still have instructions?");
+ // `It` is now positioned at the beginning of a range of instructions
+ // which may be outlinable. Check if each instruction is known to be safe.
+ for (; It != OutlinableRangeEnd; ++It) {
+ // Keep track of where this instruction is in the module.
+ switch (TII.getOutliningType(It, Flags)) {
+ case InstrType::Illegal:
+ mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
+ InstrListForMBB);
+ break;
+
+ case InstrType::Legal:
+ mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange,
+ NumLegalInBlock, UnsignedVecForMBB,
+ InstrListForMBB);
+ break;
+
+ case InstrType::LegalTerminator:
+ mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange,
+ NumLegalInBlock, UnsignedVecForMBB,
+ InstrListForMBB);
+ // The instruction also acts as a terminator, so we have to record
+ // that in the string.
+ mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
+ InstrListForMBB);
+ break;
+
+ case InstrType::Invisible:
+ // Normally this is set by mapTo(Blah)Unsigned, but we just want to
+ // skip this instruction. So, unset the flag here.
+ ++NumInvisible;
+ AddedIllegalLastTime = false;
+ break;
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "HaveLegalRange = " << HaveLegalRange << "\n");
+
+ // Are there enough legal instructions in the block for outlining to be
+ // possible?
+ if (HaveLegalRange) {
+ // After we're done every insertion, uniquely terminate this part of the
+ // "string". This makes sure we won't match across basic block or function
+ // boundaries since the "end" is encoded uniquely and thus appears in no
+ // repeated substring.
+ mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
+ InstrListForMBB);
+ ++NumSentinels;
+ append_range(InstrList, InstrListForMBB);
+ append_range(UnsignedVec, UnsignedVecForMBB);
+ }
+ }
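+
+ // Worked example (illustrative): an outlinable range containing the
+ // instruction sequence A B A C maps to the integers [0, 1, 0, 2]; the block
+ // is then terminated with a unique illegal value (starting from (unsigned)-3
+ // and decreasing), so no repeated substring can span two blocks.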
+
+ InstructionMapper() {
+ // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
+ // changed.
+ assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 &&
+ "DenseMapInfo<unsigned>'s empty key isn't -1!");
+ assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 &&
+ "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
+ }
+};
+
+/// An interprocedural pass which finds repeated sequences of
+/// instructions and replaces them with calls to functions.
+///
+/// Each instruction is mapped to an unsigned integer and placed in a string.
+/// The resulting mapping is then placed in a \p SuffixTree. The \p SuffixTree
+/// is then repeatedly queried for repeated sequences of instructions. Each
+/// non-overlapping repeated sequence is then placed in its own
+/// \p MachineFunction and each instance is then replaced with a call to that
+/// function.
+struct MachineOutliner : public ModulePass {
+
+ static char ID;
+
+ /// Set to true if the outliner should consider functions with
+ /// linkonceodr linkage.
+ bool OutlineFromLinkOnceODRs = false;
+
+ /// The current repeat number of machine outlining.
+ unsigned OutlineRepeatedNum = 0;
+
+ /// Set to true if the outliner should run on all functions in the module
+ /// considered safe for outlining.
+ /// Set to true by default for compatibility with llc's -run-pass option.
+ /// Set when the pass is constructed in TargetPassConfig.
+ bool RunOnAllFunctions = true;
+
+ StringRef getPassName() const override { return "Machine Outliner"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ AU.setPreservesAll();
+ ModulePass::getAnalysisUsage(AU);
+ }
+
+ MachineOutliner() : ModulePass(ID) {
+ initializeMachineOutlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Remark output explaining that not outlining a set of candidates would be
+ /// better than outlining that set.
+ void emitNotOutliningCheaperRemark(
+ unsigned StringLen, std::vector<Candidate> &CandidatesForRepeatedSeq,
+ OutlinedFunction &OF);
+
+ /// Remark output explaining that a function was outlined.
+ void emitOutlinedFunctionRemark(OutlinedFunction &OF);
+
+ /// Find all repeated substrings that satisfy the outlining cost model by
+ /// constructing a suffix tree.
+ ///
+ /// If a substring appears at least twice, then it must be represented by
+ /// an internal node which appears in at least two suffixes. Each suffix
+ /// is represented by a leaf node. To do this, we visit each internal node
+ /// in the tree, using the leaf children of each internal node. If an
+ /// internal node represents a beneficial substring, then we use each of
+ /// its leaf children to find the locations of its substring.
+ ///
+ /// \param Mapper Contains outlining mapping information.
+ /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions
+ /// for each type of candidate.
+ void findCandidates(InstructionMapper &Mapper,
+ std::vector<OutlinedFunction> &FunctionList);
+
+ /// Replace the sequences of instructions represented by \p OutlinedFunctions
+ /// with calls to functions.
+ ///
+ /// \param M The module we are outlining from.
+ /// \param FunctionList A list of functions to be inserted into the module.
+ /// \param Mapper Contains the instruction mappings for the module.
+ bool outline(Module &M, std::vector<OutlinedFunction> &FunctionList,
+ InstructionMapper &Mapper, unsigned &OutlinedFunctionNum);
+
+ /// Creates a function for \p OF and inserts it into the module.
+ MachineFunction *createOutlinedFunction(Module &M, OutlinedFunction &OF,
+ InstructionMapper &Mapper,
+ unsigned Name);
+
+ /// Calls 'doOutline()' 1 + OutlinerReruns times.
+ bool runOnModule(Module &M) override;
+
+ /// Construct a suffix tree on the instructions in \p M and outline repeated
+ /// strings from that tree.
+ bool doOutline(Module &M, unsigned &OutlinedFunctionNum);
+
+ /// Return a DISubprogram for OF if one exists, and null otherwise. Helper
+ /// function for remark emission.
+ DISubprogram *getSubprogramOrNull(const OutlinedFunction &OF) {
+ for (const Candidate &C : OF.Candidates)
+ if (MachineFunction *MF = C.getMF())
+ if (DISubprogram *SP = MF->getFunction().getSubprogram())
+ return SP;
+ return nullptr;
+ }
+
+  /// Populate an \p InstructionMapper with instruction-to-integer mappings.
+ /// These are used to construct a suffix tree.
+ void populateMapper(InstructionMapper &Mapper, Module &M,
+ MachineModuleInfo &MMI);
+
+ /// Initialize information necessary to output a size remark.
+ /// FIXME: This should be handled by the pass manager, not the outliner.
+ /// FIXME: This is nearly identical to the initSizeRemarkInfo in the legacy
+ /// pass manager.
+ void initSizeRemarkInfo(const Module &M, const MachineModuleInfo &MMI,
+ StringMap<unsigned> &FunctionToInstrCount);
+
+ /// Emit the remark.
+ // FIXME: This should be handled by the pass manager, not the outliner.
+ void
+ emitInstrCountChangedRemark(const Module &M, const MachineModuleInfo &MMI,
+ const StringMap<unsigned> &FunctionToInstrCount);
+};
+} // Anonymous namespace.
+
+char MachineOutliner::ID = 0;
+
+namespace llvm {
+ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions) {
+ MachineOutliner *OL = new MachineOutliner();
+ OL->RunOnAllFunctions = RunOnAllFunctions;
+ return OL;
+}
+
+} // namespace llvm
+
+INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false,
+ false)
+
+void MachineOutliner::emitNotOutliningCheaperRemark(
+ unsigned StringLen, std::vector<Candidate> &CandidatesForRepeatedSeq,
+ OutlinedFunction &OF) {
+ // FIXME: Right now, we arbitrarily choose some Candidate from the
+ // OutlinedFunction. This isn't necessarily fixed, nor does it have to be.
+ // We should probably sort these by function name or something to make sure
+ // the remarks are stable.
+ Candidate &C = CandidatesForRepeatedSeq.front();
+ MachineOptimizationRemarkEmitter MORE(*(C.getMF()), nullptr);
+ MORE.emit([&]() {
+ MachineOptimizationRemarkMissed R(DEBUG_TYPE, "NotOutliningCheaper",
+ C.front()->getDebugLoc(), C.getMBB());
+ R << "Did not outline " << NV("Length", StringLen) << " instructions"
+ << " from " << NV("NumOccurrences", CandidatesForRepeatedSeq.size())
+ << " locations."
+ << " Bytes from outlining all occurrences ("
+ << NV("OutliningCost", OF.getOutliningCost()) << ")"
+ << " >= Unoutlined instruction bytes ("
+ << NV("NotOutliningCost", OF.getNotOutlinedCost()) << ")"
+ << " (Also found at: ";
+
+ // Tell the user the other places the candidate was found.
+ for (unsigned i = 1, e = CandidatesForRepeatedSeq.size(); i < e; i++) {
+ R << NV((Twine("OtherStartLoc") + Twine(i)).str(),
+ CandidatesForRepeatedSeq[i].front()->getDebugLoc());
+ if (i != e - 1)
+ R << ", ";
+ }
+
+ R << ")";
+ return R;
+ });
+}
+
+void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
+ MachineBasicBlock *MBB = &*OF.MF->begin();
+ MachineOptimizationRemarkEmitter MORE(*OF.MF, nullptr);
+ MachineOptimizationRemark R(DEBUG_TYPE, "OutlinedFunction",
+ MBB->findDebugLoc(MBB->begin()), MBB);
+ R << "Saved " << NV("OutliningBenefit", OF.getBenefit()) << " bytes by "
+ << "outlining " << NV("Length", OF.getNumInstrs()) << " instructions "
+ << "from " << NV("NumOccurrences", OF.getOccurrenceCount())
+ << " locations. "
+ << "(Found at: ";
+
+ // Tell the user the other places the candidate was found.
+ for (size_t i = 0, e = OF.Candidates.size(); i < e; i++) {
+
+ R << NV((Twine("StartLoc") + Twine(i)).str(),
+ OF.Candidates[i].front()->getDebugLoc());
+ if (i != e - 1)
+ R << ", ";
+ }
+
+ R << ")";
+
+ MORE.emit(R);
+}
+
+void MachineOutliner::findCandidates(
+ InstructionMapper &Mapper, std::vector<OutlinedFunction> &FunctionList) {
+ FunctionList.clear();
+ SuffixTree ST(Mapper.UnsignedVec);
+
+ // First, find all of the repeated substrings in the tree of minimum length
+ // 2.
+ std::vector<Candidate> CandidatesForRepeatedSeq;
+ LLVM_DEBUG(dbgs() << "*** Discarding overlapping candidates *** \n");
+ LLVM_DEBUG(
+ dbgs() << "Searching for overlaps in all repeated sequences...\n");
+ for (const SuffixTree::RepeatedSubstring &RS : ST) {
+ CandidatesForRepeatedSeq.clear();
+ unsigned StringLen = RS.Length;
+ LLVM_DEBUG(dbgs() << " Sequence length: " << StringLen << "\n");
+ // Debug code to keep track of how many candidates we removed.
+#ifndef NDEBUG
+ unsigned NumDiscarded = 0;
+ unsigned NumKept = 0;
+#endif
+ for (const unsigned &StartIdx : RS.StartIndices) {
+ // Trick: Discard some candidates that would be incompatible with the
+ // ones we've already found for this sequence. This will save us some
+ // work in candidate selection.
+ //
+ // If two candidates overlap, then we can't outline them both. This
+ // happens when we have candidates that look like, say
+ //
+ // AA (where each "A" is an instruction).
+ //
+ // We might have some portion of the module that looks like this:
+ // AAAAAA (6 A's)
+ //
+ // In this case, there are 5 different copies of "AA" in this range, but
+ // at most 3 can be outlined. If only outlining 3 of these is going to
+ // be unbeneficial, then we ought to not bother.
+ //
+ // Note that two things DON'T overlap when they look like this:
+ // start1...end1 .... start2...end2
+ // That is, one must either
+ // * End before the other starts
+ // * Start after the other ends
+ unsigned EndIdx = StartIdx + StringLen - 1;
+ auto FirstOverlap = find_if(
+ CandidatesForRepeatedSeq, [StartIdx, EndIdx](const Candidate &C) {
+ return EndIdx >= C.getStartIdx() && StartIdx <= C.getEndIdx();
+ });
+ if (FirstOverlap != CandidatesForRepeatedSeq.end()) {
+#ifndef NDEBUG
+ ++NumDiscarded;
+ LLVM_DEBUG(dbgs() << " .. DISCARD candidate @ [" << StartIdx
+ << ", " << EndIdx << "]; overlaps with candidate @ ["
+ << FirstOverlap->getStartIdx() << ", "
+ << FirstOverlap->getEndIdx() << "]\n");
+#endif
+ continue;
+ }
+ // It doesn't overlap with anything, so we can outline it.
+ // Each sequence is over [StartIt, EndIt].
+ // Save the candidate and its location.
+#ifndef NDEBUG
+ ++NumKept;
+#endif
+ MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
+ MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
+ MachineBasicBlock *MBB = StartIt->getParent();
+ CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt, EndIt,
+ MBB, FunctionList.size(),
+ Mapper.MBBFlagsMap[MBB]);
+ }
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << " Candidates discarded: " << NumDiscarded
+ << "\n");
+ LLVM_DEBUG(dbgs() << " Candidates kept: " << NumKept << "\n\n");
+#endif
+
+ // We've found something we might want to outline.
+ // Create an OutlinedFunction to store it and check if it'd be beneficial
+ // to outline.
+ if (CandidatesForRepeatedSeq.size() < 2)
+ continue;
+
+ // Arbitrarily choose a TII from the first candidate.
+ // FIXME: Should getOutliningCandidateInfo move to TargetMachine?
+ const TargetInstrInfo *TII =
+ CandidatesForRepeatedSeq[0].getMF()->getSubtarget().getInstrInfo();
+
+ std::optional<OutlinedFunction> OF =
+ TII->getOutliningCandidateInfo(CandidatesForRepeatedSeq);
+
+ // If we deleted too many candidates, then there's nothing worth outlining.
+ // FIXME: This should take target-specified instruction sizes into account.
+ if (!OF || OF->Candidates.size() < 2)
+ continue;
+
+ // Is it better to outline this candidate than not?
+ if (OF->getBenefit() < OutlinerBenefitThreshold) {
+ emitNotOutliningCheaperRemark(StringLen, CandidatesForRepeatedSeq, *OF);
+ continue;
+ }
+
+ FunctionList.push_back(*OF);
+ }
+}
+
+MachineFunction *MachineOutliner::createOutlinedFunction(
+ Module &M, OutlinedFunction &OF, InstructionMapper &Mapper, unsigned Name) {
+
+ // Create the function name. This should be unique.
+ // FIXME: We should have a better naming scheme. This should be stable,
+ // regardless of changes to the outliner's cost model/traversal order.
+ std::string FunctionName = "OUTLINED_FUNCTION_";
+ if (OutlineRepeatedNum > 0)
+ FunctionName += std::to_string(OutlineRepeatedNum + 1) + "_";
+ FunctionName += std::to_string(Name);
+ LLVM_DEBUG(dbgs() << "NEW FUNCTION: " << FunctionName << "\n");
+
+ // Create the function using an IR-level function.
+ LLVMContext &C = M.getContext();
+ Function *F = Function::Create(FunctionType::get(Type::getVoidTy(C), false),
+ Function::ExternalLinkage, FunctionName, M);
+
+ // NOTE: If this is linkonceodr, then we can take advantage of linker deduping
+ // which gives us better results when we outline from linkonceodr functions.
+ F->setLinkage(GlobalValue::InternalLinkage);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ // Set optsize/minsize, so we don't insert padding between outlined
+ // functions.
+ F->addFnAttr(Attribute::OptimizeForSize);
+ F->addFnAttr(Attribute::MinSize);
+
+ Candidate &FirstCand = OF.Candidates.front();
+ const TargetInstrInfo &TII =
+ *FirstCand.getMF()->getSubtarget().getInstrInfo();
+
+ TII.mergeOutliningCandidateAttributes(*F, OF.Candidates);
+
+ // Set uwtable, so we generate eh_frame.
+ UWTableKind UW = std::accumulate(
+ OF.Candidates.cbegin(), OF.Candidates.cend(), UWTableKind::None,
+ [](UWTableKind K, const outliner::Candidate &C) {
+ return std::max(K, C.getMF()->getFunction().getUWTableKind());
+ });
+ if (UW != UWTableKind::None)
+ F->setUWTableKind(UW);
+
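+  // The IR function only needs a placeholder body (a single 'ret void'); the
+  // actual outlined instructions are cloned directly into the MachineFunction
+  // created below.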
+ BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+ IRBuilder<> Builder(EntryBB);
+ Builder.CreateRetVoid();
+
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+ MF.setIsOutlined(true);
+ MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock();
+
+  // Insert the new basic block into the outlined function.
+ MF.insert(MF.begin(), &MBB);
+
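+  // CFI instructions in the candidate refer to indices into the original
+  // function's frame-instruction table, so fetch that table to remap the
+  // indices while cloning.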
+ MachineFunction *OriginalMF = FirstCand.front()->getMF();
+ const std::vector<MCCFIInstruction> &Instrs =
+ OriginalMF->getFrameInstructions();
+ for (auto I = FirstCand.front(), E = std::next(FirstCand.back()); I != E;
+ ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ // Don't keep debug information for outlined instructions.
+ auto DL = DebugLoc();
+ if (I->isCFIInstruction()) {
+ unsigned CFIIndex = I->getOperand(0).getCFIIndex();
+ MCCFIInstruction CFI = Instrs[CFIIndex];
+ BuildMI(MBB, MBB.end(), DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(MF.addFrameInst(CFI));
+ } else {
+ MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
+ NewMI->dropMemRefs(MF);
+ NewMI->setDebugLoc(DL);
+ MBB.insert(MBB.end(), NewMI);
+ }
+ }
+
+ // Set normal properties for a late MachineFunction.
+ MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA);
+ MF.getProperties().set(MachineFunctionProperties::Property::NoPHIs);
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+ MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
+ MF.getRegInfo().freezeReservedRegs(MF);
+
+  // Compute the live-in set for the outlined function.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ LivePhysRegs LiveIns(TRI);
+ for (auto &Cand : OF.Candidates) {
+ // Figure out live-ins at the first instruction.
+ MachineBasicBlock &OutlineBB = *Cand.front()->getParent();
+ LivePhysRegs CandLiveIns(TRI);
+ CandLiveIns.addLiveOuts(OutlineBB);
+ for (const MachineInstr &MI :
+ reverse(make_range(Cand.front(), OutlineBB.end())))
+ CandLiveIns.stepBackward(MI);
+
+ // The live-in set for the outlined function is the union of the live-ins
+ // from all the outlining points.
+ for (MCPhysReg Reg : CandLiveIns)
+ LiveIns.addReg(Reg);
+ }
+ addLiveIns(MBB, LiveIns);
+
+ TII.buildOutlinedFrame(MBB, MF, OF);
+
+ // If there's a DISubprogram associated with this outlined function, then
+ // emit debug info for the outlined function.
+ if (DISubprogram *SP = getSubprogramOrNull(OF)) {
+ // We have a DISubprogram. Get its DICompileUnit.
+ DICompileUnit *CU = SP->getUnit();
+ DIBuilder DB(M, true, CU);
+ DIFile *Unit = SP->getFile();
+ Mangler Mg;
+ // Get the mangled name of the function for the linkage name.
+ std::string Dummy;
+ raw_string_ostream MangledNameStream(Dummy);
+ Mg.getNameWithPrefix(MangledNameStream, F, false);
+
+ DISubprogram *OutlinedSP = DB.createFunction(
+ Unit /* Context */, F->getName(), StringRef(MangledNameStream.str()),
+ Unit /* File */,
+ 0 /* Line 0 is reserved for compiler-generated code. */,
+ DB.createSubroutineType(
+ DB.getOrCreateTypeArray(std::nullopt)), /* void type */
+ 0, /* Line 0 is reserved for compiler-generated code. */
+ DINode::DIFlags::FlagArtificial /* Compiler-generated code. */,
+ /* Outlined code is optimized code by definition. */
+ DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized);
+
+ // Don't add any new variables to the subprogram.
+ DB.finalizeSubprogram(OutlinedSP);
+
+ // Attach subprogram to the function.
+ F->setSubprogram(OutlinedSP);
+ // We're done with the DIBuilder.
+ DB.finalize();
+ }
+
+ return &MF;
+}
+
+bool MachineOutliner::outline(Module &M,
+ std::vector<OutlinedFunction> &FunctionList,
+ InstructionMapper &Mapper,
+ unsigned &OutlinedFunctionNum) {
+ LLVM_DEBUG(dbgs() << "*** Outlining ***\n");
+ LLVM_DEBUG(dbgs() << "NUMBER OF POTENTIAL FUNCTIONS: " << FunctionList.size()
+ << "\n");
+ bool OutlinedSomething = false;
+
+ // Sort by benefit. The most beneficial functions should be outlined first.
+ stable_sort(FunctionList,
+ [](const OutlinedFunction &LHS, const OutlinedFunction &RHS) {
+ return LHS.getBenefit() > RHS.getBenefit();
+ });
+
+ // Walk over each function, outlining them as we go along. Functions are
+ // outlined greedily, based off the sort above.
+ auto *UnsignedVecBegin = Mapper.UnsignedVec.begin();
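+  // Entries in the instruction mapping are set to -1 once their instructions
+  // have been outlined, so candidates that touch such entries can be pruned.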
+ LLVM_DEBUG(dbgs() << "WALKING FUNCTION LIST\n");
+ for (OutlinedFunction &OF : FunctionList) {
+#ifndef NDEBUG
+ auto NumCandidatesBefore = OF.Candidates.size();
+#endif
+ // If we outlined something that overlapped with a candidate in a previous
+ // step, then we can't outline from it.
+ erase_if(OF.Candidates, [&UnsignedVecBegin](Candidate &C) {
+ return std::any_of(UnsignedVecBegin + C.getStartIdx(),
+ UnsignedVecBegin + C.getEndIdx() + 1, [](unsigned I) {
+ return I == static_cast<unsigned>(-1);
+ });
+ });
+
+#ifndef NDEBUG
+ auto NumCandidatesAfter = OF.Candidates.size();
+ LLVM_DEBUG(dbgs() << "PRUNED: " << NumCandidatesBefore - NumCandidatesAfter
+ << "/" << NumCandidatesBefore << " candidates\n");
+#endif
+
+ // If we made it unbeneficial to outline this function, skip it.
+ if (OF.getBenefit() < OutlinerBenefitThreshold) {
+ LLVM_DEBUG(dbgs() << "SKIP: Expected benefit (" << OF.getBenefit()
+ << " B) < threshold (" << OutlinerBenefitThreshold
+ << " B)\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "OUTLINE: Expected benefit (" << OF.getBenefit()
+ << " B) > threshold (" << OutlinerBenefitThreshold
+ << " B)\n");
+
+ // It's beneficial. Create the function and outline its sequence's
+ // occurrences.
+ OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum);
+ emitOutlinedFunctionRemark(OF);
+ FunctionsCreated++;
+ OutlinedFunctionNum++; // Created a function, move to the next name.
+ MachineFunction *MF = OF.MF;
+ const TargetSubtargetInfo &STI = MF->getSubtarget();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+ // Replace occurrences of the sequence with calls to the new function.
+ LLVM_DEBUG(dbgs() << "CREATE OUTLINED CALLS\n");
+ for (Candidate &C : OF.Candidates) {
+ MachineBasicBlock &MBB = *C.getMBB();
+ MachineBasicBlock::iterator StartIt = C.front();
+ MachineBasicBlock::iterator EndIt = C.back();
+
+ // Insert the call.
+ auto CallInst = TII.insertOutlinedCall(M, MBB, StartIt, *MF, C);
+#ifndef NDEBUG
+ auto MBBBeingOutlinedFromName =
+ MBB.getName().empty() ? "<unknown>" : MBB.getName().str();
+ auto MFBeingOutlinedFromName = MBB.getParent()->getName().empty()
+ ? "<unknown>"
+ : MBB.getParent()->getName().str();
+ LLVM_DEBUG(dbgs() << " CALL: " << MF->getName() << " in "
+ << MFBeingOutlinedFromName << ":"
+ << MBBBeingOutlinedFromName << "\n");
+ LLVM_DEBUG(dbgs() << " .. " << *CallInst);
+#endif
+
+ // If the caller tracks liveness, then we need to make sure that
+ // anything we outline doesn't break liveness assumptions. The outlined
+ // functions themselves currently don't track liveness, but we should
+ // make sure that the ranges we yank things out of aren't wrong.
+ if (MBB.getParent()->getProperties().hasProperty(
+ MachineFunctionProperties::Property::TracksLiveness)) {
+ // The following code is to add implicit def operands to the call
+ // instruction. It also updates call site information for moved
+ // code.
+ SmallSet<Register, 2> UseRegs, DefRegs;
+        // Copy over the defs in the outlined range.
+        //
+        //   First inst in outlined range  <-- Anything that's defined in this
+        //   ...                               range has to be added as an
+        //   Last inst in outlined range   <-- implicit def to the call
+        //                                     instruction.
+        //
+        // Also remove call site information for the outlined block of code.
+        // The exposed uses need to be copied in the outlined range.
+ for (MachineBasicBlock::reverse_iterator
+ Iter = EndIt.getReverse(),
+ Last = std::next(CallInst.getReverse());
+ Iter != Last; Iter++) {
+ MachineInstr *MI = &*Iter;
+ SmallSet<Register, 2> InstrUseRegs;
+ for (MachineOperand &MOP : MI->operands()) {
+ // Skip over anything that isn't a register.
+ if (!MOP.isReg())
+ continue;
+
+ if (MOP.isDef()) {
+ // Introduce DefRegs set to skip the redundant register.
+ DefRegs.insert(MOP.getReg());
+ if (UseRegs.count(MOP.getReg()) &&
+ !InstrUseRegs.count(MOP.getReg()))
+                // Since the register is modeled as defined here,
+                // it does not need to stay in the use register set.
+ UseRegs.erase(MOP.getReg());
+ } else if (!MOP.isUndef()) {
+ // Any register which is not undefined should
+ // be put in the use register set.
+ UseRegs.insert(MOP.getReg());
+ InstrUseRegs.insert(MOP.getReg());
+ }
+ }
+ if (MI->isCandidateForCallSiteEntry())
+ MI->getMF()->eraseCallSiteInfo(MI);
+ }
+
+ for (const Register &I : DefRegs)
+ // If it's a def, add it to the call instruction.
+ CallInst->addOperand(
+ MachineOperand::CreateReg(I, true, /* isDef = true */
+ true /* isImp = true */));
+
+ for (const Register &I : UseRegs)
+          // If it's an exposed use, add it to the call instruction.
+ CallInst->addOperand(
+ MachineOperand::CreateReg(I, false, /* isDef = false */
+ true /* isImp = true */));
+ }
+
+ // Erase from the point after where the call was inserted up to, and
+ // including, the final instruction in the sequence.
+ // Erase needs one past the end, so we need std::next there too.
+ MBB.erase(std::next(StartIt), std::next(EndIt));
+
+ // Keep track of what we removed by marking them all as -1.
+ for (unsigned &I : make_range(UnsignedVecBegin + C.getStartIdx(),
+ UnsignedVecBegin + C.getEndIdx() + 1))
+ I = static_cast<unsigned>(-1);
+ OutlinedSomething = true;
+
+ // Statistics.
+ NumOutlined++;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";);
+ return OutlinedSomething;
+}
+
+void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M,
+ MachineModuleInfo &MMI) {
+ // Build instruction mappings for each function in the module. Start by
+ // iterating over each Function in M.
+ LLVM_DEBUG(dbgs() << "*** Populating mapper ***\n");
+ for (Function &F : M) {
+ LLVM_DEBUG(dbgs() << "MAPPING FUNCTION: " << F.getName() << "\n");
+
+ if (F.hasFnAttribute("nooutline")) {
+ LLVM_DEBUG(dbgs() << "SKIP: Function has nooutline attribute\n");
+ continue;
+ }
+
+ // There's something in F. Check if it has a MachineFunction associated with
+ // it.
+ MachineFunction *MF = MMI.getMachineFunction(F);
+
+ // If it doesn't, then there's nothing to outline from. Move to the next
+ // Function.
+ if (!MF) {
+ LLVM_DEBUG(dbgs() << "SKIP: Function does not have a MachineFunction\n");
+ continue;
+ }
+
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ if (!RunOnAllFunctions && !TII->shouldOutlineFromFunctionByDefault(*MF)) {
+ LLVM_DEBUG(dbgs() << "SKIP: Target does not want to outline from "
+ "function by default\n");
+ continue;
+ }
+
+ // We have a MachineFunction. Ask the target if it's suitable for outlining.
+ // If it isn't, then move on to the next Function in the module.
+ if (!TII->isFunctionSafeToOutlineFrom(*MF, OutlineFromLinkOnceODRs)) {
+ LLVM_DEBUG(dbgs() << "SKIP: " << MF->getName()
+ << ": unsafe to outline from\n");
+ continue;
+ }
+
+ // We have a function suitable for outlining. Iterate over every
+ // MachineBasicBlock in MF and try to map its instructions to a list of
+ // unsigned integers.
+ const unsigned MinMBBSize = 2;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ LLVM_DEBUG(dbgs() << " MAPPING MBB: '" << MBB.getName() << "'\n");
+ // If there isn't anything in MBB, then there's no point in outlining from
+ // it.
+ // If there are fewer than 2 instructions in the MBB, then it can't ever
+ // contain something worth outlining.
+ // FIXME: This should be based off of the maximum size in B of an outlined
+ // call versus the size in B of the MBB.
+ if (MBB.size() < MinMBBSize) {
+ LLVM_DEBUG(dbgs() << " SKIP: MBB size less than minimum size of "
+ << MinMBBSize << "\n");
+ continue;
+ }
+
+ // Check if MBB could be the target of an indirect branch. If it is, then
+ // we don't want to outline from it.
+ if (MBB.hasAddressTaken()) {
+ LLVM_DEBUG(dbgs() << " SKIP: MBB's address is taken\n");
+ continue;
+ }
+
+ // MBB is suitable for outlining. Map it to a list of unsigneds.
+ Mapper.convertToUnsignedVec(MBB, *TII);
+ }
+ }
+ // Statistics.
+ UnsignedVecSize = Mapper.UnsignedVec.size();
+}
+
+void MachineOutliner::initSizeRemarkInfo(
+ const Module &M, const MachineModuleInfo &MMI,
+ StringMap<unsigned> &FunctionToInstrCount) {
+ // Collect instruction counts for every function. We'll use this to emit
+ // per-function size remarks later.
+ for (const Function &F : M) {
+ MachineFunction *MF = MMI.getMachineFunction(F);
+
+ // We only care about MI counts here. If there's no MachineFunction at this
+ // point, then there won't be after the outliner runs, so let's move on.
+ if (!MF)
+ continue;
+ FunctionToInstrCount[F.getName().str()] = MF->getInstructionCount();
+ }
+}
+
+void MachineOutliner::emitInstrCountChangedRemark(
+ const Module &M, const MachineModuleInfo &MMI,
+ const StringMap<unsigned> &FunctionToInstrCount) {
+ // Iterate over each function in the module and emit remarks.
+ // Note that we won't miss anything by doing this, because the outliner never
+ // deletes functions.
+ for (const Function &F : M) {
+ MachineFunction *MF = MMI.getMachineFunction(F);
+
+ // The outliner never deletes functions. If we don't have a MF here, then we
+ // didn't have one prior to outlining either.
+ if (!MF)
+ continue;
+
+ std::string Fname = std::string(F.getName());
+ unsigned FnCountAfter = MF->getInstructionCount();
+ unsigned FnCountBefore = 0;
+
+ // Check if the function was recorded before.
+ auto It = FunctionToInstrCount.find(Fname);
+
+ // Did we have a previously-recorded size? If yes, then set FnCountBefore
+ // to that.
+ if (It != FunctionToInstrCount.end())
+ FnCountBefore = It->second;
+
+ // Compute the delta and emit a remark if there was a change.
+ int64_t FnDelta = static_cast<int64_t>(FnCountAfter) -
+ static_cast<int64_t>(FnCountBefore);
+ if (FnDelta == 0)
+ continue;
+
+ MachineOptimizationRemarkEmitter MORE(*MF, nullptr);
+ MORE.emit([&]() {
+ MachineOptimizationRemarkAnalysis R("size-info", "FunctionMISizeChange",
+ DiagnosticLocation(), &MF->front());
+ R << DiagnosticInfoOptimizationBase::Argument("Pass", "Machine Outliner")
+ << ": Function: "
+ << DiagnosticInfoOptimizationBase::Argument("Function", F.getName())
+ << ": MI instruction count changed from "
+ << DiagnosticInfoOptimizationBase::Argument("MIInstrsBefore",
+ FnCountBefore)
+ << " to "
+ << DiagnosticInfoOptimizationBase::Argument("MIInstrsAfter",
+ FnCountAfter)
+ << "; Delta: "
+ << DiagnosticInfoOptimizationBase::Argument("Delta", FnDelta);
+ return R;
+ });
+ }
+}
+
+bool MachineOutliner::runOnModule(Module &M) {
+ // Check if there's anything in the module. If it's empty, then there's
+ // nothing to outline.
+ if (M.empty())
+ return false;
+
+ // Number to append to the current outlined function.
+ unsigned OutlinedFunctionNum = 0;
+
+ OutlineRepeatedNum = 0;
+ if (!doOutline(M, OutlinedFunctionNum))
+ return false;
+
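+  // Optionally rerun the outliner; later passes over the module may find new
+  // repeats, e.g. among the call sequences created by earlier iterations.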
+ for (unsigned I = 0; I < OutlinerReruns; ++I) {
+ OutlinedFunctionNum = 0;
+ OutlineRepeatedNum++;
+ if (!doOutline(M, OutlinedFunctionNum)) {
+ LLVM_DEBUG({
+ dbgs() << "Did not outline on iteration " << I + 2 << " out of "
+ << OutlinerReruns + 1 << "\n";
+ });
+ break;
+ }
+ }
+
+ return true;
+}
+
+bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+ // If the user passed -enable-machine-outliner=always or
+ // -enable-machine-outliner, the pass will run on all functions in the module.
+ // Otherwise, if the target supports default outlining, it will run on all
+ // functions deemed by the target to be worth outlining from by default. Tell
+ // the user how the outliner is running.
+ LLVM_DEBUG({
+ dbgs() << "Machine Outliner: Running on ";
+ if (RunOnAllFunctions)
+ dbgs() << "all functions";
+ else
+ dbgs() << "target-default functions";
+ dbgs() << "\n";
+ });
+
+ // If the user specifies that they want to outline from linkonceodrs, set
+ // it here.
+ OutlineFromLinkOnceODRs = EnableLinkOnceODROutlining;
+ InstructionMapper Mapper;
+
+ // Prepare instruction mappings for the suffix tree.
+ populateMapper(Mapper, M, MMI);
+ std::vector<OutlinedFunction> FunctionList;
+
+ // Find all of the outlining candidates.
+ findCandidates(Mapper, FunctionList);
+
+ // If we've requested size remarks, then collect the MI counts of every
+ // function before outlining, and the MI counts after outlining.
+ // FIXME: This shouldn't be in the outliner at all; it should ultimately be
+ // the pass manager's responsibility.
+ // This could pretty easily be placed in outline instead, but because we
+ // really ultimately *don't* want this here, it's done like this for now
+ // instead.
+
+ // Check if we want size remarks.
+ bool ShouldEmitSizeRemarks = M.shouldEmitInstrCountChangedRemark();
+ StringMap<unsigned> FunctionToInstrCount;
+ if (ShouldEmitSizeRemarks)
+ initSizeRemarkInfo(M, MMI, FunctionToInstrCount);
+
+ // Outline each of the candidates and return true if something was outlined.
+ bool OutlinedSomething =
+ outline(M, FunctionList, Mapper, OutlinedFunctionNum);
+
+ // If we outlined something, we definitely changed the MI count of the
+ // module. If we've asked for size remarks, then output them.
+ // FIXME: This should be in the pass manager.
+ if (ShouldEmitSizeRemarks && OutlinedSomething)
+ emitInstrCountChangedRemark(M, MMI, FunctionToInstrCount);
+
+ LLVM_DEBUG({
+ if (!OutlinedSomething)
+ dbgs() << "Stopped outlining at iteration " << OutlineRepeatedNum
+ << " because no changes were found.\n";
+ });
+
+ return OutlinedSomething;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachinePassManager.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachinePassManager.cpp
new file mode 100644
index 000000000000..439ff8babcc6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachinePassManager.cpp
@@ -0,0 +1,108 @@
+//===---------- MachinePassManager.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass management machinery for machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePassManager.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/PassManagerImpl.h"
+
+using namespace llvm;
+
+namespace llvm {
+template class AllAnalysesOn<MachineFunction>;
+template class AnalysisManager<MachineFunction>;
+template class PassManager<MachineFunction>;
+
+Error MachineFunctionPassManager::run(Module &M,
+ MachineFunctionAnalysisManager &MFAM) {
+  // MachineModuleAnalysis is a module analysis pass that is never invalidated
+  // because we don't run any module passes in the codegen pipeline. This is
+  // very important because the codegen state is stored in MMI, which is the
+  // analysis result of MachineModuleAnalysis. MMI should not be recomputed.
+ auto &MMI = MFAM.getResult<MachineModuleAnalysis>(M);
+
+ (void)RequireCodeGenSCCOrder;
+ assert(!RequireCodeGenSCCOrder && "not implemented");
+
+  // Add a pass instrumentation callback (PIC) to verify machine functions.
+ if (VerifyMachineFunction) {
+ PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(M);
+
+    // No need to pop this callback later since the MIR pipeline is flat, which
+    // means the current pipeline is the top-level pipeline. Callbacks are not
+    // used after the current pipeline finishes.
+ PI.pushBeforeNonSkippedPassCallback([&MFAM](StringRef PassID, Any IR) {
+ assert(any_cast<const MachineFunction *>(&IR));
+ const MachineFunction *MF = any_cast<const MachineFunction *>(IR);
+ assert(MF && "Machine function should be valid for printing");
+ std::string Banner = std::string("After ") + std::string(PassID);
+ verifyMachineFunction(&MFAM, Banner, *MF);
+ });
+ }
+
+ for (auto &F : InitializationFuncs) {
+ if (auto Err = F(M, MFAM))
+ return Err;
+ }
+
+ unsigned Idx = 0;
+ size_t Size = Passes.size();
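+  // All passes live in a single list; MachineModulePasses records the indices
+  // of the module-level passes so module and function passes can be run in
+  // their original interleaved order.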
+ do {
+ // Run machine module passes
+ for (; MachineModulePasses.count(Idx) && Idx != Size; ++Idx) {
+ if (auto Err = MachineModulePasses.at(Idx)(M, MFAM))
+ return Err;
+ }
+
+ // Finish running all passes.
+ if (Idx == Size)
+ break;
+
+ // Run machine function passes
+
+ // Get index range of machine function passes.
+ unsigned Begin = Idx;
+ for (; !MachineModulePasses.count(Idx) && Idx != Size; ++Idx)
+ ;
+
+ for (Function &F : M) {
+      // Do not codegen any 'available_externally' functions at all; they have
+      // definitions outside the translation unit.
+ if (F.hasAvailableExternallyLinkage())
+ continue;
+
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
+ PassInstrumentation PI = MFAM.getResult<PassInstrumentationAnalysis>(MF);
+
+ for (unsigned I = Begin, E = Idx; I != E; ++I) {
+ auto *P = Passes[I].get();
+
+ if (!PI.runBeforePass<MachineFunction>(*P, MF))
+ continue;
+
+ // TODO: EmitSizeRemarks
+ PreservedAnalyses PassPA = P->run(MF, MFAM);
+ MFAM.invalidate(MF, PassPA);
+ PI.runAfterPass(*P, MF, PassPA);
+ }
+ }
+ } while (true);
+
+ for (auto &F : FinalizationFuncs) {
+ if (auto Err = F(M, MFAM))
+ return Err;
+ }
+
+ return Error::success();
+}
+
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp
new file mode 100644
index 000000000000..c7e7497dab36
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -0,0 +1,3276 @@
+//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
+//
+// This SMS implementation is a target-independent back-end pass. When enabled,
+// the pass runs just prior to the register allocation pass, while the machine
+// IR is in SSA form. If software pipelining is successful, then the original
+// loop is replaced by the optimized loop. The optimized loop contains one or
+// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
+// the instructions cannot be scheduled in a given MII, we increase the MII by
+// one and try again.
+//
+// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
+// represent loop carried dependences in the DAG as order edges to the Phi
+// nodes. We also perform several passes over the DAG to eliminate unnecessary
+// edges that inhibit the ability to pipeline. The implementation uses the
+// DFAPacketizer class to compute the minimum initiation interval and to check
+// where an instruction may be inserted in the pipelined schedule.
+//
+// In order for the SMS pass to work, several target specific hooks need to be
+// implemented to get information about the loop structure and to rewrite
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePipeliner.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CycleAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <iomanip>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pipeliner"
+
+STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
+STATISTIC(NumPipelined, "Number of loops software pipelined");
+STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
+STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
+STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
+STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
+STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
+STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
+STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
+STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
+STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
+
+/// A command line option to turn software pipelining on or off.
+static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
+ cl::desc("Enable Software Pipelining"));
+
+/// A command line option to enable SWP at -Os.
+static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
+ cl::desc("Enable SWP at Os."), cl::Hidden,
+ cl::init(false));
+
+/// A command line argument to limit the minimum initiation interval (MII) for
+/// pipelining.
+static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
+ cl::desc("Size limit for the MII."),
+ cl::Hidden, cl::init(27));
+
+/// A command line argument to force the pipeliner to use a specified
+/// initiation interval.
+static cl::opt<int> SwpForceII("pipeliner-force-ii",
+ cl::desc("Force pipeliner to use specified II."),
+ cl::Hidden, cl::init(-1));
+
+/// A command line argument to limit the number of stages in the pipeline.
+static cl::opt<int>
+ SwpMaxStages("pipeliner-max-stages",
+                 cl::desc("Maximum stages allowed in the generated schedule."),
+ cl::Hidden, cl::init(3));
+
+/// A command line option to disable the pruning of chain dependences due to
+/// an unrelated Phi.
+static cl::opt<bool>
+ SwpPruneDeps("pipeliner-prune-deps",
+ cl::desc("Prune dependences between unrelated Phi nodes."),
+ cl::Hidden, cl::init(true));
+
+/// A command line option to disable the pruning of loop carried order
+/// dependences.
+static cl::opt<bool>
+ SwpPruneLoopCarried("pipeliner-prune-loop-carried",
+ cl::desc("Prune loop carried order dependences."),
+ cl::Hidden, cl::init(true));
+
+#ifndef NDEBUG
+static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
+#endif
+
+static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
+ cl::ReallyHidden,
+ cl::desc("Ignore RecMII"));
+
+static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
+ cl::init(false));
+static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> EmitTestAnnotations(
+ "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
+ cl::desc("Instead of emitting the pipelined code, annotate instructions "
+ "with the generated schedule for feeding into the "
+ "-modulo-schedule-test pass"));
+
+static cl::opt<bool> ExperimentalCodeGen(
+ "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
+ cl::desc(
+ "Use the experimental peeling code generator for software pipelining"));
+
+namespace llvm {
+
+// A command line option to enable the CopyToPhi DAG mutation.
+cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+ cl::init(true),
+ cl::desc("Enable CopyToPhi DAG Mutation"));
+
+/// A command line argument to force pipeliner to use specified issue
+/// width.
+cl::opt<int> SwpForceIssueWidth(
+ "pipeliner-force-issue-width",
+ cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
+ cl::init(-1));
+
+} // end namespace llvm
+
+unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
+char MachinePipeliner::ID = 0;
+#ifndef NDEBUG
+int MachinePipeliner::NumTries = 0;
+#endif
+char &llvm::MachinePipelinerID = MachinePipeliner::ID;
+
+INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
+ "Modulo Software Pipelining", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
+ "Modulo Software Pipelining", false, false)
+
+/// The "main" function for implementing Swing Modulo Scheduling.
+bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(mf.getFunction()))
+ return false;
+
+ if (!EnableSWP)
+ return false;
+
+ if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
+ !EnableSWPOptSize.getPosition())
+ return false;
+
+ if (!mf.getSubtarget().enableMachinePipeliner())
+ return false;
+
+ // Cannot pipeline loops without instruction itineraries if we are using
+ // DFA for the pipeliner.
+ if (mf.getSubtarget().useDFAforSMS() &&
+ (!mf.getSubtarget().getInstrItineraryData() ||
+ mf.getSubtarget().getInstrItineraryData()->isEmpty()))
+ return false;
+
+ MF = &mf;
+ MLI = &getAnalysis<MachineLoopInfo>();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+ TII = MF->getSubtarget().getInstrInfo();
+ RegClassInfo.runOnMachineFunction(*MF);
+
+ for (const auto &L : *MLI)
+ scheduleLoop(*L);
+
+ return false;
+}
+
+/// Attempt to perform the SMS algorithm on the specified loop. This function is
+/// the main entry point for the algorithm. The function identifies candidate
+/// loops, calculates the minimum initiation interval, and attempts to schedule
+/// the loop.
+bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
+ bool Changed = false;
+ for (const auto &InnerLoop : L)
+ Changed |= scheduleLoop(*InnerLoop);
+
+#ifndef NDEBUG
+ // Stop trying after reaching the limit (if any).
+ int Limit = SwpLoopLimit;
+ if (Limit >= 0) {
+ if (NumTries >= SwpLoopLimit)
+ return Changed;
+ NumTries++;
+ }
+#endif
+
+ setPragmaPipelineOptions(L);
+ if (!canPipelineLoop(L)) {
+ LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Failed to pipeline loop";
+ });
+
+ LI.LoopPipelinerInfo.reset();
+ return Changed;
+ }
+
+ ++NumTrytoPipeline;
+
+ Changed = swingModuloScheduler(L);
+
+ LI.LoopPipelinerInfo.reset();
+ return Changed;
+}
+
+void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
+  // Reset the pragma state for the next loop iteration.
+ disabledByPragma = false;
+ II_setByPragma = 0;
+
+ MachineBasicBlock *LBLK = L.getTopBlock();
+
+ if (LBLK == nullptr)
+ return;
+
+ const BasicBlock *BBLK = LBLK->getBasicBlock();
+ if (BBLK == nullptr)
+ return;
+
+ const Instruction *TI = BBLK->getTerminator();
+ if (TI == nullptr)
+ return;
+
+ MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
+ if (LoopID == nullptr)
+ return;
+
+  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop");
+
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+
+ if (MD == nullptr)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+
+ if (S == nullptr)
+ continue;
+
+ if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
+ assert(MD->getNumOperands() == 2 &&
+ "Pipeline initiation interval hint metadata should have two operands.");
+ II_setByPragma =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
+ } else if (S->getString() == "llvm.loop.pipeline.disable") {
+ disabledByPragma = true;
+ }
+ }
+}
+
+/// Return true if the loop can be software pipelined. The algorithm is
+/// restricted to loops with a single basic block. Make sure that the
+/// branch in the loop can be analyzed.
+bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
+ if (L.getNumBlocks() != 1) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Not a single basic block: "
+ << ore::NV("NumBlocks", L.getNumBlocks());
+ });
+ return false;
+ }
+
+ if (disabledByPragma) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Disabled by Pragma.";
+ });
+ return false;
+ }
+
+  // Check that the branch can be analyzed; we can't do pipelining
+  // if that's not the case.
+ LI.TBB = nullptr;
+ LI.FBB = nullptr;
+ LI.BrCond.clear();
+ if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
+ LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
+ NumFailBranch++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "The branch can't be understood";
+ });
+ return false;
+ }
+
+ LI.LoopInductionVar = nullptr;
+ LI.LoopCompare = nullptr;
+ LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(L.getTopBlock());
+ if (!LI.LoopPipelinerInfo) {
+ LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
+ NumFailLoop++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "The loop structure is not supported";
+ });
+ return false;
+ }
+
+ if (!L.getLoopPreheader()) {
+ LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
+ NumFailPreheader++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "No loop preheader found";
+ });
+ return false;
+ }
+
+ // Remove any subregisters from inputs to phi nodes.
+ preprocessPhiNodes(*L.getHeader());
+ return true;
+}
+
+void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
+
+ for (MachineInstr &PI : B.phis()) {
+ MachineOperand &DefOp = PI.getOperand(0);
+ assert(DefOp.getSubReg() == 0);
+ auto *RC = MRI.getRegClass(DefOp.getReg());
+
+ for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
+ MachineOperand &RegOp = PI.getOperand(i);
+ if (RegOp.getSubReg() == 0)
+ continue;
+
+ // If the operand uses a subregister, replace it with a new register
+ // without subregisters, and generate a copy to the new register.
+ Register NewReg = MRI.createVirtualRegister(RC);
+ MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
+ MachineBasicBlock::iterator At = PredB.getFirstTerminator();
+ const DebugLoc &DL = PredB.findDebugLoc(At);
+ auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(RegOp.getReg(), getRegState(RegOp),
+ RegOp.getSubReg());
+ Slots.insertMachineInstrInMaps(*Copy);
+ RegOp.setReg(NewReg);
+ RegOp.setSubReg(0);
+ }
+ }
+}
+
+/// The SMS algorithm consists of the following main steps:
+/// 1. Computation and analysis of the dependence graph.
+/// 2. Ordering of the nodes (instructions).
+/// 3. Attempt to schedule the loop.
+bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
+ assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
+
+ SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
+ II_setByPragma, LI.LoopPipelinerInfo.get());
+
+ MachineBasicBlock *MBB = L.getHeader();
+ // The kernel should not include any terminator instructions. These
+ // will be added back later.
+ SMS.startBlock(MBB);
+
+ // Compute the number of 'real' instructions in the basic block by
+ // ignoring terminators.
+ unsigned size = MBB->size();
+ for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
+ E = MBB->instr_end();
+ I != E; ++I, --size)
+ ;
+
+ SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
+ SMS.schedule();
+ SMS.exitRegion();
+
+ SMS.finishBlock();
+ return SMS.hasNewSchedule();
+}
+
+void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
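+/// Set the minimum initiation interval, preferring a value forced on the
+/// command line or by pragma over the computed resource/recurrence bound.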
+void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
+ if (SwpForceII > 0)
+ MII = SwpForceII;
+ else if (II_setByPragma > 0)
+ MII = II_setByPragma;
+ else
+ MII = std::max(ResMII, RecMII);
+}
+
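+/// Set the largest initiation interval the scheduler is allowed to try;
+/// unless an interval is forced, this is the MII plus 10.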
+void SwingSchedulerDAG::setMAX_II() {
+ if (SwpForceII > 0)
+ MAX_II = SwpForceII;
+ else if (II_setByPragma > 0)
+ MAX_II = II_setByPragma;
+ else
+ MAX_II = MII + 10;
+}
+
+/// We override the schedule function in ScheduleDAGInstrs to implement the
+/// scheduling part of the Swing Modulo Scheduling algorithm.
+void SwingSchedulerDAG::schedule() {
+ AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
+ buildSchedGraph(AA);
+ addLoopCarriedDependences(AA);
+ updatePhiDependences();
+ Topo.InitDAGTopologicalSorting();
+ changeDependences();
+ postProcessDAG();
+ LLVM_DEBUG(dump());
+
+ NodeSetType NodeSets;
+ findCircuits(NodeSets);
+ NodeSetType Circuits = NodeSets;
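+  // Keep a copy of the recurrence node sets; the final node order is verified
+  // against them later.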
+
+ // Calculate the MII.
+ unsigned ResMII = calculateResMII();
+ unsigned RecMII = calculateRecMII(NodeSets);
+
+ fuseRecs(NodeSets);
+
+ // This flag is used for testing and can cause correctness problems.
+ if (SwpIgnoreRecMII)
+ RecMII = 0;
+
+ setMII(ResMII, RecMII);
+ setMAX_II();
+
+ LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
+ << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
+
+ // Can't schedule a loop without a valid MII.
+ if (MII == 0) {
+ LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
+ NumFailZeroMII++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Invalid Minimal Initiation Interval: 0";
+ });
+ return;
+ }
+
+ // Don't pipeline large loops.
+ if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
+ LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
+ << ", we don't pipeline large loops\n");
+ NumFailLargeMaxMII++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Minimal Initiation Interval too large: "
+ << ore::NV("MII", (int)MII) << " > "
+                 << ore::NV("SwpMaxMii", SwpMaxMii)
+                 << ". Refer to -pipeliner-max-mii.";
+ });
+ return;
+ }
+
+ computeNodeFunctions(NodeSets);
+
+ registerPressureFilter(NodeSets);
+
+ colocateNodeSets(NodeSets);
+
+ checkNodeSets(NodeSets);
+
+ LLVM_DEBUG({
+ for (auto &I : NodeSets) {
+ dbgs() << " Rec NodeSet ";
+ I.dump();
+ }
+ });
+
+ llvm::stable_sort(NodeSets, std::greater<NodeSet>());
+
+ groupRemainingNodes(NodeSets);
+
+ removeDuplicateNodes(NodeSets);
+
+ LLVM_DEBUG({
+ for (auto &I : NodeSets) {
+ dbgs() << " NodeSet ";
+ I.dump();
+ }
+ });
+
+ computeNodeOrder(NodeSets);
+
+ // check for node order issues
+ checkValidNodeOrder(Circuits);
+
+ SMSchedule Schedule(Pass.MF, this);
+ Scheduled = schedulePipeline(Schedule);
+
+  if (!Scheduled) {
+ LLVM_DEBUG(dbgs() << "No schedule found, return\n");
+ NumFailNoSchedule++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Unable to find schedule";
+ });
+ return;
+ }
+
+ unsigned numStages = Schedule.getMaxStageCount();
+ // No need to generate pipeline if there are no overlapped iterations.
+ if (numStages == 0) {
+ LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
+ NumFailZeroStage++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "No need to pipeline - no overlapped iterations in schedule.";
+ });
+ return;
+ }
+ // Check that the maximum stage count is less than user-defined limit.
+ if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
+ LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
+ << " : too many stages, abort\n");
+ NumFailLargeMaxStage++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Too many stages in schedule: "
+ << ore::NV("numStages", (int)numStages) << " > "
+ << ore::NV("SwpMaxStages", SwpMaxStages)
+ << ". Refer to -pipeliner-max-stages.";
+ });
+ return;
+ }
+
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
+ Loop.getHeader())
+           << "Pipelined successfully!";
+ });
+
+ // Generate the schedule as a ModuloSchedule.
+ DenseMap<MachineInstr *, int> Cycles, Stages;
+ std::vector<MachineInstr *> OrderedInsts;
+ for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+ ++Cycle) {
+ for (SUnit *SU : Schedule.getInstructions(Cycle)) {
+ OrderedInsts.push_back(SU->getInstr());
+ Cycles[SU->getInstr()] = Cycle;
+ Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
+ }
+ }
+ DenseMap<MachineInstr *, std::pair<unsigned, int64_t>> NewInstrChanges;
+ for (auto &KV : NewMIs) {
+ Cycles[KV.first] = Cycles[KV.second];
+ Stages[KV.first] = Stages[KV.second];
+ NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
+ }
+
+ ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
+ std::move(Stages));
+ if (EmitTestAnnotations) {
+ assert(NewInstrChanges.empty() &&
+ "Cannot serialize a schedule with InstrChanges!");
+ ModuloScheduleTestAnnotater MSTI(MF, MS);
+ MSTI.annotate();
+ return;
+ }
+ // The experimental code generator can't work if there are InstChanges.
+ if (ExperimentalCodeGen && NewInstrChanges.empty()) {
+ PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
+ MSE.expand();
+ } else {
+ ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
+ MSE.expand();
+ MSE.cleanup();
+ }
+ ++NumPipelined;
+}
+
+/// Clean up after the software pipeliner runs.
+void SwingSchedulerDAG::finishBlock() {
+ for (auto &KV : NewMIs)
+ MF.deleteMachineInstr(KV.second);
+ NewMIs.clear();
+
+ // Call the superclass.
+ ScheduleDAGInstrs::finishBlock();
+}
+
+/// Return the register values for the operands of a Phi instruction.
+/// This function assumes the instruction is a Phi.
+static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
+ unsigned &InitVal, unsigned &LoopVal) {
+ assert(Phi.isPHI() && "Expecting a Phi.");
+
+ InitVal = 0;
+ LoopVal = 0;
+ for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
+ if (Phi.getOperand(i + 1).getMBB() != Loop)
+ InitVal = Phi.getOperand(i).getReg();
+ else
+ LoopVal = Phi.getOperand(i).getReg();
+
+ assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
+}
+
+/// Return the Phi register value that comes from the loop block.
+static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
+ for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
+ if (Phi.getOperand(i + 1).getMBB() == LoopBB)
+ return Phi.getOperand(i).getReg();
+ return 0;
+}
+
+/// Return true if SUb can be reached from SUa following the chain edges.
+static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
+ SmallPtrSet<SUnit *, 8> Visited;
+ SmallVector<SUnit *, 8> Worklist;
+ Worklist.push_back(SUa);
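+  // Worklist search over order (chain) successor edges, stopping as soon as
+  // SUb is reached.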
+ while (!Worklist.empty()) {
+ const SUnit *SU = Worklist.pop_back_val();
+ for (const auto &SI : SU->Succs) {
+ SUnit *SuccSU = SI.getSUnit();
+ if (SI.getKind() == SDep::Order) {
+ if (Visited.count(SuccSU))
+ continue;
+ if (SuccSU == SUb)
+ return true;
+ Worklist.push_back(SuccSU);
+ Visited.insert(SuccSU);
+ }
+ }
+ }
+ return false;
+}
+
+/// Return true if the instruction causes a chain between memory
+/// references before and after it.
+static bool isDependenceBarrier(MachineInstr &MI) {
+ return MI.isCall() || MI.mayRaiseFPException() ||
+ MI.hasUnmodeledSideEffects() ||
+ (MI.hasOrderedMemoryRef() &&
+ (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
+}
+
+/// Return the underlying objects for the memory references of an instruction.
+/// This function calls the code in ValueTracking, but first checks that the
+/// instruction has a memory operand.
+static void getUnderlyingObjects(const MachineInstr *MI,
+ SmallVectorImpl<const Value *> &Objs) {
+ if (!MI->hasOneMemOperand())
+ return;
+ MachineMemOperand *MM = *MI->memoperands_begin();
+ if (!MM->getValue())
+ return;
+  // Collect the objects into a temporary so Objs isn't modified while it is
+  // being iterated.
+  SmallVector<const Value *, 2> Objects;
+  getUnderlyingObjects(MM->getValue(), Objects);
+  for (const Value *V : Objects) {
+    if (!isIdentifiedObject(V)) {
+      Objs.clear();
+      return;
+    }
+    Objs.push_back(V);
+  }
+}
+
+/// Add a chain edge between a load and store if the store can be an
+/// alias of the load on a subsequent iteration, i.e., a loop carried
+/// dependence. This code is very similar to the code in ScheduleDAGInstrs
+/// but that code doesn't create loop carried dependences.
+void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
+ MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
+ Value *UnknownValue =
+ UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
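+  // Memory accesses whose underlying object cannot be identified are grouped
+  // under this sentinel value.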
+ for (auto &SU : SUnits) {
+ MachineInstr &MI = *SU.getInstr();
+ if (isDependenceBarrier(MI))
+ PendingLoads.clear();
+ else if (MI.mayLoad()) {
+ SmallVector<const Value *, 4> Objs;
+ ::getUnderlyingObjects(&MI, Objs);
+ if (Objs.empty())
+ Objs.push_back(UnknownValue);
+ for (const auto *V : Objs) {
+ SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
+ SUs.push_back(&SU);
+ }
+ } else if (MI.mayStore()) {
+ SmallVector<const Value *, 4> Objs;
+ ::getUnderlyingObjects(&MI, Objs);
+ if (Objs.empty())
+ Objs.push_back(UnknownValue);
+ for (const auto *V : Objs) {
+ MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
+ PendingLoads.find(V);
+ if (I == PendingLoads.end())
+ continue;
+ for (auto *Load : I->second) {
+ if (isSuccOrder(Load, &SU))
+ continue;
+ MachineInstr &LdMI = *Load->getInstr();
+          // First, perform the cheaper check that compares the base register.
+          // If they are the same and the load offset is less than the store
+          // offset, then mark the dependence as potentially loop carried.
+ const MachineOperand *BaseOp1, *BaseOp2;
+ int64_t Offset1, Offset2;
+ bool Offset1IsScalable, Offset2IsScalable;
+ if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
+ Offset1IsScalable, TRI) &&
+ TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
+ Offset2IsScalable, TRI)) {
+ if (BaseOp1->isIdenticalTo(*BaseOp2) &&
+ Offset1IsScalable == Offset2IsScalable &&
+ (int)Offset1 < (int)Offset2) {
+ assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
+ "What happened to the chain edge?");
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
+ continue;
+ }
+ }
+          // Second, the more expensive check that uses alias analysis on the
+          // base registers. If they alias, and the load offset is less than
+          // the store offset, then mark the dependence as loop carried.
+ if (!AA) {
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
+ continue;
+ }
+ MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
+ MachineMemOperand *MMO2 = *MI.memoperands_begin();
+ if (!MMO1->getValue() || !MMO2->getValue()) {
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
+ continue;
+ }
+ if (MMO1->getValue() == MMO2->getValue() &&
+ MMO1->getOffset() <= MMO2->getOffset()) {
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
+ continue;
+ }
+ if (!AA->isNoAlias(
+ MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
+ MemoryLocation::getAfter(MMO2->getValue(),
+ MMO2->getAAInfo()))) {
+ SDep Dep(Load, SDep::Barrier);
+ Dep.setLatency(1);
+ SU.addPred(Dep);
+ }
+ }
+ }
+ }
+ }
+}
+
+/// Update the phi dependences in the DAG because ScheduleDAGInstrs no longer
+/// processes dependences for PHIs. This function adds true dependences
+/// from a PHI to a use, and a loop carried dependence from the use to the
+/// PHI. The loop carried dependence is represented as an anti dependence
+/// edge. This function also removes chain dependences between unrelated
+/// PHIs.
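+///
+/// Hypothetical example (for illustration only):
+///   %v1 = PHI %init, %bb.ph, %v2, %bb.loop
+///   %v2 = ADD %v1, 1
+/// A Data edge is added from the Phi to the ADD for the use of %v1, and the
+/// loop carried dependence from the ADD back to the Phi for %v2 is modeled
+/// as an Anti edge with latency 1.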
+void SwingSchedulerDAG::updatePhiDependences() {
+ SmallVector<SDep, 4> RemoveDeps;
+ const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
+
+ // Iterate over each DAG node.
+ for (SUnit &I : SUnits) {
+ RemoveDeps.clear();
+    // Set to the register if the instruction has an operand defined by or
+    // used by a Phi.
+    unsigned HasPhiUse = 0;
+    unsigned HasPhiDef = 0;
+ MachineInstr *MI = I.getInstr();
+    // Iterate over each operand and process the definitions and uses.
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (MO.isDef()) {
+ // If the register is used by a Phi, then create an anti dependence.
+ for (MachineRegisterInfo::use_instr_iterator
+ UI = MRI.use_instr_begin(Reg),
+ UE = MRI.use_instr_end();
+ UI != UE; ++UI) {
+ MachineInstr *UseMI = &*UI;
+ SUnit *SU = getSUnit(UseMI);
+ if (SU != nullptr && UseMI->isPHI()) {
+ if (!MI->isPHI()) {
+ SDep Dep(SU, SDep::Anti, Reg);
+ Dep.setLatency(1);
+ I.addPred(Dep);
+ } else {
+ HasPhiDef = Reg;
+ // Add a chain edge to a dependent Phi that isn't an existing
+ // predecessor.
+ if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
+ I.addPred(SDep(SU, SDep::Barrier));
+ }
+ }
+ }
+ } else if (MO.isUse()) {
+ // If the register is defined by a Phi, then create a true dependence.
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
+ if (DefMI == nullptr)
+ continue;
+ SUnit *SU = getSUnit(DefMI);
+ if (SU != nullptr && DefMI->isPHI()) {
+ if (!MI->isPHI()) {
+ SDep Dep(SU, SDep::Data, Reg);
+ Dep.setLatency(0);
+ ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep);
+ I.addPred(Dep);
+ } else {
+ HasPhiUse = Reg;
+ // Add a chain edge to a dependent Phi that isn't an existing
+ // predecessor.
+ if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
+ I.addPred(SDep(SU, SDep::Barrier));
+ }
+ }
+ }
+ }
+ // Remove order dependences from an unrelated Phi.
+ if (!SwpPruneDeps)
+ continue;
+ for (auto &PI : I.Preds) {
+ MachineInstr *PMI = PI.getSUnit()->getInstr();
+ if (PMI->isPHI() && PI.getKind() == SDep::Order) {
+ if (I.getInstr()->isPHI()) {
+ if (PMI->getOperand(0).getReg() == HasPhiUse)
+ continue;
+ if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
+ continue;
+ }
+ RemoveDeps.push_back(PI);
+ }
+ }
+ for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
+ I.removePred(RemoveDeps[i]);
+ }
+}
+
+/// Iterate over each DAG node and see if we can change any dependences
+/// in order to reduce the recurrence MII.
+void SwingSchedulerDAG::changeDependences() {
+ // See if an instruction can use a value from the previous iteration.
+ // If so, we update the base and offset of the instruction and change
+ // the dependences.
+ for (SUnit &I : SUnits) {
+ unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
+ int64_t NewOffset = 0;
+ if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
+ NewOffset))
+ continue;
+
+ // Get the MI and SUnit for the instruction that defines the original base.
+ Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
+ MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
+ if (!DefMI)
+ continue;
+ SUnit *DefSU = getSUnit(DefMI);
+ if (!DefSU)
+ continue;
+    // Get the MI and SUnit for the instruction that defines the new base.
+ MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
+ if (!LastMI)
+ continue;
+ SUnit *LastSU = getSUnit(LastMI);
+ if (!LastSU)
+ continue;
+
+ if (Topo.IsReachable(&I, LastSU))
+ continue;
+
+ // Remove the dependence. The value now depends on a prior iteration.
+ SmallVector<SDep, 4> Deps;
+ for (const SDep &P : I.Preds)
+ if (P.getSUnit() == DefSU)
+ Deps.push_back(P);
+ for (int i = 0, e = Deps.size(); i != e; i++) {
+ Topo.RemovePred(&I, Deps[i].getSUnit());
+ I.removePred(Deps[i]);
+ }
+ // Remove the chain dependence between the instructions.
+ Deps.clear();
+ for (auto &P : LastSU->Preds)
+ if (P.getSUnit() == &I && P.getKind() == SDep::Order)
+ Deps.push_back(P);
+ for (int i = 0, e = Deps.size(); i != e; i++) {
+ Topo.RemovePred(LastSU, Deps[i].getSUnit());
+ LastSU->removePred(Deps[i]);
+ }
+
+ // Add a dependence between the new instruction and the instruction
+ // that defines the new base.
+ SDep Dep(&I, SDep::Anti, NewBase);
+ Topo.AddPred(LastSU, &I);
+ LastSU->addPred(Dep);
+
+ // Remember the base and offset information so that we can update the
+ // instruction during code generation.
+ InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
+ }
+}
+
+namespace {
+
+// FuncUnitSorter - Comparison operator used to sort instructions by
+// the number of functional unit choices.
+struct FuncUnitSorter {
+ const InstrItineraryData *InstrItins;
+ const MCSubtargetInfo *STI;
+ DenseMap<InstrStage::FuncUnits, unsigned> Resources;
+
+ FuncUnitSorter(const TargetSubtargetInfo &TSI)
+ : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
+
+ // Compute the number of functional unit alternatives needed
+ // at each stage, and take the minimum value. We prioritize the
+ // instructions by the least number of choices first.
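+  // For example (illustration only), an instruction that can issue on either
+  // of two ALUs has two alternatives, while one restricted to a single
+  // multiply unit has one alternative and is therefore given higher priority.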
+ unsigned minFuncUnits(const MachineInstr *Inst,
+ InstrStage::FuncUnits &F) const {
+ unsigned SchedClass = Inst->getDesc().getSchedClass();
+ unsigned min = UINT_MAX;
+ if (InstrItins && !InstrItins->isEmpty()) {
+ for (const InstrStage &IS :
+ make_range(InstrItins->beginStage(SchedClass),
+ InstrItins->endStage(SchedClass))) {
+ InstrStage::FuncUnits funcUnits = IS.getUnits();
+ unsigned numAlternatives = llvm::popcount(funcUnits);
+ if (numAlternatives < min) {
+ min = numAlternatives;
+ F = funcUnits;
+ }
+ }
+ return min;
+ }
+ if (STI && STI->getSchedModel().hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc =
+ STI->getSchedModel().getSchedClassDesc(SchedClass);
+ if (!SCDesc->isValid())
+ // No valid Schedule Class Desc for schedClass, should be
+ // Pseudo/PostRAPseudo
+ return min;
+
+ for (const MCWriteProcResEntry &PRE :
+ make_range(STI->getWriteProcResBegin(SCDesc),
+ STI->getWriteProcResEnd(SCDesc))) {
+ if (!PRE.Cycles)
+ continue;
+ const MCProcResourceDesc *ProcResource =
+ STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
+ unsigned NumUnits = ProcResource->NumUnits;
+ if (NumUnits < min) {
+ min = NumUnits;
+ F = PRE.ProcResourceIdx;
+ }
+ }
+ return min;
+ }
+ llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
+ }
+
+ // Compute the critical resources needed by the instruction. This
+ // function records the functional units needed by instructions that
+ // must use only one functional unit. We use this as a tie breaker
+  // for computing the resource MII. The instructions that require
+ // the same, highly used, functional unit have high priority.
+ void calcCriticalResources(MachineInstr &MI) {
+ unsigned SchedClass = MI.getDesc().getSchedClass();
+ if (InstrItins && !InstrItins->isEmpty()) {
+ for (const InstrStage &IS :
+ make_range(InstrItins->beginStage(SchedClass),
+ InstrItins->endStage(SchedClass))) {
+ InstrStage::FuncUnits FuncUnits = IS.getUnits();
+ if (llvm::popcount(FuncUnits) == 1)
+ Resources[FuncUnits]++;
+ }
+ return;
+ }
+ if (STI && STI->getSchedModel().hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc =
+ STI->getSchedModel().getSchedClassDesc(SchedClass);
+ if (!SCDesc->isValid())
+ // No valid Schedule Class Desc for schedClass, should be
+ // Pseudo/PostRAPseudo
+ return;
+
+ for (const MCWriteProcResEntry &PRE :
+ make_range(STI->getWriteProcResBegin(SCDesc),
+ STI->getWriteProcResEnd(SCDesc))) {
+ if (!PRE.Cycles)
+ continue;
+ Resources[PRE.ProcResourceIdx]++;
+ }
+ return;
+ }
+ llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
+ }
+
+  /// Return true if IS1 has lower priority than IS2.
+ bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
+ InstrStage::FuncUnits F1 = 0, F2 = 0;
+ unsigned MFUs1 = minFuncUnits(IS1, F1);
+ unsigned MFUs2 = minFuncUnits(IS2, F2);
+ if (MFUs1 == MFUs2)
+ return Resources.lookup(F1) < Resources.lookup(F2);
+ return MFUs1 > MFUs2;
+ }
+};
+
+} // end anonymous namespace
+
+/// Calculate the resource constrained minimum initiation interval for the
+/// specified loop. We use the DFA to model the resources needed for
+/// each instruction, and we ignore dependences. A different DFA is created
+/// for each cycle that is required. When adding a new instruction, we attempt
+/// to add it to each existing DFA, until a legal space is found. If the
+/// instruction cannot be reserved in an existing DFA, we create a new one.
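+///
+/// For example (hypothetical numbers), if the loop body contains three memory
+/// operations and the target models two load/store units per cycle, at least
+/// ceil(3 / 2) = 2 cycles are required, so the ResMII is at least 2.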
+unsigned SwingSchedulerDAG::calculateResMII() {
+ LLVM_DEBUG(dbgs() << "calculateResMII:\n");
+ ResourceManager RM(&MF.getSubtarget(), this);
+ return RM.calculateResMII();
+}
+
+/// Calculate the recurrence-constrained minimum initiation interval.
+/// Iterate over each circuit. Compute the delay(c) and distance(c)
+/// for each circuit. The II needs to satisfy the inequality
+/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
+/// II that satisfies the inequality, and the RecMII is the maximum
+/// of those values.
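+///
+/// Worked example (hypothetical numbers): a circuit with delay(c) = 3 and
+/// distance(c) = 1 requires 3 - II * 1 <= 0, i.e. II >= 3, so that circuit
+/// contributes ceil(3 / 1) = 3 to the RecMII.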
+unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
+ unsigned RecMII = 0;
+
+ for (NodeSet &Nodes : NodeSets) {
+ if (Nodes.empty())
+ continue;
+
+ unsigned Delay = Nodes.getLatency();
+ unsigned Distance = 1;
+
+ // ii = ceil(delay / distance)
+ unsigned CurMII = (Delay + Distance - 1) / Distance;
+ Nodes.setRecMII(CurMII);
+ if (CurMII > RecMII)
+ RecMII = CurMII;
+ }
+
+ return RecMII;
+}
+
+/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
+/// but we do this to find the circuits, and then change them back.
+static void swapAntiDependences(std::vector<SUnit> &SUnits) {
+ SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded;
+ for (SUnit &SU : SUnits) {
+ for (SDep &Pred : SU.Preds)
+ if (Pred.getKind() == SDep::Anti)
+ DepsAdded.push_back(std::make_pair(&SU, Pred));
+ }
+ for (std::pair<SUnit *, SDep> &P : DepsAdded) {
+ // Remove this anti dependency and add one in the reverse direction.
+ SUnit *SU = P.first;
+ SDep &D = P.second;
+ SUnit *TargetSU = D.getSUnit();
+ unsigned Reg = D.getReg();
+ unsigned Lat = D.getLatency();
+ SU->removePred(D);
+ SDep Dep(SU, SDep::Anti, Reg);
+ Dep.setLatency(Lat);
+ TargetSU->addPred(Dep);
+ }
+}
+
+/// Create the adjacency structure of the nodes in the graph.
+void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
+ SwingSchedulerDAG *DAG) {
+ BitVector Added(SUnits.size());
+ DenseMap<int, int> OutputDeps;
+ for (int i = 0, e = SUnits.size(); i != e; ++i) {
+ Added.reset();
+ // Add any successor to the adjacency matrix and exclude duplicates.
+ for (auto &SI : SUnits[i].Succs) {
+ // Only create a back-edge on the first and last nodes of a dependence
+ // chain. This records any chains and adds them later.
+ if (SI.getKind() == SDep::Output) {
+ int N = SI.getSUnit()->NodeNum;
+ int BackEdge = i;
+ auto Dep = OutputDeps.find(BackEdge);
+ if (Dep != OutputDeps.end()) {
+ BackEdge = Dep->second;
+ OutputDeps.erase(Dep);
+ }
+ OutputDeps[N] = BackEdge;
+ }
+      // Do not process a boundary node or an artificial node.
+ // A back-edge is processed only if it goes to a Phi.
+ if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
+ (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
+ continue;
+ int N = SI.getSUnit()->NodeNum;
+ if (!Added.test(N)) {
+ AdjK[i].push_back(N);
+ Added.set(N);
+ }
+ }
+ // A chain edge between a store and a load is treated as a back-edge in the
+ // adjacency matrix.
+ for (auto &PI : SUnits[i].Preds) {
+ if (!SUnits[i].getInstr()->mayStore() ||
+ !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
+ continue;
+ if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
+ int N = PI.getSUnit()->NodeNum;
+ if (!Added.test(N)) {
+ AdjK[i].push_back(N);
+ Added.set(N);
+ }
+ }
+ }
+ }
+ // Add back-edges in the adjacency matrix for the output dependences.
+ for (auto &OD : OutputDeps)
+ if (!Added.test(OD.second)) {
+ AdjK[OD.first].push_back(OD.second);
+ Added.set(OD.second);
+ }
+}
+
+/// Identify an elementary circuit in the dependence graph starting at the
+/// specified node.
+bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
+ bool HasBackedge) {
+ SUnit *SV = &SUnits[V];
+ bool F = false;
+ Stack.insert(SV);
+ Blocked.set(V);
+
+ for (auto W : AdjK[V]) {
+ if (NumPaths > MaxPaths)
+ break;
+ if (W < S)
+ continue;
+ if (W == S) {
+ if (!HasBackedge)
+ NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
+ F = true;
+ ++NumPaths;
+ break;
+ } else if (!Blocked.test(W)) {
+ if (circuit(W, S, NodeSets,
+ Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
+ F = true;
+ }
+ }
+
+ if (F)
+ unblock(V);
+ else {
+ for (auto W : AdjK[V]) {
+ if (W < S)
+ continue;
+ B[W].insert(SV);
+ }
+ }
+ Stack.pop_back();
+ return F;
+}
+
+/// Unblock a node in the circuit finding algorithm.
+void SwingSchedulerDAG::Circuits::unblock(int U) {
+ Blocked.reset(U);
+ SmallPtrSet<SUnit *, 4> &BU = B[U];
+ while (!BU.empty()) {
+ SmallPtrSet<SUnit *, 4>::iterator SI = BU.begin();
+ assert(SI != BU.end() && "Invalid B set.");
+ SUnit *W = *SI;
+ BU.erase(W);
+ if (Blocked.test(W->NodeNum))
+ unblock(W->NodeNum);
+ }
+}
+
+/// Identify all the elementary circuits in the dependence graph using
+/// Johnson's circuit algorithm.
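+///
+/// Sketch (illustration only): for a graph with edges A->B, B->C, and C->A,
+/// the search starts at A, extends the path A-B-C while blocking visited
+/// nodes, and records the elementary circuit {A, B, C} when the edge back to
+/// the start node is found. Nodes with a lower index than the start node are
+/// skipped, so each circuit is enumerated only once.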
+void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
+ // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
+ // but we do this to find the circuits, and then change them back.
+ swapAntiDependences(SUnits);
+
+ Circuits Cir(SUnits, Topo);
+ // Create the adjacency structure.
+ Cir.createAdjacencyStructure(this);
+ for (int i = 0, e = SUnits.size(); i != e; ++i) {
+ Cir.reset();
+ Cir.circuit(i, i, NodeSets);
+ }
+
+ // Change the dependences back so that we've created a DAG again.
+ swapAntiDependences(SUnits);
+}
+
+// Create artificial dependencies between the source of a COPY/REG_SEQUENCE
+// that is loop-carried to the USE in the next iteration. This helps the
+// pipeliner avoid additional copies that are needed across iterations. An
+// artificial dependence edge is added from USE to the SOURCE of the
+// COPY/REG_SEQUENCE.
+
+// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
+// SRCOfCopy------True-Dep---> COPY/REG_SEQUENCE
+// PHI-------True-Dep------> USEOfPhi
+
+// The mutation creates
+// USEOfPHI -------Artificial-Dep---> SRCOfCopy
+
+// Overall this ensures that USEOfPHI is scheduled before SRCOfCopy (since USE
+// is a predecessor of SRCOfCopy), which implies that the COPY/REG_SEQUENCE is
+// scheduled late enough to avoid additional copies across iterations. The
+// possible scheduling order would be
+// USEOfPHI --- SRCOfCopy --- COPY/REG_SEQUENCE.
+
+void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
+ for (SUnit &SU : DAG->SUnits) {
+ // Find the COPY/REG_SEQUENCE instruction.
+ if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
+ continue;
+
+ // Record the loop carried PHIs.
+ SmallVector<SUnit *, 4> PHISUs;
+ // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
+ SmallVector<SUnit *, 4> SrcSUs;
+
+ for (auto &Dep : SU.Preds) {
+ SUnit *TmpSU = Dep.getSUnit();
+ MachineInstr *TmpMI = TmpSU->getInstr();
+ SDep::Kind DepKind = Dep.getKind();
+ // Save the loop carried PHI.
+ if (DepKind == SDep::Anti && TmpMI->isPHI())
+ PHISUs.push_back(TmpSU);
+ // Save the source of COPY/REG_SEQUENCE.
+      // If the source has no predecessors, we will end up creating cycles.
+ else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
+ SrcSUs.push_back(TmpSU);
+ }
+
+ if (PHISUs.size() == 0 || SrcSUs.size() == 0)
+ continue;
+
+ // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
+ // SUnit to the container.
+ SmallVector<SUnit *, 8> UseSUs;
+ // Do not use iterator based loop here as we are updating the container.
+ for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
+ for (auto &Dep : PHISUs[Index]->Succs) {
+ if (Dep.getKind() != SDep::Data)
+ continue;
+
+ SUnit *TmpSU = Dep.getSUnit();
+ MachineInstr *TmpMI = TmpSU->getInstr();
+ if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
+ PHISUs.push_back(TmpSU);
+ continue;
+ }
+ UseSUs.push_back(TmpSU);
+ }
+ }
+
+ if (UseSUs.size() == 0)
+ continue;
+
+ SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
+ // Add the artificial dependencies if it does not form a cycle.
+ for (auto *I : UseSUs) {
+ for (auto *Src : SrcSUs) {
+ if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
+ Src->addPred(SDep(I, SDep::Artificial));
+ SDAG->Topo.AddPred(Src, I);
+ }
+ }
+ }
+ }
+}
+
+/// Return true for dependences that we ignore when computing the cost
+/// functions.
+/// We ignore the back-edge recurrence in order to avoid unbounded recursion
+/// in the calculation of the ASAP, ALAP, etc functions.
+static bool ignoreDependence(const SDep &D, bool isPred) {
+ if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
+ return true;
+ return D.getKind() == SDep::Anti && isPred;
+}
+
+/// Compute several functions needed to order the nodes for scheduling.
+/// ASAP - Earliest time to schedule a node.
+/// ALAP - Latest time to schedule a node.
+/// MOV - Mobility function, difference between ALAP and ASAP.
+/// D - Depth of each node.
+/// H - Height of each node.
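+///
+/// Roughly, as computed below:
+///   ASAP(v) = max over preds u of ASAP(u) + lat(u,v) - dist(u,v) * MII
+///   ALAP(v) = min over succs w of ALAP(w) - lat(v,w) + dist(v,w) * MII
+///   MOV(v)  = ALAP(v) - ASAP(v)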
+void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
+ ScheduleInfo.resize(SUnits.size());
+
+ LLVM_DEBUG({
+ for (int I : Topo) {
+ const SUnit &SU = SUnits[I];
+ dumpNode(SU);
+ }
+ });
+
+ int maxASAP = 0;
+ // Compute ASAP and ZeroLatencyDepth.
+ for (int I : Topo) {
+ int asap = 0;
+ int zeroLatencyDepth = 0;
+ SUnit *SU = &SUnits[I];
+ for (const SDep &P : SU->Preds) {
+ SUnit *pred = P.getSUnit();
+ if (P.getLatency() == 0)
+ zeroLatencyDepth =
+ std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
+ if (ignoreDependence(P, true))
+ continue;
+ asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
+ getDistance(pred, SU, P) * MII));
+ }
+ maxASAP = std::max(maxASAP, asap);
+ ScheduleInfo[I].ASAP = asap;
+ ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
+ }
+
+ // Compute ALAP, ZeroLatencyHeight, and MOV.
+ for (int I : llvm::reverse(Topo)) {
+ int alap = maxASAP;
+ int zeroLatencyHeight = 0;
+ SUnit *SU = &SUnits[I];
+ for (const SDep &S : SU->Succs) {
+ SUnit *succ = S.getSUnit();
+ if (succ->isBoundaryNode())
+ continue;
+ if (S.getLatency() == 0)
+ zeroLatencyHeight =
+ std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
+ if (ignoreDependence(S, true))
+ continue;
+ alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
+ getDistance(SU, succ, S) * MII));
+ }
+
+ ScheduleInfo[I].ALAP = alap;
+ ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
+ }
+
+ // After computing the node functions, compute the summary for each node set.
+ for (NodeSet &I : NodeSets)
+ I.computeNodeSetInfo(this);
+
+ LLVM_DEBUG({
+ for (unsigned i = 0; i < SUnits.size(); i++) {
+ dbgs() << "\tNode " << i << ":\n";
+ dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
+ dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
+ dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
+ dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
+ dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
+ dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
+ dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
+ }
+ });
+}
+
+/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
+/// as the predecessors of the elements of NodeOrder that are not also in
+/// NodeOrder.
+static bool pred_L(SetVector<SUnit *> &NodeOrder,
+ SmallSetVector<SUnit *, 8> &Preds,
+ const NodeSet *S = nullptr) {
+ Preds.clear();
+ for (const SUnit *SU : NodeOrder) {
+ for (const SDep &Pred : SU->Preds) {
+ if (S && S->count(Pred.getSUnit()) == 0)
+ continue;
+ if (ignoreDependence(Pred, true))
+ continue;
+ if (NodeOrder.count(Pred.getSUnit()) == 0)
+ Preds.insert(Pred.getSUnit());
+ }
+ // Back-edges are predecessors with an anti-dependence.
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.getKind() != SDep::Anti)
+ continue;
+ if (S && S->count(Succ.getSUnit()) == 0)
+ continue;
+ if (NodeOrder.count(Succ.getSUnit()) == 0)
+ Preds.insert(Succ.getSUnit());
+ }
+ }
+ return !Preds.empty();
+}
+
+/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
+/// as the successors of the elements of NodeOrder that are not also in
+/// NodeOrder.
+static bool succ_L(SetVector<SUnit *> &NodeOrder,
+ SmallSetVector<SUnit *, 8> &Succs,
+ const NodeSet *S = nullptr) {
+ Succs.clear();
+ for (const SUnit *SU : NodeOrder) {
+ for (const SDep &Succ : SU->Succs) {
+ if (S && S->count(Succ.getSUnit()) == 0)
+ continue;
+ if (ignoreDependence(Succ, false))
+ continue;
+ if (NodeOrder.count(Succ.getSUnit()) == 0)
+ Succs.insert(Succ.getSUnit());
+ }
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.getKind() != SDep::Anti)
+ continue;
+ if (S && S->count(Pred.getSUnit()) == 0)
+ continue;
+ if (NodeOrder.count(Pred.getSUnit()) == 0)
+ Succs.insert(Pred.getSUnit());
+ }
+ }
+ return !Succs.empty();
+}
+
+/// Return true if there is a path from the specified node to any of the nodes
+/// in DestNodes. Keep track and return the nodes in any path.
+static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
+ SetVector<SUnit *> &DestNodes,
+ SetVector<SUnit *> &Exclude,
+ SmallPtrSet<SUnit *, 8> &Visited) {
+ if (Cur->isBoundaryNode())
+ return false;
+ if (Exclude.contains(Cur))
+ return false;
+ if (DestNodes.contains(Cur))
+ return true;
+ if (!Visited.insert(Cur).second)
+ return Path.contains(Cur);
+ bool FoundPath = false;
+ for (auto &SI : Cur->Succs)
+ if (!ignoreDependence(SI, false))
+ FoundPath |=
+ computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
+ for (auto &PI : Cur->Preds)
+ if (PI.getKind() == SDep::Anti)
+ FoundPath |=
+ computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
+ if (FoundPath)
+ Path.insert(Cur);
+ return FoundPath;
+}
+
+/// Compute the live-out registers for the instructions in a node-set.
+/// The live-out registers are those that are defined in the node-set,
+/// but not used within the set; uses in Phi instructions are not counted.
+static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
+ NodeSet &NS) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<RegisterMaskPair, 8> LiveOutRegs;
+ SmallSet<unsigned, 4> Uses;
+ for (SUnit *SU : NS) {
+ const MachineInstr *MI = SU->getInstr();
+ if (MI->isPHI())
+ continue;
+ for (const MachineOperand &MO : MI->all_uses()) {
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual())
+ Uses.insert(Reg);
+ else if (MRI.isAllocatable(Reg))
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+ Uses.insert(Unit);
+ }
+ }
+ for (SUnit *SU : NS)
+ for (const MachineOperand &MO : SU->getInstr()->all_defs())
+ if (!MO.isDead()) {
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual()) {
+ if (!Uses.count(Reg))
+ LiveOutRegs.push_back(RegisterMaskPair(Reg,
+ LaneBitmask::getNone()));
+ } else if (MRI.isAllocatable(Reg)) {
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
+ if (!Uses.count(Unit))
+ LiveOutRegs.push_back(
+ RegisterMaskPair(Unit, LaneBitmask::getNone()));
+ }
+ }
+ RPTracker.addLiveRegs(LiveOutRegs);
+}
+
+/// A heuristic to filter nodes in recurrent node-sets if the register
+/// pressure of a set is too high.
+void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
+ for (auto &NS : NodeSets) {
+ // Skip small node-sets since they won't cause register pressure problems.
+ if (NS.size() <= 2)
+ continue;
+ IntervalPressure RecRegPressure;
+ RegPressureTracker RecRPTracker(RecRegPressure);
+ RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
+ computeLiveOuts(MF, RecRPTracker, NS);
+ RecRPTracker.closeBottom();
+
+ std::vector<SUnit *> SUnits(NS.begin(), NS.end());
+ llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
+ return A->NodeNum > B->NodeNum;
+ });
+
+ for (auto &SU : SUnits) {
+ // Since we're computing the register pressure for a subset of the
+ // instructions in a block, we need to set the tracker for each
+ // instruction in the node-set. The tracker is set to the instruction
+ // just after the one we're interested in.
+ MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
+ RecRPTracker.setPos(std::next(CurInstI));
+
+ RegPressureDelta RPDelta;
+ ArrayRef<PressureChange> CriticalPSets;
+ RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
+ CriticalPSets,
+ RecRegPressure.MaxSetPressure);
+ if (RPDelta.Excess.isValid()) {
+ LLVM_DEBUG(
+ dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
+ << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
+ << ":" << RPDelta.Excess.getUnitInc() << "\n");
+ NS.setExceedPressure(SU);
+ break;
+ }
+ RecRPTracker.recede();
+ }
+ }
+}
+
+/// A heuristic to colocate node sets that have the same set of
+/// successors.
+void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
+ unsigned Colocate = 0;
+ for (int i = 0, e = NodeSets.size(); i < e; ++i) {
+ NodeSet &N1 = NodeSets[i];
+ SmallSetVector<SUnit *, 8> S1;
+ if (N1.empty() || !succ_L(N1, S1))
+ continue;
+ for (int j = i + 1; j < e; ++j) {
+ NodeSet &N2 = NodeSets[j];
+ if (N1.compareRecMII(N2) != 0)
+ continue;
+ SmallSetVector<SUnit *, 8> S2;
+ if (N2.empty() || !succ_L(N2, S2))
+ continue;
+ if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
+ N1.setColocate(++Colocate);
+ N2.setColocate(Colocate);
+ break;
+ }
+ }
+ }
+}
+
+/// Check if the existing node-sets are profitable. If not, then ignore the
+/// recurrent node-sets, and attempt to schedule all nodes together. This is
+/// a heuristic. If the MII is large and all the recurrent node-sets are small,
+/// then it's best to try to schedule all instructions together instead of
+/// starting with the recurrent node-sets.
+void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
+ // Look for loops with a large MII.
+ if (MII < 17)
+ return;
+ // Check if the node-set contains only a simple add recurrence.
+ for (auto &NS : NodeSets) {
+ if (NS.getRecMII() > 2)
+ return;
+ if (NS.getMaxDepth() > MII)
+ return;
+ }
+ NodeSets.clear();
+ LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
+}
+
+/// Add the nodes that do not belong to a recurrence set into groups
+/// based upon connected components.
+void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
+ SetVector<SUnit *> NodesAdded;
+ SmallPtrSet<SUnit *, 8> Visited;
+ // Add the nodes that are on a path between the previous node sets and
+ // the current node set.
+ for (NodeSet &I : NodeSets) {
+ SmallSetVector<SUnit *, 8> N;
+ // Add the nodes from the current node set to the previous node set.
+ if (succ_L(I, N)) {
+ SetVector<SUnit *> Path;
+ for (SUnit *NI : N) {
+ Visited.clear();
+ computePath(NI, Path, NodesAdded, I, Visited);
+ }
+ if (!Path.empty())
+ I.insert(Path.begin(), Path.end());
+ }
+ // Add the nodes from the previous node set to the current node set.
+ N.clear();
+ if (succ_L(NodesAdded, N)) {
+ SetVector<SUnit *> Path;
+ for (SUnit *NI : N) {
+ Visited.clear();
+ computePath(NI, Path, I, NodesAdded, Visited);
+ }
+ if (!Path.empty())
+ I.insert(Path.begin(), Path.end());
+ }
+ NodesAdded.insert(I.begin(), I.end());
+ }
+
+ // Create a new node set with the connected nodes of any successor of a node
+ // in a recurrent set.
+ NodeSet NewSet;
+ SmallSetVector<SUnit *, 8> N;
+ if (succ_L(NodesAdded, N))
+ for (SUnit *I : N)
+ addConnectedNodes(I, NewSet, NodesAdded);
+ if (!NewSet.empty())
+ NodeSets.push_back(NewSet);
+
+ // Create a new node set with the connected nodes of any predecessor of a node
+ // in a recurrent set.
+ NewSet.clear();
+ if (pred_L(NodesAdded, N))
+ for (SUnit *I : N)
+ addConnectedNodes(I, NewSet, NodesAdded);
+ if (!NewSet.empty())
+ NodeSets.push_back(NewSet);
+
+  // Create new node sets with the connected nodes of any remaining node that
+  // has not yet been added to a node set.
+ for (SUnit &SU : SUnits) {
+ if (NodesAdded.count(&SU) == 0) {
+ NewSet.clear();
+ addConnectedNodes(&SU, NewSet, NodesAdded);
+ if (!NewSet.empty())
+ NodeSets.push_back(NewSet);
+ }
+ }
+}
+
+/// Add the node to the set, and add all of its connected nodes to the set.
+void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
+ SetVector<SUnit *> &NodesAdded) {
+ NewSet.insert(SU);
+ NodesAdded.insert(SU);
+ for (auto &SI : SU->Succs) {
+ SUnit *Successor = SI.getSUnit();
+ if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
+ NodesAdded.count(Successor) == 0)
+ addConnectedNodes(Successor, NewSet, NodesAdded);
+ }
+ for (auto &PI : SU->Preds) {
+ SUnit *Predecessor = PI.getSUnit();
+ if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
+ addConnectedNodes(Predecessor, NewSet, NodesAdded);
+ }
+}
+
+/// Return true if Set1 and Set2 have elements in common. The elements in
+/// common are returned in a separate container.
+static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
+ SmallSetVector<SUnit *, 8> &Result) {
+ Result.clear();
+ for (SUnit *SU : Set1) {
+ if (Set2.count(SU) != 0)
+ Result.insert(SU);
+ }
+ return !Result.empty();
+}
+
+/// Merge the recurrence node sets that have the same initial node.
+void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
+ for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
+ ++I) {
+ NodeSet &NI = *I;
+ for (NodeSetType::iterator J = I + 1; J != E;) {
+ NodeSet &NJ = *J;
+ if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
+ if (NJ.compareRecMII(NI) > 0)
+ NI.setRecMII(NJ.getRecMII());
+ for (SUnit *SU : *J)
+ I->insert(SU);
+ NodeSets.erase(J);
+ E = NodeSets.end();
+ } else {
+ ++J;
+ }
+ }
+ }
+}
+
+/// Remove nodes that have been scheduled in previous NodeSets.
+void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
+ for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
+ ++I)
+ for (NodeSetType::iterator J = I + 1; J != E;) {
+ J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
+
+ if (J->empty()) {
+ NodeSets.erase(J);
+ E = NodeSets.end();
+ } else {
+ ++J;
+ }
+ }
+}
+
+/// Compute an ordered list of the dependence graph nodes, which
+/// indicates the order that the nodes will be scheduled. This is a
+/// two-level algorithm. First, a partial order is created, which
+/// consists of a list of sets ordered from highest to lowest priority.
+/// Second, the nodes within each set are ordered by alternating between
+/// bottom-up and top-down passes over the set.
+void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
+ SmallSetVector<SUnit *, 8> R;
+ NodeOrder.clear();
+
+ for (auto &Nodes : NodeSets) {
+ LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
+ OrderKind Order;
+ SmallSetVector<SUnit *, 8> N;
+ if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
+ R.insert(N.begin(), N.end());
+ Order = BottomUp;
+ LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
+ } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
+ R.insert(N.begin(), N.end());
+ Order = TopDown;
+ LLVM_DEBUG(dbgs() << " Top down (succs) ");
+ } else if (isIntersect(N, Nodes, R)) {
+ // If some of the successors are in the existing node-set, then use the
+ // top-down ordering.
+ Order = TopDown;
+ LLVM_DEBUG(dbgs() << " Top down (intersect) ");
+ } else if (NodeSets.size() == 1) {
+ for (const auto &N : Nodes)
+ if (N->Succs.size() == 0)
+ R.insert(N);
+ Order = BottomUp;
+ LLVM_DEBUG(dbgs() << " Bottom up (all) ");
+ } else {
+ // Find the node with the highest ASAP.
+ SUnit *maxASAP = nullptr;
+ for (SUnit *SU : Nodes) {
+ if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
+ (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
+ maxASAP = SU;
+ }
+ R.insert(maxASAP);
+ Order = BottomUp;
+ LLVM_DEBUG(dbgs() << " Bottom up (default) ");
+ }
+
+ while (!R.empty()) {
+ if (Order == TopDown) {
+ // Choose the node with the maximum height. If more than one, choose
+        // the node with the maximum ZeroLatencyHeight. If still more than one,
+ // choose the node with the lowest MOV.
+ while (!R.empty()) {
+ SUnit *maxHeight = nullptr;
+ for (SUnit *I : R) {
+ if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
+ maxHeight = I;
+ else if (getHeight(I) == getHeight(maxHeight) &&
+ getZeroLatencyHeight(I) > getZeroLatencyHeight(maxHeight))
+ maxHeight = I;
+ else if (getHeight(I) == getHeight(maxHeight) &&
+ getZeroLatencyHeight(I) ==
+ getZeroLatencyHeight(maxHeight) &&
+ getMOV(I) < getMOV(maxHeight))
+ maxHeight = I;
+ }
+ NodeOrder.insert(maxHeight);
+ LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
+ R.remove(maxHeight);
+ for (const auto &I : maxHeight->Succs) {
+ if (Nodes.count(I.getSUnit()) == 0)
+ continue;
+ if (NodeOrder.contains(I.getSUnit()))
+ continue;
+ if (ignoreDependence(I, false))
+ continue;
+ R.insert(I.getSUnit());
+ }
+ // Back-edges are predecessors with an anti-dependence.
+ for (const auto &I : maxHeight->Preds) {
+ if (I.getKind() != SDep::Anti)
+ continue;
+ if (Nodes.count(I.getSUnit()) == 0)
+ continue;
+ if (NodeOrder.contains(I.getSUnit()))
+ continue;
+ R.insert(I.getSUnit());
+ }
+ }
+ Order = BottomUp;
+ LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
+ SmallSetVector<SUnit *, 8> N;
+ if (pred_L(NodeOrder, N, &Nodes))
+ R.insert(N.begin(), N.end());
+ } else {
+ // Choose the node with the maximum depth. If more than one, choose
+ // the node with the maximum ZeroLatencyDepth. If still more than one,
+ // choose the node with the lowest MOV.
+ while (!R.empty()) {
+ SUnit *maxDepth = nullptr;
+ for (SUnit *I : R) {
+ if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
+ maxDepth = I;
+ else if (getDepth(I) == getDepth(maxDepth) &&
+ getZeroLatencyDepth(I) > getZeroLatencyDepth(maxDepth))
+ maxDepth = I;
+ else if (getDepth(I) == getDepth(maxDepth) &&
+ getZeroLatencyDepth(I) == getZeroLatencyDepth(maxDepth) &&
+ getMOV(I) < getMOV(maxDepth))
+ maxDepth = I;
+ }
+ NodeOrder.insert(maxDepth);
+ LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
+ R.remove(maxDepth);
+ if (Nodes.isExceedSU(maxDepth)) {
+ Order = TopDown;
+ R.clear();
+ R.insert(Nodes.getNode(0));
+ break;
+ }
+ for (const auto &I : maxDepth->Preds) {
+ if (Nodes.count(I.getSUnit()) == 0)
+ continue;
+ if (NodeOrder.contains(I.getSUnit()))
+ continue;
+ R.insert(I.getSUnit());
+ }
+ // Back-edges are predecessors with an anti-dependence.
+ for (const auto &I : maxDepth->Succs) {
+ if (I.getKind() != SDep::Anti)
+ continue;
+ if (Nodes.count(I.getSUnit()) == 0)
+ continue;
+ if (NodeOrder.contains(I.getSUnit()))
+ continue;
+ R.insert(I.getSUnit());
+ }
+ }
+ Order = TopDown;
+ LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
+ SmallSetVector<SUnit *, 8> N;
+ if (succ_L(NodeOrder, N, &Nodes))
+ R.insert(N.begin(), N.end());
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Node order: ";
+ for (SUnit *I : NodeOrder)
+ dbgs() << " " << I->NodeNum << " ";
+ dbgs() << "\n";
+ });
+}
+
+/// Process the nodes in the computed order and create the pipelined schedule
+/// of the instructions, if possible. Return true if a schedule is found.
+bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
+
+  if (NodeOrder.empty()) {
+    LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n");
+    return false;
+  }
+
+ bool scheduleFound = false;
+ // Keep increasing II until a valid schedule is found.
+ for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
+ Schedule.reset();
+ Schedule.setInitiationInterval(II);
+ LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
+
+ SetVector<SUnit *>::iterator NI = NodeOrder.begin();
+ SetVector<SUnit *>::iterator NE = NodeOrder.end();
+ do {
+ SUnit *SU = *NI;
+
+ // Compute the schedule time for the instruction, which is based
+ // upon the scheduled time for any predecessors/successors.
+ int EarlyStart = INT_MIN;
+ int LateStart = INT_MAX;
+ // These values are set when the size of the schedule window is limited
+ // due to chain dependences.
+ int SchedEnd = INT_MAX;
+ int SchedStart = INT_MIN;
+ Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
+ II, this);
+ LLVM_DEBUG({
+ dbgs() << "\n";
+ dbgs() << "Inst (" << SU->NodeNum << ") ";
+ SU->getInstr()->dump();
+ dbgs() << "\n";
+ });
+ LLVM_DEBUG({
+ dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
+ LateStart, SchedEnd, SchedStart);
+ });
+
+ if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
+ SchedStart > LateStart)
+ scheduleFound = false;
+ else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
+ SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
+ scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
+ } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
+ SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
+ scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
+ } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
+ SchedEnd =
+ std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
+ // When scheduling a Phi it is better to start at the late cycle and go
+ // backwards. The default order may insert the Phi too far away from
+ // its first dependence.
+ if (SU->getInstr()->isPHI())
+ scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
+ else
+ scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
+ } else {
+ int FirstCycle = Schedule.getFirstCycle();
+ scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
+ FirstCycle + getASAP(SU) + II - 1, II);
+ }
+ // Even if we find a schedule, make sure the schedule doesn't exceed the
+ // allowable number of stages. We keep trying if this happens.
+ if (scheduleFound)
+ if (SwpMaxStages > -1 &&
+ Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
+ scheduleFound = false;
+
+ LLVM_DEBUG({
+ if (!scheduleFound)
+ dbgs() << "\tCan't schedule\n";
+ });
+ } while (++NI != NE && scheduleFound);
+
+ // If a schedule is found, ensure non-pipelined instructions are in stage 0
+ if (scheduleFound)
+ scheduleFound =
+ Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
+
+ // If a schedule is found, check if it is a valid schedule too.
+ if (scheduleFound)
+ scheduleFound = Schedule.isValidSchedule(this);
+ }
+
+ LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
+ << " (II=" << Schedule.getInitiationInterval()
+ << ")\n");
+
+ if (scheduleFound) {
+ scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
+ if (!scheduleFound)
+ LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
+ }
+
+ if (scheduleFound) {
+ Schedule.finalizeSchedule(this);
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Schedule found with Initiation Interval: "
+ << ore::NV("II", Schedule.getInitiationInterval())
+ << ", MaxStageCount: "
+ << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
+ });
+ } else
+ Schedule.reset();
+
+ return scheduleFound && Schedule.getMaxStageCount() > 0;
+}
+
+/// Return true if we can compute the amount by which the address accessed by
+/// the instruction changes during each iteration. Set Delta to the amount of
+/// the change.
+bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const MachineOperand *BaseOp;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
+ return false;
+
+ // FIXME: This algorithm assumes instructions have fixed-size offsets.
+ if (OffsetIsScalable)
+ return false;
+
+ if (!BaseOp->isReg())
+ return false;
+
+ Register BaseReg = BaseOp->getReg();
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Check if there is a Phi. If so, get the definition in the loop.
+ MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
+ if (BaseDef && BaseDef->isPHI()) {
+ BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
+ BaseDef = MRI.getVRegDef(BaseReg);
+ }
+ if (!BaseDef)
+ return false;
+
+ int D = 0;
+ if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
+ return false;
+
+ Delta = D;
+ return true;
+}
+
+/// Check if we can change the instruction to use an offset value from the
+/// previous iteration. If so, return true and set the base and offset values
+/// so that we can rewrite the load, if necessary.
+/// v1 = Phi(v0, v3)
+/// v2 = load v1, 0
+/// v3 = post_store v1, 4, x
+/// This function enables the load to be rewritten as v2 = load v3, 4.
+bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
+ unsigned &BasePos,
+ unsigned &OffsetPos,
+ unsigned &NewBase,
+ int64_t &Offset) {
+ // Get the load instruction.
+ if (TII->isPostIncrement(*MI))
+ return false;
+ unsigned BasePosLd, OffsetPosLd;
+ if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
+ return false;
+ Register BaseReg = MI->getOperand(BasePosLd).getReg();
+
+ // Look for the Phi instruction.
+ MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+ MachineInstr *Phi = MRI.getVRegDef(BaseReg);
+ if (!Phi || !Phi->isPHI())
+ return false;
+ // Get the register defined in the loop block.
+ unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
+ if (!PrevReg)
+ return false;
+
+ // Check for the post-increment load/store instruction.
+ MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
+ if (!PrevDef || PrevDef == MI)
+ return false;
+
+ if (!TII->isPostIncrement(*PrevDef))
+ return false;
+
+ unsigned BasePos1 = 0, OffsetPos1 = 0;
+ if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
+ return false;
+
+ // Make sure that the instructions do not access the same memory location in
+ // the next iteration.
+ int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
+ int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
+ MachineInstr *NewMI = MF.CloneMachineInstr(MI);
+ NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
+ bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
+ MF.deleteMachineInstr(NewMI);
+ if (!Disjoint)
+ return false;
+
+ // Set the return value once we determine that we return true.
+ BasePos = BasePosLd;
+ OffsetPos = OffsetPosLd;
+ NewBase = PrevReg;
+ Offset = StoreOffset;
+ return true;
+}
+
+/// Apply changes to the instruction if needed. The changes are needed
+/// to improve the scheduling and depend upon the final schedule.
+void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
+ SMSchedule &Schedule) {
+ SUnit *SU = getSUnit(MI);
+ DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+ InstrChanges.find(SU);
+ if (It != InstrChanges.end()) {
+ std::pair<unsigned, int64_t> RegAndOffset = It->second;
+ unsigned BasePos, OffsetPos;
+ if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
+ return;
+ Register BaseReg = MI->getOperand(BasePos).getReg();
+ MachineInstr *LoopDef = findDefInLoop(BaseReg);
+ int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
+ int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
+ int BaseStageNum = Schedule.stageScheduled(SU);
+ int BaseCycleNum = Schedule.cycleScheduled(SU);
+ if (BaseStageNum < DefStageNum) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(MI);
+ int OffsetDiff = DefStageNum - BaseStageNum;
+ if (DefCycleNum < BaseCycleNum) {
+ NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
+ if (OffsetDiff > 0)
+ --OffsetDiff;
+ }
+ int64_t NewOffset =
+ MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
+ NewMI->getOperand(OffsetPos).setImm(NewOffset);
+ SU->setInstr(NewMI);
+ MISUnitMap[NewMI] = SU;
+ NewMIs[MI] = NewMI;
+ }
+ }
+}
+
+/// Return the instruction in the loop that defines the register.
+/// If the definition is a Phi, then follow the Phi operand to
+/// the instruction in the loop.
+MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
+ SmallPtrSet<MachineInstr *, 8> Visited;
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ while (Def->isPHI()) {
+ if (!Visited.insert(Def).second)
+ break;
+ for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
+ if (Def->getOperand(i + 1).getMBB() == BB) {
+ Def = MRI.getVRegDef(Def->getOperand(i).getReg());
+ break;
+ }
+ }
+ return Def;
+}
+
+/// Return true for an order or output dependence that is potentially loop
+/// carried. A dependence is loop carried if the destination defines a value
+/// that may be used or defined by the source in a subsequent iteration.
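+///
+/// Worked example (hypothetical values): with a per-iteration increment of
+/// DeltaS = DeltaD = 8, 4-byte accesses, OffsetS = 0, and OffsetD = 4, the
+/// final test 0 + 4 < 4 + 4 holds, so the dependence is conservatively
+/// treated as loop carried.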
+bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
+ bool isSucc) {
+ if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
+ Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
+ return false;
+
+ if (!SwpPruneLoopCarried)
+ return true;
+
+ if (Dep.getKind() == SDep::Output)
+ return true;
+
+ MachineInstr *SI = Source->getInstr();
+ MachineInstr *DI = Dep.getSUnit()->getInstr();
+ if (!isSucc)
+ std::swap(SI, DI);
+ assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
+
+ // Assume ordered loads and stores may have a loop carried dependence.
+ if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
+ SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
+ SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
+ return true;
+
+ // Only chain dependences between a load and store can be loop carried.
+ if (!DI->mayStore() || !SI->mayLoad())
+ return false;
+
+ unsigned DeltaS, DeltaD;
+ if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
+ return true;
+
+ const MachineOperand *BaseOpS, *BaseOpD;
+ int64_t OffsetS, OffsetD;
+ bool OffsetSIsScalable, OffsetDIsScalable;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
+ TRI) ||
+ !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
+ TRI))
+ return true;
+
+ assert(!OffsetSIsScalable && !OffsetDIsScalable &&
+ "Expected offsets to be byte offsets");
+
+ MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
+ MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
+ if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
+ return true;
+
+ unsigned InitValS = 0;
+ unsigned LoopValS = 0;
+ unsigned InitValD = 0;
+ unsigned LoopValD = 0;
+ getPhiRegs(*DefS, BB, InitValS, LoopValS);
+ getPhiRegs(*DefD, BB, InitValD, LoopValD);
+ MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
+ MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
+
+ if (!InitDefS->isIdenticalTo(*InitDefD))
+ return true;
+
+ // Check that the base register is incremented by a constant value for each
+ // iteration.
+ MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
+ int D = 0;
+ if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
+ return true;
+
+ uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
+ uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
+
+ // This is the main test, which checks the offset values and the loop
+ // increment value to determine if the accesses may be loop carried.
+ if (AccessSizeS == MemoryLocation::UnknownSize ||
+ AccessSizeD == MemoryLocation::UnknownSize)
+ return true;
+
+ if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
+ return true;
+
+ return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
+}
+
+void SwingSchedulerDAG::postProcessDAG() {
+ for (auto &M : Mutations)
+ M->apply(this);
+}
+
+/// Try to schedule the node at the specified StartCycle and continue
+/// until the node is scheduled or the EndCycle is reached. This function
+/// returns true if the node is scheduled. This routine may search either
+/// forward or backward for a place to insert the instruction based upon
+/// the relative values of StartCycle and EndCycle.
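+///
+/// For example (hypothetical call), insert(SU, 4, 1, II) searches backward
+/// over cycles 4, 3, 2, 1, while insert(SU, 1, 4, II) searches forward over
+/// cycles 1, 2, 3, 4, stopping at the first cycle where the resources for SU
+/// can be reserved.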
+bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
+ bool forward = true;
+ LLVM_DEBUG({
+ dbgs() << "Trying to insert node between " << StartCycle << " and "
+ << EndCycle << " II: " << II << "\n";
+ });
+ if (StartCycle > EndCycle)
+ forward = false;
+
+ // The terminating condition depends on the direction.
+ int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
+ for (int curCycle = StartCycle; curCycle != termCycle;
+ forward ? ++curCycle : --curCycle) {
+
+ if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
+ ProcItinResources.canReserveResources(*SU, curCycle)) {
+ LLVM_DEBUG({
+ dbgs() << "\tinsert at cycle " << curCycle << " ";
+ SU->getInstr()->dump();
+ });
+
+ if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
+ ProcItinResources.reserveResources(*SU, curCycle);
+ ScheduledInstrs[curCycle].push_back(SU);
+ InstrToCycle.insert(std::make_pair(SU, curCycle));
+ if (curCycle > LastCycle)
+ LastCycle = curCycle;
+ if (curCycle < FirstCycle)
+ FirstCycle = curCycle;
+ return true;
+ }
+ LLVM_DEBUG({
+ dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
+ SU->getInstr()->dump();
+ });
+ }
+ return false;
+}
+
+// Return the cycle of the earliest scheduled instruction in the chain.
+int SMSchedule::earliestCycleInChain(const SDep &Dep) {
+ SmallPtrSet<SUnit *, 8> Visited;
+ SmallVector<SDep, 8> Worklist;
+ Worklist.push_back(Dep);
+ int EarlyCycle = INT_MAX;
+ while (!Worklist.empty()) {
+ const SDep &Cur = Worklist.pop_back_val();
+ SUnit *PrevSU = Cur.getSUnit();
+ if (Visited.count(PrevSU))
+ continue;
+ std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
+ if (it == InstrToCycle.end())
+ continue;
+ EarlyCycle = std::min(EarlyCycle, it->second);
+ for (const auto &PI : PrevSU->Preds)
+ if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
+ Worklist.push_back(PI);
+ Visited.insert(PrevSU);
+ }
+ return EarlyCycle;
+}
+
+// Return the cycle of the latest scheduled instruction in the chain.
+int SMSchedule::latestCycleInChain(const SDep &Dep) {
+ SmallPtrSet<SUnit *, 8> Visited;
+ SmallVector<SDep, 8> Worklist;
+ Worklist.push_back(Dep);
+ int LateCycle = INT_MIN;
+ while (!Worklist.empty()) {
+ const SDep &Cur = Worklist.pop_back_val();
+ SUnit *SuccSU = Cur.getSUnit();
+ if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
+ continue;
+ std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
+ if (it == InstrToCycle.end())
+ continue;
+ LateCycle = std::max(LateCycle, it->second);
+ for (const auto &SI : SuccSU->Succs)
+ if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
+ Worklist.push_back(SI);
+ Visited.insert(SuccSU);
+ }
+ return LateCycle;
+}
+
+/// If an instruction has a use that spans multiple iterations, then
+/// return the Phi that feeds it; otherwise return nullptr. These instructions
+/// are characterized by having a back-edge to a Phi, which contains a
+/// reference to another Phi.
+static SUnit *multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG) {
+ for (auto &P : SU->Preds)
+ if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
+ for (auto &S : P.getSUnit()->Succs)
+ if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
+ return P.getSUnit();
+ return nullptr;
+}
+
+/// Compute the scheduling start slot for the instruction. The start slot
+/// depends on any predecessor or successor nodes scheduled already.
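+///
+/// The window bounds follow the usual modulo scheduling relations; for a
+/// dependence involving an instruction already placed at 'cycle':
+///   EarlyStart = cycle + latency - distance * II
+///   LateStart  = cycle - latency + distance * II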
+void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
+ int *MinEnd, int *MaxStart, int II,
+ SwingSchedulerDAG *DAG) {
+ // Iterate over each instruction that has been scheduled already. The start
+ // slot computation depends on whether the previously scheduled instruction
+ // is a predecessor or successor of the specified instruction.
+ for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
+
+ // Iterate over each instruction in the current cycle.
+ for (SUnit *I : getInstructions(cycle)) {
+ // Because we're processing a DAG for the dependences, we recognize
+ // the back-edge in recurrences by anti dependences.
+ for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
+ const SDep &Dep = SU->Preds[i];
+ if (Dep.getSUnit() == I) {
+ if (!DAG->isBackedge(SU, Dep)) {
+ int EarlyStart = cycle + Dep.getLatency() -
+ DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
+ *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
+ if (DAG->isLoopCarriedDep(SU, Dep, false)) {
+ int End = earliestCycleInChain(Dep) + (II - 1);
+ *MinEnd = std::min(*MinEnd, End);
+ }
+ } else {
+ int LateStart = cycle - Dep.getLatency() +
+ DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
+ *MinLateStart = std::min(*MinLateStart, LateStart);
+ }
+ }
+        // For an instruction that requires multiple iterations, make sure that
+ // the dependent instruction is not scheduled past the definition.
+ SUnit *BE = multipleIterations(I, DAG);
+ if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
+ !SU->isPred(I))
+ *MinLateStart = std::min(*MinLateStart, cycle);
+ }
+ for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
+ if (SU->Succs[i].getSUnit() == I) {
+ const SDep &Dep = SU->Succs[i];
+ if (!DAG->isBackedge(SU, Dep)) {
+ int LateStart = cycle - Dep.getLatency() +
+ DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
+ *MinLateStart = std::min(*MinLateStart, LateStart);
+ if (DAG->isLoopCarriedDep(SU, Dep)) {
+ int Start = latestCycleInChain(Dep) + 1 - II;
+ *MaxStart = std::max(*MaxStart, Start);
+ }
+ } else {
+ int EarlyStart = cycle + Dep.getLatency() -
+ DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
+ *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
+ }
+ }
+ }
+ }
+ }
+}
+
+/// Order the instructions within a cycle so that the definitions occur
+/// before the uses. The instruction is added to the start of the list if
+/// there is a use it must precede; otherwise it is added to the end.
+void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
+ std::deque<SUnit *> &Insts) {
+ MachineInstr *MI = SU->getInstr();
+ bool OrderBeforeUse = false;
+ bool OrderAfterDef = false;
+ bool OrderBeforeDef = false;
+ unsigned MoveDef = 0;
+ unsigned MoveUse = 0;
+ int StageInst1 = stageScheduled(SU);
+
+ unsigned Pos = 0;
+ for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
+ ++I, ++Pos) {
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+
+ Register Reg = MO.getReg();
+ unsigned BasePos, OffsetPos;
+ if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
+ if (MI->getOperand(BasePos).getReg() == Reg)
+ if (unsigned NewReg = SSD->getInstrBaseReg(SU))
+ Reg = NewReg;
+ bool Reads, Writes;
+ std::tie(Reads, Writes) =
+ (*I)->getInstr()->readsWritesVirtualRegister(Reg);
+ if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
+ OrderBeforeUse = true;
+ if (MoveUse == 0)
+ MoveUse = Pos;
+ } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
+ // Add the instruction after the scheduled instruction.
+ OrderAfterDef = true;
+ MoveDef = Pos;
+ } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
+ if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
+ OrderBeforeUse = true;
+ if (MoveUse == 0)
+ MoveUse = Pos;
+ } else {
+ OrderAfterDef = true;
+ MoveDef = Pos;
+ }
+ } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
+ OrderBeforeUse = true;
+ if (MoveUse == 0)
+ MoveUse = Pos;
+ if (MoveUse != 0) {
+ OrderAfterDef = true;
+ MoveDef = Pos - 1;
+ }
+ } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
+ // Add the instruction before the scheduled instruction.
+ OrderBeforeUse = true;
+ if (MoveUse == 0)
+ MoveUse = Pos;
+ } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
+ isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
+ if (MoveUse == 0) {
+ OrderBeforeDef = true;
+ MoveUse = Pos;
+ }
+ }
+ }
+ // Check for order dependences between instructions. Make sure the source
+ // is ordered before the destination.
+ for (auto &S : SU->Succs) {
+ if (S.getSUnit() != *I)
+ continue;
+ if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
+ OrderBeforeUse = true;
+ if (Pos < MoveUse)
+ MoveUse = Pos;
+ }
+ // We did not handle hardware dependences in the previous loop, and we
+ // normally set Latency = 0 for Anti dependences, so nodes in the same
+ // cycle may have Anti dependences on hardware registers.
+ else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
+ OrderBeforeUse = true;
+ if ((MoveUse == 0) || (Pos < MoveUse))
+ MoveUse = Pos;
+ }
+ }
+ for (auto &P : SU->Preds) {
+ if (P.getSUnit() != *I)
+ continue;
+ if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
+ OrderAfterDef = true;
+ MoveDef = Pos;
+ }
+ }
+ }
+
+ // A circular dependence.
+ if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
+ OrderBeforeUse = false;
+
+ // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
+ // to a loop-carried dependence.
+ if (OrderBeforeDef)
+ OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
+
+ // The uncommon case when the instruction order needs to be updated because
+ // there is both a use and a def.
+ if (OrderBeforeUse && OrderAfterDef) {
+ SUnit *UseSU = Insts.at(MoveUse);
+ SUnit *DefSU = Insts.at(MoveDef);
+ if (MoveUse > MoveDef) {
+ Insts.erase(Insts.begin() + MoveUse);
+ Insts.erase(Insts.begin() + MoveDef);
+ } else {
+ Insts.erase(Insts.begin() + MoveDef);
+ Insts.erase(Insts.begin() + MoveUse);
+ }
+ orderDependence(SSD, UseSU, Insts);
+ orderDependence(SSD, SU, Insts);
+ orderDependence(SSD, DefSU, Insts);
+ return;
+ }
+ // Put the new instruction first if there is a use in the list. Otherwise,
+ // put it at the end of the list.
+ if (OrderBeforeUse)
+ Insts.push_front(SU);
+ else
+ Insts.push_back(SU);
+}
+
+/// Return true if the scheduled Phi has a loop carried operand.
+bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) {
+ if (!Phi.isPHI())
+ return false;
+ SUnit *DefSU = SSD->getSUnit(&Phi);
+ unsigned DefCycle = cycleScheduled(DefSU);
+ int DefStage = stageScheduled(DefSU);
+
+ unsigned InitVal = 0;
+ unsigned LoopVal = 0;
+ getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
+ SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
+ if (!UseSU)
+ return true;
+ if (UseSU->getInstr()->isPHI())
+ return true;
+ unsigned LoopCycle = cycleScheduled(UseSU);
+ int LoopStage = stageScheduled(UseSU);
+ return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
+}
+
+/// Return true if the instruction is a definition that is loop carried
+/// and defines the use on the next iteration.
+/// v1 = phi(v2, v3)
+/// (Def) v3 = op v1
+/// (MO) = v1
+/// If MO appears before Def, then v1 and v3 may get assigned to the same
+/// register.
+bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD,
+ MachineInstr *Def, MachineOperand &MO) {
+ if (!MO.isReg())
+ return false;
+ if (Def->isPHI())
+ return false;
+ MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
+ if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
+ return false;
+ if (!isLoopCarried(SSD, *Phi))
+ return false;
+ unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
+ for (MachineOperand &DMO : Def->all_defs()) {
+ if (DMO.getReg() == LoopReg)
+ return true;
+ }
+ return false;
+}
+
+/// Determine transitive dependences of unpipelineable instructions
+SmallSet<SUnit *, 8> SMSchedule::computeUnpipelineableNodes(
+ SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+ SmallSet<SUnit *, 8> DoNotPipeline;
+ SmallVector<SUnit *, 8> Worklist;
+
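+ // Seed the worklist with the instructions the target refuses to pipeline,
+ // then walk predecessor edges (and the anti successors of PHIs) so that
+ // everything they transitively depend on is also kept out of later stages.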
+ for (auto &SU : SSD->SUnits)
+ if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
+ Worklist.push_back(&SU);
+
+ while (!Worklist.empty()) {
+ auto SU = Worklist.pop_back_val();
+ if (DoNotPipeline.count(SU))
+ continue;
+ LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
+ DoNotPipeline.insert(SU);
+ for (auto &Dep : SU->Preds)
+ Worklist.push_back(Dep.getSUnit());
+ if (SU->getInstr()->isPHI())
+ for (auto &Dep : SU->Succs)
+ if (Dep.getKind() == SDep::Anti)
+ Worklist.push_back(Dep.getSUnit());
+ }
+ return DoNotPipeline;
+}
+
+// Determine all instructions upon which any unpipelineable instruction depends
+// and ensure that they are in stage 0. If unable to do so, return false.
+bool SMSchedule::normalizeNonPipelinedInstructions(
+ SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) {
+ SmallSet<SUnit *, 8> DNP = computeUnpipelineableNodes(SSD, PLI);
+
+ int NewLastCycle = INT_MIN;
+ for (SUnit &SU : SSD->SUnits) {
+ if (!SU.isInstr())
+ continue;
+ if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
+ NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
+ continue;
+ }
+
+ // Put the non-pipelined instruction as early as possible in the schedule
+ int NewCycle = getFirstCycle();
+ for (auto &Dep : SU.Preds)
+ NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
+
+ int OldCycle = InstrToCycle[&SU];
+ if (OldCycle != NewCycle) {
+ InstrToCycle[&SU] = NewCycle;
+ auto &OldS = getInstructions(OldCycle);
+ llvm::erase_value(OldS, &SU);
+ getInstructions(NewCycle).emplace_back(&SU);
+ LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
+ << ") is not pipelined; moving from cycle " << OldCycle
+ << " to " << NewCycle << " Instr:" << *SU.getInstr());
+ }
+ NewLastCycle = std::max(NewLastCycle, NewCycle);
+ }
+ LastCycle = NewLastCycle;
+ return true;
+}
+
+// Check if the generated schedule is valid. This function checks if
+// an instruction that uses a physical register is scheduled in a
+// different stage than the definition. The pipeliner does not handle
+// physical register values that may cross a basic block boundary.
+// Furthermore, if a physical def/use pair is assigned to the same
+// cycle, orderDependence does not guarantee def/use ordering, so that
+// case should be considered invalid. (The test checks for both
+// earlier and same-cycle use to be more robust.)
+bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
+ for (SUnit &SU : SSD->SUnits) {
+ if (!SU.hasPhysRegDefs)
+ continue;
+ int StageDef = stageScheduled(&SU);
+ int CycleDef = InstrToCycle[&SU];
+ assert(StageDef != -1 && "Instruction should have been scheduled.");
+ for (auto &SI : SU.Succs)
+ if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
+ if (Register::isPhysicalRegister(SI.getReg())) {
+ if (stageScheduled(SI.getSUnit()) != StageDef)
+ return false;
+ if (InstrToCycle[SI.getSUnit()] <= CycleDef)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// A property of the node order in swing-modulo-scheduling is
+/// that for nodes outside circuits the following holds:
+/// none of them is scheduled after both a successor and a
+/// predecessor.
+/// The method below checks whether the property is met.
+/// If not, debug information is printed and statistics information updated.
+/// Note that we do not use an assert statement.
+/// The reason is that although an invalid node order may prevent
+/// the pipeliner from finding a pipelined schedule for arbitrary II,
+/// it does not lead to the generation of incorrect code.
+void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
+
+ // a sorted vector that maps each SUnit to its index in the NodeOrder
+ typedef std::pair<SUnit *, unsigned> UnitIndex;
+ std::vector<UnitIndex> Indices;
+ Indices.reserve(NodeOrder.size());
+
+ for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
+ Indices.push_back(std::make_pair(NodeOrder[i], i));
+
+ auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
+ return std::get<0>(i1) < std::get<0>(i2);
+ };
+
+ // sort, so that we can perform a binary search
+ llvm::sort(Indices, CompareKey);
+
+ bool Valid = true;
+ (void)Valid;
+ // for each SUnit in the NodeOrder, check whether
+ // it appears after both a successor and a predecessor
+ // of the SUnit. If this is the case, and the SUnit
+ // is not part of a circuit, then the NodeOrder is not
+ // valid.
+ for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
+ SUnit *SU = NodeOrder[i];
+ unsigned Index = i;
+
+ bool PredBefore = false;
+ bool SuccBefore = false;
+
+ SUnit *Succ;
+ SUnit *Pred;
+ (void)Succ;
+ (void)Pred;
+
+ for (SDep &PredEdge : SU->Preds) {
+ SUnit *PredSU = PredEdge.getSUnit();
+ unsigned PredIndex = std::get<1>(
+ *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
+ if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
+ PredBefore = true;
+ Pred = PredSU;
+ break;
+ }
+ }
+
+ for (SDep &SuccEdge : SU->Succs) {
+ SUnit *SuccSU = SuccEdge.getSUnit();
+ // Do not process a boundary node: it was not included in NodeOrder and
+ // hence not in Indices either, so the call to std::lower_bound() below
+ // would return Indices.end().
+ if (SuccSU->isBoundaryNode())
+ continue;
+ unsigned SuccIndex = std::get<1>(
+ *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
+ if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
+ SuccBefore = true;
+ Succ = SuccSU;
+ break;
+ }
+ }
+
+ if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
+ // instructions in circuits are allowed to be scheduled
+ // after both a successor and predecessor.
+ bool InCircuit = llvm::any_of(
+ Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
+ if (InCircuit)
+ LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
+ else {
+ Valid = false;
+ NumNodeOrderIssues++;
+ LLVM_DEBUG(dbgs() << "Predecessor ";);
+ }
+ LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
+ << " are scheduled before node " << SU->NodeNum
+ << "\n";);
+ }
+ }
+
+ LLVM_DEBUG({
+ if (!Valid)
+ dbgs() << "Invalid node order found!\n";
+ });
+}
+
+/// Attempt to fix the degenerate cases when the instruction serialization
+/// causes the register lifetimes to overlap. For example,
+/// p' = store_pi(p, b)
+/// = load p, offset
+/// In this case p and p' overlap, which means that two registers are needed.
+/// Instead, this function changes the load to use p' and updates the offset.
+void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
+ unsigned OverlapReg = 0;
+ unsigned NewBaseReg = 0;
+ for (SUnit *SU : Instrs) {
+ MachineInstr *MI = SU->getInstr();
+ for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ // Look for an instruction that uses p. The instruction occurs in the
+ // same cycle but occurs later in the serialized order.
+ if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
+ // Check that the instruction appears in the InstrChanges structure,
+ // which contains instructions that can have the offset updated.
+ DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+ InstrChanges.find(SU);
+ if (It != InstrChanges.end()) {
+ unsigned BasePos, OffsetPos;
+ // Update the base register and adjust the offset.
+ if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(MI);
+ NewMI->getOperand(BasePos).setReg(NewBaseReg);
+ int64_t NewOffset =
+ MI->getOperand(OffsetPos).getImm() - It->second.second;
+ NewMI->getOperand(OffsetPos).setImm(NewOffset);
+ SU->setInstr(NewMI);
+ MISUnitMap[NewMI] = SU;
+ NewMIs[MI] = NewMI;
+ }
+ }
+ OverlapReg = 0;
+ NewBaseReg = 0;
+ break;
+ }
+ // Look for an instruction of the form p' = op(p), which uses and defines
+ // two virtual registers that get allocated to the same physical register.
+ unsigned TiedUseIdx = 0;
+ if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
+ // OverlapReg is p in the example above.
+ OverlapReg = MI->getOperand(TiedUseIdx).getReg();
+ // NewBaseReg is p' in the example above.
+ NewBaseReg = MI->getOperand(i).getReg();
+ break;
+ }
+ }
+ }
+}
+
+/// After the schedule has been formed, call this function to combine
+/// the instructions from the different stages/cycles. That is, this
+/// function creates a schedule that represents a single iteration.
+void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
+ // Move all instructions to the first stage from later stages.
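+ // An instruction placed in stage s of cycle c is stored at index
+ // c + s * InitiationInterval in ScheduledInstrs; folding those entries back
+ // onto cycle c produces the single-iteration view of the schedule.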
+ for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
+ for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
+ ++stage) {
+ std::deque<SUnit *> &cycleInstrs =
+ ScheduledInstrs[cycle + (stage * InitiationInterval)];
+ for (SUnit *SU : llvm::reverse(cycleInstrs))
+ ScheduledInstrs[cycle].push_front(SU);
+ }
+ }
+
+ // Erase all the elements in the later stages. Only one iteration should
+ // remain in the scheduled list, and it contains all the instructions.
+ for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
+ ScheduledInstrs.erase(cycle);
+
+ // Change the registers in the instructions as specified by the
+ // InstrChanges map. We need to use the new registers to create the
+ // correct order.
+ for (const SUnit &SU : SSD->SUnits)
+ SSD->applyInstrChange(SU.getInstr(), *this);
+
+ // Reorder the instructions in each cycle to fix and improve the
+ // generated code.
+ for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
+ std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
+ std::deque<SUnit *> newOrderPhi;
+ for (SUnit *SU : cycleInstrs) {
+ if (SU->getInstr()->isPHI())
+ newOrderPhi.push_back(SU);
+ }
+ std::deque<SUnit *> newOrderI;
+ for (SUnit *SU : cycleInstrs) {
+ if (!SU->getInstr()->isPHI())
+ orderDependence(SSD, SU, newOrderI);
+ }
+ // Replace the old order with the new order.
+ cycleInstrs.swap(newOrderPhi);
+ llvm::append_range(cycleInstrs, newOrderI);
+ SSD->fixupRegisterOverlaps(cycleInstrs);
+ }
+
+ LLVM_DEBUG(dump(););
+}
+
+void NodeSet::print(raw_ostream &os) const {
+ os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
+ << " depth " << MaxDepth << " col " << Colocate << "\n";
+ for (const auto &I : Nodes)
+ os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
+ os << "\n";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// Print the schedule information to the given output.
+void SMSchedule::print(raw_ostream &os) const {
+ // Iterate over each cycle.
+ for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
+ // Iterate over each instruction in the cycle.
+ const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
+ for (SUnit *CI : cycleInstrs->second) {
+ os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
+ os << "(" << CI->NodeNum << ") ";
+ CI->getInstr()->print(os);
+ os << "\n";
+ }
+ }
+}
+
+/// Utility function used for debugging to print the schedule.
+LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
+
+void ResourceManager::dumpMRT() const {
+ LLVM_DEBUG({
+ if (UseDFA)
+ return;
+ std::stringstream SS;
+ SS << "MRT:\n";
+ SS << std::setw(4) << "Slot";
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
+ SS << std::setw(3) << I;
+ SS << std::setw(7) << "#Mops"
+ << "\n";
+ for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
+ SS << std::setw(4) << Slot;
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
+ SS << std::setw(3) << MRT[Slot][I];
+ SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
+ }
+ dbgs() << SS.str();
+ });
+}
+#endif
+
+void ResourceManager::initProcResourceVectors(
+ const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
+ unsigned ProcResourceID = 0;
+
+ // We currently limit the resource kinds to 64 and below so that we can use
+ // uint64_t for Masks
+ assert(SM.getNumProcResourceKinds() < 64 &&
+ "Too many kinds of resources, unsupported");
+ // Create a unique bitmask for every processor resource unit.
+ // Skip resource at index 0, since it always references 'InvalidUnit'.
+ Masks.resize(SM.getNumProcResourceKinds());
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+ if (Desc.SubUnitsIdxBegin)
+ continue;
+ Masks[I] = 1ULL << ProcResourceID;
+ ProcResourceID++;
+ }
+ // Create a unique bitmask for every processor resource group.
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc &Desc = *SM.getProcResource(I);
+ if (!Desc.SubUnitsIdxBegin)
+ continue;
+ Masks[I] = 1ULL << ProcResourceID;
+ for (unsigned U = 0; U < Desc.NumUnits; ++U)
+ Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
+ ProcResourceID++;
+ }
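+ // Illustrative example (not taken from any real target): if unit A has mask
+ // 0b001 and unit B has mask 0b010, a group containing both gets its own bit
+ // plus the union of its members, e.g. 0b100 | 0b001 | 0b010 = 0b111.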
+ LLVM_DEBUG({
+ if (SwpShowResMask) {
+ dbgs() << "ProcResourceDesc:\n";
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
+ dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
+ ProcResource->Name, I, Masks[I],
+ ProcResource->NumUnits);
+ }
+ dbgs() << " -----------------\n";
+ }
+ });
+}
+
+bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {
+ LLVM_DEBUG({
+ if (SwpDebugResource)
+ dbgs() << "canReserveResources:\n";
+ });
+ if (UseDFA)
+ return DFAResources[positiveModulo(Cycle, InitiationInterval)]
+ ->canReserveResources(&SU.getInstr()->getDesc());
+
+ const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
+ if (!SCDesc->isValid()) {
+ LLVM_DEBUG({
+ dbgs() << "No valid Schedule Class Desc for schedClass!\n";
+ dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
+ });
+ return true;
+ }
+
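+ // Tentatively book the resources for this cycle, test for overbooking, and
+ // then roll the booking back so the query leaves the MRT unchanged.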
+ reserveResources(SCDesc, Cycle);
+ bool Result = !isOverbooked();
+ unreserveResources(SCDesc, Cycle);
+
+ LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
+ return Result;
+}
+
+void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
+ LLVM_DEBUG({
+ if (SwpDebugResource)
+ dbgs() << "reserveResources:\n";
+ });
+ if (UseDFA)
+ return DFAResources[positiveModulo(Cycle, InitiationInterval)]
+ ->reserveResources(&SU.getInstr()->getDesc());
+
+ const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
+ if (!SCDesc->isValid()) {
+ LLVM_DEBUG({
+ dbgs() << "No valid Schedule Class Desc for schedClass!\n";
+ dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
+ });
+ return;
+ }
+
+ reserveResources(SCDesc, Cycle);
+
+ LLVM_DEBUG({
+ if (SwpDebugResource) {
+ dumpMRT();
+ dbgs() << "reserveResources: done!\n\n";
+ }
+ });
+}
+
+void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
+ int Cycle) {
+ assert(!UseDFA);
+ for (const MCWriteProcResEntry &PRE : make_range(
+ STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
+ for (int C = Cycle; C < Cycle + PRE.Cycles; ++C)
+ ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
+
+ for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
+ ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
+}
+
+void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
+ int Cycle) {
+ assert(!UseDFA);
+ for (const MCWriteProcResEntry &PRE : make_range(
+ STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
+ for (int C = Cycle; C < Cycle + PRE.Cycles; ++C)
+ --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
+
+ for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
+ --NumScheduledMops[positiveModulo(C, InitiationInterval)];
+}
+
+bool ResourceManager::isOverbooked() const {
+ assert(!UseDFA);
+ for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc *Desc = SM.getProcResource(I);
+ if (MRT[Slot][I] > Desc->NumUnits)
+ return true;
+ }
+ if (NumScheduledMops[Slot] > IssueWidth)
+ return true;
+ }
+ return false;
+}
+
+int ResourceManager::calculateResMIIDFA() const {
+ assert(UseDFA);
+
+ // Sort the instructions by the number of available choices for scheduling,
+ // least to most. Use the number of critical resources as the tie breaker.
+ FuncUnitSorter FUS = FuncUnitSorter(*ST);
+ for (SUnit &SU : DAG->SUnits)
+ FUS.calcCriticalResources(*SU.getInstr());
+ PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter>
+ FuncUnitOrder(FUS);
+
+ for (SUnit &SU : DAG->SUnits)
+ FuncUnitOrder.push(SU.getInstr());
+
+ SmallVector<std::unique_ptr<DFAPacketizer>, 8> Resources;
+ Resources.push_back(
+ std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
+
+ while (!FuncUnitOrder.empty()) {
+ MachineInstr *MI = FuncUnitOrder.top();
+ FuncUnitOrder.pop();
+ if (TII->isZeroCost(MI->getOpcode()))
+ continue;
+
+ // Attempt to reserve the instruction in an existing DFA. At least one
+ // DFA is needed for each cycle.
+ unsigned NumCycles = DAG->getSUnit(MI)->Latency;
+ unsigned ReservedCycles = 0;
+ auto *RI = Resources.begin();
+ auto *RE = Resources.end();
+ LLVM_DEBUG({
+ dbgs() << "Trying to reserve resource for " << NumCycles
+ << " cycles for \n";
+ MI->dump();
+ });
+ for (unsigned C = 0; C < NumCycles; ++C)
+ while (RI != RE) {
+ if ((*RI)->canReserveResources(*MI)) {
+ (*RI)->reserveResources(*MI);
+ ++ReservedCycles;
+ break;
+ }
+ RI++;
+ }
+ LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
+ << ", NumCycles:" << NumCycles << "\n");
+ // Add new DFAs, if needed, to reserve resources.
+ for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
+ LLVM_DEBUG(if (SwpDebugResource) dbgs()
+ << "NewResource created to reserve resources"
+ << "\n");
+ auto *NewResource = TII->CreateTargetScheduleState(*ST);
+ assert(NewResource->canReserveResources(*MI) && "Reserve error.");
+ NewResource->reserveResources(*MI);
+ Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
+ }
+ }
+
+ int Resmii = Resources.size();
+ LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
+ return Resmii;
+}
+
+int ResourceManager::calculateResMII() const {
+ if (UseDFA)
+ return calculateResMIIDFA();
+
+ // Count each resource consumption and divide it by the number of units.
+ // ResMII is the max value among them.
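+ //
+ // A purely illustrative example (numbers are not from any real target): with
+ // 10 micro-ops and IssueWidth 4 the micro-op bound is ceil(10/4) = 3 cycles;
+ // if one resource with 2 units is consumed for 8 cycles in total, its bound
+ // is ceil(8/2) = 4, so ResMII would be 4.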
+
+ int NumMops = 0;
+ SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
+ for (SUnit &SU : DAG->SUnits) {
+ if (TII->isZeroCost(SU.getInstr()->getOpcode()))
+ continue;
+
+ const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
+ if (!SCDesc->isValid())
+ continue;
+
+ LLVM_DEBUG({
+ if (SwpDebugResource) {
+ DAG->dumpNode(SU);
+ dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
+ << " WriteProcRes: ";
+ }
+ });
+ NumMops += SCDesc->NumMicroOps;
+ for (const MCWriteProcResEntry &PRE :
+ make_range(STI->getWriteProcResBegin(SCDesc),
+ STI->getWriteProcResEnd(SCDesc))) {
+ LLVM_DEBUG({
+ if (SwpDebugResource) {
+ const MCProcResourceDesc *Desc =
+ SM.getProcResource(PRE.ProcResourceIdx);
+ dbgs() << Desc->Name << ": " << PRE.Cycles << ", ";
+ }
+ });
+ ResourceCount[PRE.ProcResourceIdx] += PRE.Cycles;
+ }
+ LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
+ }
+
+ int Result = (NumMops + IssueWidth - 1) / IssueWidth;
+ LLVM_DEBUG({
+ if (SwpDebugResource)
+ dbgs() << "#Mops: " << NumMops << ", "
+ << "IssueWidth: " << IssueWidth << ", "
+ << "Cycles: " << Result << "\n";
+ });
+
+ LLVM_DEBUG({
+ if (SwpDebugResource) {
+ std::stringstream SS;
+ SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
+ << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
+ << "\n";
+ dbgs() << SS.str();
+ }
+ });
+ for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
+ const MCProcResourceDesc *Desc = SM.getProcResource(I);
+ int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
+ LLVM_DEBUG({
+ if (SwpDebugResource) {
+ std::stringstream SS;
+ SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
+ << Desc->NumUnits << std::setw(10) << ResourceCount[I]
+ << std::setw(10) << Cycles << "\n";
+ dbgs() << SS.str();
+ }
+ });
+ if (Cycles > Result)
+ Result = Cycles;
+ }
+ return Result;
+}
+
+void ResourceManager::init(int II) {
+ InitiationInterval = II;
+ DFAResources.clear();
+ DFAResources.resize(II);
+ for (auto &I : DFAResources)
+ I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
+ MRT.clear();
+ MRT.resize(II, SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
+ NumScheduledMops.clear();
+ NumScheduledMops.resize(II);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachinePostDominators.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachinePostDominators.cpp
new file mode 100644
index 000000000000..fb96d0efa4d4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachinePostDominators.cpp
@@ -0,0 +1,79 @@
+//===- MachinePostDominators.cpp - Machine Post Dominator Calculation -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// post dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+namespace llvm {
+template class DominatorTreeBase<MachineBasicBlock, true>; // PostDomTreeBase
+
+extern bool VerifyMachineDomInfo;
+} // namespace llvm
+
+char MachinePostDominatorTree::ID = 0;
+
+// Declare initializeMachinePostDominatorTreePass.
+INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
+ "MachinePostDominator Tree Construction", true, true)
+
+MachinePostDominatorTree::MachinePostDominatorTree()
+ : MachineFunctionPass(ID), PDT(nullptr) {
+ initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
+}
+
+FunctionPass *MachinePostDominatorTree::createMachinePostDominatorTreePass() {
+ return new MachinePostDominatorTree();
+}
+
+bool MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+ PDT = std::make_unique<PostDomTreeT>();
+ PDT->recalculate(F);
+ return false;
+}
+
+void MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineBasicBlock *MachinePostDominatorTree::findNearestCommonDominator(
+ ArrayRef<MachineBasicBlock *> Blocks) const {
+ assert(!Blocks.empty());
+
+ MachineBasicBlock *NCD = Blocks.front();
+ for (MachineBasicBlock *BB : Blocks.drop_front()) {
+ NCD = PDT->findNearestCommonDominator(NCD, BB);
+
+ // Stop when the root is reached.
+ if (PDT->isVirtualRoot(PDT->getNode(NCD)))
+ return nullptr;
+ }
+
+ return NCD;
+}
+
+void MachinePostDominatorTree::verifyAnalysis() const {
+ if (PDT && VerifyMachineDomInfo)
+ if (!PDT->verify(PostDomTreeT::VerificationLevel::Basic)) {
+ errs() << "MachinePostDominatorTree verification failed\n";
+
+ abort();
+ }
+}
+
+void MachinePostDominatorTree::print(llvm::raw_ostream &OS,
+ const Module *M) const {
+ PDT->print(OS);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineRegionInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineRegionInfo.cpp
new file mode 100644
index 000000000000..45cdcbfeab9f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineRegionInfo.cpp
@@ -0,0 +1,149 @@
+//===- lib/CodeGen/MachineRegionInfo.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/RegionInfoImpl.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "machine-region-info"
+
+using namespace llvm;
+
+STATISTIC(numMachineRegions, "The # of machine regions");
+STATISTIC(numMachineSimpleRegions, "The # of simple machine regions");
+
+namespace llvm {
+
+template class RegionBase<RegionTraits<MachineFunction>>;
+template class RegionNodeBase<RegionTraits<MachineFunction>>;
+template class RegionInfoBase<RegionTraits<MachineFunction>>;
+
+} // end namespace llvm
+
+//===----------------------------------------------------------------------===//
+// MachineRegion implementation
+
+MachineRegion::MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
+ MachineRegionInfo* RI,
+ MachineDominatorTree *DT, MachineRegion *Parent) :
+ RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {}
+
+MachineRegion::~MachineRegion() = default;
+
+//===----------------------------------------------------------------------===//
+// MachineRegionInfo implementation
+
+MachineRegionInfo::MachineRegionInfo() = default;
+
+MachineRegionInfo::~MachineRegionInfo() = default;
+
+void MachineRegionInfo::updateStatistics(MachineRegion *R) {
+ ++numMachineRegions;
+
+ // TODO: Slow. Should only be enabled if -stats is used.
+ if (R->isSimple())
+ ++numMachineSimpleRegions;
+}
+
+void MachineRegionInfo::recalculate(MachineFunction &F,
+ MachineDominatorTree *DT_,
+ MachinePostDominatorTree *PDT_,
+ MachineDominanceFrontier *DF_) {
+ DT = DT_;
+ PDT = PDT_;
+ DF = DF_;
+
+ MachineBasicBlock *Entry = GraphTraits<MachineFunction*>::getEntryNode(&F);
+
+ TopLevelRegion = new MachineRegion(Entry, nullptr, this, DT, nullptr);
+ updateStatistics(TopLevelRegion);
+ calculate(F);
+}
+
+//===----------------------------------------------------------------------===//
+// MachineRegionInfoPass implementation
+//
+
+MachineRegionInfoPass::MachineRegionInfoPass() : MachineFunctionPass(ID) {
+ initializeMachineRegionInfoPassPass(*PassRegistry::getPassRegistry());
+}
+
+MachineRegionInfoPass::~MachineRegionInfoPass() = default;
+
+bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
+ releaseMemory();
+
+ auto DT = &getAnalysis<MachineDominatorTree>();
+ auto PDT = &getAnalysis<MachinePostDominatorTree>();
+ auto DF = &getAnalysis<MachineDominanceFrontier>();
+
+ RI.recalculate(F, DT, PDT, DF);
+
+ LLVM_DEBUG(RI.dump());
+
+ return false;
+}
+
+void MachineRegionInfoPass::releaseMemory() {
+ RI.releaseMemory();
+}
+
+void MachineRegionInfoPass::verifyAnalysis() const {
+ // Only do verification when the user wants to; otherwise this expensive
+ // check will be invoked by PMDataManager::verifyPreservedAnalysis when
+ // a region pass (marked PreservedAll) finishes.
+ if (MachineRegionInfo::VerifyRegionInfo)
+ RI.verifyAnalysis();
+}
+
+void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
+ RI.print(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const {
+ RI.dump();
+}
+#endif
+
+char MachineRegionInfoPass::ID = 0;
+char &MachineRegionInfoPassID = MachineRegionInfoPass::ID;
+
+INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE,
+ "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE,
+ "Detect single entry single exit regions", true, true)
+
+// Create methods available outside of this file, so that they can be
+// referenced from "include/llvm/LinkAllPasses.h". Otherwise the pass would be
+// removed by link-time optimization.
+
+namespace llvm {
+
+FunctionPass *createMachineRegionInfoPass() {
+ return new MachineRegionInfoPass();
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineRegisterInfo.cpp
new file mode 100644
index 000000000000..0048918fc53b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -0,0 +1,667 @@
+//===- lib/CodeGen/MachineRegisterInfo.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the MachineRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableSubRegLiveness("enable-subreg-liveness", cl::Hidden,
+ cl::init(true), cl::desc("Enable subregister liveness tracking."));
+
+// Pin the vtable to this file.
+void MachineRegisterInfo::Delegate::anchor() {}
+
+MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF)
+ : MF(MF), TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() &&
+ EnableSubRegLiveness) {
+ unsigned NumRegs = getTargetRegisterInfo()->getNumRegs();
+ VRegInfo.reserve(256);
+ RegAllocHints.reserve(256);
+ UsedPhysRegMask.resize(NumRegs);
+ PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]());
+ TheDelegates.clear();
+}
+
+/// setRegClass - Set the register class of the specified virtual register.
+///
+void
+MachineRegisterInfo::setRegClass(Register Reg, const TargetRegisterClass *RC) {
+ assert(RC && RC->isAllocatable() && "Invalid RC for virtual register");
+ VRegInfo[Reg].first = RC;
+}
+
+void MachineRegisterInfo::setRegBank(Register Reg,
+ const RegisterBank &RegBank) {
+ VRegInfo[Reg].first = &RegBank;
+}
+
+static const TargetRegisterClass *
+constrainRegClass(MachineRegisterInfo &MRI, Register Reg,
+ const TargetRegisterClass *OldRC,
+ const TargetRegisterClass *RC, unsigned MinNumRegs) {
+ if (OldRC == RC)
+ return RC;
+ const TargetRegisterClass *NewRC =
+ MRI.getTargetRegisterInfo()->getCommonSubClass(OldRC, RC);
+ if (!NewRC || NewRC == OldRC)
+ return NewRC;
+ if (NewRC->getNumRegs() < MinNumRegs)
+ return nullptr;
+ MRI.setRegClass(Reg, NewRC);
+ return NewRC;
+}
+
+const TargetRegisterClass *MachineRegisterInfo::constrainRegClass(
+ Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs) {
+ if (Reg.isPhysical())
+ return nullptr;
+ return ::constrainRegClass(*this, Reg, getRegClass(Reg), RC, MinNumRegs);
+}
+
+bool
+MachineRegisterInfo::constrainRegAttrs(Register Reg,
+ Register ConstrainingReg,
+ unsigned MinNumRegs) {
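+ // Reconcile the LLT and the register class or bank of Reg with those of
+ // ConstrainingReg; return false rather than leave Reg with an incompatible
+ // type, class, or bank.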
+ const LLT RegTy = getType(Reg);
+ const LLT ConstrainingRegTy = getType(ConstrainingReg);
+ if (RegTy.isValid() && ConstrainingRegTy.isValid() &&
+ RegTy != ConstrainingRegTy)
+ return false;
+ const auto ConstrainingRegCB = getRegClassOrRegBank(ConstrainingReg);
+ if (!ConstrainingRegCB.isNull()) {
+ const auto RegCB = getRegClassOrRegBank(Reg);
+ if (RegCB.isNull())
+ setRegClassOrRegBank(Reg, ConstrainingRegCB);
+ else if (isa<const TargetRegisterClass *>(RegCB) !=
+ isa<const TargetRegisterClass *>(ConstrainingRegCB))
+ return false;
+ else if (isa<const TargetRegisterClass *>(RegCB)) {
+ if (!::constrainRegClass(
+ *this, Reg, cast<const TargetRegisterClass *>(RegCB),
+ cast<const TargetRegisterClass *>(ConstrainingRegCB), MinNumRegs))
+ return false;
+ } else if (RegCB != ConstrainingRegCB)
+ return false;
+ }
+ if (ConstrainingRegTy.isValid())
+ setType(Reg, ConstrainingRegTy);
+ return true;
+}
+
+bool
+MachineRegisterInfo::recomputeRegClass(Register Reg) {
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetRegisterClass *OldRC = getRegClass(Reg);
+ const TargetRegisterClass *NewRC =
+ getTargetRegisterInfo()->getLargestLegalSuperClass(OldRC, *MF);
+
+ // Stop early if there is no room to grow.
+ if (NewRC == OldRC)
+ return false;
+
+ // Accumulate constraints from all uses.
+ for (MachineOperand &MO : reg_nodbg_operands(Reg)) {
+ // Apply the effect of the given operand to NewRC.
+ MachineInstr *MI = MO.getParent();
+ unsigned OpNo = &MO - &MI->getOperand(0);
+ NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII,
+ getTargetRegisterInfo());
+ if (!NewRC || NewRC == OldRC)
+ return false;
+ }
+ setRegClass(Reg, NewRC);
+ return true;
+}
+
+Register MachineRegisterInfo::createIncompleteVirtualRegister(StringRef Name) {
+ Register Reg = Register::index2VirtReg(getNumVirtRegs());
+ VRegInfo.grow(Reg);
+ RegAllocHints.grow(Reg);
+ insertVRegByName(Name, Reg);
+ return Reg;
+}
+
+/// createVirtualRegister - Create and return a new virtual register in the
+/// function with the specified register class.
+///
+Register
+MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
+ StringRef Name) {
+ assert(RegClass && "Cannot create register without RegClass!");
+ assert(RegClass->isAllocatable() &&
+ "Virtual register RegClass must be allocatable.");
+
+ // New virtual register number.
+ Register Reg = createIncompleteVirtualRegister(Name);
+ VRegInfo[Reg].first = RegClass;
+ noteNewVirtualRegister(Reg);
+ return Reg;
+}
+
+Register MachineRegisterInfo::cloneVirtualRegister(Register VReg,
+ StringRef Name) {
+ Register Reg = createIncompleteVirtualRegister(Name);
+ VRegInfo[Reg].first = VRegInfo[VReg].first;
+ setType(Reg, getType(VReg));
+ noteCloneVirtualRegister(Reg, VReg);
+ return Reg;
+}
+
+void MachineRegisterInfo::setType(Register VReg, LLT Ty) {
+ VRegToType.grow(VReg);
+ VRegToType[VReg] = Ty;
+}
+
+Register
+MachineRegisterInfo::createGenericVirtualRegister(LLT Ty, StringRef Name) {
+ // New virtual register number.
+ Register Reg = createIncompleteVirtualRegister(Name);
+ // FIXME: Should we use a dummy register class?
+ VRegInfo[Reg].first = static_cast<RegisterBank *>(nullptr);
+ setType(Reg, Ty);
+ noteNewVirtualRegister(Reg);
+ return Reg;
+}
+
+void MachineRegisterInfo::clearVirtRegTypes() { VRegToType.clear(); }
+
+/// clearVirtRegs - Remove all virtual registers (after physreg assignment).
+void MachineRegisterInfo::clearVirtRegs() {
+#ifndef NDEBUG
+ for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ if (!VRegInfo[Reg].second)
+ continue;
+ verifyUseList(Reg);
+ errs() << "Remaining virtual register "
+ << printReg(Reg, getTargetRegisterInfo()) << "...\n";
+ for (MachineInstr &MI : reg_instructions(Reg))
+ errs() << "...in instruction: " << MI << "\n";
+ std::abort();
+ }
+#endif
+ VRegInfo.clear();
+ for (auto &I : LiveIns)
+ I.second = 0;
+}
+
+void MachineRegisterInfo::verifyUseList(Register Reg) const {
+#ifndef NDEBUG
+ bool Valid = true;
+ for (MachineOperand &M : reg_operands(Reg)) {
+ MachineOperand *MO = &M;
+ MachineInstr *MI = MO->getParent();
+ if (!MI) {
+ errs() << printReg(Reg, getTargetRegisterInfo())
+ << " use list MachineOperand " << MO
+ << " has no parent instruction.\n";
+ Valid = false;
+ continue;
+ }
+ MachineOperand *MO0 = &MI->getOperand(0);
+ unsigned NumOps = MI->getNumOperands();
+ if (!(MO >= MO0 && MO < MO0+NumOps)) {
+ errs() << printReg(Reg, getTargetRegisterInfo())
+ << " use list MachineOperand " << MO
+ << " doesn't belong to parent MI: " << *MI;
+ Valid = false;
+ }
+ if (!MO->isReg()) {
+ errs() << printReg(Reg, getTargetRegisterInfo())
+ << " MachineOperand " << MO << ": " << *MO
+ << " is not a register\n";
+ Valid = false;
+ }
+ if (MO->getReg() != Reg) {
+ errs() << printReg(Reg, getTargetRegisterInfo())
+ << " use-list MachineOperand " << MO << ": "
+ << *MO << " is the wrong register\n";
+ Valid = false;
+ }
+ }
+ assert(Valid && "Invalid use list");
+#endif
+}
+
+void MachineRegisterInfo::verifyUseLists() const {
+#ifndef NDEBUG
+ for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i)
+ verifyUseList(Register::index2VirtReg(i));
+ for (unsigned i = 1, e = getTargetRegisterInfo()->getNumRegs(); i != e; ++i)
+ verifyUseList(i);
+#endif
+}
+
+/// Add MO to the linked list of operands for its register.
+void MachineRegisterInfo::addRegOperandToUseList(MachineOperand *MO) {
+ assert(!MO->isOnRegUseList() && "Already on list");
+ MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg());
+ MachineOperand *const Head = HeadRef;
+
+ // Head points to the first list element.
+ // Next is NULL on the last list element.
+ // Prev pointers are circular, so Head->Prev == Last.
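+ //
+ // Sketch of a three-operand list:
+ // Next: Head -> MO1 -> MO2 -> MO3 -> null
+ // Prev: MO1 -> MO3, MO2 -> MO1, MO3 -> MO2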
+
+ // Head is NULL for an empty list.
+ if (!Head) {
+ MO->Contents.Reg.Prev = MO;
+ MO->Contents.Reg.Next = nullptr;
+ HeadRef = MO;
+ return;
+ }
+ assert(MO->getReg() == Head->getReg() && "Different regs on the same list!");
+
+ // Insert MO between Last and Head in the circular Prev chain.
+ MachineOperand *Last = Head->Contents.Reg.Prev;
+ assert(Last && "Inconsistent use list");
+ assert(MO->getReg() == Last->getReg() && "Different regs on the same list!");
+ Head->Contents.Reg.Prev = MO;
+ MO->Contents.Reg.Prev = Last;
+
+ // Def operands always precede uses. This allows def_iterator to stop early.
+ // Insert def operands at the front, and use operands at the back.
+ if (MO->isDef()) {
+ // Insert def at the front.
+ MO->Contents.Reg.Next = Head;
+ HeadRef = MO;
+ } else {
+ // Insert use at the end.
+ MO->Contents.Reg.Next = nullptr;
+ Last->Contents.Reg.Next = MO;
+ }
+}
+
+/// Remove MO from its use-def list.
+void MachineRegisterInfo::removeRegOperandFromUseList(MachineOperand *MO) {
+ assert(MO->isOnRegUseList() && "Operand not on use list");
+ MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg());
+ MachineOperand *const Head = HeadRef;
+ assert(Head && "List already empty");
+
+ // Unlink this from the doubly linked list of operands.
+ MachineOperand *Next = MO->Contents.Reg.Next;
+ MachineOperand *Prev = MO->Contents.Reg.Prev;
+
+ // Prev links are circular, next link is NULL instead of looping back to Head.
+ if (MO == Head)
+ HeadRef = Next;
+ else
+ Prev->Contents.Reg.Next = Next;
+
+ (Next ? Next : Head)->Contents.Reg.Prev = Prev;
+
+ MO->Contents.Reg.Prev = nullptr;
+ MO->Contents.Reg.Next = nullptr;
+}
+
+/// Move NumOps operands from Src to Dst, updating use-def lists as needed.
+///
+/// The Dst range is assumed to be uninitialized memory. (Or it may contain
+/// operands that won't be destroyed, which is OK because the MO destructor is
+/// trivial anyway).
+///
+/// The Src and Dst ranges may overlap.
+void MachineRegisterInfo::moveOperands(MachineOperand *Dst,
+ MachineOperand *Src,
+ unsigned NumOps) {
+ assert(Src != Dst && NumOps && "Noop moveOperands");
+
+ // Copy backwards if Dst is within the Src range.
+ int Stride = 1;
+ if (Dst >= Src && Dst < Src + NumOps) {
+ Stride = -1;
+ Dst += NumOps - 1;
+ Src += NumOps - 1;
+ }
+
+ // Copy one operand at a time.
+ do {
+ new (Dst) MachineOperand(*Src);
+
+ // Dst takes Src's place in the use-def chain.
+ if (Src->isReg()) {
+ MachineOperand *&Head = getRegUseDefListHead(Src->getReg());
+ MachineOperand *Prev = Src->Contents.Reg.Prev;
+ MachineOperand *Next = Src->Contents.Reg.Next;
+ assert(Head && "List empty, but operand is chained");
+ assert(Prev && "Operand was not on use-def list");
+
+ // Prev links are circular, next link is NULL instead of looping back to
+ // Head.
+ if (Src == Head)
+ Head = Dst;
+ else
+ Prev->Contents.Reg.Next = Dst;
+
+ // Update Prev pointer. This also works when Src was pointing to itself
+ // in a 1-element list. In that case Head == Dst.
+ (Next ? Next : Head)->Contents.Reg.Prev = Dst;
+ }
+
+ Dst += Stride;
+ Src += Stride;
+ } while (--NumOps);
+}
+
+/// replaceRegWith - Replace all instances of FromReg with ToReg in the
+/// machine function. This is like llvm-level X->replaceAllUsesWith(Y),
+/// except that it also changes any definitions of the register as well.
+/// If ToReg is a physical register we apply the sub register to obtain the
+/// final/proper physical register.
+void MachineRegisterInfo::replaceRegWith(Register FromReg, Register ToReg) {
+ assert(FromReg != ToReg && "Cannot replace a reg with itself");
+
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+
+ // TODO: This could be more efficient by bulk changing the operands.
+ for (MachineOperand &O : llvm::make_early_inc_range(reg_operands(FromReg))) {
+ if (ToReg.isPhysical()) {
+ O.substPhysReg(ToReg, *TRI);
+ } else {
+ O.setReg(ToReg);
+ }
+ }
+}
+
+/// getVRegDef - Return the machine instr that defines the specified virtual
+/// register or null if none is found. This assumes that the code is in SSA
+/// form, so there should only be one definition.
+MachineInstr *MachineRegisterInfo::getVRegDef(Register Reg) const {
+ // Since we are in SSA form, we can use the first definition.
+ def_instr_iterator I = def_instr_begin(Reg);
+ assert((I.atEnd() || std::next(I) == def_instr_end()) &&
+ "getVRegDef assumes a single definition or no definition");
+ return !I.atEnd() ? &*I : nullptr;
+}
+
+/// getUniqueVRegDef - Return the unique machine instr that defines the
+/// specified virtual register or null if none is found. If there are
+/// multiple definitions or no definition, return null.
+MachineInstr *MachineRegisterInfo::getUniqueVRegDef(Register Reg) const {
+ if (def_empty(Reg)) return nullptr;
+ def_instr_iterator I = def_instr_begin(Reg);
+ if (std::next(I) != def_instr_end())
+ return nullptr;
+ return &*I;
+}
+
+bool MachineRegisterInfo::hasOneNonDBGUse(Register RegNo) const {
+ return hasSingleElement(use_nodbg_operands(RegNo));
+}
+
+bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
+ return hasSingleElement(use_nodbg_instructions(RegNo));
+}
+
+bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
+ unsigned MaxUsers) const {
+ return hasNItemsOrLess(use_instr_nodbg_begin(Reg), use_instr_nodbg_end(),
+ MaxUsers);
+}
+
+/// clearKillFlags - Iterate over all the uses of the given register and
+/// clear the kill flag from the MachineOperand. This function is used by
+/// optimization passes which extend register lifetimes and need only
+/// preserve conservative kill flag information.
+void MachineRegisterInfo::clearKillFlags(Register Reg) const {
+ for (MachineOperand &MO : use_operands(Reg))
+ MO.setIsKill(false);
+}
+
+bool MachineRegisterInfo::isLiveIn(Register Reg) const {
+ for (const std::pair<MCRegister, Register> &LI : liveins())
+ if ((Register)LI.first == Reg || LI.second == Reg)
+ return true;
+ return false;
+}
+
+/// getLiveInPhysReg - If VReg is a live-in virtual register, return the
+/// corresponding live-in physical register.
+MCRegister MachineRegisterInfo::getLiveInPhysReg(Register VReg) const {
+ for (const std::pair<MCRegister, Register> &LI : liveins())
+ if (LI.second == VReg)
+ return LI.first;
+ return MCRegister();
+}
+
+/// getLiveInVirtReg - If PReg is a live-in physical register, return the
+/// corresponding live-in virtual register.
+Register MachineRegisterInfo::getLiveInVirtReg(MCRegister PReg) const {
+ for (const std::pair<MCRegister, Register> &LI : liveins())
+ if (LI.first == PReg)
+ return LI.second;
+ return Register();
+}
+
+/// EmitLiveInCopies - Emit copies to initialize livein virtual registers
+/// into the given entry block.
+void
+MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII) {
+ // Emit the copies into the top of the block.
+ for (unsigned i = 0, e = LiveIns.size(); i != e; ++i)
+ if (LiveIns[i].second) {
+ if (use_nodbg_empty(LiveIns[i].second)) {
+ // The livein has no non-dbg uses. Drop it.
+ //
+ // It would be preferable to have isel avoid creating live-in
+ // records for unused arguments in the first place, but it's
+ // complicated by the debug info code for arguments.
+ LiveIns.erase(LiveIns.begin() + i);
+ --i; --e;
+ } else {
+ // Emit a copy.
+ BuildMI(*EntryMBB, EntryMBB->begin(), DebugLoc(),
+ TII.get(TargetOpcode::COPY), LiveIns[i].second)
+ .addReg(LiveIns[i].first);
+
+ // Add the register to the entry block live-in set.
+ EntryMBB->addLiveIn(LiveIns[i].first);
+ }
+ } else {
+ // Add the register to the entry block live-in set.
+ EntryMBB->addLiveIn(LiveIns[i].first);
+ }
+}
+
+LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(Register Reg) const {
+ // Lane masks are only defined for vregs.
+ assert(Reg.isVirtual());
+ const TargetRegisterClass &TRC = *getRegClass(Reg);
+ return TRC.getLaneMask();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(Register Reg) const {
+ for (MachineInstr &I : use_instructions(Reg))
+ I.dump();
+}
+#endif
+
+void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) {
+ ReservedRegs = getTargetRegisterInfo()->getReservedRegs(MF);
+ assert(ReservedRegs.size() == getTargetRegisterInfo()->getNumRegs() &&
+ "Invalid ReservedRegs vector from target");
+}
+
+bool MachineRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
+ assert(Register::isPhysicalRegister(PhysReg));
+
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ if (TRI->isConstantPhysReg(PhysReg))
+ return true;
+
+ // Check if any overlapping register is modified, or allocatable so it may be
+ // used later.
+ for (MCRegAliasIterator AI(PhysReg, TRI, true);
+ AI.isValid(); ++AI)
+ if (!def_empty(*AI) || isAllocatable(*AI))
+ return false;
+ return true;
+}
+
+/// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
+/// specified register as undefined, which causes the DBG_VALUE to be
+/// deleted during LiveDebugVariables analysis.
+void MachineRegisterInfo::markUsesInDebugValueAsUndef(Register Reg) const {
+ // Mark any DBG_VALUE* that uses Reg as undef (but don't delete it.)
+ // We use make_early_inc_range because setReg invalidates the iterator.
+ for (MachineInstr &UseMI : llvm::make_early_inc_range(use_instructions(Reg))) {
+ if (UseMI.isDebugValue() && UseMI.hasDebugOperandForReg(Reg))
+ UseMI.setDebugValueUndef();
+ }
+}
+
+static const Function *getCalledFunction(const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isGlobal())
+ continue;
+ const Function *Func = dyn_cast<Function>(MO.getGlobal());
+ if (Func != nullptr)
+ return Func;
+ }
+ return nullptr;
+}
+
+static bool isNoReturnDef(const MachineOperand &MO) {
+ // Anything which is not a noreturn function is a real def.
+ const MachineInstr &MI = *MO.getParent();
+ if (!MI.isCall())
+ return false;
+ const MachineBasicBlock &MBB = *MI.getParent();
+ if (!MBB.succ_empty())
+ return false;
+ const MachineFunction &MF = *MBB.getParent();
+ // We need to keep correct unwind information even if the function will
+ // not return, since the runtime may need it.
+ if (MF.getFunction().hasFnAttribute(Attribute::UWTable))
+ return false;
+ const Function *Called = getCalledFunction(MI);
+ return Called != nullptr && Called->hasFnAttribute(Attribute::NoReturn) &&
+ Called->hasFnAttribute(Attribute::NoUnwind);
+}
+
+bool MachineRegisterInfo::isPhysRegModified(MCRegister PhysReg,
+ bool SkipNoReturnDef) const {
+ if (UsedPhysRegMask.test(PhysReg))
+ return true;
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) {
+ for (const MachineOperand &MO : make_range(def_begin(*AI), def_end())) {
+ if (!SkipNoReturnDef && isNoReturnDef(MO))
+ continue;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool MachineRegisterInfo::isPhysRegUsed(MCRegister PhysReg,
+ bool SkipRegMaskTest) const {
+ if (!SkipRegMaskTest && UsedPhysRegMask.test(PhysReg))
+ return true;
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ for (MCRegAliasIterator AliasReg(PhysReg, TRI, true); AliasReg.isValid();
+ ++AliasReg) {
+ if (!reg_nodbg_empty(*AliasReg))
+ return true;
+ }
+ return false;
+}
+
+void MachineRegisterInfo::disableCalleeSavedRegister(MCRegister Reg) {
+
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ assert(Reg && (Reg < TRI->getNumRegs()) &&
+ "Trying to disable an invalid register");
+
+ if (!IsUpdatedCSRsInitialized) {
+ const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+ for (const MCPhysReg *I = CSR; *I; ++I)
+ UpdatedCSRs.push_back(*I);
+
+ // Zero value represents the end of the register list
+ // (no more registers should be pushed).
+ UpdatedCSRs.push_back(0);
+
+ IsUpdatedCSRsInitialized = true;
+ }
+
+ // Remove the register (and its aliases from the list).
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ llvm::erase_value(UpdatedCSRs, *AI);
+}
+
+const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const {
+ if (IsUpdatedCSRsInitialized)
+ return UpdatedCSRs.data();
+
+ return getTargetRegisterInfo()->getCalleeSavedRegs(MF);
+}
+
+void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) {
+ if (IsUpdatedCSRsInitialized)
+ UpdatedCSRs.clear();
+
+ append_range(UpdatedCSRs, CSRs);
+
+ // Zero value represents the end of the register list
+ // (no more registers should be pushed).
+ UpdatedCSRs.push_back(0);
+ IsUpdatedCSRsInitialized = true;
+}
+
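+/// Return true if the given register unit is considered reserved, i.e. for
+/// some root register of the unit, the root and all of its super-registers
+/// are reserved.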
+bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
+ if (all_of(TRI->superregs_inclusive(*Root),
+ [&](MCPhysReg Super) { return isReserved(Super); }))
+ return true;
+ }
+ return false;
+}
+
+bool MachineRegisterInfo::isArgumentRegister(const MachineFunction &MF,
+ MCRegister Reg) const {
+ return getTargetRegisterInfo()->isArgumentRegister(MF, Reg);
+}
+
+bool MachineRegisterInfo::isFixedRegister(const MachineFunction &MF,
+ MCRegister Reg) const {
+ return getTargetRegisterInfo()->isFixedRegister(MF, Reg);
+}
+
+bool MachineRegisterInfo::isGeneralPurposeRegister(const MachineFunction &MF,
+ MCRegister Reg) const {
+ return getTargetRegisterInfo()->isGeneralPurposeRegister(MF, Reg);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineSSAContext.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineSSAContext.cpp
new file mode 100644
index 000000000000..324084fb9c32
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -0,0 +1,82 @@
+//===- MachineSSAContext.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a specialization of the GenericSSAContext<X>
+/// template class for Machine IR.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSSAContext.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void MachineSSAContext::setFunction(MachineFunction &Fn) {
+ MF = &Fn;
+ RegInfo = &MF->getRegInfo();
+}
+
+MachineBasicBlock *MachineSSAContext::getEntryBlock(MachineFunction &F) {
+ return &F.front();
+}
+
+void MachineSSAContext::appendBlockTerms(
+ SmallVectorImpl<const MachineInstr *> &terms,
+ const MachineBasicBlock &block) {
+ for (auto &T : block.terminators())
+ terms.push_back(&T);
+}
+
+void MachineSSAContext::appendBlockDefs(SmallVectorImpl<Register> &defs,
+ const MachineBasicBlock &block) {
+ for (const MachineInstr &instr : block.instrs()) {
+ for (const MachineOperand &op : instr.all_defs())
+ defs.push_back(op.getReg());
+ }
+}
+
+/// Get the defining block of a value.
+MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const {
+ if (!value)
+ return nullptr;
+ return RegInfo->getVRegDef(value)->getParent();
+}
+
+bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) {
+ return Phi.isConstantValuePHI();
+}
+
+Printable MachineSSAContext::print(const MachineBasicBlock *Block) const {
+ if (!Block)
+ return Printable([](raw_ostream &Out) { Out << "<nullptr>"; });
+ return Printable([Block](raw_ostream &Out) { Block->printName(Out); });
+}
+
+Printable MachineSSAContext::print(const MachineInstr *I) const {
+ return Printable([I](raw_ostream &Out) { I->print(Out); });
+}
+
+Printable MachineSSAContext::print(Register Value) const {
+ auto *MRI = RegInfo;
+ return Printable([MRI, Value](raw_ostream &Out) {
+ Out << printReg(Value, MRI->getTargetRegisterInfo(), 0, MRI);
+
+ if (Value) {
+ // Try to print the definition.
+ if (auto *Instr = MRI->getUniqueVRegDef(Value)) {
+ Out << ": ";
+ Instr->print(Out);
+ }
+ }
+ });
+}
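
A short illustration of how the Printable helpers above compose with raw_ostream; dumpSSAValue is a hypothetical debugging helper and assumes setFunction() has already been called on the context.

#include "llvm/CodeGen/MachineSSAContext.h"
#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper: print a virtual register and its defining block using
// the MachineSSAContext printers defined in this file.
static void dumpSSAValue(const MachineSSAContext &Ctx, Register V) {
  errs() << "value " << Ctx.print(V) << "\n  defined in "
         << Ctx.print(Ctx.getDefBlock(V)) << "\n";
}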
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineSSAUpdater.cpp
new file mode 100644
index 000000000000..48076663ddf5
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineSSAUpdater.cpp
@@ -0,0 +1,373 @@
+//===- MachineSSAUpdater.cpp - Unstructured SSA Update Tool ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MachineSSAUpdater class. It's based on SSAUpdater
+// class in lib/Transforms/Utils.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-ssaupdater"
+
+using AvailableValsTy = DenseMap<MachineBasicBlock *, Register>;
+
+static AvailableValsTy &getAvailableVals(void *AV) {
+ return *static_cast<AvailableValsTy*>(AV);
+}
+
+MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
+ SmallVectorImpl<MachineInstr*> *NewPHI)
+ : InsertedPHIs(NewPHI), TII(MF.getSubtarget().getInstrInfo()),
+ MRI(&MF.getRegInfo()) {}
+
+MachineSSAUpdater::~MachineSSAUpdater() {
+ delete static_cast<AvailableValsTy*>(AV);
+}
+
+/// Initialize - Reset this object to get ready for a new set of SSA
+/// updates.
+void MachineSSAUpdater::Initialize(const TargetRegisterClass *RC) {
+ if (!AV)
+ AV = new AvailableValsTy();
+ else
+ getAvailableVals(AV).clear();
+
+ VRC = RC;
+}
+
+void MachineSSAUpdater::Initialize(Register V) {
+ Initialize(MRI->getRegClass(V));
+}
+
+/// HasValueForBlock - Return true if the MachineSSAUpdater already has a value for
+/// the specified block.
+bool MachineSSAUpdater::HasValueForBlock(MachineBasicBlock *BB) const {
+ return getAvailableVals(AV).count(BB);
+}
+
+/// AddAvailableValue - Indicate that a rewritten value is available in the
+/// specified block with the specified value.
+void MachineSSAUpdater::AddAvailableValue(MachineBasicBlock *BB, Register V) {
+ getAvailableVals(AV)[BB] = V;
+}
+
+/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
+/// live at the end of the specified block.
+Register MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
+ return GetValueAtEndOfBlockInternal(BB);
+}
+
+static
+Register LookForIdenticalPHI(MachineBasicBlock *BB,
+ SmallVectorImpl<std::pair<MachineBasicBlock *, Register>> &PredValues) {
+ if (BB->empty())
+ return Register();
+
+ MachineBasicBlock::iterator I = BB->begin();
+ if (!I->isPHI())
+ return Register();
+
+ AvailableValsTy AVals;
+ for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+ AVals[PredValues[i].first] = PredValues[i].second;
+ while (I != BB->end() && I->isPHI()) {
+ bool Same = true;
+ for (unsigned i = 1, e = I->getNumOperands(); i != e; i += 2) {
+ Register SrcReg = I->getOperand(i).getReg();
+ MachineBasicBlock *SrcBB = I->getOperand(i+1).getMBB();
+ if (AVals[SrcBB] != SrcReg) {
+ Same = false;
+ break;
+ }
+ }
+ if (Same)
+ return I->getOperand(0).getReg();
+ ++I;
+ }
+ return Register();
+}
+
+/// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which defines
+/// a value of the given register class at the start of the specified basic
+/// block. It returns the virtual register defined by the instruction.
+static
+MachineInstrBuilder InsertNewDef(unsigned Opcode,
+ MachineBasicBlock *BB, MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ MachineRegisterInfo *MRI,
+ const TargetInstrInfo *TII) {
+ Register NewVR = MRI->createVirtualRegister(RC);
+ return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR);
+}
+
+/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
+/// is live in the middle of the specified block. If ExistingValueOnly is
+/// true then this will only return an existing value or $noreg; otherwise new
+/// instructions may be inserted to materialize a value.
+///
+/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
+/// important case: if there is a definition of the rewritten value after the
+/// 'use' in BB. Consider code like this:
+///
+/// X1 = ...
+/// SomeBB:
+/// use(X)
+/// X2 = ...
+/// br Cond, SomeBB, OutBB
+///
+/// In this case, there are two values (X1 and X2) added to the AvailableVals
+/// set by the client of the rewriter, and those values are both live out of
+/// their respective blocks. However, the use of X happens in the *middle* of
+/// a block. Because of this, we need to insert a new PHI node in SomeBB to
+/// merge the appropriate values, and this value isn't live out of the block.
+Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB,
+ bool ExistingValueOnly) {
+ // If there is no definition of the renamed variable in this block, just use
+ // GetValueAtEndOfBlock to do our work.
+ if (!HasValueForBlock(BB))
+ return GetValueAtEndOfBlockInternal(BB, ExistingValueOnly);
+
+ // If there are no predecessors, just return undef.
+ if (BB->pred_empty()) {
+ // If we cannot insert new instructions, just return $noreg.
+ if (ExistingValueOnly)
+ return Register();
+ // Insert an implicit_def to represent an undef value.
+ MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
+ BB, BB->getFirstTerminator(),
+ VRC, MRI, TII);
+ return NewDef->getOperand(0).getReg();
+ }
+
+ // Otherwise, we have the hard case. Get the live-in values for each
+ // predecessor.
+ SmallVector<std::pair<MachineBasicBlock*, Register>, 8> PredValues;
+ Register SingularValue;
+
+ bool isFirstPred = true;
+ for (MachineBasicBlock *PredBB : BB->predecessors()) {
+ Register PredVal = GetValueAtEndOfBlockInternal(PredBB, ExistingValueOnly);
+ PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+ // Compute SingularValue.
+ if (isFirstPred) {
+ SingularValue = PredVal;
+ isFirstPred = false;
+ } else if (PredVal != SingularValue)
+ SingularValue = Register();
+ }
+
+ // If all the merged values are the same, just use it.
+ if (SingularValue)
+ return SingularValue;
+
+ // If an identical PHI is already in BB, just reuse it.
+ Register DupPHI = LookForIdenticalPHI(BB, PredValues);
+ if (DupPHI)
+ return DupPHI;
+
+ // If we cannot create new instructions, return $noreg now.
+ if (ExistingValueOnly)
+ return Register();
+
+ // Otherwise, we do need a PHI: insert one now.
+ MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
+ MachineInstrBuilder InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB,
+ Loc, VRC, MRI, TII);
+
+ // Fill in all the predecessors of the PHI.
+ for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+ InsertedPHI.addReg(PredValues[i].second).addMBB(PredValues[i].first);
+
+ // See if the PHI node can be merged to a single value. This can happen in
+ // loop cases when we get a PHI of itself and one other value.
+ if (unsigned ConstVal = InsertedPHI->isConstantValuePHI()) {
+ InsertedPHI->eraseFromParent();
+ return ConstVal;
+ }
+
+ // If the client wants to know about all new instructions, tell it.
+ if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+ LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
+ return InsertedPHI.getReg(0);
+}
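
A hedged usage sketch of this updater, for the common case where duplication has produced a second definition NewReg of a value originally defined as OldReg; every name below is a placeholder rather than code from this file.

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"

using namespace llvm;

static void rewriteAfterDuplication(MachineFunction &MF, Register OldReg,
                                    MachineBasicBlock &OrigBB, Register NewReg,
                                    MachineBasicBlock &DupBB) {
  MachineSSAUpdater SSAUpdate(MF);
  SSAUpdate.Initialize(OldReg); // adopt OldReg's register class
  // Record which value is live out of each defining block.
  SSAUpdate.AddAvailableValue(&OrigBB, OldReg);
  SSAUpdate.AddAvailableValue(&DupBB, NewReg);
  // Retarget every use of OldReg; PHIs are inserted on demand by
  // GetValueInMiddleOfBlock() / GetValueAtEndOfBlockInternal().
  MachineRegisterInfo &MRI = MF.getRegInfo();
  for (MachineOperand &MO : make_early_inc_range(MRI.use_operands(OldReg)))
    SSAUpdate.RewriteUse(MO);
}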
+
+static
+MachineBasicBlock *findCorrespondingPred(const MachineInstr *MI,
+ MachineOperand *U) {
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+ if (&MI->getOperand(i) == U)
+ return MI->getOperand(i+1).getMBB();
+ }
+
+ llvm_unreachable("MachineOperand::getParent() failure?");
+}
+
+/// RewriteUse - Rewrite a use of the symbolic value. This handles PHI nodes,
+/// which use their value in the corresponding predecessor.
+void MachineSSAUpdater::RewriteUse(MachineOperand &U) {
+ MachineInstr *UseMI = U.getParent();
+ Register NewVR;
+ if (UseMI->isPHI()) {
+ MachineBasicBlock *SourceBB = findCorrespondingPred(UseMI, &U);
+ NewVR = GetValueAtEndOfBlockInternal(SourceBB);
+ } else {
+ NewVR = GetValueInMiddleOfBlock(UseMI->getParent());
+ }
+
+ U.setReg(NewVR);
+}
+
+namespace llvm {
+
+/// SSAUpdaterTraits<MachineSSAUpdater> - Traits for the SSAUpdaterImpl
+/// template, specialized for MachineSSAUpdater.
+template<>
+class SSAUpdaterTraits<MachineSSAUpdater> {
+public:
+ using BlkT = MachineBasicBlock;
+ using ValT = Register;
+ using PhiT = MachineInstr;
+ using BlkSucc_iterator = MachineBasicBlock::succ_iterator;
+
+ static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); }
+ static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); }
+
+ /// Iterator for PHI operands.
+ class PHI_iterator {
+ private:
+ MachineInstr *PHI;
+ unsigned idx;
+
+ public:
+ explicit PHI_iterator(MachineInstr *P) // begin iterator
+ : PHI(P), idx(1) {}
+ PHI_iterator(MachineInstr *P, bool) // end iterator
+ : PHI(P), idx(PHI->getNumOperands()) {}
+
+ PHI_iterator &operator++() { idx += 2; return *this; }
+ bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+ bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+
+ unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); }
+
+ MachineBasicBlock *getIncomingBlock() {
+ return PHI->getOperand(idx+1).getMBB();
+ }
+ };
+
+ static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+
+ static inline PHI_iterator PHI_end(PhiT *PHI) {
+ return PHI_iterator(PHI, true);
+ }
+
+ /// FindPredecessorBlocks - Put the predecessors of BB into the Preds
+ /// vector.
+ static void FindPredecessorBlocks(MachineBasicBlock *BB,
+ SmallVectorImpl<MachineBasicBlock*> *Preds){
+ append_range(*Preds, BB->predecessors());
+ }
+
+ /// GetUndefVal - Create an IMPLICIT_DEF instruction with a new register.
+ /// Add it into the specified block and return the register.
+ static Register GetUndefVal(MachineBasicBlock *BB,
+ MachineSSAUpdater *Updater) {
+ // Insert an implicit_def to represent an undef value.
+ MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
+ BB, BB->getFirstNonPHI(),
+ Updater->VRC, Updater->MRI,
+ Updater->TII);
+ return NewDef->getOperand(0).getReg();
+ }
+
+ /// CreateEmptyPHI - Create a PHI instruction that defines a new register.
+ /// Add it into the specified block and return the register.
+ static Register CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds,
+ MachineSSAUpdater *Updater) {
+ MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
+ MachineInstr *PHI = InsertNewDef(TargetOpcode::PHI, BB, Loc,
+ Updater->VRC, Updater->MRI,
+ Updater->TII);
+ return PHI->getOperand(0).getReg();
+ }
+
+ /// AddPHIOperand - Add the specified value as an operand of the PHI for
+ /// the specified predecessor block.
+ static void AddPHIOperand(MachineInstr *PHI, Register Val,
+ MachineBasicBlock *Pred) {
+ MachineInstrBuilder(*Pred->getParent(), PHI).addReg(Val).addMBB(Pred);
+ }
+
+ /// InstrIsPHI - Check if an instruction is a PHI.
+ static MachineInstr *InstrIsPHI(MachineInstr *I) {
+ if (I && I->isPHI())
+ return I;
+ return nullptr;
+ }
+
+ /// ValueIsPHI - Check if the instruction that defines the specified register
+ /// is a PHI instruction.
+ static MachineInstr *ValueIsPHI(Register Val, MachineSSAUpdater *Updater) {
+ return InstrIsPHI(Updater->MRI->getVRegDef(Val));
+ }
+
+ /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+ /// operands, i.e., it was just added.
+ static MachineInstr *ValueIsNewPHI(Register Val, MachineSSAUpdater *Updater) {
+ MachineInstr *PHI = ValueIsPHI(Val, Updater);
+ if (PHI && PHI->getNumOperands() <= 1)
+ return PHI;
+ return nullptr;
+ }
+
+ /// GetPHIValue - For the specified PHI instruction, return the register
+ /// that it defines.
+ static Register GetPHIValue(MachineInstr *PHI) {
+ return PHI->getOperand(0).getReg();
+ }
+};
+
+} // end namespace llvm
+
+/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
+/// for the specified BB and if so, return it. If not, construct SSA form by
+/// first calculating the required placement of PHIs and then inserting new
+/// PHIs where needed.
+Register
+MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB,
+ bool ExistingValueOnly) {
+ AvailableValsTy &AvailableVals = getAvailableVals(AV);
+ Register ExistingVal = AvailableVals.lookup(BB);
+ if (ExistingVal || ExistingValueOnly)
+ return ExistingVal;
+
+ SSAUpdaterImpl<MachineSSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+ return Impl.GetValue(BB);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineScheduler.cpp
new file mode 100644
index 000000000000..ba5432459d12
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -0,0 +1,4332 @@
+//===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+STATISTIC(NumClustered, "Number of load/store pairs clustered");
+
+namespace llvm {
+
+cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
+ cl::desc("Force top-down list scheduling"));
+cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
+ cl::desc("Force bottom-up list scheduling"));
+cl::opt<bool>
+DumpCriticalPathLength("misched-dcpl", cl::Hidden,
+ cl::desc("Print critical path length to stdout"));
+
+cl::opt<bool> VerifyScheduling(
+ "verify-misched", cl::Hidden,
+ cl::desc("Verify machine instrs before and after machine scheduling"));
+
+#ifndef NDEBUG
+cl::opt<bool> ViewMISchedDAGs(
+ "view-misched-dags", cl::Hidden,
+ cl::desc("Pop up a window to show MISched dags after they are processed"));
+cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
+ cl::desc("Print schedule DAGs"));
+cl::opt<bool> MISchedDumpReservedCycles(
+ "misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
+ cl::desc("Dump resource usage at schedule boundary."));
+cl::opt<bool> MischedDetailResourceBooking(
+ "misched-detail-resource-booking", cl::Hidden, cl::init(false),
+ cl::desc("Show details of invoking getNextResoufceCycle."));
+#else
+const bool ViewMISchedDAGs = false;
+const bool PrintDAGs = false;
+const bool MischedDetailResourceBooking = false;
+#ifdef LLVM_ENABLE_DUMP
+const bool MISchedDumpReservedCycles = false;
+#endif // LLVM_ENABLE_DUMP
+#endif // NDEBUG
+
+} // end namespace llvm
+
+#ifndef NDEBUG
+/// In some situations a few uninteresting nodes depend on nearly all other
+/// nodes in the graph; provide a cutoff to hide them.
+static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden,
+ cl::desc("Hide nodes with more predecessor/successor than cutoff"));
+
+static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
+ cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
+
+static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden,
+ cl::desc("Only schedule this function"));
+static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
+ cl::desc("Only schedule this MBB#"));
+#endif // NDEBUG
+
+/// Avoid quadratic complexity in unusually large basic blocks by limiting the
+/// size of the ready lists.
+static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden,
+ cl::desc("Limit ready list to N instructions"), cl::init(256));
+
+static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
+ cl::desc("Enable register pressure scheduling."), cl::init(true));
+
+static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
+ cl::desc("Enable cyclic critical path analysis."), cl::init(true));
+
+static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
+ cl::desc("Enable memop clustering."),
+ cl::init(true));
+static cl::opt<bool>
+ ForceFastCluster("force-fast-cluster", cl::Hidden,
+ cl::desc("Switch to fast cluster algorithm with the lost "
+ "of some fusion opportunities"),
+ cl::init(false));
+static cl::opt<unsigned>
+ FastClusterThreshold("fast-cluster-threshold", cl::Hidden,
+ cl::desc("The threshold for fast cluster"),
+ cl::init(1000));
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+static cl::opt<bool> MISchedDumpScheduleTrace(
+ "misched-dump-schedule-trace", cl::Hidden, cl::init(false),
+ cl::desc("Dump resource usage at schedule boundary."));
+static cl::opt<unsigned>
+ HeaderColWidth("misched-dump-schedule-trace-col-header-width", cl::Hidden,
+ cl::desc("Set width of the columns with "
+ "the resources and schedule units"),
+ cl::init(19));
+static cl::opt<unsigned>
+ ColWidth("misched-dump-schedule-trace-col-width", cl::Hidden,
+ cl::desc("Set width of the columns showing resource booking."),
+ cl::init(5));
+static cl::opt<bool> MISchedSortResourcesInTrace(
+ "misched-sort-resources-in-trace", cl::Hidden, cl::init(true),
+ cl::desc("Sort the resources printed in the dump trace"));
+#endif
+
+static cl::opt<unsigned>
+ MIResourceCutOff("misched-resource-cutoff", cl::Hidden,
+ cl::desc("Number of intervals to track"), cl::init(10));
+
+// DAG subtrees must have at least this many nodes.
+static const unsigned MinSubtreeSize = 8;
+
+// Pin the vtables to this file.
+void MachineSchedStrategy::anchor() {}
+
+void ScheduleDAGMutation::anchor() {}
+
+//===----------------------------------------------------------------------===//
+// Machine Instruction Scheduling Pass and Registry
+//===----------------------------------------------------------------------===//
+
+MachineSchedContext::MachineSchedContext() {
+ RegClassInfo = new RegisterClassInfo();
+}
+
+MachineSchedContext::~MachineSchedContext() {
+ delete RegClassInfo;
+}
+
+namespace {
+
+/// Base class for a machine scheduler class that can run at any point.
+class MachineSchedulerBase : public MachineSchedContext,
+ public MachineFunctionPass {
+public:
+ MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {}
+
+ void print(raw_ostream &O, const Module* = nullptr) const override;
+
+protected:
+ void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags);
+};
+
+/// MachineScheduler runs after coalescing and before register allocation.
+class MachineScheduler : public MachineSchedulerBase {
+public:
+ MachineScheduler();
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction&) override;
+
+ static char ID; // Class identification, replacement for typeinfo
+
+protected:
+ ScheduleDAGInstrs *createMachineScheduler();
+};
+
+/// PostMachineScheduler runs shortly before code emission.
+class PostMachineScheduler : public MachineSchedulerBase {
+public:
+ PostMachineScheduler();
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction&) override;
+
+ static char ID; // Class identification, replacement for typeinfo
+
+protected:
+ ScheduleDAGInstrs *createPostMachineScheduler();
+};
+
+} // end anonymous namespace
+
+char MachineScheduler::ID = 0;
+
+char &llvm::MachineSchedulerID = MachineScheduler::ID;
+
+INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
+ "Machine Instruction Scheduler", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE,
+ "Machine Instruction Scheduler", false, false)
+
+MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) {
+ initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
+}
+
+void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+char PostMachineScheduler::ID = 0;
+
+char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
+
+INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
+ "PostRA Machine Instruction Scheduler", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
+ "PostRA Machine Instruction Scheduler", false, false)
+
+PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) {
+ initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry());
+}
+
+void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachinePassRegistry<MachineSchedRegistry::ScheduleDAGCtor>
+ MachineSchedRegistry::Registry;
+
+/// A dummy default scheduler factory indicates whether the scheduler
+/// is overridden on the command line.
+static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
+ return nullptr;
+}
+
+/// MachineSchedOpt allows command line selection of the scheduler.
+static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
+ RegisterPassParser<MachineSchedRegistry>>
+MachineSchedOpt("misched",
+ cl::init(&useDefaultMachineSched), cl::Hidden,
+ cl::desc("Machine instruction scheduler to use"));
+
+static MachineSchedRegistry
+DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
+ useDefaultMachineSched);
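
For illustration, an out-of-tree scheduler can hook into this registry and become selectable through -misched=<name>; the factory below is a placeholder that simply defers to the generic pre-RA scheduler.

#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

// Placeholder factory: a real target would build a ScheduleDAGMILive with its
// own MachineSchedStrategy here.
static ScheduleDAGInstrs *createMyCustomSched(MachineSchedContext *C) {
  return createGenericSchedLive(C);
}

// Registering the factory makes it selectable via -misched=my-custom.
static MachineSchedRegistry
    MyCustomSchedRegistry("my-custom", "Hypothetical custom scheduler.",
                          createMyCustomSched);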
+
+static cl::opt<bool> EnableMachineSched(
+ "enable-misched",
+ cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnablePostRAMachineSched(
+ "enable-post-misched",
+ cl::desc("Enable the post-ra machine instruction scheduling pass."),
+ cl::init(true), cl::Hidden);
+
+/// Decrement this iterator until reaching the top or a non-debug instr.
+static MachineBasicBlock::const_iterator
+priorNonDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator Beg) {
+ assert(I != Beg && "reached the top of the region, cannot decrement");
+ while (--I != Beg) {
+ if (!I->isDebugOrPseudoInstr())
+ break;
+ }
+ return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator Beg) {
+ return priorNonDebug(MachineBasicBlock::const_iterator(I), Beg)
+ .getNonConstIterator();
+}
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator End) {
+ for(; I != End; ++I) {
+ if (!I->isDebugOrPseudoInstr())
+ break;
+ }
+ return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator End) {
+ return nextIfDebug(MachineBasicBlock::const_iterator(I), End)
+ .getNonConstIterator();
+}
+
+/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
+ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
+ // Select the scheduler, or set the default.
+ MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
+ if (Ctor != useDefaultMachineSched)
+ return Ctor(this);
+
+ // Get the default scheduler set by the target for this function.
+ ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
+ if (Scheduler)
+ return Scheduler;
+
+ // Default to GenericScheduler.
+ return createGenericSchedLive(this);
+}
+
+/// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
+/// the caller. We don't have a command line option to override the postRA
+/// scheduler. The Target must configure it.
+ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
+ // Get the postRA scheduler set by the target for this function.
+ ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(this);
+ if (Scheduler)
+ return Scheduler;
+
+ // Default to GenericScheduler.
+ return createGenericSchedPostRA(this);
+}
+
+/// Top-level MachineScheduler pass driver.
+///
+/// Visit blocks in function order. Divide each block into scheduling regions
+/// and visit them bottom-up. Visiting regions bottom-up is not required, but is
+/// consistent with the DAG builder, which traverses the interior of the
+/// scheduling regions bottom-up.
+///
+/// This design avoids exposing scheduling boundaries to the DAG builder,
+/// simplifying the DAG builder's support for "special" target instructions.
+/// At the same time the design allows target schedulers to operate across
+/// scheduling boundaries, for example to bundle the boundary instructions
+/// without reordering them. This creates complexity, because the target
+/// scheduler must update the RegionBegin and RegionEnd positions cached by
+/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
+/// design would be to split blocks at scheduling boundaries, but LLVM has a
+/// general bias against block splitting purely for implementation simplicity.
+bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(mf.getFunction()))
+ return false;
+
+ if (EnableMachineSched.getNumOccurrences()) {
+ if (!EnableMachineSched)
+ return false;
+ } else if (!mf.getSubtarget().enableMachineScheduler())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
+
+ // Initialize the context of the pass.
+ MF = &mf;
+ MLI = &getAnalysis<MachineLoopInfo>();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ PassConfig = &getAnalysis<TargetPassConfig>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ LIS = &getAnalysis<LiveIntervals>();
+
+ if (VerifyScheduling) {
+ LLVM_DEBUG(LIS->dump());
+ MF->verify(this, "Before machine scheduling.");
+ }
+ RegClassInfo->runOnMachineFunction(*MF);
+
+ // Instantiate the selected scheduler for this target, function, and
+ // optimization level.
+ std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
+ scheduleRegions(*Scheduler, false);
+
+ LLVM_DEBUG(LIS->dump());
+ if (VerifyScheduling)
+ MF->verify(this, "After machine scheduling.");
+ return true;
+}
+
+bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(mf.getFunction()))
+ return false;
+
+ if (EnablePostRAMachineSched.getNumOccurrences()) {
+ if (!EnablePostRAMachineSched)
+ return false;
+ } else if (!mf.getSubtarget().enablePostRAMachineScheduler()) {
+ LLVM_DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
+
+ // Initialize the context of the pass.
+ MF = &mf;
+ MLI = &getAnalysis<MachineLoopInfo>();
+ PassConfig = &getAnalysis<TargetPassConfig>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ if (VerifyScheduling)
+ MF->verify(this, "Before post machine scheduling.");
+
+ // Instantiate the selected scheduler for this target, function, and
+ // optimization level.
+ std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
+ scheduleRegions(*Scheduler, true);
+
+ if (VerifyScheduling)
+ MF->verify(this, "After post machine scheduling.");
+ return true;
+}
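
When the -enable-misched / -enable-post-misched flags are not passed, both drivers above defer to subtarget hooks; a hypothetical subtarget opts in or out roughly as follows (sketch only, never instantiated here).

#include "llvm/CodeGen/TargetSubtargetInfo.h"

// Illustrative overrides of the hooks consulted by runOnMachineFunction above.
class MySubtarget : public llvm::TargetSubtargetInfo {
public:
  // Run the pre-RA MachineScheduler pass for this subtarget.
  bool enableMachineScheduler() const override { return true; }
  // Skip the PostMachineScheduler pass for this subtarget.
  bool enablePostRAMachineScheduler() const override { return false; }
};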
+
+/// Return true if the given instruction should not be included in a scheduling
+/// region.
+///
+/// MachineScheduler does not currently support scheduling across calls. To
+/// handle calls, the DAG builder needs to be modified to create register
+/// anti/output dependencies on the registers clobbered by the call's regmask
+/// operand. In PreRA scheduling, the stack pointer adjustment already prevents
+/// scheduling across calls. In PostRA scheduling, we need the isCall to enforce
+/// the boundary, but there would be no benefit to postRA scheduling across
+/// calls this late anyway.
+static bool isSchedBoundary(MachineBasicBlock::iterator MI,
+ MachineBasicBlock *MBB,
+ MachineFunction *MF,
+ const TargetInstrInfo *TII) {
+ return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF);
+}
+
+/// A region of an MBB for scheduling.
+namespace {
+struct SchedRegion {
+ /// RegionBegin is the first instruction in the scheduling region, and
+ /// RegionEnd is either MBB->end() or the scheduling boundary after the
+ /// last instruction in the scheduling region. These iterators cannot refer
+ /// to instructions outside of the identified scheduling region because
+ /// those may be reordered before scheduling this region.
+ MachineBasicBlock::iterator RegionBegin;
+ MachineBasicBlock::iterator RegionEnd;
+ unsigned NumRegionInstrs;
+
+ SchedRegion(MachineBasicBlock::iterator B, MachineBasicBlock::iterator E,
+ unsigned N) :
+ RegionBegin(B), RegionEnd(E), NumRegionInstrs(N) {}
+};
+} // end anonymous namespace
+
+using MBBRegionsVector = SmallVector<SchedRegion, 16>;
+
+static void
+getSchedRegions(MachineBasicBlock *MBB,
+ MBBRegionsVector &Regions,
+ bool RegionsTopDown) {
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ MachineBasicBlock::iterator I = nullptr;
+ for(MachineBasicBlock::iterator RegionEnd = MBB->end();
+ RegionEnd != MBB->begin(); RegionEnd = I) {
+
+ // Avoid decrementing RegionEnd for blocks with no terminator.
+ if (RegionEnd != MBB->end() ||
+ isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {
+ --RegionEnd;
+ }
+
+ // The next region starts above the previous region. Look backward in the
+ // instruction stream until we find the nearest boundary.
+ unsigned NumRegionInstrs = 0;
+ I = RegionEnd;
+ for (;I != MBB->begin(); --I) {
+ MachineInstr &MI = *std::prev(I);
+ if (isSchedBoundary(&MI, &*MBB, MF, TII))
+ break;
+ if (!MI.isDebugOrPseudoInstr()) {
+ // MBB::size() uses instr_iterator to count. Here we need a bundle to
+ // count as a single instruction.
+ ++NumRegionInstrs;
+ }
+ }
+
+ // It's possible we found a scheduling region that only has debug
+ // instructions. Don't bother scheduling these.
+ if (NumRegionInstrs != 0)
+ Regions.push_back(SchedRegion(I, RegionEnd, NumRegionInstrs));
+ }
+
+ if (RegionsTopDown)
+ std::reverse(Regions.begin(), Regions.end());
+}
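
Targets can widen the set of boundaries that getSchedRegions() honors by overriding TargetInstrInfo::isSchedulingBoundary; the class below is a simplified, hypothetical sketch (real targets derive from their generated <Target>GenInstrInfo instead).

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Hypothetical hook: also break regions at inline asm, in addition to the
// default boundaries (calls are handled separately by isSchedBoundary above).
class MyTargetInstrInfo : public TargetInstrInfo {
public:
  bool isSchedulingBoundary(const MachineInstr &MI,
                            const MachineBasicBlock *MBB,
                            const MachineFunction &MF) const override {
    if (MI.isInlineAsm())
      return true;
    return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
  }
};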
+
+/// Main driver for both MachineScheduler and PostMachineScheduler.
+void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
+ bool FixKillFlags) {
+ // Visit all machine basic blocks.
+ //
+ // TODO: Visit blocks in global postorder or postorder within the bottom-up
+ // loop tree. Then we can optionally compute global RegPressure.
+ for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
+ MBB != MBBEnd; ++MBB) {
+
+ Scheduler.startBlock(&*MBB);
+
+#ifndef NDEBUG
+ if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName())
+ continue;
+ if (SchedOnlyBlock.getNumOccurrences()
+ && (int)SchedOnlyBlock != MBB->getNumber())
+ continue;
+#endif
+
+ // Break the block into scheduling regions [I, RegionEnd). RegionEnd
+ // points to the scheduling boundary at the bottom of the region. The DAG
+ // does not include RegionEnd, but the region does (i.e. the next
+ // RegionEnd is above the previous RegionBegin). If the current block has
+ // no terminator then RegionEnd == MBB->end() for the bottom region.
+ //
+ // All the regions of MBB are first found and stored in MBBRegions, which
+ // are processed top-down within the block when the scheduler requests it.
+ //
+ // The Scheduler may insert instructions during either schedule() or
+ // exitRegion(), even for empty regions. So the local iterators 'I' and
+ // 'RegionEnd' are invalid across these calls. Instructions must not be
+ // added to other regions than the current one without updating MBBRegions.
+
+ MBBRegionsVector MBBRegions;
+ getSchedRegions(&*MBB, MBBRegions, Scheduler.doMBBSchedRegionsTopDown());
+ for (const SchedRegion &R : MBBRegions) {
+ MachineBasicBlock::iterator I = R.RegionBegin;
+ MachineBasicBlock::iterator RegionEnd = R.RegionEnd;
+ unsigned NumRegionInstrs = R.NumRegionInstrs;
+
+ // Notify the scheduler of the region, even if we may skip scheduling
+ // it. Perhaps it still needs to be bundled.
+ Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
+
+ // Skip empty scheduling regions (0 or 1 schedulable instructions).
+ if (I == RegionEnd || I == std::prev(RegionEnd)) {
+ // Close the current region. Bundle the terminator if needed.
+ // This invalidates 'RegionEnd' and 'I'.
+ Scheduler.exitRegion();
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ LLVM_DEBUG(dbgs() << MF->getName() << ":" << printMBBReference(*MBB)
+ << " " << MBB->getName() << "\n From: " << *I
+ << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End\n";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ if (DumpCriticalPathLength) {
+ errs() << MF->getName();
+ errs() << ":%bb. " << MBB->getNumber();
+ errs() << " " << MBB->getName() << " \n";
+ }
+
+ // Schedule a region: possibly reorder instructions.
+ // This invalidates the original region iterators.
+ Scheduler.schedule();
+
+ // Close the current region.
+ Scheduler.exitRegion();
+ }
+ Scheduler.finishBlock();
+ // FIXME: Ideally, no further passes should rely on kill flags. However,
+ // thumb2 size reduction is currently an exception, so the PostMIScheduler
+ // needs to do this.
+ if (FixKillFlags)
+ Scheduler.fixupKills(*MBB);
+ }
+ Scheduler.finalizeSchedule();
+}
+
+void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
+ // unimplemented
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ReadyQueue::dump() const {
+ dbgs() << "Queue " << Name << ": ";
+ for (const SUnit *SU : Queue)
+ dbgs() << SU->NodeNum << " ";
+ dbgs() << "\n";
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// ScheduleDAGMI - Basic machine instruction scheduling. This is
+// independent of PreRA/PostRA scheduling and involves no extra book-keeping for
+// virtual registers.
+//===----------------------------------------------------------------------===//
+
+// Provide a vtable anchor.
+ScheduleDAGMI::~ScheduleDAGMI() = default;
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
+/// NumPredsLeft reaches zero, release the successor node.
+///
+/// FIXME: Adjust SuccSU height based on MinLatency.
+void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+ if (SuccEdge->isWeak()) {
+ --SuccSU->WeakPredsLeft;
+ if (SuccEdge->isCluster())
+ NextClusterSucc = SuccSU;
+ return;
+ }
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(*SuccSU);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However,
+ // CurrCycle may have advanced since then.
+ if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency())
+ SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency();
+
+ --SuccSU->NumPredsLeft;
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+ SchedImpl->releaseTopNode(SuccSU);
+}
+
+/// releaseSuccessors - Call releaseSucc on each of SU's successors.
+void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
+ for (SDep &Succ : SU->Succs)
+ releaseSucc(SU, &Succ);
+}
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
+/// NumSuccsLeft reaches zero, release the predecessor node.
+///
+/// FIXME: Adjust PredSU height based on MinLatency.
+void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+
+ if (PredEdge->isWeak()) {
+ --PredSU->WeakSuccsLeft;
+ if (PredEdge->isCluster())
+ NextClusterPred = PredSU;
+ return;
+ }
+#ifndef NDEBUG
+ if (PredSU->NumSuccsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(*PredSU);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However,
+ // CurrCycle may have advanced since then.
+ if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency())
+ PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency();
+
+ --PredSU->NumSuccsLeft;
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
+ SchedImpl->releaseBottomNode(PredSU);
+}
+
+/// releasePredecessors - Call releasePred on each of SU's predecessors.
+void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
+ for (SDep &Pred : SU->Preds)
+ releasePred(SU, &Pred);
+}
+
+void ScheduleDAGMI::startBlock(MachineBasicBlock *bb) {
+ ScheduleDAGInstrs::startBlock(bb);
+ SchedImpl->enterMBB(bb);
+}
+
+void ScheduleDAGMI::finishBlock() {
+ SchedImpl->leaveMBB();
+ ScheduleDAGInstrs::finishBlock();
+}
+
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs)
+{
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
+
+ SchedImpl->initPolicy(begin, end, regioninstrs);
+}
+
+/// This is normally called from the main scheduler loop but may also be invoked
+/// by the scheduling strategy to perform additional code motion.
+void ScheduleDAGMI::moveInstruction(
+ MachineInstr *MI, MachineBasicBlock::iterator InsertPos) {
+ // Advance RegionBegin if the first instruction moves down.
+ if (&*RegionBegin == MI)
+ ++RegionBegin;
+
+ // Update the instruction stream.
+ BB->splice(InsertPos, BB, MI);
+
+ // Update LiveIntervals
+ if (LIS)
+ LIS->handleMove(*MI, /*UpdateFlags=*/true);
+
+ // Recede RegionBegin if an instruction moves above the first.
+ if (RegionBegin == InsertPos)
+ RegionBegin = MI;
+}
+
+bool ScheduleDAGMI::checkSchedLimit() {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS && !defined(NDEBUG)
+ if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
+ CurrentTop = CurrentBottom;
+ return false;
+ }
+ ++NumInstrsScheduled;
+#endif
+ return true;
+}
+
+/// Per-region scheduling driver, called back from
+/// MachineScheduler::runOnMachineFunction. This is a simplified driver that
+/// does not consider liveness or register pressure. It is useful for PostRA
+/// scheduling and potentially other custom schedulers.
+void ScheduleDAGMI::schedule() {
+ LLVM_DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
+ LLVM_DEBUG(SchedImpl->dumpPolicy());
+
+ // Build the DAG.
+ buildSchedGraph(AA);
+
+ postProcessDAG();
+
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+
+ LLVM_DEBUG(dump());
+ if (PrintDAGs) dump();
+ if (ViewMISchedDAGs) viewGraph();
+
+ // Initialize the strategy before modifying the DAG.
+ // This may initialize a DFSResult to be used for queue priority.
+ SchedImpl->initialize(this);
+
+ // Initialize ready queues now that the DAG and priority data are finalized.
+ initQueues(TopRoots, BotRoots);
+
+ bool IsTopNode = false;
+ while (true) {
+ LLVM_DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
+ SUnit *SU = SchedImpl->pickNode(IsTopNode);
+ if (!SU) break;
+
+ assert(!SU->isScheduled && "Node already scheduled");
+ if (!checkSchedLimit())
+ break;
+
+ MachineInstr *MI = SU->getInstr();
+ if (IsTopNode) {
+ assert(SU->isTopReady() && "node still has unscheduled dependencies");
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else
+ moveInstruction(MI, CurrentTop);
+ } else {
+ assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
+ else {
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ moveInstruction(MI, CurrentBottom);
+ CurrentBottom = MI;
+ }
+ }
+ // Notify the scheduling strategy before updating the DAG.
+ // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues
+ // runs, it can then use the accurate ReadyCycle time to determine whether
+ // newly released nodes can move to the readyQ.
+ SchedImpl->schedNode(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ LLVM_DEBUG({
+ dbgs() << "*** Final schedule for "
+ << printMBBReference(*begin()->getParent()) << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
+
+/// Apply each ScheduleDAGMutation step in order.
+void ScheduleDAGMI::postProcessDAG() {
+ for (auto &m : Mutations)
+ m->apply(this);
+}
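
postProcessDAG() simply walks a list of ScheduleDAGMutation objects; a hypothetical mutation of that kind looks like the sketch below (targets typically attach these via ScheduleDAGMI::addMutation() or their createMachineScheduler hook).

#include "llvm/CodeGen/ScheduleDAGMutation.h"

// Hypothetical mutation: the body is where cluster or artificial edges would
// be added to the DAG before scheduling starts.
struct MyClusterMutation : llvm::ScheduleDAGMutation {
  void apply(llvm::ScheduleDAGInstrs *DAG) override {
    // e.g. inspect DAG->SUnits and add edges for pairs of nodes that should
    // stay adjacent in the final schedule.
    (void)DAG;
  }
};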
+
+void ScheduleDAGMI::
+findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
+ SmallVectorImpl<SUnit*> &BotRoots) {
+ for (SUnit &SU : SUnits) {
+ assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits");
+
+ // Order predecessors so DFSResult follows the critical path.
+ SU.biasCriticalPath();
+
+ // A SUnit is ready to top schedule if it has no predecessors.
+ if (!SU.NumPredsLeft)
+ TopRoots.push_back(&SU);
+ // A SUnit is ready to bottom schedule if it has no successors.
+ if (!SU.NumSuccsLeft)
+ BotRoots.push_back(&SU);
+ }
+ ExitSU.biasCriticalPath();
+}
+
+/// Identify DAG roots and setup scheduler queues.
+void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
+ ArrayRef<SUnit*> BotRoots) {
+ NextClusterSucc = nullptr;
+ NextClusterPred = nullptr;
+
+ // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
+ //
+ // Nodes with unreleased weak edges can still be roots.
+ // Release top roots in forward order.
+ for (SUnit *SU : TopRoots)
+ SchedImpl->releaseTopNode(SU);
+
+ // Release bottom roots in reverse order so the higher priority nodes appear
+ // first. This is more natural and slightly more efficient.
+ for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+ I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
+ SchedImpl->releaseBottomNode(*I);
+ }
+
+ releaseSuccessors(&EntrySU);
+ releasePredecessors(&ExitSU);
+
+ SchedImpl->registerRoots();
+
+ // Advance past initial DebugValues.
+ CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
+ CurrentBottom = RegionEnd;
+}
+
+/// Update scheduler queues after scheduling an instruction.
+void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
+ // Release dependent instructions for scheduling.
+ if (IsTopNode)
+ releaseSuccessors(SU);
+ else
+ releasePredecessors(SU);
+
+ SU->isScheduled = true;
+}
+
+/// Reinsert any remaining debug_values, just like the PostRA scheduler.
+void ScheduleDAGMI::placeDebugValues() {
+ // If first instruction was a DBG_VALUE then put it back.
+ if (FirstDbgValue) {
+ BB->splice(RegionBegin, BB, FirstDbgValue);
+ RegionBegin = FirstDbgValue;
+ }
+
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator
+ DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+ std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
+ MachineInstr *DbgValue = P.first;
+ MachineBasicBlock::iterator OrigPrevMI = P.second;
+ if (&*RegionBegin == DbgValue)
+ ++RegionBegin;
+ BB->splice(std::next(OrigPrevMI), BB, DbgValue);
+ if (RegionEnd != BB->end() && OrigPrevMI == &*RegionEnd)
+ RegionEnd = DbgValue;
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+static const char *scheduleTableLegend = " i: issue\n x: resource booked";
+
+LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceTopDown() const {
+ // Bail out when there is no schedule model to query.
+ if (!SchedModel.hasInstrSchedModel())
+ return;
+
+ // Nothing to show if there is at most one instruction.
+ if (BB->size() < 2)
+ return;
+
+ dbgs() << " * Schedule table (TopDown):\n";
+ dbgs() << scheduleTableLegend << "\n";
+ const unsigned FirstCycle = getSUnit(&*(std::begin(*this)))->TopReadyCycle;
+ unsigned LastCycle = getSUnit(&*(std::prev(std::end(*this))))->TopReadyCycle;
+ for (MachineInstr &MI : *this) {
+ SUnit *SU = getSUnit(&MI);
+ if (!SU)
+ continue;
+ const MCSchedClassDesc *SC = getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC),
+ PE = SchedModel.getWriteProcResEnd(SC);
+ PI != PE; ++PI) {
+ if (SU->TopReadyCycle + PI->Cycles - 1 > LastCycle)
+ LastCycle = SU->TopReadyCycle + PI->Cycles - 1;
+ }
+ }
+ // Print the header with the cycles
+ dbgs() << llvm::left_justify("Cycle", HeaderColWidth);
+ for (unsigned C = FirstCycle; C <= LastCycle; ++C)
+ dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth);
+ dbgs() << "|\n";
+
+ for (MachineInstr &MI : *this) {
+ SUnit *SU = getSUnit(&MI);
+ if (!SU) {
+ dbgs() << "Missing SUnit\n";
+ continue;
+ }
+ std::string NodeName("SU(");
+ NodeName += std::to_string(SU->NodeNum) + ")";
+ dbgs() << llvm::left_justify(NodeName, HeaderColWidth);
+ unsigned C = FirstCycle;
+ for (; C <= LastCycle; ++C) {
+ if (C == SU->TopReadyCycle)
+ dbgs() << llvm::left_justify("| i", ColWidth);
+ else
+ dbgs() << llvm::left_justify("|", ColWidth);
+ }
+ dbgs() << "|\n";
+ const MCSchedClassDesc *SC = getSchedClass(SU);
+
+ SmallVector<MCWriteProcResEntry, 4> ResourcesIt(
+ make_range(SchedModel.getWriteProcResBegin(SC),
+ SchedModel.getWriteProcResEnd(SC)));
+
+ if (MISchedSortResourcesInTrace)
+ llvm::stable_sort(ResourcesIt,
+ [](const MCWriteProcResEntry &LHS,
+ const MCWriteProcResEntry &RHS) -> bool {
+ return LHS.StartAtCycle < RHS.StartAtCycle ||
+ (LHS.StartAtCycle == RHS.StartAtCycle &&
+ LHS.Cycles < RHS.Cycles);
+ });
+ for (const MCWriteProcResEntry &PI : ResourcesIt) {
+ C = FirstCycle;
+ const std::string ResName =
+ SchedModel.getResourceName(PI.ProcResourceIdx);
+ dbgs() << llvm::right_justify(ResName + " ", HeaderColWidth);
+ for (; C < SU->TopReadyCycle + PI.StartAtCycle; ++C) {
+ dbgs() << llvm::left_justify("|", ColWidth);
+ }
+ for (unsigned I = 0, E = PI.Cycles - PI.StartAtCycle; I != E; ++I, ++C)
+ dbgs() << llvm::left_justify("| x", ColWidth);
+ while (C++ <= LastCycle)
+ dbgs() << llvm::left_justify("|", ColWidth);
+ // Place end char
+ dbgs() << "| \n";
+ }
+ }
+}
+
+LLVM_DUMP_METHOD void ScheduleDAGMI::dumpScheduleTraceBottomUp() const {
+ // Bail out when there is no schedule model to query.
+ if (!SchedModel.hasInstrSchedModel())
+ return;
+
+ // Nothing to show if there is at most one instruction.
+ if (BB->size() < 2)
+ return;
+
+ dbgs() << " * Schedule table (BottomUp):\n";
+ dbgs() << scheduleTableLegend << "\n";
+
+ const int FirstCycle = getSUnit(&*(std::begin(*this)))->BotReadyCycle;
+ int LastCycle = getSUnit(&*(std::prev(std::end(*this))))->BotReadyCycle;
+ for (MachineInstr &MI : *this) {
+ SUnit *SU = getSUnit(&MI);
+ if (!SU)
+ continue;
+ const MCSchedClassDesc *SC = getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter PI = SchedModel.getWriteProcResBegin(SC),
+ PE = SchedModel.getWriteProcResEnd(SC);
+ PI != PE; ++PI) {
+ if ((int)SU->BotReadyCycle - PI->Cycles + 1 < LastCycle)
+ LastCycle = (int)SU->BotReadyCycle - PI->Cycles + 1;
+ }
+ }
+ // Print the header with the cycles
+ dbgs() << llvm::left_justify("Cycle", HeaderColWidth);
+ for (int C = FirstCycle; C >= LastCycle; --C)
+ dbgs() << llvm::left_justify("| " + std::to_string(C), ColWidth);
+ dbgs() << "|\n";
+
+ for (MachineInstr &MI : *this) {
+ SUnit *SU = getSUnit(&MI);
+ if (!SU) {
+ dbgs() << "Missing SUnit\n";
+ continue;
+ }
+ std::string NodeName("SU(");
+ NodeName += std::to_string(SU->NodeNum) + ")";
+ dbgs() << llvm::left_justify(NodeName, HeaderColWidth);
+ int C = FirstCycle;
+ for (; C >= LastCycle; --C) {
+ if (C == (int)SU->BotReadyCycle)
+ dbgs() << llvm::left_justify("| i", ColWidth);
+ else
+ dbgs() << llvm::left_justify("|", ColWidth);
+ }
+ dbgs() << "|\n";
+ const MCSchedClassDesc *SC = getSchedClass(SU);
+ SmallVector<MCWriteProcResEntry, 4> ResourcesIt(
+ make_range(SchedModel.getWriteProcResBegin(SC),
+ SchedModel.getWriteProcResEnd(SC)));
+
+ if (MISchedSortResourcesInTrace)
+ llvm::stable_sort(ResourcesIt,
+ [](const MCWriteProcResEntry &LHS,
+ const MCWriteProcResEntry &RHS) -> bool {
+ return LHS.StartAtCycle < RHS.StartAtCycle ||
+ (LHS.StartAtCycle == RHS.StartAtCycle &&
+ LHS.Cycles < RHS.Cycles);
+ });
+ for (const MCWriteProcResEntry &PI : ResourcesIt) {
+ C = FirstCycle;
+ const std::string ResName =
+ SchedModel.getResourceName(PI.ProcResourceIdx);
+ dbgs() << llvm::right_justify(ResName + " ", HeaderColWidth);
+ for (; C > ((int)SU->BotReadyCycle - (int)PI.StartAtCycle); --C) {
+ dbgs() << llvm::left_justify("|", ColWidth);
+ }
+ for (unsigned I = 0, E = PI.Cycles - PI.StartAtCycle; I != E; ++I, --C)
+ dbgs() << llvm::left_justify("| x", ColWidth);
+ while (C-- >= LastCycle)
+ dbgs() << llvm::left_justify("|", ColWidth);
+ // Place end char
+ dbgs() << "| \n";
+ }
+ }
+}
+#endif
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
+ if (MISchedDumpScheduleTrace) {
+ if (ForceTopDown)
+ dumpScheduleTraceTopDown();
+ else if (ForceBottomUp)
+ dumpScheduleTraceBottomUp();
+ else {
+ dbgs() << "* Schedule table (Bidirectional): not implemented\n";
+ }
+ }
+
+ for (MachineInstr &MI : *this) {
+ if (SUnit *SU = getSUnit(&MI))
+ dumpNode(*SU);
+ else
+ dbgs() << "Missing SUnit\n";
+ }
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals
+// preservation.
+//===----------------------------------------------------------------------===//
+
+ScheduleDAGMILive::~ScheduleDAGMILive() {
+ delete DFSResult;
+}
+
+void ScheduleDAGMILive::collectVRegUses(SUnit &SU) {
+ const MachineInstr &MI = *SU.getInstr();
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.readsReg())
+ continue;
+ if (TrackLaneMasks && !MO.isUse())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+
+ // Ignore re-defs.
+ if (TrackLaneMasks) {
+ bool FoundDef = false;
+ for (const MachineOperand &MO2 : MI.all_defs()) {
+ if (MO2.getReg() == Reg && !MO2.isDead()) {
+ FoundDef = true;
+ break;
+ }
+ }
+ if (FoundDef)
+ continue;
+ }
+
+ // Record this local VReg use.
+ VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
+ for (; UI != VRegUses.end(); ++UI) {
+ if (UI->SU == &SU)
+ break;
+ }
+ if (UI == VRegUses.end())
+ VRegUses.insert(VReg2SUnit(Reg, LaneBitmask::getNone(), &SU));
+ }
+}
+
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs)
+{
+ // ScheduleDAGMI initializes SchedImpl's per-region policy.
+ ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs);
+
+ // For convenience remember the end of the liveness region.
+ LiveRegionEnd = (RegionEnd == bb->end()) ? RegionEnd : std::next(RegionEnd);
+
+ SUPressureDiffs.clear();
+
+ ShouldTrackPressure = SchedImpl->shouldTrackPressure();
+ ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks();
+
+ assert((!ShouldTrackLaneMasks || ShouldTrackPressure) &&
+ "ShouldTrackLaneMasks requires ShouldTrackPressure");
+}
+
+// Set up the register pressure trackers for the top scheduled and bottom
+// scheduled regions.
+void ScheduleDAGMILive::initRegPressure() {
+ VRegUses.clear();
+ VRegUses.setUniverse(MRI.getNumVirtRegs());
+ for (SUnit &SU : SUnits)
+ collectVRegUses(SU);
+
+ TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin,
+ ShouldTrackLaneMasks, false);
+ BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
+ ShouldTrackLaneMasks, false);
+
+ // Close the RPTracker to finalize live ins.
+ RPTracker.closeRegion();
+
+ LLVM_DEBUG(RPTracker.dump());
+
+ // Initialize the live ins and live outs.
+ TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+ BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+ // Close one end of the tracker so we can call
+ // getMaxUpward/DownwardPressureDelta before advancing across any
+ // instructions. This converts currently live regs into live ins/outs.
+ TopRPTracker.closeTop();
+ BotRPTracker.closeBottom();
+
+ BotRPTracker.initLiveThru(RPTracker);
+ if (!BotRPTracker.getLiveThru().empty()) {
+ TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
+ LLVM_DEBUG(dbgs() << "Live Thru: ";
+ dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
+  }
+
+ // For each live out vreg reduce the pressure change associated with other
+ // uses of the same vreg below the live-out reaching def.
+ updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
+
+ // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd) {
+ SmallVector<RegisterMaskPair, 8> LiveUses;
+ BotRPTracker.recede(&LiveUses);
+ updatePressureDiffs(LiveUses);
+ }
+
+ LLVM_DEBUG(dbgs() << "Top Pressure:\n";
+ dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
+ dbgs() << "Bottom Pressure:\n";
+ dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI););
+
+ assert((BotRPTracker.getPos() == RegionEnd ||
+ (RegionEnd->isDebugInstr() &&
+ BotRPTracker.getPos() == priorNonDebug(RegionEnd, RegionBegin))) &&
+ "Can't find the region bottom");
+
+ // Cache the list of excess pressure sets in this region. This will also track
+ // the max pressure in the scheduled code for these sets.
+ RegionCriticalPSets.clear();
+ const std::vector<unsigned> &RegionPressure =
+ RPTracker.getPressure().MaxSetPressure;
+ for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+ unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
+ if (RegionPressure[i] > Limit) {
+ LLVM_DEBUG(dbgs() << TRI->getRegPressureSetName(i) << " Limit " << Limit
+ << " Actual " << RegionPressure[i] << "\n");
+ RegionCriticalPSets.push_back(PressureChange(i));
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Excess PSets: ";
+ for (const PressureChange &RCPS
+ : RegionCriticalPSets) dbgs()
+ << TRI->getRegPressureSetName(RCPS.getPSet()) << " ";
+ dbgs() << "\n");
+}
+
+void ScheduleDAGMILive::
+updateScheduledPressure(const SUnit *SU,
+ const std::vector<unsigned> &NewMaxPressure) {
+ const PressureDiff &PDiff = getPressureDiff(SU);
+ unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
+ for (const PressureChange &PC : PDiff) {
+ if (!PC.isValid())
+ break;
+ unsigned ID = PC.getPSet();
+ while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
+ ++CritIdx;
+ if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
+ if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
+ && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())
+ RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
+ }
+ unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
+ if (NewMaxPressure[ID] >= Limit - 2) {
+ LLVM_DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": "
+ << NewMaxPressure[ID]
+ << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ")
+ << Limit << "(+ " << BotRPTracker.getLiveThru()[ID]
+ << " livethru)\n");
+ }
+ }
+}
+
+/// Update the PressureDiff array for liveness after scheduling this
+/// instruction.
+void ScheduleDAGMILive::updatePressureDiffs(
+ ArrayRef<RegisterMaskPair> LiveUses) {
+ for (const RegisterMaskPair &P : LiveUses) {
+ Register Reg = P.RegUnit;
+ /// FIXME: Currently assuming single-use physregs.
+ if (!Reg.isVirtual())
+ continue;
+
+ if (ShouldTrackLaneMasks) {
+ // If the register has just become live then other uses won't change
+ // this fact anymore => decrement pressure.
+ // If the register has just become dead then other uses make it come
+ // back to life => increment pressure.
+ bool Decrement = P.LaneMask.any();
+
+ for (const VReg2SUnit &V2SU
+ : make_range(VRegUses.find(Reg), VRegUses.end())) {
+ SUnit &SU = *V2SU.SU;
+ if (SU.isScheduled || &SU == &ExitSU)
+ continue;
+
+ PressureDiff &PDiff = getPressureDiff(&SU);
+ PDiff.addPressureChange(Reg, Decrement, &MRI);
+ LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") "
+ << printReg(Reg, TRI) << ':'
+ << PrintLaneMask(P.LaneMask) << ' ' << *SU.getInstr();
+ dbgs() << " to "; PDiff.dump(*TRI););
+ }
+ } else {
+ assert(P.LaneMask.any());
+ LLVM_DEBUG(dbgs() << " LiveReg: " << printVRegOrUnit(Reg, TRI) << "\n");
+ // This may be called before CurrentBottom has been initialized. However,
+ // BotRPTracker must have a valid position. We want the value live into the
+ // instruction or live out of the block, so ask for the previous
+ // instruction's live-out.
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ VNInfo *VNI;
+ MachineBasicBlock::const_iterator I =
+ nextIfDebug(BotRPTracker.getPos(), BB->end());
+ if (I == BB->end())
+ VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ else {
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I));
+ VNI = LRQ.valueIn();
+ }
+ // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
+ assert(VNI && "No live value at use.");
+ for (const VReg2SUnit &V2SU
+ : make_range(VRegUses.find(Reg), VRegUses.end())) {
+ SUnit *SU = V2SU.SU;
+ // If this use comes before the reaching def, it cannot be a last use,
+ // so decrease its pressure change.
+ if (!SU->isScheduled && SU != &ExitSU) {
+ LiveQueryResult LRQ =
+ LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
+ if (LRQ.valueIn() == VNI) {
+ PressureDiff &PDiff = getPressureDiff(SU);
+ PDiff.addPressureChange(Reg, true, &MRI);
+ LLVM_DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
+ << *SU->getInstr();
+ dbgs() << " to "; PDiff.dump(*TRI););
+ }
+ }
+ }
+ }
+ }
+}
+
+void ScheduleDAGMILive::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ if (EntrySU.getInstr() != nullptr)
+ dumpNodeAll(EntrySU);
+ for (const SUnit &SU : SUnits) {
+ dumpNodeAll(SU);
+ if (ShouldTrackPressure) {
+ dbgs() << " Pressure Diff : ";
+ getPressureDiff(&SU).dump(*TRI);
+ }
+ dbgs() << " Single Issue : ";
+ if (SchedModel.mustBeginGroup(SU.getInstr()) &&
+ SchedModel.mustEndGroup(SU.getInstr()))
+ dbgs() << "true;";
+ else
+ dbgs() << "false;";
+ dbgs() << '\n';
+ }
+ if (ExitSU.getInstr() != nullptr)
+ dumpNodeAll(ExitSU);
+#endif
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+///
+/// This is a skeletal driver, with all the functionality pushed into helpers,
+/// so that it can be easily extended by experimental schedulers. Generally,
+/// implementing MachineSchedStrategy should be sufficient to implement a new
+/// scheduling algorithm. However, if a scheduler further subclasses
+/// ScheduleDAGMILive then it will want to override this virtual method in order
+/// to update any specialized state.
+void ScheduleDAGMILive::schedule() {
+ LLVM_DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
+ LLVM_DEBUG(SchedImpl->dumpPolicy());
+ buildDAGWithRegPressure();
+
+ postProcessDAG();
+
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+
+ // Initialize the strategy before modifying the DAG.
+ // This may initialize a DFSResult to be used for queue priority.
+ SchedImpl->initialize(this);
+
+ LLVM_DEBUG(dump());
+ if (PrintDAGs) dump();
+ if (ViewMISchedDAGs) viewGraph();
+
+ // Initialize ready queues now that the DAG and priority data are finalized.
+ initQueues(TopRoots, BotRoots);
+
+ bool IsTopNode = false;
+ while (true) {
+ LLVM_DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
+ SUnit *SU = SchedImpl->pickNode(IsTopNode);
+ if (!SU) break;
+
+ assert(!SU->isScheduled && "Node already scheduled");
+ if (!checkSchedLimit())
+ break;
+
+ scheduleMI(SU, IsTopNode);
+
+ if (DFSResult) {
+ unsigned SubtreeID = DFSResult->getSubtreeID(SU);
+ if (!ScheduledTrees.test(SubtreeID)) {
+ ScheduledTrees.set(SubtreeID);
+ DFSResult->scheduleTree(SubtreeID);
+ SchedImpl->scheduleTree(SubtreeID);
+ }
+ }
+
+ // Notify the scheduling strategy after updating the DAG.
+ SchedImpl->schedNode(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ LLVM_DEBUG({
+ dbgs() << "*** Final schedule for "
+ << printMBBReference(*begin()->getParent()) << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
+
+/// Build the DAG and set up three register pressure trackers.
+void ScheduleDAGMILive::buildDAGWithRegPressure() {
+ if (!ShouldTrackPressure) {
+ RPTracker.reset();
+ RegionCriticalPSets.clear();
+ buildSchedGraph(AA);
+ return;
+ }
+
+ // Initialize the register pressure tracker used by buildSchedGraph.
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
+ ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true);
+
+  // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd)
+ RPTracker.recede();
+
+ // Build the DAG, and compute current register pressure.
+ buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks);
+
+ // Initialize top/bottom trackers after computing region pressure.
+ initRegPressure();
+}
+
+void ScheduleDAGMILive::computeDFSResult() {
+ if (!DFSResult)
+ DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
+ DFSResult->clear();
+ ScheduledTrees.clear();
+ DFSResult->resize(SUnits.size());
+ DFSResult->compute(SUnits);
+ ScheduledTrees.resize(DFSResult->getNumSubtrees());
+}
+
+/// Compute the max cyclic critical path through the DAG. The scheduling DAG
+/// only provides the critical path for single block loops. To handle loops that
+/// span blocks, we could use the vreg path latencies provided by
+/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
+/// available for use in the scheduler.
+///
+/// The cyclic path estimation identifies a def-use pair that crosses the back
+/// edge and considers the depth and height of the nodes. For example, consider
+/// the following instruction sequence where each instruction has unit latency
+/// and defines an eponymous virtual register:
+///
+/// a->b(a,c)->c(b)->d(c)->exit
+///
+/// The cyclic critical path is two cycles: b->c->b
+/// The acyclic critical path is four cycles: a->b->c->d->exit
+/// LiveOutHeight = height(c) = len(c->d->exit) = 2
+/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
+/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
+/// LiveInDepth = depth(b) = len(a->b) = 1
+///
+/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
+/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
+/// CyclicCriticalPath = min(2, 2) = 2
+///
+/// This could be relevant to PostRA scheduling, but is currently implemented
+/// assuming LiveIntervals.
+unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
+ // This only applies to single block loop.
+ if (!BB->isSuccessor(BB))
+ return 0;
+
+ unsigned MaxCyclicLatency = 0;
+ // Visit each live out vreg def to find def/use pairs that cross iterations.
+ for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
+ Register Reg = P.RegUnit;
+ if (!Reg.isVirtual())
+ continue;
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ if (!DefVNI)
+ continue;
+
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
+ const SUnit *DefSU = getSUnit(DefMI);
+ if (!DefSU)
+ continue;
+
+ unsigned LiveOutHeight = DefSU->getHeight();
+ unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
+ // Visit all local users of the vreg def.
+ for (const VReg2SUnit &V2SU
+ : make_range(VRegUses.find(Reg), VRegUses.end())) {
+ SUnit *SU = V2SU.SU;
+ if (SU == &ExitSU)
+ continue;
+
+ // Only consider uses of the phi.
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
+ if (!LRQ.valueIn()->isPHIDef())
+ continue;
+
+ // Assume that a path spanning two iterations is a cycle, which could
+ // overestimate in strange cases. This allows cyclic latency to be
+ // estimated as the minimum slack of the vreg's depth or height.
+ unsigned CyclicLatency = 0;
+ if (LiveOutDepth > SU->getDepth())
+ CyclicLatency = LiveOutDepth - SU->getDepth();
+
+ unsigned LiveInHeight = SU->getHeight() + DefSU->Latency;
+ if (LiveInHeight > LiveOutHeight) {
+ if (LiveInHeight - LiveOutHeight < CyclicLatency)
+ CyclicLatency = LiveInHeight - LiveOutHeight;
+ } else
+ CyclicLatency = 0;
+
+ LLVM_DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+ << SU->NodeNum << ") = " << CyclicLatency << "c\n");
+ if (CyclicLatency > MaxCyclicLatency)
+ MaxCyclicLatency = CyclicLatency;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+ return MaxCyclicLatency;
+}
+
+/// Release ExitSU predecessors and set up scheduler queues. Re-position
+/// the Top RP tracker in case the region beginning has changed.
+void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots,
+ ArrayRef<SUnit*> BotRoots) {
+ ScheduleDAGMI::initQueues(TopRoots, BotRoots);
+ if (ShouldTrackPressure) {
+ assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+ TopRPTracker.setPos(CurrentTop);
+ }
+}
+
+/// Move an instruction and update register pressure.
+void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
+ // Move the instruction to its new location in the instruction stream.
+ MachineInstr *MI = SU->getInstr();
+
+ if (IsTopNode) {
+ assert(SU->isTopReady() && "node still has unscheduled dependencies");
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else {
+ moveInstruction(MI, CurrentTop);
+ TopRPTracker.setPos(MI);
+ }
+
+ if (ShouldTrackPressure) {
+ // Update top scheduled pressure.
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
+
+ TopRPTracker.advance(RegOpers);
+ assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+ LLVM_DEBUG(dbgs() << "Top Pressure:\n"; dumpRegSetPressure(
+ TopRPTracker.getRegSetPressureAtPos(), TRI););
+
+ updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
+ }
+ } else {
+ assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
+ else {
+ if (&*CurrentTop == MI) {
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ TopRPTracker.setPos(CurrentTop);
+ }
+ moveInstruction(MI, CurrentBottom);
+ CurrentBottom = MI;
+ BotRPTracker.setPos(CurrentBottom);
+ }
+ if (ShouldTrackPressure) {
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
+
+ if (BotRPTracker.getPos() != CurrentBottom)
+ BotRPTracker.recedeSkipDebugValues();
+ SmallVector<RegisterMaskPair, 8> LiveUses;
+ BotRPTracker.recede(RegOpers, &LiveUses);
+ assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+ LLVM_DEBUG(dbgs() << "Bottom Pressure:\n"; dumpRegSetPressure(
+ BotRPTracker.getRegSetPressureAtPos(), TRI););
+
+ updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
+ updatePressureDiffs(LiveUses);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Post-process the DAG to create cluster edges between neighboring
+/// loads or between neighboring stores.
+class BaseMemOpClusterMutation : public ScheduleDAGMutation {
+ struct MemOpInfo {
+ SUnit *SU;
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ int64_t Offset;
+ unsigned Width;
+
+ MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
+ int64_t Offset, unsigned Width)
+ : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
+ Width(Width) {}
+
+ static bool Compare(const MachineOperand *const &A,
+ const MachineOperand *const &B) {
+ if (A->getType() != B->getType())
+ return A->getType() < B->getType();
+ if (A->isReg())
+ return A->getReg() < B->getReg();
+ if (A->isFI()) {
+ const MachineFunction &MF = *A->getParent()->getParent()->getParent();
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ bool StackGrowsDown = TFI.getStackGrowthDirection() ==
+ TargetFrameLowering::StackGrowsDown;
+ return StackGrowsDown ? A->getIndex() > B->getIndex()
+ : A->getIndex() < B->getIndex();
+ }
+
+ llvm_unreachable("MemOpClusterMutation only supports register or frame "
+ "index bases.");
+ }
+
+ bool operator<(const MemOpInfo &RHS) const {
+ // FIXME: Don't compare everything twice. Maybe use C++20 three way
+ // comparison instead when it's available.
+ if (std::lexicographical_compare(BaseOps.begin(), BaseOps.end(),
+ RHS.BaseOps.begin(), RHS.BaseOps.end(),
+ Compare))
+ return true;
+ if (std::lexicographical_compare(RHS.BaseOps.begin(), RHS.BaseOps.end(),
+ BaseOps.begin(), BaseOps.end(), Compare))
+ return false;
+ if (Offset != RHS.Offset)
+ return Offset < RHS.Offset;
+ return SU->NodeNum < RHS.SU->NodeNum;
+ }
+ };
+
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ bool IsLoad;
+
+public:
+ BaseMemOpClusterMutation(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri, bool IsLoad)
+ : TII(tii), TRI(tri), IsLoad(IsLoad) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+protected:
+ void clusterNeighboringMemOps(ArrayRef<MemOpInfo> MemOps, bool FastCluster,
+ ScheduleDAGInstrs *DAG);
+ void collectMemOpRecords(std::vector<SUnit> &SUnits,
+ SmallVectorImpl<MemOpInfo> &MemOpRecords);
+ bool groupMemOps(ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
+ DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups);
+};
+
+class StoreClusterMutation : public BaseMemOpClusterMutation {
+public:
+ StoreClusterMutation(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri)
+ : BaseMemOpClusterMutation(tii, tri, false) {}
+};
+
+class LoadClusterMutation : public BaseMemOpClusterMutation {
+public:
+ LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
+ : BaseMemOpClusterMutation(tii, tri, true) {}
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation>
+createLoadClusterDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
+ : nullptr;
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+createStoreClusterDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
+ : nullptr;
+}
+
+} // end namespace llvm
+
+// Sort all the loads/stores first. Then, for each load/store, check the
+// following loads/stores one by one until the first non-dependent one is
+// reached, and call the target hook to see whether they can be clustered.
+// If FastCluster is enabled, we assume that all the loads/stores have already
+// been preprocessed and have no dependencies on each other.
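+//
+// Illustrative example (hypothetical widths, not taken from a real target):
+// if MemOpa already belongs to a cluster recorded in SUnit2ClusterInfo as
+// (length 2, 16 bytes) and the candidate MemOpb has Width 8, the proposed
+// cluster becomes length 3 with 16 + 8 = 24 bytes, and shouldClusterMemOps
+// decides whether that is still profitable.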
+void BaseMemOpClusterMutation::clusterNeighboringMemOps(
+ ArrayRef<MemOpInfo> MemOpRecords, bool FastCluster,
+ ScheduleDAGInstrs *DAG) {
+ // Keep track of the current cluster length and bytes for each SUnit.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> SUnit2ClusterInfo;
+
+  // At this point, `MemOpRecords` must hold at least two mem ops. Try to
+ // cluster mem ops collected within `MemOpRecords` array.
+ for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
+ // Decision to cluster mem ops is taken based on target dependent logic
+ auto MemOpa = MemOpRecords[Idx];
+
+ // Seek for the next load/store to do the cluster.
+ unsigned NextIdx = Idx + 1;
+ for (; NextIdx < End; ++NextIdx)
+ // Skip if MemOpb has been clustered already or has dependency with
+ // MemOpa.
+ if (!SUnit2ClusterInfo.count(MemOpRecords[NextIdx].SU->NodeNum) &&
+ (FastCluster ||
+ (!DAG->IsReachable(MemOpRecords[NextIdx].SU, MemOpa.SU) &&
+ !DAG->IsReachable(MemOpa.SU, MemOpRecords[NextIdx].SU))))
+ break;
+ if (NextIdx == End)
+ continue;
+
+ auto MemOpb = MemOpRecords[NextIdx];
+ unsigned ClusterLength = 2;
+ unsigned CurrentClusterBytes = MemOpa.Width + MemOpb.Width;
+ if (SUnit2ClusterInfo.count(MemOpa.SU->NodeNum)) {
+ ClusterLength = SUnit2ClusterInfo[MemOpa.SU->NodeNum].first + 1;
+ CurrentClusterBytes =
+ SUnit2ClusterInfo[MemOpa.SU->NodeNum].second + MemOpb.Width;
+ }
+
+ if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+ CurrentClusterBytes))
+ continue;
+
+ SUnit *SUa = MemOpa.SU;
+ SUnit *SUb = MemOpb.SU;
+ if (SUa->NodeNum > SUb->NodeNum)
+ std::swap(SUa, SUb);
+
+ // FIXME: Is this check really required?
+ if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster)))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
+ << SUb->NodeNum << ")\n");
+ ++NumClustered;
+
+ if (IsLoad) {
+ // Copy successor edges from SUa to SUb. Interleaving computation
+ // dependent on SUa can prevent load combining due to register reuse.
+ // Predecessor edges do not need to be copied from SUb to SUa since
+ // nearby loads should have effectively the same inputs.
+ for (const SDep &Succ : SUa->Succs) {
+ if (Succ.getSUnit() == SUb)
+ continue;
+ LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
+ << ")\n");
+ DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+ }
+ } else {
+      // Copy predecessor edges from SUb to SUa so that the SUnits SUb
+      // depends on cannot be scheduled in between SUb and SUa. Successor
+      // edges do not need to be copied from SUa to SUb since nothing will
+      // depend on stores.
+      // Note that we don't need to worry about memory dependencies: we
+      // never try to cluster mem ops that have a memory dependency.
+ for (const SDep &Pred : SUb->Preds) {
+ if (Pred.getSUnit() == SUa)
+ continue;
+ LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum
+ << ")\n");
+ DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
+ }
+ }
+
+ SUnit2ClusterInfo[MemOpb.SU->NodeNum] = {ClusterLength,
+ CurrentClusterBytes};
+
+ LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
+ << ", Curr cluster bytes: " << CurrentClusterBytes
+ << "\n");
+ }
+}
+
+void BaseMemOpClusterMutation::collectMemOpRecords(
+ std::vector<SUnit> &SUnits, SmallVectorImpl<MemOpInfo> &MemOpRecords) {
+ for (auto &SU : SUnits) {
+ if ((IsLoad && !SU.getInstr()->mayLoad()) ||
+ (!IsLoad && !SU.getInstr()->mayStore()))
+ continue;
+
+ const MachineInstr &MI = *SU.getInstr();
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ unsigned Width;
+ if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+ OffsetIsScalable, Width, TRI)) {
+ MemOpRecords.push_back(MemOpInfo(&SU, BaseOps, Offset, Width));
+
+ LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
+ << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+ << ", Width: " << Width << "\n");
+ }
+#ifndef NDEBUG
+ for (const auto *Op : BaseOps)
+ assert(Op);
+#endif
+ }
+}
+
+bool BaseMemOpClusterMutation::groupMemOps(
+ ArrayRef<MemOpInfo> MemOps, ScheduleDAGInstrs *DAG,
+ DenseMap<unsigned, SmallVector<MemOpInfo, 32>> &Groups) {
+ bool FastCluster =
+ ForceFastCluster ||
+ MemOps.size() * DAG->SUnits.size() / 1000 > FastClusterThreshold;
+
+ for (const auto &MemOp : MemOps) {
+ unsigned ChainPredID = DAG->SUnits.size();
+ if (FastCluster) {
+ for (const SDep &Pred : MemOp.SU->Preds) {
+        // We only want to cluster mem ops that have the same ctrl (non-data)
+        // pred, so that they have no ctrl dependency on each other. But for
+        // store instrs, we can still cluster them if the pred is a load instr.
+ if ((Pred.isCtrl() &&
+ (IsLoad ||
+ (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()))) &&
+ !Pred.isArtificial()) {
+ ChainPredID = Pred.getSUnit()->NodeNum;
+ break;
+ }
+ }
+ } else
+ ChainPredID = 0;
+
+ Groups[ChainPredID].push_back(MemOp);
+ }
+ return FastCluster;
+}
+
+/// Callback from DAG postProcessing to create cluster edges for loads/stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
+ // Collect all the clusterable loads/stores
+ SmallVector<MemOpInfo, 32> MemOpRecords;
+ collectMemOpRecords(DAG->SUnits, MemOpRecords);
+
+ if (MemOpRecords.size() < 2)
+ return;
+
+  // If the DAG is too complex, use a heuristic to put independent
+  // loads/stores into the same group, to avoid a compile-time blow-up.
+  // Note that some fusion pairs may be lost this way.
+ DenseMap<unsigned, SmallVector<MemOpInfo, 32>> Groups;
+ bool FastCluster = groupMemOps(MemOpRecords, DAG, Groups);
+
+ for (auto &Group : Groups) {
+    // Sort the loads/stores so that we can stop clustering as early as
+    // possible.
+ llvm::sort(Group.second);
+
+    // Try to cluster all the neighboring loads/stores.
+ clusterNeighboringMemOps(Group.second, FastCluster, DAG);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// CopyConstrain - DAG post-processing to encourage copy elimination.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Post-process the DAG to create weak edges from all uses of a copy to
+/// the one use that defines the copy's source vreg, most likely an induction
+/// variable increment.
+class CopyConstrain : public ScheduleDAGMutation {
+ // Transient state.
+ SlotIndex RegionBeginIdx;
+
+ // RegionEndIdx is the slot index of the last non-debug instruction in the
+ // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
+ SlotIndex RegionEndIdx;
+
+public:
+ CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+protected:
+ void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation>
+createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ return std::make_unique<CopyConstrain>(TII, TRI);
+}
+
+} // end namespace llvm
+
+/// constrainLocalCopy handles two possibilities:
+/// 1) Local src:
+/// I0: = dst
+/// I1: src = ...
+/// I2: = dst
+/// I3: dst = src (copy)
+/// (create pred->succ edges I0->I1, I2->I1)
+///
+/// 2) Local copy:
+/// I0: dst = src (copy)
+/// I1: = dst
+/// I2: src = ...
+/// I3: = dst
+/// (create pred->succ edges I1->I2, I3->I2)
+///
+/// Although the MachineScheduler is currently constrained to single blocks,
+/// this algorithm should handle extended blocks. An EBB is a set of
+/// contiguously numbered blocks such that the previous block in the EBB is
+/// always the single predecessor.
+void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
+ LiveIntervals *LIS = DAG->getLIS();
+ MachineInstr *Copy = CopySU->getInstr();
+
+ // Check for pure vreg copies.
+ const MachineOperand &SrcOp = Copy->getOperand(1);
+ Register SrcReg = SrcOp.getReg();
+ if (!SrcReg.isVirtual() || !SrcOp.readsReg())
+ return;
+
+ const MachineOperand &DstOp = Copy->getOperand(0);
+ Register DstReg = DstOp.getReg();
+ if (!DstReg.isVirtual() || DstOp.isDead())
+ return;
+
+ // Check if either the dest or source is local. If it's live across a back
+ // edge, it's not local. Note that if both vregs are live across the back
+  // edge, we cannot successfully constrain the copy without cyclic scheduling.
+ // If both the copy's source and dest are local live intervals, then we
+ // should treat the dest as the global for the purpose of adding
+ // constraints. This adds edges from source's other uses to the copy.
+ unsigned LocalReg = SrcReg;
+ unsigned GlobalReg = DstReg;
+ LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
+ if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
+ LocalReg = DstReg;
+ GlobalReg = SrcReg;
+ LocalLI = &LIS->getInterval(LocalReg);
+ if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
+ return;
+ }
+ LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
+
+ // Find the global segment after the start of the local LI.
+ LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
+ // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
+ // local live range. We could create edges from other global uses to the local
+ // start, but the coalescer should have already eliminated these cases, so
+ // don't bother dealing with it.
+ if (GlobalSegment == GlobalLI->end())
+ return;
+
+ // If GlobalSegment is killed at the LocalLI->start, the call to find()
+ // returned the next global segment. But if GlobalSegment overlaps with
+ // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
+ // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
+ if (GlobalSegment->contains(LocalLI->beginIndex()))
+ ++GlobalSegment;
+
+ if (GlobalSegment == GlobalLI->end())
+ return;
+
+ // Check if GlobalLI contains a hole in the vicinity of LocalLI.
+ if (GlobalSegment != GlobalLI->begin()) {
+ // Two address defs have no hole.
+ if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->end,
+ GlobalSegment->start)) {
+ return;
+ }
+ // If the prior global segment may be defined by the same two-address
+    // instruction that also defines LocalLI, then we can't make a hole here.
+ if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->start,
+ LocalLI->beginIndex())) {
+ return;
+ }
+ // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
+ // it would be a disconnected component in the live range.
+ assert(std::prev(GlobalSegment)->start < LocalLI->beginIndex() &&
+ "Disconnected LRG within the scheduling region.");
+ }
+ MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
+ if (!GlobalDef)
+ return;
+
+ SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
+ if (!GlobalSU)
+ return;
+
+ // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
+ // constraining the uses of the last local def to precede GlobalDef.
+ SmallVector<SUnit*,8> LocalUses;
+ const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
+ MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
+ SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
+ for (const SDep &Succ : LastLocalSU->Succs) {
+ if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg)
+ continue;
+ if (Succ.getSUnit() == GlobalSU)
+ continue;
+ if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit()))
+ return;
+ LocalUses.push_back(Succ.getSUnit());
+ }
+ // Open the top of the GlobalLI hole by constraining any earlier global uses
+ // to precede the start of LocalLI.
+ SmallVector<SUnit*,8> GlobalUses;
+ MachineInstr *FirstLocalDef =
+ LIS->getInstructionFromIndex(LocalLI->beginIndex());
+ SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
+ for (const SDep &Pred : GlobalSU->Preds) {
+ if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg)
+ continue;
+ if (Pred.getSUnit() == FirstLocalSU)
+ continue;
+ if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit()))
+ return;
+ GlobalUses.push_back(Pred.getSUnit());
+ }
+ LLVM_DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
+ // Add the weak edges.
+ for (SUnit *LU : LocalUses) {
+ LLVM_DEBUG(dbgs() << " Local use SU(" << LU->NodeNum << ") -> SU("
+ << GlobalSU->NodeNum << ")\n");
+ DAG->addEdge(GlobalSU, SDep(LU, SDep::Weak));
+ }
+ for (SUnit *GU : GlobalUses) {
+ LLVM_DEBUG(dbgs() << " Global use SU(" << GU->NodeNum << ") -> SU("
+ << FirstLocalSU->NodeNum << ")\n");
+ DAG->addEdge(FirstLocalSU, SDep(GU, SDep::Weak));
+ }
+}
+
+/// Callback from DAG postProcessing to create weak edges to encourage
+/// copy elimination.
+void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+ assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");
+
+ MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
+ if (FirstPos == DAG->end())
+ return;
+ RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos);
+ RegionEndIdx = DAG->getLIS()->getInstructionIndex(
+ *priorNonDebug(DAG->end(), DAG->begin()));
+
+ for (SUnit &SU : DAG->SUnits) {
+ if (!SU.getInstr()->isCopy())
+ continue;
+
+ constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG));
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
+// and possibly other custom schedulers.
+//===----------------------------------------------------------------------===//
+
+static const unsigned InvalidCycle = ~0U;
+
+SchedBoundary::~SchedBoundary() { delete HazardRec; }
+
+/// Given a Count of resource usage and a Latency value, return true if a
+/// SchedBoundary becomes resource limited.
+/// If we are checking after scheduling a node, we should return true when
+/// we just reach the resource limit.
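+/// For example (illustrative numbers): with LFactor == 2, Latency == 3 and
+/// Count == 8, ResCntFactor = 8 - 3 * 2 = 2, which equals LFactor; so the
+/// check after scheduling a node (>=) reports resource limited, while the
+/// check before scheduling (>) does not.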
+static bool checkResourceLimit(unsigned LFactor, unsigned Count,
+ unsigned Latency, bool AfterSchedNode) {
+ int ResCntFactor = (int)(Count - (Latency * LFactor));
+ if (AfterSchedNode)
+ return ResCntFactor >= (int)LFactor;
+ else
+ return ResCntFactor > (int)LFactor;
+}
+
+void SchedBoundary::reset() {
+ // A new HazardRec is created for each DAG and owned by SchedBoundary.
+ // Destroying and reconstructing it is very expensive though. So keep
+ // invalid, placeholder HazardRecs.
+ if (HazardRec && HazardRec->isEnabled()) {
+ delete HazardRec;
+ HazardRec = nullptr;
+ }
+ Available.clear();
+ Pending.clear();
+ CheckPending = false;
+ CurrCycle = 0;
+ CurrMOps = 0;
+ MinReadyCycle = std::numeric_limits<unsigned>::max();
+ ExpectedLatency = 0;
+ DependentLatency = 0;
+ RetiredMOps = 0;
+ MaxExecutedResCount = 0;
+ ZoneCritResIdx = 0;
+ IsResourceLimited = false;
+ ReservedCycles.clear();
+ ReservedResourceSegments.clear();
+ ReservedCyclesIndex.clear();
+ ResourceGroupSubUnitMasks.clear();
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ // Track the maximum number of stall cycles that could arise either from the
+ // latency of a DAG edge or the number of cycles that a processor resource is
+ // reserved (SchedBoundary::ReservedCycles).
+ MaxObservedStall = 0;
+#endif
+ // Reserve a zero-count for invalid CritResIdx.
+ ExecutedResCounts.resize(1);
+ assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
+}
+
+void SchedRemainder::
+init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
+ reset();
+ if (!SchedModel->hasInstrSchedModel())
+ return;
+ RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
+ for (SUnit &SU : DAG->SUnits) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(&SU);
+ RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC)
+ * SchedModel->getMicroOpFactor();
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned PIdx = PI->ProcResourceIdx;
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ assert(PI->Cycles >= PI->StartAtCycle);
+ RemainingCounts[PIdx] += (Factor * (PI->Cycles - PI->StartAtCycle));
+ }
+ }
+}
+
+void SchedBoundary::
+init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
+ reset();
+ DAG = dag;
+ SchedModel = smodel;
+ Rem = rem;
+ if (SchedModel->hasInstrSchedModel()) {
+ unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
+ ReservedCyclesIndex.resize(ResourceCount);
+ ExecutedResCounts.resize(ResourceCount);
+ ResourceGroupSubUnitMasks.resize(ResourceCount, APInt(ResourceCount, 0));
+ unsigned NumUnits = 0;
+
+ for (unsigned i = 0; i < ResourceCount; ++i) {
+ ReservedCyclesIndex[i] = NumUnits;
+ NumUnits += SchedModel->getProcResource(i)->NumUnits;
+ if (isUnbufferedGroup(i)) {
+ auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin;
+ for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits;
+ U != UE; ++U)
+ ResourceGroupSubUnitMasks[i].setBit(SubUnits[U]);
+ }
+ }
+
+ ReservedCycles.resize(NumUnits, InvalidCycle);
+ }
+}
+
+/// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
+/// these "soft stalls" differently than the hard stall cycles based on CPU
+/// resources and computed by checkHazard(). A fully in-order model
+/// (MicroOpBufferSize==0) will not make use of this since instructions are not
+/// available for scheduling until they are ready. However, a weaker in-order
+/// model may use this for heuristics. For example, if a processor has in-order
+/// behavior when reading certain resources, this may come into play.
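+///
+/// For example (hypothetical cycles): for an unbuffered SU in the bottom zone
+/// with BotReadyCycle == 7 while CurrCycle == 5, the soft stall reported here
+/// is 7 - 5 = 2 cycles.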
+unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) {
+ if (!SU->isUnbuffered)
+ return 0;
+
+ unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+ if (ReadyCycle > CurrCycle)
+ return ReadyCycle - CurrCycle;
+ return 0;
+}
+
+/// Compute the next cycle at which the given processor resource unit
+/// can be scheduled.
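+///
+/// For example (hypothetical cycles, non-interval ReservedCycles path):
+/// scheduling bottom-up with ReservedCycles[InstanceIdx] == 6, CurrCycle == 4
+/// and Cycles == 3, the unit next becomes available at max(4, 6 + 3) = 9.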
+unsigned SchedBoundary::getNextResourceCycleByInstance(unsigned InstanceIdx,
+ unsigned Cycles,
+ unsigned StartAtCycle) {
+ if (SchedModel && SchedModel->enableIntervals()) {
+ if (isTop())
+ return ReservedResourceSegments[InstanceIdx].getFirstAvailableAtFromTop(
+ CurrCycle, StartAtCycle, Cycles);
+
+ return ReservedResourceSegments[InstanceIdx].getFirstAvailableAtFromBottom(
+ CurrCycle, StartAtCycle, Cycles);
+ }
+
+ unsigned NextUnreserved = ReservedCycles[InstanceIdx];
+ // If this resource has never been used, always return cycle zero.
+ if (NextUnreserved == InvalidCycle)
+ return CurrCycle;
+ // For bottom-up scheduling add the cycles needed for the current operation.
+ if (!isTop())
+ NextUnreserved = std::max(CurrCycle, NextUnreserved + Cycles);
+ return NextUnreserved;
+}
+
+/// Compute the next cycle at which the given processor resource can be
+/// scheduled. Returns the next cycle and the index of the processor resource
+/// instance in the reserved cycles vector.
+std::pair<unsigned, unsigned>
+SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx,
+ unsigned Cycles, unsigned StartAtCycle) {
+ if (MischedDetailResourceBooking) {
+ LLVM_DEBUG(dbgs() << " Resource booking (@" << CurrCycle << "c): \n");
+ LLVM_DEBUG(dumpReservedCycles());
+ LLVM_DEBUG(dbgs() << " getNextResourceCycle (@" << CurrCycle << "c): \n");
+ }
+ unsigned MinNextUnreserved = InvalidCycle;
+ unsigned InstanceIdx = 0;
+ unsigned StartIndex = ReservedCyclesIndex[PIdx];
+ unsigned NumberOfInstances = SchedModel->getProcResource(PIdx)->NumUnits;
+ assert(NumberOfInstances > 0 &&
+ "Cannot have zero instances of a ProcResource");
+
+ if (isUnbufferedGroup(PIdx)) {
+ // If any subunits are used by the instruction, report that the resource
+ // group is available at 0, effectively removing the group record from
+ // hazarding and basing the hazarding decisions on the subunit records.
+ // Otherwise, choose the first available instance from among the subunits.
+ // Specifications which assign cycles to both the subunits and the group or
+ // which use an unbuffered group with buffered subunits will appear to
+ // schedule strangely. In the first case, the additional cycles for the
+ // group will be ignored. In the second, the group will be ignored
+ // entirely.
+ for (const MCWriteProcResEntry &PE :
+ make_range(SchedModel->getWriteProcResBegin(SC),
+ SchedModel->getWriteProcResEnd(SC)))
+ if (ResourceGroupSubUnitMasks[PIdx][PE.ProcResourceIdx])
+ return std::make_pair(0u, StartIndex);
+
+ auto SubUnits = SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin;
+ for (unsigned I = 0, End = NumberOfInstances; I < End; ++I) {
+ unsigned NextUnreserved, NextInstanceIdx;
+ std::tie(NextUnreserved, NextInstanceIdx) =
+ getNextResourceCycle(SC, SubUnits[I], Cycles, StartAtCycle);
+ if (MinNextUnreserved > NextUnreserved) {
+ InstanceIdx = NextInstanceIdx;
+ MinNextUnreserved = NextUnreserved;
+ }
+ }
+ return std::make_pair(MinNextUnreserved, InstanceIdx);
+ }
+
+ for (unsigned I = StartIndex, End = StartIndex + NumberOfInstances; I < End;
+ ++I) {
+ unsigned NextUnreserved =
+ getNextResourceCycleByInstance(I, Cycles, StartAtCycle);
+ if (MischedDetailResourceBooking)
+ LLVM_DEBUG(dbgs() << " Instance " << I - StartIndex << " available @"
+ << NextUnreserved << "c\n");
+ if (MinNextUnreserved > NextUnreserved) {
+ InstanceIdx = I;
+ MinNextUnreserved = NextUnreserved;
+ }
+ }
+ if (MischedDetailResourceBooking)
+ LLVM_DEBUG(dbgs() << " selecting " << SchedModel->getResourceName(PIdx)
+ << "[" << InstanceIdx - StartIndex << "]"
+ << " available @" << MinNextUnreserved << "c"
+ << "\n");
+ return std::make_pair(MinNextUnreserved, InstanceIdx);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
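+///
+/// For example (hypothetical widths): with an issue width of 4 and
+/// CurrMOps == 3, an SU that decodes to 2 micro-ops would bring the group to
+/// 5 > 4, so this returns true and the node stays pending for now.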
+bool SchedBoundary::checkHazard(SUnit *SU) {
+ if (HazardRec->isEnabled()
+ && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
+ return true;
+ }
+
+ unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+ if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
+ LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
+ << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
+ return true;
+ }
+
+ if (CurrMOps > 0 &&
+ ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
+ (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
+ LLVM_DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must "
+ << (isTop() ? "begin" : "end") << " group\n");
+ return true;
+ }
+
+ if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (const MCWriteProcResEntry &PE :
+ make_range(SchedModel->getWriteProcResBegin(SC),
+ SchedModel->getWriteProcResEnd(SC))) {
+ unsigned ResIdx = PE.ProcResourceIdx;
+ unsigned Cycles = PE.Cycles;
+ unsigned StartAtCycle = PE.StartAtCycle;
+ unsigned NRCycle, InstanceIdx;
+ std::tie(NRCycle, InstanceIdx) =
+ getNextResourceCycle(SC, ResIdx, Cycles, StartAtCycle);
+ if (NRCycle > CurrCycle) {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ MaxObservedStall = std::max(Cycles, MaxObservedStall);
+#endif
+ LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum << ") "
+ << SchedModel->getResourceName(ResIdx)
+ << '[' << InstanceIdx - ReservedCyclesIndex[ResIdx] << ']'
+ << "=" << NRCycle << "c\n");
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// Find the unscheduled node in ReadySUs with the highest latency.
+unsigned SchedBoundary::
+findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
+ SUnit *LateSU = nullptr;
+ unsigned RemLatency = 0;
+ for (SUnit *SU : ReadySUs) {
+ unsigned L = getUnscheduledLatency(SU);
+ if (L > RemLatency) {
+ RemLatency = L;
+ LateSU = SU;
+ }
+ }
+ if (LateSU) {
+ LLVM_DEBUG(dbgs() << Available.getName() << " RemLatency SU("
+ << LateSU->NodeNum << ") " << RemLatency << "c\n");
+ }
+ return RemLatency;
+}
+
+// Count resources in this zone and the remaining unscheduled
+// instruction. Return the max count, scaled. Set OtherCritIdx to the critical
+// resource index, or zero if the zone is issue limited.
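+//
+// Illustrative example (hypothetical scaled units): with RemIssueCount == 12
+// and RetiredMOps * MicroOpFactor == 8, the issue baseline is 20; a resource
+// with 10 units already counted in this zone and 14 still remaining yields
+// 24 > 20, so it becomes OtherCritIdx and 24 is returned.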
+unsigned SchedBoundary::
+getOtherResourceCount(unsigned &OtherCritIdx) {
+ OtherCritIdx = 0;
+ if (!SchedModel->hasInstrSchedModel())
+ return 0;
+
+ unsigned OtherCritCount = Rem->RemIssueCount
+ + (RetiredMOps * SchedModel->getMicroOpFactor());
+ LLVM_DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
+ << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
+ for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
+ PIdx != PEnd; ++PIdx) {
+ unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
+ if (OtherCount > OtherCritCount) {
+ OtherCritCount = OtherCount;
+ OtherCritIdx = PIdx;
+ }
+ }
+ if (OtherCritIdx) {
+ LLVM_DEBUG(
+ dbgs() << " " << Available.getName() << " + Remain CritRes: "
+ << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
+ << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
+ }
+ return OtherCritCount;
+}
+
+void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle, bool InPQueue,
+ unsigned Idx) {
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+  // ReadyCycle was bumped up to CurrCycle when this node was
+ // scheduled, but CurrCycle may have been eagerly advanced immediately after
+ // scheduling, so may now be greater than ReadyCycle.
+ if (ReadyCycle > CurrCycle)
+ MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall);
+#endif
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ // Check for interlocks first. For the purpose of other heuristics, an
+ // instruction that cannot issue appears as if it's not in the ReadyQueue.
+ bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
+ bool HazardDetected = (!IsBuffered && ReadyCycle > CurrCycle) ||
+ checkHazard(SU) || (Available.size() >= ReadyListLimit);
+
+ if (!HazardDetected) {
+ Available.push(SU);
+
+ if (InPQueue)
+ Pending.remove(Pending.begin() + Idx);
+ return;
+ }
+
+ if (!InPQueue)
+ Pending.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
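+///
+/// For example (hypothetical width): with an issue width of 4, advancing the
+/// boundary by one cycle clears up to 4 micro-ops of the current issue group,
+/// so CurrMOps == 3 drops to 0.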
+void SchedBoundary::bumpCycle(unsigned NextCycle) {
+ if (SchedModel->getMicroOpBufferSize() == 0) {
+ assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
+ "MinReadyCycle uninitialized");
+ if (MinReadyCycle > NextCycle)
+ NextCycle = MinReadyCycle;
+ }
+ // Update the current micro-ops, which will issue in the next cycle.
+ unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
+ CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
+
+ // Decrement DependentLatency based on the next cycle.
+ if ((NextCycle - CurrCycle) > DependentLatency)
+ DependentLatency = 0;
+ else
+ DependentLatency -= (NextCycle - CurrCycle);
+
+ if (!HazardRec->isEnabled()) {
+ // Bypass HazardRec virtual calls.
+ CurrCycle = NextCycle;
+ } else {
+ // Bypass getHazardType calls in case of long latency.
+ for (; CurrCycle != NextCycle; ++CurrCycle) {
+ if (isTop())
+ HazardRec->AdvanceCycle();
+ else
+ HazardRec->RecedeCycle();
+ }
+ }
+ CheckPending = true;
+ IsResourceLimited =
+ checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
+ getScheduledLatency(), true);
+
+ LLVM_DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName()
+ << '\n');
+}
+
+void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
+ ExecutedResCounts[PIdx] += Count;
+ if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
+ MaxExecutedResCount = ExecutedResCounts[PIdx];
+}
+
+/// Add the given processor resource to this scheduled zone.
+///
+/// \param Cycles indicates the number of consecutive (non-pipelined) cycles
+/// during which this resource is consumed.
+///
+/// \return the next cycle at which the instruction may execute without
+/// oversubscribing resources.
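+///
+/// For example (hypothetical model values): with a resource factor of 2,
+/// Cycles == 3 and StartAtCycle == 1, the executed count for PIdx grows by
+/// 2 * (3 - 1) = 4 units, and the same amount is subtracted from the
+/// remaining counts tracked in SchedRemainder.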
+unsigned SchedBoundary::countResource(const MCSchedClassDesc *SC, unsigned PIdx,
+ unsigned Cycles, unsigned NextCycle,
+ unsigned StartAtCycle) {
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ unsigned Count = Factor * (Cycles - StartAtCycle);
+ LLVM_DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx) << " +"
+ << Cycles << "x" << Factor << "u\n");
+
+ // Update Executed resources counts.
+ incExecutedResources(PIdx, Count);
+ assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
+ Rem->RemainingCounts[PIdx] -= Count;
+
+ // Check if this resource exceeds the current critical resource. If so, it
+ // becomes the critical resource.
+ if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
+ ZoneCritResIdx = PIdx;
+ LLVM_DEBUG(dbgs() << " *** Critical resource "
+ << SchedModel->getResourceName(PIdx) << ": "
+ << getResourceCount(PIdx) / SchedModel->getLatencyFactor()
+ << "c\n");
+ }
+ // For reserved resources, record the highest cycle using the resource.
+ unsigned NextAvailable, InstanceIdx;
+ std::tie(NextAvailable, InstanceIdx) =
+ getNextResourceCycle(SC, PIdx, Cycles, StartAtCycle);
+ if (NextAvailable > CurrCycle) {
+ LLVM_DEBUG(dbgs() << " Resource conflict: "
+ << SchedModel->getResourceName(PIdx)
+ << '[' << InstanceIdx - ReservedCyclesIndex[PIdx] << ']'
+ << " reserved until @" << NextAvailable << "\n");
+ }
+ return NextAvailable;
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void SchedBoundary::bumpNode(SUnit *SU) {
+ // Update the reservation table.
+ if (HazardRec->isEnabled()) {
+ if (!isTop() && SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+ HazardRec->EmitInstruction(SU);
+ // Scheduling an instruction may have made pending instructions available.
+ CheckPending = true;
+ }
+ // checkHazard should prevent scheduling multiple instructions per cycle that
+ // exceed the issue width.
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
+ assert(
+ (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
+ "Cannot schedule this instruction's MicroOps in the current cycle.");
+
+ unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+ LLVM_DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
+
+ unsigned NextCycle = CurrCycle;
+ switch (SchedModel->getMicroOpBufferSize()) {
+ case 0:
+ assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
+ break;
+ case 1:
+ if (ReadyCycle > NextCycle) {
+ NextCycle = ReadyCycle;
+ LLVM_DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n");
+ }
+ break;
+ default:
+ // We don't currently model the OOO reorder buffer, so consider all
+ // scheduled MOps to be "retired". We do loosely model in-order resource
+ // latency. If this instruction uses an in-order resource, account for any
+ // likely stall cycles.
+ if (SU->isUnbuffered && ReadyCycle > NextCycle)
+ NextCycle = ReadyCycle;
+ break;
+ }
+ RetiredMOps += IncMOps;
+
+ // Update resource counts and critical resource.
+ if (SchedModel->hasInstrSchedModel()) {
+ unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
+ assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
+ Rem->RemIssueCount -= DecRemIssue;
+ if (ZoneCritResIdx) {
+ // Scale scheduled micro-ops for comparing with the critical resource.
+ unsigned ScaledMOps =
+ RetiredMOps * SchedModel->getMicroOpFactor();
+
+ // If scaled micro-ops are now more than the previous critical resource by
+ // a full cycle, then micro-ops issue becomes critical.
+ if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
+ >= (int)SchedModel->getLatencyFactor()) {
+ ZoneCritResIdx = 0;
+ LLVM_DEBUG(dbgs() << " *** Critical resource NumMicroOps: "
+ << ScaledMOps / SchedModel->getLatencyFactor()
+ << "c\n");
+ }
+ }
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned RCycle = countResource(SC, PI->ProcResourceIdx, PI->Cycles,
+ NextCycle, PI->StartAtCycle);
+ if (RCycle > NextCycle)
+ NextCycle = RCycle;
+ }
+ if (SU->hasReservedResource) {
+ // For reserved resources, record the highest cycle using the resource.
+ // For top-down scheduling, this is the cycle in which we schedule this
+      // instruction plus the number of cycles the operation reserves the
+      // resource. For bottom-up it is simply the instruction's cycle.
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned PIdx = PI->ProcResourceIdx;
+ if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
+
+ if (SchedModel && SchedModel->enableIntervals()) {
+ unsigned ReservedUntil, InstanceIdx;
+ std::tie(ReservedUntil, InstanceIdx) =
+ getNextResourceCycle(SC, PIdx, PI->Cycles, PI->StartAtCycle);
+ if (isTop()) {
+ ReservedResourceSegments[InstanceIdx].add(
+ ResourceSegments::getResourceIntervalTop(
+ NextCycle, PI->StartAtCycle, PI->Cycles),
+ MIResourceCutOff);
+ } else {
+ ReservedResourceSegments[InstanceIdx].add(
+ ResourceSegments::getResourceIntervalBottom(
+ NextCycle, PI->StartAtCycle, PI->Cycles),
+ MIResourceCutOff);
+ }
+ } else {
+
+ unsigned ReservedUntil, InstanceIdx;
+ std::tie(ReservedUntil, InstanceIdx) =
+ getNextResourceCycle(SC, PIdx, PI->Cycles, PI->StartAtCycle);
+ if (isTop()) {
+ ReservedCycles[InstanceIdx] =
+ std::max(ReservedUntil, NextCycle + PI->Cycles);
+ } else
+ ReservedCycles[InstanceIdx] = NextCycle;
+ }
+ }
+ }
+ }
+ }
+ // Update ExpectedLatency and DependentLatency.
+ unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
+ unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
+ if (SU->getDepth() > TopLatency) {
+ TopLatency = SU->getDepth();
+ LLVM_DEBUG(dbgs() << " " << Available.getName() << " TopLatency SU("
+ << SU->NodeNum << ") " << TopLatency << "c\n");
+ }
+ if (SU->getHeight() > BotLatency) {
+ BotLatency = SU->getHeight();
+ LLVM_DEBUG(dbgs() << " " << Available.getName() << " BotLatency SU("
+ << SU->NodeNum << ") " << BotLatency << "c\n");
+ }
+ // If we stall for any reason, bump the cycle.
+ if (NextCycle > CurrCycle)
+ bumpCycle(NextCycle);
+ else
+ // After updating ZoneCritResIdx and ExpectedLatency, check if we're
+ // resource limited. If a stall occurred, bumpCycle does this.
+ IsResourceLimited =
+ checkResourceLimit(SchedModel->getLatencyFactor(), getCriticalCount(),
+ getScheduledLatency(), true);
+
+ // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle
+ // resets CurrMOps. Loop to handle instructions with more MOps than issue in
+ // one cycle. Since we commonly reach the max MOps here, opportunistically
+ // bump the cycle to avoid uselessly checking everything in the readyQ.
+ CurrMOps += IncMOps;
+
+ // Bump the cycle count for issue group constraints.
+  // This must be done after NextCycle has been adjusted for all other stalls.
+  // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
+  // CurrCycle to X.
+ if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) ||
+ (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
+ LLVM_DEBUG(dbgs() << " Bump cycle to " << (isTop() ? "end" : "begin")
+ << " group\n");
+ bumpCycle(++NextCycle);
+ }
+
+ while (CurrMOps >= SchedModel->getIssueWidth()) {
+ LLVM_DEBUG(dbgs() << " *** Max MOps " << CurrMOps << " at cycle "
+ << CurrCycle << '\n');
+ bumpCycle(++NextCycle);
+ }
+ LLVM_DEBUG(dumpScheduledState());
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void SchedBoundary::releasePending() {
+ // If the available queue is empty, it is safe to reset MinReadyCycle.
+ if (Available.empty())
+ MinReadyCycle = std::numeric_limits<unsigned>::max();
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned I = 0, E = Pending.size(); I < E; ++I) {
+ SUnit *SU = *(Pending.begin() + I);
+ unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ if (Available.size() >= ReadyListLimit)
+ break;
+
+ releaseNode(SU, ReadyCycle, true, I);
+ if (E != Pending.size()) {
+ --I;
+ --E;
+ }
+ }
+ CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void SchedBoundary::removeReady(SUnit *SU) {
+ if (Available.isInQueue(SU))
+ Available.remove(Available.find(SU));
+ else {
+ assert(Pending.isInQueue(SU) && "bad ready count");
+ Pending.remove(Pending.find(SU));
+ }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// defer any nodes that now hit a hazard, and advance the cycle until at least
+/// one node is ready. If multiple instructions are ready, return NULL.
+SUnit *SchedBoundary::pickOnlyChoice() {
+ if (CheckPending)
+ releasePending();
+
+ // Defer any ready instrs that now have a hazard.
+ for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
+ if (checkHazard(*I)) {
+ Pending.push(*I);
+ I = Available.remove(I);
+ continue;
+ }
+ ++I;
+ }
+ for (unsigned i = 0; Available.empty(); ++i) {
+// FIXME: Re-enable assert once PR20057 is resolved.
+// assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
+// "permanent hazard");
+ (void)i;
+ bumpCycle(CurrCycle + 1);
+ releasePending();
+ }
+
+ LLVM_DEBUG(Pending.dump());
+ LLVM_DEBUG(Available.dump());
+
+ if (Available.size() == 1)
+ return *Available.begin();
+ return nullptr;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+/// Dump the content of the \ref ReservedCycles vector for the
+/// resources that are used in the basic block.
+///
+LLVM_DUMP_METHOD void SchedBoundary::dumpReservedCycles() const {
+ if (!SchedModel->hasInstrSchedModel())
+ return;
+
+ unsigned ResourceCount = SchedModel->getNumProcResourceKinds();
+ unsigned StartIdx = 0;
+
+ for (unsigned ResIdx = 0; ResIdx < ResourceCount; ++ResIdx) {
+ const unsigned NumUnits = SchedModel->getProcResource(ResIdx)->NumUnits;
+ std::string ResName = SchedModel->getResourceName(ResIdx);
+ for (unsigned UnitIdx = 0; UnitIdx < NumUnits; ++UnitIdx) {
+ dbgs() << ResName << "(" << UnitIdx << ") = ";
+ if (SchedModel && SchedModel->enableIntervals()) {
+ if (ReservedResourceSegments.count(StartIdx + UnitIdx))
+ dbgs() << ReservedResourceSegments.at(StartIdx + UnitIdx);
+ else
+ dbgs() << "{ }\n";
+ } else
+ dbgs() << ReservedCycles[StartIdx + UnitIdx] << "\n";
+ }
+ StartIdx += NumUnits;
+ }
+}
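+// Illustrative output (hypothetical resource names and values): with intervals
+// disabled, the dump above prints one line per resource unit of the form
+//   ALU(0) = 7
+//   ALU(1) = 5
+// i.e. the cycle up to which each unit is currently reserved; with intervals
+// enabled it prints the reserved segments instead, or "{ }" when none exist.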
+
+// This is useful information to dump after bumpNode.
+// Note that the Queue contents are more useful before pickNodeFromQueue.
+LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const {
+ unsigned ResFactor;
+ unsigned ResCount;
+ if (ZoneCritResIdx) {
+ ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
+ ResCount = getResourceCount(ZoneCritResIdx);
+ } else {
+ ResFactor = SchedModel->getMicroOpFactor();
+ ResCount = RetiredMOps * ResFactor;
+ }
+ unsigned LFactor = SchedModel->getLatencyFactor();
+ dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
+ << " Retired: " << RetiredMOps;
+ dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c";
+ dbgs() << "\n Critical: " << ResCount / LFactor << "c, "
+ << ResCount / ResFactor << " "
+ << SchedModel->getResourceName(ZoneCritResIdx)
+ << "\n ExpectedLatency: " << ExpectedLatency << "c\n"
+ << (IsResourceLimited ? " - Resource" : " - Latency")
+ << " limited.\n";
+ if (MISchedDumpReservedCycles)
+ dumpReservedCycles();
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// GenericScheduler - Generic implementation of MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+void GenericSchedulerBase::SchedCandidate::
+initResourceDelta(const ScheduleDAGMI *DAG,
+ const TargetSchedModel *SchedModel) {
+ if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
+ return;
+
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ if (PI->ProcResourceIdx == Policy.ReduceResIdx)
+ ResDelta.CritResources += PI->Cycles;
+ if (PI->ProcResourceIdx == Policy.DemandResIdx)
+ ResDelta.DemandedResources += PI->Cycles;
+ }
+}
+
+/// Compute remaining latency. We need this both to determine whether the
+/// overall schedule has become latency-limited and whether the instructions
+/// outside this zone are resource or latency limited.
+///
+/// The "dependent" latency is updated incrementally during scheduling as the
+/// max height/depth of scheduled nodes minus the cycles since it was
+/// scheduled:
+/// DLat = max(N.depth - (CurrCycle - N.ReadyCycle)) for N in Zone
+///
+/// The "independent" latency is the max ready queue depth:
+/// ILat = max N.depth for N in Available|Pending
+///
+/// RemainingLatency is the greater of independent and dependent latency.
+///
+/// These computations are expensive, especially in DAGs with many edges, so
+/// only do them if necessary.
+static unsigned computeRemLatency(SchedBoundary &CurrZone) {
+ unsigned RemLatency = CurrZone.getDependentLatency();
+ RemLatency = std::max(RemLatency,
+ CurrZone.findMaxLatency(CurrZone.Available.elements()));
+ RemLatency = std::max(RemLatency,
+ CurrZone.findMaxLatency(CurrZone.Pending.elements()));
+ return RemLatency;
+}
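+// Worked example (hypothetical numbers): if the zone's dependent latency is 4,
+// the largest remaining latency found among the Available nodes is 7, and the
+// largest among the Pending nodes is 5, computeRemLatency returns
+//   max(4, 7, 5) = 7
+// and the zone becomes latency limited once CurrCycle + 7 exceeds the region's
+// critical path (see shouldReduceLatency below).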
+
+/// Returns true if the current cycle plus remaining latency is greater than
+/// the critical path in the scheduling region.
+bool GenericSchedulerBase::shouldReduceLatency(const CandPolicy &Policy,
+ SchedBoundary &CurrZone,
+ bool ComputeRemLatency,
+ unsigned &RemLatency) const {
+ // The current cycle is already greater than the critical path, so we are
+ // already latency limited and don't need to compute the remaining latency.
+ if (CurrZone.getCurrCycle() > Rem.CriticalPath)
+ return true;
+
+ // If we haven't scheduled anything yet, then we aren't latency limited.
+ if (CurrZone.getCurrCycle() == 0)
+ return false;
+
+ if (ComputeRemLatency)
+ RemLatency = computeRemLatency(CurrZone);
+
+ return RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath;
+}
+
+/// Set the CandPolicy for a scheduling zone, given the current resources and
+/// latencies inside and outside the zone.
+void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
+ SchedBoundary &CurrZone,
+ SchedBoundary *OtherZone) {
+ // Apply preemptive heuristics based on the total latency and resources
+ // inside and outside this zone. Potential stalls should be considered before
+ // following this policy.
+
+ // Compute the critical resource outside the zone.
+ unsigned OtherCritIdx = 0;
+ unsigned OtherCount =
+ OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
+
+ bool OtherResLimited = false;
+ unsigned RemLatency = 0;
+ bool RemLatencyComputed = false;
+ if (SchedModel->hasInstrSchedModel() && OtherCount != 0) {
+ RemLatency = computeRemLatency(CurrZone);
+ RemLatencyComputed = true;
+ OtherResLimited = checkResourceLimit(SchedModel->getLatencyFactor(),
+ OtherCount, RemLatency, false);
+ }
+
+ // Schedule aggressively for latency in PostRA mode. We don't check for
+ // acyclic latency during PostRA, and highly out-of-order processors will
+ // skip PostRA scheduling.
+ if (!OtherResLimited &&
+ (IsPostRA || shouldReduceLatency(Policy, CurrZone, !RemLatencyComputed,
+ RemLatency))) {
+ Policy.ReduceLatency |= true;
+ LLVM_DEBUG(dbgs() << " " << CurrZone.Available.getName()
+ << " RemainingLatency " << RemLatency << " + "
+ << CurrZone.getCurrCycle() << "c > CritPath "
+ << Rem.CriticalPath << "\n");
+ }
+ // If the same resource is limiting inside and outside the zone, do nothing.
+ if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
+ return;
+
+ LLVM_DEBUG(if (CurrZone.isResourceLimited()) {
+ dbgs() << " " << CurrZone.Available.getName() << " ResourceLimited: "
+ << SchedModel->getResourceName(CurrZone.getZoneCritResIdx()) << "\n";
+ } if (OtherResLimited) dbgs()
+ << " RemainingLimit: "
+ << SchedModel->getResourceName(OtherCritIdx) << "\n";
+ if (!CurrZone.isResourceLimited() && !OtherResLimited) dbgs()
+ << " Latency limited both directions.\n");
+
+ if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx)
+ Policy.ReduceResIdx = CurrZone.getZoneCritResIdx();
+
+ if (OtherResLimited)
+ Policy.DemandResIdx = OtherCritIdx;
+}
+
+#ifndef NDEBUG
+const char *GenericSchedulerBase::getReasonStr(
+ GenericSchedulerBase::CandReason Reason) {
+ switch (Reason) {
+ case NoCand: return "NOCAND ";
+ case Only1: return "ONLY1 ";
+ case PhysReg: return "PHYS-REG ";
+ case RegExcess: return "REG-EXCESS";
+ case RegCritical: return "REG-CRIT ";
+ case Stall: return "STALL ";
+ case Cluster: return "CLUSTER ";
+ case Weak: return "WEAK ";
+ case RegMax: return "REG-MAX ";
+ case ResourceReduce: return "RES-REDUCE";
+ case ResourceDemand: return "RES-DEMAND";
+ case TopDepthReduce: return "TOP-DEPTH ";
+ case TopPathReduce: return "TOP-PATH ";
+ case BotHeightReduce:return "BOT-HEIGHT";
+ case BotPathReduce: return "BOT-PATH ";
+ case NextDefUse: return "DEF-USE ";
+ case NodeOrder: return "ORDER ";
+ };
+ llvm_unreachable("Unknown reason!");
+}
+
+void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
+ PressureChange P;
+ unsigned ResIdx = 0;
+ unsigned Latency = 0;
+ switch (Cand.Reason) {
+ default:
+ break;
+ case RegExcess:
+ P = Cand.RPDelta.Excess;
+ break;
+ case RegCritical:
+ P = Cand.RPDelta.CriticalMax;
+ break;
+ case RegMax:
+ P = Cand.RPDelta.CurrentMax;
+ break;
+ case ResourceReduce:
+ ResIdx = Cand.Policy.ReduceResIdx;
+ break;
+ case ResourceDemand:
+ ResIdx = Cand.Policy.DemandResIdx;
+ break;
+ case TopDepthReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ case TopPathReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotHeightReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotPathReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ }
+ dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+ if (P.isValid())
+ dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
+ << ":" << P.getUnitInc() << " ";
+ else
+ dbgs() << " ";
+ if (ResIdx)
+ dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
+ else
+ dbgs() << " ";
+ if (Latency)
+ dbgs() << " " << Latency << " cycles ";
+ else
+ dbgs() << " ";
+ dbgs() << '\n';
+}
+#endif
+
+namespace llvm {
+/// Return true if this heuristic determines order.
+/// TODO: Consider refactor return type of these functions as integer or enum,
+/// as we may need to differentiate whether TryCand is better than Cand.
+bool tryLess(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
+ if (TryVal < CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal > CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ return false;
+}
+
+bool tryGreater(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
+ if (TryVal > CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal < CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ return false;
+}
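+// Usage sketch (descriptive, mirroring the Stall check in
+// GenericScheduler::tryCandidate below): callers compare one feature at a
+// time, e.g.
+//   if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+//               Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+//     return TryCand.Reason != NoCand;
+// A true return means this feature decided the comparison: either TryCand won
+// and its Reason was set, or Cand won and Cand.Reason was strengthened to this
+// Reason so that debug traces report why; the caller then stops evaluating
+// weaker heuristics.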
+
+bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ SchedBoundary &Zone) {
+ if (Zone.isTop()) {
+ // Prefer the candidate with the lesser depth, but only if one of them has
+ // depth greater than the total latency scheduled so far, otherwise either
+ // of them could be scheduled now with no stall.
+ if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
+ Zone.getScheduledLatency()) {
+ if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
+ return true;
+ }
+ if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericSchedulerBase::TopPathReduce))
+ return true;
+ } else {
+ // Prefer the candidate with the lesser height, but only if one of them has
+ // height greater than the total latency scheduled so far, otherwise either
+ // of them could be scheduled now with no stall.
+ if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
+ Zone.getScheduledLatency()) {
+ if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
+ return true;
+ }
+ if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, GenericSchedulerBase::BotPathReduce))
+ return true;
+ }
+ return false;
+}
+} // end namespace llvm
+
+static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
+ LLVM_DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
+ << GenericSchedulerBase::getReasonStr(Reason) << '\n');
+}
+
+static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
+ tracePick(Cand.Reason, Cand.AtTop);
+}
+
+void GenericScheduler::initialize(ScheduleDAGMI *dag) {
+ assert(dag->hasVRegLiveness() &&
+ "(PreRA)GenericScheduler needs vreg liveness");
+ DAG = static_cast<ScheduleDAGMILive*>(dag);
+ SchedModel = DAG->getSchedModel();
+ TRI = DAG->TRI;
+
+ if (RegionPolicy.ComputeDFSResult)
+ DAG->computeDFSResult();
+
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ Bot.init(DAG, SchedModel, &Rem);
+
+ // Initialize resource counts.
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+ // are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
+ if (!Top.HazardRec) {
+ Top.HazardRec =
+ DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+ Itin, DAG);
+ }
+ if (!Bot.HazardRec) {
+ Bot.HazardRec =
+ DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+ Itin, DAG);
+ }
+ TopCand.SU = nullptr;
+ BotCand.SU = nullptr;
+}
+
+/// Initialize the per-region scheduling policy.
+void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ const MachineFunction &MF = *Begin->getMF();
+ const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
+
+ // Avoid setting up the register pressure tracker for small regions to save
+ // compile time. As a rough heuristic, only track pressure when the number of
+ // schedulable instructions exceeds half the integer register file.
+ RegionPolicy.ShouldTrackPressure = true;
+ for (unsigned VT = MVT::i32; VT > (unsigned)MVT::i1; --VT) {
+ MVT::SimpleValueType LegalIntVT = (MVT::SimpleValueType)VT;
+ if (TLI->isTypeLegal(LegalIntVT)) {
+ unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
+ TLI->getRegClassFor(LegalIntVT));
+ RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
+ }
+ }
+
+ // For generic targets, we default to bottom-up, because it's simpler and more
+ // compile-time optimizations have been implemented in that direction.
+ RegionPolicy.OnlyBottomUp = true;
+
+ // Allow the subtarget to override default policy.
+ MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
+
+ // After subtarget overrides, apply command line options.
+ if (!EnableRegPressure) {
+ RegionPolicy.ShouldTrackPressure = false;
+ RegionPolicy.ShouldTrackLaneMasks = false;
+ }
+
+ // Check -misched-topdown/bottomup can force or unforce scheduling direction.
+ // e.g. -misched-bottomup=false allows scheduling in both directions.
+ assert((!ForceTopDown || !ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+ if (ForceBottomUp.getNumOccurrences() > 0) {
+ RegionPolicy.OnlyBottomUp = ForceBottomUp;
+ if (RegionPolicy.OnlyBottomUp)
+ RegionPolicy.OnlyTopDown = false;
+ }
+ if (ForceTopDown.getNumOccurrences() > 0) {
+ RegionPolicy.OnlyTopDown = ForceTopDown;
+ if (RegionPolicy.OnlyTopDown)
+ RegionPolicy.OnlyBottomUp = false;
+ }
+}
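+// Worked example (hypothetical numbers): if the last legal integer type
+// checked in the loop above has 32 allocatable registers, pressure tracking
+// remains enabled only for regions with more than 32 / 2 = 16 schedulable
+// instructions, before the subtarget override and the EnableRegPressure
+// option are applied; smaller regions skip the tracker to save compile time.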
+
+void GenericScheduler::dumpPolicy() const {
+ // Cannot completely remove virtual function even in release mode.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dbgs() << "GenericScheduler RegionPolicy: "
+ << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
+ << " OnlyTopDown=" << RegionPolicy.OnlyTopDown
+ << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
+ << "\n";
+#endif
+}
+
+/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
+/// critical path by more cycles than it takes to drain the instruction buffer.
+/// We estimate an upper bound on in-flight instructions as:
+///
+/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
+/// InFlightIterations = AcyclicPath / CyclesPerIteration
+/// InFlightResources = InFlightIterations * LoopResources
+///
+/// TODO: Check execution resources in addition to IssueCount.
+void GenericScheduler::checkAcyclicLatency() {
+ if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
+ return;
+
+ // Scaled number of cycles per loop iteration.
+ unsigned IterCount =
+ std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
+ Rem.RemIssueCount);
+ // Scaled acyclic critical path.
+ unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
+ // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
+ unsigned InFlightCount =
+ (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
+ unsigned BufferLimit =
+ SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+
+ Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
+
+ LLVM_DEBUG(
+ dbgs() << "IssueCycles="
+ << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
+ << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
+ << "c NumIters=" << (AcyclicCount + IterCount - 1) / IterCount
+ << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
+ << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
+ if (Rem.IsAcyclicLatencyLimited) dbgs() << " ACYCLIC LATENCY LIMIT\n");
+}
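+// Worked example using the formulas in the comment above checkAcyclicLatency
+// (hypothetical numbers, ignoring the latency/micro-op scale factors): with an
+// acyclic path of 40 cycles, a cyclic critical path of 10 cycles, a loop
+// resource height of 12 cycles and 24 micro-ops issued per iteration,
+//   CyclesPerIteration = max(10, 12)  = 12
+//   InFlightIterations = 40 / 12     ~= 3.3
+//   InFlightResources  = 3.3 * 24    ~= 80 micro-ops
+// so the loop is flagged acyclic-latency limited whenever roughly 80 micro-ops
+// exceeds the machine model's micro-op buffer size.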
+
+void GenericScheduler::registerRoots() {
+ Rem.CriticalPath = DAG->ExitSU.getDepth();
+
+ // Some roots may not feed into ExitSU. Check all of them in case.
+ for (const SUnit *SU : Bot.Available) {
+ if (SU->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = SU->getDepth();
+ }
+ LLVM_DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
+ if (DumpCriticalPathLength) {
+ errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
+ }
+
+ if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) {
+ Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+ checkAcyclicLatency();
+ }
+}
+
+namespace llvm {
+bool tryPressure(const PressureChange &TryP,
+ const PressureChange &CandP,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) {
+ // If one candidate decreases and the other increases, go with it.
+ // Invalid candidates have UnitInc==0.
+ if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
+ Reason)) {
+ return true;
+ }
+ // Do not compare the magnitude of pressure changes between top and bottom
+ // boundary.
+ if (Cand.AtTop != TryCand.AtTop)
+ return false;
+
+ // If both candidates affect the same set in the same boundary, go with the
+ // smallest increase.
+ unsigned TryPSet = TryP.getPSetOrMax();
+ unsigned CandPSet = CandP.getPSetOrMax();
+ if (TryPSet == CandPSet) {
+ return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
+ Reason);
+ }
+
+ int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
+ std::numeric_limits<int>::max();
+
+ int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) :
+ std::numeric_limits<int>::max();
+
+ // If the candidates are decreasing pressure, reverse priority.
+ if (TryP.getUnitInc() < 0)
+ std::swap(TryRank, CandRank);
+ return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
+}
+
+unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+ return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
+}
+
+/// Minimize physical register live ranges. Regalloc wants them adjacent to
+/// their physreg def/use.
+///
+/// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
+/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
+/// with the operation that produces or consumes the physreg. We'll do this when
+/// regalloc has support for parallel copies.
+int biasPhysReg(const SUnit *SU, bool isTop) {
+ const MachineInstr *MI = SU->getInstr();
+
+ if (MI->isCopy()) {
+ unsigned ScheduledOper = isTop ? 1 : 0;
+ unsigned UnscheduledOper = isTop ? 0 : 1;
+    // If we have already scheduled the physreg producer/consumer, immediately
+ // schedule the copy.
+ if (MI->getOperand(ScheduledOper).getReg().isPhysical())
+ return 1;
+ // If the physreg is at the boundary, defer it. Otherwise schedule it
+ // immediately to free the dependent. We can hoist the copy later.
+ bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
+ if (MI->getOperand(UnscheduledOper).getReg().isPhysical())
+ return AtBoundary ? -1 : 1;
+ }
+
+ if (MI->isMoveImmediate()) {
+ // If we have a move immediate and all successors have been assigned, bias
+ // towards scheduling this later. Make sure all register defs are to
+ // physical registers.
+ bool DoBias = true;
+ for (const MachineOperand &Op : MI->defs()) {
+ if (Op.isReg() && !Op.getReg().isPhysical()) {
+ DoBias = false;
+ break;
+ }
+ }
+
+ if (DoBias)
+ return isTop ? -1 : 1;
+ }
+
+ return 0;
+}
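+// Illustrative example (hypothetical registers and register class): for a copy
+// such as
+//   %1:gr32 = COPY $edi
+// evaluated at the top boundary, the scheduled-side operand (the $edi source)
+// is physical, so biasPhysReg returns 1 and the copy is scheduled immediately,
+// keeping it adjacent to its physreg def/use; the symmetric checks handle the
+// bottom boundary, and a physreg on the not-yet-scheduled side can yield -1 to
+// defer a copy that sits at the region boundary.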
+} // end namespace llvm
+
+void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker) {
+ Cand.SU = SU;
+ Cand.AtTop = AtTop;
+ if (DAG->isTrackingPressure()) {
+ if (AtTop) {
+ TempTracker.getMaxDownwardPressureDelta(
+ Cand.SU->getInstr(),
+ Cand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ } else {
+ if (VerifyScheduling) {
+ TempTracker.getMaxUpwardPressureDelta(
+ Cand.SU->getInstr(),
+ &DAG->getPressureDiff(Cand.SU),
+ Cand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ } else {
+ RPTracker.getUpwardPressureDelta(
+ Cand.SU->getInstr(),
+ DAG->getPressureDiff(Cand.SU),
+ Cand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ }
+ }
+ }
+ LLVM_DEBUG(if (Cand.RPDelta.Excess.isValid()) dbgs()
+ << " Try SU(" << Cand.SU->NodeNum << ") "
+ << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet()) << ":"
+ << Cand.RPDelta.Excess.getUnitInc() << "\n");
+}
+
+/// Apply a set of heuristics to a new candidate. Heuristics are currently
+/// hierarchical. This may be more efficient than a graduated cost model because
+/// we don't need to evaluate all aspects of the model for each node in the
+/// queue. But it's really done to make the heuristics easier to debug and
+/// statistically analyze.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending, or nullptr
+/// if Cand is from a different zone than TryCand.
+/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
+bool GenericScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+  // Bias PhysReg Defs and copies to their uses and definitions, respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
+ Cand.RPDelta.Excess,
+ TryCand, Cand, RegExcess, TRI,
+ DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
+ Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI,
+ DAG->MF))
+ return TryCand.Reason != NoCand;
+
+  // We only compare a subset of features when comparing nodes between the
+  // Top and Bottom boundaries. Some properties are simply incomparable; in
+  // many other instances we should only override the other boundary if
+  // something is a clear good pick on one boundary. Skip heuristics that are
+  // more "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // For loops that are acyclic path limited, aggressively schedule for
+    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
+ // heuristics to take precedence.
+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+ tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Keep clustered nodes together to encourage downstream peephole
+ // optimizations which may reduce resource requirements.
+ //
+ // This is a best effort to set things up for a post-RA pass. Optimizations
+ // like generating loads of multiple registers should ideally be done within
+ // the scheduler pass by combining the loads during DAG postprocessing.
+ const SUnit *CandNextClusterSU =
+ Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ const SUnit *TryCandNextClusterSU =
+ TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+ Cand.SU == CandNextClusterSU,
+ TryCand, Cand, Cluster))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop),
+ TryCand, Cand, Weak))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
+ Cand.RPDelta.CurrentMax,
+ TryCand, Cand, RegMax, TRI,
+ DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources,
+ TryCand, Cand, ResourceDemand))
+ return TryCand.Reason != NoCand;
+
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
+ || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+ }
+
+ return false;
+}
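+// Descriptive note (derived from the code above): the heuristics run strictly
+// in order: physreg bias, excess pressure, critical-set pressure, then (same
+// boundary only) acyclic-latency and stall checks, clustering, (same boundary
+// only) weak edges, max region pressure, and finally (same boundary only)
+// resource reduction/demand, latency, and original node order as the last
+// tie-breaker.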
+
+/// Pick the best candidate from the queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+ const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand) {
+ // getMaxPressureDelta temporarily modifies the tracker.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+ ReadyQueue &Q = Zone.Available;
+ for (SUnit *SU : Q) {
+
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ if (tryCandidate(Cand, TryCand, ZoneArg)) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(DAG, SchedModel);
+ Cand.setBest(TryCand);
+ LLVM_DEBUG(traceCandidate(Cand));
+ }
+ }
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
+ // Schedule as far as possible in the direction of no choice. This is most
+ // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) {
+ IsTopNode = false;
+ tracePick(Only1, false);
+ return SU;
+ }
+ if (SUnit *SU = Top.pickOnlyChoice()) {
+ IsTopNode = true;
+ tracePick(Only1, true);
+ return SU;
+ }
+ // Set the bottom-up policy based on the state of the current bottom zone and
+ // the instructions outside the zone, including the top zone.
+ CandPolicy BotPolicy;
+ setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
+ // Set the top-down policy based on the state of the current top zone and
+ // the instructions outside the zone, including the bottom zone.
+ CandPolicy TopPolicy;
+ setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+
+ // See if BotCand is still valid (because we previously scheduled from Top).
+ LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
+ if (!BotCand.isValid() || BotCand.SU->isScheduled ||
+ BotCand.Policy != BotPolicy) {
+ BotCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ LLVM_DEBUG(traceCandidate(BotCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
+ assert(TCand.SU == BotCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
+ }
+
+ // Check if the top Q has a better candidate.
+ LLVM_DEBUG(dbgs() << "Picking from Top:\n");
+ if (!TopCand.isValid() || TopCand.SU->isScheduled ||
+ TopCand.Policy != TopPolicy) {
+ TopCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ LLVM_DEBUG(traceCandidate(TopCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
+ assert(TCand.SU == TopCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
+ }
+
+ // Pick best from BotCand and TopCand.
+ assert(BotCand.isValid());
+ assert(TopCand.isValid());
+ SchedCandidate Cand = BotCand;
+ TopCand.Reason = NoCand;
+ if (tryCandidate(Cand, TopCand, nullptr)) {
+ Cand.setBest(TopCand);
+ LLVM_DEBUG(traceCandidate(Cand));
+ }
+
+ IsTopNode = Cand.AtTop;
+ tracePick(Cand);
+ return Cand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return nullptr;
+ }
+ SUnit *SU;
+ do {
+ if (RegionPolicy.OnlyTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ TopCand.reset(NoPolicy);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand);
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } else if (RegionPolicy.OnlyBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ BotCand.reset(NoPolicy);
+ pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(BotCand);
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
+ } else {
+ SU = pickNodeBidirectional(IsTopNode);
+ }
+ } while (SU->isScheduled);
+
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
+ return SU;
+}
+
+void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
+ MachineBasicBlock::iterator InsertPos = SU->getInstr();
+ if (!isTop)
+ ++InsertPos;
+ SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs;
+
+ // Find already scheduled copies with a single physreg dependence and move
+ // them just above the scheduled instruction.
+ for (SDep &Dep : Deps) {
+ if (Dep.getKind() != SDep::Data ||
+ !Register::isPhysicalRegister(Dep.getReg()))
+ continue;
+ SUnit *DepSU = Dep.getSUnit();
+ if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
+ continue;
+ MachineInstr *Copy = DepSU->getInstr();
+ if (!Copy->isCopy() && !Copy->isMoveImmediate())
+ continue;
+ LLVM_DEBUG(dbgs() << " Rescheduling physreg copy ";
+ DAG->dumpNode(*Dep.getSUnit()));
+ DAG->moveInstruction(Copy, InsertPos);
+ }
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
+/// update its state based on the current cycle before MachineSchedStrategy
+/// does.
+///
+/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
+/// them here. See comments in biasPhysReg.
+void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ if (IsTopNode) {
+ SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
+ Top.bumpNode(SU);
+ if (SU->hasPhysRegUses)
+ reschedulePhysReg(SU, true);
+ } else {
+ SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
+ Bot.bumpNode(SU);
+ if (SU->hasPhysRegDefs)
+ reschedulePhysReg(SU, false);
+ }
+}
+
+/// Create the standard converging machine scheduler. This will be used as the
+/// default scheduler if the target does not set a default.
+ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
+ ScheduleDAGMILive *DAG =
+ new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
+ // Register DAG post-processors.
+ //
+ // FIXME: extend the mutation API to allow earlier mutations to instantiate
+ // data and pass it to later mutations. Have a single mutation that gathers
+ // the interesting nodes in one pass.
+ DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+}
+
+static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
+ return createGenericSchedLive(C);
+}
+
+static MachineSchedRegistry
+GenericSchedRegistry("converge", "Standard converging scheduler.",
+ createConvergingSched);
+
+//===----------------------------------------------------------------------===//
+// PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
+ DAG = Dag;
+ SchedModel = DAG->getSchedModel();
+ TRI = DAG->TRI;
+
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ BotRoots.clear();
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
+ // or are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
+ if (!Top.HazardRec) {
+ Top.HazardRec =
+ DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+ Itin, DAG);
+ }
+}
+
+void PostGenericScheduler::registerRoots() {
+ Rem.CriticalPath = DAG->ExitSU.getDepth();
+
+ // Some roots may not feed into ExitSU. Check all of them in case.
+ for (const SUnit *SU : BotRoots) {
+ if (SU->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = SU->getDepth();
+ }
+ LLVM_DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
+ if (DumpCriticalPathLength) {
+ errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n";
+ }
+}
+
+/// Apply a set of heuristics to a new candidate for PostRA scheduling.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
+bool PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand) {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
+ Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+
+ // Keep clustered nodes together.
+ if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
+ Cand.SU == DAG->getNextClusterSucc(),
+ TryCand, Cand, Cluster))
+ return TryCand.Reason != NoCand;
+
+ // Avoid critical resource consumption and balance the schedule.
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources,
+ TryCand, Cand, ResourceDemand))
+ return TryCand.Reason != NoCand;
+
+ // Avoid serializing long latency dependence chains.
+ if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
+ return TryCand.Reason != NoCand;
+ }
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ return false;
+}
+
+void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
+ ReadyQueue &Q = Top.Available;
+ for (SUnit *SU : Q) {
+ SchedCandidate TryCand(Cand.Policy);
+ TryCand.SU = SU;
+ TryCand.AtTop = true;
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryCandidate(Cand, TryCand)) {
+ Cand.setBest(TryCand);
+ LLVM_DEBUG(traceCandidate(Cand));
+ }
+ }
+}
+
+/// Pick the next node to schedule.
+SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
+ return nullptr;
+ }
+ SUnit *SU;
+ do {
+ SU = Top.pickOnlyChoice();
+ if (SU) {
+ tracePick(Only1, true);
+ } else {
+ CandPolicy NoPolicy;
+ SchedCandidate TopCand(NoPolicy);
+ // Set the top-down policy based on the state of the current top zone and
+ // the instructions outside the zone, including the bottom zone.
+ setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
+ pickNodeFromQueue(TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand);
+ SU = TopCand.SU;
+ }
+ } while (SU->isScheduled);
+
+ IsTopNode = true;
+ Top.removeReady(SU);
+
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
+ return SU;
+}
+
+/// Called after ScheduleDAGMI has scheduled an instruction and updated
+/// scheduled/remaining flags in the DAG nodes.
+void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
+ Top.bumpNode(SU);
+}
+
+ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
+ return new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C),
+ /*RemoveKillFlags=*/true);
+}
+
+//===----------------------------------------------------------------------===//
+// ILP Scheduler. Currently for experimental analysis of heuristics.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Order nodes by the ILP metric.
+struct ILPOrder {
+ const SchedDFSResult *DFSResult = nullptr;
+ const BitVector *ScheduledTrees = nullptr;
+ bool MaximizeILP;
+
+ ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
+
+ /// Apply a less-than relation on node priority.
+ ///
+ /// (Return true if A comes after B in the Q.)
+ bool operator()(const SUnit *A, const SUnit *B) const {
+ unsigned SchedTreeA = DFSResult->getSubtreeID(A);
+ unsigned SchedTreeB = DFSResult->getSubtreeID(B);
+ if (SchedTreeA != SchedTreeB) {
+ // Unscheduled trees have lower priority.
+ if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
+ return ScheduledTrees->test(SchedTreeB);
+
+      // Trees with shallower connections have lower priority.
+ if (DFSResult->getSubtreeLevel(SchedTreeA)
+ != DFSResult->getSubtreeLevel(SchedTreeB)) {
+ return DFSResult->getSubtreeLevel(SchedTreeA)
+ < DFSResult->getSubtreeLevel(SchedTreeB);
+ }
+ }
+ if (MaximizeILP)
+ return DFSResult->getILP(A) < DFSResult->getILP(B);
+ else
+ return DFSResult->getILP(A) > DFSResult->getILP(B);
+ }
+};
+
+/// Schedule based on the ILP metric.
+class ILPScheduler : public MachineSchedStrategy {
+ ScheduleDAGMILive *DAG = nullptr;
+ ILPOrder Cmp;
+
+ std::vector<SUnit*> ReadyQ;
+
+public:
+ ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}
+
+ void initialize(ScheduleDAGMI *dag) override {
+ assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
+ DAG = static_cast<ScheduleDAGMILive*>(dag);
+ DAG->computeDFSResult();
+ Cmp.DFSResult = DAG->getDFSResult();
+ Cmp.ScheduledTrees = &DAG->getScheduledTrees();
+ ReadyQ.clear();
+ }
+
+ void registerRoots() override {
+ // Restore the heap in ReadyQ with the updated DFS results.
+ std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ }
+
+ /// Implement MachineSchedStrategy interface.
+ /// -----------------------------------------
+
+ /// Callback to select the highest priority node from the ready Q.
+ SUnit *pickNode(bool &IsTopNode) override {
+ if (ReadyQ.empty()) return nullptr;
+ std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ SUnit *SU = ReadyQ.back();
+ ReadyQ.pop_back();
+ IsTopNode = false;
+ LLVM_DEBUG(dbgs() << "Pick node "
+ << "SU(" << SU->NodeNum << ") "
+ << " ILP: " << DAG->getDFSResult()->getILP(SU)
+ << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU)
+ << " @"
+ << DAG->getDFSResult()->getSubtreeLevel(
+ DAG->getDFSResult()->getSubtreeID(SU))
+ << '\n'
+ << "Scheduling " << *SU->getInstr());
+ return SU;
+ }
+
+ /// Scheduler callback to notify that a new subtree is scheduled.
+ void scheduleTree(unsigned SubtreeID) override {
+ std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ }
+
+ /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
+ /// DFSResults, and resort the priority Q.
+ void schedNode(SUnit *SU, bool IsTopNode) override {
+ assert(!IsTopNode && "SchedDFSResult needs bottom-up");
+ }
+
+ void releaseTopNode(SUnit *) override { /*only called for top roots*/ }
+
+ void releaseBottomNode(SUnit *SU) override {
+ ReadyQ.push_back(SU);
+ std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ }
+};
+
+} // end anonymous namespace
+
+static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(true));
+}
+static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(false));
+}
+
+static MachineSchedRegistry ILPMaxRegistry(
+ "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
+static MachineSchedRegistry ILPMinRegistry(
+ "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
+
+//===----------------------------------------------------------------------===//
+// Machine Instruction Shuffler for Correctness Testing
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+namespace {
+
+/// Apply a less-than relation on the node order, which corresponds to the
+/// instruction order prior to scheduling. IsReverse implements greater-than.
+template<bool IsReverse>
+struct SUnitOrder {
+ bool operator()(SUnit *A, SUnit *B) const {
+ if (IsReverse)
+ return A->NodeNum > B->NodeNum;
+ else
+ return A->NodeNum < B->NodeNum;
+ }
+};
+
+/// Reorder instructions as much as possible.
+class InstructionShuffler : public MachineSchedStrategy {
+ bool IsAlternating;
+ bool IsTopDown;
+
+ // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
+  // gives nodes with a higher number higher priority, causing the latest
+ // instructions to be scheduled first.
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
+ TopQ;
+
+ // When scheduling bottom-up, use greater-than as the queue priority.
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
+ BottomQ;
+
+public:
+ InstructionShuffler(bool alternate, bool topdown)
+ : IsAlternating(alternate), IsTopDown(topdown) {}
+
+ void initialize(ScheduleDAGMI*) override {
+ TopQ.clear();
+ BottomQ.clear();
+ }
+
+ /// Implement MachineSchedStrategy interface.
+ /// -----------------------------------------
+
+ SUnit *pickNode(bool &IsTopNode) override {
+ SUnit *SU;
+ if (IsTopDown) {
+ do {
+ if (TopQ.empty()) return nullptr;
+ SU = TopQ.top();
+ TopQ.pop();
+ } while (SU->isScheduled);
+ IsTopNode = true;
+ } else {
+ do {
+ if (BottomQ.empty()) return nullptr;
+ SU = BottomQ.top();
+ BottomQ.pop();
+ } while (SU->isScheduled);
+ IsTopNode = false;
+ }
+ if (IsAlternating)
+ IsTopDown = !IsTopDown;
+ return SU;
+ }
+
+ void schedNode(SUnit *SU, bool IsTopNode) override {}
+
+ void releaseTopNode(SUnit *SU) override {
+ TopQ.push(SU);
+ }
+ void releaseBottomNode(SUnit *SU) override {
+ BottomQ.push(SU);
+ }
+};
+
+} // end anonymous namespace
+
+static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
+ bool Alternate = !ForceTopDown && !ForceBottomUp;
+ bool TopDown = !ForceBottomUp;
+ assert((TopDown || !ForceTopDown) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+ return new ScheduleDAGMILive(
+ C, std::make_unique<InstructionShuffler>(Alternate, TopDown));
+}
+
+static MachineSchedRegistry ShufflerRegistry(
+ "shuffle", "Shuffle machine instructions alternating directions",
+ createInstructionShuffler);
+#endif // !NDEBUG
+
+//===----------------------------------------------------------------------===//
+// GraphWriter support for ScheduleDAGMILive.
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+namespace llvm {
+
+template<> struct GraphTraits<
+ ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+
+template<>
+struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const ScheduleDAG *G) {
+ return std::string(G->MF.getName());
+ }
+
+ static bool renderGraphFromBottomUp() {
+ return true;
+ }
+
+ static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
+ if (ViewMISchedCutoff == 0)
+ return false;
+ return (Node->Preds.size() > ViewMISchedCutoff
+ || Node->Succs.size() > ViewMISchedCutoff);
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node,
+ SUnitIterator EI,
+ const ScheduleDAG *Graph) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+ static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
+ std::string Str;
+ raw_string_ostream SS(Str);
+ const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
+ const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
+ static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
+ SS << "SU:" << SU->NodeNum;
+ if (DFS)
+ SS << " I:" << DFS->getNumInstrs(SU);
+ return SS.str();
+ }
+
+ static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
+ return G->getGraphNodeLabel(SU);
+ }
+
+ static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) {
+ std::string Str("shape=Mrecord");
+ const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
+ const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
+ static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
+ if (DFS) {
+ Str += ",style=filled,fillcolor=\"#";
+ Str += DOT::getColorString(DFS->getSubtreeID(N));
+ Str += '"';
+ }
+ return Str;
+ }
+};
+
+} // end namespace llvm
+#endif // NDEBUG
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
+#ifndef NDEBUG
+ ViewGraph(this, Name, false, Title);
+#else
+ errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// Out-of-line implementation with no arguments is handy for gdb.
+void ScheduleDAGMI::viewGraph() {
+ viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
+}
+
+/// Sort predicate for the intervals stored in an instance of
+/// ResourceSegments. Intervals are always disjoint (no intersection
+/// for any pair of intervals), so we can sort the totality of
+/// the intervals by looking only at the left boundary.
+static bool sortIntervals(const ResourceSegments::IntervalTy &A,
+ const ResourceSegments::IntervalTy &B) {
+ return A.first < B.first;
+}
+
+unsigned ResourceSegments::getFirstAvailableAt(
+ unsigned CurrCycle, unsigned StartAtCycle, unsigned Cycle,
+ std::function<ResourceSegments::IntervalTy(unsigned, unsigned, unsigned)>
+ IntervalBuilder) const {
+ assert(std::is_sorted(std::begin(_Intervals), std::end(_Intervals),
+ sortIntervals) &&
+ "Cannot execute on an un-sorted set of intervals.");
+ unsigned RetCycle = CurrCycle;
+ ResourceSegments::IntervalTy NewInterval =
+ IntervalBuilder(RetCycle, StartAtCycle, Cycle);
+ for (auto &Interval : _Intervals) {
+ if (!intersects(NewInterval, Interval))
+ continue;
+
+ // Move the interval right next to the top of the one it
+ // intersects.
+ assert(Interval.second > NewInterval.first &&
+ "Invalid intervals configuration.");
+ RetCycle += (unsigned)Interval.second - (unsigned)NewInterval.first;
+ NewInterval = IntervalBuilder(RetCycle, StartAtCycle, Cycle);
+ }
+ return RetCycle;
+}
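+// Worked example (hypothetical values): assume the builder maps cycle C to the
+// interval [C, C+3) and _Intervals already holds [2, 5). Starting from
+// CurrCycle = 3, the candidate [3, 6) intersects [2, 5), so RetCycle advances
+// by 5 - 3 = 2 and the candidate is rebuilt as [5, 8), which is disjoint from
+// the stored interval; the function therefore returns 5.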
+
+void ResourceSegments::add(ResourceSegments::IntervalTy A,
+ const unsigned CutOff) {
+ assert(A.first < A.second && "Cannot add empty resource usage");
+ assert(CutOff > 0 && "0-size interval history has no use.");
+ assert(all_of(_Intervals,
+ [&A](const ResourceSegments::IntervalTy &Interval) -> bool {
+ return !intersects(A, Interval);
+ }) &&
+ "A resource is being overwritten");
+ _Intervals.push_back(A);
+
+ sortAndMerge();
+
+ // Do not keep the full history of the intervals, just the
+ // latest #CutOff.
+ while (_Intervals.size() > CutOff)
+ _Intervals.pop_front();
+}
+
+bool ResourceSegments::intersects(ResourceSegments::IntervalTy A,
+ ResourceSegments::IntervalTy B) {
+ assert(A.first <= A.second && "Invalid interval");
+ assert(B.first <= B.second && "Invalid interval");
+
+ // Share one boundary.
+ if ((A.first == B.first) || (A.second == B.second))
+ return true;
+
+  // full intersect:        [    ***      )   B
+ // [***) A
+ if ((A.first > B.first) && (A.second < B.second))
+ return true;
+
+ // right intersect: [ ***) B
+ // [*** ) A
+ if ((A.first > B.first) && (A.first < B.second) && (A.second > B.second))
+ return true;
+
+ // left intersect: [*** ) B
+ // [ ***) A
+ if ((A.first < B.first) && (B.first < A.second) && (B.second > B.first))
+ return true;
+
+ return false;
+}
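+// Illustrative cases (hypothetical intervals): intersects([2,5), [4,7)) and
+// intersects([2,5), [2,3)) both return true (an overlap and a shared left
+// boundary, respectively), while intersects([2,5), [5,8)) returns false
+// because half-open intervals that merely touch are treated as disjoint.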
+
+void ResourceSegments::sortAndMerge() {
+ if (_Intervals.size() <= 1)
+ return;
+
+ // First sort the collection.
+ _Intervals.sort(sortIntervals);
+
+  // We can use std::next because the list has at least 2 elements.
+ auto next = std::next(std::begin(_Intervals));
+ auto E = std::end(_Intervals);
+ for (; next != E; ++next) {
+ if (std::prev(next)->second >= next->first) {
+ next->first = std::prev(next)->first;
+ _Intervals.erase(std::prev(next));
+ continue;
+ }
+ }
+}
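+// Worked example (hypothetical intervals): starting from [0, 3), [5, 7) and
+// [3, 4), sortAndMerge first sorts them to [0, 3), [3, 4), [5, 7); since the
+// first interval's right boundary (3) reaches the next one's left boundary,
+// the two are merged into [0, 4), leaving [0, 4), [5, 7).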
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp
new file mode 100644
index 000000000000..8da97dc7e742
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineSink.cpp
@@ -0,0 +1,1892 @@
+//===- MachineSink.cpp - Sinking for machine instructions -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for an LLVM-IR-level sinking pass. It is only designed to sink simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-sink"
+
+static cl::opt<bool>
+SplitEdges("machine-sink-split",
+ cl::desc("Split critical edges during machine sinking"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+UseBlockFreqInfo("machine-sink-bfi",
+ cl::desc("Use block frequency info to find successors to sink"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<unsigned> SplitEdgeProbabilityThreshold(
+ "machine-sink-split-probability-threshold",
+ cl::desc(
+ "Percentage threshold for splitting single-instruction critical edge. "
+ "If the branch threshold is higher than this threshold, we allow "
+ "speculative execution of up to 1 instruction to avoid branching to "
+ "splitted critical edge"),
+ cl::init(40), cl::Hidden);
+
+static cl::opt<unsigned> SinkLoadInstsPerBlockThreshold(
+ "machine-sink-load-instrs-threshold",
+ cl::desc("Do not try to find alias store for a load if there is a in-path "
+ "block whose instruction number is higher than this threshold."),
+ cl::init(2000), cl::Hidden);
+
+static cl::opt<unsigned> SinkLoadBlocksThreshold(
+ "machine-sink-load-blocks-threshold",
+ cl::desc("Do not try to find alias store for a load if the block number in "
+ "the straight line is higher than this threshold."),
+ cl::init(20), cl::Hidden);
+
+static cl::opt<bool>
+ SinkInstsIntoCycle("sink-insts-to-avoid-spills",
+ cl::desc("Sink instructions into cycles to avoid "
+ "register spills"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> SinkIntoCycleLimit(
+ "machine-sink-cycle-limit",
+ cl::desc("The maximum number of instructions considered for cycle sinking."),
+ cl::init(50), cl::Hidden);
+
+STATISTIC(NumSunk, "Number of machine instructions sunk");
+STATISTIC(NumCycleSunk, "Number of machine instructions sunk into a cycle");
+STATISTIC(NumSplit, "Number of critical edges split");
+STATISTIC(NumCoalesces, "Number of copies coalesced");
+STATISTIC(NumPostRACopySink, "Number of copies sunk after RA");
+
+namespace {
+
+ class MachineSinking : public MachineFunctionPass {
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr; // Machine register information
+ MachineDominatorTree *DT = nullptr; // Machine dominator tree
+ MachinePostDominatorTree *PDT = nullptr; // Machine post dominator tree
+ MachineCycleInfo *CI = nullptr;
+ MachineBlockFrequencyInfo *MBFI = nullptr;
+ const MachineBranchProbabilityInfo *MBPI = nullptr;
+ AliasAnalysis *AA = nullptr;
+ RegisterClassInfo RegClassInfo;
+
+ // Remember which edges have been considered for breaking.
+ SmallSet<std::pair<MachineBasicBlock*, MachineBasicBlock*>, 8>
+ CEBCandidates;
+ // Remember which edges we are about to split.
+ // Unlike CEBCandidates, the edges recorded here will actually
+ // be split.
+ SetVector<std::pair<MachineBasicBlock *, MachineBasicBlock *>> ToSplit;
+
+ DenseSet<Register> RegsToClearKillFlags;
+
+ using AllSuccsCache =
+ std::map<MachineBasicBlock *, SmallVector<MachineBasicBlock *, 4>>;
+
+ /// DBG_VALUE pointer and flag. The flag is true if this DBG_VALUE is
+ /// post-dominated by another DBG_VALUE of the same variable location.
+ /// This is necessary to detect sequences such as:
+ /// %0 = someinst
+ /// DBG_VALUE %0, !123, !DIExpression()
+ /// %1 = anotherinst
+ /// DBG_VALUE %1, !123, !DIExpression()
+ /// Where if %0 were to sink, the DBG_VALUE should not sink with it, as that
+ /// would re-order assignments.
+ using SeenDbgUser = PointerIntPair<MachineInstr *, 1>;
+
+ /// Record of DBG_VALUE uses of vregs in a block, so that we can identify
+ /// debug instructions to sink.
+ SmallDenseMap<unsigned, TinyPtrVector<SeenDbgUser>> SeenDbgUsers;
+
+ /// Record of debug variables that have had their locations set in the
+ /// current block.
+ DenseSet<DebugVariable> SeenDbgVars;
+
+ std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>, bool>
+ HasStoreCache;
+ std::map<std::pair<MachineBasicBlock *, MachineBasicBlock *>,
+ std::vector<MachineInstr *>>
+ StoreInstrCache;
+
+ /// Cached BB's register pressure.
+ std::map<MachineBasicBlock *, std::vector<unsigned>> CachedRegisterPressure;
+
+ public:
+ static char ID; // Pass identification
+
+ MachineSinking() : MachineFunctionPass(ID) {
+ initializeMachineSinkingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineCycleInfoWrapperPass>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addPreserved<MachineCycleInfoWrapperPass>();
+ AU.addPreserved<MachineLoopInfo>();
+ if (UseBlockFreqInfo)
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ }
+
+ void releaseMemory() override {
+ CEBCandidates.clear();
+ }
+
+ private:
+ bool ProcessBlock(MachineBasicBlock &MBB);
+ void ProcessDbgInst(MachineInstr &MI);
+ bool isWorthBreakingCriticalEdge(MachineInstr &MI,
+ MachineBasicBlock *From,
+ MachineBasicBlock *To);
+
+ bool hasStoreBetween(MachineBasicBlock *From, MachineBasicBlock *To,
+ MachineInstr &MI);
+
+ /// Postpone the splitting of the given critical
+ /// edge (\p From, \p To).
+ ///
+ /// We do not split the edges on the fly. Indeed, this invalidates
+ /// the dominance information and thus triggers a lot of updates
+ /// of that information underneath.
+ /// Instead, we postpone all the splits after each iteration of
+ /// the main loop. That way, the information is at least valid
+ /// for the lifetime of an iteration.
+ ///
+ /// \return True if the edge is marked as toSplit, false otherwise.
+ /// False can be returned if, for instance, this is not profitable.
+ bool PostponeSplitCriticalEdge(MachineInstr &MI,
+ MachineBasicBlock *From,
+ MachineBasicBlock *To,
+ bool BreakPHIEdge);
+ bool SinkInstruction(MachineInstr &MI, bool &SawStore,
+ AllSuccsCache &AllSuccessors);
+
+ /// If we sink a COPY inst, some debug users of its destination may no
+ /// longer be dominated by the COPY, and will eventually be dropped.
+ /// This is easily rectified by forwarding the non-dominated debug uses
+ /// to the copy source.
+ void SalvageUnsunkDebugUsersOfCopy(MachineInstr &,
+ MachineBasicBlock *TargetBlock);
+ bool AllUsesDominatedByBlock(Register Reg, MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB, bool &BreakPHIEdge,
+ bool &LocalUse) const;
+ MachineBasicBlock *FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
+ bool &BreakPHIEdge, AllSuccsCache &AllSuccessors);
+
+ void FindCycleSinkCandidates(MachineCycle *Cycle, MachineBasicBlock *BB,
+ SmallVectorImpl<MachineInstr *> &Candidates);
+ bool SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I);
+
+ bool isProfitableToSinkTo(Register Reg, MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *SuccToSinkTo,
+ AllSuccsCache &AllSuccessors);
+
+ bool PerformTrivialForwardCoalescing(MachineInstr &MI,
+ MachineBasicBlock *MBB);
+
+ SmallVector<MachineBasicBlock *, 4> &
+ GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
+ AllSuccsCache &AllSuccessors) const;
+
+ std::vector<unsigned> &getBBRegisterPressure(MachineBasicBlock &MBB);
+ };
+
+} // end anonymous namespace
+
+char MachineSinking::ID = 0;
+
+char &llvm::MachineSinkingID = MachineSinking::ID;
+
+INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE,
+ "Machine code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE,
+ "Machine code sinking", false, false)
+
+/// Return true if a target defined block prologue instruction interferes
+/// with a sink candidate.
+static bool blockPrologueInterferes(const MachineBasicBlock *BB,
+ MachineBasicBlock::const_iterator End,
+ const MachineInstr &MI,
+ const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII,
+ const MachineRegisterInfo *MRI) {
+ for (MachineBasicBlock::const_iterator PI = BB->getFirstNonPHI(); PI != End;
+ ++PI) {
+ // Only check target defined prologue instructions
+ if (!TII->isBasicBlockPrologue(*PI))
+ continue;
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
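+ // Interference: a prologue instruction modifies a register MI uses, or it
+ // reads or has a non-dead def of a register MI defines.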
+ if (MO.isUse()) {
+ if (Reg.isPhysical() && MRI && MRI->isConstantPhysReg(Reg))
+ continue;
+ if (PI->modifiesRegister(Reg, TRI))
+ return true;
+ } else {
+ if (PI->readsRegister(Reg, TRI))
+ return true;
+ // Check for interference with non-dead defs
+ auto *DefOp = PI->findRegisterDefOperand(Reg, false, true, TRI);
+ if (DefOp && !DefOp->isDead())
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI,
+ MachineBasicBlock *MBB) {
+ if (!MI.isCopy())
+ return false;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!SrcReg.isVirtual() || !DstReg.isVirtual() ||
+ !MRI->hasOneNonDBGUse(SrcReg))
+ return false;
+
+ const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg);
+ const TargetRegisterClass *DRC = MRI->getRegClass(DstReg);
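+ // Require matching register classes so replacing DstReg with SrcReg keeps
+ // every use legal.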
+ if (SRC != DRC)
+ return false;
+
+ MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+ if (DefMI->isCopyLike())
+ return false;
+ LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI);
+ LLVM_DEBUG(dbgs() << "*** to: " << MI);
+ MRI->replaceRegWith(DstReg, SrcReg);
+ MI.eraseFromParent();
+
+ // Conservatively, clear any kill flags, since it's possible that they are no
+ // longer correct.
+ MRI->clearKillFlags(SrcReg);
+
+ ++NumCoalesces;
+ return true;
+}
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified register
+/// occur in blocks dominated by the specified block. If any use is in the
+/// definition block, then return false since it is never legal to move def
+/// after uses.
+bool MachineSinking::AllUsesDominatedByBlock(Register Reg,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB,
+ bool &BreakPHIEdge,
+ bool &LocalUse) const {
+ assert(Reg.isVirtual() && "Only makes sense for vregs");
+
+ // Ignore debug uses because debug info doesn't affect the code.
+ if (MRI->use_nodbg_empty(Reg))
+ return true;
+
+ // BreakPHIEdge is true if all the uses are in the successor MBB being sunk
+ // into and they are all PHI nodes. In this case, machine-sink must break
+ // the critical edge first. e.g.
+ //
+ // %bb.1:
+ // Predecessors according to CFG: %bb.0
+ // ...
+ // %def = DEC64_32r %x, implicit-def dead %eflags
+ // ...
+ // JE_4 <%bb.37>, implicit %eflags
+ // Successors according to CFG: %bb.37 %bb.2
+ //
+ // %bb.2:
+ // %p = PHI %y, %bb.0, %def, %bb.1
+ if (all_of(MRI->use_nodbg_operands(Reg), [&](MachineOperand &MO) {
+ MachineInstr *UseInst = MO.getParent();
+ unsigned OpNo = MO.getOperandNo();
+ MachineBasicBlock *UseBlock = UseInst->getParent();
+ return UseBlock == MBB && UseInst->isPHI() &&
+ UseInst->getOperand(OpNo + 1).getMBB() == DefMBB;
+ })) {
+ BreakPHIEdge = true;
+ return true;
+ }
+
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ // Determine the block of the use.
+ MachineInstr *UseInst = MO.getParent();
+ unsigned OpNo = &MO - &UseInst->getOperand(0);
+ MachineBasicBlock *UseBlock = UseInst->getParent();
+ if (UseInst->isPHI()) {
+ // PHI nodes use the operand in the predecessor block, not the block with
+ // the PHI.
+ UseBlock = UseInst->getOperand(OpNo+1).getMBB();
+ } else if (UseBlock == DefMBB) {
+ LocalUse = true;
+ return false;
+ }
+
+ // Check that it dominates.
+ if (!DT->dominates(MBB, UseBlock))
+ return false;
+ }
+
+ return true;
+}
+
+/// Return true if this machine instruction loads from global offset table or
+/// constant pool.
+static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) {
+ assert(MI.mayLoad() && "Expected MI that loads!");
+
+ // If we lost memory operands, conservatively assume that the instruction
+ // reads from everything.
+ if (MI.memoperands_empty())
+ return true;
+
+ for (MachineMemOperand *MemOp : MI.memoperands())
+ if (const PseudoSourceValue *PSV = MemOp->getPseudoValue())
+ if (PSV->isGOT() || PSV->isConstantPool())
+ return true;
+
+ return false;
+}
+
+void MachineSinking::FindCycleSinkCandidates(
+ MachineCycle *Cycle, MachineBasicBlock *BB,
+ SmallVectorImpl<MachineInstr *> &Candidates) {
+ for (auto &MI : *BB) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Analysing candidate: " << MI);
+ if (!TII->shouldSink(MI)) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Instruction not a candidate for this "
+ "target\n");
+ continue;
+ }
+ if (!isCycleInvariant(Cycle, MI)) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Instruction is not cycle invariant\n");
+ continue;
+ }
+ bool DontMoveAcrossStore = true;
+ if (!MI.isSafeToMove(AA, DontMoveAcrossStore)) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Instruction not safe to move.\n");
+ continue;
+ }
+ if (MI.mayLoad() && !mayLoadFromGOTOrConstantPool(MI)) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Dont sink GOT or constant pool loads\n");
+ continue;
+ }
+ if (MI.isConvergent())
+ continue;
+
+ const MachineOperand &MO = MI.getOperand(0);
+ if (!MO.isReg() || !MO.getReg() || !MO.isDef())
+ continue;
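+ // Require a single definition of the result register so the value sunk
+ // into the cycle is unambiguous.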
+ if (!MRI->hasOneDef(MO.getReg()))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "CycleSink: Instruction added as candidate.\n");
+ Candidates.push_back(&MI);
+ }
+}
+
+bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "******** Machine Sinking ********\n");
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ DT = &getAnalysis<MachineDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
+ CI = &getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+ MBFI = UseBlockFreqInfo ? &getAnalysis<MachineBlockFrequencyInfo>() : nullptr;
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ RegClassInfo.runOnMachineFunction(MF);
+
+ bool EverMadeChange = false;
+
+ while (true) {
+ bool MadeChange = false;
+
+ // Process all basic blocks.
+ CEBCandidates.clear();
+ ToSplit.clear();
+ for (auto &MBB: MF)
+ MadeChange |= ProcessBlock(MBB);
+
+ // If we have anything we marked as toSplit, split it now.
+ for (const auto &Pair : ToSplit) {
+ auto NewSucc = Pair.first->SplitCriticalEdge(Pair.second, *this);
+ if (NewSucc != nullptr) {
+ LLVM_DEBUG(dbgs() << " *** Splitting critical edge: "
+ << printMBBReference(*Pair.first) << " -- "
+ << printMBBReference(*NewSucc) << " -- "
+ << printMBBReference(*Pair.second) << '\n');
+ if (MBFI)
+ MBFI->onEdgeSplit(*Pair.first, *NewSucc, *MBPI);
+
+ MadeChange = true;
+ ++NumSplit;
+ } else
+ LLVM_DEBUG(dbgs() << " *** Not legal to break critical edge\n");
+ }
+ // If this iteration over the code changed anything, keep iterating.
+ if (!MadeChange) break;
+ EverMadeChange = true;
+ }
+
+ if (SinkInstsIntoCycle) {
+ SmallVector<MachineCycle *, 8> Cycles(CI->toplevel_begin(),
+ CI->toplevel_end());
+ for (auto *Cycle : Cycles) {
+ MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Can't find preheader\n");
+ continue;
+ }
+ SmallVector<MachineInstr *, 8> Candidates;
+ FindCycleSinkCandidates(Cycle, Preheader, Candidates);
+
+ // Walk the candidates in reverse order so that we start with the use
+ // of a def-use chain, if there is any.
+ // TODO: Sort the candidates using a cost-model.
+ unsigned i = 0;
+ for (MachineInstr *I : llvm::reverse(Candidates)) {
+ if (i++ == SinkIntoCycleLimit) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Limit reached of instructions to "
+ "be analysed.");
+ break;
+ }
+
+ if (!SinkIntoCycle(Cycle, *I))
+ break;
+ EverMadeChange = true;
+ ++NumCycleSunk;
+ }
+ }
+ }
+
+ HasStoreCache.clear();
+ StoreInstrCache.clear();
+
+ // Now clear any kill flags for recorded registers.
+ for (auto I : RegsToClearKillFlags)
+ MRI->clearKillFlags(I);
+ RegsToClearKillFlags.clear();
+
+ return EverMadeChange;
+}
+
+bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
+ // Can't sink anything out of a block that has fewer than two successors.
+ if (MBB.succ_size() <= 1 || MBB.empty()) return false;
+
+ // Don't bother sinking code out of unreachable blocks. In addition to being
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable cycle there may be nowhere to stop.
+ if (!DT->isReachableFromEntry(&MBB)) return false;
+
+ bool MadeChange = false;
+
+ // Cache all successors, sorted by frequency info and cycle depth.
+ AllSuccsCache AllSuccessors;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ MachineBasicBlock::iterator I = MBB.end();
+ --I;
+ bool ProcessedBegin, SawStore = false;
+ do {
+ MachineInstr &MI = *I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == MBB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ if (MI.isDebugOrPseudoInstr()) {
+ if (MI.isDebugValue())
+ ProcessDbgInst(MI);
+ continue;
+ }
+
+ bool Joined = PerformTrivialForwardCoalescing(MI, &MBB);
+ if (Joined) {
+ MadeChange = true;
+ continue;
+ }
+
+ if (SinkInstruction(MI, SawStore, AllSuccessors)) {
+ ++NumSunk;
+ MadeChange = true;
+ }
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ SeenDbgUsers.clear();
+ SeenDbgVars.clear();
+ // Clear the cache so BB register pressure is recalculated after sinking from this BB.
+ CachedRegisterPressure.clear();
+
+ return MadeChange;
+}
+
+void MachineSinking::ProcessDbgInst(MachineInstr &MI) {
+ // When we see DBG_VALUEs for registers, record any vreg it reads, so that
+ // we know what to sink if the vreg def sinks.
+ assert(MI.isDebugValue() && "Expected DBG_VALUE for processing");
+
+ DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ bool SeenBefore = SeenDbgVars.contains(Var);
+
+ for (MachineOperand &MO : MI.debug_operands()) {
+ if (MO.isReg() && MO.getReg().isVirtual())
+ SeenDbgUsers[MO.getReg()].push_back(SeenDbgUser(&MI, SeenBefore));
+ }
+
+ // Record the variable for any DBG_VALUE, to avoid re-ordering any of them.
+ SeenDbgVars.insert(Var);
+}
+
+bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr &MI,
+ MachineBasicBlock *From,
+ MachineBasicBlock *To) {
+ // FIXME: Need much better heuristics.
+
+ // If the pass has already considered breaking this edge (during this pass
+ // through the function), then let's go ahead and break it. This means
+ // sinking multiple "cheap" instructions into the same block.
+ if (!CEBCandidates.insert(std::make_pair(From, To)).second)
+ return true;
+
+ if (!MI.isCopy() && !TII->isAsCheapAsAMove(MI))
+ return true;
+
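+ // Splitting is also worth it when the edge is taken rarely enough, i.e. its
+ // probability is at most SplitEdgeProbabilityThreshold percent.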
+ if (From->isSuccessor(To) && MBPI->getEdgeProbability(From, To) <=
+ BranchProbability(SplitEdgeProbabilityThreshold, 100))
+ return true;
+
+ // MI is cheap, we probably don't want to break the critical edge for it.
+ // However, if this would allow some definitions of its source operands
+ // to be sunk then it's probably worth it.
+ for (const MachineOperand &MO : MI.all_uses()) {
+ Register Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ // We don't move live definitions of physical registers,
+ // so sinking their uses won't enable any opportunities.
+ if (Reg.isPhysical())
+ continue;
+
+ // If this instruction is the only user of a virtual register,
+ // check if breaking the edge will enable sinking
+ // both this instruction and the defining instruction.
+ if (MRI->hasOneNonDBGUse(Reg)) {
+ // If the definition resides in the same MBB,
+ // claim it's likely we can sink these together.
+ // If the definition resides elsewhere, we aren't
+ // blocking it from being sunk so don't break the edge.
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (DefMI->getParent() == MI.getParent())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
+ MachineBasicBlock *FromBB,
+ MachineBasicBlock *ToBB,
+ bool BreakPHIEdge) {
+ if (!isWorthBreakingCriticalEdge(MI, FromBB, ToBB))
+ return false;
+
+ // Avoid breaking a back edge. From == To means a backedge for a single-BB cycle.
+ if (!SplitEdges || FromBB == ToBB)
+ return false;
+
+ MachineCycle *FromCycle = CI->getCycle(FromBB);
+ MachineCycle *ToCycle = CI->getCycle(ToBB);
+
+ // Check for backedges of more "complex" cycles.
+ if (FromCycle == ToCycle && FromCycle &&
+ (!FromCycle->isReducible() || FromCycle->getHeader() == ToBB))
+ return false;
+
+ // It's not always legal to break critical edges and sink the computation
+ // to the edge.
+ //
+ // %bb.1:
+ // v1024
+ // Beq %bb.3
+ // <fallthrough>
+ // %bb.2:
+ // ... no uses of v1024
+ // <fallthrough>
+ // %bb.3:
+ // ...
+ // = v1024
+ //
+ // If %bb.1 -> %bb.3 edge is broken and computation of v1024 is inserted:
+ //
+ // %bb.1:
+ // ...
+ // Bne %bb.2
+ // %bb.4:
+ // v1024 =
+ // B %bb.3
+ // %bb.2:
+ // ... no uses of v1024
+ // <fallthrough>
+ // %bb.3:
+ // ...
+ // = v1024
+ //
+ // This is incorrect since v1024 is not computed along the %bb.1->%bb.2->%bb.3
+ // flow. We need to ensure the new basic block where the computation is
+ // sunk to dominates all the uses.
+ // It's only legal to break critical edge and sink the computation to the
+ // new block if all the predecessors of "To", except for "From", are
+ // not dominated by "From". Given SSA property, this means these
+ // predecessors are dominated by "To".
+ //
+ // There is no need to do this check if all the uses are PHI nodes. PHI
+ // sources are only defined on the specific predecessor edges.
+ if (!BreakPHIEdge) {
+ for (MachineBasicBlock *Pred : ToBB->predecessors())
+ if (Pred != FromBB && !DT->dominates(ToBB, Pred))
+ return false;
+ }
+
+ ToSplit.insert(std::make_pair(FromBB, ToBB));
+
+ return true;
+}
+
+std::vector<unsigned> &
+MachineSinking::getBBRegisterPressure(MachineBasicBlock &MBB) {
+ // Currently, to save compile time, an MBB's register pressure will not change
+ // within one ProcessBlock iteration because of CachedRegisterPressure, even
+ // though the real register pressure changes after sinking any instruction into it.
+ // FIXME: need an accurate and cheap register pressure estimation model here.
+ auto RP = CachedRegisterPressure.find(&MBB);
+ if (RP != CachedRegisterPressure.end())
+ return RP->second;
+
+ RegionPressure Pressure;
+ RegPressureTracker RPTracker(Pressure);
+
+ // Initialize the register pressure tracker.
+ RPTracker.init(MBB.getParent(), &RegClassInfo, nullptr, &MBB, MBB.end(),
+ /*TrackLaneMasks*/ false, /*TrackUntiedDefs=*/true);
+
+ for (MachineBasicBlock::iterator MII = MBB.instr_end(),
+ MIE = MBB.instr_begin();
+ MII != MIE; --MII) {
+ MachineInstr &MI = *std::prev(MII);
+ if (MI.isDebugInstr() || MI.isPseudoProbe())
+ continue;
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, *MRI, false, false);
+ RPTracker.recedeSkipDebugValues();
+ assert(&*RPTracker.getPos() == &MI && "RPTracker sync error!");
+ RPTracker.recede(RegOpers);
+ }
+
+ RPTracker.closeRegion();
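+ // Cache the per-pressure-set maximums; the cache is invalidated after each
+ // ProcessBlock call.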
+ auto It = CachedRegisterPressure.insert(
+ std::make_pair(&MBB, RPTracker.getPressure().MaxSetPressure));
+ return It.first->second;
+}
+
+/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
+bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *SuccToSinkTo,
+ AllSuccsCache &AllSuccessors) {
+ assert (SuccToSinkTo && "Invalid SinkTo Candidate BB");
+
+ if (MBB == SuccToSinkTo)
+ return false;
+
+ // It is profitable if SuccToSinkTo does not post-dominate the current block.
+ if (!PDT->dominates(SuccToSinkTo, MBB))
+ return true;
+
+ // It is profitable to sink an instruction from a deeper cycle to a shallower
+ // cycle, even if the latter post-dominates the former (PR21115).
+ if (CI->getCycleDepth(MBB) > CI->getCycleDepth(SuccToSinkTo))
+ return true;
+
+ // Check if the only use in the post-dominated block is a PHI instruction.
+ bool NonPHIUse = false;
+ for (MachineInstr &UseInst : MRI->use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UseBlock = UseInst.getParent();
+ if (UseBlock == SuccToSinkTo && !UseInst.isPHI())
+ NonPHIUse = true;
+ }
+ if (!NonPHIUse)
+ return true;
+
+ // Even if SuccToSinkTo post-dominates MBB, sinking may still be profitable
+ // if MI can be sunk further into another block in the next round.
+ bool BreakPHIEdge = false;
+ // FIXME - If finding successor is compile time expensive then cache results.
+ if (MachineBasicBlock *MBB2 =
+ FindSuccToSinkTo(MI, SuccToSinkTo, BreakPHIEdge, AllSuccessors))
+ return isProfitableToSinkTo(Reg, MI, SuccToSinkTo, MBB2, AllSuccessors);
+
+ MachineCycle *MCycle = CI->getCycle(MBB);
+
+ // If the instruction is not inside a cycle, it is not profitable to sink MI to
+ // a post-dominating block SuccToSinkTo.
+ if (!MCycle)
+ return false;
+
+ auto isRegisterPressureSetExceedLimit = [&](const TargetRegisterClass *RC) {
+ unsigned Weight = TRI->getRegClassWeight(RC).RegWeight;
+ const int *PS = TRI->getRegClassPressureSets(RC);
+ // Get register pressure for block SuccToSinkTo.
+ std::vector<unsigned> BBRegisterPressure =
+ getBBRegisterPressure(*SuccToSinkTo);
+ for (; *PS != -1; PS++)
+ // check if any register pressure set exceeds limit in block SuccToSinkTo
+ // after sinking.
+ if (Weight + BBRegisterPressure[*PS] >=
+ TRI->getRegPressureSetLimit(*MBB->getParent(), *PS))
+ return true;
+ return false;
+ };
+
+ // If this instruction is inside a cycle and sinking it can shorten the live
+ // ranges of more registers, it is still profitable.
+ for (const MachineOperand &MO : MI.operands()) {
+ // Ignore non-register operands.
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ if (Reg.isPhysical()) {
+ // Don't handle non-constant and non-ignorable physical register uses.
+ if (MO.isUse() && !MRI->isConstantPhysReg(Reg) && !TII->isIgnorableUse(MO))
+ return false;
+ continue;
+ }
+
+ // Users for the defs are all dominated by SuccToSinkTo.
+ if (MO.isDef()) {
+ // This def register's live range is shortened after sinking.
+ bool LocalUse = false;
+ if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo, MBB, BreakPHIEdge,
+ LocalUse))
+ return false;
+ } else {
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI)
+ continue;
+ MachineCycle *Cycle = CI->getCycle(DefMI->getParent());
+ // DefMI is defined outside of the cycle. There should be no live range
+ // impact for this operand. Definition outside of the cycle means either:
+ // 1: the definition is outside of the cycle, or
+ // 2: the definition is in this cycle, but it is a PHI in the cycle header.
+ if (Cycle != MCycle || (DefMI->isPHI() && Cycle && Cycle->isReducible() &&
+ Cycle->getHeader() == DefMI->getParent()))
+ continue;
+ // The DefMI is defined inside the cycle.
+ // If sinking this operand makes some register pressure set exceed limit,
+ // it is not profitable.
+ if (isRegisterPressureSetExceedLimit(MRI->getRegClass(Reg))) {
+ LLVM_DEBUG(dbgs() << "register pressure exceed limit, not profitable.");
+ return false;
+ }
+ }
+ }
+
+ // If MI is in a cycle and all its operands are live across the whole cycle, or
+ // if sinking does not make any register pressure set exceed its limit, it is
+ // profitable to sink MI.
+ return true;
+}
+
+/// Get the sorted sequence of successors for this MachineBasicBlock, possibly
+/// computing it if it was not already cached.
+SmallVector<MachineBasicBlock *, 4> &
+MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
+ AllSuccsCache &AllSuccessors) const {
+ // Do we have the sorted successors in the cache?
+ auto Succs = AllSuccessors.find(MBB);
+ if (Succs != AllSuccessors.end())
+ return Succs->second;
+
+ SmallVector<MachineBasicBlock *, 4> AllSuccs(MBB->successors());
+
+ // Handle cases where sinking can happen but where the sink point isn't a
+ // successor. For example:
+ //
+ // x = computation
+ // if () {} else {}
+ // use x
+ //
+ for (MachineDomTreeNode *DTChild : DT->getNode(MBB)->children()) {
+ // DomTree children of MBB that have MBB as immediate dominator are added.
+ if (DTChild->getIDom()->getBlock() == MI.getParent() &&
+ // Skip MBBs already added to the AllSuccs vector above.
+ !MBB->isSuccessor(DTChild->getBlock()))
+ AllSuccs.push_back(DTChild->getBlock());
+ }
+
+ // Sort Successors according to their cycle depth or block frequency info.
+ llvm::stable_sort(
+ AllSuccs, [this](const MachineBasicBlock *L, const MachineBasicBlock *R) {
+ uint64_t LHSFreq = MBFI ? MBFI->getBlockFreq(L).getFrequency() : 0;
+ uint64_t RHSFreq = MBFI ? MBFI->getBlockFreq(R).getFrequency() : 0;
+ bool HasBlockFreq = LHSFreq != 0 && RHSFreq != 0;
+ return HasBlockFreq ? LHSFreq < RHSFreq
+ : CI->getCycleDepth(L) < CI->getCycleDepth(R);
+ });
+
+ auto it = AllSuccessors.insert(std::make_pair(MBB, AllSuccs));
+
+ return it.first->second;
+}
+
+/// FindSuccToSinkTo - Find a successor to sink this instruction to.
+MachineBasicBlock *
+MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
+ bool &BreakPHIEdge,
+ AllSuccsCache &AllSuccessors) {
+ assert (MBB && "Invalid MachineBasicBlock!");
+
+ // loop over all the operands of the specified instruction. If there is
+ // anything we can't handle, bail out.
+
+ // SuccToSinkTo - This is the successor to sink this instruction to, once we
+ // decide.
+ MachineBasicBlock *SuccToSinkTo = nullptr;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg()) continue; // Ignore non-register operands.
+
+ Register Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ if (Reg.isPhysical()) {
+ if (MO.isUse()) {
+ // If the physreg has no defs anywhere, it's just an ambient register
+ // and we can freely move its uses. Alternatively, if it's allocatable,
+ // it could get allocated to something with a def during allocation.
+ if (!MRI->isConstantPhysReg(Reg) && !TII->isIgnorableUse(MO))
+ return nullptr;
+ } else if (!MO.isDead()) {
+ // A def that isn't dead. We can't move it.
+ return nullptr;
+ }
+ } else {
+ // Virtual register uses are always safe to sink.
+ if (MO.isUse()) continue;
+
+ // If it's not safe to move defs of the register class, then abort.
+ if (!TII->isSafeToMoveRegClassDefs(MRI->getRegClass(Reg)))
+ return nullptr;
+
+ // Virtual register defs can only be sunk if all their uses are in blocks
+ // dominated by one of the successors.
+ if (SuccToSinkTo) {
+ // If a previous operand picked a block to sink to, then this operand
+ // must be sinkable to the same block.
+ bool LocalUse = false;
+ if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo, MBB,
+ BreakPHIEdge, LocalUse))
+ return nullptr;
+
+ continue;
+ }
+
+ // Otherwise, we should look at all the successors and decide which one
+ // we should sink to. If we have reliable block frequency information
+ // (frequency != 0) available, give successors with smaller frequencies
+ // higher priority, otherwise prioritize smaller cycle depths.
+ for (MachineBasicBlock *SuccBlock :
+ GetAllSortedSuccessors(MI, MBB, AllSuccessors)) {
+ bool LocalUse = false;
+ if (AllUsesDominatedByBlock(Reg, SuccBlock, MBB,
+ BreakPHIEdge, LocalUse)) {
+ SuccToSinkTo = SuccBlock;
+ break;
+ }
+ if (LocalUse)
+ // Def is used locally, it's never safe to move this def.
+ return nullptr;
+ }
+
+ // If we couldn't find a block to sink to, ignore this instruction.
+ if (!SuccToSinkTo)
+ return nullptr;
+ if (!isProfitableToSinkTo(Reg, MI, MBB, SuccToSinkTo, AllSuccessors))
+ return nullptr;
+ }
+ }
+
+ // It is not possible to sink an instruction into its own block. This can
+ // happen with cycles.
+ if (MBB == SuccToSinkTo)
+ return nullptr;
+
+ if (!SuccToSinkTo)
+ return nullptr;
+
+ // It's not safe to sink instructions to EH landing pad. Control flow into
+ // landing pad is implicitly defined.
+ if (SuccToSinkTo->isEHPad())
+ return nullptr;
+
+ // It ought to be okay to sink instructions into an INLINEASM_BR target, but
+ // only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
+ // the source block (which this code does not yet do). So for now, forbid
+ // doing so.
+ if (SuccToSinkTo->isInlineAsmBrIndirectTarget())
+ return nullptr;
+
+ MachineBasicBlock::const_iterator InsertPos =
+ SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
+ if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI))
+ return nullptr;
+
+ return SuccToSinkTo;
+}
+
+/// Return true if MI is likely to be usable as a memory operation by the
+/// implicit null check optimization.
+///
+/// This is a "best effort" heuristic, and should not be relied upon for
+/// correctness. This returning true does not guarantee that the implicit null
+/// check optimization is legal over MI, and this returning false does not
+/// guarantee MI cannot possibly be used to do a null check.
+static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ using MachineBranchPredicate = TargetInstrInfo::MachineBranchPredicate;
+
+ auto *MBB = MI.getParent();
+ if (MBB->pred_size() != 1)
+ return false;
+
+ auto *PredMBB = *MBB->pred_begin();
+ auto *PredBB = PredMBB->getBasicBlock();
+
+ // Frontends that don't use implicit null checks have no reason to emit
+ // branches with make.implicit metadata, and this function should always
+ // return false for them.
+ if (!PredBB ||
+ !PredBB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit))
+ return false;
+
+ const MachineOperand *BaseOp;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
+ return false;
+
+ if (!BaseOp->isReg())
+ return false;
+
+ if (!(MI.mayLoad() && !MI.isPredicable()))
+ return false;
+
+ MachineBranchPredicate MBP;
+ if (TII->analyzeBranchPredicate(*PredMBB, MBP, false))
+ return false;
+
+ return MBP.LHS.isReg() && MBP.RHS.isImm() && MBP.RHS.getImm() == 0 &&
+ (MBP.Predicate == MachineBranchPredicate::PRED_NE ||
+ MBP.Predicate == MachineBranchPredicate::PRED_EQ) &&
+ MBP.LHS.getReg() == BaseOp->getReg();
+}
+
+/// If the sunk instruction is a copy, try to forward the copy instead of
+/// leaving an 'undef' DBG_VALUE in the original location. Don't do this if
+/// there's any subregister weirdness involved. Returns true if copy
+/// propagation occurred.
+static bool attemptDebugCopyProp(MachineInstr &SinkInst, MachineInstr &DbgMI,
+ Register Reg) {
+ const MachineRegisterInfo &MRI = SinkInst.getMF()->getRegInfo();
+ const TargetInstrInfo &TII = *SinkInst.getMF()->getSubtarget().getInstrInfo();
+
+ // Copy DBG_VALUE operand and set the original to undef. We then check to
+ // see whether this is something that can be copy-forwarded. If it isn't,
+ // continue around the loop.
+
+ const MachineOperand *SrcMO = nullptr, *DstMO = nullptr;
+ auto CopyOperands = TII.isCopyInstr(SinkInst);
+ if (!CopyOperands)
+ return false;
+ SrcMO = CopyOperands->Source;
+ DstMO = CopyOperands->Destination;
+
+ // Check validity of forwarding this copy.
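+ // With no virtual registers left in the function, we must be running after
+ // register allocation.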
+ bool PostRA = MRI.getNumVirtRegs() == 0;
+
+ // Trying to forward between physical and virtual registers is too hard.
+ if (Reg.isVirtual() != SrcMO->getReg().isVirtual())
+ return false;
+
+ // Only try virtual register copy-forwarding before regalloc, and physical
+ // register copy-forwarding after regalloc.
+ bool arePhysRegs = !Reg.isVirtual();
+ if (arePhysRegs != PostRA)
+ return false;
+
+ // Pre-regalloc, only forward if all subregisters agree (or there are no
+ // subregs at all). More analysis might recover some forwardable copies.
+ if (!PostRA)
+ for (auto &DbgMO : DbgMI.getDebugOperandsForReg(Reg))
+ if (DbgMO.getSubReg() != SrcMO->getSubReg() ||
+ DbgMO.getSubReg() != DstMO->getSubReg())
+ return false;
+
+ // Post-regalloc, we may be sinking a DBG_VALUE of a sub or super-register
+ // of this copy. Only forward the copy if the DBG_VALUE operand exactly
+ // matches the copy destination.
+ if (PostRA && Reg != DstMO->getReg())
+ return false;
+
+ for (auto &DbgMO : DbgMI.getDebugOperandsForReg(Reg)) {
+ DbgMO.setReg(SrcMO->getReg());
+ DbgMO.setSubReg(SrcMO->getSubReg());
+ }
+ return true;
+}
+
+using MIRegs = std::pair<MachineInstr *, SmallVector<unsigned, 2>>;
+/// Sink an instruction and its associated debug instructions.
+static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
+ MachineBasicBlock::iterator InsertPos,
+ ArrayRef<MIRegs> DbgValuesToSink) {
+ // If we cannot find a location to use (merge with), then we erase the debug
+ // location to prevent debug-info driven tools from potentially reporting
+ // wrong location information.
+ if (!SuccToSinkTo.empty() && InsertPos != SuccToSinkTo.end())
+ MI.setDebugLoc(DILocation::getMergedLocation(MI.getDebugLoc(),
+ InsertPos->getDebugLoc()));
+ else
+ MI.setDebugLoc(DebugLoc());
+
+ // Move the instruction.
+ MachineBasicBlock *ParentBlock = MI.getParent();
+ SuccToSinkTo.splice(InsertPos, ParentBlock, MI,
+ ++MachineBasicBlock::iterator(MI));
+
+ // Sink a copy of debug users to the insert position. Mark the original
+ // DBG_VALUE location as 'undef', indicating that any earlier variable
+ // location should be terminated as we've optimised away the value at this
+ // point.
+ for (const auto &DbgValueToSink : DbgValuesToSink) {
+ MachineInstr *DbgMI = DbgValueToSink.first;
+ MachineInstr *NewDbgMI = DbgMI->getMF()->CloneMachineInstr(DbgMI);
+ SuccToSinkTo.insert(InsertPos, NewDbgMI);
+
+ bool PropagatedAllSunkOps = true;
+ for (unsigned Reg : DbgValueToSink.second) {
+ if (DbgMI->hasDebugOperandForReg(Reg)) {
+ if (!attemptDebugCopyProp(MI, *DbgMI, Reg)) {
+ PropagatedAllSunkOps = false;
+ break;
+ }
+ }
+ }
+ if (!PropagatedAllSunkOps)
+ DbgMI->setDebugValueUndef();
+ }
+}
+
+/// hasStoreBetween - check if there is a store between straight line blocks From
+/// and To.
+bool MachineSinking::hasStoreBetween(MachineBasicBlock *From,
+ MachineBasicBlock *To, MachineInstr &MI) {
+ // Make sure From and To are in straight line which means From dominates To
+ // and To post dominates From.
+ if (!DT->dominates(From, To) || !PDT->dominates(To, From))
+ return true;
+
+ auto BlockPair = std::make_pair(From, To);
+
+ // Has this pair of blocks been queried before, with a definite cached
+ // result?
+ if (HasStoreCache.find(BlockPair) != HasStoreCache.end())
+ return HasStoreCache[BlockPair];
+
+ if (StoreInstrCache.find(BlockPair) != StoreInstrCache.end())
+ return llvm::any_of(StoreInstrCache[BlockPair], [&](MachineInstr *I) {
+ return I->mayAlias(AA, MI, false);
+ });
+
+ bool SawStore = false;
+ bool HasAliasedStore = false;
+ DenseSet<MachineBasicBlock *> HandledBlocks;
+ DenseSet<MachineBasicBlock *> HandledDomBlocks;
+ // Go through all reachable blocks from From.
+ for (MachineBasicBlock *BB : depth_first(From)) {
+ // We insert the instruction at the start of block To, so no need to worry
+ // about stores inside To.
+ // Stores in block From have already been considered on entry to
+ // SinkInstruction.
+ if (BB == To || BB == From)
+ continue;
+
+ // We already handled this BB in a previous iteration.
+ if (HandledBlocks.count(BB))
+ continue;
+
+ HandledBlocks.insert(BB);
+ // If To post-dominates BB, BB must lie on a path from block From to To.
+ if (PDT->dominates(To, BB)) {
+ if (!HandledDomBlocks.count(BB))
+ HandledDomBlocks.insert(BB);
+
+ // If this BB is too big, or there are too many blocks on the straight-line
+ // path between From and To, stop searching to save compile time.
+ if (BB->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold) ||
+ HandledDomBlocks.size() > SinkLoadBlocksThreshold) {
+ for (auto *DomBB : HandledDomBlocks) {
+ if (DomBB != BB && DT->dominates(DomBB, BB))
+ HasStoreCache[std::make_pair(DomBB, To)] = true;
+ else if(DomBB != BB && DT->dominates(BB, DomBB))
+ HasStoreCache[std::make_pair(From, DomBB)] = true;
+ }
+ HasStoreCache[BlockPair] = true;
+ return true;
+ }
+
+ for (MachineInstr &I : *BB) {
+ // Treat as alias conservatively for a call or an ordered memory
+ // operation.
+ if (I.isCall() || I.hasOrderedMemoryRef()) {
+ for (auto *DomBB : HandledDomBlocks) {
+ if (DomBB != BB && DT->dominates(DomBB, BB))
+ HasStoreCache[std::make_pair(DomBB, To)] = true;
+ else if(DomBB != BB && DT->dominates(BB, DomBB))
+ HasStoreCache[std::make_pair(From, DomBB)] = true;
+ }
+ HasStoreCache[BlockPair] = true;
+ return true;
+ }
+
+ if (I.mayStore()) {
+ SawStore = true;
+ // We still have a chance to sink MI if none of the stores in
+ // between alias MI.
+ // Cache all store instructions, so that we don't need to go through
+ // all blocks reachable from From for the next load instruction.
+ if (I.mayAlias(AA, MI, false))
+ HasAliasedStore = true;
+ StoreInstrCache[BlockPair].push_back(&I);
+ }
+ }
+ }
+ }
+ // If there is no store at all, cache the result.
+ if (!SawStore)
+ HasStoreCache[BlockPair] = false;
+ return HasAliasedStore;
+}
+
+/// Sink instructions into cycles if profitable. This especially tries to
+/// prevent register spills caused by register pressure if there is little to no
+/// overhead moving instructions into cycles.
+bool MachineSinking::SinkIntoCycle(MachineCycle *Cycle, MachineInstr &I) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Finding sink block for: " << I);
+ MachineBasicBlock *Preheader = Cycle->getCyclePreheader();
+ assert(Preheader && "Cycle sink needs a preheader block");
+ MachineBasicBlock *SinkBlock = nullptr;
+ bool CanSink = true;
+ const MachineOperand &MO = I.getOperand(0);
+
+ for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Analysing use: " << MI);
+ if (!Cycle->contains(MI.getParent())) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Use not in cycle, can't sink.\n");
+ CanSink = false;
+ break;
+ }
+
+ // FIXME: Come up with a proper cost model that estimates whether sinking
+ // the instruction (and thus possibly executing it on every cycle
+ // iteration) is more expensive than a register.
+ // For now, assume that copies are cheap and thus almost always worth it.
+ if (!MI.isCopy()) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Use is not a copy\n");
+ CanSink = false;
+ break;
+ }
+ if (!SinkBlock) {
+ SinkBlock = MI.getParent();
+ LLVM_DEBUG(dbgs() << "CycleSink: Setting sink block to: "
+ << printMBBReference(*SinkBlock) << "\n");
+ continue;
+ }
+ SinkBlock = DT->findNearestCommonDominator(SinkBlock, MI.getParent());
+ if (!SinkBlock) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Can't find nearest dominator\n");
+ CanSink = false;
+ break;
+ }
+ LLVM_DEBUG(dbgs() << "CycleSink: Setting nearest common dom block: " <<
+ printMBBReference(*SinkBlock) << "\n");
+ }
+
+ if (!CanSink) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Can't sink instruction.\n");
+ return false;
+ }
+ if (!SinkBlock) {
+ LLVM_DEBUG(dbgs() << "CycleSink: Not sinking, can't find sink block.\n");
+ return false;
+ }
+ if (SinkBlock == Preheader) {
+ LLVM_DEBUG(
+ dbgs() << "CycleSink: Not sinking, sink block is the preheader\n");
+ return false;
+ }
+ if (SinkBlock->sizeWithoutDebugLargerThan(SinkLoadInstsPerBlockThreshold)) {
+ LLVM_DEBUG(
+ dbgs() << "CycleSink: Not Sinking, block too large to analyse.\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "CycleSink: Sinking instruction!\n");
+ SinkBlock->splice(SinkBlock->SkipPHIsAndLabels(SinkBlock->begin()), Preheader,
+ I);
+
+ // Conservatively clear any kill flags on uses of sunk instruction
+ for (MachineOperand &MO : I.operands()) {
+ if (MO.isReg() && MO.readsReg())
+ RegsToClearKillFlags.insert(MO.getReg());
+ }
+
+ // The instruction is moved from its basic block, so do not retain the
+ // debug information.
+ assert(!I.isDebugInstr() && "Should not sink debug inst");
+ I.setDebugLoc(DebugLoc());
+ return true;
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified machine
+/// instruction out of its current block into a successor.
+bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
+ AllSuccsCache &AllSuccessors) {
+ // Don't sink instructions that the target prefers not to sink.
+ if (!TII->shouldSink(MI))
+ return false;
+
+ // Check if it's safe to move the instruction.
+ if (!MI.isSafeToMove(AA, SawStore))
+ return false;
+
+ // Convergent operations may not be made control-dependent on additional
+ // values.
+ if (MI.isConvergent())
+ return false;
+
+ // Don't break implicit null checks. This is a performance heuristic, and not
+ // required for correctness.
+ if (SinkingPreventsImplicitNullCheck(MI, TII, TRI))
+ return false;
+
+ // FIXME: This should include support for sinking instructions within the
+ // block they are currently in to shorten the live ranges. We often get
+ // instructions sunk into the top of a large block, but it would be better to
+ // also sink them down before their first use in the block. This xform has to
+ // be careful not to *increase* register pressure though, e.g. sinking
+ // "x = y + z" down if it kills y and z would increase the live ranges of y
+ // and z and only shrink the live range of x.
+
+ bool BreakPHIEdge = false;
+ MachineBasicBlock *ParentBlock = MI.getParent();
+ MachineBasicBlock *SuccToSinkTo =
+ FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors);
+
+ // If there are no outputs, it must have side-effects.
+ if (!SuccToSinkTo)
+ return false;
+
+ // If the instruction to move defines a dead physical register which is live
+ // when leaving the basic block, don't move it because it could turn into a
+ // "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>)
+ for (const MachineOperand &MO : MI.all_defs()) {
+ Register Reg = MO.getReg();
+ if (Reg == 0 || !Reg.isPhysical())
+ continue;
+ if (SuccToSinkTo->isLiveIn(Reg))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccToSinkTo);
+
+ // If the block has multiple predecessors, this is a critical edge.
+ // Decide if we can sink along it or need to break the edge.
+ if (SuccToSinkTo->pred_size() > 1) {
+ // We cannot sink a load across a critical edge - there may be stores in
+ // other code paths.
+ bool TryBreak = false;
+ bool Store =
+ MI.mayLoad() ? hasStoreBetween(ParentBlock, SuccToSinkTo, MI) : true;
+ if (!MI.isSafeToMove(AA, Store)) {
+ LLVM_DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
+ TryBreak = true;
+ }
+
+ // We don't want to sink across a critical edge if we don't dominate the
+ // successor. We could be introducing calculations to new code paths.
+ if (!TryBreak && !DT->dominates(ParentBlock, SuccToSinkTo)) {
+ LLVM_DEBUG(dbgs() << " *** NOTE: Critical edge found\n");
+ TryBreak = true;
+ }
+
+ // Don't sink instructions into a cycle.
+ if (!TryBreak && CI->getCycle(SuccToSinkTo) &&
+ (!CI->getCycle(SuccToSinkTo)->isReducible() ||
+ CI->getCycle(SuccToSinkTo)->getHeader() == SuccToSinkTo)) {
+ LLVM_DEBUG(dbgs() << " *** NOTE: cycle header found\n");
+ TryBreak = true;
+ }
+
+ // Otherwise we are OK with sinking along a critical edge.
+ if (!TryBreak)
+ LLVM_DEBUG(dbgs() << "Sinking along critical edge.\n");
+ else {
+ // Mark this edge as to be split.
+ // If the edge can actually be split, the next iteration of the main loop
+ // will sink MI in the newly created block.
+ bool Status =
+ PostponeSplitCriticalEdge(MI, ParentBlock, SuccToSinkTo, BreakPHIEdge);
+ if (!Status)
+ LLVM_DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
+ "break critical edge\n");
+ // The instruction will not be sunk this time.
+ return false;
+ }
+ }
+
+ if (BreakPHIEdge) {
+ // BreakPHIEdge is true if all the uses are in the successor MBB being
+ // sunken into and they are all PHI nodes. In this case, machine-sink must
+ // break the critical edge first.
+ bool Status = PostponeSplitCriticalEdge(MI, ParentBlock,
+ SuccToSinkTo, BreakPHIEdge);
+ if (!Status)
+ LLVM_DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
+ "break critical edge\n");
+ // The instruction will not be sunk this time.
+ return false;
+ }
+
+ // Determine where to insert into. Skip phi nodes.
+ MachineBasicBlock::iterator InsertPos =
+ SuccToSinkTo->SkipPHIsAndLabels(SuccToSinkTo->begin());
+ if (blockPrologueInterferes(SuccToSinkTo, InsertPos, MI, TRI, TII, MRI)) {
+ LLVM_DEBUG(dbgs() << " *** Not sinking: prologue interference\n");
+ return false;
+ }
+
+ // Collect debug users of any vreg that this inst defines.
+ SmallVector<MIRegs, 4> DbgUsersToSink;
+ for (auto &MO : MI.all_defs()) {
+ if (!MO.getReg().isVirtual())
+ continue;
+ if (!SeenDbgUsers.count(MO.getReg()))
+ continue;
+
+ // Sink any users that don't pass any other DBG_VALUEs for this variable.
+ auto &Users = SeenDbgUsers[MO.getReg()];
+ for (auto &User : Users) {
+ MachineInstr *DbgMI = User.getPointer();
+ if (User.getInt()) {
+ // This DBG_VALUE would re-order assignments. If we can't copy-propagate
+ // it, it can't be recovered. Set it undef.
+ if (!attemptDebugCopyProp(MI, *DbgMI, MO.getReg()))
+ DbgMI->setDebugValueUndef();
+ } else {
+ DbgUsersToSink.push_back(
+ {DbgMI, SmallVector<unsigned, 2>(1, MO.getReg())});
+ }
+ }
+ }
+
+ // After sinking, some debug users may not be dominated any more. If possible,
+ // copy-propagate their operands. As it's expensive, don't do this if there's
+ // no debuginfo in the program.
+ if (MI.getMF()->getFunction().getSubprogram() && MI.isCopy())
+ SalvageUnsunkDebugUsersOfCopy(MI, SuccToSinkTo);
+
+ performSink(MI, *SuccToSinkTo, InsertPos, DbgUsersToSink);
+
+ // Conservatively, clear any kill flags, since it's possible that they are no
+ // longer correct.
+ // Note that we have to clear the kill flags for any register this instruction
+ // uses as we may sink over another instruction which currently kills the
+ // used registers.
+ for (MachineOperand &MO : MI.all_uses())
+ RegsToClearKillFlags.insert(MO.getReg()); // Remember to clear kill flags.
+
+ return true;
+}
+
+void MachineSinking::SalvageUnsunkDebugUsersOfCopy(
+ MachineInstr &MI, MachineBasicBlock *TargetBlock) {
+ assert(MI.isCopy());
+ assert(MI.getOperand(1).isReg());
+
+ // Enumerate all users of vreg operands that are def'd. Skip those that will
+ // be sunk. For the rest, if they are not dominated by the block we will sink
+ // MI into, propagate the copy source to them.
+ SmallVector<MachineInstr *, 4> DbgDefUsers;
+ SmallVector<Register, 4> DbgUseRegs;
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ for (auto &MO : MI.all_defs()) {
+ if (!MO.getReg().isVirtual())
+ continue;
+ DbgUseRegs.push_back(MO.getReg());
+ for (auto &User : MRI.use_instructions(MO.getReg())) {
+ if (!User.isDebugValue() || DT->dominates(TargetBlock, User.getParent()))
+ continue;
+
+ // If it is in the same block, it will either sink or be use-before-def.
+ if (User.getParent() == MI.getParent())
+ continue;
+
+ assert(User.hasDebugOperandForReg(MO.getReg()) &&
+ "DBG_VALUE user of vreg, but has no operand for it?");
+ DbgDefUsers.push_back(&User);
+ }
+ }
+
+ // Point the users of this copy that are no longer dominated, at the source
+ // of the copy.
+ for (auto *User : DbgDefUsers) {
+ for (auto &Reg : DbgUseRegs) {
+ for (auto &DbgOp : User->getDebugOperandsForReg(Reg)) {
+ DbgOp.setReg(MI.getOperand(1).getReg());
+ DbgOp.setSubReg(MI.getOperand(1).getSubReg());
+ }
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// This pass is not intended to be a replacement or a complete alternative
+// for the pre-ra machine sink pass. It is only designed to sink COPY
+// instructions which should be handled after RA.
+//
+// This pass sinks COPY instructions into a successor block, if the COPY is not
+// used in the current block and the COPY is live-in to a single successor
+// (i.e., doesn't require the COPY to be duplicated). This avoids executing the
+// copy on paths where its result isn't needed. This also exposes
+// additional opportunities for dead copy elimination and shrink wrapping.
+//
+// These copies were either not handled by or are inserted after the MachineSink
+// pass. As an example of the former case, the MachineSink pass cannot sink
+// COPY instructions with allocatable source registers; for AArch64 this type
+// of copy instruction is frequently used to move function parameters (PhyReg)
+// into virtual registers in the entry block.
+//
+// For the machine IR below, this pass will sink %w19 in the entry block into its
+// successor (%bb.1) because %w19 is only live-in in %bb.1.
+// %bb.0:
+// %wzr = SUBSWri %w1, 1
+// %w19 = COPY %w0
+// Bcc 11, %bb.2
+// %bb.1:
+// Live Ins: %w19
+// BL @fun
+// %w0 = ADDWrr %w0, %w19
+// RET %w0
+// %bb.2:
+// %w0 = COPY %wzr
+// RET %w0
+// As we sink %w19 (CSR in AArch64) into %bb.1, the shrink-wrapping pass will be
+// able to see %bb.0 as a candidate.
+//===----------------------------------------------------------------------===//
+namespace {
+
+class PostRAMachineSinking : public MachineFunctionPass {
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+ PostRAMachineSinking() : MachineFunctionPass(ID) {}
+ StringRef getPassName() const override { return "PostRA Machine Sink"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ /// Track which register units have been modified and used.
+ LiveRegUnits ModifiedRegUnits, UsedRegUnits;
+
+ /// Track DBG_VALUEs of (unmodified) register units. Each DBG_VALUE has an
+ /// entry in this map for each unit it touches. The DBG_VALUE's entry
+ /// consists of a pointer to the instruction itself, and a vector of registers
+ /// referred to by the instruction that overlap the key register unit.
+ DenseMap<unsigned, SmallVector<MIRegs, 2>> SeenDbgInstrs;
+
+ /// Sink COPY instructions that are unused in their own block close to their
+ /// uses in successors.
+ bool tryToSinkCopy(MachineBasicBlock &BB, MachineFunction &MF,
+ const TargetRegisterInfo *TRI, const TargetInstrInfo *TII);
+};
+} // namespace
+
+char PostRAMachineSinking::ID = 0;
+char &llvm::PostRAMachineSinkingID = PostRAMachineSinking::ID;
+
+INITIALIZE_PASS(PostRAMachineSinking, "postra-machine-sink",
+ "PostRA Machine Sink", false, false)
+
+static bool aliasWithRegsInLiveIn(MachineBasicBlock &MBB, unsigned Reg,
+ const TargetRegisterInfo *TRI) {
+ LiveRegUnits LiveInRegUnits(*TRI);
+ LiveInRegUnits.addLiveIns(MBB);
+ return !LiveInRegUnits.available(Reg);
+}
+
+static MachineBasicBlock *
+getSingleLiveInSuccBB(MachineBasicBlock &CurBB,
+ const SmallPtrSetImpl<MachineBasicBlock *> &SinkableBBs,
+ unsigned Reg, const TargetRegisterInfo *TRI) {
+ // Try to find a single sinkable successor in which Reg is live-in.
+ MachineBasicBlock *BB = nullptr;
+ for (auto *SI : SinkableBBs) {
+ if (aliasWithRegsInLiveIn(*SI, Reg, TRI)) {
+ // If BB is set here, Reg is live-in to at least two sinkable successors,
+ // so quit.
+ if (BB)
+ return nullptr;
+ BB = SI;
+ }
+ }
+ // Reg is not live-in to any sinkable successors.
+ if (!BB)
+ return nullptr;
+
+ // Check if any register aliased with Reg is live-in to other successors.
+ for (auto *SI : CurBB.successors()) {
+ if (!SinkableBBs.count(SI) && aliasWithRegsInLiveIn(*SI, Reg, TRI))
+ return nullptr;
+ }
+ return BB;
+}
+
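+/// Return the single sinkable successor into which every register in
+/// DefedRegsInCopy is live-in, or nullptr if the registers do not agree on a
+/// single successor.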
+static MachineBasicBlock *
+getSingleLiveInSuccBB(MachineBasicBlock &CurBB,
+ const SmallPtrSetImpl<MachineBasicBlock *> &SinkableBBs,
+ ArrayRef<unsigned> DefedRegsInCopy,
+ const TargetRegisterInfo *TRI) {
+ MachineBasicBlock *SingleBB = nullptr;
+ for (auto DefReg : DefedRegsInCopy) {
+ MachineBasicBlock *BB =
+ getSingleLiveInSuccBB(CurBB, SinkableBBs, DefReg, TRI);
+ if (!BB || (SingleBB && SingleBB != BB))
+ return nullptr;
+ SingleBB = BB;
+ }
+ return SingleBB;
+}
+
+static void clearKillFlags(MachineInstr *MI, MachineBasicBlock &CurBB,
+ SmallVectorImpl<unsigned> &UsedOpsInCopy,
+ LiveRegUnits &UsedRegUnits,
+ const TargetRegisterInfo *TRI) {
+ for (auto U : UsedOpsInCopy) {
+ MachineOperand &MO = MI->getOperand(U);
+ Register SrcReg = MO.getReg();
+ if (!UsedRegUnits.available(SrcReg)) {
+ MachineBasicBlock::iterator NI = std::next(MI->getIterator());
+ for (MachineInstr &UI : make_range(NI, CurBB.end())) {
+ if (UI.killsRegister(SrcReg, TRI)) {
+ UI.clearRegisterKills(SrcReg, TRI);
+ MO.setIsKill(true);
+ break;
+ }
+ }
+ }
+ }
+}
+
+static void updateLiveIn(MachineInstr *MI, MachineBasicBlock *SuccBB,
+ SmallVectorImpl<unsigned> &UsedOpsInCopy,
+ SmallVectorImpl<unsigned> &DefedRegsInCopy) {
+ MachineFunction &MF = *SuccBB->getParent();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
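+ // The sunk COPY now defines these registers inside SuccBB, so they (and
+ // their sub-registers) are no longer live-in.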
+ for (unsigned DefReg : DefedRegsInCopy)
+ for (MCPhysReg S : TRI->subregs_inclusive(DefReg))
+ SuccBB->removeLiveIn(S);
+ for (auto U : UsedOpsInCopy) {
+ Register SrcReg = MI->getOperand(U).getReg();
+ LaneBitmask Mask;
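+ // Collect the lane mask of SrcReg covered by its register units; fall back
+ // to all lanes if nothing is recorded.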
+ for (MCRegUnitMaskIterator S(SrcReg, TRI); S.isValid(); ++S) {
+ Mask |= (*S).second;
+ }
+ SuccBB->addLiveIn(SrcReg, Mask.any() ? Mask : LaneBitmask::getAll());
+ }
+ SuccBB->sortUniqueLiveIns();
+}
+
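+/// Return true if sinking MI below the instructions already scanned (recorded
+/// in ModifiedRegUnits/UsedRegUnits) would violate a register dependency: a
+/// def of MI that is modified or used later in the block, or a use of MI that
+/// is redefined later in the block. Also records MI's use operand indexes in
+/// UsedOpsInCopy and its defined registers in DefedRegsInCopy.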
+static bool hasRegisterDependency(MachineInstr *MI,
+ SmallVectorImpl<unsigned> &UsedOpsInCopy,
+ SmallVectorImpl<unsigned> &DefedRegsInCopy,
+ LiveRegUnits &ModifiedRegUnits,
+ LiveRegUnits &UsedRegUnits) {
+ bool HasRegDependency = false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef()) {
+ if (!ModifiedRegUnits.available(Reg) || !UsedRegUnits.available(Reg)) {
+ HasRegDependency = true;
+ break;
+ }
+ DefedRegsInCopy.push_back(Reg);
+
+ // FIXME: instead of isUse(), readsReg() would be a better fix here.
+ // For example, we could ignore modifications of a reg with undef. However,
+ // it's not perfectly clear if skipping the internal read is safe in all
+ // other targets.
+ } else if (MO.isUse()) {
+ if (!ModifiedRegUnits.available(Reg)) {
+ HasRegDependency = true;
+ break;
+ }
+ UsedOpsInCopy.push_back(i);
+ }
+ }
+ return HasRegDependency;
+}
+
+bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
+ MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII) {
+ SmallPtrSet<MachineBasicBlock *, 2> SinkableBBs;
+ // FIXME: For now, we sink only to a successor which has a single predecessor
+ // so that we can directly sink COPY instructions to the successor without
+ // adding any new block or branch instruction.
+ for (MachineBasicBlock *SI : CurBB.successors())
+ if (!SI->livein_empty() && SI->pred_size() == 1)
+ SinkableBBs.insert(SI);
+
+ if (SinkableBBs.empty())
+ return false;
+
+ bool Changed = false;
+
+ // Track which registers have been modified and used between the end of the
+ // block and the current instruction.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+ SeenDbgInstrs.clear();
+
+ for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(CurBB))) {
+ // Track the operand index for use in Copy.
+ SmallVector<unsigned, 2> UsedOpsInCopy;
+ // Track the register number defed in Copy.
+ SmallVector<unsigned, 2> DefedRegsInCopy;
+
+ // We must sink this DBG_VALUE if its operand is sunk. To avoid searching
+ // for DBG_VALUEs later, record them when they're encountered.
+ if (MI.isDebugValue() && !MI.isDebugRef()) {
+ SmallDenseMap<MCRegister, SmallVector<unsigned, 2>, 4> MIUnits;
+ bool IsValid = true;
+ for (MachineOperand &MO : MI.debug_operands()) {
+ if (MO.isReg() && MO.getReg().isPhysical()) {
+ // Bail if we can already tell the sink would be rejected, rather
+ // than needlessly accumulating lots of DBG_VALUEs.
+ if (hasRegisterDependency(&MI, UsedOpsInCopy, DefedRegsInCopy,
+ ModifiedRegUnits, UsedRegUnits)) {
+ IsValid = false;
+ break;
+ }
+
+ // Record debug use of each reg unit.
+ for (MCRegUnit Unit : TRI->regunits(MO.getReg()))
+ MIUnits[Unit].push_back(MO.getReg());
+ }
+ }
+ if (IsValid) {
+ for (auto &RegOps : MIUnits)
+ SeenDbgInstrs[RegOps.first].emplace_back(&MI,
+ std::move(RegOps.second));
+ }
+ continue;
+ }
+
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+
+ // Do not move any instruction across a function call.
+ if (MI.isCall())
+ return false;
+
+ if (!MI.isCopy() || !MI.getOperand(0).isRenamable()) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ continue;
+ }
+
+ // Don't sink the COPY if it would violate a register dependency.
+ if (hasRegisterDependency(&MI, UsedOpsInCopy, DefedRegsInCopy,
+ ModifiedRegUnits, UsedRegUnits)) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ continue;
+ }
+ assert((!UsedOpsInCopy.empty() && !DefedRegsInCopy.empty()) &&
+ "Unexpected SrcReg or DefReg");
+ MachineBasicBlock *SuccBB =
+ getSingleLiveInSuccBB(CurBB, SinkableBBs, DefedRegsInCopy, TRI);
+ // Don't sink if we cannot find a single sinkable successor in which Reg
+ // is live-in.
+ if (!SuccBB) {
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+ TRI);
+ continue;
+ }
+ assert((SuccBB->pred_size() == 1 && *SuccBB->pred_begin() == &CurBB) &&
+ "Unexpected predecessor");
+
+ // Collect DBG_VALUEs that must sink with this copy. We previously recorded
+ // which reg units each DBG_VALUE reads; if this instruction writes any of
+ // those units, the corresponding DBG_VALUEs must sink as well.
+ MapVector<MachineInstr *, MIRegs::second_type> DbgValsToSinkMap;
+ for (auto &MO : MI.all_defs()) {
+ for (MCRegUnit Unit : TRI->regunits(MO.getReg())) {
+ for (const auto &MIRegs : SeenDbgInstrs.lookup(Unit)) {
+ auto &Regs = DbgValsToSinkMap[MIRegs.first];
+ for (unsigned Reg : MIRegs.second)
+ Regs.push_back(Reg);
+ }
+ }
+ }
+ auto DbgValsToSink = DbgValsToSinkMap.takeVector();
+
+ LLVM_DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccBB);
+
+ MachineBasicBlock::iterator InsertPos =
+ SuccBB->SkipPHIsAndLabels(SuccBB->begin());
+ if (blockPrologueInterferes(SuccBB, InsertPos, MI, TRI, TII, nullptr)) {
+ LLVM_DEBUG(
+ dbgs() << " *** Not sinking: prologue interference\n");
+ continue;
+ }
+
+ // Clear the kill flag if SrcReg is killed between MI and the end of the
+ // block.
+ clearKillFlags(&MI, CurBB, UsedOpsInCopy, UsedRegUnits, TRI);
+ performSink(MI, *SuccBB, InsertPos, DbgValsToSink);
+ updateLiveIn(&MI, SuccBB, UsedOpsInCopy, DefedRegsInCopy);
+
+ Changed = true;
+ ++NumPostRACopySink;
+ }
+ return Changed;
+}
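+
+// Editorial sketch (not part of the upstream pass): the core selection rule
+// above is "sink a COPY only into the unique sinkable successor where its
+// defined register is live-in". The toy code below illustrates just that
+// rule with hypothetical names (ToyBlock, pickSinkSucc); the real pass also
+// checks aliasing register units, register dependencies inside the block,
+// calls, and prologue interference before sinking.
+//
+// \code
+//   #include <optional>
+//   #include <set>
+//   #include <vector>
+//
+//   struct ToyBlock { std::set<unsigned> LiveIns; };
+//
+//   // Index of the unique successor with Reg live-in, or std::nullopt if
+//   // there are zero or several such successors.
+//   std::optional<size_t> pickSinkSucc(const std::vector<ToyBlock> &Succs,
+//                                      unsigned Reg) {
+//     std::optional<size_t> Found;
+//     for (size_t I = 0; I < Succs.size(); ++I) {
+//       if (!Succs[I].LiveIns.count(Reg))
+//         continue;
+//       if (Found)
+//         return std::nullopt; // live-in to more than one successor
+//       Found = I;
+//     }
+//     return Found;
+//   }
+// \endcode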
+
+bool PostRAMachineSinking::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ bool Changed = false;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ ModifiedRegUnits.init(*TRI);
+ UsedRegUnits.init(*TRI);
+ for (auto &BB : MF)
+ Changed |= tryToSinkCopy(BB, MF, TRI, TII);
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineSizeOpts.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineSizeOpts.cpp
new file mode 100644
index 000000000000..53bed7397d09
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineSizeOpts.cpp
@@ -0,0 +1,52 @@
+//===- MachineSizeOpts.cpp - code size optimization related code ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains shared helpers related to machine IR code size
+// optimization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+
+using namespace llvm;
+
+extern cl::opt<bool> EnablePGSO;
+extern cl::opt<bool> PGSOLargeWorkingSetSizeOnly;
+extern cl::opt<bool> ForcePGSO;
+extern cl::opt<int> PgsoCutoffInstrProf;
+extern cl::opt<int> PgsoCutoffSampleProf;
+
+bool llvm::shouldOptimizeForSize(const MachineFunction *MF,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI,
+ PGSOQueryType QueryType) {
+ return shouldFuncOptimizeForSizeImpl(MF, PSI, MBFI, QueryType);
+}
+
+bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI,
+ PGSOQueryType QueryType) {
+ assert(MBB);
+ return shouldOptimizeForSizeImpl(MBB, PSI, MBFI, QueryType);
+}
+
+bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB,
+ ProfileSummaryInfo *PSI,
+ MBFIWrapper *MBFIW,
+ PGSOQueryType QueryType) {
+ assert(MBB);
+ if (!PSI || !MBFIW)
+ return false;
+ BlockFrequency BlockFreq = MBFIW->getBlockFreq(MBB);
+ return shouldOptimizeForSizeImpl(BlockFreq, PSI, &MBFIW->getMBFI(),
+ QueryType);
+}
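+
+// Editorial note: a typical query from a machine pass looks roughly like the
+// sketch below. PSI and MBFI come from the pass's required analyses;
+// preferSmallerEncoding() is a hypothetical helper used only for
+// illustration.
+//
+// \code
+//   if (llvm::shouldOptimizeForSize(&MBB, PSI, MBFI, PGSOQueryType::IRPass))
+//     preferSmallerEncoding();
+// \endcode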
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineStableHash.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineStableHash.cpp
new file mode 100644
index 000000000000..9628e4c5aeb5
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -0,0 +1,236 @@
+//===- lib/CodeGen/MachineStableHash.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Stable hashing for MachineInstr and MachineOperand. Useful for getting a
+// hash across runs, modules, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineStableHash.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/ilist_iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundleIterator.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/StableHashing.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "machine-stable-hash"
+
+using namespace llvm;
+
+STATISTIC(StableHashBailingMachineBasicBlock,
+ "Number of encountered unsupported MachineOperands that were "
+ "MachineBasicBlocks while computing stable hashes");
+STATISTIC(StableHashBailingConstantPoolIndex,
+ "Number of encountered unsupported MachineOperands that were "
+ "ConstantPoolIndex while computing stable hashes");
+STATISTIC(StableHashBailingTargetIndexNoName,
+ "Number of encountered unsupported MachineOperands that were "
+ "TargetIndex with no name");
+STATISTIC(StableHashBailingGlobalAddress,
+ "Number of encountered unsupported MachineOperands that were "
+ "GlobalAddress while computing stable hashes");
+STATISTIC(StableHashBailingBlockAddress,
+ "Number of encountered unsupported MachineOperands that were "
+ "BlockAddress while computing stable hashes");
+STATISTIC(StableHashBailingMetadataUnsupported,
+ "Number of encountered unsupported MachineOperands that were "
+ "Metadata of an unsupported kind while computing stable hashes");
+
+stable_hash llvm::stableHashValue(const MachineOperand &MO) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ if (MO.getReg().isVirtual()) {
+ const MachineRegisterInfo &MRI = MO.getParent()->getMF()->getRegInfo();
+ SmallVector<unsigned> DefOpcodes;
+ for (auto &Def : MRI.def_instructions(MO.getReg()))
+ DefOpcodes.push_back(Def.getOpcode());
+ return hash_combine_range(DefOpcodes.begin(), DefOpcodes.end());
+ }
+
+ // Register operands don't have target flags.
+ return stable_hash_combine(MO.getType(), MO.getReg(), MO.getSubReg(),
+ MO.isDef());
+ case MachineOperand::MO_Immediate:
+ return stable_hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm());
+ case MachineOperand::MO_CImmediate:
+ case MachineOperand::MO_FPImmediate: {
+ auto Val = MO.isCImm() ? MO.getCImm()->getValue()
+ : MO.getFPImm()->getValueAPF().bitcastToAPInt();
+ auto ValHash =
+ stable_hash_combine_array(Val.getRawData(), Val.getNumWords());
+ return hash_combine(MO.getType(), MO.getTargetFlags(), ValHash);
+ }
+
+ case MachineOperand::MO_MachineBasicBlock:
+ StableHashBailingMachineBasicBlock++;
+ return 0;
+ case MachineOperand::MO_ConstantPoolIndex:
+ StableHashBailingConstantPoolIndex++;
+ return 0;
+ case MachineOperand::MO_BlockAddress:
+ StableHashBailingBlockAddress++;
+ return 0;
+ case MachineOperand::MO_Metadata:
+ StableHashBailingMetadataUnsupported++;
+ return 0;
+ case MachineOperand::MO_GlobalAddress:
+ StableHashBailingGlobalAddress++;
+ return 0;
+ case MachineOperand::MO_TargetIndex: {
+ if (const char *Name = MO.getTargetIndexName())
+ return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+ stable_hash_combine_string(Name),
+ MO.getOffset());
+ StableHashBailingTargetIndexNoName++;
+ return 0;
+ }
+
+ case MachineOperand::MO_FrameIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+ MO.getIndex());
+
+ case MachineOperand::MO_ExternalSymbol:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(),
+ stable_hash_combine_string(MO.getSymbolName()));
+
+ case MachineOperand::MO_RegisterMask:
+ case MachineOperand::MO_RegisterLiveOut: {
+ if (const MachineInstr *MI = MO.getParent()) {
+ if (const MachineBasicBlock *MBB = MI->getParent()) {
+ if (const MachineFunction *MF = MBB->getParent()) {
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ unsigned RegMaskSize =
+ MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ const uint32_t *RegMask = MO.getRegMask();
+ std::vector<llvm::stable_hash> RegMaskHashes(RegMask,
+ RegMask + RegMaskSize);
+ return hash_combine(MO.getType(), MO.getTargetFlags(),
+ stable_hash_combine_array(RegMaskHashes.data(),
+ RegMaskHashes.size()));
+ }
+ }
+ }
+
+ assert(0 && "MachineOperand not associated with any MachineFunction");
+ return hash_combine(MO.getType(), MO.getTargetFlags());
+ }
+
+ case MachineOperand::MO_ShuffleMask: {
+ std::vector<llvm::stable_hash> ShuffleMaskHashes;
+
+ llvm::transform(
+ MO.getShuffleMask(), std::back_inserter(ShuffleMaskHashes),
+ [](int S) -> llvm::stable_hash { return llvm::stable_hash(S); });
+
+ return hash_combine(MO.getType(), MO.getTargetFlags(),
+ stable_hash_combine_array(ShuffleMaskHashes.data(),
+ ShuffleMaskHashes.size()));
+ }
+ case MachineOperand::MO_MCSymbol: {
+ auto SymbolName = MO.getMCSymbol()->getName();
+ return hash_combine(MO.getType(), MO.getTargetFlags(),
+ stable_hash_combine_string(SymbolName));
+ }
+ case MachineOperand::MO_CFIIndex:
+ return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+ MO.getCFIIndex());
+ case MachineOperand::MO_IntrinsicID:
+ return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+ MO.getIntrinsicID());
+ case MachineOperand::MO_Predicate:
+ return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+ MO.getPredicate());
+ case MachineOperand::MO_DbgInstrRef:
+ return stable_hash_combine(MO.getType(), MO.getInstrRefInstrIndex(),
+ MO.getInstrRefOpIndex());
+ }
+ llvm_unreachable("Invalid machine operand type");
+}
+
+/// A stable hash value for machine instructions.
+/// Returns 0 if no stable hash could be computed.
+/// The hashing and equality testing functions ignore definitions so this is
+/// useful for CSE, etc.
+stable_hash llvm::stableHashValue(const MachineInstr &MI, bool HashVRegs,
+ bool HashConstantPoolIndices,
+ bool HashMemOperands) {
+ // Build up a buffer of hash code components.
+ SmallVector<stable_hash, 16> HashComponents;
+ HashComponents.reserve(MI.getNumOperands() + MI.getNumMemOperands() + 2);
+ HashComponents.push_back(MI.getOpcode());
+ HashComponents.push_back(MI.getFlags());
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!HashVRegs && MO.isReg() && MO.isDef() && MO.getReg().isVirtual())
+ continue; // Skip virtual register defs.
+
+ if (MO.isCPI()) {
+ HashComponents.push_back(stable_hash_combine(
+ MO.getType(), MO.getTargetFlags(), MO.getIndex()));
+ continue;
+ }
+
+ stable_hash StableHash = stableHashValue(MO);
+ if (!StableHash)
+ return 0;
+ HashComponents.push_back(StableHash);
+ }
+
+ for (const auto *Op : MI.memoperands()) {
+ if (!HashMemOperands)
+ break;
+ HashComponents.push_back(static_cast<unsigned>(Op->getSize()));
+ HashComponents.push_back(static_cast<unsigned>(Op->getFlags()));
+ HashComponents.push_back(static_cast<unsigned>(Op->getOffset()));
+ HashComponents.push_back(static_cast<unsigned>(Op->getSuccessOrdering()));
+ HashComponents.push_back(static_cast<unsigned>(Op->getAddrSpace()));
+ HashComponents.push_back(static_cast<unsigned>(Op->getSyncScopeID()));
+ HashComponents.push_back(static_cast<unsigned>(Op->getBaseAlign().value()));
+ HashComponents.push_back(static_cast<unsigned>(Op->getFailureOrdering()));
+ }
+
+ return stable_hash_combine_range(HashComponents.begin(),
+ HashComponents.end());
+}
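+
+// Editorial sketch: "stable" here means the value depends only on the hashed
+// components, never on pointers, seeds, or iteration order, so it can be
+// compared across runs, hosts, and modules. The real combining logic lives in
+// llvm/CodeGen/StableHashing.h; the standalone snippet below (an FNV-1a-style
+// word fold, names hypothetical) only illustrates the idea and is not the
+// exact algorithm used above.
+//
+// \code
+//   #include <cstdint>
+//   #include <vector>
+//
+//   uint64_t foldStable(const std::vector<uint64_t> &Parts) {
+//     uint64_t H = 0xcbf29ce484222325ULL; // FNV offset basis
+//     for (uint64_t P : Parts) {
+//       H ^= P;                // mix in the next component
+//       H *= 0x100000001b3ULL; // FNV prime
+//     }
+//     return H;
+//   }
+// \endcode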
+
+stable_hash llvm::stableHashValue(const MachineBasicBlock &MBB) {
+ SmallVector<stable_hash> HashComponents;
+ // TODO: Hash more stuff like block alignment and branch probabilities.
+ for (const auto &MI : MBB)
+ HashComponents.push_back(stableHashValue(MI));
+ return stable_hash_combine_range(HashComponents.begin(),
+ HashComponents.end());
+}
+
+stable_hash llvm::stableHashValue(const MachineFunction &MF) {
+ SmallVector<stable_hash> HashComponents;
+ // TODO: Hash lots more stuff like function alignment and stack objects.
+ for (const auto &MBB : MF)
+ HashComponents.push_back(stableHashValue(MBB));
+ return stable_hash_combine_range(HashComponents.begin(),
+ HashComponents.end());
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineStripDebug.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineStripDebug.cpp
new file mode 100644
index 000000000000..6128248a028e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineStripDebug.cpp
@@ -0,0 +1,108 @@
+//===- MachineStripDebug.cpp - Strip debug info ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This removes debug info from everything. It can be used to ensure
+/// tests can be debugified without affecting the output MIR.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Debugify.h"
+
+#define DEBUG_TYPE "mir-strip-debug"
+
+using namespace llvm;
+
+namespace {
+cl::opt<bool>
+ OnlyDebugifiedDefault("mir-strip-debugify-only",
+ cl::desc("Should mir-strip-debug only strip debug "
+ "info from debugified modules by default"),
+ cl::init(true));
+
+struct StripDebugMachineModule : public ModulePass {
+ bool runOnModule(Module &M) override {
+ if (OnlyDebugified) {
+ NamedMDNode *DebugifyMD = M.getNamedMetadata("llvm.debugify");
+ if (!DebugifyMD) {
+ LLVM_DEBUG(dbgs() << "Not stripping debug info"
+ " (debugify metadata not found)\n");
+ return false;
+ }
+ }
+
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+ bool Changed = false;
+ for (Function &F : M.functions()) {
+ MachineFunction *MaybeMF = MMI.getMachineFunction(F);
+ if (!MaybeMF)
+ continue;
+ MachineFunction &MF = *MaybeMF;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ if (MI.isDebugInstr()) {
+ // FIXME: We should remove all of them. However, AArch64 emits an
+ // invalid `DBG_VALUE $lr` with only one operand instead of
+ // the usual three and has a test that depends on its
+ // preservation. Preserve it for now.
+ if (MI.getNumOperands() > 1) {
+ LLVM_DEBUG(dbgs() << "Removing debug instruction " << MI);
+ MBB.erase(&MI);
+ Changed |= true;
+ continue;
+ }
+ }
+ if (MI.getDebugLoc()) {
+ LLVM_DEBUG(dbgs() << "Removing location " << MI);
+ MI.setDebugLoc(DebugLoc());
+ Changed |= true;
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "Keeping " << MI);
+ }
+ }
+ }
+
+ Changed |= stripDebugifyMetadata(M);
+
+ return Changed;
+ }
+
+ StripDebugMachineModule() : StripDebugMachineModule(OnlyDebugifiedDefault) {}
+ StripDebugMachineModule(bool OnlyDebugified)
+ : ModulePass(ID), OnlyDebugified(OnlyDebugified) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ static char ID; // Pass identification.
+
+protected:
+ bool OnlyDebugified;
+};
+char StripDebugMachineModule::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(StripDebugMachineModule, DEBUG_TYPE,
+ "Machine Strip Debug Module", false, false)
+INITIALIZE_PASS_END(StripDebugMachineModule, DEBUG_TYPE,
+ "Machine Strip Debug Module", false, false)
+
+ModulePass *llvm::createStripDebugMachineModulePass(bool OnlyDebugified) {
+ return new StripDebugMachineModule(OnlyDebugified);
+}
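+
+// Editorial sketch of creating the pass from C++; where it goes in the
+// pipeline (after machine functions exist in MachineModuleInfo) is what
+// matters, so treat this only as an illustration:
+//
+// \code
+//   llvm::legacy::PassManager PM;
+//   PM.add(llvm::createStripDebugMachineModulePass(/*OnlyDebugified=*/false));
+// \endcode
+//
+// With OnlyDebugified=true the pass is a no-op on modules that do not carry
+// the "llvm.debugify" named metadata.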
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineTraceMetrics.cpp
new file mode 100644
index 000000000000..4f66f2e672d1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -0,0 +1,1356 @@
+//===- lib/CodeGen/MachineTraceMetrics.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-trace-metrics"
+
+char MachineTraceMetrics::ID = 0;
+
+char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID;
+
+INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE,
+ "Machine Trace Metrics", false, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE,
+ "Machine Trace Metrics", false, true)
+
+MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) {
+ std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr);
+}
+
+void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ const TargetSubtargetInfo &ST = MF->getSubtarget();
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ SchedModel.init(&ST);
+ BlockInfo.resize(MF->getNumBlockIDs());
+ ProcResourceCycles.resize(MF->getNumBlockIDs() *
+ SchedModel.getNumProcResourceKinds());
+ return false;
+}
+
+void MachineTraceMetrics::releaseMemory() {
+ MF = nullptr;
+ BlockInfo.clear();
+ for (Ensemble *&E : Ensembles) {
+ delete E;
+ E = nullptr;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Fixed block information
+//===----------------------------------------------------------------------===//
+//
+// The number of instructions in a basic block and the CPU resources used by
+// those instructions don't depend on any given trace strategy.
+
+/// Compute the resource usage in basic block MBB.
+const MachineTraceMetrics::FixedBlockInfo*
+MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) {
+ assert(MBB && "No basic block");
+ FixedBlockInfo *FBI = &BlockInfo[MBB->getNumber()];
+ if (FBI->hasResources())
+ return FBI;
+
+ // Compute resource usage in the block.
+ FBI->HasCalls = false;
+ unsigned InstrCount = 0;
+
+ // Add up per-processor resource cycles as well.
+ unsigned PRKinds = SchedModel.getNumProcResourceKinds();
+ SmallVector<unsigned, 32> PRCycles(PRKinds);
+
+ for (const auto &MI : *MBB) {
+ if (MI.isTransient())
+ continue;
+ ++InstrCount;
+ if (MI.isCall())
+ FBI->HasCalls = true;
+
+ // Count processor resources used.
+ if (!SchedModel.hasInstrSchedModel())
+ continue;
+ const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(&MI);
+ if (!SC->isValid())
+ continue;
+
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel.getWriteProcResBegin(SC),
+ PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
+ assert(PI->ProcResourceIdx < PRKinds && "Bad processor resource kind");
+ PRCycles[PI->ProcResourceIdx] += PI->Cycles;
+ }
+ }
+ FBI->InstrCount = InstrCount;
+
+ // Scale the resource cycles so they are comparable.
+ unsigned PROffset = MBB->getNumber() * PRKinds;
+ for (unsigned K = 0; K != PRKinds; ++K)
+ ProcResourceCycles[PROffset + K] =
+ PRCycles[K] * SchedModel.getResourceFactor(K);
+
+ return FBI;
+}
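+
+// Editorial worked example (assuming the factor is the LCM of all unit counts
+// divided by this resource's unit count): with resource kinds providing 1, 2,
+// and 3 units, the LCM is 6, so one cycle on the 1-unit kind becomes 6 scaled
+// cycles, one cycle on the 2-unit kind becomes 3, and one cycle on the 3-unit
+// kind becomes 2. After scaling, the per-kind totals in ProcResourceCycles
+// are directly comparable.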
+
+ArrayRef<unsigned>
+MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const {
+ assert(BlockInfo[MBBNum].hasResources() &&
+ "getResources() must be called before getProcResourceCycles()");
+ unsigned PRKinds = SchedModel.getNumProcResourceKinds();
+ assert((MBBNum+1) * PRKinds <= ProcResourceCycles.size());
+ return ArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds);
+}
+
+//===----------------------------------------------------------------------===//
+// Ensemble utility functions
+//===----------------------------------------------------------------------===//
+
+MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
+ : MTM(*ct) {
+ BlockInfo.resize(MTM.BlockInfo.size());
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ ProcResourceDepths.resize(MTM.BlockInfo.size() * PRKinds);
+ ProcResourceHeights.resize(MTM.BlockInfo.size() * PRKinds);
+}
+
+// Virtual destructor serves as an anchor.
+MachineTraceMetrics::Ensemble::~Ensemble() = default;
+
+const MachineLoop*
+MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const {
+ return MTM.Loops->getLoopFor(MBB);
+}
+
+// Update resource-related information in the TraceBlockInfo for MBB.
+// Only update resources related to the trace above MBB.
+void MachineTraceMetrics::Ensemble::
+computeDepthResources(const MachineBasicBlock *MBB) {
+ TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ unsigned PROffset = MBB->getNumber() * PRKinds;
+
+ // Compute resources from trace above. The top block is simple.
+ if (!TBI->Pred) {
+ TBI->InstrDepth = 0;
+ TBI->Head = MBB->getNumber();
+ std::fill(ProcResourceDepths.begin() + PROffset,
+ ProcResourceDepths.begin() + PROffset + PRKinds, 0);
+ return;
+ }
+
+ // Compute from the block above. A post-order traversal ensures the
+ // predecessor is always computed first.
+ unsigned PredNum = TBI->Pred->getNumber();
+ TraceBlockInfo *PredTBI = &BlockInfo[PredNum];
+ assert(PredTBI->hasValidDepth() && "Trace above has not been computed yet");
+ const FixedBlockInfo *PredFBI = MTM.getResources(TBI->Pred);
+ TBI->InstrDepth = PredTBI->InstrDepth + PredFBI->InstrCount;
+ TBI->Head = PredTBI->Head;
+
+ // Compute per-resource depths.
+ ArrayRef<unsigned> PredPRDepths = getProcResourceDepths(PredNum);
+ ArrayRef<unsigned> PredPRCycles = MTM.getProcResourceCycles(PredNum);
+ for (unsigned K = 0; K != PRKinds; ++K)
+ ProcResourceDepths[PROffset + K] = PredPRDepths[K] + PredPRCycles[K];
+}
+
+// Update resource-related information in the TraceBlockInfo for MBB.
+// Only update resources related to the trace below MBB.
+void MachineTraceMetrics::Ensemble::
+computeHeightResources(const MachineBasicBlock *MBB) {
+ TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ unsigned PROffset = MBB->getNumber() * PRKinds;
+
+ // Compute resources for the current block.
+ TBI->InstrHeight = MTM.getResources(MBB)->InstrCount;
+ ArrayRef<unsigned> PRCycles = MTM.getProcResourceCycles(MBB->getNumber());
+
+ // The trace tail is done.
+ if (!TBI->Succ) {
+ TBI->Tail = MBB->getNumber();
+ llvm::copy(PRCycles, ProcResourceHeights.begin() + PROffset);
+ return;
+ }
+
+ // Compute from the block below. A post-order traversal ensures the
+ // successor is always computed first.
+ unsigned SuccNum = TBI->Succ->getNumber();
+ TraceBlockInfo *SuccTBI = &BlockInfo[SuccNum];
+ assert(SuccTBI->hasValidHeight() && "Trace below has not been computed yet");
+ TBI->InstrHeight += SuccTBI->InstrHeight;
+ TBI->Tail = SuccTBI->Tail;
+
+ // Compute per-resource heights.
+ ArrayRef<unsigned> SuccPRHeights = getProcResourceHeights(SuccNum);
+ for (unsigned K = 0; K != PRKinds; ++K)
+ ProcResourceHeights[PROffset + K] = SuccPRHeights[K] + PRCycles[K];
+}
+
+// Check if depth resources for MBB are valid and return the TBI.
+// Return NULL if the resources have been invalidated.
+const MachineTraceMetrics::TraceBlockInfo*
+MachineTraceMetrics::Ensemble::
+getDepthResources(const MachineBasicBlock *MBB) const {
+ const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ return TBI->hasValidDepth() ? TBI : nullptr;
+}
+
+// Check if height resources for MBB are valid and return the TBI.
+// Return NULL if the resources have been invalidated.
+const MachineTraceMetrics::TraceBlockInfo*
+MachineTraceMetrics::Ensemble::
+getHeightResources(const MachineBasicBlock *MBB) const {
+ const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ return TBI->hasValidHeight() ? TBI : nullptr;
+}
+
+/// Get an array of processor resource depths for MBB. Indexed by processor
+/// resource kind, this array contains the scaled processor resources consumed
+/// by all blocks preceding MBB in its trace. It does not include instructions
+/// in MBB.
+///
+/// Compare TraceBlockInfo::InstrDepth.
+ArrayRef<unsigned>
+MachineTraceMetrics::Ensemble::
+getProcResourceDepths(unsigned MBBNum) const {
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ assert((MBBNum+1) * PRKinds <= ProcResourceDepths.size());
+ return ArrayRef(ProcResourceDepths.data() + MBBNum * PRKinds, PRKinds);
+}
+
+/// Get an array of processor resource heights for MBB. Indexed by processor
+/// resource kind, this array contains the scaled processor resources consumed
+/// by this block and all blocks following it in its trace.
+///
+/// Compare TraceBlockInfo::InstrHeight.
+ArrayRef<unsigned>
+MachineTraceMetrics::Ensemble::
+getProcResourceHeights(unsigned MBBNum) const {
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ assert((MBBNum+1) * PRKinds <= ProcResourceHeights.size());
+ return ArrayRef(ProcResourceHeights.data() + MBBNum * PRKinds, PRKinds);
+}
+
+//===----------------------------------------------------------------------===//
+// Trace Selection Strategies
+//===----------------------------------------------------------------------===//
+//
+// A trace selection strategy is implemented as a sub-class of Ensemble. The
+// trace through a block B is computed by two DFS traversals of the CFG
+// starting from B. One upwards, and one downwards. During the upwards DFS,
+// pickTracePred() is called on the post-ordered blocks. During the downwards
+// DFS, pickTraceSucc() is called in a post-order.
+//
+
+// We never allow traces that leave loops, but we do allow traces to enter
+// nested loops. We also never allow traces to contain back-edges.
+//
+// This means that a loop header can never appear above the center block of a
+// trace, except as the trace head. Below the center block, loop exiting edges
+// are banned.
+//
+// Return true if an edge from the From loop to the To loop is leaving a loop.
+// Either To or From may be null.
+static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) {
+ return From && !From->contains(To);
+}
+
+// MinInstrCountEnsemble - Pick the trace that executes the least number of
+// instructions.
+namespace {
+
+class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
+ const char *getName() const override { return "MinInstr"; }
+ const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override;
+ const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) override;
+
+public:
+ MinInstrCountEnsemble(MachineTraceMetrics *mtm)
+ : MachineTraceMetrics::Ensemble(mtm) {}
+};
+
+/// Pick only the current basic block for the trace and do not choose any
+/// predecessors/successors.
+class LocalEnsemble : public MachineTraceMetrics::Ensemble {
+ const char *getName() const override { return "Local"; }
+ const MachineBasicBlock *pickTracePred(const MachineBasicBlock *) override {
+ return nullptr;
+ };
+ const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock *) override {
+ return nullptr;
+ };
+
+public:
+ LocalEnsemble(MachineTraceMetrics *MTM)
+ : MachineTraceMetrics::Ensemble(MTM) {}
+};
+} // end anonymous namespace
+
+// Select the preferred predecessor for MBB.
+const MachineBasicBlock*
+MinInstrCountEnsemble::pickTracePred(const MachineBasicBlock *MBB) {
+ if (MBB->pred_empty())
+ return nullptr;
+ const MachineLoop *CurLoop = getLoopFor(MBB);
+ // Don't leave loops, and never follow back-edges.
+ if (CurLoop && MBB == CurLoop->getHeader())
+ return nullptr;
+ unsigned CurCount = MTM.getResources(MBB)->InstrCount;
+ const MachineBasicBlock *Best = nullptr;
+ unsigned BestDepth = 0;
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ const MachineTraceMetrics::TraceBlockInfo *PredTBI =
+ getDepthResources(Pred);
+ // Ignore cycles that aren't natural loops.
+ if (!PredTBI)
+ continue;
+ // Pick the predecessor that would give this block the smallest InstrDepth.
+ unsigned Depth = PredTBI->InstrDepth + CurCount;
+ if (!Best || Depth < BestDepth) {
+ Best = Pred;
+ BestDepth = Depth;
+ }
+ }
+ return Best;
+}
+
+// Select the preferred successor for MBB.
+const MachineBasicBlock*
+MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) {
+ if (MBB->succ_empty())
+ return nullptr;
+ const MachineLoop *CurLoop = getLoopFor(MBB);
+ const MachineBasicBlock *Best = nullptr;
+ unsigned BestHeight = 0;
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ // Don't consider back-edges.
+ if (CurLoop && Succ == CurLoop->getHeader())
+ continue;
+ // Don't consider successors exiting CurLoop.
+ if (isExitingLoop(CurLoop, getLoopFor(Succ)))
+ continue;
+ const MachineTraceMetrics::TraceBlockInfo *SuccTBI =
+ getHeightResources(Succ);
+ // Ignore cycles that aren't natural loops.
+ if (!SuccTBI)
+ continue;
+ // Pick the successor that would give this block the smallest InstrHeight.
+ unsigned Height = SuccTBI->InstrHeight;
+ if (!Best || Height < BestHeight) {
+ Best = Succ;
+ BestHeight = Height;
+ }
+ }
+ return Best;
+}
+
+// Get an Ensemble sub-class for the requested trace strategy.
+MachineTraceMetrics::Ensemble *
+MachineTraceMetrics::getEnsemble(MachineTraceStrategy strategy) {
+ assert(strategy < MachineTraceStrategy::TS_NumStrategies &&
+ "Invalid trace strategy enum");
+ Ensemble *&E = Ensembles[static_cast<size_t>(strategy)];
+ if (E)
+ return E;
+
+ // Allocate new Ensemble on demand.
+ switch (strategy) {
+ case MachineTraceStrategy::TS_MinInstrCount:
+ return (E = new MinInstrCountEnsemble(this));
+ case MachineTraceStrategy::TS_Local:
+ return (E = new LocalEnsemble(this));
+ default: llvm_unreachable("Invalid trace strategy enum");
+ }
+}
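+
+// Editorial sketch of how a client pass typically consumes this analysis
+// (variable names are illustrative):
+//
+// \code
+//   MachineTraceMetrics *Traces = &getAnalysis<MachineTraceMetrics>();
+//   MachineTraceMetrics::Ensemble *MinInstr =
+//       Traces->getEnsemble(MachineTraceStrategy::TS_MinInstrCount);
+//   MachineTraceMetrics::Trace T = MinInstr->getTrace(MBB);
+//   unsigned CritPath = T.getCriticalPath();
+// \endcode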
+
+void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB)
+ << '\n');
+ BlockInfo[MBB->getNumber()].invalidate();
+ for (Ensemble *E : Ensembles)
+ if (E)
+ E->invalidate(MBB);
+}
+
+void MachineTraceMetrics::verifyAnalysis() const {
+ if (!MF)
+ return;
+#ifndef NDEBUG
+ assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size");
+ for (Ensemble *E : Ensembles)
+ if (E)
+ E->verify();
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Trace building
+//===----------------------------------------------------------------------===//
+//
+// Traces are built by two CFG traversals. To avoid recomputing too much, use a
+// set abstraction that confines the search to the current loop, and doesn't
+// revisit blocks.
+
+namespace {
+
+struct LoopBounds {
+ MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks;
+ SmallPtrSet<const MachineBasicBlock*, 8> Visited;
+ const MachineLoopInfo *Loops;
+ bool Downward = false;
+
+ LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
+ const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {}
+};
+
+} // end anonymous namespace
+
+// Specialize po_iterator_storage in order to prune the post-order traversal so
+// it is limited to the current loop and doesn't traverse the loop back edges.
+namespace llvm {
+
+template<>
+class po_iterator_storage<LoopBounds, true> {
+ LoopBounds &LB;
+
+public:
+ po_iterator_storage(LoopBounds &lb) : LB(lb) {}
+
+ void finishPostorder(const MachineBasicBlock*) {}
+
+ bool insertEdge(std::optional<const MachineBasicBlock *> From,
+ const MachineBasicBlock *To) {
+ // Skip already visited To blocks.
+ MachineTraceMetrics::TraceBlockInfo &TBI = LB.Blocks[To->getNumber()];
+ if (LB.Downward ? TBI.hasValidHeight() : TBI.hasValidDepth())
+ return false;
+ // From is null once when To is the trace center block.
+ if (From) {
+ if (const MachineLoop *FromLoop = LB.Loops->getLoopFor(*From)) {
+ // Don't follow backedges, don't leave FromLoop when going upwards.
+ if ((LB.Downward ? To : *From) == FromLoop->getHeader())
+ return false;
+ // Don't leave FromLoop.
+ if (isExitingLoop(FromLoop, LB.Loops->getLoopFor(To)))
+ return false;
+ }
+ }
+ // To is a new block. Mark the block as visited in case the CFG has cycles
+ // that MachineLoopInfo didn't recognize as a natural loop.
+ return LB.Visited.insert(To).second;
+ }
+};
+
+} // end namespace llvm
+
+/// Compute the trace through MBB.
+void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << "Computing " << getName() << " trace through "
+ << printMBBReference(*MBB) << '\n');
+ // Set up loop bounds for the backwards post-order traversal.
+ LoopBounds Bounds(BlockInfo, MTM.Loops);
+
+ // Run an upwards post-order search for the trace start.
+ Bounds.Downward = false;
+ Bounds.Visited.clear();
+ for (const auto *I : inverse_post_order_ext(MBB, Bounds)) {
+ LLVM_DEBUG(dbgs() << " pred for " << printMBBReference(*I) << ": ");
+ TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
+ // All the predecessors have been visited, pick the preferred one.
+ TBI.Pred = pickTracePred(I);
+ LLVM_DEBUG({
+ if (TBI.Pred)
+ dbgs() << printMBBReference(*TBI.Pred) << '\n';
+ else
+ dbgs() << "null\n";
+ });
+ // The trace leading to I is now known, compute the depth resources.
+ computeDepthResources(I);
+ }
+
+ // Run a downwards post-order search for the trace end.
+ Bounds.Downward = true;
+ Bounds.Visited.clear();
+ for (const auto *I : post_order_ext(MBB, Bounds)) {
+ LLVM_DEBUG(dbgs() << " succ for " << printMBBReference(*I) << ": ");
+ TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
+ // All the successors have been visited, pick the preferred one.
+ TBI.Succ = pickTraceSucc(I);
+ LLVM_DEBUG({
+ if (TBI.Succ)
+ dbgs() << printMBBReference(*TBI.Succ) << '\n';
+ else
+ dbgs() << "null\n";
+ });
+ // The trace leaving I is now known, compute the height resources.
+ computeHeightResources(I);
+ }
+}
+
+/// Invalidate traces through BadMBB.
+void
+MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
+ SmallVector<const MachineBasicBlock*, 16> WorkList;
+ TraceBlockInfo &BadTBI = BlockInfo[BadMBB->getNumber()];
+
+ // Invalidate height resources of blocks above MBB.
+ if (BadTBI.hasValidHeight()) {
+ BadTBI.invalidateHeight();
+ WorkList.push_back(BadMBB);
+ do {
+ const MachineBasicBlock *MBB = WorkList.pop_back_val();
+ LLVM_DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
+ << getName() << " height.\n");
+ // Find any MBB predecessors that have MBB as their preferred successor.
+ // They are the only ones that need to be invalidated.
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ TraceBlockInfo &TBI = BlockInfo[Pred->getNumber()];
+ if (!TBI.hasValidHeight())
+ continue;
+ if (TBI.Succ == MBB) {
+ TBI.invalidateHeight();
+ WorkList.push_back(Pred);
+ continue;
+ }
+ // Verify that TBI.Succ is actually a successor of Pred.
+ assert((!TBI.Succ || Pred->isSuccessor(TBI.Succ)) && "CFG changed");
+ }
+ } while (!WorkList.empty());
+ }
+
+ // Invalidate depth resources of blocks below MBB.
+ if (BadTBI.hasValidDepth()) {
+ BadTBI.invalidateDepth();
+ WorkList.push_back(BadMBB);
+ do {
+ const MachineBasicBlock *MBB = WorkList.pop_back_val();
+ LLVM_DEBUG(dbgs() << "Invalidate " << printMBBReference(*MBB) << ' '
+ << getName() << " depth.\n");
+ // Find any MBB successors that have MBB as their preferred predecessor.
+ // They are the only ones that need to be invalidated.
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ TraceBlockInfo &TBI = BlockInfo[Succ->getNumber()];
+ if (!TBI.hasValidDepth())
+ continue;
+ if (TBI.Pred == MBB) {
+ TBI.invalidateDepth();
+ WorkList.push_back(Succ);
+ continue;
+ }
+ // Verify that TBI.Pred is actually a predecessor of Succ.
+ assert((!TBI.Pred || Succ->isPredecessor(TBI.Pred)) && "CFG changed");
+ }
+ } while (!WorkList.empty());
+ }
+
+ // Clear any per-instruction data. We only have to do this for BadMBB itself
+ // because the instructions in that block may change. Other blocks may be
+ // invalidated, but their instructions will stay the same, so there is no
+ // need to erase the Cycle entries. They will be overwritten when we
+ // recompute.
+ for (const auto &I : *BadMBB)
+ Cycles.erase(&I);
+}
+
+void MachineTraceMetrics::Ensemble::verify() const {
+#ifndef NDEBUG
+ assert(BlockInfo.size() == MTM.MF->getNumBlockIDs() &&
+ "Outdated BlockInfo size");
+ for (unsigned Num = 0, e = BlockInfo.size(); Num != e; ++Num) {
+ const TraceBlockInfo &TBI = BlockInfo[Num];
+ if (TBI.hasValidDepth() && TBI.Pred) {
+ const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
+ assert(MBB->isPredecessor(TBI.Pred) && "CFG doesn't match trace");
+ assert(BlockInfo[TBI.Pred->getNumber()].hasValidDepth() &&
+ "Trace is broken, depth should have been invalidated.");
+ const MachineLoop *Loop = getLoopFor(MBB);
+ assert(!(Loop && MBB == Loop->getHeader()) && "Trace contains backedge");
+ }
+ if (TBI.hasValidHeight() && TBI.Succ) {
+ const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
+ assert(MBB->isSuccessor(TBI.Succ) && "CFG doesn't match trace");
+ assert(BlockInfo[TBI.Succ->getNumber()].hasValidHeight() &&
+ "Trace is broken, height should have been invalidated.");
+ const MachineLoop *Loop = getLoopFor(MBB);
+ const MachineLoop *SuccLoop = getLoopFor(TBI.Succ);
+ assert(!(Loop && Loop == SuccLoop && TBI.Succ == Loop->getHeader()) &&
+ "Trace contains backedge");
+ }
+ }
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Data Dependencies
+//===----------------------------------------------------------------------===//
+//
+// Compute the depth and height of each instruction based on data dependencies
+// and instruction latencies. These cycle numbers assume that the CPU can issue
+// an infinite number of instructions per cycle as long as their dependencies
+// are ready.
+
+// A data dependency is represented as a defining MI and operand numbers on the
+// defining and using MI.
+namespace {
+
+struct DataDep {
+ const MachineInstr *DefMI;
+ unsigned DefOp;
+ unsigned UseOp;
+
+ DataDep(const MachineInstr *DefMI, unsigned DefOp, unsigned UseOp)
+ : DefMI(DefMI), DefOp(DefOp), UseOp(UseOp) {}
+
+ /// Create a DataDep from an SSA form virtual register.
+ DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp)
+ : UseOp(UseOp) {
+ assert(Register::isVirtualRegister(VirtReg));
+ MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg);
+ assert(!DefI.atEnd() && "Register has no defs");
+ DefMI = DefI->getParent();
+ DefOp = DefI.getOperandNo();
+ assert((++DefI).atEnd() && "Register has multiple defs");
+ }
+};
+
+} // end anonymous namespace
+
+// Get the input data dependencies that must be ready before UseMI can issue.
+// Return true if UseMI has any physreg operands.
+static bool getDataDeps(const MachineInstr &UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ const MachineRegisterInfo *MRI) {
+ // Debug values should not be included in any calculations.
+ if (UseMI.isDebugInstr())
+ return false;
+
+ bool HasPhysRegs = false;
+ for (const MachineOperand &MO : UseMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (Reg.isPhysical()) {
+ HasPhysRegs = true;
+ continue;
+ }
+ // Collect virtual register reads.
+ if (MO.readsReg())
+ Deps.push_back(DataDep(MRI, Reg, MO.getOperandNo()));
+ }
+ return HasPhysRegs;
+}
+
+// Get the input data dependencies of a PHI instruction, using Pred as the
+// preferred predecessor.
+// This will add at most one dependency to Deps.
+static void getPHIDeps(const MachineInstr &UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ const MachineBasicBlock *Pred,
+ const MachineRegisterInfo *MRI) {
+ // No predecessor at the beginning of a trace. Ignore dependencies.
+ if (!Pred)
+ return;
+ assert(UseMI.isPHI() && UseMI.getNumOperands() % 2 && "Bad PHI");
+ for (unsigned i = 1; i != UseMI.getNumOperands(); i += 2) {
+ if (UseMI.getOperand(i + 1).getMBB() == Pred) {
+ Register Reg = UseMI.getOperand(i).getReg();
+ Deps.push_back(DataDep(MRI, Reg, i));
+ return;
+ }
+ }
+}
+
+// Identify physreg dependencies for UseMI, and update the live regunit
+// tracking set when scanning instructions downwards.
+static void updatePhysDepsDownwards(const MachineInstr *UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ SparseSet<LiveRegUnit> &RegUnits,
+ const TargetRegisterInfo *TRI) {
+ SmallVector<MCRegister, 8> Kills;
+ SmallVector<unsigned, 8> LiveDefOps;
+
+ for (const MachineOperand &MO : UseMI->operands()) {
+ if (!MO.isReg() || !MO.getReg().isPhysical())
+ continue;
+ MCRegister Reg = MO.getReg().asMCReg();
+ // Track live defs and kills for updating RegUnits.
+ if (MO.isDef()) {
+ if (MO.isDead())
+ Kills.push_back(Reg);
+ else
+ LiveDefOps.push_back(MO.getOperandNo());
+ } else if (MO.isKill())
+ Kills.push_back(Reg);
+ // Identify dependencies.
+ if (!MO.readsReg())
+ continue;
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ SparseSet<LiveRegUnit>::iterator I = RegUnits.find(Unit);
+ if (I == RegUnits.end())
+ continue;
+ Deps.push_back(DataDep(I->MI, I->Op, MO.getOperandNo()));
+ break;
+ }
+ }
+
+ // Update RegUnits to reflect live registers after UseMI.
+ // First kills.
+ for (MCRegister Kill : Kills)
+ for (MCRegUnit Unit : TRI->regunits(Kill))
+ RegUnits.erase(Unit);
+
+ // Second, live defs.
+ for (unsigned DefOp : LiveDefOps) {
+ for (MCRegUnit Unit :
+ TRI->regunits(UseMI->getOperand(DefOp).getReg().asMCReg())) {
+ LiveRegUnit &LRU = RegUnits[Unit];
+ LRU.MI = UseMI;
+ LRU.Op = DefOp;
+ }
+ }
+}
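+
+// Editorial example of why tracking is per register unit: on X86, $al, $ax,
+// and $eax share register units, so a def of $eax recorded here also provides
+// the dependency for a later read of $al, even though the operands name
+// different registers.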
+
+/// The length of the critical path through a trace is the maximum of two path
+/// lengths:
+///
+/// 1. The maximum height+depth over all instructions in the trace center block.
+///
+/// 2. The longest cross-block dependency chain. For small blocks, it is
+/// possible that the critical path through the trace doesn't include any
+/// instructions in the block.
+///
+/// This function computes the second number from the live-in list of the
+/// center block.
+unsigned MachineTraceMetrics::Ensemble::
+computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) {
+ assert(TBI.HasValidInstrDepths && "Missing depth info");
+ assert(TBI.HasValidInstrHeights && "Missing height info");
+ unsigned MaxLen = 0;
+ for (const LiveInReg &LIR : TBI.LiveIns) {
+ if (!LIR.Reg.isVirtual())
+ continue;
+ const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
+ // Ignore dependencies outside the current trace.
+ const TraceBlockInfo &DefTBI = BlockInfo[DefMI->getParent()->getNumber()];
+ if (!DefTBI.isUsefulDominator(TBI))
+ continue;
+ unsigned Len = LIR.Height + Cycles[DefMI].Depth;
+ MaxLen = std::max(MaxLen, Len);
+ }
+ return MaxLen;
+}
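+
+// Editorial worked example: if a live-in virtual register has Height 7 in the
+// center block and its defining instruction higher up in the trace has Depth
+// 5, that live-in contributes a cross-block path of 5 + 7 = 12 cycles. The
+// trace critical path is the maximum of such sums and of Depth + Height over
+// the instructions in the center block itself.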
+
+void MachineTraceMetrics::Ensemble::
+updateDepth(MachineTraceMetrics::TraceBlockInfo &TBI, const MachineInstr &UseMI,
+ SparseSet<LiveRegUnit> &RegUnits) {
+ SmallVector<DataDep, 8> Deps;
+ // Collect all data dependencies.
+ if (UseMI.isPHI())
+ getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI);
+ else if (getDataDeps(UseMI, Deps, MTM.MRI))
+ updatePhysDepsDownwards(&UseMI, Deps, RegUnits, MTM.TRI);
+
+ // Filter and process dependencies, computing the earliest issue cycle.
+ unsigned Cycle = 0;
+ for (const DataDep &Dep : Deps) {
+ const TraceBlockInfo &DepTBI =
+ BlockInfo[Dep.DefMI->getParent()->getNumber()];
+ // Ignore dependencies from outside the current trace.
+ if (!DepTBI.isUsefulDominator(TBI))
+ continue;
+ assert(DepTBI.HasValidInstrDepths && "Inconsistent dependency");
+ unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth;
+ // Add latency if DefMI is a real instruction. Transients get latency 0.
+ if (!Dep.DefMI->isTransient())
+ DepCycle += MTM.SchedModel
+ .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp);
+ Cycle = std::max(Cycle, DepCycle);
+ }
+ // Remember the instruction depth.
+ InstrCycles &MICycles = Cycles[&UseMI];
+ MICycles.Depth = Cycle;
+
+ if (TBI.HasValidInstrHeights) {
+ // Update critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height);
+ LLVM_DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << UseMI);
+ } else {
+ LLVM_DEBUG(dbgs() << Cycle << '\t' << UseMI);
+ }
+}
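+
+// Editorial worked example of the loop above: with two dependencies, one
+// whose def has depth 3 and operand latency 2 and one whose def has depth 4
+// and operand latency 1, the issue depth of UseMI is max(3 + 2, 4 + 1) = 5.
+// Transient instructions such as COPY contribute latency 0, so they simply
+// propagate their operands' depths.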
+
+void MachineTraceMetrics::Ensemble::
+updateDepth(const MachineBasicBlock *MBB, const MachineInstr &UseMI,
+ SparseSet<LiveRegUnit> &RegUnits) {
+ updateDepth(BlockInfo[MBB->getNumber()], UseMI, RegUnits);
+}
+
+void MachineTraceMetrics::Ensemble::
+updateDepths(MachineBasicBlock::iterator Start,
+ MachineBasicBlock::iterator End,
+ SparseSet<LiveRegUnit> &RegUnits) {
+ for (; Start != End; Start++)
+ updateDepth(Start->getParent(), *Start, RegUnits);
+}
+
+/// Compute instruction depths for all instructions above or in MBB in its
+/// trace. This assumes that the trace through MBB has already been computed.
+void MachineTraceMetrics::Ensemble::
+computeInstrDepths(const MachineBasicBlock *MBB) {
+ // The top of the trace may already be computed, and HasValidInstrDepths
+ // implies Head->HasValidInstrDepths, so we only need to start from the first
+ // block in the trace that needs to be recomputed.
+ SmallVector<const MachineBasicBlock*, 8> Stack;
+ do {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ assert(TBI.hasValidDepth() && "Incomplete trace");
+ if (TBI.HasValidInstrDepths)
+ break;
+ Stack.push_back(MBB);
+ MBB = TBI.Pred;
+ } while (MBB);
+
+ // FIXME: If MBB is non-null at this point, it is the last pre-computed block
+ // in the trace. We should track any live-out physregs that were defined in
+ // the trace. This is quite rare in SSA form, typically created by CSE
+ // hoisting a compare.
+ SparseSet<LiveRegUnit> RegUnits;
+ RegUnits.setUniverse(MTM.TRI->getNumRegUnits());
+
+ // Go through trace blocks in top-down order, stopping after the center block.
+ while (!Stack.empty()) {
+ MBB = Stack.pop_back_val();
+ LLVM_DEBUG(dbgs() << "\nDepths for " << printMBBReference(*MBB) << ":\n");
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ TBI.HasValidInstrDepths = true;
+ TBI.CriticalPath = 0;
+
+ // Print out resource depths here as well.
+ LLVM_DEBUG({
+ dbgs() << format("%7u Instructions\n", TBI.InstrDepth);
+ ArrayRef<unsigned> PRDepths = getProcResourceDepths(MBB->getNumber());
+ for (unsigned K = 0; K != PRDepths.size(); ++K)
+ if (PRDepths[K]) {
+ unsigned Factor = MTM.SchedModel.getResourceFactor(K);
+ dbgs() << format("%6uc @ ", MTM.getCycles(PRDepths[K]))
+ << MTM.SchedModel.getProcResource(K)->Name << " ("
+ << PRDepths[K]/Factor << " ops x" << Factor << ")\n";
+ }
+ });
+
+ // Also compute the critical path length through MBB when possible.
+ if (TBI.HasValidInstrHeights)
+ TBI.CriticalPath = computeCrossBlockCriticalPath(TBI);
+
+ for (const auto &UseMI : *MBB) {
+ updateDepth(TBI, UseMI, RegUnits);
+ }
+ }
+}
+
+// Identify physreg dependencies for MI when scanning instructions upwards.
+// Return the issue height of MI after considering any live regunits.
+// Height is the issue height computed from virtual register dependencies alone.
+static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
+ SparseSet<LiveRegUnit> &RegUnits,
+ const TargetSchedModel &SchedModel,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ SmallVector<unsigned, 8> ReadOps;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isPhysical())
+ continue;
+ if (MO.readsReg())
+ ReadOps.push_back(MO.getOperandNo());
+ if (!MO.isDef())
+ continue;
+ // This is a def of Reg. Remove corresponding entries from RegUnits, and
+ // update MI Height to consider the physreg dependencies.
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
+ SparseSet<LiveRegUnit>::iterator I = RegUnits.find(Unit);
+ if (I == RegUnits.end())
+ continue;
+ unsigned DepHeight = I->Cycle;
+ if (!MI.isTransient()) {
+ // We may not know the UseMI of this dependency, if it came from the
+ // live-in list. SchedModel can handle a NULL UseMI.
+ DepHeight += SchedModel.computeOperandLatency(&MI, MO.getOperandNo(),
+ I->MI, I->Op);
+ }
+ Height = std::max(Height, DepHeight);
+ // This regunit is dead above MI.
+ RegUnits.erase(I);
+ }
+ }
+
+ // Now we know the height of MI. Update any regunits read.
+ for (size_t I = 0, E = ReadOps.size(); I != E; ++I) {
+ MCRegister Reg = MI.getOperand(ReadOps[I]).getReg().asMCReg();
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ LiveRegUnit &LRU = RegUnits[Unit];
+ // Set the height to the highest reader of the unit.
+ if (LRU.Cycle <= Height && LRU.MI != &MI) {
+ LRU.Cycle = Height;
+ LRU.MI = &MI;
+ LRU.Op = ReadOps[I];
+ }
+ }
+ }
+
+ return Height;
+}
+
+using MIHeightMap = DenseMap<const MachineInstr *, unsigned>;
+
+// Push the height of DefMI upwards if required to match UseMI.
+// Return true if this is the first time DefMI was seen.
+static bool pushDepHeight(const DataDep &Dep, const MachineInstr &UseMI,
+ unsigned UseHeight, MIHeightMap &Heights,
+ const TargetSchedModel &SchedModel,
+ const TargetInstrInfo *TII) {
+ // Adjust height by Dep.DefMI latency.
+ if (!Dep.DefMI->isTransient())
+ UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI,
+ Dep.UseOp);
+
+ // Update Heights[DefMI] to be the maximum height seen.
+ MIHeightMap::iterator I;
+ bool New;
+ std::tie(I, New) = Heights.insert(std::make_pair(Dep.DefMI, UseHeight));
+ if (New)
+ return true;
+
+ // DefMI has been pushed before. Give it the max height.
+ if (I->second < UseHeight)
+ I->second = UseHeight;
+ return false;
+}
+
+/// Assuming that the virtual register defined by DefMI:DefOp was used by
+/// Trace.back(), add it to the live-in lists of all the blocks in Trace. Stop
+/// when reaching the block that contains DefMI.
+void MachineTraceMetrics::Ensemble::
+addLiveIns(const MachineInstr *DefMI, unsigned DefOp,
+ ArrayRef<const MachineBasicBlock*> Trace) {
+ assert(!Trace.empty() && "Trace should contain at least one block");
+ Register Reg = DefMI->getOperand(DefOp).getReg();
+ assert(Reg.isVirtual());
+ const MachineBasicBlock *DefMBB = DefMI->getParent();
+
+ // Reg is live-in to all blocks in Trace that follow DefMBB.
+ for (const MachineBasicBlock *MBB : llvm::reverse(Trace)) {
+ if (MBB == DefMBB)
+ return;
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ // Just add the register. The height will be updated later.
+ TBI.LiveIns.push_back(Reg);
+ }
+}
+
+/// Compute instruction heights in the trace through MBB. This updates MBB and
+/// the blocks below it in the trace. It is assumed that the trace has already
+/// been computed.
+void MachineTraceMetrics::Ensemble::
+computeInstrHeights(const MachineBasicBlock *MBB) {
+ // The bottom of the trace may already be computed.
+ // Find the blocks that need updating.
+ SmallVector<const MachineBasicBlock*, 8> Stack;
+ do {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ assert(TBI.hasValidHeight() && "Incomplete trace");
+ if (TBI.HasValidInstrHeights)
+ break;
+ Stack.push_back(MBB);
+ TBI.LiveIns.clear();
+ MBB = TBI.Succ;
+ } while (MBB);
+
+ // As we move upwards in the trace, keep track of instructions that are
+ // required by deeper trace instructions. Map MI -> height required so far.
+ MIHeightMap Heights;
+
+ // For physregs, the def isn't known when we see the use.
+ // Instead, keep track of the highest use of each regunit.
+ SparseSet<LiveRegUnit> RegUnits;
+ RegUnits.setUniverse(MTM.TRI->getNumRegUnits());
+
+ // If the bottom of the trace was already precomputed, initialize heights
+ // from its live-in list.
+ // MBB is the highest precomputed block in the trace.
+ if (MBB) {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ for (LiveInReg &LI : TBI.LiveIns) {
+ if (LI.Reg.isVirtual()) {
+ // For virtual registers, the def latency is included.
+ unsigned &Height = Heights[MTM.MRI->getVRegDef(LI.Reg)];
+ if (Height < LI.Height)
+ Height = LI.Height;
+ } else {
+ // For register units, the def latency is not included because we don't
+ // know the def yet.
+ RegUnits[LI.Reg].Cycle = LI.Height;
+ }
+ }
+ }
+
+ // Go through the trace blocks in bottom-up order.
+ SmallVector<DataDep, 8> Deps;
+ for (;!Stack.empty(); Stack.pop_back()) {
+ MBB = Stack.back();
+ LLVM_DEBUG(dbgs() << "Heights for " << printMBBReference(*MBB) << ":\n");
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ TBI.HasValidInstrHeights = true;
+ TBI.CriticalPath = 0;
+
+ LLVM_DEBUG({
+ dbgs() << format("%7u Instructions\n", TBI.InstrHeight);
+ ArrayRef<unsigned> PRHeights = getProcResourceHeights(MBB->getNumber());
+ for (unsigned K = 0; K != PRHeights.size(); ++K)
+ if (PRHeights[K]) {
+ unsigned Factor = MTM.SchedModel.getResourceFactor(K);
+ dbgs() << format("%6uc @ ", MTM.getCycles(PRHeights[K]))
+ << MTM.SchedModel.getProcResource(K)->Name << " ("
+ << PRHeights[K]/Factor << " ops x" << Factor << ")\n";
+ }
+ });
+
+ // Get dependencies from PHIs in the trace successor.
+ const MachineBasicBlock *Succ = TBI.Succ;
+ // If MBB is the last block in the trace, and it has a back-edge to the
+ // loop header, get loop-carried dependencies from PHIs in the header. For
+ // that purpose, pretend that all the loop header PHIs have height 0.
+ if (!Succ)
+ if (const MachineLoop *Loop = getLoopFor(MBB))
+ if (MBB->isSuccessor(Loop->getHeader()))
+ Succ = Loop->getHeader();
+
+ if (Succ) {
+ for (const auto &PHI : *Succ) {
+ if (!PHI.isPHI())
+ break;
+ Deps.clear();
+ getPHIDeps(PHI, Deps, MBB, MTM.MRI);
+ if (!Deps.empty()) {
+ // Loop header PHI heights are all 0.
+ unsigned Height = TBI.Succ ? Cycles.lookup(&PHI).Height : 0;
+ LLVM_DEBUG(dbgs() << "pred\t" << Height << '\t' << PHI);
+ if (pushDepHeight(Deps.front(), PHI, Height, Heights, MTM.SchedModel,
+ MTM.TII))
+ addLiveIns(Deps.front().DefMI, Deps.front().DefOp, Stack);
+ }
+ }
+ }
+
+ // Go through the block backwards.
+ for (const MachineInstr &MI : reverse(*MBB)) {
+ // Find the MI height as determined by virtual register uses in the
+ // trace below.
+ unsigned Cycle = 0;
+ MIHeightMap::iterator HeightI = Heights.find(&MI);
+ if (HeightI != Heights.end()) {
+ Cycle = HeightI->second;
+ // We won't be seeing any more MI uses.
+ Heights.erase(HeightI);
+ }
+
+ // Don't process PHI deps. They depend on the specific predecessor, and
+ // we'll get them when visiting the predecessor.
+ Deps.clear();
+ bool HasPhysRegs = !MI.isPHI() && getDataDeps(MI, Deps, MTM.MRI);
+
+ // There may also be regunit dependencies to include in the height.
+ if (HasPhysRegs)
+ Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits, MTM.SchedModel,
+ MTM.TII, MTM.TRI);
+
+ // Update the required height of any virtual registers read by MI.
+ for (const DataDep &Dep : Deps)
+ if (pushDepHeight(Dep, MI, Cycle, Heights, MTM.SchedModel, MTM.TII))
+ addLiveIns(Dep.DefMI, Dep.DefOp, Stack);
+
+ InstrCycles &MICycles = Cycles[&MI];
+ MICycles.Height = Cycle;
+ if (!TBI.HasValidInstrDepths) {
+ LLVM_DEBUG(dbgs() << Cycle << '\t' << MI);
+ continue;
+ }
+ // Update critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Depth);
+ LLVM_DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << MI);
+ }
+
+ // Update virtual live-in heights. They were added by addLiveIns() with a 0
+ // height because the final height isn't known until now.
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " Live-ins:");
+ for (LiveInReg &LIR : TBI.LiveIns) {
+ const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
+ LIR.Height = Heights.lookup(DefMI);
+ LLVM_DEBUG(dbgs() << ' ' << printReg(LIR.Reg) << '@' << LIR.Height);
+ }
+
+ // Transfer the live regunits to the live-in list.
+ for (const LiveRegUnit &RU : RegUnits) {
+ TBI.LiveIns.push_back(LiveInReg(RU.RegUnit, RU.Cycle));
+ LLVM_DEBUG(dbgs() << ' ' << printRegUnit(RU.RegUnit, MTM.TRI) << '@'
+ << RU.Cycle);
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+
+ if (!TBI.HasValidInstrDepths)
+ continue;
+ // Add live-ins to the critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath,
+ computeCrossBlockCriticalPath(TBI));
+ LLVM_DEBUG(dbgs() << "Critical path: " << TBI.CriticalPath << '\n');
+ }
+}
+
+MachineTraceMetrics::Trace
+MachineTraceMetrics::Ensemble::getTrace(const MachineBasicBlock *MBB) {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+
+ if (!TBI.hasValidDepth() || !TBI.hasValidHeight())
+ computeTrace(MBB);
+ if (!TBI.HasValidInstrDepths)
+ computeInstrDepths(MBB);
+ if (!TBI.HasValidInstrHeights)
+ computeInstrHeights(MBB);
+
+ return Trace(*this, TBI);
+}
+
+unsigned
+MachineTraceMetrics::Trace::getInstrSlack(const MachineInstr &MI) const {
+ assert(getBlockNum() == unsigned(MI.getParent()->getNumber()) &&
+ "MI must be in the trace center block");
+ InstrCycles Cyc = getInstrCycles(MI);
+ return getCriticalPath() - (Cyc.Depth + Cyc.Height);
+}
+
+unsigned
+MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr &PHI) const {
+ const MachineBasicBlock *MBB = TE.MTM.MF->getBlockNumbered(getBlockNum());
+ SmallVector<DataDep, 1> Deps;
+ getPHIDeps(PHI, Deps, MBB, TE.MTM.MRI);
+ assert(Deps.size() == 1 && "PHI doesn't have MBB as a predecessor");
+ DataDep &Dep = Deps.front();
+ unsigned DepCycle = getInstrCycles(*Dep.DefMI).Depth;
+ // Add latency if DefMI is a real instruction. Transients get latency 0.
+ if (!Dep.DefMI->isTransient())
+ DepCycle += TE.MTM.SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
+ &PHI, Dep.UseOp);
+ return DepCycle;
+}
+
+/// When Bottom is set, include instructions in the current block in the
+/// estimate.
+unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {
+ // Find the limiting processor resource.
+ // Numbers have been pre-scaled to be comparable.
+ unsigned PRMax = 0;
+ ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
+ if (Bottom) {
+ ArrayRef<unsigned> PRCycles = TE.MTM.getProcResourceCycles(getBlockNum());
+ for (unsigned K = 0; K != PRDepths.size(); ++K)
+ PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]);
+ } else {
+ for (unsigned PRD : PRDepths)
+ PRMax = std::max(PRMax, PRD);
+ }
+ // Convert to cycle count.
+ PRMax = TE.MTM.getCycles(PRMax);
+
+  // All instructions before the current block,
+  unsigned Instrs = TBI.InstrDepth;
+  // plus instructions in the current block.
+ if (Bottom)
+ Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount;
+ if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
+ Instrs /= IW;
+ // Assume issue width 1 without a schedule model.
+ return std::max(Instrs, PRMax);
+}
+
+unsigned MachineTraceMetrics::Trace::getResourceLength(
+ ArrayRef<const MachineBasicBlock *> Extrablocks,
+ ArrayRef<const MCSchedClassDesc *> ExtraInstrs,
+ ArrayRef<const MCSchedClassDesc *> RemoveInstrs) const {
+ // Add up resources above and below the center block.
+ ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
+ ArrayRef<unsigned> PRHeights = TE.getProcResourceHeights(getBlockNum());
+ unsigned PRMax = 0;
+
+  // Compute the cycles that the given extra instructions contribute to
+  // processor resource ResourceIdx.
+ auto extraCycles = [this](ArrayRef<const MCSchedClassDesc *> Instrs,
+ unsigned ResourceIdx)
+ ->unsigned {
+ unsigned Cycles = 0;
+ for (const MCSchedClassDesc *SC : Instrs) {
+ if (!SC->isValid())
+ continue;
+ for (TargetSchedModel::ProcResIter
+ PI = TE.MTM.SchedModel.getWriteProcResBegin(SC),
+ PE = TE.MTM.SchedModel.getWriteProcResEnd(SC);
+ PI != PE; ++PI) {
+ if (PI->ProcResourceIdx != ResourceIdx)
+ continue;
+ Cycles +=
+ (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(ResourceIdx));
+ }
+ }
+ return Cycles;
+ };
+
+ for (unsigned K = 0; K != PRDepths.size(); ++K) {
+ unsigned PRCycles = PRDepths[K] + PRHeights[K];
+ for (const MachineBasicBlock *MBB : Extrablocks)
+ PRCycles += TE.MTM.getProcResourceCycles(MBB->getNumber())[K];
+ PRCycles += extraCycles(ExtraInstrs, K);
+ PRCycles -= extraCycles(RemoveInstrs, K);
+ PRMax = std::max(PRMax, PRCycles);
+ }
+ // Convert to cycle count.
+ PRMax = TE.MTM.getCycles(PRMax);
+
+ // Instrs: #instructions in current trace outside current block.
+ unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight;
+ // Add instruction count from the extra blocks.
+ for (const MachineBasicBlock *MBB : Extrablocks)
+ Instrs += TE.MTM.getResources(MBB)->InstrCount;
+ Instrs += ExtraInstrs.size();
+ Instrs -= RemoveInstrs.size();
+ if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
+ Instrs /= IW;
+ // Assume issue width 1 without a schedule model.
+ return std::max(Instrs, PRMax);
+}
+
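+// Return true if UseMI's dependency on DefMI stays within this trace: either
+// both instructions are in the same block, or DefMI's block is a useful
+// dominator of UseMI's block in the trace.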
+bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr &DefMI,
+ const MachineInstr &UseMI) const {
+ if (DefMI.getParent() == UseMI.getParent())
+ return true;
+
+ const TraceBlockInfo &DepTBI = TE.BlockInfo[DefMI.getParent()->getNumber()];
+ const TraceBlockInfo &TBI = TE.BlockInfo[UseMI.getParent()->getNumber()];
+
+ return DepTBI.isUsefulDominator(TBI);
+}
+
+void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const {
+ OS << getName() << " ensemble:\n";
+ for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) {
+ OS << " %bb." << i << '\t';
+ BlockInfo[i].print(OS);
+ OS << '\n';
+ }
+}
+
+void MachineTraceMetrics::TraceBlockInfo::print(raw_ostream &OS) const {
+ if (hasValidDepth()) {
+ OS << "depth=" << InstrDepth;
+ if (Pred)
+ OS << " pred=" << printMBBReference(*Pred);
+ else
+ OS << " pred=null";
+ OS << " head=%bb." << Head;
+ if (HasValidInstrDepths)
+ OS << " +instrs";
+ } else
+ OS << "depth invalid";
+ OS << ", ";
+ if (hasValidHeight()) {
+ OS << "height=" << InstrHeight;
+ if (Succ)
+ OS << " succ=" << printMBBReference(*Succ);
+ else
+ OS << " succ=null";
+ OS << " tail=%bb." << Tail;
+ if (HasValidInstrHeights)
+ OS << " +instrs";
+ } else
+ OS << "height invalid";
+ if (HasValidInstrDepths && HasValidInstrHeights)
+ OS << ", crit=" << CriticalPath;
+}
+
+void MachineTraceMetrics::Trace::print(raw_ostream &OS) const {
+ unsigned MBBNum = &TBI - &TE.BlockInfo[0];
+
+ OS << TE.getName() << " trace %bb." << TBI.Head << " --> %bb." << MBBNum
+ << " --> %bb." << TBI.Tail << ':';
+ if (TBI.hasValidHeight() && TBI.hasValidDepth())
+ OS << ' ' << getInstrCount() << " instrs.";
+ if (TBI.HasValidInstrDepths && TBI.HasValidInstrHeights)
+ OS << ' ' << TBI.CriticalPath << " cycles.";
+
+ const MachineTraceMetrics::TraceBlockInfo *Block = &TBI;
+ OS << "\n%bb." << MBBNum;
+ while (Block->hasValidDepth() && Block->Pred) {
+ unsigned Num = Block->Pred->getNumber();
+ OS << " <- " << printMBBReference(*Block->Pred);
+ Block = &TE.BlockInfo[Num];
+ }
+
+ Block = &TBI;
+ OS << "\n ";
+ while (Block->hasValidHeight() && Block->Succ) {
+ unsigned Num = Block->Succ->getNumber();
+ OS << " -> " << printMBBReference(*Block->Succ);
+ Block = &TE.BlockInfo[Num];
+ }
+ OS << '\n';
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
new file mode 100644
index 000000000000..0e02c50284c6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -0,0 +1,264 @@
+//===- MachineUniformityAnalysis.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/ADT/GenericUniformityImpl.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAContext.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
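+// Return true if any register defined by \p I is currently marked divergent.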
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::hasDivergentDefs(
+ const MachineInstr &I) const {
+ for (auto &op : I.all_defs()) {
+ if (isDivergent(op.getReg()))
+ return true;
+ }
+ return false;
+}
+
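+// Mark every virtual, non-uniform register defined by \p Instr as divergent.
+// Returns true if any register was newly marked.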
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::markDefsDivergent(
+ const MachineInstr &Instr) {
+ bool insertedDivergent = false;
+ const auto &MRI = F.getRegInfo();
+ const auto &RBI = *F.getSubtarget().getRegBankInfo();
+ const auto &TRI = *MRI.getTargetRegisterInfo();
+ for (auto &op : Instr.all_defs()) {
+ if (!op.getReg().isVirtual())
+ continue;
+ assert(!op.getSubReg());
+ if (TRI.isUniformReg(MRI, RBI, op.getReg()))
+ continue;
+ insertedDivergent |= markDivergent(op.getReg());
+ }
+ return insertedDivergent;
+}
+
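+// Seed the analysis from target information: instructions reported as
+// AlwaysUniform get a uniformity override, while NeverUniform instructions
+// are marked divergent.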
+template <>
+void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::initialize() {
+ const auto &InstrInfo = *F.getSubtarget().getInstrInfo();
+
+ for (const MachineBasicBlock &block : F) {
+ for (const MachineInstr &instr : block) {
+ auto uniformity = InstrInfo.getInstructionUniformity(instr);
+ if (uniformity == InstructionUniformity::AlwaysUniform) {
+ addUniformOverride(instr);
+ continue;
+ }
+
+ if (uniformity == InstructionUniformity::NeverUniform) {
+ markDivergent(instr);
+ }
+ }
+ }
+}
+
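+// Mark every instruction that uses the divergent register \p Reg as divergent.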
+template <>
+void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::pushUsers(
+ Register Reg) {
+ assert(isDivergent(Reg));
+ const auto &RegInfo = F.getRegInfo();
+ for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
+ markDivergent(UserInstr);
+ }
+}
+
+template <>
+void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::pushUsers(
+ const MachineInstr &Instr) {
+ assert(!isAlwaysUniform(Instr));
+ if (Instr.isTerminator())
+ return;
+ for (const MachineOperand &op : Instr.all_defs()) {
+ auto Reg = op.getReg();
+ if (isDivergent(Reg))
+ pushUsers(Reg);
+ }
+}
+
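+// Return true if \p I reads a value defined inside \p DefCycle. Physical
+// register uses conservatively return true (see the FIXME below).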
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::usesValueFromCycle(
+ const MachineInstr &I, const MachineCycle &DefCycle) const {
+ assert(!isAlwaysUniform(I));
+ for (auto &Op : I.operands()) {
+ if (!Op.isReg() || !Op.readsReg())
+ continue;
+ auto Reg = Op.getReg();
+
+ // FIXME: Physical registers need to be properly checked instead of always
+ // returning true
+ if (Reg.isPhysical())
+ return true;
+
+ auto *Def = F.getRegInfo().getVRegDef(Reg);
+ if (DefCycle.contains(Def->getParent()))
+ return true;
+ }
+ return false;
+}
+
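+// Values defined by \p I inside \p DefCycle become divergent when used
+// outside the cycle; mark such out-of-cycle users divergent.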
+template <>
+void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
+ propagateTemporalDivergence(const MachineInstr &I,
+ const MachineCycle &DefCycle) {
+ const auto &RegInfo = F.getRegInfo();
+ for (auto &Op : I.all_defs()) {
+ if (!Op.getReg().isVirtual())
+ continue;
+ auto Reg = Op.getReg();
+ if (isDivergent(Reg))
+ continue;
+ for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
+ if (DefCycle.contains(UserInstr.getParent()))
+ continue;
+ markDivergent(UserInstr);
+ }
+ }
+}
+
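+// A use is divergent if the register itself is divergent, if it has no
+// unique SSA def, or if the def is temporally divergent relative to the
+// use's block.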
+template <>
+bool llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::isDivergentUse(
+ const MachineOperand &U) const {
+ if (!U.isReg())
+ return false;
+
+ auto Reg = U.getReg();
+ if (isDivergent(Reg))
+ return true;
+
+ const auto &RegInfo = F.getRegInfo();
+ auto *Def = RegInfo.getOneDef(Reg);
+ if (!Def)
+ return true;
+
+ auto *DefInstr = Def->getParent();
+ auto *UseInstr = U.getParent();
+ return isTemporalDivergent(*UseInstr->getParent(), *DefInstr);
+}
+
+// This ensures explicit instantiation of
+// GenericUniformityAnalysisImpl::ImplDeleter::operator()
+template class llvm::GenericUniformityInfo<MachineSSAContext>;
+template struct llvm::GenericUniformityAnalysisImplDeleter<
+ llvm::GenericUniformityAnalysisImpl<MachineSSAContext>>;
+
+MachineUniformityInfo llvm::computeMachineUniformityInfo(
+ MachineFunction &F, const MachineCycleInfo &cycleInfo,
+ const MachineDomTree &domTree, bool HasBranchDivergence) {
+ assert(F.getRegInfo().isSSA() && "Expected to be run on SSA form!");
+ MachineUniformityInfo UI(F, domTree, cycleInfo);
+ if (HasBranchDivergence)
+ UI.compute();
+ return UI;
+}
+
+namespace {
+
+/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
+class MachineUniformityAnalysisPass : public MachineFunctionPass {
+ MachineUniformityInfo UI;
+
+public:
+ static char ID;
+
+ MachineUniformityAnalysisPass();
+
+ MachineUniformityInfo &getUniformityInfo() { return UI; }
+ const MachineUniformityInfo &getUniformityInfo() const { return UI; }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void print(raw_ostream &OS, const Module *M = nullptr) const override;
+
+ // TODO: verify analysis
+};
+
+class MachineUniformityInfoPrinterPass : public MachineFunctionPass {
+public:
+ static char ID;
+
+ MachineUniformityInfoPrinterPass();
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // namespace
+
+char MachineUniformityAnalysisPass::ID = 0;
+
+MachineUniformityAnalysisPass::MachineUniformityAnalysisPass()
+ : MachineFunctionPass(ID) {
+ initializeMachineUniformityAnalysisPassPass(*PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(MachineUniformityAnalysisPass, "machine-uniformity",
+ "Machine Uniformity Info Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
+ "Machine Uniformity Info Analysis", true, true)
+
+void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineCycleInfoWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineUniformityAnalysisPass::runOnMachineFunction(MachineFunction &MF) {
+ auto &DomTree = getAnalysis<MachineDominatorTree>().getBase();
+ auto &CI = getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+ // FIXME: Query TTI::hasBranchDivergence. -run-pass seems to end up with a
+ // default NoTTI
+ UI = computeMachineUniformityInfo(MF, CI, DomTree, true);
+ return false;
+}
+
+void MachineUniformityAnalysisPass::print(raw_ostream &OS,
+ const Module *) const {
+ OS << "MachineUniformityInfo for function: " << UI.getFunction().getName()
+ << "\n";
+ UI.print(OS);
+}
+
+char MachineUniformityInfoPrinterPass::ID = 0;
+
+MachineUniformityInfoPrinterPass::MachineUniformityInfoPrinterPass()
+ : MachineFunctionPass(ID) {
+ initializeMachineUniformityInfoPrinterPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(MachineUniformityInfoPrinterPass,
+ "print-machine-uniformity",
+ "Print Machine Uniformity Info Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
+INITIALIZE_PASS_END(MachineUniformityInfoPrinterPass,
+ "print-machine-uniformity",
+ "Print Machine Uniformity Info Analysis", true, true)
+
+void MachineUniformityInfoPrinterPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineUniformityAnalysisPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineUniformityInfoPrinterPass::runOnMachineFunction(
+ MachineFunction &F) {
+ auto &UI = getAnalysis<MachineUniformityAnalysisPass>();
+ UI.print(errs());
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp
new file mode 100644
index 000000000000..7acd3c4039e8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -0,0 +1,3465 @@
+//===- MachineVerifier.cpp - Machine Code Verifier ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass to verify generated machine code. The following is checked:
+//
+// Operand counts: All explicit operands must be present.
+//
+// Register classes: All physical and virtual register operands must be
+// compatible with the register class required by the instruction descriptor.
+//
+// Register live intervals: Registers must be defined only once, and must be
+// defined before use.
+//
+// The machine code verifier is enabled with the command-line option
+// -verify-machineinstrs.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeCalc.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterBank.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ModRef.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+
+ struct MachineVerifier {
+ MachineVerifier(Pass *pass, const char *b) : PASS(pass), Banner(b) {}
+
+ unsigned verify(const MachineFunction &MF);
+
+ Pass *const PASS;
+ const char *Banner;
+ const MachineFunction *MF = nullptr;
+ const TargetMachine *TM = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+ const RegisterBankInfo *RBI = nullptr;
+
+ unsigned foundErrors = 0;
+
+ // Avoid querying the MachineFunctionProperties for each operand.
+ bool isFunctionRegBankSelected = false;
+ bool isFunctionSelected = false;
+ bool isFunctionTracksDebugUserValues = false;
+
+ using RegVector = SmallVector<Register, 16>;
+ using RegMaskVector = SmallVector<const uint32_t *, 4>;
+ using RegSet = DenseSet<Register>;
+ using RegMap = DenseMap<Register, const MachineInstr *>;
+ using BlockSet = SmallPtrSet<const MachineBasicBlock *, 8>;
+
+ const MachineInstr *FirstNonPHI = nullptr;
+ const MachineInstr *FirstTerminator = nullptr;
+ BlockSet FunctionBlocks;
+
+ BitVector regsReserved;
+ RegSet regsLive;
+ RegVector regsDefined, regsDead, regsKilled;
+ RegMaskVector regMasks;
+
+ SlotIndex lastIndex;
+
+ // Add Reg and any sub-registers to RV
+ void addRegWithSubRegs(RegVector &RV, Register Reg) {
+ RV.push_back(Reg);
+ if (Reg.isPhysical())
+ append_range(RV, TRI->subregs(Reg.asMCReg()));
+ }
+
+ struct BBInfo {
+ // Is this MBB reachable from the MF entry point?
+ bool reachable = false;
+
+ // Vregs that must be live in because they are used without being
+      // defined. The map value is the user. vregsLiveIn doesn't include regs
+      // that are only used by PHI nodes.
+ RegMap vregsLiveIn;
+
+ // Regs killed in MBB. They may be defined again, and will then be in both
+ // regsKilled and regsLiveOut.
+ RegSet regsKilled;
+
+ // Regs defined in MBB and live out. Note that vregs passing through may
+ // be live out without being mentioned here.
+ RegSet regsLiveOut;
+
+ // Vregs that pass through MBB untouched. This set is disjoint from
+ // regsKilled and regsLiveOut.
+ RegSet vregsPassed;
+
+ // Vregs that must pass through MBB because they are needed by a successor
+ // block. This set is disjoint from regsLiveOut.
+ RegSet vregsRequired;
+
+ // Set versions of block's predecessor and successor lists.
+ BlockSet Preds, Succs;
+
+ BBInfo() = default;
+
+ // Add register to vregsRequired if it belongs there. Return true if
+ // anything changed.
+ bool addRequired(Register Reg) {
+ if (!Reg.isVirtual())
+ return false;
+ if (regsLiveOut.count(Reg))
+ return false;
+ return vregsRequired.insert(Reg).second;
+ }
+
+ // Same for a full set.
+ bool addRequired(const RegSet &RS) {
+ bool Changed = false;
+ for (Register Reg : RS)
+ Changed |= addRequired(Reg);
+ return Changed;
+ }
+
+ // Same for a full map.
+ bool addRequired(const RegMap &RM) {
+ bool Changed = false;
+ for (const auto &I : RM)
+ Changed |= addRequired(I.first);
+ return Changed;
+ }
+
+ // Live-out registers are either in regsLiveOut or vregsPassed.
+ bool isLiveOut(Register Reg) const {
+ return regsLiveOut.count(Reg) || vregsPassed.count(Reg);
+ }
+ };
+
+ // Extra register info per MBB.
+ DenseMap<const MachineBasicBlock*, BBInfo> MBBInfoMap;
+
+ bool isReserved(Register Reg) {
+ return Reg.id() < regsReserved.size() && regsReserved.test(Reg.id());
+ }
+
+ bool isAllocatable(Register Reg) const {
+ return Reg.id() < TRI->getNumRegs() && TRI->isInAllocatableClass(Reg) &&
+ !regsReserved.test(Reg.id());
+ }
+
+ // Analysis information if available
+ LiveVariables *LiveVars = nullptr;
+ LiveIntervals *LiveInts = nullptr;
+ LiveStacks *LiveStks = nullptr;
+ SlotIndexes *Indexes = nullptr;
+
+ void visitMachineFunctionBefore();
+ void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB);
+ void visitMachineBundleBefore(const MachineInstr *MI);
+
+ /// Verify that all of \p MI's virtual register operands are scalars.
+ /// \returns True if all virtual register operands are scalar. False
+ /// otherwise.
+ bool verifyAllRegOpsScalar(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI);
+ bool verifyVectorElementMatch(LLT Ty0, LLT Ty1, const MachineInstr *MI);
+ void verifyPreISelGenericInstruction(const MachineInstr *MI);
+ void visitMachineInstrBefore(const MachineInstr *MI);
+ void visitMachineOperand(const MachineOperand *MO, unsigned MONum);
+ void visitMachineBundleAfter(const MachineInstr *MI);
+ void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB);
+ void visitMachineFunctionAfter();
+
+ void report(const char *msg, const MachineFunction *MF);
+ void report(const char *msg, const MachineBasicBlock *MBB);
+ void report(const char *msg, const MachineInstr *MI);
+ void report(const char *msg, const MachineOperand *MO, unsigned MONum,
+ LLT MOVRegType = LLT{});
+ void report(const Twine &Msg, const MachineInstr *MI);
+
+ void report_context(const LiveInterval &LI) const;
+ void report_context(const LiveRange &LR, Register VRegUnit,
+ LaneBitmask LaneMask) const;
+ void report_context(const LiveRange::Segment &S) const;
+ void report_context(const VNInfo &VNI) const;
+ void report_context(SlotIndex Pos) const;
+ void report_context(MCPhysReg PhysReg) const;
+ void report_context_liverange(const LiveRange &LR) const;
+ void report_context_lanemask(LaneBitmask LaneMask) const;
+ void report_context_vreg(Register VReg) const;
+ void report_context_vreg_regunit(Register VRegOrUnit) const;
+
+ void verifyInlineAsm(const MachineInstr *MI);
+
+ void checkLiveness(const MachineOperand *MO, unsigned MONum);
+ void checkLivenessAtUse(const MachineOperand *MO, unsigned MONum,
+ SlotIndex UseIdx, const LiveRange &LR,
+ Register VRegOrUnit,
+ LaneBitmask LaneMask = LaneBitmask::getNone());
+ void checkLivenessAtDef(const MachineOperand *MO, unsigned MONum,
+ SlotIndex DefIdx, const LiveRange &LR,
+ Register VRegOrUnit, bool SubRangeCheck = false,
+ LaneBitmask LaneMask = LaneBitmask::getNone());
+
+ void markReachable(const MachineBasicBlock *MBB);
+ void calcRegsPassed();
+ void checkPHIOps(const MachineBasicBlock &MBB);
+
+ void calcRegsRequired();
+ void verifyLiveVariables();
+ void verifyLiveIntervals();
+ void verifyLiveInterval(const LiveInterval&);
+ void verifyLiveRangeValue(const LiveRange &, const VNInfo *, Register,
+ LaneBitmask);
+ void verifyLiveRangeSegment(const LiveRange &,
+ const LiveRange::const_iterator I, Register,
+ LaneBitmask);
+ void verifyLiveRange(const LiveRange &, Register,
+ LaneBitmask LaneMask = LaneBitmask::getNone());
+
+ void verifyStackFrame();
+
+ void verifySlotIndexes() const;
+ void verifyProperties(const MachineFunction &MF);
+ };
+
+ struct MachineVerifierPass : public MachineFunctionPass {
+ static char ID; // Pass ID, replacement for typeid
+
+ const std::string Banner;
+
+ MachineVerifierPass(std::string banner = std::string())
+ : MachineFunctionPass(ID), Banner(std::move(banner)) {
+ initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addUsedIfAvailable<LiveStacks>();
+ AU.addUsedIfAvailable<LiveVariables>();
+ AU.addUsedIfAvailable<SlotIndexes>();
+ AU.addUsedIfAvailable<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // Skip functions that have known verification problems.
+ // FIXME: Remove this mechanism when all problematic passes have been
+ // fixed.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailsVerification))
+ return false;
+
+ unsigned FoundErrors = MachineVerifier(this, Banner.c_str()).verify(MF);
+ if (FoundErrors)
+ report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors.");
+ return false;
+ }
+ };
+
+} // end anonymous namespace
+
+char MachineVerifierPass::ID = 0;
+
+INITIALIZE_PASS(MachineVerifierPass, "machineverifier",
+ "Verify generated machine code", false, false)
+
+FunctionPass *llvm::createMachineVerifierPass(const std::string &Banner) {
+ return new MachineVerifierPass(Banner);
+}
+
+void llvm::verifyMachineFunction(MachineFunctionAnalysisManager *,
+ const std::string &Banner,
+ const MachineFunction &MF) {
+ // TODO: Use MFAM after porting below analyses.
+ // LiveVariables *LiveVars;
+ // LiveIntervals *LiveInts;
+ // LiveStacks *LiveStks;
+ // SlotIndexes *Indexes;
+ unsigned FoundErrors = MachineVerifier(nullptr, Banner.c_str()).verify(MF);
+ if (FoundErrors)
+ report_fatal_error("Found " + Twine(FoundErrors) + " machine code errors.");
+}
+
+bool MachineFunction::verify(Pass *p, const char *Banner, bool AbortOnErrors)
+ const {
+ MachineFunction &MF = const_cast<MachineFunction&>(*this);
+ unsigned FoundErrors = MachineVerifier(p, Banner).verify(MF);
+ if (AbortOnErrors && FoundErrors)
+ report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors.");
+ return FoundErrors == 0;
+}
+
+void MachineVerifier::verifySlotIndexes() const {
+ if (Indexes == nullptr)
+ return;
+
+ // Ensure the IdxMBB list is sorted by slot indexes.
+ SlotIndex Last;
+ for (SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin(),
+ E = Indexes->MBBIndexEnd(); I != E; ++I) {
+ assert(!Last.isValid() || I->first > Last);
+ Last = I->first;
+ }
+}
+
+void MachineVerifier::verifyProperties(const MachineFunction &MF) {
+ // If a pass has introduced virtual registers without clearing the
+ // NoVRegs property (or set it without allocating the vregs)
+ // then report an error.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoVRegs) &&
+ MRI->getNumVirtRegs())
+ report("Function has NoVRegs property but there are VReg operands", &MF);
+}
+
+unsigned MachineVerifier::verify(const MachineFunction &MF) {
+ foundErrors = 0;
+
+ this->MF = &MF;
+ TM = &MF.getTarget();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ RBI = MF.getSubtarget().getRegBankInfo();
+ MRI = &MF.getRegInfo();
+
+ const bool isFunctionFailedISel = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel);
+
+ // If we're mid-GlobalISel and we already triggered the fallback path then
+ // it's expected that the MIR is somewhat broken but that's ok since we'll
+ // reset it and clear the FailedISel attribute in ResetMachineFunctions.
+ if (isFunctionFailedISel)
+ return foundErrors;
+
+ isFunctionRegBankSelected = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::RegBankSelected);
+ isFunctionSelected = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected);
+ isFunctionTracksDebugUserValues = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::TracksDebugUserValues);
+
+ LiveVars = nullptr;
+ LiveInts = nullptr;
+ LiveStks = nullptr;
+ Indexes = nullptr;
+ if (PASS) {
+ LiveInts = PASS->getAnalysisIfAvailable<LiveIntervals>();
+ // We don't want to verify LiveVariables if LiveIntervals is available.
+ if (!LiveInts)
+ LiveVars = PASS->getAnalysisIfAvailable<LiveVariables>();
+ LiveStks = PASS->getAnalysisIfAvailable<LiveStacks>();
+ Indexes = PASS->getAnalysisIfAvailable<SlotIndexes>();
+ }
+
+ verifySlotIndexes();
+
+ verifyProperties(MF);
+
+ visitMachineFunctionBefore();
+ for (const MachineBasicBlock &MBB : MF) {
+ visitMachineBasicBlockBefore(&MBB);
+ // Keep track of the current bundle header.
+ const MachineInstr *CurBundle = nullptr;
+ // Do we expect the next instruction to be part of the same bundle?
+ bool InBundle = false;
+
+ for (const MachineInstr &MI : MBB.instrs()) {
+ if (MI.getParent() != &MBB) {
+ report("Bad instruction parent pointer", &MBB);
+ errs() << "Instruction: " << MI;
+ continue;
+ }
+
+ // Check for consistent bundle flags.
+ if (InBundle && !MI.isBundledWithPred())
+ report("Missing BundledPred flag, "
+ "BundledSucc was set on predecessor",
+ &MI);
+ if (!InBundle && MI.isBundledWithPred())
+ report("BundledPred flag is set, "
+ "but BundledSucc not set on predecessor",
+ &MI);
+
+ // Is this a bundle header?
+ if (!MI.isInsideBundle()) {
+ if (CurBundle)
+ visitMachineBundleAfter(CurBundle);
+ CurBundle = &MI;
+ visitMachineBundleBefore(CurBundle);
+ } else if (!CurBundle)
+ report("No bundle header", &MI);
+ visitMachineInstrBefore(&MI);
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = MI.getOperand(I);
+ if (Op.getParent() != &MI) {
+ // Make sure to use correct addOperand / removeOperand / ChangeTo
+ // functions when replacing operands of a MachineInstr.
+ report("Instruction has operand with wrong parent set", &MI);
+ }
+
+ visitMachineOperand(&Op, I);
+ }
+
+ // Was this the last bundled instruction?
+ InBundle = MI.isBundledWithSucc();
+ }
+ if (CurBundle)
+ visitMachineBundleAfter(CurBundle);
+ if (InBundle)
+ report("BundledSucc flag set on last instruction in block", &MBB.back());
+ visitMachineBasicBlockAfter(&MBB);
+ }
+ visitMachineFunctionAfter();
+
+ // Clean up.
+ regsLive.clear();
+ regsDefined.clear();
+ regsDead.clear();
+ regsKilled.clear();
+ regMasks.clear();
+ MBBInfoMap.clear();
+
+ return foundErrors;
+}
+
+void MachineVerifier::report(const char *msg, const MachineFunction *MF) {
+ assert(MF);
+ errs() << '\n';
+ if (!foundErrors++) {
+ if (Banner)
+ errs() << "# " << Banner << '\n';
+ if (LiveInts != nullptr)
+ LiveInts->print(errs());
+ else
+ MF->print(errs(), Indexes);
+ }
+ errs() << "*** Bad machine code: " << msg << " ***\n"
+ << "- function: " << MF->getName() << "\n";
+}
+
+void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
+ assert(MBB);
+ report(msg, MBB->getParent());
+ errs() << "- basic block: " << printMBBReference(*MBB) << ' '
+ << MBB->getName() << " (" << (const void *)MBB << ')';
+ if (Indexes)
+ errs() << " [" << Indexes->getMBBStartIdx(MBB)
+ << ';' << Indexes->getMBBEndIdx(MBB) << ')';
+ errs() << '\n';
+}
+
+void MachineVerifier::report(const char *msg, const MachineInstr *MI) {
+ assert(MI);
+ report(msg, MI->getParent());
+ errs() << "- instruction: ";
+ if (Indexes && Indexes->hasIndex(*MI))
+ errs() << Indexes->getInstructionIndex(*MI) << '\t';
+ MI->print(errs(), /*IsStandalone=*/true);
+}
+
+void MachineVerifier::report(const char *msg, const MachineOperand *MO,
+ unsigned MONum, LLT MOVRegType) {
+ assert(MO);
+ report(msg, MO->getParent());
+ errs() << "- operand " << MONum << ": ";
+ MO->print(errs(), MOVRegType, TRI);
+ errs() << "\n";
+}
+
+void MachineVerifier::report(const Twine &Msg, const MachineInstr *MI) {
+ report(Msg.str().c_str(), MI);
+}
+
+void MachineVerifier::report_context(SlotIndex Pos) const {
+ errs() << "- at: " << Pos << '\n';
+}
+
+void MachineVerifier::report_context(const LiveInterval &LI) const {
+ errs() << "- interval: " << LI << '\n';
+}
+
+void MachineVerifier::report_context(const LiveRange &LR, Register VRegUnit,
+ LaneBitmask LaneMask) const {
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegUnit);
+ if (LaneMask.any())
+ report_context_lanemask(LaneMask);
+}
+
+void MachineVerifier::report_context(const LiveRange::Segment &S) const {
+ errs() << "- segment: " << S << '\n';
+}
+
+void MachineVerifier::report_context(const VNInfo &VNI) const {
+ errs() << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n";
+}
+
+void MachineVerifier::report_context_liverange(const LiveRange &LR) const {
+ errs() << "- liverange: " << LR << '\n';
+}
+
+void MachineVerifier::report_context(MCPhysReg PReg) const {
+ errs() << "- p. register: " << printReg(PReg, TRI) << '\n';
+}
+
+void MachineVerifier::report_context_vreg(Register VReg) const {
+ errs() << "- v. register: " << printReg(VReg, TRI) << '\n';
+}
+
+void MachineVerifier::report_context_vreg_regunit(Register VRegOrUnit) const {
+ if (VRegOrUnit.isVirtual()) {
+ report_context_vreg(VRegOrUnit);
+ } else {
+ errs() << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n';
+ }
+}
+
+void MachineVerifier::report_context_lanemask(LaneBitmask LaneMask) const {
+ errs() << "- lanemask: " << PrintLaneMask(LaneMask) << '\n';
+}
+
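+// Recursively mark \p MBB and all blocks reachable from it as reachable.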
+void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
+ BBInfo &MInfo = MBBInfoMap[MBB];
+ if (!MInfo.reachable) {
+ MInfo.reachable = true;
+ for (const MachineBasicBlock *Succ : MBB->successors())
+ markReachable(Succ);
+ }
+}
+
+void MachineVerifier::visitMachineFunctionBefore() {
+ lastIndex = SlotIndex();
+ regsReserved = MRI->reservedRegsFrozen() ? MRI->getReservedRegs()
+ : TRI->getReservedRegs(*MF);
+
+ if (!MF->empty())
+ markReachable(&MF->front());
+
+ // Build a set of the basic blocks in the function.
+ FunctionBlocks.clear();
+ for (const auto &MBB : *MF) {
+ FunctionBlocks.insert(&MBB);
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+
+ MInfo.Preds.insert(MBB.pred_begin(), MBB.pred_end());
+ if (MInfo.Preds.size() != MBB.pred_size())
+ report("MBB has duplicate entries in its predecessor list.", &MBB);
+
+ MInfo.Succs.insert(MBB.succ_begin(), MBB.succ_end());
+ if (MInfo.Succs.size() != MBB.succ_size())
+ report("MBB has duplicate entries in its successor list.", &MBB);
+ }
+
+ // Check that the register use lists are sane.
+ MRI->verifyUseLists();
+
+ if (!MF->empty())
+ verifyStackFrame();
+}
+
+void
+MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
+ FirstTerminator = nullptr;
+ FirstNonPHI = nullptr;
+
+ if (!MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoPHIs) && MRI->tracksLiveness()) {
+ // If this block has allocatable physical registers live-in, check that
+ // it is an entry block or landing pad.
+ for (const auto &LI : MBB->liveins()) {
+ if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() &&
+ MBB->getIterator() != MBB->getParent()->begin() &&
+ !MBB->isInlineAsmBrIndirectTarget()) {
+ report("MBB has allocatable live-in, but isn't entry, landing-pad, or "
+ "inlineasm-br-indirect-target.",
+ MBB);
+ report_context(LI.PhysReg);
+ }
+ }
+ }
+
+ if (MBB->isIRBlockAddressTaken()) {
+ if (!MBB->getAddressTakenIRBlock()->hasAddressTaken())
+ report("ir-block-address-taken is associated with basic block not used by "
+ "a blockaddress.",
+ MBB);
+ }
+
+ // Count the number of landing pad successors.
+ SmallPtrSet<const MachineBasicBlock*, 4> LandingPadSuccs;
+ for (const auto *succ : MBB->successors()) {
+ if (succ->isEHPad())
+ LandingPadSuccs.insert(succ);
+ if (!FunctionBlocks.count(succ))
+ report("MBB has successor that isn't part of the function.", MBB);
+ if (!MBBInfoMap[succ].Preds.count(MBB)) {
+ report("Inconsistent CFG", MBB);
+ errs() << "MBB is not in the predecessor list of the successor "
+ << printMBBReference(*succ) << ".\n";
+ }
+ }
+
+ // Check the predecessor list.
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (!FunctionBlocks.count(Pred))
+ report("MBB has predecessor that isn't part of the function.", MBB);
+ if (!MBBInfoMap[Pred].Succs.count(MBB)) {
+ report("Inconsistent CFG", MBB);
+ errs() << "MBB is not in the successor list of the predecessor "
+ << printMBBReference(*Pred) << ".\n";
+ }
+ }
+
+ const MCAsmInfo *AsmInfo = TM->getMCAsmInfo();
+ const BasicBlock *BB = MBB->getBasicBlock();
+ const Function &F = MF->getFunction();
+ if (LandingPadSuccs.size() > 1 &&
+ !(AsmInfo &&
+ AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj &&
+ BB && isa<SwitchInst>(BB->getTerminator())) &&
+ !isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ report("MBB has more than one landing pad successor", MBB);
+
+  // Call analyzeBranch. If it succeeds, there are several more conditions to
+  // check.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->analyzeBranch(*const_cast<MachineBasicBlock *>(MBB), TBB, FBB,
+ Cond)) {
+ // Ok, analyzeBranch thinks it knows what's going on with this block. Let's
+ // check whether its answers match up with reality.
+ if (!TBB && !FBB) {
+ // Block falls through to its successor.
+ if (!MBB->empty() && MBB->back().isBarrier() &&
+ !TII->isPredicated(MBB->back())) {
+ report("MBB exits via unconditional fall-through but ends with a "
+ "barrier instruction!", MBB);
+ }
+ if (!Cond.empty()) {
+ report("MBB exits via unconditional fall-through but has a condition!",
+ MBB);
+ }
+ } else if (TBB && !FBB && Cond.empty()) {
+ // Block unconditionally branches somewhere.
+ if (MBB->empty()) {
+ report("MBB exits via unconditional branch but doesn't contain "
+ "any instructions!", MBB);
+ } else if (!MBB->back().isBarrier()) {
+ report("MBB exits via unconditional branch but doesn't end with a "
+ "barrier instruction!", MBB);
+ } else if (!MBB->back().isTerminator()) {
+ report("MBB exits via unconditional branch but the branch isn't a "
+ "terminator instruction!", MBB);
+ }
+ } else if (TBB && !FBB && !Cond.empty()) {
+ // Block conditionally branches somewhere, otherwise falls through.
+ if (MBB->empty()) {
+ report("MBB exits via conditional branch/fall-through but doesn't "
+ "contain any instructions!", MBB);
+ } else if (MBB->back().isBarrier()) {
+ report("MBB exits via conditional branch/fall-through but ends with a "
+ "barrier instruction!", MBB);
+ } else if (!MBB->back().isTerminator()) {
+ report("MBB exits via conditional branch/fall-through but the branch "
+ "isn't a terminator instruction!", MBB);
+ }
+ } else if (TBB && FBB) {
+ // Block conditionally branches somewhere, otherwise branches
+ // somewhere else.
+ if (MBB->empty()) {
+ report("MBB exits via conditional branch/branch but doesn't "
+ "contain any instructions!", MBB);
+ } else if (!MBB->back().isBarrier()) {
+ report("MBB exits via conditional branch/branch but doesn't end with a "
+ "barrier instruction!", MBB);
+ } else if (!MBB->back().isTerminator()) {
+ report("MBB exits via conditional branch/branch but the branch "
+ "isn't a terminator instruction!", MBB);
+ }
+ if (Cond.empty()) {
+ report("MBB exits via conditional branch/branch but there's no "
+ "condition!", MBB);
+ }
+ } else {
+ report("analyzeBranch returned invalid data!", MBB);
+ }
+
+ // Now check that the successors match up with the answers reported by
+ // analyzeBranch.
+ if (TBB && !MBB->isSuccessor(TBB))
+ report("MBB exits via jump or conditional branch, but its target isn't a "
+ "CFG successor!",
+ MBB);
+ if (FBB && !MBB->isSuccessor(FBB))
+ report("MBB exits via conditional branch, but its target isn't a CFG "
+ "successor!",
+ MBB);
+
+ // There might be a fallthrough to the next block if there's either no
+ // unconditional true branch, or if there's a condition, and one of the
+ // branches is missing.
+ bool Fallthrough = !TBB || (!Cond.empty() && !FBB);
+
+ // A conditional fallthrough must be an actual CFG successor, not
+ // unreachable. (Conversely, an unconditional fallthrough might not really
+ // be a successor, because the block might end in unreachable.)
+ if (!Cond.empty() && !FBB) {
+ MachineFunction::const_iterator MBBI = std::next(MBB->getIterator());
+ if (MBBI == MF->end()) {
+ report("MBB conditionally falls through out of function!", MBB);
+ } else if (!MBB->isSuccessor(&*MBBI))
+ report("MBB exits via conditional branch/fall-through but the CFG "
+ "successors don't match the actual successors!",
+ MBB);
+ }
+
+ // Verify that there aren't any extra un-accounted-for successors.
+ for (const MachineBasicBlock *SuccMBB : MBB->successors()) {
+ // If this successor is one of the branch targets, it's okay.
+ if (SuccMBB == TBB || SuccMBB == FBB)
+ continue;
+ // If we might have a fallthrough, and the successor is the fallthrough
+ // block, that's also ok.
+ if (Fallthrough && SuccMBB == MBB->getNextNode())
+ continue;
+ // Also accept successors which are for exception-handling or might be
+ // inlineasm_br targets.
+ if (SuccMBB->isEHPad() || SuccMBB->isInlineAsmBrIndirectTarget())
+ continue;
+ report("MBB has unexpected successors which are not branch targets, "
+ "fallthrough, EHPads, or inlineasm_br targets.",
+ MBB);
+ }
+ }
+
+ regsLive.clear();
+ if (MRI->tracksLiveness()) {
+ for (const auto &LI : MBB->liveins()) {
+ if (!Register::isPhysicalRegister(LI.PhysReg)) {
+ report("MBB live-in list contains non-physical register", MBB);
+ continue;
+ }
+ for (const MCPhysReg &SubReg : TRI->subregs_inclusive(LI.PhysReg))
+ regsLive.insert(SubReg);
+ }
+ }
+
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ BitVector PR = MFI.getPristineRegs(*MF);
+ for (unsigned I : PR.set_bits()) {
+ for (const MCPhysReg &SubReg : TRI->subregs_inclusive(I))
+ regsLive.insert(SubReg);
+ }
+
+ regsKilled.clear();
+ regsDefined.clear();
+
+ if (Indexes)
+ lastIndex = Indexes->getMBBStartIdx(MBB);
+}
+
+// This function gets called for all bundle headers, including normal
+// stand-alone unbundled instructions.
+void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
+ if (Indexes && Indexes->hasIndex(*MI)) {
+ SlotIndex idx = Indexes->getInstructionIndex(*MI);
+ if (!(idx > lastIndex)) {
+ report("Instruction index out of order", MI);
+ errs() << "Last instruction was at " << lastIndex << '\n';
+ }
+ lastIndex = idx;
+ }
+
+ // Ensure non-terminators don't follow terminators.
+ if (MI->isTerminator()) {
+ if (!FirstTerminator)
+ FirstTerminator = MI;
+ } else if (FirstTerminator) {
+ // For GlobalISel, G_INVOKE_REGION_START is a terminator that we allow to
+ // precede non-terminators.
+ if (FirstTerminator->getOpcode() != TargetOpcode::G_INVOKE_REGION_START) {
+ report("Non-terminator instruction after the first terminator", MI);
+ errs() << "First terminator was:\t" << *FirstTerminator;
+ }
+ }
+}
+
+// The operands on an INLINEASM instruction must follow a template.
+// Verify that the flag operands make sense.
+void MachineVerifier::verifyInlineAsm(const MachineInstr *MI) {
+ // The first two operands on INLINEASM are the asm string and global flags.
+ if (MI->getNumOperands() < 2) {
+ report("Too few operands on inline asm", MI);
+ return;
+ }
+ if (!MI->getOperand(0).isSymbol())
+ report("Asm string must be an external symbol", MI);
+ if (!MI->getOperand(1).isImm())
+ report("Asm flags must be an immediate", MI);
+  // Allowed flags are Extra_HasSideEffects = 1, Extra_IsAlignStack = 2,
+  // Extra_AsmDialect = 4, Extra_MayLoad = 8, Extra_MayStore = 16,
+  // and Extra_IsConvergent = 32.
+ if (!isUInt<6>(MI->getOperand(1).getImm()))
+ report("Unknown asm flags", &MI->getOperand(1), 1);
+
+ static_assert(InlineAsm::MIOp_FirstOperand == 2, "Asm format changed");
+
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+ unsigned NumOps;
+ for (unsigned e = MI->getNumOperands(); OpNo < e; OpNo += NumOps) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ // There may be implicit ops after the fixed operands.
+ if (!MO.isImm())
+ break;
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(MO.getImm());
+ }
+
+ if (OpNo > MI->getNumOperands())
+ report("Missing operands in last group", MI);
+
+ // An optional MDNode follows the groups.
+ if (OpNo < MI->getNumOperands() && MI->getOperand(OpNo).isMetadata())
+ ++OpNo;
+
+ // All trailing operands must be implicit registers.
+ for (unsigned e = MI->getNumOperands(); OpNo < e; ++OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (!MO.isReg() || !MO.isImplicit())
+ report("Expected implicit register after groups", &MO, OpNo);
+ }
+
+ if (MI->getOpcode() == TargetOpcode::INLINEASM_BR) {
+ const MachineBasicBlock *MBB = MI->getParent();
+
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
+ i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ if (!MO.isMBB())
+ continue;
+
+      // Check that the successor and predecessor lists look OK; assume they
+      // do not. Find the indirect target without going through the successors.
+ const MachineBasicBlock *IndirectTargetMBB = MO.getMBB();
+ if (!IndirectTargetMBB) {
+ report("INLINEASM_BR indirect target does not exist", &MO, i);
+ break;
+ }
+
+ if (!MBB->isSuccessor(IndirectTargetMBB))
+ report("INLINEASM_BR indirect target missing from successor list", &MO,
+ i);
+
+ if (!IndirectTargetMBB->isPredecessor(MBB))
+ report("INLINEASM_BR indirect target predecessor list missing parent",
+ &MO, i);
+ }
+ }
+}
+
+bool MachineVerifier::verifyAllRegOpsScalar(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ if (none_of(MI.explicit_operands(), [&MRI](const MachineOperand &Op) {
+ if (!Op.isReg())
+ return false;
+ const auto Reg = Op.getReg();
+ if (Reg.isPhysical())
+ return false;
+ return !MRI.getType(Reg).isScalar();
+ }))
+ return true;
+ report("All register operands must have scalar types", &MI);
+ return false;
+}
+
+/// Check that types are consistent when two operands need to have the same
+/// number of vector elements.
+/// \return true if the types are valid.
+bool MachineVerifier::verifyVectorElementMatch(LLT Ty0, LLT Ty1,
+ const MachineInstr *MI) {
+ if (Ty0.isVector() != Ty1.isVector()) {
+ report("operand types must be all-vector or all-scalar", MI);
+ // Generally we try to report as many issues as possible at once, but in
+    // this case it's not clear what we should be comparing the size of the
+    // scalar with: the size of the whole vector or its lane. Instead of
+    // making an arbitrary choice and emitting a not-so-helpful message, let's
+ // avoid the extra noise and stop here.
+ return false;
+ }
+
+ if (Ty0.isVector() && Ty0.getNumElements() != Ty1.getNumElements()) {
+ report("operand types must preserve number of vector elements", MI);
+ return false;
+ }
+
+ return true;
+}
+
+void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
+ if (isFunctionSelected)
+ report("Unexpected generic instruction in a Selected function", MI);
+
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned NumOps = MI->getNumOperands();
+
+ // Branches must reference a basic block if they are not indirect
+ if (MI->isBranch() && !MI->isIndirectBranch()) {
+ bool HasMBB = false;
+ for (const MachineOperand &Op : MI->operands()) {
+ if (Op.isMBB()) {
+ HasMBB = true;
+ break;
+ }
+ }
+
+ if (!HasMBB) {
+ report("Branch instruction is missing a basic block operand or "
+ "isIndirectBranch property",
+ MI);
+ }
+ }
+
+ // Check types.
+ SmallVector<LLT, 4> Types;
+ for (unsigned I = 0, E = std::min(MCID.getNumOperands(), NumOps);
+ I != E; ++I) {
+ if (!MCID.operands()[I].isGenericType())
+ continue;
+ // Generic instructions specify type equality constraints between some of
+ // their operands. Make sure these are consistent.
+ size_t TypeIdx = MCID.operands()[I].getGenericTypeIndex();
+ Types.resize(std::max(TypeIdx + 1, Types.size()));
+
+ const MachineOperand *MO = &MI->getOperand(I);
+ if (!MO->isReg()) {
+ report("generic instruction must use register operands", MI);
+ continue;
+ }
+
+ LLT OpTy = MRI->getType(MO->getReg());
+ // Don't report a type mismatch if there is no actual mismatch, only a
+    // missing type, to reduce noise:
+ if (OpTy.isValid()) {
+ // Only the first valid type for a type index will be printed: don't
+ // overwrite it later so it's always clear which type was expected:
+ if (!Types[TypeIdx].isValid())
+ Types[TypeIdx] = OpTy;
+ else if (Types[TypeIdx] != OpTy)
+ report("Type mismatch in generic instruction", MO, I, OpTy);
+ } else {
+ // Generic instructions must have types attached to their operands.
+ report("Generic instruction is missing a virtual register type", MO, I);
+ }
+ }
+
+ // Generic opcodes must not have physical register operands.
+ for (unsigned I = 0; I < MI->getNumOperands(); ++I) {
+ const MachineOperand *MO = &MI->getOperand(I);
+ if (MO->isReg() && MO->getReg().isPhysical())
+ report("Generic instruction cannot have physical register", MO, I);
+ }
+
+  // Avoid out-of-bounds accesses in the checks below. This was already
+  // reported earlier.
+ if (MI->getNumOperands() < MCID.getNumOperands())
+ return;
+
+ StringRef ErrorInfo;
+ if (!TII->verifyInstruction(*MI, ErrorInfo))
+ report(ErrorInfo.data(), MI);
+
+ // Verify properties of various specific instruction types
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case TargetOpcode::G_ASSERT_SEXT:
+ case TargetOpcode::G_ASSERT_ZEXT: {
+ std::string OpcName =
+ Opc == TargetOpcode::G_ASSERT_ZEXT ? "G_ASSERT_ZEXT" : "G_ASSERT_SEXT";
+ if (!MI->getOperand(2).isImm()) {
+ report(Twine(OpcName, " expects an immediate operand #2"), MI);
+ break;
+ }
+
+ Register Dst = MI->getOperand(0).getReg();
+ Register Src = MI->getOperand(1).getReg();
+ LLT SrcTy = MRI->getType(Src);
+ int64_t Imm = MI->getOperand(2).getImm();
+ if (Imm <= 0) {
+ report(Twine(OpcName, " size must be >= 1"), MI);
+ break;
+ }
+
+ if (Imm >= SrcTy.getScalarSizeInBits()) {
+ report(Twine(OpcName, " size must be less than source bit width"), MI);
+ break;
+ }
+
+ const RegisterBank *SrcRB = RBI->getRegBank(Src, *MRI, *TRI);
+ const RegisterBank *DstRB = RBI->getRegBank(Dst, *MRI, *TRI);
+
+ // Allow only the source bank to be set.
+ if ((SrcRB && DstRB && SrcRB != DstRB) || (DstRB && !SrcRB)) {
+ report(Twine(OpcName, " cannot change register bank"), MI);
+ break;
+ }
+
+ // Don't allow a class change. Do allow member class->regbank.
+ const TargetRegisterClass *DstRC = MRI->getRegClassOrNull(Dst);
+ if (DstRC && DstRC != MRI->getRegClassOrNull(Src)) {
+ report(
+ Twine(OpcName, " source and destination register classes must match"),
+ MI);
+ break;
+ }
+
+ break;
+ }
+
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ if (DstTy.isVector())
+ report("Instruction cannot use a vector result type", MI);
+
+ if (MI->getOpcode() == TargetOpcode::G_CONSTANT) {
+ if (!MI->getOperand(1).isCImm()) {
+ report("G_CONSTANT operand must be cimm", MI);
+ break;
+ }
+
+ const ConstantInt *CI = MI->getOperand(1).getCImm();
+ if (CI->getBitWidth() != DstTy.getSizeInBits())
+ report("inconsistent constant size", MI);
+ } else {
+ if (!MI->getOperand(1).isFPImm()) {
+ report("G_FCONSTANT operand must be fpimm", MI);
+ break;
+ }
+ const ConstantFP *CF = MI->getOperand(1).getFPImm();
+
+ if (APFloat::getSizeInBits(CF->getValueAPF().getSemantics()) !=
+ DstTy.getSizeInBits()) {
+ report("inconsistent constant size", MI);
+ }
+ }
+
+ break;
+ }
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_ZEXTLOAD:
+ case TargetOpcode::G_SEXTLOAD: {
+ LLT ValTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT PtrTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!PtrTy.isPointer())
+ report("Generic memory instruction must access a pointer", MI);
+
+ // Generic loads and stores must have a single MachineMemOperand
+ // describing that access.
+ if (!MI->hasOneMemOperand()) {
+ report("Generic instruction accessing memory must have one mem operand",
+ MI);
+ } else {
+ const MachineMemOperand &MMO = **MI->memoperands_begin();
+ if (MI->getOpcode() == TargetOpcode::G_ZEXTLOAD ||
+ MI->getOpcode() == TargetOpcode::G_SEXTLOAD) {
+ if (MMO.getSizeInBits() >= ValTy.getSizeInBits())
+ report("Generic extload must have a narrower memory type", MI);
+ } else if (MI->getOpcode() == TargetOpcode::G_LOAD) {
+ if (MMO.getSize() > ValTy.getSizeInBytes())
+ report("load memory size cannot exceed result size", MI);
+ } else if (MI->getOpcode() == TargetOpcode::G_STORE) {
+ if (ValTy.getSizeInBytes() < MMO.getSize())
+ report("store memory size cannot exceed value size", MI);
+ }
+
+ const AtomicOrdering Order = MMO.getSuccessOrdering();
+ if (Opc == TargetOpcode::G_STORE) {
+ if (Order == AtomicOrdering::Acquire ||
+ Order == AtomicOrdering::AcquireRelease)
+ report("atomic store cannot use acquire ordering", MI);
+
+ } else {
+ if (Order == AtomicOrdering::Release ||
+ Order == AtomicOrdering::AcquireRelease)
+ report("atomic load cannot use release ordering", MI);
+ }
+ }
+
+ break;
+ }
+ case TargetOpcode::G_PHI: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ if (!DstTy.isValid() || !all_of(drop_begin(MI->operands()),
+ [this, &DstTy](const MachineOperand &MO) {
+ if (!MO.isReg())
+ return true;
+ LLT Ty = MRI->getType(MO.getReg());
+ if (!Ty.isValid() || (Ty != DstTy))
+ return false;
+ return true;
+ }))
+ report("Generic Instruction G_PHI has operands with incompatible/missing "
+ "types",
+ MI);
+ break;
+ }
+ case TargetOpcode::G_BITCAST: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!DstTy.isValid() || !SrcTy.isValid())
+ break;
+
+ if (SrcTy.isPointer() != DstTy.isPointer())
+ report("bitcast cannot convert between pointers and other types", MI);
+
+ if (SrcTy.getSizeInBits() != DstTy.getSizeInBits())
+ report("bitcast sizes must match", MI);
+
+ if (SrcTy == DstTy)
+ report("bitcast must change the type", MI);
+
+ break;
+ }
+ case TargetOpcode::G_INTTOPTR:
+ case TargetOpcode::G_PTRTOINT:
+ case TargetOpcode::G_ADDRSPACE_CAST: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!DstTy.isValid() || !SrcTy.isValid())
+ break;
+
+ verifyVectorElementMatch(DstTy, SrcTy, MI);
+
+ DstTy = DstTy.getScalarType();
+ SrcTy = SrcTy.getScalarType();
+
+ if (MI->getOpcode() == TargetOpcode::G_INTTOPTR) {
+ if (!DstTy.isPointer())
+ report("inttoptr result type must be a pointer", MI);
+ if (SrcTy.isPointer())
+ report("inttoptr source type must not be a pointer", MI);
+ } else if (MI->getOpcode() == TargetOpcode::G_PTRTOINT) {
+ if (!SrcTy.isPointer())
+ report("ptrtoint source type must be a pointer", MI);
+ if (DstTy.isPointer())
+ report("ptrtoint result type must not be a pointer", MI);
+ } else {
+ assert(MI->getOpcode() == TargetOpcode::G_ADDRSPACE_CAST);
+ if (!SrcTy.isPointer() || !DstTy.isPointer())
+ report("addrspacecast types must be pointers", MI);
+ else {
+ if (SrcTy.getAddressSpace() == DstTy.getAddressSpace())
+ report("addrspacecast must convert different address spaces", MI);
+ }
+ }
+
+ break;
+ }
+ case TargetOpcode::G_PTR_ADD: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT PtrTy = MRI->getType(MI->getOperand(1).getReg());
+ LLT OffsetTy = MRI->getType(MI->getOperand(2).getReg());
+ if (!DstTy.isValid() || !PtrTy.isValid() || !OffsetTy.isValid())
+ break;
+
+ if (!PtrTy.getScalarType().isPointer())
+ report("gep first operand must be a pointer", MI);
+
+ if (OffsetTy.getScalarType().isPointer())
+ report("gep offset operand must not be a pointer", MI);
+
+ // TODO: Is the offset allowed to be a scalar with a vector?
+ break;
+ }
+ case TargetOpcode::G_PTRMASK: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ LLT MaskTy = MRI->getType(MI->getOperand(2).getReg());
+ if (!DstTy.isValid() || !SrcTy.isValid() || !MaskTy.isValid())
+ break;
+
+ if (!DstTy.getScalarType().isPointer())
+ report("ptrmask result type must be a pointer", MI);
+
+ if (!MaskTy.getScalarType().isScalar())
+ report("ptrmask mask type must be an integer", MI);
+
+ verifyVectorElementMatch(DstTy, MaskTy, MI);
+ break;
+ }
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FPTRUNC: {
+ // The number of operands and the presence of types have already been
+ // checked (and reported in case of any issues), so there is no need to
+ // report them again. However, since we try to report as many issues as
+ // possible at once, the instruction isn't guaranteed to have the right
+ // number of operands or types attached to it at this point.
+ assert(MCID.getNumOperands() == 2 && "Expected 2 operands G_*{EXT,TRUNC}");
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!DstTy.isValid() || !SrcTy.isValid())
+ break;
+
+ LLT DstElTy = DstTy.getScalarType();
+ LLT SrcElTy = SrcTy.getScalarType();
+ if (DstElTy.isPointer() || SrcElTy.isPointer())
+ report("Generic extend/truncate can not operate on pointers", MI);
+
+ verifyVectorElementMatch(DstTy, SrcTy, MI);
+
+ unsigned DstSize = DstElTy.getSizeInBits();
+ unsigned SrcSize = SrcElTy.getSizeInBits();
+ switch (MI->getOpcode()) {
+ default:
+ if (DstSize <= SrcSize)
+ report("Generic extend has destination type no larger than source", MI);
+ break;
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_FPTRUNC:
+ if (DstSize >= SrcSize)
+ report("Generic truncate has destination type no smaller than source",
+ MI);
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::G_SELECT: {
+ LLT SelTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT CondTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!SelTy.isValid() || !CondTy.isValid())
+ break;
+
+ // Scalar condition select on a vector is valid.
+ if (CondTy.isVector())
+ verifyVectorElementMatch(SelTy, CondTy, MI);
+ break;
+ }
+ case TargetOpcode::G_MERGE_VALUES: {
+ // G_MERGE_VALUES should only be used to merge scalars into a larger scalar,
+ // e.g. s2N = MERGE sN, sN
+ // Merging multiple scalars into a vector is not allowed; use
+ // G_BUILD_VECTOR for that.
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ if (DstTy.isVector() || SrcTy.isVector())
+ report("G_MERGE_VALUES cannot operate on vectors", MI);
+
+ const unsigned NumOps = MI->getNumOperands();
+ if (DstTy.getSizeInBits() != SrcTy.getSizeInBits() * (NumOps - 1))
+ report("G_MERGE_VALUES result size is inconsistent", MI);
+
+ for (unsigned I = 2; I != NumOps; ++I) {
+ if (MRI->getType(MI->getOperand(I).getReg()) != SrcTy)
+ report("G_MERGE_VALUES source types do not match", MI);
+ }
+
+ break;
+ }
+ case TargetOpcode::G_UNMERGE_VALUES: {
+ unsigned NumDsts = MI->getNumOperands() - 1;
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ for (unsigned i = 1; i < NumDsts; ++i) {
+ if (MRI->getType(MI->getOperand(i).getReg()) != DstTy) {
+ report("G_UNMERGE_VALUES destination types do not match", MI);
+ break;
+ }
+ }
+
+ LLT SrcTy = MRI->getType(MI->getOperand(NumDsts).getReg());
+ if (DstTy.isVector()) {
+ // This case is the converse of G_CONCAT_VECTORS.
+ if (!SrcTy.isVector() || SrcTy.getScalarType() != DstTy.getScalarType() ||
+ SrcTy.getNumElements() != NumDsts * DstTy.getNumElements())
+ report("G_UNMERGE_VALUES source operand does not match vector "
+ "destination operands",
+ MI);
+ } else if (SrcTy.isVector()) {
+ // This case is the converse of G_BUILD_VECTOR, but relaxed to allow
+ // mismatched types as long as the total size matches:
+ // %0:_(s64), %1:_(s64) = G_UNMERGE_VALUES %2:_(<4 x s32>)
+ if (SrcTy.getSizeInBits() != NumDsts * DstTy.getSizeInBits())
+ report("G_UNMERGE_VALUES vector source operand does not match scalar "
+ "destination operands",
+ MI);
+ } else {
+ // This case is the converse of G_MERGE_VALUES.
+ if (SrcTy.getSizeInBits() != NumDsts * DstTy.getSizeInBits()) {
+ report("G_UNMERGE_VALUES scalar source operand does not match scalar "
+ "destination operands",
+ MI);
+ }
+ }
+ break;
+ }
+ case TargetOpcode::G_BUILD_VECTOR: {
+ // Source types must be scalars, dest type a vector. Total size of scalars
+ // must match the dest vector size.
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcEltTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!DstTy.isVector() || SrcEltTy.isVector()) {
+ report("G_BUILD_VECTOR must produce a vector from scalar operands", MI);
+ break;
+ }
+
+ if (DstTy.getElementType() != SrcEltTy)
+ report("G_BUILD_VECTOR result element type must match source type", MI);
+
+ if (DstTy.getNumElements() != MI->getNumOperands() - 1)
+ report("G_BUILD_VECTOR must have an operand for each elemement", MI);
+
+ for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2))
+ if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg()))
+ report("G_BUILD_VECTOR source operand types are not homogeneous", MI);
+
+ break;
+ }
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC: {
+ // Source types must be scalars, dest type a vector. Scalar types must be
+ // larger than the dest vector elt type, as this is a truncating operation.
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcEltTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!DstTy.isVector() || SrcEltTy.isVector())
+ report("G_BUILD_VECTOR_TRUNC must produce a vector from scalar operands",
+ MI);
+ for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2))
+ if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg()))
+ report("G_BUILD_VECTOR_TRUNC source operand types are not homogeneous",
+ MI);
+ if (SrcEltTy.getSizeInBits() <= DstTy.getElementType().getSizeInBits())
+ report("G_BUILD_VECTOR_TRUNC source operand types are not larger than "
+ "dest elt type",
+ MI);
+ break;
+ }
+ case TargetOpcode::G_CONCAT_VECTORS: {
+ // Source types should be vectors, and total size should match the dest
+ // vector size.
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ report("G_CONCAT_VECTOR requires vector source and destination operands",
+ MI);
+
+ if (MI->getNumOperands() < 3)
+ report("G_CONCAT_VECTOR requires at least 2 source operands", MI);
+
+ for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2))
+ if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg()))
+ report("G_CONCAT_VECTOR source operand types are not homogeneous", MI);
+ if (DstTy.getNumElements() !=
+ SrcTy.getNumElements() * (MI->getNumOperands() - 1))
+ report("G_CONCAT_VECTOR num dest and source elements should match", MI);
+ break;
+ }
+ case TargetOpcode::G_ICMP:
+ case TargetOpcode::G_FCMP: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(2).getReg());
+
+ if ((DstTy.isVector() != SrcTy.isVector()) ||
+ (DstTy.isVector() && DstTy.getNumElements() != SrcTy.getNumElements()))
+ report("Generic vector icmp/fcmp must preserve number of lanes", MI);
+
+ break;
+ }
+ case TargetOpcode::G_EXTRACT: {
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ if (!SrcOp.isReg()) {
+ report("extract source must be a register", MI);
+ break;
+ }
+
+ const MachineOperand &OffsetOp = MI->getOperand(2);
+ if (!OffsetOp.isImm()) {
+ report("extract offset must be a constant", MI);
+ break;
+ }
+
+ unsigned DstSize = MRI->getType(MI->getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI->getType(SrcOp.getReg()).getSizeInBits();
+ if (SrcSize == DstSize)
+ report("extract source must be larger than result", MI);
+
+ if (DstSize + OffsetOp.getImm() > SrcSize)
+ report("extract reads past end of register", MI);
+ break;
+ }
+ case TargetOpcode::G_INSERT: {
+ const MachineOperand &SrcOp = MI->getOperand(2);
+ if (!SrcOp.isReg()) {
+ report("insert source must be a register", MI);
+ break;
+ }
+
+ const MachineOperand &OffsetOp = MI->getOperand(3);
+ if (!OffsetOp.isImm()) {
+ report("insert offset must be a constant", MI);
+ break;
+ }
+
+ unsigned DstSize = MRI->getType(MI->getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI->getType(SrcOp.getReg()).getSizeInBits();
+
+ if (DstSize <= SrcSize)
+ report("inserted size must be smaller than total register", MI);
+
+ if (SrcSize + OffsetOp.getImm() > DstSize)
+ report("insert writes past end of register", MI);
+
+ break;
+ }
+ case TargetOpcode::G_JUMP_TABLE: {
+ if (!MI->getOperand(1).isJTI())
+ report("G_JUMP_TABLE source operand must be a jump table index", MI);
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ if (!DstTy.isPointer())
+ report("G_JUMP_TABLE dest operand must have a pointer type", MI);
+ break;
+ }
+ case TargetOpcode::G_BRJT: {
+ if (!MRI->getType(MI->getOperand(0).getReg()).isPointer())
+ report("G_BRJT src operand 0 must be a pointer type", MI);
+
+ if (!MI->getOperand(1).isJTI())
+ report("G_BRJT src operand 1 must be a jump table index", MI);
+
+ const auto &IdxOp = MI->getOperand(2);
+ if (!IdxOp.isReg() || MRI->getType(IdxOp.getReg()).isPointer())
+ report("G_BRJT src operand 2 must be a scalar reg type", MI);
+ break;
+ }
+ case TargetOpcode::G_INTRINSIC:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: {
+ // TODO: Should verify number of def and use operands, but the current
+ // interface requires passing in IR types for mangling.
+ const MachineOperand &IntrIDOp = MI->getOperand(MI->getNumExplicitDefs());
+ if (!IntrIDOp.isIntrinsicID()) {
+ report("G_INTRINSIC first src operand must be an intrinsic ID", MI);
+ break;
+ }
+
+ bool NoSideEffects = MI->getOpcode() == TargetOpcode::G_INTRINSIC;
+ unsigned IntrID = IntrIDOp.getIntrinsicID();
+ if (IntrID != 0 && IntrID < Intrinsic::num_intrinsics) {
+ AttributeList Attrs = Intrinsic::getAttributes(
+ MF->getFunction().getContext(), static_cast<Intrinsic::ID>(IntrID));
+ bool DeclHasSideEffects = !Attrs.getMemoryEffects().doesNotAccessMemory();
+ if (NoSideEffects && DeclHasSideEffects) {
+ report("G_INTRINSIC used with intrinsic that accesses memory", MI);
+ break;
+ }
+ if (!NoSideEffects && !DeclHasSideEffects) {
+ report("G_INTRINSIC_W_SIDE_EFFECTS used with readnone intrinsic", MI);
+ break;
+ }
+ }
+
+ break;
+ }
+ case TargetOpcode::G_SEXT_INREG: {
+ if (!MI->getOperand(2).isImm()) {
+ report("G_SEXT_INREG expects an immediate operand #2", MI);
+ break;
+ }
+
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ int64_t Imm = MI->getOperand(2).getImm();
+ if (Imm <= 0)
+ report("G_SEXT_INREG size must be >= 1", MI);
+ if (Imm >= SrcTy.getScalarSizeInBits())
+ report("G_SEXT_INREG size must be less than source bit width", MI);
+ break;
+ }
+ case TargetOpcode::G_SHUFFLE_VECTOR: {
+ const MachineOperand &MaskOp = MI->getOperand(3);
+ if (!MaskOp.isShuffleMask()) {
+ report("Incorrect mask operand type for G_SHUFFLE_VECTOR", MI);
+ break;
+ }
+
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT Src0Ty = MRI->getType(MI->getOperand(1).getReg());
+ LLT Src1Ty = MRI->getType(MI->getOperand(2).getReg());
+
+ if (Src0Ty != Src1Ty)
+ report("Source operands must be the same type", MI);
+
+ if (Src0Ty.getScalarType() != DstTy.getScalarType())
+ report("G_SHUFFLE_VECTOR cannot change element type", MI);
+
+ // Don't check that all operands are vectors because scalars are used in
+ // place of 1-element vectors.
+ int SrcNumElts = Src0Ty.isVector() ? Src0Ty.getNumElements() : 1;
+ int DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1;
+
+ ArrayRef<int> MaskIdxes = MaskOp.getShuffleMask();
+
+ if (static_cast<int>(MaskIdxes.size()) != DstNumElts)
+ report("Wrong result type for shufflemask", MI);
+
+ for (int Idx : MaskIdxes) {
+ if (Idx < 0)
+ continue;
+
+ if (Idx >= 2 * SrcNumElts)
+ report("Out of bounds shuffle index", MI);
+ }
+
+ break;
+ }
+ case TargetOpcode::G_DYN_STACKALLOC: {
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &AllocOp = MI->getOperand(1);
+ const MachineOperand &AlignOp = MI->getOperand(2);
+
+ if (!DstOp.isReg() || !MRI->getType(DstOp.getReg()).isPointer()) {
+ report("dst operand 0 must be a pointer type", MI);
+ break;
+ }
+
+ if (!AllocOp.isReg() || !MRI->getType(AllocOp.getReg()).isScalar()) {
+ report("src operand 1 must be a scalar reg type", MI);
+ break;
+ }
+
+ if (!AlignOp.isImm()) {
+ report("src operand 2 must be an immediate type", MI);
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::G_MEMCPY_INLINE:
+ case TargetOpcode::G_MEMCPY:
+ case TargetOpcode::G_MEMMOVE: {
+ ArrayRef<MachineMemOperand *> MMOs = MI->memoperands();
+ if (MMOs.size() != 2) {
+ report("memcpy/memmove must have 2 memory operands", MI);
+ break;
+ }
+
+ if ((!MMOs[0]->isStore() || MMOs[0]->isLoad()) ||
+ (MMOs[1]->isStore() || !MMOs[1]->isLoad())) {
+ report("wrong memory operand types", MI);
+ break;
+ }
+
+ if (MMOs[0]->getSize() != MMOs[1]->getSize())
+ report("inconsistent memory operand sizes", MI);
+
+ LLT DstPtrTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcPtrTy = MRI->getType(MI->getOperand(1).getReg());
+
+ if (!DstPtrTy.isPointer() || !SrcPtrTy.isPointer()) {
+ report("memory instruction operand must be a pointer", MI);
+ break;
+ }
+
+ if (DstPtrTy.getAddressSpace() != MMOs[0]->getAddrSpace())
+ report("inconsistent store address space", MI);
+ if (SrcPtrTy.getAddressSpace() != MMOs[1]->getAddrSpace())
+ report("inconsistent load address space", MI);
+
+ if (Opc != TargetOpcode::G_MEMCPY_INLINE)
+ if (!MI->getOperand(3).isImm() || (MI->getOperand(3).getImm() & ~1LL))
+ report("'tail' flag (operand 3) must be an immediate 0 or 1", MI);
+
+ break;
+ }
+ case TargetOpcode::G_BZERO:
+ case TargetOpcode::G_MEMSET: {
+ ArrayRef<MachineMemOperand *> MMOs = MI->memoperands();
+ std::string Name = Opc == TargetOpcode::G_MEMSET ? "memset" : "bzero";
+ if (MMOs.size() != 1) {
+ report(Twine(Name, " must have 1 memory operand"), MI);
+ break;
+ }
+
+ if ((!MMOs[0]->isStore() || MMOs[0]->isLoad())) {
+ report(Twine(Name, " memory operand must be a store"), MI);
+ break;
+ }
+
+ LLT DstPtrTy = MRI->getType(MI->getOperand(0).getReg());
+ if (!DstPtrTy.isPointer()) {
+ report(Twine(Name, " operand must be a pointer"), MI);
+ break;
+ }
+
+ if (DstPtrTy.getAddressSpace() != MMOs[0]->getAddrSpace())
+ report("inconsistent " + Twine(Name, " address space"), MI);
+
+ if (!MI->getOperand(MI->getNumOperands() - 1).isImm() ||
+ (MI->getOperand(MI->getNumOperands() - 1).getImm() & ~1LL))
+ report("'tail' flag (last operand) must be an immediate 0 or 1", MI);
+
+ break;
+ }
+ case TargetOpcode::G_VECREDUCE_SEQ_FADD:
+ case TargetOpcode::G_VECREDUCE_SEQ_FMUL: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT Src1Ty = MRI->getType(MI->getOperand(1).getReg());
+ LLT Src2Ty = MRI->getType(MI->getOperand(2).getReg());
+ if (!DstTy.isScalar())
+ report("Vector reduction requires a scalar destination type", MI);
+ if (!Src1Ty.isScalar())
+ report("Sequential FADD/FMUL vector reduction requires a scalar 1st operand", MI);
+ if (!Src2Ty.isVector())
+ report("Sequential FADD/FMUL vector reduction must have a vector 2nd operand", MI);
+ break;
+ }
+ case TargetOpcode::G_VECREDUCE_FADD:
+ case TargetOpcode::G_VECREDUCE_FMUL:
+ case TargetOpcode::G_VECREDUCE_FMAX:
+ case TargetOpcode::G_VECREDUCE_FMIN:
+ case TargetOpcode::G_VECREDUCE_ADD:
+ case TargetOpcode::G_VECREDUCE_MUL:
+ case TargetOpcode::G_VECREDUCE_AND:
+ case TargetOpcode::G_VECREDUCE_OR:
+ case TargetOpcode::G_VECREDUCE_XOR:
+ case TargetOpcode::G_VECREDUCE_SMAX:
+ case TargetOpcode::G_VECREDUCE_SMIN:
+ case TargetOpcode::G_VECREDUCE_UMAX:
+ case TargetOpcode::G_VECREDUCE_UMIN: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ if (!DstTy.isScalar())
+ report("Vector reduction requires a scalar destination type", MI);
+ break;
+ }
+
+ case TargetOpcode::G_SBFX:
+ case TargetOpcode::G_UBFX: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ if (DstTy.isVector()) {
+ report("Bitfield extraction is not supported on vectors", MI);
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_ROTR:
+ case TargetOpcode::G_ROTL: {
+ LLT Src1Ty = MRI->getType(MI->getOperand(1).getReg());
+ LLT Src2Ty = MRI->getType(MI->getOperand(2).getReg());
+ if (Src1Ty.isVector() != Src2Ty.isVector()) {
+ report("Shifts and rotates require operands to be either all scalars or "
+ "all vectors",
+ MI);
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::G_LLROUND:
+ case TargetOpcode::G_LROUND: {
+ verifyAllRegOpsScalar(*MI, *MRI);
+ break;
+ }
+ case TargetOpcode::G_IS_FPCLASS: {
+ LLT DestTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT DestEltTy = DestTy.getScalarType();
+ if (!DestEltTy.isScalar()) {
+ report("Destination must be a scalar or vector of scalars", MI);
+ break;
+ }
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ LLT SrcEltTy = SrcTy.getScalarType();
+ if (!SrcEltTy.isScalar()) {
+ report("Source must be a scalar or vector of scalars", MI);
+ break;
+ }
+ if (!verifyVectorElementMatch(DestTy, SrcTy, MI))
+ break;
+ const MachineOperand &TestMO = MI->getOperand(2);
+ if (!TestMO.isImm()) {
+ report("floating-point class set (operand 2) must be an immediate", MI);
+ break;
+ }
+ int64_t Test = TestMO.getImm();
+ if (Test < 0 || Test > fcAllFlags) {
+ report("Incorrect floating-point class set (operand 2)", MI);
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::G_ASSERT_ALIGN: {
+ if (MI->getOperand(2).getImm() < 1)
+ report("alignment immediate must be >= 1", MI);
+ break;
+ }
+ case TargetOpcode::G_CONSTANT_POOL: {
+ if (!MI->getOperand(1).isCPI())
+ report("Src operand 1 must be a constant pool index", MI);
+ if (!MRI->getType(MI->getOperand(0).getReg()).isPointer())
+ report("Dst operand 0 must be a pointer", MI);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MI->getNumOperands() < MCID.getNumOperands()) {
+ report("Too few operands", MI);
+ errs() << MCID.getNumOperands() << " operands expected, but "
+ << MI->getNumOperands() << " given.\n";
+ }
+
+ if (MI->isPHI()) {
+ if (MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoPHIs))
+ report("Found PHI instruction with NoPHIs property set", MI);
+
+ if (FirstNonPHI)
+ report("Found PHI instruction after non-PHI", MI);
+ } else if (FirstNonPHI == nullptr)
+ FirstNonPHI = MI;
+
+ // Check the tied operands.
+ if (MI->isInlineAsm())
+ verifyInlineAsm(MI);
+
+ // Check that unspillable terminators define a reg and have at most one use.
+ if (TII->isUnspillableTerminator(MI)) {
+ if (!MI->getOperand(0).isReg() || !MI->getOperand(0).isDef())
+ report("Unspillable Terminator does not define a reg", MI);
+ Register Def = MI->getOperand(0).getReg();
+ if (Def.isVirtual() &&
+ !MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoPHIs) &&
+ std::distance(MRI->use_nodbg_begin(Def), MRI->use_nodbg_end()) > 1)
+ report("Unspillable Terminator expected to have at most one use!", MI);
+ }
+
+ // A fully-formed DBG_VALUE must have a location. Ignore partially formed
+ // DBG_VALUEs: these are convenient to use in tests, but should never get
+ // generated.
+ if (MI->isDebugValue() && MI->getNumOperands() == 4)
+ if (!MI->getDebugLoc())
+ report("Missing DebugLoc for debug instruction", MI);
+
+ // Meta instructions should never be the subject of debug value tracking;
+ // they don't create a value in the output program at all.
+ if (MI->isMetaInstruction() && MI->peekDebugInstrNum())
+ report("Metadata instruction should not have a value tracking number", MI);
+
+ // Check the MachineMemOperands for basic consistency.
+ for (MachineMemOperand *Op : MI->memoperands()) {
+ if (Op->isLoad() && !MI->mayLoad())
+ report("Missing mayLoad flag", MI);
+ if (Op->isStore() && !MI->mayStore())
+ report("Missing mayStore flag", MI);
+ }
+
+ // Debug values must not have a slot index.
+ // Other instructions must have one, unless they are inside a bundle.
+ if (LiveInts) {
+ bool mapped = !LiveInts->isNotInMIMap(*MI);
+ if (MI->isDebugOrPseudoInstr()) {
+ if (mapped)
+ report("Debug instruction has a slot index", MI);
+ } else if (MI->isInsideBundle()) {
+ if (mapped)
+ report("Instruction inside bundle has a slot index", MI);
+ } else {
+ if (!mapped)
+ report("Missing slot index", MI);
+ }
+ }
+
+ unsigned Opc = MCID.getOpcode();
+ if (isPreISelGenericOpcode(Opc) || isPreISelGenericOptimizationHint(Opc)) {
+ verifyPreISelGenericInstruction(MI);
+ return;
+ }
+
+ StringRef ErrorInfo;
+ if (!TII->verifyInstruction(*MI, ErrorInfo))
+ report(ErrorInfo.data(), MI);
+
+ // Verify properties of various specific instruction types
+ switch (MI->getOpcode()) {
+ case TargetOpcode::COPY: {
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ const Register SrcReg = SrcOp.getReg();
+ const Register DstReg = DstOp.getReg();
+
+ LLT DstTy = MRI->getType(DstReg);
+ LLT SrcTy = MRI->getType(SrcReg);
+ if (SrcTy.isValid() && DstTy.isValid()) {
+ // If both types are valid, check that the types are the same.
+ if (SrcTy != DstTy) {
+ report("Copy Instruction is illegal with mismatching types", MI);
+ errs() << "Def = " << DstTy << ", Src = " << SrcTy << "\n";
+ }
+
+ break;
+ }
+
+ if (!SrcTy.isValid() && !DstTy.isValid())
+ break;
+
+ // If we have only one valid type, this is likely a copy between a virtual
+ // and physical register.
+ unsigned SrcSize = 0;
+ unsigned DstSize = 0;
+ if (SrcReg.isPhysical() && DstTy.isValid()) {
+ const TargetRegisterClass *SrcRC =
+ TRI->getMinimalPhysRegClassLLT(SrcReg, DstTy);
+ if (SrcRC)
+ SrcSize = TRI->getRegSizeInBits(*SrcRC);
+ }
+
+ if (SrcSize == 0)
+ SrcSize = TRI->getRegSizeInBits(SrcReg, *MRI);
+
+ if (DstReg.isPhysical() && SrcTy.isValid()) {
+ const TargetRegisterClass *DstRC =
+ TRI->getMinimalPhysRegClassLLT(DstReg, SrcTy);
+ if (DstRC)
+ DstSize = TRI->getRegSizeInBits(*DstRC);
+ }
+
+ if (DstSize == 0)
+ DstSize = TRI->getRegSizeInBits(DstReg, *MRI);
+
+ if (SrcSize != 0 && DstSize != 0 && SrcSize != DstSize) {
+ if (!DstOp.getSubReg() && !SrcOp.getSubReg()) {
+ report("Copy Instruction is illegal with mismatching sizes", MI);
+ errs() << "Def Size = " << DstSize << ", Src Size = " << SrcSize
+ << "\n";
+ }
+ }
+ break;
+ }
+ case TargetOpcode::STATEPOINT: {
+ StatepointOpers SO(MI);
+ if (!MI->getOperand(SO.getIDPos()).isImm() ||
+ !MI->getOperand(SO.getNBytesPos()).isImm() ||
+ !MI->getOperand(SO.getNCallArgsPos()).isImm()) {
+ report("meta operands to STATEPOINT not constant!", MI);
+ break;
+ }
+
+ auto VerifyStackMapConstant = [&](unsigned Offset) {
+ if (Offset >= MI->getNumOperands()) {
+ report("stack map constant to STATEPOINT is out of range!", MI);
+ return;
+ }
+ if (!MI->getOperand(Offset - 1).isImm() ||
+ MI->getOperand(Offset - 1).getImm() != StackMaps::ConstantOp ||
+ !MI->getOperand(Offset).isImm())
+ report("stack map constant to STATEPOINT not well formed!", MI);
+ };
+ VerifyStackMapConstant(SO.getCCIdx());
+ VerifyStackMapConstant(SO.getFlagsIdx());
+ VerifyStackMapConstant(SO.getNumDeoptArgsIdx());
+ VerifyStackMapConstant(SO.getNumGCPtrIdx());
+ VerifyStackMapConstant(SO.getNumAllocaIdx());
+ VerifyStackMapConstant(SO.getNumGcMapEntriesIdx());
+
+ // Verify that all explicit statepoint defs are tied to gc operands, as
+ // they are expected to be relocations of gc operands.
+ unsigned FirstGCPtrIdx = SO.getFirstGCPtrIdx();
+ unsigned LastGCPtrIdx = SO.getNumAllocaIdx() - 2;
+ for (unsigned Idx = 0; Idx < MI->getNumDefs(); Idx++) {
+ unsigned UseOpIdx;
+ if (!MI->isRegTiedToUseOperand(Idx, &UseOpIdx)) {
+ report("STATEPOINT defs expected to be tied", MI);
+ break;
+ }
+ if (UseOpIdx < FirstGCPtrIdx || UseOpIdx > LastGCPtrIdx) {
+ report("STATEPOINT def tied to non-gc operand", MI);
+ break;
+ }
+ }
+
+ // TODO: verify we have properly encoded deopt arguments
+ } break;
+ case TargetOpcode::INSERT_SUBREG: {
+ unsigned InsertedSize;
+ if (unsigned SubIdx = MI->getOperand(2).getSubReg())
+ InsertedSize = TRI->getSubRegIdxSize(SubIdx);
+ else
+ InsertedSize = TRI->getRegSizeInBits(MI->getOperand(2).getReg(), *MRI);
+ unsigned SubRegSize = TRI->getSubRegIdxSize(MI->getOperand(3).getImm());
+ if (SubRegSize < InsertedSize) {
+ report("INSERT_SUBREG expected inserted value to have equal or lesser "
+ "size than the subreg it was inserted into", MI);
+ break;
+ }
+ } break;
+ case TargetOpcode::REG_SEQUENCE: {
+ unsigned NumOps = MI->getNumOperands();
+ if (!(NumOps & 1)) {
+ report("Invalid number of operands for REG_SEQUENCE", MI);
+ break;
+ }
+
+ for (unsigned I = 1; I != NumOps; I += 2) {
+ const MachineOperand &RegOp = MI->getOperand(I);
+ const MachineOperand &SubRegOp = MI->getOperand(I + 1);
+
+ if (!RegOp.isReg())
+ report("Invalid register operand for REG_SEQUENCE", &RegOp, I);
+
+ if (!SubRegOp.isImm() || SubRegOp.getImm() == 0 ||
+ SubRegOp.getImm() >= TRI->getNumSubRegIndices()) {
+ report("Invalid subregister index operand for REG_SEQUENCE",
+ &SubRegOp, I + 1);
+ }
+ }
+
+ Register DstReg = MI->getOperand(0).getReg();
+ if (DstReg.isPhysical())
+ report("REG_SEQUENCE does not support physical register results", MI);
+
+ if (MI->getOperand(0).getSubReg())
+ report("Invalid subreg result for REG_SEQUENCE", MI);
+
+ break;
+ }
+ }
+}
+
+void
+MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
+ const MachineInstr *MI = MO->getParent();
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned NumDefs = MCID.getNumDefs();
+ if (MCID.getOpcode() == TargetOpcode::PATCHPOINT)
+ NumDefs = (MONum == 0 && MO->isReg()) ? NumDefs : 0;
+
+ // The first MCID.NumDefs operands must be explicit register defines
+ if (MONum < NumDefs) {
+ const MCOperandInfo &MCOI = MCID.operands()[MONum];
+ if (!MO->isReg())
+ report("Explicit definition must be a register", MO, MONum);
+ else if (!MO->isDef() && !MCOI.isOptionalDef())
+ report("Explicit definition marked as use", MO, MONum);
+ else if (MO->isImplicit())
+ report("Explicit definition marked as implicit", MO, MONum);
+ } else if (MONum < MCID.getNumOperands()) {
+ const MCOperandInfo &MCOI = MCID.operands()[MONum];
+ // Don't check if it's the last operand in a variadic instruction. See,
+ // e.g., LDM_RET in the ARM backend. Check non-variadic operands only.
+ bool IsOptional = MI->isVariadic() && MONum == MCID.getNumOperands() - 1;
+ if (!IsOptional) {
+ if (MO->isReg()) {
+ if (MO->isDef() && !MCOI.isOptionalDef() && !MCID.variadicOpsAreDefs())
+ report("Explicit operand marked as def", MO, MONum);
+ if (MO->isImplicit())
+ report("Explicit operand marked as implicit", MO, MONum);
+ }
+
+ // Check that an instruction has register operands only as expected.
+ if (MCOI.OperandType == MCOI::OPERAND_REGISTER &&
+ !MO->isReg() && !MO->isFI())
+ report("Expected a register operand.", MO, MONum);
+ if (MO->isReg()) {
+ if (MCOI.OperandType == MCOI::OPERAND_IMMEDIATE ||
+ (MCOI.OperandType == MCOI::OPERAND_PCREL &&
+ !TII->isPCRelRegisterOperandLegal(*MO)))
+ report("Expected a non-register operand.", MO, MONum);
+ }
+ }
+
+ int TiedTo = MCID.getOperandConstraint(MONum, MCOI::TIED_TO);
+ if (TiedTo != -1) {
+ if (!MO->isReg())
+ report("Tied use must be a register", MO, MONum);
+ else if (!MO->isTied())
+ report("Operand should be tied", MO, MONum);
+ else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum))
+ report("Tied def doesn't match MCInstrDesc", MO, MONum);
+ else if (MO->getReg().isPhysical()) {
+ const MachineOperand &MOTied = MI->getOperand(TiedTo);
+ if (!MOTied.isReg())
+ report("Tied counterpart must be a register", &MOTied, TiedTo);
+ else if (MOTied.getReg().isPhysical() &&
+ MO->getReg() != MOTied.getReg())
+ report("Tied physical registers must match.", &MOTied, TiedTo);
+ }
+ } else if (MO->isReg() && MO->isTied())
+ report("Explicit operand should not be tied", MO, MONum);
+ } else {
+ // ARM adds %reg0 operands to indicate predicates. We'll allow that.
+ if (MO->isReg() && !MO->isImplicit() && !MI->isVariadic() && MO->getReg())
+ report("Extra explicit operand on non-variadic instruction", MO, MONum);
+ }
+
+ switch (MO->getType()) {
+ case MachineOperand::MO_Register: {
+ // Verify debug flag on debug instructions. Check this first because reg0
+ // indicates an undefined debug value.
+ if (MI->isDebugInstr() && MO->isUse()) {
+ if (!MO->isDebug())
+ report("Register operand must be marked debug", MO, MONum);
+ } else if (MO->isDebug()) {
+ report("Register operand must not be marked debug", MO, MONum);
+ }
+
+ const Register Reg = MO->getReg();
+ if (!Reg)
+ return;
+ if (MRI->tracksLiveness() && !MI->isDebugInstr())
+ checkLiveness(MO, MONum);
+
+ if (MO->isDef() && MO->isUndef() && !MO->getSubReg() &&
+ MO->getReg().isVirtual()) // TODO: Apply to physregs too
+ report("Undef virtual register def operands require a subregister", MO, MONum);
+
+ // Verify the consistency of tied operands.
+ if (MO->isTied()) {
+ unsigned OtherIdx = MI->findTiedOperandIdx(MONum);
+ const MachineOperand &OtherMO = MI->getOperand(OtherIdx);
+ if (!OtherMO.isReg())
+ report("Must be tied to a register", MO, MONum);
+ if (!OtherMO.isTied())
+ report("Missing tie flags on tied operand", MO, MONum);
+ if (MI->findTiedOperandIdx(OtherIdx) != MONum)
+ report("Inconsistent tie links", MO, MONum);
+ if (MONum < MCID.getNumDefs()) {
+ if (OtherIdx < MCID.getNumOperands()) {
+ if (-1 == MCID.getOperandConstraint(OtherIdx, MCOI::TIED_TO))
+ report("Explicit def tied to explicit use without tie constraint",
+ MO, MONum);
+ } else {
+ if (!OtherMO.isImplicit())
+ report("Explicit def should be tied to implicit use", MO, MONum);
+ }
+ }
+ }
+
+ // Verify two-address constraints after the twoaddressinstruction pass.
+ // Both the twoaddressinstruction pass and the phi-node-elimination pass
+ // call MRI->leaveSSA() to mark the MF as NoSSA, but this verification
+ // should run after the twoaddressinstruction pass, not after
+ // phi-node-elimination. So instead of using NoSSA as the condition, base
+ // the two-address verification on the TiedOpsRewritten property, which is
+ // set by the twoaddressinstruction pass.
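+ // Illustrative example: once TiedOpsRewritten is set, a tied use such as
+ // "%2 = INST %2(tied-def 0), %3" is expected, whereas
+ // "%2 = INST %1(tied-def 0), %3" is reported here.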
+ unsigned DefIdx;
+ if (MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::TiedOpsRewritten) &&
+ MO->isUse() && MI->isRegTiedToDefOperand(MONum, &DefIdx) &&
+ Reg != MI->getOperand(DefIdx).getReg())
+ report("Two-address instruction operands must be identical", MO, MONum);
+
+ // Check register classes.
+ unsigned SubIdx = MO->getSubReg();
+
+ if (Reg.isPhysical()) {
+ if (SubIdx) {
+ report("Illegal subregister index for physical register", MO, MONum);
+ return;
+ }
+ if (MONum < MCID.getNumOperands()) {
+ if (const TargetRegisterClass *DRC =
+ TII->getRegClass(MCID, MONum, TRI, *MF)) {
+ if (!DRC->contains(Reg)) {
+ report("Illegal physical register for instruction", MO, MONum);
+ errs() << printReg(Reg, TRI) << " is not a "
+ << TRI->getRegClassName(DRC) << " register.\n";
+ }
+ }
+ }
+ if (MO->isRenamable()) {
+ if (MRI->isReserved(Reg)) {
+ report("isRenamable set on reserved register", MO, MONum);
+ return;
+ }
+ }
+ } else {
+ // Virtual register.
+ const TargetRegisterClass *RC = MRI->getRegClassOrNull(Reg);
+ if (!RC) {
+ // This is a generic virtual register.
+
+ // Do not allow undef uses for generic virtual registers. This ensures
+ // getVRegDef can never fail and return null on a generic register.
+ //
+ // FIXME: This restriction should probably be broadened to all SSA
+ // MIR. However, DetectDeadLanes/ProcessImplicitDefs technically still
+ // run on the SSA function just before phi elimination.
+ if (MO->isUndef())
+ report("Generic virtual register use cannot be undef", MO, MONum);
+
+ // Debug value instructions are permitted to use undefined vregs.
+ // This is a performance measure to skip the overhead of immediately
+ // pruning unused debug operands. The final undef substitution occurs
+ // when debug values are allocated in LDVImpl::handleDebugValue, so
+ // these verifications always apply after this pass.
+ if (isFunctionTracksDebugUserValues || !MO->isUse() ||
+ !MI->isDebugValue() || !MRI->def_empty(Reg)) {
+ // If we're post-Select, we can't have gvregs anymore.
+ if (isFunctionSelected) {
+ report("Generic virtual register invalid in a Selected function",
+ MO, MONum);
+ return;
+ }
+
+ // The gvreg must have a type and it must not have a SubIdx.
+ LLT Ty = MRI->getType(Reg);
+ if (!Ty.isValid()) {
+ report("Generic virtual register must have a valid type", MO,
+ MONum);
+ return;
+ }
+
+ const RegisterBank *RegBank = MRI->getRegBankOrNull(Reg);
+ const RegisterBankInfo *RBI = MF->getSubtarget().getRegBankInfo();
+
+ // If we're post-RegBankSelect, the gvreg must have a bank.
+ if (!RegBank && isFunctionRegBankSelected) {
+ report("Generic virtual register must have a bank in a "
+ "RegBankSelected function",
+ MO, MONum);
+ return;
+ }
+
+ // Make sure the register fits into its register bank if any.
+ if (RegBank && Ty.isValid() &&
+ RBI->getMaximumSize(RegBank->getID()) < Ty.getSizeInBits()) {
+ report("Register bank is too small for virtual register", MO,
+ MONum);
+ errs() << "Register bank " << RegBank->getName() << " too small("
+ << RBI->getMaximumSize(RegBank->getID()) << ") to fit "
+ << Ty.getSizeInBits() << "-bits\n";
+ return;
+ }
+ }
+
+ if (SubIdx) {
+ report("Generic virtual register does not allow subregister index", MO,
+ MONum);
+ return;
+ }
+
+ // If this is a target specific instruction and this operand
+ // has register class constraint, the virtual register must
+ // comply to it.
+ if (!isPreISelGenericOpcode(MCID.getOpcode()) &&
+ MONum < MCID.getNumOperands() &&
+ TII->getRegClass(MCID, MONum, TRI, *MF)) {
+ report("Virtual register does not match instruction constraint", MO,
+ MONum);
+ errs() << "Expect register class "
+ << TRI->getRegClassName(
+ TII->getRegClass(MCID, MONum, TRI, *MF))
+ << " but got nothing\n";
+ return;
+ }
+
+ break;
+ }
+ if (SubIdx) {
+ const TargetRegisterClass *SRC =
+ TRI->getSubClassWithSubReg(RC, SubIdx);
+ if (!SRC) {
+ report("Invalid subregister index for virtual register", MO, MONum);
+ errs() << "Register class " << TRI->getRegClassName(RC)
+ << " does not support subreg index " << SubIdx << "\n";
+ return;
+ }
+ if (RC != SRC) {
+ report("Invalid register class for subregister index", MO, MONum);
+ errs() << "Register class " << TRI->getRegClassName(RC)
+ << " does not fully support subreg index " << SubIdx << "\n";
+ return;
+ }
+ }
+ if (MONum < MCID.getNumOperands()) {
+ if (const TargetRegisterClass *DRC =
+ TII->getRegClass(MCID, MONum, TRI, *MF)) {
+ if (SubIdx) {
+ const TargetRegisterClass *SuperRC =
+ TRI->getLargestLegalSuperClass(RC, *MF);
+ if (!SuperRC) {
+ report("No largest legal super class exists.", MO, MONum);
+ return;
+ }
+ DRC = TRI->getMatchingSuperRegClass(SuperRC, DRC, SubIdx);
+ if (!DRC) {
+ report("No matching super-reg register class.", MO, MONum);
+ return;
+ }
+ }
+ if (!RC->hasSuperClassEq(DRC)) {
+ report("Illegal virtual register for instruction", MO, MONum);
+ errs() << "Expected a " << TRI->getRegClassName(DRC)
+ << " register, but got a " << TRI->getRegClassName(RC)
+ << " register\n";
+ }
+ }
+ }
+ }
+ break;
+ }
+
+ case MachineOperand::MO_RegisterMask:
+ regMasks.push_back(MO->getRegMask());
+ break;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ if (MI->isPHI() && !MO->getMBB()->isSuccessor(MI->getParent()))
+ report("PHI operand is not in the CFG", MO, MONum);
+ break;
+
+ case MachineOperand::MO_FrameIndex:
+ if (LiveStks && LiveStks->hasInterval(MO->getIndex()) &&
+ LiveInts && !LiveInts->isNotInMIMap(*MI)) {
+ int FI = MO->getIndex();
+ LiveInterval &LI = LiveStks->getInterval(FI);
+ SlotIndex Idx = LiveInts->getInstructionIndex(*MI);
+
+ bool stores = MI->mayStore();
+ bool loads = MI->mayLoad();
+ // For a memory-to-memory move, we need to check if the frame
+ // index is used for storing or loading, by inspecting the
+ // memory operands.
+ if (stores && loads) {
+ for (auto *MMO : MI->memoperands()) {
+ const PseudoSourceValue *PSV = MMO->getPseudoValue();
+ if (PSV == nullptr) continue;
+ const FixedStackPseudoSourceValue *Value =
+ dyn_cast<FixedStackPseudoSourceValue>(PSV);
+ if (Value == nullptr) continue;
+ if (Value->getFrameIndex() != FI) continue;
+
+ if (MMO->isStore())
+ loads = false;
+ else
+ stores = false;
+ break;
+ }
+ if (loads == stores)
+ report("Missing fixed stack memoperand.", MI);
+ }
+ if (loads && !LI.liveAt(Idx.getRegSlot(true))) {
+ report("Instruction loads from dead spill slot", MO, MONum);
+ errs() << "Live stack: " << LI << '\n';
+ }
+ if (stores && !LI.liveAt(Idx.getRegSlot())) {
+ report("Instruction stores to dead spill slot", MO, MONum);
+ errs() << "Live stack: " << LI << '\n';
+ }
+ }
+ break;
+
+ case MachineOperand::MO_CFIIndex:
+ if (MO->getCFIIndex() >= MF->getFrameInstructions().size())
+ report("CFI instruction has invalid index", MO, MONum);
+ break;
+
+ default:
+ break;
+ }
+}
+
+void MachineVerifier::checkLivenessAtUse(const MachineOperand *MO,
+ unsigned MONum, SlotIndex UseIdx,
+ const LiveRange &LR,
+ Register VRegOrUnit,
+ LaneBitmask LaneMask) {
+ LiveQueryResult LRQ = LR.Query(UseIdx);
+ // Check if we have a segment at the use. Note, however, that we only need
+ // one live subregister range; the others may be dead.
+ if (!LRQ.valueIn() && LaneMask.none()) {
+ report("No live segment at use", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ report_context(UseIdx);
+ }
+ if (MO->isKill() && !LRQ.isKill()) {
+ report("Live range continues after kill flag", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask.any())
+ report_context_lanemask(LaneMask);
+ report_context(UseIdx);
+ }
+}
+
+void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO,
+ unsigned MONum, SlotIndex DefIdx,
+ const LiveRange &LR,
+ Register VRegOrUnit,
+ bool SubRangeCheck,
+ LaneBitmask LaneMask) {
+ if (const VNInfo *VNI = LR.getVNInfoAt(DefIdx)) {
+ // The LR can correspond to the whole reg, in which case its def slot is
+ // not obliged to be the same as the MO's def slot. E.g. we may be checking
+ // a "normal" subreg MO here while another early-clobber subreg MO in the
+ // same instruction gives the whole reg an early-clobber def slot that
+ // differs from the currently checked MO's def slot. For example:
+ // %0 [16e,32r:0) 0@16e L..3 [16e,32r:0) 0@16e L..C [16r,32r:0) 0@16r
+ // The check that there is an early-clobber def of the same superregister
+ // somewhere is performed in visitMachineFunctionAfter().
+ if (((SubRangeCheck || MO->getSubReg() == 0) && VNI->def != DefIdx) ||
+ !SlotIndex::isSameInstr(VNI->def, DefIdx) ||
+ (VNI->def != DefIdx &&
+ (!VNI->def.isEarlyClobber() || !DefIdx.isRegister()))) {
+ report("Inconsistent valno->def", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask.any())
+ report_context_lanemask(LaneMask);
+ report_context(*VNI);
+ report_context(DefIdx);
+ }
+ } else {
+ report("No live segment at def", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask.any())
+ report_context_lanemask(LaneMask);
+ report_context(DefIdx);
+ }
+ // Check that, if the dead def flag is present, LiveInts agree.
+ if (MO->isDead()) {
+ LiveQueryResult LRQ = LR.Query(DefIdx);
+ if (!LRQ.isDeadDef()) {
+ assert(VRegOrUnit.isVirtual() && "Expecting a virtual register.");
+ // A dead subreg def only tells us that the specific subreg is dead. There
+ // could be other non-dead defs of other subregs, or we could have other
+ // parts of the register being live through the instruction. So unless we
+ // are checking liveness for a subrange it is ok for the live range to
+ // continue, given that we have a dead def of a subregister.
+ if (SubRangeCheck || MO->getSubReg() == 0) {
+ report("Live range continues after dead def flag", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask.any())
+ report_context_lanemask(LaneMask);
+ }
+ }
+ }
+}
+
+void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
+ const MachineInstr *MI = MO->getParent();
+ const Register Reg = MO->getReg();
+ const unsigned SubRegIdx = MO->getSubReg();
+
+ const LiveInterval *LI = nullptr;
+ if (LiveInts && Reg.isVirtual()) {
+ if (LiveInts->hasInterval(Reg)) {
+ LI = &LiveInts->getInterval(Reg);
+ if (SubRegIdx != 0 && (MO->isDef() || !MO->isUndef()) && !LI->empty() &&
+ !LI->hasSubRanges() && MRI->shouldTrackSubRegLiveness(Reg))
+ report("Live interval for subreg operand has no subranges", MO, MONum);
+ } else {
+ report("Virtual register has no live interval", MO, MONum);
+ }
+ }
+
+ // Both use and def operands can read a register.
+ if (MO->readsReg()) {
+ if (MO->isKill())
+ addRegWithSubRegs(regsKilled, Reg);
+
+ // Check that LiveVars knows this kill (unless we are inside a bundle, in
+ // which case we have already checked that LiveVars knows any kills on the
+ // bundle header instead).
+ if (LiveVars && Reg.isVirtual() && MO->isKill() &&
+ !MI->isBundledWithPred()) {
+ LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
+ if (!is_contained(VI.Kills, MI))
+ report("Kill missing from LiveVariables", MO, MONum);
+ }
+
+ // Check LiveInts liveness and kill.
+ if (LiveInts && !LiveInts->isNotInMIMap(*MI)) {
+ SlotIndex UseIdx = LiveInts->getInstructionIndex(*MI);
+ // Check the cached regunit intervals.
+ if (Reg.isPhysical() && !isReserved(Reg)) {
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
+ if (MRI->isReservedRegUnit(Unit))
+ continue;
+ if (const LiveRange *LR = LiveInts->getCachedRegUnit(Unit))
+ checkLivenessAtUse(MO, MONum, UseIdx, *LR, Unit);
+ }
+ }
+
+ if (Reg.isVirtual()) {
+ // This is a virtual register interval.
+ checkLivenessAtUse(MO, MONum, UseIdx, *LI, Reg);
+
+ if (LI->hasSubRanges() && !MO->isDef()) {
+ LaneBitmask MOMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ LaneBitmask LiveInMask;
+ for (const LiveInterval::SubRange &SR : LI->subranges()) {
+ if ((MOMask & SR.LaneMask).none())
+ continue;
+ checkLivenessAtUse(MO, MONum, UseIdx, SR, Reg, SR.LaneMask);
+ LiveQueryResult LRQ = SR.Query(UseIdx);
+ if (LRQ.valueIn())
+ LiveInMask |= SR.LaneMask;
+ }
+ // At least part of the register has to be live at the use.
+ if ((LiveInMask & MOMask).none()) {
+ report("No live subrange at use", MO, MONum);
+ report_context(*LI);
+ report_context(UseIdx);
+ }
+ }
+ }
+ }
+
+ // Use of a dead register.
+ if (!regsLive.count(Reg)) {
+ if (Reg.isPhysical()) {
+ // Reserved registers may be used even when 'dead'.
+ bool Bad = !isReserved(Reg);
+ // We are fine if just any subregister has a defined value.
+ if (Bad) {
+
+ for (const MCPhysReg &SubReg : TRI->subregs(Reg)) {
+ if (regsLive.count(SubReg)) {
+ Bad = false;
+ break;
+ }
+ }
+ }
+ // If there is an additional implicit-use of a super register we stop
+ // here. By definition we are fine if the super register is not
+ // (completely) dead; if the complete super register is dead we will
+ // get a report for its operand.
+ if (Bad) {
+ for (const MachineOperand &MOP : MI->uses()) {
+ if (!MOP.isReg() || !MOP.isImplicit())
+ continue;
+
+ if (!MOP.getReg().isPhysical())
+ continue;
+
+ if (llvm::is_contained(TRI->subregs(MOP.getReg()), Reg))
+ Bad = false;
+ }
+ }
+ if (Bad)
+ report("Using an undefined physical register", MO, MONum);
+ } else if (MRI->def_empty(Reg)) {
+ report("Reading virtual register without a def", MO, MONum);
+ } else {
+ BBInfo &MInfo = MBBInfoMap[MI->getParent()];
+ // We don't know which virtual registers are live in, so only complain
+ // if vreg was killed in this MBB. Otherwise keep track of vregs that
+ // must be live in. PHI instructions are handled separately.
+ if (MInfo.regsKilled.count(Reg))
+ report("Using a killed virtual register", MO, MONum);
+ else if (!MI->isPHI())
+ MInfo.vregsLiveIn.insert(std::make_pair(Reg, MI));
+ }
+ }
+ }
+
+ if (MO->isDef()) {
+ // Register defined.
+ // TODO: verify that earlyclobber ops are not used.
+ if (MO->isDead())
+ addRegWithSubRegs(regsDead, Reg);
+ else
+ addRegWithSubRegs(regsDefined, Reg);
+
+ // Verify SSA form.
+ if (MRI->isSSA() && Reg.isVirtual() &&
+ std::next(MRI->def_begin(Reg)) != MRI->def_end())
+ report("Multiple virtual register defs in SSA form", MO, MONum);
+
+ // Check LiveInts for a live segment, but only for virtual registers.
+ if (LiveInts && !LiveInts->isNotInMIMap(*MI)) {
+ SlotIndex DefIdx = LiveInts->getInstructionIndex(*MI);
+ DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber());
+
+ if (Reg.isVirtual()) {
+ checkLivenessAtDef(MO, MONum, DefIdx, *LI, Reg);
+
+ if (LI->hasSubRanges()) {
+ LaneBitmask MOMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ for (const LiveInterval::SubRange &SR : LI->subranges()) {
+ if ((SR.LaneMask & MOMask).none())
+ continue;
+ checkLivenessAtDef(MO, MONum, DefIdx, SR, Reg, true, SR.LaneMask);
+ }
+ }
+ }
+ }
+ }
+}
+
+// This function gets called after visiting all instructions in a bundle. The
+// argument points to the bundle header.
+// Normal stand-alone instructions are also considered 'bundles', and this
+// function is called for all of them.
+void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) {
+ BBInfo &MInfo = MBBInfoMap[MI->getParent()];
+ set_union(MInfo.regsKilled, regsKilled);
+ set_subtract(regsLive, regsKilled); regsKilled.clear();
+ // Kill any masked registers.
+ while (!regMasks.empty()) {
+ const uint32_t *Mask = regMasks.pop_back_val();
+ for (Register Reg : regsLive)
+ if (Reg.isPhysical() &&
+ MachineOperand::clobbersPhysReg(Mask, Reg.asMCReg()))
+ regsDead.push_back(Reg);
+ }
+ set_subtract(regsLive, regsDead); regsDead.clear();
+ set_union(regsLive, regsDefined); regsDefined.clear();
+}
+
+void
+MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB) {
+ MBBInfoMap[MBB].regsLiveOut = regsLive;
+ regsLive.clear();
+
+ if (Indexes) {
+ SlotIndex stop = Indexes->getMBBEndIdx(MBB);
+ if (!(stop > lastIndex)) {
+ report("Block ends before last instruction index", MBB);
+ errs() << "Block ends at " << stop
+ << " last instruction was at " << lastIndex << '\n';
+ }
+ lastIndex = stop;
+ }
+}
+
+namespace {
+// This implements a set of registers that serves as a filter: it can filter
+// other sets by passing through elements not in the filter and blocking those
+// that are. Any filter implicitly includes the full set of physical registers
+// upon creation, thus filtering them all out. The filter itself as a set only
+// grows, and needs to be as efficient as possible.
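+// Illustrative example: after add({%1, %5}), a later filterAndAdd({%1, %5,
+// %9}, Out) appends only %9 to Out (and adds %9 to the filter as well).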
+struct VRegFilter {
+ // Add elements to the filter itself. \pre Input set \p FromRegSet must have
+ // no duplicates. Both virtual and physical registers are fine.
+ template <typename RegSetT> void add(const RegSetT &FromRegSet) {
+ SmallVector<Register, 0> VRegsBuffer;
+ filterAndAdd(FromRegSet, VRegsBuffer);
+ }
+ // Filter \p FromRegSet through the filter and append passed elements into \p
+ // ToVRegs. All elements appended are then added to the filter itself.
+ // \returns true if anything changed.
+ template <typename RegSetT>
+ bool filterAndAdd(const RegSetT &FromRegSet,
+ SmallVectorImpl<Register> &ToVRegs) {
+ unsigned SparseUniverse = Sparse.size();
+ unsigned NewSparseUniverse = SparseUniverse;
+ unsigned NewDenseSize = Dense.size();
+ size_t Begin = ToVRegs.size();
+ for (Register Reg : FromRegSet) {
+ if (!Reg.isVirtual())
+ continue;
+ unsigned Index = Register::virtReg2Index(Reg);
+ if (Index < SparseUniverseMax) {
+ if (Index < SparseUniverse && Sparse.test(Index))
+ continue;
+ NewSparseUniverse = std::max(NewSparseUniverse, Index + 1);
+ } else {
+ if (Dense.count(Reg))
+ continue;
+ ++NewDenseSize;
+ }
+ ToVRegs.push_back(Reg);
+ }
+ size_t End = ToVRegs.size();
+ if (Begin == End)
+ return false;
+ // Reserving space in sets once performs better than doing so continuously
+ // and pays easily for double look-ups (even in Dense with SparseUniverseMax
+ // tuned all the way down) and double iteration (the second one is over a
+ // SmallVector, which is a lot cheaper compared to DenseSet or BitVector).
+ Sparse.resize(NewSparseUniverse);
+ Dense.reserve(NewDenseSize);
+ for (unsigned I = Begin; I < End; ++I) {
+ Register Reg = ToVRegs[I];
+ unsigned Index = Register::virtReg2Index(Reg);
+ if (Index < SparseUniverseMax)
+ Sparse.set(Index);
+ else
+ Dense.insert(Reg);
+ }
+ return true;
+ }
+
+private:
+ static constexpr unsigned SparseUniverseMax = 10 * 1024 * 8;
+ // VRegs indexed within SparseUniverseMax are tracked by Sparse; those beyond
+ // are tracked by Dense. The only purpose of the threshold and the Dense set
+ // is to have a reasonably growing memory usage in pathological cases (large
+ // number of very sparse VRegFilter instances live at the same time). In
+ // practice even in the worst-by-execution-time cases having all elements
+ // tracked by Sparse (very large SparseUniverseMax scenario) tends to be more
+ // space efficient than if tracked by Dense. The threshold is set to keep the
+ // worst-case memory usage within 2x of figures determined empirically for
+ // the "all Dense" scenario in such worst-by-execution-time cases.
+ BitVector Sparse;
+ DenseSet<unsigned> Dense;
+};
+
+// Implements both a transfer function and a (binary, in-place) join operator
+// for a dataflow over register sets with set union join and filtering transfer
+// (out_b = in_b \ filter_b). filter_b is expected to be set up ahead of time.
+// Maintains out_b as its state, allowing for O(n) iteration over it at any
+// time, where n is the size of the set (as opposed to O(U) where U is the
+// universe). filter_b implicitly contains all physical registers at all times.
+class FilteringVRegSet {
+ VRegFilter Filter;
+ SmallVector<Register, 0> VRegs;
+
+public:
+ // Set up the filter_b. \pre Input register set \p RS must have no duplicates.
+ // Both virtual and physical registers are fine.
+ template <typename RegSetT> void addToFilter(const RegSetT &RS) {
+ Filter.add(RS);
+ }
+ // Passes \p RS through the filter_b (transfer function) and adds what's left
+ // to itself (out_b).
+ template <typename RegSetT> bool add(const RegSetT &RS) {
+ // Double-duty the Filter: to maintain VRegs a set (and the join operation
+ // a set union) just add everything being added here to the Filter as well.
+ return Filter.filterAndAdd(RS, VRegs);
+ }
+ using const_iterator = decltype(VRegs)::const_iterator;
+ const_iterator begin() const { return VRegs.begin(); }
+ const_iterator end() const { return VRegs.end(); }
+ size_t size() const { return VRegs.size(); }
+};
+} // namespace
+
+// Calculate the largest possible vregsPassed sets. These are the registers that
+// can pass through an MBB live, but may not be live every time. It is assumed
+// that all vregsPassed sets are empty before the call.
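+// Concretely, each block's vregsPassed collects its reachable predecessors'
+// regsLiveOut and vregsPassed (as computed so far in RPO), filtered to exclude
+// the block's own regsKilled and regsLiveOut as well as physical registers.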
+void MachineVerifier::calcRegsPassed() {
+ if (MF->empty())
+ // ReversePostOrderTraversal doesn't handle empty functions.
+ return;
+
+ for (const MachineBasicBlock *MB :
+ ReversePostOrderTraversal<const MachineFunction *>(MF)) {
+ FilteringVRegSet VRegs;
+ BBInfo &Info = MBBInfoMap[MB];
+ assert(Info.reachable);
+
+ VRegs.addToFilter(Info.regsKilled);
+ VRegs.addToFilter(Info.regsLiveOut);
+ for (const MachineBasicBlock *Pred : MB->predecessors()) {
+ const BBInfo &PredInfo = MBBInfoMap[Pred];
+ if (!PredInfo.reachable)
+ continue;
+
+ VRegs.add(PredInfo.regsLiveOut);
+ VRegs.add(PredInfo.vregsPassed);
+ }
+ Info.vregsPassed.reserve(VRegs.size());
+ Info.vregsPassed.insert(VRegs.begin(), VRegs.end());
+ }
+}
+
+// Calculate the set of virtual registers that must be passed through each basic
+// block in order to satisfy the requirements of successor blocks. This is very
+// similar to calcRegsPassed, only backwards.
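+// The seed of the propagation is each block's vregsLiveIn plus the registers
+// read by PHI operands (attributed to the corresponding predecessor); every
+// requirement is then pushed to predecessors until a fixed point is reached.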
+void MachineVerifier::calcRegsRequired() {
+ // First push live-in regs to predecessors' vregsRequired.
+ SmallPtrSet<const MachineBasicBlock*, 8> todo;
+ for (const auto &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+ BBInfo &PInfo = MBBInfoMap[Pred];
+ if (PInfo.addRequired(MInfo.vregsLiveIn))
+ todo.insert(Pred);
+ }
+
+ // Handle the PHI node.
+ for (const MachineInstr &MI : MBB.phis()) {
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ // Skip those Operands which are undef regs or not regs.
+ if (!MI.getOperand(i).isReg() || !MI.getOperand(i).readsReg())
+ continue;
+
+ // Get register and predecessor for one PHI edge.
+ Register Reg = MI.getOperand(i).getReg();
+ const MachineBasicBlock *Pred = MI.getOperand(i + 1).getMBB();
+
+ BBInfo &PInfo = MBBInfoMap[Pred];
+ if (PInfo.addRequired(Reg))
+ todo.insert(Pred);
+ }
+ }
+ }
+
+ // Iteratively push vregsRequired to predecessors. This will converge to the
+ // same final state regardless of DenseSet iteration order.
+ while (!todo.empty()) {
+ const MachineBasicBlock *MBB = *todo.begin();
+ todo.erase(MBB);
+ BBInfo &MInfo = MBBInfoMap[MBB];
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (Pred == MBB)
+ continue;
+ BBInfo &SInfo = MBBInfoMap[Pred];
+ if (SInfo.addRequired(MInfo.vregsRequired))
+ todo.insert(Pred);
+ }
+ }
+}
+
+// Check PHI instructions at the beginning of MBB. It is assumed that
+// calcRegsPassed has been run so BBInfo::isLiveOut is valid.
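+// A well-formed PHI here looks like, e.g.,
+//   %3 = PHI %1, %bb.0, %2, %bb.1
+// i.e. a virtual register def followed by (value, predecessor) pairs that
+// cover every CFG predecessor of the block.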
+void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+
+ SmallPtrSet<const MachineBasicBlock*, 8> seen;
+ for (const MachineInstr &Phi : MBB) {
+ if (!Phi.isPHI())
+ break;
+ seen.clear();
+
+ const MachineOperand &MODef = Phi.getOperand(0);
+ if (!MODef.isReg() || !MODef.isDef()) {
+ report("Expected first PHI operand to be a register def", &MODef, 0);
+ continue;
+ }
+ if (MODef.isTied() || MODef.isImplicit() || MODef.isInternalRead() ||
+ MODef.isEarlyClobber() || MODef.isDebug())
+ report("Unexpected flag on PHI operand", &MODef, 0);
+ Register DefReg = MODef.getReg();
+ if (!DefReg.isVirtual())
+ report("Expected first PHI operand to be a virtual register", &MODef, 0);
+
+ for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
+ const MachineOperand &MO0 = Phi.getOperand(I);
+ if (!MO0.isReg()) {
+ report("Expected PHI operand to be a register", &MO0, I);
+ continue;
+ }
+ if (MO0.isImplicit() || MO0.isInternalRead() || MO0.isEarlyClobber() ||
+ MO0.isDebug() || MO0.isTied())
+ report("Unexpected flag on PHI operand", &MO0, I);
+
+ const MachineOperand &MO1 = Phi.getOperand(I + 1);
+ if (!MO1.isMBB()) {
+ report("Expected PHI operand to be a basic block", &MO1, I + 1);
+ continue;
+ }
+
+ const MachineBasicBlock &Pre = *MO1.getMBB();
+ if (!Pre.isSuccessor(&MBB)) {
+ report("PHI input is not a predecessor block", &MO1, I + 1);
+ continue;
+ }
+
+ if (MInfo.reachable) {
+ seen.insert(&Pre);
+ BBInfo &PrInfo = MBBInfoMap[&Pre];
+ if (!MO0.isUndef() && PrInfo.reachable &&
+ !PrInfo.isLiveOut(MO0.getReg()))
+ report("PHI operand is not live-out from predecessor", &MO0, I);
+ }
+ }
+
+ // Did we see all predecessors?
+ if (MInfo.reachable) {
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ if (!seen.count(Pred)) {
+ report("Missing PHI operand", &Phi);
+ errs() << printMBBReference(*Pred)
+ << " is a predecessor according to the CFG.\n";
+ }
+ }
+ }
+ }
+}
+
+void MachineVerifier::visitMachineFunctionAfter() {
+ calcRegsPassed();
+
+ for (const MachineBasicBlock &MBB : *MF)
+ checkPHIOps(MBB);
+
+ // Now check liveness info if available
+ calcRegsRequired();
+
+ // Check for killed virtual registers that should be live out.
+ for (const auto &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+ for (Register VReg : MInfo.vregsRequired)
+ if (MInfo.regsKilled.count(VReg)) {
+ report("Virtual register killed in block, but needed live out.", &MBB);
+ errs() << "Virtual register " << printReg(VReg)
+ << " is used after the block.\n";
+ }
+ }
+
+ if (!MF->empty()) {
+ BBInfo &MInfo = MBBInfoMap[&MF->front()];
+ for (Register VReg : MInfo.vregsRequired) {
+ report("Virtual register defs don't dominate all uses.", MF);
+ report_context_vreg(VReg);
+ }
+ }
+
+ if (LiveVars)
+ verifyLiveVariables();
+ if (LiveInts)
+ verifyLiveIntervals();
+
+ // Check live-in list of each MBB. If a register is live into MBB, check
+ // that the register is in regsLiveOut of each predecessor block. Since
+ // this must come from a definition in the predecessor or its live-in
+ // list, this will catch a live-through case where the predecessor does not
+ // have the register in its live-in list. This currently only checks
+ // registers that have no aliases, are not allocatable and are not
+ // reserved, which could mean a condition code register for instance.
+ if (MRI->tracksLiveness())
+ for (const auto &MBB : *MF)
+ for (MachineBasicBlock::RegisterMaskPair P : MBB.liveins()) {
+ MCPhysReg LiveInReg = P.PhysReg;
+ bool hasAliases = MCRegAliasIterator(LiveInReg, TRI, false).isValid();
+ if (hasAliases || isAllocatable(LiveInReg) || isReserved(LiveInReg))
+ continue;
+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+ BBInfo &PInfo = MBBInfoMap[Pred];
+ if (!PInfo.regsLiveOut.count(LiveInReg)) {
+ report("Live in register not found to be live out from predecessor.",
+ &MBB);
+ errs() << TRI->getName(LiveInReg)
+ << " not found to be live out from "
+ << printMBBReference(*Pred) << "\n";
+ }
+ }
+ }
+
+ for (auto CSInfo : MF->getCallSitesInfo())
+ if (!CSInfo.first->isCall())
+ report("Call site info referencing instruction that is not call", MF);
+
+ // If there's debug-info, check that we don't have any duplicate value
+ // tracking numbers.
+ if (MF->getFunction().getSubprogram()) {
+ DenseSet<unsigned> SeenNumbers;
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (auto Num = MI.peekDebugInstrNum()) {
+ auto Result = SeenNumbers.insert((unsigned)Num);
+ if (!Result.second)
+ report("Instruction has a duplicated value tracking number", &MI);
+ }
+ }
+ }
+ }
+}
+
+void MachineVerifier::verifyLiveVariables() {
+ assert(LiveVars && "Don't call verifyLiveVariables without LiveVars");
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
+ for (const auto &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+
+ // Our vregsRequired should be identical to LiveVariables' AliveBlocks
+ if (MInfo.vregsRequired.count(Reg)) {
+ if (!VI.AliveBlocks.test(MBB.getNumber())) {
+ report("LiveVariables: Block missing from AliveBlocks", &MBB);
+ errs() << "Virtual register " << printReg(Reg)
+ << " must be live through the block.\n";
+ }
+ } else {
+ if (VI.AliveBlocks.test(MBB.getNumber())) {
+ report("LiveVariables: Block should not be in AliveBlocks", &MBB);
+ errs() << "Virtual register " << printReg(Reg)
+ << " is not needed live through the block.\n";
+ }
+ }
+ }
+ }
+}
+
+void MachineVerifier::verifyLiveIntervals() {
+ assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts");
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+
+ // Spilling and splitting may leave unused registers around. Skip them.
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+
+ if (!LiveInts->hasInterval(Reg)) {
+ report("Missing live interval for virtual register", MF);
+ errs() << printReg(Reg, TRI) << " still has defs or uses\n";
+ continue;
+ }
+
+ const LiveInterval &LI = LiveInts->getInterval(Reg);
+ assert(Reg == LI.reg() && "Invalid reg to interval mapping");
+ verifyLiveInterval(LI);
+ }
+
+ // Verify all the cached regunit intervals.
+ for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
+ if (const LiveRange *LR = LiveInts->getCachedRegUnit(i))
+ verifyLiveRange(*LR, i);
+}
+
+void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
+ const VNInfo *VNI, Register Reg,
+ LaneBitmask LaneMask) {
+ if (VNI->isUnused())
+ return;
+
+ const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def);
+
+ if (!DefVNI) {
+ report("Value not live at VNInfo def and not marked unused", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ return;
+ }
+
+ if (DefVNI != VNI) {
+ report("Live segment at def has different VNInfo", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ return;
+ }
+
+ const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
+ if (!MBB) {
+ report("Invalid VNInfo definition index", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ return;
+ }
+
+ if (VNI->isPHIDef()) {
+ if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
+ report("PHIDef VNInfo is not defined at MBB start", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ }
+ return;
+ }
+
+ // Non-PHI def.
+ const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
+ if (!MI) {
+ report("No instruction at VNInfo def index", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ return;
+ }
+
+ if (Reg != 0) {
+ bool hasDef = false;
+ bool isEarlyClobber = false;
+ for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef())
+ continue;
+ if (Reg.isVirtual()) {
+ if (MOI->getReg() != Reg)
+ continue;
+ } else {
+ if (!MOI->getReg().isPhysical() || !TRI->hasRegUnit(MOI->getReg(), Reg))
+ continue;
+ }
+ if (LaneMask.any() &&
+ (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask).none())
+ continue;
+ hasDef = true;
+ if (MOI->isEarlyClobber())
+ isEarlyClobber = true;
+ }
+
+ if (!hasDef) {
+ report("Defining instruction does not modify register", MI);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ }
+
+ // Early clobber defs begin at USE slots, but other defs must begin at
+ // DEF slots.
+ if (isEarlyClobber) {
+ if (!VNI->def.isEarlyClobber()) {
+ report("Early clobber def must be at an early-clobber slot", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ }
+ } else if (!VNI->def.isRegister()) {
+ report("Non-PHI, non-early clobber def must be at a register slot", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ }
+ }
+}
+
+void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
+ const LiveRange::const_iterator I,
+ Register Reg,
+ LaneBitmask LaneMask) {
+ const LiveRange::Segment &S = *I;
+ const VNInfo *VNI = S.valno;
+ assert(VNI && "Live segment has no valno");
+
+ if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) {
+ report("Foreign valno in live segment", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ report_context(*VNI);
+ }
+
+ if (VNI->isUnused()) {
+ report("Live segment valno is marked unused", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+
+ const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start);
+ if (!MBB) {
+ report("Bad start of live segment, no basic block", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ return;
+ }
+ SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
+ if (S.start != MBBStartIdx && S.start != VNI->def) {
+ report("Live segment must begin at MBB entry or valno def", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+
+ const MachineBasicBlock *EndMBB =
+ LiveInts->getMBBFromIndex(S.end.getPrevSlot());
+ if (!EndMBB) {
+ report("Bad end of live segment, no basic block", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ return;
+ }
+
+ // Checks for non-live-out segments.
+ if (S.end != LiveInts->getMBBEndIdx(EndMBB)) {
+ // RegUnit intervals are allowed to have dead PHIs.
+ if (!Reg.isVirtual() && VNI->isPHIDef() && S.start == VNI->def &&
+ S.end == VNI->def.getDeadSlot())
+ return;
+
+ // The live segment is ending inside EndMBB
+ const MachineInstr *MI =
+ LiveInts->getInstructionFromIndex(S.end.getPrevSlot());
+ if (!MI) {
+ report("Live segment doesn't end at a valid instruction", EndMBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ return;
+ }
+
+ // The block slot must refer to a basic block boundary.
+ if (S.end.isBlock()) {
+ report("Live segment ends at B slot of an instruction", EndMBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+
+ if (S.end.isDead()) {
+ // Segment ends on the dead slot.
+ // That means there must be a dead def.
+ if (!SlotIndex::isSameInstr(S.start, S.end)) {
+ report("Live segment ending at dead slot spans instructions", EndMBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ }
+
+ // After tied operands are rewritten, a live segment can only end at an
+ // early-clobber slot if it is being redefined by an early-clobber def.
+ // TODO: Before tied operands are rewritten, a live segment can only end at
+ // an early-clobber slot if the last use is tied to an early-clobber def.
+ if (MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::TiedOpsRewritten) &&
+ S.end.isEarlyClobber()) {
+ if (I + 1 == LR.end() || (I + 1)->start != S.end) {
+ report("Live segment ending at early clobber slot must be "
+ "redefined by an EC def in the same instruction",
+ EndMBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ }
+
+ // The following checks only apply to virtual registers. Physreg liveness
+ // is too weird to check.
+ if (Reg.isVirtual()) {
+ // A live segment can end with either a redefinition, a kill flag on a
+ // use, or a dead flag on a def.
+ bool hasRead = false;
+ bool hasSubRegDef = false;
+ bool hasDeadDef = false;
+ for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) {
+ if (!MOI->isReg() || MOI->getReg() != Reg)
+ continue;
+ unsigned Sub = MOI->getSubReg();
+ LaneBitmask SLM =
+ Sub != 0 ? TRI->getSubRegIndexLaneMask(Sub) : LaneBitmask::getAll();
+ if (MOI->isDef()) {
+ if (Sub != 0) {
+ hasSubRegDef = true;
+ // An operand %0:sub0 reads %0:sub1..n. Invert the lane
+ // mask for subregister defs. Read-undef defs will be handled by
+ // readsReg below.
+ SLM = ~SLM;
+ }
+ if (MOI->isDead())
+ hasDeadDef = true;
+ }
+ if (LaneMask.any() && (LaneMask & SLM).none())
+ continue;
+ if (MOI->readsReg())
+ hasRead = true;
+ }
+ if (S.end.isDead()) {
+ // Make sure that the corresponding machine operand for a "dead" live
+ // range has the dead flag. We cannot perform this check for subregister
+ // liveranges as partially dead values are allowed.
+ if (LaneMask.none() && !hasDeadDef) {
+ report(
+ "Instruction ending live segment on dead slot has no dead flag",
+ MI);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ } else {
+ if (!hasRead) {
+ // When tracking subregister liveness, the main range must start new
+ // values on partial register writes, even if there is no read.
+ if (!MRI->shouldTrackSubRegLiveness(Reg) || LaneMask.any() ||
+ !hasSubRegDef) {
+ report("Instruction ending live segment doesn't read the register",
+ MI);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ }
+ }
+ }
+ }
+
+ // Now check all the basic blocks in this live segment.
+ MachineFunction::const_iterator MFI = MBB->getIterator();
+ // Is this live segment the beginning of a non-PHIDef VN?
+ if (S.start == VNI->def && !VNI->isPHIDef()) {
+ // Not live-in to any blocks.
+ if (MBB == EndMBB)
+ return;
+ // Skip this block.
+ ++MFI;
+ }
+
+ SmallVector<SlotIndex, 4> Undefs;
+ if (LaneMask.any()) {
+ LiveInterval &OwnerLI = LiveInts->getInterval(Reg);
+ OwnerLI.computeSubRangeUndefs(Undefs, LaneMask, *MRI, *Indexes);
+ }
+
+ while (true) {
+ assert(LiveInts->isLiveInToMBB(LR, &*MFI));
+ // We don't know how to track physregs into a landing pad.
+ if (!Reg.isVirtual() && MFI->isEHPad()) {
+ if (&*MFI == EndMBB)
+ break;
+ ++MFI;
+ continue;
+ }
+
+ // Is VNI a PHI-def in the current block?
+ bool IsPHI = VNI->isPHIDef() &&
+ VNI->def == LiveInts->getMBBStartIdx(&*MFI);
+
+ // Check that VNI is live-out of all predecessors.
+ for (const MachineBasicBlock *Pred : MFI->predecessors()) {
+ SlotIndex PEnd = LiveInts->getMBBEndIdx(Pred);
+ // Predecessor of landing pad live-out on last call.
+ if (MFI->isEHPad()) {
+ for (const MachineInstr &MI : llvm::reverse(*Pred)) {
+ if (MI.isCall()) {
+ PEnd = Indexes->getInstructionIndex(MI).getBoundaryIndex();
+ break;
+ }
+ }
+ }
+ const VNInfo *PVNI = LR.getVNInfoBefore(PEnd);
+
+ // All predecessors must have a live-out value. However for a phi
+ // instruction with subregister intervals
+ // only one of the subregisters (not necessarily the current one) needs to
+ // be defined.
+ if (!PVNI && (LaneMask.none() || !IsPHI)) {
+ if (LiveRangeCalc::isJointlyDominated(Pred, Undefs, *Indexes))
+ continue;
+ report("Register not marked live out of predecessor", Pred);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ errs() << " live into " << printMBBReference(*MFI) << '@'
+ << LiveInts->getMBBStartIdx(&*MFI) << ", not live before "
+ << PEnd << '\n';
+ continue;
+ }
+
+ // Only PHI-defs can take different predecessor values.
+ if (!IsPHI && PVNI != VNI) {
+ report("Different value live out of predecessor", Pred);
+ report_context(LR, Reg, LaneMask);
+ errs() << "Valno #" << PVNI->id << " live out of "
+ << printMBBReference(*Pred) << '@' << PEnd << "\nValno #"
+ << VNI->id << " live into " << printMBBReference(*MFI) << '@'
+ << LiveInts->getMBBStartIdx(&*MFI) << '\n';
+ }
+ }
+ if (&*MFI == EndMBB)
+ break;
+ ++MFI;
+ }
+}
+
+void MachineVerifier::verifyLiveRange(const LiveRange &LR, Register Reg,
+ LaneBitmask LaneMask) {
+ for (const VNInfo *VNI : LR.valnos)
+ verifyLiveRangeValue(LR, VNI, Reg, LaneMask);
+
+ for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I)
+ verifyLiveRangeSegment(LR, I, Reg, LaneMask);
+}
+
+void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
+ Register Reg = LI.reg();
+ assert(Reg.isVirtual());
+ verifyLiveRange(LI, Reg);
+
+ LaneBitmask Mask;
+ LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((Mask & SR.LaneMask).any()) {
+ report("Lane masks of sub ranges overlap in live interval", MF);
+ report_context(LI);
+ }
+ if ((SR.LaneMask & ~MaxMask).any()) {
+ report("Subrange lanemask is invalid", MF);
+ report_context(LI);
+ }
+ if (SR.empty()) {
+ report("Subrange must not be empty", MF);
+ report_context(SR, LI.reg(), SR.LaneMask);
+ }
+ Mask |= SR.LaneMask;
+ verifyLiveRange(SR, LI.reg(), SR.LaneMask);
+ if (!LI.covers(SR)) {
+ report("A Subrange is not covered by the main range", MF);
+ report_context(LI);
+ }
+ }
+
+ // Check the LI only has one connected component.
+ ConnectedVNInfoEqClasses ConEQ(*LiveInts);
+ unsigned NumComp = ConEQ.Classify(LI);
+ if (NumComp > 1) {
+ report("Multiple connected components in live interval", MF);
+ report_context(LI);
+ for (unsigned comp = 0; comp != NumComp; ++comp) {
+ errs() << comp << ": valnos";
+ for (const VNInfo *I : LI.valnos)
+ if (comp == ConEQ.getEqClass(I))
+ errs() << ' ' << I->id;
+ errs() << '\n';
+ }
+ }
+}
+
+namespace {
+
+ // FrameSetup and FrameDestroy can have zero adjustment, so with a single
+ // integer we can't tell whether a zero value came from a FrameSetup or a
+ // FrameDestroy. We therefore use a bool plus an integer to capture the
+ // stack state.
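+ // For example, a FrameSetup of 16 bytes moves the exit state to (-16, true);
+ // the matching FrameDestroy of 16 brings it back to (0, false).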
+ struct StackStateOfBB {
+ StackStateOfBB() = default;
+ StackStateOfBB(int EntryVal, int ExitVal, bool EntrySetup, bool ExitSetup) :
+ EntryValue(EntryVal), ExitValue(ExitVal), EntryIsSetup(EntrySetup),
+ ExitIsSetup(ExitSetup) {}
+
+ // Can be negative, which means we are setting up a frame.
+ int EntryValue = 0;
+ int ExitValue = 0;
+ bool EntryIsSetup = false;
+ bool ExitIsSetup = false;
+ };
+
+} // end anonymous namespace
+
+/// Make sure on every path through the CFG, a FrameSetup <n> is always followed
+/// by a FrameDestroy <n>, stack adjustments are identical on all
+/// CFG edges to a merge point, and frame is destroyed at end of a return block.
+void MachineVerifier::verifyStackFrame() {
+ unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+ if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
+ return;
+
+ SmallVector<StackStateOfBB, 8> SPState;
+ SPState.resize(MF->getNumBlockIDs());
+ df_iterator_default_set<const MachineBasicBlock*> Reachable;
+
+ // Visit the MBBs in DFS order.
+ for (df_ext_iterator<const MachineFunction *,
+ df_iterator_default_set<const MachineBasicBlock *>>
+ DFI = df_ext_begin(MF, Reachable), DFE = df_ext_end(MF, Reachable);
+ DFI != DFE; ++DFI) {
+ const MachineBasicBlock *MBB = *DFI;
+
+ StackStateOfBB BBState;
+ // Check the exit state of the DFS stack predecessor.
+ if (DFI.getPathLength() >= 2) {
+ const MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
+ assert(Reachable.count(StackPred) &&
+ "DFS stack predecessor is already visited.\n");
+ BBState.EntryValue = SPState[StackPred->getNumber()].ExitValue;
+ BBState.EntryIsSetup = SPState[StackPred->getNumber()].ExitIsSetup;
+ BBState.ExitValue = BBState.EntryValue;
+ BBState.ExitIsSetup = BBState.EntryIsSetup;
+ }
+
+ // Update stack state by checking contents of MBB.
+ for (const auto &I : *MBB) {
+ if (I.getOpcode() == FrameSetupOpcode) {
+ if (BBState.ExitIsSetup)
+ report("FrameSetup is after another FrameSetup", &I);
+ BBState.ExitValue -= TII->getFrameTotalSize(I);
+ BBState.ExitIsSetup = true;
+ }
+
+ if (I.getOpcode() == FrameDestroyOpcode) {
+ int Size = TII->getFrameTotalSize(I);
+ if (!BBState.ExitIsSetup)
+ report("FrameDestroy is not after a FrameSetup", &I);
+ int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue :
+ BBState.ExitValue;
+ if (BBState.ExitIsSetup && AbsSPAdj != Size) {
+ report("FrameDestroy <n> is after FrameSetup <m>", &I);
+ errs() << "FrameDestroy <" << Size << "> is after FrameSetup <"
+ << AbsSPAdj << ">.\n";
+ }
+ BBState.ExitValue += Size;
+ BBState.ExitIsSetup = false;
+ }
+ }
+ SPState[MBB->getNumber()] = BBState;
+
+ // Make sure the exit state of any predecessor is consistent with the entry
+ // state.
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (Reachable.count(Pred) &&
+ (SPState[Pred->getNumber()].ExitValue != BBState.EntryValue ||
+ SPState[Pred->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) {
+ report("The exit stack state of a predecessor is inconsistent.", MBB);
+ errs() << "Predecessor " << printMBBReference(*Pred)
+ << " has exit state (" << SPState[Pred->getNumber()].ExitValue
+ << ", " << SPState[Pred->getNumber()].ExitIsSetup << "), while "
+ << printMBBReference(*MBB) << " has entry state ("
+ << BBState.EntryValue << ", " << BBState.EntryIsSetup << ").\n";
+ }
+ }
+
+ // Make sure the entry state of any successor is consistent with the exit
+ // state.
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ if (Reachable.count(Succ) &&
+ (SPState[Succ->getNumber()].EntryValue != BBState.ExitValue ||
+ SPState[Succ->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) {
+ report("The entry stack state of a successor is inconsistent.", MBB);
+ errs() << "Successor " << printMBBReference(*Succ)
+ << " has entry state (" << SPState[Succ->getNumber()].EntryValue
+ << ", " << SPState[Succ->getNumber()].EntryIsSetup << "), while "
+ << printMBBReference(*MBB) << " has exit state ("
+ << BBState.ExitValue << ", " << BBState.ExitIsSetup << ").\n";
+ }
+ }
+
+ // Make sure a basic block with return ends with zero stack adjustment.
+ if (!MBB->empty() && MBB->back().isReturn()) {
+ if (BBState.ExitIsSetup)
+ report("A return block ends with a FrameSetup.", MBB);
+ if (BBState.ExitValue)
+ report("A return block ends with a nonzero stack adjustment.", MBB);
+ }
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MacroFusion.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MacroFusion.cpp
new file mode 100644
index 000000000000..fa5df68b8abc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MacroFusion.cpp
@@ -0,0 +1,213 @@
+//===- MacroFusion.cpp - Macro Fusion -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the implementation of the DAG scheduling mutation
+/// to pair instructions back to back.
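+/// A typical use is pairing a compare with the conditional branch that
+/// consumes its result on targets that can macro-fuse such pairs.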
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "machine-scheduler"
+
+STATISTIC(NumFused, "Number of instr pairs fused");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
+ cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+static bool isHazard(const SDep &Dep) {
+ return Dep.getKind() == SDep::Anti || Dep.getKind() == SDep::Output;
+}
+
+static SUnit *getPredClusterSU(const SUnit &SU) {
+ for (const SDep &SI : SU.Preds)
+ if (SI.isCluster())
+ return SI.getSUnit();
+
+ return nullptr;
+}
+
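+// Returns true if the chain of cluster predecessors ending at \p SU (including
+// \p SU itself) contains fewer than \p FuseLimit instructions.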
+bool llvm::hasLessThanNumFused(const SUnit &SU, unsigned FuseLimit) {
+ unsigned Num = 1;
+ const SUnit *CurrentSU = &SU;
+ while ((CurrentSU = getPredClusterSU(*CurrentSU)) && Num < FuseLimit) Num++;
+ return Num < FuseLimit;
+}
+
+bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU,
+ SUnit &SecondSU) {
+ // Check that neither instr is already paired with another along the edge
+ // between them.
+ for (SDep &SI : FirstSU.Succs)
+ if (SI.isCluster())
+ return false;
+
+ for (SDep &SI : SecondSU.Preds)
+ if (SI.isCluster())
+ return false;
+ // Though the reachability checks above could be made more generic,
+ // perhaps as part of ScheduleDAGInstrs::addEdge(), since such edges are valid,
+ // the extra computation cost makes it less interesting in general cases.
+
+ // Create a single weak edge between the adjacent instrs. The only effect is
+ // to cause bottom-up scheduling to heavily prioritize the clustered instrs.
+ if (!DAG.addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster)))
+ return false;
+
+ // TODO - If we want to chain more than two instructions, we need to create
+ // artificial edges to make dependencies from the FirstSU also dependent
+ // on other chained instructions, and other chained instructions also
+ // dependent on the dependencies of the SecondSU, to prevent them from being
+ // scheduled into these chained instructions.
+ assert(hasLessThanNumFused(FirstSU, 2) &&
+ "Currently we only support chaining together two instructions");
+
+ // Adjust the latency between both instrs.
+ for (SDep &SI : FirstSU.Succs)
+ if (SI.getSUnit() == &SecondSU)
+ SI.setLatency(0);
+
+ for (SDep &SI : SecondSU.Preds)
+ if (SI.getSUnit() == &FirstSU)
+ SI.setLatency(0);
+
+ LLVM_DEBUG(
+ dbgs() << "Macro fuse: "; DAG.dumpNodeName(FirstSU); dbgs() << " - ";
+ DAG.dumpNodeName(SecondSU); dbgs() << " / ";
+ dbgs() << DAG.TII->getName(FirstSU.getInstr()->getOpcode()) << " - "
+ << DAG.TII->getName(SecondSU.getInstr()->getOpcode()) << '\n';);
+
+ // Make data dependencies from the FirstSU also dependent on the SecondSU to
+ // prevent them from being scheduled between the FirstSU and the SecondSU.
+ if (&SecondSU != &DAG.ExitSU)
+ for (const SDep &SI : FirstSU.Succs) {
+ SUnit *SU = SI.getSUnit();
+ if (SI.isWeak() || isHazard(SI) ||
+ SU == &DAG.ExitSU || SU == &SecondSU || SU->isPred(&SecondSU))
+ continue;
+ LLVM_DEBUG(dbgs() << " Bind "; DAG.dumpNodeName(SecondSU);
+ dbgs() << " - "; DAG.dumpNodeName(*SU); dbgs() << '\n';);
+ DAG.addEdge(SU, SDep(&SecondSU, SDep::Artificial));
+ }
+
+ // Make the FirstSU also dependent on the dependencies of the SecondSU to
+ // prevent them from being scheduled between the FirstSU and the SecondSU.
+ if (&FirstSU != &DAG.EntrySU) {
+ for (const SDep &SI : SecondSU.Preds) {
+ SUnit *SU = SI.getSUnit();
+ if (SI.isWeak() || isHazard(SI) || &FirstSU == SU || FirstSU.isSucc(SU))
+ continue;
+ LLVM_DEBUG(dbgs() << " Bind "; DAG.dumpNodeName(*SU); dbgs() << " - ";
+ DAG.dumpNodeName(FirstSU); dbgs() << '\n';);
+ DAG.addEdge(&FirstSU, SDep(SU, SDep::Artificial));
+ }
+ // ExitSU comes last by design, which acts like an implicit dependency
+ // between ExitSU and any bottom root in the graph. We should transfer
+ // this to FirstSU as well.
+ if (&SecondSU == &DAG.ExitSU) {
+ for (SUnit &SU : DAG.SUnits) {
+ if (SU.Succs.empty())
+ DAG.addEdge(&FirstSU, SDep(&SU, SDep::Artificial));
+ }
+ }
+ }
+
+ ++NumFused;
+ return true;
+}
+
+namespace {
+
+/// Post-process the DAG to create cluster edges between instrs that may
+/// be fused by the processor into a single operation.
+class MacroFusion : public ScheduleDAGMutation {
+ ShouldSchedulePredTy shouldScheduleAdjacent;
+ bool FuseBlock;
+ bool scheduleAdjacentImpl(ScheduleDAGInstrs &DAG, SUnit &AnchorSU);
+
+public:
+ MacroFusion(ShouldSchedulePredTy shouldScheduleAdjacent, bool FuseBlock)
+ : shouldScheduleAdjacent(shouldScheduleAdjacent), FuseBlock(FuseBlock) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+} // end anonymous namespace
+
+void MacroFusion::apply(ScheduleDAGInstrs *DAG) {
+ if (FuseBlock)
+ // For each of the SUnits in the scheduling block, try to fuse the instr in
+ // it with one in its predecessors.
+ for (SUnit &ISU : DAG->SUnits)
+ scheduleAdjacentImpl(*DAG, ISU);
+
+ if (DAG->ExitSU.getInstr())
+ // Try to fuse the instr in the ExitSU with one in its predecessors.
+ scheduleAdjacentImpl(*DAG, DAG->ExitSU);
+}
+
+/// Implement the fusion of instr pairs in the scheduling DAG,
+/// anchored at the instr in AnchorSU.
+bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGInstrs &DAG, SUnit &AnchorSU) {
+ const MachineInstr &AnchorMI = *AnchorSU.getInstr();
+ const TargetInstrInfo &TII = *DAG.TII;
+ const TargetSubtargetInfo &ST = DAG.MF.getSubtarget();
+
+ // Check if the anchor instr may be fused.
+ if (!shouldScheduleAdjacent(TII, ST, nullptr, AnchorMI))
+ return false;
+
+ // Explore fusion candidates among the dependencies of the anchor instr.
+ for (SDep &Dep : AnchorSU.Preds) {
+ // Ignore dependencies other than data or strong ordering.
+ if (Dep.isWeak() || isHazard(Dep))
+ continue;
+
+ SUnit &DepSU = *Dep.getSUnit();
+ if (DepSU.isBoundaryNode())
+ continue;
+
+ // Only chain two instructions together at most.
+ const MachineInstr *DepMI = DepSU.getInstr();
+ if (!hasLessThanNumFused(DepSU, 2) ||
+ !shouldScheduleAdjacent(TII, ST, DepMI, AnchorMI))
+ continue;
+
+ if (fuseInstructionPair(DAG, DepSU, AnchorSU))
+ return true;
+ }
+
+ return false;
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createMacroFusionDAGMutation(
+ ShouldSchedulePredTy shouldScheduleAdjacent) {
+ if (EnableMacroFusion)
+ return std::make_unique<MacroFusion>(shouldScheduleAdjacent, true);
+ return nullptr;
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createBranchMacroFusionDAGMutation(
+ ShouldSchedulePredTy shouldScheduleAdjacent) {
+ if (EnableMacroFusion)
+ return std::make_unique<MacroFusion>(shouldScheduleAdjacent, false);
+ return nullptr;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp
new file mode 100644
index 000000000000..0bef513342ff
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -0,0 +1,2208 @@
+//===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "pipeliner"
+using namespace llvm;
+
+void ModuloSchedule::print(raw_ostream &OS) {
+ for (MachineInstr *MI : ScheduledInstrs)
+ OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
+}
+
+//===----------------------------------------------------------------------===//
+// ModuloScheduleExpander implementation
+//===----------------------------------------------------------------------===//
+
+/// Return the register values for the operands of a Phi instruction.
+/// This function assumes the instruction is a Phi.
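+/// For a loop-header Phi such as
+///   %v = PHI %init, %preheader, %loopval, %loop
+/// (illustrative names), this yields InitVal = %init and LoopVal = %loopval.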
+static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
+ unsigned &InitVal, unsigned &LoopVal) {
+ assert(Phi.isPHI() && "Expecting a Phi.");
+
+ InitVal = 0;
+ LoopVal = 0;
+ for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
+ if (Phi.getOperand(i + 1).getMBB() != Loop)
+ InitVal = Phi.getOperand(i).getReg();
+ else
+ LoopVal = Phi.getOperand(i).getReg();
+
+ assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
+}
+
+/// Return the Phi register value that comes from the incoming block.
+static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
+ for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
+ if (Phi.getOperand(i + 1).getMBB() != LoopBB)
+ return Phi.getOperand(i).getReg();
+ return 0;
+}
+
+/// Return the Phi register value that comes from the loop block.
+static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
+ for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
+ if (Phi.getOperand(i + 1).getMBB() == LoopBB)
+ return Phi.getOperand(i).getReg();
+ return 0;
+}
+
+void ModuloScheduleExpander::expand() {
+ BB = Schedule.getLoop()->getTopBlock();
+ Preheader = *BB->pred_begin();
+ if (Preheader == BB)
+ Preheader = *std::next(BB->pred_begin());
+
+ // Iterate over the definitions in each instruction, and compute the
+ // stage difference for each use. Keep the maximum value.
+ for (MachineInstr *MI : Schedule.getInstructions()) {
+ int DefStage = Schedule.getStage(MI);
+ for (const MachineOperand &Op : MI->all_defs()) {
+ Register Reg = Op.getReg();
+ unsigned MaxDiff = 0;
+ bool PhiIsSwapped = false;
+ for (MachineOperand &UseOp : MRI.use_operands(Reg)) {
+ MachineInstr *UseMI = UseOp.getParent();
+ int UseStage = Schedule.getStage(UseMI);
+ unsigned Diff = 0;
+ if (UseStage != -1 && UseStage >= DefStage)
+ Diff = UseStage - DefStage;
+ if (MI->isPHI()) {
+ if (isLoopCarried(*MI))
+ ++Diff;
+ else
+ PhiIsSwapped = true;
+ }
+ MaxDiff = std::max(Diff, MaxDiff);
+ }
+ RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
+ }
+ }
+
+ generatePipelinedLoop();
+}
+
+void ModuloScheduleExpander::generatePipelinedLoop() {
+ LoopInfo = TII->analyzeLoopForPipelining(BB);
+ assert(LoopInfo && "Must be able to analyze loop!");
+
+ // Create a new basic block for the kernel and add it to the CFG.
+ MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
+
+ unsigned MaxStageCount = Schedule.getNumStages() - 1;
+
+ // Remember the registers that are used in different stages. The index is
+ // the iteration, or stage, that the instruction is scheduled in. This is
+ // a map between register names in the original block and the names created
+ // in each stage of the pipelined loop.
+ ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
+
+ // The renaming destination by Phis for the registers across stages.
+ // This map is updated during Phis generation to point to the most recent
+ // renaming destination.
+ ValueMapTy *VRMapPhi = new ValueMapTy[(MaxStageCount + 1) * 2];
+
+ InstrMapTy InstrMap;
+
+ SmallVector<MachineBasicBlock *, 4> PrologBBs;
+
+ // Generate the prolog instructions that set up the pipeline.
+ generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
+ MF.insert(BB->getIterator(), KernelBB);
+
+ // Rearrange the instructions to generate the new, pipelined loop,
+ // and update register names as needed.
+ for (MachineInstr *CI : Schedule.getInstructions()) {
+ if (CI->isPHI())
+ continue;
+ unsigned StageNum = Schedule.getStage(CI);
+ MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
+ updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
+ KernelBB->push_back(NewMI);
+ InstrMap[NewMI] = CI;
+ }
+
+ // Copy any terminator instructions to the new kernel, and update
+ // names as needed.
+ for (MachineInstr &MI : BB->terminators()) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
+ updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
+ KernelBB->push_back(NewMI);
+ InstrMap[NewMI] = &MI;
+ }
+
+ NewKernel = KernelBB;
+ KernelBB->transferSuccessors(BB);
+ KernelBB->replaceSuccessor(BB, KernelBB);
+
+ generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
+ InstrMap, MaxStageCount, MaxStageCount, false);
+ generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, VRMapPhi,
+ InstrMap, MaxStageCount, MaxStageCount, false);
+
+ LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
+
+ SmallVector<MachineBasicBlock *, 4> EpilogBBs;
+ // Generate the epilog instructions to complete the pipeline.
+ generateEpilog(MaxStageCount, KernelBB, BB, VRMap, VRMapPhi, EpilogBBs,
+ PrologBBs);
+
+ // We need this step because the register allocation doesn't handle some
+ // situations well, so we insert copies to help out.
+ splitLifetimes(KernelBB, EpilogBBs);
+
+ // Remove dead instructions due to loop induction variables.
+ removeDeadInstructions(KernelBB, EpilogBBs);
+
+ // Add branches between prolog and epilog blocks.
+ addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
+
+ delete[] VRMap;
+ delete[] VRMapPhi;
+}
+
+void ModuloScheduleExpander::cleanup() {
+ // Remove the original loop since it's no longer referenced.
+ for (auto &I : *BB)
+ LIS.RemoveMachineInstrFromMaps(I);
+ BB->clear();
+ BB->eraseFromParent();
+}
+
+/// Generate the pipeline prolog code.
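+/// Prolog block i (0-based) contains copies of the instructions scheduled in
+/// stages 0..i of the original loop body, ramping the pipeline up before
+/// entering the kernel.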
+void ModuloScheduleExpander::generateProlog(unsigned LastStage,
+ MachineBasicBlock *KernelBB,
+ ValueMapTy *VRMap,
+ MBBVectorTy &PrologBBs) {
+ MachineBasicBlock *PredBB = Preheader;
+ InstrMapTy InstrMap;
+
+ // Generate a basic block for each stage, not including the last stage,
+ // which will be generated in the kernel. Each basic block may contain
+ // instructions from multiple stages/iterations.
+ for (unsigned i = 0; i < LastStage; ++i) {
+ // Create and insert the prolog basic block prior to the original loop
+ // basic block. The original loop is removed later.
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
+ PrologBBs.push_back(NewBB);
+ MF.insert(BB->getIterator(), NewBB);
+ NewBB->transferSuccessors(PredBB);
+ PredBB->addSuccessor(NewBB);
+ PredBB = NewBB;
+
+ // Generate instructions for each appropriate stage. Process instructions
+ // in original program order.
+ for (int StageNum = i; StageNum >= 0; --StageNum) {
+ for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
+ BBE = BB->getFirstTerminator();
+ BBI != BBE; ++BBI) {
+ if (Schedule.getStage(&*BBI) == StageNum) {
+ if (BBI->isPHI())
+ continue;
+ MachineInstr *NewMI =
+ cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
+ updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
+ NewBB->push_back(NewMI);
+ InstrMap[NewMI] = &*BBI;
+ }
+ }
+ }
+ rewritePhiValues(NewBB, i, VRMap, InstrMap);
+ LLVM_DEBUG({
+ dbgs() << "prolog:\n";
+ NewBB->dump();
+ });
+ }
+
+ PredBB->replaceSuccessor(BB, KernelBB);
+
+ // Check if we need to remove the branch from the preheader to the original
+ // loop, and replace it with a branch to the new loop.
+ unsigned numBranches = TII->removeBranch(*Preheader);
+ if (numBranches) {
+ SmallVector<MachineOperand, 0> Cond;
+ TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
+ }
+}
+
+/// Generate the pipeline epilog code. The epilog code finishes the iterations
+/// that were started in either the prolog or the kernel. We create a basic
+/// block for each stage that needs to complete.
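+/// The epilog block generated for stage i contains copies of the instructions
+/// scheduled in stages i..LastStage, draining the iterations that are still in
+/// flight when the kernel exits.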
+void ModuloScheduleExpander::generateEpilog(
+ unsigned LastStage, MachineBasicBlock *KernelBB, MachineBasicBlock *OrigBB,
+ ValueMapTy *VRMap, ValueMapTy *VRMapPhi, MBBVectorTy &EpilogBBs,
+ MBBVectorTy &PrologBBs) {
+ // We need to change the branch from the kernel to the first epilog block, so
+ // this call to analyze branch uses the kernel rather than the original BB.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
+ assert(!checkBranch && "generateEpilog must be able to analyze the branch");
+ if (checkBranch)
+ return;
+
+ MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
+ if (*LoopExitI == KernelBB)
+ ++LoopExitI;
+ assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
+ MachineBasicBlock *LoopExitBB = *LoopExitI;
+
+ MachineBasicBlock *PredBB = KernelBB;
+ MachineBasicBlock *EpilogStart = LoopExitBB;
+ InstrMapTy InstrMap;
+
+ // Generate a basic block for each stage, not including the last stage,
+ // which was generated for the kernel. Each basic block may contain
+ // instructions from multiple stages/iterations.
+ int EpilogStage = LastStage + 1;
+ for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
+ EpilogBBs.push_back(NewBB);
+ MF.insert(BB->getIterator(), NewBB);
+
+ PredBB->replaceSuccessor(LoopExitBB, NewBB);
+ NewBB->addSuccessor(LoopExitBB);
+
+ if (EpilogStart == LoopExitBB)
+ EpilogStart = NewBB;
+
+ // Add instructions to the epilog depending on the current block.
+ // Process instructions in original program order.
+ for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
+ for (auto &BBI : *BB) {
+ if (BBI.isPHI())
+ continue;
+ MachineInstr *In = &BBI;
+ if ((unsigned)Schedule.getStage(In) == StageNum) {
+ // Instructions with memoperands in the epilog are updated with
+ // conservative values.
+ MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
+ updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
+ NewBB->push_back(NewMI);
+ InstrMap[NewMI] = In;
+ }
+ }
+ }
+ generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
+ InstrMap, LastStage, EpilogStage, i == 1);
+ generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, VRMapPhi,
+ InstrMap, LastStage, EpilogStage, i == 1);
+ PredBB = NewBB;
+
+ LLVM_DEBUG({
+ dbgs() << "epilog:\n";
+ NewBB->dump();
+ });
+ }
+
+ // Fix any Phi nodes in the loop exit block.
+ LoopExitBB->replacePhiUsesWith(BB, PredBB);
+
+ // Create a branch to the new epilog from the kernel.
+ // Remove the original branch and add a new branch to the epilog.
+ TII->removeBranch(*KernelBB);
+ assert((OrigBB == TBB || OrigBB == FBB) &&
+ "Unable to determine looping branch direction");
+ if (OrigBB != TBB)
+ TII->insertBranch(*KernelBB, EpilogStart, KernelBB, Cond, DebugLoc());
+ else
+ TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
+ // Add a branch to the loop exit.
+ if (EpilogBBs.size() > 0) {
+ MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
+ SmallVector<MachineOperand, 4> Cond1;
+ TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
+ }
+}
+
+/// Replace all uses of FromReg that appear outside the specified
+/// basic block with ToReg.
+static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
+ MachineBasicBlock *MBB,
+ MachineRegisterInfo &MRI,
+ LiveIntervals &LIS) {
+ for (MachineOperand &O :
+ llvm::make_early_inc_range(MRI.use_operands(FromReg)))
+ if (O.getParent()->getParent() != MBB)
+ O.setReg(ToReg);
+ if (!LIS.hasInterval(ToReg))
+ LIS.createEmptyInterval(ToReg);
+}
+
+/// Return true if the register has a use that occurs outside the
+/// specified loop.
+static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
+ MachineRegisterInfo &MRI) {
+ for (const MachineOperand &MO : MRI.use_operands(Reg))
+ if (MO.getParent()->getParent() != BB)
+ return true;
+ return false;
+}
+
+/// Generate Phis for the specific block in the generated pipelined code.
+/// This function looks at the Phis from the original code to guide the
+/// creation of new Phis.
+void ModuloScheduleExpander::generateExistingPhis(
+ MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+ MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
+ unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
+ // Compute the stage number for the initial value of the Phi, which
+ // comes from the prolog. The prolog to use depends on which kernel or
+ // epilog block the Phi is being added to.
+ unsigned PrologStage = 0;
+ unsigned PrevStage = 0;
+ bool InKernel = (LastStageNum == CurStageNum);
+ if (InKernel) {
+ PrologStage = LastStageNum - 1;
+ PrevStage = CurStageNum;
+ } else {
+ PrologStage = LastStageNum - (CurStageNum - LastStageNum);
+ PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
+ }
+
+ for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
+ BBE = BB->getFirstNonPHI();
+ BBI != BBE; ++BBI) {
+ Register Def = BBI->getOperand(0).getReg();
+
+ unsigned InitVal = 0;
+ unsigned LoopVal = 0;
+ getPhiRegs(*BBI, BB, InitVal, LoopVal);
+
+ unsigned PhiOp1 = 0;
+ // The Phi value from the loop body typically is defined in the loop, but
+ // not always. So, we need to check if the value is defined in the loop.
+ unsigned PhiOp2 = LoopVal;
+ if (VRMap[LastStageNum].count(LoopVal))
+ PhiOp2 = VRMap[LastStageNum][LoopVal];
+
+ int StageScheduled = Schedule.getStage(&*BBI);
+ int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
+ unsigned NumStages = getStagesForReg(Def, CurStageNum);
+ if (NumStages == 0) {
+ // We don't need to generate a Phi anymore, but we need to rename any uses
+ // of the Phi value.
+ unsigned NewReg = VRMap[PrevStage][LoopVal];
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
+ InitVal, NewReg);
+ if (VRMap[CurStageNum].count(LoopVal))
+ VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
+ }
+ // Adjust the number of Phis needed depending on the number of prologs left,
+ // and the distance from where the Phi is first scheduled. The number of
+ // Phis cannot exceed the number of prolog stages. Each stage can
+ // potentially define two values.
+ unsigned MaxPhis = PrologStage + 2;
+ if (!InKernel && (int)PrologStage <= LoopValStage)
+ MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
+ unsigned NumPhis = std::min(NumStages, MaxPhis);
+
+ unsigned NewReg = 0;
+ unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
+ // In the epilog, we may need to look back one stage to get the correct
+ // Phi name, because the epilog and prolog blocks execute the same stage.
+ // The correct name is from the previous block only when the Phi has
+ // been completely scheduled prior to the epilog, and the Phi value is not
+ // needed in multiple stages.
+ int StageDiff = 0;
+ if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
+ NumPhis == 1)
+ StageDiff = 1;
+ // Adjust the computations below when the phi and the loop definition
+ // are scheduled in different stages.
+ if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
+ StageDiff = StageScheduled - LoopValStage;
+ for (unsigned np = 0; np < NumPhis; ++np) {
+ // If the Phi hasn't been scheduled, then use the initial Phi operand
+ // value. Otherwise, use the scheduled version of the instruction. This
+ // is a little complicated when a Phi references another Phi.
+ if (np > PrologStage || StageScheduled >= (int)LastStageNum)
+ PhiOp1 = InitVal;
+ // Check if the Phi has already been scheduled in a prolog stage.
+ else if (PrologStage >= AccessStage + StageDiff + np &&
+ VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
+ PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
+ // Check if the Phi has already been scheduled, but the loop instruction
+ // is either another Phi, or doesn't occur in the loop.
+ else if (PrologStage >= AccessStage + StageDiff + np) {
+ // If the Phi references another Phi, we need to examine the other
+ // Phi to get the correct value.
+ PhiOp1 = LoopVal;
+ MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
+ int Indirects = 1;
+ while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
+ int PhiStage = Schedule.getStage(InstOp1);
+ if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
+ PhiOp1 = getInitPhiReg(*InstOp1, BB);
+ else
+ PhiOp1 = getLoopPhiReg(*InstOp1, BB);
+ InstOp1 = MRI.getVRegDef(PhiOp1);
+ int PhiOpStage = Schedule.getStage(InstOp1);
+ int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
+ if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
+ VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
+ PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
+ break;
+ }
+ ++Indirects;
+ }
+ } else
+ PhiOp1 = InitVal;
+ // If this references a generated Phi in the kernel, get the Phi operand
+ // from the incoming block.
+ if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
+ if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
+ PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
+
+ MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
+ bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
+ // In the epilog, a map lookup is needed to get the value from the kernel,
+ // or previous epilog block. How this is done depends on whether the
+ // instruction is scheduled in the previous block.
+ if (!InKernel) {
+ int StageDiffAdj = 0;
+ if (LoopValStage != -1 && StageScheduled > LoopValStage)
+ StageDiffAdj = StageScheduled - LoopValStage;
+ // Use the loop value defined in the kernel, unless the kernel
+ // contains the last definition of the Phi.
+ if (np == 0 && PrevStage == LastStageNum &&
+ (StageScheduled != 0 || LoopValStage != 0) &&
+ VRMap[PrevStage - StageDiffAdj].count(LoopVal))
+ PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
+ // Use the value defined by the Phi. We add one because we switch
+ // from looking at the loop value to the Phi definition.
+ else if (np > 0 && PrevStage == LastStageNum &&
+ VRMap[PrevStage - np + 1].count(Def))
+ PhiOp2 = VRMap[PrevStage - np + 1][Def];
+ // Use the loop value defined in the kernel.
+ else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
+ VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
+ PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
+ // Use the value defined by the Phi, unless we're generating the first
+ // epilog and the Phi refers to a Phi in a different stage.
+ else if (VRMap[PrevStage - np].count(Def) &&
+ (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
+ (LoopValStage == StageScheduled)))
+ PhiOp2 = VRMap[PrevStage - np][Def];
+ }
+
+ // Check if we can reuse an existing Phi. This occurs when a Phi
+ // references another Phi, and the other Phi is scheduled in an
+ // earlier stage. We can try to reuse an existing Phi up until the last
+ // stage of the current Phi.
+ if (LoopDefIsPhi) {
+ if (static_cast<int>(PrologStage - np) >= StageScheduled) {
+ int LVNumStages = getStagesForPhi(LoopVal);
+ int StageDiff = (StageScheduled - LoopValStage);
+ LVNumStages -= StageDiff;
+ // Make sure the loop value Phi has been processed already.
+ if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
+ NewReg = PhiOp2;
+ unsigned ReuseStage = CurStageNum;
+ if (isLoopCarried(*PhiInst))
+ ReuseStage -= LVNumStages;
+ // Check if the Phi to reuse has been generated yet. If not, then
+ // there is nothing to reuse.
+ if (VRMap[ReuseStage - np].count(LoopVal)) {
+ NewReg = VRMap[ReuseStage - np][LoopVal];
+
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
+ Def, NewReg);
+ // Update the map with the new Phi name.
+ VRMap[CurStageNum - np][Def] = NewReg;
+ PhiOp2 = NewReg;
+ if (VRMap[LastStageNum - np - 1].count(LoopVal))
+ PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
+
+ if (IsLast && np == NumPhis - 1)
+ replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
+ continue;
+ }
+ }
+ }
+ if (InKernel && StageDiff > 0 &&
+ VRMap[CurStageNum - StageDiff - np].count(LoopVal))
+ PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
+ }
+
+ const TargetRegisterClass *RC = MRI.getRegClass(Def);
+ NewReg = MRI.createVirtualRegister(RC);
+
+ MachineInstrBuilder NewPhi =
+ BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ NewPhi.addReg(PhiOp1).addMBB(BB1);
+ NewPhi.addReg(PhiOp2).addMBB(BB2);
+ if (np == 0)
+ InstrMap[NewPhi] = &*BBI;
+
+ // We define the Phis after creating the new pipelined code, so
+ // we need to rename the Phi values in scheduled instructions.
+
+ unsigned PrevReg = 0;
+ if (InKernel && VRMap[PrevStage - np].count(LoopVal))
+ PrevReg = VRMap[PrevStage - np][LoopVal];
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
+ NewReg, PrevReg);
+ // If the Phi has been scheduled, use the new name for rewriting.
+ if (VRMap[CurStageNum - np].count(Def)) {
+ unsigned R = VRMap[CurStageNum - np][Def];
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
+ NewReg);
+ }
+
+ // Check if we need to rename any uses that occur after the loop. The
+ // register to replace depends on whether the Phi is scheduled in the
+ // epilog.
+ if (IsLast && np == NumPhis - 1)
+ replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
+
+ // In the kernel, a dependent Phi uses the value from this Phi.
+ if (InKernel)
+ PhiOp2 = NewReg;
+
+ // Update the map with the new Phi name.
+ VRMap[CurStageNum - np][Def] = NewReg;
+ }
+
+ while (NumPhis++ < NumStages) {
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
+ NewReg, 0);
+ }
+
+ // Check if we need to rename a Phi that has been eliminated due to
+ // scheduling.
+ if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
+ replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
+ }
+}
+
+/// Generate Phis for the specified block in the generated pipelined code.
+/// These are new Phis needed because the definition is scheduled after the
+/// use in the pipelined sequence.
+void ModuloScheduleExpander::generatePhis(
+ MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+ MachineBasicBlock *KernelBB, ValueMapTy *VRMap, ValueMapTy *VRMapPhi,
+ InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
+ bool IsLast) {
+ // Compute the stage number that contains the initial Phi value, and
+ // the Phi from the previous stage.
+ unsigned PrologStage = 0;
+ unsigned PrevStage = 0;
+ unsigned StageDiff = CurStageNum - LastStageNum;
+ bool InKernel = (StageDiff == 0);
+ if (InKernel) {
+ PrologStage = LastStageNum - 1;
+ PrevStage = CurStageNum;
+ } else {
+ PrologStage = LastStageNum - StageDiff;
+ PrevStage = LastStageNum + StageDiff - 1;
+ }
+
+ for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
+ BBE = BB->instr_end();
+ BBI != BBE; ++BBI) {
+ for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = BBI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
+ continue;
+
+ int StageScheduled = Schedule.getStage(&*BBI);
+ assert(StageScheduled != -1 && "Expecting scheduled instruction.");
+ Register Def = MO.getReg();
+ unsigned NumPhis = getStagesForReg(Def, CurStageNum);
+ // An instruction that is scheduled in stage 0 and used after the loop
+ // requires a Phi in the epilog for the last definition from either
+ // the kernel or the prolog.
+ if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
+ hasUseAfterLoop(Def, BB, MRI))
+ NumPhis = 1;
+ if (!InKernel && (unsigned)StageScheduled > PrologStage)
+ continue;
+
+ unsigned PhiOp2;
+ if (InKernel) {
+ PhiOp2 = VRMap[PrevStage][Def];
+ if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
+ if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
+ PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
+ }
+ // The number of Phis can't exceed the number of prolog stages. The
+ // prolog stage number is zero based.
+ if (NumPhis > PrologStage + 1 - StageScheduled)
+ NumPhis = PrologStage + 1 - StageScheduled;
+ for (unsigned np = 0; np < NumPhis; ++np) {
+        // Example for
+        //   Org:
+        //     %Org = ... (Scheduled at Stage#0, NumPhi = 2)
+        //
+        //   Prolog0 (Stage0):
+        //     %Clone0 = ...
+        //   Prolog1 (Stage1):
+        //     %Clone1 = ...
+        //   Kernel (Stage2):
+        //     %Phi0 = Phi %Clone1, Prolog1, %Clone2, Kernel
+        //     %Phi1 = Phi %Clone0, Prolog1, %Phi0, Kernel
+        //     %Clone2 = ...
+        //   Epilog0 (Stage3):
+        //     %Phi2 = Phi %Clone1, Prolog1, %Clone2, Kernel
+        //     %Phi3 = Phi %Clone0, Prolog1, %Phi0, Kernel
+        //   Epilog1 (Stage4):
+        //     %Phi4 = Phi %Clone0, Prolog0, %Phi2, Epilog0
+        //
+        //  VRMap = {0: %Clone0, 1: %Clone1, 2: %Clone2}
+        //  VRMapPhi (after Kernel) = {0: %Phi1, 1: %Phi0}
+        //  VRMapPhi (after Epilog0) = {0: %Phi3, 1: %Phi2}
+
+ unsigned PhiOp1 = VRMap[PrologStage][Def];
+ if (np <= PrologStage)
+ PhiOp1 = VRMap[PrologStage - np][Def];
+ if (!InKernel) {
+ if (PrevStage == LastStageNum && np == 0)
+ PhiOp2 = VRMap[LastStageNum][Def];
+ else
+ PhiOp2 = VRMapPhi[PrevStage - np][Def];
+ }
+
+ const TargetRegisterClass *RC = MRI.getRegClass(Def);
+ Register NewReg = MRI.createVirtualRegister(RC);
+
+ MachineInstrBuilder NewPhi =
+ BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ NewPhi.addReg(PhiOp1).addMBB(BB1);
+ NewPhi.addReg(PhiOp2).addMBB(BB2);
+ if (np == 0)
+ InstrMap[NewPhi] = &*BBI;
+
+        // Rewrite uses and update the map. The actions depend upon whether
+        // we are generating code for the kernel or epilog blocks.
+ if (InKernel) {
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
+ NewReg);
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
+ NewReg);
+
+ PhiOp2 = NewReg;
+ VRMapPhi[PrevStage - np - 1][Def] = NewReg;
+ } else {
+ VRMapPhi[CurStageNum - np][Def] = NewReg;
+ if (np == NumPhis - 1)
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
+ NewReg);
+ }
+ if (IsLast && np == NumPhis - 1)
+ replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
+ }
+ }
+ }
+}
+
+/// Remove instructions that generate values with no uses.
+/// Typically, these are induction variable operations that generate values
+/// used in the loop itself. A dead instruction has a definition with
+/// no uses, or uses that occur in the original loop only.
+void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
+ MBBVectorTy &EpilogBBs) {
+ // For each epilog block, check that the value defined by each instruction
+ // is used. If not, delete it.
+ for (MachineBasicBlock *MBB : llvm::reverse(EpilogBBs))
+ for (MachineBasicBlock::reverse_instr_iterator MI = MBB->instr_rbegin(),
+ ME = MBB->instr_rend();
+ MI != ME;) {
+      // From DeadMachineInstructionElim. Don't delete inline assembly.
+ if (MI->isInlineAsm()) {
+ ++MI;
+ continue;
+ }
+ bool SawStore = false;
+ // Check if it's safe to remove the instruction due to side effects.
+ // We can, and want to, remove Phis here.
+ if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
+ ++MI;
+ continue;
+ }
+ bool used = true;
+ for (const MachineOperand &MO : MI->all_defs()) {
+ Register reg = MO.getReg();
+ // Assume physical registers are used, unless they are marked dead.
+ if (reg.isPhysical()) {
+ used = !MO.isDead();
+ if (used)
+ break;
+ continue;
+ }
+ unsigned realUses = 0;
+ for (const MachineOperand &U : MRI.use_operands(reg)) {
+          // Uses that occur in the original loop block are not real uses;
+          // only count uses that occur outside the original loop.
+ if (U.getParent()->getParent() != BB) {
+ realUses++;
+ used = true;
+ break;
+ }
+ }
+ if (realUses > 0)
+ break;
+ used = false;
+ }
+ if (!used) {
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI++->eraseFromParent();
+ continue;
+ }
+ ++MI;
+ }
+ // In the kernel block, check if we can remove a Phi that generates a value
+ // used in an instruction removed in the epilog block.
+ for (MachineInstr &MI : llvm::make_early_inc_range(KernelBB->phis())) {
+ Register reg = MI.getOperand(0).getReg();
+ if (MRI.use_begin(reg) == MRI.use_end()) {
+ LIS.RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ }
+ }
+}
+
+/// For loop carried definitions, we split the lifetime of a virtual register
+/// that has uses past the definition in the next iteration. A copy with a new
+/// virtual register is inserted before the definition, which helps with
+/// generating a better register assignment.
+///
+///   v1 = phi(a, v2)     v1 = phi(a, v2)
+///   v2 = phi(b, v3)     v2 = phi(b, v3)
+///   v3 = ..             v4 = copy v1
+///   .. = v1             v3 = ..
+///                       .. = v4
+void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
+ MBBVectorTy &EpilogBBs) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ for (auto &PHI : KernelBB->phis()) {
+ Register Def = PHI.getOperand(0).getReg();
+    // Check for any Phi definition that is used as an operand of another Phi
+    // in the same block.
+ for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
+ E = MRI.use_instr_end();
+ I != E; ++I) {
+ if (I->isPHI() && I->getParent() == KernelBB) {
+ // Get the loop carried definition.
+ unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
+ if (!LCDef)
+ continue;
+ MachineInstr *MI = MRI.getVRegDef(LCDef);
+ if (!MI || MI->getParent() != KernelBB || MI->isPHI())
+ continue;
+ // Search through the rest of the block looking for uses of the Phi
+ // definition. If one occurs, then split the lifetime.
+ unsigned SplitReg = 0;
+ for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
+ KernelBB->instr_end()))
+ if (BBJ.readsRegister(Def)) {
+ // We split the lifetime when we find the first use.
+ if (SplitReg == 0) {
+ SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
+ BuildMI(*KernelBB, MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), SplitReg)
+ .addReg(Def);
+ }
+ BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
+ }
+ if (!SplitReg)
+ continue;
+ // Search through each of the epilog blocks for any uses to be renamed.
+ for (auto &Epilog : EpilogBBs)
+ for (auto &I : *Epilog)
+ if (I.readsRegister(Def))
+ I.substituteRegister(Def, SplitReg, 0, *TRI);
+ break;
+ }
+ }
+ }
+}
+
+/// Remove the incoming block from the Phis in a basic block.
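+/// For example (with illustrative register and block names), removing the
+/// incoming block Prolog from
+///   %r = Phi %a, Prolog, %b, Kernel
+/// leaves
+///   %r = Phi %b, Kernel
+/// Only the first incoming pair that matches Incoming is removed per Phi.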
+static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
+ for (MachineInstr &MI : *BB) {
+ if (!MI.isPHI())
+ break;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
+ if (MI.getOperand(i + 1).getMBB() == Incoming) {
+ MI.removeOperand(i + 1);
+ MI.removeOperand(i);
+ break;
+ }
+ }
+}
+
+/// Create branches from each prolog basic block to the appropriate epilog
+/// block. These edges are needed if the loop ends before reaching the
+/// kernel.
+void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
+ MBBVectorTy &PrologBBs,
+ MachineBasicBlock *KernelBB,
+ MBBVectorTy &EpilogBBs,
+ ValueMapTy *VRMap) {
+ assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
+ MachineBasicBlock *LastPro = KernelBB;
+ MachineBasicBlock *LastEpi = KernelBB;
+
+ // Start from the blocks connected to the kernel and work "out"
+ // to the first prolog and the last epilog blocks.
+ SmallVector<MachineInstr *, 4> PrevInsts;
+ unsigned MaxIter = PrologBBs.size() - 1;
+ for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
+ // Add branches to the prolog that go to the corresponding
+ // epilog, and the fall-thru prolog/kernel block.
+ MachineBasicBlock *Prolog = PrologBBs[j];
+ MachineBasicBlock *Epilog = EpilogBBs[i];
+
+ SmallVector<MachineOperand, 4> Cond;
+ std::optional<bool> StaticallyGreater =
+ LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
+ unsigned numAdded = 0;
+ if (!StaticallyGreater) {
+ Prolog->addSuccessor(Epilog);
+ numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
+ } else if (*StaticallyGreater == false) {
+ Prolog->addSuccessor(Epilog);
+ Prolog->removeSuccessor(LastPro);
+ LastEpi->removeSuccessor(Epilog);
+ numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
+ removePhis(Epilog, LastEpi);
+ // Remove the blocks that are no longer referenced.
+ if (LastPro != LastEpi) {
+ LastEpi->clear();
+ LastEpi->eraseFromParent();
+ }
+ if (LastPro == KernelBB) {
+ LoopInfo->disposed();
+ NewKernel = nullptr;
+ }
+ LastPro->clear();
+ LastPro->eraseFromParent();
+ } else {
+ numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
+ removePhis(Epilog, Prolog);
+ }
+ LastPro = Prolog;
+ LastEpi = Epilog;
+ for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
+ E = Prolog->instr_rend();
+ I != E && numAdded > 0; ++I, --numAdded)
+ updateInstruction(&*I, false, j, 0, VRMap);
+ }
+
+ if (NewKernel) {
+ LoopInfo->setPreheader(PrologBBs[MaxIter]);
+ LoopInfo->adjustTripCount(-(MaxIter + 1));
+ }
+}
+
+/// Return true if we can compute the amount the instruction changes
+/// during each iteration. Set Delta to the amount of the change.
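+///
+/// For example (illustrative only; the exact increment instruction is
+/// target-specific), if the access's base register is updated once per
+/// iteration by something like
+///   %base.next = ADD %base.phi, 8
+/// then getIncrementValue reports 8 and Delta is set to 8, allowing copies of
+/// the access made for later stages to have their memory offsets adjusted.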
+bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const MachineOperand *BaseOp;
+ int64_t Offset;
+ bool OffsetIsScalable;
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
+ return false;
+
+ // FIXME: This algorithm assumes instructions have fixed-size offsets.
+ if (OffsetIsScalable)
+ return false;
+
+ if (!BaseOp->isReg())
+ return false;
+
+ Register BaseReg = BaseOp->getReg();
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Check if there is a Phi. If so, get the definition in the loop.
+ MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
+ if (BaseDef && BaseDef->isPHI()) {
+ BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
+ BaseDef = MRI.getVRegDef(BaseReg);
+ }
+ if (!BaseDef)
+ return false;
+
+ int D = 0;
+ if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
+ return false;
+
+ Delta = D;
+ return true;
+}
+
+/// Update the memory operand with a new offset when the pipeliner
+/// generates a new copy of the instruction that refers to a
+/// different memory location.
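+///
+/// For example, if computeDelta determines that the base register advances by
+/// 8 each iteration and this copy corresponds to an iteration Num == 2 stages
+/// later, each rewritable memory operand's offset is increased by 16. When the
+/// delta cannot be computed, the operand is conservatively widened to an
+/// unknown size instead.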
+void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
+ MachineInstr &OldMI,
+ unsigned Num) {
+ if (Num == 0)
+ return;
+ // If the instruction has memory operands, then adjust the offset
+ // when the instruction appears in different stages.
+ if (NewMI.memoperands_empty())
+ return;
+ SmallVector<MachineMemOperand *, 2> NewMMOs;
+ for (MachineMemOperand *MMO : NewMI.memoperands()) {
+ // TODO: Figure out whether isAtomic is really necessary (see D57601).
+ if (MMO->isVolatile() || MMO->isAtomic() ||
+ (MMO->isInvariant() && MMO->isDereferenceable()) ||
+ (!MMO->getValue())) {
+ NewMMOs.push_back(MMO);
+ continue;
+ }
+ unsigned Delta;
+ if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
+ int64_t AdjOffset = Delta * Num;
+ NewMMOs.push_back(
+ MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
+ } else {
+ NewMMOs.push_back(
+ MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
+ }
+ }
+ NewMI.setMemRefs(MF, NewMMOs);
+}
+
+/// Clone the instruction for the new pipelined loop and update the
+/// memory operands, if needed.
+MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
+ unsigned CurStageNum,
+ unsigned InstStageNum) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
+ updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
+ return NewMI;
+}
+
+/// Clone the instruction for the new pipelined loop. If needed, this
+/// function updates the instruction using the values saved in the
+/// InstrChanges structure.
+MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
+ MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
+ auto It = InstrChanges.find(OldMI);
+ if (It != InstrChanges.end()) {
+ std::pair<unsigned, int64_t> RegAndOffset = It->second;
+ unsigned BasePos, OffsetPos;
+ if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
+ return nullptr;
+ int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
+ MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
+ if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
+ NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
+ NewMI->getOperand(OffsetPos).setImm(NewOffset);
+ }
+ updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
+ return NewMI;
+}
+
+/// Update the machine instruction with new virtual registers. This
+/// function may change the definitions and/or uses.
+void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
+ bool LastDef,
+ unsigned CurStageNum,
+ unsigned InstrStageNum,
+ ValueMapTy *VRMap) {
+ for (MachineOperand &MO : NewMI->operands()) {
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+ Register reg = MO.getReg();
+ if (MO.isDef()) {
+ // Create a new virtual register for the definition.
+ const TargetRegisterClass *RC = MRI.getRegClass(reg);
+ Register NewReg = MRI.createVirtualRegister(RC);
+ MO.setReg(NewReg);
+ VRMap[CurStageNum][reg] = NewReg;
+ if (LastDef)
+ replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
+ } else if (MO.isUse()) {
+ MachineInstr *Def = MRI.getVRegDef(reg);
+      // Compute the stage that contains the last definition of this register.
+ int DefStageNum = Schedule.getStage(Def);
+ unsigned StageNum = CurStageNum;
+ if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
+        // Compute the difference in stages between the definition and the use.
+ unsigned StageDiff = (InstrStageNum - DefStageNum);
+ // Make an adjustment to get the last definition.
+ StageNum -= StageDiff;
+ }
+ if (VRMap[StageNum].count(reg))
+ MO.setReg(VRMap[StageNum][reg]);
+ }
+ }
+}
+
+/// Return the instruction in the loop that defines the register.
+/// If the definition is a Phi, then follow the Phi operand to
+/// the instruction in the loop.
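+///
+/// For example (ADD stands in for whatever target instruction defines %v),
+/// given the loop block BB containing
+///   %p = Phi %init, Preheader, %v, BB
+///   %v = ADD %p, 1
+/// findDefInLoop(%p) follows the Phi's loop operand and returns the ADD.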
+MachineInstr *ModuloScheduleExpander::findDefInLoop(unsigned Reg) {
+ SmallPtrSet<MachineInstr *, 8> Visited;
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ while (Def->isPHI()) {
+ if (!Visited.insert(Def).second)
+ break;
+ for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
+ if (Def->getOperand(i + 1).getMBB() == BB) {
+ Def = MRI.getVRegDef(Def->getOperand(i).getReg());
+ break;
+ }
+ }
+ return Def;
+}
+
+/// Return the new name for the value from the previous stage.
+unsigned ModuloScheduleExpander::getPrevMapVal(
+ unsigned StageNum, unsigned PhiStage, unsigned LoopVal, unsigned LoopStage,
+ ValueMapTy *VRMap, MachineBasicBlock *BB) {
+ unsigned PrevVal = 0;
+ if (StageNum > PhiStage) {
+ MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
+ if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
+ // The name is defined in the previous stage.
+ PrevVal = VRMap[StageNum - 1][LoopVal];
+ else if (VRMap[StageNum].count(LoopVal))
+ // The previous name is defined in the current stage when the instruction
+ // order is swapped.
+ PrevVal = VRMap[StageNum][LoopVal];
+ else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
+ // The loop value hasn't yet been scheduled.
+ PrevVal = LoopVal;
+ else if (StageNum == PhiStage + 1)
+ // The loop value is another phi, which has not been scheduled.
+ PrevVal = getInitPhiReg(*LoopInst, BB);
+ else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
+ // The loop value is another phi, which has been scheduled.
+ PrevVal =
+ getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
+ LoopStage, VRMap, BB);
+ }
+ return PrevVal;
+}
+
+/// Rewrite the Phi values in the specified block to use the mappings
+/// from the initial operand. Once the Phi is scheduled, we switch
+/// to using the loop value instead of the Phi value, so those names
+/// do not need to be rewritten.
+void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
+ unsigned StageNum,
+ ValueMapTy *VRMap,
+ InstrMapTy &InstrMap) {
+ for (auto &PHI : BB->phis()) {
+ unsigned InitVal = 0;
+ unsigned LoopVal = 0;
+ getPhiRegs(PHI, BB, InitVal, LoopVal);
+ Register PhiDef = PHI.getOperand(0).getReg();
+
+ unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
+ unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
+ unsigned NumPhis = getStagesForPhi(PhiDef);
+ if (NumPhis > StageNum)
+ NumPhis = StageNum;
+ for (unsigned np = 0; np <= NumPhis; ++np) {
+ unsigned NewVal =
+ getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
+ if (!NewVal)
+ NewVal = InitVal;
+ rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
+ NewVal);
+ }
+ }
+}
+
+/// Rewrite a previously scheduled instruction to use the register value
+/// from the new instruction. Make sure the instruction occurs in the
+/// basic block, and we don't change the uses in the new instruction.
+void ModuloScheduleExpander::rewriteScheduledInstr(
+ MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
+ unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, unsigned NewReg,
+ unsigned PrevReg) {
+ bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
+ int StagePhi = Schedule.getStage(Phi) + PhiNum;
+ // Rewrite uses that have been scheduled already to use the new
+ // Phi register.
+ for (MachineOperand &UseOp :
+ llvm::make_early_inc_range(MRI.use_operands(OldReg))) {
+ MachineInstr *UseMI = UseOp.getParent();
+ if (UseMI->getParent() != BB)
+ continue;
+ if (UseMI->isPHI()) {
+ if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
+ continue;
+ if (getLoopPhiReg(*UseMI, BB) != OldReg)
+ continue;
+ }
+ InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
+ assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
+ MachineInstr *OrigMI = OrigInstr->second;
+ int StageSched = Schedule.getStage(OrigMI);
+ int CycleSched = Schedule.getCycle(OrigMI);
+ unsigned ReplaceReg = 0;
+ // This is the stage for the scheduled instruction.
+ if (StagePhi == StageSched && Phi->isPHI()) {
+ int CyclePhi = Schedule.getCycle(Phi);
+ if (PrevReg && InProlog)
+ ReplaceReg = PrevReg;
+ else if (PrevReg && !isLoopCarried(*Phi) &&
+ (CyclePhi <= CycleSched || OrigMI->isPHI()))
+ ReplaceReg = PrevReg;
+ else
+ ReplaceReg = NewReg;
+ }
+ // The scheduled instruction occurs before the scheduled Phi, and the
+ // Phi is not loop carried.
+ if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
+ ReplaceReg = NewReg;
+ if (StagePhi > StageSched && Phi->isPHI())
+ ReplaceReg = NewReg;
+ if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
+ ReplaceReg = NewReg;
+ if (ReplaceReg) {
+ const TargetRegisterClass *NRC =
+ MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
+ if (NRC)
+ UseOp.setReg(ReplaceReg);
+ else {
+ Register SplitReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ BuildMI(*BB, UseMI, UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ SplitReg)
+ .addReg(ReplaceReg);
+ UseOp.setReg(SplitReg);
+ }
+ }
+ }
+}
+
+bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
+ if (!Phi.isPHI())
+ return false;
+ int DefCycle = Schedule.getCycle(&Phi);
+ int DefStage = Schedule.getStage(&Phi);
+
+ unsigned InitVal = 0;
+ unsigned LoopVal = 0;
+ getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
+ MachineInstr *Use = MRI.getVRegDef(LoopVal);
+ if (!Use || Use->isPHI())
+ return true;
+ int LoopCycle = Schedule.getCycle(Use);
+ int LoopStage = Schedule.getStage(Use);
+ return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
+}
+
+//===----------------------------------------------------------------------===//
+// PeelingModuloScheduleExpander implementation
+//===----------------------------------------------------------------------===//
+// This is a reimplementation of ModuloScheduleExpander that works by creating
+// a fully correct steady-state kernel and peeling off the prolog and epilogs.
+//===----------------------------------------------------------------------===//
+
+namespace {
+// Remove any dead phis in MBB. Dead phis either have only one block as input
+// (in which case they are the identity) or have no uses.
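+// For example, a single-source phi such as %a = PHI %b, %bb.0 is the identity:
+// uses of %a are rewritten to %b and the phi is erased (unless
+// KeepSingleSrcPhi is set). A phi whose result has no uses is erased outright.
+// This repeats until a fixed point, since erasing one phi may leave another
+// unused.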
+void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
+ LiveIntervals *LIS, bool KeepSingleSrcPhi = false) {
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB->phis())) {
+ assert(MI.isPHI());
+ if (MRI.use_empty(MI.getOperand(0).getReg())) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ Changed = true;
+ } else if (!KeepSingleSrcPhi && MI.getNumExplicitOperands() == 3) {
+ const TargetRegisterClass *ConstrainRegClass =
+ MRI.constrainRegClass(MI.getOperand(1).getReg(),
+ MRI.getRegClass(MI.getOperand(0).getReg()));
+ assert(ConstrainRegClass &&
+ "Expected a valid constrained register class!");
+ (void)ConstrainRegClass;
+ MRI.replaceRegWith(MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+}
+
+/// Rewrites the kernel block in-place to adhere to the given schedule.
+/// KernelRewriter holds all of the state required to perform the rewriting.
+class KernelRewriter {
+ ModuloSchedule &S;
+ MachineBasicBlock *BB;
+ MachineBasicBlock *PreheaderBB, *ExitBB;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo *TII;
+ LiveIntervals *LIS;
+
+ // Map from register class to canonical undef register for that class.
+ DenseMap<const TargetRegisterClass *, Register> Undefs;
+ // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
+ // this map is only used when InitReg is non-undef.
+ DenseMap<std::pair<unsigned, unsigned>, Register> Phis;
+ // Map from LoopReg to phi register where the InitReg is undef.
+ DenseMap<Register, Register> UndefPhis;
+
+ // Reg is used by MI. Return the new register MI should use to adhere to the
+ // schedule. Insert phis as necessary.
+ Register remapUse(Register Reg, MachineInstr &MI);
+ // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
+ // If InitReg is not given it is chosen arbitrarily. It will either be undef
+ // or will be chosen so as to share another phi.
+ Register phi(Register LoopReg, std::optional<Register> InitReg = {},
+ const TargetRegisterClass *RC = nullptr);
+ // Create an undef register of the given register class.
+ Register undef(const TargetRegisterClass *RC);
+
+public:
+ KernelRewriter(MachineLoop &L, ModuloSchedule &S, MachineBasicBlock *LoopBB,
+ LiveIntervals *LIS = nullptr);
+ void rewrite();
+};
+} // namespace
+
+KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
+ MachineBasicBlock *LoopBB, LiveIntervals *LIS)
+ : S(S), BB(LoopBB), PreheaderBB(L.getLoopPreheader()),
+ ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
+ TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
+ PreheaderBB = *BB->pred_begin();
+ if (PreheaderBB == BB)
+ PreheaderBB = *std::next(BB->pred_begin());
+}
+
+void KernelRewriter::rewrite() {
+ // Rearrange the loop to be in schedule order. Note that the schedule may
+ // contain instructions that are not owned by the loop block (InstrChanges and
+ // friends), so we gracefully handle unowned instructions and delete any
+ // instructions that weren't in the schedule.
+ auto InsertPt = BB->getFirstTerminator();
+ MachineInstr *FirstMI = nullptr;
+ for (MachineInstr *MI : S.getInstructions()) {
+ if (MI->isPHI())
+ continue;
+ if (MI->getParent())
+ MI->removeFromParent();
+ BB->insert(InsertPt, MI);
+ if (!FirstMI)
+ FirstMI = MI;
+ }
+ assert(FirstMI && "Failed to find first MI in schedule");
+
+ // At this point all of the scheduled instructions are between FirstMI
+ // and the end of the block. Kill from the first non-phi to FirstMI.
+ for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*I);
+ (I++)->eraseFromParent();
+ }
+
+ // Now remap every instruction in the loop.
+ for (MachineInstr &MI : *BB) {
+ if (MI.isPHI() || MI.isTerminator())
+ continue;
+ for (MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
+ continue;
+ Register Reg = remapUse(MO.getReg(), MI);
+ MO.setReg(Reg);
+ }
+ }
+ EliminateDeadPhis(BB, MRI, LIS);
+
+ // Ensure a phi exists for all instructions that are either referenced by
+ // an illegal phi or by an instruction outside the loop. This allows us to
+ // treat remaps of these values the same as "normal" values that come from
+ // loop-carried phis.
+ for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
+ if (MI->isPHI()) {
+ Register R = MI->getOperand(0).getReg();
+ phi(R);
+ continue;
+ }
+
+ for (MachineOperand &Def : MI->defs()) {
+ for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
+ if (MI.getParent() != BB) {
+ phi(Def.getReg());
+ break;
+ }
+ }
+ }
+ }
+}
+
+Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
+ MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
+ if (!Producer)
+ return Reg;
+
+ int ConsumerStage = S.getStage(&MI);
+ if (!Producer->isPHI()) {
+ // Non-phi producers are simple to remap. Insert as many phis as the
+ // difference between the consumer and producer stages.
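+    // For example (illustrative values), a producer in stage 0 feeding a
+    // consumer in stage 2 gets two chained phis:
+    //   %v1 = PHI <undef or reused init>, Preheader, %v,  BB
+    //   %v2 = PHI <undef or reused init>, Preheader, %v1, BB
+    // and the consumer is rewritten to read %v2, the value produced two
+    // iterations earlier.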
+ if (Producer->getParent() != BB)
+ // Producer was not inside the loop. Use the register as-is.
+ return Reg;
+ int ProducerStage = S.getStage(Producer);
+ assert(ConsumerStage != -1 &&
+ "In-loop consumer should always be scheduled!");
+ assert(ConsumerStage >= ProducerStage);
+ unsigned StageDiff = ConsumerStage - ProducerStage;
+
+ for (unsigned I = 0; I < StageDiff; ++I)
+ Reg = phi(Reg);
+ return Reg;
+ }
+
+ // First, dive through the phi chain to find the defaults for the generated
+ // phis.
+ SmallVector<std::optional<Register>, 4> Defaults;
+ Register LoopReg = Reg;
+ auto LoopProducer = Producer;
+ while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
+ LoopReg = getLoopPhiReg(*LoopProducer, BB);
+ Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
+ LoopProducer = MRI.getUniqueVRegDef(LoopReg);
+ assert(LoopProducer);
+ }
+ int LoopProducerStage = S.getStage(LoopProducer);
+
+ std::optional<Register> IllegalPhiDefault;
+
+ if (LoopProducerStage == -1) {
+ // Do nothing.
+ } else if (LoopProducerStage > ConsumerStage) {
+ // This schedule is only representable if ProducerStage == ConsumerStage+1.
+ // In addition, Consumer's cycle must be scheduled after Producer in the
+ // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
+ // functions.
+#ifndef NDEBUG // Silence unused variables in non-asserts mode.
+ int LoopProducerCycle = S.getCycle(LoopProducer);
+ int ConsumerCycle = S.getCycle(&MI);
+#endif
+ assert(LoopProducerCycle <= ConsumerCycle);
+ assert(LoopProducerStage == ConsumerStage + 1);
+ // Peel off the first phi from Defaults and insert a phi between producer
+ // and consumer. This phi will not be at the front of the block so we
+ // consider it illegal. It will only exist during the rewrite process; it
+ // needs to exist while we peel off prologs because these could take the
+ // default value. After that we can replace all uses with the loop producer
+ // value.
+ IllegalPhiDefault = Defaults.front();
+ Defaults.erase(Defaults.begin());
+ } else {
+ assert(ConsumerStage >= LoopProducerStage);
+ int StageDiff = ConsumerStage - LoopProducerStage;
+ if (StageDiff > 0) {
+ LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
+ << " to " << (Defaults.size() + StageDiff) << "\n");
+ // If we need more phis than we have defaults for, pad out with undefs for
+ // the earliest phis, which are at the end of the defaults chain (the
+ // chain is in reverse order).
+ Defaults.resize(Defaults.size() + StageDiff,
+ Defaults.empty() ? std::optional<Register>()
+ : Defaults.back());
+ }
+ }
+
+ // Now we know the number of stages to jump back, insert the phi chain.
+ auto DefaultI = Defaults.rbegin();
+ while (DefaultI != Defaults.rend())
+ LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));
+
+ if (IllegalPhiDefault) {
+ // The consumer optionally consumes LoopProducer in the same iteration
+ // (because the producer is scheduled at an earlier cycle than the consumer)
+ // or the initial value. To facilitate this we create an illegal block here
+ // by embedding a phi in the middle of the block. We will fix this up
+ // immediately prior to pruning.
+ auto RC = MRI.getRegClass(Reg);
+ Register R = MRI.createVirtualRegister(RC);
+ MachineInstr *IllegalPhi =
+ BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
+ .addReg(*IllegalPhiDefault)
+ .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
+ .addReg(LoopReg)
+ .addMBB(BB); // Block choice is arbitrary and has no effect.
+ // Illegal phi should belong to the producer stage so that it can be
+ // filtered correctly during peeling.
+ S.setStage(IllegalPhi, LoopProducerStage);
+ return R;
+ }
+
+ return LoopReg;
+}
+
+Register KernelRewriter::phi(Register LoopReg, std::optional<Register> InitReg,
+ const TargetRegisterClass *RC) {
+ // If the init register is not undef, try and find an existing phi.
+ if (InitReg) {
+ auto I = Phis.find({LoopReg, *InitReg});
+ if (I != Phis.end())
+ return I->second;
+ } else {
+ for (auto &KV : Phis) {
+ if (KV.first.first == LoopReg)
+ return KV.second;
+ }
+ }
+
+ // InitReg is either undef or no existing phi takes InitReg as input. Try and
+ // find a phi that takes undef as input.
+ auto I = UndefPhis.find(LoopReg);
+ if (I != UndefPhis.end()) {
+ Register R = I->second;
+ if (!InitReg)
+ // Found a phi taking undef as input, and this input is undef so return
+ // without any more changes.
+ return R;
+ // Found a phi taking undef as input, so rewrite it to take InitReg.
+ MachineInstr *MI = MRI.getVRegDef(R);
+ MI->getOperand(1).setReg(*InitReg);
+ Phis.insert({{LoopReg, *InitReg}, R});
+ const TargetRegisterClass *ConstrainRegClass =
+ MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
+ assert(ConstrainRegClass && "Expected a valid constrained register class!");
+ (void)ConstrainRegClass;
+ UndefPhis.erase(I);
+ return R;
+ }
+
+ // Failed to find any existing phi to reuse, so create a new one.
+ if (!RC)
+ RC = MRI.getRegClass(LoopReg);
+ Register R = MRI.createVirtualRegister(RC);
+ if (InitReg) {
+ const TargetRegisterClass *ConstrainRegClass =
+ MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
+ assert(ConstrainRegClass && "Expected a valid constrained register class!");
+ (void)ConstrainRegClass;
+ }
+ BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R)
+ .addReg(InitReg ? *InitReg : undef(RC))
+ .addMBB(PreheaderBB)
+ .addReg(LoopReg)
+ .addMBB(BB);
+ if (!InitReg)
+ UndefPhis[LoopReg] = R;
+ else
+ Phis[{LoopReg, *InitReg}] = R;
+ return R;
+}
+
+Register KernelRewriter::undef(const TargetRegisterClass *RC) {
+ Register &R = Undefs[RC];
+ if (R == 0) {
+ // Create an IMPLICIT_DEF that defines this register if we need it.
+ // All uses of this should be removed by the time we have finished unrolling
+ // prologs and epilogs.
+ R = MRI.createVirtualRegister(RC);
+ auto *InsertBB = &PreheaderBB->getParent()->front();
+ BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), R);
+ }
+ return R;
+}
+
+namespace {
+/// Describes an operand in the kernel of a pipelined loop. Characteristics of
+/// the operand are discovered, such as how many in-loop PHIs it has to jump
+/// through and defaults for these phis.
+class KernelOperandInfo {
+ MachineBasicBlock *BB;
+ MachineRegisterInfo &MRI;
+ SmallVector<Register, 4> PhiDefaults;
+ MachineOperand *Source;
+ MachineOperand *Target;
+
+public:
+ KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
+ const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
+ : MRI(MRI) {
+ Source = MO;
+ BB = MO->getParent()->getParent();
+ while (isRegInLoop(MO)) {
+ MachineInstr *MI = MRI.getVRegDef(MO->getReg());
+ if (MI->isFullCopy()) {
+ MO = &MI->getOperand(1);
+ continue;
+ }
+ if (!MI->isPHI())
+ break;
+ // If this is an illegal phi, don't count it in distance.
+ if (IllegalPhis.count(MI)) {
+ MO = &MI->getOperand(3);
+ continue;
+ }
+
+ Register Default = getInitPhiReg(*MI, BB);
+ MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1)
+ : &MI->getOperand(3);
+ PhiDefaults.push_back(Default);
+ }
+ Target = MO;
+ }
+
+ bool operator==(const KernelOperandInfo &Other) const {
+ return PhiDefaults.size() == Other.PhiDefaults.size();
+ }
+
+ void print(raw_ostream &OS) const {
+ OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
+ << *Source->getParent();
+ }
+
+private:
+ bool isRegInLoop(MachineOperand *MO) {
+ return MO->isReg() && MO->getReg().isVirtual() &&
+ MRI.getVRegDef(MO->getReg())->getParent() == BB;
+ }
+};
+} // namespace
+
+MachineBasicBlock *
+PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
+ MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
+ if (LPD == LPD_Front)
+ PeeledFront.push_back(NewBB);
+ else
+ PeeledBack.push_front(NewBB);
+ for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
+ ++I, ++NI) {
+ CanonicalMIs[&*I] = &*I;
+ CanonicalMIs[&*NI] = &*I;
+ BlockMIs[{NewBB, &*I}] = &*NI;
+ BlockMIs[{BB, &*I}] = &*I;
+ }
+ return NewBB;
+}
+
+void PeelingModuloScheduleExpander::filterInstructions(MachineBasicBlock *MB,
+ int MinStage) {
+ for (auto I = MB->getFirstInstrTerminator()->getReverseIterator();
+ I != std::next(MB->getFirstNonPHI()->getReverseIterator());) {
+ MachineInstr *MI = &*I++;
+ int Stage = getStage(MI);
+ if (Stage == -1 || Stage >= MinStage)
+ continue;
+
+ for (MachineOperand &DefMO : MI->defs()) {
+ SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
+ for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
+ // Only PHIs can use values from this block by construction.
+ // Match with the equivalent PHI in B.
+ assert(UseMI.isPHI());
+ Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
+ MI->getParent());
+ Subs.emplace_back(&UseMI, Reg);
+ }
+ for (auto &Sub : Subs)
+ Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
+ *MRI.getTargetRegisterInfo());
+ }
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+}
+
+void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
+ MachineBasicBlock *DestBB, MachineBasicBlock *SourceBB, unsigned Stage) {
+ auto InsertPt = DestBB->getFirstNonPHI();
+ DenseMap<Register, Register> Remaps;
+ for (MachineInstr &MI : llvm::make_early_inc_range(
+ llvm::make_range(SourceBB->getFirstNonPHI(), SourceBB->end()))) {
+ if (MI.isPHI()) {
+ // This is an illegal PHI. If we move any instructions using an illegal
+ // PHI, we need to create a legal Phi.
+ if (getStage(&MI) != Stage) {
+ // The legal Phi is not necessary if the illegal phi's stage
+ // is being moved.
+ Register PhiR = MI.getOperand(0).getReg();
+ auto RC = MRI.getRegClass(PhiR);
+ Register NR = MRI.createVirtualRegister(RC);
+ MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
+ DebugLoc(), TII->get(TargetOpcode::PHI), NR)
+ .addReg(PhiR)
+ .addMBB(SourceBB);
+ BlockMIs[{DestBB, CanonicalMIs[&MI]}] = NI;
+ CanonicalMIs[NI] = CanonicalMIs[&MI];
+ Remaps[PhiR] = NR;
+ }
+ }
+ if (getStage(&MI) != Stage)
+ continue;
+ MI.removeFromParent();
+ DestBB->insert(InsertPt, &MI);
+ auto *KernelMI = CanonicalMIs[&MI];
+ BlockMIs[{DestBB, KernelMI}] = &MI;
+ BlockMIs.erase({SourceBB, KernelMI});
+ }
+ SmallVector<MachineInstr *, 4> PhiToDelete;
+ for (MachineInstr &MI : DestBB->phis()) {
+ assert(MI.getNumOperands() == 3);
+ MachineInstr *Def = MRI.getVRegDef(MI.getOperand(1).getReg());
+ // If the instruction referenced by the phi is moved inside the block
+ // we don't need the phi anymore.
+ if (getStage(Def) == Stage) {
+ Register PhiReg = MI.getOperand(0).getReg();
+ assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg()) != -1);
+ MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
+ MI.getOperand(0).setReg(PhiReg);
+ PhiToDelete.push_back(&MI);
+ }
+ }
+ for (auto *P : PhiToDelete)
+ P->eraseFromParent();
+ InsertPt = DestBB->getFirstNonPHI();
+  // Helper to clone Phi instructions into the destination block. We clone
+  // Phis greedily to avoid combinatorial explosion of Phi instructions.
+ auto clonePhi = [&](MachineInstr *Phi) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(Phi);
+ DestBB->insert(InsertPt, NewMI);
+ Register OrigR = Phi->getOperand(0).getReg();
+ Register R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
+ NewMI->getOperand(0).setReg(R);
+ NewMI->getOperand(1).setReg(OrigR);
+ NewMI->getOperand(2).setMBB(*DestBB->pred_begin());
+ Remaps[OrigR] = R;
+ CanonicalMIs[NewMI] = CanonicalMIs[Phi];
+ BlockMIs[{DestBB, CanonicalMIs[Phi]}] = NewMI;
+ PhiNodeLoopIteration[NewMI] = PhiNodeLoopIteration[Phi];
+ return R;
+ };
+ for (auto I = DestBB->getFirstNonPHI(); I != DestBB->end(); ++I) {
+ for (MachineOperand &MO : I->uses()) {
+ if (!MO.isReg())
+ continue;
+ if (Remaps.count(MO.getReg()))
+ MO.setReg(Remaps[MO.getReg()]);
+ else {
+ // If we are using a phi from the source block we need to add a new phi
+ // pointing to the old one.
+ MachineInstr *Use = MRI.getUniqueVRegDef(MO.getReg());
+ if (Use && Use->isPHI() && Use->getParent() == SourceBB) {
+ Register R = clonePhi(Use);
+ MO.setReg(R);
+ }
+ }
+ }
+ }
+}
+
+Register
+PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
+ MachineInstr *Phi) {
+ unsigned distance = PhiNodeLoopIteration[Phi];
+ MachineInstr *CanonicalUse = CanonicalPhi;
+ Register CanonicalUseReg = CanonicalUse->getOperand(0).getReg();
+ for (unsigned I = 0; I < distance; ++I) {
+ assert(CanonicalUse->isPHI());
+ assert(CanonicalUse->getNumOperands() == 5);
+ unsigned LoopRegIdx = 3, InitRegIdx = 1;
+ if (CanonicalUse->getOperand(2).getMBB() == CanonicalUse->getParent())
+ std::swap(LoopRegIdx, InitRegIdx);
+ CanonicalUseReg = CanonicalUse->getOperand(LoopRegIdx).getReg();
+ CanonicalUse = MRI.getVRegDef(CanonicalUseReg);
+ }
+ return CanonicalUseReg;
+}
+
+void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
+ BitVector LS(Schedule.getNumStages(), true);
+ BitVector AS(Schedule.getNumStages(), true);
+ LiveStages[BB] = LS;
+ AvailableStages[BB] = AS;
+
+ // Peel out the prologs.
+ LS.reset();
+ for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
+ LS[I] = true;
+ Prologs.push_back(peelKernel(LPD_Front));
+ LiveStages[Prologs.back()] = LS;
+ AvailableStages[Prologs.back()] = LS;
+ }
+
+ // Create a block that will end up as the new loop exiting block (dominated by
+ // all prologs and epilogs). It will only contain PHIs, in the same order as
+ // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
+ // that the exiting block is a (sub) clone of BB. This in turn gives us the
+ // property that any value deffed in BB but used outside of BB is used by a
+  // property that any value defined in BB but used outside of BB is used by a
+ MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
+ EliminateDeadPhis(ExitingBB, MRI, LIS, /*KeepSingleSrcPhi=*/true);
+ // Push out the epilogs, again in reverse order.
+  // We can't assume anything about the minimum loop trip count at this point,
+ // so emit a fairly complex epilog.
+
+  // We first peel off (number of stages - 1) epilogues. Then we remove dead
+ // stages and reorder instructions based on their stage. If we have 3 stages
+ // we generate first:
+ // E0[3, 2, 1]
+ // E1[3', 2']
+ // E2[3'']
+ // And then we move instructions based on their stages to have:
+ // E0[3]
+ // E1[2, 3']
+ // E2[1, 2', 3'']
+ // The transformation is legal because we only move instructions past
+ // instructions of a previous loop iteration.
+ for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
+ Epilogs.push_back(peelKernel(LPD_Back));
+ MachineBasicBlock *B = Epilogs.back();
+ filterInstructions(B, Schedule.getNumStages() - I);
+    // Keep track of which iteration each phi belongs to. We need it to know
+ // what version of the variable to use during prologue/epilogue stitching.
+ EliminateDeadPhis(B, MRI, LIS, /*KeepSingleSrcPhi=*/true);
+ for (MachineInstr &Phi : B->phis())
+ PhiNodeLoopIteration[&Phi] = Schedule.getNumStages() - I;
+ }
+ for (size_t I = 0; I < Epilogs.size(); I++) {
+ LS.reset();
+ for (size_t J = I; J < Epilogs.size(); J++) {
+ int Iteration = J;
+ unsigned Stage = Schedule.getNumStages() - 1 + I - J;
+ // Move stage one block at a time so that Phi nodes are updated correctly.
+ for (size_t K = Iteration; K > I; K--)
+ moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
+ LS[Stage] = true;
+ }
+ LiveStages[Epilogs[I]] = LS;
+ AvailableStages[Epilogs[I]] = AS;
+ }
+
+ // Now we've defined all the prolog and epilog blocks as a fallthrough
+ // sequence, add the edges that will be followed if the loop trip count is
+ // lower than the number of stages (connecting prologs directly with epilogs).
+ auto PI = Prologs.begin();
+ auto EI = Epilogs.begin();
+ assert(Prologs.size() == Epilogs.size());
+ for (; PI != Prologs.end(); ++PI, ++EI) {
+ MachineBasicBlock *Pred = *(*EI)->pred_begin();
+ (*PI)->addSuccessor(*EI);
+ for (MachineInstr &MI : (*EI)->phis()) {
+ Register Reg = MI.getOperand(1).getReg();
+ MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
+ if (Use && Use->getParent() == Pred) {
+ MachineInstr *CanonicalUse = CanonicalMIs[Use];
+ if (CanonicalUse->isPHI()) {
+ // If the use comes from a phi we need to skip as many phi as the
+          // If the use comes from a phi we need to skip as many phis as the
+ // chain to find the right value.
+ Reg = getPhiCanonicalReg(CanonicalUse, Use);
+ }
+ Reg = getEquivalentRegisterIn(Reg, *PI);
+ }
+ MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
+ MI.addOperand(MachineOperand::CreateMBB(*PI));
+ }
+ }
+
+ // Create a list of all blocks in order.
+ SmallVector<MachineBasicBlock *, 8> Blocks;
+ llvm::copy(PeeledFront, std::back_inserter(Blocks));
+ Blocks.push_back(BB);
+ llvm::copy(PeeledBack, std::back_inserter(Blocks));
+
+ // Iterate in reverse order over all instructions, remapping as we go.
+ for (MachineBasicBlock *B : reverse(Blocks)) {
+ for (auto I = B->instr_rbegin();
+ I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
+ MachineBasicBlock::reverse_instr_iterator MI = I++;
+ rewriteUsesOf(&*MI);
+ }
+ }
+ for (auto *MI : IllegalPhisToDelete) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ IllegalPhisToDelete.clear();
+
+ // Now all remapping has been done, we're free to optimize the generated code.
+ for (MachineBasicBlock *B : reverse(Blocks))
+ EliminateDeadPhis(B, MRI, LIS);
+ EliminateDeadPhis(ExitingBB, MRI, LIS);
+}
+
+MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
+ MachineFunction &MF = *BB->getParent();
+ MachineBasicBlock *Exit = *BB->succ_begin();
+ if (Exit == BB)
+ Exit = *std::next(BB->succ_begin());
+
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
+ MF.insert(std::next(BB->getIterator()), NewBB);
+
+ // Clone all phis in BB into NewBB and rewrite.
+ for (MachineInstr &MI : BB->phis()) {
+ auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
+ Register OldR = MI.getOperand(3).getReg();
+ Register R = MRI.createVirtualRegister(RC);
+ SmallVector<MachineInstr *, 4> Uses;
+ for (MachineInstr &Use : MRI.use_instructions(OldR))
+ if (Use.getParent() != BB)
+ Uses.push_back(&Use);
+ for (MachineInstr *Use : Uses)
+ Use->substituteRegister(OldR, R, /*SubIdx=*/0,
+ *MRI.getTargetRegisterInfo());
+ MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
+ .addReg(OldR)
+ .addMBB(BB);
+ BlockMIs[{NewBB, &MI}] = NI;
+ CanonicalMIs[NI] = &MI;
+ }
+ BB->replaceSuccessor(Exit, NewBB);
+ Exit->replacePhiUsesWith(BB, NewBB);
+ NewBB->addSuccessor(Exit);
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
+ (void)CanAnalyzeBr;
+ assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
+ TII->removeBranch(*BB);
+ TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
+ Cond, DebugLoc());
+ TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
+ return NewBB;
+}
+
+Register
+PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
+ MachineBasicBlock *BB) {
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg);
+ return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
+}
+
+void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
+ if (MI->isPHI()) {
+ // This is an illegal PHI. The loop-carried (desired) value is operand 3,
+ // and it is produced by this block.
+ Register PhiR = MI->getOperand(0).getReg();
+ Register R = MI->getOperand(3).getReg();
+ int RMIStage = getStage(MRI.getUniqueVRegDef(R));
+ if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
+ R = MI->getOperand(1).getReg();
+ MRI.setRegClass(R, MRI.getRegClass(PhiR));
+ MRI.replaceRegWith(PhiR, R);
+ // Postpone deleting the Phi as it may be referenced by BlockMIs and used
+ // later to figure out how to remap registers.
+ MI->getOperand(0).setReg(PhiR);
+ IllegalPhisToDelete.push_back(MI);
+ return;
+ }
+
+ int Stage = getStage(MI);
+ if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
+ LiveStages[MI->getParent()].test(Stage))
+ // Instruction is live, no rewriting to do.
+ return;
+
+ for (MachineOperand &DefMO : MI->defs()) {
+ SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
+ for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
+ // Only PHIs can use values from this block by construction.
+ // Match with the equivalent PHI in B.
+ assert(UseMI.isPHI());
+ Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
+ MI->getParent());
+ Subs.emplace_back(&UseMI, Reg);
+ }
+ for (auto &Sub : Subs)
+ Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
+ *MRI.getTargetRegisterInfo());
+ }
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+}
+
+void PeelingModuloScheduleExpander::fixupBranches() {
+ // Work outwards from the kernel.
+ bool KernelDisposed = false;
+ int TC = Schedule.getNumStages() - 1;
+ for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
+ ++PI, ++EI, --TC) {
+ MachineBasicBlock *Prolog = *PI;
+ MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
+ MachineBasicBlock *Epilog = *EI;
+ SmallVector<MachineOperand, 4> Cond;
+ TII->removeBranch(*Prolog);
+ std::optional<bool> StaticallyGreater =
+ LoopInfo->createTripCountGreaterCondition(TC, *Prolog, Cond);
+ if (!StaticallyGreater) {
+ LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
+ // Dynamically branch based on Cond.
+ TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
+ } else if (*StaticallyGreater == false) {
+ LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
+ // Prolog never falls through; branch to epilog and orphan interior
+ // blocks. Leave it to unreachable-block-elim to clean up.
+ Prolog->removeSuccessor(Fallthrough);
+ for (MachineInstr &P : Fallthrough->phis()) {
+ P.removeOperand(2);
+ P.removeOperand(1);
+ }
+ TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
+ KernelDisposed = true;
+ } else {
+ LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
+ // Prolog always falls through; remove incoming values in epilog.
+ Prolog->removeSuccessor(Epilog);
+ for (MachineInstr &P : Epilog->phis()) {
+ P.removeOperand(4);
+ P.removeOperand(3);
+ }
+ }
+ }
+
+ if (!KernelDisposed) {
+ LoopInfo->adjustTripCount(-(Schedule.getNumStages() - 1));
+ LoopInfo->setPreheader(Prologs.back());
+ } else {
+ LoopInfo->disposed();
+ }
+}
+
+void PeelingModuloScheduleExpander::rewriteKernel() {
+ KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
+ KR.rewrite();
+}
+
+void PeelingModuloScheduleExpander::expand() {
+ BB = Schedule.getLoop()->getTopBlock();
+ Preheader = Schedule.getLoop()->getLoopPreheader();
+ LLVM_DEBUG(Schedule.dump());
+ LoopInfo = TII->analyzeLoopForPipelining(BB);
+ assert(LoopInfo);
+
+ rewriteKernel();
+ peelPrologAndEpilogs();
+ fixupBranches();
+}
+
+void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
+ BB = Schedule.getLoop()->getTopBlock();
+ Preheader = Schedule.getLoop()->getLoopPreheader();
+
+ // Dump the schedule before we invalidate and remap all its instructions.
+ // Stash it in a string so we can print it if we found an error.
+ std::string ScheduleDump;
+ raw_string_ostream OS(ScheduleDump);
+ Schedule.print(OS);
+ OS.flush();
+
+  // First, run the normal ModuloScheduleExpander. We don't support any
+ // InstrChanges.
+ assert(LIS && "Requires LiveIntervals!");
+ ModuloScheduleExpander MSE(MF, Schedule, *LIS,
+ ModuloScheduleExpander::InstrChangesTy());
+ MSE.expand();
+ MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
+ if (!ExpandedKernel) {
+ // The expander optimized away the kernel. We can't do any useful checking.
+ MSE.cleanup();
+ return;
+ }
+ // Before running the KernelRewriter, re-add BB into the CFG.
+ Preheader->addSuccessor(BB);
+
+ // Now run the new expansion algorithm.
+ KernelRewriter KR(*Schedule.getLoop(), Schedule, BB);
+ KR.rewrite();
+ peelPrologAndEpilogs();
+
+ // Collect all illegal phis that the new algorithm created. We'll give these
+ // to KernelOperandInfo.
+ SmallPtrSet<MachineInstr *, 4> IllegalPhis;
+ for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
+ if (NI->isPHI())
+ IllegalPhis.insert(&*NI);
+ }
+
+ // Co-iterate across both kernels. We expect them to be identical apart from
+ // phis and full COPYs (we look through both).
+ SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
+ auto OI = ExpandedKernel->begin();
+ auto NI = BB->begin();
+ for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
+ while (OI->isPHI() || OI->isFullCopy())
+ ++OI;
+ while (NI->isPHI() || NI->isFullCopy())
+ ++NI;
+ assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
+ // Analyze every operand separately.
+ for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
+ OOpI != OI->operands_end(); ++OOpI, ++NOpI)
+ KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
+ KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
+ }
+
+ bool Failed = false;
+ for (auto &OldAndNew : KOIs) {
+ if (OldAndNew.first == OldAndNew.second)
+ continue;
+ Failed = true;
+ errs() << "Modulo kernel validation error: [\n";
+ errs() << " [golden] ";
+ OldAndNew.first.print(errs());
+ errs() << " ";
+ OldAndNew.second.print(errs());
+ errs() << "]\n";
+ }
+
+ if (Failed) {
+ errs() << "Golden reference kernel:\n";
+ ExpandedKernel->print(errs());
+ errs() << "New kernel:\n";
+ BB->print(errs());
+ errs() << ScheduleDump;
+ report_fatal_error(
+ "Modulo kernel validation (-pipeliner-experimental-cg) failed");
+ }
+
+ // Cleanup by removing BB from the CFG again as the original
+ // ModuloScheduleExpander intended.
+ Preheader->removeSuccessor(BB);
+ MSE.cleanup();
+}
+
+//===----------------------------------------------------------------------===//
+// ModuloScheduleTestPass implementation
+//===----------------------------------------------------------------------===//
+// This pass constructs a ModuloSchedule from its module and runs
+// ModuloScheduleExpander.
+//
+// The module is expected to contain a single-block analyzable loop.
+// The total order of instructions is taken from the loop as-is.
+// Instructions are expected to be annotated with a PostInstrSymbol.
+// This PostInstrSymbol must have the following format:
+// "Stage=%d Cycle=%d".
+//===----------------------------------------------------------------------===//
+
+namespace {
+class ModuloScheduleTest : public MachineFunctionPass {
+public:
+ static char ID;
+
+ ModuloScheduleTest() : MachineFunctionPass(ID) {
+ initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void runOnLoop(MachineFunction &MF, MachineLoop &L);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<LiveIntervals>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // namespace
+
+char ModuloScheduleTest::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
+ "Modulo Schedule test pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
+ "Modulo Schedule test pass", false, false)
+
+bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ for (auto *L : MLI) {
+ if (L->getTopBlock() != L->getBottomBlock())
+ continue;
+ runOnLoop(MF, *L);
+ return false;
+ }
+ return false;
+}
+
+static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
+ std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
+ std::pair<StringRef, StringRef> StageTokenAndValue =
+ getToken(StageAndCycle.first, "-");
+ std::pair<StringRef, StringRef> CycleTokenAndValue =
+ getToken(StageAndCycle.second, "-");
+ if (StageTokenAndValue.first != "Stage" ||
+ CycleTokenAndValue.first != "_Cycle") {
+ llvm_unreachable(
+ "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
+ return;
+ }
+
+ StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
+ CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
+
+ dbgs() << " Stage=" << Stage << ", Cycle=" << Cycle << "\n";
+}
+
+void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ MachineBasicBlock *BB = L.getTopBlock();
+ dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
+
+ DenseMap<MachineInstr *, int> Cycle, Stage;
+ std::vector<MachineInstr *> Instrs;
+ for (MachineInstr &MI : *BB) {
+ if (MI.isTerminator())
+ continue;
+ Instrs.push_back(&MI);
+ if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
+ dbgs() << "Parsing post-instr symbol for " << MI;
+ parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
+ }
+ }
+
+ ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
+ std::move(Stage));
+ ModuloScheduleExpander MSE(
+ MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
+ MSE.expand();
+ MSE.cleanup();
+}
+
+//===----------------------------------------------------------------------===//
+// ModuloScheduleTestAnnotater implementation
+//===----------------------------------------------------------------------===//
+
+void ModuloScheduleTestAnnotater::annotate() {
+ for (MachineInstr *MI : S.getInstructions()) {
+ SmallVector<char, 16> SV;
+ raw_svector_ostream OS(SV);
+ OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
+ MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str());
+ MI->setPostInstrSymbol(MF, Sym);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/MultiHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/MultiHazardRecognizer.cpp
new file mode 100644
index 000000000000..e4cd92ac4868
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/MultiHazardRecognizer.cpp
@@ -0,0 +1,92 @@
+//===- MultiHazardRecognizer.cpp - Scheduler Support ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MultiHazardRecognizer class, which is a wrapper
+// for a set of ScheduleHazardRecognizer instances.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MultiHazardRecognizer.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+using namespace llvm;
+
+void MultiHazardRecognizer::AddHazardRecognizer(
+ std::unique_ptr<ScheduleHazardRecognizer> &&R) {
+ MaxLookAhead = std::max(MaxLookAhead, R->getMaxLookAhead());
+ Recognizers.push_back(std::move(R));
+}
+
+bool MultiHazardRecognizer::atIssueLimit() const {
+ return llvm::any_of(Recognizers,
+ std::mem_fn(&ScheduleHazardRecognizer::atIssueLimit));
+}
+
+ScheduleHazardRecognizer::HazardType
+MultiHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ for (auto &R : Recognizers) {
+ auto res = R->getHazardType(SU, Stalls);
+ if (res != NoHazard)
+ return res;
+ }
+ return NoHazard;
+}
+
+void MultiHazardRecognizer::Reset() {
+ for (auto &R : Recognizers)
+ R->Reset();
+}
+
+void MultiHazardRecognizer::EmitInstruction(SUnit *SU) {
+ for (auto &R : Recognizers)
+ R->EmitInstruction(SU);
+}
+
+void MultiHazardRecognizer::EmitInstruction(MachineInstr *MI) {
+ for (auto &R : Recognizers)
+ R->EmitInstruction(MI);
+}
+
+unsigned MultiHazardRecognizer::PreEmitNoops(SUnit *SU) {
+ auto MN = [=](unsigned a, std::unique_ptr<ScheduleHazardRecognizer> &R) {
+ return std::max(a, R->PreEmitNoops(SU));
+ };
+ return std::accumulate(Recognizers.begin(), Recognizers.end(), 0u, MN);
+}
+
+unsigned MultiHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+ auto MN = [=](unsigned a, std::unique_ptr<ScheduleHazardRecognizer> &R) {
+ return std::max(a, R->PreEmitNoops(MI));
+ };
+ return std::accumulate(Recognizers.begin(), Recognizers.end(), 0u, MN);
+}
+
+bool MultiHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
+ auto SPA = [=](std::unique_ptr<ScheduleHazardRecognizer> &R) {
+ return R->ShouldPreferAnother(SU);
+ };
+ return llvm::any_of(Recognizers, SPA);
+}
+
+void MultiHazardRecognizer::AdvanceCycle() {
+ for (auto &R : Recognizers)
+ R->AdvanceCycle();
+}
+
+void MultiHazardRecognizer::RecedeCycle() {
+ for (auto &R : Recognizers)
+ R->RecedeCycle();
+}
+
+void MultiHazardRecognizer::EmitNoop() {
+ for (auto &R : Recognizers)
+ R->EmitNoop();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/contrib/llvm-project/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
new file mode 100644
index 000000000000..7304bfef55cb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
@@ -0,0 +1,55 @@
+//===-- NonRelocatableStringpool.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/NonRelocatableStringpool.h"
+#include "llvm/ADT/STLExtras.h"
+
+namespace llvm {
+
+DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
+ if (S.empty() && !Strings.empty())
+ return EmptyString;
+
+ if (Translator)
+ S = Translator(S);
+ auto I = Strings.insert({S, DwarfStringPoolEntry()});
+ auto &Entry = I.first->second;
+ if (I.second || !Entry.isIndexed()) {
+ Entry.Index = NumEntries++;
+ Entry.Offset = CurrentEndOffset;
+ Entry.Symbol = nullptr;
+ CurrentEndOffset += S.size() + 1;
+ }
+ return DwarfStringPoolEntryRef(*I.first);
+}
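+
+// Worked example (illustrative, assuming a pool created without the initial
+// empty-string entry): getEntry("foo") assigns Index 0 / Offset 0 and bumps
+// CurrentEndOffset to 4 (three characters plus the NUL terminator);
+// getEntry("bar") then gets Index 1 / Offset 4, leaving CurrentEndOffset at 8.
+// A second getEntry("foo") returns the existing entry unchanged.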
+
+StringRef NonRelocatableStringpool::internString(StringRef S) {
+ DwarfStringPoolEntry Entry{nullptr, 0, DwarfStringPoolEntry::NotIndexed};
+
+ if (Translator)
+ S = Translator(S);
+
+ auto InsertResult = Strings.insert({S, Entry});
+ return InsertResult.first->getKey();
+}
+
+std::vector<DwarfStringPoolEntryRef>
+NonRelocatableStringpool::getEntriesForEmission() const {
+ std::vector<DwarfStringPoolEntryRef> Result;
+ Result.reserve(Strings.size());
+ for (const auto &E : Strings)
+ if (E.getValue().isIndexed())
+ Result.emplace_back(E);
+ llvm::sort(Result, [](const DwarfStringPoolEntryRef A,
+ const DwarfStringPoolEntryRef B) {
+ return A.getIndex() < B.getIndex();
+ });
+ return Result;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/OptimizePHIs.cpp b/contrib/llvm-project/llvm/lib/CodeGen/OptimizePHIs.cpp
new file mode 100644
index 000000000000..d997fbbed5a6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/OptimizePHIs.cpp
@@ -0,0 +1,206 @@
+//===- OptimizePHIs.cpp - Optimize machine instruction PHIs ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes machine instruction PHIs to take advantage of
+// opportunities created during DAG legalization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "opt-phis"
+
+STATISTIC(NumPHICycles, "Number of PHI cycles replaced");
+STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles");
+
+namespace {
+
+ class OptimizePHIs : public MachineFunctionPass {
+ MachineRegisterInfo *MRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+
+ public:
+ static char ID; // Pass identification
+
+ OptimizePHIs() : MachineFunctionPass(ID) {
+ initializeOptimizePHIsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ using InstrSet = SmallPtrSet<MachineInstr *, 16>;
+ using InstrSetIterator = SmallPtrSetIterator<MachineInstr *>;
+
+ bool IsSingleValuePHICycle(MachineInstr *MI, unsigned &SingleValReg,
+ InstrSet &PHIsInCycle);
+ bool IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle);
+ bool OptimizeBB(MachineBasicBlock &MBB);
+ };
+
+} // end anonymous namespace
+
+char OptimizePHIs::ID = 0;
+
+char &llvm::OptimizePHIsID = OptimizePHIs::ID;
+
+INITIALIZE_PASS(OptimizePHIs, DEBUG_TYPE,
+ "Optimize machine instruction PHIs", false, false)
+
+bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(Fn.getFunction()))
+ return false;
+
+ MRI = &Fn.getRegInfo();
+ TII = Fn.getSubtarget().getInstrInfo();
+
+ // Find dead PHI cycles and PHI cycles that can be replaced by a single
+ // value. InstCombine does these optimizations, but DAG legalization may
+ // introduce new opportunities, e.g., when i64 values are split up for
+ // 32-bit targets.
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : Fn)
+ Changed |= OptimizeBB(MBB);
+
+ return Changed;
+}
+
+/// IsSingleValuePHICycle - Check if MI is a PHI where all the source operands
+/// are copies of SingleValReg, possibly via copies through other PHIs. If
+/// SingleValReg is zero on entry, it is set to the register with the single
+/// non-copy value. PHIsInCycle is a set used to keep track of the PHIs that
+ /// have been scanned. The PHIs may form a single cycle, several cycles, or chains.
+bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
+ unsigned &SingleValReg,
+ InstrSet &PHIsInCycle) {
+ assert(MI->isPHI() && "IsSingleValuePHICycle expects a PHI instruction");
+ Register DstReg = MI->getOperand(0).getReg();
+
+ // See if we already saw this register.
+ if (!PHIsInCycle.insert(MI).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (PHIsInCycle.size() == 16)
+ return false;
+
+ // Scan the PHI operands.
+ for (unsigned i = 1; i != MI->getNumOperands(); i += 2) {
+ Register SrcReg = MI->getOperand(i).getReg();
+ if (SrcReg == DstReg)
+ continue;
+ MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+
+ // Skip over register-to-register moves.
+ if (SrcMI && SrcMI->isCopy() && !SrcMI->getOperand(0).getSubReg() &&
+ !SrcMI->getOperand(1).getSubReg() &&
+ SrcMI->getOperand(1).getReg().isVirtual()) {
+ SrcReg = SrcMI->getOperand(1).getReg();
+ SrcMI = MRI->getVRegDef(SrcReg);
+ }
+ if (!SrcMI)
+ return false;
+
+ if (SrcMI->isPHI()) {
+ if (!IsSingleValuePHICycle(SrcMI, SingleValReg, PHIsInCycle))
+ return false;
+ } else {
+ // Fail if there is more than one non-phi/non-move register.
+ if (SingleValReg != 0 && SingleValReg != SrcReg)
+ return false;
+ SingleValReg = SrcReg;
+ }
+ }
+ return true;
+}
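+
+// Illustrative example for the routine above (simplified MIR):
+//
+//   %a = PHI %x, %bb.0, %b, %bb.1
+//   %b = PHI %a, %bb.2, %x, %bb.3
+//
+// Every non-PHI, non-copy source reachable from %a is %x, so
+// IsSingleValuePHICycle(%a) succeeds with SingleValReg = %x, and the caller
+// rewrites all uses of %a to %x and erases that PHI.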
+
+/// IsDeadPHICycle - Check if the register defined by a PHI is only used by
+/// other PHIs in a cycle.
+bool OptimizePHIs::IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle) {
+ assert(MI->isPHI() && "IsDeadPHICycle expects a PHI instruction");
+ Register DstReg = MI->getOperand(0).getReg();
+ assert(DstReg.isVirtual() && "PHI destination is not a virtual register");
+
+ // See if we already saw this register.
+ if (!PHIsInCycle.insert(MI).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (PHIsInCycle.size() == 16)
+ return false;
+
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DstReg)) {
+ if (!UseMI.isPHI() || !IsDeadPHICycle(&UseMI, PHIsInCycle))
+ return false;
+ }
+
+ return true;
+}
+
+/// OptimizeBB - Remove dead PHI cycles and PHI cycles that can be replaced by
+/// a single value.
+bool OptimizePHIs::OptimizeBB(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (MachineBasicBlock::iterator
+ MII = MBB.begin(), E = MBB.end(); MII != E; ) {
+ MachineInstr *MI = &*MII++;
+ if (!MI->isPHI())
+ break;
+
+ // Check for single-value PHI cycles.
+ unsigned SingleValReg = 0;
+ InstrSet PHIsInCycle;
+ if (IsSingleValuePHICycle(MI, SingleValReg, PHIsInCycle) &&
+ SingleValReg != 0) {
+ Register OldReg = MI->getOperand(0).getReg();
+ if (!MRI->constrainRegClass(SingleValReg, MRI->getRegClass(OldReg)))
+ continue;
+
+ MRI->replaceRegWith(OldReg, SingleValReg);
+ MI->eraseFromParent();
+
+ // The kill flags on OldReg and SingleValReg may no longer be correct.
+ MRI->clearKillFlags(SingleValReg);
+
+ ++NumPHICycles;
+ Changed = true;
+ continue;
+ }
+
+ // Check for dead PHI cycles.
+ PHIsInCycle.clear();
+ if (IsDeadPHICycle(MI, PHIsInCycle)) {
+ for (MachineInstr *PhiMI : PHIsInCycle) {
+ if (MII == PhiMI)
+ ++MII;
+ PhiMI->eraseFromParent();
+ }
+ ++NumDeadPHICycles;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PHIElimination.cpp
new file mode 100644
index 000000000000..dbb9a9ffdf60
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PHIElimination.cpp
@@ -0,0 +1,759 @@
+//===- PhiElimination.cpp - Eliminate PHI nodes by inserting copies -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates machine instruction PHI nodes by inserting copy
+// instructions. This destroys SSA information, but is the desired input for
+// some register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PHIEliminationUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "phi-node-elimination"
+
+static cl::opt<bool>
+DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false),
+ cl::Hidden, cl::desc("Disable critical edge splitting "
+ "during PHI elimination"));
+
+static cl::opt<bool>
+SplitAllCriticalEdges("phi-elim-split-all-critical-edges", cl::init(false),
+ cl::Hidden, cl::desc("Split all critical edges during "
+ "PHI elimination"));
+
+static cl::opt<bool> NoPhiElimLiveOutEarlyExit(
+ "no-phi-elim-live-out-early-exit", cl::init(false), cl::Hidden,
+ cl::desc("Do not use an early exit if isLiveOutPastPHIs returns true."));
+
+namespace {
+
+ class PHIElimination : public MachineFunctionPass {
+ MachineRegisterInfo *MRI = nullptr; // Machine register information
+ LiveVariables *LV = nullptr;
+ LiveIntervals *LIS = nullptr;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ PHIElimination() : MachineFunctionPass(ID) {
+ initializePHIEliminationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ private:
+ /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions
+ /// in predecessor basic blocks.
+ bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void LowerPHINode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator LastPHIIt);
+
+ /// analyzePHINodes - Gather information about the PHI nodes in the
+ /// function. In particular, for each virtual register used by a PHI node,
+ /// count the number of such uses coming from each predecessor BB. This is
+ /// used later to determine when the vreg is killed in that BB.
+ void analyzePHINodes(const MachineFunction& MF);
+
+ /// Split critical edges where necessary for good coalescer performance.
+ bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineLoopInfo *MLI,
+ std::vector<SparseBitVector<>> *LiveInSets);
+
+ // These functions are temporary abstractions around LiveVariables and
+ // LiveIntervals, so they can go away when LiveVariables does.
+ bool isLiveIn(Register Reg, const MachineBasicBlock *MBB);
+ bool isLiveOutPastPHIs(Register Reg, const MachineBasicBlock *MBB);
+
+ using BBVRegPair = std::pair<unsigned, Register>;
+ using VRegPHIUse = DenseMap<BBVRegPair, unsigned>;
+
+ // Count the number of non-undef PHI uses of each register in each BB.
+ VRegPHIUse VRegPHIUseCount;
+
+ // Defs of PHI sources which are implicit_def.
+ SmallPtrSet<MachineInstr*, 4> ImpDefs;
+
+ // Map reusable lowered PHI node -> incoming join register.
+ using LoweredPHIMap =
+ DenseMap<MachineInstr*, unsigned, MachineInstrExpressionTrait>;
+ LoweredPHIMap LoweredPHIs;
+ };
+
+} // end anonymous namespace
+
+STATISTIC(NumLowered, "Number of phis lowered");
+STATISTIC(NumCriticalEdgesSplit, "Number of critical edges split");
+STATISTIC(NumReused, "Number of reused lowered phis");
+
+char PHIElimination::ID = 0;
+
+char& llvm::PHIEliminationID = PHIElimination::ID;
+
+INITIALIZE_PASS_BEGIN(PHIElimination, DEBUG_TYPE,
+ "Eliminate PHI nodes for register allocation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_END(PHIElimination, DEBUG_TYPE,
+ "Eliminate PHI nodes for register allocation", false, false)
+
+void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addUsedIfAvailable<LiveVariables>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
+ MRI = &MF.getRegInfo();
+ LV = getAnalysisIfAvailable<LiveVariables>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+ bool Changed = false;
+
+ // Split critical edges to help the coalescer.
+ if (!DisableEdgeSplitting && (LV || LIS)) {
+ // A set of live-in regs for each MBB which is used to update LV
+ // efficiently also with large functions.
+ std::vector<SparseBitVector<>> LiveInSets;
+ if (LV) {
+ LiveInSets.resize(MF.size());
+ for (unsigned Index = 0, e = MRI->getNumVirtRegs(); Index != e; ++Index) {
+ // Set the bit for this register for each MBB where it is
+ // live-through or live-in (killed).
+ Register VirtReg = Register::index2VirtReg(Index);
+ MachineInstr *DefMI = MRI->getVRegDef(VirtReg);
+ if (!DefMI)
+ continue;
+ LiveVariables::VarInfo &VI = LV->getVarInfo(VirtReg);
+ SparseBitVector<>::iterator AliveBlockItr = VI.AliveBlocks.begin();
+ SparseBitVector<>::iterator EndItr = VI.AliveBlocks.end();
+ while (AliveBlockItr != EndItr) {
+ unsigned BlockNum = *(AliveBlockItr++);
+ LiveInSets[BlockNum].set(Index);
+ }
+ // The register is live into an MBB in which it is killed but not
+ // defined. See comment for VarInfo in LiveVariables.h.
+ MachineBasicBlock *DefMBB = DefMI->getParent();
+ if (VI.Kills.size() > 1 ||
+ (!VI.Kills.empty() && VI.Kills.front()->getParent() != DefMBB))
+ for (auto *MI : VI.Kills)
+ LiveInSets[MI->getParent()->getNumber()].set(Index);
+ }
+ }
+
+ MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ for (auto &MBB : MF)
+ Changed |= SplitPHIEdges(MF, MBB, MLI, (LV ? &LiveInSets : nullptr));
+ }
+
+ // This pass takes the function out of SSA form.
+ MRI->leaveSSA();
+
+ // Populate VRegPHIUseCount
+ analyzePHINodes(MF);
+
+ // Eliminate PHI instructions by inserting copies into predecessor blocks.
+ for (auto &MBB : MF)
+ Changed |= EliminatePHINodes(MF, MBB);
+
+ // Remove dead IMPLICIT_DEF instructions.
+ for (MachineInstr *DefMI : ImpDefs) {
+ Register DefReg = DefMI->getOperand(0).getReg();
+ if (MRI->use_nodbg_empty(DefReg)) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*DefMI);
+ DefMI->eraseFromParent();
+ }
+ }
+
+ // Clean up the lowered PHI instructions.
+ for (auto &I : LoweredPHIs) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*I.first);
+ MF.deleteMachineInstr(I.first);
+ }
+
+ // TODO: we should use the incremental DomTree updater here.
+ if (Changed)
+ if (auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>())
+ MDT->getBase().recalculate(MF);
+
+ LoweredPHIs.clear();
+ ImpDefs.clear();
+ VRegPHIUseCount.clear();
+
+ MF.getProperties().set(MachineFunctionProperties::Property::NoPHIs);
+
+ return Changed;
+}
+
+/// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions in
+/// predecessor basic blocks.
+bool PHIElimination::EliminatePHINodes(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+ if (MBB.empty() || !MBB.front().isPHI())
+ return false; // Quick exit for basic blocks without PHIs.
+
+ // Get an iterator to the last PHI node.
+ MachineBasicBlock::iterator LastPHIIt =
+ std::prev(MBB.SkipPHIsAndLabels(MBB.begin()));
+
+ while (MBB.front().isPHI())
+ LowerPHINode(MBB, LastPHIIt);
+
+ return true;
+}
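+
+// Sketch of the lowering performed by LowerPHINode (simplified MIR):
+//
+//   bb.2:
+//     %dst = PHI %a, %bb.0, %b, %bb.1
+// becomes
+//   bb.0:                          bb.1:
+//     %join = COPY %a                %join = COPY %b
+//   bb.2:
+//     %dst = COPY %join
+//
+// The copy in bb.2 is placed after any remaining PHIs, and each predecessor
+// copy is placed at the point chosen by findPHICopyInsertPoint (normally just
+// before the first terminator of that block).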
+
+/// Return true if all defs of VirtReg are implicit-defs.
+/// This includes registers with no defs.
+static bool isImplicitlyDefined(unsigned VirtReg,
+ const MachineRegisterInfo &MRI) {
+ for (MachineInstr &DI : MRI.def_instructions(VirtReg))
+ if (!DI.isImplicitDef())
+ return false;
+ return true;
+}
+
+/// Return true if all sources of the phi node are implicit_def's, or undef's.
+static bool allPhiOperandsUndefined(const MachineInstr &MPhi,
+ const MachineRegisterInfo &MRI) {
+ for (unsigned I = 1, E = MPhi.getNumOperands(); I != E; I += 2) {
+ const MachineOperand &MO = MPhi.getOperand(I);
+ if (!isImplicitlyDefined(MO.getReg(), MRI) && !MO.isUndef())
+ return false;
+ }
+ return true;
+}
+/// LowerPHINode - Lower the PHI node at the top of the specified block.
+void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator LastPHIIt) {
+ ++NumLowered;
+
+ MachineBasicBlock::iterator AfterPHIsIt = std::next(LastPHIIt);
+
+ // Unlink the PHI node from the basic block, but don't delete the PHI yet.
+ MachineInstr *MPhi = MBB.remove(&*MBB.begin());
+
+ unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2;
+ Register DestReg = MPhi->getOperand(0).getReg();
+ assert(MPhi->getOperand(0).getSubReg() == 0 && "Can't handle sub-reg PHIs");
+ bool isDead = MPhi->getOperand(0).isDead();
+
+ // Create a new register for the incoming PHI arguments.
+ MachineFunction &MF = *MBB.getParent();
+ unsigned IncomingReg = 0;
+ bool reusedIncoming = false; // Is IncomingReg reused from an earlier PHI?
+
+ // Insert a register to register copy at the top of the current block (but
+ // after any remaining phi nodes) which copies the new incoming register
+ // into the phi node destination.
+ MachineInstr *PHICopy = nullptr;
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ if (allPhiOperandsUndefined(*MPhi, *MRI))
+ // If all sources of a PHI node are implicit_def or undef uses, just emit an
+ // implicit_def instead of a copy.
+ PHICopy = BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), DestReg);
+ else {
+ // Can we reuse an earlier PHI node? This only happens for critical edges,
+ // typically those created by tail duplication.
+ unsigned &entry = LoweredPHIs[MPhi];
+ if (entry) {
+ // An identical PHI node was already lowered. Reuse the incoming register.
+ IncomingReg = entry;
+ reusedIncoming = true;
+ ++NumReused;
+ LLVM_DEBUG(dbgs() << "Reusing " << printReg(IncomingReg) << " for "
+ << *MPhi);
+ } else {
+ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg);
+ entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
+ }
+ // Give the target the possibility to handle special cases; fall through
+ // otherwise.
+ PHICopy = TII->createPHIDestinationCopy(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+ IncomingReg, DestReg);
+ }
+
+ if (MPhi->peekDebugInstrNum()) {
+ // If referred to by debug-info, store where this PHI was.
+ MachineFunction *MF = MBB.getParent();
+ unsigned ID = MPhi->peekDebugInstrNum();
+ auto P = MachineFunction::DebugPHIRegallocPos(&MBB, IncomingReg, 0);
+ auto Res = MF->DebugPHIPositions.insert({ID, P});
+ assert(Res.second);
+ (void)Res;
+ }
+
+ // Update live variable information if there is any.
+ if (LV) {
+ if (IncomingReg) {
+ LiveVariables::VarInfo &VI = LV->getVarInfo(IncomingReg);
+
+ // Increment use count of the newly created virtual register.
+ LV->setPHIJoin(IncomingReg);
+
+ MachineInstr *OldKill = nullptr;
+ bool IsPHICopyAfterOldKill = false;
+
+ if (reusedIncoming && (OldKill = VI.findKill(&MBB))) {
+ // Calculate whether the PHICopy is after the OldKill.
+ // In general, the PHICopy is inserted as the first non-phi instruction
+ // by default, so it's before the OldKill. But some Target hooks for
+ // createPHIDestinationCopy() may modify the default insert position of
+ // PHICopy.
+ for (auto I = MBB.SkipPHIsAndLabels(MBB.begin()), E = MBB.end();
+ I != E; ++I) {
+ if (I == PHICopy)
+ break;
+
+ if (I == OldKill) {
+ IsPHICopyAfterOldKill = true;
+ break;
+ }
+ }
+ }
+
+ // When we are reusing the incoming register and it has been marked killed
+ // by OldKill, if the PHICopy is after the OldKill, we should remove the
+ // killed flag from OldKill.
+ if (IsPHICopyAfterOldKill) {
+ LLVM_DEBUG(dbgs() << "Remove old kill from " << *OldKill);
+ LV->removeVirtualRegisterKilled(IncomingReg, *OldKill);
+ LLVM_DEBUG(MBB.dump());
+ }
+
+ // Add information to LiveVariables to know that the first used incoming
+ // value or the reused incoming value whose PHICopy is after the OldKill
+ // is killed. Note that because the value is defined in several places
+ // (once for each incoming block), the "def" block and instruction
+ // fields of the VarInfo are not filled in.
+ if (!OldKill || IsPHICopyAfterOldKill)
+ LV->addVirtualRegisterKilled(IncomingReg, *PHICopy);
+ }
+
+ // Since we are going to be deleting the PHI node, if it is the last use of
+ // any registers, or if the value itself is dead, we need to move this
+ // information over to the new copy we just inserted.
+ LV->removeVirtualRegistersKilled(*MPhi);
+
+ // If the result is dead, update LV.
+ if (isDead) {
+ LV->addVirtualRegisterDead(DestReg, *PHICopy);
+ LV->removeVirtualRegisterDead(DestReg, *MPhi);
+ }
+ }
+
+ // Update LiveIntervals for the new copy or implicit def.
+ if (LIS) {
+ SlotIndex DestCopyIndex = LIS->InsertMachineInstrInMaps(*PHICopy);
+
+ SlotIndex MBBStartIndex = LIS->getMBBStartIdx(&MBB);
+ if (IncomingReg) {
+ // Add the region from the beginning of MBB to the copy instruction to
+ // IncomingReg's live interval.
+ LiveInterval &IncomingLI = LIS->createEmptyInterval(IncomingReg);
+ VNInfo *IncomingVNI = IncomingLI.getVNInfoAt(MBBStartIndex);
+ if (!IncomingVNI)
+ IncomingVNI = IncomingLI.getNextValue(MBBStartIndex,
+ LIS->getVNInfoAllocator());
+ IncomingLI.addSegment(LiveInterval::Segment(MBBStartIndex,
+ DestCopyIndex.getRegSlot(),
+ IncomingVNI));
+ }
+
+ LiveInterval &DestLI = LIS->getInterval(DestReg);
+ assert(!DestLI.empty() && "PHIs should have nonempty LiveIntervals.");
+ if (DestLI.endIndex().isDead()) {
+ // A dead PHI's live range begins and ends at the start of the MBB, but
+ // the lowered copy, which will still be dead, needs to begin and end at
+ // the copy instruction.
+ VNInfo *OrigDestVNI = DestLI.getVNInfoAt(MBBStartIndex);
+ assert(OrigDestVNI && "PHI destination should be live at block entry.");
+ DestLI.removeSegment(MBBStartIndex, MBBStartIndex.getDeadSlot());
+ DestLI.createDeadDef(DestCopyIndex.getRegSlot(),
+ LIS->getVNInfoAllocator());
+ DestLI.removeValNo(OrigDestVNI);
+ } else {
+ // Otherwise, remove the region from the beginning of MBB to the copy
+ // instruction from DestReg's live interval.
+ DestLI.removeSegment(MBBStartIndex, DestCopyIndex.getRegSlot());
+ VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
+ assert(DestVNI && "PHI destination should be live at its definition.");
+ DestVNI->def = DestCopyIndex.getRegSlot();
+ }
+ }
+
+ // Adjust the VRegPHIUseCount map to account for the removal of this PHI node.
+ for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
+ if (!MPhi->getOperand(i).isUndef()) {
+ --VRegPHIUseCount[BBVRegPair(
+ MPhi->getOperand(i + 1).getMBB()->getNumber(),
+ MPhi->getOperand(i).getReg())];
+ }
+ }
+
+ // Now loop over all of the incoming arguments, changing them to copy into the
+ // IncomingReg register in the corresponding predecessor basic block.
+ SmallPtrSet<MachineBasicBlock*, 8> MBBsInsertedInto;
+ for (int i = NumSrcs - 1; i >= 0; --i) {
+ Register SrcReg = MPhi->getOperand(i * 2 + 1).getReg();
+ unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg();
+ bool SrcUndef = MPhi->getOperand(i*2+1).isUndef() ||
+ isImplicitlyDefined(SrcReg, *MRI);
+ assert(SrcReg.isVirtual() &&
+ "Machine PHI Operands must all be virtual registers!");
+
+ // Get the MachineBasicBlock equivalent of the BasicBlock that is the source
+ // path into the PHI.
+ MachineBasicBlock &opBlock = *MPhi->getOperand(i*2+2).getMBB();
+
+ // Check to make sure we haven't already emitted the copy for this block.
+ // This can happen because PHI nodes may have multiple entries for the same
+ // basic block.
+ if (!MBBsInsertedInto.insert(&opBlock).second)
+ continue; // If the copy has already been emitted, we're done.
+
+ MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
+ if (SrcRegDef && TII->isUnspillableTerminator(SrcRegDef)) {
+ assert(SrcRegDef->getOperand(0).isReg() &&
+ SrcRegDef->getOperand(0).isDef() &&
+ "Expected operand 0 to be a reg def!");
+ // Now that the PHI's use has been removed (as the instruction was
+ // removed) there should be no other uses of the SrcReg.
+ assert(MRI->use_empty(SrcReg) &&
+ "Expected a single use from UnspillableTerminator");
+ SrcRegDef->getOperand(0).setReg(IncomingReg);
+
+ // Update LiveVariables.
+ if (LV) {
+ LiveVariables::VarInfo &SrcVI = LV->getVarInfo(SrcReg);
+ LiveVariables::VarInfo &IncomingVI = LV->getVarInfo(IncomingReg);
+ IncomingVI.AliveBlocks = std::move(SrcVI.AliveBlocks);
+ SrcVI.AliveBlocks.clear();
+ }
+
+ continue;
+ }
+
+ // Find a safe location to insert the copy, this may be the first terminator
+ // in the block (or end()).
+ MachineBasicBlock::iterator InsertPos =
+ findPHICopyInsertPoint(&opBlock, &MBB, SrcReg);
+
+ // Insert the copy.
+ MachineInstr *NewSrcInstr = nullptr;
+ if (!reusedIncoming && IncomingReg) {
+ if (SrcUndef) {
+ // The source register is undefined, so there is no need for a real
+ // COPY, but we still need to ensure joint dominance by defs.
+ // Insert an IMPLICIT_DEF instruction.
+ NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF),
+ IncomingReg);
+
+ // Clean up the old implicit-def, if there even was one.
+ if (MachineInstr *DefMI = MRI->getVRegDef(SrcReg))
+ if (DefMI->isImplicitDef())
+ ImpDefs.insert(DefMI);
+ } else {
+ // Delete the debug location, since the copy is inserted into a
+ // different basic block.
+ NewSrcInstr = TII->createPHISourceCopy(opBlock, InsertPos, nullptr,
+ SrcReg, SrcSubReg, IncomingReg);
+ }
+ }
+
+ // We only need to update the LiveVariables kill of SrcReg if this was the
+ // last PHI use of SrcReg to be lowered on this CFG edge and it is not live
+ // out of the predecessor. We can also ignore undef sources.
+ if (LV && !SrcUndef &&
+ !VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)] &&
+ !LV->isLiveOut(SrcReg, opBlock)) {
+ // We want to be able to insert a kill of the register if this PHI (aka,
+ // the copy we just inserted) is the last use of the source value. Live
+ // variable analysis conservatively handles this by saying that the value
+ // is live until the end of the block the PHI entry lives in. If the value
+ // really is dead at the PHI copy, there will be no successor blocks which
+ // have the value live-in.
+
+ // Okay, if we now know that the value is not live out of the block, we
+ // can add a kill marker in this block saying that it kills the incoming
+ // value!
+
+ // In our final twist, we have to decide which instruction kills the
+ // register. In most cases this is the copy, however, terminator
+ // instructions at the end of the block may also use the value. In this
+ // case, we should mark the last such terminator as being the killing
+ // block, not the copy.
+ MachineBasicBlock::iterator KillInst = opBlock.end();
+ for (MachineBasicBlock::iterator Term = InsertPos; Term != opBlock.end();
+ ++Term) {
+ if (Term->readsRegister(SrcReg))
+ KillInst = Term;
+ }
+
+ if (KillInst == opBlock.end()) {
+ // No terminator uses the register.
+
+ if (reusedIncoming || !IncomingReg) {
+ // We may have to rewind a bit if we didn't insert a copy this time.
+ KillInst = InsertPos;
+ while (KillInst != opBlock.begin()) {
+ --KillInst;
+ if (KillInst->isDebugInstr())
+ continue;
+ if (KillInst->readsRegister(SrcReg))
+ break;
+ }
+ } else {
+ // We just inserted this copy.
+ KillInst = NewSrcInstr;
+ }
+ }
+ assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction");
+
+ // Finally, mark it killed.
+ LV->addVirtualRegisterKilled(SrcReg, *KillInst);
+
+ // This vreg no longer lives all of the way through opBlock.
+ unsigned opBlockNum = opBlock.getNumber();
+ LV->getVarInfo(SrcReg).AliveBlocks.reset(opBlockNum);
+ }
+
+ if (LIS) {
+ if (NewSrcInstr) {
+ LIS->InsertMachineInstrInMaps(*NewSrcInstr);
+ LIS->addSegmentToEndOfBlock(IncomingReg, *NewSrcInstr);
+ }
+
+ if (!SrcUndef &&
+ !VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)]) {
+ LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+
+ bool isLiveOut = false;
+ for (MachineBasicBlock *Succ : opBlock.successors()) {
+ SlotIndex startIdx = LIS->getMBBStartIdx(Succ);
+ VNInfo *VNI = SrcLI.getVNInfoAt(startIdx);
+
+ // Definitions by other PHIs are not truly live-in for our purposes.
+ if (VNI && VNI->def != startIdx) {
+ isLiveOut = true;
+ break;
+ }
+ }
+
+ if (!isLiveOut) {
+ MachineBasicBlock::iterator KillInst = opBlock.end();
+ for (MachineBasicBlock::iterator Term = InsertPos;
+ Term != opBlock.end(); ++Term) {
+ if (Term->readsRegister(SrcReg))
+ KillInst = Term;
+ }
+
+ if (KillInst == opBlock.end()) {
+ // No terminator uses the register.
+
+ if (reusedIncoming || !IncomingReg) {
+ // We may have to rewind a bit if we didn't just insert a copy.
+ KillInst = InsertPos;
+ while (KillInst != opBlock.begin()) {
+ --KillInst;
+ if (KillInst->isDebugInstr())
+ continue;
+ if (KillInst->readsRegister(SrcReg))
+ break;
+ }
+ } else {
+ // We just inserted this copy.
+ KillInst = std::prev(InsertPos);
+ }
+ }
+ assert(KillInst->readsRegister(SrcReg) &&
+ "Cannot find kill instruction");
+
+ SlotIndex LastUseIndex = LIS->getInstructionIndex(*KillInst);
+ SrcLI.removeSegment(LastUseIndex.getRegSlot(),
+ LIS->getMBBEndIdx(&opBlock));
+ }
+ }
+ }
+ }
+
+ // Really delete the PHI instruction now, if it is not in the LoweredPHIs map.
+ if (reusedIncoming || !IncomingReg) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MPhi);
+ MF.deleteMachineInstr(MPhi);
+ }
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes in the function.
+/// In particular, for each virtual register used by a PHI node, count the
+/// number of such uses coming from each predecessor BB. This is used later to
+/// determine when the vreg is killed in that BB.
+void PHIElimination::analyzePHINodes(const MachineFunction& MF) {
+ for (const auto &MBB : MF) {
+ for (const auto &BBI : MBB) {
+ if (!BBI.isPHI())
+ break;
+ for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2) {
+ if (!BBI.getOperand(i).isUndef()) {
+ ++VRegPHIUseCount[BBVRegPair(
+ BBI.getOperand(i + 1).getMBB()->getNumber(),
+ BBI.getOperand(i).getReg())];
+ }
+ }
+ }
+ }
+}
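+
+// For example, if %v appears as an incoming value from bb.3 in two different
+// PHIs, VRegPHIUseCount[{3, %v}] is 2 after this analysis. LowerPHINode
+// decrements the count as each PHI is lowered and only considers marking %v
+// killed in bb.3 once the count for that pair drops to zero.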
+
+bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineLoopInfo *MLI,
+ std::vector<SparseBitVector<>> *LiveInSets) {
+ if (MBB.empty() || !MBB.front().isPHI() || MBB.isEHPad())
+ return false; // Quick exit for basic blocks without PHIs.
+
+ const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : nullptr;
+ bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader();
+
+ bool Changed = false;
+ for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end();
+ BBI != BBE && BBI->isPHI(); ++BBI) {
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
+ Register Reg = BBI->getOperand(i).getReg();
+ MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB();
+ // Is there a critical edge from PreMBB to MBB?
+ if (PreMBB->succ_size() == 1)
+ continue;
+
+ // Avoid splitting backedges of loops. It would introduce small
+ // out-of-line blocks into the loop which is very bad for code placement.
+ if (PreMBB == &MBB && !SplitAllCriticalEdges)
+ continue;
+ const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : nullptr;
+ if (IsLoopHeader && PreLoop == CurLoop && !SplitAllCriticalEdges)
+ continue;
+
+ // LV doesn't consider a phi use live-out, so isLiveOut only returns true
+ // when the source register is live-out for some other reason than a phi
+ // use. That means the copy we will insert in PreMBB won't be a kill, and
+ // there is a risk it may not be coalesced away.
+ //
+ // If the copy would be a kill, there is no need to split the edge.
+ bool ShouldSplit = isLiveOutPastPHIs(Reg, PreMBB);
+ if (!ShouldSplit && !NoPhiElimLiveOutEarlyExit)
+ continue;
+ if (ShouldSplit) {
+ LLVM_DEBUG(dbgs() << printReg(Reg) << " live-out before critical edge "
+ << printMBBReference(*PreMBB) << " -> "
+ << printMBBReference(MBB) << ": " << *BBI);
+ }
+
+ // If Reg is not live-in to MBB, it means it must be live-in to some
+ // other PreMBB successor, and we can avoid the interference by splitting
+ // the edge.
+ //
+ // If Reg *is* live-in to MBB, the interference is inevitable and a copy
+ // is likely to be left after coalescing. If we are looking at a loop
+ // exiting edge, split it so we won't insert code in the loop, otherwise
+ // don't bother.
+ ShouldSplit = ShouldSplit && !isLiveIn(Reg, &MBB);
+
+ // Check for a loop exiting edge.
+ if (!ShouldSplit && CurLoop != PreLoop) {
+ LLVM_DEBUG({
+ dbgs() << "Split wouldn't help, maybe avoid loop copies?\n";
+ if (PreLoop)
+ dbgs() << "PreLoop: " << *PreLoop;
+ if (CurLoop)
+ dbgs() << "CurLoop: " << *CurLoop;
+ });
+ // This edge could be entering a loop, exiting a loop, or it could be
+ // both: jumping directly from one loop to the header of a sibling
+ // loop.
+ // Split unless this edge is entering CurLoop from an outer loop.
+ ShouldSplit = PreLoop && !PreLoop->contains(CurLoop);
+ }
+ if (!ShouldSplit && !SplitAllCriticalEdges)
+ continue;
+ if (!PreMBB->SplitCriticalEdge(&MBB, *this, LiveInSets)) {
+ LLVM_DEBUG(dbgs() << "Failed to split critical edge.\n");
+ continue;
+ }
+ Changed = true;
+ ++NumCriticalEdgesSplit;
+ }
+ }
+ return Changed;
+}
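+
+// Example of the splitting decision above (illustrative): suppose bb.0
+// branches to both bb.1 and bb.2, and bb.2 begins with
+// "%p = PHI %v, %bb.0, ...", making the edge bb.0 -> bb.2 critical. If %v is
+// live out of bb.0 for some reason other than the PHI use (say it is also
+// used in bb.1), the copy inserted in bb.0 during PHI elimination would not
+// kill %v and may fail to coalesce, so the edge is split and the copy placed
+// in the new block. If %v is additionally live into bb.2, the interference is
+// unavoidable and the edge is only split when it exits a loop.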
+
+bool PHIElimination::isLiveIn(Register Reg, const MachineBasicBlock *MBB) {
+ assert((LV || LIS) &&
+ "isLiveIn() requires either LiveVariables or LiveIntervals");
+ if (LIS)
+ return LIS->isLiveInToMBB(LIS->getInterval(Reg), MBB);
+ else
+ return LV->isLiveIn(Reg, *MBB);
+}
+
+bool PHIElimination::isLiveOutPastPHIs(Register Reg,
+ const MachineBasicBlock *MBB) {
+ assert((LV || LIS) &&
+ "isLiveOutPastPHIs() requires either LiveVariables or LiveIntervals");
+ // LiveVariables considers uses in PHIs to be in the predecessor basic block,
+ // so that a register used only in a PHI is not live out of the block. In
+ // contrast, LiveIntervals considers uses in PHIs to be on the edge rather than
+ // in the predecessor basic block, so that a register used only in a PHI is live
+ // out of the block.
+ if (LIS) {
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ for (const MachineBasicBlock *SI : MBB->successors())
+ if (LI.liveAt(LIS->getMBBStartIdx(SI)))
+ return true;
+ return false;
+ } else {
+ return LV->isLiveOut(Reg, *MBB);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PHIEliminationUtils.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PHIEliminationUtils.cpp
new file mode 100644
index 000000000000..016335f420d3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PHIEliminationUtils.cpp
@@ -0,0 +1,64 @@
+//===-- PHIEliminationUtils.cpp - Helper functions for PHI elimination ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PHIEliminationUtils.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+// findPHICopyInsertPoint - Find a safe place in MBB to insert a copy from SrcReg
+// when following the CFG edge to SuccMBB. This needs to be after any def of
+// SrcReg, but before any subsequent point where control flow might jump out of
+// the basic block.
+MachineBasicBlock::iterator
+llvm::findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB,
+ unsigned SrcReg) {
+ // Handle the trivial case trivially.
+ if (MBB->empty())
+ return MBB->begin();
+
+ // Usually, we just want to insert the copy before the first terminator
+ // instruction. However, for the edge going to a landing pad, we must insert
+ // the copy before the call/invoke instruction. Similarly for an INLINEASM_BR
+ // going to an indirect target. This is similar to SplitKit.cpp's
+ // computeLastInsertPoint, and similarly assumes that there cannot be multiple
+ // instructions that are Calls with EHPad successors or INLINEASM_BR in a
+ // block.
+ bool EHPadSuccessor = SuccMBB->isEHPad();
+ if (!EHPadSuccessor && !SuccMBB->isInlineAsmBrIndirectTarget())
+ return MBB->getFirstTerminator();
+
+ // Discover any defs in this basic block.
+ SmallPtrSet<MachineInstr *, 8> DefsInMBB;
+ MachineRegisterInfo& MRI = MBB->getParent()->getRegInfo();
+ for (MachineInstr &RI : MRI.def_instructions(SrcReg))
+ if (RI.getParent() == MBB)
+ DefsInMBB.insert(&RI);
+
+ MachineBasicBlock::iterator InsertPoint = MBB->begin();
+ // Insert the copy at the _latest_ point of:
+ // 1. Immediately AFTER the last def
+ // 2. Immediately BEFORE a call/inlineasm_br.
+ for (auto I = MBB->rbegin(), E = MBB->rend(); I != E; ++I) {
+ if (DefsInMBB.contains(&*I)) {
+ InsertPoint = std::next(I.getReverse());
+ break;
+ }
+ if ((EHPadSuccessor && I->isCall()) ||
+ I->getOpcode() == TargetOpcode::INLINEASM_BR) {
+ InsertPoint = I.getReverse();
+ break;
+ }
+ }
+
+ // Make sure the copy goes after any phi nodes but before
+ // any debug nodes.
+ return MBB->SkipPHIsAndLabels(InsertPoint);
+}
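+
+// Illustrative example: in a predecessor block such as
+//
+//     %src = SOME_DEF ...
+//     CALL ...            // successor on this edge is an EH pad
+//     JMP ...
+//
+// the copy of %src is inserted just before the call: it must follow the def
+// of %src, but control may already leave the block at the call, so the usual
+// "before the first terminator" position would be too late.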
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PHIEliminationUtils.h b/contrib/llvm-project/llvm/lib/CodeGen/PHIEliminationUtils.h
new file mode 100644
index 000000000000..0ff3a41f47d3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PHIEliminationUtils.h
@@ -0,0 +1,24 @@
+//=- PHIEliminationUtils.h - Helper functions for PHI elimination -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_PHIELIMINATIONUTILS_H
+#define LLVM_LIB_CODEGEN_PHIELIMINATIONUTILS_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+ /// findPHICopyInsertPoint - Find a safe place in MBB to insert a copy from
+ /// SrcReg when following the CFG edge to SuccMBB. This needs to be after
+ /// any def of SrcReg, but before any subsequent point where control flow
+ /// might jump out of the basic block.
+ MachineBasicBlock::iterator
+ findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB,
+ unsigned SrcReg);
+}
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ParallelCG.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ParallelCG.cpp
new file mode 100644
index 000000000000..43b23368ead2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ParallelCG.cpp
@@ -0,0 +1,97 @@
+//===-- ParallelCG.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines functions that can be used for parallel code generation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ParallelCG.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MemoryBufferRef.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SplitModule.h"
+
+using namespace llvm;
+
+static void codegen(Module *M, llvm::raw_pwrite_stream &OS,
+ function_ref<std::unique_ptr<TargetMachine>()> TMFactory,
+ CodeGenFileType FileType) {
+ std::unique_ptr<TargetMachine> TM = TMFactory();
+ assert(TM && "Failed to create target machine!");
+
+ legacy::PassManager CodeGenPasses;
+ if (TM->addPassesToEmitFile(CodeGenPasses, OS, nullptr, FileType))
+ report_fatal_error("Failed to setup codegen");
+ CodeGenPasses.run(*M);
+}
+
+void llvm::splitCodeGen(
+ Module &M, ArrayRef<llvm::raw_pwrite_stream *> OSs,
+ ArrayRef<llvm::raw_pwrite_stream *> BCOSs,
+ const std::function<std::unique_ptr<TargetMachine>()> &TMFactory,
+ CodeGenFileType FileType, bool PreserveLocals) {
+ assert(BCOSs.empty() || BCOSs.size() == OSs.size());
+
+ if (OSs.size() == 1) {
+ if (!BCOSs.empty())
+ WriteBitcodeToFile(M, *BCOSs[0]);
+ codegen(&M, *OSs[0], TMFactory, FileType);
+ return;
+ }
+
+ // Create ThreadPool in nested scope so that threads will be joined
+ // on destruction.
+ {
+ ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size()));
+ int ThreadCount = 0;
+
+ SplitModule(
+ M, OSs.size(),
+ [&](std::unique_ptr<Module> MPart) {
+ // We want to clone the module in a new context to multi-thread the
+ // codegen. We do it by serializing partition modules to bitcode
+ // (while still on the main thread, in order to avoid data races) and
+ // spinning up new threads which deserialize the partitions into
+ // separate contexts.
+ // FIXME: Provide a more direct way to do this in LLVM.
+ SmallString<0> BC;
+ raw_svector_ostream BCOS(BC);
+ WriteBitcodeToFile(*MPart, BCOS);
+
+ if (!BCOSs.empty()) {
+ BCOSs[ThreadCount]->write(BC.begin(), BC.size());
+ BCOSs[ThreadCount]->flush();
+ }
+
+ llvm::raw_pwrite_stream *ThreadOS = OSs[ThreadCount++];
+ // Enqueue the task
+ CodegenThreadPool.async(
+ [TMFactory, FileType, ThreadOS](const SmallString<0> &BC) {
+ LLVMContext Ctx;
+ Expected<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+ MemoryBufferRef(StringRef(BC.data(), BC.size()),
+ "<split-module>"),
+ Ctx);
+ if (!MOrErr)
+ report_fatal_error("Failed to read bitcode");
+ std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get());
+
+ codegen(MPartInCtx.get(), *ThreadOS, TMFactory, FileType);
+ },
+ // Pass BC using std::move to ensure that it get moved rather than
+ // copied into the thread's context.
+ std::move(BC));
+ },
+ PreserveLocals);
+ }
+}
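+
+// Usage sketch (hypothetical caller, error handling omitted): with N output
+// streams the module is split into N partitions, each compiled on its own
+// thread.
+//
+//   SmallVector<raw_pwrite_stream *, 4> OSPtrs;
+//   for (std::unique_ptr<raw_pwrite_stream> &S : ObjectStreams)
+//     OSPtrs.push_back(S.get());
+//   splitCodeGen(M, OSPtrs, /*BCOSs=*/{},
+//                [&] { return createMyTargetMachine(); }, CGFT_ObjectFile);
+//
+// ObjectStreams and createMyTargetMachine are stand-ins for whatever the
+// caller uses to open its outputs and configure the target; the factory is
+// invoked once per worker, so each thread gets its own TargetMachine.
+// Passing a non-empty BCOSs array additionally writes each partition's
+// bitcode to the corresponding stream.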
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PatchableFunction.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PatchableFunction.cpp
new file mode 100644
index 000000000000..9449f143366f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PatchableFunction.cpp
@@ -0,0 +1,98 @@
+//===-- PatchableFunction.cpp - Patchable prologues for LLVM -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that edits function bodies in place to support
+// the "patchable-function" attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+namespace {
+struct PatchableFunction : public MachineFunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ PatchableFunction() : MachineFunctionPass(ID) {
+ initializePatchableFunctionPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+};
+}
+
+bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getFunction().hasFnAttribute("patchable-function-entry")) {
+ MachineBasicBlock &FirstMBB = *MF.begin();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ // The initial .loc covers PATCHABLE_FUNCTION_ENTER.
+ BuildMI(FirstMBB, FirstMBB.begin(), DebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_FUNCTION_ENTER));
+ return true;
+ }
+
+ if (!MF.getFunction().hasFnAttribute("patchable-function"))
+ return false;
+
+#ifndef NDEBUG
+ Attribute PatchAttr = MF.getFunction().getFnAttribute("patchable-function");
+ StringRef PatchType = PatchAttr.getValueAsString();
+ assert(PatchType == "prologue-short-redirect" && "Only possibility today!");
+#endif
+
+ auto &FirstMBB = *MF.begin();
+ auto *TII = MF.getSubtarget().getInstrInfo();
+
+ MachineBasicBlock::iterator FirstActualI = llvm::find_if(
+ FirstMBB, [](const MachineInstr &MI) { return !MI.isMetaInstruction(); });
+
+ if (FirstActualI == FirstMBB.end()) {
+ // Per the Microsoft documentation on the /hotpatch feature, we must ensure
+ // that "the first instruction of each function is at least two bytes, and no
+ // jump within the function goes to the first instruction".
+
+ // When the first MBB is empty, insert a patchable no-op. This ensures the
+ // first instruction is patchable in two special cases:
+ // - the function is empty (e.g. unreachable)
+ // - the function jumps back to the first instruction, which is in a
+ // successor MBB.
+ BuildMI(&FirstMBB, DebugLoc(), TII->get(TargetOpcode::PATCHABLE_OP))
+ .addImm(2)
+ .addImm(TargetOpcode::PATCHABLE_OP);
+ MF.ensureAlignment(Align(16));
+ return true;
+ }
+
+ auto MIB = BuildMI(FirstMBB, FirstActualI, FirstActualI->getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_OP))
+ .addImm(2)
+ .addImm(FirstActualI->getOpcode());
+
+ for (auto &MO : FirstActualI->operands())
+ MIB.add(MO);
+
+ FirstActualI->eraseFromParent();
+ MF.ensureAlignment(Align(16));
+ return true;
+}
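+
+// Rough before/after for "patchable-function"="prologue-short-redirect"
+// (schematic MIR, operands elided):
+//
+//   Before:  FIRST_REAL_INSTR <operands...>
+//   After:   PATCHABLE_OP 2, <opcode of FIRST_REAL_INSTR>, <operands...>
+//
+// The target-specific lowering of PATCHABLE_OP is then expected to guarantee
+// that at least two bytes are emitted at this position (using a longer
+// encoding or a no-op when the wrapped instruction would be shorter), which
+// is what allows a hot-patching tool to overwrite the start of the function
+// with a short jump.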
+
+char PatchableFunction::ID = 0;
+char &llvm::PatchableFunctionID = PatchableFunction::ID;
+INITIALIZE_PASS(PatchableFunction, "patchable-function",
+ "Implement the 'patchable-function' attribute", false, false)
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PeepholeOptimizer.cpp
new file mode 100644
index 000000000000..a08cc78f11b1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -0,0 +1,2128 @@
+//===- PeepholeOptimizer.cpp - Peephole Optimizations ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Perform peephole optimizations on the machine code:
+//
+// - Optimize Extensions
+//
+// Optimization of sign / zero extension instructions. It may be extended to
+// handle other instructions with similar properties.
+//
+// On some targets, some instructions, e.g. X86 sign / zero extension, may
+// leave the source value in the lower part of the result. This optimization
+// will replace some uses of the pre-extension value with uses of the
+// sub-register of the results.
+//
+// - Optimize Comparisons
+//
+// Optimization of comparison instructions. For instance, in this code:
+//
+// sub r1, 1
+// cmp r1, 0
+// bz L1
+//
+// If the "sub" instruction already sets (or could be modified to set) the
+// same flag that the "cmp" instruction sets and that "bz" uses, then we can
+// eliminate the "cmp" instruction.
+//
+// Another instance, in this code:
+//
+// sub r1, r3 | sub r1, imm
+// cmp r3, r1 or cmp r1, r3 | cmp r1, imm
+// bge L1
+//
+// If the branch instruction can use flag from "sub", then we can replace
+// "sub" with "subs" and eliminate the "cmp" instruction.
+//
+// - Optimize Loads:
+//
+// Loads that can be folded into a later instruction. A load is foldable
+// if it loads to virtual registers and the virtual register defined has
+// a single use.
+//
+// - Optimize Copies and Bitcast (more generally, target specific copies):
+//
+// Rewrite copies and bitcasts to avoid cross register bank copies
+// when possible.
+// E.g., Consider the following example, where capital and lower
+// letters denote different register files:
+// b = copy A <-- cross-bank copy
+// C = copy b <-- cross-bank copy
+// =>
+// b = copy A <-- cross-bank copy
+// C = copy A <-- same-bank copy
+//
+// E.g., for bitcast:
+// b = bitcast A <-- cross-bank copy
+// C = bitcast b <-- cross-bank copy
+// =>
+// b = bitcast A <-- cross-bank copy
+// C = copy A <-- same-bank copy
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
+using RegSubRegPairAndIdx = TargetInstrInfo::RegSubRegPairAndIdx;
+
+#define DEBUG_TYPE "peephole-opt"
+
+// Optimize Extensions
+static cl::opt<bool>
+Aggressive("aggressive-ext-opt", cl::Hidden,
+ cl::desc("Aggressive extension optimization"));
+
+static cl::opt<bool>
+DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
+ cl::desc("Disable the peephole optimizer"));
+
+/// Specify whether or not the value tracking looks through
+/// complex instructions. When this is true, the value tracker
+/// bails on everything that is not a copy or a bitcast.
+static cl::opt<bool>
+DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false),
+ cl::desc("Disable advanced copy optimization"));
+
+static cl::opt<bool> DisableNAPhysCopyOpt(
+ "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false),
+ cl::desc("Disable non-allocatable physical register copy optimization"));
+
+// Limit the number of PHI instructions to process
+// in PeepholeOptimizer::getNextSource.
+static cl::opt<unsigned> RewritePHILimit(
+ "rewrite-phi-limit", cl::Hidden, cl::init(10),
+ cl::desc("Limit the length of PHI chains to lookup"));
+
+// Limit the length of recurrence chain when evaluating the benefit of
+// commuting operands.
+static cl::opt<unsigned> MaxRecurrenceChain(
+ "recurrence-chain-limit", cl::Hidden, cl::init(3),
+ cl::desc("Maximum length of recurrence chain when evaluating the benefit "
+ "of commuting operands"));
+
+
+STATISTIC(NumReuse, "Number of extension results reused");
+STATISTIC(NumCmps, "Number of compares eliminated");
+STATISTIC(NumImmFold, "Number of move immediate folded");
+STATISTIC(NumLoadFold, "Number of loads folded");
+STATISTIC(NumSelects, "Number of selects optimized");
+STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized");
+STATISTIC(NumRewrittenCopies, "Number of copies rewritten");
+STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed");
+
+namespace {
+
+ class ValueTrackerResult;
+ class RecurrenceInstr;
+
+ class PeepholeOptimizer : public MachineFunctionPass {
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ MachineDominatorTree *DT = nullptr; // Machine dominator tree
+ MachineLoopInfo *MLI = nullptr;
+
+ public:
+ static char ID; // Pass identification
+
+ PeepholeOptimizer() : MachineFunctionPass(ID) {
+ initializePeepholeOptimizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ if (Aggressive) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
+
+ /// Track Def -> Use info used for rewriting copies.
+ using RewriteMapTy = SmallDenseMap<RegSubRegPair, ValueTrackerResult>;
+
+ /// Sequence of instructions that formulate a recurrence cycle.
+ using RecurrenceCycle = SmallVector<RecurrenceInstr, 4>;
+
+ private:
+ bool optimizeCmpInstr(MachineInstr &MI);
+ bool optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB,
+ SmallPtrSetImpl<MachineInstr*> &LocalMIs);
+ bool optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+ bool optimizeCondBranch(MachineInstr &MI);
+ bool optimizeCoalescableCopy(MachineInstr &MI);
+ bool optimizeUncoalescableCopy(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+ bool optimizeRecurrence(MachineInstr &PHI);
+ bool findNextSource(RegSubRegPair RegSubReg, RewriteMapTy &RewriteMap);
+ bool isMoveImmediate(MachineInstr &MI, SmallSet<Register, 4> &ImmDefRegs,
+ DenseMap<Register, MachineInstr *> &ImmDefMIs);
+ bool foldImmediate(MachineInstr &MI, SmallSet<Register, 4> &ImmDefRegs,
+ DenseMap<Register, MachineInstr *> &ImmDefMIs);
+
+ /// Finds recurrence cycles, but only ones that are formulated around
+ /// a def operand and a use operand that are tied. If there is a use
+ /// operand commutable with the tied use operand, find the recurrence
+ /// cycle along that operand as well.
+ bool findTargetRecurrence(Register Reg,
+ const SmallSet<Register, 2> &TargetReg,
+ RecurrenceCycle &RC);
+
+ /// If copy instruction \p MI is a virtual register copy or a copy of a
+ /// constant physical register to a virtual register, track it in the
+ /// \p CopyMIs map. If the source of this copy was previously seen as the
+ /// source of another copy, replace the uses of this copy's destination
+ /// with the previously seen copy's destination register.
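+ ///
+ /// For illustration only (hypothetical virtual registers):
+ /// %1 = COPY %0
+ /// ...
+ /// %2 = COPY %0
+ /// Here the second copy is redundant; uses of %2 can be replaced with %1
+ /// when the destination register classes match.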
+ bool foldRedundantCopy(MachineInstr &MI,
+ DenseMap<RegSubRegPair, MachineInstr *> &CopyMIs);
+
+ /// Is the register \p Reg a non-allocatable physical register?
+ bool isNAPhysCopy(Register Reg);
+
+ /// If copy instruction \p MI is a non-allocatable virtual<->physical
+ /// register copy, track it in the \p NAPhysToVirtMIs map. If this
+ /// non-allocatable physical register was previously copied to a virtual
+ /// register and hasn't been clobbered, the virt->phys copy can be
+ /// deleted.
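+ ///
+ /// For illustration only ($nap denotes a hypothetical non-allocatable
+ /// physical register):
+ /// %0 = COPY $nap
+ /// ... ; no intervening redefinition of $nap
+ /// $nap = COPY %0
+ /// The second (virt->phys) copy can then be erased.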
+ bool foldRedundantNAPhysCopy(
+ MachineInstr &MI, DenseMap<Register, MachineInstr *> &NAPhysToVirtMIs);
+
+ bool isLoadFoldable(MachineInstr &MI,
+ SmallSet<Register, 16> &FoldAsLoadDefCandidates);
+
+ /// Check whether \p MI is understood by the register coalescer
+ /// but may require some rewriting.
+ bool isCoalescableCopy(const MachineInstr &MI) {
+ // SubregToRegs are not interesting, because they are already register
+ // coalescer friendly.
+ return MI.isCopy() || (!DisableAdvCopyOpt &&
+ (MI.isRegSequence() || MI.isInsertSubreg() ||
+ MI.isExtractSubreg()));
+ }
+
+ /// Check whether \p MI is a copy-like instruction that is
+ /// not recognized by the register coalescer.
+ bool isUncoalescableCopy(const MachineInstr &MI) {
+ return MI.isBitcast() ||
+ (!DisableAdvCopyOpt &&
+ (MI.isRegSequenceLike() || MI.isInsertSubregLike() ||
+ MI.isExtractSubregLike()));
+ }
+
+ MachineInstr &rewriteSource(MachineInstr &CopyLike,
+ RegSubRegPair Def, RewriteMapTy &RewriteMap);
+ };
+
+ /// Helper class to hold instructions that are inside recurrence cycles.
+ /// The recurrence cycle is formulated around 1) a def operand and its
+ /// tied use operand, or 2) a def operand and a use operand that is commutable
+ /// with another use operand which is tied to the def operand. In the latter
+ /// case, the indices of the tied use operand and the commutable use operand
+ /// are maintained in CommutePair.
+ class RecurrenceInstr {
+ public:
+ using IndexPair = std::pair<unsigned, unsigned>;
+
+ RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+ RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+ : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+ MachineInstr *getMI() const { return MI; }
+ std::optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+ private:
+ MachineInstr *MI;
+ std::optional<IndexPair> CommutePair;
+ };
+
+ /// Helper class to hold a reply for ValueTracker queries.
+ /// Contains the returned sources for a given search and the instruction
+ /// from which the sources were tracked.
+ class ValueTrackerResult {
+ private:
+ /// Track all sources found by one ValueTracker query.
+ SmallVector<RegSubRegPair, 2> RegSrcs;
+
+ /// Instruction using the sources in 'RegSrcs'.
+ const MachineInstr *Inst = nullptr;
+
+ public:
+ ValueTrackerResult() = default;
+
+ ValueTrackerResult(Register Reg, unsigned SubReg) {
+ addSource(Reg, SubReg);
+ }
+
+ bool isValid() const { return getNumSources() > 0; }
+
+ void setInst(const MachineInstr *I) { Inst = I; }
+ const MachineInstr *getInst() const { return Inst; }
+
+ void clear() {
+ RegSrcs.clear();
+ Inst = nullptr;
+ }
+
+ void addSource(Register SrcReg, unsigned SrcSubReg) {
+ RegSrcs.push_back(RegSubRegPair(SrcReg, SrcSubReg));
+ }
+
+ void setSource(int Idx, Register SrcReg, unsigned SrcSubReg) {
+ assert(Idx < getNumSources() && "Reg pair source out of index");
+ RegSrcs[Idx] = RegSubRegPair(SrcReg, SrcSubReg);
+ }
+
+ int getNumSources() const { return RegSrcs.size(); }
+
+ RegSubRegPair getSrc(int Idx) const {
+ return RegSrcs[Idx];
+ }
+
+ Register getSrcReg(int Idx) const {
+ assert(Idx < getNumSources() && "Reg source out of index");
+ return RegSrcs[Idx].Reg;
+ }
+
+ unsigned getSrcSubReg(int Idx) const {
+ assert(Idx < getNumSources() && "SubReg source out of index");
+ return RegSrcs[Idx].SubReg;
+ }
+
+ bool operator==(const ValueTrackerResult &Other) const {
+ if (Other.getInst() != getInst())
+ return false;
+
+ if (Other.getNumSources() != getNumSources())
+ return false;
+
+ for (int i = 0, e = Other.getNumSources(); i != e; ++i)
+ if (Other.getSrcReg(i) != getSrcReg(i) ||
+ Other.getSrcSubReg(i) != getSrcSubReg(i))
+ return false;
+ return true;
+ }
+ };
+
+ /// Helper class to track the possible sources of a value defined by
+ /// a (chain of) copy-related instructions.
+ /// Given a definition (instruction and definition index), this class
+ /// follows the use-def chain to find successive suitable sources.
+ /// The given source can be used to rewrite the definition into
+ /// def = COPY src.
+ ///
+ /// For instance, let us consider the following snippet:
+ /// v0 =
+ /// v2 = INSERT_SUBREG v1, v0, sub0
+ /// def = COPY v2.sub0
+ ///
+ /// Using a ValueTracker for def = COPY v2.sub0 will give the following
+ /// suitable sources:
+ /// v2.sub0 and v0.
+ /// Then, def can be rewritten into def = COPY v0.
+ class ValueTracker {
+ private:
+ /// The current point into the use-def chain.
+ const MachineInstr *Def = nullptr;
+
+ /// The index of the definition in Def.
+ unsigned DefIdx = 0;
+
+ /// The sub register index of the definition.
+ unsigned DefSubReg;
+
+ /// The register where the value can be found.
+ Register Reg;
+
+ /// MachineRegisterInfo used to perform tracking.
+ const MachineRegisterInfo &MRI;
+
+ /// Optional TargetInstrInfo used to perform some complex tracking.
+ const TargetInstrInfo *TII;
+
+ /// Dispatcher to the right underlying implementation of getNextSource.
+ ValueTrackerResult getNextSourceImpl();
+
+ /// Specialized version of getNextSource for Copy instructions.
+ ValueTrackerResult getNextSourceFromCopy();
+
+ /// Specialized version of getNextSource for Bitcast instructions.
+ ValueTrackerResult getNextSourceFromBitcast();
+
+ /// Specialized version of getNextSource for RegSequence instructions.
+ ValueTrackerResult getNextSourceFromRegSequence();
+
+ /// Specialized version of getNextSource for InsertSubreg instructions.
+ ValueTrackerResult getNextSourceFromInsertSubreg();
+
+ /// Specialized version of getNextSource for ExtractSubreg instructions.
+ ValueTrackerResult getNextSourceFromExtractSubreg();
+
+ /// Specialized version of getNextSource for SubregToReg instructions.
+ ValueTrackerResult getNextSourceFromSubregToReg();
+
+ /// Specialized version of getNextSource for PHI instructions.
+ ValueTrackerResult getNextSourceFromPHI();
+
+ public:
+ /// Create a ValueTracker instance for the value defined by \p Reg.
+ /// \p DefSubReg represents the sub register index the value tracker will
+ /// track. It does not need to match the sub register index used in the
+ /// definition of \p Reg.
+ /// If \p Reg is a physical register, a value tracker constructed with
+ /// this constructor will not find any alternative source.
+ /// Indeed, when \p Reg is a physical register, this constructor does not
+ /// know which definition of \p Reg it should track.
+ ValueTracker(Register Reg, unsigned DefSubReg,
+ const MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII = nullptr)
+ : DefSubReg(DefSubReg), Reg(Reg), MRI(MRI), TII(TII) {
+ if (!Reg.isPhysical()) {
+ Def = MRI.getVRegDef(Reg);
+ DefIdx = MRI.def_begin(Reg).getOperandNo();
+ }
+ }
+
+ /// Following the use-def chain, get the next available source
+ /// for the tracked value.
+ /// \return A ValueTrackerResult containing a set of registers
+ /// and sub registers with tracked values. A ValueTrackerResult with
+ /// an empty set of registers means no source was found.
+ ValueTrackerResult getNextSource();
+ };
+
+} // end anonymous namespace
+
+char PeepholeOptimizer::ID = 0;
+
+char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
+
+INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
+ "Peephole Optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
+ "Peephole Optimizations", false, false)
+
+/// If the instruction is a copy-like instruction, i.e. it reads a single register
+/// and writes a single register and it does not modify the source, and if the
+/// source value is preserved as a sub-register of the result, then replace all
+/// reachable uses of the source with the subreg of the result.
+///
+/// Do not generate an EXTRACT that is used only in a debug use, as this changes
+/// the code. Since this code does not currently share EXTRACTs, just ignore all
+/// debug uses.
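+///
+/// For illustration only (hypothetical registers and opcodes):
+/// %1 = SEXT_OP %0 ; %0 is preserved as %1.sub_lo
+/// ... = USE %0
+/// becomes
+/// %1 = SEXT_OP %0
+/// %2 = COPY %1.sub_lo
+/// ... = USE %2
+/// so later passes can reuse the extension result instead of %0.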
+bool PeepholeOptimizer::
+optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB,
+ SmallPtrSetImpl<MachineInstr*> &LocalMIs) {
+ Register SrcReg, DstReg;
+ unsigned SubIdx;
+ if (!TII->isCoalescableExtInstr(MI, SrcReg, DstReg, SubIdx))
+ return false;
+
+ if (DstReg.isPhysical() || SrcReg.isPhysical())
+ return false;
+
+ if (MRI->hasOneNonDBGUse(SrcReg))
+ // No other uses.
+ return false;
+
+ // Ensure DstReg can get a register class that actually supports
+ // sub-registers. Don't change the class until we commit.
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+ DstRC = TRI->getSubClassWithSubReg(DstRC, SubIdx);
+ if (!DstRC)
+ return false;
+
+ // The ext instr may be operating on a sub-register of SrcReg as well.
+ // PPC::EXTSW is a 32 -> 64-bit sign extension, but it reads a 64-bit
+ // register.
+ // If UseSrcSubIdx is set, SubIdx also applies to SrcReg, and only uses of
+ // SrcReg:SubIdx should be replaced.
+ bool UseSrcSubIdx =
+ TRI->getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != nullptr;
+
+ // The source has other uses. See if we can replace the other uses with uses
+ // of the result of the extension.
+ SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs;
+ for (MachineInstr &UI : MRI->use_nodbg_instructions(DstReg))
+ ReachedBBs.insert(UI.getParent());
+
+ // Uses that are in the same BB as uses of the result of the instruction.
+ SmallVector<MachineOperand*, 8> Uses;
+
+ // Uses that the result of the instruction can reach.
+ SmallVector<MachineOperand*, 8> ExtendedUses;
+
+ bool ExtendLife = true;
+ for (MachineOperand &UseMO : MRI->use_nodbg_operands(SrcReg)) {
+ MachineInstr *UseMI = UseMO.getParent();
+ if (UseMI == &MI)
+ continue;
+
+ if (UseMI->isPHI()) {
+ ExtendLife = false;
+ continue;
+ }
+
+ // Only accept uses of SrcReg:SubIdx.
+ if (UseSrcSubIdx && UseMO.getSubReg() != SubIdx)
+ continue;
+
+ // It's an error to translate this:
+ //
+ // %reg1025 = <sext> %reg1024
+ // ...
+ // %reg1026 = SUBREG_TO_REG 0, %reg1024, 4
+ //
+ // into this:
+ //
+ // %reg1025 = <sext> %reg1024
+ // ...
+ // %reg1027 = COPY %reg1025:4
+ // %reg1026 = SUBREG_TO_REG 0, %reg1027, 4
+ //
+ // The problem here is that SUBREG_TO_REG is there to assert that an
+ // implicit zext occurs. It doesn't insert a zext instruction. If we allow
+ // the COPY here, it will give us the value after the <sext>, not the
+ // original value of %reg1024 before <sext>.
+ if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG)
+ continue;
+
+ MachineBasicBlock *UseMBB = UseMI->getParent();
+ if (UseMBB == &MBB) {
+ // Local uses that come after the extension.
+ if (!LocalMIs.count(UseMI))
+ Uses.push_back(&UseMO);
+ } else if (ReachedBBs.count(UseMBB)) {
+ // Non-local uses where the result of the extension is used. Always
+ // replace these unless it's a PHI.
+ Uses.push_back(&UseMO);
+ } else if (Aggressive && DT->dominates(&MBB, UseMBB)) {
+ // We may want to extend the live range of the extension result in order
+ // to replace these uses.
+ ExtendedUses.push_back(&UseMO);
+ } else {
+ // Both will be live out of the def MBB anyway. Don't extend the live range
+ // of the extension result.
+ ExtendLife = false;
+ break;
+ }
+ }
+
+ if (ExtendLife && !ExtendedUses.empty())
+ // Extend the liveness of the extension result.
+ Uses.append(ExtendedUses.begin(), ExtendedUses.end());
+
+ // Now replace all uses.
+ bool Changed = false;
+ if (!Uses.empty()) {
+ SmallPtrSet<MachineBasicBlock*, 4> PHIBBs;
+
+ // Look for PHI uses of the extended result; we don't want to extend the
+ // liveness of a PHI input. It breaks all kinds of assumptions downstream.
+ // A PHI use is expected to be the kill of its source values.
+ for (MachineInstr &UI : MRI->use_nodbg_instructions(DstReg))
+ if (UI.isPHI())
+ PHIBBs.insert(UI.getParent());
+
+ const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+ for (unsigned i = 0, e = Uses.size(); i != e; ++i) {
+ MachineOperand *UseMO = Uses[i];
+ MachineInstr *UseMI = UseMO->getParent();
+ MachineBasicBlock *UseMBB = UseMI->getParent();
+ if (PHIBBs.count(UseMBB))
+ continue;
+
+ // About to add uses of DstReg, clear DstReg's kill flags.
+ if (!Changed) {
+ MRI->clearKillFlags(DstReg);
+ MRI->constrainRegClass(DstReg, DstRC);
+ }
+
+ // SubReg defs are illegal in the machine SSA phase,
+ // so we should not generate SubReg defs.
+ //
+ // For example, for the instructions:
+ //
+ // %1:g8rc_and_g8rc_nox0 = EXTSW %0:g8rc
+ // %3:gprc_and_gprc_nor0 = COPY %0.sub_32:g8rc
+ //
+ // We should generate:
+ //
+ // %1:g8rc_and_g8rc_nox0 = EXTSW %0:g8rc
+ // %6:gprc_and_gprc_nor0 = COPY %1.sub_32:g8rc_and_g8rc_nox0
+ // %3:gprc_and_gprc_nor0 = COPY %6:gprc_and_gprc_nor0
+ //
+ if (UseSrcSubIdx)
+ RC = MRI->getRegClass(UseMI->getOperand(0).getReg());
+
+ Register NewVR = MRI->createVirtualRegister(RC);
+ BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(DstReg, 0, SubIdx);
+ if (UseSrcSubIdx)
+ UseMO->setSubReg(0);
+
+ UseMO->setReg(NewVR);
+ ++NumReuse;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+/// If the instruction is a compare and the instruction defining the value it
+/// compares already sets (or could be modified to set) the same flag as the
+/// compare, then we can remove the comparison and use the flag from that
+/// earlier instruction.
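+///
+/// For illustration only (pseudo machine IR; opcodes and the flags register
+/// are hypothetical): on a target with a subtract-and-set-flags form,
+/// %2 = SUBSrr %0, %1, implicit-def $flags
+/// CMPri %2, 0, implicit-def $flags
+/// the CMP can be removed and users of $flags can rely on the flags already
+/// produced by SUBSrr, provided nothing clobbers $flags in between.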
+bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr &MI) {
+ // If this instruction is a comparison against zero and isn't comparing a
+ // physical register, we can try to optimize it.
+ Register SrcReg, SrcReg2;
+ int64_t CmpMask, CmpValue;
+ if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
+ SrcReg.isPhysical() || SrcReg2.isPhysical())
+ return false;
+
+ // Attempt to optimize the comparison instruction.
+ LLVM_DEBUG(dbgs() << "Attempting to optimize compare: " << MI);
+ if (TII->optimizeCompareInstr(MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) {
+ LLVM_DEBUG(dbgs() << " -> Successfully optimized compare!\n");
+ ++NumCmps;
+ return true;
+ }
+
+ return false;
+}
+
+/// Optimize a select instruction.
+bool PeepholeOptimizer::optimizeSelect(MachineInstr &MI,
+ SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
+ unsigned TrueOp = 0;
+ unsigned FalseOp = 0;
+ bool Optimizable = false;
+ SmallVector<MachineOperand, 4> Cond;
+ if (TII->analyzeSelect(MI, Cond, TrueOp, FalseOp, Optimizable))
+ return false;
+ if (!Optimizable)
+ return false;
+ if (!TII->optimizeSelect(MI, LocalMIs))
+ return false;
+ LLVM_DEBUG(dbgs() << "Deleting select: " << MI);
+ MI.eraseFromParent();
+ ++NumSelects;
+ return true;
+}
+
+/// Check if a simpler conditional branch can be generated.
+bool PeepholeOptimizer::optimizeCondBranch(MachineInstr &MI) {
+ return TII->optimizeCondBranch(MI);
+}
+
+/// Try to find the next source that shares the same register file
+/// for the value defined by \p Reg and \p SubReg.
+/// When true is returned, the \p RewriteMap can be used by the client to
+/// retrieve all Def -> Use along the way up to the next source. Any found
+/// Use that is not itself a key for another entry is the next source to
+/// use. During the search for the next source, multiple sources can be found
+/// given multiple incoming sources of a PHI instruction. In this case, we
+/// look in each PHI source for the next source; all found next sources must
+/// share the same register file as \p Reg and \p SubReg. The client should
+/// then be capable of rewriting all intermediate PHIs to get the next source.
+/// \return False if no alternative sources are available. True otherwise.
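+///
+/// For illustration only (hypothetical virtual registers): given
+/// %1 = COPY %0
+/// %2 = COPY %1
+/// a query on %2 fills \p RewriteMap with %2 -> %1 and %1 -> %0; %0 is not
+/// a key itself, so it is the next source to use.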
+bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg,
+ RewriteMapTy &RewriteMap) {
+ // Do not try to find a new source for a physical register.
+ // So far we do not have any motivating example for doing that.
+ // Thus, instead of maintaining untested code, we will revisit that if
+ // that changes at some point.
+ Register Reg = RegSubReg.Reg;
+ if (Reg.isPhysical())
+ return false;
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+
+ SmallVector<RegSubRegPair, 4> SrcToLook;
+ RegSubRegPair CurSrcPair = RegSubReg;
+ SrcToLook.push_back(CurSrcPair);
+
+ unsigned PHICount = 0;
+ do {
+ CurSrcPair = SrcToLook.pop_back_val();
+ // As explained above, do not handle physical registers.
+ if (CurSrcPair.Reg.isPhysical())
+ return false;
+
+ ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI, TII);
+
+ // Follow the chain of copies until we find a more suitable source, a phi
+ // or have to abort.
+ while (true) {
+ ValueTrackerResult Res = ValTracker.getNextSource();
+ // Abort at the end of a chain (without finding a suitable source).
+ if (!Res.isValid())
+ return false;
+
+ // Insert the Def -> Use entry for the recently found source.
+ ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair);
+ if (CurSrcRes.isValid()) {
+ assert(CurSrcRes == Res && "ValueTrackerResult found must match");
+ // An existing entry with multiple sources is a PHI cycle we must avoid.
+ // Otherwise it's an entry with a valid next source we already found.
+ if (CurSrcRes.getNumSources() > 1) {
+ LLVM_DEBUG(dbgs()
+ << "findNextSource: found PHI cycle, aborting...\n");
+ return false;
+ }
+ break;
+ }
+ RewriteMap.insert(std::make_pair(CurSrcPair, Res));
+
+ // A ValueTrackerResult usually has one source unless it's the result from
+ // a PHI instruction. Add the found PHI edges to be looked up further.
+ unsigned NumSrcs = Res.getNumSources();
+ if (NumSrcs > 1) {
+ PHICount++;
+ if (PHICount >= RewritePHILimit) {
+ LLVM_DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
+ return false;
+ }
+
+ for (unsigned i = 0; i < NumSrcs; ++i)
+ SrcToLook.push_back(Res.getSrc(i));
+ break;
+ }
+
+ CurSrcPair = Res.getSrc(0);
+ // Do not extend the live-ranges of physical registers as they add
+ // constraints to the register allocator. Moreover, if we want to extend
+ // the live-range of a physical register, unlike an SSA virtual register,
+ // we would have to check that it isn't redefined before the related use.
+ if (CurSrcPair.Reg.isPhysical())
+ return false;
+
+ // Keep following the chain if the value isn't any better yet.
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
+ if (!TRI->shouldRewriteCopySrc(DefRC, RegSubReg.SubReg, SrcRC,
+ CurSrcPair.SubReg))
+ continue;
+
+ // We currently cannot deal with subreg operands on PHI instructions
+ // (see insertPHI()).
+ if (PHICount > 0 && CurSrcPair.SubReg != 0)
+ continue;
+
+ // We found a suitable source, and are done with this chain.
+ break;
+ }
+ } while (!SrcToLook.empty());
+
+ // If we did not find a more suitable source, there is nothing to optimize.
+ return CurSrcPair.Reg != Reg;
+}
+
+/// Insert a PHI instruction with incoming edges \p SrcRegs that are
+/// guaranteed to have the same register class. This is necessary whenever we
+/// successfully traverse a PHI instruction and find suitable sources coming
+/// from its edges. By inserting a new PHI, we provide a rewritten PHI def
+/// suitable to be used in a new COPY instruction.
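+///
+/// For illustration only (hypothetical registers): if the original PHI is
+/// %3 = PHI %1, %bb.1, %2, %bb.2
+/// and the suitable sources found for %1 and %2 are %10 and %20, this helper
+/// emits
+/// %4 = PHI %10, %bb.1, %20, %bb.2
+/// so the caller can rewrite the copy-like user to read %4.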
+static MachineInstr &
+insertPHI(MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
+ const SmallVectorImpl<RegSubRegPair> &SrcRegs,
+ MachineInstr &OrigPHI) {
+ assert(!SrcRegs.empty() && "No sources to create a PHI instruction?");
+
+ const TargetRegisterClass *NewRC = MRI.getRegClass(SrcRegs[0].Reg);
+ // NewRC is only correct if no subregisters are involved. findNextSource()
+ // should have rejected those cases already.
+ assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand");
+ Register NewVR = MRI.createVirtualRegister(NewRC);
+ MachineBasicBlock *MBB = OrigPHI.getParent();
+ MachineInstrBuilder MIB = BuildMI(*MBB, &OrigPHI, OrigPHI.getDebugLoc(),
+ TII.get(TargetOpcode::PHI), NewVR);
+
+ unsigned MBBOpIdx = 2;
+ for (const RegSubRegPair &RegPair : SrcRegs) {
+ MIB.addReg(RegPair.Reg, 0, RegPair.SubReg);
+ MIB.addMBB(OrigPHI.getOperand(MBBOpIdx).getMBB());
+ // Since we're extending the lifetime of RegPair.Reg, clear the
+ // kill flags to account for that and make sure RegPair.Reg reaches
+ // the new PHI.
+ MRI.clearKillFlags(RegPair.Reg);
+ MBBOpIdx += 2;
+ }
+
+ return *MIB;
+}
+
+namespace {
+
+/// Interface to query instructions amenable to copy rewriting.
+class Rewriter {
+protected:
+ MachineInstr &CopyLike;
+ unsigned CurrentSrcIdx = 0; ///< The index of the source being rewritten.
+public:
+ Rewriter(MachineInstr &CopyLike) : CopyLike(CopyLike) {}
+ virtual ~Rewriter() = default;
+
+ /// Get the next rewritable source (SrcReg, SrcSubReg) and
+ /// the related value that it affects (DstReg, DstSubReg).
+ /// A source is considered rewritable if its register class and the
+ /// register class of the related DstReg may not be register
+ /// coalescer friendly. In other words, given a copy-like instruction
+ /// not all the arguments may be returned as rewritable sources, since
+ /// some arguments are known to be register coalescer friendly.
+ ///
+ /// Each call of this method moves the current source to the next
+ /// rewritable source.
+ /// For instance, let CopyLike be the instruction to rewrite.
+ /// CopyLike has one definition and one source:
+ /// dst.dstSubIdx = CopyLike src.srcSubIdx.
+ ///
+ /// The first call will give the first rewritable source, i.e.,
+ /// the only source this instruction has:
+ /// (SrcReg, SrcSubReg) = (src, srcSubIdx).
+ /// This source defines the whole definition, i.e.,
+ /// (DstReg, DstSubReg) = (dst, dstSubIdx).
+ ///
+ /// The second and subsequent calls will return false, as there is only one
+ /// rewritable source.
+ ///
+ /// \return True if a rewritable source has been found, false otherwise.
+ /// The output arguments are valid if and only if true is returned.
+ virtual bool getNextRewritableSource(RegSubRegPair &Src,
+ RegSubRegPair &Dst) = 0;
+
+ /// Rewrite the current source with \p NewReg and \p NewSubReg if possible.
+ /// \return True if the rewriting was possible, false otherwise.
+ virtual bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) = 0;
+};
+
+/// Rewriter for COPY instructions.
+class CopyRewriter : public Rewriter {
+public:
+ CopyRewriter(MachineInstr &MI) : Rewriter(MI) {
+ assert(MI.isCopy() && "Expected copy instruction");
+ }
+ virtual ~CopyRewriter() = default;
+
+ bool getNextRewritableSource(RegSubRegPair &Src,
+ RegSubRegPair &Dst) override {
+ // CurrentSrcIdx > 0 means this function has already been called.
+ if (CurrentSrcIdx > 0)
+ return false;
+ // This is the first call to getNextRewritableSource.
+ // Move the CurrentSrcIdx to remember that we made that call.
+ CurrentSrcIdx = 1;
+ // The rewritable source is the argument.
+ const MachineOperand &MOSrc = CopyLike.getOperand(1);
+ Src = RegSubRegPair(MOSrc.getReg(), MOSrc.getSubReg());
+ // What we track are the alternative sources of the definition.
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+ Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg());
+ return true;
+ }
+
+ bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override {
+ if (CurrentSrcIdx != 1)
+ return false;
+ MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx);
+ MOSrc.setReg(NewReg);
+ MOSrc.setSubReg(NewSubReg);
+ return true;
+ }
+};
+
+/// Helper class to rewrite uncoalescable copy-like instructions
+/// into new COPY (coalescer friendly) instructions.
+class UncoalescableRewriter : public Rewriter {
+ unsigned NumDefs; ///< Number of defs in the bitcast.
+
+public:
+ UncoalescableRewriter(MachineInstr &MI) : Rewriter(MI) {
+ NumDefs = MI.getDesc().getNumDefs();
+ }
+
+ /// \see Rewriter::getNextRewritableSource()
+ /// All such sources need to be considered rewritable in order to
+ /// rewrite an uncoalescable copy-like instruction. This method returns
+ /// each definition that must be checked if rewritable.
+ bool getNextRewritableSource(RegSubRegPair &Src,
+ RegSubRegPair &Dst) override {
+ // Find the next non-dead definition and continue from there.
+ if (CurrentSrcIdx == NumDefs)
+ return false;
+
+ while (CopyLike.getOperand(CurrentSrcIdx).isDead()) {
+ ++CurrentSrcIdx;
+ if (CurrentSrcIdx == NumDefs)
+ return false;
+ }
+
+ // What we track are the alternative sources of the definition.
+ Src = RegSubRegPair(0, 0);
+ const MachineOperand &MODef = CopyLike.getOperand(CurrentSrcIdx);
+ Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg());
+
+ CurrentSrcIdx++;
+ return true;
+ }
+
+ bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override {
+ return false;
+ }
+};
+
+/// Specialized rewriter for INSERT_SUBREG instruction.
+class InsertSubregRewriter : public Rewriter {
+public:
+ InsertSubregRewriter(MachineInstr &MI) : Rewriter(MI) {
+ assert(MI.isInsertSubreg() && "Invalid instruction");
+ }
+
+ /// \see Rewriter::getNextRewritableSource()
+ /// Here CopyLike has the following form:
+ /// dst = INSERT_SUBREG Src1, Src2.src2SubIdx, subIdx.
+ /// Src1 has the same register class as dst, hence, there is
+ /// nothing to rewrite.
+ /// Src2.src2SubIdx may not be register coalescer friendly.
+ /// Therefore, the first call to this method returns:
+ /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
+ /// (DstReg, DstSubReg) = (dst, subIdx).
+ ///
+ /// Subsequent calls will return false.
+ bool getNextRewritableSource(RegSubRegPair &Src,
+ RegSubRegPair &Dst) override {
+ // If we already got the only source we can rewrite, return false.
+ if (CurrentSrcIdx == 2)
+ return false;
+ // We are looking at v2 = INSERT_SUBREG v0, v1, sub0.
+ CurrentSrcIdx = 2;
+ const MachineOperand &MOInsertedReg = CopyLike.getOperand(2);
+ Src = RegSubRegPair(MOInsertedReg.getReg(), MOInsertedReg.getSubReg());
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+
+ // We want to track something that is compatible with the
+ // partial definition.
+ if (MODef.getSubReg())
+ // Bail if we have to compose sub-register indices.
+ return false;
+ Dst = RegSubRegPair(MODef.getReg(),
+ (unsigned)CopyLike.getOperand(3).getImm());
+ return true;
+ }
+
+ bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override {
+ if (CurrentSrcIdx != 2)
+ return false;
+ // We are rewriting the inserted reg.
+ MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
+ MO.setReg(NewReg);
+ MO.setSubReg(NewSubReg);
+ return true;
+ }
+};
+
+/// Specialized rewriter for EXTRACT_SUBREG instruction.
+class ExtractSubregRewriter : public Rewriter {
+ const TargetInstrInfo &TII;
+
+public:
+ ExtractSubregRewriter(MachineInstr &MI, const TargetInstrInfo &TII)
+ : Rewriter(MI), TII(TII) {
+ assert(MI.isExtractSubreg() && "Invalid instruction");
+ }
+
+ /// \see Rewriter::getNextRewritableSource()
+ /// Here CopyLike has the following form:
+ /// dst.dstSubIdx = EXTRACT_SUBREG Src, subIdx.
+ /// There is only one rewritable source: Src.subIdx,
+ /// which defines dst.dstSubIdx.
+ bool getNextRewritableSource(RegSubRegPair &Src,
+ RegSubRegPair &Dst) override {
+ // If we already got the only source we can rewrite, return false.
+ if (CurrentSrcIdx == 1)
+ return false;
+ // We are looking at v1 = EXTRACT_SUBREG v0, sub0.
+ CurrentSrcIdx = 1;
+ const MachineOperand &MOExtractedReg = CopyLike.getOperand(1);
+ // If we have to compose sub-register indices, bail out.
+ if (MOExtractedReg.getSubReg())
+ return false;
+
+ Src = RegSubRegPair(MOExtractedReg.getReg(),
+ CopyLike.getOperand(2).getImm());
+
+ // We want to track something that is compatible with the definition.
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+ Dst = RegSubRegPair(MODef.getReg(), MODef.getSubReg());
+ return true;
+ }
+
+ bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override {
+ // The only source we can rewrite is the input register.
+ if (CurrentSrcIdx != 1)
+ return false;
+
+ CopyLike.getOperand(CurrentSrcIdx).setReg(NewReg);
+
+ // If we find a source that does not require extracting something,
+ // rewrite the operation with a copy.
+ if (!NewSubReg) {
+ // Move the current index to an invalid position.
+ // We do not want another call to this method to be able
+ // to do any change.
+ CurrentSrcIdx = -1;
+ // Rewrite the operation as a COPY.
+ // Get rid of the sub-register index.
+ CopyLike.removeOperand(2);
+ // Morph the operation into a COPY.
+ CopyLike.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+ }
+ CopyLike.getOperand(CurrentSrcIdx + 1).setImm(NewSubReg);
+ return true;
+ }
+};
+
+/// Specialized rewriter for REG_SEQUENCE instruction.
+class RegSequenceRewriter : public Rewriter {
+public:
+ RegSequenceRewriter(MachineInstr &MI) : Rewriter(MI) {
+ assert(MI.isRegSequence() && "Invalid instruction");
+ }
+
+ /// \see Rewriter::getNextRewritableSource()
+ /// Here CopyLike has the following form:
+ /// dst = REG_SEQUENCE Src1.src1SubIdx, subIdx1, Src2.src2SubIdx, subIdx2.
+ /// Each call will return a different source, walking all the available
+ /// sources.
+ ///
+ /// The first call returns:
+ /// (SrcReg, SrcSubReg) = (Src1, src1SubIdx).
+ /// (DstReg, DstSubReg) = (dst, subIdx1).
+ ///
+ /// The second call returns:
+ /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
+ /// (DstReg, DstSubReg) = (dst, subIdx2).
+ ///
+ /// And so on, until all the sources have been traversed, then
+ /// it returns false.
+ bool getNextRewritableSource(RegSubRegPair &Src,
+ RegSubRegPair &Dst) override {
+ // We are looking at v0 = REG_SEQUENCE v1, sub1, v2, sub2, etc.
+
+ // If this is the first call, move to the first argument.
+ if (CurrentSrcIdx == 0) {
+ CurrentSrcIdx = 1;
+ } else {
+ // Otherwise, move to the next argument and check that it is valid.
+ CurrentSrcIdx += 2;
+ if (CurrentSrcIdx >= CopyLike.getNumOperands())
+ return false;
+ }
+ const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx);
+ Src.Reg = MOInsertedReg.getReg();
+ // If we have to compose sub-register indices, bail out.
+ if ((Src.SubReg = MOInsertedReg.getSubReg()))
+ return false;
+
+ // We want to track something that is compatible with the related
+ // partial definition.
+ Dst.SubReg = CopyLike.getOperand(CurrentSrcIdx + 1).getImm();
+
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+ Dst.Reg = MODef.getReg();
+ // If we have to compose sub-registers, bail.
+ return MODef.getSubReg() == 0;
+ }
+
+ bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override {
+ // We cannot rewrite out of bound operands.
+ // Moreover, rewritable sources are at odd positions.
+ if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands())
+ return false;
+
+ MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
+ MO.setReg(NewReg);
+ MO.setSubReg(NewSubReg);
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+/// Get the appropriate Rewriter for \p MI.
+/// \return A pointer to a dynamically allocated Rewriter or nullptr if no
+/// rewriter works for \p MI.
+static Rewriter *getCopyRewriter(MachineInstr &MI, const TargetInstrInfo &TII) {
+ // Handle uncoalescable copy-like instructions.
+ if (MI.isBitcast() || MI.isRegSequenceLike() || MI.isInsertSubregLike() ||
+ MI.isExtractSubregLike())
+ return new UncoalescableRewriter(MI);
+
+ switch (MI.getOpcode()) {
+ default:
+ return nullptr;
+ case TargetOpcode::COPY:
+ return new CopyRewriter(MI);
+ case TargetOpcode::INSERT_SUBREG:
+ return new InsertSubregRewriter(MI);
+ case TargetOpcode::EXTRACT_SUBREG:
+ return new ExtractSubregRewriter(MI, TII);
+ case TargetOpcode::REG_SEQUENCE:
+ return new RegSequenceRewriter(MI);
+ }
+}
+
+/// Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find
+/// the new source to use for the rewrite. If \p HandleMultipleSources is true
+/// and multiple sources for a given \p Def are found along the way, we found a
+/// PHI instruction that needs to be rewritten.
+/// TODO: HandleMultipleSources should be removed once we test PHI handling
+/// with coalescable copies.
+static RegSubRegPair
+getNewSource(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+ RegSubRegPair Def,
+ const PeepholeOptimizer::RewriteMapTy &RewriteMap,
+ bool HandleMultipleSources = true) {
+ RegSubRegPair LookupSrc(Def.Reg, Def.SubReg);
+ while (true) {
+ ValueTrackerResult Res = RewriteMap.lookup(LookupSrc);
+ // If there are no entries in the map, LookupSrc is the new source.
+ if (!Res.isValid())
+ return LookupSrc;
+
+ // There's only one source for this definition, keep searching...
+ unsigned NumSrcs = Res.getNumSources();
+ if (NumSrcs == 1) {
+ LookupSrc.Reg = Res.getSrcReg(0);
+ LookupSrc.SubReg = Res.getSrcSubReg(0);
+ continue;
+ }
+
+ // TODO: Remove once multiple srcs w/ coalescable copies are supported.
+ if (!HandleMultipleSources)
+ break;
+
+ // Multiple sources: recurse into each source to find a new source
+ // for it. Then, rewrite the PHI according to its new edges.
+ SmallVector<RegSubRegPair, 4> NewPHISrcs;
+ for (unsigned i = 0; i < NumSrcs; ++i) {
+ RegSubRegPair PHISrc(Res.getSrcReg(i), Res.getSrcSubReg(i));
+ NewPHISrcs.push_back(
+ getNewSource(MRI, TII, PHISrc, RewriteMap, HandleMultipleSources));
+ }
+
+ // Build the new PHI node and return its def register as the new source.
+ MachineInstr &OrigPHI = const_cast<MachineInstr &>(*Res.getInst());
+ MachineInstr &NewPHI = insertPHI(*MRI, *TII, NewPHISrcs, OrigPHI);
+ LLVM_DEBUG(dbgs() << "-- getNewSource\n");
+ LLVM_DEBUG(dbgs() << " Replacing: " << OrigPHI);
+ LLVM_DEBUG(dbgs() << " With: " << NewPHI);
+ const MachineOperand &MODef = NewPHI.getOperand(0);
+ return RegSubRegPair(MODef.getReg(), MODef.getSubReg());
+ }
+
+ return RegSubRegPair(0, 0);
+}
+
+/// Optimize generic copy instructions to avoid cross register bank copy.
+/// The optimization looks through a chain of copies and tries to find a source
+/// that has a compatible register class.
+/// Two register classes are considered to be compatible if they share the same
+/// register bank.
+/// New copies issued by this optimization are register allocator
+/// friendly. This optimization does not remove any copy as it may
+/// overconstrain the register allocator, but replaces some operands
+/// when possible.
+/// \pre isCoalescableCopy(*MI) is true.
+/// \return True, when \p MI has been rewritten. False otherwise.
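+///
+/// For illustration only (hypothetical registers and register banks): given
+/// %1:gpr = COPY %0:fpr
+/// %2:fpr = COPY %1:gpr
+/// the source operand of the second copy can be rewritten to %0, so the value
+/// stays in the FPR bank and the copy becomes easy to coalesce.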
+bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) {
+ assert(isCoalescableCopy(MI) && "Invalid argument");
+ assert(MI.getDesc().getNumDefs() == 1 &&
+ "Coalescer can understand multiple defs?!");
+ const MachineOperand &MODef = MI.getOperand(0);
+ // Do not rewrite physical definitions.
+ if (MODef.getReg().isPhysical())
+ return false;
+
+ bool Changed = false;
+ // Get the right rewriter for the current copy.
+ std::unique_ptr<Rewriter> CpyRewriter(getCopyRewriter(MI, *TII));
+ // If none exists, bail out.
+ if (!CpyRewriter)
+ return false;
+ // Rewrite each rewritable source.
+ RegSubRegPair Src;
+ RegSubRegPair TrackPair;
+ while (CpyRewriter->getNextRewritableSource(Src, TrackPair)) {
+ // Keep track of PHI nodes and their incoming edges when looking for sources.
+ RewriteMapTy RewriteMap;
+ // Try to find a more suitable source. If we fail to do so, or only get
+ // back the original source, move to the next source.
+ if (!findNextSource(TrackPair, RewriteMap))
+ continue;
+
+ // Get the new source to rewrite. TODO: Only enable handling of multiple
+ // sources (PHIs) once we have a motivating example and testcases for it.
+ RegSubRegPair NewSrc = getNewSource(MRI, TII, TrackPair, RewriteMap,
+ /*HandleMultipleSources=*/false);
+ if (Src.Reg == NewSrc.Reg || NewSrc.Reg == 0)
+ continue;
+
+ // Rewrite source.
+ if (CpyRewriter->RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) {
+ // We may have extended the live-range of NewSrc, account for that.
+ MRI->clearKillFlags(NewSrc.Reg);
+ Changed = true;
+ }
+ }
+ // TODO: We could have a clean-up method to tidy the instruction.
+ // E.g., v0 = INSERT_SUBREG v1, v1.sub0, sub0
+ // => v0 = COPY v1
+ // Currently we haven't seen a motivating example for that and we
+ // want to avoid untested code.
+ NumRewrittenCopies += Changed;
+ return Changed;
+}
+
+/// Rewrite the source found through \p Def, by using the \p RewriteMap
+/// and create a new COPY instruction. More info about RewriteMap in
+/// PeepholeOptimizer::findNextSource. Right now this is only used to handle
+/// uncoalescable copies, since they are copy-like instructions that aren't
+/// recognized by the register coalescer.
+MachineInstr &
+PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike,
+ RegSubRegPair Def, RewriteMapTy &RewriteMap) {
+ assert(!Def.Reg.isPhysical() && "We do not rewrite physical registers");
+
+ // Find the new source to use in the COPY rewrite.
+ RegSubRegPair NewSrc = getNewSource(MRI, TII, Def, RewriteMap);
+
+ // Insert the COPY.
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg);
+ Register NewVReg = MRI->createVirtualRegister(DefRC);
+
+ MachineInstr *NewCopy =
+ BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVReg)
+ .addReg(NewSrc.Reg, 0, NewSrc.SubReg);
+
+ if (Def.SubReg) {
+ NewCopy->getOperand(0).setSubReg(Def.SubReg);
+ NewCopy->getOperand(0).setIsUndef();
+ }
+
+ LLVM_DEBUG(dbgs() << "-- RewriteSource\n");
+ LLVM_DEBUG(dbgs() << " Replacing: " << CopyLike);
+ LLVM_DEBUG(dbgs() << " With: " << *NewCopy);
+ MRI->replaceRegWith(Def.Reg, NewVReg);
+ MRI->clearKillFlags(NewVReg);
+
+ // We extended the lifetime of NewSrc.Reg, clear the kill flags to
+ // account for that.
+ MRI->clearKillFlags(NewSrc.Reg);
+
+ return *NewCopy;
+}
+
+/// Optimize copy-like instructions to create
+/// register coalescer friendly instructions.
+/// The optimization tries to kill-off the \p MI by looking
+/// through a chain of copies to find a source that has a compatible
+/// register class.
+/// If such a source is found, it replaces \p MI with a generic COPY
+/// operation.
+/// \pre isUncoalescableCopy(*MI) is true.
+/// \return True, when \p MI has been optimized. In that case, \p MI has
+/// been removed from its parent.
+/// All COPY instructions created are inserted in \p LocalMIs.
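+///
+/// For illustration only (hypothetical registers and opcode): a bitcast-like
+/// %1 = SOME_BITCAST_OP %0
+/// whose tracked source turns out to be plain %0 is replaced with
+/// %1 = COPY %0
+/// which the register coalescer can then handle or remove.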
+bool PeepholeOptimizer::optimizeUncoalescableCopy(
+ MachineInstr &MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
+ assert(isUncoalescableCopy(MI) && "Invalid argument");
+ UncoalescableRewriter CpyRewriter(MI);
+
+ // Rewrite each rewritable source by generating new COPYs. This works
+ // differently from optimizeCoalescableCopy since it first makes sure that all
+ // definitions can be rewritten.
+ RewriteMapTy RewriteMap;
+ RegSubRegPair Src;
+ RegSubRegPair Def;
+ SmallVector<RegSubRegPair, 4> RewritePairs;
+ while (CpyRewriter.getNextRewritableSource(Src, Def)) {
+ // If a physical register is here, this is probably for a good reason.
+ // Do not rewrite that.
+ if (Def.Reg.isPhysical())
+ return false;
+
+ // If we do not know how to rewrite this definition, there is no point
+ // in trying to kill this instruction.
+ if (!findNextSource(Def, RewriteMap))
+ return false;
+
+ RewritePairs.push_back(Def);
+ }
+
+ // The change is possible for all defs, do it.
+ for (const RegSubRegPair &Def : RewritePairs) {
+ // Rewrite the "copy" in a way the register coalescer understands.
+ MachineInstr &NewCopy = rewriteSource(MI, Def, RewriteMap);
+ LocalMIs.insert(&NewCopy);
+ }
+
+ // MI is now dead.
+ LLVM_DEBUG(dbgs() << "Deleting uncoalescable copy: " << MI);
+ MI.eraseFromParent();
+ ++NumUncoalescableCopies;
+ return true;
+}
+
+/// Check whether MI is a candidate for folding into a later instruction.
+/// We only fold loads that define a virtual register, and only when that
+/// virtual register has a single non-debug user.
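+///
+/// For illustration only (hypothetical opcodes/registers):
+/// %0 = LOADmem %stack.0
+/// %1 = ADDrr %2, %0 ; sole non-debug user of %0
+/// makes %0 a candidate: the target may later fold the load into a
+/// memory-operand form of the add.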
+bool PeepholeOptimizer::isLoadFoldable(
+ MachineInstr &MI, SmallSet<Register, 16> &FoldAsLoadDefCandidates) {
+ if (!MI.canFoldAsLoad() || !MI.mayLoad())
+ return false;
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getNumDefs() != 1)
+ return false;
+
+ Register Reg = MI.getOperand(0).getReg();
+ // To reduce compilation time, we check MRI->hasOneNonDBGUser when inserting
+ // loads. It should be checked when processing uses of the load, since
+ // uses can be removed during peephole.
+ if (Reg.isVirtual() && !MI.getOperand(0).getSubReg() &&
+ MRI->hasOneNonDBGUser(Reg)) {
+ FoldAsLoadDefCandidates.insert(Reg);
+ return true;
+ }
+ return false;
+}
+
+bool PeepholeOptimizer::isMoveImmediate(
+ MachineInstr &MI, SmallSet<Register, 4> &ImmDefRegs,
+ DenseMap<Register, MachineInstr *> &ImmDefMIs) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (!MI.isMoveImmediate())
+ return false;
+ if (MCID.getNumDefs() != 1)
+ return false;
+ Register Reg = MI.getOperand(0).getReg();
+ if (Reg.isVirtual()) {
+ ImmDefMIs.insert(std::make_pair(Reg, &MI));
+ ImmDefRegs.insert(Reg);
+ return true;
+ }
+
+ return false;
+}
+
+/// Try folding register operands that are defined by move immediate
+/// instructions, i.e. a trivial constant folding optimization, if
+/// and only if the def and use are in the same BB.
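+///
+/// For illustration only (hypothetical opcodes/registers):
+/// %0 = MOVimm 42
+/// %1 = ADDrr %2, %0
+/// may become, when the target's FoldImmediate hook supports it,
+/// %1 = ADDri %2, 42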
+bool PeepholeOptimizer::foldImmediate(
+ MachineInstr &MI, SmallSet<Register, 4> &ImmDefRegs,
+ DenseMap<Register, MachineInstr *> &ImmDefMIs) {
+ for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+ if (ImmDefRegs.count(Reg) == 0)
+ continue;
+ DenseMap<Register, MachineInstr *>::iterator II = ImmDefMIs.find(Reg);
+ assert(II != ImmDefMIs.end() && "couldn't find immediate definition");
+ if (TII->FoldImmediate(MI, *II->second, Reg, MRI)) {
+ ++NumImmFold;
+ return true;
+ }
+ }
+ return false;
+}
+
+// FIXME: This is very simple and misses some cases which should be handled when
+// motivating examples are found.
+//
+// The copy rewriting logic should look at uses as well as defs and be able to
+// eliminate copies across blocks.
+//
+// Later copies that are subregister extracts will also not be eliminated since
+// only the first copy is considered.
+//
+// e.g.
+// %1 = COPY %0
+// %2 = COPY %0:sub1
+//
+// Should replace %2 uses with %1:sub1
+bool PeepholeOptimizer::foldRedundantCopy(
+ MachineInstr &MI, DenseMap<RegSubRegPair, MachineInstr *> &CopyMIs) {
+ assert(MI.isCopy() && "expected a COPY machine instruction");
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ unsigned SrcSubReg = MI.getOperand(1).getSubReg();
+ if (!SrcReg.isVirtual() && !MRI->isConstantPhysReg(SrcReg))
+ return false;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!DstReg.isVirtual())
+ return false;
+
+ RegSubRegPair SrcPair(SrcReg, SrcSubReg);
+
+ if (CopyMIs.insert(std::make_pair(SrcPair, &MI)).second) {
+ // First copy of this reg seen.
+ return false;
+ }
+
+ MachineInstr *PrevCopy = CopyMIs.find(SrcPair)->second;
+
+ assert(SrcSubReg == PrevCopy->getOperand(1).getSubReg() &&
+ "Unexpected mismatching subreg!");
+
+ Register PrevDstReg = PrevCopy->getOperand(0).getReg();
+
+ // Only replace if the copy register class is the same.
+ //
+ // TODO: If we have multiple copies to different register classes, we may want
+ // to track multiple copies of the same source register.
+ if (MRI->getRegClass(DstReg) != MRI->getRegClass(PrevDstReg))
+ return false;
+
+ MRI->replaceRegWith(DstReg, PrevDstReg);
+
+ // Lifetime of the previous copy has been extended.
+ MRI->clearKillFlags(PrevDstReg);
+ return true;
+}
+
+bool PeepholeOptimizer::isNAPhysCopy(Register Reg) {
+ return Reg.isPhysical() && !MRI->isAllocatable(Reg);
+}
+
+bool PeepholeOptimizer::foldRedundantNAPhysCopy(
+ MachineInstr &MI, DenseMap<Register, MachineInstr *> &NAPhysToVirtMIs) {
+ assert(MI.isCopy() && "expected a COPY machine instruction");
+
+ if (DisableNAPhysCopyOpt)
+ return false;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (isNAPhysCopy(SrcReg) && DstReg.isVirtual()) {
+ // %vreg = COPY $physreg
+ // Avoid using a data structure which can track multiple live non-allocatable
+ // phys->virt copies since LLVM doesn't seem to do this.
+ NAPhysToVirtMIs.insert({SrcReg, &MI});
+ return false;
+ }
+
+ if (!(SrcReg.isVirtual() && isNAPhysCopy(DstReg)))
+ return false;
+
+ // $physreg = COPY %vreg
+ auto PrevCopy = NAPhysToVirtMIs.find(DstReg);
+ if (PrevCopy == NAPhysToVirtMIs.end()) {
+ // We can't remove the copy: there was an intervening clobber of the
+ // non-allocatable physical register after the copy to virtual.
+ LLVM_DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing "
+ << MI);
+ return false;
+ }
+
+ Register PrevDstReg = PrevCopy->second->getOperand(0).getReg();
+ if (PrevDstReg == SrcReg) {
+ // Remove the virt->phys copy: we saw the virtual register definition, and
+ // the non-allocatable physical register's state hasn't changed since then.
+ LLVM_DEBUG(dbgs() << "NAPhysCopy: erasing " << MI);
+ ++NumNAPhysCopies;
+ return true;
+ }
+
+ // Potential missed optimization opportunity: we saw a different virtual
+ // register get a copy of the non-allocatable physical register, and we only
+ // track one such copy. Avoid getting confused by this new non-allocatable
+ // physical register definition, and remove it from the tracked copies.
+ LLVM_DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << MI);
+ NAPhysToVirtMIs.erase(PrevCopy);
+ return false;
+}
+
+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+ return MO.isReg() && MO.getReg().isVirtual();
+}
+
+bool PeepholeOptimizer::findTargetRecurrence(
+ Register Reg, const SmallSet<Register, 2> &TargetRegs,
+ RecurrenceCycle &RC) {
+ // Recurrence found if Reg is in TargetRegs.
+ if (TargetRegs.count(Reg))
+ return true;
+
+ // TODO: Currently, we only allow the last instruction of the recurrence
+ // cycle (the instruction that feeds the PHI instruction) to have more than
+ // one use, to guarantee that commuting operands does not tie registers
+ // with overlapping live ranges. Once we have actual live range info of
+ // each register, this constraint can be relaxed.
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return false;
+
+ // Give up if the recurrence chain length is longer than the limit.
+ if (RC.size() >= MaxRecurrenceChain)
+ return false;
+
+ MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg));
+ unsigned Idx = MI.findRegisterUseOperandIdx(Reg);
+
+ // Only interested in recurrences whose instructions have only one def, which
+ // is a virtual register.
+ if (MI.getDesc().getNumDefs() != 1)
+ return false;
+
+ MachineOperand &DefOp = MI.getOperand(0);
+ if (!isVirtualRegisterOperand(DefOp))
+ return false;
+
+ // Check if the def operand of MI is tied to any use operand. We are only
+ // interested in the case that all the instructions in the recurrence chain
+ // have their def operand tied with one of the use operands.
+ unsigned TiedUseIdx;
+ if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx))
+ return false;
+
+ if (Idx == TiedUseIdx) {
+ RC.push_back(RecurrenceInstr(&MI));
+ return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+ } else {
+ // If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx.
+ unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+ if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) {
+ RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx));
+ return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+ }
+ }
+
+ return false;
+}
+
+/// Phi instructions will eventually be lowered to copy instructions.
+/// If the phi is in a loop header, a recurrence may be formulated around the
+/// source and destination of the phi. In such a case, commuting operands of
+/// the instructions in the recurrence may enable coalescing of the copy
+/// instruction generated from the phi. For example, if there is a recurrence
+/// of
+///
+/// LoopHeader:
+/// %1 = phi(%0, %100)
+/// LoopLatch:
+/// %0<def, tied1> = ADD %2<def, tied0>, %1
+///
+/// , the fact that %0 and %2 are in the same tied operands set makes
+/// the coalescing of the copy instruction generated from the phi in
+/// LoopHeader (i.e. %1 = COPY %0) impossible, because %1 and
+/// %2 have overlapping live ranges. This introduces an additional move
+/// instruction into the final assembly. However, if we commute %2 and
+/// %1 of the ADD instruction, the redundant move instruction can be
+/// avoided.
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+ SmallSet<Register, 2> TargetRegs;
+ for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+ MachineOperand &MO = PHI.getOperand(Idx);
+ assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+ TargetRegs.insert(MO.getReg());
+ }
+
+ bool Changed = false;
+ RecurrenceCycle RC;
+ if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
+ // Commute operands of instructions in RC if necessary so that the copy to
+ // be generated from the PHI can be coalesced.
+ LLVM_DEBUG(dbgs() << "Optimize recurrence chain from " << PHI);
+ for (auto &RI : RC) {
+ LLVM_DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
+ auto CP = RI.getCommutePair();
+ if (CP) {
+ Changed = true;
+ TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
+ (*CP).second);
+ LLVM_DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
+ }
+ }
+ }
+
+ return Changed;
+}
+
+bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** PEEPHOLE OPTIMIZER **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << MF.getName() << '\n');
+
+ if (DisablePeephole)
+ return false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+ MLI = &getAnalysis<MachineLoopInfo>();
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ bool SeenMoveImm = false;
+
+ // During this forward scan, at some point it needs to answer the question
+ // "given a pointer to an MI in the current BB, is it located before or
+ // after the current instruction".
+ // To perform this, the following set keeps track of the MIs already seen
+ // during the scan; if an MI is not in the set, it is assumed to be located
+ // after. Newly created MIs have to be inserted in the set as well.
+ SmallPtrSet<MachineInstr*, 16> LocalMIs;
+ SmallSet<Register, 4> ImmDefRegs;
+ DenseMap<Register, MachineInstr *> ImmDefMIs;
+ SmallSet<Register, 16> FoldAsLoadDefCandidates;
+
+ // Track when a non-allocatable physical register is copied to a virtual
+ // register so that useless moves can be removed.
+ //
+ // $physreg is the map index; MI is the last valid `%vreg = COPY $physreg`
+ // without any intervening re-definition of $physreg.
+ DenseMap<Register, MachineInstr *> NAPhysToVirtMIs;
+
+ // Set of copies to virtual registers keyed by source register. Never
+ // holds any physreg which requires def tracking.
+ DenseMap<RegSubRegPair, MachineInstr *> CopySrcMIs;
+
+ bool IsLoopHeader = MLI->isLoopHeader(&MBB);
+
+ for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+ MII != MIE; ) {
+ MachineInstr *MI = &*MII;
+ // We may be erasing MI below, increment MII now.
+ ++MII;
+ LocalMIs.insert(MI);
+
+ // Skip debug instructions. They should not affect this peephole
+ // optimization.
+ if (MI->isDebugInstr())
+ continue;
+
+ if (MI->isPosition())
+ continue;
+
+ if (IsLoopHeader && MI->isPHI()) {
+ if (optimizeRecurrence(*MI)) {
+ Changed = true;
+ continue;
+ }
+ }
+
+ if (!MI->isCopy()) {
+ for (const MachineOperand &MO : MI->operands()) {
+ // Visit all operands: definitions can be implicit or explicit.
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ if (MO.isDef() && isNAPhysCopy(Reg)) {
+ const auto &Def = NAPhysToVirtMIs.find(Reg);
+ if (Def != NAPhysToVirtMIs.end()) {
+ // A new definition of the non-allocatable physical register
+ // invalidates previous copies.
+ LLVM_DEBUG(dbgs()
+ << "NAPhysCopy: invalidating because of " << *MI);
+ NAPhysToVirtMIs.erase(Def);
+ }
+ }
+ } else if (MO.isRegMask()) {
+ const uint32_t *RegMask = MO.getRegMask();
+ for (auto &RegMI : NAPhysToVirtMIs) {
+ Register Def = RegMI.first;
+ if (MachineOperand::clobbersPhysReg(RegMask, Def)) {
+ LLVM_DEBUG(dbgs()
+ << "NAPhysCopy: invalidating because of " << *MI);
+ NAPhysToVirtMIs.erase(Def);
+ }
+ }
+ }
+ }
+ }
+
+ if (MI->isImplicitDef() || MI->isKill())
+ continue;
+
+ if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) {
+ // Blow away all non-allocatable physical register knowledge since we
+ // don't know what's correct anymore.
+ //
+ // FIXME: handle explicit asm clobbers.
+ LLVM_DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to "
+ << *MI);
+ NAPhysToVirtMIs.clear();
+ }
+
+ if ((isUncoalescableCopy(*MI) &&
+ optimizeUncoalescableCopy(*MI, LocalMIs)) ||
+ (MI->isCompare() && optimizeCmpInstr(*MI)) ||
+ (MI->isSelect() && optimizeSelect(*MI, LocalMIs))) {
+ // MI is deleted.
+ LocalMIs.erase(MI);
+ Changed = true;
+ continue;
+ }
+
+ if (MI->isConditionalBranch() && optimizeCondBranch(*MI)) {
+ Changed = true;
+ continue;
+ }
+
+ if (isCoalescableCopy(*MI) && optimizeCoalescableCopy(*MI)) {
+ // MI is just rewritten.
+ Changed = true;
+ continue;
+ }
+
+ if (MI->isCopy() && (foldRedundantCopy(*MI, CopySrcMIs) ||
+ foldRedundantNAPhysCopy(*MI, NAPhysToVirtMIs))) {
+ LocalMIs.erase(MI);
+ LLVM_DEBUG(dbgs() << "Deleting redundant copy: " << *MI << "\n");
+ MI->eraseFromParent();
+ Changed = true;
+ continue;
+ }
+
+ if (isMoveImmediate(*MI, ImmDefRegs, ImmDefMIs)) {
+ SeenMoveImm = true;
+ } else {
+ Changed |= optimizeExtInstr(*MI, MBB, LocalMIs);
+ // optimizeExtInstr might have created new instructions after MI
+ // and before the already incremented MII. Adjust MII so that the
+ // next iteration sees the new instructions.
+ MII = MI;
+ ++MII;
+ if (SeenMoveImm)
+ Changed |= foldImmediate(*MI, ImmDefRegs, ImmDefMIs);
+ }
+
+ // Check whether MI is a load candidate for folding into a later
+ // instruction. If MI is not a candidate, check whether we can fold an
+ // earlier load into MI.
+ if (!isLoadFoldable(*MI, FoldAsLoadDefCandidates) &&
+ !FoldAsLoadDefCandidates.empty()) {
+
+ // We visit each operand even after successfully folding a previous
+ // one. This allows us to fold multiple loads into a single
+ // instruction. We do assume that optimizeLoadInstr doesn't insert
+ // foldable uses earlier in the argument list. Since we don't restart
+ // iteration, we'd miss such cases.
+ const MCInstrDesc &MIDesc = MI->getDesc();
+ for (unsigned i = MIDesc.getNumDefs(); i != MI->getNumOperands();
+ ++i) {
+ const MachineOperand &MOp = MI->getOperand(i);
+ if (!MOp.isReg())
+ continue;
+ Register FoldAsLoadDefReg = MOp.getReg();
+ if (FoldAsLoadDefCandidates.count(FoldAsLoadDefReg)) {
+ // We need to fold load after optimizeCmpInstr, since
+ // optimizeCmpInstr can enable folding by converting SUB to CMP.
+ // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and
+ // we need it for markUsesInDebugValueAsUndef().
+ Register FoldedReg = FoldAsLoadDefReg;
+ MachineInstr *DefMI = nullptr;
+ if (MachineInstr *FoldMI =
+ TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
+ // Update LocalMIs since we replaced MI with FoldMI and deleted
+ // DefMI.
+ LLVM_DEBUG(dbgs() << "Replacing: " << *MI);
+ LLVM_DEBUG(dbgs() << " With: " << *FoldMI);
+ LocalMIs.erase(MI);
+ LocalMIs.erase(DefMI);
+ LocalMIs.insert(FoldMI);
+ // Update the call site info.
+ if (MI->shouldUpdateCallSiteInfo())
+ MI->getMF()->moveCallSiteInfo(MI, FoldMI);
+ MI->eraseFromParent();
+ DefMI->eraseFromParent();
+ MRI->markUsesInDebugValueAsUndef(FoldedReg);
+ FoldAsLoadDefCandidates.erase(FoldedReg);
+ ++NumLoadFold;
+
+ // MI is replaced with FoldMI so we can continue trying to fold
+ Changed = true;
+ MI = FoldMI;
+ }
+ }
+ }
+ }
+
+ // If we run into an instruction we can't fold across, discard
+ // the load candidates. Note: We might be able to fold *into* this
+ // instruction, so this needs to be after the folding logic.
+ if (MI->isLoadFoldBarrier()) {
+ LLVM_DEBUG(dbgs() << "Encountered load fold barrier on " << *MI);
+ FoldAsLoadDefCandidates.clear();
+ }
+ }
+ }
+
+ return Changed;
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromCopy() {
+ assert(Def->isCopy() && "Invalid definition");
+ // Copy instructions are supposed to be: Def = Src.
+ // If someone breaks this assumption, bad things will happen everywhere.
+ // There may be implicit uses preventing the copy from being moved across
+ // some target-specific register definitions.
+ assert(Def->getNumOperands() - Def->getNumImplicitOperands() == 2 &&
+ "Invalid number of operands");
+ assert(!Def->hasImplicitDef() && "Only implicit uses are allowed");
+
+ if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
+ // If we look for a different subreg, it means we want a subreg of the src.
+ // Bail as we do not support composing subregs yet.
+ return ValueTrackerResult();
+ // Otherwise, we want the whole source.
+ const MachineOperand &Src = Def->getOperand(1);
+ if (Src.isUndef())
+ return ValueTrackerResult();
+ return ValueTrackerResult(Src.getReg(), Src.getSubReg());
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromBitcast() {
+ assert(Def->isBitcast() && "Invalid definition");
+
+ // Bail if there are effects that a plain copy will not expose.
+ if (Def->mayRaiseFPException() || Def->hasUnmodeledSideEffects())
+ return ValueTrackerResult();
+
+ // Bitcasts with more than one def are not supported.
+ if (Def->getDesc().getNumDefs() != 1)
+ return ValueTrackerResult();
+ const MachineOperand DefOp = Def->getOperand(DefIdx);
+ if (DefOp.getSubReg() != DefSubReg)
+ // If we look for a different subreg, it means we want a subreg of the src.
+ // Bail as we do not support composing subregs yet.
+ return ValueTrackerResult();
+
+ unsigned SrcIdx = Def->getNumOperands();
+ for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx;
+ ++OpIdx) {
+ const MachineOperand &MO = Def->getOperand(OpIdx);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ // Ignore dead implicit defs.
+ if (MO.isImplicit() && MO.isDead())
+ continue;
+ assert(!MO.isDef() && "We should have skipped all the definitions by now");
+ if (SrcIdx != EndOpIdx)
+ // Multiple sources?
+ return ValueTrackerResult();
+ SrcIdx = OpIdx;
+ }
+
+ // In some rare cases, Def has no input, so SrcIdx is out of bounds and the
+ // getOperand(SrcIdx) call below would fail.
+ if (SrcIdx >= Def->getNumOperands())
+ return ValueTrackerResult();
+
+ // Stop if any user of the bitcast is a SUBREG_TO_REG; replacing the bitcast
+ // with a COPY would break the guarantees assumed for the upper bits.
+ for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(DefOp.getReg())) {
+ if (UseMI.isSubregToReg())
+ return ValueTrackerResult();
+ }
+
+ const MachineOperand &Src = Def->getOperand(SrcIdx);
+ if (Src.isUndef())
+ return ValueTrackerResult();
+ return ValueTrackerResult(Src.getReg(), Src.getSubReg());
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() {
+ assert((Def->isRegSequence() || Def->isRegSequenceLike()) &&
+ "Invalid definition");
+
+ if (Def->getOperand(DefIdx).getSubReg())
+ // If we are composing subregs, bail out.
+ // The case we are checking is Def.<subreg> = REG_SEQUENCE.
+ // This should almost never happen as the SSA property is tracked at
+ // the register level (as opposed to the subreg level).
+ // I.e.,
+ // Def.sub0 =
+ // Def.sub1 =
+ // is a valid SSA representation for Def.sub0 and Def.sub1, but not for
+ // Def. Thus, it must not be generated.
+ // However, some code could theoretically generate a single
+ // Def.sub0 (i.e., not defining the other subregs) and we would
+ // have this case.
+ // If we can ascertain (or force) that this never happens, we could
+ // turn that into an assertion.
+ return ValueTrackerResult();
+
+ if (!TII)
+ // We could handle the REG_SEQUENCE here, but we do not want to
+ // duplicate the code from the generic TII.
+ return ValueTrackerResult();
+
+ SmallVector<RegSubRegPairAndIdx, 8> RegSeqInputRegs;
+ if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs))
+ return ValueTrackerResult();
+
+ // We are looking at:
+ // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
+ // Check if one of the operand defines the subreg we are interested in.
+ for (const RegSubRegPairAndIdx &RegSeqInput : RegSeqInputRegs) {
+ if (RegSeqInput.SubIdx == DefSubReg)
+ return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg);
+ }
+
+ // If the subreg we are tracking is super-defined by another subreg,
+ // we could follow this value. However, this would require composing
+ // the subregs, and we do not do that for now.
+ return ValueTrackerResult();
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromInsertSubreg() {
+ assert((Def->isInsertSubreg() || Def->isInsertSubregLike()) &&
+ "Invalid definition");
+
+ if (Def->getOperand(DefIdx).getSubReg())
+ // If we are composing subregs, bail out.
+ // Same remark as getNextSourceFromRegSequence.
+ // I.e., this may be turned into an assert.
+ return ValueTrackerResult();
+
+ if (!TII)
+ // We could handle the INSERT_SUBREG here, but we do not want to
+ // duplicate the code from the generic TII.
+ return ValueTrackerResult();
+
+ RegSubRegPair BaseReg;
+ RegSubRegPairAndIdx InsertedReg;
+ if (!TII->getInsertSubregInputs(*Def, DefIdx, BaseReg, InsertedReg))
+ return ValueTrackerResult();
+
+ // We are looking at:
+ // Def = INSERT_SUBREG v0, v1, sub1
+ // There are two cases:
+ // 1. DefSubReg == sub1, get v1.
+ // 2. DefSubReg != sub1, the value may be available through v0.
+
+ // #1 Check if the inserted register matches the required sub index.
+ if (InsertedReg.SubIdx == DefSubReg) {
+ return ValueTrackerResult(InsertedReg.Reg, InsertedReg.SubReg);
+ }
+ // #2 Otherwise, if the sub register we are looking for is not partially
+ // defined by the inserted element, we can look through the main
+ // register (v0).
+ const MachineOperand &MODef = Def->getOperand(DefIdx);
+ // If the result register (Def) and the base register (v0) do not
+ // have the same register class or if we have to compose
+ // subregisters, bail out.
+ if (MRI.getRegClass(MODef.getReg()) != MRI.getRegClass(BaseReg.Reg) ||
+ BaseReg.SubReg)
+ return ValueTrackerResult();
+
+ // Get the TRI and check if the inserted sub-register overlaps with the
+ // sub-register we are tracking.
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ if (!TRI ||
+ !(TRI->getSubRegIndexLaneMask(DefSubReg) &
+ TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)).none())
+ return ValueTrackerResult();
+ // At this point, the value is available in v0 via the same subreg
+ // we used for Def.
+ return ValueTrackerResult(BaseReg.Reg, DefSubReg);
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromExtractSubreg() {
+ assert((Def->isExtractSubreg() ||
+ Def->isExtractSubregLike()) && "Invalid definition");
+ // We are looking at:
+ // Def = EXTRACT_SUBREG v0, sub0
+
+ // Bail if we have to compose sub registers.
+ // Indeed, if DefSubReg != 0, we would have to compose it with sub0.
+ if (DefSubReg)
+ return ValueTrackerResult();
+
+ if (!TII)
+ // We could handle the EXTRACT_SUBREG here, but we do not want to
+ // duplicate the code from the generic TII.
+ return ValueTrackerResult();
+
+ RegSubRegPairAndIdx ExtractSubregInputReg;
+ if (!TII->getExtractSubregInputs(*Def, DefIdx, ExtractSubregInputReg))
+ return ValueTrackerResult();
+
+ // Bail if we have to compose sub registers.
+ // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0.
+ if (ExtractSubregInputReg.SubReg)
+ return ValueTrackerResult();
+ // Otherwise, the value is available in v0.sub0.
+ return ValueTrackerResult(ExtractSubregInputReg.Reg,
+ ExtractSubregInputReg.SubIdx);
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromSubregToReg() {
+ assert(Def->isSubregToReg() && "Invalid definition");
+ // We are looking at:
+ // Def = SUBREG_TO_REG Imm, v0, sub0
+
+ // Bail if we have to compose sub registers.
+ // If DefSubReg != sub0, we would have to check that all the bits
+ // we track are included in sub0 and if yes, we would have to
+ // determine the right subreg in v0.
+ if (DefSubReg != Def->getOperand(3).getImm())
+ return ValueTrackerResult();
+ // Bail if we have to compose sub registers.
+ // Likewise, if v0.subreg != 0, we would have to compose it with sub0.
+ if (Def->getOperand(2).getSubReg())
+ return ValueTrackerResult();
+
+ return ValueTrackerResult(Def->getOperand(2).getReg(),
+ Def->getOperand(3).getImm());
+}
+
+/// Explore each PHI incoming operand and return its sources.
+ValueTrackerResult ValueTracker::getNextSourceFromPHI() {
+ assert(Def->isPHI() && "Invalid definition");
+ ValueTrackerResult Res;
+
+ // If we look for a different subreg, bail as we do not support composing
+ // subregs yet.
+ if (Def->getOperand(0).getSubReg() != DefSubReg)
+ return ValueTrackerResult();
+
+ // Return all register sources for PHI instructions.
+ for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) {
+ const MachineOperand &MO = Def->getOperand(i);
+ assert(MO.isReg() && "Invalid PHI instruction");
+ // We have no code to deal with undef operands. They shouldn't happen in
+ // normal programs anyway.
+ if (MO.isUndef())
+ return ValueTrackerResult();
+ Res.addSource(MO.getReg(), MO.getSubReg());
+ }
+
+ return Res;
+}
+
+ValueTrackerResult ValueTracker::getNextSourceImpl() {
+ assert(Def && "This method needs a valid definition");
+
+ assert(((Def->getOperand(DefIdx).isDef() &&
+ (DefIdx < Def->getDesc().getNumDefs() ||
+ Def->getDesc().isVariadic())) ||
+ Def->getOperand(DefIdx).isImplicit()) &&
+ "Invalid DefIdx");
+ if (Def->isCopy())
+ return getNextSourceFromCopy();
+ if (Def->isBitcast())
+ return getNextSourceFromBitcast();
+ // All the remaining cases involve "complex" instructions.
+ // Bail if we did not ask for the advanced tracking.
+ if (DisableAdvCopyOpt)
+ return ValueTrackerResult();
+ if (Def->isRegSequence() || Def->isRegSequenceLike())
+ return getNextSourceFromRegSequence();
+ if (Def->isInsertSubreg() || Def->isInsertSubregLike())
+ return getNextSourceFromInsertSubreg();
+ if (Def->isExtractSubreg() || Def->isExtractSubregLike())
+ return getNextSourceFromExtractSubreg();
+ if (Def->isSubregToReg())
+ return getNextSourceFromSubregToReg();
+ if (Def->isPHI())
+ return getNextSourceFromPHI();
+ return ValueTrackerResult();
+}
+
+ValueTrackerResult ValueTracker::getNextSource() {
+ // If we reach a point where we cannot move up in the use-def chain,
+ // there is nothing we can get.
+ if (!Def)
+ return ValueTrackerResult();
+
+ ValueTrackerResult Res = getNextSourceImpl();
+ if (Res.isValid()) {
+ // Update definition, definition index, and subregister for the
+ // next call of getNextSource.
+ // Update the current register.
+ bool OneRegSrc = Res.getNumSources() == 1;
+ if (OneRegSrc)
+ Reg = Res.getSrcReg(0);
+ // Update the result before moving up in the use-def chain
+ // with the instruction containing the last found sources.
+ Res.setInst(Def);
+
+ // If we can still move up in the use-def chain, move to the next
+ // definition.
+ if (!Reg.isPhysical() && OneRegSrc) {
+ MachineRegisterInfo::def_iterator DI = MRI.def_begin(Reg);
+ if (DI != MRI.def_end()) {
+ Def = DI->getParent();
+ DefIdx = DI.getOperandNo();
+ DefSubReg = Res.getSrcSubReg(0);
+ } else {
+ Def = nullptr;
+ }
+ return Res;
+ }
+ }
+ // If we end up here, this means we will not be able to find another source
+ // for the next iteration. Make sure any new call to getNextSource bails out
+ // early by cutting the use-def chain.
+ Def = nullptr;
+ return Res;
+}
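A minimal usage sketch of the value tracker above: walk a use-def chain until it ends or fans out. Only getNextSource(), isValid(), getNumSources(), getSrcReg() and getSrcSubReg() come from the code shown here; the helper name and the ValueTracker constructor arguments are assumptions for illustration.

// Hypothetical helper: follow (Reg, SubReg) as far back as a single source
// can be found and return the last pair reached.
RegSubRegPair findUltimateSource(Register Reg, unsigned SubReg,
                                 const MachineRegisterInfo &MRI,
                                 const TargetInstrInfo *TII) {
  ValueTracker Tracker(Reg, SubReg, MRI, TII); // constructor shape assumed
  RegSubRegPair Cur(Reg, SubReg);
  while (true) {
    ValueTrackerResult Res = Tracker.getNextSource();
    // Stop when tracking fails or fans out (e.g. a PHI with several incoming
    // values); rewriting across a fan-out needs separate handling.
    if (!Res.isValid() || Res.getNumSources() != 1)
      break;
    Cur = RegSubRegPair(Res.getSrcReg(0), Res.getSrcSubReg(0));
  }
  return Cur;
}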
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
new file mode 100644
index 000000000000..97b1532300b1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -0,0 +1,96 @@
+//===----- PostRAHazardRecognizer.cpp - hazard recognizer -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This runs the hazard recognizer and emits noops when necessary. This
+/// gives targets a way to run the hazard recognizer without running one of
+/// the schedulers. Example use cases for this pass would be:
+///
+/// - Targets that need the hazard recognizer to be run at -O0.
+/// - Targets that want to guarantee that hazards at the beginning of
+/// scheduling regions are handled correctly. The post-RA scheduler is
+/// a top-down scheduler, but when there are multiple scheduling regions
+/// in a basic block, it visits the regions in bottom-up order. This
+/// makes it impossible for the scheduler to guarantee it can correctly
+/// handle hazards at the beginning of scheduling regions.
+///
+/// This pass traverses all the instructions in a program in top-down order.
+/// In contrast to the instruction scheduling passes, this pass never resets
+/// the hazard recognizer, so that it can correctly handle noop hazards at
+/// the beginning of blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-hazard-rec"
+
+STATISTIC(NumNoops, "Number of noops inserted");
+
+namespace {
+ class PostRAHazardRecognizer : public MachineFunctionPass {
+
+ public:
+ static char ID;
+ PostRAHazardRecognizer() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ };
+ char PostRAHazardRecognizer::ID = 0;
+
+}
+
+char &llvm::PostRAHazardRecognizerID = PostRAHazardRecognizer::ID;
+
+INITIALIZE_PASS(PostRAHazardRecognizer, DEBUG_TYPE,
+ "Post RA hazard recognizer", false, false)
+
+bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+ std::unique_ptr<ScheduleHazardRecognizer> HazardRec(
+ TII->CreateTargetPostRAHazardRecognizer(Fn));
+
+ // Return if the target has not implemented a hazard recognizer.
+ if (!HazardRec)
+ return false;
+
+ // Loop over all of the basic blocks
+ bool Changed = false;
+ for (auto &MBB : Fn) {
+ // We deliberately do not call HazardRec->Reset() here, so that noop hazards
+ // at the start of basic blocks are still handled.
+ for (MachineInstr &MI : MBB) {
+ // If we need to emit noops prior to this instruction, then do so.
+ unsigned NumPreNoops = HazardRec->PreEmitNoops(&MI);
+ HazardRec->EmitNoops(NumPreNoops);
+ TII->insertNoops(MBB, MachineBasicBlock::iterator(MI), NumPreNoops);
+ NumNoops += NumPreNoops;
+ if (NumPreNoops)
+ Changed = true;
+
+ HazardRec->EmitInstruction(&MI);
+ if (HazardRec->atIssueLimit()) {
+ HazardRec->AdvanceCycle();
+ }
+ }
+ }
+ return Changed;
+}
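The pass above exercises only a handful of hazard-recognizer hooks (PreEmitNoops, EmitNoops, EmitInstruction, atIssueLimit, AdvanceCycle) and does nothing unless CreateTargetPostRAHazardRecognizer returns an object. A purely illustrative recognizer that requests one noop between back-to-back loads might look like the following; the class is hypothetical, and only the hook names the pass calls are relied upon.

// Hypothetical target hazard recognizer, for illustration only.
class ToyLoadLoadHazardRecognizer : public ScheduleHazardRecognizer {
  bool LastWasLoad = false;

public:
  unsigned PreEmitNoops(MachineInstr *MI) override {
    // Separate two consecutive loads with a single noop.
    return (LastWasLoad && MI->mayLoad()) ? 1 : 0;
  }
  void EmitInstruction(MachineInstr *MI) override {
    LastWasLoad = MI->mayLoad();
  }
  void EmitNoop() override { LastWasLoad = false; }
  void Reset() override { LastWasLoad = false; }
};

A target would return such an object from its TargetInstrInfo::CreateTargetPostRAHazardRecognizer override to have this pass drive it, even at -O0.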
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PostRASchedulerList.cpp
new file mode 100644
index 000000000000..170008ab67cb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -0,0 +1,696 @@
+//===----- PostRASchedulerList.cpp - list scheduler -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/AntiDepBreaker.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-sched"
+
+STATISTIC(NumNoops, "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+STATISTIC(NumFixedAnti, "Number of fixed anti-dependencies");
+
+// Post-RA scheduling is enabled with
+// TargetSubtargetInfo::enablePostRAScheduler(). This flag can be used to
+// override the target.
+static cl::opt<bool>
+EnablePostRAScheduler("post-RA-scheduler",
+ cl::desc("Enable scheduling after register allocation"),
+ cl::init(false), cl::Hidden);
+static cl::opt<std::string>
+EnableAntiDepBreaking("break-anti-dependencies",
+ cl::desc("Break post-RA scheduling anti-dependencies: "
+ "\"critical\", \"all\", or \"none\""),
+ cl::init("none"), cl::Hidden);
+
+// If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
+static cl::opt<int>
+DebugDiv("postra-sched-debugdiv",
+ cl::desc("Debug control MBBs that are scheduled"),
+ cl::init(0), cl::Hidden);
+static cl::opt<int>
+DebugMod("postra-sched-debugmod",
+ cl::desc("Debug control MBBs that are scheduled"),
+ cl::init(0), cl::Hidden);
+
+AntiDepBreaker::~AntiDepBreaker() = default;
+
+namespace {
+ class PostRAScheduler : public MachineFunctionPass {
+ const TargetInstrInfo *TII = nullptr;
+ RegisterClassInfo RegClassInfo;
+
+ public:
+ static char ID;
+ PostRAScheduler() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ private:
+ bool enablePostRAScheduler(
+ const TargetSubtargetInfo &ST, CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode &Mode,
+ TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const;
+ };
+ char PostRAScheduler::ID = 0;
+
+ class SchedulePostRATDList : public ScheduleDAGInstrs {
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ ///
+ LatencyPriorityQueue AvailableQueue;
+
+ /// PendingQueue - This contains all of the instructions whose operands have
+ /// been issued, but their results are not ready yet (due to the latency of
+ /// the operation). Once the operands become available, the instruction is
+ /// added to the AvailableQueue.
+ std::vector<SUnit*> PendingQueue;
+
+ /// HazardRec - The hazard recognizer to use.
+ ScheduleHazardRecognizer *HazardRec;
+
+ /// AntiDepBreak - Anti-dependence breaking object, or NULL if none
+ AntiDepBreaker *AntiDepBreak;
+
+ /// AA - AliasAnalysis for making memory reference queries.
+ AliasAnalysis *AA;
+
+ /// The schedule. Null SUnit*'s represent noop instructions.
+ std::vector<SUnit*> Sequence;
+
+ /// Ordered list of DAG postprocessing steps.
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
+
+ /// The index in BB of RegionEnd.
+ ///
+ /// This is the instruction number from the top of the current block, not
+ /// the SlotIndex. It is only used by the AntiDepBreaker.
+ unsigned EndIndex = 0;
+
+ public:
+ SchedulePostRATDList(
+ MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA,
+ const RegisterClassInfo &,
+ TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
+ SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs);
+
+ ~SchedulePostRATDList() override;
+
+ /// startBlock - Initialize register live-range state for scheduling in
+ /// this block.
+ ///
+ void startBlock(MachineBasicBlock *BB) override;
+
+ // Set the index of RegionEnd within the current BB.
+ void setEndIndex(unsigned EndIdx) { EndIndex = EndIdx; }
+
+ /// Initialize the scheduler state for the next scheduling region.
+ void enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) override;
+
+ /// Notify that the scheduler has finished scheduling the current region.
+ void exitRegion() override;
+
+ /// Schedule - Schedule the instruction range using list scheduling.
+ ///
+ void schedule() override;
+
+ void EmitSchedule();
+
+ /// Observe - Update liveness information to account for the current
+ /// instruction, which will not be scheduled.
+ ///
+ void Observe(MachineInstr &MI, unsigned Count);
+
+ /// finishBlock - Clean up register live-range state.
+ ///
+ void finishBlock() override;
+
+ private:
+ /// Apply each ScheduleDAGMutation step in order.
+ void postProcessDAG();
+
+ void ReleaseSucc(SUnit *SU, SDep *SuccEdge);
+ void ReleaseSuccessors(SUnit *SU);
+ void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+ void ListScheduleTopDown();
+
+ void dumpSchedule() const;
+ void emitNoop(unsigned CurCycle);
+ };
+}
+
+char &llvm::PostRASchedulerID = PostRAScheduler::ID;
+
+INITIALIZE_PASS(PostRAScheduler, DEBUG_TYPE,
+ "Post RA top-down list latency scheduler", false, false)
+
+SchedulePostRATDList::SchedulePostRATDList(
+ MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA,
+ const RegisterClassInfo &RCI,
+ TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
+ SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs)
+ : ScheduleDAGInstrs(MF, &MLI), AA(AA) {
+
+ const InstrItineraryData *InstrItins =
+ MF.getSubtarget().getInstrItineraryData();
+ HazardRec =
+ MF.getSubtarget().getInstrInfo()->CreateTargetPostRAHazardRecognizer(
+ InstrItins, this);
+ MF.getSubtarget().getPostRAMutations(Mutations);
+
+ assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE ||
+ MRI.tracksLiveness()) &&
+ "Live-ins must be accurate for anti-dependency breaking");
+ AntiDepBreak = ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL)
+ ? createAggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs)
+ : ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_CRITICAL)
+ ? createCriticalAntiDepBreaker(MF, RCI)
+ : nullptr));
+}
+
+SchedulePostRATDList::~SchedulePostRATDList() {
+ delete HazardRec;
+ delete AntiDepBreak;
+}
+
+/// Initialize state associated with the next scheduling region.
+void SchedulePostRATDList::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) {
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
+ Sequence.clear();
+}
+
+/// Print the schedule before exiting the region.
+void SchedulePostRATDList::exitRegion() {
+ LLVM_DEBUG({
+ dbgs() << "*** Final schedule ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+ ScheduleDAGInstrs::exitRegion();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// dumpSchedule - dump the scheduled Sequence.
+LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const {
+ for (const SUnit *SU : Sequence) {
+ if (SU)
+ dumpNode(*SU);
+ else
+ dbgs() << "**** NOOP ****\n";
+ }
+}
+#endif
+
+bool PostRAScheduler::enablePostRAScheduler(
+ const TargetSubtargetInfo &ST,
+ CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode &Mode,
+ TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const {
+ Mode = ST.getAntiDepBreakMode();
+ ST.getCriticalPathRCs(CriticalPathRCs);
+
+ // Check for explicit enable/disable of post-ra scheduling.
+ if (EnablePostRAScheduler.getPosition() > 0)
+ return EnablePostRAScheduler;
+
+ return ST.enablePostRAScheduler() &&
+ OptLevel >= ST.getOptLevelToEnablePostRAScheduler();
+}
+
+bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(Fn.getFunction()))
+ return false;
+
+ TII = Fn.getSubtarget().getInstrInfo();
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+
+ RegClassInfo.runOnMachineFunction(Fn);
+
+ TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
+ TargetSubtargetInfo::ANTIDEP_NONE;
+ SmallVector<const TargetRegisterClass*, 4> CriticalPathRCs;
+
+ // Check that post-RA scheduling is enabled for this target.
+ // This may upgrade the AntiDepMode.
+ if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
+ AntiDepMode, CriticalPathRCs))
+ return false;
+
+ // Check for antidep breaking override...
+ if (EnableAntiDepBreaking.getPosition() > 0) {
+ AntiDepMode = (EnableAntiDepBreaking == "all")
+ ? TargetSubtargetInfo::ANTIDEP_ALL
+ : ((EnableAntiDepBreaking == "critical")
+ ? TargetSubtargetInfo::ANTIDEP_CRITICAL
+ : TargetSubtargetInfo::ANTIDEP_NONE);
+ }
+
+ LLVM_DEBUG(dbgs() << "PostRAScheduler\n");
+
+ SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode,
+ CriticalPathRCs);
+
+ // Loop over all of the basic blocks
+ for (auto &MBB : Fn) {
+#ifndef NDEBUG
+ // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
+ if (DebugDiv > 0) {
+ static int bbcnt = 0;
+ if (bbcnt++ % DebugDiv != DebugMod)
+ continue;
+ dbgs() << "*** DEBUG scheduling " << Fn.getName() << ":"
+ << printMBBReference(MBB) << " ***\n";
+ }
+#endif
+
+ // Initialize register live-range state for scheduling in this block.
+ Scheduler.startBlock(&MBB);
+
+ // Schedule each sequence of instructions not interrupted by a label
+ // or anything else that effectively needs to shut down scheduling.
+ MachineBasicBlock::iterator Current = MBB.end();
+ unsigned Count = MBB.size(), CurrentCount = Count;
+ for (MachineBasicBlock::iterator I = Current; I != MBB.begin();) {
+ MachineInstr &MI = *std::prev(I);
+ --Count;
+ // Calls are not scheduling boundaries before register allocation, but
+ // post-ra we don't gain anything by scheduling across calls since we
+ // don't need to worry about register pressure.
+ if (MI.isCall() || TII->isSchedulingBoundary(MI, &MBB, Fn)) {
+ Scheduler.enterRegion(&MBB, I, Current, CurrentCount - Count);
+ Scheduler.setEndIndex(CurrentCount);
+ Scheduler.schedule();
+ Scheduler.exitRegion();
+ Scheduler.EmitSchedule();
+ Current = &MI;
+ CurrentCount = Count;
+ Scheduler.Observe(MI, CurrentCount);
+ }
+ I = MI;
+ if (MI.isBundle())
+ Count -= MI.getBundleSize();
+ }
+ assert(Count == 0 && "Instruction count mismatch!");
+ assert((MBB.begin() == Current || CurrentCount != 0) &&
+ "Instruction count mismatch!");
+ Scheduler.enterRegion(&MBB, MBB.begin(), Current, CurrentCount);
+ Scheduler.setEndIndex(CurrentCount);
+ Scheduler.schedule();
+ Scheduler.exitRegion();
+ Scheduler.EmitSchedule();
+
+ // Clean up register live-range state.
+ Scheduler.finishBlock();
+
+ // Update register kills
+ Scheduler.fixupKills(MBB);
+ }
+
+ return true;
+}
+
+/// StartBlock - Initialize register live-range state for scheduling in
+/// this block.
+///
+void SchedulePostRATDList::startBlock(MachineBasicBlock *BB) {
+ // Call the superclass.
+ ScheduleDAGInstrs::startBlock(BB);
+
+ // Reset the hazard recognizer and anti-dep breaker.
+ HazardRec->Reset();
+ if (AntiDepBreak)
+ AntiDepBreak->StartBlock(BB);
+}
+
+/// Schedule - Schedule the instruction range using list scheduling.
+///
+void SchedulePostRATDList::schedule() {
+ // Build the scheduling graph.
+ buildSchedGraph(AA);
+
+ if (AntiDepBreak) {
+ unsigned Broken =
+ AntiDepBreak->BreakAntiDependencies(SUnits, RegionBegin, RegionEnd,
+ EndIndex, DbgValues);
+
+ if (Broken != 0) {
+ // We made changes. Update the dependency graph.
+ // Theoretically we could update the graph in place:
+ // When a live range is changed to use a different register, remove
+ // the def's anti-dependence *and* output-dependence edges due to
+ // that register, and add new anti-dependence and output-dependence
+ // edges based on the next live range of the register.
+ ScheduleDAG::clearDAG();
+ buildSchedGraph(AA);
+
+ NumFixedAnti += Broken;
+ }
+ }
+
+ postProcessDAG();
+
+ LLVM_DEBUG(dbgs() << "********** List Scheduling **********\n");
+ LLVM_DEBUG(dump());
+
+ AvailableQueue.initNodes(SUnits);
+ ListScheduleTopDown();
+ AvailableQueue.releaseState();
+}
+
+/// Observe - Update liveness information to account for the current
+/// instruction, which will not be scheduled.
+///
+void SchedulePostRATDList::Observe(MachineInstr &MI, unsigned Count) {
+ if (AntiDepBreak)
+ AntiDepBreak->Observe(MI, Count, EndIndex);
+}
+
+/// FinishBlock - Clean up register live-range state.
+///
+void SchedulePostRATDList::finishBlock() {
+ if (AntiDepBreak)
+ AntiDepBreak->FinishBlock();
+
+ // Call the superclass.
+ ScheduleDAGInstrs::finishBlock();
+}
+
+/// Apply each ScheduleDAGMutation step in order.
+void SchedulePostRATDList::postProcessDAG() {
+ for (auto &M : Mutations)
+ M->apply(this);
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero.
+void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+ if (SuccEdge->isWeak()) {
+ --SuccSU->WeakPredsLeft;
+ return;
+ }
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(*SuccSU);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ --SuccSU->NumPredsLeft;
+
+ // Standard scheduler algorithms will recompute the depth of the successor
+ // here as such:
+ // SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+ //
+ // However, we lazily compute node depth instead. Note that
+ // ScheduleNodeTopDown has already updated the depth of this node, which causes
+ // all descendants to be marked dirty. Setting the successor depth explicitly
+ // here would cause depth to be recomputed for all its ancestors. If the
+ // successor is not yet ready (because of a transitively redundant edge) then
+ // this causes depth computation to be quadratic in the size of the DAG.
+
+ // If all the node's predecessors are scheduled, this node is ready
+ // to be scheduled. Ignore the special ExitSU node.
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+ PendingQueue.push_back(SuccSU);
+}
+
+/// ReleaseSuccessors - Call ReleaseSucc on each of SU's successors.
+void SchedulePostRATDList::ReleaseSuccessors(SUnit *SU) {
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ ReleaseSucc(SU, &*I);
+ }
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+ LLVM_DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ LLVM_DEBUG(dumpNode(*SU));
+
+ Sequence.push_back(SU);
+ assert(CurCycle >= SU->getDepth() &&
+ "Node scheduled above its depth!");
+ SU->setDepthToAtLeast(CurCycle);
+
+ ReleaseSuccessors(SU);
+ SU->isScheduled = true;
+ AvailableQueue.scheduledNode(SU);
+}
+
+/// emitNoop - Add a noop to the current instruction sequence.
+void SchedulePostRATDList::emitNoop(unsigned CurCycle) {
+ LLVM_DEBUG(dbgs() << "*** Emitting noop in cycle " << CurCycle << '\n');
+ HazardRec->EmitNoop();
+ Sequence.push_back(nullptr); // NULL here means noop
+ ++NumNoops;
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void SchedulePostRATDList::ListScheduleTopDown() {
+ unsigned CurCycle = 0;
+
+ // We're scheduling top-down but we're visiting the regions in
+ // bottom-up order, so we don't know the hazards at the start of a
+ // region. So assume no hazards (this should usually be ok as most
+ // blocks are a single region).
+ HazardRec->Reset();
+
+ // Release any successors of the special Entry node.
+ ReleaseSuccessors(&EntrySU);
+
+ // Add all leaves to Available queue.
+ for (SUnit &SUnit : SUnits) {
+ // It is available if it has no predecessors.
+ if (!SUnit.NumPredsLeft && !SUnit.isAvailable) {
+ AvailableQueue.push(&SUnit);
+ SUnit.isAvailable = true;
+ }
+ }
+
+ // In any cycle where we can't schedule any instructions, we must
+ // stall or emit a noop, depending on the target.
+ bool CycleHasInsts = false;
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ std::vector<SUnit*> NotReady;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue.empty() || !PendingQueue.empty()) {
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ unsigned MinDepth = ~0u;
+ for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+ if (PendingQueue[i]->getDepth() <= CurCycle) {
+ AvailableQueue.push(PendingQueue[i]);
+ PendingQueue[i]->isAvailable = true;
+ PendingQueue[i] = PendingQueue.back();
+ PendingQueue.pop_back();
+ --i; --e;
+ } else if (PendingQueue[i]->getDepth() < MinDepth)
+ MinDepth = PendingQueue[i]->getDepth();
+ }
+
+ LLVM_DEBUG(dbgs() << "\n*** Examining Available\n";
+ AvailableQueue.dump(this));
+
+ SUnit *FoundSUnit = nullptr, *NotPreferredSUnit = nullptr;
+ bool HasNoopHazards = false;
+ while (!AvailableQueue.empty()) {
+ SUnit *CurSUnit = AvailableQueue.pop();
+
+ ScheduleHazardRecognizer::HazardType HT =
+ HazardRec->getHazardType(CurSUnit, 0/*no stalls*/);
+ if (HT == ScheduleHazardRecognizer::NoHazard) {
+ if (HazardRec->ShouldPreferAnother(CurSUnit)) {
+ if (!NotPreferredSUnit) {
+ // If this is the first non-preferred node for this cycle, then
+ // record it and continue searching for a preferred node. If this
+ // is not the first non-preferred node, then treat it as though
+ // there had been a hazard.
+ NotPreferredSUnit = CurSUnit;
+ continue;
+ }
+ } else {
+ FoundSUnit = CurSUnit;
+ break;
+ }
+ }
+
+ // Remember if this is a noop hazard.
+ HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+ NotReady.push_back(CurSUnit);
+ }
+
+ // If we have a non-preferred node, push it back onto the available list.
+ // If we did not find a preferred node, then schedule this first
+ // non-preferred node.
+ if (NotPreferredSUnit) {
+ if (!FoundSUnit) {
+ LLVM_DEBUG(
+ dbgs() << "*** Will schedule a non-preferred instruction...\n");
+ FoundSUnit = NotPreferredSUnit;
+ } else {
+ AvailableQueue.push(NotPreferredSUnit);
+ }
+
+ NotPreferredSUnit = nullptr;
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ if (!NotReady.empty()) {
+ AvailableQueue.push_all(NotReady);
+ NotReady.clear();
+ }
+
+ // If we found a node to schedule...
+ if (FoundSUnit) {
+ // If we need to emit noops prior to this instruction, then do so.
+ unsigned NumPreNoops = HazardRec->PreEmitNoops(FoundSUnit);
+ for (unsigned i = 0; i != NumPreNoops; ++i)
+ emitNoop(CurCycle);
+
+ // ... schedule the node...
+ ScheduleNodeTopDown(FoundSUnit, CurCycle);
+ HazardRec->EmitInstruction(FoundSUnit);
+ CycleHasInsts = true;
+ if (HazardRec->atIssueLimit()) {
+ LLVM_DEBUG(dbgs() << "*** Max instructions per cycle " << CurCycle
+ << '\n');
+ HazardRec->AdvanceCycle();
+ ++CurCycle;
+ CycleHasInsts = false;
+ }
+ } else {
+ if (CycleHasInsts) {
+ LLVM_DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n');
+ HazardRec->AdvanceCycle();
+ } else if (!HasNoopHazards) {
+ // Otherwise, we have a pipeline stall, but no other problem,
+ // just advance the current cycle and try again.
+ LLVM_DEBUG(dbgs() << "*** Stall in cycle " << CurCycle << '\n');
+ HazardRec->AdvanceCycle();
+ ++NumStalls;
+ } else {
+ // Otherwise, we have no instructions to issue and we have instructions
+ // that will fault if we don't do this right. This is the case for
+ // processors without pipeline interlocks and other cases.
+ emitNoop(CurCycle);
+ }
+
+ ++CurCycle;
+ CycleHasInsts = false;
+ }
+ }
+
+#ifndef NDEBUG
+ unsigned ScheduledNodes = VerifyScheduledDAG(/*isBottomUp=*/false);
+ unsigned Noops = llvm::count(Sequence, nullptr);
+ assert(Sequence.size() - Noops == ScheduledNodes &&
+ "The number of nodes scheduled doesn't match the expected number!");
+#endif // NDEBUG
+}
+
+// EmitSchedule - Emit the machine code in scheduled order.
+void SchedulePostRATDList::EmitSchedule() {
+ RegionBegin = RegionEnd;
+
+ // If first instruction was a DBG_VALUE then put it back.
+ if (FirstDbgValue)
+ BB->splice(RegionEnd, BB, FirstDbgValue);
+
+ // Then re-insert them according to the given schedule.
+ for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+ if (SUnit *SU = Sequence[i])
+ BB->splice(RegionEnd, BB, SU->getInstr());
+ else
+ // Null SUnit* is a noop.
+ TII->insertNoop(*BB, RegionEnd);
+
+ // Update the Begin iterator, as the first instruction in the block
+ // may have been scheduled later.
+ if (i == 0)
+ RegionBegin = std::prev(RegionEnd);
+ }
+
+ // Reinsert any remaining debug_values.
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+ DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+ std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
+ MachineInstr *DbgValue = P.first;
+ MachineBasicBlock::iterator OrigPrevMI = P.second;
+ BB->splice(++OrigPrevMI, BB, DbgValue);
+ }
+ DbgValues.clear();
+ FirstDbgValue = nullptr;
+}
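Whether this scheduler runs at all is decided by the subtarget hooks queried in enablePostRAScheduler() above. A hedged sketch of a target opting in; the subtarget class and the register class are hypothetical, while the hook names mirror the queries made by the pass:

// Hypothetical overrides in a target's <Target>Subtarget implementation.
bool MyTargetSubtarget::enablePostRAScheduler() const { return true; }

TargetSubtargetInfo::AntiDepBreakMode
MyTargetSubtarget::getAntiDepBreakMode() const {
  // Request critical-path anti-dependency breaking.
  return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}

void MyTargetSubtarget::getCriticalPathRCs(RegClassVector &CriticalPathRCs) const {
  CriticalPathRCs.clear();
  CriticalPathRCs.push_back(&MyTarget::GPR32RegClass); // hypothetical class
}

For experiments, the hidden flags declared near the top of this file force the same behaviour from the command line, e.g. llc -post-RA-scheduler -break-anti-dependencies=critical.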
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
new file mode 100644
index 000000000000..5b822b5d7b95
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -0,0 +1,415 @@
+//===- PreISelIntrinsicLowering.cpp - Pre-ISel intrinsic lowering pass ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR lowering for the llvm.memcpy, llvm.memmove,
+// llvm.memset, llvm.load.relative and llvm.objc.* intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/Analysis/ObjCARCUtil.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+
+using namespace llvm;
+
+/// Threshold to leave statically sized memory intrinsic calls. Calls of known
+/// size larger than this will be expanded by the pass. Calls of unknown or
+/// lower size will be left for expansion in codegen.
+static cl::opt<int64_t> MemIntrinsicExpandSizeThresholdOpt(
+ "mem-intrinsic-expand-size",
+ cl::desc("Set minimum mem intrinsic size to expand in IR"), cl::init(-1),
+ cl::Hidden);
+
+namespace {
+
+struct PreISelIntrinsicLowering {
+ const TargetMachine &TM;
+ const function_ref<TargetTransformInfo &(Function &)> LookupTTI;
+
+ /// If this is true, assume it is preferable to leave memory intrinsic calls
+ /// for replacement with a library call later, when TargetLowering reports the
+ /// corresponding library function as available. Otherwise they are expanded
+ /// regardless of library call availability.
+ const bool UseMemIntrinsicLibFunc;
+
+ explicit PreISelIntrinsicLowering(
+ const TargetMachine &TM_,
+ function_ref<TargetTransformInfo &(Function &)> LookupTTI_,
+ bool UseMemIntrinsicLibFunc_ = true)
+ : TM(TM_), LookupTTI(LookupTTI_),
+ UseMemIntrinsicLibFunc(UseMemIntrinsicLibFunc_) {}
+
+ static bool shouldExpandMemIntrinsicWithSize(Value *Size,
+ const TargetTransformInfo &TTI);
+ bool expandMemIntrinsicUses(Function &F) const;
+ bool lowerIntrinsics(Module &M) const;
+};
+
+} // namespace
+
+static bool lowerLoadRelative(Function &F) {
+ if (F.use_empty())
+ return false;
+
+ bool Changed = false;
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
+ Type *Int32PtrTy = Int32Ty->getPointerTo();
+ Type *Int8Ty = Type::getInt8Ty(F.getContext());
+
+ for (Use &U : llvm::make_early_inc_range(F.uses())) {
+ auto CI = dyn_cast<CallInst>(U.getUser());
+ if (!CI || CI->getCalledOperand() != &F)
+ continue;
+
+ IRBuilder<> B(CI);
+ Value *OffsetPtr =
+ B.CreateGEP(Int8Ty, CI->getArgOperand(0), CI->getArgOperand(1));
+ Value *OffsetPtrI32 = B.CreateBitCast(OffsetPtr, Int32PtrTy);
+ Value *OffsetI32 = B.CreateAlignedLoad(Int32Ty, OffsetPtrI32, Align(4));
+
+ Value *ResultPtr = B.CreateGEP(Int8Ty, CI->getArgOperand(0), OffsetI32);
+
+ CI->replaceAllUsesWith(ResultPtr);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+// ObjCARC has knowledge about whether an obj-c runtime function needs to be
+// always tail-called or never tail-called.
+static CallInst::TailCallKind getOverridingTailCallKind(const Function &F) {
+ objcarc::ARCInstKind Kind = objcarc::GetFunctionClass(&F);
+ if (objcarc::IsAlwaysTail(Kind))
+ return CallInst::TCK_Tail;
+ else if (objcarc::IsNeverTail(Kind))
+ return CallInst::TCK_NoTail;
+ return CallInst::TCK_None;
+}
+
+static bool lowerObjCCall(Function &F, const char *NewFn,
+ bool setNonLazyBind = false) {
+ assert(IntrinsicInst::mayLowerToFunctionCall(F.getIntrinsicID()) &&
+ "Pre-ISel intrinsics do lower into regular function calls");
+ if (F.use_empty())
+ return false;
+
+ // If we haven't already looked up this function, check to see if the
+ // program already contains a function with this name.
+ Module *M = F.getParent();
+ FunctionCallee FCache = M->getOrInsertFunction(NewFn, F.getFunctionType());
+
+ if (Function *Fn = dyn_cast<Function>(FCache.getCallee())) {
+ Fn->setLinkage(F.getLinkage());
+ if (setNonLazyBind && !Fn->isWeakForLinker()) {
+ // If we have Native ARC, set nonlazybind attribute for these APIs for
+ // performance.
+ Fn->addFnAttr(Attribute::NonLazyBind);
+ }
+ }
+
+ CallInst::TailCallKind OverridingTCK = getOverridingTailCallKind(F);
+
+ for (Use &U : llvm::make_early_inc_range(F.uses())) {
+ auto *CB = cast<CallBase>(U.getUser());
+
+ if (CB->getCalledFunction() != &F) {
+ objcarc::ARCInstKind Kind = objcarc::getAttachedARCFunctionKind(CB);
+ (void)Kind;
+ assert((Kind == objcarc::ARCInstKind::RetainRV ||
+ Kind == objcarc::ARCInstKind::UnsafeClaimRV) &&
+ "use expected to be the argument of operand bundle "
+ "\"clang.arc.attachedcall\"");
+ U.set(FCache.getCallee());
+ continue;
+ }
+
+ auto *CI = cast<CallInst>(CB);
+ assert(CI->getCalledFunction() && "Cannot lower an indirect call!");
+
+ IRBuilder<> Builder(CI->getParent(), CI->getIterator());
+ SmallVector<Value *, 8> Args(CI->args());
+ SmallVector<llvm::OperandBundleDef, 1> BundleList;
+ CI->getOperandBundlesAsDefs(BundleList);
+ CallInst *NewCI = Builder.CreateCall(FCache, Args, BundleList);
+ NewCI->setName(CI->getName());
+
+ // Try to set the most appropriate TailCallKind based on both the current
+ // attributes and the ones that we could get from ObjCARC's special
+ // knowledge of the runtime functions.
+ //
+ // std::max respects both requirements of notail and tail here:
+ // * notail on either the call or from ObjCARC becomes notail
+ // * tail on either side is stronger than none, but not notail
+ CallInst::TailCallKind TCK = CI->getTailCallKind();
+ NewCI->setTailCallKind(std::max(TCK, OverridingTCK));
+
+ if (!CI->use_empty())
+ CI->replaceAllUsesWith(NewCI);
+ CI->eraseFromParent();
+ }
+
+ return true;
+}
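The std::max merge above works because of the declaration order of CallInst::TailCallKind in llvm/IR/Instructions.h; a small, purely illustrative restatement of that assumption:

// TCK_None < TCK_Tail < TCK_NoTail, so taking the maximum lets a "notail"
// requirement from either side win, while "tail" from either side beats
// "none".
static_assert(CallInst::TCK_None < CallInst::TCK_Tail &&
                  CallInst::TCK_Tail < CallInst::TCK_NoTail,
              "the std::max merge relies on this ordering");

// Returns the merged kind for a pair of requirements, as lowerObjCCall does.
inline CallInst::TailCallKind mergeTailCallKinds(CallInst::TailCallKind A,
                                                 CallInst::TailCallKind B) {
  return std::max(A, B); // e.g. merge(TCK_Tail, TCK_NoTail) == TCK_NoTail
}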
+
+// TODO: Should refine based on estimated number of accesses (e.g. does it
+// require splitting based on alignment)
+bool PreISelIntrinsicLowering::shouldExpandMemIntrinsicWithSize(
+ Value *Size, const TargetTransformInfo &TTI) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(Size);
+ if (!CI)
+ return true;
+ uint64_t Threshold = MemIntrinsicExpandSizeThresholdOpt.getNumOccurrences()
+ ? MemIntrinsicExpandSizeThresholdOpt
+ : TTI.getMaxMemIntrinsicInlineSizeThreshold();
+ uint64_t SizeVal = CI->getZExtValue();
+
+ // Treat a threshold of 0 as a special case to force expansion of all
+ // intrinsics, including size 0.
+ return SizeVal > Threshold || Threshold == 0;
+}
+
+static bool canEmitLibcall(const TargetMachine &TM, Function *F,
+ RTLIB::Libcall LC) {
+ // TODO: Should this consider the address space of the memcpy?
+ const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
+ return TLI->getLibcallName(LC) != nullptr;
+}
+
+// TODO: Handle atomic memcpy and memcpy.inline
+// TODO: Pass ScalarEvolution
+bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
+ Intrinsic::ID ID = F.getIntrinsicID();
+ bool Changed = false;
+
+ for (User *U : llvm::make_early_inc_range(F.users())) {
+ Instruction *Inst = cast<Instruction>(U);
+
+ switch (ID) {
+ case Intrinsic::memcpy: {
+ auto *Memcpy = cast<MemCpyInst>(Inst);
+ Function *ParentFunc = Memcpy->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ if (shouldExpandMemIntrinsicWithSize(Memcpy->getLength(), TTI)) {
+ if (UseMemIntrinsicLibFunc &&
+ canEmitLibcall(TM, ParentFunc, RTLIB::MEMCPY))
+ break;
+
+ // TODO: For optsize, emit the loop into a separate function
+ expandMemCpyAsLoop(Memcpy, TTI);
+ Changed = true;
+ Memcpy->eraseFromParent();
+ }
+
+ break;
+ }
+ case Intrinsic::memmove: {
+ auto *Memmove = cast<MemMoveInst>(Inst);
+ Function *ParentFunc = Memmove->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ if (shouldExpandMemIntrinsicWithSize(Memmove->getLength(), TTI)) {
+ if (UseMemIntrinsicLibFunc &&
+ canEmitLibcall(TM, ParentFunc, RTLIB::MEMMOVE))
+ break;
+
+ if (expandMemMoveAsLoop(Memmove, TTI)) {
+ Changed = true;
+ Memmove->eraseFromParent();
+ }
+ }
+
+ break;
+ }
+ case Intrinsic::memset: {
+ auto *Memset = cast<MemSetInst>(Inst);
+ Function *ParentFunc = Memset->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ if (shouldExpandMemIntrinsicWithSize(Memset->getLength(), TTI)) {
+ if (UseMemIntrinsicLibFunc &&
+ canEmitLibcall(TM, ParentFunc, RTLIB::MEMSET))
+ break;
+
+ expandMemSetAsLoop(Memset);
+ Changed = true;
+ Memset->eraseFromParent();
+ }
+
+ break;
+ }
+ default:
+ llvm_unreachable("unhandled intrinsic");
+ }
+ }
+
+ return Changed;
+}
+
+bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
+ bool Changed = false;
+ for (Function &F : M) {
+ switch (F.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ Changed |= expandMemIntrinsicUses(F);
+ break;
+ case Intrinsic::load_relative:
+ Changed |= lowerLoadRelative(F);
+ break;
+ case Intrinsic::objc_autorelease:
+ Changed |= lowerObjCCall(F, "objc_autorelease");
+ break;
+ case Intrinsic::objc_autoreleasePoolPop:
+ Changed |= lowerObjCCall(F, "objc_autoreleasePoolPop");
+ break;
+ case Intrinsic::objc_autoreleasePoolPush:
+ Changed |= lowerObjCCall(F, "objc_autoreleasePoolPush");
+ break;
+ case Intrinsic::objc_autoreleaseReturnValue:
+ Changed |= lowerObjCCall(F, "objc_autoreleaseReturnValue");
+ break;
+ case Intrinsic::objc_copyWeak:
+ Changed |= lowerObjCCall(F, "objc_copyWeak");
+ break;
+ case Intrinsic::objc_destroyWeak:
+ Changed |= lowerObjCCall(F, "objc_destroyWeak");
+ break;
+ case Intrinsic::objc_initWeak:
+ Changed |= lowerObjCCall(F, "objc_initWeak");
+ break;
+ case Intrinsic::objc_loadWeak:
+ Changed |= lowerObjCCall(F, "objc_loadWeak");
+ break;
+ case Intrinsic::objc_loadWeakRetained:
+ Changed |= lowerObjCCall(F, "objc_loadWeakRetained");
+ break;
+ case Intrinsic::objc_moveWeak:
+ Changed |= lowerObjCCall(F, "objc_moveWeak");
+ break;
+ case Intrinsic::objc_release:
+ Changed |= lowerObjCCall(F, "objc_release", true);
+ break;
+ case Intrinsic::objc_retain:
+ Changed |= lowerObjCCall(F, "objc_retain", true);
+ break;
+ case Intrinsic::objc_retainAutorelease:
+ Changed |= lowerObjCCall(F, "objc_retainAutorelease");
+ break;
+ case Intrinsic::objc_retainAutoreleaseReturnValue:
+ Changed |= lowerObjCCall(F, "objc_retainAutoreleaseReturnValue");
+ break;
+ case Intrinsic::objc_retainAutoreleasedReturnValue:
+ Changed |= lowerObjCCall(F, "objc_retainAutoreleasedReturnValue");
+ break;
+ case Intrinsic::objc_retainBlock:
+ Changed |= lowerObjCCall(F, "objc_retainBlock");
+ break;
+ case Intrinsic::objc_storeStrong:
+ Changed |= lowerObjCCall(F, "objc_storeStrong");
+ break;
+ case Intrinsic::objc_storeWeak:
+ Changed |= lowerObjCCall(F, "objc_storeWeak");
+ break;
+ case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue:
+ Changed |= lowerObjCCall(F, "objc_unsafeClaimAutoreleasedReturnValue");
+ break;
+ case Intrinsic::objc_retainedObject:
+ Changed |= lowerObjCCall(F, "objc_retainedObject");
+ break;
+ case Intrinsic::objc_unretainedObject:
+ Changed |= lowerObjCCall(F, "objc_unretainedObject");
+ break;
+ case Intrinsic::objc_unretainedPointer:
+ Changed |= lowerObjCCall(F, "objc_unretainedPointer");
+ break;
+ case Intrinsic::objc_retain_autorelease:
+ Changed |= lowerObjCCall(F, "objc_retain_autorelease");
+ break;
+ case Intrinsic::objc_sync_enter:
+ Changed |= lowerObjCCall(F, "objc_sync_enter");
+ break;
+ case Intrinsic::objc_sync_exit:
+ Changed |= lowerObjCCall(F, "objc_sync_exit");
+ break;
+ }
+ }
+ return Changed;
+}
+
+namespace {
+
+class PreISelIntrinsicLoweringLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+
+ bool runOnModule(Module &M) override {
+ auto LookupTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ const auto &TM = getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ PreISelIntrinsicLowering Lowering(TM, LookupTTI);
+ return Lowering.lowerIntrinsics(M);
+ }
+};
+
+} // end anonymous namespace
+
+char PreISelIntrinsicLoweringLegacyPass::ID;
+
+INITIALIZE_PASS_BEGIN(PreISelIntrinsicLoweringLegacyPass,
+ "pre-isel-intrinsic-lowering",
+ "Pre-ISel Intrinsic Lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(PreISelIntrinsicLoweringLegacyPass,
+ "pre-isel-intrinsic-lowering",
+ "Pre-ISel Intrinsic Lowering", false, false)
+
+ModulePass *llvm::createPreISelIntrinsicLoweringPass() {
+ return new PreISelIntrinsicLoweringLegacyPass();
+}
+
+PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto LookupTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ PreISelIntrinsicLowering Lowering(TM, LookupTTI);
+ if (!Lowering.lowerIntrinsics(M))
+ return PreservedAnalyses::all();
+ else
+ return PreservedAnalyses::none();
+}
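A usage sketch for the new-PM entry point defined above. The constructor taking a TargetMachine is assumed from the TM member used in run(); M, MAM and TM stand for objects the caller already owns:

// Hedged sketch: schedule the pass in a new-PM module pipeline.
ModulePassManager MPM;
MPM.addPass(PreISelIntrinsicLoweringPass(*TM)); // TM: TargetMachine *
MPM.run(M, MAM);                                // MAM: ModuleAnalysisManager

The expansion threshold can be tuned per run with the -mem-intrinsic-expand-size flag declared at the top of the file; otherwise the target's TargetTransformInfo supplies the default via getMaxMemIntrinsicInlineSizeThreshold().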
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
new file mode 100644
index 000000000000..be81ecab9c89
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -0,0 +1,168 @@
+//===---------------------- ProcessImplicitDefs.cpp -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "processimpdefs"
+
+namespace {
+/// Process IMPLICIT_DEF instructions and make sure there is one implicit_def
+/// for each use. Add isUndef marker to implicit_def defs and their uses.
+class ProcessImplicitDefs : public MachineFunctionPass {
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+
+ SmallSetVector<MachineInstr*, 16> WorkList;
+
+ void processImplicitDef(MachineInstr *MI);
+ bool canTurnIntoImplicitDef(MachineInstr *MI);
+
+public:
+ static char ID;
+
+ ProcessImplicitDefs() : MachineFunctionPass(ID) {
+ initializeProcessImplicitDefsPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &au) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+};
+} // end anonymous namespace
+
+char ProcessImplicitDefs::ID = 0;
+char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID;
+
+INITIALIZE_PASS(ProcessImplicitDefs, DEBUG_TYPE,
+ "Process Implicit Definitions", false, false)
+
+void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) {
+ if (!MI->isCopyLike() &&
+ !MI->isInsertSubreg() &&
+ !MI->isRegSequence() &&
+ !MI->isPHI())
+ return false;
+ for (const MachineOperand &MO : MI->all_uses())
+ if (MO.readsReg())
+ return false;
+ return true;
+}
+
+void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
+ LLVM_DEBUG(dbgs() << "Processing " << *MI);
+ Register Reg = MI->getOperand(0).getReg();
+
+ if (Reg.isVirtual()) {
+ // For virtual registers, mark all uses as <undef>, and convert users to
+ // implicit-def when possible.
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ MO.setIsUndef();
+ MachineInstr *UserMI = MO.getParent();
+ if (!canTurnIntoImplicitDef(UserMI))
+ continue;
+ LLVM_DEBUG(dbgs() << "Converting to IMPLICIT_DEF: " << *UserMI);
+ UserMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+ WorkList.insert(UserMI);
+ }
+ MI->eraseFromParent();
+ return;
+ }
+
+ // This is a physreg implicit-def.
+ // Look for the first instruction to use or define an alias.
+ MachineBasicBlock::instr_iterator UserMI = MI->getIterator();
+ MachineBasicBlock::instr_iterator UserE = MI->getParent()->instr_end();
+ bool Found = false;
+ for (++UserMI; UserMI != UserE; ++UserMI) {
+ for (MachineOperand &MO : UserMI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register UserReg = MO.getReg();
+ if (!UserReg.isPhysical() || !TRI->regsOverlap(Reg, UserReg))
+ continue;
+ // UserMI uses or redefines Reg. Set <undef> flags on all uses.
+ Found = true;
+ if (MO.isUse())
+ MO.setIsUndef();
+ }
+ if (Found)
+ break;
+ }
+
+ // If we found the using MI, we can erase the IMPLICIT_DEF.
+ if (Found) {
+ LLVM_DEBUG(dbgs() << "Physreg user: " << *UserMI);
+ MI->eraseFromParent();
+ return;
+ }
+
+  // The using instruction wasn't found; it could be in another block.
+  // Leave the physreg IMPLICIT_DEF, but trim any extra operands.
+ for (unsigned i = MI->getNumOperands() - 1; i; --i)
+ MI->removeOperand(i);
+ LLVM_DEBUG(dbgs() << "Keeping physreg: " << *MI);
+}
+
+/// processImplicitDefs - Process IMPLICIT_DEF instructions and turn them into
+/// <undef> operands.
+bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) {
+
+ LLVM_DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+
+ bool Changed = false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ assert(WorkList.empty() && "Inconsistent worklist state");
+
+ for (MachineBasicBlock &MBB : MF) {
+ // Scan the basic block for implicit defs.
+ for (MachineInstr &MI : MBB)
+ if (MI.isImplicitDef())
+ WorkList.insert(&MI);
+
+ if (WorkList.empty())
+ continue;
+
+ LLVM_DEBUG(dbgs() << printMBBReference(MBB) << " has " << WorkList.size()
+ << " implicit defs.\n");
+ Changed = true;
+
+ // Drain the WorkList to recursively process any new implicit defs.
+ do processImplicitDef(WorkList.pop_back_val());
+ while (!WorkList.empty());
+ }
+ return Changed;
+}
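+
+// A minimal, standard-library-only sketch of the worklist pattern used by
+// runOnMachineFunction above: the list is drained while the handler for one
+// item may discover and enqueue further items. Item and the Process callback
+// are hypothetical stand-ins, not LLVM types; the Seen set is a simplification
+// of SmallSetVector, which only deduplicates items currently in the list.
+#include <functional>
+#include <unordered_set>
+#include <vector>
+namespace worklist_sketch {
+template <typename Item>
+void drain(std::vector<Item> &WorkList,
+           const std::function<void(Item, std::vector<Item> &)> &Process) {
+  std::unordered_set<Item> Seen(WorkList.begin(), WorkList.end());
+  while (!WorkList.empty()) {
+    Item I = WorkList.back();
+    WorkList.pop_back();
+    // Processing one item may report newly discovered work.
+    std::vector<Item> New;
+    Process(I, New);
+    for (const Item &N : New)
+      if (Seen.insert(N).second)
+        WorkList.push_back(N);
+  }
+}
+} // namespace worklist_sketch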
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp
new file mode 100644
index 000000000000..e323aaaeefaf
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -0,0 +1,1580 @@
+//===- PrologEpilogInserter.cpp - Insert Prolog/Epilog code in function ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// the callee-saved registers, and emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation. After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "prologepilog"
+
+using MBBVector = SmallVector<MachineBasicBlock *, 4>;
+
+STATISTIC(NumLeafFuncWithSpills, "Number of leaf functions with CSRs");
+STATISTIC(NumFuncSeen, "Number of functions seen in PEI");
+
+
+namespace {
+
+class PEI : public MachineFunctionPass {
+public:
+ static char ID;
+
+ PEI() : MachineFunctionPass(ID) {
+ initializePEIPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+ /// frame indexes with appropriate references.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ RegScavenger *RS = nullptr;
+
+ // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
+ // stack frame indexes.
+ unsigned MinCSFrameIndex = std::numeric_limits<unsigned>::max();
+ unsigned MaxCSFrameIndex = 0;
+
+ // Save and Restore blocks of the current function. Typically there is a
+ // single save block, unless Windows EH funclets are involved.
+ MBBVector SaveBlocks;
+ MBBVector RestoreBlocks;
+
+ // Flag to control whether to use the register scavenger to resolve
+ // frame index materialization registers. Set according to
+ // TRI->requiresFrameIndexScavenging() for the current function.
+ bool FrameIndexVirtualScavenging = false;
+
+ // Flag to control whether the scavenger should be passed even though
+ // FrameIndexVirtualScavenging is used.
+ bool FrameIndexEliminationScavenging = false;
+
+ // Emit remarks.
+ MachineOptimizationRemarkEmitter *ORE = nullptr;
+
+ void calculateCallFrameInfo(MachineFunction &MF);
+ void calculateSaveRestoreBlocks(MachineFunction &MF);
+ void spillCalleeSavedRegs(MachineFunction &MF);
+
+ void calculateFrameObjectOffsets(MachineFunction &MF);
+ void replaceFrameIndices(MachineFunction &MF);
+ void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
+ int &SPAdj);
+ // Frame indices in debug values are encoded in a target independent
+ // way with simply the frame index and offset rather than any
+ // target-specific addressing mode.
+ bool replaceFrameIndexDebugInstr(MachineFunction &MF, MachineInstr &MI,
+ unsigned OpIdx, int SPAdj = 0);
+  // Does the same as replaceFrameIndices but using a backward MIR walk and
+  // a backward register scavenger walk. Does not yet support call sequence
+  // processing.
+ void replaceFrameIndicesBackward(MachineBasicBlock *BB, MachineFunction &MF,
+ int &SPAdj);
+
+ void insertPrologEpilogCode(MachineFunction &MF);
+ void insertZeroCallUsedRegs(MachineFunction &MF);
+};
+
+} // end anonymous namespace
+
+char PEI::ID = 0;
+
+char &llvm::PrologEpilogCodeInserterID = PEI::ID;
+
+INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
+INITIALIZE_PASS_END(PEI, DEBUG_TYPE,
+ "Prologue/Epilogue Insertion & Frame Finalization", false,
+ false)
+
+MachineFunctionPass *llvm::createPrologEpilogInserterPass() {
+ return new PEI();
+}
+
+STATISTIC(NumBytesStackSpace,
+ "Number of bytes used for stack in all functions");
+
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// StackObjSet - A set of stack object indexes
+using StackObjSet = SmallSetVector<int, 8>;
+
+using SavedDbgValuesMap =
+ SmallDenseMap<MachineBasicBlock *, SmallVector<MachineInstr *, 4>, 4>;
+
+/// Stash DBG_VALUEs that describe parameters and which are placed at the start
+/// of the block. Later on, after the prologue code has been emitted, the
+/// stashed DBG_VALUEs will be reinserted at the start of the block.
+static void stashEntryDbgValues(MachineBasicBlock &MBB,
+ SavedDbgValuesMap &EntryDbgValues) {
+ SmallVector<const MachineInstr *, 4> FrameIndexValues;
+
+ for (auto &MI : MBB) {
+ if (!MI.isDebugInstr())
+ break;
+ if (!MI.isDebugValue() || !MI.getDebugVariable()->isParameter())
+ continue;
+ if (any_of(MI.debug_operands(),
+ [](const MachineOperand &MO) { return MO.isFI(); })) {
+      // We can only emit valid locations for frame indices after the frame
+      // setup, so do not stash them away.
+ FrameIndexValues.push_back(&MI);
+ continue;
+ }
+ const DILocalVariable *Var = MI.getDebugVariable();
+ const DIExpression *Expr = MI.getDebugExpression();
+ auto Overlaps = [Var, Expr](const MachineInstr *DV) {
+ return Var == DV->getDebugVariable() &&
+ Expr->fragmentsOverlap(DV->getDebugExpression());
+ };
+ // See if the debug value overlaps with any preceding debug value that will
+ // not be stashed. If that is the case, then we can't stash this value, as
+ // we would then reorder the values at reinsertion.
+ if (llvm::none_of(FrameIndexValues, Overlaps))
+ EntryDbgValues[&MBB].push_back(&MI);
+ }
+
+ // Remove stashed debug values from the block.
+ if (EntryDbgValues.count(&MBB))
+ for (auto *MI : EntryDbgValues[&MBB])
+ MI->removeFromParent();
+}
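+
+// A standard-library-only sketch of the stashing rule above: walk the entry
+// records in order, keep frame-index values in place, and stash a value only
+// if no earlier kept frame-index value overlaps it, so reinsertion after the
+// prologue cannot reorder overlapping locations. Record and Overlaps are
+// hypothetical stand-ins, not LLVM types.
+#include <functional>
+#include <vector>
+namespace stash_sketch {
+struct Record { bool UsesFrameIndex; int VariableId; };
+inline void splitEntryRecords(
+    const std::vector<Record> &Entry, std::vector<Record> &Stashed,
+    std::vector<Record> &Kept,
+    const std::function<bool(const Record &, const Record &)> &Overlaps) {
+  for (const Record &R : Entry) {
+    if (R.UsesFrameIndex) {
+      Kept.push_back(R); // valid only after frame setup; never stashed
+      continue;
+    }
+    bool Blocked = false;
+    for (const Record &K : Kept)
+      if (K.UsesFrameIndex && Overlaps(K, R)) {
+        Blocked = true;
+        break;
+      }
+    // A blocked value must stay put; stashing it would move it past the
+    // overlapping kept value when it is reinserted later.
+    (Blocked ? Kept : Stashed).push_back(R);
+  }
+}
+} // namespace stash_sketch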
+
+/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+/// frame indexes with appropriate references.
+bool PEI::runOnMachineFunction(MachineFunction &MF) {
+ NumFuncSeen++;
+ const Function &F = MF.getFunction();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+
+ RS = TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr;
+ FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(MF);
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+
+ // Calculate the MaxCallFrameSize and AdjustsStack variables for the
+ // function's frame information. Also eliminates call frame pseudo
+ // instructions.
+ calculateCallFrameInfo(MF);
+
+ // Determine placement of CSR spill/restore code and prolog/epilog code:
+ // place all spills in the entry block, all restores in return blocks.
+ calculateSaveRestoreBlocks(MF);
+
+ // Stash away DBG_VALUEs that should not be moved by insertion of prolog code.
+ SavedDbgValuesMap EntryDbgValues;
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ stashEntryDbgValues(*SaveBlock, EntryDbgValues);
+
+ // Handle CSR spilling and restoring, for targets that need it.
+ if (MF.getTarget().usesPhysRegsForValues())
+ spillCalleeSavedRegs(MF);
+
+ // Allow the target machine to make final modifications to the function
+ // before the frame layout is finalized.
+ TFI->processFunctionBeforeFrameFinalized(MF, RS);
+
+ // Calculate actual frame offsets for all abstract stack objects...
+ calculateFrameObjectOffsets(MF);
+
+ // Add prolog and epilog code to the function. This function is required
+ // to align the stack frame as necessary for any stack variables or
+ // called functions. Because of this, calculateCalleeSavedRegisters()
+ // must be called before this function in order to set the AdjustsStack
+ // and MaxCallFrameSize variables.
+ if (!F.hasFnAttribute(Attribute::Naked))
+ insertPrologEpilogCode(MF);
+
+ // Reinsert stashed debug values at the start of the entry blocks.
+ for (auto &I : EntryDbgValues)
+ I.first->insert(I.first->begin(), I.second.begin(), I.second.end());
+
+ // Allow the target machine to make final modifications to the function
+ // before the frame layout is finalized.
+ TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS);
+
+ // Replace all MO_FrameIndex operands with physical register references
+ // and actual offsets.
+ //
+ replaceFrameIndices(MF);
+
+ // If register scavenging is needed, as we've enabled doing it as a
+ // post-pass, scavenge the virtual registers that frame index elimination
+ // inserted.
+ if (TRI->requiresRegisterScavenging(MF) && FrameIndexVirtualScavenging)
+ scavengeFrameVirtualRegs(MF, *RS);
+
+  // Warn on the stack size when it exceeds the given limit.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint64_t StackSize = MFI.getStackSize();
+
+ unsigned Threshold = UINT_MAX;
+ if (MF.getFunction().hasFnAttribute("warn-stack-size")) {
+ bool Failed = MF.getFunction()
+ .getFnAttribute("warn-stack-size")
+ .getValueAsString()
+ .getAsInteger(10, Threshold);
+ // Verifier should have caught this.
+ assert(!Failed && "Invalid warn-stack-size fn attr value");
+ (void)Failed;
+ }
+ uint64_t UnsafeStackSize = MFI.getUnsafeStackSize();
+ if (MF.getFunction().hasFnAttribute(Attribute::SafeStack))
+ StackSize += UnsafeStackSize;
+
+ if (StackSize > Threshold) {
+ DiagnosticInfoStackSize DiagStackSize(F, StackSize, Threshold, DS_Warning);
+ F.getContext().diagnose(DiagStackSize);
+ int64_t SpillSize = 0;
+ for (int Idx = MFI.getObjectIndexBegin(), End = MFI.getObjectIndexEnd();
+ Idx != End; ++Idx) {
+ if (MFI.isSpillSlotObjectIndex(Idx))
+ SpillSize += MFI.getObjectSize(Idx);
+ }
+
+ [[maybe_unused]] float SpillPct =
+ static_cast<float>(SpillSize) / static_cast<float>(StackSize);
+ LLVM_DEBUG(
+ dbgs() << formatv("{0}/{1} ({3:P}) spills, {2}/{1} ({4:P}) variables",
+ SpillSize, StackSize, StackSize - SpillSize, SpillPct,
+ 1.0f - SpillPct));
+ if (UnsafeStackSize != 0) {
+ LLVM_DEBUG(dbgs() << formatv(", {0}/{2} ({1:P}) unsafe stack",
+ UnsafeStackSize,
+ static_cast<float>(UnsafeStackSize) /
+ static_cast<float>(StackSize),
+ StackSize));
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "StackSize",
+ MF.getFunction().getSubprogram(),
+ &MF.front())
+ << ore::NV("NumStackBytes", StackSize) << " stack bytes in function";
+ });
+
+ delete RS;
+ SaveBlocks.clear();
+ RestoreBlocks.clear();
+ MFI.setSavePoint(nullptr);
+ MFI.setRestorePoint(nullptr);
+ return true;
+}
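+
+// A standard-library-only sketch of the warning policy above: the threshold
+// defaults to "no limit", is overridden by a decimal "warn-stack-size" string
+// when present, and for safestack functions the unsafe-stack bytes count
+// against it. The names here are illustrative, not LLVM APIs.
+#include <charconv>
+#include <climits>
+#include <cstdint>
+#include <string_view>
+namespace stack_warn_sketch {
+inline bool exceedsWarnThreshold(uint64_t StackSize, uint64_t UnsafeStackSize,
+                                 bool HasSafeStack,
+                                 std::string_view WarnAttr /* "" if unset */) {
+  unsigned Threshold = UINT_MAX;
+  if (!WarnAttr.empty()) {
+    auto [Ptr, Ec] = std::from_chars(
+        WarnAttr.data(), WarnAttr.data() + WarnAttr.size(), Threshold, 10);
+    (void)Ptr;
+    (void)Ec; // malformed values are assumed to be rejected by the verifier
+  }
+  if (HasSafeStack)
+    StackSize += UnsafeStackSize;
+  return StackSize > Threshold;
+}
+} // namespace stack_warn_sketch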
+
+/// Calculate the MaxCallFrameSize and AdjustsStack
+/// variables for the function's frame information and eliminate call frame
+/// pseudo instructions.
+void PEI::calculateCallFrameInfo(MachineFunction &MF) {
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ unsigned MaxCallFrameSize = 0;
+ bool AdjustsStack = MFI.adjustsStack();
+
+ // Get the function call frame set-up and tear-down instruction opcode
+ unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+ // Early exit for targets which have no call frame setup/destroy pseudo
+ // instructions.
+ if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
+ return;
+
+ std::vector<MachineBasicBlock::iterator> FrameSDOps;
+ for (MachineBasicBlock &BB : MF)
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I)
+ if (TII.isFrameInstr(*I)) {
+ unsigned Size = TII.getFrameSize(*I);
+ if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
+ AdjustsStack = true;
+ FrameSDOps.push_back(I);
+ } else if (I->isInlineAsm()) {
+ // Some inline asm's need a stack frame, as indicated by operand 1.
+ unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ AdjustsStack = true;
+ }
+
+ assert(!MFI.isMaxCallFrameSizeComputed() ||
+ (MFI.getMaxCallFrameSize() >= MaxCallFrameSize &&
+ !(AdjustsStack && !MFI.adjustsStack())));
+ MFI.setAdjustsStack(AdjustsStack);
+ MFI.setMaxCallFrameSize(MaxCallFrameSize);
+
+ for (MachineBasicBlock::iterator I : FrameSDOps) {
+ // If call frames are not being included as part of the stack frame, and
+ // the target doesn't indicate otherwise, remove the call frame pseudos
+ // here. The sub/add sp instruction pairs are still inserted, but we don't
+ // need to track the SP adjustment for frame index elimination.
+ if (TFI->canSimplifyCallFramePseudos(MF))
+ TFI->eliminateCallFramePseudoInstr(MF, *I->getParent(), I);
+ }
+}
+
+/// Compute the sets of entry and return blocks for saving and restoring
+/// callee-saved registers, and placing prolog and epilog code.
+void PEI::calculateSaveRestoreBlocks(MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Even when we do not change any CSR, we still want to insert the
+ // prologue and epilogue of the function.
+ // So set the save points for those.
+
+ // Use the points found by shrink-wrapping, if any.
+ if (MFI.getSavePoint()) {
+ SaveBlocks.push_back(MFI.getSavePoint());
+ assert(MFI.getRestorePoint() && "Both restore and save must be set");
+ MachineBasicBlock *RestoreBlock = MFI.getRestorePoint();
+ // If RestoreBlock does not have any successor and is not a return block
+ // then the end point is unreachable and we do not need to insert any
+ // epilogue.
+ if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
+ RestoreBlocks.push_back(RestoreBlock);
+ return;
+ }
+
+ // Save refs to entry and return blocks.
+ SaveBlocks.push_back(&MF.front());
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.isEHFuncletEntry())
+ SaveBlocks.push_back(&MBB);
+ if (MBB.isReturnBlock())
+ RestoreBlocks.push_back(&MBB);
+ }
+}
+
+static void assignCalleeSavedSpillSlots(MachineFunction &F,
+ const BitVector &SavedRegs,
+ unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex) {
+ if (SavedRegs.empty())
+ return;
+
+ const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
+ const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs();
+ BitVector CSMask(SavedRegs.size());
+
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ CSMask.set(CSRegs[i]);
+
+ std::vector<CalleeSavedInfo> CSI;
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ if (SavedRegs.test(Reg)) {
+ bool SavedSuper = false;
+ for (const MCPhysReg &SuperReg : RegInfo->superregs(Reg)) {
+ // Some backends set all aliases for some registers as saved, such as
+ // Mips's $fp, so they appear in SavedRegs but not CSRegs.
+ if (SavedRegs.test(SuperReg) && CSMask.test(SuperReg)) {
+ SavedSuper = true;
+ break;
+ }
+ }
+
+ if (!SavedSuper)
+ CSI.push_back(CalleeSavedInfo(Reg));
+ }
+ }
+
+ const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
+ MachineFrameInfo &MFI = F.getFrameInfo();
+ if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI, MinCSFrameIndex,
+ MaxCSFrameIndex)) {
+ // If target doesn't implement this, use generic code.
+
+ if (CSI.empty())
+ return; // Early exit if no callee saved registers are modified!
+
+ unsigned NumFixedSpillSlots;
+ const TargetFrameLowering::SpillSlot *FixedSpillSlots =
+ TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+
+ // Now that we know which registers need to be saved and restored, allocate
+ // stack slots for them.
+ for (auto &CS : CSI) {
+ // If the target has spilled this register to another register, we don't
+ // need to allocate a stack slot.
+ if (CS.isSpilledToReg())
+ continue;
+
+ unsigned Reg = CS.getReg();
+ const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+ int FrameIdx;
+ if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
+ CS.setFrameIdx(FrameIdx);
+ continue;
+ }
+
+ // Check to see if this physreg must be spilled to a particular stack slot
+ // on this target.
+ const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
+ while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
+ FixedSlot->Reg != Reg)
+ ++FixedSlot;
+
+ unsigned Size = RegInfo->getSpillSize(*RC);
+ if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
+ // Nope, just spill it anywhere convenient.
+ Align Alignment = RegInfo->getSpillAlign(*RC);
+ // We may not be able to satisfy the desired alignment specification of
+ // the TargetRegisterClass if the stack alignment is smaller. Use the
+ // min.
+ Alignment = std::min(Alignment, TFI->getStackAlign());
+ FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
+ if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+ if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+ } else {
+ // Spill it to the stack where we must.
+ FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset);
+ }
+
+ CS.setFrameIdx(FrameIdx);
+ }
+ }
+
+ MFI.setCalleeSavedInfo(CSI);
+}
+
+/// Helper function to update the liveness information for the callee-saved
+/// registers.
+static void updateLiveness(MachineFunction &MF) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ // Visited will contain all the basic blocks that are in the region
+ // where the callee saved registers are alive:
+ // - Anything that is not Save or Restore -> LiveThrough.
+ // - Save -> LiveIn.
+ // - Restore -> LiveOut.
+ // The live-out is not attached to the block, so no need to keep
+ // Restore in this set.
+ SmallPtrSet<MachineBasicBlock *, 8> Visited;
+ SmallVector<MachineBasicBlock *, 8> WorkList;
+ MachineBasicBlock *Entry = &MF.front();
+ MachineBasicBlock *Save = MFI.getSavePoint();
+
+ if (!Save)
+ Save = Entry;
+
+ if (Entry != Save) {
+ WorkList.push_back(Entry);
+ Visited.insert(Entry);
+ }
+ Visited.insert(Save);
+
+ MachineBasicBlock *Restore = MFI.getRestorePoint();
+ if (Restore)
+    // By construction, Restore cannot have been visited already; otherwise
+    // there would exist a path to Restore that does not go through Save.
+ WorkList.push_back(Restore);
+
+ while (!WorkList.empty()) {
+ const MachineBasicBlock *CurBB = WorkList.pop_back_val();
+ // By construction, the region that is after the save point is
+ // dominated by the Save and post-dominated by the Restore.
+ if (CurBB == Save && Save != Restore)
+ continue;
+ // Enqueue all the successors not already visited.
+ // Those are by construction either before Save or after Restore.
+ for (MachineBasicBlock *SuccBB : CurBB->successors())
+ if (Visited.insert(SuccBB).second)
+ WorkList.push_back(SuccBB);
+ }
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const CalleeSavedInfo &I : CSI) {
+ for (MachineBasicBlock *MBB : Visited) {
+ MCPhysReg Reg = I.getReg();
+ // Add the callee-saved register as live-in.
+ // It's killed at the spill.
+ if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg))
+ MBB->addLiveIn(Reg);
+ }
+    // If a callee-saved register is spilled to another register rather than
+    // to the stack, the destination register has to be marked as live for
+ // each MBB between the prologue and epilogue so that it is not clobbered
+ // before it is reloaded in the epilogue. The Visited set contains all
+ // blocks outside of the region delimited by prologue/epilogue.
+ if (I.isSpilledToReg()) {
+ for (MachineBasicBlock &MBB : MF) {
+ if (Visited.count(&MBB))
+ continue;
+ MCPhysReg DstReg = I.getDstReg();
+ if (!MBB.isLiveIn(DstReg))
+ MBB.addLiveIn(DstReg);
+ }
+ }
+ }
+}
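+
+// A standard-library-only sketch of the flood above: blocks reachable from
+// the entry without walking through Save, plus blocks reached from Restore,
+// are the blocks where the callee-saved registers still hold the caller's
+// values and are therefore given live-ins. Block indices and the successor
+// lists are hypothetical stand-ins for the MachineBasicBlock graph.
+#include <vector>
+namespace csr_region_sketch {
+inline std::vector<bool>
+blocksOutsideRegion(const std::vector<std::vector<int>> &Succs, int Entry,
+                    int Save, int Restore /* -1 if none */) {
+  std::vector<bool> Visited(Succs.size(), false);
+  std::vector<int> WorkList;
+  if (Entry != Save)
+    WorkList.push_back(Entry);
+  Visited[Entry] = true;
+  Visited[Save] = true;
+  if (Restore >= 0)
+    WorkList.push_back(Restore);
+  while (!WorkList.empty()) {
+    int BB = WorkList.back();
+    WorkList.pop_back();
+    // Do not walk through Save: everything between Save and Restore is the
+    // protected region and is deliberately left unvisited.
+    if (BB == Save && Save != Restore)
+      continue;
+    for (int Succ : Succs[BB])
+      if (!Visited[Succ]) {
+        Visited[Succ] = true;
+        WorkList.push_back(Succ);
+      }
+  }
+  return Visited;
+}
+} // namespace csr_region_sketch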
+
+/// Insert spill code for the callee-saved registers used in the function.
+static void insertCSRSaves(MachineBasicBlock &SaveBlock,
+ ArrayRef<CalleeSavedInfo> CSI) {
+ MachineFunction &MF = *SaveBlock.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ MachineBasicBlock::iterator I = SaveBlock.begin();
+ if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
+ for (const CalleeSavedInfo &CS : CSI) {
+ // Insert the spill to the stack frame.
+ unsigned Reg = CS.getReg();
+
+ if (CS.isSpilledToReg()) {
+ BuildMI(SaveBlock, I, DebugLoc(),
+ TII.get(TargetOpcode::COPY), CS.getDstReg())
+ .addReg(Reg, getKillRegState(true));
+ } else {
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
+ TRI, Register());
+ }
+ }
+ }
+}
+
+/// Insert restore code for the callee-saved registers used in the function.
+static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
+ std::vector<CalleeSavedInfo> &CSI) {
+ MachineFunction &MF = *RestoreBlock.getParent();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ // Restore all registers immediately before the return and any
+ // terminators that precede it.
+ MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator();
+
+ if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
+ for (const CalleeSavedInfo &CI : reverse(CSI)) {
+ unsigned Reg = CI.getReg();
+ if (CI.isSpilledToReg()) {
+ BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)
+ .addReg(CI.getDstReg(), getKillRegState(true));
+ } else {
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC,
+ TRI, Register());
+ assert(I != RestoreBlock.begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert
+ // multiple instructions.
+ }
+ }
+ }
+}
+
+void PEI::spillCalleeSavedRegs(MachineFunction &MF) {
+ // We can't list this requirement in getRequiredProperties because some
+ // targets (WebAssembly) use virtual registers past this point, and the pass
+ // pipeline is set up without giving the passes a chance to look at the
+ // TargetMachine.
+ // FIXME: Find a way to express this in getRequiredProperties.
+ assert(MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoVRegs));
+
+ const Function &F = MF.getFunction();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MinCSFrameIndex = std::numeric_limits<unsigned>::max();
+ MaxCSFrameIndex = 0;
+
+ // Determine which of the registers in the callee save list should be saved.
+ BitVector SavedRegs;
+ TFI->determineCalleeSaves(MF, SavedRegs, RS);
+
+ // Assign stack slots for any callee-saved registers that must be spilled.
+ assignCalleeSavedSpillSlots(MF, SavedRegs, MinCSFrameIndex, MaxCSFrameIndex);
+
+ // Add the code to save and restore the callee saved registers.
+ if (!F.hasFnAttribute(Attribute::Naked)) {
+ MFI.setCalleeSavedInfoValid(true);
+
+ std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ if (!CSI.empty()) {
+ if (!MFI.hasCalls())
+ NumLeafFuncWithSpills++;
+
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ insertCSRSaves(*SaveBlock, CSI);
+
+ // Update the live-in information of all the blocks up to the save point.
+ updateLiveness(MF);
+
+ for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
+ insertCSRRestores(*RestoreBlock, CSI);
+ }
+ }
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ Align &MaxAlign) {
+ // If the stack grows down, add the object size to find the lowest address.
+ if (StackGrowsDown)
+ Offset += MFI.getObjectSize(FrameIdx);
+
+ Align Alignment = MFI.getObjectAlign(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Alignment);
+
+ // Adjust to alignment boundary.
+ Offset = alignTo(Offset, Alignment);
+
+ if (StackGrowsDown) {
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset
+ << "]\n");
+ MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+ } else {
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset
+ << "]\n");
+ MFI.setObjectOffset(FrameIdx, Offset);
+ Offset += MFI.getObjectSize(FrameIdx);
+ }
+}
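+
+// A standard-library-only worked example of the arithmetic above for a
+// down-growing stack: grow the running offset by the object size, round up
+// to the object's alignment, and record the negated offset. Object is a
+// hypothetical stand-in for a frame object.
+#include <cstdint>
+#include <vector>
+namespace offset_sketch {
+struct Object { int64_t Size; int64_t Align; int64_t Offset; };
+inline int64_t alignUp(int64_t V, int64_t A) { return (V + A - 1) / A * A; }
+inline int64_t assignDownGrowing(std::vector<Object> &Objects,
+                                 int64_t Offset /* local area offset */) {
+  for (Object &O : Objects) {
+    Offset += O.Size;                 // distance to the object's low address
+    Offset = alignUp(Offset, O.Align);
+    O.Offset = -Offset;               // offsets below the SP are negative
+  }
+  return Offset;                      // running frame size so far
+}
+// For example, objects of (size 4, align 4) and (size 8, align 16) starting
+// at offset 0 land at SP[-4] and SP[-16], using 16 bytes so far.
+} // namespace offset_sketch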
+
+/// Compute which bytes of fixed and callee-save stack area are unused and keep
+/// track of them in StackBytesFree.
+static inline void
+computeFreeStackSlots(MachineFrameInfo &MFI, bool StackGrowsDown,
+ unsigned MinCSFrameIndex, unsigned MaxCSFrameIndex,
+ int64_t FixedCSEnd, BitVector &StackBytesFree) {
+ // Avoid undefined int64_t -> int conversion below in extreme case.
+ if (FixedCSEnd > std::numeric_limits<int>::max())
+ return;
+
+ StackBytesFree.resize(FixedCSEnd, true);
+
+ SmallVector<int, 16> AllocatedFrameSlots;
+ // Add fixed objects.
+ for (int i = MFI.getObjectIndexBegin(); i != 0; ++i)
+ // StackSlot scavenging is only implemented for the default stack.
+ if (MFI.getStackID(i) == TargetStackID::Default)
+ AllocatedFrameSlots.push_back(i);
+ // Add callee-save objects if there are any.
+ if (MinCSFrameIndex <= MaxCSFrameIndex) {
+ for (int i = MinCSFrameIndex; i <= (int)MaxCSFrameIndex; ++i)
+ if (MFI.getStackID(i) == TargetStackID::Default)
+ AllocatedFrameSlots.push_back(i);
+ }
+
+ for (int i : AllocatedFrameSlots) {
+ // These are converted from int64_t, but they should always fit in int
+ // because of the FixedCSEnd check above.
+ int ObjOffset = MFI.getObjectOffset(i);
+ int ObjSize = MFI.getObjectSize(i);
+ int ObjStart, ObjEnd;
+ if (StackGrowsDown) {
+ // ObjOffset is negative when StackGrowsDown is true.
+ ObjStart = -ObjOffset - ObjSize;
+ ObjEnd = -ObjOffset;
+ } else {
+ ObjStart = ObjOffset;
+ ObjEnd = ObjOffset + ObjSize;
+ }
+ // Ignore fixed holes that are in the previous stack frame.
+ if (ObjEnd > 0)
+ StackBytesFree.reset(ObjStart, ObjEnd);
+ }
+}
+
+/// Assign frame object to an unused portion of the stack in the fixed stack
+/// object range. Return true if the allocation was successful.
+static inline bool scavengeStackSlot(MachineFrameInfo &MFI, int FrameIdx,
+ bool StackGrowsDown, Align MaxAlign,
+ BitVector &StackBytesFree) {
+ if (MFI.isVariableSizedObjectIndex(FrameIdx))
+ return false;
+
+ if (StackBytesFree.none()) {
+    // Clear it to speed up later scavengeStackSlot calls to
+    // StackBytesFree.none().
+ StackBytesFree.clear();
+ return false;
+ }
+
+ Align ObjAlign = MFI.getObjectAlign(FrameIdx);
+ if (ObjAlign > MaxAlign)
+ return false;
+
+ int64_t ObjSize = MFI.getObjectSize(FrameIdx);
+ int FreeStart;
+ for (FreeStart = StackBytesFree.find_first(); FreeStart != -1;
+ FreeStart = StackBytesFree.find_next(FreeStart)) {
+
+ // Check that free space has suitable alignment.
+ unsigned ObjStart = StackGrowsDown ? FreeStart + ObjSize : FreeStart;
+ if (alignTo(ObjStart, ObjAlign) != ObjStart)
+ continue;
+
+ if (FreeStart + ObjSize > StackBytesFree.size())
+ return false;
+
+ bool AllBytesFree = true;
+ for (unsigned Byte = 0; Byte < ObjSize; ++Byte)
+ if (!StackBytesFree.test(FreeStart + Byte)) {
+ AllBytesFree = false;
+ break;
+ }
+ if (AllBytesFree)
+ break;
+ }
+
+ if (FreeStart == -1)
+ return false;
+
+ if (StackGrowsDown) {
+ int ObjStart = -(FreeStart + ObjSize);
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP["
+ << ObjStart << "]\n");
+ MFI.setObjectOffset(FrameIdx, ObjStart);
+ } else {
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP["
+ << FreeStart << "]\n");
+ MFI.setObjectOffset(FrameIdx, FreeStart);
+ }
+
+ StackBytesFree.reset(FreeStart, FreeStart + ObjSize);
+ return true;
+}
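+
+// A standard-library-only sketch of the scavenging above: scan a bitmap of
+// free bytes in the fixed/callee-save area for a suitably aligned run that
+// fits the object, claim it, and return its start (-1 on failure). A
+// down-growing stack is assumed, so alignment is checked on FreeStart + Size.
+#include <cstdint>
+#include <vector>
+namespace scavenge_sketch {
+inline int64_t claimFreeRun(std::vector<bool> &BytesFree, int64_t Size,
+                            int64_t Align) {
+  for (int64_t Start = 0; Start + Size <= (int64_t)BytesFree.size(); ++Start) {
+    if ((Start + Size) % Align != 0)
+      continue;                       // object address would be misaligned
+    bool AllFree = true;
+    for (int64_t B = 0; B < Size && AllFree; ++B)
+      AllFree = BytesFree[Start + B];
+    if (!AllFree)
+      continue;
+    for (int64_t B = 0; B < Size; ++B)
+      BytesFree[Start + B] = false;   // claim the run
+    return Start;
+  }
+  return -1;                          // caller falls back to a fresh slot
+}
+} // namespace scavenge_sketch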
+
+/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
+/// those required to be close to the Stack Protector) to stack offsets.
+static void AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
+ SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo &MFI, bool StackGrowsDown,
+ int64_t &Offset, Align &MaxAlign) {
+
+ for (int i : UnassignedObjs) {
+ AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+ ProtectedObjs.insert(i);
+ }
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Start at the beginning of the local area.
+ // The Offset is the distance from the stack top in the direction
+ // of stack growth -- so it's always nonnegative.
+ int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+ if (StackGrowsDown)
+ LocalAreaOffset = -LocalAreaOffset;
+ assert(LocalAreaOffset >= 0
+ && "Local area offset should be in direction of stack growth");
+ int64_t Offset = LocalAreaOffset;
+
+#ifdef EXPENSIVE_CHECKS
+ for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i)
+ if (!MFI.isDeadObjectIndex(i) &&
+ MFI.getStackID(i) == TargetStackID::Default)
+ assert(MFI.getObjectAlign(i) <= MFI.getMaxAlign() &&
+ "MaxAlignment is invalid");
+#endif
+
+ // If there are fixed sized objects that are preallocated in the local area,
+ // non-fixed objects can't be allocated right at the start of local area.
+ // Adjust 'Offset' to point to the end of last fixed sized preallocated
+ // object.
+ for (int i = MFI.getObjectIndexBegin(); i != 0; ++i) {
+ // Only allocate objects on the default stack.
+ if (MFI.getStackID(i) != TargetStackID::Default)
+ continue;
+
+ int64_t FixedOff;
+ if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at the lower address
+      // of the object -- which is given by its offset. For a down-growing
+      // stack the offset is negative, so we negate it to get the distance.
+ FixedOff = -MFI.getObjectOffset(i);
+ } else {
+ // The maximum distance from the start pointer is at the upper
+ // address of the object.
+ FixedOff = MFI.getObjectOffset(i) + MFI.getObjectSize(i);
+ }
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+
+ Align MaxAlign = MFI.getMaxAlign();
+ // First assign frame offsets to stack objects that are used to spill
+ // callee saved registers.
+ if (MaxCSFrameIndex >= MinCSFrameIndex) {
+ for (unsigned i = 0; i <= MaxCSFrameIndex - MinCSFrameIndex; ++i) {
+ unsigned FrameIndex =
+ StackGrowsDown ? MinCSFrameIndex + i : MaxCSFrameIndex - i;
+
+ // Only allocate objects on the default stack.
+ if (MFI.getStackID(FrameIndex) != TargetStackID::Default)
+ continue;
+
+ // TODO: should this just be if (MFI.isDeadObjectIndex(FrameIndex))
+ if (!StackGrowsDown && MFI.isDeadObjectIndex(FrameIndex))
+ continue;
+
+ AdjustStackOffset(MFI, FrameIndex, StackGrowsDown, Offset, MaxAlign);
+ }
+ }
+
+ assert(MaxAlign == MFI.getMaxAlign() &&
+ "MFI.getMaxAlign should already account for all callee-saved "
+ "registers without a fixed stack slot");
+
+ // FixedCSEnd is the stack offset to the end of the fixed and callee-save
+ // stack area.
+ int64_t FixedCSEnd = Offset;
+
+ // Make sure the special register scavenging spill slot is closest to the
+ // incoming stack pointer if a frame pointer is required and is closer
+ // to the incoming rather than the final stack pointer.
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ bool EarlyScavengingSlots = TFI.allocateScavengingFrameIndexesNearIncomingSP(MF);
+ if (RS && EarlyScavengingSlots) {
+ SmallVector<int, 2> SFIs;
+ RS->getScavengingFrameIndices(SFIs);
+ for (int SFI : SFIs)
+ AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign);
+ }
+
+  // FIXME: Once this is working, the enable flag will change to a target
+ // check for whether the frame is large enough to want to use virtual
+ // frame index registers. Functions which don't want/need this optimization
+ // will continue to use the existing code path.
+ if (MFI.getUseLocalStackAllocationBlock()) {
+ Align Alignment = MFI.getLocalFrameMaxAlign();
+
+ // Adjust to alignment boundary.
+ Offset = alignTo(Offset, Alignment);
+
+ LLVM_DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+ // Resolve offsets for objects in the local block.
+ for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
+ std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
+ int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+ LLVM_DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << FIOffset
+ << "]\n");
+ MFI.setObjectOffset(Entry.first, FIOffset);
+ }
+ // Allocate the local block
+ Offset += MFI.getLocalFrameSize();
+
+ MaxAlign = std::max(Alignment, MaxAlign);
+ }
+
+ // Retrieve the Exception Handler registration node.
+ int EHRegNodeFrameIndex = std::numeric_limits<int>::max();
+ if (const WinEHFuncInfo *FuncInfo = MF.getWinEHFuncInfo())
+ EHRegNodeFrameIndex = FuncInfo->EHRegNodeFrameIndex;
+
+ // Make sure that the stack protector comes before the local variables on the
+ // stack.
+ SmallSet<int, 16> ProtectedObjs;
+ if (MFI.hasStackProtectorIndex()) {
+ int StackProtectorFI = MFI.getStackProtectorIndex();
+ StackObjSet LargeArrayObjs;
+ StackObjSet SmallArrayObjs;
+ StackObjSet AddrOfObjs;
+
+ // If we need a stack protector, we need to make sure that
+ // LocalStackSlotPass didn't already allocate a slot for it.
+ // If we are told to use the LocalStackAllocationBlock, the stack protector
+ // is expected to be already pre-allocated.
+ if (MFI.getStackID(StackProtectorFI) != TargetStackID::Default) {
+ // If the stack protector isn't on the default stack then it's up to the
+ // target to set the stack offset.
+ assert(MFI.getObjectOffset(StackProtectorFI) != 0 &&
+ "Offset of stack protector on non-default stack expected to be "
+ "already set.");
+ assert(!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex()) &&
+ "Stack protector on non-default stack expected to not be "
+ "pre-allocated by LocalStackSlotPass.");
+ } else if (!MFI.getUseLocalStackAllocationBlock()) {
+ AdjustStackOffset(MFI, StackProtectorFI, StackGrowsDown, Offset,
+ MaxAlign);
+ } else if (!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex())) {
+ llvm_unreachable(
+ "Stack protector not pre-allocated by LocalStackSlotPass.");
+ }
+
+ // Assign large stack objects first.
+ for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+ if (MFI.isObjectPreAllocated(i) && MFI.getUseLocalStackAllocationBlock())
+ continue;
+ if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+ continue;
+ if (RS && RS->isScavengingFrameIndex((int)i))
+ continue;
+ if (MFI.isDeadObjectIndex(i))
+ continue;
+ if (StackProtectorFI == (int)i || EHRegNodeFrameIndex == (int)i)
+ continue;
+ // Only allocate objects on the default stack.
+ if (MFI.getStackID(i) != TargetStackID::Default)
+ continue;
+
+ switch (MFI.getObjectSSPLayout(i)) {
+ case MachineFrameInfo::SSPLK_None:
+ continue;
+ case MachineFrameInfo::SSPLK_SmallArray:
+ SmallArrayObjs.insert(i);
+ continue;
+ case MachineFrameInfo::SSPLK_AddrOf:
+ AddrOfObjs.insert(i);
+ continue;
+ case MachineFrameInfo::SSPLK_LargeArray:
+ LargeArrayObjs.insert(i);
+ continue;
+ }
+ llvm_unreachable("Unexpected SSPLayoutKind.");
+ }
+
+ // We expect **all** the protected stack objects to be pre-allocated by
+ // LocalStackSlotPass. If it turns out that PEI still has to allocate some
+ // of them, we may end up messing up the expected order of the objects.
+ if (MFI.getUseLocalStackAllocationBlock() &&
+ !(LargeArrayObjs.empty() && SmallArrayObjs.empty() &&
+ AddrOfObjs.empty()))
+ llvm_unreachable("Found protected stack objects not pre-allocated by "
+ "LocalStackSlotPass.");
+
+ AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ }
+
+ SmallVector<int, 8> ObjectsToAllocate;
+
+ // Then prepare to assign frame offsets to stack objects that are not used to
+ // spill callee saved registers.
+ for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+ if (MFI.isObjectPreAllocated(i) && MFI.getUseLocalStackAllocationBlock())
+ continue;
+ if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+ continue;
+ if (RS && RS->isScavengingFrameIndex((int)i))
+ continue;
+ if (MFI.isDeadObjectIndex(i))
+ continue;
+ if (MFI.getStackProtectorIndex() == (int)i || EHRegNodeFrameIndex == (int)i)
+ continue;
+ if (ProtectedObjs.count(i))
+ continue;
+ // Only allocate objects on the default stack.
+ if (MFI.getStackID(i) != TargetStackID::Default)
+ continue;
+
+ // Add the objects that we need to allocate to our working set.
+ ObjectsToAllocate.push_back(i);
+ }
+
+ // Allocate the EH registration node first if one is present.
+ if (EHRegNodeFrameIndex != std::numeric_limits<int>::max())
+ AdjustStackOffset(MFI, EHRegNodeFrameIndex, StackGrowsDown, Offset,
+ MaxAlign);
+
+ // Give the targets a chance to order the objects the way they like it.
+ if (MF.getTarget().getOptLevel() != CodeGenOpt::None &&
+ MF.getTarget().Options.StackSymbolOrdering)
+ TFI.orderFrameObjects(MF, ObjectsToAllocate);
+
+  // Keep track of which bytes in the fixed and callee-save range are used so
+  // we can use the holes when allocating later stack objects. Only do this if
+  // the stack protector isn't being used, the target requests it, and we're
+  // optimizing.
+ BitVector StackBytesFree;
+ if (!ObjectsToAllocate.empty() &&
+ MF.getTarget().getOptLevel() != CodeGenOpt::None &&
+ MFI.getStackProtectorIndex() < 0 && TFI.enableStackSlotScavenging(MF))
+ computeFreeStackSlots(MFI, StackGrowsDown, MinCSFrameIndex, MaxCSFrameIndex,
+ FixedCSEnd, StackBytesFree);
+
+ // Now walk the objects and actually assign base offsets to them.
+ for (auto &Object : ObjectsToAllocate)
+ if (!scavengeStackSlot(MFI, Object, StackGrowsDown, MaxAlign,
+ StackBytesFree))
+ AdjustStackOffset(MFI, Object, StackGrowsDown, Offset, MaxAlign);
+
+ // Make sure the special register scavenging spill slot is closest to the
+ // stack pointer.
+ if (RS && !EarlyScavengingSlots) {
+ SmallVector<int, 2> SFIs;
+ RS->getScavengingFrameIndices(SFIs);
+ for (int SFI : SFIs)
+ AdjustStackOffset(MFI, SFI, StackGrowsDown, Offset, MaxAlign);
+ }
+
+ if (!TFI.targetHandlesStackFrameRounding()) {
+ // If we have reserved argument space for call sites in the function
+ // immediately on entry to the current function, count it as part of the
+ // overall stack size.
+ if (MFI.adjustsStack() && TFI.hasReservedCallFrame(MF))
+ Offset += MFI.getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or alloca's, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ Align StackAlign;
+ if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
+ (RegInfo->hasStackRealignment(MF) && MFI.getObjectIndexEnd() != 0))
+ StackAlign = TFI.getStackAlign();
+ else
+ StackAlign = TFI.getTransientStackAlign();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ int64_t OffsetBeforeAlignment = Offset;
+ Offset = alignTo(Offset, StackAlign);
+
+    // If we have increased the offset to fulfill the alignment constraints,
+    // then the scavenging spill slots may become harder to reach from the
+    // stack pointer, so float them to keep them close.
+ if (StackGrowsDown && OffsetBeforeAlignment != Offset && RS &&
+ !EarlyScavengingSlots) {
+ SmallVector<int, 2> SFIs;
+ RS->getScavengingFrameIndices(SFIs);
+ LLVM_DEBUG(if (!SFIs.empty()) llvm::dbgs()
+ << "Adjusting emergency spill slots!\n";);
+ int64_t Delta = Offset - OffsetBeforeAlignment;
+ for (int SFI : SFIs) {
+ LLVM_DEBUG(llvm::dbgs()
+ << "Adjusting offset of emergency spill slot #" << SFI
+ << " from " << MFI.getObjectOffset(SFI););
+ MFI.setObjectOffset(SFI, MFI.getObjectOffset(SFI) - Delta);
+ LLVM_DEBUG(llvm::dbgs() << " to " << MFI.getObjectOffset(SFI) << "\n";);
+ }
+ }
+ }
+
+ // Update frame info to pretend that this is part of the stack...
+ int64_t StackSize = Offset - LocalAreaOffset;
+ MFI.setStackSize(StackSize);
+ NumBytesStackSpace += StackSize;
+}
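+
+// A standard-library-only sketch of the final rounding above: reserve the
+// outgoing call-frame area when the target keeps it in the frame, then round
+// the frame to the larger of the stack alignment and the largest object
+// alignment so SP-relative offsets remain valid when the FP is eliminated.
+#include <algorithm>
+#include <cstdint>
+namespace frame_round_sketch {
+inline int64_t finalFrameSize(int64_t Offset, int64_t MaxCallFrameSize,
+                              bool ReservedCallFrame, int64_t StackAlign,
+                              int64_t MaxObjAlign, int64_t LocalAreaOffset) {
+  if (ReservedCallFrame)
+    Offset += MaxCallFrameSize;
+  int64_t Align = std::max(StackAlign, MaxObjAlign);
+  Offset = (Offset + Align - 1) / Align * Align;
+  return Offset - LocalAreaOffset;    // the value recorded as the stack size
+}
+} // namespace frame_round_sketch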
+
+/// insertPrologEpilogCode - Scan the function for modified callee saved
+/// registers, insert spill code for these callee saved registers, then add
+/// prolog and epilog code to the function.
+void PEI::insertPrologEpilogCode(MachineFunction &MF) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+
+ // Add prologue to the function...
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.emitPrologue(MF, *SaveBlock);
+
+ // Add epilogue to restore the callee-save registers in each exiting block.
+ for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
+ TFI.emitEpilogue(MF, *RestoreBlock);
+
+ // Zero call used registers before restoring callee-saved registers.
+ insertZeroCallUsedRegs(MF);
+
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.inlineStackProbe(MF, *SaveBlock);
+
+ // Emit additional code that is required to support segmented stacks, if
+ // we've been asked for it. This, when linked with a runtime with support
+ // for segmented stacks (libgcc is one), will result in allocating stack
+ // space in small chunks instead of one large contiguous block.
+ if (MF.shouldSplitStack()) {
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.adjustForSegmentedStacks(MF, *SaveBlock);
+ }
+
+ // Emit additional code that is required to explicitly handle the stack in
+ // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
+ // approach is rather similar to that of Segmented Stacks, but it uses a
+ // different conditional check and another BIF for allocating more stack
+ // space.
+ if (MF.getFunction().getCallingConv() == CallingConv::HiPE)
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.adjustForHiPEPrologue(MF, *SaveBlock);
+}
+
+/// insertZeroCallUsedRegs - Zero out call used registers.
+void PEI::insertZeroCallUsedRegs(MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+
+ if (!F.hasFnAttribute("zero-call-used-regs"))
+ return;
+
+ using namespace ZeroCallUsedRegs;
+
+ ZeroCallUsedRegsKind ZeroRegsKind =
+ StringSwitch<ZeroCallUsedRegsKind>(
+ F.getFnAttribute("zero-call-used-regs").getValueAsString())
+ .Case("skip", ZeroCallUsedRegsKind::Skip)
+ .Case("used-gpr-arg", ZeroCallUsedRegsKind::UsedGPRArg)
+ .Case("used-gpr", ZeroCallUsedRegsKind::UsedGPR)
+ .Case("used-arg", ZeroCallUsedRegsKind::UsedArg)
+ .Case("used", ZeroCallUsedRegsKind::Used)
+ .Case("all-gpr-arg", ZeroCallUsedRegsKind::AllGPRArg)
+ .Case("all-gpr", ZeroCallUsedRegsKind::AllGPR)
+ .Case("all-arg", ZeroCallUsedRegsKind::AllArg)
+ .Case("all", ZeroCallUsedRegsKind::All);
+
+ if (ZeroRegsKind == ZeroCallUsedRegsKind::Skip)
+ return;
+
+ const bool OnlyGPR = static_cast<unsigned>(ZeroRegsKind) & ONLY_GPR;
+ const bool OnlyUsed = static_cast<unsigned>(ZeroRegsKind) & ONLY_USED;
+ const bool OnlyArg = static_cast<unsigned>(ZeroRegsKind) & ONLY_ARG;
+
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const BitVector AllocatableSet(TRI.getAllocatableSet(MF));
+
+ // Mark all used registers.
+ BitVector UsedRegs(TRI.getNumRegs());
+ if (OnlyUsed)
+ for (const MachineBasicBlock &MBB : MF)
+ for (const MachineInstr &MI : MBB) {
+ // skip debug instructions
+ if (MI.isDebugInstr())
+ continue;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ MCRegister Reg = MO.getReg();
+ if (AllocatableSet[Reg] && !MO.isImplicit() &&
+ (MO.isDef() || MO.isUse()))
+ UsedRegs.set(Reg);
+ }
+ }
+
+ // Get a list of registers that are used.
+ BitVector LiveIns(TRI.getNumRegs());
+ for (const MachineBasicBlock::RegisterMaskPair &LI : MF.front().liveins())
+ LiveIns.set(LI.PhysReg);
+
+ BitVector RegsToZero(TRI.getNumRegs());
+ for (MCRegister Reg : AllocatableSet.set_bits()) {
+ // Skip over fixed registers.
+ if (TRI.isFixedRegister(MF, Reg))
+ continue;
+
+ // Want only general purpose registers.
+ if (OnlyGPR && !TRI.isGeneralPurposeRegister(MF, Reg))
+ continue;
+
+ // Want only used registers.
+ if (OnlyUsed && !UsedRegs[Reg])
+ continue;
+
+ // Want only registers used for arguments.
+ if (OnlyArg) {
+ if (OnlyUsed) {
+ if (!LiveIns[Reg])
+ continue;
+ } else if (!TRI.isArgumentRegister(MF, Reg)) {
+ continue;
+ }
+ }
+
+ RegsToZero.set(Reg);
+ }
+
+ // Don't clear registers that are live when leaving the function.
+ for (const MachineBasicBlock &MBB : MF)
+ for (const MachineInstr &MI : MBB.terminators()) {
+ if (!MI.isReturn())
+ continue;
+
+ for (const auto &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ MCRegister Reg = MO.getReg();
+
+        // This picks up sibling registers (e.g. %al -> %ah).
+ for (MCRegUnit Unit : TRI.regunits(Reg))
+ RegsToZero.reset(Unit);
+
+ for (MCPhysReg SReg : TRI.sub_and_superregs_inclusive(Reg))
+ RegsToZero.reset(SReg);
+ }
+ }
+
+ // Don't need to clear registers that are used/clobbered by terminating
+ // instructions.
+ for (const MachineBasicBlock &MBB : MF) {
+ if (!MBB.isReturnBlock())
+ continue;
+
+ MachineBasicBlock::const_iterator MBBI = MBB.getFirstTerminator();
+ for (MachineBasicBlock::const_iterator I = MBBI, E = MBB.end(); I != E;
+ ++I) {
+ for (const MachineOperand &MO : I->operands()) {
+ if (!MO.isReg())
+ continue;
+
+ for (const MCPhysReg &Reg :
+ TRI.sub_and_superregs_inclusive(MO.getReg()))
+ RegsToZero.reset(Reg);
+ }
+ }
+ }
+
+ // Don't clear registers that must be preserved.
+ for (const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
+ MCPhysReg CSReg = *CSRegs; ++CSRegs)
+ for (MCRegister Reg : TRI.sub_and_superregs_inclusive(CSReg))
+ RegsToZero.reset(Reg);
+
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ for (MachineBasicBlock &MBB : MF)
+ if (MBB.isReturnBlock())
+ TFI.emitZeroCallUsedRegs(RegsToZero, MBB);
+}
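+
+// A standard-library-only sketch of how the "zero-call-used-regs" modes above
+// decompose into the three predicates used in the register selection loop:
+// each mode is a bitmask over "only GPRs", "only used", and "only argument
+// registers". The enum values here are illustrative, not the LLVM definitions.
+#include <cstdint>
+#include <string_view>
+namespace zero_regs_sketch {
+enum : uint8_t { OnlyGPR = 1, OnlyUsed = 2, OnlyArg = 4, Skip = 8 };
+inline uint8_t parseZeroRegsKind(std::string_view S) {
+  if (S == "skip")         return Skip;
+  if (S == "used-gpr-arg") return OnlyUsed | OnlyGPR | OnlyArg;
+  if (S == "used-gpr")     return OnlyUsed | OnlyGPR;
+  if (S == "used-arg")     return OnlyUsed | OnlyArg;
+  if (S == "used")         return OnlyUsed;
+  if (S == "all-gpr-arg")  return OnlyGPR | OnlyArg;
+  if (S == "all-gpr")      return OnlyGPR;
+  if (S == "all-arg")      return OnlyArg;
+  return 0;                // "all": no restriction on the candidate set
+}
+} // namespace zero_regs_sketch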
+
+/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
+/// register references and actual offsets.
+void PEI::replaceFrameIndices(MachineFunction &MF) {
+ const auto &ST = MF.getSubtarget();
+ const TargetFrameLowering &TFI = *ST.getFrameLowering();
+ if (!TFI.needsFrameIndexResolution(MF))
+ return;
+
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+ // Allow the target to determine this after knowing the frame size.
+ FrameIndexEliminationScavenging = (RS && !FrameIndexVirtualScavenging) ||
+ TRI->requiresFrameIndexReplacementScavenging(MF);
+
+ // Store SPAdj at exit of a basic block.
+ SmallVector<int, 8> SPState;
+ SPState.resize(MF.getNumBlockIDs());
+ df_iterator_default_set<MachineBasicBlock*> Reachable;
+
+ // Iterate over the reachable blocks in DFS order.
+ for (auto DFI = df_ext_begin(&MF, Reachable), DFE = df_ext_end(&MF, Reachable);
+ DFI != DFE; ++DFI) {
+ int SPAdj = 0;
+ // Check the exit state of the DFS stack predecessor.
+ if (DFI.getPathLength() >= 2) {
+ MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
+ assert(Reachable.count(StackPred) &&
+ "DFS stack predecessor is already visited.\n");
+ SPAdj = SPState[StackPred->getNumber()];
+ }
+ MachineBasicBlock *BB = *DFI;
+ replaceFrameIndices(BB, MF, SPAdj);
+ SPState[BB->getNumber()] = SPAdj;
+ }
+
+ // Handle the unreachable blocks.
+ for (auto &BB : MF) {
+ if (Reachable.count(&BB))
+ // Already handled in DFS traversal.
+ continue;
+ int SPAdj = 0;
+ replaceFrameIndices(&BB, MF, SPAdj);
+ }
+}
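+
+// A standard-library-only sketch of the SP-adjustment propagation above: a
+// depth-first walk seeds each block's entry SPAdj from the exit SPAdj of its
+// DFS predecessor, and unreachable blocks fall back to zero. The successor
+// lists and the ProcessBlock callback are hypothetical stand-ins.
+#include <functional>
+#include <utility>
+#include <vector>
+namespace spadj_sketch {
+inline void propagateSPAdj(
+    const std::vector<std::vector<int>> &Succs, int Entry,
+    const std::function<void(int /*BB*/, int & /*SPAdj*/)> &ProcessBlock) {
+  std::vector<int> ExitSPAdj(Succs.size(), 0);
+  std::vector<bool> Reached(Succs.size(), false);
+  // Iterative DFS; each stack entry remembers the block it was reached from.
+  std::vector<std::pair<int, int>> Stack{{Entry, -1}};
+  while (!Stack.empty()) {
+    auto [BB, Pred] = Stack.back();
+    Stack.pop_back();
+    if (Reached[BB])
+      continue;
+    Reached[BB] = true;
+    int SPAdj = Pred < 0 ? 0 : ExitSPAdj[Pred];
+    ProcessBlock(BB, SPAdj);          // may change SPAdj across the block
+    ExitSPAdj[BB] = SPAdj;
+    for (int Succ : Succs[BB])
+      Stack.emplace_back(Succ, BB);
+  }
+  for (int BB = 0, E = (int)Succs.size(); BB != E; ++BB)
+    if (!Reached[BB]) {
+      int SPAdj = 0;                  // unreachable blocks start from zero
+      ProcessBlock(BB, SPAdj);
+    }
+}
+} // namespace spadj_sketch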
+
+bool PEI::replaceFrameIndexDebugInstr(MachineFunction &MF, MachineInstr &MI,
+ unsigned OpIdx, int SPAdj) {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ if (MI.isDebugValue()) {
+
+ MachineOperand &Op = MI.getOperand(OpIdx);
+ assert(MI.isDebugOperand(&Op) &&
+ "Frame indices can only appear as a debug operand in a DBG_VALUE*"
+ " machine instruction");
+ Register Reg;
+ unsigned FrameIdx = Op.getIndex();
+ unsigned Size = MF.getFrameInfo().getObjectSize(FrameIdx);
+
+ StackOffset Offset = TFI->getFrameIndexReference(MF, FrameIdx, Reg);
+ Op.ChangeToRegister(Reg, false /*isDef*/);
+
+ const DIExpression *DIExpr = MI.getDebugExpression();
+
+ // If we have a direct DBG_VALUE, and its location expression isn't
+ // currently complex, then adding an offset will morph it into a
+ // complex location that is interpreted as being a memory address.
+ // This changes a pointer-valued variable to dereference that pointer,
+ // which is incorrect. Fix by adding DW_OP_stack_value.
+
+ if (MI.isNonListDebugValue()) {
+ unsigned PrependFlags = DIExpression::ApplyOffset;
+ if (!MI.isIndirectDebugValue() && !DIExpr->isComplex())
+ PrependFlags |= DIExpression::StackValue;
+
+      // If we have a DBG_VALUE that is indirect and has an implicit location
+      // expression, we need to insert a deref before prepending a memory
+      // location expression. After doing this we also change the DBG_VALUE
+      // to be direct.
+ if (MI.isIndirectDebugValue() && DIExpr->isImplicit()) {
+ SmallVector<uint64_t, 2> Ops = {dwarf::DW_OP_deref_size, Size};
+ bool WithStackValue = true;
+ DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue);
+ // Make the DBG_VALUE direct.
+ MI.getDebugOffset().ChangeToRegister(0, false);
+ }
+ DIExpr = TRI.prependOffsetExpression(DIExpr, PrependFlags, Offset);
+ } else {
+      // The debug operand at DebugOpIndex was a frame index at offset
+      // `Offset`; now that the operand has been replaced with the frame
+      // register, we must express the location as `register x, plus Offset`.
+ unsigned DebugOpIndex = MI.getDebugOperandIndex(&Op);
+ SmallVector<uint64_t, 3> Ops;
+ TRI.getOffsetOpcodes(Offset, Ops);
+ DIExpr = DIExpression::appendOpsToArg(DIExpr, Ops, DebugOpIndex);
+ }
+ MI.getDebugExpressionOp().setMetadata(DIExpr);
+ return true;
+ }
+
+ if (MI.isDebugPHI()) {
+ // Allow stack ref to continue onwards.
+ return true;
+ }
+
+ // TODO: This code should be commoned with the code for
+ // PATCHPOINT. There's no good reason for the difference in
+ // implementation other than historical accident. The only
+ // remaining difference is the unconditional use of the stack
+ // pointer as the base register.
+ if (MI.getOpcode() == TargetOpcode::STATEPOINT) {
+ assert((!MI.isDebugValue() || OpIdx == 0) &&
+ "Frame indicies can only appear as the first operand of a "
+ "DBG_VALUE machine instruction");
+ Register Reg;
+ MachineOperand &Offset = MI.getOperand(OpIdx + 1);
+ StackOffset refOffset = TFI->getFrameIndexReferencePreferSP(
+ MF, MI.getOperand(OpIdx).getIndex(), Reg, /*IgnoreSPUpdates*/ false);
+ assert(!refOffset.getScalable() &&
+ "Frame offsets with a scalable component are not supported");
+ Offset.setImm(Offset.getImm() + refOffset.getFixed() + SPAdj);
+ MI.getOperand(OpIdx).ChangeToRegister(Reg, false /*isDef*/);
+ return true;
+ }
+ return false;
+}
+
+void PEI::replaceFrameIndicesBackward(MachineBasicBlock *BB,
+ MachineFunction &MF, int &SPAdj) {
+ assert(MF.getSubtarget().getRegisterInfo() &&
+ "getRegisterInfo() must be implemented!");
+
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+
+ RegScavenger *LocalRS = FrameIndexEliminationScavenging ? RS : nullptr;
+ if (LocalRS)
+ LocalRS->enterBasicBlockEnd(*BB);
+
+ for (MachineInstr &MI : make_early_inc_range(reverse(*BB))) {
+ if (TII.isFrameInstr(MI)) {
+ TFI.eliminateCallFramePseudoInstr(MF, *BB, &MI);
+ continue;
+ }
+
+    // Step backwards to get the liveness state at (immediately after) MI.
+ if (LocalRS)
+ LocalRS->backward(MI);
+
+ for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
+ if (!MI.getOperand(i).isFI())
+ continue;
+
+ if (replaceFrameIndexDebugInstr(MF, MI, i, SPAdj))
+ continue;
+
+ // Eliminate this FrameIndex operand.
+ //
+ // Save and restore the scavenger's position around the call to
+ // eliminateFrameIndex in case it erases MI and invalidates the iterator.
+ MachineBasicBlock::iterator Save;
+ if (LocalRS)
+ Save = std::next(LocalRS->getCurrentPosition());
+ bool Removed = TRI.eliminateFrameIndex(MI, SPAdj, i, RS);
+ if (LocalRS)
+ LocalRS->skipTo(std::prev(Save));
+
+ if (Removed)
+ break;
+ }
+ }
+}
+
+void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
+ int &SPAdj) {
+ assert(MF.getSubtarget().getRegisterInfo() &&
+ "getRegisterInfo() must be implemented!");
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+
+ if (TRI.supportsBackwardScavenger())
+ return replaceFrameIndicesBackward(BB, MF, SPAdj);
+
+ if (RS && FrameIndexEliminationScavenging)
+ RS->enterBasicBlock(*BB);
+
+ bool InsideCallSequence = false;
+
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+ if (TII.isFrameInstr(*I)) {
+ InsideCallSequence = TII.isFrameSetup(*I);
+ SPAdj += TII.getSPAdjust(*I);
+ I = TFI->eliminateCallFramePseudoInstr(MF, *BB, I);
+ continue;
+ }
+
+ MachineInstr &MI = *I;
+ bool DoIncr = true;
+ bool DidFinishLoop = true;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isFI())
+ continue;
+
+ if (replaceFrameIndexDebugInstr(MF, MI, i, SPAdj))
+ continue;
+
+ // Some instructions (e.g. inline asm instructions) can have
+ // multiple frame indices and/or cause eliminateFrameIndex
+ // to insert more than one instruction. We need the register
+ // scavenger to go through all of these instructions so that
+ // it can update its register information. We keep the
+ // iterator at the point before insertion so that we can
+ // revisit them in full.
+ bool AtBeginning = (I == BB->begin());
+ if (!AtBeginning) --I;
+
+ // If this instruction has a FrameIndex operand, we need to
+ // use that target machine register info object to eliminate
+ // it.
+ TRI.eliminateFrameIndex(MI, SPAdj, i,
+ FrameIndexEliminationScavenging ? RS : nullptr);
+
+ // Reset the iterator if we were at the beginning of the BB.
+ if (AtBeginning) {
+ I = BB->begin();
+ DoIncr = false;
+ }
+
+ DidFinishLoop = false;
+ break;
+ }
+
+ // If we are looking at a call sequence, we need to keep track of
+ // the SP adjustment made by each instruction in the sequence.
+ // This includes both the frame setup/destroy pseudos (handled above),
+ // as well as other instructions that have side effects w.r.t the SP.
+ // Note that this must come after eliminateFrameIndex, because
+ // if I itself referred to a frame index, we shouldn't count its own
+ // adjustment.
+ if (DidFinishLoop && InsideCallSequence)
+ SPAdj += TII.getSPAdjust(MI);
+
+ if (DoIncr && I != BB->end()) ++I;
+
+ // Update register states.
+ if (RS && FrameIndexEliminationScavenging && DidFinishLoop)
+ RS->forward(MI);
+ }
+}
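+
+// Editorial note (not part of the patch): this forward walk and
+// replaceFrameIndicesBackward above differ mainly in the scavenger direction
+// (RS->forward here versus LocalRS->backward there) and in the iterator
+// bookkeeping required because eliminateFrameIndex may insert or erase
+// instructions around the one currently being visited.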
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PseudoProbeInserter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PseudoProbeInserter.cpp
new file mode 100644
index 000000000000..913e0035b046
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PseudoProbeInserter.cpp
@@ -0,0 +1,150 @@
+//===- PseudoProbeInserter.cpp - Insert annotation for callsite profiling -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PseudoProbeInserter pass, which inserts pseudo
+// probe annotations for call instructions that carry a pseudo-probe-specific
+// DWARF discriminator. Such a discriminator indicates that the call
+// instruction comes with a pseudo probe, and the discriminator value holds
+// the information needed to identify the corresponding counter.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PseudoProbe.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "pseudo-probe-inserter"
+
+using namespace llvm;
+
+namespace {
+class PseudoProbeInserter : public MachineFunctionPass {
+public:
+ static char ID;
+
+ PseudoProbeInserter() : MachineFunctionPass(ID) {
+ initializePseudoProbeInserterPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "Pseudo Probe Inserter"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool doInitialization(Module &M) override {
+ ShouldRun = M.getNamedMetadata(PseudoProbeDescMetadataName);
+ return false;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (!ShouldRun)
+ return false;
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ MachineInstr *FirstInstr = nullptr;
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isPseudo())
+ FirstInstr = &MI;
+ if (MI.isCall()) {
+ if (DILocation *DL = MI.getDebugLoc()) {
+ auto Value = DL->getDiscriminator();
+ if (DILocation::isPseudoProbeDiscriminator(Value)) {
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::PSEUDO_PROBE))
+ .addImm(getFuncGUID(MF.getFunction().getParent(), DL))
+ .addImm(
+ PseudoProbeDwarfDiscriminator::extractProbeIndex(Value))
+ .addImm(
+ PseudoProbeDwarfDiscriminator::extractProbeType(Value))
+ .addImm(PseudoProbeDwarfDiscriminator::extractProbeAttributes(
+ Value));
+ Changed = true;
+ }
+ }
+ }
+ }
+
+      // Walk the block backwards and move PSEUDO_PROBE before the first real
+      // instruction to fix out-of-order probes. Probes that end up as the
+      // terminator of a block are a problem: during offline counts processing,
+      // the samples collected on the first physical instruction following a
+      // probe are counted towards the probe. This is logically equivalent to
+      // treating the instruction next to a probe as if it came from the same
+      // block as the probe. That is accurate most of the time, unless the
+      // instruction can be reached from multiple flows, which means it
+      // actually starts a new block. Samples collected on such probes may
+      // cause imprecision in the counts inference algorithm. Fortunately, if
+      // there are still other native instructions preceding the probe, we can
+      // use them as a placeholder to collect samples for the probe.
+ if (FirstInstr) {
+ auto MII = MBB.rbegin();
+ while (MII != MBB.rend()) {
+ // Skip all pseudo probes followed by a real instruction since they
+ // are not dangling.
+ if (!MII->isPseudo())
+ break;
+ auto Cur = MII++;
+ if (Cur->getOpcode() != TargetOpcode::PSEUDO_PROBE)
+ continue;
+ // Move the dangling probe before FirstInstr.
+ auto *ProbeInstr = &*Cur;
+ MBB.remove(ProbeInstr);
+ MBB.insert(FirstInstr, ProbeInstr);
+ Changed = true;
+ }
+ } else {
+        // Probes not surrounded by any real instructions in the same block are
+        // called dangling probes. Since there is no good way to pick a sample
+        // collection point for dangling probes at compile time, they are
+        // removed so that the profile correlation tool will not report any
+        // samples collected for them, and it is up to the counts inference
+        // tool to assign them a reasonable count.
+ SmallVector<MachineInstr *, 4> ToBeRemoved;
+ for (MachineInstr &MI : MBB) {
+ if (MI.isPseudoProbe())
+ ToBeRemoved.push_back(&MI);
+ }
+
+ for (auto *MI : ToBeRemoved)
+ MI->eraseFromParent();
+
+ Changed |= !ToBeRemoved.empty();
+ }
+ }
+
+ return Changed;
+ }
+
+private:
+ uint64_t getFuncGUID(Module *M, DILocation *DL) {
+ auto Name = DL->getSubprogramLinkageName();
+ return Function::getGUID(Name);
+ }
+
+ bool ShouldRun = false;
+};
+} // namespace
+
+char PseudoProbeInserter::ID = 0;
+INITIALIZE_PASS_BEGIN(PseudoProbeInserter, DEBUG_TYPE,
+ "Insert pseudo probe annotations for value profiling",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(PseudoProbeInserter, DEBUG_TYPE,
+ "Insert pseudo probe annotations for value profiling",
+ false, false)
+
+FunctionPass *llvm::createPseudoProbeInserter() {
+ return new PseudoProbeInserter();
+}
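+
+// Editorial sketch (not part of the patch): for a call whose DILocation
+// carries a pseudo-probe discriminator, the pass materializes the probe as a
+// separate pseudo instruction immediately before the call, e.g. (all operand
+// values below are hypothetical):
+//
+//   PSEUDO_PROBE 6699318081062747564, 3, 0, 0
+//   CALL64pcrel32 @callee, ...
+//
+// where the operands are, in order, the enclosing function's GUID, the probe
+// index, the probe type, and the probe attributes decoded from the
+// discriminator.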
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/PseudoSourceValue.cpp b/contrib/llvm-project/llvm/lib/CodeGen/PseudoSourceValue.cpp
new file mode 100644
index 000000000000..40c52b9d9707
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/PseudoSourceValue.cpp
@@ -0,0 +1,146 @@
+//===-- llvm/CodeGen/PseudoSourceValue.cpp ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PseudoSourceValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static const char *const PSVNames[] = {
+ "Stack", "GOT", "JumpTable", "ConstantPool", "FixedStack",
+ "GlobalValueCallEntry", "ExternalSymbolCallEntry"};
+
+PseudoSourceValue::PseudoSourceValue(unsigned Kind, const TargetMachine &TM)
+ : Kind(Kind) {
+ AddressSpace = TM.getAddressSpaceForPseudoSourceKind(Kind);
+}
+
+PseudoSourceValue::~PseudoSourceValue() = default;
+
+void PseudoSourceValue::printCustom(raw_ostream &O) const {
+ if (Kind < TargetCustom)
+ O << PSVNames[Kind];
+ else
+ O << "TargetCustom" << Kind;
+}
+
+bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
+ if (isStack())
+ return false;
+ if (isGOT() || isConstantPool() || isJumpTable())
+ return true;
+ llvm_unreachable("Unknown PseudoSourceValue!");
+}
+
+bool PseudoSourceValue::isAliased(const MachineFrameInfo *) const {
+ if (isStack() || isGOT() || isConstantPool() || isJumpTable())
+ return false;
+ llvm_unreachable("Unknown PseudoSourceValue!");
+}
+
+bool PseudoSourceValue::mayAlias(const MachineFrameInfo *) const {
+ return !(isGOT() || isConstantPool() || isJumpTable());
+}
+
+bool FixedStackPseudoSourceValue::isConstant(
+ const MachineFrameInfo *MFI) const {
+ return MFI && MFI->isImmutableObjectIndex(FI);
+}
+
+bool FixedStackPseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
+ if (!MFI)
+ return true;
+ return MFI->isAliasedObjectIndex(FI);
+}
+
+bool FixedStackPseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const {
+ if (!MFI)
+ return true;
+ // Spill slots will not alias any LLVM IR value.
+ return !MFI->isSpillSlotObjectIndex(FI);
+}
+
+void FixedStackPseudoSourceValue::printCustom(raw_ostream &OS) const {
+ OS << "FixedStack" << FI;
+}
+
+CallEntryPseudoSourceValue::CallEntryPseudoSourceValue(unsigned Kind,
+ const TargetMachine &TM)
+ : PseudoSourceValue(Kind, TM) {}
+
+bool CallEntryPseudoSourceValue::isConstant(const MachineFrameInfo *) const {
+ return false;
+}
+
+bool CallEntryPseudoSourceValue::isAliased(const MachineFrameInfo *) const {
+ return false;
+}
+
+bool CallEntryPseudoSourceValue::mayAlias(const MachineFrameInfo *) const {
+ return false;
+}
+
+GlobalValuePseudoSourceValue::GlobalValuePseudoSourceValue(
+ const GlobalValue *GV, const TargetMachine &TM)
+ : CallEntryPseudoSourceValue(GlobalValueCallEntry, TM), GV(GV) {}
+ExternalSymbolPseudoSourceValue::ExternalSymbolPseudoSourceValue(
+ const char *ES, const TargetMachine &TM)
+ : CallEntryPseudoSourceValue(ExternalSymbolCallEntry, TM), ES(ES) {}
+
+PseudoSourceValueManager::PseudoSourceValueManager(const TargetMachine &TMInfo)
+ : TM(TMInfo), StackPSV(PseudoSourceValue::Stack, TM),
+ GOTPSV(PseudoSourceValue::GOT, TM),
+ JumpTablePSV(PseudoSourceValue::JumpTable, TM),
+ ConstantPoolPSV(PseudoSourceValue::ConstantPool, TM) {}
+
+const PseudoSourceValue *PseudoSourceValueManager::getStack() {
+ return &StackPSV;
+}
+
+const PseudoSourceValue *PseudoSourceValueManager::getGOT() { return &GOTPSV; }
+
+const PseudoSourceValue *PseudoSourceValueManager::getConstantPool() {
+ return &ConstantPoolPSV;
+}
+
+const PseudoSourceValue *PseudoSourceValueManager::getJumpTable() {
+ return &JumpTablePSV;
+}
+
+const PseudoSourceValue *
+PseudoSourceValueManager::getFixedStack(int FI) {
+ std::unique_ptr<FixedStackPseudoSourceValue> &V = FSValues[FI];
+ if (!V)
+ V = std::make_unique<FixedStackPseudoSourceValue>(FI, TM);
+ return V.get();
+}
+
+const PseudoSourceValue *
+PseudoSourceValueManager::getGlobalValueCallEntry(const GlobalValue *GV) {
+ std::unique_ptr<const GlobalValuePseudoSourceValue> &E =
+ GlobalCallEntries[GV];
+ if (!E)
+ E = std::make_unique<GlobalValuePseudoSourceValue>(GV, TM);
+ return E.get();
+}
+
+const PseudoSourceValue *
+PseudoSourceValueManager::getExternalSymbolCallEntry(const char *ES) {
+ std::unique_ptr<const ExternalSymbolPseudoSourceValue> &E =
+ ExternalCallEntries[ES];
+ if (!E)
+ E = std::make_unique<ExternalSymbolPseudoSourceValue>(ES, TM);
+ return E.get();
+}
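+
+// Editorial usage sketch (assumption, not part of the patch): consumers
+// normally reach these objects through the MachineFunction's manager, e.g.
+//
+//   MachineMemOperand *MMO = MF.getMachineMemOperand(
+//       MachinePointerInfo(MF.getPSVManager().getFixedStack(FI)),
+//       MachineMemOperand::MOLoad, Size, Alignment);
+//
+// The manager caches one FixedStackPseudoSourceValue per frame index, so
+// repeated calls with the same FI return the same pointer.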
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp
new file mode 100644
index 000000000000..abf3b1e6fbb9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RDFGraph.cpp
@@ -0,0 +1,1799 @@
+//===- RDFGraph.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target-independent, SSA-based data flow graph for register data flow (RDF).
+//
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFRegisters.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <set>
+#include <utility>
+#include <vector>
+
+// Printing functions. Have them here first, so that the rest of the code
+// can use them.
+namespace llvm::rdf {
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterRef> &P) {
+ P.G.getPRI().print(OS, P.Obj);
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<NodeId> &P) {
+ if (P.Obj == 0)
+ return OS << "null";
+ auto NA = P.G.addr<NodeBase *>(P.Obj);
+ uint16_t Attrs = NA.Addr->getAttrs();
+ uint16_t Kind = NodeAttrs::kind(Attrs);
+ uint16_t Flags = NodeAttrs::flags(Attrs);
+ switch (NodeAttrs::type(Attrs)) {
+ case NodeAttrs::Code:
+ switch (Kind) {
+ case NodeAttrs::Func:
+ OS << 'f';
+ break;
+ case NodeAttrs::Block:
+ OS << 'b';
+ break;
+ case NodeAttrs::Stmt:
+ OS << 's';
+ break;
+ case NodeAttrs::Phi:
+ OS << 'p';
+ break;
+ default:
+ OS << "c?";
+ break;
+ }
+ break;
+ case NodeAttrs::Ref:
+ if (Flags & NodeAttrs::Undef)
+ OS << '/';
+ if (Flags & NodeAttrs::Dead)
+ OS << '\\';
+ if (Flags & NodeAttrs::Preserving)
+ OS << '+';
+ if (Flags & NodeAttrs::Clobbering)
+ OS << '~';
+ switch (Kind) {
+ case NodeAttrs::Use:
+ OS << 'u';
+ break;
+ case NodeAttrs::Def:
+ OS << 'd';
+ break;
+ case NodeAttrs::Block:
+ OS << 'b';
+ break;
+ default:
+ OS << "r?";
+ break;
+ }
+ break;
+ default:
+ OS << '?';
+ break;
+ }
+ OS << P.Obj;
+ if (Flags & NodeAttrs::Shadow)
+ OS << '"';
+ return OS;
+}
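+
+// Editorial note (illustration, not part of the patch): the tags printed
+// above read as <flags><kind><id>, e.g. "d42" is def node 42, "u13" a use,
+// "p7" a phi, "s9" a statement; a leading '/' marks an undef ref, '\' a dead
+// ref, '+' preserving, '~' clobbering, and a trailing '"' marks a shadow.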
+
+static void printRefHeader(raw_ostream &OS, const Ref RA,
+ const DataFlowGraph &G) {
+ OS << Print(RA.Id, G) << '<' << Print(RA.Addr->getRegRef(G), G) << '>';
+ if (RA.Addr->getFlags() & NodeAttrs::Fixed)
+ OS << '!';
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Def> &P) {
+ printRefHeader(OS, P.Obj, P.G);
+ OS << '(';
+ if (NodeId N = P.Obj.Addr->getReachingDef())
+ OS << Print(N, P.G);
+ OS << ',';
+ if (NodeId N = P.Obj.Addr->getReachedDef())
+ OS << Print(N, P.G);
+ OS << ',';
+ if (NodeId N = P.Obj.Addr->getReachedUse())
+ OS << Print(N, P.G);
+ OS << "):";
+ if (NodeId N = P.Obj.Addr->getSibling())
+ OS << Print(N, P.G);
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Use> &P) {
+ printRefHeader(OS, P.Obj, P.G);
+ OS << '(';
+ if (NodeId N = P.Obj.Addr->getReachingDef())
+ OS << Print(N, P.G);
+ OS << "):";
+ if (NodeId N = P.Obj.Addr->getSibling())
+ OS << Print(N, P.G);
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<PhiUse> &P) {
+ printRefHeader(OS, P.Obj, P.G);
+ OS << '(';
+ if (NodeId N = P.Obj.Addr->getReachingDef())
+ OS << Print(N, P.G);
+ OS << ',';
+ if (NodeId N = P.Obj.Addr->getPredecessor())
+ OS << Print(N, P.G);
+ OS << "):";
+ if (NodeId N = P.Obj.Addr->getSibling())
+ OS << Print(N, P.G);
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Ref> &P) {
+ switch (P.Obj.Addr->getKind()) {
+ case NodeAttrs::Def:
+ OS << PrintNode<DefNode *>(P.Obj, P.G);
+ break;
+ case NodeAttrs::Use:
+ if (P.Obj.Addr->getFlags() & NodeAttrs::PhiRef)
+ OS << PrintNode<PhiUseNode *>(P.Obj, P.G);
+ else
+ OS << PrintNode<UseNode *>(P.Obj, P.G);
+ break;
+ }
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<NodeList> &P) {
+ unsigned N = P.Obj.size();
+ for (auto I : P.Obj) {
+ OS << Print(I.Id, P.G);
+ if (--N)
+ OS << ' ';
+ }
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<NodeSet> &P) {
+ unsigned N = P.Obj.size();
+ for (auto I : P.Obj) {
+ OS << Print(I, P.G);
+ if (--N)
+ OS << ' ';
+ }
+ return OS;
+}
+
+namespace {
+
+template <typename T> struct PrintListV {
+ PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {}
+
+ using Type = T;
+ const NodeList &List;
+ const DataFlowGraph &G;
+};
+
+template <typename T>
+raw_ostream &operator<<(raw_ostream &OS, const PrintListV<T> &P) {
+ unsigned N = P.List.size();
+ for (NodeAddr<T> A : P.List) {
+ OS << PrintNode<T>(A, P.G);
+ if (--N)
+ OS << ", ";
+ }
+ return OS;
+}
+
+} // end anonymous namespace
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Phi> &P) {
+ OS << Print(P.Obj.Id, P.G) << ": phi ["
+ << PrintListV<RefNode *>(P.Obj.Addr->members(P.G), P.G) << ']';
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Stmt> &P) {
+ const MachineInstr &MI = *P.Obj.Addr->getCode();
+ unsigned Opc = MI.getOpcode();
+ OS << Print(P.Obj.Id, P.G) << ": " << P.G.getTII().getName(Opc);
+ // Print the target for calls and branches (for readability).
+ if (MI.isCall() || MI.isBranch()) {
+ MachineInstr::const_mop_iterator T =
+ llvm::find_if(MI.operands(), [](const MachineOperand &Op) -> bool {
+ return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
+ });
+ if (T != MI.operands_end()) {
+ OS << ' ';
+ if (T->isMBB())
+ OS << printMBBReference(*T->getMBB());
+ else if (T->isGlobal())
+ OS << T->getGlobal()->getName();
+ else if (T->isSymbol())
+ OS << T->getSymbolName();
+ }
+ }
+ OS << " [" << PrintListV<RefNode *>(P.Obj.Addr->members(P.G), P.G) << ']';
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Instr> &P) {
+ switch (P.Obj.Addr->getKind()) {
+ case NodeAttrs::Phi:
+ OS << PrintNode<PhiNode *>(P.Obj, P.G);
+ break;
+ case NodeAttrs::Stmt:
+ OS << PrintNode<StmtNode *>(P.Obj, P.G);
+ break;
+ default:
+ OS << "instr? " << Print(P.Obj.Id, P.G);
+ break;
+ }
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Block> &P) {
+ MachineBasicBlock *BB = P.Obj.Addr->getCode();
+ unsigned NP = BB->pred_size();
+ std::vector<int> Ns;
+ auto PrintBBs = [&OS](std::vector<int> Ns) -> void {
+ unsigned N = Ns.size();
+ for (int I : Ns) {
+ OS << "%bb." << I;
+ if (--N)
+ OS << ", ";
+ }
+ };
+
+ OS << Print(P.Obj.Id, P.G) << ": --- " << printMBBReference(*BB)
+ << " --- preds(" << NP << "): ";
+ for (MachineBasicBlock *B : BB->predecessors())
+ Ns.push_back(B->getNumber());
+ PrintBBs(Ns);
+
+ unsigned NS = BB->succ_size();
+ OS << " succs(" << NS << "): ";
+ Ns.clear();
+ for (MachineBasicBlock *B : BB->successors())
+ Ns.push_back(B->getNumber());
+ PrintBBs(Ns);
+ OS << '\n';
+
+ for (auto I : P.Obj.Addr->members(P.G))
+ OS << PrintNode<InstrNode *>(I, P.G) << '\n';
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Func> &P) {
+ OS << "DFG dump:[\n"
+ << Print(P.Obj.Id, P.G)
+ << ": Function: " << P.Obj.Addr->getCode()->getName() << '\n';
+ for (auto I : P.Obj.Addr->members(P.G))
+ OS << PrintNode<BlockNode *>(I, P.G) << '\n';
+ OS << "]\n";
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterSet> &P) {
+ OS << '{';
+ for (auto I : P.Obj)
+ OS << ' ' << Print(I, P.G);
+ OS << " }";
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterAggr> &P) {
+ OS << P.Obj;
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const Print<DataFlowGraph::DefStack> &P) {
+ for (auto I = P.Obj.top(), E = P.Obj.bottom(); I != E;) {
+ OS << Print(I->Id, P.G) << '<' << Print(I->Addr->getRegRef(P.G), P.G)
+ << '>';
+ I.down();
+ if (I != E)
+ OS << ' ';
+ }
+ return OS;
+}
+
+// Node allocation functions.
+//
+// Node allocator is like a slab memory allocator: it allocates blocks of
+// memory in sizes that are multiples of the size of a node. Each block has
+// the same size. Nodes are allocated from the currently active block, and
+// when it becomes full, a new one is created.
+// The mapping scheme between a node id and its block, and its location
+// within that block, is described in the header file.
+//
+void NodeAllocator::startNewBlock() {
+ void *T = MemPool.Allocate(NodesPerBlock * NodeMemSize, NodeMemSize);
+ char *P = static_cast<char *>(T);
+ Blocks.push_back(P);
+ // Check if the block index is still within the allowed range, i.e. less
+ // than 2^N, where N is the number of bits in NodeId for the block index.
+ // BitsPerIndex is the number of bits per node index.
+ assert((Blocks.size() < ((size_t)1 << (8 * sizeof(NodeId) - BitsPerIndex))) &&
+ "Out of bits for block index");
+ ActiveEnd = P;
+}
+
+bool NodeAllocator::needNewBlock() {
+ if (Blocks.empty())
+ return true;
+
+ char *ActiveBegin = Blocks.back();
+ uint32_t Index = (ActiveEnd - ActiveBegin) / NodeMemSize;
+ return Index >= NodesPerBlock;
+}
+
+Node NodeAllocator::New() {
+ if (needNewBlock())
+ startNewBlock();
+
+ uint32_t ActiveB = Blocks.size() - 1;
+ uint32_t Index = (ActiveEnd - Blocks[ActiveB]) / NodeMemSize;
+ Node NA = {reinterpret_cast<NodeBase *>(ActiveEnd), makeId(ActiveB, Index)};
+ ActiveEnd += NodeMemSize;
+ return NA;
+}
+
+NodeId NodeAllocator::id(const NodeBase *P) const {
+ uintptr_t A = reinterpret_cast<uintptr_t>(P);
+ for (unsigned i = 0, n = Blocks.size(); i != n; ++i) {
+ uintptr_t B = reinterpret_cast<uintptr_t>(Blocks[i]);
+ if (A < B || A >= B + NodesPerBlock * NodeMemSize)
+ continue;
+ uint32_t Idx = (A - B) / NodeMemSize;
+ return makeId(i, Idx);
+ }
+ llvm_unreachable("Invalid node address");
+}
+
+void NodeAllocator::clear() {
+ MemPool.Reset();
+ Blocks.clear();
+ ActiveEnd = nullptr;
+}
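+
+// Editorial note (assumption, not part of the patch): per the scheme
+// referenced above, makeId(Block, Index) packs the owning block's index into
+// the high bits of the NodeId and the node's slot within that block into the
+// low BitsPerIndex bits, with the value 0 reserved to mean "no node"; ptr()
+// and id() simply invert that packing against Blocks[].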
+
+// Insert node NA after "this" in the circular chain.
+void NodeBase::append(Node NA) {
+ NodeId Nx = Next;
+ // If NA is already "next", do nothing.
+ if (Next != NA.Id) {
+ Next = NA.Id;
+ NA.Addr->Next = Nx;
+ }
+}
+
+// Fundamental node manipulator functions.
+
+// Obtain the register reference from a reference node.
+RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const {
+ assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+ if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)
+ return G.unpack(RefData.PR);
+ assert(RefData.Op != nullptr);
+ return G.makeRegRef(*RefData.Op);
+}
+
+// Set the register reference in the reference node directly (for references
+// in phi nodes).
+void RefNode::setRegRef(RegisterRef RR, DataFlowGraph &G) {
+ assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+ assert(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef);
+ RefData.PR = G.pack(RR);
+}
+
+// Set the register reference in the reference node based on a machine
+// operand (for references in statement nodes).
+void RefNode::setRegRef(MachineOperand *Op, DataFlowGraph &G) {
+ assert(NodeAttrs::type(Attrs) == NodeAttrs::Ref);
+ assert(!(NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef));
+ (void)G;
+ RefData.Op = Op;
+}
+
+// Get the owner of a given reference node.
+Node RefNode::getOwner(const DataFlowGraph &G) {
+ Node NA = G.addr<NodeBase *>(getNext());
+
+ while (NA.Addr != this) {
+ if (NA.Addr->getType() == NodeAttrs::Code)
+ return NA;
+ NA = G.addr<NodeBase *>(NA.Addr->getNext());
+ }
+ llvm_unreachable("No owner in circular list");
+}
+
+// Connect the def node to the reaching def node.
+void DefNode::linkToDef(NodeId Self, Def DA) {
+ RefData.RD = DA.Id;
+ RefData.Sib = DA.Addr->getReachedDef();
+ DA.Addr->setReachedDef(Self);
+}
+
+// Connect the use node to the reaching def node.
+void UseNode::linkToDef(NodeId Self, Def DA) {
+ RefData.RD = DA.Id;
+ RefData.Sib = DA.Addr->getReachedUse();
+ DA.Addr->setReachedUse(Self);
+}
+
+// Get the first member of the code node.
+Node CodeNode::getFirstMember(const DataFlowGraph &G) const {
+ if (CodeData.FirstM == 0)
+ return Node();
+ return G.addr<NodeBase *>(CodeData.FirstM);
+}
+
+// Get the last member of the code node.
+Node CodeNode::getLastMember(const DataFlowGraph &G) const {
+ if (CodeData.LastM == 0)
+ return Node();
+ return G.addr<NodeBase *>(CodeData.LastM);
+}
+
+// Add node NA at the end of the member list of the given code node.
+void CodeNode::addMember(Node NA, const DataFlowGraph &G) {
+ Node ML = getLastMember(G);
+ if (ML.Id != 0) {
+ ML.Addr->append(NA);
+ } else {
+ CodeData.FirstM = NA.Id;
+ NodeId Self = G.id(this);
+ NA.Addr->setNext(Self);
+ }
+ CodeData.LastM = NA.Id;
+}
+
+// Add node NA after member node MA in the given code node.
+void CodeNode::addMemberAfter(Node MA, Node NA, const DataFlowGraph &G) {
+ MA.Addr->append(NA);
+ if (CodeData.LastM == MA.Id)
+ CodeData.LastM = NA.Id;
+}
+
+// Remove member node NA from the given code node.
+void CodeNode::removeMember(Node NA, const DataFlowGraph &G) {
+ Node MA = getFirstMember(G);
+ assert(MA.Id != 0);
+
+ // Special handling if the member to remove is the first member.
+ if (MA.Id == NA.Id) {
+ if (CodeData.LastM == MA.Id) {
+ // If it is the only member, set both first and last to 0.
+ CodeData.FirstM = CodeData.LastM = 0;
+ } else {
+ // Otherwise, advance the first member.
+ CodeData.FirstM = MA.Addr->getNext();
+ }
+ return;
+ }
+
+ while (MA.Addr != this) {
+ NodeId MX = MA.Addr->getNext();
+ if (MX == NA.Id) {
+ MA.Addr->setNext(NA.Addr->getNext());
+ // If the member to remove happens to be the last one, update the
+ // LastM indicator.
+ if (CodeData.LastM == NA.Id)
+ CodeData.LastM = MA.Id;
+ return;
+ }
+ MA = G.addr<NodeBase *>(MX);
+ }
+ llvm_unreachable("No such member");
+}
+
+// Return the list of all members of the code node.
+NodeList CodeNode::members(const DataFlowGraph &G) const {
+ static auto True = [](Node) -> bool { return true; };
+ return members_if(True, G);
+}
+
+// Return the owner of the given instr node.
+Node InstrNode::getOwner(const DataFlowGraph &G) {
+ Node NA = G.addr<NodeBase *>(getNext());
+
+ while (NA.Addr != this) {
+ assert(NA.Addr->getType() == NodeAttrs::Code);
+ if (NA.Addr->getKind() == NodeAttrs::Block)
+ return NA;
+ NA = G.addr<NodeBase *>(NA.Addr->getNext());
+ }
+ llvm_unreachable("No owner in circular list");
+}
+
+// Add the phi node PA to the given block node.
+void BlockNode::addPhi(Phi PA, const DataFlowGraph &G) {
+ Node M = getFirstMember(G);
+ if (M.Id == 0) {
+ addMember(PA, G);
+ return;
+ }
+
+ assert(M.Addr->getType() == NodeAttrs::Code);
+ if (M.Addr->getKind() == NodeAttrs::Stmt) {
+ // If the first member of the block is a statement, insert the phi as
+ // the first member.
+ CodeData.FirstM = PA.Id;
+ PA.Addr->setNext(M.Id);
+ } else {
+ // If the first member is a phi, find the last phi, and append PA to it.
+ assert(M.Addr->getKind() == NodeAttrs::Phi);
+ Node MN = M;
+ do {
+ M = MN;
+ MN = G.addr<NodeBase *>(M.Addr->getNext());
+ assert(MN.Addr->getType() == NodeAttrs::Code);
+ } while (MN.Addr->getKind() == NodeAttrs::Phi);
+
+ // M is the last phi.
+ addMemberAfter(M, PA, G);
+ }
+}
+
+// Find the block node corresponding to the machine basic block BB in the
+// given func node.
+Block FuncNode::findBlock(const MachineBasicBlock *BB,
+ const DataFlowGraph &G) const {
+ auto EqBB = [BB](Node NA) -> bool { return Block(NA).Addr->getCode() == BB; };
+ NodeList Ms = members_if(EqBB, G);
+ if (!Ms.empty())
+ return Ms[0];
+ return Block();
+}
+
+// Get the block node for the entry block in the given function.
+Block FuncNode::getEntryBlock(const DataFlowGraph &G) {
+ MachineBasicBlock *EntryB = &getCode()->front();
+ return findBlock(EntryB, G);
+}
+
+// Target operand information.
+//
+
+// For a given instruction, check if there are any bits of the defined
+// register that can remain unchanged across this def.
+bool TargetOperandInfo::isPreserving(const MachineInstr &In,
+ unsigned OpNum) const {
+ return TII.isPredicated(In);
+}
+
+// Check if the definition in operand OpNum produces an unspecified value.
+bool TargetOperandInfo::isClobbering(const MachineInstr &In,
+ unsigned OpNum) const {
+ const MachineOperand &Op = In.getOperand(OpNum);
+ if (Op.isRegMask())
+ return true;
+ assert(Op.isReg());
+ if (In.isCall())
+ if (Op.isDef() && Op.isDead())
+ return true;
+ return false;
+}
+
+// Check if the given instruction specifically requires the operand to use
+// a fixed register.
+bool TargetOperandInfo::isFixedReg(const MachineInstr &In,
+ unsigned OpNum) const {
+ if (In.isCall() || In.isReturn() || In.isInlineAsm())
+ return true;
+ // Check for a tail call.
+ if (In.isBranch())
+ for (const MachineOperand &O : In.operands())
+ if (O.isGlobal() || O.isSymbol())
+ return true;
+
+ const MCInstrDesc &D = In.getDesc();
+ if (D.implicit_defs().empty() && D.implicit_uses().empty())
+ return false;
+ const MachineOperand &Op = In.getOperand(OpNum);
+ // If there is a sub-register, treat the operand as non-fixed. Currently,
+ // fixed registers are those that are listed in the descriptor as implicit
+ // uses or defs, and those lists do not allow sub-registers.
+ if (Op.getSubReg() != 0)
+ return false;
+ Register Reg = Op.getReg();
+ ArrayRef<MCPhysReg> ImpOps =
+ Op.isDef() ? D.implicit_defs() : D.implicit_uses();
+ return is_contained(ImpOps, Reg);
+}
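+
+// Editorial note (not part of the patch): with the logic above, every
+// register operand of a call, return, inline-asm, or tail-call-like branch is
+// treated as fixed; for other instructions an operand is fixed only if it has
+// no sub-register index and its register appears in the descriptor's implicit
+// defs (for defs) or implicit uses (for uses).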
+
+//
+// The data flow graph construction.
+//
+
+DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
+ const TargetRegisterInfo &tri,
+ const MachineDominatorTree &mdt,
+ const MachineDominanceFrontier &mdf)
+ : DefaultTOI(std::make_unique<TargetOperandInfo>(tii)), MF(mf), TII(tii),
+ TRI(tri), PRI(tri, mf), MDT(mdt), MDF(mdf), TOI(*DefaultTOI),
+ LiveIns(PRI) {}
+
+DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
+ const TargetRegisterInfo &tri,
+ const MachineDominatorTree &mdt,
+ const MachineDominanceFrontier &mdf,
+ const TargetOperandInfo &toi)
+ : MF(mf), TII(tii), TRI(tri), PRI(tri, mf), MDT(mdt), MDF(mdf), TOI(toi),
+ LiveIns(PRI) {}
+
+// The implementation of the definition stack.
+// Each register reference has its own definition stack. In particular,
+// the register references "Reg" and "Reg:subreg" will each have their
+// own definition stacks.
+
+// Construct a stack iterator.
+DataFlowGraph::DefStack::Iterator::Iterator(const DataFlowGraph::DefStack &S,
+ bool Top)
+ : DS(S) {
+ if (!Top) {
+ // Initialize to bottom.
+ Pos = 0;
+ return;
+ }
+ // Initialize to the top, i.e. top-most non-delimiter (or 0, if empty).
+ Pos = DS.Stack.size();
+ while (Pos > 0 && DS.isDelimiter(DS.Stack[Pos - 1]))
+ Pos--;
+}
+
+// Return the size of the stack, including block delimiters.
+unsigned DataFlowGraph::DefStack::size() const {
+ unsigned S = 0;
+ for (auto I = top(), E = bottom(); I != E; I.down())
+ S++;
+ return S;
+}
+
+// Remove the top entry from the stack. Remove all intervening delimiters
+// so that after this, the stack is either empty, or the top of the stack
+// is a non-delimiter.
+void DataFlowGraph::DefStack::pop() {
+ assert(!empty());
+ unsigned P = nextDown(Stack.size());
+ Stack.resize(P);
+}
+
+// Push a delimiter for block node N on the stack.
+void DataFlowGraph::DefStack::start_block(NodeId N) {
+ assert(N != 0);
+ Stack.push_back(Def(nullptr, N));
+}
+
+// Remove all nodes from the top of the stack, until the delimiter for
+// block node N is encountered. Remove the delimiter as well. In effect,
+// this will remove from the stack all definitions from block N.
+void DataFlowGraph::DefStack::clear_block(NodeId N) {
+ assert(N != 0);
+ unsigned P = Stack.size();
+ while (P > 0) {
+ bool Found = isDelimiter(Stack[P - 1], N);
+ P--;
+ if (Found)
+ break;
+ }
+ // This will also remove the delimiter, if found.
+ Stack.resize(P);
+}
+
+// Move the stack iterator up by one.
+unsigned DataFlowGraph::DefStack::nextUp(unsigned P) const {
+ // Get the next valid position after P (skipping all delimiters).
+ // The input position P does not have to point to a non-delimiter.
+ unsigned SS = Stack.size();
+ bool IsDelim;
+ assert(P < SS);
+ do {
+ P++;
+ IsDelim = isDelimiter(Stack[P - 1]);
+ } while (P < SS && IsDelim);
+ assert(!IsDelim);
+ return P;
+}
+
+// Move the stack iterator down by one.
+unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
+ // Get the preceding valid position before P (skipping all delimiters).
+ // The input position P does not have to point to a non-delimiter.
+ assert(P > 0 && P <= Stack.size());
+ bool IsDelim = isDelimiter(Stack[P - 1]);
+ do {
+ if (--P == 0)
+ break;
+ IsDelim = isDelimiter(Stack[P - 1]);
+ } while (P > 0 && IsDelim);
+ assert(!IsDelim);
+ return P;
+}
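+
+// Editorial sketch (illustration, not part of the patch): the delimiter
+// mechanism can be pictured as follows, where #Bn is the delimiter pushed by
+// start_block(Bn) and Dn are definition nodes:
+//
+//   start_block(B1); push(D1); push(D2);   // stack: #B1 D1 D2
+//   start_block(B2); push(D3);             // stack: #B1 D1 D2 #B2 D3
+//   clear_block(B2);                       // stack: #B1 D1 D2
+//
+// The iterators returned by top() and bottom() skip delimiters via nextUp()
+// and nextDown(), so traversal only ever visits real definitions.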
+
+// Register information.
+
+RegisterAggr DataFlowGraph::getLandingPadLiveIns() const {
+ RegisterAggr LR(getPRI());
+ const Function &F = MF.getFunction();
+ const Constant *PF = F.hasPersonalityFn() ? F.getPersonalityFn() : nullptr;
+ const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+ if (RegisterId R = TLI.getExceptionPointerRegister(PF))
+ LR.insert(RegisterRef(R));
+ if (!isFuncletEHPersonality(classifyEHPersonality(PF))) {
+ if (RegisterId R = TLI.getExceptionSelectorRegister(PF))
+ LR.insert(RegisterRef(R));
+ }
+ return LR;
+}
+
+// Node management functions.
+
+// Get the pointer to the node with the id N.
+NodeBase *DataFlowGraph::ptr(NodeId N) const {
+ if (N == 0)
+ return nullptr;
+ return Memory.ptr(N);
+}
+
+// Get the id of the node at the address P.
+NodeId DataFlowGraph::id(const NodeBase *P) const {
+ if (P == nullptr)
+ return 0;
+ return Memory.id(P);
+}
+
+// Allocate a new node and set the attributes to Attrs.
+Node DataFlowGraph::newNode(uint16_t Attrs) {
+ Node P = Memory.New();
+ P.Addr->init();
+ P.Addr->setAttrs(Attrs);
+ return P;
+}
+
+// Make a copy of the given node B, except for the data-flow links, which
+// are set to 0.
+Node DataFlowGraph::cloneNode(const Node B) {
+ Node NA = newNode(0);
+ memcpy(NA.Addr, B.Addr, sizeof(NodeBase));
+ // Ref nodes need to have the data-flow links reset.
+ if (NA.Addr->getType() == NodeAttrs::Ref) {
+ Ref RA = NA;
+ RA.Addr->setReachingDef(0);
+ RA.Addr->setSibling(0);
+ if (NA.Addr->getKind() == NodeAttrs::Def) {
+ Def DA = NA;
+ DA.Addr->setReachedDef(0);
+ DA.Addr->setReachedUse(0);
+ }
+ }
+ return NA;
+}
+
+// Allocation routines for specific node types/kinds.
+
+Use DataFlowGraph::newUse(Instr Owner, MachineOperand &Op, uint16_t Flags) {
+ Use UA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags);
+ UA.Addr->setRegRef(&Op, *this);
+ return UA;
+}
+
+PhiUse DataFlowGraph::newPhiUse(Phi Owner, RegisterRef RR, Block PredB,
+ uint16_t Flags) {
+ PhiUse PUA = newNode(NodeAttrs::Ref | NodeAttrs::Use | Flags);
+ assert(Flags & NodeAttrs::PhiRef);
+ PUA.Addr->setRegRef(RR, *this);
+ PUA.Addr->setPredecessor(PredB.Id);
+ return PUA;
+}
+
+Def DataFlowGraph::newDef(Instr Owner, MachineOperand &Op, uint16_t Flags) {
+ Def DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags);
+ DA.Addr->setRegRef(&Op, *this);
+ return DA;
+}
+
+Def DataFlowGraph::newDef(Instr Owner, RegisterRef RR, uint16_t Flags) {
+ Def DA = newNode(NodeAttrs::Ref | NodeAttrs::Def | Flags);
+ assert(Flags & NodeAttrs::PhiRef);
+ DA.Addr->setRegRef(RR, *this);
+ return DA;
+}
+
+Phi DataFlowGraph::newPhi(Block Owner) {
+ Phi PA = newNode(NodeAttrs::Code | NodeAttrs::Phi);
+ Owner.Addr->addPhi(PA, *this);
+ return PA;
+}
+
+Stmt DataFlowGraph::newStmt(Block Owner, MachineInstr *MI) {
+ Stmt SA = newNode(NodeAttrs::Code | NodeAttrs::Stmt);
+ SA.Addr->setCode(MI);
+ Owner.Addr->addMember(SA, *this);
+ return SA;
+}
+
+Block DataFlowGraph::newBlock(Func Owner, MachineBasicBlock *BB) {
+ Block BA = newNode(NodeAttrs::Code | NodeAttrs::Block);
+ BA.Addr->setCode(BB);
+ Owner.Addr->addMember(BA, *this);
+ return BA;
+}
+
+Func DataFlowGraph::newFunc(MachineFunction *MF) {
+ Func FA = newNode(NodeAttrs::Code | NodeAttrs::Func);
+ FA.Addr->setCode(MF);
+ return FA;
+}
+
+// Build the data flow graph.
+void DataFlowGraph::build(const Config &config) {
+ reset();
+ BuildCfg = config;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ ReservedRegs = MRI.getReservedRegs();
+ bool SkipReserved = BuildCfg.Options & BuildOptions::OmitReserved;
+
+ auto Insert = [](auto &Set, auto &&Range) {
+ Set.insert(Range.begin(), Range.end());
+ };
+
+ if (BuildCfg.TrackRegs.empty()) {
+ std::set<RegisterId> BaseSet;
+ if (BuildCfg.Classes.empty()) {
+ // Insert every register.
+ for (unsigned R = 0, E = getPRI().getTRI().getNumRegs(); R != E; ++R)
+ BaseSet.insert(R);
+ } else {
+ for (const TargetRegisterClass *RC : BuildCfg.Classes) {
+ for (MCPhysReg R : *RC)
+ BaseSet.insert(R);
+ }
+ }
+ for (RegisterId R : BaseSet) {
+ if (SkipReserved && ReservedRegs[R])
+ continue;
+ Insert(TrackedUnits, getPRI().getUnits(RegisterRef(R)));
+ }
+ } else {
+ // Track set in Config overrides everything.
+ for (unsigned R : BuildCfg.TrackRegs) {
+ if (SkipReserved && ReservedRegs[R])
+ continue;
+ Insert(TrackedUnits, getPRI().getUnits(RegisterRef(R)));
+ }
+ }
+
+ TheFunc = newFunc(&MF);
+
+ if (MF.empty())
+ return;
+
+ for (MachineBasicBlock &B : MF) {
+ Block BA = newBlock(TheFunc, &B);
+ BlockNodes.insert(std::make_pair(&B, BA));
+ for (MachineInstr &I : B) {
+ if (I.isDebugInstr())
+ continue;
+ buildStmt(BA, I);
+ }
+ }
+
+ Block EA = TheFunc.Addr->getEntryBlock(*this);
+ NodeList Blocks = TheFunc.Addr->members(*this);
+
+ // Collect function live-ins and entry block live-ins.
+ MachineBasicBlock &EntryB = *EA.Addr->getCode();
+ assert(EntryB.pred_empty() && "Function entry block has predecessors");
+ for (std::pair<unsigned, unsigned> P : MRI.liveins())
+ LiveIns.insert(RegisterRef(P.first));
+ if (MRI.tracksLiveness()) {
+ for (auto I : EntryB.liveins())
+ LiveIns.insert(RegisterRef(I.PhysReg, I.LaneMask));
+ }
+
+ // Add function-entry phi nodes for the live-in registers.
+ for (RegisterRef RR : LiveIns.refs()) {
+ if (RR.isReg() && !isTracked(RR)) // isReg is likely guaranteed
+ continue;
+ Phi PA = newPhi(EA);
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+ Def DA = newDef(PA, RR, PhiFlags);
+ PA.Addr->addMember(DA, *this);
+ }
+
+ // Add phis for landing pads.
+  // Landing pads, unlike usual basic blocks, are not entered through
+  // branches in the program, or fall-throughs from other blocks. They
+  // are entered from the exception handling runtime, and the target's ABI
+  // may define certain registers as defined on entry to such a block.
+ RegisterAggr EHRegs = getLandingPadLiveIns();
+ if (!EHRegs.empty()) {
+ for (Block BA : Blocks) {
+ const MachineBasicBlock &B = *BA.Addr->getCode();
+ if (!B.isEHPad())
+ continue;
+
+ // Prepare a list of NodeIds of the block's predecessors.
+ NodeList Preds;
+ for (MachineBasicBlock *PB : B.predecessors())
+ Preds.push_back(findBlock(PB));
+
+ // Build phi nodes for each live-in.
+ for (RegisterRef RR : EHRegs.refs()) {
+ if (RR.isReg() && !isTracked(RR))
+ continue;
+ Phi PA = newPhi(BA);
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+ // Add def:
+ Def DA = newDef(PA, RR, PhiFlags);
+ PA.Addr->addMember(DA, *this);
+ // Add uses (no reaching defs for phi uses):
+ for (Block PBA : Preds) {
+ PhiUse PUA = newPhiUse(PA, RR, PBA);
+ PA.Addr->addMember(PUA, *this);
+ }
+ }
+ }
+ }
+
+ // Build a map "PhiM" which will contain, for each block, the set
+ // of references that will require phi definitions in that block.
+ BlockRefsMap PhiM(getPRI());
+ for (Block BA : Blocks)
+ recordDefsForDF(PhiM, BA);
+ for (Block BA : Blocks)
+ buildPhis(PhiM, BA);
+
+ // Link all the refs. This will recursively traverse the dominator tree.
+ DefStackMap DM;
+ linkBlockRefs(DM, EA);
+
+ // Finally, remove all unused phi nodes.
+ if (!(BuildCfg.Options & BuildOptions::KeepDeadPhis))
+ removeUnusedPhis();
+}
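+
+// Editorial usage sketch (assumption, not part of the patch; the exact Config
+// spelling is declared in RDFGraph.h): a client constructs the graph from the
+// function's analyses and then builds it. Per the logic above, an empty track
+// set means every register is tracked.
+//
+//   rdf::DataFlowGraph G(MF, TII, TRI, MDT, MDF);
+//   G.build({});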
+
+RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const {
+ assert(RegisterRef::isRegId(Reg) || RegisterRef::isMaskId(Reg));
+ assert(Reg != 0);
+ if (Sub != 0)
+ Reg = TRI.getSubReg(Reg, Sub);
+ return RegisterRef(Reg);
+}
+
+RegisterRef DataFlowGraph::makeRegRef(const MachineOperand &Op) const {
+ assert(Op.isReg() || Op.isRegMask());
+ if (Op.isReg())
+ return makeRegRef(Op.getReg(), Op.getSubReg());
+ return RegisterRef(getPRI().getRegMaskId(Op.getRegMask()),
+ LaneBitmask::getAll());
+}
+
+// For each stack in the map DefM, push the delimiter for block B on it.
+void DataFlowGraph::markBlock(NodeId B, DefStackMap &DefM) {
+ // Push block delimiters.
+ for (auto &P : DefM)
+ P.second.start_block(B);
+}
+
+// Remove all definitions coming from block B from each stack in DefM.
+void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) {
+ // Pop all defs from this block from the definition stack. Defs that were
+ // added to the map during the traversal of instructions will not have a
+ // delimiter, but for those, the whole stack will be emptied.
+ for (auto &P : DefM)
+ P.second.clear_block(B);
+
+ // Finally, remove empty stacks from the map.
+ for (auto I = DefM.begin(), E = DefM.end(), NextI = I; I != E; I = NextI) {
+ NextI = std::next(I);
+ // This preserves the validity of iterators other than I.
+ if (I->second.empty())
+ DefM.erase(I);
+ }
+}
+
+// Push all definitions from the instruction node IA to an appropriate
+// stack in DefM.
+void DataFlowGraph::pushAllDefs(Instr IA, DefStackMap &DefM) {
+ pushClobbers(IA, DefM);
+ pushDefs(IA, DefM);
+}
+
+// Push all definitions from the instruction node IA to an appropriate
+// stack in DefM.
+void DataFlowGraph::pushClobbers(Instr IA, DefStackMap &DefM) {
+ NodeSet Visited;
+ std::set<RegisterId> Defined;
+
+ // The important objectives of this function are:
+ // - to be able to handle instructions both while the graph is being
+ // constructed, and after the graph has been constructed, and
+ // - maintain proper ordering of definitions on the stack for each
+ // register reference:
+ // - if there are two or more related defs in IA (i.e. coming from
+ // the same machine operand), then only push one def on the stack,
+ // - if there are multiple unrelated defs of non-overlapping
+ // subregisters of S, then the stack for S will have both (in an
+  //     unspecified order), but the order does not matter from the
+  //     data-flow perspective.
+
+ for (Def DA : IA.Addr->members_if(IsDef, *this)) {
+ if (Visited.count(DA.Id))
+ continue;
+ if (!(DA.Addr->getFlags() & NodeAttrs::Clobbering))
+ continue;
+
+ NodeList Rel = getRelatedRefs(IA, DA);
+ Def PDA = Rel.front();
+ RegisterRef RR = PDA.Addr->getRegRef(*this);
+
+ // Push the definition on the stack for the register and all aliases.
+ // The def stack traversal in linkNodeUp will check the exact aliasing.
+ DefM[RR.Reg].push(DA);
+ Defined.insert(RR.Reg);
+ for (RegisterId A : getPRI().getAliasSet(RR.Reg)) {
+ if (RegisterRef::isRegId(A) && !isTracked(RegisterRef(A)))
+ continue;
+ // Check that we don't push the same def twice.
+ assert(A != RR.Reg);
+ if (!Defined.count(A))
+ DefM[A].push(DA);
+ }
+ // Mark all the related defs as visited.
+ for (Node T : Rel)
+ Visited.insert(T.Id);
+ }
+}
+
+// Push all definitions from the instruction node IA to an appropriate
+// stack in DefM.
+void DataFlowGraph::pushDefs(Instr IA, DefStackMap &DefM) {
+ NodeSet Visited;
+#ifndef NDEBUG
+ std::set<RegisterId> Defined;
+#endif
+
+ // The important objectives of this function are:
+ // - to be able to handle instructions both while the graph is being
+ // constructed, and after the graph has been constructed, and
+ // - maintain proper ordering of definitions on the stack for each
+ // register reference:
+ // - if there are two or more related defs in IA (i.e. coming from
+ // the same machine operand), then only push one def on the stack,
+ // - if there are multiple unrelated defs of non-overlapping
+ // subregisters of S, then the stack for S will have both (in an
+  //     unspecified order), but the order does not matter from the
+  //     data-flow perspective.
+
+ for (Def DA : IA.Addr->members_if(IsDef, *this)) {
+ if (Visited.count(DA.Id))
+ continue;
+ if (DA.Addr->getFlags() & NodeAttrs::Clobbering)
+ continue;
+
+ NodeList Rel = getRelatedRefs(IA, DA);
+ Def PDA = Rel.front();
+ RegisterRef RR = PDA.Addr->getRegRef(*this);
+#ifndef NDEBUG
+ // Assert if the register is defined in two or more unrelated defs.
+ // This could happen if there are two or more def operands defining it.
+ if (!Defined.insert(RR.Reg).second) {
+ MachineInstr *MI = Stmt(IA).Addr->getCode();
+ dbgs() << "Multiple definitions of register: " << Print(RR, *this)
+ << " in\n " << *MI << "in " << printMBBReference(*MI->getParent())
+ << '\n';
+ llvm_unreachable(nullptr);
+ }
+#endif
+ // Push the definition on the stack for the register and all aliases.
+ // The def stack traversal in linkNodeUp will check the exact aliasing.
+ DefM[RR.Reg].push(DA);
+ for (RegisterId A : getPRI().getAliasSet(RR.Reg)) {
+ if (RegisterRef::isRegId(A) && !isTracked(RegisterRef(A)))
+ continue;
+ // Check that we don't push the same def twice.
+ assert(A != RR.Reg);
+ DefM[A].push(DA);
+ }
+ // Mark all the related defs as visited.
+ for (Node T : Rel)
+ Visited.insert(T.Id);
+ }
+}
+
+// Return the list of all reference nodes related to RA, including RA itself.
+// See "getNextRelated" for the meaning of a "related reference".
+NodeList DataFlowGraph::getRelatedRefs(Instr IA, Ref RA) const {
+ assert(IA.Id != 0 && RA.Id != 0);
+
+ NodeList Refs;
+ NodeId Start = RA.Id;
+ do {
+ Refs.push_back(RA);
+ RA = getNextRelated(IA, RA);
+ } while (RA.Id != 0 && RA.Id != Start);
+ return Refs;
+}
+
+// Clear all information in the graph.
+void DataFlowGraph::reset() {
+ Memory.clear();
+ BlockNodes.clear();
+ TrackedUnits.clear();
+ ReservedRegs.clear();
+ TheFunc = Func();
+}
+
+// Return the next reference node in the instruction node IA that is related
+// to RA. Conceptually, two reference nodes are related if they refer to the
+// same instance of a register access, but differ in flags or other minor
+// characteristics. Specific examples of related nodes are shadow reference
+// nodes.
+// Return the equivalent of nullptr if there are no more related references.
+Ref DataFlowGraph::getNextRelated(Instr IA, Ref RA) const {
+ assert(IA.Id != 0 && RA.Id != 0);
+
+ auto IsRelated = [this, RA](Ref TA) -> bool {
+ if (TA.Addr->getKind() != RA.Addr->getKind())
+ return false;
+ if (!getPRI().equal_to(TA.Addr->getRegRef(*this),
+ RA.Addr->getRegRef(*this))) {
+ return false;
+ }
+ return true;
+ };
+
+ RegisterRef RR = RA.Addr->getRegRef(*this);
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) {
+ auto Cond = [&IsRelated, RA](Ref TA) -> bool {
+ return IsRelated(TA) && &RA.Addr->getOp() == &TA.Addr->getOp();
+ };
+ return RA.Addr->getNextRef(RR, Cond, true, *this);
+ }
+
+ assert(IA.Addr->getKind() == NodeAttrs::Phi);
+ auto Cond = [&IsRelated, RA](Ref TA) -> bool {
+ if (!IsRelated(TA))
+ return false;
+ if (TA.Addr->getKind() != NodeAttrs::Use)
+ return true;
+ // For phi uses, compare predecessor blocks.
+ return PhiUse(TA).Addr->getPredecessor() ==
+ PhiUse(RA).Addr->getPredecessor();
+ };
+ return RA.Addr->getNextRef(RR, Cond, true, *this);
+}
+
+// Find the next node related to RA in IA that satisfies condition P.
+// If such a node was found, return a pair where the second element is the
+// located node. If such a node does not exist, return a pair where the
+// first element is the element after which such a node should be inserted,
+// and the second element is a null-address.
+template <typename Predicate>
+std::pair<Ref, Ref> DataFlowGraph::locateNextRef(Instr IA, Ref RA,
+ Predicate P) const {
+ assert(IA.Id != 0 && RA.Id != 0);
+
+ Ref NA;
+ NodeId Start = RA.Id;
+ while (true) {
+ NA = getNextRelated(IA, RA);
+ if (NA.Id == 0 || NA.Id == Start)
+ break;
+ if (P(NA))
+ break;
+ RA = NA;
+ }
+
+ if (NA.Id != 0 && NA.Id != Start)
+ return std::make_pair(RA, NA);
+ return std::make_pair(RA, Ref());
+}
+
+// Get the next shadow node in IA corresponding to RA, and optionally create
+// such a node if it does not exist.
+Ref DataFlowGraph::getNextShadow(Instr IA, Ref RA, bool Create) {
+ assert(IA.Id != 0 && RA.Id != 0);
+
+ uint16_t Flags = RA.Addr->getFlags() | NodeAttrs::Shadow;
+ auto IsShadow = [Flags](Ref TA) -> bool {
+ return TA.Addr->getFlags() == Flags;
+ };
+ auto Loc = locateNextRef(IA, RA, IsShadow);
+ if (Loc.second.Id != 0 || !Create)
+ return Loc.second;
+
+  // Create a copy of RA and mark it as a shadow.
+ Ref NA = cloneNode(RA);
+ NA.Addr->setFlags(Flags | NodeAttrs::Shadow);
+ IA.Addr->addMemberAfter(Loc.first, NA, *this);
+ return NA;
+}
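+
+// Editorial note (not part of the patch): a "shadow" reference is an extra
+// copy of a def or use added so that a single machine operand can be linked
+// to more than one reaching def when different parts of the register are
+// covered by different defs; getNextShadow finds such a copy and, if
+// requested, creates it.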
+
+// Create a new statement node in the block node BA that corresponds to
+// the machine instruction MI.
+void DataFlowGraph::buildStmt(Block BA, MachineInstr &In) {
+ Stmt SA = newStmt(BA, &In);
+
+ auto isCall = [](const MachineInstr &In) -> bool {
+ if (In.isCall())
+ return true;
+ // Is tail call?
+ if (In.isBranch()) {
+ for (const MachineOperand &Op : In.operands())
+ if (Op.isGlobal() || Op.isSymbol())
+ return true;
+ // Assume indirect branches are calls. This is for the purpose of
+ // keeping implicit operands, and so it won't hurt on intra-function
+ // indirect branches.
+ if (In.isIndirectBranch())
+ return true;
+ }
+ return false;
+ };
+
+ auto isDefUndef = [this](const MachineInstr &In, RegisterRef DR) -> bool {
+ // This instruction defines DR. Check if there is a use operand that
+ // would make DR live on entry to the instruction.
+ for (const MachineOperand &Op : In.all_uses()) {
+ if (Op.getReg() == 0 || Op.isUndef())
+ continue;
+ RegisterRef UR = makeRegRef(Op);
+ if (getPRI().alias(DR, UR))
+ return false;
+ }
+ return true;
+ };
+
+ bool IsCall = isCall(In);
+ unsigned NumOps = In.getNumOperands();
+
+ // Avoid duplicate implicit defs. This will not detect cases of implicit
+ // defs that define registers that overlap, but it is not clear how to
+ // interpret that in the absence of explicit defs. Overlapping explicit
+ // defs are likely illegal already.
+ BitVector DoneDefs(TRI.getNumRegs());
+ // Process explicit defs first.
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
+ continue;
+ Register R = Op.getReg();
+ if (!R || !R.isPhysical() || !isTracked(RegisterRef(R)))
+ continue;
+ uint16_t Flags = NodeAttrs::None;
+ if (TOI.isPreserving(In, OpN)) {
+ Flags |= NodeAttrs::Preserving;
+ // If the def is preserving, check if it is also undefined.
+ if (isDefUndef(In, makeRegRef(Op)))
+ Flags |= NodeAttrs::Undef;
+ }
+ if (TOI.isClobbering(In, OpN))
+ Flags |= NodeAttrs::Clobbering;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ if (IsCall && Op.isDead())
+ Flags |= NodeAttrs::Dead;
+ Def DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+ assert(!DoneDefs.test(R));
+ DoneDefs.set(R);
+ }
+
+ // Process reg-masks (as clobbers).
+ BitVector DoneClobbers(TRI.getNumRegs());
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isRegMask())
+ continue;
+ uint16_t Flags = NodeAttrs::Clobbering | NodeAttrs::Fixed | NodeAttrs::Dead;
+ Def DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+    // Record all clobbered registers in DoneClobbers.
+ const uint32_t *RM = Op.getRegMask();
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
+ if (!isTracked(RegisterRef(i)))
+ continue;
+ if (!(RM[i / 32] & (1u << (i % 32))))
+ DoneClobbers.set(i);
+ }
+ }
+
+ // Process implicit defs, skipping those that have already been added
+ // as explicit.
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
+ continue;
+ Register R = Op.getReg();
+ if (!R || !R.isPhysical() || !isTracked(RegisterRef(R)) || DoneDefs.test(R))
+ continue;
+ RegisterRef RR = makeRegRef(Op);
+ uint16_t Flags = NodeAttrs::None;
+ if (TOI.isPreserving(In, OpN)) {
+ Flags |= NodeAttrs::Preserving;
+ // If the def is preserving, check if it is also undefined.
+ if (isDefUndef(In, RR))
+ Flags |= NodeAttrs::Undef;
+ }
+ if (TOI.isClobbering(In, OpN))
+ Flags |= NodeAttrs::Clobbering;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ if (IsCall && Op.isDead()) {
+ if (DoneClobbers.test(R))
+ continue;
+ Flags |= NodeAttrs::Dead;
+ }
+ Def DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+ DoneDefs.set(R);
+ }
+
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ Register R = Op.getReg();
+ if (!R || !R.isPhysical() || !isTracked(RegisterRef(R)))
+ continue;
+ uint16_t Flags = NodeAttrs::None;
+ if (Op.isUndef())
+ Flags |= NodeAttrs::Undef;
+ if (TOI.isFixedReg(In, OpN))
+ Flags |= NodeAttrs::Fixed;
+ Use UA = newUse(SA, Op, Flags);
+ SA.Addr->addMember(UA, *this);
+ }
+}
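+
+// Editorial note (not part of the patch): buildStmt processes operands in a
+// fixed order -- explicit defs, reg-mask clobbers, implicit defs not already
+// covered, then uses -- so that DoneDefs can suppress duplicate defs of the
+// same physical register and DoneClobbers can skip dead implicit defs of
+// registers already clobbered by a register mask.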
+
+// Scan all defs in the block node BA and record in PhiM the locations of
+// phi nodes corresponding to these defs.
+void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM, Block BA) {
+ // Check all defs from block BA and record them in each block in BA's
+ // iterated dominance frontier. This information will later be used to
+ // create phi nodes.
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ assert(BB);
+ auto DFLoc = MDF.find(BB);
+ if (DFLoc == MDF.end() || DFLoc->second.empty())
+ return;
+
+ // Traverse all instructions in the block and collect the set of all
+ // defined references. For each reference there will be a phi created
+ // in the block's iterated dominance frontier.
+ // This is done to make sure that each defined reference gets only one
+ // phi node, even if it is defined multiple times.
+ RegisterAggr Defs(getPRI());
+ for (Instr IA : BA.Addr->members(*this)) {
+ for (Ref RA : IA.Addr->members_if(IsDef, *this)) {
+ RegisterRef RR = RA.Addr->getRegRef(*this);
+ if (RR.isReg() && isTracked(RR))
+ Defs.insert(RR);
+ }
+ }
+
+ // Calculate the iterated dominance frontier of BB.
+ const MachineDominanceFrontier::DomSetType &DF = DFLoc->second;
+ SetVector<MachineBasicBlock *> IDF(DF.begin(), DF.end());
+ for (unsigned i = 0; i < IDF.size(); ++i) {
+ auto F = MDF.find(IDF[i]);
+ if (F != MDF.end())
+ IDF.insert(F->second.begin(), F->second.end());
+ }
+
+ // Finally, add the set of defs to each block in the iterated dominance
+ // frontier.
+ for (auto *DB : IDF) {
+ Block DBA = findBlock(DB);
+ PhiM[DBA.Id].insert(Defs);
+ }
+}
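+
+// Note on the loop above: it computes IDF(BB) as the least fixed point of
+// X = DF(BB) u DF(X). Indexing the SetVector by position (rather than with
+// iterators) is what makes this work: blocks inserted during the traversal
+// are appended at the end and are themselves processed in a later iteration.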
+
+// Given the locations of phi nodes in the map PhiM, create the phi nodes
+// that are located in the block node BA.
+void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, Block BA) {
+ // Check if this block has any DF defs, i.e. if there are any defs
+ // that this block is in the iterated dominance frontier of.
+ auto HasDF = PhiM.find(BA.Id);
+ if (HasDF == PhiM.end() || HasDF->second.empty())
+ return;
+
+ // Prepare a list of NodeIds of the block's predecessors.
+ NodeList Preds;
+ const MachineBasicBlock *MBB = BA.Addr->getCode();
+ for (MachineBasicBlock *PB : MBB->predecessors())
+ Preds.push_back(findBlock(PB));
+
+ const RegisterAggr &Defs = PhiM[BA.Id];
+ uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
+
+ for (RegisterRef RR : Defs.refs()) {
+ Phi PA = newPhi(BA);
+ PA.Addr->addMember(newDef(PA, RR, PhiFlags), *this);
+
+ // Add phi uses.
+ for (Block PBA : Preds) {
+ PA.Addr->addMember(newPhiUse(PA, RR, PBA), *this);
+ }
+ }
+}
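+
+// Schematically, a phi created above for a register R in a block with two
+// predecessors P1 and P2 has the form
+//
+//   phi: def(R), use(R, pred=P1), use(R, pred=P2)
+//
+// i.e. a single def of R plus one phi use per predecessor; the phi uses are
+// connected to their reaching defs later, in linkBlockRefs.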
+
+// Remove any unneeded phi nodes that were created during the build process.
+void DataFlowGraph::removeUnusedPhis() {
+ // This will remove unused phis, i.e. phis where each def does not reach
+ // any uses or other defs. This will not detect or remove circular phi
+ // chains that are otherwise dead. Unused/dead phis are created during
+ // the build process, and this function is intended to remove the cases
+ // that are easy to determine to be unnecessary.
+
+ SetVector<NodeId> PhiQ;
+ for (Block BA : TheFunc.Addr->members(*this)) {
+ for (auto P : BA.Addr->members_if(IsPhi, *this))
+ PhiQ.insert(P.Id);
+ }
+
+ static auto HasUsedDef = [](NodeList &Ms) -> bool {
+ for (Node M : Ms) {
+ if (M.Addr->getKind() != NodeAttrs::Def)
+ continue;
+ Def DA = M;
+ if (DA.Addr->getReachedDef() != 0 || DA.Addr->getReachedUse() != 0)
+ return true;
+ }
+ return false;
+ };
+
+ // Any phi, if it is removed, may affect other phis (make them dead).
+ // For each removed phi, collect the potentially affected phis and add
+ // them back to the queue.
+ while (!PhiQ.empty()) {
+ auto PA = addr<PhiNode *>(PhiQ[0]);
+ PhiQ.remove(PA.Id);
+ NodeList Refs = PA.Addr->members(*this);
+ if (HasUsedDef(Refs))
+ continue;
+ for (Ref RA : Refs) {
+ if (NodeId RD = RA.Addr->getReachingDef()) {
+ auto RDA = addr<DefNode *>(RD);
+ Instr OA = RDA.Addr->getOwner(*this);
+ if (IsPhi(OA))
+ PhiQ.insert(OA.Id);
+ }
+ if (RA.Addr->isDef())
+ unlinkDef(RA, true);
+ else
+ unlinkUse(RA, true);
+ }
+ Block BA = PA.Addr->getOwner(*this);
+ BA.Addr->removeMember(PA, *this);
+ }
+}
+
+// For a given reference node TA in an instruction node IA, connect the
+// reaching def of TA to the appropriate def node. Create any shadow nodes
+// as appropriate.
+template <typename T>
+void DataFlowGraph::linkRefUp(Instr IA, NodeAddr<T> TA, DefStack &DS) {
+ if (DS.empty())
+ return;
+ RegisterRef RR = TA.Addr->getRegRef(*this);
+ NodeAddr<T> TAP;
+
+ // References from the def stack that have been examined so far.
+ RegisterAggr Defs(getPRI());
+
+ for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) {
+ RegisterRef QR = I->Addr->getRegRef(*this);
+
+ // Skip all defs that are aliased to any of the defs that we have already
+ // seen. If this completes a cover of RR, stop the stack traversal.
+ bool Alias = Defs.hasAliasOf(QR);
+ bool Cover = Defs.insert(QR).hasCoverOf(RR);
+ if (Alias) {
+ if (Cover)
+ break;
+ continue;
+ }
+
+ // The reaching def.
+ Def RDA = *I;
+
+ // Pick the reached node.
+ if (TAP.Id == 0) {
+ TAP = TA;
+ } else {
+ // Mark the existing ref as "shadow" and create a new shadow.
+ TAP.Addr->setFlags(TAP.Addr->getFlags() | NodeAttrs::Shadow);
+ TAP = getNextShadow(IA, TAP, true);
+ }
+
+ // Create the link.
+ TAP.Addr->linkToDef(TAP.Id, RDA);
+
+ if (Cover)
+ break;
+ }
+}
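+
+// Illustrative example of the shadow handling above: assume a use of a
+// register pair R1:0 whose halves are defined by separate instructions,
+//
+//   d1: def(R0)
+//   d2: def(R1)
+//   u3: use(R1:0)
+//
+// Neither d1 nor d2 alone covers R1:0, so the original use is marked Shadow
+// and an additional shadow use is created; one copy is linked to d1 and the
+// other to d2, and the traversal stops once the collected defs cover the
+// original reference.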
+
+// Create data-flow links for all reference nodes in the statement node SA.
+template <typename Predicate>
+void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, Stmt SA, Predicate P) {
+#ifndef NDEBUG
+ RegisterSet Defs(getPRI());
+#endif
+
+ // Link all nodes (upwards in the data-flow) with their reaching defs.
+ for (Ref RA : SA.Addr->members_if(P, *this)) {
+ uint16_t Kind = RA.Addr->getKind();
+ assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use);
+ RegisterRef RR = RA.Addr->getRegRef(*this);
+#ifndef NDEBUG
+ // Do not expect multiple defs of the same reference.
+ assert(Kind != NodeAttrs::Def || !Defs.count(RR));
+ Defs.insert(RR);
+#endif
+
+ auto F = DefM.find(RR.Reg);
+ if (F == DefM.end())
+ continue;
+ DefStack &DS = F->second;
+ if (Kind == NodeAttrs::Use)
+ linkRefUp<UseNode *>(SA, RA, DS);
+ else if (Kind == NodeAttrs::Def)
+ linkRefUp<DefNode *>(SA, RA, DS);
+ else
+ llvm_unreachable("Unexpected node in instruction");
+ }
+}
+
+// Create data-flow links for all instructions in the block node BA. This
+// will include updating any phi nodes in BA.
+void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, Block BA) {
+ // Push block delimiters.
+ markBlock(BA.Id, DefM);
+
+ auto IsClobber = [](Ref RA) -> bool {
+ return IsDef(RA) && (RA.Addr->getFlags() & NodeAttrs::Clobbering);
+ };
+ auto IsNoClobber = [](Ref RA) -> bool {
+ return IsDef(RA) && !(RA.Addr->getFlags() & NodeAttrs::Clobbering);
+ };
+
+ assert(BA.Addr && "block node address is needed to create a data-flow link");
+ // For each non-phi instruction in the block, link all the defs and uses
+ // to their reaching defs. For any member of the block (including phis),
+ // push the defs on the corresponding stacks.
+ for (Instr IA : BA.Addr->members(*this)) {
+ // Ignore phi nodes here. They will be linked part by part from the
+ // predecessors.
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) {
+ linkStmtRefs(DefM, IA, IsUse);
+ linkStmtRefs(DefM, IA, IsClobber);
+ }
+
+ // Push the definitions on the stack.
+ pushClobbers(IA, DefM);
+
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ linkStmtRefs(DefM, IA, IsNoClobber);
+
+ pushDefs(IA, DefM);
+ }
+
+ // Recursively process all children in the dominator tree.
+ MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
+ for (auto *I : *N) {
+ MachineBasicBlock *SB = I->getBlock();
+ Block SBA = findBlock(SB);
+ linkBlockRefs(DefM, SBA);
+ }
+
+ // Link the phi uses from the successor blocks.
+ auto IsUseForBA = [BA](Node NA) -> bool {
+ if (NA.Addr->getKind() != NodeAttrs::Use)
+ return false;
+ assert(NA.Addr->getFlags() & NodeAttrs::PhiRef);
+ return PhiUse(NA).Addr->getPredecessor() == BA.Id;
+ };
+
+ RegisterAggr EHLiveIns = getLandingPadLiveIns();
+ MachineBasicBlock *MBB = BA.Addr->getCode();
+
+ for (MachineBasicBlock *SB : MBB->successors()) {
+ bool IsEHPad = SB->isEHPad();
+ Block SBA = findBlock(SB);
+ for (Instr IA : SBA.Addr->members_if(IsPhi, *this)) {
+ // Do not link phi uses for landing pad live-ins.
+ if (IsEHPad) {
+ // Find what register this phi is for.
+ Ref RA = IA.Addr->getFirstMember(*this);
+ assert(RA.Id != 0);
+ if (EHLiveIns.hasCoverOf(RA.Addr->getRegRef(*this)))
+ continue;
+ }
+ // Go over each phi use associated with MBB, and link it.
+ for (auto U : IA.Addr->members_if(IsUseForBA, *this)) {
+ PhiUse PUA = U;
+ RegisterRef RR = PUA.Addr->getRegRef(*this);
+ linkRefUp<UseNode *>(IA, PUA, DefM[RR.Reg]);
+ }
+ }
+ }
+
+ // Pop all defs from this block from the definition stacks.
+ releaseBlock(BA.Id, DefM);
+}
+
+// Remove the use node UA from any data-flow and structural links.
+void DataFlowGraph::unlinkUseDF(Use UA) {
+ NodeId RD = UA.Addr->getReachingDef();
+ NodeId Sib = UA.Addr->getSibling();
+
+ if (RD == 0) {
+ assert(Sib == 0);
+ return;
+ }
+
+ auto RDA = addr<DefNode *>(RD);
+ auto TA = addr<UseNode *>(RDA.Addr->getReachedUse());
+ if (TA.Id == UA.Id) {
+ RDA.Addr->setReachedUse(Sib);
+ return;
+ }
+
+ while (TA.Id != 0) {
+ NodeId S = TA.Addr->getSibling();
+ if (S == UA.Id) {
+ TA.Addr->setSibling(UA.Addr->getSibling());
+ return;
+ }
+ TA = addr<UseNode *>(S);
+ }
+}
+
+// Remove the def node DA from any data-flow and structural links.
+void DataFlowGraph::unlinkDefDF(Def DA) {
+ //
+ // RD
+ // | reached
+ // | def
+ // :
+ // .
+ // +----+
+ // ... -- | DA | -- ... -- 0 : sibling chain of DA
+ // +----+
+ // | | reached
+ // | : def
+ // | .
+ // | ... : Siblings (defs)
+ // |
+ // : reached
+ // . use
+ // ... : sibling chain of reached uses
+
+ NodeId RD = DA.Addr->getReachingDef();
+
+ // Visit all siblings of the reached def and reset their reaching defs.
+ // Also, defs reached by DA are now "promoted" to being reached by RD,
+ // so all of them will need to be spliced into the sibling chain where
+ // DA belongs.
+ auto getAllNodes = [this](NodeId N) -> NodeList {
+ NodeList Res;
+ while (N) {
+ auto RA = addr<RefNode *>(N);
+ // Keep the nodes in the exact sibling order.
+ Res.push_back(RA);
+ N = RA.Addr->getSibling();
+ }
+ return Res;
+ };
+ NodeList ReachedDefs = getAllNodes(DA.Addr->getReachedDef());
+ NodeList ReachedUses = getAllNodes(DA.Addr->getReachedUse());
+
+ if (RD == 0) {
+ for (Ref I : ReachedDefs)
+ I.Addr->setSibling(0);
+ for (Ref I : ReachedUses)
+ I.Addr->setSibling(0);
+ }
+ for (Def I : ReachedDefs)
+ I.Addr->setReachingDef(RD);
+ for (Use I : ReachedUses)
+ I.Addr->setReachingDef(RD);
+
+ NodeId Sib = DA.Addr->getSibling();
+ if (RD == 0) {
+ assert(Sib == 0);
+ return;
+ }
+
+ // Update the reaching def node and remove DA from the sibling list.
+ auto RDA = addr<DefNode *>(RD);
+ auto TA = addr<DefNode *>(RDA.Addr->getReachedDef());
+ if (TA.Id == DA.Id) {
+ // If DA is the first reached def, just update the RD's reached def
+ // to the DA's sibling.
+ RDA.Addr->setReachedDef(Sib);
+ } else {
+ // Otherwise, traverse the sibling list of the reached defs and remove
+ // DA from it.
+ while (TA.Id != 0) {
+ NodeId S = TA.Addr->getSibling();
+ if (S == DA.Id) {
+ TA.Addr->setSibling(Sib);
+ break;
+ }
+ TA = addr<DefNode *>(S);
+ }
+ }
+
+ // Splice the DA's reached defs into the RDA's reached def chain.
+ if (!ReachedDefs.empty()) {
+ auto Last = Def(ReachedDefs.back());
+ Last.Addr->setSibling(RDA.Addr->getReachedDef());
+ RDA.Addr->setReachedDef(ReachedDefs.front().Id);
+ }
+ // Splice the DA's reached uses into the RDA's reached use chain.
+ if (!ReachedUses.empty()) {
+ auto Last = Use(ReachedUses.back());
+ Last.Addr->setSibling(RDA.Addr->getReachedUse());
+ RDA.Addr->setReachedUse(ReachedUses.front().Id);
+ }
+}
+
+bool DataFlowGraph::isTracked(RegisterRef RR) const {
+ return !disjoint(getPRI().getUnits(RR), TrackedUnits);
+}
+
+bool DataFlowGraph::hasUntrackedRef(Stmt S, bool IgnoreReserved) const {
+ SmallVector<MachineOperand *> Ops;
+
+ for (Ref R : S.Addr->members(*this)) {
+ Ops.push_back(&R.Addr->getOp());
+ RegisterRef RR = R.Addr->getRegRef(*this);
+ if (IgnoreReserved && RR.isReg() && ReservedRegs[RR.idx()])
+ continue;
+ if (!isTracked(RR))
+ return true;
+ }
+ for (const MachineOperand &Op : S.Addr->getCode()->operands()) {
+ if (!Op.isReg() && !Op.isRegMask())
+ continue;
+ if (llvm::find(Ops, &Op) == Ops.end())
+ return true;
+ }
+ return false;
+}
+
+} // end namespace llvm::rdf
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp
new file mode 100644
index 000000000000..11f3fedaa5f9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RDFLiveness.cpp
@@ -0,0 +1,1177 @@
+//===- RDFLiveness.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Computation of the liveness information from the data-flow graph.
+//
+// The main functionality of this code is to compute block live-in
+// information. With the live-in information in place, the placement
+// of kill flags can also be recalculated.
+//
+// The block live-in calculation is based on the ideas from the following
+// publication:
+//
+// Dibyendu Das, Ramakrishna Upadrasta, Benoit Dupont de Dinechin.
+// "Efficient Liveness Computation Using Merge Sets and DJ-Graphs."
+// ACM Transactions on Architecture and Code Optimization, Association for
+// Computing Machinery, 2012, ACM TACO Special Issue on "High-Performance
+// and Embedded Architectures and Compilers", 8 (4),
+// <10.1145/2086696.2086706>. <hal-00647369>
+//
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<unsigned> MaxRecNest("rdf-liveness-max-rec", cl::init(25),
+ cl::Hidden,
+ cl::desc("Maximum recursion level"));
+
+namespace llvm::rdf {
+
+raw_ostream &operator<<(raw_ostream &OS, const Print<Liveness::RefMap> &P) {
+ OS << '{';
+ for (const auto &I : P.Obj) {
+ OS << ' ' << printReg(I.first, &P.G.getTRI()) << '{';
+ for (auto J = I.second.begin(), E = I.second.end(); J != E;) {
+ OS << Print(J->first, P.G) << PrintLaneMaskShort(J->second);
+ if (++J != E)
+ OS << ',';
+ }
+ OS << '}';
+ }
+ OS << " }";
+ return OS;
+}
+
+// The order in the returned sequence is the order of reaching defs in the
+// upward traversal: the first def is the closest to the given reference RefA,
+// the next one is further up, and so on.
+// The list ends at a reaching phi def, or when the reference from RefA is
+// covered by the defs in the list (see FullChain).
+// This function provides two modes of operation:
+// (1) Returning the sequence of reaching defs for a particular reference
+// node. This sequence will terminate at the first phi node [1].
+// (2) Returning a partial sequence of reaching defs, where the final goal
+// is to traverse past phi nodes to the actual defs arising from the code
+// itself.
+// In mode (2), the register reference for which the search was started
+// may be different from the reference node RefA, for which this call was
+// made, hence the argument RefRR, which holds the original register.
+// Also, some definitions may have already been encountered in a previous
+// call that will influence register covering. The register references
+// already defined are passed in through DefRRs.
+// In mode (1), the "continuation" considerations do not apply, and the
+// RefRR is the same as the register in RefA, and the set DefRRs is empty.
+//
+// [1] It is possible for multiple phi nodes to be included in the returned
+// sequence:
+// SubA = phi ...
+// SubB = phi ...
+// ... = SuperAB(rdef:SubA), SuperAB"(rdef:SubB)
+// However, these phi nodes are independent from one another in terms of
+// the data-flow.
+
+NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
+ NodeAddr<RefNode *> RefA, bool TopShadows,
+ bool FullChain,
+ const RegisterAggr &DefRRs) {
+ NodeList RDefs; // Return value.
+ SetVector<NodeId> DefQ;
+ DenseMap<MachineInstr *, uint32_t> OrdMap;
+
+ // Dead defs will be treated as if they were live, since they are actually
+ // on the data-flow path. They cannot be ignored because even though they
+ // do not generate meaningful values, they still modify registers.
+
+ // If the reference is undefined, there is nothing to do.
+ if (RefA.Addr->getFlags() & NodeAttrs::Undef)
+ return RDefs;
+
+ // The initial queue should not have reaching defs for shadows. The
+ // whole point of a shadow is that it will have a reaching def that
+ // is not aliased to the reaching defs of the related shadows.
+ NodeId Start = RefA.Id;
+ auto SNA = DFG.addr<RefNode *>(Start);
+ if (NodeId RD = SNA.Addr->getReachingDef())
+ DefQ.insert(RD);
+ if (TopShadows) {
+ for (auto S : DFG.getRelatedRefs(RefA.Addr->getOwner(DFG), RefA))
+ if (NodeId RD = NodeAddr<RefNode *>(S).Addr->getReachingDef())
+ DefQ.insert(RD);
+ }
+
+ // Collect all the reaching defs, going up until a phi node is encountered,
+ // or there are no more reaching defs. From this set, the actual set of
+ // reaching defs will be selected.
+ // The traversal upwards must go on until a covering def is encountered.
+ // It is possible that a collection of non-covering (individually) defs
+ // will be sufficient, but keep going until a covering one is found.
+ for (unsigned i = 0; i < DefQ.size(); ++i) {
+ auto TA = DFG.addr<DefNode *>(DefQ[i]);
+ if (TA.Addr->getFlags() & NodeAttrs::PhiRef)
+ continue;
+ // Stop at the covering/overwriting def of the initial register reference.
+ RegisterRef RR = TA.Addr->getRegRef(DFG);
+ if (!DFG.IsPreservingDef(TA))
+ if (RegisterAggr::isCoverOf(RR, RefRR, PRI))
+ continue;
+ // Get the next level of reaching defs. This will include multiple
+ // reaching defs for shadows.
+ for (auto S : DFG.getRelatedRefs(TA.Addr->getOwner(DFG), TA))
+ if (NodeId RD = NodeAddr<RefNode *>(S).Addr->getReachingDef())
+ DefQ.insert(RD);
+ // Don't visit sibling defs. They share the same reaching def (which
+ // will be visited anyway), but they define something not aliased to
+ // this ref.
+ }
+
+ // Return the MachineBasicBlock containing a given instruction.
+ auto Block = [this](NodeAddr<InstrNode *> IA) -> MachineBasicBlock * {
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ return NodeAddr<StmtNode *>(IA).Addr->getCode()->getParent();
+ assert(IA.Addr->getKind() == NodeAttrs::Phi);
+ NodeAddr<PhiNode *> PA = IA;
+ NodeAddr<BlockNode *> BA = PA.Addr->getOwner(DFG);
+ return BA.Addr->getCode();
+ };
+
+ SmallSet<NodeId, 32> Defs;
+
+ // Remove all non-phi defs that are not aliased to RefRR, and separate
+ // the remaining defs into buckets by their containing blocks.
+ std::map<NodeId, NodeAddr<InstrNode *>> Owners;
+ std::map<MachineBasicBlock *, SmallVector<NodeId, 32>> Blocks;
+ for (NodeId N : DefQ) {
+ auto TA = DFG.addr<DefNode *>(N);
+ bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef;
+ if (!IsPhi && !PRI.alias(RefRR, TA.Addr->getRegRef(DFG)))
+ continue;
+ Defs.insert(TA.Id);
+ NodeAddr<InstrNode *> IA = TA.Addr->getOwner(DFG);
+ Owners[TA.Id] = IA;
+ Blocks[Block(IA)].push_back(IA.Id);
+ }
+
+ auto Precedes = [this, &OrdMap](NodeId A, NodeId B) {
+ if (A == B)
+ return false;
+ NodeAddr<InstrNode *> OA = DFG.addr<InstrNode *>(A);
+ NodeAddr<InstrNode *> OB = DFG.addr<InstrNode *>(B);
+ bool StmtA = OA.Addr->getKind() == NodeAttrs::Stmt;
+ bool StmtB = OB.Addr->getKind() == NodeAttrs::Stmt;
+ if (StmtA && StmtB) {
+ const MachineInstr *InA = NodeAddr<StmtNode *>(OA).Addr->getCode();
+ const MachineInstr *InB = NodeAddr<StmtNode *>(OB).Addr->getCode();
+ assert(InA->getParent() == InB->getParent());
+ auto FA = OrdMap.find(InA);
+ if (FA != OrdMap.end())
+ return FA->second < OrdMap.find(InB)->second;
+ const MachineBasicBlock *BB = InA->getParent();
+ for (auto It = BB->begin(), E = BB->end(); It != E; ++It) {
+ if (It == InA->getIterator())
+ return true;
+ if (It == InB->getIterator())
+ return false;
+ }
+ llvm_unreachable("InA and InB should be in the same block");
+ }
+ // One of them is a phi node.
+ if (!StmtA && !StmtB) {
+ // Both are phis, which are unordered. Break the tie by id numbers.
+ return A < B;
+ }
+ // Only one of them is a phi. Phis always precede statements.
+ return !StmtA;
+ };
+
+ auto GetOrder = [&OrdMap](MachineBasicBlock &B) {
+ uint32_t Pos = 0;
+ for (MachineInstr &In : B)
+ OrdMap.insert({&In, ++Pos});
+ };
+
+ // For each block, sort the nodes in it.
+ std::vector<MachineBasicBlock *> TmpBB;
+ for (auto &Bucket : Blocks) {
+ TmpBB.push_back(Bucket.first);
+ if (Bucket.second.size() > 2)
+ GetOrder(*Bucket.first);
+ llvm::sort(Bucket.second, Precedes);
+ }
+
+ // Sort the blocks with respect to dominance.
+ llvm::sort(TmpBB,
+ [this](auto A, auto B) { return MDT.properlyDominates(A, B); });
+
+ std::vector<NodeId> TmpInst;
+ for (MachineBasicBlock *MBB : llvm::reverse(TmpBB)) {
+ auto &Bucket = Blocks[MBB];
+ TmpInst.insert(TmpInst.end(), Bucket.rbegin(), Bucket.rend());
+ }
+
+ // The vector is a list of instructions, so that defs coming from
+ // the same instruction don't need to be artificially ordered.
+ // Then, when computing the initial segment, and iterating over an
+ // instruction, pick the defs that contribute to the covering (i.e. are
+ // not covered by previously added defs). Check the defs individually,
+ // i.e. first check whether each def is covered or not (without adding
+ // them to the tracking set), and then add all the selected ones.
+
+ // The reason for this is this example:
+ // *d1<A>, *d2<B>, ... Assume A and B are aliased (can happen in phi nodes).
+ // *d3<C> If A \incl BuC, and B \incl AuC, then *d2 would be
+ // covered if we added A first, and A would be covered
+ // if we added B first.
+ // In this example we want both A and B, because we don't want to give
+ // either one priority over the other, since they belong to the same
+ // statement.
+
+ RegisterAggr RRs(DefRRs);
+
+ auto DefInSet = [&Defs](NodeAddr<RefNode *> TA) -> bool {
+ return TA.Addr->getKind() == NodeAttrs::Def && Defs.count(TA.Id);
+ };
+
+ for (NodeId T : TmpInst) {
+ if (!FullChain && RRs.hasCoverOf(RefRR))
+ break;
+ auto TA = DFG.addr<InstrNode *>(T);
+ bool IsPhi = DFG.IsCode<NodeAttrs::Phi>(TA);
+ NodeList Ds;
+ for (NodeAddr<DefNode *> DA : TA.Addr->members_if(DefInSet, DFG)) {
+ RegisterRef QR = DA.Addr->getRegRef(DFG);
+ // Add phi defs even if they are covered by subsequent defs. This is
+ // for cases where the reached use is not covered by any of the defs
+ // encountered so far: the phi def is needed to expose the liveness
+ // of that use to the entry of the block.
+ // Example:
+ // phi d1<R3>(,d2,), ... Phi def d1 is covered by d2.
+ // d2<R3>(d1,,u3), ...
+ // ..., u3<D1>(d2) This use needs to be live on entry.
+ if (FullChain || IsPhi || !RRs.hasCoverOf(QR))
+ Ds.push_back(DA);
+ }
+ llvm::append_range(RDefs, Ds);
+ for (NodeAddr<DefNode *> DA : Ds) {
+ // When collecting a full chain of definitions, do not consider phi
+ // defs to actually define a register.
+ uint16_t Flags = DA.Addr->getFlags();
+ if (!FullChain || !(Flags & NodeAttrs::PhiRef))
+ if (!(Flags & NodeAttrs::Preserving)) // Don't care about Undef here.
+ RRs.insert(DA.Addr->getRegRef(DFG));
+ }
+ }
+
+ auto DeadP = [](const NodeAddr<DefNode *> DA) -> bool {
+ return DA.Addr->getFlags() & NodeAttrs::Dead;
+ };
+ llvm::erase_if(RDefs, DeadP);
+
+ return RDefs;
+}
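+
+// For orientation, the two modes described above correspond to the calls
+// made later in this file:
+//
+//   // Mode (1): reaching defs of a single use node.
+//   NodeList Ds = getAllReachingDefs(UA.Addr->getRegRef(DFG), UA);
+//
+//   // Mode (2): a partial chain used when traversing past phi nodes,
+//   // scanning the related top shadows and carrying the original
+//   // register reference S.
+//   NodeList Ds = getAllReachingDefs(S, PUA, /*TopShadows=*/true,
+//                                    /*FullChain=*/false, NoRegs);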
+
+std::pair<NodeSet, bool>
+Liveness::getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode *> RefA,
+ NodeSet &Visited, const NodeSet &Defs) {
+ return getAllReachingDefsRecImpl(RefRR, RefA, Visited, Defs, 0, MaxRecNest);
+}
+
+std::pair<NodeSet, bool>
+Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode *> RefA,
+ NodeSet &Visited, const NodeSet &Defs,
+ unsigned Nest, unsigned MaxNest) {
+ if (Nest > MaxNest)
+ return {NodeSet(), false};
+ // Collect all defined registers. Do not consider phis to be defining
+ // anything, only collect "real" definitions.
+ RegisterAggr DefRRs(PRI);
+ for (NodeId D : Defs) {
+ const auto DA = DFG.addr<const DefNode *>(D);
+ if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
+ DefRRs.insert(DA.Addr->getRegRef(DFG));
+ }
+
+ NodeList RDs = getAllReachingDefs(RefRR, RefA, false, true, DefRRs);
+ if (RDs.empty())
+ return {Defs, true};
+
+ // Make a copy of the preexisting definitions and add the newly found ones.
+ NodeSet TmpDefs = Defs;
+ for (NodeAddr<NodeBase *> R : RDs)
+ TmpDefs.insert(R.Id);
+
+ NodeSet Result = Defs;
+
+ for (NodeAddr<DefNode *> DA : RDs) {
+ Result.insert(DA.Id);
+ if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
+ continue;
+ NodeAddr<PhiNode *> PA = DA.Addr->getOwner(DFG);
+ if (!Visited.insert(PA.Id).second)
+ continue;
+ // Go over all phi uses and get the reaching defs for each use.
+ for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+ const auto &T = getAllReachingDefsRecImpl(RefRR, U, Visited, TmpDefs,
+ Nest + 1, MaxNest);
+ if (!T.second)
+ return {T.first, false};
+ Result.insert(T.first.begin(), T.first.end());
+ }
+ }
+
+ return {Result, true};
+}
+
+/// Find the nearest ref node aliased to RefRR, going upwards in the data
+/// flow, starting from the instruction immediately preceding IA.
+NodeAddr<RefNode *> Liveness::getNearestAliasedRef(RegisterRef RefRR,
+ NodeAddr<InstrNode *> IA) {
+ NodeAddr<BlockNode *> BA = IA.Addr->getOwner(DFG);
+ NodeList Ins = BA.Addr->members(DFG);
+ NodeId FindId = IA.Id;
+ auto E = Ins.rend();
+ auto B =
+ std::find_if(Ins.rbegin(), E, [FindId](const NodeAddr<InstrNode *> T) {
+ return T.Id == FindId;
+ });
+ // Do not scan IA (which is what B would point to).
+ if (B != E)
+ ++B;
+
+ do {
+ // Process the range of instructions from B to E.
+ for (NodeAddr<InstrNode *> I : make_range(B, E)) {
+ NodeList Refs = I.Addr->members(DFG);
+ NodeAddr<RefNode *> Clob, Use;
+ // Scan all the refs in I aliased to RefRR, and return the one that
+ // is the closest to the output of I, i.e. def > clobber > use.
+ for (NodeAddr<RefNode *> R : Refs) {
+ if (!PRI.alias(R.Addr->getRegRef(DFG), RefRR))
+ continue;
+ if (DFG.IsDef(R)) {
+ // If it's a non-clobbering def, just return it.
+ if (!(R.Addr->getFlags() & NodeAttrs::Clobbering))
+ return R;
+ Clob = R;
+ } else {
+ Use = R;
+ }
+ }
+ if (Clob.Id != 0)
+ return Clob;
+ if (Use.Id != 0)
+ return Use;
+ }
+
+ // Go up to the immediate dominator, if any.
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ BA = NodeAddr<BlockNode *>();
+ if (MachineDomTreeNode *N = MDT.getNode(BB)) {
+ if ((N = N->getIDom()))
+ BA = DFG.findBlock(N->getBlock());
+ }
+ if (!BA.Id)
+ break;
+
+ Ins = BA.Addr->members(DFG);
+ B = Ins.rbegin();
+ E = Ins.rend();
+ } while (true);
+
+ return NodeAddr<RefNode *>();
+}
+
+NodeSet Liveness::getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode *> DefA,
+ const RegisterAggr &DefRRs) {
+ NodeSet Uses;
+
+ // If the original register is already covered by all the intervening
+ // defs, no more uses can be reached.
+ if (DefRRs.hasCoverOf(RefRR))
+ return Uses;
+
+ // Add all directly reached uses.
+ // If the def is dead, it does not provide a value for any use.
+ bool IsDead = DefA.Addr->getFlags() & NodeAttrs::Dead;
+ NodeId U = !IsDead ? DefA.Addr->getReachedUse() : 0;
+ while (U != 0) {
+ auto UA = DFG.addr<UseNode *>(U);
+ if (!(UA.Addr->getFlags() & NodeAttrs::Undef)) {
+ RegisterRef UR = UA.Addr->getRegRef(DFG);
+ if (PRI.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
+ Uses.insert(U);
+ }
+ U = UA.Addr->getSibling();
+ }
+
+ // Traverse all reached defs. This time dead defs cannot be ignored.
+ for (NodeId D = DefA.Addr->getReachedDef(), NextD; D != 0; D = NextD) {
+ auto DA = DFG.addr<DefNode *>(D);
+ NextD = DA.Addr->getSibling();
+ RegisterRef DR = DA.Addr->getRegRef(DFG);
+ // If this def is already covered, it cannot reach anything new.
+ // Similarly, skip it if it is not aliased to the interesting register.
+ if (DefRRs.hasCoverOf(DR) || !PRI.alias(RefRR, DR))
+ continue;
+ NodeSet T;
+ if (DFG.IsPreservingDef(DA)) {
+ // If it is a preserving def, do not update the set of intervening defs.
+ T = getAllReachedUses(RefRR, DA, DefRRs);
+ } else {
+ RegisterAggr NewDefRRs = DefRRs;
+ NewDefRRs.insert(DR);
+ T = getAllReachedUses(RefRR, DA, NewDefRRs);
+ }
+ Uses.insert(T.begin(), T.end());
+ }
+ return Uses;
+}
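+
+// Note: getAllReachedUses is the downward counterpart of getAllReachingDefs:
+// starting from a def, it walks the reached-use and reached-def chains,
+// accumulating intervening non-preserving defs in DefRRs so that uses whose
+// register is already fully covered by closer defs are not reported.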
+
+void Liveness::computePhiInfo() {
+ RealUseMap.clear();
+
+ NodeList Phis;
+ NodeAddr<FuncNode *> FA = DFG.getFunc();
+ NodeList Blocks = FA.Addr->members(DFG);
+ for (NodeAddr<BlockNode *> BA : Blocks) {
+ auto Ps = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
+ llvm::append_range(Phis, Ps);
+ }
+
+ // phi use -> (map: reaching phi -> set of registers defined in between)
+ std::map<NodeId, std::map<NodeId, RegisterAggr>> PhiUp;
+ std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation.
+ std::unordered_map<NodeId, RegisterAggr>
+ PhiDRs; // Phi -> registers defined by it.
+
+ // Go over all phis.
+ for (NodeAddr<PhiNode *> PhiA : Phis) {
+ // Go over all defs and collect the reached uses that are non-phi uses
+ // (i.e. the "real uses").
+ RefMap &RealUses = RealUseMap[PhiA.Id];
+ NodeList PhiRefs = PhiA.Addr->members(DFG);
+
+ // Have a work queue of defs whose reached uses need to be found.
+ // For each def, add to the queue all reached (non-phi) defs.
+ SetVector<NodeId> DefQ;
+ NodeSet PhiDefs;
+ RegisterAggr DRs(PRI);
+ for (NodeAddr<RefNode *> R : PhiRefs) {
+ if (!DFG.IsRef<NodeAttrs::Def>(R))
+ continue;
+ DRs.insert(R.Addr->getRegRef(DFG));
+ DefQ.insert(R.Id);
+ PhiDefs.insert(R.Id);
+ }
+ PhiDRs.insert(std::make_pair(PhiA.Id, DRs));
+
+ // Collect the super-set of all possible reached uses. This set will
+ // contain all uses reached from this phi, either directly from the
+ // phi defs, or (recursively) via non-phi defs reached by the phi defs.
+ // This set of uses will later be trimmed to only contain the uses that
+ // are actually reached by the phi defs.
+ for (unsigned i = 0; i < DefQ.size(); ++i) {
+ NodeAddr<DefNode *> DA = DFG.addr<DefNode *>(DefQ[i]);
+ // Visit all reached uses. Phi defs should not really have the "dead"
+ // flag set, but check it anyway for consistency.
+ bool IsDead = DA.Addr->getFlags() & NodeAttrs::Dead;
+ NodeId UN = !IsDead ? DA.Addr->getReachedUse() : 0;
+ while (UN != 0) {
+ NodeAddr<UseNode *> A = DFG.addr<UseNode *>(UN);
+ uint16_t F = A.Addr->getFlags();
+ if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) {
+ RegisterRef R = A.Addr->getRegRef(DFG);
+ RealUses[R.Reg].insert({A.Id, R.Mask});
+ }
+ UN = A.Addr->getSibling();
+ }
+ // Visit all reached defs, and add them to the queue. These defs may
+ // override some of the uses collected here, but that will be handled
+ // later.
+ NodeId DN = DA.Addr->getReachedDef();
+ while (DN != 0) {
+ NodeAddr<DefNode *> A = DFG.addr<DefNode *>(DN);
+ for (auto T : DFG.getRelatedRefs(A.Addr->getOwner(DFG), A)) {
+ uint16_t Flags = NodeAddr<DefNode *>(T).Addr->getFlags();
+ // Must traverse the reached-def chain. Consider:
+ // def(D0) -> def(R0) -> def(R0) -> use(D0)
+ // The reachable use of D0 passes through a def of R0.
+ if (!(Flags & NodeAttrs::PhiRef))
+ DefQ.insert(T.Id);
+ }
+ DN = A.Addr->getSibling();
+ }
+ }
+ // Filter out the uses that appear to be reachable, but really
+ // are not. For example:
+ //
+ // R1:0 = d1
+ // = R1:0 u2 Reached by d1.
+ // R0 = d3
+ // = R1:0 u4 Still reached by d1: indirectly through
+ // the def d3.
+ // R1 = d5
+ // = R1:0 u6 Not reached by d1 (covered collectively
+ // by d3 and d5), but following reached
+ // defs and uses from d1 will lead here.
+ for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE;) {
+ // For each reached register UI->first, there is a set UI->second, of
+ // uses of it. For each such use, check if it is reached by this phi,
+ // i.e. check if the set of its reaching uses intersects the set of
+ // this phi's defs.
+ NodeRefSet Uses = UI->second;
+ UI->second.clear();
+ for (std::pair<NodeId, LaneBitmask> I : Uses) {
+ auto UA = DFG.addr<UseNode *>(I.first);
+ // Undef flag is checked above.
+ assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0);
+ RegisterRef UseR(UI->first, I.second); // Ref from Uses
+ // R = intersection of the ref from the phi and the ref from Uses
+ RegisterRef R = PhiDRs.at(PhiA.Id).intersectWith(UseR);
+ if (!R)
+ continue;
+ // Calculate the exposed part of the reached use.
+ RegisterAggr Covered(PRI);
+ for (NodeAddr<DefNode *> DA : getAllReachingDefs(R, UA)) {
+ if (PhiDefs.count(DA.Id))
+ break;
+ Covered.insert(DA.Addr->getRegRef(DFG));
+ }
+ if (RegisterRef RC = Covered.clearIn(R)) {
+ // We are updating the map for register UI->first, so we need
+ // to map RC to be expressed in terms of that register.
+ RegisterRef S = PRI.mapTo(RC, UI->first);
+ UI->second.insert({I.first, S.Mask});
+ }
+ }
+ UI = UI->second.empty() ? RealUses.erase(UI) : std::next(UI);
+ }
+
+ // If this phi reaches some "real" uses, add it to the queue for upward
+ // propagation.
+ if (!RealUses.empty())
+ PhiUQ.push_back(PhiA.Id);
+
+ // Go over all phi uses and check if the reaching def is another phi.
+ // Collect the phis that are among the reaching defs of these uses.
+ // While traversing the list of reaching defs for each phi use, accumulate
+ // the set of registers defined between this phi (PhiA) and the owner phi
+ // of the reaching def.
+ NodeSet SeenUses;
+
+ for (auto I : PhiRefs) {
+ if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id))
+ continue;
+ NodeAddr<PhiUseNode *> PUA = I;
+ if (PUA.Addr->getReachingDef() == 0)
+ continue;
+
+ RegisterRef UR = PUA.Addr->getRegRef(DFG);
+ NodeList Ds = getAllReachingDefs(UR, PUA, true, false, NoRegs);
+ RegisterAggr DefRRs(PRI);
+
+ for (NodeAddr<DefNode *> D : Ds) {
+ if (D.Addr->getFlags() & NodeAttrs::PhiRef) {
+ NodeId RP = D.Addr->getOwner(DFG).Id;
+ std::map<NodeId, RegisterAggr> &M = PhiUp[PUA.Id];
+ auto F = M.find(RP);
+ if (F == M.end())
+ M.insert(std::make_pair(RP, DefRRs));
+ else
+ F->second.insert(DefRRs);
+ }
+ DefRRs.insert(D.Addr->getRegRef(DFG));
+ }
+
+ for (NodeAddr<PhiUseNode *> T : DFG.getRelatedRefs(PhiA, PUA))
+ SeenUses.insert(T.Id);
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "Phi-up-to-phi map with intervening defs:\n";
+ for (auto I : PhiUp) {
+ dbgs() << "phi " << Print(I.first, DFG) << " -> {";
+ for (auto R : I.second)
+ dbgs() << ' ' << Print(R.first, DFG) << Print(R.second, DFG);
+ dbgs() << " }\n";
+ }
+ }
+
+ // Propagate the reached registers up in the phi chain.
+ //
+ // The following type of situation needs careful handling:
+ //
+ // phi d1<R1:0> (1)
+ // |
+ // ... d2<R1>
+ // |
+ // phi u3<R1:0> (2)
+ // |
+ // ... u4<R1>
+ //
+ // The phi node (2) defines a register pair R1:0, and reaches a "real"
+ // use u4 of just R1. The same phi node is also known to reach (upwards)
+ // the phi node (1). However, the use u4 is not reached by phi (1),
+ // because of the intervening definition d2 of R1. The data flow between
+ // phis (1) and (2) is restricted to R1:0 minus R1, i.e. R0.
+ //
+ // When propagating uses up the phi chains, get all the reaching defs
+ // for a given phi use, and traverse the list until the propagated ref
+ // is covered, or until reaching the final phi. Only assume that the
+ // reference reaches the phi in the latter case.
+
+ // The operation "clearIn" can be expensive. For a given set of intervening
+ // defs, cache the result of subtracting these defs from a given register
+ // ref.
+ using RefHash = std::hash<RegisterRef>;
+ using RefEqual = std::equal_to<RegisterRef>;
+ using SubMap = std::unordered_map<RegisterRef, RegisterRef>;
+ std::unordered_map<RegisterAggr, SubMap> Subs;
+ auto ClearIn = [](RegisterRef RR, const RegisterAggr &Mid, SubMap &SM) {
+ if (Mid.empty())
+ return RR;
+ auto F = SM.find(RR);
+ if (F != SM.end())
+ return F->second;
+ RegisterRef S = Mid.clearIn(RR);
+ SM.insert({RR, S});
+ return S;
+ };
+
+ // Go over all phis.
+ for (unsigned i = 0; i < PhiUQ.size(); ++i) {
+ auto PA = DFG.addr<PhiNode *>(PhiUQ[i]);
+ NodeList PUs = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG);
+ RefMap &RUM = RealUseMap[PA.Id];
+
+ for (NodeAddr<UseNode *> UA : PUs) {
+ std::map<NodeId, RegisterAggr> &PUM = PhiUp[UA.Id];
+ RegisterRef UR = UA.Addr->getRegRef(DFG);
+ for (const std::pair<const NodeId, RegisterAggr> &P : PUM) {
+ bool Changed = false;
+ const RegisterAggr &MidDefs = P.second;
+ // Collect the set PropUp of uses that are reached by the current
+ // phi PA, and are not covered by any intervening def between the
+ // currently visited use UA and the upward phi P.
+
+ if (MidDefs.hasCoverOf(UR))
+ continue;
+ if (Subs.find(MidDefs) == Subs.end()) {
+ Subs.insert({MidDefs, SubMap(1, RefHash(), RefEqual(PRI))});
+ }
+ SubMap &SM = Subs.at(MidDefs);
+
+ // General algorithm:
+ // for each (R,U) : U is use node of R, U is reached by PA
+ // if MidDefs does not cover (R,U)
+ // then add (R-MidDefs,U) to RealUseMap[P]
+ //
+ for (const std::pair<const RegisterId, NodeRefSet> &T : RUM) {
+ RegisterRef R(T.first);
+ // The current phi (PA) could be a phi for a regmask. It could
+ // reach a whole variety of uses that are not related to the
+ // specific upward phi (P.first).
+ const RegisterAggr &DRs = PhiDRs.at(P.first);
+ if (!DRs.hasAliasOf(R))
+ continue;
+ R = PRI.mapTo(DRs.intersectWith(R), T.first);
+ for (std::pair<NodeId, LaneBitmask> V : T.second) {
+ LaneBitmask M = R.Mask & V.second;
+ if (M.none())
+ continue;
+ if (RegisterRef SS = ClearIn(RegisterRef(R.Reg, M), MidDefs, SM)) {
+ NodeRefSet &RS = RealUseMap[P.first][SS.Reg];
+ Changed |= RS.insert({V.first, SS.Mask}).second;
+ }
+ }
+ }
+
+ if (Changed)
+ PhiUQ.push_back(P.first);
+ }
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "Real use map:\n";
+ for (auto I : RealUseMap) {
+ dbgs() << "phi " << Print(I.first, DFG);
+ NodeAddr<PhiNode *> PA = DFG.addr<PhiNode *>(I.first);
+ NodeList Ds = PA.Addr->members_if(DFG.IsRef<NodeAttrs::Def>, DFG);
+ if (!Ds.empty()) {
+ RegisterRef RR = NodeAddr<DefNode *>(Ds[0]).Addr->getRegRef(DFG);
+ dbgs() << '<' << Print(RR, DFG) << '>';
+ } else {
+ dbgs() << "<noreg>";
+ }
+ dbgs() << " -> " << Print(I.second, DFG) << '\n';
+ }
+ }
+}
+
+void Liveness::computeLiveIns() {
+ // Populate the node-to-block map. This speeds up the calculations
+ // significantly.
+ NBMap.clear();
+ for (NodeAddr<BlockNode *> BA : DFG.getFunc().Addr->members(DFG)) {
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ for (NodeAddr<InstrNode *> IA : BA.Addr->members(DFG)) {
+ for (NodeAddr<RefNode *> RA : IA.Addr->members(DFG))
+ NBMap.insert(std::make_pair(RA.Id, BB));
+ NBMap.insert(std::make_pair(IA.Id, BB));
+ }
+ }
+
+ MachineFunction &MF = DFG.getMF();
+
+ // Compute IDF first, then the inverse.
+ decltype(IIDF) IDF;
+ for (MachineBasicBlock &B : MF) {
+ auto F1 = MDF.find(&B);
+ if (F1 == MDF.end())
+ continue;
+ SetVector<MachineBasicBlock *> IDFB(F1->second.begin(), F1->second.end());
+ for (unsigned i = 0; i < IDFB.size(); ++i) {
+ auto F2 = MDF.find(IDFB[i]);
+ if (F2 != MDF.end())
+ IDFB.insert(F2->second.begin(), F2->second.end());
+ }
+ // Add B to the IDF(B). This will put B in the IIDF(B).
+ IDFB.insert(&B);
+ IDF[&B].insert(IDFB.begin(), IDFB.end());
+ }
+
+ for (auto I : IDF)
+ for (auto *S : I.second)
+ IIDF[S].insert(I.first);
+
+ computePhiInfo();
+
+ NodeAddr<FuncNode *> FA = DFG.getFunc();
+ NodeList Blocks = FA.Addr->members(DFG);
+
+ // Build the phi live-on-entry map.
+ for (NodeAddr<BlockNode *> BA : Blocks) {
+ MachineBasicBlock *MB = BA.Addr->getCode();
+ RefMap &LON = PhiLON[MB];
+ for (auto P : BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG)) {
+ for (const RefMap::value_type &S : RealUseMap[P.Id])
+ LON[S.first].insert(S.second.begin(), S.second.end());
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "Phi live-on-entry map:\n";
+ for (auto &I : PhiLON)
+ dbgs() << "block #" << I.first->getNumber() << " -> "
+ << Print(I.second, DFG) << '\n';
+ }
+
+ // Build the phi live-on-exit map. Each phi node has some set of reached
+ // "real" uses. Propagate this set backwards into the block predecessors
+ // through the reaching defs of the corresponding phi uses.
+ for (NodeAddr<BlockNode *> BA : Blocks) {
+ NodeList Phis = BA.Addr->members_if(DFG.IsCode<NodeAttrs::Phi>, DFG);
+ for (NodeAddr<PhiNode *> PA : Phis) {
+ RefMap &RUs = RealUseMap[PA.Id];
+ if (RUs.empty())
+ continue;
+
+ NodeSet SeenUses;
+ for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+ if (!SeenUses.insert(U.Id).second)
+ continue;
+ NodeAddr<PhiUseNode *> PUA = U;
+ if (PUA.Addr->getReachingDef() == 0)
+ continue;
+
+ // Each phi has some set (possibly empty) of reached "real" uses,
+ // that is, uses that are part of the compiled program. Such a use
+ // may be located in some farther block, but following a chain of
+ // reaching defs will eventually lead to this phi.
+ // Any chain of reaching defs may fork at a phi node, but there
+ // will be a path upwards that will lead to this phi. Now, this
+ // chain will need to fork at this phi, since some of the reached
+ // uses may have definitions joining in from multiple predecessors.
+ // For each reached "real" use, identify the set of reaching defs
+ // coming from each predecessor P, and add them to PhiLOX[P].
+ //
+ auto PrA = DFG.addr<BlockNode *>(PUA.Addr->getPredecessor());
+ RefMap &LOX = PhiLOX[PrA.Addr->getCode()];
+
+ for (const std::pair<const RegisterId, NodeRefSet> &RS : RUs) {
+ // We need to visit each individual use.
+ for (std::pair<NodeId, LaneBitmask> P : RS.second) {
+ // Create a register ref corresponding to the use, and find
+ // all reaching defs starting from the phi use, and treating
+ // all related shadows as a single use cluster.
+ RegisterRef S(RS.first, P.second);
+ NodeList Ds = getAllReachingDefs(S, PUA, true, false, NoRegs);
+ for (NodeAddr<DefNode *> D : Ds) {
+ // Calculate the mask corresponding to the visited def.
+ RegisterAggr TA(PRI);
+ TA.insert(D.Addr->getRegRef(DFG)).intersect(S);
+ LaneBitmask TM = TA.makeRegRef().Mask;
+ LOX[S.Reg].insert({D.Id, TM});
+ }
+ }
+ }
+
+ for (NodeAddr<PhiUseNode *> T : DFG.getRelatedRefs(PA, PUA))
+ SeenUses.insert(T.Id);
+ } // for U : phi uses
+ } // for P : Phis
+ } // for B : Blocks
+
+ if (Trace) {
+ dbgs() << "Phi live-on-exit map:\n";
+ for (auto &I : PhiLOX)
+ dbgs() << "block #" << I.first->getNumber() << " -> "
+ << Print(I.second, DFG) << '\n';
+ }
+
+ RefMap LiveIn;
+ traverse(&MF.front(), LiveIn);
+
+ // Add function live-ins to the live-in set of the function entry block.
+ LiveMap[&MF.front()].insert(DFG.getLiveIns());
+
+ if (Trace) {
+ // Dump the liveness map
+ for (MachineBasicBlock &B : MF) {
+ std::vector<RegisterRef> LV;
+ for (const MachineBasicBlock::RegisterMaskPair &LI : B.liveins())
+ LV.push_back(RegisterRef(LI.PhysReg, LI.LaneMask));
+ llvm::sort(LV, std::less<RegisterRef>(PRI));
+ dbgs() << printMBBReference(B) << "\t rec = {";
+ for (auto I : LV)
+ dbgs() << ' ' << Print(I, DFG);
+ dbgs() << " }\n";
+ // dbgs() << "\tcomp = " << Print(LiveMap[&B], DFG) << '\n';
+
+ LV.clear();
+ for (RegisterRef RR : LiveMap[&B].refs())
+ LV.push_back(RR);
+ llvm::sort(LV, std::less<RegisterRef>(PRI));
+ dbgs() << "\tcomp = {";
+ for (auto I : LV)
+ dbgs() << ' ' << Print(I, DFG);
+ dbgs() << " }\n";
+ }
+ }
+}
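+
+// A typical driver for the computation above (illustrative sketch only; the
+// exact constructor signatures live in RDFGraph.h and RDFLiveness.h):
+//
+//   DataFlowGraph G(MF, TII, TRI, MDT, MDF);   // plus any target-specific args
+//   G.build();
+//   Liveness LV(MF.getRegInfo(), G);
+//   LV.computeLiveIns();   // recompute the block live-in information
+//   LV.resetLiveIns();     // write it back into the MachineBasicBlocks
+//   LV.resetKills();       // then recompute kill flags from the new live-ins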
+
+void Liveness::resetLiveIns() {
+ for (auto &B : DFG.getMF()) {
+ // Remove all live-ins.
+ std::vector<unsigned> T;
+ for (const MachineBasicBlock::RegisterMaskPair &LI : B.liveins())
+ T.push_back(LI.PhysReg);
+ for (auto I : T)
+ B.removeLiveIn(I);
+ // Add the newly computed live-ins.
+ const RegisterAggr &LiveIns = LiveMap[&B];
+ for (RegisterRef R : LiveIns.refs())
+ B.addLiveIn({MCPhysReg(R.Reg), R.Mask});
+ }
+}
+
+void Liveness::resetKills() {
+ for (auto &B : DFG.getMF())
+ resetKills(&B);
+}
+
+void Liveness::resetKills(MachineBasicBlock *B) {
+ auto CopyLiveIns = [this](MachineBasicBlock *B, BitVector &LV) -> void {
+ for (auto I : B->liveins()) {
+ MCSubRegIndexIterator S(I.PhysReg, &TRI);
+ if (!S.isValid()) {
+ LV.set(I.PhysReg);
+ continue;
+ }
+ do {
+ LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
+ if ((M & I.LaneMask).any())
+ LV.set(S.getSubReg());
+ ++S;
+ } while (S.isValid());
+ }
+ };
+
+ BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs());
+ CopyLiveIns(B, LiveIn);
+ for (auto *SI : B->successors())
+ CopyLiveIns(SI, Live);
+
+ for (MachineInstr &MI : llvm::reverse(*B)) {
+ if (MI.isDebugInstr())
+ continue;
+
+ MI.clearKillInfo();
+ for (auto &Op : MI.all_defs()) {
+ // An implicit def of a super-register may not necessarily start a
+ // live range of it, since an implicit use could be used to keep parts
+ // of it live. Instead of analyzing the implicit operands, ignore
+ // implicit defs.
+ if (Op.isImplicit())
+ continue;
+ Register R = Op.getReg();
+ if (!R.isPhysical())
+ continue;
+ for (MCPhysReg SR : TRI.subregs_inclusive(R))
+ Live.reset(SR);
+ }
+ for (auto &Op : MI.all_uses()) {
+ if (Op.isUndef())
+ continue;
+ Register R = Op.getReg();
+ if (!R.isPhysical())
+ continue;
+ bool IsLive = false;
+ for (MCRegAliasIterator AR(R, &TRI, true); AR.isValid(); ++AR) {
+ if (!Live[*AR])
+ continue;
+ IsLive = true;
+ break;
+ }
+ if (!IsLive)
+ Op.setIsKill(true);
+ for (MCPhysReg SR : TRI.subregs_inclusive(R))
+ Live.set(SR);
+ }
+ }
+}
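+
+// The kill recomputation above walks each block bottom-up: explicit defs
+// remove a register (and its sub-registers) from the live set, and a use is
+// marked as a kill exactly when no alias of its register is still live below
+// it, i.e. when it is the last use before the register becomes dead.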
+
+// Helper function to obtain the basic block containing the reaching def
+// of the given use.
+MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const {
+ auto F = NBMap.find(RN);
+ if (F != NBMap.end())
+ return F->second;
+ llvm_unreachable("Node id not in map");
+}
+
+void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
+ // The LiveIn map, for each (physical) register, contains the set of live
+ // reaching defs of that register that are live on entry to the associated
+ // block.
+
+ // The summary of the traversal algorithm:
+ //
+ // R is live-in in B, if there exists a U(R), such that rdef(R) dom B
+ // and (U \in IDF(B) or B dom U).
+ //
+ // for (C : children) {
+ // LU = {}
+ // traverse(C, LU)
+ // LiveUses += LU
+ // }
+ //
+ // LiveUses -= Defs(B);
+ // LiveUses += UpwardExposedUses(B);
+ // for (C : IIDF[B])
+ // for (U : LiveUses)
+ // if (Rdef(U) dom C)
+ // C.addLiveIn(U)
+ //
+
+ // Go up the dominator tree (depth-first).
+ MachineDomTreeNode *N = MDT.getNode(B);
+ for (auto *I : *N) {
+ RefMap L;
+ MachineBasicBlock *SB = I->getBlock();
+ traverse(SB, L);
+
+ for (auto S : L)
+ LiveIn[S.first].insert(S.second.begin(), S.second.end());
+ }
+
+ if (Trace) {
+ dbgs() << "\n-- " << printMBBReference(*B) << ": " << __func__
+ << " after recursion into: {";
+ for (auto *I : *N)
+ dbgs() << ' ' << I->getBlock()->getNumber();
+ dbgs() << " }\n";
+ dbgs() << " LiveIn: " << Print(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print(LiveMap[B], DFG) << '\n';
+ }
+
+ // Add reaching defs of phi uses that are live on exit from this block.
+ RefMap &PUs = PhiLOX[B];
+ for (auto &S : PUs)
+ LiveIn[S.first].insert(S.second.begin(), S.second.end());
+
+ if (Trace) {
+ dbgs() << "after LOX\n";
+ dbgs() << " LiveIn: " << Print(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print(LiveMap[B], DFG) << '\n';
+ }
+
+ // The LiveIn map at this point has all defs that are live-on-exit from B,
+ // as if they were live-on-entry to B. First, we need to filter out all
+ // defs that are present in this block. Then we will add reaching defs of
+ // all upward-exposed uses.
+
+ // To filter out the defs, first make a copy of LiveIn, and then re-populate
+ // LiveIn with the defs that should remain.
+ RefMap LiveInCopy = LiveIn;
+ LiveIn.clear();
+
+ for (const std::pair<const RegisterId, NodeRefSet> &LE : LiveInCopy) {
+ RegisterRef LRef(LE.first);
+ NodeRefSet &NewDefs = LiveIn[LRef.Reg]; // To be filled.
+ const NodeRefSet &OldDefs = LE.second;
+ for (NodeRef OR : OldDefs) {
+ // R is a def node that was live-on-exit
+ auto DA = DFG.addr<DefNode *>(OR.first);
+ NodeAddr<InstrNode *> IA = DA.Addr->getOwner(DFG);
+ NodeAddr<BlockNode *> BA = IA.Addr->getOwner(DFG);
+ if (B != BA.Addr->getCode()) {
+ // Defs from a different block need to be preserved. Defs from this
+ // block will need to be processed further, except for phi defs, the
+ // liveness of which is handled through the PhiLON/PhiLOX maps.
+ NewDefs.insert(OR);
+ continue;
+ }
+
+ // Defs from this block need to stop the liveness from being
+ // propagated upwards. This only applies to non-preserving defs,
+ // and to the parts of the register actually covered by those defs.
+ // (Note that phi defs should always be preserving.)
+ RegisterAggr RRs(PRI);
+ LRef.Mask = OR.second;
+
+ if (!DFG.IsPreservingDef(DA)) {
+ assert(!(IA.Addr->getFlags() & NodeAttrs::Phi));
+ // DA is a non-phi def that is live-on-exit from this block, and
+ // that is also located in this block. LRef is a register ref
+ // whose use this def reaches. If DA covers LRef, then no part
+ // of LRef is exposed upwards.
+ if (RRs.insert(DA.Addr->getRegRef(DFG)).hasCoverOf(LRef))
+ continue;
+ }
+
+ // DA itself was not sufficient to cover LRef. In general, it is
+ // the last in a chain of aliased defs before the exit from this block.
+ // There could be other defs in this block that are a part of that
+ // chain. Check that now: accumulate the registers from these defs,
+ // and if they all together cover LRef, it is not live-on-entry.
+ for (NodeAddr<DefNode *> TA : getAllReachingDefs(DA)) {
+ // DefNode -> InstrNode -> BlockNode.
+ NodeAddr<InstrNode *> ITA = TA.Addr->getOwner(DFG);
+ NodeAddr<BlockNode *> BTA = ITA.Addr->getOwner(DFG);
+ // Reaching defs are ordered in the upward direction.
+ if (BTA.Addr->getCode() != B) {
+ // We have reached past the beginning of B, and the accumulated
+ // registers are not covering LRef. The first def from the
+ // upward chain will be live.
+ // Subtract all accumulated defs (RRs) from LRef.
+ RegisterRef T = RRs.clearIn(LRef);
+ assert(T);
+ NewDefs.insert({TA.Id, T.Mask});
+ break;
+ }
+
+ // TA is in B. Only add this def to the accumulated cover if it is
+ // not preserving.
+ if (!(TA.Addr->getFlags() & NodeAttrs::Preserving))
+ RRs.insert(TA.Addr->getRegRef(DFG));
+ // If this is enough to cover LRef, then stop.
+ if (RRs.hasCoverOf(LRef))
+ break;
+ }
+ }
+ }
+
+ emptify(LiveIn);
+
+ if (Trace) {
+ dbgs() << "after defs in block\n";
+ dbgs() << " LiveIn: " << Print(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print(LiveMap[B], DFG) << '\n';
+ }
+
+ // Scan the block for upward-exposed uses and add them to the tracking set.
+ for (auto I : DFG.getFunc().Addr->findBlock(B, DFG).Addr->members(DFG)) {
+ NodeAddr<InstrNode *> IA = I;
+ if (IA.Addr->getKind() != NodeAttrs::Stmt)
+ continue;
+ for (NodeAddr<UseNode *> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
+ if (UA.Addr->getFlags() & NodeAttrs::Undef)
+ continue;
+ RegisterRef RR = UA.Addr->getRegRef(DFG);
+ for (NodeAddr<DefNode *> D : getAllReachingDefs(UA))
+ if (getBlockWithRef(D.Id) != B)
+ LiveIn[RR.Reg].insert({D.Id, RR.Mask});
+ }
+ }
+
+ if (Trace) {
+ dbgs() << "after uses in block\n";
+ dbgs() << " LiveIn: " << Print(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print(LiveMap[B], DFG) << '\n';
+ }
+
+ // Phi uses should not be propagated up the dominator tree, since they
+ // are not dominated by their corresponding reaching defs.
+ RegisterAggr &Local = LiveMap[B];
+ RefMap &LON = PhiLON[B];
+ for (auto &R : LON) {
+ LaneBitmask M;
+ for (auto P : R.second)
+ M |= P.second;
+ Local.insert(RegisterRef(R.first, M));
+ }
+
+ if (Trace) {
+ dbgs() << "after phi uses in block\n";
+ dbgs() << " LiveIn: " << Print(LiveIn, DFG) << '\n';
+ dbgs() << " Local: " << Print(Local, DFG) << '\n';
+ }
+
+ for (auto *C : IIDF[B]) {
+ RegisterAggr &LiveC = LiveMap[C];
+ for (const std::pair<const RegisterId, NodeRefSet> &S : LiveIn)
+ for (auto R : S.second)
+ if (MDT.properlyDominates(getBlockWithRef(R.first), C))
+ LiveC.insert(RegisterRef(S.first, R.second));
+ }
+}
+
+void Liveness::emptify(RefMap &M) {
+ for (auto I = M.begin(), E = M.end(); I != E;)
+ I = I->second.empty() ? M.erase(I) : std::next(I);
+}
+
+} // namespace llvm::rdf
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp
new file mode 100644
index 000000000000..90520c4c3c71
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RDFRegisters.cpp
@@ -0,0 +1,444 @@
+//===- RDFRegisters.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/RDFRegisters.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <set>
+#include <utility>
+
+namespace llvm::rdf {
+
+PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri,
+ const MachineFunction &mf)
+ : TRI(tri) {
+ RegInfos.resize(TRI.getNumRegs());
+
+ BitVector BadRC(TRI.getNumRegs());
+ for (const TargetRegisterClass *RC : TRI.regclasses()) {
+ for (MCPhysReg R : *RC) {
+ RegInfo &RI = RegInfos[R];
+ if (RI.RegClass != nullptr && !BadRC[R]) {
+ if (RC->LaneMask != RI.RegClass->LaneMask) {
+ BadRC.set(R);
+ RI.RegClass = nullptr;
+ }
+ } else
+ RI.RegClass = RC;
+ }
+ }
+
+ UnitInfos.resize(TRI.getNumRegUnits());
+
+ for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) {
+ if (UnitInfos[U].Reg != 0)
+ continue;
+ MCRegUnitRootIterator R(U, &TRI);
+ assert(R.isValid());
+ RegisterId F = *R;
+ ++R;
+ if (R.isValid()) {
+ UnitInfos[U].Mask = LaneBitmask::getAll();
+ UnitInfos[U].Reg = F;
+ } else {
+ for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) {
+ std::pair<uint32_t, LaneBitmask> P = *I;
+ UnitInfo &UI = UnitInfos[P.first];
+ UI.Reg = F;
+ if (P.second.any()) {
+ UI.Mask = P.second;
+ } else {
+ if (const TargetRegisterClass *RC = RegInfos[F].RegClass)
+ UI.Mask = RC->LaneMask;
+ else
+ UI.Mask = LaneBitmask::getAll();
+ }
+ }
+ }
+ }
+
+ for (const uint32_t *RM : TRI.getRegMasks())
+ RegMasks.insert(RM);
+ for (const MachineBasicBlock &B : mf)
+ for (const MachineInstr &In : B)
+ for (const MachineOperand &Op : In.operands())
+ if (Op.isRegMask())
+ RegMasks.insert(Op.getRegMask());
+
+ MaskInfos.resize(RegMasks.size() + 1);
+ for (uint32_t M = 1, NM = RegMasks.size(); M <= NM; ++M) {
+ BitVector PU(TRI.getNumRegUnits());
+ const uint32_t *MB = RegMasks.get(M);
+ for (unsigned I = 1, E = TRI.getNumRegs(); I != E; ++I) {
+ if (!(MB[I / 32] & (1u << (I % 32))))
+ continue;
+ for (MCRegUnit Unit : TRI.regunits(MCRegister::from(I)))
+ PU.set(Unit);
+ }
+ MaskInfos[M].Units = PU.flip();
+ }
+
+ AliasInfos.resize(TRI.getNumRegUnits());
+ for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) {
+ BitVector AS(TRI.getNumRegs());
+ for (MCRegUnitRootIterator R(U, &TRI); R.isValid(); ++R)
+ for (MCPhysReg S : TRI.superregs_inclusive(*R))
+ AS.set(S);
+ AliasInfos[U].Regs = AS;
+ }
+}
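+
+// Note on MaskInfos above: PU collects the units of every register whose bit
+// is set in the mask, i.e. of every register the mask preserves. The stored
+// value PU.flip() is therefore the complement: the register units not covered
+// by any preserved register, so a register-mask reference effectively stands
+// for the set of units it may clobber.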
+
+bool PhysicalRegisterInfo::alias(RegisterRef RA, RegisterRef RB) const {
+ return !disjoint(getUnits(RA), getUnits(RB));
+}
+
+std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const {
+ // Do not include Reg in the alias set.
+ std::set<RegisterId> AS;
+ assert(!RegisterRef::isUnitId(Reg) && "No units allowed");
+ if (RegisterRef::isMaskId(Reg)) {
+ // XXX SLOW
+ const uint32_t *MB = getRegMaskBits(Reg);
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
+ if (MB[i / 32] & (1u << (i % 32)))
+ continue;
+ AS.insert(i);
+ }
+ return AS;
+ }
+
+ assert(RegisterRef::isRegId(Reg));
+ for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
+ AS.insert(*AI);
+
+ return AS;
+}
+
+std::set<RegisterId> PhysicalRegisterInfo::getUnits(RegisterRef RR) const {
+ std::set<RegisterId> Units;
+
+ if (RR.Reg == 0)
+ return Units; // Empty
+
+ if (RR.isReg()) {
+ if (RR.Mask.none())
+ return Units; // Empty
+ for (MCRegUnitMaskIterator UM(RR.idx(), &TRI); UM.isValid(); ++UM) {
+ auto [U, M] = *UM;
+ if (M.none() || (M & RR.Mask).any())
+ Units.insert(U);
+ }
+ return Units;
+ }
+
+ assert(RR.isMask());
+ unsigned NumRegs = TRI.getNumRegs();
+ const uint32_t *MB = getRegMaskBits(RR.idx());
+ for (unsigned I = 0, E = (NumRegs + 31) / 32; I != E; ++I) {
+ uint32_t C = ~MB[I]; // Clobbered regs
+ if (I == 0) // Reg 0 should be ignored
+ C &= maskLeadingOnes<unsigned>(31);
+ if (I + 1 == E && NumRegs % 32 != 0) // Last word may be partial
+ C &= maskTrailingOnes<unsigned>(NumRegs % 32);
+ if (C == 0)
+ continue;
+ while (C != 0) {
+ unsigned T = llvm::countr_zero(C);
+ unsigned CR = 32 * I + T; // Clobbered reg
+ for (MCRegUnit U : TRI.regunits(CR))
+ Units.insert(U);
+ C &= ~(1u << T);
+ }
+ }
+ return Units;
+}
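+
+// Worked example for the scan above, assuming (purely for illustration) a
+// target with NumRegs = 40: the mask occupies two 32-bit words. For word 0,
+// C is ANDed with maskLeadingOnes<unsigned>(31) = 0xFFFFFFFE, which drops the
+// invalid register 0; for the last word, C is ANDed with
+// maskTrailingOnes<unsigned>(40 % 32) = 0xFF, so bits beyond register 39 are
+// ignored. Each countr_zero(C) then yields a clobbered register 32 * I + T,
+// whose units are added to the result.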
+
+RegisterRef PhysicalRegisterInfo::mapTo(RegisterRef RR, unsigned R) const {
+ if (RR.Reg == R)
+ return RR;
+ if (unsigned Idx = TRI.getSubRegIndex(R, RR.Reg))
+ return RegisterRef(R, TRI.composeSubRegIndexLaneMask(Idx, RR.Mask));
+ if (unsigned Idx = TRI.getSubRegIndex(RR.Reg, R)) {
+ const RegInfo &RI = RegInfos[R];
+ LaneBitmask RCM =
+ RI.RegClass ? RI.RegClass->LaneMask : LaneBitmask::getAll();
+ LaneBitmask M = TRI.reverseComposeSubRegIndexLaneMask(Idx, RR.Mask);
+ return RegisterRef(R, M & RCM);
+ }
+ llvm_unreachable("Invalid arguments: unrelated registers?");
+}
+
+bool PhysicalRegisterInfo::equal_to(RegisterRef A, RegisterRef B) const {
+ if (!A.isReg() || !B.isReg()) {
+ // For non-regs, or comparing reg and non-reg, use only the Reg member.
+ return A.Reg == B.Reg;
+ }
+
+ if (A.Reg == B.Reg)
+ return A.Mask == B.Mask;
+
+ // Compare reg units lexicographically.
+ MCRegUnitMaskIterator AI(A.Reg, &getTRI());
+ MCRegUnitMaskIterator BI(B.Reg, &getTRI());
+ while (AI.isValid() && BI.isValid()) {
+ auto [AReg, AMask] = *AI;
+ auto [BReg, BMask] = *BI;
+
+ // Lane masks are "none" for units that don't correspond to subregs
+    // e.g. a single unit in a leaf register, or an aliased unit.
+ if (AMask.none())
+ AMask = LaneBitmask::getAll();
+ if (BMask.none())
+ BMask = LaneBitmask::getAll();
+
+ // If both iterators point to a unit contained in both A and B, then
+ // compare the units.
+ if ((AMask & A.Mask).any() && (BMask & B.Mask).any()) {
+ if (AReg != BReg)
+ return false;
+ // Units are equal, move on to the next ones.
+ ++AI;
+ ++BI;
+ continue;
+ }
+
+ if ((AMask & A.Mask).none())
+ ++AI;
+ if ((BMask & B.Mask).none())
+ ++BI;
+ }
+ // One or both have reached the end.
+ return static_cast<int>(AI.isValid()) == static_cast<int>(BI.isValid());
+}
+
+bool PhysicalRegisterInfo::less(RegisterRef A, RegisterRef B) const {
+ if (!A.isReg() || !B.isReg()) {
+ // For non-regs, or comparing reg and non-reg, use only the Reg member.
+ return A.Reg < B.Reg;
+ }
+
+ if (A.Reg == B.Reg)
+ return A.Mask < B.Mask;
+ if (A.Mask == B.Mask)
+ return A.Reg < B.Reg;
+
+ // Compare reg units lexicographically.
+ llvm::MCRegUnitMaskIterator AI(A.Reg, &getTRI());
+ llvm::MCRegUnitMaskIterator BI(B.Reg, &getTRI());
+ while (AI.isValid() && BI.isValid()) {
+ auto [AReg, AMask] = *AI;
+ auto [BReg, BMask] = *BI;
+
+ // Lane masks are "none" for units that don't correspond to subregs
+    // e.g. a single unit in a leaf register, or an aliased unit.
+ if (AMask.none())
+ AMask = LaneBitmask::getAll();
+ if (BMask.none())
+ BMask = LaneBitmask::getAll();
+
+ // If both iterators point to a unit contained in both A and B, then
+ // compare the units.
+ if ((AMask & A.Mask).any() && (BMask & B.Mask).any()) {
+ if (AReg != BReg)
+ return AReg < BReg;
+ // Units are equal, move on to the next ones.
+ ++AI;
+ ++BI;
+ continue;
+ }
+
+ if ((AMask & A.Mask).none())
+ ++AI;
+ if ((BMask & B.Mask).none())
+ ++BI;
+ }
+ // One or both have reached the end: assume invalid < valid.
+ return static_cast<int>(AI.isValid()) < static_cast<int>(BI.isValid());
+}
+
+void PhysicalRegisterInfo::print(raw_ostream &OS, RegisterRef A) const {
+ if (A.Reg == 0 || A.isReg()) {
+ if (0 < A.idx() && A.idx() < TRI.getNumRegs())
+ OS << TRI.getName(A.idx());
+ else
+ OS << printReg(A.idx(), &TRI);
+ OS << PrintLaneMaskShort(A.Mask);
+ } else if (A.isUnit()) {
+ OS << printRegUnit(A.idx(), &TRI);
+ } else {
+ assert(A.isMask());
+ // RegMask SS flag is preserved by idx().
+ unsigned Idx = Register::stackSlot2Index(A.idx());
+ const char *Fmt = Idx < 0x10000 ? "%04x" : "%08x";
+ OS << "M#" << format(Fmt, Idx);
+ }
+}
+
+void PhysicalRegisterInfo::print(raw_ostream &OS, const RegisterAggr &A) const {
+ OS << '{';
+ for (unsigned U : A.units())
+ OS << ' ' << printRegUnit(U, &TRI);
+ OS << " }";
+}
+
+bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
+ if (RR.isMask())
+ return Units.anyCommon(PRI.getMaskUnits(RR.Reg));
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t, LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ if (Units.test(P.first))
+ return true;
+ }
+ return false;
+}
+
+bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
+ if (RR.isMask()) {
+ BitVector T(PRI.getMaskUnits(RR.Reg));
+ return T.reset(Units).none();
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t, LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ if (!Units.test(P.first))
+ return false;
+ }
+ return true;
+}
+
+RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
+ if (RR.isMask()) {
+ Units |= PRI.getMaskUnits(RR.Reg);
+ return *this;
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t, LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ Units.set(P.first);
+ }
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
+ Units |= RG.Units;
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::intersect(RegisterRef RR) {
+ return intersect(RegisterAggr(PRI).insert(RR));
+}
+
+RegisterAggr &RegisterAggr::intersect(const RegisterAggr &RG) {
+ Units &= RG.Units;
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
+ return clear(RegisterAggr(PRI).insert(RR));
+}
+
+RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
+ Units.reset(RG.Units);
+ return *this;
+}
+
+RegisterRef RegisterAggr::intersectWith(RegisterRef RR) const {
+ RegisterAggr T(PRI);
+ T.insert(RR).intersect(*this);
+ if (T.empty())
+ return RegisterRef();
+ RegisterRef NR = T.makeRegRef();
+ assert(NR);
+ return NR;
+}
+
+RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
+ return RegisterAggr(PRI).insert(RR).clear(*this).makeRegRef();
+}
+
+RegisterRef RegisterAggr::makeRegRef() const {
+ int U = Units.find_first();
+ if (U < 0)
+ return RegisterRef();
+
+ // Find the set of all registers that are aliased to all the units
+ // in this aggregate.
+
+ // Get all the registers aliased to the first unit in the bit vector.
+ BitVector Regs = PRI.getUnitAliases(U);
+ U = Units.find_next(U);
+
+ // For each other unit, intersect it with the set of all registers
+  // aliased to that unit.
+ while (U >= 0) {
+ Regs &= PRI.getUnitAliases(U);
+ U = Units.find_next(U);
+ }
+
+ // If there is at least one register remaining, pick the first one,
+ // and consolidate the masks of all of its units contained in this
+ // aggregate.
+
+ int F = Regs.find_first();
+ if (F <= 0)
+ return RegisterRef();
+
+ LaneBitmask M;
+ for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) {
+ std::pair<uint32_t, LaneBitmask> P = *I;
+ if (Units.test(P.first))
+ M |= P.second.none() ? LaneBitmask::getAll() : P.second;
+ }
+ return RegisterRef(F, M);
+}
+
+RegisterAggr::ref_iterator::ref_iterator(const RegisterAggr &RG, bool End)
+ : Owner(&RG) {
+ for (int U = RG.Units.find_first(); U >= 0; U = RG.Units.find_next(U)) {
+ RegisterRef R = RG.PRI.getRefForUnit(U);
+ Masks[R.Reg] |= R.Mask;
+ }
+ Pos = End ? Masks.end() : Masks.begin();
+ Index = End ? Masks.size() : 0;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const RegisterAggr &A) {
+ A.getPRI().print(OS, A);
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const PrintLaneMaskShort &P) {
+ if (P.Mask.all())
+ return OS;
+ if (P.Mask.none())
+ return OS << ":*none*";
+
+ LaneBitmask::Type Val = P.Mask.getAsInteger();
+ if ((Val & 0xffff) == Val)
+ return OS << ':' << format("%04llX", Val);
+ if ((Val & 0xffffffff) == Val)
+ return OS << ':' << format("%08llX", Val);
+ return OS << ':' << PrintLaneMask(P.Mask);
+}
+
+} // namespace llvm::rdf
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
new file mode 100644
index 000000000000..75fbc8ba35b1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -0,0 +1,712 @@
+//===---- ReachingDefAnalysis.cpp - Reaching Def Analysis ---*- C++ -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/ReachingDefAnalysis.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "reaching-deps-analysis"
+
+char ReachingDefAnalysis::ID = 0;
+INITIALIZE_PASS(ReachingDefAnalysis, DEBUG_TYPE, "ReachingDefAnalysis", false,
+ true)
+
+static bool isValidReg(const MachineOperand &MO) {
+ return MO.isReg() && MO.getReg();
+}
+
+static bool isValidRegUse(const MachineOperand &MO) {
+ return isValidReg(MO) && MO.isUse();
+}
+
+static bool isValidRegUseOf(const MachineOperand &MO, MCRegister PhysReg,
+ const TargetRegisterInfo *TRI) {
+ if (!isValidRegUse(MO))
+ return false;
+ return TRI->regsOverlap(MO.getReg(), PhysReg);
+}
+
+static bool isValidRegDef(const MachineOperand &MO) {
+ return isValidReg(MO) && MO.isDef();
+}
+
+static bool isValidRegDefOf(const MachineOperand &MO, MCRegister PhysReg,
+ const TargetRegisterInfo *TRI) {
+ if (!isValidRegDef(MO))
+ return false;
+ return TRI->regsOverlap(MO.getReg(), PhysReg);
+}
+
+void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) {
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+ MBBReachingDefs[MBBNumber].resize(NumRegUnits);
+
+ // Reset instruction counter in each basic block.
+ CurInstr = 0;
+
+ // Set up LiveRegs to represent registers entering MBB.
+ // Default values are 'nothing happened a long time ago'.
+ if (LiveRegs.empty())
+ LiveRegs.assign(NumRegUnits, ReachingDefDefaultVal);
+
+ // This is the entry block.
+ if (MBB->pred_empty()) {
+ for (const auto &LI : MBB->liveins()) {
+ for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) {
+ // Treat function live-ins as if they were defined just before the first
+ // instruction. Usually, function arguments are set up immediately
+ // before the call.
+ if (LiveRegs[Unit] != -1) {
+ LiveRegs[Unit] = -1;
+ MBBReachingDefs[MBBNumber][Unit].push_back(-1);
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ": entry\n");
+ return;
+ }
+
+ // Try to coalesce live-out registers from predecessors.
+ for (MachineBasicBlock *pred : MBB->predecessors()) {
+ assert(unsigned(pred->getNumber()) < MBBOutRegsInfos.size() &&
+ "Should have pre-allocated MBBInfos for all MBBs");
+ const LiveRegsDefInfo &Incoming = MBBOutRegsInfos[pred->getNumber()];
+    // Incoming is empty if this is a backedge from a BB
+    // we haven't processed yet.
+ if (Incoming.empty())
+ continue;
+
+ // Find the most recent reaching definition from a predecessor.
+ for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit)
+ LiveRegs[Unit] = std::max(LiveRegs[Unit], Incoming[Unit]);
+ }
+
+ // Insert the most recent reaching definition we found.
+ for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit)
+ if (LiveRegs[Unit] != ReachingDefDefaultVal)
+ MBBReachingDefs[MBBNumber][Unit].push_back(LiveRegs[Unit]);
+}
+
+void ReachingDefAnalysis::leaveBasicBlock(MachineBasicBlock *MBB) {
+ assert(!LiveRegs.empty() && "Must enter basic block first.");
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBOutRegsInfos.size() &&
+ "Unexpected basic block number.");
+ // Save register clearances at end of MBB - used by enterBasicBlock().
+ MBBOutRegsInfos[MBBNumber] = LiveRegs;
+
+ // While processing the basic block, we kept `Def` relative to the start
+ // of the basic block for convenience. However, future use of this information
+ // only cares about the clearance from the end of the block, so adjust
+ // everything to be relative to the end of the basic block.
+ for (int &OutLiveReg : MBBOutRegsInfos[MBBNumber])
+ if (OutLiveReg != ReachingDefDefaultVal)
+ OutLiveReg -= CurInstr;
+ LiveRegs.clear();
+}
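The numbers stored here are block-relative: while a block is being processed, each non-debug instruction gets an index counted from 0 at the top (with -1 reserved for live-in definitions), and on exit the per-unit values are rebased so that successors see distances from the end of the predecessor. A small worked example under those assumptions (hypothetical block, not taken from any target):

    // A block with three non-debug instructions, so CurInstr == 3 on exit.
    //   -1 : a live-in defines the unit before the first instruction
    //    1 : the second instruction (index 1) redefines the unit
    // leaveBasicBlock() rebases the live-out value to the block end:
    //    out = 1 - CurInstr = 1 - 3 = -2
    // i.e. the reaching def sits two instructions above the bottom of the
    // block, which is exactly what enterBasicBlock() merges into successors.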
+
+void ReachingDefAnalysis::processDefs(MachineInstr *MI) {
+ assert(!MI->isDebugInstr() && "Won't process debug instructions");
+
+ unsigned MBBNumber = MI->getParent()->getNumber();
+ assert(MBBNumber < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+
+ for (auto &MO : MI->operands()) {
+ if (!isValidRegDef(MO))
+ continue;
+ for (MCRegUnit Unit : TRI->regunits(MO.getReg().asMCReg())) {
+ // This instruction explicitly defines the current reg unit.
+ LLVM_DEBUG(dbgs() << printRegUnit(Unit, TRI) << ":\t" << CurInstr << '\t'
+ << *MI);
+
+ // How many instructions since this reg unit was last written?
+ if (LiveRegs[Unit] != CurInstr) {
+ LiveRegs[Unit] = CurInstr;
+ MBBReachingDefs[MBBNumber][Unit].push_back(CurInstr);
+ }
+ }
+ }
+ InstIds[MI] = CurInstr;
+ ++CurInstr;
+}
+
+void ReachingDefAnalysis::reprocessBasicBlock(MachineBasicBlock *MBB) {
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+
+ // Count number of non-debug instructions for end of block adjustment.
+ auto NonDbgInsts =
+ instructionsWithoutDebug(MBB->instr_begin(), MBB->instr_end());
+ int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end());
+
+ // When reprocessing a block, the only thing we need to do is check whether
+ // there is now a more recent incoming reaching definition from a predecessor.
+ for (MachineBasicBlock *pred : MBB->predecessors()) {
+ assert(unsigned(pred->getNumber()) < MBBOutRegsInfos.size() &&
+ "Should have pre-allocated MBBInfos for all MBBs");
+ const LiveRegsDefInfo &Incoming = MBBOutRegsInfos[pred->getNumber()];
+ // Incoming may be empty for dead predecessors.
+ if (Incoming.empty())
+ continue;
+
+ for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit) {
+ int Def = Incoming[Unit];
+ if (Def == ReachingDefDefaultVal)
+ continue;
+
+ auto Start = MBBReachingDefs[MBBNumber][Unit].begin();
+ if (Start != MBBReachingDefs[MBBNumber][Unit].end() && *Start < 0) {
+ if (*Start >= Def)
+ continue;
+
+ // Update existing reaching def from predecessor to a more recent one.
+ *Start = Def;
+ } else {
+ // Insert new reaching def from predecessor.
+ MBBReachingDefs[MBBNumber][Unit].insert(Start, Def);
+ }
+
+      // Update reaching def at end of BB. Keep in mind that these are
+ // adjusted relative to the end of the basic block.
+ if (MBBOutRegsInfos[MBBNumber][Unit] < Def - NumInsts)
+ MBBOutRegsInfos[MBBNumber][Unit] = Def - NumInsts;
+ }
+ }
+}
+
+void ReachingDefAnalysis::processBasicBlock(
+ const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+ MachineBasicBlock *MBB = TraversedMBB.MBB;
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB)
+ << (!TraversedMBB.IsDone ? ": incomplete\n"
+ : ": all preds known\n"));
+
+ if (!TraversedMBB.PrimaryPass) {
+ // Reprocess MBB that is part of a loop.
+ reprocessBasicBlock(MBB);
+ return;
+ }
+
+ enterBasicBlock(MBB);
+ for (MachineInstr &MI :
+ instructionsWithoutDebug(MBB->instr_begin(), MBB->instr_end()))
+ processDefs(&MI);
+ leaveBasicBlock(MBB);
+}
+
+bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ TRI = MF->getSubtarget().getRegisterInfo();
+ LLVM_DEBUG(dbgs() << "********** REACHING DEFINITION ANALYSIS **********\n");
+ init();
+ traverse();
+ return false;
+}
+
+void ReachingDefAnalysis::releaseMemory() {
+ // Clear the internal vectors.
+ MBBOutRegsInfos.clear();
+ MBBReachingDefs.clear();
+ InstIds.clear();
+ LiveRegs.clear();
+}
+
+void ReachingDefAnalysis::reset() {
+ releaseMemory();
+ init();
+ traverse();
+}
+
+void ReachingDefAnalysis::init() {
+ NumRegUnits = TRI->getNumRegUnits();
+ MBBReachingDefs.resize(MF->getNumBlockIDs());
+ // Initialize the MBBOutRegsInfos
+ MBBOutRegsInfos.resize(MF->getNumBlockIDs());
+ LoopTraversal Traversal;
+ TraversedMBBOrder = Traversal.traverse(*MF);
+}
+
+void ReachingDefAnalysis::traverse() {
+ // Traverse the basic blocks.
+ for (LoopTraversal::TraversedMBBInfo TraversedMBB : TraversedMBBOrder)
+ processBasicBlock(TraversedMBB);
+#ifndef NDEBUG
+ // Make sure reaching defs are sorted and unique.
+ for (MBBDefsInfo &MBBDefs : MBBReachingDefs) {
+ for (MBBRegUnitDefs &RegUnitDefs : MBBDefs) {
+ int LastDef = ReachingDefDefaultVal;
+ for (int Def : RegUnitDefs) {
+ assert(Def > LastDef && "Defs must be sorted and unique");
+ LastDef = Def;
+ }
+ }
+ }
+#endif
+}
+
+int ReachingDefAnalysis::getReachingDef(MachineInstr *MI,
+ MCRegister PhysReg) const {
+  assert(InstIds.count(MI) && "Unexpected machine instruction.");
+ int InstId = InstIds.lookup(MI);
+ int DefRes = ReachingDefDefaultVal;
+ unsigned MBBNumber = MI->getParent()->getNumber();
+ assert(MBBNumber < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+ int LatestDef = ReachingDefDefaultVal;
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ for (int Def : MBBReachingDefs[MBBNumber][Unit]) {
+ if (Def >= InstId)
+ break;
+ DefRes = Def;
+ }
+ LatestDef = std::max(LatestDef, DefRes);
+ }
+ return LatestDef;
+}
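Typical queries against this table, assuming RDA is an already-populated ReachingDefAnalysis and MI / PhysReg are a hypothetical non-debug instruction and physical register from the analyzed function (a sketch, not part of this file):

    // Negative means the register is not defined by an instruction earlier in
    // MI's block (a live-in or the "long time ago" default); non-negative is
    // the block-relative index of the closest preceding def.
    int DefId = RDA.getReachingDef(MI, PhysReg);
    if (DefId >= 0) {
      // Distance in non-debug instructions from that def to MI.
      int Clearance = RDA.getClearance(MI, PhysReg);
      // The defining instruction itself, when it is in the same block.
      MachineInstr *Def = RDA.getReachingLocalMIDef(MI, PhysReg);
      (void)Clearance;
      (void)Def;
    }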
+
+MachineInstr *
+ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI,
+ MCRegister PhysReg) const {
+ return hasLocalDefBefore(MI, PhysReg)
+ ? getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg))
+ : nullptr;
+}
+
+bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B,
+ MCRegister PhysReg) const {
+ MachineBasicBlock *ParentA = A->getParent();
+ MachineBasicBlock *ParentB = B->getParent();
+ if (ParentA != ParentB)
+ return false;
+
+ return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg);
+}
+
+MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB,
+ int InstId) const {
+ assert(static_cast<size_t>(MBB->getNumber()) < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+ assert(InstId < static_cast<int>(MBB->size()) &&
+ "Unexpected instruction id.");
+
+ if (InstId < 0)
+ return nullptr;
+
+ for (auto &MI : *MBB) {
+ auto F = InstIds.find(&MI);
+ if (F != InstIds.end() && F->second == InstId)
+ return &MI;
+ }
+
+ return nullptr;
+}
+
+int ReachingDefAnalysis::getClearance(MachineInstr *MI,
+ MCRegister PhysReg) const {
+  assert(InstIds.count(MI) && "Unexpected machine instruction.");
+ return InstIds.lookup(MI) - getReachingDef(MI, PhysReg);
+}
+
+bool ReachingDefAnalysis::hasLocalDefBefore(MachineInstr *MI,
+ MCRegister PhysReg) const {
+ return getReachingDef(MI, PhysReg) >= 0;
+}
+
+void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def,
+ MCRegister PhysReg,
+ InstSet &Uses) const {
+ MachineBasicBlock *MBB = Def->getParent();
+ MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def);
+ while (++MI != MBB->end()) {
+ if (MI->isDebugInstr())
+ continue;
+
+    // If/when we find a new reaching def, we know that there are no more uses
+ // of 'Def'.
+ if (getReachingLocalMIDef(&*MI, PhysReg) != Def)
+ return;
+
+ for (auto &MO : MI->operands()) {
+ if (!isValidRegUseOf(MO, PhysReg, TRI))
+ continue;
+
+ Uses.insert(&*MI);
+ if (MO.isKill())
+ return;
+ }
+ }
+}
+
+bool ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB,
+ MCRegister PhysReg,
+ InstSet &Uses) const {
+ for (MachineInstr &MI :
+ instructionsWithoutDebug(MBB->instr_begin(), MBB->instr_end())) {
+ for (auto &MO : MI.operands()) {
+ if (!isValidRegUseOf(MO, PhysReg, TRI))
+ continue;
+ if (getReachingDef(&MI, PhysReg) >= 0)
+ return false;
+ Uses.insert(&MI);
+ }
+ }
+ auto Last = MBB->getLastNonDebugInstr();
+ if (Last == MBB->end())
+ return true;
+ return isReachingDefLiveOut(&*Last, PhysReg);
+}
+
+void ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, MCRegister PhysReg,
+ InstSet &Uses) const {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // Collect the uses that each def touches within the block.
+ getReachingLocalUses(MI, PhysReg, Uses);
+
+ // Handle live-out values.
+ if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), PhysReg)) {
+ if (LiveOut != MI)
+ return;
+
+ SmallVector<MachineBasicBlock *, 4> ToVisit(MBB->successors());
+    SmallPtrSet<MachineBasicBlock*, 4> Visited;
+ while (!ToVisit.empty()) {
+ MachineBasicBlock *MBB = ToVisit.pop_back_val();
+ if (Visited.count(MBB) || !MBB->isLiveIn(PhysReg))
+ continue;
+ if (getLiveInUses(MBB, PhysReg, Uses))
+ llvm::append_range(ToVisit, MBB->successors());
+ Visited.insert(MBB);
+ }
+ }
+}
+
+void ReachingDefAnalysis::getGlobalReachingDefs(MachineInstr *MI,
+ MCRegister PhysReg,
+ InstSet &Defs) const {
+ if (auto *Def = getUniqueReachingMIDef(MI, PhysReg)) {
+ Defs.insert(Def);
+ return;
+ }
+
+ for (auto *MBB : MI->getParent()->predecessors())
+ getLiveOuts(MBB, PhysReg, Defs);
+}
+
+void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
+ MCRegister PhysReg, InstSet &Defs) const {
+ SmallPtrSet<MachineBasicBlock*, 2> VisitedBBs;
+ getLiveOuts(MBB, PhysReg, Defs, VisitedBBs);
+}
+
+void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB,
+ MCRegister PhysReg, InstSet &Defs,
+ BlockSet &VisitedBBs) const {
+ if (VisitedBBs.count(MBB))
+ return;
+
+ VisitedBBs.insert(MBB);
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(*MBB);
+ if (LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
+ return;
+
+ if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
+ Defs.insert(Def);
+ else
+ for (auto *Pred : MBB->predecessors())
+ getLiveOuts(Pred, PhysReg, Defs, VisitedBBs);
+}
+
+MachineInstr *
+ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI,
+ MCRegister PhysReg) const {
+ // If there's a local def before MI, return it.
+ MachineInstr *LocalDef = getReachingLocalMIDef(MI, PhysReg);
+ if (LocalDef && InstIds.lookup(LocalDef) < InstIds.lookup(MI))
+ return LocalDef;
+
+ SmallPtrSet<MachineInstr*, 2> Incoming;
+ MachineBasicBlock *Parent = MI->getParent();
+ for (auto *Pred : Parent->predecessors())
+ getLiveOuts(Pred, PhysReg, Incoming);
+
+ // Check that we have a single incoming value and that it does not
+ // come from the same block as MI - since it would mean that the def
+ // is executed after MI.
+ if (Incoming.size() == 1 && (*Incoming.begin())->getParent() != Parent)
+ return *Incoming.begin();
+ return nullptr;
+}
+
+MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI,
+ unsigned Idx) const {
+ assert(MI->getOperand(Idx).isReg() && "Expected register operand");
+ return getUniqueReachingMIDef(MI, MI->getOperand(Idx).getReg());
+}
+
+MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI,
+ MachineOperand &MO) const {
+ assert(MO.isReg() && "Expected register operand");
+ return getUniqueReachingMIDef(MI, MO.getReg());
+}
+
+bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI,
+ MCRegister PhysReg) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(*MBB);
+
+ // Yes if the register is live out of the basic block.
+ if (!LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
+ return true;
+
+ // Walk backwards through the block to see if the register is live at some
+ // point.
+ for (MachineInstr &Last :
+ instructionsWithoutDebug(MBB->instr_rbegin(), MBB->instr_rend())) {
+ LiveRegs.stepBackward(Last);
+ if (!LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
+ return InstIds.lookup(&Last) > InstIds.lookup(MI);
+ }
+ return false;
+}
+
+bool ReachingDefAnalysis::isRegDefinedAfter(MachineInstr *MI,
+ MCRegister PhysReg) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ auto Last = MBB->getLastNonDebugInstr();
+ if (Last != MBB->end() &&
+ getReachingDef(MI, PhysReg) != getReachingDef(&*Last, PhysReg))
+ return true;
+
+ if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
+ return Def == getReachingLocalMIDef(MI, PhysReg);
+
+ return false;
+}
+
+bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI,
+ MCRegister PhysReg) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(*MBB);
+ if (LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
+ return false;
+
+ auto Last = MBB->getLastNonDebugInstr();
+ int Def = getReachingDef(MI, PhysReg);
+ if (Last != MBB->end() && getReachingDef(&*Last, PhysReg) != Def)
+ return false;
+
+ // Finally check that the last instruction doesn't redefine the register.
+ for (auto &MO : Last->operands())
+ if (isValidRegDefOf(MO, PhysReg, TRI))
+ return false;
+
+ return true;
+}
+
+MachineInstr *
+ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
+ MCRegister PhysReg) const {
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(*MBB);
+ if (LiveRegs.available(MBB->getParent()->getRegInfo(), PhysReg))
+ return nullptr;
+
+ auto Last = MBB->getLastNonDebugInstr();
+ if (Last == MBB->end())
+ return nullptr;
+
+ int Def = getReachingDef(&*Last, PhysReg);
+ for (auto &MO : Last->operands())
+ if (isValidRegDefOf(MO, PhysReg, TRI))
+ return &*Last;
+
+ return Def < 0 ? nullptr : getInstFromId(MBB, Def);
+}
+
+static bool mayHaveSideEffects(MachineInstr &MI) {
+ return MI.mayLoadOrStore() || MI.mayRaiseFPException() ||
+ MI.hasUnmodeledSideEffects() || MI.isTerminator() ||
+ MI.isCall() || MI.isBarrier() || MI.isBranch() || MI.isReturn();
+}
+
+// Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
+// not define a register that is used by any instruction after, and including,
+// 'To'. These instructions also must not redefine any of From's operands.
+template<typename Iterator>
+bool ReachingDefAnalysis::isSafeToMove(MachineInstr *From,
+ MachineInstr *To) const {
+ if (From->getParent() != To->getParent() || From == To)
+ return false;
+
+ SmallSet<int, 2> Defs;
+ // First check that From would compute the same value if moved.
+ for (auto &MO : From->operands()) {
+ if (!isValidReg(MO))
+ continue;
+ if (MO.isDef())
+ Defs.insert(MO.getReg());
+ else if (!hasSameReachingDef(From, To, MO.getReg()))
+ return false;
+ }
+
+ // Now walk checking that the rest of the instructions will compute the same
+ // value and that we're not overwriting anything. Don't move the instruction
+ // past any memory, control-flow or other ambiguous instructions.
+ for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
+ if (mayHaveSideEffects(*I))
+ return false;
+ for (auto &MO : I->operands())
+ if (MO.isReg() && MO.getReg() && Defs.count(MO.getReg()))
+ return false;
+ }
+ return true;
+}
+
+bool ReachingDefAnalysis::isSafeToMoveForwards(MachineInstr *From,
+ MachineInstr *To) const {
+ using Iterator = MachineBasicBlock::iterator;
+ // Walk forwards until we find the instruction.
+ for (auto I = Iterator(From), E = From->getParent()->end(); I != E; ++I)
+ if (&*I == To)
+ return isSafeToMove<Iterator>(From, To);
+ return false;
+}
+
+bool ReachingDefAnalysis::isSafeToMoveBackwards(MachineInstr *From,
+ MachineInstr *To) const {
+ using Iterator = MachineBasicBlock::reverse_iterator;
+ // Walk backwards until we find the instruction.
+ for (auto I = Iterator(From), E = From->getParent()->rend(); I != E; ++I)
+ if (&*I == To)
+ return isSafeToMove<Iterator>(From, To);
+ return false;
+}
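A caller would typically pair these checks with an actual code motion; a hedged sketch, where RDA is a populated analysis and From/To are hypothetical instructions in the same basic block (moving code invalidates the recorded instruction ids, so the analysis needs a reset() afterwards):

    MachineBasicBlock *MBB = From->getParent();
    if (RDA.isSafeToMoveForwards(From, To)) {
      // Sink 'From' to just before 'To'; nothing in between reads a register
      // 'From' defines, clobbers one of its inputs, or has side effects.
      MBB->splice(MachineBasicBlock::iterator(To), MBB,
                  MachineBasicBlock::iterator(From));
      RDA.reset();   // instruction numbering is stale after the move
    }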
+
+bool ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI,
+ InstSet &ToRemove) const {
+ SmallPtrSet<MachineInstr*, 1> Ignore;
+ SmallPtrSet<MachineInstr*, 2> Visited;
+ return isSafeToRemove(MI, Visited, ToRemove, Ignore);
+}
+
+bool
+ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &ToRemove,
+ InstSet &Ignore) const {
+ SmallPtrSet<MachineInstr*, 2> Visited;
+ return isSafeToRemove(MI, Visited, ToRemove, Ignore);
+}
+
+bool
+ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &Visited,
+ InstSet &ToRemove, InstSet &Ignore) const {
+ if (Visited.count(MI) || Ignore.count(MI))
+ return true;
+ else if (mayHaveSideEffects(*MI)) {
+ // Unless told to ignore the instruction, don't remove anything which has
+ // side effects.
+ return false;
+ }
+
+ Visited.insert(MI);
+ for (auto &MO : MI->operands()) {
+ if (!isValidRegDef(MO))
+ continue;
+
+ SmallPtrSet<MachineInstr*, 4> Uses;
+ getGlobalUses(MI, MO.getReg(), Uses);
+
+ for (auto *I : Uses) {
+ if (Ignore.count(I) || ToRemove.count(I))
+ continue;
+ if (!isSafeToRemove(I, Visited, ToRemove, Ignore))
+ return false;
+ }
+ }
+ ToRemove.insert(MI);
+ return true;
+}
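In other words, an instruction is removable when every user of its defs is itself removable (or explicitly ignored), and the whole dependent set comes back in ToRemove. A hedged usage sketch, assuming RDA is a populated analysis and MI is a hypothetical instruction whose results are no longer wanted:

    SmallPtrSet<MachineInstr *, 4> ToRemove;
    if (RDA.isSafeToRemove(MI, ToRemove)) {
      // ToRemove now holds MI plus every instruction that only exists to
      // consume its results; they can all be erased together.
      for (MachineInstr *Dead : ToRemove)
        Dead->eraseFromParent();
      RDA.reset();   // the def/use numbering is stale after deletion
    }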
+
+void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI,
+ InstSet &Dead) const {
+ Dead.insert(MI);
+ auto IsDead = [this, &Dead](MachineInstr *Def, MCRegister PhysReg) {
+ if (mayHaveSideEffects(*Def))
+ return false;
+
+ unsigned LiveDefs = 0;
+ for (auto &MO : Def->operands()) {
+ if (!isValidRegDef(MO))
+ continue;
+ if (!MO.isDead())
+ ++LiveDefs;
+ }
+
+ if (LiveDefs > 1)
+ return false;
+
+ SmallPtrSet<MachineInstr*, 4> Uses;
+ getGlobalUses(Def, PhysReg, Uses);
+ return llvm::set_is_subset(Uses, Dead);
+ };
+
+ for (auto &MO : MI->operands()) {
+ if (!isValidRegUse(MO))
+ continue;
+ if (MachineInstr *Def = getMIOperand(MI, MO))
+ if (IsDead(Def, MO.getReg()))
+ collectKilledOperands(Def, Dead);
+ }
+}
+
+bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI,
+ MCRegister PhysReg) const {
+ SmallPtrSet<MachineInstr*, 1> Ignore;
+ return isSafeToDefRegAt(MI, PhysReg, Ignore);
+}
+
+bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, MCRegister PhysReg,
+ InstSet &Ignore) const {
+ // Check for any uses of the register after MI.
+ if (isRegUsedAfter(MI, PhysReg)) {
+ if (auto *Def = getReachingLocalMIDef(MI, PhysReg)) {
+ SmallPtrSet<MachineInstr*, 2> Uses;
+ getGlobalUses(Def, PhysReg, Uses);
+ if (!llvm::set_is_subset(Uses, Ignore))
+ return false;
+ } else
+ return false;
+ }
+
+ MachineBasicBlock *MBB = MI->getParent();
+ // Check for any defs after MI.
+ if (isRegDefinedAfter(MI, PhysReg)) {
+ auto I = MachineBasicBlock::iterator(MI);
+ for (auto E = MBB->end(); I != E; ++I) {
+ if (Ignore.count(&*I))
+ continue;
+ for (auto &MO : I->operands())
+ if (isValidRegDefOf(MO, PhysReg, TRI))
+ return false;
+ }
+ }
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBase.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBase.cpp
new file mode 100644
index 000000000000..900f0e9079d6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -0,0 +1,192 @@
+//===- RegAllocBase.cpp - Register Allocator Base Class -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RegAllocBase class which provides common functionality
+// for LiveIntervalUnion-based register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RegAllocBase.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumNewQueued, "Number of new live ranges queued");
+
+// Temporary verification option until we can put verification inside
+// MachineVerifier.
+static cl::opt<bool, true>
+ VerifyRegAlloc("verify-regalloc", cl::location(RegAllocBase::VerifyEnabled),
+ cl::Hidden, cl::desc("Verify during register allocation"));
+
+const char RegAllocBase::TimerGroupName[] = "regalloc";
+const char RegAllocBase::TimerGroupDescription[] = "Register Allocation";
+bool RegAllocBase::VerifyEnabled = false;
+
+//===----------------------------------------------------------------------===//
+// RegAllocBase Implementation
+//===----------------------------------------------------------------------===//
+
+// Pin the vtable to this file.
+void RegAllocBase::anchor() {}
+
+void RegAllocBase::init(VirtRegMap &vrm, LiveIntervals &lis,
+ LiveRegMatrix &mat) {
+ TRI = &vrm.getTargetRegInfo();
+ MRI = &vrm.getRegInfo();
+ VRM = &vrm;
+ LIS = &lis;
+ Matrix = &mat;
+ MRI->freezeReservedRegs(vrm.getMachineFunction());
+ RegClassInfo.runOnMachineFunction(vrm.getMachineFunction());
+}
+
+// Visit all the live registers. If they are already assigned to a physical
+// register, unify them with the corresponding LiveIntervalUnion, otherwise push
+// them on the priority queue for later assignment.
+void RegAllocBase::seedLiveRegs() {
+ NamedRegionTimer T("seed", "Seed Live Regs", TimerGroupName,
+ TimerGroupDescription, TimePassesIsEnabled);
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ enqueue(&LIS->getInterval(Reg));
+ }
+}
+
+// Top-level driver to manage the queue of unassigned VirtRegs and call the
+// selectOrSplit implementation.
+void RegAllocBase::allocatePhysRegs() {
+ seedLiveRegs();
+
+ // Continue assigning vregs one at a time to available physical registers.
+ while (const LiveInterval *VirtReg = dequeue()) {
+ assert(!VRM->hasPhys(VirtReg->reg()) && "Register already assigned");
+
+ // Unused registers can appear when the spiller coalesces snippets.
+ if (MRI->reg_nodbg_empty(VirtReg->reg())) {
+ LLVM_DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
+ aboutToRemoveInterval(*VirtReg);
+ LIS->removeInterval(VirtReg->reg());
+ continue;
+ }
+
+ // Invalidate all interference queries, live ranges could have changed.
+ Matrix->invalidateVirtRegs();
+
+ // selectOrSplit requests the allocator to return an available physical
+ // register if possible and populate a list of new live intervals that
+ // result from splitting.
+ LLVM_DEBUG(dbgs() << "\nselectOrSplit "
+ << TRI->getRegClassName(MRI->getRegClass(VirtReg->reg()))
+ << ':' << *VirtReg << " w=" << VirtReg->weight() << '\n');
+
+ using VirtRegVec = SmallVector<Register, 4>;
+
+ VirtRegVec SplitVRegs;
+ MCRegister AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
+
+ if (AvailablePhysReg == ~0u) {
+ // selectOrSplit failed to find a register!
+ // Probably caused by an inline asm.
+ MachineInstr *MI = nullptr;
+ for (MachineRegisterInfo::reg_instr_iterator
+ I = MRI->reg_instr_begin(VirtReg->reg()),
+ E = MRI->reg_instr_end();
+ I != E;) {
+ MI = &*(I++);
+ if (MI->isInlineAsm())
+ break;
+ }
+
+ const TargetRegisterClass *RC = MRI->getRegClass(VirtReg->reg());
+ ArrayRef<MCPhysReg> AllocOrder = RegClassInfo.getOrder(RC);
+ if (AllocOrder.empty())
+ report_fatal_error("no registers from class available to allocate");
+ else if (MI && MI->isInlineAsm()) {
+ MI->emitError("inline assembly requires more registers than available");
+ } else if (MI) {
+ LLVMContext &Context =
+ MI->getParent()->getParent()->getMMI().getModule()->getContext();
+ Context.emitError("ran out of registers during register allocation");
+ } else {
+ report_fatal_error("ran out of registers during register allocation");
+ }
+
+ // Keep going after reporting the error.
+ VRM->assignVirt2Phys(VirtReg->reg(), AllocOrder.front());
+ } else if (AvailablePhysReg)
+ Matrix->assign(*VirtReg, AvailablePhysReg);
+
+ for (Register Reg : SplitVRegs) {
+ assert(LIS->hasInterval(Reg));
+
+ LiveInterval *SplitVirtReg = &LIS->getInterval(Reg);
+ assert(!VRM->hasPhys(SplitVirtReg->reg()) && "Register already assigned");
+ if (MRI->reg_nodbg_empty(SplitVirtReg->reg())) {
+ assert(SplitVirtReg->empty() && "Non-empty but used interval");
+ LLVM_DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n');
+ aboutToRemoveInterval(*SplitVirtReg);
+ LIS->removeInterval(SplitVirtReg->reg());
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n");
+ assert(SplitVirtReg->reg().isVirtual() &&
+ "expect split value in virtual register");
+ enqueue(SplitVirtReg);
+ ++NumNewQueued;
+ }
+ }
+}
+
+void RegAllocBase::postOptimization() {
+ spiller().postOptimization();
+ for (auto *DeadInst : DeadRemats) {
+ LIS->RemoveMachineInstrFromMaps(*DeadInst);
+ DeadInst->eraseFromParent();
+ }
+ DeadRemats.clear();
+}
+
+void RegAllocBase::enqueue(const LiveInterval *LI) {
+ const Register Reg = LI->reg();
+
+ assert(Reg.isVirtual() && "Can only enqueue virtual registers");
+
+ if (VRM->hasPhys(Reg))
+ return;
+
+ const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
+ if (ShouldAllocateClass(*TRI, RC)) {
+ LLVM_DEBUG(dbgs() << "Enqueuing " << printReg(Reg, TRI) << '\n');
+ enqueueImpl(LI);
+ } else {
+ LLVM_DEBUG(dbgs() << "Not enqueueing " << printReg(Reg, TRI)
+ << " in skipped register class\n");
+ }
+}
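ShouldAllocateClass lets a pass restrict the allocator to a subset of register classes and leave the rest for a later run. Its exact signature lives in RegAllocCommon.h; assuming the two-argument form used in the call above, a filter could be built like this hedged sketch (the 16-byte policy is purely illustrative):

    // Allocate only register classes whose spill slot is at least 16 bytes;
    // everything else is skipped and must be handled by another allocator.
    RegClassFilterFunc OnlyWide = [](const TargetRegisterInfo &TRI,
                                     const TargetRegisterClass &RC) {
      return TRI.getSpillSize(RC) >= 16;
    };
    FunctionPass *RA = createBasicRegisterAllocator(OnlyWide);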
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBase.h b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBase.h
new file mode 100644
index 000000000000..a8bf305a50c9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBase.h
@@ -0,0 +1,131 @@
+//===- RegAllocBase.h - basic regalloc interface and driver -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RegAllocBase class, which is the skeleton of a basic
+// register allocation algorithm and interface for extending it. It provides the
+// building blocks on which to construct other experimental allocators and test
+// the validity of two principles:
+//
+// - If virtual and physical register liveness is modeled using intervals, then
+// on-the-fly interference checking is cheap. Furthermore, interferences can be
+// lazily cached and reused.
+//
+// - Register allocation complexity and generated code performance are
+// determined by the effectiveness of live range splitting rather than optimal
+// coloring.
+//
+// Following the first principle, interference checking revolves around
+// the LiveIntervalUnion data structure.
+//
+// To fulfill the second principle, the basic allocator provides a driver for
+// incremental splitting. It essentially punts on the problem of register
+// coloring, instead driving the assignment of virtual to physical registers by
+// the cost of splitting. The basic allocator allows for heuristic reassignment
+// of registers, if a more sophisticated allocator chooses to do that.
+//
+// This framework provides a way to engineer the compile time vs. code
+// quality trade-off without relying on a particular theoretical solver.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_REGALLOCBASE_H
+#define LLVM_LIB_CODEGEN_REGALLOCBASE_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/RegAllocCommon.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+namespace llvm {
+
+class LiveInterval;
+class LiveIntervals;
+class LiveRegMatrix;
+class MachineInstr;
+class MachineRegisterInfo;
+template<typename T> class SmallVectorImpl;
+class Spiller;
+class TargetRegisterInfo;
+class VirtRegMap;
+
+/// RegAllocBase provides the register allocation driver and interface that can
+/// be extended to add interesting heuristics.
+///
+/// Register allocators must override the selectOrSplit() method to implement
+/// live range splitting. They must also override enqueue/dequeue to provide an
+/// assignment order.
+class RegAllocBase {
+ virtual void anchor();
+
+protected:
+ const TargetRegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ VirtRegMap *VRM = nullptr;
+ LiveIntervals *LIS = nullptr;
+ LiveRegMatrix *Matrix = nullptr;
+ RegisterClassInfo RegClassInfo;
+ const RegClassFilterFunc ShouldAllocateClass;
+
+  /// An instruction which is a def of an original reg, and whose defs are
+  /// already all dead after remat, is saved in DeadRemats. The deletion of
+  /// such an inst is postponed until all the allocations are done, so its
+  /// remat expr is always available for the remat of all the siblings of the
+  /// original reg.
+ SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
+ RegAllocBase(const RegClassFilterFunc F = allocateAllRegClasses) :
+ ShouldAllocateClass(F) {}
+
+ virtual ~RegAllocBase() = default;
+
+ // A RegAlloc pass should call this before allocatePhysRegs.
+ void init(VirtRegMap &vrm, LiveIntervals &lis, LiveRegMatrix &mat);
+
+  // The top-level driver. The output is a VirtRegMap that is updated with
+  // physical register assignments.
+ void allocatePhysRegs();
+
+  // Run spiller post-optimization and remove dead defs left behind by
+  // rematerialization.
+ virtual void postOptimization();
+
+ // Get a temporary reference to a Spiller instance.
+ virtual Spiller &spiller() = 0;
+
+ /// enqueue - Add VirtReg to the priority queue of unassigned registers.
+ virtual void enqueueImpl(const LiveInterval *LI) = 0;
+
+ /// enqueue - Add VirtReg to the priority queue of unassigned registers.
+ void enqueue(const LiveInterval *LI);
+
+ /// dequeue - Return the next unassigned register, or NULL.
+ virtual const LiveInterval *dequeue() = 0;
+
+ // A RegAlloc pass should override this to provide the allocation heuristics.
+  // Each call must guarantee forward progress by returning an available PhysReg
+  // or a new set of split live virtual registers. It is up to the splitter to
+ // converge quickly toward fully spilled live ranges.
+ virtual MCRegister selectOrSplit(const LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &splitLVRs) = 0;
+
+ // Use this group name for NamedRegionTimer.
+ static const char TimerGroupName[];
+ static const char TimerGroupDescription[];
+
+ /// Method called when the allocator is about to remove a LiveInterval.
+ virtual void aboutToRemoveInterval(const LiveInterval &LI) {}
+
+public:
+ /// VerifyEnabled - True when -verify-regalloc is given.
+ static bool VerifyEnabled;
+
+private:
+ void seedLiveRegs();
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_REGALLOCBASE_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBasic.cpp
new file mode 100644
index 000000000000..666199139630
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -0,0 +1,339 @@
+//===-- RegAllocBasic.cpp - Basic Register Allocator ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RABasic function pass, which provides a minimal
+// implementation of the basic register allocator.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AllocationOrder.h"
+#include "LiveDebugVariables.h"
+#include "RegAllocBase.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator",
+ createBasicRegisterAllocator);
+
+namespace {
+ struct CompSpillWeight {
+ bool operator()(const LiveInterval *A, const LiveInterval *B) const {
+ return A->weight() < B->weight();
+ }
+ };
+}
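Because std::priority_queue treats its comparator as "less" and pops the largest element, this ordering hands the interval with the highest spill weight to the allocator first and keeps cheap-to-spill intervals waiting. A tiny standalone illustration with made-up weights (not part of the pass):

    #include <queue>
    #include <vector>

    struct FakeInterval {
      float W;
      float weight() const { return W; }
    };

    struct CompSpillWeightSketch {
      bool operator()(const FakeInterval *A, const FakeInterval *B) const {
        return A->weight() < B->weight();   // "less" comparator => max-heap
      }
    };

    int main() {
      FakeInterval A{1.0f}, B{7.5f}, C{3.2f};
      std::priority_queue<const FakeInterval *,
                          std::vector<const FakeInterval *>,
                          CompSpillWeightSketch> Q;
      for (const FakeInterval *LI : {&A, &B, &C})
        Q.push(LI);
      // Q.top() is now B (weight 7.5); pops continue in order C, then A.
      return 0;
    }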
+
+namespace {
+/// RABasic provides a minimal implementation of the basic register allocation
+/// algorithm. It prioritizes live virtual registers by spill weight and spills
+/// whenever a register is unavailable. This is not practical in production but
+/// provides a useful baseline both for measuring other allocators and comparing
+/// the speed of the basic algorithm against other styles of allocators.
+class RABasic : public MachineFunctionPass,
+ public RegAllocBase,
+ private LiveRangeEdit::Delegate {
+ // context
+ MachineFunction *MF = nullptr;
+
+ // state
+ std::unique_ptr<Spiller> SpillerInstance;
+ std::priority_queue<const LiveInterval *, std::vector<const LiveInterval *>,
+ CompSpillWeight>
+ Queue;
+
+ // Scratch space. Allocated here to avoid repeated malloc calls in
+ // selectOrSplit().
+ BitVector UsableRegs;
+
+ bool LRE_CanEraseVirtReg(Register) override;
+ void LRE_WillShrinkVirtReg(Register) override;
+
+public:
+ RABasic(const RegClassFilterFunc F = allocateAllRegClasses);
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "Basic Register Allocator"; }
+
+ /// RABasic analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ void releaseMemory() override;
+
+ Spiller &spiller() override { return *SpillerInstance; }
+
+ void enqueueImpl(const LiveInterval *LI) override { Queue.push(LI); }
+
+ const LiveInterval *dequeue() override {
+ if (Queue.empty())
+ return nullptr;
+ const LiveInterval *LI = Queue.top();
+ Queue.pop();
+ return LI;
+ }
+
+ MCRegister selectOrSplit(const LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &SplitVRegs) override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+  // Helper for spilling all live virtual registers currently unified under
+  // PhysReg that interfere with VirtReg. Return true if spilling was
+  // successful, and append any new spilled/split intervals to SplitVRegs.
+ bool spillInterferences(const LiveInterval &VirtReg, MCRegister PhysReg,
+ SmallVectorImpl<Register> &SplitVRegs);
+
+ static char ID;
+};
+
+char RABasic::ID = 0;
+
+} // end anonymous namespace
+
+char &llvm::RABasicID = RABasic::ID;
+
+INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer)
+INITIALIZE_PASS_DEPENDENCY(MachineScheduler)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false,
+ false)
+
+bool RABasic::LRE_CanEraseVirtReg(Register VirtReg) {
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ if (VRM->hasPhys(VirtReg)) {
+ Matrix->unassign(LI);
+ aboutToRemoveInterval(LI);
+ return true;
+ }
+ // Unassigned virtreg is probably in the priority queue.
+ // RegAllocBase will erase it after dequeueing.
+ // Nonetheless, clear the live-range so that the debug
+ // dump will show the right state for that VirtReg.
+ LI.clear();
+ return false;
+}
+
+void RABasic::LRE_WillShrinkVirtReg(Register VirtReg) {
+ if (!VRM->hasPhys(VirtReg))
+ return;
+
+ // Register is assigned, put it back on the queue for reassignment.
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ Matrix->unassign(LI);
+ enqueue(&LI);
+}
+
+RABasic::RABasic(RegClassFilterFunc F):
+ MachineFunctionPass(ID),
+ RegAllocBase(F) {
+}
+
+void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveDebugVariables>();
+ AU.addPreserved<LiveDebugVariables>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addRequiredID(MachineDominatorsID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<VirtRegMap>();
+ AU.addPreserved<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<LiveRegMatrix>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void RABasic::releaseMemory() {
+ SpillerInstance.reset();
+}
+
+
+// Spill or split all live virtual registers currently unified under PhysReg
+// that interfere with VirtReg. The newly spilled or split live intervals are
+// returned by appending them to SplitVRegs.
+bool RABasic::spillInterferences(const LiveInterval &VirtReg,
+ MCRegister PhysReg,
+ SmallVectorImpl<Register> &SplitVRegs) {
+ // Record each interference and determine if all are spillable before mutating
+ // either the union or live intervals.
+ SmallVector<const LiveInterval *, 8> Intfs;
+
+ // Collect interferences assigned to any alias of the physical register.
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit);
+ for (const auto *Intf : reverse(Q.interferingVRegs())) {
+ if (!Intf->isSpillable() || Intf->weight() > VirtReg.weight())
+ return false;
+ Intfs.push_back(Intf);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "spilling " << printReg(PhysReg, TRI)
+ << " interferences with " << VirtReg << "\n");
+ assert(!Intfs.empty() && "expected interference");
+
+ // Spill each interfering vreg allocated to PhysReg or an alias.
+ for (unsigned i = 0, e = Intfs.size(); i != e; ++i) {
+ const LiveInterval &Spill = *Intfs[i];
+
+ // Skip duplicates.
+ if (!VRM->hasPhys(Spill.reg()))
+ continue;
+
+ // Deallocate the interfering vreg by removing it from the union.
+ // A LiveInterval instance may not be in a union during modification!
+ Matrix->unassign(Spill);
+
+ // Spill the extracted interval.
+ LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ spiller().spill(LRE);
+ }
+ return true;
+}
+
+// Driver for the register assignment and splitting heuristics.
+// Manages iteration over the LiveIntervalUnions.
+//
+// This is a minimal implementation of register assignment and splitting that
+// spills whenever we run out of registers.
+//
+// selectOrSplit can only be called once per live virtual register. We then do a
+// single interference test for each register in the correct class until we find
+// an available register. So, the number of interference tests in the worst case is
+// |vregs| * |machineregs|. And since the number of interference tests is
+// minimal, there is no value in caching them outside the scope of
+// selectOrSplit().
+MCRegister RABasic::selectOrSplit(const LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &SplitVRegs) {
+ // Populate a list of physical register spill candidates.
+ SmallVector<MCRegister, 8> PhysRegSpillCands;
+
+ // Check for an available register in this class.
+ auto Order =
+ AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix);
+ for (MCRegister PhysReg : Order) {
+ assert(PhysReg.isValid());
+ // Check for interference in PhysReg
+ switch (Matrix->checkInterference(VirtReg, PhysReg)) {
+ case LiveRegMatrix::IK_Free:
+ // PhysReg is available, allocate it.
+ return PhysReg;
+
+ case LiveRegMatrix::IK_VirtReg:
+ // Only virtual registers in the way, we may be able to spill them.
+ PhysRegSpillCands.push_back(PhysReg);
+ continue;
+
+ default:
+ // RegMask or RegUnit interference.
+ continue;
+ }
+ }
+
+ // Try to spill another interfering reg with less spill weight.
+ for (MCRegister &PhysReg : PhysRegSpillCands) {
+ if (!spillInterferences(VirtReg, PhysReg, SplitVRegs))
+ continue;
+
+ assert(!Matrix->checkInterference(VirtReg, PhysReg) &&
+ "Interference after spill.");
+ // Tell the caller to allocate to this newly freed physical register.
+ return PhysReg;
+ }
+
+ // No other spill candidates were found, so spill the current VirtReg.
+ LLVM_DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
+ if (!VirtReg.isSpillable())
+ return ~0u;
+ LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ spiller().spill(LRE);
+
+ // The live virtual register requesting allocation was spilled, so tell
+ // the caller not to allocate anything during this round.
+ return 0;
+}
+
+bool RABasic::runOnMachineFunction(MachineFunction &mf) {
+ LLVM_DEBUG(dbgs() << "********** BASIC REGISTER ALLOCATION **********\n"
+ << "********** Function: " << mf.getName() << '\n');
+
+ MF = &mf;
+ RegAllocBase::init(getAnalysis<VirtRegMap>(),
+ getAnalysis<LiveIntervals>(),
+ getAnalysis<LiveRegMatrix>());
+ VirtRegAuxInfo VRAI(*MF, *LIS, *VRM, getAnalysis<MachineLoopInfo>(),
+ getAnalysis<MachineBlockFrequencyInfo>());
+ VRAI.calculateSpillWeightsAndHints();
+
+ SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, VRAI));
+
+ allocatePhysRegs();
+ postOptimization();
+
+ // Diagnostic output before rewriting
+ LLVM_DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
+
+ releaseMemory();
+ return true;
+}
+
+FunctionPass* llvm::createBasicRegisterAllocator() {
+ return new RABasic();
+}
+
+FunctionPass* llvm::createBasicRegisterAllocator(RegClassFilterFunc F) {
+ return new RABasic(F);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp
new file mode 100644
index 000000000000..81f3d2c8099f
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp
@@ -0,0 +1,311 @@
+//===- RegAllocEvictionAdvisor.cpp - eviction advisor ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the default eviction advisor and of the Analysis pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RegAllocEvictionAdvisor.h"
+#include "AllocationOrder.h"
+#include "RegAllocGreedy.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<RegAllocEvictionAdvisorAnalysis::AdvisorMode> Mode(
+ "regalloc-enable-advisor", cl::Hidden,
+ cl::init(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default),
+ cl::desc("Enable regalloc advisor mode"),
+ cl::values(
+ clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default,
+ "default", "Default"),
+ clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release,
+ "release", "precompiled"),
+ clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development,
+ "development", "for training")));
+
+static cl::opt<bool> EnableLocalReassignment(
+ "enable-local-reassign", cl::Hidden,
+ cl::desc("Local reassignment can yield better allocation decisions, but "
+ "may be compile time intensive"),
+ cl::init(false));
+
+namespace llvm {
+cl::opt<unsigned> EvictInterferenceCutoff(
+ "regalloc-eviction-max-interference-cutoff", cl::Hidden,
+ cl::desc("Number of interferences after which we declare "
+ "an interference unevictable and bail out. This "
+ "is a compilation cost-saving consideration. To "
+ "disable, pass a very large number."),
+ cl::init(10));
+}
+
+#define DEBUG_TYPE "regalloc"
+#ifdef LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL
+#define LLVM_HAVE_TF_AOT
+#endif
+
+char RegAllocEvictionAdvisorAnalysis::ID = 0;
+INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysis, "regalloc-evict",
+ "Regalloc eviction policy", false, true)
+
+namespace {
+class DefaultEvictionAdvisorAnalysis final
+ : public RegAllocEvictionAdvisorAnalysis {
+public:
+ DefaultEvictionAdvisorAnalysis(bool NotAsRequested)
+ : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Default),
+ NotAsRequested(NotAsRequested) {}
+
+ // support for isa<> and dyn_cast.
+ static bool classof(const RegAllocEvictionAdvisorAnalysis *R) {
+ return R->getAdvisorMode() == AdvisorMode::Default;
+ }
+
+private:
+ std::unique_ptr<RegAllocEvictionAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+ return std::make_unique<DefaultEvictionAdvisor>(MF, RA);
+ }
+ bool doInitialization(Module &M) override {
+ if (NotAsRequested)
+ M.getContext().emitError("Requested regalloc eviction advisor analysis "
+ "could be created. Using default");
+ return RegAllocEvictionAdvisorAnalysis::doInitialization(M);
+ }
+ const bool NotAsRequested;
+};
+} // namespace
+
+template <> Pass *llvm::callDefaultCtor<RegAllocEvictionAdvisorAnalysis>() {
+ Pass *Ret = nullptr;
+ switch (Mode) {
+ case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default:
+ Ret = new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ false);
+ break;
+ case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development:
+#if defined(LLVM_HAVE_TFLITE)
+ Ret = createDevelopmentModeAdvisor();
+#endif
+ break;
+ case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release:
+ Ret = createReleaseModeAdvisor();
+ break;
+ }
+ if (Ret)
+ return Ret;
+ return new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ true);
+}
+
+StringRef RegAllocEvictionAdvisorAnalysis::getPassName() const {
+ switch (getAdvisorMode()) {
+ case AdvisorMode::Default:
+ return "Default Regalloc Eviction Advisor";
+ case AdvisorMode::Release:
+ return "Release mode Regalloc Eviction Advisor";
+ case AdvisorMode::Development:
+ return "Development mode Regalloc Eviction Advisor";
+ }
+ llvm_unreachable("Unknown advisor kind");
+}
+
+RegAllocEvictionAdvisor::RegAllocEvictionAdvisor(const MachineFunction &MF,
+ const RAGreedy &RA)
+ : MF(MF), RA(RA), Matrix(RA.getInterferenceMatrix()),
+ LIS(RA.getLiveIntervals()), VRM(RA.getVirtRegMap()),
+ MRI(&VRM->getRegInfo()), TRI(MF.getSubtarget().getRegisterInfo()),
+ RegClassInfo(RA.getRegClassInfo()), RegCosts(TRI->getRegisterCosts(MF)),
+ EnableLocalReassign(EnableLocalReassignment ||
+ MF.getSubtarget().enableRALocalReassignment(
+ MF.getTarget().getOptLevel())) {}
+
+/// shouldEvict - determine if A should evict the assigned live range B. The
+/// eviction policy defined by this function together with the allocation order
+/// defined by enqueue() decides which registers ultimately end up being split
+/// and spilled.
+///
+/// Cascade numbers are used to prevent infinite loops if this function is a
+/// cyclic relation.
+///
+/// @param A The live range to be assigned.
+/// @param IsHint True when A is about to be assigned to its preferred
+/// register.
+/// @param B The live range to be evicted.
+/// @param BreaksHint True when B is already assigned to its preferred register.
+bool DefaultEvictionAdvisor::shouldEvict(const LiveInterval &A, bool IsHint,
+ const LiveInterval &B,
+ bool BreaksHint) const {
+ bool CanSplit = RA.getExtraInfo().getStage(B) < RS_Spill;
+
+ // Be fairly aggressive about following hints as long as the evictee can be
+ // split.
+ if (CanSplit && IsHint && !BreaksHint)
+ return true;
+
+ if (A.weight() > B.weight()) {
+ LLVM_DEBUG(dbgs() << "should evict: " << B << " w= " << B.weight() << '\n');
+ return true;
+ }
+ return false;
+}
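Condensed into a decision summary (a reading of the function above, not extra logic):

    // shouldEvict(A, IsHint, B, BreaksHint), summarized:
    //   B is still splittable && IsHint && !BreaksHint -> evict (follow the hint)
    //   otherwise                                      -> evict only when
    //                                                     A.weight() > B.weight()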
+
+/// canEvictHintInterference - return true if the interference for VirtReg
+/// on the PhysReg, which is VirtReg's hint, can be evicted in favor of VirtReg.
+bool DefaultEvictionAdvisor::canEvictHintInterference(
+ const LiveInterval &VirtReg, MCRegister PhysReg,
+ const SmallVirtRegSet &FixedRegisters) const {
+ EvictionCost MaxCost;
+ MaxCost.setBrokenHints(1);
+ return canEvictInterferenceBasedOnCost(VirtReg, PhysReg, true, MaxCost,
+ FixedRegisters);
+}
+
+/// canEvictInterferenceBasedOnCost - Return true if all interferences between
+/// VirtReg and PhysReg can be evicted.
+///
+/// @param VirtReg Live range that is about to be assigned.
+/// @param PhysReg Desired register for assignment.
+/// @param IsHint True when PhysReg is VirtReg's preferred register.
+/// @param MaxCost Only look for cheaper candidates and update with new cost
+/// when returning true.
+/// @returns True when interference can be evicted cheaper than MaxCost.
+bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost(
+ const LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint,
+ EvictionCost &MaxCost, const SmallVirtRegSet &FixedRegisters) const {
+ // It is only possible to evict virtual register interference.
+ if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg)
+ return false;
+
+ bool IsLocal = VirtReg.empty() || LIS->intervalIsInOneMBB(VirtReg);
+
+ // Find VirtReg's cascade number. This will be unassigned if VirtReg was never
+ // involved in an eviction before. If a cascade number was assigned, deny
+ // evicting anything with the same or a newer cascade number. This prevents
+ // infinite eviction loops.
+ //
+ // This works out so a register without a cascade number is allowed to evict
+ // anything, and it can be evicted by anything.
+ unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg());
+
+ EvictionCost Cost;
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit);
+ // If there are 10 or more interferences, chances are one is heavier.
+ const auto &Interferences = Q.interferingVRegs(EvictInterferenceCutoff);
+ if (Interferences.size() >= EvictInterferenceCutoff)
+ return false;
+
+ // Check if any interfering live range is heavier than MaxWeight.
+ for (const LiveInterval *Intf : reverse(Interferences)) {
+ assert(Intf->reg().isVirtual() &&
+ "Only expecting virtual register interference from query");
+
+ // Do not allow eviction of a virtual register if we are in the middle
+ // of last-chance recoloring and this virtual register is one that we
+ // have scavenged a physical register for.
+ if (FixedRegisters.count(Intf->reg()))
+ return false;
+
+ // Never evict spill products. They cannot be split or spilled.
+ if (RA.getExtraInfo().getStage(*Intf) == RS_Done)
+ return false;
+ // Once a live range becomes small enough, it is urgent that we find a
+ // register for it. This is indicated by an infinite spill weight. These
+ // urgent live ranges get to evict almost anything.
+ //
+ // Also allow urgent evictions of unspillable ranges from a strictly
+ // larger allocation order.
+ bool Urgent =
+ !VirtReg.isSpillable() &&
+ (Intf->isSpillable() ||
+ RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg())) <
+ RegClassInfo.getNumAllocatableRegs(
+ MRI->getRegClass(Intf->reg())));
+ // Only evict older cascades or live ranges without a cascade.
+ unsigned IntfCascade = RA.getExtraInfo().getCascade(Intf->reg());
+ if (Cascade == IntfCascade)
+ return false;
+
+ if (Cascade < IntfCascade) {
+ if (!Urgent)
+ return false;
+ // We permit breaking cascades for urgent evictions. It should be the
+ // last resort, though, so make it really expensive.
+ Cost.BrokenHints += 10;
+ }
+ // Would this break a satisfied hint?
+ bool BreaksHint = VRM->hasPreferredPhys(Intf->reg());
+ // Update eviction cost.
+ Cost.BrokenHints += BreaksHint;
+ Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight());
+ // Abort if this would be too expensive.
+ if (!(Cost < MaxCost))
+ return false;
+ if (Urgent)
+ continue;
+ // Apply the eviction policy for non-urgent evictions.
+ if (!shouldEvict(VirtReg, IsHint, *Intf, BreaksHint))
+ return false;
+ // If !MaxCost.isMax(), then we're just looking for a cheap register.
+ // Evicting another local live range in this case could lead to suboptimal
+ // coloring.
+ if (!MaxCost.isMax() && IsLocal && LIS->intervalIsInOneMBB(*Intf) &&
+ (!EnableLocalReassign || !canReassign(*Intf, PhysReg))) {
+ return false;
+ }
+ }
+ }
+ MaxCost = Cost;
+ return true;
+}
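To make the cascade rule concrete, a worked example with made-up cascade numbers:

    // Suppose VirtReg has cascade 3. For each interfering vreg on a candidate
    // PhysReg:
    //   cascade 1 (older) -> evictable; shouldEvict() and the running
    //                        EvictionCost decide whether it is worthwhile.
    //   cascade 3 (equal) -> the candidate PhysReg is rejected outright; this
    //                        is what breaks eviction cycles.
    //   cascade 4 (newer) -> evictable only when the eviction is Urgent, and it
    //                        is then charged an extra BrokenHints += 10 so it
    //                        is picked only as a last resort.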
+
+MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate(
+ const LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const {
+ // Keep track of the cheapest interference seen so far.
+ EvictionCost BestCost;
+ BestCost.setMax();
+ MCRegister BestPhys;
+ auto MaybeOrderLimit = getOrderLimit(VirtReg, Order, CostPerUseLimit);
+ if (!MaybeOrderLimit)
+ return MCRegister::NoRegister;
+ unsigned OrderLimit = *MaybeOrderLimit;
+
+ // When we are just looking for a reduced cost per use, don't break any
+ // hints, and only evict smaller spill weights.
+ if (CostPerUseLimit < uint8_t(~0u)) {
+ BestCost.BrokenHints = 0;
+ BestCost.MaxWeight = VirtReg.weight();
+ }
+
+ for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); I != E;
+ ++I) {
+ MCRegister PhysReg = *I;
+ assert(PhysReg);
+ if (!canAllocatePhysReg(CostPerUseLimit, PhysReg) ||
+ !canEvictInterferenceBasedOnCost(VirtReg, PhysReg, false, BestCost,
+ FixedRegisters))
+ continue;
+
+ // Best so far.
+ BestPhys = PhysReg;
+
+ // Stop if the hint can be used.
+ if (I.isHint())
+ break;
+ }
+ return BestPhys;
+}
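One detail of the loop above that is easy to miss (a reading of the code, not new behavior):

    // BestCost is passed by reference into canEvictInterferenceBasedOnCost(),
    // which overwrites it with the computed cost whenever it returns true.
    // Each later PhysReg in the allocation order therefore has to be strictly
    // cheaper than the best candidate found so far, and the scan stops early
    // once an acceptable hint register is reached (I.isHint()).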
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h
new file mode 100644
index 000000000000..52dd946a6854
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h
@@ -0,0 +1,223 @@
+//===- RegAllocEvictionAdvisor.h - Interference resolution ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H
+#define LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/MC/MCRegister.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+class AllocationOrder;
+class LiveInterval;
+class LiveIntervals;
+class LiveRegMatrix;
+class MachineFunction;
+class MachineRegisterInfo;
+class RegisterClassInfo;
+class TargetRegisterInfo;
+class VirtRegMap;
+
+using SmallVirtRegSet = SmallSet<Register, 16>;
+
+// Live ranges pass through a number of stages as we try to allocate them.
+// Some of the stages may also create new live ranges:
+//
+// - Region splitting.
+// - Per-block splitting.
+// - Local splitting.
+// - Spilling.
+//
+// Ranges produced by one of the stages skip the previous stages when they are
+// dequeued. This improves performance because we can skip interference checks
+// that are unlikely to give any results. It also guarantees that the live
+// range splitting algorithm terminates, something that is otherwise hard to
+// ensure.
+enum LiveRangeStage {
+ /// Newly created live range that has never been queued.
+ RS_New,
+
+ /// Only attempt assignment and eviction. Then requeue as RS_Split.
+ RS_Assign,
+
+ /// Attempt live range splitting if assignment is impossible.
+ RS_Split,
+
+ /// Attempt more aggressive live range splitting that is guaranteed to make
+ /// progress. This is used for split products that may not be making
+ /// progress.
+ RS_Split2,
+
+ /// Live range will be spilled. No more splitting will be attempted.
+ RS_Spill,
+
+ /// Live range is in memory. Because of other evictions, it might get moved
+ /// in a register in the end.
+ RS_Memory,
+
+ /// There is nothing more we can do to this live range. Abort compilation
+ /// if it can't be assigned.
+ RS_Done
+};
+
+/// Cost of evicting interference - used by default advisor, and the eviction
+/// chain heuristic in RegAllocGreedy.
+// FIXME: this can probably be made an implementation detail of the default
+// advisor, if the eviction chain logic can be refactored.
+struct EvictionCost {
+ unsigned BrokenHints = 0; ///< Total number of broken hints.
+ float MaxWeight = 0; ///< Maximum spill weight evicted.
+
+ EvictionCost() = default;
+
+ bool isMax() const { return BrokenHints == ~0u; }
+
+ void setMax() { BrokenHints = ~0u; }
+
+ void setBrokenHints(unsigned NHints) { BrokenHints = NHints; }
+
+ bool operator<(const EvictionCost &O) const {
+ return std::tie(BrokenHints, MaxWeight) <
+ std::tie(O.BrokenHints, O.MaxWeight);
+ }
+};
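A minimal standalone sketch (mirroring, not reusing, the struct above) of what the lexicographic operator< implies:

    #include <cassert>
    #include <tuple>

    // Stand-in for EvictionCost, for illustration only.
    struct Cost {
      unsigned BrokenHints = 0;
      float MaxWeight = 0;
      bool operator<(const Cost &O) const {
        return std::tie(BrokenHints, MaxWeight) <
               std::tie(O.BrokenHints, O.MaxWeight);
      }
    };

    int main() {
      Cost A{0, 100.0f}; // breaks no hints but evicts a very heavy range
      Cost B{1, 0.5f};   // breaks one hint and evicts a light range
      assert(A < B);     // fewer broken hints always wins; MaxWeight only
      return 0;          // breaks ties between equal BrokenHints counts
    }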
+
+/// Interface to the eviction advisor, which is responsible for making a
+/// decision as to which live ranges should be evicted (if any).
+class RAGreedy;
+class RegAllocEvictionAdvisor {
+public:
+ RegAllocEvictionAdvisor(const RegAllocEvictionAdvisor &) = delete;
+ RegAllocEvictionAdvisor(RegAllocEvictionAdvisor &&) = delete;
+ virtual ~RegAllocEvictionAdvisor() = default;
+
+ /// Find a physical register that can be freed by evicting the FixedRegisters,
+ /// or return NoRegister. The eviction decision is assumed to be correct (i.e.
+ /// no fixed live ranges are evicted) and profitable.
+ virtual MCRegister tryFindEvictionCandidate(
+ const LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const = 0;
+
+ /// Find out if we can evict the live ranges occupying the given PhysReg,
+ /// which is a hint (preferred register) for VirtReg.
+ virtual bool
+ canEvictHintInterference(const LiveInterval &VirtReg, MCRegister PhysReg,
+ const SmallVirtRegSet &FixedRegisters) const = 0;
+
+ /// Returns true if the given \p PhysReg is a callee saved register and has
+ /// not been used for allocation yet.
+ bool isUnusedCalleeSavedReg(MCRegister PhysReg) const;
+
+protected:
+ RegAllocEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA);
+
+ bool canReassign(const LiveInterval &VirtReg, MCRegister FromReg) const;
+
+ // Get the upper limit of elements in the given Order we need to analyze.
+ // TODO: this is a heuristic; we could consider learning it.
+ std::optional<unsigned> getOrderLimit(const LiveInterval &VirtReg,
+ const AllocationOrder &Order,
+ unsigned CostPerUseLimit) const;
+
+ // Determine if it's worth trying to allocate this reg, given the
+ // CostPerUseLimit
+ // TODO: this is a heuristic component we could consider learning, too.
+ bool canAllocatePhysReg(unsigned CostPerUseLimit, MCRegister PhysReg) const;
+
+ const MachineFunction &MF;
+ const RAGreedy &RA;
+ LiveRegMatrix *const Matrix;
+ LiveIntervals *const LIS;
+ VirtRegMap *const VRM;
+ MachineRegisterInfo *const MRI;
+ const TargetRegisterInfo *const TRI;
+ const RegisterClassInfo &RegClassInfo;
+ const ArrayRef<uint8_t> RegCosts;
+
+ /// Whether to run the local reassignment heuristic. This information is
+ /// obtained from the TargetSubtargetInfo.
+ const bool EnableLocalReassign;
+};
+
+/// ImmutableAnalysis abstraction for fetching the Eviction Advisor. We model it
+/// as an analysis to decouple the user from the implementation insofar as
+/// dependencies on other analyses go. The motivation for it being an
+/// immutable pass is twofold:
+/// - in the ML implementation case, the evaluator is stateless but (especially
+/// in the development mode) expensive to set up. With an immutable pass, we set
+/// it up once.
+/// - in the 'development' mode ML case, we want to capture the training log
+/// during allocation (this is a log of features encountered and decisions
+/// made), and then measure a score, potentially a few steps after allocation
+/// completes. So we need the properties of an immutable pass to keep the logger
+/// state around until we can make that measurement.
+///
+/// Because we need to offer additional services in 'development' mode, the
+/// implementations of this analysis need to implement RTTI support.
+class RegAllocEvictionAdvisorAnalysis : public ImmutablePass {
+public:
+ enum class AdvisorMode : int { Default, Release, Development };
+
+ RegAllocEvictionAdvisorAnalysis(AdvisorMode Mode)
+ : ImmutablePass(ID), Mode(Mode) {}
+ static char ID;
+
+ /// Get an advisor for the given context (i.e. machine function, etc)
+ virtual std::unique_ptr<RegAllocEvictionAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0;
+ AdvisorMode getAdvisorMode() const { return Mode; }
+ virtual void logRewardIfNeeded(const MachineFunction &MF,
+ llvm::function_ref<float()> GetReward) {}
+
+protected:
+ // This analysis preserves everything, and subclasses may have additional
+ // requirements.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+private:
+ StringRef getPassName() const override;
+ const AdvisorMode Mode;
+};
+
+/// Specialization for the API used by the analysis infrastructure to create
+/// an instance of the eviction advisor.
+template <> Pass *callDefaultCtor<RegAllocEvictionAdvisorAnalysis>();
+
+RegAllocEvictionAdvisorAnalysis *createReleaseModeAdvisor();
+
+RegAllocEvictionAdvisorAnalysis *createDevelopmentModeAdvisor();
+
+// TODO: move to RegAllocEvictionAdvisor.cpp when we move implementation
+// out of RegAllocGreedy.cpp
+class DefaultEvictionAdvisor : public RegAllocEvictionAdvisor {
+public:
+ DefaultEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA)
+ : RegAllocEvictionAdvisor(MF, RA) {}
+
+private:
+ MCRegister tryFindEvictionCandidate(const LiveInterval &,
+ const AllocationOrder &, uint8_t,
+ const SmallVirtRegSet &) const override;
+ bool canEvictHintInterference(const LiveInterval &, MCRegister,
+ const SmallVirtRegSet &) const override;
+ bool canEvictInterferenceBasedOnCost(const LiveInterval &, MCRegister, bool,
+ EvictionCost &,
+ const SmallVirtRegSet &) const;
+ bool shouldEvict(const LiveInterval &A, bool, const LiveInterval &B,
+ bool) const;
+};
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H
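As a usage sketch of the interfaces declared in this header, here is a hypothetical advisor that never proposes evictions. The class name is made up and it is not registered anywhere; it only shows which methods a subclass has to override:

    // Assumes the header above is included.
    namespace {
    class NeverEvictAdvisor : public llvm::RegAllocEvictionAdvisor {
    public:
      NeverEvictAdvisor(const llvm::MachineFunction &MF, const llvm::RAGreedy &RA)
          : RegAllocEvictionAdvisor(MF, RA) {}

    private:
      llvm::MCRegister
      tryFindEvictionCandidate(const llvm::LiveInterval &,
                               const llvm::AllocationOrder &, uint8_t,
                               const llvm::SmallVirtRegSet &) const override {
        return llvm::MCRegister::NoRegister; // never free a register by eviction
      }
      bool canEvictHintInterference(const llvm::LiveInterval &, llvm::MCRegister,
                                    const llvm::SmallVirtRegSet &) const override {
        return false; // never evict to satisfy a hint either
      }
    };
    } // namespace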
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp
new file mode 100644
index 000000000000..864beb8720f4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -0,0 +1,1673 @@
+//===- RegAllocFast.cpp - A fast register allocator for debug code --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This register allocator allocates registers to a basic block at a
+/// time, attempting to keep values in registers and reusing registers as
+/// appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocCommon.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <tuple>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumStores, "Number of stores added");
+STATISTIC(NumLoads , "Number of loads added");
+STATISTIC(NumCoalesced, "Number of copies coalesced");
+
+// FIXME: Remove this switch when all testcases are fixed!
+static cl::opt<bool> IgnoreMissingDefs("rafast-ignore-missing-defs",
+ cl::Hidden);
+
+static RegisterRegAlloc
+ fastRegAlloc("fast", "fast register allocator", createFastRegisterAllocator);
+
+namespace {
+
+ class RegAllocFast : public MachineFunctionPass {
+ public:
+ static char ID;
+
+ RegAllocFast(const RegClassFilterFunc F = allocateAllRegClasses,
+ bool ClearVirtRegs_ = true) :
+ MachineFunctionPass(ID),
+ ShouldAllocateClass(F),
+ StackSlotForVirtReg(-1),
+ ClearVirtRegs(ClearVirtRegs_) {
+ }
+
+ private:
+ MachineFrameInfo *MFI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ RegisterClassInfo RegClassInfo;
+ const RegClassFilterFunc ShouldAllocateClass;
+
+ /// Basic block currently being allocated.
+ MachineBasicBlock *MBB = nullptr;
+
+ /// Maps virtual regs to the frame index where these values are spilled.
+ IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;
+
+ bool ClearVirtRegs;
+
+ /// Everything we know about a live virtual register.
+ struct LiveReg {
+ MachineInstr *LastUse = nullptr; ///< Last instr to use reg.
+ Register VirtReg; ///< Virtual register number.
+ MCPhysReg PhysReg = 0; ///< Currently held here.
+ bool LiveOut = false; ///< Register is possibly live out.
+ bool Reloaded = false; ///< Register was reloaded.
+ bool Error = false; ///< Could not allocate.
+
+ explicit LiveReg(Register VirtReg) : VirtReg(VirtReg) {}
+
+ unsigned getSparseSetIndex() const {
+ return Register::virtReg2Index(VirtReg);
+ }
+ };
+
+ using LiveRegMap = SparseSet<LiveReg, identity<unsigned>, uint16_t>;
+ /// This map contains entries for each virtual register that is currently
+ /// available in a physical register.
+ LiveRegMap LiveVirtRegs;
+
+ /// Stores assigned virtual registers present in the bundle MI.
+ DenseMap<Register, MCPhysReg> BundleVirtRegsMap;
+
+ DenseMap<unsigned, SmallVector<MachineOperand *, 2>> LiveDbgValueMap;
+ /// List of DBG_VALUEs that we encountered without the vreg being assigned
+ /// because they were placed after the last use of the vreg.
+ DenseMap<unsigned, SmallVector<MachineInstr *, 1>> DanglingDbgValues;
+
+ /// Has a bit set for every virtual register for which it was determined
+ /// that it is alive across blocks.
+ BitVector MayLiveAcrossBlocks;
+
+ /// State of a register unit.
+ enum RegUnitState {
+ /// A free register is not currently in use and can be allocated
+ /// immediately without checking aliases.
+ regFree,
+
+ /// A pre-assigned register has been assigned before register allocation
+ /// (e.g., setting up a call parameter).
+ regPreAssigned,
+
+ /// Used temporarily in reloadAtBegin() to mark register units that are
+ /// live-in to the basic block.
+ regLiveIn,
+
+ /// A register state may also be a virtual register number, indicating
+ /// that the physical register is currently allocated to a virtual
+ /// register. In that case, LiveVirtRegs contains the inverse mapping.
+ };
+
+ /// Maps each register unit to a RegUnitState enum or virtual register.
+ std::vector<unsigned> RegUnitStates;
+
+ SmallVector<MachineInstr *, 32> Coalesced;
+
+ using RegUnitSet = SparseSet<uint16_t, identity<uint16_t>>;
+ /// Set of register units that are used in the current instruction, and so
+ /// cannot be allocated.
+ RegUnitSet UsedInInstr;
+ RegUnitSet PhysRegUses;
+ SmallVector<uint16_t, 8> DefOperandIndexes;
+ // Register masks attached to the current instruction.
+ SmallVector<const uint32_t *> RegMasks;
+
+ void setPhysRegState(MCPhysReg PhysReg, unsigned NewState);
+ bool isPhysRegFree(MCPhysReg PhysReg) const;
+
+ /// Mark a physreg as used in this instruction.
+ void markRegUsedInInstr(MCPhysReg PhysReg) {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg))
+ UsedInInstr.insert(Unit);
+ }
+
+ // Check if physreg is clobbered by instruction's regmask(s).
+ bool isClobberedByRegMasks(MCPhysReg PhysReg) const {
+ return llvm::any_of(RegMasks, [PhysReg](const uint32_t *Mask) {
+ return MachineOperand::clobbersPhysReg(Mask, PhysReg);
+ });
+ }
+
+ /// Check if a physreg or any of its aliases are used in this instruction.
+ bool isRegUsedInInstr(MCPhysReg PhysReg, bool LookAtPhysRegUses) const {
+ if (LookAtPhysRegUses && isClobberedByRegMasks(PhysReg))
+ return true;
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ if (UsedInInstr.count(Unit))
+ return true;
+ if (LookAtPhysRegUses && PhysRegUses.count(Unit))
+ return true;
+ }
+ return false;
+ }
+
+ /// Mark physical register as being used in a register use operand.
+ /// This is only used by the special livethrough handling code.
+ void markPhysRegUsedInInstr(MCPhysReg PhysReg) {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg))
+ PhysRegUses.insert(Unit);
+ }
+
+ /// Remove mark of physical register being used in the instruction.
+ void unmarkRegUsedInInstr(MCPhysReg PhysReg) {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg))
+ UsedInInstr.erase(Unit);
+ }
+
+ enum : unsigned {
+ spillClean = 50,
+ spillDirty = 100,
+ spillPrefBonus = 20,
+ spillImpossible = ~0u
+ };
+
+ public:
+ StringRef getPassName() const override { return "Fast Register Allocator"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ MachineFunctionProperties getSetProperties() const override {
+ if (ClearVirtRegs) {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ return MachineFunctionProperties();
+ }
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+ private:
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void allocateBasicBlock(MachineBasicBlock &MBB);
+
+ void addRegClassDefCounts(std::vector<unsigned> &RegClassDefCounts,
+ Register Reg) const;
+
+ void findAndSortDefOperandIndexes(const MachineInstr &MI);
+
+ void allocateInstruction(MachineInstr &MI);
+ void handleDebugValue(MachineInstr &MI);
+ void handleBundle(MachineInstr &MI);
+
+ bool usePhysReg(MachineInstr &MI, MCPhysReg PhysReg);
+ bool definePhysReg(MachineInstr &MI, MCPhysReg PhysReg);
+ bool displacePhysReg(MachineInstr &MI, MCPhysReg PhysReg);
+ void freePhysReg(MCPhysReg PhysReg);
+
+ unsigned calcSpillCost(MCPhysReg PhysReg) const;
+
+ LiveRegMap::iterator findLiveVirtReg(Register VirtReg) {
+ return LiveVirtRegs.find(Register::virtReg2Index(VirtReg));
+ }
+
+ LiveRegMap::const_iterator findLiveVirtReg(Register VirtReg) const {
+ return LiveVirtRegs.find(Register::virtReg2Index(VirtReg));
+ }
+
+ void assignVirtToPhysReg(MachineInstr &MI, LiveReg &, MCPhysReg PhysReg);
+ void allocVirtReg(MachineInstr &MI, LiveReg &LR, Register Hint,
+ bool LookAtPhysRegUses = false);
+ void allocVirtRegUndef(MachineOperand &MO);
+ void assignDanglingDebugValues(MachineInstr &Def, Register VirtReg,
+ MCPhysReg Reg);
+ bool defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum,
+ Register VirtReg);
+ bool defineVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg,
+ bool LookAtPhysRegUses = false);
+ bool useVirtReg(MachineInstr &MI, unsigned OpNum, Register VirtReg);
+
+ MachineBasicBlock::iterator
+ getMBBBeginInsertionPoint(MachineBasicBlock &MBB,
+ SmallSet<Register, 2> &PrologLiveIns) const;
+
+ void reloadAtBegin(MachineBasicBlock &MBB);
+ bool setPhysReg(MachineInstr &MI, MachineOperand &MO, MCPhysReg PhysReg);
+
+ Register traceCopies(Register VirtReg) const;
+ Register traceCopyChain(Register Reg) const;
+
+ bool shouldAllocateRegister(const Register Reg) const;
+ int getStackSpaceFor(Register VirtReg);
+ void spill(MachineBasicBlock::iterator Before, Register VirtReg,
+ MCPhysReg AssignedReg, bool Kill, bool LiveOut);
+ void reload(MachineBasicBlock::iterator Before, Register VirtReg,
+ MCPhysReg PhysReg);
+
+ bool mayLiveOut(Register VirtReg);
+ bool mayLiveIn(Register VirtReg);
+
+ void dumpState() const;
+ };
+
+} // end anonymous namespace
+
+char RegAllocFast::ID = 0;
+
+INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
+ false)
+
+bool RegAllocFast::shouldAllocateRegister(const Register Reg) const {
+ assert(Reg.isVirtual());
+ const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
+ return ShouldAllocateClass(*TRI, RC);
+}
+
+void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg))
+ RegUnitStates[Unit] = NewState;
+}
+
+bool RegAllocFast::isPhysRegFree(MCPhysReg PhysReg) const {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ if (RegUnitStates[Unit] != regFree)
+ return false;
+ }
+ return true;
+}
+
+/// This allocates space for the specified virtual register to be held on the
+/// stack.
+int RegAllocFast::getStackSpaceFor(Register VirtReg) {
+ // Find the location Reg would belong...
+ int SS = StackSlotForVirtReg[VirtReg];
+ // Already has space allocated?
+ if (SS != -1)
+ return SS;
+
+ // Allocate a new stack object for this spill location...
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ unsigned Size = TRI->getSpillSize(RC);
+ Align Alignment = TRI->getSpillAlign(RC);
+ int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment);
+
+ // Assign the slot.
+ StackSlotForVirtReg[VirtReg] = FrameIdx;
+ return FrameIdx;
+}
+
+static bool dominates(MachineBasicBlock &MBB,
+ MachineBasicBlock::const_iterator A,
+ MachineBasicBlock::const_iterator B) {
+ auto MBBEnd = MBB.end();
+ if (B == MBBEnd)
+ return true;
+
+ MachineBasicBlock::const_iterator I = MBB.begin();
+ for (; &*I != A && &*I != B; ++I)
+ ;
+
+ return &*I == A;
+}
+
+/// Returns false if \p VirtReg is known to not live out of the current block.
+bool RegAllocFast::mayLiveOut(Register VirtReg) {
+ if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg))) {
+ // Cannot be live-out if there are no successors.
+ return !MBB->succ_empty();
+ }
+
+ const MachineInstr *SelfLoopDef = nullptr;
+
+ // If this block loops back to itself, it is necessary to check whether the
+ // use comes after the def.
+ if (MBB->isSuccessor(MBB)) {
+ // Find the first def in the self loop MBB.
+ for (const MachineInstr &DefInst : MRI->def_instructions(VirtReg)) {
+ if (DefInst.getParent() != MBB) {
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+ return true;
+ } else {
+ if (!SelfLoopDef || dominates(*MBB, DefInst.getIterator(), SelfLoopDef))
+ SelfLoopDef = &DefInst;
+ }
+ }
+ if (!SelfLoopDef) {
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+ return true;
+ }
+ }
+
+ // See if the first \p Limit uses of the register are all in the current
+ // block.
+ static const unsigned Limit = 8;
+ unsigned C = 0;
+ for (const MachineInstr &UseInst : MRI->use_nodbg_instructions(VirtReg)) {
+ if (UseInst.getParent() != MBB || ++C >= Limit) {
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+ // Cannot be live-out if there are no successors.
+ return !MBB->succ_empty();
+ }
+
+ if (SelfLoopDef) {
+ // Try to handle some simple cases to avoid spilling and reloading every
+ // value inside a self looping block.
+ if (SelfLoopDef == &UseInst ||
+ !dominates(*MBB, SelfLoopDef->getIterator(), UseInst.getIterator())) {
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
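An illustrative trace of the self-loop handling, using a made-up block:

    // bb.1 (a block that is its own successor):
    //   %v = ...       ; the first def found in the block becomes SelfLoopDef
    //   ... = use %v   ; this use is dominated by the def -> can stay local
    //
    // If instead some use of %v is not dominated by SelfLoopDef (it appears
    // before the def, or on the def instruction itself), the value has to
    // survive the back edge, so the vreg is recorded in MayLiveAcrossBlocks
    // and mayLiveOut() returns true.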
+
+/// Returns false if \p VirtReg is known to not be live into the current block.
+bool RegAllocFast::mayLiveIn(Register VirtReg) {
+ if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg)))
+ return !MBB->pred_empty();
+
+ // See if the first \p Limit defs of the register are all in the current block.
+ static const unsigned Limit = 8;
+ unsigned C = 0;
+ for (const MachineInstr &DefInst : MRI->def_instructions(VirtReg)) {
+ if (DefInst.getParent() != MBB || ++C >= Limit) {
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
+ return !MBB->pred_empty();
+ }
+ }
+
+ return false;
+}
+
+/// Insert spill instruction for \p AssignedReg before \p Before. Update
+/// DBG_VALUEs that have \p VirtReg operands to refer to the stack slot.
+void RegAllocFast::spill(MachineBasicBlock::iterator Before, Register VirtReg,
+ MCPhysReg AssignedReg, bool Kill, bool LiveOut) {
+ LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI)
+ << " in " << printReg(AssignedReg, TRI));
+ int FI = getStackSpaceFor(VirtReg);
+ LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');
+
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI,
+ VirtReg);
+ ++NumStores;
+
+ MachineBasicBlock::iterator FirstTerm = MBB->getFirstTerminator();
+
+ // When we spill a virtual register, we will have spill instructions behind
+ // every definition of it, meaning we can switch all the DBG_VALUEs over
+ // to just reference the stack slot.
+ SmallVectorImpl<MachineOperand *> &LRIDbgOperands = LiveDbgValueMap[VirtReg];
+ SmallMapVector<MachineInstr *, SmallVector<const MachineOperand *>, 2>
+ SpilledOperandsMap;
+ for (MachineOperand *MO : LRIDbgOperands)
+ SpilledOperandsMap[MO->getParent()].push_back(MO);
+ for (auto MISpilledOperands : SpilledOperandsMap) {
+ MachineInstr &DBG = *MISpilledOperands.first;
+ // We don't have enough support for tracking operands of DBG_VALUE_LISTs.
+ if (DBG.isDebugValueList())
+ continue;
+ MachineInstr *NewDV = buildDbgValueForSpill(
+ *MBB, Before, *MISpilledOperands.first, FI, MISpilledOperands.second);
+ assert(NewDV->getParent() == MBB && "dangling parent pointer");
+ (void)NewDV;
+ LLVM_DEBUG(dbgs() << "Inserting debug info due to spill:\n" << *NewDV);
+
+ if (LiveOut) {
+ // We need to insert a DBG_VALUE at the end of the block if the spill slot
+ // is live out, but there is another use of the value after the
+ // spill. This will allow LiveDebugValues to see the correct live out
+ // value to propagate to the successors.
+ MachineInstr *ClonedDV = MBB->getParent()->CloneMachineInstr(NewDV);
+ MBB->insert(FirstTerm, ClonedDV);
+ LLVM_DEBUG(dbgs() << "Cloning debug info due to live out spill\n");
+ }
+
+ // Rewrite unassigned dbg_values to use the stack slot.
+ // TODO We can potentially do this for list debug values as well if we know
+ // how the dbg_values are getting unassigned.
+ if (DBG.isNonListDebugValue()) {
+ MachineOperand &MO = DBG.getDebugOperand(0);
+ if (MO.isReg() && MO.getReg() == 0) {
+ updateDbgValueForSpill(DBG, FI, 0);
+ }
+ }
+ }
+ // Now that this register has been spilled, there should not be any DBG_VALUE
+ // pointing to it, because they all point to the spilled value now.
+ LRIDbgOperands.clear();
+}
+
+/// Insert reload instruction for \p PhysReg before \p Before.
+void RegAllocFast::reload(MachineBasicBlock::iterator Before, Register VirtReg,
+ MCPhysReg PhysReg) {
+ LLVM_DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
+ << printReg(PhysReg, TRI) << '\n');
+ int FI = getStackSpaceFor(VirtReg);
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI, VirtReg);
+ ++NumLoads;
+}
+
+/// Get basic block begin insertion point.
+/// This is not just MBB.begin() because surprisingly we have EH_LABEL
+/// instructions marking the beginning of a basic block. This means we must
+/// insert new instructions after such labels...
+/// new instructions after such labels...
+MachineBasicBlock::iterator
+RegAllocFast::getMBBBeginInsertionPoint(
+ MachineBasicBlock &MBB, SmallSet<Register, 2> &PrologLiveIns) const {
+ MachineBasicBlock::iterator I = MBB.begin();
+ while (I != MBB.end()) {
+ if (I->isLabel()) {
+ ++I;
+ continue;
+ }
+
+ // Most reloads should be inserted after prolog instructions.
+ if (!TII->isBasicBlockPrologue(*I))
+ break;
+
+ // However if a prolog instruction reads a register that needs to be
+ // reloaded, the reload should be inserted before the prolog.
+ for (MachineOperand &MO : I->operands()) {
+ if (MO.isReg())
+ PrologLiveIns.insert(MO.getReg());
+ }
+
+ ++I;
+ }
+
+ return I;
+}
+
+/// Reload all currently assigned virtual registers.
+void RegAllocFast::reloadAtBegin(MachineBasicBlock &MBB) {
+ if (LiveVirtRegs.empty())
+ return;
+
+ for (MachineBasicBlock::RegisterMaskPair P : MBB.liveins()) {
+ MCPhysReg Reg = P.PhysReg;
+ // Set state to live-in. This possibly overrides mappings to virtual
+ // registers but we don't care anymore at this point.
+ setPhysRegState(Reg, regLiveIn);
+ }
+
+
+ SmallSet<Register, 2> PrologLiveIns;
+
+ // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
+ // of spilling here is deterministic, if arbitrary.
+ MachineBasicBlock::iterator InsertBefore
+ = getMBBBeginInsertionPoint(MBB, PrologLiveIns);
+ for (const LiveReg &LR : LiveVirtRegs) {
+ MCPhysReg PhysReg = LR.PhysReg;
+ if (PhysReg == 0)
+ continue;
+
+ MCRegister FirstUnit = *TRI->regunits(PhysReg).begin();
+ if (RegUnitStates[FirstUnit] == regLiveIn)
+ continue;
+
+ assert((&MBB != &MBB.getParent()->front() || IgnoreMissingDefs) &&
+ "no reload in start block. Missing vreg def?");
+
+ if (PrologLiveIns.count(PhysReg)) {
+ // FIXME: Theoretically this should use an insert point skipping labels
+ // but I'm not sure how labels should interact with prolog instructions
+ // that need reloads.
+ reload(MBB.begin(), LR.VirtReg, PhysReg);
+ } else
+ reload(InsertBefore, LR.VirtReg, PhysReg);
+ }
+ LiveVirtRegs.clear();
+}
+
+/// Handle the direct use of a physical register. Check that the register is
+/// not used by a virtreg. Kill the physreg, marking it free. This may add
+/// implicit kills to MO->getParent() and invalidate MO.
+bool RegAllocFast::usePhysReg(MachineInstr &MI, MCPhysReg Reg) {
+ assert(Register::isPhysicalRegister(Reg) && "expected physreg");
+ bool displacedAny = displacePhysReg(MI, Reg);
+ setPhysRegState(Reg, regPreAssigned);
+ markRegUsedInInstr(Reg);
+ return displacedAny;
+}
+
+bool RegAllocFast::definePhysReg(MachineInstr &MI, MCPhysReg Reg) {
+ bool displacedAny = displacePhysReg(MI, Reg);
+ setPhysRegState(Reg, regPreAssigned);
+ return displacedAny;
+}
+
+/// Mark PhysReg as reserved or free after spilling any virtregs. This is very
+/// similar to defineVirtReg except the physreg is reserved instead of
+/// allocated.
+bool RegAllocFast::displacePhysReg(MachineInstr &MI, MCPhysReg PhysReg) {
+ bool displacedAny = false;
+
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ switch (unsigned VirtReg = RegUnitStates[Unit]) {
+ default: {
+ LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+ assert(LRI != LiveVirtRegs.end() && "datastructures in sync");
+ MachineBasicBlock::iterator ReloadBefore =
+ std::next((MachineBasicBlock::iterator)MI.getIterator());
+ reload(ReloadBefore, VirtReg, LRI->PhysReg);
+
+ setPhysRegState(LRI->PhysReg, regFree);
+ LRI->PhysReg = 0;
+ LRI->Reloaded = true;
+ displacedAny = true;
+ break;
+ }
+ case regPreAssigned:
+ RegUnitStates[Unit] = regFree;
+ displacedAny = true;
+ break;
+ case regFree:
+ break;
+ }
+ }
+ return displacedAny;
+}
+
+void RegAllocFast::freePhysReg(MCPhysReg PhysReg) {
+ LLVM_DEBUG(dbgs() << "Freeing " << printReg(PhysReg, TRI) << ':');
+
+ MCRegister FirstUnit = *TRI->regunits(PhysReg).begin();
+ switch (unsigned VirtReg = RegUnitStates[FirstUnit]) {
+ case regFree:
+ LLVM_DEBUG(dbgs() << '\n');
+ return;
+ case regPreAssigned:
+ LLVM_DEBUG(dbgs() << '\n');
+ setPhysRegState(PhysReg, regFree);
+ return;
+ default: {
+ LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+ assert(LRI != LiveVirtRegs.end());
+ LLVM_DEBUG(dbgs() << ' ' << printReg(LRI->VirtReg, TRI) << '\n');
+ setPhysRegState(LRI->PhysReg, regFree);
+ LRI->PhysReg = 0;
+ }
+ return;
+ }
+}
+
+/// Return the cost of spilling to clear out PhysReg and its aliases so that
+/// the register is free for allocation. Returns 0 when PhysReg is free or
+/// disabled, with all aliases disabled - in that case it can be allocated
+/// directly.
+/// \returns spillImpossible when PhysReg or an alias can't be spilled.
+unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ switch (unsigned VirtReg = RegUnitStates[Unit]) {
+ case regFree:
+ break;
+ case regPreAssigned:
+ LLVM_DEBUG(dbgs() << "Cannot spill pre-assigned "
+ << printReg(PhysReg, TRI) << '\n');
+ return spillImpossible;
+ default: {
+ bool SureSpill = StackSlotForVirtReg[VirtReg] != -1 ||
+ findLiveVirtReg(VirtReg)->LiveOut;
+ return SureSpill ? spillClean : spillDirty;
+ }
+ }
+ }
+ return 0;
+}
+
+void RegAllocFast::assignDanglingDebugValues(MachineInstr &Definition,
+ Register VirtReg, MCPhysReg Reg) {
+ auto UDBGValIter = DanglingDbgValues.find(VirtReg);
+ if (UDBGValIter == DanglingDbgValues.end())
+ return;
+
+ SmallVectorImpl<MachineInstr*> &Dangling = UDBGValIter->second;
+ for (MachineInstr *DbgValue : Dangling) {
+ assert(DbgValue->isDebugValue());
+ if (!DbgValue->hasDebugOperandForReg(VirtReg))
+ continue;
+
+ // Test whether the physreg survives from the definition to the DBG_VALUE.
+ MCPhysReg SetToReg = Reg;
+ unsigned Limit = 20;
+ for (MachineBasicBlock::iterator I = std::next(Definition.getIterator()),
+ E = DbgValue->getIterator(); I != E; ++I) {
+ if (I->modifiesRegister(Reg, TRI) || --Limit == 0) {
+ LLVM_DEBUG(dbgs() << "Register did not survive for " << *DbgValue
+ << '\n');
+ SetToReg = 0;
+ break;
+ }
+ }
+ for (MachineOperand &MO : DbgValue->getDebugOperandsForReg(VirtReg)) {
+ MO.setReg(SetToReg);
+ if (SetToReg != 0)
+ MO.setIsRenamable();
+ }
+ }
+ Dangling.clear();
+}
+
+/// This method updates local state so that we know that PhysReg is the
+/// proper container for VirtReg now. The physical register must not be used
+/// for anything else when this is called.
+void RegAllocFast::assignVirtToPhysReg(MachineInstr &AtMI, LiveReg &LR,
+ MCPhysReg PhysReg) {
+ Register VirtReg = LR.VirtReg;
+ LLVM_DEBUG(dbgs() << "Assigning " << printReg(VirtReg, TRI) << " to "
+ << printReg(PhysReg, TRI) << '\n');
+ assert(LR.PhysReg == 0 && "Already assigned a physreg");
+ assert(PhysReg != 0 && "Trying to assign no register");
+ LR.PhysReg = PhysReg;
+ setPhysRegState(PhysReg, VirtReg);
+
+ assignDanglingDebugValues(AtMI, VirtReg, PhysReg);
+}
+
+static bool isCoalescable(const MachineInstr &MI) {
+ return MI.isFullCopy();
+}
+
+Register RegAllocFast::traceCopyChain(Register Reg) const {
+ static const unsigned ChainLengthLimit = 3;
+ unsigned C = 0;
+ do {
+ if (Reg.isPhysical())
+ return Reg;
+ assert(Reg.isVirtual());
+
+ MachineInstr *VRegDef = MRI->getUniqueVRegDef(Reg);
+ if (!VRegDef || !isCoalescable(*VRegDef))
+ return 0;
+ Reg = VRegDef->getOperand(1).getReg();
+ } while (++C <= ChainLengthLimit);
+ return 0;
+}
+
+/// Check if any of \p VirtReg's definitions is a copy. If it is follow the
+/// chain of copies to check whether we reach a physical register we can
+/// coalesce with.
+Register RegAllocFast::traceCopies(Register VirtReg) const {
+ static const unsigned DefLimit = 3;
+ unsigned C = 0;
+ for (const MachineInstr &MI : MRI->def_instructions(VirtReg)) {
+ if (isCoalescable(MI)) {
+ Register Reg = MI.getOperand(1).getReg();
+ Reg = traceCopyChain(Reg);
+ if (Reg.isValid())
+ return Reg;
+ }
+
+ if (++C >= DefLimit)
+ break;
+ }
+ return Register();
+}
+
+/// Allocates a physical register for VirtReg.
+void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR,
+ Register Hint0, bool LookAtPhysRegUses) {
+ const Register VirtReg = LR.VirtReg;
+ assert(LR.PhysReg == 0);
+
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ LLVM_DEBUG(dbgs() << "Search register for " << printReg(VirtReg)
+ << " in class " << TRI->getRegClassName(&RC)
+ << " with hint " << printReg(Hint0, TRI) << '\n');
+
+ // Take hint when possible.
+ if (Hint0.isPhysical() && MRI->isAllocatable(Hint0) && RC.contains(Hint0) &&
+ !isRegUsedInInstr(Hint0, LookAtPhysRegUses)) {
+ // Take hint if the register is currently free.
+ if (isPhysRegFree(Hint0)) {
+ LLVM_DEBUG(dbgs() << "\tPreferred Register 1: " << printReg(Hint0, TRI)
+ << '\n');
+ assignVirtToPhysReg(MI, LR, Hint0);
+ return;
+ } else {
+ LLVM_DEBUG(dbgs() << "\tPreferred Register 0: " << printReg(Hint0, TRI)
+ << " occupied\n");
+ }
+ } else {
+ Hint0 = Register();
+ }
+
+
+ // Try other hint.
+ Register Hint1 = traceCopies(VirtReg);
+ if (Hint1.isPhysical() && MRI->isAllocatable(Hint1) && RC.contains(Hint1) &&
+ !isRegUsedInInstr(Hint1, LookAtPhysRegUses)) {
+ // Take hint if the register is currently free.
+ if (isPhysRegFree(Hint1)) {
+ LLVM_DEBUG(dbgs() << "\tPreferred Register 0: " << printReg(Hint1, TRI)
+ << '\n');
+ assignVirtToPhysReg(MI, LR, Hint1);
+ return;
+ } else {
+ LLVM_DEBUG(dbgs() << "\tPreferred Register 1: " << printReg(Hint1, TRI)
+ << " occupied\n");
+ }
+ } else {
+ Hint1 = Register();
+ }
+
+ MCPhysReg BestReg = 0;
+ unsigned BestCost = spillImpossible;
+ ArrayRef<MCPhysReg> AllocationOrder = RegClassInfo.getOrder(&RC);
+ for (MCPhysReg PhysReg : AllocationOrder) {
+ LLVM_DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << ' ');
+ if (isRegUsedInInstr(PhysReg, LookAtPhysRegUses)) {
+ LLVM_DEBUG(dbgs() << "already used in instr.\n");
+ continue;
+ }
+
+ unsigned Cost = calcSpillCost(PhysReg);
+ LLVM_DEBUG(dbgs() << "Cost: " << Cost << " BestCost: " << BestCost << '\n');
+ // Immediately take a register with cost 0.
+ if (Cost == 0) {
+ assignVirtToPhysReg(MI, LR, PhysReg);
+ return;
+ }
+
+ if (PhysReg == Hint0 || PhysReg == Hint1)
+ Cost -= spillPrefBonus;
+
+ if (Cost < BestCost) {
+ BestReg = PhysReg;
+ BestCost = Cost;
+ }
+ }
+
+ if (!BestReg) {
+ // Nothing we can do: Report an error and keep going with an invalid
+ // allocation.
+ if (MI.isInlineAsm())
+ MI.emitError("inline assembly requires more registers than available");
+ else
+ MI.emitError("ran out of registers during register allocation");
+
+ LR.Error = true;
+ LR.PhysReg = 0;
+ return;
+ }
+
+ displacePhysReg(MI, BestReg);
+ assignVirtToPhysReg(MI, LR, BestReg);
+}
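A worked reading of the cost arithmetic above, using the constants declared earlier (the scenarios are hypothetical):

    // spillClean = 50, spillDirty = 100, spillPrefBonus = 20, spillImpossible = ~0u
    //   free register                          -> cost 0, taken immediately
    //   hinted register needing a dirty spill  -> 100 - 20 = 80, which still
    //                                             loses to a non-hinted register
    //                                             that only needs a clean
    //                                             re-spill (50)
    //   pre-assigned register                  -> spillImpossible, never chosen;
    //                                             if no candidate is cheaper than
    //                                             that, the error path above runs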
+
+void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) {
+ assert(MO.isUndef() && "expected undef use");
+ Register VirtReg = MO.getReg();
+ assert(VirtReg.isVirtual() && "Expected virtreg");
+ if (!shouldAllocateRegister(VirtReg))
+ return;
+
+ LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
+ MCPhysReg PhysReg;
+ if (LRI != LiveVirtRegs.end() && LRI->PhysReg) {
+ PhysReg = LRI->PhysReg;
+ } else {
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ ArrayRef<MCPhysReg> AllocationOrder = RegClassInfo.getOrder(&RC);
+ assert(!AllocationOrder.empty() && "Allocation order must not be empty");
+ PhysReg = AllocationOrder[0];
+ }
+
+ unsigned SubRegIdx = MO.getSubReg();
+ if (SubRegIdx != 0) {
+ PhysReg = TRI->getSubReg(PhysReg, SubRegIdx);
+ MO.setSubReg(0);
+ }
+ MO.setReg(PhysReg);
+ MO.setIsRenamable(true);
+}
+
+/// Variation of defineVirtReg() with special handling for livethrough regs
+/// (tied or earlyclobber) that may interfere with preassigned uses.
+/// \return true if MI's MachineOperands were re-arranged/invalidated.
+bool RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum,
+ Register VirtReg) {
+ if (!shouldAllocateRegister(VirtReg))
+ return false;
+ LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+ if (LRI != LiveVirtRegs.end()) {
+ MCPhysReg PrevReg = LRI->PhysReg;
+ if (PrevReg != 0 && isRegUsedInInstr(PrevReg, true)) {
+ LLVM_DEBUG(dbgs() << "Need new assignment for " << printReg(PrevReg, TRI)
+ << " (tied/earlyclobber resolution)\n");
+ freePhysReg(PrevReg);
+ LRI->PhysReg = 0;
+ allocVirtReg(MI, *LRI, 0, true);
+ MachineBasicBlock::iterator InsertBefore =
+ std::next((MachineBasicBlock::iterator)MI.getIterator());
+ LLVM_DEBUG(dbgs() << "Copy " << printReg(LRI->PhysReg, TRI) << " to "
+ << printReg(PrevReg, TRI) << '\n');
+ BuildMI(*MBB, InsertBefore, MI.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), PrevReg)
+ .addReg(LRI->PhysReg, llvm::RegState::Kill);
+ }
+ MachineOperand &MO = MI.getOperand(OpNum);
+ if (MO.getSubReg() && !MO.isUndef()) {
+ LRI->LastUse = &MI;
+ }
+ }
+ return defineVirtReg(MI, OpNum, VirtReg, true);
+}
+
+/// Allocates a register for VirtReg definition. Typically the register is
+/// already assigned from a use of the virtreg; however, we still need to
+/// perform an allocation if:
+/// - It is a dead definition without any uses.
+/// - The value is live out and all uses are in different basic blocks.
+///
+/// \return true if MI's MachineOperands were re-arranged/invalidated.
+bool RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
+ Register VirtReg, bool LookAtPhysRegUses) {
+ assert(VirtReg.isVirtual() && "Not a virtual register");
+ if (!shouldAllocateRegister(VirtReg))
+ return false;
+ MachineOperand &MO = MI.getOperand(OpNum);
+ LiveRegMap::iterator LRI;
+ bool New;
+ std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
+ if (New) {
+ if (!MO.isDead()) {
+ if (mayLiveOut(VirtReg)) {
+ LRI->LiveOut = true;
+ } else {
+ // It is a dead def without the dead flag; add the flag now.
+ MO.setIsDead(true);
+ }
+ }
+ }
+ if (LRI->PhysReg == 0)
+ allocVirtReg(MI, *LRI, 0, LookAtPhysRegUses);
+ else {
+ assert(!isRegUsedInInstr(LRI->PhysReg, LookAtPhysRegUses) &&
+ "TODO: preassign mismatch");
+ LLVM_DEBUG(dbgs() << "In def of " << printReg(VirtReg, TRI)
+ << " use existing assignment to "
+ << printReg(LRI->PhysReg, TRI) << '\n');
+ }
+
+ MCPhysReg PhysReg = LRI->PhysReg;
+ assert(PhysReg != 0 && "Register not assigned");
+ if (LRI->Reloaded || LRI->LiveOut) {
+ if (!MI.isImplicitDef()) {
+ MachineBasicBlock::iterator SpillBefore =
+ std::next((MachineBasicBlock::iterator)MI.getIterator());
+ LLVM_DEBUG(dbgs() << "Spill Reason: LO: " << LRI->LiveOut << " RL: "
+ << LRI->Reloaded << '\n');
+ bool Kill = LRI->LastUse == nullptr;
+ spill(SpillBefore, VirtReg, PhysReg, Kill, LRI->LiveOut);
+
+ // We need to place additional spills for each indirect destination of an
+ // INLINEASM_BR.
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) {
+ int FI = StackSlotForVirtReg[VirtReg];
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ for (MachineOperand &MO : MI.operands()) {
+ if (MO.isMBB()) {
+ MachineBasicBlock *Succ = MO.getMBB();
+ TII->storeRegToStackSlot(*Succ, Succ->begin(), PhysReg, Kill,
+ FI, &RC, TRI, VirtReg);
+ ++NumStores;
+ Succ->addLiveIn(PhysReg);
+ }
+ }
+ }
+
+ LRI->LastUse = nullptr;
+ }
+ LRI->LiveOut = false;
+ LRI->Reloaded = false;
+ }
+ if (MI.getOpcode() == TargetOpcode::BUNDLE) {
+ BundleVirtRegsMap[VirtReg] = PhysReg;
+ }
+ markRegUsedInInstr(PhysReg);
+ return setPhysReg(MI, MO, PhysReg);
+}
+
+/// Allocates a register for a VirtReg use.
+/// \return true if MI's MachineOperands were re-arranged/invalidated.
+bool RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum,
+ Register VirtReg) {
+ assert(VirtReg.isVirtual() && "Not a virtual register");
+ if (!shouldAllocateRegister(VirtReg))
+ return false;
+ MachineOperand &MO = MI.getOperand(OpNum);
+ LiveRegMap::iterator LRI;
+ bool New;
+ std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
+ if (New) {
+ MachineOperand &MO = MI.getOperand(OpNum);
+ if (!MO.isKill()) {
+ if (mayLiveOut(VirtReg)) {
+ LRI->LiveOut = true;
+ } else {
+ // It is a last (killing) use without the kill flag; add the flag now.
+ MO.setIsKill(true);
+ }
+ }
+ } else {
+ assert((!MO.isKill() || LRI->LastUse == &MI) && "Invalid kill flag");
+ }
+
+ // If necessary allocate a register.
+ if (LRI->PhysReg == 0) {
+ assert(!MO.isTied() && "tied op should be allocated");
+ Register Hint;
+ if (MI.isCopy() && MI.getOperand(1).getSubReg() == 0) {
+ Hint = MI.getOperand(0).getReg();
+ if (Hint.isVirtual()) {
+ assert(!shouldAllocateRegister(Hint));
+ Hint = Register();
+ } else {
+ assert(Hint.isPhysical() &&
+ "Copy destination should already be assigned");
+ }
+ }
+ allocVirtReg(MI, *LRI, Hint, false);
+ if (LRI->Error) {
+ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+ ArrayRef<MCPhysReg> AllocationOrder = RegClassInfo.getOrder(&RC);
+ return setPhysReg(MI, MO, *AllocationOrder.begin());
+ }
+ }
+
+ LRI->LastUse = &MI;
+
+ if (MI.getOpcode() == TargetOpcode::BUNDLE) {
+ BundleVirtRegsMap[VirtReg] = LRI->PhysReg;
+ }
+ markRegUsedInInstr(LRI->PhysReg);
+ return setPhysReg(MI, MO, LRI->PhysReg);
+}
+
+/// Changes operand OpNum in MI to refer to PhysReg, considering subregs.
+/// \return true if MI's MachineOperands were re-arranged/invalidated.
+bool RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO,
+ MCPhysReg PhysReg) {
+ if (!MO.getSubReg()) {
+ MO.setReg(PhysReg);
+ MO.setIsRenamable(true);
+ return false;
+ }
+
+ // Handle subregister index.
+ MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : MCRegister());
+ MO.setIsRenamable(true);
+ // Note: We leave the subreg number around a little longer in case of defs.
+ // This is so that the register freeing logic in allocateInstruction can still
+ // recognize this as a subregister def. The code there will clear the number.
+ if (!MO.isDef())
+ MO.setSubReg(0);
+
+ // A kill flag implies killing the full register. Add corresponding super
+ // register kill.
+ if (MO.isKill()) {
+ MI.addRegisterKilled(PhysReg, TRI, true);
+ // Conservatively assume implicit MOs were re-arranged
+ return true;
+ }
+
+ // A <def,read-undef> of a sub-register requires an implicit def of the full
+ // register.
+ if (MO.isDef() && MO.isUndef()) {
+ if (MO.isDead())
+ MI.addRegisterDead(PhysReg, TRI, true);
+ else
+ MI.addRegisterDefined(PhysReg, TRI);
+ // Conservatively assume implicit MOs were re-arranged
+ return true;
+ }
+ return false;
+}
+
+#ifndef NDEBUG
+
+void RegAllocFast::dumpState() const {
+ for (unsigned Unit = 1, UnitE = TRI->getNumRegUnits(); Unit != UnitE;
+ ++Unit) {
+ switch (unsigned VirtReg = RegUnitStates[Unit]) {
+ case regFree:
+ break;
+ case regPreAssigned:
+ dbgs() << " " << printRegUnit(Unit, TRI) << "[P]";
+ break;
+ case regLiveIn:
+ llvm_unreachable("Should not have regLiveIn in map");
+ default: {
+ dbgs() << ' ' << printRegUnit(Unit, TRI) << '=' << printReg(VirtReg);
+ LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
+ assert(I != LiveVirtRegs.end() && "have LiveVirtRegs entry");
+ if (I->LiveOut || I->Reloaded) {
+ dbgs() << '[';
+ if (I->LiveOut) dbgs() << 'O';
+ if (I->Reloaded) dbgs() << 'R';
+ dbgs() << ']';
+ }
+ assert(TRI->hasRegUnit(I->PhysReg, Unit) && "inverse mapping present");
+ break;
+ }
+ }
+ }
+ dbgs() << '\n';
+ // Check that LiveVirtRegs is the inverse.
+ for (const LiveReg &LR : LiveVirtRegs) {
+ Register VirtReg = LR.VirtReg;
+ assert(VirtReg.isVirtual() && "Bad map key");
+ MCPhysReg PhysReg = LR.PhysReg;
+ if (PhysReg != 0) {
+ assert(Register::isPhysicalRegister(PhysReg) &&
+ "mapped to physreg");
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ assert(RegUnitStates[Unit] == VirtReg && "inverse map valid");
+ }
+ }
+ }
+}
+#endif
+
+/// Count number of defs consumed from each register class by \p Reg
+void RegAllocFast::addRegClassDefCounts(std::vector<unsigned> &RegClassDefCounts,
+ Register Reg) const {
+ assert(RegClassDefCounts.size() == TRI->getNumRegClasses());
+
+ if (Reg.isVirtual()) {
+ if (!shouldAllocateRegister(Reg))
+ return;
+ const TargetRegisterClass *OpRC = MRI->getRegClass(Reg);
+ for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses();
+ RCIdx != RCIdxEnd; ++RCIdx) {
+ const TargetRegisterClass *IdxRC = TRI->getRegClass(RCIdx);
+ // FIXME: Consider aliasing sub/super registers.
+ if (OpRC->hasSubClassEq(IdxRC))
+ ++RegClassDefCounts[RCIdx];
+ }
+
+ return;
+ }
+
+ for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses();
+ RCIdx != RCIdxEnd; ++RCIdx) {
+ const TargetRegisterClass *IdxRC = TRI->getRegClass(RCIdx);
+ for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
+ if (IdxRC->contains(*Alias)) {
+ ++RegClassDefCounts[RCIdx];
+ break;
+ }
+ }
+ }
+}
+
+/// Compute \ref DefOperandIndexes so it contains the indices of "def" operands
+/// that are to be allocated. Those are ordered in a way that small classes,
+/// early clobbers and livethroughs are allocated first.
+void RegAllocFast::findAndSortDefOperandIndexes(const MachineInstr &MI) {
+ DefOperandIndexes.clear();
+
+ // Track number of defs which may consume a register from the class.
+ std::vector<unsigned> RegClassDefCounts(TRI->getNumRegClasses(), 0);
+ assert(RegClassDefCounts[0] == 0);
+
+ LLVM_DEBUG(dbgs() << "Need to assign livethroughs\n");
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (MO.readsReg()) {
+ if (Reg.isPhysical()) {
+ LLVM_DEBUG(dbgs() << "mark extra used: " << printReg(Reg, TRI) << '\n');
+ markPhysRegUsedInInstr(Reg);
+ }
+ }
+
+ if (MO.isDef()) {
+ if (Reg.isVirtual() && shouldAllocateRegister(Reg))
+ DefOperandIndexes.push_back(I);
+
+ addRegClassDefCounts(RegClassDefCounts, Reg);
+ }
+ }
+
+ llvm::sort(DefOperandIndexes, [&](uint16_t I0, uint16_t I1) {
+ const MachineOperand &MO0 = MI.getOperand(I0);
+ const MachineOperand &MO1 = MI.getOperand(I1);
+ Register Reg0 = MO0.getReg();
+ Register Reg1 = MO1.getReg();
+ const TargetRegisterClass &RC0 = *MRI->getRegClass(Reg0);
+ const TargetRegisterClass &RC1 = *MRI->getRegClass(Reg1);
+
+ // Identify register classes that are easy to use up completely just in this
+ // instruction.
+ unsigned ClassSize0 = RegClassInfo.getOrder(&RC0).size();
+ unsigned ClassSize1 = RegClassInfo.getOrder(&RC1).size();
+
+ bool SmallClass0 = ClassSize0 < RegClassDefCounts[RC0.getID()];
+ bool SmallClass1 = ClassSize1 < RegClassDefCounts[RC1.getID()];
+ if (SmallClass0 > SmallClass1)
+ return true;
+ if (SmallClass0 < SmallClass1)
+ return false;
+
+ // Allocate early clobbers and livethrough operands first.
+ bool Livethrough0 = MO0.isEarlyClobber() || MO0.isTied() ||
+ (MO0.getSubReg() == 0 && !MO0.isUndef());
+ bool Livethrough1 = MO1.isEarlyClobber() || MO1.isTied() ||
+ (MO1.getSubReg() == 0 && !MO1.isUndef());
+ if (Livethrough0 > Livethrough1)
+ return true;
+ if (Livethrough0 < Livethrough1)
+ return false;
+
+ // Tie-break rule: operand index.
+ return I0 < I1;
+ });
+}
+
+void RegAllocFast::allocateInstruction(MachineInstr &MI) {
+ // The basic algorithm here is:
+ // 1. Mark registers of def operands as free
+ // 2. Allocate registers to use operands and place reload instructions for
+ // registers displaced by the allocation.
+ //
+ // However we need to handle some corner cases:
+ // - pre-assigned defs and uses need to be handled before the other def/use
+ // operands are processed to avoid the allocation heuristics clashing with
+ // the pre-assignment.
+ // - The "free def operands" step has to come last instead of first for tied
+ // operands and early-clobbers.
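+ //
+ // For example, a tied def must keep its register occupied until the tied use
+ // has been allocated to the same physreg, and an early-clobber def must stay
+ // blocked until all uses are allocated so that no use ends up in the
+ // clobbered register.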
+
+ UsedInInstr.clear();
+ RegMasks.clear();
+ BundleVirtRegsMap.clear();
+
+ auto TiedOpIsUndef = [&](const MachineOperand &MO, unsigned Idx) {
+ assert(MO.isTied());
+ unsigned TiedIdx = MI.findTiedOperandIdx(Idx);
+ const MachineOperand &TiedMO = MI.getOperand(TiedIdx);
+ return TiedMO.isUndef();
+ };
+ // Scan for special cases; Apply pre-assigned register defs to state.
+ bool HasPhysRegUse = false;
+ bool HasRegMask = false;
+ bool HasVRegDef = false;
+ bool HasDef = false;
+ bool HasEarlyClobber = false;
+ bool NeedToAssignLiveThroughs = false;
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual()) {
+ if (!shouldAllocateRegister(Reg))
+ continue;
+ if (MO.isDef()) {
+ HasDef = true;
+ HasVRegDef = true;
+ if (MO.isEarlyClobber()) {
+ HasEarlyClobber = true;
+ NeedToAssignLiveThroughs = true;
+ }
+ if ((MO.isTied() && !TiedOpIsUndef(MO, I)) ||
+ (MO.getSubReg() != 0 && !MO.isUndef()))
+ NeedToAssignLiveThroughs = true;
+ }
+ } else if (Reg.isPhysical()) {
+ if (!MRI->isReserved(Reg)) {
+ if (MO.isDef()) {
+ HasDef = true;
+ bool displacedAny = definePhysReg(MI, Reg);
+ if (MO.isEarlyClobber())
+ HasEarlyClobber = true;
+ if (!displacedAny)
+ MO.setIsDead(true);
+ }
+ if (MO.readsReg())
+ HasPhysRegUse = true;
+ }
+ }
+ } else if (MO.isRegMask()) {
+ HasRegMask = true;
+ RegMasks.push_back(MO.getRegMask());
+ }
+ }
+
+ // Allocate virtreg defs.
+ if (HasDef) {
+ if (HasVRegDef) {
+ // Note that Implicit MOs can get re-arranged by defineVirtReg(), so loop
+ // multiple times to ensure no operand is missed.
+ bool ReArrangedImplicitOps = true;
+
+ // Special handling for early clobbers, tied operands or subregister defs:
+ // Compared to "normal" defs these:
+ // - Must not use a register that is pre-assigned for a use operand.
+ // - In order to solve tricky inline assembly constraints we change the
+ // heuristic to figure out a good operand order before doing
+ // assignments.
+ if (NeedToAssignLiveThroughs) {
+ PhysRegUses.clear();
+
+ while (ReArrangedImplicitOps) {
+ ReArrangedImplicitOps = false;
+ findAndSortDefOperandIndexes(MI);
+ for (uint16_t OpIdx : DefOperandIndexes) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ LLVM_DEBUG(dbgs() << "Allocating " << MO << '\n');
+ unsigned Reg = MO.getReg();
+ if (MO.isEarlyClobber() ||
+ (MO.isTied() && !TiedOpIsUndef(MO, OpIdx)) ||
+ (MO.getSubReg() && !MO.isUndef())) {
+ ReArrangedImplicitOps = defineLiveThroughVirtReg(MI, OpIdx, Reg);
+ } else {
+ ReArrangedImplicitOps = defineVirtReg(MI, OpIdx, Reg);
+ }
+ if (ReArrangedImplicitOps) {
+ // Implicit operands of MI were re-arranged,
+ // re-compute DefOperandIndexes.
+ break;
+ }
+ }
+ }
+ } else {
+ // Assign virtual register defs.
+ while (ReArrangedImplicitOps) {
+ ReArrangedImplicitOps = false;
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual()) {
+ ReArrangedImplicitOps = defineVirtReg(MI, I, Reg);
+ if (ReArrangedImplicitOps) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Free registers occupied by defs.
+ // Iterate operands in reverse order, so we see the implicit super register
+ // defs first (we added them earlier in case of <def,read-undef>).
+ for (signed I = MI.getNumOperands() - 1; I >= 0; --I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+
+ Register Reg = MO.getReg();
+
+ // subreg defs don't free the full register. We left the subreg number
+ // around as a marker in setPhysReg() to recognize this case here.
+ if (Reg.isPhysical() && MO.getSubReg() != 0) {
+ MO.setSubReg(0);
+ continue;
+ }
+
+ assert((!MO.isTied() || !isClobberedByRegMasks(MO.getReg())) &&
+ "tied def assigned to clobbered register");
+
+ // Do not free tied operands and early clobbers.
+ if ((MO.isTied() && !TiedOpIsUndef(MO, I)) || MO.isEarlyClobber())
+ continue;
+ if (!Reg)
+ continue;
+ if (Reg.isVirtual()) {
+ assert(!shouldAllocateRegister(Reg));
+ continue;
+ }
+ assert(Reg.isPhysical());
+ if (MRI->isReserved(Reg))
+ continue;
+ freePhysReg(Reg);
+ unmarkRegUsedInInstr(Reg);
+ }
+ }
+
+ // Displace clobbered registers.
+ if (HasRegMask) {
+ assert(!RegMasks.empty() && "expected RegMask");
+ // MRI bookkeeping.
+ for (const auto *RM : RegMasks)
+ MRI->addPhysRegsUsedFromRegMask(RM);
+
+ // Displace clobbered registers.
+ for (const LiveReg &LR : LiveVirtRegs) {
+ MCPhysReg PhysReg = LR.PhysReg;
+ if (PhysReg != 0 && isClobberedByRegMasks(PhysReg))
+ displacePhysReg(MI, PhysReg);
+ }
+ }
+
+ // Apply pre-assigned register uses to state.
+ if (HasPhysRegUse) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isPhysical())
+ continue;
+ if (MRI->isReserved(Reg))
+ continue;
+ bool displacedAny = usePhysReg(MI, Reg);
+ if (!displacedAny)
+ MO.setIsKill(true);
+ }
+ }
+
+ // Allocate virtreg uses and insert reloads as necessary.
+ // Implicit MOs can get moved/removed by useVirtReg(), so loop multiple
+ // times to ensure no operand is missed.
+ bool HasUndefUse = false;
+ bool ReArrangedImplicitMOs = true;
+ while (ReArrangedImplicitMOs) {
+ ReArrangedImplicitMOs = false;
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
+ continue;
+
+ if (MO.isUndef()) {
+ HasUndefUse = true;
+ continue;
+ }
+
+ // Populate MayLiveAcrossBlocks in case the use block is allocated before
+ // the def block (removing the vreg uses).
+ mayLiveIn(Reg);
+
+ assert(!MO.isInternalRead() && "Bundles not supported");
+ assert(MO.readsReg() && "reading use");
+ ReArrangedImplicitMOs = useVirtReg(MI, I, Reg);
+ if (ReArrangedImplicitMOs)
+ break;
+ }
+ }
+
+ // Allocate undef operands. This is a separate step because in a situation
+ // like ` = OP undef %X, %X` both operands need to be assigned the same
+ // register, so the normal (non-undef) assignment has to be performed first.
+ if (HasUndefUse) {
+ for (MachineOperand &MO : MI.all_uses()) {
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
+ continue;
+
+ assert(MO.isUndef() && "Should only have undef virtreg uses left");
+ allocVirtRegUndef(MO);
+ }
+ }
+
+ // Free early clobbers.
+ if (HasEarlyClobber) {
+ for (MachineOperand &MO : llvm::reverse(MI.all_defs())) {
+ if (!MO.isEarlyClobber())
+ continue;
+ assert(!MO.getSubReg() && "should be already handled in def processing");
+
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (Reg.isVirtual()) {
+ assert(!shouldAllocateRegister(Reg));
+ continue;
+ }
+ assert(Reg.isPhysical() && "should have register assigned");
+
+ // We sometimes get odd situations like:
+ // early-clobber %x0 = INSTRUCTION %x0
+ // which is semantically questionable as the early-clobber should
+ // apply before the use. But in practice we consider the use to
+ // happen before the early clobber now. Don't free the early clobber
+ // register in this case.
+ if (MI.readsRegister(Reg, TRI))
+ continue;
+
+ freePhysReg(Reg);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "<< " << MI);
+ if (MI.isCopy() && MI.getOperand(0).getReg() == MI.getOperand(1).getReg() &&
+ MI.getNumOperands() == 2) {
+ LLVM_DEBUG(dbgs() << "Mark identity copy for removal\n");
+ Coalesced.push_back(&MI);
+ }
+}
+
+void RegAllocFast::handleDebugValue(MachineInstr &MI) {
+ // Ignore DBG_VALUEs that aren't based on virtual registers. These are
+ // mostly constants and frame indices.
+ for (Register Reg : MI.getUsedDebugRegs()) {
+ if (!Reg.isVirtual())
+ continue;
+ if (!shouldAllocateRegister(Reg))
+ continue;
+
+ // Already spilled to a stackslot?
+ int SS = StackSlotForVirtReg[Reg];
+ if (SS != -1) {
+ // Modify DBG_VALUE now that the value is in a spill slot.
+ updateDbgValueForSpill(MI, SS, Reg);
+ LLVM_DEBUG(dbgs() << "Rewrite DBG_VALUE for spilled memory: " << MI);
+ continue;
+ }
+
+ // See if this virtual register has already been allocated to a physical
+ // register or spilled to a stack slot.
+ LiveRegMap::iterator LRI = findLiveVirtReg(Reg);
+ SmallVector<MachineOperand *> DbgOps;
+ for (MachineOperand &Op : MI.getDebugOperandsForReg(Reg))
+ DbgOps.push_back(&Op);
+
+ if (LRI != LiveVirtRegs.end() && LRI->PhysReg) {
+ // Update every use of Reg within MI.
+ for (auto &RegMO : DbgOps)
+ setPhysReg(MI, *RegMO, LRI->PhysReg);
+ } else {
+ DanglingDbgValues[Reg].push_back(&MI);
+ }
+
+ // If Reg hasn't been spilled, put this DBG_VALUE in LiveDbgValueMap so
+ // that future spills of Reg will have DBG_VALUEs.
+ LiveDbgValueMap[Reg].append(DbgOps.begin(), DbgOps.end());
+ }
+}
+
+void RegAllocFast::handleBundle(MachineInstr &MI) {
+ MachineBasicBlock::instr_iterator BundledMI = MI.getIterator();
+ ++BundledMI;
+ while (BundledMI->isBundledWithPred()) {
+ for (MachineOperand &MO : BundledMI->operands()) {
+ if (!MO.isReg())
+ continue;
+
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
+ continue;
+
+ DenseMap<Register, MCPhysReg>::iterator DI;
+ DI = BundleVirtRegsMap.find(Reg);
+ assert(DI != BundleVirtRegsMap.end() && "Unassigned virtual register");
+
+ setPhysReg(MI, MO, DI->second);
+ }
+
+ ++BundledMI;
+ }
+}
+
+void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
+ this->MBB = &MBB;
+ LLVM_DEBUG(dbgs() << "\nAllocating " << MBB);
+
+ RegUnitStates.assign(TRI->getNumRegUnits(), regFree);
+ assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?");
+
+ for (const auto &LiveReg : MBB.liveouts())
+ setPhysRegState(LiveReg.PhysReg, regPreAssigned);
+
+ Coalesced.clear();
+
+ // Traverse block in reverse order allocating instructions one by one.
+ for (MachineInstr &MI : reverse(MBB)) {
+ LLVM_DEBUG(
+ dbgs() << "\n>> " << MI << "Regs:";
+ dumpState()
+ );
+
+ // Special handling for debug values. Note that they are not allowed to
+ // affect codegen of the other instructions in any way.
+ if (MI.isDebugValue()) {
+ handleDebugValue(MI);
+ continue;
+ }
+
+ allocateInstruction(MI);
+
+ // Once the BUNDLE header has been assigned registers, the same assignments
+ // need to be applied to the bundled MIs.
+ if (MI.getOpcode() == TargetOpcode::BUNDLE) {
+ handleBundle(MI);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Begin Regs:";
+ dumpState()
+ );
+
+ // Insert reloads for the virtual registers that are live at the beginning of
+ // the block (allocation runs in reverse).
+ LLVM_DEBUG(dbgs() << "Loading live registers at begin of block.\n");
+ reloadAtBegin(MBB);
+
+ // Erase all the coalesced copies. We are delaying it until now because
+ // LiveVirtRegs might refer to the instrs.
+ for (MachineInstr *MI : Coalesced)
+ MBB.erase(MI);
+ NumCoalesced += Coalesced.size();
+
+ for (auto &UDBGPair : DanglingDbgValues) {
+ for (MachineInstr *DbgValue : UDBGPair.second) {
+ assert(DbgValue->isDebugValue() && "expected DBG_VALUE");
+ // Nothing to do if the vreg was spilled in the meantime.
+ if (!DbgValue->hasDebugOperandForReg(UDBGPair.first))
+ continue;
+ LLVM_DEBUG(dbgs() << "Register did not survive for " << *DbgValue
+ << '\n');
+ DbgValue->setDebugValueUndef();
+ }
+ }
+ DanglingDbgValues.clear();
+
+ LLVM_DEBUG(MBB.dump());
+}
+
+bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ MRI = &MF.getRegInfo();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ TRI = STI.getRegisterInfo();
+ TII = STI.getInstrInfo();
+ MFI = &MF.getFrameInfo();
+ MRI->freezeReservedRegs(MF);
+ RegClassInfo.runOnMachineFunction(MF);
+ unsigned NumRegUnits = TRI->getNumRegUnits();
+ UsedInInstr.clear();
+ UsedInInstr.setUniverse(NumRegUnits);
+ PhysRegUses.clear();
+ PhysRegUses.setUniverse(NumRegUnits);
+
+ // Initialize the virtual->physical register map to have a 'null'
+ // mapping for all virtual registers.
+ unsigned NumVirtRegs = MRI->getNumVirtRegs();
+ StackSlotForVirtReg.resize(NumVirtRegs);
+ LiveVirtRegs.setUniverse(NumVirtRegs);
+ MayLiveAcrossBlocks.clear();
+ MayLiveAcrossBlocks.resize(NumVirtRegs);
+
+ // Loop over all of the basic blocks, eliminating virtual register references
+ for (MachineBasicBlock &MBB : MF)
+ allocateBasicBlock(MBB);
+
+ if (ClearVirtRegs) {
+ // All machine operands and other references to virtual registers have been
+ // replaced. Remove the virtual registers.
+ MRI->clearVirtRegs();
+ }
+
+ StackSlotForVirtReg.clear();
+ LiveDbgValueMap.clear();
+ return true;
+}
+
+FunctionPass *llvm::createFastRegisterAllocator() {
+ return new RegAllocFast();
+}
+
+FunctionPass *llvm::createFastRegisterAllocator(RegClassFilterFunc Ftor,
+ bool ClearVirtRegs) {
+ return new RegAllocFast(Ftor, ClearVirtRegs);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp
new file mode 100644
index 000000000000..48187e575494
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -0,0 +1,2669 @@
+//===- RegAllocGreedy.cpp - greedy register allocator ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RAGreedy function pass for register allocation in
+// optimized builds.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RegAllocGreedy.h"
+#include "AllocationOrder.h"
+#include "InterferenceCache.h"
+#include "LiveDebugVariables.h"
+#include "RegAllocBase.h"
+#include "RegAllocEvictionAdvisor.h"
+#include "RegAllocPriorityAdvisor.h"
+#include "SpillPlacement.h"
+#include "SplitKit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumGlobalSplits, "Number of split global live ranges");
+STATISTIC(NumLocalSplits, "Number of split local live ranges");
+STATISTIC(NumEvicted, "Number of interferences evicted");
+
+static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
+ "split-spill-mode", cl::Hidden,
+ cl::desc("Spill mode for splitting live ranges"),
+ cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+ clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+ clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed")),
+ cl::init(SplitEditor::SM_Speed));
+
+static cl::opt<unsigned>
+LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
+ cl::desc("Last chance recoloring max depth"),
+ cl::init(5));
+
+static cl::opt<unsigned> LastChanceRecoloringMaxInterference(
+ "lcr-max-interf", cl::Hidden,
+ cl::desc("Last chance recoloring maximum number of considered"
+ " interference at a time"),
+ cl::init(8));
+
+static cl::opt<bool> ExhaustiveSearch(
+ "exhaustive-register-search", cl::NotHidden,
+ cl::desc("Exhaustive Search for registers bypassing the depth "
+ "and interference cutoffs of last chance recoloring"),
+ cl::Hidden);
+
+static cl::opt<bool> EnableDeferredSpilling(
+ "enable-deferred-spilling", cl::Hidden,
+ cl::desc("Instead of spilling a variable right away, defer the actual "
+ "code insertion to the end of the allocation. That way the "
+ "allocator might still find a suitable coloring for this "
+ "variable because of other evicted variables."),
+ cl::init(false));
+
+// FIXME: Find a good default for this flag and remove the flag.
+static cl::opt<unsigned>
+CSRFirstTimeCost("regalloc-csr-first-time-cost",
+ cl::desc("Cost for first time use of callee-saved register."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned long> GrowRegionComplexityBudget(
+ "grow-region-complexity-budget",
+ cl::desc("growRegion() does not scale with the number of BB edges, so "
+ "limit its budget and bail out once we reach the limit."),
+ cl::init(10000), cl::Hidden);
+
+static cl::opt<bool> GreedyRegClassPriorityTrumpsGlobalness(
+ "greedy-regclass-priority-trumps-globalness",
+ cl::desc("Change the greedy register allocator's live range priority "
+ "calculation to make the AllocationPriority of the register class "
+ "more important then whether the range is global"),
+ cl::Hidden);
+
+static cl::opt<bool> GreedyReverseLocalAssignment(
+ "greedy-reverse-local-assignment",
+ cl::desc("Reverse allocation order of local live ranges, such that "
+ "shorter local live ranges will tend to be allocated first"),
+ cl::Hidden);
+
+static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
+ createGreedyRegisterAllocator);
+
+char RAGreedy::ID = 0;
+char &llvm::RAGreedyID = RAGreedy::ID;
+
+INITIALIZE_PASS_BEGIN(RAGreedy, "greedy",
+ "Greedy Register Allocator", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer)
+INITIALIZE_PASS_DEPENDENCY(MachineScheduler)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
+INITIALIZE_PASS_DEPENDENCY(SpillPlacement)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
+INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis)
+INITIALIZE_PASS_DEPENDENCY(RegAllocPriorityAdvisorAnalysis)
+INITIALIZE_PASS_END(RAGreedy, "greedy",
+ "Greedy Register Allocator", false, false)
+
+#ifndef NDEBUG
+const char *const RAGreedy::StageName[] = {
+ "RS_New",
+ "RS_Assign",
+ "RS_Split",
+ "RS_Split2",
+ "RS_Spill",
+ "RS_Memory",
+ "RS_Done"
+};
+#endif
+
+// Hysteresis to use when comparing floats.
+// This helps stabilize decisions based on float comparisons.
+const float Hysteresis = (2007 / 2048.0f); // 0.97998046875
+
+FunctionPass* llvm::createGreedyRegisterAllocator() {
+ return new RAGreedy();
+}
+
+FunctionPass *llvm::createGreedyRegisterAllocator(RegClassFilterFunc Ftor) {
+ return new RAGreedy(Ftor);
+}
+
+RAGreedy::RAGreedy(RegClassFilterFunc F):
+ MachineFunctionPass(ID),
+ RegAllocBase(F) {
+}
+
+void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveDebugVariables>();
+ AU.addPreserved<LiveDebugVariables>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<VirtRegMap>();
+ AU.addPreserved<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<LiveRegMatrix>();
+ AU.addRequired<EdgeBundles>();
+ AU.addRequired<SpillPlacement>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ AU.addRequired<RegAllocEvictionAdvisorAnalysis>();
+ AU.addRequired<RegAllocPriorityAdvisorAnalysis>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+//===----------------------------------------------------------------------===//
+// LiveRangeEdit delegate methods
+//===----------------------------------------------------------------------===//
+
+bool RAGreedy::LRE_CanEraseVirtReg(Register VirtReg) {
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ if (VRM->hasPhys(VirtReg)) {
+ Matrix->unassign(LI);
+ aboutToRemoveInterval(LI);
+ return true;
+ }
+ // Unassigned virtreg is probably in the priority queue.
+ // RegAllocBase will erase it after dequeueing.
+ // Nonetheless, clear the live-range so that the debug
+ // dump will show the right state for that VirtReg.
+ LI.clear();
+ return false;
+}
+
+void RAGreedy::LRE_WillShrinkVirtReg(Register VirtReg) {
+ if (!VRM->hasPhys(VirtReg))
+ return;
+
+ // Register is assigned, put it back on the queue for reassignment.
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ Matrix->unassign(LI);
+ RegAllocBase::enqueue(&LI);
+}
+
+void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) {
+ ExtraInfo->LRE_DidCloneVirtReg(New, Old);
+}
+
+void RAGreedy::ExtraRegInfo::LRE_DidCloneVirtReg(Register New, Register Old) {
+ // Cloning a register we haven't even heard about yet? Just ignore it.
+ if (!Info.inBounds(Old))
+ return;
+
+ // LRE may clone a virtual register because dead code elimination causes it to
+ // be split into connected components. The new components are much smaller
+ // than the original, so they should get a new chance at being assigned;
+ // the clone inherits the same stage as the parent.
+ Info[Old].Stage = RS_Assign;
+ Info.grow(New.id());
+ Info[New] = Info[Old];
+}
+
+void RAGreedy::releaseMemory() {
+ SpillerInstance.reset();
+ GlobalCand.clear();
+}
+
+void RAGreedy::enqueueImpl(const LiveInterval *LI) { enqueue(Queue, LI); }
+
+void RAGreedy::enqueue(PQueue &CurQueue, const LiveInterval *LI) {
+ // Prioritize live ranges by the priority computed by the advisor, assigning
+ // higher-priority ranges first. The queue holds (priority, ~reg) pairs.
+ const Register Reg = LI->reg();
+ assert(Reg.isVirtual() && "Can only enqueue virtual registers");
+
+ auto Stage = ExtraInfo->getOrInitStage(Reg);
+ if (Stage == RS_New) {
+ Stage = RS_Assign;
+ ExtraInfo->setStage(Reg, Stage);
+ }
+
+ unsigned Ret = PriorityAdvisor->getPriority(*LI);
+
+ // The virtual register number is a tie breaker for same-priority ranges.
+ // Give lower vreg numbers higher priority to assign them first.
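+ // Pushing ~Reg means a lower vreg number produces a larger value, so it is
+ // popped first from the max-heap when priorities are equal.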
+ CurQueue.push(std::make_pair(Ret, ~Reg));
+}
+
+unsigned DefaultPriorityAdvisor::getPriority(const LiveInterval &LI) const {
+ const unsigned Size = LI.getSize();
+ const Register Reg = LI.reg();
+ unsigned Prio;
+ LiveRangeStage Stage = RA.getExtraInfo().getStage(LI);
+
+ if (Stage == RS_Split) {
+ // Unsplit ranges that couldn't be allocated immediately are deferred until
+ // everything else has been allocated.
+ Prio = Size;
+ } else if (Stage == RS_Memory) {
+ // Memory operands should be considered last.
+ // Change the priority such that memory operands are assigned in
+ // the reverse order that they came in.
+ // TODO: Make this a member variable and probably do something about hints.
+ static unsigned MemOp = 0;
+ Prio = MemOp++;
+ } else {
+ // Giant live ranges fall back to the global assignment heuristic, which
+ // prevents excessive spilling in pathological cases.
+ const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
+ bool ForceGlobal = RC.GlobalPriority ||
+ (!ReverseLocalAssignment &&
+ (Size / SlotIndex::InstrDist) >
+ (2 * RegClassInfo.getNumAllocatableRegs(&RC)));
+ unsigned GlobalBit = 0;
+
+ if (Stage == RS_Assign && !ForceGlobal && !LI.empty() &&
+ LIS->intervalIsInOneMBB(LI)) {
+ // Allocate original local ranges in linear instruction order. Since they
+ // are singly defined, this produces optimal coloring in the absence of
+ // global interference and other constraints.
+ if (!ReverseLocalAssignment)
+ Prio = LI.beginIndex().getApproxInstrDistance(Indexes->getLastIndex());
+ else {
+ // Allocating bottom up may allow many short live ranges to be assigned first
+ // to one of the cheap registers. This could be much faster for very
+ // large blocks on targets with many physical registers.
+ Prio = Indexes->getZeroIndex().getApproxInstrDistance(LI.endIndex());
+ }
+ } else {
+ // Allocate global and split ranges in long->short order. Long ranges that
+ // don't fit should be spilled (or split) ASAP so they don't create
+ // interference. Mark a bit to prioritize global above local ranges.
+ Prio = Size;
+ GlobalBit = 1;
+ }
+
+ // Priority bit layout:
+ // 31 RS_Assign priority
+ // 30 Preference priority
+ // if (RegClassPriorityTrumpsGlobalness)
+ // 29-25 AllocPriority
+ // 24 GlobalBit
+ // else
+ // 29 Global bit
+ // 28-24 AllocPriority
+ // 0-23 Size/Instr distance
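+ //
+ // For example (hypothetical values, RegClassPriorityTrumpsGlobalness off):
+ // a local RS_Assign range at instruction distance 100, in a class with
+ // AllocationPriority 2 and no register hint, ends up with
+ // Prio = (1u << 31) | (0u << 29) | (2u << 24) | 100.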
+
+ // Clamp the size to fit with the priority masking scheme
+ Prio = std::min(Prio, (unsigned)maxUIntN(24));
+ assert(isUInt<5>(RC.AllocationPriority) && "allocation priority overflow");
+
+ if (RegClassPriorityTrumpsGlobalness)
+ Prio |= RC.AllocationPriority << 25 | GlobalBit << 24;
+ else
+ Prio |= GlobalBit << 29 | RC.AllocationPriority << 24;
+
+ // Mark a higher bit to prioritize global and local above RS_Split.
+ Prio |= (1u << 31);
+
+ // Boost ranges that have a physical register hint.
+ if (VRM->hasKnownPreference(Reg))
+ Prio |= (1u << 30);
+ }
+
+ return Prio;
+}
+
+const LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); }
+
+const LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) {
+ if (CurQueue.empty())
+ return nullptr;
+ LiveInterval *LI = &LIS->getInterval(~CurQueue.top().second);
+ CurQueue.pop();
+ return LI;
+}
+
+//===----------------------------------------------------------------------===//
+// Direct Assignment
+//===----------------------------------------------------------------------===//
+
+/// tryAssign - Try to assign VirtReg to an available register.
+MCRegister RAGreedy::tryAssign(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs,
+ const SmallVirtRegSet &FixedRegisters) {
+ MCRegister PhysReg;
+ for (auto I = Order.begin(), E = Order.end(); I != E && !PhysReg; ++I) {
+ assert(*I);
+ if (!Matrix->checkInterference(VirtReg, *I)) {
+ if (I.isHint())
+ return *I;
+ else
+ PhysReg = *I;
+ }
+ }
+ if (!PhysReg.isValid())
+ return PhysReg;
+
+ // PhysReg is available, but there may be a better choice.
+
+ // If we missed a simple hint, try to cheaply evict interference from the
+ // preferred register.
+ if (Register Hint = MRI->getSimpleHint(VirtReg.reg()))
+ if (Order.isHint(Hint)) {
+ MCRegister PhysHint = Hint.asMCReg();
+ LLVM_DEBUG(dbgs() << "missed hint " << printReg(PhysHint, TRI) << '\n');
+
+ if (EvictAdvisor->canEvictHintInterference(VirtReg, PhysHint,
+ FixedRegisters)) {
+ evictInterference(VirtReg, PhysHint, NewVRegs);
+ return PhysHint;
+ }
+ // Record the missed hint; we may be able to recover
+ // at the end if the surrounding allocation changes.
+ SetOfBrokenHints.insert(&VirtReg);
+ }
+
+ // Try to evict interference from a cheaper alternative.
+ uint8_t Cost = RegCosts[PhysReg];
+
+ // Most registers have 0 additional cost.
+ if (!Cost)
+ return PhysReg;
+
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is available at cost "
+ << (unsigned)Cost << '\n');
+ MCRegister CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost, FixedRegisters);
+ return CheapReg ? CheapReg : PhysReg;
+}
+
+//===----------------------------------------------------------------------===//
+// Interference eviction
+//===----------------------------------------------------------------------===//
+
+bool RegAllocEvictionAdvisor::canReassign(const LiveInterval &VirtReg,
+ MCRegister FromReg) const {
+ auto HasRegUnitInterference = [&](MCRegUnit Unit) {
+ // Instantiate a "subquery", not to be confused with the Queries array.
+ LiveIntervalUnion::Query SubQ(VirtReg, Matrix->getLiveUnions()[Unit]);
+ return SubQ.checkInterference();
+ };
+
+ for (MCRegister Reg :
+ AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix)) {
+ if (Reg == FromReg)
+ continue;
+ // If no units have interference, reassignment is possible.
+ if (none_of(TRI->regunits(Reg), HasRegUnitInterference)) {
+ LLVM_DEBUG(dbgs() << "can reassign: " << VirtReg << " from "
+ << printReg(FromReg, TRI) << " to "
+ << printReg(Reg, TRI) << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+/// evictInterference - Evict any interfering registers that prevent VirtReg
+/// from being assigned to Physreg. This assumes that canEvictInterference
+/// returned true.
+void RAGreedy::evictInterference(const LiveInterval &VirtReg,
+ MCRegister PhysReg,
+ SmallVectorImpl<Register> &NewVRegs) {
+ // Make sure that VirtReg has a cascade number, and assign that cascade
+ // number to every evicted register. These live ranges can then only be
+ // evicted by a newer cascade, preventing infinite loops.
+ unsigned Cascade = ExtraInfo->getOrAssignNewCascade(VirtReg.reg());
+
+ LLVM_DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI)
+ << " interference: Cascade " << Cascade << '\n');
+
+ // Collect all interfering virtregs first.
+ SmallVector<const LiveInterval *, 8> Intfs;
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit);
+ // We usually have the interfering VRegs cached, so collectInterferingVRegs()
+ // should be fast. We may need to recalculate when different physregs overlap
+ // the same register unit, because then different SubRanges were queried
+ // against it.
+ ArrayRef<const LiveInterval *> IVR = Q.interferingVRegs();
+ Intfs.append(IVR.begin(), IVR.end());
+ }
+
+ // Evict them second. This will invalidate the queries.
+ for (const LiveInterval *Intf : Intfs) {
+ // The same VirtReg may be present in multiple RegUnits. Skip duplicates.
+ if (!VRM->hasPhys(Intf->reg()))
+ continue;
+
+ Matrix->unassign(*Intf);
+ assert((ExtraInfo->getCascade(Intf->reg()) < Cascade ||
+ VirtReg.isSpillable() < Intf->isSpillable()) &&
+ "Cannot decrease cascade number, illegal eviction");
+ ExtraInfo->setCascade(Intf->reg(), Cascade);
+ ++NumEvicted;
+ NewVRegs.push_back(Intf->reg());
+ }
+}
+
+/// Returns true if the given \p PhysReg is a callee saved register and has not
+/// been used for allocation yet.
+bool RegAllocEvictionAdvisor::isUnusedCalleeSavedReg(MCRegister PhysReg) const {
+ MCRegister CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg);
+ if (!CSR)
+ return false;
+
+ return !Matrix->isPhysRegUsed(PhysReg);
+}
+
+std::optional<unsigned>
+RegAllocEvictionAdvisor::getOrderLimit(const LiveInterval &VirtReg,
+ const AllocationOrder &Order,
+ unsigned CostPerUseLimit) const {
+ unsigned OrderLimit = Order.getOrder().size();
+
+ if (CostPerUseLimit < uint8_t(~0u)) {
+ // Check if any registers in RC are below CostPerUseLimit.
+ const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg());
+ uint8_t MinCost = RegClassInfo.getMinCost(RC);
+ if (MinCost >= CostPerUseLimit) {
+ LLVM_DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = "
+ << MinCost << ", no cheaper registers to be found.\n");
+ return std::nullopt;
+ }
+
+ // It is normal for register classes to have a long tail of registers with
+ // the same cost. We don't need to look at them if they're too expensive.
+ if (RegCosts[Order.getOrder().back()] >= CostPerUseLimit) {
+ OrderLimit = RegClassInfo.getLastCostChange(RC);
+ LLVM_DEBUG(dbgs() << "Only trying the first " << OrderLimit
+ << " regs.\n");
+ }
+ }
+ return OrderLimit;
+}
+
+bool RegAllocEvictionAdvisor::canAllocatePhysReg(unsigned CostPerUseLimit,
+ MCRegister PhysReg) const {
+ if (RegCosts[PhysReg] >= CostPerUseLimit)
+ return false;
+ // The first use of a callee-saved register in a function has cost 1.
+ // Don't start using a CSR when the CostPerUseLimit is low.
+ if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) {
+ LLVM_DEBUG(
+ dbgs() << printReg(PhysReg, TRI) << " would clobber CSR "
+ << printReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI)
+ << '\n');
+ return false;
+ }
+ return true;
+}
+
+/// tryEvict - Try to evict all interferences for a physreg.
+/// @param VirtReg Currently unassigned virtual register.
+/// @param Order Physregs to try.
+/// @return Physreg to assign VirtReg, or 0.
+MCRegister RAGreedy::tryEvict(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs,
+ uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) {
+ NamedRegionTimer T("evict", "Evict", TimerGroupName, TimerGroupDescription,
+ TimePassesIsEnabled);
+
+ MCRegister BestPhys = EvictAdvisor->tryFindEvictionCandidate(
+ VirtReg, Order, CostPerUseLimit, FixedRegisters);
+ if (BestPhys.isValid())
+ evictInterference(VirtReg, BestPhys, NewVRegs);
+ return BestPhys;
+}
+
+//===----------------------------------------------------------------------===//
+// Region Splitting
+//===----------------------------------------------------------------------===//
+
+/// addSplitConstraints - Fill out the SplitConstraints vector based on the
+/// interference pattern in Physreg and its aliases. Add the constraints to
+/// SpillPlacement and return the static cost of this split in Cost, assuming
+/// that all preferences in SplitConstraints are met.
+/// Return false if there are no bundles with positive bias.
+bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
+ BlockFrequency &Cost) {
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+
+ // Reset interference dependent info.
+ SplitConstraints.resize(UseBlocks.size());
+ BlockFrequency StaticCost = 0;
+ for (unsigned I = 0; I != UseBlocks.size(); ++I) {
+ const SplitAnalysis::BlockInfo &BI = UseBlocks[I];
+ SpillPlacement::BlockConstraint &BC = SplitConstraints[I];
+
+ BC.Number = BI.MBB->getNumber();
+ Intf.moveToBlock(BC.Number);
+ BC.Entry = BI.LiveIn ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
+ BC.Exit = (BI.LiveOut &&
+ !LIS->getInstructionFromIndex(BI.LastInstr)->isImplicitDef())
+ ? SpillPlacement::PrefReg
+ : SpillPlacement::DontCare;
+ BC.ChangesValue = BI.FirstDef.isValid();
+
+ if (!Intf.hasInterference())
+ continue;
+
+ // Number of spill code instructions to insert.
+ unsigned Ins = 0;
+
+ // Interference for the live-in value.
+ if (BI.LiveIn) {
+ if (Intf.first() <= Indexes->getMBBStartIdx(BC.Number)) {
+ BC.Entry = SpillPlacement::MustSpill;
+ ++Ins;
+ } else if (Intf.first() < BI.FirstInstr) {
+ BC.Entry = SpillPlacement::PrefSpill;
+ ++Ins;
+ } else if (Intf.first() < BI.LastInstr) {
+ ++Ins;
+ }
+
+ // Abort if the spill cannot be inserted at the MBB's start.
+ if (((BC.Entry == SpillPlacement::MustSpill) ||
+ (BC.Entry == SpillPlacement::PrefSpill)) &&
+ SlotIndex::isEarlierInstr(BI.FirstInstr,
+ SA->getFirstSplitPoint(BC.Number)))
+ return false;
+ }
+
+ // Interference for the live-out value.
+ if (BI.LiveOut) {
+ if (Intf.last() >= SA->getLastSplitPoint(BC.Number)) {
+ BC.Exit = SpillPlacement::MustSpill;
+ ++Ins;
+ } else if (Intf.last() > BI.LastInstr) {
+ BC.Exit = SpillPlacement::PrefSpill;
+ ++Ins;
+ } else if (Intf.last() > BI.FirstInstr) {
+ ++Ins;
+ }
+ }
+
+ // Accumulate the total frequency of inserted spill code.
+ while (Ins--)
+ StaticCost += SpillPlacer->getBlockFrequency(BC.Number);
+ }
+ Cost = StaticCost;
+
+ // Add constraints for use-blocks. Note that these are the only constraints
+ // that may add a positive bias; it is downhill from here.
+ SpillPlacer->addConstraints(SplitConstraints);
+ return SpillPlacer->scanActiveBundles();
+}
+
+/// addThroughConstraints - Add constraints and links to SpillPlacer from the
+/// live-through blocks in Blocks.
+bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
+ ArrayRef<unsigned> Blocks) {
+ const unsigned GroupSize = 8;
+ SpillPlacement::BlockConstraint BCS[GroupSize];
+ unsigned TBS[GroupSize];
+ unsigned B = 0, T = 0;
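+ // Constraints (BCS) and links (TBS) are flushed to the SpillPlacer in
+ // batches of GroupSize; partially filled batches are flushed after the loop.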
+
+ for (unsigned Number : Blocks) {
+ Intf.moveToBlock(Number);
+
+ if (!Intf.hasInterference()) {
+ assert(T < GroupSize && "Array overflow");
+ TBS[T] = Number;
+ if (++T == GroupSize) {
+ SpillPlacer->addLinks(ArrayRef(TBS, T));
+ T = 0;
+ }
+ continue;
+ }
+
+ assert(B < GroupSize && "Array overflow");
+ BCS[B].Number = Number;
+
+ // Abort if the spill cannot be inserted at the MBB's start.
+ MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
+ auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
+ if (FirstNonDebugInstr != MBB->end() &&
+ SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
+ SA->getFirstSplitPoint(Number)))
+ return false;
+ // Interference for the live-in value.
+ if (Intf.first() <= Indexes->getMBBStartIdx(Number))
+ BCS[B].Entry = SpillPlacement::MustSpill;
+ else
+ BCS[B].Entry = SpillPlacement::PrefSpill;
+
+ // Interference for the live-out value.
+ if (Intf.last() >= SA->getLastSplitPoint(Number))
+ BCS[B].Exit = SpillPlacement::MustSpill;
+ else
+ BCS[B].Exit = SpillPlacement::PrefSpill;
+
+ if (++B == GroupSize) {
+ SpillPlacer->addConstraints(ArrayRef(BCS, B));
+ B = 0;
+ }
+ }
+
+ SpillPlacer->addConstraints(ArrayRef(BCS, B));
+ SpillPlacer->addLinks(ArrayRef(TBS, T));
+ return true;
+}
+
+bool RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
+ // Keep track of through blocks that have not been added to SpillPlacer.
+ BitVector Todo = SA->getThroughBlocks();
+ SmallVectorImpl<unsigned> &ActiveBlocks = Cand.ActiveBlocks;
+ unsigned AddedTo = 0;
+#ifndef NDEBUG
+ unsigned Visited = 0;
+#endif
+
+ unsigned long Budget = GrowRegionComplexityBudget;
+ while (true) {
+ ArrayRef<unsigned> NewBundles = SpillPlacer->getRecentPositive();
+ // Find new through blocks in the periphery of PrefRegBundles.
+ for (unsigned Bundle : NewBundles) {
+ // Look at all blocks connected to Bundle in the full graph.
+ ArrayRef<unsigned> Blocks = Bundles->getBlocks(Bundle);
+ // Limit compilation time by bailing out after we use all our budget.
+ if (Blocks.size() >= Budget)
+ return false;
+ Budget -= Blocks.size();
+ for (unsigned Block : Blocks) {
+ if (!Todo.test(Block))
+ continue;
+ Todo.reset(Block);
+ // This is a new through block. Add it to SpillPlacer later.
+ ActiveBlocks.push_back(Block);
+#ifndef NDEBUG
+ ++Visited;
+#endif
+ }
+ }
+ // Any new blocks to add?
+ if (ActiveBlocks.size() == AddedTo)
+ break;
+
+ // Compute through constraints from the interference, or assume that all
+ // through blocks prefer spilling when forming compact regions.
+ auto NewBlocks = ArrayRef(ActiveBlocks).slice(AddedTo);
+ if (Cand.PhysReg) {
+ if (!addThroughConstraints(Cand.Intf, NewBlocks))
+ return false;
+ } else
+ // Provide a strong negative bias on through blocks to prevent unwanted
+ // liveness on loop backedges.
+ SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true);
+ AddedTo = ActiveBlocks.size();
+
+ // Perhaps iterating can enable more bundles?
+ SpillPlacer->iterate();
+ }
+ LLVM_DEBUG(dbgs() << ", v=" << Visited);
+ return true;
+}
+
+/// calcCompactRegion - Compute the set of edge bundles that should be live
+/// when splitting the current live range into compact regions. Compact
+/// regions can be computed without looking at interference. They are the
+/// regions formed by removing all the live-through blocks from the live range.
+///
+/// Returns false if the current live range is already compact, or if the
+/// compact regions would form single block regions anyway.
+bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
+ // Without any through blocks, the live range is already compact.
+ if (!SA->getNumThroughBlocks())
+ return false;
+
+ // Compact regions don't correspond to any physreg.
+ Cand.reset(IntfCache, MCRegister::NoRegister);
+
+ LLVM_DEBUG(dbgs() << "Compact region bundles");
+
+ // Use the spill placer to determine the live bundles. GrowRegion pretends
+ // that all the through blocks have interference when PhysReg is unset.
+ SpillPlacer->prepare(Cand.LiveBundles);
+
+ // The static split cost will be zero since Cand.Intf reports no interference.
+ BlockFrequency Cost;
+ if (!addSplitConstraints(Cand.Intf, Cost)) {
+ LLVM_DEBUG(dbgs() << ", none.\n");
+ return false;
+ }
+
+ if (!growRegion(Cand)) {
+ LLVM_DEBUG(dbgs() << ", cannot spill all interferences.\n");
+ return false;
+ }
+
+ SpillPlacer->finish();
+
+ if (!Cand.LiveBundles.any()) {
+ LLVM_DEBUG(dbgs() << ", none.\n");
+ return false;
+ }
+
+ LLVM_DEBUG({
+ for (int I : Cand.LiveBundles.set_bits())
+ dbgs() << " EB#" << I;
+ dbgs() << ".\n";
+ });
+ return true;
+}
+
+/// calcSpillCost - Compute how expensive it would be to split the live range in
+/// SA around all use blocks instead of forming bundle regions.
+BlockFrequency RAGreedy::calcSpillCost() {
+ BlockFrequency Cost = 0;
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (const SplitAnalysis::BlockInfo &BI : UseBlocks) {
+ unsigned Number = BI.MBB->getNumber();
+ // We normally only need one spill instruction - a load or a store.
+ Cost += SpillPlacer->getBlockFrequency(Number);
+
+ // Unless the value is redefined in the block.
+ if (BI.LiveIn && BI.LiveOut && BI.FirstDef)
+ Cost += SpillPlacer->getBlockFrequency(Number);
+ }
+ return Cost;
+}
+
+/// calcGlobalSplitCost - Return the global split cost of following the split
+/// pattern in LiveBundles. This cost should be added to the local cost of the
+/// interference pattern in SplitConstraints.
+///
+BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
+ const AllocationOrder &Order) {
+ BlockFrequency GlobalCost = 0;
+ const BitVector &LiveBundles = Cand.LiveBundles;
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (unsigned I = 0; I != UseBlocks.size(); ++I) {
+ const SplitAnalysis::BlockInfo &BI = UseBlocks[I];
+ SpillPlacement::BlockConstraint &BC = SplitConstraints[I];
+ bool RegIn = LiveBundles[Bundles->getBundle(BC.Number, false)];
+ bool RegOut = LiveBundles[Bundles->getBundle(BC.Number, true)];
+ unsigned Ins = 0;
+
+ Cand.Intf.moveToBlock(BC.Number);
+
+ if (BI.LiveIn)
+ Ins += RegIn != (BC.Entry == SpillPlacement::PrefReg);
+ if (BI.LiveOut)
+ Ins += RegOut != (BC.Exit == SpillPlacement::PrefReg);
+ while (Ins--)
+ GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
+ }
+
+ for (unsigned Number : Cand.ActiveBlocks) {
+ bool RegIn = LiveBundles[Bundles->getBundle(Number, false)];
+ bool RegOut = LiveBundles[Bundles->getBundle(Number, true)];
+ if (!RegIn && !RegOut)
+ continue;
+ if (RegIn && RegOut) {
+ // We need double spill code if this block has interference.
+ Cand.Intf.moveToBlock(Number);
+ if (Cand.Intf.hasInterference()) {
+ GlobalCost += SpillPlacer->getBlockFrequency(Number);
+ GlobalCost += SpillPlacer->getBlockFrequency(Number);
+ }
+ continue;
+ }
+ // live-in / stack-out or stack-in live-out.
+ GlobalCost += SpillPlacer->getBlockFrequency(Number);
+ }
+ return GlobalCost;
+}
+
+/// splitAroundRegion - Split the current live range around the regions
+/// determined by BundleCand and GlobalCand.
+///
+/// Before calling this function, GlobalCand and BundleCand must be initialized
+/// so each bundle is assigned to a valid candidate, or NoCand for the
+/// stack-bound bundles. The shared SA/SE SplitAnalysis and SplitEditor
+/// objects must be initialized for the current live range, and intervals
+/// created for the used candidates.
+///
+/// @param LREdit The LiveRangeEdit object handling the current split.
+/// @param UsedCands List of used GlobalCand entries. Every BundleCand value
+/// must appear in this list.
+void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
+ ArrayRef<unsigned> UsedCands) {
+ // These are the intervals created for new global ranges. We may create more
+ // intervals for local ranges.
+ const unsigned NumGlobalIntvs = LREdit.size();
+ LLVM_DEBUG(dbgs() << "splitAroundRegion with " << NumGlobalIntvs
+ << " globals.\n");
+ assert(NumGlobalIntvs && "No global intervals configured");
+
+ // Isolate even single instructions when dealing with a proper sub-class.
+ // That guarantees register class inflation for the stack interval because it
+ // is all copies.
+ Register Reg = SA->getParent().reg();
+ bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
+
+ // First handle all the blocks with uses.
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (const SplitAnalysis::BlockInfo &BI : UseBlocks) {
+ unsigned Number = BI.MBB->getNumber();
+ unsigned IntvIn = 0, IntvOut = 0;
+ SlotIndex IntfIn, IntfOut;
+ if (BI.LiveIn) {
+ unsigned CandIn = BundleCand[Bundles->getBundle(Number, false)];
+ if (CandIn != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandIn];
+ IntvIn = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfIn = Cand.Intf.first();
+ }
+ }
+ if (BI.LiveOut) {
+ unsigned CandOut = BundleCand[Bundles->getBundle(Number, true)];
+ if (CandOut != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandOut];
+ IntvOut = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfOut = Cand.Intf.last();
+ }
+ }
+
+ // Create separate intervals for isolated blocks with multiple uses.
+ if (!IntvIn && !IntvOut) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*BI.MBB) << " isolated.\n");
+ if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
+ SE->splitSingleBlock(BI);
+ continue;
+ }
+
+ if (IntvIn && IntvOut)
+ SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
+ else if (IntvIn)
+ SE->splitRegInBlock(BI, IntvIn, IntfIn);
+ else
+ SE->splitRegOutBlock(BI, IntvOut, IntfOut);
+ }
+
+ // Handle live-through blocks. The relevant live-through blocks are stored in
+ // the ActiveBlocks list with each candidate. We need to filter out
+ // duplicates.
+ BitVector Todo = SA->getThroughBlocks();
+ for (unsigned UsedCand : UsedCands) {
+ ArrayRef<unsigned> Blocks = GlobalCand[UsedCand].ActiveBlocks;
+ for (unsigned Number : Blocks) {
+ if (!Todo.test(Number))
+ continue;
+ Todo.reset(Number);
+
+ unsigned IntvIn = 0, IntvOut = 0;
+ SlotIndex IntfIn, IntfOut;
+
+ unsigned CandIn = BundleCand[Bundles->getBundle(Number, false)];
+ if (CandIn != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandIn];
+ IntvIn = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfIn = Cand.Intf.first();
+ }
+
+ unsigned CandOut = BundleCand[Bundles->getBundle(Number, true)];
+ if (CandOut != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandOut];
+ IntvOut = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfOut = Cand.Intf.last();
+ }
+ if (!IntvIn && !IntvOut)
+ continue;
+ SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
+ }
+ }
+
+ ++NumGlobalSplits;
+
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+ DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
+
+ unsigned OrigBlocks = SA->getNumLiveBlocks();
+
+ // Sort out the new intervals created by splitting. We get four kinds:
+ // - Remainder intervals should not be split again.
+ // - Candidate intervals can be assigned to Cand.PhysReg.
+ // - Block-local splits are candidates for local splitting.
+ // - DCE leftovers should go back on the queue.
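+ //
+ // Remainder intervals are staged RS_Spill below; global intervals that still
+ // cover as many blocks as the original are capped at RS_Split2; everything
+ // else stays RS_New.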
+ for (unsigned I = 0, E = LREdit.size(); I != E; ++I) {
+ const LiveInterval &Reg = LIS->getInterval(LREdit.get(I));
+
+ // Ignore old intervals from DCE.
+ if (ExtraInfo->getOrInitStage(Reg.reg()) != RS_New)
+ continue;
+
+ // Remainder interval. Don't try splitting again, spill if it doesn't
+ // allocate.
+ if (IntvMap[I] == 0) {
+ ExtraInfo->setStage(Reg, RS_Spill);
+ continue;
+ }
+
+ // Global intervals. Allow repeated splitting as long as the number of live
+ // blocks is strictly decreasing.
+ if (IntvMap[I] < NumGlobalIntvs) {
+ if (SA->countLiveBlocks(&Reg) >= OrigBlocks) {
+ LLVM_DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks
+ << " blocks as original.\n");
+ // Don't allow repeated splitting as a safeguard against looping.
+ ExtraInfo->setStage(Reg, RS_Split2);
+ }
+ continue;
+ }
+
+ // Other intervals are treated as new. This includes local intervals created
+ // for blocks with multiple uses, and anything created by DCE.
+ }
+
+ if (VerifyEnabled)
+ MF->verify(this, "After splitting live range around region");
+}
+
+MCRegister RAGreedy::tryRegionSplit(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs) {
+ if (!TRI->shouldRegionSplitForVirtReg(*MF, VirtReg))
+ return MCRegister::NoRegister;
+ unsigned NumCands = 0;
+ BlockFrequency SpillCost = calcSpillCost();
+ BlockFrequency BestCost;
+
+ // Check if we can split this live range around a compact region.
+ bool HasCompact = calcCompactRegion(GlobalCand.front());
+ if (HasCompact) {
+ // Yes, keep GlobalCand[0] as the compact region candidate.
+ NumCands = 1;
+ BestCost = BlockFrequency::getMaxFrequency();
+ } else {
+ // No benefit from the compact region; our fallback will be per-block
+ // splitting. Make sure we find a solution that is cheaper than spilling.
+ BestCost = SpillCost;
+ LLVM_DEBUG(dbgs() << "Cost of isolating all blocks = ";
+ MBFI->printBlockFreq(dbgs(), BestCost) << '\n');
+ }
+
+ unsigned BestCand = calculateRegionSplitCost(VirtReg, Order, BestCost,
+ NumCands, false /*IgnoreCSR*/);
+
+ // No solutions found, fall back to single block splitting.
+ if (!HasCompact && BestCand == NoCand)
+ return MCRegister::NoRegister;
+
+ return doRegionSplit(VirtReg, BestCand, HasCompact, NewVRegs);
+}
+
+unsigned RAGreedy::calculateRegionSplitCost(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ BlockFrequency &BestCost,
+ unsigned &NumCands,
+ bool IgnoreCSR) {
+ unsigned BestCand = NoCand;
+ for (MCPhysReg PhysReg : Order) {
+ assert(PhysReg);
+ if (IgnoreCSR && EvictAdvisor->isUnusedCalleeSavedReg(PhysReg))
+ continue;
+
+ // Discard bad candidates before we run out of interference cache cursors.
+ // This will only affect register classes with a lot of registers (>32).
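+ // The candidate covering the fewest live bundles is treated as the worst
+ // and is discarded to make room for the new candidate.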
+ if (NumCands == IntfCache.getMaxCursors()) {
+ unsigned WorstCount = ~0u;
+ unsigned Worst = 0;
+ for (unsigned CandIndex = 0; CandIndex != NumCands; ++CandIndex) {
+ if (CandIndex == BestCand || !GlobalCand[CandIndex].PhysReg)
+ continue;
+ unsigned Count = GlobalCand[CandIndex].LiveBundles.count();
+ if (Count < WorstCount) {
+ Worst = CandIndex;
+ WorstCount = Count;
+ }
+ }
+ --NumCands;
+ GlobalCand[Worst] = GlobalCand[NumCands];
+ if (BestCand == NumCands)
+ BestCand = Worst;
+ }
+
+ if (GlobalCand.size() <= NumCands)
+ GlobalCand.resize(NumCands+1);
+ GlobalSplitCandidate &Cand = GlobalCand[NumCands];
+ Cand.reset(IntfCache, PhysReg);
+
+ SpillPlacer->prepare(Cand.LiveBundles);
+ BlockFrequency Cost;
+ if (!addSplitConstraints(Cand.Intf, Cost)) {
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tno positive bundles\n");
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << "\tstatic = ";
+ MBFI->printBlockFreq(dbgs(), Cost));
+ if (Cost >= BestCost) {
+ LLVM_DEBUG({
+ if (BestCand == NoCand)
+ dbgs() << " worse than no bundles\n";
+ else
+ dbgs() << " worse than "
+ << printReg(GlobalCand[BestCand].PhysReg, TRI) << '\n';
+ });
+ continue;
+ }
+ if (!growRegion(Cand)) {
+ LLVM_DEBUG(dbgs() << ", cannot spill all interferences.\n");
+ continue;
+ }
+
+ SpillPlacer->finish();
+
+ // No live bundles, defer to splitSingleBlocks().
+ if (!Cand.LiveBundles.any()) {
+ LLVM_DEBUG(dbgs() << " no bundles.\n");
+ continue;
+ }
+
+ Cost += calcGlobalSplitCost(Cand, Order);
+ LLVM_DEBUG({
+ dbgs() << ", total = ";
+ MBFI->printBlockFreq(dbgs(), Cost) << " with bundles";
+ for (int I : Cand.LiveBundles.set_bits())
+ dbgs() << " EB#" << I;
+ dbgs() << ".\n";
+ });
+ if (Cost < BestCost) {
+ BestCand = NumCands;
+ BestCost = Cost;
+ }
+ ++NumCands;
+ }
+
+ return BestCand;
+}
+
+unsigned RAGreedy::doRegionSplit(const LiveInterval &VirtReg, unsigned BestCand,
+ bool HasCompact,
+ SmallVectorImpl<Register> &NewVRegs) {
+ SmallVector<unsigned, 8> UsedCands;
+ // Prepare split editor.
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ SE->reset(LREdit, SplitSpillMode);
+
+ // Assign all edge bundles to the preferred candidate, or NoCand.
+ BundleCand.assign(Bundles->getNumBundles(), NoCand);
+
+ // Assign bundles for the best candidate region.
+ if (BestCand != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[BestCand];
+ if (unsigned B = Cand.getBundles(BundleCand, BestCand)) {
+ UsedCands.push_back(BestCand);
+ Cand.IntvIdx = SE->openIntv();
+ LLVM_DEBUG(dbgs() << "Split for " << printReg(Cand.PhysReg, TRI) << " in "
+ << B << " bundles, intv " << Cand.IntvIdx << ".\n");
+ (void)B;
+ }
+ }
+
+ // Assign bundles for the compact region.
+ if (HasCompact) {
+ GlobalSplitCandidate &Cand = GlobalCand.front();
+ assert(!Cand.PhysReg && "Compact region has no physreg");
+ if (unsigned B = Cand.getBundles(BundleCand, 0)) {
+ UsedCands.push_back(0);
+ Cand.IntvIdx = SE->openIntv();
+ LLVM_DEBUG(dbgs() << "Split for compact region in " << B
+ << " bundles, intv " << Cand.IntvIdx << ".\n");
+ (void)B;
+ }
+ }
+
+ splitAroundRegion(LREdit, UsedCands);
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Per-Block Splitting
+//===----------------------------------------------------------------------===//
+
+/// tryBlockSplit - Split a global live range around every block with uses. This
+/// creates a lot of local live ranges that will be split by tryLocalSplit if
+/// they don't allocate.
+unsigned RAGreedy::tryBlockSplit(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs) {
+ assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
+ Register Reg = VirtReg.reg();
+ bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ SE->reset(LREdit, SplitSpillMode);
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (const SplitAnalysis::BlockInfo &BI : UseBlocks) {
+ if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
+ SE->splitSingleBlock(BI);
+ }
+ // No blocks were split.
+ if (LREdit.empty())
+ return 0;
+
+ // We did split for some blocks.
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+
+ // Tell LiveDebugVariables about the new ranges.
+ DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
+
+ // Sort out the new intervals created by splitting. The remainder interval
+ // goes straight to spilling; the new local ranges get to stay RS_New.
+ for (unsigned I = 0, E = LREdit.size(); I != E; ++I) {
+ const LiveInterval &LI = LIS->getInterval(LREdit.get(I));
+ if (ExtraInfo->getOrInitStage(LI.reg()) == RS_New && IntvMap[I] == 0)
+ ExtraInfo->setStage(LI, RS_Spill);
+ }
+
+ if (VerifyEnabled)
+ MF->verify(this, "After splitting live range around basic blocks");
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Per-Instruction Splitting
+//===----------------------------------------------------------------------===//
+
+/// Get the number of allocatable registers that match the constraints of \p Reg
+/// on \p MI and that are also in \p SuperRC.
+static unsigned getNumAllocatableRegsForConstraints(
+ const MachineInstr *MI, Register Reg, const TargetRegisterClass *SuperRC,
+ const TargetInstrInfo *TII, const TargetRegisterInfo *TRI,
+ const RegisterClassInfo &RCI) {
+ assert(SuperRC && "Invalid register class");
+
+ const TargetRegisterClass *ConstrainedRC =
+ MI->getRegClassConstraintEffectForVReg(Reg, SuperRC, TII, TRI,
+ /* ExploreBundle */ true);
+ if (!ConstrainedRC)
+ return 0;
+ return RCI.getNumAllocatableRegs(ConstrainedRC);
+}
+
+static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const MachineInstr &MI, Register Reg) {
+ LaneBitmask Mask;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg == 0 && MO.isUse()) {
+ Mask |= MRI.getMaxLaneMaskForVReg(Reg);
+ continue;
+ }
+
+ LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+ if (MO.isDef()) {
+ if (!MO.isUndef())
+ Mask |= ~SubRegMask;
+ } else
+ Mask |= SubRegMask;
+ }
+
+ return Mask;
+}
+
+/// Return true if \p MI at \p Use reads a subset of the lanes live in \p
+/// VirtReg.
+static bool readsLaneSubset(const MachineRegisterInfo &MRI,
+ const MachineInstr *MI, const LiveInterval &VirtReg,
+ const TargetRegisterInfo *TRI, SlotIndex Use) {
+ // Early check the common case.
+ if (MI->isCopy() &&
+ MI->getOperand(0).getSubReg() == MI->getOperand(1).getSubReg())
+ return false;
+
+ // FIXME: We're only considering uses, but should we consider defs too?
+ LaneBitmask ReadMask = getInstReadLaneMask(MRI, *TRI, *MI, VirtReg.reg());
+
+ LaneBitmask LiveAtMask;
+ for (const LiveInterval::SubRange &S : VirtReg.subranges()) {
+ if (S.liveAt(Use))
+ LiveAtMask |= S.LaneMask;
+ }
+
+ // If the live lanes aren't different from the lanes used by the instruction,
+ // this doesn't help.
+ return (ReadMask & ~(LiveAtMask & TRI->getCoveringLanes())).any();
+}
+
+/// tryInstructionSplit - Split a live range around individual instructions.
+/// This is normally not worthwhile since the spiller is doing essentially the
+/// same thing. However, when the live range is in a constrained register
+/// class, it may help to insert copies such that parts of the live range can
+/// be moved to a larger register class.
+///
+/// This is similar to spilling to a larger register class.
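+///
+/// Illustrative example (hypothetical register classes, not taken from a
+/// specific target): if a single use constrains the value to a small
+/// subclass while the other uses accept the full class, splitting around
+/// that use lets the remainder of the live range be allocated from the
+/// larger class, at the price of an extra copy.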
+unsigned RAGreedy::tryInstructionSplit(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs) {
+ const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg());
+ // There is no point to this if there are no larger sub-classes.
+
+ bool SplitSubClass = true;
+ if (!RegClassInfo.isProperSubClass(CurRC)) {
+ if (!VirtReg.hasSubRanges())
+ return 0;
+ SplitSubClass = false;
+ }
+
+ // Always enable split spill mode, since we're effectively spilling to a
+ // register.
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ SE->reset(LREdit, SplitEditor::SM_Size);
+
+ ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+ if (Uses.size() <= 1)
+ return 0;
+
+ LLVM_DEBUG(dbgs() << "Split around " << Uses.size()
+ << " individual instrs.\n");
+
+ const TargetRegisterClass *SuperRC =
+ TRI->getLargestLegalSuperClass(CurRC, *MF);
+ unsigned SuperRCNumAllocatableRegs =
+ RegClassInfo.getNumAllocatableRegs(SuperRC);
+ // Split around every non-copy instruction if this split will relax
+ // the constraints on the virtual register.
+ // Otherwise, splitting just inserts uncoalescable copies that do not help
+ // the allocation.
+ for (const SlotIndex Use : Uses) {
+ if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use)) {
+ if (MI->isFullCopy() ||
+ (SplitSubClass &&
+ SuperRCNumAllocatableRegs ==
+ getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC,
+ TII, TRI, RegClassInfo)) ||
+ // TODO: Handle split for subranges with subclass constraints?
+ (!SplitSubClass && VirtReg.hasSubRanges() &&
+ !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use))) {
+ LLVM_DEBUG(dbgs() << " skip:\t" << Use << '\t' << *MI);
+ continue;
+ }
+ }
+ SE->openIntv();
+ SlotIndex SegStart = SE->enterIntvBefore(Use);
+ SlotIndex SegStop = SE->leaveIntvAfter(Use);
+ SE->useIntv(SegStart, SegStop);
+ }
+
+ if (LREdit.empty()) {
+ LLVM_DEBUG(dbgs() << "All uses were copies.\n");
+ return 0;
+ }
+
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+ DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS);
+ // Assign all new registers to RS_Spill. This was the last chance.
+ ExtraInfo->setStage(LREdit.begin(), LREdit.end(), RS_Spill);
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Local Splitting
+//===----------------------------------------------------------------------===//
+
+/// calcGapWeights - Compute the maximum spill weight that needs to be evicted
+/// in order to use PhysReg between two entries in SA->UseSlots.
+///
+/// GapWeight[I] represents the gap between UseSlots[I] and UseSlots[I + 1].
+///
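+/// Illustrative example (hypothetical values): with three use slots A, B, C
+/// there are two gaps, A-B and B-C. An interfering segment with weight w that
+/// overlaps the instruction at B raises both GapWeight[0] and GapWeight[1] to
+/// at least w, because interference overlapping an instruction is counted in
+/// both surrounding gaps.
+///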
+void RAGreedy::calcGapWeights(MCRegister PhysReg,
+ SmallVectorImpl<float> &GapWeight) {
+ assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
+ const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
+ ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+ const unsigned NumGaps = Uses.size()-1;
+
+ // Start and end points for the interference check.
+ SlotIndex StartIdx =
+ BI.LiveIn ? BI.FirstInstr.getBaseIndex() : BI.FirstInstr;
+ SlotIndex StopIdx =
+ BI.LiveOut ? BI.LastInstr.getBoundaryIndex() : BI.LastInstr;
+
+ GapWeight.assign(NumGaps, 0.0f);
+
+ // Add interference from each overlapping register.
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ if (!Matrix->query(const_cast<LiveInterval &>(SA->getParent()), Unit)
+ .checkInterference())
+ continue;
+
+ // We know that VirtReg is a continuous interval from FirstInstr to
+ // LastInstr, so we don't need InterferenceQuery.
+ //
+ // Interference that overlaps an instruction is counted in both gaps
+ // surrounding the instruction. The exception is interference before
+ // StartIdx and after StopIdx.
+ //
+ LiveIntervalUnion::SegmentIter IntI =
+ Matrix->getLiveUnions()[Unit].find(StartIdx);
+ for (unsigned Gap = 0; IntI.valid() && IntI.start() < StopIdx; ++IntI) {
+ // Skip the gaps before IntI.
+ while (Uses[Gap+1].getBoundaryIndex() < IntI.start())
+ if (++Gap == NumGaps)
+ break;
+ if (Gap == NumGaps)
+ break;
+
+ // Update the gaps covered by IntI.
+ const float weight = IntI.value()->weight();
+ for (; Gap != NumGaps; ++Gap) {
+ GapWeight[Gap] = std::max(GapWeight[Gap], weight);
+ if (Uses[Gap+1].getBaseIndex() >= IntI.stop())
+ break;
+ }
+ if (Gap == NumGaps)
+ break;
+ }
+ }
+
+ // Add fixed interference.
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ const LiveRange &LR = LIS->getRegUnit(Unit);
+ LiveRange::const_iterator I = LR.find(StartIdx);
+ LiveRange::const_iterator E = LR.end();
+
+ // Same loop as above. Mark any overlapped gaps as HUGE_VALF.
+ for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) {
+ while (Uses[Gap+1].getBoundaryIndex() < I->start)
+ if (++Gap == NumGaps)
+ break;
+ if (Gap == NumGaps)
+ break;
+
+ for (; Gap != NumGaps; ++Gap) {
+ GapWeight[Gap] = huge_valf;
+ if (Uses[Gap+1].getBaseIndex() >= I->end)
+ break;
+ }
+ if (Gap == NumGaps)
+ break;
+ }
+ }
+}
+
+/// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only
+/// basic block.
+///
+unsigned RAGreedy::tryLocalSplit(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs) {
+ // TODO: the function currently only handles a single UseBlock; it should be
+ // possible to generalize.
+ if (SA->getUseBlocks().size() != 1)
+ return 0;
+
+ const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
+
+ // Note that it is possible to have an interval that is live-in or live-out
+ // while only covering a single block - a phi-def can use undef values from
+ // predecessors, and the block could be a single-block loop.
+ // We don't bother doing anything clever about such a case; we simply assume
+ // that the interval is continuous from FirstInstr to LastInstr. We should
+ // make sure that we don't do anything illegal to such an interval, though.
+
+ ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+ if (Uses.size() <= 2)
+ return 0;
+ const unsigned NumGaps = Uses.size()-1;
+
+ LLVM_DEBUG({
+ dbgs() << "tryLocalSplit: ";
+ for (const auto &Use : Uses)
+ dbgs() << ' ' << Use;
+ dbgs() << '\n';
+ });
+
+ // If VirtReg is live across any register mask operands, compute a list of
+ // gaps with register masks.
+ SmallVector<unsigned, 8> RegMaskGaps;
+ if (Matrix->checkRegMaskInterference(VirtReg)) {
+ // Get regmask slots for the whole block.
+ ArrayRef<SlotIndex> RMS = LIS->getRegMaskSlotsInBlock(BI.MBB->getNumber());
+ LLVM_DEBUG(dbgs() << RMS.size() << " regmasks in block:");
+ // Constrain to VirtReg's live range.
+ unsigned RI =
+ llvm::lower_bound(RMS, Uses.front().getRegSlot()) - RMS.begin();
+ unsigned RE = RMS.size();
+ for (unsigned I = 0; I != NumGaps && RI != RE; ++I) {
+ // Look for Uses[I] <= RMS <= Uses[I + 1].
+ assert(!SlotIndex::isEarlierInstr(RMS[RI], Uses[I]));
+ if (SlotIndex::isEarlierInstr(Uses[I + 1], RMS[RI]))
+ continue;
+ // Skip a regmask on the same instruction as the last use. It doesn't
+ // overlap the live range.
+ if (SlotIndex::isSameInstr(Uses[I + 1], RMS[RI]) && I + 1 == NumGaps)
+ break;
+ LLVM_DEBUG(dbgs() << ' ' << RMS[RI] << ':' << Uses[I] << '-'
+ << Uses[I + 1]);
+ RegMaskGaps.push_back(I);
+ // Advance RI to the next gap. A regmask on one of the uses counts in
+ // both gaps.
+ while (RI != RE && SlotIndex::isEarlierInstr(RMS[RI], Uses[I + 1]))
+ ++RI;
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+ }
+
+ // Since we allow local split results to be split again, there is a risk of
+ // creating infinite loops. It is tempting to require that the new live
+ // ranges have fewer instructions than the original. That would guarantee
+ // convergence, but it is too strict. A live range with 3 instructions can be
+ // split 2+3 (including the COPY), and we want to allow that.
+ //
+ // Instead we use these rules:
+ //
+ // 1. Allow any split for ranges with getStage() < RS_Split2. (Except for the
+ // noop split, of course).
+ // 2. Require progress be made for ranges with getStage() == RS_Split2. All
+ // the new ranges must have fewer instructions than before the split.
+ // 3. New ranges with the same number of instructions are marked RS_Split2,
+ // smaller ranges are marked RS_New.
+ //
+ // These rules allow a 3 -> 2+3 split once, which we need. They also prevent
+ // excessive splitting and infinite loops.
+ //
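+ // Illustrative application of the rules (hypothetical numbers): a range at
+ // RS_Split2 with NumGaps == 3 only accepts splits whose new interval spans
+ // fewer than 3 gaps, while a range below RS_Split2 accepts any non-noop
+ // split; if the chosen split does not reduce the gap count, the new
+ // interval created around the uses is tagged RS_Split2 so the next round
+ // must make progress.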
+ bool ProgressRequired = ExtraInfo->getStage(VirtReg) >= RS_Split2;
+
+ // Best split candidate.
+ unsigned BestBefore = NumGaps;
+ unsigned BestAfter = 0;
+ float BestDiff = 0;
+
+ const float blockFreq =
+ SpillPlacer->getBlockFrequency(BI.MBB->getNumber()).getFrequency() *
+ (1.0f / MBFI->getEntryFreq());
+ SmallVector<float, 8> GapWeight;
+
+ for (MCPhysReg PhysReg : Order) {
+ assert(PhysReg);
+ // Keep track of the largest spill weight that would need to be evicted in
+ // order to make use of PhysReg between UseSlots[I] and UseSlots[I + 1].
+ calcGapWeights(PhysReg, GapWeight);
+
+ // Remove any gaps with regmask clobbers.
+ if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
+ for (unsigned I = 0, E = RegMaskGaps.size(); I != E; ++I)
+ GapWeight[RegMaskGaps[I]] = huge_valf;
+
+ // Try to find the best sequence of gaps to close.
+ // The new spill weight must be larger than any gap interference.
+
+ // We will split before Uses[SplitBefore] and after Uses[SplitAfter].
+ unsigned SplitBefore = 0, SplitAfter = 1;
+
+ // MaxGap should always be max(GapWeight[SplitBefore..SplitAfter-1]).
+ // It is the spill weight that needs to be evicted.
+ float MaxGap = GapWeight[0];
+
+ while (true) {
+ // Live before/after split?
+ const bool LiveBefore = SplitBefore != 0 || BI.LiveIn;
+ const bool LiveAfter = SplitAfter != NumGaps || BI.LiveOut;
+
+ LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << ' ' << Uses[SplitBefore]
+ << '-' << Uses[SplitAfter] << " I=" << MaxGap);
+
+ // Stop before the interval gets so big we wouldn't be making progress.
+ if (!LiveBefore && !LiveAfter) {
+ LLVM_DEBUG(dbgs() << " all\n");
+ break;
+ }
+ // Should the interval be extended or shrunk?
+ bool Shrink = true;
+
+ // How many gaps would the new range have?
+ unsigned NewGaps = LiveBefore + SplitAfter - SplitBefore + LiveAfter;
+
+ // Legally, without causing looping?
+ bool Legal = !ProgressRequired || NewGaps < NumGaps;
+
+ if (Legal && MaxGap < huge_valf) {
+ // Estimate the new spill weight. Each instruction reads or writes the
+ // register. Conservatively assume there are no read-modify-write
+ // instructions.
+ //
+ // Try to guess the size of the new interval.
+ const float EstWeight = normalizeSpillWeight(
+ blockFreq * (NewGaps + 1),
+ Uses[SplitBefore].distance(Uses[SplitAfter]) +
+ (LiveBefore + LiveAfter) * SlotIndex::InstrDist,
+ 1);
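+ // Rough reading of the estimate above (illustrative; assumes
+ // normalizeSpillWeight divides use frequency by interval size): the new
+ // interval would be touched by NewGaps + 1 instructions at roughly
+ // blockFreq each, normalized by its estimated length in slots. The split
+ // is only kept if that weight beats MaxGap with some hysteresis.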
+ // Would this split be possible to allocate?
+ // Never allocate all gaps, we wouldn't be making progress.
+ LLVM_DEBUG(dbgs() << " w=" << EstWeight);
+ if (EstWeight * Hysteresis >= MaxGap) {
+ Shrink = false;
+ float Diff = EstWeight - MaxGap;
+ if (Diff > BestDiff) {
+ LLVM_DEBUG(dbgs() << " (best)");
+ BestDiff = Hysteresis * Diff;
+ BestBefore = SplitBefore;
+ BestAfter = SplitAfter;
+ }
+ }
+ }
+
+ // Try to shrink.
+ if (Shrink) {
+ if (++SplitBefore < SplitAfter) {
+ LLVM_DEBUG(dbgs() << " shrink\n");
+ // Recompute the max when necessary.
+ if (GapWeight[SplitBefore - 1] >= MaxGap) {
+ MaxGap = GapWeight[SplitBefore];
+ for (unsigned I = SplitBefore + 1; I != SplitAfter; ++I)
+ MaxGap = std::max(MaxGap, GapWeight[I]);
+ }
+ continue;
+ }
+ MaxGap = 0;
+ }
+
+ // Try to extend the interval.
+ if (SplitAfter >= NumGaps) {
+ LLVM_DEBUG(dbgs() << " end\n");
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << " extend\n");
+ MaxGap = std::max(MaxGap, GapWeight[SplitAfter++]);
+ }
+ }
+
+ // Didn't find any candidates?
+ if (BestBefore == NumGaps)
+ return 0;
+
+ LLVM_DEBUG(dbgs() << "Best local split range: " << Uses[BestBefore] << '-'
+ << Uses[BestAfter] << ", " << BestDiff << ", "
+ << (BestAfter - BestBefore + 1) << " instrs\n");
+
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ SE->reset(LREdit);
+
+ SE->openIntv();
+ SlotIndex SegStart = SE->enterIntvBefore(Uses[BestBefore]);
+ SlotIndex SegStop = SE->leaveIntvAfter(Uses[BestAfter]);
+ SE->useIntv(SegStart, SegStop);
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+ DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS);
+ // If the new range has the same number of instructions as before, mark it as
+ // RS_Split2 so the next split will be forced to make progress. Otherwise,
+ // leave the new intervals as RS_New so they can compete.
+ bool LiveBefore = BestBefore != 0 || BI.LiveIn;
+ bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
+ unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter;
+ if (NewGaps >= NumGaps) {
+ LLVM_DEBUG(dbgs() << "Tagging non-progress ranges:");
+ assert(!ProgressRequired && "Didn't make progress when it was required.");
+ for (unsigned I = 0, E = IntvMap.size(); I != E; ++I)
+ if (IntvMap[I] == 1) {
+ ExtraInfo->setStage(LIS->getInterval(LREdit.get(I)), RS_Split2);
+ LLVM_DEBUG(dbgs() << ' ' << printReg(LREdit.get(I)));
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+ }
+ ++NumLocalSplits;
+
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Live Range Splitting
+//===----------------------------------------------------------------------===//
+
+/// trySplit - Try to split VirtReg or one of its interferences, making it
+/// assignable.
+/// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
+unsigned RAGreedy::trySplit(const LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs,
+ const SmallVirtRegSet &FixedRegisters) {
+ // Ranges must be Split2 or less.
+ if (ExtraInfo->getStage(VirtReg) >= RS_Spill)
+ return 0;
+
+ // Local intervals are handled separately.
+ if (LIS->intervalIsInOneMBB(VirtReg)) {
+ NamedRegionTimer T("local_split", "Local Splitting", TimerGroupName,
+ TimerGroupDescription, TimePassesIsEnabled);
+ SA->analyze(&VirtReg);
+ Register PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs);
+ if (PhysReg || !NewVRegs.empty())
+ return PhysReg;
+ return tryInstructionSplit(VirtReg, Order, NewVRegs);
+ }
+
+ NamedRegionTimer T("global_split", "Global Splitting", TimerGroupName,
+ TimerGroupDescription, TimePassesIsEnabled);
+
+ SA->analyze(&VirtReg);
+
+ // First try to split around a region spanning multiple blocks. RS_Split2
+ // ranges already made dubious progress with region splitting, so they go
+ // straight to single block splitting.
+ if (ExtraInfo->getStage(VirtReg) < RS_Split2) {
+ MCRegister PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
+ if (PhysReg || !NewVRegs.empty())
+ return PhysReg;
+ }
+
+ // Then isolate blocks.
+ return tryBlockSplit(VirtReg, Order, NewVRegs);
+}
+
+//===----------------------------------------------------------------------===//
+// Last Chance Recoloring
+//===----------------------------------------------------------------------===//
+
+/// Return true if \p reg has any tied def operand.
+static bool hasTiedDef(MachineRegisterInfo *MRI, unsigned reg) {
+ for (const MachineOperand &MO : MRI->def_operands(reg))
+ if (MO.isTied())
+ return true;
+
+ return false;
+}
+
+/// Return true if the existing assignment of \p Intf overlaps, but is not the
+/// same as, \p PhysReg.
+static bool assignedRegPartiallyOverlaps(const TargetRegisterInfo &TRI,
+ const VirtRegMap &VRM,
+ MCRegister PhysReg,
+ const LiveInterval &Intf) {
+ MCRegister AssignedReg = VRM.getPhys(Intf.reg());
+ if (PhysReg == AssignedReg)
+ return false;
+ return TRI.regsOverlap(PhysReg, AssignedReg);
+}
+
+/// mayRecolorAllInterferences - Check if the virtual registers that
+/// interfere with \p VirtReg on \p PhysReg (or one of its aliases) may be
+/// recolored to free \p PhysReg.
+/// When true is returned, \p RecoloringCandidates has been augmented with all
+/// the live intervals that need to be recolored in order to free \p PhysReg
+/// for \p VirtReg.
+/// \p FixedRegisters contains all the virtual registers that cannot be
+/// recolored.
+bool RAGreedy::mayRecolorAllInterferences(
+ MCRegister PhysReg, const LiveInterval &VirtReg,
+ SmallLISet &RecoloringCandidates, const SmallVirtRegSet &FixedRegisters) {
+ const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg());
+
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, Unit);
+ // If there is LastChanceRecoloringMaxInterference or more interferences,
+ // chances are one would not be recolorable.
+ if (Q.interferingVRegs(LastChanceRecoloringMaxInterference).size() >=
+ LastChanceRecoloringMaxInterference &&
+ !ExhaustiveSearch) {
+ LLVM_DEBUG(dbgs() << "Early abort: too many interferences.\n");
+ CutOffInfo |= CO_Interf;
+ return false;
+ }
+ for (const LiveInterval *Intf : reverse(Q.interferingVRegs())) {
+ // If Intf is done and sits on the same register class as VirtReg, it
+ // would not be recolorable as it is in the same state as VirtReg.
+ // However, there are at least two exceptions.
+ //
+ // If VirtReg has tied defs and Intf doesn't, then there is still a
+ // point in examining whether it can be recolored.
+ //
+ // Additionally, if the register class has overlapping tuple members, it
+ // may still be recolorable using a different tuple. This is more likely
+ // if the existing assignment aliases with the candidate.
+ //
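+ // Illustrative case for the second exception (hypothetical tuple
+ // registers): if Intf is assigned a tuple such as R0_R1 while the
+ // candidate PhysReg is the overlapping tuple R1_R2, Intf may still be
+ // movable to a non-overlapping tuple, so it is worth keeping it as a
+ // recoloring candidate.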
+ if (((ExtraInfo->getStage(*Intf) == RS_Done &&
+ MRI->getRegClass(Intf->reg()) == CurRC &&
+ !assignedRegPartiallyOverlaps(*TRI, *VRM, PhysReg, *Intf)) &&
+ !(hasTiedDef(MRI, VirtReg.reg()) &&
+ !hasTiedDef(MRI, Intf->reg()))) ||
+ FixedRegisters.count(Intf->reg())) {
+ LLVM_DEBUG(
+ dbgs() << "Early abort: the interference is not recolorable.\n");
+ return false;
+ }
+ RecoloringCandidates.insert(Intf);
+ }
+ }
+ return true;
+}
+
+/// tryLastChanceRecoloring - Try to assign a color to \p VirtReg by recoloring
+/// its interferences.
+/// Last chance recoloring chooses a color for \p VirtReg and recolors every
+/// virtual register that was using it. The recoloring process may recursively
+/// use the last chance recoloring. Therefore, when a virtual register has been
+/// assigned a color by this mechanism, it is marked as Fixed, i.e., it cannot
+/// be last-chance-recolored again during this recoloring "session".
+/// E.g.,
+/// Let
+/// vA can use {R1, R2 }
+/// vB can use { R2, R3}
+/// vC can use {R1 }
+/// Where vA, vB, and vC cannot be split anymore (they are reloads for
+/// instance) and they all interfere.
+///
+/// vA is assigned R1
+/// vB is assigned R2
+/// vC tries to evict vA but vA is already done.
+/// Regular register allocation fails.
+///
+/// Last chance recoloring kicks in:
+/// vC does as if vA was evicted => vC uses R1.
+/// vC is marked as fixed.
+/// vA needs to find a color.
+/// None are available.
+/// vA cannot evict vC: vC is a fixed virtual register now.
+/// vA does as if vB was evicted => vA uses R2.
+/// vB needs to find a color.
+/// R3 is available.
+/// Recoloring => vC = R1, vA = R2, vB = R3
+///
+/// \p Order defines the preferred allocation order for \p VirtReg.
+/// \p NewRegs will contain any new virtual registers that have been created
+/// (split, spill) during the process and that must be assigned.
+/// \p FixedRegisters contains all the virtual registers that cannot be
+/// recolored.
+///
+/// \p RecolorStack tracks the original assignments of successfully recolored
+/// registers.
+///
+/// \p Depth gives the current depth of the last chance recoloring.
+/// \return a physical register that can be used for VirtReg or ~0u if none
+/// exists.
+unsigned RAGreedy::tryLastChanceRecoloring(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<Register> &NewVRegs,
+ SmallVirtRegSet &FixedRegisters,
+ RecoloringStack &RecolorStack,
+ unsigned Depth) {
+ if (!TRI->shouldUseLastChanceRecoloringForVirtReg(*MF, VirtReg))
+ return ~0u;
+
+ LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n');
+
+ const ssize_t EntryStackSize = RecolorStack.size();
+
+ // Ranges must be Done.
+ assert((ExtraInfo->getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) &&
+ "Last chance recoloring should really be last chance");
+ // Set the max depth to LastChanceRecoloringMaxDepth.
+ // We may want to reconsider that if we end up with too large a search space
+ // for targets with hundreds of registers.
+ // Indeed, in that case we may want to cut the search space earlier.
+ if (Depth >= LastChanceRecoloringMaxDepth && !ExhaustiveSearch) {
+ LLVM_DEBUG(dbgs() << "Abort because max depth has been reached.\n");
+ CutOffInfo |= CO_Depth;
+ return ~0u;
+ }
+
+ // Set of Live intervals that will need to be recolored.
+ SmallLISet RecoloringCandidates;
+
+ // Mark VirtReg as fixed, i.e., it will not be recolored past this point in
+ // this recoloring "session".
+ assert(!FixedRegisters.count(VirtReg.reg()));
+ FixedRegisters.insert(VirtReg.reg());
+ SmallVector<Register, 4> CurrentNewVRegs;
+
+ for (MCRegister PhysReg : Order) {
+ assert(PhysReg.isValid());
+ LLVM_DEBUG(dbgs() << "Try to assign: " << VirtReg << " to "
+ << printReg(PhysReg, TRI) << '\n');
+ RecoloringCandidates.clear();
+ CurrentNewVRegs.clear();
+
+ // It is only possible to recolor virtual register interference.
+ if (Matrix->checkInterference(VirtReg, PhysReg) >
+ LiveRegMatrix::IK_VirtReg) {
+ LLVM_DEBUG(
+ dbgs() << "Some interferences are not with virtual registers.\n");
+
+ continue;
+ }
+
+ // Give up early on this PhysReg if it is obvious we cannot recolor all
+ // the interferences.
+ if (!mayRecolorAllInterferences(PhysReg, VirtReg, RecoloringCandidates,
+ FixedRegisters)) {
+ LLVM_DEBUG(dbgs() << "Some interferences cannot be recolored.\n");
+ continue;
+ }
+
+ // RecoloringCandidates contains all the virtual registers that interfere
+ // with VirtReg on PhysReg (or one of its aliases). Enqueue them for
+ // recoloring and perform the actual recoloring.
+ PQueue RecoloringQueue;
+ for (const LiveInterval *RC : RecoloringCandidates) {
+ Register ItVirtReg = RC->reg();
+ enqueue(RecoloringQueue, RC);
+ assert(VRM->hasPhys(ItVirtReg) &&
+ "Interferences are supposed to be with allocated variables");
+
+ // Record the current allocation.
+ RecolorStack.push_back(std::make_pair(RC, VRM->getPhys(ItVirtReg)));
+
+ // Unassign the interference from the matrix so its register appears free.
+ Matrix->unassign(*RC);
+ }
+
+ // Act as if VirtReg was assigned to PhysReg so that the underlying
+ // recoloring has the right information about the interferences and
+ // available colors.
+ Matrix->assign(VirtReg, PhysReg);
+
+ // Save the current recoloring state.
+ // If we cannot recolor all the interferences, we will have to start again
+ // at this point for the next physical register.
+ SmallVirtRegSet SaveFixedRegisters(FixedRegisters);
+ if (tryRecoloringCandidates(RecoloringQueue, CurrentNewVRegs,
+ FixedRegisters, RecolorStack, Depth)) {
+ // Push the queued vregs into the main queue.
+ for (Register NewVReg : CurrentNewVRegs)
+ NewVRegs.push_back(NewVReg);
+ // Do not mess up the global assignment process.
+ // I.e., VirtReg must be unassigned.
+ Matrix->unassign(VirtReg);
+ return PhysReg;
+ }
+
+ LLVM_DEBUG(dbgs() << "Fail to assign: " << VirtReg << " to "
+ << printReg(PhysReg, TRI) << '\n');
+
+ // The recoloring attempt failed, undo the changes.
+ FixedRegisters = SaveFixedRegisters;
+ Matrix->unassign(VirtReg);
+
+ // For a newly created vreg which is also in RecoloringCandidates,
+ // don't add it to NewVRegs because its physical register will be restored
+ // below. Other vregs in CurrentNewVRegs are created by calling
+ // selectOrSplit and should be added into NewVRegs.
+ for (Register R : CurrentNewVRegs) {
+ if (RecoloringCandidates.count(&LIS->getInterval(R)))
+ continue;
+ NewVRegs.push_back(R);
+ }
+
+ // Roll back our unsuccessful recoloring. Also roll back any successful
+ // recolorings in any recursive recoloring attempts, since it's possible
+ // they would have introduced conflicts with assignments we will be
+ // restoring further up the stack. Perform all unassignments prior to
+ // reassigning, since sub-recolorings may have conflicted with the registers
+ // we are going to restore to their original assignments.
+ for (ssize_t I = RecolorStack.size() - 1; I >= EntryStackSize; --I) {
+ const LiveInterval *LI;
+ MCRegister PhysReg;
+ std::tie(LI, PhysReg) = RecolorStack[I];
+
+ if (VRM->hasPhys(LI->reg()))
+ Matrix->unassign(*LI);
+ }
+
+ for (size_t I = EntryStackSize; I != RecolorStack.size(); ++I) {
+ const LiveInterval *LI;
+ MCRegister PhysReg;
+ std::tie(LI, PhysReg) = RecolorStack[I];
+ if (!LI->empty() && !MRI->reg_nodbg_empty(LI->reg()))
+ Matrix->assign(*LI, PhysReg);
+ }
+
+ // Pop the stack of recoloring attempts.
+ RecolorStack.resize(EntryStackSize);
+ }
+
+ // Last chance recoloring did not work either; give up.
+ return ~0u;
+}
+
+/// tryRecoloringCandidates - Try to assign a new color to every register
+/// in \p RecoloringQueue.
+/// \p NewRegs will contain any new virtual register created during the
+/// recoloring process.
+/// \p FixedRegisters[in/out] contains all the registers that have been
+/// recolored.
+/// \return true if all virtual registers in RecoloringQueue were successfully
+/// recolored, false otherwise.
+bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,
+ SmallVectorImpl<Register> &NewVRegs,
+ SmallVirtRegSet &FixedRegisters,
+ RecoloringStack &RecolorStack,
+ unsigned Depth) {
+ while (!RecoloringQueue.empty()) {
+ const LiveInterval *LI = dequeue(RecoloringQueue);
+ LLVM_DEBUG(dbgs() << "Try to recolor: " << *LI << '\n');
+ MCRegister PhysReg = selectOrSplitImpl(*LI, NewVRegs, FixedRegisters,
+ RecolorStack, Depth + 1);
+ // When splitting happens, the live-range may actually be empty.
+ // In that case, it is okay to continue the recoloring even
+ // if we did not find an alternative color for it. Indeed,
+ // there will not be anything to color for LI in the end.
+ if (PhysReg == ~0u || (!PhysReg && !LI->empty()))
+ return false;
+
+ if (!PhysReg) {
+ assert(LI->empty() && "Only empty live-range do not require a register");
+ LLVM_DEBUG(dbgs() << "Recoloring of " << *LI
+ << " succeeded. Empty LI.\n");
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "Recoloring of " << *LI
+ << " succeeded with: " << printReg(PhysReg, TRI) << '\n');
+
+ Matrix->assign(*LI, PhysReg);
+ FixedRegisters.insert(LI->reg());
+ }
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Main Entry Point
+//===----------------------------------------------------------------------===//
+
+MCRegister RAGreedy::selectOrSplit(const LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &NewVRegs) {
+ CutOffInfo = CO_None;
+ LLVMContext &Ctx = MF->getFunction().getContext();
+ SmallVirtRegSet FixedRegisters;
+ RecoloringStack RecolorStack;
+ MCRegister Reg =
+ selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters, RecolorStack);
+ if (Reg == ~0U && (CutOffInfo != CO_None)) {
+ uint8_t CutOffEncountered = CutOffInfo & (CO_Depth | CO_Interf);
+ if (CutOffEncountered == CO_Depth)
+ Ctx.emitError("register allocation failed: maximum depth for recoloring "
+ "reached. Use -fexhaustive-register-search to skip "
+ "cutoffs");
+ else if (CutOffEncountered == CO_Interf)
+ Ctx.emitError("register allocation failed: maximum interference for "
+ "recoloring reached. Use -fexhaustive-register-search "
+ "to skip cutoffs");
+ else if (CutOffEncountered == (CO_Depth | CO_Interf))
+ Ctx.emitError("register allocation failed: maximum interference and "
+ "depth for recoloring reached. Use "
+ "-fexhaustive-register-search to skip cutoffs");
+ }
+ return Reg;
+}
+
+/// Using a CSR for the first time has a cost because it causes push|pop
+/// to be added to prologue|epilogue. Splitting a cold section of the live
+/// range, or spilling the live range in the cold path, can have lower cost
+/// than using the CSR for the first time. Returns the physical register if
+/// we decide to use the CSR; otherwise returns 0.
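+///
+/// Illustrative trade-off (hypothetical frequencies): if the function entry
+/// is hot but the only part of the live range that would need the CSR sits in
+/// a cold path, the one-time prologue|epilogue push|pop cost (CSRCost) can
+/// exceed the cost of splitting or spilling that cold section, so we prefer
+/// the latter.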
+MCRegister RAGreedy::tryAssignCSRFirstTime(
+ const LiveInterval &VirtReg, AllocationOrder &Order, MCRegister PhysReg,
+ uint8_t &CostPerUseLimit, SmallVectorImpl<Register> &NewVRegs) {
+ if (ExtraInfo->getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) {
+ // We choose spill over using the CSR for the first time if the spill cost
+ // is lower than CSRCost.
+ SA->analyze(&VirtReg);
+ if (calcSpillCost() >= CSRCost)
+ return PhysReg;
+
+ // We are going to spill, set CostPerUseLimit to 1 to make sure that
+ // we will not use a callee-saved register in tryEvict.
+ CostPerUseLimit = 1;
+ return 0;
+ }
+ if (ExtraInfo->getStage(VirtReg) < RS_Split) {
+ // We choose pre-splitting over using the CSR for the first time if
+ // the cost of splitting is lower than CSRCost.
+ SA->analyze(&VirtReg);
+ unsigned NumCands = 0;
+ BlockFrequency BestCost = CSRCost; // Don't modify CSRCost.
+ unsigned BestCand = calculateRegionSplitCost(VirtReg, Order, BestCost,
+ NumCands, true /*IgnoreCSR*/);
+ if (BestCand == NoCand)
+ // Use the CSR if we can't find a region split below CSRCost.
+ return PhysReg;
+
+ // Perform the actual pre-splitting.
+ doRegionSplit(VirtReg, BestCand, false/*HasCompact*/, NewVRegs);
+ return 0;
+ }
+ return PhysReg;
+}
+
+void RAGreedy::aboutToRemoveInterval(const LiveInterval &LI) {
+ // Do not keep invalid information around.
+ SetOfBrokenHints.remove(&LI);
+}
+
+void RAGreedy::initializeCSRCost() {
+ // We use the larger of the command-line option and the value reported by
+ // TRI.
+ CSRCost = BlockFrequency(
+ std::max((unsigned)CSRFirstTimeCost, TRI->getCSRFirstUseCost()));
+ if (!CSRCost.getFrequency())
+ return;
+
+ // Raw cost is relative to Entry == 2^14; scale it appropriately.
+ uint64_t ActualEntry = MBFI->getEntryFreq();
+ if (!ActualEntry) {
+ CSRCost = 0;
+ return;
+ }
+ uint64_t FixedEntry = 1 << 14;
+ if (ActualEntry < FixedEntry)
+ CSRCost *= BranchProbability(ActualEntry, FixedEntry);
+ else if (ActualEntry <= UINT32_MAX)
+ // Invert the fraction and divide.
+ CSRCost /= BranchProbability(FixedEntry, ActualEntry);
+ else
+ // Can't use BranchProbability in general, since it takes 32-bit numbers.
+ CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry);
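+ // Worked example (illustrative numbers): with a raw CSRCost of 100 relative
+ // to the fixed entry frequency 2^14, an actual entry frequency of 2^13
+ // halves the cost (scale by BranchProbability(2^13, 2^14)), while an actual
+ // entry frequency of 2^15 doubles it.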
+}
+
+/// Collect the hint info for \p Reg.
+/// The results are stored into \p Out.
+/// \p Out is not cleared before being populated.
+void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
+ for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
+ if (!Instr.isFullCopy())
+ continue;
+ // Look for the other end of the copy.
+ Register OtherReg = Instr.getOperand(0).getReg();
+ if (OtherReg == Reg) {
+ OtherReg = Instr.getOperand(1).getReg();
+ if (OtherReg == Reg)
+ continue;
+ }
+ // Get the current assignment.
+ MCRegister OtherPhysReg =
+ OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
+ // Push the collected information.
+ Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
+ OtherPhysReg));
+ }
+}
+
+/// Using the given \p List, compute the cost of the broken hints if
+/// \p PhysReg was used.
+/// \return The cost of \p List for \p PhysReg.
+BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
+ MCRegister PhysReg) {
+ BlockFrequency Cost = 0;
+ for (const HintInfo &Info : List) {
+ if (Info.PhysReg != PhysReg)
+ Cost += Info.Freq;
+ }
+ return Cost;
+}
+
+/// Using the register assigned to \p VirtReg, try to recolor
+/// all the live ranges that are copy-related with \p VirtReg.
+/// The recoloring is then propagated to all the live-ranges that have
+/// been recolored and so on, until no more copies can be coalesced or
+/// it is not profitable.
+/// For a given live range, profitability is determined by the sum of the
+/// frequencies of the non-identity copies it would introduce with the old
+/// and new register.
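+///
+/// Illustrative profitability check (hypothetical frequencies): if the copies
+/// that remain non-identity under the current register have total frequency
+/// 10, while under PhysReg the remaining non-identity copies have total
+/// frequency 4, then OldCopiesCost (10) >= NewCopiesCost (4) and the live
+/// range is recolored to PhysReg.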
+void RAGreedy::tryHintRecoloring(const LiveInterval &VirtReg) {
+ // We have a broken hint, check if it is possible to fix it by
+ // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
+ // some register and PhysReg may be available for the other live-ranges.
+ SmallSet<Register, 4> Visited;
+ SmallVector<unsigned, 2> RecoloringCandidates;
+ HintsInfo Info;
+ Register Reg = VirtReg.reg();
+ MCRegister PhysReg = VRM->getPhys(Reg);
+ // Start the recoloring algorithm from the input live-interval, then
+ // it will propagate to the ones that are copy-related with it.
+ Visited.insert(Reg);
+ RecoloringCandidates.push_back(Reg);
+
+ LLVM_DEBUG(dbgs() << "Trying to reconcile hints for: " << printReg(Reg, TRI)
+ << '(' << printReg(PhysReg, TRI) << ")\n");
+
+ do {
+ Reg = RecoloringCandidates.pop_back_val();
+
+ // We cannot recolor physical registers.
+ if (Reg.isPhysical())
+ continue;
+
+ // This may be a skipped class
+ if (!VRM->hasPhys(Reg)) {
+ assert(!ShouldAllocateClass(*TRI, *MRI->getRegClass(Reg)) &&
+ "We have an unallocated variable which should have been handled");
+ continue;
+ }
+
+ // Get the live interval mapped with this virtual register to be able
+ // to check for the interference with the new color.
+ LiveInterval &LI = LIS->getInterval(Reg);
+ MCRegister CurrPhys = VRM->getPhys(Reg);
+ // Check that the new color matches the register class constraints and
+ // that it is free for this live range.
+ if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
+ Matrix->checkInterference(LI, PhysReg)))
+ continue;
+
+ LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << '(' << printReg(CurrPhys, TRI)
+ << ") is recolorable.\n");
+
+ // Gather the hint info.
+ Info.clear();
+ collectHintInfo(Reg, Info);
+ // Check if recoloring the live-range will increase the cost of the
+ // non-identity copies.
+ if (CurrPhys != PhysReg) {
+ LLVM_DEBUG(dbgs() << "Checking profitability:\n");
+ BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
+ BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
+ LLVM_DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
+ << "\nNew Cost: " << NewCopiesCost.getFrequency()
+ << '\n');
+ if (OldCopiesCost < NewCopiesCost) {
+ LLVM_DEBUG(dbgs() << "=> Not profitable.\n");
+ continue;
+ }
+ // At this point, the cost is either cheaper or equal. If it is
+ // equal, we consider this profitable because it may expose
+ // more recoloring opportunities.
+ LLVM_DEBUG(dbgs() << "=> Profitable.\n");
+ // Recolor the live-range.
+ Matrix->unassign(LI);
+ Matrix->assign(LI, PhysReg);
+ }
+ // Push all copy-related live-ranges to keep reconciling the broken
+ // hints.
+ for (const HintInfo &HI : Info) {
+ if (Visited.insert(HI.Reg).second)
+ RecoloringCandidates.push_back(HI.Reg);
+ }
+ } while (!RecoloringCandidates.empty());
+}
+
+/// Try to recolor broken hints.
+/// Broken hints may be repaired by recoloring when an evicted variable
+/// freed up a register for a larger live-range.
+/// Consider the following example:
+/// BB1:
+/// a =
+/// b =
+/// BB2:
+/// ...
+/// = b
+/// = a
+/// Let us assume b gets split:
+/// BB1:
+/// a =
+/// b =
+/// BB2:
+/// c = b
+/// ...
+/// d = c
+/// = d
+/// = a
+/// Because of how the allocation works, b, c, and d may be assigned different
+/// colors. Now, if a gets evicted later:
+/// BB1:
+/// a =
+/// st a, SpillSlot
+/// b =
+/// BB2:
+/// c = b
+/// ...
+/// d = c
+/// = d
+/// e = ld SpillSlot
+/// = e
+/// It is likely that we can assign the same register to b, c, and d,
+/// getting rid of 2 copies.
+void RAGreedy::tryHintsRecoloring() {
+ for (const LiveInterval *LI : SetOfBrokenHints) {
+ assert(LI->reg().isVirtual() &&
+ "Recoloring is possible only for virtual registers");
+ // Some dead defs may be around (e.g., because of debug uses).
+ // Ignore those.
+ if (!VRM->hasPhys(LI->reg()))
+ continue;
+ tryHintRecoloring(*LI);
+ }
+}
+
+MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &NewVRegs,
+ SmallVirtRegSet &FixedRegisters,
+ RecoloringStack &RecolorStack,
+ unsigned Depth) {
+ uint8_t CostPerUseLimit = uint8_t(~0u);
+ // First try assigning a free register.
+ auto Order =
+ AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix);
+ if (MCRegister PhysReg =
+ tryAssign(VirtReg, Order, NewVRegs, FixedRegisters)) {
+ // When NewVRegs is not empty, we may have made decisions such as evicting
+ // a virtual register; go with the earlier decisions and use the physical
+ // register.
+ if (CSRCost.getFrequency() &&
+ EvictAdvisor->isUnusedCalleeSavedReg(PhysReg) && NewVRegs.empty()) {
+ MCRegister CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg,
+ CostPerUseLimit, NewVRegs);
+ if (CSRReg || !NewVRegs.empty())
+ // Return now if we decide to use a CSR or create new vregs due to
+ // pre-splitting.
+ return CSRReg;
+ } else
+ return PhysReg;
+ }
+
+ LiveRangeStage Stage = ExtraInfo->getStage(VirtReg);
+ LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade "
+ << ExtraInfo->getCascade(VirtReg.reg()) << '\n');
+
+ // Try to evict a less worthy live range, but only for ranges from the primary
+ // queue. The RS_Split ranges already failed to do this, and they should not
+ // get a second chance until they have been split.
+ if (Stage != RS_Split)
+ if (Register PhysReg =
+ tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit,
+ FixedRegisters)) {
+ Register Hint = MRI->getSimpleHint(VirtReg.reg());
+ // If VirtReg has a hint and that hint is broken, record this
+ // virtual register as a recoloring candidate for broken hints.
+ // Indeed, since we evicted a variable in its neighborhood it is
+ // likely we can at least partially recolor some of the
+ // copy-related live-ranges.
+ if (Hint && Hint != PhysReg)
+ SetOfBrokenHints.insert(&VirtReg);
+ return PhysReg;
+ }
+
+ assert((NewVRegs.empty() || Depth) && "Cannot append to existing NewVRegs");
+
+ // The first time we see a live range, don't try to split or spill.
+ // Wait until the second time, when all smaller ranges have been allocated.
+ // This gives a better picture of the interference to split around.
+ if (Stage < RS_Split) {
+ ExtraInfo->setStage(VirtReg, RS_Split);
+ LLVM_DEBUG(dbgs() << "wait for second round\n");
+ NewVRegs.push_back(VirtReg.reg());
+ return 0;
+ }
+
+ if (Stage < RS_Spill) {
+ // Try splitting VirtReg or interferences.
+ unsigned NewVRegSizeBefore = NewVRegs.size();
+ Register PhysReg = trySplit(VirtReg, Order, NewVRegs, FixedRegisters);
+ if (PhysReg || (NewVRegs.size() - NewVRegSizeBefore))
+ return PhysReg;
+ }
+
+ // If we couldn't allocate a register from spilling, there is probably some
+ // invalid inline assembly. The base class will report it.
+ if (Stage >= RS_Done || !VirtReg.isSpillable()) {
+ return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters,
+ RecolorStack, Depth);
+ }
+
+ // Finally spill VirtReg itself.
+ if ((EnableDeferredSpilling ||
+ TRI->shouldUseDeferredSpillingForVirtReg(*MF, VirtReg)) &&
+ ExtraInfo->getStage(VirtReg) < RS_Memory) {
+ // TODO: This is experimental and in particular, we do not model
+ // the live range splitting done by spilling correctly.
+ // We would need a deep integration with the spiller to do the
+ // right thing here. Anyway, that is still good for early testing.
+ ExtraInfo->setStage(VirtReg, RS_Memory);
+ LLVM_DEBUG(dbgs() << "Do as if this register is in memory\n");
+ NewVRegs.push_back(VirtReg.reg());
+ } else {
+ NamedRegionTimer T("spill", "Spiller", TimerGroupName,
+ TimerGroupDescription, TimePassesIsEnabled);
+ LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ spiller().spill(LRE);
+ ExtraInfo->setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
+
+ // Tell LiveDebugVariables about the new ranges. Ranges not covered by
+ // the new regs are kept in LDV (still mapping to the old register) until
+ // we rewrite spilled locations in LDV at a later stage.
+ DebugVars->splitRegister(VirtReg.reg(), LRE.regs(), *LIS);
+
+ if (VerifyEnabled)
+ MF->verify(this, "After spilling");
+ }
+
+ // The live virtual register requesting allocation was spilled, so tell
+ // the caller not to allocate anything during this round.
+ return 0;
+}
+
+void RAGreedy::RAGreedyStats::report(MachineOptimizationRemarkMissed &R) {
+ using namespace ore;
+ if (Spills) {
+ R << NV("NumSpills", Spills) << " spills ";
+ R << NV("TotalSpillsCost", SpillsCost) << " total spills cost ";
+ }
+ if (FoldedSpills) {
+ R << NV("NumFoldedSpills", FoldedSpills) << " folded spills ";
+ R << NV("TotalFoldedSpillsCost", FoldedSpillsCost)
+ << " total folded spills cost ";
+ }
+ if (Reloads) {
+ R << NV("NumReloads", Reloads) << " reloads ";
+ R << NV("TotalReloadsCost", ReloadsCost) << " total reloads cost ";
+ }
+ if (FoldedReloads) {
+ R << NV("NumFoldedReloads", FoldedReloads) << " folded reloads ";
+ R << NV("TotalFoldedReloadsCost", FoldedReloadsCost)
+ << " total folded reloads cost ";
+ }
+ if (ZeroCostFoldedReloads)
+ R << NV("NumZeroCostFoldedReloads", ZeroCostFoldedReloads)
+ << " zero cost folded reloads ";
+ if (Copies) {
+ R << NV("NumVRCopies", Copies) << " virtual registers copies ";
+ R << NV("TotalCopiesCost", CopiesCost) << " total copies cost ";
+ }
+}
+
+RAGreedy::RAGreedyStats RAGreedy::computeStats(MachineBasicBlock &MBB) {
+ RAGreedyStats Stats;
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ int FI;
+
+ auto isSpillSlotAccess = [&MFI](const MachineMemOperand *A) {
+ return MFI.isSpillSlotObjectIndex(cast<FixedStackPseudoSourceValue>(
+ A->getPseudoValue())->getFrameIndex());
+ };
+ auto isPatchpointInstr = [](const MachineInstr &MI) {
+ return MI.getOpcode() == TargetOpcode::PATCHPOINT ||
+ MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::STATEPOINT;
+ };
+ for (MachineInstr &MI : MBB) {
+ if (MI.isCopy()) {
+ const MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Src = MI.getOperand(1);
+ Register SrcReg = Src.getReg();
+ Register DestReg = Dest.getReg();
+ // Only count `COPY`s with a virtual register as source or destination.
+ if (SrcReg.isVirtual() || DestReg.isVirtual()) {
+ if (SrcReg.isVirtual()) {
+ SrcReg = VRM->getPhys(SrcReg);
+ if (SrcReg && Src.getSubReg())
+ SrcReg = TRI->getSubReg(SrcReg, Src.getSubReg());
+ }
+ if (DestReg.isVirtual()) {
+ DestReg = VRM->getPhys(DestReg);
+ if (DestReg && Dest.getSubReg())
+ DestReg = TRI->getSubReg(DestReg, Dest.getSubReg());
+ }
+ if (SrcReg != DestReg)
+ ++Stats.Copies;
+ }
+ continue;
+ }
+
+ SmallVector<const MachineMemOperand *, 2> Accesses;
+ if (TII->isLoadFromStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI)) {
+ ++Stats.Reloads;
+ continue;
+ }
+ if (TII->isStoreToStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI)) {
+ ++Stats.Spills;
+ continue;
+ }
+ if (TII->hasLoadFromStackSlot(MI, Accesses) &&
+ llvm::any_of(Accesses, isSpillSlotAccess)) {
+ if (!isPatchpointInstr(MI)) {
+ Stats.FoldedReloads += Accesses.size();
+ continue;
+ }
+ // For statepoint there may be folded and zero cost folded stack reloads.
+ std::pair<unsigned, unsigned> NonZeroCostRange =
+ TII->getPatchpointUnfoldableRange(MI);
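+ // Illustrative split (an assumption about statepoint operand layout):
+ // frame-index operands inside NonZeroCostRange must be materialized, so
+ // their folded reloads are counted as real; frame-index operands outside it
+ // (e.g. deopt state) are treated as zero-cost folded reloads unless the same
+ // slot also appears inside the range.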
+ SmallSet<unsigned, 16> FoldedReloads;
+ SmallSet<unsigned, 16> ZeroCostFoldedReloads;
+ for (unsigned Idx = 0, E = MI.getNumOperands(); Idx < E; ++Idx) {
+ MachineOperand &MO = MI.getOperand(Idx);
+ if (!MO.isFI() || !MFI.isSpillSlotObjectIndex(MO.getIndex()))
+ continue;
+ if (Idx >= NonZeroCostRange.first && Idx < NonZeroCostRange.second)
+ FoldedReloads.insert(MO.getIndex());
+ else
+ ZeroCostFoldedReloads.insert(MO.getIndex());
+ }
+ // If a stack slot is used in a folded reload, it is not zero cost.
+ for (unsigned Slot : FoldedReloads)
+ ZeroCostFoldedReloads.erase(Slot);
+ Stats.FoldedReloads += FoldedReloads.size();
+ Stats.ZeroCostFoldedReloads += ZeroCostFoldedReloads.size();
+ continue;
+ }
+ Accesses.clear();
+ if (TII->hasStoreToStackSlot(MI, Accesses) &&
+ llvm::any_of(Accesses, isSpillSlotAccess)) {
+ Stats.FoldedSpills += Accesses.size();
+ }
+ }
+ // Weight the collected statistics by the relative frequency of this basic
+ // block.
+ float RelFreq = MBFI->getBlockFreqRelativeToEntryBlock(&MBB);
+ Stats.ReloadsCost = RelFreq * Stats.Reloads;
+ Stats.FoldedReloadsCost = RelFreq * Stats.FoldedReloads;
+ Stats.SpillsCost = RelFreq * Stats.Spills;
+ Stats.FoldedSpillsCost = RelFreq * Stats.FoldedSpills;
+ Stats.CopiesCost = RelFreq * Stats.Copies;
+ return Stats;
+}
+
+RAGreedy::RAGreedyStats RAGreedy::reportStats(MachineLoop *L) {
+ RAGreedyStats Stats;
+
+ // Sum up the spills and reloads in subloops.
+ for (MachineLoop *SubLoop : *L)
+ Stats.add(reportStats(SubLoop));
+
+ for (MachineBasicBlock *MBB : L->getBlocks())
+ // Handle blocks that were not included in subloops.
+ if (Loops->getLoopFor(MBB) == L)
+ Stats.add(computeStats(*MBB));
+
+ if (!Stats.isEmpty()) {
+ using namespace ore;
+
+ ORE->emit([&]() {
+ MachineOptimizationRemarkMissed R(DEBUG_TYPE, "LoopSpillReloadCopies",
+ L->getStartLoc(), L->getHeader());
+ Stats.report(R);
+ R << "generated in loop";
+ return R;
+ });
+ }
+ return Stats;
+}
+
+void RAGreedy::reportStats() {
+ if (!ORE->allowExtraAnalysis(DEBUG_TYPE))
+ return;
+ RAGreedyStats Stats;
+ for (MachineLoop *L : *Loops)
+ Stats.add(reportStats(L));
+ // Process non-loop blocks.
+ for (MachineBasicBlock &MBB : *MF)
+ if (!Loops->getLoopFor(&MBB))
+ Stats.add(computeStats(MBB));
+ if (!Stats.isEmpty()) {
+ using namespace ore;
+
+ ORE->emit([&]() {
+ DebugLoc Loc;
+ if (auto *SP = MF->getFunction().getSubprogram())
+ Loc = DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
+ MachineOptimizationRemarkMissed R(DEBUG_TYPE, "SpillReloadCopies", Loc,
+ &MF->front());
+ Stats.report(R);
+ R << "generated in function";
+ return R;
+ });
+ }
+}
+
+bool RAGreedy::hasVirtRegAlloc() {
+ for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ if (!RC)
+ continue;
+ if (ShouldAllocateClass(*TRI, *RC))
+ return true;
+ }
+
+ return false;
+}
+
+bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
+ LLVM_DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
+ << "********** Function: " << mf.getName() << '\n');
+
+ MF = &mf;
+ TII = MF->getSubtarget().getInstrInfo();
+
+ if (VerifyEnabled)
+ MF->verify(this, "Before greedy register allocator");
+
+ RegAllocBase::init(getAnalysis<VirtRegMap>(),
+ getAnalysis<LiveIntervals>(),
+ getAnalysis<LiveRegMatrix>());
+
+ // Early return if there is no virtual register to be allocated to a
+ // physical register.
+ if (!hasVirtRegAlloc())
+ return false;
+
+ Indexes = &getAnalysis<SlotIndexes>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ Bundles = &getAnalysis<EdgeBundles>();
+ SpillPlacer = &getAnalysis<SpillPlacement>();
+ DebugVars = &getAnalysis<LiveDebugVariables>();
+
+ initializeCSRCost();
+
+ RegCosts = TRI->getRegisterCosts(*MF);
+ RegClassPriorityTrumpsGlobalness =
+ GreedyRegClassPriorityTrumpsGlobalness.getNumOccurrences()
+ ? GreedyRegClassPriorityTrumpsGlobalness
+ : TRI->regClassPriorityTrumpsGlobalness(*MF);
+
+ ReverseLocalAssignment = GreedyReverseLocalAssignment.getNumOccurrences()
+ ? GreedyReverseLocalAssignment
+ : TRI->reverseLocalAssignment();
+
+ ExtraInfo.emplace();
+ EvictAdvisor =
+ getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor(*MF, *this);
+ PriorityAdvisor =
+ getAnalysis<RegAllocPriorityAdvisorAnalysis>().getAdvisor(*MF, *this);
+
+ VRAI = std::make_unique<VirtRegAuxInfo>(*MF, *LIS, *VRM, *Loops, *MBFI);
+ SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM, *VRAI));
+
+ VRAI->calculateSpillWeightsAndHints();
+
+ LLVM_DEBUG(LIS->dump());
+
+ SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
+ SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree, *MBFI, *VRAI));
+
+ IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
+ GlobalCand.resize(32); // This will grow as needed.
+ SetOfBrokenHints.clear();
+
+ allocatePhysRegs();
+ tryHintsRecoloring();
+
+ if (VerifyEnabled)
+ MF->verify(this, "Before post optimization");
+ postOptimization();
+ reportStats();
+
+ releaseMemory();
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.h b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.h
new file mode 100644
index 000000000000..0f8f9a7d5811
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -0,0 +1,447 @@
+//==- RegAllocGreedy.h ------- greedy register allocator ----------*-C++-*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file defines the RAGreedy function pass for register allocation in
+// optimized builds.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCGREEDY_H_
+#define LLVM_CODEGEN_REGALLOCGREEDY_H_
+
+#include "InterferenceCache.h"
+#include "RegAllocBase.h"
+#include "RegAllocEvictionAdvisor.h"
+#include "RegAllocPriorityAdvisor.h"
+#include "SpillPlacement.h"
+#include "SplitKit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <queue>
+#include <utility>
+
+namespace llvm {
+class AllocationOrder;
+class AnalysisUsage;
+class EdgeBundles;
+class LiveDebugVariables;
+class LiveIntervals;
+class LiveRegMatrix;
+class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
+class MachineDominatorTree;
+class MachineLoop;
+class MachineLoopInfo;
+class MachineOptimizationRemarkEmitter;
+class MachineOptimizationRemarkMissed;
+class SlotIndexes;
+class TargetInstrInfo;
+class VirtRegMap;
+
+class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
+ public RegAllocBase,
+ private LiveRangeEdit::Delegate {
+ // Interface to eviction advisers
+public:
+ /// Track allocation stage and eviction loop prevention during allocation.
+ class ExtraRegInfo final {
+ // RegInfo - Keep additional information about each live range.
+ struct RegInfo {
+ LiveRangeStage Stage = RS_New;
+
+ // Cascade - Eviction loop prevention. See
+ // canEvictInterferenceBasedOnCost().
+ unsigned Cascade = 0;
+
+ RegInfo() = default;
+ };
+
+ IndexedMap<RegInfo, VirtReg2IndexFunctor> Info;
+ unsigned NextCascade = 1;
+
+ public:
+ ExtraRegInfo() {}
+ ExtraRegInfo(const ExtraRegInfo &) = delete;
+
+ LiveRangeStage getStage(Register Reg) const { return Info[Reg].Stage; }
+
+ LiveRangeStage getStage(const LiveInterval &VirtReg) const {
+ return getStage(VirtReg.reg());
+ }
+
+ void setStage(Register Reg, LiveRangeStage Stage) {
+ Info.grow(Reg.id());
+ Info[Reg].Stage = Stage;
+ }
+
+ void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) {
+ setStage(VirtReg.reg(), Stage);
+ }
+
+ /// Return the current stage of the register, if present, otherwise
+ /// initialize it and return that.
+ LiveRangeStage getOrInitStage(Register Reg) {
+ Info.grow(Reg.id());
+ return getStage(Reg);
+ }
+
+ unsigned getCascade(Register Reg) const { return Info[Reg].Cascade; }
+
+ void setCascade(Register Reg, unsigned Cascade) {
+ Info.grow(Reg.id());
+ Info[Reg].Cascade = Cascade;
+ }
+
+ unsigned getOrAssignNewCascade(Register Reg) {
+ unsigned Cascade = getCascade(Reg);
+ if (!Cascade) {
+ Cascade = NextCascade++;
+ setCascade(Reg, Cascade);
+ }
+ return Cascade;
+ }
+
+ unsigned getCascadeOrCurrentNext(Register Reg) const {
+ unsigned Cascade = getCascade(Reg);
+ if (!Cascade)
+ Cascade = NextCascade;
+ return Cascade;
+ }
+
+ template <typename Iterator>
+ void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) {
+ for (; Begin != End; ++Begin) {
+ Register Reg = *Begin;
+ Info.grow(Reg.id());
+ if (Info[Reg].Stage == RS_New)
+ Info[Reg].Stage = NewStage;
+ }
+ }
+ void LRE_DidCloneVirtReg(Register New, Register Old);
+ };
+
+ LiveRegMatrix *getInterferenceMatrix() const { return Matrix; }
+ LiveIntervals *getLiveIntervals() const { return LIS; }
+ VirtRegMap *getVirtRegMap() const { return VRM; }
+ const RegisterClassInfo &getRegClassInfo() const { return RegClassInfo; }
+ const ExtraRegInfo &getExtraInfo() const { return *ExtraInfo; }
+ size_t getQueueSize() const { return Queue.size(); }
+ // end (interface to eviction advisers)
+
+ // Interface to priority advisers
+ bool getRegClassPriorityTrumpsGlobalness() const {
+ return RegClassPriorityTrumpsGlobalness;
+ }
+ bool getReverseLocalAssignment() const { return ReverseLocalAssignment; }
+ // end (interface to priority advisers)
+
+private:
+ // Convenient shortcuts.
+ using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>;
+ using SmallLISet = SmallSetVector<const LiveInterval *, 4>;
+
+ // We need to track all tentative recolorings so we can roll back both
+ // successful and unsuccessful recoloring attempts.
+ using RecoloringStack =
+ SmallVector<std::pair<const LiveInterval *, MCRegister>, 8>;
+
+ // context
+ MachineFunction *MF = nullptr;
+
+ // Shortcuts to some useful interface.
+ const TargetInstrInfo *TII = nullptr;
+
+ // analyses
+ SlotIndexes *Indexes = nullptr;
+ MachineBlockFrequencyInfo *MBFI = nullptr;
+ MachineDominatorTree *DomTree = nullptr;
+ MachineLoopInfo *Loops = nullptr;
+ MachineOptimizationRemarkEmitter *ORE = nullptr;
+ EdgeBundles *Bundles = nullptr;
+ SpillPlacement *SpillPlacer = nullptr;
+ LiveDebugVariables *DebugVars = nullptr;
+
+ // state
+ std::unique_ptr<Spiller> SpillerInstance;
+ PQueue Queue;
+ std::unique_ptr<VirtRegAuxInfo> VRAI;
+ std::optional<ExtraRegInfo> ExtraInfo;
+ std::unique_ptr<RegAllocEvictionAdvisor> EvictAdvisor;
+
+ std::unique_ptr<RegAllocPriorityAdvisor> PriorityAdvisor;
+
+ // CutOffStage keeps track of whether register allocation failed because of
+ // the cutoffs encountered during last-chance recoloring.
+ // Note: this is used as a bitmask; any new value should be the next power
+ // of 2.
+ enum CutOffStage {
+ // No cutoffs encountered
+ CO_None = 0,
+
+ // lcr-max-depth cutoff encountered
+ CO_Depth = 1,
+
+ // lcr-max-interf cutoff encountered
+ CO_Interf = 2
+ };
+
+ uint8_t CutOffInfo = CutOffStage::CO_None;
+
+#ifndef NDEBUG
+ static const char *const StageName[];
+#endif
+
+ // splitting state.
+ std::unique_ptr<SplitAnalysis> SA;
+ std::unique_ptr<SplitEditor> SE;
+
+ /// Cached per-block interference maps
+ InterferenceCache IntfCache;
+
+ /// All basic blocks where the current register has uses.
+ SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints;
+
+ /// Global live range splitting candidate info.
+ struct GlobalSplitCandidate {
+ // Register intended for assignment, or 0.
+ MCRegister PhysReg;
+
+ // SplitKit interval index for this candidate.
+ unsigned IntvIdx;
+
+ // Interference for PhysReg.
+ InterferenceCache::Cursor Intf;
+
+ // Bundles where this candidate should be live.
+ BitVector LiveBundles;
+ SmallVector<unsigned, 8> ActiveBlocks;
+
+ void reset(InterferenceCache &Cache, MCRegister Reg) {
+ PhysReg = Reg;
+ IntvIdx = 0;
+ Intf.setPhysReg(Cache, Reg);
+ LiveBundles.clear();
+ ActiveBlocks.clear();
+ }
+
+ // Set B[I] = C for every live bundle where B[I] was NoCand.
+ unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) {
+ unsigned Count = 0;
+ for (unsigned I : LiveBundles.set_bits())
+ if (B[I] == NoCand) {
+ B[I] = C;
+ Count++;
+ }
+ return Count;
+ }
+ };
+
+ /// Candidate info for each PhysReg in AllocationOrder.
+ /// This vector never shrinks, but grows to the size of the largest register
+ /// class.
+ SmallVector<GlobalSplitCandidate, 32> GlobalCand;
+
+ enum : unsigned { NoCand = ~0u };
+
+ /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
+ /// NoCand which indicates the stack interval.
+ SmallVector<unsigned, 32> BundleCand;
+
+ /// Callee-save register cost, calculated once per machine function.
+ BlockFrequency CSRCost;
+
+ /// Set of broken hints that may be reconciled later because of eviction.
+ SmallSetVector<const LiveInterval *, 8> SetOfBrokenHints;
+
+ /// The register cost values. This list is recreated for each machine
+ /// function.
+ ArrayRef<uint8_t> RegCosts;
+
+ /// Flags for the live range priority calculation, determined once per
+ /// machine function.
+ bool RegClassPriorityTrumpsGlobalness = false;
+
+ bool ReverseLocalAssignment = false;
+
+public:
+ RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses);
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "Greedy Register Allocator"; }
+
+ /// RAGreedy analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void releaseMemory() override;
+ Spiller &spiller() override { return *SpillerInstance; }
+ void enqueueImpl(const LiveInterval *LI) override;
+ const LiveInterval *dequeue() override;
+ MCRegister selectOrSplit(const LiveInterval &,
+ SmallVectorImpl<Register> &) override;
+ void aboutToRemoveInterval(const LiveInterval &) override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+ static char ID;
+
+private:
+ MCRegister selectOrSplitImpl(const LiveInterval &,
+ SmallVectorImpl<Register> &, SmallVirtRegSet &,
+ RecoloringStack &, unsigned = 0);
+
+ bool LRE_CanEraseVirtReg(Register) override;
+ void LRE_WillShrinkVirtReg(Register) override;
+ void LRE_DidCloneVirtReg(Register, Register) override;
+ void enqueue(PQueue &CurQueue, const LiveInterval *LI);
+ const LiveInterval *dequeue(PQueue &CurQueue);
+
+ bool hasVirtRegAlloc();
+ BlockFrequency calcSpillCost();
+ bool addSplitConstraints(InterferenceCache::Cursor, BlockFrequency &);
+ bool addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>);
+ bool growRegion(GlobalSplitCandidate &Cand);
+ BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &,
+ const AllocationOrder &Order);
+ bool calcCompactRegion(GlobalSplitCandidate &);
+ void splitAroundRegion(LiveRangeEdit &, ArrayRef<unsigned>);
+ void calcGapWeights(MCRegister, SmallVectorImpl<float> &);
+ void evictInterference(const LiveInterval &, MCRegister,
+ SmallVectorImpl<Register> &);
+ bool mayRecolorAllInterferences(MCRegister PhysReg,
+ const LiveInterval &VirtReg,
+ SmallLISet &RecoloringCandidates,
+ const SmallVirtRegSet &FixedRegisters);
+
+ MCRegister tryAssign(const LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &, const SmallVirtRegSet &);
+ MCRegister tryEvict(const LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &, uint8_t,
+ const SmallVirtRegSet &);
+ MCRegister tryRegionSplit(const LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &);
+ /// Calculate cost of region splitting.
+ unsigned calculateRegionSplitCost(const LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ BlockFrequency &BestCost,
+ unsigned &NumCands, bool IgnoreCSR);
+ /// Perform region splitting.
+ unsigned doRegionSplit(const LiveInterval &VirtReg, unsigned BestCand,
+ bool HasCompact, SmallVectorImpl<Register> &NewVRegs);
+ /// Check other options before using a callee-saved register for the first
+ /// time.
+ MCRegister tryAssignCSRFirstTime(const LiveInterval &VirtReg,
+ AllocationOrder &Order, MCRegister PhysReg,
+ uint8_t &CostPerUseLimit,
+ SmallVectorImpl<Register> &NewVRegs);
+ void initializeCSRCost();
+ unsigned tryBlockSplit(const LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &);
+ unsigned tryInstructionSplit(const LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &);
+ unsigned tryLocalSplit(const LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &);
+ unsigned trySplit(const LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &, const SmallVirtRegSet &);
+ unsigned tryLastChanceRecoloring(const LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &,
+ SmallVirtRegSet &, RecoloringStack &,
+ unsigned);
+ bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<Register> &,
+ SmallVirtRegSet &, RecoloringStack &, unsigned);
+ void tryHintRecoloring(const LiveInterval &);
+ void tryHintsRecoloring();
+
+ /// Model the information carried by one end of a copy.
+ struct HintInfo {
+ /// The frequency of the copy.
+ BlockFrequency Freq;
+ /// The virtual register or physical register.
+ Register Reg;
+ /// Its currently assigned register.
+ /// In case of a physical register Reg == PhysReg.
+ MCRegister PhysReg;
+
+ HintInfo(BlockFrequency Freq, Register Reg, MCRegister PhysReg)
+ : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
+ };
+ using HintsInfo = SmallVector<HintInfo, 4>;
+
+ BlockFrequency getBrokenHintFreq(const HintsInfo &, MCRegister);
+ void collectHintInfo(Register, HintsInfo &);
+
+ /// Greedy RA statistic to remark.
+ struct RAGreedyStats {
+ unsigned Reloads = 0;
+ unsigned FoldedReloads = 0;
+ unsigned ZeroCostFoldedReloads = 0;
+ unsigned Spills = 0;
+ unsigned FoldedSpills = 0;
+ unsigned Copies = 0;
+ float ReloadsCost = 0.0f;
+ float FoldedReloadsCost = 0.0f;
+ float SpillsCost = 0.0f;
+ float FoldedSpillsCost = 0.0f;
+ float CopiesCost = 0.0f;
+
+ bool isEmpty() {
+ return !(Reloads || FoldedReloads || Spills || FoldedSpills ||
+ ZeroCostFoldedReloads || Copies);
+ }
+
+ void add(RAGreedyStats other) {
+ Reloads += other.Reloads;
+ FoldedReloads += other.FoldedReloads;
+ ZeroCostFoldedReloads += other.ZeroCostFoldedReloads;
+ Spills += other.Spills;
+ FoldedSpills += other.FoldedSpills;
+ Copies += other.Copies;
+ ReloadsCost += other.ReloadsCost;
+ FoldedReloadsCost += other.FoldedReloadsCost;
+ SpillsCost += other.SpillsCost;
+ FoldedSpillsCost += other.FoldedSpillsCost;
+ CopiesCost += other.CopiesCost;
+ }
+
+ void report(MachineOptimizationRemarkMissed &R);
+ };
+
+ /// Compute statistic for a basic block.
+ RAGreedyStats computeStats(MachineBasicBlock &MBB);
+
+ /// Compute and report statistic through a remark.
+ RAGreedyStats reportStats(MachineLoop *L);
+
+ /// Report the statistic for each loop.
+ void reportStats();
+};
+} // namespace llvm
+#endif // #ifndef LLVM_CODEGEN_REGALLOCGREEDY_H_
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPBQP.cpp
new file mode 100644
index 000000000000..925a0f085c4b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -0,0 +1,954 @@
+//===- RegAllocPBQP.cpp ---- PBQP Register Allocator ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a Partitioned Boolean Quadratic Programming (PBQP) based
+// register allocator for LLVM. This allocator works by constructing a PBQP
+// problem representing the register allocation problem under consideration,
+// solving this using a PBQP solver, and mapping the solution back to a
+// register assignment. If any variables are selected for spilling then spill
+// code is inserted and the process repeated.
+//
+// The PBQP solver (pbqp.c) provided for this allocator uses a heuristic tuned
+// for register allocation. For more information on PBQP for register
+// allocation, see the following papers:
+//
+// (1) Hames, L. and Scholz, B. 2006. Nearly optimal register allocation with
+// PBQP. In Proceedings of the 7th Joint Modular Languages Conference
+// (JMLC'06). LNCS, vol. 4228. Springer, New York, NY, USA. 346-361.
+//
+// (2) Scholz, B., Eckstein, E. 2002. Register allocation for irregular
+// architectures. In Proceedings of the Joint Conference on Languages,
+// Compilers and Tools for Embedded Systems (LCTES'02), ACM Press, New York,
+// NY, USA, 139-148.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PBQP/Graph.h"
+#include "llvm/CodeGen/PBQP/Math.h"
+#include "llvm/CodeGen/PBQP/Solution.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Printable.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <limits>
+#include <map>
+#include <memory>
+#include <queue>
+#include <set>
+#include <sstream>
+#include <string>
+#include <system_error>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+static RegisterRegAlloc
+RegisterPBQPRepAlloc("pbqp", "PBQP register allocator",
+ createDefaultPBQPRegisterAllocator);
+
+static cl::opt<bool>
+PBQPCoalescing("pbqp-coalescing",
+ cl::desc("Attempt coalescing during PBQP register allocation."),
+ cl::init(false), cl::Hidden);
+
+#ifndef NDEBUG
+static cl::opt<bool>
+PBQPDumpGraphs("pbqp-dump-graphs",
+ cl::desc("Dump graphs for each function/round in the compilation unit."),
+ cl::init(false), cl::Hidden);
+#endif
+
+namespace {
+
+///
+/// PBQP based allocators solve the register allocation problem by mapping
+/// register allocation problems to Partitioned Boolean Quadratic
+/// Programming problems.
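+/// Each PBQP node corresponds to one virtual register; its options are the
+/// spill option (index 0) followed by the registers in its allowed set, and
+/// interference between two virtual registers becomes an edge whose cost
+/// matrix assigns infinite cost to overlapping register choices.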
+class RegAllocPBQP : public MachineFunctionPass {
+public:
+ static char ID;
+
+ /// Construct a PBQP register allocator.
+ RegAllocPBQP(char *cPassID = nullptr)
+ : MachineFunctionPass(ID), customPassID(cPassID) {
+ initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+ initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+ initializeLiveStacksPass(*PassRegistry::getPassRegistry());
+ initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "PBQP Register Allocator"; }
+
+ /// PBQP analysis usage.
+ void getAnalysisUsage(AnalysisUsage &au) const override;
+
+ /// Perform register allocation
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+private:
+ using RegSet = std::set<Register>;
+
+ char *customPassID;
+
+ RegSet VRegsToAlloc, EmptyIntervalVRegs;
+
+ /// Instructions that define an original register and whose defs are already
+ /// all dead after rematerialization are saved in DeadRemats. Deleting such
+ /// instructions is postponed until all allocations are done, so their remat
+ /// expressions remain available for rematerializing all the siblings of the
+ /// original register.
+ SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
+ /// Finds the initial set of vreg intervals to allocate.
+ void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
+
+ /// Constructs an initial graph.
+ void initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, Spiller &VRegSpiller);
+
+ /// Spill the given VReg.
+ void spillVReg(Register VReg, SmallVectorImpl<Register> &NewIntervals,
+ MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM,
+ Spiller &VRegSpiller);
+
+ /// Given a solved PBQP problem maps this solution back to a register
+ /// assignment.
+ bool mapPBQPToRegAlloc(const PBQPRAGraph &G,
+ const PBQP::Solution &Solution,
+ VirtRegMap &VRM,
+ Spiller &VRegSpiller);
+
+ /// Postprocessing before final spilling. Sets basic block "live in"
+ /// variables.
+ void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
+ VirtRegMap &VRM) const;
+
+ void postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS);
+};
+
+char RegAllocPBQP::ID = 0;
+
+/// Set spill costs for each node in the PBQP reg-alloc graph.
+class SpillCosts : public PBQPRAConstraint {
+public:
+ void apply(PBQPRAGraph &G) override {
+ LiveIntervals &LIS = G.getMetadata().LIS;
+
+ // A minimum spill cost, so that register constraints can be set without
+ // normalization in the [0.0, MinSpillCost) interval.
+ const PBQP::PBQPNum MinSpillCost = 10.0;
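+ // For example, an interval whose spill weight is 2.5 ends up with a spill
+ // cost of 12.5, while a zero-weight interval gets the smallest positive
+ // PBQPNum rather than exactly 0.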
+
+ for (auto NId : G.nodeIds()) {
+ PBQP::PBQPNum SpillCost =
+ LIS.getInterval(G.getNodeMetadata(NId).getVReg()).weight();
+ if (SpillCost == 0.0)
+ SpillCost = std::numeric_limits<PBQP::PBQPNum>::min();
+ else
+ SpillCost += MinSpillCost;
+ PBQPRAGraph::RawVector NodeCosts(G.getNodeCosts(NId));
+ NodeCosts[PBQP::RegAlloc::getSpillOptionIdx()] = SpillCost;
+ G.setNodeCosts(NId, std::move(NodeCosts));
+ }
+ }
+};
+
+/// Add interference edges between overlapping vregs.
+class Interference : public PBQPRAConstraint {
+private:
+ using AllowedRegVecPtr = const PBQP::RegAlloc::AllowedRegVector *;
+ using IKey = std::pair<AllowedRegVecPtr, AllowedRegVecPtr>;
+ using IMatrixCache = DenseMap<IKey, PBQPRAGraph::MatrixPtr>;
+ using DisjointAllowedRegsCache = DenseSet<IKey>;
+ using IEdgeKey = std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId>;
+ using IEdgeCache = DenseSet<IEdgeKey>;
+
+ bool haveDisjointAllowedRegs(const PBQPRAGraph &G, PBQPRAGraph::NodeId NId,
+ PBQPRAGraph::NodeId MId,
+ const DisjointAllowedRegsCache &D) const {
+ const auto *NRegs = &G.getNodeMetadata(NId).getAllowedRegs();
+ const auto *MRegs = &G.getNodeMetadata(MId).getAllowedRegs();
+
+ if (NRegs == MRegs)
+ return false;
+
+ if (NRegs < MRegs)
+ return D.contains(IKey(NRegs, MRegs));
+
+ return D.contains(IKey(MRegs, NRegs));
+ }
+
+ void setDisjointAllowedRegs(const PBQPRAGraph &G, PBQPRAGraph::NodeId NId,
+ PBQPRAGraph::NodeId MId,
+ DisjointAllowedRegsCache &D) {
+ const auto *NRegs = &G.getNodeMetadata(NId).getAllowedRegs();
+ const auto *MRegs = &G.getNodeMetadata(MId).getAllowedRegs();
+
+ assert(NRegs != MRegs && "AllowedRegs can not be disjoint with itself");
+
+ if (NRegs < MRegs)
+ D.insert(IKey(NRegs, MRegs));
+ else
+ D.insert(IKey(MRegs, NRegs));
+ }
+
+ // Holds (Interval, CurrentSegmentID, and NodeId). The first two are required
+ // for the fast interference graph construction algorithm. The last is there
+ // to save us from looking up node ids via the VRegToNode map in the graph
+ // metadata.
+ using IntervalInfo =
+ std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>;
+
+ static SlotIndex getStartPoint(const IntervalInfo &I) {
+ return std::get<0>(I)->segments[std::get<1>(I)].start;
+ }
+
+ static SlotIndex getEndPoint(const IntervalInfo &I) {
+ return std::get<0>(I)->segments[std::get<1>(I)].end;
+ }
+
+ static PBQP::GraphBase::NodeId getNodeId(const IntervalInfo &I) {
+ return std::get<2>(I);
+ }
+
+ static bool lowestStartPoint(const IntervalInfo &I1,
+ const IntervalInfo &I2) {
+ // Condition reversed because priority queue has the *highest* element at
+ // the front, rather than the lowest.
+ return getStartPoint(I1) > getStartPoint(I2);
+ }
+
+ static bool lowestEndPoint(const IntervalInfo &I1,
+ const IntervalInfo &I2) {
+ SlotIndex E1 = getEndPoint(I1);
+ SlotIndex E2 = getEndPoint(I2);
+
+ if (E1 < E2)
+ return true;
+
+ if (E1 > E2)
+ return false;
+
+ // If two intervals end at the same point, we need a way to break the tie or
+ // the set will assume they're actually equal and refuse to insert a
+ // "duplicate". Just compare the vregs - fast and guaranteed unique.
+ return std::get<0>(I1)->reg() < std::get<0>(I2)->reg();
+ }
+
+ static bool isAtLastSegment(const IntervalInfo &I) {
+ return std::get<1>(I) == std::get<0>(I)->size() - 1;
+ }
+
+ static IntervalInfo nextSegment(const IntervalInfo &I) {
+ return std::make_tuple(std::get<0>(I), std::get<1>(I) + 1, std::get<2>(I));
+ }
+
+public:
+ void apply(PBQPRAGraph &G) override {
+ // The following is loosely based on the linear scan algorithm introduced in
+ // "Linear Scan Register Allocation" by Poletto and Sarkar. This version
+ // isn't linear, because the size of the active set isn't bounded by the
+ // number of registers, but rather the size of the largest clique in the
+ // graph. Still, we expect this to be better than N^2.
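+ // For example, with segments A = [0,10), B = [4,6) and C = [12,20), A and B
+ // are active at the same time and receive an interference edge, while C
+ // only enters the active set after both have been retired and interferes
+ // with neither.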
+ LiveIntervals &LIS = G.getMetadata().LIS;
+
+ // Interference matrices are incredibly regular - they're only a function of
+ // the allowed sets, so we cache them to avoid the overhead of constructing
+ // and uniquing them.
+ IMatrixCache C;
+
+ // Finding an edge is expensive in the worst case (O(max_clique(G))), so we
+ // locally cache the edges we have already seen.
+ IEdgeCache EC;
+
+ // Cache known disjoint allowed-register pairs.
+ DisjointAllowedRegsCache D;
+
+ using IntervalSet = std::set<IntervalInfo, decltype(&lowestEndPoint)>;
+ using IntervalQueue =
+ std::priority_queue<IntervalInfo, std::vector<IntervalInfo>,
+ decltype(&lowestStartPoint)>;
+ IntervalSet Active(lowestEndPoint);
+ IntervalQueue Inactive(lowestStartPoint);
+
+ // Start by building the inactive set.
+ for (auto NId : G.nodeIds()) {
+ Register VReg = G.getNodeMetadata(NId).getVReg();
+ LiveInterval &LI = LIS.getInterval(VReg);
+ assert(!LI.empty() && "PBQP graph contains node for empty interval");
+ Inactive.push(std::make_tuple(&LI, 0, NId));
+ }
+
+ while (!Inactive.empty()) {
+ // Tentatively grab the "next" interval - this choice may be overridden
+ // below.
+ IntervalInfo Cur = Inactive.top();
+
+ // Retire any active intervals that end before Cur starts.
+ IntervalSet::iterator RetireItr = Active.begin();
+ while (RetireItr != Active.end() &&
+ (getEndPoint(*RetireItr) <= getStartPoint(Cur))) {
+ // If this interval has subsequent segments, add the next one to the
+ // inactive list.
+ if (!isAtLastSegment(*RetireItr))
+ Inactive.push(nextSegment(*RetireItr));
+
+ ++RetireItr;
+ }
+ Active.erase(Active.begin(), RetireItr);
+
+ // One of the newly retired segments may actually start before the
+ // Cur segment, so re-grab the front of the inactive list.
+ Cur = Inactive.top();
+ Inactive.pop();
+
+ // At this point we know that Cur overlaps all active intervals. Add the
+ // interference edges.
+ PBQP::GraphBase::NodeId NId = getNodeId(Cur);
+ for (const auto &A : Active) {
+ PBQP::GraphBase::NodeId MId = getNodeId(A);
+
+ // Do not add an edge when the nodes' allowed registers do not
+ // intersect: there is obviously no interference.
+ if (haveDisjointAllowedRegs(G, NId, MId, D))
+ continue;
+
+ // Check that we haven't already added this edge
+ IEdgeKey EK(std::min(NId, MId), std::max(NId, MId));
+ if (EC.count(EK))
+ continue;
+
+ // This is a new edge - add it to the graph.
+ if (!createInterferenceEdge(G, NId, MId, C))
+ setDisjointAllowedRegs(G, NId, MId, D);
+ else
+ EC.insert(EK);
+ }
+
+ // Finally, add Cur to the Active set.
+ Active.insert(Cur);
+ }
+ }
+
+private:
+ // Create an interference edge and add it to the graph, unless it is a null
+ // matrix, meaning the nodes' allowed registers do not interfere at all. This
+ // case occurs frequently between integer and floating-point registers, for
+ // example.
+ // Returns true iff the two nodes interfere.
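+ // For instance, if both nodes may use {R0, R1} and each register only
+ // overlaps itself, the matrix (row/column 0 being the spill option) is
+ //
+ //            spill  R0    R1
+ //   spill  [   0     0     0  ]
+ //   R0     [   0    inf    0  ]
+ //   R1     [   0     0    inf ]
+ //
+ // which forbids identical assignments while leaving spilling available.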
+ bool createInterferenceEdge(PBQPRAGraph &G,
+ PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId,
+ IMatrixCache &C) {
+ const TargetRegisterInfo &TRI =
+ *G.getMetadata().MF.getSubtarget().getRegisterInfo();
+ const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
+ const auto &MRegs = G.getNodeMetadata(MId).getAllowedRegs();
+
+ // Try looking the edge costs up in the IMatrixCache first.
+ IKey K(&NRegs, &MRegs);
+ IMatrixCache::iterator I = C.find(K);
+ if (I != C.end()) {
+ G.addEdgeBypassingCostAllocator(NId, MId, I->second);
+ return true;
+ }
+
+ PBQPRAGraph::RawMatrix M(NRegs.size() + 1, MRegs.size() + 1, 0);
+ bool NodesInterfere = false;
+ for (unsigned I = 0; I != NRegs.size(); ++I) {
+ MCRegister PRegN = NRegs[I];
+ for (unsigned J = 0; J != MRegs.size(); ++J) {
+ MCRegister PRegM = MRegs[J];
+ if (TRI.regsOverlap(PRegN, PRegM)) {
+ M[I + 1][J + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+ NodesInterfere = true;
+ }
+ }
+ }
+
+ if (!NodesInterfere)
+ return false;
+
+ PBQPRAGraph::EdgeId EId = G.addEdge(NId, MId, std::move(M));
+ C[K] = G.getEdgeCostsPtr(EId);
+
+ return true;
+ }
+};
+
+class Coalescing : public PBQPRAConstraint {
+public:
+ void apply(PBQPRAGraph &G) override {
+ MachineFunction &MF = G.getMetadata().MF;
+ MachineBlockFrequencyInfo &MBFI = G.getMetadata().MBFI;
+ CoalescerPair CP(*MF.getSubtarget().getRegisterInfo());
+
+ // Scan the machine function and apply a coalescing benefit whenever
+ // CoalescerPair approves the copy.
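+ // For example, a virtual-register copy in a block with relative frequency
+ // 3.0 subtracts 3.0 from every edge-cost entry where both vregs would be
+ // assigned the same physical register, biasing the solver toward coalescing
+ // without forbidding other assignments.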
+ for (const auto &MBB : MF) {
+ for (const auto &MI : MBB) {
+ // Skip not-coalescable or already coalesced copies.
+ if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg())
+ continue;
+
+ Register DstReg = CP.getDstReg();
+ Register SrcReg = CP.getSrcReg();
+
+ PBQP::PBQPNum CBenefit = MBFI.getBlockFreqRelativeToEntryBlock(&MBB);
+
+ if (CP.isPhys()) {
+ if (!MF.getRegInfo().isAllocatable(DstReg))
+ continue;
+
+ PBQPRAGraph::NodeId NId = G.getMetadata().getNodeIdForVReg(SrcReg);
+
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed =
+ G.getNodeMetadata(NId).getAllowedRegs();
+
+ unsigned PRegOpt = 0;
+ while (PRegOpt < Allowed.size() && Allowed[PRegOpt].id() != DstReg)
+ ++PRegOpt;
+
+ if (PRegOpt < Allowed.size()) {
+ PBQPRAGraph::RawVector NewCosts(G.getNodeCosts(NId));
+ NewCosts[PRegOpt + 1] -= CBenefit;
+ G.setNodeCosts(NId, std::move(NewCosts));
+ }
+ } else {
+ PBQPRAGraph::NodeId N1Id = G.getMetadata().getNodeIdForVReg(DstReg);
+ PBQPRAGraph::NodeId N2Id = G.getMetadata().getNodeIdForVReg(SrcReg);
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *Allowed1 =
+ &G.getNodeMetadata(N1Id).getAllowedRegs();
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *Allowed2 =
+ &G.getNodeMetadata(N2Id).getAllowedRegs();
+
+ PBQPRAGraph::EdgeId EId = G.findEdge(N1Id, N2Id);
+ if (EId == G.invalidEdgeId()) {
+ PBQPRAGraph::RawMatrix Costs(Allowed1->size() + 1,
+ Allowed2->size() + 1, 0);
+ addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit);
+ G.addEdge(N1Id, N2Id, std::move(Costs));
+ } else {
+ if (G.getEdgeNode1Id(EId) == N2Id) {
+ std::swap(N1Id, N2Id);
+ std::swap(Allowed1, Allowed2);
+ }
+ PBQPRAGraph::RawMatrix Costs(G.getEdgeCosts(EId));
+ addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit);
+ G.updateEdgeCosts(EId, std::move(Costs));
+ }
+ }
+ }
+ }
+ }
+
+private:
+ void addVirtRegCoalesce(
+ PBQPRAGraph::RawMatrix &CostMat,
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1,
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed2,
+ PBQP::PBQPNum Benefit) {
+ assert(CostMat.getRows() == Allowed1.size() + 1 && "Size mismatch.");
+ assert(CostMat.getCols() == Allowed2.size() + 1 && "Size mismatch.");
+ for (unsigned I = 0; I != Allowed1.size(); ++I) {
+ MCRegister PReg1 = Allowed1[I];
+ for (unsigned J = 0; J != Allowed2.size(); ++J) {
+ MCRegister PReg2 = Allowed2[J];
+ if (PReg1 == PReg2)
+ CostMat[I + 1][J + 1] -= Benefit;
+ }
+ }
+ }
+};
+
+/// PBQP-specific implementation of weight normalization.
+class PBQPVirtRegAuxInfo final : public VirtRegAuxInfo {
+ float normalize(float UseDefFreq, unsigned Size, unsigned NumInstr) override {
+ // All intervals have a spill weight that is mostly proportional to the
+ // number of uses, with uses in loops having a bigger weight.
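+ // For example, an interval containing 8 instructions gets 8x the base
+ // normalized weight.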
+ return NumInstr * VirtRegAuxInfo::normalize(UseDefFreq, Size, 1);
+ }
+
+public:
+ PBQPVirtRegAuxInfo(MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM,
+ const MachineLoopInfo &Loops,
+ const MachineBlockFrequencyInfo &MBFI)
+ : VirtRegAuxInfo(MF, LIS, VRM, Loops, MBFI) {}
+};
+} // end anonymous namespace
+
+// Out-of-line destructor/anchor for PBQPRAConstraint.
+PBQPRAConstraint::~PBQPRAConstraint() = default;
+
+void PBQPRAConstraint::anchor() {}
+
+void PBQPRAConstraintList::anchor() {}
+
+void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
+ au.setPreservesCFG();
+ au.addRequired<AAResultsWrapperPass>();
+ au.addPreserved<AAResultsWrapperPass>();
+ au.addRequired<SlotIndexes>();
+ au.addPreserved<SlotIndexes>();
+ au.addRequired<LiveIntervals>();
+ au.addPreserved<LiveIntervals>();
+ //au.addRequiredID(SplitCriticalEdgesID);
+ if (customPassID)
+ au.addRequiredID(*customPassID);
+ au.addRequired<LiveStacks>();
+ au.addPreserved<LiveStacks>();
+ au.addRequired<MachineBlockFrequencyInfo>();
+ au.addPreserved<MachineBlockFrequencyInfo>();
+ au.addRequired<MachineLoopInfo>();
+ au.addPreserved<MachineLoopInfo>();
+ au.addRequired<MachineDominatorTree>();
+ au.addPreserved<MachineDominatorTree>();
+ au.addRequired<VirtRegMap>();
+ au.addPreserved<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(au);
+}
+
+void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
+ LiveIntervals &LIS) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Iterate over all live ranges.
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ VRegsToAlloc.insert(Reg);
+ }
+}
+
+static bool isACalleeSavedRegister(MCRegister Reg,
+ const TargetRegisterInfo &TRI,
+ const MachineFunction &MF) {
+ const MCPhysReg *CSR = MF.getRegInfo().getCalleeSavedRegs();
+ for (unsigned i = 0; CSR[i] != 0; ++i)
+ if (TRI.regsOverlap(Reg, CSR[i]))
+ return true;
+ return false;
+}
+
+void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
+ Spiller &VRegSpiller) {
+ MachineFunction &MF = G.getMetadata().MF;
+
+ LiveIntervals &LIS = G.getMetadata().LIS;
+ const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
+ const TargetRegisterInfo &TRI =
+ *G.getMetadata().MF.getSubtarget().getRegisterInfo();
+
+ std::vector<Register> Worklist(VRegsToAlloc.begin(), VRegsToAlloc.end());
+
+ std::map<Register, std::vector<MCRegister>> VRegAllowedMap;
+
+ while (!Worklist.empty()) {
+ Register VReg = Worklist.back();
+ Worklist.pop_back();
+
+ LiveInterval &VRegLI = LIS.getInterval(VReg);
+
+ // If this is an empty interval move it to the EmptyIntervalVRegs set then
+ // continue.
+ if (VRegLI.empty()) {
+ EmptyIntervalVRegs.insert(VRegLI.reg());
+ VRegsToAlloc.erase(VRegLI.reg());
+ continue;
+ }
+
+ const TargetRegisterClass *TRC = MRI.getRegClass(VReg);
+
+ // Record any overlaps with regmask operands.
+ BitVector RegMaskOverlaps;
+ LIS.checkRegMaskInterference(VRegLI, RegMaskOverlaps);
+
+ // Compute an initial allowed set for the current vreg.
+ std::vector<MCRegister> VRegAllowed;
+ ArrayRef<MCPhysReg> RawPRegOrder = TRC->getRawAllocationOrder(MF);
+ for (MCPhysReg R : RawPRegOrder) {
+ MCRegister PReg(R);
+ if (MRI.isReserved(PReg))
+ continue;
+
+ // vregLI crosses a regmask operand that clobbers preg.
+ if (!RegMaskOverlaps.empty() && !RegMaskOverlaps.test(PReg))
+ continue;
+
+ // vregLI overlaps fixed regunit interference.
+ bool Interference = false;
+ for (MCRegUnit Unit : TRI.regunits(PReg)) {
+ if (VRegLI.overlaps(LIS.getRegUnit(Unit))) {
+ Interference = true;
+ break;
+ }
+ }
+ if (Interference)
+ continue;
+
+ // preg is usable for this virtual register.
+ VRegAllowed.push_back(PReg);
+ }
+
+ // Check for vregs that have no allowed registers. These should be
+ // pre-spilled and the new vregs added to the worklist.
+ if (VRegAllowed.empty()) {
+ SmallVector<Register, 8> NewVRegs;
+ spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
+ llvm::append_range(Worklist, NewVRegs);
+ continue;
+ }
+
+ VRegAllowedMap[VReg.id()] = std::move(VRegAllowed);
+ }
+
+ for (auto &KV : VRegAllowedMap) {
+ auto VReg = KV.first;
+
+ // Move empty intervals to the EmptyIntervalVReg set.
+ if (LIS.getInterval(VReg).empty()) {
+ EmptyIntervalVRegs.insert(VReg);
+ VRegsToAlloc.erase(VReg);
+ continue;
+ }
+
+ auto &VRegAllowed = KV.second;
+
+ PBQPRAGraph::RawVector NodeCosts(VRegAllowed.size() + 1, 0);
+
+ // Tweak the cost of callee-saved registers, as using them forces spilling
+ // and restoring them. This only happens in the prologue / epilogue though.
+ for (unsigned i = 0; i != VRegAllowed.size(); ++i)
+ if (isACalleeSavedRegister(VRegAllowed[i], TRI, MF))
+ NodeCosts[1 + i] += 1.0;
+
+ PBQPRAGraph::NodeId NId = G.addNode(std::move(NodeCosts));
+ G.getNodeMetadata(NId).setVReg(VReg);
+ G.getNodeMetadata(NId).setAllowedRegs(
+ G.getMetadata().getAllowedRegs(std::move(VRegAllowed)));
+ G.getMetadata().setNodeIdForVReg(VReg, NId);
+ }
+}
+
+void RegAllocPBQP::spillVReg(Register VReg,
+ SmallVectorImpl<Register> &NewIntervals,
+ MachineFunction &MF, LiveIntervals &LIS,
+ VirtRegMap &VRM, Spiller &VRegSpiller) {
+ VRegsToAlloc.erase(VReg);
+ LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM,
+ nullptr, &DeadRemats);
+ VRegSpiller.spill(LRE);
+
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ (void)TRI;
+ LLVM_DEBUG(dbgs() << "VREG " << printReg(VReg, &TRI) << " -> SPILLED (Cost: "
+ << LRE.getParent().weight() << ", New vregs: ");
+
+ // Copy any newly inserted live intervals into the list of regs to
+ // allocate.
+ for (const Register &R : LRE) {
+ const LiveInterval &LI = LIS.getInterval(R);
+ assert(!LI.empty() && "Empty spill range.");
+ LLVM_DEBUG(dbgs() << printReg(LI.reg(), &TRI) << " ");
+ VRegsToAlloc.insert(LI.reg());
+ }
+
+ LLVM_DEBUG(dbgs() << ")\n");
+}
+
+bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
+ const PBQP::Solution &Solution,
+ VirtRegMap &VRM,
+ Spiller &VRegSpiller) {
+ MachineFunction &MF = G.getMetadata().MF;
+ LiveIntervals &LIS = G.getMetadata().LIS;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ (void)TRI;
+
+ // Set to true if we have any spills
+ bool AnotherRoundNeeded = false;
+
+ // Clear the existing allocation.
+ VRM.clearAllVirt();
+
+ // Iterate over the nodes mapping the PBQP solution to a register
+ // assignment.
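+ // Option 0 is the spill option; option K (K >= 1) corresponds to
+ // AllowedRegs[K - 1] for that node.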
+ for (auto NId : G.nodeIds()) {
+ Register VReg = G.getNodeMetadata(NId).getVReg();
+ unsigned AllocOpt = Solution.getSelection(NId);
+
+ if (AllocOpt != PBQP::RegAlloc::getSpillOptionIdx()) {
+ MCRegister PReg = G.getNodeMetadata(NId).getAllowedRegs()[AllocOpt - 1];
+ LLVM_DEBUG(dbgs() << "VREG " << printReg(VReg, &TRI) << " -> "
+ << TRI.getName(PReg) << "\n");
+ assert(PReg != 0 && "Invalid preg selected.");
+ VRM.assignVirt2Phys(VReg, PReg);
+ } else {
+ // Spill VReg. If this introduces new intervals we'll need another round
+ // of allocation.
+ SmallVector<Register, 8> NewVRegs;
+ spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
+ AnotherRoundNeeded |= !NewVRegs.empty();
+ }
+ }
+
+ return !AnotherRoundNeeded;
+}
+
+void RegAllocPBQP::finalizeAlloc(MachineFunction &MF,
+ LiveIntervals &LIS,
+ VirtRegMap &VRM) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // First allocate registers for the empty intervals.
+ for (const Register &R : EmptyIntervalVRegs) {
+ LiveInterval &LI = LIS.getInterval(R);
+
+ Register PReg = MRI.getSimpleHint(LI.reg());
+
+ if (PReg == 0) {
+ const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg());
+ const ArrayRef<MCPhysReg> RawPRegOrder = RC.getRawAllocationOrder(MF);
+ for (MCRegister CandidateReg : RawPRegOrder) {
+ if (!VRM.getRegInfo().isReserved(CandidateReg)) {
+ PReg = CandidateReg;
+ break;
+ }
+ }
+ assert(PReg &&
+ "No un-reserved physical registers in this register class");
+ }
+
+ VRM.assignVirt2Phys(LI.reg(), PReg);
+ }
+}
+
+void RegAllocPBQP::postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS) {
+ VRegSpiller.postOptimization();
+ /// Remove dead defs because of rematerialization.
+ for (auto *DeadInst : DeadRemats) {
+ LIS.RemoveMachineInstrFromMaps(*DeadInst);
+ DeadInst->eraseFromParent();
+ }
+ DeadRemats.clear();
+}
+
+bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ MachineBlockFrequencyInfo &MBFI =
+ getAnalysis<MachineBlockFrequencyInfo>();
+
+ VirtRegMap &VRM = getAnalysis<VirtRegMap>();
+
+ PBQPVirtRegAuxInfo VRAI(MF, LIS, VRM, getAnalysis<MachineLoopInfo>(), MBFI);
+ VRAI.calculateSpillWeightsAndHints();
+
+ // FIXME: we create DefaultVRAI here to match existing behavior pre-passing
+ // the VRAI through the spiller to the live range editor. However, it probably
+ // makes more sense to pass the PBQP VRAI. The existing behavior had
+ // LiveRangeEdit make its own VirtRegAuxInfo object.
+ VirtRegAuxInfo DefaultVRAI(MF, LIS, VRM, getAnalysis<MachineLoopInfo>(),
+ MBFI);
+ std::unique_ptr<Spiller> VRegSpiller(
+ createInlineSpiller(*this, MF, VRM, DefaultVRAI));
+
+ MF.getRegInfo().freezeReservedRegs(MF);
+
+ LLVM_DEBUG(dbgs() << "PBQP Register Allocating for " << MF.getName() << "\n");
+
+ // Allocator main loop:
+ //
+ // * Map current regalloc problem to a PBQP problem
+ // * Solve the PBQP problem
+ // * Map the solution back to a register allocation
+ // * Spill if necessary
+ //
+ // This process is continued till no more spills are generated.
+
+ // Find the vreg intervals in need of allocation.
+ findVRegIntervalsToAlloc(MF, LIS);
+
+#ifndef NDEBUG
+ const Function &F = MF.getFunction();
+ std::string FullyQualifiedName =
+ F.getParent()->getModuleIdentifier() + "." + F.getName().str();
+#endif
+
+ // If there are non-empty intervals allocate them using pbqp.
+ if (!VRegsToAlloc.empty()) {
+ const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
+ std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
+ std::make_unique<PBQPRAConstraintList>();
+ ConstraintsRoot->addConstraint(std::make_unique<SpillCosts>());
+ ConstraintsRoot->addConstraint(std::make_unique<Interference>());
+ if (PBQPCoalescing)
+ ConstraintsRoot->addConstraint(std::make_unique<Coalescing>());
+ ConstraintsRoot->addConstraint(Subtarget.getCustomPBQPConstraints());
+
+ bool PBQPAllocComplete = false;
+ unsigned Round = 0;
+
+ while (!PBQPAllocComplete) {
+ LLVM_DEBUG(dbgs() << " PBQP Regalloc round " << Round << ":\n");
+ (void) Round;
+
+ PBQPRAGraph G(PBQPRAGraph::GraphMetadata(MF, LIS, MBFI));
+ initializeGraph(G, VRM, *VRegSpiller);
+ ConstraintsRoot->apply(G);
+
+#ifndef NDEBUG
+ if (PBQPDumpGraphs) {
+ std::ostringstream RS;
+ RS << Round;
+ std::string GraphFileName = FullyQualifiedName + "." + RS.str() +
+ ".pbqpgraph";
+ std::error_code EC;
+ raw_fd_ostream OS(GraphFileName, EC, sys::fs::OF_TextWithCRLF);
+ LLVM_DEBUG(dbgs() << "Dumping graph for round " << Round << " to \""
+ << GraphFileName << "\"\n");
+ G.dump(OS);
+ }
+#endif
+
+ PBQP::Solution Solution = PBQP::RegAlloc::solve(G);
+ PBQPAllocComplete = mapPBQPToRegAlloc(G, Solution, VRM, *VRegSpiller);
+ ++Round;
+ }
+ }
+
+ // Finalise allocation, allocate empty ranges.
+ finalizeAlloc(MF, LIS, VRM);
+ postOptimization(*VRegSpiller, LIS);
+ VRegsToAlloc.clear();
+ EmptyIntervalVRegs.clear();
+
+ LLVM_DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << VRM << "\n");
+
+ return true;
+}
+
+/// Create Printable object for node and register info.
+static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId,
+ const PBQP::RegAlloc::PBQPRAGraph &G) {
+ return Printable([NId, &G](raw_ostream &OS) {
+ const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ Register VReg = G.getNodeMetadata(NId).getVReg();
+ const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg));
+ OS << NId << " (" << RegClassName << ':' << printReg(VReg, TRI) << ')';
+ });
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
+ for (auto NId : nodeIds()) {
+ const Vector &Costs = getNodeCosts(NId);
+ assert(Costs.getLength() != 0 && "Empty vector in graph.");
+ OS << PrintNodeInfo(NId, *this) << ": " << Costs << '\n';
+ }
+ OS << '\n';
+
+ for (auto EId : edgeIds()) {
+ NodeId N1Id = getEdgeNode1Id(EId);
+ NodeId N2Id = getEdgeNode2Id(EId);
+ assert(N1Id != N2Id && "PBQP graphs should not have self-edges.");
+ const Matrix &M = getEdgeCosts(EId);
+ assert(M.getRows() != 0 && "No rows in matrix.");
+ assert(M.getCols() != 0 && "No cols in matrix.");
+ OS << PrintNodeInfo(N1Id, *this) << ' ' << M.getRows() << " rows / ";
+ OS << PrintNodeInfo(N2Id, *this) << ' ' << M.getCols() << " cols:\n";
+ OS << M << '\n';
+ }
+}
+
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const {
+ dump(dbgs());
+}
+#endif
+
+void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {
+ OS << "graph {\n";
+ for (auto NId : nodeIds()) {
+ OS << " node" << NId << " [ label=\""
+ << PrintNodeInfo(NId, *this) << "\\n"
+ << getNodeCosts(NId) << "\" ]\n";
+ }
+
+ OS << " edge [ len=" << nodeIds().size() << " ]\n";
+ for (auto EId : edgeIds()) {
+ OS << " node" << getEdgeNode1Id(EId)
+ << " -- node" << getEdgeNode2Id(EId)
+ << " [ label=\"";
+ const Matrix &EdgeCosts = getEdgeCosts(EId);
+ for (unsigned i = 0; i < EdgeCosts.getRows(); ++i) {
+ OS << EdgeCosts.getRowAsVector(i) << "\\n";
+ }
+ OS << "\" ]\n";
+ }
+ OS << "}\n";
+}
+
+FunctionPass *llvm::createPBQPRegisterAllocator(char *customPassID) {
+ return new RegAllocPBQP(customPassID);
+}
+
+FunctionPass* llvm::createDefaultPBQPRegisterAllocator() {
+ return createPBQPRegisterAllocator();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp
new file mode 100644
index 000000000000..e031019a4c91
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPriorityAdvisor.cpp
@@ -0,0 +1,112 @@
+//===- RegAllocPriorityAdvisor.cpp - live ranges priority advisor ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the default priority advisor and of the Analysis pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RegAllocPriorityAdvisor.h"
+#include "RegAllocGreedy.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+static cl::opt<RegAllocPriorityAdvisorAnalysis::AdvisorMode> Mode(
+ "regalloc-enable-priority-advisor", cl::Hidden,
+ cl::init(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Default),
+ cl::desc("Enable regalloc advisor mode"),
+ cl::values(
+ clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Default,
+ "default", "Default"),
+ clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Release,
+ "release", "precompiled"),
+ clEnumValN(RegAllocPriorityAdvisorAnalysis::AdvisorMode::Development,
+ "development", "for training")));
+
+char RegAllocPriorityAdvisorAnalysis::ID = 0;
+INITIALIZE_PASS(RegAllocPriorityAdvisorAnalysis, "regalloc-priority",
+ "Regalloc priority policy", false, true)
+
+namespace {
+class DefaultPriorityAdvisorAnalysis final
+ : public RegAllocPriorityAdvisorAnalysis {
+public:
+ DefaultPriorityAdvisorAnalysis(bool NotAsRequested)
+ : RegAllocPriorityAdvisorAnalysis(AdvisorMode::Default),
+ NotAsRequested(NotAsRequested) {}
+
+ // support for isa<> and dyn_cast.
+ static bool classof(const RegAllocPriorityAdvisorAnalysis *R) {
+ return R->getAdvisorMode() == AdvisorMode::Default;
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<SlotIndexes>();
+ RegAllocPriorityAdvisorAnalysis::getAnalysisUsage(AU);
+ }
+ std::unique_ptr<RegAllocPriorityAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+ return std::make_unique<DefaultPriorityAdvisor>(
+ MF, RA, &getAnalysis<SlotIndexes>());
+ }
+ bool doInitialization(Module &M) override {
+ if (NotAsRequested)
+ M.getContext().emitError("Requested regalloc priority advisor analysis "
+ "could be created. Using default");
+ return RegAllocPriorityAdvisorAnalysis::doInitialization(M);
+ }
+ const bool NotAsRequested;
+};
+} // namespace
+
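+// Create the priority advisor analysis requested on the command line, falling
+// back to the default advisor (and emitting an error in doInitialization)
+// when the requested one cannot be constructed, e.g. the development-mode
+// advisor when LLVM is built without TFLite.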
+template <> Pass *llvm::callDefaultCtor<RegAllocPriorityAdvisorAnalysis>() {
+ Pass *Ret = nullptr;
+ switch (Mode) {
+ case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Default:
+ Ret = new DefaultPriorityAdvisorAnalysis(/*NotAsRequested*/ false);
+ break;
+ case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Development:
+#if defined(LLVM_HAVE_TFLITE)
+ Ret = createDevelopmentModePriorityAdvisor();
+#endif
+ break;
+ case RegAllocPriorityAdvisorAnalysis::AdvisorMode::Release:
+ Ret = createReleaseModePriorityAdvisor();
+ break;
+ }
+ if (Ret)
+ return Ret;
+ return new DefaultPriorityAdvisorAnalysis(/*NotAsRequested*/ true);
+}
+
+StringRef RegAllocPriorityAdvisorAnalysis::getPassName() const {
+ switch (getAdvisorMode()) {
+ case AdvisorMode::Default:
+ return "Default Regalloc Priority Advisor";
+ case AdvisorMode::Release:
+ return "Release mode Regalloc Priority Advisor";
+ case AdvisorMode::Development:
+ return "Development mode Regalloc Priority Advisor";
+ }
+ llvm_unreachable("Unknown advisor kind");
+}
+
+RegAllocPriorityAdvisor::RegAllocPriorityAdvisor(const MachineFunction &MF,
+ const RAGreedy &RA,
+ SlotIndexes *const Indexes)
+ : RA(RA), LIS(RA.getLiveIntervals()), VRM(RA.getVirtRegMap()),
+ MRI(&VRM->getRegInfo()), TRI(MF.getSubtarget().getRegisterInfo()),
+ RegClassInfo(RA.getRegClassInfo()), Indexes(Indexes),
+ RegClassPriorityTrumpsGlobalness(
+ RA.getRegClassPriorityTrumpsGlobalness()),
+ ReverseLocalAssignment(RA.getReverseLocalAssignment()) {}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h
new file mode 100644
index 000000000000..1e9fa967214c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocPriorityAdvisor.h
@@ -0,0 +1,96 @@
+//===- RegAllocPriorityAdvisor.h - live ranges priority advisor -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H
+#define LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H
+
+#include "RegAllocEvictionAdvisor.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class MachineFunction;
+class VirtRegMap;
+class RAGreedy;
+
+/// Interface to the priority advisor, which is responsible for prioritizing
+/// live ranges.
+class RegAllocPriorityAdvisor {
+public:
+ RegAllocPriorityAdvisor(const RegAllocPriorityAdvisor &) = delete;
+ RegAllocPriorityAdvisor(RegAllocPriorityAdvisor &&) = delete;
+ virtual ~RegAllocPriorityAdvisor() = default;
+
+ /// Find the priority value for a live range. Advisors may compute it as a
+ /// float internally (ML prefers floats) even though the interface returns an
+ /// unsigned value.
+ virtual unsigned getPriority(const LiveInterval &LI) const = 0;
+
+ RegAllocPriorityAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ SlotIndexes *const Indexes);
+
+protected:
+ const RAGreedy &RA;
+ LiveIntervals *const LIS;
+ VirtRegMap *const VRM;
+ MachineRegisterInfo *const MRI;
+ const TargetRegisterInfo *const TRI;
+ const RegisterClassInfo &RegClassInfo;
+ SlotIndexes *const Indexes;
+ const bool RegClassPriorityTrumpsGlobalness;
+ const bool ReverseLocalAssignment;
+};
+
+class DefaultPriorityAdvisor : public RegAllocPriorityAdvisor {
+public:
+ DefaultPriorityAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ SlotIndexes *const Indexes)
+ : RegAllocPriorityAdvisor(MF, RA, Indexes) {}
+
+private:
+ unsigned getPriority(const LiveInterval &LI) const override;
+};
+
+class RegAllocPriorityAdvisorAnalysis : public ImmutablePass {
+public:
+ enum class AdvisorMode : int { Default, Release, Development };
+
+ RegAllocPriorityAdvisorAnalysis(AdvisorMode Mode)
+ : ImmutablePass(ID), Mode(Mode){};
+ static char ID;
+
+ /// Get an advisor for the given context (i.e. machine function, etc)
+ virtual std::unique_ptr<RegAllocPriorityAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0;
+ AdvisorMode getAdvisorMode() const { return Mode; }
+ virtual void logRewardIfNeeded(const MachineFunction &MF,
+ llvm::function_ref<float()> GetReward){};
+
+protected:
+ // This analysis preserves everything, and subclasses may have additional
+ // requirements.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+private:
+ StringRef getPassName() const override;
+ const AdvisorMode Mode;
+};
+
+/// Specialization for the API used by the analysis infrastructure to create
+/// an instance of the priority advisor.
+template <> Pass *callDefaultCtor<RegAllocPriorityAdvisorAnalysis>();
+
+RegAllocPriorityAdvisorAnalysis *createReleaseModePriorityAdvisor();
+
+RegAllocPriorityAdvisorAnalysis *createDevelopmentModePriorityAdvisor();
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_REGALLOCPRIORITYADVISOR_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocScore.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocScore.cpp
new file mode 100644
index 000000000000..e420283dfcfa
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocScore.cpp
@@ -0,0 +1,121 @@
+//===- RegAllocScore.cpp - evaluate regalloc policy quality ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// Calculate a measure of the register allocation policy quality. This is used
+/// to construct a reward for the training of the ML-driven allocation policy.
+/// Currently, the score is the sum of the machine basic block frequency-weighted
+/// number of loads, stores, copies, and remat instructions, each factored with
+/// a relative weight.
+//===----------------------------------------------------------------------===//
+
+#include "RegAllocScore.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/ilist_iterator.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundleIterator.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+cl::opt<double> CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden);
+cl::opt<double> LoadWeight("regalloc-load-weight", cl::init(4.0), cl::Hidden);
+cl::opt<double> StoreWeight("regalloc-store-weight", cl::init(1.0), cl::Hidden);
+cl::opt<double> CheapRematWeight("regalloc-cheap-remat-weight", cl::init(0.2),
+ cl::Hidden);
+cl::opt<double> ExpensiveRematWeight("regalloc-expensive-remat-weight",
+ cl::init(1.0), cl::Hidden);
+#define DEBUG_TYPE "regalloc-score"
+
+RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) {
+ CopyCounts += Other.copyCounts();
+ LoadCounts += Other.loadCounts();
+ StoreCounts += Other.storeCounts();
+ LoadStoreCounts += Other.loadStoreCounts();
+ CheapRematCounts += Other.cheapRematCounts();
+ ExpensiveRematCounts += Other.expensiveRematCounts();
+ return *this;
+}
+
+bool RegAllocScore::operator==(const RegAllocScore &Other) const {
+ return copyCounts() == Other.copyCounts() &&
+ loadCounts() == Other.loadCounts() &&
+ storeCounts() == Other.storeCounts() &&
+ loadStoreCounts() == Other.loadStoreCounts() &&
+ cheapRematCounts() == Other.cheapRematCounts() &&
+ expensiveRematCounts() == Other.expensiveRematCounts();
+}
+
+bool RegAllocScore::operator!=(const RegAllocScore &Other) const {
+ return !(*this == Other);
+}
+
+double RegAllocScore::getScore() const {
+ double Ret = 0.0;
+ Ret += CopyWeight * copyCounts();
+ Ret += LoadWeight * loadCounts();
+ Ret += StoreWeight * storeCounts();
+ Ret += (LoadWeight + StoreWeight) * loadStoreCounts();
+ Ret += CheapRematWeight * cheapRematCounts();
+ Ret += ExpensiveRematWeight * expensiveRematCounts();
+
+ return Ret;
+}
+
+RegAllocScore
+llvm::calculateRegAllocScore(const MachineFunction &MF,
+ const MachineBlockFrequencyInfo &MBFI) {
+ return calculateRegAllocScore(
+ MF,
+ [&](const MachineBasicBlock &MBB) {
+ return MBFI.getBlockFreqRelativeToEntryBlock(&MBB);
+ },
+ [&](const MachineInstr &MI) {
+ return MF.getSubtarget().getInstrInfo()->isTriviallyReMaterializable(
+ MI);
+ });
+}
+
+RegAllocScore llvm::calculateRegAllocScore(
+ const MachineFunction &MF,
+ llvm::function_ref<double(const MachineBasicBlock &)> GetBBFreq,
+ llvm::function_ref<bool(const MachineInstr &)>
+ IsTriviallyRematerializable) {
+ RegAllocScore Total;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ double BlockFreqRelativeToEntrypoint = GetBBFreq(MBB);
+ RegAllocScore MBBScore;
+
+ for (const MachineInstr &MI : MBB) {
+ if (MI.isDebugInstr() || MI.isKill() || MI.isInlineAsm()) {
+ continue;
+ }
+ if (MI.isCopy()) {
+ MBBScore.onCopy(BlockFreqRelativeToEntrypoint);
+ } else if (IsTriviallyRematerializable(MI)) {
+ if (MI.getDesc().isAsCheapAsAMove()) {
+ MBBScore.onCheapRemat(BlockFreqRelativeToEntrypoint);
+ } else {
+ MBBScore.onExpensiveRemat(BlockFreqRelativeToEntrypoint);
+ }
+ } else if (MI.mayLoad() && MI.mayStore()) {
+ MBBScore.onLoadStore(BlockFreqRelativeToEntrypoint);
+ } else if (MI.mayLoad()) {
+ MBBScore.onLoad(BlockFreqRelativeToEntrypoint);
+ } else if (MI.mayStore()) {
+ MBBScore.onStore(BlockFreqRelativeToEntrypoint);
+ }
+ }
+ Total += MBBScore;
+ }
+ return Total;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegAllocScore.h b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocScore.h
new file mode 100644
index 000000000000..b80adae29f23
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegAllocScore.h
@@ -0,0 +1,73 @@
+//==- RegAllocScore.h - evaluate regalloc policy quality ----------*-C++-*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// Calculate a measure of the register allocation policy quality. This is used
+/// to construct a reward for the training of the ML-driven allocation policy.
+/// Currently, the score is the sum of the machine basic block frequency-weighted
+/// number of loads, stores, copies, and remat instructions, each factored with
+/// a relative weight.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCSCORE_H_
+#define LLVM_CODEGEN_REGALLOCSCORE_H_
+
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+namespace llvm {
+
+class MachineBasicBlock;
+class MachineBlockFrequencyInfo;
+class MachineFunction;
+class MachineInstr;
+
+/// Regalloc score.
+class RegAllocScore final {
+ double CopyCounts = 0.0;
+ double LoadCounts = 0.0;
+ double StoreCounts = 0.0;
+ double CheapRematCounts = 0.0;
+ double LoadStoreCounts = 0.0;
+ double ExpensiveRematCounts = 0.0;
+
+public:
+ RegAllocScore() = default;
+ RegAllocScore(const RegAllocScore &) = default;
+
+ double copyCounts() const { return CopyCounts; }
+ double loadCounts() const { return LoadCounts; }
+ double storeCounts() const { return StoreCounts; }
+ double loadStoreCounts() const { return LoadStoreCounts; }
+ double expensiveRematCounts() const { return ExpensiveRematCounts; }
+ double cheapRematCounts() const { return CheapRematCounts; }
+
+ void onCopy(double Freq) { CopyCounts += Freq; }
+ void onLoad(double Freq) { LoadCounts += Freq; }
+ void onStore(double Freq) { StoreCounts += Freq; }
+ void onLoadStore(double Freq) { LoadStoreCounts += Freq; }
+ void onExpensiveRemat(double Freq) { ExpensiveRematCounts += Freq; }
+ void onCheapRemat(double Freq) { CheapRematCounts += Freq; }
+
+ RegAllocScore &operator+=(const RegAllocScore &Other);
+ bool operator==(const RegAllocScore &Other) const;
+ bool operator!=(const RegAllocScore &Other) const;
+ double getScore() const;
+};
+
+/// Calculate a score. When comparing two scores for the same function but
+/// different policies, the better policy has the smaller score.
+/// The implementation is the overload below (which is also easily unittestable).
+RegAllocScore calculateRegAllocScore(const MachineFunction &MF,
+ const MachineBlockFrequencyInfo &MBFI);
+
+/// Implementation of the above, which is also more easily unittestable.
+RegAllocScore calculateRegAllocScore(
+ const MachineFunction &MF,
+ llvm::function_ref<double(const MachineBasicBlock &)> GetBBFreq,
+ llvm::function_ref<bool(const MachineInstr &)> IsTriviallyRematerializable);
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_REGALLOCSCORE_H_
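
A hedged sketch of how the lambda-based overload declared above might be driven from a unit test. It assumes an llvm::MachineFunction MF obtained elsewhere (for example from parsed MIR) and an include path for this private lib/CodeGen header; both lambdas are deliberately trivial stand-ins for real frequency and rematerialization queries.

// Hypothetical test helper; MF must come from an already-built MachineFunction
// and the include path for this private header is an assumption.
#include "RegAllocScore.h"
#include "llvm/CodeGen/MachineFunction.h"

double scoreWithFlatFrequencies(const llvm::MachineFunction &MF) {
  llvm::RegAllocScore Score = llvm::calculateRegAllocScore(
      MF,
      // Pretend every block runs exactly as often as the entry block.
      [](const llvm::MachineBasicBlock &) { return 1.0; },
      // Pretend nothing is trivially rematerializable.
      [](const llvm::MachineInstr &) { return false; });
  return Score.getScore();
}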
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
new file mode 100644
index 000000000000..6657cf3c1ef4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -0,0 +1,215 @@
+//===-- RegUsageInfoCollector.cpp - Register Usage Information Collector --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+/// This pass is a simple MachineFunction pass that collects register usage
+/// details by iterating over each physical register and checking, via the
+/// MachineRegisterInfo, whether it is clobbered, then creates a RegMask based
+/// on those details.
+/// The pass then stores this RegMask in PhysicalRegisterUsageInfo.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ip-regalloc"
+
+STATISTIC(NumCSROpt,
+ "Number of functions optimized for callee saved registers");
+
+namespace {
+
+class RegUsageInfoCollector : public MachineFunctionPass {
+public:
+ RegUsageInfoCollector() : MachineFunctionPass(ID) {
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeRegUsageInfoCollectorPass(Registry);
+ }
+
+ StringRef getPassName() const override {
+ return "Register Usage Information Collector Pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<PhysicalRegisterUsageInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ // Call getCalleeSaves and then also set the bits for subregs and
+ // fully saved superregs.
+ static void computeCalleeSavedRegs(BitVector &SavedRegs, MachineFunction &MF);
+
+ static char ID;
+};
+
+} // end of anonymous namespace
+
+char RegUsageInfoCollector::ID = 0;
+
+INITIALIZE_PASS_BEGIN(RegUsageInfoCollector, "RegUsageInfoCollector",
+ "Register Usage Information Collector", false, false)
+INITIALIZE_PASS_DEPENDENCY(PhysicalRegisterUsageInfo)
+INITIALIZE_PASS_END(RegUsageInfoCollector, "RegUsageInfoCollector",
+ "Register Usage Information Collector", false, false)
+
+FunctionPass *llvm::createRegUsageInfoCollector() {
+ return new RegUsageInfoCollector();
+}
+
+// TODO: Move to hook somewhere?
+
+// Return true if it is useful to track the used registers for IPRA / no CSR
+// optimizations. This is not useful for entry points, and computing the
+// register usage information is expensive.
+static bool isCallableFunction(const MachineFunction &MF) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_LS:
+ case CallingConv::AMDGPU_KERNEL:
+ return false;
+ default:
+ return true;
+ }
+}
+
+bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const LLVMTargetMachine &TM = MF.getTarget();
+
+ LLVM_DEBUG(dbgs() << " -------------------- " << getPassName()
+ << " -------------------- \nFunction Name : "
+ << MF.getName() << '\n');
+
+ // Analyzing the register usage may be expensive on some targets.
+ if (!isCallableFunction(MF)) {
+ LLVM_DEBUG(dbgs() << "Not analyzing non-callable function\n");
+ return false;
+ }
+
+ // If there are no callers, there's no point in computing more precise
+ // register usage here.
+ if (MF.getFunction().use_empty()) {
+ LLVM_DEBUG(dbgs() << "Not analyzing function with no callers\n");
+ return false;
+ }
+
+ std::vector<uint32_t> RegMask;
+
+  // Compute the size of the bit vector to represent all the registers.
+  // The bit vector is broken into 32-bit chunks, so its size is the number
+  // of registers divided by 32, rounded up.
+ unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
+ RegMask.resize(RegMaskSize, ~((uint32_t)0));
+
+ const Function &F = MF.getFunction();
+
+ PhysicalRegisterUsageInfo &PRUI = getAnalysis<PhysicalRegisterUsageInfo>();
+ PRUI.setTargetMachine(TM);
+
+ LLVM_DEBUG(dbgs() << "Clobbered Registers: ");
+
+ BitVector SavedRegs;
+ computeCalleeSavedRegs(SavedRegs, MF);
+
+ const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask();
+ auto SetRegAsDefined = [&RegMask] (unsigned Reg) {
+ RegMask[Reg / 32] &= ~(1u << Reg % 32);
+ };
+
+ // Some targets can clobber registers "inside" a call, typically in
+ // linker-generated code.
+ for (const MCPhysReg Reg : TRI->getIntraCallClobberedRegs(&MF))
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ SetRegAsDefined(*AI);
+
+ // Scan all the physical registers. When a register is defined in the current
+ // function set it and all the aliasing registers as defined in the regmask.
+ // FIXME: Rewrite to use regunits.
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+ // Don't count registers that are saved and restored.
+ if (SavedRegs.test(PReg))
+ continue;
+    // If a register is defined by an instruction, mark it as defined together
+    // with all its unsaved aliases.
+ if (!MRI->def_empty(PReg)) {
+ for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI)
+ if (!SavedRegs.test(*AI))
+ SetRegAsDefined(*AI);
+ continue;
+ }
+    // If a register is in the UsedPhysRegsMask set then mark it as defined.
+    // All clobbered aliases will also be in the set, so we can skip marking
+    // all the aliases as defined here.
+ if (UsedPhysRegsMask.test(PReg))
+ SetRegAsDefined(PReg);
+ }
+
+ if (TargetFrameLowering::isSafeForNoCSROpt(F) &&
+ MF.getSubtarget().getFrameLowering()->isProfitableForNoCSROpt(F)) {
+ ++NumCSROpt;
+ LLVM_DEBUG(dbgs() << MF.getName()
+ << " function optimized for not having CSR.\n");
+ }
+
+ LLVM_DEBUG(
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+ if (MachineOperand::clobbersPhysReg(&(RegMask[0]), PReg))
+ dbgs() << printReg(PReg, TRI) << " ";
+ }
+
+ dbgs() << " \n----------------------------------------\n";
+ );
+
+ PRUI.storeUpdateRegUsageInfo(F, RegMask);
+
+ return false;
+}
+
+void RegUsageInfoCollector::
+computeCalleeSavedRegs(BitVector &SavedRegs, MachineFunction &MF) {
+ const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ // Target will return the set of registers that it saves/restores as needed.
+ SavedRegs.clear();
+ TFI.getCalleeSaves(MF, SavedRegs);
+ if (SavedRegs.none())
+ return;
+
+ // Insert subregs.
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ MCPhysReg Reg = CSRegs[i];
+ if (SavedRegs.test(Reg)) {
+ // Save subregisters
+ for (MCPhysReg SR : TRI.subregs(Reg))
+ SavedRegs.set(SR);
+ }
+ }
+}
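
The regmask convention used by this pass is easy to get backwards, so here is a standalone illustration (plain C++, not LLVM code): the mask starts all-ones meaning every register is preserved, SetRegAsDefined clears the bit of a clobbered register, and a query analogous to MachineOperand::clobbersPhysReg reports a register as clobbered exactly when its bit is clear. The register numbers below are arbitrary and not tied to any real target.

// Standalone illustration of the regmask bit convention used above.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const unsigned NumRegs = 100;
  // Ceil(NumRegs / 32) 32-bit chunks, all-ones == "every register preserved".
  std::vector<uint32_t> RegMask((NumRegs + 31) / 32, ~uint32_t(0));

  auto SetRegAsDefined = [&RegMask](unsigned Reg) {
    RegMask[Reg / 32] &= ~(1u << (Reg % 32)); // clear bit => clobbered
  };
  auto ClobbersPhysReg = [&RegMask](unsigned Reg) {
    return !(RegMask[Reg / 32] & (1u << (Reg % 32)));
  };

  SetRegAsDefined(37);          // pretend register 37 is defined in the function
  assert(ClobbersPhysReg(37));  // so a call through this mask clobbers it
  assert(!ClobbersPhysReg(36)); // untouched registers stay preserved
  return 0;
}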
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
new file mode 100644
index 000000000000..d356962e0d78
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
@@ -0,0 +1,154 @@
+//=--- RegUsageInfoPropagate.cpp - Register Usage Information Propagation ---=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+/// This pass iterates through the MachineInstrs in a given MachineFunction
+/// and at each call site queries RegisterUsageInfo for the RegMask (computed
+/// from the callee's actual register allocation) of the callee function. If
+/// that RegMask detail is available, the pass updates the RegMask of the call
+/// instruction. This updated RegMask is then used by the register allocator
+/// while allocating the current MachineFunction.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ip-regalloc"
+
+#define RUIP_NAME "Register Usage Information Propagation"
+
+namespace {
+
+class RegUsageInfoPropagation : public MachineFunctionPass {
+public:
+ RegUsageInfoPropagation() : MachineFunctionPass(ID) {
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeRegUsageInfoPropagationPass(Registry);
+ }
+
+ StringRef getPassName() const override { return RUIP_NAME; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<PhysicalRegisterUsageInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ static char ID;
+
+private:
+ static void setRegMask(MachineInstr &MI, ArrayRef<uint32_t> RegMask) {
+ assert(RegMask.size() ==
+ MachineOperand::getRegMaskSize(MI.getParent()->getParent()
+ ->getRegInfo().getTargetRegisterInfo()
+ ->getNumRegs())
+ && "expected register mask size");
+ for (MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask())
+ MO.setRegMask(RegMask.data());
+ }
+ }
+};
+
+} // end of anonymous namespace
+
+INITIALIZE_PASS_BEGIN(RegUsageInfoPropagation, "reg-usage-propagation",
+ RUIP_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(PhysicalRegisterUsageInfo)
+INITIALIZE_PASS_END(RegUsageInfoPropagation, "reg-usage-propagation",
+ RUIP_NAME, false, false)
+
+char RegUsageInfoPropagation::ID = 0;
+
+// Assumes call instructions have a single reference to a function.
+static const Function *findCalledFunction(const Module &M,
+ const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isGlobal())
+ return dyn_cast<const Function>(MO.getGlobal());
+
+ if (MO.isSymbol())
+ return M.getFunction(MO.getSymbolName());
+ }
+
+ return nullptr;
+}
+
+bool RegUsageInfoPropagation::runOnMachineFunction(MachineFunction &MF) {
+ const Module &M = *MF.getFunction().getParent();
+ PhysicalRegisterUsageInfo *PRUI = &getAnalysis<PhysicalRegisterUsageInfo>();
+
+ LLVM_DEBUG(dbgs() << " ++++++++++++++++++++ " << getPassName()
+ << " ++++++++++++++++++++ \n");
+ LLVM_DEBUG(dbgs() << "MachineFunction : " << MF.getName() << "\n");
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.hasCalls() && !MFI.hasTailCall())
+ return false;
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isCall())
+ continue;
+ LLVM_DEBUG(
+ dbgs()
+ << "Call Instruction Before Register Usage Info Propagation : \n"
+ << MI << "\n");
+
+ auto UpdateRegMask = [&](const Function &F) {
+ const ArrayRef<uint32_t> RegMask = PRUI->getRegUsageInfo(F);
+ if (RegMask.empty())
+ return;
+ setRegMask(MI, RegMask);
+ Changed = true;
+ };
+
+ if (const Function *F = findCalledFunction(M, MI)) {
+ if (F->isDefinitionExact()) {
+ UpdateRegMask(*F);
+ } else {
+ LLVM_DEBUG(dbgs() << "Function definition is not exact\n");
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << "Failed to find call target function\n");
+ }
+
+ LLVM_DEBUG(
+ dbgs()
+ << "Call Instruction After Register Usage Info Propagation : \n"
+ << MI << '\n');
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << " +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+ "++++++ \n");
+ return Changed;
+}
+
+FunctionPass *llvm::createRegUsageInfoPropPass() {
+ return new RegUsageInfoPropagation();
+}
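
To see why replacing the call-site regmask is worthwhile, here is a made-up, standalone illustration: a conservative default mask preserves only the callee-saved registers, while a mask computed from the callee's actual allocation additionally preserves every register the callee never touches, so the caller's allocator can keep more values live across the call. The 32-register target and the mask values are invented for illustration.

// Made-up 32-register target: bit set == preserved across the call.
#include <bitset>
#include <cstdint>
#include <iostream>

int main() {
  // Conservative default: only hypothetical callee-saved registers 8-15.
  uint32_t DefaultMask = 0x0000FF00u;
  // Precise mask from the callee's actual allocation: the callee also never
  // touches registers 20-31, so those are preserved as well.
  uint32_t PreciseMask = 0xFFF0FF00u;

  // The precise mask only ever adds preserved registers: everything preserved
  // by the default mask is still preserved by the precise one.
  bool Refines = (PreciseMask & DefaultMask) == DefaultMask;
  std::cout << "precise mask preserves "
            << std::bitset<32>(PreciseMask).count() << " regs vs "
            << std::bitset<32>(DefaultMask).count()
            << " (refines: " << Refines << ")\n";
  return 0;
}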
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterBank.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterBank.cpp
new file mode 100644
index 000000000000..8e0a0b0dc282
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterBank.cpp
@@ -0,0 +1,112 @@
+//===- llvm/CodeGen/GlobalISel/RegisterBank.cpp - Register Bank --*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegisterBank class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterBank.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/RegisterBankInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "registerbank"
+
+using namespace llvm;
+
+const unsigned RegisterBank::InvalidID = UINT_MAX;
+
+RegisterBank::RegisterBank(unsigned ID, const char *Name,
+ const uint32_t *CoveredClasses,
+ unsigned NumRegClasses)
+ : ID(ID), Name(Name) {
+ ContainedRegClasses.resize(NumRegClasses);
+ ContainedRegClasses.setBitsInMask(CoveredClasses);
+}
+
+bool RegisterBank::verify(const RegisterBankInfo &RBI,
+ const TargetRegisterInfo &TRI) const {
+ assert(isValid() && "Invalid register bank");
+ for (unsigned RCId = 0, End = TRI.getNumRegClasses(); RCId != End; ++RCId) {
+ const TargetRegisterClass &RC = *TRI.getRegClass(RCId);
+
+ if (!covers(RC))
+ continue;
+ // Verify that the register bank covers all the sub classes of the
+ // classes it covers.
+
+ // Use a different (slow in that case) method than
+ // RegisterBankInfo to find the subclasses of RC, to make sure
+ // both agree on the covers.
+ for (unsigned SubRCId = 0; SubRCId != End; ++SubRCId) {
+      const TargetRegisterClass &SubRC = *TRI.getRegClass(SubRCId);
+
+ if (!RC.hasSubClassEq(&SubRC))
+ continue;
+
+ // Verify that the Size of the register bank is big enough to cover
+ // all the register classes it covers.
+ assert(RBI.getMaximumSize(getID()) >= TRI.getRegSizeInBits(SubRC) &&
+ "Size is not big enough for all the subclasses!");
+ assert(covers(SubRC) && "Not all subclasses are covered");
+ }
+ }
+ return true;
+}
+
+bool RegisterBank::covers(const TargetRegisterClass &RC) const {
+ assert(isValid() && "RB hasn't been initialized yet");
+ return ContainedRegClasses.test(RC.getID());
+}
+
+bool RegisterBank::isValid() const {
+ return ID != InvalidID && Name != nullptr &&
+ // A register bank that does not cover anything is useless.
+ !ContainedRegClasses.empty();
+}
+
+bool RegisterBank::operator==(const RegisterBank &OtherRB) const {
+ // There must be only one instance of a given register bank alive
+ // for the whole compilation.
+ // The RegisterBankInfo is supposed to enforce that.
+ assert((OtherRB.getID() != getID() || &OtherRB == this) &&
+ "ID does not uniquely identify a RegisterBank");
+ return &OtherRB == this;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const {
+ print(dbgs(), /* IsForDebug */ true, TRI);
+}
+#endif
+
+void RegisterBank::print(raw_ostream &OS, bool IsForDebug,
+ const TargetRegisterInfo *TRI) const {
+ OS << getName();
+ if (!IsForDebug)
+ return;
+ OS << "(ID:" << getID() << ")\n"
+ << "isValid:" << isValid() << '\n'
+ << "Number of Covered register classes: " << ContainedRegClasses.count()
+ << '\n';
+ // Print all the subclasses if we can.
+  // The register classes may not be properly initialized yet.
+ if (!TRI || ContainedRegClasses.empty())
+ return;
+ assert(ContainedRegClasses.size() == TRI->getNumRegClasses() &&
+ "TRI does not match the initialization process?");
+ OS << "Covered register classes:\n";
+ ListSeparator LS;
+ for (unsigned RCId = 0, End = TRI->getNumRegClasses(); RCId != End; ++RCId) {
+ const TargetRegisterClass &RC = *TRI->getRegClass(RCId);
+
+ if (covers(RC))
+ OS << LS << TRI->getRegClassName(&RC);
+ }
+}
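
covers() above is simply a bit test against the set of register-class IDs fixed at construction. A minimal standalone analogue, with invented class IDs and no LLVM dependencies:

// Minimal analogue of RegisterBank::covers(): a bank holds a set of register
// class IDs and covers() is membership. The class IDs here are invented.
#include <cassert>
#include <vector>

struct ToyRegisterBank {
  std::vector<bool> ContainedRegClasses; // indexed by register class ID
  bool covers(unsigned RCId) const { return ContainedRegClasses[RCId]; }
};

int main() {
  ToyRegisterBank GPRBank{std::vector<bool>(8, false)};
  GPRBank.ContainedRegClasses[0] = true; // e.g. "GPR32"
  GPRBank.ContainedRegClasses[1] = true; // e.g. "GPR64"
  assert(GPRBank.covers(1));
  assert(!GPRBank.covers(5));            // e.g. a vector class it does not cover
  return 0;
}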
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterBankInfo.cpp
new file mode 100644
index 000000000000..658a09fd8700
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterBankInfo.cpp
@@ -0,0 +1,817 @@
+//===- llvm/CodeGen/GlobalISel/RegisterBankInfo.cpp --------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegisterBankInfo class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterBankInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterBank.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm> // For std::max.
+
+#define DEBUG_TYPE "registerbankinfo"
+
+using namespace llvm;
+
+STATISTIC(NumPartialMappingsCreated,
+ "Number of partial mappings dynamically created");
+STATISTIC(NumPartialMappingsAccessed,
+ "Number of partial mappings dynamically accessed");
+STATISTIC(NumValueMappingsCreated,
+ "Number of value mappings dynamically created");
+STATISTIC(NumValueMappingsAccessed,
+ "Number of value mappings dynamically accessed");
+STATISTIC(NumOperandsMappingsCreated,
+ "Number of operands mappings dynamically created");
+STATISTIC(NumOperandsMappingsAccessed,
+ "Number of operands mappings dynamically accessed");
+STATISTIC(NumInstructionMappingsCreated,
+ "Number of instruction mappings dynamically created");
+STATISTIC(NumInstructionMappingsAccessed,
+ "Number of instruction mappings dynamically accessed");
+
+const unsigned RegisterBankInfo::DefaultMappingID = UINT_MAX;
+const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1;
+
+//------------------------------------------------------------------------------
+// RegisterBankInfo implementation.
+//------------------------------------------------------------------------------
+RegisterBankInfo::RegisterBankInfo(const RegisterBank **RegBanks,
+ unsigned NumRegBanks, const unsigned *Sizes,
+ unsigned HwMode)
+ : RegBanks(RegBanks), NumRegBanks(NumRegBanks), Sizes(Sizes),
+ HwMode(HwMode) {
+#ifndef NDEBUG
+ for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) {
+ assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank");
+ assert(RegBanks[Idx]->isValid() && "RegisterBank should be valid");
+ }
+#endif // NDEBUG
+}
+
+bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
+#ifndef NDEBUG
+ for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) {
+ const RegisterBank &RegBank = getRegBank(Idx);
+ assert(Idx == RegBank.getID() &&
+ "ID does not match the index in the array");
+ LLVM_DEBUG(dbgs() << "Verify " << RegBank << '\n');
+ assert(RegBank.verify(*this, TRI) && "RegBank is invalid");
+ }
+#endif // NDEBUG
+ return true;
+}
+
+const RegisterBank *
+RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ if (!Reg.isVirtual()) {
+ // FIXME: This was probably a copy to a virtual register that does have a
+ // type we could use.
+ const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg, TRI);
+ return RC ? &getRegBankFromRegClass(*RC, LLT()) : nullptr;
+ }
+
+ const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ if (auto *RB = dyn_cast_if_present<const RegisterBank *>(RegClassOrBank))
+ return RB;
+ if (auto *RC =
+ dyn_cast_if_present<const TargetRegisterClass *>(RegClassOrBank))
+ return &getRegBankFromRegClass(*RC, MRI.getType(Reg));
+ return nullptr;
+}
+
+const TargetRegisterClass *
+RegisterBankInfo::getMinimalPhysRegClass(Register Reg,
+ const TargetRegisterInfo &TRI) const {
+ assert(Reg.isPhysical() && "Reg must be a physreg");
+ const auto &RegRCIt = PhysRegMinimalRCs.find(Reg);
+ if (RegRCIt != PhysRegMinimalRCs.end())
+ return RegRCIt->second;
+ const TargetRegisterClass *PhysRC = TRI.getMinimalPhysRegClassLLT(Reg, LLT());
+ PhysRegMinimalRCs[Reg] = PhysRC;
+ return PhysRC;
+}
+
+const RegisterBank *RegisterBankInfo::getRegBankFromConstraints(
+ const MachineInstr &MI, unsigned OpIdx, const TargetInstrInfo &TII,
+ const MachineRegisterInfo &MRI) const {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+
+ // The mapping of the registers may be available via the
+ // register class constraints.
+ const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx, &TII, TRI);
+
+ if (!RC)
+ return nullptr;
+
+ Register Reg = MI.getOperand(OpIdx).getReg();
+ const RegisterBank &RegBank = getRegBankFromRegClass(*RC, MRI.getType(Reg));
+ // Check that the target properly implemented getRegBankFromRegClass.
+ assert(RegBank.covers(*RC) &&
+ "The mapping of the register bank does not make sense");
+ return &RegBank;
+}
+
+const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister(
+ Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI) {
+
+ // If the register already has a class, fallback to MRI::constrainRegClass.
+ auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ if (isa<const TargetRegisterClass *>(RegClassOrBank))
+ return MRI.constrainRegClass(Reg, &RC);
+
+ const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
+ // Otherwise, all we can do is ensure the bank covers the class, and set it.
+ if (RB && !RB->covers(RC))
+ return nullptr;
+
+ // If nothing was set or the class is simply compatible, set it.
+ MRI.setRegClass(Reg, &RC);
+ return &RC;
+}
+
+/// Check whether or not \p MI should be treated like a copy
+/// for the mappings.
+/// Copy-like instructions are special for mapping because
+/// they don't have actual register constraints. Moreover,
+/// they sometimes have register classes assigned and we can
+/// just use those instead of failing to provide a generic mapping.
+static bool isCopyLike(const MachineInstr &MI) {
+ return MI.isCopy() || MI.isPHI() ||
+ MI.getOpcode() == TargetOpcode::REG_SEQUENCE;
+}
+
+const RegisterBankInfo::InstructionMapping &
+RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
+ // For copies we want to walk over the operands and try to find one
+ // that has a register bank since the instruction itself will not get
+ // us any constraint.
+ bool IsCopyLike = isCopyLike(MI);
+  // For copy-like instructions, only the mapping of the definition
+  // is important. The rest is not constrained.
+ unsigned NumOperandsForMapping = IsCopyLike ? 1 : MI.getNumOperands();
+
+ const MachineFunction &MF = *MI.getMF();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ // We may need to query the instruction encoding to guess the mapping.
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+  // Before doing anything complicated, check whether the mapping is
+  // directly available.
+ bool CompleteMapping = true;
+
+ SmallVector<const ValueMapping *, 8> OperandsMapping(NumOperandsForMapping);
+ for (unsigned OpIdx = 0, EndIdx = MI.getNumOperands(); OpIdx != EndIdx;
+ ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+      // The register bank of Reg is just a side effect of the current
+      // execution and, in particular, there is no reason to believe this
+      // is the best default mapping for the current instruction. Keep
+      // it as an alternative register bank if we cannot figure out
+      // anything better.
+ const RegisterBank *AltRegBank = getRegBank(Reg, MRI, TRI);
+      // For copy-like instructions, we want to reuse the register bank
+      // that is already set on Reg, if any, since those instructions do
+      // not have any constraints.
+ const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr;
+ if (!CurRegBank) {
+ // If this is a target specific instruction, we can deduce
+ // the register bank from the encoding constraints.
+ CurRegBank = getRegBankFromConstraints(MI, OpIdx, TII, MRI);
+ if (!CurRegBank) {
+ // All our attempts failed, give up.
+ CompleteMapping = false;
+
+ if (!IsCopyLike)
+ // MI does not carry enough information to guess the mapping.
+ return getInvalidInstructionMapping();
+ continue;
+ }
+ }
+
+ unsigned Size = getSizeInBits(Reg, MRI, TRI);
+ const ValueMapping *ValMapping = &getValueMapping(0, Size, *CurRegBank);
+ if (IsCopyLike) {
+ if (!OperandsMapping[0]) {
+ if (MI.isRegSequence()) {
+ // For reg_sequence, the result size does not match the input.
+ unsigned ResultSize = getSizeInBits(MI.getOperand(0).getReg(),
+ MRI, TRI);
+ OperandsMapping[0] = &getValueMapping(0, ResultSize, *CurRegBank);
+ } else {
+ OperandsMapping[0] = ValMapping;
+ }
+ }
+
+ // The default handling assumes any register bank can be copied to any
+ // other. If this isn't the case, the target should specially deal with
+ // reg_sequence/phi. There may also be unsatisfiable copies.
+ for (; OpIdx != EndIdx; ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ const RegisterBank *AltRegBank = getRegBank(Reg, MRI, TRI);
+ if (AltRegBank &&
+ cannotCopy(*CurRegBank, *AltRegBank, getSizeInBits(Reg, MRI, TRI)))
+ return getInvalidInstructionMapping();
+ }
+
+ CompleteMapping = true;
+ break;
+ }
+
+ OperandsMapping[OpIdx] = ValMapping;
+ }
+
+ if (IsCopyLike && !CompleteMapping) {
+ // No way to deduce the type from what we have.
+ return getInvalidInstructionMapping();
+ }
+
+  assert(CompleteMapping && "Setting an incomplete mapping");
+ return getInstructionMapping(
+ DefaultMappingID, /*Cost*/ 1,
+ /*OperandsMapping*/ getOperandsMapping(OperandsMapping),
+ NumOperandsForMapping);
+}
+
+/// Hashing function for PartialMapping.
+static hash_code hashPartialMapping(unsigned StartIdx, unsigned Length,
+ const RegisterBank *RegBank) {
+ return hash_combine(StartIdx, Length, RegBank ? RegBank->getID() : 0);
+}
+
+/// Overloaded version of hash_value for a PartialMapping.
+hash_code
+llvm::hash_value(const RegisterBankInfo::PartialMapping &PartMapping) {
+ return hashPartialMapping(PartMapping.StartIdx, PartMapping.Length,
+ PartMapping.RegBank);
+}
+
+const RegisterBankInfo::PartialMapping &
+RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length,
+ const RegisterBank &RegBank) const {
+ ++NumPartialMappingsAccessed;
+
+ hash_code Hash = hashPartialMapping(StartIdx, Length, &RegBank);
+ const auto &It = MapOfPartialMappings.find(Hash);
+ if (It != MapOfPartialMappings.end())
+ return *It->second;
+
+ ++NumPartialMappingsCreated;
+
+ auto &PartMapping = MapOfPartialMappings[Hash];
+ PartMapping = std::make_unique<PartialMapping>(StartIdx, Length, RegBank);
+ return *PartMapping;
+}
+
+const RegisterBankInfo::ValueMapping &
+RegisterBankInfo::getValueMapping(unsigned StartIdx, unsigned Length,
+ const RegisterBank &RegBank) const {
+ return getValueMapping(&getPartialMapping(StartIdx, Length, RegBank), 1);
+}
+
+static hash_code
+hashValueMapping(const RegisterBankInfo::PartialMapping *BreakDown,
+ unsigned NumBreakDowns) {
+ if (LLVM_LIKELY(NumBreakDowns == 1))
+ return hash_value(*BreakDown);
+ SmallVector<size_t, 8> Hashes(NumBreakDowns);
+ for (unsigned Idx = 0; Idx != NumBreakDowns; ++Idx)
+ Hashes.push_back(hash_value(BreakDown[Idx]));
+ return hash_combine_range(Hashes.begin(), Hashes.end());
+}
+
+const RegisterBankInfo::ValueMapping &
+RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown,
+ unsigned NumBreakDowns) const {
+ ++NumValueMappingsAccessed;
+
+ hash_code Hash = hashValueMapping(BreakDown, NumBreakDowns);
+ const auto &It = MapOfValueMappings.find(Hash);
+ if (It != MapOfValueMappings.end())
+ return *It->second;
+
+ ++NumValueMappingsCreated;
+
+ auto &ValMapping = MapOfValueMappings[Hash];
+ ValMapping = std::make_unique<ValueMapping>(BreakDown, NumBreakDowns);
+ return *ValMapping;
+}
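
getPartialMapping() and getValueMapping() above follow a hash-consing pattern: hash the mapping's contents, look the hash up in a map of uniquely owned objects, and create the object only on the first request so that later requests return the same canonical instance. Below is a standalone sketch of the same pattern, using std::unordered_map in place of LLVM's DenseMap; the types and hash combiner are simplified stand-ins.

// Standalone sketch of the hash-consing cache used for mappings.
#include <cstddef>
#include <functional>
#include <iostream>
#include <memory>
#include <unordered_map>

struct PartialMapping {
  unsigned StartIdx, Length, BankID;
};

static std::size_t hashPartialMapping(unsigned StartIdx, unsigned Length,
                                      unsigned BankID) {
  // Any reasonable combiner works; the real code uses llvm::hash_combine.
  std::size_t H = std::hash<unsigned>{}(StartIdx);
  H = H * 31 + std::hash<unsigned>{}(Length);
  return H * 31 + std::hash<unsigned>{}(BankID);
}

const PartialMapping &getPartialMapping(
    std::unordered_map<std::size_t, std::unique_ptr<PartialMapping>> &Cache,
    unsigned StartIdx, unsigned Length, unsigned BankID) {
  std::size_t Hash = hashPartialMapping(StartIdx, Length, BankID);
  auto &Slot = Cache[Hash];
  if (!Slot) // first request: create and remember the canonical object
    Slot.reset(new PartialMapping{StartIdx, Length, BankID});
  return *Slot;
}

int main() {
  std::unordered_map<std::size_t, std::unique_ptr<PartialMapping>> Cache;
  const PartialMapping &A = getPartialMapping(Cache, 0, 32, 1);
  const PartialMapping &B = getPartialMapping(Cache, 0, 32, 1);
  std::cout << "same object reused: " << (&A == &B) << "\n"; // prints 1
  return 0;
}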
+
+template <typename Iterator>
+const RegisterBankInfo::ValueMapping *
+RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {
+
+ ++NumOperandsMappingsAccessed;
+
+ // The addresses of the value mapping are unique.
+ // Therefore, we can use them directly to hash the operand mapping.
+ hash_code Hash = hash_combine_range(Begin, End);
+ auto &Res = MapOfOperandsMappings[Hash];
+ if (Res)
+ return Res.get();
+
+ ++NumOperandsMappingsCreated;
+
+ // Create the array of ValueMapping.
+ // Note: this array will not hash to this instance of operands
+ // mapping, because we use the pointer of the ValueMapping
+ // to hash and we expect them to uniquely identify an instance
+ // of value mapping.
+ Res = std::make_unique<ValueMapping[]>(std::distance(Begin, End));
+ unsigned Idx = 0;
+ for (Iterator It = Begin; It != End; ++It, ++Idx) {
+ const ValueMapping *ValMap = *It;
+ if (!ValMap)
+ continue;
+ Res[Idx] = *ValMap;
+ }
+ return Res.get();
+}
+
+const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping(
+ const SmallVectorImpl<const RegisterBankInfo::ValueMapping *> &OpdsMapping)
+ const {
+ return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end());
+}
+
+const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping(
+ std::initializer_list<const RegisterBankInfo::ValueMapping *> OpdsMapping)
+ const {
+ return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end());
+}
+
+static hash_code
+hashInstructionMapping(unsigned ID, unsigned Cost,
+ const RegisterBankInfo::ValueMapping *OperandsMapping,
+ unsigned NumOperands) {
+ return hash_combine(ID, Cost, OperandsMapping, NumOperands);
+}
+
+const RegisterBankInfo::InstructionMapping &
+RegisterBankInfo::getInstructionMappingImpl(
+ bool IsInvalid, unsigned ID, unsigned Cost,
+ const RegisterBankInfo::ValueMapping *OperandsMapping,
+ unsigned NumOperands) const {
+ assert(((IsInvalid && ID == InvalidMappingID && Cost == 0 &&
+ OperandsMapping == nullptr && NumOperands == 0) ||
+ !IsInvalid) &&
+ "Mismatch argument for invalid input");
+ ++NumInstructionMappingsAccessed;
+
+ hash_code Hash =
+ hashInstructionMapping(ID, Cost, OperandsMapping, NumOperands);
+ const auto &It = MapOfInstructionMappings.find(Hash);
+ if (It != MapOfInstructionMappings.end())
+ return *It->second;
+
+ ++NumInstructionMappingsCreated;
+
+ auto &InstrMapping = MapOfInstructionMappings[Hash];
+ InstrMapping = std::make_unique<InstructionMapping>(
+ ID, Cost, OperandsMapping, NumOperands);
+ return *InstrMapping;
+}
+
+const RegisterBankInfo::InstructionMapping &
+RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+ llvm_unreachable("The target must implement this");
+}
+
+RegisterBankInfo::InstructionMappings
+RegisterBankInfo::getInstrPossibleMappings(const MachineInstr &MI) const {
+ InstructionMappings PossibleMappings;
+ const auto &Mapping = getInstrMapping(MI);
+ if (Mapping.isValid()) {
+ // Put the default mapping first.
+ PossibleMappings.push_back(&Mapping);
+ }
+
+ // Then the alternative mapping, if any.
+ InstructionMappings AltMappings = getInstrAlternativeMappings(MI);
+ append_range(PossibleMappings, AltMappings);
+#ifndef NDEBUG
+ for (const InstructionMapping *Mapping : PossibleMappings)
+ assert(Mapping->verify(MI) && "Mapping is invalid");
+#endif
+ return PossibleMappings;
+}
+
+RegisterBankInfo::InstructionMappings
+RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
+ // No alternative for MI.
+ return InstructionMappings();
+}
+
+void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
+ MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+ LLVM_DEBUG(dbgs() << "Applying default-like mapping\n");
+ for (unsigned OpIdx = 0,
+ EndIdx = OpdMapper.getInstrMapping().getNumOperands();
+ OpIdx != EndIdx; ++OpIdx) {
+ LLVM_DEBUG(dbgs() << "OpIdx " << OpIdx);
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ LLVM_DEBUG(dbgs() << " is not a register, nothing to be done\n");
+ continue;
+ }
+ if (!MO.getReg()) {
+ LLVM_DEBUG(dbgs() << " is $noreg, nothing to be done\n");
+ continue;
+ }
+ LLT Ty = MRI.getType(MO.getReg());
+ if (!Ty.isValid())
+ continue;
+ assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns !=
+ 0 &&
+ "Invalid mapping");
+ assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns ==
+ 1 &&
+ "This mapping is too complex for this function");
+ iterator_range<SmallVectorImpl<Register>::const_iterator> NewRegs =
+ OpdMapper.getVRegs(OpIdx);
+ if (NewRegs.empty()) {
+ LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
+ continue;
+ }
+ Register OrigReg = MO.getReg();
+ Register NewReg = *NewRegs.begin();
+ LLVM_DEBUG(dbgs() << " changed, replace " << printReg(OrigReg, nullptr));
+ MO.setReg(NewReg);
+ LLVM_DEBUG(dbgs() << " with " << printReg(NewReg, nullptr));
+
+ // The OperandsMapper creates plain scalar, we may have to fix that.
+ // Check if the types match and if not, fix that.
+ LLT OrigTy = MRI.getType(OrigReg);
+ LLT NewTy = MRI.getType(NewReg);
+ if (OrigTy != NewTy) {
+ // The default mapping is not supposed to change the size of
+ // the storage. However, right now we don't necessarily bump all
+ // the types to storage size. For instance, we can consider
+ // s16 G_AND legal whereas the storage size is going to be 32.
+ assert(OrigTy.getSizeInBits() <= NewTy.getSizeInBits() &&
+ "Types with difference size cannot be handled by the default "
+ "mapping");
+ LLVM_DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to "
+ << OrigTy);
+ MRI.setType(NewReg, OrigTy);
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+ }
+}
+
+unsigned RegisterBankInfo::getSizeInBits(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ if (Reg.isPhysical()) {
+ // The size is not directly available for physical registers.
+ // Instead, we need to access a register class that contains Reg and
+ // get the size of that register class.
+    // Because this is expensive, we'll cache the register class by calling
+    // getMinimalPhysRegClass.
+ auto *RC = getMinimalPhysRegClass(Reg, TRI);
+ assert(RC && "Expecting Register class");
+ return TRI.getRegSizeInBits(*RC);
+ }
+ return TRI.getRegSizeInBits(Reg, MRI);
+}
+
+//------------------------------------------------------------------------------
+// Helper classes implementation.
+//------------------------------------------------------------------------------
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+#endif
+
+bool RegisterBankInfo::PartialMapping::verify(
+ const RegisterBankInfo &RBI) const {
+ assert(RegBank && "Register bank not set");
+ assert(Length && "Empty mapping");
+ assert((StartIdx <= getHighBitIdx()) && "Overflow, switch to APInt?");
+ // Check if the minimum width fits into RegBank.
+ assert(RBI.getMaximumSize(RegBank->getID()) >= Length &&
+ "Register bank too small for Mask");
+ return true;
+}
+
+void RegisterBankInfo::PartialMapping::print(raw_ostream &OS) const {
+ OS << "[" << StartIdx << ", " << getHighBitIdx() << "], RegBank = ";
+ if (RegBank)
+ OS << *RegBank;
+ else
+ OS << "nullptr";
+}
+
+bool RegisterBankInfo::ValueMapping::partsAllUniform() const {
+ if (NumBreakDowns < 2)
+ return true;
+
+ const PartialMapping *First = begin();
+ for (const PartialMapping *Part = First + 1; Part != end(); ++Part) {
+ if (Part->Length != First->Length || Part->RegBank != First->RegBank)
+ return false;
+ }
+
+ return true;
+}
+
+bool RegisterBankInfo::ValueMapping::verify(const RegisterBankInfo &RBI,
+ unsigned MeaningfulBitWidth) const {
+ assert(NumBreakDowns && "Value mapped nowhere?!");
+ unsigned OrigValueBitWidth = 0;
+ for (const RegisterBankInfo::PartialMapping &PartMap : *this) {
+ // Check that each register bank is big enough to hold the partial value:
+ // this check is done by PartialMapping::verify
+ assert(PartMap.verify(RBI) && "Partial mapping is invalid");
+ // The original value should completely be mapped.
+ // Thus the maximum accessed index + 1 is the size of the original value.
+ OrigValueBitWidth =
+ std::max(OrigValueBitWidth, PartMap.getHighBitIdx() + 1);
+ }
+ assert(OrigValueBitWidth >= MeaningfulBitWidth &&
+ "Meaningful bits not covered by the mapping");
+ APInt ValueMask(OrigValueBitWidth, 0);
+ for (const RegisterBankInfo::PartialMapping &PartMap : *this) {
+ // Check that the union of the partial mappings covers the whole value,
+ // without overlaps.
+ // The high bit is exclusive in the APInt API, thus getHighBitIdx + 1.
+ APInt PartMapMask = APInt::getBitsSet(OrigValueBitWidth, PartMap.StartIdx,
+ PartMap.getHighBitIdx() + 1);
+ ValueMask ^= PartMapMask;
+ assert((ValueMask & PartMapMask) == PartMapMask &&
+ "Some partial mappings overlap");
+ }
+ assert(ValueMask.isAllOnes() && "Value is not fully mapped");
+ return true;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+#endif
+
+void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const {
+ OS << "#BreakDown: " << NumBreakDowns << " ";
+ bool IsFirst = true;
+ for (const PartialMapping &PartMap : *this) {
+ if (!IsFirst)
+ OS << ", ";
+ OS << '[' << PartMap << ']';
+ IsFirst = false;
+ }
+}
+
+bool RegisterBankInfo::InstructionMapping::verify(
+ const MachineInstr &MI) const {
+ // Check that all the register operands are properly mapped.
+ // Check the constructor invariant.
+ // For PHI, we only care about mapping the definition.
+ assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) &&
+ "NumOperands must match, see constructor");
+ assert(MI.getParent() && MI.getMF() &&
+ "MI must be connected to a MachineFunction");
+ const MachineFunction &MF = *MI.getMF();
+ const RegisterBankInfo *RBI = MF.getSubtarget().getRegBankInfo();
+ (void)RBI;
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ const MachineOperand &MO = MI.getOperand(Idx);
+ if (!MO.isReg()) {
+ assert(!getOperandMapping(Idx).isValid() &&
+ "We should not care about non-reg mapping");
+ continue;
+ }
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ LLT Ty = MRI.getType(Reg);
+ if (!Ty.isValid())
+ continue;
+ assert(getOperandMapping(Idx).isValid() &&
+ "We must have a mapping for reg operands");
+ const RegisterBankInfo::ValueMapping &MOMapping = getOperandMapping(Idx);
+ (void)MOMapping;
+ // Register size in bits.
+ // This size must match what the mapping expects.
+ assert(MOMapping.verify(*RBI, RBI->getSizeInBits(
+ Reg, MF.getRegInfo(),
+ *MF.getSubtarget().getRegisterInfo())) &&
+ "Value mapping is invalid");
+ }
+ return true;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+#endif
+
+void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const {
+ OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: ";
+
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ const ValueMapping &ValMapping = getOperandMapping(OpIdx);
+ if (OpIdx)
+ OS << ", ";
+ OS << "{ Idx: " << OpIdx << " Map: " << ValMapping << '}';
+ }
+}
+
+const int RegisterBankInfo::OperandsMapper::DontKnowIdx = -1;
+
+RegisterBankInfo::OperandsMapper::OperandsMapper(
+ MachineInstr &MI, const InstructionMapping &InstrMapping,
+ MachineRegisterInfo &MRI)
+ : MRI(MRI), MI(MI), InstrMapping(InstrMapping) {
+ unsigned NumOpds = InstrMapping.getNumOperands();
+ OpToNewVRegIdx.resize(NumOpds, OperandsMapper::DontKnowIdx);
+ assert(InstrMapping.verify(MI) && "Invalid mapping for MI");
+}
+
+iterator_range<SmallVectorImpl<Register>::iterator>
+RegisterBankInfo::OperandsMapper::getVRegsMem(unsigned OpIdx) {
+ assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access");
+ unsigned NumPartialVal =
+ getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns;
+ int StartIdx = OpToNewVRegIdx[OpIdx];
+
+ if (StartIdx == OperandsMapper::DontKnowIdx) {
+ // This is the first time we try to access OpIdx.
+ // Create the cells that will hold all the partial values at the
+ // end of the list of NewVReg.
+ StartIdx = NewVRegs.size();
+ OpToNewVRegIdx[OpIdx] = StartIdx;
+ for (unsigned i = 0; i < NumPartialVal; ++i)
+ NewVRegs.push_back(0);
+ }
+ SmallVectorImpl<Register>::iterator End =
+ getNewVRegsEnd(StartIdx, NumPartialVal);
+
+ return make_range(&NewVRegs[StartIdx], End);
+}
+
+SmallVectorImpl<Register>::const_iterator
+RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx,
+ unsigned NumVal) const {
+ return const_cast<OperandsMapper *>(this)->getNewVRegsEnd(StartIdx, NumVal);
+}
+SmallVectorImpl<Register>::iterator
+RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx,
+ unsigned NumVal) {
+ assert((NewVRegs.size() == StartIdx + NumVal ||
+ NewVRegs.size() > StartIdx + NumVal) &&
+ "NewVRegs too small to contain all the partial mapping");
+ return NewVRegs.size() <= StartIdx + NumVal ? NewVRegs.end()
+ : &NewVRegs[StartIdx + NumVal];
+}
+
+void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) {
+ assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access");
+ iterator_range<SmallVectorImpl<Register>::iterator> NewVRegsForOpIdx =
+ getVRegsMem(OpIdx);
+ const ValueMapping &ValMapping = getInstrMapping().getOperandMapping(OpIdx);
+ const PartialMapping *PartMap = ValMapping.begin();
+ for (Register &NewVReg : NewVRegsForOpIdx) {
+ assert(PartMap != ValMapping.end() && "Out-of-bound access");
+ assert(NewVReg == 0 && "Register has already been created");
+ // The new registers are always bound to scalar with the right size.
+ // The actual type has to be set when the target does the mapping
+ // of the instruction.
+ // The rationale is that this generic code cannot guess how the
+ // target plans to split the input type.
+ NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length));
+ MRI.setRegBank(NewVReg, *PartMap->RegBank);
+ ++PartMap;
+ }
+}
+
+void RegisterBankInfo::OperandsMapper::setVRegs(unsigned OpIdx,
+ unsigned PartialMapIdx,
+ Register NewVReg) {
+ assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access");
+ assert(getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns >
+ PartialMapIdx &&
+ "Out-of-bound access for partial mapping");
+ // Make sure the memory is initialized for that operand.
+ (void)getVRegsMem(OpIdx);
+ assert(NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] == 0 &&
+ "This value is already set");
+ NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] = NewVReg;
+}
+
+iterator_range<SmallVectorImpl<Register>::const_iterator>
+RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx,
+ bool ForDebug) const {
+ (void)ForDebug;
+ assert(OpIdx < getInstrMapping().getNumOperands() && "Out-of-bound access");
+ int StartIdx = OpToNewVRegIdx[OpIdx];
+
+ if (StartIdx == OperandsMapper::DontKnowIdx)
+ return make_range(NewVRegs.end(), NewVRegs.end());
+
+ unsigned PartMapSize =
+ getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns;
+ SmallVectorImpl<Register>::const_iterator End =
+ getNewVRegsEnd(StartIdx, PartMapSize);
+ iterator_range<SmallVectorImpl<Register>::const_iterator> Res =
+ make_range(&NewVRegs[StartIdx], End);
+#ifndef NDEBUG
+ for (Register VReg : Res)
+ assert((VReg || ForDebug) && "Some registers are uninitialized");
+#endif
+ return Res;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const {
+ print(dbgs(), true);
+ dbgs() << '\n';
+}
+#endif
+
+void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS,
+ bool ForDebug) const {
+ unsigned NumOpds = getInstrMapping().getNumOperands();
+ if (ForDebug) {
+ OS << "Mapping for " << getMI() << "\nwith " << getInstrMapping() << '\n';
+ // Print out the internal state of the index table.
+ OS << "Populated indices (CellNumber, IndexInNewVRegs): ";
+ bool IsFirst = true;
+ for (unsigned Idx = 0; Idx != NumOpds; ++Idx) {
+ if (OpToNewVRegIdx[Idx] != DontKnowIdx) {
+ if (!IsFirst)
+ OS << ", ";
+ OS << '(' << Idx << ", " << OpToNewVRegIdx[Idx] << ')';
+ IsFirst = false;
+ }
+ }
+ OS << '\n';
+ } else
+ OS << "Mapping ID: " << getInstrMapping().getID() << ' ';
+
+ OS << "Operand Mapping: ";
+ // If we have a function, we can pretty print the name of the registers.
+ // Otherwise we will print the raw numbers.
+ const TargetRegisterInfo *TRI =
+ getMI().getParent() && getMI().getMF()
+ ? getMI().getMF()->getSubtarget().getRegisterInfo()
+ : nullptr;
+ bool IsFirst = true;
+ for (unsigned Idx = 0; Idx != NumOpds; ++Idx) {
+ if (OpToNewVRegIdx[Idx] == DontKnowIdx)
+ continue;
+ if (!IsFirst)
+ OS << ", ";
+ IsFirst = false;
+ OS << '(' << printReg(getMI().getOperand(Idx).getReg(), TRI) << ", [";
+ bool IsFirstNewVReg = true;
+ for (Register VReg : getVRegs(Idx)) {
+ if (!IsFirstNewVReg)
+ OS << ", ";
+ IsFirstNewVReg = false;
+ OS << printReg(VReg, TRI);
+ }
+ OS << "])";
+ }
+}
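
The disjoint-coverage check in ValueMapping::verify() earlier in this file XOR-accumulates one bit mask per partial mapping: disjoint parts flip disjoint bits, so the accumulated mask ends up all-ones exactly when the parts tile the value with no overlap. Here is the same check on a fixed 64-bit value, standalone and without APInt; the two-part split is an arbitrary example.

// The same disjoint-coverage check as ValueMapping::verify, on a 64-bit value.
#include <cassert>
#include <cstdint>

// Bits [StartIdx, StartIdx + Length) set, everything else clear.
static uint64_t partMask(unsigned StartIdx, unsigned Length) {
  uint64_t M = (Length == 64) ? ~uint64_t(0) : ((uint64_t(1) << Length) - 1);
  return M << StartIdx;
}

int main() {
  // Two partial mappings splitting a 64-bit value into two 32-bit halves.
  struct { unsigned StartIdx, Length; } Parts[] = {{0, 32}, {32, 32}};

  uint64_t ValueMask = 0;
  for (auto &P : Parts) {
    uint64_t PartMapMask = partMask(P.StartIdx, P.Length);
    ValueMask ^= PartMapMask;
    // If this fails, an earlier part already claimed one of these bits.
    assert((ValueMask & PartMapMask) == PartMapMask && "parts overlap");
  }
  // All-ones: every bit of the original value is mapped exactly once.
  assert(ValueMask == ~uint64_t(0) && "value not fully mapped");
  return 0;
}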
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterClassInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterClassInfo.cpp
new file mode 100644
index 000000000000..fba8c35ecec2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -0,0 +1,236 @@
+//===- RegisterClassInfo.cpp - Dynamic Register Class Info ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterClassInfo class which provides dynamic
+// information about target register classes. Callee-saved vs. caller-saved and
+// reserved registers depend on calling conventions and other dynamic
+// information, so some things cannot be determined statically.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+static cl::opt<unsigned>
+StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
+ cl::desc("Limit all regclasses to N registers"));
+
+RegisterClassInfo::RegisterClassInfo() = default;
+
+void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
+ bool Update = false;
+ MF = &mf;
+
+ auto &STI = MF->getSubtarget();
+
+ // Allocate new array the first time we see a new target.
+ if (STI.getRegisterInfo() != TRI) {
+ TRI = STI.getRegisterInfo();
+ RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
+ Update = true;
+ }
+
+ // Test if CSRs have changed from the previous function.
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const MCPhysReg *CSR = MRI.getCalleeSavedRegs();
+ bool CSRChanged = true;
+ if (!Update) {
+ CSRChanged = false;
+ size_t LastSize = LastCalleeSavedRegs.size();
+ for (unsigned I = 0;; ++I) {
+ if (CSR[I] == 0) {
+ CSRChanged = I != LastSize;
+ break;
+ }
+ if (I >= LastSize) {
+ CSRChanged = true;
+ break;
+ }
+ if (CSR[I] != LastCalleeSavedRegs[I]) {
+ CSRChanged = true;
+ break;
+ }
+ }
+ }
+
+ // Get the callee saved registers.
+ if (CSRChanged) {
+ LastCalleeSavedRegs.clear();
+ // Build a CSRAlias map. Every CSR alias saves the last
+ // overlapping CSR.
+ CalleeSavedAliases.assign(TRI->getNumRegs(), 0);
+ for (const MCPhysReg *I = CSR; *I; ++I) {
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI)
+ CalleeSavedAliases[*AI] = *I;
+ LastCalleeSavedRegs.push_back(*I);
+ }
+
+ Update = true;
+ }
+
+  // Even if the CSR list is the same, we could have had a different allocation
+  // order if ignoreCSRForAllocationOrder is evaluated differently.
+ BitVector CSRHintsForAllocOrder(TRI->getNumRegs());
+ for (const MCPhysReg *I = CSR; *I; ++I)
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI)
+ CSRHintsForAllocOrder[*AI] = STI.ignoreCSRForAllocationOrder(mf, *AI);
+ if (IgnoreCSRForAllocOrder.size() != CSRHintsForAllocOrder.size() ||
+ IgnoreCSRForAllocOrder != CSRHintsForAllocOrder) {
+ Update = true;
+ IgnoreCSRForAllocOrder = CSRHintsForAllocOrder;
+ }
+
+ RegCosts = TRI->getRegisterCosts(*MF);
+
+ // Different reserved registers?
+ const BitVector &RR = MF->getRegInfo().getReservedRegs();
+ if (Reserved.size() != RR.size() || RR != Reserved) {
+ Update = true;
+ Reserved = RR;
+ }
+
+ // Invalidate cached information from previous function.
+ if (Update) {
+ unsigned NumPSets = TRI->getNumRegPressureSets();
+ PSetLimits.reset(new unsigned[NumPSets]);
+ std::fill(&PSetLimits[0], &PSetLimits[NumPSets], 0);
+ ++Tag;
+ }
+}
+
+/// compute - Compute the preferred allocation order for RC with reserved
+/// registers filtered out. Volatile registers come first followed by CSR
+/// aliases ordered according to the CSR order specified by the target.
+void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
+ assert(RC && "no register class given");
+ RCInfo &RCI = RegClass[RC->getID()];
+ auto &STI = MF->getSubtarget();
+
+ // Raw register count, including all reserved regs.
+ unsigned NumRegs = RC->getNumRegs();
+
+ if (!RCI.Order)
+ RCI.Order.reset(new MCPhysReg[NumRegs]);
+
+ unsigned N = 0;
+ SmallVector<MCPhysReg, 16> CSRAlias;
+ uint8_t MinCost = uint8_t(~0u);
+ uint8_t LastCost = uint8_t(~0u);
+ unsigned LastCostChange = 0;
+
+ // FIXME: Once targets reserve registers instead of removing them from the
+ // allocation order, we can simply use begin/end here.
+ ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF);
+ for (unsigned PhysReg : RawOrder) {
+ // Remove reserved registers from the allocation order.
+ if (Reserved.test(PhysReg))
+ continue;
+ uint8_t Cost = RegCosts[PhysReg];
+ MinCost = std::min(MinCost, Cost);
+
+ if (CalleeSavedAliases[PhysReg] &&
+ !STI.ignoreCSRForAllocationOrder(*MF, PhysReg))
+ // PhysReg aliases a CSR, save it for later.
+ CSRAlias.push_back(PhysReg);
+ else {
+ if (Cost != LastCost)
+ LastCostChange = N;
+ RCI.Order[N++] = PhysReg;
+ LastCost = Cost;
+ }
+ }
+ RCI.NumRegs = N + CSRAlias.size();
+ assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
+
+ // CSR aliases go after the volatile registers, preserve the target's order.
+ for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) {
+ unsigned PhysReg = CSRAlias[i];
+ uint8_t Cost = RegCosts[PhysReg];
+ if (Cost != LastCost)
+ LastCostChange = N;
+ RCI.Order[N++] = PhysReg;
+ LastCost = Cost;
+ }
+
+ // Register allocator stress test. Clip register class to N registers.
+ if (StressRA && RCI.NumRegs > StressRA)
+ RCI.NumRegs = StressRA;
+
+ // Check if RC is a proper sub-class.
+ if (const TargetRegisterClass *Super =
+ TRI->getLargestLegalSuperClass(RC, *MF))
+ if (Super != RC && getNumAllocatableRegs(Super) > RCI.NumRegs)
+ RCI.ProperSubClass = true;
+
+ RCI.MinCost = MinCost;
+ RCI.LastCostChange = LastCostChange;
+
+ LLVM_DEBUG({
+ dbgs() << "AllocationOrder(" << TRI->getRegClassName(RC) << ") = [";
+ for (unsigned I = 0; I != RCI.NumRegs; ++I)
+ dbgs() << ' ' << printReg(RCI.Order[I], TRI);
+ dbgs() << (RCI.ProperSubClass ? " ] (sub-class)\n" : " ]\n");
+ });
+
+ // RCI is now up-to-date.
+ RCI.Tag = Tag;
+}
+
+/// This is not accurate because two overlapping register sets may have some
+/// nonoverlapping reserved registers. However, computing the allocation order
+/// for all register classes would be too expensive.
+unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
+ const TargetRegisterClass *RC = nullptr;
+ unsigned NumRCUnits = 0;
+ for (const TargetRegisterClass *C : TRI->regclasses()) {
+ const int *PSetID = TRI->getRegClassPressureSets(C);
+ for (; *PSetID != -1; ++PSetID) {
+ if ((unsigned)*PSetID == Idx)
+ break;
+ }
+ if (*PSetID == -1)
+ continue;
+
+ // Found a register class that counts against this pressure set.
+ // For efficiency, only compute the set order for the largest set.
+ unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit;
+ if (!RC || NUnits > NumRCUnits) {
+ RC = C;
+ NumRCUnits = NUnits;
+ }
+ }
+ assert(RC && "Failed to find register class");
+ compute(RC);
+ unsigned NAllocatableRegs = getNumAllocatableRegs(RC);
+ unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx);
+ // If all the regs are reserved, return raw RegPressureSetLimit.
+ // One example is VRSAVERC in PowerPC.
+  // Avoid returning zero; getRegPressureSetLimit(Idx) assumes computePSetLimit
+  // returns a non-zero value.
+ if (NAllocatableRegs == 0)
+ return RegPressureSetLimit;
+ unsigned NReserved = RC->getNumRegs() - NAllocatableRegs;
+ return RegPressureSetLimit - TRI->getRegClassWeight(RC).RegWeight * NReserved;
+}
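
The tail of computePSetLimit() is plain arithmetic: the target's raw pressure-set limit is reduced by the register-class weight contributed by the reserved registers that were filtered out of the allocation order. A worked example with invented numbers:

// Invented numbers: a pressure set with a raw limit of 32 units, a register
// class of 32 registers each weighing 1 unit, of which 4 are reserved.
#include <iostream>

int main() {
  unsigned RegPressureSetLimit = 32; // TRI->getRegPressureSetLimit(MF, Idx)
  unsigned NumRegs = 32;             // RC->getNumRegs()
  unsigned NAllocatableRegs = 28;    // after filtering reserved registers
  unsigned RegWeight = 1;            // TRI->getRegClassWeight(RC).RegWeight

  unsigned NReserved = NumRegs - NAllocatableRegs;              // 4
  unsigned Limit = RegPressureSetLimit - RegWeight * NReserved; // 28
  std::cout << "effective pressure-set limit: " << Limit << "\n";
  return 0;
}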
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp
new file mode 100644
index 000000000000..e49885b6ad96
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -0,0 +1,4220 @@
+//===- RegisterCoalescer.cpp - Generic Register Coalescing Interface ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the generic RegisterCoalescer interface which
+// is used as the common interface used by all clients and
+// implementations of register coalescing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(numJoins , "Number of interval joins performed");
+STATISTIC(numCrossRCs , "Number of cross class joins performed");
+STATISTIC(numCommutes , "Number of instruction commuting performed");
+STATISTIC(numExtends , "Number of copies extended");
+STATISTIC(NumReMats , "Number of instructions re-materialized");
+STATISTIC(NumInflated , "Number of register classes inflated");
+STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
+STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved");
+STATISTIC(NumShrinkToUses, "Number of shrinkToUses called");
+
+static cl::opt<bool> EnableJoining("join-liveintervals",
+ cl::desc("Coalesce copies (default=true)"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> UseTerminalRule("terminal-rule",
+ cl::desc("Apply the terminal rule"),
+ cl::init(false), cl::Hidden);
+
+/// Temporary flag to test critical edge unsplitting.
+static cl::opt<bool>
+EnableJoinSplits("join-splitedges",
+ cl::desc("Coalesce copies on split edges (default=subtarget)"), cl::Hidden);
+
+/// Temporary flag to test global copy optimization.
+static cl::opt<cl::boolOrDefault>
+EnableGlobalCopies("join-globalcopies",
+ cl::desc("Coalesce copies that span blocks (default=subtarget)"),
+ cl::init(cl::BOU_UNSET), cl::Hidden);
+
+static cl::opt<bool>
+VerifyCoalescing("verify-coalescing",
+ cl::desc("Verify machine instrs before and after register coalescing"),
+ cl::Hidden);
+
+static cl::opt<unsigned> LateRematUpdateThreshold(
+ "late-remat-update-threshold", cl::Hidden,
+ cl::desc("During rematerialization for a copy, if the def instruction has "
+ "many other copy uses to be rematerialized, delay the multiple "
+ "separate live interval update work and do them all at once after "
+ "all those rematerialization are done. It will save a lot of "
+ "repeated work. "),
+ cl::init(100));
+
+static cl::opt<unsigned> LargeIntervalSizeThreshold(
+ "large-interval-size-threshold", cl::Hidden,
+ cl::desc("If the valnos size of an interval is larger than the threshold, "
+ "it is regarded as a large interval. "),
+ cl::init(100));
+
+static cl::opt<unsigned> LargeIntervalFreqThreshold(
+ "large-interval-freq-threshold", cl::Hidden,
+ cl::desc("For a large interval, if it is coalesed with other live "
+ "intervals many times more than the threshold, stop its "
+ "coalescing to control the compile time. "),
+ cl::init(256));
+
+namespace {
+
+ class JoinVals;
+
+ class RegisterCoalescer : public MachineFunctionPass,
+ private LiveRangeEdit::Delegate {
+ MachineFunction* MF = nullptr;
+ MachineRegisterInfo* MRI = nullptr;
+ const TargetRegisterInfo* TRI = nullptr;
+ const TargetInstrInfo* TII = nullptr;
+ LiveIntervals *LIS = nullptr;
+ const MachineLoopInfo* Loops = nullptr;
+ AliasAnalysis *AA = nullptr;
+ RegisterClassInfo RegClassInfo;
+
+ /// Position and VReg of a PHI instruction during coalescing.
+ struct PHIValPos {
+ SlotIndex SI; ///< Slot where this PHI occurs.
+ Register Reg; ///< VReg the PHI occurs in.
+ unsigned SubReg; ///< Qualifying subregister for Reg.
+ };
+
+ /// Map from debug instruction number to PHI position during coalescing.
+ DenseMap<unsigned, PHIValPos> PHIValToPos;
+ /// For each VReg, an index of which debug instruction numbers and
+ /// corresponding PHIs are sensitive to coalescing. Each VReg may have
+ /// multiple PHI defs, at different positions.
+ DenseMap<Register, SmallVector<unsigned, 2>> RegToPHIIdx;
+
+ /// Debug variable location tracking -- for each VReg, maintain an
+ /// ordered-by-slot-index set of DBG_VALUEs, to help quickly identify
+ /// whether coalescing may change location validity.
+ using DbgValueLoc = std::pair<SlotIndex, MachineInstr*>;
+ DenseMap<Register, std::vector<DbgValueLoc>> DbgVRegToValues;
+
+ /// A LaneMask to remember on which subregister live ranges we need to call
+ /// shrinkToUses() later.
+ LaneBitmask ShrinkMask;
+
+ /// True if the main range of the currently coalesced intervals should be
+ /// checked for smaller live intervals.
+ bool ShrinkMainRange = false;
+
+ /// True if the coalescer should aggressively coalesce global copies
+ /// in favor of keeping local copies.
+ bool JoinGlobalCopies = false;
+
+ /// True if the coalescer should aggressively coalesce fall-thru
+ /// blocks exclusively containing copies.
+ bool JoinSplitEdges = false;
+
+ /// Copy instructions yet to be coalesced.
+ SmallVector<MachineInstr*, 8> WorkList;
+ SmallVector<MachineInstr*, 8> LocalWorkList;
+
+ /// Set of instruction pointers that have been erased, and
+ /// that may be present in WorkList.
+ SmallPtrSet<MachineInstr*, 8> ErasedInstrs;
+
+ /// Dead instructions that are about to be deleted.
+ SmallVector<MachineInstr*, 8> DeadDefs;
+
+ /// Virtual registers to be considered for register class inflation.
+ SmallVector<Register, 8> InflateRegs;
+
+ /// The collection of live intervals which should have been updated
+ /// immediately after rematerialization but delayed until
+ /// lateLiveIntervalUpdate is called.
+ DenseSet<Register> ToBeUpdated;
+
+ /// Record how many times we have tried to join a large live interval
+ /// (one with many valnos) with other live intervals.
+ DenseMap<Register, unsigned long> LargeLIVisitCounter;
+
+ /// Recursively eliminate dead defs in DeadDefs.
+ void eliminateDeadDefs(LiveRangeEdit *Edit = nullptr);
+
+ /// LiveRangeEdit callback for eliminateDeadDefs().
+ void LRE_WillEraseInstruction(MachineInstr *MI) override;
+
+ /// Coalesce the LocalWorkList.
+ void coalesceLocals();
+
+ /// Join compatible live intervals
+ void joinAllIntervals();
+
+ /// Coalesce copies in the specified MBB, putting
+ /// copies that cannot yet be coalesced into WorkList.
+ void copyCoalesceInMBB(MachineBasicBlock *MBB);
+
+ /// Tries to coalesce all copies in CurrList. Returns true if any progress
+ /// was made.
+ bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);
+
+ /// If one def has many copy-like uses, and those copy uses are all
+ /// rematerialized, the live interval updates needed for those
+ /// rematerializations are delayed and done all at once instead
+ /// of being done multiple times. This saves compile time because
+ /// live interval updates are costly.
+ void lateLiveIntervalUpdate();
+
+ /// Check if the incoming value defined by a COPY at \p SLRQ in the subrange
+ /// has no value defined in the predecessors. If the incoming value is the
+ /// same as defined by the copy itself, the value is considered undefined.
+ bool copyValueUndefInPredecessors(LiveRange &S,
+ const MachineBasicBlock *MBB,
+ LiveQueryResult SLRQ);
+
+ /// Set necessary undef flags on subregister uses after pruning out undef
+ /// lane segments from the subrange.
+ void setUndefOnPrunedSubRegUses(LiveInterval &LI, Register Reg,
+ LaneBitmask PrunedLanes);
+
+ /// Attempt to join intervals corresponding to SrcReg/DstReg, which are the
+ /// src/dst of the copy instruction CopyMI. This returns true if the copy
+ /// was successfully coalesced away. If it is not currently possible to
+ /// coalesce this interval, but it may be possible if other things get
+ /// coalesced, then it returns true by reference in 'Again'.
+ bool joinCopy(MachineInstr *CopyMI, bool &Again);
+
+ /// Attempt to join these two intervals. On failure, this
+ /// returns false. The output "SrcInt" will not have been modified, so we
+ /// can use this information below to update aliases.
+ bool joinIntervals(CoalescerPair &CP);
+
+ /// Attempt joining two virtual registers. Return true on success.
+ bool joinVirtRegs(CoalescerPair &CP);
+
+ /// If a live interval has many valnos and is coalesced with other
+ /// live intervals many times, we regard such a live interval as having
+ /// a high compile-time cost.
+ bool isHighCostLiveInterval(LiveInterval &LI);
+
+ /// Attempt joining with a reserved physreg.
+ bool joinReservedPhysReg(CoalescerPair &CP);
+
+ /// Add the LiveRange @p ToMerge as a subregister liverange of @p LI.
+ /// Subranges in @p LI which only partially interfere with the desired
+ /// LaneMask are split as necessary. @p LaneMask are the lanes that
+ /// @p ToMerge will occupy in the coalesced register. @p LI has its subrange
+ /// lanemasks already adjusted to the coalesced register.
+ void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge,
+ LaneBitmask LaneMask, CoalescerPair &CP,
+ unsigned DstIdx);
+
+ /// Join the liveranges of two subregisters. Joins @p RRange into
+ /// @p LRange, @p RRange may be invalid afterwards.
+ void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+ LaneBitmask LaneMask, const CoalescerPair &CP);
+
+ /// We found a non-trivially-coalescable copy. If the source value number is
+ /// defined by a copy from the destination reg see if we can merge these two
+ /// destination reg valno# into a single value number, eliminating a copy.
+ /// This returns true if an interval was modified.
+ bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
+
+ /// Return true if there are definitions of IntB
+ /// other than BValNo val# that can reach uses of AValno val# of IntA.
+ bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
+ VNInfo *AValNo, VNInfo *BValNo);
+
+ /// We found a non-trivially-coalescable copy.
+ /// If the source value number is defined by a commutable instruction and
+ /// its other operand is coalesced to the copy dest register, see if we
+ /// can transform the copy into a noop by commuting the definition.
+ /// This returns a pair of two flags:
+ /// - the first element is true if an interval was modified,
+ /// - the second element is true if the destination interval needs
+ /// to be shrunk after deleting the copy.
+ std::pair<bool,bool> removeCopyByCommutingDef(const CoalescerPair &CP,
+ MachineInstr *CopyMI);
+
+ /// We found a copy which can be moved to its less frequent predecessor.
+ bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI);
+
+ /// If the source of a copy is defined by a
+ /// trivial computation, replace the copy by rematerialize the definition.
+ bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI,
+ bool &IsDefCopy);
+
+ /// Return true if a copy involving a physreg should be joined.
+ bool canJoinPhys(const CoalescerPair &CP);
+
+ /// Replace all defs and uses of SrcReg to DstReg and update the subregister
+ /// number if it is not zero. If DstReg is a physical register and the
+ /// existing subregister number of the def / use being updated is not zero,
+ /// make sure to set it to the correct physical subregister.
+ void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx);
+
+ /// If the given machine operand reads only undefined lanes, add an undef
+ /// flag.
+ /// This can happen when undef uses were previously concealed by a copy
+ /// which we coalesced. Example:
+ /// %0:sub0<def,read-undef> = ...
+ /// %1 = COPY %0 <-- Coalescing COPY reveals undef
+ /// = use %1:sub1 <-- hidden undef use
+ void addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
+ MachineOperand &MO, unsigned SubRegIdx);
+
+ /// Handle copies of undef values. If the undef value is an incoming
+ /// PHI value, it will convert @p CopyMI to an IMPLICIT_DEF.
+ /// Returns nullptr if @p CopyMI was not in any way eliminable. Otherwise,
+ /// it returns @p CopyMI (which could be an IMPLICIT_DEF at this point).
+ MachineInstr *eliminateUndefCopy(MachineInstr *CopyMI);
+
+ /// Check whether or not we should apply the terminal rule on the
+ /// destination (Dst) of \p Copy.
+ /// When the terminal rule applies, Copy is not profitable to
+ /// coalesce.
+ /// Dst is terminal if it has exactly one affinity (Dst, Src) and
+ /// at least one interference (Dst, Dst2). If Dst is terminal, the
+ /// terminal rule consists in checking that at least one of the
+ /// interfering nodes, say Dst2, has an affinity of equal or greater
+ /// weight with Src.
+ /// In that case, Dst2 and Dst will not both be able to be coalesced
+ /// with Src. Since Dst2 exposes more coalescing opportunities than
+ /// Dst, we can drop \p Copy.
+ bool applyTerminalRule(const MachineInstr &Copy) const;
+
+ /// Wrapper method for \see LiveIntervals::shrinkToUses.
+ /// This method does the proper fixing of the live-ranges when the
+ /// aforementioned method returns true.
+ void shrinkToUses(LiveInterval *LI,
+ SmallVectorImpl<MachineInstr * > *Dead = nullptr) {
+ NumShrinkToUses++;
+ if (LIS->shrinkToUses(LI, Dead)) {
+ /// Check whether or not \p LI is composed of multiple connected
+ /// components and if that is the case, fix that.
+ SmallVector<LiveInterval*, 8> SplitLIs;
+ LIS->splitSeparateComponents(*LI, SplitLIs);
+ }
+ }
+
+ /// Wrapper method to do all the necessary work when an instruction is
+ /// deleted.
+ /// Optimizations should use this to make sure that deleted instructions
+ /// are always accounted for.
+ void deleteInstr(MachineInstr* MI) {
+ ErasedInstrs.insert(MI);
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+
+ /// Walk over function and initialize the DbgVRegToValues map.
+ void buildVRegToDbgValueMap(MachineFunction &MF);
+
+ /// Test whether, after merging, any DBG_VALUEs would refer to a
+ /// different value number than before merging, and whether this can
+ /// be resolved. If not, mark the DBG_VALUE as being undef.
+ void checkMergingChangesDbgValues(CoalescerPair &CP, LiveRange &LHS,
+ JoinVals &LHSVals, LiveRange &RHS,
+ JoinVals &RHSVals);
+
+ void checkMergingChangesDbgValuesImpl(Register Reg, LiveRange &OtherRange,
+ LiveRange &RegRange, JoinVals &Vals2);
+
+ public:
+ static char ID; ///< Class identification, replacement for typeinfo
+
+ RegisterCoalescer() : MachineFunctionPass(ID) {
+ initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ void releaseMemory() override;
+
+ /// This is the pass entry point.
+ bool runOnMachineFunction(MachineFunction&) override;
+
+ /// Implement the dump method.
+ void print(raw_ostream &O, const Module* = nullptr) const override;
+ };
+
+} // end anonymous namespace
+
+char RegisterCoalescer::ID = 0;
+
+char &llvm::RegisterCoalescerID = RegisterCoalescer::ID;
+
+INITIALIZE_PASS_BEGIN(RegisterCoalescer, "register-coalescer",
+ "Register Coalescer", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(RegisterCoalescer, "register-coalescer",
+ "Register Coalescer", false, false)
+
+[[nodiscard]] static bool isMoveInstr(const TargetRegisterInfo &tri,
+ const MachineInstr *MI, Register &Src,
+ Register &Dst, unsigned &SrcSub,
+ unsigned &DstSub) {
+ if (MI->isCopy()) {
+ Dst = MI->getOperand(0).getReg();
+ DstSub = MI->getOperand(0).getSubReg();
+ Src = MI->getOperand(1).getReg();
+ SrcSub = MI->getOperand(1).getSubReg();
+ } else if (MI->isSubregToReg()) {
+ Dst = MI->getOperand(0).getReg();
+ DstSub = tri.composeSubRegIndices(MI->getOperand(0).getSubReg(),
+ MI->getOperand(3).getImm());
+ Src = MI->getOperand(2).getReg();
+ SrcSub = MI->getOperand(2).getSubReg();
+ } else
+ return false;
+ return true;
+}
+
+/// Return true if this block should be vacated by the coalescer to eliminate
+/// branches. The important cases to handle in the coalescer are critical edges
+/// split during phi elimination which contain only copies. Simple blocks that
+/// contain non-branches should also be vacated, but this can be handled by an
+/// earlier pass similar to early if-conversion.
+static bool isSplitEdge(const MachineBasicBlock *MBB) {
+ if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+ return false;
+
+ for (const auto &MI : *MBB) {
+ if (!MI.isCopyLike() && !MI.isUnconditionalBranch())
+ return false;
+ }
+ return true;
+}
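
A self-contained toy version of the predicate above, for readers who want the shape of the check without the MachineBasicBlock API (the Toy* types are hypothetical):

#include <vector>

struct ToyInstr {
  bool IsCopyLike;      // COPY or copy-like pseudo
  bool IsUncondBranch;  // unconditional branch
};

struct ToyBlock {
  unsigned NumPreds = 0, NumSuccs = 0;
  std::vector<ToyInstr> Instrs;
};

// A block is a vacatable split edge if it has a single predecessor, a single
// successor, and contains nothing but copies and unconditional branches.
static bool isSplitEdgeToy(const ToyBlock &B) {
  if (B.NumPreds != 1 || B.NumSuccs != 1)
    return false;
  for (const ToyInstr &I : B.Instrs)
    if (!I.IsCopyLike && !I.IsUncondBranch)
      return false;
  return true;
}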
+
+bool CoalescerPair::setRegisters(const MachineInstr *MI) {
+ SrcReg = DstReg = Register();
+ SrcIdx = DstIdx = 0;
+ NewRC = nullptr;
+ Flipped = CrossClass = false;
+
+ Register Src, Dst;
+ unsigned SrcSub = 0, DstSub = 0;
+ if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
+ return false;
+ Partial = SrcSub || DstSub;
+
+ // If one register is a physreg, it must be Dst.
+ if (Src.isPhysical()) {
+ if (Dst.isPhysical())
+ return false;
+ std::swap(Src, Dst);
+ std::swap(SrcSub, DstSub);
+ Flipped = true;
+ }
+
+ const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+
+ if (Dst.isPhysical()) {
+ // Eliminate DstSub on a physreg.
+ if (DstSub) {
+ Dst = TRI.getSubReg(Dst, DstSub);
+ if (!Dst) return false;
+ DstSub = 0;
+ }
+
+ // Eliminate SrcSub by picking a corresponding Dst superregister.
+ if (SrcSub) {
+ Dst = TRI.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src));
+ if (!Dst) return false;
+ } else if (!MRI.getRegClass(Src)->contains(Dst)) {
+ return false;
+ }
+ } else {
+ // Both registers are virtual.
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+
+ // Both registers have subreg indices.
+ if (SrcSub && DstSub) {
+ // Copies between different sub-registers are never coalescable.
+ if (Src == Dst && SrcSub != DstSub)
+ return false;
+
+ NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub,
+ SrcIdx, DstIdx);
+ if (!NewRC)
+ return false;
+ } else if (DstSub) {
+ // SrcReg will be merged with a sub-register of DstReg.
+ SrcIdx = DstSub;
+ NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ } else if (SrcSub) {
+ // DstReg will be merged with a sub-register of SrcReg.
+ DstIdx = SrcSub;
+ NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub);
+ } else {
+ // This is a straight copy without sub-registers.
+ NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
+ }
+
+ // The combined constraint may be impossible to satisfy.
+ if (!NewRC)
+ return false;
+
+ // Prefer SrcReg to be a sub-register of DstReg.
+ // FIXME: Coalescer should support subregs symmetrically.
+ if (DstIdx && !SrcIdx) {
+ std::swap(Src, Dst);
+ std::swap(SrcIdx, DstIdx);
+ Flipped = !Flipped;
+ }
+
+ CrossClass = NewRC != DstRC || NewRC != SrcRC;
+ }
+ // Check our invariants
+ assert(Src.isVirtual() && "Src must be virtual");
+ assert(!(Dst.isPhysical() && DstSub) && "Cannot have a physical SubIdx");
+ SrcReg = Src;
+ DstReg = Dst;
+ return true;
+}
+
+bool CoalescerPair::flip() {
+ if (DstReg.isPhysical())
+ return false;
+ std::swap(SrcReg, DstReg);
+ std::swap(SrcIdx, DstIdx);
+ Flipped = !Flipped;
+ return true;
+}
+
+bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
+ Register Src, Dst;
+ unsigned SrcSub = 0, DstSub = 0;
+ if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
+ return false;
+
+ // Find the virtual register that is SrcReg.
+ if (Dst == SrcReg) {
+ std::swap(Src, Dst);
+ std::swap(SrcSub, DstSub);
+ } else if (Src != SrcReg) {
+ return false;
+ }
+
+ // Now check that Dst matches DstReg.
+ if (DstReg.isPhysical()) {
+ if (!Dst.isPhysical())
+ return false;
+ assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state.");
+ // DstSub could be set for a physreg from INSERT_SUBREG.
+ if (DstSub)
+ Dst = TRI.getSubReg(Dst, DstSub);
+ // Full copy of Src.
+ if (!SrcSub)
+ return DstReg == Dst;
+ // This is a partial register copy. Check that the parts match.
+ return Register(TRI.getSubReg(DstReg, SrcSub)) == Dst;
+ } else {
+ // DstReg is virtual.
+ if (DstReg != Dst)
+ return false;
+ // Registers match, do the subregisters line up?
+ return TRI.composeSubRegIndices(SrcIdx, SrcSub) ==
+ TRI.composeSubRegIndices(DstIdx, DstSub);
+ }
+}
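
For orientation, a sketch of how a client typically drives CoalescerPair for a single instruction. It assumes the declarations from the private RegisterCoalescer.h header that lives next to this file (not in include/llvm); the classifyCopy helper itself is hypothetical.

#include "RegisterCoalescer.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

// Returns true if MI is a copy-like instruction the coalescer could consider,
// reporting the resolved source and destination registers.
static bool classifyCopy(const llvm::TargetRegisterInfo &TRI,
                         const llvm::MachineInstr &MI,
                         llvm::Register &SrcOut, llvm::Register &DstOut) {
  llvm::CoalescerPair CP(TRI);
  if (!CP.setRegisters(&MI))
    return false;               // not a COPY/SUBREG_TO_REG we can handle
  // After setRegisters(), SrcReg is always virtual; DstReg may be physical,
  // and the pair may have been flipped to enforce that invariant.
  SrcOut = CP.getSrcReg();
  DstOut = CP.getDstReg();
  return true;
}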
+
+void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void RegisterCoalescer::eliminateDeadDefs(LiveRangeEdit *Edit) {
+ if (Edit) {
+ Edit->eliminateDeadDefs(DeadDefs);
+ return;
+ }
+ SmallVector<Register, 8> NewRegs;
+ LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
+ nullptr, this).eliminateDeadDefs(DeadDefs);
+}
+
+void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
+ // MI may be in WorkList. Make sure we don't visit it.
+ ErasedInstrs.insert(MI);
+}
+
+bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
+ MachineInstr *CopyMI) {
+ assert(!CP.isPartial() && "This doesn't work for partial copies.");
+ assert(!CP.isPhys() && "This doesn't work for physreg copies.");
+
+ LiveInterval &IntA =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LiveInterval &IntB =
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+
+ // We have a non-trivially-coalescable copy with IntA being the source and
+ // IntB being the dest, thus this defines a value number in IntB. If the
+ // source value number (in IntA) is defined by a copy from B, see if we can
+ // merge these two pieces of B into a single value number, eliminating a copy.
+ // For example:
+ //
+ // A3 = B0
+ // ...
+ // B1 = A3 <- this copy
+ //
+ // In this case, B0 can be extended to where the B1 copy lives, allowing the
+ // B1 value number to be replaced with B0 (which simplifies the B
+ // liveinterval).
+
+ // BValNo is a value number in B that is defined by a copy from A. 'B1' in
+ // the example above.
+ LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
+ if (BS == IntB.end()) return false;
+ VNInfo *BValNo = BS->valno;
+
+ // Get the location that B is defined at. Two options: either this value has
+ // an unknown definition point or it is defined at CopyIdx. If unknown, we
+ // can't process it.
+ if (BValNo->def != CopyIdx) return false;
+
+ // AValNo is the value number in A that defines the copy, A3 in the example.
+ SlotIndex CopyUseIdx = CopyIdx.getRegSlot(true);
+ LiveInterval::iterator AS = IntA.FindSegmentContaining(CopyUseIdx);
+ // The live segment might not exist after fun with physreg coalescing.
+ if (AS == IntA.end()) return false;
+ VNInfo *AValNo = AS->valno;
+
+ // If AValNo is defined as a copy from IntB, we can potentially process this.
+ // Get the instruction that defines this value number.
+ MachineInstr *ACopyMI = LIS->getInstructionFromIndex(AValNo->def);
+ // Don't allow any partial copies, even if isCoalescable() allows them.
+ if (!CP.isCoalescable(ACopyMI) || !ACopyMI->isFullCopy())
+ return false;
+
+ // Get the Segment in IntB that this value number starts with.
+ LiveInterval::iterator ValS =
+ IntB.FindSegmentContaining(AValNo->def.getPrevSlot());
+ if (ValS == IntB.end())
+ return false;
+
+ // Make sure that the end of the live segment is inside the same block as
+ // CopyMI.
+ MachineInstr *ValSEndInst =
+ LIS->getInstructionFromIndex(ValS->end.getPrevSlot());
+ if (!ValSEndInst || ValSEndInst->getParent() != CopyMI->getParent())
+ return false;
+
+ // Okay, we now know that ValS ends in the same block that the CopyMI
+ // live-range starts. If there are no intervening live segments between them
+ // in IntB, we can merge them.
+ if (ValS+1 != BS) return false;
+
+ LLVM_DEBUG(dbgs() << "Extending: " << printReg(IntB.reg(), TRI));
+
+ SlotIndex FillerStart = ValS->end, FillerEnd = BS->start;
+ // We are about to delete CopyMI, so we need to remove it as the 'instruction
+ // that defines this value #'. Update the valnum with the new defining
+ // instruction #.
+ BValNo->def = FillerStart;
+
+ // Okay, we can merge them. We need to insert a new liverange:
+ // [ValS.end, BS.begin) of either value number, then we merge the
+ // two value numbers.
+ IntB.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, BValNo));
+
+ // Okay, merge "B1" into the same value number as "B0".
+ if (BValNo != ValS->valno)
+ IntB.MergeValueNumberInto(BValNo, ValS->valno);
+
+ // Do the same for the subregister segments.
+ for (LiveInterval::SubRange &S : IntB.subranges()) {
+ // Check for SubRange Segments of the form [1234r,1234d:0) which can be
+ // removed to prevent creating bogus SubRange Segments.
+ LiveInterval::iterator SS = S.FindSegmentContaining(CopyIdx);
+ if (SS != S.end() && SlotIndex::isSameInstr(SS->start, SS->end)) {
+ S.removeSegment(*SS, true);
+ continue;
+ }
+ // The subrange may have ended before FillerStart. If so, extend it.
+ if (!S.getVNInfoAt(FillerStart)) {
+ SlotIndex BBStart =
+ LIS->getMBBStartIdx(LIS->getMBBFromIndex(FillerStart));
+ S.extendInBlock(BBStart, FillerStart);
+ }
+ VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+ S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
+ VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
+ if (SubBValNo != SubValSNo)
+ S.MergeValueNumberInto(SubBValNo, SubValSNo);
+ }
+
+ LLVM_DEBUG(dbgs() << " result = " << IntB << '\n');
+
+ // If the source instruction was killing the source register before the
+ // merge, unset the isKill marker given the live range has been extended.
+ int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg(), true);
+ if (UIdx != -1) {
+ ValSEndInst->getOperand(UIdx).setIsKill(false);
+ }
+
+ // Rewrite the copy.
+ CopyMI->substituteRegister(IntA.reg(), IntB.reg(), 0, *TRI);
+ // If the copy instruction was killing the destination register or any
+ // subrange before the merge trim the live range.
+ bool RecomputeLiveRange = AS->end == CopyIdx;
+ if (!RecomputeLiveRange) {
+ for (LiveInterval::SubRange &S : IntA.subranges()) {
+ LiveInterval::iterator SS = S.FindSegmentContaining(CopyUseIdx);
+ if (SS != S.end() && SS->end == CopyIdx) {
+ RecomputeLiveRange = true;
+ break;
+ }
+ }
+ }
+ if (RecomputeLiveRange)
+ shrinkToUses(&IntA);
+
+ ++numExtends;
+ return true;
+}
+
+bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
+ LiveInterval &IntB,
+ VNInfo *AValNo,
+ VNInfo *BValNo) {
+ // If AValNo has PHI kills, conservatively assume that IntB defs can reach
+ // the PHI values.
+ if (LIS->hasPHIKill(IntA, AValNo))
+ return true;
+
+ for (LiveRange::Segment &ASeg : IntA.segments) {
+ if (ASeg.valno != AValNo) continue;
+ LiveInterval::iterator BI = llvm::upper_bound(IntB, ASeg.start);
+ if (BI != IntB.begin())
+ --BI;
+ for (; BI != IntB.end() && ASeg.end >= BI->start; ++BI) {
+ if (BI->valno == BValNo)
+ continue;
+ if (BI->start <= ASeg.start && BI->end > ASeg.start)
+ return true;
+ if (BI->start > ASeg.start && BI->start < ASeg.end)
+ return true;
+ }
+ }
+ return false;
+}
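
The overlap test above reduces, for non-empty segments, to the standard half-open interval check; a self-contained toy version with hypothetical types (not LLVM's LiveRange/VNInfo):

#include <vector>

struct ToySeg {
  unsigned Start, End;  // half-open [Start, End)
  unsigned ValNo;       // toy value number
};

// Returns true if any B segment carrying a value number other than BVal
// overlaps an A segment carrying value number AVal.
static bool hasOtherReachingDefsToy(const std::vector<ToySeg> &A,
                                    const std::vector<ToySeg> &B,
                                    unsigned AVal, unsigned BVal) {
  for (const ToySeg &AS : A) {
    if (AS.ValNo != AVal)
      continue;
    for (const ToySeg &BS : B) {
      if (BS.ValNo == BVal)
        continue;
      if (BS.Start < AS.End && AS.Start < BS.End)  // half-open overlap
        return true;
    }
  }
  return false;
}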
+
+/// Copy segments with value number @p SrcValNo from liverange @p Src to live
+ /// range @p Dst and use value number @p DstValNo there.
+static std::pair<bool,bool>
+addSegmentsWithValNo(LiveRange &Dst, VNInfo *DstValNo, const LiveRange &Src,
+ const VNInfo *SrcValNo) {
+ bool Changed = false;
+ bool MergedWithDead = false;
+ for (const LiveRange::Segment &S : Src.segments) {
+ if (S.valno != SrcValNo)
+ continue;
+ // This is adding a segment from Src that ends in a copy that is about
+ // to be removed. This segment is going to be merged with a pre-existing
+ // segment in Dst. This works, except in cases when the corresponding
+ // segment in Dst is dead. For example: adding [192r,208r:1) from Src
+ // to [208r,208d:1) in Dst would create [192r,208d:1) in Dst.
+ // Recognize such cases, so that the segments can be shrunk.
+ LiveRange::Segment Added = LiveRange::Segment(S.start, S.end, DstValNo);
+ LiveRange::Segment &Merged = *Dst.addSegment(Added);
+ if (Merged.end.isDead())
+ MergedWithDead = true;
+ Changed = true;
+ }
+ return std::make_pair(Changed, MergedWithDead);
+}
+
+std::pair<bool,bool>
+RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
+ MachineInstr *CopyMI) {
+ assert(!CP.isPhys());
+
+ LiveInterval &IntA =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LiveInterval &IntB =
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+ // We found a non-trivially-coalescable copy with IntA being the source and
+ // IntB being the dest, thus this defines a value number in IntB. If the
+ // source value number (in IntA) is defined by a commutable instruction and
+ // its other operand is coalesced to the copy dest register, see if we can
+ // transform the copy into a noop by commuting the definition. For example,
+ //
+ // A3 = op A2 killed B0
+ // ...
+ // B1 = A3 <- this copy
+ // ...
+ // = op A3 <- more uses
+ //
+ // ==>
+ //
+ // B2 = op B0 killed A2
+ // ...
+ // B1 = B2 <- now an identity copy
+ // ...
+ // = op B2 <- more uses
+
+ // BValNo is a value number in B that is defined by a copy from A. 'B1' in
+ // the example above.
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+ VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
+ assert(BValNo != nullptr && BValNo->def == CopyIdx);
+
+ // AValNo is the value number in A that defines the copy, A3 in the example.
+ VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
+ assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+ if (AValNo->isPHIDef())
+ return { false, false };
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
+ if (!DefMI)
+ return { false, false };
+ if (!DefMI->isCommutable())
+ return { false, false };
+ // If DefMI is a two-address instruction then commuting it will change the
+ // destination register.
+ int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg());
+ assert(DefIdx != -1);
+ unsigned UseOpIdx;
+ if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
+ return { false, false };
+
+ // FIXME: The code below tries to commute 'UseOpIdx' operand with some other
+ // commutable operand which is expressed by the 'CommuteAnyOperandIndex' value
+ // passed to the method. That _other_ operand is chosen by
+ // the findCommutedOpIndices() method.
+ //
+ // That is obviously an area for improvement in case of instructions having
+ // more than 2 operands. For example, if some instruction has 3 commutable
+ // operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3,
+ // op#2<->op#3) of commute transformation should be considered/tried here.
+ unsigned NewDstIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+ if (!TII->findCommutedOpIndices(*DefMI, UseOpIdx, NewDstIdx))
+ return { false, false };
+
+ MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
+ Register NewReg = NewDstMO.getReg();
+ if (NewReg != IntB.reg() || !IntB.Query(AValNo->def).isKill())
+ return { false, false };
+
+ // Make sure there are no other definitions of IntB that would reach the
+ // uses which the new definition can reach.
+ if (hasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
+ return { false, false };
+
+ // If some of the uses of IntA.reg are already coalesced away, return false.
+ // It's not possible to determine whether it's safe to perform the coalescing.
+ for (MachineOperand &MO : MRI->use_nodbg_operands(IntA.reg())) {
+ MachineInstr *UseMI = MO.getParent();
+ unsigned OpNo = &MO - &UseMI->getOperand(0);
+ SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI);
+ LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+ if (US == IntA.end() || US->valno != AValNo)
+ continue;
+ // If this use is tied to a def, we can't rewrite the register.
+ if (UseMI->isRegTiedToDefOperand(OpNo))
+ return { false, false };
+ }
+
+ LLVM_DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
+ << *DefMI);
+
+ // At this point we have decided that it is legal to do this
+ // transformation. Start by commuting the instruction.
+ MachineBasicBlock *MBB = DefMI->getParent();
+ MachineInstr *NewMI =
+ TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx);
+ if (!NewMI)
+ return { false, false };
+ if (IntA.reg().isVirtual() && IntB.reg().isVirtual() &&
+ !MRI->constrainRegClass(IntB.reg(), MRI->getRegClass(IntA.reg())))
+ return { false, false };
+ if (NewMI != DefMI) {
+ LIS->ReplaceMachineInstrInMaps(*DefMI, *NewMI);
+ MachineBasicBlock::iterator Pos = DefMI;
+ MBB->insert(Pos, NewMI);
+ MBB->erase(DefMI);
+ }
+
+ // If ALR and BLR overlap and the end of BLR extends beyond the end of ALR, e.g.
+ // A = or A, B
+ // ...
+ // B = A
+ // ...
+ // C = killed A
+ // ...
+ // = B
+
+ // Update uses of IntA of the specific Val# with IntB.
+ for (MachineOperand &UseMO :
+ llvm::make_early_inc_range(MRI->use_operands(IntA.reg()))) {
+ if (UseMO.isUndef())
+ continue;
+ MachineInstr *UseMI = UseMO.getParent();
+ if (UseMI->isDebugInstr()) {
+ // FIXME: These don't have an instruction index. It is not clear we have
+ // enough info to decide whether to do this replacement or not. For now, do it.
+ UseMO.setReg(NewReg);
+ continue;
+ }
+ SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(true);
+ LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+ assert(US != IntA.end() && "Use must be live");
+ if (US->valno != AValNo)
+ continue;
+ // Kill flags are no longer accurate. They are recomputed after RA.
+ UseMO.setIsKill(false);
+ if (NewReg.isPhysical())
+ UseMO.substPhysReg(NewReg, *TRI);
+ else
+ UseMO.setReg(NewReg);
+ if (UseMI == CopyMI)
+ continue;
+ if (!UseMI->isCopy())
+ continue;
+ if (UseMI->getOperand(0).getReg() != IntB.reg() ||
+ UseMI->getOperand(0).getSubReg())
+ continue;
+
+ // This copy will become a noop. If it's defining a new val#, merge it into
+ // BValNo.
+ SlotIndex DefIdx = UseIdx.getRegSlot();
+ VNInfo *DVNI = IntB.getVNInfoAt(DefIdx);
+ if (!DVNI)
+ continue;
+ LLVM_DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
+ assert(DVNI->def == DefIdx);
+ BValNo = IntB.MergeValueNumberInto(DVNI, BValNo);
+ for (LiveInterval::SubRange &S : IntB.subranges()) {
+ VNInfo *SubDVNI = S.getVNInfoAt(DefIdx);
+ if (!SubDVNI)
+ continue;
+ VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+ assert(SubBValNo->def == CopyIdx);
+ S.MergeValueNumberInto(SubDVNI, SubBValNo);
+ }
+
+ deleteInstr(UseMI);
+ }
+
+ // Extend BValNo by merging in IntA live segments of AValNo. Val# definition
+ // is updated.
+ bool ShrinkB = false;
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ if (IntA.hasSubRanges() || IntB.hasSubRanges()) {
+ if (!IntA.hasSubRanges()) {
+ LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntA.reg());
+ IntA.createSubRangeFrom(Allocator, Mask, IntA);
+ } else if (!IntB.hasSubRanges()) {
+ LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntB.reg());
+ IntB.createSubRangeFrom(Allocator, Mask, IntB);
+ }
+ SlotIndex AIdx = CopyIdx.getRegSlot(true);
+ LaneBitmask MaskA;
+ const SlotIndexes &Indexes = *LIS->getSlotIndexes();
+ for (LiveInterval::SubRange &SA : IntA.subranges()) {
+ VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
+ // Even if we are dealing with a full copy, some lanes can
+ // still be undefined.
+ // E.g.,
+ // undef A.subLow = ...
+ // B = COPY A <== A.subHigh is undefined here and does
+ // not have a value number.
+ if (!ASubValNo)
+ continue;
+ MaskA |= SA.LaneMask;
+
+ IntB.refineSubRanges(
+ Allocator, SA.LaneMask,
+ [&Allocator, &SA, CopyIdx, ASubValNo,
+ &ShrinkB](LiveInterval::SubRange &SR) {
+ VNInfo *BSubValNo = SR.empty() ? SR.getNextValue(CopyIdx, Allocator)
+ : SR.getVNInfoAt(CopyIdx);
+ assert(BSubValNo != nullptr);
+ auto P = addSegmentsWithValNo(SR, BSubValNo, SA, ASubValNo);
+ ShrinkB |= P.second;
+ if (P.first)
+ BSubValNo->def = ASubValNo->def;
+ },
+ Indexes, *TRI);
+ }
+ // Go over all subranges of IntB that have not been covered by IntA,
+ // and delete the segments starting at CopyIdx. This can happen if
+ // IntA has undef lanes that are defined in IntB.
+ for (LiveInterval::SubRange &SB : IntB.subranges()) {
+ if ((SB.LaneMask & MaskA).any())
+ continue;
+ if (LiveRange::Segment *S = SB.getSegmentContaining(CopyIdx))
+ if (S->start.getBaseIndex() == CopyIdx.getBaseIndex())
+ SB.removeSegment(*S, true);
+ }
+ }
+
+ BValNo->def = AValNo->def;
+ auto P = addSegmentsWithValNo(IntB, BValNo, IntA, AValNo);
+ ShrinkB |= P.second;
+ LLVM_DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
+
+ LIS->removeVRegDefAt(IntA, AValNo->def);
+
+ LLVM_DEBUG(dbgs() << "\t\ttrimmed: " << IntA << '\n');
+ ++numCommutes;
+ return { true, ShrinkB };
+}
+
+/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a
+/// predecessor of BB2, and if B is not redefined on the way from A = B
+/// in BB0 to B = A in BB2, B = A in BB2 is partially redundant if the
+/// execution goes through the path from BB0 to BB2. We may move B = A
+ /// to the predecessor that lacks such a reversed copy.
+/// So we will transform the program from:
+/// BB0:
+/// A = B; BB1:
+/// ... ...
+/// / \ /
+/// BB2:
+/// ...
+/// B = A;
+///
+/// to:
+///
+/// BB0: BB1:
+/// A = B; ...
+/// ... B = A;
+/// / \ /
+/// BB2:
+/// ...
+///
+/// A special case is when BB0 and BB2 are the same BB which is the only
+/// BB in a loop:
+/// BB1:
+/// ...
+/// BB0/BB2: ----
+/// B = A; |
+/// ... |
+/// A = B; |
+/// |-------
+/// |
+/// We may hoist B = A from BB0/BB2 to BB1.
+///
+ /// The major preconditions for correctly removing such partial
+/// redundancy include:
+/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of
+/// the PHI is defined by the reversed copy A = B in BB0.
+/// 2. No B is referenced from the start of BB2 to B = A.
+/// 3. No B is defined from A = B to the end of BB0.
+/// 4. BB1 has only one successor.
+///
+/// 2 and 4 implicitly ensure B is not live at the end of BB1.
+/// 4 guarantees BB2 is hotter than BB1, so we can only move a copy to a
+ /// colder place, which not only prevents endless loops, but also makes sure
+ /// the movement of the copy is beneficial.
+bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
+ MachineInstr &CopyMI) {
+ assert(!CP.isPhys());
+ if (!CopyMI.isFullCopy())
+ return false;
+
+ MachineBasicBlock &MBB = *CopyMI.getParent();
+ // If this block is the target of an invoke/inlineasm_br, moving the copy into
+ // the predecessor is trickier, and we don't handle it.
+ if (MBB.isEHPad() || MBB.isInlineAsmBrIndirectTarget())
+ return false;
+
+ if (MBB.pred_size() != 2)
+ return false;
+
+ LiveInterval &IntA =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LiveInterval &IntB =
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+ // A is defined by PHI at the entry of MBB.
+ SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
+ VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx);
+ assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+ if (!AValNo->isPHIDef())
+ return false;
+
+ // No B is referenced before CopyMI in MBB.
+ if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx))
+ return false;
+
+ // MBB has two predecessors: one contains A = B so no copy will be inserted
+ // for it. The other one will have a copy moved from MBB.
+ bool FoundReverseCopy = false;
+ MachineBasicBlock *CopyLeftBB = nullptr;
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred));
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def);
+ if (!DefMI || !DefMI->isFullCopy()) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ // Check DefMI is a reverse copy and it is in BB Pred.
+ if (DefMI->getOperand(0).getReg() != IntA.reg() ||
+ DefMI->getOperand(1).getReg() != IntB.reg() ||
+ DefMI->getParent() != Pred) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ // If there is any other def of B after DefMI and before the end of Pred,
+ // we need to keep the copy of B = A at the end of Pred if we remove
+ // B = A from MBB.
+ bool ValB_Changed = false;
+ for (auto *VNI : IntB.valnos) {
+ if (VNI->isUnused())
+ continue;
+ if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) {
+ ValB_Changed = true;
+ break;
+ }
+ }
+ if (ValB_Changed) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ FoundReverseCopy = true;
+ }
+
+ // If no reverse copy is found in predecessors, nothing to do.
+ if (!FoundReverseCopy)
+ return false;
+
+ // If CopyLeftBB is nullptr, it means every predecessor of MBB contains a
+ // reverse copy, so CopyMI can be removed trivially once IntA/IntB are updated.
+ // If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB and
+ // update IntA/IntB.
+ //
+ // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single succ so
+ // MBB is hotter than CopyLeftBB.
+ if (CopyLeftBB && CopyLeftBB->succ_size() > 1)
+ return false;
+
+ // Now (almost sure it's) ok to move copy.
+ if (CopyLeftBB) {
+ // Position in CopyLeftBB where we should insert new copy.
+ auto InsPos = CopyLeftBB->getFirstTerminator();
+
+ // Make sure that B isn't referenced in the terminators (if any) at the end
+ // of the predecessor since we're about to insert a new definition of B
+ // before them.
+ if (InsPos != CopyLeftBB->end()) {
+ SlotIndex InsPosIdx = LIS->getInstructionIndex(*InsPos).getRegSlot(true);
+ if (IntB.overlaps(InsPosIdx, LIS->getMBBEndIdx(CopyLeftBB)))
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to "
+ << printMBBReference(*CopyLeftBB) << '\t' << CopyMI);
+
+ // Insert new copy to CopyLeftBB.
+ MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), IntB.reg())
+ .addReg(IntA.reg());
+ SlotIndex NewCopyIdx =
+ LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot();
+ IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+ for (LiveInterval::SubRange &SR : IntB.subranges())
+ SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+
+ // If the newly created instruction has the address of an instruction that was
+ // deleted before (the object was recycled by the allocator), it needs to be
+ // removed from the deleted list.
+ ErasedInstrs.erase(NewCopyMI);
+ } else {
+ LLVM_DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from "
+ << printMBBReference(MBB) << '\t' << CopyMI);
+ }
+
+ // Remove CopyMI.
+ // Note: It is fine to remove the copy before updating the live-ranges.
+ // While updating the live-ranges, we only look at slot indices and
+ // never go back to the instruction.
+ // Mark instructions as deleted.
+ deleteInstr(&CopyMI);
+
+ // Update the liveness.
+ SmallVector<SlotIndex, 8> EndPoints;
+ VNInfo *BValNo = IntB.Query(CopyIdx).valueOutOrDead();
+ LIS->pruneValue(*static_cast<LiveRange *>(&IntB), CopyIdx.getRegSlot(),
+ &EndPoints);
+ BValNo->markUnused();
+ // Extend IntB to the EndPoints of its original live interval.
+ LIS->extendToIndices(IntB, EndPoints);
+
+ // Now, do the same for its subranges.
+ for (LiveInterval::SubRange &SR : IntB.subranges()) {
+ EndPoints.clear();
+ VNInfo *BValNo = SR.Query(CopyIdx).valueOutOrDead();
+ assert(BValNo && "All sublanes should be live");
+ LIS->pruneValue(SR, CopyIdx.getRegSlot(), &EndPoints);
+ BValNo->markUnused();
+ // We can have a situation where the result of the original copy is live,
+ // but is immediately dead in this subrange, e.g. [336r,336d:0). That makes
+ // the copy appear as an endpoint from pruneValue(), but we don't want it
+ // to because the copy has been removed. We can go ahead and remove that
+ // endpoint; there is no other situation here in which there could be a use
+ // at the same place, as we know that the copy is a full copy.
+ for (unsigned I = 0; I != EndPoints.size(); ) {
+ if (SlotIndex::isSameInstr(EndPoints[I], CopyIdx)) {
+ EndPoints[I] = EndPoints.back();
+ EndPoints.pop_back();
+ continue;
+ }
+ ++I;
+ }
+ SmallVector<SlotIndex, 8> Undefs;
+ IntB.computeSubRangeUndefs(Undefs, SR.LaneMask, *MRI,
+ *LIS->getSlotIndexes());
+ LIS->extendToIndices(SR, EndPoints, Undefs);
+ }
+ // If any dead defs were extended, truncate them.
+ shrinkToUses(&IntB);
+
+ // Finally, update the live-range of IntA.
+ shrinkToUses(&IntA);
+ return true;
+}
+
+/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
+/// defining a subregister.
+static bool definesFullReg(const MachineInstr &MI, Register Reg) {
+ assert(!Reg.isPhysical() && "This code cannot handle physreg aliasing");
+
+ for (const MachineOperand &Op : MI.all_defs()) {
+ if (Op.getReg() != Reg)
+ continue;
+ // Return true if we define the full register or don't care about the value
+ // inside other subregisters.
+ if (Op.getSubReg() == 0 || Op.isUndef())
+ return true;
+ }
+ return false;
+}
+
+bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
+ MachineInstr *CopyMI,
+ bool &IsDefCopy) {
+ IsDefCopy = false;
+ Register SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg();
+ unsigned SrcIdx = CP.isFlipped() ? CP.getDstIdx() : CP.getSrcIdx();
+ Register DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg();
+ unsigned DstIdx = CP.isFlipped() ? CP.getSrcIdx() : CP.getDstIdx();
+ if (SrcReg.isPhysical())
+ return false;
+
+ LiveInterval &SrcInt = LIS->getInterval(SrcReg);
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
+ VNInfo *ValNo = SrcInt.Query(CopyIdx).valueIn();
+ if (!ValNo)
+ return false;
+ if (ValNo->isPHIDef() || ValNo->isUnused())
+ return false;
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(ValNo->def);
+ if (!DefMI)
+ return false;
+ if (DefMI->isCopyLike()) {
+ IsDefCopy = true;
+ return false;
+ }
+ if (!TII->isAsCheapAsAMove(*DefMI))
+ return false;
+
+ SmallVector<Register, 8> NewRegs;
+ LiveRangeEdit Edit(&SrcInt, NewRegs, *MF, *LIS, nullptr, this);
+ if (!Edit.checkRematerializable(ValNo, DefMI))
+ return false;
+
+ if (!definesFullReg(*DefMI, SrcReg))
+ return false;
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(AA, SawStore))
+ return false;
+ const MCInstrDesc &MCID = DefMI->getDesc();
+ if (MCID.getNumDefs() != 1)
+ return false;
+ // Only support subregister destinations when the def is read-undef.
+ MachineOperand &DstOperand = CopyMI->getOperand(0);
+ Register CopyDstReg = DstOperand.getReg();
+ if (DstOperand.getSubReg() && !DstOperand.isUndef())
+ return false;
+
+ // If both SrcIdx and DstIdx are set, correct rematerialization would widen
+ // the register substantially (beyond both source and dest size). This is bad
+ // for performance since it can cascade through a function, introducing many
+ // extra spills and fills (e.g. ARM can easily end up copying QQQQPR registers
+ // around after a few subreg copies).
+ if (SrcIdx && DstIdx)
+ return false;
+
+ const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
+ if (!DefMI->isImplicitDef()) {
+ if (DstReg.isPhysical()) {
+ Register NewDstReg = DstReg;
+
+ unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
+ DefMI->getOperand(0).getSubReg());
+ if (NewDstIdx)
+ NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);
+
+ // Finally, make sure that the physical subregister that will be
+ // constructed later is permitted for the instruction.
+ if (!DefRC->contains(NewDstReg))
+ return false;
+ } else {
+ // Theoretically, some stack frame reference could exist. Just make sure
+ // it hasn't actually happened.
+ assert(DstReg.isVirtual() &&
+ "Only expect to deal with virtual or physical registers");
+ }
+ }
+
+ LiveRangeEdit::Remat RM(ValNo);
+ RM.OrigMI = DefMI;
+ if (!Edit.canRematerializeAt(RM, ValNo, CopyIdx, true))
+ return false;
+
+ DebugLoc DL = CopyMI->getDebugLoc();
+ MachineBasicBlock *MBB = CopyMI->getParent();
+ MachineBasicBlock::iterator MII =
+ std::next(MachineBasicBlock::iterator(CopyMI));
+ Edit.rematerializeAt(*MBB, MII, DstReg, RM, *TRI, false, SrcIdx, CopyMI);
+ MachineInstr &NewMI = *std::prev(MII);
+ NewMI.setDebugLoc(DL);
+
+ // In a situation like the following:
+ // %0:subreg = instr ; DefMI, subreg = DstIdx
+ // %1 = copy %0:subreg ; CopyMI, SrcIdx = 0
+ // instead of widening %1 to the register class of %0 simply do:
+ // %1 = instr
+ const TargetRegisterClass *NewRC = CP.getNewRC();
+ if (DstIdx != 0) {
+ MachineOperand &DefMO = NewMI.getOperand(0);
+ if (DefMO.getSubReg() == DstIdx) {
+ assert(SrcIdx == 0 && CP.isFlipped()
+ && "Shouldn't have SrcIdx+DstIdx at this point");
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+ const TargetRegisterClass *CommonRC =
+ TRI->getCommonSubClass(DefRC, DstRC);
+ if (CommonRC != nullptr) {
+ NewRC = CommonRC;
+
+ // Instruction might contain "undef %0:subreg" as use operand:
+ // %0:subreg = instr op_1, ..., op_N, undef %0:subreg, op_N+2, ...
+ //
+ // Need to check all operands.
+ for (MachineOperand &MO : NewMI.operands()) {
+ if (MO.isReg() && MO.getReg() == DstReg && MO.getSubReg() == DstIdx) {
+ MO.setSubReg(0);
+ }
+ }
+
+ DstIdx = 0;
+ DefMO.setIsUndef(false); // Only subregs can have def+undef.
+ }
+ }
+ }
+
+ // CopyMI may have implicit operands; save them so that we can transfer them
+ // over to the newly materialized instruction after CopyMI is removed.
+ SmallVector<MachineOperand, 4> ImplicitOps;
+ ImplicitOps.reserve(CopyMI->getNumOperands() -
+ CopyMI->getDesc().getNumOperands());
+ for (unsigned I = CopyMI->getDesc().getNumOperands(),
+ E = CopyMI->getNumOperands();
+ I != E; ++I) {
+ MachineOperand &MO = CopyMI->getOperand(I);
+ if (MO.isReg()) {
+ assert(MO.isImplicit() && "No explicit operands after implicit operands.");
+ // Discard VReg implicit defs.
+ if (MO.getReg().isPhysical())
+ ImplicitOps.push_back(MO);
+ }
+ }
+
+ CopyMI->eraseFromParent();
+ ErasedInstrs.insert(CopyMI);
+
+ // NewMI may have dead implicit defs (e.g. EFLAGS for MOV<bits>r0 on X86).
+ // We need to remember these so we can add intervals once we insert
+ // NewMI into SlotIndexes.
+ SmallVector<MCRegister, 4> NewMIImplDefs;
+ for (unsigned i = NewMI.getDesc().getNumOperands(),
+ e = NewMI.getNumOperands();
+ i != e; ++i) {
+ MachineOperand &MO = NewMI.getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ assert(MO.isImplicit() && MO.isDead() && MO.getReg().isPhysical());
+ NewMIImplDefs.push_back(MO.getReg().asMCReg());
+ }
+ }
+
+ if (DstReg.isVirtual()) {
+ unsigned NewIdx = NewMI.getOperand(0).getSubReg();
+
+ if (DefRC != nullptr) {
+ if (NewIdx)
+ NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx);
+ else
+ NewRC = TRI->getCommonSubClass(NewRC, DefRC);
+ assert(NewRC && "subreg chosen for remat incompatible with instruction");
+ }
+ // Remap subranges to new lanemask and change register class.
+ LiveInterval &DstInt = LIS->getInterval(DstReg);
+ for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+ SR.LaneMask = TRI->composeSubRegIndexLaneMask(DstIdx, SR.LaneMask);
+ }
+ MRI->setRegClass(DstReg, NewRC);
+
+ // Update machine operands and add flags.
+ updateRegDefsUses(DstReg, DstReg, DstIdx);
+ NewMI.getOperand(0).setSubReg(NewIdx);
+ // updateRegDefsUses can add an "undef" flag to the definition, since
+ // it will replace DstReg with DstReg.DstIdx. If NewIdx is 0, make
+ // sure that "undef" is not set.
+ if (NewIdx == 0)
+ NewMI.getOperand(0).setIsUndef(false);
+ // Add dead subregister definitions if we are defining the whole register
+ // but only part of it is live.
+ // This could happen if the rematerialization instruction is rematerializing
+ // more than is actually used in the register.
+ // An example would be:
+ // %1 = LOAD CONSTANTS 5, 8 ; Loading both 5 and 8 in different subregs
+ // ; Copying only part of the register here, but the rest is undef.
+ // %2:sub_16bit<def, read-undef> = COPY %1:sub_16bit
+ // ==>
+ // ; Materialize all the constants but only using one
+ // %2 = LOAD_CONSTANTS 5, 8
+ //
+ // at this point, for the part that wasn't defined before, we could have
+ // subranges missing the definition.
+ if (NewIdx == 0 && DstInt.hasSubRanges()) {
+ SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
+ SlotIndex DefIndex =
+ CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
+ LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(DstReg);
+ VNInfo::Allocator& Alloc = LIS->getVNInfoAllocator();
+ for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+ if (!SR.liveAt(DefIndex))
+ SR.createDeadDef(DefIndex, Alloc);
+ MaxMask &= ~SR.LaneMask;
+ }
+ if (MaxMask.any()) {
+ LiveInterval::SubRange *SR = DstInt.createSubRange(Alloc, MaxMask);
+ SR->createDeadDef(DefIndex, Alloc);
+ }
+ }
+
+ // Make sure that the subrange for resultant undef is removed
+ // For example:
+ // %1:sub1<def,read-undef> = LOAD CONSTANT 1
+ // %2 = COPY %1
+ // ==>
+ // %2:sub1<def, read-undef> = LOAD CONSTANT 1
+ // ; Correct but need to remove the subrange for %2:sub0
+ // ; as it is now undef
+ if (NewIdx != 0 && DstInt.hasSubRanges()) {
+ // The affected subregister segments can be removed.
+ SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
+ LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(NewIdx);
+ bool UpdatedSubRanges = false;
+ SlotIndex DefIndex =
+ CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
+ VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
+ for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+ if ((SR.LaneMask & DstMask).none()) {
+ LLVM_DEBUG(dbgs()
+ << "Removing undefined SubRange "
+ << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
+
+ if (VNInfo *RmValNo = SR.getVNInfoAt(CurrIdx.getRegSlot())) {
+ // VNI is in ValNo - remove any segments in this SubRange that have
+ // this ValNo
+ SR.removeValNo(RmValNo);
+ }
+
+ // We may not have a defined value at this point, but still need to
+ // clear out any empty subranges tentatively created by
+ // updateRegDefsUses. The original subrange def may have only undefed
+ // some lanes.
+ UpdatedSubRanges = true;
+ } else {
+ // We know that this lane is defined by this instruction,
+ // but at this point it may be empty because it is not used by
+ // anything. This happens when updateRegDefsUses adds the missing
+ // lanes. Assign that lane a dead def so that the interferences
+ // are properly modeled.
+ if (SR.empty())
+ SR.createDeadDef(DefIndex, Alloc);
+ }
+ }
+ if (UpdatedSubRanges)
+ DstInt.removeEmptySubRanges();
+ }
+ } else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
+ // The new instruction may be defining a sub-register of what's actually
+ // been asked for. If so it must implicitly define the whole thing.
+ assert(DstReg.isPhysical() &&
+ "Only expect virtual or physical registers in remat");
+ NewMI.getOperand(0).setIsDead(true);
+ NewMI.addOperand(MachineOperand::CreateReg(
+ CopyDstReg, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/));
+ // Record small dead def live-ranges for all the subregisters
+ // of the destination register.
+ // Otherwise, variables that live through may miss some
+ // interferences, thus creating invalid allocation.
+ // E.g., i386 code:
+ // %1 = somedef ; %1 GR8
+ // %2 = remat ; %2 GR32
+ // CL = COPY %2.sub_8bit
+ // = somedef %1 ; %1 GR8
+ // =>
+ // %1 = somedef ; %1 GR8
+ // dead ECX = remat ; implicit-def CL
+ // = somedef %1 ; %1 GR8
+ // %1 will see the interferences with CL but not with CH since
+ // no live-ranges would have been created for ECX.
+ // Fix that!
+ SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
+ for (MCRegUnit Unit : TRI->regunits(NewMI.getOperand(0).getReg()))
+ if (LiveRange *LR = LIS->getCachedRegUnit(Unit))
+ LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
+ }
+
+ if (NewMI.getOperand(0).getSubReg())
+ NewMI.getOperand(0).setIsUndef();
+
+ // Transfer over implicit operands to the rematerialized instruction.
+ for (MachineOperand &MO : ImplicitOps)
+ NewMI.addOperand(MO);
+
+ SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
+ for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
+ MCRegister Reg = NewMIImplDefs[i];
+ for (MCRegUnit Unit : TRI->regunits(Reg))
+ if (LiveRange *LR = LIS->getCachedRegUnit(Unit))
+ LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
+ }
+
+ LLVM_DEBUG(dbgs() << "Remat: " << NewMI);
+ ++NumReMats;
+
+ // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
+ // to describe DstReg instead.
+ if (MRI->use_nodbg_empty(SrcReg)) {
+ for (MachineOperand &UseMO :
+ llvm::make_early_inc_range(MRI->use_operands(SrcReg))) {
+ MachineInstr *UseMI = UseMO.getParent();
+ if (UseMI->isDebugInstr()) {
+ if (DstReg.isPhysical())
+ UseMO.substPhysReg(DstReg, *TRI);
+ else
+ UseMO.setReg(DstReg);
+ // Move the debug value directly after the def of the rematerialized
+ // value in DstReg.
+ MBB->splice(std::next(NewMI.getIterator()), UseMI->getParent(), UseMI);
+ LLVM_DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
+ }
+ }
+ }
+
+ if (ToBeUpdated.count(SrcReg))
+ return true;
+
+ unsigned NumCopyUses = 0;
+ for (MachineOperand &UseMO : MRI->use_nodbg_operands(SrcReg)) {
+ if (UseMO.getParent()->isCopyLike())
+ NumCopyUses++;
+ }
+ if (NumCopyUses < LateRematUpdateThreshold) {
+ // The source interval can become smaller because we removed a use.
+ shrinkToUses(&SrcInt, &DeadDefs);
+ if (!DeadDefs.empty())
+ eliminateDeadDefs(&Edit);
+ } else {
+ ToBeUpdated.insert(SrcReg);
+ }
+ return true;
+}
+
+MachineInstr *RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
+ // ProcessImplicitDefs may leave some copies of <undef> values; it only
+ // removes local variables. When we have a copy like:
+ //
+ // %1 = COPY undef %2
+ //
+ // We delete the copy and remove the corresponding value number from %1.
+ // Any uses of that value number are marked as <undef>.
+
+ // Note that we do not query CoalescerPair here but redo isMoveInstr as the
+ // CoalescerPair may have a new register class with adjusted subreg indices
+ // at this point.
+ Register SrcReg, DstReg;
+ unsigned SrcSubIdx = 0, DstSubIdx = 0;
+ if (!isMoveInstr(*TRI, CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx))
+ return nullptr;
+
+ SlotIndex Idx = LIS->getInstructionIndex(*CopyMI);
+ const LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+ // CopyMI is undef iff SrcReg is not live before the instruction.
+ if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) {
+ LaneBitmask SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx);
+ for (const LiveInterval::SubRange &SR : SrcLI.subranges()) {
+ if ((SR.LaneMask & SrcMask).none())
+ continue;
+ if (SR.liveAt(Idx))
+ return nullptr;
+ }
+ } else if (SrcLI.liveAt(Idx))
+ return nullptr;
+
+ // If the undef copy defines a live-out value (i.e. an input to a PHI def),
+ // then replace it with an IMPLICIT_DEF.
+ LiveInterval &DstLI = LIS->getInterval(DstReg);
+ SlotIndex RegIndex = Idx.getRegSlot();
+ LiveRange::Segment *Seg = DstLI.getSegmentContaining(RegIndex);
+ assert(Seg != nullptr && "No segment for defining instruction");
+ VNInfo *V = DstLI.getVNInfoAt(Seg->end);
+
+ // The source interval may also have been on an undef use, in which case the
+ // copy introduced a live value.
+ if (((V && V->isPHIDef()) || (!V && !DstLI.liveAt(Idx)))) {
+ CopyMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+ for (unsigned i = CopyMI->getNumOperands(); i != 0; --i) {
+ MachineOperand &MO = CopyMI->getOperand(i-1);
+ if (MO.isReg() && MO.isUse())
+ CopyMI->removeOperand(i-1);
+ }
+ LLVM_DEBUG(dbgs() << "\tReplaced copy of <undef> value with an "
+ "implicit def\n");
+ return CopyMI;
+ }
+
+ // Remove any DstReg segments starting at the instruction.
+ LLVM_DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");
+
+ // Remove value or merge with previous one in case of a subregister def.
+ if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) {
+ VNInfo *VNI = DstLI.getVNInfoAt(RegIndex);
+ DstLI.MergeValueNumberInto(VNI, PrevVNI);
+
+ // The affected subregister segments can be removed.
+ LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx);
+ for (LiveInterval::SubRange &SR : DstLI.subranges()) {
+ if ((SR.LaneMask & DstMask).none())
+ continue;
+
+ VNInfo *SVNI = SR.getVNInfoAt(RegIndex);
+ assert(SVNI != nullptr && SlotIndex::isSameInstr(SVNI->def, RegIndex));
+ SR.removeValNo(SVNI);
+ }
+ DstLI.removeEmptySubRanges();
+ } else
+ LIS->removeVRegDefAt(DstLI, RegIndex);
+
+ // Mark uses as undef.
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(DstReg)) {
+ if (MO.isDef() /*|| MO.isUndef()*/)
+ continue;
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex UseIdx = LIS->getInstructionIndex(MI);
+ LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+ bool isLive;
+ if (!UseMask.all() && DstLI.hasSubRanges()) {
+ isLive = false;
+ for (const LiveInterval::SubRange &SR : DstLI.subranges()) {
+ if ((SR.LaneMask & UseMask).none())
+ continue;
+ if (SR.liveAt(UseIdx)) {
+ isLive = true;
+ break;
+ }
+ }
+ } else
+ isLive = DstLI.liveAt(UseIdx);
+ if (isLive)
+ continue;
+ MO.setIsUndef(true);
+ LLVM_DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI);
+ }
+
+ // A def of a subregister may be a use of the other subregisters, so
+ // deleting a def of a subregister may also remove uses. Since CopyMI
+ // is still part of the function (but about to be erased), mark all
+ // defs of DstReg in it as <undef>, so that shrinkToUses would
+ // ignore them.
+ for (MachineOperand &MO : CopyMI->all_defs())
+ if (MO.getReg() == DstReg)
+ MO.setIsUndef(true);
+ LIS->shrinkToUses(&DstLI);
+
+ return CopyMI;
+}
+
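+// Set the <undef> flag on MO when none of the lanes it reads from Int are live
+// at UseIdx. For a sub-register def the lanes read are the complement of the
+// lanes written, since a partial def is a read-modify-write of the remaining
+// lanes. Illustrative example (vreg and subreg names are arbitrary):
+//
+//   %0:sub1<def,read-undef> = FOO   ; only sub1 of %0 ever carries a value
+//   ... = BAR %0.sub0               ; no sub0 subrange is live here, so this
+//                                   ; use operand gets the <undef> flag
+//
+// If the whole register has no value leaving UseIdx, the main range may need
+// shrinking afterwards; that is requested via ShrinkMainRange.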
+void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
+ MachineOperand &MO, unsigned SubRegIdx) {
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ if (MO.isDef())
+ Mask = ~Mask;
+ bool IsUndef = true;
+ for (const LiveInterval::SubRange &S : Int.subranges()) {
+ if ((S.LaneMask & Mask).none())
+ continue;
+ if (S.liveAt(UseIdx)) {
+ IsUndef = false;
+ break;
+ }
+ }
+ if (IsUndef) {
+ MO.setIsUndef(true);
+ // We found out some subregister use is actually reading an undefined
+ // value. In some cases the whole vreg has become undefined at this
+ // point so we have to potentially shrink the main range if the
+ // use was ending a live segment there.
+ LiveQueryResult Q = Int.Query(UseIdx);
+ if (Q.valueOut() == nullptr)
+ ShrinkMainRange = true;
+ }
+}
+
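+// Rewrite every operand of SrcReg to use DstReg composed with SubIdx, fixing
+// <undef> flags along the way and lazily creating subranges on DstReg when
+// sub-register liveness is tracked. A minimal sketch of the rewrite, with
+// arbitrary vregs and assuming SubIdx == sub0:
+//
+//   %2 = FOO %1        ; before: operand uses SrcReg (%1)
+//   ==>
+//   %2 = FOO %3.sub0   ; after: operand uses DstReg (%3) through SubIdx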
+void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg,
+ unsigned SubIdx) {
+ bool DstIsPhys = DstReg.isPhysical();
+ LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
+
+ if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
+ for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg == 0 || MO.isUndef())
+ continue;
+ MachineInstr &MI = *MO.getParent();
+ if (MI.isDebugInstr())
+ continue;
+ SlotIndex UseIdx = LIS->getInstructionIndex(MI).getRegSlot(true);
+ addUndefFlag(*DstInt, UseIdx, MO, SubReg);
+ }
+ }
+
+ SmallPtrSet<MachineInstr*, 8> Visited;
+ for (MachineRegisterInfo::reg_instr_iterator
+ I = MRI->reg_instr_begin(SrcReg), E = MRI->reg_instr_end();
+ I != E; ) {
+ MachineInstr *UseMI = &*(I++);
+
+ // Each instruction can only be rewritten once because sub-register
+ // composition is not always idempotent. When SrcReg != DstReg, rewriting
+ // the UseMI operands removes them from the SrcReg use-def chain, but when
+ // SrcReg is DstReg we could encounter UseMI twice if it has multiple
+ // operands mentioning the virtual register.
+ if (SrcReg == DstReg && !Visited.insert(UseMI).second)
+ continue;
+
+ SmallVector<unsigned,8> Ops;
+ bool Reads, Writes;
+ std::tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops);
+
+ // If SrcReg wasn't read, it may still be the case that DstReg is live-in
+ // because SrcReg is a sub-register.
+ if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr())
+ Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
+
+ // Replace SrcReg with DstReg in all UseMI operands.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ MachineOperand &MO = UseMI->getOperand(Ops[i]);
+
+ // Adjust <undef> flags in case of sub-register joins. We don't want to
+ // turn a full def into a read-modify-write sub-register def and vice
+ // versa.
+ if (SubIdx && MO.isDef())
+ MO.setIsUndef(!Reads);
+
+ // A subreg use of a partially undef (super) register may be a complete
+ // undef use now and then has to be marked that way.
+ if (MO.isUse() && !DstIsPhys) {
+ unsigned SubUseIdx = TRI->composeSubRegIndices(SubIdx, MO.getSubReg());
+ if (SubUseIdx != 0 && MRI->shouldTrackSubRegLiveness(DstReg)) {
+ if (!DstInt->hasSubRanges()) {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ LaneBitmask FullMask = MRI->getMaxLaneMaskForVReg(DstInt->reg());
+ LaneBitmask UsedLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+ LaneBitmask UnusedLanes = FullMask & ~UsedLanes;
+ DstInt->createSubRangeFrom(Allocator, UsedLanes, *DstInt);
+ // The unused lanes are just empty live-ranges at this point.
+ // It is the caller's responsibility to set the proper
+ // dead segments if there is an actual dead def of the
+ // unused lanes. This may happen with rematerialization.
+ DstInt->createSubRange(Allocator, UnusedLanes);
+ }
+ SlotIndex MIIdx = UseMI->isDebugInstr()
+ ? LIS->getSlotIndexes()->getIndexBefore(*UseMI)
+ : LIS->getInstructionIndex(*UseMI);
+ SlotIndex UseIdx = MIIdx.getRegSlot(true);
+ addUndefFlag(*DstInt, UseIdx, MO, SubUseIdx);
+ }
+ }
+
+ if (DstIsPhys)
+ MO.substPhysReg(DstReg, *TRI);
+ else
+ MO.substVirtReg(DstReg, SubIdx, *TRI);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "\t\tupdated: ";
+ if (!UseMI->isDebugInstr())
+ dbgs() << LIS->getInstructionIndex(*UseMI) << "\t";
+ dbgs() << *UseMI;
+ });
+ }
+}
+
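+// Gate joins that involve a physical register: only a virtual register with a
+// single value may be merged into a reserved physreg (e.g. a stack pointer).
+// Anything more complex is rejected here; the caller may still rematerialize
+// the defining instruction instead.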
+bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
+ // Always join simple intervals that are defined by a single copy from a
+ // reserved register. This doesn't increase register pressure, so it is
+ // always beneficial.
+ if (!MRI->isReserved(CP.getDstReg())) {
+ LLVM_DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
+ return false;
+ }
+
+ LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
+ if (JoinVInt.containsOneValue())
+ return true;
+
+ LLVM_DEBUG(
+ dbgs() << "\tCannot join complex intervals into reserved register.\n");
+ return false;
+}
+
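+// Return true if the sub-lane value flowing into MBB is undefined, i.e. no
+// predecessor provides a live-out value for this subrange. A self loop that
+// merely feeds back the value defined here is tolerated.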
+bool RegisterCoalescer::copyValueUndefInPredecessors(
+ LiveRange &S, const MachineBasicBlock *MBB, LiveQueryResult SLRQ) {
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ SlotIndex PredEnd = LIS->getMBBEndIdx(Pred);
+ if (VNInfo *V = S.getVNInfoAt(PredEnd.getPrevSlot())) {
+ // If this is a self loop, we may be reading the same value.
+ if (V->id != SLRQ.valueOutOrDead()->id)
+ return false;
+ }
+ }
+
+ return true;
+}
+
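+// After pruning undef incoming lanes at an identity copy, mark the remaining
+// sub-register reads of those lanes as <undef> and shrink the live range so
+// the recorded liveness matches what the IR still reads.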
+void RegisterCoalescer::setUndefOnPrunedSubRegUses(LiveInterval &LI,
+ Register Reg,
+ LaneBitmask PrunedLanes) {
+ // If we had other instructions in the segment reading the undef sublane
+ // value, we need to mark them with undef.
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ unsigned SubRegIdx = MO.getSubReg();
+ if (SubRegIdx == 0 || MO.isUndef())
+ continue;
+
+ LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent());
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if (!S.liveAt(Pos) && (PrunedLanes & SubRegMask).any()) {
+ MO.setIsUndef();
+ break;
+ }
+ }
+ }
+
+ LI.removeEmptySubRanges();
+
+ // A def of a subregister may be a use of other register lanes. Replacing
+ // such a def with a def of a different register will eliminate the use,
+ // and may cause the recorded live range to be larger than the actual
+ // liveness in the program IR.
+ LIS->shrinkToUses(&LI);
+}
+
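+// Attempt to coalesce a single COPY. Roughly: set up the CoalescerPair, weed
+// out dead and <undef> copies, fold identity copies, then either join with a
+// reserved physreg or join the two virtual intervals. On failure, fall back to
+// rematerialization, adjustCopiesBackFrom, removeCopyByCommutingDef or
+// removePartialRedundancy; Again is set when a later retry might succeed.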
+bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
+ Again = false;
+ LLVM_DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
+
+ CoalescerPair CP(*TRI);
+ if (!CP.setRegisters(CopyMI)) {
+ LLVM_DEBUG(dbgs() << "\tNot coalescable.\n");
+ return false;
+ }
+
+ if (CP.getNewRC()) {
+ auto SrcRC = MRI->getRegClass(CP.getSrcReg());
+ auto DstRC = MRI->getRegClass(CP.getDstReg());
+ unsigned SrcIdx = CP.getSrcIdx();
+ unsigned DstIdx = CP.getDstIdx();
+ if (CP.isFlipped()) {
+ std::swap(SrcIdx, DstIdx);
+ std::swap(SrcRC, DstRC);
+ }
+ if (!TRI->shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
+ CP.getNewRC(), *LIS)) {
+ LLVM_DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
+ return false;
+ }
+ }
+
+ // Dead code elimination. This really should be handled by MachineDCE, but
+ // sometimes dead copies slip through, and we can't generate invalid live
+ // ranges.
+ if (!CP.isPhys() && CopyMI->allDefsAreDead()) {
+ LLVM_DEBUG(dbgs() << "\tCopy is dead.\n");
+ DeadDefs.push_back(CopyMI);
+ eliminateDeadDefs();
+ return true;
+ }
+
+ // Eliminate undefs.
+ if (!CP.isPhys()) {
+ // If this is an IMPLICIT_DEF, leave it alone, but don't try to coalesce.
+ if (MachineInstr *UndefMI = eliminateUndefCopy(CopyMI)) {
+ if (UndefMI->isImplicitDef())
+ return false;
+ deleteInstr(CopyMI);
+ return false; // Not coalescable.
+ }
+ }
+
+ // Coalesced copies are normally removed immediately, but transformations
+ // like removeCopyByCommutingDef() can inadvertently create identity copies.
+ // When that happens, just join the values and remove the copy.
+ if (CP.getSrcReg() == CP.getDstReg()) {
+ LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
+ LLVM_DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
+ const SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
+ LiveQueryResult LRQ = LI.Query(CopyIdx);
+ if (VNInfo *DefVNI = LRQ.valueDefined()) {
+ VNInfo *ReadVNI = LRQ.valueIn();
+ assert(ReadVNI && "No value before copy and no <undef> flag.");
+ assert(ReadVNI != DefVNI && "Cannot read and define the same value.");
+
+ // Track incoming undef lanes we need to eliminate from the subrange.
+ LaneBitmask PrunedLanes;
+ MachineBasicBlock *MBB = CopyMI->getParent();
+
+ // Process subregister liveranges.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ LiveQueryResult SLRQ = S.Query(CopyIdx);
+ if (VNInfo *SDefVNI = SLRQ.valueDefined()) {
+ if (VNInfo *SReadVNI = SLRQ.valueIn())
+ SDefVNI = S.MergeValueNumberInto(SDefVNI, SReadVNI);
+
+ // If this copy introduced an undef subrange from an incoming value,
+ // we need to eliminate the undef live in values from the subrange.
+ if (copyValueUndefInPredecessors(S, MBB, SLRQ)) {
+ LLVM_DEBUG(dbgs() << "Incoming sublane value is undef at copy\n");
+ PrunedLanes |= S.LaneMask;
+ S.removeValNo(SDefVNI);
+ }
+ }
+ }
+
+ LI.MergeValueNumberInto(DefVNI, ReadVNI);
+ if (PrunedLanes.any()) {
+ LLVM_DEBUG(dbgs() << "Pruning undef incoming lanes: "
+ << PrunedLanes << '\n');
+ setUndefOnPrunedSubRegUses(LI, CP.getSrcReg(), PrunedLanes);
+ }
+
+ LLVM_DEBUG(dbgs() << "\tMerged values: " << LI << '\n');
+ }
+ deleteInstr(CopyMI);
+ return true;
+ }
+
+ // Enforce policies.
+ if (CP.isPhys()) {
+ LLVM_DEBUG(dbgs() << "\tConsidering merging "
+ << printReg(CP.getSrcReg(), TRI) << " with "
+ << printReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n');
+ if (!canJoinPhys(CP)) {
+ // Before giving up coalescing, if definition of source is defined by
+ // trivial computation, try rematerializing it.
+ bool IsDefCopy = false;
+ if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
+ return true;
+ if (IsDefCopy)
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+ } else {
+ // When possible, let DstReg be the larger interval.
+ if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
+ LIS->getInterval(CP.getDstReg()).size())
+ CP.flip();
+
+ LLVM_DEBUG({
+ dbgs() << "\tConsidering merging to "
+ << TRI->getRegClassName(CP.getNewRC()) << " with ";
+ if (CP.getDstIdx() && CP.getSrcIdx())
+ dbgs() << printReg(CP.getDstReg()) << " in "
+ << TRI->getSubRegIndexName(CP.getDstIdx()) << " and "
+ << printReg(CP.getSrcReg()) << " in "
+ << TRI->getSubRegIndexName(CP.getSrcIdx()) << '\n';
+ else
+ dbgs() << printReg(CP.getSrcReg(), TRI) << " in "
+ << printReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n';
+ });
+ }
+
+ ShrinkMask = LaneBitmask::getNone();
+ ShrinkMainRange = false;
+
+ // Okay, attempt to join these two intervals. On failure, this returns false.
+ // Otherwise, if one of the intervals being joined is a physreg, this method
+ // always canonicalizes DstInt to be it. The output "SrcInt" will not have
+ // been modified, so we can use this information below to update aliases.
+ if (!joinIntervals(CP)) {
+ // Coalescing failed.
+
+ // If definition of source is defined by trivial computation, try
+ // rematerializing it.
+ bool IsDefCopy = false;
+ if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
+ return true;
+
+ // If we can eliminate the copy without merging the live segments, do so
+ // now.
+ if (!CP.isPartial() && !CP.isPhys()) {
+ bool Changed = adjustCopiesBackFrom(CP, CopyMI);
+ bool Shrink = false;
+ if (!Changed)
+ std::tie(Changed, Shrink) = removeCopyByCommutingDef(CP, CopyMI);
+ if (Changed) {
+ deleteInstr(CopyMI);
+ if (Shrink) {
+ Register DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg();
+ LiveInterval &DstLI = LIS->getInterval(DstReg);
+ shrinkToUses(&DstLI);
+ LLVM_DEBUG(dbgs() << "\t\tshrunk: " << DstLI << '\n');
+ }
+ LLVM_DEBUG(dbgs() << "\tTrivial!\n");
+ return true;
+ }
+ }
+
+ // Try and see if we can partially eliminate the copy by moving the copy to
+ // its predecessor.
+ if (!CP.isPartial() && !CP.isPhys())
+ if (removePartialRedundancy(CP, *CopyMI))
+ return true;
+
+ // Otherwise, we are unable to join the intervals.
+ LLVM_DEBUG(dbgs() << "\tInterference!\n");
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+
+ // Coalescing to a virtual register that is of a sub-register class of the
+ // other. Make sure the resulting register is set to the right register class.
+ if (CP.isCrossClass()) {
+ ++numCrossRCs;
+ MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
+ }
+
+ // Removing sub-register copies can ease the register class constraints.
+ // Make sure we attempt to inflate the register class of DstReg.
+ if (!CP.isPhys() && RegClassInfo.isProperSubClass(CP.getNewRC()))
+ InflateRegs.push_back(CP.getDstReg());
+
+ // CopyMI has been erased by joinIntervals at this point. Remove it from
+ // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
+ // to the work list. This keeps ErasedInstrs from growing needlessly.
+ ErasedInstrs.erase(CopyMI);
+
+ // Rewrite all SrcReg operands to DstReg.
+ // Also update DstReg operands to include DstIdx if it is set.
+ if (CP.getDstIdx())
+ updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
+ updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
+
+ // Shrink subregister ranges if necessary.
+ if (ShrinkMask.any()) {
+ LiveInterval &LI = LIS->getInterval(CP.getDstReg());
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & ShrinkMask).none())
+ continue;
+ LLVM_DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask)
+ << ")\n");
+ LIS->shrinkToUses(S, LI.reg());
+ ShrinkMainRange = true;
+ }
+ LI.removeEmptySubRanges();
+ }
+
+ // CP.getSrcReg()'s live interval has been merged into CP.getDstReg()'s live
+ // interval. Since CP.getSrcReg() is in the ToBeUpdated set and its live
+ // interval is not up-to-date, we need to update the merged live interval
+ // here.
+ if (ToBeUpdated.count(CP.getSrcReg()))
+ ShrinkMainRange = true;
+
+ if (ShrinkMainRange) {
+ LiveInterval &LI = LIS->getInterval(CP.getDstReg());
+ shrinkToUses(&LI);
+ }
+
+ // SrcReg is guaranteed to be the register whose live interval is being
+ // merged.
+ LIS->removeInterval(CP.getSrcReg());
+
+ // Update regalloc hint.
+ TRI->updateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
+
+ LLVM_DEBUG({
+ dbgs() << "\tSuccess: " << printReg(CP.getSrcReg(), TRI, CP.getSrcIdx())
+ << " -> " << printReg(CP.getDstReg(), TRI, CP.getDstIdx()) << '\n';
+ dbgs() << "\tResult = ";
+ if (CP.isPhys())
+ dbgs() << printReg(CP.getDstReg(), TRI);
+ else
+ dbgs() << LIS->getInterval(CP.getDstReg());
+ dbgs() << '\n';
+ });
+
+ ++numJoins;
+ return true;
+}
+
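+// Join a virtual register with a reserved physical register. The reserved
+// register's live range is just a set of dead defs, so no value merging is
+// needed: the identity copy is deleted, and when a vreg is copied into the
+// physreg, the physreg's dead def is moved up to the instruction that defined
+// the vreg.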
+bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
+ Register DstReg = CP.getDstReg();
+ Register SrcReg = CP.getSrcReg();
+ assert(CP.isPhys() && "Must be a physreg copy");
+ assert(MRI->isReserved(DstReg) && "Not a reserved register");
+ LiveInterval &RHS = LIS->getInterval(SrcReg);
+ LLVM_DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
+
+ assert(RHS.containsOneValue() && "Invalid join with reserved register");
+
+ // Optimization for reserved registers like ESP. We can only merge with a
+ // reserved physreg if RHS has a single value that is a copy of DstReg.
+ // The live range of the reserved register will look like a set of dead defs
+ // - we don't properly track the live range of reserved registers.
+
+ // Deny any overlapping intervals. This depends on all the reserved
+ // register live ranges looking like dead defs.
+ if (!MRI->isConstantPhysReg(DstReg)) {
+ for (MCRegUnit Unit : TRI->regunits(DstReg)) {
+ // Abort if not all the regunits are reserved.
+ for (MCRegUnitRootIterator RI(Unit, TRI); RI.isValid(); ++RI) {
+ if (!MRI->isReserved(*RI))
+ return false;
+ }
+ if (RHS.overlaps(LIS->getRegUnit(Unit))) {
+ LLVM_DEBUG(dbgs() << "\t\tInterference: " << printRegUnit(Unit, TRI)
+ << '\n');
+ return false;
+ }
+ }
+
+ // We must also check for overlaps with regmask clobbers.
+ BitVector RegMaskUsable;
+ if (LIS->checkRegMaskInterference(RHS, RegMaskUsable) &&
+ !RegMaskUsable.test(DstReg)) {
+ LLVM_DEBUG(dbgs() << "\t\tRegMask interference\n");
+ return false;
+ }
+ }
+
+ // Skip any value computations, we are not adding new values to the
+ // reserved register. Also skip merging the live ranges, the reserved
+ // register live range doesn't need to be accurate as long as all the
+ // defs are there.
+
+ // Delete the identity copy.
+ MachineInstr *CopyMI;
+ if (CP.isFlipped()) {
+ // Physreg is copied into vreg
+ // %y = COPY %physreg_x
+ // ... //< no other def of %physreg_x here
+ // use %y
+ // =>
+ // ...
+ // use %physreg_x
+ CopyMI = MRI->getVRegDef(SrcReg);
+ deleteInstr(CopyMI);
+ } else {
+ // VReg is copied into physreg:
+ // %y = def
+ // ... //< no other def or use of %physreg_x here
+ // %physreg_x = COPY %y
+ // =>
+ // %physreg_x = def
+ // ...
+ if (!MRI->hasOneNonDBGUse(SrcReg)) {
+ LLVM_DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
+ return false;
+ }
+
+ if (!LIS->intervalIsInOneMBB(RHS)) {
+ LLVM_DEBUG(dbgs() << "\t\tComplex control flow!\n");
+ return false;
+ }
+
+ MachineInstr &DestMI = *MRI->getVRegDef(SrcReg);
+ CopyMI = &*MRI->use_instr_nodbg_begin(SrcReg);
+ SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+ SlotIndex DestRegIdx = LIS->getInstructionIndex(DestMI).getRegSlot();
+
+ if (!MRI->isConstantPhysReg(DstReg)) {
+ // We checked above that there are no interfering defs of the physical
+ // register. However, for this case, where we intend to move up the def of
+ // the physical register, we also need to check for interfering uses.
+ SlotIndexes *Indexes = LIS->getSlotIndexes();
+ for (SlotIndex SI = Indexes->getNextNonNullIndex(DestRegIdx);
+ SI != CopyRegIdx; SI = Indexes->getNextNonNullIndex(SI)) {
+ MachineInstr *MI = LIS->getInstructionFromIndex(SI);
+ if (MI->readsRegister(DstReg, TRI)) {
+ LLVM_DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
+ return false;
+ }
+ }
+ }
+
+ // We're going to remove the copy which defines a physical reserved
+ // register, so remove its valno, etc.
+ LLVM_DEBUG(dbgs() << "\t\tRemoving phys reg def of "
+ << printReg(DstReg, TRI) << " at " << CopyRegIdx << "\n");
+
+ LIS->removePhysRegDefAt(DstReg.asMCReg(), CopyRegIdx);
+ deleteInstr(CopyMI);
+
+ // Create a new dead def at the new def location.
+ for (MCRegUnit Unit : TRI->regunits(DstReg)) {
+ LiveRange &LR = LIS->getRegUnit(Unit);
+ LR.createDeadDef(DestRegIdx, LIS->getVNInfoAllocator());
+ }
+ }
+
+ // We don't track kills for reserved registers.
+ MRI->clearKillFlags(CP.getSrcReg());
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Interference checking and interval joining
+//===----------------------------------------------------------------------===//
+//
+// In the easiest case, the two live ranges being joined are disjoint, and
+// there is no interference to consider. It is quite common, though, to have
+// overlapping live ranges, and we need to check if the interference can be
+// resolved.
+//
+// The live range of a single SSA value forms a sub-tree of the dominator tree.
+// This means that two SSA values overlap if and only if the def of one value
+// is contained in the live range of the other value. As a special case, the
+// overlapping values can be defined at the same index.
+//
+// The interference from an overlapping def can be resolved in these cases:
+//
+// 1. Coalescable copies. The value is defined by a copy that would become an
+// identity copy after joining SrcReg and DstReg. The copy instruction will
+// be removed, and the value will be merged with the source value.
+//
+// There can be several copies back and forth, causing many values to be
+// merged into one. We compute a list of ultimate values in the joined live
+ // range as well as a mapping from the old value numbers.
+//
+// 2. IMPLICIT_DEF. This instruction is only inserted to ensure all PHI
+// predecessors have a live out value. It doesn't cause real interference,
+// and can be merged into the value it overlaps. Like a coalescable copy, it
+// can be erased after joining.
+//
+// 3. Copy of external value. The overlapping def may be a copy of a value that
+// is already in the other register. This is like a coalescable copy, but
+// the live range of the source register must be trimmed after erasing the
+// copy instruction:
+//
+// %src = COPY %ext
+// %dst = COPY %ext <-- Remove this COPY, trim the live range of %ext.
+//
+// 4. Clobbering undefined lanes. Vector registers are sometimes built by
+// defining one lane at a time:
+//
+// %dst:ssub0<def,read-undef> = FOO
+// %src = BAR
+// %dst:ssub1 = COPY %src
+//
+// The live range of %src overlaps the %dst value defined by FOO, but
+// merging %src into %dst:ssub1 is only going to clobber the ssub1 lane
+// which was undef anyway.
+//
+// The value mapping is more complicated in this case. The final live range
+// will have different value numbers for both FOO and BAR, but there is no
+// simple mapping from old to new values. It may even be necessary to add
+// new PHI values.
+//
+// 5. Clobbering dead lanes. A def may clobber a lane of a vector register that
+// is live, but never read. This can happen because we don't compute
+// individual live ranges per lane.
+//
+// %dst = FOO
+// %src = BAR
+// %dst:ssub1 = COPY %src
+//
+// This kind of interference is only resolved locally. If the clobbered
+// lane value escapes the block, the join is aborted.
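+//
+// The JoinVals machinery below implements this analysis. As a rough sketch of
+// the flow driven by the joining code: mapValues() classifies every value
+// number into one of the ConflictResolution kinds, resolveConflicts() tries to
+// downgrade CR_Unresolved conflicts by scanning the block for reads of the
+// clobbered lanes, pruneValues() trims the live ranges that lose, and
+// eraseInstrs() finally deletes the instructions made redundant by the join.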
+
+namespace {
+
+/// Track information about values in a single virtual register about to be
+/// joined. Objects of this class are always created in pairs - one for each
+/// side of the CoalescerPair (or one for each lane of a side of the coalescer
+/// pair).
+class JoinVals {
+ /// Live range we work on.
+ LiveRange &LR;
+
+ /// (Main) register we work on.
+ const Register Reg;
+
+ /// Reg (and therefore the values in this liverange) will end up as
+ /// subregister SubIdx in the coalesced register. Either CP.DstIdx or
+ /// CP.SrcIdx.
+ const unsigned SubIdx;
+
+ /// The LaneMask that this liverange will occupy in the coalesced register.
+ /// May be smaller than the lanemask produced by SubIdx when merging
+ /// subranges.
+ const LaneBitmask LaneMask;
+
+ /// This is true when joining sub register ranges, false when joining main
+ /// ranges.
+ const bool SubRangeJoin;
+
+ /// Whether the current LiveInterval tracks subregister liveness.
+ const bool TrackSubRegLiveness;
+
+ /// Values that will be present in the final live range.
+ SmallVectorImpl<VNInfo*> &NewVNInfo;
+
+ const CoalescerPair &CP;
+ LiveIntervals *LIS;
+ SlotIndexes *Indexes;
+ const TargetRegisterInfo *TRI;
+
+ /// Value number assignments. Maps value numbers in LI to entries in
+ /// NewVNInfo. This is suitable for passing to LiveInterval::join().
+ SmallVector<int, 8> Assignments;
+
+ public:
+ /// Conflict resolution for overlapping values.
+ enum ConflictResolution {
+ /// No overlap, simply keep this value.
+ CR_Keep,
+
+ /// Merge this value into OtherVNI and erase the defining instruction.
+ /// Used for IMPLICIT_DEF, coalescable copies, and copies from external
+ /// values.
+ CR_Erase,
+
+ /// Merge this value into OtherVNI but keep the defining instruction.
+ /// This is for the special case where OtherVNI is defined by the same
+ /// instruction.
+ CR_Merge,
+
+ /// Keep this value, and have it replace OtherVNI where possible. This
+ /// complicates value mapping since OtherVNI maps to two different values
+ /// before and after this def.
+ /// Used when clobbering undefined or dead lanes.
+ CR_Replace,
+
+ /// Unresolved conflict. Visit later when all values have been mapped.
+ CR_Unresolved,
+
+ /// Unresolvable conflict. Abort the join.
+ CR_Impossible
+ };
+
+ private:
+ /// Per-value info for LI. The lane bit masks are all relative to the final
+ /// joined register, so they can be compared directly between SrcReg and
+ /// DstReg.
+ struct Val {
+ ConflictResolution Resolution = CR_Keep;
+
+ /// Lanes written by this def, 0 for unanalyzed values.
+ LaneBitmask WriteLanes;
+
+ /// Lanes with defined values in this register. Other lanes are undef and
+ /// safe to clobber.
+ LaneBitmask ValidLanes;
+
+ /// Value in LI being redefined by this def.
+ VNInfo *RedefVNI = nullptr;
+
+ /// Value in the other live range that overlaps this def, if any.
+ VNInfo *OtherVNI = nullptr;
+
+ /// Is this value an IMPLICIT_DEF that can be erased?
+ ///
+ /// IMPLICIT_DEF values should only exist at the end of a basic block that
+ /// is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
+ /// safely erased if they are overlapping a live value in the other live
+ /// interval.
+ ///
+ /// Weird control flow graphs and incomplete PHI handling in
+ /// ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
+ /// longer live ranges. Such IMPLICIT_DEF values should be treated like
+ /// normal values.
+ bool ErasableImplicitDef = false;
+
+ /// True when the live range of this value will be pruned because of an
+ /// overlapping CR_Replace value in the other live range.
+ bool Pruned = false;
+
+ /// True once Pruned above has been computed.
+ bool PrunedComputed = false;
+
+ /// True if this value is determined to be identical to OtherVNI
+ /// (in valuesIdentical). This is used with CR_Erase where the erased
+ /// copy is redundant, i.e. the source value is already the same as
+ /// the destination. In such cases the subranges need to be updated
+ /// properly. See comment at pruneSubRegValues for more info.
+ bool Identical = false;
+
+ Val() = default;
+
+ bool isAnalyzed() const { return WriteLanes.any(); }
+ };
+
+ /// One entry per value number in LI.
+ SmallVector<Val, 8> Vals;
+
+ /// Compute the bitmask of lanes actually written by DefMI.
+ /// Set Redef if there are any partial register definitions that depend on the
+ /// previous value of the register.
+ LaneBitmask computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const;
+
+ /// Find the ultimate value that VNI was copied from.
+ std::pair<const VNInfo *, Register> followCopyChain(const VNInfo *VNI) const;
+
+ bool valuesIdentical(VNInfo *Value0, VNInfo *Value1, const JoinVals &Other) const;
+
+ /// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
+ /// Return a conflict resolution when possible, but leave the hard cases as
+ /// CR_Unresolved.
+ /// Recursively calls computeAssignment() on this and Other, guaranteeing that
+ /// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
+ /// The recursion always goes upwards in the dominator tree, making loops
+ /// impossible.
+ ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
+
+ /// Compute the value assignment for ValNo in LR.
+ /// This may be called recursively by analyzeValue(), but never for a ValNo on
+ /// the stack.
+ void computeAssignment(unsigned ValNo, JoinVals &Other);
+
+ /// Assuming ValNo is going to clobber some valid lanes in Other.LR, compute
+ /// the extent of the tainted lanes in the block.
+ ///
+ /// Multiple values in Other.LR can be affected since partial redefinitions
+ /// can preserve previously tainted lanes.
+ ///
+ /// 1 %dst = VLOAD <-- Define all lanes in %dst
+ /// 2 %src = FOO <-- ValNo to be joined with %dst:ssub0
+ /// 3 %dst:ssub1 = BAR <-- Partial redef doesn't clear taint in ssub0
+ /// 4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
+ ///
+ /// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
+ /// entry to TaintedVals.
+ ///
+ /// Returns false if the tainted lanes extend beyond the basic block.
+ bool
+ taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other,
+ SmallVectorImpl<std::pair<SlotIndex, LaneBitmask>> &TaintExtent);
+
+ /// Return true if MI uses any of the given Lanes from Reg.
+ /// This does not include partial redefinitions of Reg.
+ bool usesLanes(const MachineInstr &MI, Register, unsigned, LaneBitmask) const;
+
+ /// Determine if ValNo is a copy of a value number in LR or Other.LR that will
+ /// be pruned:
+ ///
+ /// %dst = COPY %src
+ /// %src = COPY %dst <-- This value to be pruned.
+ /// %dst = COPY %src <-- This value is a copy of a pruned value.
+ bool isPrunedValue(unsigned ValNo, JoinVals &Other);
+
+public:
+ JoinVals(LiveRange &LR, Register Reg, unsigned SubIdx, LaneBitmask LaneMask,
+ SmallVectorImpl<VNInfo *> &newVNInfo, const CoalescerPair &cp,
+ LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin,
+ bool TrackSubRegLiveness)
+ : LR(LR), Reg(Reg), SubIdx(SubIdx), LaneMask(LaneMask),
+ SubRangeJoin(SubRangeJoin), TrackSubRegLiveness(TrackSubRegLiveness),
+ NewVNInfo(newVNInfo), CP(cp), LIS(lis), Indexes(LIS->getSlotIndexes()),
+ TRI(TRI), Assignments(LR.getNumValNums(), -1),
+ Vals(LR.getNumValNums()) {}
+
+ /// Analyze defs in LR and compute a value mapping in NewVNInfo.
+ /// Returns false if any conflicts were impossible to resolve.
+ bool mapValues(JoinVals &Other);
+
+ /// Try to resolve conflicts that require all values to be mapped.
+ /// Returns false if any conflicts were impossible to resolve.
+ bool resolveConflicts(JoinVals &Other);
+
+ /// Prune the live range of values in Other.LR where they would conflict with
+ /// CR_Replace values in LR. Collect end points for restoring the live range
+ /// after joining.
+ void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints,
+ bool changeInstrs);
+
+ /// Removes subranges starting at copies that get removed. This sometimes
+ /// happens when undefined subranges are copied around. These ranges contain
+ /// no useful information and can be removed.
+ void pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask);
+
+ /// Pruning values in subranges can lead to removing segments in these
+ /// subranges started by IMPLICIT_DEFs. The corresponding segments in
+ /// the main range also need to be removed. This function will mark
+ /// the corresponding values in the main range as pruned, so that
+ /// eraseInstrs can do the final cleanup.
+ /// The parameter @p LI must be the interval whose main range is the
+ /// live range LR.
+ void pruneMainSegments(LiveInterval &LI, bool &ShrinkMainRange);
+
+ /// Erase any machine instructions that have been coalesced away.
+ /// Add erased instructions to ErasedInstrs.
+ /// Add foreign virtual registers to ShrinkRegs if their live range ended at
+ /// the erased instrs.
+ void eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
+ SmallVectorImpl<Register> &ShrinkRegs,
+ LiveInterval *LI = nullptr);
+
+ /// Remove liverange defs at places where implicit defs will be removed.
+ void removeImplicitDefs();
+
+ /// Get the value assignments suitable for passing to LiveInterval::join.
+ const int *getAssignments() const { return Assignments.data(); }
+
+ /// Get the conflict resolution for a value number.
+ ConflictResolution getResolution(unsigned Num) const {
+ return Vals[Num].Resolution;
+ }
+};
+
+} // end anonymous namespace
+
+LaneBitmask JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef)
+ const {
+ LaneBitmask L;
+ for (const MachineOperand &MO : DefMI->all_defs()) {
+ if (MO.getReg() != Reg)
+ continue;
+ L |= TRI->getSubRegIndexLaneMask(
+ TRI->composeSubRegIndices(SubIdx, MO.getSubReg()));
+ if (MO.readsReg())
+ Redef = true;
+ }
+ return L;
+}
+
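+// Walk backwards through full copies of virtual registers to find the value
+// VNI ultimately originates from. Illustrative chain (arbitrary names):
+//
+//   %0 = FOO
+//   %1 = COPY %0
+//   %2 = COPY %1   ; followCopyChain on %2's value yields FOO's value in %0
+//
+// The walk stops at PHI defs, partial copies, physreg sources, and undefined
+// inputs.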
+std::pair<const VNInfo *, Register>
+JoinVals::followCopyChain(const VNInfo *VNI) const {
+ Register TrackReg = Reg;
+
+ while (!VNI->isPHIDef()) {
+ SlotIndex Def = VNI->def;
+ MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
+ assert(MI && "No defining instruction");
+ if (!MI->isFullCopy())
+ return std::make_pair(VNI, TrackReg);
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (!SrcReg.isVirtual())
+ return std::make_pair(VNI, TrackReg);
+
+ const LiveInterval &LI = LIS->getInterval(SrcReg);
+ const VNInfo *ValueIn;
+ // No subrange involved.
+ if (!SubRangeJoin || !LI.hasSubRanges()) {
+ LiveQueryResult LRQ = LI.Query(Def);
+ ValueIn = LRQ.valueIn();
+ } else {
+ // Query subranges. Ensure that all matching ones take us to the same def
+ // (allowing some of them to be undef).
+ ValueIn = nullptr;
+ for (const LiveInterval::SubRange &S : LI.subranges()) {
+ // Transform lanemask to a mask in the joined live interval.
+ LaneBitmask SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask);
+ if ((SMask & LaneMask).none())
+ continue;
+ LiveQueryResult LRQ = S.Query(Def);
+ if (!ValueIn) {
+ ValueIn = LRQ.valueIn();
+ continue;
+ }
+ if (LRQ.valueIn() && ValueIn != LRQ.valueIn())
+ return std::make_pair(VNI, TrackReg);
+ }
+ }
+ if (ValueIn == nullptr) {
+ // Reaching an undefined value is legitimate, for example:
+ //
+ // 1 undef %0.sub1 = ... ;; %0.sub0 == undef
+ // 2 %1 = COPY %0 ;; %1 is defined here.
+ // 3 %0 = COPY %1 ;; Now %0.sub0 has a definition,
+ // ;; but it's equivalent to "undef".
+ return std::make_pair(nullptr, SrcReg);
+ }
+ VNI = ValueIn;
+ TrackReg = SrcReg;
+ }
+ return std::make_pair(VNI, TrackReg);
+}
+
+bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1,
+ const JoinVals &Other) const {
+ const VNInfo *Orig0;
+ Register Reg0;
+ std::tie(Orig0, Reg0) = followCopyChain(Value0);
+ if (Orig0 == Value1 && Reg0 == Other.Reg)
+ return true;
+
+ const VNInfo *Orig1;
+ Register Reg1;
+ std::tie(Orig1, Reg1) = Other.followCopyChain(Value1);
+ // If both values are undefined, and the source registers are the same
+ // register, the values are identical. Filter out cases where only one
+ // value is defined.
+ if (Orig0 == nullptr || Orig1 == nullptr)
+ return Orig0 == Orig1 && Reg0 == Reg1;
+
+ // The values are equal if they are defined at the same place and use the
+ // same register. Note that we cannot compare VNInfos directly as some of
+ // them might be from a copy created in mergeSubRangeInto() while the other
+ // is from the original LiveInterval.
+ return Orig0->def == Orig1->def && Reg0 == Reg1;
+}
+
+JoinVals::ConflictResolution
+JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ assert(!V.isAnalyzed() && "Value has already been analyzed!");
+ VNInfo *VNI = LR.getValNumInfo(ValNo);
+ if (VNI->isUnused()) {
+ V.WriteLanes = LaneBitmask::getAll();
+ return CR_Keep;
+ }
+
+ // Get the instruction defining this value, compute the lanes written.
+ const MachineInstr *DefMI = nullptr;
+ if (VNI->isPHIDef()) {
+ // Conservatively assume that all lanes in a PHI are valid.
+ LaneBitmask Lanes = SubRangeJoin ? LaneBitmask::getLane(0)
+ : TRI->getSubRegIndexLaneMask(SubIdx);
+ V.ValidLanes = V.WriteLanes = Lanes;
+ } else {
+ DefMI = Indexes->getInstructionFromIndex(VNI->def);
+ assert(DefMI != nullptr);
+ if (SubRangeJoin) {
+ // We don't care about the lanes when joining subregister ranges.
+ V.WriteLanes = V.ValidLanes = LaneBitmask::getLane(0);
+ if (DefMI->isImplicitDef()) {
+ V.ValidLanes = LaneBitmask::getNone();
+ V.ErasableImplicitDef = true;
+ }
+ } else {
+ bool Redef = false;
+ V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
+
+ // If this is a read-modify-write instruction, there may be more valid
+ // lanes than the ones written by this instruction.
+ // This only covers partial redef operands. DefMI may have normal use
+ // operands reading the register. They don't contribute valid lanes.
+ //
+ // This adds ssub1 to the set of valid lanes in %src:
+ //
+ // %src:ssub1 = FOO
+ //
+ // This leaves only ssub1 valid, making any other lanes undef:
+ //
+ // %src:ssub1<def,read-undef> = FOO %src:ssub2
+ //
+ // The <read-undef> flag on the def operand means that old lane values are
+ // not important.
+ if (Redef) {
+ V.RedefVNI = LR.Query(VNI->def).valueIn();
+ assert((TrackSubRegLiveness || V.RedefVNI) &&
+ "Instruction is reading nonexistent value");
+ if (V.RedefVNI != nullptr) {
+ computeAssignment(V.RedefVNI->id, Other);
+ V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
+ }
+ }
+
+ // An IMPLICIT_DEF writes undef values.
+ if (DefMI->isImplicitDef()) {
+ // We normally expect IMPLICIT_DEF values to be live only until the end
+ // of their block. If the value is really live longer and gets pruned in
+ // another block, this flag is cleared again.
+ //
+ // Clearing the valid lanes is deferred until it is sure this can be
+ // erased.
+ V.ErasableImplicitDef = true;
+ }
+ }
+ }
+
+ // Find the value in Other that overlaps VNI->def, if any.
+ LiveQueryResult OtherLRQ = Other.LR.Query(VNI->def);
+
+ // It is possible that both values are defined by the same instruction, or
+ // the values are PHIs defined in the same block. When that happens, the two
+ // values should be merged into one, but not into any preceding value.
+ // The first value defined or visited gets CR_Keep, the other gets CR_Merge.
+ if (VNInfo *OtherVNI = OtherLRQ.valueDefined()) {
+ assert(SlotIndex::isSameInstr(VNI->def, OtherVNI->def) && "Broken LRQ");
+
+ // One value stays, the other is merged. Keep the earlier one, or the first
+ // one we see.
+ if (OtherVNI->def < VNI->def)
+ Other.computeAssignment(OtherVNI->id, *this);
+ else if (VNI->def < OtherVNI->def && OtherLRQ.valueIn()) {
+ // This is an early-clobber def overlapping a live-in value in the other
+ // register. Not mergeable.
+ V.OtherVNI = OtherLRQ.valueIn();
+ return CR_Impossible;
+ }
+ V.OtherVNI = OtherVNI;
+ Val &OtherV = Other.Vals[OtherVNI->id];
+ // Keep this value, check for conflicts when analyzing OtherVNI. Avoid
+ // revisiting OtherVNI->id in JoinVals::computeAssignment() below before it
+ // is assigned.
+ if (!OtherV.isAnalyzed() || Other.Assignments[OtherVNI->id] == -1)
+ return CR_Keep;
+ // Both sides have been analyzed now.
+ // Allow overlapping PHI values. Any real interference would show up in a
+ // predecessor, the PHI itself can't introduce any conflicts.
+ if (VNI->isPHIDef())
+ return CR_Merge;
+ if ((V.ValidLanes & OtherV.ValidLanes).any())
+ // Overlapping lanes can't be resolved.
+ return CR_Impossible;
+ else
+ return CR_Merge;
+ }
+
+ // No simultaneous def. Is Other live at the def?
+ V.OtherVNI = OtherLRQ.valueIn();
+ if (!V.OtherVNI)
+ // No overlap, no conflict.
+ return CR_Keep;
+
+ assert(!SlotIndex::isSameInstr(VNI->def, V.OtherVNI->def) && "Broken LRQ");
+
+ // We have overlapping values, or possibly a kill of Other.
+ // Recursively compute assignments up the dominator tree.
+ Other.computeAssignment(V.OtherVNI->id, *this);
+ Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+ if (OtherV.ErasableImplicitDef) {
+ // Check if OtherV is an IMPLICIT_DEF that extends beyond its basic block.
+ // This shouldn't normally happen, but ProcessImplicitDefs can leave such
+ // IMPLICIT_DEF instructions behind, and there is nothing wrong with it
+ // technically.
+ //
+ // When it happens, treat that IMPLICIT_DEF as a normal value, and don't try
+ // to erase the IMPLICIT_DEF instruction.
+ MachineBasicBlock *OtherMBB = Indexes->getMBBFromIndex(V.OtherVNI->def);
+ if (DefMI && DefMI->getParent() != OtherMBB) {
+ LLVM_DEBUG(dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def
+ << " extends into "
+ << printMBBReference(*DefMI->getParent())
+ << ", keeping it.\n");
+ OtherV.ErasableImplicitDef = false;
+ } else if (OtherMBB->hasEHPadSuccessor()) {
+ // If OtherV is defined in a basic block that has EH pad successors then
+ // we get the same problem not just if OtherV is live beyond its basic
+ // block, but beyond the last call instruction in its basic block. Handle
+ // this case conservatively.
+ LLVM_DEBUG(
+ dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def
+ << " may be live into EH pad successors, keeping it.\n");
+ OtherV.ErasableImplicitDef = false;
+ } else {
+ // We deferred clearing these lanes in case we needed to save them
+ OtherV.ValidLanes &= ~OtherV.WriteLanes;
+ }
+ }
+
+ // Allow overlapping PHI values. Any real interference would show up in a
+ // predecessor, the PHI itself can't introduce any conflicts.
+ if (VNI->isPHIDef())
+ return CR_Replace;
+
+ // Check for simple erasable conflicts.
+ if (DefMI->isImplicitDef())
+ return CR_Erase;
+
+ // Include the non-conflict where DefMI is a coalescable copy that kills
+ // OtherVNI. We still want the copy erased and value numbers merged.
+ if (CP.isCoalescable(DefMI)) {
+ // Some of the lanes copied from OtherVNI may be undef, making them undef
+ // here too.
+ V.ValidLanes &= ~V.WriteLanes | OtherV.ValidLanes;
+ return CR_Erase;
+ }
+
+ // This may not be a real conflict if DefMI simply kills Other and defines
+ // VNI.
+ if (OtherLRQ.isKill() && OtherLRQ.endPoint() <= VNI->def)
+ return CR_Keep;
+
+ // Handle the case where VNI and OtherVNI can be proven to be identical:
+ //
+ // %other = COPY %ext
+ // %this = COPY %ext <-- Erase this copy
+ //
+ if (DefMI->isFullCopy() && !CP.isPartial() &&
+ valuesIdentical(VNI, V.OtherVNI, Other)) {
+ V.Identical = true;
+ return CR_Erase;
+ }
+
+ // The remaining checks apply to the lanes, which aren't tracked here when
+ // joining subregister ranges. That case was already decided to be OK, so
+ // resolve it as CR_Replace.
+ if (SubRangeJoin)
+ return CR_Replace;
+
+ // If the lanes written by this instruction were all undef in OtherVNI, it is
+ // still safe to join the live ranges. This can't be done with a simple value
+ // mapping, though - OtherVNI will map to multiple values:
+ //
+ // 1 %dst:ssub0 = FOO <-- OtherVNI
+ // 2 %src = BAR <-- VNI
+ // 3 %dst:ssub1 = COPY killed %src <-- Eliminate this copy.
+ // 4 BAZ killed %dst
+ // 5 QUUX killed %src
+ //
+ // Here OtherVNI will map to itself in [1;2), but to VNI in [2;5). CR_Replace
+ // handles this complex value mapping.
+ if ((V.WriteLanes & OtherV.ValidLanes).none())
+ return CR_Replace;
+
+ // If the other live range is killed by DefMI and the live ranges are still
+ // overlapping, it must be because we're looking at an early clobber def:
+ //
+ // %dst<def,early-clobber> = ASM killed %src
+ //
+ // In this case, it is illegal to merge the two live ranges since the early
+ // clobber def would clobber %src before it was read.
+ if (OtherLRQ.isKill()) {
+ // This case where the def doesn't overlap the kill is handled above.
+ assert(VNI->def.isEarlyClobber() &&
+ "Only early clobber defs can overlap a kill");
+ return CR_Impossible;
+ }
+
+ // VNI is clobbering live lanes in OtherVNI, but there is still the
+ // possibility that no instructions actually read the clobbered lanes.
+ // If we're clobbering all the lanes in OtherVNI, at least one must be read.
+ // Otherwise Other.LR wouldn't be live here.
+ if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes).none())
+ return CR_Impossible;
+
+ if (TrackSubRegLiveness) {
+ auto &OtherLI = LIS->getInterval(Other.Reg);
+ // If OtherVNI does not have subranges, it means all the lanes of OtherVNI
+ // share the same live range, so we just need to check whether they have
+ // any conflict bit in their LaneMask.
+ if (!OtherLI.hasSubRanges()) {
+ LaneBitmask OtherMask = TRI->getSubRegIndexLaneMask(Other.SubIdx);
+ return (OtherMask & V.WriteLanes).none() ? CR_Replace : CR_Impossible;
+ }
+
+ // If we are clobbering some active lanes of OtherVNI at VNI->def, it is
+ // impossible to resolve the conflict. Otherwise, we can just replace
+ // OtherVNI because of no real conflict.
+ for (LiveInterval::SubRange &OtherSR : OtherLI.subranges()) {
+ LaneBitmask OtherMask =
+ TRI->composeSubRegIndexLaneMask(Other.SubIdx, OtherSR.LaneMask);
+ if ((OtherMask & V.WriteLanes).none())
+ continue;
+
+ auto OtherSRQ = OtherSR.Query(VNI->def);
+ if (OtherSRQ.valueIn() && OtherSRQ.endPoint() > VNI->def) {
+ // VNI is clobbering some lanes of OtherVNI, they have real conflict.
+ return CR_Impossible;
+ }
+ }
+
+ // VNI is NOT clobbering any lane of OtherVNI, just replace OtherVNI.
+ return CR_Replace;
+ }
+
+ // We need to verify that no instructions are reading the clobbered lanes.
+ // To save compile time, we'll only check that locally. Don't allow the
+ // tainted value to escape the basic block.
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ if (OtherLRQ.endPoint() >= Indexes->getMBBEndIdx(MBB))
+ return CR_Impossible;
+
+ // There are still some things that could go wrong besides clobbered lanes
+ // being read, for example OtherVNI may be only partially redefined in MBB,
+ // and some clobbered lanes could escape the block. Save this analysis for
+ // resolveConflicts() when all values have been mapped. We need to know
+ // RedefVNI and WriteLanes for any later defs in MBB, and we can't compute
+ // that now - the recursive analyzeValue() calls must go upwards in the
+ // dominator tree.
+ return CR_Unresolved;
+}
+
+void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ if (V.isAnalyzed()) {
+ // Recursion should always move up the dominator tree, so ValNo is not
+ // supposed to reappear before it has been assigned.
+ assert(Assignments[ValNo] != -1 && "Bad recursion?");
+ return;
+ }
+ switch ((V.Resolution = analyzeValue(ValNo, Other))) {
+ case CR_Erase:
+ case CR_Merge:
+ // Merge this ValNo into OtherVNI.
+ assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
+ assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
+ Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
+ LLVM_DEBUG(dbgs() << "\t\tmerge " << printReg(Reg) << ':' << ValNo << '@'
+ << LR.getValNumInfo(ValNo)->def << " into "
+ << printReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
+ << V.OtherVNI->def << " --> @"
+ << NewVNInfo[Assignments[ValNo]]->def << '\n');
+ break;
+ case CR_Replace:
+ case CR_Unresolved: {
+ // The other value is going to be pruned if this join is successful.
+ assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
+ Val &OtherV = Other.Vals[V.OtherVNI->id];
+ // We cannot erase an IMPLICIT_DEF if we don't have valid values for all
+ // its lanes.
+ if (OtherV.ErasableImplicitDef &&
+ TrackSubRegLiveness &&
+ (OtherV.ValidLanes & ~V.ValidLanes).any()) {
+ LLVM_DEBUG(dbgs() << "Cannot erase implicit_def with missing values\n");
+
+ OtherV.ErasableImplicitDef = false;
+ // The valid lanes written by the implicit_def were speculatively cleared
+ // before, so make this more conservative. It may be better to track this,
+ // but I haven't found a testcase where it matters.
+ OtherV.ValidLanes = LaneBitmask::getAll();
+ }
+
+ OtherV.Pruned = true;
+ [[fallthrough]];
+ }
+ default:
+ // This value number needs to go in the final joined live range.
+ Assignments[ValNo] = NewVNInfo.size();
+ NewVNInfo.push_back(LR.getValNumInfo(ValNo));
+ break;
+ }
+}
+
+bool JoinVals::mapValues(JoinVals &Other) {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ computeAssignment(i, Other);
+ if (Vals[i].Resolution == CR_Impossible) {
+ LLVM_DEBUG(dbgs() << "\t\tinterference at " << printReg(Reg) << ':' << i
+ << '@' << LR.getValNumInfo(i)->def << '\n');
+ return false;
+ }
+ }
+ return true;
+}
+
+bool JoinVals::
+taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other,
+ SmallVectorImpl<std::pair<SlotIndex, LaneBitmask>> &TaintExtent) {
+ VNInfo *VNI = LR.getValNumInfo(ValNo);
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);
+
+ // Scan Other.LR from VNI.def to MBBEnd.
+ LiveInterval::iterator OtherI = Other.LR.find(VNI->def);
+ assert(OtherI != Other.LR.end() && "No conflict?");
+ do {
+ // OtherI is pointing to a tainted value. Abort the join if the tainted
+ // lanes escape the block.
+ SlotIndex End = OtherI->end;
+ if (End >= MBBEnd) {
+ LLVM_DEBUG(dbgs() << "\t\ttaints global " << printReg(Other.Reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start << '\n');
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "\t\ttaints local " << printReg(Other.Reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start << " to "
+ << End << '\n');
+ // A dead def is not a problem.
+ if (End.isDead())
+ break;
+ TaintExtent.push_back(std::make_pair(End, TaintedLanes));
+
+ // Check for another def in the MBB.
+ if (++OtherI == Other.LR.end() || OtherI->start >= MBBEnd)
+ break;
+
+ // Lanes written by the new def are no longer tainted.
+ const Val &OV = Other.Vals[OtherI->valno->id];
+ TaintedLanes &= ~OV.WriteLanes;
+ if (!OV.RedefVNI)
+ break;
+ } while (TaintedLanes.any());
+ return true;
+}
+
+bool JoinVals::usesLanes(const MachineInstr &MI, Register Reg, unsigned SubIdx,
+ LaneBitmask Lanes) const {
+ if (MI.isDebugOrPseudoInstr())
+ return false;
+ for (const MachineOperand &MO : MI.all_uses()) {
+ if (MO.getReg() != Reg)
+ continue;
+ if (!MO.readsReg())
+ continue;
+ unsigned S = TRI->composeSubRegIndices(SubIdx, MO.getSubReg());
+ if ((Lanes & TRI->getSubRegIndexLaneMask(S)).any())
+ return true;
+ }
+ return false;
+}
+
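+// Try to downgrade the remaining CR_Unresolved conflicts. A conflict is
+// resolvable when the lanes this value clobbers are never read before they are
+// rewritten (or the tainted range ends) within the block; such values become
+// CR_Replace. Otherwise the join fails. Sub-range joins simply give up here.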
+bool JoinVals::resolveConflicts(JoinVals &Other) {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ Val &V = Vals[i];
+ assert(V.Resolution != CR_Impossible && "Unresolvable conflict");
+ if (V.Resolution != CR_Unresolved)
+ continue;
+ LLVM_DEBUG(dbgs() << "\t\tconflict at " << printReg(Reg) << ':' << i << '@'
+ << LR.getValNumInfo(i)->def
+ << ' ' << PrintLaneMask(LaneMask) << '\n');
+ if (SubRangeJoin)
+ return false;
+
+ ++NumLaneConflicts;
+ assert(V.OtherVNI && "Inconsistent conflict resolution.");
+ VNInfo *VNI = LR.getValNumInfo(i);
+ const Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+ // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the
+ // join, those lanes will be tainted with a wrong value. Get the extent of
+ // the tainted lanes.
+ LaneBitmask TaintedLanes = V.WriteLanes & OtherV.ValidLanes;
+ SmallVector<std::pair<SlotIndex, LaneBitmask>, 8> TaintExtent;
+ if (!taintExtent(i, TaintedLanes, Other, TaintExtent))
+ // Tainted lanes would extend beyond the basic block.
+ return false;
+
+ assert(!TaintExtent.empty() && "There should be at least one conflict.");
+
+ // Now look at the instructions from VNI->def to TaintExtent (inclusive).
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ MachineBasicBlock::iterator MI = MBB->begin();
+ if (!VNI->isPHIDef()) {
+ MI = Indexes->getInstructionFromIndex(VNI->def);
+ if (!VNI->def.isEarlyClobber()) {
+ // No need to check the instruction defining VNI for reads.
+ ++MI;
+ }
+ }
+ assert(!SlotIndex::isSameInstr(VNI->def, TaintExtent.front().first) &&
+ "Interference ends on VNI->def. Should have been handled earlier");
+ MachineInstr *LastMI =
+ Indexes->getInstructionFromIndex(TaintExtent.front().first);
+ assert(LastMI && "Range must end at a proper instruction");
+ unsigned TaintNum = 0;
+ while (true) {
+ assert(MI != MBB->end() && "Bad LastMI");
+ if (usesLanes(*MI, Other.Reg, Other.SubIdx, TaintedLanes)) {
+ LLVM_DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
+ return false;
+ }
+ // LastMI is the last instruction to use the current value.
+ if (&*MI == LastMI) {
+ if (++TaintNum == TaintExtent.size())
+ break;
+ LastMI = Indexes->getInstructionFromIndex(TaintExtent[TaintNum].first);
+ assert(LastMI && "Range must end at a proper instruction");
+ TaintedLanes = TaintExtent[TaintNum].second;
+ }
+ ++MI;
+ }
+
+ // The tainted lanes are unused.
+ V.Resolution = CR_Replace;
+ ++NumLaneResolves;
+ }
+ return true;
+}
+
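+// Return true if the value numbered ValNo is ultimately a copy of a value
+// that has been pruned, following CR_Erase/CR_Merge copies up the dominator
+// tree.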
+bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ if (V.Pruned || V.PrunedComputed)
+ return V.Pruned;
+
+ if (V.Resolution != CR_Erase && V.Resolution != CR_Merge)
+ return V.Pruned;
+
+ // Follow copies up the dominator tree and check if any intermediate value
+ // has been pruned.
+ V.PrunedComputed = true;
+ V.Pruned = Other.isPrunedValue(V.OtherVNI->id, *this);
+ return V.Pruned;
+}
+
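+// Remove liveness that would conflict after the join: for CR_Replace values
+// prune the overlapping part of Other.LR, and prune LR itself for values that
+// are copies of pruned values. Removed end points are collected in EndPoints
+// so liveness can be restored after joining. With changeInstrs set, the
+// <def,read-undef> and dead flags on the defining instructions are updated
+// as well.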
+void JoinVals::pruneValues(JoinVals &Other,
+ SmallVectorImpl<SlotIndex> &EndPoints,
+ bool changeInstrs) {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ SlotIndex Def = LR.getValNumInfo(i)->def;
+ switch (Vals[i].Resolution) {
+ case CR_Keep:
+ break;
+ case CR_Replace: {
+ // This value takes precedence over the value in Other.LR.
+ LIS->pruneValue(Other.LR, Def, &EndPoints);
+ // Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF
+ // instructions are only inserted to provide a live-out value for PHI
+ // predecessors, so the instruction should simply go away once its value
+ // has been replaced.
+ Val &OtherV = Other.Vals[Vals[i].OtherVNI->id];
+ bool EraseImpDef = OtherV.ErasableImplicitDef &&
+ OtherV.Resolution == CR_Keep;
+ if (!Def.isBlock()) {
+ if (changeInstrs) {
+ // Remove <def,read-undef> flags. This def is now a partial redef.
+ // Also remove dead flags since the joined live range will
+ // continue past this instruction.
+ for (MachineOperand &MO :
+ Indexes->getInstructionFromIndex(Def)->operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg() == Reg) {
+ if (MO.getSubReg() != 0 && MO.isUndef() && !EraseImpDef)
+ MO.setIsUndef(false);
+ MO.setIsDead(false);
+ }
+ }
+ }
+ // This value will reach instructions below, but we need to make sure
+ // the live range also reaches the instruction at Def.
+ if (!EraseImpDef)
+ EndPoints.push_back(Def);
+ }
+ LLVM_DEBUG(dbgs() << "\t\tpruned " << printReg(Other.Reg) << " at " << Def
+ << ": " << Other.LR << '\n');
+ break;
+ }
+ case CR_Erase:
+ case CR_Merge:
+ if (isPrunedValue(i, Other)) {
+ // This value is ultimately a copy of a pruned value in LR or Other.LR.
+ // We can no longer trust the value mapping computed by
+ // computeAssignment(), the value that was originally copied could have
+ // been replaced.
+ LIS->pruneValue(LR, Def, &EndPoints);
+ LLVM_DEBUG(dbgs() << "\t\tpruned all of " << printReg(Reg) << " at "
+ << Def << ": " << LR << '\n');
+ }
+ break;
+ case CR_Unresolved:
+ case CR_Impossible:
+ llvm_unreachable("Unresolved conflicts");
+ }
+ }
+}
+
+// Check if the segment consists of a copied live-through value, i.e. the copy
+// in the block only extended the liveness of an undef value, which we may need
+// to handle.
+static bool isLiveThrough(const LiveQueryResult Q) {
+ return Q.valueIn() && Q.valueIn()->isPHIDef() && Q.valueIn() == Q.valueOut();
+}
+
+/// Consider the following situation when coalescing the copy between
+/// %31 and %45 at 800. (The vertical lines represent live range segments.)
+///
+/// Main range Subrange 0004 (sub2)
+/// %31 %45 %31 %45
+/// 544 %45 = COPY %28 + +
+/// | v1 | v1
+/// 560B bb.1: + +
+/// 624 = %45.sub2 | v2 | v2
+/// 800 %31 = COPY %45 + + + +
+/// | v0 | v0
+/// 816 %31.sub1 = ... + |
+/// 880 %30 = COPY %31 | v1 +
+/// 928 %45 = COPY %30 | + +
+/// | | v0 | v0 <--+
+/// 992B ; backedge -> bb.1 | + + |
+/// 1040 = %31.sub0 + |
+/// This value must remain
+/// live-out!
+///
+/// Assuming that %31 is coalesced into %45, the copy at 928 becomes
+/// redundant, since it copies the value from %45 back into it. The
+/// conflict resolution for the main range determines that %45.v0 is
+/// to be erased, which is ok since %31.v1 is identical to it.
+/// The problem happens with the subrange for sub2: it has to be live
+/// on exit from the block, but since 928 was actually a point of
+/// definition of %45.sub2, %45.sub2 was not live immediately prior
+/// to that definition. As a result, when 928 was erased, the value v0
+/// for %45.sub2 was pruned in pruneSubRegValues. Consequently, an
+/// IMPLICIT_DEF was inserted as a "backedge" definition for %45.sub2,
+/// providing an incorrect value to the use at 624.
+///
+/// Since the main-range values %31.v1 and %45.v0 were proved to be
+/// identical, the corresponding values in subranges must also be the
+/// same. A redundant copy is removed because it's not needed, and not
+/// because it copied an undefined value, so any liveness that originated
+/// from that copy cannot disappear. When pruning a value that started
+/// at the removed copy, the corresponding identical value must be
+/// extended to replace it.
+void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) {
+ // Look for values being erased.
+ bool DidPrune = false;
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ Val &V = Vals[i];
+ // We should trigger in all cases in which eraseInstrs() does something; the
+ // debug message printed below helps diagnose any mismatch with eraseInstrs().
+ if (V.Resolution != CR_Erase &&
+ (V.Resolution != CR_Keep || !V.ErasableImplicitDef || !V.Pruned))
+ continue;
+
+ // Check subranges at the point where the copy will be removed.
+ SlotIndex Def = LR.getValNumInfo(i)->def;
+ SlotIndex OtherDef;
+ if (V.Identical)
+ OtherDef = V.OtherVNI->def;
+
+ // Print message so mismatches with eraseInstrs() can be diagnosed.
+ LLVM_DEBUG(dbgs() << "\t\tExpecting instruction removal at " << Def
+ << '\n');
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ LiveQueryResult Q = S.Query(Def);
+
+ // If a subrange starts at the copy then an undefined value has been
+ // copied and we must remove that subrange value as well.
+ VNInfo *ValueOut = Q.valueOutOrDead();
+ if (ValueOut != nullptr && (Q.valueIn() == nullptr ||
+ (V.Identical && V.Resolution == CR_Erase &&
+ ValueOut->def == Def))) {
+ LLVM_DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask)
+ << " at " << Def << "\n");
+ SmallVector<SlotIndex,8> EndPoints;
+ LIS->pruneValue(S, Def, &EndPoints);
+ DidPrune = true;
+ // Mark value number as unused.
+ ValueOut->markUnused();
+
+ if (V.Identical && S.Query(OtherDef).valueOutOrDead()) {
+ // If V is identical to V.OtherVNI (and S was live at OtherDef),
+ // then we can't simply prune V from S. V needs to be replaced
+ // with V.OtherVNI.
+ LIS->extendToIndices(S, EndPoints);
+ }
+
+ // We may need to eliminate the subrange if the copy introduced a live
+ // out undef value.
+ if (ValueOut->isPHIDef())
+ ShrinkMask |= S.LaneMask;
+ continue;
+ }
+
+ // If a subrange ends at the copy, then a value was copied but only
+ // partially used later. Shrink the subregister range appropriately.
+ //
+ // Ultimately this calls shrinkToUses, so assuming ShrinkMask is
+ // conservatively correct.
+ if ((Q.valueIn() != nullptr && Q.valueOut() == nullptr) ||
+ (V.Resolution == CR_Erase && isLiveThrough(Q))) {
+ LLVM_DEBUG(dbgs() << "\t\tDead uses at sublane "
+ << PrintLaneMask(S.LaneMask) << " at " << Def
+ << "\n");
+ ShrinkMask |= S.LaneMask;
+ }
+ }
+ }
+ if (DidPrune)
+ LI.removeEmptySubRanges();
+}
+
+/// Check if any of the subranges of @p LI contain a definition at @p Def.
+static bool isDefInSubRange(LiveInterval &LI, SlotIndex Def) {
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ if (VNInfo *VNI = SR.Query(Def).valueOutOrDead())
+ if (VNI->def == Def)
+ return true;
+ }
+ return false;
+}
+
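+// Mark CR_Keep values of the main range as pruned when no subrange has a
+// corresponding definition; such stale main-range defs are left behind by
+// subrange pruning and require the main range to be shrunk afterwards.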
+void JoinVals::pruneMainSegments(LiveInterval &LI, bool &ShrinkMainRange) {
+ assert(&static_cast<LiveRange&>(LI) == &LR);
+
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ if (Vals[i].Resolution != CR_Keep)
+ continue;
+ VNInfo *VNI = LR.getValNumInfo(i);
+ if (VNI->isUnused() || VNI->isPHIDef() || isDefInSubRange(LI, VNI->def))
+ continue;
+ Vals[i].Pruned = true;
+ ShrinkMainRange = true;
+ }
+}
+
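+// Drop value numbers for pruned, erasable IMPLICIT_DEFs that were resolved as
+// CR_Keep; they no longer serve any purpose in this live range.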
+void JoinVals::removeImplicitDefs() {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ Val &V = Vals[i];
+ if (V.Resolution != CR_Keep || !V.ErasableImplicitDef || !V.Pruned)
+ continue;
+
+ VNInfo *VNI = LR.getValNumInfo(i);
+ VNI->markUnused();
+ LR.removeValNo(VNI);
+ }
+}
+
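+// Erase the instructions behind CR_Erase values and remove kept-but-pruned
+// IMPLICIT_DEF value numbers. Source registers of erased copies that are not
+// part of the coalesced pair are queued in ShrinkRegs for later shrinking.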
+void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
+ SmallVectorImpl<Register> &ShrinkRegs,
+ LiveInterval *LI) {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ // Get the def location before markUnused() below invalidates it.
+ VNInfo *VNI = LR.getValNumInfo(i);
+ SlotIndex Def = VNI->def;
+ switch (Vals[i].Resolution) {
+ case CR_Keep: {
+ // If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
+ // longer. The IMPLICIT_DEF instructions are only inserted by
+ // PHIElimination to guarantee that all PHI predecessors have a value.
+ if (!Vals[i].ErasableImplicitDef || !Vals[i].Pruned)
+ break;
+ // Remove value number i from LR.
+ // For intervals with subranges, removing a segment from the main range
+ // may require extending the previous segment: for each definition of
+ // a subregister, there will be a corresponding def in the main range.
+ // That def may fall in the middle of a segment from another subrange.
+ // In such cases, removing this def from the main range must be
+ // complemented by extending the main range to account for the liveness
+ // of the other subrange.
+ // The new end point of the main range segment to be extended.
+ SlotIndex NewEnd;
+ if (LI != nullptr) {
+ LiveRange::iterator I = LR.FindSegmentContaining(Def);
+ assert(I != LR.end());
+ // Do not extend beyond the end of the segment being removed.
+ // The segment may have been pruned in preparation for joining
+ // live ranges.
+ NewEnd = I->end;
+ }
+
+ LR.removeValNo(VNI);
+ // Note that this VNInfo is reused and still referenced in NewVNInfo,
+ // make it appear like an unused value number.
+ VNI->markUnused();
+
+ if (LI != nullptr && LI->hasSubRanges()) {
+ assert(static_cast<LiveRange*>(LI) == &LR);
+ // Determine the end point based on the subrange information:
+ // minimum of (earliest def of next segment,
+ // latest end point of containing segment)
+ SlotIndex ED, LE;
+ for (LiveInterval::SubRange &SR : LI->subranges()) {
+ LiveRange::iterator I = SR.find(Def);
+ if (I == SR.end())
+ continue;
+ if (I->start > Def)
+ ED = ED.isValid() ? std::min(ED, I->start) : I->start;
+ else
+ LE = LE.isValid() ? std::max(LE, I->end) : I->end;
+ }
+ if (LE.isValid())
+ NewEnd = std::min(NewEnd, LE);
+ if (ED.isValid())
+ NewEnd = std::min(NewEnd, ED);
+
+ // We only want to do the extension if there was a subrange that
+ // was live across Def.
+ if (LE.isValid()) {
+ LiveRange::iterator S = LR.find(Def);
+ if (S != LR.begin())
+ std::prev(S)->end = NewEnd;
+ }
+ }
+ LLVM_DEBUG({
+ dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LR << '\n';
+ if (LI != nullptr)
+ dbgs() << "\t\t LHS = " << *LI << '\n';
+ });
+ [[fallthrough]];
+ }
+
+ case CR_Erase: {
+ MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
+ assert(MI && "No instruction to erase");
+ if (MI->isCopy()) {
+ Register Reg = MI->getOperand(1).getReg();
+ if (Reg.isVirtual() && Reg != CP.getSrcReg() && Reg != CP.getDstReg())
+ ShrinkRegs.push_back(Reg);
+ }
+ ErasedInstrs.insert(MI);
+ LLVM_DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
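+// Join the subregister live ranges LRange and RRange for the lanes covered by
+// LaneMask. Conflicts must be resolvable here because joining the main ranges
+// has already succeeded.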
+void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+ LaneBitmask LaneMask,
+ const CoalescerPair &CP) {
+ SmallVector<VNInfo*, 16> NewVNInfo;
+ JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask,
+ NewVNInfo, CP, LIS, TRI, true, true);
+ JoinVals LHSVals(LRange, CP.getDstReg(), CP.getDstIdx(), LaneMask,
+ NewVNInfo, CP, LIS, TRI, true, true);
+
+ // Compute NewVNInfo and resolve conflicts (see also joinVirtRegs())
+ // We should be able to resolve all conflicts here as we could successfully do
+ // so on the main range already. There is, however, a problem when multiple
+ // ranges get mapped to the "overflow" lane mask bit, which creates unexpected
+ // interferences.
+ if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals)) {
+ // We already determined that it is legal to merge the intervals, so this
+ // should never fail.
+ llvm_unreachable("*** Couldn't join subrange!\n");
+ }
+ if (!LHSVals.resolveConflicts(RHSVals) ||
+ !RHSVals.resolveConflicts(LHSVals)) {
+ // We already determined that it is legal to merge the intervals, so this
+ // should never fail.
+ llvm_unreachable("*** Couldn't join subrange!\n");
+ }
+
+ // The merging algorithm in LiveInterval::join() can't handle conflicting
+ // value mappings, so we need to remove any live ranges that overlap a
+ // CR_Replace resolution. Collect a set of end points that can be used to
+ // restore the live range after joining.
+ SmallVector<SlotIndex, 8> EndPoints;
+ LHSVals.pruneValues(RHSVals, EndPoints, false);
+ RHSVals.pruneValues(LHSVals, EndPoints, false);
+
+ LHSVals.removeImplicitDefs();
+ RHSVals.removeImplicitDefs();
+
+ LRange.verify();
+ RRange.verify();
+
+ // Join RRange into LHS.
+ LRange.join(RRange, LHSVals.getAssignments(), RHSVals.getAssignments(),
+ NewVNInfo);
+
+ LLVM_DEBUG(dbgs() << "\t\tjoined lanes: " << PrintLaneMask(LaneMask)
+ << ' ' << LRange << "\n");
+ if (EndPoints.empty())
+ return;
+
+ // Recompute the parts of the live range we had to remove because of
+ // CR_Replace conflicts.
+ LLVM_DEBUG({
+ dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: ";
+ for (unsigned i = 0, n = EndPoints.size(); i != n; ++i) {
+ dbgs() << EndPoints[i];
+ if (i != n-1)
+ dbgs() << ',';
+ }
+ dbgs() << ": " << LRange << '\n';
+ });
+ LIS->extendToIndices(LRange, EndPoints);
+}
+
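+// Merge the live range ToMerge into the subranges of LI covering LaneMask,
+// either seeding an empty subrange with it or joining it into an existing
+// one.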
+void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
+ const LiveRange &ToMerge,
+ LaneBitmask LaneMask,
+ CoalescerPair &CP,
+ unsigned ComposeSubRegIdx) {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ LI.refineSubRanges(
+ Allocator, LaneMask,
+ [this, &Allocator, &ToMerge, &CP](LiveInterval::SubRange &SR) {
+ if (SR.empty()) {
+ SR.assign(ToMerge, Allocator);
+ } else {
+ // joinSubRegRanges() destroys the merged range, so we need a copy.
+ LiveRange RangeCopy(ToMerge, Allocator);
+ joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP);
+ }
+ },
+ *LIS->getSlotIndexes(), *TRI, ComposeSubRegIdx);
+}
+
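+// Consider LI too expensive to join once it has at least
+// LargeIntervalSizeThreshold value numbers and has already been visited
+// LargeIntervalFreqThreshold times; this bounds compile time on pathological
+// inputs.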
+bool RegisterCoalescer::isHighCostLiveInterval(LiveInterval &LI) {
+ if (LI.valnos.size() < LargeIntervalSizeThreshold)
+ return false;
+ auto &Counter = LargeLIVisitCounter[LI.reg()];
+ if (Counter < LargeIntervalFreqThreshold) {
+ Counter++;
+ return false;
+ }
+ return true;
+}
+
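+// Join two virtual registers: map value numbers, resolve conflicts, merge
+// subranges, prune conflicting liveness, erase redundant instructions, and
+// finally join the live ranges and update debug-info tracking.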
+bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
+ SmallVector<VNInfo*, 16> NewVNInfo;
+ LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
+ bool TrackSubRegLiveness = MRI->shouldTrackSubRegLiveness(*CP.getNewRC());
+ JoinVals RHSVals(RHS, CP.getSrcReg(), CP.getSrcIdx(), LaneBitmask::getNone(),
+ NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness);
+ JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), LaneBitmask::getNone(),
+ NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness);
+
+ LLVM_DEBUG(dbgs() << "\t\tRHS = " << RHS << "\n\t\tLHS = " << LHS << '\n');
+
+ if (isHighCostLiveInterval(LHS) || isHighCostLiveInterval(RHS))
+ return false;
+
+ // First compute NewVNInfo and the simple value mappings.
+ // Detect impossible conflicts early.
+ if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
+ return false;
+
+ // Some conflicts can only be resolved after all values have been mapped.
+ if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
+ return false;
+
+ // All clear, the live ranges can be merged.
+ if (RHS.hasSubRanges() || LHS.hasSubRanges()) {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+
+ // Transform lanemasks from the LHS to masks in the coalesced register and
+ // create initial subranges if necessary.
+ unsigned DstIdx = CP.getDstIdx();
+ if (!LHS.hasSubRanges()) {
+ LaneBitmask Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask()
+ : TRI->getSubRegIndexLaneMask(DstIdx);
+ // LHS must support subregs or we wouldn't be in this codepath.
+ assert(Mask.any());
+ LHS.createSubRangeFrom(Allocator, Mask, LHS);
+ } else if (DstIdx != 0) {
+ // Transform LHS lanemasks to new register class if necessary.
+ for (LiveInterval::SubRange &R : LHS.subranges()) {
+ LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask);
+ R.LaneMask = Mask;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\t\tLHST = " << printReg(CP.getDstReg()) << ' ' << LHS
+ << '\n');
+
+ // Determine lanemasks of RHS in the coalesced register and merge subranges.
+ unsigned SrcIdx = CP.getSrcIdx();
+ if (!RHS.hasSubRanges()) {
+ LaneBitmask Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask()
+ : TRI->getSubRegIndexLaneMask(SrcIdx);
+ mergeSubRangeInto(LHS, RHS, Mask, CP, DstIdx);
+ } else {
+ // Pair up subranges and merge.
+ for (LiveInterval::SubRange &R : RHS.subranges()) {
+ LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask);
+ mergeSubRangeInto(LHS, R, Mask, CP, DstIdx);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n");
+
+ // Pruning implicit defs from subranges may result in the main range
+ // having stale segments.
+ LHSVals.pruneMainSegments(LHS, ShrinkMainRange);
+
+ LHSVals.pruneSubRegValues(LHS, ShrinkMask);
+ RHSVals.pruneSubRegValues(LHS, ShrinkMask);
+ }
+
+ // The merging algorithm in LiveInterval::join() can't handle conflicting
+ // value mappings, so we need to remove any live ranges that overlap a
+ // CR_Replace resolution. Collect a set of end points that can be used to
+ // restore the live range after joining.
+ SmallVector<SlotIndex, 8> EndPoints;
+ LHSVals.pruneValues(RHSVals, EndPoints, true);
+ RHSVals.pruneValues(LHSVals, EndPoints, true);
+
+ // Erase COPY and IMPLICIT_DEF instructions. This may cause some external
+ // registers to require trimming.
+ SmallVector<Register, 8> ShrinkRegs;
+ LHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs, &LHS);
+ RHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+ while (!ShrinkRegs.empty())
+ shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
+
+ // Scan and mark undef any DBG_VALUEs that would refer to a different value.
+ checkMergingChangesDbgValues(CP, LHS, LHSVals, RHS, RHSVals);
+
+ // If the RHS covers any PHI locations that were tracked for debug-info, we
+ // must update tracking information to reflect the join.
+ auto RegIt = RegToPHIIdx.find(CP.getSrcReg());
+ if (RegIt != RegToPHIIdx.end()) {
+ // Iterate over all the debug instruction numbers assigned this register.
+ for (unsigned InstID : RegIt->second) {
+ auto PHIIt = PHIValToPos.find(InstID);
+ assert(PHIIt != PHIValToPos.end());
+ const SlotIndex &SI = PHIIt->second.SI;
+
+ // Does the RHS cover the position of this PHI?
+ auto LII = RHS.find(SI);
+ if (LII == RHS.end() || LII->start > SI)
+ continue;
+
+ // Accept two kinds of subregister movement:
+ // * When we merge from one register class into a larger register:
+ // %1:gr16 = some-inst
+ // ->
+ // %2:gr32.sub_16bit = some-inst
+ // * When the PHI is already in a subregister, and the larger class
+ // is coalesced:
+ // %2:gr32.sub_16bit = some-inst
+ // %3:gr32 = COPY %2
+ // ->
+ // %3:gr32.sub_16bit = some-inst
+ // Test for subregister move:
+ if (CP.getSrcIdx() != 0 || CP.getDstIdx() != 0)
+ // If we're moving between different subregisters, ignore this join.
+ // The PHI will not get a location, dropping variable locations.
+ if (PHIIt->second.SubReg && PHIIt->second.SubReg != CP.getSrcIdx())
+ continue;
+
+ // Update our tracking of where the PHI is.
+ PHIIt->second.Reg = CP.getDstReg();
+
+ // If we merge into a sub-register of a larger class (test above),
+ // update SubReg.
+ if (CP.getSrcIdx() != 0)
+ PHIIt->second.SubReg = CP.getSrcIdx();
+ }
+
+ // Rebuild the register index in RegToPHIIdx to account for PHIs tracking
+ // different VRegs now. Copy old collection of debug instruction numbers and
+ // erase the old one:
+ auto InstrNums = RegIt->second;
+ RegToPHIIdx.erase(RegIt);
+
+ // There might already be PHIs being tracked in the destination VReg. Insert
+ // into an existing tracking collection, or insert a new one.
+ RegIt = RegToPHIIdx.find(CP.getDstReg());
+ if (RegIt != RegToPHIIdx.end())
+ RegIt->second.insert(RegIt->second.end(), InstrNums.begin(),
+ InstrNums.end());
+ else
+ RegToPHIIdx.insert({CP.getDstReg(), InstrNums});
+ }
+
+ // Join RHS into LHS.
+ LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo);
+
+ // Kill flags are going to be wrong if the live ranges were overlapping.
+ // Eventually, we should simply clear all kill flags when computing live
+ // ranges. They are reinserted after register allocation.
+ MRI->clearKillFlags(LHS.reg());
+ MRI->clearKillFlags(RHS.reg());
+
+ if (!EndPoints.empty()) {
+ // Recompute the parts of the live range we had to remove because of
+ // CR_Replace conflicts.
+ LLVM_DEBUG({
+ dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: ";
+ for (unsigned i = 0, n = EndPoints.size(); i != n; ++i) {
+ dbgs() << EndPoints[i];
+ if (i != n-1)
+ dbgs() << ',';
+ }
+ dbgs() << ": " << LHS << '\n';
+ });
+ LIS->extendToIndices((LiveRange&)LHS, EndPoints);
+ }
+
+ return true;
+}
+
+bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
+ return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP);
+}
+
+void RegisterCoalescer::buildVRegToDbgValueMap(MachineFunction &MF)
+{
+ const SlotIndexes &Slots = *LIS->getSlotIndexes();
+ SmallVector<MachineInstr *, 8> ToInsert;
+
+ // After collecting a block of DBG_VALUEs into ToInsert, enter them into the
+ // vreg => DbgValueLoc map.
+ auto CloseNewDVRange = [this, &ToInsert](SlotIndex Slot) {
+ for (auto *X : ToInsert) {
+ for (const auto &Op : X->debug_operands()) {
+ if (Op.isReg() && Op.getReg().isVirtual())
+ DbgVRegToValues[Op.getReg()].push_back({Slot, X});
+ }
+ }
+
+ ToInsert.clear();
+ };
+
+ // Iterate over all instructions, collecting them into the ToInsert vector.
+ // Once a non-debug instruction is found, record the slot index of the
+ // collected DBG_VALUEs.
+ for (auto &MBB : MF) {
+ SlotIndex CurrentSlot = Slots.getMBBStartIdx(&MBB);
+
+ for (auto &MI : MBB) {
+ if (MI.isDebugValue()) {
+ if (any_of(MI.debug_operands(), [](const MachineOperand &MO) {
+ return MO.isReg() && MO.getReg().isVirtual();
+ }))
+ ToInsert.push_back(&MI);
+ } else if (!MI.isDebugOrPseudoInstr()) {
+ CurrentSlot = Slots.getInstructionIndex(MI);
+ CloseNewDVRange(CurrentSlot);
+ }
+ }
+
+ // Close range of DBG_VALUEs at the end of blocks.
+ CloseNewDVRange(Slots.getMBBEndIdx(&MBB));
+ }
+
+ // Sort all DBG_VALUEs we've seen by slot number.
+ for (auto &Pair : DbgVRegToValues)
+ llvm::sort(Pair.second);
+}
+
+void RegisterCoalescer::checkMergingChangesDbgValues(CoalescerPair &CP,
+ LiveRange &LHS,
+ JoinVals &LHSVals,
+ LiveRange &RHS,
+ JoinVals &RHSVals) {
+ auto ScanForDstReg = [&](Register Reg) {
+ checkMergingChangesDbgValuesImpl(Reg, RHS, LHS, LHSVals);
+ };
+
+ auto ScanForSrcReg = [&](Register Reg) {
+ checkMergingChangesDbgValuesImpl(Reg, LHS, RHS, RHSVals);
+ };
+
+ // Scan for unsound updates of both the source and destination register.
+ ScanForSrcReg(CP.getSrcReg());
+ ScanForDstReg(CP.getDstReg());
+}
+
+void RegisterCoalescer::checkMergingChangesDbgValuesImpl(Register Reg,
+ LiveRange &OtherLR,
+ LiveRange &RegLR,
+ JoinVals &RegVals) {
+ // Are there any DBG_VALUEs to examine?
+ auto VRegMapIt = DbgVRegToValues.find(Reg);
+ if (VRegMapIt == DbgVRegToValues.end())
+ return;
+
+ auto &DbgValueSet = VRegMapIt->second;
+ auto DbgValueSetIt = DbgValueSet.begin();
+ auto SegmentIt = OtherLR.begin();
+
+ bool LastUndefResult = false;
+ SlotIndex LastUndefIdx;
+
+ // If the "Other" register is live at a slot Idx, test whether Reg can
+ // safely be merged with it, or should be marked undef.
+ auto ShouldUndef = [&RegVals, &RegLR, &LastUndefResult,
+ &LastUndefIdx](SlotIndex Idx) -> bool {
+ // Our worst-case performance typically happens with asan, causing very
+ // many DBG_VALUEs of the same location. Cache a copy of the most recent
+ // result for this edge-case.
+ if (LastUndefIdx == Idx)
+ return LastUndefResult;
+
+ // If the other range was live, and Reg's was not, the register coalescer
+ // will not have tried to resolve any conflicts. We don't know whether
+ // the DBG_VALUE will refer to the same value number, so it must be made
+ // undef.
+ auto OtherIt = RegLR.find(Idx);
+ if (OtherIt == RegLR.end())
+ return true;
+
+ // Both the registers were live: examine the conflict resolution record for
+ // the value number Reg refers to. CR_Keep meant that this value number
+ // "won" and the merged register definitely refers to that value. CR_Erase
+ // means the value number was a redundant copy of the other value, which
+ // was coalesced and Reg deleted. It's safe to refer to the other register
+ // (which will be the source of the copy).
+ auto Resolution = RegVals.getResolution(OtherIt->valno->id);
+ LastUndefResult = Resolution != JoinVals::CR_Keep &&
+ Resolution != JoinVals::CR_Erase;
+ LastUndefIdx = Idx;
+ return LastUndefResult;
+ };
+
+ // Iterate over both the live-range of the "Other" register, and the set of
+ // DBG_VALUEs for Reg at the same time. Advance whichever one has the lowest
+ // slot index. This relies on the DbgValueSet being ordered.
+ while (DbgValueSetIt != DbgValueSet.end() && SegmentIt != OtherLR.end()) {
+ if (DbgValueSetIt->first < SegmentIt->end) {
+ // "Other" is live and there is a DBG_VALUE of Reg: test if we should
+ // set it undef.
+ if (DbgValueSetIt->first >= SegmentIt->start) {
+ bool HasReg = DbgValueSetIt->second->hasDebugOperandForReg(Reg);
+ bool ShouldUndefReg = ShouldUndef(DbgValueSetIt->first);
+ if (HasReg && ShouldUndefReg) {
+ // Mark undef, erase record of this DBG_VALUE to avoid revisiting.
+ DbgValueSetIt->second->setDebugValueUndef();
+ continue;
+ }
+ }
+ ++DbgValueSetIt;
+ } else {
+ ++SegmentIt;
+ }
+ }
+}
+
+namespace {
+
+/// Information concerning MBB coalescing priority.
+struct MBBPriorityInfo {
+ MachineBasicBlock *MBB;
+ unsigned Depth;
+ bool IsSplit;
+
+ MBBPriorityInfo(MachineBasicBlock *mbb, unsigned depth, bool issplit)
+ : MBB(mbb), Depth(depth), IsSplit(issplit) {}
+};
+
+} // end anonymous namespace
+
+/// C-style comparator that sorts first based on the loop depth of the basic
+/// block (the unsigned), and then on the MBB number.
+///
+/// EnableGlobalCopies assumes that the primary sort key is loop depth.
+static int compareMBBPriority(const MBBPriorityInfo *LHS,
+ const MBBPriorityInfo *RHS) {
+ // Deeper loops first
+ if (LHS->Depth != RHS->Depth)
+ return LHS->Depth > RHS->Depth ? -1 : 1;
+
+ // Try to unsplit critical edges next.
+ if (LHS->IsSplit != RHS->IsSplit)
+ return LHS->IsSplit ? -1 : 1;
+
+ // Prefer blocks that are more connected in the CFG. This takes care of
+ // the most difficult copies first while intervals are short.
+ unsigned cl = LHS->MBB->pred_size() + LHS->MBB->succ_size();
+ unsigned cr = RHS->MBB->pred_size() + RHS->MBB->succ_size();
+ if (cl != cr)
+ return cl > cr ? -1 : 1;
+
+ // As a last resort, sort by block number.
+ return LHS->MBB->getNumber() < RHS->MBB->getNumber() ? -1 : 1;
+}
+
+/// \returns true if the given copy uses or defines a local live range.
+static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) {
+ if (!Copy->isCopy())
+ return false;
+
+ if (Copy->getOperand(1).isUndef())
+ return false;
+
+ Register SrcReg = Copy->getOperand(1).getReg();
+ Register DstReg = Copy->getOperand(0).getReg();
+ if (SrcReg.isPhysical() || DstReg.isPhysical())
+ return false;
+
+ return LIS->intervalIsInOneMBB(LIS->getInterval(SrcReg))
+ || LIS->intervalIsInOneMBB(LIS->getInterval(DstReg));
+}
+
+void RegisterCoalescer::lateLiveIntervalUpdate() {
+ for (Register reg : ToBeUpdated) {
+ if (!LIS->hasInterval(reg))
+ continue;
+ LiveInterval &LI = LIS->getInterval(reg);
+ shrinkToUses(&LI, &DeadDefs);
+ if (!DeadDefs.empty())
+ eliminateDeadDefs();
+ }
+ ToBeUpdated.clear();
+}
+
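+// Try to join every copy in CurrList. Entries that were joined, already
+// erased, or are not worth retrying are set to null. Returns true if any copy
+// was successfully joined.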
+bool RegisterCoalescer::
+copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
+ bool Progress = false;
+ for (MachineInstr *&MI : CurrList) {
+ if (!MI)
+ continue;
+ // Skip instruction pointers that have already been erased, for example by
+ // dead code elimination.
+ if (ErasedInstrs.count(MI)) {
+ MI = nullptr;
+ continue;
+ }
+ bool Again = false;
+ bool Success = joinCopy(MI, Again);
+ Progress |= Success;
+ if (Success || !Again)
+ MI = nullptr;
+ }
+ return Progress;
+}
+
+/// Check if DstReg is a terminal node.
+/// I.e., it does not have any affinity other than \p Copy.
+static bool isTerminalReg(Register DstReg, const MachineInstr &Copy,
+ const MachineRegisterInfo *MRI) {
+ assert(Copy.isCopyLike());
+ // Check if the destination of this copy has any other affinity.
+ for (const MachineInstr &MI : MRI->reg_nodbg_instructions(DstReg))
+ if (&MI != &Copy && MI.isCopyLike())
+ return false;
+ return true;
+}
+
+bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
+ assert(Copy.isCopyLike());
+ if (!UseTerminalRule)
+ return false;
+ Register SrcReg, DstReg;
+ unsigned SrcSubReg = 0, DstSubReg = 0;
+ if (!isMoveInstr(*TRI, &Copy, SrcReg, DstReg, SrcSubReg, DstSubReg))
+ return false;
+ // Check if the destination of this copy has any other affinity.
+ if (DstReg.isPhysical() ||
+ // If SrcReg is a physical register, the copy won't be coalesced.
+ // Ignoring it may have other side effects (like missing
+ // rematerialization). So keep it.
+ SrcReg.isPhysical() || !isTerminalReg(DstReg, Copy, MRI))
+ return false;
+
+ // DstReg is a terminal node. Check if it interferes with any other
+ // copy involving SrcReg.
+ const MachineBasicBlock *OrigBB = Copy.getParent();
+ const LiveInterval &DstLI = LIS->getInterval(DstReg);
+ for (const MachineInstr &MI : MRI->reg_nodbg_instructions(SrcReg)) {
+ // Technically we should check if the weight of the new copy is
+ // interesting compared to the other one and update the weight
+ // of the copies accordingly. However, this would only work if
+ // we would gather all the copies first then coalesce, whereas
+ // right now we interleave both actions.
+ // For now, just consider the copies that are in the same block.
+ if (&MI == &Copy || !MI.isCopyLike() || MI.getParent() != OrigBB)
+ continue;
+ Register OtherSrcReg, OtherReg;
+ unsigned OtherSrcSubReg = 0, OtherSubReg = 0;
+ if (!isMoveInstr(*TRI, &MI, OtherSrcReg, OtherReg, OtherSrcSubReg,
+ OtherSubReg))
+ return false;
+ if (OtherReg == SrcReg)
+ OtherReg = OtherSrcReg;
+ // Check if OtherReg is a non-terminal.
+ if (OtherReg.isPhysical() || isTerminalReg(OtherReg, MI, MRI))
+ continue;
+ // Check whether OtherReg interferes with DstReg.
+ if (LIS->getInterval(OtherReg).overlaps(DstLI)) {
+ LLVM_DEBUG(dbgs() << "Apply terminal rule for: " << printReg(DstReg)
+ << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+void
+RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
+ LLVM_DEBUG(dbgs() << MBB->getName() << ":\n");
+
+ // Collect all copy-like instructions in MBB. Don't start coalescing anything
+ // yet, it might invalidate the iterator.
+ const unsigned PrevSize = WorkList.size();
+ if (JoinGlobalCopies) {
+ SmallVector<MachineInstr*, 2> LocalTerminals;
+ SmallVector<MachineInstr*, 2> GlobalTerminals;
+ // Coalesce copies bottom-up to coalesce local defs before local uses. They
+ // are not inherently easier to resolve, but slightly preferable until we
+ // have local live range splitting. In particular this is required by
+ // cmp+jmp macro fusion.
+ for (MachineInstr &MI : *MBB) {
+ if (!MI.isCopyLike())
+ continue;
+ bool ApplyTerminalRule = applyTerminalRule(MI);
+ if (isLocalCopy(&MI, LIS)) {
+ if (ApplyTerminalRule)
+ LocalTerminals.push_back(&MI);
+ else
+ LocalWorkList.push_back(&MI);
+ } else {
+ if (ApplyTerminalRule)
+ GlobalTerminals.push_back(&MI);
+ else
+ WorkList.push_back(&MI);
+ }
+ }
+ // Append the copies evicted by the terminal rule at the end of the list.
+ LocalWorkList.append(LocalTerminals.begin(), LocalTerminals.end());
+ WorkList.append(GlobalTerminals.begin(), GlobalTerminals.end());
+ }
+ else {
+ SmallVector<MachineInstr*, 2> Terminals;
+ for (MachineInstr &MII : *MBB)
+ if (MII.isCopyLike()) {
+ if (applyTerminalRule(MII))
+ Terminals.push_back(&MII);
+ else
+ WorkList.push_back(&MII);
+ }
+ // Append the copies evicted by the terminal rule at the end of the list.
+ WorkList.append(Terminals.begin(), Terminals.end());
+ }
+ // Try coalescing the collected copies immediately, and remove the nulls.
+ // This prevents the WorkList from getting too large since most copies are
+ // joinable on the first attempt.
+ MutableArrayRef<MachineInstr*>
+ CurrList(WorkList.begin() + PrevSize, WorkList.end());
+ if (copyCoalesceWorkList(CurrList))
+ WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
+ nullptr), WorkList.end());
+}
+
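+// Coalesce the copies on the local work list; entries that could not be
+// joined yet are moved to the global work list for another attempt.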
+void RegisterCoalescer::coalesceLocals() {
+ copyCoalesceWorkList(LocalWorkList);
+ for (unsigned j = 0, je = LocalWorkList.size(); j != je; ++j) {
+ if (LocalWorkList[j])
+ WorkList.push_back(LocalWorkList[j]);
+ }
+ LocalWorkList.clear();
+}
+
+void RegisterCoalescer::joinAllIntervals() {
+ LLVM_DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
+ assert(WorkList.empty() && LocalWorkList.empty() && "Old data still around.");
+
+ std::vector<MBBPriorityInfo> MBBs;
+ MBBs.reserve(MF->size());
+ for (MachineBasicBlock &MBB : *MF) {
+ MBBs.push_back(MBBPriorityInfo(&MBB, Loops->getLoopDepth(&MBB),
+ JoinSplitEdges && isSplitEdge(&MBB)));
+ }
+ array_pod_sort(MBBs.begin(), MBBs.end(), compareMBBPriority);
+
+ // Coalesce intervals in MBB priority order.
+ unsigned CurrDepth = std::numeric_limits<unsigned>::max();
+ for (MBBPriorityInfo &MBB : MBBs) {
+ // Try coalescing the collected local copies for deeper loops.
+ if (JoinGlobalCopies && MBB.Depth < CurrDepth) {
+ coalesceLocals();
+ CurrDepth = MBB.Depth;
+ }
+ copyCoalesceInMBB(MBB.MBB);
+ }
+ lateLiveIntervalUpdate();
+ coalesceLocals();
+
+ // Joining intervals can allow other intervals to be joined. Iteratively join
+ // until we make no progress.
+ while (copyCoalesceWorkList(WorkList))
+ /* empty */ ;
+ lateLiveIntervalUpdate();
+}
+
+void RegisterCoalescer::releaseMemory() {
+ ErasedInstrs.clear();
+ WorkList.clear();
+ DeadDefs.clear();
+ InflateRegs.clear();
+ LargeLIVisitCounter.clear();
+}
+
+bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
+ LLVM_DEBUG(dbgs() << "********** REGISTER COALESCER **********\n"
+ << "********** Function: " << fn.getName() << '\n');
+
+ // Variables changed between a setjmp and a longjump can have undefined value
+ // after the longjmp. This behaviour can be observed if such a variable is
+ // spilled, so longjmp won't restore the value in the spill slot.
+ // RegisterCoalescer should not run in functions with a setjmp to avoid
+ // merging such undefined variables with predictable ones.
+ //
+ // TODO: Could specifically disable coalescing registers live across setjmp
+ // calls
+ if (fn.exposesReturnsTwice()) {
+ LLVM_DEBUG(
+ dbgs() << "* Skipped as it exposes functions that returns twice.\n");
+ return false;
+ }
+
+ MF = &fn;
+ MRI = &fn.getRegInfo();
+ const TargetSubtargetInfo &STI = fn.getSubtarget();
+ TRI = STI.getRegisterInfo();
+ TII = STI.getInstrInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ if (EnableGlobalCopies == cl::BOU_UNSET)
+ JoinGlobalCopies = STI.enableJoinGlobalCopies();
+ else
+ JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
+
+ // If there are PHIs tracked by debug-info, they will need updating during
+ // coalescing. Build an index of those PHIs to ease updating.
+ SlotIndexes *Slots = LIS->getSlotIndexes();
+ for (const auto &DebugPHI : MF->DebugPHIPositions) {
+ MachineBasicBlock *MBB = DebugPHI.second.MBB;
+ Register Reg = DebugPHI.second.Reg;
+ unsigned SubReg = DebugPHI.second.SubReg;
+ SlotIndex SI = Slots->getMBBStartIdx(MBB);
+ PHIValPos P = {SI, Reg, SubReg};
+ PHIValToPos.insert(std::make_pair(DebugPHI.first, P));
+ RegToPHIIdx[Reg].push_back(DebugPHI.first);
+ }
+
+ // The MachineScheduler does not currently require JoinSplitEdges. This will
+ // either be enabled unconditionally or replaced by a more general live range
+ // splitting optimization.
+ JoinSplitEdges = EnableJoinSplits;
+
+ if (VerifyCoalescing)
+ MF->verify(this, "Before register coalescing");
+
+ DbgVRegToValues.clear();
+ buildVRegToDbgValueMap(fn);
+
+ RegClassInfo.runOnMachineFunction(fn);
+
+ // Join (coalesce) intervals if requested.
+ if (EnableJoining)
+ joinAllIntervals();
+
+ // After deleting a lot of copies, register classes may be less constrained.
+ // Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 ->
+ // DPR inflation.
+ array_pod_sort(InflateRegs.begin(), InflateRegs.end());
+ InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
+ InflateRegs.end());
+ LLVM_DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size()
+ << " regs.\n");
+ for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) {
+ Register Reg = InflateRegs[i];
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ if (MRI->recomputeRegClass(Reg)) {
+ LLVM_DEBUG(dbgs() << printReg(Reg) << " inflated to "
+ << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
+ ++NumInflated;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ // If the inflated register class does not support subregisters anymore
+ // remove the subranges.
+ if (!MRI->shouldTrackSubRegLiveness(Reg)) {
+ LI.clearSubRanges();
+ } else {
+#ifndef NDEBUG
+ LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
+ // If subranges are still supported, then the same subregs
+ // should still be supported.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ assert((S.LaneMask & ~MaxMask).none());
+ }
+#endif
+ }
+ }
+ }
+ }
+
+ // After coalescing, update any PHIs that are being tracked by debug-info
+ // with their new VReg locations.
+ for (auto &p : MF->DebugPHIPositions) {
+ auto it = PHIValToPos.find(p.first);
+ assert(it != PHIValToPos.end());
+ p.second.Reg = it->second.Reg;
+ p.second.SubReg = it->second.SubReg;
+ }
+
+ PHIValToPos.clear();
+ RegToPHIIdx.clear();
+
+ LLVM_DEBUG(dump());
+ if (VerifyCoalescing)
+ MF->verify(this, "After register coalescing");
+ return true;
+}
+
+void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
+ LIS->print(O, m);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.h b/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.h
new file mode 100644
index 000000000000..f265d93fb0d6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterCoalescer.h
@@ -0,0 +1,114 @@
+//===- RegisterCoalescer.h - Register Coalescing Interface ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the abstract interface for register coalescers,
+// allowing them to interact with and query register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
+#define LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
+
+#include "llvm/CodeGen/Register.h"
+
+namespace llvm {
+
+class MachineInstr;
+class TargetRegisterClass;
+class TargetRegisterInfo;
+
+ /// A helper class for register coalescers. When deciding if
+ /// two registers can be coalesced, CoalescerPair can determine if a copy
+ /// instruction would become an identity copy after coalescing.
+ class CoalescerPair {
+ const TargetRegisterInfo &TRI;
+
+ /// The register that will be left after coalescing. It can be a
+ /// virtual or physical register.
+ Register DstReg;
+
+ /// The virtual register that will be coalesced into DstReg.
+ Register SrcReg;
+
+ /// The sub-register index of the old DstReg in the new coalesced register.
+ unsigned DstIdx = 0;
+
+ /// The sub-register index of the old SrcReg in the new coalesced register.
+ unsigned SrcIdx = 0;
+
+ /// True when the original copy was a partial subregister copy.
+ bool Partial = false;
+
+ /// True when both regs are virtual and newRC is constrained.
+ bool CrossClass = false;
+
+ /// True when DstReg and SrcReg are reversed from the original
+ /// copy instruction.
+ bool Flipped = false;
+
+ /// The register class of the coalesced register, or NULL if DstReg
+ /// is a physreg. This register class may be a super-register of both
+ /// SrcReg and DstReg.
+ const TargetRegisterClass *NewRC = nullptr;
+
+ public:
+ CoalescerPair(const TargetRegisterInfo &tri) : TRI(tri) {}
+
+ /// Create a CoalescerPair representing a virtreg-to-physreg copy.
+ /// No need to call setRegisters().
+ CoalescerPair(Register VirtReg, MCRegister PhysReg,
+ const TargetRegisterInfo &tri)
+ : TRI(tri), DstReg(PhysReg), SrcReg(VirtReg) {}
+
+ /// Set registers to match the copy instruction MI. Return
+ /// false if MI is not a coalescable copy instruction.
+ bool setRegisters(const MachineInstr*);
+
+ /// Swap SrcReg and DstReg. Return false if swapping is impossible
+ /// because DstReg is a physical register, or SubIdx is set.
+ bool flip();
+
+ /// Return true if MI is a copy instruction that will become
+ /// an identity copy after coalescing.
+ bool isCoalescable(const MachineInstr*) const;
+
+ /// Return true if DstReg is a physical register.
+ bool isPhys() const { return !NewRC; }
+
+ /// Return true if the original copy instruction did not copy
+ /// the full register, but was a subreg operation.
+ bool isPartial() const { return Partial; }
+
+ /// Return true if DstReg is virtual and NewRC is a smaller
+ /// register class than DstReg's.
+ bool isCrossClass() const { return CrossClass; }
+
+ /// Return true when getSrcReg is the register being defined by
+ /// the original copy instruction.
+ bool isFlipped() const { return Flipped; }
+
+ /// Return the register (virtual or physical) that will remain
+ /// after coalescing.
+ Register getDstReg() const { return DstReg; }
+
+ /// Return the virtual register that will be coalesced away.
+ Register getSrcReg() const { return SrcReg; }
+
+ /// Return the subregister index that DstReg will be coalesced into, or 0.
+ unsigned getDstIdx() const { return DstIdx; }
+
+ /// Return the subregister index that SrcReg will be coalesced into, or 0.
+ unsigned getSrcIdx() const { return SrcIdx; }
+
+ /// Return the register class of the coalesced register.
+ const TargetRegisterClass *getNewRC() const { return NewRC; }
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterPressure.cpp
new file mode 100644
index 000000000000..f86aa3a16720
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -0,0 +1,1392 @@
+//===- RegisterPressure.cpp - Dynamic Register Pressure -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterPressure class which can be used to track
+// MachineInstr level register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+/// Increase pressure for each pressure set provided by TargetRegisterInfo.
+static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
+ const MachineRegisterInfo &MRI, unsigned Reg,
+ LaneBitmask PrevMask, LaneBitmask NewMask) {
+ assert((PrevMask & ~NewMask).none() && "Must not remove bits");
+ if (PrevMask.any() || NewMask.none())
+ return;
+
+ PSetIterator PSetI = MRI.getPressureSets(Reg);
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI)
+ CurrSetPressure[*PSetI] += Weight;
+}
+
+/// Decrease pressure for each pressure set provided by TargetRegisterInfo.
+static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
+ const MachineRegisterInfo &MRI, Register Reg,
+ LaneBitmask PrevMask, LaneBitmask NewMask) {
+ // assert((NewMask & ~PrevMask).none() && "Must not add bits");
+ if (NewMask.any() || PrevMask.none())
+ return;
+
+ PSetIterator PSetI = MRI.getPressureSets(Reg);
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ assert(CurrSetPressure[*PSetI] >= Weight && "register pressure underflow");
+ CurrSetPressure[*PSetI] -= Weight;
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
+ const TargetRegisterInfo *TRI) {
+ bool Empty = true;
+ for (unsigned i = 0, e = SetPressure.size(); i < e; ++i) {
+ if (SetPressure[i] != 0) {
+ dbgs() << TRI->getRegPressureSetName(i) << "=" << SetPressure[i] << '\n';
+ Empty = false;
+ }
+ }
+ if (Empty)
+ dbgs() << "\n";
+}
+
+LLVM_DUMP_METHOD
+void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
+ dbgs() << "Max Pressure: ";
+ dumpRegSetPressure(MaxSetPressure, TRI);
+ dbgs() << "Live In: ";
+ for (const RegisterMaskPair &P : LiveInRegs) {
+ dbgs() << printVRegOrUnit(P.RegUnit, TRI);
+ if (!P.LaneMask.all())
+ dbgs() << ':' << PrintLaneMask(P.LaneMask);
+ dbgs() << ' ';
+ }
+ dbgs() << '\n';
+ dbgs() << "Live Out: ";
+ for (const RegisterMaskPair &P : LiveOutRegs) {
+ dbgs() << printVRegOrUnit(P.RegUnit, TRI);
+ if (!P.LaneMask.all())
+ dbgs() << ':' << PrintLaneMask(P.LaneMask);
+ dbgs() << ' ';
+ }
+ dbgs() << '\n';
+}
+
+LLVM_DUMP_METHOD
+void RegPressureTracker::dump() const {
+ if (!isTopClosed() || !isBottomClosed()) {
+ dbgs() << "Curr Pressure: ";
+ dumpRegSetPressure(CurrSetPressure, TRI);
+ }
+ P.dump(TRI);
+}
+
+LLVM_DUMP_METHOD
+void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
+ const char *sep = "";
+ for (const PressureChange &Change : *this) {
+ if (!Change.isValid())
+ break;
+ dbgs() << sep << TRI.getRegPressureSetName(Change.getPSet())
+ << " " << Change.getUnitInc();
+ sep = " ";
+ }
+ dbgs() << '\n';
+}
+
+LLVM_DUMP_METHOD
+void PressureChange::dump() const {
+ dbgs() << "[" << getPSetOrMax() << ", " << getUnitInc() << "]\n";
+}
+
+void RegPressureDelta::dump() const {
+ dbgs() << "[Excess=";
+ Excess.dump();
+ dbgs() << ", CriticalMax=";
+ CriticalMax.dump();
+ dbgs() << ", CurrentMax=";
+ CurrentMax.dump();
+ dbgs() << "]\n";
+}
+
+#endif
+
+void RegPressureTracker::increaseRegPressure(Register RegUnit,
+ LaneBitmask PreviousMask,
+ LaneBitmask NewMask) {
+ if (PreviousMask.any() || NewMask.none())
+ return;
+
+ PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ CurrSetPressure[*PSetI] += Weight;
+ P.MaxSetPressure[*PSetI] =
+ std::max(P.MaxSetPressure[*PSetI], CurrSetPressure[*PSetI]);
+ }
+}
+
+void RegPressureTracker::decreaseRegPressure(Register RegUnit,
+ LaneBitmask PreviousMask,
+ LaneBitmask NewMask) {
+ decreaseSetPressure(CurrSetPressure, *MRI, RegUnit, PreviousMask, NewMask);
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void IntervalPressure::reset() {
+ TopIdx = BottomIdx = SlotIndex();
+ MaxSetPressure.clear();
+ LiveInRegs.clear();
+ LiveOutRegs.clear();
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void RegionPressure::reset() {
+ TopPos = BottomPos = MachineBasicBlock::const_iterator();
+ MaxSetPressure.clear();
+ LiveInRegs.clear();
+ LiveOutRegs.clear();
+}
+
+/// If the current top is not less than or equal to the next index, open it.
+/// We happen to need the SlotIndex for the next top for pressure update.
+void IntervalPressure::openTop(SlotIndex NextTop) {
+ if (TopIdx <= NextTop)
+ return;
+ TopIdx = SlotIndex();
+ LiveInRegs.clear();
+}
+
+/// If the current top is the previous instruction (before receding), open it.
+void RegionPressure::openTop(MachineBasicBlock::const_iterator PrevTop) {
+ if (TopPos != PrevTop)
+ return;
+ TopPos = MachineBasicBlock::const_iterator();
+ LiveInRegs.clear();
+}
+
+/// If the current bottom is not greater than the previous index, open it.
+void IntervalPressure::openBottom(SlotIndex PrevBottom) {
+ if (BottomIdx > PrevBottom)
+ return;
+ BottomIdx = SlotIndex();
+ LiveInRegs.clear();
+}
+
+/// If the current bottom is the previous instr (before advancing), open it.
+void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) {
+ if (BottomPos != PrevBottom)
+ return;
+ BottomPos = MachineBasicBlock::const_iterator();
+ LiveInRegs.clear();
+}
+
+void LiveRegSet::init(const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ unsigned NumRegUnits = TRI.getNumRegs();
+ unsigned NumVirtRegs = MRI.getNumVirtRegs();
+ Regs.setUniverse(NumRegUnits + NumVirtRegs);
+ this->NumRegUnits = NumRegUnits;
+}
+
+void LiveRegSet::clear() {
+ Regs.clear();
+}
+
+static const LiveRange *getLiveRange(const LiveIntervals &LIS, unsigned Reg) {
+ if (Register::isVirtualRegister(Reg))
+ return &LIS.getInterval(Reg);
+ return LIS.getCachedRegUnit(Reg);
+}
+
+void RegPressureTracker::reset() {
+ MBB = nullptr;
+ LIS = nullptr;
+
+ CurrSetPressure.clear();
+ LiveThruPressure.clear();
+ P.MaxSetPressure.clear();
+
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).reset();
+ else
+ static_cast<RegionPressure&>(P).reset();
+
+ LiveRegs.clear();
+ UntiedDefs.clear();
+}
+
+/// Setup the RegPressureTracker.
+///
+/// TODO: Add support for pressure without LiveIntervals.
+void RegPressureTracker::init(const MachineFunction *mf,
+ const RegisterClassInfo *rci,
+ const LiveIntervals *lis,
+ const MachineBasicBlock *mbb,
+ MachineBasicBlock::const_iterator pos,
+ bool TrackLaneMasks, bool TrackUntiedDefs) {
+ reset();
+
+ MF = mf;
+ TRI = MF->getSubtarget().getRegisterInfo();
+ RCI = rci;
+ MRI = &MF->getRegInfo();
+ MBB = mbb;
+ this->TrackUntiedDefs = TrackUntiedDefs;
+ this->TrackLaneMasks = TrackLaneMasks;
+
+ if (RequireIntervals) {
+ assert(lis && "IntervalPressure requires LiveIntervals");
+ LIS = lis;
+ }
+
+ CurrPos = pos;
+ CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0);
+
+ P.MaxSetPressure = CurrSetPressure;
+
+ LiveRegs.init(*MRI);
+ if (TrackUntiedDefs)
+ UntiedDefs.setUniverse(MRI->getNumVirtRegs());
+}
+
+/// Does this pressure result have a valid top position and live ins.
+bool RegPressureTracker::isTopClosed() const {
+ if (RequireIntervals)
+ return static_cast<IntervalPressure&>(P).TopIdx.isValid();
+ return (static_cast<RegionPressure&>(P).TopPos ==
+ MachineBasicBlock::const_iterator());
+}
+
+/// Does this pressure result have a valid bottom position and live outs.
+bool RegPressureTracker::isBottomClosed() const {
+ if (RequireIntervals)
+ return static_cast<IntervalPressure&>(P).BottomIdx.isValid();
+ return (static_cast<RegionPressure&>(P).BottomPos ==
+ MachineBasicBlock::const_iterator());
+}
+
+SlotIndex RegPressureTracker::getCurrSlot() const {
+ MachineBasicBlock::const_iterator IdxPos =
+ skipDebugInstructionsForward(CurrPos, MBB->end());
+ if (IdxPos == MBB->end())
+ return LIS->getMBBEndIdx(MBB);
+ return LIS->getInstructionIndex(*IdxPos).getRegSlot();
+}
+
+/// Set the boundary for the top of the region and summarize live ins.
+void RegPressureTracker::closeTop() {
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).TopIdx = getCurrSlot();
+ else
+ static_cast<RegionPressure&>(P).TopPos = CurrPos;
+
+ assert(P.LiveInRegs.empty() && "inconsistent max pressure result");
+ P.LiveInRegs.reserve(LiveRegs.size());
+ LiveRegs.appendTo(P.LiveInRegs);
+}
+
+/// Set the boundary for the bottom of the region and summarize live outs.
+void RegPressureTracker::closeBottom() {
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).BottomIdx = getCurrSlot();
+ else
+ static_cast<RegionPressure&>(P).BottomPos = CurrPos;
+
+ assert(P.LiveOutRegs.empty() && "inconsistent max pressure result");
+ P.LiveOutRegs.reserve(LiveRegs.size());
+ LiveRegs.appendTo(P.LiveOutRegs);
+}
+
+/// Finalize the region boundaries and record live ins and live outs.
+void RegPressureTracker::closeRegion() {
+ if (!isTopClosed() && !isBottomClosed()) {
+ assert(LiveRegs.size() == 0 && "no region boundary");
+ return;
+ }
+ if (!isBottomClosed())
+ closeBottom();
+ else if (!isTopClosed())
+ closeTop();
+ // If both top and bottom are closed, do nothing.
+}
+
+/// The register tracker is unaware of global liveness so ignores normal
+/// live-thru ranges. However, two-address or coalesced chains can also lead
+/// to live ranges with no holes. Count these to inform heuristics that we
+/// can never drop below this pressure.
+void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
+ LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0);
+ assert(isBottomClosed() && "need bottom-up tracking to initialize.");
+ for (const RegisterMaskPair &Pair : P.LiveOutRegs) {
+ Register RegUnit = Pair.RegUnit;
+ if (RegUnit.isVirtual() && !RPTracker.hasUntiedDef(RegUnit))
+ increaseSetPressure(LiveThruPressure, *MRI, RegUnit,
+ LaneBitmask::getNone(), Pair.LaneMask);
+ }
+}
+
+static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
+ Register RegUnit) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end())
+ return LaneBitmask::getNone();
+ return I->LaneMask;
+}
+
+static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ RegisterMaskPair Pair) {
+ Register RegUnit = Pair.RegUnit;
+ assert(Pair.LaneMask.any());
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end()) {
+ RegUnits.push_back(Pair);
+ } else {
+ I->LaneMask |= Pair.LaneMask;
+ }
+}
+
+static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ Register RegUnit) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end()) {
+ RegUnits.push_back(RegisterMaskPair(RegUnit, LaneBitmask::getNone()));
+ } else {
+ I->LaneMask = LaneBitmask::getNone();
+ }
+}
+
+static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ RegisterMaskPair Pair) {
+ Register RegUnit = Pair.RegUnit;
+ assert(Pair.LaneMask.any());
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I != RegUnits.end()) {
+ I->LaneMask &= ~Pair.LaneMask;
+ if (I->LaneMask.none())
+ RegUnits.erase(I);
+ }
+}
+
+static LaneBitmask
+getLanesWithProperty(const LiveIntervals &LIS, const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, Register RegUnit, SlotIndex Pos,
+ LaneBitmask SafeDefault,
+ bool (*Property)(const LiveRange &LR, SlotIndex Pos)) {
+ if (RegUnit.isVirtual()) {
+ const LiveInterval &LI = LIS.getInterval(RegUnit);
+ LaneBitmask Result;
+ if (TrackLaneMasks && LI.hasSubRanges()) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (Property(SR, Pos))
+ Result |= SR.LaneMask;
+ }
+ } else if (Property(LI, Pos)) {
+ Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit)
+ : LaneBitmask::getAll();
+ }
+
+ return Result;
+ } else {
+ const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+    // Be prepared for missing live ranges: we usually do not compute live
+    // ranges for physical registers on targets with many registers (GPUs).
+ if (LR == nullptr)
+ return SafeDefault;
+ return Property(*LR, Pos) ? LaneBitmask::getAll() : LaneBitmask::getNone();
+ }
+}
+
+static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, Register RegUnit,
+ SlotIndex Pos) {
+ return getLanesWithProperty(LIS, MRI, TrackLaneMasks, RegUnit, Pos,
+ LaneBitmask::getAll(),
+ [](const LiveRange &LR, SlotIndex Pos) {
+ return LR.liveAt(Pos);
+ });
+}
+
+namespace {
+
+/// Collect this instruction's unique uses and defs into SmallVectors for
+/// processing defs and uses in order.
+///
+/// FIXME: always ignore tied opers
+class RegisterOperandsCollector {
+ friend class llvm::RegisterOperands;
+
+ RegisterOperands &RegOpers;
+ const TargetRegisterInfo &TRI;
+ const MachineRegisterInfo &MRI;
+ bool IgnoreDead;
+
+ RegisterOperandsCollector(RegisterOperands &RegOpers,
+ const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI, bool IgnoreDead)
+ : RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {}
+
+ void collectInstr(const MachineInstr &MI) const {
+ for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI)
+ collectOperand(*OperI);
+
+ // Remove redundant physreg dead defs.
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ removeRegLanes(RegOpers.DeadDefs, P);
+ }
+
+ void collectInstrLanes(const MachineInstr &MI) const {
+ for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI)
+ collectOperandLanes(*OperI);
+
+ // Remove redundant physreg dead defs.
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ removeRegLanes(RegOpers.DeadDefs, P);
+ }
+
+ /// Push this operand's register onto the correct vectors.
+ void collectOperand(const MachineOperand &MO) const {
+ if (!MO.isReg() || !MO.getReg())
+ return;
+ Register Reg = MO.getReg();
+ if (MO.isUse()) {
+ if (!MO.isUndef() && !MO.isInternalRead())
+ pushReg(Reg, RegOpers.Uses);
+ } else {
+ assert(MO.isDef());
+ // Subregister definitions may imply a register read.
+ if (MO.readsReg())
+ pushReg(Reg, RegOpers.Uses);
+
+ if (MO.isDead()) {
+ if (!IgnoreDead)
+ pushReg(Reg, RegOpers.DeadDefs);
+ } else
+ pushReg(Reg, RegOpers.Defs);
+ }
+ }
+
+ void pushReg(Register Reg,
+ SmallVectorImpl<RegisterMaskPair> &RegUnits) const {
+ if (Reg.isVirtual()) {
+ addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneBitmask::getAll()));
+ } else if (MRI.isAllocatable(Reg)) {
+ for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg()))
+ addRegLanes(RegUnits, RegisterMaskPair(Unit, LaneBitmask::getAll()));
+ }
+ }
+
+ void collectOperandLanes(const MachineOperand &MO) const {
+ if (!MO.isReg() || !MO.getReg())
+ return;
+ Register Reg = MO.getReg();
+ unsigned SubRegIdx = MO.getSubReg();
+ if (MO.isUse()) {
+ if (!MO.isUndef() && !MO.isInternalRead())
+ pushRegLanes(Reg, SubRegIdx, RegOpers.Uses);
+ } else {
+ assert(MO.isDef());
+ // Treat read-undef subreg defs as definitions of the whole register.
+ if (MO.isUndef())
+ SubRegIdx = 0;
+
+ if (MO.isDead()) {
+ if (!IgnoreDead)
+ pushRegLanes(Reg, SubRegIdx, RegOpers.DeadDefs);
+ } else
+ pushRegLanes(Reg, SubRegIdx, RegOpers.Defs);
+ }
+ }
+
+ void pushRegLanes(Register Reg, unsigned SubRegIdx,
+ SmallVectorImpl<RegisterMaskPair> &RegUnits) const {
+ if (Reg.isVirtual()) {
+ LaneBitmask LaneMask = SubRegIdx != 0
+ ? TRI.getSubRegIndexLaneMask(SubRegIdx)
+ : MRI.getMaxLaneMaskForVReg(Reg);
+ addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneMask));
+ } else if (MRI.isAllocatable(Reg)) {
+ for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg()))
+ addRegLanes(RegUnits, RegisterMaskPair(Unit, LaneBitmask::getAll()));
+ }
+ }
+};
+
+} // end anonymous namespace
+
+void RegisterOperands::collect(const MachineInstr &MI,
+ const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, bool IgnoreDead) {
+ RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead);
+ if (TrackLaneMasks)
+ Collector.collectInstrLanes(MI);
+ else
+ Collector.collectInstr(MI);
+}
+
+void RegisterOperands::detectDeadDefs(const MachineInstr &MI,
+ const LiveIntervals &LIS) {
+ SlotIndex SlotIdx = LIS.getInstructionIndex(MI);
+ for (auto *RI = Defs.begin(); RI != Defs.end(); /*empty*/) {
+ Register Reg = RI->RegUnit;
+ const LiveRange *LR = getLiveRange(LIS, Reg);
+ if (LR != nullptr) {
+ LiveQueryResult LRQ = LR->Query(SlotIdx);
+ if (LRQ.isDeadDef()) {
+        // LiveIntervals knows this is a dead def even though its MachineOperand
+        // is not flagged as such.
+ DeadDefs.push_back(*RI);
+ RI = Defs.erase(RI);
+ continue;
+ }
+ }
+ ++RI;
+ }
+}
+
+void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ SlotIndex Pos,
+ MachineInstr *AddFlagsMI) {
+ for (auto *I = Defs.begin(); I != Defs.end();) {
+ LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
+ Pos.getDeadSlot());
+ // If the def is all that is live after the instruction, then in case
+ // of a subregister def we need a read-undef flag.
+ Register RegUnit = I->RegUnit;
+ if (RegUnit.isVirtual() && AddFlagsMI != nullptr &&
+ (LiveAfter & ~I->LaneMask).none())
+ AddFlagsMI->setRegisterDefReadUndef(RegUnit);
+
+ LaneBitmask ActualDef = I->LaneMask & LiveAfter;
+ if (ActualDef.none()) {
+ I = Defs.erase(I);
+ } else {
+ I->LaneMask = ActualDef;
+ ++I;
+ }
+ }
+ for (auto *I = Uses.begin(); I != Uses.end();) {
+ LaneBitmask LiveBefore = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
+ Pos.getBaseIndex());
+ LaneBitmask LaneMask = I->LaneMask & LiveBefore;
+ if (LaneMask.none()) {
+ I = Uses.erase(I);
+ } else {
+ I->LaneMask = LaneMask;
+ ++I;
+ }
+ }
+ if (AddFlagsMI != nullptr) {
+ for (const RegisterMaskPair &P : DeadDefs) {
+ Register RegUnit = P.RegUnit;
+ if (!RegUnit.isVirtual())
+ continue;
+ LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, RegUnit,
+ Pos.getDeadSlot());
+ if (LiveAfter.none())
+ AddFlagsMI->setRegisterDefReadUndef(RegUnit);
+ }
+ }
+}
+
+/// Initialize an array of N PressureDiffs.
+void PressureDiffs::init(unsigned N) {
+ Size = N;
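+  // Reuse (and zero) the existing allocation when it is already large enough.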
+ if (N <= Max) {
+ memset(PDiffArray, 0, N * sizeof(PressureDiff));
+ return;
+ }
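+  // Otherwise free the old array and allocate a zero-initialized one of size N.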
+ Max = Size;
+ free(PDiffArray);
+ PDiffArray = static_cast<PressureDiff*>(safe_calloc(N, sizeof(PressureDiff)));
+}
+
+void PressureDiffs::addInstruction(unsigned Idx,
+ const RegisterOperands &RegOpers,
+ const MachineRegisterInfo &MRI) {
+ PressureDiff &PDiff = (*this)[Idx];
+ assert(!PDiff.begin()->isValid() && "stale PDiff");
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ PDiff.addPressureChange(P.RegUnit, true, &MRI);
+
+ for (const RegisterMaskPair &P : RegOpers.Uses)
+ PDiff.addPressureChange(P.RegUnit, false, &MRI);
+}
+
+/// Add a change in pressure to the pressure diff of a given instruction.
+void PressureDiff::addPressureChange(Register RegUnit, bool IsDec,
+ const MachineRegisterInfo *MRI) {
+ PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+ int Weight = IsDec ? -PSetI.getWeight() : PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ // Find an existing entry in the pressure diff for this PSet.
+ PressureDiff::iterator I = nonconst_begin(), E = nonconst_end();
+ for (; I != E && I->isValid(); ++I) {
+ if (I->getPSet() >= *PSetI)
+ break;
+ }
+ // If all pressure sets are more constrained, skip the remaining PSets.
+ if (I == E)
+ break;
+ // Insert this PressureChange.
+ if (!I->isValid() || I->getPSet() != *PSetI) {
+ PressureChange PTmp = PressureChange(*PSetI);
+ for (PressureDiff::iterator J = I; J != E && PTmp.isValid(); ++J)
+ std::swap(*J, PTmp);
+ }
+ // Update the units for this pressure set.
+ unsigned NewUnitInc = I->getUnitInc() + Weight;
+ if (NewUnitInc != 0) {
+ I->setUnitInc(NewUnitInc);
+ } else {
+ // Remove entry
+ PressureDiff::iterator J;
+ for (J = std::next(I); J != E && J->isValid(); ++J, ++I)
+ *I = *J;
+ *I = PressureChange();
+ }
+ }
+}
+
+/// Force liveness of registers.
+void RegPressureTracker::addLiveRegs(ArrayRef<RegisterMaskPair> Regs) {
+ for (const RegisterMaskPair &P : Regs) {
+ LaneBitmask PrevMask = LiveRegs.insert(P);
+ LaneBitmask NewMask = PrevMask | P.LaneMask;
+ increaseRegPressure(P.RegUnit, PrevMask, NewMask);
+ }
+}
+
+void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair,
+ SmallVectorImpl<RegisterMaskPair> &LiveInOrOut) {
+ assert(Pair.LaneMask.any());
+
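+  // Record the new lanes in the live-in/out list and account for them in the
+  // max pressure only; the caller adjusts the current pressure.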
+ Register RegUnit = Pair.RegUnit;
+ auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ LaneBitmask PrevMask;
+ LaneBitmask NewMask;
+ if (I == LiveInOrOut.end()) {
+ PrevMask = LaneBitmask::getNone();
+ NewMask = Pair.LaneMask;
+ LiveInOrOut.push_back(Pair);
+ } else {
+ PrevMask = I->LaneMask;
+ NewMask = PrevMask | Pair.LaneMask;
+ I->LaneMask = NewMask;
+ }
+ increaseSetPressure(P.MaxSetPressure, *MRI, RegUnit, PrevMask, NewMask);
+}
+
+void RegPressureTracker::discoverLiveIn(RegisterMaskPair Pair) {
+ discoverLiveInOrOut(Pair, P.LiveInRegs);
+}
+
+void RegPressureTracker::discoverLiveOut(RegisterMaskPair Pair) {
+ discoverLiveInOrOut(Pair, P.LiveOutRegs);
+}
+
+void RegPressureTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
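+  // Temporarily raise pressure by the dead-def lanes so the peak is reflected
+  // in the max pressure.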
+ for (const RegisterMaskPair &P : DeadDefs) {
+ Register Reg = P.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+ increaseRegPressure(Reg, LiveMask, BumpedMask);
+ }
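+  // Then lower it back down, leaving the current pressure unchanged.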
+ for (const RegisterMaskPair &P : DeadDefs) {
+ Register Reg = P.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+ decreaseRegPressure(Reg, BumpedMask, LiveMask);
+ }
+}
+
+/// Recede across the previous instruction. If LiveUses is provided, record any
+/// RegUnits that are made live by the current instruction's uses. This includes
+/// registers that are both defined and used by the instruction. If a pressure
+/// difference pointer is provided, record the changes in pressure caused by this
+/// instruction independent of liveness.
+void RegPressureTracker::recede(const RegisterOperands &RegOpers,
+ SmallVectorImpl<RegisterMaskPair> *LiveUses) {
+ assert(!CurrPos->isDebugOrPseudoInstr());
+
+ // Boost pressure for all dead defs together.
+ bumpDeadDefs(RegOpers.DeadDefs);
+
+ // Kill liveness at live defs.
+ // TODO: consider earlyclobbers?
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ Register Reg = Def.RegUnit;
+
+ LaneBitmask PreviousMask = LiveRegs.erase(Def);
+ LaneBitmask NewMask = PreviousMask & ~Def.LaneMask;
+
+ LaneBitmask LiveOut = Def.LaneMask & ~PreviousMask;
+ if (LiveOut.any()) {
+ discoverLiveOut(RegisterMaskPair(Reg, LiveOut));
+ // Retroactively model effects on pressure of the live out lanes.
+ increaseSetPressure(CurrSetPressure, *MRI, Reg, LaneBitmask::getNone(),
+ LiveOut);
+ PreviousMask = LiveOut;
+ }
+
+ if (NewMask.none()) {
+ // Add a 0 entry to LiveUses as a marker that the complete vreg has become
+ // dead.
+ if (TrackLaneMasks && LiveUses != nullptr)
+ setRegZero(*LiveUses, Reg);
+ }
+
+ decreaseRegPressure(Reg, PreviousMask, NewMask);
+ }
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
+
+ // Generate liveness for uses.
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ Register Reg = Use.RegUnit;
+ assert(Use.LaneMask.any());
+ LaneBitmask PreviousMask = LiveRegs.insert(Use);
+ LaneBitmask NewMask = PreviousMask | Use.LaneMask;
+ if (NewMask == PreviousMask)
+ continue;
+
+ // Did the register just become live?
+ if (PreviousMask.none()) {
+ if (LiveUses != nullptr) {
+ if (!TrackLaneMasks) {
+ addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ } else {
+ auto I =
+ llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
+ return Other.RegUnit == Reg;
+ });
+ bool IsRedef = I != LiveUses->end();
+ if (IsRedef) {
+ // ignore re-defs here...
+ assert(I->LaneMask.none());
+ removeRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ } else {
+ addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ }
+ }
+ }
+
+      // Discover live outs if this may be the first occurrence of this register.
+ if (RequireIntervals) {
+ LaneBitmask LiveOut = getLiveThroughAt(Reg, SlotIdx);
+ if (LiveOut.any())
+ discoverLiveOut(RegisterMaskPair(Reg, LiveOut));
+ }
+ }
+
+ increaseRegPressure(Reg, PreviousMask, NewMask);
+ }
+ if (TrackUntiedDefs) {
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ Register RegUnit = Def.RegUnit;
+ if (RegUnit.isVirtual() &&
+ (LiveRegs.contains(RegUnit) & Def.LaneMask).none())
+ UntiedDefs.insert(RegUnit);
+ }
+ }
+}
+
+void RegPressureTracker::recedeSkipDebugValues() {
+ assert(CurrPos != MBB->begin());
+ if (!isBottomClosed())
+ closeBottom();
+
+ // Open the top of the region using block iterators.
+ if (!RequireIntervals && isTopClosed())
+ static_cast<RegionPressure&>(P).openTop(CurrPos);
+
+ // Find the previous instruction.
+ CurrPos = prev_nodbg(CurrPos, MBB->begin());
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals && !CurrPos->isDebugOrPseudoInstr())
+ SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
+
+ // Open the top of the region using slot indexes.
+ if (RequireIntervals && isTopClosed())
+ static_cast<IntervalPressure&>(P).openTop(SlotIdx);
+}
+
+void RegPressureTracker::recede(SmallVectorImpl<RegisterMaskPair> *LiveUses) {
+ recedeSkipDebugValues();
+ if (CurrPos->isDebugInstr() || CurrPos->isPseudoProbe()) {
+ // It's possible to only have debug_value and pseudo probe instructions and
+ // hit the start of the block.
+ assert(CurrPos == MBB->begin());
+ return;
+ }
+
+ const MachineInstr &MI = *CurrPos;
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ } else if (RequireIntervals) {
+ RegOpers.detectDeadDefs(MI, *LIS);
+ }
+
+ recede(RegOpers, LiveUses);
+}
+
+/// Advance across the current instruction.
+void RegPressureTracker::advance(const RegisterOperands &RegOpers) {
+ assert(!TrackUntiedDefs && "unsupported mode");
+ assert(CurrPos != MBB->end());
+ if (!isTopClosed())
+ closeTop();
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = getCurrSlot();
+
+ // Open the bottom of the region using slot indexes.
+ if (isBottomClosed()) {
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).openBottom(SlotIdx);
+ else
+ static_cast<RegionPressure&>(P).openBottom(CurrPos);
+ }
+
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ Register Reg = Use.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask LiveIn = Use.LaneMask & ~LiveMask;
+ if (LiveIn.any()) {
+ discoverLiveIn(RegisterMaskPair(Reg, LiveIn));
+ increaseRegPressure(Reg, LiveMask, LiveMask | LiveIn);
+ LiveRegs.insert(RegisterMaskPair(Reg, LiveIn));
+ }
+ // Kill liveness at last uses.
+ if (RequireIntervals) {
+ LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+ if (LastUseMask.any()) {
+ LiveRegs.erase(RegisterMaskPair(Reg, LastUseMask));
+ decreaseRegPressure(Reg, LiveMask, LiveMask & ~LastUseMask);
+ }
+ }
+ }
+
+ // Generate liveness for defs.
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ LaneBitmask PreviousMask = LiveRegs.insert(Def);
+ LaneBitmask NewMask = PreviousMask | Def.LaneMask;
+ increaseRegPressure(Def.RegUnit, PreviousMask, NewMask);
+ }
+
+ // Boost pressure for all dead defs together.
+ bumpDeadDefs(RegOpers.DeadDefs);
+
+ // Find the next instruction.
+ CurrPos = next_nodbg(CurrPos, MBB->end());
+}
+
+void RegPressureTracker::advance() {
+ const MachineInstr &MI = *CurrPos;
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = getCurrSlot();
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ }
+ advance(RegOpers);
+}
+
+/// Find the max change in excess pressure across all sets.
+static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
+ ArrayRef<unsigned> NewPressureVec,
+ RegPressureDelta &Delta,
+ const RegisterClassInfo *RCI,
+ ArrayRef<unsigned> LiveThruPressureVec) {
+ Delta.Excess = PressureChange();
+ for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) {
+ unsigned POld = OldPressureVec[i];
+ unsigned PNew = NewPressureVec[i];
+ int PDiff = (int)PNew - (int)POld;
+ if (!PDiff) // No change in this set in the common case.
+ continue;
+ // Only consider change beyond the limit.
+ unsigned Limit = RCI->getRegPressureSetLimit(i);
+ if (!LiveThruPressureVec.empty())
+ Limit += LiveThruPressureVec[i];
+
+ if (Limit > POld) {
+ if (Limit > PNew)
+ PDiff = 0; // Under the limit
+ else
+ PDiff = PNew - Limit; // Just exceeded limit.
+ } else if (Limit > PNew)
+ PDiff = Limit - POld; // Just obeyed limit.
+
+ if (PDiff) {
+ Delta.Excess = PressureChange(i);
+ Delta.Excess.setUnitInc(PDiff);
+ break;
+ }
+ }
+}
+
+/// Find the max change in max pressure that either surpasses a critical PSet
+/// limit or exceeds the current MaxPressureLimit.
+///
+/// FIXME: comparing each element of the old and new MaxPressure vectors here is
+/// silly. It's done now to demonstrate the concept but will go away with a
+/// RegPressureTracker API change to work with pressure differences.
+static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
+ ArrayRef<unsigned> NewMaxPressureVec,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit,
+ RegPressureDelta &Delta) {
+ Delta.CriticalMax = PressureChange();
+ Delta.CurrentMax = PressureChange();
+
+ unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+ for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) {
+ unsigned POld = OldMaxPressureVec[i];
+ unsigned PNew = NewMaxPressureVec[i];
+ if (PNew == POld) // No change in this set in the common case.
+ continue;
+
+ if (!Delta.CriticalMax.isValid()) {
+ while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < i)
+ ++CritIdx;
+
+ if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) {
+ int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ if (PDiff > 0) {
+ Delta.CriticalMax = PressureChange(i);
+ Delta.CriticalMax.setUnitInc(PDiff);
+ }
+ }
+ }
+ // Find the first increase above MaxPressureLimit.
+ // (Ignores negative MDiff).
+ if (!Delta.CurrentMax.isValid() && PNew > MaxPressureLimit[i]) {
+ Delta.CurrentMax = PressureChange(i);
+ Delta.CurrentMax.setUnitInc(PNew - POld);
+ if (CritIdx == CritEnd || Delta.CriticalMax.isValid())
+ break;
+ }
+ }
+}
+
+/// Record the upward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
+ assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+
+ // Account for register pressure similar to RegPressureTracker::recede().
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/true);
+ assert(RegOpers.DeadDefs.size() == 0);
+ if (TrackLaneMasks)
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ else if (RequireIntervals)
+ RegOpers.detectDeadDefs(*MI, *LIS);
+
+  // Boost max pressure for all dead defs together: bumpDeadDefs raises and then
+  // restores CurrSetPressure, so only MaxSetPressure keeps the increase.
+ bumpDeadDefs(RegOpers.DeadDefs);
+
+ // Kill liveness at live defs.
+ for (const RegisterMaskPair &P : RegOpers.Defs) {
+ Register Reg = P.RegUnit;
+ LaneBitmask LiveLanes = LiveRegs.contains(Reg);
+ LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
+ LaneBitmask DefLanes = P.LaneMask;
+ LaneBitmask LiveAfter = (LiveLanes & ~DefLanes) | UseLanes;
+ decreaseRegPressure(Reg, LiveLanes, LiveAfter);
+ }
+ // Generate liveness for uses.
+ for (const RegisterMaskPair &P : RegOpers.Uses) {
+ Register Reg = P.RegUnit;
+ LaneBitmask LiveLanes = LiveRegs.contains(Reg);
+ LaneBitmask LiveAfter = LiveLanes | P.LaneMask;
+ increaseRegPressure(Reg, LiveLanes, LiveAfter);
+ }
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// bottom-up. Find the pressure set with the most change beyond its pressure
+/// limit based on the tracker's current pressure, and return the change in
+/// number of register units of that pressure set introduced by this
+/// instruction.
+///
+/// This assumes that the current LiveOut set is sufficient.
+///
+/// This is expensive for an on-the-fly query because it calls
+/// bumpUpwardPressure to recompute the pressure sets based on current
+/// liveness. This mainly exists to verify correctness, e.g. with
+/// -verify-misched. getUpwardPressureDelta is the fast version of this query
+/// that uses the per-SUnit cache of the PressureDiff.
+void RegPressureTracker::
+getMaxUpwardPressureDelta(const MachineInstr *MI, PressureDiff *PDiff,
+ RegPressureDelta &Delta,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) {
+ // Snapshot Pressure.
+ // FIXME: The snapshot heap space should persist. But I'm planning to
+ // summarize the pressure effect so we don't need to snapshot at all.
+ std::vector<unsigned> SavedPressure = CurrSetPressure;
+ std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+ bumpUpwardPressure(MI);
+
+ computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, RCI,
+ LiveThruPressure);
+ computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+ MaxPressureLimit, Delta);
+ assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+ Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
+
+ // Restore the tracker's state.
+ P.MaxSetPressure.swap(SavedMaxPressure);
+ CurrSetPressure.swap(SavedPressure);
+
+#ifndef NDEBUG
+ if (!PDiff)
+ return;
+
+ // Check if the alternate algorithm yields the same result.
+ RegPressureDelta Delta2;
+ getUpwardPressureDelta(MI, *PDiff, Delta2, CriticalPSets, MaxPressureLimit);
+ if (Delta != Delta2) {
+ dbgs() << "PDiff: ";
+ PDiff->dump(*TRI);
+ dbgs() << "DELTA: " << *MI;
+ if (Delta.Excess.isValid())
+ dbgs() << "Excess1 " << TRI->getRegPressureSetName(Delta.Excess.getPSet())
+ << " " << Delta.Excess.getUnitInc() << "\n";
+ if (Delta.CriticalMax.isValid())
+ dbgs() << "Critic1 " << TRI->getRegPressureSetName(Delta.CriticalMax.getPSet())
+ << " " << Delta.CriticalMax.getUnitInc() << "\n";
+ if (Delta.CurrentMax.isValid())
+ dbgs() << "CurrMx1 " << TRI->getRegPressureSetName(Delta.CurrentMax.getPSet())
+ << " " << Delta.CurrentMax.getUnitInc() << "\n";
+ if (Delta2.Excess.isValid())
+ dbgs() << "Excess2 " << TRI->getRegPressureSetName(Delta2.Excess.getPSet())
+ << " " << Delta2.Excess.getUnitInc() << "\n";
+ if (Delta2.CriticalMax.isValid())
+ dbgs() << "Critic2 " << TRI->getRegPressureSetName(Delta2.CriticalMax.getPSet())
+ << " " << Delta2.CriticalMax.getUnitInc() << "\n";
+ if (Delta2.CurrentMax.isValid())
+ dbgs() << "CurrMx2 " << TRI->getRegPressureSetName(Delta2.CurrentMax.getPSet())
+ << " " << Delta2.CurrentMax.getUnitInc() << "\n";
+ llvm_unreachable("RegP Delta Mismatch");
+ }
+#endif
+}
+
+/// This is the fast version of querying register pressure that does not
+/// directly depend on current liveness.
+///
+/// @param Delta captures information needed for heuristics.
+///
+/// @param CriticalPSets Are the pressure sets that are known to exceed some
+/// limit within the region, not necessarily at the current position.
+///
+/// @param MaxPressureLimit Is the max pressure within the region, not
+/// necessarily at the current position.
+void RegPressureTracker::
+getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
+ RegPressureDelta &Delta,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) const {
+ unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+ for (PressureDiff::const_iterator
+ PDiffI = PDiff.begin(), PDiffE = PDiff.end();
+ PDiffI != PDiffE && PDiffI->isValid(); ++PDiffI) {
+
+ unsigned PSetID = PDiffI->getPSet();
+ unsigned Limit = RCI->getRegPressureSetLimit(PSetID);
+ if (!LiveThruPressure.empty())
+ Limit += LiveThruPressure[PSetID];
+
+ unsigned POld = CurrSetPressure[PSetID];
+ unsigned MOld = P.MaxSetPressure[PSetID];
+ unsigned MNew = MOld;
+ // Ignore DeadDefs here because they aren't captured by PressureChange.
+ unsigned PNew = POld + PDiffI->getUnitInc();
+ assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld)
+ && "PSet overflow/underflow");
+ if (PNew > MOld)
+ MNew = PNew;
+ // Check if current pressure has exceeded the limit.
+ if (!Delta.Excess.isValid()) {
+ unsigned ExcessInc = 0;
+ if (PNew > Limit)
+ ExcessInc = POld > Limit ? PNew - POld : PNew - Limit;
+ else if (POld > Limit)
+ ExcessInc = Limit - POld;
+ if (ExcessInc) {
+ Delta.Excess = PressureChange(PSetID);
+ Delta.Excess.setUnitInc(ExcessInc);
+ }
+ }
+ // Check if max pressure has exceeded a critical pressure set max.
+ if (MNew == MOld)
+ continue;
+ if (!Delta.CriticalMax.isValid()) {
+ while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < PSetID)
+ ++CritIdx;
+
+ if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
+ int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {
+ Delta.CriticalMax = PressureChange(PSetID);
+ Delta.CriticalMax.setUnitInc(CritInc);
+ }
+ }
+ }
+ // Check if max pressure has exceeded the current max.
+ if (!Delta.CurrentMax.isValid() && MNew > MaxPressureLimit[PSetID]) {
+ Delta.CurrentMax = PressureChange(PSetID);
+ Delta.CurrentMax.setUnitInc(MNew - MOld);
+ }
+ }
+}
+
+/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
+/// The query starts with a lane bitmask which gets lanes/bits removed for every
+/// use we find.
+static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
+ SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ if (MO.isUndef())
+ continue;
+ const MachineInstr *MI = MO.getParent();
+ SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
+ if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx) {
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+ LastUseMask &= ~UseMask;
+ if (LastUseMask.none())
+ return LaneBitmask::getNone();
+ }
+ }
+ return LastUseMask;
+}
+
+LaneBitmask RegPressureTracker::getLiveLanesAt(Register RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit, Pos,
+ LaneBitmask::getAll(),
+ [](const LiveRange &LR, SlotIndex Pos) {
+ return LR.liveAt(Pos);
+ });
+}
+
+LaneBitmask RegPressureTracker::getLastUsedLanes(Register RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit,
+ Pos.getBaseIndex(), LaneBitmask::getNone(),
+ [](const LiveRange &LR, SlotIndex Pos) {
+ const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+ return S != nullptr && S->end == Pos.getRegSlot();
+ });
+}
+
+LaneBitmask RegPressureTracker::getLiveThroughAt(Register RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit, Pos,
+ LaneBitmask::getNone(),
+ [](const LiveRange &LR, SlotIndex Pos) {
+ const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+ return S != nullptr && S->start < Pos.getRegSlot(true) &&
+ S->end != Pos.getDeadSlot();
+ });
+}
+
+/// Record the downward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
+ assert(!MI->isDebugOrPseudoInstr() && "Expect a nondebug instruction.");
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+
+ // Account for register pressure similar to RegPressureTracker::recede().
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks)
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+
+ if (RequireIntervals) {
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ Register Reg = Use.RegUnit;
+ LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+ if (LastUseMask.none())
+ continue;
+      // The LastUseMask is queried from the liveness information of an
+      // instruction which may be further down the schedule. Some lanes may
+      // actually not be last uses for the current position.
+ // FIXME: allow the caller to pass in the list of vreg uses that remain
+ // to be bottom-scheduled to avoid searching uses at each query.
+ SlotIndex CurrIdx = getCurrSlot();
+ LastUseMask
+ = findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, LIS);
+ if (LastUseMask.none())
+ continue;
+
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask NewMask = LiveMask & ~LastUseMask;
+ decreaseRegPressure(Reg, LiveMask, NewMask);
+ }
+ }
+
+ // Generate liveness for defs.
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ Register Reg = Def.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask NewMask = LiveMask | Def.LaneMask;
+ increaseRegPressure(Reg, LiveMask, NewMask);
+ }
+
+ // Boost pressure for all dead defs together.
+ bumpDeadDefs(RegOpers.DeadDefs);
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// top-down. Find the register class with the most change in its pressure limit
+/// based on the tracker's current pressure, and return the number of excess
+/// register units of that pressure set introduced by this instruction.
+///
+/// This assumes that the current LiveIn set is sufficient.
+///
+/// This is expensive for an on-the-fly query because it calls
+/// bumpDownwardPressure to recompute the pressure sets based on current
+/// liveness. We don't yet have a fast version of downward pressure tracking
+/// analogous to getUpwardPressureDelta.
+void RegPressureTracker::
+getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) {
+ // Snapshot Pressure.
+ std::vector<unsigned> SavedPressure = CurrSetPressure;
+ std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+ bumpDownwardPressure(MI);
+
+ computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, RCI,
+ LiveThruPressure);
+ computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+ MaxPressureLimit, Delta);
+ assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+ Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
+
+ // Restore the tracker's state.
+ P.MaxSetPressure.swap(SavedMaxPressure);
+ CurrSetPressure.swap(SavedPressure);
+}
+
+/// Get the pressure of each PSet after traversing this instruction bottom-up.
+void RegPressureTracker::
+getUpwardPressure(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult) {
+ // Snapshot pressure.
+ PressureResult = CurrSetPressure;
+ MaxPressureResult = P.MaxSetPressure;
+
+ bumpUpwardPressure(MI);
+
+ // Current pressure becomes the result. Restore current pressure.
+ P.MaxSetPressure.swap(MaxPressureResult);
+ CurrSetPressure.swap(PressureResult);
+}
+
+/// Get the pressure of each PSet after traversing this instruction top-down.
+void RegPressureTracker::
+getDownwardPressure(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult) {
+ // Snapshot pressure.
+ PressureResult = CurrSetPressure;
+ MaxPressureResult = P.MaxSetPressure;
+
+ bumpDownwardPressure(MI);
+
+ // Current pressure becomes the result. Restore current pressure.
+ P.MaxSetPressure.swap(MaxPressureResult);
+ CurrSetPressure.swap(PressureResult);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterScavenging.cpp
new file mode 100644
index 000000000000..c00d3fde6426
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -0,0 +1,686 @@
+//===- RegisterScavenging.cpp - Machine register scavenging ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the machine register scavenger. It can provide
+/// information, such as unused registers, at any point in a machine basic
+/// block. It also provides a mechanism to make registers available by evicting
+/// them to spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "reg-scavenging"
+
+STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
+
+void RegScavenger::setRegUsed(Register Reg, LaneBitmask LaneMask) {
+ LiveUnits.addRegMasked(Reg, LaneMask);
+}
+
+void RegScavenger::init(MachineBasicBlock &MBB) {
+ MachineFunction &MF = *MBB.getParent();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ LiveUnits.init(*TRI);
+
+ assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) &&
+ "Target changed?");
+
+ // Self-initialize.
+ if (!this->MBB) {
+ NumRegUnits = TRI->getNumRegUnits();
+ KillRegUnits.resize(NumRegUnits);
+ DefRegUnits.resize(NumRegUnits);
+ TmpRegUnits.resize(NumRegUnits);
+ }
+ this->MBB = &MBB;
+
+ for (ScavengedInfo &SI : Scavenged) {
+ SI.Reg = 0;
+ SI.Restore = nullptr;
+ }
+
+ Tracking = false;
+}
+
+void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) {
+ init(MBB);
+ LiveUnits.addLiveIns(MBB);
+}
+
+void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) {
+ init(MBB);
+ LiveUnits.addLiveOuts(MBB);
+
+  // Move the internal iterator to the last instruction of the block.
+ if (!MBB.empty()) {
+ MBBI = std::prev(MBB.end());
+ Tracking = true;
+ }
+}
+
+void RegScavenger::addRegUnits(BitVector &BV, MCRegister Reg) {
+ for (MCRegUnit Unit : TRI->regunits(Reg))
+ BV.set(Unit);
+}
+
+void RegScavenger::removeRegUnits(BitVector &BV, MCRegister Reg) {
+ for (MCRegUnit Unit : TRI->regunits(Reg))
+ BV.reset(Unit);
+}
+
+void RegScavenger::determineKillsAndDefs() {
+ assert(Tracking && "Must be tracking to determine kills and defs");
+
+ MachineInstr &MI = *MBBI;
+ assert(!MI.isDebugInstr() && "Debug values have no kills or defs");
+
+ // Find out which registers are early clobbered, killed, defined, and marked
+ // def-dead in this instruction.
+ KillRegUnits.reset();
+ DefRegUnits.reset();
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask()) {
+ TmpRegUnits.reset();
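+      // A register unit is clobbered if the mask clobbers any of its roots.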
+ for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd; ++RU) {
+ for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) {
+ if (MO.clobbersPhysReg(*RURI)) {
+ TmpRegUnits.set(RU);
+ break;
+ }
+ }
+ }
+
+ // Apply the mask.
+ KillRegUnits |= TmpRegUnits;
+ }
+ if (!MO.isReg())
+ continue;
+ if (!MO.getReg().isPhysical() || isReserved(MO.getReg()))
+ continue;
+ MCRegister Reg = MO.getReg().asMCReg();
+
+ if (MO.isUse()) {
+ // Ignore undef uses.
+ if (MO.isUndef())
+ continue;
+ if (MO.isKill())
+ addRegUnits(KillRegUnits, Reg);
+ } else {
+ assert(MO.isDef());
+ if (MO.isDead())
+ addRegUnits(KillRegUnits, Reg);
+ else
+ addRegUnits(DefRegUnits, Reg);
+ }
+ }
+}
+
+void RegScavenger::forward() {
+ // Move ptr forward.
+ if (!Tracking) {
+ MBBI = MBB->begin();
+ Tracking = true;
+ } else {
+ assert(MBBI != MBB->end() && "Already past the end of the basic block!");
+ MBBI = std::next(MBBI);
+ }
+ assert(MBBI != MBB->end() && "Already at the end of the basic block!");
+
+ MachineInstr &MI = *MBBI;
+
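+  // Expire scavenge spill frameindex uses.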
+ for (ScavengedInfo &I : Scavenged) {
+ if (I.Restore != &MI)
+ continue;
+
+ I.Reg = 0;
+ I.Restore = nullptr;
+ }
+
+ if (MI.isDebugOrPseudoInstr())
+ return;
+
+ determineKillsAndDefs();
+
+ // Verify uses and defs.
+#ifndef NDEBUG
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isPhysical() || isReserved(Reg))
+ continue;
+ if (MO.isUse()) {
+ if (MO.isUndef())
+ continue;
+ if (!isRegUsed(Reg)) {
+ // Check if it's partial live: e.g.
+ // D0 = insert_subreg undef D0, S0
+ // ... D0
+ // The problem is the insert_subreg could be eliminated. The use of
+ // D0 is using a partially undef value. This is not *incorrect* since
+        // S1 can be freely clobbered.
+ // Ideally we would like a way to model this, but leaving the
+ // insert_subreg around causes both correctness and performance issues.
+ if (none_of(TRI->subregs(Reg),
+ [&](MCPhysReg SR) { return isRegUsed(SR); }) &&
+ none_of(TRI->superregs(Reg),
+ [&](MCPhysReg SR) { return isRegUsed(SR); })) {
+ MBB->getParent()->verify(nullptr, "In Register Scavenger");
+ llvm_unreachable("Using an undefined register!");
+ }
+ }
+ } else {
+ assert(MO.isDef());
+#if 0
+ // FIXME: Enable this once we've figured out how to correctly transfer
+ // implicit kills during codegen passes like the coalescer.
+ assert((KillRegs.test(Reg) || isUnused(Reg) ||
+ isLiveInButUnusedBefore(Reg, MI, MBB, TRI, MRI)) &&
+ "Re-defining a live register!");
+#endif
+ }
+ }
+#endif // NDEBUG
+
+ // Commit the changes.
+ setUnused(KillRegUnits);
+ setUsed(DefRegUnits);
+}
+
+void RegScavenger::backward() {
+ assert(Tracking && "Must be tracking to determine kills and defs");
+
+ const MachineInstr &MI = *MBBI;
+ LiveUnits.stepBackward(MI);
+
+ // Expire scavenge spill frameindex uses.
+ for (ScavengedInfo &I : Scavenged) {
+ if (I.Restore == &MI) {
+ I.Reg = 0;
+ I.Restore = nullptr;
+ }
+ }
+
+ if (MBBI == MBB->begin()) {
+ MBBI = MachineBasicBlock::iterator(nullptr);
+ Tracking = false;
+ } else
+ --MBBI;
+}
+
+bool RegScavenger::isRegUsed(Register Reg, bool includeReserved) const {
+ if (isReserved(Reg))
+ return includeReserved;
+ return !LiveUnits.available(Reg);
+}
+
+Register RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
+ for (Register Reg : *RC) {
+ if (!isRegUsed(Reg)) {
+ LLVM_DEBUG(dbgs() << "Scavenger found unused reg: " << printReg(Reg, TRI)
+ << "\n");
+ return Reg;
+ }
+ }
+ return 0;
+}
+
+BitVector RegScavenger::getRegsAvailable(const TargetRegisterClass *RC) {
+ BitVector Mask(TRI->getNumRegs());
+ for (Register Reg : *RC)
+ if (!isRegUsed(Reg))
+ Mask.set(Reg);
+ return Mask;
+}
+
+/// Given the live register units in \p LiveOut at position \p From, search
+/// backwards to find a register from \p AllocationOrder that is not
+/// used/clobbered until the point \p To. If there are multiple candidates,
+/// continue searching and pick the one that stays unused/unclobbered for the
+/// longest time.
+/// Returns the register and the earliest position we know it to be free, or
+/// the position MBB.end() if no register is available.
+static std::pair<MCPhysReg, MachineBasicBlock::iterator>
+findSurvivorBackwards(const MachineRegisterInfo &MRI,
+ MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
+ const LiveRegUnits &LiveOut, ArrayRef<MCPhysReg> AllocationOrder,
+ bool RestoreAfter) {
+ bool FoundTo = false;
+ MCPhysReg Survivor = 0;
+ MachineBasicBlock::iterator Pos;
+ MachineBasicBlock &MBB = *From->getParent();
+ unsigned InstrLimit = 25;
+ unsigned InstrCountDown = InstrLimit;
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ LiveRegUnits Used(TRI);
+
+ assert(From->getParent() == To->getParent() &&
+ "Target instruction is in other than current basic block, use "
+ "enterBasicBlockEnd first");
+
+ for (MachineBasicBlock::iterator I = From;; --I) {
+ const MachineInstr &MI = *I;
+
+ Used.accumulate(MI);
+
+ if (I == To) {
+ // See if one of the registers in RC wasn't used so far.
+ for (MCPhysReg Reg : AllocationOrder) {
+ if (!MRI.isReserved(Reg) && Used.available(Reg) &&
+ LiveOut.available(Reg))
+ return std::make_pair(Reg, MBB.end());
+ }
+ // Otherwise we will continue up to InstrLimit instructions to find
+ // the register which is not defined/used for the longest time.
+ FoundTo = true;
+ Pos = To;
+      // Note: It was fine so far to start our search at From; however, now
+      // that we have to spill and can only place the restore after From, also
+      // add the regs used/defined by std::next(From) to the set.
+ if (RestoreAfter)
+ Used.accumulate(*std::next(From));
+ }
+ if (FoundTo) {
+ // Don't search to FrameSetup instructions if we were searching from
+ // Non-FrameSetup instructions. Otherwise, the spill position may point
+ // before FrameSetup instructions.
+ if (!From->getFlag(MachineInstr::FrameSetup) &&
+ MI.getFlag(MachineInstr::FrameSetup))
+ break;
+
+ if (Survivor == 0 || !Used.available(Survivor)) {
+        MCPhysReg AvailableReg = 0;
+        for (MCPhysReg Reg : AllocationOrder) {
+          if (!MRI.isReserved(Reg) && Used.available(Reg)) {
+            AvailableReg = Reg;
+            break;
+          }
+        }
+        if (AvailableReg == 0)
+          break;
+        Survivor = AvailableReg;
+ }
+ if (--InstrCountDown == 0)
+ break;
+
+      // Keep searching when we find a vreg, since the spilled register will
+      // be useful for this other vreg as well later.
+ bool FoundVReg = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ FoundVReg = true;
+ break;
+ }
+ }
+ if (FoundVReg) {
+ InstrCountDown = InstrLimit;
+ Pos = I;
+ }
+ if (I == MBB.begin())
+ break;
+ }
+ assert(I != MBB.begin() && "Did not find target instruction while "
+ "iterating backwards");
+ }
+
+ return std::make_pair(Survivor, Pos);
+}
+
+static unsigned getFrameIndexOperandNum(MachineInstr &MI) {
+ unsigned i = 0;
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+ return i;
+}
+
+RegScavenger::ScavengedInfo &
+RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
+ MachineBasicBlock::iterator Before,
+ MachineBasicBlock::iterator &UseMI) {
+ // Find an available scavenging slot with size and alignment matching
+ // the requirements of the class RC.
+ const MachineFunction &MF = *Before->getMF();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ unsigned NeedSize = TRI->getSpillSize(RC);
+ Align NeedAlign = TRI->getSpillAlign(RC);
+
+ unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max();
+ int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
+ for (unsigned I = 0; I < Scavenged.size(); ++I) {
+ if (Scavenged[I].Reg != 0)
+ continue;
+ // Verify that this slot is valid for this register.
+ int FI = Scavenged[I].FrameIndex;
+ if (FI < FIB || FI >= FIE)
+ continue;
+ unsigned S = MFI.getObjectSize(FI);
+ Align A = MFI.getObjectAlign(FI);
+ if (NeedSize > S || NeedAlign > A)
+ continue;
+ // Avoid wasting slots with large size and/or large alignment. Pick one
+ // that is the best fit for this register class (in street metric).
+ // Picking a larger slot than necessary could happen if a slot for a
+ // larger register is reserved before a slot for a smaller one. When
+ // trying to spill a smaller register, the large slot would be found
+ // first, thus making it impossible to spill the larger register later.
+ unsigned D = (S - NeedSize) + (A.value() - NeedAlign.value());
+ if (D < Diff) {
+ SI = I;
+ Diff = D;
+ }
+ }
+
+ if (SI == Scavenged.size()) {
+    // We need to scavenge a register but have no spill slot; the target
+    // must know how to do it (if not, we'll report a fatal error below).
+ Scavenged.push_back(ScavengedInfo(FIE));
+ }
+
+ // Avoid infinite regress
+ Scavenged[SI].Reg = Reg;
+
+ // If the target knows how to save/restore the register, let it do so;
+ // otherwise, use the emergency stack spill slot.
+ if (!TRI->saveScavengerRegister(*MBB, Before, UseMI, &RC, Reg)) {
+ // Spill the scavenged register before \p Before.
+ int FI = Scavenged[SI].FrameIndex;
+ if (FI < FIB || FI >= FIE) {
+ report_fatal_error(Twine("Error while trying to spill ") +
+ TRI->getName(Reg) + " from class " +
+ TRI->getRegClassName(&RC) +
+ ": Cannot scavenge register without an emergency "
+ "spill slot!");
+ }
+ TII->storeRegToStackSlot(*MBB, Before, Reg, true, FI, &RC, TRI, Register());
+ MachineBasicBlock::iterator II = std::prev(Before);
+
+ unsigned FIOperandNum = getFrameIndexOperandNum(*II);
+ TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
+
+ // Restore the scavenged register before its use (or first terminator).
+ TII->loadRegFromStackSlot(*MBB, UseMI, Reg, FI, &RC, TRI, Register());
+ II = std::prev(UseMI);
+
+ FIOperandNum = getFrameIndexOperandNum(*II);
+ TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
+ }
+ return Scavenged[SI];
+}
+
+Register RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC,
+ MachineBasicBlock::iterator To,
+ bool RestoreAfter, int SPAdj,
+ bool AllowSpill) {
+ const MachineBasicBlock &MBB = *To->getParent();
+ const MachineFunction &MF = *MBB.getParent();
+
+ // Find the register whose use is furthest away.
+ MachineBasicBlock::iterator UseMI;
+ ArrayRef<MCPhysReg> AllocationOrder = RC.getRawAllocationOrder(MF);
+ std::pair<MCPhysReg, MachineBasicBlock::iterator> P =
+ findSurvivorBackwards(*MRI, MBBI, To, LiveUnits, AllocationOrder,
+ RestoreAfter);
+ MCPhysReg Reg = P.first;
+ MachineBasicBlock::iterator SpillBefore = P.second;
+ // Found an available register?
+ if (Reg != 0 && SpillBefore == MBB.end()) {
+ LLVM_DEBUG(dbgs() << "Scavenged free register: " << printReg(Reg, TRI)
+ << '\n');
+ return Reg;
+ }
+
+ if (!AllowSpill)
+ return 0;
+
+ assert(Reg != 0 && "No register left to scavenge!");
+
+ MachineBasicBlock::iterator ReloadAfter =
+ RestoreAfter ? std::next(MBBI) : MBBI;
+ MachineBasicBlock::iterator ReloadBefore = std::next(ReloadAfter);
+ if (ReloadBefore != MBB.end())
+ LLVM_DEBUG(dbgs() << "Reload before: " << *ReloadBefore << '\n');
+ ScavengedInfo &Scavenged = spill(Reg, RC, SPAdj, SpillBefore, ReloadBefore);
+ Scavenged.Restore = &*std::prev(SpillBefore);
+ LiveUnits.removeReg(Reg);
+ LLVM_DEBUG(dbgs() << "Scavenged register with spill: " << printReg(Reg, TRI)
+ << " until " << *SpillBefore);
+ return Reg;
+}
+
+/// Allocate a register for the virtual register \p VReg. The last use of
+/// \p VReg is around the current position of the register scavenger \p RS.
+/// \p ReserveAfter controls whether the scavenged register needs to be reserved
+/// after the current instruction; otherwise, it will only be reserved before the
+/// current instruction.
+static Register scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS,
+ Register VReg, bool ReserveAfter) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+#ifndef NDEBUG
+ // Verify that all definitions and uses are in the same basic block.
+ const MachineBasicBlock *CommonMBB = nullptr;
+ // Real definition for the reg, re-definitions are not considered.
+ const MachineInstr *RealDef = nullptr;
+ for (MachineOperand &MO : MRI.reg_nodbg_operands(VReg)) {
+ MachineBasicBlock *MBB = MO.getParent()->getParent();
+ if (CommonMBB == nullptr)
+ CommonMBB = MBB;
+ assert(MBB == CommonMBB && "All defs+uses must be in the same basic block");
+ if (MO.isDef()) {
+ const MachineInstr &MI = *MO.getParent();
+ if (!MI.readsRegister(VReg, &TRI)) {
+ assert((!RealDef || RealDef == &MI) &&
+ "Can have at most one definition which is not a redefinition");
+ RealDef = &MI;
+ }
+ }
+ }
+ assert(RealDef != nullptr && "Must have at least 1 Def");
+#endif
+
+  // We should only have one definition of the register. However, to
+  // accommodate the requirements of two-address code we also allow definitions
+  // in subsequent instructions provided they also read the register. That way
+  // we get a single contiguous lifetime.
+  //
+  // Definitions in MRI.def_begin() are unordered; search for the first.
+ MachineRegisterInfo::def_iterator FirstDef = llvm::find_if(
+ MRI.def_operands(VReg), [VReg, &TRI](const MachineOperand &MO) {
+ return !MO.getParent()->readsRegister(VReg, &TRI);
+ });
+ assert(FirstDef != MRI.def_end() &&
+ "Must have one definition that does not redefine vreg");
+ MachineInstr &DefMI = *FirstDef->getParent();
+
+ // The register scavenger will report a free register inserting an emergency
+ // spill/reload if necessary.
+ int SPAdj = 0;
+ const TargetRegisterClass &RC = *MRI.getRegClass(VReg);
+ Register SReg = RS.scavengeRegisterBackwards(RC, DefMI.getIterator(),
+ ReserveAfter, SPAdj);
+ MRI.replaceRegWith(VReg, SReg);
+ ++NumScavengedRegs;
+ return SReg;
+}
+
+/// Allocate (scavenge) vregs inside a single basic block.
+/// Returns true if the target spill callback created new vregs and a 2nd pass
+/// is necessary.
+static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI,
+ RegScavenger &RS,
+ MachineBasicBlock &MBB) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ RS.enterBasicBlockEnd(MBB);
+
+ unsigned InitialNumVirtRegs = MRI.getNumVirtRegs();
+ bool NextInstructionReadsVReg = false;
+ for (MachineBasicBlock::iterator I = MBB.end(); I != MBB.begin(); ) {
+ --I;
+ // Move RegScavenger to the position between *I and *std::next(I).
+ RS.backward(I);
+
+ // Look for unassigned vregs in the uses of *std::next(I).
+ if (NextInstructionReadsVReg) {
+ MachineBasicBlock::iterator N = std::next(I);
+ const MachineInstr &NMI = *N;
+ for (const MachineOperand &MO : NMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ // We only care about virtual registers and ignore virtual registers
+ // created by the target callbacks in the process (those will be handled
+ // in a scavenging round).
+ if (!Reg.isVirtual() ||
+ Register::virtReg2Index(Reg) >= InitialNumVirtRegs)
+ continue;
+ if (!MO.readsReg())
+ continue;
+
+ Register SReg = scavengeVReg(MRI, RS, Reg, true);
+ N->addRegisterKilled(SReg, &TRI, false);
+ RS.setRegUsed(SReg);
+ }
+ }
+
+ // Look for unassigned vregs in the defs of *I.
+ NextInstructionReadsVReg = false;
+ const MachineInstr &MI = *I;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ // Only vregs, no newly created vregs (see above).
+ if (!Reg.isVirtual() ||
+ Register::virtReg2Index(Reg) >= InitialNumVirtRegs)
+ continue;
+ // We have to look at all operands anyway so we can precalculate here
+      // whether there is a reading operand. This allows us to skip the use
+ // step in the next iteration if there was none.
+ assert(!MO.isInternalRead() && "Cannot assign inside bundles");
+ assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses");
+ if (MO.readsReg()) {
+ NextInstructionReadsVReg = true;
+ }
+ if (MO.isDef()) {
+ Register SReg = scavengeVReg(MRI, RS, Reg, false);
+ I->addRegisterDead(SReg, &TRI, false);
+ }
+ }
+ }
+#ifndef NDEBUG
+ for (const MachineOperand &MO : MBB.front().operands()) {
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+ assert(!MO.isInternalRead() && "Cannot assign inside bundles");
+ assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses");
+ assert(!MO.readsReg() && "Vreg use in first instruction not allowed");
+ }
+#endif
+
+ return MRI.getNumVirtRegs() != InitialNumVirtRegs;
+}
+
+void llvm::scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS) {
+ // FIXME: Iterating over the instruction stream is unnecessary. We can simply
+ // iterate over the vreg use list, which at this point only contains machine
+  // operands for which eliminateFrameIndex needs a new scratch reg.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Shortcut.
+ if (MRI.getNumVirtRegs() == 0) {
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+ return;
+ }
+
+ // Run through the instructions and find any virtual registers.
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+
+ bool Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB);
+ if (Again) {
+ LLVM_DEBUG(dbgs() << "Warning: Required two scavenging passes for block "
+ << MBB.getName() << '\n');
+ Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB);
+ // The target required a 2nd run (because it created new vregs while
+      // spilling). Refuse to do another pass to keep compile time in check.
+ if (Again)
+ report_fatal_error("Incomplete scavenging after 2nd pass");
+ }
+ }
+
+ MRI.clearVirtRegs();
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+}
+
+namespace {
+
+/// This class runs register scavenging independently of the
+/// PrologEpilogInserter. It is used for testing.
+class ScavengerTest : public MachineFunctionPass {
+public:
+ static char ID;
+
+ ScavengerTest() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetFrameLowering &TFL = *STI.getFrameLowering();
+
+ RegScavenger RS;
+    // Let's hope that calling those outside of PrologEpilogInserter works
+    // well enough to initialize the scavenger with some emergency spill slots
+    // for the target.
+ BitVector SavedRegs;
+ TFL.determineCalleeSaves(MF, SavedRegs, &RS);
+ TFL.processFunctionBeforeFrameFinalized(MF, &RS);
+
+ // Let's scavenge the current function
+ scavengeFrameVirtualRegs(MF, RS);
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+char ScavengerTest::ID;
+
+INITIALIZE_PASS(ScavengerTest, "scavenger-test",
+ "Scavenge virtual registers inside basic blocks", false, false)
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RegisterUsageInfo.cpp
new file mode 100644
index 000000000000..51bac3fc0a23
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RegisterUsageInfo.cpp
@@ -0,0 +1,99 @@
+//===- RegisterUsageInfo.cpp - Register Usage Information Storage ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<bool> DumpRegUsage(
+ "print-regusage", cl::init(false), cl::Hidden,
+ cl::desc("print register usage details collected for analysis."));
+
+INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info",
+ "Register Usage Information Storage", false, true)
+
+char PhysicalRegisterUsageInfo::ID = 0;
+
+void PhysicalRegisterUsageInfo::setTargetMachine(const LLVMTargetMachine &TM) {
+ this->TM = &TM;
+}
+
+bool PhysicalRegisterUsageInfo::doInitialization(Module &M) {
+ RegMasks.grow(M.size());
+ return false;
+}
+
+bool PhysicalRegisterUsageInfo::doFinalization(Module &M) {
+ if (DumpRegUsage)
+ print(errs());
+
+ RegMasks.shrink_and_clear();
+ return false;
+}
+
+void PhysicalRegisterUsageInfo::storeUpdateRegUsageInfo(
+ const Function &FP, ArrayRef<uint32_t> RegMask) {
+ RegMasks[&FP] = RegMask;
+}
+
+ArrayRef<uint32_t>
+PhysicalRegisterUsageInfo::getRegUsageInfo(const Function &FP) {
+ auto It = RegMasks.find(&FP);
+ if (It != RegMasks.end())
+ return ArrayRef<uint32_t>(It->second);
+ return ArrayRef<uint32_t>();
+}
+
+void PhysicalRegisterUsageInfo::print(raw_ostream &OS, const Module *M) const {
+ using FuncPtrRegMaskPair = std::pair<const Function *, std::vector<uint32_t>>;
+
+ SmallVector<const FuncPtrRegMaskPair *, 64> FPRMPairVector;
+
+  // Create a vector of pointers to RegMasks entries.
+ for (const auto &RegMask : RegMasks)
+ FPRMPairVector.push_back(&RegMask);
+
+  // Sort the vector to print the analysis in alphabetical order of function
+  // name.
+ llvm::sort(
+ FPRMPairVector,
+ [](const FuncPtrRegMaskPair *A, const FuncPtrRegMaskPair *B) -> bool {
+ return A->first->getName() < B->first->getName();
+ });
+
+ for (const FuncPtrRegMaskPair *FPRMPair : FPRMPairVector) {
+ OS << FPRMPair->first->getName() << " "
+ << "Clobbered Registers: ";
+ const TargetRegisterInfo *TRI
+ = TM->getSubtarget<TargetSubtargetInfo>(*(FPRMPair->first))
+ .getRegisterInfo();
+
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+ if (MachineOperand::clobbersPhysReg(&(FPRMPair->second[0]), PReg))
+ OS << printReg(PReg, TRI) << " ";
+ }
+ OS << "\n";
+ }
+}
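The storage pass above only records and prints regmasks; interprocedural register allocation clients read them back through getRegUsageInfo(). Below is a minimal sketch of such a client, assuming the usual legacy-pass-manager idiom; the pass name RegMaskQueryExample and the queried register number are illustrative and not part of this change.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/RegisterUsageInfo.h"
#include <cstdint>

namespace {
// Hypothetical consumer; not part of the file above.
class RegMaskQueryExample : public llvm::MachineFunctionPass {
public:
  static char ID;
  RegMaskQueryExample() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    AU.addRequired<llvm::PhysicalRegisterUsageInfo>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(llvm::MachineFunction &MF) override {
    auto &PRUI = getAnalysis<llvm::PhysicalRegisterUsageInfo>();
    // An empty ArrayRef means no usage info was recorded for this function.
    llvm::ArrayRef<uint32_t> Mask = PRUI.getRegUsageInfo(MF.getFunction());
    if (!Mask.empty()) {
      // The same query print() performs above; register number 1 is arbitrary.
      bool Clobbered = llvm::MachineOperand::clobbersPhysReg(Mask.data(), 1);
      (void)Clobbered;
    }
    return false;
  }
};
char RegMaskQueryExample::ID = 0;
} // end anonymous namespace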
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp
new file mode 100644
index 000000000000..feb31e59f5fd
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp
@@ -0,0 +1,227 @@
+//===- RemoveRedundantDebugValues.cpp - Remove Redundant Debug Value MIs --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+
+/// \file RemoveRedundantDebugValues.cpp
+///
+/// The RemoveRedundantDebugValues pass removes redundant DBG_VALUEs that
+/// appear in MIR after the register allocator.
+
+#define DEBUG_TYPE "removeredundantdebugvalues"
+
+using namespace llvm;
+
+STATISTIC(NumRemovedBackward, "Number of DBG_VALUEs removed (backward scan)");
+STATISTIC(NumRemovedForward, "Number of DBG_VALUEs removed (forward scan)");
+
+namespace {
+
+class RemoveRedundantDebugValues : public MachineFunctionPass {
+public:
+ static char ID;
+
+ RemoveRedundantDebugValues();
+
+ bool reduceDbgValues(MachineFunction &MF);
+
+ /// Remove redundant debug value MIs for the given machine function.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+char RemoveRedundantDebugValues::ID = 0;
+
+char &llvm::RemoveRedundantDebugValuesID = RemoveRedundantDebugValues::ID;
+
+INITIALIZE_PASS(RemoveRedundantDebugValues, DEBUG_TYPE,
+ "Remove Redundant DEBUG_VALUE analysis", false, false)
+
+/// Default construct and initialize the pass.
+RemoveRedundantDebugValues::RemoveRedundantDebugValues()
+ : MachineFunctionPass(ID) {
+ initializeRemoveRedundantDebugValuesPass(*PassRegistry::getPassRegistry());
+}
+
+// This analysis aims to remove redundant DBG_VALUEs by scanning forward
+// through a basic block, treating the first DBG_VALUE for a variable as
+// valid until its (location) operand is clobbered/modified.
+// For example:
+// (1) DBG_VALUE $edi, !"var1", ...
+// (2) <block of code that does not affect $edi>
+// (3) DBG_VALUE $edi, !"var1", ...
+// ...
+// in this case, we can remove (3).
+// TODO: Support DBG_VALUE_LIST and other debug instructions.
+static bool reduceDbgValsForwardScan(MachineBasicBlock &MBB) {
+ LLVM_DEBUG(dbgs() << "\n == Forward Scan == \n");
+
+ SmallVector<MachineInstr *, 8> DbgValsToBeRemoved;
+ DenseMap<DebugVariable, std::pair<MachineOperand *, const DIExpression *>>
+ VariableMap;
+ const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
+
+ for (auto &MI : MBB) {
+ if (MI.isDebugValue()) {
+ DebugVariable Var(MI.getDebugVariable(), std::nullopt,
+ MI.getDebugLoc()->getInlinedAt());
+ auto VMI = VariableMap.find(Var);
+ // Just stop tracking this variable, until we cover DBG_VALUE_LIST.
+ // 1 DBG_VALUE $rax, "x", DIExpression()
+ // ...
+ // 2 DBG_VALUE_LIST "x", DIExpression(...), $rax, $rbx
+ // ...
+ // 3 DBG_VALUE $rax, "x", DIExpression()
+ if (MI.isDebugValueList() && VMI != VariableMap.end()) {
+ VariableMap.erase(VMI);
+ continue;
+ }
+
+ MachineOperand &Loc = MI.getDebugOperand(0);
+ if (!Loc.isReg()) {
+        // If it's not a register, just stop tracking this variable.
+ if (VMI != VariableMap.end())
+ VariableMap.erase(VMI);
+ continue;
+ }
+
+ // We have found a new value for a variable.
+ if (VMI == VariableMap.end() ||
+ VMI->second.first->getReg() != Loc.getReg() ||
+ VMI->second.second != MI.getDebugExpression()) {
+ VariableMap[Var] = {&Loc, MI.getDebugExpression()};
+ continue;
+ }
+
+ // Found an identical DBG_VALUE, so it can be considered
+ // for later removal.
+ DbgValsToBeRemoved.push_back(&MI);
+ }
+
+ if (MI.isMetaInstruction())
+ continue;
+
+ // Stop tracking any location that is clobbered by this instruction.
+ for (auto &Var : VariableMap) {
+ auto &LocOp = Var.second.first;
+ if (MI.modifiesRegister(LocOp->getReg(), TRI))
+ VariableMap.erase(Var.first);
+ }
+ }
+
+ for (auto &Instr : DbgValsToBeRemoved) {
+ LLVM_DEBUG(dbgs() << "removing "; Instr->dump());
+ Instr->eraseFromParent();
+ ++NumRemovedForward;
+ }
+
+ return !DbgValsToBeRemoved.empty();
+}
+
+// This analysis aims to remove redundant DBG_VALUEs by going backward
+// in the basic block and removing all but the last DBG_VALUE for any
+// given variable in a set of consecutive DBG_VALUE instructions.
+// For example:
+// (1) DBG_VALUE $edi, !"var1", ...
+// (2) DBG_VALUE $esi, !"var2", ...
+// (3) DBG_VALUE $edi, !"var1", ...
+// ...
+// in this case, we can remove (1).
+static bool reduceDbgValsBackwardScan(MachineBasicBlock &MBB) {
+ LLVM_DEBUG(dbgs() << "\n == Backward Scan == \n");
+ SmallVector<MachineInstr *, 8> DbgValsToBeRemoved;
+ SmallDenseSet<DebugVariable> VariableSet;
+
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ if (MI.isDebugValue()) {
+ DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(),
+ MI.getDebugLoc()->getInlinedAt());
+ auto R = VariableSet.insert(Var);
+ // If it is a DBG_VALUE describing a constant as:
+ // DBG_VALUE 0, ...
+ // we just don't consider such instructions as candidates
+ // for redundant removal.
+ if (MI.isNonListDebugValue()) {
+ MachineOperand &Loc = MI.getDebugOperand(0);
+ if (!Loc.isReg()) {
+ // If we have already encountered this variable, just stop
+ // tracking it.
+ if (!R.second)
+ VariableSet.erase(Var);
+ continue;
+ }
+ }
+
+ // We have already encountered the value for this variable,
+ // so this one can be deleted.
+ if (!R.second)
+ DbgValsToBeRemoved.push_back(&MI);
+ continue;
+ }
+
+ // If we encountered a non-DBG_VALUE, try to find the next
+ // sequence with consecutive DBG_VALUE instructions.
+ VariableSet.clear();
+ }
+
+ for (auto &Instr : DbgValsToBeRemoved) {
+ LLVM_DEBUG(dbgs() << "removing "; Instr->dump());
+ Instr->eraseFromParent();
+ ++NumRemovedBackward;
+ }
+
+ return !DbgValsToBeRemoved.empty();
+}
+
+bool RemoveRedundantDebugValues::reduceDbgValues(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "\nDebug Value Reduction\n");
+
+ bool Changed = false;
+
+ for (auto &MBB : MF) {
+ Changed |= reduceDbgValsBackwardScan(MBB);
+ Changed |= reduceDbgValsForwardScan(MBB);
+ }
+
+ return Changed;
+}
+
+bool RemoveRedundantDebugValues::runOnMachineFunction(MachineFunction &MF) {
+ // Skip functions without debugging information.
+ if (!MF.getFunction().getSubprogram())
+ return false;
+
+ // Skip functions from NoDebug compilation units.
+ if (MF.getFunction().getSubprogram()->getUnit()->getEmissionKind() ==
+ DICompileUnit::NoDebug)
+ return false;
+
+ bool Changed = reduceDbgValues(MF);
+ return Changed;
+}
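Only the pass ID (RemoveRedundantDebugValuesID) is exported above, so a codegen pipeline schedules the pass by ID rather than through a create*() function. A minimal sketch under that assumption follows; the subclass name and the choice of hook are illustrative, not taken from this change.

#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"

// Hypothetical pass-config subclass; only the addPass() call is the point.
class ExamplePassConfig : public llvm::TargetPassConfig {
  // Constructor and the rest of the target configuration omitted.
  void addPreEmitPass() override {
    // addPass(AnalysisID) resolves the ID against the registry entry that the
    // INITIALIZE_PASS macro above created.
    addPass(&llvm::RemoveRedundantDebugValuesID);
  }
};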
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/contrib/llvm-project/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
new file mode 100644
index 000000000000..bc3ef1c0329a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -0,0 +1,405 @@
+//===-- RenameIndependentSubregs.cpp - Rename Independent Subregisters ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// The RenameIndependentSubregs pass looks for virtual registers with
+/// independently used subregisters and renames them to new virtual registers.
+/// Example: In the following:
+///   %0:sub0<read-undef> = ...
+///   %0:sub1 = ...
+///   use %0:sub0
+///   %0:sub0 = ...
+///   use %0:sub0
+///   use %0:sub1
+/// sub0 and sub1 are never used together, and we have two independent sub0
+/// definitions. This pass will rename to:
+///   %0:sub0<read-undef> = ...
+///   %1:sub1<read-undef> = ...
+///   use %0:sub0
+///   %2:sub0<read-undef> = ...
+///   use %2:sub0
+///   use %1:sub1
+//
+//===----------------------------------------------------------------------===//
+
+#include "LiveRangeUtils.h"
+#include "PHIEliminationUtils.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "rename-independent-subregs"
+
+namespace {
+
+class RenameIndependentSubregs : public MachineFunctionPass {
+public:
+ static char ID;
+ RenameIndependentSubregs() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Rename Disconnected Subregister Components";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ struct SubRangeInfo {
+ ConnectedVNInfoEqClasses ConEQ;
+ LiveInterval::SubRange *SR;
+ unsigned Index;
+
+ SubRangeInfo(LiveIntervals &LIS, LiveInterval::SubRange &SR,
+ unsigned Index)
+ : ConEQ(LIS), SR(&SR), Index(Index) {}
+ };
+
+ /// Split unrelated subregister components and rename them to new vregs.
+ bool renameComponents(LiveInterval &LI) const;
+
+ /// Build a vector of SubRange infos and a union find set of
+ /// equivalence classes.
+ /// Returns true if more than 1 equivalence class was found.
+ bool findComponents(IntEqClasses &Classes,
+ SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ LiveInterval &LI) const;
+
+ /// Distribute the LiveInterval segments into the new LiveIntervals
+ /// belonging to their class.
+ void distribute(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+  /// Construct the main live range and add missing undef+dead flags.
+ void computeMainRangesFixFlags(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+ /// Rewrite Machine Operands to use the new vreg belonging to their class.
+ void rewriteOperands(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+
+ LiveIntervals *LIS = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+};
+
+} // end anonymous namespace
+
+char RenameIndependentSubregs::ID;
+
+char &llvm::RenameIndependentSubregsID = RenameIndependentSubregs::ID;
+
+INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, DEBUG_TYPE,
+ "Rename Independent Subregisters", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(RenameIndependentSubregs, DEBUG_TYPE,
+ "Rename Independent Subregisters", false, false)
+
+bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const {
+ // Shortcut: We cannot have split components with a single definition.
+ if (LI.valnos.size() < 2)
+ return false;
+
+ SmallVector<SubRangeInfo, 4> SubRangeInfos;
+ IntEqClasses Classes;
+ if (!findComponents(Classes, SubRangeInfos, LI))
+ return false;
+
+ // Create a new VReg for each class.
+ Register Reg = LI.reg();
+ const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+ SmallVector<LiveInterval*, 4> Intervals;
+ Intervals.push_back(&LI);
+ LLVM_DEBUG(dbgs() << printReg(Reg) << ": Found " << Classes.getNumClasses()
+ << " equivalence classes.\n");
+ LLVM_DEBUG(dbgs() << printReg(Reg) << ": Splitting into newly created:");
+ for (unsigned I = 1, NumClasses = Classes.getNumClasses(); I < NumClasses;
+ ++I) {
+ Register NewVReg = MRI->createVirtualRegister(RegClass);
+ LiveInterval &NewLI = LIS->createEmptyInterval(NewVReg);
+ Intervals.push_back(&NewLI);
+ LLVM_DEBUG(dbgs() << ' ' << printReg(NewVReg));
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+
+ rewriteOperands(Classes, SubRangeInfos, Intervals);
+ distribute(Classes, SubRangeInfos, Intervals);
+ computeMainRangesFixFlags(Classes, SubRangeInfos, Intervals);
+ return true;
+}
+
+bool RenameIndependentSubregs::findComponents(IntEqClasses &Classes,
+ SmallVectorImpl<RenameIndependentSubregs::SubRangeInfo> &SubRangeInfos,
+ LiveInterval &LI) const {
+ // First step: Create connected components for the VNInfos inside the
+ // subranges and count the global number of such components.
+ unsigned NumComponents = 0;
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ SubRangeInfos.push_back(SubRangeInfo(*LIS, SR, NumComponents));
+ ConnectedVNInfoEqClasses &ConEQ = SubRangeInfos.back().ConEQ;
+
+ unsigned NumSubComponents = ConEQ.Classify(SR);
+ NumComponents += NumSubComponents;
+ }
+ // Shortcut: With only 1 subrange, the normal separate component tests are
+ // enough and we do not need to perform the union-find on the subregister
+ // segments.
+ if (SubRangeInfos.size() < 2)
+ return false;
+
+ // Next step: Build union-find structure over all subranges and merge classes
+ // across subranges when they are affected by the same MachineOperand.
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ Classes.grow(NumComponents);
+ Register Reg = LI.reg();
+ for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+ unsigned MergedID = ~0u;
+ for (RenameIndependentSubregs::SubRangeInfo &SRInfo : SubRangeInfos) {
+ const LiveInterval::SubRange &SR = *SRInfo.SR;
+ if ((SR.LaneMask & LaneMask).none())
+ continue;
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent());
+ Pos = MO.isDef() ? Pos.getRegSlot(MO.isEarlyClobber())
+ : Pos.getBaseIndex();
+ const VNInfo *VNI = SR.getVNInfoAt(Pos);
+ if (VNI == nullptr)
+ continue;
+
+      // Map to local representative ID.
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(VNI);
+ // Global ID
+ unsigned ID = LocalID + SRInfo.Index;
+ // Merge other sets
+ MergedID = MergedID == ~0u ? ID : Classes.join(MergedID, ID);
+ }
+ }
+
+ // Early exit if we ended up with a single equivalence class.
+ Classes.compress();
+ unsigned NumClasses = Classes.getNumClasses();
+ return NumClasses > 1;
+}
+
+void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ unsigned Reg = Intervals[0]->reg();
+ for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg),
+ E = MRI->reg_nodbg_end(); I != E; ) {
+ MachineOperand &MO = *I++;
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+
+ auto *MI = MO.getParent();
+ SlotIndex Pos = LIS->getInstructionIndex(*MI);
+ Pos = MO.isDef() ? Pos.getRegSlot(MO.isEarlyClobber())
+ : Pos.getBaseIndex();
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+
+ unsigned ID = ~0u;
+ for (const SubRangeInfo &SRInfo : SubRangeInfos) {
+ const LiveInterval::SubRange &SR = *SRInfo.SR;
+ if ((SR.LaneMask & LaneMask).none())
+ continue;
+ const VNInfo *VNI = SR.getVNInfoAt(Pos);
+ if (VNI == nullptr)
+ continue;
+
+      // Map to local representative ID.
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(VNI);
+ // Global ID
+ ID = Classes[LocalID + SRInfo.Index];
+ break;
+ }
+
+ unsigned VReg = Intervals[ID]->reg();
+ MO.setReg(VReg);
+
+ if (MO.isTied() && Reg != VReg) {
+ /// Undef use operands are not tracked in the equivalence class,
+ /// but need to be updated if they are tied; take care to only
+ /// update the tied operand.
+ unsigned OperandNo = MO.getOperandNo();
+ unsigned TiedIdx = MI->findTiedOperandIdx(OperandNo);
+ MI->getOperand(TiedIdx).setReg(VReg);
+
+ // above substitution breaks the iterator, so restart.
+ I = MRI->reg_nodbg_begin(Reg);
+ }
+ }
+ // TODO: We could attempt to recompute new register classes while visiting
+ // the operands: Some of the split register may be fine with less constraint
+ // classes than the original vreg.
+}
+
+void RenameIndependentSubregs::distribute(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ unsigned NumClasses = Classes.getNumClasses();
+ SmallVector<unsigned, 8> VNIMapping;
+ SmallVector<LiveInterval::SubRange*, 8> SubRanges;
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ for (const SubRangeInfo &SRInfo : SubRangeInfos) {
+ LiveInterval::SubRange &SR = *SRInfo.SR;
+ unsigned NumValNos = SR.valnos.size();
+ VNIMapping.clear();
+ VNIMapping.reserve(NumValNos);
+ SubRanges.clear();
+ SubRanges.resize(NumClasses-1, nullptr);
+ for (unsigned I = 0; I < NumValNos; ++I) {
+ const VNInfo &VNI = *SR.valnos[I];
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(&VNI);
+ unsigned ID = Classes[LocalID + SRInfo.Index];
+ VNIMapping.push_back(ID);
+ if (ID > 0 && SubRanges[ID-1] == nullptr)
+ SubRanges[ID-1] = Intervals[ID]->createSubRange(Allocator, SR.LaneMask);
+ }
+ DistributeRange(SR, SubRanges.data(), VNIMapping);
+ }
+}
+
+static bool subRangeLiveAt(const LiveInterval &LI, SlotIndex Pos) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (SR.liveAt(Pos))
+ return true;
+ }
+ return false;
+}
+
+void RenameIndependentSubregs::computeMainRangesFixFlags(
+ const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ const SlotIndexes &Indexes = *LIS->getSlotIndexes();
+ for (size_t I = 0, E = Intervals.size(); I < E; ++I) {
+ LiveInterval &LI = *Intervals[I];
+ Register Reg = LI.reg();
+
+ LI.removeEmptySubRanges();
+
+    // There must be a def (or live-in) before every use. Splitting vregs may
+    // violate this principle as the split vreg may not have a definition on
+    // every path. Fix this by creating IMPLICIT_DEF instructions as necessary.
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ // Search for "PHI" value numbers in the subranges. We must find a live
+ // value in each predecessor block, add an IMPLICIT_DEF where it is
+ // missing.
+ for (unsigned I = 0; I < SR.valnos.size(); ++I) {
+ const VNInfo &VNI = *SR.valnos[I];
+ if (VNI.isUnused() || !VNI.isPHIDef())
+ continue;
+
+ SlotIndex Def = VNI.def;
+ MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(Def);
+ for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
+ SlotIndex PredEnd = Indexes.getMBBEndIdx(PredMBB);
+ if (subRangeLiveAt(LI, PredEnd.getPrevSlot()))
+ continue;
+
+ MachineBasicBlock::iterator InsertPos =
+ llvm::findPHICopyInsertPoint(PredMBB, &MBB, Reg);
+ const MCInstrDesc &MCDesc = TII->get(TargetOpcode::IMPLICIT_DEF);
+ MachineInstrBuilder ImpDef = BuildMI(*PredMBB, InsertPos,
+ DebugLoc(), MCDesc, Reg);
+ SlotIndex DefIdx = LIS->InsertMachineInstrInMaps(*ImpDef);
+ SlotIndex RegDefIdx = DefIdx.getRegSlot();
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ VNInfo *SRVNI = SR.getNextValue(RegDefIdx, Allocator);
+ SR.addSegment(LiveRange::Segment(RegDefIdx, PredEnd, SRVNI));
+ }
+ }
+ }
+ }
+
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef())
+ continue;
+ unsigned SubRegIdx = MO.getSubReg();
+ if (SubRegIdx == 0)
+ continue;
+ // After assigning the new vreg we may not have any other sublanes living
+ // in and out of the instruction anymore. We need to add new dead and
+ // undef flags in these cases.
+ if (!MO.isUndef()) {
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent());
+ if (!subRangeLiveAt(LI, Pos))
+ MO.setIsUndef();
+ }
+ if (!MO.isDead()) {
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent()).getDeadSlot();
+ if (!subRangeLiveAt(LI, Pos))
+ MO.setIsDead();
+ }
+ }
+
+ if (I == 0)
+ LI.clear();
+ LIS->constructMainRangeFromSubranges(LI);
+ // A def of a subregister may be a use of other register lanes. Replacing
+ // such a def with a def of a different register will eliminate the use,
+ // and may cause the recorded live range to be larger than the actual
+ // liveness in the program IR.
+ LIS->shrinkToUses(&LI);
+ }
+}
+
+bool RenameIndependentSubregs::runOnMachineFunction(MachineFunction &MF) {
+ // Skip renaming if liveness of subregister is not tracked.
+ MRI = &MF.getRegInfo();
+ if (!MRI->subRegLivenessEnabled())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Renaming independent subregister live ranges in "
+ << MF.getName() << '\n');
+
+ LIS = &getAnalysis<LiveIntervals>();
+ TII = MF.getSubtarget().getInstrInfo();
+
+  // Iterate over all vregs. Note that we only query getNumVirtRegs() once:
+  // the newly created vregs end up with higher numbers but do not need to be
+  // visited as there can't be any further splitting.
+ bool Changed = false;
+ for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (!LI.hasSubRanges())
+ continue;
+
+ Changed |= renameComponents(LI);
+ }
+
+ return Changed;
+}
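findComponents() above numbers the connected components of all subranges globally and then merges the components touched by the same operand using IntEqClasses. A minimal sketch of that union-find API in isolation; the component count and the join are illustrative values, not taken from this change.

#include "llvm/ADT/IntEqClasses.h"

// Components 0..3 as produced by ConnectedVNInfoEqClasses; the join mirrors an
// operand whose lane mask overlaps the subranges of components 0 and 2.
unsigned countClasses() {
  llvm::IntEqClasses Classes;
  Classes.grow(4);            // four components, each in its own class
  Classes.join(0, 2);         // merge classes touched by the same operand
  Classes.compress();         // renumber classes densely as 0..N-1
  unsigned ClassOfThree = Classes[3]; // dense class index of component 3
  (void)ClassOfThree;
  return Classes.getNumClasses();     // 3 classes remain: {0,2}, {1}, {3}
}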
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
new file mode 100644
index 000000000000..57cd1fcffb61
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ReplaceWithVeclib.cpp
@@ -0,0 +1,251 @@
+//=== ReplaceWithVeclib.cpp - Replace vector intrinsics with veclib calls -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Replaces calls to LLVM vector intrinsics (i.e., calls to LLVM intrinsics
+// with vector operands) with matching calls to functions from a vector
+// library (e.g., libmvec, SVML) according to TargetLibraryInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ReplaceWithVeclib.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "replace-with-veclib"
+
+STATISTIC(NumCallsReplaced,
+ "Number of calls to intrinsics that have been replaced.");
+
+STATISTIC(NumTLIFuncDeclAdded,
+ "Number of vector library function declarations added.");
+
+STATISTIC(NumFuncUsedAdded,
+ "Number of functions added to `llvm.compiler.used`");
+
+static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) {
+ Module *M = CI.getModule();
+
+ Function *OldFunc = CI.getCalledFunction();
+
+ // Check if the vector library function is already declared in this module,
+ // otherwise insert it.
+ Function *TLIFunc = M->getFunction(TLIName);
+ if (!TLIFunc) {
+ TLIFunc = Function::Create(OldFunc->getFunctionType(),
+ Function::ExternalLinkage, TLIName, *M);
+ TLIFunc->copyAttributesFrom(OldFunc);
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `"
+ << TLIName << "` of type `" << *(TLIFunc->getType())
+ << "` to module.\n");
+
+ ++NumTLIFuncDeclAdded;
+
+ // Add the freshly created function to llvm.compiler.used,
+    // similar to how it is done in InjectTLIMappings.
+ appendToCompilerUsed(*M, {TLIFunc});
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName
+ << "` to `@llvm.compiler.used`.\n");
+ ++NumFuncUsedAdded;
+ }
+
+ // Replace the call to the vector intrinsic with a call
+ // to the corresponding function from the vector library.
+ IRBuilder<> IRBuilder(&CI);
+ SmallVector<Value *> Args(CI.args());
+ // Preserve the operand bundles.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI.getOperandBundlesAsDefs(OpBundles);
+ CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles);
+ assert(OldFunc->getFunctionType() == TLIFunc->getFunctionType() &&
+ "Expecting function types to be identical");
+ CI.replaceAllUsesWith(Replacement);
+ if (isa<FPMathOperator>(Replacement)) {
+ // Preserve fast math flags for FP math.
+ Replacement->copyFastMathFlags(&CI);
+ }
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
+ << OldFunc->getName() << "` with call to `" << TLIName
+ << "`.\n");
+ ++NumCallsReplaced;
+ return true;
+}
+
+static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI,
+ CallInst &CI) {
+ if (!CI.getCalledFunction()) {
+ return false;
+ }
+
+ auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID();
+ if (IntrinsicID == Intrinsic::not_intrinsic) {
+ // Replacement is only performed for intrinsic functions
+ return false;
+ }
+
+ // Convert vector arguments to scalar type and check that
+ // all vector operands have identical vector width.
+ ElementCount VF = ElementCount::getFixed(0);
+ SmallVector<Type *> ScalarTypes;
+ for (auto Arg : enumerate(CI.args())) {
+ auto *ArgType = Arg.value()->getType();
+ // Vector calls to intrinsics can still have
+ // scalar operands for specific arguments.
+ if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) {
+ ScalarTypes.push_back(ArgType);
+ } else {
+ // The argument in this place should be a vector if
+ // this is a call to a vector intrinsic.
+ auto *VectorArgTy = dyn_cast<VectorType>(ArgType);
+ if (!VectorArgTy) {
+ // The argument is not a vector, do not perform
+ // the replacement.
+ return false;
+ }
+ ElementCount NumElements = VectorArgTy->getElementCount();
+ if (NumElements.isScalable()) {
+ // The current implementation does not support
+ // scalable vectors.
+ return false;
+ }
+ if (VF.isNonZero() && VF != NumElements) {
+        // The arguments differ in vector width.
+ return false;
+ } else {
+ VF = NumElements;
+ }
+ ScalarTypes.push_back(VectorArgTy->getElementType());
+ }
+ }
+
+ // Try to reconstruct the name for the scalar version of this
+ // intrinsic using the intrinsic ID and the argument types
+ // converted to scalar above.
+ std::string ScalarName;
+ if (Intrinsic::isOverloaded(IntrinsicID)) {
+ ScalarName = Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule());
+ } else {
+ ScalarName = Intrinsic::getName(IntrinsicID).str();
+ }
+
+ if (!TLI.isFunctionVectorizable(ScalarName)) {
+ // The TargetLibraryInfo does not contain a vectorized version of
+ // the scalar function.
+ return false;
+ }
+
+ // Try to find the mapping for the scalar version of this intrinsic
+ // and the exact vector width of the call operands in the
+ // TargetLibraryInfo.
+ const std::string TLIName =
+ std::string(TLI.getVectorizedFunction(ScalarName, VF));
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `"
+ << ScalarName << "` and vector width " << VF << ".\n");
+
+ if (!TLIName.empty()) {
+ // Found the correct mapping in the TargetLibraryInfo,
+ // replace the call to the intrinsic with a call to
+ // the vector library function.
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName
+ << "`.\n");
+ return replaceWithTLIFunction(CI, TLIName);
+ }
+
+ return false;
+}
+
+static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
+ bool Changed = false;
+ SmallVector<CallInst *> ReplacedCalls;
+ for (auto &I : instructions(F)) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ if (replaceWithCallToVeclib(TLI, *CI)) {
+ ReplacedCalls.push_back(CI);
+ Changed = true;
+ }
+ }
+ }
+ // Erase the calls to the intrinsics that have been replaced
+ // with calls to the vector library.
+ for (auto *CI : ReplacedCalls) {
+ CI->eraseFromParent();
+ }
+ return Changed;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// New pass manager implementation.
+////////////////////////////////////////////////////////////////////////////////
+PreservedAnalyses ReplaceWithVeclib::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto Changed = runImpl(TLI, F);
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<TargetLibraryAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<LoopAccessAnalysis>();
+ PA.preserve<DemandedBitsAnalysis>();
+ PA.preserve<OptimizationRemarkEmitterAnalysis>();
+ return PA;
+ } else {
+ // The pass did not replace any calls, hence it preserves all analyses.
+ return PreservedAnalyses::all();
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Legacy PM Implementation.
+////////////////////////////////////////////////////////////////////////////////
+bool ReplaceWithVeclibLegacy::runOnFunction(Function &F) {
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return runImpl(TLI, F);
+}
+
+void ReplaceWithVeclibLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Legacy Pass manager initialization
+////////////////////////////////////////////////////////////////////////////////
+char ReplaceWithVeclibLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ReplaceWithVeclibLegacy, DEBUG_TYPE,
+ "Replace intrinsics with calls to vector library", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ReplaceWithVeclibLegacy, DEBUG_TYPE,
+ "Replace intrinsics with calls to vector library", false,
+ false)
+
+FunctionPass *llvm::createReplaceWithVeclibLegacyPass() {
+ return new ReplaceWithVeclibLegacy();
+}
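A minimal sketch of driving the new-pass-manager ReplaceWithVeclib pass defined above, assuming the stock PassBuilder analysis registration; the helper name is illustrative. Unless the TargetLibraryInfo seen by the pass carries vector-library mappings, run() simply returns PreservedAnalyses::all().

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/CodeGen/ReplaceWithVeclib.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"

void runReplaceWithVeclib(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  // Standard new-PM boilerplate: register analyses and cross-register proxies.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::ReplaceWithVeclib());
  FPM.run(F, FAM);
}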
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
new file mode 100644
index 000000000000..11bdf3bb2ba8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
@@ -0,0 +1,97 @@
+//===-- ResetMachineFunctionPass.cpp - Reset Machine Function ----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a pass that will conditionally reset a machine
+/// function as if it was just created. This is used to provide a fallback
+/// mechanism when GlobalISel fails, thus the condition for the reset to
+/// happen is that the MachineFunction has the FailedISel property.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "reset-machine-function"
+
+STATISTIC(NumFunctionsReset, "Number of functions reset");
+STATISTIC(NumFunctionsVisited, "Number of functions visited");
+
+namespace {
+ class ResetMachineFunction : public MachineFunctionPass {
+ /// Tells whether or not this pass should emit a fallback
+ /// diagnostic when it resets a function.
+ bool EmitFallbackDiag;
+ /// Whether we should abort immediately instead of resetting the function.
+ bool AbortOnFailedISel;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ ResetMachineFunction(bool EmitFallbackDiag = false,
+ bool AbortOnFailedISel = false)
+ : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag),
+ AbortOnFailedISel(AbortOnFailedISel) {}
+
+ StringRef getPassName() const override { return "ResetMachineFunction"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<StackProtector>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ ++NumFunctionsVisited;
+ // No matter what happened, whether we successfully selected the function
+ // or not, nothing is going to use the vreg types after us. Make sure they
+ // disappear.
+ auto ClearVRegTypesOnReturn =
+ make_scope_exit([&MF]() { MF.getRegInfo().clearVirtRegTypes(); });
+
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel)) {
+ if (AbortOnFailedISel)
+ report_fatal_error("Instruction selection failed");
+ LLVM_DEBUG(dbgs() << "Resetting: " << MF.getName() << '\n');
+ ++NumFunctionsReset;
+ MF.reset();
+ MF.initTargetMachineFunctionInfo(MF.getSubtarget());
+
+ const LLVMTargetMachine &TM = MF.getTarget();
+ // MRI callback for target specific initializations.
+ TM.registerMachineRegisterInfoCallback(MF);
+
+ if (EmitFallbackDiag) {
+ const Function &F = MF.getFunction();
+ DiagnosticInfoISelFallback DiagFallback(F);
+ F.getContext().diagnose(DiagFallback);
+ }
+ return true;
+ }
+ return false;
+ }
+
+ };
+} // end anonymous namespace
+
+char ResetMachineFunction::ID = 0;
+INITIALIZE_PASS(ResetMachineFunction, DEBUG_TYPE,
+ "Reset machine function if ISel failed", false, false)
+
+MachineFunctionPass *
+llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false,
+ bool AbortOnFailedISel = false) {
+ return new ResetMachineFunction(EmitFallbackDiag, AbortOnFailedISel);
+}
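createResetMachineFunctionPass() above is the only way to construct the pass, so a pipeline that lets GlobalISel fall back inserts it after instruction selection. A minimal sketch under that assumption; the helper name and placement are illustrative, not taken from this change.

#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/LegacyPassManager.h"

// Hypothetical helper; a real pipeline builds this inside its pass config.
void addGlobalISelFallback(llvm::legacy::PassManagerBase &PM,
                           bool EmitFallbackDiag, bool AbortOnFailedISel) {
  // Resets any MachineFunction that carries the FailedISel property so a
  // later selector (e.g. SelectionDAG) can retry from a clean slate.
  PM.add(llvm::createResetMachineFunctionPass(EmitFallbackDiag,
                                              AbortOnFailedISel));
}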
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SafeStack.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SafeStack.cpp
new file mode 100644
index 000000000000..bcad7a3f24da
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SafeStack.cpp
@@ -0,0 +1,939 @@
+//===- SafeStack.cpp - Safe Stack Insertion -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass splits the stack into the safe stack (kept as-is for LLVM backend)
+// and the unsafe stack (explicitly allocated and managed through the runtime
+// support library).
+//
+// http://clang.llvm.org/docs/SafeStack.html
+//
+//===----------------------------------------------------------------------===//
+
+#include "SafeStackLayout.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/StackLifetime.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::safestack;
+
+#define DEBUG_TYPE "safe-stack"
+
+namespace llvm {
+
+STATISTIC(NumFunctions, "Total number of functions");
+STATISTIC(NumUnsafeStackFunctions, "Number of functions with unsafe stack");
+STATISTIC(NumUnsafeStackRestorePointsFunctions,
+ "Number of functions that use setjmp or exceptions");
+
+STATISTIC(NumAllocas, "Total number of allocas");
+STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas");
+STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas");
+STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments");
+STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");
+
+} // namespace llvm
+
+/// Use __safestack_pointer_address even if the platform has a faster way of
+/// accessing the safe stack pointer.
+static cl::opt<bool>
+ SafeStackUsePointerAddress("safestack-use-pointer-address",
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> ClColoring("safe-stack-coloring",
+ cl::desc("enable safe stack coloring"),
+ cl::Hidden, cl::init(true));
+
+namespace {
+
+/// The SafeStack pass splits the stack of each function into the safe
+/// stack, which is only accessed through memory safe dereferences (as
+/// determined statically), and the unsafe stack, which contains all
+/// local variables that are accessed in ways that we can't prove to
+/// be safe.
+class SafeStack {
+ Function &F;
+ const TargetLoweringBase &TL;
+ const DataLayout &DL;
+ DomTreeUpdater *DTU;
+ ScalarEvolution &SE;
+
+ Type *StackPtrTy;
+ Type *IntPtrTy;
+ Type *Int32Ty;
+ Type *Int8Ty;
+
+ Value *UnsafeStackPtr = nullptr;
+
+ /// Unsafe stack alignment. Each stack frame must ensure that the stack is
+ /// aligned to this value. We need to re-align the unsafe stack if the
+ /// alignment of any object on the stack exceeds this value.
+ ///
+ /// 16 seems like a reasonable upper bound on the alignment of objects that we
+ /// might expect to appear on the stack on most common targets.
+ static constexpr Align StackAlignment = Align::Constant<16>();
+
+ /// Return the value of the stack canary.
+ Value *getStackGuard(IRBuilder<> &IRB, Function &F);
+
+ /// Load stack guard from the frame and check if it has changed.
+ void checkStackGuard(IRBuilder<> &IRB, Function &F, Instruction &RI,
+ AllocaInst *StackGuardSlot, Value *StackGuard);
+
+ /// Find all static allocas, dynamic allocas, return instructions and
+ /// stack restore points (exception unwind blocks and setjmp calls) in the
+ /// given function and append them to the respective vectors.
+ void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas,
+ SmallVectorImpl<AllocaInst *> &DynamicAllocas,
+ SmallVectorImpl<Argument *> &ByValArguments,
+ SmallVectorImpl<Instruction *> &Returns,
+ SmallVectorImpl<Instruction *> &StackRestorePoints);
+
+ /// Calculate the allocation size of a given alloca. Returns 0 if the
+ /// size can not be statically determined.
+ uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI);
+
+ /// Allocate space for all static allocas in \p StaticAllocas,
+ /// replace allocas with pointers into the unsafe stack.
+ ///
+ /// \returns A pointer to the top of the unsafe stack after all unsafe static
+ /// allocas are allocated.
+ Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
+ ArrayRef<AllocaInst *> StaticAllocas,
+ ArrayRef<Argument *> ByValArguments,
+ Instruction *BasePointer,
+ AllocaInst *StackGuardSlot);
+
+ /// Generate code to restore the stack after all stack restore points
+ /// in \p StackRestorePoints.
+ ///
+ /// \returns A local variable in which to maintain the dynamic top of the
+ /// unsafe stack if needed.
+ AllocaInst *
+ createStackRestorePoints(IRBuilder<> &IRB, Function &F,
+ ArrayRef<Instruction *> StackRestorePoints,
+ Value *StaticTop, bool NeedDynamicTop);
+
+ /// Replace all allocas in \p DynamicAllocas with code to allocate
+ /// space dynamically on the unsafe stack and store the dynamic unsafe stack
+ /// top to \p DynamicTop if non-null.
+ void moveDynamicAllocasToUnsafeStack(Function &F, Value *UnsafeStackPtr,
+ AllocaInst *DynamicTop,
+ ArrayRef<AllocaInst *> DynamicAllocas);
+
+ bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize);
+
+ bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U,
+ const Value *AllocaPtr, uint64_t AllocaSize);
+ bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr,
+ uint64_t AllocaSize);
+
+ bool ShouldInlinePointerAddress(CallInst &CI);
+ void TryInlinePointerAddress();
+
+public:
+ SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL,
+ DomTreeUpdater *DTU, ScalarEvolution &SE)
+ : F(F), TL(TL), DL(DL), DTU(DTU), SE(SE),
+ StackPtrTy(Type::getInt8PtrTy(F.getContext())),
+ IntPtrTy(DL.getIntPtrType(F.getContext())),
+ Int32Ty(Type::getInt32Ty(F.getContext())),
+ Int8Ty(Type::getInt8Ty(F.getContext())) {}
+
+ // Run the transformation on the associated function.
+ // Returns whether the function was changed.
+ bool run();
+};
+
+constexpr Align SafeStack::StackAlignment;
+
+uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
+ uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
+ if (AI->isArrayAllocation()) {
+ auto C = dyn_cast<ConstantInt>(AI->getArraySize());
+ if (!C)
+ return 0;
+ Size *= C->getZExtValue();
+ }
+ return Size;
+}
+
+bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
+ const Value *AllocaPtr, uint64_t AllocaSize) {
+ const SCEV *AddrExpr = SE.getSCEV(Addr);
+ const auto *Base = dyn_cast<SCEVUnknown>(SE.getPointerBase(AddrExpr));
+ if (!Base || Base->getValue() != AllocaPtr) {
+ LLVM_DEBUG(
+ dbgs() << "[SafeStack] "
+ << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ")
+ << *AllocaPtr << "\n"
+ << "SCEV " << *AddrExpr << " not directly based on alloca\n");
+ return false;
+ }
+
+ const SCEV *Expr = SE.removePointerBase(AddrExpr);
+ uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType());
+ ConstantRange AccessStartRange = SE.getUnsignedRange(Expr);
+ ConstantRange SizeRange =
+ ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize));
+ ConstantRange AccessRange = AccessStartRange.add(SizeRange);
+ ConstantRange AllocaRange =
+ ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize));
+ bool Safe = AllocaRange.contains(AccessRange);
+
+ LLVM_DEBUG(
+ dbgs() << "[SafeStack] "
+ << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ")
+ << *AllocaPtr << "\n"
+ << " Access " << *Addr << "\n"
+ << " SCEV " << *Expr
+ << " U: " << SE.getUnsignedRange(Expr)
+ << ", S: " << SE.getSignedRange(Expr) << "\n"
+ << " Range " << AccessRange << "\n"
+ << " AllocaRange " << AllocaRange << "\n"
+ << " " << (Safe ? "safe" : "unsafe") << "\n");
+
+ return Safe;
+}
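IsAccessSafe() above accepts an access only when the unsigned SCEV range of its offset, widened by the access size, stays inside [0, AllocaSize). A source-level illustration of the resulting classification, assuming the enclosing function carries the safestack attribute; unknownIndex() is a hypothetical stand-in for any value whose range ScalarEvolution cannot bound.

// unknownIndex() is a hypothetical external function; its result has an
// unbounded unsigned range as far as ScalarEvolution is concerned.
extern unsigned unknownIndex();

void example() {
  char A[16];
  A[10] = 0;        // access range [10, 11) lies inside [0, 16): safe, so A
                    // may stay on the regular (safe) stack
  char B[16];
  unsigned I = unknownIndex();
  B[I] = 0;         // range of I is unknown: conservatively unsafe, so B is
                    // moved to the unsafe stack
}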
+
+bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U,
+ const Value *AllocaPtr,
+ uint64_t AllocaSize) {
+ if (auto MTI = dyn_cast<MemTransferInst>(MI)) {
+ if (MTI->getRawSource() != U && MTI->getRawDest() != U)
+ return true;
+ } else {
+ if (MI->getRawDest() != U)
+ return true;
+ }
+
+ const auto *Len = dyn_cast<ConstantInt>(MI->getLength());
+ // Non-constant size => unsafe. FIXME: try SCEV getRange.
+ if (!Len) return false;
+ return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize);
+}
+
+/// Check whether a given allocation must be put on the safe
+/// stack or not. The function analyzes all uses of AI and checks whether it is
+/// only accessed in a memory safe way (as decided statically).
+bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
+ // Go through all uses of this alloca and check whether all accesses to the
+ // allocated object are statically known to be memory safe and, hence, the
+ // object can be placed on the safe stack.
+ SmallPtrSet<const Value *, 16> Visited;
+ SmallVector<const Value *, 8> WorkList;
+ WorkList.push_back(AllocaPtr);
+
+ // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc.
+ while (!WorkList.empty()) {
+ const Value *V = WorkList.pop_back_val();
+ for (const Use &UI : V->uses()) {
+ auto I = cast<const Instruction>(UI.getUser());
+ assert(V == UI.get());
+
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr,
+ AllocaSize))
+ return false;
+ break;
+
+ case Instruction::VAArg:
+ // "va-arg" from a pointer is safe.
+ break;
+ case Instruction::Store:
+ if (V == I->getOperand(0)) {
+ // Stored the pointer - conservatively assume it may be unsafe.
+ LLVM_DEBUG(dbgs()
+ << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n store of address: " << *I << "\n");
+ return false;
+ }
+
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()),
+ AllocaPtr, AllocaSize))
+ return false;
+ break;
+
+ case Instruction::Ret:
+ // Information leak.
+ return false;
+
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ const CallBase &CS = *cast<CallBase>(I);
+
+ if (I->isLifetimeStartOrEnd())
+ continue;
+
+ if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
+ if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) {
+ LLVM_DEBUG(dbgs()
+ << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n unsafe memintrinsic: " << *I << "\n");
+ return false;
+ }
+ continue;
+ }
+
+ // LLVM 'nocapture' attribute is only set for arguments whose address
+ // is not stored, passed around, or used in any other non-trivial way.
+ // We assume that passing a pointer to an object as a 'nocapture
+ // readnone' argument is safe.
+ // FIXME: a more precise solution would require an interprocedural
+ // analysis here, which would look at all uses of an argument inside
+ // the function being called.
+ auto B = CS.arg_begin(), E = CS.arg_end();
+ for (const auto *A = B; A != E; ++A)
+ if (A->get() == V)
+ if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) ||
+ CS.doesNotAccessMemory()))) {
+ LLVM_DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n unsafe call: " << *I << "\n");
+ return false;
+ }
+ continue;
+ }
+
+ default:
+ if (Visited.insert(I).second)
+ WorkList.push_back(cast<const Instruction>(I));
+ }
+ }
+ }
+
+ // All uses of the alloca are safe, we can place it on the safe stack.
+ return true;
+}
+
+Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
+ Value *StackGuardVar = TL.getIRStackGuard(IRB);
+ Module *M = F.getParent();
+
+ if (!StackGuardVar) {
+ TL.insertSSPDeclarations(*M);
+ return IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard));
+ }
+
+ return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard");
+}
+
+void SafeStack::findInsts(Function &F,
+ SmallVectorImpl<AllocaInst *> &StaticAllocas,
+ SmallVectorImpl<AllocaInst *> &DynamicAllocas,
+ SmallVectorImpl<Argument *> &ByValArguments,
+ SmallVectorImpl<Instruction *> &Returns,
+ SmallVectorImpl<Instruction *> &StackRestorePoints) {
+ for (Instruction &I : instructions(&F)) {
+ if (auto AI = dyn_cast<AllocaInst>(&I)) {
+ ++NumAllocas;
+
+ uint64_t Size = getStaticAllocaAllocationSize(AI);
+ if (IsSafeStackAlloca(AI, Size))
+ continue;
+
+ if (AI->isStaticAlloca()) {
+ ++NumUnsafeStaticAllocas;
+ StaticAllocas.push_back(AI);
+ } else {
+ ++NumUnsafeDynamicAllocas;
+ DynamicAllocas.push_back(AI);
+ }
+ } else if (auto RI = dyn_cast<ReturnInst>(&I)) {
+ if (CallInst *CI = I.getParent()->getTerminatingMustTailCall())
+ Returns.push_back(CI);
+ else
+ Returns.push_back(RI);
+ } else if (auto CI = dyn_cast<CallInst>(&I)) {
+ // setjmps require stack restore.
+ if (CI->getCalledFunction() && CI->canReturnTwice())
+ StackRestorePoints.push_back(CI);
+ } else if (auto LP = dyn_cast<LandingPadInst>(&I)) {
+ // Exception landing pads require stack restore.
+ StackRestorePoints.push_back(LP);
+ } else if (auto II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() == Intrinsic::gcroot)
+ report_fatal_error(
+ "gcroot intrinsic not compatible with safestack attribute");
+ }
+ }
+ for (Argument &Arg : F.args()) {
+ if (!Arg.hasByValAttr())
+ continue;
+ uint64_t Size = DL.getTypeStoreSize(Arg.getParamByValType());
+ if (IsSafeStackAlloca(&Arg, Size))
+ continue;
+
+ ++NumUnsafeByValArguments;
+ ByValArguments.push_back(&Arg);
+ }
+}
+
+AllocaInst *
+SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F,
+ ArrayRef<Instruction *> StackRestorePoints,
+ Value *StaticTop, bool NeedDynamicTop) {
+ assert(StaticTop && "The stack top isn't set.");
+
+ if (StackRestorePoints.empty())
+ return nullptr;
+
+ // We need the current value of the shadow stack pointer to restore
+ // after longjmp or exception catching.
+
+ // FIXME: On some platforms this could be handled by the longjmp/exception
+ // runtime itself.
+
+ AllocaInst *DynamicTop = nullptr;
+ if (NeedDynamicTop) {
+    // If we also have dynamic allocas, the stack pointer value changes
+ // throughout the function. For now we store it in an alloca.
+ DynamicTop = IRB.CreateAlloca(StackPtrTy, /*ArraySize=*/nullptr,
+ "unsafe_stack_dynamic_ptr");
+ IRB.CreateStore(StaticTop, DynamicTop);
+ }
+
+ // Restore current stack pointer after longjmp/exception catch.
+ for (Instruction *I : StackRestorePoints) {
+ ++NumUnsafeStackRestorePoints;
+
+ IRB.SetInsertPoint(I->getNextNode());
+ Value *CurrentTop =
+ DynamicTop ? IRB.CreateLoad(StackPtrTy, DynamicTop) : StaticTop;
+ IRB.CreateStore(CurrentTop, UnsafeStackPtr);
+ }
+
+ return DynamicTop;
+}
+
+void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, Instruction &RI,
+ AllocaInst *StackGuardSlot, Value *StackGuard) {
+ Value *V = IRB.CreateLoad(StackPtrTy, StackGuardSlot);
+ Value *Cmp = IRB.CreateICmpNE(StackGuard, V);
+
+ auto SuccessProb = BranchProbabilityInfo::getBranchProbStackProtector(true);
+ auto FailureProb = BranchProbabilityInfo::getBranchProbStackProtector(false);
+ MDNode *Weights = MDBuilder(F.getContext())
+ .createBranchWeights(SuccessProb.getNumerator(),
+ FailureProb.getNumerator());
+ Instruction *CheckTerm =
+ SplitBlockAndInsertIfThen(Cmp, &RI, /* Unreachable */ true, Weights, DTU);
+ IRBuilder<> IRBFail(CheckTerm);
+ // FIXME: respect -fsanitize-trap / -ftrap-function here?
+ FunctionCallee StackChkFail =
+ F.getParent()->getOrInsertFunction("__stack_chk_fail", IRB.getVoidTy());
+ IRBFail.CreateCall(StackChkFail, {});
+}
+
+/// We explicitly compute and set the unsafe stack layout for all unsafe
+/// static alloca instructions. We save the unsafe "base pointer" in the
+/// prologue into a local variable and restore it in the epilogue.
+Value *SafeStack::moveStaticAllocasToUnsafeStack(
+ IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas,
+ ArrayRef<Argument *> ByValArguments, Instruction *BasePointer,
+ AllocaInst *StackGuardSlot) {
+ if (StaticAllocas.empty() && ByValArguments.empty())
+ return BasePointer;
+
+ DIBuilder DIB(*F.getParent());
+
+ StackLifetime SSC(F, StaticAllocas, StackLifetime::LivenessType::May);
+ static const StackLifetime::LiveRange NoColoringRange(1, true);
+ if (ClColoring)
+ SSC.run();
+
+ for (const auto *I : SSC.getMarkers()) {
+ auto *Op = dyn_cast<Instruction>(I->getOperand(1));
+ const_cast<IntrinsicInst *>(I)->eraseFromParent();
+ // Remove the operand bitcast, too, if it has no more uses left.
+ if (Op && Op->use_empty())
+ Op->eraseFromParent();
+ }
+
+ // Unsafe stack always grows down.
+ StackLayout SSL(StackAlignment);
+ if (StackGuardSlot) {
+ Type *Ty = StackGuardSlot->getAllocatedType();
+ Align Align = std::max(DL.getPrefTypeAlign(Ty), StackGuardSlot->getAlign());
+ SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
+ Align, SSC.getFullLiveRange());
+ }
+
+ for (Argument *Arg : ByValArguments) {
+ Type *Ty = Arg->getParamByValType();
+ uint64_t Size = DL.getTypeStoreSize(Ty);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ // Ensure the object is properly aligned.
+ Align Align = DL.getPrefTypeAlign(Ty);
+ if (auto A = Arg->getParamAlign())
+ Align = std::max(Align, *A);
+ SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
+ }
+
+ for (AllocaInst *AI : StaticAllocas) {
+ Type *Ty = AI->getAllocatedType();
+ uint64_t Size = getStaticAllocaAllocationSize(AI);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ // Ensure the object is properly aligned.
+ Align Align = std::max(DL.getPrefTypeAlign(Ty), AI->getAlign());
+
+ SSL.addObject(AI, Size, Align,
+ ClColoring ? SSC.getLiveRange(AI) : NoColoringRange);
+ }
+
+ SSL.computeLayout();
+ Align FrameAlignment = SSL.getFrameAlignment();
+
+ // FIXME: tell SSL that we start at a less-than-MaxAlignment aligned location
+ // (AlignmentSkew).
+ if (FrameAlignment > StackAlignment) {
+ // Re-align the base pointer according to the max requested alignment.
+ IRB.SetInsertPoint(BasePointer->getNextNode());
+ BasePointer = cast<Instruction>(IRB.CreateIntToPtr(
+ IRB.CreateAnd(
+ IRB.CreatePtrToInt(BasePointer, IntPtrTy),
+ ConstantInt::get(IntPtrTy, ~(FrameAlignment.value() - 1))),
+ StackPtrTy));
+ }
+
+ IRB.SetInsertPoint(BasePointer->getNextNode());
+
+ if (StackGuardSlot) {
+ unsigned Offset = SSL.getObjectOffset(StackGuardSlot);
+ Value *Off = IRB.CreateGEP(Int8Ty, BasePointer, // BasePointer is i8*
+ ConstantInt::get(Int32Ty, -Offset));
+ Value *NewAI =
+ IRB.CreateBitCast(Off, StackGuardSlot->getType(), "StackGuardSlot");
+
+ // Replace alloc with the new location.
+ StackGuardSlot->replaceAllUsesWith(NewAI);
+ StackGuardSlot->eraseFromParent();
+ }
+
+ for (Argument *Arg : ByValArguments) {
+ unsigned Offset = SSL.getObjectOffset(Arg);
+ MaybeAlign Align(SSL.getObjectAlignment(Arg));
+ Type *Ty = Arg->getParamByValType();
+
+ uint64_t Size = DL.getTypeStoreSize(Ty);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ Value *Off = IRB.CreateGEP(Int8Ty, BasePointer, // BasePointer is i8*
+ ConstantInt::get(Int32Ty, -Offset));
+ Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(),
+ Arg->getName() + ".unsafe-byval");
+
+ // Replace alloc with the new location.
+ replaceDbgDeclare(Arg, BasePointer, DIB, DIExpression::ApplyOffset,
+ -Offset);
+ Arg->replaceAllUsesWith(NewArg);
+ IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode());
+ IRB.CreateMemCpy(Off, Align, Arg, Arg->getParamAlign(), Size);
+ }
+
+ // Allocate space for every unsafe static AllocaInst on the unsafe stack.
+ for (AllocaInst *AI : StaticAllocas) {
+ IRB.SetInsertPoint(AI);
+ unsigned Offset = SSL.getObjectOffset(AI);
+
+ replaceDbgDeclare(AI, BasePointer, DIB, DIExpression::ApplyOffset, -Offset);
+ replaceDbgValueForAlloca(AI, BasePointer, DIB, -Offset);
+
+ // Replace uses of the alloca with the new location.
+ // Insert address calculation close to each use to work around PR27844.
+ std::string Name = std::string(AI->getName()) + ".unsafe";
+ while (!AI->use_empty()) {
+ Use &U = *AI->use_begin();
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Instruction *InsertBefore;
+ if (auto *PHI = dyn_cast<PHINode>(User))
+ InsertBefore = PHI->getIncomingBlock(U)->getTerminator();
+ else
+ InsertBefore = User;
+
+ IRBuilder<> IRBUser(InsertBefore);
+ Value *Off = IRBUser.CreateGEP(Int8Ty, BasePointer, // BasePointer is i8*
+ ConstantInt::get(Int32Ty, -Offset));
+ Value *Replacement = IRBUser.CreateBitCast(Off, AI->getType(), Name);
+
+ if (auto *PHI = dyn_cast<PHINode>(User))
+ // PHI nodes may have multiple incoming edges from the same BB; all of
+ // them must be updated at once with the same incoming value.
+ PHI->setIncomingValueForBlock(PHI->getIncomingBlock(U), Replacement);
+ else
+ U.set(Replacement);
+ }
+
+ AI->eraseFromParent();
+ }
+
+ // Re-align BasePointer so that our callees would see it aligned as
+ // expected.
+ // FIXME: no need to update BasePointer in leaf functions.
+ unsigned FrameSize = alignTo(SSL.getFrameSize(), StackAlignment);
+
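+ // Record the rounded-up unsafe frame size on the function as !annotation
+ // metadata of the form !{!"unsafe-stack-size", i32 FrameSize}.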
+ MDBuilder MDB(F.getContext());
+ SmallVector<Metadata *, 2> Data;
+ Data.push_back(MDB.createString("unsafe-stack-size"));
+ Data.push_back(MDB.createConstant(ConstantInt::get(Int32Ty, FrameSize)));
+ MDNode *MD = MDTuple::get(F.getContext(), Data);
+ F.setMetadata(LLVMContext::MD_annotation, MD);
+
+ // Update shadow stack pointer in the function epilogue.
+ IRB.SetInsertPoint(BasePointer->getNextNode());
+
+ Value *StaticTop =
+ IRB.CreateGEP(Int8Ty, BasePointer, ConstantInt::get(Int32Ty, -FrameSize),
+ "unsafe_stack_static_top");
+ IRB.CreateStore(StaticTop, UnsafeStackPtr);
+ return StaticTop;
+}
+
+void SafeStack::moveDynamicAllocasToUnsafeStack(
+ Function &F, Value *UnsafeStackPtr, AllocaInst *DynamicTop,
+ ArrayRef<AllocaInst *> DynamicAllocas) {
+ DIBuilder DIB(*F.getParent());
+
+ for (AllocaInst *AI : DynamicAllocas) {
+ IRBuilder<> IRB(AI);
+
+ // Compute the new SP value (after AI).
+ Value *ArraySize = AI->getArraySize();
+ if (ArraySize->getType() != IntPtrTy)
+ ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false);
+
+ Type *Ty = AI->getAllocatedType();
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
+ Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize));
+
+ Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(StackPtrTy, UnsafeStackPtr),
+ IntPtrTy);
+ SP = IRB.CreateSub(SP, Size);
+
+ // Align the SP value to satisfy the AllocaInst, type and stack alignments.
+ auto Align = std::max(std::max(DL.getPrefTypeAlign(Ty), AI->getAlign()),
+ StackAlignment);
+
+ Value *NewTop = IRB.CreateIntToPtr(
+ IRB.CreateAnd(SP,
+ ConstantInt::get(IntPtrTy, ~uint64_t(Align.value() - 1))),
+ StackPtrTy);
+
+ // Save the stack pointer.
+ IRB.CreateStore(NewTop, UnsafeStackPtr);
+ if (DynamicTop)
+ IRB.CreateStore(NewTop, DynamicTop);
+
+ Value *NewAI = IRB.CreatePointerCast(NewTop, AI->getType());
+ if (AI->hasName() && isa<Instruction>(NewAI))
+ NewAI->takeName(AI);
+
+ replaceDbgDeclare(AI, NewAI, DIB, DIExpression::ApplyOffset, 0);
+ AI->replaceAllUsesWith(NewAI);
+ AI->eraseFromParent();
+ }
+
+ if (!DynamicAllocas.empty()) {
+ // Now go through the instructions again, replacing stacksave/stackrestore.
+ for (Instruction &I : llvm::make_early_inc_range(instructions(&F))) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ continue;
+
+ if (II->getIntrinsicID() == Intrinsic::stacksave) {
+ IRBuilder<> IRB(II);
+ Instruction *LI = IRB.CreateLoad(StackPtrTy, UnsafeStackPtr);
+ LI->takeName(II);
+ II->replaceAllUsesWith(LI);
+ II->eraseFromParent();
+ } else if (II->getIntrinsicID() == Intrinsic::stackrestore) {
+ IRBuilder<> IRB(II);
+ Instruction *SI = IRB.CreateStore(II->getArgOperand(0), UnsafeStackPtr);
+ SI->takeName(II);
+ assert(II->use_empty());
+ II->eraseFromParent();
+ }
+ }
+ }
+}
+
+bool SafeStack::ShouldInlinePointerAddress(CallInst &CI) {
+ Function *Callee = CI.getCalledFunction();
+ if (CI.hasFnAttr(Attribute::AlwaysInline) &&
+ isInlineViable(*Callee).isSuccess())
+ return true;
+ if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
+ CI.isNoInline())
+ return false;
+ return true;
+}
+
+void SafeStack::TryInlinePointerAddress() {
+ auto *CI = dyn_cast<CallInst>(UnsafeStackPtr);
+ if (!CI)
+ return;
+
+ if (F.hasOptNone())
+ return;
+
+ Function *Callee = CI->getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ return;
+
+ if (!ShouldInlinePointerAddress(*CI))
+ return;
+
+ InlineFunctionInfo IFI;
+ InlineFunction(*CI, IFI);
+}
+
+bool SafeStack::run() {
+ assert(F.hasFnAttribute(Attribute::SafeStack) &&
+ "Can't run SafeStack on a function without the attribute");
+ assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration");
+
+ ++NumFunctions;
+
+ SmallVector<AllocaInst *, 16> StaticAllocas;
+ SmallVector<AllocaInst *, 4> DynamicAllocas;
+ SmallVector<Argument *, 4> ByValArguments;
+ SmallVector<Instruction *, 4> Returns;
+
+ // Collect all points where the stack gets unwound and needs to be restored.
+ // This is only necessary because the runtime (setjmp and unwind code) is
+ // not aware of the unsafe stack and won't unwind/restore it properly.
+ // To work around this problem without changing the runtime, we insert
+ // instrumentation to restore the unsafe stack pointer when necessary.
+ SmallVector<Instruction *, 4> StackRestorePoints;
+
+ // Find all static and dynamic alloca instructions that must be moved to the
+ // unsafe stack, all return instructions and stack restore points.
+ findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns,
+ StackRestorePoints);
+
+ if (StaticAllocas.empty() && DynamicAllocas.empty() &&
+ ByValArguments.empty() && StackRestorePoints.empty())
+ return false; // Nothing to do in this function.
+
+ if (!StaticAllocas.empty() || !DynamicAllocas.empty() ||
+ !ByValArguments.empty())
+ ++NumUnsafeStackFunctions; // This function has the unsafe stack.
+
+ if (!StackRestorePoints.empty())
+ ++NumUnsafeStackRestorePointsFunctions;
+
+ IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
+ // Calls must always have a debug location, or else inlining breaks. So
+ // we explicitly set an artificial debug location here.
+ if (DISubprogram *SP = F.getSubprogram())
+ IRB.SetCurrentDebugLocation(
+ DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP));
+ if (SafeStackUsePointerAddress) {
+ FunctionCallee Fn = F.getParent()->getOrInsertFunction(
+ "__safestack_pointer_address", StackPtrTy->getPointerTo(0));
+ UnsafeStackPtr = IRB.CreateCall(Fn);
+ } else {
+ UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);
+ }
+
+ // Load the current stack pointer (we'll also use it as a base pointer).
+ // FIXME: use a dedicated register for it ?
+ Instruction *BasePointer =
+ IRB.CreateLoad(StackPtrTy, UnsafeStackPtr, false, "unsafe_stack_ptr");
+ assert(BasePointer->getType() == StackPtrTy);
+
+ AllocaInst *StackGuardSlot = nullptr;
+ // FIXME: implement weaker forms of stack protector.
+ if (F.hasFnAttribute(Attribute::StackProtect) ||
+ F.hasFnAttribute(Attribute::StackProtectStrong) ||
+ F.hasFnAttribute(Attribute::StackProtectReq)) {
+ Value *StackGuard = getStackGuard(IRB, F);
+ StackGuardSlot = IRB.CreateAlloca(StackPtrTy, nullptr);
+ IRB.CreateStore(StackGuard, StackGuardSlot);
+
+ for (Instruction *RI : Returns) {
+ IRBuilder<> IRBRet(RI);
+ checkStackGuard(IRBRet, F, *RI, StackGuardSlot, StackGuard);
+ }
+ }
+
+ // The top of the unsafe stack after all unsafe static allocas are
+ // allocated.
+ Value *StaticTop = moveStaticAllocasToUnsafeStack(
+ IRB, F, StaticAllocas, ByValArguments, BasePointer, StackGuardSlot);
+
+ // Safe stack object that stores the current unsafe stack top. It is updated
+ // as unsafe dynamic (non-constant-sized) allocas are allocated and freed.
+ // This is only needed if we need to restore stack pointer after longjmp
+ // or exceptions, and we have dynamic allocations.
+ // FIXME: a better alternative might be to store the unsafe stack pointer
+ // before setjmp / invoke instructions.
+ AllocaInst *DynamicTop = createStackRestorePoints(
+ IRB, F, StackRestorePoints, StaticTop, !DynamicAllocas.empty());
+
+ // Handle dynamic allocas.
+ moveDynamicAllocasToUnsafeStack(F, UnsafeStackPtr, DynamicTop,
+ DynamicAllocas);
+
+ // Restore the unsafe stack pointer before each return.
+ for (Instruction *RI : Returns) {
+ IRB.SetInsertPoint(RI);
+ IRB.CreateStore(BasePointer, UnsafeStackPtr);
+ }
+
+ TryInlinePointerAddress();
+
+ LLVM_DEBUG(dbgs() << "[SafeStack] safestack applied\n");
+ return true;
+}
+
+class SafeStackLegacyPass : public FunctionPass {
+ const TargetMachine *TM = nullptr;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ SafeStackLegacyPass() : FunctionPass(ID) {
+ initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ LLVM_DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+ if (!F.hasFnAttribute(Attribute::SafeStack)) {
+ LLVM_DEBUG(dbgs() << "[SafeStack] safestack is not requested"
+ " for this function\n");
+ return false;
+ }
+
+ if (F.isDeclaration()) {
+ LLVM_DEBUG(dbgs() << "[SafeStack] function definition"
+ " is not available\n");
+ return false;
+ }
+
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+ if (!TL)
+ report_fatal_error("TargetLowering instance is required");
+
+ auto *DL = &F.getParent()->getDataLayout();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ // Compute DT and LI only for functions that have the attribute.
+ // This is only useful because the legacy pass manager doesn't let us
+ // compute analyses lazily.
+
+ DominatorTree *DT;
+ bool ShouldPreserveDominatorTree;
+ std::optional<DominatorTree> LazilyComputedDomTree;
+
+ // Do we already have a DominatorTree available from the previous pass?
+ // Note that we should *NOT* require it, to avoid the case where we end up
+ // not needing it, but the legacy PM would have computed it for us anyways.
+ if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>()) {
+ DT = &DTWP->getDomTree();
+ ShouldPreserveDominatorTree = true;
+ } else {
+ // Otherwise, we need to compute it.
+ LazilyComputedDomTree.emplace(F);
+ DT = &*LazilyComputedDomTree;
+ ShouldPreserveDominatorTree = false;
+ }
+
+ // Likewise, lazily compute loop info.
+ LoopInfo LI(*DT);
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ ScalarEvolution SE(F, TLI, ACT, *DT, LI);
+
+ return SafeStack(F, *TL, *DL, ShouldPreserveDominatorTree ? &DTU : nullptr,
+ SE)
+ .run();
+ }
+};
+
+} // end anonymous namespace
+
+char SafeStackLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, DEBUG_TYPE,
+ "Safe Stack instrumentation pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SafeStackLegacyPass, DEBUG_TYPE,
+ "Safe Stack instrumentation pass", false, false)
+
+FunctionPass *llvm::createSafeStackPass() { return new SafeStackLegacyPass(); }
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SafeStackLayout.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SafeStackLayout.cpp
new file mode 100644
index 000000000000..f821145f4b63
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SafeStackLayout.cpp
@@ -0,0 +1,152 @@
+//===- SafeStackLayout.cpp - SafeStack frame layout -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SafeStackLayout.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+using namespace llvm::safestack;
+
+#define DEBUG_TYPE "safestacklayout"
+
+static cl::opt<bool> ClLayout("safe-stack-layout",
+ cl::desc("enable safe stack layout"), cl::Hidden,
+ cl::init(true));
+
+LLVM_DUMP_METHOD void StackLayout::print(raw_ostream &OS) {
+ OS << "Stack regions:\n";
+ for (unsigned i = 0; i < Regions.size(); ++i) {
+ OS << " " << i << ": [" << Regions[i].Start << ", " << Regions[i].End
+ << "), range " << Regions[i].Range << "\n";
+ }
+ OS << "Stack objects:\n";
+ for (auto &IT : ObjectOffsets) {
+ OS << " at " << IT.getSecond() << ": " << *IT.getFirst() << "\n";
+ }
+}
+
+void StackLayout::addObject(const Value *V, unsigned Size, Align Alignment,
+ const StackLifetime::LiveRange &Range) {
+ StackObjects.push_back({V, Size, Alignment, Range});
+ ObjectAlignments[V] = Alignment;
+ MaxAlignment = std::max(MaxAlignment, Alignment);
+}
+
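+// AdjustStackOffset returns the start offset of an object of the given Size
+// such that its end (Start + Size) is aligned to Alignment. The unsafe stack
+// grows down and objects are addressed as (frame base - end offset), so
+// aligning the end offset is what makes the object's runtime address aligned,
+// assuming the frame base itself is aligned at least that much.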
+static unsigned AdjustStackOffset(unsigned Offset, unsigned Size,
+ Align Alignment) {
+ return alignTo(Offset + Size, Alignment) - Size;
+}
+
+void StackLayout::layoutObject(StackObject &Obj) {
+ if (!ClLayout) {
+ // If layout is disabled, just grab the next aligned address.
+ // This effectively disables stack coloring as well.
+ unsigned LastRegionEnd = Regions.empty() ? 0 : Regions.back().End;
+ unsigned Start = AdjustStackOffset(LastRegionEnd, Obj.Size, Obj.Alignment);
+ unsigned End = Start + Obj.Size;
+ Regions.emplace_back(Start, End, Obj.Range);
+ ObjectOffsets[Obj.Handle] = End;
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Layout: size " << Obj.Size << ", align "
+ << Obj.Alignment.value() << ", range " << Obj.Range
+ << "\n");
+ assert(Obj.Alignment <= MaxAlignment);
+ unsigned Start = AdjustStackOffset(0, Obj.Size, Obj.Alignment);
+ unsigned End = Start + Obj.Size;
+ LLVM_DEBUG(dbgs() << " First candidate: " << Start << " .. " << End << "\n");
+ for (const StackRegion &R : Regions) {
+ LLVM_DEBUG(dbgs() << " Examining region: " << R.Start << " .. " << R.End
+ << ", range " << R.Range << "\n");
+ assert(End >= R.Start);
+ if (Start >= R.End) {
+ LLVM_DEBUG(dbgs() << " Does not intersect, skip.\n");
+ continue;
+ }
+ if (Obj.Range.overlaps(R.Range)) {
+ // Find the next appropriate location.
+ Start = AdjustStackOffset(R.End, Obj.Size, Obj.Alignment);
+ End = Start + Obj.Size;
+ LLVM_DEBUG(dbgs() << " Overlaps. Next candidate: " << Start << " .. "
+ << End << "\n");
+ continue;
+ }
+ if (End <= R.End) {
+ LLVM_DEBUG(dbgs() << " Reusing region(s).\n");
+ break;
+ }
+ }
+
+ unsigned LastRegionEnd = Regions.empty() ? 0 : Regions.back().End;
+ if (End > LastRegionEnd) {
+ // Insert a new region at the end. Maybe two.
+ if (Start > LastRegionEnd) {
+ LLVM_DEBUG(dbgs() << " Creating gap region: " << LastRegionEnd << " .. "
+ << Start << "\n");
+ Regions.emplace_back(LastRegionEnd, Start, StackLifetime::LiveRange(0));
+ LastRegionEnd = Start;
+ }
+ LLVM_DEBUG(dbgs() << " Creating new region: " << LastRegionEnd << " .. "
+ << End << ", range " << Obj.Range << "\n");
+ Regions.emplace_back(LastRegionEnd, End, Obj.Range);
+ LastRegionEnd = End;
+ }
+
+ // Split starting and ending regions if necessary.
+ for (unsigned i = 0; i < Regions.size(); ++i) {
+ StackRegion &R = Regions[i];
+ if (Start > R.Start && Start < R.End) {
+ StackRegion R0 = R;
+ R.Start = R0.End = Start;
+ Regions.insert(&R, R0);
+ continue;
+ }
+ if (End > R.Start && End < R.End) {
+ StackRegion R0 = R;
+ R0.End = R.Start = End;
+ Regions.insert(&R, R0);
+ break;
+ }
+ }
+
+ // Update live ranges for all affected regions.
+ for (StackRegion &R : Regions) {
+ if (Start < R.End && End > R.Start)
+ R.Range.join(Obj.Range);
+ if (End <= R.End)
+ break;
+ }
+
+ ObjectOffsets[Obj.Handle] = End;
+}
+
+void StackLayout::computeLayout() {
+ // Simple greedy algorithm.
+ // If this is replaced with something smarter, it must preserve the property
+ // that the first object is always at the offset 0 in the stack frame (for
+ // StackProtectorSlot), or handle stack protector in some other way.
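+ //
+ // Illustrative example (hypothetical sizes): given a 16-byte object A and an
+ // 8-byte object B whose live ranges do not overlap, layoutObject() lets B
+ // reuse A's region, so the frame stays 16 bytes instead of 24.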
+
+ // Sort objects by size (largest first) to reduce fragmentation.
+ if (StackObjects.size() > 2)
+ llvm::stable_sort(drop_begin(StackObjects),
+ [](const StackObject &a, const StackObject &b) {
+ return a.Size > b.Size;
+ });
+
+ for (auto &Obj : StackObjects)
+ layoutObject(Obj);
+
+ LLVM_DEBUG(print(dbgs()));
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SafeStackLayout.h b/contrib/llvm-project/llvm/lib/CodeGen/SafeStackLayout.h
new file mode 100644
index 000000000000..6126c7a67854
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SafeStackLayout.h
@@ -0,0 +1,84 @@
+//===- SafeStackLayout.h - SafeStack frame layout --------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
+#define LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/StackLifetime.h"
+
+namespace llvm {
+
+class raw_ostream;
+class Value;
+
+namespace safestack {
+
+/// Compute the layout of an unsafe stack frame.
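+///
+/// Typical use (as in SafeStack.cpp above): call addObject() once per unsafe
+/// object, then computeLayout(), then query getObjectOffset(),
+/// getObjectAlignment() and getFrameSize() to materialize the frame.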
+class StackLayout {
+ Align MaxAlignment;
+
+ struct StackRegion {
+ unsigned Start;
+ unsigned End;
+ StackLifetime::LiveRange Range;
+
+ StackRegion(unsigned Start, unsigned End,
+ const StackLifetime::LiveRange &Range)
+ : Start(Start), End(End), Range(Range) {}
+ };
+
+ /// The list of current stack regions, sorted by StackRegion::Start.
+ SmallVector<StackRegion, 16> Regions;
+
+ struct StackObject {
+ const Value *Handle;
+ unsigned Size;
+ Align Alignment;
+ StackLifetime::LiveRange Range;
+ };
+
+ SmallVector<StackObject, 8> StackObjects;
+
+ DenseMap<const Value *, unsigned> ObjectOffsets;
+ DenseMap<const Value *, Align> ObjectAlignments;
+
+ void layoutObject(StackObject &Obj);
+
+public:
+ StackLayout(Align StackAlignment) : MaxAlignment(StackAlignment) {}
+
+ /// Add an object to the stack frame. Value pointer is opaque and used as a
+ /// handle to retrieve the object's offset in the frame later.
+ void addObject(const Value *V, unsigned Size, Align Alignment,
+ const StackLifetime::LiveRange &Range);
+
+ /// Run the layout computation for all previously added objects.
+ void computeLayout();
+
+ /// Returns the offset to the object start in the stack frame.
+ unsigned getObjectOffset(const Value *V) { return ObjectOffsets[V]; }
+
+ /// Returns the alignment of the object.
+ Align getObjectAlignment(const Value *V) { return ObjectAlignments[V]; }
+
+ /// Returns the size of the entire frame.
+ unsigned getFrameSize() { return Regions.empty() ? 0 : Regions.back().End; }
+
+ /// Returns the alignment of the frame.
+ Align getFrameAlignment() { return MaxAlignment; }
+
+ void print(raw_ostream &OS);
+};
+
+} // end namespace safestack
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp
new file mode 100644
index 000000000000..cc29bdce1210
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SanitizerBinaryMetadata.cpp
@@ -0,0 +1,87 @@
+//===- SanitizerBinaryMetadata.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine-level part of SanitizerBinaryMetadata: it
+// appends the size of a covered function's stack arguments to the function's
+// !pcsections metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include <algorithm>
+
+using namespace llvm;
+
+namespace {
+class MachineSanitizerBinaryMetadata : public MachineFunctionPass {
+public:
+ static char ID;
+
+ MachineSanitizerBinaryMetadata();
+ bool runOnMachineFunction(MachineFunction &F) override;
+};
+} // namespace
+
+INITIALIZE_PASS(MachineSanitizerBinaryMetadata, "machine-sanmd",
+ "Machine Sanitizer Binary Metadata", false, false)
+
+char MachineSanitizerBinaryMetadata::ID = 0;
+char &llvm::MachineSanitizerBinaryMetadataID =
+ MachineSanitizerBinaryMetadata::ID;
+
+MachineSanitizerBinaryMetadata::MachineSanitizerBinaryMetadata()
+ : MachineFunctionPass(ID) {
+ initializeMachineSanitizerBinaryMetadataPass(
+ *PassRegistry::getPassRegistry());
+}
+
+bool MachineSanitizerBinaryMetadata::runOnMachineFunction(MachineFunction &MF) {
+ MDNode *MD = MF.getFunction().getMetadata(LLVMContext::MD_pcsections);
+ if (!MD)
+ return false;
+ const auto &Section = *cast<MDString>(MD->getOperand(0));
+ if (!Section.getString().startswith(kSanitizerBinaryMetadataCoveredSection))
+ return false;
+ auto &AuxMDs = *cast<MDTuple>(MD->getOperand(1));
+ // Assume it currently only has features.
+ assert(AuxMDs.getNumOperands() == 1);
+ Constant *Features =
+ cast<ConstantAsMetadata>(AuxMDs.getOperand(0))->getValue();
+ if (!Features->getUniqueInteger()[kSanitizerBinaryMetadataUARBit])
+ return false;
+ // Calculate size of stack args for the function.
+ int64_t Size = 0;
+ uint64_t Align = 0;
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ for (int i = -1; i >= (int)-MFI.getNumFixedObjects(); --i) {
+ Size = std::max(Size, MFI.getObjectOffset(i) + MFI.getObjectSize(i));
+ Align = std::max(Align, MFI.getObjectAlign(i).value());
+ }
+ Size = (Size + Align - 1) & ~(Align - 1);
+ if (!Size)
+ return false;
+ // Non-zero size, update metadata.
+ auto &F = MF.getFunction();
+ IRBuilder<> IRB(F.getContext());
+ MDBuilder MDB(F.getContext());
+ // Keep the features and append size of stack args to the metadata.
+ APInt NewFeatures = Features->getUniqueInteger();
+ NewFeatures.setBit(kSanitizerBinaryMetadataUARHasSizeBit);
+ F.setMetadata(
+ LLVMContext::MD_pcsections,
+ MDB.createPCSections({{Section.getString(),
+ {IRB.getInt(NewFeatures), IRB.getInt32(Size)}}}));
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAG.cpp
new file mode 100644
index 000000000000..14ec41920e3e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -0,0 +1,754 @@
+//===- ScheduleDAG.cpp - Implement the ScheduleDAG class ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements the ScheduleDAG class, which is a base class used by
+/// scheduling implementation classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(NumNewPredsAdded, "Number of times a single predecessor was added");
+STATISTIC(NumTopoInits,
+ "Number of times the topological order has been recomputed");
+
+#ifndef NDEBUG
+static cl::opt<bool> StressSchedOpt(
+ "stress-sched", cl::Hidden, cl::init(false),
+ cl::desc("Stress test instruction scheduling"));
+#endif
+
+void SchedulingPriorityQueue::anchor() {}
+
+ScheduleDAG::ScheduleDAG(MachineFunction &mf)
+ : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()),
+ TRI(mf.getSubtarget().getRegisterInfo()), MF(mf),
+ MRI(mf.getRegInfo()) {
+#ifndef NDEBUG
+ StressSched = StressSchedOpt;
+#endif
+}
+
+ScheduleDAG::~ScheduleDAG() = default;
+
+void ScheduleDAG::clearDAG() {
+ SUnits.clear();
+ EntrySU = SUnit();
+ ExitSU = SUnit();
+}
+
+const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
+ if (!Node || !Node->isMachineOpcode()) return nullptr;
+ return &TII->get(Node->getMachineOpcode());
+}
+
+LLVM_DUMP_METHOD void SDep::dump(const TargetRegisterInfo *TRI) const {
+ switch (getKind()) {
+ case Data: dbgs() << "Data"; break;
+ case Anti: dbgs() << "Anti"; break;
+ case Output: dbgs() << "Out "; break;
+ case Order: dbgs() << "Ord "; break;
+ }
+
+ switch (getKind()) {
+ case Data:
+ dbgs() << " Latency=" << getLatency();
+ if (TRI && isAssignedRegDep())
+ dbgs() << " Reg=" << printReg(getReg(), TRI);
+ break;
+ case Anti:
+ case Output:
+ dbgs() << " Latency=" << getLatency();
+ break;
+ case Order:
+ dbgs() << " Latency=" << getLatency();
+ switch(Contents.OrdKind) {
+ case Barrier: dbgs() << " Barrier"; break;
+ case MayAliasMem:
+ case MustAliasMem: dbgs() << " Memory"; break;
+ case Artificial: dbgs() << " Artificial"; break;
+ case Weak: dbgs() << " Weak"; break;
+ case Cluster: dbgs() << " Cluster"; break;
+ }
+ break;
+ }
+}
+
+bool SUnit::addPred(const SDep &D, bool Required) {
+ // If this node already has this dependence, don't add a redundant one.
+ for (SDep &PredDep : Preds) {
+ // Zero-latency weak edges may be added purely for heuristic ordering. Don't
+ // add them if another kind of edge already exists.
+ if (!Required && PredDep.getSUnit() == D.getSUnit())
+ return false;
+ if (PredDep.overlaps(D)) {
+ // Extend the latency if needed. Equivalent to
+ // removePred(PredDep) + addPred(D).
+ if (PredDep.getLatency() < D.getLatency()) {
+ SUnit *PredSU = PredDep.getSUnit();
+ // Find the corresponding successor in N.
+ SDep ForwardD = PredDep;
+ ForwardD.setSUnit(this);
+ for (SDep &SuccDep : PredSU->Succs) {
+ if (SuccDep == ForwardD) {
+ SuccDep.setLatency(D.getLatency());
+ break;
+ }
+ }
+ PredDep.setLatency(D.getLatency());
+ }
+ return false;
+ }
+ }
+ // Now add a corresponding succ to N.
+ SDep P = D;
+ P.setSUnit(this);
+ SUnit *N = D.getSUnit();
+ // Update the bookkeeping.
+ if (D.getKind() == SDep::Data) {
+ assert(NumPreds < std::numeric_limits<unsigned>::max() &&
+ "NumPreds will overflow!");
+ assert(N->NumSuccs < std::numeric_limits<unsigned>::max() &&
+ "NumSuccs will overflow!");
+ ++NumPreds;
+ ++N->NumSuccs;
+ }
+ if (!N->isScheduled) {
+ if (D.isWeak()) {
+ ++WeakPredsLeft;
+ }
+ else {
+ assert(NumPredsLeft < std::numeric_limits<unsigned>::max() &&
+ "NumPredsLeft will overflow!");
+ ++NumPredsLeft;
+ }
+ }
+ if (!isScheduled) {
+ if (D.isWeak()) {
+ ++N->WeakSuccsLeft;
+ }
+ else {
+ assert(N->NumSuccsLeft < std::numeric_limits<unsigned>::max() &&
+ "NumSuccsLeft will overflow!");
+ ++N->NumSuccsLeft;
+ }
+ }
+ Preds.push_back(D);
+ N->Succs.push_back(P);
+ if (P.getLatency() != 0) {
+ this->setDepthDirty();
+ N->setHeightDirty();
+ }
+ return true;
+}
+
+void SUnit::removePred(const SDep &D) {
+ // Find the matching predecessor.
+ SmallVectorImpl<SDep>::iterator I = llvm::find(Preds, D);
+ if (I == Preds.end())
+ return;
+ // Find the corresponding successor in N.
+ SDep P = D;
+ P.setSUnit(this);
+ SUnit *N = D.getSUnit();
+ SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P);
+ assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
+ // Update the bookkeeping.
+ if (P.getKind() == SDep::Data) {
+ assert(NumPreds > 0 && "NumPreds will underflow!");
+ assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
+ --NumPreds;
+ --N->NumSuccs;
+ }
+ if (!N->isScheduled) {
+ if (D.isWeak()) {
+ assert(WeakPredsLeft > 0 && "WeakPredsLeft will underflow!");
+ --WeakPredsLeft;
+ } else {
+ assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
+ --NumPredsLeft;
+ }
+ }
+ if (!isScheduled) {
+ if (D.isWeak()) {
+ assert(WeakSuccsLeft > 0 && "WeakSuccsLeft will underflow!");
+ --N->WeakSuccsLeft;
+ } else {
+ assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
+ --N->NumSuccsLeft;
+ }
+ }
+ N->Succs.erase(Succ);
+ Preds.erase(I);
+ if (P.getLatency() != 0) {
+ this->setDepthDirty();
+ N->setHeightDirty();
+ }
+}
+
+void SUnit::setDepthDirty() {
+ if (!isDepthCurrent) return;
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *SU = WorkList.pop_back_val();
+ SU->isDepthCurrent = false;
+ for (SDep &SuccDep : SU->Succs) {
+ SUnit *SuccSU = SuccDep.getSUnit();
+ if (SuccSU->isDepthCurrent)
+ WorkList.push_back(SuccSU);
+ }
+ } while (!WorkList.empty());
+}
+
+void SUnit::setHeightDirty() {
+ if (!isHeightCurrent) return;
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *SU = WorkList.pop_back_val();
+ SU->isHeightCurrent = false;
+ for (SDep &PredDep : SU->Preds) {
+ SUnit *PredSU = PredDep.getSUnit();
+ if (PredSU->isHeightCurrent)
+ WorkList.push_back(PredSU);
+ }
+ } while (!WorkList.empty());
+}
+
+void SUnit::setDepthToAtLeast(unsigned NewDepth) {
+ if (NewDepth <= getDepth())
+ return;
+ setDepthDirty();
+ Depth = NewDepth;
+ isDepthCurrent = true;
+}
+
+void SUnit::setHeightToAtLeast(unsigned NewHeight) {
+ if (NewHeight <= getHeight())
+ return;
+ setHeightDirty();
+ Height = NewHeight;
+ isHeightCurrent = true;
+}
+
+/// Calculates the maximal path from a node with no predecessors to this node
+/// (its depth).
+void SUnit::ComputeDepth() {
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *Cur = WorkList.back();
+
+ bool Done = true;
+ unsigned MaxPredDepth = 0;
+ for (const SDep &PredDep : Cur->Preds) {
+ SUnit *PredSU = PredDep.getSUnit();
+ if (PredSU->isDepthCurrent)
+ MaxPredDepth = std::max(MaxPredDepth,
+ PredSU->Depth + PredDep.getLatency());
+ else {
+ Done = false;
+ WorkList.push_back(PredSU);
+ }
+ }
+
+ if (Done) {
+ WorkList.pop_back();
+ if (MaxPredDepth != Cur->Depth) {
+ Cur->setDepthDirty();
+ Cur->Depth = MaxPredDepth;
+ }
+ Cur->isDepthCurrent = true;
+ }
+ } while (!WorkList.empty());
+}
+
+/// Calculates the maximal path from this node to a node with no successors
+/// (its height).
+void SUnit::ComputeHeight() {
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *Cur = WorkList.back();
+
+ bool Done = true;
+ unsigned MaxSuccHeight = 0;
+ for (const SDep &SuccDep : Cur->Succs) {
+ SUnit *SuccSU = SuccDep.getSUnit();
+ if (SuccSU->isHeightCurrent)
+ MaxSuccHeight = std::max(MaxSuccHeight,
+ SuccSU->Height + SuccDep.getLatency());
+ else {
+ Done = false;
+ WorkList.push_back(SuccSU);
+ }
+ }
+
+ if (Done) {
+ WorkList.pop_back();
+ if (MaxSuccHeight != Cur->Height) {
+ Cur->setHeightDirty();
+ Cur->Height = MaxSuccHeight;
+ }
+ Cur->isHeightCurrent = true;
+ }
+ } while (!WorkList.empty());
+}
+
+void SUnit::biasCriticalPath() {
+ if (NumPreds < 2)
+ return;
+
+ SUnit::pred_iterator BestI = Preds.begin();
+ unsigned MaxDepth = BestI->getSUnit()->getDepth();
+ for (SUnit::pred_iterator I = std::next(BestI), E = Preds.end(); I != E;
+ ++I) {
+ if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth)
+ BestI = I;
+ }
+ if (BestI != Preds.begin())
+ std::swap(*Preds.begin(), *BestI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SUnit::dumpAttributes() const {
+ dbgs() << " # preds left : " << NumPredsLeft << "\n";
+ dbgs() << " # succs left : " << NumSuccsLeft << "\n";
+ if (WeakPredsLeft)
+ dbgs() << " # weak preds left : " << WeakPredsLeft << "\n";
+ if (WeakSuccsLeft)
+ dbgs() << " # weak succs left : " << WeakSuccsLeft << "\n";
+ dbgs() << " # rdefs left : " << NumRegDefsLeft << "\n";
+ dbgs() << " Latency : " << Latency << "\n";
+ dbgs() << " Depth : " << getDepth() << "\n";
+ dbgs() << " Height : " << getHeight() << "\n";
+}
+
+LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeName(const SUnit &SU) const {
+ if (&SU == &EntrySU)
+ dbgs() << "EntrySU";
+ else if (&SU == &ExitSU)
+ dbgs() << "ExitSU";
+ else
+ dbgs() << "SU(" << SU.NodeNum << ")";
+}
+
+LLVM_DUMP_METHOD void ScheduleDAG::dumpNodeAll(const SUnit &SU) const {
+ dumpNode(SU);
+ SU.dumpAttributes();
+ if (SU.Preds.size() > 0) {
+ dbgs() << " Predecessors:\n";
+ for (const SDep &Dep : SU.Preds) {
+ dbgs() << " ";
+ dumpNodeName(*Dep.getSUnit());
+ dbgs() << ": ";
+ Dep.dump(TRI);
+ dbgs() << '\n';
+ }
+ }
+ if (SU.Succs.size() > 0) {
+ dbgs() << " Successors:\n";
+ for (const SDep &Dep : SU.Succs) {
+ dbgs() << " ";
+ dumpNodeName(*Dep.getSUnit());
+ dbgs() << ": ";
+ Dep.dump(TRI);
+ dbgs() << '\n';
+ }
+ }
+}
+#endif
+
+#ifndef NDEBUG
+unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
+ bool AnyNotSched = false;
+ unsigned DeadNodes = 0;
+ for (const SUnit &SUnit : SUnits) {
+ if (!SUnit.isScheduled) {
+ if (SUnit.NumPreds == 0 && SUnit.NumSuccs == 0) {
+ ++DeadNodes;
+ continue;
+ }
+ if (!AnyNotSched)
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(SUnit);
+ dbgs() << "has not been scheduled!\n";
+ AnyNotSched = true;
+ }
+ if (SUnit.isScheduled &&
+ (isBottomUp ? SUnit.getHeight() : SUnit.getDepth()) >
+ unsigned(std::numeric_limits<int>::max())) {
+ if (!AnyNotSched)
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(SUnit);
+ dbgs() << "has an unexpected "
+ << (isBottomUp ? "Height" : "Depth") << " value!\n";
+ AnyNotSched = true;
+ }
+ if (isBottomUp) {
+ if (SUnit.NumSuccsLeft != 0) {
+ if (!AnyNotSched)
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(SUnit);
+ dbgs() << "has successors left!\n";
+ AnyNotSched = true;
+ }
+ } else {
+ if (SUnit.NumPredsLeft != 0) {
+ if (!AnyNotSched)
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(SUnit);
+ dbgs() << "has predecessors left!\n";
+ AnyNotSched = true;
+ }
+ }
+ }
+ assert(!AnyNotSched);
+ return SUnits.size() - DeadNodes;
+}
+#endif
+
+void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
+ // The idea of the algorithm is taken from
+ // "Online algorithms for managing the topological order of
+ // a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
+ // This is the MNR algorithm, which was first introduced by
+ // A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+ // "Maintaining a topological order under edge insertions".
+ //
+ // Short description of the algorithm:
+ //
+ // Topological ordering, ord, of a DAG maps each node to a topological
+ // index so that for all edges X->Y it is the case that ord(X) < ord(Y).
+ //
+ // This means that if there is a path from the node X to the node Z,
+ // then ord(X) < ord(Z).
+ //
+ // This property can be used to check for reachability of nodes:
+ // if Z is reachable from X, then an insertion of the edge Z->X would
+ // create a cycle.
+ //
+ // The algorithm first computes a topological ordering for the DAG by
+ // initializing the Index2Node and Node2Index arrays and then tries to keep
+ // the ordering up-to-date after edge insertions by reordering the DAG.
+ //
+ // On insertion of the edge X->Y, the algorithm first marks by calling DFS
+ // the nodes reachable from Y, and then shifts them using Shift to lie
+ // immediately after X in Index2Node.
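+ //
+ // Tiny worked example (hypothetical): with ord = {A:0, B:1, C:2, D:3} and a
+ // new edge D->B (assuming B has no other successors in that index range),
+ // DFS from B marks {B} and Shift renumbers to {A:0, C:1, D:2, B:3},
+ // restoring ord(D) < ord(B).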
+
+ // Cancel pending updates, mark as valid.
+ Dirty = false;
+ Updates.clear();
+
+ unsigned DAGSize = SUnits.size();
+ std::vector<SUnit*> WorkList;
+ WorkList.reserve(DAGSize);
+
+ Index2Node.resize(DAGSize);
+ Node2Index.resize(DAGSize);
+
+ // Initialize the data structures.
+ if (ExitSU)
+ WorkList.push_back(ExitSU);
+ for (SUnit &SU : SUnits) {
+ int NodeNum = SU.NodeNum;
+ unsigned Degree = SU.Succs.size();
+ // Temporarily use the Node2Index array as scratch space for degree counts.
+ Node2Index[NodeNum] = Degree;
+
+ // Is it a node without dependencies?
+ if (Degree == 0) {
+ assert(SU.Succs.empty() && "SUnit should have no successors");
+ // Collect leaf nodes.
+ WorkList.push_back(&SU);
+ }
+ }
+
+ int Id = DAGSize;
+ while (!WorkList.empty()) {
+ SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ if (SU->NodeNum < DAGSize)
+ Allocate(SU->NodeNum, --Id);
+ for (const SDep &PredDep : SU->Preds) {
+ SUnit *SU = PredDep.getSUnit();
+ if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum])
+ // If all dependencies of the node are processed already,
+ // then the node can be computed now.
+ WorkList.push_back(SU);
+ }
+ }
+
+ Visited.resize(DAGSize);
+ NumTopoInits++;
+
+#ifndef NDEBUG
+ // Check correctness of the ordering
+ for (SUnit &SU : SUnits) {
+ for (const SDep &PD : SU.Preds) {
+ assert(Node2Index[SU.NodeNum] > Node2Index[PD.getSUnit()->NodeNum] &&
+ "Wrong topological sorting");
+ }
+ }
+#endif
+}
+
+void ScheduleDAGTopologicalSort::FixOrder() {
+ // Recompute from scratch after new nodes have been added.
+ if (Dirty) {
+ InitDAGTopologicalSorting();
+ return;
+ }
+
+ // Otherwise apply updates one-by-one.
+ for (auto &U : Updates)
+ AddPred(U.first, U.second);
+ Updates.clear();
+}
+
+void ScheduleDAGTopologicalSort::AddPredQueued(SUnit *Y, SUnit *X) {
+ // Recomputing the order from scratch is likely more efficient than applying
+ // updates one-by-one for too many updates. The current cut-off is arbitrarily
+ // chosen.
+ Dirty = Dirty || Updates.size() > 10;
+
+ if (Dirty)
+ return;
+
+ Updates.emplace_back(Y, X);
+}
+
+void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
+ int UpperBound, LowerBound;
+ LowerBound = Node2Index[Y->NodeNum];
+ UpperBound = Node2Index[X->NodeNum];
+ bool HasLoop = false;
+ // Is Ord(X) < Ord(Y) ?
+ if (LowerBound < UpperBound) {
+ // Update the topological order.
+ Visited.reset();
+ DFS(Y, UpperBound, HasLoop);
+ assert(!HasLoop && "Inserted edge creates a loop!");
+ // Recompute topological indexes.
+ Shift(Visited, LowerBound, UpperBound);
+ }
+
+ NumNewPredsAdded++;
+}
+
+void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
+ // Removing an edge never invalidates an existing topological order, so
+ // there is nothing to recompute here (no call to InitDAGTopologicalSorting).
+}
+
+void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
+ bool &HasLoop) {
+ std::vector<const SUnit*> WorkList;
+ WorkList.reserve(SUnits.size());
+
+ WorkList.push_back(SU);
+ do {
+ SU = WorkList.back();
+ WorkList.pop_back();
+ Visited.set(SU->NodeNum);
+ for (const SDep &SuccDep : llvm::reverse(SU->Succs)) {
+ unsigned s = SuccDep.getSUnit()->NodeNum;
+ // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
+ if (s >= Node2Index.size())
+ continue;
+ if (Node2Index[s] == UpperBound) {
+ HasLoop = true;
+ return;
+ }
+ // Visit successors if not already and in affected region.
+ if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+ WorkList.push_back(SuccDep.getSUnit());
+ }
+ }
+ } while (!WorkList.empty());
+}
+
+std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU,
+ const SUnit &TargetSU,
+ bool &Success) {
+ std::vector<const SUnit*> WorkList;
+ int LowerBound = Node2Index[StartSU.NodeNum];
+ int UpperBound = Node2Index[TargetSU.NodeNum];
+ bool Found = false;
+ BitVector VisitedBack;
+ std::vector<int> Nodes;
+
+ if (LowerBound > UpperBound) {
+ Success = false;
+ return Nodes;
+ }
+
+ WorkList.reserve(SUnits.size());
+ Visited.reset();
+
+ // Starting from StartSU, visit all successors up
+ // to UpperBound.
+ WorkList.push_back(&StartSU);
+ do {
+ const SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ for (const SDep &SD : llvm::reverse(SU->Succs)) {
+ const SUnit *Succ = SD.getSUnit();
+ unsigned s = Succ->NodeNum;
+ // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
+ if (Succ->isBoundaryNode())
+ continue;
+ if (Node2Index[s] == UpperBound) {
+ Found = true;
+ continue;
+ }
+ // Visit successors if not already and in affected region.
+ if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+ Visited.set(s);
+ WorkList.push_back(Succ);
+ }
+ }
+ } while (!WorkList.empty());
+
+ if (!Found) {
+ Success = false;
+ return Nodes;
+ }
+
+ WorkList.clear();
+ VisitedBack.resize(SUnits.size());
+ Found = false;
+
+ // Starting from TargetSU, visit all predecessors up
+ // to LowerBound. SUs that are visited by the two
+ // passes are added to Nodes.
+ WorkList.push_back(&TargetSU);
+ do {
+ const SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ for (const SDep &SD : llvm::reverse(SU->Preds)) {
+ const SUnit *Pred = SD.getSUnit();
+ unsigned s = Pred->NodeNum;
+ // Edges to non-SUnits are allowed but ignored (e.g. EntrySU).
+ if (Pred->isBoundaryNode())
+ continue;
+ if (Node2Index[s] == LowerBound) {
+ Found = true;
+ continue;
+ }
+ if (!VisitedBack.test(s) && Visited.test(s)) {
+ VisitedBack.set(s);
+ WorkList.push_back(Pred);
+ Nodes.push_back(s);
+ }
+ }
+ } while (!WorkList.empty());
+
+ assert(Found && "Error in SUnit Graph!");
+ Success = true;
+ return Nodes;
+}
+
+void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
+ int UpperBound) {
+ std::vector<int> L;
+ int shift = 0;
+ int i;
+
+ for (i = LowerBound; i <= UpperBound; ++i) {
+ // w is node at topological index i.
+ int w = Index2Node[i];
+ if (Visited.test(w)) {
+ // Unmark.
+ Visited.reset(w);
+ L.push_back(w);
+ shift = shift + 1;
+ } else {
+ Allocate(w, i - shift);
+ }
+ }
+
+ for (unsigned LI : L) {
+ Allocate(LI, i - shift);
+ i = i + 1;
+ }
+}
+
+bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {
+ FixOrder();
+ // Is SU reachable from TargetSU via successor edges?
+ if (IsReachable(SU, TargetSU))
+ return true;
+ for (const SDep &PredDep : TargetSU->Preds)
+ if (PredDep.isAssignedRegDep() &&
+ IsReachable(SU, PredDep.getSUnit()))
+ return true;
+ return false;
+}
+
+void ScheduleDAGTopologicalSort::AddSUnitWithoutPredecessors(const SUnit *SU) {
+ assert(SU->NodeNum == Index2Node.size() && "Node must be added at the end");
+ assert(SU->NumPreds == 0 && "Can only add SU's with no predecessors");
+ Node2Index.push_back(Index2Node.size());
+ Index2Node.push_back(SU->NodeNum);
+ Visited.resize(Node2Index.size());
+}
+
+bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
+ const SUnit *TargetSU) {
+ assert(TargetSU != nullptr && "Invalid target SUnit");
+ assert(SU != nullptr && "Invalid SUnit");
+ FixOrder();
+ // If insertion of the edge SU->TargetSU would create a cycle
+ // then there is a path from TargetSU to SU.
+ int UpperBound, LowerBound;
+ LowerBound = Node2Index[TargetSU->NodeNum];
+ UpperBound = Node2Index[SU->NodeNum];
+ bool HasLoop = false;
+ // Is Ord(TargetSU) < Ord(SU) ?
+ if (LowerBound < UpperBound) {
+ Visited.reset();
+ // There may be a path from TargetSU to SU. Check for it.
+ DFS(TargetSU, UpperBound, HasLoop);
+ }
+ return HasLoop;
+}
+
+void ScheduleDAGTopologicalSort::Allocate(int n, int index) {
+ Node2Index[n] = index;
+ Index2Node[index] = n;
+}
+
+ScheduleDAGTopologicalSort::
+ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu)
+ : SUnits(sunits), ExitSU(exitsu) {}
+
+ScheduleHazardRecognizer::~ScheduleHazardRecognizer() = default;
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
new file mode 100644
index 000000000000..239b44857c28
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -0,0 +1,1531 @@
+//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This implements the ScheduleDAGInstrs class, which implements
+/// re-scheduling of MachineInstrs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+static cl::opt<bool>
+ EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
+ cl::desc("Enable use of AA during MI DAG construction"));
+
+static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
+ cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
+
+// Note: the two options below might be used in tuning compile time vs
+// output quality. Setting HugeRegion so large that it will never be
+// reached means best-effort, but may be slow.
+
+// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
+// together hold this many SUs, a reduction of maps will be done.
+static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
+ cl::init(1000), cl::desc("The limit to use while constructing the DAG "
+ "prior to scheduling, at which point a trade-off "
+ "is made to avoid excessive compile time."));
+
+static cl::opt<unsigned> ReductionSize(
+ "dag-maps-reduction-size", cl::Hidden,
+ cl::desc("A huge scheduling region will have maps reduced by this many "
+ "nodes at a time. Defaults to HugeRegion / 2."));
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+static cl::opt<bool> SchedPrintCycles(
+ "sched-print-cycles", cl::Hidden, cl::init(false),
+ cl::desc("Report top/bottom cycles when dumping SUnit instances"));
+#endif
+
+static unsigned getReductionSize() {
+ // Always reduce a huge region with half of the elements, except
+ // when user sets this number explicitly.
+ if (ReductionSize.getNumOccurrences() == 0)
+ return HugeRegion / 2;
+ return ReductionSize;
+}
+
+static void dumpSUList(const ScheduleDAGInstrs::SUList &L) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dbgs() << "{ ";
+ for (const SUnit *SU : L) {
+ dbgs() << "SU(" << SU->NodeNum << ")";
+ if (SU != L.back())
+ dbgs() << ", ";
+ }
+ dbgs() << "}\n";
+#endif
+}
+
+ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
+ const MachineLoopInfo *mli,
+ bool RemoveKillFlags)
+ : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
+ RemoveKillFlags(RemoveKillFlags),
+ UnknownValue(UndefValue::get(
+ Type::getVoidTy(mf.getFunction().getContext()))), Topo(SUnits, &ExitSU) {
+ DbgValues.clear();
+
+ const TargetSubtargetInfo &ST = mf.getSubtarget();
+ SchedModel.init(&ST);
+}
+
+/// If this machine instr has memory reference information and it can be
+/// tracked to a normal reference to a known object, record the Value(s)
+/// for that object in \p Objects and return true. Returns false if the
+/// memory location is unknown or may alias anything.
+static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
+ const MachineFrameInfo &MFI,
+ UnderlyingObjectsVector &Objects,
+ const DataLayout &DL) {
+ auto AllMMOsOkay = [&]() {
+ for (const MachineMemOperand *MMO : MI->memoperands()) {
+ // TODO: Figure out whether isAtomic is really necessary (see D57601).
+ if (MMO->isVolatile() || MMO->isAtomic())
+ return false;
+
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
+ // Functions that contain tail calls don't have unique PseudoSourceValue
+ // objects. Two PseudoSourceValues might refer to the same or
+ // overlapping locations. The client code calling this function assumes
+ // this is not the case. So return a conservative answer of no known
+ // object.
+ if (MFI.hasTailCall())
+ return false;
+
+ // For now, ignore PseudoSourceValues which may alias LLVM IR values
+ // because the code that uses this function has no way to cope with
+ // such aliases.
+ if (PSV->isAliased(&MFI))
+ return false;
+
+ bool MayAlias = PSV->mayAlias(&MFI);
+ Objects.emplace_back(PSV, MayAlias);
+ } else if (const Value *V = MMO->getValue()) {
+ SmallVector<Value *, 4> Objs;
+ if (!getUnderlyingObjectsForCodeGen(V, Objs))
+ return false;
+
+ for (Value *V : Objs) {
+ assert(isIdentifiedObject(V));
+ Objects.emplace_back(V, true);
+ }
+ } else
+ return false;
+ }
+ return true;
+ };
+
+ if (!AllMMOsOkay()) {
+ Objects.clear();
+ return false;
+ }
+
+ return true;
+}
+
+void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
+ BB = bb;
+}
+
+void ScheduleDAGInstrs::finishBlock() {
+ // Subclasses should no longer refer to the old block.
+ BB = nullptr;
+}
+
+void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) {
+ assert(bb == BB && "startBlock should set BB");
+ RegionBegin = begin;
+ RegionEnd = end;
+ NumRegionInstrs = regioninstrs;
+}
+
+void ScheduleDAGInstrs::exitRegion() {
+ // Nothing to do.
+}
+
+void ScheduleDAGInstrs::addSchedBarrierDeps() {
+ MachineInstr *ExitMI =
+ RegionEnd != BB->end()
+ ? &*skipDebugInstructionsBackward(RegionEnd, RegionBegin)
+ : nullptr;
+ ExitSU.setInstr(ExitMI);
+ // Add dependencies on the defs and uses of the instruction.
+ if (ExitMI) {
+ for (const MachineOperand &MO : ExitMI->all_uses()) {
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
+ } else if (Reg.isVirtual() && MO.readsReg()) {
+ addVRegUseDeps(&ExitSU, MO.getOperandNo());
+ }
+ }
+ }
+ if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
+ // For others, e.g. fallthrough, conditional branch, assume the exit
+ // uses all the registers that are livein to the successor blocks.
+ for (const MachineBasicBlock *Succ : BB->successors()) {
+ for (const auto &LI : Succ->liveins()) {
+ if (!Uses.contains(LI.PhysReg))
+ Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
+ }
+ }
+ }
+}
+
+/// MO is an operand of SU's instruction that defines a physical register. Adds
+/// data dependencies from SU to any uses of the physical register.
+void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
+ const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
+ assert(MO.isDef() && "expect physreg def");
+
+ // Ask the target if address-backscheduling is desirable, and if so how much.
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+
+ // Only use any non-zero latency for real defs/uses, in contrast to
+ // "fake" operands added by regalloc.
+ const MCInstrDesc *DefMIDesc = &SU->getInstr()->getDesc();
+ bool ImplicitPseudoDef = (OperIdx >= DefMIDesc->getNumOperands() &&
+ !DefMIDesc->hasImplicitDefOfPhysReg(MO.getReg()));
+ for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+ Alias.isValid(); ++Alias) {
+ for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
+ SUnit *UseSU = I->SU;
+ if (UseSU == SU)
+ continue;
+
+ // Adjust the dependence latency using operand def/use information,
+ // then allow the target to perform its own adjustments.
+ int UseOp = I->OpIdx;
+ MachineInstr *RegUse = nullptr;
+ SDep Dep;
+ if (UseOp < 0)
+ Dep = SDep(SU, SDep::Artificial);
+ else {
+ // Set the hasPhysRegDefs only for physreg defs that have a use within
+ // the scheduling region.
+ SU->hasPhysRegDefs = true;
+ Dep = SDep(SU, SDep::Data, *Alias);
+ RegUse = UseSU->getInstr();
+ }
+ const MCInstrDesc *UseMIDesc =
+ (RegUse ? &UseSU->getInstr()->getDesc() : nullptr);
+ bool ImplicitPseudoUse =
+ (UseMIDesc && UseOp >= ((int)UseMIDesc->getNumOperands()) &&
+ !UseMIDesc->hasImplicitUseOfPhysReg(*Alias));
+ if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
+ Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+ RegUse, UseOp));
+ } else {
+ Dep.setLatency(0);
+ }
+ ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOp, Dep);
+ UseSU->addPred(Dep);
+ }
+ }
+}
+
+/// Adds register dependencies (data, anti, and output) from this SUnit
+/// to following instructions in the same scheduling region that depend on the
+/// physical register referenced at OperIdx.
+void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
+ MachineInstr *MI = SU->getInstr();
+ MachineOperand &MO = MI->getOperand(OperIdx);
+ Register Reg = MO.getReg();
+ // We do not need to track any dependencies for constant registers.
+ if (MRI.isConstantPhysReg(Reg))
+ return;
+
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+
+ // Optionally add output and anti dependencies. For anti
+ // dependencies we use a latency of 0 because for a multi-issue
+ // target we want to allow the defining instruction to issue
+ // in the same cycle as the using instruction.
+ // TODO: Using a latency of 1 here for output dependencies assumes
+ // there's no cost for reusing registers.
+ SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
+ for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
+ if (!Defs.contains(*Alias))
+ continue;
+ for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
+ SUnit *DefSU = I->SU;
+ if (DefSU == &ExitSU)
+ continue;
+ if (DefSU != SU &&
+ (Kind != SDep::Output || !MO.isDead() ||
+ !DefSU->getInstr()->registerDefIsDead(*Alias))) {
+ SDep Dep(SU, Kind, /*Reg=*/*Alias);
+ if (Kind != SDep::Anti)
+ Dep.setLatency(
+ SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
+ ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep);
+ DefSU->addPred(Dep);
+ }
+ }
+ }
+
+ if (!MO.isDef()) {
+ SU->hasPhysRegUses = true;
+ // Either insert a new Reg2SUnits entry with an empty SUnits list, or
+ // retrieve the existing SUnits list for this register's uses.
+ // Push this SUnit on the use list.
+ Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
+ if (RemoveKillFlags)
+ MO.setIsKill(false);
+ } else {
+ addPhysRegDataDeps(SU, OperIdx);
+
+    // Clear previous uses and defs of this register and its subregisters.
+ for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) {
+ if (Uses.contains(SubReg))
+ Uses.eraseAll(SubReg);
+ if (!MO.isDead())
+ Defs.eraseAll(SubReg);
+ }
+ if (MO.isDead() && SU->isCall) {
+ // Calls will not be reordered because of chain dependencies (see
+ // below). Since call operands are dead, calls may continue to be added
+ // to the DefList making dependence checking quadratic in the size of
+ // the block. Instead, we leave only one call at the back of the
+ // DefList.
+ Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
+ Reg2SUnitsMap::iterator B = P.first;
+ Reg2SUnitsMap::iterator I = P.second;
+ for (bool isBegin = I == B; !isBegin; /* empty */) {
+ isBegin = (--I) == B;
+ if (!I->SU->isCall)
+ break;
+ I = Defs.erase(I);
+ }
+ }
+
+ // Defs are pushed in the order they are visited and never reordered.
+ Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
+ }
+}
+
+LaneBitmask
+ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const {
+ Register Reg = MO.getReg();
+ // No point in tracking lanemasks if we don't have interesting subregisters.
+ const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
+ if (!RC.HasDisjunctSubRegs)
+ return LaneBitmask::getAll();
+
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg == 0)
+ return RC.getLaneMask();
+ return TRI->getSubRegIndexLaneMask(SubReg);
+}
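+
+// Illustrative sketch (not from the original source): lane masks make partial
+// overlap between subregister accesses explicit. Assuming hypothetical
+// subregister indices whose masks are 0x3 (low half) and 0xC (high half):
+//   LaneBitmask Lo(0x3), Hi(0xC), Full = Lo | Hi;
+//   (Lo & Hi).none();    // true: a def of the low half does not feed a use
+//                        // of the high half
+//   (Lo & Full).any();   // true: but it does feed a use of the full register
+// When the register class has no disjunct subregisters, getLaneMaskForMO
+// returns getAll() and the tracking degenerates to a single lane.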
+
+bool ScheduleDAGInstrs::deadDefHasNoUse(const MachineOperand &MO) {
+ auto RegUse = CurrentVRegUses.find(MO.getReg());
+ if (RegUse == CurrentVRegUses.end())
+ return true;
+ return (RegUse->LaneMask & getLaneMaskForMO(MO)).none();
+}
+
+/// Adds register output and data dependencies from this SUnit to instructions
+/// that occur later in the same scheduling region if they read from or write to
+/// the virtual register defined at OperIdx.
+///
+/// TODO: Hoist loop induction variable increments. This has to be
+/// reevaluated. Generally, IV scheduling should be done before coalescing.
+void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
+ MachineInstr *MI = SU->getInstr();
+ MachineOperand &MO = MI->getOperand(OperIdx);
+ Register Reg = MO.getReg();
+
+ LaneBitmask DefLaneMask;
+ LaneBitmask KillLaneMask;
+ if (TrackLaneMasks) {
+ bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
+ DefLaneMask = getLaneMaskForMO(MO);
+ // If we have a <read-undef> flag, none of the lane values comes from an
+ // earlier instruction.
+ KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;
+
+ if (MO.getSubReg() != 0 && MO.isUndef()) {
+      // There may be other subregister defs of the same register in later
+      // operands of this instruction. The lanes of those other defs will now
+      // be live after this instruction, so they should not be treated as
+      // killed by it even though they appear to be killed in this one operand.
+ for (const MachineOperand &OtherMO :
+ llvm::drop_begin(MI->operands(), OperIdx + 1))
+ if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg)
+ KillLaneMask &= ~getLaneMaskForMO(OtherMO);
+ }
+
+ // Clear undef flag, we'll re-add it later once we know which subregister
+ // Def is first.
+ MO.setIsUndef(false);
+ } else {
+ DefLaneMask = LaneBitmask::getAll();
+ KillLaneMask = LaneBitmask::getAll();
+ }
+
+ if (MO.isDead()) {
+ assert(deadDefHasNoUse(MO) && "Dead defs should have no uses");
+ } else {
+ // Add data dependence to all uses we found so far.
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
+ E = CurrentVRegUses.end(); I != E; /*empty*/) {
+ LaneBitmask LaneMask = I->LaneMask;
+ // Ignore uses of other lanes.
+ if ((LaneMask & KillLaneMask).none()) {
+ ++I;
+ continue;
+ }
+
+ if ((LaneMask & DefLaneMask).any()) {
+ SUnit *UseSU = I->SU;
+ MachineInstr *Use = UseSU->getInstr();
+ SDep Dep(SU, SDep::Data, Reg);
+ Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
+ I->OperandIndex));
+ ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep);
+ UseSU->addPred(Dep);
+ }
+
+ LaneMask &= ~KillLaneMask;
+ // If we found a Def for all lanes of this use, remove it from the list.
+ if (LaneMask.any()) {
+ I->LaneMask = LaneMask;
+ ++I;
+ } else
+ I = CurrentVRegUses.erase(I);
+ }
+ }
+
+ // Shortcut: Singly defined vregs do not have output/anti dependencies.
+ if (MRI.hasOneDef(Reg))
+ return;
+
+ // Add output dependence to the next nearest defs of this vreg.
+ //
+ // Unless this definition is dead, the output dependence should be
+ // transitively redundant with antidependencies from this definition's
+ // uses. We're conservative for now until we have a way to guarantee the uses
+ // are not eliminated sometime during scheduling. The output dependence edge
+ // is also useful if output latency exceeds def-use latency.
+ LaneBitmask LaneMask = DefLaneMask;
+ for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
+ CurrentVRegDefs.end())) {
+ // Ignore defs for other lanes.
+ if ((V2SU.LaneMask & LaneMask).none())
+ continue;
+ // Add an output dependence.
+ SUnit *DefSU = V2SU.SU;
+ // Ignore additional defs of the same lanes in one instruction. This can
+ // happen because lanemasks are shared for targets with too many
+    // subregisters. We also use some representation tricks/hacks where we
+    // add super-register defs/uses to imply that although we only access
+    // parts of the register, we care about the full one.
+ if (DefSU == SU)
+ continue;
+ SDep Dep(SU, SDep::Output, Reg);
+ Dep.setLatency(
+ SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
+ DefSU->addPred(Dep);
+
+    // Update current definition. This can get tricky if the previous def
+    // covered a bigger lanemask. We then have to shrink it and create a new
+    // VReg2SUnit for the non-overlapping part.
+ LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
+ LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
+ V2SU.SU = SU;
+ V2SU.LaneMask = OverlapMask;
+ if (NonOverlapMask.any())
+ CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
+ }
+ // If there was no CurrentVRegDefs entry for some lanes yet, create one.
+ if (LaneMask.any())
+ CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
+}
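+
+// Worked example (illustrative, with hypothetical lane masks): if an earlier
+// def of the vreg sits in CurrentVRegDefs with LaneMask 0xF and the current
+// instruction redefines only lanes 0x3, then OverlapMask == 0x3 and
+// NonOverlapMask == 0xC. The existing entry is retargeted to the new SU for
+// the overlapping lanes, while a fresh VReg2SUnit keeps the old SU as the
+// live def of the untouched 0xC lanes.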
+
+/// Adds a register data dependency if the instruction that defines the
+/// virtual register used at OperIdx is mapped to an SUnit. Adds a register
+/// antidependency from this SUnit to instructions that occur later in the same
+/// scheduling region if they write the virtual register.
+///
+/// TODO: Handle ExitSU "uses" properly.
+void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
+ const MachineInstr *MI = SU->getInstr();
+ assert(!MI->isDebugOrPseudoInstr());
+
+ const MachineOperand &MO = MI->getOperand(OperIdx);
+ Register Reg = MO.getReg();
+
+ // Remember the use. Data dependencies will be added when we find the def.
+ LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
+ : LaneBitmask::getAll();
+ CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
+
+ // Add antidependences to the following defs of the vreg.
+ for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
+ CurrentVRegDefs.end())) {
+ // Ignore defs for unrelated lanes.
+ LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
+ if ((PrevDefLaneMask & LaneMask).none())
+ continue;
+ if (V2SU.SU == SU)
+ continue;
+
+ V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
+ }
+}
+
+/// Returns true if MI is an instruction we are unable to reason about
+/// (like a call or something with unmodeled side effects).
+static inline bool isGlobalMemoryObject(MachineInstr *MI) {
+ return MI->isCall() || MI->hasUnmodeledSideEffects() ||
+ (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad());
+}
+
+void ScheduleDAGInstrs::addChainDependency(SUnit *SUa, SUnit *SUb,
+                                           unsigned Latency) {
+ if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
+ SDep Dep(SUa, SDep::MayAliasMem);
+ Dep.setLatency(Latency);
+ SUb->addPred(Dep);
+ }
+}
+
+/// Creates an SUnit for each real instruction, numbered in top-down
+/// topological order. The instruction order A < B implies that no edge exists
+/// from B to A.
+///
+/// Map each real instruction to its SUnit.
+///
+/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
+/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
+/// instead of pointers.
+///
+/// MachineScheduler relies on initSUnits numbering the nodes by their order in
+/// the original instruction list.
+void ScheduleDAGInstrs::initSUnits() {
+ // We'll be allocating one SUnit for each real instruction in the region,
+ // which is contained within a basic block.
+ SUnits.reserve(NumRegionInstrs);
+
+ for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+
+ SUnit *SU = newSUnit(&MI);
+ MISUnitMap[&MI] = SU;
+
+ SU->isCall = MI.isCall();
+ SU->isCommutable = MI.isCommutable();
+
+ // Assign the Latency field of SU using target-provided information.
+ SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
+
+ // If this SUnit uses a reserved or unbuffered resource, mark it as such.
+ //
+ // Reserved resources block an instruction from issuing and stall the
+ // entire pipeline. These are identified by BufferSize=0.
+ //
+ // Unbuffered resources prevent execution of subsequent instructions that
+ // require the same resources. This is used for in-order execution pipelines
+ // within an out-of-order core. These are identified by BufferSize=1.
+ if (SchedModel.hasInstrSchedModel()) {
+ const MCSchedClassDesc *SC = getSchedClass(SU);
+ for (const MCWriteProcResEntry &PRE :
+ make_range(SchedModel.getWriteProcResBegin(SC),
+ SchedModel.getWriteProcResEnd(SC))) {
+ switch (SchedModel.getProcResource(PRE.ProcResourceIdx)->BufferSize) {
+ case 0:
+ SU->hasReservedResource = true;
+ break;
+ case 1:
+ SU->isUnbuffered = true;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+}
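+
+// For example, in a hypothetical machine model a divider declared with
+// BufferSize = 0 makes every SU writing it set hasReservedResource, an
+// in-order issue port declared with BufferSize = 1 makes its writers
+// isUnbuffered, and resources with larger buffers leave the SU untouched.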
+
+class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
+ /// Current total number of SUs in map.
+ unsigned NumNodes = 0;
+
+ /// 1 for loads, 0 for stores. (see comment in SUList)
+ unsigned TrueMemOrderLatency;
+
+public:
+ Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}
+
+ /// To keep NumNodes up to date, insert() is used instead of
+ /// this operator w/ push_back().
+  ValueType &operator[](const SUList &Key) {
+    llvm_unreachable("Don't use. Use insert() instead.");
+  }
+
+ /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
+ /// reduce().
+ void inline insert(SUnit *SU, ValueType V) {
+ MapVector::operator[](V).push_back(SU);
+ NumNodes++;
+ }
+
+ /// Clears the list of SUs mapped to V.
+ void inline clearList(ValueType V) {
+ iterator Itr = find(V);
+ if (Itr != end()) {
+ assert(NumNodes >= Itr->second.size());
+ NumNodes -= Itr->second.size();
+
+ Itr->second.clear();
+ }
+ }
+
+ /// Clears map from all contents.
+ void clear() {
+ MapVector<ValueType, SUList>::clear();
+ NumNodes = 0;
+ }
+
+ unsigned inline size() const { return NumNodes; }
+
+ /// Counts the number of SUs in this map after a reduction.
+ void reComputeSize() {
+ NumNodes = 0;
+ for (auto &I : *this)
+ NumNodes += I.second.size();
+ }
+
+ unsigned inline getTrueMemOrderLatency() const {
+ return TrueMemOrderLatency;
+ }
+
+ void dump();
+};
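+
+// Minimal usage sketch (illustrative only, not from the original source; SU0,
+// SU1 and V stand for an SUnit*, another SUnit* and a ValueType that are
+// already in scope):
+//   Value2SUsMap Map(/*TrueMemOrderLatency=*/1);
+//   Map.insert(SU0, V);
+//   Map.insert(SU1, V);   // size() == 2: it counts SUs, not keys
+//   Map.clearList(V);     // size() == 0, but the key V stays in the map
+//   Map.reComputeSize();  // recount after external edits such as remove_if()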
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+ Value2SUsMap &Val2SUsMap) {
+ for (auto &I : Val2SUsMap)
+ addChainDependencies(SU, I.second,
+ Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+ Value2SUsMap &Val2SUsMap,
+ ValueType V) {
+ Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
+ if (Itr != Val2SUsMap.end())
+ addChainDependencies(SU, Itr->second,
+ Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
+ assert(BarrierChain != nullptr);
+
+ for (auto &[V, SUs] : map) {
+ (void)V;
+ for (auto *SU : SUs)
+ SU->addPredBarrier(BarrierChain);
+ }
+ map.clear();
+}
+
+void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
+ assert(BarrierChain != nullptr);
+
+ // Go through all lists of SUs.
+ for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
+ Value2SUsMap::iterator CurrItr = I++;
+ SUList &sus = CurrItr->second;
+ SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
+ for (; SUItr != SUEE; ++SUItr) {
+ // Stop on BarrierChain or any instruction above it.
+ if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
+ break;
+
+ (*SUItr)->addPredBarrier(BarrierChain);
+ }
+
+    // Also remove the BarrierChain from the list if present.
+ if (SUItr != SUEE && *SUItr == BarrierChain)
+ SUItr++;
+
+ // Remove all SUs that are now successors of BarrierChain.
+ if (SUItr != sus.begin())
+ sus.erase(sus.begin(), SUItr);
+ }
+
+ // Remove all entries with empty su lists.
+ map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
+ return (mapEntry.second.empty()); });
+
+ // Recompute the size of the map (NumNodes).
+ map.reComputeSize();
+}
+
+void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
+ RegPressureTracker *RPTracker,
+ PressureDiffs *PDiffs,
+ LiveIntervals *LIS,
+ bool TrackLaneMasks) {
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
+ : ST.useAA();
+ AAForDep = UseAA ? AA : nullptr;
+
+ BarrierChain = nullptr;
+
+ this->TrackLaneMasks = TrackLaneMasks;
+ MISUnitMap.clear();
+ ScheduleDAG::clearDAG();
+
+ // Create an SUnit for each real instruction.
+ initSUnits();
+
+ if (PDiffs)
+ PDiffs->init(SUnits.size());
+
+ // We build scheduling units by walking a block's instruction list
+ // from bottom to top.
+
+  // Each MI's memory operand(s) is analyzed into a list of underlying
+ // objects. The SU is then inserted in the SUList(s) mapped from the
+ // Value(s). Each Value thus gets mapped to lists of SUs depending
+ // on it, stores and loads kept separately. Two SUs are trivially
+ // non-aliasing if they both depend on only identified Values and do
+ // not share any common Value.
+ Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
+
+ // Certain memory accesses are known to not alias any SU in Stores
+ // or Loads, and have therefore their own 'NonAlias'
+  // domain. E.g. spill / reload instructions never alias LLVM IR
+ // Values. It would be nice to assume that this type of memory
+ // accesses always have a proper memory operand modelling, and are
+ // therefore never unanalyzable, but this is conservatively not
+ // done.
+ Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
+
+ // Track all instructions that may raise floating-point exceptions.
+  // These do not depend on one another (or on normal loads or stores), but
+ // must not be rescheduled across global barriers. Note that we don't
+ // really need a "map" here since we don't track those MIs by value;
+ // using the same Value2SUsMap data type here is simply a matter of
+ // convenience.
+ Value2SUsMap FPExceptions;
+
+ // Remove any stale debug info; sometimes BuildSchedGraph is called again
+ // without emitting the info from the previous call.
+ DbgValues.clear();
+ FirstDbgValue = nullptr;
+
+ assert(Defs.empty() && Uses.empty() &&
+ "Only BuildGraph should update Defs/Uses");
+ Defs.setUniverse(TRI->getNumRegs());
+ Uses.setUniverse(TRI->getNumRegs());
+
+ assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
+ assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
+ unsigned NumVirtRegs = MRI.getNumVirtRegs();
+ CurrentVRegDefs.setUniverse(NumVirtRegs);
+ CurrentVRegUses.setUniverse(NumVirtRegs);
+
+ // Model data dependencies between instructions being scheduled and the
+ // ExitSU.
+ addSchedBarrierDeps();
+
+ // Walk the list of instructions, from bottom moving up.
+ MachineInstr *DbgMI = nullptr;
+ for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
+ MII != MIE; --MII) {
+ MachineInstr &MI = *std::prev(MII);
+ if (DbgMI) {
+ DbgValues.emplace_back(DbgMI, &MI);
+ DbgMI = nullptr;
+ }
+
+ if (MI.isDebugValue() || MI.isDebugPHI()) {
+ DbgMI = &MI;
+ continue;
+ }
+
+ if (MI.isDebugLabel() || MI.isDebugRef() || MI.isPseudoProbe())
+ continue;
+
+ SUnit *SU = MISUnitMap[&MI];
+ assert(SU && "No SUnit mapped to this MI");
+
+ if (RPTracker) {
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
+ }
+ if (PDiffs != nullptr)
+ PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);
+
+ if (RPTracker->getPos() == RegionEnd || &*RPTracker->getPos() != &MI)
+ RPTracker->recedeSkipDebugValues();
+ assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
+ RPTracker->recede(RegOpers);
+ }
+
+ assert(
+ (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
+ "Cannot schedule terminators or labels!");
+
+ // Add register-based dependencies (data, anti, and output).
+ // For some instructions (calls, returns, inline-asm, etc.) there can
+ // be explicit uses and implicit defs, in which case the use will appear
+ // on the operand list before the def. Do two passes over the operand
+ // list to make sure that defs are processed before any uses.
+ bool HasVRegDef = false;
+ for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
+ const MachineOperand &MO = MI.getOperand(j);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ addPhysRegDeps(SU, j);
+ } else if (Reg.isVirtual()) {
+ HasVRegDef = true;
+ addVRegDefDeps(SU, j);
+ }
+ }
+ // Now process all uses.
+ for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
+ const MachineOperand &MO = MI.getOperand(j);
+ // Only look at use operands.
+ // We do not need to check for MO.readsReg() here because subsequent
+ // subregister defs will get output dependence edges and need no
+ // additional use dependencies.
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical()) {
+ addPhysRegDeps(SU, j);
+ } else if (Reg.isVirtual() && MO.readsReg()) {
+ addVRegUseDeps(SU, j);
+ }
+ }
+
+ // If we haven't seen any uses in this scheduling region, create a
+ // dependence edge to ExitSU to model the live-out latency. This is required
+ // for vreg defs with no in-region use, and prefetches with no vreg def.
+ //
+ // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
+ // check currently relies on being called before adding chain deps.
+ if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
+ SDep Dep(SU, SDep::Artificial);
+ Dep.setLatency(SU->Latency - 1);
+ ExitSU.addPred(Dep);
+ }
+
+ // Add memory dependencies (Note: isStoreToStackSlot and
+    // isLoadFromStackSlot are not usable after stack slots are lowered to
+ // actual addresses).
+
+ // This is a barrier event that acts as a pivotal node in the DAG.
+ if (isGlobalMemoryObject(&MI)) {
+
+ // Become the barrier chain.
+ if (BarrierChain)
+ BarrierChain->addPredBarrier(SU);
+ BarrierChain = SU;
+
+ LLVM_DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
+
+ // Add dependencies against everything below it and clear maps.
+ addBarrierChain(Stores);
+ addBarrierChain(Loads);
+ addBarrierChain(NonAliasStores);
+ addBarrierChain(NonAliasLoads);
+ addBarrierChain(FPExceptions);
+
+ continue;
+ }
+
+ // Instructions that may raise FP exceptions may not be moved
+ // across any global barriers.
+ if (MI.mayRaiseFPException()) {
+ if (BarrierChain)
+ BarrierChain->addPredBarrier(SU);
+
+ FPExceptions.insert(SU, UnknownValue);
+
+ if (FPExceptions.size() >= HugeRegion) {
+ LLVM_DEBUG(dbgs() << "Reducing FPExceptions map.\n";);
+ Value2SUsMap empty;
+ reduceHugeMemNodeMaps(FPExceptions, empty, getReductionSize());
+ }
+ }
+
+ // If it's not a store or a variant load, we're done.
+ if (!MI.mayStore() &&
+ !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad()))
+ continue;
+
+    // Always add a dependency edge to BarrierChain if present.
+ if (BarrierChain)
+ BarrierChain->addPredBarrier(SU);
+
+ // Find the underlying objects for MI. The Objs vector is either
+ // empty, or filled with the Values of memory locations which this
+ // SU depends on.
+ UnderlyingObjectsVector Objs;
+ bool ObjsFound = getUnderlyingObjectsForInstr(&MI, MFI, Objs,
+ MF.getDataLayout());
+
+ if (MI.mayStore()) {
+ if (!ObjsFound) {
+ // An unknown store depends on all stores and loads.
+ addChainDependencies(SU, Stores);
+ addChainDependencies(SU, NonAliasStores);
+ addChainDependencies(SU, Loads);
+ addChainDependencies(SU, NonAliasLoads);
+
+ // Map this store to 'UnknownValue'.
+ Stores.insert(SU, UnknownValue);
+ } else {
+ // Add precise dependencies against all previously seen memory
+ // accesses mapped to the same Value(s).
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Add dependencies to previous stores and loads mapped to V.
+ addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
+ addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
+ }
+        // Update the store map after all chains have been added to avoid
+        // adding a self-loop edge if multiple underlying objects are present.
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Map this store to V.
+ (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
+ }
+        // The store may have dependencies on unanalyzable loads and
+        // stores.
+ addChainDependencies(SU, Loads, UnknownValue);
+ addChainDependencies(SU, Stores, UnknownValue);
+ }
+ } else { // SU is a load.
+ if (!ObjsFound) {
+ // An unknown load depends on all stores.
+ addChainDependencies(SU, Stores);
+ addChainDependencies(SU, NonAliasStores);
+
+ Loads.insert(SU, UnknownValue);
+ } else {
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Add precise dependencies against all previously seen stores
+ // mapping to the same Value(s).
+ addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
+
+ // Map this load to V.
+ (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
+ }
+        // The load may have dependencies on unanalyzable stores.
+ addChainDependencies(SU, Stores, UnknownValue);
+ }
+ }
+
+ // Reduce maps if they grow huge.
+ if (Stores.size() + Loads.size() >= HugeRegion) {
+ LLVM_DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
+ reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
+ }
+ if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
+ LLVM_DEBUG(
+ dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
+ reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
+ }
+ }
+
+ if (DbgMI)
+ FirstDbgValue = DbgMI;
+
+ Defs.clear();
+ Uses.clear();
+ CurrentVRegDefs.clear();
+ CurrentVRegUses.clear();
+
+ Topo.MarkDirty();
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
+ PSV->printCustom(OS);
+ return OS;
+}
+
+void ScheduleDAGInstrs::Value2SUsMap::dump() {
+ for (const auto &[ValType, SUs] : *this) {
+ if (isa<const Value *>(ValType)) {
+ const Value *V = cast<const Value *>(ValType);
+ if (isa<UndefValue>(V))
+ dbgs() << "Unknown";
+ else
+ V->printAsOperand(dbgs());
+ } else if (isa<const PseudoSourceValue *>(ValType))
+ dbgs() << cast<const PseudoSourceValue *>(ValType);
+ else
+ llvm_unreachable("Unknown Value type.");
+
+ dbgs() << " : ";
+ dumpSUList(SUs);
+ }
+}
+
+void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
+ Value2SUsMap &loads, unsigned N) {
+ LLVM_DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n"; stores.dump();
+ dbgs() << "Loading SUnits:\n"; loads.dump());
+
+ // Insert all SU's NodeNums into a vector and sort it.
+ std::vector<unsigned> NodeNums;
+ NodeNums.reserve(stores.size() + loads.size());
+ for (const auto &[V, SUs] : stores) {
+ (void)V;
+ for (const auto *SU : SUs)
+ NodeNums.push_back(SU->NodeNum);
+ }
+ for (const auto &[V, SUs] : loads) {
+ (void)V;
+ for (const auto *SU : SUs)
+ NodeNums.push_back(SU->NodeNum);
+ }
+ llvm::sort(NodeNums);
+
+  // The last N elements in NodeNums will be removed, and the SU with the
+  // lowest NodeNum among them will become the new BarrierChain, so that the
+  // SUs not yet seen get a dependency on the removed SUs.
+ assert(N <= NodeNums.size());
+ SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
+ if (BarrierChain) {
+ // The aliasing and non-aliasing maps reduce independently of each
+ // other, but share a common BarrierChain. Check if the
+ // newBarrierChain is above the former one. If it is not, it may
+ // introduce a loop to use newBarrierChain, so keep the old one.
+ if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
+ BarrierChain->addPredBarrier(newBarrierChain);
+ BarrierChain = newBarrierChain;
+ LLVM_DEBUG(dbgs() << "Inserting new barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
+    } else
+ LLVM_DEBUG(dbgs() << "Keeping old barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
+  } else
+ BarrierChain = newBarrierChain;
+
+ insertBarrierChain(stores);
+ insertBarrierChain(loads);
+
+ LLVM_DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n"; stores.dump();
+ dbgs() << "Loading SUnits:\n"; loads.dump());
+}
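+
+// Worked example (illustrative, assuming no BarrierChain exists yet): with the
+// sorted NodeNums {3, 5, 8, 9, 12, 14} and N == 2, newBarrierChain is SU(12),
+// the lowest of the last two. insertBarrierChain then gives SU(14) a barrier
+// predecessor edge on SU(12), drops both from the maps, and memory SUs seen
+// later in the bottom-up walk chain to SU(12) instead of to each removed node.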
+
+static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
+ MachineInstr &MI, bool addToLiveRegs) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ // Things that are available after the instruction are killed by it.
+ bool IsKill = LiveRegs.available(MRI, Reg);
+ MO.setIsKill(IsKill);
+ if (addToLiveRegs)
+ LiveRegs.addReg(Reg);
+ }
+}
+
+void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
+ LLVM_DEBUG(dbgs() << "Fixup kills for " << printMBBReference(MBB) << '\n');
+
+ LiveRegs.init(*TRI);
+ LiveRegs.addLiveOuts(MBB);
+
+ // Examine block from end to start...
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+
+    // Update liveness. Registers that are defined but not used by this
+    // instruction are now dead. Remove the register and all of its subregs,
+    // as they are completely defined.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ const MachineOperand &MO = *O;
+ if (MO.isReg()) {
+ if (!MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ LiveRegs.removeReg(Reg);
+ } else if (MO.isRegMask()) {
+ LiveRegs.removeRegsInMask(MO);
+ }
+ }
+
+ // If there is a bundle header fix it up first.
+ if (!MI.isBundled()) {
+ toggleKills(MRI, LiveRegs, MI, true);
+ } else {
+ MachineBasicBlock::instr_iterator Bundle = MI.getIterator();
+ if (MI.isBundle())
+ toggleKills(MRI, LiveRegs, MI, false);
+
+      // Some targets make the (questionable) assumption that the instructions
+ // inside the bundle are ordered and consequently only the last use of
+ // a register inside the bundle can kill it.
+ MachineBasicBlock::instr_iterator I = std::next(Bundle);
+ while (I->isBundledWithSucc())
+ ++I;
+ do {
+ if (!I->isDebugOrPseudoInstr())
+ toggleKills(MRI, LiveRegs, *I, true);
+ --I;
+ } while (I != Bundle);
+ }
+ }
+}
+
+void ScheduleDAGInstrs::dumpNode(const SUnit &SU) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dumpNodeName(SU);
+ if (SchedPrintCycles)
+ dbgs() << " [TopReadyCycle = " << SU.TopReadyCycle
+ << ", BottomReadyCycle = " << SU.BotReadyCycle << "]";
+ dbgs() << ": ";
+ SU.getInstr()->dump();
+#endif
+}
+
+void ScheduleDAGInstrs::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ if (EntrySU.getInstr() != nullptr)
+ dumpNodeAll(EntrySU);
+ for (const SUnit &SU : SUnits)
+ dumpNodeAll(SU);
+ if (ExitSU.getInstr() != nullptr)
+ dumpNodeAll(ExitSU);
+#endif
+}
+
+std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
+ std::string s;
+ raw_string_ostream oss(s);
+ if (SU == &EntrySU)
+ oss << "<entry>";
+ else if (SU == &ExitSU)
+ oss << "<exit>";
+ else
+ SU->getInstr()->print(oss, /*IsStandalone=*/true);
+ return oss.str();
+}
+
+/// Return the basic block label. It is not necessarily unique because a block
+/// contains multiple scheduling regions. But it is fine for visualization.
+std::string ScheduleDAGInstrs::getDAGName() const {
+ return "dag." + BB->getFullName();
+}
+
+bool ScheduleDAGInstrs::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
+ return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
+}
+
+bool ScheduleDAGInstrs::addEdge(SUnit *SuccSU, const SDep &PredDep) {
+ if (SuccSU != &ExitSU) {
+ // Do not use WillCreateCycle, it assumes SD scheduling.
+ // If Pred is reachable from Succ, then the edge creates a cycle.
+ if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
+ return false;
+ Topo.AddPredQueued(SuccSU, PredDep.getSUnit());
+ }
+ SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
+ // Return true regardless of whether a new edge needed to be inserted.
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// SchedDFSResult Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// Internal state used to compute SchedDFSResult.
+class SchedDFSImpl {
+ SchedDFSResult &R;
+
+ /// Join DAG nodes into equivalence classes by their subtree.
+ IntEqClasses SubtreeClasses;
+ /// List PredSU, SuccSU pairs that represent data edges between subtrees.
+ std::vector<std::pair<const SUnit *, const SUnit*>> ConnectionPairs;
+
+ struct RootData {
+ unsigned NodeID;
+ unsigned ParentNodeID; ///< Parent node (member of the parent subtree).
+ unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
+ /// children.
+
+ RootData(unsigned id): NodeID(id),
+ ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}
+
+ unsigned getSparseSetIndex() const { return NodeID; }
+ };
+
+ SparseSet<RootData> RootSet;
+
+public:
+ SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
+ RootSet.setUniverse(R.DFSNodeData.size());
+ }
+
+  /// Returns true if this node has been visited by the DFS traversal.
+ ///
+ /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
+ /// ID. Later, SubtreeID is updated but remains valid.
+ bool isVisited(const SUnit *SU) const {
+ return R.DFSNodeData[SU->NodeNum].SubtreeID
+ != SchedDFSResult::InvalidSubtreeID;
+ }
+
+ /// Initializes this node's instruction count. We don't need to flag the node
+ /// visited until visitPostorder because the DAG cannot have cycles.
+ void visitPreorder(const SUnit *SU) {
+ R.DFSNodeData[SU->NodeNum].InstrCount =
+ SU->getInstr()->isTransient() ? 0 : 1;
+ }
+
+ /// Called once for each node after all predecessors are visited. Revisit this
+ /// node's predecessors and potentially join them now that we know the ILP of
+ /// the other predecessors.
+ void visitPostorderNode(const SUnit *SU) {
+ // Mark this node as the root of a subtree. It may be joined with its
+ // successors later.
+ R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
+ RootData RData(SU->NodeNum);
+ RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;
+
+ // If any predecessors are still in their own subtree, they either cannot be
+ // joined or are large enough to remain separate. If this parent node's
+ // total instruction count is not greater than a child subtree by at least
+ // the subtree limit, then try to join it now since splitting subtrees is
+ // only useful if multiple high-pressure paths are possible.
+ unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
+ for (const SDep &PredDep : SU->Preds) {
+ if (PredDep.getKind() != SDep::Data)
+ continue;
+ unsigned PredNum = PredDep.getSUnit()->NodeNum;
+ if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
+ joinPredSubtree(PredDep, SU, /*CheckLimit=*/false);
+
+ // Either link or merge the TreeData entry from the child to the parent.
+ if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
+ // If the predecessor's parent is invalid, this is a tree edge and the
+ // current node is the parent.
+ if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
+ RootSet[PredNum].ParentNodeID = SU->NodeNum;
+ }
+ else if (RootSet.count(PredNum)) {
+ // The predecessor is not a root, but is still in the root set. This
+ // must be the new parent that it was just joined to. Note that
+ // RootSet[PredNum].ParentNodeID may either be invalid or may still be
+ // set to the original parent.
+ RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
+ RootSet.erase(PredNum);
+ }
+ }
+ RootSet[SU->NodeNum] = RData;
+ }
+
+  /// Called once for each tree edge after calling visitPostorderNode on
+ /// the predecessor. Increment the parent node's instruction count and
+ /// preemptively join this subtree to its parent's if it is small enough.
+ void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
+ R.DFSNodeData[Succ->NodeNum].InstrCount
+ += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
+ joinPredSubtree(PredDep, Succ);
+ }
+
+ /// Adds a connection for cross edges.
+ void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
+ ConnectionPairs.emplace_back(PredDep.getSUnit(), Succ);
+ }
+
+ /// Sets each node's subtree ID to the representative ID and record
+ /// connections between trees.
+ void finalize() {
+ SubtreeClasses.compress();
+ R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
+ assert(SubtreeClasses.getNumClasses() == RootSet.size()
+ && "number of roots should match trees");
+ for (const RootData &Root : RootSet) {
+ unsigned TreeID = SubtreeClasses[Root.NodeID];
+ if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID)
+ R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID];
+ R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount;
+ // Note that SubInstrCount may be greater than InstrCount if we joined
+ // subtrees across a cross edge. InstrCount will be attributed to the
+ // original parent, while SubInstrCount will be attributed to the joined
+ // parent.
+ }
+ R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
+ R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
+ LLVM_DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
+ for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
+ R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
+ LLVM_DEBUG(dbgs() << " SU(" << Idx << ") in tree "
+ << R.DFSNodeData[Idx].SubtreeID << '\n');
+ }
+ for (const auto &[Pred, Succ] : ConnectionPairs) {
+ unsigned PredTree = SubtreeClasses[Pred->NodeNum];
+ unsigned SuccTree = SubtreeClasses[Succ->NodeNum];
+ if (PredTree == SuccTree)
+ continue;
+ unsigned Depth = Pred->getDepth();
+ addConnection(PredTree, SuccTree, Depth);
+ addConnection(SuccTree, PredTree, Depth);
+ }
+ }
+
+protected:
+ /// Joins the predecessor subtree with the successor that is its DFS parent.
+ /// Applies some heuristics before joining.
+ bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
+ bool CheckLimit = true) {
+ assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
+
+ // Check if the predecessor is already joined.
+ const SUnit *PredSU = PredDep.getSUnit();
+ unsigned PredNum = PredSU->NodeNum;
+ if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
+ return false;
+
+ // Four is the magic number of successors before a node is considered a
+ // pinch point.
+ unsigned NumDataSucs = 0;
+ for (const SDep &SuccDep : PredSU->Succs) {
+ if (SuccDep.getKind() == SDep::Data) {
+ if (++NumDataSucs >= 4)
+ return false;
+ }
+ }
+ if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
+ return false;
+ R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
+ SubtreeClasses.join(Succ->NodeNum, PredNum);
+ return true;
+ }
+
+ /// Called by finalize() to record a connection between trees.
+ void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
+ if (!Depth)
+ return;
+
+ do {
+ SmallVectorImpl<SchedDFSResult::Connection> &Connections =
+ R.SubtreeConnections[FromTree];
+ for (SchedDFSResult::Connection &C : Connections) {
+ if (C.TreeID == ToTree) {
+ C.Level = std::max(C.Level, Depth);
+ return;
+ }
+ }
+ Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
+ FromTree = R.DFSTreeData[FromTree].ParentTreeID;
+ } while (FromTree != SchedDFSResult::InvalidSubtreeID);
+ }
+};
+
+} // end namespace llvm
+
+namespace {
+
+/// Manage the stack used by a reverse depth-first search over the DAG.
+class SchedDAGReverseDFS {
+ std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;
+
+public:
+ bool isComplete() const { return DFSStack.empty(); }
+
+ void follow(const SUnit *SU) {
+ DFSStack.emplace_back(SU, SU->Preds.begin());
+ }
+ void advance() { ++DFSStack.back().second; }
+
+ const SDep *backtrack() {
+ DFSStack.pop_back();
+ return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second);
+ }
+
+ const SUnit *getCurr() const { return DFSStack.back().first; }
+
+ SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }
+
+ SUnit::const_pred_iterator getPredEnd() const {
+ return getCurr()->Preds.end();
+ }
+};
+
+} // end anonymous namespace
+
+static bool hasDataSucc(const SUnit *SU) {
+ for (const SDep &SuccDep : SU->Succs) {
+ if (SuccDep.getKind() == SDep::Data &&
+ !SuccDep.getSUnit()->isBoundaryNode())
+ return true;
+ }
+ return false;
+}
+
+/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
+/// search from this root.
+void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
+ if (!IsBottomUp)
+ llvm_unreachable("Top-down ILP metric is unimplemented");
+
+ SchedDFSImpl Impl(*this);
+ for (const SUnit &SU : SUnits) {
+ if (Impl.isVisited(&SU) || hasDataSucc(&SU))
+ continue;
+
+ SchedDAGReverseDFS DFS;
+ Impl.visitPreorder(&SU);
+ DFS.follow(&SU);
+ while (true) {
+ // Traverse the leftmost path as far as possible.
+ while (DFS.getPred() != DFS.getPredEnd()) {
+ const SDep &PredDep = *DFS.getPred();
+ DFS.advance();
+ // Ignore non-data edges.
+ if (PredDep.getKind() != SDep::Data
+ || PredDep.getSUnit()->isBoundaryNode()) {
+ continue;
+ }
+ // An already visited edge is a cross edge, assuming an acyclic DAG.
+ if (Impl.isVisited(PredDep.getSUnit())) {
+ Impl.visitCrossEdge(PredDep, DFS.getCurr());
+ continue;
+ }
+ Impl.visitPreorder(PredDep.getSUnit());
+ DFS.follow(PredDep.getSUnit());
+ }
+ // Visit the top of the stack in postorder and backtrack.
+ const SUnit *Child = DFS.getCurr();
+ const SDep *PredDep = DFS.backtrack();
+ Impl.visitPostorderNode(Child);
+ if (PredDep)
+ Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
+ if (DFS.isComplete())
+ break;
+ }
+ }
+ Impl.finalize();
+}
+
+/// The root of the given SubtreeID was just scheduled. For all subtrees
+/// connected to this tree, record the depth of the connection so that the
+/// nearest connected subtrees can be prioritized.
+void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
+ for (const Connection &C : SubtreeConnections[SubtreeID]) {
+ SubtreeConnectLevels[C.TreeID] =
+ std::max(SubtreeConnectLevels[C.TreeID], C.Level);
+ LLVM_DEBUG(dbgs() << " Tree: " << C.TreeID << " @"
+ << SubtreeConnectLevels[C.TreeID] << '\n');
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
+ OS << InstrCount << " / " << Length << " = ";
+ if (!Length)
+ OS << "BADILP";
+ else
+ OS << format("%g", ((double)InstrCount / Length));
+}
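+
+// For instance, an ILPValue with InstrCount == 6 and Length == 3 prints as
+// "6 / 3 = 2", while Length == 0 prints "6 / 0 = BADILP".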
+
+LLVM_DUMP_METHOD void ILPValue::dump() const {
+ dbgs() << *this << '\n';
+}
+
+namespace llvm {
+
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
+ Val.print(OS);
+ return OS;
+}
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
new file mode 100644
index 000000000000..e7b14944acfe
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -0,0 +1,92 @@
+//===-- ScheduleDAGPrinter.cpp - Implement ScheduleDAG::viewGraph() -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace llvm {
+ template<>
+ struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+
+ DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const ScheduleDAG *G) {
+ return std::string(G->MF.getName());
+ }
+
+ static bool renderGraphFromBottomUp() {
+ return true;
+ }
+
+ static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
+ return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+ }
+
+ static std::string getNodeIdentifierLabel(const SUnit *Node,
+ const ScheduleDAG *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+ OS << static_cast<const void *>(Node);
+ return R;
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node,
+ SUnitIterator EI,
+ const ScheduleDAG *Graph) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+
+ std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
+ static std::string getNodeAttributes(const SUnit *N,
+ const ScheduleDAG *Graph) {
+ return "shape=Mrecord";
+ }
+
+ static void addCustomGraphFeatures(ScheduleDAG *G,
+ GraphWriter<ScheduleDAG*> &GW) {
+ return G->addCustomGraphFeatures(GW);
+ }
+ };
+}
+
+std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
+ const ScheduleDAG *G) {
+ return G->getGraphNodeLabel(SU);
+}
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ScheduleDAG::viewGraph(const Twine &Name, const Twine &Title) {
+ // This code is only for debugging!
+#ifndef NDEBUG
+ ViewGraph(this, Name, false, Title);
+#else
+ errs() << "ScheduleDAG::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// Out-of-line implementation with no arguments is handy for gdb.
+void ScheduleDAG::viewGraph() {
+ viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
new file mode 100644
index 000000000000..209c6d81f602
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -0,0 +1,241 @@
+//===- ScoreboardHazardRecognizer.cpp - Scheduler Support -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScoreboardHazardRecognizer class, which
+// encapsulates hazard-avoidance heuristics for scheduling, based on the
+// scheduling itineraries specified for the target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE DebugType
+
+ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(
+ const InstrItineraryData *II, const ScheduleDAG *SchedDAG,
+ const char *ParentDebugType)
+ : DebugType(ParentDebugType), ItinData(II), DAG(SchedDAG) {
+ (void)DebugType;
+ // Determine the maximum depth of any itinerary. This determines the depth of
+ // the scoreboard. We always make the scoreboard at least 1 cycle deep to
+ // avoid dealing with the boundary condition.
+ unsigned ScoreboardDepth = 1;
+ if (ItinData && !ItinData->isEmpty()) {
+ for (unsigned idx = 0; ; ++idx) {
+ if (ItinData->isEndMarker(idx))
+ break;
+
+ const InstrStage *IS = ItinData->beginStage(idx);
+ const InstrStage *E = ItinData->endStage(idx);
+ unsigned CurCycle = 0;
+ unsigned ItinDepth = 0;
+ for (; IS != E; ++IS) {
+ unsigned StageDepth = CurCycle + IS->getCycles();
+ if (ItinDepth < StageDepth) ItinDepth = StageDepth;
+ CurCycle += IS->getNextCycles();
+ }
+
+ // Find the next power-of-2 >= ItinDepth
+ while (ItinDepth > ScoreboardDepth) {
+ ScoreboardDepth *= 2;
+ // Don't set MaxLookAhead until we find at least one nonzero stage.
+ // This way, an itinerary with no stages has MaxLookAhead==0, which
+ // completely bypasses the scoreboard hazard logic.
+ MaxLookAhead = ScoreboardDepth;
+ }
+ }
+ }
+
+ ReservedScoreboard.reset(ScoreboardDepth);
+ RequiredScoreboard.reset(ScoreboardDepth);
+
+ // If MaxLookAhead is not set above, then we are not enabled.
+ if (!isEnabled())
+ LLVM_DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
+ else {
+ // A nonempty itinerary must have a SchedModel.
+ IssueWidth = ItinData->SchedModel.IssueWidth;
+ LLVM_DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
+ << ScoreboardDepth << '\n');
+ }
+}
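+
+// Example of the depth computation above (illustrative): an itinerary whose
+// deepest stage pattern spans 5 cycles rounds ScoreboardDepth up through
+// 1 -> 2 -> 4 -> 8, so both scoreboards end up 8 entries deep and MaxLookAhead
+// becomes 8. An itinerary with no stages never enters the doubling loop,
+// leaves MaxLookAhead at 0, and thereby disables the recognizer.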
+
+void ScoreboardHazardRecognizer::Reset() {
+ IssueCount = 0;
+ RequiredScoreboard.reset();
+ ReservedScoreboard.reset();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ScoreboardHazardRecognizer::Scoreboard::dump() const {
+ dbgs() << "Scoreboard:\n";
+
+ unsigned last = Depth - 1;
+ while ((last > 0) && ((*this)[last] == 0))
+ last--;
+
+ for (unsigned i = 0; i <= last; i++) {
+ InstrStage::FuncUnits FUs = (*this)[i];
+ dbgs() << "\t";
+ for (int j = std::numeric_limits<InstrStage::FuncUnits>::digits - 1;
+ j >= 0; j--)
+ dbgs() << ((FUs & (1ULL << j)) ? '1' : '0');
+ dbgs() << '\n';
+ }
+}
+#endif
+
+bool ScoreboardHazardRecognizer::atIssueLimit() const {
+ if (IssueWidth == 0)
+ return false;
+
+ return IssueCount == IssueWidth;
+}
+
+ScheduleHazardRecognizer::HazardType
+ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ if (!ItinData || ItinData->isEmpty())
+ return NoHazard;
+
+ // Note that stalls will be negative for bottom-up scheduling.
+ int cycle = Stalls;
+
+ // Use the itinerary for the underlying instruction to check for
+ // free FU's in the scoreboard at the appropriate future cycles.
+
+ const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+ if (!MCID) {
+ // Don't check hazards for non-machineinstr Nodes.
+ return NoHazard;
+ }
+ unsigned idx = MCID->getSchedClass();
+ for (const InstrStage *IS = ItinData->beginStage(idx),
+ *E = ItinData->endStage(idx); IS != E; ++IS) {
+ // We must find one of the stage's units free for every cycle the
+ // stage is occupied. FIXME it would be more accurate to find the
+ // same unit free in all the cycles.
+ for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+ int StageCycle = cycle + (int)i;
+ if (StageCycle < 0)
+ continue;
+
+ if (StageCycle >= (int)RequiredScoreboard.getDepth()) {
+ assert((StageCycle - Stalls) < (int)RequiredScoreboard.getDepth() &&
+ "Scoreboard depth exceeded!");
+ // This stage was stalled beyond pipeline depth, so cannot conflict.
+ break;
+ }
+
+ InstrStage::FuncUnits freeUnits = IS->getUnits();
+ switch (IS->getReservationKind()) {
+ case InstrStage::Required:
+ // Required FUs conflict with both reserved and required ones
+ freeUnits &= ~ReservedScoreboard[StageCycle];
+ [[fallthrough]];
+ case InstrStage::Reserved:
+ // Reserved FUs can conflict only with required ones.
+ freeUnits &= ~RequiredScoreboard[StageCycle];
+ break;
+ }
+
+ if (!freeUnits) {
+ LLVM_DEBUG(dbgs() << "*** Hazard in cycle +" << StageCycle << ", ");
+ LLVM_DEBUG(DAG->dumpNode(*SU));
+ return Hazard;
+ }
+ }
+
+ // Advance the cycle to the next stage.
+ cycle += IS->getNextCycles();
+ }
+
+ return NoHazard;
+}
+
+void ScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
+ if (!ItinData || ItinData->isEmpty())
+ return;
+
+ // Use the itinerary for the underlying instruction to reserve FU's
+ // in the scoreboard at the appropriate future cycles.
+ const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+ assert(MCID && "The scheduler must filter non-machineinstrs");
+ if (DAG->TII->isZeroCost(MCID->Opcode))
+ return;
+
+ ++IssueCount;
+
+ unsigned cycle = 0;
+
+ unsigned idx = MCID->getSchedClass();
+ for (const InstrStage *IS = ItinData->beginStage(idx),
+ *E = ItinData->endStage(idx); IS != E; ++IS) {
+ // We must reserve one of the stage's units for every cycle the
+ // stage is occupied. FIXME it would be more accurate to reserve
+    // the same unit in all the cycles.
+ for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+ assert(((cycle + i) < RequiredScoreboard.getDepth()) &&
+ "Scoreboard depth exceeded!");
+
+ InstrStage::FuncUnits freeUnits = IS->getUnits();
+ switch (IS->getReservationKind()) {
+ case InstrStage::Required:
+ // Required FUs conflict with both reserved and required ones
+ freeUnits &= ~ReservedScoreboard[cycle + i];
+ [[fallthrough]];
+ case InstrStage::Reserved:
+ // Reserved FUs can conflict only with required ones.
+ freeUnits &= ~RequiredScoreboard[cycle + i];
+ break;
+ }
+
+ // reduce to a single unit
+ InstrStage::FuncUnits freeUnit = 0;
+ do {
+ freeUnit = freeUnits;
+ freeUnits = freeUnit & (freeUnit - 1);
+ } while (freeUnits);
+
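+      // The loop above repeatedly clears the lowest set bit, so freeUnit ends
+      // up holding only the highest set bit of the original freeUnits mask
+      // (e.g. 0b0110 reduces to 0b0100); exactly one unit is reserved below.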
+ if (IS->getReservationKind() == InstrStage::Required)
+ RequiredScoreboard[cycle + i] |= freeUnit;
+ else
+ ReservedScoreboard[cycle + i] |= freeUnit;
+ }
+
+ // Advance the cycle to the next stage.
+ cycle += IS->getNextCycles();
+ }
+
+ LLVM_DEBUG(ReservedScoreboard.dump());
+ LLVM_DEBUG(RequiredScoreboard.dump());
+}
+
+void ScoreboardHazardRecognizer::AdvanceCycle() {
+ IssueCount = 0;
+ ReservedScoreboard[0] = 0; ReservedScoreboard.advance();
+ RequiredScoreboard[0] = 0; RequiredScoreboard.advance();
+}
+
+void ScoreboardHazardRecognizer::RecedeCycle() {
+ IssueCount = 0;
+ ReservedScoreboard[ReservedScoreboard.getDepth()-1] = 0;
+ ReservedScoreboard.recede();
+ RequiredScoreboard[RequiredScoreboard.getDepth()-1] = 0;
+ RequiredScoreboard.recede();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp
new file mode 100644
index 000000000000..30d959704745
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -0,0 +1,1046 @@
+//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts selects to conditional jumps when profitable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ScaledNumber.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <memory>
+#include <queue>
+#include <stack>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "select-optimize"
+
+STATISTIC(NumSelectOptAnalyzed,
+ "Number of select groups considered for conversion to branch");
+STATISTIC(NumSelectConvertedExpColdOperand,
+ "Number of select groups converted due to expensive cold operand");
+STATISTIC(NumSelectConvertedHighPred,
+ "Number of select groups converted due to high-predictability");
+STATISTIC(NumSelectUnPred,
+ "Number of select groups not converted due to unpredictability");
+STATISTIC(NumSelectColdBB,
+ "Number of select groups not converted due to cold basic block");
+STATISTIC(NumSelectConvertedLoop,
+ "Number of select groups converted due to loop-level analysis");
+STATISTIC(NumSelectsConverted, "Number of selects converted");
+
+static cl::opt<unsigned> ColdOperandThreshold(
+ "cold-operand-threshold",
+ cl::desc("Maximum frequency of path for an operand to be considered cold."),
+ cl::init(20), cl::Hidden);
+
+static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
+ "cold-operand-max-cost-multiplier",
+ cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
+ "slice of a cold operand to be considered inexpensive."),
+ cl::init(1), cl::Hidden);
+
+static cl::opt<unsigned>
+ GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
+ cl::desc("Gradient gain threshold (%)."),
+ cl::init(25), cl::Hidden);
+
+static cl::opt<unsigned>
+ GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
+ cl::desc("Minimum gain per loop (in cycles) threshold."),
+ cl::init(4), cl::Hidden);
+
+static cl::opt<unsigned> GainRelativeThreshold(
+ "select-opti-loop-relative-gain-threshold",
+ cl::desc(
+ "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
+ cl::init(8), cl::Hidden);
+
+static cl::opt<unsigned> MispredictDefaultRate(
+ "mispredict-default-rate", cl::Hidden, cl::init(25),
+ cl::desc("Default mispredict rate (initialized to 25%)."));
+
+static cl::opt<bool>
+ DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
+ cl::init(false),
+ cl::desc("Disable loop-level heuristics."));
+
+namespace {
+
+class SelectOptimize : public FunctionPass {
+ const TargetMachine *TM = nullptr;
+ const TargetSubtargetInfo *TSI = nullptr;
+ const TargetLowering *TLI = nullptr;
+ const TargetTransformInfo *TTI = nullptr;
+ const LoopInfo *LI = nullptr;
+ DominatorTree *DT = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ ProfileSummaryInfo *PSI = nullptr;
+ OptimizationRemarkEmitter *ORE = nullptr;
+ TargetSchedModel TSchedModel;
+
+public:
+ static char ID;
+
+ SelectOptimize() : FunctionPass(ID) {
+ initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+private:
+ // Select groups consist of consecutive select instructions with the same
+ // condition.
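+ // An illustrative IR sketch (hypothetical, not from this pass): the two
+ // consecutive selects below share the condition %cmp and would form one
+ // select group:
+ //   %cmp = icmp slt i32 %a, %b
+ //   %x = select i1 %cmp, i32 %c, i32 %d
+ //   %y = select i1 %cmp, i32 %e, i32 %f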
+ using SelectGroup = SmallVector<SelectInst *, 2>;
+ using SelectGroups = SmallVector<SelectGroup, 2>;
+
+ using Scaled64 = ScaledNumber<uint64_t>;
+
+ struct CostInfo {
+ /// Predicated cost (with selects as conditional moves).
+ Scaled64 PredCost;
+ /// Non-predicated cost (with selects converted to branches).
+ Scaled64 NonPredCost;
+ };
+
+ // Converts select instructions of a function to conditional jumps when deemed
+ // profitable. Returns true if at least one select was converted.
+ bool optimizeSelects(Function &F);
+
+ // Heuristics for determining which select instructions can be profitably
+ // converted to branches. Separate heuristics apply to selects in inner-most
+ // loops and to the rest of the code (base heuristics for non-inner-most-loop
+ // regions).
+ void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
+ void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
+
+ // Converts to branches the select groups that were deemed
+ // profitable-to-convert.
+ void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
+
+ // Splits selects of a given basic block into select groups.
+ void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
+
+ // Determines which select groups are profitable to convert to branches
+ // (base and inner-most-loop heuristics).
+ void findProfitableSIGroupsBase(SelectGroups &SIGroups,
+ SelectGroups &ProfSIGroups);
+ void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
+ SelectGroups &ProfSIGroups);
+
+ // Determines if a select group should be converted to a branch (base
+ // heuristics).
+ bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
+
+ // Returns true if there are expensive instructions in the cold value
+ // operand's (if any) dependence slice of any of the selects of the given
+ // group.
+ bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
+
+ // For a given source instruction, collect its backwards dependence slice
+ // consisting of instructions exclusively computed for producing the operands
+ // of the source instruction.
+ void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
+ Instruction *SI, bool ForSinking = false);
+
+ // Returns true if the condition of the select is highly predictable.
+ bool isSelectHighlyPredictable(const SelectInst *SI);
+
+ // Loop-level checks to determine if a non-predicated version (with branches)
+ // of the given loop is more profitable than its predicated version.
+ bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
+
+ // Computes instruction and loop-critical-path costs for both the predicated
+ // and non-predicated version of the given loop.
+ bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
+ DenseMap<const Instruction *, CostInfo> &InstCostMap,
+ CostInfo *LoopCost);
+
+ // Returns a set of all the select instructions in the given select groups.
+ SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
+
+ // Returns the latency cost of a given instruction.
+ std::optional<uint64_t> computeInstCost(const Instruction *I);
+
+ // Returns the misprediction cost of a given select when converted to branch.
+ Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);
+
+ // Returns the cost of a branch when the prediction is correct.
+ Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
+ const SelectInst *SI);
+
+ // Returns true if the target architecture supports lowering a given select.
+ bool isSelectKindSupported(SelectInst *SI);
+};
+} // namespace
+
+char SelectOptimize::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
+ false)
+
+FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
+
+bool SelectOptimize::runOnFunction(Function &F) {
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ TSI = TM->getSubtargetImpl(F);
+ TLI = TSI->getTargetLowering();
+
+ // If none of the select types is supported then skip this pass.
+ // This is an optimization pass. Legality issues will be handled by
+ // instruction selection.
+ if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
+ !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
+ !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
+ return false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ if (!TTI->enableSelectOptimize())
+ return false;
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ BPI.reset(new BranchProbabilityInfo(F, *LI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ TSchedModel.init(TSI);
+
+ // When optimizing for size, selects are preferable over branches.
+ if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
+ return false;
+
+ return optimizeSelects(F);
+}
+
+bool SelectOptimize::optimizeSelects(Function &F) {
+ // Determine which select groups are profitable to convert to branches.
+ SelectGroups ProfSIGroups;
+ // Base heuristics apply only to non-loops and outer loops.
+ optimizeSelectsBase(F, ProfSIGroups);
+ // Separate heuristics for inner-most loops.
+ optimizeSelectsInnerLoops(F, ProfSIGroups);
+
+ // Convert to branches the select groups that were deemed
+ // profitable-to-convert.
+ convertProfitableSIGroups(ProfSIGroups);
+
+ // Code is modified if at least one select group was converted.
+ return !ProfSIGroups.empty();
+}
+
+void SelectOptimize::optimizeSelectsBase(Function &F,
+ SelectGroups &ProfSIGroups) {
+ // Collect all the select groups.
+ SelectGroups SIGroups;
+ for (BasicBlock &BB : F) {
+ // Base heuristics apply only to non-loops and outer loops.
+ Loop *L = LI->getLoopFor(&BB);
+ if (L && L->isInnermost())
+ continue;
+ collectSelectGroups(BB, SIGroups);
+ }
+
+ // Determine which select groups are profitable to convert to branches.
+ findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
+}
+
+void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
+ SelectGroups &ProfSIGroups) {
+ SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
+ // Need to check size on each iteration as we accumulate child loops.
+ for (unsigned long i = 0; i < Loops.size(); ++i)
+ for (Loop *ChildL : Loops[i]->getSubLoops())
+ Loops.push_back(ChildL);
+
+ for (Loop *L : Loops) {
+ if (!L->isInnermost())
+ continue;
+
+ SelectGroups SIGroups;
+ for (BasicBlock *BB : L->getBlocks())
+ collectSelectGroups(*BB, SIGroups);
+
+ findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
+ }
+}
+
+/// If \p isTrue is true, return the true value of \p SI, otherwise return
+/// false value of \p SI. If the true/false value of \p SI is defined by any
+/// select instructions in \p Selects, look through the defining select
+/// instruction until the true/false value is not defined in \p Selects.
+static Value *
+getTrueOrFalseValue(SelectInst *SI, bool isTrue,
+ const SmallPtrSet<const Instruction *, 2> &Selects) {
+ Value *V = nullptr;
+ for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
+ DefSI = dyn_cast<SelectInst>(V)) {
+ assert(DefSI->getCondition() == SI->getCondition() &&
+ "The condition of DefSI does not match with SI");
+ V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+ }
+ assert(V && "Failed to get select true/false value");
+ return V;
+}
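+// For illustration (hypothetical IR): given the group
+//   %s1 = select i1 %c, i32 %a, i32 %b
+//   %s2 = select i1 %c, i32 %s1, i32 %d
+// getTrueOrFalseValue(%s2, /*isTrue=*/true, {%s1, %s2}) looks through %s1
+// and returns %a, which becomes the PHI operand for the true block.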
+
+void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
+ for (SelectGroup &ASI : ProfSIGroups) {
+ // The code transformation here is a modified version of the sinking
+ // transformation in CodeGenPrepare::optimizeSelectInst with a more
+ // aggressive strategy of which instructions to sink.
+ //
+ // TODO: eliminate the redundancy of logic transforming selects to branches
+ // by removing CodeGenPrepare::optimizeSelectInst and optimizing selects
+ // here for all cases (with and without profile information).
+
+ // Transform a sequence like this:
+ // start:
+ // %cmp = cmp uge i32 %a, %b
+ // %sel = select i1 %cmp, i32 %c, i32 %d
+ //
+ // Into:
+ // start:
+ // %cmp = cmp uge i32 %a, %b
+ // %cmp.frozen = freeze %cmp
+ // br i1 %cmp.frozen, label %select.true, label %select.false
+ // select.true:
+ // br label %select.end
+ // select.false:
+ // br label %select.end
+ // select.end:
+ // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
+ //
+ // %cmp should be frozen, otherwise it may introduce undefined behavior.
+ // In addition, we may sink instructions that produce %c or %d into the
+ // destination(s) of the new branch.
+ // If the true or false blocks do not contain a sunken instruction, that
+ // block and its branch may be optimized away. In that case, one side of the
+ // first branch will point directly to select.end, and the corresponding PHI
+ // predecessor block will be the start block.
+
+ // Find all the instructions that can be soundly sunk to the true/false
+ // blocks. These are instructions that are computed solely for producing the
+ // operands of the select instructions in the group and can be sunk without
+ // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
+ // with side effects).
+ SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
+ typedef std::stack<Instruction *>::size_type StackSizeType;
+ StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
+ for (SelectInst *SI : ASI) {
+ // For each select, compute the sinkable dependence chains of the true and
+ // false operands.
+ if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
+ std::stack<Instruction *> TrueSlice;
+ getExclBackwardsSlice(TI, TrueSlice, SI, true);
+ maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
+ TrueSlices.push_back(TrueSlice);
+ }
+ if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
+ std::stack<Instruction *> FalseSlice;
+ getExclBackwardsSlice(FI, FalseSlice, SI, true);
+ maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
+ FalseSlices.push_back(FalseSlice);
+ }
+ }
+ // In the case of multiple select instructions in the same group, the order
+ // of non-dependent instructions (instructions of different dependence
+ // slices) in the true/false blocks appears to affect performance.
+ // Interleaving the slices seems, experimentally, to be the optimal approach.
+ // This interleaved scheduling allows for more ILP (with the natural downside
+ // of slightly increasing register pressure) compared to simply ordering one
+ // whole chain after another. One would expect that this ordering would
+ // not matter since the scheduling in the backend of the compiler would
+ // take care of it, but apparently the scheduler fails to deliver optimal
+ // ILP with a naive ordering here.
+ SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
+ for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
+ for (auto &S : TrueSlices) {
+ if (!S.empty()) {
+ TrueSlicesInterleaved.push_back(S.top());
+ S.pop();
+ }
+ }
+ }
+ for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
+ for (auto &S : FalseSlices) {
+ if (!S.empty()) {
+ FalseSlicesInterleaved.push_back(S.top());
+ S.pop();
+ }
+ }
+ }
+
+ // We split the block containing the select(s) into two blocks.
+ SelectInst *SI = ASI.front();
+ SelectInst *LastSI = ASI.back();
+ BasicBlock *StartBlock = SI->getParent();
+ BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
+ BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+ BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
+ // Delete the unconditional branch that was just created by the split.
+ StartBlock->getTerminator()->eraseFromParent();
+
+ // Move any debug/pseudo instructions that were in-between the select
+ // group to the newly-created end block.
+ SmallVector<Instruction *, 2> DebugPseudoINS;
+ auto DIt = SI->getIterator();
+ while (&*DIt != LastSI) {
+ if (DIt->isDebugOrPseudoInst())
+ DebugPseudoINS.push_back(&*DIt);
+ DIt++;
+ }
+ for (auto *DI : DebugPseudoINS) {
+ DI->moveBefore(&*EndBlock->getFirstInsertionPt());
+ }
+
+ // These are the new basic blocks for the conditional branch.
+ // At least one will become an actual new basic block.
+ BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
+ BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
+ if (!TrueSlicesInterleaved.empty()) {
+ TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
+ EndBlock->getParent(), EndBlock);
+ TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
+ TrueBranch->setDebugLoc(LastSI->getDebugLoc());
+ for (Instruction *TrueInst : TrueSlicesInterleaved)
+ TrueInst->moveBefore(TrueBranch);
+ }
+ if (!FalseSlicesInterleaved.empty()) {
+ FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
+ EndBlock->getParent(), EndBlock);
+ FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+ FalseBranch->setDebugLoc(LastSI->getDebugLoc());
+ for (Instruction *FalseInst : FalseSlicesInterleaved)
+ FalseInst->moveBefore(FalseBranch);
+ }
+ // If there was nothing to sink, then arbitrarily choose the 'false' side
+ // for a new input value to the PHI.
+ if (TrueBlock == FalseBlock) {
+ assert(TrueBlock == nullptr &&
+ "Unexpected basic block transform while optimizing select");
+
+ FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
+ EndBlock->getParent(), EndBlock);
+ auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+ FalseBranch->setDebugLoc(SI->getDebugLoc());
+ }
+
+ // Insert the real conditional branch based on the original condition.
+ // If we did not create a new block for one of the 'true' or 'false' paths
+ // of the condition, it means that side of the branch goes to the end block
+ // directly and the path originates from the start block from the point of
+ // view of the new PHI.
+ BasicBlock *TT, *FT;
+ if (TrueBlock == nullptr) {
+ TT = EndBlock;
+ FT = FalseBlock;
+ TrueBlock = StartBlock;
+ } else if (FalseBlock == nullptr) {
+ TT = TrueBlock;
+ FT = EndBlock;
+ FalseBlock = StartBlock;
+ } else {
+ TT = TrueBlock;
+ FT = FalseBlock;
+ }
+ IRBuilder<> IB(SI);
+ auto *CondFr =
+ IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
+ IB.CreateCondBr(CondFr, TT, FT, SI);
+
+ SmallPtrSet<const Instruction *, 2> INS;
+ INS.insert(ASI.begin(), ASI.end());
+ // Use a reverse iterator because a later select may use the value of an
+ // earlier select, and we need to propagate the value through the earlier
+ // select to get the PHI operand.
+ for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
+ SelectInst *SI = *It;
+ // The select itself is replaced with a PHI Node.
+ PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
+ PN->takeName(SI);
+ PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
+ PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
+ PN->setDebugLoc(SI->getDebugLoc());
+
+ SI->replaceAllUsesWith(PN);
+ SI->eraseFromParent();
+ INS.erase(SI);
+ ++NumSelectsConverted;
+ }
+ }
+}
+
+static bool isSpecialSelect(SelectInst *SI) {
+ using namespace llvm::PatternMatch;
+
+ // If the select is a logical-and/logical-or then it is better treated as an
+ // and/or by the backend.
+ if (match(SI, m_CombineOr(m_LogicalAnd(m_Value(), m_Value()),
+ m_LogicalOr(m_Value(), m_Value()))))
+ return true;
+
+ return false;
+}
+
+void SelectOptimize::collectSelectGroups(BasicBlock &BB,
+ SelectGroups &SIGroups) {
+ BasicBlock::iterator BBIt = BB.begin();
+ while (BBIt != BB.end()) {
+ Instruction *I = &*BBIt++;
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ if (isSpecialSelect(SI))
+ continue;
+
+ SelectGroup SIGroup;
+ SIGroup.push_back(SI);
+ while (BBIt != BB.end()) {
+ Instruction *NI = &*BBIt;
+ SelectInst *NSI = dyn_cast<SelectInst>(NI);
+ if (NSI && SI->getCondition() == NSI->getCondition()) {
+ SIGroup.push_back(NSI);
+ } else if (!NI->isDebugOrPseudoInst()) {
+ // Debug/pseudo instructions should be skipped and not prevent the
+ // formation of a select group.
+ break;
+ }
+ ++BBIt;
+ }
+
+ // If the select type is not supported, there is no point in optimizing it;
+ // instruction selection will take care of it.
+ if (!isSelectKindSupported(SI))
+ continue;
+
+ SIGroups.push_back(SIGroup);
+ }
+ }
+}
+
+void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
+ SelectGroups &ProfSIGroups) {
+ for (SelectGroup &ASI : SIGroups) {
+ ++NumSelectOptAnalyzed;
+ if (isConvertToBranchProfitableBase(ASI))
+ ProfSIGroups.push_back(ASI);
+ }
+}
+
+static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
+ DiagnosticInfoOptimizationBase &Rem) {
+ LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
+ ORE->emit(Rem);
+}
+
+void SelectOptimize::findProfitableSIGroupsInnerLoops(
+ const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
+ NumSelectOptAnalyzed += SIGroups.size();
+ // For each select group in an inner-most loop,
+ // a branch is preferable to a select/conditional-move if:
+ // i) conversion to branches for all the select groups of the loop satisfies
+ // loop-level heuristics including reducing the loop's critical path by
+ // some threshold (see SelectOptimize::checkLoopHeuristics); and
+ // ii) the total cost of the select group is cheaper with a branch compared
+ // to its predicated version. The cost is in terms of latency and the cost
+ // of a select group is the cost of its most expensive select instruction
+ // (assuming infinite resources and thus fully leveraging available ILP).
+
+ DenseMap<const Instruction *, CostInfo> InstCostMap;
+ CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
+ {Scaled64::getZero(), Scaled64::getZero()}};
+ if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
+ !checkLoopHeuristics(L, LoopCost)) {
+ return;
+ }
+
+ for (SelectGroup &ASI : SIGroups) {
+ // Assuming infinite resources, the cost of a group of instructions is the
+ // cost of the most expensive instruction of the group.
+ Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
+ for (SelectInst *SI : ASI) {
+ SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
+ BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
+ }
+ if (BranchCost < SelectCost) {
+ OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
+ OR << "Profitable to convert to branch (loop analysis). BranchCost="
+ << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
+ << ". ";
+ EmitAndPrintRemark(ORE, OR);
+ ++NumSelectConvertedLoop;
+ ProfSIGroups.push_back(ASI);
+ } else {
+ OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+ ORmiss << "Select is more profitable (loop analysis). BranchCost="
+ << BranchCost.toString()
+ << ", SelectCost=" << SelectCost.toString() << ". ";
+ EmitAndPrintRemark(ORE, ORmiss);
+ }
+ }
+}
+
+bool SelectOptimize::isConvertToBranchProfitableBase(
+ const SmallVector<SelectInst *, 2> &ASI) {
+ SelectInst *SI = ASI.front();
+ LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI << "\n");
+ OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
+ OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);
+
+ // Skip cold basic blocks. Better to optimize for size for cold blocks.
+ if (PSI->isColdBlock(SI->getParent(), BFI.get())) {
+ ++NumSelectColdBB;
+ ORmiss << "Not converted to branch because of cold basic block. ";
+ EmitAndPrintRemark(ORE, ORmiss);
+ return false;
+ }
+
+ // If unpredictable, branch form is less profitable.
+ if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
+ ++NumSelectUnPred;
+ ORmiss << "Not converted to branch because of unpredictable branch. ";
+ EmitAndPrintRemark(ORE, ORmiss);
+ return false;
+ }
+
+ // If highly predictable, branch form is more profitable, unless a
+ // predictable select is inexpensive in the target architecture.
+ if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
+ ++NumSelectConvertedHighPred;
+ OR << "Converted to branch because of highly predictable branch. ";
+ EmitAndPrintRemark(ORE, OR);
+ return true;
+ }
+
+ // Look for expensive instructions in the cold operand's (if any) dependence
+ // slice of any of the selects in the group.
+ if (hasExpensiveColdOperand(ASI)) {
+ ++NumSelectConvertedExpColdOperand;
+ OR << "Converted to branch because of expensive cold operand.";
+ EmitAndPrintRemark(ORE, OR);
+ return true;
+ }
+
+ ORmiss << "Not profitable to convert to branch (base heuristic).";
+ EmitAndPrintRemark(ORE, ORmiss);
+ return false;
+}
+
+static InstructionCost divideNearest(InstructionCost Numerator,
+ uint64_t Denominator) {
+ return (Numerator + (Denominator / 2)) / Denominator;
+}
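+// For illustration: divideNearest(7, 2) yields (7 + 1) / 2 = 4, whereas
+// plain integer division would truncate 7 / 2 to 3.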
+
+bool SelectOptimize::hasExpensiveColdOperand(
+ const SmallVector<SelectInst *, 2> &ASI) {
+ bool ColdOperand = false;
+ uint64_t TrueWeight, FalseWeight, TotalWeight;
+ if (extractBranchWeights(*ASI.front(), TrueWeight, FalseWeight)) {
+ uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
+ TotalWeight = TrueWeight + FalseWeight;
+ // Is there a path with frequency < ColdOperandThreshold% (default: 20%)?
+ ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
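+ // E.g., with hypothetical weights TrueWeight=5 and FalseWeight=95 the check
+ // is 100 * 20 > 100 * 5, so the true path is considered cold by default.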
+ } else if (PSI->hasProfileSummary()) {
+ OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
+ ORmiss << "Profile data available but missing branch-weights metadata for "
+ "select instruction. ";
+ EmitAndPrintRemark(ORE, ORmiss);
+ }
+ if (!ColdOperand)
+ return false;
+ // Check if the cold path's dependence slice is expensive for any of the
+ // selects of the group.
+ for (SelectInst *SI : ASI) {
+ Instruction *ColdI = nullptr;
+ uint64_t HotWeight;
+ if (TrueWeight < FalseWeight) {
+ ColdI = dyn_cast<Instruction>(SI->getTrueValue());
+ HotWeight = FalseWeight;
+ } else {
+ ColdI = dyn_cast<Instruction>(SI->getFalseValue());
+ HotWeight = TrueWeight;
+ }
+ if (ColdI) {
+ std::stack<Instruction *> ColdSlice;
+ getExclBackwardsSlice(ColdI, ColdSlice, SI);
+ InstructionCost SliceCost = 0;
+ while (!ColdSlice.empty()) {
+ SliceCost += TTI->getInstructionCost(ColdSlice.top(),
+ TargetTransformInfo::TCK_Latency);
+ ColdSlice.pop();
+ }
+ // The colder the cold value operand of the select is, the more expensive
+ // the cmov becomes, since it computes the cold value operand every time.
+ // Thus, the colder the cold operand is, the more its cost counts.
+ // Get nearest integer cost adjusted for coldness.
+ InstructionCost AdjSliceCost =
+ divideNearest(SliceCost * HotWeight, TotalWeight);
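+ // E.g., with a hypothetical SliceCost=5, HotWeight=95 and TotalWeight=100,
+ // AdjSliceCost = divideNearest(475, 100) = 5, which is then compared
+ // against ColdOperandMaxCostMultiplier * TCC_Expensive below.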
+ if (AdjSliceCost >=
+ ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
+ return true;
+ }
+ }
+ return false;
+}
+
+// Check if it is safe to move LoadI next to the SI.
+// Conservatively assume it is safe only if there is no instruction
+// modifying memory in-between the load and the select instruction.
+static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
+ // Assume loads from different basic blocks are unsafe to move.
+ if (LoadI->getParent() != SI->getParent())
+ return false;
+ auto It = LoadI->getIterator();
+ while (&*It != SI) {
+ if (It->mayWriteToMemory())
+ return false;
+ It++;
+ }
+ return true;
+}
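+// E.g., a store between LoadI and SI could write to the loaded address, so
+// sinking the load below the new branch might observe a different value;
+// any mayWriteToMemory instruction in between therefore blocks the sink.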
+
+// For a given source instruction, collect its backwards dependence slice
+// consisting of instructions exclusively computed for the purpose of producing
+// the operands of the source instruction. As an approximation
+// (sufficiently-accurate in practice), we populate this set with the
+ // instructions of the backwards dependence slice that have only one use and
+ // form a one-use chain that leads to the source instruction.
+void SelectOptimize::getExclBackwardsSlice(Instruction *I,
+ std::stack<Instruction *> &Slice,
+ Instruction *SI, bool ForSinking) {
+ SmallPtrSet<Instruction *, 2> Visited;
+ std::queue<Instruction *> Worklist;
+ Worklist.push(I);
+ while (!Worklist.empty()) {
+ Instruction *II = Worklist.front();
+ Worklist.pop();
+
+ // Avoid cycles.
+ if (!Visited.insert(II).second)
+ continue;
+
+ if (!II->hasOneUse())
+ continue;
+
+ // Cannot soundly sink instructions with side-effects.
+ // Terminator or phi instructions cannot be sunk.
+ // Avoid sinking other select instructions (should be handled separately).
+ if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
+ isa<SelectInst>(II) || isa<PHINode>(II)))
+ continue;
+
+ // Avoid sinking loads so as not to skip state-modifying instructions that
+ // may alias with the loaded address.
+ // Only allow sinking of loads within the same basic block that are
+ // conservatively proven to be safe.
+ if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
+ continue;
+
+ // Avoid considering instructions with less frequency than the source
+ // instruction (i.e., avoid colder code regions of the dependence slice).
+ if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
+ continue;
+
+ // Eligible one-use instruction added to the dependence slice.
+ Slice.push(II);
+
+ // Explore all the operands of the current instruction to expand the slice.
+ for (unsigned k = 0; k < II->getNumOperands(); ++k)
+ if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
+ Worklist.push(OpI);
+ }
+}
+
+bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
+ uint64_t TrueWeight, FalseWeight;
+ if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
+ uint64_t Max = std::max(TrueWeight, FalseWeight);
+ uint64_t Sum = TrueWeight + FalseWeight;
+ if (Sum != 0) {
+ auto Probability = BranchProbability::getBranchProbability(Max, Sum);
+ if (Probability > TTI->getPredictableBranchThreshold())
+ return true;
+ }
+ }
+ return false;
+}
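+// For illustration (hypothetical weights): branch_weights of 2000:1 give a
+// probability of 2000/2001 (~99.95%), which exceeds a typical predictable
+// branch threshold (commonly around 99%), so such a select is treated as
+// highly predictable.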
+
+bool SelectOptimize::checkLoopHeuristics(const Loop *L,
+ const CostInfo LoopCost[2]) {
+ // Loop-level checks to determine if a non-predicated version (with branches)
+ // of the loop is more profitable than its predicated version.
+
+ if (DisableLoopLevelHeuristics)
+ return true;
+
+ OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
+ L->getHeader()->getFirstNonPHI());
+
+ if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
+ LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
+ ORmissL << "No select conversion in the loop due to no reduction of loop's "
+ "critical path. ";
+ EmitAndPrintRemark(ORE, ORmissL);
+ return false;
+ }
+
+ Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
+ LoopCost[1].PredCost - LoopCost[1].NonPredCost};
+
+ // For conversion to branches to be profitable, it needs to reduce the loop's
+ // critical path by at least some threshold (an absolute gain of
+ // GainCycleThreshold cycles and a relative gain of 12.5% by default).
+ if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
+ Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
+ Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
+ ORmissL << "No select conversion in the loop due to small reduction of "
+ "loop's critical path. Gain="
+ << Gain[1].toString()
+ << ", RelativeGain=" << RelativeGain.toString() << "%. ";
+ EmitAndPrintRemark(ORE, ORmissL);
+ return false;
+ }
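+ // For illustration with the defaults (GainCycleThreshold=4,
+ // GainRelativeThreshold=8): a loop whose predicated critical path is a
+ // hypothetical 40 cycles needs a gain of at least max(4, 40/8) = 5 cycles
+ // in the second iteration to pass the check above.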
+
+ // If the loop's critical path involves loop-carried dependences, the gradient
+ // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
+ // This check ensures that the latency of the loop's critical path keeps
+ // decreasing at a sufficient rate beyond the two analyzed loop iterations.
+ if (Gain[1] > Gain[0]) {
+ Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
+ (LoopCost[1].PredCost - LoopCost[0].PredCost);
+ if (GradientGain < Scaled64::get(GainGradientThreshold)) {
+ ORmissL << "No select conversion in the loop due to small gradient gain. "
+ "GradientGain="
+ << GradientGain.toString() << "%. ";
+ EmitAndPrintRemark(ORE, ORmissL);
+ return false;
+ }
+ }
+ // If the gain decreases it is not profitable to convert.
+ else if (Gain[1] < Gain[0]) {
+ ORmissL
+ << "No select conversion in the loop due to negative gradient gain. ";
+ EmitAndPrintRemark(ORE, ORmissL);
+ return false;
+ }
+
+ // Non-predicated version of the loop is more profitable than its
+ // predicated version.
+ return true;
+}
+
+// Computes instruction and loop-critical-path costs for both the predicated
+// and non-predicated version of the given loop.
+// Returns false if unable to compute these costs due to invalid cost of loop
+// instruction(s).
+bool SelectOptimize::computeLoopCosts(
+ const Loop *L, const SelectGroups &SIGroups,
+ DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
+ LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
+ << L->getHeader()->getName() << "\n");
+ const auto &SIset = getSIset(SIGroups);
+ // Compute instruction and loop-critical-path costs across two iterations for
+ // both predicated and non-predicated version.
+ const unsigned Iterations = 2;
+ for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
+ // Cost of the loop's critical path.
+ CostInfo &MaxCost = LoopCost[Iter];
+ for (BasicBlock *BB : L->getBlocks()) {
+ for (const Instruction &I : *BB) {
+ if (I.isDebugOrPseudoInst())
+ continue;
+ // Compute the predicated and non-predicated cost of the instruction.
+ Scaled64 IPredCost = Scaled64::getZero(),
+ INonPredCost = Scaled64::getZero();
+
+ // Assume infinite resources that allow to fully exploit the available
+ // instruction-level parallelism.
+ // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
+ for (const Use &U : I.operands()) {
+ auto UI = dyn_cast<Instruction>(U.get());
+ if (!UI)
+ continue;
+ if (InstCostMap.count(UI)) {
+ IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
+ INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
+ }
+ }
+ auto ILatency = computeInstCost(&I);
+ if (!ILatency) {
+ OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
+ ORmissL << "Invalid instruction cost preventing analysis and "
+ "optimization of the inner-most loop containing this "
+ "instruction. ";
+ EmitAndPrintRemark(ORE, ORmissL);
+ return false;
+ }
+ IPredCost += Scaled64::get(*ILatency);
+ INonPredCost += Scaled64::get(*ILatency);
+
+ // For a select that can be converted to branch,
+ // compute its cost as a branch (non-predicated cost).
+ //
+ // BranchCost = PredictedPathCost + MispredictCost
+ // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
+ // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
+ if (SIset.contains(&I)) {
+ auto SI = cast<SelectInst>(&I);
+
+ Scaled64 TrueOpCost = Scaled64::getZero(),
+ FalseOpCost = Scaled64::getZero();
+ if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
+ if (InstCostMap.count(TI))
+ TrueOpCost = InstCostMap[TI].NonPredCost;
+ if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
+ if (InstCostMap.count(FI))
+ FalseOpCost = InstCostMap[FI].NonPredCost;
+ Scaled64 PredictedPathCost =
+ getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
+
+ Scaled64 CondCost = Scaled64::getZero();
+ if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
+ if (InstCostMap.count(CI))
+ CondCost = InstCostMap[CI].NonPredCost;
+ Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
+
+ INonPredCost = PredictedPathCost + MispredictCost;
+ }
+ LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
+ << INonPredCost << " for " << I << "\n");
+
+ InstCostMap[&I] = {IPredCost, INonPredCost};
+ MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
+ MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
+ << " MaxCost = " << MaxCost.PredCost << " "
+ << MaxCost.NonPredCost << "\n");
+ }
+ return true;
+}
+
+SmallPtrSet<const Instruction *, 2>
+SelectOptimize::getSIset(const SelectGroups &SIGroups) {
+ SmallPtrSet<const Instruction *, 2> SIset;
+ for (const SelectGroup &ASI : SIGroups)
+ for (const SelectInst *SI : ASI)
+ SIset.insert(SI);
+ return SIset;
+}
+
+std::optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
+ InstructionCost ICost =
+ TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
+ if (auto OC = ICost.getValue())
+ return std::optional<uint64_t>(*OC);
+ return std::nullopt;
+}
+
+ScaledNumber<uint64_t>
+SelectOptimize::getMispredictionCost(const SelectInst *SI,
+ const Scaled64 CondCost) {
+ uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+
+ // Account for the default misprediction rate when using a branch
+ // (conservatively set to 25% by default).
+ uint64_t MispredictRate = MispredictDefaultRate;
+ // If the select condition is obviously predictable, then the misprediction
+ // rate is zero.
+ if (isSelectHighlyPredictable(SI))
+ MispredictRate = 0;
+
+ // CondCost is included to account for cases where the computation of the
+ // condition is part of a long dependence chain (potentially loop-carried)
+ // that would delay detection of a misprediction and increase its cost.
+ Scaled64 MispredictCost =
+ std::max(Scaled64::get(MispredictPenalty), CondCost) *
+ Scaled64::get(MispredictRate);
+ MispredictCost /= Scaled64::get(100);
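+ // For illustration (hypothetical values): with MispredictPenalty=14,
+ // CondCost=3 and the default 25% rate, the cost is max(14, 3) * 25 / 100 =
+ // 3.5 cycles; a highly predictable select contributes 0 instead.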
+
+ return MispredictCost;
+}
+
+// Returns the cost of a branch when the prediction is correct.
+// TrueCost * TrueProbability + FalseCost * FalseProbability.
+ScaledNumber<uint64_t>
+SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
+ const SelectInst *SI) {
+ Scaled64 PredPathCost;
+ uint64_t TrueWeight, FalseWeight;
+ if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
+ uint64_t SumWeight = TrueWeight + FalseWeight;
+ if (SumWeight != 0) {
+ PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
+ FalseCost * Scaled64::get(FalseWeight);
+ PredPathCost /= Scaled64::get(SumWeight);
+ return PredPathCost;
+ }
+ }
+ // Without branch weight metadata, we assume 75% for the one path and 25% for
+ // the other, and pick the result with the biggest cost.
+ PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
+ FalseCost * Scaled64::get(3) + TrueCost);
+ PredPathCost /= Scaled64::get(4);
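+ // E.g., with hypothetical TrueCost=8 and FalseCost=2 this picks
+ // max(8*3 + 2, 2*3 + 8) / 4 = 26 / 4 = 6.5 as the predicted path cost.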
+ return PredPathCost;
+}
+
+bool SelectOptimize::isSelectKindSupported(SelectInst *SI) {
+ bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
+ if (VectorCond)
+ return false;
+ TargetLowering::SelectSupportKind SelectKind;
+ if (SI->getType()->isVectorTy())
+ SelectKind = TargetLowering::ScalarCondVectorVal;
+ else
+ SelectKind = TargetLowering::ScalarValSelect;
+ return TLI->isSelectSupported(SelectKind);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
new file mode 100644
index 000000000000..235f0da86b90
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -0,0 +1,27593 @@
+//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
+// both before and after the DAG is legalized.
+//
+// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
+// primarily intended to handle simplification opportunities that are implicit
+// in the LLVM IR and exposed by the various codegen lowering phases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ByteProvider.h"
+#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <optional>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <variant>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dagcombine"
+
+STATISTIC(NodesCombined , "Number of dag nodes combined");
+STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
+STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
+STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
+STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
+STATISTIC(SlicedLoads, "Number of loads sliced");
+STATISTIC(NumFPLogicOpsConv, "Number of logic ops converted to fp ops");
+
+static cl::opt<bool>
+CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
+ cl::desc("Enable DAG combiner's use of IR alias analysis"));
+
+static cl::opt<bool>
+UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
+ cl::desc("Enable DAG combiner's use of TBAA"));
+
+#ifndef NDEBUG
+static cl::opt<std::string>
+CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
+ cl::desc("Only use DAG-combiner alias analysis in this"
+ " function"));
+#endif
+
+/// Hidden option to stress test load slicing, i.e., when this option
+/// is enabled, load slicing bypasses most of its profitability guards.
+static cl::opt<bool>
+StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
+ cl::desc("Bypass the profitability model of load slicing"),
+ cl::init(false));
+
+static cl::opt<bool>
+ MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
+ cl::desc("DAG combiner may split indexing from loads"));
+
+static cl::opt<bool>
+ EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true),
+ cl::desc("DAG combiner enable merging multiple stores "
+ "into a wider store"));
+
+static cl::opt<unsigned> TokenFactorInlineLimit(
+ "combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048),
+ cl::desc("Limit the number of operands to inline for Token Factors"));
+
+static cl::opt<unsigned> StoreMergeDependenceLimit(
+ "combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10),
+ cl::desc("Limit the number of times for the same StoreNode and RootNode "
+ "to bail out in store merging dependence check"));
+
+static cl::opt<bool> EnableReduceLoadOpStoreWidth(
+ "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
+ cl::desc("DAG combiner enable reducing the width of load/op/store "
+ "sequence"));
+
+static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
+ "combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
+ cl::desc("DAG combiner enable load/<replace bytes>/store with "
+ "a narrower store"));
+
+static cl::opt<bool> EnableVectorFCopySignExtendRound(
+ "combiner-vector-fcopysign-extend-round", cl::Hidden, cl::init(false),
+ cl::desc(
+ "Enable merging extends and rounds into FCOPYSIGN on vector types"));
+
+namespace {
+
+ class DAGCombiner {
+ SelectionDAG &DAG;
+ const TargetLowering &TLI;
+ const SelectionDAGTargetInfo *STI;
+ CombineLevel Level = BeforeLegalizeTypes;
+ CodeGenOpt::Level OptLevel;
+ bool LegalDAG = false;
+ bool LegalOperations = false;
+ bool LegalTypes = false;
+ bool ForCodeSize;
+ bool DisableGenericCombines;
+
+ /// Worklist of all of the nodes that need to be simplified.
+ ///
+ /// This must behave as a stack -- new nodes to process are pushed onto the
+ /// back and when processing we pop off of the back.
+ ///
+ /// The worklist will not contain duplicates but may contain null entries
+ /// due to nodes being deleted from the underlying DAG.
+ SmallVector<SDNode *, 64> Worklist;
+
+ /// Mapping from an SDNode to its position on the worklist.
+ ///
+ /// This is used to find and remove nodes from the worklist (by nulling
+ /// them) when they are deleted from the underlying DAG. It relies on
+ /// stable indices of nodes within the worklist.
+ DenseMap<SDNode *, unsigned> WorklistMap;
+
+ /// This records all nodes attempted to be added to the worklist since we
+ /// considered a new worklist entry. As we do not add duplicate nodes to the
+ /// worklist, this is different from the tail of the worklist.
+ SmallSetVector<SDNode *, 32> PruningList;
+
+ /// Set of nodes which have been combined (at least once).
+ ///
+ /// This is used to allow us to reliably add any operands of a DAG node
+ /// which have not yet been combined to the worklist.
+ SmallPtrSet<SDNode *, 32> CombinedNodes;
+
+ /// Map from candidate StoreNode to the pair of RootNode and count.
+ /// The count is used to track how many times we have seen the StoreNode
+ /// with the same RootNode bail out in the dependence check. If we have seen
+ /// the bail out for the same pair many times over a limit, we won't
+ /// consider the StoreNode with the same RootNode as a store merging
+ /// candidate again.
+ DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap;
+
+ // AA - Used for DAG load/store alias analysis.
+ AliasAnalysis *AA;
+
+ /// When an instruction is simplified, add all users of the instruction to
+ /// the work lists because they might get more simplified now.
+ void AddUsersToWorklist(SDNode *N) {
+ for (SDNode *Node : N->uses())
+ AddToWorklist(Node);
+ }
+
+ /// Convenient shorthand to add a node and all of its users to the worklist.
+ void AddToWorklistWithUsers(SDNode *N) {
+ AddUsersToWorklist(N);
+ AddToWorklist(N);
+ }
+
+ // Prune potentially dangling nodes. This is called after
+ // any visit to a node, but should also be called during a visit after any
+ // failed combine which may have created a DAG node.
+ void clearAddedDanglingWorklistEntries() {
+ // Check any nodes added to the worklist to see if they are prunable.
+ while (!PruningList.empty()) {
+ auto *N = PruningList.pop_back_val();
+ if (N->use_empty())
+ recursivelyDeleteUnusedNodes(N);
+ }
+ }
+
+ SDNode *getNextWorklistEntry() {
+ // Before we do any work, remove nodes that are not in use.
+ clearAddedDanglingWorklistEntries();
+ SDNode *N = nullptr;
+ // The Worklist holds the SDNodes in order, but it may contain null
+ // entries.
+ while (!N && !Worklist.empty()) {
+ N = Worklist.pop_back_val();
+ }
+
+ if (N) {
+ bool GoodWorklistEntry = WorklistMap.erase(N);
+ (void)GoodWorklistEntry;
+ assert(GoodWorklistEntry &&
+ "Found a worklist entry without a corresponding map entry!");
+ }
+ return N;
+ }
+
+ /// Call the node-specific routine that folds each particular type of node.
+ SDValue visit(SDNode *N);
+
+ public:
+ DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
+ : DAG(D), TLI(D.getTargetLoweringInfo()),
+ STI(D.getSubtarget().getSelectionDAGInfo()), OptLevel(OL), AA(AA) {
+ ForCodeSize = DAG.shouldOptForSize();
+ DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel);
+
+ MaximumLegalStoreInBits = 0;
+ // We use the minimum store size here, since that's all we can guarantee
+ // for the scalable vector types.
+ for (MVT VT : MVT::all_valuetypes())
+ if (EVT(VT).isSimple() && VT != MVT::Other &&
+ TLI.isTypeLegal(EVT(VT)) &&
+ VT.getSizeInBits().getKnownMinValue() >= MaximumLegalStoreInBits)
+ MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinValue();
+ }
+
+ void ConsiderForPruning(SDNode *N) {
+ // Mark this for potential pruning.
+ PruningList.insert(N);
+ }
+
+ /// Add to the worklist, making sure its instance is at the back (next to be
+ /// processed).
+ void AddToWorklist(SDNode *N, bool IsCandidateForPruning = true) {
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ "Deleted Node added to Worklist");
+
+ // Skip handle nodes as they can't usefully be combined and confuse the
+ // zero-use deletion strategy.
+ if (N->getOpcode() == ISD::HANDLENODE)
+ return;
+
+ if (IsCandidateForPruning)
+ ConsiderForPruning(N);
+
+ if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
+ Worklist.push_back(N);
+ }
+
+ /// Remove all instances of N from the worklist.
+ void removeFromWorklist(SDNode *N) {
+ CombinedNodes.erase(N);
+ PruningList.remove(N);
+ StoreRootCountMap.erase(N);
+
+ auto It = WorklistMap.find(N);
+ if (It == WorklistMap.end())
+ return; // Not in the worklist.
+
+ // Null out the entry rather than erasing it to avoid a linear operation.
+ Worklist[It->second] = nullptr;
+ WorklistMap.erase(It);
+ }
+
+ void deleteAndRecombine(SDNode *N);
+ bool recursivelyDeleteUnusedNodes(SDNode *N);
+
+ /// Replaces all uses of the results of one DAG node with new values.
+ SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+ bool AddTo = true);
+
+ /// Replaces all uses of the results of one DAG node with new values.
+ SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
+ return CombineTo(N, &Res, 1, AddTo);
+ }
+
+ /// Replaces all uses of the results of one DAG node with new values.
+ SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
+ bool AddTo = true) {
+ SDValue To[] = { Res0, Res1 };
+ return CombineTo(N, To, 2, AddTo);
+ }
+
+ void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
+
+ private:
+ unsigned MaximumLegalStoreInBits;
+
+ /// Check the specified integer node value to see if it can be simplified or
+ /// if things it uses can be simplified by bit propagation.
+ /// If so, return true.
+ bool SimplifyDemandedBits(SDValue Op) {
+ unsigned BitWidth = Op.getScalarValueSizeInBits();
+ APInt DemandedBits = APInt::getAllOnes(BitWidth);
+ return SimplifyDemandedBits(Op, DemandedBits);
+ }
+
+ bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) {
+ TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+ KnownBits Known;
+ if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false))
+ return false;
+
+ // Revisit the node.
+ AddToWorklist(Op.getNode());
+
+ CommitTargetLoweringOpt(TLO);
+ return true;
+ }
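+ // For illustration: a caller that knows only the low 8 bits of Op are
+ // observed (say, Op feeds a truncating store to i8) can pass a mask of
+ // 0xFF here, which may let TLI narrow or fold the nodes computing Op's
+ // upper bits.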
+
+ /// Check the specified vector node value to see if it can be simplified or
+ /// if things it uses can be simplified as it only uses some of the
+ /// elements. If so, return true.
+ bool SimplifyDemandedVectorElts(SDValue Op) {
+ // TODO: For now just pretend it cannot be simplified.
+ if (Op.getValueType().isScalableVector())
+ return false;
+
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ APInt DemandedElts = APInt::getAllOnes(NumElts);
+ return SimplifyDemandedVectorElts(Op, DemandedElts);
+ }
+
+ bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ bool AssumeSingleUse = false);
+ bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
+ bool AssumeSingleUse = false);
+
+ bool CombineToPreIndexedLoadStore(SDNode *N);
+ bool CombineToPostIndexedLoadStore(SDNode *N);
+ SDValue SplitIndexingFromLoad(LoadSDNode *LD);
+ bool SliceUpLoad(SDNode *N);
+
+ // Looks up the chain to find a unique (unaliased) store feeding the passed
+ // load. If no such store is found, returns a nullptr.
+ // Note: This will look past a CALLSEQ_START if the load is chained to it so
+ // that it can find stack stores for byval params.
+ StoreSDNode *getUniqueStoreFeeding(LoadSDNode *LD, int64_t &Offset);
+ // Scalars have size 0 to distinguish from singleton vectors.
+ SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD);
+ bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
+ bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);
+
+ /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
+ /// load.
+ ///
+ /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
+ /// \param InVecVT type of the input vector to EVE with bitcasts resolved.
+ /// \param EltNo index of the vector element to load.
+ /// \param OriginalLoad load that EVE came from to be replaced.
+ /// \returns EVE on success SDValue() on failure.
+ SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
+ SDValue EltNo,
+ LoadSDNode *OriginalLoad);
+ void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
+ SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
+ SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
+ SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
+ SDValue PromoteIntBinOp(SDValue Op);
+ SDValue PromoteIntShiftOp(SDValue Op);
+ SDValue PromoteExtend(SDValue Op);
+ bool PromoteLoad(SDValue Op);
+
+ SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True, SDValue False,
+ ISD::CondCode CC);
+
+ /// Call the node-specific routine that knows how to fold each
+ /// particular type of node. If that doesn't do anything, try the
+ /// target-specific DAG combines.
+ SDValue combine(SDNode *N);
+
+ // Visitation implementation - Implement dag node combining for different
+ // node types. The semantics are as follows:
+ // Return Value:
+ // SDValue.getNode() == 0 - No change was made
+ // SDValue.getNode() == N - N was replaced, is dead and has been handled.
+ // otherwise - N should be replaced by the returned Operand.
+ //
+ SDValue visitTokenFactor(SDNode *N);
+ SDValue visitMERGE_VALUES(SDNode *N);
+ SDValue visitADD(SDNode *N);
+ SDValue visitADDLike(SDNode *N);
+ SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference);
+ SDValue visitSUB(SDNode *N);
+ SDValue visitADDSAT(SDNode *N);
+ SDValue visitSUBSAT(SDNode *N);
+ SDValue visitADDC(SDNode *N);
+ SDValue visitADDO(SDNode *N);
+ SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
+ SDValue visitSUBC(SDNode *N);
+ SDValue visitSUBO(SDNode *N);
+ SDValue visitADDE(SDNode *N);
+ SDValue visitUADDO_CARRY(SDNode *N);
+ SDValue visitSADDO_CARRY(SDNode *N);
+ SDValue visitUADDO_CARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
+ SDNode *N);
+ SDValue visitSUBE(SDNode *N);
+ SDValue visitUSUBO_CARRY(SDNode *N);
+ SDValue visitSSUBO_CARRY(SDNode *N);
+ SDValue visitMUL(SDNode *N);
+ SDValue visitMULFIX(SDNode *N);
+ SDValue useDivRem(SDNode *N);
+ SDValue visitSDIV(SDNode *N);
+ SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
+ SDValue visitUDIV(SDNode *N);
+ SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N);
+ SDValue visitREM(SDNode *N);
+ SDValue visitMULHU(SDNode *N);
+ SDValue visitMULHS(SDNode *N);
+ SDValue visitAVG(SDNode *N);
+ SDValue visitABD(SDNode *N);
+ SDValue visitSMUL_LOHI(SDNode *N);
+ SDValue visitUMUL_LOHI(SDNode *N);
+ SDValue visitMULO(SDNode *N);
+ SDValue visitIMINMAX(SDNode *N);
+ SDValue visitAND(SDNode *N);
+ SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
+ SDValue visitOR(SDNode *N);
+ SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
+ SDValue visitXOR(SDNode *N);
+ SDValue SimplifyVCastOp(SDNode *N, const SDLoc &DL);
+ SDValue SimplifyVBinOp(SDNode *N, const SDLoc &DL);
+ SDValue visitSHL(SDNode *N);
+ SDValue visitSRA(SDNode *N);
+ SDValue visitSRL(SDNode *N);
+ SDValue visitFunnelShift(SDNode *N);
+ SDValue visitSHLSAT(SDNode *N);
+ SDValue visitRotate(SDNode *N);
+ SDValue visitABS(SDNode *N);
+ SDValue visitBSWAP(SDNode *N);
+ SDValue visitBITREVERSE(SDNode *N);
+ SDValue visitCTLZ(SDNode *N);
+ SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
+ SDValue visitCTTZ(SDNode *N);
+ SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
+ SDValue visitCTPOP(SDNode *N);
+ SDValue visitSELECT(SDNode *N);
+ SDValue visitVSELECT(SDNode *N);
+ SDValue visitSELECT_CC(SDNode *N);
+ SDValue visitSETCC(SDNode *N);
+ SDValue visitSETCCCARRY(SDNode *N);
+ SDValue visitSIGN_EXTEND(SDNode *N);
+ SDValue visitZERO_EXTEND(SDNode *N);
+ SDValue visitANY_EXTEND(SDNode *N);
+ SDValue visitAssertExt(SDNode *N);
+ SDValue visitAssertAlign(SDNode *N);
+ SDValue visitSIGN_EXTEND_INREG(SDNode *N);
+ SDValue visitEXTEND_VECTOR_INREG(SDNode *N);
+ SDValue visitTRUNCATE(SDNode *N);
+ SDValue visitBITCAST(SDNode *N);
+ SDValue visitFREEZE(SDNode *N);
+ SDValue visitBUILD_PAIR(SDNode *N);
+ SDValue visitFADD(SDNode *N);
+ SDValue visitVP_FADD(SDNode *N);
+ SDValue visitVP_FSUB(SDNode *N);
+ SDValue visitSTRICT_FADD(SDNode *N);
+ SDValue visitFSUB(SDNode *N);
+ SDValue visitFMUL(SDNode *N);
+ template <class MatchContextClass> SDValue visitFMA(SDNode *N);
+ SDValue visitFDIV(SDNode *N);
+ SDValue visitFREM(SDNode *N);
+ SDValue visitFSQRT(SDNode *N);
+ SDValue visitFCOPYSIGN(SDNode *N);
+ SDValue visitFPOW(SDNode *N);
+ SDValue visitSINT_TO_FP(SDNode *N);
+ SDValue visitUINT_TO_FP(SDNode *N);
+ SDValue visitFP_TO_SINT(SDNode *N);
+ SDValue visitFP_TO_UINT(SDNode *N);
+ SDValue visitFP_ROUND(SDNode *N);
+ SDValue visitFP_EXTEND(SDNode *N);
+ SDValue visitFNEG(SDNode *N);
+ SDValue visitFABS(SDNode *N);
+ SDValue visitFCEIL(SDNode *N);
+ SDValue visitFTRUNC(SDNode *N);
+ SDValue visitFFREXP(SDNode *N);
+ SDValue visitFFLOOR(SDNode *N);
+ SDValue visitFMinMax(SDNode *N);
+ SDValue visitBRCOND(SDNode *N);
+ SDValue visitBR_CC(SDNode *N);
+ SDValue visitLOAD(SDNode *N);
+
+ SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
+ SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
+ SDValue replaceStoreOfInsertLoad(StoreSDNode *ST);
+
+ bool refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(SDNode *N);
+
+ SDValue visitSTORE(SDNode *N);
+ SDValue visitLIFETIME_END(SDNode *N);
+ SDValue visitINSERT_VECTOR_ELT(SDNode *N);
+ SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue visitBUILD_VECTOR(SDNode *N);
+ SDValue visitCONCAT_VECTORS(SDNode *N);
+ SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
+ SDValue visitVECTOR_SHUFFLE(SDNode *N);
+ SDValue visitSCALAR_TO_VECTOR(SDNode *N);
+ SDValue visitINSERT_SUBVECTOR(SDNode *N);
+ SDValue visitMLOAD(SDNode *N);
+ SDValue visitMSTORE(SDNode *N);
+ SDValue visitMGATHER(SDNode *N);
+ SDValue visitMSCATTER(SDNode *N);
+ SDValue visitVPGATHER(SDNode *N);
+ SDValue visitVPSCATTER(SDNode *N);
+ SDValue visitFP_TO_FP16(SDNode *N);
+ SDValue visitFP16_TO_FP(SDNode *N);
+ SDValue visitFP_TO_BF16(SDNode *N);
+ SDValue visitVECREDUCE(SDNode *N);
+ SDValue visitVPOp(SDNode *N);
+ SDValue visitGET_FPENV_MEM(SDNode *N);
+ SDValue visitSET_FPENV_MEM(SDNode *N);
+
+ template <class MatchContextClass>
+ SDValue visitFADDForFMACombine(SDNode *N);
+ template <class MatchContextClass>
+ SDValue visitFSUBForFMACombine(SDNode *N);
+ SDValue visitFMULForFMADistributiveCombine(SDNode *N);
+
+ SDValue XformToShuffleWithZero(SDNode *N);
+ bool reassociationCanBreakAddressingModePattern(unsigned Opc,
+ const SDLoc &DL,
+ SDNode *N,
+ SDValue N0,
+ SDValue N1);
+ SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
+ SDValue N1, SDNodeFlags Flags);
+ SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+ SDValue N1, SDNodeFlags Flags);
+ SDValue reassociateReduction(unsigned ResOpc, unsigned Opc, const SDLoc &DL,
+ EVT VT, SDValue N0, SDValue N1,
+ SDNodeFlags Flags = SDNodeFlags());
+
+ SDValue visitShiftByConstant(SDNode *N);
+
+ SDValue foldSelectOfConstants(SDNode *N);
+ SDValue foldVSelectOfConstants(SDNode *N);
+ SDValue foldBinOpIntoSelect(SDNode *BO);
+ bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
+ SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N);
+ SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
+ SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2, SDValue N3, ISD::CondCode CC,
+ bool NotExtCompare = false);
+ SDValue convertSelectOfFPConstantsToLoadOffset(
+ const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
+ ISD::CondCode CC);
+ SDValue foldSignChangeInBitcast(SDNode *N);
+ SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2, SDValue N3, ISD::CondCode CC);
+ SDValue foldSelectOfBinops(SDNode *N);
+ SDValue foldSextSetcc(SDNode *N);
+ SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
+ const SDLoc &DL);
+ SDValue foldSubToUSubSat(EVT DstVT, SDNode *N);
+ SDValue foldABSToABD(SDNode *N);
+ SDValue unfoldMaskedMerge(SDNode *N);
+ SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
+ SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
+ const SDLoc &DL, bool foldBooleans);
+ SDValue rebuildSetCC(SDValue N);
+
+ bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
+ SDValue &CC, bool MatchStrict = false) const;
+ bool isOneUseSetCC(SDValue N) const;
+
+ SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+ unsigned HiOp);
+ SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+ SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI);
+
+ SDValue CombineExtLoad(SDNode *N);
+ SDValue CombineZExtLogicopShiftLoad(SDNode *N);
+ SDValue combineRepeatedFPDivisors(SDNode *N);
+ SDValue mergeInsertEltWithShuffle(SDNode *N, unsigned InsIndex);
+ SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
+ SDValue combineInsertEltToLoad(SDNode *N, unsigned InsIndex);
+ SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
+ SDValue BuildSDIV(SDNode *N);
+ SDValue BuildSDIVPow2(SDNode *N);
+ SDValue BuildUDIV(SDNode *N);
+ SDValue BuildSREMPow2(SDNode *N);
+ SDValue buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N);
+ SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
+ SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
+ SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
+ SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
+ SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
+ SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
+ SDNodeFlags Flags, bool Reciprocal);
+ SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
+ SDNodeFlags Flags, bool Reciprocal);
+ SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
+ bool DemandHighBits = true);
+ SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
+ SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
+ SDValue InnerPos, SDValue InnerNeg, bool HasPos,
+ unsigned PosOpcode, unsigned NegOpcode,
+ const SDLoc &DL);
+ SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
+ SDValue InnerPos, SDValue InnerNeg, bool HasPos,
+ unsigned PosOpcode, unsigned NegOpcode,
+ const SDLoc &DL);
+ SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+ SDValue MatchLoadCombine(SDNode *N);
+ SDValue mergeTruncStores(StoreSDNode *N);
+ SDValue reduceLoadWidth(SDNode *N);
+ SDValue ReduceLoadOpStoreWidth(SDNode *N);
+ SDValue splitMergedValStore(StoreSDNode *ST);
+ SDValue TransformFPLoadStorePair(SDNode *N);
+ SDValue convertBuildVecZextToZext(SDNode *N);
+ SDValue convertBuildVecZextToBuildVecWithZeros(SDNode *N);
+ SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
+ SDValue reduceBuildVecTruncToBitCast(SDNode *N);
+ SDValue reduceBuildVecToShuffle(SDNode *N);
+ SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
+ ArrayRef<int> VectorMask, SDValue VecIn1,
+ SDValue VecIn2, unsigned LeftIdx,
+ bool DidSplitVec);
+ SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
+
+ /// Walk up chain skipping non-aliasing memory nodes,
+ /// looking for aliasing nodes and adding them to the Aliases vector.
+ void GatherAllAliases(SDNode *N, SDValue OriginalChain,
+ SmallVectorImpl<SDValue> &Aliases);
+
+ /// Return true if there is any possibility that the two addresses overlap.
+ bool mayAlias(SDNode *Op0, SDNode *Op1) const;
+
+ /// Walk up chain skipping non-aliasing memory nodes, looking for a better
+  /// chain (aliasing node).
+ SDValue FindBetterChain(SDNode *N, SDValue Chain);
+
+ /// Try to replace a store and any possibly adjacent stores on
+ /// consecutive chains with better chains. Return true only if St is
+ /// replaced.
+ ///
+ /// Notice that other chains may still be replaced even if the function
+ /// returns false.
+ bool findBetterNeighborChains(StoreSDNode *St);
+
+  // Helper for findBetterNeighborChains. Walk up the store chain, adding
+  // additional chained stores that do not overlap and can be parallelized.
+ bool parallelizeChainedStores(StoreSDNode *St);
+
+ /// Holds a pointer to an LSBaseSDNode as well as information on where it
+ /// is located in a sequence of memory operations connected by a chain.
+ struct MemOpLink {
+ // Ptr to the mem node.
+ LSBaseSDNode *MemNode;
+
+ // Offset from the base ptr.
+ int64_t OffsetFromBase;
+
+ MemOpLink(LSBaseSDNode *N, int64_t Offset)
+ : MemNode(N), OffsetFromBase(Offset) {}
+ };
+
+ // Classify the origin of a stored value.
+ enum class StoreSource { Unknown, Constant, Extract, Load };
+ StoreSource getStoreSource(SDValue StoreVal) {
+ switch (StoreVal.getOpcode()) {
+ case ISD::Constant:
+ case ISD::ConstantFP:
+ return StoreSource::Constant;
+ case ISD::EXTRACT_VECTOR_ELT:
+ case ISD::EXTRACT_SUBVECTOR:
+ return StoreSource::Extract;
+ case ISD::LOAD:
+ return StoreSource::Load;
+ default:
+ return StoreSource::Unknown;
+ }
+ }
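+  // For example, a store whose value operand is (extract_vector_elt v, 0) is
+  // classified as StoreSource::Extract, which steers mergeConsecutiveStores
+  // toward tryStoreMergeOfExtracts rather than the constant or load helpers.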
+
+ /// This is a helper function for visitMUL to check the profitability
+ /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
+ /// MulNode is the original multiply, AddNode is (add x, c1),
+ /// and ConstNode is c2.
+ bool isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode,
+ SDValue ConstNode);
+
+ /// This is a helper function for visitAND and visitZERO_EXTEND. Returns
+ /// true if the (and (load x) c) pattern matches an extload. ExtVT returns
+ /// the type of the loaded value to be extended.
+ bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
+ EVT LoadResultTy, EVT &ExtVT);
+
+ /// Helper function to calculate whether the given Load/Store can have its
+ /// width reduced to ExtVT.
+ bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType,
+ EVT &MemVT, unsigned ShAmt = 0);
+
+ /// Used by BackwardsPropagateMask to find suitable loads.
+ bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads,
+ SmallPtrSetImpl<SDNode*> &NodesWithConsts,
+ ConstantSDNode *Mask, SDNode *&NodeToMask);
+ /// Attempt to propagate a given AND node back to load leaves so that they
+ /// can be combined into narrow loads.
+ bool BackwardsPropagateMask(SDNode *N);
+
+ /// Helper function for mergeConsecutiveStores which merges the component
+ /// store chains.
+ SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumStores);
+
+ /// Helper function for mergeConsecutiveStores which checks if all the store
+ /// nodes have the same underlying object. We can still reuse the first
+ /// store's pointer info if all the stores are from the same object.
+ bool hasSameUnderlyingObj(ArrayRef<MemOpLink> StoreNodes);
+
+ /// This is a helper function for mergeConsecutiveStores. When the source
+ /// elements of the consecutive stores are all constants or all extracted
+ /// vector elements, try to merge them into one larger store introducing
+ /// bitcasts if necessary. \return True if a merged store was created.
+ bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
+ EVT MemVT, unsigned NumStores,
+ bool IsConstantSrc, bool UseVector,
+ bool UseTrunc);
+
+ /// This is a helper function for mergeConsecutiveStores. Stores that
+ /// potentially may be merged with St are placed in StoreNodes. RootNode is
+ /// a chain predecessor to all store candidates.
+ void getStoreMergeCandidates(StoreSDNode *St,
+ SmallVectorImpl<MemOpLink> &StoreNodes,
+ SDNode *&Root);
+
+ /// Helper function for mergeConsecutiveStores. Checks if candidate stores
+ /// have indirect dependency through their operands. RootNode is the
+ /// predecessor to all stores calculated by getStoreMergeCandidates and is
+ /// used to prune the dependency check. \return True if safe to merge.
+ bool checkMergeStoreCandidatesForDependencies(
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
+ SDNode *RootNode);
+
+ /// This is a helper function for mergeConsecutiveStores. Given a list of
+ /// store candidates, find the first N that are consecutive in memory.
+ /// Returns 0 if there are not at least 2 consecutive stores to try merging.
+ unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
+ int64_t ElementSizeBytes) const;
+
+ /// This is a helper function for mergeConsecutiveStores. It is used for
+ /// store chains that are composed entirely of constant values.
+ bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumConsecutiveStores,
+ EVT MemVT, SDNode *Root, bool AllowVectors);
+
+ /// This is a helper function for mergeConsecutiveStores. It is used for
+ /// store chains that are composed entirely of extracted vector elements.
+ /// When extracting multiple vector elements, try to store them in one
+ /// vector store rather than a sequence of scalar stores.
+ bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumConsecutiveStores, EVT MemVT,
+ SDNode *Root);
+
+ /// This is a helper function for mergeConsecutiveStores. It is used for
+ /// store chains that are composed entirely of loaded values.
+ bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumConsecutiveStores, EVT MemVT,
+ SDNode *Root, bool AllowVectors,
+ bool IsNonTemporalStore, bool IsNonTemporalLoad);
+
+ /// Merge consecutive store operations into a wide store.
+ /// This optimization uses wide integers or vectors when possible.
+ /// \return true if stores were merged.
+ bool mergeConsecutiveStores(StoreSDNode *St);
+
+ /// Try to transform a truncation where C is a constant:
+ /// (trunc (and X, C)) -> (and (trunc X), (trunc C))
+ ///
+  /// \p N needs to be a truncation and its first operand an AND. Other
+  /// requirements are checked by the function (e.g. that trunc is
+  /// single-use); if any requirement is not met, an empty SDValue is returned.
+ SDValue distributeTruncateThroughAnd(SDNode *N);
+
+  /// Helper function to determine whether the target supports the operation
+  /// given by \p Opcode for type \p VT, that is, whether the operation
+  /// is legal or custom before legalizing operations, and whether it is
+  /// legal (but not custom) after legalization.
+ bool hasOperation(unsigned Opcode, EVT VT) {
+ return TLI.isOperationLegalOrCustom(Opcode, VT, LegalOperations);
+ }
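+  // A typical (illustrative) use: a combine that wants to create an ISD::ABDU
+  // node first checks hasOperation(ISD::ABDU, VT) so the new node is only
+  // formed when the target can select it at the current stage.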
+
+ public:
+ /// Runs the dag combiner on all nodes in the work list
+ void Run(CombineLevel AtLevel);
+
+ SelectionDAG &getDAG() const { return DAG; }
+
+ /// Returns a type large enough to hold any valid shift amount - before type
+ /// legalization these can be huge.
+ EVT getShiftAmountTy(EVT LHSTy) {
+ assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
+ return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes);
+ }
+
+ /// This method returns true if we are running before type legalization or
+ /// if the specified VT is legal.
+ bool isTypeLegal(const EVT &VT) {
+ if (!LegalTypes) return true;
+ return TLI.isTypeLegal(VT);
+ }
+
+ /// Convenience wrapper around TargetLowering::getSetCCResultType
+ EVT getSetCCResultType(EVT VT) const {
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ }
+
+ void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
+ SDValue OrigLoad, SDValue ExtLoad,
+ ISD::NodeType ExtType);
+ };
+
+/// This class is a DAGUpdateListener that removes any deleted
+/// nodes from the worklist.
+class WorklistRemover : public SelectionDAG::DAGUpdateListener {
+ DAGCombiner &DC;
+
+public:
+ explicit WorklistRemover(DAGCombiner &dc)
+ : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ DC.removeFromWorklist(N);
+ }
+};
+
+class WorklistInserter : public SelectionDAG::DAGUpdateListener {
+ DAGCombiner &DC;
+
+public:
+ explicit WorklistInserter(DAGCombiner &dc)
+ : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
+
+ // FIXME: Ideally we could add N to the worklist, but this causes exponential
+ // compile time costs in large DAGs, e.g. Halide.
+ void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
+};
+
+class EmptyMatchContext {
+ SelectionDAG &DAG;
+ const TargetLowering &TLI;
+
+public:
+ EmptyMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root)
+ : DAG(DAG), TLI(TLI) {}
+
+ bool match(SDValue OpN, unsigned Opcode) const {
+ return Opcode == OpN->getOpcode();
+ }
+
+ // Same as SelectionDAG::getNode().
+ template <typename... ArgT> SDValue getNode(ArgT &&...Args) {
+ return DAG.getNode(std::forward<ArgT>(Args)...);
+ }
+
+ bool isOperationLegalOrCustom(unsigned Op, EVT VT,
+ bool LegalOnly = false) const {
+ return TLI.isOperationLegalOrCustom(Op, VT, LegalOnly);
+ }
+};
+
+class VPMatchContext {
+ SelectionDAG &DAG;
+ const TargetLowering &TLI;
+ SDValue RootMaskOp;
+ SDValue RootVectorLenOp;
+
+public:
+ VPMatchContext(SelectionDAG &DAG, const TargetLowering &TLI, SDNode *Root)
+ : DAG(DAG), TLI(TLI), RootMaskOp(), RootVectorLenOp() {
+ assert(Root->isVPOpcode());
+ if (auto RootMaskPos = ISD::getVPMaskIdx(Root->getOpcode()))
+ RootMaskOp = Root->getOperand(*RootMaskPos);
+
+ if (auto RootVLenPos =
+ ISD::getVPExplicitVectorLengthIdx(Root->getOpcode()))
+ RootVectorLenOp = Root->getOperand(*RootVLenPos);
+ }
+
+  /// Return whether \p OpVal is a node that is functionally compatible with
+  /// the NodeType \p Opc.
+ bool match(SDValue OpVal, unsigned Opc) const {
+ if (!OpVal->isVPOpcode())
+ return OpVal->getOpcode() == Opc;
+
+ auto BaseOpc = ISD::getBaseOpcodeForVP(OpVal->getOpcode(),
+ !OpVal->getFlags().hasNoFPExcept());
+ if (BaseOpc != Opc)
+ return false;
+
+    // Make sure the mask of OpVal is an all-true mask or is the same as Root's.
+ unsigned VPOpcode = OpVal->getOpcode();
+ if (auto MaskPos = ISD::getVPMaskIdx(VPOpcode)) {
+ SDValue MaskOp = OpVal.getOperand(*MaskPos);
+ if (RootMaskOp != MaskOp &&
+ !ISD::isConstantSplatVectorAllOnes(MaskOp.getNode()))
+ return false;
+ }
+
+    // Make sure the EVL of OpVal is the same as Root's.
+ if (auto VLenPos = ISD::getVPExplicitVectorLengthIdx(VPOpcode))
+ if (RootVectorLenOp != OpVal.getOperand(*VLenPos))
+ return false;
+ return true;
+ }
+
+ // Specialize based on number of operands.
+ // TODO emit VP intrinsics where MaskOp/VectorLenOp != null
+ // SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT) { return
+ // DAG.getNode(Opcode, DL, VT); }
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
+ return DAG.getNode(VPOpcode, DL, VT,
+ {Operand, RootMaskOp, RootVectorLenOp});
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+ SDValue N2) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
+ return DAG.getNode(VPOpcode, DL, VT,
+ {N1, N2, RootMaskOp, RootVectorLenOp});
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+ SDValue N2, SDValue N3) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
+ return DAG.getNode(VPOpcode, DL, VT,
+ {N1, N2, N3, RootMaskOp, RootVectorLenOp});
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue Operand,
+ SDNodeFlags Flags) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 1 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 2);
+ return DAG.getNode(VPOpcode, DL, VT, {Operand, RootMaskOp, RootVectorLenOp},
+ Flags);
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+ SDValue N2, SDNodeFlags Flags) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 2 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 3);
+ return DAG.getNode(VPOpcode, DL, VT, {N1, N2, RootMaskOp, RootVectorLenOp},
+ Flags);
+ }
+
+ SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N1,
+ SDValue N2, SDValue N3, SDNodeFlags Flags) {
+ unsigned VPOpcode = ISD::getVPForBaseOpcode(Opcode);
+ assert(ISD::getVPMaskIdx(VPOpcode) == 3 &&
+ ISD::getVPExplicitVectorLengthIdx(VPOpcode) == 4);
+ return DAG.getNode(VPOpcode, DL, VT,
+ {N1, N2, N3, RootMaskOp, RootVectorLenOp}, Flags);
+ }
+
+ bool isOperationLegalOrCustom(unsigned Op, EVT VT,
+ bool LegalOnly = false) const {
+ unsigned VPOp = ISD::getVPForBaseOpcode(Op);
+ return TLI.isOperationLegalOrCustom(VPOp, VT, LegalOnly);
+ }
+};
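+
+// The two match contexts above let a single templated combine serve both
+// plain and VP nodes. A rough sketch of how a visitor uses them (illustrative
+// only, with SomeOperand standing in for whatever operand the fold inspects):
+//
+//   template <class MatchContextClass>
+//   SDValue DAGCombiner::visitFMA(SDNode *N) {
+//     MatchContextClass matcher(DAG, TLI, N);
+//     if (matcher.match(SomeOperand, ISD::FNEG)) {
+//       // Build the replacement via matcher.getNode(...); with VPMatchContext
+//       // this appends the root's mask and EVL operands automatically.
+//     }
+//     ...
+//   }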
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// TargetLowering::DAGCombinerInfo implementation
+//===----------------------------------------------------------------------===//
+
+void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
+ ((DAGCombiner*)DC)->AddToWorklist(N);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
+}
+
+bool TargetLowering::DAGCombinerInfo::
+recursivelyDeleteUnusedNodes(SDNode *N) {
+ return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N);
+}
+
+void TargetLowering::DAGCombinerInfo::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+ return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+void DAGCombiner::deleteAndRecombine(SDNode *N) {
+ removeFromWorklist(N);
+
+ // If the operands of this node are only used by the node, they will now be
+ // dead. Make sure to re-visit them and recursively delete dead nodes.
+ for (const SDValue &Op : N->ops())
+ // For an operand generating multiple values, one of the values may
+ // become dead allowing further simplification (e.g. split index
+ // arithmetic from an indexed load).
+ if (Op->hasOneUse() || Op->getNumValues() > 1)
+ AddToWorklist(Op.getNode());
+
+ DAG.DeleteNode(N);
+}
+
+// APInts must be the same size for most operations; this helper function
+// zero-extends the shorter of the pair so that they match. An optional Offset
+// allows creating bit widths that will not overflow.
+static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
+ unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth());
+ LHS = LHS.zext(Bits);
+ RHS = RHS.zext(Bits);
+}
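+// For example (illustrative): given LHS = APInt(8, 0x80), RHS = APInt(16, 0x1)
+// and Offset = 1, both values are zero-extended to 17 bits, leaving one bit of
+// headroom for a subsequent operation on the pair.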
+
+// Return true if this node is a setcc, or is a select_cc
+// that selects between the target values used for true and false, making it
+// equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
+// the appropriate nodes based on the type of node we are checking. This
+// simplifies life a bit for the callers.
+bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
+ SDValue &CC, bool MatchStrict) const {
+ if (N.getOpcode() == ISD::SETCC) {
+ LHS = N.getOperand(0);
+ RHS = N.getOperand(1);
+ CC = N.getOperand(2);
+ return true;
+ }
+
+ if (MatchStrict &&
+ (N.getOpcode() == ISD::STRICT_FSETCC ||
+ N.getOpcode() == ISD::STRICT_FSETCCS)) {
+ LHS = N.getOperand(1);
+ RHS = N.getOperand(2);
+ CC = N.getOperand(3);
+ return true;
+ }
+
+ if (N.getOpcode() != ISD::SELECT_CC || !TLI.isConstTrueVal(N.getOperand(2)) ||
+ !TLI.isConstFalseVal(N.getOperand(3)))
+ return false;
+
+ if (TLI.getBooleanContents(N.getValueType()) ==
+ TargetLowering::UndefinedBooleanContent)
+ return false;
+
+ LHS = N.getOperand(0);
+ RHS = N.getOperand(1);
+ CC = N.getOperand(4);
+ return true;
+}
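+// For example, on a target using zero-or-one boolean contents,
+// (select_cc lhs, rhs, 1, 0, cc) is reported as setcc-equivalent, with the
+// LHS/RHS/CC references bound to lhs, rhs and cc respectively.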
+
+/// Return true if this is a SetCC-equivalent operation with only one use.
+/// If this is true, it allows the users to invert the operation for free when
+/// it is profitable to do so.
+bool DAGCombiner::isOneUseSetCC(SDValue N) const {
+ SDValue N0, N1, N2;
+ if (isSetCCEquivalent(N, N0, N1, N2) && N->hasOneUse())
+ return true;
+ return false;
+}
+
+static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) {
+ if (!ScalarTy.isSimple())
+ return false;
+
+ uint64_t MaskForTy = 0ULL;
+ switch (ScalarTy.getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ MaskForTy = 0xFFULL;
+ break;
+ case MVT::i16:
+ MaskForTy = 0xFFFFULL;
+ break;
+ case MVT::i32:
+ MaskForTy = 0xFFFFFFFFULL;
+ break;
+ default:
+    return false;
+ }
+
+ APInt Val;
+ if (ISD::isConstantSplatVector(N, Val))
+ return Val.getLimitedValue() == MaskForTy;
+
+ return false;
+}
+
+// Determines if N is a constant integer or a splat/build vector of constant
+// integers (possibly mixed with undefs).
+// Build vector implicit truncation is not permitted.
+static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
+ if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N))
+ return !(Const->isOpaque() && NoOpaques);
+ if (N.getOpcode() != ISD::BUILD_VECTOR && N.getOpcode() != ISD::SPLAT_VECTOR)
+ return false;
+ unsigned BitWidth = N.getScalarValueSizeInBits();
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.isUndef())
+ continue;
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
+ if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth ||
+ (Const->isOpaque() && NoOpaques))
+ return false;
+ }
+ return true;
+}
+
+// Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with
+// undef's.
+static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+ return isConstantOrConstantVector(V, NoOpaques) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
+}
+
+// Determine if this is an indexed load with an opaque target constant index.
+static bool canSplitIdx(LoadSDNode *LD) {
+ return MaySplitLoadIndex &&
+ (LD->getOperand(2).getOpcode() != ISD::TargetConstant ||
+ !cast<ConstantSDNode>(LD->getOperand(2))->isOpaque());
+}
+
+bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
+ const SDLoc &DL,
+ SDNode *N,
+ SDValue N0,
+ SDValue N1) {
+ // Currently this only tries to ensure we don't undo the GEP splits done by
+ // CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this,
+ // we check if the following transformation would be problematic:
+ // (load/store (add, (add, x, offset1), offset2)) ->
+ // (load/store (add, x, offset1+offset2)).
+
+  // We also check whether the analogous transformation would be problematic:
+  // (load/store (add, (add, x, y), offset2)) ->
+  // (load/store (add, (add, x, offset2), y)).
+
+ if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
+ return false;
+
+ auto *C2 = dyn_cast<ConstantSDNode>(N1);
+ if (!C2)
+ return false;
+
+ const APInt &C2APIntVal = C2->getAPIntValue();
+ if (C2APIntVal.getSignificantBits() > 64)
+ return false;
+
+ if (auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ if (N0.hasOneUse())
+ return false;
+
+ const APInt &C1APIntVal = C1->getAPIntValue();
+ const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
+ if (CombinedValueIntVal.getSignificantBits() > 64)
+ return false;
+ const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();
+
+ for (SDNode *Node : N->uses()) {
+ if (auto *LoadStore = dyn_cast<MemSDNode>(Node)) {
+ // Is x[offset2] already not a legal addressing mode? If so then
+ // reassociating the constants breaks nothing (we test offset2 because
+ // that's the one we hope to fold into the load or store).
+ TargetLoweringBase::AddrMode AM;
+ AM.HasBaseReg = true;
+ AM.BaseOffs = C2APIntVal.getSExtValue();
+ EVT VT = LoadStore->getMemoryVT();
+ unsigned AS = LoadStore->getAddressSpace();
+ Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
+ if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
+ continue;
+
+ // Would x[offset1+offset2] still be a legal addressing mode?
+ AM.BaseOffs = CombinedValue;
+ if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
+ return true;
+ }
+ }
+ } else {
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(N0.getOperand(1)))
+ if (GA->getOpcode() == ISD::GlobalAddress && TLI.isOffsetFoldingLegal(GA))
+ return false;
+
+ for (SDNode *Node : N->uses()) {
+ auto *LoadStore = dyn_cast<MemSDNode>(Node);
+ if (!LoadStore)
+ return false;
+
+      // Is x[offset2] a legal addressing mode? If so, then reassociating
+      // would break the address pattern.
+ TargetLoweringBase::AddrMode AM;
+ AM.HasBaseReg = true;
+ AM.BaseOffs = C2APIntVal.getSExtValue();
+ EVT VT = LoadStore->getMemoryVT();
+ unsigned AS = LoadStore->getAddressSpace();
+ Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
+ if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
+ return false;
+ }
+ return true;
+ }
+
+ return false;
+}
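+// Illustration (assuming a hypothetical target whose load/store immediate
+// offsets must fit in [0, 4095]): for a load addressed by
+// (add (add x, 4000), 200), where the inner add has other users, the offset
+// 200 folds into the addressing mode but the combined offset 4200 would not,
+// so this returns true and the caller avoids the reassociation.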
+
+// Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
+// such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
+SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
+ SDValue N0, SDValue N1,
+ SDNodeFlags Flags) {
+ EVT VT = N0.getValueType();
+
+ if (N0.getOpcode() != Opc)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+
+ if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
+ // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
+ if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
+ return DAG.getNode(Opc, DL, VT, N00, OpNode);
+ return SDValue();
+ }
+ if (TLI.isReassocProfitable(DAG, N0, N1)) {
+ // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
+ // iff (op x, c1) has one use
+ SDNodeFlags NewFlags;
+ if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() &&
+ Flags.hasNoUnsignedWrap())
+ NewFlags.setNoUnsignedWrap(true);
+ SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1, NewFlags);
+ return DAG.getNode(Opc, DL, VT, OpNode, N01, NewFlags);
+ }
+ }
+
+ // Check for repeated operand logic simplifications.
+ if (Opc == ISD::AND || Opc == ISD::OR) {
+ // (N00 & N01) & N00 --> N00 & N01
+ // (N00 & N01) & N01 --> N00 & N01
+ // (N00 | N01) | N00 --> N00 | N01
+ // (N00 | N01) | N01 --> N00 | N01
+ if (N1 == N00 || N1 == N01)
+ return N0;
+ }
+ if (Opc == ISD::XOR) {
+ // (N00 ^ N01) ^ N00 --> N01
+ if (N1 == N00)
+ return N01;
+ // (N00 ^ N01) ^ N01 --> N00
+ if (N1 == N01)
+ return N00;
+ }
+
+ if (TLI.isReassocProfitable(DAG, N0, N1)) {
+ if (N1 != N01) {
+ // Reassociate if (op N00, N1) already exist
+ if (SDNode *NE = DAG.getNodeIfExists(Opc, DAG.getVTList(VT), {N00, N1})) {
+        // If (Op (Op N00, N1), N01) already exists, stop reassociating here
+        // to avoid an infinite loop.
+ if (!DAG.doesNodeExist(Opc, DAG.getVTList(VT), {SDValue(NE, 0), N01}))
+ return DAG.getNode(Opc, DL, VT, SDValue(NE, 0), N01);
+ }
+ }
+
+ if (N1 != N00) {
+ // Reassociate if (op N01, N1) already exist
+ if (SDNode *NE = DAG.getNodeIfExists(Opc, DAG.getVTList(VT), {N01, N1})) {
+        // If (Op (Op N01, N1), N00) already exists, stop reassociating here
+        // to avoid an infinite loop.
+ if (!DAG.doesNodeExist(Opc, DAG.getVTList(VT), {SDValue(NE, 0), N00}))
+ return DAG.getNode(Opc, DL, VT, SDValue(NE, 0), N00);
+ }
+ }
+ }
+
+ return SDValue();
+}
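+// For example, the constant-folding case above turns (add (add x, 5), 7) into
+// (add x, 12).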
+
+// Try to reassociate commutative binops.
+SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+ SDValue N1, SDNodeFlags Flags) {
+ assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
+
+ // Floating-point reassociation is not allowed without loose FP math.
+ if (N0.getValueType().isFloatingPoint() ||
+ N1.getValueType().isFloatingPoint())
+ if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros())
+ return SDValue();
+
+ if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1, Flags))
+ return Combined;
+ if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0, Flags))
+ return Combined;
+ return SDValue();
+}
+
+// Try to fold Opc(vecreduce(x), vecreduce(y)) -> vecreduce(Opc(x, y))
+// Note that we only expect Flags to be passed from FP operations. For integer
+// operations they need to be dropped.
+SDValue DAGCombiner::reassociateReduction(unsigned RedOpc, unsigned Opc,
+ const SDLoc &DL, EVT VT, SDValue N0,
+ SDValue N1, SDNodeFlags Flags) {
+ if (N0.getOpcode() == RedOpc && N1.getOpcode() == RedOpc &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
+ N0->hasOneUse() && N1->hasOneUse() &&
+ TLI.isOperationLegalOrCustom(Opc, N0.getOperand(0).getValueType()) &&
+ TLI.shouldReassociateReduction(RedOpc, N0.getOperand(0).getValueType())) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
+ return DAG.getNode(RedOpc, DL, VT,
+ DAG.getNode(Opc, DL, N0.getOperand(0).getValueType(),
+ N0.getOperand(0), N1.getOperand(0)));
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+ bool AddTo) {
+ assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
+ ++NodesCombined;
+ LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
+ To[0].dump(&DAG);
+ dbgs() << " and " << NumTo - 1 << " other values\n");
+ for (unsigned i = 0, e = NumTo; i != e; ++i)
+ assert((!To[i].getNode() ||
+ N->getValueType(i) == To[i].getValueType()) &&
+ "Cannot combine value to value of different type!");
+
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesWith(N, To);
+ if (AddTo) {
+ // Push the new nodes and any users onto the worklist
+ for (unsigned i = 0, e = NumTo; i != e; ++i) {
+ if (To[i].getNode())
+ AddToWorklistWithUsers(To[i].getNode());
+ }
+ }
+
+ // Finally, if the node is now dead, remove it from the graph. The node
+ // may not be dead if the replacement process recursively simplified to
+ // something else needing this node.
+ if (N->use_empty())
+ deleteAndRecombine(N);
+ return SDValue(N, 0);
+}
+
+void DAGCombiner::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+ // Replace the old value with the new one.
+ ++NodesCombined;
+ LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.dump(&DAG);
+ dbgs() << "\nWith: "; TLO.New.dump(&DAG); dbgs() << '\n');
+
+ // Replace all uses.
+ DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
+
+ // Push the new node and any (possibly new) users onto the worklist.
+ AddToWorklistWithUsers(TLO.New.getNode());
+
+ // Finally, if the node is now dead, remove it from the graph.
+ recursivelyDeleteUnusedNodes(TLO.Old.getNode());
+}
+
+/// Check the specified integer node value to see if it can be simplified or if
+/// things it uses can be simplified by bit propagation. If so, return true.
+bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ bool AssumeSingleUse) {
+ TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+ KnownBits Known;
+ if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
+ AssumeSingleUse))
+ return false;
+
+ // Revisit the node.
+ AddToWorklist(Op.getNode());
+
+ CommitTargetLoweringOpt(TLO);
+ return true;
+}
+
+/// Check the specified vector node value to see if it can be simplified or
+/// if things it uses can be simplified as it only uses some of the elements.
+/// If so, return true.
+bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
+ const APInt &DemandedElts,
+ bool AssumeSingleUse) {
+ TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+ APInt KnownUndef, KnownZero;
+ if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
+ TLO, 0, AssumeSingleUse))
+ return false;
+
+ // Revisit the node.
+ AddToWorklist(Op.getNode());
+
+ CommitTargetLoweringOpt(TLO);
+ return true;
+}
+
+void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
+ SDLoc DL(Load);
+ EVT VT = Load->getValueType(0);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));
+
+ LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: ";
+ Trunc.dump(&DAG); dbgs() << '\n');
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
+
+ AddToWorklist(Trunc.getNode());
+ recursivelyDeleteUnusedNodes(Load);
+}
+
+SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
+ Replace = false;
+ SDLoc DL(Op);
+ if (ISD::isUNINDEXEDLoad(Op.getNode())) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ EVT MemVT = LD->getMemoryVT();
+ ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
+ : LD->getExtensionType();
+ Replace = true;
+ return DAG.getExtLoad(ExtType, DL, PVT,
+ LD->getChain(), LD->getBasePtr(),
+ MemVT, LD->getMemOperand());
+ }
+
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ default: break;
+ case ISD::AssertSext:
+ if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
+ return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
+ break;
+ case ISD::AssertZext:
+ if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
+ return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
+ break;
+ case ISD::Constant: {
+ unsigned ExtOpc =
+ Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOpc, DL, PVT, Op);
+ }
+ }
+
+ if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
+ return SDValue();
+ return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
+}
+
+SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
+ if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
+ return SDValue();
+ EVT OldVT = Op.getValueType();
+ SDLoc DL(Op);
+ bool Replace = false;
+ SDValue NewOp = PromoteOperand(Op, PVT, Replace);
+ if (!NewOp.getNode())
+ return SDValue();
+ AddToWorklist(NewOp.getNode());
+
+ if (Replace)
+ ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp,
+ DAG.getValueType(OldVT));
+}
+
+SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
+ EVT OldVT = Op.getValueType();
+ SDLoc DL(Op);
+ bool Replace = false;
+ SDValue NewOp = PromoteOperand(Op, PVT, Replace);
+ if (!NewOp.getNode())
+ return SDValue();
+ AddToWorklist(NewOp.getNode());
+
+ if (Replace)
+ ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
+ return DAG.getZeroExtendInReg(NewOp, DL, OldVT);
+}
+
+/// Promote the specified integer binary operation if the target indicates it is
+/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
+/// i32 since i16 instructions are longer.
+SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ // If operation type is 'undesirable', e.g. i16 on x86, consider
+ // promoting it.
+ unsigned Opc = Op.getOpcode();
+ if (TLI.isTypeDesirableForOp(Opc, VT))
+ return SDValue();
+
+ EVT PVT = VT;
+ // Consult target whether it is a good idea to promote this operation and
+ // what's the right type to promote it to.
+ if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+ assert(PVT != VT && "Don't know what type to promote to!");
+
+ LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));
+
+ bool Replace0 = false;
+ SDValue N0 = Op.getOperand(0);
+ SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
+
+ bool Replace1 = false;
+ SDValue N1 = Op.getOperand(1);
+ SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
+ SDLoc DL(Op);
+
+ SDValue RV =
+ DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
+
+ // We are always replacing N0/N1's use in N and only need additional
+ // replacements if there are additional uses.
+ // Note: We are checking uses of the *nodes* (SDNode) rather than values
+ // (SDValue) here because the node may reference multiple values
+ // (for example, the chain value of a load node).
+ Replace0 &= !N0->hasOneUse();
+ Replace1 &= (N0 != N1) && !N1->hasOneUse();
+
+ // Combine Op here so it is preserved past replacements.
+ CombineTo(Op.getNode(), RV);
+
+ // If operands have a use ordering, make sure we deal with
+ // predecessor first.
+ if (Replace0 && Replace1 && N0->isPredecessorOf(N1.getNode())) {
+ std::swap(N0, N1);
+ std::swap(NN0, NN1);
+ }
+
+ if (Replace0) {
+ AddToWorklist(NN0.getNode());
+ ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
+ }
+ if (Replace1) {
+ AddToWorklist(NN1.getNode());
+ ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
+ }
+ return Op;
+ }
+ return SDValue();
+}
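+// Illustration (a sketch of the x86-style case mentioned above): an i16 add of
+// a and b whose type the target marks as undesirable is rebuilt as
+//   (truncate i16 (add i32 (any_extend a), (any_extend b)))
+// and any loads among the operands are widened into extending loads instead.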
+
+/// Promote the specified integer shift operation if the target indicates it is
+/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
+/// i32 since i16 instructions are longer.
+SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ // If operation type is 'undesirable', e.g. i16 on x86, consider
+ // promoting it.
+ unsigned Opc = Op.getOpcode();
+ if (TLI.isTypeDesirableForOp(Opc, VT))
+ return SDValue();
+
+ EVT PVT = VT;
+ // Consult target whether it is a good idea to promote this operation and
+ // what's the right type to promote it to.
+ if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+ assert(PVT != VT && "Don't know what type to promote to!");
+
+ LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));
+
+ bool Replace = false;
+ SDValue N0 = Op.getOperand(0);
+ if (Opc == ISD::SRA)
+ N0 = SExtPromoteOperand(N0, PVT);
+ else if (Opc == ISD::SRL)
+ N0 = ZExtPromoteOperand(N0, PVT);
+ else
+ N0 = PromoteOperand(N0, PVT, Replace);
+
+ if (!N0.getNode())
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue N1 = Op.getOperand(1);
+ SDValue RV =
+ DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
+
+ if (Replace)
+ ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
+
+ // Deal with Op being deleted.
+ if (Op && Op.getOpcode() != ISD::DELETED_NODE)
+ return RV;
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::PromoteExtend(SDValue Op) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ // If operation type is 'undesirable', e.g. i16 on x86, consider
+ // promoting it.
+ unsigned Opc = Op.getOpcode();
+ if (TLI.isTypeDesirableForOp(Opc, VT))
+ return SDValue();
+
+ EVT PVT = VT;
+ // Consult target whether it is a good idea to promote this operation and
+ // what's the right type to promote it to.
+ if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+ assert(PVT != VT && "Don't know what type to promote to!");
+ // fold (aext (aext x)) -> (aext x)
+ // fold (aext (zext x)) -> (zext x)
+ // fold (aext (sext x)) -> (sext x)
+ LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
+ }
+ return SDValue();
+}
+
+bool DAGCombiner::PromoteLoad(SDValue Op) {
+ if (!LegalOperations)
+ return false;
+
+ if (!ISD::isUNINDEXEDLoad(Op.getNode()))
+ return false;
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector() || !VT.isInteger())
+ return false;
+
+ // If operation type is 'undesirable', e.g. i16 on x86, consider
+ // promoting it.
+ unsigned Opc = Op.getOpcode();
+ if (TLI.isTypeDesirableForOp(Opc, VT))
+ return false;
+
+ EVT PVT = VT;
+ // Consult target whether it is a good idea to promote this operation and
+ // what's the right type to promote it to.
+ if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+ assert(PVT != VT && "Don't know what type to promote to!");
+
+ SDLoc DL(Op);
+ SDNode *N = Op.getNode();
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT MemVT = LD->getMemoryVT();
+ ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
+ : LD->getExtensionType();
+ SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
+ LD->getChain(), LD->getBasePtr(),
+ MemVT, LD->getMemOperand());
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
+
+ LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
+ Result.dump(&DAG); dbgs() << '\n');
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
+
+ AddToWorklist(Result.getNode());
+ recursivelyDeleteUnusedNodes(N);
+ return true;
+ }
+
+ return false;
+}
+
+/// Recursively delete a node which has no uses and any operands for
+/// which it is the only use.
+///
+/// Note that this both deletes the nodes and removes them from the worklist.
+/// It also adds any nodes that have had a user deleted to the worklist, as
+/// they may now have only one use and be subject to other combines.
+bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
+ if (!N->use_empty())
+ return false;
+
+ SmallSetVector<SDNode *, 16> Nodes;
+ Nodes.insert(N);
+ do {
+ N = Nodes.pop_back_val();
+ if (!N)
+ continue;
+
+ if (N->use_empty()) {
+ for (const SDValue &ChildN : N->op_values())
+ Nodes.insert(ChildN.getNode());
+
+ removeFromWorklist(N);
+ DAG.DeleteNode(N);
+ } else {
+ AddToWorklist(N);
+ }
+ } while (!Nodes.empty());
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Main DAG Combiner implementation
+//===----------------------------------------------------------------------===//
+
+void DAGCombiner::Run(CombineLevel AtLevel) {
+  // Set the instance variables so that the various visit routines may use them.
+ Level = AtLevel;
+ LegalDAG = Level >= AfterLegalizeDAG;
+ LegalOperations = Level >= AfterLegalizeVectorOps;
+ LegalTypes = Level >= AfterLegalizeTypes;
+
+ WorklistInserter AddNodes(*this);
+
+ // Add all the dag nodes to the worklist.
+ //
+  // Note: Not all nodes are added to the PruningList here. This is because
+  // the only nodes which can be deleted are those which have no uses, and all
+  // other nodes which would otherwise be added to the worklist by the first
+  // call to getNextWorklistEntry are already present in it.
+ for (SDNode &Node : DAG.allnodes())
+ AddToWorklist(&Node, /* IsCandidateForPruning */ Node.use_empty());
+
+ // Create a dummy node (which is not added to allnodes), that adds a reference
+ // to the root node, preventing it from being deleted, and tracking any
+ // changes of the root.
+ HandleSDNode Dummy(DAG.getRoot());
+
+ // While we have a valid worklist entry node, try to combine it.
+ while (SDNode *N = getNextWorklistEntry()) {
+ // If N has no uses, it is dead. Make sure to revisit all N's operands once
+ // N is deleted from the DAG, since they too may now be dead or may have a
+ // reduced number of uses, allowing other xforms.
+ if (recursivelyDeleteUnusedNodes(N))
+ continue;
+
+ WorklistRemover DeadNodes(*this);
+
+ // If this combine is running after legalizing the DAG, re-legalize any
+ // nodes pulled off the worklist.
+ if (LegalDAG) {
+ SmallSetVector<SDNode *, 16> UpdatedNodes;
+ bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
+
+ for (SDNode *LN : UpdatedNodes)
+ AddToWorklistWithUsers(LN);
+
+ if (!NIsValid)
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
+
+ // Add any operands of the new node which have not yet been combined to the
+ // worklist as well. Because the worklist uniques things already, this
+ // won't repeatedly process the same operand.
+ for (const SDValue &ChildN : N->op_values())
+ if (!CombinedNodes.count(ChildN.getNode()))
+ AddToWorklist(ChildN.getNode());
+
+ CombinedNodes.insert(N);
+ SDValue RV = combine(N);
+
+ if (!RV.getNode())
+ continue;
+
+ ++NodesCombined;
+
+ // If we get back the same node we passed in, rather than a new node or
+ // zero, we know that the node must have defined multiple values and
+ // CombineTo was used. Since CombineTo takes care of the worklist
+ // mechanics for us, we have no work to do in this case.
+ if (RV.getNode() == N)
+ continue;
+
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ RV.getOpcode() != ISD::DELETED_NODE &&
+ "Node was deleted but visit returned new node!");
+
+ LLVM_DEBUG(dbgs() << " ... into: "; RV.dump(&DAG));
+
+ if (N->getNumValues() == RV->getNumValues())
+ DAG.ReplaceAllUsesWith(N, RV.getNode());
+ else {
+ assert(N->getValueType(0) == RV.getValueType() &&
+ N->getNumValues() == 1 && "Type mismatch");
+ DAG.ReplaceAllUsesWith(N, &RV);
+ }
+
+ // Push the new node and any users onto the worklist. Omit this if the
+ // new node is the EntryToken (e.g. if a store managed to get optimized
+ // out), because re-visiting the EntryToken and its users will not uncover
+ // any additional opportunities, but there may be a large number of such
+ // users, potentially causing compile time explosion.
+ if (RV.getOpcode() != ISD::EntryToken)
+ AddToWorklistWithUsers(RV.getNode());
+
+ // Finally, if the node is now dead, remove it from the graph. The node
+ // may not be dead if the replacement process recursively simplified to
+ // something else needing this node. This will also take care of adding any
+ // operands which have lost a user to the worklist.
+ recursivelyDeleteUnusedNodes(N);
+ }
+
+  // If the root changed (e.g. it was a dead load), update the root.
+ DAG.setRoot(Dummy.getValue());
+ DAG.RemoveDeadNodes();
+}
+
+SDValue DAGCombiner::visit(SDNode *N) {
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor: return visitTokenFactor(N);
+ case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
+ case ISD::ADD: return visitADD(N);
+ case ISD::SUB: return visitSUB(N);
+ case ISD::SADDSAT:
+ case ISD::UADDSAT: return visitADDSAT(N);
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT: return visitSUBSAT(N);
+ case ISD::ADDC: return visitADDC(N);
+ case ISD::SADDO:
+ case ISD::UADDO: return visitADDO(N);
+ case ISD::SUBC: return visitSUBC(N);
+ case ISD::SSUBO:
+ case ISD::USUBO: return visitSUBO(N);
+ case ISD::ADDE: return visitADDE(N);
+ case ISD::UADDO_CARRY: return visitUADDO_CARRY(N);
+ case ISD::SADDO_CARRY: return visitSADDO_CARRY(N);
+ case ISD::SUBE: return visitSUBE(N);
+ case ISD::USUBO_CARRY: return visitUSUBO_CARRY(N);
+ case ISD::SSUBO_CARRY: return visitSSUBO_CARRY(N);
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: return visitMULFIX(N);
+ case ISD::MUL: return visitMUL(N);
+ case ISD::SDIV: return visitSDIV(N);
+ case ISD::UDIV: return visitUDIV(N);
+ case ISD::SREM:
+ case ISD::UREM: return visitREM(N);
+ case ISD::MULHU: return visitMULHU(N);
+ case ISD::MULHS: return visitMULHS(N);
+ case ISD::AVGFLOORS:
+ case ISD::AVGFLOORU:
+ case ISD::AVGCEILS:
+ case ISD::AVGCEILU: return visitAVG(N);
+ case ISD::ABDS:
+ case ISD::ABDU: return visitABD(N);
+ case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
+ case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
+ case ISD::SMULO:
+ case ISD::UMULO: return visitMULO(N);
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX: return visitIMINMAX(N);
+ case ISD::AND: return visitAND(N);
+ case ISD::OR: return visitOR(N);
+ case ISD::XOR: return visitXOR(N);
+ case ISD::SHL: return visitSHL(N);
+ case ISD::SRA: return visitSRA(N);
+ case ISD::SRL: return visitSRL(N);
+ case ISD::ROTR:
+ case ISD::ROTL: return visitRotate(N);
+ case ISD::FSHL:
+ case ISD::FSHR: return visitFunnelShift(N);
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT: return visitSHLSAT(N);
+ case ISD::ABS: return visitABS(N);
+ case ISD::BSWAP: return visitBSWAP(N);
+ case ISD::BITREVERSE: return visitBITREVERSE(N);
+ case ISD::CTLZ: return visitCTLZ(N);
+ case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N);
+ case ISD::CTTZ: return visitCTTZ(N);
+ case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N);
+ case ISD::CTPOP: return visitCTPOP(N);
+ case ISD::SELECT: return visitSELECT(N);
+ case ISD::VSELECT: return visitVSELECT(N);
+ case ISD::SELECT_CC: return visitSELECT_CC(N);
+ case ISD::SETCC: return visitSETCC(N);
+ case ISD::SETCCCARRY: return visitSETCCCARRY(N);
+ case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
+ case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
+ case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
+ case ISD::AssertSext:
+ case ISD::AssertZext: return visitAssertExt(N);
+ case ISD::AssertAlign: return visitAssertAlign(N);
+ case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::ANY_EXTEND_VECTOR_INREG: return visitEXTEND_VECTOR_INREG(N);
+ case ISD::TRUNCATE: return visitTRUNCATE(N);
+ case ISD::BITCAST: return visitBITCAST(N);
+ case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
+ case ISD::FADD: return visitFADD(N);
+ case ISD::STRICT_FADD: return visitSTRICT_FADD(N);
+ case ISD::FSUB: return visitFSUB(N);
+ case ISD::FMUL: return visitFMUL(N);
+ case ISD::FMA: return visitFMA<EmptyMatchContext>(N);
+ case ISD::FDIV: return visitFDIV(N);
+ case ISD::FREM: return visitFREM(N);
+ case ISD::FSQRT: return visitFSQRT(N);
+ case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
+ case ISD::FPOW: return visitFPOW(N);
+ case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
+ case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
+ case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
+ case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
+ case ISD::FP_ROUND: return visitFP_ROUND(N);
+ case ISD::FP_EXTEND: return visitFP_EXTEND(N);
+ case ISD::FNEG: return visitFNEG(N);
+ case ISD::FABS: return visitFABS(N);
+ case ISD::FFLOOR: return visitFFLOOR(N);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM: return visitFMinMax(N);
+ case ISD::FCEIL: return visitFCEIL(N);
+ case ISD::FTRUNC: return visitFTRUNC(N);
+ case ISD::FFREXP: return visitFFREXP(N);
+ case ISD::BRCOND: return visitBRCOND(N);
+ case ISD::BR_CC: return visitBR_CC(N);
+ case ISD::LOAD: return visitLOAD(N);
+ case ISD::STORE: return visitSTORE(N);
+ case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
+ case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
+ case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
+ case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
+ case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
+ case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
+ case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
+ case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
+ case ISD::MGATHER: return visitMGATHER(N);
+ case ISD::MLOAD: return visitMLOAD(N);
+ case ISD::MSCATTER: return visitMSCATTER(N);
+ case ISD::MSTORE: return visitMSTORE(N);
+ case ISD::LIFETIME_END: return visitLIFETIME_END(N);
+ case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
+ case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
+ case ISD::FP_TO_BF16: return visitFP_TO_BF16(N);
+ case ISD::FREEZE: return visitFREEZE(N);
+ case ISD::GET_FPENV_MEM: return visitGET_FPENV_MEM(N);
+ case ISD::SET_FPENV_MEM: return visitSET_FPENV_MEM(N);
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM: return visitVECREDUCE(N);
+#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC:
+#include "llvm/IR/VPIntrinsics.def"
+ return visitVPOp(N);
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::combine(SDNode *N) {
+ SDValue RV;
+ if (!DisableGenericCombines)
+ RV = visit(N);
+
+ // If nothing happened, try a target-specific DAG combine.
+ if (!RV.getNode()) {
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ "Node was deleted but visit returned NULL!");
+
+ if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
+ TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
+
+ // Expose the DAG combiner to the target combiner impls.
+ TargetLowering::DAGCombinerInfo
+ DagCombineInfo(DAG, Level, false, this);
+
+ RV = TLI.PerformDAGCombine(N, DagCombineInfo);
+ }
+ }
+
+ // If nothing happened still, try promoting the operation.
+ if (!RV.getNode()) {
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ RV = PromoteIntBinOp(SDValue(N, 0));
+ break;
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ RV = PromoteIntShiftOp(SDValue(N, 0));
+ break;
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ RV = PromoteExtend(SDValue(N, 0));
+ break;
+ case ISD::LOAD:
+ if (PromoteLoad(SDValue(N, 0)))
+ RV = SDValue(N, 0);
+ break;
+ }
+ }
+
+ // If N is a commutative binary node, try to eliminate it if the commuted
+ // version is already present in the DAG.
+ if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode())) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Constant operands are canonicalized to RHS.
+ if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
+ SDValue Ops[] = {N1, N0};
+ SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
+ N->getFlags());
+ if (CSENode)
+ return SDValue(CSENode, 0);
+ }
+ }
+
+ return RV;
+}
+
+/// Given a node, return its input chain if it has one, otherwise return a null
+/// sd operand.
+static SDValue getInputChainForNode(SDNode *N) {
+ if (unsigned NumOps = N->getNumOperands()) {
+ if (N->getOperand(0).getValueType() == MVT::Other)
+ return N->getOperand(0);
+ if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
+ return N->getOperand(NumOps-1);
+ for (unsigned i = 1; i < NumOps-1; ++i)
+ if (N->getOperand(i).getValueType() == MVT::Other)
+ return N->getOperand(i);
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
+ // If N has two operands, where one has an input chain equal to the other,
+ // the 'other' chain is redundant.
+ if (N->getNumOperands() == 2) {
+ if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
+ return N->getOperand(0);
+ if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
+ return N->getOperand(1);
+ }
+
+ // Don't simplify token factors if optnone.
+ if (OptLevel == CodeGenOpt::None)
+ return SDValue();
+
+ // Don't simplify the token factor if the node itself has too many operands.
+ if (N->getNumOperands() > TokenFactorInlineLimit)
+ return SDValue();
+
+ // If the sole user is a token factor, we should make sure we have a
+ // chance to merge them together. This prevents TF chains from inhibiting
+ // optimizations.
+ if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
+ AddToWorklist(*(N->use_begin()));
+
+ SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
+ SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
+ SmallPtrSet<SDNode*, 16> SeenOps;
+ bool Changed = false; // If we should replace this token factor.
+
+ // Start out with this token factor.
+ TFs.push_back(N);
+
+  // Iterate through token factors. The TFs list grows when new token factors
+  // are encountered.
+ for (unsigned i = 0; i < TFs.size(); ++i) {
+ // Limit number of nodes to inline, to avoid quadratic compile times.
+ // We have to add the outstanding Token Factors to Ops, otherwise we might
+ // drop Ops from the resulting Token Factors.
+ if (Ops.size() > TokenFactorInlineLimit) {
+ for (unsigned j = i; j < TFs.size(); j++)
+ Ops.emplace_back(TFs[j], 0);
+ // Drop unprocessed Token Factors from TFs, so we do not add them to the
+ // combiner worklist later.
+ TFs.resize(i);
+ break;
+ }
+
+ SDNode *TF = TFs[i];
+ // Check each of the operands.
+ for (const SDValue &Op : TF->op_values()) {
+ switch (Op.getOpcode()) {
+ case ISD::EntryToken:
+ // Entry tokens don't need to be added to the list. They are
+ // redundant.
+ Changed = true;
+ break;
+
+ case ISD::TokenFactor:
+ if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
+ // Queue up for processing.
+ TFs.push_back(Op.getNode());
+ Changed = true;
+ break;
+ }
+ [[fallthrough]];
+
+ default:
+ // Only add if it isn't already in the list.
+ if (SeenOps.insert(Op.getNode()).second)
+ Ops.push_back(Op);
+ else
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ // Re-visit inlined Token Factors, to clean them up in case they have been
+ // removed. Skip the first Token Factor, as this is the current node.
+ for (unsigned i = 1, e = TFs.size(); i < e; i++)
+ AddToWorklist(TFs[i]);
+
+ // Remove nodes that are chained to another node in the list. Do so by
+ // walking up chains breadth-first, stopping when we've seen another operand.
+ // In general we must climb to the EntryNode, but we can exit early if we
+ // find that all remaining work is associated with just one operand, as no
+ // further pruning is possible.
+
+ // List of nodes to search through and original Ops from which they originate.
+ SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
+ SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
+ SmallPtrSet<SDNode *, 16> SeenChains;
+ bool DidPruneOps = false;
+
+ unsigned NumLeftToConsider = 0;
+ for (const SDValue &Op : Ops) {
+ Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
+ OpWorkCount.push_back(1);
+ }
+
+ auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
+ // If this is an Op, we can remove the op from the list. Re-mark any
+ // search associated with it as coming from the current OpNumber.
+ if (SeenOps.contains(Op)) {
+ Changed = true;
+ DidPruneOps = true;
+ unsigned OrigOpNumber = 0;
+ while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
+ OrigOpNumber++;
+ assert((OrigOpNumber != Ops.size()) &&
+ "expected to find TokenFactor Operand");
+ // Re-mark worklist from OrigOpNumber to OpNumber
+ for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
+ if (Worklist[i].second == OrigOpNumber) {
+ Worklist[i].second = OpNumber;
+ }
+ }
+ OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
+ OpWorkCount[OrigOpNumber] = 0;
+ NumLeftToConsider--;
+ }
+ // Add if it's a new chain
+ if (SeenChains.insert(Op).second) {
+ OpWorkCount[OpNumber]++;
+ Worklist.push_back(std::make_pair(Op, OpNumber));
+ }
+ };
+
+ for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
+ // We need to consider at least 2 Ops to prune.
+ if (NumLeftToConsider <= 1)
+ break;
+ auto CurNode = Worklist[i].first;
+ auto CurOpNumber = Worklist[i].second;
+ assert((OpWorkCount[CurOpNumber] > 0) &&
+ "Node should not appear in worklist");
+ switch (CurNode->getOpcode()) {
+ case ISD::EntryToken:
+ // Hitting EntryToken is the only way for the search to terminate without
+ // hitting another operand's search. Prevent us from marking this operand
+ // considered.
+ NumLeftToConsider++;
+ break;
+ case ISD::TokenFactor:
+ for (const SDValue &Op : CurNode->op_values())
+ AddToWorklist(i, Op.getNode(), CurOpNumber);
+ break;
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
+ break;
+ default:
+ if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
+ AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
+ break;
+ }
+ OpWorkCount[CurOpNumber]--;
+ if (OpWorkCount[CurOpNumber] == 0)
+ NumLeftToConsider--;
+ }
+
+ // If we've changed things around then replace token factor.
+ if (Changed) {
+ SDValue Result;
+ if (Ops.empty()) {
+ // The entry token is the only possible outcome.
+ Result = DAG.getEntryNode();
+ } else {
+ if (DidPruneOps) {
+ SmallVector<SDValue, 8> PrunedOps;
+ // Keep only the ops that were not reached while walking up another op's
+ // chain; any op that was reached is covered by that op and thus redundant.
+ for (const SDValue &Op : Ops) {
+ if (SeenChains.count(Op.getNode()) == 0)
+ PrunedOps.push_back(Op);
+ }
+ Result = DAG.getTokenFactor(SDLoc(N), PrunedOps);
+ } else {
+ Result = DAG.getTokenFactor(SDLoc(N), Ops);
+ }
+ }
+ return Result;
+ }
+ return SDValue();
+}
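+
+// A minimal sketch of the merge and prune steps above, assuming hypothetical
+// chain values A, B and C and a single-use inner token factor:
+//   TokenFactor(TokenFactor(A, B), C)  -->  TokenFactor(A, B, C)
+// If C is then reached while walking up B's chain (B already depends on C, so
+// every user of the token factor waits on C through B), C is pruned:
+//   TokenFactor(A, B, C)               -->  TokenFactor(A, B)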
+
+/// MERGE_VALUES can always be eliminated.
+SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
+ WorklistRemover DeadNodes(*this);
+ // Replacing results may cause a different MERGE_VALUES to suddenly
+ // be CSE'd with N, and carry its uses with it. Iterate until no
+ // uses remain, to ensure that the node can be safely deleted.
+ // First add the users of this node to the work list so that they
+ // can be tried again once they have new operands.
+ AddUsersToWorklist(N);
+ do {
+ // Do as a single replacement to avoid rewalking use lists.
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ Ops.push_back(N->getOperand(i));
+ DAG.ReplaceAllUsesWith(N, Ops.data());
+ } while (!N->use_empty());
+ deleteAndRecombine(N);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+/// If \p N is a ConstantSDNode with isOpaque() == false return it casted to a
+/// ConstantSDNode pointer else nullptr.
+static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
+ return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
+}
+
+// isTruncateOf - If N is a truncate of some other value, return true and
+// record the value being truncated in Op and which of Op's bits are zero/one
+// in Known. This function computes KnownBits to avoid a duplicated call to
+// computeKnownBits in the caller.
+static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
+ KnownBits &Known) {
+ if (N->getOpcode() == ISD::TRUNCATE) {
+ Op = N->getOperand(0);
+ Known = DAG.computeKnownBits(Op);
+ return true;
+ }
+
+ if (N.getOpcode() != ISD::SETCC ||
+ N.getValueType().getScalarType() != MVT::i1 ||
+ cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE)
+ return false;
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ assert(Op0.getValueType() == Op1.getValueType());
+
+ if (isNullOrNullSplat(Op0))
+ Op = Op1;
+ else if (isNullOrNullSplat(Op1))
+ Op = Op0;
+ else
+ return false;
+
+ Known = DAG.computeKnownBits(Op);
+
+ return (Known.Zero | 1).isAllOnes();
+}
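+
+// A worked check of the SETCC case above: the (Known.Zero | 1).isAllOnes()
+// test means every bit of Op except possibly bit 0 is known zero, so
+// (setcc Op, 0, ne) yields exactly bit 0 of Op -- the same i1 value that
+// (truncate Op) would produce (Op == 0 gives 0, Op == 1 gives 1).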
+
+/// Return true if 'Use' is a load or a store that uses N as its base pointer
+/// and that N may be folded in the load / store addressing mode.
+static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ EVT VT;
+ unsigned AS;
+
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
+ if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
+ return false;
+ VT = LD->getMemoryVT();
+ AS = LD->getAddressSpace();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
+ if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
+ return false;
+ VT = ST->getMemoryVT();
+ AS = ST->getAddressSpace();
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
+ if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
+ return false;
+ VT = LD->getMemoryVT();
+ AS = LD->getAddressSpace();
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
+ if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
+ return false;
+ VT = ST->getMemoryVT();
+ AS = ST->getAddressSpace();
+ } else {
+ return false;
+ }
+
+ TargetLowering::AddrMode AM;
+ if (N->getOpcode() == ISD::ADD) {
+ AM.HasBaseReg = true;
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (Offset)
+ // [reg +/- imm]
+ AM.BaseOffs = Offset->getSExtValue();
+ else
+ // [reg +/- reg]
+ AM.Scale = 1;
+ } else if (N->getOpcode() == ISD::SUB) {
+ AM.HasBaseReg = true;
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (Offset)
+ // [reg +/- imm]
+ AM.BaseOffs = -Offset->getSExtValue();
+ else
+ // [reg +/- reg]
+ AM.Scale = 1;
+ } else {
+ return false;
+ }
+
+ return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
+ VT.getTypeForEVT(*DAG.getContext()), AS);
+}
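+
+// A small sketch of the AddrMode query above, assuming a hypothetical
+// (add BasePtr, 8) used as the address of a load:
+//   AM.HasBaseReg = true; AM.BaseOffs = 8; AM.Scale = 0;   // [reg + 8]
+// whereas (add BasePtr, Index) with a non-constant Index yields:
+//   AM.HasBaseReg = true; AM.BaseOffs = 0; AM.Scale = 1;   // [reg + reg]
+// isLegalAddressingMode() then decides whether the target can fold that mode
+// for the given memory type and address space.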
+
+/// This inverts a canonicalization in IR that replaces a variable select arm
+/// with an identity constant. Codegen improves if we re-use the variable
+/// operand rather than load a constant. This can also be converted into a
+/// masked vector operation if the target supports it.
+static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
+ bool ShouldCommuteOperands) {
+ // Match a select as operand 1. The identity constant that we are looking for
+ // is only valid as operand 1 of a non-commutative binop.
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (ShouldCommuteOperands)
+ std::swap(N0, N1);
+
+ // TODO: Should this apply to scalar select too?
+ if (N1.getOpcode() != ISD::VSELECT || !N1.hasOneUse())
+ return SDValue();
+
+ // We can't hoist all instructions because of immediate UB (not speculatable).
+ // For example div/rem by zero.
+ if (!DAG.isSafeToSpeculativelyExecuteNode(N))
+ return SDValue();
+
+ unsigned Opcode = N->getOpcode();
+ EVT VT = N->getValueType(0);
+ SDValue Cond = N1.getOperand(0);
+ SDValue TVal = N1.getOperand(1);
+ SDValue FVal = N1.getOperand(2);
+
+ // This transform increases uses of N0, so freeze it to be safe.
+ // binop N0, (vselect Cond, IDC, FVal) --> vselect Cond, N0, (binop N0, FVal)
+ unsigned OpNo = ShouldCommuteOperands ? 0 : 1;
+ if (isNeutralConstant(Opcode, N->getFlags(), TVal, OpNo)) {
+ SDValue F0 = DAG.getFreeze(N0);
+ SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, FVal, N->getFlags());
+ return DAG.getSelect(SDLoc(N), VT, Cond, F0, NewBO);
+ }
+ // binop N0, (vselect Cond, TVal, IDC) --> vselect Cond, (binop N0, TVal), N0
+ if (isNeutralConstant(Opcode, N->getFlags(), FVal, OpNo)) {
+ SDValue F0 = DAG.getFreeze(N0);
+ SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, TVal, N->getFlags());
+ return DAG.getSelect(SDLoc(N), VT, Cond, NewBO, F0);
+ }
+
+ return SDValue();
+}
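+
+// A worked example of the fold above for an integer add, where 0 is the
+// identity constant (a sketch with assumed operands X and Y):
+//   add X, (vselect Cond, 0, Y) --> vselect Cond, freeze(X), (add freeze(X), Y)
+// On true lanes the result is X either way; on false lanes it is X + Y either
+// way, so the constant arm never has to be materialized.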
+
+SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
+ assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
+ "Unexpected binary operator");
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto BinOpcode = BO->getOpcode();
+ EVT VT = BO->getValueType(0);
+ if (TLI.shouldFoldSelectWithIdentityConstant(BinOpcode, VT)) {
+ if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, false))
+ return Sel;
+
+ if (TLI.isCommutativeBinOp(BO->getOpcode()))
+ if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, true))
+ return Sel;
+ }
+
+ // Don't do this unless the old select is going away. We want to eliminate the
+ // binary operator, not replace a binop with a select.
+ // TODO: Handle ISD::SELECT_CC.
+ unsigned SelOpNo = 0;
+ SDValue Sel = BO->getOperand(0);
+ if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
+ SelOpNo = 1;
+ Sel = BO->getOperand(1);
+
+ // Peek through trunc to shift amount type.
+ if ((BinOpcode == ISD::SHL || BinOpcode == ISD::SRA ||
+ BinOpcode == ISD::SRL) && Sel.hasOneUse()) {
+ // This is valid when the truncated bits of x are already zero.
+ SDValue Op;
+ KnownBits Known;
+ if (isTruncateOf(DAG, Sel, Op, Known) &&
+ Known.countMaxActiveBits() < Sel.getScalarValueSizeInBits())
+ Sel = Op;
+ }
+ }
+
+ if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
+ return SDValue();
+
+ SDValue CT = Sel.getOperand(1);
+ if (!isConstantOrConstantVector(CT, true) &&
+ !DAG.isConstantFPBuildVectorOrConstantFP(CT))
+ return SDValue();
+
+ SDValue CF = Sel.getOperand(2);
+ if (!isConstantOrConstantVector(CF, true) &&
+ !DAG.isConstantFPBuildVectorOrConstantFP(CF))
+ return SDValue();
+
+ // Bail out if any constants are opaque because we can't constant fold those.
+ // The exception is "and" and "or" with either 0 or -1, in which case we can
+ // propagate non-constant operands into the select. I.e.:
+ // and (select Cond, 0, -1), X --> select Cond, 0, X
+ // or X, (select Cond, -1, 0) --> select Cond, -1, X
+ bool CanFoldNonConst =
+ (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
+ ((isNullOrNullSplat(CT) && isAllOnesOrAllOnesSplat(CF)) ||
+ (isNullOrNullSplat(CF) && isAllOnesOrAllOnesSplat(CT)));
+
+ SDValue CBO = BO->getOperand(SelOpNo ^ 1);
+ if (!CanFoldNonConst &&
+ !isConstantOrConstantVector(CBO, true) &&
+ !DAG.isConstantFPBuildVectorOrConstantFP(CBO))
+ return SDValue();
+
+ SDLoc DL(Sel);
+ SDValue NewCT, NewCF;
+
+ if (CanFoldNonConst) {
+ // If CBO is an opaque constant, we can't rely on getNode to constant fold.
+ if ((BinOpcode == ISD::AND && isNullOrNullSplat(CT)) ||
+ (BinOpcode == ISD::OR && isAllOnesOrAllOnesSplat(CT)))
+ NewCT = CT;
+ else
+ NewCT = CBO;
+
+ if ((BinOpcode == ISD::AND && isNullOrNullSplat(CF)) ||
+ (BinOpcode == ISD::OR && isAllOnesOrAllOnesSplat(CF)))
+ NewCF = CF;
+ else
+ NewCF = CBO;
+ } else {
+ // We have a select-of-constants followed by a binary operator with a
+ // constant. Eliminate the binop by pulling the constant math into the
+ // select. Example:
+ // add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
+ NewCT = SelOpNo ? DAG.FoldConstantArithmetic(BinOpcode, DL, VT, {CBO, CT})
+ : DAG.FoldConstantArithmetic(BinOpcode, DL, VT, {CT, CBO});
+ if (!NewCT)
+ return SDValue();
+
+ NewCF = SelOpNo ? DAG.FoldConstantArithmetic(BinOpcode, DL, VT, {CBO, CF})
+ : DAG.FoldConstantArithmetic(BinOpcode, DL, VT, {CF, CBO});
+ if (!NewCF)
+ return SDValue();
+ }
+
+ SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
+ SelectOp->setFlags(BO->getFlags());
+ return SelectOp;
+}
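+
+// A worked instance of the constant-folding path above, with assumed values:
+//   add (select Cond, 10, 20), 5 --> select Cond, 15, 25
+// The binop disappears because both select arms fold with the constant
+// operand at compile time.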
+
+static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Expecting add or sub");
+
+ // Match a constant operand and a zext operand for the math instruction:
+ // add Z, C
+ // sub C, Z
+ bool IsAdd = N->getOpcode() == ISD::ADD;
+ SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0);
+ SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1);
+ auto *CN = dyn_cast<ConstantSDNode>(C);
+ if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ // Match the zext operand as a setcc of a boolean.
+ if (Z.getOperand(0).getOpcode() != ISD::SETCC ||
+ Z.getOperand(0).getValueType() != MVT::i1)
+ return SDValue();
+
+ // Match the compare as: setcc (X & 1), 0, eq.
+ SDValue SetCC = Z.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) ||
+ SetCC.getOperand(0).getOpcode() != ISD::AND ||
+ !isOneConstant(SetCC.getOperand(0).getOperand(1)))
+ return SDValue();
+
+ // We are adding/subtracting a constant and an inverted low bit. Turn that
+ // into a subtract/add of the low bit with incremented/decremented constant:
+ // add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
+ // sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
+ EVT VT = C.getValueType();
+ SDLoc DL(N);
+ SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT);
+ SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) :
+ DAG.getConstant(CN->getAPIntValue() - 1, DL, VT);
+ return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
+}
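+
+// A quick numeric check of the add form above, with an assumed constant
+// C = 41:
+//   add (zext i1 (seteq (X & 1), 0)), 41 --> sub 42, (zext (X & 1))
+//   X & 1 == 0: 1 + 41 = 42 and 42 - 0 = 42
+//   X & 1 == 1: 0 + 41 = 41 and 42 - 1 = 41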
+
+/// Try to fold an add/sub with a constant operand, where the other operand is
+/// a shifted 'not' of the sign bit, into a shift and an add with a different
+/// constant.
+static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Expecting add or sub");
+
+ // We need a constant operand for the add/sub, and the other operand is a
+ // logical shift right: add (srl), C or sub C, (srl).
+ bool IsAdd = N->getOpcode() == ISD::ADD;
+ SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
+ SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
+ if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) ||
+ ShiftOp.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ // The shift must be of a 'not' value.
+ SDValue Not = ShiftOp.getOperand(0);
+ if (!Not.hasOneUse() || !isBitwiseNot(Not))
+ return SDValue();
+
+ // The shift must be moving the sign bit to the least-significant-bit.
+ EVT VT = ShiftOp.getValueType();
+ SDValue ShAmt = ShiftOp.getOperand(1);
+ ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
+ if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1))
+ return SDValue();
+
+ // Eliminate the 'not' by adjusting the shift and add/sub constant:
+ // add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
+ // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
+ SDLoc DL(N);
+ if (SDValue NewC = DAG.FoldConstantArithmetic(
+ IsAdd ? ISD::ADD : ISD::SUB, DL, VT,
+ {ConstantOp, DAG.getConstant(1, DL, VT)})) {
+ SDValue NewShift = DAG.getNode(IsAdd ? ISD::SRA : ISD::SRL, DL, VT,
+ Not.getOperand(0), ShAmt);
+ return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC);
+ }
+
+ return SDValue();
+}
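+
+// A numeric check of the add form above on i32, with an assumed constant
+// C = 5:
+//   add (srl (not X), 31), 5 --> add (sra X, 31), 6
+//   X = 7 : (~7 >>u 31) + 5 = 1 + 5 = 6 and (7 >>s 31) + 6 = 0 + 6 = 6
+//   X = -7: (~-7 >>u 31) + 5 = 0 + 5 = 5 and (-7 >>s 31) + 6 = -1 + 6 = 5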
+
+static bool isADDLike(SDValue V, const SelectionDAG &DAG) {
+ unsigned Opcode = V.getOpcode();
+ if (Opcode == ISD::OR)
+ return DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1));
+ if (Opcode == ISD::XOR)
+ return isMinSignedConstant(V.getOperand(1));
+ return false;
+}
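+
+// A short justification of why these count as ADD-like:
+// - (or x, c) equals (add x, c) when x and c share no set bits, since no
+//   carries can occur, e.g. 0b0101 | 0b1010 == 0b0101 + 0b1010 == 0b1111.
+// - (xor x, MIN_SIGNED) equals (add x, MIN_SIGNED) because only the top bit
+//   is affected and its carry out is discarded; for i8, x ^ 0x80 equals
+//   (x + 0x80) mod 256.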
+
+static bool
+areBitwiseNotOfEachother(SDValue Op0, SDValue Op1) {
+ return (isBitwiseNot(Op0) && Op0.getOperand(0) == Op1) ||
+ (isBitwiseNot(Op1) && Op1.getOperand(0) == Op0);
+}
+
+/// Try to fold a node that behaves like an ADD (note that N isn't necessarily
+/// an ISD::ADD here, it could for example be an ISD::OR if we know that there
+/// are no common bits set in the operands).
+SDValue DAGCombiner::visitADDLike(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ SDLoc DL(N);
+
+ // fold (add x, undef) -> undef
+ if (N0.isUndef())
+ return N0;
+ if (N1.isUndef())
+ return N1;
+
+ // fold (add c1, c2) -> c1+c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
+
+ if (areBitwiseNotOfEachother(N0, N1))
+ return DAG.getConstant(APInt::getAllOnes(VT.getScalarSizeInBits()),
+ SDLoc(N), VT);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (add x, 0) -> x, vector edition
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ return N0;
+ }
+
+ // fold (add x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+
+ if (N0.getOpcode() == ISD::SUB) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+
+ // fold ((A-c1)+c2) -> (A+(c2-c1))
+ if (SDValue Sub = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N01}))
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);
+
+ // fold ((c1-A)+c2) -> (c1+c2)-A
+ if (SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N00}))
+ return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
+ }
+
+ // add (sext i1 X), 1 -> zext (not i1 X)
+ // We don't transform this pattern:
+ // add (zext i1 X), -1 -> sext (not i1 X)
+ // because most (?) targets generate better code for the zext form.
+ if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
+ isOneOrOneSplat(N1)) {
+ SDValue X = N0.getOperand(0);
+ if ((!LegalOperations ||
+ (TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
+ TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
+ X.getScalarValueSizeInBits() == 1) {
+ SDValue Not = DAG.getNOT(DL, X, X.getValueType());
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
+ }
+ }
+
+ // Fold (add (or x, c0), c1) -> (add x, (c0 + c1))
+ // iff (or x, c0) is equivalent to (add x, c0).
+ // Fold (add (xor x, c0), c1) -> (add x, (c0 + c1))
+ // iff (xor x, c0) is equivalent to (add x, c0).
+ if (isADDLike(N0, DAG)) {
+ SDValue N01 = N0.getOperand(1);
+ if (SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N01}))
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add);
+ }
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // reassociate add
+ if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N, N0, N1)) {
+ if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
+ return RADD;
+
+ // Reassociate (add (or x, c), y) -> (add (add x, y), c) if (or x, c) is
+ // equivalent to (add x, c).
+ // Reassociate (add (xor x, c), y) -> (add (add x, y), c) if (xor x, c) is
+ // equivalent to (add x, c).
+ // Do this optimization only when adding c does not introduce instructions
+ // for adding carries.
+ auto ReassociateAddOr = [&](SDValue N0, SDValue N1) {
+ if (isADDLike(N0, DAG) && N0.hasOneUse() &&
+ isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
+ // If N0's type does not need to be split, or the constant is a sign mask,
+ // adding it does not introduce extra carry-handling instructions.
+ auto TyActn = TLI.getTypeAction(*DAG.getContext(), N0.getValueType());
+ bool NoAddCarry = TyActn == TargetLoweringBase::TypeLegal ||
+ TyActn == TargetLoweringBase::TypePromoteInteger ||
+ isMinSignedConstant(N0.getOperand(1));
+ if (NoAddCarry)
+ return DAG.getNode(
+ ISD::ADD, DL, VT,
+ DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
+ N0.getOperand(1));
+ }
+ return SDValue();
+ };
+ if (SDValue Add = ReassociateAddOr(N0, N1))
+ return Add;
+ if (SDValue Add = ReassociateAddOr(N1, N0))
+ return Add;
+
+ // Fold add(vecreduce(x), vecreduce(y)) -> vecreduce(add(x, y))
+ if (SDValue SD =
+ reassociateReduction(ISD::VECREDUCE_ADD, ISD::ADD, DL, VT, N0, N1))
+ return SD;
+ }
+ // fold ((0-A) + B) -> B-A
+ if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
+ return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
+
+ // fold (A + (0-B)) -> A-B
+ if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
+ return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));
+
+ // fold (A+(B-A)) -> B
+ if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
+ return N1.getOperand(0);
+
+ // fold ((B-A)+A) -> B
+ if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
+ return N0.getOperand(0);
+
+ // fold ((A-B)+(C-A)) -> (C-B)
+ if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
+ N0.getOperand(0) == N1.getOperand(1))
+ return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
+ N0.getOperand(1));
+
+ // fold ((A-B)+(B-C)) -> (A-C)
+ if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
+ N0.getOperand(1) == N1.getOperand(0))
+ return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
+ N1.getOperand(1));
+
+ // fold (A+(B-(A+C))) to (B-C)
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+ N0 == N1.getOperand(1).getOperand(0))
+ return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
+ N1.getOperand(1).getOperand(1));
+
+ // fold (A+(B-(C+A))) to (B-C)
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+ N0 == N1.getOperand(1).getOperand(1))
+ return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
+ N1.getOperand(1).getOperand(0));
+
+ // fold (A+((B-A)+or-C)) to (B+or-C)
+ if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
+ N1.getOperand(0).getOpcode() == ISD::SUB &&
+ N0 == N1.getOperand(0).getOperand(1))
+ return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
+ N1.getOperand(1));
+
+ // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
+ if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
+ N0->hasOneUse() && N1->hasOneUse()) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+
+ if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10))
+ return DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
+ DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
+ }
+
+ // fold (add (umax X, C), -C) --> (usubsat X, C)
+ if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) {
+ auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
+ return (!Max && !Op) ||
+ (Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue()));
+ };
+ if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT,
+ /*AllowUndefs*/ true))
+ return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0),
+ N0.getOperand(1));
+ }
+
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ if (isOneOrOneSplat(N1)) {
+ // fold (add (xor a, -1), 1) -> (sub 0, a)
+ if (isBitwiseNot(N0))
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ N0.getOperand(0));
+
+ // fold (add (add (xor a, -1), b), 1) -> (sub b, a)
+ if (N0.getOpcode() == ISD::ADD) {
+ SDValue A, Xor;
+
+ if (isBitwiseNot(N0.getOperand(0))) {
+ A = N0.getOperand(1);
+ Xor = N0.getOperand(0);
+ } else if (isBitwiseNot(N0.getOperand(1))) {
+ A = N0.getOperand(0);
+ Xor = N0.getOperand(1);
+ }
+
+ if (Xor)
+ return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0));
+ }
+
+ // Look for:
+ // add (add x, y), 1
+ // And if the target does not like this form then turn into:
+ // sub y, (xor x, -1)
+ if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD &&
+ N0.hasOneUse() &&
+ // Limit this to after legalization if the add has wrap flags
+ (Level >= AfterLegalizeDAG || (!N->getFlags().hasNoUnsignedWrap() &&
+ !N->getFlags().hasNoSignedWrap()))) {
+ SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
+ DAG.getAllOnesConstant(DL, VT));
+ return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
+ }
+ }
+
+ // (x - y) + -1 -> add (xor y, -1), x
+ if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
+ isAllOnesOrAllOnesSplat(N1)) {
+ SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1);
+ return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
+ }
+
+ if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
+ return Combined;
+
+ if (SDValue Combined = visitADDLikeCommutative(N1, N0, N))
+ return Combined;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ SDLoc DL(N);
+
+ if (SDValue Combined = visitADDLike(N))
+ return Combined;
+
+ if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
+ return V;
+
+ if (SDValue V = foldAddSubOfSignBit(N, DAG))
+ return V;
+
+ // fold (a+b) -> (a|b) iff a and b share no bits.
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
+ DAG.haveNoCommonBitsSet(N0, N1))
+ return DAG.getNode(ISD::OR, DL, VT, N0, N1);
+
+ // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)).
+ if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) {
+ const APInt &C0 = N0->getConstantOperandAPInt(0);
+ const APInt &C1 = N1->getConstantOperandAPInt(0);
+ return DAG.getVScale(DL, VT, C0 + C1);
+ }
+
+ // fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2)
+ if (N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1).getOpcode() == ISD::VSCALE &&
+ N1.getOpcode() == ISD::VSCALE) {
+ const APInt &VS0 = N0.getOperand(1)->getConstantOperandAPInt(0);
+ const APInt &VS1 = N1->getConstantOperandAPInt(0);
+ SDValue VS = DAG.getVScale(DL, VT, VS0 + VS1);
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS);
+ }
+
+ // Fold (add step_vector(c1), step_vector(c2) to step_vector(c1+c2))
+ if (N0.getOpcode() == ISD::STEP_VECTOR &&
+ N1.getOpcode() == ISD::STEP_VECTOR) {
+ const APInt &C0 = N0->getConstantOperandAPInt(0);
+ const APInt &C1 = N1->getConstantOperandAPInt(0);
+ APInt NewStep = C0 + C1;
+ return DAG.getStepVector(DL, VT, NewStep);
+ }
+
+ // Fold a + step_vector(c1) + step_vector(c2) to a + step_vector(c1+c2)
+ if (N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1).getOpcode() == ISD::STEP_VECTOR &&
+ N1.getOpcode() == ISD::STEP_VECTOR) {
+ const APInt &SV0 = N0.getOperand(1)->getConstantOperandAPInt(0);
+ const APInt &SV1 = N1->getConstantOperandAPInt(0);
+ APInt NewStep = SV0 + SV1;
+ SDValue SV = DAG.getStepVector(DL, VT, NewStep);
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), SV);
+ }
+
+ return SDValue();
+}
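+
+// A small check of the (a+b) -> (a|b) fold above: with no common bits set
+// there are no carries, so each result bit is just the OR of the input bits,
+// e.g. (assumed values) 0b0011 + 0b0100 == 0b0111 == 0b0011 | 0b0100.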
+
+SDValue DAGCombiner::visitADDSAT(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ bool IsSigned = Opcode == ISD::SADDSAT;
+ SDLoc DL(N);
+
+ // fold (add_sat x, undef) -> -1
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getAllOnesConstant(DL, VT);
+
+ // fold (add_sat c1, c2) -> c3
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(Opcode, DL, VT, N1, N0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (add_sat x, 0) -> x, vector edition
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ return N0;
+ }
+
+ // fold (add_sat x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+
+ // If it cannot overflow, transform into an add.
+ if (DAG.computeOverflowForAdd(IsSigned, N0, N1) == SelectionDAG::OFK_Never)
+ return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
+
+ return SDValue();
+}
+
+static SDValue getAsCarry(const TargetLowering &TLI, SDValue V,
+ bool ForceCarryReconstruction = false) {
+ bool Masked = false;
+
+ // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
+ while (true) {
+ if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
+ V = V.getOperand(0);
+ continue;
+ }
+
+ if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
+ if (ForceCarryReconstruction)
+ return V;
+
+ Masked = true;
+ V = V.getOperand(0);
+ continue;
+ }
+
+ if (ForceCarryReconstruction && V.getValueType() == MVT::i1)
+ return V;
+
+ break;
+ }
+
+ // If this is not a carry, return.
+ if (V.getResNo() != 1)
+ return SDValue();
+
+ if (V.getOpcode() != ISD::UADDO_CARRY && V.getOpcode() != ISD::USUBO_CARRY &&
+ V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
+ return SDValue();
+
+ EVT VT = V->getValueType(0);
+ if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT))
+ return SDValue();
+
+ // If the result is masked, then no matter what kind of bool it is we can
+ // return. If it isn't, then we need to make sure the bool type is either 0 or
+ // 1 and not other values.
+ if (Masked ||
+ TLI.getBooleanContents(V.getValueType()) ==
+ TargetLoweringBase::ZeroOrOneBooleanContent)
+ return V;
+
+ return SDValue();
+}
+
+/// Given the operands of an add/sub operation, see if the 2nd operand is a
+/// masked 0/1 whose source operand is actually known to be 0/-1. If so, invert
+/// the opcode and bypass the mask operation.
+static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1,
+ SelectionDAG &DAG, const SDLoc &DL) {
+ if (N1.getOpcode() == ISD::ZERO_EXTEND)
+ N1 = N1.getOperand(0);
+
+ if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1)))
+ return SDValue();
+
+ EVT VT = N0.getValueType();
+ SDValue N10 = N1.getOperand(0);
+ if (N10.getValueType() != VT && N10.getOpcode() == ISD::TRUNCATE)
+ N10 = N10.getOperand(0);
+
+ if (N10.getValueType() != VT)
+ return SDValue();
+
+ if (DAG.ComputeNumSignBits(N10) != VT.getScalarSizeInBits())
+ return SDValue();
+
+ // add N0, (and (AssertSext X, i1), 1) --> sub N0, X
+ // sub N0, (and (AssertSext X, i1), 1) --> add N0, X
+ return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N10);
+}
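+
+// A worked check of the fold above: the sign-bit test guarantees N10 is
+// either 0 or -1, so (and N10, 1) is 0 or 1 respectively, which is exactly
+// -N10. Hence:
+//   N10 == 0 : add N0, 0 == sub N0, 0
+//   N10 == -1: add N0, 1 == sub N0, -1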
+
+/// Helper for doing combines based on N0 and N1 being added to each other.
+SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
+ SDNode *LocReference) {
+ EVT VT = N0.getValueType();
+ SDLoc DL(LocReference);
+
+ // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
+ if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
+ isNullOrNullSplat(N1.getOperand(0).getOperand(0)))
+ return DAG.getNode(ISD::SUB, DL, VT, N0,
+ DAG.getNode(ISD::SHL, DL, VT,
+ N1.getOperand(0).getOperand(1),
+ N1.getOperand(1)));
+
+ if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL))
+ return V;
+
+ // Look for:
+ // add (add x, 1), y
+ // And if the target does not like this form then turn into:
+ // sub y, (xor x, -1)
+ if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD &&
+ N0.hasOneUse() && isOneOrOneSplat(N0.getOperand(1)) &&
+ // Limit this to after legalization if the add has wrap flags
+ (Level >= AfterLegalizeDAG || (!N0->getFlags().hasNoUnsignedWrap() &&
+ !N0->getFlags().hasNoSignedWrap()))) {
+ SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
+ DAG.getAllOnesConstant(DL, VT));
+ return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
+ }
+
+ if (N0.getOpcode() == ISD::SUB && N0.hasOneUse()) {
+ // Hoist one-use subtraction by non-opaque constant:
+ // (x - C) + y -> (x + y) - C
+ // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
+ if (isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1);
+ return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
+ }
+ // Hoist one-use subtraction from non-opaque constant:
+ // (C - x) + y -> (y - x) + C
+ if (isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
+ return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0));
+ }
+ }
+
+ // add (mul x, C), x -> mul x, C+1
+ if (N0.getOpcode() == ISD::MUL && N0.getOperand(0) == N1 &&
+ isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true) &&
+ N0.hasOneUse()) {
+ SDValue NewC = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
+ DAG.getConstant(1, DL, VT));
+ return DAG.getNode(ISD::MUL, DL, VT, N0.getOperand(0), NewC);
+ }
+
+ // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
+ // rather than 'add 0/-1' (the zext should get folded).
+ // add (sext i1 Y), X --> sub X, (zext i1 Y)
+ if (N0.getOpcode() == ISD::SIGN_EXTEND &&
+ N0.getOperand(0).getScalarValueSizeInBits() == 1 &&
+ TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) {
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
+ }
+
+ // add X, (sextinreg Y i1) -> sub X, (and Y 1)
+ if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
+ if (TN->getVT() == MVT::i1) {
+ SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
+ DAG.getConstant(1, DL, VT));
+ return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
+ }
+ }
+
+ // (add X, (uaddo_carry Y, 0, Carry)) -> (uaddo_carry X, Y, Carry)
+ if (N1.getOpcode() == ISD::UADDO_CARRY && isNullConstant(N1.getOperand(1)) &&
+ N1.getResNo() == 0)
+ return DAG.getNode(ISD::UADDO_CARRY, DL, N1->getVTList(),
+ N0, N1.getOperand(0), N1.getOperand(2));
+
+ // (add X, Carry) -> (uaddo_carry X, 0, Carry)
+ if (TLI.isOperationLegalOrCustom(ISD::UADDO_CARRY, VT))
+ if (SDValue Carry = getAsCarry(TLI, N1))
+ return DAG.getNode(ISD::UADDO_CARRY, DL,
+ DAG.getVTList(VT, Carry.getValueType()), N0,
+ DAG.getConstant(0, DL, VT), Carry);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADDC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ SDLoc DL(N);
+
+ // If the flag result is dead, turn this into an ADD.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ // canonicalize constant to RHS.
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);
+
+ // fold (addc x, 0) -> x + no carry out
+ if (isNullConstant(N1))
+ return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
+ DL, MVT::Glue));
+
+ // If it cannot overflow, transform into an add.
+ if (DAG.computeOverflowForUnsignedAdd(N0, N1) == SelectionDAG::OFK_Never)
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ return SDValue();
+}
+
+/**
+ * Flips a boolean if it is cheaper to compute. If the Force parameter is set,
+ * then the flip also occurs if computing the inverse is the same cost.
+ * This function returns an empty SDValue in case it cannot flip the boolean
+ * without increasing the cost of the computation. If you want to flip a boolean
+ * no matter what, use DAG.getLogicalNOT.
+ */
+static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool Force) {
+ if (Force && isa<ConstantSDNode>(V))
+ return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType());
+
+ if (V.getOpcode() != ISD::XOR)
+ return SDValue();
+
+ ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false);
+ if (!Const)
+ return SDValue();
+
+ EVT VT = V.getValueType();
+
+ bool IsFlip = false;
+ switch(TLI.getBooleanContents(VT)) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ IsFlip = Const->isOne();
+ break;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ IsFlip = Const->isAllOnes();
+ break;
+ case TargetLowering::UndefinedBooleanContent:
+ IsFlip = (Const->getAPIntValue() & 0x01) == 1;
+ break;
+ }
+
+ if (IsFlip)
+ return V.getOperand(0);
+ if (Force)
+ return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType());
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADDO(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ bool IsSigned = (ISD::SADDO == N->getOpcode());
+
+ EVT CarryVT = N->getValueType(1);
+ SDLoc DL(N);
+
+ // If the flag result is dead, turn this into an ADD.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getUNDEF(CarryVT));
+
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
+
+ // fold (addo x, 0) -> x + no carry out
+ if (isNullOrNullSplat(N1))
+ return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
+
+ // If it cannot overflow, transform into an add.
+ if (DAG.computeOverflowForAdd(IsSigned, N0, N1) == SelectionDAG::OFK_Never)
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getConstant(0, DL, CarryVT));
+
+ if (!IsSigned) {
+ // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
+ if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
+ SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
+ DAG.getConstant(0, DL, VT), N0.getOperand(0));
+ return CombineTo(
+ N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1)));
+ }
+
+ if (SDValue Combined = visitUADDOLike(N0, N1, N))
+ return Combined;
+
+ if (SDValue Combined = visitUADDOLike(N1, N0, N))
+ return Combined;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
+ EVT VT = N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ // (uaddo X, (uaddo_carry Y, 0, Carry)) -> (uaddo_carry X, Y, Carry)
+ // If Y + 1 cannot overflow.
+ if (N1.getOpcode() == ISD::UADDO_CARRY && isNullConstant(N1.getOperand(1))) {
+ SDValue Y = N1.getOperand(0);
+ SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType());
+ if (DAG.computeOverflowForUnsignedAdd(Y, One) == SelectionDAG::OFK_Never)
+ return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), N->getVTList(), N0, Y,
+ N1.getOperand(2));
+ }
+
+ // (uaddo X, Carry) -> (uaddo_carry X, 0, Carry)
+ if (TLI.isOperationLegalOrCustom(ISD::UADDO_CARRY, VT))
+ if (SDValue Carry = getAsCarry(TLI, N1))
+ return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), N->getVTList(), N0,
+ DAG.getConstant(0, SDLoc(N), VT), Carry);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADDE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+
+ // canonicalize constant to RHS
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
+ N1, N0, CarryIn);
+
+ // fold (adde x, y, false) -> (addc x, y)
+ if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
+ return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUADDO_CARRY(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+ SDLoc DL(N);
+
+ // canonicalize constant to RHS
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::UADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn);
+
+ // fold (uaddo_carry x, y, false) -> (uaddo x, y)
+ if (isNullConstant(CarryIn)) {
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
+ return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
+ }
+
+ // fold (uaddo_carry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
+ if (isNullConstant(N0) && isNullConstant(N1)) {
+ EVT VT = N0.getValueType();
+ EVT CarryVT = CarryIn.getValueType();
+ SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
+ AddToWorklist(CarryExt.getNode());
+ return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
+ DAG.getConstant(1, DL, VT)),
+ DAG.getConstant(0, DL, CarryVT));
+ }
+
+ if (SDValue Combined = visitUADDO_CARRYLike(N0, N1, CarryIn, N))
+ return Combined;
+
+ if (SDValue Combined = visitUADDO_CARRYLike(N1, N0, CarryIn, N))
+ return Combined;
+
+ // We want to avoid useless duplication.
+ // TODO: This is done automatically for binary operations. As UADDO_CARRY is
+ // not a binary operation, it is not possible to leverage that existing
+ // mechanism for it. However, if more operations require the same
+ // deduplication logic, then it may be worth generalizing it.
+ SDValue Ops[] = {N1, N0, CarryIn};
+ SDNode *CSENode =
+ DAG.getNodeIfExists(ISD::UADDO_CARRY, N->getVTList(), Ops, N->getFlags());
+ if (CSENode)
+ return SDValue(CSENode, 0);
+
+ return SDValue();
+}
+
+/**
+ * If we are facing some sort of diamond carry propagation pattern try to
+ * break it up to generate something like:
+ * (uaddo_carry X, 0, (uaddo_carry A, B, Z):Carry)
+ *
+ * The end result is usually an increase in the number of operations required,
+ * but because the carry is now linearized, other transforms can kick in and
+ * optimize the DAG.
+ *
+ * Patterns typically look something like
+ * (uaddo A, B)
+ * / \
+ * Carry Sum
+ * | \
+ * | (uaddo_carry *, 0, Z)
+ * | /
+ * \ Carry
+ * | /
+ * (uaddo_carry X, *, *)
+ *
+ * But numerous variations exist. Our goal is to identify A, B, X and Z and
+ * produce a combine with a single path for carry propagation.
+ */
+static SDValue combineUADDO_CARRYDiamond(DAGCombiner &Combiner,
+ SelectionDAG &DAG, SDValue X,
+ SDValue Carry0, SDValue Carry1,
+ SDNode *N) {
+ if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
+ return SDValue();
+ if (Carry1.getOpcode() != ISD::UADDO)
+ return SDValue();
+
+ SDValue Z;
+
+ /**
+ * First look for a suitable Z. It will present itself in the form of
+ * (uaddo_carry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
+ */
+ if (Carry0.getOpcode() == ISD::UADDO_CARRY &&
+ isNullConstant(Carry0.getOperand(1))) {
+ Z = Carry0.getOperand(2);
+ } else if (Carry0.getOpcode() == ISD::UADDO &&
+ isOneConstant(Carry0.getOperand(1))) {
+ EVT VT = Combiner.getSetCCResultType(Carry0.getValueType());
+ Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT);
+ } else {
+ // We couldn't find a suitable Z.
+ return SDValue();
+ }
+
+
+ auto cancelDiamond = [&](SDValue A,SDValue B) {
+ SDLoc DL(N);
+ SDValue NewY =
+ DAG.getNode(ISD::UADDO_CARRY, DL, Carry0->getVTList(), A, B, Z);
+ Combiner.AddToWorklist(NewY.getNode());
+ return DAG.getNode(ISD::UADDO_CARRY, DL, N->getVTList(), X,
+ DAG.getConstant(0, DL, X.getValueType()),
+ NewY.getValue(1));
+ };
+
+ /**
+ * (uaddo A, B)
+ * |
+ * Sum
+ * |
+ * (uaddo_carry *, 0, Z)
+ */
+ if (Carry0.getOperand(0) == Carry1.getValue(0)) {
+ return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1));
+ }
+
+ /**
+ * (uaddo_carry A, 0, Z)
+ * |
+ * Sum
+ * |
+ * (uaddo *, B)
+ */
+ if (Carry1.getOperand(0) == Carry0.getValue(0)) {
+ return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1));
+ }
+
+ if (Carry1.getOperand(1) == Carry0.getValue(0)) {
+ return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0));
+ }
+
+ return SDValue();
+}
+
+// If we are facing some sort of diamond carry/borrow in/out pattern try to
+// match patterns like:
+//
+// (uaddo A, B) CarryIn
+// | \ |
+// | \ |
+// PartialSum PartialCarryOutX /
+// | | /
+// | ____|____________/
+// | / |
+// (uaddo *, *) \________
+// | \ \
+// | \ |
+// | PartialCarryOutY |
+// | \ |
+// | \ /
+// AddCarrySum | ______/
+// | /
+// CarryOut = (or *, *)
+//
+// And generate UADDO_CARRY (or USUBO_CARRY) with two result values:
+//
+// {AddCarrySum, CarryOut} = (uaddo_carry A, B, CarryIn)
+//
+// Our goal is to identify A, B, and CarryIn and produce UADDO_CARRY/USUBO_CARRY
+// with a single path for carry/borrow out propagation.
+static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI,
+ SDValue N0, SDValue N1, SDNode *N) {
+ SDValue Carry0 = getAsCarry(TLI, N0);
+ if (!Carry0)
+ return SDValue();
+ SDValue Carry1 = getAsCarry(TLI, N1);
+ if (!Carry1)
+ return SDValue();
+
+ unsigned Opcode = Carry0.getOpcode();
+ if (Opcode != Carry1.getOpcode())
+ return SDValue();
+ if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
+ return SDValue();
+
+ // Canonicalize the add/sub of A and B (the top node in the above ASCII art)
+ // as Carry0 and the add/sub of the carry in as Carry1 (the middle node).
+ if (Carry1.getNode()->isOperandOf(Carry0.getNode()))
+ std::swap(Carry0, Carry1);
+
+ // Check if nodes are connected in expected way.
+ if (Carry1.getOperand(0) != Carry0.getValue(0) &&
+ Carry1.getOperand(1) != Carry0.getValue(0))
+ return SDValue();
+
+ // The carry-in value must be on the right-hand side for subtraction.
+ unsigned CarryInOperandNum =
+ Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
+ if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
+ return SDValue();
+ SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);
+
+ unsigned NewOp = Opcode == ISD::UADDO ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
+ if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
+ return SDValue();
+
+ // Verify that the carry/borrow in is plausibly a carry/borrow bit.
+ CarryIn = getAsCarry(TLI, CarryIn, true);
+ if (!CarryIn)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue Merged =
+ DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
+ Carry0.getOperand(1), CarryIn);
+
+ // Note that because we have proven that the result of the UADDO/USUBO of A
+ // and B feeds into the UADDO/USUBO that does the carry/borrow in, the first
+ // UADDO/USUBO overflowing implies that the second one cannot. For example,
+ // consider 8-bit numbers where 0xFF is the maximum value.
+ //
+ // 0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry
+ // 0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow)
+ //
+ // This is important because it means that OR and XOR can be used to merge
+ // carry flags; and that AND can return a constant zero.
+ //
+ // TODO: match other operations that can merge flags (ADD, etc)
+ DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
+ if (N->getOpcode() == ISD::AND)
+ return DAG.getConstant(0, DL, MVT::i1);
+ return Merged.getValue(1);
+}
+
+SDValue DAGCombiner::visitUADDO_CARRYLike(SDValue N0, SDValue N1,
+ SDValue CarryIn, SDNode *N) {
+ // fold (uaddo_carry (xor a, -1), b, c) -> (usubo_carry b, a, !c) and flip
+ // carry.
+ if (isBitwiseNot(N0))
+ if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) {
+ SDLoc DL(N);
+ SDValue Sub = DAG.getNode(ISD::USUBO_CARRY, DL, N->getVTList(), N1,
+ N0.getOperand(0), NotC);
+ return CombineTo(
+ N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1)));
+ }
+
+ // Iff the flag result is dead:
+ // (uaddo_carry (add|uaddo X, Y), 0, Carry) -> (uaddo_carry X, Y, Carry)
+ // Don't do this if the Carry comes from the uaddo. It won't remove the uaddo
+ // or the dependency between the instructions.
+ if ((N0.getOpcode() == ISD::ADD ||
+ (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 &&
+ N0.getValue(1) != CarryIn)) &&
+ isNullConstant(N1) && !N->hasAnyUseOfValue(1))
+ return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), N->getVTList(),
+ N0.getOperand(0), N0.getOperand(1), CarryIn);
+
+ /**
+ * When one of the uaddo_carry arguments is itself a carry, we may be facing
+ * a diamond carry propagation, in which case we try to transform the DAG
+ * to ensure linear carry propagation if that is possible.
+ */
+ if (auto Y = getAsCarry(TLI, N1)) {
+ // Because both are carries, Y and Z can be swapped.
+ if (auto R = combineUADDO_CARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
+ return R;
+ if (auto R = combineUADDO_CARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
+ return R;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSADDO_CARRY(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+ SDLoc DL(N);
+
+ // canonicalize constant to RHS
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::SADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn);
+
+ // fold (saddo_carry x, y, false) -> (saddo x, y)
+ if (isNullConstant(CarryIn)) {
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::SADDO, N->getValueType(0)))
+ return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, N1);
+ }
+
+ return SDValue();
+}
+
+// Attempt to create a USUBSAT(LHS, RHS) node with DstVT, performing a
+// clamp/truncation if necessary.
+static SDValue getTruncatedUSUBSAT(EVT DstVT, EVT SrcVT, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ assert(DstVT.getScalarSizeInBits() <= SrcVT.getScalarSizeInBits() &&
+ "Illegal truncation");
+
+ if (DstVT == SrcVT)
+ return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS);
+
+ // If the LHS is zero-extended then we can perform the USUBSAT as DstVT by
+ // clamping RHS.
+ APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
+ DstVT.getScalarSizeInBits());
+ if (!DAG.MaskedValueIsZero(LHS, UpperBits))
+ return SDValue();
+
+ SDValue SatLimit =
+ DAG.getConstant(APInt::getLowBitsSet(SrcVT.getScalarSizeInBits(),
+ DstVT.getScalarSizeInBits()),
+ DL, SrcVT);
+ RHS = DAG.getNode(ISD::UMIN, DL, SrcVT, RHS, SatLimit);
+ RHS = DAG.getNode(ISD::TRUNCATE, DL, DstVT, RHS);
+ LHS = DAG.getNode(ISD::TRUNCATE, DL, DstVT, LHS);
+ return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS);
+}
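+
+// A worked instance of the clamp-and-truncate path above, assuming SrcVT=i16,
+// DstVT=i8 and an LHS whose upper 8 bits are known zero:
+//   LHS = 200, RHS = 300: usubsat at i16 gives 0; the rewritten form computes
+//     umin(300, 255) = 255 and usubsat.i8(200, 255) = 0.
+//   LHS = 200, RHS = 50 : usubsat at i16 gives 150; the rewritten form
+//     computes umin(50, 255) = 50 and usubsat.i8(200, 50) = 150.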
+
+// Try to find umax(a,b) - b or a - umin(a,b) patterns that may be converted to
+// usubsat(a,b), optionally as a truncated type.
+SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
+ if (N->getOpcode() != ISD::SUB ||
+ !(!LegalOperations || hasOperation(ISD::USUBSAT, DstVT)))
+ return SDValue();
+
+ EVT SubVT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // Try to find umax(a,b) - b or a - umin(a,b) patterns that may be converted
+ // to usubsat(a,b).
+ if (Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
+ SDValue MaxLHS = Op0.getOperand(0);
+ SDValue MaxRHS = Op0.getOperand(1);
+ if (MaxLHS == Op1)
+ return getTruncatedUSUBSAT(DstVT, SubVT, MaxRHS, Op1, DAG, SDLoc(N));
+ if (MaxRHS == Op1)
+ return getTruncatedUSUBSAT(DstVT, SubVT, MaxLHS, Op1, DAG, SDLoc(N));
+ }
+
+ if (Op1.getOpcode() == ISD::UMIN && Op1.hasOneUse()) {
+ SDValue MinLHS = Op1.getOperand(0);
+ SDValue MinRHS = Op1.getOperand(1);
+ if (MinLHS == Op0)
+ return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinRHS, DAG, SDLoc(N));
+ if (MinRHS == Op0)
+ return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinLHS, DAG, SDLoc(N));
+ }
+
+ // sub(a,trunc(umin(zext(a),b))) -> usubsat(a,trunc(umin(b,SatLimit)))
+ if (Op1.getOpcode() == ISD::TRUNCATE &&
+ Op1.getOperand(0).getOpcode() == ISD::UMIN &&
+ Op1.getOperand(0).hasOneUse()) {
+ SDValue MinLHS = Op1.getOperand(0).getOperand(0);
+ SDValue MinRHS = Op1.getOperand(0).getOperand(1);
+ if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && MinLHS.getOperand(0) == Op0)
+ return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinLHS, MinRHS,
+ DAG, SDLoc(N));
+ if (MinRHS.getOpcode() == ISD::ZERO_EXTEND && MinRHS.getOperand(0) == Op0)
+ return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinRHS, MinLHS,
+ DAG, SDLoc(N));
+ }
+
+ return SDValue();
+}
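+
+// A short argument for the umax/umin patterns above:
+//   umax(a,b) - b is a - b when a >= b and b - b = 0 otherwise, which matches
+//   the definition of usubsat(a,b).
+//   a - umin(a,b) is a - b when a > b and a - a = 0 otherwise, again
+//   usubsat(a,b).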
+
+// Since it may not be valid to emit a fold to zero for vector initializers,
+// check if we can before folding.
+static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
+ SelectionDAG &DAG, bool LegalOperations) {
+ if (!VT.isVector())
+ return DAG.getConstant(0, DL, VT);
+ if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return DAG.getConstant(0, DL, VT);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUB(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ SDLoc DL(N);
+
+ auto PeekThroughFreeze = [](SDValue N) {
+ if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
+ return N->getOperand(0);
+ return N;
+ };
+
+ // fold (sub x, x) -> 0
+ // FIXME: Refactor this and xor and other similar operations together.
+ if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
+ return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
+
+ // fold (sub c1, c2) -> c3
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1}))
+ return C;
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (sub x, 0) -> x, vector edition
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ return N0;
+ }
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
+
+ // fold (sub x, c) -> (add x, -c)
+ if (N1C) {
+ return DAG.getNode(ISD::ADD, DL, VT, N0,
+ DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
+ }
+
+ if (isNullOrNullSplat(N0)) {
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ // Right-shifting everything out but the sign bit followed by negation is
+ // the same as flipping arithmetic/logical shift type without the negation:
+ // -(X >>u 31) -> (X >>s 31)
+ // -(X >>s 31) -> (X >>u 31)
+ if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
+ ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
+ if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) {
+ auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
+ if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
+ return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
+ }
+ }
+
+ // 0 - X --> 0 if the sub is NUW.
+ if (N->getFlags().hasNoUnsignedWrap())
+ return N0;
+
+ if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
+ // N1 is either 0 or the minimum signed value. If the sub is NSW, then
+ // N1 must be 0 because negating the minimum signed value is undefined.
+ if (N->getFlags().hasNoSignedWrap())
+ return N0;
+
+ // 0 - X --> X if X is 0 or the minimum signed value.
+ return N1;
+ }
+
+ // Convert 0 - abs(x).
+ if (N1.getOpcode() == ISD::ABS && N1.hasOneUse() &&
+ !TLI.isOperationLegalOrCustom(ISD::ABS, VT))
+ if (SDValue Result = TLI.expandABS(N1.getNode(), DAG, true))
+ return Result;
+
+ // Fold neg(splat(neg(x))) -> splat(x)
+ if (VT.isVector()) {
+ SDValue N1S = DAG.getSplatValue(N1, true);
+ if (N1S && N1S.getOpcode() == ISD::SUB &&
+ isNullConstant(N1S.getOperand(0)))
+ return DAG.getSplat(VT, DL, N1S.getOperand(1));
+ }
+ }
+
+ // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
+ if (isAllOnesOrAllOnesSplat(N0))
+ return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
+
+ // fold (A - (0-B)) -> A+B
+ if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
+ return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));
+
+ // fold A-(A-B) -> B
+ if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
+ return N1.getOperand(1);
+
+ // fold (A+B)-A -> B
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
+ return N0.getOperand(1);
+
+ // fold (A+B)-B -> A
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
+ return N0.getOperand(0);
+
+ // fold (A+C1)-C2 -> A+(C1-C2)
+ if (N0.getOpcode() == ISD::ADD) {
+ SDValue N01 = N0.getOperand(1);
+ if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N01, N1}))
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);
+ }
+
+ // fold C2-(A+C1) -> (C2-C1)-A
+ if (N1.getOpcode() == ISD::ADD) {
+ SDValue N11 = N1.getOperand(1);
+ if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11}))
+ return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
+ }
+
+ // fold (A-C1)-C2 -> A-(C1+C2)
+ if (N0.getOpcode() == ISD::SUB) {
+ SDValue N01 = N0.getOperand(1);
+ if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N01, N1}))
+ return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);
+ }
+
+ // fold (c1-A)-c2 -> (c1-c2)-A
+ if (N0.getOpcode() == ISD::SUB) {
+ SDValue N00 = N0.getOperand(0);
+ if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N00, N1}))
+ return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
+ }
+
+ // fold ((A+(B+or-C))-B) -> A+or-C
+ if (N0.getOpcode() == ISD::ADD &&
+ (N0.getOperand(1).getOpcode() == ISD::SUB ||
+ N0.getOperand(1).getOpcode() == ISD::ADD) &&
+ N0.getOperand(1).getOperand(0) == N1)
+ return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
+ N0.getOperand(1).getOperand(1));
+
+ // fold ((A+(C+B))-B) -> A+C
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
+ N0.getOperand(1).getOperand(1) == N1)
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
+ N0.getOperand(1).getOperand(0));
+
+ // fold ((A-(B-C))-C) -> A-B
+ if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
+ N0.getOperand(1).getOperand(1) == N1)
+ return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
+ N0.getOperand(1).getOperand(0));
+
+ // fold (A-(B-C)) -> A+(C-B)
+ if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
+ return DAG.getNode(ISD::ADD, DL, VT, N0,
+ DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
+ N1.getOperand(0)));
+
+ // A - (A & B) -> A & (~B)
+ if (N1.getOpcode() == ISD::AND) {
+ SDValue A = N1.getOperand(0);
+ SDValue B = N1.getOperand(1);
+ if (A != N0)
+ std::swap(A, B);
+ if (A == N0 &&
+ (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) {
+ SDValue InvB =
+ DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT));
+ return DAG.getNode(ISD::AND, DL, VT, A, InvB);
+ }
+ }
+
+ // fold (X - (-Y * Z)) -> (X + (Y * Z))
+ if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
+ if (N1.getOperand(0).getOpcode() == ISD::SUB &&
+ isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
+ N1.getOperand(0).getOperand(1),
+ N1.getOperand(1));
+ return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
+ }
+ if (N1.getOperand(1).getOpcode() == ISD::SUB &&
+ isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
+ N1.getOperand(0),
+ N1.getOperand(1).getOperand(1));
+ return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
+ }
+ }
+
+ // If either operand of a sub is undef, the result is undef
+ if (N0.isUndef())
+ return N0;
+ if (N1.isUndef())
+ return N1;
+
+ if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
+ return V;
+
+ if (SDValue V = foldAddSubOfSignBit(N, DAG))
+ return V;
+
+ if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
+ return V;
+
+ if (SDValue V = foldSubToUSubSat(VT, N))
+ return V;
+
+ // (x - y) - 1 -> add (xor y, -1), x
+ if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && isOneOrOneSplat(N1)) {
+ SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
+ DAG.getAllOnesConstant(DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
+ }
+
+ // Look for:
+ // sub y, (xor x, -1)
+ // And if the target does not like this form then turn into:
+ // add (add x, y), 1
+ if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0));
+ return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT));
+ }
+
+ // Hoist one-use addition by non-opaque constant:
+ // (x + C) - y -> (x - y) + C
+ if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
+ isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
+ return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1));
+ }
+ // y - (x + C) -> (y - x) - C
+ if (N1.getOpcode() == ISD::ADD && N1.hasOneUse() &&
+ isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) {
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1));
+ }
+ // (x - C) - y -> (x - y) - C
+ // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
+ if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
+ isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
+ return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1));
+ }
+ // (C - x) - y -> C - (x + y)
+ if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
+ isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1);
+ return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add);
+ }
+
+ // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1'
+ // rather than 'sub 0/1' (the sext should get folded).
+ // sub X, (zext i1 Y) --> add X, (sext i1 Y)
+ if (N1.getOpcode() == ISD::ZERO_EXTEND &&
+ N1.getOperand(0).getScalarValueSizeInBits() == 1 &&
+ TLI.getBooleanContents(VT) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent) {
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0));
+ return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
+ }
+
+ // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
+ if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
+ if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
+ SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
+ SDValue S0 = N1.getOperand(0);
+ if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0))
+ if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
+ if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1))
+ return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
+ }
+ }
+
+ // If the relocation model supports it, consider symbol offsets.
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
+ if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
+ // fold (sub Sym+c1, Sym+c2) -> c1-c2
+ if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
+ if (GA->getGlobal() == GB->getGlobal())
+ return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
+ DL, VT);
+ }
+
+ // sub X, (sextinreg Y i1) -> add X, (and Y 1)
+ if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
+ if (TN->getVT() == MVT::i1) {
+ SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
+ DAG.getConstant(1, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
+ }
+ }
+
+ // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
+ if (N1.getOpcode() == ISD::VSCALE && N1.hasOneUse()) {
+ const APInt &IntVal = N1.getConstantOperandAPInt(0);
+ return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
+ }
+
+ // canonicalize (sub X, step_vector(C)) to (add X, step_vector(-C))
+ if (N1.getOpcode() == ISD::STEP_VECTOR && N1.hasOneUse()) {
+ APInt NewStep = -N1.getConstantOperandAPInt(0);
+ return DAG.getNode(ISD::ADD, DL, VT, N0,
+ DAG.getStepVector(DL, VT, NewStep));
+ }
+
+ // Prefer an add for more folding potential and possibly better codegen:
+ // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
+ if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
+ SDValue ShAmt = N1.getOperand(1);
+ ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
+ if (ShAmtC &&
+ ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) {
+ SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
+ return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
+ }
+ }
+
+ // As with the previous fold, prefer add for more folding potential.
+ // Subtracting SMIN/0 is the same as adding SMIN/0:
+ // N0 - (X << BW-1) --> N0 + (X << BW-1)
+ if (N1.getOpcode() == ISD::SHL) {
+ ConstantSDNode *ShlC = isConstOrConstSplat(N1.getOperand(1));
+ if (ShlC && ShlC->getAPIntValue() == VT.getScalarSizeInBits() - 1)
+ return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
+ }
+
+ // (sub (usubo_carry X, 0, Carry), Y) -> (usubo_carry X, Y, Carry)
+ if (N0.getOpcode() == ISD::USUBO_CARRY && isNullConstant(N0.getOperand(1)) &&
+ N0.getResNo() == 0 && N0.hasOneUse())
+ return DAG.getNode(ISD::USUBO_CARRY, DL, N0->getVTList(),
+ N0.getOperand(0), N1, N0.getOperand(2));
+
+ if (TLI.isOperationLegalOrCustom(ISD::UADDO_CARRY, VT)) {
+ // (sub Carry, X) -> (uaddo_carry (sub 0, X), 0, Carry)
+ if (SDValue Carry = getAsCarry(TLI, N0)) {
+ SDValue X = N1;
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
+ return DAG.getNode(ISD::UADDO_CARRY, DL,
+ DAG.getVTList(VT, Carry.getValueType()), NegX, Zero,
+ Carry);
+ }
+ }
+
+ // If there's no chance of borrowing from adjacent bits, then sub is xor:
+ // sub C0, X --> xor X, C0
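+ // e.g. if C0 == 0b1010 and X is known to have set bits only within 0b1010,
+ // then 0b1010 - 0b0010 == 0b1000 == 0b1010 ^ 0b0010.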
+ if (ConstantSDNode *C0 = isConstOrConstSplat(N0)) {
+ if (!C0->isOpaque()) {
+ const APInt &C0Val = C0->getAPIntValue();
+ const APInt &MaybeOnes = ~DAG.computeKnownBits(N1).Zero;
+ if ((C0Val - MaybeOnes) == (C0Val ^ MaybeOnes))
+ return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
+ }
+ }
+
+ // max(a,b) - min(a,b) --> abd(a,b)
+ auto MatchSubMaxMin = [&](unsigned Max, unsigned Min, unsigned Abd) {
+ if (N0.getOpcode() != Max || N1.getOpcode() != Min)
+ return SDValue();
+ if ((N0.getOperand(0) != N1.getOperand(0) ||
+ N0.getOperand(1) != N1.getOperand(1)) &&
+ (N0.getOperand(0) != N1.getOperand(1) ||
+ N0.getOperand(1) != N1.getOperand(0)))
+ return SDValue();
+ if (!hasOperation(Abd, VT))
+ return SDValue();
+ return DAG.getNode(Abd, DL, VT, N0.getOperand(0), N0.getOperand(1));
+ };
+ if (SDValue R = MatchSubMaxMin(ISD::SMAX, ISD::SMIN, ISD::ABDS))
+ return R;
+ if (SDValue R = MatchSubMaxMin(ISD::UMAX, ISD::UMIN, ISD::ABDU))
+ return R;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ bool IsSigned = Opcode == ISD::SSUBSAT;
+ SDLoc DL(N);
+
+ // fold (sub_sat x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ // fold (sub_sat x, x) -> 0
+ if (N0 == N1)
+ return DAG.getConstant(0, DL, VT);
+
+ // fold (sub_sat c1, c2) -> c3
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+ return C;
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (sub_sat x, 0) -> x, vector edition
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ return N0;
+ }
+
+ // fold (sub_sat x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+
+ // If it cannot overflow, transform into a sub.
+ if (DAG.computeOverflowForSub(IsSigned, N0, N1) == SelectionDAG::OFK_Never)
+ return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUBC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ SDLoc DL(N);
+
+ // If the flag result is dead, turn this into an SUB.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ // fold (subc x, x) -> 0 + no borrow
+ if (N0 == N1)
+ return CombineTo(N, DAG.getConstant(0, DL, VT),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ // fold (subc x, 0) -> x + no borrow
+ if (isNullConstant(N1))
+ return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
+ if (isAllOnesConstant(N0))
+ return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUBO(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ bool IsSigned = (ISD::SSUBO == N->getOpcode());
+
+ EVT CarryVT = N->getValueType(1);
+ SDLoc DL(N);
+
+ // If the flag result is dead, turn this into an SUB.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
+ DAG.getUNDEF(CarryVT));
+
+ // fold (subo x, x) -> 0 + no borrow
+ if (N0 == N1)
+ return CombineTo(N, DAG.getConstant(0, DL, VT),
+ DAG.getConstant(0, DL, CarryVT));
+
+ ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
+
+ // fold (subo x, c) -> (addo x, -c)
+ if (IsSigned && N1C && !N1C->isMinSignedValue()) {
+ return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
+ DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
+ }
+
+ // fold (subo x, 0) -> x + no borrow
+ if (isNullOrNullSplat(N1))
+ return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
+
+ // If it cannot overflow, transform into a sub.
+ if (DAG.computeOverflowForSub(IsSigned, N0, N1) == SelectionDAG::OFK_Never)
+ return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
+ DAG.getConstant(0, DL, CarryVT));
+
+ // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
+ if (!IsSigned && isAllOnesOrAllOnesSplat(N0))
+ return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
+ DAG.getConstant(0, DL, CarryVT));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUBE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+
+ // fold (sube x, y, false) -> (subc x, y)
+ if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
+ return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUSUBO_CARRY(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+
+ // fold (usubo_carry x, y, false) -> (usubo x, y)
+ if (isNullConstant(CarryIn)) {
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
+ return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSSUBO_CARRY(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+
+ // fold (ssubo_carry x, y, false) -> (ssubo x, y)
+ if (isNullConstant(CarryIn)) {
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::SSUBO, N->getValueType(0)))
+ return DAG.getNode(ISD::SSUBO, SDLoc(N), N->getVTList(), N0, N1);
+ }
+
+ return SDValue();
+}
+
+// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and
+// UMULFIXSAT here.
+SDValue DAGCombiner::visitMULFIX(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Scale = N->getOperand(2);
+ EVT VT = N0.getValueType();
+
+ // fold (mulfix x, undef, scale) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // Canonicalize constant to RHS (vector doesn't have to splat)
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);
+
+ // fold (mulfix x, 0, scale) -> 0
+ if (isNullConstant(N1))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMUL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ SDLoc DL(N);
+
+ // fold (mul x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ // fold (mul c1, c2) -> c1*c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS (vector doesn't have to splat)
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::MUL, DL, VT, N1, N0);
+
+ bool N1IsConst = false;
+ bool N1IsOpaqueConst = false;
+ APInt ConstValue1;
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
+ assert((!N1IsConst ||
+ ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
+ "Splat APInt should be element width");
+ } else {
+ N1IsConst = isa<ConstantSDNode>(N1);
+ if (N1IsConst) {
+ ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
+ N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
+ }
+ }
+
+ // fold (mul x, 0) -> 0
+ if (N1IsConst && ConstValue1.isZero())
+ return N1;
+
+ // fold (mul x, 1) -> x
+ if (N1IsConst && ConstValue1.isOne())
+ return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // fold (mul x, -1) -> 0-x
+ if (N1IsConst && ConstValue1.isAllOnes())
+ return DAG.getNegative(N0, DL, VT);
+
+ // fold (mul x, (1 << c)) -> x << c
+ if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+ DAG.isKnownToBeAPowerOfTwo(N1) &&
+ (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
+ SDValue LogBase2 = BuildLogBase2(N1, DL);
+ EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+ SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
+ return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
+ }
+
+ // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
+ if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isNegatedPowerOf2()) {
+ unsigned Log2Val = (-ConstValue1).logBase2();
+ EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+
+ // FIXME: If the input is something that is easily negated (e.g. a
+ // single-use add), we should put the negate there.
+ return DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT),
+ DAG.getNode(ISD::SHL, DL, VT, N0,
+ DAG.getConstant(Log2Val, DL, ShiftVT)));
+ }
+
+ // Attempt to reuse an existing umul_lohi/smul_lohi node, but only if the
+ // hi result is in use in case we hit this mid-legalization.
+ for (unsigned LoHiOpc : {ISD::UMUL_LOHI, ISD::SMUL_LOHI}) {
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(LoHiOpc, VT)) {
+ SDVTList LoHiVT = DAG.getVTList(VT, VT);
+ // TODO: Can we match commutable operands with getNodeIfExists?
+ if (SDNode *LoHi = DAG.getNodeIfExists(LoHiOpc, LoHiVT, {N0, N1}))
+ if (LoHi->hasAnyUseOfValue(1))
+ return SDValue(LoHi, 0);
+ if (SDNode *LoHi = DAG.getNodeIfExists(LoHiOpc, LoHiVT, {N1, N0}))
+ if (LoHi->hasAnyUseOfValue(1))
+ return SDValue(LoHi, 0);
+ }
+ }
+
+ // Try to transform:
+ // (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
+ // mul x, (2^N + 1) --> add (shl x, N), x
+ // mul x, (2^N - 1) --> sub (shl x, N), x
+ // Examples: x * 33 --> (x << 5) + x
+ // x * 15 --> (x << 4) - x
+ // x * -33 --> -((x << 5) + x)
+ // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
+ // (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub.
+ // mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M))
+ // mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M))
+ // Examples: x * 0x8800 --> (x << 15) + (x << 11)
+ // x * 0xf800 --> (x << 16) - (x << 11)
+ // x * -0x8800 --> -((x << 15) + (x << 11))
+ // x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
+ if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
+ // TODO: We could handle more general decomposition of any constant by
+ // having the target set a limit on number of ops and making a
+ // callback to determine that sequence (similar to sqrt expansion).
+ unsigned MathOp = ISD::DELETED_NODE;
+ APInt MulC = ConstValue1.abs();
+ // The constant `2` should be treated as (2^0 + 1).
+ unsigned TZeros = MulC == 2 ? 0 : MulC.countr_zero();
+ MulC.lshrInPlace(TZeros);
+ if ((MulC - 1).isPowerOf2())
+ MathOp = ISD::ADD;
+ else if ((MulC + 1).isPowerOf2())
+ MathOp = ISD::SUB;
+
+ if (MathOp != ISD::DELETED_NODE) {
+ unsigned ShAmt =
+ MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2();
+ ShAmt += TZeros;
+ assert(ShAmt < VT.getScalarSizeInBits() &&
+ "multiply-by-constant generated out of bounds shift");
+ SDValue Shl =
+ DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT));
+ SDValue R =
+ TZeros ? DAG.getNode(MathOp, DL, VT, Shl,
+ DAG.getNode(ISD::SHL, DL, VT, N0,
+ DAG.getConstant(TZeros, DL, VT)))
+ : DAG.getNode(MathOp, DL, VT, Shl, N0);
+ if (ConstValue1.isNegative())
+ R = DAG.getNegative(R, DL, VT);
+ return R;
+ }
+ }
+
+ // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
+ if (N0.getOpcode() == ISD::SHL) {
+ SDValue N01 = N0.getOperand(1);
+ if (SDValue C3 = DAG.FoldConstantArithmetic(ISD::SHL, DL, VT, {N1, N01}))
+ return DAG.getNode(ISD::MUL, DL, VT, N0.getOperand(0), C3);
+ }
+
+ // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
+ // use.
+ {
+ SDValue Sh, Y;
+
+ // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
+ if (N0.getOpcode() == ISD::SHL &&
+ isConstantOrConstantVector(N0.getOperand(1)) && N0->hasOneUse()) {
+ Sh = N0; Y = N1;
+ } else if (N1.getOpcode() == ISD::SHL &&
+ isConstantOrConstantVector(N1.getOperand(1)) &&
+ N1->hasOneUse()) {
+ Sh = N1; Y = N0;
+ }
+
+ if (Sh.getNode()) {
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, Sh.getOperand(0), Y);
+ return DAG.getNode(ISD::SHL, DL, VT, Mul, Sh.getOperand(1));
+ }
+ }
+
+ // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
+ if (N0.getOpcode() == ISD::ADD &&
+ DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
+ DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
+ isMulAddWithConstProfitable(N, N0, N1))
+ return DAG.getNode(
+ ISD::ADD, DL, VT,
+ DAG.getNode(ISD::MUL, SDLoc(N0), VT, N0.getOperand(0), N1),
+ DAG.getNode(ISD::MUL, SDLoc(N1), VT, N0.getOperand(1), N1));
+
+ // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)).
+ ConstantSDNode *NC1 = isConstOrConstSplat(N1);
+ if (N0.getOpcode() == ISD::VSCALE && NC1) {
+ const APInt &C0 = N0.getConstantOperandAPInt(0);
+ const APInt &C1 = NC1->getAPIntValue();
+ return DAG.getVScale(DL, VT, C0 * C1);
+ }
+
+ // Fold (mul step_vector(C0), C1) to (step_vector(C0 * C1)).
+ APInt MulVal;
+ if (N0.getOpcode() == ISD::STEP_VECTOR &&
+ ISD::isConstantSplatVector(N1.getNode(), MulVal)) {
+ const APInt &C0 = N0.getConstantOperandAPInt(0);
+ APInt NewStep = C0 * MulVal;
+ return DAG.getStepVector(DL, VT, NewStep);
+ }
+
+ // Fold (mul x, 0/undef) -> 0 and (mul x, 1) -> x
+ // -> and(x, mask)
+ // We can replace vectors with '0' and '1' factors with a clearing mask.
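+ // e.g. (mul <4 x i32> X, <1, 0, 1, undef>) -> (and X, <-1, 0, -1, 0>)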
+ if (VT.isFixedLengthVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallBitVector ClearMask;
+ ClearMask.reserve(NumElts);
+ auto IsClearMask = [&ClearMask](ConstantSDNode *V) {
+ if (!V || V->isZero()) {
+ ClearMask.push_back(true);
+ return true;
+ }
+ ClearMask.push_back(false);
+ return V->isOne();
+ };
+ if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::AND, VT)) &&
+ ISD::matchUnaryPredicate(N1, IsClearMask, /*AllowUndefs*/ true)) {
+ assert(N1.getOpcode() == ISD::BUILD_VECTOR && "Unknown constant vector");
+ EVT LegalSVT = N1.getOperand(0).getValueType();
+ SDValue Zero = DAG.getConstant(0, DL, LegalSVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, LegalSVT);
+ SmallVector<SDValue, 16> Mask(NumElts, AllOnes);
+ for (unsigned I = 0; I != NumElts; ++I)
+ if (ClearMask[I])
+ Mask[I] = Zero;
+ return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getBuildVector(VT, DL, Mask));
+ }
+ }
+
+ // reassociate mul
+ if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
+ return RMUL;
+
+ // Fold mul(vecreduce(x), vecreduce(y)) -> vecreduce(mul(x, y))
+ if (SDValue SD =
+ reassociateReduction(ISD::VECREDUCE_MUL, ISD::MUL, DL, VT, N0, N1))
+ return SD;
+
+ // Simplify the operands using demanded-bits information.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// Return true if divmod libcall is available.
+static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
+ const TargetLowering &TLI) {
+ RTLIB::Libcall LC;
+ EVT NodeType = Node->getValueType(0);
+ if (!NodeType.isSimple())
+ return false;
+ switch (NodeType.getSimpleVT().SimpleTy) {
+ default: return false; // No libcall for vector types.
+ case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
+ case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+ case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+ case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+ case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
+ }
+
+ return TLI.getLibcallName(LC) != nullptr;
+}
+
+/// Issue divrem if both quotient and remainder are needed.
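+ /// e.g. if both (sdiv X, Y) and (srem X, Y) are live, replace them with the
+ /// two results of a single (sdivrem X, Y) node.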
+SDValue DAGCombiner::useDivRem(SDNode *Node) {
+ if (Node->use_empty())
+ return SDValue(); // This is a dead node, leave it alone.
+
+ unsigned Opcode = Node->getOpcode();
+ bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+
+ // DivMod lib calls can still work on non-legal types if using lib-calls.
+ EVT VT = Node->getValueType(0);
+ if (VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
+ return SDValue();
+
+ // If DIVREM is going to get expanded into a libcall,
+ // but there is no libcall available, then don't combine.
+ if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
+ !isDivRemLibcallAvailable(Node, isSigned, TLI))
+ return SDValue();
+
+ // If div is legal, it's better to do the normal expansion
+ unsigned OtherOpcode = 0;
+ if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
+ OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
+ if (TLI.isOperationLegalOrCustom(Opcode, VT))
+ return SDValue();
+ } else {
+ OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+ if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
+ return SDValue();
+ }
+
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue combined;
+ for (SDNode *User : Op0->uses()) {
+ if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
+ User->use_empty())
+ continue;
+ // Convert the other matching node(s), too;
+ // otherwise, the DIVREM may get target-legalized into something
+ // target-specific that we won't be able to recognize.
+ unsigned UserOpc = User->getOpcode();
+ if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
+ User->getOperand(0) == Op0 &&
+ User->getOperand(1) == Op1) {
+ if (!combined) {
+ if (UserOpc == OtherOpcode) {
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
+ } else if (UserOpc == DivRemOpc) {
+ combined = SDValue(User, 0);
+ } else {
+ assert(UserOpc == Opcode);
+ continue;
+ }
+ }
+ if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
+ CombineTo(User, combined);
+ else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
+ CombineTo(User, combined.getValue(1));
+ }
+ }
+ return combined;
+}
+
+static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ unsigned Opc = N->getOpcode();
+ bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+
+ // X / undef -> undef
+ // X % undef -> undef
+ // X / 0 -> undef
+ // X % 0 -> undef
+ // NOTE: This includes vectors where any divisor element is zero/undef.
+ if (DAG.isUndef(Opc, {N0, N1}))
+ return DAG.getUNDEF(VT);
+
+ // undef / X -> 0
+ // undef % X -> 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ // 0 / X -> 0
+ // 0 % X -> 0
+ ConstantSDNode *N0C = isConstOrConstSplat(N0);
+ if (N0C && N0C->isZero())
+ return N0;
+
+ // X / X -> 1
+ // X % X -> 0
+ if (N0 == N1)
+ return DAG.getConstant(IsDiv ? 1 : 0, DL, VT);
+
+ // X / 1 -> X
+ // X % 1 -> 0
+ // If this is a boolean op (single-bit element type), we can't have
+ // division-by-zero or remainder-by-zero, so assume the divisor is 1.
+ // TODO: Similarly, if we're zero-extending a boolean divisor, then assume
+ // it's a 1.
+ if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
+ return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(VT);
+ SDLoc DL(N);
+
+ // fold (sdiv c1, c2) -> c1/c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
+ return C;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (sdiv X, -1) -> 0-X
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N1C && N1C->isAllOnes())
+ return DAG.getNegative(N0, DL, VT);
+
+ // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
+ if (N1C && N1C->isMinSignedValue())
+ return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
+ DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT));
+
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // If we know the sign bits of both operands are zero, strength reduce to a
+ // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
+ if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);
+
+ if (SDValue V = visitSDIVLike(N0, N1, N)) {
+ // If the corresponding remainder node exists, update its users with
+ // (Dividend - (Quotient * Divisor)).
+ if (SDNode *RemNode = DAG.getNodeIfExists(ISD::SREM, N->getVTList(),
+ { N0, N1 })) {
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
+ AddToWorklist(Mul.getNode());
+ AddToWorklist(Sub.getNode());
+ CombineTo(RemNode, Sub);
+ }
+ return V;
+ }
+
+ // sdiv, srem -> sdivrem
+ // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
+ // true. Otherwise, we break the simplification logic in visitREM().
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue DivRem = useDivRem(N))
+ return DivRem;
+
+ return SDValue();
+}
+
+static bool isDivisorPowerOfTwo(SDValue Divisor) {
+ // Helper for determining whether a value is a power-2 constant scalar or a
+ // vector of such elements.
+ auto IsPowerOfTwo = [](ConstantSDNode *C) {
+ if (C->isZero() || C->isOpaque())
+ return false;
+ if (C->getAPIntValue().isPowerOf2())
+ return true;
+ if (C->getAPIntValue().isNegatedPowerOf2())
+ return true;
+ return false;
+ };
+
+ return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo);
+}
+
+SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(VT);
+ unsigned BitWidth = VT.getScalarSizeInBits();
+
+ // fold (sdiv X, pow2) -> simple ops after legalize
+ // FIXME: We check for the exact bit here because the generic lowering gives
+ // better results in that case. The target-specific lowering should learn how
+ // to handle exact sdivs efficiently.
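+ // e.g. for (sdiv i32 X, 8) the generic expansion below computes
+ //   t = X + ((X >>s 31) >>u 29); result = t >>s 3
+ // with selects handling divisors of 1/-1 and negation for negative divisors.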
+ if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1)) {
+ // Target-specific implementation of sdiv x, pow2.
+ if (SDValue Res = BuildSDIVPow2(N))
+ return Res;
+
+ // Create constants that are functions of the shift amount value.
+ EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
+ SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
+ SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
+ C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
+ SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
+ if (!isConstantOrConstantVector(Inexact))
+ return SDValue();
+
+ // Splat the sign bit into the register
+ SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
+ DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
+ AddToWorklist(Sign.getNode());
+
+ // Add (N0 < 0) ? abs2 - 1 : 0;
+ SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
+ AddToWorklist(Srl.getNode());
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
+ AddToWorklist(Add.getNode());
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
+ AddToWorklist(Sra.getNode());
+
+ // Special case: (sdiv X, 1) -> X
+ // Special case: (sdiv X, -1) -> 0-X
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
+ SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
+ SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
+ SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
+ Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);
+
+ // If dividing by a positive value, we're done. Otherwise, the result must
+ // be negated.
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);
+
+ // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
+ SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
+ SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
+ return Res;
+ }
+
+ // If integer divide is expensive and we satisfy the requirements, emit an
+ // alternate sequence. Targets may check function attributes for size/speed
+ // trade-offs.
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isConstantOrConstantVector(N1) &&
+ !TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue Op = BuildSDIV(N))
+ return Op;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(VT);
+ SDLoc DL(N);
+
+ // fold (udiv c1, c2) -> c1/c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
+ return C;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (udiv X, -1) -> select(X == -1, 1, 0)
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N1C && N1C->isAllOnes() && CCVT.isVector() == VT.isVector()) {
+ return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
+ DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT));
+ }
+
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ if (SDValue V = visitUDIVLike(N0, N1, N)) {
+ // If the corresponding remainder node exists, update its users with
+ // (Dividend - (Quotient * Divisor)).
+ if (SDNode *RemNode = DAG.getNodeIfExists(ISD::UREM, N->getVTList(),
+ { N0, N1 })) {
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
+ AddToWorklist(Mul.getNode());
+ AddToWorklist(Sub.getNode());
+ CombineTo(RemNode, Sub);
+ }
+ return V;
+ }
+
+ // udiv, urem -> udivrem
+ // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
+ // true. Otherwise, we break the simplification logic in visitREM().
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue DivRem = useDivRem(N))
+ return DivRem;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // fold (udiv x, (1 << c)) -> x >>u c
+ if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+ DAG.isKnownToBeAPowerOfTwo(N1)) {
+ SDValue LogBase2 = BuildLogBase2(N1, DL);
+ AddToWorklist(LogBase2.getNode());
+
+ EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+ SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
+ AddToWorklist(Trunc.getNode());
+ return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
+ }
+
+ // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+ if (N1.getOpcode() == ISD::SHL) {
+ SDValue N10 = N1.getOperand(0);
+ if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
+ DAG.isKnownToBeAPowerOfTwo(N10)) {
+ SDValue LogBase2 = BuildLogBase2(N10, DL);
+ AddToWorklist(LogBase2.getNode());
+
+ EVT ADDVT = N1.getOperand(1).getValueType();
+ SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
+ AddToWorklist(Trunc.getNode());
+ SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
+ }
+ }
+
+ // fold (udiv x, c) -> alternate
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isConstantOrConstantVector(N1) &&
+ !TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue Op = BuildUDIV(N))
+ return Op;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N) {
+ if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1) &&
+ !DAG.doesNodeExist(ISD::SDIV, N->getVTList(), {N0, N1})) {
+ // Target-specific implementation of srem x, pow2.
+ if (SDValue Res = BuildSREMPow2(N))
+ return Res;
+ }
+ return SDValue();
+}
+
+ // Handles ISD::SREM and ISD::UREM.
+SDValue DAGCombiner::visitREM(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT CCVT = getSetCCResultType(VT);
+
+ bool isSigned = (Opcode == ISD::SREM);
+ SDLoc DL(N);
+
+ // fold (rem c1, c2) -> c1%c2
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+ return C;
+
+ // fold (urem X, -1) -> select(FX == -1, 0, FX)
+ // Freeze the numerator to avoid a miscompile with an undefined value.
+ if (!isSigned && llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false) &&
+ CCVT.isVector() == VT.isVector()) {
+ SDValue F0 = DAG.getFreeze(N0);
+ SDValue EqualsNeg1 = DAG.getSetCC(DL, CCVT, F0, N1, ISD::SETEQ);
+ return DAG.getSelect(DL, VT, EqualsNeg1, DAG.getConstant(0, DL, VT), F0);
+ }
+
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ if (isSigned) {
+ // If we know the sign bits of both operands are zero, strength reduce to a
+ // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
+ if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
+ } else {
+ if (DAG.isKnownToBeAPowerOfTwo(N1)) {
+ // fold (urem x, pow2) -> (and x, pow2-1)
+ SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::AND, DL, VT, N0, Add);
+ }
+ // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+ // fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
+ // TODO: We should sink the following into isKnownToBePowerOfTwo
+ // using a OrZero parameter analogous to our handling in ValueTracking.
+ if ((N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) &&
+ DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
+ SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::AND, DL, VT, N0, Add);
+ }
+ }
+
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+
+ // If X/C can be simplified by the division-by-constant logic, lower
+ // X%C to the equivalent of X-X/C*C.
+ // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
+ // speculative DIV must not cause a DIVREM conversion. We guard against this
+ // by skipping the simplification if isIntDivCheap(). When div is not cheap,
+ // combine will not return a DIVREM. Regardless, checking cheapness here
+ // makes sense since the simplification results in fatter code.
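+ // e.g. (urem X, 10) can become X - (X /u 10) * 10, where the udiv is
+ // typically expanded into a multiply-by-magic-constant sequence by BuildUDIV.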
+ if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
+ if (isSigned) {
+ // check if we can build faster implementation for srem
+ if (SDValue OptimizedRem = buildOptimizedSREM(N0, N1, N))
+ return OptimizedRem;
+ }
+
+ SDValue OptimizedDiv =
+ isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
+ if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) {
+ // If the equivalent Div node also exists, update its users.
+ unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+ if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(),
+ { N0, N1 }))
+ CombineTo(DivNode, OptimizedDiv);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
+ AddToWorklist(OptimizedDiv.getNode());
+ AddToWorklist(Mul.getNode());
+ return Sub;
+ }
+ }
+
+ // sdiv, srem -> sdivrem (and udiv, urem -> udivrem)
+ if (SDValue DivRem = useDivRem(N))
+ return DivRem.getValue(1);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // fold (mulhs c1, c2)
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0);
+
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (mulhs x, 0) -> 0
+ // do not return N1, because it may contain undef elements.
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ return DAG.getConstant(0, DL, VT);
+ }
+
+ // fold (mulhs x, 0) -> 0
+ if (isNullConstant(N1))
+ return N1;
+
+ // fold (mulhs x, 1) -> (sra x, size(x)-1)
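+ // (the high half of the sign-extended product x*1 is the sign bit of x
+ // replicated, i.e. x >>s (size(x)-1))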
+ if (isOneConstant(N1))
+ return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
+ DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
+ getShiftAmountTy(N0.getValueType())));
+
+ // fold (mulhs x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ // If the type twice as wide is legal, transform the mulhs to a wider multiply
+ // plus a shift.
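+ // e.g. an i16 mulhs on a target with a legal i32 multiply becomes
+ // trunc_i16(srl(mul(sext_i32(x), sext_i32(y)), 16)).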
+ if (!TLI.isOperationLegalOrCustom(ISD::MULHS, VT) && VT.isSimple() &&
+ !VT.isVector()) {
+ MVT Simple = VT.getSimpleVT();
+ unsigned SimpleSize = Simple.getSizeInBits();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+ if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+ N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
+ N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
+ N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
+ N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
+ DAG.getConstant(SimpleSize, DL,
+ getShiftAmountTy(N1.getValueType())));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHU(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // fold (mulhu c1, c2)
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0);
+
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (mulhu x, 0) -> 0
+ // do not return N1, because it may contain undef elements.
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ return DAG.getConstant(0, DL, VT);
+ }
+
+ // fold (mulhu x, 0) -> 0
+ if (isNullConstant(N1))
+ return N1;
+
+ // fold (mulhu x, 1) -> 0
+ if (isOneConstant(N1))
+ return DAG.getConstant(0, DL, N0.getValueType());
+
+ // fold (mulhu x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
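+ // e.g. for i32: (mulhu x, 16) -> (srl x, 28)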
+ if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
+ DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ SDValue LogBase2 = BuildLogBase2(N1, DL);
+ SDValue SRLAmt = DAG.getNode(
+ ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
+ EVT ShiftVT = getShiftAmountTy(N0.getValueType());
+ SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
+ return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
+ }
+
+ // If the type twice as wide is legal, transform the mulhu to a wider multiply
+ // plus a shift.
+ if (!TLI.isOperationLegalOrCustom(ISD::MULHU, VT) && VT.isSimple() &&
+ !VT.isVector()) {
+ MVT Simple = VT.getSimpleVT();
+ unsigned SimpleSize = Simple.getSizeInBits();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+ if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
+ N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
+ N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
+ DAG.getConstant(SimpleSize, DL,
+ getShiftAmountTy(N1.getValueType())));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ }
+ }
+
+ // Simplify the operands using demanded-bits information.
+ // We don't have demanded bits support for MULHU so this just enables constant
+ // folding based on known bits.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitAVG(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // fold (avg c1, c2)
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (avgfloor x, 0) -> x >> 1
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+ if (Opcode == ISD::AVGFLOORS)
+ return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
+ if (Opcode == ISD::AVGFLOORU)
+ return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
+ }
+ }
+
+ // fold (avg x, undef) -> x
+ if (N0.isUndef())
+ return N1;
+ if (N1.isUndef())
+ return N0;
+
+ // Fold (avg x, x) --> x
+ if (N0 == N1 && Level >= AfterLegalizeTypes)
+ return N0;
+
+ // TODO: If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitABD(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // fold (abd c1, c2)
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (abds x, 0) -> abs x
+ // fold (abdu x, 0) -> x
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+ if (Opcode == ISD::ABDS)
+ return DAG.getNode(ISD::ABS, DL, VT, N0);
+ if (Opcode == ISD::ABDU)
+ return N0;
+ }
+ }
+
+ // fold (abd x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ // fold (abds x, y) -> (abdu x, y) iff both args are known positive
+ if (Opcode == ISD::ABDS && hasOperation(ISD::ABDU, VT) &&
+ DAG.SignBitIsZero(N0) && DAG.SignBitIsZero(N1))
+ return DAG.getNode(ISD::ABDU, DL, VT, N1, N0);
+
+ return SDValue();
+}
+
+/// Perform optimizations common to nodes that compute two values. LoOp and HiOp
+ /// give the opcodes for the two computations that are being performed. Returns
+ /// the replacement value if a simplification was made, or an empty SDValue otherwise.
+SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+ unsigned HiOp) {
+ // If the high half is not needed, just compute the low half.
+ bool HiExists = N->hasAnyUseOfValue(1);
+ if (!HiExists && (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
+ SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
+ return CombineTo(N, Res, Res);
+ }
+
+ // If the low half is not needed, just compute the high half.
+ bool LoExists = N->hasAnyUseOfValue(0);
+ if (!LoExists && (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
+ SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
+ return CombineTo(N, Res, Res);
+ }
+
+ // If both halves are used, return as it is.
+ if (LoExists && HiExists)
+ return SDValue();
+
+ // If the two computed results can be simplified separately, separate them.
+ if (LoExists) {
+ SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
+ AddToWorklist(Lo.getNode());
+ SDValue LoOpt = combine(Lo.getNode());
+ if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
+ return CombineTo(N, LoOpt, LoOpt);
+ }
+
+ if (HiExists) {
+ SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
+ AddToWorklist(Hi.getNode());
+ SDValue HiOpt = combine(Hi.getNode());
+ if (HiOpt.getNode() && HiOpt != Hi &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
+ return CombineTo(N, HiOpt, HiOpt);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
+ if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
+ return Res;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // canonicalize constant to RHS (vector doesn't have to splat)
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::SMUL_LOHI, DL, N->getVTList(), N1, N0);
+
+ // If the type twice as wide is legal, transform the smul_lohi to a wider
+ // multiply plus a shift.
+ if (VT.isSimple() && !VT.isVector()) {
+ MVT Simple = VT.getSimpleVT();
+ unsigned SimpleSize = Simple.getSizeInBits();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+ if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+ SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
+ SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
+ Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
+ // Compute the high part (result value 1).
+ Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
+ DAG.getConstant(SimpleSize, DL,
+ getShiftAmountTy(Lo.getValueType())));
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
+ // Compute the low part (result value 0).
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
+ return CombineTo(N, Lo, Hi);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
+ if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
+ return Res;
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // canonicalize constant to RHS (vector doesn't have to splat)
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::UMUL_LOHI, DL, N->getVTList(), N1, N0);
+
+ // (umul_lohi N0, 0) -> (0, 0)
+ if (isNullConstant(N1)) {
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ return CombineTo(N, Zero, Zero);
+ }
+
+ // (umul_lohi N0, 1) -> (N0, 0)
+ if (isOneConstant(N1)) {
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ return CombineTo(N, N0, Zero);
+ }
+
+ // If the type twice as wide is legal, transform the umul_lohi to a wider
+ // multiply plus a shift.
+ if (VT.isSimple() && !VT.isVector()) {
+ MVT Simple = VT.getSimpleVT();
+ unsigned SimpleSize = Simple.getSizeInBits();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+ if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+ SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
+ SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
+ Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
+ // Compute the high part (result value 1).
+ Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
+ DAG.getConstant(SimpleSize, DL,
+ getShiftAmountTy(Lo.getValueType())));
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
+ // Compute the low part (result value 0).
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
+ return CombineTo(N, Lo, Hi);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMULO(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ bool IsSigned = (ISD::SMULO == N->getOpcode());
+
+ EVT CarryVT = N->getValueType(1);
+ SDLoc DL(N);
+
+ ConstantSDNode *N0C = isConstOrConstSplat(N0);
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+
+ // fold operation with constant operands.
+ // TODO: Move this to FoldConstantArithmetic when it supports nodes with
+ // multiple results.
+ if (N0C && N1C) {
+ bool Overflow;
+ APInt Result =
+ IsSigned ? N0C->getAPIntValue().smul_ov(N1C->getAPIntValue(), Overflow)
+ : N0C->getAPIntValue().umul_ov(N1C->getAPIntValue(), Overflow);
+ return CombineTo(N, DAG.getConstant(Result, DL, VT),
+ DAG.getBoolConstant(Overflow, DL, CarryVT, CarryVT));
+ }
+
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
+
+ // fold (mulo x, 0) -> 0 + no carry out
+ if (isNullOrNullSplat(N1))
+ return CombineTo(N, DAG.getConstant(0, DL, VT),
+ DAG.getConstant(0, DL, CarryVT));
+
+ // (mulo x, 2) -> (addo x, x)
+ // FIXME: This needs a freeze.
+ if (N1C && N1C->getAPIntValue() == 2 &&
+ (!IsSigned || VT.getScalarSizeInBits() > 2))
+ return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
+ N->getVTList(), N0, N0);
+
+ if (IsSigned) {
+ // A 1 bit SMULO overflows if both inputs are 1.
+ if (VT.getScalarSizeInBits() == 1) {
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, N1);
+ return CombineTo(N, And,
+ DAG.getSetCC(DL, CarryVT, And,
+ DAG.getConstant(0, DL, VT), ISD::SETNE));
+ }
+
+ // Multiplying n * m significant bits yields a result of n + m significant
+ // bits. If the total number of significant bits does not exceed the
+ // result bit width (minus 1), there is no overflow.
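+ // e.g. for i32 smulo: if both operands fit in i16 (>= 17 sign bits each),
+ // SignBits >= 34 > 33, so the full product fits in i32 and cannot overflow.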
+ unsigned SignBits = DAG.ComputeNumSignBits(N0);
+ if (SignBits > 1)
+ SignBits += DAG.ComputeNumSignBits(N1);
+ if (SignBits > VT.getScalarSizeInBits() + 1)
+ return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
+ DAG.getConstant(0, DL, CarryVT));
+ } else {
+ KnownBits N1Known = DAG.computeKnownBits(N1);
+ KnownBits N0Known = DAG.computeKnownBits(N0);
+ bool Overflow;
+ (void)N0Known.getMaxValue().umul_ov(N1Known.getMaxValue(), Overflow);
+ if (!Overflow)
+ return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
+ DAG.getConstant(0, DL, CarryVT));
+ }
+
+ return SDValue();
+}
+
+// Function to calculate whether the Min/Max pair of SDNodes (potentially
+// swapped around) make a signed saturate pattern, clamping to between a signed
+ // saturate of -2^(BW-1) and 2^(BW-1)-1, or an unsigned saturate of 0 and 2^BW-1.
+// Returns the node being clamped and the bitwidth of the clamp in BW. Should
+// work with both SMIN/SMAX nodes and setcc/select combo. The operands are the
+// same as SimplifySelectCC. N0<N1 ? N2 : N3.
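+ // e.g. a pattern equivalent to smin(smax(X, -128), 127) clamps X to the signed
+ // i8 range (BW == 8, Unsigned == false), and smin(smax(X, 0), 255) clamps X to
+ // the unsigned i8 range (BW == 8, Unsigned == true); X is returned in both cases.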
+static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
+ SDValue N3, ISD::CondCode CC, unsigned &BW,
+ bool &Unsigned, SelectionDAG &DAG) {
+ auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3,
+ ISD::CondCode CC) {
+ // The compare and select operand should be the same or the select operands
+ // should be truncated versions of the comparison.
+ if (N0 != N2 && (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0)))
+ return 0;
+ // The constants need to be the same or a truncated version of each other.
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ ConstantSDNode *N3C = isConstOrConstSplat(N3);
+ if (!N1C || !N3C)
+ return 0;
+ const APInt &C1 = N1C->getAPIntValue();
+ const APInt &C2 = N3C->getAPIntValue();
+ if (C1.getBitWidth() < C2.getBitWidth() || C1 != C2.sext(C1.getBitWidth()))
+ return 0;
+ return CC == ISD::SETLT ? ISD::SMIN : (CC == ISD::SETGT ? ISD::SMAX : 0);
+ };
+
+ // Check the initial value is a SMIN/SMAX equivalent.
+ unsigned Opcode0 = isSignedMinMax(N0, N1, N2, N3, CC);
+ if (!Opcode0)
+ return SDValue();
+
+ // We may only need one range check if the fptosi can never produce
+ // the upper value.
+ if (N0.getOpcode() == ISD::FP_TO_SINT && Opcode0 == ISD::SMAX) {
+ if (isNullOrNullSplat(N3)) {
+ EVT IntVT = N0.getValueType().getScalarType();
+ EVT FPVT = N0.getOperand(0).getValueType().getScalarType();
+ if (FPVT.isSimple()) {
+ Type *InputTy = FPVT.getTypeForEVT(*DAG.getContext());
+ const fltSemantics &Semantics = InputTy->getFltSemantics();
+ uint32_t MinBitWidth =
+ APFloatBase::semanticsIntSizeInBits(Semantics, /*isSigned*/ true);
+ if (IntVT.getSizeInBits() >= MinBitWidth) {
+ Unsigned = true;
+ BW = PowerOf2Ceil(MinBitWidth);
+ return N0;
+ }
+ }
+ }
+ }
+
+ SDValue N00, N01, N02, N03;
+ ISD::CondCode N0CC;
+ switch (N0.getOpcode()) {
+ case ISD::SMIN:
+ case ISD::SMAX:
+ N00 = N02 = N0.getOperand(0);
+ N01 = N03 = N0.getOperand(1);
+ N0CC = N0.getOpcode() == ISD::SMIN ? ISD::SETLT : ISD::SETGT;
+ break;
+ case ISD::SELECT_CC:
+ N00 = N0.getOperand(0);
+ N01 = N0.getOperand(1);
+ N02 = N0.getOperand(2);
+ N03 = N0.getOperand(3);
+ N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get();
+ break;
+ case ISD::SELECT:
+ case ISD::VSELECT:
+ if (N0.getOperand(0).getOpcode() != ISD::SETCC)
+ return SDValue();
+ N00 = N0.getOperand(0).getOperand(0);
+ N01 = N0.getOperand(0).getOperand(1);
+ N02 = N0.getOperand(1);
+ N03 = N0.getOperand(2);
+ N0CC = cast<CondCodeSDNode>(N0.getOperand(0).getOperand(2))->get();
+ break;
+ default:
+ return SDValue();
+ }
+
+ unsigned Opcode1 = isSignedMinMax(N00, N01, N02, N03, N0CC);
+ if (!Opcode1 || Opcode0 == Opcode1)
+ return SDValue();
+
+ ConstantSDNode *MinCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? N1 : N01);
+ ConstantSDNode *MaxCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? N01 : N1);
+ if (!MinCOp || !MaxCOp || MinCOp->getValueType(0) != MaxCOp->getValueType(0))
+ return SDValue();
+
+ const APInt &MinC = MinCOp->getAPIntValue();
+ const APInt &MaxC = MaxCOp->getAPIntValue();
+ APInt MinCPlus1 = MinC + 1;
+ if (-MaxC == MinCPlus1 && MinCPlus1.isPowerOf2()) {
+ BW = MinCPlus1.exactLogBase2() + 1;
+ Unsigned = false;
+ return N02;
+ }
+
+ if (MaxC == 0 && MinCPlus1.isPowerOf2()) {
+ BW = MinCPlus1.exactLogBase2();
+ Unsigned = true;
+ return N02;
+ }
+
+ return SDValue();
+}
+
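+// Try to fold a saturating clamp of an fp_to_sint into FP_TO_SINT_SAT (or
+// FP_TO_UINT_SAT when the clamp is unsigned), e.g. for an i32 result:
+// smin(smax(fptosi(X), -128), 127) --> sext(fp_to_sint_sat X, i8)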
+static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
+ SDValue N3, ISD::CondCode CC,
+ SelectionDAG &DAG) {
+ unsigned BW;
+ bool Unsigned;
+ SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned, DAG);
+ if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT)
+ return SDValue();
+ EVT FPVT = Fp.getOperand(0).getValueType();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW);
+ if (FPVT.isVector())
+ NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT,
+ FPVT.getVectorElementCount());
+ unsigned NewOpc = Unsigned ? ISD::FP_TO_UINT_SAT : ISD::FP_TO_SINT_SAT;
+ if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(NewOpc, FPVT, NewVT))
+ return SDValue();
+ SDLoc DL(Fp);
+ SDValue Sat = DAG.getNode(NewOpc, DL, NewVT, Fp.getOperand(0),
+ DAG.getValueType(NewVT.getScalarType()));
+ return DAG.getExtOrTrunc(!Unsigned, Sat, DL, N2->getValueType(0));
+}
+
+static SDValue PerformUMinFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
+ SDValue N3, ISD::CondCode CC,
+ SelectionDAG &DAG) {
+ // We are looking for UMIN(FPTOUI(X), (2^n)-1), which may have come via a
+ // select/vselect/select_cc. The two operands pairs for the select (N2/N3) may
+ // be truncated versions of the setcc (N0/N1).
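+ // For example: umin(fptoui(X), 255) --> zext(fp_to_uint_sat X, i8).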
+ if ((N0 != N2 &&
+ (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0))) ||
+ N0.getOpcode() != ISD::FP_TO_UINT || CC != ISD::SETULT)
+ return SDValue();
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ ConstantSDNode *N3C = isConstOrConstSplat(N3);
+ if (!N1C || !N3C)
+ return SDValue();
+ const APInt &C1 = N1C->getAPIntValue();
+ const APInt &C3 = N3C->getAPIntValue();
+ if (!(C1 + 1).isPowerOf2() || C1.getBitWidth() < C3.getBitWidth() ||
+ C1 != C3.zext(C1.getBitWidth()))
+ return SDValue();
+
+ unsigned BW = (C1 + 1).exactLogBase2();
+ EVT FPVT = N0.getOperand(0).getValueType();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW);
+ if (FPVT.isVector())
+ NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT,
+ FPVT.getVectorElementCount());
+ if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(ISD::FP_TO_UINT_SAT,
+ FPVT, NewVT))
+ return SDValue();
+
+ SDValue Sat =
+ DAG.getNode(ISD::FP_TO_UINT_SAT, SDLoc(N0), NewVT, N0.getOperand(0),
+ DAG.getValueType(NewVT.getScalarType()));
+ return DAG.getZExtOrTrunc(Sat, SDLoc(N0), N3.getValueType());
+}
+
+SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned Opcode = N->getOpcode();
+ SDLoc DL(N);
+
+ // fold operation with constant operands.
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+ return C;
+
+ // If the operands are the same, this is a no-op.
+ if (N0 == N1)
+ return N0;
+
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(Opcode, DL, VT, N1, N0);
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // If the sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
+ // Only do this if the current op isn't legal and the flipped is.
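+ // This is safe because when both sign bits are known zero, the signed and
+ // unsigned interpretations (and orderings) of the operands coincide, e.g.
+ // smax(x, y) == umax(x, y).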
+ if (!TLI.isOperationLegal(Opcode, VT) &&
+ (N0.isUndef() || DAG.SignBitIsZero(N0)) &&
+ (N1.isUndef() || DAG.SignBitIsZero(N1))) {
+ unsigned AltOpcode;
+ switch (Opcode) {
+ case ISD::SMIN: AltOpcode = ISD::UMIN; break;
+ case ISD::SMAX: AltOpcode = ISD::UMAX; break;
+ case ISD::UMIN: AltOpcode = ISD::SMIN; break;
+ case ISD::UMAX: AltOpcode = ISD::SMAX; break;
+ default: llvm_unreachable("Unknown MINMAX opcode");
+ }
+ if (TLI.isOperationLegal(AltOpcode, VT))
+ return DAG.getNode(AltOpcode, DL, VT, N0, N1);
+ }
+
+ if (Opcode == ISD::SMIN || Opcode == ISD::SMAX)
+ if (SDValue S = PerformMinMaxFpToSatCombine(
+ N0, N1, N0, N1, Opcode == ISD::SMIN ? ISD::SETLT : ISD::SETGT, DAG))
+ return S;
+ if (Opcode == ISD::UMIN)
+ if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N0, N1, ISD::SETULT, DAG))
+ return S;
+
+ // Fold min/max(vecreduce(x), vecreduce(y)) -> vecreduce(min/max(x, y))
+ auto ReductionOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::SMIN:
+ return ISD::VECREDUCE_SMIN;
+ case ISD::SMAX:
+ return ISD::VECREDUCE_SMAX;
+ case ISD::UMIN:
+ return ISD::VECREDUCE_UMIN;
+ case ISD::UMAX:
+ return ISD::VECREDUCE_UMAX;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+ };
+ if (SDValue SD = reassociateReduction(ReductionOpcode(Opcode), Opcode,
+ SDLoc(N), VT, N0, N1))
+ return SD;
+
+ // Simplify the operands using demanded-bits information.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// If this is a bitwise logic instruction and both operands have the same
+/// opcode, try to sink the other opcode after the logic instruction.
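+/// For example: and (zext X), (zext Y) --> zext (and X, Y).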
+SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned LogicOpcode = N->getOpcode();
+ unsigned HandOpcode = N0.getOpcode();
+ assert(ISD::isBitwiseLogicOp(LogicOpcode) && "Expected logic opcode");
+ assert(HandOpcode == N1.getOpcode() && "Bad input!");
+
+ // Bail early if none of these transforms apply.
+ if (N0.getNumOperands() == 0)
+ return SDValue();
+
+ // FIXME: We should check number of uses of the operands to not increase
+ // the instruction count for all transforms.
+
+ // Handle size-changing casts (or sign_extend_inreg).
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N1.getOperand(0);
+ EVT XVT = X.getValueType();
+ SDLoc DL(N);
+ if (ISD::isExtOpcode(HandOpcode) || ISD::isExtVecInRegOpcode(HandOpcode) ||
+ (HandOpcode == ISD::SIGN_EXTEND_INREG &&
+ N0.getOperand(1) == N1.getOperand(1))) {
+ // If both operands have other uses, this transform would create extra
+ // instructions without eliminating anything.
+ if (!N0.hasOneUse() && !N1.hasOneUse())
+ return SDValue();
+ // We need matching integer source types.
+ if (XVT != Y.getValueType())
+ return SDValue();
+ // Don't create an illegal op during or after legalization. Don't ever
+ // create an unsupported vector op.
+ if ((VT.isVector() || LegalOperations) &&
+ !TLI.isOperationLegalOrCustom(LogicOpcode, XVT))
+ return SDValue();
+ // Avoid infinite looping with PromoteIntBinOp.
+ // TODO: Should we apply desirable/legal constraints to all opcodes?
+ if ((HandOpcode == ISD::ANY_EXTEND ||
+ HandOpcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
+ LegalTypes && !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
+ return SDValue();
+ // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
+ SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+ if (HandOpcode == ISD::SIGN_EXTEND_INREG)
+ return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
+ return DAG.getNode(HandOpcode, DL, VT, Logic);
+ }
+
+ // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
+ if (HandOpcode == ISD::TRUNCATE) {
+ // If both operands have other uses, this transform would create extra
+ // instructions without eliminating anything.
+ if (!N0.hasOneUse() && !N1.hasOneUse())
+ return SDValue();
+ // We need matching source types.
+ if (XVT != Y.getValueType())
+ return SDValue();
+ // Don't create an illegal op during or after legalization.
+ if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
+ return SDValue();
+ // Be extra careful sinking truncate. If it's free, there's no benefit in
+ // widening a binop. Also, don't create a logic op on an illegal type.
+ if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
+ return SDValue();
+ if (!TLI.isTypeLegal(XVT))
+ return SDValue();
+ SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+ return DAG.getNode(HandOpcode, DL, VT, Logic);
+ }
+
+ // For binops SHL/SRL/SRA/AND:
+ // logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
+ if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
+ HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
+ N0.getOperand(1) == N1.getOperand(1)) {
+ // If either operand has other uses, this transform is not an improvement.
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+ SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+ return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
+ }
+
+ // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
+ if (HandOpcode == ISD::BSWAP) {
+ // If either operand has other uses, this transform is not an improvement.
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+ SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+ return DAG.getNode(HandOpcode, DL, VT, Logic);
+ }
+
+ // For funnel shifts FSHL/FSHR:
+ // logic_op (OP x, x1, s), (OP y, y1, s) -->
+ // --> OP (logic_op x, y), (logic_op, x1, y1), s
+ if ((HandOpcode == ISD::FSHL || HandOpcode == ISD::FSHR) &&
+ N0.getOperand(2) == N1.getOperand(2)) {
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+ SDValue X1 = N0.getOperand(1);
+ SDValue Y1 = N1.getOperand(1);
+ SDValue S = N0.getOperand(2);
+ SDValue Logic0 = DAG.getNode(LogicOpcode, DL, VT, X, Y);
+ SDValue Logic1 = DAG.getNode(LogicOpcode, DL, VT, X1, Y1);
+ return DAG.getNode(HandOpcode, DL, VT, Logic0, Logic1, S);
+ }
+
+ // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
+ // Only perform this optimization up until type legalization, before
+ // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
+ // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
+ // we don't want to undo this promotion.
+ // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
+ // on scalars.
+ if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
+ Level <= AfterLegalizeTypes) {
+ // Input types must be integer and the same.
+ if (XVT.isInteger() && XVT == Y.getValueType() &&
+ !(VT.isVector() && TLI.isTypeLegal(VT) &&
+ !XVT.isVector() && !TLI.isTypeLegal(XVT))) {
+ SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
+ return DAG.getNode(HandOpcode, DL, VT, Logic);
+ }
+ }
+
+ // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
+ // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
+ // If both shuffles use the same mask, and both shuffle within a single
+ // vector, then it is worthwhile to move the swizzle after the operation.
+ // The type-legalizer generates this pattern when loading illegal
+ // vector types from memory. In many cases this allows additional shuffle
+ // optimizations.
+ // There are other cases where moving the shuffle after the xor/and/or
+ // is profitable even if shuffles don't perform a swizzle.
+ // If both shuffles use the same mask, and both shuffles have the same first
+ // or second operand, then it might still be profitable to move the shuffle
+ // after the xor/and/or operation.
+ if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
+ auto *SVN0 = cast<ShuffleVectorSDNode>(N0);
+ auto *SVN1 = cast<ShuffleVectorSDNode>(N1);
+ assert(X.getValueType() == Y.getValueType() &&
+ "Inputs to shuffles are not the same type");
+
+ // Check that both shuffles use the same mask. The masks are known to be of
+ // the same length because the result vector type is the same.
+ // Check also that shuffles have only one use to avoid introducing extra
+ // instructions.
+ if (!SVN0->hasOneUse() || !SVN1->hasOneUse() ||
+ !SVN0->getMask().equals(SVN1->getMask()))
+ return SDValue();
+
+ // Don't try to fold this node if it requires introducing a
+ // build vector of all zeros that might be illegal at this stage.
+ SDValue ShOp = N0.getOperand(1);
+ if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
+ ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
+
+ // (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C)
+ if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
+ SDValue Logic = DAG.getNode(LogicOpcode, DL, VT,
+ N0.getOperand(0), N1.getOperand(0));
+ return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask());
+ }
+
+ // Don't try to fold this node if it requires introducing a
+ // build vector of all zeros that might be illegal at this stage.
+ ShOp = N0.getOperand(0);
+ if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
+ ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
+
+ // (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B))
+ if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) {
+ SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1),
+ N1.getOperand(1));
+ return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask());
+ }
+ }
+
+ return SDValue();
+}
+
+/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
+SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
+ const SDLoc &DL) {
+ SDValue LL, LR, RL, RR, N0CC, N1CC;
+ if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
+ !isSetCCEquivalent(N1, RL, RR, N1CC))
+ return SDValue();
+
+ assert(N0.getValueType() == N1.getValueType() &&
+ "Unexpected operand types for bitwise logic op");
+ assert(LL.getValueType() == LR.getValueType() &&
+ RL.getValueType() == RR.getValueType() &&
+ "Unexpected operand types for setcc");
+
+ // If we're here post-legalization or the logic op type is not i1, the logic
+ // op type must match a setcc result type. Also, all folds require new
+ // operations on the left and right operands, so those types must match.
+ EVT VT = N0.getValueType();
+ EVT OpVT = LL.getValueType();
+ if (LegalOperations || VT.getScalarType() != MVT::i1)
+ if (VT != getSetCCResultType(OpVT))
+ return SDValue();
+ if (OpVT != RL.getValueType())
+ return SDValue();
+
+ ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
+ ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
+ bool IsInteger = OpVT.isInteger();
+ if (LR == RR && CC0 == CC1 && IsInteger) {
+ bool IsZero = isNullOrNullSplat(LR);
+ bool IsNeg1 = isAllOnesOrAllOnesSplat(LR);
+
+ // All bits clear?
+ bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
+ // All sign bits clear?
+ bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
+ // Any bits set?
+ bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
+ // Any sign bits set?
+ bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
+
+ // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
+ // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
+ // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
+ // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
+ if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
+ SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
+ AddToWorklist(Or.getNode());
+ return DAG.getSetCC(DL, VT, Or, LR, CC1);
+ }
+
+ // All bits set?
+ bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
+ // All sign bits set?
+ bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
+ // Any bits clear?
+ bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
+ // Any sign bits clear?
+ bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
+
+ // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
+ // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
+ // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
+ // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1)
+ if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
+ AddToWorklist(And.getNode());
+ return DAG.getSetCC(DL, VT, And, LR, CC1);
+ }
+ }
+
+ // TODO: What is the 'or' equivalent of this fold?
+ // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
+ if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
+ IsInteger && CC0 == ISD::SETNE &&
+ ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
+ (isAllOnesConstant(LR) && isNullConstant(RR)))) {
+ SDValue One = DAG.getConstant(1, DL, OpVT);
+ SDValue Two = DAG.getConstant(2, DL, OpVT);
+ SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
+ AddToWorklist(Add.getNode());
+ return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
+ }
+
+ // Try more general transforms if the predicates match and the only user of
+ // the compares is the 'and' or 'or'.
+ if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
+ N0.hasOneUse() && N1.hasOneUse()) {
+ // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
+ // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
+ if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
+ SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
+ SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
+ SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
+ SDValue Zero = DAG.getConstant(0, DL, OpVT);
+ return DAG.getSetCC(DL, VT, Or, Zero, CC1);
+ }
+
+ // Turn compare of constants whose difference is 1 bit into add+and+setcc.
+ if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) {
+ // Match a shared variable operand and 2 non-opaque constant operands.
+ auto MatchDiffPow2 = [&](ConstantSDNode *C0, ConstantSDNode *C1) {
+ // The difference of the constants must be a single bit.
+ const APInt &CMax =
+ APIntOps::umax(C0->getAPIntValue(), C1->getAPIntValue());
+ const APInt &CMin =
+ APIntOps::umin(C0->getAPIntValue(), C1->getAPIntValue());
+ return !C0->isOpaque() && !C1->isOpaque() && (CMax - CMin).isPowerOf2();
+ };
+ if (LL == RL && ISD::matchBinaryPredicate(LR, RR, MatchDiffPow2)) {
+ // and/or (setcc X, CMax, ne), (setcc X, CMin, ne/eq) -->
+ // setcc (and (sub X, CMin), ~(CMax - CMin)), 0, ne/eq
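+ // For example: and (setcc X, 12, ne), (setcc X, 8, ne) -->
+ // setcc (and (sub X, 8), ~4), 0, ne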
+ SDValue Max = DAG.getNode(ISD::UMAX, DL, OpVT, LR, RR);
+ SDValue Min = DAG.getNode(ISD::UMIN, DL, OpVT, LR, RR);
+ SDValue Offset = DAG.getNode(ISD::SUB, DL, OpVT, LL, Min);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, Max, Min);
+ SDValue Mask = DAG.getNOT(DL, Diff, OpVT);
+ SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Offset, Mask);
+ SDValue Zero = DAG.getConstant(0, DL, OpVT);
+ return DAG.getSetCC(DL, VT, And, Zero, CC0);
+ }
+ }
+ }
+
+ // Canonicalize equivalent operands to LL == RL.
+ if (LL == RR && LR == RL) {
+ CC1 = ISD::getSetCCSwappedOperands(CC1);
+ std::swap(RL, RR);
+ }
+
+ // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
+ // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
+ if (LL == RL && LR == RR) {
+ ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
+ : ISD::getSetCCOrOperation(CC0, CC1, OpVT);
+ if (NewCC != ISD::SETCC_INVALID &&
+ (!LegalOperations ||
+ (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
+ TLI.isOperationLegal(ISD::SETCC, OpVT))))
+ return DAG.getSetCC(DL, VT, LL, LR, NewCC);
+ }
+
+ return SDValue();
+}
+
+static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
+ using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
+ assert(
+ (LogicOp->getOpcode() == ISD::AND || LogicOp->getOpcode() == ISD::OR) &&
+ "Invalid Op to combine SETCC with");
+
+ // TODO: Search past casts/truncates.
+ SDValue LHS = LogicOp->getOperand(0);
+ SDValue RHS = LogicOp->getOperand(1);
+ if (LHS->getOpcode() != ISD::SETCC || RHS->getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ AndOrSETCCFoldKind TargetPreference = TLI.isDesirableToCombineLogicOpOfSETCC(
+ LogicOp, LHS.getNode(), RHS.getNode());
+
+ SDValue LHS0 = LHS->getOperand(0);
+ SDValue RHS0 = RHS->getOperand(0);
+ SDValue LHS1 = LHS->getOperand(1);
+ SDValue RHS1 = RHS->getOperand(1);
+ // TODO: We don't actually need a splat here; for vectors we just need the
+ // invariants to hold for each element.
+ auto *LHS1C = isConstOrConstSplat(LHS1);
+ auto *RHS1C = isConstOrConstSplat(RHS1);
+ ISD::CondCode CCL = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ ISD::CondCode CCR = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
+ EVT VT = LogicOp->getValueType(0);
+ EVT OpVT = LHS0.getValueType();
+ SDLoc DL(LogicOp);
+
+ // Check if the operands of an and/or operation are comparisons and if they
+ // compare against the same value. Replace the and/or-cmp-cmp sequence with
+ // a min/max-cmp sequence. If LHS1 is equal to RHS1, then the or-cmp-cmp
+ // sequence will be replaced with a min-cmp sequence:
+ // (LHS0 < LHS1) | (RHS0 < RHS1) -> min(LHS0, RHS0) < LHS1
+ // and the and-cmp-cmp sequence will be replaced with a max-cmp sequence:
+ // (LHS0 < LHS1) & (RHS0 < RHS1) -> max(LHS0, RHS0) < LHS1
+ if (OpVT.isInteger() && TLI.isOperationLegal(ISD::UMAX, OpVT) &&
+ TLI.isOperationLegal(ISD::SMAX, OpVT) &&
+ TLI.isOperationLegal(ISD::UMIN, OpVT) &&
+ TLI.isOperationLegal(ISD::SMIN, OpVT)) {
+ if (LHS->getOpcode() == ISD::SETCC && RHS->getOpcode() == ISD::SETCC &&
+ LHS->hasOneUse() && RHS->hasOneUse() &&
+ // The two comparisons should have either the same predicate or the
+ // predicate of one of the comparisons is the opposite of the other one.
+ (CCL == CCR || CCL == ISD::getSetCCSwappedOperands(CCR)) &&
+ // The optimization does not work for `==` or `!=` .
+ !ISD::isIntEqualitySetCC(CCL) && !ISD::isIntEqualitySetCC(CCR)) {
+ SDValue CommonValue, Operand1, Operand2;
+ ISD::CondCode CC = ISD::SETCC_INVALID;
+ if (CCL == CCR) {
+ if (LHS0 == RHS0) {
+ CommonValue = LHS0;
+ Operand1 = LHS1;
+ Operand2 = RHS1;
+ CC = ISD::getSetCCSwappedOperands(CCL);
+ } else if (LHS1 == RHS1) {
+ CommonValue = LHS1;
+ Operand1 = LHS0;
+ Operand2 = RHS0;
+ CC = CCL;
+ }
+ } else {
+ assert(CCL == ISD::getSetCCSwappedOperands(CCR) && "Unexpected CC");
+ if (LHS0 == RHS1) {
+ CommonValue = LHS0;
+ Operand1 = LHS1;
+ Operand2 = RHS0;
+ CC = ISD::getSetCCSwappedOperands(CCL);
+ } else if (RHS0 == LHS1) {
+ CommonValue = LHS1;
+ Operand1 = LHS0;
+ Operand2 = RHS1;
+ CC = CCL;
+ }
+ }
+
+ if (CC != ISD::SETCC_INVALID) {
+ unsigned NewOpcode;
+ bool IsSigned = isSignedIntSetCC(CC);
+ if (((CC == ISD::SETLE || CC == ISD::SETULE || CC == ISD::SETLT ||
+ CC == ISD::SETULT) &&
+ (LogicOp->getOpcode() == ISD::OR)) ||
+ ((CC == ISD::SETGE || CC == ISD::SETUGE || CC == ISD::SETGT ||
+ CC == ISD::SETUGT) &&
+ (LogicOp->getOpcode() == ISD::AND)))
+ NewOpcode = IsSigned ? ISD::SMIN : ISD::UMIN;
+ else
+ NewOpcode = IsSigned ? ISD::SMAX : ISD::UMAX;
+
+ SDValue MinMaxValue =
+ DAG.getNode(NewOpcode, DL, OpVT, Operand1, Operand2);
+ return DAG.getSetCC(DL, VT, MinMaxValue, CommonValue, CC);
+ }
+ }
+ }
+
+ if (TargetPreference == AndOrSETCCFoldKind::None)
+ return SDValue();
+
+ if (CCL == CCR &&
+ CCL == (LogicOp->getOpcode() == ISD::AND ? ISD::SETNE : ISD::SETEQ) &&
+ LHS0 == RHS0 && LHS1C && RHS1C && OpVT.isInteger() && LHS.hasOneUse() &&
+ RHS.hasOneUse()) {
+ const APInt &APLhs = LHS1C->getAPIntValue();
+ const APInt &APRhs = RHS1C->getAPIntValue();
+
+ // Use ISD::ABS if the target prefers it, or if an ISD::ABS of this operand
+ // already exists in the DAG (in which case the fold amounts to just a
+ // compare).
+ if (APLhs == (-APRhs) &&
+ ((TargetPreference & AndOrSETCCFoldKind::ABS) ||
+ DAG.doesNodeExist(ISD::ABS, DAG.getVTList(OpVT), {LHS0}))) {
+ const APInt &C = APLhs.isNegative() ? APRhs : APLhs;
+ // (icmp eq A, C) | (icmp eq A, -C)
+ // -> (icmp eq Abs(A), C)
+ // (icmp ne A, C) & (icmp ne A, -C)
+ // -> (icmp ne Abs(A), C)
+ SDValue AbsOp = DAG.getNode(ISD::ABS, DL, OpVT, LHS0);
+ return DAG.getNode(ISD::SETCC, DL, VT, AbsOp,
+ DAG.getConstant(C, DL, OpVT), LHS.getOperand(2));
+ } else if (TargetPreference &
+ (AndOrSETCCFoldKind::AddAnd | AndOrSETCCFoldKind::NotAnd)) {
+
+ // AndOrSETCCFoldKind::AddAnd:
+ // A == C0 | A == C1
+ // IF IsPow2(smax(C0, C1)-smin(C0, C1))
+ // -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) == 0
+ // A != C0 & A != C1
+ // IF IsPow2(smax(C0, C1)-smin(C0, C1))
+ // -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) != 0
+
+ // AndOrSETCCFoldKind::NotAnd:
+ // A == C0 | A == C1
+ // IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1))
+ // -> ~A & smin(C0, C1) == 0
+ // A != C0 & A != C1
+ // IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1))
+ // -> ~A & smin(C0, C1) != 0
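+ // For example (AddAnd): A == 8 | A == 12 --> ((A - 8) & ~4) == 0.
+ // For example (NotAnd): A == -1 | A == -5 --> (~A & -5) == 0.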
+
+ const APInt &MaxC = APIntOps::smax(APRhs, APLhs);
+ const APInt &MinC = APIntOps::smin(APRhs, APLhs);
+ APInt Dif = MaxC - MinC;
+ if (!Dif.isZero() && Dif.isPowerOf2()) {
+ if (MaxC.isAllOnes() &&
+ (TargetPreference & AndOrSETCCFoldKind::NotAnd)) {
+ SDValue NotOp = DAG.getNOT(DL, LHS0, OpVT);
+ SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, NotOp,
+ DAG.getConstant(MinC, DL, OpVT));
+ return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+ DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+ } else if (TargetPreference & AndOrSETCCFoldKind::AddAnd) {
+
+ SDValue AddOp = DAG.getNode(ISD::ADD, DL, OpVT, LHS0,
+ DAG.getConstant(-MinC, DL, OpVT));
+ SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp,
+ DAG.getConstant(~Dif, DL, OpVT));
+ return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+ DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// This contains all DAGCombine rules which reduce two values combined by
+/// an And operation to a single value. This makes them reusable in the context
+/// of visitSELECT(). Rules involving constants are not included as
+/// visitSELECT() already handles those cases.
+SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
+ EVT VT = N1.getValueType();
+ SDLoc DL(N);
+
+ // fold (and x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
+ return V;
+
+ // Canonicalize:
+ // and(x, add) -> and(add, x)
+ if (N1.getOpcode() == ISD::ADD)
+ std::swap(N0, N1);
+
+ // TODO: Rewrite this to return a new 'AND' instead of using CombineTo.
+ if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
+ VT.getSizeInBits() <= 64 && N0->hasOneUse()) {
+ if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
+ // Look for (and (add x, c1), (lshr y, c2)). If C1 isn't a legal
+ // immediate for an add, but it would be legal if its top c2 bits were
+ // set, transform the ADD so the immediate doesn't need to be
+ // materialized in a register.
+ APInt ADDC = ADDI->getAPIntValue();
+ APInt SRLC = SRLI->getAPIntValue();
+ if (ADDC.getSignificantBits() <= 64 && SRLC.ult(VT.getSizeInBits()) &&
+ !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
+ APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+ SRLC.getZExtValue());
+ if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
+ ADDC |= Mask;
+ if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
+ SDLoc DL0(N0);
+ SDValue NewAdd =
+ DAG.getNode(ISD::ADD, DL0, VT,
+ N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
+ CombineTo(N0.getNode(), NewAdd);
+ // Return N so it doesn't get rechecked!
+ return SDValue(N, 0);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
+ EVT LoadResultTy, EVT &ExtVT) {
+ if (!AndC->getAPIntValue().isMask())
+ return false;
+
+ unsigned ActiveBits = AndC->getAPIntValue().countr_one();
+
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+ EVT LoadedVT = LoadN->getMemoryVT();
+
+ if (ExtVT == LoadedVT &&
+ (!LegalOperations ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
+ // ZEXTLOAD will match without needing to change the size of the value being
+ // loaded.
+ return true;
+ }
+
+ // Do not change the width of a volatile or atomic load.
+ if (!LoadN->isSimple())
+ return false;
+
+ // Do not generate loads of non-round integer types since these can
+ // be expensive (and would be wrong if the type is not byte sized).
+ if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
+ return false;
+
+ if (LegalOperations &&
+ !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
+ return false;
+
+ if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
+ return false;
+
+ return true;
+}
+
+bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
+ ISD::LoadExtType ExtType, EVT &MemVT,
+ unsigned ShAmt) {
+ if (!LDST)
+ return false;
+ // Only allow byte offsets.
+ if (ShAmt % 8)
+ return false;
+
+ // Do not generate loads of non-round integer types since these can
+ // be expensive (and would be wrong if the type is not byte sized).
+ if (!MemVT.isRound())
+ return false;
+
+ // Don't change the width of a volatile or atomic load.
+ if (!LDST->isSimple())
+ return false;
+
+ EVT LdStMemVT = LDST->getMemoryVT();
+
+ // Bail out when changing the scalable property, since we can't be sure that
+ // we're actually narrowing here.
+ if (LdStMemVT.isScalableVector() != MemVT.isScalableVector())
+ return false;
+
+ // Verify that we are actually reducing a load width here.
+ if (LdStMemVT.bitsLT(MemVT))
+ return false;
+
+ // Ensure that this isn't going to produce an unsupported memory access.
+ if (ShAmt) {
+ assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
+ const unsigned ByteShAmt = ShAmt / 8;
+ const Align LDSTAlign = LDST->getAlign();
+ const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ LDST->getAddressSpace(), NarrowAlign,
+ LDST->getMemOperand()->getFlags()))
+ return false;
+ }
+
+ // It's not possible to generate a constant of extended or untyped type.
+ EVT PtrType = LDST->getBasePtr().getValueType();
+ if (PtrType == MVT::Untyped || PtrType.isExtended())
+ return false;
+
+ if (isa<LoadSDNode>(LDST)) {
+ LoadSDNode *Load = cast<LoadSDNode>(LDST);
+ // Don't transform one with multiple uses, this would require adding a new
+ // load.
+ if (!SDValue(Load, 0).hasOneUse())
+ return false;
+
+ if (LegalOperations &&
+ !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT))
+ return false;
+
+ // For the transform to be legal, the load must produce only two values
+ // (the value loaded and the chain). Don't transform a pre-increment
+ // load, for example, which produces an extra value. Otherwise the
+ // transformation is not equivalent, and the downstream logic to replace
+ // uses gets things wrong.
+ if (Load->getNumValues() > 2)
+ return false;
+
+ // If the load that we're shrinking is an extload and we're not just
+ // discarding the extension we can't simply shrink the load. Bail.
+ // TODO: It would be possible to merge the extensions in some cases.
+ if (Load->getExtensionType() != ISD::NON_EXTLOAD &&
+ Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
+ return false;
+
+ if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
+ return false;
+ } else {
+ assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
+ StoreSDNode *Store = cast<StoreSDNode>(LDST);
+ // Can't write outside the original store
+ if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
+ return false;
+
+ if (LegalOperations &&
+ !TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT))
+ return false;
+ }
+ return true;
+}
+
+bool DAGCombiner::SearchForAndLoads(SDNode *N,
+ SmallVectorImpl<LoadSDNode*> &Loads,
+ SmallPtrSetImpl<SDNode*> &NodesWithConsts,
+ ConstantSDNode *Mask,
+ SDNode *&NodeToMask) {
+ // Recursively search for the operands, looking for loads which can be
+ // narrowed.
+ for (SDValue Op : N->op_values()) {
+ if (Op.getValueType().isVector())
+ return false;
+
+ // Some constants may need fixing up later if they are too large.
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
+ (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
+ NodesWithConsts.insert(N);
+ continue;
+ }
+
+ if (!Op.hasOneUse())
+ return false;
+
+ switch(Op.getOpcode()) {
+ case ISD::LOAD: {
+ auto *Load = cast<LoadSDNode>(Op);
+ EVT ExtVT;
+ if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
+ isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) {
+
+ // ZEXTLOAD is already small enough.
+ if (Load->getExtensionType() == ISD::ZEXTLOAD &&
+ ExtVT.bitsGE(Load->getMemoryVT()))
+ continue;
+
+ // Use LE so that equal-sized loads are also converted to zextloads.
+ if (ExtVT.bitsLE(Load->getMemoryVT()))
+ Loads.push_back(Load);
+
+ continue;
+ }
+ return false;
+ }
+ case ISD::ZERO_EXTEND:
+ case ISD::AssertZext: {
+ unsigned ActiveBits = Mask->getAPIntValue().countr_one();
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+ EVT VT = Op.getOpcode() == ISD::AssertZext ?
+ cast<VTSDNode>(Op.getOperand(1))->getVT() :
+ Op.getOperand(0).getValueType();
+
+ // We can accept extending nodes if the mask is wider than or equal in
+ // width to the original type.
+ if (ExtVT.bitsGE(VT))
+ continue;
+ break;
+ }
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::AND:
+ if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
+ NodeToMask))
+ return false;
+ continue;
+ }
+
+ // Allow one node which will be masked along with any loads found.
+ if (NodeToMask)
+ return false;
+
+ // Also ensure that the node to be masked only produces one data result.
+ NodeToMask = Op.getNode();
+ if (NodeToMask->getNumValues() > 1) {
+ bool HasValue = false;
+ for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) {
+ MVT VT = SDValue(NodeToMask, i).getSimpleValueType();
+ if (VT != MVT::Glue && VT != MVT::Other) {
+ if (HasValue) {
+ NodeToMask = nullptr;
+ return false;
+ }
+ HasValue = true;
+ }
+ }
+ assert(HasValue && "Node to be masked has no data result?");
+ }
+ }
+ return true;
+}
+
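+// Attempt to eliminate an 'and' with a constant low-bit mask by propagating
+// the mask up through a tree of or/xor/and nodes to the loads feeding it, so
+// that those loads can be narrowed. For example, (and (or (load i32 X),
+// (load i32 Y)), 255) can become an 'or' of two i8 zextloads with no 'and'.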
+bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
+ auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Mask)
+ return false;
+
+ if (!Mask->getAPIntValue().isMask())
+ return false;
+
+ // No need to do anything if the and directly uses a load.
+ if (isa<LoadSDNode>(N->getOperand(0)))
+ return false;
+
+ SmallVector<LoadSDNode*, 8> Loads;
+ SmallPtrSet<SDNode*, 2> NodesWithConsts;
+ SDNode *FixupNode = nullptr;
+ if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
+ if (Loads.size() == 0)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
+ SDValue MaskOp = N->getOperand(1);
+
+ // If it exists, fixup the single node we allow in the tree that needs
+ // masking.
+ if (FixupNode) {
+ LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
+ FixupNode->getValueType(0),
+ SDValue(FixupNode, 0), MaskOp);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
+ if (And.getOpcode() == ISD::AND)
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
+ }
+
+ // Narrow any constants that need it.
+ for (auto *LogicN : NodesWithConsts) {
+ SDValue Op0 = LogicN->getOperand(0);
+ SDValue Op1 = LogicN->getOperand(1);
+
+ if (isa<ConstantSDNode>(Op0))
+ std::swap(Op0, Op1);
+
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
+ Op1, MaskOp);
+
+ DAG.UpdateNodeOperands(LogicN, Op0, And);
+ }
+
+ // Create narrow loads.
+ for (auto *Load : Loads) {
+ LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
+ SDValue(Load, 0), MaskOp);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
+ if (And.getOpcode() == ISD::AND)
+ And = SDValue(
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
+ SDValue NewLoad = reduceLoadWidth(And.getNode());
+ assert(NewLoad &&
+ "Shouldn't be masking the load if it can't be narrowed");
+ CombineTo(Load, NewLoad, NewLoad.getValue(1));
+ }
+ DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
+ return true;
+ }
+ return false;
+}
+
+// Unfold
+// x & (-1 'logical shift' y)
+// To
+// (x 'opposite logical shift' y) 'logical shift' y
+// if it is better for performance.
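+// For example: x & (-1 << y) --> (x >> y) << y, which clears the low y bits
+// of x without materializing the all-ones mask.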
+SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND);
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Do we actually prefer shifts over mask?
+ if (!TLI.shouldFoldMaskToVariableShiftPair(N0))
+ return SDValue();
+
+ // Try to match (-1 '[outer] logical shift' y)
+ unsigned OuterShift;
+ unsigned InnerShift; // The opposite direction to the OuterShift.
+ SDValue Y; // Shift amount.
+ auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool {
+ if (!M.hasOneUse())
+ return false;
+ OuterShift = M->getOpcode();
+ if (OuterShift == ISD::SHL)
+ InnerShift = ISD::SRL;
+ else if (OuterShift == ISD::SRL)
+ InnerShift = ISD::SHL;
+ else
+ return false;
+ if (!isAllOnesConstant(M->getOperand(0)))
+ return false;
+ Y = M->getOperand(1);
+ return true;
+ };
+
+ SDValue X;
+ if (matchMask(N1))
+ X = N0;
+ else if (matchMask(N0))
+ X = N1;
+ else
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // tmp = x 'opposite logical shift' y
+ SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y);
+ // ret = tmp 'logical shift' y
+ SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y);
+
+ return T1;
+}
+
+/// Try to replace shift/logic that tests if a bit is clear with mask + setcc.
+/// For a target with a bit test, this is expected to become test + set and save
+/// at least 1 instruction.
+static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
+ assert(And->getOpcode() == ISD::AND && "Expected an 'and' op");
+
+ // Look through an optional extension.
+ SDValue And0 = And->getOperand(0), And1 = And->getOperand(1);
+ if (And0.getOpcode() == ISD::ANY_EXTEND && And0.hasOneUse())
+ And0 = And0.getOperand(0);
+ if (!isOneConstant(And1) || !And0.hasOneUse())
+ return SDValue();
+
+ SDValue Src = And0;
+
+ // Attempt to find a 'not' op.
+ // TODO: Should we favor test+set even without the 'not' op?
+ bool FoundNot = false;
+ if (isBitwiseNot(Src)) {
+ FoundNot = true;
+ Src = Src.getOperand(0);
+
+ // Look through an optional truncation. The source operand may not be the
+ // same type as the original 'and', but that is ok because we are masking
+ // off everything but the low bit.
+ if (Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse())
+ Src = Src.getOperand(0);
+ }
+
+ // Match a shift-right by constant.
+ if (Src.getOpcode() != ISD::SRL || !Src.hasOneUse())
+ return SDValue();
+
+ // This is probably not worthwhile without a supported type.
+ EVT SrcVT = Src.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(SrcVT))
+ return SDValue();
+
+ // We might have looked through casts that make this transform invalid.
+ unsigned BitWidth = SrcVT.getScalarSizeInBits();
+ SDValue ShiftAmt = Src.getOperand(1);
+ auto *ShiftAmtC = dyn_cast<ConstantSDNode>(ShiftAmt);
+ if (!ShiftAmtC || !ShiftAmtC->getAPIntValue().ult(BitWidth))
+ return SDValue();
+
+ // Set source to shift source.
+ Src = Src.getOperand(0);
+
+ // Try again to find a 'not' op.
+ // TODO: Should we favor test+set even with two 'not' ops?
+ if (!FoundNot) {
+ if (!isBitwiseNot(Src))
+ return SDValue();
+ Src = Src.getOperand(0);
+ }
+
+ if (!TLI.hasBitTest(Src, ShiftAmt))
+ return SDValue();
+
+ // Turn this into a bit-test pattern using mask op + setcc:
+ // and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
+ // and (srl (not X), C), 1 --> (and X, 1<<C) == 0
+ SDLoc DL(And);
+ SDValue X = DAG.getZExtOrTrunc(Src, DL, SrcVT);
+ EVT CCVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+ SDValue Mask = DAG.getConstant(
+ APInt::getOneBitSet(BitWidth, ShiftAmtC->getZExtValue()), DL, SrcVT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, SrcVT, X, Mask);
+ SDValue Zero = DAG.getConstant(0, DL, SrcVT);
+ SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
+ return DAG.getZExtOrTrunc(Setcc, DL, And->getValueType(0));
+}
+
+/// For targets that support usubsat, match a bit-hack form of that operation
+/// that ends in 'and' and convert it.
+static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N1.getValueType();
+
+ // Canonicalize SRA as operand 1.
+ if (N0.getOpcode() == ISD::SRA)
+ std::swap(N0, N1);
+
+ // xor and add with SMIN (the sign mask) are logically equivalent here.
+ if (N0.getOpcode() != ISD::XOR && N0.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ if (N1.getOpcode() != ISD::SRA || !N0.hasOneUse() || !N1.hasOneUse() ||
+ N0.getOperand(0) != N1.getOperand(0))
+ return SDValue();
+
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true);
+ ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true);
+ if (!XorC || !XorC->getAPIntValue().isSignMask() ||
+ !SraC || SraC->getAPIntValue() != BitWidth - 1)
+ return SDValue();
+
+ // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
+ // (i8 X + 128) & (i8 X s>> 7) --> usubsat X, 128
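+ // X s>> 7 is all-ones exactly when the sign bit of X is set, so the 'and'
+ // keeps X - 128 (the xor/add clears the sign bit) only when X >= 128 as an
+ // unsigned value, which is exactly usubsat X, 128.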
+ SDLoc DL(N);
+ SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
+ return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
+}
+
+/// Given a bitwise logic operation N with a matching bitwise logic operand,
+/// fold a pattern where 2 of the source operands are identically shifted
+/// values. For example:
+/// ((X0 << Y) | Z) | (X1 << Y) --> ((X0 | X1) << Y) | Z
+static SDValue foldLogicOfShifts(SDNode *N, SDValue LogicOp, SDValue ShiftOp,
+ SelectionDAG &DAG) {
+ unsigned LogicOpcode = N->getOpcode();
+ assert(ISD::isBitwiseLogicOp(LogicOpcode) &&
+ "Expected bitwise logic operation");
+
+ if (!LogicOp.hasOneUse() || !ShiftOp.hasOneUse())
+ return SDValue();
+
+ // Match another bitwise logic op and a shift.
+ unsigned ShiftOpcode = ShiftOp.getOpcode();
+ if (LogicOp.getOpcode() != LogicOpcode ||
+ !(ShiftOpcode == ISD::SHL || ShiftOpcode == ISD::SRL ||
+ ShiftOpcode == ISD::SRA))
+ return SDValue();
+
+ // Match another shift op inside the first logic operand. Handle both commuted
+ // possibilities.
+ // LOGIC (LOGIC (SH X0, Y), Z), (SH X1, Y) --> LOGIC (SH (LOGIC X0, X1), Y), Z
+ // LOGIC (LOGIC Z, (SH X0, Y)), (SH X1, Y) --> LOGIC (SH (LOGIC X0, X1), Y), Z
+ SDValue X1 = ShiftOp.getOperand(0);
+ SDValue Y = ShiftOp.getOperand(1);
+ SDValue X0, Z;
+ if (LogicOp.getOperand(0).getOpcode() == ShiftOpcode &&
+ LogicOp.getOperand(0).getOperand(1) == Y) {
+ X0 = LogicOp.getOperand(0).getOperand(0);
+ Z = LogicOp.getOperand(1);
+ } else if (LogicOp.getOperand(1).getOpcode() == ShiftOpcode &&
+ LogicOp.getOperand(1).getOperand(1) == Y) {
+ X0 = LogicOp.getOperand(1).getOperand(0);
+ Z = LogicOp.getOperand(0);
+ } else {
+ return SDValue();
+ }
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ SDValue LogicX = DAG.getNode(LogicOpcode, DL, VT, X0, X1);
+ SDValue NewShift = DAG.getNode(ShiftOpcode, DL, VT, LogicX, Y);
+ return DAG.getNode(LogicOpcode, DL, VT, NewShift, Z);
+}
+
+/// Given a tree of logic operations with shape like
+/// (LOGIC (LOGIC (X, Y), LOGIC (Z, Y)))
+/// try to match and fold shift operations with the same shift amount.
+/// For example:
+/// LOGIC (LOGIC (SH X0, Y), Z), (LOGIC (SH X1, Y), W) -->
+/// --> LOGIC (SH (LOGIC X0, X1), Y), (LOGIC Z, W)
+static SDValue foldLogicTreeOfShifts(SDNode *N, SDValue LeftHand,
+ SDValue RightHand, SelectionDAG &DAG) {
+ unsigned LogicOpcode = N->getOpcode();
+ assert(ISD::isBitwiseLogicOp(LogicOpcode) &&
+ "Expected bitwise logic operation");
+ if (LeftHand.getOpcode() != LogicOpcode ||
+ RightHand.getOpcode() != LogicOpcode)
+ return SDValue();
+ if (!LeftHand.hasOneUse() || !RightHand.hasOneUse())
+ return SDValue();
+
+ // Try to match one of following patterns:
+ // LOGIC (LOGIC (SH X0, Y), Z), (LOGIC (SH X1, Y), W)
+ // LOGIC (LOGIC (SH X0, Y), Z), (LOGIC W, (SH X1, Y))
+ // Note that foldLogicOfShifts will handle commuted versions of the left hand
+ // itself.
+ SDValue CombinedShifts, W;
+ SDValue R0 = RightHand.getOperand(0);
+ SDValue R1 = RightHand.getOperand(1);
+ if ((CombinedShifts = foldLogicOfShifts(N, LeftHand, R0, DAG)))
+ W = R1;
+ else if ((CombinedShifts = foldLogicOfShifts(N, LeftHand, R1, DAG)))
+ W = R0;
+ else
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ return DAG.getNode(LogicOpcode, DL, VT, CombinedShifts, W);
+}
+
+SDValue DAGCombiner::visitAND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N1.getValueType();
+
+ // x & x --> x
+ if (N0 == N1)
+ return N0;
+
+ // fold (and c1, c2) -> c1&c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
+
+ if (areBitwiseNotOfEachother(N0, N1))
+ return DAG.getConstant(APInt::getZero(VT.getScalarSizeInBits()), SDLoc(N),
+ VT);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+ return FoldedVOp;
+
+ // fold (and x, 0) -> 0, vector edition
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ // do not return N1, because an undef node may exist in N1
+ return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()),
+ SDLoc(N), N1.getValueType());
+
+ // fold (and x, -1) -> x, vector edition
+ if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
+ return N0;
+
+ // fold (and (masked_load) (splat_vec (x, ...))) to zext_masked_load
+ auto *MLoad = dyn_cast<MaskedLoadSDNode>(N0);
+ ConstantSDNode *Splat = isConstOrConstSplat(N1, true, true);
+ if (MLoad && MLoad->getExtensionType() == ISD::EXTLOAD && Splat &&
+ N1.hasOneUse()) {
+ EVT LoadVT = MLoad->getMemoryVT();
+ EVT ExtVT = VT;
+ if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT)) {
+ // For this AND to be a zero extension of the masked load, the elements
+ // of the BuildVec must mask the bottom bits of the extended element
+ // type.
+ uint64_t ElementSize =
+ LoadVT.getVectorElementType().getScalarSizeInBits();
+ if (Splat->getAPIntValue().isMask(ElementSize)) {
+ auto NewLoad = DAG.getMaskedLoad(
+ ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(),
+ MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(),
+ LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(),
+ ISD::ZEXTLOAD, MLoad->isExpandingLoad());
+ bool LoadHasOtherUsers = !N0.hasOneUse();
+ CombineTo(N, NewLoad);
+ if (LoadHasOtherUsers)
+ CombineTo(MLoad, NewLoad.getValue(0), NewLoad.getValue(1));
+ return SDValue(N, 0);
+ }
+ }
+ }
+ }
+
+ // fold (and x, -1) -> x
+ if (isAllOnesConstant(N1))
+ return N0;
+
+ // if (and x, c) is known to be zero, return 0
+ unsigned BitWidth = VT.getScalarSizeInBits();
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth)))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ if (SDValue R = foldAndOrOfSETCC(N, DAG))
+ return R;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // reassociate and
+ if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
+ return RAND;
+
+ // Fold and(vecreduce(x), vecreduce(y)) -> vecreduce(and(x, y))
+ if (SDValue SD = reassociateReduction(ISD::VECREDUCE_AND, ISD::AND, SDLoc(N),
+ VT, N0, N1))
+ return SD;
+
+ // fold (and (or x, C), D) -> D if (C & D) == D
+ auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
+ return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
+ };
+ if (N0.getOpcode() == ISD::OR &&
+ ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
+ return N1;
+
+ if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+ SDValue N0Op0 = N0.getOperand(0);
+ EVT SrcVT = N0Op0.getValueType();
+ unsigned SrcBitWidth = SrcVT.getScalarSizeInBits();
+ APInt Mask = ~N1C->getAPIntValue();
+ Mask = Mask.trunc(SrcBitWidth);
+
+ // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
+ if (DAG.MaskedValueIsZero(N0Op0, Mask))
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0Op0);
+
+ // fold (and (any_ext V), c) -> (zero_ext (and (trunc V), c)) if profitable.
+ if (N1C->getAPIntValue().countLeadingZeros() >= (BitWidth - SrcBitWidth) &&
+ TLI.isTruncateFree(VT, SrcVT) && TLI.isZExtFree(SrcVT, VT) &&
+ TLI.isTypeDesirableForOp(ISD::AND, SrcVT) &&
+ TLI.isNarrowingProfitable(VT, SrcVT)) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT,
+ DAG.getNode(ISD::AND, DL, SrcVT, N0Op0,
+ DAG.getZExtOrTrunc(N1, DL, SrcVT)));
+ }
+ }
+
+ // fold (and (ext (and V, c1)), c2) -> (and (ext V), (and c1, (ext c2)))
+ if (ISD::isExtOpcode(N0.getOpcode())) {
+ unsigned ExtOpc = N0.getOpcode();
+ SDValue N0Op0 = N0.getOperand(0);
+ if (N0Op0.getOpcode() == ISD::AND &&
+ (ExtOpc != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0Op0, VT)) &&
+ DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
+ DAG.isConstantIntBuildVectorOrConstantInt(N0Op0.getOperand(1)) &&
+ N0->hasOneUse() && N0Op0->hasOneUse()) {
+ SDLoc DL(N);
+ SDValue NewMask =
+ DAG.getNode(ISD::AND, DL, VT, N1,
+ DAG.getNode(ExtOpc, DL, VT, N0Op0.getOperand(1)));
+ return DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ExtOpc, DL, VT, N0Op0.getOperand(0)),
+ NewMask);
+ }
+ }
+
+ // Similarly, fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
+ // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
+ // already be zero by virtue of the width of the base type of the load.
+ //
+ // The 'X' node here can either be nothing or an extract_vector_elt to catch
+ // more cases.
+ if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
+ N0.getOperand(0).getOpcode() == ISD::LOAD &&
+ N0.getOperand(0).getResNo() == 0) ||
+ (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
+ LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
+ N0 : N0.getOperand(0) );
+
+ // Get the constant (if applicable) the zeroth operand is being ANDed with.
+ // This can be a pure constant or a vector splat, in which case we treat the
+ // vector as a scalar and use the splat value.
+ APInt Constant = APInt::getZero(1);
+ if (const ConstantSDNode *C = isConstOrConstSplat(
+ N1, /*AllowUndef=*/false, /*AllowTruncation=*/true)) {
+ Constant = C->getAPIntValue();
+ } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
+ SplatBitSize, HasAnyUndefs);
+ if (IsSplat) {
+ // Undef bits can contribute to a possible optimisation if set, so
+ // set them.
+ SplatValue |= SplatUndef;
+
+ // The splat value may be something like "0x00FFFFFF", which means 0 for
+ // the first vector value and FF for the rest, repeating. We need a mask
+ // that will apply equally to all members of the vector, so AND all the
+ // lanes of the constant together.
+ unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits();
+
+ // If the splat value has been compressed to a bitlength lower
+ // than the size of the vector lane, we need to re-expand it to
+ // the lane size.
+ if (EltBitWidth > SplatBitSize)
+ for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth);
+ SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2)
+ SplatValue |= SplatValue.shl(SplatBitSize);
+
+ // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
+ // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value.
+ if ((SplatBitSize % EltBitWidth) == 0) {
+ Constant = APInt::getAllOnes(EltBitWidth);
+ for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i)
+ Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth);
+ }
+ }
+ }
+
+ // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
+ // actually legal and isn't going to get expanded, else this is a false
+ // optimisation.
+ bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
+ Load->getValueType(0),
+ Load->getMemoryVT());
+
+ // Resize the constant to the same size as the original memory access before
+ // extension. If it is still the AllOnesValue then this AND is completely
+ // unneeded.
+ Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());
+
+ bool B;
+ switch (Load->getExtensionType()) {
+ default: B = false; break;
+ case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
+ case ISD::ZEXTLOAD:
+ case ISD::NON_EXTLOAD: B = true; break;
+ }
+
+ if (B && Constant.isAllOnes()) {
+ // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
+ // preserve semantics once we get rid of the AND.
+ SDValue NewLoad(Load, 0);
+
+ // Fold the AND away. NewLoad may get replaced immediately.
+ CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
+
+ if (Load->getExtensionType() == ISD::EXTLOAD) {
+ NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
+ Load->getValueType(0), SDLoc(Load),
+ Load->getChain(), Load->getBasePtr(),
+ Load->getOffset(), Load->getMemoryVT(),
+ Load->getMemOperand());
+ // Replace uses of the EXTLOAD with the new ZEXTLOAD.
+ if (Load->getNumValues() == 3) {
+ // PRE/POST_INC loads have 3 values.
+ SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
+ NewLoad.getValue(2) };
+ CombineTo(Load, To, 3, true);
+ } else {
+ CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
+ }
+ }
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // Try to convert a constant mask AND into a shuffle clear mask.
+ if (VT.isVector())
+ if (SDValue Shuffle = XformToShuffleWithZero(N))
+ return Shuffle;
+
+ if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
+ return Combined;
+
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.hasOneUse() && N1C &&
+ ISD::isExtOpcode(N0.getOperand(0).getOpcode())) {
+ SDValue Ext = N0.getOperand(0);
+ EVT ExtVT = Ext->getValueType(0);
+ SDValue Extendee = Ext->getOperand(0);
+
+ unsigned ScalarWidth = Extendee.getValueType().getScalarSizeInBits();
+ if (N1C->getAPIntValue().isMask(ScalarWidth) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, ExtVT))) {
+ // (and (extract_subvector (zext|anyext|sext v) _) iN_mask)
+ // => (extract_subvector (iN_zeroext v))
+ SDValue ZeroExtExtendee =
+ DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), ExtVT, Extendee);
+
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, ZeroExtExtendee,
+ N0.getOperand(1));
+ }
+ }
+
+ // fold (and (masked_gather x)) -> (zext_masked_gather x)
+ if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+ EVT MemVT = GN0->getMemoryVT();
+ EVT ScalarVT = MemVT.getScalarType();
+
+ if (SDValue(GN0, 0).hasOneUse() &&
+ isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
+ TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+ SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
+ GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
+
+ SDValue ZExtLoad = DAG.getMaskedGather(
+ DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
+ GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
+
+ CombineTo(N, ZExtLoad);
+ AddToWorklist(ZExtLoad.getNode());
+ // Avoid recheck of N.
+ return SDValue(N, 0);
+ }
+ }
+
+ // fold (and (load x), 255) -> (zextload x, i8)
+ // fold (and (extload x, i16), 255) -> (zextload x, i8)
+ if (N1C && N0.getOpcode() == ISD::LOAD && !VT.isVector())
+ if (SDValue Res = reduceLoadWidth(N))
+ return Res;
+
+ if (LegalTypes) {
+ // Attempt to propagate the AND back up to the leaves which, if they're
+ // loads, can be combined to narrow loads and the AND node can be removed.
+ // Perform after legalization so that extend nodes will already be
+ // combined into the loads.
+ if (BackwardsPropagateMask(N))
+ return SDValue(N, 0);
+ }
+
+ if (SDValue Combined = visitANDLike(N0, N1, N))
+ return Combined;
+
+ // Simplify: (and (op x...), (op y...)) -> (op (and x, y))
+ if (N0.getOpcode() == N1.getOpcode())
+ if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
+ return V;
+
+ if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
+ return R;
+ if (SDValue R = foldLogicOfShifts(N, N1, N0, DAG))
+ return R;
+
+ // Masking the negated extension of a boolean is just the zero-extended
+ // boolean:
+ // and (sub 0, zext(bool X)), 1 --> zext(bool X)
+ // and (sub 0, sext(bool X)), 1 --> zext(bool X)
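+  // (zext(bool X) is 0 or 1, so its negation is 0 or -1, and masking with 1
+  // gives back 0 or 1. The sext case negates 0 or -1 to 0 or 1 directly.)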
+ //
+ // Note: the SimplifyDemandedBits fold below can make an information-losing
+ // transform, and then we have no way to find this better fold.
+ if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
+ if (isNullOrNullSplat(N0.getOperand(0))) {
+ SDValue SubRHS = N0.getOperand(1);
+ if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
+ SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
+ return SubRHS;
+ if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
+ SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
+ }
+ }
+
+ // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
+ // fold (and (sra)) -> (and (srl)) when possible.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (zext_inreg (extload x)) -> (zextload x)
+ // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
+ if (ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ (ISD::isEXTLoad(N0.getNode()) ||
+ (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ EVT MemVT = LN0->getMemoryVT();
+ // If we zero all the possible extended bits, then we can turn this into
+ // a zextload if we are running before legalize or the operation is legal.
+ unsigned ExtBitSize = N1.getScalarValueSizeInBits();
+ unsigned MemBitSize = MemVT.getScalarSizeInBits();
+ APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize);
+ if (DAG.MaskedValueIsZero(N1, ExtBits) &&
+ ((!LegalOperations && LN0->isSimple()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
+ SDValue ExtLoad =
+ DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(),
+ LN0->getBasePtr(), MemVT, LN0->getMemOperand());
+ AddToWorklist(N);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
+ if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
+ if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+ N0.getOperand(1), false))
+ return BSwap;
+ }
+
+ if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
+ return Shifts;
+
+ if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
+ return V;
+
+ // Recognize the following pattern:
+ //
+ // AndVT = (and (sign_extend NarrowVT to AndVT) #bitmask)
+ //
+ // where bitmask is a mask that clears the upper bits of AndVT. The
+ // number of bits in bitmask must be a power of two.
+ auto IsAndZeroExtMask = [](SDValue LHS, SDValue RHS) {
+ if (LHS->getOpcode() != ISD::SIGN_EXTEND)
+ return false;
+
+ auto *C = dyn_cast<ConstantSDNode>(RHS);
+ if (!C)
+ return false;
+
+ if (!C->getAPIntValue().isMask(
+ LHS.getOperand(0).getValueType().getFixedSizeInBits()))
+ return false;
+
+ return true;
+ };
+
+ // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...).
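+  // e.g. (and (sign_extend i8 X to i32), 0xFF) --> (zero_extend i8 X to i32)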
+ if (IsAndZeroExtMask(N0, N1))
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
+
+ if (hasOperation(ISD::USUBSAT, VT))
+ if (SDValue V = foldAndToUsubsat(N, DAG))
+ return V;
+
+  // Postpone until legalization has completed to avoid interference with bswap
+  // folding.
+ if (LegalOperations || VT.isVector())
+ if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
+ return R;
+
+ return SDValue();
+}
+
+/// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
+SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
+ bool DemandHighBits) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
+ return SDValue();
+ if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
+ return SDValue();
+
+ // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
+ bool LookPassAnd0 = false;
+ bool LookPassAnd1 = false;
+ if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
+ std::swap(N0, N1);
+ if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
+ std::swap(N0, N1);
+ if (N0.getOpcode() == ISD::AND) {
+ if (!N0->hasOneUse())
+ return SDValue();
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ // Also handle 0xffff since the LHS is guaranteed to have zeros there.
+ // This is needed for X86.
+ if (!N01C || (N01C->getZExtValue() != 0xFF00 &&
+ N01C->getZExtValue() != 0xFFFF))
+ return SDValue();
+ N0 = N0.getOperand(0);
+ LookPassAnd0 = true;
+ }
+
+ if (N1.getOpcode() == ISD::AND) {
+ if (!N1->hasOneUse())
+ return SDValue();
+ ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+ if (!N11C || N11C->getZExtValue() != 0xFF)
+ return SDValue();
+ N1 = N1.getOperand(0);
+ LookPassAnd1 = true;
+ }
+
+ if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+ std::swap(N0, N1);
+ if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+ return SDValue();
+ if (!N0->hasOneUse() || !N1->hasOneUse())
+ return SDValue();
+
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+ if (!N01C || !N11C)
+ return SDValue();
+ if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
+ return SDValue();
+
+ // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
+ SDValue N00 = N0->getOperand(0);
+ if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
+ if (!N00->hasOneUse())
+ return SDValue();
+ ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
+ if (!N001C || N001C->getZExtValue() != 0xFF)
+ return SDValue();
+ N00 = N00.getOperand(0);
+ LookPassAnd0 = true;
+ }
+
+ SDValue N10 = N1->getOperand(0);
+ if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
+ if (!N10->hasOneUse())
+ return SDValue();
+ ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
+ // Also allow 0xFFFF since the bits will be shifted out. This is needed
+ // for X86.
+ if (!N101C || (N101C->getZExtValue() != 0xFF00 &&
+ N101C->getZExtValue() != 0xFFFF))
+ return SDValue();
+ N10 = N10.getOperand(0);
+ LookPassAnd1 = true;
+ }
+
+ if (N00 != N10)
+ return SDValue();
+
+ // Make sure everything beyond the low halfword gets set to zero since the SRL
+ // 16 will clear the top bits.
+ unsigned OpSizeInBits = VT.getSizeInBits();
+ if (OpSizeInBits > 16) {
+ // If the left-shift isn't masked out then the only way this is a bswap is
+ // if all bits beyond the low 8 are 0. In that case the entire pattern
+ // reduces to a left shift anyway: leave it for other parts of the combiner.
+ if (DemandHighBits && !LookPassAnd0)
+ return SDValue();
+
+ // However, if the right shift isn't masked out then it might be because
+ // it's not needed. See if we can spot that too. If the high bits aren't
+ // demanded, we only need bits 23:16 to be zero. Otherwise, we need all
+ // upper bits to be zero.
+ if (!LookPassAnd1) {
+ unsigned HighBit = DemandHighBits ? OpSizeInBits : 24;
+ if (!DAG.MaskedValueIsZero(N10,
+ APInt::getBitsSet(OpSizeInBits, 16, HighBit)))
+ return SDValue();
+ }
+ }
+
+ SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
+ if (OpSizeInBits > 16) {
+ SDLoc DL(N);
+ Res = DAG.getNode(ISD::SRL, DL, VT, Res,
+ DAG.getConstant(OpSizeInBits - 16, DL,
+ getShiftAmountTy(VT)));
+ }
+ return Res;
+}
+
+/// Return true if the specified node is an element that makes up a 32-bit
+/// packed halfword byteswap.
+/// ((x & 0x000000ff) << 8) |
+/// ((x & 0x0000ff00) >> 8) |
+/// ((x & 0x00ff0000) << 8) |
+/// ((x & 0xff000000) >> 8)
+static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
+ if (!N->hasOneUse())
+ return false;
+
+ unsigned Opc = N.getOpcode();
+ if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
+ return false;
+
+ SDValue N0 = N.getOperand(0);
+ unsigned Opc0 = N0.getOpcode();
+ if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
+ return false;
+
+ ConstantSDNode *N1C = nullptr;
+ // SHL or SRL: look upstream for AND mask operand
+ if (Opc == ISD::AND)
+ N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ else if (Opc0 == ISD::AND)
+ N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!N1C)
+ return false;
+
+ unsigned MaskByteOffset;
+ switch (N1C->getZExtValue()) {
+ default:
+ return false;
+ case 0xFF: MaskByteOffset = 0; break;
+ case 0xFF00: MaskByteOffset = 1; break;
+ case 0xFFFF:
+ // In case demanded bits didn't clear the bits that will be shifted out.
+ // This is needed for X86.
+ if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) {
+ MaskByteOffset = 1;
+ break;
+ }
+ return false;
+ case 0xFF0000: MaskByteOffset = 2; break;
+ case 0xFF000000: MaskByteOffset = 3; break;
+ }
+
+ // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
+ if (Opc == ISD::AND) {
+ if (MaskByteOffset == 0 || MaskByteOffset == 2) {
+ // (x >> 8) & 0xff
+ // (x >> 8) & 0xff0000
+ if (Opc0 != ISD::SRL)
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C || C->getZExtValue() != 8)
+ return false;
+ } else {
+ // (x << 8) & 0xff00
+ // (x << 8) & 0xff000000
+ if (Opc0 != ISD::SHL)
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C || C->getZExtValue() != 8)
+ return false;
+ }
+ } else if (Opc == ISD::SHL) {
+ // (x & 0xff) << 8
+ // (x & 0xff0000) << 8
+ if (MaskByteOffset != 0 && MaskByteOffset != 2)
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!C || C->getZExtValue() != 8)
+ return false;
+ } else { // Opc == ISD::SRL
+ // (x & 0xff00) >> 8
+ // (x & 0xff000000) >> 8
+ if (MaskByteOffset != 1 && MaskByteOffset != 3)
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!C || C->getZExtValue() != 8)
+ return false;
+ }
+
+ if (Parts[MaskByteOffset])
+ return false;
+
+ Parts[MaskByteOffset] = N0.getOperand(0).getNode();
+ return true;
+}
+
+// Match 2 elements of a packed halfword bswap.
+static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
+ if (N.getOpcode() == ISD::OR)
+ return isBSwapHWordElement(N.getOperand(0), Parts) &&
+ isBSwapHWordElement(N.getOperand(1), Parts);
+
+ if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) {
+ ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1));
+ if (!C || C->getAPIntValue() != 16)
+ return false;
+ Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode();
+ return true;
+ }
+
+ return false;
+}
+
+// Match this pattern:
+// (or (and (shl (A, 8)), 0xff00ff00), (and (srl (A, 8)), 0x00ff00ff))
+// And rewrite this to:
+// (rotr (bswap A), 16)
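+// For example, with A = 0xAABBCCDD:
+//   ((A << 8) & 0xff00ff00) | ((A >> 8) & 0x00ff00ff) == 0xBBAADDCC,
+//   and bswap(A) == 0xDDCCBBAA, rotated right by 16 -> 0xBBAADDCC.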
+static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
+ SelectionDAG &DAG, SDNode *N, SDValue N0,
+ SDValue N1, EVT VT, EVT ShiftAmountTy) {
+ assert(N->getOpcode() == ISD::OR && VT == MVT::i32 &&
+ "MatchBSwapHWordOrAndAnd: expecting i32");
+ if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
+ return SDValue();
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
+ return SDValue();
+ // TODO: this is too restrictive; lifting this restriction requires more tests
+ if (!N0->hasOneUse() || !N1->hasOneUse())
+ return SDValue();
+ ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1));
+ ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1));
+ if (!Mask0 || !Mask1)
+ return SDValue();
+ if (Mask0->getAPIntValue() != 0xff00ff00 ||
+ Mask1->getAPIntValue() != 0x00ff00ff)
+ return SDValue();
+ SDValue Shift0 = N0.getOperand(0);
+ SDValue Shift1 = N1.getOperand(0);
+ if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL)
+ return SDValue();
+ ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1));
+ ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1));
+ if (!ShiftAmt0 || !ShiftAmt1)
+ return SDValue();
+ if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8)
+ return SDValue();
+ if (Shift0.getOperand(0) != Shift1.getOperand(0))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0));
+ SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy);
+ return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
+}
+
+/// Match a 32-bit packed halfword bswap. That is
+/// ((x & 0x000000ff) << 8) |
+/// ((x & 0x0000ff00) >> 8) |
+/// ((x & 0x00ff0000) << 8) |
+/// ((x & 0xff000000) >> 8)
+/// => (rotl (bswap x), 16)
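+/// For example, x = 0x11223344 gives 0x22114433:
+/// bswap(x) == 0x44332211, rotated left by 16 -> 0x22114433.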
+SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32)
+ return SDValue();
+ if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
+ return SDValue();
+
+ if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT,
+ getShiftAmountTy(VT)))
+ return BSwap;
+
+ // Try again with commuted operands.
+ if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT,
+ getShiftAmountTy(VT)))
+ return BSwap;
+
+ // Look for either
+ // (or (bswaphpair), (bswaphpair))
+ // (or (or (bswaphpair), (and)), (and))
+ // (or (or (and), (bswaphpair)), (and))
+ SDNode *Parts[4] = {};
+
+ if (isBSwapHWordPair(N0, Parts)) {
+ // (or (or (and), (and)), (or (and), (and)))
+ if (!isBSwapHWordPair(N1, Parts))
+ return SDValue();
+ } else if (N0.getOpcode() == ISD::OR) {
+ // (or (or (or (and), (and)), (and)), (and))
+ if (!isBSwapHWordElement(N1, Parts))
+ return SDValue();
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) &&
+ !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts)))
+ return SDValue();
+ } else {
+ return SDValue();
+ }
+
+ // Make sure the parts are all coming from the same node.
+ if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
+ SDValue(Parts[0], 0));
+
+ // Result of the bswap should be rotated by 16. If it's not legal, then
+ // do (x << 16) | (x >> 16).
+ SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
+ if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
+ return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
+ if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
+ return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
+ return DAG.getNode(ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
+ DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
+}
+
+/// This contains all DAGCombine rules which reduce two values combined by
+/// an Or operation to a single value \see visitANDLike().
+SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
+ EVT VT = N1.getValueType();
+ SDLoc DL(N);
+
+ // fold (or x, undef) -> -1
+ if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
+ return DAG.getAllOnesConstant(DL, VT);
+
+ if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
+ return V;
+
+ // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
+ if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
+ // Don't increase # computations.
+ (N0->hasOneUse() || N1->hasOneUse())) {
+ // We can only do this xform if we know that bits from X that are set in C2
+ // but not in C1 are already zero. Likewise for Y.
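+    // For example, with C1 = 0xFF00 and C2 = 0x00FF, the bits of X under
+    // 0x00FF and the bits of Y under 0xFF00 must be known zero; the result
+    // is then (and (or X, Y), 0xFFFF).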
+ if (const ConstantSDNode *N0O1C =
+ getAsNonOpaqueConstant(N0.getOperand(1))) {
+ if (const ConstantSDNode *N1O1C =
+ getAsNonOpaqueConstant(N1.getOperand(1))) {
+ const APInt &LHSMask = N0O1C->getAPIntValue();
+ const APInt &RHSMask = N1O1C->getAPIntValue();
+
+ if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
+ DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
+ SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
+ N0.getOperand(0), N1.getOperand(0));
+ return DAG.getNode(ISD::AND, DL, VT, X,
+ DAG.getConstant(LHSMask | RHSMask, DL, VT));
+ }
+ }
+ }
+ }
+
+ // (or (and X, M), (and X, N)) -> (and X, (or M, N))
+ if (N0.getOpcode() == ISD::AND &&
+ N1.getOpcode() == ISD::AND &&
+ N0.getOperand(0) == N1.getOperand(0) &&
+ // Don't increase # computations.
+ (N0->hasOneUse() || N1->hasOneUse())) {
+ SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
+ N0.getOperand(1), N1.getOperand(1));
+ return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
+ }
+
+ return SDValue();
+}
+
+/// OR combines for which the commuted variant will be tried as well.
+static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
+ SDNode *N) {
+ EVT VT = N0.getValueType();
+
+ auto peekThroughResize = [](SDValue V) {
+ if (V->getOpcode() == ISD::ZERO_EXTEND || V->getOpcode() == ISD::TRUNCATE)
+ return V->getOperand(0);
+ return V;
+ };
+
+ SDValue N0Resized = peekThroughResize(N0);
+ if (N0Resized.getOpcode() == ISD::AND) {
+ SDValue N1Resized = peekThroughResize(N1);
+ SDValue N00 = N0Resized.getOperand(0);
+ SDValue N01 = N0Resized.getOperand(1);
+
+ // fold or (and x, y), x --> x
+ if (N00 == N1Resized || N01 == N1Resized)
+ return N1;
+
+ // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
+ // TODO: Set AllowUndefs = true.
+ if (SDValue NotOperand = getBitwiseNotOperand(N01, N00,
+ /* AllowUndefs */ false)) {
+ if (peekThroughResize(NotOperand) == N1Resized)
+ return DAG.getNode(ISD::OR, SDLoc(N), VT,
+ DAG.getZExtOrTrunc(N00, SDLoc(N), VT), N1);
+ }
+
+ // fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
+ if (SDValue NotOperand = getBitwiseNotOperand(N00, N01,
+ /* AllowUndefs */ false)) {
+ if (peekThroughResize(NotOperand) == N1Resized)
+ return DAG.getNode(ISD::OR, SDLoc(N), VT,
+ DAG.getZExtOrTrunc(N01, SDLoc(N), VT), N1);
+ }
+ }
+
+ if (N0.getOpcode() == ISD::XOR) {
+ // fold or (xor x, y), x --> or x, y
+ // or (xor x, y), (x and/or y) --> or x, y
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ if (N00 == N1)
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N01, N1);
+ if (N01 == N1)
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N00, N1);
+
+ if (N1.getOpcode() == ISD::AND || N1.getOpcode() == ISD::OR) {
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+ if ((N00 == N10 && N01 == N11) || (N00 == N11 && N01 == N10))
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N00, N01);
+ }
+ }
+
+ if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
+ return R;
+
+ auto peekThroughZext = [](SDValue V) {
+ if (V->getOpcode() == ISD::ZERO_EXTEND)
+ return V->getOperand(0);
+ return V;
+ };
+
+ // (fshl X, ?, Y) | (shl X, Y) --> fshl X, ?, Y
+ if (N0.getOpcode() == ISD::FSHL && N1.getOpcode() == ISD::SHL &&
+ N0.getOperand(0) == N1.getOperand(0) &&
+ peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
+ return N0;
+
+ // (fshr ?, X, Y) | (srl X, Y) --> fshr ?, X, Y
+ if (N0.getOpcode() == ISD::FSHR && N1.getOpcode() == ISD::SRL &&
+ N0.getOperand(1) == N1.getOperand(0) &&
+ peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
+ return N0;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N1.getValueType();
+
+ // x | x --> x
+ if (N0 == N1)
+ return N0;
+
+ // fold (or c1, c2) -> c1|c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+ return FoldedVOp;
+
+ // fold (or x, 0) -> x, vector edition
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ return N0;
+
+ // fold (or x, -1) -> -1, vector edition
+ if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
+      // do not return N1, because an undef node may exist in N1
+ return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
+
+ // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
+ // Do this only if the resulting type / shuffle is legal.
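+    // For example:
+    //   shuffle(A, zero, <0,-1,2,-1>) | shuffle(B, zero, <-1,1,-1,3>)
+    //     -> shuffle(A, B, <0,5,2,7>)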
+ auto *SV0 = dyn_cast<ShuffleVectorSDNode>(N0);
+ auto *SV1 = dyn_cast<ShuffleVectorSDNode>(N1);
+ if (SV0 && SV1 && TLI.isTypeLegal(VT)) {
+ bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
+ bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
+ bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
+ bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
+ // Ensure both shuffles have a zero input.
+ if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
+ assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
+ assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
+ bool CanFold = true;
+ int NumElts = VT.getVectorNumElements();
+ SmallVector<int, 4> Mask(NumElts, -1);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M0 = SV0->getMaskElt(i);
+ int M1 = SV1->getMaskElt(i);
+
+ // Determine if either index is pointing to a zero vector.
+ bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
+ bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));
+
+          // If one element is zero and the other side is undef, keep undef.
+ // This also handles the case that both are undef.
+ if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0))
+ continue;
+
+ // Make sure only one of the elements is zero.
+ if (M0Zero == M1Zero) {
+ CanFold = false;
+ break;
+ }
+
+ assert((M0 >= 0 || M1 >= 0) && "Undef index!");
+
+ // We have a zero and non-zero element. If the non-zero came from
+ // SV0 make the index a LHS index. If it came from SV1, make it
+ // a RHS index. We need to mod by NumElts because we don't care
+ // which operand it came from in the original shuffles.
+ Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
+ }
+
+ if (CanFold) {
+ SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
+ SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
+
+ SDValue LegalShuffle =
+ TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS,
+ Mask, DAG);
+ if (LegalShuffle)
+ return LegalShuffle;
+ }
+ }
+ }
+ }
+
+ // fold (or x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+
+ // fold (or x, -1) -> -1
+ if (isAllOnesConstant(N1))
+ return N1;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // fold (or x, c) -> c iff (x & ~c) == 0
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
+ return N1;
+
+ if (SDValue R = foldAndOrOfSETCC(N, DAG))
+ return R;
+
+ if (SDValue Combined = visitORLike(N0, N1, N))
+ return Combined;
+
+ if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
+ return Combined;
+
+ // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
+ if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
+ return BSwap;
+ if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
+ return BSwap;
+
+ // reassociate or
+ if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
+ return ROR;
+
+ // Fold or(vecreduce(x), vecreduce(y)) -> vecreduce(or(x, y))
+ if (SDValue SD = reassociateReduction(ISD::VECREDUCE_OR, ISD::OR, SDLoc(N),
+ VT, N0, N1))
+ return SD;
+
+ // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
+ // iff (c1 & c2) != 0 or c1/c2 are undef.
+ auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
+ return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue());
+ };
+ if (N0.getOpcode() == ISD::AND && N0->hasOneUse() &&
+ ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
+ if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
+ {N1, N0.getOperand(1)})) {
+ SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
+ AddToWorklist(IOR.getNode());
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
+ }
+ }
+
+ if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
+ return Combined;
+ if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
+ return Combined;
+
+ // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
+ if (N0.getOpcode() == N1.getOpcode())
+ if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
+ return V;
+
+ // See if this is some rotate idiom.
+ if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N)))
+ return Rot;
+
+ if (SDValue Load = MatchLoadCombine(N))
+ return Load;
+
+ // Simplify the operands using demanded-bits information.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // If OR can be rewritten into ADD, try combines based on ADD.
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
+ DAG.haveNoCommonBitsSet(N0, N1))
+ if (SDValue Combined = visitADDLike(N))
+ return Combined;
+
+  // Postpone until legalization has completed to avoid interference with bswap
+  // folding.
+ if (LegalOperations || VT.isVector())
+ if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
+ return R;
+
+ return SDValue();
+}
+
+static SDValue stripConstantMask(const SelectionDAG &DAG, SDValue Op,
+ SDValue &Mask) {
+ if (Op.getOpcode() == ISD::AND &&
+ DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
+ Mask = Op.getOperand(1);
+ return Op.getOperand(0);
+ }
+ return Op;
+}
+
+/// Match "(X shl/srl V1) & V2" where V2 may not be present.
+static bool matchRotateHalf(const SelectionDAG &DAG, SDValue Op, SDValue &Shift,
+ SDValue &Mask) {
+ Op = stripConstantMask(DAG, Op, Mask);
+ if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
+ Shift = Op;
+ return true;
+ }
+ return false;
+}
+
+/// Helper function for visitOR to extract the needed side of a rotate idiom
+/// from a shl/srl/mul/udiv. This is meant to handle cases where
+/// InstCombine merged some outside op with one of the shifts from
+/// the rotate pattern.
+/// \returns An empty \c SDValue if the needed shift couldn't be extracted.
+/// Otherwise, returns an expansion of \p ExtractFrom based on the following
+/// patterns:
+///
+/// (or (add v v) (shrl v bitwidth-1)):
+/// expands (add v v) -> (shl v 1)
+///
+/// (or (mul v c0) (shrl (mul v c1) c2)):
+/// expands (mul v c0) -> (shl (mul v c1) c3)
+///
+/// (or (udiv v c0) (shl (udiv v c1) c2)):
+/// expands (udiv v c0) -> (shrl (udiv v c1) c3)
+///
+/// (or (shl v c0) (shrl (shl v c1) c2)):
+/// expands (shl v c0) -> (shl (shl v c1) c3)
+///
+/// (or (shrl v c0) (shl (shrl v c1) c2)):
+/// expands (shrl v c0) -> (shrl (shrl v c1) c3)
+///
+/// Such that in all cases, c3+c2==bitwidth(op v c1).
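+///
+/// For example, on i32, (or (mul v 48), (shrl (mul v 3), 28)) satisfies
+/// 48 == 3 << 4 with 4 + 28 == 32, so (mul v 48) expands to
+/// (shl (mul v 3), 4), allowing the OR to form a rotate of (mul v 3).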
+static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
+ SDValue ExtractFrom, SDValue &Mask,
+ const SDLoc &DL) {
+ assert(OppShift && ExtractFrom && "Empty SDValue");
+ if (OppShift.getOpcode() != ISD::SHL && OppShift.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
+
+ // Value and Type of the shift.
+ SDValue OppShiftLHS = OppShift.getOperand(0);
+ EVT ShiftedVT = OppShiftLHS.getValueType();
+
+ // Amount of the existing shift.
+ ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
+
+ // (add v v) -> (shl v 1)
+ // TODO: Should this be a general DAG canonicalization?
+ if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&
+ ExtractFrom.getOpcode() == ISD::ADD &&
+ ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) &&
+ ExtractFrom.getOperand(0) == OppShiftLHS &&
+ OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1)
+ return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS,
+ DAG.getShiftAmountConstant(1, ShiftedVT, DL));
+
+ // Preconditions:
+ // (or (op0 v c0) (shiftl/r (op0 v c1) c2))
+ //
+ // Find opcode of the needed shift to be extracted from (op0 v c0).
+ unsigned Opcode = ISD::DELETED_NODE;
+ bool IsMulOrDiv = false;
+ // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
+ // opcode or its arithmetic (mul or udiv) variant.
+ auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
+ IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
+ if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
+ return false;
+ Opcode = NeededShift;
+ return true;
+ };
+ // op0 must be either the needed shift opcode or the mul/udiv equivalent
+ // that the needed shift can be extracted from.
+ if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
+ (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
+ return SDValue();
+
+ // op0 must be the same opcode on both sides, have the same LHS argument,
+ // and produce the same value type.
+ if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
+ OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
+ ShiftedVT != ExtractFrom.getValueType())
+ return SDValue();
+
+ // Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
+ ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
+ // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
+ ConstantSDNode *ExtractFromCst =
+ isConstOrConstSplat(ExtractFrom.getOperand(1));
+ // TODO: We should be able to handle non-uniform constant vectors for these values
+ // Check that we have constant values.
+ if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
+ !OppLHSCst || !OppLHSCst->getAPIntValue() ||
+ !ExtractFromCst || !ExtractFromCst->getAPIntValue())
+ return SDValue();
+
+ // Compute the shift amount we need to extract to complete the rotate.
+ const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
+ if (OppShiftCst->getAPIntValue().ugt(VTWidth))
+ return SDValue();
+ APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
+ // Normalize the bitwidth of the two mul/udiv/shift constant operands.
+ APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
+ APInt OppLHSAmt = OppLHSCst->getAPIntValue();
+ zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);
+
+ // Now try extract the needed shift from the ExtractFrom op and see if the
+ // result matches up with the existing shift's LHS op.
+ if (IsMulOrDiv) {
+ // Op to extract from is a mul or udiv by a constant.
+ // Check:
+ // c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0
+ // c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0
+ const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
+ NeededShiftAmt.getZExtValue());
+ APInt ResultAmt;
+ APInt Rem;
+ APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
+ if (Rem != 0 || ResultAmt != OppLHSAmt)
+ return SDValue();
+ } else {
+ // Op to extract from is a shift by a constant.
+ // Check:
+ // c2 - (bitwidth(op0 v c0) - c1) == c0
+ if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
+ ExtractFromAmt.getBitWidth()))
+ return SDValue();
+ }
+
+ // Return the expanded shift op that should allow a rotate to be formed.
+ EVT ShiftVT = OppShift.getOperand(1).getValueType();
+ EVT ResVT = ExtractFrom.getValueType();
+ SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
+ return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
+}
+
+// Return true if we can prove that, whenever Neg and Pos are both in the
+// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
+// for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
+//
+// (or (shift1 X, Neg), (shift2 X, Pos))
+//
+// reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
+// in direction shift1 by Neg. The range [0, EltSize) means that we only need
+// to consider shift amounts with defined behavior.
+//
+// The IsRotate flag should be set when the LHS of both shifts is the same.
+// Otherwise if matching a general funnel shift, it should be clear.
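+//
+// For example, with EltSize == 32 this accepts Neg == (sub 32, Pos) and,
+// because only the low 5 bits of the amount matter, also
+// Neg == (and (sub 0, Pos), 31).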
+static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
+ SelectionDAG &DAG, bool IsRotate) {
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ // If EltSize is a power of 2 then:
+ //
+ // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
+ // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
+ //
+ // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
+ // for the stronger condition:
+ //
+ // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
+ //
+ // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
+ // we can just replace Neg with Neg' for the rest of the function.
+ //
+ // In other cases we check for the even stronger condition:
+ //
+ // Neg == EltSize - Pos [B]
+ //
+ // for all Neg and Pos. Note that the (or ...) then invokes undefined
+ // behavior if Pos == 0 (and consequently Neg == EltSize).
+ //
+ // We could actually use [A] whenever EltSize is a power of 2, but the
+ // only extra cases that it would match are those uninteresting ones
+ // where Neg and Pos are never in range at the same time. E.g. for
+ // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
+ // as well as (sub 32, Pos), but:
+ //
+ // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
+ //
+ // always invokes undefined behavior for 32-bit X.
+ //
+ // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
+ // This allows us to peek through any operations that only affect Mask's
+ // un-demanded bits.
+ //
+ // NOTE: We can only do this when matching operations which won't modify the
+ // least Log2(EltSize) significant bits and not a general funnel shift.
+ unsigned MaskLoBits = 0;
+ if (IsRotate && isPowerOf2_64(EltSize)) {
+ unsigned Bits = Log2_64(EltSize);
+ unsigned NegBits = Neg.getScalarValueSizeInBits();
+ if (NegBits >= Bits) {
+ APInt DemandedBits = APInt::getLowBitsSet(NegBits, Bits);
+ if (SDValue Inner =
+ TLI.SimplifyMultipleUseDemandedBits(Neg, DemandedBits, DAG)) {
+ Neg = Inner;
+ MaskLoBits = Bits;
+ }
+ }
+ }
+
+ // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
+ if (Neg.getOpcode() != ISD::SUB)
+ return false;
+ ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
+ if (!NegC)
+ return false;
+ SDValue NegOp1 = Neg.getOperand(1);
+
+ // On the RHS of [A], if Pos is the result of operation on Pos' that won't
+ // affect Mask's demanded bits, just replace Pos with Pos'. These operations
+ // are redundant for the purpose of the equality.
+ if (MaskLoBits) {
+ unsigned PosBits = Pos.getScalarValueSizeInBits();
+ if (PosBits >= MaskLoBits) {
+ APInt DemandedBits = APInt::getLowBitsSet(PosBits, MaskLoBits);
+ if (SDValue Inner =
+ TLI.SimplifyMultipleUseDemandedBits(Pos, DemandedBits, DAG)) {
+ Pos = Inner;
+ }
+ }
+ }
+
+ // The condition we need is now:
+ //
+ // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
+ //
+ // If NegOp1 == Pos then we need:
+ //
+ // EltSize & Mask == NegC & Mask
+ //
+ // (because "x & Mask" is a truncation and distributes through subtraction).
+ //
+ // We also need to account for a potential truncation of NegOp1 if the amount
+ // has already been legalized to a shift amount type.
+ APInt Width;
+ if ((Pos == NegOp1) ||
+ (NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0)))
+ Width = NegC->getAPIntValue();
+
+ // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
+ // Then the condition we want to prove becomes:
+ //
+ // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
+ //
+ // which, again because "x & Mask" is a truncation, becomes:
+ //
+ // NegC & Mask == (EltSize - PosC) & Mask
+ // EltSize & Mask == (NegC + PosC) & Mask
+ else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
+ if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
+ Width = PosC->getAPIntValue() + NegC->getAPIntValue();
+ else
+ return false;
+ } else
+ return false;
+
+ // Now we just need to check that EltSize & Mask == Width & Mask.
+ if (MaskLoBits)
+ // EltSize & Mask is 0 since Mask is EltSize - 1.
+ return Width.getLoBits(MaskLoBits) == 0;
+ return Width == EltSize;
+}
+
+// A subroutine of MatchRotate used once we have found an OR of two opposite
+// shifts of Shifted. If Neg == <operand size> - Pos then the OR reduces
+// to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
+// former being preferred if supported. InnerPos and InnerNeg are Pos and
+// Neg with outer conversions stripped away.
+SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
+ SDValue Neg, SDValue InnerPos,
+ SDValue InnerNeg, bool HasPos,
+ unsigned PosOpcode, unsigned NegOpcode,
+ const SDLoc &DL) {
+ // fold (or (shl x, (*ext y)),
+ // (srl x, (*ext (sub 32, y)))) ->
+ // (rotl x, y) or (rotr x, (sub 32, y))
+ //
+ // fold (or (shl x, (*ext (sub 32, y))),
+ // (srl x, (*ext y))) ->
+ // (rotr x, y) or (rotl x, (sub 32, y))
+ EVT VT = Shifted.getValueType();
+ if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
+ /*IsRotate*/ true)) {
+ return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
+ HasPos ? Pos : Neg);
+ }
+
+ return SDValue();
+}
+
+// A subroutine of MatchRotate used once we have found an OR of two opposite
+// shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces
+// to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
+// former being preferred if supported. InnerPos and InnerNeg are Pos and
+// Neg with outer conversions stripped away.
+// TODO: Merge with MatchRotatePosNeg.
+SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
+ SDValue Neg, SDValue InnerPos,
+ SDValue InnerNeg, bool HasPos,
+ unsigned PosOpcode, unsigned NegOpcode,
+ const SDLoc &DL) {
+ EVT VT = N0.getValueType();
+ unsigned EltBits = VT.getScalarSizeInBits();
+
+ // fold (or (shl x0, (*ext y)),
+ // (srl x1, (*ext (sub 32, y)))) ->
+ // (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
+ //
+ // fold (or (shl x0, (*ext (sub 32, y))),
+ // (srl x1, (*ext y))) ->
+ // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
+ if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) {
+ return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
+ HasPos ? Pos : Neg);
+ }
+
+ // Matching the shift+xor cases, we can't easily use the xor'd shift amount
+  // so for now just use the PosOpcode case if it's legal.
+ // TODO: When can we use the NegOpcode case?
+ if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
+ auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) {
+ if (Op.getOpcode() != BinOpc)
+ return false;
+ ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1));
+ return Cst && (Cst->getAPIntValue() == Imm);
+ };
+
+ // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
+ // -> (fshl x0, x1, y)
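+    // (For i32, (x1 >> 1) >> (y ^ 31) equals x1 >> (32 - y) for y in [1,31],
+    // and is 0 for y == 0, matching fshl's result of x0 for a zero amount.)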
+ if (IsBinOpImm(N1, ISD::SRL, 1) &&
+ IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
+ InnerPos == InnerNeg.getOperand(0) &&
+ TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) {
+ return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
+ }
+
+ // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
+ // -> (fshr x0, x1, y)
+ if (IsBinOpImm(N0, ISD::SHL, 1) &&
+ IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
+ InnerNeg == InnerPos.getOperand(0) &&
+ TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
+ return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
+ }
+
+ // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
+ // -> (fshr x0, x1, y)
+ // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
+ IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
+ InnerNeg == InnerPos.getOperand(0) &&
+ TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
+ return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
+ }
+ }
+
+ return SDValue();
+}
+
+// MatchRotate - Handle an 'or' of two operands. If this is one of the many
+// idioms for rotate, and if the target supports rotation instructions, generate
+// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
+// with different shifted sources.
+SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
+ EVT VT = LHS.getValueType();
+
+ // The target must have at least one rotate/funnel flavor.
+ // We still try to match rotate by constant pre-legalization.
+ // TODO: Support pre-legalization funnel-shift by constant.
+ bool HasROTL = hasOperation(ISD::ROTL, VT);
+ bool HasROTR = hasOperation(ISD::ROTR, VT);
+ bool HasFSHL = hasOperation(ISD::FSHL, VT);
+ bool HasFSHR = hasOperation(ISD::FSHR, VT);
+
+ // If the type is going to be promoted and the target has enabled custom
+ // lowering for rotate, allow matching rotate by non-constants. Only allow
+ // this for scalar types.
+ if (VT.isScalarInteger() && TLI.getTypeAction(*DAG.getContext(), VT) ==
+ TargetLowering::TypePromoteInteger) {
+ HasROTL |= TLI.getOperationAction(ISD::ROTL, VT) == TargetLowering::Custom;
+ HasROTR |= TLI.getOperationAction(ISD::ROTR, VT) == TargetLowering::Custom;
+ }
+
+ if (LegalOperations && !HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
+ return SDValue();
+
+ // Check for truncated rotate.
+ if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
+ LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
+ assert(LHS.getValueType() == RHS.getValueType());
+ if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
+ }
+ }
+
+ // Match "(X shl/srl V1) & V2" where V2 may not be present.
+ SDValue LHSShift; // The shift.
+ SDValue LHSMask; // AND value if any.
+ matchRotateHalf(DAG, LHS, LHSShift, LHSMask);
+
+ SDValue RHSShift; // The shift.
+ SDValue RHSMask; // AND value if any.
+ matchRotateHalf(DAG, RHS, RHSShift, RHSMask);
+
+ // If neither side matched a rotate half, bail
+ if (!LHSShift && !RHSShift)
+ return SDValue();
+
+ // InstCombine may have combined a constant shl, srl, mul, or udiv with one
+ // side of the rotate, so try to handle that here. In all cases we need to
+ // pass the matched shift from the opposite side to compute the opcode and
+ // needed shift amount to extract. We still want to do this if both sides
+ // matched a rotate half because one half may be a potential overshift that
+  // can be broken down (i.e. if InstCombine merged two shl or srl ops into a
+ // single one).
+
+ // Have LHS side of the rotate, try to extract the needed shift from the RHS.
+ if (LHSShift)
+ if (SDValue NewRHSShift =
+ extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
+ RHSShift = NewRHSShift;
+ // Have RHS side of the rotate, try to extract the needed shift from the LHS.
+ if (RHSShift)
+ if (SDValue NewLHSShift =
+ extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
+ LHSShift = NewLHSShift;
+
+ // If a side is still missing, nothing else we can do.
+ if (!RHSShift || !LHSShift)
+ return SDValue();
+
+ // At this point we've matched or extracted a shift op on each side.
+
+ if (LHSShift.getOpcode() == RHSShift.getOpcode())
+ return SDValue(); // Shifts must disagree.
+
+ // Canonicalize shl to left side in a shl/srl pair.
+ if (RHSShift.getOpcode() == ISD::SHL) {
+ std::swap(LHS, RHS);
+ std::swap(LHSShift, RHSShift);
+ std::swap(LHSMask, RHSMask);
+ }
+
+ // Something has gone wrong - we've lost the shl/srl pair - bail.
+ if (LHSShift.getOpcode() != ISD::SHL || RHSShift.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue LHSShiftArg = LHSShift.getOperand(0);
+ SDValue LHSShiftAmt = LHSShift.getOperand(1);
+ SDValue RHSShiftArg = RHSShift.getOperand(0);
+ SDValue RHSShiftAmt = RHSShift.getOperand(1);
+
+ auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
+ };
+
+ auto ApplyMasks = [&](SDValue Res) {
+ // If there is an AND of either shifted operand, apply it to the result.
+ if (LHSMask.getNode() || RHSMask.getNode()) {
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
+ SDValue Mask = AllOnes;
+
+ if (LHSMask.getNode()) {
+ SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
+ Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
+ DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
+ }
+ if (RHSMask.getNode()) {
+ SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
+ Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
+ DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
+ }
+
+ Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask);
+ }
+
+ return Res;
+ };
+
+ // TODO: Support pre-legalization funnel-shift by constant.
+ bool IsRotate = LHSShiftArg == RHSShiftArg;
+ if (!IsRotate && !(HasFSHL || HasFSHR)) {
+ if (TLI.isTypeLegal(VT) && LHS.hasOneUse() && RHS.hasOneUse() &&
+ ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
+ // Look for a disguised rotate by constant.
+ // The common shifted operand X may be hidden inside another 'or'.
+ SDValue X, Y;
+ auto matchOr = [&X, &Y](SDValue Or, SDValue CommonOp) {
+ if (!Or.hasOneUse() || Or.getOpcode() != ISD::OR)
+ return false;
+ if (CommonOp == Or.getOperand(0)) {
+ X = CommonOp;
+ Y = Or.getOperand(1);
+ return true;
+ }
+ if (CommonOp == Or.getOperand(1)) {
+ X = CommonOp;
+ Y = Or.getOperand(0);
+ return true;
+ }
+ return false;
+ };
+
+ SDValue Res;
+ if (matchOr(LHSShiftArg, RHSShiftArg)) {
+ // (shl (X | Y), C1) | (srl X, C2) --> (rotl X, C1) | (shl Y, C1)
+ SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt);
+ SDValue ShlY = DAG.getNode(ISD::SHL, DL, VT, Y, LHSShiftAmt);
+ Res = DAG.getNode(ISD::OR, DL, VT, RotX, ShlY);
+ } else if (matchOr(RHSShiftArg, LHSShiftArg)) {
+ // (shl X, C1) | (srl (X | Y), C2) --> (rotl X, C1) | (srl Y, C2)
+ SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt);
+ SDValue SrlY = DAG.getNode(ISD::SRL, DL, VT, Y, RHSShiftAmt);
+ Res = DAG.getNode(ISD::OR, DL, VT, RotX, SrlY);
+ } else {
+ return SDValue();
+ }
+
+ return ApplyMasks(Res);
+ }
+
+ return SDValue(); // Requires funnel shift support.
+ }
+
+ // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
+ // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
+ // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
+ // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
+ // iff C1+C2 == EltSizeInBits
+ if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
+ SDValue Res;
+ if (IsRotate && (HasROTL || HasROTR || !(HasFSHL || HasFSHR))) {
+ bool UseROTL = !LegalOperations || HasROTL;
+ Res = DAG.getNode(UseROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
+ UseROTL ? LHSShiftAmt : RHSShiftAmt);
+ } else {
+ bool UseFSHL = !LegalOperations || HasFSHL;
+ Res = DAG.getNode(UseFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg,
+ RHSShiftArg, UseFSHL ? LHSShiftAmt : RHSShiftAmt);
+ }
+
+ return ApplyMasks(Res);
+ }
+
+ // Even pre-legalization, we can't easily rotate/funnel-shift by a variable
+ // shift.
+ if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
+ return SDValue();
+
+ // If there is a mask here, and we have a variable shift, we can't be sure
+ // that we're masking out the right stuff.
+ if (LHSMask.getNode() || RHSMask.getNode())
+ return SDValue();
+
+ // If the shift amount is sign/zext/any-extended just peel it off.
+ SDValue LExtOp0 = LHSShiftAmt;
+ SDValue RExtOp0 = RHSShiftAmt;
+ if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
+ (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
+ LExtOp0 = LHSShiftAmt.getOperand(0);
+ RExtOp0 = RHSShiftAmt.getOperand(0);
+ }
+
+ if (IsRotate && (HasROTL || HasROTR)) {
+ SDValue TryL =
+ MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
+ RExtOp0, HasROTL, ISD::ROTL, ISD::ROTR, DL);
+ if (TryL)
+ return TryL;
+
+ SDValue TryR =
+ MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
+ LExtOp0, HasROTR, ISD::ROTR, ISD::ROTL, DL);
+ if (TryR)
+ return TryR;
+ }
+
+ SDValue TryL =
+ MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
+ LExtOp0, RExtOp0, HasFSHL, ISD::FSHL, ISD::FSHR, DL);
+ if (TryL)
+ return TryL;
+
+ SDValue TryR =
+ MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
+ RExtOp0, LExtOp0, HasFSHR, ISD::FSHR, ISD::FSHL, DL);
+ if (TryR)
+ return TryR;
+
+ return SDValue();
+}
+
+/// Recursively traverses the expression calculating the origin of the requested
+/// byte of the given value. Returns std::nullopt if the provider can't be
+/// calculated.
+///
+/// For all the values except the root of the expression, we verify that the
+/// value has exactly one use and if not then return std::nullopt. This way if
+/// the origin of the byte is returned it's guaranteed that the values which
+/// contribute to the byte are not used outside of this expression.
+///
+/// However, there is a special case when dealing with vector loads -- we allow
+/// more than one use if the load is a vector type. Since the values that
+/// contribute to the byte ultimately come from the ExtractVectorElements of the
+/// Load, we don't care if the Load has uses other than ExtractVectorElements,
+/// because those operations are independent from the pattern to be combined.
+/// For vector loads, we simply care that the ByteProviders are adjacent
+/// positions of the same vector, and their index matches the byte that is being
+/// provided. This is captured by the \p VectorIndex algorithm. \p VectorIndex
+/// is the index used in an ExtractVectorElement, and \p StartingIndex is the
+/// byte position we are trying to provide for the LoadCombine. If these do
+/// not match, then we cannot combine the vector loads. \p Index tracks the
+/// byte position we are trying to provide for and is matched against the
+/// shl and load size. The \p Index algorithm ensures the requested byte is
+/// provided for by the pattern, and that the pattern does not over-provide
+/// bytes.
+///
+///
+/// The supported LoadCombine pattern for vector loads is as follows
+/// or
+/// / \
+/// or shl
+/// / \ |
+/// or shl zext
+/// / \ | |
+/// shl zext zext EVE*
+/// | | | |
+/// zext EVE* EVE* LOAD
+/// | | |
+/// EVE* LOAD LOAD
+/// |
+/// LOAD
+///
+/// *ExtractVectorElement
+using SDByteProvider = ByteProvider<SDNode *>;
+
+static const std::optional<SDByteProvider>
+calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
+ std::optional<uint64_t> VectorIndex,
+ unsigned StartingIndex = 0) {
+
+  // A typical i64-by-i8 pattern requires recursion up to a depth of 8 calls.
+ if (Depth == 10)
+ return std::nullopt;
+
+ // Only allow multiple uses if the instruction is a vector load (in which
+ // case we will use the load for every ExtractVectorElement)
+ if (Depth && !Op.hasOneUse() &&
+ (Op.getOpcode() != ISD::LOAD || !Op.getValueType().isVector()))
+ return std::nullopt;
+
+ // Fail to combine if we have encountered anything but a LOAD after handling
+ // an ExtractVectorElement.
+ if (Op.getOpcode() != ISD::LOAD && VectorIndex.has_value())
+ return std::nullopt;
+
+ unsigned BitWidth = Op.getValueSizeInBits();
+ if (BitWidth % 8 != 0)
+ return std::nullopt;
+ unsigned ByteWidth = BitWidth / 8;
+ assert(Index < ByteWidth && "invalid index requested");
+ (void) ByteWidth;
+
+ switch (Op.getOpcode()) {
+ case ISD::OR: {
+ auto LHS =
+ calculateByteProvider(Op->getOperand(0), Index, Depth + 1, VectorIndex);
+ if (!LHS)
+ return std::nullopt;
+ auto RHS =
+ calculateByteProvider(Op->getOperand(1), Index, Depth + 1, VectorIndex);
+ if (!RHS)
+ return std::nullopt;
+
+ if (LHS->isConstantZero())
+ return RHS;
+ if (RHS->isConstantZero())
+ return LHS;
+ return std::nullopt;
+ }
+ case ISD::SHL: {
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!ShiftOp)
+ return std::nullopt;
+
+ uint64_t BitShift = ShiftOp->getZExtValue();
+
+ if (BitShift % 8 != 0)
+ return std::nullopt;
+ uint64_t ByteShift = BitShift / 8;
+
+    // If we are shifting by an amount greater than the index we are trying to
+    // provide, then do not provide anything. Otherwise, subtract the amount we
+    // shifted by from the index.
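+    // For example, for (shl x, 16), byte 2 of the result is byte 0 of x,
+    // while bytes 0 and 1 are known zero.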
+ return Index < ByteShift
+ ? SDByteProvider::getConstantZero()
+ : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
+ Depth + 1, VectorIndex, Index);
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ SDValue NarrowOp = Op->getOperand(0);
+ unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+ if (Index >= NarrowByteWidth)
+ return Op.getOpcode() == ISD::ZERO_EXTEND
+ ? std::optional<SDByteProvider>(
+ SDByteProvider::getConstantZero())
+ : std::nullopt;
+ return calculateByteProvider(NarrowOp, Index, Depth + 1, VectorIndex,
+ StartingIndex);
+ }
+ case ISD::BSWAP:
+ return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
+ Depth + 1, VectorIndex, StartingIndex);
+ case ISD::EXTRACT_VECTOR_ELT: {
+ auto OffsetOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!OffsetOp)
+ return std::nullopt;
+
+ VectorIndex = OffsetOp->getZExtValue();
+
+ SDValue NarrowOp = Op->getOperand(0);
+ unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    // Check to see if the position of the element in the vector corresponds
+    // to the byte we are trying to provide for. In the case of a vector of
+    // i8, this simply means VectorIndex == StartingIndex. For non-i8 cases,
+    // the element will provide a range of bytes. For example, if we have a
+    // vector of i16s, each element provides two bytes (V[1] provides bytes 2
+    // and 3).
+ if (*VectorIndex * NarrowByteWidth > StartingIndex)
+ return std::nullopt;
+ if ((*VectorIndex + 1) * NarrowByteWidth <= StartingIndex)
+ return std::nullopt;
+
+ return calculateByteProvider(Op->getOperand(0), Index, Depth + 1,
+ VectorIndex, StartingIndex);
+ }
+ case ISD::LOAD: {
+ auto L = cast<LoadSDNode>(Op.getNode());
+ if (!L->isSimple() || L->isIndexed())
+ return std::nullopt;
+
+ unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+    // If the width of the load does not reach the byte we are trying to
+    // provide for and it is not a ZEXTLOAD, then the load does not provide
+    // for the byte in question.
+ if (Index >= NarrowByteWidth)
+ return L->getExtensionType() == ISD::ZEXTLOAD
+ ? std::optional<SDByteProvider>(
+ SDByteProvider::getConstantZero())
+ : std::nullopt;
+
+ unsigned BPVectorIndex = VectorIndex.value_or(0U);
+ return SDByteProvider::getSrc(L, Index, BPVectorIndex);
+ }
+ }
+
+ return std::nullopt;
+}
+
+static unsigned littleEndianByteAt(unsigned BW, unsigned i) {
+ return i;
+}
+
+static unsigned bigEndianByteAt(unsigned BW, unsigned i) {
+ return BW - i - 1;
+}
+
+// Check if the byte offsets we are looking at match either a big- or
+// little-endian value load. Return true for big endian, false for little
+// endian, and std::nullopt if the match failed.
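+// For example, byte offsets {0, 1, 2, 3} relative to FirstOffset describe a
+// little-endian load, while {3, 2, 1, 0} describe a big-endian one.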
+static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
+ int64_t FirstOffset) {
+ // The endian can be decided only when it is 2 bytes at least.
+ unsigned Width = ByteOffsets.size();
+ if (Width < 2)
+ return std::nullopt;
+
+ bool BigEndian = true, LittleEndian = true;
+ for (unsigned i = 0; i < Width; i++) {
+ int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
+ LittleEndian &= CurrentByteOffset == littleEndianByteAt(Width, i);
+ BigEndian &= CurrentByteOffset == bigEndianByteAt(Width, i);
+ if (!BigEndian && !LittleEndian)
+ return std::nullopt;
+ }
+
+ assert((BigEndian != LittleEndian) && "It should be either big endian or "
+ "little endian");
+ return BigEndian;
+}
+
+static SDValue stripTruncAndExt(SDValue Value) {
+ switch (Value.getOpcode()) {
+ case ISD::TRUNCATE:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND:
+ return stripTruncAndExt(Value.getOperand(0));
+ }
+ return Value;
+}
+
+/// Match a pattern where a wide type scalar value is stored by several narrow
+/// stores. Fold it into a single store or a BSWAP and a store if the target
+/// supports it.
+///
+/// Assuming little endian target:
+/// i8 *p = ...
+/// i32 val = ...
+/// p[0] = (val >> 0) & 0xFF;
+/// p[1] = (val >> 8) & 0xFF;
+/// p[2] = (val >> 16) & 0xFF;
+/// p[3] = (val >> 24) & 0xFF;
+/// =>
+/// *((i32)p) = val;
+///
+/// i8 *p = ...
+/// i32 val = ...
+/// p[0] = (val >> 24) & 0xFF;
+/// p[1] = (val >> 16) & 0xFF;
+/// p[2] = (val >> 8) & 0xFF;
+/// p[3] = (val >> 0) & 0xFF;
+/// =>
+/// *((i32)p) = BSWAP(val);
+SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
+ // The matching looks for "store (trunc x)" patterns that appear early but are
+ // likely to be replaced by truncating store nodes during combining.
+ // TODO: If there is evidence that running this later would help, this
+ // limitation could be removed. Legality checks may need to be added
+ // for the created store and optional bswap/rotate.
+ if (LegalOperations || OptLevel == CodeGenOpt::None)
+ return SDValue();
+
+ // We only handle merging simple stores of 1-4 bytes.
+ // TODO: Allow unordered atomics when wider type is legal (see D66309)
+ EVT MemVT = N->getMemoryVT();
+ if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
+ !N->isSimple() || N->isIndexed())
+ return SDValue();
+
+ // Collect all of the stores in the chain, up to the maximum store width (i64).
+ SDValue Chain = N->getChain();
+ SmallVector<StoreSDNode *, 8> Stores = {N};
+ unsigned NarrowNumBits = MemVT.getScalarSizeInBits();
+ unsigned MaxWideNumBits = 64;
+ unsigned MaxStores = MaxWideNumBits / NarrowNumBits;
+ while (auto *Store = dyn_cast<StoreSDNode>(Chain)) {
+ // All stores must be the same size to ensure that we are writing all of the
+ // bytes in the wide value.
+ // This store should have exactly one use as a chain operand for another
+ // store in the merging set. If there are other chain uses, then the
+ // transform may not be safe because order of loads/stores outside of this
+ // set may not be preserved.
+ // TODO: We could allow multiple sizes by tracking each stored byte.
+ if (Store->getMemoryVT() != MemVT || !Store->isSimple() ||
+ Store->isIndexed() || !Store->hasOneUse())
+ return SDValue();
+ Stores.push_back(Store);
+ Chain = Store->getChain();
+ if (MaxStores < Stores.size())
+ return SDValue();
+ }
+ // There is no reason to continue if we do not have at least a pair of stores.
+ if (Stores.size() < 2)
+ return SDValue();
+
+ // Handle simple types only.
+ LLVMContext &Context = *DAG.getContext();
+ unsigned NumStores = Stores.size();
+ unsigned WideNumBits = NumStores * NarrowNumBits;
+ EVT WideVT = EVT::getIntegerVT(Context, WideNumBits);
+ if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64)
+ return SDValue();
+
+ // Check if all bytes of the source value that we are looking at are stored
+ // to the same base address. Collect offsets from Base address into OffsetMap.
+ SDValue SourceValue;
+ SmallVector<int64_t, 8> OffsetMap(NumStores, INT64_MAX);
+ int64_t FirstOffset = INT64_MAX;
+ StoreSDNode *FirstStore = nullptr;
+ std::optional<BaseIndexOffset> Base;
+ for (auto *Store : Stores) {
+ // All the stores store different parts of the CombinedValue. A truncate is
+ // required to get the partial value.
+ SDValue Trunc = Store->getValue();
+ if (Trunc.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+ // Other than the first/last part, a shift operation is required to get the
+ // offset.
+ int64_t Offset = 0;
+ SDValue WideVal = Trunc.getOperand(0);
+ if ((WideVal.getOpcode() == ISD::SRL || WideVal.getOpcode() == ISD::SRA) &&
+ isa<ConstantSDNode>(WideVal.getOperand(1))) {
+ // The shift amount must be a constant multiple of the narrow type.
+ // It is translated to the offset address in the wide source value "y".
+ //
+ // x = srl y, ShiftAmtC
+ // i8 z = trunc x
+ // store z, ...
+ uint64_t ShiftAmtC = WideVal.getConstantOperandVal(1);
+ if (ShiftAmtC % NarrowNumBits != 0)
+ return SDValue();
+
+ Offset = ShiftAmtC / NarrowNumBits;
+ WideVal = WideVal.getOperand(0);
+ }
+
+ // Stores must share the same source value with different offsets.
+ // Truncates and extends should be stripped to get the single source value.
+ if (!SourceValue)
+ SourceValue = WideVal;
+ else if (stripTruncAndExt(SourceValue) != stripTruncAndExt(WideVal))
+ return SDValue();
+ else if (SourceValue.getValueType() != WideVT) {
+ if (WideVal.getValueType() == WideVT ||
+ WideVal.getScalarValueSizeInBits() >
+ SourceValue.getScalarValueSizeInBits())
+ SourceValue = WideVal;
+ // Give up if the source value type is smaller than the store size.
+ if (SourceValue.getScalarValueSizeInBits() < WideVT.getScalarSizeInBits())
+ return SDValue();
+ }
+
+ // Stores must share the same base address.
+ BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
+ int64_t ByteOffsetFromBase = 0;
+ if (!Base)
+ Base = Ptr;
+ else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
+ return SDValue();
+
+ // Remember the first store.
+ if (ByteOffsetFromBase < FirstOffset) {
+ FirstStore = Store;
+ FirstOffset = ByteOffsetFromBase;
+ }
+ // Map the offset in the store and the offset in the combined value, and
+ // early return if it has been set before.
+ if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
+ return SDValue();
+ OffsetMap[Offset] = ByteOffsetFromBase;
+ }
+
+ assert(FirstOffset != INT64_MAX && "First byte offset must be set");
+ assert(FirstStore && "First store must be set");
+
+ // Check that a store of the wide type is both allowed and fast on the target
+ const DataLayout &Layout = DAG.getDataLayout();
+ unsigned Fast = 0;
+ bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT,
+ *FirstStore->getMemOperand(), &Fast);
+ if (!Allowed || !Fast)
+ return SDValue();
+
+ // Check if the pieces of the value are going to the expected places in memory
+ // to merge the stores.
+ auto checkOffsets = [&](bool MatchLittleEndian) {
+ if (MatchLittleEndian) {
+ for (unsigned i = 0; i != NumStores; ++i)
+ if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset)
+ return false;
+ } else { // MatchBigEndian by reversing loop counter.
+ for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j)
+ if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset)
+ return false;
+ }
+ return true;
+ };
+
+ // Check if the offsets line up for the native data layout of this target.
+ bool NeedBswap = false;
+ bool NeedRotate = false;
+ if (!checkOffsets(Layout.isLittleEndian())) {
+ // Special-case: check if byte offsets line up for the opposite endian.
+ if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian()))
+ NeedBswap = true;
+ else if (NumStores == 2 && checkOffsets(Layout.isBigEndian()))
+ NeedRotate = true;
+ else
+ return SDValue();
+ }
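+ // For instance (illustrative): four i8 stores written in reverse order need
+ // a BSWAP of the i32 source value, while a pair of i16 stores written in
+ // reverse order only needs the i32 source rotated by 16 bits.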
+
+ SDLoc DL(N);
+ if (WideVT != SourceValue.getValueType()) {
+ assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits &&
+ "Unexpected store value to merge");
+ SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue);
+ }
+
+ // Before legalize we can introduce illegal bswaps/rotates which will be later
+ // converted to an explicit bswap sequence. This way we end up with a single
+ // store and byte shuffling instead of several stores and byte shuffling.
+ if (NeedBswap) {
+ SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
+ } else if (NeedRotate) {
+ assert(WideNumBits % 2 == 0 && "Unexpected type for rotate");
+ SDValue RotAmt = DAG.getConstant(WideNumBits / 2, DL, WideVT);
+ SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt);
+ }
+
+ SDValue NewStore =
+ DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
+ FirstStore->getPointerInfo(), FirstStore->getAlign());
+
+ // Rely on other DAG combine rules to remove the other individual stores.
+ DAG.ReplaceAllUsesWith(N, NewStore.getNode());
+ return NewStore;
+}
+
+/// Match a pattern where a wide type scalar value is loaded by several narrow
+/// loads and combined by shifts and ors. Fold it into a single load or a load
+/// and a BSWAP if the targets supports it.
+///
+/// Assuming little endian target:
+/// i8 *a = ...
+/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+/// =>
+/// i32 val = *((i32)a)
+///
+/// i8 *a = ...
+/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+/// =>
+/// i32 val = BSWAP(*((i32)a))
+///
+/// TODO: This rule matches complex patterns with OR node roots and doesn't
+/// interact well with the worklist mechanism. When a part of the pattern is
+/// updated (e.g. one of the loads) its direct users are put into the worklist,
+/// but the root node of the pattern which triggers the load combine is not
+/// necessarily a direct user of the changed node. For example, once the address
+/// of the t28 load is reassociated, load combine won't be triggered:
+/// t25: i32 = add t4, Constant:i32<2>
+/// t26: i64 = sign_extend t25
+/// t27: i64 = add t2, t26
+/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
+/// t29: i32 = zero_extend t28
+/// t32: i32 = shl t29, Constant:i8<8>
+/// t33: i32 = or t23, t32
+/// As a possible fix visitLoad can check if the load can be a part of a load
+/// combine pattern and add corresponding OR roots to the worklist.
+SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
+ assert(N->getOpcode() == ISD::OR &&
+ "Can only match load combining against OR nodes");
+
+ // Handles simple types only
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+ unsigned ByteWidth = VT.getSizeInBits() / 8;
+
+ bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
+ auto MemoryByteOffset = [&](SDByteProvider P) {
+ assert(P.hasSrc() && "Must be a memory byte provider");
+ auto *Load = cast<LoadSDNode>(P.Src.value());
+
+ unsigned LoadBitWidth = Load->getMemoryVT().getScalarSizeInBits();
+
+ assert(LoadBitWidth % 8 == 0 &&
+ "can only analyze providers for individual bytes not bit");
+ unsigned LoadByteWidth = LoadBitWidth / 8;
+ return IsBigEndianTarget ? bigEndianByteAt(LoadByteWidth, P.DestOffset)
+ : littleEndianByteAt(LoadByteWidth, P.DestOffset);
+ };
+
+ std::optional<BaseIndexOffset> Base;
+ SDValue Chain;
+
+ SmallPtrSet<LoadSDNode *, 8> Loads;
+ std::optional<SDByteProvider> FirstByteProvider;
+ int64_t FirstOffset = INT64_MAX;
+
+ // Check if all the bytes of the OR we are looking at are loaded from the same
+ // base address. Collect byte offsets from the Base address in ByteOffsets.
+ SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
+ unsigned ZeroExtendedBytes = 0;
+ for (int i = ByteWidth - 1; i >= 0; --i) {
+ auto P =
+ calculateByteProvider(SDValue(N, 0), i, 0, /*VectorIndex*/ std::nullopt,
+ /*StartingIndex*/ i);
+ if (!P)
+ return SDValue();
+
+ if (P->isConstantZero()) {
+ // It's OK for the N most significant bytes to be 0; we can just
+ // zero-extend the load.
+ if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i)))
+ return SDValue();
+ continue;
+ }
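+ // For example (illustrative): for an i32 OR where only a[0] | (a[1] << 8) is
+ // present, the two most significant bytes are constant zero and the pattern
+ // is matched as a zero-extending i16 load.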
+ assert(P->hasSrc() && "provenance should either be memory or zero");
+ auto *L = cast<LoadSDNode>(P->Src.value());
+
+ // All loads must share the same chain
+ SDValue LChain = L->getChain();
+ if (!Chain)
+ Chain = LChain;
+ else if (Chain != LChain)
+ return SDValue();
+
+ // Loads must share the same base address
+ BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
+ int64_t ByteOffsetFromBase = 0;
+
+ // For vector loads, the expected load combine pattern will have an
+ // ExtractElement for each index in the vector. While each of these
+ // ExtractElements will be accessing the same base address as determined
+ // by the load instruction, the actual bytes they interact with will differ
+ // due to different ExtractElement indices. To accurately determine the
+ // byte position of an ExtractElement, we offset the base load ptr with
+ // the index multiplied by the byte size of each element in the vector.
+ if (L->getMemoryVT().isVector()) {
+ unsigned LoadWidthInBit = L->getMemoryVT().getScalarSizeInBits();
+ if (LoadWidthInBit % 8 != 0)
+ return SDValue();
+ unsigned ByteOffsetFromVector = P->SrcOffset * LoadWidthInBit / 8;
+ Ptr.addToOffset(ByteOffsetFromVector);
+ }
+
+ if (!Base)
+ Base = Ptr;
+
+ else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
+ return SDValue();
+
+ // Calculate the offset of the current byte from the base address
+ ByteOffsetFromBase += MemoryByteOffset(*P);
+ ByteOffsets[i] = ByteOffsetFromBase;
+
+ // Remember the first byte load
+ if (ByteOffsetFromBase < FirstOffset) {
+ FirstByteProvider = P;
+ FirstOffset = ByteOffsetFromBase;
+ }
+
+ Loads.insert(L);
+ }
+
+ assert(!Loads.empty() && "All the bytes of the value must be loaded from "
+ "memory, so there must be at least one load which produces the value");
+ assert(Base && "Base address of the accessed memory location must be set");
+ assert(FirstOffset != INT64_MAX && "First byte offset must be set");
+
+ bool NeedsZext = ZeroExtendedBytes > 0;
+
+ EVT MemVT =
+ EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8);
+
+ if (!MemVT.isSimple())
+ return SDValue();
+
+ // Before legalization we can introduce illegal loads that are too wide and
+ // will later be split into legal-sized loads. This enables us to combine an
+ // i64-load-by-i8 pattern into a couple of i32 loads on 32-bit targets.
+ if (LegalOperations &&
+ !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
+ MemVT))
+ return SDValue();
+
+ // Check if the bytes of the OR we are looking at match either a big- or a
+ // little-endian value load.
+ std::optional<bool> IsBigEndian = isBigEndian(
+ ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
+ if (!IsBigEndian)
+ return SDValue();
+
+ assert(FirstByteProvider && "must be set");
+
+ // Ensure that the first byte is loaded from the zero offset of the first
+ // load, so the combined value can be loaded from the first load's address.
+ if (MemoryByteOffset(*FirstByteProvider) != 0)
+ return SDValue();
+ auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
+
+ // The node we are looking at matches the pattern; check if we can replace it
+ // with a single (possibly zero-extended) load and a bswap + shift if
+ // needed.
+
+ // If the load needs byte swap check if the target supports it
+ bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
+
+ // Before legalize we can introduce illegal bswaps which will be later
+ // converted to an explicit bswap sequence. This way we end up with a single
+ // load and byte shuffling instead of several loads and byte shuffling.
+ // We do not introduce illegal bswaps when zero-extending as this tends to
+ // introduce too many arithmetic instructions.
+ if (NeedsBswap && (LegalOperations || NeedsZext) &&
+ !TLI.isOperationLegal(ISD::BSWAP, VT))
+ return SDValue();
+
+ // If we need to bswap and zero extend, we have to insert a shift. Check that
+ // it is legal.
+ if (NeedsBswap && NeedsZext && LegalOperations &&
+ !TLI.isOperationLegal(ISD::SHL, VT))
+ return SDValue();
+
+ // Check that a load of the wide type is both allowed and fast on the target
+ unsigned Fast = 0;
+ bool Allowed =
+ TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ *FirstLoad->getMemOperand(), &Fast);
+ if (!Allowed || !Fast)
+ return SDValue();
+
+ SDValue NewLoad =
+ DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT,
+ Chain, FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign());
+
+ // Transfer chain users from old loads to the new load.
+ for (LoadSDNode *L : Loads)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
+
+ if (!NeedsBswap)
+ return NewLoad;
+
+ SDValue ShiftedLoad =
+ NeedsZext
+ ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
+ DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT,
+ SDLoc(N), LegalOperations))
+ : NewLoad;
+ return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
+}
+
+// If the target has andn, bsl, or a similar bit-select instruction,
+// we want to unfold masked merge, with canonical pattern of:
+// | A | |B|
+// ((x ^ y) & m) ^ y
+// | D |
+// Into:
+// (x & m) | (y & ~m)
+// If y is a constant, m is not a 'not', and the 'andn' does not work with
+// immediates, we unfold into a different pattern:
+// ~(~x & m) & (m | y)
+// If x is a constant, m is a 'not', and the 'andn' does not work with
+// immediates, we unfold into a different pattern:
+// (x | ~m) & ~(~m & ~y)
+// NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
+// the very least that breaks andnpd / andnps patterns, and because those
+// patterns are simplified in IR and shouldn't be created in the DAG
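+// A small illustrative example with 4-bit values: for x = 0b1100, y = 0b1010,
+// m = 0b0110, both ((x ^ y) & m) ^ y and (x & m) | (y & ~m) pick bits 2..1
+// from x and bits 3 and 0 from y, yielding 0b1100.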
+SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
+ assert(N->getOpcode() == ISD::XOR);
+
+ // Don't touch 'not' (i.e. where y = -1).
+ if (isAllOnesOrAllOnesSplat(N->getOperand(1)))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // There are 3 commutable operators in the pattern,
+ // so we have to deal with 8 possible variants of the basic pattern.
+ SDValue X, Y, M;
+ auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) {
+ if (And.getOpcode() != ISD::AND || !And.hasOneUse())
+ return false;
+ SDValue Xor = And.getOperand(XorIdx);
+ if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse())
+ return false;
+ SDValue Xor0 = Xor.getOperand(0);
+ SDValue Xor1 = Xor.getOperand(1);
+ // Don't touch 'not' (i.e. where y = -1).
+ if (isAllOnesOrAllOnesSplat(Xor1))
+ return false;
+ if (Other == Xor0)
+ std::swap(Xor0, Xor1);
+ if (Other != Xor1)
+ return false;
+ X = Xor0;
+ Y = Xor1;
+ M = And.getOperand(XorIdx ? 0 : 1);
+ return true;
+ };
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) &&
+ !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0))
+ return SDValue();
+
+ // Don't do anything if the mask is constant. This should not be reachable.
+ // InstCombine should have already unfolded this pattern, and DAGCombiner
+ // probably shouldn't produce it either.
+ if (isa<ConstantSDNode>(M.getNode()))
+ return SDValue();
+
+ // We can transform if the target has AndNot
+ if (!TLI.hasAndNot(M))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // If Y is a constant, check that 'andn' works with immediates, unless M is
+ // a bitwise not that would already allow ANDN to be used.
+ if (!TLI.hasAndNot(Y) && !isBitwiseNot(M)) {
+ assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
+ // If not, we need to do a bit more work to make sure andn is still used.
+ SDValue NotX = DAG.getNOT(DL, X, VT);
+ SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M);
+ SDValue NotLHS = DAG.getNOT(DL, LHS, VT);
+ SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y);
+ return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
+ }
+
+ // If X is a constant and M is a bitwise not, check that 'andn' works with
+ // immediates.
+ if (!TLI.hasAndNot(X) && isBitwiseNot(M)) {
+ assert(TLI.hasAndNot(Y) && "Only mask is a variable? Unreachable.");
+ // If not, we need to do a bit more work to make sure andn is still used.
+ SDValue NotM = M.getOperand(0);
+ SDValue LHS = DAG.getNode(ISD::OR, DL, VT, X, NotM);
+ SDValue NotY = DAG.getNOT(DL, Y, VT);
+ SDValue RHS = DAG.getNode(ISD::AND, DL, VT, NotM, NotY);
+ SDValue NotRHS = DAG.getNOT(DL, RHS, VT);
+ return DAG.getNode(ISD::AND, DL, VT, LHS, NotRHS);
+ }
+
+ SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
+ SDValue NotM = DAG.getNOT(DL, M, VT);
+ SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);
+
+ return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
+}
+
+SDValue DAGCombiner::visitXOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ SDLoc DL(N);
+
+ // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
+ if (N0.isUndef() && N1.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ // fold (xor x, undef) -> undef
+ if (N0.isUndef())
+ return N0;
+ if (N1.isUndef())
+ return N1;
+
+ // fold (xor c1, c2) -> c1^c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // fold (xor x, 0) -> x, vector edition
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ return N0;
+ }
+
+ // fold (xor x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // reassociate xor
+ if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
+ return RXOR;
+
+ // Fold xor(vecreduce(x), vecreduce(y)) -> vecreduce(xor(x, y))
+ if (SDValue SD =
+ reassociateReduction(ISD::VECREDUCE_XOR, ISD::XOR, DL, VT, N0, N1))
+ return SD;
+
+ // fold (a^b) -> (a|b) iff a and b share no bits.
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
+ DAG.haveNoCommonBitsSet(N0, N1))
+ return DAG.getNode(ISD::OR, DL, VT, N0, N1);
+
+ // look for 'add-like' folds:
+ // XOR(N0,MIN_SIGNED_VALUE) == ADD(N0,MIN_SIGNED_VALUE)
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
+ isMinSignedConstant(N1))
+ if (SDValue Combined = visitADDLike(N))
+ return Combined;
+
+ // fold !(x cc y) -> (x !cc y)
+ unsigned N0Opcode = N0.getOpcode();
+ SDValue LHS, RHS, CC;
+ if (TLI.isConstTrueVal(N1) &&
+ isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/ true)) {
+ ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ LHS.getValueType());
+ if (!LegalOperations ||
+ TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
+ switch (N0Opcode) {
+ default:
+ llvm_unreachable("Unhandled SetCC Equivalent!");
+ case ISD::SETCC:
+ return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
+ case ISD::SELECT_CC:
+ return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
+ N0.getOperand(3), NotCC);
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: {
+ if (N0.hasOneUse()) {
+ // FIXME Can we handle multiple uses? Could we token factor the chain
+ // results from the new/old setcc?
+ SDValue SetCC =
+ DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
+ N0.getOperand(0), N0Opcode == ISD::STRICT_FSETCCS);
+ CombineTo(N, SetCC);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
+ recursivelyDeleteUnusedNodes(N0.getNode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
+ if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() &&
+ isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
+ SDValue V = N0.getOperand(0);
+ SDLoc DL0(N0);
+ V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V,
+ DAG.getConstant(1, DL0, V.getValueType()));
+ AddToWorklist(V.getNode());
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V);
+ }
+
+ // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
+ if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
+ (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
+ SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
+ if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) {
+ unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
+ N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
+ N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
+ AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
+ return DAG.getNode(NewOpcode, DL, VT, N00, N01);
+ }
+ }
+ // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
+ if (isAllOnesConstant(N1) && N0.hasOneUse() &&
+ (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
+ SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
+ if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) {
+ unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
+ N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
+ N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
+ AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
+ return DAG.getNode(NewOpcode, DL, VT, N00, N01);
+ }
+ }
+
+ // fold (not (neg x)) -> (add X, -1)
+ // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
+ // Y is a constant or the subtract has a single use.
+ if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
+ isNullConstant(N0.getOperand(0))) {
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
+ DAG.getAllOnesConstant(DL, VT));
+ }
+
+ // fold (not (add X, -1)) -> (neg X)
+ if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD &&
+ isAllOnesOrAllOnesSplat(N0.getOperand(1))) {
+ return DAG.getNegative(N0.getOperand(0), DL, VT);
+ }
+
+ // fold (xor (and x, y), y) -> (and (not x), y)
+ if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) {
+ SDValue X = N0.getOperand(0);
+ SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
+ AddToWorklist(NotX.getNode());
+ return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
+ }
+
+ // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
+ if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
+ SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
+ SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
+ if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
+ SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
+ SDValue S0 = S.getOperand(0);
+ if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0))
+ if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
+ if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1))
+ return DAG.getNode(ISD::ABS, DL, VT, S0);
+ }
+ }
+
+ // fold (xor x, x) -> 0
+ if (N0 == N1)
+ return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
+
+ // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
+ // Here is a concrete example of this equivalence:
+ // i16 x == 14
+ // i16 shl == 1 << 14 == 16384 == 0b0100000000000000
+ // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
+ //
+ // =>
+ //
+ // i16 ~1 == 0b1111111111111110
+ // i16 rol(~1, 14) == 0b1011111111111111
+ //
+ // Some additional tips to help conceptualize this transform:
+ // - Try to see the operation as placing a single zero in a value of all ones.
+ // - There exists no value for x which would allow the result to contain zero.
+ // - Values of x larger than the bitwidth are undefined and do not require a
+ // consistent result.
+ // - Pushing the zero left requires shifting one bits in from the right.
+ // A rotate left of ~1 is a nice way of achieving the desired result.
+ if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL &&
+ isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
+ return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
+ N0.getOperand(1));
+ }
+
+ // Simplify: xor (op x...), (op y...) -> (op (xor x, y))
+ if (N0Opcode == N1.getOpcode())
+ if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
+ return V;
+
+ if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
+ return R;
+ if (SDValue R = foldLogicOfShifts(N, N1, N0, DAG))
+ return R;
+ if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
+ return R;
+
+ // Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable
+ if (SDValue MM = unfoldMaskedMerge(N))
+ return MM;
+
+ // Simplify the expression using non-local knowledge.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
+ return Combined;
+
+ return SDValue();
+}
+
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This is a
+/// throughput improvement.
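+/// As an illustrative example: shl (xor (shl X, 2), Y), 3 becomes
+/// xor (shl X, 5), (shl Y, 3), so the two new shifts can execute independently.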
+static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
+ // Match a one-use bitwise logic op.
+ SDValue LogicOp = Shift->getOperand(0);
+ if (!LogicOp.hasOneUse())
+ return SDValue();
+
+ unsigned LogicOpcode = LogicOp.getOpcode();
+ if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
+ LogicOpcode != ISD::XOR)
+ return SDValue();
+
+ // Find a matching one-use shift by constant.
+ unsigned ShiftOpcode = Shift->getOpcode();
+ SDValue C1 = Shift->getOperand(1);
+ ConstantSDNode *C1Node = isConstOrConstSplat(C1);
+ assert(C1Node && "Expected a shift with constant operand");
+ const APInt &C1Val = C1Node->getAPIntValue();
+ auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
+ const APInt *&ShiftAmtVal) {
+ if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
+ return false;
+
+ ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
+ if (!ShiftCNode)
+ return false;
+
+ // Capture the shifted operand and shift amount value.
+ ShiftOp = V.getOperand(0);
+ ShiftAmtVal = &ShiftCNode->getAPIntValue();
+
+ // Shift amount types do not have to match their operand type, so check that
+ // the constants are the same width.
+ if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
+ return false;
+
+ // The fold is not valid if the sum of the shift values exceeds bitwidth.
+ if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
+ return false;
+
+ return true;
+ };
+
+ // Logic ops are commutative, so check each operand for a match.
+ SDValue X, Y;
+ const APInt *C0Val;
+ if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
+ Y = LogicOp.getOperand(1);
+ else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
+ Y = LogicOp.getOperand(0);
+ else
+ return SDValue();
+
+ // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+ SDLoc DL(Shift);
+ EVT VT = Shift->getValueType(0);
+ EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
+ SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
+ SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
+ SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
+ return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
+}
+
+/// Handle transforms common to the three shifts, when the shift amount is a
+/// constant.
+/// We are looking for: (shift being one of shl/sra/srl)
+/// shift (binop X, C0), C1
+/// And want to transform into:
+/// binop (shift X, C1), (shift C0, C1)
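+/// For example (illustrative): shl (and X, 0xFF), 8 --> and (shl X, 8), 0xFF00.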
+SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
+ assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand");
+
+ // Do not turn a 'not' into a regular xor.
+ if (isBitwiseNot(N->getOperand(0)))
+ return SDValue();
+
+ // The inner binop must be one-use, since we want to replace it.
+ SDValue LHS = N->getOperand(0);
+ if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
+ return SDValue();
+
+ // Fold shift(bitop(shift(x,c1),y), c2) -> bitop(shift(x,c1+c2),shift(y,c2)).
+ if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
+ return R;
+
+ // We want to pull some binops through shifts, so that we have (and (shift))
+ // instead of (shift (and)), likewise for add, or, xor, etc. This sort of
+ // thing happens with address calculations, so it's important to canonicalize
+ // it.
+ switch (LHS.getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::AND:
+ break;
+ case ISD::ADD:
+ if (N->getOpcode() != ISD::SHL)
+ return SDValue(); // only shl(add) not sr[al](add).
+ break;
+ }
+
+ // FIXME: disable this unless the input to the binop is a shift by a constant
+ // or is copy/select. Enable this in other cases when we figure out that it's
+ // exactly profitable.
+ SDValue BinOpLHSVal = LHS.getOperand(0);
+ bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL ||
+ BinOpLHSVal.getOpcode() == ISD::SRA ||
+ BinOpLHSVal.getOpcode() == ISD::SRL) &&
+ isa<ConstantSDNode>(BinOpLHSVal.getOperand(1));
+ bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg ||
+ BinOpLHSVal.getOpcode() == ISD::SELECT;
+
+ if (!IsShiftByConstant && !IsCopyOrSelect)
+ return SDValue();
+
+ if (IsCopyOrSelect && N->hasOneUse())
+ return SDValue();
+
+ // Attempt to fold the constants, shifting the binop RHS by the shift amount.
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ if (SDValue NewRHS = DAG.FoldConstantArithmetic(
+ N->getOpcode(), DL, VT, {LHS.getOperand(1), N->getOperand(1)})) {
+ SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0),
+ N->getOperand(1));
+ return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
+ assert(N->getOpcode() == ISD::TRUNCATE);
+ assert(N->getOperand(0).getOpcode() == ISD::AND);
+
+ // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
+ EVT TruncVT = N->getValueType(0);
+ if (N->hasOneUse() && N->getOperand(0).hasOneUse() &&
+ TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) {
+ SDValue N01 = N->getOperand(0).getOperand(1);
+ if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) {
+ SDLoc DL(N);
+ SDValue N00 = N->getOperand(0).getOperand(0);
+ SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00);
+ SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01);
+ AddToWorklist(Trunc00.getNode());
+ AddToWorklist(Trunc01.getNode());
+ return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitRotate(SDNode *N) {
+ SDLoc dl(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ unsigned Bitsize = VT.getScalarSizeInBits();
+
+ // fold (rot x, 0) -> x
+ if (isNullOrNullSplat(N1))
+ return N0;
+
+ // fold (rot x, c) -> x iff (c % BitSize) == 0
+ if (isPowerOf2_32(Bitsize) && Bitsize > 1) {
+ APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1);
+ if (DAG.MaskedValueIsZero(N1, ModuloMask))
+ return N0;
+ }
+
+ // fold (rot x, c) -> (rot x, c % BitSize)
+ bool OutOfRange = false;
+ auto MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) {
+ OutOfRange |= C->getAPIntValue().uge(Bitsize);
+ return true;
+ };
+ if (ISD::matchUnaryPredicate(N1, MatchOutOfRange) && OutOfRange) {
+ EVT AmtVT = N1.getValueType();
+ SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT);
+ if (SDValue Amt =
+ DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits}))
+ return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt);
+ }
+
+ // rot i16 X, 8 --> bswap X
+ auto *RotAmtC = isConstOrConstSplat(N1);
+ if (RotAmtC && RotAmtC->getAPIntValue() == 8 &&
+ VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT))
+ return DAG.getNode(ISD::BSWAP, dl, VT, N0);
+
+ // Simplify the operands using demanded-bits information.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
+ return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
+ }
+
+ unsigned NextOp = N0.getOpcode();
+
+ // fold (rot* (rot* x, c2), c1)
+ // -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize) + bitsize) % bitsize)
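+ // For example (illustrative, i8): rotl (rotr x, 3), 5 -> rotl x, 2, since
+ // ((5 % 8) - (3 % 8) + 8) % 8 == 2.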
+ if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
+ SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
+ SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
+ if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
+ EVT ShiftVT = C1->getValueType(0);
+ bool SameSide = (N->getOpcode() == NextOp);
+ unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
+ SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
+ SDValue Norm1 = DAG.FoldConstantArithmetic(ISD::UREM, dl, ShiftVT,
+ {N1, BitsizeC});
+ SDValue Norm2 = DAG.FoldConstantArithmetic(ISD::UREM, dl, ShiftVT,
+ {N0.getOperand(1), BitsizeC});
+ if (Norm1 && Norm2)
+ if (SDValue CombinedShift = DAG.FoldConstantArithmetic(
+ CombineOp, dl, ShiftVT, {Norm1, Norm2})) {
+ CombinedShift = DAG.FoldConstantArithmetic(ISD::ADD, dl, ShiftVT,
+ {CombinedShift, BitsizeC});
+ SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
+ ISD::UREM, dl, ShiftVT, {CombinedShift, BitsizeC});
+ return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
+ CombinedShiftNorm);
+ }
+ }
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSHL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (SDValue V = DAG.simplifyShift(N0, N1))
+ return V;
+
+ EVT VT = N0.getValueType();
+ EVT ShiftVT = N1.getValueType();
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+
+ // fold (shl c1, c2) -> c1<<c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+ return FoldedVOp;
+
+ BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
+ // If setcc produces an all-ones true value then:
+ // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
+ if (N1CV && N1CV->isConstant()) {
+ if (N0.getOpcode() == ISD::AND) {
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+ BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);
+
+ if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
+ TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent) {
+ if (SDValue C =
+ DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
+ }
+ }
+ }
+ }
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // if (shl x, c) is known to be zero, return 0
+ if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
+ }
+
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
+ if (N0.getOpcode() == ISD::SHL) {
+ auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).uge(OpSizeInBits);
+ };
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).ult(OpSizeInBits);
+ };
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+ SDLoc DL(N);
+ SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
+ return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
+ }
+ }
+
+ // fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
+ // For this to be valid, the second form must not preserve any of the bits
+ // that are shifted out by the inner shift in the first form. This means
+ // the outer shift size must be >= the number of bits added by the ext.
+ // As a corollary, we don't care what kind of ext it is.
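+ // For example (illustrative): shl (zext i16 (shl X, 8) to i32), 20 becomes
+ // shl (zext X to i32), 28, which is valid because the outer shift amount (20)
+ // is at least the 16 bits added by the extension.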
+ if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND) &&
+ N0.getOperand(0).getOpcode() == ISD::SHL) {
+ SDValue N0Op0 = N0.getOperand(0);
+ SDValue InnerShiftAmt = N0Op0.getOperand(1);
+ EVT InnerVT = N0Op0.getValueType();
+ uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits();
+
+ auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return c2.uge(OpSizeInBits - InnerBitwidth) &&
+ (c1 + c2).uge(OpSizeInBits);
+ };
+ if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return c2.uge(OpSizeInBits - InnerBitwidth) &&
+ (c1 + c2).ult(OpSizeInBits);
+ };
+ if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDLoc DL(N);
+ SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0));
+ SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT);
+ Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1);
+ return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum);
+ }
+ }
+
+ // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
+ // Only fold this if the inner zext has no other uses to avoid increasing
+ // the total number of instructions.
+ if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == ISD::SRL) {
+ SDValue N0Op0 = N0.getOperand(0);
+ SDValue InnerShiftAmt = N0Op0.getOperand(1);
+
+ auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2);
+ return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2);
+ };
+ if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDLoc DL(N);
+ EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType();
+ SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT);
+ NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL);
+ AddToWorklist(NewSHL.getNode());
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
+ }
+ }
+
+ if (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) {
+ auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ const APInt &LHSC = LHS->getAPIntValue();
+ const APInt &RHSC = RHS->getAPIntValue();
+ return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
+ LHSC.getZExtValue() <= RHSC.getZExtValue();
+ };
+
+ SDLoc DL(N);
+
+ // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
+ // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2
+ if (N0->getFlags().hasExact()) {
+ if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
+ return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
+ }
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
+ return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), Diff);
+ }
+ }
+
+ // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) or
+ // (and (srl x, (sub c1, c2)), MASK)
+ // Only fold this if the inner shift has no other uses -- if it does,
+ // folding this will increase the total number of instructions.
+ if (N0.getOpcode() == ISD::SRL &&
+ (N0.getOperand(1) == N1 || N0.hasOneUse()) &&
+ TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
+ SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+ Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N01);
+ Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, Diff);
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
+ return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+ }
+ if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
+ SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+ Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N1);
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
+ return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+ }
+ }
+ }
+
+ // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
+ if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
+ isConstantOrConstantVector(N1, /* No Opaques */ true)) {
+ SDLoc DL(N);
+ SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
+ SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
+ return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
+ }
+
+ // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+ // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+ // Variant of version done on multiply, except mul by a power of 2 is turned
+ // into a shift.
+ if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
+ N0->hasOneUse() &&
+ isConstantOrConstantVector(N1, /* No Opaques */ true) &&
+ isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) &&
+ TLI.isDesirableToCommuteWithShift(N, Level)) {
+ SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
+ SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
+ AddToWorklist(Shl0.getNode());
+ AddToWorklist(Shl1.getNode());
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
+ }
+
+ // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
+ if (N0.getOpcode() == ISD::MUL && N0->hasOneUse()) {
+ SDValue N01 = N0.getOperand(1);
+ if (SDValue Shl =
+ DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, {N01, N1}))
+ return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
+ }
+
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N1C && !N1C->isOpaque())
+ if (SDValue NewSHL = visitShiftByConstant(N))
+ return NewSHL;
+
+ // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)).
+ if (N0.getOpcode() == ISD::VSCALE && N1C) {
+ const APInt &C0 = N0.getConstantOperandAPInt(0);
+ const APInt &C1 = N1C->getAPIntValue();
+ return DAG.getVScale(SDLoc(N), VT, C0 << C1);
+ }
+
+ // Fold (shl step_vector(C0), C1) to (step_vector(C0 << C1)).
+ APInt ShlVal;
+ if (N0.getOpcode() == ISD::STEP_VECTOR &&
+ ISD::isConstantSplatVector(N1.getNode(), ShlVal)) {
+ const APInt &C0 = N0.getConstantOperandAPInt(0);
+ if (ShlVal.ult(C0.getBitWidth())) {
+ APInt NewStep = C0 << ShlVal;
+ return DAG.getStepVector(SDLoc(N), VT, NewStep);
+ }
+ }
+
+ return SDValue();
+}
+
+// Transform a right shift of a multiply into a multiply-high.
+// Examples:
+// (srl (mul (zext i32:$a to i64), (zext i32:$b to i64)), 32) -> (mulhu $a, $b)
+// (sra (mul (sext i32:$a to i64), (sext i32:$b to i64)), 32) -> (mulhs $a, $b)
+static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ "SRL or SRA node is required here!");
+
+ // Check the shift amount. Proceed with the transformation if the shift
+ // amount is constant.
+ ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1));
+ if (!ShiftAmtSrc)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // The operation feeding into the shift must be a multiply.
+ SDValue ShiftOperand = N->getOperand(0);
+ if (ShiftOperand.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ // Both operands must be equivalent extend nodes.
+ SDValue LeftOp = ShiftOperand.getOperand(0);
+ SDValue RightOp = ShiftOperand.getOperand(1);
+
+ bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
+ bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
+
+ if (!IsSignExt && !IsZeroExt)
+ return SDValue();
+
+ EVT NarrowVT = LeftOp.getOperand(0).getValueType();
+ unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
+
+ // return true if U may use the lower bits of its operands
+ auto UserOfLowerBits = [NarrowVTSize](SDNode *U) {
+ if (U->getOpcode() != ISD::SRL && U->getOpcode() != ISD::SRA) {
+ return true;
+ }
+ ConstantSDNode *UShiftAmtSrc = isConstOrConstSplat(U->getOperand(1));
+ if (!UShiftAmtSrc) {
+ return true;
+ }
+ unsigned UShiftAmt = UShiftAmtSrc->getZExtValue();
+ return UShiftAmt < NarrowVTSize;
+ };
+
+ // If the lower part of the MUL is also used and MUL_LOHI is supported, do
+ // not introduce the MULH in favor of MUL_LOHI.
+ unsigned MulLoHiOp = IsSignExt ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
+ if (!ShiftOperand.hasOneUse() &&
+ TLI.isOperationLegalOrCustom(MulLoHiOp, NarrowVT) &&
+ llvm::any_of(ShiftOperand->uses(), UserOfLowerBits)) {
+ return SDValue();
+ }
+
+ SDValue MulhRightOp;
+ if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
+ unsigned ActiveBits = IsSignExt
+ ? Constant->getAPIntValue().getSignificantBits()
+ : Constant->getAPIntValue().getActiveBits();
+ if (ActiveBits > NarrowVTSize)
+ return SDValue();
+ MulhRightOp = DAG.getConstant(
+ Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
+ NarrowVT);
+ } else {
+ if (LeftOp.getOpcode() != RightOp.getOpcode())
+ return SDValue();
+ // Check that the two extend nodes are the same type.
+ if (NarrowVT != RightOp.getOperand(0).getValueType())
+ return SDValue();
+ MulhRightOp = RightOp.getOperand(0);
+ }
+
+ EVT WideVT = LeftOp.getValueType();
+ // Proceed with the transformation if the wide types match.
+ assert((WideVT == RightOp.getValueType()) &&
+ "Cannot have a multiply node with two different operand types.");
+
+ // Proceed with the transformation if the wide type is twice as large
+ // as the narrow type.
+ if (WideVT.getScalarSizeInBits() != 2 * NarrowVTSize)
+ return SDValue();
+
+ // Check the shift amount with the narrow type size.
+ // Proceed with the transformation if the shift amount is the width
+ // of the narrow type.
+ unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
+ if (ShiftAmt != NarrowVTSize)
+ return SDValue();
+
+ // If the operation feeding into the MUL is a sign extend (sext),
+ // we use mulhs. Otherwise, zero extends (zext) use mulhu.
+ unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;
+
+ // Combine to mulh if mulh is legal/custom for the narrow type on the target;
+ // for a vector type, we can instead transform to an acceptable type and rely
+ // on legalization to split/combine the result.
+ if (NarrowVT.isVector()) {
+ EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), NarrowVT);
+ if (TransformVT.getVectorElementType() != NarrowVT.getVectorElementType() ||
+ !TLI.isOperationLegalOrCustom(MulhOpcode, TransformVT))
+ return SDValue();
+ } else {
+ if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT))
+ return SDValue();
+ }
+
+ SDValue Result =
+ DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), MulhRightOp);
+ bool IsSigned = N->getOpcode() == ISD::SRA;
+ return DAG.getExtOrTrunc(IsSigned, Result, DL, WideVT);
+}
+
+// fold (bswap (logic_op(bswap(x),y))) -> logic_op(x,bswap(y))
+// This helper accepts SDNodes with opcode ISD::BSWAP or ISD::BITREVERSE.
+static SDValue foldBitOrderCrossLogicOp(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::BSWAP && Opcode != ISD::BITREVERSE)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ if (ISD::isBitwiseLogicOp(N0.getOpcode()) && N0.hasOneUse()) {
+ SDValue OldLHS = N0.getOperand(0);
+ SDValue OldRHS = N0.getOperand(1);
+
+ // If both operands are bswap/bitreverse, we can ignore their other uses;
+ // otherwise we need to ensure logic_op and bswap/bitreverse(x) have one use.
+ if (OldLHS.getOpcode() == Opcode && OldRHS.getOpcode() == Opcode) {
+ return DAG.getNode(N0.getOpcode(), DL, VT, OldLHS.getOperand(0),
+ OldRHS.getOperand(0));
+ }
+
+ if (OldLHS.getOpcode() == Opcode && OldLHS.hasOneUse()) {
+ SDValue NewBitReorder = DAG.getNode(Opcode, DL, VT, OldRHS);
+ return DAG.getNode(N0.getOpcode(), DL, VT, OldLHS.getOperand(0),
+ NewBitReorder);
+ }
+
+ if (OldRHS.getOpcode() == Opcode && OldRHS.hasOneUse()) {
+ SDValue NewBitReorder = DAG.getNode(Opcode, DL, VT, OldLHS);
+ return DAG.getNode(N0.getOpcode(), DL, VT, NewBitReorder,
+ OldRHS.getOperand(0));
+ }
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSRA(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (SDValue V = DAG.simplifyShift(N0, N1))
+ return V;
+
+ EVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+
+ // fold (sra c1, c2) -> c1 >>s c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ // Arithmetic shifting an all-sign-bit value is a no-op.
+ // fold (sra 0, x) -> 0
+ // fold (sra -1, x) -> -1
+ if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
+ return N0;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+ return FoldedVOp;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // fold (sra (shl x, c1), c1) -> sext_inreg for some c1, if the target
+ // supports sext_inreg.
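+ // For example (illustrative, i32): sra (shl X, 24), 24 becomes
+ // sign_extend_inreg X, i8.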
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
+ unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
+ if (VT.isVector())
+ ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT,
+ VT.getVectorElementCount());
+ if (!LegalOperations ||
+ TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
+ TargetLowering::Legal)
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
+ N0.getOperand(0), DAG.getValueType(ExtVT));
+ // Even if we can't convert to sext_inreg, we might be able to remove
+ // this shift pair if the input is already sign extended.
+ if (DAG.ComputeNumSignBits(N0.getOperand(0)) > N1C->getZExtValue())
+ return N0.getOperand(0);
+ }
+
+ // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
+ // clamp (add c1, c2) to max shift.
+ if (N0.getOpcode() == ISD::SRA) {
+ SDLoc DL(N);
+ EVT ShiftVT = N1.getValueType();
+ EVT ShiftSVT = ShiftVT.getScalarType();
+ SmallVector<SDValue, 16> ShiftValues;
+
+ auto SumOfShifts = [&](ConstantSDNode *LHS, ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ APInt Sum = c1 + c2;
+ unsigned ShiftSum =
+ Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue();
+ ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT));
+ return true;
+ };
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) {
+ SDValue ShiftValue;
+ if (N1.getOpcode() == ISD::BUILD_VECTOR)
+ ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues);
+ else if (N1.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(ShiftValues.size() == 1 &&
+ "Expected matchBinaryPredicate to return one element for "
+ "SPLAT_VECTORs");
+ ShiftValue = DAG.getSplatVector(ShiftVT, DL, ShiftValues[0]);
+ } else
+ ShiftValue = ShiftValues[0];
+ return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue);
+ }
+ }
+
+ // fold (sra (shl X, m), (sub result_size, n))
+ // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
+ // result_size - n != m.
+ // If truncate is free for the target, sext(shl) is likely to result in
+ // better code.
+ if (N0.getOpcode() == ISD::SHL && N1C) {
+ // Get the two constants of the shifts, CN0 = m, CN = n.
+ const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
+ if (N01C) {
+ LLVMContext &Ctx = *DAG.getContext();
+ // Determine what the truncate's result bitsize and type would be.
+ EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());
+
+ if (VT.isVector())
+ TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount());
+
+ // Determine the residual right-shift amount.
+ int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
+
+ // If the shift is not a no-op (in which case this should be just a sign
+ // extend already), the type we truncate to is legal, sign_extend is legal on
+ // that type, and the truncate to that type is both legal and free, perform
+ // the transform.
+ if ((ShiftAmt > 0) &&
+ TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
+ TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
+ TLI.isTruncateFree(VT, TruncVT)) {
+ SDLoc DL(N);
+ SDValue Amt = DAG.getConstant(ShiftAmt, DL,
+ getShiftAmountTy(N0.getOperand(0).getValueType()));
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
+ N0.getOperand(0), Amt);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
+ Shift);
+ return DAG.getNode(ISD::SIGN_EXTEND, DL,
+ N->getValueType(0), Trunc);
+ }
+ }
+ }
+
+ // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
+ // sra (add (shl X, N1C), AddC), N1C -->
+ // sext (add (trunc X to (width - N1C)), AddC')
+ // sra (sub AddC, (shl X, N1C)), N1C -->
+ // sext (sub AddC', (trunc X to (width - N1C)))
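+ // For example, on i64 with a free i64 -> i32 truncate:
+ //   sra (add (shl X, 32), AddC), 32
+ //     --> sext (add (trunc X to i32), trunc (AddC u>> 32) to i32)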
+ if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB) && N1C &&
+ N0.hasOneUse()) {
+ bool IsAdd = N0.getOpcode() == ISD::ADD;
+ SDValue Shl = N0.getOperand(IsAdd ? 0 : 1);
+ if (Shl.getOpcode() == ISD::SHL && Shl.getOperand(1) == N1 &&
+ Shl.hasOneUse()) {
+ // TODO: AddC does not need to be a splat.
+ if (ConstantSDNode *AddC =
+ isConstOrConstSplat(N0.getOperand(IsAdd ? 1 : 0))) {
+ // Determine what the truncate's type would be and ask the target if
+ // that is a free operation.
+ LLVMContext &Ctx = *DAG.getContext();
+ unsigned ShiftAmt = N1C->getZExtValue();
+ EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt);
+ if (VT.isVector())
+ TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount());
+
+ // TODO: The simple type check probably belongs in the default hook
+ // implementation and/or target-specific overrides (because
+ // non-simple types likely require masking when legalized), but
+ // that restriction may conflict with other transforms.
+ if (TruncVT.isSimple() && isTypeLegal(TruncVT) &&
+ TLI.isTruncateFree(VT, TruncVT)) {
+ SDLoc DL(N);
+ SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
+ SDValue ShiftC =
+ DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).trunc(
+ TruncVT.getScalarSizeInBits()),
+ DL, TruncVT);
+ SDValue Add;
+ if (IsAdd)
+ Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC);
+ else
+ Add = DAG.getNode(ISD::SUB, DL, TruncVT, ShiftC, Trunc);
+ return DAG.getSExtOrTrunc(Add, DL, VT);
+ }
+ }
+ }
+ }
+
+ // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
+ return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
+ }
+
+ // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
+ // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
+ // if c1 is equal to the number of bits the trunc removes
+ // TODO - support non-uniform vector shift amounts.
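+ // For example: (sra (trunc (sra x:i64, 32) to i32), 5)
+ //   -> (trunc (sra x, 37) to i32),
+ // since the i64 -> i32 truncate removes exactly 32 bits.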
+ if (N0.getOpcode() == ISD::TRUNCATE &&
+ (N0.getOperand(0).getOpcode() == ISD::SRL ||
+ N0.getOperand(0).getOpcode() == ISD::SRA) &&
+ N0.getOperand(0).hasOneUse() &&
+ N0.getOperand(0).getOperand(1).hasOneUse() && N1C) {
+ SDValue N0Op0 = N0.getOperand(0);
+ if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
+ EVT LargeVT = N0Op0.getValueType();
+ unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits;
+ if (LargeShift->getAPIntValue() == TruncBits) {
+ SDLoc DL(N);
+ EVT LargeShiftVT = getShiftAmountTy(LargeVT);
+ SDValue Amt = DAG.getZExtOrTrunc(N1, DL, LargeShiftVT);
+ Amt = DAG.getNode(ISD::ADD, DL, LargeShiftVT, Amt,
+ DAG.getConstant(TruncBits, DL, LargeShiftVT));
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
+ }
+ }
+ }
+
+ // Simplify, based on bits shifted out of the LHS.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // If the sign bit is known to be zero, switch this to a SRL.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
+
+ if (N1C && !N1C->isOpaque())
+ if (SDValue NewSRA = visitShiftByConstant(N))
+ return NewSRA;
+
+ // Try to transform this shift into a multiply-high if
+ // it matches the appropriate pattern detected in combineShiftToMULH.
+ if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
+ return MULH;
+
+ // Attempt to convert a sra of a load into a narrower sign-extending load.
+ if (SDValue NarrowLoad = reduceLoadWidth(N))
+ return NarrowLoad;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSRL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (SDValue V = DAG.simplifyShift(N0, N1))
+ return V;
+
+ EVT VT = N0.getValueType();
+ EVT ShiftVT = N1.getValueType();
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+
+ // fold (srl c1, c2) -> c1 >>u c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
+ return FoldedVOp;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // if (srl x, c) is known to be zero, return 0
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N1C &&
+ DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
+ if (N0.getOpcode() == ISD::SRL) {
+ auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).uge(OpSizeInBits);
+ };
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ APInt c1 = LHS->getAPIntValue();
+ APInt c2 = RHS->getAPIntValue();
+ zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
+ return (c1 + c2).ult(OpSizeInBits);
+ };
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
+ SDLoc DL(N);
+ SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
+ return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
+ }
+ }
+
+ if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(0).getOpcode() == ISD::SRL) {
+ SDValue InnerShift = N0.getOperand(0);
+ // TODO - support non-uniform vector shift amounts.
+ if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) {
+ uint64_t c1 = N001C->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ EVT InnerShiftVT = InnerShift.getValueType();
+ EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType();
+ uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
+ // srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2)))
+ // This is only valid if the OpSizeInBits + c1 = size of inner shift.
+ if (c1 + OpSizeInBits == InnerShiftSize) {
+ SDLoc DL(N);
+ if (c1 + c2 >= InnerShiftSize)
+ return DAG.getConstant(0, DL, VT);
+ SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
+ SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
+ InnerShift.getOperand(0), NewShiftAmt);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
+ }
+ // In the more general case, we can clear the high bits after the shift:
+ // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
+ if (N0.hasOneUse() && InnerShift.hasOneUse() &&
+ c1 + c2 < InnerShiftSize) {
+ SDLoc DL(N);
+ SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
+ SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
+ InnerShift.getOperand(0), NewShiftAmt);
+ SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
+ OpSizeInBits - c2),
+ DL, InnerShiftVT);
+ SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
+ }
+ }
+ }
+
+ // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2)), MASK) or
+ // (and (srl x, (sub c2, c1)), MASK)
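+ // For example, on i32: (srl (shl x, 12), 4) -> (and (shl x, 8), 0x0FFFFF00),
+ // where the mask keeps exactly the bits that survive both shifts.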
+ if (N0.getOpcode() == ISD::SHL &&
+ (N0.getOperand(1) == N1 || N0->hasOneUse()) &&
+ TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
+ auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
+ ConstantSDNode *RHS) {
+ const APInt &LHSC = LHS->getAPIntValue();
+ const APInt &RHSC = RHS->getAPIntValue();
+ return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
+ LHSC.getZExtValue() <= RHSC.getZExtValue();
+ };
+ if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDLoc DL(N);
+ SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
+ SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+ Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
+ Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
+ return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+ }
+ if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
+ /*AllowUndefs*/ false,
+ /*AllowTypeMismatch*/ true)) {
+ SDLoc DL(N);
+ SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
+ SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
+ SDValue Mask = DAG.getAllOnesConstant(DL, VT);
+ Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
+ return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
+ }
+ }
+
+ // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
+ // TODO - support non-uniform vector shift amounts.
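+ // For example: (srl (any_extend x:i16 to i32), 4)
+ //   -> (and (any_extend (srl x, 4)), 0x0FFFFFFF)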
+ if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+ // Shifting in all undef bits?
+ EVT SmallVT = N0.getOperand(0).getValueType();
+ unsigned BitSize = SmallVT.getScalarSizeInBits();
+ if (N1C->getAPIntValue().uge(BitSize))
+ return DAG.getUNDEF(VT);
+
+ if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
+ uint64_t ShiftAmt = N1C->getZExtValue();
+ SDLoc DL0(N0);
+ SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
+ N0.getOperand(0),
+ DAG.getConstant(ShiftAmt, DL0,
+ getShiftAmountTy(SmallVT)));
+ AddToWorklist(SmallShift.getNode());
+ APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
+ DAG.getConstant(Mask, DL, VT));
+ }
+ }
+
+ // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
+ // bit, which is unmodified by sra.
+ if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) {
+ if (N0.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
+ }
+
+ // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit), and x has a power
+ // of two bitwidth. The "5" represents (log2 (bitwidth x)).
+ if (N1C && N0.getOpcode() == ISD::CTLZ &&
+ isPowerOf2_32(OpSizeInBits) &&
+ N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
+ KnownBits Known = DAG.computeKnownBits(N0.getOperand(0));
+
+ // If any of the input bits are KnownOne, then the input couldn't be all
+ // zeros, thus the result of the srl will always be zero.
+ if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
+
+ // If all of the bits input to the ctlz node are known to be zero, then
+ // the result of the ctlz is "32" and the result of the shift is one.
+ APInt UnknownBits = ~Known.Zero;
+ if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
+
+ // Otherwise, check to see if there is exactly one bit input to the ctlz.
+ if (UnknownBits.isPowerOf2()) {
+ // Okay, we know that only the single bit specified by UnknownBits could be
+ // set on input to the CTLZ node. If this bit is set, the SRL will return 0;
+ // if it is clear, it returns 1. Change the CTLZ/SRL pair to an SRL/XOR pair,
+ // which is likely to simplify more.
+ unsigned ShAmt = UnknownBits.countr_zero();
+ SDValue Op = N0.getOperand(0);
+
+ if (ShAmt) {
+ SDLoc DL(N0);
+ Op = DAG.getNode(ISD::SRL, DL, VT, Op,
+ DAG.getConstant(ShAmt, DL,
+ getShiftAmountTy(Op.getValueType())));
+ AddToWorklist(Op.getNode());
+ }
+
+ SDLoc DL(N);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ Op, DAG.getConstant(1, DL, VT));
+ }
+ }
+
+ // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
+ return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
+ }
+
+ // fold operands of srl based on knowledge that the low bits are not
+ // demanded.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ if (N1C && !N1C->isOpaque())
+ if (SDValue NewSRL = visitShiftByConstant(N))
+ return NewSRL;
+
+ // Attempt to convert a srl of a load into a narrower zero-extending load.
+ if (SDValue NarrowLoad = reduceLoadWidth(N))
+ return NarrowLoad;
+
+ // Here is a common situation. We want to optimize:
+ //
+ // %a = ...
+ // %b = and i32 %a, 2
+ // %c = srl i32 %b, 1
+ // brcond i32 %c ...
+ //
+ // into
+ //
+ // %a = ...
+ // %b = and %a, 2
+ // %c = setcc eq %b, 0
+ // brcond %c ...
+ //
+ // However, after the source operand of the SRL is optimized into an AND, the
+ // SRL itself may not be optimized further. Look for it and add the BRCOND
+ // into the worklist.
+ //
+ // This also tends to happen for binary operations when SimplifyDemandedBits
+ // is involved.
+ //
+ // FIXME: This is unnecessary if we process the DAG in topological order,
+ // which we plan to do. This workaround can be removed once the DAG is
+ // processed in topological order.
+ if (N->hasOneUse()) {
+ SDNode *Use = *N->use_begin();
+
+ // Look past the truncate.
+ if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse())
+ Use = *Use->use_begin();
+
+ if (Use->getOpcode() == ISD::BRCOND || Use->getOpcode() == ISD::AND ||
+ Use->getOpcode() == ISD::OR || Use->getOpcode() == ISD::XOR)
+ AddToWorklist(Use);
+ }
+
+ // Try to transform this shift into a multiply-high if
+ // it matches the appropriate pattern detected in combineShiftToMULH.
+ if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
+ return MULH;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ bool IsFSHL = N->getOpcode() == ISD::FSHL;
+ unsigned BitWidth = VT.getScalarSizeInBits();
+
+ // fold (fshl N0, N1, 0) -> N0
+ // fold (fshr N0, N1, 0) -> N1
+ if (isPowerOf2_32(BitWidth))
+ if (DAG.MaskedValueIsZero(
+ N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1)))
+ return IsFSHL ? N0 : N1;
+
+ auto IsUndefOrZero = [](SDValue V) {
+ return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
+ };
+
+ // TODO - support non-uniform vector shift amounts.
+ if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
+ EVT ShAmtTy = N2.getValueType();
+
+ // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
+ if (Cst->getAPIntValue().uge(BitWidth)) {
+ uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
+ DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
+ }
+
+ unsigned ShAmt = Cst->getZExtValue();
+ if (ShAmt == 0)
+ return IsFSHL ? N0 : N1;
+
+ // fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C)
+ // fold fshr(undef_or_zero, N1, C) -> lshr(N1, C)
+ // fold fshl(N0, undef_or_zero, C) -> shl(N0, C)
+ // fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C)
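+ // For example, with i32 operands and C == 8:
+ //   fshl(0, N1, 8) -> lshr(N1, 24) and fshr(N0, 0, 8) -> shl(N0, 24).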
+ if (IsUndefOrZero(N0))
+ return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1,
+ DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt,
+ SDLoc(N), ShAmtTy));
+ if (IsUndefOrZero(N1))
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
+ DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
+ SDLoc(N), ShAmtTy));
+
+ // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
+ // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
+ // TODO - bigendian support once we have test coverage.
+ // TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
+ // TODO - permit LHS EXTLOAD if extensions are shifted out.
+ if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
+ !DAG.getDataLayout().isBigEndian()) {
+ auto *LHS = dyn_cast<LoadSDNode>(N0);
+ auto *RHS = dyn_cast<LoadSDNode>(N1);
+ if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
+ LHS->getAddressSpace() == RHS->getAddressSpace() &&
+ (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
+ ISD::isNON_EXTLoad(LHS)) {
+ if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
+ SDLoc DL(RHS);
+ uint64_t PtrOff =
+ IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
+ Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff);
+ unsigned Fast = 0;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ RHS->getAddressSpace(), NewAlign,
+ RHS->getMemOperand()->getFlags(), &Fast) &&
+ Fast) {
+ SDValue NewPtr = DAG.getMemBasePlusOffset(
+ RHS->getBasePtr(), TypeSize::Fixed(PtrOff), DL);
+ AddToWorklist(NewPtr.getNode());
+ SDValue Load = DAG.getLoad(
+ VT, DL, RHS->getChain(), NewPtr,
+ RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
+ // Replace the old load's chain with the new load's chain.
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
+ return Load;
+ }
+ }
+ }
+ }
+ }
+
+ // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)
+ // fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2)
+ // iff we know the shift amount is in range.
+ // TODO: when is it worth doing SUB(BW, N2) as well?
+ if (isPowerOf2_32(BitWidth)) {
+ APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1);
+ if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
+ return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2);
+ if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2);
+ }
+
+ // fold (fshl N0, N0, N2) -> (rotl N0, N2)
+ // fold (fshr N0, N0, N2) -> (rotr N0, N2)
+ // TODO: Investigate flipping this rotate if only one is legal; if the funnel
+ // shift is legal as well, we might be better off avoiding the non-constant
+ // (BW - N2).
+ unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
+ if (N0 == N1 && hasOperation(RotOpc, VT))
+ return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
+
+ // Simplify, based on bits shifted out of N0/N1.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSHLSAT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (SDValue V = DAG.simplifyShift(N0, N1))
+ return V;
+
+ EVT VT = N0.getValueType();
+
+ // fold (*shlsat c1, c2) -> c1<<c2
+ if (SDValue C =
+ DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) {
+ // fold (sshlsat x, c) -> (shl x, c)
+ if (N->getOpcode() == ISD::SSHLSAT && N1C &&
+ N1C->getAPIntValue().ult(DAG.ComputeNumSignBits(N0)))
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1);
+
+ // fold (ushlsat x, c) -> (shl x, c)
+ if (N->getOpcode() == ISD::USHLSAT && N1C &&
+ N1C->getAPIntValue().ule(
+ DAG.computeKnownBits(N0).countMinLeadingZeros()))
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1);
+ }
+
+ return SDValue();
+}
+
+// Given an ABS node, detect the following patterns:
+// (ABS (SUB (EXTEND a), (EXTEND b))).
+// (TRUNC (ABS (SUB (EXTEND a), (EXTEND b)))).
+// Generates UABD/SABD instruction.
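+// For example, assuming ABDS is legal for i8:
+//   (abs (sub (sext a:i8 to i32), (sext b:i8 to i32)))
+//     -> (zext (abds a, b) to i32)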
+SDValue DAGCombiner::foldABSToABD(SDNode *N) {
+ EVT SrcVT = N->getValueType(0);
+
+ if (N->getOpcode() == ISD::TRUNCATE)
+ N = N->getOperand(0).getNode();
+
+ if (N->getOpcode() != ISD::ABS)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue AbsOp1 = N->getOperand(0);
+ SDValue Op0, Op1;
+ SDLoc DL(N);
+
+ if (AbsOp1.getOpcode() != ISD::SUB)
+ return SDValue();
+
+ Op0 = AbsOp1.getOperand(0);
+ Op1 = AbsOp1.getOperand(1);
+
+ unsigned Opc0 = Op0.getOpcode();
+ // Check if the operands of the sub are (zero|sign)-extended.
+ if (Opc0 != Op1.getOpcode() ||
+ (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND)) {
+ // fold (abs (sub nsw x, y)) -> abds(x, y)
+ if (AbsOp1->getFlags().hasNoSignedWrap() && hasOperation(ISD::ABDS, VT) &&
+ TLI.preferABDSToABSWithNSW(VT)) {
+ SDValue ABD = DAG.getNode(ISD::ABDS, DL, VT, Op0, Op1);
+ return DAG.getZExtOrTrunc(ABD, DL, SrcVT);
+ }
+ return SDValue();
+ }
+
+ EVT VT1 = Op0.getOperand(0).getValueType();
+ EVT VT2 = Op1.getOperand(0).getValueType();
+ unsigned ABDOpcode = (Opc0 == ISD::SIGN_EXTEND) ? ISD::ABDS : ISD::ABDU;
+
+ // fold abs(sext(x) - sext(y)) -> zext(abds(x, y))
+ // fold abs(zext(x) - zext(y)) -> zext(abdu(x, y))
+ // NOTE: Extensions must be equivalent.
+ if (VT1 == VT2 && hasOperation(ABDOpcode, VT1)) {
+ Op0 = Op0.getOperand(0);
+ Op1 = Op1.getOperand(0);
+ SDValue ABD = DAG.getNode(ABDOpcode, DL, VT1, Op0, Op1);
+ ABD = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, ABD);
+ return DAG.getZExtOrTrunc(ABD, DL, SrcVT);
+ }
+
+ // fold abs(sext(x) - sext(y)) -> abds(sext(x), sext(y))
+ // fold abs(zext(x) - zext(y)) -> abdu(zext(x), zext(y))
+ if (hasOperation(ABDOpcode, VT)) {
+ SDValue ABD = DAG.getNode(ABDOpcode, DL, VT, Op0, Op1);
+ return DAG.getZExtOrTrunc(ABD, DL, SrcVT);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitABS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (abs c1) -> c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::ABS, SDLoc(N), VT, {N0}))
+ return C;
+ // fold (abs (abs x)) -> (abs x)
+ if (N0.getOpcode() == ISD::ABS)
+ return N0;
+ // fold (abs x) -> x iff not-negative
+ if (DAG.SignBitIsZero(N0))
+ return N0;
+
+ if (SDValue ABD = foldABSToABD(N))
+ return ABD;
+
+ // fold (abs (sign_extend_inreg x)) -> (zero_extend (abs (truncate x)))
+ // iff zero_extend/truncate are free.
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ EVT ExtVT = cast<VTSDNode>(N0.getOperand(1))->getVT();
+ if (TLI.isTruncateFree(VT, ExtVT) && TLI.isZExtFree(ExtVT, VT) &&
+ TLI.isTypeDesirableForOp(ISD::ABS, ExtVT) &&
+ hasOperation(ISD::ABS, ExtVT)) {
+ SDLoc DL(N);
+ return DAG.getNode(
+ ISD::ZERO_EXTEND, DL, VT,
+ DAG.getNode(ISD::ABS, DL, ExtVT,
+ DAG.getNode(ISD::TRUNCATE, DL, ExtVT, N0.getOperand(0))));
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBSWAP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // fold (bswap c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::BSWAP, DL, VT, N0);
+ // fold (bswap (bswap x)) -> x
+ if (N0.getOpcode() == ISD::BSWAP)
+ return N0.getOperand(0);
+
+ // Canonicalize bswap(bitreverse(x)) -> bitreverse(bswap(x)). If bitreverse
+ // isn't supported, it will be expanded to bswap followed by a manual reversal
+ // of bits in each byte. By placing bswaps before bitreverse, we can remove
+ // the two bswaps if the bitreverse gets expanded.
+ if (N0.getOpcode() == ISD::BITREVERSE && N0.hasOneUse()) {
+ SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap);
+ }
+
+ // fold (bswap shl(x,c)) -> (zext(bswap(trunc(shl(x,sub(c,bw/2))))))
+ // iff c >= bw/2 (i.e. the lower half of (shl x, c) is known zero)
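+ // For example, on i64 (assuming BSWAP is available on i32 and the truncate
+ // is free):
+ //   (bswap (shl x, 48)) -> (zext (bswap (trunc (shl x, 16) to i32)) to i64)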
+ unsigned BW = VT.getScalarSizeInBits();
+ if (BW >= 32 && N0.getOpcode() == ISD::SHL && N0.hasOneUse()) {
+ auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), BW / 2);
+ if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
+ ShAmt->getZExtValue() >= (BW / 2) &&
+ (ShAmt->getZExtValue() % 16) == 0 && TLI.isTypeLegal(HalfVT) &&
+ TLI.isTruncateFree(VT, HalfVT) &&
+ (!LegalOperations || hasOperation(ISD::BSWAP, HalfVT))) {
+ SDValue Res = N0.getOperand(0);
+ if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2)))
+ Res = DAG.getNode(ISD::SHL, DL, VT, Res,
+ DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT)));
+ Res = DAG.getZExtOrTrunc(Res, DL, HalfVT);
+ Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res);
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+ }
+
+ // Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as
+ // inverse-shift-of-bswap:
+ // bswap (X u<< C) --> (bswap X) u>> C
+ // bswap (X u>> C) --> (bswap X) u<< C
+ if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
+ N0.hasOneUse()) {
+ auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
+ ShAmt->getZExtValue() % 8 == 0) {
+ SDValue NewSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0));
+ unsigned InverseShift = N0.getOpcode() == ISD::SHL ? ISD::SRL : ISD::SHL;
+ return DAG.getNode(InverseShift, DL, VT, NewSwap, N0.getOperand(1));
+ }
+ }
+
+ if (SDValue V = foldBitOrderCrossLogicOp(N, DAG))
+ return V;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (bitreverse c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0);
+ // fold (bitreverse (bitreverse x)) -> x
+ if (N0.getOpcode() == ISD::BITREVERSE)
+ return N0.getOperand(0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTLZ(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ctlz c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);
+
+ // If the value is known never to be zero, switch to the undef version.
+ if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) {
+ if (DAG.isKnownNeverZero(N0))
+ return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ctlz_zero_undef c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTTZ(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (cttz c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);
+
+ // If the value is known never to be zero, switch to the undef version.
+ if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) {
+ if (DAG.isKnownNeverZero(N0))
+ return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (cttz_zero_undef c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTPOP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ctpop c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0);
+ return SDValue();
+}
+
+// FIXME: This should be checking for no signed zeros on individual operands, as
+// well as no nans.
+static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
+ SDValue RHS,
+ const TargetLowering &TLI) {
+ const TargetOptions &Options = DAG.getTarget().Options;
+ EVT VT = LHS.getValueType();
+
+ return Options.NoSignedZerosFPMath && VT.isFloatingPoint() &&
+ TLI.isProfitableToCombineMinNumMaxNum(VT) &&
+ DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS);
+}
+
+static SDValue combineMinNumMaxNumImpl(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True, SDValue False,
+ ISD::CondCode CC,
+ const TargetLowering &TLI,
+ SelectionDAG &DAG) {
+ EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ switch (CC) {
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETULT:
+ case ISD::SETULE: {
+ // Since it's known never NaN to get here already, either fminnum or
+ // fminnum_ieee is OK. Try the IEEE version first, since fminnum is expanded
+ // in terms of it.
+ unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+ if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+ return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
+ unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
+ if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
+ return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+ return SDValue();
+ }
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETUGT:
+ case ISD::SETUGE: {
+ unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+ if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+ return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
+ unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
+ if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
+ return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+ return SDValue();
+ }
+ default:
+ return SDValue();
+ }
+}
+
+/// Generate Min/Max node
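+/// For example, (select (fcmp olt LHS, RHS), LHS, RHS) can become
+/// (fminnum LHS, RHS) when the operands are known never NaN and the target
+/// considers the combine profitable.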
+SDValue DAGCombiner::combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True,
+ SDValue False, ISD::CondCode CC) {
+ if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
+ return combineMinNumMaxNumImpl(DL, VT, LHS, RHS, True, False, CC, TLI, DAG);
+
+ // If we can't directly match this, try to see if we can pull an fneg out of
+ // the select.
+ SDValue NegTrue = TLI.getCheaperOrNeutralNegatedExpression(
+ True, DAG, LegalOperations, ForCodeSize);
+ if (!NegTrue)
+ return SDValue();
+
+ HandleSDNode NegTrueHandle(NegTrue);
+
+ // Try to unfold an fneg from the select if we are comparing the negated
+ // constant.
+ //
+ // select (setcc x, K) (fneg x), -K -> fneg(minnum(x, K))
+ //
+ // TODO: Handle fabs
+ if (LHS == NegTrue) {
+ // Try to pull an fneg out of the compare operand (RHS) as well, so that it
+ // can match the false operand.
+ SDValue NegRHS = TLI.getCheaperOrNeutralNegatedExpression(
+ RHS, DAG, LegalOperations, ForCodeSize);
+ if (NegRHS) {
+ HandleSDNode NegRHSHandle(NegRHS);
+ if (NegRHS == False) {
+ SDValue Combined = combineMinNumMaxNumImpl(DL, VT, LHS, RHS, NegTrue,
+ False, CC, TLI, DAG);
+ if (Combined)
+ return DAG.getNode(ISD::FNEG, DL, VT, Combined);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// If a (v)select has a condition value that is a sign-bit test, try to smear
+/// the condition operand sign-bit across the value width and use it as a mask.
+static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
+ SDValue Cond = N->getOperand(0);
+ SDValue C1 = N->getOperand(1);
+ SDValue C2 = N->getOperand(2);
+ if (!isConstantOrConstantVector(C1) || !isConstantOrConstantVector(C2))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() ||
+ VT != Cond.getOperand(0).getValueType())
+ return SDValue();
+
+ // The inverted-condition + commuted-select variants of these patterns are
+ // canonicalized to these forms in IR.
+ SDValue X = Cond.getOperand(0);
+ SDValue CondC = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
+ isAllOnesOrAllOnesSplat(C2)) {
+ // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
+ SDLoc DL(N);
+ SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
+ return DAG.getNode(ISD::OR, DL, VT, Sra, C1);
+ }
+ if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
+ // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
+ SDLoc DL(N);
+ SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
+ return DAG.getNode(ISD::AND, DL, VT, Sra, C1);
+ }
+ return SDValue();
+}
+
+static bool shouldConvertSelectOfConstantsToMath(const SDValue &Cond, EVT VT,
+ const TargetLowering &TLI) {
+ if (!TLI.convertSelectOfConstantsToMath(VT))
+ return false;
+
+ if (Cond.getOpcode() != ISD::SETCC || !Cond->hasOneUse())
+ return true;
+ if (!TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))
+ return true;
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ if (CC == ISD::SETLT && isNullOrNullSplat(Cond.getOperand(1)))
+ return true;
+ if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(Cond.getOperand(1)))
+ return true;
+
+ return false;
+}
+
+SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
+ SDValue Cond = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ EVT CondVT = Cond.getValueType();
+ SDLoc DL(N);
+
+ if (!VT.isInteger())
+ return SDValue();
+
+ auto *C1 = dyn_cast<ConstantSDNode>(N1);
+ auto *C2 = dyn_cast<ConstantSDNode>(N2);
+ if (!C1 || !C2)
+ return SDValue();
+
+ if (CondVT != MVT::i1 || LegalOperations) {
+ // fold (select Cond, 0, 1) -> (xor Cond, 1)
+ // We can't do this reliably if integer-based booleans have different
+ // contents from floating-point-based booleans. This is because we can't tell
+ // whether we have an integer-based boolean or a floating-point-based boolean
+ // unless we can find the SETCC that produced it and inspect its operands.
+ // This is fairly easy if Cond is the SETCC node, but it can potentially be
+ // undiscoverable (or not reasonably discoverable). For example, it could be
+ // in another basic block or it could require searching a complicated
+ // expression.
+ if (CondVT.isInteger() &&
+ TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ C1->isZero() && C2->isOne()) {
+ SDValue NotCond =
+ DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
+ if (VT.bitsEq(CondVT))
+ return NotCond;
+ return DAG.getZExtOrTrunc(NotCond, DL, VT);
+ }
+
+ return SDValue();
+ }
+
+ // Only do this before legalization to avoid conflicting with target-specific
+ // transforms in the other direction (create a select from a zext/sext). There
+ // is also a target-independent combine here in DAGCombiner in the other
+ // direction for (select Cond, -1, 0) when the condition is not i1.
+ assert(CondVT == MVT::i1 && !LegalOperations);
+
+ // select Cond, 1, 0 --> zext (Cond)
+ if (C1->isOne() && C2->isZero())
+ return DAG.getZExtOrTrunc(Cond, DL, VT);
+
+ // select Cond, -1, 0 --> sext (Cond)
+ if (C1->isAllOnes() && C2->isZero())
+ return DAG.getSExtOrTrunc(Cond, DL, VT);
+
+ // select Cond, 0, 1 --> zext (!Cond)
+ if (C1->isZero() && C2->isOne()) {
+ SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+ NotCond = DAG.getZExtOrTrunc(NotCond, DL, VT);
+ return NotCond;
+ }
+
+ // select Cond, 0, -1 --> sext (!Cond)
+ if (C1->isZero() && C2->isAllOnes()) {
+ SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+ NotCond = DAG.getSExtOrTrunc(NotCond, DL, VT);
+ return NotCond;
+ }
+
+ // Use a target hook because some targets may prefer to transform in the
+ // other direction.
+ if (!shouldConvertSelectOfConstantsToMath(Cond, VT, TLI))
+ return SDValue();
+
+ // For any constants that differ by 1, we can transform the select into
+ // an extend and add.
+ const APInt &C1Val = C1->getAPIntValue();
+ const APInt &C2Val = C2->getAPIntValue();
+
+ // select Cond, C1, C1-1 --> add (zext Cond), C1-1
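+ // For example: select Cond, 5, 4 --> add (zext Cond), 4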
+ if (C1Val - 1 == C2Val) {
+ Cond = DAG.getZExtOrTrunc(Cond, DL, VT);
+ return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
+ }
+
+ // select Cond, C1, C1+1 --> add (sext Cond), C1+1
+ if (C1Val + 1 == C2Val) {
+ Cond = DAG.getSExtOrTrunc(Cond, DL, VT);
+ return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
+ }
+
+ // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
+ if (C1Val.isPowerOf2() && C2Val.isZero()) {
+ Cond = DAG.getZExtOrTrunc(Cond, DL, VT);
+ SDValue ShAmtC =
+ DAG.getShiftAmountConstant(C1Val.exactLogBase2(), VT, DL);
+ return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC);
+ }
+
+ // select Cond, -1, C --> or (sext Cond), C
+ if (C1->isAllOnes()) {
+ Cond = DAG.getSExtOrTrunc(Cond, DL, VT);
+ return DAG.getNode(ISD::OR, DL, VT, Cond, N2);
+ }
+
+ // select Cond, C, -1 --> or (sext (not Cond)), C
+ if (C2->isAllOnes()) {
+ SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+ NotCond = DAG.getSExtOrTrunc(NotCond, DL, VT);
+ return DAG.getNode(ISD::OR, DL, VT, NotCond, N1);
+ }
+
+ if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
+ return V;
+
+ return SDValue();
+}
+
+static SDValue foldBoolSelectToLogic(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT) &&
+ "Expected a (v)select");
+ SDValue Cond = N->getOperand(0);
+ SDValue T = N->getOperand(1), F = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ if (VT != Cond.getValueType() || VT.getScalarSizeInBits() != 1)
+ return SDValue();
+
+ // select Cond, Cond, F --> or Cond, F
+ // select Cond, 1, F --> or Cond, F
+ if (Cond == T || isOneOrOneSplat(T, /* AllowUndefs */ true))
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, Cond, F);
+
+ // select Cond, T, Cond --> and Cond, T
+ // select Cond, T, 0 --> and Cond, T
+ if (Cond == F || isNullOrNullSplat(F, /* AllowUndefs */ true))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, Cond, T);
+
+ // select Cond, T, 1 --> or (not Cond), T
+ if (isOneOrOneSplat(F, /* AllowUndefs */ true)) {
+ SDValue NotCond = DAG.getNOT(SDLoc(N), Cond, VT);
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, NotCond, T);
+ }
+
+ // select Cond, 0, F --> and (not Cond), F
+ if (isNullOrNullSplat(T, /* AllowUndefs */ true)) {
+ SDValue NotCond = DAG.getNOT(SDLoc(N), Cond, VT);
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, NotCond, F);
+ }
+
+ return SDValue();
+}
+
+static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ if (N0.getOpcode() != ISD::SETCC || !N0.hasOneUse())
+ return SDValue();
+
+ SDValue Cond0 = N0.getOperand(0);
+ SDValue Cond1 = N0.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ if (VT != Cond0.getValueType())
+ return SDValue();
+
+ // Match a signbit check of Cond0 as "Cond0 s<0". Swap select operands if the
+ // compare is inverted from that pattern ("Cond0 s> -1").
+ if (CC == ISD::SETLT && isNullOrNullSplat(Cond1))
+ ; // This is the pattern we are looking for.
+ else if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(Cond1))
+ std::swap(N1, N2);
+ else
+ return SDValue();
+
+ // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & N1
+ if (isNullOrNullSplat(N2)) {
+ SDLoc DL(N);
+ SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+ return DAG.getNode(ISD::AND, DL, VT, Sra, N1);
+ }
+
+ // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | N2
+ if (isAllOnesOrAllOnesSplat(N1)) {
+ SDLoc DL(N);
+ SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+ return DAG.getNode(ISD::OR, DL, VT, Sra, N2);
+ }
+
+ // If we have to invert the sign bit mask, only do that transform if the
+ // target has a bitwise 'and not' instruction (the invert is free).
+ // (Cond0 s< 0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & N2
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (isNullOrNullSplat(N1) && TLI.hasAndNot(N1)) {
+ SDLoc DL(N);
+ SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
+ SDValue Not = DAG.getNOT(DL, Sra, VT);
+ return DAG.getNode(ISD::AND, DL, VT, Not, N2);
+ }
+
+ // TODO: There's another pattern in this family, but it may require
+ // implementing hasOrNot() to check for profitability:
+ // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | N2
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSELECT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ EVT VT0 = N0.getValueType();
+ SDLoc DL(N);
+ SDNodeFlags Flags = N->getFlags();
+
+ if (SDValue V = DAG.simplifySelect(N0, N1, N2))
+ return V;
+
+ if (SDValue V = foldBoolSelectToLogic(N, DAG))
+ return V;
+
+ // select (not Cond), N1, N2 -> select Cond, N2, N1
+ if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
+ SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1);
+ SelectOp->setFlags(Flags);
+ return SelectOp;
+ }
+
+ if (SDValue V = foldSelectOfConstants(N))
+ return V;
+
+ // If we can fold this based on the true/false value, do so.
+ if (SimplifySelectOps(N, N1, N2))
+ return SDValue(N, 0); // Don't revisit N.
+
+ if (VT0 == MVT::i1) {
+ // The code in this block deals with the following 2 equivalences:
+ // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
+ // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
+ // The target can specify its preferred form with the
+ // shouldNormalizeToSelectSequence() callback. However, we always transform
+ // to the right if we find that the inner select already exists in the DAG,
+ // and we always transform to the left side if we know that we can further
+ // optimize the combination of the conditions.
+ bool normalizeToSequence =
+ TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
+ // select (and Cond0, Cond1), X, Y
+ // -> select Cond0, (select Cond1, X, Y), Y
+ if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
+ SDValue Cond0 = N0->getOperand(0);
+ SDValue Cond1 = N0->getOperand(1);
+ SDValue InnerSelect =
+ DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags);
+ if (normalizeToSequence || !InnerSelect.use_empty())
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
+ InnerSelect, N2, Flags);
+ // Cleanup on failure.
+ if (InnerSelect.use_empty())
+ recursivelyDeleteUnusedNodes(InnerSelect.getNode());
+ }
+ // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
+ if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
+ SDValue Cond0 = N0->getOperand(0);
+ SDValue Cond1 = N0->getOperand(1);
+ SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(),
+ Cond1, N1, N2, Flags);
+ if (normalizeToSequence || !InnerSelect.use_empty())
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
+ InnerSelect, Flags);
+ // Cleanup on failure.
+ if (InnerSelect.use_empty())
+ recursivelyDeleteUnusedNodes(InnerSelect.getNode());
+ }
+
+ // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
+ if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
+ SDValue N1_0 = N1->getOperand(0);
+ SDValue N1_1 = N1->getOperand(1);
+ SDValue N1_2 = N1->getOperand(2);
+ if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
+ // Create the actual and node if we can generate good code for it.
+ if (!normalizeToSequence) {
+ SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1,
+ N2, Flags);
+ }
+ // Otherwise see if we can optimize the "and" to a better pattern.
+ if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
+ N2, Flags);
+ }
+ }
+ }
+ // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
+ if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
+ SDValue N2_0 = N2->getOperand(0);
+ SDValue N2_1 = N2->getOperand(1);
+ SDValue N2_2 = N2->getOperand(2);
+ if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
+ // Create the actual or node if we can generate good code for it.
+ if (!normalizeToSequence) {
+ SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
+ N2_2, Flags);
+ }
+ // Otherwise see if we can optimize to a better pattern.
+ if (SDValue Combined = visitORLike(N0, N2_0, N))
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
+ N2_2, Flags);
+ }
+ }
+ }
+
+ // Fold selects based on a setcc into other things, such as min/max/abs.
+ if (N0.getOpcode() == ISD::SETCC) {
+ SDValue Cond0 = N0.getOperand(0), Cond1 = N0.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+
+ // select (fcmp lt x, y), x, y -> fminnum x, y
+ // select (fcmp gt x, y), x, y -> fmaxnum x, y
+ //
+ // This is OK if we don't care what happens if either operand is a NaN.
+ if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI))
+ if (SDValue FMinMax =
+ combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2, CC))
+ return FMinMax;
+
+ // Use 'unsigned add with overflow' to optimize an unsigned saturating add.
+ // This is conservatively limited to pre-legal-operations to give targets
+ // a chance to reverse the transform if they want to do that. Also, it is
+ // unlikely that the pattern would be formed late, so it's probably not
+ // worth going through the other checks.
+ if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) &&
+ CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) &&
+ N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) {
+ auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1));
+ auto *NotC = dyn_cast<ConstantSDNode>(Cond1);
+ if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) {
+ // select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) -->
+ // uaddo Cond0, C; select uaddo.1, -1, uaddo.0
+ //
+ // The IR equivalent of this transform would have this form:
+ // %a = add %x, C
+ // %c = icmp ugt %x, ~C
+ // %r = select %c, -1, %a
+ // =>
+ // %u = call {iN,i1} llvm.uadd.with.overflow(%x, C)
+ // %u0 = extractvalue %u, 0
+ // %u1 = extractvalue %u, 1
+ // %r = select %u1, -1, %u0
+ SDVTList VTs = DAG.getVTList(VT, VT0);
+ SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1));
+ return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0));
+ }
+ }
+
+ if (TLI.isOperationLegal(ISD::SELECT_CC, VT) ||
+ (!LegalOperations &&
+ TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) {
+ // Any flags available in a select/setcc fold will be on the setcc as they
+ // migrated from fcmp
+ Flags = N0->getFlags();
+ SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1,
+ N2, N0.getOperand(2));
+ SelectNode->setFlags(Flags);
+ return SelectNode;
+ }
+
+ if (SDValue NewSel = SimplifySelect(DL, N0, N1, N2))
+ return NewSel;
+ }
+
+ if (!VT.isVector())
+ if (SDValue BinOp = foldSelectOfBinops(N))
+ return BinOp;
+
+ return SDValue();
+}
+
+// This function assumes all the vselect's arguments are CONCAT_VECTOR
+// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
+static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
+ RHS.getOpcode() == ISD::CONCAT_VECTORS &&
+ Cond.getOpcode() == ISD::BUILD_VECTOR);
+
+ // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about
+ // binary ones here.
+ if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
+ return SDValue();
+
+ // We're sure we have an even number of elements due to the
+ // concat_vectors we have as arguments to vselect.
+ // Skip BV elements until we find one that's not an UNDEF.
+ // After we find a non-UNDEF element, keep looping until we get to half the
+ // length of the BV and see if all the non-undef nodes are the same.
+ ConstantSDNode *BottomHalf = nullptr;
+ for (int i = 0; i < NumElems / 2; ++i) {
+ if (Cond->getOperand(i)->isUndef())
+ continue;
+
+ if (BottomHalf == nullptr)
+ BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
+ else if (Cond->getOperand(i).getNode() != BottomHalf)
+ return SDValue();
+ }
+
+ // Do the same for the second half of the BuildVector
+ ConstantSDNode *TopHalf = nullptr;
+ for (int i = NumElems / 2; i < NumElems; ++i) {
+ if (Cond->getOperand(i)->isUndef())
+ continue;
+
+ if (TopHalf == nullptr)
+ TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
+ else if (Cond->getOperand(i).getNode() != TopHalf)
+ return SDValue();
+ }
+
+ assert(TopHalf && BottomHalf &&
+ "One half of the selector was all UNDEFs and the other was all the "
+ "same value. This should have been addressed before this function.");
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, DL, VT,
+ BottomHalf->isZero() ? RHS->getOperand(0) : LHS->getOperand(0),
+ TopHalf->isZero() ? RHS->getOperand(1) : LHS->getOperand(1));
+}
+
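+// If the gather/scatter index is an unscaled ADD of a splatted scalar and a
+// vector, fold the splatted value into the (scalar) base pointer so the
+// remaining index is simpler. For example, with a null base pointer:
+//   BasePtr = null, Index = (add (splat_vector X), Y)
+//     --> BasePtr = X, Index = Y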
+bool refineUniformBase(SDValue &BasePtr, SDValue &Index, bool IndexIsScaled,
+ SelectionDAG &DAG, const SDLoc &DL) {
+ if (Index.getOpcode() != ISD::ADD)
+ return false;
+
+ // Only perform the transformation when existing operands can be reused.
+ if (IndexIsScaled)
+ return false;
+
+ if (!isNullConstant(BasePtr) && !Index.hasOneUse())
+ return false;
+
+ EVT VT = BasePtr.getValueType();
+ if (SDValue SplatVal = DAG.getSplatValue(Index.getOperand(0));
+ SplatVal && SplatVal.getValueType() == VT) {
+ if (isNullConstant(BasePtr))
+ BasePtr = SplatVal;
+ else
+ BasePtr = DAG.getNode(ISD::ADD, DL, VT, BasePtr, SplatVal);
+ Index = Index.getOperand(1);
+ return true;
+ }
+ if (SDValue SplatVal = DAG.getSplatValue(Index.getOperand(1));
+ SplatVal && SplatVal.getValueType() == VT) {
+ if (isNullConstant(BasePtr))
+ BasePtr = SplatVal;
+ else
+ BasePtr = DAG.getNode(ISD::ADD, DL, VT, BasePtr, SplatVal);
+ Index = Index.getOperand(0);
+ return true;
+ }
+ return false;
+}
+
+// Fold sext/zext of index into index type.
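+// For example, a gather whose index is (zero_extend X) can instead use X
+// directly with an unsigned index type, when the target reports that the
+// extend is not needed.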
+bool refineIndexType(SDValue &Index, ISD::MemIndexType &IndexType, EVT DataVT,
+ SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // It's always safe to look through zero extends.
+ if (Index.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue Op = Index.getOperand(0);
+ if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) {
+ IndexType = ISD::UNSIGNED_SCALED;
+ Index = Op;
+ return true;
+ }
+ if (ISD::isIndexTypeSigned(IndexType)) {
+ IndexType = ISD::UNSIGNED_SCALED;
+ return true;
+ }
+ }
+
+ // It's only safe to look through sign extends when Index is signed.
+ if (Index.getOpcode() == ISD::SIGN_EXTEND &&
+ ISD::isIndexTypeSigned(IndexType)) {
+ SDValue Op = Index.getOperand(0);
+ if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) {
+ Index = Op;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+SDValue DAGCombiner::visitVPSCATTER(SDNode *N) {
+ VPScatterSDNode *MSC = cast<VPScatterSDNode>(N);
+ SDValue Mask = MSC->getMask();
+ SDValue Chain = MSC->getChain();
+ SDValue Index = MSC->getIndex();
+ SDValue Scale = MSC->getScale();
+ SDValue StoreVal = MSC->getValue();
+ SDValue BasePtr = MSC->getBasePtr();
+ SDValue VL = MSC->getVectorLength();
+ ISD::MemIndexType IndexType = MSC->getIndexType();
+ SDLoc DL(N);
+
+ // Zap scatters with a zero mask.
+ if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
+ return Chain;
+
+ if (refineUniformBase(BasePtr, Index, MSC->isIndexScaled(), DAG, DL)) {
+ SDValue Ops[] = {Chain, StoreVal, BasePtr, Index, Scale, Mask, VL};
+ return DAG.getScatterVP(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
+ DL, Ops, MSC->getMemOperand(), IndexType);
+ }
+
+ if (refineIndexType(Index, IndexType, StoreVal.getValueType(), DAG)) {
+ SDValue Ops[] = {Chain, StoreVal, BasePtr, Index, Scale, Mask, VL};
+ return DAG.getScatterVP(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
+ DL, Ops, MSC->getMemOperand(), IndexType);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
+ MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+ SDValue Mask = MSC->getMask();
+ SDValue Chain = MSC->getChain();
+ SDValue Index = MSC->getIndex();
+ SDValue Scale = MSC->getScale();
+ SDValue StoreVal = MSC->getValue();
+ SDValue BasePtr = MSC->getBasePtr();
+ ISD::MemIndexType IndexType = MSC->getIndexType();
+ SDLoc DL(N);
+
+ // Zap scatters with a zero mask.
+ if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
+ return Chain;
+
+ if (refineUniformBase(BasePtr, Index, MSC->isIndexScaled(), DAG, DL)) {
+ SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
+ DL, Ops, MSC->getMemOperand(), IndexType,
+ MSC->isTruncatingStore());
+ }
+
+ if (refineIndexType(Index, IndexType, StoreVal.getValueType(), DAG)) {
+ SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
+ DL, Ops, MSC->getMemOperand(), IndexType,
+ MSC->isTruncatingStore());
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMSTORE(SDNode *N) {
+ MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+ SDValue Mask = MST->getMask();
+ SDValue Chain = MST->getChain();
+ SDValue Value = MST->getValue();
+ SDValue Ptr = MST->getBasePtr();
+ SDLoc DL(N);
+
+ // Zap masked stores with a zero mask.
+ if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
+ return Chain;
+
+ // Remove a masked store if base pointers and masks are equal.
+ if (MaskedStoreSDNode *MST1 = dyn_cast<MaskedStoreSDNode>(Chain)) {
+ if (MST->isUnindexed() && MST->isSimple() && MST1->isUnindexed() &&
+ MST1->isSimple() && MST1->getBasePtr() == Ptr &&
+ !MST->getBasePtr().isUndef() &&
+ ((Mask == MST1->getMask() && MST->getMemoryVT().getStoreSize() ==
+ MST1->getMemoryVT().getStoreSize()) ||
+ ISD::isConstantSplatVectorAllOnes(Mask.getNode())) &&
+ TypeSize::isKnownLE(MST1->getMemoryVT().getStoreSize(),
+ MST->getMemoryVT().getStoreSize())) {
+ CombineTo(MST1, MST1->getChain());
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
+ // If this is a masked store with an all-ones mask, we can use an unmasked
+ // store.
+ // FIXME: Can we do this for indexed, compressing, or truncating stores?
+ if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() &&
+ !MST->isCompressingStore() && !MST->isTruncatingStore())
+ return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(),
+ MST->getBasePtr(), MST->getPointerInfo(),
+ MST->getOriginalAlign(), MachineMemOperand::MOStore,
+ MST->getAAInfo());
+
+ // Try transforming N to an indexed store.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
+ if (MST->isTruncatingStore() && MST->isUnindexed() &&
+ Value.getValueType().isInteger() &&
+ (!isa<ConstantSDNode>(Value) ||
+ !cast<ConstantSDNode>(Value)->isOpaque())) {
+ APInt TruncDemandedBits =
+ APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
+ MST->getMemoryVT().getScalarSizeInBits());
+
+ // See if we can simplify the operation with
+ // SimplifyDemandedBits, which only works if the value has a single use.
+ if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
+ // Re-visit the store if anything changed and the store hasn't been merged
+ // with another node (N is deleted). SimplifyDemandedBits will add Value's
+ // node back to the worklist if necessary, but we also need to re-visit the
+ // Store node itself.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
+ // If this is a TRUNC followed by a masked store, fold this into a masked
+ // truncating store. We can do this even if this is already a masked
+ // truncstore.
+ // TODO: Try combining to a masked compress store if possible.
+ if ((Value.getOpcode() == ISD::TRUNCATE) && Value->hasOneUse() &&
+ MST->isUnindexed() && !MST->isCompressingStore() &&
+ TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
+ MST->getMemoryVT(), LegalOperations)) {
+ auto Mask = TLI.promoteTargetBoolean(DAG, MST->getMask(),
+ Value.getOperand(0).getValueType());
+ return DAG.getMaskedStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+ MST->getOffset(), Mask, MST->getMemoryVT(),
+ MST->getMemOperand(), MST->getAddressingMode(),
+ /*IsTruncating=*/true);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVPGATHER(SDNode *N) {
+ VPGatherSDNode *MGT = cast<VPGatherSDNode>(N);
+ SDValue Mask = MGT->getMask();
+ SDValue Chain = MGT->getChain();
+ SDValue Index = MGT->getIndex();
+ SDValue Scale = MGT->getScale();
+ SDValue BasePtr = MGT->getBasePtr();
+ SDValue VL = MGT->getVectorLength();
+ ISD::MemIndexType IndexType = MGT->getIndexType();
+ SDLoc DL(N);
+
+ if (refineUniformBase(BasePtr, Index, MGT->isIndexScaled(), DAG, DL)) {
+ SDValue Ops[] = {Chain, BasePtr, Index, Scale, Mask, VL};
+ return DAG.getGatherVP(
+ DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
+ Ops, MGT->getMemOperand(), IndexType);
+ }
+
+ if (refineIndexType(Index, IndexType, N->getValueType(0), DAG)) {
+ SDValue Ops[] = {Chain, BasePtr, Index, Scale, Mask, VL};
+ return DAG.getGatherVP(
+ DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
+ Ops, MGT->getMemOperand(), IndexType);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMGATHER(SDNode *N) {
+ MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
+ SDValue Mask = MGT->getMask();
+ SDValue Chain = MGT->getChain();
+ SDValue Index = MGT->getIndex();
+ SDValue Scale = MGT->getScale();
+ SDValue PassThru = MGT->getPassThru();
+ SDValue BasePtr = MGT->getBasePtr();
+ ISD::MemIndexType IndexType = MGT->getIndexType();
+ SDLoc DL(N);
+
+ // Zap gathers with a zero mask.
+ if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
+ return CombineTo(N, PassThru, MGT->getChain());
+
+ if (refineUniformBase(BasePtr, Index, MGT->isIndexScaled(), DAG, DL)) {
+ SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedGather(
+ DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
+ Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
+ }
+
+ if (refineIndexType(Index, IndexType, N->getValueType(0), DAG)) {
+ SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+ return DAG.getMaskedGather(
+ DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
+ Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMLOAD(SDNode *N) {
+ MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
+ SDValue Mask = MLD->getMask();
+ SDLoc DL(N);
+
+ // Zap masked loads with a zero mask.
+ if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
+ return CombineTo(N, MLD->getPassThru(), MLD->getChain());
+
+ // If this is a masked load with an all ones mask, we can use an unmasked load.
+ // FIXME: Can we do this for indexed, expanding, or extending loads?
+ if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MLD->isUnindexed() &&
+ !MLD->isExpandingLoad() && MLD->getExtensionType() == ISD::NON_EXTLOAD) {
+ SDValue NewLd = DAG.getLoad(
+ N->getValueType(0), SDLoc(N), MLD->getChain(), MLD->getBasePtr(),
+ MLD->getPointerInfo(), MLD->getOriginalAlign(),
+ MachineMemOperand::MOLoad, MLD->getAAInfo(), MLD->getRanges());
+ return CombineTo(N, NewLd, NewLd.getValue(1));
+ }
+
+ // Try transforming N to an indexed load.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// A vector select of 2 constant vectors can be simplified to math/logic to
+/// avoid a variable select instruction and possibly avoid constant loads.
+SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
+ SDValue Cond = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
+ !shouldConvertSelectOfConstantsToMath(Cond, VT, TLI) ||
+ !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
+ !ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
+ return SDValue();
+
+ // Check if we can use the condition value to increment/decrement a single
+ // constant value. This simplifies a select to an add and removes a constant
+ // load/materialization from the general case.
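+ // For example (illustrative constants): vselect Cond, <4, 7>, <3, 6> has
+ // C1 == C2 + 1 in every lane, so it can later become add (zext Cond), <3, 6>.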
+ bool AllAddOne = true;
+ bool AllSubOne = true;
+ unsigned Elts = VT.getVectorNumElements();
+ for (unsigned i = 0; i != Elts; ++i) {
+ SDValue N1Elt = N1.getOperand(i);
+ SDValue N2Elt = N2.getOperand(i);
+ if (N1Elt.isUndef() || N2Elt.isUndef())
+ continue;
+ if (N1Elt.getValueType() != N2Elt.getValueType())
+ continue;
+
+ const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
+ const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
+ if (C1 != C2 + 1)
+ AllAddOne = false;
+ if (C1 != C2 - 1)
+ AllSubOne = false;
+ }
+
+ // Further simplifications for the extra-special cases where the constants are
+ // all 0 or all -1 should be implemented as folds of these patterns.
+ SDLoc DL(N);
+ if (AllAddOne || AllSubOne) {
+ // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
+ // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
+ auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
+ return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
+ }
+
+ // select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C)
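+ // e.g. (illustrative): vselect Cond, splat(8), splat(0)
+ //        --> shl (zext Cond), splat(3)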
+ APInt Pow2C;
+ if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() &&
+ isNullOrNullSplat(N2)) {
+ SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT);
+ SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT);
+ return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
+ }
+
+ if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
+ return V;
+
+ // The general case for select-of-constants:
+ // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
+ // ...but that only makes sense if a vselect is slower than 2 logic ops, so
+ // leave that to a machine-specific pass.
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVSELECT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (SDValue V = DAG.simplifySelect(N0, N1, N2))
+ return V;
+
+ if (SDValue V = foldBoolSelectToLogic(N, DAG))
+ return V;
+
+ // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
+ if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
+ return DAG.getSelect(DL, VT, F, N2, N1);
+
+ // Canonicalize integer abs.
+ // vselect (setg[te] X, 0), X, -X ->
+ // vselect (setgt X, -1), X, -X ->
+ // vselect (setl[te] X, 0), -X, X ->
+ // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
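+ // Worked example (illustrative, i32): for X = -5, Y = sra(X, 31) = -1, so
+ // add(X, Y) = -6 and xor(-6, -1) = 5 = |X|; for X >= 0, Y = 0 and the
+ // expression reduces to X.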
+ if (N0.getOpcode() == ISD::SETCC) {
+ SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ bool isAbs = false;
+ bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
+ (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
+ N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
+ isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
+ else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
+ N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
+ isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
+
+ if (isAbs) {
+ if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
+ return DAG.getNode(ISD::ABS, DL, VT, LHS);
+
+ SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1,
+ DL, getShiftAmountTy(VT)));
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
+ AddToWorklist(Shift.getNode());
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
+ }
+
+ // vselect x, y (fcmp lt x, y) -> fminnum x, y
+ // vselect x, y (fcmp gt x, y) -> fmaxnum x, y
+ //
+ // This is OK if we don't care about what happens if either operand is a
+ // NaN.
+ //
+ if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
+ if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC))
+ return FMinMax;
+ }
+
+ if (SDValue S = PerformMinMaxFpToSatCombine(LHS, RHS, N1, N2, CC, DAG))
+ return S;
+ if (SDValue S = PerformUMinFpToSatCombine(LHS, RHS, N1, N2, CC, DAG))
+ return S;
+
+ // If this select has a condition (setcc) with narrower operands than the
+ // select, try to widen the compare to match the select width.
+ // TODO: This should be extended to handle any constant.
+ // TODO: This could be extended to handle non-loading patterns, but that
+ // requires thorough testing to avoid regressions.
+ if (isNullOrNullSplat(RHS)) {
+ EVT NarrowVT = LHS.getValueType();
+ EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
+ EVT SetCCVT = getSetCCResultType(LHS.getValueType());
+ unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
+ unsigned WideWidth = WideVT.getScalarSizeInBits();
+ bool IsSigned = isSignedIntSetCC(CC);
+ auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
+ SetCCWidth != 1 && SetCCWidth < WideWidth &&
+ TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
+ // Both compare operands can be widened for free. The LHS can use an
+ // extended load, and the RHS is a constant:
+ // vselect (ext (setcc load(X), C)), N1, N2 -->
+ // vselect (setcc extload(X), C'), N1, N2
+ auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
+ SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
+ EVT WideSetCCVT = getSetCCResultType(WideVT);
+ SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
+ return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
+ }
+ }
+
+ // Match VSELECTs with absolute difference patterns.
+ // (vselect (setcc a, b, set?gt), (sub a, b), (sub b, a)) --> (abd? a, b)
+ // (vselect (setcc a, b, set?ge), (sub a, b), (sub b, a)) --> (abd? a, b)
+ // (vselect (setcc a, b, set?lt), (sub b, a), (sub a, b)) --> (abd? a, b)
+ // (vselect (setcc a, b, set?le), (sub b, a), (sub a, b)) --> (abd? a, b)
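+ // (ABDS/ABDU compute the signed/unsigned absolute difference |a - b|, so
+ // both select arms collapse into a single node when the target supports it.)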
+ if (N1.getOpcode() == ISD::SUB && N2.getOpcode() == ISD::SUB &&
+ N1.getOperand(0) == N2.getOperand(1) &&
+ N1.getOperand(1) == N2.getOperand(0)) {
+ bool IsSigned = isSignedIntSetCC(CC);
+ unsigned ABDOpc = IsSigned ? ISD::ABDS : ISD::ABDU;
+ if (hasOperation(ABDOpc, VT)) {
+ switch (CC) {
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ if (LHS == N1.getOperand(0) && RHS == N1.getOperand(1))
+ return DAG.getNode(ABDOpc, DL, VT, LHS, RHS);
+ break;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ if (RHS == N1.getOperand(0) && LHS == N1.getOperand(1))
+ return DAG.getNode(ABDOpc, DL, VT, LHS, RHS);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ // Match VSELECTs into add with unsigned saturation.
+ if (hasOperation(ISD::UADDSAT, VT)) {
+ // Check if one of the arms of the VSELECT is a vector with all bits set.
+ // If it's on the left side, invert the predicate to simplify the logic below.
+ SDValue Other;
+ ISD::CondCode SatCC = CC;
+ if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) {
+ Other = N2;
+ SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
+ } else if (ISD::isConstantSplatVectorAllOnes(N2.getNode())) {
+ Other = N1;
+ }
+
+ if (Other && Other.getOpcode() == ISD::ADD) {
+ SDValue CondLHS = LHS, CondRHS = RHS;
+ SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
+
+ // Canonicalize condition operands.
+ if (SatCC == ISD::SETUGE) {
+ std::swap(CondLHS, CondRHS);
+ SatCC = ISD::SETULE;
+ }
+
+ // We can test against either of the addition operands.
+ // x <= x+y ? x+y : ~0 --> uaddsat x, y
+ // x+y >= x ? x+y : ~0 --> uaddsat x, y
+ if (SatCC == ISD::SETULE && Other == CondRHS &&
+ (OpLHS == CondLHS || OpRHS == CondLHS))
+ return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
+
+ if (OpRHS.getOpcode() == CondRHS.getOpcode() &&
+ (OpRHS.getOpcode() == ISD::BUILD_VECTOR ||
+ OpRHS.getOpcode() == ISD::SPLAT_VECTOR) &&
+ CondLHS == OpLHS) {
+ // If the RHS is a constant we have to reverse the const
+ // canonicalization.
+ // x >= ~C ? x+C : ~0 --> uaddsat x, C
+ auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+ return Cond->getAPIntValue() == ~Op->getAPIntValue();
+ };
+ if (SatCC == ISD::SETULE &&
+ ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
+ return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
+ }
+ }
+ }
+
+ // Match VSELECTs into sub with unsigned saturation.
+ if (hasOperation(ISD::USUBSAT, VT)) {
+ // Check if one of the arms of the VSELECT is a zero vector. If it's on
+ // the left side, invert the predicate to simplify the logic below.
+ SDValue Other;
+ ISD::CondCode SatCC = CC;
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+ Other = N2;
+ SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
+ } else if (ISD::isConstantSplatVectorAllZeros(N2.getNode())) {
+ Other = N1;
+ }
+
+ // zext(x) >= y ? trunc(zext(x) - y) : 0
+ // --> usubsat(trunc(zext(x)),trunc(umin(y,SatLimit)))
+ // zext(x) > y ? trunc(zext(x) - y) : 0
+ // --> usubsat(trunc(zext(x)),trunc(umin(y,SatLimit)))
+ if (Other && Other.getOpcode() == ISD::TRUNCATE &&
+ Other.getOperand(0).getOpcode() == ISD::SUB &&
+ (SatCC == ISD::SETUGE || SatCC == ISD::SETUGT)) {
+ SDValue OpLHS = Other.getOperand(0).getOperand(0);
+ SDValue OpRHS = Other.getOperand(0).getOperand(1);
+ if (LHS == OpLHS && RHS == OpRHS && LHS.getOpcode() == ISD::ZERO_EXTEND)
+ if (SDValue R = getTruncatedUSUBSAT(VT, LHS.getValueType(), LHS, RHS,
+ DAG, DL))
+ return R;
+ }
+
+ if (Other && Other.getNumOperands() == 2) {
+ SDValue CondRHS = RHS;
+ SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
+
+ if (OpLHS == LHS) {
+ // Look for a general sub with unsigned saturation first.
+ // x >= y ? x-y : 0 --> usubsat x, y
+ // x > y ? x-y : 0 --> usubsat x, y
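+ // e.g. (illustrative, i8 lanes): x = 3, y = 10 gives 3 >= 10 -> false, so
+ // the select yields 0, which matches usubsat(3, 10) = 0.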
+ if ((SatCC == ISD::SETUGE || SatCC == ISD::SETUGT) &&
+ Other.getOpcode() == ISD::SUB && OpRHS == CondRHS)
+ return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
+
+ if (OpRHS.getOpcode() == ISD::BUILD_VECTOR ||
+ OpRHS.getOpcode() == ISD::SPLAT_VECTOR) {
+ if (CondRHS.getOpcode() == ISD::BUILD_VECTOR ||
+ CondRHS.getOpcode() == ISD::SPLAT_VECTOR) {
+ // If the RHS is a constant we have to reverse the const
+ // canonicalization.
+ // x > C-1 ? x+-C : 0 --> usubsat x, C
+ auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+ return (!Op && !Cond) ||
+ (Op && Cond &&
+ Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
+ };
+ if (SatCC == ISD::SETUGT && Other.getOpcode() == ISD::ADD &&
+ ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
+ /*AllowUndefs*/ true)) {
+ OpRHS = DAG.getNegative(OpRHS, DL, VT);
+ return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
+ }
+
+ // Another special case: If C was a sign bit, the sub has been
+ // canonicalized into a xor.
+ // FIXME: Would it be better to use computeKnownBits to
+ // determine whether it's safe to decanonicalize the xor?
+ // x s< 0 ? x^C : 0 --> usubsat x, C
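+ // e.g. (illustrative, i8 lanes): with C = 0x80, clearing the sign bit of a
+ // negative x is the same as subtracting 128, while a non-negative x yields
+ // 0, which is exactly usubsat(x, 0x80).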
+ APInt SplatValue;
+ if (SatCC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
+ ISD::isConstantSplatVector(OpRHS.getNode(), SplatValue) &&
+ ISD::isConstantSplatVectorAllZeros(CondRHS.getNode()) &&
+ SplatValue.isSignMask()) {
+ // Note that we have to rebuild the RHS constant here to
+ // ensure we don't rely on particular values of undef lanes.
+ OpRHS = DAG.getConstant(SplatValue, DL, VT);
+ return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (SimplifySelectOps(N, N1, N2))
+ return SDValue(N, 0); // Don't revisit N.
+
+ // Fold (vselect all_ones, N1, N2) -> N1
+ if (ISD::isConstantSplatVectorAllOnes(N0.getNode()))
+ return N1;
+ // Fold (vselect all_zeros, N1, N2) -> N2
+ if (ISD::isConstantSplatVectorAllZeros(N0.getNode()))
+ return N2;
+
+ // The ConvertSelectToConcatVector function assumes that both of the above
+ // checks for (vselect (build_vector all{ones,zeros}) ...) have already been
+ // made and addressed.
+ if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N2.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+ if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
+ return CV;
+ }
+
+ if (SDValue V = foldVSelectOfConstants(N))
+ return V;
+
+ if (hasOperation(ISD::SRA, VT))
+ if (SDValue V = foldVSelectToSignBitSplatMask(N, DAG))
+ return V;
+
+ if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDValue N3 = N->getOperand(3);
+ SDValue N4 = N->getOperand(4);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
+
+ // fold select_cc lhs, rhs, x, x, cc -> x
+ if (N2 == N3)
+ return N2;
+
+ // select_cc bool, 0, x, y, seteq -> select bool, y, x
+ if (CC == ISD::SETEQ && !LegalTypes && N0.getValueType() == MVT::i1 &&
+ isNullConstant(N1))
+ return DAG.getSelect(SDLoc(N), N2.getValueType(), N0, N3, N2);
+
+ // Determine if the condition we're dealing with is constant
+ if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
+ CC, SDLoc(N), false)) {
+ AddToWorklist(SCC.getNode());
+
+ // cond always true -> true val
+ // cond always false -> false val
+ if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode()))
+ return SCCC->isZero() ? N3 : N2;
+
+ // When the condition is UNDEF, just return the first operand. This is
+ // consistent with DAG creation: no setcc node is created in this case.
+ if (SCC->isUndef())
+ return N2;
+
+ // Fold to a simpler select_cc
+ if (SCC.getOpcode() == ISD::SETCC) {
+ SDValue SelectOp = DAG.getNode(
+ ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0),
+ SCC.getOperand(1), N2, N3, SCC.getOperand(2));
+ SelectOp->setFlags(SCC->getFlags());
+ return SelectOp;
+ }
+ }
+
+ // If we can fold this based on the true/false value, do so.
+ if (SimplifySelectOps(N, N2, N3))
+ return SDValue(N, 0); // Don't revisit N.
+
+ // fold select_cc into other things, such as min/max/abs
+ return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
+}
+
+SDValue DAGCombiner::visitSETCC(SDNode *N) {
+ // setcc is very commonly used as an argument to brcond. This pattern
+ // also lends itself to numerous combines and, as a result, it is desirable
+ // to keep the argument to a brcond as a setcc for as long as possible.
+ bool PreferSetCC =
+ N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;
+
+ ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ EVT VT = N->getValueType(0);
+
+ SDValue Combined = SimplifySetCC(VT, N->getOperand(0), N->getOperand(1), Cond,
+ SDLoc(N), !PreferSetCC);
+
+ if (!Combined)
+ return SDValue();
+
+ // If we prefer to have a setcc, and we don't, we'll try our best to
+ // recreate one using rebuildSetCC.
+ if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
+ SDValue NewSetCC = rebuildSetCC(Combined);
+
+ // We don't have anything interesting to combine to.
+ if (NewSetCC.getNode() == N)
+ return SDValue();
+
+ if (NewSetCC)
+ return NewSetCC;
+ }
+
+ return Combined;
+}
+
+SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = N->getOperand(2);
+ SDValue Cond = N->getOperand(3);
+
+ // If Carry is false, fold to a regular SETCC.
+ if (isNullConstant(Carry))
+ return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
+
+ return SDValue();
+}
+
+/// Check whether N satisfies:
+///   N has a single use.
+///   N is a load.
+///   The load is compatible with ExtOpcode, meaning that if the load has an
+///   explicit zero/sign extension, ExtOpcode must have the same extension;
+///   otherwise any ExtOpcode is accepted.
+static bool isCompatibleLoad(SDValue N, unsigned ExtOpcode) {
+ if (!N.hasOneUse())
+ return false;
+
+ if (!isa<LoadSDNode>(N))
+ return false;
+
+ LoadSDNode *Load = cast<LoadSDNode>(N);
+ ISD::LoadExtType LoadExt = Load->getExtensionType();
+ if (LoadExt == ISD::NON_EXTLOAD || LoadExt == ISD::EXTLOAD)
+ return true;
+
+ // Now LoadExt is either SEXTLOAD or ZEXTLOAD, ExtOpcode must have the same
+ // extension.
+ if ((LoadExt == ISD::SEXTLOAD && ExtOpcode != ISD::SIGN_EXTEND) ||
+ (LoadExt == ISD::ZEXTLOAD && ExtOpcode != ISD::ZERO_EXTEND))
+ return false;
+
+ return true;
+}
+
+/// Fold
+/// (sext (select c, load x, load y)) -> (select c, sextload x, sextload y)
+/// (zext (select c, load x, load y)) -> (select c, zextload x, zextload y)
+/// (aext (select c, load x, load y)) -> (select c, extload x, extload y)
+/// This function is called by the DAGCombiner when visiting sext/zext/aext
+/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
+static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
+ SelectionDAG &DAG,
+ CombineLevel Level) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
+ Opcode == ISD::ANY_EXTEND) &&
+ "Expected EXTEND dag node in input!");
+
+ if (!(N0->getOpcode() == ISD::SELECT || N0->getOpcode() == ISD::VSELECT) ||
+ !N0.hasOneUse())
+ return SDValue();
+
+ SDValue Op1 = N0->getOperand(1);
+ SDValue Op2 = N0->getOperand(2);
+ if (!isCompatibleLoad(Op1, Opcode) || !isCompatibleLoad(Op2, Opcode))
+ return SDValue();
+
+ auto ExtLoadOpcode = ISD::EXTLOAD;
+ if (Opcode == ISD::SIGN_EXTEND)
+ ExtLoadOpcode = ISD::SEXTLOAD;
+ else if (Opcode == ISD::ZERO_EXTEND)
+ ExtLoadOpcode = ISD::ZEXTLOAD;
+
+ // An illegal VSELECT may fail instruction selection if it is created after
+ // legalization (DAG Combine2), so conservatively check the OperationAction.
+ LoadSDNode *Load1 = cast<LoadSDNode>(Op1);
+ LoadSDNode *Load2 = cast<LoadSDNode>(Op2);
+ if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT()) ||
+ !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()) ||
+ (N0->getOpcode() == ISD::VSELECT && Level >= AfterLegalizeTypes &&
+ TLI.getOperationAction(ISD::VSELECT, VT) != TargetLowering::Legal))
+ return SDValue();
+
+ SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1);
+ SDValue Ext2 = DAG.getNode(Opcode, DL, VT, Op2);
+ return DAG.getSelect(DL, VT, N0->getOperand(0), Ext1, Ext2);
+}
+
+/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
+/// a build_vector of constants.
+/// This function is called by the DAGCombiner when visiting sext/zext/aext
+/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
+/// Vector extends are not folded if operations are legal; this is to
+/// avoid introducing illegal build_vector dag nodes.
+static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
+ SelectionDAG &DAG, bool LegalTypes) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ assert((ISD::isExtOpcode(Opcode) || ISD::isExtVecInRegOpcode(Opcode)) &&
+ "Expected EXTEND dag node in input!");
+
+ // fold (sext c1) -> c1
+ // fold (zext c1) -> c1
+ // fold (aext c1) -> c1
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(Opcode, DL, VT, N0);
+
+ // fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
+ // fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2)
+ // fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
+ if (N0->getOpcode() == ISD::SELECT) {
+ SDValue Op1 = N0->getOperand(1);
+ SDValue Op2 = N0->getOperand(2);
+ if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) &&
+ (Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) {
+ // For any_extend, choose sign extension of the constants to allow a
+ // possible further transform to sign_extend_inreg, i.e.:
+ //
+ // t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0>
+ // t2: i64 = any_extend t1
+ // -->
+ // t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0>
+ // -->
+ // t4: i64 = sign_extend_inreg t3
+ unsigned FoldOpc = Opcode;
+ if (FoldOpc == ISD::ANY_EXTEND)
+ FoldOpc = ISD::SIGN_EXTEND;
+ return DAG.getSelect(DL, VT, N0->getOperand(0),
+ DAG.getNode(FoldOpc, DL, VT, Op1),
+ DAG.getNode(FoldOpc, DL, VT, Op2));
+ }
+ }
+
+ // fold (sext (build_vector AllConstants)) -> (build_vector AllConstants)
+ // fold (zext (build_vector AllConstants)) -> (build_vector AllConstants)
+ // fold (aext (build_vector AllConstants)) -> (build_vector AllConstants)
+ EVT SVT = VT.getScalarType();
+ if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) &&
+ ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
+ return SDValue();
+
+ // We can fold this node into a build_vector.
+ unsigned VTBits = SVT.getSizeInBits();
+ unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits();
+ SmallVector<SDValue, 8> Elts;
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Op = N0.getOperand(i);
+ if (Op.isUndef()) {
+ if (Opcode == ISD::ANY_EXTEND || Opcode == ISD::ANY_EXTEND_VECTOR_INREG)
+ Elts.push_back(DAG.getUNDEF(SVT));
+ else
+ Elts.push_back(DAG.getConstant(0, DL, SVT));
+ continue;
+ }
+
+ SDLoc DL(Op);
+ // Get the constant value and, if needed, truncate it to the size of the type.
+ // Nodes like build_vector might have constants wider than the scalar type.
+ APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
+ if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
+ Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT));
+ else
+ Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
+ }
+
+ return DAG.getBuildVector(VT, DL, Elts);
+}
+
+// ExtendUsesToFormExtLoad - Try to extend the uses of a load to enable the
+// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
+// transformation. Returns true if the extensions are possible and the
+// above-mentioned transformation is profitable.
+static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0,
+ unsigned ExtOpc,
+ SmallVectorImpl<SDNode *> &ExtendNodes,
+ const TargetLowering &TLI) {
+ bool HasCopyToRegUses = false;
+ bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType());
+ for (SDNode::use_iterator UI = N0->use_begin(), UE = N0->use_end(); UI != UE;
+ ++UI) {
+ SDNode *User = *UI;
+ if (User == N)
+ continue;
+ if (UI.getUse().getResNo() != N0.getResNo())
+ continue;
+ // FIXME: Only extend SETCC N, N and SETCC N, c for now.
+ if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
+ if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
+ // Sign bits will be lost after a zext.
+ return false;
+ bool Add = false;
+ for (unsigned i = 0; i != 2; ++i) {
+ SDValue UseOp = User->getOperand(i);
+ if (UseOp == N0)
+ continue;
+ if (!isa<ConstantSDNode>(UseOp))
+ return false;
+ Add = true;
+ }
+ if (Add)
+ ExtendNodes.push_back(User);
+ continue;
+ }
+ // If truncates aren't free and there are users we can't
+ // extend, it isn't worthwhile.
+ if (!isTruncFree)
+ return false;
+ // Remember if this value is live-out.
+ if (User->getOpcode() == ISD::CopyToReg)
+ HasCopyToRegUses = true;
+ }
+
+ if (HasCopyToRegUses) {
+ bool BothLiveOut = false;
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI) {
+ SDUse &Use = UI.getUse();
+ if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
+ BothLiveOut = true;
+ break;
+ }
+ }
+ if (BothLiveOut)
+ // Both unextended and extended values are live out. There had better be
+ // a good reason for the transformation.
+ return ExtendNodes.size();
+ }
+ return true;
+}
+
+void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
+ SDValue OrigLoad, SDValue ExtLoad,
+ ISD::NodeType ExtType) {
+ // Extend SetCC uses if necessary.
+ SDLoc DL(ExtLoad);
+ for (SDNode *SetCC : SetCCs) {
+ SmallVector<SDValue, 4> Ops;
+
+ for (unsigned j = 0; j != 2; ++j) {
+ SDValue SOp = SetCC->getOperand(j);
+ if (SOp == OrigLoad)
+ Ops.push_back(ExtLoad);
+ else
+ Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
+ }
+
+ Ops.push_back(SetCC->getOperand(2));
+ CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
+ }
+}
+
+// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
+SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND) &&
+ "Unexpected node type (not an extend)!");
+
+ // fold (sext (load x)) to multiple smaller sextloads; same for zext.
+ // For example, on a target with legal v4i32, but illegal v8i32, turn:
+ // (v8i32 (sext (v8i16 (load x))))
+ // into:
+ // (v8i32 (concat_vectors (v4i32 (sextload x)),
+ // (v4i32 (sextload (x + 16)))))
+ // Where uses of the original load, i.e.:
+ // (v8i16 (load x))
+ // are replaced with:
+ // (v8i16 (truncate
+ // (v8i32 (concat_vectors (v4i32 (sextload x)),
+ // (v4i32 (sextload (x + 16)))))))
+ //
+ // This combine is only applicable to illegal, but splittable, vectors.
+ // All legal types, and illegal non-vector types, are handled elsewhere.
+ // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
+ //
+ if (N0->getOpcode() != ISD::LOAD)
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+ if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
+ !N0.hasOneUse() || !LN0->isSimple() ||
+ !DstVT.isVector() || !DstVT.isPow2VectorType() ||
+ !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+ return SDValue();
+
+ SmallVector<SDNode *, 4> SetCCs;
+ if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI))
+ return SDValue();
+
+ ISD::LoadExtType ExtType =
+ N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+
+ // Try to split the vector types to get down to legal types.
+ EVT SplitSrcVT = SrcVT;
+ EVT SplitDstVT = DstVT;
+ while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+ SplitSrcVT.getVectorNumElements() > 1) {
+ SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+ SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+ }
+
+ if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
+ return SDValue();
+
+ assert(!DstVT.isScalableVector() && "Unexpected scalable vector type");
+
+ SDLoc DL(N);
+ const unsigned NumSplits =
+ DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+ const unsigned Stride = SplitSrcVT.getStoreSize();
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> Chains;
+
+ SDValue BasePtr = LN0->getBasePtr();
+ for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+ const unsigned Offset = Idx * Stride;
+ const Align Align = commonAlignment(LN0->getAlign(), Offset);
+
+ SDValue SplitLoad = DAG.getExtLoad(
+ ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
+ LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
+
+ BasePtr = DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(Stride), DL);
+
+ Loads.push_back(SplitLoad.getValue(0));
+ Chains.push_back(SplitLoad.getValue(1));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+
+ // Simplify TF.
+ AddToWorklist(NewChain.getNode());
+
+ CombineTo(N, NewValue);
+
+ // Replace uses of the original load (before extension)
+ // with a truncate of the concatenated sextloaded vectors.
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
+ ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode());
+ CombineTo(N0.getNode(), Trunc, NewChain);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+// fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
+// (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
+SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
+ assert(N->getOpcode() == ISD::ZERO_EXTEND);
+ EVT VT = N->getValueType(0);
+ EVT OrigVT = N->getOperand(0).getValueType();
+ if (TLI.isZExtFree(OrigVT, VT))
+ return SDValue();
+
+ // and/or/xor
+ SDValue N0 = N->getOperand(0);
+ if (!ISD::isBitwiseLogicOp(N0.getOpcode()) ||
+ N0.getOperand(1).getOpcode() != ISD::Constant ||
+ (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
+ return SDValue();
+
+ // shl/shr
+ SDValue N1 = N0->getOperand(0);
+ if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
+ N1.getOperand(1).getOpcode() != ISD::Constant ||
+ (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
+ return SDValue();
+
+ // load
+ if (!isa<LoadSDNode>(N1.getOperand(0)))
+ return SDValue();
+ LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
+ EVT MemVT = Load->getMemoryVT();
+ if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
+ Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
+ return SDValue();
+
+ // If the shift op is SHL, the logic op must be AND, otherwise the result
+ // will be wrong.
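+ // (Rationale: a wide SHL keeps bits that the original narrow SHL would have
+ // shifted out; only an AND with the zero-extended constant clears those
+ // extra high bits again, so OR/XOR would change the result.)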
+ if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
+ ISD::ZERO_EXTEND, SetCCs, TLI))
+ return SDValue();
+
+ // Actually do the transformation.
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
+ Load->getChain(), Load->getBasePtr(),
+ Load->getMemoryVT(), Load->getMemOperand());
+
+ SDLoc DL1(N1);
+ SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
+ N1.getOperand(1));
+
+ APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
+ SDLoc DL0(N0);
+ SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
+ DAG.getConstant(Mask, DL0, VT));
+
+ ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
+ CombineTo(N, And);
+ if (SDValue(Load, 0).hasOneUse()) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
+ } else {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
+ Load->getValueType(0), ExtLoad);
+ CombineTo(Load, Trunc, ExtLoad.getValue(1));
+ }
+
+ // N0 is dead at this point.
+ recursivelyDeleteUnusedNodes(N0.getNode());
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+/// If we're narrowing or widening the result of a vector select and the final
+/// size is the same size as a setcc (compare) feeding the select, then try to
+/// apply the cast operation to the select's operands because matching vector
+/// sizes for a select condition and other operands should be more efficient.
+SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
+ unsigned CastOpcode = Cast->getOpcode();
+ assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
+ CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
+ CastOpcode == ISD::FP_ROUND) &&
+ "Unexpected opcode for vector select narrowing/widening");
+
+ // We only do this transform before legal ops because the pattern may be
+ // obfuscated by target-specific operations after legalization. Do not create
+ // an illegal select op, however, because that may be difficult to lower.
+ EVT VT = Cast->getValueType(0);
+ if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+
+ SDValue VSel = Cast->getOperand(0);
+ if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
+ VSel.getOperand(0).getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ // Does the setcc have the same vector size as the casted select?
+ SDValue SetCC = VSel.getOperand(0);
+ EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
+ if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+
+ // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
+ SDValue A = VSel.getOperand(1);
+ SDValue B = VSel.getOperand(2);
+ SDValue CastA, CastB;
+ SDLoc DL(Cast);
+ if (CastOpcode == ISD::FP_ROUND) {
+ // FP_ROUND (fptrunc) has an extra flag operand to pass along.
+ CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
+ CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
+ } else {
+ CastA = DAG.getNode(CastOpcode, DL, VT, A);
+ CastB = DAG.getNode(CastOpcode, DL, VT, B);
+ }
+ return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
+}
+
+// fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
+// fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
+static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
+ const TargetLowering &TLI, EVT VT,
+ bool LegalOperations, SDNode *N,
+ SDValue N0, ISD::LoadExtType ExtLoadType) {
+ SDNode *N0Node = N0.getNode();
+ bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node)
+ : ISD::isZEXTLoad(N0Node);
+ if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) ||
+ !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse())
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ EVT MemVT = LN0->getMemoryVT();
+ if ((LegalOperations || !LN0->isSimple() ||
+ VT.isVector()) &&
+ !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
+ return SDValue();
+
+ SDValue ExtLoad =
+ DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
+ LN0->getBasePtr(), MemVT, LN0->getMemOperand());
+ Combiner.CombineTo(N, ExtLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ if (LN0->use_empty())
+ Combiner.recursivelyDeleteUnusedNodes(LN0);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+// fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x)))
+// Only generate vector extloads when 1) they're legal, and 2) they are
+// deemed desirable by the target.
+static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
+ const TargetLowering &TLI, EVT VT,
+ bool LegalOperations, SDNode *N, SDValue N0,
+ ISD::LoadExtType ExtLoadType,
+ ISD::NodeType ExtOpc) {
+ // TODO: The isFixedLengthVector() check should be removed, with any negative
+ // effects on code generation being addressed through the target's
+ // implementation of isVectorLoadExtDesirable().
+ if (!ISD::isNON_EXTLoad(N0.getNode()) ||
+ !ISD::isUNINDEXEDLoad(N0.getNode()) ||
+ ((LegalOperations || VT.isFixedLengthVector() ||
+ !cast<LoadSDNode>(N0)->isSimple()) &&
+ !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
+ return {};
+
+ bool DoXform = true;
+ SmallVector<SDNode *, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
+ if (VT.isVector())
+ DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
+ if (!DoXform)
+ return {};
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
+ Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc);
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
+ Combiner.CombineTo(N, ExtLoad);
+ if (NoReplaceTrunc) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ Combiner.recursivelyDeleteUnusedNodes(LN0);
+ } else {
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
+ Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
+ const TargetLowering &TLI, EVT VT,
+ SDNode *N, SDValue N0,
+ ISD::LoadExtType ExtLoadType,
+ ISD::NodeType ExtOpc) {
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
+ if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+
+ if (!TLI.isLoadExtLegalOrCustom(ExtLoadType, VT, Ld->getValueType(0)))
+ return SDValue();
+
+ if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+ return SDValue();
+
+ SDLoc dl(Ld);
+ SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(),
+ PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(),
+ ExtLoadType, Ld->isExpandingLoad());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
+ return NewLoad;
+}
+
+static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
+ bool LegalOperations) {
+ assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");
+
+ SDValue SetCC = N->getOperand(0);
+ if (LegalOperations || SetCC.getOpcode() != ISD::SETCC ||
+ !SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1)
+ return SDValue();
+
+ SDValue X = SetCC.getOperand(0);
+ SDValue Ones = SetCC.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+ EVT VT = N->getValueType(0);
+ EVT XVT = X.getValueType();
+ // setge X, C is canonicalized to setgt, so we do not need to match that
+ // pattern. The setlt sibling is folded in SimplifySelectCC() because it does
+ // not require the 'not' op.
+ if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) {
+ // Invert and smear/shift the sign bit:
+ // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
+ // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
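+ // Worked example (illustrative, i8): X = 5 gives setgt(5, -1) = true, and
+ // sra(not 5, 7) = sra(0xFA, 7) = -1 = sext(true); X = -3 gives false, and
+ // sra(not -3, 7) = sra(2, 7) = 0.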
+ SDLoc DL(N);
+ unsigned ShCt = VT.getSizeInBits() - 1;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
+ SDValue NotX = DAG.getNOT(DL, X, VT);
+ SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT);
+ auto ShiftOpcode =
+ N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL;
+ return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount);
+ }
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::foldSextSetcc(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ EVT VT = N->getValueType(0);
+ EVT N00VT = N00.getValueType();
+ SDLoc DL(N);
+
+ // Propagate fast-math-flags.
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
+
+ // On some architectures (such as SSE/NEON/etc) the SETCC result type is
+ // the same size as the compared operands. Try to optimize sext(setcc())
+ // if this is the case.
+ if (VT.isVector() && !LegalOperations &&
+ TLI.getBooleanContents(N00VT) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent) {
+ EVT SVT = getSetCCResultType(N00VT);
+
+ // If we already have the desired type, don't change it.
+ if (SVT != N0.getValueType()) {
+ // We know that the # elements of the results is the same as the
+ // # elements of the compare (and the # elements of the compare result
+ // for that matter). Check to see that they are the same size. If so,
+ // we know that the element size of the sext'd result matches the
+ // element size of the compare operands.
+ if (VT.getSizeInBits() == SVT.getSizeInBits())
+ return DAG.getSetCC(DL, VT, N00, N01, CC);
+
+ // If the desired elements are smaller or larger than the source
+ // elements, we can use a matching integer vector type and then
+ // truncate/sign extend.
+ EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
+ if (SVT == MatchingVecType) {
+ SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
+ return DAG.getSExtOrTrunc(VsetCC, DL, VT);
+ }
+ }
+
+ // Try to eliminate the sext of a setcc by zexting the compare operands.
+ if (N0.hasOneUse() && TLI.isOperationLegalOrCustom(ISD::SETCC, VT) &&
+ !TLI.isOperationLegalOrCustom(ISD::SETCC, SVT)) {
+ bool IsSignedCmp = ISD::isSignedIntSetCC(CC);
+ unsigned LoadOpcode = IsSignedCmp ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ unsigned ExtOpcode = IsSignedCmp ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+ // We have an unsupported narrow vector compare op that would be legal
+ // if extended to the destination type. See if the compare operands
+ // can be freely extended to the destination type.
+ auto IsFreeToExtend = [&](SDValue V) {
+ if (isConstantOrConstantVector(V, /*NoOpaques*/ true))
+ return true;
+ // Match a simple, non-extended load that can be converted to a
+ // legal {z/s}ext-load.
+ // TODO: Allow widening of an existing {z/s}ext-load?
+ if (!(ISD::isNON_EXTLoad(V.getNode()) &&
+ ISD::isUNINDEXEDLoad(V.getNode()) &&
+ cast<LoadSDNode>(V)->isSimple() &&
+ TLI.isLoadExtLegal(LoadOpcode, VT, V.getValueType())))
+ return false;
+
+ // Non-chain users of this value must either be the setcc in this
+ // sequence or extends that can be folded into the new {z/s}ext-load.
+ for (SDNode::use_iterator UI = V->use_begin(), UE = V->use_end();
+ UI != UE; ++UI) {
+ // Skip uses of the chain and the setcc.
+ SDNode *User = *UI;
+ if (UI.getUse().getResNo() != 0 || User == N0.getNode())
+ continue;
+ // Extra users must have exactly the same cast we are about to create.
+ // TODO: This restriction could be eased if ExtendUsesToFormExtLoad()
+ // is enhanced similarly.
+ if (User->getOpcode() != ExtOpcode || User->getValueType(0) != VT)
+ return false;
+ }
+ return true;
+ };
+
+ if (IsFreeToExtend(N00) && IsFreeToExtend(N01)) {
+ SDValue Ext0 = DAG.getNode(ExtOpcode, DL, VT, N00);
+ SDValue Ext1 = DAG.getNode(ExtOpcode, DL, VT, N01);
+ return DAG.getSetCC(DL, VT, Ext0, Ext1, CC);
+ }
+ }
+ }
+
+ // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
+ // Here, T can be 1 or -1, depending on the type of the setcc and
+ // getBooleanContents().
+ unsigned SetCCWidth = N0.getScalarValueSizeInBits();
+
+ // To determine the "true" side of the select, we need to know the high bit
+ // of the value returned by the setcc if it evaluates to true.
+ // If the type of the setcc is i1, then the true case of the select is just
+ // sext(i1 1), that is, -1.
+ // If the type of the setcc is larger (say, i8) then the value of the high
+ // bit depends on getBooleanContents(), so ask TLI for a real "true" value
+ // of the appropriate width.
+ SDValue ExtTrueVal = (SetCCWidth == 1)
+ ? DAG.getAllOnesConstant(DL, VT)
+ : DAG.getBoolConstant(true, DL, VT, N00VT);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ if (SDValue SCC = SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
+ return SCC;
+
+ if (!VT.isVector() && !shouldConvertSelectOfConstantsToMath(N0, VT, TLI)) {
+ EVT SetCCVT = getSetCCResultType(N00VT);
+ // Don't do this transform for i1 because there's a select transform
+ // that would reverse it.
+ // TODO: We should not do this transform at all without a target hook
+ // because a sext is likely cheaper than a select?
+ if (SetCCVT.getScalarSizeInBits() != 1 &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
+ SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
+ return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVCastOp(N, DL))
+ return FoldedVOp;
+
+ // sext(undef) = 0 because the top bit will all be the same.
+ if (N0.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+ return Res;
+
+ // fold (sext (sext x)) -> (sext x)
+ // fold (sext (aext x)) -> (sext x)
+ if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
+
+ // fold (sext (aext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x)
+ // fold (sext (sext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x)
+ if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+ N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
+ return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT,
+ N0.getOperand(0));
+
+ // fold (sext (sext_inreg x)) -> (sext (trunc x))
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ SDValue N00 = N0.getOperand(0);
+ EVT ExtVT = cast<VTSDNode>(N0->getOperand(1))->getVT();
+ if (N00.getOpcode() == ISD::TRUNCATE &&
+ (!LegalTypes || TLI.isTypeLegal(ExtVT))) {
+ SDValue T = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, N00.getOperand(0));
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, T);
+ }
+ }
+
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ // fold (sext (truncate (load x))) -> (sext (smaller load x))
+ // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
+ if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
+ SDNode *oye = N0.getOperand(0).getNode();
+ if (NarrowLoad.getNode() != N0.getNode()) {
+ CombineTo(N0.getNode(), NarrowLoad);
+ // CombineTo deleted the truncate, if needed, but not what's under it.
+ AddToWorklist(oye);
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ // See if the value being truncated is already sign extended. If so, just
+ // eliminate the trunc/sext pair.
+ SDValue Op = N0.getOperand(0);
+ unsigned OpBits = Op.getScalarValueSizeInBits();
+ unsigned MidBits = N0.getScalarValueSizeInBits();
+ unsigned DestBits = VT.getScalarSizeInBits();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
+
+ if (OpBits == DestBits) {
+ // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
+ // bits, the trunc/sext pair is a no-op and Op can be used directly.
+ if (NumSignBits > DestBits-MidBits)
+ return Op;
+ } else if (OpBits < DestBits) {
+ // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
+ // bits, just sext from i32.
+ if (NumSignBits > OpBits-MidBits)
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
+ } else {
+ // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
+ // bits, just truncate to i32.
+ if (NumSignBits > OpBits-MidBits)
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
+ }
+
+ // fold (sext (truncate x)) -> (sextinreg x).
+ if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
+ N0.getValueType())) {
+ if (OpBits < DestBits)
+ Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
+ else if (OpBits > DestBits)
+ Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
+ DAG.getValueType(N0.getValueType()));
+ }
+ }
+
+ // Try to simplify (sext (load x)).
+ if (SDValue foldedExt =
+ tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
+ ISD::SEXTLOAD, ISD::SIGN_EXTEND))
+ return foldedExt;
+
+ if (SDValue foldedExt =
+ tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
+ ISD::SIGN_EXTEND))
+ return foldedExt;
+
+ // fold (sext (load x)) to multiple smaller sextloads.
+ // Only on illegal but splittable vectors.
+ if (SDValue ExtLoad = CombineExtLoad(N))
+ return ExtLoad;
+
+ // Try to simplify (sext (sextload x)).
+ if (SDValue foldedExt = tryToFoldExtOfExtload(
+ DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
+ return foldedExt;
+
+ // fold (sext (and/or/xor (load x), cst)) ->
+ // (and/or/xor (sextload x), (sext cst))
+ if (ISD::isBitwiseLogicOp(N0.getOpcode()) &&
+ isa<LoadSDNode>(N0.getOperand(0)) &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
+ LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
+ EVT MemVT = LN00->getMemoryVT();
+ if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
+ LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
+ SmallVector<SDNode*, 4> SetCCs;
+ bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
+ ISD::SIGN_EXTEND, SetCCs, TLI);
+ if (DoXform) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
+ LN00->getChain(), LN00->getBasePtr(),
+ LN00->getMemoryVT(),
+ LN00->getMemOperand());
+ APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits());
+ SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
+ ExtLoad, DAG.getConstant(Mask, DL, VT));
+ ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
+ bool NoReplaceTruncAnd = !N0.hasOneUse();
+ bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
+ CombineTo(N, And);
+ // If N0 has multiple uses, change other uses as well.
+ if (NoReplaceTruncAnd) {
+ SDValue TruncAnd =
+ DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
+ CombineTo(N0.getNode(), TruncAnd);
+ }
+ if (NoReplaceTrunc) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
+ } else {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
+ LN00->getValueType(0), ExtLoad);
+ CombineTo(LN00, Trunc, ExtLoad.getValue(1));
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
+ return V;
+
+ if (SDValue V = foldSextSetcc(N))
+ return V;
+
+ // fold (sext x) -> (zext x) if the sign bit is known zero.
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
+ DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
+
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
+ // Eliminate this sign extend by doing a negation in the destination type:
+ // sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64)
+ if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
+ isNullOrNullSplat(N0.getOperand(0)) &&
+ N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND &&
+ TLI.isOperationLegalOrCustom(ISD::SUB, VT)) {
+ SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT);
+ return DAG.getNegative(Zext, DL, VT);
+ }
+ // Eliminate this sign extend by doing a decrement in the destination type:
+ // sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1)
+ if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
+ isAllOnesOrAllOnesSplat(N0.getOperand(1)) &&
+ N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+ TLI.isOperationLegalOrCustom(ISD::ADD, VT)) {
+ SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
+ return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
+ }
+
+ // fold sext (not i1 X) -> add (zext i1 X), -1
+ // TODO: This could be extended to handle bool vectors.
+ if (N0.getValueType() == MVT::i1 && isBitwiseNot(N0) && N0.hasOneUse() &&
+ (!LegalOperations || (TLI.isOperationLegal(ISD::ZERO_EXTEND, VT) &&
+ TLI.isOperationLegal(ISD::ADD, VT)))) {
+ // If we can eliminate the 'not', the sext form should be better
+ if (SDValue NewXor = visitXOR(N0.getNode())) {
+ // Returning N0 is a form of in-visit replacement that may have
+ // invalidated N0.
+ if (NewXor.getNode() == N0.getNode()) {
+ // Return SDValue here as the xor should have already been replaced in
+ // this sext.
+ return SDValue();
+ }
+
+ // Return a new sext with the new xor.
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewXor);
+ }
+
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
+ }
+
+ if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level))
+ return Res;
+
+ return SDValue();
+}
+
+/// Given an extending node with a pop-count operand, if the target does not
+/// support a pop-count in the narrow source type but does support it in the
+/// destination type, widen the pop-count to the destination type.
+static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
+ assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
+ Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");
+
+ SDValue CtPop = Extend->getOperand(0);
+ if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
+ return SDValue();
+
+ EVT VT = Extend->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) ||
+ !TLI.isOperationLegalOrCustom(ISD::CTPOP, VT))
+ return SDValue();
+
+ // zext (ctpop X) --> ctpop (zext X)
+ SDLoc DL(Extend);
+ SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
+ return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
+}
+
+// If we have (zext (abs X)) where X is a type that will be promoted by type
+// legalization, convert to (abs (sext X)). But don't extend past a legal type.
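+// Illustrative sketch (assuming i8 is promoted to i32 on the target):
+//   (zext i32 (abs i8 X)) --> (abs i32 (sext i8 X to i32))
+// with a final zext/trunc if the destination type differs from i32.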
+static SDValue widenAbs(SDNode *Extend, SelectionDAG &DAG) {
+ assert(Extend->getOpcode() == ISD::ZERO_EXTEND && "Expected zero extend.");
+
+ EVT VT = Extend->getValueType(0);
+ if (VT.isVector())
+ return SDValue();
+
+ SDValue Abs = Extend->getOperand(0);
+ if (Abs.getOpcode() != ISD::ABS || !Abs.hasOneUse())
+ return SDValue();
+
+ EVT AbsVT = Abs.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.getTypeAction(*DAG.getContext(), AbsVT) !=
+ TargetLowering::TypePromoteInteger)
+ return SDValue();
+
+ EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), AbsVT);
+
+ SDValue SExt =
+ DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Abs), LegalVT, Abs.getOperand(0));
+ SDValue NewAbs = DAG.getNode(ISD::ABS, SDLoc(Abs), LegalVT, SExt);
+ return DAG.getZExtOrTrunc(NewAbs, SDLoc(Extend), VT);
+}
+
+SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVCastOp(N, DL))
+ return FoldedVOp;
+
+ // zext(undef) = 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+ return Res;
+
+ // fold (zext (zext x)) -> (zext x)
+ // fold (zext (aext x)) -> (zext x)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+
+ // fold (zext (aext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x)
+ // fold (zext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x)
+ if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+ N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG)
+ return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT,
+ N0.getOperand(0));
+
+ // fold (zext (truncate x)) -> (zext x) or
+ // (zext (truncate x)) -> (truncate x)
+ // This is valid when the truncated bits of x are already zero.
+ SDValue Op;
+ KnownBits Known;
+ if (isTruncateOf(DAG, N0, Op, Known)) {
+ APInt TruncatedBits =
+ (Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
+ APInt(Op.getScalarValueSizeInBits(), 0) :
+ APInt::getBitsSet(Op.getScalarValueSizeInBits(),
+ N0.getScalarValueSizeInBits(),
+ std::min(Op.getScalarValueSizeInBits(),
+ VT.getScalarSizeInBits()));
+ if (TruncatedBits.isSubsetOf(Known.Zero))
+ return DAG.getZExtOrTrunc(Op, DL, VT);
+ }
+
+ // fold (zext (truncate x)) -> (and x, mask)
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ // fold (zext (truncate (load x))) -> (zext (smaller load x))
+ // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
+ if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
+ SDNode *oye = N0.getOperand(0).getNode();
+ if (NarrowLoad.getNode() != N0.getNode()) {
+ CombineTo(N0.getNode(), NarrowLoad);
+ // CombineTo deleted the truncate, if needed, but not what's under it.
+ AddToWorklist(oye);
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ EVT SrcVT = N0.getOperand(0).getValueType();
+ EVT MinVT = N0.getValueType();
+
+ // Try to mask before the extension to avoid having to generate a larger mask,
+ // possibly over several sub-vectors.
+ if (SrcVT.bitsLT(VT) && VT.isVector()) {
+ if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
+ TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
+ SDValue Op = N0.getOperand(0);
+ Op = DAG.getZeroExtendInReg(Op, DL, MinVT);
+ AddToWorklist(Op.getNode());
+ SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, DL, VT);
+ // Transfer the debug info; the new node is equivalent to N0.
+ DAG.transferDbgValues(N0, ZExtOrTrunc);
+ return ZExtOrTrunc;
+ }
+ }
+
+ if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
+ SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), DL, VT);
+ AddToWorklist(Op.getNode());
+ SDValue And = DAG.getZeroExtendInReg(Op, DL, MinVT);
+ // We may safely transfer the debug info describing the truncate node over
+ // to the equivalent and operation.
+ DAG.transferDbgValues(N0, And);
+ return And;
+ }
+ }
+
+ // Fold (zext (and (trunc x), cst)) -> (and x, cst),
+ // if either of the casts is not free.
+ if (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+ N0.getValueType()) ||
+ !TLI.isZExtFree(N0.getValueType(), VT))) {
+ SDValue X = N0.getOperand(0).getOperand(0);
+ X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
+ APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
+ return DAG.getNode(ISD::AND, DL, VT,
+ X, DAG.getConstant(Mask, DL, VT));
+ }
+
+ // Try to simplify (zext (load x)).
+ if (SDValue foldedExt =
+ tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
+ ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
+ return foldedExt;
+
+ if (SDValue foldedExt =
+ tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
+ ISD::ZERO_EXTEND))
+ return foldedExt;
+
+ // fold (zext (load x)) to multiple smaller zextloads.
+ // Only on illegal but splittable vectors.
+ if (SDValue ExtLoad = CombineExtLoad(N))
+ return ExtLoad;
+
+ // fold (zext (and/or/xor (load x), cst)) ->
+ // (and/or/xor (zextload x), (zext cst))
+ // Unless (and (load x) cst) will match as a zextload already and has
+ // additional users.
+ if (ISD::isBitwiseLogicOp(N0.getOpcode()) &&
+ isa<LoadSDNode>(N0.getOperand(0)) &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
+ LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
+ EVT MemVT = LN00->getMemoryVT();
+ if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
+ LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse()) {
+ if (N0.getOpcode() == ISD::AND) {
+ auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
+ EVT LoadResultTy = AndC->getValueType(0);
+ EVT ExtVT;
+ if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
+ DoXform = false;
+ }
+ }
+ if (DoXform)
+ DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
+ ISD::ZERO_EXTEND, SetCCs, TLI);
+ if (DoXform) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
+ LN00->getChain(), LN00->getBasePtr(),
+ LN00->getMemoryVT(),
+ LN00->getMemOperand());
+ APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
+ SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
+ ExtLoad, DAG.getConstant(Mask, DL, VT));
+ ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
+ bool NoReplaceTruncAnd = !N0.hasOneUse();
+ bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
+ CombineTo(N, And);
+ // If N0 has multiple uses, change other uses as well.
+ if (NoReplaceTruncAnd) {
+ SDValue TruncAnd =
+ DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
+ CombineTo(N0.getNode(), TruncAnd);
+ }
+ if (NoReplaceTrunc) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
+ } else {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
+ LN00->getValueType(0), ExtLoad);
+ CombineTo(LN00, Trunc, ExtLoad.getValue(1));
+ }
+ return SDValue(N,0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
+ // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
+ if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
+ return ZExtLoad;
+
+ // Try to simplify (zext (zextload x)).
+ if (SDValue foldedExt = tryToFoldExtOfExtload(
+ DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
+ return foldedExt;
+
+ if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
+ return V;
+
+ if (N0.getOpcode() == ISD::SETCC) {
+ // Propagate fast-math-flags.
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
+
+ // Only do this before legalize for now.
+ if (!LegalOperations && VT.isVector() &&
+ N0.getValueType().getVectorElementType() == MVT::i1) {
+ EVT N00VT = N0.getOperand(0).getValueType();
+ if (getSetCCResultType(N00VT) == N0.getValueType())
+ return SDValue();
+
+ // We know that the # elements of the result is the same as the #
+ // elements of the compare (and the # elements of the compare result for
+ // that matter). Check to see that they are the same size. If so, we know
+ // that the element size of the extended result matches the element size
+ // of the compare operands.
+ if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
+ // zext(setcc) -> zext_in_reg(vsetcc) for vectors.
+ SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
+ N0.getOperand(1), N0.getOperand(2));
+ return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType());
+ }
+
+ // If the desired elements are smaller or larger than the source
+ // elements we can use a matching integer vector type and then
+ // truncate/any extend followed by zext_in_reg.
+ EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
+ SDValue VsetCC =
+ DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
+ N0.getOperand(1), N0.getOperand(2));
+ return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL,
+ N0.getValueType());
+ }
+
+ // zext(setcc x,y,cc) -> zext(select x, y, true, false, cc)
+ EVT N0VT = N0.getValueType();
+ EVT N00VT = N0.getOperand(0).getValueType();
+ if (SDValue SCC = SimplifySelectCC(
+ DL, N0.getOperand(0), N0.getOperand(1),
+ DAG.getBoolConstant(true, DL, N0VT, N00VT),
+ DAG.getBoolConstant(false, DL, N0VT, N00VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, SCC);
+ }
+
+ // (zext (shl/srl (zext x), cst)) -> (shl/srl (zext x), cst)
+ if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
+ !TLI.isZExtFree(N0, VT)) {
+ SDValue ShVal = N0.getOperand(0);
+ SDValue ShAmt = N0.getOperand(1);
+ if (auto *ShAmtC = dyn_cast<ConstantSDNode>(ShAmt)) {
+ if (ShVal.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse()) {
+ if (N0.getOpcode() == ISD::SHL) {
+ // If the original shl may be shifting out bits, do not perform this
+ // transformation.
+ // TODO: Add MaskedValueIsZero check.
+ unsigned KnownZeroBits = ShVal.getValueSizeInBits() -
+ ShVal.getOperand(0).getValueSizeInBits();
+ if (ShAmtC->getAPIntValue().ugt(KnownZeroBits))
+ return SDValue();
+ }
+
+ // Ensure that the shift amount is wide enough for the shifted value.
+ if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits())
+ ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);
+
+ return DAG.getNode(N0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::ZERO_EXTEND, DL, VT, ShVal), ShAmt);
+ }
+ }
+ }
+
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
+ if (SDValue NewCtPop = widenCtPop(N, DAG))
+ return NewCtPop;
+
+ if (SDValue V = widenAbs(N, DAG))
+ return V;
+
+ if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level))
+ return Res;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // aext(undef) = undef
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
+ if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+ return Res;
+
+ // fold (aext (aext x)) -> (aext x)
+ // fold (aext (zext x)) -> (zext x)
+ // fold (aext (sext x)) -> (sext x)
+ if (N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND)
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+
+ // fold (aext (aext_extend_vector_inreg x)) -> (aext_extend_vector_inreg x)
+ // fold (aext (zext_extend_vector_inreg x)) -> (zext_extend_vector_inreg x)
+ // fold (aext (sext_extend_vector_inreg x)) -> (sext_extend_vector_inreg x)
+ if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+ N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
+ N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+
+ // fold (aext (truncate (load x))) -> (aext (smaller load x))
+ // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
+ SDNode *oye = N0.getOperand(0).getNode();
+ if (NarrowLoad.getNode() != N0.getNode()) {
+ CombineTo(N0.getNode(), NarrowLoad);
+ // CombineTo deleted the truncate, if needed, but not what's under it.
+ AddToWorklist(oye);
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (aext (truncate x))
+ if (N0.getOpcode() == ISD::TRUNCATE)
+ return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
+
+ // Fold (aext (and (trunc x), cst)) -> (and x, cst)
+ // if the trunc is not free.
+ if (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+ N0.getValueType())) {
+ SDLoc DL(N);
+ SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
+ SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1));
+ assert(isa<ConstantSDNode>(Y) && "Expected constant to be folded!");
+ return DAG.getNode(ISD::AND, DL, VT, X, Y);
+ }
+
+ // fold (aext (load x)) -> (aext (truncate (extload x)))
+ // None of the supported targets knows how to perform load and any_ext
+ // on vectors in one instruction, so attempt to fold to zext instead.
+ if (VT.isVector()) {
+ // Try to simplify (zext (load x)).
+ if (SDValue foldedExt =
+ tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
+ ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
+ return foldedExt;
+ } else if (ISD::isNON_EXTLoad(N0.getNode()) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+ bool DoXform = true;
+ SmallVector<SDNode *, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform =
+ ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
+ if (DoXform) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
+ LN0->getChain(), LN0->getBasePtr(),
+ N0.getValueType(), LN0->getMemOperand());
+ ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceTrunc = N0.hasOneUse();
+ CombineTo(N, ExtLoad);
+ if (NoReplaceTrunc) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ recursivelyDeleteUnusedNodes(LN0);
+ } else {
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
+ CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
+ // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
+ // fold (aext ( extload x)) -> (aext (truncate (extload x)))
+ if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ ISD::LoadExtType ExtType = LN0->getExtensionType();
+ EVT MemVT = LN0->getMemoryVT();
+ if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
+ SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
+ VT, LN0->getChain(), LN0->getBasePtr(),
+ MemVT, LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ recursivelyDeleteUnusedNodes(LN0);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ if (N0.getOpcode() == ISD::SETCC) {
+ // Propagate fast-math-flags.
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
+
+ // For vectors:
+ // aext(setcc) -> vsetcc
+ // aext(setcc) -> truncate(vsetcc)
+ // aext(setcc) -> aext(vsetcc)
+ // Only do this before legalize for now.
+ if (VT.isVector() && !LegalOperations) {
+ EVT N00VT = N0.getOperand(0).getValueType();
+ if (getSetCCResultType(N00VT) == N0.getValueType())
+ return SDValue();
+
+ // We know that the # elements of the result is the same as the
+ // # elements of the compare (and the # elements of the compare result
+ // for that matter). Check to see that they are the same size. If so,
+ // we know that the element size of the extended result matches the
+ // element size of the compare operands.
+ if (VT.getSizeInBits() == N00VT.getSizeInBits())
+ return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
+ N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
+ // If the desired elements are smaller or larger than the source
+ // elements we can use a matching integer vector type and then
+ // truncate/any extend
+ EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
+ SDValue VsetCC =
+ DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
+ N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
+ }
+
+ // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+ SDLoc DL(N);
+ if (SDValue SCC = SimplifySelectCC(
+ DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ return SCC;
+ }
+
+ if (SDValue NewCtPop = widenCtPop(N, DAG))
+ return NewCtPop;
+
+ if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG, Level))
+ return Res;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitAssertExt(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT AssertVT = cast<VTSDNode>(N1)->getVT();
+
+ // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
+ if (N0.getOpcode() == Opcode &&
+ AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
+ return N0;
+
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == Opcode) {
+ // We have an assert, truncate, assert sandwich. Make one stronger assert
+ // by asserting on the smallest asserted type to the larger source type.
+ // This eliminates the later assert:
+ // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
+ // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
+ SDLoc DL(N);
+ SDValue BigA = N0.getOperand(0);
+ EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+ EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
+ SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
+ SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
+ BigA.getOperand(0), MinAssertVTVal);
+ return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
+ }
+
+ // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
+ // than X, just move the AssertZext in front of the truncate and drop the
+ // AssertSext.
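+ // For example:
+ //   (AssertZext (trunc (AssertSext X, i32) to i16), i8)
+ //     --> (trunc (AssertZext X, i8) to i16)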
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == ISD::AssertSext &&
+ Opcode == ISD::AssertZext) {
+ SDValue BigA = N0.getOperand(0);
+ EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
+ if (AssertVT.bitsLT(BigA_AssertVT)) {
+ SDLoc DL(N);
+ SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
+ BigA.getOperand(0), N1);
+ return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
+ SDLoc DL(N);
+
+ Align AL = cast<AssertAlignSDNode>(N)->getAlign();
+ SDValue N0 = N->getOperand(0);
+
+ // Fold (assertalign (assertalign x, AL0), AL1) ->
+ // (assertalign x, max(AL0, AL1))
+ if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
+ return DAG.getAssertAlign(DL, N0.getOperand(0),
+ std::max(AL, AAN->getAlign()));
+
+ // In rare cases there are trivial arithmetic ops in the source operands. Sink
+ // this assert down to the source operands so that those arithmetic ops can be
+ // exposed to DAG combining.
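+ // For example, when AL == 16 and the other operand is a multiple of 16:
+ //   (assertalign (add X, 32), 16) --> (add (assertalign X, 16), 32)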
+ switch (N0.getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB: {
+ unsigned AlignShift = Log2(AL);
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+ unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
+ unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
+ if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
+ if (LHSAlignShift < AlignShift)
+ LHS = DAG.getAssertAlign(DL, LHS, AL);
+ if (RHSAlignShift < AlignShift)
+ RHS = DAG.getAssertAlign(DL, RHS, AL);
+ return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
+/// If the result of a load is shifted/masked/truncated to an effectively
+/// narrower type, try to transform the load to a narrower type and/or
+/// use an extending load.
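+ /// For example, on a little-endian target,
+ ///   (i32 (truncate (srl (i64 (load p)), 32)))
+ /// can become an i32 load from p+4, assuming the narrowed access is legal
+ /// and the original load is simple.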
+SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
+ unsigned Opc = N->getOpcode();
+
+ ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT ExtVT = VT;
+
+ // This transformation isn't valid for vector loads.
+ if (VT.isVector())
+ return SDValue();
+
+ // The ShAmt variable is used to indicate that we've consumed a right
+ // shift, i.e. we want to narrow the width of the load by not loading the
+ // ShAmt least significant bits.
+ unsigned ShAmt = 0;
+ // A special case is when the least significant bits from the load are masked
+ // away, but using an AND rather than a right shift. HasShiftedOffset is used
+ // to indicate that the narrowed load should be left-shifted ShAmt bits to get
+ // the result.
+ bool HasShiftedOffset = false;
+ // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
+ // extending to VT.
+ if (Opc == ISD::SIGN_EXTEND_INREG) {
+ ExtType = ISD::SEXTLOAD;
+ ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ } else if (Opc == ISD::SRL || Opc == ISD::SRA) {
+ // Another special-case: SRL/SRA is basically zero/sign-extending a narrower
+ // value, or it may be shifting a higher subword, half or byte into the
+ // lowest bits.
+
+ // Only handle shift with constant shift amount, and the shiftee must be a
+ // load.
+ auto *LN = dyn_cast<LoadSDNode>(N0);
+ auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!N1C || !LN)
+ return SDValue();
+ // If the shift amount is larger than the memory type then we're not
+ // accessing any of the loaded bytes.
+ ShAmt = N1C->getZExtValue();
+ uint64_t MemoryWidth = LN->getMemoryVT().getScalarSizeInBits();
+ if (MemoryWidth <= ShAmt)
+ return SDValue();
+ // Attempt to fold away the SRL by using ZEXTLOAD and SRA by using SEXTLOAD.
+ ExtType = Opc == ISD::SRL ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
+ // If original load is a SEXTLOAD then we can't simply replace it by a
+ // ZEXTLOAD (we could potentially replace it by a more narrow SEXTLOAD
+ // followed by a ZEXT, but that is not handled at the moment). Similarly if
+ // the original load is a ZEXTLOAD and we want to use a SEXTLOAD.
+ if ((LN->getExtensionType() == ISD::SEXTLOAD ||
+ LN->getExtensionType() == ISD::ZEXTLOAD) &&
+ LN->getExtensionType() != ExtType)
+ return SDValue();
+ } else if (Opc == ISD::AND) {
+ // An AND with a constant mask is the same as a truncate + zero-extend.
+ auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!AndC)
+ return SDValue();
+
+ const APInt &Mask = AndC->getAPIntValue();
+ unsigned ActiveBits = 0;
+ if (Mask.isMask()) {
+ ActiveBits = Mask.countr_one();
+ } else if (Mask.isShiftedMask(ShAmt, ActiveBits)) {
+ HasShiftedOffset = true;
+ } else {
+ return SDValue();
+ }
+
+ ExtType = ISD::ZEXTLOAD;
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+ }
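+ // For example, (and (load i32 p), 0xffff) can be narrowed to a zextload of
+ // i16, while (and (load i32 p), 0xff00) sets ShAmt = 8 and HasShiftedOffset,
+ // so the narrowed i8 zextload is shifted back into position afterwards.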
+
+ // In case Opc==SRL we've already prepared ExtVT/ExtType/ShAmt based on doing
+ // a right shift. Here we redo some of those checks, to possibly adjust the
+ // ExtVT even further based on "a masking AND". We could also end up here for
+ // other reasons (e.g. based on Opc==TRUNCATE) and that is why some checks
+ // need to be done here as well.
+ if (Opc == ISD::SRL || N0.getOpcode() == ISD::SRL) {
+ SDValue SRL = Opc == ISD::SRL ? SDValue(N, 0) : N0;
+ // Bail out when the SRL has more than one use. This is done for historical
+ // (undocumented) reasons. Maybe the intent was to guard the AND-masking
+ // check below? And maybe the transform is unprofitable when the SRL has
+ // multiple uses and we get here with Opc!=ISD::SRL?
+ // FIXME: Can't we just skip this check for the Opc==ISD::SRL case?
+ if (!SRL.hasOneUse())
+ return SDValue();
+
+ // Only handle shift with constant shift amount, and the shiftee must be a
+ // load.
+ auto *LN = dyn_cast<LoadSDNode>(SRL.getOperand(0));
+ auto *SRL1C = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+ if (!SRL1C || !LN)
+ return SDValue();
+
+ // If the shift amount is larger than the input type then we're not
+ // accessing any of the loaded bytes. If the load was a zextload/extload
+ // then the result of the shift+trunc is zero/undef (handled elsewhere).
+ ShAmt = SRL1C->getZExtValue();
+ uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
+ if (ShAmt >= MemoryWidth)
+ return SDValue();
+
+ // Because a SRL must be assumed to *need* to zero-extend the high bits
+ // (as opposed to anyext the high bits), we can't combine the zextload
+ // lowering of SRL and an sextload.
+ if (LN->getExtensionType() == ISD::SEXTLOAD)
+ return SDValue();
+
+ // Avoid reading outside the memory accessed by the original load (could
+ // happen if we only adjust the load base pointer by ShAmt). Instead we
+ // try to narrow the load even further. The typical scenario here is:
+ // (i64 (truncate (i96 (srl (load x), 64)))) ->
+ // (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
+ if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
+ // Don't replace sextload by zextload.
+ if (ExtType == ISD::SEXTLOAD)
+ return SDValue();
+ // Narrow the load.
+ ExtType = ISD::ZEXTLOAD;
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
+ }
+
+ // If the SRL is only used by a masking AND, we may be able to adjust
+ // the ExtVT to make the AND redundant.
+ SDNode *Mask = *(SRL->use_begin());
+ if (SRL.hasOneUse() && Mask->getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(Mask->getOperand(1))) {
+ const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
+ if (ShiftMask.isMask()) {
+ EVT MaskedVT =
+ EVT::getIntegerVT(*DAG.getContext(), ShiftMask.countr_one());
+ // If the mask is smaller, recompute the type.
+ if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
+ TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
+ ExtVT = MaskedVT;
+ }
+ }
+
+ N0 = SRL.getOperand(0);
+ }
+
+ // If the load is shifted left (and the result isn't shifted back right), we
+ // can fold a truncate through the shift. The typical scenario is that N
+ // points at a TRUNCATE here so the attempted fold is:
+ // (truncate (shl (load x), c))) -> (shl (narrow load x), c)
+ // ShLeftAmt will indicate how much a narrowed load should be shifted left.
+ unsigned ShLeftAmt = 0;
+ if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
+ ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+ if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ ShLeftAmt = N01->getZExtValue();
+ N0 = N0.getOperand(0);
+ }
+ }
+
+ // If we haven't found a load, we can't narrow it.
+ if (!isa<LoadSDNode>(N0))
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ // Reducing the width of a volatile load is illegal. For atomics, we may be
+ // able to reduce the width provided we never widen again. (see D66309)
+ if (!LN0->isSimple() ||
+ !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
+ return SDValue();
+
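+ // On big-endian targets the narrow value lives at the opposite end of the
+ // wide loaded value, so the byte offset must be computed from the most
+ // significant end rather than taken directly from ShAmt.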
+ auto AdjustBigEndianShift = [&](unsigned ShAmt) {
+ unsigned LVTStoreBits =
+ LN0->getMemoryVT().getStoreSizeInBits().getFixedValue();
+ unsigned EVTStoreBits = ExtVT.getStoreSizeInBits().getFixedValue();
+ return LVTStoreBits - EVTStoreBits - ShAmt;
+ };
+
+ // We need to adjust the pointer to the load by ShAmt bits in order to load
+ // the correct bytes.
+ unsigned PtrAdjustmentInBits =
+ DAG.getDataLayout().isBigEndian() ? AdjustBigEndianShift(ShAmt) : ShAmt;
+
+ uint64_t PtrOff = PtrAdjustmentInBits / 8;
+ Align NewAlign = commonAlignment(LN0->getAlign(), PtrOff);
+ SDLoc DL(LN0);
+ // The original load itself didn't wrap, so an offset within it doesn't.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+ SDValue NewPtr = DAG.getMemBasePlusOffset(LN0->getBasePtr(),
+ TypeSize::Fixed(PtrOff), DL, Flags);
+ AddToWorklist(NewPtr.getNode());
+
+ SDValue Load;
+ if (ExtType == ISD::NON_EXTLOAD)
+ Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr,
+ LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
+ else
+ Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr,
+ LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
+ NewAlign, LN0->getMemOperand()->getFlags(),
+ LN0->getAAInfo());
+
+ // Replace the old load's chain with the new load's chain.
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+
+ // Shift the result left, if we've swallowed a left shift.
+ SDValue Result = Load;
+ if (ShLeftAmt != 0) {
+ EVT ShImmTy = getShiftAmountTy(Result.getValueType());
+ if (!isUIntN(ShImmTy.getScalarSizeInBits(), ShLeftAmt))
+ ShImmTy = VT;
+ // If the shift amount is as large as the result size (but, presumably,
+ // no larger than the source) then the useful bits of the result are
+ // zero; we can't simply return the shortened shift, because the result
+ // of that operation is undefined.
+ if (ShLeftAmt >= VT.getScalarSizeInBits())
+ Result = DAG.getConstant(0, DL, VT);
+ else
+ Result = DAG.getNode(ISD::SHL, DL, VT,
+ Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
+ }
+
+ if (HasShiftedOffset) {
+ // We're using a shifted mask, so the load now has an offset. This means
+ // the data has been loaded into lower bytes of the register than it would
+ // have been otherwise, so we need to shl the loaded data back into the
+ // correct position.
+ SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
+ Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+ }
+
+ // Return the new loaded value.
+ return Result;
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT ExtVT = cast<VTSDNode>(N1)->getVT();
+ unsigned VTBits = VT.getScalarSizeInBits();
+ unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
+
+ // sext_in_reg(undef) = 0 because the top bits will all be the same.
+ if (N0.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // fold (sext_in_reg c1) -> c1
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
+
+ // If the input is already sign extended, just drop the extension.
+ if (ExtVTBits >= DAG.ComputeMaxSignificantBits(N0))
+ return N0;
+
+ // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0),
+ N1);
+
+ // fold (sext_in_reg (sext x)) -> (sext x)
+ // fold (sext_in_reg (aext x)) -> (sext x)
+ // if x is small enough or if we know that x has more than 1 sign bit and the
+ // sign_extend_inreg is extending from one of them.
+ if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ unsigned N00Bits = N00.getScalarValueSizeInBits();
+ if ((N00Bits <= ExtVTBits ||
+ DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
+ return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
+ }
+
+ // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
+ // if x is small enough or if we know that x has more than 1 sign bit and the
+ // sign_extend_inreg is extending from one of them.
+ if (ISD::isExtVecInRegOpcode(N0.getOpcode())) {
+ SDValue N00 = N0.getOperand(0);
+ unsigned N00Bits = N00.getScalarValueSizeInBits();
+ unsigned DstElts = N0.getValueType().getVectorMinNumElements();
+ unsigned SrcElts = N00.getValueType().getVectorMinNumElements();
+ bool IsZext = N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG;
+ APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts);
+ if ((N00Bits == ExtVTBits ||
+ (!IsZext && (N00Bits < ExtVTBits ||
+ DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits))) &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)))
+ return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00);
+ }
+
+ // fold (sext_in_reg (zext x)) -> (sext x)
+ // iff we are extending the source sign bit.
+ if (N0.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getScalarValueSizeInBits() == ExtVTBits &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
+ return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
+ }
+
+ // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
+ if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1)))
+ return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT);
+
+ // fold operands of sext_in_reg based on knowledge that the top bits are not
+ // demanded.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (sext_in_reg (load x)) -> (smaller sextload x)
+ // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
+ if (SDValue NarrowLoad = reduceLoadWidth(N))
+ return NarrowLoad;
+
+ // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
+ // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
+ // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
+ if (N0.getOpcode() == ISD::SRL) {
+ if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+ if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) {
+ // We can turn this into an SRA iff the input to the SRL is already sign
+ // extended enough.
+ unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
+ if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits)
+ return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
+ N0.getOperand(1));
+ }
+ }
+
+ // fold (sext_inreg (extload x)) -> (sextload x)
+ // If sextload is not supported by target, we can only do the combine when
+ // load has one use. Doing otherwise can block folding the extload with other
+ // extends that the target does support.
+ if (ISD::isEXTLoad(N0.getNode()) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+ ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
+ N0.hasOneUse()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), ExtVT,
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ AddToWorklist(ExtLoad.getNode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
+ if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ N0.hasOneUse() &&
+ ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+ ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), ExtVT,
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ // fold (sext_inreg (masked_load x)) -> (sext_masked_load x)
+ // ignore it if the masked load is already sign extended
+ if (MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0)) {
+ if (ExtVT == Ld->getMemoryVT() && N0.hasOneUse() &&
+ Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) {
+ SDValue ExtMaskedLoad = DAG.getMaskedLoad(
+ VT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(),
+ Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(),
+ Ld->getAddressingMode(), ISD::SEXTLOAD, Ld->isExpandingLoad());
+ CombineTo(N, ExtMaskedLoad);
+ CombineTo(N0.getNode(), ExtMaskedLoad, ExtMaskedLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
+ if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+ if (SDValue(GN0, 0).hasOneUse() &&
+ ExtVT == GN0->getMemoryVT() &&
+ TLI.isVectorLoadExtDesirable(SDValue(GN0, 0))) {
+ SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
+ GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
+
+ SDValue ExtLoad = DAG.getMaskedGather(
+ DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops,
+ GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD);
+
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ AddToWorklist(ExtLoad.getNode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
+ if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
+ if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+ N0.getOperand(1), false))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1);
+ }
+
+ // Fold (iM_signext_inreg
+ // (extract_subvector (zext|anyext|sext iN_v to _) _)
+ // from iN)
+ // -> (extract_subvector (signext iN_v to iM))
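+ // For example:
+ //   (sext_in_reg (extract_subvector (zext X:v8i8 to v8i32), 0) from i8)
+ //     --> (extract_subvector (sext X:v8i8 to v8i32), 0)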
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.hasOneUse() &&
+ ISD::isExtOpcode(N0.getOperand(0).getOpcode())) {
+ SDValue InnerExt = N0.getOperand(0);
+ EVT InnerExtVT = InnerExt->getValueType(0);
+ SDValue Extendee = InnerExt->getOperand(0);
+
+ if (ExtVTBits == Extendee.getValueType().getScalarSizeInBits() &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(ISD::SIGN_EXTEND, InnerExtVT))) {
+ SDValue SignExtExtendee =
+ DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), InnerExtVT, Extendee);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, SignExtExtendee,
+ N0.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue
+foldExtendVectorInregToExtendOfSubvector(SDNode *N, const TargetLowering &TLI,
+ SelectionDAG &DAG,
+ bool LegalOperations) {
+ unsigned InregOpcode = N->getOpcode();
+ unsigned Opcode = DAG.getOpcode_EXTEND(InregOpcode);
+
+ SDValue Src = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = EVT::getVectorVT(*DAG.getContext(),
+ Src.getValueType().getVectorElementType(),
+ VT.getVectorElementCount());
+
+ assert(ISD::isExtVecInRegOpcode(InregOpcode) &&
+ "Expected EXTEND_VECTOR_INREG dag node in input!");
+
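+ // The overall fold is, for example:
+ //   (v4i32 zero_extend_vector_inreg (v8i16 concat_vectors X:v4i16, Y))
+ //     --> (v4i32 zero_extend X:v4i16)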
+ // Profitability check: our operand must be a one-use CONCAT_VECTORS.
+ // FIXME: one-use check may be overly restrictive
+ if (!Src.hasOneUse() || Src.getOpcode() != ISD::CONCAT_VECTORS)
+ return SDValue();
+
+ // Profitability check: we must be extending exactly one of its operands.
+ // FIXME: this is probably overly restrictive.
+ Src = Src.getOperand(0);
+ if (Src.getValueType() != SrcVT)
+ return SDValue();
+
+ if (LegalOperations && !TLI.isOperationLegal(Opcode, VT))
+ return SDValue();
+
+ return DAG.getNode(Opcode, SDLoc(N), VT, Src);
+}
+
+SDValue DAGCombiner::visitEXTEND_VECTOR_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.isUndef()) {
+ // aext_vector_inreg(undef) = undef because the top bits are undefined.
+ // {s/z}ext_vector_inreg(undef) = 0 because the top bits must be the same.
+ return N->getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG
+ ? DAG.getUNDEF(VT)
+ : DAG.getConstant(0, SDLoc(N), VT);
+ }
+
+ if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
+ return Res;
+
+ if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ if (SDValue R = foldExtendVectorInregToExtendOfSubvector(N, TLI, DAG,
+ LegalOperations))
+ return R;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+ bool isLE = DAG.getDataLayout().isLittleEndian();
+
+ // noop truncate
+ if (SrcVT == VT)
+ return N0;
+
+ // fold (truncate (truncate x)) -> (truncate x)
+ if (N0.getOpcode() == ISD::TRUNCATE)
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+
+ // fold (truncate c1) -> c1
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::TRUNCATE, SDLoc(N), VT, {N0}))
+ return C;
+
+ // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
+ if (N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND ||
+ N0.getOpcode() == ISD::ANY_EXTEND) {
+ // if the source is smaller than the dest, we still need an extend.
+ if (N0.getOperand(0).getValueType().bitsLT(VT))
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+ // if the source is larger than the dest, then we just need the truncate.
+ if (N0.getOperand(0).getValueType().bitsGT(VT))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+ // if the source and dest are the same type, we can drop both the extend
+ // and the truncate.
+ return N0.getOperand(0);
+ }
+
+ // Try to narrow a truncate-of-sext_in_reg to the destination type:
+ // trunc (sign_ext_inreg X, iM) to iN --> sign_ext_inreg (trunc X to iN), iM
+ if (!LegalTypes && N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ N0.hasOneUse()) {
+ SDValue X = N0.getOperand(0);
+ SDValue ExtVal = N0.getOperand(1);
+ EVT ExtVT = cast<VTSDNode>(ExtVal)->getVT();
+ if (ExtVT.bitsLT(VT)) {
+ SDValue TrX = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, TrX, ExtVal);
+ }
+ }
+
+ // If this is anyext(trunc), don't fold it, allow ourselves to be folded.
+ if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
+ return SDValue();
+
+ // Fold extract-and-trunc into a narrow extract. For example:
+ // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
+ // i32 y = TRUNCATE(i64 x)
+ // -- becomes --
+ // v16i8 b = BITCAST (v2i64 val)
+ // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
+ //
+ // Note: We only run this optimization after type legalization (which often
+ // creates this pattern) and before operation legalization after which
+ // we need to be more careful about the vector instructions that we generate.
+ if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
+ EVT VecTy = N0.getOperand(0).getValueType();
+ EVT ExTy = N0.getValueType();
+ EVT TrTy = N->getValueType(0);
+
+ auto EltCnt = VecTy.getVectorElementCount();
+ unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
+ auto NewEltCnt = EltCnt * SizeRatio;
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt);
+ assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
+
+ SDValue EltNo = N0->getOperand(1);
+ if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
+ int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
+
+ SDLoc DL(N);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
+ DAG.getBitcast(NVT, N0.getOperand(0)),
+ DAG.getVectorIdxConstant(Index, DL));
+ }
+ }
+
+ // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
+ if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
+ TLI.isTruncateFree(SrcVT, VT)) {
+ SDLoc SL(N0);
+ SDValue Cond = N0.getOperand(0);
+ SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
+ SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
+ return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
+ }
+ }
+
+ // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
+ if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
+ TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
+ SDValue Amt = N0.getOperand(1);
+ KnownBits Known = DAG.computeKnownBits(Amt);
+ unsigned Size = VT.getScalarSizeInBits();
+ if (Known.countMaxActiveBits() <= Log2_32(Size)) {
+ SDLoc SL(N);
+ EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+ if (AmtVT != Amt.getValueType()) {
+ Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
+ AddToWorklist(Amt.getNode());
+ }
+ return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
+ }
+ }
+
+ if (SDValue V = foldSubToUSubSat(VT, N0.getNode()))
+ return V;
+
+ if (SDValue ABD = foldABSToABD(N))
+ return ABD;
+
+ // Attempt to pre-truncate BUILD_VECTOR sources.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
+ TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
+ // Avoid creating illegal types if running after type legalizer.
+ (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
+ SDLoc DL(N);
+ EVT SVT = VT.getScalarType();
+ SmallVector<SDValue, 8> TruncOps;
+ for (const SDValue &Op : N0->op_values()) {
+ SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op);
+ TruncOps.push_back(TruncOp);
+ }
+ return DAG.getBuildVector(VT, DL, TruncOps);
+ }
+
+ // Fold a series of buildvector, bitcast, and truncate if possible.
+ // For example fold
+ // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
+ // (2xi32 (buildvector x, y)).
+ if (Level == AfterLegalizeVectorOps && VT.isVector() &&
+ N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+ N0.getOperand(0).hasOneUse()) {
+ SDValue BuildVect = N0.getOperand(0);
+ EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
+ EVT TruncVecEltTy = VT.getVectorElementType();
+
+ // Check that the element types match.
+ if (BuildVectEltTy == TruncVecEltTy) {
+ // Now we only need to compute the offset of the truncated elements.
+ unsigned BuildVecNumElts = BuildVect.getNumOperands();
+ unsigned TruncVecNumElts = VT.getVectorNumElements();
+ unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;
+
+ assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
+ "Invalid number of elements");
+
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
+ Opnds.push_back(BuildVect.getOperand(i));
+
+ return DAG.getBuildVector(VT, SDLoc(N), Opnds);
+ }
+ }
+
+ // fold (truncate (load x)) -> (smaller load x)
+ // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
+ if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
+ if (SDValue Reduced = reduceLoadWidth(N))
+ return Reduced;
+
+ // Handle the case where the load remains an extending load even
+ // after truncation.
+ if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ if (LN0->isSimple() && LN0->getMemoryVT().bitsLT(VT)) {
+ SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
+ VT, LN0->getChain(), LN0->getBasePtr(),
+ LN0->getMemoryVT(),
+ LN0->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
+ return NewLoad;
+ }
+ }
+ }
+
+ // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...),
+ // where ... are all 'undef'.
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
+ SmallVector<EVT, 8> VTs;
+ SDValue V;
+ unsigned Idx = 0;
+ unsigned NumDefs = 0;
+
+ for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+ SDValue X = N0.getOperand(i);
+ if (!X.isUndef()) {
+ V = X;
+ Idx = i;
+ NumDefs++;
+ }
+ // Stop if more than one members are non-undef.
+ if (NumDefs > 1)
+ break;
+
+ VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
+ VT.getVectorElementType(),
+ X.getValueType().getVectorElementCount()));
+ }
+
+ if (NumDefs == 0)
+ return DAG.getUNDEF(VT);
+
+ if (NumDefs == 1) {
+ assert(V.getNode() && "The single defined operand is empty!");
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ if (i != Idx) {
+ Opnds.push_back(DAG.getUNDEF(VTs[i]));
+ continue;
+ }
+ SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
+ AddToWorklist(NV.getNode());
+ Opnds.push_back(NV);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
+ }
+ }
+
+ // Fold truncate of a bitcast of a vector to an extract of the low vector
+ // element.
+ //
+ // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
+ if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
+ SDValue VecSrc = N0.getOperand(0);
+ EVT VecSrcVT = VecSrc.getValueType();
+ if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
+ SDLoc SL(N);
+
+ unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
+ DAG.getVectorIdxConstant(Idx, SL));
+ }
+ }
+
+ // Simplify the operands using demanded-bits information.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (truncate (extract_subvector(ext x))) ->
+ // (extract_subvector x)
+ // TODO: This can be generalized to cover cases where the truncate and extract
+ // do not fully cancel each other out.
+ if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::SIGN_EXTEND ||
+ N00.getOpcode() == ISD::ZERO_EXTEND ||
+ N00.getOpcode() == ISD::ANY_EXTEND) {
+ if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
+ VT.getVectorElementType())
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
+ N00.getOperand(0), N0.getOperand(1));
+ }
+ }
+
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
+ // Narrow a suitable binary operation with a non-opaque constant operand by
+ // moving it ahead of the truncate. This is limited to pre-legalization
+ // because targets may prefer a wider type during later combines and invert
+ // this transform.
+ switch (N0.getOpcode()) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ if (!LegalOperations && N0.hasOneUse() &&
+ (isConstantOrConstantVector(N0.getOperand(0), true) ||
+ isConstantOrConstantVector(N0.getOperand(1), true))) {
+ // TODO: We already restricted this to pre-legalization, but for vectors
+ // we are extra cautious to not create an unsupported operation.
+ // Target-specific changes are likely needed to avoid regressions here.
+ if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
+ SDLoc DL(N);
+ SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
+ SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
+ return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+ }
+ }
+ break;
+ case ISD::ADDE:
+ case ISD::UADDO_CARRY:
+ // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
+ // (trunc uaddo_carry(X, Y, Carry)) ->
+ // (uaddo_carry trunc(X), trunc(Y), Carry)
+ // when the carry result is not used.
+ // We only do this for uaddo_carry before operation legalization.
+ if (((!LegalOperations && N0.getOpcode() == ISD::UADDO_CARRY) ||
+ TLI.isOperationLegal(N0.getOpcode(), VT)) &&
+ N0.hasOneUse() && !N0->hasAnyUseOfValue(1)) {
+ SDLoc DL(N);
+ SDValue X = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
+ SDValue Y = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
+ SDVTList VTs = DAG.getVTList(VT, N0->getValueType(1));
+ return DAG.getNode(N0.getOpcode(), DL, VTs, X, Y, N0.getOperand(2));
+ }
+ break;
+ case ISD::USUBSAT:
+ // Truncate the USUBSAT only if LHS is a known zero-extension; it's not
+ // enough to know that the upper bits are zero, we must also ensure that we
+ // don't introduce an extra truncate.
+ if (!LegalOperations && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getOperand(0).getScalarValueSizeInBits() <=
+ VT.getScalarSizeInBits() &&
+ hasOperation(N0.getOpcode(), VT)) {
+ return getTruncatedUSUBSAT(VT, SrcVT, N0.getOperand(0), N0.getOperand(1),
+ DAG, SDLoc(N));
+ }
+ break;
+ }
+
+ return SDValue();
+}
+
+static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
+ SDValue Elt = N->getOperand(i);
+ if (Elt.getOpcode() != ISD::MERGE_VALUES)
+ return Elt.getNode();
+ return Elt.getOperand(Elt.getResNo()).getNode();
+}
+
+/// build_pair (load, load) -> load
+/// if load locations are consecutive.
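+ /// For example, on a little-endian target,
+ ///   (i64 build_pair (i32 load p), (i32 load p+4))
+ /// can become a single i64 load from p, provided the two loads are
+ /// consecutive, non-extending, single-use, and the wide access is fast.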
+SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
+ assert(N->getOpcode() == ISD::BUILD_PAIR);
+
+ auto *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
+ auto *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
+
+ // A BUILD_PAIR always has the least significant part in elt 0 and the most
+ // significant part in elt 1. So when combining into one large load, we need
+ // to consider the endianness.
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(LD1, LD2);
+
+ if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !ISD::isNON_EXTLoad(LD2) ||
+ !LD1->hasOneUse() || !LD2->hasOneUse() ||
+ LD1->getAddressSpace() != LD2->getAddressSpace())
+ return SDValue();
+
+ unsigned LD1Fast = 0;
+ EVT LD1VT = LD1->getValueType(0);
+ unsigned LD1Bytes = LD1VT.getStoreSize();
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
+ DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1) &&
+ TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *LD1->getMemOperand(), &LD1Fast) && LD1Fast)
+ return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
+ LD1->getPointerInfo(), LD1->getAlign());
+
+ return SDValue();
+}
+
+static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
+ // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi
+ // and Lo parts; on big-endian machines it doesn't.
+ return DAG.getDataLayout().isBigEndian() ? 1 : 0;
+}
+
+SDValue DAGCombiner::foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // If this is not a bitcast to an FP type or if the target doesn't have
+ // IEEE754-compliant FP logic, we're done.
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ EVT SourceVT = N0.getValueType();
+
+ if (!VT.isFloatingPoint())
+ return SDValue();
+
+ // TODO: Handle cases where the integer constant is a different scalar
+ // bitwidth to the FP.
+ if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits())
+ return SDValue();
+
+ unsigned FPOpcode;
+ APInt SignMask;
+ switch (N0.getOpcode()) {
+ case ISD::AND:
+ FPOpcode = ISD::FABS;
+ SignMask = ~APInt::getSignMask(SourceVT.getScalarSizeInBits());
+ break;
+ case ISD::XOR:
+ FPOpcode = ISD::FNEG;
+ SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
+ break;
+ case ISD::OR:
+ FPOpcode = ISD::FABS;
+ SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
+ break;
+ default:
+ return SDValue();
+ }
+
+ if (LegalOperations && !TLI.isOperationLegal(FPOpcode, VT))
+ return SDValue();
+
+ // This needs to be the inverse of logic in foldSignChangeInBitcast.
+ // FIXME: I don't think looking for bitcast intrinsically makes sense, but
+ // removing this would require more changes.
+ auto IsBitCastOrFree = [&TLI, FPOpcode](SDValue Op, EVT VT) {
+ if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT)
+ return true;
+
+ return FPOpcode == ISD::FABS ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT);
+ };
+
+ // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
+ // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
+ // Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) ->
+ // fneg (fabs X)
+ SDValue LogicOp0 = N0.getOperand(0);
+ ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true);
+ if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
+ IsBitCastOrFree(LogicOp0, VT)) {
+ SDValue CastOp0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, LogicOp0);
+ SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, CastOp0);
+ NumFPLogicOpsConv++;
+ if (N0.getOpcode() == ISD::OR)
+ return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp);
+ return FPOp;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBITCAST(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
+ // If the input is a BUILD_VECTOR with all constant elements, fold this now.
+ // Only do this before legalize types, unless both types are integer and the
+ // scalar type is legal. Only do this before legalize ops, since the target
+ // may be depending on the bitcast.
+ // First check to see if this is all constant.
+ // TODO: Support FP bitcasts after legalize types.
+ if (VT.isVector() &&
+ (!LegalTypes ||
+ (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
+ TLI.isTypeLegal(VT.getVectorElementType()))) &&
+ N0.getOpcode() == ISD::BUILD_VECTOR && N0->hasOneUse() &&
+ cast<BuildVectorSDNode>(N0)->isConstant())
+ return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
+ VT.getVectorElementType());
+
+ // If the input is a constant, let getNode fold it.
+ if (isIntOrFPConstant(N0)) {
+ // If we can't allow illegal operations, we need to check that this is just
+ // an fp -> int or int -> fp conversion and that the resulting operation will
+ // be legal.
+ if (!LegalOperations ||
+ (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
+ TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
+ (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
+ TLI.isOperationLegal(ISD::Constant, VT))) {
+ SDValue C = DAG.getBitcast(VT, N0);
+ if (C.getNode() != N)
+ return C;
+ }
+ }
+
+ // (conv (conv x, t1), t2) -> (conv x, t2)
+ if (N0.getOpcode() == ISD::BITCAST)
+ return DAG.getBitcast(VT, N0.getOperand(0));
+
+ // fold (conv (logicop (conv x), (c))) -> (logicop x, (conv c))
+ // iff the current bitwise logicop type isn't legal
+ if (ISD::isBitwiseLogicOp(N0.getOpcode()) && VT.isInteger() &&
+ !TLI.isTypeLegal(N0.getOperand(0).getValueType())) {
+ auto IsFreeBitcast = [VT](SDValue V) {
+ return (V.getOpcode() == ISD::BITCAST &&
+ V.getOperand(0).getValueType() == VT) ||
+ (ISD::isBuildVectorOfConstantSDNodes(V.getNode()) &&
+ V->hasOneUse());
+ };
+ if (IsFreeBitcast(N0.getOperand(0)) && IsFreeBitcast(N0.getOperand(1)))
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
+ DAG.getBitcast(VT, N0.getOperand(0)),
+ DAG.getBitcast(VT, N0.getOperand(1)));
+ }
+
+ // fold (conv (load x)) -> (load (conv*)x)
+ // If the resultant load doesn't need a higher alignment than the original!
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ // Do not remove the cast if the types differ in endian layout.
+ TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
+ TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
+ // If the load is volatile, we only want to change the load type if the
+ // resulting load is legal. Otherwise we might increase the number of
+ // memory accesses. We don't care if the original type was legal or not
+ // as we assume software couldn't rely on the number of accesses of an
+ // illegal type.
+ ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
+ TLI.isOperationLegal(ISD::LOAD, VT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+ if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+ *LN0->getMemOperand())) {
+ SDValue Load =
+ DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+ LN0->getPointerInfo(), LN0->getAlign(),
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+ return Load;
+ }
+ }
+
+ if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
+ return V;
+
+ // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
+ // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
+ //
+ // For ppc_fp128:
+ // fold (bitcast (fneg x)) ->
+ // flipbit = signbit
+ // (xor (bitcast x) (build_pair flipbit, flipbit))
+ //
+ // fold (bitcast (fabs x)) ->
+ // flipbit = (and (extract_element (bitcast x), 0), signbit)
+ // (xor (bitcast x) (build_pair flipbit, flipbit))
+ // This often reduces constant pool loads.
+ if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
+ (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
+ N0->hasOneUse() && VT.isInteger() && !VT.isVector() &&
+ !N0.getValueType().isVector()) {
+ SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
+ AddToWorklist(NewConv.getNode());
+
+ SDLoc DL(N);
+ if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
+ assert(VT.getSizeInBits() == 128);
+ SDValue SignBit = DAG.getConstant(
+ APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
+ SDValue FlipBit;
+ if (N0.getOpcode() == ISD::FNEG) {
+ FlipBit = SignBit;
+ AddToWorklist(FlipBit.getNode());
+ } else {
+ assert(N0.getOpcode() == ISD::FABS);
+ SDValue Hi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
+ DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
+ SDLoc(NewConv)));
+ AddToWorklist(Hi.getNode());
+ FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
+ AddToWorklist(FlipBit.getNode());
+ }
+ SDValue FlipBits =
+ DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
+ AddToWorklist(FlipBits.getNode());
+ return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
+ }
+ APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
+ if (N0.getOpcode() == ISD::FNEG)
+ return DAG.getNode(ISD::XOR, DL, VT,
+ NewConv, DAG.getConstant(SignBit, DL, VT));
+ assert(N0.getOpcode() == ISD::FABS);
+ return DAG.getNode(ISD::AND, DL, VT,
+ NewConv, DAG.getConstant(~SignBit, DL, VT));
+ }
+
+ // fold (bitconvert (fcopysign cst, x)) ->
+ // (or (and (bitconvert x), sign), (and cst, (not sign)))
+ // Note that we don't handle (copysign x, cst) because this can always be
+ // folded to an fneg or fabs.
+ //
+ // For ppc_fp128:
+ // fold (bitcast (fcopysign cst, x)) ->
+ // flipbit = (and (extract_element
+ // (xor (bitcast cst), (bitcast x)), 0),
+ // signbit)
+ // (xor (bitcast cst) (build_pair flipbit, flipbit))
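+ // For example, in the non-ppc_fp128 case with f64 (sign mask
+ // 0x8000000000000000), the generic fold above instantiates to:
+ //   (i64 (bitcast (fcopysign 1.0, X)))
+ //     -> (or (and (i64 (bitcast X)), 0x8000000000000000),
+ //            (and (i64 (bitcast 1.0)), 0x7FFFFFFFFFFFFFFF))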
+ if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse() &&
+ isa<ConstantFPSDNode>(N0.getOperand(0)) && VT.isInteger() &&
+ !VT.isVector()) {
+ unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
+ EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
+ if (isTypeLegal(IntXVT)) {
+ SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
+ AddToWorklist(X.getNode());
+
+ // If X has a different width than the result/lhs, sext it or truncate it.
+ unsigned VTWidth = VT.getSizeInBits();
+ if (OrigXWidth < VTWidth) {
+ X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
+ AddToWorklist(X.getNode());
+ } else if (OrigXWidth > VTWidth) {
+ // To get the sign bit in the right place, we have to shift it right
+ // before truncating.
+ SDLoc DL(X);
+ X = DAG.getNode(ISD::SRL, DL,
+ X.getValueType(), X,
+ DAG.getConstant(OrigXWidth-VTWidth, DL,
+ X.getValueType()));
+ AddToWorklist(X.getNode());
+ X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
+ AddToWorklist(X.getNode());
+ }
+
+ if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
+ APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
+ SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
+ AddToWorklist(Cst.getNode());
+ SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
+ AddToWorklist(X.getNode());
+ SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
+ AddToWorklist(XorResult.getNode());
+ SDValue XorResult64 = DAG.getNode(
+ ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
+ DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
+ SDLoc(XorResult)));
+ AddToWorklist(XorResult64.getNode());
+ SDValue FlipBit =
+ DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
+ DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
+ AddToWorklist(FlipBit.getNode());
+ SDValue FlipBits =
+ DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
+ AddToWorklist(FlipBits.getNode());
+ return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
+ }
+ APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
+ X = DAG.getNode(ISD::AND, SDLoc(X), VT,
+ X, DAG.getConstant(SignBit, SDLoc(X), VT));
+ AddToWorklist(X.getNode());
+
+ SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
+ Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
+ Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
+ AddToWorklist(Cst.getNode());
+
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
+ }
+ }
+
+ // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
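+ // For example, an i64 build_pair of two adjacent i32 loads can become a
+ // single i64 load from the lower address, subject to the consecutiveness and
+ // legality checks performed in CombineConsecutiveLoads.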
+ if (N0.getOpcode() == ISD::BUILD_PAIR)
+ if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
+ return CombineLD;
+
+ // Remove double bitcasts from shuffles - this is often a legacy of
+ // XformToShuffleWithZero being used to combine bitmaskings (of
+ // float vectors bitcast to integer vectors) into shuffles.
+ // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
+ if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
+ N0->getOpcode() == ISD::VECTOR_SHUFFLE && N0.hasOneUse() &&
+ VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
+ !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
+
+ // If operands are a bitcast, peek through if it casts the original VT.
+ // If operands are a constant, just bitcast back to original VT.
+ auto PeekThroughBitcast = [&](SDValue Op) {
+ if (Op.getOpcode() == ISD::BITCAST &&
+ Op.getOperand(0).getValueType() == VT)
+ return SDValue(Op.getOperand(0));
+ if (Op.isUndef() || isAnyConstantBuildVector(Op))
+ return DAG.getBitcast(VT, Op);
+ return SDValue();
+ };
+
+ // FIXME: If either input vector is bitcast, try to convert the shuffle to
+ // the result type of this bitcast. This would eliminate at least one
+ // bitcast. See the transform in InstCombine.
+ SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
+ SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
+ if (!(SV0 && SV1))
+ return SDValue();
+
+ int MaskScale =
+ VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
+ SmallVector<int, 8> NewMask;
+ for (int M : SVN->getMask())
+ for (int i = 0; i != MaskScale; ++i)
+ NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);
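+ // For example, bitcasting a v2i64 shuffle with mask <1,0> to v4i32 gives
+ // MaskScale == 2 and NewMask == <2,3,0,1>.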
+
+ SDValue LegalShuffle =
+ TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG);
+ if (LegalShuffle)
+ return LegalShuffle;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ return CombineConsecutiveLoads(N, VT);
+}
+
+SDValue DAGCombiner::visitFREEZE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
+ return N0;
+
+ // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
+ // Try to push freeze through instructions that propagate but don't produce
+ // poison as far as possible. If the operand of the freeze satisfies three
+ // conditions: 1) it has one use, 2) it does not produce poison, and 3) all
+ // but at most one of its operands are guaranteed non-poison (or it is a
+ // BUILD_VECTOR or similar), then push the freeze through to the operands
+ // that are not guaranteed non-poison.
+ // NOTE: we will strip poison-generating flags, so ignore them here.
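+ // For example, freeze (add x, 42) can become (add (freeze x), 42): the
+ // constant operand is guaranteed non-poison, and a plain ADD (with its
+ // poison-generating flags stripped) cannot introduce new poison.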
+ if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
+ /*ConsiderFlags*/ false) ||
+ N0->getNumValues() != 1 || !N0->hasOneUse())
+ return SDValue();
+
+ bool AllowMultipleMaybePoisonOperands = N0.getOpcode() == ISD::BUILD_VECTOR ||
+ N0.getOpcode() == ISD::BUILD_PAIR ||
+ N0.getOpcode() == ISD::CONCAT_VECTORS;
+
+ SmallSetVector<SDValue, 8> MaybePoisonOperands;
+ for (SDValue Op : N0->ops()) {
+ if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false,
+ /*Depth*/ 1))
+ continue;
+ bool HadMaybePoisonOperands = !MaybePoisonOperands.empty();
+ bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op);
+ if (!HadMaybePoisonOperands)
+ continue;
+ if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
+ // Multiple maybe-poison ops when not allowed - bail out.
+ return SDValue();
+ }
+ }
+ // NOTE: the whole op may still not be guaranteed not to be undef or poison,
+ // because it could create undef or poison due to its poison-generating flags.
+ // So not finding any maybe-poison operands is fine.
+
+ for (SDValue MaybePoisonOperand : MaybePoisonOperands) {
+ // Don't replace every single UNDEF everywhere with frozen UNDEF, though.
+ if (MaybePoisonOperand.getOpcode() == ISD::UNDEF)
+ continue;
+ // First, freeze each offending operand.
+ SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand);
+ // Then, change all other uses of unfrozen operand to use frozen operand.
+ DAG.ReplaceAllUsesOfValueWith(MaybePoisonOperand, FrozenMaybePoisonOperand);
+ if (FrozenMaybePoisonOperand.getOpcode() == ISD::FREEZE &&
+ FrozenMaybePoisonOperand.getOperand(0) == FrozenMaybePoisonOperand) {
+ // But, that also updated the use in the freeze we just created, thus
+ // creating a cycle in a DAG. Let's undo that by mutating the freeze.
+ DAG.UpdateNodeOperands(FrozenMaybePoisonOperand.getNode(),
+ MaybePoisonOperand);
+ }
+ }
+
+ // This node has been merged with another.
+ if (N->getOpcode() == ISD::DELETED_NODE)
+ return SDValue(N, 0);
+
+ // The whole node may have been updated, so the value we were holding
+ // may no longer be valid. Re-fetch the operand we're `freeze`ing.
+ N0 = N->getOperand(0);
+
+ // Finally, recreate the node; its operands were updated to use frozen
+ // operands, so we just need to use its "original" operands.
+ SmallVector<SDValue> Ops(N0->op_begin(), N0->op_end());
+ // Special-handle ISD::UNDEF: each single one of them can be its own thing.
+ for (SDValue &Op : Ops) {
+ if (Op.getOpcode() == ISD::UNDEF)
+ Op = DAG.getFreeze(Op);
+ }
+ // NOTE: this strips poison generating flags.
+ SDValue R = DAG.getNode(N0.getOpcode(), SDLoc(N0), N0->getVTList(), Ops);
+ assert(DAG.isGuaranteedNotToBeUndefOrPoison(R, /*PoisonOnly*/ false) &&
+ "Can't create node that may be undef/poison!");
+ return R;
+}
+
+/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
+/// operands. DstEltVT indicates the destination element value type.
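+ /// For example, on a little-endian target a v2i32 build_vector <1, 2>
+ /// bitcast to v1i64 folds to the constant 0x0000000200000001.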
+SDValue DAGCombiner::
+ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
+ EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
+
+ // If this is already the right type, we're done.
+ if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
+
+ unsigned SrcBitSize = SrcEltVT.getSizeInBits();
+ unsigned DstBitSize = DstEltVT.getSizeInBits();
+
+ // If this is a conversion of N elements of one type to N elements of another
+ // type, convert each element. This handles FP<->INT cases.
+ if (SrcBitSize == DstBitSize) {
+ SmallVector<SDValue, 8> Ops;
+ for (SDValue Op : BV->op_values()) {
+ // If the vector element type is not legal, the BUILD_VECTOR operands
+ // are promoted and implicitly truncated. Make that explicit here.
+ if (Op.getValueType() != SrcEltVT)
+ Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
+ Ops.push_back(DAG.getBitcast(DstEltVT, Op));
+ AddToWorklist(Ops.back().getNode());
+ }
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
+ BV->getValueType(0).getVectorNumElements());
+ return DAG.getBuildVector(VT, SDLoc(BV), Ops);
+ }
+
+ // Otherwise, we're growing or shrinking the elements. To avoid having to
+ // handle annoying details of growing/shrinking FP values, we convert them to
+ // int first.
+ if (SrcEltVT.isFloatingPoint()) {
+ // Convert the input float vector to an int vector where the elements are the
+ // same size.
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
+ BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
+ SrcEltVT = IntVT;
+ }
+
+ // Now we know the input is an integer vector. If the output is an FP type,
+ // convert to integer first, then to FP of the right size.
+ if (DstEltVT.isFloatingPoint()) {
+ EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
+ SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
+
+ // Next, convert to FP elements of the same size.
+ return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
+ }
+
+ // Okay, we know the src/dst types are both integers of differing types.
+ assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
+
+ // TODO: Should ConstantFoldBITCASTofBUILD_VECTOR always take a
+ // BuildVectorSDNode?
+ auto *BVN = cast<BuildVectorSDNode>(BV);
+
+ // Extract the constant raw bit data.
+ BitVector UndefElements;
+ SmallVector<APInt> RawBits;
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ if (!BVN->getConstantRawBits(IsLE, DstBitSize, RawBits, UndefElements))
+ return SDValue();
+
+ SDLoc DL(BV);
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned I = 0, E = RawBits.size(); I != E; ++I) {
+ if (UndefElements[I])
+ Ops.push_back(DAG.getUNDEF(DstEltVT));
+ else
+ Ops.push_back(DAG.getConstant(RawBits[I], DL, DstEltVT));
+ }
+
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
+ return DAG.getBuildVector(VT, DL, Ops);
+}
+
+ // Returns true if floating-point contraction is allowed on the FMUL SDValue `N`.
+static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
+ assert(N.getOpcode() == ISD::FMUL);
+
+ return Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ N->getFlags().hasAllowContract();
+}
+
+// Returns true if `N` can assume no infinities involved in its computation.
+static bool hasNoInfs(const TargetOptions &Options, SDValue N) {
+ return Options.NoInfsFPMath || N->getFlags().hasNoInfs();
+}
+
+/// Try to perform FMA combining on a given FADD node.
+template <class MatchContextClass>
+SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+ MatchContextClass matcher(DAG, TLI, N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+
+ bool UseVP = std::is_same_v<MatchContextClass, VPMatchContext>;
+
+ // Floating-point multiply-add with intermediate rounding.
+ // FIXME: Make isFMADLegal have specific behavior when using VPMatchContext.
+ // FIXME: Add VP_FMAD opcode.
+ bool HasFMAD = !UseVP && (LegalOperations && TLI.isFMADLegal(DAG, N));
+
+ // Floating-point multiply-add without intermediate rounding.
+ bool HasFMA =
+ TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
+ (!LegalOperations || matcher.isOperationLegalOrCustom(ISD::FMA, VT));
+
+ // No valid opcode, do not combine.
+ if (!HasFMAD && !HasFMA)
+ return SDValue();
+
+ bool CanReassociate =
+ Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
+ bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath || HasFMAD);
+ // If the addition is not contractable, do not combine.
+ if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
+ return SDValue();
+
+ // Folding fadd (fmul x, y), (fmul x, y) -> fma x, y, (fmul x, y) is never
+ // beneficial. It does not reduce latency. It increases register pressure. It
+ // replaces an fadd with an fma which is a more complex instruction, so is
+ // likely to have a larger encoding, use more functional units, etc.
+ if (N0 == N1)
+ return SDValue();
+
+ if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
+ return SDValue();
+
+ // Always prefer FMAD to FMA for precision.
+ unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+ bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+
+ auto isFusedOp = [&](SDValue N) {
+ return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD);
+ };
+
+ // Is the node an FMUL and contractable either due to global flags or
+ // SDNodeFlags.
+ auto isContractableFMUL = [AllowFusionGlobally, &matcher](SDValue N) {
+ if (!matcher.match(N, ISD::FMUL))
+ return false;
+ return AllowFusionGlobally || N->getFlags().hasAllowContract();
+ };
+ // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+ if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
+ if (N0->use_size() > N1->use_size())
+ std::swap(N0, N1);
+ }
+
+ // fold (fadd (fmul x, y), z) -> (fma x, y, z)
+ if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
+ return matcher.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
+ N0.getOperand(1), N1);
+ }
+
+ // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
+ // Note: Commutes FADD operands.
+ if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
+ return matcher.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0),
+ N1.getOperand(1), N0);
+ }
+
+ // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E)
+ // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E)
+ // This also works with nested fma instructions:
+ // fadd (fma A, B, (fma C, D, (fmul E, F))), G -->
+ // fma A, B, (fma C, D, (fma E, F, G))
+ // fadd G, (fma A, B, (fma C, D, (fmul E, F))) -->
+ // fma A, B, (fma C, D, (fma E, F, G)).
+ // This requires reassociation because it changes the order of operations.
+ if (CanReassociate) {
+ SDValue FMA, E;
+ if (isFusedOp(N0) && N0.hasOneUse()) {
+ FMA = N0;
+ E = N1;
+ } else if (isFusedOp(N1) && N1.hasOneUse()) {
+ FMA = N1;
+ E = N0;
+ }
+
+ SDValue TmpFMA = FMA;
+ while (E && isFusedOp(TmpFMA) && TmpFMA.hasOneUse()) {
+ SDValue FMul = TmpFMA->getOperand(2);
+ if (matcher.match(FMul, ISD::FMUL) && FMul.hasOneUse()) {
+ SDValue C = FMul.getOperand(0);
+ SDValue D = FMul.getOperand(1);
+ SDValue CDE = matcher.getNode(PreferredFusedOpcode, SL, VT, C, D, E);
+ DAG.ReplaceAllUsesOfValueWith(FMul, CDE);
+ // Replacing the inner FMul could cause the outer FMA to be simplified
+ // away.
+ return FMA.getOpcode() == ISD::DELETED_NODE ? SDValue() : FMA;
+ }
+
+ TmpFMA = TmpFMA->getOperand(2);
+ }
+ }
+
+ // Look through FP_EXTEND nodes to do more combining.
+
+ // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
+ if (matcher.match(N0, ISD::FP_EXTEND)) {
+ SDValue N00 = N0.getOperand(0);
+ if (isContractableFMUL(N00) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N00.getValueType())) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), N1);
+ }
+ }
+
+ // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
+ // Note: Commutes FADD operands.
+ if (matcher.match(N1, ISD::FP_EXTEND)) {
+ SDValue N10 = N1.getOperand(0);
+ if (isContractableFMUL(N10) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N10.getValueType())) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0);
+ }
+ }
+
+ // More folding opportunities when target permits.
+ if (Aggressive) {
+ // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
+ // -> (fma x, y, (fma (fpext u), (fpext v), z))
+ auto FoldFAddFMAFPExtFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V,
+ SDValue Z) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT, X, Y,
+ matcher.getNode(PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, U),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, V), Z));
+ };
+ if (isFusedOp(N0)) {
+ SDValue N02 = N0.getOperand(2);
+ if (matcher.match(N02, ISD::FP_EXTEND)) {
+ SDValue N020 = N02.getOperand(0);
+ if (isContractableFMUL(N020) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N020.getValueType())) {
+ return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
+ N020.getOperand(0), N020.getOperand(1),
+ N1);
+ }
+ }
+ }
+
+ // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
+ // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ auto FoldFAddFPExtFMAFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V,
+ SDValue Z) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, X),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, Y),
+ matcher.getNode(PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, U),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, V), Z));
+ };
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (isFusedOp(N00)) {
+ SDValue N002 = N00.getOperand(2);
+ if (isContractableFMUL(N002) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N00.getValueType())) {
+ return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
+ N002.getOperand(0), N002.getOperand(1),
+ N1);
+ }
+ }
+ }
+
+ // fold (fadd x, (fma y, z, (fpext (fmul u, v)))
+ // -> (fma y, z, (fma (fpext u), (fpext v), x))
+ if (isFusedOp(N1)) {
+ SDValue N12 = N1.getOperand(2);
+ if (N12.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N120 = N12.getOperand(0);
+ if (isContractableFMUL(N120) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N120.getValueType())) {
+ return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
+ N120.getOperand(0), N120.getOperand(1),
+ N0);
+ }
+ }
+ }
+
+ // fold (fadd x, (fpext (fma y, z, (fmul u, v)))
+ // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (N1.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N10 = N1.getOperand(0);
+ if (isFusedOp(N10)) {
+ SDValue N102 = N10.getOperand(2);
+ if (isContractableFMUL(N102) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N10.getValueType())) {
+ return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
+ N102.getOperand(0), N102.getOperand(1),
+ N0);
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// Try to perform FMA combining on a given FSUB node.
+template <class MatchContextClass>
+SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+ MatchContextClass matcher(DAG, TLI, N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+
+ bool UseVP = std::is_same_v<MatchContextClass, VPMatchContext>;
+
+ // Floating-point multiply-add with intermediate rounding.
+ // FIXME: Make isFMADLegal have specific behavior when using VPMatchContext.
+ // FIXME: Add VP_FMAD opcode.
+ bool HasFMAD = !UseVP && (LegalOperations && TLI.isFMADLegal(DAG, N));
+
+ // Floating-point multiply-add without intermediate rounding.
+ bool HasFMA =
+ TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
+ (!LegalOperations || matcher.isOperationLegalOrCustom(ISD::FMA, VT));
+
+ // No valid opcode, do not combine.
+ if (!HasFMAD && !HasFMA)
+ return SDValue();
+
+ const SDNodeFlags Flags = N->getFlags();
+ bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath || HasFMAD);
+
+ // If the subtraction is not contractable, do not combine.
+ if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
+ return SDValue();
+
+ if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
+ return SDValue();
+
+ // Always prefer FMAD to FMA for precision.
+ unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+ bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+ bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
+
+ // Is the node an FMUL and contractable either due to global flags or
+ // SDNodeFlags.
+ auto isContractableFMUL = [AllowFusionGlobally, &matcher](SDValue N) {
+ if (!matcher.match(N, ISD::FMUL))
+ return false;
+ return AllowFusionGlobally || N->getFlags().hasAllowContract();
+ };
+
+ // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+ auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
+ if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
+ return matcher.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
+ XY.getOperand(1),
+ matcher.getNode(ISD::FNEG, SL, VT, Z));
+ }
+ return SDValue();
+ };
+
+ // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+ // Note: Commutes FSUB operands.
+ auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
+ if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
+ YZ.getOperand(1), X);
+ }
+ return SDValue();
+ };
+
+ // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+ if (isContractableFMUL(N0) && isContractableFMUL(N1) &&
+ (N0->use_size() > N1->use_size())) {
+ // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b))
+ if (SDValue V = tryToFoldXSubYZ(N0, N1))
+ return V;
+ // fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d)))
+ if (SDValue V = tryToFoldXYSubZ(N0, N1))
+ return V;
+ } else {
+ // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+ if (SDValue V = tryToFoldXYSubZ(N0, N1))
+ return V;
+ // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+ if (SDValue V = tryToFoldXSubYZ(N0, N1))
+ return V;
+ }
+
+ // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+ if (matcher.match(N0, ISD::FNEG) && isContractableFMUL(N0.getOperand(0)) &&
+ (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
+ SDValue N00 = N0.getOperand(0).getOperand(0);
+ SDValue N01 = N0.getOperand(0).getOperand(1);
+ return matcher.getNode(PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FNEG, SL, VT, N00), N01,
+ matcher.getNode(ISD::FNEG, SL, VT, N1));
+ }
+
+ // Look through FP_EXTEND nodes to do more combining.
+
+ // fold (fsub (fpext (fmul x, y)), z)
+ // -> (fma (fpext x), (fpext y), (fneg z))
+ if (matcher.match(N0, ISD::FP_EXTEND)) {
+ SDValue N00 = N0.getOperand(0);
+ if (isContractableFMUL(N00) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N00.getValueType())) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
+ matcher.getNode(ISD::FNEG, SL, VT, N1));
+ }
+ }
+
+ // fold (fsub x, (fpext (fmul y, z)))
+ // -> (fma (fneg (fpext y)), (fpext z), x)
+ // Note: Commutes FSUB operands.
+ if (matcher.match(N1, ISD::FP_EXTEND)) {
+ SDValue N10 = N1.getOperand(0);
+ if (isContractableFMUL(N10) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N10.getValueType())) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(
+ ISD::FNEG, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0);
+ }
+ }
+
+ // fold (fsub (fpext (fneg (fmul x, y))), z)
+ // -> (fneg (fma (fpext x), (fpext y), z))
+ // Note: This could be removed with appropriate canonicalization of the
+ // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
+ // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+ // us from implementing the canonicalization in visitFSUB.
+ if (matcher.match(N0, ISD::FP_EXTEND)) {
+ SDValue N00 = N0.getOperand(0);
+ if (matcher.match(N00, ISD::FNEG)) {
+ SDValue N000 = N00.getOperand(0);
+ if (isContractableFMUL(N000) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N00.getValueType())) {
+ return matcher.getNode(
+ ISD::FNEG, SL, VT,
+ matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)),
+ N1));
+ }
+ }
+ }
+
+ // fold (fsub (fneg (fpext (fmul x, y))), z)
+ // -> (fneg (fma (fpext x), (fpext y), z))
+ // Note: This could be removed with appropriate canonicalization of the
+ // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
+ // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+ // us from implementing the canonicalization in visitFSUB.
+ if (matcher.match(N0, ISD::FNEG)) {
+ SDValue N00 = N0.getOperand(0);
+ if (matcher.match(N00, ISD::FP_EXTEND)) {
+ SDValue N000 = N00.getOperand(0);
+ if (isContractableFMUL(N000) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N000.getValueType())) {
+ return matcher.getNode(
+ ISD::FNEG, SL, VT,
+ matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)),
+ N1));
+ }
+ }
+ }
+
+ auto isReassociable = [&Options](SDNode *N) {
+ return Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
+ };
+
+ auto isContractableAndReassociableFMUL = [&isContractableFMUL,
+ &isReassociable](SDValue N) {
+ return isContractableFMUL(N) && isReassociable(N.getNode());
+ };
+
+ auto isFusedOp = [&](SDValue N) {
+ return matcher.match(N, ISD::FMA) || matcher.match(N, ISD::FMAD);
+ };
+
+ // More folding opportunities when target permits.
+ if (Aggressive && isReassociable(N)) {
+ bool CanFuse = Options.UnsafeFPMath || N->getFlags().hasAllowContract();
+ // fold (fsub (fma x, y, (fmul u, v)), z)
+ // -> (fma x, y, (fma u, v, (fneg z)))
+ if (CanFuse && isFusedOp(N0) &&
+ isContractableAndReassociableFMUL(N0.getOperand(2)) &&
+ N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1),
+ matcher.getNode(PreferredFusedOpcode, SL, VT,
+ N0.getOperand(2).getOperand(0),
+ N0.getOperand(2).getOperand(1),
+ matcher.getNode(ISD::FNEG, SL, VT, N1)));
+ }
+
+ // fold (fsub x, (fma y, z, (fmul u, v)))
+ // -> (fma (fneg y), z, (fma (fneg u), v, x))
+ if (CanFuse && isFusedOp(N1) &&
+ isContractableAndReassociableFMUL(N1.getOperand(2)) &&
+ N1->hasOneUse() && NoSignedZero) {
+ SDValue N20 = N1.getOperand(2).getOperand(0);
+ SDValue N21 = N1.getOperand(2).getOperand(1);
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
+ N1.getOperand(1),
+ matcher.getNode(PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FNEG, SL, VT, N20), N21, N0));
+ }
+
+ // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
+ // -> (fma x, y, (fma (fpext u), (fpext v), (fneg z)))
+ if (isFusedOp(N0) && N0->hasOneUse()) {
+ SDValue N02 = N0.getOperand(2);
+ if (matcher.match(N02, ISD::FP_EXTEND)) {
+ SDValue N020 = N02.getOperand(0);
+ if (isContractableAndReassociableFMUL(N020) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N020.getValueType())) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1),
+ matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)),
+ matcher.getNode(ISD::FNEG, SL, VT, N1)));
+ }
+ }
+ }
+
+ // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
+ // -> (fma (fpext x), (fpext y),
+ // (fma (fpext u), (fpext v), (fneg z)))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (matcher.match(N0, ISD::FP_EXTEND)) {
+ SDValue N00 = N0.getOperand(0);
+ if (isFusedOp(N00)) {
+ SDValue N002 = N00.getOperand(2);
+ if (isContractableAndReassociableFMUL(N002) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N00.getValueType())) {
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
+ matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)),
+ matcher.getNode(ISD::FNEG, SL, VT, N1)));
+ }
+ }
+ }
+
+ // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
+ // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
+ if (isFusedOp(N1) && matcher.match(N1.getOperand(2), ISD::FP_EXTEND) &&
+ N1->hasOneUse()) {
+ SDValue N120 = N1.getOperand(2).getOperand(0);
+ if (isContractableAndReassociableFMUL(N120) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ N120.getValueType())) {
+ SDValue N1200 = N120.getOperand(0);
+ SDValue N1201 = N120.getOperand(1);
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
+ N1.getOperand(1),
+ matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FNEG, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N1200)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0));
+ }
+ }
+
+ // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
+ // -> (fma (fneg (fpext y)), (fpext z),
+ // (fma (fneg (fpext u)), (fpext v), x))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (matcher.match(N1, ISD::FP_EXTEND) && isFusedOp(N1.getOperand(0))) {
+ SDValue CvtSrc = N1.getOperand(0);
+ SDValue N100 = CvtSrc.getOperand(0);
+ SDValue N101 = CvtSrc.getOperand(1);
+ SDValue N102 = CvtSrc.getOperand(2);
+ if (isContractableAndReassociableFMUL(N102) &&
+ TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
+ CvtSrc.getValueType())) {
+ SDValue N1020 = N102.getOperand(0);
+ SDValue N1021 = N102.getOperand(1);
+ return matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FNEG, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N100)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N101),
+ matcher.getNode(
+ PreferredFusedOpcode, SL, VT,
+ matcher.getNode(ISD::FNEG, SL, VT,
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N1020)),
+ matcher.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0));
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// Try to perform FMA combining on a given FMUL node based on the distributive
+/// law x * (y + 1) = x * y + x and variants thereof (commuted versions,
+/// subtraction instead of addition).
+SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
+
+ const TargetOptions &Options = DAG.getTarget().Options;
+
+ // The transforms below are incorrect when x == 0 and y == inf, because the
+ // intermediate multiplication produces a nan.
+ SDValue FAdd = N0.getOpcode() == ISD::FADD ? N0 : N1;
+ if (!hasNoInfs(Options, FAdd))
+ return SDValue();
+
+ // Floating-point multiply-add without intermediate rounding.
+ bool HasFMA =
+ isContractableFMUL(Options, SDValue(N, 0)) &&
+ TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
+
+ // Floating-point multiply-add with intermediate rounding. This can result
+ // in a less precise result due to the changed rounding order.
+ bool HasFMAD = Options.UnsafeFPMath &&
+ (LegalOperations && TLI.isFMADLegal(DAG, N));
+
+ // No valid opcode, do not combine.
+ if (!HasFMAD && !HasFMA)
+ return SDValue();
+
+ // Always prefer FMAD to FMA for precision.
+ unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+ bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+
+ // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
+ // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
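+ // These follow from distributing the multiply: (x0 + 1.0) * y == x0 * y + y
+ // and (x0 - 1.0) * y == x0 * y - y.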
+ auto FuseFADD = [&](SDValue X, SDValue Y) {
+ if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
+ if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
+ if (C->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ Y);
+ if (C->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+ }
+ }
+ return SDValue();
+ };
+
+ if (SDValue FMA = FuseFADD(N0, N1))
+ return FMA;
+ if (SDValue FMA = FuseFADD(N1, N0))
+ return FMA;
+
+ // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
+ // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
+ // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
+ // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
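+ // These likewise follow from distribution, e.g. (+1.0 - x1) * y == y - x1 * y
+ // == (fma (fneg x1), y, y), and (x0 - 1.0) * y == x0 * y - y.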
+ auto FuseFSUB = [&](SDValue X, SDValue Y) {
+ if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
+ if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
+ if (C0->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+ Y);
+ if (C0->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+ }
+ if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
+ if (C1->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+ if (C1->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ Y);
+ }
+ }
+ return SDValue();
+ };
+
+ if (SDValue FMA = FuseFSUB(N0, N1))
+ return FMA;
+ if (SDValue FMA = FuseFSUB(N1, N0))
+ return FMA;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVP_FADD(SDNode *N) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ // FADD -> FMA combines:
+ if (SDValue Fused = visitFADDForFMACombine<VPMatchContext>(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDNode *N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0);
+ SDNode *N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ SDNodeFlags Flags = N->getFlags();
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
+ // fold (fadd c1, c2) -> c1 + c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::FADD, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS
+ if (N0CFP && !N1CFP)
+ return DAG.getNode(ISD::FADD, DL, VT, N1, N0);
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
+ ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
+ if (N1C && N1C->isZero())
+ if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())
+ return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // fold (fadd A, (fneg B)) -> (fsub A, B)
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
+ if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
+ N1, DAG, LegalOperations, ForCodeSize))
+ return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1);
+
+ // fold (fadd (fneg A), B) -> (fsub B, A)
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
+ if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
+ N0, DAG, LegalOperations, ForCodeSize))
+ return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0);
+
+ auto isFMulNegTwo = [](SDValue FMul) {
+ if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
+ return false;
+ auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
+ return C && C->isExactlyValue(-2.0);
+ };
+
+ // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
+ if (isFMulNegTwo(N0)) {
+ SDValue B = N0.getOperand(0);
+ SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B);
+ return DAG.getNode(ISD::FSUB, DL, VT, N1, Add);
+ }
+ // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
+ if (isFMulNegTwo(N1)) {
+ SDValue B = N1.getOperand(0);
+ SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B);
+ return DAG.getNode(ISD::FSUB, DL, VT, N0, Add);
+ }
+
+ // No FP constant should be created after legalization, as the Instruction
+ // Selection pass has a hard time dealing with FP constants.
+ bool AllowNewConst = (Level < AfterLegalizeDAG);
+
+ // If nnan is enabled, fold lots of things.
+ if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
+ // If allowed, fold (fadd (fneg x), x) -> 0.0
+ if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
+ return DAG.getConstantFP(0.0, DL, VT);
+
+ // If allowed, fold (fadd x, (fneg x)) -> 0.0
+ if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
+ return DAG.getConstantFP(0.0, DL, VT);
+ }
+
+ // If 'unsafe math' or reassoc and nsz, fold lots of things.
+ // TODO: break out portions of the transformations below for which Unsafe is
+ // considered and which do not require both nsz and reassoc
+ if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
+ (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
+ AllowNewConst) {
+ // fadd (fadd x, c1), c2 -> fadd x, c1 + c2
+ if (N1CFP && N0.getOpcode() == ISD::FADD &&
+ DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
+ SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1);
+ return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC);
+ }
+
+ // We can fold chains of FADD's of the same value into multiplications.
+ // This transform is not safe in general because we are reducing the number
+ // of rounding steps.
+ if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
+ if (N0.getOpcode() == ISD::FMUL) {
+ SDNode *CFP00 =
+ DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+ SDNode *CFP01 =
+ DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
+
+ // (fadd (fmul x, c), x) -> (fmul x, c+1)
+ if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
+ DAG.getConstantFP(1.0, DL, VT));
+ return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP);
+ }
+
+ // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
+ if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
+ DAG.getConstantFP(2.0, DL, VT));
+ return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP);
+ }
+ }
+
+ if (N1.getOpcode() == ISD::FMUL) {
+ SDNode *CFP10 =
+ DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+ SDNode *CFP11 =
+ DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
+
+ // (fadd x, (fmul x, c)) -> (fmul x, c+1)
+ if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
+ DAG.getConstantFP(1.0, DL, VT));
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP);
+ }
+
+ // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
+ if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
+ N0.getOperand(0) == N0.getOperand(1) &&
+ N1.getOperand(0) == N0.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
+ DAG.getConstantFP(2.0, DL, VT));
+ return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP);
+ }
+ }
+
+ if (N0.getOpcode() == ISD::FADD) {
+ SDNode *CFP00 =
+ DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+ // (fadd (fadd x, x), x) -> (fmul x, 3.0)
+ if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
+ (N0.getOperand(0) == N1)) {
+ return DAG.getNode(ISD::FMUL, DL, VT, N1,
+ DAG.getConstantFP(3.0, DL, VT));
+ }
+ }
+
+ if (N1.getOpcode() == ISD::FADD) {
+ SDNode *CFP10 =
+ DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+ // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
+ if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
+ N1.getOperand(0) == N0) {
+ return DAG.getNode(ISD::FMUL, DL, VT, N0,
+ DAG.getConstantFP(3.0, DL, VT));
+ }
+ }
+
+ // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
+ if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
+ N0.getOperand(0) == N0.getOperand(1) &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
+ DAG.getConstantFP(4.0, DL, VT));
+ }
+ }
+
+ // Fold fadd(vecreduce(x), vecreduce(y)) -> vecreduce(fadd(x, y))
+ if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FADD, ISD::FADD, DL,
+ VT, N0, N1, Flags))
+ return SD;
+ } // enable-unsafe-fp-math
+
+ // FADD -> FMA combines:
+ if (SDValue Fused = visitFADDForFMACombine<EmptyMatchContext>(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSTRICT_FADD(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue N0 = N->getOperand(1);
+ SDValue N1 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ EVT ChainVT = N->getValueType(1);
+ SDLoc DL(N);
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ // fold (strict_fadd A, (fneg B)) -> (strict_fsub A, B)
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT))
+ if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
+ N1, DAG, LegalOperations, ForCodeSize)) {
+ return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT),
+ {Chain, N0, NegN1});
+ }
+
+ // fold (strict_fadd (fneg A), B) -> (strict_fsub B, A)
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT))
+ if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
+ N0, DAG, LegalOperations, ForCodeSize)) {
+ return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT),
+ {Chain, N1, NegN0});
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFSUB(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
+ ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ const SDNodeFlags Flags = N->getFlags();
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
+ // fold (fsub c1, c2) -> c1-c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::FSUB, DL, VT, {N0, N1}))
+ return C;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ // (fsub A, 0) -> A
+ if (N1CFP && N1CFP->isZero()) {
+ if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
+ Flags.hasNoSignedZeros()) {
+ return N0;
+ }
+ }
+
+ if (N0 == N1) {
+ // (fsub x, x) -> 0.0
+ if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
+ return DAG.getConstantFP(0.0f, DL, VT);
+ }
+
+ // (fsub -0.0, N1) -> -N1
+ if (N0CFP && N0CFP->isZero()) {
+ if (N0CFP->isNegative() ||
+ (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
+ // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
+ // flushed to zero, unless all users treat denorms as zero (DAZ).
+ // FIXME: This transform will change the sign of a NaN and the behavior
+ // of a signaling NaN. It is only valid when a NoNaN flag is present.
+ DenormalMode DenormMode = DAG.getDenormalMode(VT);
+ if (DenormMode == DenormalMode::getIEEE()) {
+ if (SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
+ return NegN1;
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, DL, VT, N1);
+ }
+ }
+ }
+
+ if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
+ (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
+ N1.getOpcode() == ISD::FADD) {
+ // X - (X + Y) -> -Y
+ if (N0 == N1->getOperand(0))
+ return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1));
+ // X - (Y + X) -> -Y
+ if (N0 == N1->getOperand(1))
+ return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0));
+ }
+
+ // fold (fsub A, (fneg B)) -> (fadd A, B)
+ if (SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
+ return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1);
+
+ // FSUB -> FMA combines:
+ if (SDValue Fused = visitFSUBForFMACombine<EmptyMatchContext>(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFMUL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ const SDNodeFlags Flags = N->getFlags();
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
+ // fold (fmul c1, c2) -> c1*c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::FMUL, DL, VT, {N0, N1}))
+ return C;
+
+ // canonicalize constant to RHS
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
+ !DAG.isConstantFPBuildVectorOrConstantFP(N1))
+ return DAG.getNode(ISD::FMUL, DL, VT, N1, N0);
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
+ // fmul (fmul X, C1), C2 -> fmul X, C1 * C2
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
+ N0.getOpcode() == ISD::FMUL) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ // Avoid an infinite loop by making sure that N00 is not a constant
+ // (the inner multiply has not been constant folded yet).
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N01) &&
+ !DAG.isConstantFPBuildVectorOrConstantFP(N00)) {
+ SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1);
+ return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts);
+ }
+ }
+
+ // Match a special-case: we convert X * 2.0 into fadd.
+ // fmul (fadd X, X), C -> fmul X, 2.0 * C
+ if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
+ N0.getOperand(0) == N0.getOperand(1)) {
+ const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
+ SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1);
+ return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts);
+ }
+
+ // Fold fmul(vecreduce(x), vecreduce(y)) -> vecreduce(fmul(x, y))
+ if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FMUL, ISD::FMUL, DL,
+ VT, N0, N1, Flags))
+ return SD;
+ }
+
+ // fold (fmul X, 2.0) -> (fadd X, X)
+ if (N1CFP && N1CFP->isExactlyValue(+2.0))
+ return DAG.getNode(ISD::FADD, DL, VT, N0, N0);
+
+ // fold (fmul X, -1.0) -> (fsub -0.0, X)
+ if (N1CFP && N1CFP->isExactlyValue(-1.0)) {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FSUB, VT)) {
+ return DAG.getNode(ISD::FSUB, DL, VT,
+ DAG.getConstantFP(-0.0, DL, VT), N0, Flags);
+ }
+ }
+
+ // -N0 * -N1 --> N0 * N1
+ TargetLowering::NegatibleCost CostN0 =
+ TargetLowering::NegatibleCost::Expensive;
+ TargetLowering::NegatibleCost CostN1 =
+ TargetLowering::NegatibleCost::Expensive;
+ SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
+ if (NegN0) {
+ HandleSDNode NegN0Handle(NegN0);
+ SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
+ if (NegN1 && (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
+ CostN1 == TargetLowering::NegatibleCost::Cheaper))
+ return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1);
+ }
+
+ // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
+ // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
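+ // Rationale: when X > 0.0 the product is -X, otherwise it is X, and both
+ // equal (fneg (fabs X)); the second form is the mirrored case yielding
+ // (fabs X). The nnan and nsz flags rule out the NaN and signed-zero corner
+ // cases.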
+ if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
+ (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
+ TLI.isOperationLegal(ISD::FABS, VT)) {
+ SDValue Select = N0, X = N1;
+ if (Select.getOpcode() != ISD::SELECT)
+ std::swap(Select, X);
+
+ SDValue Cond = Select.getOperand(0);
+ auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
+ auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));
+
+ if (TrueOpnd && FalseOpnd &&
+ Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
+ isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
+ cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ switch (CC) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETULT:
+ case ISD::SETOLE:
+ case ISD::SETULE:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ std::swap(TrueOpnd, FalseOpnd);
+ [[fallthrough]];
+ case ISD::SETOGT:
+ case ISD::SETUGT:
+ case ISD::SETOGE:
+ case ISD::SETUGE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
+ TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, DL, VT,
+ DAG.getNode(ISD::FABS, DL, VT, X));
+ if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
+ return DAG.getNode(ISD::FABS, DL, VT, X);
+
+ break;
+ }
+ }
+ }
+
+ // FMUL -> FMA combines:
+ if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+
+ return SDValue();
+}
+
+template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ // FMA nodes have flags that propagate to the created nodes.
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+ MatchContextClass matcher(DAG, TLI, N);
+
+ bool CanReassociate =
+ Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
+
+ // Constant fold FMA.
+ if (isa<ConstantFPSDNode>(N0) &&
+ isa<ConstantFPSDNode>(N1) &&
+ isa<ConstantFPSDNode>(N2)) {
+ return matcher.getNode(ISD::FMA, DL, VT, N0, N1, N2);
+ }
+
+ // (-N0 * -N1) + N2 --> (N0 * N1) + N2
+ TargetLowering::NegatibleCost CostN0 =
+ TargetLowering::NegatibleCost::Expensive;
+ TargetLowering::NegatibleCost CostN1 =
+ TargetLowering::NegatibleCost::Expensive;
+ SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
+ if (NegN0) {
+ HandleSDNode NegN0Handle(NegN0);
+ SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
+ if (NegN1 && (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
+ CostN1 == TargetLowering::NegatibleCost::Cheaper))
+ return matcher.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2);
+ }
+
+ // FIXME: use fast math flags instead of Options.UnsafeFPMath
+ if (Options.UnsafeFPMath) {
+ if (N0CFP && N0CFP->isZero())
+ return N2;
+ if (N1CFP && N1CFP->isZero())
+ return N2;
+ }
+
+ // FIXME: Support splat of constant.
+ if (N0CFP && N0CFP->isExactlyValue(1.0))
+ return matcher.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return matcher.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);
+
+ // Canonicalize (fma c, x, y) -> (fma x, c, y)
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
+ !DAG.isConstantFPBuildVectorOrConstantFP(N1))
+ return matcher.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
+
+ if (CanReassociate) {
+ // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
+ if (matcher.match(N2, ISD::FMUL) && N0 == N2.getOperand(0) &&
+ DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
+ DAG.isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
+ return matcher.getNode(
+ ISD::FMUL, DL, VT, N0,
+ matcher.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1)));
+ }
+
+ // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
+ if (matcher.match(N0, ISD::FMUL) &&
+ DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
+ DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
+ return matcher.getNode(
+ ISD::FMA, DL, VT, N0.getOperand(0),
+ matcher.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1)), N2);
+ }
+ }
+
+ // (fma x, 1, y) -> (fadd x, y)
+ // (fma x, -1, y) -> (fadd (fneg x), y)
+ // FIXME: Support splat of constant.
+ if (N1CFP) {
+ if (N1CFP->isExactlyValue(1.0))
+ return matcher.getNode(ISD::FADD, DL, VT, N0, N2);
+
+ if (N1CFP->isExactlyValue(-1.0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
+ SDValue RHSNeg = matcher.getNode(ISD::FNEG, DL, VT, N0);
+ AddToWorklist(RHSNeg.getNode());
+ return matcher.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
+ }
+
+ // fma (fneg x), K, y -> fma x, -K, y
+ if (matcher.match(N0, ISD::FNEG) &&
+ (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+ (N1.hasOneUse() &&
+ !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT, ForCodeSize)))) {
+ return matcher.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
+ matcher.getNode(ISD::FNEG, DL, VT, N1), N2);
+ }
+ }
+
+ // FIXME: Support splat of constant.
+ if (CanReassociate) {
+ // (fma x, c, x) -> (fmul x, (c+1))
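+ // The fused form computes x*c + x with a single rounding, whereas the fmul
+ // form rounds c+1 first, so the results may differ; that is why
+ // reassociation must be allowed for this fold (and the one below).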
+ if (N1CFP && N0 == N2) {
+ return matcher.getNode(ISD::FMUL, DL, VT, N0,
+ matcher.getNode(ISD::FADD, DL, VT, N1,
+ DAG.getConstantFP(1.0, DL, VT)));
+ }
+
+ // (fma x, c, (fneg x)) -> (fmul x, (c-1))
+ if (N1CFP && matcher.match(N2, ISD::FNEG) && N2.getOperand(0) == N0) {
+ return matcher.getNode(ISD::FMUL, DL, VT, N0,
+ matcher.getNode(ISD::FADD, DL, VT, N1,
+ DAG.getConstantFP(-1.0, DL, VT)));
+ }
+ }
+
+ // fold (fma (fneg X), Y, (fneg Z)) -> (fneg (fma X, Y, Z))
+ // fold (fma X, (fneg Y), (fneg Z)) -> (fneg (fma X, Y, Z))
+ if (!TLI.isFNegFree(VT))
+ if (SDValue Neg = TLI.getCheaperNegatedExpression(
+ SDValue(N, 0), DAG, LegalOperations, ForCodeSize))
+ return matcher.getNode(ISD::FNEG, DL, VT, Neg);
+ return SDValue();
+}
+
+// Combine multiple FDIVs with the same divisor into multiple FMULs by the
+// reciprocal.
+// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
+// Notice that this is not always beneficial. One reason is that different
+// targets may have different costs for FDIV and FMUL, so sometimes the cost
+// of two FDIVs may be lower than the cost of one FDIV and two FMULs. Another
+// reason is that the critical path is increased from "one FDIV" to
+// "one FDIV + one FMUL".
+SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
+ // TODO: Limit this transform based on optsize/minsize - it always creates at
+ // least 1 extra instruction. But the perf win may be substantial enough
+ // that only minsize should restrict this.
+ bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
+ const SDNodeFlags Flags = N->getFlags();
+ if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal()))
+ return SDValue();
+
+ // Skip if current node is a reciprocal/fneg-reciprocal.
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
+ if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
+ return SDValue();
+
+ // Exit early if the target does not want this transform or if there can't
+ // possibly be enough uses of the divisor to make the transform worthwhile.
+ unsigned MinUses = TLI.combineRepeatedFPDivisors();
+
+ // For splat vectors, scale the number of uses by the splat factor. If we can
+ // convert the division into a scalar op, that will likely be much faster.
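+ // For example, if the target's minimum is 3 (the value is target-specific),
+ // a single FDIV user of a splatted <4 x float> divisor counts as 4 scaled
+ // uses and passes the threshold.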
+ unsigned NumElts = 1;
+ EVT VT = N->getValueType(0);
+ if (VT.isVector() && DAG.isSplatValue(N1))
+ NumElts = VT.getVectorMinNumElements();
+
+ if (!MinUses || (N1->use_size() * NumElts) < MinUses)
+ return SDValue();
+
+ // Find all FDIV users of the same divisor.
+ // Use a set because duplicates may be present in the user list.
+ SetVector<SDNode *> Users;
+ for (auto *U : N1->uses()) {
+ if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
+ // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
+ if (U->getOperand(1).getOpcode() == ISD::FSQRT &&
+ U->getOperand(0) == U->getOperand(1).getOperand(0) &&
+ U->getFlags().hasAllowReassociation() &&
+ U->getFlags().hasNoSignedZeros())
+ continue;
+
+ // This division is eligible for optimization only if global unsafe math
+ // is enabled or if this division allows reciprocal formation.
+ if (UnsafeMath || U->getFlags().hasAllowReciprocal())
+ Users.insert(U);
+ }
+ }
+
+ // Now that we have the actual number of divisor uses, make sure it meets
+ // the minimum threshold specified by the target.
+ if ((Users.size() * NumElts) < MinUses)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
+ SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);
+
+ // Dividend / Divisor -> Dividend * Reciprocal
+ for (auto *U : Users) {
+ SDValue Dividend = U->getOperand(0);
+ if (Dividend != FPOne) {
+ SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
+ Reciprocal, Flags);
+ CombineTo(U, NewNode);
+ } else if (U != Reciprocal.getNode()) {
+ // In the absence of fast-math-flags, this user node is always the
+ // same node as Reciprocal, but with FMF they may be different nodes.
+ CombineTo(U, Reciprocal);
+ }
+ }
+ return SDValue(N, 0); // N was replaced.
+}
+
+SDValue DAGCombiner::visitFDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ SDNodeFlags Flags = N->getFlags();
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
+ // fold (fdiv c1, c2) -> c1/c2
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::FDIV, DL, VT, {N0, N1}))
+ return C;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ if (SDValue V = combineRepeatedFPDivisors(N))
+ return V;
+
+ if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
+ // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
+ if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) {
+ // Compute the reciprocal 1.0 / c2.
+ const APFloat &N1APF = N1CFP->getValueAPF();
+ APFloat Recip(N1APF.getSemantics(), 1); // 1.0
+ APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
+ // Only do the transform if the reciprocal is a legal fp immediate that
+ // isn't too nasty (e.g., NaN, denormal, ...).
+ if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
+ (!LegalOperations ||
+ // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
+ // backend)... we should handle this gracefully after Legalize.
+ // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
+ TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+ TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
+ return DAG.getNode(ISD::FMUL, DL, VT, N0,
+ DAG.getConstantFP(Recip, DL, VT));
+ }
+
+ // If this FDIV is part of a reciprocal square root, it may be folded
+ // into a target-specific square root estimate instruction.
+ if (N1.getOpcode() == ISD::FSQRT) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags))
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+ } else if (N1.getOpcode() == ISD::FP_EXTEND &&
+ N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+ if (SDValue RV =
+ buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) {
+ RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
+ AddToWorklist(RV.getNode());
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+ }
+ } else if (N1.getOpcode() == ISD::FP_ROUND &&
+ N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+ if (SDValue RV =
+ buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) {
+ RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
+ AddToWorklist(RV.getNode());
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
+ }
+ } else if (N1.getOpcode() == ISD::FMUL) {
+ // Look through an FMUL. Even though this won't remove the FDIV directly,
+ // it's still worthwhile to get rid of the FSQRT if possible.
+ SDValue Sqrt, Y;
+ if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+ Sqrt = N1.getOperand(0);
+ Y = N1.getOperand(1);
+ } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
+ Sqrt = N1.getOperand(1);
+ Y = N1.getOperand(0);
+ }
+ if (Sqrt.getNode()) {
+ // If the other multiply operand is known positive, pull it into the
+ // sqrt. That will eliminate the division if we convert to an estimate.
+ if (Flags.hasAllowReassociation() && N1.hasOneUse() &&
+ N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse()) {
+ SDValue A;
+ if (Y.getOpcode() == ISD::FABS && Y.hasOneUse())
+ A = Y.getOperand(0);
+ else if (Y == Sqrt.getOperand(0))
+ A = Y;
+ if (A) {
+ // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z)
+ // X / (A * sqrt(A)) --> X / sqrt(A*A*A) --> X * rsqrt(A*A*A)
+ SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, A, A);
+ SDValue AAZ =
+ DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0));
+ if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags))
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt);
+
+ // Estimate creation failed. Clean up speculatively created nodes.
+ recursivelyDeleteUnusedNodes(AAZ.getNode());
+ }
+ }
+
+ // We found a FSQRT, so try to make this fold:
+ // X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y)
+ if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) {
+ SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y);
+ AddToWorklist(Div.getNode());
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, Div);
+ }
+ }
+ }
+
+ // Fold into a reciprocal estimate and multiply instead of a real divide.
+ if (Options.NoInfsFPMath || Flags.hasNoInfs())
+ if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
+ return RV;
+ }
+
+ // Fold X/Sqrt(X) -> Sqrt(X)
+ if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) &&
+ (Options.UnsafeFPMath || Flags.hasAllowReassociation()))
+ if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0))
+ return N1;
+
+ // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
+ TargetLowering::NegatibleCost CostN0 =
+ TargetLowering::NegatibleCost::Expensive;
+ TargetLowering::NegatibleCost CostN1 =
+ TargetLowering::NegatibleCost::Expensive;
+ SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
+ if (NegN0) {
+ HandleSDNode NegN0Handle(NegN0);
+ SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
+ if (NegN1 && (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
+ CostN1 == TargetLowering::NegatibleCost::Cheaper))
+ return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFREM(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDNodeFlags Flags = N->getFlags();
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
+ // fold (frem c1, c2) -> fmod(c1,c2)
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::FREM, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFSQRT(SDNode *N) {
+ SDNodeFlags Flags = N->getFlags();
+ const TargetOptions &Options = DAG.getTarget().Options;
+
+ // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as:
+ // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN
+ if (!Flags.hasApproximateFuncs() ||
+ (!Options.NoInfsFPMath && !Flags.hasNoInfs()))
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ if (TLI.isFsqrtCheap(N0, DAG))
+ return SDValue();
+
+ // FSQRT nodes have flags that propagate to the created nodes.
+ // TODO: If this is N0/sqrt(N0), and we reach this node before trying to
+ // transform the fdiv, we may produce a sub-optimal estimate sequence
+ // because the reciprocal calculation may not have to filter out a
+ // 0.0 input.
+ return buildSqrtEstimate(N0, Flags);
+}
+
+/// copysign(x, fp_extend(y)) -> copysign(x, y)
+/// copysign(x, fp_round(y)) -> copysign(x, y)
+/// Operands to the functions are the types of X and Y, respectively.
+static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(EVT XTy, EVT YTy) {
+ // Always fold no-op FP casts.
+ if (XTy == YTy)
+ return true;
+
+ // Do not optimize out type conversion of f128 type yet.
+ // For some targets like x86_64, configuration is changed to keep one f128
+ // value in one SSE register, but instruction selection cannot handle
+ // FCOPYSIGN on SSE registers yet.
+ if (YTy == MVT::f128)
+ return false;
+
+ return !YTy.isVector() || EnableVectorFCopySignExtendRound;
+}
+
+static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
+ SDValue N1 = N->getOperand(1);
+ if (N1.getOpcode() != ISD::FP_EXTEND &&
+ N1.getOpcode() != ISD::FP_ROUND)
+ return false;
+ EVT N1VT = N1->getValueType(0);
+ EVT N1Op0VT = N1->getOperand(0).getValueType();
+ return CanCombineFCOPYSIGN_EXTEND_ROUND(N1VT, N1Op0VT);
+}
+
+SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // fold (fcopysign c1, c2) -> fcopysign(c1,c2)
+ if (SDValue C =
+ DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
+ const APFloat &V = N1C->getValueAPF();
+ // copysign(x, c1) -> fabs(x) iff ispos(c1)
+ // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
+ if (!V.isNegative()) {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
+ return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
+ } else {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+ DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
+ }
+ }
+
+ // copysign(fabs(x), y) -> copysign(x, y)
+ // copysign(fneg(x), y) -> copysign(x, y)
+ // copysign(copysign(x,z), y) -> copysign(x, y)
+ if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
+ N0.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1);
+
+ // copysign(x, abs(y)) -> abs(x)
+ if (N1.getOpcode() == ISD::FABS)
+ return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
+
+ // copysign(x, copysign(y,z)) -> copysign(x, z)
+ if (N1.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1));
+
+ // copysign(x, fp_extend(y)) -> copysign(x, y)
+ // copysign(x, fp_round(y)) -> copysign(x, y)
+ if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFPOW(SDNode *N) {
+ ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1));
+ if (!ExponentC)
+ return SDValue();
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ // Try to convert x ** (1/3) into cube root.
+ // TODO: Handle the various flavors of long double.
+ // TODO: Since we're approximating, we don't need an exact 1/3 exponent.
+ // Some range near 1/3 should be fine.
+ EVT VT = N->getValueType(0);
+ if ((VT == MVT::f32 && ExponentC->getValueAPF().isExactlyValue(1.0f/3.0f)) ||
+ (VT == MVT::f64 && ExponentC->getValueAPF().isExactlyValue(1.0/3.0))) {
+ // pow(-0.0, 1/3) = +0.0; cbrt(-0.0) = -0.0.
+ // pow(-inf, 1/3) = +inf; cbrt(-inf) = -inf.
+ // pow(-val, 1/3) = nan; cbrt(-val) = -cbrt(val) (a negative number).
+ // For regular numbers, rounding may cause the results to differ.
+ // Therefore, we require { nsz ninf nnan afn } for this transform.
+ // TODO: We could select out the special cases if we don't have nsz/ninf.
+ SDNodeFlags Flags = N->getFlags();
+ if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() || !Flags.hasNoNaNs() ||
+ !Flags.hasApproximateFuncs())
+ return SDValue();
+
+ // Do not create a cbrt() libcall if the target does not have it, and do not
+ // turn a pow that has lowering support into a cbrt() libcall.
+ if (!DAG.getLibInfo().has(LibFunc_cbrt) ||
+ (!DAG.getTargetLoweringInfo().isOperationExpand(ISD::FPOW, VT) &&
+ DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT)))
+ return SDValue();
+
+ return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0));
+ }
+
+ // Try to convert x ** (1/4) and x ** (3/4) into square roots.
+ // x ** (1/2) is canonicalized to sqrt, so we do not bother with that case.
+ // TODO: This could be extended (using a target hook) to handle smaller
+ // power-of-2 fractional exponents.
+ bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25);
+ bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75);
+ if (ExponentIs025 || ExponentIs075) {
+ // pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0.
+ // pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) = NaN.
+ // pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0.
+ // pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) = NaN.
+ // For regular numbers, rounding may cause the results to differ.
+ // Therefore, we require { nsz ninf afn } for this transform.
+ // TODO: We could select out the special cases if we don't have nsz/ninf.
+ SDNodeFlags Flags = N->getFlags();
+
+ // We only need no signed zeros for the 0.25 case.
+ if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() ||
+ !Flags.hasApproximateFuncs())
+ return SDValue();
+
+ // Don't double the number of libcalls. We are trying to inline fast code.
+ if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT))
+ return SDValue();
+
+ // Assume that libcalls are the smallest code.
+ // TODO: This restriction should probably be lifted for vectors.
+ if (ForCodeSize)
+ return SDValue();
+
+ // pow(X, 0.25) --> sqrt(sqrt(X))
+ SDLoc DL(N);
+ SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0));
+ SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt);
+ if (ExponentIs025)
+ return SqrtSqrt;
+ // pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X))
+ return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt);
+ }
+
+ return SDValue();
+}
+
+static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
+ // replacing casts with a libcall. We also must be allowed to ignore -0.0
+ // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
+ // conversions would return +0.0.
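+ // For example, (double)(int)-0.5 is +0.0, whereas ftrunc(-0.5) is -0.0,
+ // so the fold is only valid when signed zeros can be ignored.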
+ // FIXME: We should be able to use node-level FMF here.
+ // TODO: If strict math, should we use FABS (+ range check for signed cast)?
+ EVT VT = N->getValueType(0);
+ if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
+ !DAG.getTarget().Options.NoSignedZerosFPMath)
+ return SDValue();
+
+ // fptosi/fptoui round towards zero, so converting from FP to integer and
+ // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
+ SDValue N0 = N->getOperand(0);
+ if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
+ N0.getOperand(0).getValueType() == VT)
+ return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
+
+ if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
+ N0.getOperand(0).getValueType() == VT)
+ return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
+ // [us]itofp(undef) = 0, because the result value is bounded.
+ if (N0.isUndef())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+ // fold (sint_to_fp c1) -> c1fp
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ // ...but only if the target supports immediate floating-point values
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
+
+ // If the input is a legal type, and SINT_TO_FP is not legal on this target,
+ // but UINT_TO_FP is legal on this target, try to convert.
+ if (!hasOperation(ISD::SINT_TO_FP, OpVT) &&
+ hasOperation(ISD::UINT_TO_FP, OpVT)) {
+ // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
+ }
+
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
+ if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
+ !VT.isVector() &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT),
+ DAG.getConstantFP(0.0, DL, VT));
+ }
+
+ // fold (sint_to_fp (zext (setcc x, y, cc))) ->
+ // (select (setcc x, y, cc), 1.0, 0.0)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ return DAG.getSelect(DL, VT, N0.getOperand(0),
+ DAG.getConstantFP(1.0, DL, VT),
+ DAG.getConstantFP(0.0, DL, VT));
+ }
+
+ if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
+ return FTrunc;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
+ // [us]itofp(undef) = 0, because the result value is bounded.
+ if (N0.isUndef())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+ // fold (uint_to_fp c1) -> c1fp
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ // ...but only if the target supports immediate floating-point values
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
+ return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
+
+ // If the input is a legal type, and UINT_TO_FP is not legal on this target,
+ // but SINT_TO_FP is legal on this target, try to convert.
+ if (!hasOperation(ISD::UINT_TO_FP, OpVT) &&
+ hasOperation(ISD::SINT_TO_FP, OpVT)) {
+ // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
+ }
+
+ // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
+ if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT),
+ DAG.getConstantFP(0.0, DL, VT));
+ }
+
+ if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
+ return FTrunc;
+
+ return SDValue();
+}
+
+// Fold (fp_to_{s/u}int ({s/u}int_to_fp x)) -> zext x, sext x, trunc x, or x
+static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
+ return SDValue();
+
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
+ bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+ // We can safely assume the conversion won't overflow the output range,
+ // because (for example) (uint8_t)18293.f is undefined behavior.
+
+ // Since we can assume the conversion won't overflow, our decision as to
+ // whether the input will fit in the float should depend on the minimum
+ // of the input range and output range.
+
+ // This means this is also safe for a signed input and unsigned output, since
+ // a negative input would lead to undefined behavior.
+ unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
+ unsigned OutputSize = (int)VT.getScalarSizeInBits();
+ unsigned ActualSize = std::min(InputSize, OutputSize);
+ const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());
+
+ // We can only fold away the float conversion if the input range can be
+ // represented exactly in the float range.
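+ // For example, every i16 value is exactly representable in f32 (24 bits of
+ // precision), so fp_to_sint(f32 (sint_to_fp i16:x)) folds back to x, while
+ // an i32 -> f32 -> i32 round trip does not fit (31 significant bits vs. 24)
+ // and is left alone.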
+ if (APFloat::semanticsPrecision(sem) >= ActualSize) {
+ if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
+ unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
+ }
+ if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
+ return DAG.getBitcast(VT, Src);
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fp_to_sint undef) -> undef
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
+ // fold (fp_to_sint c1fp) -> c1
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
+
+ return FoldIntToFPToInt(N, DAG);
+}
+
+SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fp_to_uint undef) -> undef
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
+ // fold (fp_to_uint c1fp) -> c1
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
+
+ return FoldIntToFPToInt(N, DAG);
+}
+
+SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // fold (fp_round c1fp) -> c1fp
+ if (SDValue C =
+ DAG.FoldConstantArithmetic(ISD::FP_ROUND, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ // fold (fp_round (fp_extend x)) -> x
+ if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
+ return N0.getOperand(0);
+
+ // fold (fp_round (fp_round x)) -> (fp_round x)
+ if (N0.getOpcode() == ISD::FP_ROUND) {
+ const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
+ const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1;
+
+ // Avoid folding legal fp_rounds into non-legal ones.
+ if (!hasOperation(ISD::FP_ROUND, VT))
+ return SDValue();
+
+ // Skip this folding if it results in an fp_round from f80 to f16.
+ //
+ // f80 to f16 always generates an expensive (and as yet, unimplemented)
+ // libcall to __truncxfhf2 instead of selecting native f16 conversion
+ // instructions from f32 or f64. Moreover, the first (value-preserving)
+ // fp_round from f80 to either f32 or f64 may become a NOP in platforms like
+ // x86.
+ if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
+ return SDValue();
+
+ // If the first fp_round isn't a value preserving truncation, it might
+ // introduce a tie in the second fp_round, that wouldn't occur in the
+ // single-step fp_round we want to fold to.
+ // In other words, double rounding isn't the same as rounding.
+ // Also, this is a value preserving truncation iff both fp_round's are.
+ if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) {
+ SDLoc DL(N);
+ return DAG.getNode(
+ ISD::FP_ROUND, DL, VT, N0.getOperand(0),
+ DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL, /*isTarget=*/true));
+ }
+ }
+
+ // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
+ // Note: From a legality perspective, this is a two step transform. First,
+ // we duplicate the fp_round to the arguments of the copysign, then we
+ // eliminate the fp_round on Y. The second step requires an additional
+ // predicate to match the implementation above.
+ if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse() &&
+ CanCombineFCOPYSIGN_EXTEND_ROUND(VT,
+ N0.getValueType())) {
+ SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
+ N0.getOperand(0), N1);
+ AddToWorklist(Tmp.getNode());
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
+ Tmp, N0.getOperand(1));
+ }
+
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVCastOp(N, SDLoc(N)))
+ return FoldedVOp;
+
+ // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
+ if (N->hasOneUse() &&
+ N->use_begin()->getOpcode() == ISD::FP_ROUND)
+ return SDValue();
+
+ // fold (fp_extend c1fp) -> c1fp
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);
+
+ // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
+ if (N0.getOpcode() == ISD::FP16_TO_FP &&
+ TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
+ return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));
+
+ // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
+ // value of X.
+ if (N0.getOpcode() == ISD::FP_ROUND
+ && N0.getConstantOperandVal(1) == 1) {
+ SDValue In = N0.getOperand(0);
+ if (In.getValueType() == VT) return In;
+ if (VT.bitsLT(In.getValueType()))
+ return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
+ In, N0.getOperand(1));
+ return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
+ }
+
+ // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
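+ // The extending load yields the wide value directly; any other users of the
+ // original narrow load are rewritten to use an fp_round of the extended
+ // value (see the CombineTo calls below).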
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(
+ N0.getNode(),
+ DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
+ DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFCEIL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fceil c1) -> fceil(c1)
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ftrunc c1) -> ftrunc(c1)
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);
+
+ // fold ftrunc (known rounded int x) -> x
+ // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
+ // likely to be generated to extract integer from a rounded floating value.
+ switch (N0.getOpcode()) {
+ default: break;
+ case ISD::FRINT:
+ case ISD::FTRUNC:
+ case ISD::FNEARBYINT:
+ case ISD::FFLOOR:
+ case ISD::FCEIL:
+ return N0;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFFREXP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // fold (ffrexp c1) -> ffrexp(c1)
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FFREXP, SDLoc(N), N->getVTList(), N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ffloor c1) -> ffloor(c1)
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFNEG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ // Constant fold FNEG.
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
+
+ if (SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize))
+ return NegN0;
+
+ // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
+ // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
+ // know it was called from a context with a nsz flag if the input fsub does
+ // not.
+ if (N0.getOpcode() == ISD::FSUB &&
+ (DAG.getTarget().Options.NoSignedZerosFPMath ||
+ N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
+ return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
+ N0.getOperand(0));
+ }
+
+ if (SDValue Cast = foldSignChangeInBitcast(N))
+ return Cast;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFMinMax(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ const SDNodeFlags Flags = N->getFlags();
+ unsigned Opc = N->getOpcode();
+ bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM;
+ bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM;
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ // Constant fold.
+ if (SDValue C = DAG.FoldConstantArithmetic(Opc, SDLoc(N), VT, {N0, N1}))
+ return C;
+
+ // Canonicalize to constant on RHS.
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
+ !DAG.isConstantFPBuildVectorOrConstantFP(N1))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
+
+ if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) {
+ const APFloat &AF = N1CFP->getValueAPF();
+
+ // minnum(X, nan) -> X
+ // maxnum(X, nan) -> X
+ // minimum(X, nan) -> nan
+ // maximum(X, nan) -> nan
+ if (AF.isNaN())
+ return PropagatesNaN ? N->getOperand(1) : N->getOperand(0);
+
+ // In the following folds, inf can be replaced with the largest finite
+ // float, if the ninf flag is set.
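+ // For example, with ninf set, maxnum(X, <largest finite float>) can be
+ // treated like maxnum(X, +inf) and folded to the constant operand.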
+ if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) {
+ // minnum(X, -inf) -> -inf
+ // maxnum(X, +inf) -> +inf
+ // minimum(X, -inf) -> -inf if nnan
+ // maximum(X, +inf) -> +inf if nnan
+ if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs()))
+ return N->getOperand(1);
+
+ // minnum(X, +inf) -> X if nnan
+ // maxnum(X, -inf) -> X if nnan
+ // minimum(X, +inf) -> X
+ // maximum(X, -inf) -> X
+ if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs()))
+ return N->getOperand(0);
+ }
+ }
+
+ if (SDValue SD = reassociateReduction(
+ PropagatesNaN
+ ? (IsMin ? ISD::VECREDUCE_FMINIMUM : ISD::VECREDUCE_FMAXIMUM)
+ : (IsMin ? ISD::VECREDUCE_FMIN : ISD::VECREDUCE_FMAX),
+ Opc, SDLoc(N), VT, N0, N1, Flags))
+ return SD;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFABS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fabs c1) -> fabs(c1)
+ if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
+
+ // fold (fabs (fabs x)) -> (fabs x)
+ if (N0.getOpcode() == ISD::FABS)
+ return N->getOperand(0);
+
+ // fold (fabs (fneg x)) -> (fabs x)
+ // fold (fabs (fcopysign x, y)) -> (fabs x)
+ if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));
+
+ if (SDValue Cast = foldSignChangeInBitcast(N))
+ return Cast;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBRCOND(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+
+ // BRCOND(FREEZE(cond)) is equivalent to BRCOND(cond) (both are
+ // nondeterministic jumps).
+ if (N1->getOpcode() == ISD::FREEZE && N1.hasOneUse()) {
+ return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain,
+ N1->getOperand(0), N2);
+ }
+
+ // Variant of the previous fold where there is a SETCC in between:
+ // BRCOND(SETCC(FREEZE(X), CONST, Cond))
+ // =>
+ // BRCOND(FREEZE(SETCC(X, CONST, Cond)))
+ // =>
+ // BRCOND(SETCC(X, CONST, Cond))
+ // This is correct if FREEZE(X) has one use and SETCC(FREEZE(X), CONST, Cond)
+ // isn't equivalent to true or false.
+ // For example, SETCC(FREEZE(X), -128, SETULT) cannot be folded to
+ // FREEZE(SETCC(X, -128, SETULT)) because X can be poison.
+ if (N1->getOpcode() == ISD::SETCC && N1.hasOneUse()) {
+ SDValue S0 = N1->getOperand(0), S1 = N1->getOperand(1);
+ ISD::CondCode Cond = cast<CondCodeSDNode>(N1->getOperand(2))->get();
+ ConstantSDNode *S0C = dyn_cast<ConstantSDNode>(S0);
+ ConstantSDNode *S1C = dyn_cast<ConstantSDNode>(S1);
+ bool Updated = false;
+
+ // Is 'X Cond C' always true or false?
+ auto IsAlwaysTrueOrFalse = [](ISD::CondCode Cond, ConstantSDNode *C) {
+ bool False = (Cond == ISD::SETULT && C->isZero()) ||
+ (Cond == ISD::SETLT && C->isMinSignedValue()) ||
+ (Cond == ISD::SETUGT && C->isAllOnes()) ||
+ (Cond == ISD::SETGT && C->isMaxSignedValue());
+ bool True = (Cond == ISD::SETULE && C->isAllOnes()) ||
+ (Cond == ISD::SETLE && C->isMaxSignedValue()) ||
+ (Cond == ISD::SETUGE && C->isZero()) ||
+ (Cond == ISD::SETGE && C->isMinSignedValue());
+ return True || False;
+ };
+
+ if (S0->getOpcode() == ISD::FREEZE && S0.hasOneUse() && S1C) {
+ if (!IsAlwaysTrueOrFalse(Cond, S1C)) {
+ S0 = S0->getOperand(0);
+ Updated = true;
+ }
+ }
+ if (S1->getOpcode() == ISD::FREEZE && S1.hasOneUse() && S0C) {
+ if (!IsAlwaysTrueOrFalse(ISD::getSetCCSwappedOperands(Cond), S0C)) {
+ S1 = S1->getOperand(0);
+ Updated = true;
+ }
+ }
+
+ if (Updated)
+ return DAG.getNode(
+ ISD::BRCOND, SDLoc(N), MVT::Other, Chain,
+ DAG.getSetCC(SDLoc(N1), N1->getValueType(0), S0, S1, Cond), N2);
+ }
+
+ // If N is a constant we could fold this into a fallthrough or unconditional
+ // branch. However that doesn't happen very often in normal code, because
+ // Instcombine/SimplifyCFG should have handled the available opportunities.
+ // If we did this folding here, it would be necessary to update the
+ // MachineBasicBlock CFG, which is awkward.
+
+ // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
+ // on the target.
+ if (N1.getOpcode() == ISD::SETCC &&
+ TLI.isOperationLegalOrCustom(ISD::BR_CC,
+ N1.getOperand(0).getValueType())) {
+ return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
+ Chain, N1.getOperand(2),
+ N1.getOperand(0), N1.getOperand(1), N2);
+ }
+
+ if (N1.hasOneUse()) {
+ // rebuildSetCC calls visitXor which may change the Chain when there is a
+ // STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
+ HandleSDNode ChainHandle(Chain);
+ if (SDValue NewN1 = rebuildSetCC(N1))
+ return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
+ ChainHandle.getValue(), NewN1, N2);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::rebuildSetCC(SDValue N) {
+ if (N.getOpcode() == ISD::SRL ||
+ (N.getOpcode() == ISD::TRUNCATE &&
+ (N.getOperand(0).hasOneUse() &&
+ N.getOperand(0).getOpcode() == ISD::SRL))) {
+ // Look past the truncate.
+ if (N.getOpcode() == ISD::TRUNCATE)
+ N = N.getOperand(0);
+
+ // Match this pattern so that we can generate simpler code:
+ //
+ // %a = ...
+ // %b = and i32 %a, 2
+ // %c = srl i32 %b, 1
+ // brcond i32 %c ...
+ //
+ // into
+ //
+ // %a = ...
+ // %b = and i32 %a, 2
+ // %c = setcc eq %b, 0
+ // brcond %c ...
+ //
+ // This applies only when the AND constant value has one bit set and the
+ // SRL constant is equal to the log2 of the AND constant. The back-end is
+ // smart enough to convert the result into a TEST/JMP sequence.
+ SDValue Op0 = N.getOperand(0);
+ SDValue Op1 = N.getOperand(1);
+
+ if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
+ SDValue AndOp1 = Op0.getOperand(1);
+
+ if (AndOp1.getOpcode() == ISD::Constant) {
+ const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
+
+ if (AndConst.isPowerOf2() &&
+ cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
+ SDLoc DL(N);
+ return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
+ Op0, DAG.getConstant(0, DL, Op0.getValueType()),
+ ISD::SETNE);
+ }
+ }
+ }
+ }
+
+ // Transform (brcond (xor x, y)) -> (brcond (setcc, x, y, ne))
+ // Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc, x, y, eq))
+ if (N.getOpcode() == ISD::XOR) {
+ // Because we may call this on a speculatively constructed
+ // SimplifiedSetCC Node, we need to simplify this node first.
+ // Ideally this should be folded into SimplifySetCC and not
+ // here. For now, grab a handle to N so we don't lose it from
+ // replacements internal to the visit.
+ HandleSDNode XORHandle(N);
+ while (N.getOpcode() == ISD::XOR) {
+ SDValue Tmp = visitXOR(N.getNode());
+ // No simplification done.
+ if (!Tmp.getNode())
+ break;
+ // Returning N is a form of in-visit replacement that may invalidate
+ // N. Grab the value from the handle.
+ if (Tmp.getNode() == N.getNode())
+ N = XORHandle.getValue();
+ else // Node simplified. Try simplifying again.
+ N = Tmp;
+ }
+
+ if (N.getOpcode() != ISD::XOR)
+ return N;
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
+ bool Equal = false;
+ // (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
+ if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR &&
+ Op0.getValueType() == MVT::i1) {
+ N = Op0;
+ Op0 = N->getOperand(0);
+ Op1 = N->getOperand(1);
+ Equal = true;
+ }
+
+ EVT SetCCVT = N.getValueType();
+ if (LegalTypes)
+ SetCCVT = getSetCCResultType(SetCCVT);
+ // Replace the uses of XOR with SETCC
+ return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1,
+ Equal ? ISD::SETEQ : ISD::SETNE);
+ }
+ }
+
+ return SDValue();
+}
+
+// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
+//
+SDValue DAGCombiner::visitBR_CC(SDNode *N) {
+ CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
+ SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
+
+ // If N is a constant we could fold this into a fallthrough or unconditional
+ // branch. However that doesn't happen very often in normal code, because
+ // Instcombine/SimplifyCFG should have handled the available opportunities.
+ // If we did this folding here, it would be necessary to update the
+ // MachineBasicBlock CFG, which is awkward.
+
+ // Use SimplifySetCC to simplify SETCC's.
+ SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
+ CondLHS, CondRHS, CC->get(), SDLoc(N),
+ false);
+ if (Simp.getNode()) AddToWorklist(Simp.getNode());
+
+ // fold to a simpler setcc
+ if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
+ return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
+ N->getOperand(0), Simp.getOperand(2),
+ Simp.getOperand(0), Simp.getOperand(1),
+ N->getOperand(4));
+
+ return SDValue();
+}
+
+static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
+ bool &IsLoad, bool &IsMasked, SDValue &Ptr,
+ const TargetLowering &TLI) {
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ if (LD->isIndexed())
+ return false;
+ EVT VT = LD->getMemoryVT();
+ if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT))
+ return false;
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ if (ST->isIndexed())
+ return false;
+ EVT VT = ST->getMemoryVT();
+ if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT))
+ return false;
+ Ptr = ST->getBasePtr();
+ IsLoad = false;
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ if (LD->isIndexed())
+ return false;
+ EVT VT = LD->getMemoryVT();
+ if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) &&
+ !TLI.isIndexedMaskedLoadLegal(Dec, VT))
+ return false;
+ Ptr = LD->getBasePtr();
+ IsMasked = true;
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
+ if (ST->isIndexed())
+ return false;
+ EVT VT = ST->getMemoryVT();
+ if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) &&
+ !TLI.isIndexedMaskedStoreLegal(Dec, VT))
+ return false;
+ Ptr = ST->getBasePtr();
+ IsLoad = false;
+ IsMasked = true;
+ } else {
+ return false;
+ }
+ return true;
+}
+
+/// Try turning a load/store into a pre-indexed load/store when the base
+/// pointer is an add or subtract and it has other uses besides the load/store.
+/// After the transformation, the new indexed load/store has effectively folded
+/// the add/subtract in and all of its other uses are redirected to the
+/// new load/store.
+bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
+
+ bool IsLoad = true;
+ bool IsMasked = false;
+ SDValue Ptr;
+ if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked,
+ Ptr, TLI))
+ return false;
+
+ // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
+ // out. There is no reason to make this a preinc/predec.
+ if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
+ Ptr->hasOneUse())
+ return false;
+
+ // Ask the target to do addressing mode selection.
+ SDValue BasePtr;
+ SDValue Offset;
+ ISD::MemIndexedMode AM = ISD::UNINDEXED;
+ if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
+ return false;
+
+ // Backends without true r+i pre-indexed forms may need to pass a
+ // constant base with a variable offset so that constant coercion
+ // will work with the patterns in canonical form.
+ bool Swapped = false;
+ if (isa<ConstantSDNode>(BasePtr)) {
+ std::swap(BasePtr, Offset);
+ Swapped = true;
+ }
+
+ // Don't create an indexed load / store with zero offset.
+ if (isNullConstant(Offset))
+ return false;
+
+ // Try turning it into a pre-indexed load / store except when:
+ // 1) The new base ptr is a frame index.
+ // 2) If N is a store and the new base ptr is either the same as or is a
+ // predecessor of the value being stored.
+ // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
+ // that would create a cycle.
+ // 4) All uses are load / store ops that use it as old base ptr.
+
+ // Check #1. Preinc'ing a frame index would require copying the stack pointer
+ // (plus the implicit offset) to a register to preinc anyway.
+ if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+ return false;
+
+ // Check #2.
+ if (!IsLoad) {
+ SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue()
+ : cast<StoreSDNode>(N)->getValue();
+
+ // Would require a copy.
+ if (Val == BasePtr)
+ return false;
+
+ // Would create a cycle.
+ if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode()))
+ return false;
+ }
+
+ // Caches for hasPredecessorHelper.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(N);
+
+ // If the offset is a constant, there may be other adds of constants that
+ // can be folded with this one. We should do this to avoid having to keep
+ // a copy of the original base pointer.
+ SmallVector<SDNode *, 16> OtherUses;
+ if (isa<ConstantSDNode>(Offset))
+ for (SDNode::use_iterator UI = BasePtr->use_begin(),
+ UE = BasePtr->use_end();
+ UI != UE; ++UI) {
+ SDUse &Use = UI.getUse();
+ // Skip the use that is Ptr and uses of other results from BasePtr's
+ // node (important for nodes that return multiple results).
+ if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
+ continue;
+
+ if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
+ continue;
+
+ if (Use.getUser()->getOpcode() != ISD::ADD &&
+ Use.getUser()->getOpcode() != ISD::SUB) {
+ OtherUses.clear();
+ break;
+ }
+
+ SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
+ if (!isa<ConstantSDNode>(Op1)) {
+ OtherUses.clear();
+ break;
+ }
+
+ // FIXME: In some cases, we can be smarter about this.
+ if (Op1.getValueType() != Offset.getValueType()) {
+ OtherUses.clear();
+ break;
+ }
+
+ OtherUses.push_back(Use.getUser());
+ }
+
+ if (Swapped)
+ std::swap(BasePtr, Offset);
+
+ // Now check for #3 and #4.
+ bool RealUse = false;
+
+ for (SDNode *Use : Ptr->uses()) {
+ if (Use == N)
+ continue;
+ if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
+ return false;
+
+ // If Ptr may be folded into the addressing mode of another use, then it's
+ // not profitable to do this transformation.
+ if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
+ RealUse = true;
+ }
+
+ if (!RealUse)
+ return false;
+
+ SDValue Result;
+ if (!IsMasked) {
+ if (IsLoad)
+ Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
+ else
+ Result =
+ DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
+ } else {
+ if (IsLoad)
+ Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
+ Offset, AM);
+ else
+ Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
+ Offset, AM);
+ }
+ ++PreIndexedNodes;
+ ++NodesCombined;
+ LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
+ Result.dump(&DAG); dbgs() << '\n');
+ WorklistRemover DeadNodes(*this);
+ if (IsLoad) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
+ } else {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
+ }
+
+ // Finally, since the node is now dead, remove it from the graph.
+ deleteAndRecombine(N);
+
+ if (Swapped)
+ std::swap(BasePtr, Offset);
+
+ // Replace other uses of BasePtr that can be updated to use Ptr
+ for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
+ unsigned OffsetIdx = 1;
+ if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
+ OffsetIdx = 0;
+ assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
+ BasePtr.getNode() && "Expected BasePtr operand");
+
+ // We need to replace ptr0 in the following expression:
+ // x0 * offset0 + y0 * ptr0 = t0
+ // knowing that
+ // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
+ //
+ // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
+ // indexed load/store and the expression that needs to be re-written.
+ //
+ // Therefore, we have:
+ // t0 = (x0 * offset0 - x1 * y0 * y1 * offset1) + (y0 * y1) * t1
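+ // For example, with a PRE_INC form t1 = ptr0 + 4 (x1 = y1 = 1, offset1 = 4)
+ // and another user t0 = ptr0 + 8 (x0 = y0 = 1, offset0 = 8), this gives
+ // t0 = (8 - 4) + t1 = t1 + 4, which no longer references ptr0.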
+
+ auto *CN = cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
+ const APInt &Offset0 = CN->getAPIntValue();
+ const APInt &Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();
+ int X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
+ int Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
+ int X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
+ int Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;
+
+ unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;
+
+ APInt CNV = Offset0;
+ if (X0 < 0) CNV = -CNV;
+ if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
+ else CNV = CNV - Offset1;
+
+ SDLoc DL(OtherUses[i]);
+
+ // We can now generate the new expression.
+ SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
+ SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);
+
+ SDValue NewUse = DAG.getNode(Opcode,
+ DL,
+ OtherUses[i]->getValueType(0), NewOp1, NewOp2);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
+ deleteAndRecombine(OtherUses[i]);
+ }
+
+ // Replace the uses of Ptr with uses of the updated base value.
+ DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
+ deleteAndRecombine(Ptr.getNode());
+ AddToWorklist(Result.getNode());
+
+ return true;
+}
+
+static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
+ SDValue &BasePtr, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ if (PtrUse == N ||
+ (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB))
+ return false;
+
+ if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG))
+ return false;
+
+ // Don't create an indexed load / store with zero offset.
+ if (isNullConstant(Offset))
+ return false;
+
+ if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+ return false;
+
+ SmallPtrSet<const SDNode *, 32> Visited;
+ for (SDNode *Use : BasePtr->uses()) {
+ if (Use == Ptr.getNode())
+ continue;
+
+ // Don't combine if there's a later user which could perform the indexed
+ // access instead.
+ if (isa<MemSDNode>(Use)) {
+ bool IsLoad = true;
+ bool IsMasked = false;
+ SDValue OtherPtr;
+ if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
+ IsMasked, OtherPtr, TLI)) {
+ SmallVector<const SDNode *, 2> Worklist;
+ Worklist.push_back(Use);
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
+ return false;
+ }
+ }
+
+ // If all the uses are load / store addresses, then don't do the
+ // transformation.
+ if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
+ for (SDNode *UseUse : Use->uses())
+ if (canFoldInAddressingMode(Use, UseUse, DAG, TLI))
+ return false;
+ }
+ }
+ return true;
+}
+
+static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
+ bool &IsMasked, SDValue &Ptr,
+ SDValue &BasePtr, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad,
+ IsMasked, Ptr, TLI) ||
+ Ptr->hasOneUse())
+ return nullptr;
+
+ // Try turning it into a post-indexed load / store except when
+ // 1) All uses are load / store ops that use it as base ptr (and
+ // it may be folded into the addressing mode).
+ // 2) Op must be independent of N, i.e. Op is neither a predecessor
+ // nor a successor of N. Otherwise, if Op is folded that would
+ // create a cycle.
+ for (SDNode *Op : Ptr->uses()) {
+ // Check for #1.
+ if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
+ continue;
+
+ // Check for #2.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 8> Worklist;
+ // Ptr is predecessor to both N and Op.
+ Visited.insert(Ptr.getNode());
+ Worklist.push_back(N);
+ Worklist.push_back(Op);
+ if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
+ !SDNode::hasPredecessorHelper(Op, Visited, Worklist))
+ return Op;
+ }
+ return nullptr;
+}
+
+/// Try to combine a load/store with an add/sub of the base pointer node into a
+/// post-indexed load/store. The transformation effectively folds the
+/// add/subtract into the new indexed load/store, and all of the add/subtract's
+/// uses are redirected to the new load/store.
+bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
+
+ bool IsLoad = true;
+ bool IsMasked = false;
+ SDValue Ptr;
+ SDValue BasePtr;
+ SDValue Offset;
+ ISD::MemIndexedMode AM = ISD::UNINDEXED;
+ SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr,
+ Offset, AM, DAG, TLI);
+ if (!Op)
+ return false;
+
+ SDValue Result;
+ if (!IsMasked)
+ Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
+ Offset, AM)
+ : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
+ BasePtr, Offset, AM);
+ else
+ Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
+ BasePtr, Offset, AM)
+ : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
+ BasePtr, Offset, AM);
+ ++PostIndexedNodes;
+ ++NodesCombined;
+ LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: ";
+ Result.dump(&DAG); dbgs() << '\n');
+ WorklistRemover DeadNodes(*this);
+ if (IsLoad) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
+ } else {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
+ }
+
+ // Finally, since the node is now dead, remove it from the graph.
+ deleteAndRecombine(N);
+
+ // Replace the uses of Use with uses of the updated base value.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
+ Result.getValue(IsLoad ? 1 : 0));
+ deleteAndRecombine(Op);
+ return true;
+}
+
+/// Return the base-pointer arithmetic from an indexed \p LD.
+SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ assert(AM != ISD::UNINDEXED);
+ SDValue BP = LD->getOperand(1);
+ SDValue Inc = LD->getOperand(2);
+
+ // Some backends use TargetConstants for load offsets, but don't expect
+ // TargetConstants in general ADD nodes. We can convert these constants into
+ // regular Constants (if the constant is not opaque).
+ assert((Inc.getOpcode() != ISD::TargetConstant ||
+ !cast<ConstantSDNode>(Inc)->isOpaque()) &&
+ "Cannot split out indexing using opaque target constants");
+ if (Inc.getOpcode() == ISD::TargetConstant) {
+ ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
+ Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
+ ConstInc->getValueType(0));
+ }
+
+ unsigned Opc =
+ (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
+ return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
+}
+
+static inline ElementCount numVectorEltsOrZero(EVT T) {
+ return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
+}
+
+bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
+ EVT STType = Val.getValueType();
+ EVT STMemType = ST->getMemoryVT();
+ if (STType == STMemType)
+ return true;
+ if (isTypeLegal(STMemType))
+ return false; // fail.
+ if (STType.isFloatingPoint() && STMemType.isFloatingPoint() &&
+ TLI.isOperationLegal(ISD::FTRUNC, STMemType)) {
+ Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val);
+ return true;
+ }
+ if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) &&
+ STType.isInteger() && STMemType.isInteger()) {
+ Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val);
+ return true;
+ }
+ if (STType.getSizeInBits() == STMemType.getSizeInBits()) {
+ Val = DAG.getBitcast(STMemType, Val);
+ return true;
+ }
+ return false; // fail.
+}
+
+bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
+ EVT LDMemType = LD->getMemoryVT();
+ EVT LDType = LD->getValueType(0);
+ assert(Val.getValueType() == LDMemType &&
+ "Attempting to extend value of non-matching type");
+ if (LDType == LDMemType)
+ return true;
+ if (LDMemType.isInteger() && LDType.isInteger()) {
+ switch (LD->getExtensionType()) {
+ case ISD::NON_EXTLOAD:
+ Val = DAG.getBitcast(LDType, Val);
+ return true;
+ case ISD::EXTLOAD:
+ Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
+ return true;
+ case ISD::SEXTLOAD:
+ Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
+ return true;
+ case ISD::ZEXTLOAD:
+ Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
+ return true;
+ }
+ }
+ return false;
+}
+
+StoreSDNode *DAGCombiner::getUniqueStoreFeeding(LoadSDNode *LD,
+ int64_t &Offset) {
+ SDValue Chain = LD->getOperand(0);
+
+ // Look through CALLSEQ_START.
+ if (Chain.getOpcode() == ISD::CALLSEQ_START)
+ Chain = Chain->getOperand(0);
+
+ StoreSDNode *ST = nullptr;
+ SmallVector<SDValue, 8> Aliases;
+ if (Chain.getOpcode() == ISD::TokenFactor) {
+ // Look for unique store within the TokenFactor.
+ for (SDValue Op : Chain->ops()) {
+ StoreSDNode *Store = dyn_cast<StoreSDNode>(Op.getNode());
+ if (!Store)
+ continue;
+ BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
+ BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG);
+ if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
+ continue;
+ // Make sure the store is not aliased with any nodes in TokenFactor.
+ GatherAllAliases(Store, Chain, Aliases);
+ if (Aliases.empty() ||
+ (Aliases.size() == 1 && Aliases.front().getNode() == Store))
+ ST = Store;
+ break;
+ }
+ } else {
+ StoreSDNode *Store = dyn_cast<StoreSDNode>(Chain.getNode());
+ if (Store) {
+ BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
+ BaseIndexOffset BasePtrST = BaseIndexOffset::match(Store, DAG);
+ if (BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
+ ST = Store;
+ }
+ }
+
+ return ST;
+}
+
+SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
+ if (OptLevel == CodeGenOpt::None || !LD->isSimple())
+ return SDValue();
+ SDValue Chain = LD->getOperand(0);
+ int64_t Offset;
+
+ StoreSDNode *ST = getUniqueStoreFeeding(LD, Offset);
+ // TODO: Relax this restriction for unordered atomics (see D66309)
+ if (!ST || !ST->isSimple() || ST->getAddressSpace() != LD->getAddressSpace())
+ return SDValue();
+
+ EVT LDType = LD->getValueType(0);
+ EVT LDMemType = LD->getMemoryVT();
+ EVT STMemType = ST->getMemoryVT();
+ EVT STType = ST->getValue().getValueType();
+
+ // There are two cases to consider here:
+ // 1. The store is fixed width and the load is scalable. In this case we
+ // don't know at compile time if the store completely envelops the load
+ // so we abandon the optimisation.
+ // 2. The store is scalable and the load is fixed width. We could
+ // potentially support a limited number of cases here, but there has been
+ // no cost-benefit analysis to prove it's worth it.
+ bool LdStScalable = LDMemType.isScalableVT();
+ if (LdStScalable != STMemType.isScalableVT())
+ return SDValue();
+
+ // If we are dealing with scalable vectors on a big endian platform the
+ // calculation of offsets below becomes trickier, since we do not know at
+ // compile time the absolute size of the vector. Until we've done more
+ // analysis on big-endian platforms it seems better to bail out for now.
+ if (LdStScalable && DAG.getDataLayout().isBigEndian())
+ return SDValue();
+
+ // Normalize for endianness. After this, Offset=0 denotes that the least
+ // significant bit in the loaded value maps to the least significant bit in
+ // the stored value. With Offset=n (for n > 0) the loaded value starts at the
+ // n-th least significant byte of the stored value.
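+ // Worked example (editorial): for an i32 store feeding an i8 load with
+ // OrigOffset = 0 on a big-endian target, the normalized Offset becomes
+ // (32 - 8) / 8 - 0 = 3, i.e. the load reads the most significant byte of
+ // the stored value.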
+ int64_t OrigOffset = Offset;
+ if (DAG.getDataLayout().isBigEndian())
+ Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedValue() -
+ (int64_t)LDMemType.getStoreSizeInBits().getFixedValue()) /
+ 8 -
+ Offset;
+
+ // Check that the stored value covers all bits that are loaded.
+ bool STCoversLD;
+
+ TypeSize LdMemSize = LDMemType.getSizeInBits();
+ TypeSize StMemSize = STMemType.getSizeInBits();
+ if (LdStScalable)
+ STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
+ else
+ STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedValue() <=
+ StMemSize.getFixedValue());
+
+ auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
+ if (LD->isIndexed()) {
+ // Cannot handle opaque target constants and we must respect the user's
+ // request not to split indexes from loads.
+ if (!canSplitIdx(LD))
+ return SDValue();
+ SDValue Idx = SplitIndexingFromLoad(LD);
+ SDValue Ops[] = {Val, Idx, Chain};
+ return CombineTo(LD, Ops, 3);
+ }
+ return CombineTo(LD, Val, Chain);
+ };
+
+ if (!STCoversLD)
+ return SDValue();
+
+ // Memory as copy space (potentially masked).
+ if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
+ // Simple case: Direct non-truncating forwarding
+ if (LDType.getSizeInBits() == LdMemSize)
+ return ReplaceLd(LD, ST->getValue(), Chain);
+ // Can we model the truncate and extension with an and mask?
+ if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
+ !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
+ // Mask to size of LDMemType
+ auto Mask =
+ DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
+ StMemSize.getFixedValue()),
+ SDLoc(ST), STType);
+ auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
+ return ReplaceLd(LD, Val, Chain);
+ }
+ }
+
+ // Handle some cases for big-endian that would be Offset 0 and handled for
+ // little-endian.
+ SDValue Val = ST->getValue();
+ if (DAG.getDataLayout().isBigEndian() && Offset > 0 && OrigOffset == 0) {
+ if (STType.isInteger() && !STType.isVector() && LDType.isInteger() &&
+ !LDType.isVector() && isTypeLegal(STType) &&
+ TLI.isOperationLegal(ISD::SRL, STType)) {
+ Val = DAG.getNode(ISD::SRL, SDLoc(LD), STType, Val,
+ DAG.getConstant(Offset * 8, SDLoc(LD), STType));
+ Offset = 0;
+ }
+ }
+
+ // TODO: Deal with nonzero offset.
+ if (LD->getBasePtr().isUndef() || Offset != 0)
+ return SDValue();
+ // Model necessary truncations / extensions.
+ // Truncate Value To Stored Memory Size.
+ do {
+ if (!getTruncatedStoreValue(ST, Val))
+ continue;
+ if (!isTypeLegal(LDMemType))
+ continue;
+ if (STMemType != LDMemType) {
+ // TODO: Support vectors? This requires extract_subvector/bitcast.
+ if (!STMemType.isVector() && !LDMemType.isVector() &&
+ STMemType.isInteger() && LDMemType.isInteger())
+ Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
+ else
+ continue;
+ }
+ if (!extendLoadedValueToExtension(LD, Val))
+ continue;
+ return ReplaceLd(LD, Val, Chain);
+ } while (false);
+
+ // On failure, cleanup dead nodes we may have created.
+ if (Val->use_empty())
+ deleteAndRecombine(Val.getNode());
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitLOAD(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+
+ // If load is not volatile and there are no uses of the loaded value (and
+ // the updated indexed value in case of indexed loads), change uses of the
+ // chain value into uses of the chain input (i.e. delete the dead load).
+ // TODO: Allow this for unordered atomics (see D66309)
+ if (LD->isSimple()) {
+ if (N->getValueType(1) == MVT::Other) {
+ // Unindexed loads.
+ if (!N->hasAnyUseOfValue(0)) {
+ // It's not safe to use the two value CombineTo variant here. e.g.
+ // v1, chain2 = load chain1, loc
+ // v2, chain3 = load chain2, loc
+ // v3 = add v2, c
+ // Now we replace use of chain2 with chain1. This makes the second load
+ // isomorphic to the one we are deleting, and thus makes this load live.
+ LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
+ dbgs() << "\nWith chain: "; Chain.dump(&DAG);
+ dbgs() << "\n");
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+ AddUsersToWorklist(Chain.getNode());
+ if (N->use_empty())
+ deleteAndRecombine(N);
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ } else {
+ // Indexed loads.
+ assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
+
+ // If this load has an opaque TargetConstant offset, then we cannot split
+ // the indexing into an add/sub directly (that TargetConstant may not be
+ // valid for a different type of node, and we cannot convert an opaque
+ // target constant into a regular constant).
+ bool CanSplitIdx = canSplitIdx(LD);
+
+ if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) {
+ SDValue Undef = DAG.getUNDEF(N->getValueType(0));
+ SDValue Index;
+ if (N->hasAnyUseOfValue(1) && CanSplitIdx) {
+ Index = SplitIndexingFromLoad(LD);
+ // Try to fold the base pointer arithmetic into subsequent loads and
+ // stores.
+ AddUsersToWorklist(N);
+ } else
+ Index = DAG.getUNDEF(N->getValueType(1));
+ LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
+ dbgs() << "\nWith: "; Undef.dump(&DAG);
+ dbgs() << " and 2 other values\n");
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
+ deleteAndRecombine(N);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // If this load is directly stored, replace the load value with the stored
+ // value.
+ if (auto V = ForwardStoreValueToDirectLoad(LD))
+ return V;
+
+ // Try to infer better alignment information than the load already has.
+ if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) {
+ if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
+ if (*Alignment > LD->getAlign() &&
+ isAligned(*Alignment, LD->getSrcValueOffset())) {
+ SDValue NewLoad = DAG.getExtLoad(
+ LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), LD->getMemoryVT(), *Alignment,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ // NewLoad will always be N as we are only refining the alignment
+ assert(NewLoad.getNode() == N);
+ (void)NewLoad;
+ }
+ }
+ }
+
+ if (LD->isUnindexed()) {
+ // Walk up chain skipping non-aliasing memory nodes.
+ SDValue BetterChain = FindBetterChain(LD, Chain);
+
+ // If there is a better chain.
+ if (Chain != BetterChain) {
+ SDValue ReplLoad;
+
+ // Replace the chain to void dependency.
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
+ ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
+ BetterChain, Ptr, LD->getMemOperand());
+ } else {
+ ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
+ LD->getValueType(0),
+ BetterChain, Ptr, LD->getMemoryVT(),
+ LD->getMemOperand());
+ }
+
+ // Create token factor to keep old chain connected.
+ SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
+ MVT::Other, Chain, ReplLoad.getValue(1));
+
+ // Replace uses with load result and token factor
+ return CombineTo(N, ReplLoad.getValue(0), Token);
+ }
+ }
+
+ // Try transforming N to an indexed load.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
+ // Try to slice up N to more direct loads if the slices are mapped to
+ // different register banks or pairing can take place.
+ if (SliceUpLoad(N))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+namespace {
+
+/// Helper structure used to slice a load in smaller loads.
+/// Basically a slice is obtained from the following sequence:
+/// Origin = load Ty1, Base
+/// Shift = srl Ty1 Origin, CstTy Amount
+/// Inst = trunc Shift to Ty2
+///
+/// Then, it will be rewritten into:
+/// Slice = load SliceTy, Base + SliceOffset
+/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
+///
+/// SliceTy is deduced from the number of bits that are actually used to
+/// build Inst.
+struct LoadedSlice {
+ /// Helper structure used to compute the cost of a slice.
+ struct Cost {
+ /// Are we optimizing for code size.
+ bool ForCodeSize = false;
+
+ /// Various costs.
+ unsigned Loads = 0;
+ unsigned Truncates = 0;
+ unsigned CrossRegisterBanksCopies = 0;
+ unsigned ZExts = 0;
+ unsigned Shift = 0;
+
+ explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {}
+
+ /// Get the cost of one isolated slice.
+ Cost(const LoadedSlice &LS, bool ForCodeSize)
+ : ForCodeSize(ForCodeSize), Loads(1) {
+ EVT TruncType = LS.Inst->getValueType(0);
+ EVT LoadedType = LS.getLoadedType();
+ if (TruncType != LoadedType &&
+ !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
+ ZExts = 1;
+ }
+
+ /// Account for slicing gain in the current cost.
+ /// Slicing provides a few gains, like removing a shift or a
+ /// truncate. This method allows growing the cost of the original
+ /// load by the gain from this slice.
+ void addSliceGain(const LoadedSlice &LS) {
+ // Each slice saves a truncate.
+ const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
+ if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
+ LS.Inst->getValueType(0)))
+ ++Truncates;
+ // If there is a shift amount, this slice gets rid of it.
+ if (LS.Shift)
+ ++Shift;
+ // If this slice can merge a cross register bank copy, account for it.
+ if (LS.canMergeExpensiveCrossRegisterBankCopy())
+ ++CrossRegisterBanksCopies;
+ }
+
+ Cost &operator+=(const Cost &RHS) {
+ Loads += RHS.Loads;
+ Truncates += RHS.Truncates;
+ CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
+ ZExts += RHS.ZExts;
+ Shift += RHS.Shift;
+ return *this;
+ }
+
+ bool operator==(const Cost &RHS) const {
+ return Loads == RHS.Loads && Truncates == RHS.Truncates &&
+ CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
+ ZExts == RHS.ZExts && Shift == RHS.Shift;
+ }
+
+ bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
+
+ bool operator<(const Cost &RHS) const {
+ // Assume cross register banks copies are as expensive as loads.
+ // FIXME: Do we want some more target hooks?
+ unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
+ unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
+ // Unless we are optimizing for code size, consider the
+ // expensive operation first.
+ if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
+ return ExpensiveOpsLHS < ExpensiveOpsRHS;
+ return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
+ (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
+ }
+
+ bool operator>(const Cost &RHS) const { return RHS < *this; }
+
+ bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
+
+ bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
+ };
+
+ // The last instruction that represents the slice. This should be a
+ // truncate instruction.
+ SDNode *Inst;
+
+ // The original load instruction.
+ LoadSDNode *Origin;
+
+ // The right shift amount in bits from the original load.
+ unsigned Shift;
+
+ // The DAG from which Origin came.
+ // This is used to get some contextual information about legal types, etc.
+ SelectionDAG *DAG;
+
+ LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr,
+ unsigned Shift = 0, SelectionDAG *DAG = nullptr)
+ : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
+
+ /// Get the bits used in a chunk of bits \p BitWidth large.
+ /// \return Result is \p BitWidth bits wide, with used bits set to 1 and
+ /// unused bits set to 0.
+ APInt getUsedBits() const {
+ // Reproduce the trunc(lshr) sequence:
+ // - Start from the truncated value.
+ // - Zero extend to the desired bit width.
+ // - Shift left.
+ assert(Origin && "No original load to compare against.");
+ unsigned BitWidth = Origin->getValueSizeInBits(0);
+ assert(Inst && "This slice is not bound to an instruction");
+ assert(Inst->getValueSizeInBits(0) <= BitWidth &&
+ "Extracted slice is bigger than the whole type!");
+ APInt UsedBits(Inst->getValueSizeInBits(0), 0);
+ UsedBits.setAllBits();
+ UsedBits = UsedBits.zext(BitWidth);
+ UsedBits <<= Shift;
+ return UsedBits;
+ }
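+
+ // Editorial example: for a 32-bit Origin, a 16-bit Inst and Shift = 16,
+ // getUsedBits() returns 0xFFFF0000, i.e. the upper half of the loaded
+ // value.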
+
+ /// Get the size of the slice to be loaded in bytes.
+ unsigned getLoadedSize() const {
+ unsigned SliceSize = getUsedBits().popcount();
+ assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
+ return SliceSize / 8;
+ }
+
+ /// Get the type that will be loaded for this slice.
+ /// Note: This may not be the final type for the slice.
+ EVT getLoadedType() const {
+ assert(DAG && "Missing context");
+ LLVMContext &Ctxt = *DAG->getContext();
+ return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
+ }
+
+ /// Get the alignment of the load used for this slice.
+ Align getAlign() const {
+ Align Alignment = Origin->getAlign();
+ uint64_t Offset = getOffsetFromBase();
+ if (Offset != 0)
+ Alignment = commonAlignment(Alignment, Alignment.value() + Offset);
+ return Alignment;
+ }
+
+ /// Check if this slice can be rewritten with legal operations.
+ bool isLegal() const {
+ // An invalid slice is not legal.
+ if (!Origin || !Inst || !DAG)
+ return false;
+
+ // Offsets are for indexed load only, we do not handle that.
+ if (!Origin->getOffset().isUndef())
+ return false;
+
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+
+ // Check that the type is legal.
+ EVT SliceType = getLoadedType();
+ if (!TLI.isTypeLegal(SliceType))
+ return false;
+
+ // Check that the load is legal for this type.
+ if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
+ return false;
+
+ // Check that the offset can be computed.
+ // 1. Check its type.
+ EVT PtrType = Origin->getBasePtr().getValueType();
+ if (PtrType == MVT::Untyped || PtrType.isExtended())
+ return false;
+
+ // 2. Check that it fits in the immediate.
+ if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
+ return false;
+
+ // 3. Check that the computation is legal.
+ if (!TLI.isOperationLegal(ISD::ADD, PtrType))
+ return false;
+
+ // Check that the zext is legal if it needs one.
+ EVT TruncateType = Inst->getValueType(0);
+ if (TruncateType != SliceType &&
+ !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
+ return false;
+
+ return true;
+ }
+
+ /// Get the offset in bytes of this slice in the original chunk of
+ /// bits.
+ /// \pre DAG != nullptr.
+ uint64_t getOffsetFromBase() const {
+ assert(DAG && "Missing context.");
+ bool IsBigEndian = DAG->getDataLayout().isBigEndian();
+ assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
+ uint64_t Offset = Shift / 8;
+ unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
+ assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
+ "The size of the original loaded type is not a multiple of a"
+ " byte.");
+ // If Offset is bigger than TySizeInBytes, it means we are loading all
+ // zeros. This should have been optimized before in the process.
+ assert(TySizeInBytes > Offset &&
+ "Invalid shift amount for given loaded size");
+ if (IsBigEndian)
+ Offset = TySizeInBytes - Offset - getLoadedSize();
+ return Offset;
+ }
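+
+ // Editorial example: with a 64-bit Origin, Shift = 8 and a 2-byte slice,
+ // the little-endian offset is 1; on a big-endian target it becomes
+ // 8 - 1 - 2 = 5.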
+
+ /// Generate the sequence of instructions to load the slice
+ /// represented by this object and redirect the uses of this slice to
+ /// this new sequence of instructions.
+ /// \pre this->Inst && this->Origin are valid Instructions and this
+ /// object passed the legal check: LoadedSlice::isLegal returned true.
+ /// \return The last instruction of the sequence used to load the slice.
+ SDValue loadSlice() const {
+ assert(Inst && Origin && "Unable to replace a non-existing slice.");
+ const SDValue &OldBaseAddr = Origin->getBasePtr();
+ SDValue BaseAddr = OldBaseAddr;
+ // Get the offset in that chunk of bytes w.r.t. the endianness.
+ int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
+ assert(Offset >= 0 && "Offset too big to fit in int64_t!");
+ if (Offset) {
+ // BaseAddr = BaseAddr + Offset.
+ EVT ArithType = BaseAddr.getValueType();
+ SDLoc DL(Origin);
+ BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
+ DAG->getConstant(Offset, DL, ArithType));
+ }
+
+ // Create the type of the loaded slice according to its size.
+ EVT SliceType = getLoadedType();
+
+ // Create the load for the slice.
+ SDValue LastInst =
+ DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
+ Origin->getPointerInfo().getWithOffset(Offset), getAlign(),
+ Origin->getMemOperand()->getFlags());
+ // If the final type is not the same as the loaded type, this means that
+ // we have to pad with zero. Create a zero extend for that.
+ EVT FinalType = Inst->getValueType(0);
+ if (SliceType != FinalType)
+ LastInst =
+ DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
+ return LastInst;
+ }
+
+ /// Check if this slice can be merged with an expensive cross register
+ /// bank copy. E.g.,
+ /// i = load i32
+ /// f = bitcast i32 i to float
+ bool canMergeExpensiveCrossRegisterBankCopy() const {
+ if (!Inst || !Inst->hasOneUse())
+ return false;
+ SDNode *Use = *Inst->use_begin();
+ if (Use->getOpcode() != ISD::BITCAST)
+ return false;
+ assert(DAG && "Missing context");
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+ EVT ResVT = Use->getValueType(0);
+ const TargetRegisterClass *ResRC =
+ TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent());
+ const TargetRegisterClass *ArgRC =
+ TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(),
+ Use->getOperand(0)->isDivergent());
+ if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
+ return false;
+
+ // At this point, we know that we perform a cross-register-bank copy.
+ // Check if it is expensive.
+ const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
+ // Assume bitcasts are cheap, unless both register classes do not
+ // explicitly share a common sub class.
+ if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
+ return false;
+
+ // Check if it will be merged with the load.
+ // 1. Check the alignment / fast memory access constraint.
+ unsigned IsFast = 0;
+ if (!TLI.allowsMemoryAccess(*DAG->getContext(), DAG->getDataLayout(), ResVT,
+ Origin->getAddressSpace(), getAlign(),
+ Origin->getMemOperand()->getFlags(), &IsFast) ||
+ !IsFast)
+ return false;
+
+ // 2. Check that the load is a legal operation for that type.
+ if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
+ return false;
+
+ // 3. Check that we do not have a zext in the way.
+ if (Inst->getValueType(0) != getLoadedType())
+ return false;
+
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+/// Check that all bits set in \p UsedBits form a dense region, i.e.,
+/// \p UsedBits looks like 0..0 1..1 0..0.
+static bool areUsedBitsDense(const APInt &UsedBits) {
+ // If all the bits are one, this is dense!
+ if (UsedBits.isAllOnes())
+ return true;
+
+ // Get rid of the unused bits on the right.
+ APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countr_zero());
+ // Get rid of the unused bits on the left.
+ if (NarrowedUsedBits.countl_zero())
+ NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
+ // Check that the chunk of bits is completely used.
+ return NarrowedUsedBits.isAllOnes();
+}
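+
+// Editorial example: 0x00FF0000 is dense (a single contiguous run of ones),
+// while 0x00FF00FF is not, because the two runs leave a hole between them.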
+
+/// Check whether or not \p First and \p Second are next to each other
+/// in memory. This means that there is no hole between the bits loaded
+/// by \p First and the bits loaded by \p Second.
+static bool areSlicesNextToEachOther(const LoadedSlice &First,
+ const LoadedSlice &Second) {
+ assert(First.Origin == Second.Origin && First.Origin &&
+ "Unable to match different memory origins.");
+ APInt UsedBits = First.getUsedBits();
+ assert((UsedBits & Second.getUsedBits()) == 0 &&
+ "Slices are not supposed to overlap.");
+ UsedBits |= Second.getUsedBits();
+ return areUsedBitsDense(UsedBits);
+}
+
+/// Adjust the \p GlobalLSCost according to the target
+/// pairing capabilities and the layout of the slices.
+/// \pre \p GlobalLSCost should account for at least as many loads as
+/// there are in the slices in \p LoadedSlices.
+static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+ LoadedSlice::Cost &GlobalLSCost) {
+ unsigned NumberOfSlices = LoadedSlices.size();
+ // If there is less than 2 elements, no pairing is possible.
+ if (NumberOfSlices < 2)
+ return;
+
+ // Sort the slices so that elements that are likely to be next to each
+ // other in memory are next to each other in the list.
+ llvm::sort(LoadedSlices, [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
+ assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
+ return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
+ });
+ const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
+ // First (resp. Second) is the first (resp. second) potential candidate
+ // to be placed in a paired load.
+ const LoadedSlice *First = nullptr;
+ const LoadedSlice *Second = nullptr;
+ for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
+ // Set the beginning of the pair.
+ First = Second) {
+ Second = &LoadedSlices[CurrSlice];
+
+ // If First is NULL, it means we start a new pair.
+ // Get to the next slice.
+ if (!First)
+ continue;
+
+ EVT LoadedType = First->getLoadedType();
+
+ // If the types of the slices are different, we cannot pair them.
+ if (LoadedType != Second->getLoadedType())
+ continue;
+
+ // Check if the target supplies paired loads for this type.
+ Align RequiredAlignment;
+ if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
+ // move to the next pair, this type is hopeless.
+ Second = nullptr;
+ continue;
+ }
+ // Check if we meet the alignment requirement.
+ if (First->getAlign() < RequiredAlignment)
+ continue;
+
+ // Check that both loads are next to each other in memory.
+ if (!areSlicesNextToEachOther(*First, *Second))
+ continue;
+
+ assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
+ --GlobalLSCost.Loads;
+ // Move to the next pair.
+ Second = nullptr;
+ }
+}
+
+/// Check the profitability of all involved LoadedSlices.
+/// Currently, it is considered profitable if there are exactly two
+/// involved slices (1) which are (2) next to each other in memory, and
+/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
+///
+/// Note: The order of the elements in \p LoadedSlices may be modified, but not
+/// the elements themselves.
+///
+/// FIXME: When the cost model will be mature enough, we can relax
+/// constraints (1) and (2).
+static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+ const APInt &UsedBits, bool ForCodeSize) {
+ unsigned NumberOfSlices = LoadedSlices.size();
+ if (StressLoadSlicing)
+ return NumberOfSlices > 1;
+
+ // Check (1).
+ if (NumberOfSlices != 2)
+ return false;
+
+ // Check (2).
+ if (!areUsedBitsDense(UsedBits))
+ return false;
+
+ // Check (3).
+ LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
+ // The original code has one big load.
+ OrigCost.Loads = 1;
+ for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
+ const LoadedSlice &LS = LoadedSlices[CurrSlice];
+ // Accumulate the cost of all the slices.
+ LoadedSlice::Cost SliceCost(LS, ForCodeSize);
+ GlobalSlicingCost += SliceCost;
+
+ // Account as cost in the original configuration the gain obtained
+ // with the current slices.
+ OrigCost.addSliceGain(LS);
+ }
+
+ // If the target supports paired load, adjust the cost accordingly.
+ adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
+ return OrigCost > GlobalSlicingCost;
+}
+
+/// If the given load, \p LI, is used only by trunc or trunc(lshr)
+/// operations, split it into the various pieces being extracted.
+///
+/// This sort of thing is introduced by SROA.
+/// This slicing takes care not to insert overlapping loads.
+/// \pre LI is a simple load (i.e., not an atomic or volatile load).
+bool DAGCombiner::SliceUpLoad(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
+ !LD->getValueType(0).isInteger())
+ return false;
+
+ // The algorithm to split up a load of a scalable vector into individual
+ // elements currently requires knowing the length of the loaded type,
+ // so will need adjusting to work on scalable vectors.
+ if (LD->getValueType(0).isScalableVector())
+ return false;
+
+ // Keep track of already used bits to detect overlapping values.
+ // In that case, we will just abort the transformation.
+ APInt UsedBits(LD->getValueSizeInBits(0), 0);
+
+ SmallVector<LoadedSlice, 4> LoadedSlices;
+
+ // Check if this load is used as several smaller chunks of bits.
+ // Basically, look for uses in trunc or trunc(lshr) and record a new chain
+ // of computation for each trunc.
+ for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+ UI != UIEnd; ++UI) {
+ // Skip the uses of the chain.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ SDNode *User = *UI;
+ unsigned Shift = 0;
+
+ // Check if this is a trunc(lshr).
+ if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
+ isa<ConstantSDNode>(User->getOperand(1))) {
+ Shift = User->getConstantOperandVal(1);
+ User = *User->use_begin();
+ }
+
+ // At this point, User is a Truncate iff we encountered trunc or
+ // trunc(lshr).
+ if (User->getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ // The width of the type must be a power of 2 and at least 8 bits.
+ // Otherwise the load cannot be represented in LLVM IR.
+ // Moreover, if we shifted by an amount that is not a multiple of 8 bits,
+ // the slice would not be byte-aligned. We do not support that.
+ unsigned Width = User->getValueSizeInBits(0);
+ if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
+ return false;
+
+ // Build the slice for this chain of computations.
+ LoadedSlice LS(User, LD, Shift, &DAG);
+ APInt CurrentUsedBits = LS.getUsedBits();
+
+ // Check if this slice overlaps with another.
+ if ((CurrentUsedBits & UsedBits) != 0)
+ return false;
+ // Update the bits used globally.
+ UsedBits |= CurrentUsedBits;
+
+ // Check if the new slice would be legal.
+ if (!LS.isLegal())
+ return false;
+
+ // Record the slice.
+ LoadedSlices.push_back(LS);
+ }
+
+ // Abort slicing if it does not seem to be profitable.
+ if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
+ return false;
+
+ ++SlicedLoads;
+
+ // Rewrite each chain to use an independent load.
+ // By construction, each chain can be represented by a unique load.
+
+ // Prepare the argument for the new token factor for all the slices.
+ SmallVector<SDValue, 8> ArgChains;
+ for (const LoadedSlice &LS : LoadedSlices) {
+ SDValue SliceInst = LS.loadSlice();
+ CombineTo(LS.Inst, SliceInst, true);
+ if (SliceInst.getOpcode() != ISD::LOAD)
+ SliceInst = SliceInst.getOperand(0);
+ assert(SliceInst->getOpcode() == ISD::LOAD &&
+ "It takes more than a zext to get to the loaded slice!!");
+ ArgChains.push_back(SliceInst.getValue(1));
+ }
+
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
+ ArgChains);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+ AddToWorklist(Chain.getNode());
+ return true;
+}
+
+/// Check to see if V is (and load (ptr), imm), where the load has
+/// specific bytes cleared out. If so, return the byte size being masked out
+/// and the shift amount.
+static std::pair<unsigned, unsigned>
+CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
+ std::pair<unsigned, unsigned> Result(0, 0);
+
+ // Check for the structure we're looking for.
+ if (V->getOpcode() != ISD::AND ||
+ !isa<ConstantSDNode>(V->getOperand(1)) ||
+ !ISD::isNormalLoad(V->getOperand(0).getNode()))
+ return Result;
+
+ // Check the chain and pointer.
+ LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
+ if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer.
+
+ // This only handles simple types.
+ if (V.getValueType() != MVT::i16 &&
+ V.getValueType() != MVT::i32 &&
+ V.getValueType() != MVT::i64)
+ return Result;
+
+ // Check the constant mask. Invert it so that the bits being masked out are
+ // 0 and the bits being kept are 1. Use getSExtValue so that leading bits
+ // follow the sign bit for uniformity.
+ uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
+ unsigned NotMaskLZ = llvm::countl_zero(NotMask);
+ if (NotMaskLZ & 7) return Result; // Must be multiple of a byte.
+ unsigned NotMaskTZ = llvm::countr_zero(NotMask);
+ if (NotMaskTZ & 7) return Result; // Must be multiple of a byte.
+ if (NotMaskLZ == 64) return Result; // All zero mask.
+
+ // See if we have a contiguous run of bits. If so, we have 0*1+0*
+ if (llvm::countr_one(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
+ return Result;
+
+ // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
+ if (V.getValueType() != MVT::i64 && NotMaskLZ)
+ NotMaskLZ -= 64-V.getValueSizeInBits();
+
+ unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
+ switch (MaskedBytes) {
+ case 1:
+ case 2:
+ case 4: break;
+ default: return Result; // All one mask, or 5-byte mask.
+ }
+
+ // Verify that the first masked bit starts at a multiple of the mask size so
+ // that the access is aligned the same as the access width.
+ if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
+
+ // For narrowing to be valid, it must be the case that the load is the
+ // memory operation immediately preceding the store.
+ if (LD == Chain.getNode())
+ ; // ok.
+ else if (Chain->getOpcode() == ISD::TokenFactor &&
+ SDValue(LD, 1).hasOneUse()) {
+ // LD has only 1 chain use so there are no indirect dependencies.
+ if (!LD->isOperandOf(Chain.getNode()))
+ return Result;
+ } else
+ return Result; // Fail.
+
+ Result.first = MaskedBytes;
+ Result.second = NotMaskTZ/8;
+ return Result;
+}
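+
+// Editorial example: for an i32 value masked with 0xFFFF00FF, the inverted
+// mask NotMask has only byte 1 set (0xFF00), so the function returns
+// (MaskedBytes, ByteShift) = (1, 1): only the second byte is being masked
+// out, and a one-byte store at byte offset 1 can replace the wider access.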
+
+/// Check to see if IVal is something that provides a value as specified by
+/// MaskInfo. If so, replace the specified store with a narrower store of
+/// truncated IVal.
+static SDValue
+ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
+ SDValue IVal, StoreSDNode *St,
+ DAGCombiner *DC) {
+ unsigned NumBytes = MaskInfo.first;
+ unsigned ByteShift = MaskInfo.second;
+ SelectionDAG &DAG = DC->getDAG();
+
+ // Check to see if IVal is all zeros in the part being masked in by the 'or'
+ // that uses this. If not, this is not a replacement.
+ APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
+ ByteShift*8, (ByteShift+NumBytes)*8);
+ if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue();
+
+ // Check that it is legal on the target to do this. It is legal if the new
+ // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
+ // legalization. If the source type is legal, but the store type isn't, see
+ // if we can use a truncating store.
+ MVT VT = MVT::getIntegerVT(NumBytes * 8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool UseTruncStore;
+ if (DC->isTypeLegal(VT))
+ UseTruncStore = false;
+ else if (TLI.isTypeLegal(IVal.getValueType()) &&
+ TLI.isTruncStoreLegal(IVal.getValueType(), VT))
+ UseTruncStore = true;
+ else
+ return SDValue();
+
+ // Can't do this for indexed stores.
+ if (St->isIndexed())
+ return SDValue();
+
+ // Check that the target doesn't think this is a bad idea.
+ if (St->getMemOperand() &&
+ !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *St->getMemOperand()))
+ return SDValue();
+
+ // Okay, we can do this! Replace the 'St' store with a store of IVal that is
+ // shifted by ByteShift and truncated down to NumBytes.
+ if (ByteShift) {
+ SDLoc DL(IVal);
+ IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
+ DAG.getConstant(ByteShift*8, DL,
+ DC->getShiftAmountTy(IVal.getValueType())));
+ }
+
+ // Figure out the offset for the store and the alignment of the access.
+ unsigned StOffset;
+ if (DAG.getDataLayout().isLittleEndian())
+ StOffset = ByteShift;
+ else
+ StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
+
+ SDValue Ptr = St->getBasePtr();
+ if (StOffset) {
+ SDLoc DL(IVal);
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(StOffset), DL);
+ }
+
+ ++OpsNarrowed;
+ if (UseTruncStore)
+ return DAG.getTruncStore(St->getChain(), SDLoc(St), IVal, Ptr,
+ St->getPointerInfo().getWithOffset(StOffset),
+ VT, St->getOriginalAlign());
+
+ // Truncate down to the new size.
+ IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);
+
+ return DAG
+ .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
+ St->getPointerInfo().getWithOffset(StOffset),
+ St->getOriginalAlign());
+}
+
+/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
+/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
+/// narrowing the load and store if it would end up being a win for performance
+/// or code size.
+SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (!ST->isSimple())
+ return SDValue();
+
+ SDValue Chain = ST->getChain();
+ SDValue Value = ST->getValue();
+ SDValue Ptr = ST->getBasePtr();
+ EVT VT = Value.getValueType();
+
+ if (ST->isTruncatingStore() || VT.isVector())
+ return SDValue();
+
+ unsigned Opc = Value.getOpcode();
+
+ if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
+ !Value.hasOneUse())
+ return SDValue();
+
+ // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
+ // is a byte mask indicating a consecutive number of bytes, check to see if
+ // Y is known to provide just those bytes. If so, we try to replace the
+ // load / modify / store sequence with a single (narrower) store, which makes
+ // the load dead.
+ if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
+ std::pair<unsigned, unsigned> MaskedLoad;
+ MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
+ if (MaskedLoad.first)
+ if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+ Value.getOperand(1), ST,this))
+ return NewST;
+
+ // Or is commutative, so try swapping X and Y.
+ MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
+ if (MaskedLoad.first)
+ if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+ Value.getOperand(0), ST,this))
+ return NewST;
+ }
+
+ if (!EnableReduceLoadOpStoreWidth)
+ return SDValue();
+
+ if (Value.getOperand(1).getOpcode() != ISD::Constant)
+ return SDValue();
+
+ SDValue N0 = Value.getOperand(0);
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ Chain == SDValue(N0.getNode(), 1)) {
+ LoadSDNode *LD = cast<LoadSDNode>(N0);
+ if (LD->getBasePtr() != Ptr ||
+ LD->getPointerInfo().getAddrSpace() !=
+ ST->getPointerInfo().getAddrSpace())
+ return SDValue();
+
+ // Find the type to narrow the load / op / store to.
+ SDValue N1 = Value.getOperand(1);
+ unsigned BitWidth = N1.getValueSizeInBits();
+ APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
+ if (Opc == ISD::AND)
+ Imm ^= APInt::getAllOnes(BitWidth);
+ if (Imm == 0 || Imm.isAllOnes())
+ return SDValue();
+ unsigned ShAmt = Imm.countr_zero();
+ unsigned MSB = BitWidth - Imm.countl_zero() - 1;
+ unsigned NewBW = NextPowerOf2(MSB - ShAmt);
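+ // Editorial example: for a 32-bit OR with Imm = 0x00FF0000, ShAmt = 16,
+ // MSB = 23 and NewBW = NextPowerOf2(7) = 8, so the operation may be
+ // narrowed to an i8 access at byte offset 2 on a little-endian target.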
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+ // The narrowing should be profitable, the load/store operation should be
+ // legal (or custom) and the store size should be equal to the NewVT width.
+ while (NewBW < BitWidth &&
+ (NewVT.getStoreSizeInBits() != NewBW ||
+ !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+ !TLI.isNarrowingProfitable(VT, NewVT))) {
+ NewBW = NextPowerOf2(NewBW);
+ NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+ }
+ if (NewBW >= BitWidth)
+ return SDValue();
+
+ // If the changed lsb does not start at a NewBW-bit boundary,
+ // start at the previous one.
+ if (ShAmt % NewBW)
+ ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+ APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
+ std::min(BitWidth, ShAmt + NewBW));
+ if ((Imm & Mask) == Imm) {
+ APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
+ if (Opc == ISD::AND)
+ NewImm ^= APInt::getAllOnes(NewBW);
+ uint64_t PtrOff = ShAmt / 8;
+ // For big endian targets, we need to adjust the offset to the pointer to
+ // load the correct bytes.
+ if (DAG.getDataLayout().isBigEndian())
+ PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+
+ unsigned IsFast = 0;
+ Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
+ LD->getAddressSpace(), NewAlign,
+ LD->getMemOperand()->getFlags(), &IsFast) ||
+ !IsFast)
+ return SDValue();
+
+ SDValue NewPtr =
+ DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(PtrOff), SDLoc(LD));
+ SDValue NewLD =
+ DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
+ LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
+ DAG.getConstant(NewImm, SDLoc(Value),
+ NewVT));
+ SDValue NewST =
+ DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
+ ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
+
+ AddToWorklist(NewPtr.getNode());
+ AddToWorklist(NewLD.getNode());
+ AddToWorklist(NewVal.getNode());
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
+ ++OpsNarrowed;
+ return NewST;
+ }
+ }
+
+ return SDValue();
+}
+
+/// For a given floating point load / store pair, if the load value isn't used
+/// by any other operations, then consider transforming the pair to integer
+/// load / store operations if the target deems the transformation profitable.
+SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Value = ST->getValue();
+ if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
+ Value.hasOneUse()) {
+ LoadSDNode *LD = cast<LoadSDNode>(Value);
+ EVT VT = LD->getMemoryVT();
+ if (!VT.isFloatingPoint() ||
+ VT != ST->getMemoryVT() ||
+ LD->isNonTemporal() ||
+ ST->isNonTemporal() ||
+ LD->getPointerInfo().getAddrSpace() != 0 ||
+ ST->getPointerInfo().getAddrSpace() != 0)
+ return SDValue();
+
+ TypeSize VTSize = VT.getSizeInBits();
+
+ // We don't know the size of scalable types at compile time so we cannot
+ // create an integer of the equivalent size.
+ if (VTSize.isScalable())
+ return SDValue();
+
+ unsigned FastLD = 0, FastST = 0;
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedValue());
+ if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
+ !TLI.isOperationLegal(ISD::STORE, IntVT) ||
+ !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
+ !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT) ||
+ !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), IntVT,
+ *LD->getMemOperand(), &FastLD) ||
+ !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), IntVT,
+ *ST->getMemOperand(), &FastST) ||
+ !FastLD || !FastST)
+ return SDValue();
+
+ SDValue NewLD =
+ DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
+ LD->getPointerInfo(), LD->getAlign());
+
+ SDValue NewST =
+ DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
+ ST->getPointerInfo(), ST->getAlign());
+
+ AddToWorklist(NewLD.getNode());
+ AddToWorklist(NewST.getNode());
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
+ ++LdStFP2Int;
+ return NewST;
+ }
+
+ return SDValue();
+}
+
+// This is a helper function for visitMUL to check the profitability
+// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
+// MulNode is the original multiply, AddNode is (add x, c1),
+// and ConstNode is c2.
+//
+// If the (add x, c1) has multiple uses, we could increase
+// the number of adds if we make this transformation.
+// It would only be worth doing this if we can remove a
+// multiply in the process. Check for that here.
+// To illustrate:
+// (A + c1) * c3
+// (A + c2) * c3
+// We're checking for cases where we have common "c3 * A" expressions.
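+//
+// An editorial sketch (not from the original source) of the profitable shape:
+//   t0 = A + c1
+//   t1 = t0 * c3     <-- MulNode; folding gives (A * c3) + c1*c3
+//   t2 = A * c3      <-- an existing multiply of c3 by the same A
+// After the fold, "A * c3" is a common subexpression, so no extra multiply
+// is introduced even though t0 may have other uses.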
+bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode,
+ SDValue ConstNode) {
+ APInt Val;
+
+ // If the add only has one use, and the target thinks the folding is
+ // profitable or does not lead to worse code, this would be OK to do.
+ if (AddNode->hasOneUse() &&
+ TLI.isMulAddWithConstProfitable(AddNode, ConstNode))
+ return true;
+
+ // Walk all the users of the constant with which we're multiplying.
+ for (SDNode *Use : ConstNode->uses()) {
+ if (Use == MulNode) // This use is the one we're on right now. Skip it.
+ continue;
+
+ if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
+ SDNode *OtherOp;
+ SDNode *MulVar = AddNode.getOperand(0).getNode();
+
+ // OtherOp is what we're multiplying against the constant.
+ if (Use->getOperand(0) == ConstNode)
+ OtherOp = Use->getOperand(1).getNode();
+ else
+ OtherOp = Use->getOperand(0).getNode();
+
+ // Check to see if multiply is with the same operand of our "add".
+ //
+ // ConstNode = CONST
+ // Use = ConstNode * A <-- visiting Use. OtherOp is A.
+ // ...
+ // AddNode = (A + c1) <-- MulVar is A.
+ // = AddNode * ConstNode <-- current visiting instruction.
+ //
+ // If we make this transformation, we will have a common
+ // multiply (ConstNode * A) that we can save.
+ if (OtherOp == MulVar)
+ return true;
+
+ // Now check to see if a future expansion will give us a common
+ // multiply.
+ //
+ // ConstNode = CONST
+ // AddNode = (A + c1)
+ // ... = AddNode * ConstNode <-- current visiting instruction.
+ // ...
+ // OtherOp = (A + c2)
+ // Use = OtherOp * ConstNode <-- visiting Use.
+ //
+ // If we make this transformation, we will have a common
+ // multiply (CONST * A) after we also do the same transformation
+ // to the "Use" instruction.
+ if (OtherOp->getOpcode() == ISD::ADD &&
+ DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
+ OtherOp->getOperand(0).getNode() == MulVar)
+ return true;
+ }
+ }
+
+ // Didn't find a case where this would be profitable.
+ return false;
+}
+
+SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumStores) {
+ SmallVector<SDValue, 8> Chains;
+ SmallPtrSet<const SDNode *, 8> Visited;
+ SDLoc StoreDL(StoreNodes[0].MemNode);
+
+ for (unsigned i = 0; i < NumStores; ++i) {
+ Visited.insert(StoreNodes[i].MemNode);
+ }
+
+ // Don't include nodes that are children or repeated nodes.
+ for (unsigned i = 0; i < NumStores; ++i) {
+ if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second)
+ Chains.push_back(StoreNodes[i].MemNode->getChain());
+ }
+
+ assert(Chains.size() > 0 && "Chain should have generated a chain");
+ return DAG.getTokenFactor(StoreDL, Chains);
+}
+
+bool DAGCombiner::hasSameUnderlyingObj(ArrayRef<MemOpLink> StoreNodes) {
+ const Value *UnderlyingObj = nullptr;
+ for (const auto &MemOp : StoreNodes) {
+ const MachineMemOperand *MMO = MemOp.MemNode->getMemOperand();
+ // A pseudo value such as a stack frame has its own frame index and size;
+ // we should not use the first store's frame index for other frames.
+ if (MMO->getPseudoValue())
+ return false;
+
+ if (!MMO->getValue())
+ return false;
+
+ const Value *Obj = getUnderlyingObject(MMO->getValue());
+
+ if (UnderlyingObj && UnderlyingObj != Obj)
+ return false;
+
+ if (!UnderlyingObj)
+ UnderlyingObj = Obj;
+ }
+
+ return true;
+}
+
+bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
+ SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
+ bool IsConstantSrc, bool UseVector, bool UseTrunc) {
+ // Make sure we have something to merge.
+ if (NumStores < 2)
+ return false;
+
+ assert((!UseTrunc || !UseVector) &&
+ "This optimization cannot emit a vector truncating store");
+
+ // The latest Node in the DAG.
+ SDLoc DL(StoreNodes[0].MemNode);
+
+ TypeSize ElementSizeBits = MemVT.getStoreSizeInBits();
+ unsigned SizeInBits = NumStores * ElementSizeBits;
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+
+ std::optional<MachineMemOperand::Flags> Flags;
+ AAMDNodes AAInfo;
+ for (unsigned I = 0; I != NumStores; ++I) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
+ if (!Flags) {
+ Flags = St->getMemOperand()->getFlags();
+ AAInfo = St->getAAInfo();
+ continue;
+ }
+ // Skip merging if there's an inconsistent flag.
+ if (Flags != St->getMemOperand()->getFlags())
+ return false;
+ // Concatenate AA metadata.
+ AAInfo = AAInfo.concat(St->getAAInfo());
+ }
+
+ EVT StoreTy;
+ if (UseVector) {
+ unsigned Elts = NumStores * NumMemElts;
+ // Get the type for the merged vector store.
+ StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+ } else
+ StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+
+ SDValue StoredVal;
+ if (UseVector) {
+ if (IsConstantSrc) {
+ SmallVector<SDValue, 8> BuildVector;
+ for (unsigned I = 0; I != NumStores; ++I) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
+ SDValue Val = St->getValue();
+ // If constant is of the wrong type, convert it now.
+ if (MemVT != Val.getValueType()) {
+ Val = peekThroughBitcasts(Val);
+ // Deal with constants of wrong size.
+ if (ElementSizeBits != Val.getValueSizeInBits()) {
+ EVT IntMemVT =
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ if (isa<ConstantFPSDNode>(Val)) {
+ // Not clear how to truncate FP values.
+ return false;
+ }
+
+ if (auto *C = dyn_cast<ConstantSDNode>(Val))
+ Val = DAG.getConstant(C->getAPIntValue()
+ .zextOrTrunc(Val.getValueSizeInBits())
+ .zextOrTrunc(ElementSizeBits),
+ SDLoc(C), IntMemVT);
+ }
+ // Make sure the correctly sized value has the correct type (MemVT).
+ Val = DAG.getBitcast(MemVT, Val);
+ }
+ BuildVector.push_back(Val);
+ }
+ StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
+ : ISD::BUILD_VECTOR,
+ DL, StoreTy, BuildVector);
+ } else {
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < NumStores; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue Val = peekThroughBitcasts(St->getValue());
+ // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
+ // type MemVT. If the underlying value is not the correct
+ // type, but it is an extraction of an appropriate vector we
+ // can recast Val to be of the correct type. This may require
+ // converting between EXTRACT_VECTOR_ELT and
+ // EXTRACT_SUBVECTOR.
+ if ((MemVT != Val.getValueType()) &&
+ (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
+ EVT MemVTScalarTy = MemVT.getScalarType();
+ // We may need to add a bitcast here to get types to line up.
+ if (MemVTScalarTy != Val.getValueType().getScalarType()) {
+ Val = DAG.getBitcast(MemVT, Val);
+ } else if (MemVT.isVector() &&
+ Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ Val = DAG.getNode(ISD::BUILD_VECTOR, DL, MemVT, Val);
+ } else {
+ unsigned OpC = MemVT.isVector() ? ISD::EXTRACT_SUBVECTOR
+ : ISD::EXTRACT_VECTOR_ELT;
+ SDValue Vec = Val.getOperand(0);
+ SDValue Idx = Val.getOperand(1);
+ Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx);
+ }
+ }
+ Ops.push_back(Val);
+ }
+
+ // Build the extracted vector elements back into a vector.
+ StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
+ : ISD::BUILD_VECTOR,
+ DL, StoreTy, Ops);
+ }
+ } else {
+ // We should always use a vector store when merging extracted vector
+ // elements, so this path implies a store of constants.
+ assert(IsConstantSrc && "Merged vector elements should use vector store");
+
+ APInt StoreInt(SizeInBits, 0);
+
+ // Construct a single integer constant which is made of the smaller
+ // constant inputs.
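+ // Editorial example (assuming the candidates are already sorted by offset,
+ // as the caller arranges): merging two i16 stores of 0x1111 (lower address)
+ // and 0x2222 into one i32 on a little-endian target produces the constant
+ // 0x22221111, so the lower-addressed value ends up in the low-order bits.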
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ for (unsigned i = 0; i < NumStores; ++i) {
+ unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
+
+ SDValue Val = St->getValue();
+ Val = peekThroughBitcasts(Val);
+ StoreInt <<= ElementSizeBits;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
+ StoreInt |= C->getAPIntValue()
+ .zextOrTrunc(ElementSizeBits)
+ .zextOrTrunc(SizeInBits);
+ } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
+ StoreInt |= C->getValueAPF()
+ .bitcastToAPInt()
+ .zextOrTrunc(ElementSizeBits)
+ .zextOrTrunc(SizeInBits);
+ // If fp truncation is necessary give up for now.
+ if (MemVT.getSizeInBits() != ElementSizeBits)
+ return false;
+ } else {
+ llvm_unreachable("Invalid constant element type");
+ }
+ }
+
+ // Create the new Load and Store operations.
+ StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
+ }
+
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);
+ bool CanReusePtrInfo = hasSameUnderlyingObj(StoreNodes);
+
+ // Make sure we use a truncating store if that is necessary for legality.
+ // When generating the new widened store, if the first store's pointer info
+ // cannot be reused, discard the pointer info except for the address space,
+ // because the widened store can no longer be represented by the original
+ // pointer info, which describes the narrower memory object.
+ SDValue NewStore;
+ if (!UseTrunc) {
+ NewStore = DAG.getStore(
+ NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
+ CanReusePtrInfo
+ ? FirstInChain->getPointerInfo()
+ : MachinePointerInfo(FirstInChain->getPointerInfo().getAddrSpace()),
+ FirstInChain->getAlign(), *Flags, AAInfo);
+ } else { // Must be realized as a trunc store
+ EVT LegalizedStoredValTy =
+ TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
+ unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits();
+ ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
+ SDValue ExtendedStoreVal =
+ DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
+ LegalizedStoredValTy);
+ NewStore = DAG.getTruncStore(
+ NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
+ CanReusePtrInfo
+ ? FirstInChain->getPointerInfo()
+ : MachinePointerInfo(FirstInChain->getPointerInfo().getAddrSpace()),
+ StoredVal.getValueType() /*TVT*/, FirstInChain->getAlign(), *Flags,
+ AAInfo);
+ }
+
+ // Replace all merged stores with the new store.
+ for (unsigned i = 0; i < NumStores; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+
+ AddToWorklist(NewChain.getNode());
+ return true;
+}
+
+void DAGCombiner::getStoreMergeCandidates(
+ StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes,
+ SDNode *&RootNode) {
+ // This holds the base pointer, index, and the offset in bytes from the base
+ // pointer. We must have a base and an offset. Do not handle stores to undef
+ // base pointers.
+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
+ if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef())
+ return;
+
+ SDValue Val = peekThroughBitcasts(St->getValue());
+ StoreSource StoreSrc = getStoreSource(Val);
+ assert(StoreSrc != StoreSource::Unknown && "Expected known source for store");
+
+ // Match on loadbaseptr if relevant.
+ EVT MemVT = St->getMemoryVT();
+ BaseIndexOffset LBasePtr;
+ EVT LoadVT;
+ if (StoreSrc == StoreSource::Load) {
+ auto *Ld = cast<LoadSDNode>(Val);
+ LBasePtr = BaseIndexOffset::match(Ld, DAG);
+ LoadVT = Ld->getMemoryVT();
+ // Load and store should be the same type.
+ if (MemVT != LoadVT)
+ return;
+ // Loads must only have one use.
+ if (!Ld->hasNUsesOfValue(1, 0))
+ return;
+ // The memory operands must not be volatile/indexed/atomic.
+ // TODO: May be able to relax for unordered atomics (see D66309)
+ if (!Ld->isSimple() || Ld->isIndexed())
+ return;
+ }
+ auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
+ int64_t &Offset) -> bool {
+ // The memory operands must not be volatile/indexed/atomic.
+ // TODO: May be able to relax for unordered atomics (see D66309)
+ if (!Other->isSimple() || Other->isIndexed())
+ return false;
+ // Don't mix temporal stores with non-temporal stores.
+ if (St->isNonTemporal() != Other->isNonTemporal())
+ return false;
+ if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*St, *Other))
+ return false;
+ SDValue OtherBC = peekThroughBitcasts(Other->getValue());
+ // Allow merging constants of different types as integers.
+ bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
+ : Other->getMemoryVT() != MemVT;
+ switch (StoreSrc) {
+ case StoreSource::Load: {
+ if (NoTypeMatch)
+ return false;
+ // The Load's Base Ptr must also match.
+ auto *OtherLd = dyn_cast<LoadSDNode>(OtherBC);
+ if (!OtherLd)
+ return false;
+ BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
+ if (LoadVT != OtherLd->getMemoryVT())
+ return false;
+ // Loads must only have one use.
+ if (!OtherLd->hasNUsesOfValue(1, 0))
+ return false;
+ // The memory operands must not be volatile/indexed/atomic.
+ // TODO: May be able to relax for unordered atomics (see D66309)
+ if (!OtherLd->isSimple() || OtherLd->isIndexed())
+ return false;
+ // Don't mix temporal loads with non-temporal loads.
+ if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
+ return false;
+ if (!TLI.areTwoSDNodeTargetMMOFlagsMergeable(*cast<LoadSDNode>(Val),
+ *OtherLd))
+ return false;
+ if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
+ return false;
+ break;
+ }
+ case StoreSource::Constant:
+ if (NoTypeMatch)
+ return false;
+ if (!isIntOrFPConstant(OtherBC))
+ return false;
+ break;
+ case StoreSource::Extract:
+ // Do not merge truncated stores here.
+ if (Other->isTruncatingStore())
+ return false;
+ if (!MemVT.bitsEq(OtherBC.getValueType()))
+ return false;
+ if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+ OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+ break;
+ default:
+ llvm_unreachable("Unhandled store source for merging");
+ }
+ Ptr = BaseIndexOffset::match(Other, DAG);
+ return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
+ };
+
+ // Check whether this pair of StoreNode and RootNode has already bailed out
+ // of the dependence check more times than the limit allows.
+ auto OverLimitInDependenceCheck = [&](SDNode *StoreNode,
+ SDNode *RootNode) -> bool {
+ auto RootCount = StoreRootCountMap.find(StoreNode);
+ return RootCount != StoreRootCountMap.end() &&
+ RootCount->second.first == RootNode &&
+ RootCount->second.second > StoreMergeDependenceLimit;
+ };
+
+ auto TryToAddCandidate = [&](SDNode::use_iterator UseIter) {
+ // This must be a chain use.
+ if (UseIter.getOperandNo() != 0)
+ return;
+ if (auto *OtherStore = dyn_cast<StoreSDNode>(*UseIter)) {
+ BaseIndexOffset Ptr;
+ int64_t PtrDiff;
+ if (CandidateMatch(OtherStore, Ptr, PtrDiff) &&
+ !OverLimitInDependenceCheck(OtherStore, RootNode))
+ StoreNodes.push_back(MemOpLink(OtherStore, PtrDiff));
+ }
+ };
+
+ // We are looking for a root node which is an ancestor to all mergeable
+ // stores. We search up through a load, to our root and then down
+ // through all children. For instance we will find Store{1,2,3} if
+ // St is Store1, Store2, or Store3 where the root is not a load,
+ // which is always true for non-volatile ops. TODO: Expand
+ // the search to find all valid candidates through multiple layers of loads.
+ //
+ // Root
+ // |-------|-------|
+ // Load Load Store3
+ // | |
+ // Store1 Store2
+ //
+ // FIXME: We should be able to climb and
+ // descend TokenFactors to find candidates as well.
+
+ RootNode = St->getChain().getNode();
+
+ unsigned NumNodesExplored = 0;
+ const unsigned MaxSearchNodes = 1024;
+ if (auto *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
+ RootNode = Ldn->getChain().getNode();
+ for (auto I = RootNode->use_begin(), E = RootNode->use_end();
+ I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) {
+ if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) { // walk down chain
+ for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
+ TryToAddCandidate(I2);
+ }
+ // Check stores that depend on the root (e.g. Store 3 in the chart above).
+ if (I.getOperandNo() == 0 && isa<StoreSDNode>(*I)) {
+ TryToAddCandidate(I);
+ }
+ }
+ } else {
+ for (auto I = RootNode->use_begin(), E = RootNode->use_end();
+ I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored)
+ TryToAddCandidate(I);
+ }
+}
+
+// We need to check that merging these stores does not cause a loop in the
+// DAG. Any store candidate may depend on another candidate indirectly through
+// its operands. Check in parallel by searching up from operands of candidates.
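+// For example (illustrative): if one candidate's stored value is computed
+// from a load that is chained after another candidate, the single merged
+// store that replaces both would depend on that load, which in turn would
+// depend on the merged store itself.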
+bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
+ SDNode *RootNode) {
+ // FIXME: We should be able to truncate a full search of
+ // predecessors by doing a BFS and keeping tabs on the originating
+ // stores from which worklist nodes come, in a similar way to
+ // TokenFactor simplification.
+
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 8> Worklist;
+
+ // RootNode is a predecessor to all candidates so we need not search
+ // past it. Add RootNode (peeking through TokenFactors). Do not count
+ // these towards size check.
+
+ Worklist.push_back(RootNode);
+ while (!Worklist.empty()) {
+ auto N = Worklist.pop_back_val();
+ if (!Visited.insert(N).second)
+ continue; // Already present in Visited.
+ if (N->getOpcode() == ISD::TokenFactor) {
+ for (SDValue Op : N->ops())
+ Worklist.push_back(Op.getNode());
+ }
+ }
+
+ // Don't count pruning nodes towards max.
+ unsigned int Max = 1024 + Visited.size();
+ // Search Ops of store candidates.
+ for (unsigned i = 0; i < NumStores; ++i) {
+ SDNode *N = StoreNodes[i].MemNode;
+ // Of the 4 Store Operands:
+ // * Chain (Op 0) -> We have already considered these
+ // in candidate selection, but only by following the
+ // chain dependencies. We could still have a chain
+ // dependency to a load, that has a non-chain dep to
+ // another load, that depends on a store, etc. So it is
+ // possible to have dependencies that consist of a mix
+ // of chain and non-chain deps, and we need to include
+ // chain operands in the analysis here.
+ // * Value (Op 1) -> Cycles may happen (e.g. through load chains)
+ // * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
+ // but aren't necessarily from the same base node, so
+ // cycles possible (e.g. via indexed store).
+ // * (Op 3) -> Represents the pre or post-indexing offset (or undef for
+ // non-indexed stores). Not constant on all targets (e.g. ARM)
+ // and so can participate in a cycle.
+ for (unsigned j = 0; j < N->getNumOperands(); ++j)
+ Worklist.push_back(N->getOperand(j).getNode());
+ }
+ // Search through DAG. We can stop early if we find a store node.
+ for (unsigned i = 0; i < NumStores; ++i)
+ if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
+ Max)) {
+ // If the search bails out, record the StoreNode and RootNode in the
+ // StoreRootCountMap. If we have seen the pair more times than the limit,
+ // we won't add the StoreNode into the StoreNodes set again.
+ if (Visited.size() >= Max) {
+ auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode];
+ if (RootCount.first == RootNode)
+ RootCount.second++;
+ else
+ RootCount = {RootNode, 1};
+ }
+ return false;
+ }
+ return true;
+}
+
+unsigned
+DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
+ int64_t ElementSizeBytes) const {
+ while (true) {
+ // Find a store past the width of the first store.
+ size_t StartIdx = 0;
+ while ((StartIdx + 1 < StoreNodes.size()) &&
+ StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
+ StoreNodes[StartIdx + 1].OffsetFromBase)
+ ++StartIdx;
+
+ // Bail if we don't have enough candidates to merge.
+ if (StartIdx + 1 >= StoreNodes.size())
+ return 0;
+
+ // Trim stores that overlapped with the first store.
+ if (StartIdx)
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
+
+ // Scan the memory operations on the chain and find the first
+ // non-consecutive store memory address.
+ unsigned NumConsecutiveStores = 1;
+ int64_t StartAddress = StoreNodes[0].OffsetFromBase;
+ // Check that the addresses are consecutive starting from the second
+ // element in the list of stores.
+ for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
+ int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ NumConsecutiveStores = i + 1;
+ }
+ if (NumConsecutiveStores > 1)
+ return NumConsecutiveStores;
+
+ // There are no consecutive stores at the start of the list.
+ // Remove the first store and try again.
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ }
+}
+
+bool DAGCombiner::tryStoreMergeOfConstants(
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
+ EVT MemVT, SDNode *RootNode, bool AllowVectors) {
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
+ int64_t ElementSizeBytes = MemVT.getStoreSize();
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+ bool MadeChange = false;
+
+ // Store the constants into memory as one consecutive store.
+ while (NumConsecutiveStores >= 2) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ Align FirstStoreAlign = FirstInChain->getAlign();
+ unsigned LastLegalType = 1;
+ unsigned LastLegalVectorType = 1;
+ bool LastIntegerTrunc = false;
+ bool NonZero = false;
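+ // Index of the first zero-valued element that follows a non-zero one. If
+ // a merge attempt fails below, candidates are only skipped up to this
+ // point, since a run starting at a zero may still merge into a cheap
+ // all-zero store.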
+ unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue StoredVal = ST->getValue();
+ bool IsElementZero = false;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
+ IsElementZero = C->isZero();
+ else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
+ IsElementZero = C->getConstantFPValue()->isNullValue();
+ if (IsElementZero) {
+ if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
+ FirstZeroAfterNonZero = i;
+ }
+ NonZero |= !IsElementZero;
+
+ // Find a legal type for the constant store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ unsigned IsFast = 0;
+
+ // Break early when size is too large to be legal.
+ if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
+
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
+ DAG.getMachineFunction()) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFast) &&
+ IsFast) {
+ LastIntegerTrunc = false;
+ LastLegalType = i + 1;
+ // Or check whether a truncstore is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValTy =
+ TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
+ if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy,
+ DAG.getMachineFunction()) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFast) &&
+ IsFast) {
+ LastIntegerTrunc = true;
+ LastLegalType = i + 1;
+ }
+ }
+
+ // We only use vectors if the target allows it and the function is not
+ // marked with the noimplicitfloat attribute.
+ if (TLI.storeOfVectorConstantIsCheap(!NonZero, MemVT, i + 1, FirstStoreAS) &&
+ AllowVectors) {
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+ if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty,
+ *FirstInChain->getMemOperand(), &IsFast) &&
+ IsFast)
+ LastLegalVectorType = i + 1;
+ }
+ }
+
+ bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors;
+ unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
+ bool UseTrunc = LastIntegerTrunc && !UseVector;
+
+ // Check if we found a legal integer type that creates a meaningful
+ // merge.
+ if (NumElem < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning, one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have, is if the alignment has
+ // improved or we've dropped a non-zero value. Drop as many
+ // candidates as we can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < NumConsecutiveStores) &&
+ (NumSkip < FirstZeroAfterNonZero) &&
+ (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
+ NumSkip++;
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
+
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
+ RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ continue;
+ }
+
+ MadeChange |= mergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+ /*IsConstantSrc*/ true,
+ UseVector, UseTrunc);
+
+ // Remove merged stores for next iteration.
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ }
+ return MadeChange;
+}
+
+bool DAGCombiner::tryStoreMergeOfExtracts(
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
+ EVT MemVT, SDNode *RootNode) {
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+ bool MadeChange = false;
+
+ // Loop over the consecutive stores, attempting to merge them.
+ while (NumConsecutiveStores >= 2) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ Align FirstStoreAlign = FirstInChain->getAlign();
+ unsigned NumStoresToMerge = 1;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+ unsigned IsFast = 0;
+
+ // Break early when size is too large to be legal.
+ if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
+
+ if (TLI.isTypeLegal(Ty) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty,
+ *FirstInChain->getMemOperand(), &IsFast) &&
+ IsFast)
+ NumStoresToMerge = i + 1;
+ }
+
+ // Check if we found a legal vector type that creates a meaningful
+ // merge.
+ if (NumStoresToMerge < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning, one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have, is if the alignment has
+ // improved. Drop as many candidates as we can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < NumConsecutiveStores) &&
+ (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
+ NumSkip++;
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
+
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge,
+ RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumStoresToMerge);
+ NumConsecutiveStores -= NumStoresToMerge;
+ continue;
+ }
+
+ MadeChange |= mergeStoresOfConstantsOrVecElts(
+ StoreNodes, MemVT, NumStoresToMerge, /*IsConstantSrc*/ false,
+ /*UseVector*/ true, /*UseTrunc*/ false);
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge);
+ NumConsecutiveStores -= NumStoresToMerge;
+ }
+ return MadeChange;
+}
+
+bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumConsecutiveStores, EVT MemVT,
+ SDNode *RootNode, bool AllowVectors,
+ bool IsNonTemporalStore,
+ bool IsNonTemporalLoad) {
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
+ int64_t ElementSizeBytes = MemVT.getStoreSize();
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+ bool MadeChange = false;
+
+ // Look for load nodes which are used by the stored values.
+ SmallVector<MemOpLink, 8> LoadNodes;
+
+ // Find acceptable loads. Loads need to have the same chain (token factor),
+ // must not be zext, volatile, indexed, and they must be consecutive.
+ BaseIndexOffset LdBasePtr;
+
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue Val = peekThroughBitcasts(St->getValue());
+ LoadSDNode *Ld = cast<LoadSDNode>(Val);
+
+ BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
+ // If this is not the first ptr that we check.
+ int64_t LdOffset = 0;
+ if (LdBasePtr.getBase().getNode()) {
+ // The base ptr must be the same.
+ if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
+ break;
+ } else {
+ // Check that all other base pointers are the same as this one.
+ LdBasePtr = LdPtr;
+ }
+
+ // We found a potential memory operand to merge.
+ LoadNodes.push_back(MemOpLink(Ld, LdOffset));
+ }
+
+ while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
+ Align RequiredAlignment;
+ bool NeedRotate = false;
+ if (LoadNodes.size() == 2) {
+ // If we have load/store pair instructions and we only have two values,
+ // don't bother merging.
+ if (TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
+ StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
+ break;
+ }
+ // If the loads are reversed, see if we can rotate the halves into place.
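+ // For example (illustrative, i32 elements):
+ //   st (ld p+4), q;  st (ld p), q+4
+ // can become
+ //   st (rotl (i64 ld p), 32), q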
+ int64_t Offset0 = LoadNodes[0].OffsetFromBase;
+ int64_t Offset1 = LoadNodes[1].OffsetFromBase;
+ EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2);
+ if (Offset0 - Offset1 == ElementSizeBytes &&
+ (hasOperation(ISD::ROTL, PairVT) ||
+ hasOperation(ISD::ROTR, PairVT))) {
+ std::swap(LoadNodes[0], LoadNodes[1]);
+ NeedRotate = true;
+ }
+ }
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ Align FirstStoreAlign = FirstInChain->getAlign();
+ LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+
+ // Scan the memory operations on the chain and find the first
+ // non-consecutive load memory address. These variables hold the index in
+ // the store node array.
+
+ unsigned LastConsecutiveLoad = 1;
+
+ // These variables refer to a size, not an index into the array.
+ unsigned LastLegalVectorType = 1;
+ unsigned LastLegalIntegerType = 1;
+ bool isDereferenceable = true;
+ bool DoIntegerTruncate = false;
+ int64_t StartAddress = LoadNodes[0].OffsetFromBase;
+ SDValue LoadChain = FirstLoad->getChain();
+ for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+ // All loads must share the same chain.
+ if (LoadNodes[i].MemNode->getChain() != LoadChain)
+ break;
+
+ int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ LastConsecutiveLoad = i;
+
+ if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
+ isDereferenceable = false;
+
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+
+ // Break early when size is too large to be legal.
+ if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
+
+ unsigned IsFastSt = 0;
+ unsigned IsFastLd = 0;
+ // Don't try vector types if we need a rotate. We may still fail the
+ // legality checks for the integer type, but we can't handle the rotate
+ // case with vectors.
+ // FIXME: We could use a shuffle in place of the rotate.
+ if (!NeedRotate && TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
+ DAG.getMachineFunction()) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstLoad->getMemOperand(), &IsFastLd) &&
+ IsFastLd) {
+ LastLegalVectorType = i + 1;
+ }
+
+ // Find a legal type for the integer store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
+ DAG.getMachineFunction()) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstLoad->getMemOperand(), &IsFastLd) &&
+ IsFastLd) {
+ LastLegalIntegerType = i + 1;
+ DoIntegerTruncate = false;
+ // Or check whether a truncstore and extload is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
+ if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy,
+ DAG.getMachineFunction()) &&
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstLoad->getMemOperand(), &IsFastLd) &&
+ IsFastLd) {
+ LastLegalIntegerType = i + 1;
+ DoIntegerTruncate = true;
+ }
+ }
+ }
+
+ // Only use vector types if the vector type is larger than the integer
+ // type. If they are the same, use integers.
+ bool UseVectorTy =
+ LastLegalVectorType > LastLegalIntegerType && AllowVectors;
+ unsigned LastLegalType =
+ std::max(LastLegalVectorType, LastLegalIntegerType);
+
+ // We add +1 here because the LastXXX variables refer to a location (index)
+ // while NumElem refers to an element count.
+ unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
+ NumElem = std::min(LastLegalType, NumElem);
+ Align FirstLoadAlign = FirstLoad->getAlign();
+
+ if (NumElem < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning, one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have, is if the alignment of either
+ // the load or store has improved. Drop as many candidates as we
+ // can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < LoadNodes.size()) &&
+ (LoadNodes[NumSkip].MemNode->getAlign() <= FirstLoadAlign) &&
+ (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
+ NumSkip++;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
+
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
+ RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ continue;
+ }
+
+ // Find if it is better to use vectors or integers to load and store
+ // to memory.
+ EVT JointMemOpVT;
+ if (UseVectorTy) {
+ // Find a legal type for the vector store.
+ unsigned Elts = NumElem * NumMemElts;
+ JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+ } else {
+ unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
+ JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
+ }
+
+ SDLoc LoadDL(LoadNodes[0].MemNode);
+ SDLoc StoreDL(StoreNodes[0].MemNode);
+
+ // The merged loads are required to have the same incoming chain, so
+ // using the first's chain is acceptable.
+
+ SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
+ bool CanReusePtrInfo = hasSameUnderlyingObj(StoreNodes);
+ AddToWorklist(NewStoreChain.getNode());
+
+ MachineMemOperand::Flags LdMMOFlags =
+ isDereferenceable ? MachineMemOperand::MODereferenceable
+ : MachineMemOperand::MONone;
+ if (IsNonTemporalLoad)
+ LdMMOFlags |= MachineMemOperand::MONonTemporal;
+
+ LdMMOFlags |= TLI.getTargetMMOFlags(*FirstLoad);
+
+ MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
+ ? MachineMemOperand::MONonTemporal
+ : MachineMemOperand::MONone;
+
+ StMMOFlags |= TLI.getTargetMMOFlags(*StoreNodes[0].MemNode);
+
+ SDValue NewLoad, NewStore;
+ if (UseVectorTy || !DoIntegerTruncate) {
+ NewLoad = DAG.getLoad(
+ JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags);
+ SDValue StoreOp = NewLoad;
+ if (NeedRotate) {
+ unsigned LoadWidth = ElementSizeBytes * 8 * 2;
+ assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) &&
+ "Unexpected type for rotate-able load pair");
+ SDValue RotAmt =
+ DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL);
+ // Target can convert to the identical ROTR if it does not have ROTL.
+ StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt);
+ }
+ NewStore = DAG.getStore(
+ NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(),
+ CanReusePtrInfo ? FirstInChain->getPointerInfo()
+ : MachinePointerInfo(FirstStoreAS),
+ FirstStoreAlign, StMMOFlags);
+ } else { // This must be the truncstore/extload case
+ EVT ExtendedTy =
+ TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
+ NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
+ FirstLoad->getChain(), FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), JointMemOpVT,
+ FirstLoadAlign, LdMMOFlags);
+ NewStore = DAG.getTruncStore(
+ NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
+ CanReusePtrInfo ? FirstInChain->getPointerInfo()
+ : MachinePointerInfo(FirstStoreAS),
+ JointMemOpVT, FirstInChain->getAlign(),
+ FirstInChain->getMemOperand()->getFlags());
+ }
+
+ // Transfer chain users from old loads to the new load.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+ SDValue(NewLoad.getNode(), 1));
+ }
+
+ // Replace all stores with the new store. Recursively remove corresponding
+ // values if they are no longer used.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ SDValue Val = StoreNodes[i].MemNode->getOperand(1);
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+ if (Val->use_empty())
+ recursivelyDeleteUnusedNodes(Val.getNode());
+ }
+
+ MadeChange = true;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ }
+ return MadeChange;
+}
+
+bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) {
+ if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
+ return false;
+
+ // TODO: Extend this function to merge stores of scalable vectors.
+ // (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8>
+ // store since we know <vscale x 16 x i8> is exactly twice as large as
+ // <vscale x 8 x i8>). Until then, bail out for scalable vectors.
+ EVT MemVT = St->getMemoryVT();
+ if (MemVT.isScalableVT())
+ return false;
+ if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
+ return false;
+
+ // This function cannot currently deal with non-byte-sized memory sizes.
+ int64_t ElementSizeBytes = MemVT.getStoreSize();
+ if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits())
+ return false;
+
+ // Do not bother looking at stored values that are not constants, loads, or
+ // extracted vector elements.
+ SDValue StoredVal = peekThroughBitcasts(St->getValue());
+ const StoreSource StoreSrc = getStoreSource(StoredVal);
+ if (StoreSrc == StoreSource::Unknown)
+ return false;
+
+ SmallVector<MemOpLink, 8> StoreNodes;
+ SDNode *RootNode;
+ // Find potential store merge candidates by searching through chain sub-DAG
+ getStoreMergeCandidates(St, StoreNodes, RootNode);
+
+ // Check if there is anything to merge.
+ if (StoreNodes.size() < 2)
+ return false;
+
+ // Sort the memory operands according to their distance from the
+ // base pointer.
+ llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) {
+ return LHS.OffsetFromBase < RHS.OffsetFromBase;
+ });
+
+ bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ bool IsNonTemporalStore = St->isNonTemporal();
+ bool IsNonTemporalLoad = StoreSrc == StoreSource::Load &&
+ cast<LoadSDNode>(StoredVal)->isNonTemporal();
+
+ // Store merging attempts to merge the lowest stores first. This generally
+ // works out well: if the merge succeeds, the remaining stores are checked
+ // after the first collection of stores is merged. However, in the
+ // case that a non-mergeable store is found first, e.g., {p[-2],
+ // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
+ // mergeable cases. To prevent this, we prune such stores from the
+ // front of StoreNodes here.
+ bool MadeChange = false;
+ while (StoreNodes.size() > 1) {
+ unsigned NumConsecutiveStores =
+ getConsecutiveStores(StoreNodes, ElementSizeBytes);
+ // There are no more stores in the list to examine.
+ if (NumConsecutiveStores == 0)
+ return MadeChange;
+
+ // We have at least 2 consecutive stores. Try to merge them.
+ assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores");
+ switch (StoreSrc) {
+ case StoreSource::Constant:
+ MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores,
+ MemVT, RootNode, AllowVectors);
+ break;
+
+ case StoreSource::Extract:
+ MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores,
+ MemVT, RootNode);
+ break;
+
+ case StoreSource::Load:
+ MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores,
+ MemVT, RootNode, AllowVectors,
+ IsNonTemporalStore, IsNonTemporalLoad);
+ break;
+
+ default:
+ llvm_unreachable("Unhandled store source type");
+ }
+ }
+ return MadeChange;
+}
+
+SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
+ SDLoc SL(ST);
+ SDValue ReplStore;
+
+ // Replace the chain to avoid dependency.
+ if (ST->isTruncatingStore()) {
+ ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(),
+ ST->getBasePtr(), ST->getMemoryVT(),
+ ST->getMemOperand());
+ } else {
+ ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(),
+ ST->getMemOperand());
+ }
+
+ // Create token to keep both nodes around.
+ SDValue Token = DAG.getNode(ISD::TokenFactor, SL,
+ MVT::Other, ST->getChain(), ReplStore);
+
+ // Make sure the new and old chains are cleaned up.
+ AddToWorklist(Token.getNode());
+
+ // Don't add users to work list.
+ return CombineTo(ST, Token, false);
+}
+
+SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
+ SDValue Value = ST->getValue();
+ if (Value.getOpcode() == ISD::TargetConstantFP)
+ return SDValue();
+
+ if (!ISD::isNormalStore(ST))
+ return SDValue();
+
+ SDLoc DL(ST);
+
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+
+ const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value);
+
+ // NOTE: If the original store is volatile, this transform must not increase
+ // the number of stores. For example, on x86-32 an f64 can be stored in one
+ // processor operation but an i64 (which is not legal) requires two. So the
+ // transform should not be done in this case.
+
+ SDValue Tmp;
+ switch (CFP->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unknown FP type");
+ case MVT::f16: // We don't do this for these yet.
+ case MVT::bf16:
+ case MVT::f80:
+ case MVT::f128:
+ case MVT::ppcf128:
+ return SDValue();
+ case MVT::f32:
+ if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+ Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
+ bitcastToAPInt().getZExtValue(), SDLoc(CFP),
+ MVT::i32);
+ return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand());
+ }
+
+ return SDValue();
+ case MVT::f64:
+ if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
+ ST->isSimple()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
+ Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+ getZExtValue(), SDLoc(CFP), MVT::i64);
+ return DAG.getStore(Chain, DL, Tmp,
+ Ptr, ST->getMemOperand());
+ }
+
+ if (ST->isSimple() &&
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+ // Many FP stores are not made apparent until after legalize, e.g. for
+ // argument passing. Since this is so common, custom legalize the
+ // 64-bit integer store into two 32-bit stores.
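+ // For example (illustrative): storing f64 1.0 (bit pattern
+ // 0x3FF0000000000000) becomes "store i32 0, Ptr" followed by
+ // "store i32 0x3FF00000, Ptr+4" on a little-endian target.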
+ uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+ SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
+ SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+
+ SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(4), DL);
+ SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(4),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ St0, St1);
+ }
+
+ return SDValue();
+ }
+}
+
+// (store (insert_vector_elt (load p), x, i), p) -> (store x, p+offset)
+//
+// If a store of a load with an element inserted into it has no other
+// uses in between the chain, then we can consider the vector store
+// dead and replace it with just the single scalar element store.
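+// For example (illustrative): storing (insert_vector_elt (load <4 x i32> p),
+// x, 2) back to p can be replaced by a single "store i32 x, p+8".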
+SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
+ SDLoc DL(ST);
+ SDValue Value = ST->getValue();
+ SDValue Ptr = ST->getBasePtr();
+ SDValue Chain = ST->getChain();
+ if (Value.getOpcode() != ISD::INSERT_VECTOR_ELT || !Value.hasOneUse())
+ return SDValue();
+
+ SDValue Elt = Value.getOperand(1);
+ SDValue Idx = Value.getOperand(2);
+
+ // If the element isn't byte sized or is implicitly truncated then we can't
+ // compute an offset.
+ EVT EltVT = Elt.getValueType();
+ if (!EltVT.isByteSized() ||
+ EltVT != Value.getOperand(0).getValueType().getVectorElementType())
+ return SDValue();
+
+ auto *Ld = dyn_cast<LoadSDNode>(Value.getOperand(0));
+ if (!Ld || Ld->getBasePtr() != Ptr ||
+ ST->getMemoryVT() != Ld->getMemoryVT() || !ST->isSimple() ||
+ !ISD::isNormalStore(ST) ||
+ Ld->getAddressSpace() != ST->getAddressSpace() ||
+ !Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1)))
+ return SDValue();
+
+ unsigned IsFast;
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ Elt.getValueType(), ST->getAddressSpace(),
+ ST->getAlign(), ST->getMemOperand()->getFlags(),
+ &IsFast) ||
+ !IsFast)
+ return SDValue();
+ EVT PtrVT = Ptr.getValueType();
+
+ SDValue Offset =
+ DAG.getNode(ISD::MUL, DL, PtrVT, Idx,
+ DAG.getConstant(EltVT.getSizeInBits() / 8, DL, PtrVT));
+ SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, Offset);
+ MachinePointerInfo PointerInfo(ST->getAddressSpace());
+
+ // If the offset is a known constant then try to recover the pointer
+ // info
+ if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ unsigned COffset = CIdx->getSExtValue() * EltVT.getSizeInBits() / 8;
+ NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(COffset), DL);
+ PointerInfo = ST->getPointerInfo().getWithOffset(COffset);
+ }
+
+ return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),
+ ST->getMemOperand()->getFlags());
+}
+
+SDValue DAGCombiner::visitSTORE(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Chain = ST->getChain();
+ SDValue Value = ST->getValue();
+ SDValue Ptr = ST->getBasePtr();
+
+ // If this is a store of a bit convert, store the input value if the
+ // resultant store does not need a higher alignment than the original.
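+ // For example (illustrative): (store (i32 (bitcast f32 X)), Ptr) can become
+ // (store f32 X, Ptr) when the f32 store is legal and deemed beneficial.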
+ if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
+ ST->isUnindexed()) {
+ EVT SVT = Value.getOperand(0).getValueType();
+ // If the store is volatile, we only want to change the store type if the
+ // resulting store is legal. Otherwise we might increase the number of
+ // memory accesses. We don't care if the original type was legal or not
+ // as we assume software couldn't rely on the number of accesses of an
+ // illegal type.
+ // TODO: May be able to relax for unordered atomics (see D66309)
+ if (((!LegalOperations && ST->isSimple()) ||
+ TLI.isOperationLegal(ISD::STORE, SVT)) &&
+ TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
+ DAG, *ST->getMemOperand())) {
+ return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+ ST->getMemOperand());
+ }
+ }
+
+ // Turn 'store undef, Ptr' -> nothing.
+ if (Value.isUndef() && ST->isUnindexed())
+ return Chain;
+
+ // Try to infer better alignment information than the store already has.
+ if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) {
+ if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
+ if (*Alignment > ST->getAlign() &&
+ isAligned(*Alignment, ST->getSrcValueOffset())) {
+ SDValue NewStore =
+ DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
+ ST->getMemoryVT(), *Alignment,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ // NewStore will always be N as we are only refining the alignment
+ assert(NewStore.getNode() == N);
+ (void)NewStore;
+ }
+ }
+ }
+
+ // Try transforming a pair floating point load / store ops to integer
+ // load / store ops.
+ if (SDValue NewST = TransformFPLoadStorePair(N))
+ return NewST;
+
+ // Try transforming several stores into STORE (BSWAP).
+ if (SDValue Store = mergeTruncStores(ST))
+ return Store;
+
+ if (ST->isUnindexed()) {
+ // Walk up chain skipping non-aliasing memory nodes, on this store and any
+ // adjacent stores.
+ if (findBetterNeighborChains(ST)) {
+ // replaceStoreChain uses CombineTo, which handled all of the worklist
+ // manipulation. Return the original node to not do anything else.
+ return SDValue(ST, 0);
+ }
+ Chain = ST->getChain();
+ }
+
+ // FIXME: is there such a thing as a truncating indexed store?
+ if (ST->isTruncatingStore() && ST->isUnindexed() &&
+ Value.getValueType().isInteger() &&
+ (!isa<ConstantSDNode>(Value) ||
+ !cast<ConstantSDNode>(Value)->isOpaque())) {
+ // Convert a truncating store of an extension into a standard store.
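+ // For example (illustrative): (truncstore (zext i16 X to i32), Ptr, i16)
+ // becomes (store i16 X, Ptr) when an i16 store is legal or custom.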
+ if ((Value.getOpcode() == ISD::ZERO_EXTEND ||
+ Value.getOpcode() == ISD::SIGN_EXTEND ||
+ Value.getOpcode() == ISD::ANY_EXTEND) &&
+ Value.getOperand(0).getValueType() == ST->getMemoryVT() &&
+ TLI.isOperationLegalOrCustom(ISD::STORE, ST->getMemoryVT()))
+ return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+ ST->getMemOperand());
+
+ APInt TruncDemandedBits =
+ APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
+ ST->getMemoryVT().getScalarSizeInBits());
+
+ // See if we can simplify the operation with SimplifyDemandedBits, which
+ // only works if the value has a single use.
+ AddToWorklist(Value.getNode());
+ if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
+ // Re-visit the store if anything changed and the store hasn't been merged
+ // with another node (N is deleted). SimplifyDemandedBits will add Value's
+ // node back to the worklist if necessary, but we also need to re-visit
+ // the Store node itself.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ // Otherwise, see if we can simplify the input to this truncstore with
+ // knowledge that only the low bits are being used. For example:
+ // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
+ if (SDValue Shorter =
+ TLI.SimplifyMultipleUseDemandedBits(Value, TruncDemandedBits, DAG))
+ return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
+ ST->getMemOperand());
+
+ // If we're storing a truncated constant, see if we can simplify it.
+ // TODO: Move this to targetShrinkDemandedConstant?
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Value))
+ if (!Cst->isOpaque()) {
+ const APInt &CValue = Cst->getAPIntValue();
+ APInt NewVal = CValue & TruncDemandedBits;
+ if (NewVal != CValue) {
+ SDValue Shorter =
+ DAG.getConstant(NewVal, SDLoc(N), Value.getValueType());
+ return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr,
+ ST->getMemoryVT(), ST->getMemOperand());
+ }
+ }
+ }
+
+ // If this is a load followed by a store to the same location, then the store
+ // is dead/noop. Peek through any truncates if canCombineTruncStore failed.
+ // TODO: Add big-endian truncate support with test coverage.
+ // TODO: Can relax for unordered atomics (see D66309)
+ SDValue TruncVal = DAG.getDataLayout().isLittleEndian()
+ ? peekThroughTruncates(Value)
+ : Value;
+ if (auto *Ld = dyn_cast<LoadSDNode>(TruncVal)) {
+ if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
+ ST->isUnindexed() && ST->isSimple() &&
+ Ld->getAddressSpace() == ST->getAddressSpace() &&
+ // There can't be any side effects between the load and store, such as
+ // a call or store.
+ Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
+ // The store is dead, remove it.
+ return Chain;
+ }
+ }
+
+ // Try scalarizing vector stores of loads where we only change one element
+ if (SDValue NewST = replaceStoreOfInsertLoad(ST))
+ return NewST;
+
+ // TODO: Can relax for unordered atomics (see D66309)
+ if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
+ if (ST->isUnindexed() && ST->isSimple() &&
+ ST1->isUnindexed() && ST1->isSimple()) {
+ if (OptLevel != CodeGenOpt::None && ST1->getBasePtr() == Ptr &&
+ ST1->getValue() == Value && ST->getMemoryVT() == ST1->getMemoryVT() &&
+ ST->getAddressSpace() == ST1->getAddressSpace()) {
+ // If this is a store followed by a store with the same value to the
+ // same location, then the store is dead/noop.
+ return Chain;
+ }
+
+ if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
+ !ST1->getBasePtr().isUndef() &&
+ ST->getAddressSpace() == ST1->getAddressSpace()) {
+ // If the smaller of the two stores has a scalable vector type and the
+ // bigger one a fixed type, we cannot allow removal of the scalable
+ // store, because we do not know its final size at compile time.
+ if (ST->getMemoryVT().isScalableVector() ||
+ ST1->getMemoryVT().isScalableVector()) {
+ if (ST1->getBasePtr() == Ptr &&
+ TypeSize::isKnownLE(ST1->getMemoryVT().getStoreSize(),
+ ST->getMemoryVT().getStoreSize())) {
+ CombineTo(ST1, ST1->getChain());
+ return SDValue();
+ }
+ } else {
+ const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
+ const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
+ // If the preceding store writes to a subset of the current store's
+ // location and no other node is chained to that store, we can
+ // effectively drop that preceding store. Do not remove stores to undef
+ // as they may be used as data sinks.
+ if (STBase.contains(DAG, ST->getMemoryVT().getFixedSizeInBits(),
+ ChainBase,
+ ST1->getMemoryVT().getFixedSizeInBits())) {
+ CombineTo(ST1, ST1->getChain());
+ return SDValue();
+ }
+ }
+ }
+ }
+ }
+
+ // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
+ // truncating store. We can do this even if this is already a truncstore.
+ if ((Value.getOpcode() == ISD::FP_ROUND ||
+ Value.getOpcode() == ISD::TRUNCATE) &&
+ Value->hasOneUse() && ST->isUnindexed() &&
+ TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
+ ST->getMemoryVT(), LegalOperations)) {
+ return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
+ Ptr, ST->getMemoryVT(), ST->getMemOperand());
+ }
+
+ // Always perform this optimization before types are legal. If the target
+ // prefers, also try this after legalization to catch stores that were created
+ // by intrinsics or other nodes.
+ if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
+ while (true) {
+ // There can be multiple store sequences on the same chain.
+ // Keep trying to merge store sequences until we are unable to do so
+ // or until we merge the last store on the chain.
+ bool Changed = mergeConsecutiveStores(ST);
+ if (!Changed) break;
+ // Return N as merge only uses CombineTo and no worklist clean
+ // up is necessary.
+ if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
+ return SDValue(N, 0);
+ }
+ }
+
+ // Try transforming N to an indexed store.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
+ // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
+ //
+ // Make sure to do this only after attempting to merge stores in order to
+ // avoid changing the types of some subset of stores due to visit order,
+ // preventing their merging.
+ if (isa<ConstantFPSDNode>(ST->getValue())) {
+ if (SDValue NewSt = replaceStoreOfFPConstant(ST))
+ return NewSt;
+ }
+
+ if (SDValue NewSt = splitMergedValStore(ST))
+ return NewSt;
+
+ return ReduceLoadOpStoreWidth(N);
+}
+
+SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
+ const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
+ if (!LifetimeEnd->hasOffset())
+ return SDValue();
+
+ const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
+ LifetimeEnd->getOffset(), false);
+
+ // We walk up the chains to find stores.
+ SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
+ while (!Chains.empty()) {
+ SDValue Chain = Chains.pop_back_val();
+ if (!Chain.hasOneUse())
+ continue;
+ switch (Chain.getOpcode()) {
+ case ISD::TokenFactor:
+ for (unsigned Nops = Chain.getNumOperands(); Nops;)
+ Chains.push_back(Chain.getOperand(--Nops));
+ break;
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
+ // We can forward past any lifetime start/end that can be proven not to
+ // alias the node.
+ if (!mayAlias(Chain.getNode(), N))
+ Chains.push_back(Chain.getOperand(0));
+ break;
+ case ISD::STORE: {
+ StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
+ // TODO: Can relax for unordered atomics (see D66309)
+ if (!ST->isSimple() || ST->isIndexed())
+ continue;
+ const TypeSize StoreSize = ST->getMemoryVT().getStoreSize();
+ // The bounds of a scalable store are not known until runtime, so this
+ // store cannot be elided.
+ if (StoreSize.isScalable())
+ continue;
+ const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
+ // If we store purely within object bounds just before its lifetime ends,
+ // we can remove the store.
+ if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
+ StoreSize.getFixedValue() * 8)) {
+ LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
+ dbgs() << "\nwithin LIFETIME_END of : ";
+ LifetimeEndBase.dump(); dbgs() << "\n");
+ CombineTo(ST, ST->getChain());
+ return SDValue(N, 0);
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
+/// For the store instruction sequence below, the F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficient to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+/// (store (or (zext (bitcast F to i32) to i64),
+/// (shl (zext I to i64), 32)), addr) -->
+/// (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting other merged stores can also be beneficial, like:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8}, i32 store --> two i16 stores.
+/// For pair of {i8, i8}, i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// These store patterns are commonly seen from the simple code snippet below
+/// when only std::make_pair(...) is SROA-transformed before being inlined into
+/// hoo.
+/// void goo(const std::pair<int, float> &);
+/// hoo() {
+/// ...
+/// goo(std::make_pair(tmp, ftmp));
+/// ...
+/// }
+///
+SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
+ if (OptLevel == CodeGenOpt::None)
+ return SDValue();
+
+ // Can't change the number of memory accesses for a volatile store or break
+ // atomicity for an atomic one.
+ if (!ST->isSimple())
+ return SDValue();
+
+ SDValue Val = ST->getValue();
+ SDLoc DL(ST);
+
+ // Match OR operand.
+ if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
+ return SDValue();
+
+ // Match SHL operand and get Lower and Higher parts of Val.
+ SDValue Op1 = Val.getOperand(0);
+ SDValue Op2 = Val.getOperand(1);
+ SDValue Lo, Hi;
+ if (Op1.getOpcode() != ISD::SHL) {
+ std::swap(Op1, Op2);
+ if (Op1.getOpcode() != ISD::SHL)
+ return SDValue();
+ }
+ Lo = Op2;
+ Hi = Op1.getOperand(0);
+ if (!Op1.hasOneUse())
+ return SDValue();
+
+ // Match shift amount to HalfValBitSize.
+ unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
+ ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
+ if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
+ return SDValue();
+
+ // Lo and Hi are zero-extended from an integer whose size is at most
+ // HalfValBitSize (e.g. from i32 or smaller to i64).
+ if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
+ !Lo.getOperand(0).getValueType().isScalarInteger() ||
+ Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
+ Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
+ !Hi.getOperand(0).getValueType().isScalarInteger() ||
+ Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
+ return SDValue();
+
+ // Use the EVTs of the low and high parts before the bitcast as the
+ // inputs to the target query.
+ EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
+ ? Lo.getOperand(0).getValueType()
+ : Lo.getValueType();
+ EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
+ ? Hi.getOperand(0).getValueType()
+ : Hi.getValueType();
+ if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
+ return SDValue();
+
+ // Start to split store.
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+
+ // Change the sizes of Lo and Hi's value types to HalfValBitSize.
+ EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
+
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ // Lower value store.
+ SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(HalfValBitSize / 8), DL);
+ // Higher value store.
+ SDValue St1 = DAG.getStore(
+ St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ return St1;
+}
+
+// Merge an insertion into an existing shuffle:
+// (insert_vector_elt (vector_shuffle X, Y, Mask),
+//   (extract_vector_elt X, N), InsIndex)
+// --> (vector_shuffle X, Y, NewMask)
+// and variations where shuffle operands may be CONCAT_VECTORS.
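+// For example (illustrative): with Mask = <0, 4, 2, 6>, inserting
+// (extract_vector_elt X, 3) at index 1 yields NewMask = <0, 3, 2, 6>.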
+static bool mergeEltWithShuffle(SDValue &X, SDValue &Y, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &NewMask, SDValue Elt,
+ unsigned InsIndex) {
+ if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Elt.getOperand(1)))
+ return false;
+
+ // Vec's operand 0 is using indices from 0 to N-1 and
+ // operand 1 from N to 2N - 1, where N is the number of
+ // elements in the vectors.
+ SDValue InsertVal0 = Elt.getOperand(0);
+ int ElementOffset = -1;
+
+ // We explore the inputs of the shuffle in order to see if we find the
+ // source of the extract_vector_elt. If so, we can use it to modify the
+ // shuffle rather than perform an insert_vector_elt.
+ SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
+ ArgWorkList.emplace_back(Mask.size(), Y);
+ ArgWorkList.emplace_back(0, X);
+
+ while (!ArgWorkList.empty()) {
+ int ArgOffset;
+ SDValue ArgVal;
+ std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();
+
+ if (ArgVal == InsertVal0) {
+ ElementOffset = ArgOffset;
+ break;
+ }
+
+ // Peek through concat_vector.
+ if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
+ int CurrentArgOffset =
+ ArgOffset + ArgVal.getValueType().getVectorNumElements();
+ int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
+ for (SDValue Op : reverse(ArgVal->ops())) {
+ CurrentArgOffset -= Step;
+ ArgWorkList.emplace_back(CurrentArgOffset, Op);
+ }
+
+ // Make sure we went through all the elements and did not screw up index
+ // computation.
+ assert(CurrentArgOffset == ArgOffset);
+ }
+ }
+
+ // If we failed to find a match, see if we can replace an UNDEF shuffle
+ // operand.
+ if (ElementOffset == -1) {
+ if (!Y.isUndef() || InsertVal0.getValueType() != Y.getValueType())
+ return false;
+ ElementOffset = Mask.size();
+ Y = InsertVal0;
+ }
+
+ NewMask.assign(Mask.begin(), Mask.end());
+ NewMask[InsIndex] = ElementOffset + Elt.getConstantOperandVal(1);
+ assert(NewMask[InsIndex] < (int)(2 * Mask.size()) && NewMask[InsIndex] >= 0 &&
+ "NewMask[InsIndex] is out of bound");
+ return true;
+}
+
+// Merge an insertion into an existing shuffle:
+// (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N),
+// InsIndex)
+// --> (vector_shuffle X, Y) and variations where shuffle operands may be
+// CONCAT_VECTORS.
+SDValue DAGCombiner::mergeInsertEltWithShuffle(SDNode *N, unsigned InsIndex) {
+ assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ "Expected insert_vector_elt");
+ SDValue InsertVal = N->getOperand(1);
+ SDValue Vec = N->getOperand(0);
+
+ auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec);
+ if (!SVN || !Vec.hasOneUse())
+ return SDValue();
+
+ ArrayRef<int> Mask = SVN->getMask();
+ SDValue X = Vec.getOperand(0);
+ SDValue Y = Vec.getOperand(1);
+
+ SmallVector<int, 16> NewMask(Mask);
+ if (mergeEltWithShuffle(X, Y, Mask, NewMask, InsertVal, InsIndex)) {
+ SDValue LegalShuffle = TLI.buildLegalVectorShuffle(
+ Vec.getValueType(), SDLoc(N), X, Y, NewMask, DAG);
+ if (LegalShuffle)
+ return LegalShuffle;
+ }
+
+ return SDValue();
+}
+
+// Convert a disguised subvector insertion into a shuffle:
+// insert_vector_elt V, (bitcast X from vector type), IdxC -->
+// bitcast(shuffle (bitcast V), (extended X), Mask)
+// Note: We do not use an insert_subvector node because that requires a
+// legal subvector type.
+SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
+ assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ "Expected insert_vector_elt");
+ SDValue InsertVal = N->getOperand(1);
+
+ if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
+ !InsertVal.getOperand(0).getValueType().isVector())
+ return SDValue();
+
+ SDValue SubVec = InsertVal.getOperand(0);
+ SDValue DestVec = N->getOperand(0);
+ EVT SubVecVT = SubVec.getValueType();
+ EVT VT = DestVec.getValueType();
+ unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+ // If the source only has a single vector element, the cost of creating and
+ // adding it to a vector is likely to exceed the cost of an insert_vector_elt.
+ if (NumSrcElts == 1)
+ return SDValue();
+ unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
+ unsigned NumMaskVals = ExtendRatio * NumSrcElts;
+
+ // Step 1: Create a shuffle mask that implements this insert operation. The
+ // vector that we are inserting into will be operand 0 of the shuffle, so
+ // those elements are just 'i'. The inserted subvector is in the first
+ // positions of operand 1 of the shuffle. Example:
+ // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
+ SmallVector<int, 16> Mask(NumMaskVals);
+ for (unsigned i = 0; i != NumMaskVals; ++i) {
+ if (i / NumSrcElts == InsIndex)
+ Mask[i] = (i % NumSrcElts) + NumMaskVals;
+ else
+ Mask[i] = i;
+ }
+
+ // Bail out if the target can not handle the shuffle we want to create.
+ EVT SubVecEltVT = SubVecVT.getVectorElementType();
+ EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
+ if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
+ return SDValue();
+
+ // Step 2: Create a wide vector from the inserted source vector by appending
+ // undefined elements. This is the same size as our destination vector.
+ SDLoc DL(N);
+ SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
+ ConcatOps[0] = SubVec;
+ SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
+
+ // Step 3: Shuffle in the padded subvector.
+ SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
+ SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
+ AddToWorklist(PaddedSubV.getNode());
+ AddToWorklist(DestVecBC.getNode());
+ AddToWorklist(Shuf.getNode());
+ return DAG.getBitcast(VT, Shuf);
+}
+
+// Combine insert(shuffle(load, <u,0,1,2>), load, 0) into a single load if
+// possible and the new load will be fast. We use more loads but fewer shuffles
+// and inserts.
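+// For example (an illustrative sketch; p is a hypothetical base pointer and
+// the types are arbitrary):
+//   insert_vector_elt (vector_shuffle<u,0,1,2> (v4i32 load [p]), undef),
+//                     (i32 load [p - 4]), 0
+//   --> v4i32 load [p - 4]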
+SDValue DAGCombiner::combineInsertEltToLoad(SDNode *N, unsigned InsIndex) {
+ EVT VT = N->getValueType(0);
+
+  // InsIndex is expected to be the first or last lane.
+ if (!VT.isFixedLengthVector() ||
+ (InsIndex != 0 && InsIndex != VT.getVectorNumElements() - 1))
+ return SDValue();
+
+ // Look for a shuffle with the mask u,0,1,2,3,4,5,6 or 1,2,3,4,5,6,7,u
+ // depending on the InsIndex.
+ auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
+ SDValue Scalar = N->getOperand(1);
+ if (!Shuffle || !all_of(enumerate(Shuffle->getMask()), [&](auto P) {
+ return InsIndex == P.index() || P.value() < 0 ||
+ (InsIndex == 0 && P.value() == (int)P.index() - 1) ||
+ (InsIndex == VT.getVectorNumElements() - 1 &&
+ P.value() == (int)P.index() + 1);
+ }))
+ return SDValue();
+
+ // We optionally skip over an extend so long as both loads are extended in the
+ // same way from the same type.
+ unsigned Extend = 0;
+ if (Scalar.getOpcode() == ISD::ZERO_EXTEND ||
+ Scalar.getOpcode() == ISD::SIGN_EXTEND ||
+ Scalar.getOpcode() == ISD::ANY_EXTEND) {
+ Extend = Scalar.getOpcode();
+ Scalar = Scalar.getOperand(0);
+ }
+
+ auto *ScalarLoad = dyn_cast<LoadSDNode>(Scalar);
+ if (!ScalarLoad)
+ return SDValue();
+
+ SDValue Vec = Shuffle->getOperand(0);
+ if (Extend) {
+ if (Vec.getOpcode() != Extend)
+ return SDValue();
+ Vec = Vec.getOperand(0);
+ }
+ auto *VecLoad = dyn_cast<LoadSDNode>(Vec);
+ if (!VecLoad || Vec.getValueType().getScalarType() != Scalar.getValueType())
+ return SDValue();
+
+ int EltSize = ScalarLoad->getValueType(0).getScalarSizeInBits();
+ if (EltSize == 0 || EltSize % 8 != 0 || !ScalarLoad->isSimple() ||
+ !VecLoad->isSimple() || VecLoad->getExtensionType() != ISD::NON_EXTLOAD ||
+ ScalarLoad->getExtensionType() != ISD::NON_EXTLOAD ||
+ ScalarLoad->getAddressSpace() != VecLoad->getAddressSpace())
+ return SDValue();
+
+  // Check that the offset between the pointers is such that the two loads
+  // form a single contiguous load.
+ if (InsIndex == 0) {
+ if (!DAG.areNonVolatileConsecutiveLoads(ScalarLoad, VecLoad, EltSize / 8,
+ -1))
+ return SDValue();
+ } else {
+ if (!DAG.areNonVolatileConsecutiveLoads(
+ VecLoad, ScalarLoad, VT.getVectorNumElements() * EltSize / 8, -1))
+ return SDValue();
+ }
+
+ // And that the new unaligned load will be fast.
+ unsigned IsFast = 0;
+ Align NewAlign = commonAlignment(VecLoad->getAlign(), EltSize / 8);
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ Vec.getValueType(), VecLoad->getAddressSpace(),
+ NewAlign, VecLoad->getMemOperand()->getFlags(),
+ &IsFast) ||
+ !IsFast)
+ return SDValue();
+
+ // Calculate the new Ptr and create the new load.
+ SDLoc DL(N);
+ SDValue Ptr = ScalarLoad->getBasePtr();
+ if (InsIndex != 0)
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), VecLoad->getBasePtr(),
+ DAG.getConstant(EltSize / 8, DL, Ptr.getValueType()));
+ MachinePointerInfo PtrInfo =
+ InsIndex == 0 ? ScalarLoad->getPointerInfo()
+ : VecLoad->getPointerInfo().getWithOffset(EltSize / 8);
+
+ SDValue Load = DAG.getLoad(VecLoad->getValueType(0), DL,
+ ScalarLoad->getChain(), Ptr, PtrInfo, NewAlign);
+ DAG.makeEquivalentMemoryOrdering(ScalarLoad, Load.getValue(1));
+ DAG.makeEquivalentMemoryOrdering(VecLoad, Load.getValue(1));
+ return Extend ? DAG.getNode(Extend, DL, VT, Load) : Load;
+}
+
+SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
+ SDValue InVec = N->getOperand(0);
+ SDValue InVal = N->getOperand(1);
+ SDValue EltNo = N->getOperand(2);
+ SDLoc DL(N);
+
+ EVT VT = InVec.getValueType();
+ auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
+
+  // Inserting into an out-of-bounds element is undefined.
+ if (IndexC && VT.isFixedLengthVector() &&
+ IndexC->getZExtValue() >= VT.getVectorNumElements())
+ return DAG.getUNDEF(VT);
+
+ // Remove redundant insertions:
+ // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
+ if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
+ return InVec;
+
+ if (!IndexC) {
+    // If this is a variable insert into an undef vector, it might be better
+    // to splat:
+ // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
+ if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
+ return DAG.getSplat(VT, DL, InVal);
+ return SDValue();
+ }
+
+ if (VT.isScalableVector())
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // We must know which element is being inserted for folds below here.
+ unsigned Elt = IndexC->getZExtValue();
+
+ // Handle <1 x ???> vector insertion special cases.
+ if (NumElts == 1) {
+ // insert_vector_elt(x, extract_vector_elt(y, 0), 0) -> y
+ if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ InVal.getOperand(0).getValueType() == VT &&
+ isNullConstant(InVal.getOperand(1)))
+ return InVal.getOperand(0);
+ }
+
+ // Canonicalize insert_vector_elt dag nodes.
+ // Example:
+ // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
+ // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
+ //
+ // Do this only if the child insert_vector node has one use; also
+ // do this only if indices are both constants and Idx1 < Idx0.
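+  // E.g. (illustrative):
+  //   (insert_vector_elt (insert_vector_elt A, x, 3), y, 1)
+  //   -> (insert_vector_elt (insert_vector_elt A, y, 1), x, 3)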
+ if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
+ && isa<ConstantSDNode>(InVec.getOperand(2))) {
+ unsigned OtherElt = InVec.getConstantOperandVal(2);
+ if (Elt < OtherElt) {
+ // Swap nodes.
+ SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
+ InVec.getOperand(0), InVal, EltNo);
+ AddToWorklist(NewOp.getNode());
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
+ VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
+ }
+ }
+
+ if (SDValue Shuf = mergeInsertEltWithShuffle(N, Elt))
+ return Shuf;
+
+ if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
+ return Shuf;
+
+ if (SDValue Shuf = combineInsertEltToLoad(N, Elt))
+ return Shuf;
+
+ // Attempt to convert an insert_vector_elt chain into a legal build_vector.
+ if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
+    // Single-element vector - we don't need to recurse.
+ if (NumElts == 1)
+ return DAG.getBuildVector(VT, DL, {InVal});
+
+ // If we haven't already collected the element, insert into the op list.
+ EVT MaxEltVT = InVal.getValueType();
+ auto AddBuildVectorOp = [&](SmallVectorImpl<SDValue> &Ops, SDValue Elt,
+ unsigned Idx) {
+ if (!Ops[Idx]) {
+ Ops[Idx] = Elt;
+ if (VT.isInteger()) {
+ EVT EltVT = Elt.getValueType();
+ MaxEltVT = MaxEltVT.bitsGE(EltVT) ? MaxEltVT : EltVT;
+ }
+ }
+ };
+
+ // Ensure all the operands are the same value type, fill any missing
+ // operands with UNDEF and create the BUILD_VECTOR.
+ auto CanonicalizeBuildVector = [&](SmallVectorImpl<SDValue> &Ops) {
+ assert(Ops.size() == NumElts && "Unexpected vector size");
+ for (SDValue &Op : Ops) {
+ if (Op)
+ Op = VT.isInteger() ? DAG.getAnyExtOrTrunc(Op, DL, MaxEltVT) : Op;
+ else
+ Op = DAG.getUNDEF(MaxEltVT);
+ }
+ return DAG.getBuildVector(VT, DL, Ops);
+ };
+
+ SmallVector<SDValue, 8> Ops(NumElts, SDValue());
+ Ops[Elt] = InVal;
+
+    // Recurse up an INSERT_VECTOR_ELT chain to build a BUILD_VECTOR.
+ for (SDValue CurVec = InVec; CurVec;) {
+ // UNDEF - build new BUILD_VECTOR from already inserted operands.
+ if (CurVec.isUndef())
+ return CanonicalizeBuildVector(Ops);
+
+ // BUILD_VECTOR - insert unused operands and build new BUILD_VECTOR.
+ if (CurVec.getOpcode() == ISD::BUILD_VECTOR && CurVec.hasOneUse()) {
+ for (unsigned I = 0; I != NumElts; ++I)
+ AddBuildVectorOp(Ops, CurVec.getOperand(I), I);
+ return CanonicalizeBuildVector(Ops);
+ }
+
+ // SCALAR_TO_VECTOR - insert unused scalar and build new BUILD_VECTOR.
+ if (CurVec.getOpcode() == ISD::SCALAR_TO_VECTOR && CurVec.hasOneUse()) {
+ AddBuildVectorOp(Ops, CurVec.getOperand(0), 0);
+ return CanonicalizeBuildVector(Ops);
+ }
+
+ // INSERT_VECTOR_ELT - insert operand and continue up the chain.
+ if (CurVec.getOpcode() == ISD::INSERT_VECTOR_ELT && CurVec.hasOneUse())
+ if (auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2)))
+ if (CurIdx->getAPIntValue().ult(NumElts)) {
+ unsigned Idx = CurIdx->getZExtValue();
+ AddBuildVectorOp(Ops, CurVec.getOperand(1), Idx);
+
+ // Found entire BUILD_VECTOR.
+ if (all_of(Ops, [](SDValue Op) { return !!Op; }))
+ return CanonicalizeBuildVector(Ops);
+
+ CurVec = CurVec->getOperand(0);
+ continue;
+ }
+
+ // VECTOR_SHUFFLE - if all the operands match the shuffle's sources,
+ // update the shuffle mask (and second operand if we started with unary
+ // shuffle) and create a new legal shuffle.
+ if (CurVec.getOpcode() == ISD::VECTOR_SHUFFLE && CurVec.hasOneUse()) {
+ auto *SVN = cast<ShuffleVectorSDNode>(CurVec);
+ SDValue LHS = SVN->getOperand(0);
+ SDValue RHS = SVN->getOperand(1);
+ SmallVector<int, 16> Mask(SVN->getMask());
+ bool Merged = true;
+ for (auto I : enumerate(Ops)) {
+ SDValue &Op = I.value();
+ if (Op) {
+ SmallVector<int, 16> NewMask;
+ if (!mergeEltWithShuffle(LHS, RHS, Mask, NewMask, Op, I.index())) {
+ Merged = false;
+ break;
+ }
+ Mask = std::move(NewMask);
+ }
+ }
+ if (Merged)
+ if (SDValue NewShuffle =
+ TLI.buildLegalVectorShuffle(VT, DL, LHS, RHS, Mask, DAG))
+ return NewShuffle;
+ }
+
+ // If all insertions are zero value, try to convert to AND mask.
+ // TODO: Do this for -1 with OR mask?
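+      // E.g. (illustrative, assuming the inner insert has a single use):
+      //   (insert_vector_elt (insert_vector_elt v4i32:X, 0, 1), 0, 3)
+      //   --> (and X, (build_vector -1, 0, -1, 0))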
+ if (!LegalOperations && llvm::isNullConstant(InVal) &&
+ all_of(Ops, [InVal](SDValue Op) { return !Op || Op == InVal; }) &&
+ count_if(Ops, [InVal](SDValue Op) { return Op == InVal; }) >= 2) {
+ SDValue Zero = DAG.getConstant(0, DL, MaxEltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, MaxEltVT);
+ SmallVector<SDValue, 8> Mask(NumElts);
+ for (unsigned I = 0; I != NumElts; ++I)
+ Mask[I] = Ops[I] ? Zero : AllOnes;
+ return DAG.getNode(ISD::AND, DL, VT, CurVec,
+ DAG.getBuildVector(VT, DL, Mask));
+ }
+
+ // Failed to find a match in the chain - bail.
+ break;
+ }
+
+ // See if we can fill in the missing constant elements as zeros.
+ // TODO: Should we do this for any constant?
+ APInt DemandedZeroElts = APInt::getZero(NumElts);
+ for (unsigned I = 0; I != NumElts; ++I)
+ if (!Ops[I])
+ DemandedZeroElts.setBit(I);
+
+ if (DAG.MaskedVectorIsZero(InVec, DemandedZeroElts)) {
+ SDValue Zero = VT.isInteger() ? DAG.getConstant(0, DL, MaxEltVT)
+ : DAG.getConstantFP(0, DL, MaxEltVT);
+ for (unsigned I = 0; I != NumElts; ++I)
+ if (!Ops[I])
+ Ops[I] = Zero;
+
+ return CanonicalizeBuildVector(Ops);
+ }
+ }
+
+ return SDValue();
+}
+
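+// Given an extract_vector_elt of a (simple) vector load, try to replace it
+// with a narrow scalar load of just the extracted element, e.g. (an
+// illustrative case; p is a hypothetical base pointer):
+//   (i32 extract_vector_elt (v4i32 load [p]), 2) --> (i32 load [p + 8])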
+SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
+ SDValue EltNo,
+ LoadSDNode *OriginalLoad) {
+ assert(OriginalLoad->isSimple());
+
+ EVT ResultVT = EVE->getValueType(0);
+ EVT VecEltVT = InVecVT.getVectorElementType();
+
+ // If the vector element type is not a multiple of a byte then we are unable
+ // to correctly compute an address to load only the extracted element as a
+ // scalar.
+ if (!VecEltVT.isByteSized())
+ return SDValue();
+
+ ISD::LoadExtType ExtTy =
+ ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD;
+ if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
+ !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
+ return SDValue();
+
+ Align Alignment = OriginalLoad->getAlign();
+ MachinePointerInfo MPI;
+ SDLoc DL(EVE);
+ if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
+ int Elt = ConstEltNo->getZExtValue();
+ unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
+ MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
+ Alignment = commonAlignment(Alignment, PtrOff);
+ } else {
+ // Discard the pointer info except the address space because the memory
+ // operand can't represent this new access since the offset is variable.
+ MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
+ Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
+ }
+
+ unsigned IsFast = 0;
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
+ OriginalLoad->getAddressSpace(), Alignment,
+ OriginalLoad->getMemOperand()->getFlags(),
+ &IsFast) ||
+ !IsFast)
+ return SDValue();
+
+ SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
+ InVecVT, EltNo);
+
+ // We are replacing a vector load with a scalar load. The new load must have
+ // identical memory op ordering to the original.
+ SDValue Load;
+ if (ResultVT.bitsGT(VecEltVT)) {
+ // If the result type of vextract is wider than the load, then issue an
+ // extending load instead.
+ ISD::LoadExtType ExtType =
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD
+ : ISD::EXTLOAD;
+ Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(),
+ NewPtr, MPI, VecEltVT, Alignment,
+ OriginalLoad->getMemOperand()->getFlags(),
+ OriginalLoad->getAAInfo());
+ DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
+ } else {
+ // The result type is narrower or the same width as the vector element
+ Load = DAG.getLoad(VecEltVT, DL, OriginalLoad->getChain(), NewPtr, MPI,
+ Alignment, OriginalLoad->getMemOperand()->getFlags(),
+ OriginalLoad->getAAInfo());
+ DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
+ if (ResultVT.bitsLT(VecEltVT))
+ Load = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Load);
+ else
+ Load = DAG.getBitcast(ResultVT, Load);
+ }
+ ++OpsNarrowed;
+ return Load;
+}
+
+/// Transform a vector binary operation into a scalar binary operation by moving
+/// the math/logic after an extract element of a vector.
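+/// For example (illustrative, assuming the target opts in via
+/// shouldScalarizeBinop):
+///   (i32 extract_vector_elt (add v4i32:X, <1,2,3,4>), 2)
+///   --> (add (i32 extract_vector_elt X, 2), 3)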
+static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
+ bool LegalOperations) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Vec = ExtElt->getOperand(0);
+ SDValue Index = ExtElt->getOperand(1);
+ auto *IndexC = dyn_cast<ConstantSDNode>(Index);
+ if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
+ Vec->getNumValues() != 1)
+ return SDValue();
+
+ // Targets may want to avoid this to prevent an expensive register transfer.
+ if (!TLI.shouldScalarizeBinop(Vec))
+ return SDValue();
+
+ // Extracting an element of a vector constant is constant-folded, so this
+ // transform is just replacing a vector op with a scalar op while moving the
+ // extract.
+ SDValue Op0 = Vec.getOperand(0);
+ SDValue Op1 = Vec.getOperand(1);
+ APInt SplatVal;
+ if (isAnyConstantBuildVector(Op0, true) ||
+ ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
+ isAnyConstantBuildVector(Op1, true) ||
+ ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
+ // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
+ // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
+ SDLoc DL(ExtElt);
+ EVT VT = ExtElt->getValueType(0);
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
+ return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
+ }
+
+ return SDValue();
+}
+
+// Given an ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract,
+// recursively analyse all of its users and try to model them as bit
+// sequence extractions. If all of them agree on the new, narrower element
+// type, and all of them can be modelled as ISD::EXTRACT_VECTOR_ELT's of that
+// new element type, do so now.
+// This is mainly useful to recover from legalization that scalarized
+// the vector as wide elements; here we try to rebuild it with narrower
+// elements.
+//
+// Some more nodes could be modelled if that helps cover interesting patterns.
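+//
+// For example (an illustrative little-endian case):
+//   t = extract_vector_elt v2i64:X, 0
+//   a = trunc t to i32
+//   b = trunc (srl t, 32) to i32
+// can be rebuilt, when a and b are only used by BUILD_VECTORs, as:
+//   a = extract_vector_elt (v4i32 bitcast X), 0
+//   b = extract_vector_elt (v4i32 bitcast X), 1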
+bool DAGCombiner::refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(
+ SDNode *N) {
+ // We perform this optimization post type-legalization because
+ // the type-legalizer often scalarizes integer-promoted vectors.
+  // Performing this optimization earlier may cause legalization cycles.
+ if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
+ return false;
+
+ // TODO: Add support for big-endian.
+ if (DAG.getDataLayout().isBigEndian())
+ return false;
+
+ SDValue VecOp = N->getOperand(0);
+ EVT VecVT = VecOp.getValueType();
+ assert(!VecVT.isScalableVector() && "Only for fixed vectors.");
+
+ // We must start with a constant extraction index.
+ auto *IndexC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!IndexC)
+ return false;
+
+ assert(IndexC->getZExtValue() < VecVT.getVectorNumElements() &&
+         "Original ISD::EXTRACT_VECTOR_ELT is undefined?");
+
+ // TODO: deal with the case of implicit anyext of the extraction.
+ unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
+ EVT ScalarVT = N->getValueType(0);
+ if (VecVT.getScalarType() != ScalarVT)
+ return false;
+
+ // TODO: deal with the cases other than everything being integer-typed.
+ if (!ScalarVT.isScalarInteger())
+ return false;
+
+ struct Entry {
+ SDNode *Producer;
+
+ // Which bits of VecOp does it contain?
+ unsigned BitPos;
+ int NumBits;
+ // NOTE: the actual width of \p Producer may be wider than NumBits!
+
+ Entry(Entry &&) = default;
+ Entry(SDNode *Producer_, unsigned BitPos_, int NumBits_)
+ : Producer(Producer_), BitPos(BitPos_), NumBits(NumBits_) {}
+
+ Entry() = delete;
+ Entry(const Entry &) = delete;
+ Entry &operator=(const Entry &) = delete;
+ Entry &operator=(Entry &&) = delete;
+ };
+ SmallVector<Entry, 32> Worklist;
+ SmallVector<Entry, 32> Leafs;
+
+ // We start at the "root" ISD::EXTRACT_VECTOR_ELT.
+ Worklist.emplace_back(N, /*BitPos=*/VecEltBitWidth * IndexC->getZExtValue(),
+ /*NumBits=*/VecEltBitWidth);
+
+ while (!Worklist.empty()) {
+ Entry E = Worklist.pop_back_val();
+ // Does the node not even use any of the VecOp bits?
+ if (!(E.NumBits > 0 && E.BitPos < VecVT.getSizeInBits() &&
+ E.BitPos + E.NumBits <= VecVT.getSizeInBits()))
+      return false; // Let's allow the other combines to clean this up first.
+ // Did we fail to model any of the users of the Producer?
+ bool ProducerIsLeaf = false;
+ // Look at each user of this Producer.
+ for (SDNode *User : E.Producer->uses()) {
+ switch (User->getOpcode()) {
+ // TODO: support ISD::BITCAST
+ // TODO: support ISD::ANY_EXTEND
+ // TODO: support ISD::ZERO_EXTEND
+ // TODO: support ISD::SIGN_EXTEND
+ case ISD::TRUNCATE:
+        // Truncation simply means we keep position, but extract fewer bits.
+ Worklist.emplace_back(User, E.BitPos,
+ /*NumBits=*/User->getValueSizeInBits(0));
+ break;
+ // TODO: support ISD::SRA
+ // TODO: support ISD::SHL
+ case ISD::SRL:
+ // We should be shifting the Producer by a constant amount.
+ if (auto *ShAmtC = dyn_cast<ConstantSDNode>(User->getOperand(1));
+ User->getOperand(0).getNode() == E.Producer && ShAmtC) {
+ // Logical right-shift means that we start extraction later,
+ // but stop it at the same position we did previously.
+ unsigned ShAmt = ShAmtC->getZExtValue();
+ Worklist.emplace_back(User, E.BitPos + ShAmt, E.NumBits - ShAmt);
+ break;
+ }
+ [[fallthrough]];
+ default:
+        // We cannot model this user of the Producer, which means the current
+        // Producer will be an ISD::EXTRACT_VECTOR_ELT.
+ ProducerIsLeaf = true;
+ // Profitability check: all users that we can not model
+ // must be ISD::BUILD_VECTOR's.
+ if (User->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+ break;
+ }
+ }
+ if (ProducerIsLeaf)
+ Leafs.emplace_back(std::move(E));
+ }
+
+ unsigned NewVecEltBitWidth = Leafs.front().NumBits;
+
+  // If we are still at the same element granularity, give up.
+ if (NewVecEltBitWidth == VecEltBitWidth)
+ return false;
+
+ // The vector width must be a multiple of the new element width.
+ if (VecVT.getSizeInBits() % NewVecEltBitWidth != 0)
+ return false;
+
+ // All leafs must agree on the new element width.
+  // All leafs must not expect any "padding" bits on top of that width.
+  // All leafs must start extraction at a multiple of that width.
+ if (!all_of(Leafs, [NewVecEltBitWidth](const Entry &E) {
+ return (unsigned)E.NumBits == NewVecEltBitWidth &&
+ E.Producer->getValueSizeInBits(0) == NewVecEltBitWidth &&
+ E.BitPos % NewVecEltBitWidth == 0;
+ }))
+ return false;
+
+ EVT NewScalarVT = EVT::getIntegerVT(*DAG.getContext(), NewVecEltBitWidth);
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewScalarVT,
+ VecVT.getSizeInBits() / NewVecEltBitWidth);
+
+ if (LegalTypes &&
+ !(TLI.isTypeLegal(NewScalarVT) && TLI.isTypeLegal(NewVecVT)))
+ return false;
+
+ if (LegalOperations &&
+ !(TLI.isOperationLegalOrCustom(ISD::BITCAST, NewVecVT) &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, NewVecVT)))
+ return false;
+
+ SDValue NewVecOp = DAG.getBitcast(NewVecVT, VecOp);
+ for (const Entry &E : Leafs) {
+ SDLoc DL(E.Producer);
+ unsigned NewIndex = E.BitPos / NewVecEltBitWidth;
+ assert(NewIndex < NewVecVT.getVectorNumElements() &&
+ "Creating out-of-bounds ISD::EXTRACT_VECTOR_ELT?");
+ SDValue V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, NewScalarVT, NewVecOp,
+ DAG.getVectorIdxConstant(NewIndex, DL));
+ CombineTo(E.Producer, V);
+ }
+
+ return true;
+}
+
+SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue VecOp = N->getOperand(0);
+ SDValue Index = N->getOperand(1);
+ EVT ScalarVT = N->getValueType(0);
+ EVT VecVT = VecOp.getValueType();
+ if (VecOp.isUndef())
+ return DAG.getUNDEF(ScalarVT);
+
+  // (extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
+ //
+ // This only really matters if the index is non-constant since other combines
+ // on the constant elements already work.
+ SDLoc DL(N);
+ if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ Index == VecOp.getOperand(2)) {
+ SDValue Elt = VecOp.getOperand(1);
+ return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt;
+ }
+
+  // (vextract (scalar_to_vector val), 0) -> val
+ if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ // Only 0'th element of SCALAR_TO_VECTOR is defined.
+ if (DAG.isKnownNeverZero(Index))
+ return DAG.getUNDEF(ScalarVT);
+
+ // Check if the result type doesn't match the inserted element type. A
+ // SCALAR_TO_VECTOR may truncate the inserted element and the
+ // EXTRACT_VECTOR_ELT may widen the extracted vector.
+ SDValue InOp = VecOp.getOperand(0);
+ if (InOp.getValueType() != ScalarVT) {
+ assert(InOp.getValueType().isInteger() && ScalarVT.isInteger() &&
+ InOp.getValueType().bitsGT(ScalarVT));
+ return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, InOp);
+ }
+ return InOp;
+ }
+
+ // extract_vector_elt of out-of-bounds element -> UNDEF
+ auto *IndexC = dyn_cast<ConstantSDNode>(Index);
+ if (IndexC && VecVT.isFixedLengthVector() &&
+ IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
+ return DAG.getUNDEF(ScalarVT);
+
+ // extract_vector_elt(freeze(x)), idx -> freeze(extract_vector_elt(x)), idx
+ if (VecOp.hasOneUse() && VecOp.getOpcode() == ISD::FREEZE) {
+ return DAG.getFreeze(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
+ VecOp.getOperand(0), Index));
+ }
+
+ // extract_vector_elt (build_vector x, y), 1 -> y
+ if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
+ VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
+ TLI.isTypeLegal(VecVT)) {
+ assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
+ VecVT.isFixedLengthVector()) &&
+ "BUILD_VECTOR used for scalable vectors");
+ unsigned IndexVal =
+ VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
+ SDValue Elt = VecOp.getOperand(IndexVal);
+ EVT InEltVT = Elt.getValueType();
+
+ if (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT) ||
+ isNullConstant(Elt)) {
+ // Sometimes build_vector's scalar input types do not match result type.
+ if (ScalarVT == InEltVT)
+ return Elt;
+
+      // TODO: It may be useful to truncate, if free, when the build_vector
+      // implicitly converts.
+ }
+ }
+
+ if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
+ return BO;
+
+ if (VecVT.isScalableVector())
+ return SDValue();
+
+ // All the code from this point onwards assumes fixed width vectors, but it's
+ // possible that some of the combinations could be made to work for scalable
+ // vectors too.
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
+
+ // TODO: These transforms should not require the 'hasOneUse' restriction, but
+ // there are regressions on multiple targets without it. We can end up with a
+ // mess of scalar and vector code if we reduce only part of the DAG to scalar.
+ if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() &&
+ VecOp.hasOneUse()) {
+ // The vector index of the LSBs of the source depend on the endian-ness.
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ unsigned ExtractIndex = IndexC->getZExtValue();
+ // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
+ unsigned BCTruncElt = IsLE ? 0 : NumElts - 1;
+ SDValue BCSrc = VecOp.getOperand(0);
+ if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
+ return DAG.getAnyExtOrTrunc(BCSrc, DL, ScalarVT);
+
+ if (LegalTypes && BCSrc.getValueType().isInteger() &&
+ BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
+ // trunc i64 X to i32
+ SDValue X = BCSrc.getOperand(0);
+ assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
+ "Extract element and scalar to vector can't change element type "
+ "from FP to integer.");
+ unsigned XBitWidth = X.getValueSizeInBits();
+ BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
+
+ // An extract element return value type can be wider than its vector
+ // operand element type. In that case, the high bits are undefined, so
+ // it's possible that we may need to extend rather than truncate.
+ if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
+ assert(XBitWidth % VecEltBitWidth == 0 &&
+ "Scalar bitwidth must be a multiple of vector element bitwidth");
+ return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
+ }
+ }
+ }
+
+ // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
+ // We only perform this optimization before the op legalization phase because
+ // we may introduce new vector instructions which are not backed by TD
+  // patterns. For example, on AVX, we might extract elements from a wide
+  // vector without using extract_subvector. However, if we can find an
+  // underlying scalar value, then we can always use that.
+ if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ auto *Shuf = cast<ShuffleVectorSDNode>(VecOp);
+ // Find the new index to extract from.
+ int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue());
+
+ // Extracting an undef index is undef.
+ if (OrigElt == -1)
+ return DAG.getUNDEF(ScalarVT);
+
+ // Select the right vector half to extract from.
+ SDValue SVInVec;
+ if (OrigElt < (int)NumElts) {
+ SVInVec = VecOp.getOperand(0);
+ } else {
+ SVInVec = VecOp.getOperand(1);
+ OrigElt -= NumElts;
+ }
+
+ if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue InOp = SVInVec.getOperand(OrigElt);
+ if (InOp.getValueType() != ScalarVT) {
+ assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
+ InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
+ }
+
+ return InOp;
+ }
+
+ // FIXME: We should handle recursing on other vector shuffles and
+ // scalar_to_vector here as well.
+
+ if (!LegalOperations ||
+ // FIXME: Should really be just isOperationLegalOrCustom.
+ TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
+ TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
+ DAG.getVectorIdxConstant(OrigElt, DL));
+ }
+ }
+
+ // If only EXTRACT_VECTOR_ELT nodes use the source vector we can
+ // simplify it based on the (valid) extraction indices.
+ if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) {
+ return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Use->getOperand(0) == VecOp &&
+ isa<ConstantSDNode>(Use->getOperand(1));
+ })) {
+ APInt DemandedElts = APInt::getZero(NumElts);
+ for (SDNode *Use : VecOp->uses()) {
+ auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
+ if (CstElt->getAPIntValue().ult(NumElts))
+ DemandedElts.setBit(CstElt->getZExtValue());
+ }
+ if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) {
+ // We simplified the vector operand of this extract element. If this
+ // extract is not dead, visit it again so it is folded properly.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ APInt DemandedBits = APInt::getAllOnes(VecEltBitWidth);
+ if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
+ // We simplified the vector operand of this extract element. If this
+ // extract is not dead, visit it again so it is folded properly.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
+ if (refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(N))
+ return SDValue(N, 0);
+
+ // Everything under here is trying to match an extract of a loaded value.
+  // If the result of the load has to be truncated, then it's not necessarily
+ // profitable.
+ bool BCNumEltsChanged = false;
+ EVT ExtVT = VecVT.getVectorElementType();
+ EVT LVT = ExtVT;
+ if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT))
+ return SDValue();
+
+ if (VecOp.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!VecOp.hasOneUse())
+ return SDValue();
+
+ EVT BCVT = VecOp.getOperand(0).getValueType();
+ if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
+ return SDValue();
+ if (NumElts != BCVT.getVectorNumElements())
+ BCNumEltsChanged = true;
+ VecOp = VecOp.getOperand(0);
+ ExtVT = BCVT.getVectorElementType();
+ }
+
+ // extract (vector load $addr), i --> load $addr + i * size
+ if (!LegalOperations && !IndexC && VecOp.hasOneUse() &&
+ ISD::isNormalLoad(VecOp.getNode()) &&
+ !Index->hasPredecessor(VecOp.getNode())) {
+ auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
+ if (VecLoad && VecLoad->isSimple())
+ return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
+ }
+
+ // Perform only after legalization to ensure build_vector / vector_shuffle
+ // optimizations have already been done.
+ if (!LegalOperations || !IndexC)
+ return SDValue();
+
+ // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
+ // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
+ // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
+ int Elt = IndexC->getZExtValue();
+ LoadSDNode *LN0 = nullptr;
+ if (ISD::isNormalLoad(VecOp.getNode())) {
+ LN0 = cast<LoadSDNode>(VecOp);
+ } else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ VecOp.getOperand(0).getValueType() == ExtVT &&
+ ISD::isNormalLoad(VecOp.getOperand(0).getNode())) {
+ // Don't duplicate a load with other uses.
+ if (!VecOp.hasOneUse())
+ return SDValue();
+
+ LN0 = cast<LoadSDNode>(VecOp.getOperand(0));
+ }
+ if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) {
+ // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
+ // =>
+ // (load $addr+1*size)
+
+ // Don't duplicate a load with other uses.
+ if (!VecOp.hasOneUse())
+ return SDValue();
+
+ // If the bit convert changed the number of elements, it is unsafe
+ // to examine the mask.
+ if (BCNumEltsChanged)
+ return SDValue();
+
+    // Select the input vector, guarding against an out-of-range extract index.
+ int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt);
+ VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1);
+
+ if (VecOp.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!VecOp.hasOneUse())
+ return SDValue();
+
+ VecOp = VecOp.getOperand(0);
+ }
+ if (ISD::isNormalLoad(VecOp.getNode())) {
+ LN0 = cast<LoadSDNode>(VecOp);
+ Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts;
+ Index = DAG.getConstant(Elt, DL, Index.getValueType());
+ }
+ } else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged &&
+ VecVT.getVectorElementType() == ScalarVT &&
+ (!LegalTypes ||
+ TLI.isTypeLegal(
+ VecOp.getOperand(0).getValueType().getVectorElementType()))) {
+ // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0
+ // -> extract_vector_elt a, 0
+ // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1
+ // -> extract_vector_elt a, 1
+ // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2
+ // -> extract_vector_elt b, 0
+ // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3
+ // -> extract_vector_elt b, 1
+ SDLoc SL(N);
+ EVT ConcatVT = VecOp.getOperand(0).getValueType();
+ unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
+ SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL,
+ Index.getValueType());
+
+ SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
+ ConcatVT.getVectorElementType(),
+ ConcatOp, NewIdx);
+ return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);
+ }
+
+ // Make sure we found a non-volatile load and the extractelement is
+ // the only use.
+  if (!LN0 || !LN0->hasNUsesOfValue(1, 0) || !LN0->isSimple())
+ return SDValue();
+
+ // If Idx was -1 above, Elt is going to be -1, so just return undef.
+ if (Elt == -1)
+ return DAG.getUNDEF(LVT);
+
+ return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0);
+}
+
+// Simplify (build_vec (ext )) to (bitcast (build_vec ))
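+// For example (illustrative, little-endian):
+//   (v4i32 build_vector (zext i16:a), (zext i16:b),
+//                       (zext i16:c), (zext i16:d))
+//   --> (v4i32 bitcast (v8i16 build_vector a, 0, b, 0, c, 0, d, 0))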
+SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
+ // We perform this optimization post type-legalization because
+ // the type-legalizer often scalarizes integer-promoted vectors.
+ // Performing this optimization before may create bit-casts which
+ // will be type-legalized to complex code sequences.
+ // We perform this optimization only before the operation legalizer because we
+ // may introduce illegal operations.
+ if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
+ return SDValue();
+
+ unsigned NumInScalars = N->getNumOperands();
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // Check to see if this is a BUILD_VECTOR of a bunch of values
+ // which come from any_extend or zero_extend nodes. If so, we can create
+ // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
+ // optimizations. We do not handle sign-extend because we can't fill the sign
+ // using shuffles.
+ EVT SourceType = MVT::Other;
+ bool AllAnyExt = true;
+
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue In = N->getOperand(i);
+ // Ignore undef inputs.
+ if (In.isUndef()) continue;
+
+ bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
+ bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
+
+ // Abort if the element is not an extension.
+ if (!ZeroExt && !AnyExt) {
+ SourceType = MVT::Other;
+ break;
+ }
+
+ // The input is a ZeroExt or AnyExt. Check the original type.
+ EVT InTy = In.getOperand(0).getValueType();
+
+ // Check that all of the widened source types are the same.
+ if (SourceType == MVT::Other)
+ // First time.
+ SourceType = InTy;
+ else if (InTy != SourceType) {
+      // Multiple incoming types. Abort.
+ SourceType = MVT::Other;
+ break;
+ }
+
+ // Check if all of the extends are ANY_EXTENDs.
+ AllAnyExt &= AnyExt;
+ }
+
+ // In order to have valid types, all of the inputs must be extended from the
+ // same source type and all of the inputs must be any or zero extend.
+ // Scalar sizes must be a power of two.
+ EVT OutScalarTy = VT.getScalarType();
+ bool ValidTypes =
+ SourceType != MVT::Other &&
+ llvm::has_single_bit<uint32_t>(OutScalarTy.getSizeInBits()) &&
+ llvm::has_single_bit<uint32_t>(SourceType.getSizeInBits());
+
+ // Create a new simpler BUILD_VECTOR sequence which other optimizations can
+ // turn into a single shuffle instruction.
+ if (!ValidTypes)
+ return SDValue();
+
+ // If we already have a splat buildvector, then don't fold it if it means
+ // introducing zeros.
+ if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true))
+ return SDValue();
+
+ bool isLE = DAG.getDataLayout().isLittleEndian();
+ unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
+ assert(ElemRatio > 1 && "Invalid element size ratio");
+ SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
+ DAG.getConstant(0, DL, SourceType);
+
+ unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
+
+ // Populate the new build_vector
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDValue Cast = N->getOperand(i);
+ assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
+ Cast.getOpcode() == ISD::ZERO_EXTEND ||
+ Cast.isUndef()) && "Invalid cast opcode");
+ SDValue In;
+ if (Cast.isUndef())
+ In = DAG.getUNDEF(SourceType);
+ else
+ In = Cast->getOperand(0);
+ unsigned Index = isLE ? (i * ElemRatio) :
+ (i * ElemRatio + (ElemRatio - 1));
+
+ assert(Index < Ops.size() && "Invalid index");
+ Ops[Index] = In;
+ }
+
+ // The type of the new BUILD_VECTOR node.
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
+ assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
+ "Invalid vector size");
+ // Check if the new vector type is legal.
+ if (!isTypeLegal(VecVT) ||
+ (!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) &&
+ TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)))
+ return SDValue();
+
+ // Make the new BUILD_VECTOR.
+ SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);
+
+ // The new BUILD_VECTOR node has the potential to be further optimized.
+ AddToWorklist(BV.getNode());
+ // Bitcast to the desired type.
+ return DAG.getBitcast(VT, BV);
+}
+
+// Simplify (build_vec (trunc $1)
+// (trunc (srl $1 half-width))
+// (trunc (srl $1 (2 * half-width))))
+// to (bitcast $1)
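+// For example (illustrative, little-endian, i64 $1):
+//   (v4i16 build_vector (trunc $1), (trunc (srl $1, 16)),
+//                       (trunc (srl $1, 32)), (trunc (srl $1, 48)))
+//   --> (v4i16 bitcast $1)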
+SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
+
+ // Only for little endian
+ if (!DAG.getDataLayout().isLittleEndian())
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT OutScalarTy = VT.getScalarType();
+ uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits();
+
+ // Only for power of two types to be sure that bitcast works well
+ if (!isPowerOf2_64(ScalarTypeBitsize))
+ return SDValue();
+
+ unsigned NumInScalars = N->getNumOperands();
+
+ // Look through bitcasts
+ auto PeekThroughBitcast = [](SDValue Op) {
+ if (Op.getOpcode() == ISD::BITCAST)
+ return Op.getOperand(0);
+ return Op;
+ };
+
+ // The source value where all the parts are extracted.
+ SDValue Src;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue In = PeekThroughBitcast(N->getOperand(i));
+ // Ignore undef inputs.
+ if (In.isUndef()) continue;
+
+ if (In.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ In = PeekThroughBitcast(In.getOperand(0));
+
+ if (In.getOpcode() != ISD::SRL) {
+      // For now, only handle build_vec without shuffling; handle shifts here
+      // in the future.
+ if (i != 0)
+ return SDValue();
+
+ Src = In;
+ } else {
+ // In is SRL
+ SDValue part = PeekThroughBitcast(In.getOperand(0));
+
+ if (!Src) {
+ Src = part;
+ } else if (Src != part) {
+ // Vector parts do not stem from the same variable
+ return SDValue();
+ }
+
+ SDValue ShiftAmtVal = In.getOperand(1);
+ if (!isa<ConstantSDNode>(ShiftAmtVal))
+ return SDValue();
+
+ uint64_t ShiftAmt = In.getConstantOperandVal(1);
+
+ // The extracted value is not extracted at the right position
+ if (ShiftAmt != i * ScalarTypeBitsize)
+ return SDValue();
+ }
+ }
+
+ // Only cast if the size is the same
+ if (!Src || Src.getValueType().getSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+
+ return DAG.getBitcast(VT, Src);
+}
+
+SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
+ ArrayRef<int> VectorMask,
+ SDValue VecIn1, SDValue VecIn2,
+ unsigned LeftIdx, bool DidSplitVec) {
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
+
+ EVT VT = N->getValueType(0);
+ EVT InVT1 = VecIn1.getValueType();
+ EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
+
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned ShuffleNumElems = NumElems;
+
+ // If we artificially split a vector in two already, then the offsets in the
+ // operands will all be based off of VecIn1, even those in VecIn2.
+ unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
+
+ uint64_t VTSize = VT.getFixedSizeInBits();
+ uint64_t InVT1Size = InVT1.getFixedSizeInBits();
+ uint64_t InVT2Size = InVT2.getFixedSizeInBits();
+
+ assert(InVT2Size <= InVT1Size &&
+ "Inputs must be sorted to be in non-increasing vector size order.");
+
+ // We can't generate a shuffle node with mismatched input and output types.
+ // Try to make the types match the type of the output.
+ if (InVT1 != VT || InVT2 != VT) {
+ if ((VTSize % InVT1Size == 0) && InVT1 == InVT2) {
+ // If the output vector length is a multiple of both input lengths,
+ // we can concatenate them and pad the rest with undefs.
+ unsigned NumConcats = VTSize / InVT1Size;
+ assert(NumConcats >= 2 && "Concat needs at least two inputs!");
+ SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1));
+ ConcatOps[0] = VecIn1;
+ ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1);
+ VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
+ VecIn2 = SDValue();
+ } else if (InVT1Size == VTSize * 2) {
+ if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
+ return SDValue();
+
+ if (!VecIn2.getNode()) {
+ // If we only have one input vector, and it's twice the size of the
+ // output, split it in two.
+ VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1,
+ DAG.getVectorIdxConstant(NumElems, DL));
+ VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
+ // Since we now have shorter input vectors, adjust the offset of the
+ // second vector's start.
+ Vec2Offset = NumElems;
+ } else {
+ assert(InVT2Size <= InVT1Size &&
+ "Second input is not going to be larger than the first one.");
+
+ // VecIn1 is wider than the output, and we have another, possibly
+ // smaller input. Pad the smaller input with undefs, shuffle at the
+ // input vector width, and extract the output.
+ // The shuffle type is different than VT, so check legality again.
+ if (LegalOperations &&
+ !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
+ return SDValue();
+
+ // Legalizing INSERT_SUBVECTOR is tricky - you basically have to
+ // lower it back into a BUILD_VECTOR. So if the inserted type is
+ // illegal, don't even try.
+ if (InVT1 != InVT2) {
+ if (!TLI.isTypeLegal(InVT2))
+ return SDValue();
+ VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
+ DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
+ }
+ ShuffleNumElems = NumElems * 2;
+ }
+ } else if (InVT2Size * 2 == VTSize && InVT1Size == VTSize) {
+ SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
+ ConcatOps[0] = VecIn2;
+ VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
+ } else if (InVT1Size / VTSize > 1 && InVT1Size % VTSize == 0) {
+ if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems) ||
+ !TLI.isTypeLegal(InVT1) || !TLI.isTypeLegal(InVT2))
+ return SDValue();
+      // If the dest vector has fewer than two elements, then using a shuffle
+      // and extracting from larger regs will cost even more.
+ if (VT.getVectorNumElements() <= 2 || !VecIn2.getNode())
+ return SDValue();
+ assert(InVT2Size <= InVT1Size &&
+ "Second input is not going to be larger than the first one.");
+
+ // VecIn1 is wider than the output, and we have another, possibly
+ // smaller input. Pad the smaller input with undefs, shuffle at the
+ // input vector width, and extract the output.
+ // The shuffle type is different than VT, so check legality again.
+ if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
+ return SDValue();
+
+ if (InVT1 != InVT2) {
+ VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
+ DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
+ }
+ ShuffleNumElems = InVT1Size / VTSize * NumElems;
+ } else {
+ // TODO: Support cases where the length mismatch isn't exactly by a
+ // factor of 2.
+ // TODO: Move this check upwards, so that if we have bad type
+ // mismatches, we don't create any DAG nodes.
+ return SDValue();
+ }
+ }
+
+ // Initialize mask to undef.
+ SmallVector<int, 8> Mask(ShuffleNumElems, -1);
+
+ // Only need to run up to the number of elements actually used, not the
+ // total number of elements in the shuffle - if we are shuffling a wider
+ // vector, the high lanes should be set to undef.
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (VectorMask[i] <= 0)
+ continue;
+
+ unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1);
+ if (VectorMask[i] == (int)LeftIdx) {
+ Mask[i] = ExtIndex;
+ } else if (VectorMask[i] == (int)LeftIdx + 1) {
+ Mask[i] = Vec2Offset + ExtIndex;
+ }
+ }
+
+ // The type the input vectors may have changed above.
+ InVT1 = VecIn1.getValueType();
+
+ // If we already have a VecIn2, it should have the same type as VecIn1.
+ // If we don't, get an undef/zero vector of the appropriate type.
+ VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1);
+ assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");
+
+ SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask);
+ if (ShuffleNumElems > NumElems)
+ Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx);
+
+ return Shuffle;
+}
+
+static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
+ assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
+
+ // First, determine where the build vector is not undef.
+ // TODO: We could extend this to handle zero elements as well as undefs.
+ int NumBVOps = BV->getNumOperands();
+ int ZextElt = -1;
+ for (int i = 0; i != NumBVOps; ++i) {
+ SDValue Op = BV->getOperand(i);
+ if (Op.isUndef())
+ continue;
+ if (ZextElt == -1)
+ ZextElt = i;
+ else
+ return SDValue();
+ }
+ // Bail out if there's no non-undef element.
+ if (ZextElt == -1)
+ return SDValue();
+
+ // The build vector contains some number of undef elements and exactly
+ // one other element. That other element must be a zero-extended scalar
+ // extracted from a vector at a constant index to turn this into a shuffle.
+ // Also, require that the build vector does not implicitly truncate/extend
+ // its elements.
+ // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
+ EVT VT = BV->getValueType(0);
+ SDValue Zext = BV->getOperand(ZextElt);
+ if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
+ Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) ||
+ Zext.getValueSizeInBits() != VT.getScalarSizeInBits())
+ return SDValue();
+
+ // The zero-extend must be a multiple of the source size, and we must be
+ // building a vector of the same size as the source of the extract element.
+ SDValue Extract = Zext.getOperand(0);
+ unsigned DestSize = Zext.getValueSizeInBits();
+ unsigned SrcSize = Extract.getValueSizeInBits();
+ if (DestSize % SrcSize != 0 ||
+ Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+
+ // Create a shuffle mask that will combine the extracted element with zeros
+ // and undefs.
+ int ZextRatio = DestSize / SrcSize;
+ int NumMaskElts = NumBVOps * ZextRatio;
+ SmallVector<int, 32> ShufMask(NumMaskElts, -1);
+ for (int i = 0; i != NumMaskElts; ++i) {
+ if (i / ZextRatio == ZextElt) {
+ // The low bits of the (potentially translated) extracted element map to
+ // the source vector. The high bits map to zero. We will use a zero vector
+ // as the 2nd source operand of the shuffle, so use the 1st element of
+ // that vector (mask value is number-of-elements) for the high bits.
+ int Low = DAG.getDataLayout().isBigEndian() ? (ZextRatio - 1) : 0;
+ ShufMask[i] = (i % ZextRatio == Low) ? Extract.getConstantOperandVal(1)
+ : NumMaskElts;
+ }
+
+ // Undef elements of the build vector remain undef because we initialize
+ // the shuffle mask with -1.
+ }
+
+ // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
+ // bitcast (shuffle V, ZeroVec, VectorMask)
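+  // E.g. (an illustrative little-endian case):
+  //   (v2i64 build_vector (zext (extractelt v4i32:V, 2) to i64), undef)
+  //   --> (v2i64 bitcast (vector_shuffle<2,4,u,u> V, (v4i32 zerovec)))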
+ SDLoc DL(BV);
+ EVT VecVT = Extract.getOperand(0).getValueType();
+ SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0),
+ ZeroVec, ShufMask, DAG);
+ if (!Shuf)
+ return SDValue();
+ return DAG.getBitcast(VT, Shuf);
+}
+
+// FIXME: promote to STLExtras.
+template <typename R, typename T>
+static auto getFirstIndexOf(R &&Range, const T &Val) {
+ auto I = find(Range, Val);
+ if (I == Range.end())
+ return static_cast<decltype(std::distance(Range.begin(), I))>(-1);
+ return std::distance(Range.begin(), I);
+}
+
+// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
+// operations. If the types of the vectors we're extracting from allow it,
+// turn this into a vector_shuffle node.
+SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG))
+ return V;
+
+ // May only combine to shuffle after legalize if shuffle is legal.
+ if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
+ return SDValue();
+
+ bool UsesZeroVector = false;
+ unsigned NumElems = N->getNumOperands();
+
+ // Record, for each element of the newly built vector, which input vector
+ // that element comes from. -1 stands for undef, 0 for the zero vector,
+ // and positive values for the input vectors.
+ // VectorMask maps each element to its vector number, and VecIn maps vector
+ // numbers to their initial SDValues.
+
+ SmallVector<int, 8> VectorMask(NumElems, -1);
+ SmallVector<SDValue, 8> VecIn;
+ VecIn.push_back(SDValue());
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Op = N->getOperand(i);
+
+ if (Op.isUndef())
+ continue;
+
+ // See if we can use a blend with a zero vector.
+ // TODO: Should we generalize this to a blend with an arbitrary constant
+ // vector?
+ if (isNullConstant(Op) || isNullFPConstant(Op)) {
+ UsesZeroVector = true;
+ VectorMask[i] = 0;
+ continue;
+ }
+
+ // Not an undef or zero. If the input is something other than an
+ // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+ SDValue ExtractedFromVec = Op.getOperand(0);
+
+ if (ExtractedFromVec.getValueType().isScalableVector())
+ return SDValue();
+
+ const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
+ if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+ return SDValue();
+
+ // All inputs must have the same element type as the output.
+ if (VT.getVectorElementType() !=
+ ExtractedFromVec.getValueType().getVectorElementType())
+ return SDValue();
+
+ // Have we seen this input vector before?
+ // The vectors are expected to be tiny (usually 1 or 2 elements), so using
+ // a map back from SDValues to numbers isn't worth it.
+ int Idx = getFirstIndexOf(VecIn, ExtractedFromVec);
+ if (Idx == -1) { // A new source vector?
+ Idx = VecIn.size();
+ VecIn.push_back(ExtractedFromVec);
+ }
+
+ VectorMask[i] = Idx;
+ }
+
+ // If we didn't find at least one input vector, bail out.
+ if (VecIn.size() < 2)
+ return SDValue();
+
+  // If all the operands of the BUILD_VECTOR extract from the same vector,
+  // then split the vector efficiently based on the maximum vector access
+  // index, and adjust the VectorMask and VecIn accordingly.
+ bool DidSplitVec = false;
+ if (VecIn.size() == 2) {
+ unsigned MaxIndex = 0;
+ unsigned NearestPow2 = 0;
+ SDValue Vec = VecIn.back();
+ EVT InVT = Vec.getValueType();
+ SmallVector<unsigned, 8> IndexVec(NumElems, 0);
+
+ for (unsigned i = 0; i < NumElems; i++) {
+ if (VectorMask[i] <= 0)
+ continue;
+ unsigned Index = N->getOperand(i).getConstantOperandVal(1);
+ IndexVec[i] = Index;
+ MaxIndex = std::max(MaxIndex, Index);
+ }
+
+ NearestPow2 = PowerOf2Ceil(MaxIndex);
+ if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
+ NumElems * 2 < NearestPow2) {
+ unsigned SplitSize = NearestPow2 / 2;
+ EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(), SplitSize);
+ if (TLI.isTypeLegal(SplitVT) &&
+ SplitSize + SplitVT.getVectorNumElements() <=
+ InVT.getVectorNumElements()) {
+ SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+ DAG.getVectorIdxConstant(SplitSize, DL));
+ SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+ DAG.getVectorIdxConstant(0, DL));
+ VecIn.pop_back();
+ VecIn.push_back(VecIn1);
+ VecIn.push_back(VecIn2);
+ DidSplitVec = true;
+
+ for (unsigned i = 0; i < NumElems; i++) {
+ if (VectorMask[i] <= 0)
+ continue;
+ VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
+ }
+ }
+ }
+ }
+
+ // Sort input vectors by decreasing vector element count,
+ // while preserving the relative order of equally-sized vectors.
+  // Note that we keep the first "implicit" zero vector as-is.
+ SmallVector<SDValue, 8> SortedVecIn(VecIn);
+ llvm::stable_sort(MutableArrayRef<SDValue>(SortedVecIn).drop_front(),
+ [](const SDValue &a, const SDValue &b) {
+ return a.getValueType().getVectorNumElements() >
+ b.getValueType().getVectorNumElements();
+ });
+
+ // We now also need to rebuild the VectorMask, because it referenced element
+ // order in VecIn, and we just sorted them.
+ for (int &SourceVectorIndex : VectorMask) {
+ if (SourceVectorIndex <= 0)
+ continue;
+ unsigned Idx = getFirstIndexOf(SortedVecIn, VecIn[SourceVectorIndex]);
+ assert(Idx > 0 && Idx < SortedVecIn.size() &&
+ VecIn[SourceVectorIndex] == SortedVecIn[Idx] && "Remapping failure");
+ SourceVectorIndex = Idx;
+ }
+
+ VecIn = std::move(SortedVecIn);
+
+ // TODO: Should this fire if some of the input vectors has illegal type (like
+ // it does now), or should we let legalization run its course first?
+
+ // Shuffle phase:
+ // Take pairs of vectors, and shuffle them so that the result has elements
+ // from these vectors in the correct places.
+ // For example, given:
+ // t10: i32 = extract_vector_elt t1, Constant:i64<0>
+ // t11: i32 = extract_vector_elt t2, Constant:i64<0>
+ // t12: i32 = extract_vector_elt t3, Constant:i64<0>
+ // t13: i32 = extract_vector_elt t1, Constant:i64<1>
+ // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
+ // We will generate:
+ // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
+ // t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
+ SmallVector<SDValue, 4> Shuffles;
+ for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
+ unsigned LeftIdx = 2 * In + 1;
+ SDValue VecLeft = VecIn[LeftIdx];
+ SDValue VecRight =
+ (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
+
+ if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
+ VecRight, LeftIdx, DidSplitVec))
+ Shuffles.push_back(Shuffle);
+ else
+ return SDValue();
+ }
+
+ // If we need the zero vector as an "ingredient" in the blend tree, add it
+ // to the list of shuffles.
+ if (UsesZeroVector)
+ Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
+ : DAG.getConstantFP(0.0, DL, VT));
+
+ // If we only have one shuffle, we're done.
+ if (Shuffles.size() == 1)
+ return Shuffles[0];
+
+ // Update the vector mask to point to the post-shuffle vectors.
+ for (int &Vec : VectorMask)
+ if (Vec == 0)
+ Vec = Shuffles.size() - 1;
+ else
+ Vec = (Vec - 1) / 2;
+
+ // More than one shuffle. Generate a binary tree of blends, e.g. if from
+ // the previous step we got the set of shuffles t10, t11, t12, t13, we will
+ // generate:
+ // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
+ // t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
+ // t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
+ // t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
+ // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
+ // t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
+ // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21
+
+ // Make sure the initial size of the shuffle list is even.
+ if (Shuffles.size() % 2)
+ Shuffles.push_back(DAG.getUNDEF(VT));
+
+ for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
+ if (CurSize % 2) {
+ Shuffles[CurSize] = DAG.getUNDEF(VT);
+ CurSize++;
+ }
+ for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
+ int Left = 2 * In;
+ int Right = 2 * In + 1;
+ SmallVector<int, 8> Mask(NumElems, -1);
+ SDValue L = Shuffles[Left];
+ ArrayRef<int> LMask;
+ bool IsLeftShuffle = L.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ L.use_empty() && L.getOperand(1).isUndef() &&
+ L.getOperand(0).getValueType() == L.getValueType();
+ if (IsLeftShuffle) {
+ LMask = cast<ShuffleVectorSDNode>(L.getNode())->getMask();
+ L = L.getOperand(0);
+ }
+ SDValue R = Shuffles[Right];
+ ArrayRef<int> RMask;
+ bool IsRightShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ R.use_empty() && R.getOperand(1).isUndef() &&
+ R.getOperand(0).getValueType() == R.getValueType();
+ if (IsRightShuffle) {
+ RMask = cast<ShuffleVectorSDNode>(R.getNode())->getMask();
+ R = R.getOperand(0);
+ }
+ for (unsigned I = 0; I != NumElems; ++I) {
+ if (VectorMask[I] == Left) {
+ Mask[I] = I;
+ if (IsLeftShuffle)
+ Mask[I] = LMask[I];
+ VectorMask[I] = In;
+ } else if (VectorMask[I] == Right) {
+ Mask[I] = I + NumElems;
+ if (IsRightShuffle)
+ Mask[I] = RMask[I] + NumElems;
+ VectorMask[I] = In;
+ }
+ }
+
+ Shuffles[In] = DAG.getVectorShuffle(VT, DL, L, R, Mask);
+ }
+ }
+ return Shuffles[0];
+}
+
+// Try to turn a build vector of zero extends of extract vector elts into a
+// vector zero extend and possibly an extract subvector.
+// TODO: Support sign extend?
+// TODO: Allow undef elements?
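+//
+// For example (illustrative types, subject to the legality checks below):
+//   (v4i32 build_vector (zext (extract_vector_elt v8i16 X, 4)),
+//                       (zext (extract_vector_elt v8i16 X, 5)),
+//                       (zext (extract_vector_elt v8i16 X, 6)),
+//                       (zext (extract_vector_elt v8i16 X, 7)))
+//   --> (v4i32 zero_extend (v4i16 extract_subvector X, 4))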
+SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
+ if (LegalOperations)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ bool FoundZeroExtend = false;
+ SDValue Op0 = N->getOperand(0);
+ auto checkElem = [&](SDValue Op) -> int64_t {
+ unsigned Opc = Op.getOpcode();
+ FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND);
+ if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
+ Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
+ if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
+ return C->getZExtValue();
+ return -1;
+ };
+
+ // Make sure the first element matches
+ // (zext (extract_vector_elt X, C))
+ // Offset must be a constant multiple of the
+ // known-minimum vector length of the result type.
+ int64_t Offset = checkElem(Op0);
+ if (Offset < 0 || (Offset % VT.getVectorNumElements()) != 0)
+ return SDValue();
+
+ unsigned NumElems = N->getNumOperands();
+ SDValue In = Op0.getOperand(0).getOperand(0);
+ EVT InSVT = In.getValueType().getScalarType();
+ EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems);
+
+ // Don't create an illegal input type after type legalization.
+ if (LegalTypes && !TLI.isTypeLegal(InVT))
+ return SDValue();
+
+ // Ensure all the elements come from the same vector and are adjacent.
+ for (unsigned i = 1; i != NumElems; ++i) {
+ if ((Offset + i) != checkElem(N->getOperand(i)))
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
+ Op0.getOperand(0).getOperand(1));
+ return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL,
+ VT, In);
+}
+
+// If this is a very simple BUILD_VECTOR whose first element is a ZERO_EXTEND,
+// and all other elements are constant zeros, granularize the BUILD_VECTOR's
+// element width, absorbing the ZERO_EXTEND and turning it into a constant
+// zero op. This pattern can appear during legalization.
+//
+// NOTE: This can be generalized to allow more than a single
+// non-constant-zero op, UNDEF's, and to be KnownBits-based.
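+//
+// For example (illustrative, little-endian, assuming i32 and v4i32 are legal):
+//   (v2i64 build_vector (i64 zero_extend (i32 X)), (i64 0))
+// is rebuilt with half-width elements as
+//   (v2i64 bitcast (v4i32 build_vector (trunc (zero_extend X)), 0, 0, 0))
+// where the trunc/zero_extend pair is expected to fold away in later combines.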
+SDValue DAGCombiner::convertBuildVecZextToBuildVecWithZeros(SDNode *N) {
+ // Don't run this after legalization. Targets may have other preferences.
+ if (Level >= AfterLegalizeDAG)
+ return SDValue();
+
+ // FIXME: support big-endian.
+ if (DAG.getDataLayout().isBigEndian())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N->getOperand(0).getValueType();
+ assert(!VT.isScalableVector() && "Encountered scalable BUILD_VECTOR?");
+
+ EVT OpIntVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
+
+ if (!TLI.isTypeLegal(OpIntVT) ||
+ (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::BITCAST, OpIntVT)))
+ return SDValue();
+
+ unsigned EltBitwidth = VT.getScalarSizeInBits();
+ // NOTE: the actual width of operands may be wider than that!
+
+ // Analyze all operands of this BUILD_VECTOR. What is the largest number of
+ // active bits they all have? We'll want to truncate them all to that width.
+ unsigned ActiveBits = 0;
+ APInt KnownZeroOps(VT.getVectorNumElements(), 0);
+ for (auto I : enumerate(N->ops())) {
+ SDValue Op = I.value();
+ // FIXME: support UNDEF elements?
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ unsigned OpActiveBits =
+ Cst->getAPIntValue().trunc(EltBitwidth).getActiveBits();
+ if (OpActiveBits == 0) {
+ KnownZeroOps.setBit(I.index());
+ continue;
+ }
+ // Profitability check: don't allow non-zero constant operands.
+ return SDValue();
+ }
+ // Profitability check: there must only be a single non-zero operand,
+ // and it must be the first operand of the BUILD_VECTOR.
+ if (I.index() != 0)
+ return SDValue();
+ // The operand must be a zero-extension itself.
+ // FIXME: this could be generalized to known leading zeros check.
+ if (Op.getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ unsigned CurrActiveBits =
+ Op.getOperand(0).getValueSizeInBits().getFixedValue();
+ assert(!ActiveBits && "Already encountered non-constant-zero operand?");
+ ActiveBits = CurrActiveBits;
+ // We want to at least halve the element size.
+ if (2 * ActiveBits > EltBitwidth)
+ return SDValue();
+ }
+
+ // This BUILD_VECTOR must have at least one non-constant-zero operand.
+ if (ActiveBits == 0)
+ return SDValue();
+
+ // We have EltBitwidth bits and the *minimal* chunk size is ActiveBits;
+ // into how many chunks can we split our element width?
+ EVT NewScalarIntVT, NewIntVT;
+ std::optional<unsigned> Factor;
+ // We can split the element into at least two chunks, but not into more
+ // than |_ EltBitwidth / ActiveBits _| chunks. Find the largest split factor
+ // that evenly divides the element width and for which the resulting
+ // types/operations on that chunk width are legal.
+ assert(2 * ActiveBits <= EltBitwidth &&
+ "We know that half or less bits of the element are active.");
+ for (unsigned Scale = EltBitwidth / ActiveBits; Scale >= 2; --Scale) {
+ if (EltBitwidth % Scale != 0)
+ continue;
+ unsigned ChunkBitwidth = EltBitwidth / Scale;
+ assert(ChunkBitwidth >= ActiveBits && "As per starting point.");
+ NewScalarIntVT = EVT::getIntegerVT(*DAG.getContext(), ChunkBitwidth);
+ NewIntVT = EVT::getVectorVT(*DAG.getContext(), NewScalarIntVT,
+ Scale * N->getNumOperands());
+ if (!TLI.isTypeLegal(NewScalarIntVT) || !TLI.isTypeLegal(NewIntVT) ||
+ (LegalOperations &&
+ !(TLI.isOperationLegalOrCustom(ISD::TRUNCATE, NewScalarIntVT) &&
+ TLI.isOperationLegalOrCustom(ISD::BUILD_VECTOR, NewIntVT))))
+ continue;
+ Factor = Scale;
+ break;
+ }
+ if (!Factor)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ZeroOp = DAG.getConstant(0, DL, NewScalarIntVT);
+
+ // Recreate the BUILD_VECTOR, with elements now being Factor times smaller.
+ SmallVector<SDValue, 16> NewOps;
+ NewOps.reserve(NewIntVT.getVectorNumElements());
+ for (auto I : enumerate(N->ops())) {
+ SDValue Op = I.value();
+ assert(!Op.isUndef() && "FIXME: after allowing UNDEF's, handle them here.");
+ unsigned SrcOpIdx = I.index();
+ if (KnownZeroOps[SrcOpIdx]) {
+ NewOps.append(*Factor, ZeroOp);
+ continue;
+ }
+ Op = DAG.getBitcast(OpIntVT, Op);
+ Op = DAG.getNode(ISD::TRUNCATE, DL, NewScalarIntVT, Op);
+ NewOps.emplace_back(Op);
+ NewOps.append(*Factor - 1, ZeroOp);
+ }
+ assert(NewOps.size() == NewIntVT.getVectorNumElements());
+ SDValue NewBV = DAG.getBuildVector(NewIntVT, DL, NewOps);
+ NewBV = DAG.getBitcast(VT, NewBV);
+ return NewBV;
+}
+
+SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ // A vector built entirely of undefs is undef.
+ if (ISD::allOperandsUndef(N))
+ return DAG.getUNDEF(VT);
+
+ // If this is a splat of a bitcast from another vector, change to a
+ // concat_vector.
+ // For example:
+ // (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) ->
+ // (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X))))
+ //
+ // If X is a build_vector itself, the concat can become a larger build_vector.
+ // TODO: Maybe this is useful for non-splat too?
+ if (!LegalOperations) {
+ if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) {
+ Splat = peekThroughBitcasts(Splat);
+ EVT SrcVT = Splat.getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
+ SrcVT.getVectorElementType(), NumElts);
+ if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
+ SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N),
+ NewVT, Ops);
+ return DAG.getBitcast(VT, Concat);
+ }
+ }
+ }
+ }
+
+ // Check if we can express the BUILD_VECTOR via a subvector extract.
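+ // For example (illustrative types):
+ // (v4i32 build_vector (extract_vector_elt v8i32 X, 4), ...,
+ // (extract_vector_elt v8i32 X, 7))
+ // --> (v4i32 extract_subvector X, 4)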
+ if (!LegalTypes && (N->getNumOperands() > 1)) {
+ SDValue Op0 = N->getOperand(0);
+ auto checkElem = [&](SDValue Op) -> uint64_t {
+ if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
+ (Op0.getOperand(0) == Op.getOperand(0)))
+ if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return CNode->getZExtValue();
+ return -1;
+ };
+
+ int Offset = checkElem(Op0);
+ for (unsigned i = 0; i < N->getNumOperands(); ++i) {
+ if (Offset + i != checkElem(N->getOperand(i))) {
+ Offset = -1;
+ break;
+ }
+ }
+
+ if ((Offset == 0) &&
+ (Op0.getOperand(0).getValueType() == N->getValueType(0)))
+ return Op0.getOperand(0);
+ if ((Offset != -1) &&
+ ((Offset % N->getValueType(0).getVectorNumElements()) ==
+ 0)) // IDX must be multiple of output size.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
+ Op0.getOperand(0), Op0.getOperand(1));
+ }
+
+ if (SDValue V = convertBuildVecZextToZext(N))
+ return V;
+
+ if (SDValue V = convertBuildVecZextToBuildVecWithZeros(N))
+ return V;
+
+ if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
+ return V;
+
+ if (SDValue V = reduceBuildVecTruncToBitCast(N))
+ return V;
+
+ if (SDValue V = reduceBuildVecToShuffle(N))
+ return V;
+
+ // A splat of a single element is a SPLAT_VECTOR if supported on the target.
+ // Do this late as some of the above may replace the splat.
+ if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand)
+ if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
+ assert(!V.isUndef() && "Splat of undef should have been handled earlier");
+ return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V);
+ }
+
+ return SDValue();
+}
+
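+// Fold a CONCAT_VECTORS whose operands are bitcasts of scalars (or undef) into
+// a single BUILD_VECTOR of those scalars, bitcast to the result type.
+// For example (illustrative, on a target where v2i32 is not a legal type):
+//   (v4i32 concat_vectors (v2i32 bitcast (i64 X)), (v2i32 bitcast (i64 Y)))
+//   --> (v4i32 bitcast (v2i64 build_vector X, Y))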
+static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT OpVT = N->getOperand(0).getValueType();
+
+ // If the operands are legal vectors, leave them alone.
+ if (TLI.isTypeLegal(OpVT) || OpVT.isScalableVector())
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SmallVector<SDValue, 8> Ops;
+
+ EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
+ SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
+
+ // Keep track of what we encounter.
+ bool AnyInteger = false;
+ bool AnyFP = false;
+ for (const SDValue &Op : N->ops()) {
+ if (ISD::BITCAST == Op.getOpcode() &&
+ !Op.getOperand(0).getValueType().isVector())
+ Ops.push_back(Op.getOperand(0));
+ else if (ISD::UNDEF == Op.getOpcode())
+ Ops.push_back(ScalarUndef);
+ else
+ return SDValue();
+
+ // Note whether we encounter an integer or floating point scalar.
+ // If it's neither, bail out; it could be something weird like x86mmx.
+ EVT LastOpVT = Ops.back().getValueType();
+ if (LastOpVT.isFloatingPoint())
+ AnyFP = true;
+ else if (LastOpVT.isInteger())
+ AnyInteger = true;
+ else
+ return SDValue();
+ }
+
+ // If any of the operands is a floating point scalar bitcast to a vector,
+ // use floating point types throughout, and bitcast everything.
+ // Replace UNDEFs by another scalar UNDEF node, of the final desired type.
+ if (AnyFP) {
+ SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
+ ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
+ if (AnyInteger) {
+ for (SDValue &Op : Ops) {
+ if (Op.getValueType() == SVT)
+ continue;
+ if (Op.isUndef())
+ Op = ScalarUndef;
+ else
+ Op = DAG.getBitcast(SVT, Op);
+ }
+ }
+ }
+
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
+ VT.getSizeInBits() / SVT.getSizeInBits());
+ return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
+}
+
+// Attempt to merge nested concat_vectors/undefs.
+// Fold concat_vectors(concat_vectors(x,y,z,w),u,u,concat_vectors(a,b,c,d))
+// --> concat_vectors(x,y,z,w,u,u,u,u,u,u,u,u,a,b,c,d)
+static SDValue combineConcatVectorOfConcatVectors(SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ // Ensure we're concatenating UNDEF and CONCAT_VECTORS nodes of similar types.
+ EVT SubVT;
+ SDValue FirstConcat;
+ for (const SDValue &Op : N->ops()) {
+ if (Op.isUndef())
+ continue;
+ if (Op.getOpcode() != ISD::CONCAT_VECTORS)
+ return SDValue();
+ if (!FirstConcat) {
+ SubVT = Op.getOperand(0).getValueType();
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
+ return SDValue();
+ FirstConcat = Op;
+ continue;
+ }
+ if (SubVT != Op.getOperand(0).getValueType())
+ return SDValue();
+ }
+ assert(FirstConcat && "Concat of all-undefs found");
+
+ SmallVector<SDValue> ConcatOps;
+ for (const SDValue &Op : N->ops()) {
+ if (Op.isUndef()) {
+ ConcatOps.append(FirstConcat->getNumOperands(), DAG.getUNDEF(SubVT));
+ continue;
+ }
+ ConcatOps.append(Op->op_begin(), Op->op_end());
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, ConcatOps);
+}
+
+// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
+// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
+// most two distinct vectors the same size as the result, attempt to turn this
+// into a legal shuffle.
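+// For example (illustrative types, if the target accepts the mask):
+//   (v8i32 concat_vectors (v4i32 extract_subvector v8i32 X, 0),
+//                         (v4i32 extract_subvector v8i32 Y, 4))
+//   --> (v8i32 vector_shuffle<0,1,2,3,12,13,14,15> X, Y)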
+static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N->getOperand(0).getValueType();
+
+ // We currently can't generate an appropriate shuffle for a scalable vector.
+ if (VT.isScalableVector())
+ return SDValue();
+
+ int NumElts = VT.getVectorNumElements();
+ int NumOpElts = OpVT.getVectorNumElements();
+
+ SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
+ SmallVector<int, 8> Mask;
+
+ for (SDValue Op : N->ops()) {
+ Op = peekThroughBitcasts(Op);
+
+ // UNDEF nodes convert to UNDEF shuffle mask values.
+ if (Op.isUndef()) {
+ Mask.append((unsigned)NumOpElts, -1);
+ continue;
+ }
+
+ if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return SDValue();
+
+ // What vector are we extracting the subvector from and at what index?
+ SDValue ExtVec = Op.getOperand(0);
+ int ExtIdx = Op.getConstantOperandVal(1);
+
+ // We want the EVT of the original extraction to correctly scale the
+ // extraction index.
+ EVT ExtVT = ExtVec.getValueType();
+ ExtVec = peekThroughBitcasts(ExtVec);
+
+ // UNDEF nodes convert to UNDEF shuffle mask values.
+ if (ExtVec.isUndef()) {
+ Mask.append((unsigned)NumOpElts, -1);
+ continue;
+ }
+
+ // Ensure that we are extracting a subvector from a vector the same
+ // size as the result.
+ if (ExtVT.getSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+
+ // Scale the subvector index to account for any bitcast.
+ int NumExtElts = ExtVT.getVectorNumElements();
+ if (0 == (NumExtElts % NumElts))
+ ExtIdx /= (NumExtElts / NumElts);
+ else if (0 == (NumElts % NumExtElts))
+ ExtIdx *= (NumElts / NumExtElts);
+ else
+ return SDValue();
+
+ // At most we can reference 2 inputs in the final shuffle.
+ if (SV0.isUndef() || SV0 == ExtVec) {
+ SV0 = ExtVec;
+ for (int i = 0; i != NumOpElts; ++i)
+ Mask.push_back(i + ExtIdx);
+ } else if (SV1.isUndef() || SV1 == ExtVec) {
+ SV1 = ExtVec;
+ for (int i = 0; i != NumOpElts; ++i)
+ Mask.push_back(i + ExtIdx + NumElts);
+ } else {
+ return SDValue();
+ }
+ }
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
+ DAG.getBitcast(VT, SV1), Mask, DAG);
+}
+
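+// Fold a concatenation of identical casts into a single cast of a wider
+// concatenation of the cast sources, e.g. (illustrative types, subject to the
+// legality checks below):
+//   (v8f32 concat_vectors (sint_to_fp v4i32 A), (sint_to_fp v4i32 B))
+//   --> (v8f32 sint_to_fp (v8i32 concat_vectors A, B))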
+static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) {
+ unsigned CastOpcode = N->getOperand(0).getOpcode();
+ switch (CastOpcode) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ // TODO: Allow more opcodes?
+ // case ISD::BITCAST:
+ // case ISD::TRUNCATE:
+ // case ISD::ZERO_EXTEND:
+ // case ISD::SIGN_EXTEND:
+ // case ISD::FP_EXTEND:
+ break;
+ default:
+ return SDValue();
+ }
+
+ EVT SrcVT = N->getOperand(0).getOperand(0).getValueType();
+ if (!SrcVT.isVector())
+ return SDValue();
+
+ // All operands of the concat must be the same kind of cast from the same
+ // source type.
+ SmallVector<SDValue, 4> SrcOps;
+ for (SDValue Op : N->ops()) {
+ if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() ||
+ Op.getOperand(0).getValueType() != SrcVT)
+ return SDValue();
+ SrcOps.push_back(Op.getOperand(0));
+ }
+
+ // The wider cast must be supported by the target. This is unusual because
+ // the operation support type parameter depends on the opcode. In addition,
+ // check the other type in the cast to make sure this is really legal.
+ EVT VT = N->getValueType(0);
+ EVT SrcEltVT = SrcVT.getVectorElementType();
+ ElementCount NumElts = SrcVT.getVectorElementCount() * N->getNumOperands();
+ EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ switch (CastOpcode) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) ||
+ !TLI.isTypeLegal(VT))
+ return SDValue();
+ break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) ||
+ !TLI.isTypeLegal(ConcatSrcVT))
+ return SDValue();
+ break;
+ default:
+ llvm_unreachable("Unexpected cast opcode");
+ }
+
+ // concat (cast X), (cast Y)... -> cast (concat X, Y...)
+ SDLoc DL(N);
+ SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps);
+ return DAG.getNode(CastOpcode, DL, VT, NewConcat);
+}
+
+// See if this is a simple CONCAT_VECTORS with no UNDEF operands, where one of
+// the operands is a SHUFFLE_VECTOR and all other operands are also operands
+// of that SHUFFLE_VECTOR; if so, create a wider SHUFFLE_VECTOR.
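+// For example (illustrative types, if the combined mask is legal):
+//   (v4i32 concat_vectors (v2i32 vector_shuffle<1,0> X, undef), (v2i32 X))
+//   --> (v4i32 vector_shuffle<1,0,0,1> (concat_vectors X, undef), undef)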
+static SDValue combineConcatVectorOfShuffleAndItsOperands(
+ SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, bool LegalTypes,
+ bool LegalOperations) {
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N->getOperand(0).getValueType();
+ if (VT.isScalableVector())
+ return SDValue();
+
+ // For now, only allow simple 2-operand concatenations.
+ if (N->getNumOperands() != 2)
+ return SDValue();
+
+ // Don't create illegal types/shuffles when not allowed to.
+ if ((LegalTypes && !TLI.isTypeLegal(VT)) ||
+ (LegalOperations &&
+ !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT)))
+ return SDValue();
+
+ // Analyze all of the operands of the CONCAT_VECTORS. Out of all of them,
+ // we want to find one that is: (1) a SHUFFLE_VECTOR (2) only used by us,
+ // and (3) all operands of CONCAT_VECTORS must be either that SHUFFLE_VECTOR,
+ // or one of the operands of that SHUFFLE_VECTOR (but not UNDEF!).
+ // (4) and for now, the SHUFFLE_VECTOR must be unary.
+ ShuffleVectorSDNode *SVN = nullptr;
+ for (SDValue Op : N->ops()) {
+ if (auto *CurSVN = dyn_cast<ShuffleVectorSDNode>(Op);
+ CurSVN && CurSVN->getOperand(1).isUndef() && N->isOnlyUserOf(CurSVN) &&
+ all_of(N->ops(), [CurSVN](SDValue Op) {
+ // FIXME: can we allow UNDEF operands?
+ return !Op.isUndef() &&
+ (Op.getNode() == CurSVN || is_contained(CurSVN->ops(), Op));
+ })) {
+ SVN = CurSVN;
+ break;
+ }
+ }
+ if (!SVN)
+ return SDValue();
+
+ // We are going to pad the shuffle operands, so any index that was picking
+ // from the second operand must be adjusted.
+ SmallVector<int, 16> AdjustedMask;
+ AdjustedMask.reserve(SVN->getMask().size());
+ assert(SVN->getOperand(1).isUndef() && "Expected unary shuffle!");
+ append_range(AdjustedMask, SVN->getMask());
+
+ // Identity masks for the operands of the (padded) shuffle.
+ SmallVector<int, 32> IdentityMask(2 * OpVT.getVectorNumElements());
+ MutableArrayRef<int> FirstShufOpIdentityMask =
+ MutableArrayRef<int>(IdentityMask)
+ .take_front(OpVT.getVectorNumElements());
+ MutableArrayRef<int> SecondShufOpIdentityMask =
+ MutableArrayRef<int>(IdentityMask).take_back(OpVT.getVectorNumElements());
+ std::iota(FirstShufOpIdentityMask.begin(), FirstShufOpIdentityMask.end(), 0);
+ std::iota(SecondShufOpIdentityMask.begin(), SecondShufOpIdentityMask.end(),
+ VT.getVectorNumElements());
+
+ // New combined shuffle mask.
+ SmallVector<int, 32> Mask;
+ Mask.reserve(VT.getVectorNumElements());
+ for (SDValue Op : N->ops()) {
+ assert(!Op.isUndef() && "Not expecting to concatenate UNDEF.");
+ if (Op.getNode() == SVN) {
+ append_range(Mask, AdjustedMask);
+ continue;
+ }
+ if (Op == SVN->getOperand(0)) {
+ append_range(Mask, FirstShufOpIdentityMask);
+ continue;
+ }
+ if (Op == SVN->getOperand(1)) {
+ append_range(Mask, SecondShufOpIdentityMask);
+ continue;
+ }
+ llvm_unreachable("Unexpected operand!");
+ }
+
+ // Don't create illegal shuffle masks.
+ if (!TLI.isShuffleMaskLegal(Mask, VT))
+ return SDValue();
+
+ // Pad the shuffle operands with UNDEF.
+ SDLoc dl(N);
+ std::array<SDValue, 2> ShufOps;
+ for (auto I : zip(SVN->ops(), ShufOps)) {
+ SDValue ShufOp = std::get<0>(I);
+ SDValue &NewShufOp = std::get<1>(I);
+ if (ShufOp.isUndef())
+ NewShufOp = DAG.getUNDEF(VT);
+ else {
+ SmallVector<SDValue, 2> ShufOpParts(N->getNumOperands(),
+ DAG.getUNDEF(OpVT));
+ ShufOpParts[0] = ShufOp;
+ NewShufOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, ShufOpParts);
+ }
+ }
+ // Finally, create the new wide shuffle.
+ return DAG.getVectorShuffle(VT, dl, ShufOps[0], ShufOps[1], Mask);
+}
+
+SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
+ // If we only have one input vector, we don't need to do any concatenation.
+ if (N->getNumOperands() == 1)
+ return N->getOperand(0);
+
+ // Check if all of the operands are undefs.
+ EVT VT = N->getValueType(0);
+ if (ISD::allOperandsUndef(N))
+ return DAG.getUNDEF(VT);
+
+ // Optimize concat_vectors where all but the first of the vectors are undef.
+ if (all_of(drop_begin(N->ops()),
+ [](const SDValue &Op) { return Op.isUndef(); })) {
+ SDValue In = N->getOperand(0);
+ assert(In.getValueType().isVector() && "Must concat vectors");
+
+ // If the input is a concat_vectors, just make a larger concat by padding
+ // with smaller undefs.
+ //
+ // Legalizing in AArch64TargetLowering::LowerCONCAT_VECTORS() and combining
+ // here could cause an infinite loop. That legalization happens when LegalDAG
+ // is true and the input to AArch64TargetLowering::LowerCONCAT_VECTORS() is
+ // scalable.
+ if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse() &&
+ !(LegalDAG && In.getValueType().isScalableVector())) {
+ unsigned NumOps = N->getNumOperands() * In.getNumOperands();
+ SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
+ Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+ }
+
+ SDValue Scalar = peekThroughOneUseBitcasts(In);
+
+ // concat_vectors(scalar_to_vector(scalar), undef) ->
+ // scalar_to_vector(scalar)
+ if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ Scalar.hasOneUse()) {
+ EVT SVT = Scalar.getValueType().getVectorElementType();
+ if (SVT == Scalar.getOperand(0).getValueType())
+ Scalar = Scalar.getOperand(0);
+ }
+
+ // concat_vectors(scalar, undef) -> scalar_to_vector(scalar)
+ if (!Scalar.getValueType().isVector()) {
+ // If the bitcast type isn't legal, it might be a trunc of a legal type;
+ // look through the trunc so we can still do the transform:
+ // concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
+ if (Scalar->getOpcode() == ISD::TRUNCATE &&
+ !TLI.isTypeLegal(Scalar.getValueType()) &&
+ TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
+ Scalar = Scalar->getOperand(0);
+
+ EVT SclTy = Scalar.getValueType();
+
+ if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
+ return SDValue();
+
+ // Bail out if the vector size is not a multiple of the scalar size.
+ if (VT.getSizeInBits() % SclTy.getSizeInBits())
+ return SDValue();
+
+ unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
+ if (VNTNumElms < 2)
+ return SDValue();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
+ if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
+ return SDValue();
+
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+
+ // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
+ // We have already tested above for an UNDEF only concatenation.
+ // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
+ // -> (BUILD_VECTOR A, B, ..., C, D, ...)
+ auto IsBuildVectorOrUndef = [](const SDValue &Op) {
+ return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
+ };
+ if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
+ SmallVector<SDValue, 8> Opnds;
+ EVT SVT = VT.getScalarType();
+
+ EVT MinVT = SVT;
+ if (!SVT.isFloatingPoint()) {
+ // If the BUILD_VECTORs are built from integers, they may have different
+ // operand types. Get the smallest type and truncate all operands to it.
+ bool FoundMinVT = false;
+ for (const SDValue &Op : N->ops())
+ if (ISD::BUILD_VECTOR == Op.getOpcode()) {
+ EVT OpSVT = Op.getOperand(0).getValueType();
+ MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
+ FoundMinVT = true;
+ }
+ assert(FoundMinVT && "Concat vector type mismatch");
+ }
+
+ for (const SDValue &Op : N->ops()) {
+ EVT OpVT = Op.getValueType();
+ unsigned NumElts = OpVT.getVectorNumElements();
+
+ if (ISD::UNDEF == Op.getOpcode())
+ Opnds.append(NumElts, DAG.getUNDEF(MinVT));
+
+ if (ISD::BUILD_VECTOR == Op.getOpcode()) {
+ if (SVT.isFloatingPoint()) {
+ assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
+ Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
+ } else {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Opnds.push_back(
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
+ }
+ }
+ }
+
+ assert(VT.getVectorNumElements() == Opnds.size() &&
+ "Concat vector type mismatch");
+ return DAG.getBuildVector(VT, SDLoc(N), Opnds);
+ }
+
+ // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
+ // FIXME: Add support for concat_vectors(bitcast(vec0),bitcast(vec1),...).
+ if (SDValue V = combineConcatVectorOfScalars(N, DAG))
+ return V;
+
+ if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
+ // Fold CONCAT_VECTORS of CONCAT_VECTORS (or undef) to VECTOR_SHUFFLE.
+ if (SDValue V = combineConcatVectorOfConcatVectors(N, DAG))
+ return V;
+
+ // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
+ if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
+ return V;
+ }
+
+ if (SDValue V = combineConcatVectorOfCasts(N, DAG))
+ return V;
+
+ if (SDValue V = combineConcatVectorOfShuffleAndItsOperands(
+ N, DAG, TLI, LegalTypes, LegalOperations))
+ return V;
+
+ // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
+ // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR
+ // operands and look for CONCAT operations that place the incoming vectors
+ // at the exact same location.
+ //
+ // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled.
+ SDValue SingleSource = SDValue();
+ unsigned PartNumElem =
+ N->getOperand(0).getValueType().getVectorMinNumElements();
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDValue Op = N->getOperand(i);
+
+ if (Op.isUndef())
+ continue;
+
+ // Check if this is the identity extract:
+ if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return SDValue();
+
+ // Find the single incoming vector for the extract_subvector.
+ if (SingleSource.getNode()) {
+ if (Op.getOperand(0) != SingleSource)
+ return SDValue();
+ } else {
+ SingleSource = Op.getOperand(0);
+
+ // Check that the source type is the same as the type of the result.
+ // If not, this concat may extend the vector, so we cannot
+ // optimize it away.
+ if (SingleSource.getValueType() != N->getValueType(0))
+ return SDValue();
+ }
+
+ // Check that we are reading from the identity index.
+ unsigned IdentityIndex = i * PartNumElem;
+ if (Op.getConstantOperandAPInt(1) != IdentityIndex)
+ return SDValue();
+ }
+
+ if (SingleSource.getNode())
+ return SingleSource;
+
+ return SDValue();
+}
+
+// Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
+// if the subvector can be sourced for free.
+static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) {
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) {
+ return V.getOperand(1);
+ }
+ auto *IndexC = dyn_cast<ConstantSDNode>(Index);
+ if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
+ V.getOperand(0).getValueType() == SubVT &&
+ (IndexC->getZExtValue() % SubVT.getVectorMinNumElements()) == 0) {
+ uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorMinNumElements();
+ return V.getOperand(SubIdx);
+ }
+ return SDValue();
+}
+
+static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
+ SelectionDAG &DAG,
+ bool LegalOperations) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue BinOp = Extract->getOperand(0);
+ unsigned BinOpcode = BinOp.getOpcode();
+ if (!TLI.isBinOp(BinOpcode) || BinOp->getNumValues() != 1)
+ return SDValue();
+
+ EVT VecVT = BinOp.getValueType();
+ SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
+ if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType())
+ return SDValue();
+
+ SDValue Index = Extract->getOperand(1);
+ EVT SubVT = Extract->getValueType(0);
+ if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT, LegalOperations))
+ return SDValue();
+
+ SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT);
+ SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT);
+
+ // TODO: We could handle the case where only 1 operand is being inserted by
+ // creating an extract of the other operand, but that requires checking
+ // number of uses and/or costs.
+ if (!Sub0 || !Sub1)
+ return SDValue();
+
+ // We are inserting both operands of the wide binop only to extract back
+ // to the narrow vector size. Eliminate all of the insert/extract:
+ // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
+ return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1,
+ BinOp->getFlags());
+}
+
+/// If we are extracting a subvector produced by a wide binary operator try
+/// to use a narrow binary operator and/or avoid concatenation and extraction.
+static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG,
+ bool LegalOperations) {
+ // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
+ // some of these bailouts with other transforms.
+
+ if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG, LegalOperations))
+ return V;
+
+ // The extract index must be a constant, so we can map it to a concat operand.
+ auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+ if (!ExtractIndexC)
+ return SDValue();
+
+ // We are looking for an optionally bitcasted wide vector binary operator
+ // feeding an extract subvector.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0));
+ unsigned BOpcode = BinOp.getOpcode();
+ if (!TLI.isBinOp(BOpcode) || BinOp->getNumValues() != 1)
+ return SDValue();
+
+ // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be
+ // reduced to the unary fneg when it is visited, and we probably want to deal
+ // with fneg in a target-specific way.
+ if (BOpcode == ISD::FSUB) {
+ auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true);
+ if (C && C->getValueAPF().isNegZero())
+ return SDValue();
+ }
+
+ // The binop must be a vector type, so we can extract some fraction of it.
+ EVT WideBVT = BinOp.getValueType();
+ // The optimisations below currently assume we are dealing with fixed length
+ // vectors. It is possible to add support for scalable vectors, but at the
+ // moment we've done no analysis to prove whether they are profitable or not.
+ if (!WideBVT.isFixedLengthVector())
+ return SDValue();
+
+ EVT VT = Extract->getValueType(0);
+ unsigned ExtractIndex = ExtractIndexC->getZExtValue();
+ assert(ExtractIndex % VT.getVectorNumElements() == 0 &&
+ "Extract index is not a multiple of the vector length.");
+
+ // Bail out if this is not a proper multiple width extraction.
+ unsigned WideWidth = WideBVT.getSizeInBits();
+ unsigned NarrowWidth = VT.getSizeInBits();
+ if (WideWidth % NarrowWidth != 0)
+ return SDValue();
+
+ // Bail out if we are extracting a fraction of a single operation. This can
+ // occur because we potentially looked through a bitcast of the binop.
+ unsigned NarrowingRatio = WideWidth / NarrowWidth;
+ unsigned WideNumElts = WideBVT.getVectorNumElements();
+ if (WideNumElts % NarrowingRatio != 0)
+ return SDValue();
+
+ // Bail out if the target does not support a narrower version of the binop.
+ EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
+ WideNumElts / NarrowingRatio);
+ if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
+ return SDValue();
+
+ // If extraction is cheap, we don't need to look at the binop operands
+ // for concat ops. The narrow binop alone makes this transform profitable.
+ // We can't just reuse the original extract index operand because we may have
+ // bitcasted.
+ unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();
+ unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
+ if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
+ BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
+ // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
+ SDLoc DL(Extract);
+ SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL);
+ SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+ BinOp.getOperand(0), NewExtIndex);
+ SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+ BinOp.getOperand(1), NewExtIndex);
+ SDValue NarrowBinOp =
+ DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, BinOp->getFlags());
+ return DAG.getBitcast(VT, NarrowBinOp);
+ }
+
+ // Only handle the case where we are doubling and then halving. A larger ratio
+ // may require more than two narrow binops to replace the wide binop.
+ if (NarrowingRatio != 2)
+ return SDValue();
+
+ // TODO: The motivating case for this transform is an x86 AVX1 target. That
+ // target has temptingly almost legal versions of bitwise logic ops in 256-bit
+ // flavors, but no other 256-bit integer support. This could be extended to
+ // handle any binop, but that may require fixing/adding other folds to avoid
+ // codegen regressions.
+ if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
+ return SDValue();
+
+ // We need at least one concatenation operation of a binop operand to make
+ // this transform worthwhile. The concat must double the input vector sizes.
+ auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue {
+ if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2)
+ return V.getOperand(ConcatOpNum);
+ return SDValue();
+ };
+ SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0)));
+ SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1)));
+
+ if (SubVecL || SubVecR) {
+ // If a binop operand was not the result of a concat, we must extract a
+ // half-sized operand for our new narrow binop:
+ // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
+ // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC)
+ // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN
+ SDLoc DL(Extract);
+ SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL);
+ SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL)
+ : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+ BinOp.getOperand(0), IndexC);
+
+ SDValue Y = SubVecR ? DAG.getBitcast(NarrowBVT, SubVecR)
+ : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+ BinOp.getOperand(1), IndexC);
+
+ SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
+ return DAG.getBitcast(VT, NarrowBinOp);
+ }
+
+ return SDValue();
+}
+
+/// If we are extracting a subvector from a wide vector load, convert to a
+/// narrow load to eliminate the extraction:
+/// (extract_subvector (load wide vector)) --> (load narrow vector)
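+/// For example (illustrative, little-endian):
+///   (v2i64 extract_subvector (v4i64 load %ptr), 2)
+///   --> (v2i64 load %ptr + 16 bytes)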
+static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
+ // TODO: Add support for big-endian. The offset calculation must be adjusted.
+ if (DAG.getDataLayout().isBigEndian())
+ return SDValue();
+
+ auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
+ if (!Ld || Ld->getExtensionType() || !Ld->isSimple())
+ return SDValue();
+
+ // Allow targets to opt-out.
+ EVT VT = Extract->getValueType(0);
+
+ // We can only create byte sized loads.
+ if (!VT.isByteSized())
+ return SDValue();
+
+ unsigned Index = Extract->getConstantOperandVal(1);
+ unsigned NumElts = VT.getVectorMinNumElements();
+
+ // The definition of EXTRACT_SUBVECTOR states that the index must be a
+ // multiple of the minimum number of elements in the result type.
+ assert(Index % NumElts == 0 && "The extract subvector index is not a "
+ "multiple of the result's element count");
+
+ // It's fine to use TypeSize here as we know the offset will not be negative.
+ TypeSize Offset = VT.getStoreSize() * (Index / NumElts);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
+ return SDValue();
+
+ // The narrow load will be offset from the base address of the old load if
+ // we are extracting from something besides index 0 (little-endian).
+ SDLoc DL(Extract);
+
+ // TODO: Use "BaseIndexOffset" to make this more effective.
+ SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL);
+
+ uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize());
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO;
+ if (Offset.isScalable()) {
+ MachinePointerInfo MPI =
+ MachinePointerInfo(Ld->getPointerInfo().getAddrSpace());
+ MMO = MF.getMachineMemOperand(Ld->getMemOperand(), MPI, StoreSize);
+ } else
+ MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset.getFixedValue(),
+ StoreSize);
+
+ SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
+ return NewLd;
+}
+
+/// Given EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(Op0, Op1, Mask)),
+/// try to produce VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(Op?, ?),
+/// EXTRACT_SUBVECTOR(Op?, ?),
+/// Mask'))
+/// iff it is legal and profitable to do so. Notably, the trimmed mask
+/// (containing only the elements that are extracted)
+/// must reference at most two subvectors.
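+///
+/// For example (illustrative types, subject to the checks below):
+///   (v4i32 extract_subvector (v8i32 vector_shuffle<8,9,2,3,u,u,u,u> A, B), 0)
+///   --> (v4i32 vector_shuffle<0,1,6,7> (extract_subvector B, 0),
+///                                      (extract_subvector A, 0))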
+static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalOperations) {
+ assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ "Must only be called on EXTRACT_SUBVECTOR's");
+
+ SDValue N0 = N->getOperand(0);
+
+ // Only deal with non-scalable vectors.
+ EVT NarrowVT = N->getValueType(0);
+ EVT WideVT = N0.getValueType();
+ if (!NarrowVT.isFixedLengthVector() || !WideVT.isFixedLengthVector())
+ return SDValue();
+
+ // The operand must be a shufflevector.
+ auto *WideShuffleVector = dyn_cast<ShuffleVectorSDNode>(N0);
+ if (!WideShuffleVector)
+ return SDValue();
+
+ // The old shuffle needs to go away.
+ if (!WideShuffleVector->hasOneUse())
+ return SDValue();
+
+ // And the narrow shufflevector that we'll form must be legal.
+ if (LegalOperations &&
+ !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, NarrowVT))
+ return SDValue();
+
+ uint64_t FirstExtractedEltIdx = N->getConstantOperandVal(1);
+ int NumEltsExtracted = NarrowVT.getVectorNumElements();
+ assert((FirstExtractedEltIdx % NumEltsExtracted) == 0 &&
+ "Extract index is not a multiple of the output vector length.");
+
+ int WideNumElts = WideVT.getVectorNumElements();
+
+ SmallVector<int, 16> NewMask;
+ NewMask.reserve(NumEltsExtracted);
+ SmallSetVector<std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>, 2>
+ DemandedSubvectors;
+
+ // Try to decode the wide mask into narrow mask from at most two subvectors.
+ for (int M : WideShuffleVector->getMask().slice(FirstExtractedEltIdx,
+ NumEltsExtracted)) {
+ assert((M >= -1) && (M < (2 * WideNumElts)) &&
+ "Out-of-bounds shuffle mask?");
+
+ if (M < 0) {
+ // Does not depend on operands, does not require adjustment.
+ NewMask.emplace_back(M);
+ continue;
+ }
+
+ // From which operand of the shuffle does this shuffle mask element pick?
+ int WideShufOpIdx = M / WideNumElts;
+ // Which element of that operand is picked?
+ int OpEltIdx = M % WideNumElts;
+
+ assert((OpEltIdx + WideShufOpIdx * WideNumElts) == M &&
+ "Shuffle mask vector decomposition failure.");
+
+ // And which NumEltsExtracted-sized subvector of that operand is that?
+ int OpSubvecIdx = OpEltIdx / NumEltsExtracted;
+ // And which element within that subvector of that operand is that?
+ int OpEltIdxInSubvec = OpEltIdx % NumEltsExtracted;
+
+ assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted) == OpEltIdx &&
+ "Shuffle mask subvector decomposition failure.");
+
+ assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted +
+ WideShufOpIdx * WideNumElts) == M &&
+ "Shuffle mask full decomposition failure.");
+
+ SDValue Op = WideShuffleVector->getOperand(WideShufOpIdx);
+
+ if (Op.isUndef()) {
+ // Picking from an undef operand. Let's adjust mask instead.
+ NewMask.emplace_back(-1);
+ continue;
+ }
+
+ const std::pair<SDValue, int> DemandedSubvector =
+ std::make_pair(Op, OpSubvecIdx);
+
+ if (DemandedSubvectors.insert(DemandedSubvector)) {
+ if (DemandedSubvectors.size() > 2)
+ return SDValue(); // We can't handle more than two subvectors.
+ // How many elements into the WideVT does this subvector start?
+ int Index = NumEltsExtracted * OpSubvecIdx;
+ // Bail out if the extraction isn't going to be cheap.
+ if (!TLI.isExtractSubvectorCheap(NarrowVT, WideVT, Index))
+ return SDValue();
+ }
+
+ // Ok, but from which operand of the new shuffle will this element pick?
+ int NewOpIdx =
+ getFirstIndexOf(DemandedSubvectors.getArrayRef(), DemandedSubvector);
+ assert((NewOpIdx == 0 || NewOpIdx == 1) && "Unexpected operand index.");
+
+ int AdjM = OpEltIdxInSubvec + NewOpIdx * NumEltsExtracted;
+ NewMask.emplace_back(AdjM);
+ }
+ assert(NewMask.size() == (unsigned)NumEltsExtracted && "Produced bad mask.");
+ assert(DemandedSubvectors.size() <= 2 &&
+ "Should have ended up demanding at most two subvectors.");
+
+ // Did we discover that the shuffle does not actually depend on operands?
+ if (DemandedSubvectors.empty())
+ return DAG.getUNDEF(NarrowVT);
+
+ // Profitability check: only deal with extractions from the first subvector
+ // unless the mask becomes an identity mask.
+ if (!ShuffleVectorInst::isIdentityMask(NewMask) ||
+ any_of(NewMask, [](int M) { return M < 0; }))
+ for (auto &DemandedSubvector : DemandedSubvectors)
+ if (DemandedSubvector.second != 0)
+ return SDValue();
+
+ // We still perform the exact same EXTRACT_SUBVECTOR, just on different
+ // operand[s]/index[es], so there is no point in checking its legality.
+
+ // Do not turn a legal shuffle into an illegal one.
+ if (TLI.isShuffleMaskLegal(WideShuffleVector->getMask(), WideVT) &&
+ !TLI.isShuffleMaskLegal(NewMask, NarrowVT))
+ return SDValue();
+
+ SDLoc DL(N);
+
+ SmallVector<SDValue, 2> NewOps;
+ for (const std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>
+ &DemandedSubvector : DemandedSubvectors) {
+ // How many elements into the WideVT does this subvector start?
+ int Index = NumEltsExtracted * DemandedSubvector.second;
+ SDValue IndexC = DAG.getVectorIdxConstant(Index, DL);
+ NewOps.emplace_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT,
+ DemandedSubvector.first, IndexC));
+ }
+ assert((NewOps.size() == 1 || NewOps.size() == 2) &&
+ "Should end up with either one or two ops");
+
+ // If we ended up with only one operand, pad with an undef.
+ if (NewOps.size() == 1)
+ NewOps.emplace_back(DAG.getUNDEF(NarrowVT));
+
+ return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask);
+}
+
+SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
+ EVT NVT = N->getValueType(0);
+ SDValue V = N->getOperand(0);
+ uint64_t ExtIdx = N->getConstantOperandVal(1);
+
+ // Extract from UNDEF is UNDEF.
+ if (V.isUndef())
+ return DAG.getUNDEF(NVT);
+
+ if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
+ if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
+ return NarrowLoad;
+
+ // Combine an extract of an extract into a single extract_subvector.
+ // ext (ext X, C), 0 --> ext X, C
+ if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
+ if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
+ V.getConstantOperandVal(1)) &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0),
+ V.getOperand(1));
+ }
+ }
+
+ // ty1 extract_vector(ty2 splat(V)) -> ty1 splat(V)
+ if (V.getOpcode() == ISD::SPLAT_VECTOR)
+ if (DAG.isConstantValueOfAnyType(V.getOperand(0)) || V.hasOneUse())
+ if (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, NVT))
+ return DAG.getSplatVector(NVT, SDLoc(N), V.getOperand(0));
+
+ // Try to move vector bitcast after extract_subv by scaling extraction index:
+ // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
+ if (V.getOpcode() == ISD::BITCAST &&
+ V.getOperand(0).getValueType().isVector() &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT))) {
+ SDValue SrcOp = V.getOperand(0);
+ EVT SrcVT = SrcOp.getValueType();
+ unsigned SrcNumElts = SrcVT.getVectorMinNumElements();
+ unsigned DestNumElts = V.getValueType().getVectorMinNumElements();
+ if ((SrcNumElts % DestNumElts) == 0) {
+ unsigned SrcDestRatio = SrcNumElts / DestNumElts;
+ ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio;
+ EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+ NewExtEC);
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+ SDLoc DL(N);
+ SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL);
+ SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+ V.getOperand(0), NewIndex);
+ return DAG.getBitcast(NVT, NewExtract);
+ }
+ }
+ if ((DestNumElts % SrcNumElts) == 0) {
+ unsigned DestSrcRatio = DestNumElts / SrcNumElts;
+ if (NVT.getVectorElementCount().isKnownMultipleOf(DestSrcRatio)) {
+ ElementCount NewExtEC =
+ NVT.getVectorElementCount().divideCoefficientBy(DestSrcRatio);
+ EVT ScalarVT = SrcVT.getScalarType();
+ if ((ExtIdx % DestSrcRatio) == 0) {
+ SDLoc DL(N);
+ unsigned IndexValScaled = ExtIdx / DestSrcRatio;
+ EVT NewExtVT =
+ EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC);
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+ SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
+ SDValue NewExtract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+ V.getOperand(0), NewIndex);
+ return DAG.getBitcast(NVT, NewExtract);
+ }
+ if (NewExtEC.isScalar() &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) {
+ SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
+ SDValue NewExtract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
+ V.getOperand(0), NewIndex);
+ return DAG.getBitcast(NVT, NewExtract);
+ }
+ }
+ }
+ }
+ }
+
+ if (V.getOpcode() == ISD::CONCAT_VECTORS) {
+ unsigned ExtNumElts = NVT.getVectorMinNumElements();
+ EVT ConcatSrcVT = V.getOperand(0).getValueType();
+ assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
+ "Concat and extract subvector do not change element type");
+ assert((ExtIdx % ExtNumElts) == 0 &&
+ "Extract index is not a multiple of the input vector length.");
+
+ unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
+ unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
+
+ // If the concatenated source types match this extract, it's a direct
+ // simplification:
+ // extract_subvec (concat V1, V2, ...), i --> Vi
+ if (NVT.getVectorElementCount() == ConcatSrcVT.getVectorElementCount())
+ return V.getOperand(ConcatOpIdx);
+
+ // If the concatenated source vectors are a multiple length of this extract,
+ // then extract a fraction of one of those source vectors directly from a
+ // concat operand. Example:
+ // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14 -->
+ // v2i8 extract_subvec v8i8 Y, 6
+ if (NVT.isFixedLengthVector() && ConcatSrcVT.isFixedLengthVector() &&
+ ConcatSrcNumElts % ExtNumElts == 0) {
+ SDLoc DL(N);
+ unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
+ assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
+ "Trying to extract from >1 concat operand?");
+ assert(NewExtIdx % ExtNumElts == 0 &&
+ "Extract index is not a multiple of the input vector length.");
+ SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
+ V.getOperand(ConcatOpIdx), NewIndexC);
+ }
+ }
+
+ if (SDValue V =
+ foldExtractSubvectorFromShuffleVector(N, DAG, TLI, LegalOperations))
+ return V;
+
+ V = peekThroughBitcasts(V);
+
+ // If the input is a build vector, try to make a smaller build vector.
+ if (V.getOpcode() == ISD::BUILD_VECTOR) {
+ EVT InVT = V.getValueType();
+ unsigned ExtractSize = NVT.getSizeInBits();
+ unsigned EltSize = InVT.getScalarSizeInBits();
+ // Only do this if we won't split any elements.
+ if (ExtractSize % EltSize == 0) {
+ unsigned NumElems = ExtractSize / EltSize;
+ EVT EltVT = InVT.getVectorElementType();
+ EVT ExtractVT =
+ NumElems == 1 ? EltVT
+ : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
+ if ((Level < AfterLegalizeDAG ||
+ (NumElems == 1 ||
+ TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
+ (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
+ unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize;
+
+ if (NumElems == 1) {
+ SDValue Src = V->getOperand(IdxVal);
+ if (EltVT != Src.getValueType())
+ Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Src);
+ return DAG.getBitcast(NVT, Src);
+ }
+
+ // Extract the pieces from the original build_vector.
+ SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
+ V->ops().slice(IdxVal, NumElems));
+ return DAG.getBitcast(NVT, BuildVec);
+ }
+ }
+ }
+
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+ // Handle only the simple case where the vector being inserted and the
+ // vector being extracted are of the same size.
+ EVT SmallVT = V.getOperand(1).getValueType();
+ if (!NVT.bitsEq(SmallVT))
+ return SDValue();
+
+ // Combine:
+ // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
+ // Into:
+ // indices are equal or bit offsets are equal => V1
+ // otherwise => (extract_subvec V1, ExtIdx)
+ uint64_t InsIdx = V.getConstantOperandVal(2);
+ if (InsIdx * SmallVT.getScalarSizeInBits() ==
+ ExtIdx * NVT.getScalarSizeInBits()) {
+ if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT))
+ return SDValue();
+
+ return DAG.getBitcast(NVT, V.getOperand(1));
+ }
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
+ DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
+ N->getOperand(1));
+ }
+
+ if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations))
+ return NarrowBOp;
+
+ if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
+/// followed by concatenation. Narrow vector ops may have better performance
+/// than wide ops, and this can unlock further narrowing of other vector ops.
+/// Targets can invert this transform later if it is not profitable.
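+///
+/// For example (illustrative types, if both half-masks are legal):
+///   (v8i32 vector_shuffle<0,8,1,9,2,10,3,11> (concat_vectors X, undef),
+///                                            (concat_vectors Y, undef))
+///   --> (v8i32 concat_vectors (v4i32 vector_shuffle<0,4,1,5> X, Y),
+///                             (v4i32 vector_shuffle<2,6,3,7> X, Y))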
+static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
+ SelectionDAG &DAG) {
+ SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
+ if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
+ N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
+ !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
+ return SDValue();
+
+ // Split the wide shuffle mask into halves. Any mask element that is accessing
+ // operand 1 is offset down to account for narrowing of the vectors.
+ ArrayRef<int> Mask = Shuf->getMask();
+ EVT VT = Shuf->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ SmallVector<int, 16> Mask0(HalfNumElts, -1);
+ SmallVector<int, 16> Mask1(HalfNumElts, -1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ // If we reference the upper (undef) subvector then the element is undef.
+ if ((Mask[i] % NumElts) >= HalfNumElts)
+ continue;
+ int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
+ if (i < HalfNumElts)
+ Mask0[i] = M;
+ else
+ Mask1[i - HalfNumElts] = M;
+ }
+
+ // Ask the target if this is a valid transform.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+ HalfNumElts);
+ if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
+ !TLI.isShuffleMaskLegal(Mask1, HalfVT))
+ return SDValue();
+
+ // shuffle (concat X, undef), (concat Y, undef), Mask -->
+ // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
+ SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
+ SDLoc DL(Shuf);
+ SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
+ SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
+}
+
+// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
+// or turn a shuffle of a single concat into simpler shuffle then concat.
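+// For example (illustrative types):
+//   (v8i32 vector_shuffle<4,5,6,7,12,13,14,15> (concat_vectors A, B),
+//                                              (concat_vectors C, D))
+//   --> (v8i32 concat_vectors B, D)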
+static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+ ArrayRef<int> Mask = SVN->getMask();
+
+ SmallVector<SDValue, 4> Ops;
+ EVT ConcatVT = N0.getOperand(0).getValueType();
+ unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
+ unsigned NumConcats = NumElts / NumElemsPerConcat;
+
+ auto IsUndefMaskElt = [](int i) { return i == -1; };
+
+ // Special case: shuffle(concat(A,B)) can be more efficiently represented
+ // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
+ // half vector elements.
+ if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
+ llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat),
+ IsUndefMaskElt)) {
+ N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0),
+ N0.getOperand(1),
+ Mask.slice(0, NumElemsPerConcat));
+ N1 = DAG.getUNDEF(ConcatVT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
+ }
+
+ // Look at every vector that's inserted. We're looking for exact
+ // subvector-sized copies from a concatenated vector.
+ for (unsigned I = 0; I != NumConcats; ++I) {
+ unsigned Begin = I * NumElemsPerConcat;
+ ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat);
+
+ // Make sure we're dealing with a copy.
+ if (llvm::all_of(SubMask, IsUndefMaskElt)) {
+ Ops.push_back(DAG.getUNDEF(ConcatVT));
+ continue;
+ }
+
+ int OpIdx = -1;
+ for (int i = 0; i != (int)NumElemsPerConcat; ++i) {
+ if (IsUndefMaskElt(SubMask[i]))
+ continue;
+ if ((SubMask[i] % (int)NumElemsPerConcat) != i)
+ return SDValue();
+ int EltOpIdx = SubMask[i] / NumElemsPerConcat;
+ if (0 <= OpIdx && EltOpIdx != OpIdx)
+ return SDValue();
+ OpIdx = EltOpIdx;
+ }
+ assert(0 <= OpIdx && "Unknown concat_vectors op");
+
+ if (OpIdx < (int)N0.getNumOperands())
+ Ops.push_back(N0.getOperand(OpIdx));
+ else
+ Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands()));
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+}
+
+// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
+// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
+//
+// SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
+// a simplification in some sense, but it isn't appropriate in general: some
+// BUILD_VECTORs are substantially cheaper than others. The general case
+// of a BUILD_VECTOR requires inserting each element individually (or
+// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
+// all constants is a single constant pool load. A BUILD_VECTOR where each
+// element is identical is a splat. A BUILD_VECTOR where most of the operands
+// are undef lowers to a small number of element insertions.
+//
+// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
+// We don't fold shuffles where one side is a non-zero constant, and we don't
+// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
+// non-constant operands. This seems to work out reasonably well in practice.
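+//
+// A minimal sketch of the intended fold (illustrative; a..d are scalar
+// operands that pass the duplicate/constant checks below):
+//   shuffle(BUILD_VECTOR(a,b,c,d), undef, <1,0,3,2>)
+//     --> BUILD_VECTOR(b,a,d,c)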
+static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ EVT VT = SVN->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue N0 = SVN->getOperand(0);
+ SDValue N1 = SVN->getOperand(1);
+
+ if (!N0->hasOneUse())
+ return SDValue();
+
+ // If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as
+ // discussed above.
+ if (!N1.isUndef()) {
+ if (!N1->hasOneUse())
+ return SDValue();
+
+ bool N0AnyConst = isAnyConstantBuildVector(N0);
+ bool N1AnyConst = isAnyConstantBuildVector(N1);
+ if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
+ return SDValue();
+ if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
+ return SDValue();
+ }
+
+ // If both inputs are splats of the same value then we can safely merge this
+ // to a single BUILD_VECTOR with undef elements based on the shuffle mask.
+ bool IsSplat = false;
+ auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+ if (BV0 && BV1)
+ if (SDValue Splat0 = BV0->getSplatValue())
+ IsSplat = (Splat0 == BV1->getSplatValue());
+
+ SmallVector<SDValue, 8> Ops;
+ SmallSet<SDValue, 16> DuplicateOps;
+ for (int M : SVN->getMask()) {
+ SDValue Op = DAG.getUNDEF(VT.getScalarType());
+ if (M >= 0) {
+ int Idx = M < (int)NumElts ? M : M - NumElts;
+ SDValue &S = (M < (int)NumElts ? N0 : N1);
+ if (S.getOpcode() == ISD::BUILD_VECTOR) {
+ Op = S.getOperand(Idx);
+ } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ SDValue Op0 = S.getOperand(0);
+ Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType());
+ } else {
+ // Operand can't be combined - bail out.
+ return SDValue();
+ }
+ }
+
+ // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
+ // generating a splat; semantically, this is fine, but it's likely to
+ // generate low-quality code if the target can't reconstruct an appropriate
+ // shuffle.
+ if (!Op.isUndef() && !isIntOrFPConstant(Op))
+ if (!IsSplat && !DuplicateOps.insert(Op).second)
+ return SDValue();
+
+ Ops.push_back(Op);
+ }
+
+ // BUILD_VECTOR requires all inputs to be of the same type; find the
+ // maximum type and extend them all.
+ EVT SVT = VT.getScalarType();
+ if (SVT.isInteger())
+ for (SDValue &Op : Ops)
+ SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+ if (SVT != VT.getScalarType())
+ for (SDValue &Op : Ops)
+ Op = Op.isUndef() ? DAG.getUNDEF(SVT)
+ : (TLI.isZExtFree(Op.getValueType(), SVT)
+ ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
+ : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT));
+ return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
+}
+
+// Match shuffles that can be converted to *_vector_extend_in_reg.
+// This is often generated during legalization.
+// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)),
+// and returns the EVT to which the extension should be performed.
+// NOTE: this assumes that the src is the first operand of the shuffle.
+static std::optional<EVT> canCombineShuffleToExtendVectorInreg(
+ unsigned Opcode, EVT VT, std::function<bool(unsigned)> Match,
+ SelectionDAG &DAG, const TargetLowering &TLI, bool LegalTypes,
+ bool LegalOperations) {
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // TODO Add support for big-endian when we have a test case.
+ if (!VT.isInteger() || IsBigEndian)
+ return std::nullopt;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+ // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
+ // power-of-2 extensions as they are the most likely.
+ // FIXME: should try the Scale == NumElts case too.
+ for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
+ // The vector width must be a multiple of Scale.
+ if (NumElts % Scale != 0)
+ continue;
+
+ EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
+
+ if ((LegalTypes && !TLI.isTypeLegal(OutVT)) ||
+ (LegalOperations && !TLI.isOperationLegalOrCustom(Opcode, OutVT)))
+ continue;
+
+ if (Match(Scale))
+ return OutVT;
+ }
+
+ return std::nullopt;
+}
+
+// Match shuffles that can be converted to any_vector_extend_in_reg.
+// This is often generated during legalization.
+// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
+static SDValue combineShuffleToAnyExtendVectorInreg(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalOperations) {
+ EVT VT = SVN->getValueType(0);
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // TODO Add support for big-endian when we have a test case.
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
+ auto isAnyExtend = [NumElts = VT.getVectorNumElements(),
+ Mask = SVN->getMask()](unsigned Scale) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
+ continue;
+ return false;
+ }
+ return true;
+ };
+
+ unsigned Opcode = ISD::ANY_EXTEND_VECTOR_INREG;
+ SDValue N0 = SVN->getOperand(0);
+ // Never create an illegal type. Only create unsupported operations if we
+ // are pre-legalization.
+ std::optional<EVT> OutVT = canCombineShuffleToExtendVectorInreg(
+ Opcode, VT, isAnyExtend, DAG, TLI, /*LegalTypes=*/true, LegalOperations);
+ if (!OutVT)
+ return SDValue();
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, SDLoc(SVN), *OutVT, N0));
+}
+
+// Match shuffles that can be converted to zero_extend_vector_inreg.
+// This is often generated during legalization.
+// e.g. v4i32 <0,z,1,u> -> (v2i64 zero_extend_vector_inreg(v4i32 src))
+static SDValue combineShuffleToZeroExtendVectorInReg(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalOperations) {
+ bool LegalTypes = true;
+ EVT VT = SVN->getValueType(0);
+ assert(!VT.isScalableVector() && "Encountered scalable shuffle?");
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+ // TODO: add support for big-endian when we have a test case.
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ SmallVector<int, 16> Mask(SVN->getMask().begin(), SVN->getMask().end());
+ auto ForEachDecomposedIndice = [NumElts, &Mask](auto Fn) {
+ for (int &Indice : Mask) {
+ if (Indice < 0)
+ continue;
+ int OpIdx = (unsigned)Indice < NumElts ? 0 : 1;
+ int OpEltIdx = (unsigned)Indice < NumElts ? Indice : Indice - NumElts;
+ Fn(Indice, OpIdx, OpEltIdx);
+ }
+ };
+
+ // Which elements of which operand does this shuffle demand?
+ std::array<APInt, 2> OpsDemandedElts;
+ for (APInt &OpDemandedElts : OpsDemandedElts)
+ OpDemandedElts = APInt::getZero(NumElts);
+ ForEachDecomposedIndice(
+ [&OpsDemandedElts](int &Indice, int OpIdx, int OpEltIdx) {
+ OpsDemandedElts[OpIdx].setBit(OpEltIdx);
+ });
+
+ // Element-wise(!), which of these demanded elements are known to be zero?
+ std::array<APInt, 2> OpsKnownZeroElts;
+ for (auto I : zip(SVN->ops(), OpsDemandedElts, OpsKnownZeroElts))
+ std::get<2>(I) =
+ DAG.computeVectorKnownZeroElements(std::get<0>(I), std::get<1>(I));
+
+ // Manifest zeroable element knowledge in the shuffle mask.
+ // NOTE: we don't have a 'zeroable' sentinel value in the generic DAG;
+ // this is a local invention, but it won't leak into the DAG.
+ // FIXME: should we not manifest them, but just check when matching?
+ bool HadZeroableElts = false;
+ ForEachDecomposedIndice([&OpsKnownZeroElts, &HadZeroableElts](
+ int &Indice, int OpIdx, int OpEltIdx) {
+ if (OpsKnownZeroElts[OpIdx][OpEltIdx]) {
+ Indice = -2; // Zeroable element.
+ HadZeroableElts = true;
+ }
+ });
+
+ // Don't proceed unless we've refined at least one zeroable mask index.
+ // If we didn't, then we are still trying to match the same shuffle mask
+ // we previously tried to match as ISD::ANY_EXTEND_VECTOR_INREG,
+ // and evidently failed. Proceeding will lead to endless combine loops.
+ if (!HadZeroableElts)
+ return SDValue();
+
+ // The shuffle may be more fine-grained than we want. Widen elements first.
+ // FIXME: should we do this before manifesting zeroable shuffle mask indices?
+ SmallVector<int, 16> ScaledMask;
+ getShuffleMaskWithWidestElts(Mask, ScaledMask);
+ assert(Mask.size() >= ScaledMask.size() &&
+ Mask.size() % ScaledMask.size() == 0 && "Unexpected mask widening.");
+ int Prescale = Mask.size() / ScaledMask.size();
+
+ NumElts = ScaledMask.size();
+ EltSizeInBits *= Prescale;
+
+ EVT PrescaledVT = EVT::getVectorVT(
+ *DAG.getContext(), EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits),
+ NumElts);
+
+ if (LegalTypes && !TLI.isTypeLegal(PrescaledVT) && TLI.isTypeLegal(VT))
+ return SDValue();
+
+ // For example,
+ // shuffle<0,z,1,-1> == (v2i64 zero_extend_vector_inreg(v4i32))
+ // But not shuffle<z,z,1,-1> and not shuffle<0,z,z,-1> ! (for same types)
+ auto isZeroExtend = [NumElts, &ScaledMask](unsigned Scale) {
+ assert(Scale >= 2 && Scale <= NumElts && NumElts % Scale == 0 &&
+ "Unexpected mask scaling factor.");
+ ArrayRef<int> Mask = ScaledMask;
+ for (unsigned SrcElt = 0, NumSrcElts = NumElts / Scale;
+ SrcElt != NumSrcElts; ++SrcElt) {
+ // Analyze the shuffle mask in Scale-sized chunks.
+ ArrayRef<int> MaskChunk = Mask.take_front(Scale);
+ assert(MaskChunk.size() == Scale && "Unexpected mask size.");
+ Mask = Mask.drop_front(MaskChunk.size());
+ // The first index in this chunk must be SrcElt, but not zero!
+ // FIXME: undef should be fine, but that results in a more-defined result.
+ if (int FirstIndice = MaskChunk[0]; (unsigned)FirstIndice != SrcElt)
+ return false;
+ // The rest of the indices in this chunk must be zeros.
+ // FIXME: undef should be fine, but that results in a more-defined result.
+ if (!all_of(MaskChunk.drop_front(1),
+ [](int Indice) { return Indice == -2; }))
+ return false;
+ }
+ assert(Mask.empty() && "Did not process the whole mask?");
+ return true;
+ };
+
+ unsigned Opcode = ISD::ZERO_EXTEND_VECTOR_INREG;
+ for (bool Commuted : {false, true}) {
+ SDValue Op = SVN->getOperand(!Commuted ? 0 : 1);
+ if (Commuted)
+ ShuffleVectorSDNode::commuteMask(ScaledMask);
+ std::optional<EVT> OutVT = canCombineShuffleToExtendVectorInreg(
+ Opcode, PrescaledVT, isZeroExtend, DAG, TLI, LegalTypes,
+ LegalOperations);
+ if (OutVT)
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, SDLoc(SVN), *OutVT,
+ DAG.getBitcast(PrescaledVT, Op)));
+ }
+ return SDValue();
+}
+
+// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
+// each source element of a large type into the lowest elements of a smaller
+// destination type. This is often generated during legalization.
+// If the source node itself was a '*_extend_vector_inreg' node then we should
+// then be able to remove it.
+static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG) {
+ EVT VT = SVN->getValueType(0);
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // TODO Add support for big-endian when we have a test case.
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ SDValue N0 = peekThroughBitcasts(SVN->getOperand(0));
+
+ unsigned Opcode = N0.getOpcode();
+ if (!ISD::isExtVecInRegOpcode(Opcode))
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ ArrayRef<int> Mask = SVN->getMask();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
+ unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();
+
+ if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
+ return SDValue();
+ unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;
+
+ // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
+ // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
+ // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
+ auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
+ continue;
+ return false;
+ }
+ return true;
+ };
+
+ // At the moment we just handle the case where we've truncated back to the
+ // same size as before the extension.
+ // TODO: handle more extension/truncation cases as they arise.
+ if (EltSizeInBits != ExtSrcSizeInBits)
+ return SDValue();
+
+ // We can remove *extend_vector_inreg only if the truncation happens at
+ // the same scale as the extension.
+ if (isTruncate(ExtScale))
+ return DAG.getBitcast(VT, N00);
+
+ return SDValue();
+}
+
+// Combine shuffles of splat-shuffles of the form:
+// shuffle (shuffle V, undef, splat-mask), undef, M
+// If splat-mask contains undef elements, we need to be careful about
+ // introducing undefs in the folded mask that are not the result of composing
+// the masks of the shuffles.
+static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
+ SelectionDAG &DAG) {
+ EVT VT = Shuf->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ if (!Shuf->getOperand(1).isUndef())
+ return SDValue();
+
+ // See if this unary non-splat shuffle actually *is* a splat shuffle in
+ // disguise, with all demanded elements being identical.
+ // FIXME: this can be done per-operand.
+ if (!Shuf->isSplat()) {
+ APInt DemandedElts(NumElts, 0);
+ for (int Idx : Shuf->getMask()) {
+ if (Idx < 0)
+ continue; // Ignore sentinel indices.
+ assert((unsigned)Idx < NumElts && "Out-of-bounds shuffle index?");
+ DemandedElts.setBit(Idx);
+ }
+ assert(DemandedElts.popcount() > 1 && "Is a splat shuffle already?");
+ APInt UndefElts;
+ if (DAG.isSplatValue(Shuf->getOperand(0), DemandedElts, UndefElts)) {
+ // Even if all demanded elements are splat, some of them could be undef.
+ // Which lowest demanded element is *not* known-undef?
+ std::optional<unsigned> MinNonUndefIdx;
+ for (int Idx : Shuf->getMask()) {
+ if (Idx < 0 || UndefElts[Idx])
+ continue; // Ignore sentinel indices, and undef elements.
+ MinNonUndefIdx = std::min<unsigned>(Idx, MinNonUndefIdx.value_or(~0U));
+ }
+ if (!MinNonUndefIdx)
+ return DAG.getUNDEF(VT); // All undef - result is undef.
+ assert(*MinNonUndefIdx < NumElts && "Expected valid element index.");
+ SmallVector<int, 8> SplatMask(Shuf->getMask().begin(),
+ Shuf->getMask().end());
+ for (int &Idx : SplatMask) {
+ if (Idx < 0)
+ continue; // Passthrough sentinel indices.
+ // Otherwise, just pick the lowest demanded non-undef element.
+ // Or sentinel undef, if we know we'd pick a known-undef element.
+ Idx = UndefElts[Idx] ? -1 : *MinNonUndefIdx;
+ }
+ assert(SplatMask != Shuf->getMask() && "Expected mask to change!");
+ return DAG.getVectorShuffle(VT, SDLoc(Shuf), Shuf->getOperand(0),
+ Shuf->getOperand(1), SplatMask);
+ }
+ }
+
+ // If the inner operand is a known splat with no undefs, just return that
+ // directly.
+ // TODO: Create DemandedElts mask from Shuf's mask.
+ // TODO: Allow undef elements and merge with the shuffle code below.
+ if (DAG.isSplatValue(Shuf->getOperand(0), /*AllowUndefs*/ false))
+ return Shuf->getOperand(0);
+
+ auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
+ if (!Splat || !Splat->isSplat())
+ return SDValue();
+
+ ArrayRef<int> ShufMask = Shuf->getMask();
+ ArrayRef<int> SplatMask = Splat->getMask();
+ assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch");
+
+ // Prefer simplifying to the splat-shuffle, if possible. This is legal if
+ // every undef mask element in the splat-shuffle has a corresponding undef
+ // element in the user-shuffle's mask or if the composition of mask elements
+ // would result in undef.
+ // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
+ // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
+ // In this case it is not legal to simplify to the splat-shuffle because we
+ // may be exposing to the users of the shuffle an undef element at index 1
+ // that was not there before the combine.
+ // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
+ // In this case the composition of masks yields SplatMask, so it's ok to
+ // simplify to the splat-shuffle.
+ // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
+ // In this case the composed mask includes all undef elements of SplatMask
+ // and in addition sets element zero to undef. It is safe to simplify to
+ // the splat-shuffle.
+ auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
+ ArrayRef<int> SplatMask) {
+ for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
+ if (UserMask[i] != -1 && SplatMask[i] == -1 &&
+ SplatMask[UserMask[i]] != -1)
+ return false;
+ return true;
+ };
+ if (CanSimplifyToExistingSplat(ShufMask, SplatMask))
+ return Shuf->getOperand(0);
+
+ // Create a new shuffle with a mask that is composed of the two shuffles'
+ // masks.
+ SmallVector<int, 32> NewMask;
+ for (int Idx : ShufMask)
+ NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
+
+ return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
+ Splat->getOperand(0), Splat->getOperand(1),
+ NewMask);
+}
+
+// Combine shuffles of bitcasts into a shuffle of the bitcast type, providing
+// the mask can be treated as a larger type.
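+//
+// For example (illustrative, little-endian, with X and Y of type v4i32):
+//   v8i16 shuffle(bitcast X, bitcast Y, <0,1,8,9,2,3,10,11>)
+//     --> bitcast(v4i32 shuffle(X, Y, <0,4,1,5>))
+// because each pair of v8i16 lanes copies a whole v4i32 lane.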
+static SDValue combineShuffleOfBitcast(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalOperations) {
+ SDValue Op0 = SVN->getOperand(0);
+ SDValue Op1 = SVN->getOperand(1);
+ EVT VT = SVN->getValueType(0);
+ if (Op0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ EVT InVT = Op0.getOperand(0).getValueType();
+ if (!InVT.isVector() ||
+ (!Op1.isUndef() && (Op1.getOpcode() != ISD::BITCAST ||
+ Op1.getOperand(0).getValueType() != InVT)))
+ return SDValue();
+ if (isAnyConstantBuildVector(Op0.getOperand(0)) &&
+ (Op1.isUndef() || isAnyConstantBuildVector(Op1.getOperand(0))))
+ return SDValue();
+
+ int VTLanes = VT.getVectorNumElements();
+ int InLanes = InVT.getVectorNumElements();
+ if (VTLanes <= InLanes || VTLanes % InLanes != 0 ||
+ (LegalOperations &&
+ !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, InVT)))
+ return SDValue();
+ int Factor = VTLanes / InLanes;
+
+ // Check that each group of lanes in the mask is either undef or makes a
+ // valid mask for the wider lane type.
+ ArrayRef<int> Mask = SVN->getMask();
+ SmallVector<int> NewMask;
+ if (!widenShuffleMaskElts(Factor, Mask, NewMask))
+ return SDValue();
+
+ if (!TLI.isShuffleMaskLegal(NewMask, InVT))
+ return SDValue();
+
+ // Create the new shuffle with the new mask and bitcast it back to the
+ // original type.
+ SDLoc DL(SVN);
+ Op0 = Op0.getOperand(0);
+ Op1 = Op1.isUndef() ? DAG.getUNDEF(InVT) : Op1.getOperand(0);
+ SDValue NewShuf = DAG.getVectorShuffle(InVT, DL, Op0, Op1, NewMask);
+ return DAG.getBitcast(VT, NewShuf);
+}
+
+/// Combine shuffle of shuffle of the form:
+/// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X
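+///
+/// For example (illustrative):
+///   shuf (shuf X, undef, <3,1,3,0>), undef, <0,2,u,2>
+///     --> shuf X, undef, <3,3,u,3>, i.e. a splat of X[3].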
+static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf,
+ SelectionDAG &DAG) {
+ if (!OuterShuf->getOperand(1).isUndef())
+ return SDValue();
+ auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0));
+ if (!InnerShuf || !InnerShuf->getOperand(1).isUndef())
+ return SDValue();
+
+ ArrayRef<int> OuterMask = OuterShuf->getMask();
+ ArrayRef<int> InnerMask = InnerShuf->getMask();
+ unsigned NumElts = OuterMask.size();
+ assert(NumElts == InnerMask.size() && "Mask length mismatch");
+ SmallVector<int, 32> CombinedMask(NumElts, -1);
+ int SplatIndex = -1;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ // Undef lanes remain undef.
+ int OuterMaskElt = OuterMask[i];
+ if (OuterMaskElt == -1)
+ continue;
+
+ // Peek through the shuffle masks to get the underlying source element.
+ int InnerMaskElt = InnerMask[OuterMaskElt];
+ if (InnerMaskElt == -1)
+ continue;
+
+ // Initialize the splatted element.
+ if (SplatIndex == -1)
+ SplatIndex = InnerMaskElt;
+
+ // Non-matching index - this is not a splat.
+ if (SplatIndex != InnerMaskElt)
+ return SDValue();
+
+ CombinedMask[i] = InnerMaskElt;
+ }
+ assert((all_of(CombinedMask, [](int M) { return M == -1; }) ||
+ getSplatIndex(CombinedMask) != -1) &&
+ "Expected a splat mask");
+
+ // TODO: The transform may be a win even if the mask is not legal.
+ EVT VT = OuterShuf->getValueType(0);
+ assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types");
+ if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT))
+ return SDValue();
+
+ return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0),
+ InnerShuf->getOperand(1), CombinedMask);
+}
+
+/// If the shuffle mask is taking exactly one element from the first vector
+/// operand and passing through all other elements from the second vector
+/// operand, return the index of the mask element that is choosing an element
+/// from the first operand. Otherwise, return -1.
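+///
+/// For example (illustrative), with two 4-element operands:
+///   Mask <4,5,2,7> returns 2: lane 2 takes element 2 of operand 0 and all
+///   other lanes pass through the corresponding element of operand 1.
+///   Mask <4,1,2,7> returns -1 because two lanes come from operand 0.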
+static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
+ int MaskSize = Mask.size();
+ int EltFromOp0 = -1;
+ // TODO: This does not match if there are undef elements in the shuffle mask.
+ // Should we ignore undefs in the shuffle mask instead? The trade-off is
+ // removing an instruction (a shuffle), but losing the knowledge that some
+ // vector lanes are not needed.
+ for (int i = 0; i != MaskSize; ++i) {
+ if (Mask[i] >= 0 && Mask[i] < MaskSize) {
+ // We're looking for a shuffle of exactly one element from operand 0.
+ if (EltFromOp0 != -1)
+ return -1;
+ EltFromOp0 = i;
+ } else if (Mask[i] != i + MaskSize) {
+ // Nothing from operand 1 can change lanes.
+ return -1;
+ }
+ }
+ return EltFromOp0;
+}
+
+/// If a shuffle inserts exactly one element from a source vector operand into
+/// another vector operand and we can access the specified element as a scalar,
+/// then we can eliminate the shuffle.
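+///
+/// For example (illustrative):
+///   shuffle (insertelt v1, x, 2), v2, <4,5,2,7> --> insertelt v2, x, 2
+/// because only lane 2 of the result comes from operand 0, and that lane
+/// holds the freshly inserted scalar x.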
+static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
+ SelectionDAG &DAG) {
+ // First, check if we are taking one element of a vector and shuffling that
+ // element into another vector.
+ ArrayRef<int> Mask = Shuf->getMask();
+ SmallVector<int, 16> CommutedMask(Mask);
+ SDValue Op0 = Shuf->getOperand(0);
+ SDValue Op1 = Shuf->getOperand(1);
+ int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
+ if (ShufOp0Index == -1) {
+ // Commute mask and check again.
+ ShuffleVectorSDNode::commuteMask(CommutedMask);
+ ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
+ if (ShufOp0Index == -1)
+ return SDValue();
+ // Commute operands to match the commuted shuffle mask.
+ std::swap(Op0, Op1);
+ Mask = CommutedMask;
+ }
+
+ // The shuffle inserts exactly one element from operand 0 into operand 1.
+ // Now see if we can access that element as a scalar via a real insert element
+ // instruction.
+ // TODO: We can try harder to locate the element as a scalar. Examples: it
+ // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
+ assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
+ "Shuffle mask value must be from operand 0");
+ if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ return SDValue();
+
+ auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
+ if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
+ return SDValue();
+
+ // There's an existing insertelement with constant insertion index, so we
+ // don't need to check the legality/profitability of a replacement operation
+ // that differs at most in the constant value. The target should be able to
+ // lower any of those in a similar way. If not, legalization will expand this
+ // to a scalar-to-vector plus shuffle.
+ //
+ // Note that the shuffle may move the scalar from the position that the insert
+ // element used. Therefore, our new insert element occurs at the shuffle's
+ // mask index value, not the insert's index value.
+ // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
+ SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
+ Op1, Op0.getOperand(1), NewInsIndex);
+}
+
+/// If we have a unary shuffle of a shuffle, see if it can be folded away
+/// completely. This has the potential to lose undef knowledge because the first
+/// shuffle may not have an undef mask element where the second one does. So
+/// only call this after doing simplifications based on demanded elements.
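+///
+/// For example (illustrative):
+///   shuf (shuf0 X, Y, <5,5,7,7>), undef, <1,1,3,3> --> shuf0 X, Y, <5,5,7,7>
+/// because every lane of the outer shuffle already matches the corresponding
+/// lane of the inner shuffle's result.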
+static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
+ // shuf (shuf0 X, Y, Mask0), undef, Mask
+ auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
+ if (!Shuf0 || !Shuf->getOperand(1).isUndef())
+ return SDValue();
+
+ ArrayRef<int> Mask = Shuf->getMask();
+ ArrayRef<int> Mask0 = Shuf0->getMask();
+ for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
+ // Ignore undef elements.
+ if (Mask[i] == -1)
+ continue;
+ assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value");
+
+ // Is the element of the shuffle operand chosen by this shuffle the same as
+ // the element chosen by the shuffle operand itself?
+ if (Mask0[Mask[i]] != Mask0[i])
+ return SDValue();
+ }
+ // Every element of this shuffle is identical to the result of the previous
+ // shuffle, so we can replace this value.
+ return Shuf->getOperand(0);
+}
+
+SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
+
+ // Canonicalize shuffle undef, undef -> undef
+ if (N0.isUndef() && N1.isUndef())
+ return DAG.getUNDEF(VT);
+
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+
+ // Canonicalize shuffle v, v -> v, undef
+ if (N0 == N1)
+ return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT),
+ createUnaryMask(SVN->getMask(), NumElts));
+
+ // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
+ if (N0.isUndef())
+ return DAG.getCommutedVectorShuffle(*SVN);
+
+ // Remove references to rhs if it is undef
+ if (N1.isUndef()) {
+ bool Changed = false;
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (Idx >= (int)NumElts) {
+ Idx = -1;
+ Changed = true;
+ }
+ NewMask.push_back(Idx);
+ }
+ if (Changed)
+ return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
+ }
+
+ if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
+ return InsElt;
+
+ // A shuffle of a single vector that is a splatted value can always be folded.
+ if (SDValue V = combineShuffleOfSplatVal(SVN, DAG))
+ return V;
+
+ if (SDValue V = formSplatFromShuffles(SVN, DAG))
+ return V;
+
+ // If it is a splat, check if the argument vector is another splat or a
+ // build_vector.
+ if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
+ int SplatIndex = SVN->getSplatIndex();
+ if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) &&
+ TLI.isBinOp(N0.getOpcode()) && N0->getNumValues() == 1) {
+ // splat (vector_bo L, R), Index -->
+ // splat (scalar_bo (extelt L, Index), (extelt R, Index))
+ SDValue L = N0.getOperand(0), R = N0.getOperand(1);
+ SDLoc DL(N);
+ EVT EltVT = VT.getScalarType();
+ SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL);
+ SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
+ SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
+ SDValue NewBO =
+ DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, N0->getFlags());
+ SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO);
+ SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
+ return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
+ }
+
+ // splat(scalar_to_vector(x), 0) -> build_vector(x,...,x)
+ // splat(insert_vector_elt(v, x, c), c) -> build_vector(x,...,x)
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) &&
+ N0.hasOneUse()) {
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && SplatIndex == 0)
+ return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(0));
+
+ if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT)
+ if (auto *Idx = dyn_cast<ConstantSDNode>(N0.getOperand(2)))
+ if (Idx->getAPIntValue() == SplatIndex)
+ return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(1));
+
+ // Look through a bitcast if LE and splatting lane 0, through to a
+ // scalar_to_vector or a build_vector.
+ if (N0.getOpcode() == ISD::BITCAST && N0.getOperand(0).hasOneUse() &&
+ SplatIndex == 0 && DAG.getDataLayout().isLittleEndian() &&
+ (N0.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR ||
+ N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR)) {
+ EVT N00VT = N0.getOperand(0).getValueType();
+ if (VT.getScalarSizeInBits() <= N00VT.getScalarSizeInBits() &&
+ VT.isInteger() && N00VT.isInteger()) {
+ EVT InVT =
+ TLI.getTypeToTransformTo(*DAG.getContext(), VT.getScalarType());
+ SDValue Op = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0),
+ SDLoc(N), InVT);
+ return DAG.getSplatBuildVector(VT, SDLoc(N), Op);
+ }
+ }
+ }
+
+ // If this is a bit convert that changes the element type of the vector but
+ // not the number of vector elements, look through it. Be careful not to
+ // look through conversions that change things like v4f32 to v2f64.
+ SDNode *V = N0.getNode();
+ if (V->getOpcode() == ISD::BITCAST) {
+ SDValue ConvInput = V->getOperand(0);
+ if (ConvInput.getValueType().isVector() &&
+ ConvInput.getValueType().getVectorNumElements() == NumElts)
+ V = ConvInput.getNode();
+ }
+
+ if (V->getOpcode() == ISD::BUILD_VECTOR) {
+ assert(V->getNumOperands() == NumElts &&
+ "BUILD_VECTOR has wrong number of operands");
+ SDValue Base;
+ bool AllSame = true;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!V->getOperand(i).isUndef()) {
+ Base = V->getOperand(i);
+ break;
+ }
+ }
+ // Splat of <u, u, u, u>, return <u, u, u, u>
+ if (!Base.getNode())
+ return N0;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (V->getOperand(i) != Base) {
+ AllSame = false;
+ break;
+ }
+ }
+ // Splat of <x, x, x, x>, return <x, x, x, x>
+ if (AllSame)
+ return N0;
+
+ // Canonicalize any other splat as a build_vector.
+ SDValue Splatted = V->getOperand(SplatIndex);
+ SmallVector<SDValue, 8> Ops(NumElts, Splatted);
+ SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
+
+ // We may have jumped through bitcasts, so the type of the
+ // BUILD_VECTOR may not match the type of the shuffle.
+ if (V->getValueType(0) != VT)
+ NewBV = DAG.getBitcast(VT, NewBV);
+ return NewBV;
+ }
+ }
+
+ // Simplify source operands based on shuffle mask.
+ if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // This is intentionally placed after demanded elements simplification because
+ // it could eliminate knowledge of undef elements created by this shuffle.
+ if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
+ return ShufOp;
+
+ // Match shuffles that can be converted to any_vector_extend_in_reg.
+ if (SDValue V =
+ combineShuffleToAnyExtendVectorInreg(SVN, DAG, TLI, LegalOperations))
+ return V;
+
+ // Combine "truncate_vector_in_reg" style shuffles.
+ if (SDValue V = combineTruncationShuffle(SVN, DAG))
+ return V;
+
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
+ Level < AfterLegalizeVectorOps &&
+ (N1.isUndef() ||
+ (N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
+ if (SDValue V = partitionShuffleOfConcats(N, DAG))
+ return V;
+ }
+
+ // A shuffle of a concat of the same narrow vector can be reduced to use
+ // only low-half elements of a concat with undef:
+ // shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask'
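+ // For example (illustrative, with X of type v2i32):
+ //   shuf (concat X, X), undef, <3,1,2,0>
+ //     --> shuf (concat X, undef), undef, <1,1,0,0>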
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() &&
+ N0.getNumOperands() == 2 &&
+ N0.getOperand(0) == N0.getOperand(1)) {
+ int HalfNumElts = (int)NumElts / 2;
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (Idx >= HalfNumElts) {
+ assert(Idx < (int)NumElts && "Shuffle mask chooses undef op");
+ Idx -= HalfNumElts;
+ }
+ NewMask.push_back(Idx);
+ }
+ if (TLI.isShuffleMaskLegal(NewMask, VT)) {
+ SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType());
+ SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
+ N0.getOperand(0), UndefVec);
+ return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask);
+ }
+ }
+
+ // See if we can replace a shuffle with an insert_subvector.
+ // e.g. v2i32 into v8i32:
+ // shuffle(lhs,concat(rhs0,rhs1,rhs2,rhs3),0,1,2,3,10,11,6,7).
+ // --> insert_subvector(lhs,rhs1,4).
+ if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT) &&
+ TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT)) {
+ auto ShuffleToInsert = [&](SDValue LHS, SDValue RHS, ArrayRef<int> Mask) {
+ // Ensure RHS subvectors are legal.
+ assert(RHS.getOpcode() == ISD::CONCAT_VECTORS && "Can't find subvectors");
+ EVT SubVT = RHS.getOperand(0).getValueType();
+ int NumSubVecs = RHS.getNumOperands();
+ int NumSubElts = SubVT.getVectorNumElements();
+ assert((NumElts % NumSubElts) == 0 && "Subvector mismatch");
+ if (!TLI.isTypeLegal(SubVT))
+ return SDValue();
+
+ // Don't bother if we have a unary shuffle (matches undef + LHS elts).
+ if (all_of(Mask, [NumElts](int M) { return M < (int)NumElts; }))
+ return SDValue();
+
+ // Search [NumSubElts] spans for RHS sequence.
+ // TODO: Can we avoid nested loops to increase performance?
+ SmallVector<int> InsertionMask(NumElts);
+ for (int SubVec = 0; SubVec != NumSubVecs; ++SubVec) {
+ for (int SubIdx = 0; SubIdx != (int)NumElts; SubIdx += NumSubElts) {
+ // Reset mask to identity.
+ std::iota(InsertionMask.begin(), InsertionMask.end(), 0);
+
+ // Add subvector insertion.
+ std::iota(InsertionMask.begin() + SubIdx,
+ InsertionMask.begin() + SubIdx + NumSubElts,
+ NumElts + (SubVec * NumSubElts));
+
+ // See if the shuffle mask matches the reference insertion mask.
+ bool MatchingShuffle = true;
+ for (int i = 0; i != (int)NumElts; ++i) {
+ int ExpectIdx = InsertionMask[i];
+ int ActualIdx = Mask[i];
+ if (0 <= ActualIdx && ExpectIdx != ActualIdx) {
+ MatchingShuffle = false;
+ break;
+ }
+ }
+
+ if (MatchingShuffle)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, LHS,
+ RHS.getOperand(SubVec),
+ DAG.getVectorIdxConstant(SubIdx, SDLoc(N)));
+ }
+ }
+ return SDValue();
+ };
+ ArrayRef<int> Mask = SVN->getMask();
+ if (N1.getOpcode() == ISD::CONCAT_VECTORS)
+ if (SDValue InsertN1 = ShuffleToInsert(N0, N1, Mask))
+ return InsertN1;
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
+ SmallVector<int> CommuteMask(Mask);
+ ShuffleVectorSDNode::commuteMask(CommuteMask);
+ if (SDValue InsertN0 = ShuffleToInsert(N1, N0, CommuteMask))
+ return InsertN0;
+ }
+ }
+
+ // If we're not performing a select/blend shuffle, see if we can convert the
+ // shuffle into an AND node, where all the out-of-lane elements are known zero.
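+ // For example (illustrative), if the demanded elements of N1 are known to
+ // be zero:
+ //   v4i32 shuffle(X, N1, <0,6,2,4>) --> and(X, <-1,0,-1,0>)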
+ if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
+ bool IsInLaneMask = true;
+ ArrayRef<int> Mask = SVN->getMask();
+ SmallVector<int, 16> ClearMask(NumElts, -1);
+ APInt DemandedLHS = APInt::getZero(NumElts);
+ APInt DemandedRHS = APInt::getZero(NumElts);
+ for (int I = 0; I != (int)NumElts; ++I) {
+ int M = Mask[I];
+ if (M < 0)
+ continue;
+ ClearMask[I] = M == I ? I : (I + NumElts);
+ IsInLaneMask &= (M == I) || (M == (int)(I + NumElts));
+ if (M != I) {
+ APInt &Demanded = M < (int)NumElts ? DemandedLHS : DemandedRHS;
+ Demanded.setBit(M % NumElts);
+ }
+ }
+ // TODO: Should we try to mask with N1 as well?
+ if (!IsInLaneMask && (!DemandedLHS.isZero() || !DemandedRHS.isZero()) &&
+ (DemandedLHS.isZero() || DAG.MaskedVectorIsZero(N0, DemandedLHS)) &&
+ (DemandedRHS.isZero() || DAG.MaskedVectorIsZero(N1, DemandedRHS))) {
+ SDLoc DL(N);
+ EVT IntVT = VT.changeVectorElementTypeToInteger();
+ EVT IntSVT = VT.getVectorElementType().changeTypeToInteger();
+ // Transform the type to a legal type so that the buildvector constant
+ // elements are not illegal. Make sure that the result is larger than the
+ // original type, in case the value is split into two (e.g. i64->i32).
+ if (!TLI.isTypeLegal(IntSVT) && LegalTypes)
+ IntSVT = TLI.getTypeToTransformTo(*DAG.getContext(), IntSVT);
+ if (IntSVT.getSizeInBits() >= IntVT.getScalarSizeInBits()) {
+ SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT);
+ SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT);
+ SmallVector<SDValue, 16> AndMask(NumElts, DAG.getUNDEF(IntSVT));
+ for (int I = 0; I != (int)NumElts; ++I)
+ if (0 <= Mask[I])
+ AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt;
+
+ // See if a clear mask is legal instead of going via
+ // XformToShuffleWithZero which loses UNDEF mask elements.
+ if (TLI.isVectorClearMaskLegal(ClearMask, IntVT))
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0),
+ DAG.getConstant(0, DL, IntVT), ClearMask));
+
+ if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT))
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0),
+ DAG.getBuildVector(IntVT, DL, AndMask)));
+ }
+ }
+ }
+
+ // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
+ // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
+ if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
+ if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
+ return Res;
+
+ // If this shuffle only has a single input that is a bitcasted shuffle,
+ // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
+ // back to their original types.
+ if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
+ N1.isUndef() && Level < AfterLegalizeVectorOps &&
+ TLI.isTypeLegal(VT)) {
+
+ SDValue BC0 = peekThroughOneUseBitcasts(N0);
+ if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
+ EVT SVT = VT.getScalarType();
+ EVT InnerVT = BC0->getValueType(0);
+ EVT InnerSVT = InnerVT.getScalarType();
+
+ // Determine which shuffle works with the smaller scalar type.
+ EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
+ EVT ScaleSVT = ScaleVT.getScalarType();
+
+ if (TLI.isTypeLegal(ScaleVT) &&
+ 0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
+ 0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
+ int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+ int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+
+ // Scale the shuffle masks to the smaller scalar type.
+ ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
+ SmallVector<int, 8> InnerMask;
+ SmallVector<int, 8> OuterMask;
+ narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask);
+ narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask);
+
+ // Merge the shuffle masks.
+ SmallVector<int, 8> NewMask;
+ for (int M : OuterMask)
+ NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
+
+ // Test for shuffle mask legality over both commutations.
+ SDValue SV0 = BC0->getOperand(0);
+ SDValue SV1 = BC0->getOperand(1);
+ bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+ if (!LegalMask) {
+ std::swap(SV0, SV1);
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+ }
+
+ if (LegalMask) {
+ SV0 = DAG.getBitcast(ScaleVT, SV0);
+ SV1 = DAG.getBitcast(ScaleVT, SV1);
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
+ }
+ }
+ }
+ }
+
+ // Match shuffles of bitcasts, so long as the mask can be treated as the
+ // larger type.
+ if (SDValue V = combineShuffleOfBitcast(SVN, DAG, TLI, LegalOperations))
+ return V;
+
+ // Compute the combined shuffle mask for a shuffle with SV0 as the first
+ // operand, and SV1 as the second operand.
+ // i.e. Merge SVN(OtherSVN, N1) -> shuffle(SV0, SV1, Mask) iff Commute = false
+ // Merge SVN(N1, OtherSVN) -> shuffle(SV0, SV1, Mask') iff Commute = true
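+ // For example (illustrative, Commute = false):
+ //   SVN = shuffle(OtherSVN, C, <0,2,4,6>),
+ //   OtherSVN = shuffle(A, B, <0,4,1,5>)
+ //     --> SV0 = A, SV1 = C, Mask = <0,1,4,6>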
+ auto MergeInnerShuffle =
+ [NumElts, &VT](bool Commute, ShuffleVectorSDNode *SVN,
+ ShuffleVectorSDNode *OtherSVN, SDValue N1,
+ const TargetLowering &TLI, SDValue &SV0, SDValue &SV1,
+ SmallVectorImpl<int> &Mask) -> bool {
+ // Don't try to fold splats; they're likely to simplify somehow, or they
+ // might be free.
+ if (OtherSVN->isSplat())
+ return false;
+
+ SV0 = SV1 = SDValue();
+ Mask.clear();
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (Idx < 0) {
+ // Propagate Undef.
+ Mask.push_back(Idx);
+ continue;
+ }
+
+ if (Commute)
+ Idx = (Idx < (int)NumElts) ? (Idx + NumElts) : (Idx - NumElts);
+
+ SDValue CurrentVec;
+ if (Idx < (int)NumElts) {
+ // This shuffle index refers to the inner shuffle N0. Lookup the inner
+ // shuffle mask to identify which vector is actually referenced.
+ Idx = OtherSVN->getMaskElt(Idx);
+ if (Idx < 0) {
+ // Propagate Undef.
+ Mask.push_back(Idx);
+ continue;
+ }
+ CurrentVec = (Idx < (int)NumElts) ? OtherSVN->getOperand(0)
+ : OtherSVN->getOperand(1);
+ } else {
+ // This shuffle index references an element within N1.
+ CurrentVec = N1;
+ }
+
+ // Simple case where 'CurrentVec' is UNDEF.
+ if (CurrentVec.isUndef()) {
+ Mask.push_back(-1);
+ continue;
+ }
+
+ // Canonicalize the shuffle index. We don't know yet if CurrentVec
+ // will be the first or second operand of the combined shuffle.
+ Idx = Idx % NumElts;
+ if (!SV0.getNode() || SV0 == CurrentVec) {
+ // Ok. CurrentVec is the left hand side.
+ // Update the mask accordingly.
+ SV0 = CurrentVec;
+ Mask.push_back(Idx);
+ continue;
+ }
+ if (!SV1.getNode() || SV1 == CurrentVec) {
+ // Ok. CurrentVec is the right hand side.
+ // Update the mask accordingly.
+ SV1 = CurrentVec;
+ Mask.push_back(Idx + NumElts);
+ continue;
+ }
+
+ // Last chance - see if the vector is another shuffle and if it
+ // uses one of the existing candidate shuffle ops.
+ if (auto *CurrentSVN = dyn_cast<ShuffleVectorSDNode>(CurrentVec)) {
+ int InnerIdx = CurrentSVN->getMaskElt(Idx);
+ if (InnerIdx < 0) {
+ Mask.push_back(-1);
+ continue;
+ }
+ SDValue InnerVec = (InnerIdx < (int)NumElts)
+ ? CurrentSVN->getOperand(0)
+ : CurrentSVN->getOperand(1);
+ if (InnerVec.isUndef()) {
+ Mask.push_back(-1);
+ continue;
+ }
+ InnerIdx %= NumElts;
+ if (InnerVec == SV0) {
+ Mask.push_back(InnerIdx);
+ continue;
+ }
+ if (InnerVec == SV1) {
+ Mask.push_back(InnerIdx + NumElts);
+ continue;
+ }
+ }
+
+ // Bail out if we cannot convert the shuffle pair into a single shuffle.
+ return false;
+ }
+
+ if (llvm::all_of(Mask, [](int M) { return M < 0; }))
+ return true;
+
+ // Avoid introducing shuffles with illegal mask.
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
+ if (TLI.isShuffleMaskLegal(Mask, VT))
+ return true;
+
+ std::swap(SV0, SV1);
+ ShuffleVectorSDNode::commuteMask(Mask);
+ return TLI.isShuffleMaskLegal(Mask, VT);
+ };
+
+ if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
+ // Canonicalize shuffles according to rules:
+ // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
+ // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
+ // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
+ if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N0.getOpcode() != ISD::VECTOR_SHUFFLE) {
+ // The incoming shuffle must be of the same type as the result of the
+ // current shuffle.
+ assert(N1->getOperand(0).getValueType() == VT &&
+ "Shuffle types don't match");
+
+ SDValue SV0 = N1->getOperand(0);
+ SDValue SV1 = N1->getOperand(1);
+ bool HasSameOp0 = N0 == SV0;
+ bool IsSV1Undef = SV1.isUndef();
+ if (HasSameOp0 || IsSV1Undef || N0 == SV1)
+ // Commute the operands of this shuffle so merging below will trigger.
+ return DAG.getCommutedVectorShuffle(*SVN);
+ }
+
+ // Canonicalize splat shuffles to the RHS to improve merging below.
+ // shuffle(splat(A,u), shuffle(C,D)) -> shuffle'(shuffle(C,D), splat(A,u))
+ if (N0.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ cast<ShuffleVectorSDNode>(N0)->isSplat() &&
+ !cast<ShuffleVectorSDNode>(N1)->isSplat()) {
+ return DAG.getCommutedVectorShuffle(*SVN);
+ }
+
+ // Try to fold according to rules:
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
+ // Don't try to fold shuffles with illegal type.
+ // Only fold if this shuffle is the only user of the other shuffle.
+ // Try matching shuffle(C,shuffle(A,B)) commuted patterns as well.
+ for (int i = 0; i != 2; ++i) {
+ if (N->getOperand(i).getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N->isOnlyUserOf(N->getOperand(i).getNode())) {
+ // The incoming shuffle must be of the same type as the result of the
+ // current shuffle.
+ auto *OtherSV = cast<ShuffleVectorSDNode>(N->getOperand(i));
+ assert(OtherSV->getOperand(0).getValueType() == VT &&
+ "Shuffle types don't match");
+
+ SDValue SV0, SV1;
+ SmallVector<int, 4> Mask;
+ if (MergeInnerShuffle(i != 0, SVN, OtherSV, N->getOperand(1 - i), TLI,
+ SV0, SV1, Mask)) {
+ // Check if all indices in Mask are Undef. If so, propagate Undef.
+ if (llvm::all_of(Mask, [](int M) { return M < 0; }))
+ return DAG.getUNDEF(VT);
+
+ return DAG.getVectorShuffle(VT, SDLoc(N),
+ SV0 ? SV0 : DAG.getUNDEF(VT),
+ SV1 ? SV1 : DAG.getUNDEF(VT), Mask);
+ }
+ }
+ }
+
+ // Merge shuffles through binops if we are able to merge them with at least
+ // one other shuffle.
+ // shuffle(bop(shuffle(x,y),shuffle(z,w)),undef)
+ // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
+ unsigned SrcOpcode = N0.getOpcode();
+ if (TLI.isBinOp(SrcOpcode) && N->isOnlyUserOf(N0.getNode()) &&
+ (N1.isUndef() ||
+ (SrcOpcode == N1.getOpcode() && N->isOnlyUserOf(N1.getNode())))) {
+ // Get binop source ops, or just pass on the undef.
+ SDValue Op00 = N0.getOperand(0);
+ SDValue Op01 = N0.getOperand(1);
+ SDValue Op10 = N1.isUndef() ? N1 : N1.getOperand(0);
+ SDValue Op11 = N1.isUndef() ? N1 : N1.getOperand(1);
+ // TODO: We might be able to relax the VT check but we don't currently
+ // have any isBinOp() that has different result/ops VTs so play safe until
+ // we have test coverage.
+ if (Op00.getValueType() == VT && Op10.getValueType() == VT &&
+ Op01.getValueType() == VT && Op11.getValueType() == VT &&
+ (Op00.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ Op10.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ Op01.getOpcode() == ISD::VECTOR_SHUFFLE ||
+ Op11.getOpcode() == ISD::VECTOR_SHUFFLE)) {
+ auto CanMergeInnerShuffle = [&](SDValue &SV0, SDValue &SV1,
+ SmallVectorImpl<int> &Mask, bool LeftOp,
+ bool Commute) {
+ SDValue InnerN = Commute ? N1 : N0;
+ SDValue Op0 = LeftOp ? Op00 : Op01;
+ SDValue Op1 = LeftOp ? Op10 : Op11;
+ if (Commute)
+ std::swap(Op0, Op1);
+ // Only accept the merged shuffle if we don't introduce undef elements,
+ // or the inner shuffle already contained undef elements.
+ auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(Op0);
+ return SVN0 && InnerN->isOnlyUserOf(SVN0) &&
+ MergeInnerShuffle(Commute, SVN, SVN0, Op1, TLI, SV0, SV1,
+ Mask) &&
+ (llvm::any_of(SVN0->getMask(), [](int M) { return M < 0; }) ||
+ llvm::none_of(Mask, [](int M) { return M < 0; }));
+ };
+
+ // Ensure we don't increase the number of shuffles - we must merge a
+ // shuffle from at least one of the LHS and RHS ops.
+ bool MergedLeft = false;
+ SDValue LeftSV0, LeftSV1;
+ SmallVector<int, 4> LeftMask;
+ if (CanMergeInnerShuffle(LeftSV0, LeftSV1, LeftMask, true, false) ||
+ CanMergeInnerShuffle(LeftSV0, LeftSV1, LeftMask, true, true)) {
+ MergedLeft = true;
+ } else {
+ LeftMask.assign(SVN->getMask().begin(), SVN->getMask().end());
+ LeftSV0 = Op00, LeftSV1 = Op10;
+ }
+
+ bool MergedRight = false;
+ SDValue RightSV0, RightSV1;
+ SmallVector<int, 4> RightMask;
+ if (CanMergeInnerShuffle(RightSV0, RightSV1, RightMask, false, false) ||
+ CanMergeInnerShuffle(RightSV0, RightSV1, RightMask, false, true)) {
+ MergedRight = true;
+ } else {
+ RightMask.assign(SVN->getMask().begin(), SVN->getMask().end());
+ RightSV0 = Op01, RightSV1 = Op11;
+ }
+
+ if (MergedLeft || MergedRight) {
+ SDLoc DL(N);
+ SDValue LHS = DAG.getVectorShuffle(
+ VT, DL, LeftSV0 ? LeftSV0 : DAG.getUNDEF(VT),
+ LeftSV1 ? LeftSV1 : DAG.getUNDEF(VT), LeftMask);
+ SDValue RHS = DAG.getVectorShuffle(
+ VT, DL, RightSV0 ? RightSV0 : DAG.getUNDEF(VT),
+ RightSV1 ? RightSV1 : DAG.getUNDEF(VT), RightMask);
+ return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
+ }
+ }
+ }
+ }
+
+ if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
+ return V;
+
+ // Match shuffles that can be converted to ISD::ZERO_EXTEND_VECTOR_INREG.
+ // Perform this really late, because it could eliminate knowledge
+ // of undef elements created by this shuffle.
+ if (Level < AfterLegalizeTypes)
+ if (SDValue V = combineShuffleToZeroExtendVectorInReg(SVN, DAG, TLI,
+ LegalOperations))
+ return V;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isFixedLengthVector())
+ return SDValue();
+
+ // Try to convert a scalar binop with an extracted vector element to a vector
+ // binop. This is intended to reduce potentially expensive register moves.
+ // TODO: Check if both operands are extracted.
+ // TODO: Generalize this, so it can be called from visitINSERT_VECTOR_ELT().
+ SDValue Scalar = N->getOperand(0);
+ unsigned Opcode = Scalar.getOpcode();
+ EVT VecEltVT = VT.getScalarType();
+ if (Scalar.hasOneUse() && Scalar->getNumValues() == 1 &&
+ TLI.isBinOp(Opcode) && Scalar.getValueType() == VecEltVT &&
+ Scalar.getOperand(0).getValueType() == VecEltVT &&
+ Scalar.getOperand(1).getValueType() == VecEltVT &&
+ DAG.isSafeToSpeculativelyExecute(Opcode) && hasOperation(Opcode, VT)) {
+ // Match an extract element and get a shuffle mask equivalent.
+ SmallVector<int, 8> ShufMask(VT.getVectorNumElements(), -1);
+
+ for (int i : {0, 1}) {
+ // s2v (bo (extelt V, Idx), C) --> shuffle (bo V, C'), {Idx, -1, -1...}
+ // s2v (bo C, (extelt V, Idx)) --> shuffle (bo C', V), {Idx, -1, -1...}
+ SDValue EE = Scalar.getOperand(i);
+ auto *C = dyn_cast<ConstantSDNode>(Scalar.getOperand(i ? 0 : 1));
+ if (C && EE.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ EE.getOperand(0).getValueType() == VT &&
+ isa<ConstantSDNode>(EE.getOperand(1))) {
+ // Mask = {ExtractIndex, undef, undef....}
+ ShufMask[0] = EE.getConstantOperandVal(1);
+ // Make sure the shuffle is legal if we are crossing lanes.
+ if (TLI.isShuffleMaskLegal(ShufMask, VT)) {
+ SDLoc DL(N);
+ SDValue V[] = {EE.getOperand(0),
+ DAG.getConstant(C->getAPIntValue(), DL, VT)};
+ SDValue VecBO = DAG.getNode(Opcode, DL, VT, V[i], V[1 - i]);
+ return DAG.getVectorShuffle(VT, DL, VecBO, DAG.getUNDEF(VT),
+ ShufMask);
+ }
+ }
+ }
+ }
+
+ // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
+ // with a VECTOR_SHUFFLE and possible truncate.
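+ // For example (illustrative):
+ //   v4i32 scalar_to_vector(extract_vector_elt(v4i32 V, 2))
+ //     --> shuffle(V, undef, <2,u,u,u>)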
+ if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
+ !Scalar.getOperand(0).getValueType().isFixedLengthVector())
+ return SDValue();
+
+ // If we have an implicit truncate, truncate here if it is legal.
+ if (VecEltVT != Scalar.getValueType() &&
+ Scalar.getValueType().isScalarInteger() && isTypeLegal(VecEltVT)) {
+ SDValue Val = DAG.getNode(ISD::TRUNCATE, SDLoc(Scalar), VecEltVT, Scalar);
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
+ }
+
+ auto *ExtIndexC = dyn_cast<ConstantSDNode>(Scalar.getOperand(1));
+ if (!ExtIndexC)
+ return SDValue();
+
+ SDValue SrcVec = Scalar.getOperand(0);
+ EVT SrcVT = SrcVec.getValueType();
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
+ unsigned VTNumElts = VT.getVectorNumElements();
+ if (VecEltVT == SrcVT.getScalarType() && VTNumElts <= SrcNumElts) {
+ // Create a shuffle equivalent for scalar-to-vector: {ExtIndex, -1, -1, ...}
+ SmallVector<int, 8> Mask(SrcNumElts, -1);
+ Mask[0] = ExtIndexC->getZExtValue();
+ SDValue LegalShuffle = TLI.buildLegalVectorShuffle(
+ SrcVT, SDLoc(N), SrcVec, DAG.getUNDEF(SrcVT), Mask, DAG);
+ if (!LegalShuffle)
+ return SDValue();
+
+ // If the initial vector is the same size, the shuffle is the result.
+ if (VT == SrcVT)
+ return LegalShuffle;
+
+ // If not, shorten the shuffled vector.
+ if (VTNumElts != SrcNumElts) {
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N));
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(),
+ SrcVT.getVectorElementType(), VTNumElts);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, LegalShuffle,
+ ZeroIdx);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ uint64_t InsIdx = N->getConstantOperandVal(2);
+
+ // If inserting an UNDEF, just return the original vector.
+ if (N1.isUndef())
+ return N0;
+
+ // If this is an insert of an extracted vector into an undef vector, we can
+ // just use the input to the extract.
+ if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
+ return N1.getOperand(0);
+
+ // Simplify scalar inserts into an undef vector:
+ // insert_subvector undef, (splat X), N2 -> splat X
+ if (N0.isUndef() && N1.getOpcode() == ISD::SPLAT_VECTOR)
+ return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, N1.getOperand(0));
+
+ // If we are inserting a bitcast value into an undef, with the same
+ // number of elements, just use the bitcast input of the extract.
+ // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
+ // BITCAST (INSERT_SUBVECTOR UNDEF N1 N2)
+ if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
+ N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOperand(0).getOperand(1) == N2 &&
+ N1.getOperand(0).getOperand(0).getValueType().getVectorElementCount() ==
+ VT.getVectorElementCount() &&
+ N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
+ VT.getSizeInBits()) {
+ return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
+ }
+
+ // If both N0 and N1 are bitcast values on which insert_subvector
+ // would make sense, pull the bitcast through.
+ // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
+ // BITCAST (INSERT_SUBVECTOR N0 N1 N2)
+ if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
+ SDValue CN0 = N0.getOperand(0);
+ SDValue CN1 = N1.getOperand(0);
+ EVT CN0VT = CN0.getValueType();
+ EVT CN1VT = CN1.getValueType();
+ if (CN0VT.isVector() && CN1VT.isVector() &&
+ CN0VT.getVectorElementType() == CN1VT.getVectorElementType() &&
+ CN0VT.getVectorElementCount() == VT.getVectorElementCount()) {
+ SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
+ CN0.getValueType(), CN0, CN1, N2);
+ return DAG.getBitcast(VT, NewINSERT);
+ }
+ }
+
+ // Combine INSERT_SUBVECTORs where we are inserting to the same index.
+ // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
+ // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
+ if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ N0.getOperand(1).getValueType() == N1.getValueType() &&
+ N0.getOperand(2) == N2)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
+ N1, N2);
+
+ // Eliminate an intermediate insert into an undef vector:
+ // insert_subvector undef, (insert_subvector undef, X, 0), N2 -->
+ // insert_subvector undef, X, N2
+ if (N0.isUndef() && N1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ N1.getOperand(0).isUndef() && isNullConstant(N1.getOperand(2)))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0,
+ N1.getOperand(1), N2);
+
+ // Push subvector bitcasts to the output, adjusting the index as we go.
+ // insert_subvector(bitcast(v), bitcast(s), c1)
+ // -> bitcast(insert_subvector(v, s, c2))
+ if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) &&
+ N1.getOpcode() == ISD::BITCAST) {
+ SDValue N0Src = peekThroughBitcasts(N0);
+ SDValue N1Src = peekThroughBitcasts(N1);
+ EVT N0SrcSVT = N0Src.getValueType().getScalarType();
+ EVT N1SrcSVT = N1Src.getValueType().getScalarType();
+ if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
+ N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
+ EVT NewVT;
+ SDLoc DL(N);
+ SDValue NewIdx;
+ LLVMContext &Ctx = *DAG.getContext();
+ ElementCount NumElts = VT.getVectorElementCount();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
+ unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
+ NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
+ NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL);
+ } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
+ unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
+ if (NumElts.isKnownMultipleOf(Scale) && (InsIdx % Scale) == 0) {
+ NewVT = EVT::getVectorVT(Ctx, N1SrcSVT,
+ NumElts.divideCoefficientBy(Scale));
+ NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL);
+ }
+ }
+ if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
+ SDValue Res = DAG.getBitcast(NewVT, N0Src);
+ Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+
+ // Canonicalize insert_subvector dag nodes.
+ // Example:
+  // (insert_subvector (insert_subvector A, B, Idx0), C, Idx1)
+  // -> (insert_subvector (insert_subvector A, C, Idx1), B, Idx0)
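+  // That is, when the outer index is smaller than the inner one the two
+  // inserts are swapped, so indices end up increasing from the innermost
+  // insert outward (e.g. index 0 inside, index 4 outside).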
+ if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
+ N1.getValueType() == N0.getOperand(1).getValueType()) {
+ unsigned OtherIdx = N0.getConstantOperandVal(2);
+ if (InsIdx < OtherIdx) {
+ // Swap nodes.
+ SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
+ N0.getOperand(0), N1, N2);
+ AddToWorklist(NewOp.getNode());
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
+ VT, NewOp, N0.getOperand(1), N0.getOperand(2));
+ }
+ }
+
+ // If the input vector is a concatenation, and the insert replaces
+ // one of the pieces, we can optimize into a single concat_vectors.
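+  // e.g. if VT = v16i32, N0 = concat_vectors(A, B, C, D) of four v4i32 values
+  // and a v4i32 N1 is inserted at index 8, the result is
+  // concat_vectors(A, B, N1, D).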
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
+ N0.getOperand(0).getValueType() == N1.getValueType() &&
+ N0.getOperand(0).getValueType().isScalableVector() ==
+ N1.getValueType().isScalableVector()) {
+ unsigned Factor = N1.getValueType().getVectorMinNumElements();
+ SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
+ Ops[InsIdx / Factor] = N1;
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+ }
+
+ // Simplify source operands based on insertion.
+ if (SimplifyDemandedVectorElts(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // fold (fp_to_fp16 (fp16_to_fp op)) -> op
+ if (N0->getOpcode() == ISD::FP16_TO_FP)
+ return N0->getOperand(0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
+ if (!TLI.shouldKeepZExtForFP16Conv() && N0->getOpcode() == ISD::AND) {
+ ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
+ if (AndConst && AndConst->getAPIntValue() == 0xffff) {
+ return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
+ N0.getOperand(0));
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // fold (fp_to_bf16 (bf16_to_fp op)) -> op
+ if (N0->getOpcode() == ISD::BF16_TO_FP)
+ return N0->getOperand(0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0.getValueType();
+ unsigned Opcode = N->getOpcode();
+
+ // VECREDUCE over 1-element vector is just an extract.
+ if (VT.getVectorElementCount().isScalar()) {
+ SDLoc dl(N);
+ SDValue Res =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
+ DAG.getVectorIdxConstant(0, dl));
+ if (Res.getValueType() != N->getValueType(0))
+ Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
+ return Res;
+ }
+
+  // On a boolean vector an and/or reduction is the same as a umin/umax
+ // reduction. Convert them if the latter is legal while the former isn't.
+ if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) {
+ unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND
+ ? ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX;
+ if (!TLI.isOperationLegalOrCustom(Opcode, VT) &&
+ TLI.isOperationLegalOrCustom(NewOpcode, VT) &&
+ DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits())
+ return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0);
+ }
+
+ // vecreduce_or(insert_subvector(zero or undef, val)) -> vecreduce_or(val)
+ // vecreduce_and(insert_subvector(ones or undef, val)) -> vecreduce_and(val)
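+  // e.g. vecreduce_or (insert_subvector (all-zeros v8i32), (v4i32 Val), Idx)
+  // reduces to vecreduce_or(Val), since OR-ing in the known-zero lanes cannot
+  // change the result; the AND/all-ones case is symmetric.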
+ if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
+ SDValue Vec = N0.getOperand(0);
+ SDValue Subvec = N0.getOperand(1);
+ if ((Opcode == ISD::VECREDUCE_OR &&
+ (N0.getOperand(0).isUndef() || isNullOrNullSplat(Vec))) ||
+ (Opcode == ISD::VECREDUCE_AND &&
+ (N0.getOperand(0).isUndef() || isAllOnesOrAllOnesSplat(Vec))))
+ return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), Subvec);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVP_FSUB(SDNode *N) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
+
+ // FSUB -> FMA combines:
+ if (SDValue Fused = visitFSUBForFMACombine<VPMatchContext>(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVPOp(SDNode *N) {
+
+ if (N->getOpcode() == ISD::VP_GATHER)
+ if (SDValue SD = visitVPGATHER(N))
+ return SD;
+
+ if (N->getOpcode() == ISD::VP_SCATTER)
+ if (SDValue SD = visitVPSCATTER(N))
+ return SD;
+
+ // VP operations in which all vector elements are disabled - either by
+ // determining that the mask is all false or that the EVL is 0 - can be
+ // eliminated.
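+  // e.g. a VP_ADD whose mask is a splat of false, or whose EVL operand is the
+  // constant 0, computes no lanes; it is replaced below by UNDEF (binary ops),
+  // by its chain (stores), or by its start operand (reductions).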
+ bool AreAllEltsDisabled = false;
+ if (auto EVLIdx = ISD::getVPExplicitVectorLengthIdx(N->getOpcode()))
+ AreAllEltsDisabled |= isNullConstant(N->getOperand(*EVLIdx));
+ if (auto MaskIdx = ISD::getVPMaskIdx(N->getOpcode()))
+ AreAllEltsDisabled |=
+ ISD::isConstantSplatVectorAllZeros(N->getOperand(*MaskIdx).getNode());
+
+ // This is the only generic VP combine we support for now.
+ if (!AreAllEltsDisabled) {
+ switch (N->getOpcode()) {
+ case ISD::VP_FADD:
+ return visitVP_FADD(N);
+ case ISD::VP_FSUB:
+ return visitVP_FSUB(N);
+ case ISD::VP_FMA:
+ return visitFMA<VPMatchContext>(N);
+ }
+ return SDValue();
+ }
+
+ // Binary operations can be replaced by UNDEF.
+ if (ISD::isVPBinaryOp(N->getOpcode()))
+ return DAG.getUNDEF(N->getValueType(0));
+
+ // VP Memory operations can be replaced by either the chain (stores) or the
+ // chain + undef (loads).
+ if (const auto *MemSD = dyn_cast<MemSDNode>(N)) {
+ if (MemSD->writeMem())
+ return MemSD->getChain();
+ return CombineTo(N, DAG.getUNDEF(N->getValueType(0)), MemSD->getChain());
+ }
+
+ // Reduction operations return the start operand when no elements are active.
+ if (ISD::isVPReduction(N->getOpcode()))
+ return N->getOperand(0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitGET_FPENV_MEM(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Ptr = N->getOperand(1);
+ EVT MemVT = cast<FPStateAccessSDNode>(N)->getMemoryVT();
+
+  // Check if the memory where the FP state is written is used only in a
+  // single load operation.
+ LoadSDNode *LdNode = nullptr;
+ for (auto *U : Ptr->uses()) {
+ if (U == N)
+ continue;
+ if (auto *Ld = dyn_cast<LoadSDNode>(U)) {
+ if (LdNode && LdNode != Ld)
+ return SDValue();
+ LdNode = Ld;
+ continue;
+ }
+ return SDValue();
+ }
+ if (!LdNode || !LdNode->isSimple() || LdNode->isIndexed() ||
+ !LdNode->getOffset().isUndef() || LdNode->getMemoryVT() != MemVT ||
+ !LdNode->getChain().reachesChainWithoutSideEffects(SDValue(N, 0)))
+ return SDValue();
+
+ // Check if the loaded value is used only in a store operation.
+ StoreSDNode *StNode = nullptr;
+ for (auto I = LdNode->use_begin(), E = LdNode->use_end(); I != E; ++I) {
+ SDUse &U = I.getUse();
+ if (U.getResNo() == 0) {
+ if (auto *St = dyn_cast<StoreSDNode>(U.getUser())) {
+ if (StNode)
+ return SDValue();
+ StNode = St;
+ } else {
+ return SDValue();
+ }
+ }
+ }
+ if (!StNode || !StNode->isSimple() || StNode->isIndexed() ||
+ !StNode->getOffset().isUndef() || StNode->getMemoryVT() != MemVT ||
+ !StNode->getChain().reachesChainWithoutSideEffects(SDValue(LdNode, 1)))
+ return SDValue();
+
+ // Create new node GET_FPENV_MEM, which uses the store address to write FP
+ // environment.
+ SDValue Res = DAG.getGetFPEnv(Chain, SDLoc(N), StNode->getBasePtr(), MemVT,
+ StNode->getMemOperand());
+ CombineTo(StNode, Res, false);
+ return Res;
+}
+
+SDValue DAGCombiner::visitSET_FPENV_MEM(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Ptr = N->getOperand(1);
+ EVT MemVT = cast<FPStateAccessSDNode>(N)->getMemoryVT();
+
+  // Check if the address of the FP state is used only in a single store
+  // operation.
+ StoreSDNode *StNode = nullptr;
+ for (auto *U : Ptr->uses()) {
+ if (U == N)
+ continue;
+ if (auto *St = dyn_cast<StoreSDNode>(U)) {
+ if (StNode && StNode != St)
+ return SDValue();
+ StNode = St;
+ continue;
+ }
+ return SDValue();
+ }
+ if (!StNode || !StNode->isSimple() || StNode->isIndexed() ||
+ !StNode->getOffset().isUndef() || StNode->getMemoryVT() != MemVT ||
+ !Chain.reachesChainWithoutSideEffects(SDValue(StNode, 0)))
+ return SDValue();
+
+ // Check if the stored value is loaded from some location and the loaded
+ // value is used only in the store operation.
+ SDValue StValue = StNode->getValue();
+ auto *LdNode = dyn_cast<LoadSDNode>(StValue);
+ if (!LdNode || !LdNode->isSimple() || LdNode->isIndexed() ||
+ !LdNode->getOffset().isUndef() || LdNode->getMemoryVT() != MemVT ||
+ !StNode->getChain().reachesChainWithoutSideEffects(SDValue(LdNode, 1)))
+ return SDValue();
+
+ // Create new node SET_FPENV_MEM, which uses the load address to read FP
+ // environment.
+ SDValue Res =
+ DAG.getSetFPEnv(LdNode->getChain(), SDLoc(N), LdNode->getBasePtr(), MemVT,
+ LdNode->getMemOperand());
+ return Res;
+}
+
+/// Returns a vector_shuffle if it is able to transform an AND to a
+/// vector_shuffle with the destination vector and a zero vector.
+/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0> ==>
+/// vector_shuffle V, Zero, <0, 4, 2, 4>
+SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
+
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = peekThroughBitcasts(N->getOperand(1));
+ SDLoc DL(N);
+
+ // Make sure we're not running after operation legalization where it
+ // may have custom lowered the vector shuffles.
+ if (LegalOperations)
+ return SDValue();
+
+ if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ EVT RVT = RHS.getValueType();
+ unsigned NumElts = RHS.getNumOperands();
+
+  // Attempt to create a valid clear mask, splitting the mask into sub
+  // elements and checking to see if each is all zeros or all ones -
+  // suitable for shuffle masking.
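+  // e.g. on a little-endian target, (and (v2i64 X), <0x00000000FFFFFFFF,
+  // 0xFFFFFFFF00000000>) has no valid 64-bit clear mask, but splitting into
+  // 32-bit sub-elements yields the v4i32 shuffle mask <0, 5, 6, 3> against a
+  // zero vector (indices >= 4 pick zero lanes).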
+ auto BuildClearMask = [&](int Split) {
+ int NumSubElts = NumElts * Split;
+ int NumSubBits = RVT.getScalarSizeInBits() / Split;
+
+ SmallVector<int, 8> Indices;
+ for (int i = 0; i != NumSubElts; ++i) {
+ int EltIdx = i / Split;
+ int SubIdx = i % Split;
+ SDValue Elt = RHS.getOperand(EltIdx);
+ // X & undef --> 0 (not undef). So this lane must be converted to choose
+ // from the zero constant vector (same as if the element had all 0-bits).
+ if (Elt.isUndef()) {
+ Indices.push_back(i + NumSubElts);
+ continue;
+ }
+
+ APInt Bits;
+ if (isa<ConstantSDNode>(Elt))
+ Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
+ else if (isa<ConstantFPSDNode>(Elt))
+ Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
+ else
+ return SDValue();
+
+ // Extract the sub element from the constant bit mask.
+ if (DAG.getDataLayout().isBigEndian())
+ Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits);
+ else
+ Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits);
+
+ if (Bits.isAllOnes())
+ Indices.push_back(i);
+ else if (Bits == 0)
+ Indices.push_back(i + NumSubElts);
+ else
+ return SDValue();
+ }
+
+ // Let's see if the target supports this vector_shuffle.
+ EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
+ EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
+ if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
+ return SDValue();
+
+ SDValue Zero = DAG.getConstant(0, DL, ClearVT);
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL,
+ DAG.getBitcast(ClearVT, LHS),
+ Zero, Indices));
+ };
+
+ // Determine maximum split level (byte level masking).
+ int MaxSplit = 1;
+ if (RVT.getScalarSizeInBits() % 8 == 0)
+ MaxSplit = RVT.getScalarSizeInBits() / 8;
+
+ for (int Split = 1; Split <= MaxSplit; ++Split)
+ if (RVT.getScalarSizeInBits() % Split == 0)
+ if (SDValue S = BuildClearMask(Split))
+ return S;
+
+ return SDValue();
+}
+
+/// If a vector binop is performed on splat values, it may be profitable to
+/// extract, scalarize, and insert/splat.
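+/// e.g. (add (splat_vector X), (splat_vector Y)) can become
+/// (splat_vector (add X, Y)): one scalar add plus a splat rather than a
+/// full-width vector add, when the scalar operation is legal or custom.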
+static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
+ const SDLoc &DL) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned Opcode = N->getOpcode();
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // TODO: Remove/replace the extract cost check? If the elements are available
+ // as scalars, then there may be no extract cost. Should we ask if
+ // inserting a scalar back into a vector is cheap instead?
+ int Index0, Index1;
+ SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
+ SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
+ // Extract element from splat_vector should be free.
+ // TODO: use DAG.isSplatValue instead?
+ bool IsBothSplatVector = N0.getOpcode() == ISD::SPLAT_VECTOR &&
+ N1.getOpcode() == ISD::SPLAT_VECTOR;
+ if (!Src0 || !Src1 || Index0 != Index1 ||
+ Src0.getValueType().getVectorElementType() != EltVT ||
+ Src1.getValueType().getVectorElementType() != EltVT ||
+ !(IsBothSplatVector || TLI.isExtractVecEltCheap(VT, Index0)) ||
+ !TLI.isOperationLegalOrCustom(Opcode, EltVT))
+ return SDValue();
+
+ SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
+ SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
+ SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
+ SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
+
+ // If all lanes but 1 are undefined, no need to splat the scalar result.
+ // TODO: Keep track of undefs and use that info in the general case.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
+ count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
+ count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
+ // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
+ // build_vec ..undef, (bo X, Y), undef...
+ SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
+ Ops[Index0] = ScalarBO;
+ return DAG.getBuildVector(VT, DL, Ops);
+ }
+
+ // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
+ return DAG.getSplat(VT, DL, ScalarBO);
+}
+
+/// Visit a vector cast operation, like FP_EXTEND.
+SDValue DAGCombiner::SimplifyVCastOp(SDNode *N, const SDLoc &DL) {
+ EVT VT = N->getValueType(0);
+ assert(VT.isVector() && "SimplifyVCastOp only works on vectors!");
+ EVT EltVT = VT.getVectorElementType();
+ unsigned Opcode = N->getOpcode();
+
+ SDValue N0 = N->getOperand(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // TODO: Promoting the operation might also be good here?
+ int Index0;
+ SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
+ if (Src0 &&
+ (N0.getOpcode() == ISD::SPLAT_VECTOR ||
+ TLI.isExtractVecEltCheap(VT, Index0)) &&
+ TLI.isOperationLegalOrCustom(Opcode, EltVT) &&
+ TLI.preferScalarizeSplat(N)) {
+ EVT SrcVT = N0.getValueType();
+ EVT SrcEltVT = SrcVT.getVectorElementType();
+ SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
+ SDValue Elt =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcEltVT, Src0, IndexC);
+ SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, Elt, N->getFlags());
+ if (VT.isScalableVector())
+ return DAG.getSplatVector(VT, DL, ScalarBO);
+ SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
+ return DAG.getBuildVector(VT, DL, Ops);
+ }
+
+ return SDValue();
+}
+
+/// Visit a binary vector operation, like ADD.
+SDValue DAGCombiner::SimplifyVBinOp(SDNode *N, const SDLoc &DL) {
+ EVT VT = N->getValueType(0);
+ assert(VT.isVector() && "SimplifyVBinOp only works on vectors!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned Opcode = N->getOpcode();
+ SDNodeFlags Flags = N->getFlags();
+
+ // Move unary shuffles with identical masks after a vector binop:
+ // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask))
+ // --> shuffle (VBinOp A, B), Undef, Mask
+ // This does not require type legality checks because we are creating the
+ // same types of operations that are in the original sequence. We do have to
+ // restrict ops like integer div that have immediate UB (eg, div-by-zero)
+ // though. This code is adapted from the identical transform in instcombine.
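+  // e.g. (add (shuffle A, undef, <1,0,3,2>), (shuffle B, undef, <1,0,3,2>))
+  // becomes (shuffle (add A, B), undef, <1,0,3,2>), saving one shuffle.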
+ if (DAG.isSafeToSpeculativelyExecute(Opcode)) {
+ auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
+ auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
+ if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
+ LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
+ (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
+ SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
+ RHS.getOperand(0), Flags);
+ SDValue UndefV = LHS.getOperand(1);
+ return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
+ }
+
+ // Try to sink a splat shuffle after a binop with a uniform constant.
+ // This is limited to cases where neither the shuffle nor the constant have
+ // undefined elements because that could be poison-unsafe or inhibit
+ // demanded elements analysis. It is further limited to not change a splat
+ // of an inserted scalar because that may be optimized better by
+ // load-folding or other target-specific behaviors.
+ if (isConstOrConstSplat(RHS) && Shuf0 && all_equal(Shuf0->getMask()) &&
+ Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
+ Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
+ // binop (splat X), (splat C) --> splat (binop X, C)
+ SDValue X = Shuf0->getOperand(0);
+ SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
+ return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
+ Shuf0->getMask());
+ }
+ if (isConstOrConstSplat(LHS) && Shuf1 && all_equal(Shuf1->getMask()) &&
+ Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
+ Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
+ // binop (splat C), (splat X) --> splat (binop C, X)
+ SDValue X = Shuf1->getOperand(0);
+ SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
+ return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
+ Shuf1->getMask());
+ }
+ }
+
+ // The following pattern is likely to emerge with vector reduction ops. Moving
+ // the binary operation ahead of insertion may allow using a narrower vector
+ // instruction that has better performance than the wide version of the op:
+ // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
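+  // e.g. for VT = v8i32 with X and Y of type v4i32 inserted at index 0, this
+  // becomes (ins (VBinOp undef, undef), (v4i32 VBinOp X, Y), 0); the narrow
+  // binop may map to a cheaper instruction, and VecC is computed explicitly
+  // because a binop of two undefs is not necessarily undef.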
+ if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
+ RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
+ LHS.getOperand(2) == RHS.getOperand(2) &&
+ (LHS.hasOneUse() || RHS.hasOneUse())) {
+ SDValue X = LHS.getOperand(1);
+ SDValue Y = RHS.getOperand(1);
+ SDValue Z = LHS.getOperand(2);
+ EVT NarrowVT = X.getValueType();
+ if (NarrowVT == Y.getValueType() &&
+ TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT,
+ LegalOperations)) {
+ // (binop undef, undef) may not return undef, so compute that result.
+ SDValue VecC =
+ DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
+ SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
+ }
+ }
+
+ // Make sure all but the first op are undef or constant.
+ auto ConcatWithConstantOrUndef = [](SDValue Concat) {
+ return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
+ all_of(drop_begin(Concat->ops()), [](const SDValue &Op) {
+ return Op.isUndef() ||
+ ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
+ });
+ };
+
+ // The following pattern is likely to emerge with vector reduction ops. Moving
+ // the binary operation ahead of the concat may allow using a narrower vector
+ // instruction that has better performance than the wide version of the op:
+ // VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
+ // concat (VBinOp X, Y), VecC
+ if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
+ (LHS.hasOneUse() || RHS.hasOneUse())) {
+ EVT NarrowVT = LHS.getOperand(0).getValueType();
+ if (NarrowVT == RHS.getOperand(0).getValueType() &&
+ TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
+ unsigned NumOperands = LHS.getNumOperands();
+ SmallVector<SDValue, 4> ConcatOps;
+ for (unsigned i = 0; i != NumOperands; ++i) {
+  // This constant folds for operands 1 and up.
+ ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
+ RHS.getOperand(i)));
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
+ }
+ }
+
+ if (SDValue V = scalarizeBinOpOfSplats(N, DAG, DL))
+ return V;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2) {
+ assert(N0.getOpcode() == ISD::SETCC &&
+ "First argument must be a SetCC node!");
+
+ SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
+ // If we got a simplified select_cc node back from SimplifySelectCC, then
+ // break it down into a new SETCC node, and a new SELECT node, and then return
+ // the SELECT node, since we were called with a SELECT node.
+ if (SCC.getNode()) {
+ // Check to see if we got a select_cc back (to turn into setcc/select).
+ // Otherwise, just return whatever node we got back, like fabs.
+ if (SCC.getOpcode() == ISD::SELECT_CC) {
+ const SDNodeFlags Flags = N0->getFlags();
+ SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
+ N0.getValueType(),
+ SCC.getOperand(0), SCC.getOperand(1),
+ SCC.getOperand(4), Flags);
+ AddToWorklist(SETCC.getNode());
+ SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
+ SCC.getOperand(2), SCC.getOperand(3));
+ SelectNode->setFlags(Flags);
+ return SelectNode;
+ }
+
+ return SCC;
+ }
+ return SDValue();
+}
+
+/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
+/// being selected between, see if we can simplify the select. Callers of this
+/// should assume that TheSelect is deleted if this returns true. As such, they
+/// should return the appropriate thing (e.g. the node) back to the top-level of
+/// the DAG combiner loop to avoid it being looked at.
+bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
+ SDValue RHS) {
+ // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
+ // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
+ if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
+ if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
+ // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
+ SDValue Sqrt = RHS;
+ ISD::CondCode CC;
+ SDValue CmpLHS;
+ const ConstantFPSDNode *Zero = nullptr;
+
+ if (TheSelect->getOpcode() == ISD::SELECT_CC) {
+ CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
+ CmpLHS = TheSelect->getOperand(0);
+ Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
+ } else {
+ // SELECT or VSELECT
+ SDValue Cmp = TheSelect->getOperand(0);
+ if (Cmp.getOpcode() == ISD::SETCC) {
+ CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
+ CmpLHS = Cmp.getOperand(0);
+ Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
+ }
+ }
+ if (Zero && Zero->isZero() &&
+ Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
+ CC == ISD::SETULT || CC == ISD::SETLT)) {
+ // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
+ CombineTo(TheSelect, Sqrt);
+ return true;
+ }
+ }
+ }
+ // Cannot simplify select with vector condition
+ if (TheSelect->getOperand(0).getValueType().isVector()) return false;
+
+ // If this is a select from two identical things, try to pull the operation
+ // through the select.
+ if (LHS.getOpcode() != RHS.getOpcode() ||
+ !LHS.hasOneUse() || !RHS.hasOneUse())
+ return false;
+
+ // If this is a load and the token chain is identical, replace the select
+ // of two loads with a load through a select of the address to load from.
+ // This triggers in things like "select bool X, 10.0, 123.0" after the FP
+ // constants have been dropped into the constant pool.
+ if (LHS.getOpcode() == ISD::LOAD) {
+ LoadSDNode *LLD = cast<LoadSDNode>(LHS);
+ LoadSDNode *RLD = cast<LoadSDNode>(RHS);
+
+ // Token chains must be identical.
+ if (LHS.getOperand(0) != RHS.getOperand(0) ||
+ // Do not let this transformation reduce the number of volatile loads.
+ // Be conservative for atomics for the moment
+ // TODO: This does appear to be legal for unordered atomics (see D66309)
+ !LLD->isSimple() || !RLD->isSimple() ||
+ // FIXME: If either is a pre/post inc/dec load,
+ // we'd need to split out the address adjustment.
+ LLD->isIndexed() || RLD->isIndexed() ||
+ // If this is an EXTLOAD, the VT's must match.
+ LLD->getMemoryVT() != RLD->getMemoryVT() ||
+ // If this is an EXTLOAD, the kind of extension must match.
+ (LLD->getExtensionType() != RLD->getExtensionType() &&
+ // The only exception is if one of the extensions is anyext.
+ LLD->getExtensionType() != ISD::EXTLOAD &&
+ RLD->getExtensionType() != ISD::EXTLOAD) ||
+ // FIXME: this discards src value information. This is
+ // over-conservative. It would be beneficial to be able to remember
+ // both potential memory locations. Since we are discarding
+ // src value info, don't do the transformation if the memory
+ // locations are not in the default address space.
+ LLD->getPointerInfo().getAddrSpace() != 0 ||
+ RLD->getPointerInfo().getAddrSpace() != 0 ||
+ // We can't produce a CMOV of a TargetFrameIndex since we won't
+ // generate the address generation required.
+ LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
+ RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
+ !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
+ LLD->getBasePtr().getValueType()))
+ return false;
+
+ // The loads must not depend on one another.
+ if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD))
+ return false;
+
+ // Check that the select condition doesn't reach either load. If so,
+ // folding this will induce a cycle into the DAG. If not, this is safe to
+ // xform, so create a select of the addresses.
+
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+
+ // Always fail if LLD and RLD are not independent. TheSelect is a
+ // predecessor to all Nodes in question so we need not search past it.
+
+ Visited.insert(TheSelect);
+ Worklist.push_back(LLD);
+ Worklist.push_back(RLD);
+
+ if (SDNode::hasPredecessorHelper(LLD, Visited, Worklist) ||
+ SDNode::hasPredecessorHelper(RLD, Visited, Worklist))
+ return false;
+
+ SDValue Addr;
+ if (TheSelect->getOpcode() == ISD::SELECT) {
+ // We cannot do this optimization if any pair of {RLD, LLD} is a
+ // predecessor to {RLD, LLD, CondNode}. As we've already compared the
+ // Loads, we only need to check if CondNode is a successor to one of the
+ // loads. We can further avoid this if there's no use of their chain
+ // value.
+ SDNode *CondNode = TheSelect->getOperand(0).getNode();
+ Worklist.push_back(CondNode);
+
+ if ((LLD->hasAnyUseOfValue(1) &&
+ SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
+ (RLD->hasAnyUseOfValue(1) &&
+ SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
+ return false;
+
+ Addr = DAG.getSelect(SDLoc(TheSelect),
+ LLD->getBasePtr().getValueType(),
+ TheSelect->getOperand(0), LLD->getBasePtr(),
+ RLD->getBasePtr());
+ } else { // Otherwise SELECT_CC
+ // We cannot do this optimization if any pair of {RLD, LLD} is a
+ // predecessor to {RLD, LLD, CondLHS, CondRHS}. As we've already compared
+ // the Loads, we only need to check if CondLHS/CondRHS is a successor to
+ // one of the loads. We can further avoid this if there's no use of their
+ // chain value.
+
+ SDNode *CondLHS = TheSelect->getOperand(0).getNode();
+ SDNode *CondRHS = TheSelect->getOperand(1).getNode();
+ Worklist.push_back(CondLHS);
+ Worklist.push_back(CondRHS);
+
+ if ((LLD->hasAnyUseOfValue(1) &&
+ SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
+ (RLD->hasAnyUseOfValue(1) &&
+ SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
+ return false;
+
+ Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
+ LLD->getBasePtr().getValueType(),
+ TheSelect->getOperand(0),
+ TheSelect->getOperand(1),
+ LLD->getBasePtr(), RLD->getBasePtr(),
+ TheSelect->getOperand(4));
+ }
+
+ SDValue Load;
+ // It is safe to replace the two loads if they have different alignments,
+ // but the new load must be the minimum (most restrictive) alignment of the
+ // inputs.
+ Align Alignment = std::min(LLD->getAlign(), RLD->getAlign());
+ MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
+ if (!RLD->isInvariant())
+ MMOFlags &= ~MachineMemOperand::MOInvariant;
+ if (!RLD->isDereferenceable())
+ MMOFlags &= ~MachineMemOperand::MODereferenceable;
+ if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
+ // FIXME: Discards pointer and AA info.
+ Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
+ LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
+ MMOFlags);
+ } else {
+ // FIXME: Discards pointer and AA info.
+ Load = DAG.getExtLoad(
+ LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
+ : LLD->getExtensionType(),
+ SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
+ MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
+ }
+
+ // Users of the select now use the result of the load.
+ CombineTo(TheSelect, Load);
+
+ // Users of the old loads now use the new load's chain. We know the
+ // old-load value is dead now.
+ CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
+ CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
+ return true;
+ }
+
+ return false;
+}
+
+/// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and
+/// bitwise 'and'.
+SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
+ SDValue N1, SDValue N2, SDValue N3,
+ ISD::CondCode CC) {
+ // If this is a select where the false operand is zero and the compare is a
+ // check of the sign bit, see if we can perform the "gzip trick":
+ // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
+ // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
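+  // e.g. for i32: (select_cc setlt X, 0, A, 0) becomes (and (sra X, 31), A),
+  // since (sra X, 31) is all-ones exactly when X is negative and zero
+  // otherwise.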
+ EVT XType = N0.getValueType();
+ EVT AType = N2.getValueType();
+ if (!isNullConstant(N3) || !XType.bitsGE(AType))
+ return SDValue();
+
+ // If the comparison is testing for a positive value, we have to invert
+ // the sign bit mask, so only do that transform if the target has a bitwise
+ // 'and not' instruction (the invert is free).
+ if (CC == ISD::SETGT && TLI.hasAndNot(N2)) {
+ // (X > -1) ? A : 0
+ // (X > 0) ? X : 0 <-- This is canonical signed max.
+ if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2)))
+ return SDValue();
+ } else if (CC == ISD::SETLT) {
+ // (X < 0) ? A : 0
+ // (X < 1) ? X : 0 <-- This is un-canonicalized signed min.
+ if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2)))
+ return SDValue();
+ } else {
+ return SDValue();
+ }
+
+ // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
+ // constant.
+ EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
+ auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+ if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
+ unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
+ if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) {
+ SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt);
+ AddToWorklist(Shift.getNode());
+
+ if (XType.bitsGT(AType)) {
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+ AddToWorklist(Shift.getNode());
+ }
+
+ if (CC == ISD::SETGT)
+ Shift = DAG.getNOT(DL, Shift, AType);
+
+ return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+ }
+ }
+
+ unsigned ShCt = XType.getSizeInBits() - 1;
+ if (TLI.shouldAvoidTransformToShift(XType, ShCt))
+ return SDValue();
+
+ SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
+ SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt);
+ AddToWorklist(Shift.getNode());
+
+ if (XType.bitsGT(AType)) {
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+ AddToWorklist(Shift.getNode());
+ }
+
+ if (CC == ISD::SETGT)
+ Shift = DAG.getNOT(DL, Shift, AType);
+
+ return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+}
+
+// Fold select(cc, binop(), binop()) -> binop(select(), select()) etc.
+SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ unsigned BinOpc = N1.getOpcode();
+ if (!TLI.isBinOp(BinOpc) || (N2.getOpcode() != BinOpc))
+ return SDValue();
+
+ // The use checks are intentionally on SDNode because we may be dealing
+ // with opcodes that produce more than one SDValue.
+ // TODO: Do we really need to check N0 (the condition operand of the select)?
+ // But removing that clause could cause an infinite loop...
+ if (!N0->hasOneUse() || !N1->hasOneUse() || !N2->hasOneUse())
+ return SDValue();
+
+ // Binops may include opcodes that return multiple values, so all values
+ // must be created/propagated from the newly created binops below.
+ SDVTList OpVTs = N1->getVTList();
+
+ // Fold select(cond, binop(x, y), binop(z, y))
+ // --> binop(select(cond, x, z), y)
+ if (N1.getOperand(1) == N2.getOperand(1)) {
+ SDValue NewSel =
+ DAG.getSelect(DL, VT, N0, N1.getOperand(0), N2.getOperand(0));
+ SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, NewSel, N1.getOperand(1));
+ NewBinOp->setFlags(N1->getFlags());
+ NewBinOp->intersectFlagsWith(N2->getFlags());
+ return NewBinOp;
+ }
+
+ // Fold select(cond, binop(x, y), binop(x, z))
+ // --> binop(x, select(cond, y, z))
+ // Second op VT might be different (e.g. shift amount type)
+ if (N1.getOperand(0) == N2.getOperand(0) &&
+ VT == N1.getOperand(1).getValueType() &&
+ VT == N2.getOperand(1).getValueType()) {
+ SDValue NewSel =
+ DAG.getSelect(DL, VT, N0, N1.getOperand(1), N2.getOperand(1));
+ SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, N1.getOperand(0), NewSel);
+ NewBinOp->setFlags(N1->getFlags());
+ NewBinOp->intersectFlagsWith(N2->getFlags());
+ return NewBinOp;
+ }
+
+ // TODO: Handle isCommutativeBinOp patterns as well?
+ return SDValue();
+}
+
+// Transform (fneg/fabs (bitconvert x)) to avoid loading constant pool values.
+SDValue DAGCombiner::foldSignChangeInBitcast(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ bool IsFabs = N->getOpcode() == ISD::FABS;
+ bool IsFree = IsFabs ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT);
+
+ if (IsFree || N0.getOpcode() != ISD::BITCAST || !N0.hasOneUse())
+ return SDValue();
+
+ SDValue Int = N0.getOperand(0);
+ EVT IntVT = Int.getValueType();
+
+ // The operand to cast should be integer.
+ if (!IntVT.isInteger() || IntVT.isVector())
+ return SDValue();
+
+ // (fneg (bitconvert x)) -> (bitconvert (xor x sign))
+ // (fabs (bitconvert x)) -> (bitconvert (and x ~sign))
+ APInt SignMask;
+ if (N0.getValueType().isVector()) {
+ // For vector, create a sign mask (0x80...) or its inverse (for fabs,
+ // 0x7f...) per element and splat it.
+ SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
+ if (IsFabs)
+ SignMask = ~SignMask;
+ SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
+ } else {
+ // For scalar, just use the sign mask (0x80... or the inverse, 0x7f...)
+ SignMask = APInt::getSignMask(IntVT.getSizeInBits());
+ if (IsFabs)
+ SignMask = ~SignMask;
+ }
+ SDLoc DL(N0);
+ Int = DAG.getNode(IsFabs ? ISD::AND : ISD::XOR, DL, IntVT, Int,
+ DAG.getConstant(SignMask, DL, IntVT));
+ AddToWorklist(Int.getNode());
+ return DAG.getBitcast(VT, Int);
+}
+
+/// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4))"
+/// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
+/// in it. This may be a win when the constant is not otherwise available
+/// because it replaces two constant pool loads with one.
+SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
+ const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
+ ISD::CondCode CC) {
+ if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType()))
+ return SDValue();
+
+ // If we are before legalize types, we want the other legalization to happen
+ // first (for example, to avoid messing with soft float).
+ auto *TV = dyn_cast<ConstantFPSDNode>(N2);
+ auto *FV = dyn_cast<ConstantFPSDNode>(N3);
+ EVT VT = N2.getValueType();
+ if (!TV || !FV || !TLI.isTypeLegal(VT))
+ return SDValue();
+
+ // If a constant can be materialized without loads, this does not make sense.
+ if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal ||
+ TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) ||
+ TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize))
+ return SDValue();
+
+ // If both constants have multiple uses, then we won't need to do an extra
+ // load. The values are likely around in registers for other users.
+ if (!TV->hasOneUse() && !FV->hasOneUse())
+ return SDValue();
+
+ Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()),
+ const_cast<ConstantFP*>(TV->getConstantFPValue()) };
+ Type *FPTy = Elts[0]->getType();
+ const DataLayout &TD = DAG.getDataLayout();
+
+ // Create a ConstantArray of the two constants.
+ Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
+ SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
+ TD.getPrefTypeAlign(FPTy));
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
+
+ // Get offsets to the 0 and 1 elements of the array, so we can select between
+ // them.
+ SDValue Zero = DAG.getIntPtrConstant(0, DL);
+ unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
+ SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
+ SDValue Cond =
+ DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC);
+ AddToWorklist(Cond.getNode());
+ SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero);
+ AddToWorklist(CstOffset.getNode());
+ CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset);
+ AddToWorklist(CPIdx.getNode());
+ return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(
+ DAG.getMachineFunction()), Alignment);
+}
+
+/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
+/// where 'cond' is the comparison specified by CC.
+SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2, SDValue N3, ISD::CondCode CC,
+ bool NotExtCompare) {
+ // (x ? y : y) -> y.
+ if (N2 == N3) return N2;
+
+ EVT CmpOpVT = N0.getValueType();
+ EVT CmpResVT = getSetCCResultType(CmpOpVT);
+ EVT VT = N2.getValueType();
+ auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+ auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
+
+ // Determine if the condition we're dealing with is constant.
+ if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) {
+ AddToWorklist(SCC.getNode());
+ if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
+ // fold select_cc true, x, y -> x
+ // fold select_cc false, x, y -> y
+ return !(SCCC->isZero()) ? N2 : N3;
+ }
+ }
+
+ if (SDValue V =
+ convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
+ return V;
+
+ if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
+ return V;
+
+ // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (sra (shl x)) A)
+  // where y has a single bit set.
+  // Put plainly: we can turn the SELECT_CC into an AND
+ // when the condition can be materialized as an all-ones register. Any
+ // single bit-test can be materialized as an all-ones register with
+ // shift-left and shift-right-arith.
+ if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
+ N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
+ SDValue AndLHS = N0->getOperand(0);
+ auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (ConstAndRHS && ConstAndRHS->getAPIntValue().popcount() == 1) {
+ // Shift the tested bit over the sign bit.
+ const APInt &AndMask = ConstAndRHS->getAPIntValue();
+ unsigned ShCt = AndMask.getBitWidth() - 1;
+ if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
+ SDValue ShlAmt =
+ DAG.getConstant(AndMask.countl_zero(), SDLoc(AndLHS),
+ getShiftAmountTy(AndLHS.getValueType()));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
+
+ // Now arithmetic right shift it all the way over, so the result is
+ // either all-ones, or zero.
+ SDValue ShrAmt =
+ DAG.getConstant(ShCt, SDLoc(Shl),
+ getShiftAmountTy(Shl.getValueType()));
+ SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
+
+ return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
+ }
+ }
+ }
+
+ // fold select C, 16, 0 -> shl C, 4
+ bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
+ bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();
+
+ if ((Fold || Swap) &&
+ TLI.getBooleanContents(CmpOpVT) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {
+
+ if (Swap) {
+ CC = ISD::getSetCCInverse(CC, CmpOpVT);
+ std::swap(N2C, N3C);
+ }
+
+ // If the caller doesn't want us to simplify this into a zext of a compare,
+ // don't do it.
+ if (NotExtCompare && N2C->isOne())
+ return SDValue();
+
+ SDValue Temp, SCC;
+ // zext (setcc n0, n1)
+ if (LegalTypes) {
+ SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC);
+ if (VT.bitsLT(SCC.getValueType()))
+ Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT);
+ else
+ Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
+ } else {
+ SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
+ Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
+ }
+
+ AddToWorklist(SCC.getNode());
+ AddToWorklist(Temp.getNode());
+
+ if (N2C->isOne())
+ return Temp;
+
+ unsigned ShCt = N2C->getAPIntValue().logBase2();
+ if (TLI.shouldAvoidTransformToShift(VT, ShCt))
+ return SDValue();
+
+ // shl setcc result by log2 n2c
+ return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
+ DAG.getConstant(ShCt, SDLoc(Temp),
+ getShiftAmountTy(Temp.getValueType())));
+ }
+
+ // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
+ // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
+ // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
+ // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
+ // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
+ // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
+ // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
+ // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
+ if (N1C && N1C->isZero() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ SDValue ValueOnZero = N2;
+ SDValue Count = N3;
+    // If the condition is NE instead of EQ, swap the operands.
+ if (CC == ISD::SETNE)
+ std::swap(ValueOnZero, Count);
+    // Check if the value on zero is a constant equal to the bit width of the
+    // type.
+ if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
+ if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
+ // If the other operand is cttz/cttz_zero_undef of N0, and cttz is
+ // legal, combine to just cttz.
+ if ((Count.getOpcode() == ISD::CTTZ ||
+ Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
+ N0 == Count.getOperand(0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
+ return DAG.getNode(ISD::CTTZ, DL, VT, N0);
+ // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
+ // legal, combine to just ctlz.
+ if ((Count.getOpcode() == ISD::CTLZ ||
+ Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
+ N0 == Count.getOperand(0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
+ return DAG.getNode(ISD::CTLZ, DL, VT, N0);
+ }
+ }
+ }
+
+ // Fold select_cc setgt X, -1, C, ~C -> xor (ashr X, BW-1), C
+ // Fold select_cc setlt X, 0, C, ~C -> xor (ashr X, BW-1), ~C
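+  // e.g. for i32 with C = 5: (select_cc setgt X, -1, 5, -6) becomes
+  // (xor (sra X, 31), 5): the shift yields 0 when X >= 0 (xor gives 5) and
+  // all-ones when X < 0 (xor gives ~5 = -6).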
+ if (!NotExtCompare && N1C && N2C && N3C &&
+ N2C->getAPIntValue() == ~N3C->getAPIntValue() &&
+ ((N1C->isAllOnes() && CC == ISD::SETGT) ||
+ (N1C->isZero() && CC == ISD::SETLT)) &&
+ !TLI.shouldAvoidTransformToShift(VT, CmpOpVT.getScalarSizeInBits() - 1)) {
+ SDValue ASR = DAG.getNode(
+ ISD::SRA, DL, CmpOpVT, N0,
+ DAG.getConstant(CmpOpVT.getScalarSizeInBits() - 1, DL, CmpOpVT));
+ return DAG.getNode(ISD::XOR, DL, VT, DAG.getSExtOrTrunc(ASR, DL, VT),
+ DAG.getSExtOrTrunc(CC == ISD::SETLT ? N3 : N2, DL, VT));
+ }
+
+ if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG))
+ return S;
+ if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N2, N3, CC, DAG))
+ return S;
+
+ return SDValue();
+}
+
+/// This is a stub for TargetLowering::SimplifySetCC.
+SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, const SDLoc &DL,
+ bool foldBooleans) {
+ TargetLowering::DAGCombinerInfo
+ DagCombineInfo(DAG, Level, false, this);
+ return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+}
+
+/// Given an ISD::SDIV node expressing a divide by constant, return
+/// a DAG expression to select that will generate the same value by multiplying
+/// by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+SDValue DAGCombiner::BuildSDIV(SDNode *N) {
+  // When optimizing for minimum size, we don't want to expand a div to a mul
+  // and a shift.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return SDValue();
+
+ SmallVector<SDNode *, 8> Built;
+ if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) {
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+ }
+
+ return SDValue();
+}
+
+/// Given an ISD::SDIV node expressing a divide by constant power of 2, return a
+/// DAG expression that will generate the same value by right shifting.
+SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
+ ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
+ if (!C)
+ return SDValue();
+
+ // Avoid division by zero.
+ if (C->isZero())
+ return SDValue();
+
+ SmallVector<SDNode *, 8> Built;
+ if (SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built)) {
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+ }
+
+ return SDValue();
+}
+
+/// Given an ISD::UDIV node expressing a divide by constant, return a DAG
+/// expression that will generate the same value by multiplying by a magic
+/// number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+SDValue DAGCombiner::BuildUDIV(SDNode *N) {
+  // When optimizing for minimum size, we don't want to expand a div to a mul
+  // and a shift.
+ if (DAG.getMachineFunction().getFunction().hasMinSize())
+ return SDValue();
+
+ SmallVector<SDNode *, 8> Built;
+ if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+ }
+
+ return SDValue();
+}
+
+/// Given an ISD::SREM node expressing a remainder by constant power of 2,
+/// return a DAG expression that will generate the same value.
+SDValue DAGCombiner::BuildSREMPow2(SDNode *N) {
+ ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
+ if (!C)
+ return SDValue();
+
+ // Avoid division by zero.
+ if (C->isZero())
+ return SDValue();
+
+ SmallVector<SDNode *, 8> Built;
+ if (SDValue S = TLI.BuildSREMPow2(N, C->getAPIntValue(), DAG, Built)) {
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+ }
+
+ return SDValue();
+}
+
+/// Determines the LogBase2 value for a non-null input value using the
+/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
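+/// e.g. for an i32 value V = 16: ctlz(16) = 27, so (32 - 1) - 27 = 4, which
+/// is log2(16).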
+SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
+ EVT VT = V.getValueType();
+ SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V);
+ SDValue Base = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
+ SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz);
+ return LogBase2;
+}
+
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal, we need to find the zero of the function:
+/// F(X) = 1/X - A [which has a zero at X = 1/A]
+/// =>
+/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+/// does not require additional intermediate precision]
+/// For the last iteration, put numerator N into it to gain more precision:
+/// Result = N X_i + X_i (N - N A X_i)
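+/// e.g. for A = 4.0 and an initial estimate X_0 = 0.3:
+/// X_1 = 0.3 * (2 - 4 * 0.3) = 0.24, X_2 = 0.24 * (2 - 4 * 0.24) = 0.2496,
+/// converging quadratically toward 1/4 = 0.25.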
+SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
+ SDNodeFlags Flags) {
+ if (LegalDAG)
+ return SDValue();
+
+ // TODO: Handle extended types?
+ EVT VT = Op.getValueType();
+ if (VT.getScalarType() != MVT::f16 && VT.getScalarType() != MVT::f32 &&
+ VT.getScalarType() != MVT::f64)
+ return SDValue();
+
+ // If estimates are explicitly disabled for this function, we're done.
+ MachineFunction &MF = DAG.getMachineFunction();
+ int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
+ if (Enabled == TLI.ReciprocalEstimate::Disabled)
+ return SDValue();
+
+ // Estimates may be explicitly enabled for this type with a custom number of
+ // refinement steps.
+ int Iterations = TLI.getDivRefinementSteps(VT, MF);
+ if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
+ AddToWorklist(Est.getNode());
+
+ SDLoc DL(Op);
+ if (Iterations) {
+ SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
+
+ // Newton iterations: Est = Est + Est (N - Arg * Est)
+ // If this is the last iteration, also multiply by the numerator.
+ for (int i = 0; i < Iterations; ++i) {
+ SDValue MulEst = Est;
+
+ if (i == Iterations - 1) {
+ MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags);
+ AddToWorklist(MulEst.getNode());
+ }
+
+ SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FSUB, DL, VT,
+ (i == Iterations - 1 ? N : FPOne), NewEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags);
+ AddToWorklist(Est.getNode());
+ }
+ } else {
+ // If no iterations are available, multiply with N.
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags);
+ AddToWorklist(Est.getNode());
+ }
+
+ return Est;
+ }
+
+ return SDValue();
+}
+
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal sqrt, we need to find the zero of the function:
+/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+/// =>
+/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
+/// As a result, we precompute A/2 prior to the iteration loop.
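+/// e.g. for A = 4 and an initial estimate X_0 = 0.6:
+/// X_1 = 0.6 * (1.5 - (4 * 0.36) / 2) = 0.468, X_2 is roughly 0.497,
+/// approaching 1/sqrt(4) = 0.5.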
+SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
+ unsigned Iterations,
+ SDNodeFlags Flags, bool Reciprocal) {
+ EVT VT = Arg.getValueType();
+ SDLoc DL(Arg);
+ SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);
+
+ // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
+ // this entire sequence requires only one FP constant.
+ SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags);
+ HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags);
+
+ // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
+ NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags);
+ NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags);
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
+ }
+
+ // If non-reciprocal square root is requested, multiply the result by Arg.
+ if (!Reciprocal)
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
+
+ return Est;
+}
+
+/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal sqrt, we need to find the zero of the function:
+/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+/// =>
+/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
+SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
+ unsigned Iterations,
+ SDNodeFlags Flags, bool Reciprocal) {
+ EVT VT = Arg.getValueType();
+ SDLoc DL(Arg);
+ SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
+ SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT);
+
+ // This routine must enter the loop below to work correctly
+ // when (Reciprocal == false).
+ assert(Iterations > 0);
+
+ // Newton iterations for reciprocal square root:
+ // E = (E * -0.5) * ((A * E) * E + -3.0)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
+ SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
+ SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);
+
+ // When calculating a square root at the last iteration build:
+ // S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
+ // (notice a common subexpression)
+ SDValue LHS;
+ if (Reciprocal || (i + 1) < Iterations) {
+ // RSQRT: LHS = (E * -0.5)
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
+ } else {
+ // SQRT: LHS = (A * E) * -0.5
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
+ }
+
+ Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
+ }
+
+ return Est;
+}
+
+/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
+/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
+/// Op can be zero.
+SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
+ bool Reciprocal) {
+ if (LegalDAG)
+ return SDValue();
+
+ // TODO: Handle extended types?
+ EVT VT = Op.getValueType();
+ if (VT.getScalarType() != MVT::f16 && VT.getScalarType() != MVT::f32 &&
+ VT.getScalarType() != MVT::f64)
+ return SDValue();
+
+ // If estimates are explicitly disabled for this function, we're done.
+ MachineFunction &MF = DAG.getMachineFunction();
+ int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
+ if (Enabled == TLI.ReciprocalEstimate::Disabled)
+ return SDValue();
+
+ // Estimates may be explicitly enabled for this type with a custom number of
+ // refinement steps.
+ int Iterations = TLI.getSqrtRefinementSteps(VT, MF);
+
+ bool UseOneConstNR = false;
+ if (SDValue Est =
+ TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
+ Reciprocal)) {
+ AddToWorklist(Est.getNode());
+
+ if (Iterations > 0)
+ Est = UseOneConstNR
+ ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
+ : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
+ if (!Reciprocal) {
+ SDLoc DL(Op);
+ // Try the target specific test first.
+ SDValue Test = TLI.getSqrtInputTest(Op, DAG, DAG.getDenormalMode(VT));
+
+ // The estimate is now completely wrong if the input was exactly 0.0 or
+      // possibly a denormal. Force the answer to 0.0 or the value provided by
+      // the target for those cases.
+ Est = DAG.getNode(
+ Test.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
+ Test, TLI.getSqrtResultForDenormInput(Op, DAG), Est);
+ }
+ return Est;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
+ return buildSqrtEstimateImpl(Op, Flags, true);
+}
+
+SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
+ return buildSqrtEstimateImpl(Op, Flags, false);
+}
+
+/// Return true if there is any possibility that the two addresses overlap.
+bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
+
+ struct MemUseCharacteristics {
+ bool IsVolatile;
+ bool IsAtomic;
+ SDValue BasePtr;
+ int64_t Offset;
+ std::optional<int64_t> NumBytes;
+ MachineMemOperand *MMO;
+ };
+
+ auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
+ if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
+ int64_t Offset = 0;
+ if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
+ Offset = (LSN->getAddressingMode() == ISD::PRE_INC)
+ ? C->getSExtValue()
+ : (LSN->getAddressingMode() == ISD::PRE_DEC)
+ ? -1 * C->getSExtValue()
+ : 0;
+ uint64_t Size =
+ MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
+ return {LSN->isVolatile(),
+ LSN->isAtomic(),
+ LSN->getBasePtr(),
+ Offset /*base offset*/,
+ std::optional<int64_t>(Size),
+ LSN->getMemOperand()};
+ }
+ if (const auto *LN = cast<LifetimeSDNode>(N))
+ return {false /*isVolatile*/,
+ /*isAtomic*/ false,
+ LN->getOperand(1),
+ (LN->hasOffset()) ? LN->getOffset() : 0,
+ (LN->hasOffset()) ? std::optional<int64_t>(LN->getSize())
+ : std::optional<int64_t>(),
+ (MachineMemOperand *)nullptr};
+ // Default.
+ return {false /*isvolatile*/,
+ /*isAtomic*/ false, SDValue(),
+ (int64_t)0 /*offset*/, std::optional<int64_t>() /*size*/,
+ (MachineMemOperand *)nullptr};
+ };
+
+ MemUseCharacteristics MUC0 = getCharacteristics(Op0),
+ MUC1 = getCharacteristics(Op1);
+
+ // If they are to the same address, then they must be aliases.
+ if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr &&
+ MUC0.Offset == MUC1.Offset)
+ return true;
+
+ // If they are both volatile then they cannot be reordered.
+ if (MUC0.IsVolatile && MUC1.IsVolatile)
+ return true;
+
+ // Be conservative about atomics for the moment
+ // TODO: This is way overconservative for unordered atomics (see D66309)
+ if (MUC0.IsAtomic && MUC1.IsAtomic)
+ return true;
+
+ if (MUC0.MMO && MUC1.MMO) {
+ if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
+ (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
+ return false;
+ }
+
+ // Try to prove that there is aliasing, or that there is no aliasing. Either
+ // way, we can return now. If nothing can be proved, proceed with more tests.
+ bool IsAlias;
+ if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes,
+ DAG, IsAlias))
+ return IsAlias;
+
+ // The following all rely on MMO0 and MMO1 being valid. Fail conservatively if
+ // either is not known.
+ if (!MUC0.MMO || !MUC1.MMO)
+ return true;
+
+ // If one operation reads from invariant memory, and the other may store, they
+ // cannot alias. These should really be checking the equivalent of mayWrite,
+ // but it only matters for memory nodes other than load/store.
+ if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
+ (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
+ return false;
+
+ // If the two source values are known to have relatively large alignment
+ // compared to the size and offset of the access, we may be able to prove
+ // that they do not alias. This check is conservative for now to catch cases
+ // created by splitting vector types: it only works when the offsets are
+ // multiples of the size of the data.
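+ // For example, two 4-byte accesses from 8-byte-aligned bases at offsets 0
+ // and 4 occupy disjoint halves of any aligned 8-byte window and therefore
+ // cannot overlap.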
+ int64_t SrcValOffset0 = MUC0.MMO->getOffset();
+ int64_t SrcValOffset1 = MUC1.MMO->getOffset();
+ Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
+ Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
+ auto &Size0 = MUC0.NumBytes;
+ auto &Size1 = MUC1.NumBytes;
+ if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
+ Size0.has_value() && Size1.has_value() && *Size0 == *Size1 &&
+ OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
+ SrcValOffset1 % *Size1 == 0) {
+ int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
+ int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();
+
+ // There is no overlap between these relatively aligned accesses of
+ // similar size. Return no alias.
+ if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
+ return false;
+ }
+
+ bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
+ ? CombinerGlobalAA
+ : DAG.getSubtarget().useAA();
+#ifndef NDEBUG
+ if (CombinerAAOnlyFunc.getNumOccurrences() &&
+ CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
+ UseAA = false;
+#endif
+
+ if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0 &&
+ Size1) {
+ // Use alias analysis information.
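+ // Each access size is padded by that access's distance from the smaller of
+ // the two offsets, because the MemoryLocations below are based on the
+ // underlying IR values rather than on the offset addresses.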
+ int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
+ int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
+ int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
+ if (AA->isNoAlias(
+ MemoryLocation(MUC0.MMO->getValue(), Overlap0,
+ UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
+ MemoryLocation(MUC1.MMO->getValue(), Overlap1,
+ UseTBAA ? MUC1.MMO->getAAInfo() : AAMDNodes())))
+ return false;
+ }
+
+ // Otherwise we have to assume they alias.
+ return true;
+}
+
+/// Walk up chain skipping non-aliasing memory nodes,
+/// looking for aliasing nodes and adding them to the Aliases vector.
+void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
+ SmallVectorImpl<SDValue> &Aliases) {
+ SmallVector<SDValue, 8> Chains; // List of chains to visit.
+ SmallPtrSet<SDNode *, 16> Visited; // Visited node set.
+
+ // Get alias information for node.
+ // TODO: relax aliasing for unordered atomics (see D66309)
+ const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple();
+
+ // Starting off.
+ Chains.push_back(OriginalChain);
+ unsigned Depth = 0;
+
+ // Attempt to improve chain by a single step
+ auto ImproveChain = [&](SDValue &C) -> bool {
+ switch (C.getOpcode()) {
+ case ISD::EntryToken:
+ // No need to mark EntryToken.
+ C = SDValue();
+ return true;
+ case ISD::LOAD:
+ case ISD::STORE: {
+ // Get alias information for C.
+ // TODO: Relax aliasing for unordered atomics (see D66309)
+ bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
+ cast<LSBaseSDNode>(C.getNode())->isSimple();
+ if ((IsLoad && IsOpLoad) || !mayAlias(N, C.getNode())) {
+ // Look further up the chain.
+ C = C.getOperand(0);
+ return true;
+ }
+ // Alias, so stop here.
+ return false;
+ }
+
+ case ISD::CopyFromReg:
+ // Always forward past CopyFromReg.
+ C = C.getOperand(0);
+ return true;
+
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END: {
+ // We can forward past any lifetime start/end that can be proven not to
+ // alias the memory access.
+ if (!mayAlias(N, C.getNode())) {
+ // Look further up the chain.
+ C = C.getOperand(0);
+ return true;
+ }
+ return false;
+ }
+ default:
+ return false;
+ }
+ };
+
+ // Look at each chain and determine if it is an alias. If so, add it to the
+ // aliases list. If not, then continue up the chain looking for the next
+ // candidate.
+ while (!Chains.empty()) {
+ SDValue Chain = Chains.pop_back_val();
+
+ // Don't bother if we've seen Chain before.
+ if (!Visited.insert(Chain.getNode()).second)
+ continue;
+
+ // For TokenFactor nodes, look at each operand and only continue up the
+ // chain until we reach the depth limit.
+ //
+ // FIXME: The depth check could be made to return the last non-aliasing
+ // chain we found before we hit a tokenfactor rather than the original
+ // chain.
+ if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
+ Aliases.clear();
+ Aliases.push_back(OriginalChain);
+ return;
+ }
+
+ if (Chain.getOpcode() == ISD::TokenFactor) {
+ // We have to check each of the operands of the token factor for "small"
+ // token factors, so we queue them up. Adding the operands to the queue
+ // (stack) in reverse order maintains the original order and increases the
+ // likelihood that getNode will find a matching token factor (CSE).
+ if (Chain.getNumOperands() > 16) {
+ Aliases.push_back(Chain);
+ continue;
+ }
+ for (unsigned n = Chain.getNumOperands(); n;)
+ Chains.push_back(Chain.getOperand(--n));
+ ++Depth;
+ continue;
+ }
+ // Everything else
+ if (ImproveChain(Chain)) {
+ // Updated Chain Found, Consider new chain if one exists.
+ if (Chain.getNode())
+ Chains.push_back(Chain);
+ ++Depth;
+ continue;
+ }
+ // No Improved Chain Possible, treat as Alias.
+ Aliases.push_back(Chain);
+ }
+}
+
+/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
+/// (aliasing node.)
+SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
+ if (OptLevel == CodeGenOpt::None)
+ return OldChain;
+
+ // Ops for replacing token factor.
+ SmallVector<SDValue, 8> Aliases;
+
+ // Accumulate all the aliases to this node.
+ GatherAllAliases(N, OldChain, Aliases);
+
+ // If no operands then chain to entry token.
+ if (Aliases.size() == 0)
+ return DAG.getEntryNode();
+
+ // If a single operand then chain to it. We don't need to revisit it.
+ if (Aliases.size() == 1)
+ return Aliases[0];
+
+ // Construct a custom tailored token factor.
+ return DAG.getTokenFactor(SDLoc(N), Aliases);
+}
+
+// This function tries to collect a bunch of potentially interesting
+// nodes to improve the chains of, all at once. This might seem
+// redundant, as this function gets called when visiting every store
+// node, so why not let the work be done on each store as it's visited?
+//
+// I believe this is mainly important because mergeConsecutiveStores
+// is unable to deal with merging stores of different sizes, so unless
+// we improve the chains of all the potential candidates up-front
+// before running mergeConsecutiveStores, it might only see some of
+// the nodes that will eventually be candidates, and then not be able
+// to go from a partially-merged state to the desired final
+// fully-merged state.
+
+bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
+ SmallVector<StoreSDNode *, 8> ChainedStores;
+ StoreSDNode *STChain = St;
+ // Intervals records which offsets from BaseIndex have been covered. In
+ // the common case each store writes to the address immediately after the
+ // previous one, so its interval merges with the previous interval at
+ // insertion time.
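+ // (For example, four consecutive i32 stores at offsets 0, 4, 8 and 12 all
+ // coalesce into the single interval [0, 16).)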
+
+ using IMap = llvm::IntervalMap<int64_t, std::monostate, 8,
+ IntervalMapHalfOpenInfo<int64_t>>;
+ IMap::Allocator A;
+ IMap Intervals(A);
+
+ // This holds the base pointer, index, and the offset in bytes from the base
+ // pointer.
+ const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
+
+ // We must have a base and an offset.
+ if (!BasePtr.getBase().getNode())
+ return false;
+
+ // Do not handle stores to undef base pointers.
+ if (BasePtr.getBase().isUndef())
+ return false;
+
+ // Do not handle stores to opaque types
+ if (St->getMemoryVT().isZeroSized())
+ return false;
+
+ // BaseIndexOffset assumes that offsets are fixed-size, which
+ // is not valid for scalable vectors where the offsets are
+ // scaled by `vscale`, so bail out early.
+ if (St->getMemoryVT().isScalableVT())
+ return false;
+
+ // Add ST's interval.
+ Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8,
+ std::monostate{});
+
+ while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
+ if (Chain->getMemoryVT().isScalableVector())
+ return false;
+
+ // If the chain has more than one use, then we can't reorder the mem ops.
+ if (!SDValue(Chain, 0)->hasOneUse())
+ break;
+ // TODO: Relax for unordered atomics (see D66309)
+ if (!Chain->isSimple() || Chain->isIndexed())
+ break;
+
+ // Find the base pointer and offset for this memory node.
+ const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
+ // Check that the base pointer is the same as the original one.
+ int64_t Offset;
+ if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
+ break;
+ int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
+ // Make sure we don't overlap with other intervals by checking the ones to
+ // the left or right before inserting.
+ auto I = Intervals.find(Offset);
+ // If there's a next interval, we should end before it.
+ if (I != Intervals.end() && I.start() < (Offset + Length))
+ break;
+ // If there's a previous interval, we should start after it.
+ if (I != Intervals.begin() && (--I).stop() <= Offset)
+ break;
+ Intervals.insert(Offset, Offset + Length, std::monostate{});
+
+ ChainedStores.push_back(Chain);
+ STChain = Chain;
+ }
+
+ // If we didn't find a chained store, exit.
+ if (ChainedStores.size() == 0)
+ return false;
+
+ // Improve all chained stores (St and ChainedStores members) starting from
+ // where the store chain ended and return a single TokenFactor.
+ SDValue NewChain = STChain->getChain();
+ SmallVector<SDValue, 8> TFOps;
+ for (unsigned I = ChainedStores.size(); I;) {
+ StoreSDNode *S = ChainedStores[--I];
+ SDValue BetterChain = FindBetterChain(S, NewChain);
+ S = cast<StoreSDNode>(DAG.UpdateNodeOperands(
+ S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3)));
+ TFOps.push_back(SDValue(S, 0));
+ ChainedStores[I] = S;
+ }
+
+ // Improve St's chain. Use a new node to avoid creating a loop from CombineTo.
+ SDValue BetterChain = FindBetterChain(St, NewChain);
+ SDValue NewST;
+ if (St->isTruncatingStore())
+ NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(),
+ St->getBasePtr(), St->getMemoryVT(),
+ St->getMemOperand());
+ else
+ NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(),
+ St->getBasePtr(), St->getMemOperand());
+
+ TFOps.push_back(NewST);
+
+ // If we improved every element of TFOps, then we've lost the dependence on
+ // NewChain to successors of St and we need to add it back to TFOps. Do so at
+ // the beginning to keep relative order consistent with FindBetterChain.
+ auto hasImprovedChain = [&](SDValue ST) -> bool {
+ return ST->getOperand(0) != NewChain;
+ };
+ bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain);
+ if (AddNewChain)
+ TFOps.insert(TFOps.begin(), NewChain);
+
+ SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
+ CombineTo(St, TF);
+
+ // Add TF and its operands to the worklist.
+ AddToWorklist(TF.getNode());
+ for (const SDValue &Op : TF->ops())
+ AddToWorklist(Op.getNode());
+ AddToWorklist(STChain);
+ return true;
+}
+
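+ /// Try to improve the chain of St: first attempt to parallelize a run of
+ /// disjoint chained stores starting at St, and otherwise look for a single
+ /// better (non-aliasing) chain for St itself.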
+bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
+ if (OptLevel == CodeGenOpt::None)
+ return false;
+
+ const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
+
+ // We must have a base and an offset.
+ if (!BasePtr.getBase().getNode())
+ return false;
+
+ // Do not handle stores to undef base pointers.
+ if (BasePtr.getBase().isUndef())
+ return false;
+
+ // Directly improve a chain of disjoint stores starting at St.
+ if (parallelizeChainedStores(St))
+ return true;
+
+ // Improve St's chain.
+ SDValue BetterChain = FindBetterChain(St, St->getChain());
+ if (St->getChain() != BetterChain) {
+ replaceStoreChain(St, BetterChain);
+ return true;
+ }
+ return false;
+}
+
+/// This is the entry point for the file.
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
+ CodeGenOpt::Level OptLevel) {
+ /// This is the main entry point to this class.
+ DAGCombiner(*this, AA, OptLevel).Run(Level);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
new file mode 100644
index 000000000000..f0affce7b6b8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -0,0 +1,2382 @@
+//===- FastISel.cpp - Implementation of the FastISel class ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the FastISel class.
+//
+// "Fast" instruction selection is designed to emit very poor code quickly.
+// Also, it is not designed to be able to do much lowering, so most illegal
+// types (e.g. i64 on 32-bit targets) and operations are not supported. It is
+// also not intended to be able to do much optimization, except in a few cases
+// where doing optimizations reduces overall compile time. For example, folding
+// constants into immediate fields is often done, because it's cheap and it
+// reduces the number of instructions later phases have to examine.
+//
+// "Fast" instruction selection is able to fail gracefully and transfer
+// control to the SelectionDAG selector for operations that it doesn't
+// support. In many cases, this allows us to avoid duplicating a lot of
+// the complicated lowering logic that SelectionDAG currently has.
+//
+// The intended use for "fast" instruction selection is "-O0" mode
+// compilation, where the quality of the generated code is irrelevant when
+// weighed against the speed at which the code can be generated. Also,
+// at -O0, the LLVM optimizers are not running, and this makes the
+// compile time of codegen a much higher portion of the overall compile
+// time. Despite its limitations, "fast" instruction selection is able to
+// handle enough code on its own to provide noticeable overall speedups
+// in -O0 compiles.
+//
+// Basic operations are supported in a target-independent way, by reading
+// the same instruction descriptions that the SelectionDAG selector reads,
+// and identifying simple arithmetic operations that can be directly selected
+// from simple operators. More complicated operations currently require
+// target-specific code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <optional>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "isel"
+
+STATISTIC(NumFastIselSuccessIndependent, "Number of insts selected by "
+ "target-independent selector");
+STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
+ "target-specific selector");
+STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
+
+/// Set the current block to which generated machine instructions will be
+/// appended.
+void FastISel::startNewBlock() {
+ assert(LocalValueMap.empty() &&
+ "local values should be cleared after finishing a BB");
+
+ // Instructions are appended to FuncInfo.MBB. If the basic block already
+ // contains labels or copies, use the last instruction as the last local
+ // value.
+ EmitStartPt = nullptr;
+ if (!FuncInfo.MBB->empty())
+ EmitStartPt = &FuncInfo.MBB->back();
+ LastLocalValue = EmitStartPt;
+}
+
+void FastISel::finishBasicBlock() { flushLocalValueMap(); }
+
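+ /// Lower the function's formal arguments via the target's fastLowerArguments
+ /// hook, then record the assigned registers in the ValueMap so that non-entry
+ /// blocks can refer to them.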
+bool FastISel::lowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ // Fallback to SDISel argument lowering code to deal with sret pointer
+ // parameter.
+ return false;
+
+ if (!fastLowerArguments())
+ return false;
+
+ // Enter arguments into ValueMap for uses in non-entry BBs.
+ for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(),
+ E = FuncInfo.Fn->arg_end();
+ I != E; ++I) {
+ DenseMap<const Value *, Register>::iterator VI = LocalValueMap.find(&*I);
+ assert(VI != LocalValueMap.end() && "Missed an argument?");
+ FuncInfo.ValueMap[&*I] = VI->second;
+ }
+ return true;
+}
+
+/// Return the defined register if this instruction defines exactly one
+/// virtual register and uses no other virtual registers. Otherwise return 0.
+static Register findLocalRegDef(MachineInstr &MI) {
+ Register RegDef;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef()) {
+ if (RegDef)
+ return Register();
+ RegDef = MO.getReg();
+ } else if (MO.getReg().isVirtual()) {
+ // This is another use of a vreg. Don't delete it.
+ return Register();
+ }
+ }
+ return RegDef;
+}
+
+static bool isRegUsedByPhiNodes(Register DefReg,
+ FunctionLoweringInfo &FuncInfo) {
+ for (auto &P : FuncInfo.PHINodesToUpdate)
+ if (P.second == DefReg)
+ return true;
+ return false;
+}
+
+void FastISel::flushLocalValueMap() {
+ // If FastISel bails out, it could leave local value instructions behind
+ // that aren't used for anything. Detect and erase those.
+ if (LastLocalValue != EmitStartPt) {
+ // Save the first instruction after local values, for later.
+ MachineBasicBlock::iterator FirstNonValue(LastLocalValue);
+ ++FirstNonValue;
+
+ MachineBasicBlock::reverse_iterator RE =
+ EmitStartPt ? MachineBasicBlock::reverse_iterator(EmitStartPt)
+ : FuncInfo.MBB->rend();
+ MachineBasicBlock::reverse_iterator RI(LastLocalValue);
+ for (MachineInstr &LocalMI :
+ llvm::make_early_inc_range(llvm::make_range(RI, RE))) {
+ Register DefReg = findLocalRegDef(LocalMI);
+ if (!DefReg)
+ continue;
+ if (FuncInfo.RegsWithFixups.count(DefReg))
+ continue;
+ bool UsedByPHI = isRegUsedByPhiNodes(DefReg, FuncInfo);
+ if (!UsedByPHI && MRI.use_nodbg_empty(DefReg)) {
+ if (EmitStartPt == &LocalMI)
+ EmitStartPt = EmitStartPt->getPrevNode();
+ LLVM_DEBUG(dbgs() << "removing dead local value materialization"
+ << LocalMI);
+ LocalMI.eraseFromParent();
+ }
+ }
+
+ if (FirstNonValue != FuncInfo.MBB->end()) {
+ // See if there are any local value instructions left. If so, we want to
+ // make sure the first one has a debug location; if it doesn't, use the
+ // first non-value instruction's debug location.
+
+ // If EmitStartPt is non-null, this block had copies at the top before
+ // FastISel started doing anything; it points to the last one, so the
+ // first local value instruction is the one after EmitStartPt.
+ // If EmitStartPt is null, the first local value instruction is at the
+ // top of the block.
+ MachineBasicBlock::iterator FirstLocalValue =
+ EmitStartPt ? ++MachineBasicBlock::iterator(EmitStartPt)
+ : FuncInfo.MBB->begin();
+ if (FirstLocalValue != FirstNonValue && !FirstLocalValue->getDebugLoc())
+ FirstLocalValue->setDebugLoc(FirstNonValue->getDebugLoc());
+ }
+ }
+
+ LocalValueMap.clear();
+ LastLocalValue = EmitStartPt;
+ recomputeInsertPt();
+ SavedInsertPt = FuncInfo.InsertPt;
+}
+
+Register FastISel::getRegForValue(const Value *V) {
+ EVT RealVT = TLI.getValueType(DL, V->getType(), /*AllowUnknown=*/true);
+ // Don't handle non-simple values in FastISel.
+ if (!RealVT.isSimple())
+ return Register();
+
+ // Ignore illegal types. We must do this before looking up the value
+ // in ValueMap because Arguments are given virtual registers regardless
+ // of whether FastISel can handle them.
+ MVT VT = RealVT.getSimpleVT();
+ if (!TLI.isTypeLegal(VT)) {
+ // Handle integer promotions, though, because they're common and easy.
+ if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+ VT = TLI.getTypeToTransformTo(V->getContext(), VT).getSimpleVT();
+ else
+ return Register();
+ }
+
+ // Look up the value to see if we already have a register for it.
+ Register Reg = lookUpRegForValue(V);
+ if (Reg)
+ return Reg;
+
+ // In bottom-up mode, just create the virtual register which will be used
+ // to hold the value. It will be materialized later.
+ if (isa<Instruction>(V) &&
+ (!isa<AllocaInst>(V) ||
+ !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(V))))
+ return FuncInfo.InitializeRegForValue(V);
+
+ SavePoint SaveInsertPt = enterLocalValueArea();
+
+ // Materialize the value in a register. Emit any instructions in the
+ // local value area.
+ Reg = materializeRegForValue(V, VT);
+
+ leaveLocalValueArea(SaveInsertPt);
+
+ return Reg;
+}
+
+Register FastISel::materializeConstant(const Value *V, MVT VT) {
+ Register Reg;
+ if (const auto *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->getValue().getActiveBits() <= 64)
+ Reg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+ } else if (isa<AllocaInst>(V))
+ Reg = fastMaterializeAlloca(cast<AllocaInst>(V));
+ else if (isa<ConstantPointerNull>(V))
+ // Translate this as an integer zero so that it can be
+ // local-CSE'd with actual integer zeros.
+ Reg =
+ getRegForValue(Constant::getNullValue(DL.getIntPtrType(V->getType())));
+ else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
+ if (CF->isNullValue())
+ Reg = fastMaterializeFloatZero(CF);
+ else
+ // Try to emit the constant directly.
+ Reg = fastEmit_f(VT, VT, ISD::ConstantFP, CF);
+
+ if (!Reg) {
+ // Try to emit the constant by using an integer constant with a cast.
+ const APFloat &Flt = CF->getValueAPF();
+ EVT IntVT = TLI.getPointerTy(DL);
+ uint32_t IntBitWidth = IntVT.getSizeInBits();
+ APSInt SIntVal(IntBitWidth, /*isUnsigned=*/false);
+ bool isExact;
+ (void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact);
+ if (isExact) {
+ Register IntegerReg =
+ getRegForValue(ConstantInt::get(V->getContext(), SIntVal));
+ if (IntegerReg)
+ Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP,
+ IntegerReg);
+ }
+ }
+ } else if (const auto *Op = dyn_cast<Operator>(V)) {
+ if (!selectOperator(Op, Op->getOpcode()))
+ if (!isa<Instruction>(Op) ||
+ !fastSelectInstruction(cast<Instruction>(Op)))
+ return 0;
+ Reg = lookUpRegForValue(Op);
+ } else if (isa<UndefValue>(V)) {
+ Reg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::IMPLICIT_DEF), Reg);
+ }
+ return Reg;
+}
+
+/// Helper for getRegForValue. This function is called when the value isn't
+/// already available in a register and must be materialized with new
+/// instructions.
+Register FastISel::materializeRegForValue(const Value *V, MVT VT) {
+ Register Reg;
+ // Give the target-specific code a try first.
+ if (isa<Constant>(V))
+ Reg = fastMaterializeConstant(cast<Constant>(V));
+
+ // If target-specific code couldn't or didn't want to handle the value, then
+ // give target-independent code a try.
+ if (!Reg)
+ Reg = materializeConstant(V, VT);
+
+ // Don't cache constant materializations in the general ValueMap.
+ // To do so would require tracking what uses they dominate.
+ if (Reg) {
+ LocalValueMap[V] = Reg;
+ LastLocalValue = MRI.getVRegDef(Reg);
+ }
+ return Reg;
+}
+
+Register FastISel::lookUpRegForValue(const Value *V) {
+ // Look up the value to see if we already have a register for it. We
+ // cache values defined by Instructions across blocks, and other values
+ // only locally. This is because Instructions already have the SSA
+ // def-dominates-use requirement enforced.
+ DenseMap<const Value *, Register>::iterator I = FuncInfo.ValueMap.find(V);
+ if (I != FuncInfo.ValueMap.end())
+ return I->second;
+ return LocalValueMap[V];
+}
+
+void FastISel::updateValueMap(const Value *I, Register Reg, unsigned NumRegs) {
+ if (!isa<Instruction>(I)) {
+ LocalValueMap[I] = Reg;
+ return;
+ }
+
+ Register &AssignedReg = FuncInfo.ValueMap[I];
+ if (!AssignedReg)
+ // Use the new register.
+ AssignedReg = Reg;
+ else if (Reg != AssignedReg) {
+ // Arrange for uses of AssignedReg to be replaced by uses of Reg.
+ for (unsigned i = 0; i < NumRegs; i++) {
+ FuncInfo.RegFixups[AssignedReg + i] = Reg + i;
+ FuncInfo.RegsWithFixups.insert(Reg + i);
+ }
+
+ AssignedReg = Reg;
+ }
+}
+
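+ /// Materialize a register for the GEP index Idx, sign-extending or truncating
+ /// it to pointer width as needed.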
+Register FastISel::getRegForGEPIndex(const Value *Idx) {
+ Register IdxN = getRegForValue(Idx);
+ if (!IdxN)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return Register();
+
+ // If the index is smaller or larger than intptr_t, truncate or extend it.
+ MVT PtrVT = TLI.getPointerTy(DL);
+ EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
+ if (IdxVT.bitsLT(PtrVT)) {
+ IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, IdxN);
+ } else if (IdxVT.bitsGT(PtrVT)) {
+ IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::TRUNCATE, IdxN);
+ }
+ return IdxN;
+}
+
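+ /// Reset the insert point to just after the last local-value instruction, or
+ /// to the first non-PHI instruction of the block if there are no local values.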
+void FastISel::recomputeInsertPt() {
+ if (getLastLocalValue()) {
+ FuncInfo.InsertPt = getLastLocalValue();
+ FuncInfo.MBB = FuncInfo.InsertPt->getParent();
+ ++FuncInfo.InsertPt;
+ } else
+ FuncInfo.InsertPt = FuncInfo.MBB->getFirstNonPHI();
+}
+
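+ /// Erase the instructions in the range [I, E), moving SavedInsertPt,
+ /// EmitStartPt and LastLocalValue past the erased range if they point into it.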
+void FastISel::removeDeadCode(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator E) {
+ assert(I.isValid() && E.isValid() && std::distance(I, E) > 0 &&
+ "Invalid iterator!");
+ while (I != E) {
+ if (SavedInsertPt == I)
+ SavedInsertPt = E;
+ if (EmitStartPt == I)
+ EmitStartPt = E.isValid() ? &*E : nullptr;
+ if (LastLocalValue == I)
+ LastLocalValue = E.isValid() ? &*E : nullptr;
+
+ MachineInstr *Dead = &*I;
+ ++I;
+ Dead->eraseFromParent();
+ ++NumFastIselDead;
+ }
+ recomputeInsertPt();
+}
+
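+ /// Save the current insert point and move it into the local-value area so
+ /// that materialized values are emitted together with the other local values.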
+FastISel::SavePoint FastISel::enterLocalValueArea() {
+ SavePoint OldInsertPt = FuncInfo.InsertPt;
+ recomputeInsertPt();
+ return OldInsertPt;
+}
+
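+ /// Record the last instruction emitted in the local-value area and restore
+ /// the insert point saved by enterLocalValueArea.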
+void FastISel::leaveLocalValueArea(SavePoint OldInsertPt) {
+ if (FuncInfo.InsertPt != FuncInfo.MBB->begin())
+ LastLocalValue = &*std::prev(FuncInfo.InsertPt);
+
+ // Restore the previous insert position.
+ FuncInfo.InsertPt = OldInsertPt;
+}
+
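+ /// Select a binary operation, preferring register+immediate forms when one
+ /// operand is a constant and applying cheap strength reductions (exact sdiv
+ /// and urem by a power of two).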
+bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
+ EVT VT = EVT::getEVT(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !VT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ if (!TLI.isTypeLegal(VT)) {
+ // MVT::i1 is special. Allow AND, OR, or XOR because they
+ // don't require additional zeroing, which makes them easy.
+ if (VT == MVT::i1 && ISD::isBitwiseLogicOp(ISDOpcode))
+ VT = TLI.getTypeToTransformTo(I->getContext(), VT);
+ else
+ return false;
+ }
+
+ // Check if the first operand is a constant, and handle it as "ri". At -O0,
+ // we don't have anything that canonicalizes operand order.
+ if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(0)))
+ if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) {
+ Register Op1 = getRegForValue(I->getOperand(1));
+ if (!Op1)
+ return false;
+
+ Register ResultReg =
+ fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1, CI->getZExtValue(),
+ VT.getSimpleVT());
+ if (!ResultReg)
+ return false;
+
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ Register Op0 = getRegForValue(I->getOperand(0));
+ if (!Op0) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // Check if the second operand is a constant and handle it appropriately.
+ if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint64_t Imm = CI->getSExtValue();
+
+ // Transform "sdiv exact X, 8" -> "sra X, 3".
+ if (ISDOpcode == ISD::SDIV && isa<BinaryOperator>(I) &&
+ cast<BinaryOperator>(I)->isExact() && isPowerOf2_64(Imm)) {
+ Imm = Log2_64(Imm);
+ ISDOpcode = ISD::SRA;
+ }
+
+ // Transform "urem x, pow2" -> "and x, pow2-1".
+ if (ISDOpcode == ISD::UREM && isa<BinaryOperator>(I) &&
+ isPowerOf2_64(Imm)) {
+ --Imm;
+ ISDOpcode = ISD::AND;
+ }
+
+ Register ResultReg = fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0, Imm,
+ VT.getSimpleVT());
+ if (!ResultReg)
+ return false;
+
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ Register Op1 = getRegForValue(I->getOperand(1));
+ if (!Op1) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // Now we have both operands in registers. Emit the instruction.
+ Register ResultReg = fastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(),
+ ISDOpcode, Op0, Op1);
+ if (!ResultReg)
+ // Target-specific code wasn't able to find a machine opcode for
+ // the given ISD opcode and type. Halt "fast" selection and bail.
+ return false;
+
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
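+ /// Select a GEP by folding constant indices into a running byte offset and
+ /// emitting pointer adds (plus index multiplies) for the variable parts.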
+bool FastISel::selectGetElementPtr(const User *I) {
+ Register N = getRegForValue(I->getOperand(0));
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // FIXME: The code below does not handle vector GEPs. Halt "fast" selection
+ // and bail.
+ if (isa<VectorType>(I->getType()))
+ return false;
+
+ // Keep a running tab of the total offset to coalesce multiple N = N + Offset
+ // into a single N = N + TotalOffset.
+ uint64_t TotalOffs = 0;
+ // FIXME: What's a good SWAG number for MaxOffs?
+ uint64_t MaxOffs = 2048;
+ MVT VT = TLI.getPointerTy(DL);
+ for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (StructType *StTy = GTI.getStructTypeOrNull()) {
+ uint64_t Field = cast<ConstantInt>(Idx)->getZExtValue();
+ if (Field) {
+ // N = N + Offset
+ TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
+ if (TotalOffs >= MaxOffs) {
+ N = fastEmit_ri_(VT, ISD::ADD, N, TotalOffs, VT);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ TotalOffs = 0;
+ }
+ }
+ } else {
+ Type *Ty = GTI.getIndexedType();
+
+ // If this is a constant subscript, handle it quickly.
+ if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+ if (CI->isZero())
+ continue;
+ // N = N + Offset
+ uint64_t IdxN = CI->getValue().sextOrTrunc(64).getSExtValue();
+ TotalOffs += DL.getTypeAllocSize(Ty) * IdxN;
+ if (TotalOffs >= MaxOffs) {
+ N = fastEmit_ri_(VT, ISD::ADD, N, TotalOffs, VT);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ TotalOffs = 0;
+ }
+ continue;
+ }
+ if (TotalOffs) {
+ N = fastEmit_ri_(VT, ISD::ADD, N, TotalOffs, VT);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ TotalOffs = 0;
+ }
+
+ // N = N + Idx * ElementSize;
+ uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ Register IdxN = getRegForGEPIndex(Idx);
+ if (!IdxN) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ if (ElementSize != 1) {
+ IdxN = fastEmit_ri_(VT, ISD::MUL, IdxN, ElementSize, VT);
+ if (!IdxN) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ }
+ N = fastEmit_rr(VT, VT, ISD::ADD, N, IdxN);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ }
+ }
+ if (TotalOffs) {
+ N = fastEmit_ri_(VT, ISD::ADD, N, TotalOffs, VT);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ }
+
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, N);
+ return true;
+}
+
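+ /// Append the live-variable operands of a stackmap/patchpoint call to Ops:
+ /// constants are encoded with a ConstantOp prefix, static allocas become
+ /// frame indices, and everything else is materialized into a register.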
+bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
+ const CallInst *CI, unsigned StartIdx) {
+ for (unsigned i = StartIdx, e = CI->arg_size(); i != e; ++i) {
+ Value *Val = CI->getArgOperand(i);
+ // Check for constants and encode them with a StackMaps::ConstantOp prefix.
+ if (const auto *C = dyn_cast<ConstantInt>(Val)) {
+ Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp));
+ Ops.push_back(MachineOperand::CreateImm(C->getSExtValue()));
+ } else if (isa<ConstantPointerNull>(Val)) {
+ Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp));
+ Ops.push_back(MachineOperand::CreateImm(0));
+ } else if (auto *AI = dyn_cast<AllocaInst>(Val)) {
+ // Values coming from a stack location also require a special encoding,
+ // but that is added later on by the target specific frame index
+ // elimination implementation.
+ auto SI = FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end())
+ Ops.push_back(MachineOperand::CreateFI(SI->second));
+ else
+ return false;
+ } else {
+ Register Reg = getRegForValue(Val);
+ if (!Reg)
+ return false;
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*isDef=*/false));
+ }
+ }
+ return true;
+}
+
+bool FastISel::selectStackmap(const CallInst *I) {
+ // void @llvm.experimental.stackmap(i64 <id>, i32 <numShadowBytes>,
+ // [live variables...])
+ assert(I->getCalledFunction()->getReturnType()->isVoidTy() &&
+ "Stackmap cannot return a value.");
+
+ // The stackmap intrinsic only records the live variables (the arguments
+ // passed to it) and emits NOPS (if requested). Unlike the patchpoint
+ // intrinsic, this won't be lowered to a function call. This means we don't
+ // have to worry about calling conventions and target-specific lowering code.
+ // Instead we perform the call lowering right here.
+ //
+ // CALLSEQ_START(0, 0...)
+ // STACKMAP(id, nbytes, ...)
+ // CALLSEQ_END(0, 0)
+ //
+ SmallVector<MachineOperand, 32> Ops;
+
+ // Add the <id> and <numBytes> constants.
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)) &&
+ "Expected a constant integer.");
+ const auto *ID = cast<ConstantInt>(I->getOperand(PatchPointOpers::IDPos));
+ Ops.push_back(MachineOperand::CreateImm(ID->getZExtValue()));
+
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) &&
+ "Expected a constant integer.");
+ const auto *NumBytes =
+ cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos));
+ Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue()));
+
+ // Push live variables for the stack map (skipping the first two arguments
+ // <id> and <numBytes>).
+ if (!addStackMapLiveVars(Ops, I, 2))
+ return false;
+
+ // We are not adding any register mask info here, because the stackmap doesn't
+ // clobber anything.
+
+ // Add scratch registers as implicit def and early clobber.
+ CallingConv::ID CC = I->getCallingConv();
+ const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
+ for (unsigned i = 0; ScratchRegs[i]; ++i)
+ Ops.push_back(MachineOperand::CreateReg(
+ ScratchRegs[i], /*isDef=*/true, /*isImp=*/true, /*isKill=*/false,
+ /*isDead=*/false, /*isUndef=*/false, /*isEarlyClobber=*/true));
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ auto Builder =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackDown));
+ const MCInstrDesc &MCID = Builder.getInstr()->getDesc();
+ for (unsigned I = 0, E = MCID.getNumOperands(); I < E; ++I)
+ Builder.addImm(0);
+
+ // Issue STACKMAP.
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::STACKMAP));
+ for (auto const &MO : Ops)
+ MIB.add(MO);
+
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(AdjStackUp))
+ .addImm(0)
+ .addImm(0);
+
+ // Inform the Frame Information that we have a stackmap in this function.
+ FuncInfo.MF->getFrameInfo().setHasStackMap();
+
+ return true;
+}
+
+/// Lower an argument list according to the target calling convention.
+///
+/// This is a helper for lowering intrinsics that follow a target calling
+/// convention or require stack pointer adjustment. Only a subset of the
+/// intrinsic's operands need to participate in the calling convention.
+bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
+ unsigned NumArgs, const Value *Callee,
+ bool ForceRetVoidTy, CallLoweringInfo &CLI) {
+ ArgListTy Args;
+ Args.reserve(NumArgs);
+
+ // Populate the argument list.
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) {
+ Value *V = CI->getOperand(ArgI);
+
+ assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+ ArgListEntry Entry;
+ Entry.Val = V;
+ Entry.Ty = V->getType();
+ Entry.setAttributes(CI, ArgI);
+ Args.push_back(Entry);
+ }
+
+ Type *RetTy = ForceRetVoidTy ? Type::getVoidTy(CI->getType()->getContext())
+ : CI->getType();
+ CLI.setCallee(CI->getCallingConv(), RetTy, Callee, std::move(Args), NumArgs);
+
+ return lowerCallTo(CLI);
+}
+
+FastISel::CallLoweringInfo &FastISel::CallLoweringInfo::setCallee(
+ const DataLayout &DL, MCContext &Ctx, CallingConv::ID CC, Type *ResultTy,
+ StringRef Target, ArgListTy &&ArgsList, unsigned FixedArgs) {
+ SmallString<32> MangledName;
+ Mangler::getNameWithPrefix(MangledName, Target, DL);
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(MangledName);
+ return setCallee(CC, ResultTy, Sym, std::move(ArgsList), FixedArgs);
+}
+
+bool FastISel::selectPatchpoint(const CallInst *I) {
+ // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
+ // i32 <numBytes>,
+ // i8* <target>,
+ // i32 <numArgs>,
+ // [Args...],
+ // [live variables...])
+ CallingConv::ID CC = I->getCallingConv();
+ bool IsAnyRegCC = CC == CallingConv::AnyReg;
+ bool HasDef = !I->getType()->isVoidTy();
+ Value *Callee = I->getOperand(PatchPointOpers::TargetPos)->stripPointerCasts();
+
+ // Get the real number of arguments participating in the call <numArgs>
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos)) &&
+ "Expected a constant integer.");
+ const auto *NumArgsVal =
+ cast<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos));
+ unsigned NumArgs = NumArgsVal->getZExtValue();
+
+ // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
+ // This includes all meta-operands up to but not including CC.
+ unsigned NumMetaOpers = PatchPointOpers::CCPos;
+ assert(I->arg_size() >= NumMetaOpers + NumArgs &&
+ "Not enough arguments provided to the patchpoint intrinsic");
+
+ // For AnyRegCC the arguments are lowered later on manually.
+ unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
+ CallLoweringInfo CLI;
+ CLI.setIsPatchPoint();
+ if (!lowerCallOperands(I, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, CLI))
+ return false;
+
+ assert(CLI.Call && "No call instruction specified.");
+
+ SmallVector<MachineOperand, 32> Ops;
+
+ // Add an explicit result reg if we use the anyreg calling convention.
+ if (IsAnyRegCC && HasDef) {
+ assert(CLI.NumResultRegs == 0 && "Unexpected result register.");
+ CLI.ResultReg = createResultReg(TLI.getRegClassFor(MVT::i64));
+ CLI.NumResultRegs = 1;
+ Ops.push_back(MachineOperand::CreateReg(CLI.ResultReg, /*isDef=*/true));
+ }
+
+ // Add the <id> and <numBytes> constants.
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)) &&
+ "Expected a constant integer.");
+ const auto *ID = cast<ConstantInt>(I->getOperand(PatchPointOpers::IDPos));
+ Ops.push_back(MachineOperand::CreateImm(ID->getZExtValue()));
+
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) &&
+ "Expected a constant integer.");
+ const auto *NumBytes =
+ cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos));
+ Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue()));
+
+ // Add the call target.
+ if (const auto *C = dyn_cast<IntToPtrInst>(Callee)) {
+ uint64_t CalleeConstAddr =
+ cast<ConstantInt>(C->getOperand(0))->getZExtValue();
+ Ops.push_back(MachineOperand::CreateImm(CalleeConstAddr));
+ } else if (const auto *C = dyn_cast<ConstantExpr>(Callee)) {
+ if (C->getOpcode() == Instruction::IntToPtr) {
+ uint64_t CalleeConstAddr =
+ cast<ConstantInt>(C->getOperand(0))->getZExtValue();
+ Ops.push_back(MachineOperand::CreateImm(CalleeConstAddr));
+ } else
+ llvm_unreachable("Unsupported ConstantExpr.");
+ } else if (const auto *GV = dyn_cast<GlobalValue>(Callee)) {
+ Ops.push_back(MachineOperand::CreateGA(GV, 0));
+ } else if (isa<ConstantPointerNull>(Callee))
+ Ops.push_back(MachineOperand::CreateImm(0));
+ else
+ llvm_unreachable("Unsupported callee address.");
+
+ // Adjust <numArgs> to account for any arguments that have been passed on
+ // the stack instead.
+ unsigned NumCallRegArgs = IsAnyRegCC ? NumArgs : CLI.OutRegs.size();
+ Ops.push_back(MachineOperand::CreateImm(NumCallRegArgs));
+
+ // Add the calling convention
+ Ops.push_back(MachineOperand::CreateImm((unsigned)CC));
+
+ // Add the arguments we omitted previously. The register allocator should
+ // place these in any free register.
+ if (IsAnyRegCC) {
+ for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i) {
+ Register Reg = getRegForValue(I->getArgOperand(i));
+ if (!Reg)
+ return false;
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*isDef=*/false));
+ }
+ }
+
+ // Push the arguments from the call instruction.
+ for (auto Reg : CLI.OutRegs)
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*isDef=*/false));
+
+ // Push live variables for the stack map.
+ if (!addStackMapLiveVars(Ops, I, NumMetaOpers + NumArgs))
+ return false;
+
+ // Push the register mask info.
+ Ops.push_back(MachineOperand::CreateRegMask(
+ TRI.getCallPreservedMask(*FuncInfo.MF, CC)));
+
+ // Add scratch registers as implicit def and early clobber.
+ const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
+ for (unsigned i = 0; ScratchRegs[i]; ++i)
+ Ops.push_back(MachineOperand::CreateReg(
+ ScratchRegs[i], /*isDef=*/true, /*isImp=*/true, /*isKill=*/false,
+ /*isDead=*/false, /*isUndef=*/false, /*isEarlyClobber=*/true));
+
+ // Add implicit defs (return values).
+ for (auto Reg : CLI.InRegs)
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*isDef=*/true,
+ /*isImp=*/true));
+
+ // Insert the patchpoint instruction before the call generated by the target.
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, CLI.Call, MIMD,
+ TII.get(TargetOpcode::PATCHPOINT));
+
+ for (auto &MO : Ops)
+ MIB.add(MO);
+
+ MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI);
+
+ // Delete the original call instruction.
+ CLI.Call->eraseFromParent();
+
+ // Inform the Frame Information that we have a patchpoint in this function.
+ FuncInfo.MF->getFrameInfo().setHasPatchPoint();
+
+ if (CLI.NumResultRegs)
+ updateValueMap(I, CLI.ResultReg, CLI.NumResultRegs);
+ return true;
+}
+
+bool FastISel::selectXRayCustomEvent(const CallInst *I) {
+ const auto &Triple = TM.getTargetTriple();
+ if (Triple.isAArch64(64) && Triple.getArch() != Triple::x86_64)
+ return true; // don't do anything to this instruction.
+ SmallVector<MachineOperand, 8> Ops;
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(0)),
+ /*isDef=*/false));
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(1)),
+ /*isDef=*/false));
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::PATCHABLE_EVENT_CALL));
+ for (auto &MO : Ops)
+ MIB.add(MO);
+
+ // Insert the Patchable Event Call instruction, which gets lowered properly.
+ return true;
+}
+
+bool FastISel::selectXRayTypedEvent(const CallInst *I) {
+ const auto &Triple = TM.getTargetTriple();
+ if (Triple.isAArch64(64) && Triple.getArch() != Triple::x86_64)
+ return true; // don't do anything to this instruction.
+ SmallVector<MachineOperand, 8> Ops;
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(0)),
+ /*isDef=*/false));
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(1)),
+ /*isDef=*/false));
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(2)),
+ /*isDef=*/false));
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::PATCHABLE_TYPED_EVENT_CALL));
+ for (auto &MO : Ops)
+ MIB.add(MO);
+
+ // Insert the Patchable Typed Event Call instruction, which gets lowered properly.
+ return true;
+}
+
+/// Returns an AttributeList representing the attributes applied to the return
+/// value of the given call.
+static AttributeList getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
+ SmallVector<Attribute::AttrKind, 2> Attrs;
+ if (CLI.RetSExt)
+ Attrs.push_back(Attribute::SExt);
+ if (CLI.RetZExt)
+ Attrs.push_back(Attribute::ZExt);
+ if (CLI.IsInReg)
+ Attrs.push_back(Attribute::InReg);
+
+ return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex,
+ Attrs);
+}
+
+bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName,
+ unsigned NumArgs) {
+ MCContext &Ctx = MF->getContext();
+ SmallString<32> MangledName;
+ Mangler::getNameWithPrefix(MangledName, SymName, DL);
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(MangledName);
+ return lowerCallTo(CI, Sym, NumArgs);
+}
+
+bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
+ unsigned NumArgs) {
+ FunctionType *FTy = CI->getFunctionType();
+ Type *RetTy = CI->getType();
+
+ ArgListTy Args;
+ Args.reserve(NumArgs);
+
+ // Populate the argument list.
+ // Attributes for args start at offset 1, after the return attribute.
+ for (unsigned ArgI = 0; ArgI != NumArgs; ++ArgI) {
+ Value *V = CI->getOperand(ArgI);
+
+ assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+ ArgListEntry Entry;
+ Entry.Val = V;
+ Entry.Ty = V->getType();
+ Entry.setAttributes(CI, ArgI);
+ Args.push_back(Entry);
+ }
+ TLI.markLibCallAttributes(MF, CI->getCallingConv(), Args);
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), *CI, NumArgs);
+
+ return lowerCallTo(CLI);
+}
+
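+ /// Target-independent part of call lowering: compute the return-value and
+ /// outgoing-argument flags for CLI, then let the target's fastLowerCall hook
+ /// emit the actual call sequence.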
+bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
+ // Handle the incoming return values from the call.
+ CLI.clearIns();
+ SmallVector<EVT, 4> RetTys;
+ ComputeValueVTs(TLI, DL, CLI.RetTy, RetTys);
+
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(CLI.CallConv, CLI.RetTy, getReturnAttrs(CLI), Outs, TLI, DL);
+
+ bool CanLowerReturn = TLI.CanLowerReturn(
+ CLI.CallConv, *FuncInfo.MF, CLI.IsVarArg, Outs, CLI.RetTy->getContext());
+
+ // FIXME: sret demotion isn't supported yet - bail out.
+ if (!CanLowerReturn)
+ return false;
+
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ EVT VT = RetTys[I];
+ MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ ISD::InputArg MyFlags;
+ MyFlags.VT = RegisterVT;
+ MyFlags.ArgVT = VT;
+ MyFlags.Used = CLI.IsReturnValueUsed;
+ if (CLI.RetSExt)
+ MyFlags.Flags.setSExt();
+ if (CLI.RetZExt)
+ MyFlags.Flags.setZExt();
+ if (CLI.IsInReg)
+ MyFlags.Flags.setInReg();
+ CLI.Ins.push_back(MyFlags);
+ }
+ }
+
+ // Handle all of the outgoing arguments.
+ CLI.clearOuts();
+ for (auto &Arg : CLI.getArgs()) {
+ Type *FinalType = Arg.Ty;
+ if (Arg.IsByVal)
+ FinalType = Arg.IndirectType;
+ bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
+ FinalType, CLI.CallConv, CLI.IsVarArg, DL);
+
+ ISD::ArgFlagsTy Flags;
+ if (Arg.IsZExt)
+ Flags.setZExt();
+ if (Arg.IsSExt)
+ Flags.setSExt();
+ if (Arg.IsInReg)
+ Flags.setInReg();
+ if (Arg.IsSRet)
+ Flags.setSRet();
+ if (Arg.IsSwiftSelf)
+ Flags.setSwiftSelf();
+ if (Arg.IsSwiftAsync)
+ Flags.setSwiftAsync();
+ if (Arg.IsSwiftError)
+ Flags.setSwiftError();
+ if (Arg.IsCFGuardTarget)
+ Flags.setCFGuardTarget();
+ if (Arg.IsByVal)
+ Flags.setByVal();
+ if (Arg.IsInAlloca) {
+ Flags.setInAlloca();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // inalloca. This way we can know how many bytes we should've allocated
+ // and how many bytes a callee cleanup function will pop. If we port
+ // inalloca to more targets, we'll have to add custom inalloca handling in
+ // the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ if (Arg.IsPreallocated) {
+ Flags.setPreallocated();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // preallocated. This way we can know how many bytes we should've
+ // allocated and how many bytes a callee cleanup function will pop. If we
+ // port preallocated to more targets, we'll have to add custom
+ // preallocated handling in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ MaybeAlign MemAlign = Arg.Alignment;
+ if (Arg.IsByVal || Arg.IsInAlloca || Arg.IsPreallocated) {
+ unsigned FrameSize = DL.getTypeAllocSize(Arg.IndirectType);
+
+ // For ByVal, alignment should come from FE. BE will guess if this info
+ // is not there, but there are cases it cannot get right.
+ if (!MemAlign)
+ MemAlign = Align(TLI.getByValTypeAlignment(Arg.IndirectType, DL));
+ Flags.setByValSize(FrameSize);
+ } else if (!MemAlign) {
+ MemAlign = DL.getABITypeAlign(Arg.Ty);
+ }
+ Flags.setMemAlign(*MemAlign);
+ if (Arg.IsNest)
+ Flags.setNest();
+ if (NeedsRegBlock)
+ Flags.setInConsecutiveRegs();
+ Flags.setOrigAlign(DL.getABITypeAlign(Arg.Ty));
+ CLI.OutVals.push_back(Arg.Val);
+ CLI.OutFlags.push_back(Flags);
+ }
+
+ if (!fastLowerCall(CLI))
+ return false;
+
+ // Set all unused physreg defs as dead.
+ assert(CLI.Call && "No call instruction specified.");
+ CLI.Call->setPhysRegsDeadExcept(CLI.InRegs, TRI);
+
+ if (CLI.NumResultRegs && CLI.CB)
+ updateValueMap(CLI.CB, CLI.ResultReg, CLI.NumResultRegs);
+
+ // Set labels for heapallocsite call.
+ if (CLI.CB)
+ if (MDNode *MD = CLI.CB->getMetadata("heapallocsite"))
+ CLI.Call->setHeapAllocMarker(*MF, MD);
+
+ return true;
+}
+
+bool FastISel::lowerCall(const CallInst *CI) {
+ FunctionType *FuncTy = CI->getFunctionType();
+ Type *RetTy = CI->getType();
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+ Args.reserve(CI->arg_size());
+
+ for (auto i = CI->arg_begin(), e = CI->arg_end(); i != e; ++i) {
+ Value *V = *i;
+
+ // Skip empty types
+ if (V->getType()->isEmptyTy())
+ continue;
+
+ Entry.Val = V;
+ Entry.Ty = V->getType();
+
+ // Skip the first return-type Attribute to get to params.
+ Entry.setAttributes(CI, i - CI->arg_begin());
+ Args.push_back(Entry);
+ }
+
+ // Check if target-independent constraints permit a tail call here.
+ // Target-dependent constraints are checked within fastLowerCall.
+ bool IsTailCall = CI->isTailCall();
+ if (IsTailCall && !isInTailCallPosition(*CI, TM))
+ IsTailCall = false;
+ if (IsTailCall && !CI->isMustTailCall() &&
+ MF->getFunction().getFnAttribute("disable-tail-calls").getValueAsBool())
+ IsTailCall = false;
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(RetTy, FuncTy, CI->getCalledOperand(), std::move(Args), *CI)
+ .setTailCall(IsTailCall);
+
+ diagnoseDontCall(*CI);
+
+ return lowerCallTo(CLI);
+}
+
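+ /// Select a call instruction: trivial inline asm (with no constraints) and
+ /// intrinsic calls are handled here directly; everything else is passed to
+ /// lowerCall.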
+bool FastISel::selectCall(const User *I) {
+ const CallInst *Call = cast<CallInst>(I);
+
+ // Handle simple inline asms.
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getCalledOperand())) {
+ // Don't attempt to handle constraints.
+ if (!IA->getConstraintString().empty())
+ return false;
+
+ unsigned ExtraInfo = 0;
+ if (IA->hasSideEffects())
+ ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+ if (IA->isAlignStack())
+ ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+ if (Call->isConvergent())
+ ExtraInfo |= InlineAsm::Extra_IsConvergent;
+ ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
+
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::INLINEASM));
+ MIB.addExternalSymbol(IA->getAsmString().c_str());
+ MIB.addImm(ExtraInfo);
+
+ const MDNode *SrcLoc = Call->getMetadata("srcloc");
+ if (SrcLoc)
+ MIB.addMetadata(SrcLoc);
+
+ return true;
+ }
+
+ // Handle intrinsic function calls.
+ if (const auto *II = dyn_cast<IntrinsicInst>(Call))
+ return selectIntrinsicCall(II);
+
+ return lowerCall(Call);
+}
+
+bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ // At -O0 we don't care about the lifetime intrinsics.
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // The donothing intrinsic does, well, nothing.
+ case Intrinsic::donothing:
+ // Neither does the sideeffect intrinsic.
+ case Intrinsic::sideeffect:
+ // Neither does the assume intrinsic; it's also OK not to codegen its operand.
+ case Intrinsic::assume:
+ // Neither does the llvm.experimental.noalias.scope.decl intrinsic
+ case Intrinsic::experimental_noalias_scope_decl:
+ return true;
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
+ assert(DI->getVariable() && "Missing variable");
+ if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
+ << " (!hasDebugInfo)\n");
+ return true;
+ }
+
+ if (FuncInfo.PreprocessedDbgDeclares.contains(DI))
+ return true;
+
+ const Value *Address = DI->getAddress();
+ if (!Address || isa<UndefValue>(Address)) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
+ << " (bad/undef address)\n");
+ return true;
+ }
+
+ std::optional<MachineOperand> Op;
+ if (Register Reg = lookUpRegForValue(Address))
+ Op = MachineOperand::CreateReg(Reg, false);
+
+ // If we have a VLA that has a "use" in a metadata node that's then used
+ // here but it has no other uses, then we have a problem. E.g.,
+ //
+ // int foo (const int *x) {
+ // char a[*x];
+ // return 0;
+ // }
+ //
+ // If we assign 'a' a vreg and fast isel later on has to use the selection
+ // DAG isel, it will want to copy the value to the vreg. However, there are
+ // no uses, which goes counter to what selection DAG isel expects.
+ if (!Op && !Address->use_empty() && isa<Instruction>(Address) &&
+ (!isa<AllocaInst>(Address) ||
+ !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Address))))
+ Op = MachineOperand::CreateReg(FuncInfo.InitializeRegForValue(Address),
+ false);
+
+ if (Op) {
+ assert(DI->getVariable()->isValidLocationForIntrinsic(MIMD.getDL()) &&
+ "Expected inlined-at fields to agree");
+ if (FuncInfo.MF->useDebugInstrRef() && Op->isReg()) {
+ // If using instruction referencing, produce this as a DBG_INSTR_REF,
+ // to be later patched up by finalizeDebugInstrRefs. Tack a deref onto
+ // the expression, since there is no "indirect" flag in DBG_INSTR_REF.
+ SmallVector<uint64_t, 3> Ops(
+ {dwarf::DW_OP_LLVM_arg, 0, dwarf::DW_OP_deref});
+ auto *NewExpr = DIExpression::prependOpcodes(DI->getExpression(), Ops);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(),
+ TII.get(TargetOpcode::DBG_INSTR_REF), /*IsIndirect*/ false, *Op,
+ DI->getVariable(), NewExpr);
+ } else {
+ // A dbg.declare describes the address of a source variable, so lower it
+ // into an indirect DBG_VALUE.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(),
+ TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true, *Op,
+ DI->getVariable(), DI->getExpression());
+ }
+ } else {
+ // We can't yet handle anything else here because it would require
+ // generating code, thus altering codegen because of debug info.
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
+ << " (no materialized reg for address)\n");
+ }
+ return true;
+ }
+ case Intrinsic::dbg_value: {
+ // This form of DBG_VALUE is target-independent.
+ const DbgValueInst *DI = cast<DbgValueInst>(II);
+ const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
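+ // Note: from here on, II refers to the DBG_VALUE instruction description and
+ // shadows the IntrinsicInst parameter of the same name.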
+ const Value *V = DI->getValue();
+ DIExpression *Expr = DI->getExpression();
+ DILocalVariable *Var = DI->getVariable();
+ assert(Var->isValidLocationForIntrinsic(MIMD.getDL()) &&
+ "Expected inlined-at fields to agree");
+ if (!V || isa<UndefValue>(V) || DI->hasArgList()) {
+ // DI is either undef or cannot produce a valid DBG_VALUE, so produce an
+ // undef DBG_VALUE to terminate any prior location.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), II, false, 0U,
+ Var, Expr);
+ return true;
+ }
+ if (const auto *CI = dyn_cast<ConstantInt>(V)) {
+ // See if there's an expression to constant-fold.
+ if (Expr)
+ std::tie(Expr, CI) = Expr->constantFold(CI);
+ if (CI->getBitWidth() > 64)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addCImm(CI)
+ .addImm(0U)
+ .addMetadata(Var)
+ .addMetadata(Expr);
+ else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addImm(CI->getZExtValue())
+ .addImm(0U)
+ .addMetadata(Var)
+ .addMetadata(Expr);
+ return true;
+ }
+ if (const auto *CF = dyn_cast<ConstantFP>(V)) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addFPImm(CF)
+ .addImm(0U)
+ .addMetadata(Var)
+ .addMetadata(Expr);
+ return true;
+ }
+ if (const auto *Arg = dyn_cast<Argument>(V);
+ Arg && Expr && Expr->isEntryValue()) {
+ // As per the Verifier, this case is only valid for swift async Args.
+ assert(Arg->hasAttribute(Attribute::AttrKind::SwiftAsync));
+
+ Register Reg = getRegForValue(Arg);
+ for (auto [PhysReg, VirtReg] : FuncInfo.RegInfo->liveins())
+ if (Reg == VirtReg || Reg == PhysReg) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), II,
+ false /*IsIndirect*/, PhysReg, Var, Expr);
+ return true;
+ }
+
+ LLVM_DEBUG(dbgs() << "Dropping dbg.value: expression is entry_value but "
+ "couldn't find a physical register\n"
+ << *DI << "\n");
+ return true;
+ }
+ if (Register Reg = lookUpRegForValue(V)) {
+ // FIXME: This does not handle register-indirect values at offset 0.
+ if (!FuncInfo.MF->useDebugInstrRef()) {
+ bool IsIndirect = false;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(), II, IsIndirect,
+ Reg, Var, Expr);
+ return true;
+ }
+ // If using instruction referencing, produce this as a DBG_INSTR_REF,
+ // to be later patched up by finalizeDebugInstrRefs.
+ SmallVector<MachineOperand, 1> MOs({MachineOperand::CreateReg(
+ /* Reg */ Reg, /* isDef */ false, /* isImp */ false,
+ /* isKill */ false, /* isDead */ false,
+ /* isUndef */ false, /* isEarlyClobber */ false,
+ /* SubReg */ 0, /* isDebug */ true)});
+ SmallVector<uint64_t, 2> Ops({dwarf::DW_OP_LLVM_arg, 0});
+ auto *NewExpr = DIExpression::prependOpcodes(Expr, Ops);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD.getDL(),
+ TII.get(TargetOpcode::DBG_INSTR_REF), /*IsIndirect*/ false, MOs,
+ Var, NewExpr);
+ return true;
+ }
+ // We don't know how to handle other cases, so we drop.
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ return true;
+ }
+ case Intrinsic::dbg_label: {
+ const DbgLabelInst *DI = cast<DbgLabelInst>(II);
+ assert(DI->getLabel() && "Missing label");
+ if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ return true;
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::DBG_LABEL)).addMetadata(DI->getLabel());
+ return true;
+ }
+ case Intrinsic::objectsize:
+ llvm_unreachable("llvm.objectsize.* should have been lowered already");
+
+ case Intrinsic::is_constant:
+ llvm_unreachable("llvm.is.constant.* should have been lowered already");
+
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ case Intrinsic::expect: {
+ Register ResultReg = getRegForValue(II->getArgOperand(0));
+ if (!ResultReg)
+ return false;
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::experimental_stackmap:
+ return selectStackmap(II);
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ return selectPatchpoint(II);
+
+ case Intrinsic::xray_customevent:
+ return selectXRayCustomEvent(II);
+ case Intrinsic::xray_typedevent:
+ return selectXRayTypedEvent(II);
+ }
+
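+ // Anything not handled above is handed to the target hook, which returns
+ // false by default so that unsupported intrinsics fall back to the default
+ // instruction selector.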
+ return fastLowerIntrinsicCall(II);
+}
+
+bool FastISel::selectCast(const User *I, unsigned Opcode) {
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+
+ if (SrcVT == MVT::Other || !SrcVT.isSimple() || DstVT == MVT::Other ||
+ !DstVT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // Check if the destination type is legal.
+ if (!TLI.isTypeLegal(DstVT))
+ return false;
+
+ // Check if the source operand is legal.
+ if (!TLI.isTypeLegal(SrcVT))
+ return false;
+
+ Register InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ Register ResultReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(),
+ Opcode, InputReg);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool FastISel::selectBitCast(const User *I) {
+ EVT SrcEVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstEVT = TLI.getValueType(DL, I->getType());
+ if (SrcEVT == MVT::Other || DstEVT == MVT::Other ||
+ !TLI.isTypeLegal(SrcEVT) || !TLI.isTypeLegal(DstEVT))
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ MVT SrcVT = SrcEVT.getSimpleVT();
+ MVT DstVT = DstEVT.getSimpleVT();
+ Register Op0 = getRegForValue(I->getOperand(0));
+ if (!Op0) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ // If the bitcast doesn't change the type, just use the operand value.
+ if (SrcVT == DstVT) {
+ updateValueMap(I, Op0);
+ return true;
+ }
+
+ // Otherwise, select a BITCAST opcode.
+ Register ResultReg = fastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool FastISel::selectFreeze(const User *I) {
+ Register Reg = getRegForValue(I->getOperand(0));
+ if (!Reg)
+ // Unhandled operand.
+ return false;
+
+ EVT ETy = TLI.getValueType(DL, I->getOperand(0)->getType());
+ if (ETy == MVT::Other || !TLI.isTypeLegal(ETy))
+ // Unhandled type, bail out.
+ return false;
+
+ MVT Ty = ETy.getSimpleVT();
+ const TargetRegisterClass *TyRegClass = TLI.getRegClassFor(Ty);
+ Register ResultReg = createResultReg(TyRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+// Remove local value instructions starting from the instruction after
+// SavedLastLocalValue to the current function insert point.
+void FastISel::removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue) {
+ MachineInstr *CurLastLocalValue = getLastLocalValue();
+ if (CurLastLocalValue != SavedLastLocalValue) {
+ // Find the first local value instruction to be deleted.
+ // This is the instruction after SavedLastLocalValue if it is non-NULL.
+ // Otherwise it's the first instruction in the block.
+ MachineBasicBlock::iterator FirstDeadInst(SavedLastLocalValue);
+ if (SavedLastLocalValue)
+ ++FirstDeadInst;
+ else
+ FirstDeadInst = FuncInfo.MBB->getFirstNonPHI();
+ setLastLocalValue(SavedLastLocalValue);
+ removeDeadCode(FirstDeadInst, FuncInfo.InsertPt);
+ }
+}
+
+bool FastISel::selectInstruction(const Instruction *I) {
+ // Flush the local value map before starting each instruction.
+ // This improves locality and debugging, and can reduce spills.
+ // Reuse of values across IR instructions is relatively uncommon.
+ flushLocalValueMap();
+
+ MachineInstr *SavedLastLocalValue = getLastLocalValue();
+ // Just before the terminator instruction, insert instructions to
+ // feed PHI nodes in successor blocks.
+ if (I->isTerminator()) {
+ if (!handlePHINodesInSuccessorBlocks(I->getParent())) {
+ // PHI node handling may have generated local value instructions,
+ // even though it failed to handle all PHI nodes.
+ // We remove these instructions because SelectionDAGISel will generate
+ // them again.
+ removeDeadLocalValueCode(SavedLastLocalValue);
+ return false;
+ }
+ }
+
+ // FastISel does not handle any operand bundles except OB_funclet.
+ if (auto *Call = dyn_cast<CallBase>(I))
+ for (unsigned i = 0, e = Call->getNumOperandBundles(); i != e; ++i)
+ if (Call->getOperandBundleAt(i).getTagID() != LLVMContext::OB_funclet)
+ return false;
+
+ MIMD = MIMetadata(*I);
+
+ SavedInsertPt = FuncInfo.InsertPt;
+
+ if (const auto *Call = dyn_cast<CallInst>(I)) {
+ const Function *F = Call->getCalledFunction();
+ LibFunc Func;
+
+ // As a special case, don't handle calls to builtin library functions that
+ // may be translated directly to target instructions.
+ if (F && !F->hasLocalLinkage() && F->hasName() &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func))
+ return false;
+
+ // Don't handle Intrinsic::trap if a trap function is specified.
+ if (F && F->getIntrinsicID() == Intrinsic::trap &&
+ Call->hasFnAttr("trap-func-name"))
+ return false;
+ }
+
+ // First, try doing target-independent selection.
+ if (!SkipTargetIndependentISel) {
+ if (selectOperator(I, I->getOpcode())) {
+ ++NumFastIselSuccessIndependent;
+ MIMD = {};
+ return true;
+ }
+ // Remove dead code.
+ recomputeInsertPt();
+ if (SavedInsertPt != FuncInfo.InsertPt)
+ removeDeadCode(FuncInfo.InsertPt, SavedInsertPt);
+ SavedInsertPt = FuncInfo.InsertPt;
+ }
+ // Next, try calling the target to attempt to handle the instruction.
+ if (fastSelectInstruction(I)) {
+ ++NumFastIselSuccessTarget;
+ MIMD = {};
+ return true;
+ }
+ // Remove dead code.
+ recomputeInsertPt();
+ if (SavedInsertPt != FuncInfo.InsertPt)
+ removeDeadCode(FuncInfo.InsertPt, SavedInsertPt);
+
+ MIMD = {};
+ // Undo phi node updates, because they will be added again by SelectionDAG.
+ if (I->isTerminator()) {
+ // PHI node handling may have generated local value instructions.
+ // We remove them because SelectionDAGISel will generate them again.
+ removeDeadLocalValueCode(SavedLastLocalValue);
+ FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
+ }
+ return false;
+}
+
+/// Emit an unconditional branch to the given block, unless it is the immediate
+/// (fall-through) successor, and update the CFG.
+void FastISel::fastEmitBranch(MachineBasicBlock *MSucc,
+ const DebugLoc &DbgLoc) {
+ if (FuncInfo.MBB->getBasicBlock()->sizeWithoutDebug() > 1 &&
+ FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
+ // Nothing to emit: MSucc is the layout successor, so we simply fall through.
+ // The size check above ensures that if the branch would be the only
+ // non-debug instruction in the block we still emit it (in the else case
+ // below) to keep the line information accurate.
+ } else {
+ // The unconditional branch case.
+ TII.insertBranch(*FuncInfo.MBB, MSucc, nullptr,
+ SmallVector<MachineOperand, 0>(), DbgLoc);
+ }
+ if (FuncInfo.BPI) {
+ auto BranchProbability = FuncInfo.BPI->getEdgeProbability(
+ FuncInfo.MBB->getBasicBlock(), MSucc->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(MSucc, BranchProbability);
+ } else
+ FuncInfo.MBB->addSuccessorWithoutProb(MSucc);
+}
+
+void FastISel::finishCondBranch(const BasicBlock *BranchBB,
+ MachineBasicBlock *TrueMBB,
+ MachineBasicBlock *FalseMBB) {
+ // Add TrueMBB as a successor unless it is equal to FalseMBB: this can happen
+ // in degenerate IR, and MachineIR forbids having a block appear twice in the
+ // successor/predecessor lists.
+ if (TrueMBB != FalseMBB) {
+ if (FuncInfo.BPI) {
+ auto BranchProbability =
+ FuncInfo.BPI->getEdgeProbability(BranchBB, TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchProbability);
+ } else
+ FuncInfo.MBB->addSuccessorWithoutProb(TrueMBB);
+ }
+
+ fastEmitBranch(FalseMBB, MIMD.getDL());
+}
+
+/// Emit an FNeg operation.
+bool FastISel::selectFNeg(const User *I, const Value *In) {
+ Register OpReg = getRegForValue(In);
+ if (!OpReg)
+ return false;
+
+ // If the target has ISD::FNEG, use it.
+ EVT VT = TLI.getValueType(DL, I->getType());
+ Register ResultReg = fastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(), ISD::FNEG,
+ OpReg);
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ // Bitcast the value to integer, twiddle the sign bit with xor,
+ // and then bitcast it back to floating-point.
+ if (VT.getSizeInBits() > 64)
+ return false;
+ EVT IntVT = EVT::getIntegerVT(I->getContext(), VT.getSizeInBits());
+ if (!TLI.isTypeLegal(IntVT))
+ return false;
+
+ Register IntReg = fastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(),
+ ISD::BITCAST, OpReg);
+ if (!IntReg)
+ return false;
+
+ Register IntResultReg = fastEmit_ri_(
+ IntVT.getSimpleVT(), ISD::XOR, IntReg,
+ UINT64_C(1) << (VT.getSizeInBits() - 1), IntVT.getSimpleVT());
+ if (!IntResultReg)
+ return false;
+
+ ResultReg = fastEmit_r(IntVT.getSimpleVT(), VT.getSimpleVT(), ISD::BITCAST,
+ IntResultReg);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool FastISel::selectExtractValue(const User *U) {
+ const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(U);
+ if (!EVI)
+ return false;
+
+ // Make sure we only try to handle extracts with a legal result. But also
+ // allow i1 because it's easy.
+ EVT RealVT = TLI.getValueType(DL, EVI->getType(), /*AllowUnknown=*/true);
+ if (!RealVT.isSimple())
+ return false;
+ MVT VT = RealVT.getSimpleVT();
+ if (!TLI.isTypeLegal(VT) && VT != MVT::i1)
+ return false;
+
+ const Value *Op0 = EVI->getOperand(0);
+ Type *AggTy = Op0->getType();
+
+ // Get the base result register.
+ unsigned ResultReg;
+ DenseMap<const Value *, Register>::iterator I = FuncInfo.ValueMap.find(Op0);
+ if (I != FuncInfo.ValueMap.end())
+ ResultReg = I->second;
+ else if (isa<Instruction>(Op0))
+ ResultReg = FuncInfo.InitializeRegForValue(Op0);
+ else
+ return false; // fast-isel can't handle aggregate constants at the moment
+
+ // Get the actual result register, which is an offset from the base register.
+ unsigned VTIndex = ComputeLinearIndex(AggTy, EVI->getIndices());
+
+ SmallVector<EVT, 4> AggValueVTs;
+ ComputeValueVTs(TLI, DL, AggTy, AggValueVTs);
+
+ for (unsigned i = 0; i < VTIndex; i++)
+ ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]);
+
+ updateValueMap(EVI, ResultReg);
+ return true;
+}
+
+bool FastISel::selectOperator(const User *I, unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Add:
+ return selectBinaryOp(I, ISD::ADD);
+ case Instruction::FAdd:
+ return selectBinaryOp(I, ISD::FADD);
+ case Instruction::Sub:
+ return selectBinaryOp(I, ISD::SUB);
+ case Instruction::FSub:
+ return selectBinaryOp(I, ISD::FSUB);
+ case Instruction::Mul:
+ return selectBinaryOp(I, ISD::MUL);
+ case Instruction::FMul:
+ return selectBinaryOp(I, ISD::FMUL);
+ case Instruction::SDiv:
+ return selectBinaryOp(I, ISD::SDIV);
+ case Instruction::UDiv:
+ return selectBinaryOp(I, ISD::UDIV);
+ case Instruction::FDiv:
+ return selectBinaryOp(I, ISD::FDIV);
+ case Instruction::SRem:
+ return selectBinaryOp(I, ISD::SREM);
+ case Instruction::URem:
+ return selectBinaryOp(I, ISD::UREM);
+ case Instruction::FRem:
+ return selectBinaryOp(I, ISD::FREM);
+ case Instruction::Shl:
+ return selectBinaryOp(I, ISD::SHL);
+ case Instruction::LShr:
+ return selectBinaryOp(I, ISD::SRL);
+ case Instruction::AShr:
+ return selectBinaryOp(I, ISD::SRA);
+ case Instruction::And:
+ return selectBinaryOp(I, ISD::AND);
+ case Instruction::Or:
+ return selectBinaryOp(I, ISD::OR);
+ case Instruction::Xor:
+ return selectBinaryOp(I, ISD::XOR);
+
+ case Instruction::FNeg:
+ return selectFNeg(I, I->getOperand(0));
+
+ case Instruction::GetElementPtr:
+ return selectGetElementPtr(I);
+
+ case Instruction::Br: {
+ const BranchInst *BI = cast<BranchInst>(I);
+
+ if (BI->isUnconditional()) {
+ const BasicBlock *LLVMSucc = BI->getSuccessor(0);
+ MachineBasicBlock *MSucc = FuncInfo.MBBMap[LLVMSucc];
+ fastEmitBranch(MSucc, BI->getDebugLoc());
+ return true;
+ }
+
+ // Conditional branches are not handled yet.
+ // Halt "fast" selection and bail.
+ return false;
+ }
+
+ case Instruction::Unreachable:
+ if (TM.Options.TrapUnreachable)
+ return fastEmit_(MVT::Other, MVT::Other, ISD::TRAP) != 0;
+ else
+ return true;
+
+ case Instruction::Alloca:
+ // FunctionLowering has the static-sized case covered.
+ if (FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(I)))
+ return true;
+
+ // Dynamic-sized alloca is not handled yet.
+ return false;
+
+ case Instruction::Call:
+ // On AIX, normal call lowering uses the DAG-ISEL path currently so that the
+ // callee of the direct function call instruction will be mapped to the
+ // symbol for the function's entry point, which is distinct from the
+ // function descriptor symbol. The latter is the symbol whose XCOFF symbol
+ // name is the C-linkage name of the source level function.
+ // But fast isel still has the ability to do selection for intrinsics.
+ if (TM.getTargetTriple().isOSAIX() && !isa<IntrinsicInst>(I))
+ return false;
+ return selectCall(I);
+
+ case Instruction::BitCast:
+ return selectBitCast(I);
+
+ case Instruction::FPToSI:
+ return selectCast(I, ISD::FP_TO_SINT);
+ case Instruction::ZExt:
+ return selectCast(I, ISD::ZERO_EXTEND);
+ case Instruction::SExt:
+ return selectCast(I, ISD::SIGN_EXTEND);
+ case Instruction::Trunc:
+ return selectCast(I, ISD::TRUNCATE);
+ case Instruction::SIToFP:
+ return selectCast(I, ISD::SINT_TO_FP);
+
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return selectCast(I, ISD::ZERO_EXTEND);
+ if (DstVT.bitsLT(SrcVT))
+ return selectCast(I, ISD::TRUNCATE);
+ Register Reg = getRegForValue(I->getOperand(0));
+ if (!Reg)
+ return false;
+ updateValueMap(I, Reg);
+ return true;
+ }
+
+ case Instruction::ExtractValue:
+ return selectExtractValue(I);
+
+ case Instruction::Freeze:
+ return selectFreeze(I);
+
+ case Instruction::PHI:
+ llvm_unreachable("FastISel shouldn't visit PHI nodes!");
+
+ default:
+ // Unhandled instruction. Halt "fast" selection and bail.
+ return false;
+ }
+}
+
+FastISel::FastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo,
+ bool SkipTargetIndependentISel)
+ : FuncInfo(FuncInfo), MF(FuncInfo.MF), MRI(FuncInfo.MF->getRegInfo()),
+ MFI(FuncInfo.MF->getFrameInfo()), MCP(*FuncInfo.MF->getConstantPool()),
+ TM(FuncInfo.MF->getTarget()), DL(MF->getDataLayout()),
+ TII(*MF->getSubtarget().getInstrInfo()),
+ TLI(*MF->getSubtarget().getTargetLowering()),
+ TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
+ SkipTargetIndependentISel(SkipTargetIndependentISel) {}
+
+FastISel::~FastISel() = default;
+
+bool FastISel::fastLowerArguments() { return false; }
+
+bool FastISel::fastLowerCall(CallLoweringInfo & /*CLI*/) { return false; }
+
+bool FastISel::fastLowerIntrinsicCall(const IntrinsicInst * /*II*/) {
+ return false;
+}
+
+unsigned FastISel::fastEmit_(MVT, MVT, unsigned) { return 0; }
+
+unsigned FastISel::fastEmit_r(MVT, MVT, unsigned, unsigned /*Op0*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_rr(MVT, MVT, unsigned, unsigned /*Op0*/,
+ unsigned /*Op1*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_i(MVT, MVT, unsigned, uint64_t /*Imm*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_f(MVT, MVT, unsigned,
+ const ConstantFP * /*FPImm*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_ri(MVT, MVT, unsigned, unsigned /*Op0*/,
+ uint64_t /*Imm*/) {
+ return 0;
+}
+
+/// This method is a wrapper of fastEmit_ri. It first tries to emit an
+/// instruction with an immediate operand using fastEmit_ri.
+/// If that fails, it materializes the immediate into a register and tries
+/// fastEmit_rr instead.
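+/// As a convenience it also strength-reduces some operations up front: a
+/// multiply by a power of two becomes a left shift (e.g. mul x, 8 -> shl x, 3)
+/// and an unsigned divide by a power of two becomes a logical right shift.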
+Register FastISel::fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0,
+ uint64_t Imm, MVT ImmType) {
+ // If this is a multiply by a power of two, emit this as a shift left.
+ if (Opcode == ISD::MUL && isPowerOf2_64(Imm)) {
+ Opcode = ISD::SHL;
+ Imm = Log2_64(Imm);
+ } else if (Opcode == ISD::UDIV && isPowerOf2_64(Imm)) {
+ // div x, 8 -> srl x, 3
+ Opcode = ISD::SRL;
+ Imm = Log2_64(Imm);
+ }
+
+ // Horrible hack (to be removed): check that shift amounts are in range.
+ if ((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
+ Imm >= VT.getSizeInBits())
+ return 0;
+
+ // First check if immediate type is legal. If not, we can't use the ri form.
+ Register ResultReg = fastEmit_ri(VT, VT, Opcode, Op0, Imm);
+ if (ResultReg)
+ return ResultReg;
+ Register MaterialReg = fastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
+ if (!MaterialReg) {
+ // This is a bit ugly/slow, but failing here means falling out of
+ // fast-isel, which would be very slow.
+ IntegerType *ITy =
+ IntegerType::get(FuncInfo.Fn->getContext(), VT.getSizeInBits());
+ MaterialReg = getRegForValue(ConstantInt::get(ITy, Imm));
+ if (!MaterialReg)
+ return 0;
+ }
+ return fastEmit_rr(VT, VT, Opcode, Op0, MaterialReg);
+}
+
+Register FastISel::createResultReg(const TargetRegisterClass *RC) {
+ return MRI.createVirtualRegister(RC);
+}
+
+Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op,
+ unsigned OpNum) {
+ if (Op.isVirtual()) {
+ const TargetRegisterClass *RegClass =
+ TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
+ if (!MRI.constrainRegClass(Op, RegClass)) {
+ // If it's not legal to COPY between the register classes, something
+ // has gone very wrong before we got here.
+ Register NewOp = createResultReg(RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ TII.get(TargetOpcode::COPY), NewOp).addReg(Op);
+ return NewOp;
+ }
+ }
+ return Op;
+}
+
+Register FastISel::fastEmitInst_(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC) {
+ Register ResultReg = createResultReg(RC);
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg);
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_r(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ Register ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+
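+ // If the instruction has no explicit definitions, its result is produced in
+ // an implicit physical register def; copy that into ResultReg so the caller
+ // always gets a virtual register. The other fastEmitInst_* helpers below
+ // follow the same pattern.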
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
+ .addReg(Op0);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addReg(Op0);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg)
+ .addReg(II.implicit_defs()[0]);
+ }
+
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ unsigned Op1) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ Register ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+ Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
+ .addReg(Op0)
+ .addReg(Op1);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addReg(Op0)
+ .addReg(Op1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg)
+ .addReg(II.implicit_defs()[0]);
+ }
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ unsigned Op1, unsigned Op2) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ Register ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+ Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+ Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
+ .addReg(Op0)
+ .addReg(Op1)
+ .addReg(Op2);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addReg(Op0)
+ .addReg(Op1)
+ .addReg(Op2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg)
+ .addReg(II.implicit_defs()[0]);
+ }
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ uint64_t Imm) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ Register ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
+ .addReg(Op0)
+ .addImm(Imm);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addReg(Op0)
+ .addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg)
+ .addReg(II.implicit_defs()[0]);
+ }
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_rii(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ uint64_t Imm1, uint64_t Imm2) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ Register ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
+ .addReg(Op0)
+ .addImm(Imm1)
+ .addImm(Imm2);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addReg(Op0)
+ .addImm(Imm1)
+ .addImm(Imm2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg)
+ .addReg(II.implicit_defs()[0]);
+ }
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_f(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ const ConstantFP *FPImm) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ Register ResultReg = createResultReg(RC);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
+ .addFPImm(FPImm);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addFPImm(FPImm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg)
+ .addReg(II.implicit_defs()[0]);
+ }
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ unsigned Op1, uint64_t Imm) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ Register ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+ Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
+ .addReg(Op0)
+ .addReg(Op1)
+ .addImm(Imm);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II)
+ .addReg(Op0)
+ .addReg(Op1)
+ .addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg)
+ .addReg(II.implicit_defs()[0]);
+ }
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_i(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, uint64_t Imm) {
+ Register ResultReg = createResultReg(RC);
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II, ResultReg)
+ .addImm(Imm);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, II).addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg)
+ .addReg(II.implicit_defs()[0]);
+ }
+ return ResultReg;
+}
+
+Register FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0,
+ uint32_t Idx) {
+ Register ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+ assert(Register::isVirtualRegister(Op0) &&
+ "Cannot yet extract from physregs");
+ const TargetRegisterClass *RC = MRI.getRegClass(Op0);
+ MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx));
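+ // Constrain the source register to a class that actually supports the
+ // requested subregister index before copying out of it.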
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(Op0, 0, Idx);
+ return ResultReg;
+}
+
+/// Emit MachineInstrs to compute the value of Op with all but the least
+/// significant bit set to zero.
+Register FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0) {
+ return fastEmit_ri(VT, VT, ISD::AND, Op0, 1);
+}
+
+/// HandlePHINodesInSuccessorBlocks - Handle PHI nodes in successor blocks.
+/// Emit code to ensure constants are copied into registers when needed.
+/// Remember the virtual registers that need to be added to the Machine PHI
+/// nodes as input. We cannot just directly add them, because expansion
+/// might result in multiple MBB's for one BB. As such, the start of the
+/// BB might correspond to a different MBB than the end.
+bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
+ const Instruction *TI = LLVMBB->getTerminator();
+
+ SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+ FuncInfo.OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size();
+
+ // Check successor nodes' PHI nodes that expect a constant to be available
+ // from this block.
+ for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
+ const BasicBlock *SuccBB = TI->getSuccessor(succ);
+ if (!isa<PHINode>(SuccBB->begin()))
+ continue;
+ MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
+
+ // If this terminator has multiple identical successors (common for
+ // switches), only handle each succ once.
+ if (!SuccsHandled.insert(SuccMBB).second)
+ continue;
+
+ MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+
+ // At this point we know that there is a 1-1 correspondence between LLVM PHI
+ // nodes and Machine PHI nodes, but the incoming operands have not been
+ // emitted yet.
+ for (const PHINode &PN : SuccBB->phis()) {
+ // Ignore dead PHIs.
+ if (PN.use_empty())
+ continue;
+
+ // Only handle legal types. Two interesting things to note here. First,
+ // by bailing out early, we may leave behind some dead instructions,
+ // since SelectionDAG's HandlePHINodesInSuccessorBlocks will insert its
+ // own moves. Second, this check is necessary because FastISel doesn't
+ // use CreateRegs to create registers, so it always creates
+ // exactly one register for each non-void instruction.
+ EVT VT = TLI.getValueType(DL, PN.getType(), /*AllowUnknown=*/true);
+ if (VT == MVT::Other || !TLI.isTypeLegal(VT)) {
+ // Handle integer promotions, though, because they're common and easy.
+ if (!(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) {
+ FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
+ return false;
+ }
+ }
+
+ const Value *PHIOp = PN.getIncomingValueForBlock(LLVMBB);
+
+ // Set the DebugLoc for the copy. Use the location of the operand if
+ // there is one; otherwise use no location and let flushLocalValueMap fix it.
+ MIMD = {};
+ if (const auto *Inst = dyn_cast<Instruction>(PHIOp))
+ MIMD = MIMetadata(*Inst);
+
+ Register Reg = getRegForValue(PHIOp);
+ if (!Reg) {
+ FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
+ return false;
+ }
+ FuncInfo.PHINodesToUpdate.push_back(std::make_pair(&*MBBI++, Reg));
+ MIMD = {};
+ }
+ }
+
+ return true;
+}
+
+bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) {
+ assert(LI->hasOneUse() &&
+ "tryToFoldLoad expected a LoadInst with a single use");
+ // We know that the load has a single use, but don't know what it is. If it
+ // isn't one of the folded instructions, then we can't succeed here. Handle
+ // this by scanning the single-use users of the load until we get to FoldInst.
+ unsigned MaxUsers = 6; // Don't scan down huge single-use chains of instrs.
+
+ const Instruction *TheUser = LI->user_back();
+ while (TheUser != FoldInst && // Scan up until we find FoldInst.
+ // Stay in the right block.
+ TheUser->getParent() == FoldInst->getParent() &&
+ --MaxUsers) { // Don't scan too far.
+ // If there are multiple or no uses of this instruction, then bail out.
+ if (!TheUser->hasOneUse())
+ return false;
+
+ TheUser = TheUser->user_back();
+ }
+
+ // If we didn't find the fold instruction, then we failed to collapse the
+ // sequence.
+ if (TheUser != FoldInst)
+ return false;
+
+ // Don't try to fold volatile loads. Target has to deal with alignment
+ // constraints.
+ if (LI->isVolatile())
+ return false;
+
+ // Figure out which vreg this is going into. If there is no assigned vreg yet
+ // then there actually was no reference to it. Perhaps the load is referenced
+ // by a dead instruction.
+ Register LoadReg = getRegForValue(LI);
+ if (!LoadReg)
+ return false;
+
+ // We can't fold if this vreg has no uses or more than one use. Multiple uses
+ // may mean that the instruction got lowered to multiple MIs, or the use of
+ // the loaded value ended up being multiple operands of the result.
+ if (!MRI.hasOneUse(LoadReg))
+ return false;
+
+ // If the register has fixups, there may be additional uses through a
+ // different alias of the register.
+ if (FuncInfo.RegsWithFixups.contains(LoadReg))
+ return false;
+
+ MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LoadReg);
+ MachineInstr *User = RI->getParent();
+
+ // Set the insertion point properly. Folding the load can cause generation of
+ // other random instructions (like sign extends) for addressing modes; make
+ // sure they get inserted in a logical place before the new instruction.
+ FuncInfo.InsertPt = User;
+ FuncInfo.MBB = User->getParent();
+
+ // Ask the target to try folding the load.
+ return tryToFoldLoadIntoMI(User, RI.getOperandNo(), LI);
+}
+
+bool FastISel::canFoldAddIntoGEP(const User *GEP, const Value *Add) {
+ // Must be an add.
+ if (!isa<AddOperator>(Add))
+ return false;
+ // Type size needs to match.
+ if (DL.getTypeSizeInBits(GEP->getType()) !=
+ DL.getTypeSizeInBits(Add->getType()))
+ return false;
+ // Must be in the same basic block.
+ if (isa<Instruction>(Add) &&
+ FuncInfo.MBBMap[cast<Instruction>(Add)->getParent()] != FuncInfo.MBB)
+ return false;
+ // Must have a constant operand.
+ return isa<ConstantInt>(cast<AddOperator>(Add)->getOperand(1));
+}
+
+MachineMemOperand *
+FastISel::createMachineMemOperandFor(const Instruction *I) const {
+ const Value *Ptr;
+ Type *ValTy;
+ MaybeAlign Alignment;
+ MachineMemOperand::Flags Flags;
+ bool IsVolatile;
+
+ if (const auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlign();
+ IsVolatile = LI->isVolatile();
+ Flags = MachineMemOperand::MOLoad;
+ Ptr = LI->getPointerOperand();
+ ValTy = LI->getType();
+ } else if (const auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlign();
+ IsVolatile = SI->isVolatile();
+ Flags = MachineMemOperand::MOStore;
+ Ptr = SI->getPointerOperand();
+ ValTy = SI->getValueOperand()->getType();
+ } else
+ return nullptr;
+
+ bool IsNonTemporal = I->hasMetadata(LLVMContext::MD_nontemporal);
+ bool IsInvariant = I->hasMetadata(LLVMContext::MD_invariant_load);
+ bool IsDereferenceable = I->hasMetadata(LLVMContext::MD_dereferenceable);
+ const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range);
+
+ AAMDNodes AAInfo = I->getAAMetadata();
+
+ if (!Alignment) // Ensure that codegen never sees alignment 0.
+ Alignment = DL.getABITypeAlign(ValTy);
+
+ unsigned Size = DL.getTypeStoreSize(ValTy);
+
+ if (IsVolatile)
+ Flags |= MachineMemOperand::MOVolatile;
+ if (IsNonTemporal)
+ Flags |= MachineMemOperand::MONonTemporal;
+ if (IsDereferenceable)
+ Flags |= MachineMemOperand::MODereferenceable;
+ if (IsInvariant)
+ Flags |= MachineMemOperand::MOInvariant;
+
+ return FuncInfo.MF->getMachineMemOperand(MachinePointerInfo(Ptr), Flags, Size,
+ *Alignment, AAInfo, Ranges);
+}
+
+CmpInst::Predicate FastISel::optimizeCmpPredicate(const CmpInst *CI) const {
+ // If both operands are the same, then try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (CI->getOperand(0) != CI->getOperand(1))
+ return Predicate;
+
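+ // With identical operands every predicate degenerates to one of FALSE, TRUE,
+ // ORD or UNO: e.g. "fcmp oeq x, x" only tests that x is not NaN (FCMP_ORD),
+ // while "icmp eq x, x" is always true (FCMP_TRUE).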
+ switch (Predicate) {
+ default: llvm_unreachable("Invalid predicate!");
+ case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break;
+
+ case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break;
+ }
+
+ return Predicate;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
new file mode 100644
index 000000000000..1d0a03ccfcdc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -0,0 +1,560 @@
+//===-- FunctionLoweringInfo.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating functions from LLVM IR into
+// Machine IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "function-lowering-info"
+
+/// isUsedOutsideOfDefiningBlock - Return true if this instruction is used by
+/// PHI nodes or outside of the basic block that defines it, or used by a
+/// switch or atomic instruction, which may expand to multiple basic blocks.
+static bool isUsedOutsideOfDefiningBlock(const Instruction *I) {
+ if (I->use_empty()) return false;
+ if (isa<PHINode>(I)) return true;
+ const BasicBlock *BB = I->getParent();
+ for (const User *U : I->users())
+ if (cast<Instruction>(U)->getParent() != BB || isa<PHINode>(U))
+ return true;
+
+ return false;
+}
+
+static ISD::NodeType getPreferredExtendForValue(const Instruction *I) {
+ // Look at the compare instructions that use the source value: if more of
+ // them use signed predicates than unsigned ones, prefer SIGN_EXTEND.
+ //
+ // This helps remove some redundant sign and zero extension instructions and
+ // eventually exposes more machine CSE opportunities.
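+ //
+ // For example, a value whose users include two signed compares and one
+ // unsigned compare is marked SIGN_EXTEND; otherwise the default ANY_EXTEND
+ // is kept.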
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+ unsigned NumOfSigned = 0, NumOfUnsigned = 0;
+ for (const User *U : I->users()) {
+ if (const auto *CI = dyn_cast<CmpInst>(U)) {
+ NumOfSigned += CI->isSigned();
+ NumOfUnsigned += CI->isUnsigned();
+ }
+ }
+ if (NumOfSigned > NumOfUnsigned)
+ ExtendKind = ISD::SIGN_EXTEND;
+
+ return ExtendKind;
+}
+
+void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
+ SelectionDAG *DAG) {
+ Fn = &fn;
+ MF = &mf;
+ TLI = MF->getSubtarget().getTargetLowering();
+ RegInfo = &MF->getRegInfo();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ UA = DAG->getUniformityInfo();
+
+ // Check whether the function can return without sret-demotion.
+ SmallVector<ISD::OutputArg, 4> Outs;
+ CallingConv::ID CC = Fn->getCallingConv();
+
+ GetReturnInfo(CC, Fn->getReturnType(), Fn->getAttributes(), Outs, *TLI,
+ mf.getDataLayout());
+ CanLowerReturn =
+ TLI->CanLowerReturn(CC, *MF, Fn->isVarArg(), Outs, Fn->getContext());
+
+ // If this personality uses funclets, we need to do a bit more work.
+ DenseMap<const AllocaInst *, TinyPtrVector<int *>> CatchObjects;
+ EHPersonality Personality = classifyEHPersonality(
+ Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr);
+ if (isFuncletEHPersonality(Personality)) {
+ // Calculate state numbers if we haven't already.
+ WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo();
+ if (Personality == EHPersonality::MSVC_CXX)
+ calculateWinCXXEHStateNumbers(&fn, EHInfo);
+ else if (isAsynchronousEHPersonality(Personality))
+ calculateSEHStateNumbers(&fn, EHInfo);
+ else if (Personality == EHPersonality::CoreCLR)
+ calculateClrEHStateNumbers(&fn, EHInfo);
+
+ // Collect the catch object allocas referenced by the WinEH handlers so that
+ // their frame indices can be filled in once the allocas are given stack
+ // slots below.
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ if (const AllocaInst *AI = H.CatchObj.Alloca)
+ CatchObjects.insert({AI, {}}).first->second.push_back(
+ &H.CatchObj.FrameIndex);
+ else
+ H.CatchObj.FrameIndex = INT_MAX;
+ }
+ }
+ }
+
+ // Initialize the mapping of values to registers. This is only set up for
+ // instruction values that are used outside of the block that defines
+ // them.
+ const Align StackAlign = TFI->getStackAlign();
+ for (const BasicBlock &BB : *Fn) {
+ for (const Instruction &I : BB) {
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ Type *Ty = AI->getAllocatedType();
+ Align Alignment = AI->getAlign();
+
+ // Static allocas can be folded into the initial stack frame
+ // adjustment. For targets that don't realign the stack, don't
+ // do this if there is an extra alignment requirement.
+ if (AI->isStaticAlloca() &&
+ (TFI->isStackRealignable() || (Alignment <= StackAlign))) {
+ const ConstantInt *CUI = cast<ConstantInt>(AI->getArraySize());
+ uint64_t TySize =
+ MF->getDataLayout().getTypeAllocSize(Ty).getKnownMinValue();
+
+ TySize *= CUI->getZExtValue(); // Get total allocated size.
+ if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects.
+ int FrameIndex = INT_MAX;
+ auto Iter = CatchObjects.find(AI);
+ if (Iter != CatchObjects.end() && TLI->needsFixedCatchObjects()) {
+ FrameIndex = MF->getFrameInfo().CreateFixedObject(
+ TySize, 0, /*IsImmutable=*/false, /*isAliased=*/true);
+ MF->getFrameInfo().setObjectAlignment(FrameIndex, Alignment);
+ } else {
+ FrameIndex = MF->getFrameInfo().CreateStackObject(TySize, Alignment,
+ false, AI);
+ }
+
+ // Scalable vectors and structures that contain scalable vectors may
+ // need a special StackID to distinguish them from other (fixed size)
+ // stack objects.
+ if (Ty->isScalableTy())
+ MF->getFrameInfo().setStackID(FrameIndex,
+ TFI->getStackIDForScalableVectors());
+
+ StaticAllocaMap[AI] = FrameIndex;
+ // Update the catch handler information.
+ if (Iter != CatchObjects.end()) {
+ for (int *CatchObjPtr : Iter->second)
+ *CatchObjPtr = FrameIndex;
+ }
+ } else {
+ // FIXME: Overaligned static allocas should be grouped into
+ // a single dynamic allocation instead of using a separate
+ // stack allocation for each one.
+ // Inform the Frame Information that we have variable-sized objects.
+ MF->getFrameInfo().CreateVariableSizedObject(
+ Alignment <= StackAlign ? Align(1) : Alignment, AI);
+ }
+ } else if (auto *Call = dyn_cast<CallBase>(&I)) {
+ // Look for inline asm that clobbers the SP register.
+ if (Call->isInlineAsm()) {
+ Register SP = TLI->getStackPointerRegisterToSaveRestore();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ std::vector<TargetLowering::AsmOperandInfo> Ops =
+ TLI->ParseConstraints(Fn->getParent()->getDataLayout(), TRI,
+ *Call);
+ for (TargetLowering::AsmOperandInfo &Op : Ops) {
+ if (Op.Type == InlineAsm::isClobber) {
+ // Clobbers don't have SDValue operands, hence SDValue().
+ TLI->ComputeConstraintToUse(Op, SDValue(), DAG);
+ std::pair<unsigned, const TargetRegisterClass *> PhysReg =
+ TLI->getRegForInlineAsmConstraint(TRI, Op.ConstraintCode,
+ Op.ConstraintVT);
+ if (PhysReg.first == SP)
+ MF->getFrameInfo().setHasOpaqueSPAdjustment(true);
+ }
+ }
+ }
+ // Look for calls to the @llvm.va_start intrinsic. We can omit some
+ // prologue boilerplate for variadic functions that don't examine their
+ // arguments.
+ if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() == Intrinsic::vastart)
+ MF->getFrameInfo().setHasVAStart(true);
+ }
+
+ // If we have a musttail call in a variadic function, we need to ensure
+ // we forward implicit register parameters.
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
+ if (CI->isMustTailCall() && Fn->isVarArg())
+ MF->getFrameInfo().setHasMustTailInVarArgFunc(true);
+ }
+ }
+
+ // Mark values used outside their block as exported, by allocating
+ // a virtual register for them.
+ if (isUsedOutsideOfDefiningBlock(&I))
+ if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I)))
+ InitializeRegForValue(&I);
+
+ // Decide the preferred extend type for a value.
+ PreferredExtendType[&I] = getPreferredExtendForValue(&I);
+ }
+ }
+
+ // Create an initial MachineBasicBlock for each LLVM BasicBlock in F. This
+ // also creates the initial PHI MachineInstrs, though none of the input
+ // operands are populated.
+ for (const BasicBlock &BB : *Fn) {
+ // Don't create MachineBasicBlocks for imaginary EH pad blocks. These blocks
+ // are really data, and no instructions can live here.
+ if (BB.isEHPad()) {
+ const Instruction *PadInst = BB.getFirstNonPHI();
+ // If this is a non-landingpad EH pad, mark this function as using
+ // funclets.
+ // FIXME: SEH catchpads do not create EH scope/funclets, so we could avoid
+ // setting this in such cases in order to improve frame layout.
+ if (!isa<LandingPadInst>(PadInst)) {
+ MF->setHasEHScopes(true);
+ MF->setHasEHFunclets(true);
+ MF->getFrameInfo().setHasOpaqueSPAdjustment(true);
+ }
+ if (isa<CatchSwitchInst>(PadInst)) {
+ assert(&*BB.begin() == PadInst &&
+ "WinEHPrepare failed to remove PHIs from imaginary BBs");
+ continue;
+ }
+ if (isa<FuncletPadInst>(PadInst))
+ assert(&*BB.begin() == PadInst && "WinEHPrepare failed to demote PHIs");
+ }
+
+ MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(&BB);
+ MBBMap[&BB] = MBB;
+ MF->push_back(MBB);
+
+ // Transfer the address-taken flag. This is necessary because there could
+ // be multiple MachineBasicBlocks corresponding to one BasicBlock, and only
+ // the first one should be marked.
+ if (BB.hasAddressTaken())
+ MBB->setAddressTakenIRBlock(const_cast<BasicBlock *>(&BB));
+
+ // Mark landing pad blocks.
+ if (BB.isEHPad())
+ MBB->setIsEHPad();
+
+ // Create Machine PHI nodes for LLVM PHI nodes, lowering them as
+ // appropriate.
+ for (const PHINode &PN : BB.phis()) {
+ if (PN.use_empty())
+ continue;
+
+ // Skip empty types
+ if (PN.getType()->isEmptyTy())
+ continue;
+
+ DebugLoc DL = PN.getDebugLoc();
+ unsigned PHIReg = ValueMap[&PN];
+ assert(PHIReg && "PHI node does not have an assigned virtual register!");
+
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, MF->getDataLayout(), PN.getType(), ValueVTs);
+ for (EVT VT : ValueVTs) {
+ unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT);
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ for (unsigned i = 0; i != NumRegisters; ++i)
+ BuildMI(MBB, DL, TII->get(TargetOpcode::PHI), PHIReg + i);
+ PHIReg += NumRegisters;
+ }
+ }
+ }
+
+ if (isFuncletEHPersonality(Personality)) {
+ WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo();
+
+ // Map all BB references in the WinEH data to MBBs.
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ if (H.Handler)
+ H.Handler = MBBMap[cast<const BasicBlock *>(H.Handler)];
+ }
+ }
+ for (CxxUnwindMapEntry &UME : EHInfo.CxxUnwindMap)
+ if (UME.Cleanup)
+ UME.Cleanup = MBBMap[cast<const BasicBlock *>(UME.Cleanup)];
+ for (SEHUnwindMapEntry &UME : EHInfo.SEHUnwindMap) {
+ const auto *BB = cast<const BasicBlock *>(UME.Handler);
+ UME.Handler = MBBMap[BB];
+ }
+ for (ClrEHUnwindMapEntry &CME : EHInfo.ClrEHUnwindMap) {
+ const auto *BB = cast<const BasicBlock *>(CME.Handler);
+ CME.Handler = MBBMap[BB];
+ }
+ } else if (Personality == EHPersonality::Wasm_CXX) {
+ WasmEHFuncInfo &EHInfo = *MF->getWasmEHFuncInfo();
+ calculateWasmEHInfo(&fn, EHInfo);
+
+ // Map all BB references in the Wasm EH data to MBBs.
+ DenseMap<BBOrMBB, BBOrMBB> SrcToUnwindDest;
+ for (auto &KV : EHInfo.SrcToUnwindDest) {
+ const auto *Src = cast<const BasicBlock *>(KV.first);
+ const auto *Dest = cast<const BasicBlock *>(KV.second);
+ SrcToUnwindDest[MBBMap[Src]] = MBBMap[Dest];
+ }
+ EHInfo.SrcToUnwindDest = std::move(SrcToUnwindDest);
+ DenseMap<BBOrMBB, SmallPtrSet<BBOrMBB, 4>> UnwindDestToSrcs;
+ for (auto &KV : EHInfo.UnwindDestToSrcs) {
+ const auto *Dest = cast<const BasicBlock *>(KV.first);
+ UnwindDestToSrcs[MBBMap[Dest]] = SmallPtrSet<BBOrMBB, 4>();
+ for (const auto P : KV.second)
+ UnwindDestToSrcs[MBBMap[Dest]].insert(
+ MBBMap[cast<const BasicBlock *>(P)]);
+ }
+ EHInfo.UnwindDestToSrcs = std::move(UnwindDestToSrcs);
+ }
+}
+
+/// clear - Clear out all the function-specific state. This returns this
+/// FunctionLoweringInfo to an empty state, ready to be used for a
+/// different function.
+void FunctionLoweringInfo::clear() {
+ MBBMap.clear();
+ ValueMap.clear();
+ VirtReg2Value.clear();
+ StaticAllocaMap.clear();
+ LiveOutRegInfo.clear();
+ VisitedBBs.clear();
+ ArgDbgValues.clear();
+ DescribedArgs.clear();
+ ByValArgFrameIndexMap.clear();
+ RegFixups.clear();
+ RegsWithFixups.clear();
+ StatepointStackSlots.clear();
+ StatepointRelocationMaps.clear();
+ PreferredExtendType.clear();
+ PreprocessedDbgDeclares.clear();
+}
+
+/// CreateReg - Allocate a single virtual register for the given type.
+Register FunctionLoweringInfo::CreateReg(MVT VT, bool isDivergent) {
+ return RegInfo->createVirtualRegister(TLI->getRegClassFor(VT, isDivergent));
+}
+
+/// CreateRegs - Allocate the appropriate number of virtual registers of
+/// the correctly promoted or expanded types. Assign these registers
+/// consecutive vreg numbers and return the first assigned number.
+///
+/// In the case that the given value has struct or array type, this function
+/// will assign registers for each member or element.
+///
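+/// For example, a value of type {i32, i32} on a target where i32 is a single
+/// legal register (an illustrative assumption) gets two consecutive virtual
+/// registers, and the number of the first one is returned.
+///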
+Register FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs);
+
+ Register FirstReg;
+ for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ EVT ValueVT = ValueVTs[Value];
+ MVT RegisterVT = TLI->getRegisterType(Ty->getContext(), ValueVT);
+
+ unsigned NumRegs = TLI->getNumRegisters(Ty->getContext(), ValueVT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ Register R = CreateReg(RegisterVT, isDivergent);
+ if (!FirstReg) FirstReg = R;
+ }
+ }
+ return FirstReg;
+}
+
+Register FunctionLoweringInfo::CreateRegs(const Value *V) {
+ return CreateRegs(V->getType(), UA && UA->isDivergent(V) &&
+ !TLI->requiresUniformRegister(*MF, V));
+}
+
+/// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
+/// register is a PHI destination and the PHI's LiveOutInfo is not valid. If
+/// the register's LiveOutInfo is for a smaller bit width, it is extended to
+/// the larger bit width by zero extension. The bit width must be no smaller
+/// than the LiveOutInfo's existing bit width.
+const FunctionLoweringInfo::LiveOutInfo *
+FunctionLoweringInfo::GetLiveOutRegInfo(Register Reg, unsigned BitWidth) {
+ if (!LiveOutRegInfo.inBounds(Reg))
+ return nullptr;
+
+ LiveOutInfo *LOI = &LiveOutRegInfo[Reg];
+ if (!LOI->IsValid)
+ return nullptr;
+
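+ // When widening, the new high bits are completely unknown (any-extend), so
+ // reset the sign bit count and widen the known bits conservatively.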
+ if (BitWidth > LOI->Known.getBitWidth()) {
+ LOI->NumSignBits = 1;
+ LOI->Known = LOI->Known.anyext(BitWidth);
+ }
+
+ return LOI;
+}
+
+/// ComputePHILiveOutRegInfo - Compute LiveOutInfo for a PHI's destination
+/// register based on the LiveOutInfo of its operands.
+void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
+ Type *Ty = PN->getType();
+ if (!Ty->isIntegerTy() || Ty->isVectorTy())
+ return;
+
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs);
+ assert(ValueVTs.size() == 1 &&
+ "PHIs with non-vector integer types should have a single VT.");
+ EVT IntVT = ValueVTs[0];
+
+ if (TLI->getNumRegisters(PN->getContext(), IntVT) != 1)
+ return;
+ IntVT = TLI->getTypeToTransformTo(PN->getContext(), IntVT);
+ unsigned BitWidth = IntVT.getSizeInBits();
+
+ auto It = ValueMap.find(PN);
+ if (It == ValueMap.end())
+ return;
+
+ Register DestReg = It->second;
+ if (DestReg == 0)
+ return;
+ assert(DestReg.isVirtual() && "Expected a virtual reg");
+ LiveOutRegInfo.grow(DestReg);
+ LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
+
+ Value *V = PN->getIncomingValue(0);
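+ // An undef or constant-expression incoming value tells us nothing useful, so
+ // record fully conservative information for the destination and return.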
+ if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
+ DestLOI.NumSignBits = 1;
+ DestLOI.Known = KnownBits(BitWidth);
+ return;
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ APInt Val;
+ if (TLI->signExtendConstant(CI))
+ Val = CI->getValue().sext(BitWidth);
+ else
+ Val = CI->getValue().zext(BitWidth);
+ DestLOI.NumSignBits = Val.getNumSignBits();
+ DestLOI.Known = KnownBits::makeConstant(Val);
+ } else {
+    assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
+                                "its CopyToReg node was created.");
+ Register SrcReg = ValueMap[V];
+ if (!SrcReg.isVirtual()) {
+ DestLOI.IsValid = false;
+ return;
+ }
+ const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+ if (!SrcLOI) {
+ DestLOI.IsValid = false;
+ return;
+ }
+ DestLOI = *SrcLOI;
+ }
+
+ assert(DestLOI.Known.Zero.getBitWidth() == BitWidth &&
+ DestLOI.Known.One.getBitWidth() == BitWidth &&
+ "Masks should have the same bit width as the type.");
+
+ for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = PN->getIncomingValue(i);
+ if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
+ DestLOI.NumSignBits = 1;
+ DestLOI.Known = KnownBits(BitWidth);
+ return;
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ APInt Val;
+ if (TLI->signExtendConstant(CI))
+ Val = CI->getValue().sext(BitWidth);
+ else
+ Val = CI->getValue().zext(BitWidth);
+ DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits());
+ DestLOI.Known.Zero &= ~Val;
+ DestLOI.Known.One &= Val;
+ continue;
+ }
+
+ assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
+ "its CopyToReg node was created.");
+ Register SrcReg = ValueMap[V];
+ if (!SrcReg.isVirtual()) {
+ DestLOI.IsValid = false;
+ return;
+ }
+ const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+ if (!SrcLOI) {
+ DestLOI.IsValid = false;
+ return;
+ }
+ DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits);
+ DestLOI.Known = DestLOI.Known.intersectWith(SrcLOI->Known);
+ }
+}
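+
+// Worked example (sketch, assuming i8 is promoted to i32 on the target): for
+//   %p = phi i8 [ 1, %bb0 ], [ 3, %bb1 ]
+// the first constant seeds Known with the value 1, and the loop above then
+// intersects it with the value 3. The result: bit 0 is known one, bit 1 is
+// unknown, bits 2..31 are known zero, and NumSignBits = min(31, 30) = 30.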
+
+/// setArgumentFrameIndex - Record the frame index for the byval argument.
+/// This overrides any previous frame index entry for this argument.
+void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A,
+ int FI) {
+ ByValArgFrameIndexMap[A] = FI;
+}
+
+/// getArgumentFrameIndex - Get the frame index for the byval argument.
+/// If the argument does not have an assigned frame index, INT_MAX is
+/// returned.
+int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
+ auto I = ByValArgFrameIndexMap.find(A);
+ if (I != ByValArgFrameIndexMap.end())
+ return I->second;
+ LLVM_DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
+ return INT_MAX;
+}
+
+Register FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
+ const Value *CPI, const TargetRegisterClass *RC) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ auto I = CatchPadExceptionPointers.insert({CPI, 0});
+ Register &VReg = I.first->second;
+ if (I.second)
+ VReg = MRI.createVirtualRegister(RC);
+ assert(VReg && "null vreg in exception pointer table!");
+ return VReg;
+}
+
+const Value *
+FunctionLoweringInfo::getValueFromVirtualReg(Register Vreg) {
+ if (VirtReg2Value.empty()) {
+ SmallVector<EVT, 4> ValueVTs;
+ for (auto &P : ValueMap) {
+ ValueVTs.clear();
+ ComputeValueVTs(*TLI, Fn->getParent()->getDataLayout(),
+ P.first->getType(), ValueVTs);
+ unsigned Reg = P.second;
+ for (EVT VT : ValueVTs) {
+ unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT);
+ for (unsigned i = 0, e = NumRegisters; i != e; ++i)
+ VirtReg2Value[Reg++] = P.first;
+ }
+ }
+ }
+ return VirtReg2Value.lookup(Vreg);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
new file mode 100644
index 000000000000..4e7895c0b3cf
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -0,0 +1,1414 @@
+//==--- InstrEmitter.cpp - Emit MachineInstrs for the SelectionDAG class ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the Emit routines for the SelectionDAG class, which creates
+// MachineInstrs based on the decisions of the SelectionDAG instruction
+// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstrEmitter.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/PseudoProbe.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "instr-emitter"
+
+/// MinRCSize - Smallest register class we allow when constraining virtual
+/// registers. If satisfying all register class constraints would require
+/// using a smaller register class, emit a COPY to a new virtual register
+/// instead.
+const unsigned MinRCSize = 4;
+
+/// CountResults - The results of target nodes have register or immediate
+/// operands first, then an optional chain, and optional glue operands (which do
+/// not go into the resulting MachineInstr).
+unsigned InstrEmitter::CountResults(SDNode *Node) {
+ unsigned N = Node->getNumValues();
+ while (N && Node->getValueType(N - 1) == MVT::Glue)
+ --N;
+ if (N && Node->getValueType(N - 1) == MVT::Other)
+ --N; // Skip over chain result.
+ return N;
+}
+
+/// countOperands - The inputs to target nodes have any actual inputs first,
+/// followed by an optional chain operand, then an optional glue operand.
+/// Compute the number of actual operands that will go into the resulting
+/// MachineInstr.
+///
+/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding
+/// the chain and glue. These operands may be implicit on the machine instr.
+static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
+ unsigned &NumImpUses) {
+ unsigned N = Node->getNumOperands();
+ while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
+ --N;
+ if (N && Node->getOperand(N - 1).getValueType() == MVT::Other)
+ --N; // Ignore chain if it exists.
+
+ // Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses.
+ NumImpUses = N - NumExpUses;
+ for (unsigned I = N; I > NumExpUses; --I) {
+ if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1)))
+ continue;
+ if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1)))
+ if (RN->getReg().isPhysical())
+ continue;
+ NumImpUses = N - I;
+ break;
+ }
+
+ return N;
+}
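+
+// Example (sketch): for a call-like node whose operands, after dropping the
+// trailing chain and glue, are (sym, $physreg0, $physreg1, regmask) with
+// NumExpUses == 1, the loop above treats the two physreg operands and the
+// register mask as implicit uses, so N == 4 and NumImpUses == 3.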
+
+/// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
+/// implicit physical register output.
+void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
+ Register SrcReg,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ Register VRBase;
+ if (SrcReg.isVirtual()) {
+ // Just use the input register directly!
+ SDValue Op(Node, ResNo);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ return;
+ }
+
+ // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+ // the CopyToReg'd destination register instead of creating a new vreg.
+ bool MatchReg = true;
+ const TargetRegisterClass *UseRC = nullptr;
+ MVT VT = Node->getSimpleValueType(ResNo);
+
+ // Stick to the preferred register classes for legal types.
+ if (TLI->isTypeLegal(VT))
+ UseRC = TLI->getRegClassFor(VT, Node->isDivergent());
+
+ for (SDNode *User : Node->uses()) {
+ bool Match = true;
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node &&
+ User->getOperand(2).getResNo() == ResNo) {
+ Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (DestReg.isVirtual()) {
+ VRBase = DestReg;
+ Match = false;
+ } else if (DestReg != SrcReg)
+ Match = false;
+ } else {
+ for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+ SDValue Op = User->getOperand(i);
+ if (Op.getNode() != Node || Op.getResNo() != ResNo)
+ continue;
+ MVT VT = Node->getSimpleValueType(Op.getResNo());
+ if (VT == MVT::Other || VT == MVT::Glue)
+ continue;
+ Match = false;
+ if (User->isMachineOpcode()) {
+ const MCInstrDesc &II = TII->get(User->getMachineOpcode());
+ const TargetRegisterClass *RC = nullptr;
+ if (i + II.getNumDefs() < II.getNumOperands()) {
+ RC = TRI->getAllocatableClass(
+ TII->getRegClass(II, i + II.getNumDefs(), TRI, *MF));
+ }
+ if (!UseRC)
+ UseRC = RC;
+ else if (RC) {
+ const TargetRegisterClass *ComRC =
+ TRI->getCommonSubClass(UseRC, RC);
+ // If multiple uses expect disjoint register classes, we emit
+ // copies in AddRegisterOperand.
+ if (ComRC)
+ UseRC = ComRC;
+ }
+ }
+ }
+ }
+ MatchReg &= Match;
+ if (VRBase)
+ break;
+ }
+
+ const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
+ SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
+
+ // Figure out the register class to create for the destreg.
+ if (VRBase) {
+ DstRC = MRI->getRegClass(VRBase);
+ } else if (UseRC) {
+ assert(TRI->isTypeLegalForClass(*UseRC, VT) &&
+ "Incompatible phys register def and uses!");
+ DstRC = UseRC;
+ } else
+ DstRC = SrcRC;
+
+ // If all uses are reading from the src physical register and copying the
+ // register is either impossible or very expensive, then don't create a copy.
+ if (MatchReg && SrcRC->getCopyCost() < 0) {
+ VRBase = SrcReg;
+ } else {
+ // Create the reg, emit the copy.
+ VRBase = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ VRBase).addReg(SrcReg);
+ }
+
+ SDValue Op(Node, ResNo);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
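+
+// In MIR terms (sketch): when every use can read the physical register
+// directly and its class reports a negative copy cost, the physreg is reused
+// as-is; otherwise a fresh vreg is created and a
+//   %newvreg = COPY $physreg
+// is emitted, and %newvreg becomes the mapping for this node result.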
+
+void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
+ MachineInstrBuilder &MIB,
+ const MCInstrDesc &II,
+ bool IsClone, bool IsCloned,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ assert(Node->getMachineOpcode() != TargetOpcode::IMPLICIT_DEF &&
+ "IMPLICIT_DEF should have been handled as a special case elsewhere!");
+
+ unsigned NumResults = CountResults(Node);
+ bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() &&
+ II.isVariadic() && II.variadicOpsAreDefs();
+ unsigned NumVRegs = HasVRegVariadicDefs ? NumResults : II.getNumDefs();
+ if (Node->getMachineOpcode() == TargetOpcode::STATEPOINT)
+ NumVRegs = NumResults;
+ for (unsigned i = 0; i < NumVRegs; ++i) {
+ // If the specific node value is only used by a CopyToReg and the dest reg
+ // is a vreg in the same register class, use the CopyToReg'd destination
+ // register instead of creating a new vreg.
+ Register VRBase;
+ const TargetRegisterClass *RC =
+ TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF));
+ // Always let the value type influence the used register class. The
+ // constraints on the instruction may be too lax to represent the value
+ // type correctly. For example, a 64-bit float (X86::FR64) can't live in
+ // the 32-bit float super-class (X86::FR32).
+ if (i < NumResults && TLI->isTypeLegal(Node->getSimpleValueType(i))) {
+ const TargetRegisterClass *VTRC = TLI->getRegClassFor(
+ Node->getSimpleValueType(i),
+ (Node->isDivergent() || (RC && TRI->isDivergentRegClass(RC))));
+ if (RC)
+ VTRC = TRI->getCommonSubClass(RC, VTRC);
+ if (VTRC)
+ RC = VTRC;
+ }
+
+ if (!II.operands().empty() && II.operands()[i].isOptionalDef()) {
+ // Optional def must be a physical register.
+ VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
+ assert(VRBase.isPhysical());
+ MIB.addReg(VRBase, RegState::Define);
+ }
+
+ if (!VRBase && !IsClone && !IsCloned)
+ for (SDNode *User : Node->uses()) {
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node &&
+ User->getOperand(2).getResNo() == i) {
+ Register Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (Reg.isVirtual()) {
+ const TargetRegisterClass *RegRC = MRI->getRegClass(Reg);
+ if (RegRC == RC) {
+ VRBase = Reg;
+ MIB.addReg(VRBase, RegState::Define);
+ break;
+ }
+ }
+ }
+ }
+
+ // Create the result registers for this node and add the result regs to
+ // the machine instruction.
+ if (VRBase == 0) {
+ assert(RC && "Isn't a register operand!");
+ VRBase = MRI->createVirtualRegister(RC);
+ MIB.addReg(VRBase, RegState::Define);
+ }
+
+ // If this def corresponds to a result of the SDNode insert the VRBase into
+ // the lookup map.
+ if (i < NumResults) {
+ SDValue Op(Node, i);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ }
+ }
+}
+
+/// getVR - Return the virtual register corresponding to the specified result
+/// of the specified node.
+Register InstrEmitter::getVR(SDValue Op,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ if (Op.isMachineOpcode() &&
+ Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
+ // Add an IMPLICIT_DEF instruction before every use.
+ // IMPLICIT_DEF can produce any type of result so its MCInstrDesc
+ // does not include operand register class info.
+ const TargetRegisterClass *RC = TLI->getRegClassFor(
+ Op.getSimpleValueType(), Op.getNode()->isDivergent());
+ Register VReg = MRI->createVirtualRegister(RC);
+ BuildMI(*MBB, InsertPos, Op.getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), VReg);
+ return VReg;
+ }
+
+ DenseMap<SDValue, Register>::iterator I = VRBaseMap.find(Op);
+ assert(I != VRBaseMap.end() && "Node emitted out of order - late");
+ return I->second;
+}
+
+
+/// AddRegisterOperand - Add the specified register as an operand to the
+/// specified machine instr. Insert register copies if the register is
+/// not in the required register class.
+void
+InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
+ SDValue Op,
+ unsigned IIOpNum,
+ const MCInstrDesc *II,
+ DenseMap<SDValue, Register> &VRBaseMap,
+ bool IsDebug, bool IsClone, bool IsCloned) {
+ assert(Op.getValueType() != MVT::Other &&
+ Op.getValueType() != MVT::Glue &&
+ "Chain and glue operands should occur at end of operand list!");
+ // Get/emit the operand.
+ Register VReg = getVR(Op, VRBaseMap);
+
+ const MCInstrDesc &MCID = MIB->getDesc();
+ bool isOptDef = IIOpNum < MCID.getNumOperands() &&
+ MCID.operands()[IIOpNum].isOptionalDef();
+
+ // If the instruction requires a register in a different class, create
+ // a new virtual register and copy the value into it, but first attempt to
+ // shrink VReg's register class within reason. For example, if VReg == GR32
+ // and II requires a GR32_NOSP, just constrain VReg to GR32_NOSP.
+ if (II) {
+ const TargetRegisterClass *OpRC = nullptr;
+ if (IIOpNum < II->getNumOperands())
+ OpRC = TII->getRegClass(*II, IIOpNum, TRI, *MF);
+
+ if (OpRC) {
+ unsigned MinNumRegs = MinRCSize;
+ // Don't apply any RC size limit for IMPLICIT_DEF. Each use has a unique
+ // virtual register.
+ if (Op.isMachineOpcode() &&
+ Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF)
+ MinNumRegs = 0;
+
+ const TargetRegisterClass *ConstrainedRC
+ = MRI->constrainRegClass(VReg, OpRC, MinNumRegs);
+ if (!ConstrainedRC) {
+ OpRC = TRI->getAllocatableClass(OpRC);
+ assert(OpRC && "Constraints cannot be fulfilled for allocation");
+ Register NewVReg = MRI->createVirtualRegister(OpRC);
+ BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
+ VReg = NewVReg;
+ } else {
+ assert(ConstrainedRC->isAllocatable() &&
+ "Constraining an allocatable VReg produced an unallocatable class?");
+ }
+ }
+ }
+
+ // If this value has only one use, that use is a kill. This is a
+ // conservative approximation. InstrEmitter does trivial coalescing
+ // with CopyFromReg nodes, so don't emit kill flags for them.
+  // Avoid kill flags on scheduler-cloned nodes, since there will be
+ // multiple uses.
+ // Tied operands are never killed, so we need to check that. And that
+ // means we need to determine the index of the operand.
+ bool isKill = Op.hasOneUse() &&
+ Op.getNode()->getOpcode() != ISD::CopyFromReg &&
+ !IsDebug &&
+ !(IsClone || IsCloned);
+ if (isKill) {
+ unsigned Idx = MIB->getNumOperands();
+ while (Idx > 0 &&
+ MIB->getOperand(Idx-1).isReg() &&
+ MIB->getOperand(Idx-1).isImplicit())
+ --Idx;
+ bool isTied = MCID.getOperandConstraint(Idx, MCOI::TIED_TO) != -1;
+ if (isTied)
+ isKill = false;
+ }
+
+ MIB.addReg(VReg, getDefRegState(isOptDef) | getKillRegState(isKill) |
+ getDebugRegState(IsDebug));
+}
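+
+// Sketch of the fallback path, using the X86 classes named in the comment
+// above: if %v:GR32 cannot be constrained to the required GR32_NOSP class
+// without dropping below MinRCSize allocatable registers, the code emits
+//   %tmp:GR32_NOSP = COPY %v
+// and uses %tmp as the instruction operand instead of %v.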
+
+/// AddOperand - Add the specified operand to the specified machine instr. II
+/// specifies the instruction information for the node, and IIOpNum is the
+/// operand number (in the II) that we are adding.
+void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
+ SDValue Op,
+ unsigned IIOpNum,
+ const MCInstrDesc *II,
+ DenseMap<SDValue, Register> &VRBaseMap,
+ bool IsDebug, bool IsClone, bool IsCloned) {
+ if (Op.isMachineOpcode()) {
+ AddRegisterOperand(MIB, Op, IIOpNum, II, VRBaseMap,
+ IsDebug, IsClone, IsCloned);
+ } else if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ MIB.addImm(C->getSExtValue());
+ } else if (ConstantFPSDNode *F = dyn_cast<ConstantFPSDNode>(Op)) {
+ MIB.addFPImm(F->getConstantFPValue());
+ } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
+ Register VReg = R->getReg();
+ MVT OpVT = Op.getSimpleValueType();
+ const TargetRegisterClass *IIRC =
+ II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF))
+ : nullptr;
+ const TargetRegisterClass *OpRC =
+ TLI->isTypeLegal(OpVT)
+ ? TLI->getRegClassFor(OpVT,
+ Op.getNode()->isDivergent() ||
+ (IIRC && TRI->isDivergentRegClass(IIRC)))
+ : nullptr;
+
+ if (OpRC && IIRC && OpRC != IIRC && VReg.isVirtual()) {
+ Register NewVReg = MRI->createVirtualRegister(IIRC);
+ BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
+ VReg = NewVReg;
+ }
+ // Turn additional physreg operands into implicit uses on non-variadic
+ // instructions. This is used by call and return instructions passing
+ // arguments in registers.
+ bool Imp = II && (IIOpNum >= II->getNumOperands() && !II->isVariadic());
+ MIB.addReg(VReg, getImplRegState(Imp));
+ } else if (RegisterMaskSDNode *RM = dyn_cast<RegisterMaskSDNode>(Op)) {
+ MIB.addRegMask(RM->getRegMask());
+ } else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ MIB.addGlobalAddress(TGA->getGlobal(), TGA->getOffset(),
+ TGA->getTargetFlags());
+ } else if (BasicBlockSDNode *BBNode = dyn_cast<BasicBlockSDNode>(Op)) {
+ MIB.addMBB(BBNode->getBasicBlock());
+ } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) {
+ MIB.addFrameIndex(FI->getIndex());
+ } else if (JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op)) {
+ MIB.addJumpTableIndex(JT->getIndex(), JT->getTargetFlags());
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op)) {
+ int Offset = CP->getOffset();
+ Align Alignment = CP->getAlign();
+
+ unsigned Idx;
+ MachineConstantPool *MCP = MF->getConstantPool();
+ if (CP->isMachineConstantPoolEntry())
+ Idx = MCP->getConstantPoolIndex(CP->getMachineCPVal(), Alignment);
+ else
+ Idx = MCP->getConstantPoolIndex(CP->getConstVal(), Alignment);
+ MIB.addConstantPoolIndex(Idx, Offset, CP->getTargetFlags());
+ } else if (ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op)) {
+ MIB.addExternalSymbol(ES->getSymbol(), ES->getTargetFlags());
+ } else if (auto *SymNode = dyn_cast<MCSymbolSDNode>(Op)) {
+ MIB.addSym(SymNode->getMCSymbol());
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) {
+ MIB.addBlockAddress(BA->getBlockAddress(),
+ BA->getOffset(),
+ BA->getTargetFlags());
+ } else if (TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(Op)) {
+ MIB.addTargetIndex(TI->getIndex(), TI->getOffset(), TI->getTargetFlags());
+ } else {
+ assert(Op.getValueType() != MVT::Other &&
+ Op.getValueType() != MVT::Glue &&
+ "Chain and glue operands should occur at end of operand list!");
+ AddRegisterOperand(MIB, Op, IIOpNum, II, VRBaseMap,
+ IsDebug, IsClone, IsCloned);
+ }
+}
+
+Register InstrEmitter::ConstrainForSubReg(Register VReg, unsigned SubIdx,
+ MVT VT, bool isDivergent, const DebugLoc &DL) {
+ const TargetRegisterClass *VRC = MRI->getRegClass(VReg);
+ const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx);
+
+ // RC is a sub-class of VRC that supports SubIdx. Try to constrain VReg
+ // within reason.
+ if (RC && RC != VRC)
+ RC = MRI->constrainRegClass(VReg, RC, MinRCSize);
+
+ // VReg has been adjusted. It can be used with SubIdx operands now.
+ if (RC)
+ return VReg;
+
+ // VReg couldn't be reasonably constrained. Emit a COPY to a new virtual
+ // register instead.
+ RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT, isDivergent), SubIdx);
+ assert(RC && "No legal register class for VT supports that SubIdx");
+ Register NewReg = MRI->createVirtualRegister(RC);
+ BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(VReg);
+ return NewReg;
+}
+
+/// EmitSubregNode - Generate machine code for subreg nodes.
+///
+void InstrEmitter::EmitSubregNode(SDNode *Node,
+ DenseMap<SDValue, Register> &VRBaseMap,
+ bool IsClone, bool IsCloned) {
+ Register VRBase;
+ unsigned Opc = Node->getMachineOpcode();
+
+ // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+ // the CopyToReg'd destination register instead of creating a new vreg.
+ for (SDNode *User : Node->uses()) {
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node) {
+ Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (DestReg.isVirtual()) {
+ VRBase = DestReg;
+ break;
+ }
+ }
+ }
+
+ if (Opc == TargetOpcode::EXTRACT_SUBREG) {
+ // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub. There are no
+    // constraints on the %dst register; COPY can target all legal register
+ // classes.
+ unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ const TargetRegisterClass *TRC =
+ TLI->getRegClassFor(Node->getSimpleValueType(0), Node->isDivergent());
+
+ Register Reg;
+ MachineInstr *DefMI;
+ RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(0));
+ if (R && R->getReg().isPhysical()) {
+ Reg = R->getReg();
+ DefMI = nullptr;
+ } else {
+ Reg = R ? R->getReg() : getVR(Node->getOperand(0), VRBaseMap);
+ DefMI = MRI->getVRegDef(Reg);
+ }
+
+ Register SrcReg, DstReg;
+ unsigned DefSubIdx;
+ if (DefMI &&
+ TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) &&
+ SubIdx == DefSubIdx &&
+ TRC == MRI->getRegClass(SrcReg)) {
+ // Optimize these:
+ // r1025 = s/zext r1024, 4
+ // r1026 = extract_subreg r1025, 4
+ // to a copy
+ // r1026 = copy r1024
+ VRBase = MRI->createVirtualRegister(TRC);
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
+ MRI->clearKillFlags(SrcReg);
+ } else {
+ // Reg may not support a SubIdx sub-register, and we may need to
+ // constrain its register class or issue a COPY to a compatible register
+ // class.
+ if (Reg.isVirtual())
+ Reg = ConstrainForSubReg(Reg, SubIdx,
+ Node->getOperand(0).getSimpleValueType(),
+ Node->isDivergent(), Node->getDebugLoc());
+ // Create the destreg if it is missing.
+ if (!VRBase)
+ VRBase = MRI->createVirtualRegister(TRC);
+
+ // Create the extract_subreg machine instruction.
+ MachineInstrBuilder CopyMI =
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), VRBase);
+ if (Reg.isVirtual())
+ CopyMI.addReg(Reg, 0, SubIdx);
+ else
+ CopyMI.addReg(TRI->getSubReg(Reg, SubIdx));
+ }
+ } else if (Opc == TargetOpcode::INSERT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue N2 = Node->getOperand(2);
+ unsigned SubIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+
+ // Figure out the register class to create for the destreg. It should be
+ // the largest legal register class supporting SubIdx sub-registers.
+ // RegisterCoalescer will constrain it further if it decides to eliminate
+ // the INSERT_SUBREG instruction.
+ //
+ // %dst = INSERT_SUBREG %src, %sub, SubIdx
+ //
+ // is lowered by TwoAddressInstructionPass to:
+ //
+ // %dst = COPY %src
+ // %dst:SubIdx = COPY %sub
+ //
+ // There is no constraint on the %src register class.
+ //
+ const TargetRegisterClass *SRC =
+ TLI->getRegClassFor(Node->getSimpleValueType(0), Node->isDivergent());
+ SRC = TRI->getSubClassWithSubReg(SRC, SubIdx);
+ assert(SRC && "No register class supports VT and SubIdx for INSERT_SUBREG");
+
+ if (VRBase == 0 || !SRC->hasSubClassEq(MRI->getRegClass(VRBase)))
+ VRBase = MRI->createVirtualRegister(SRC);
+
+ // Create the insert_subreg or subreg_to_reg machine instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, Node->getDebugLoc(), TII->get(Opc), VRBase);
+
+    // If creating a subreg_to_reg, the first input operand is an immediate
+    // (the implicit value); otherwise it's a register.
+ if (Opc == TargetOpcode::SUBREG_TO_REG) {
+ const ConstantSDNode *SD = cast<ConstantSDNode>(N0);
+ MIB.addImm(SD->getZExtValue());
+ } else
+ AddOperand(MIB, N0, 0, nullptr, VRBaseMap, /*IsDebug=*/false,
+ IsClone, IsCloned);
+ // Add the subregister being inserted
+ AddOperand(MIB, N1, 0, nullptr, VRBaseMap, /*IsDebug=*/false,
+ IsClone, IsCloned);
+ MIB.addImm(SubIdx);
+ MBB->insert(InsertPos, MIB);
+ } else
+ llvm_unreachable("Node is not insert_subreg, extract_subreg, or subreg_to_reg");
+
+ SDValue Op(Node, 0);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
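+
+// Summary of the emitted forms (sketch):
+//   EXTRACT_SUBREG                -> %dst = COPY %src:sub (or a physreg's
+//                                    sub-register when the source is physical)
+//   INSERT_SUBREG / SUBREG_TO_REG -> kept as machine instructions carrying the
+//                                    inserted value and the sub-register index;
+//                                    TwoAddressInstructionPass lowers
+//                                    INSERT_SUBREG into COPYs later.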
+
+/// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+/// COPY_TO_REGCLASS is just a normal copy, except that the destination
+/// register is constrained to be in a particular register class.
+///
+void
+InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+
+ // Create the new VReg in the destination class and emit a copy.
+ unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ const TargetRegisterClass *DstRC =
+ TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx));
+ Register NewVReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ NewVReg).addReg(VReg);
+
+ SDValue Op(Node, 0);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
+
+/// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
+///
+void InstrEmitter::EmitRegSequence(SDNode *Node,
+ DenseMap<SDValue, Register> &VRBaseMap,
+ bool IsClone, bool IsCloned) {
+ unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
+ Register NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC));
+ const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE);
+ MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II, NewVReg);
+ unsigned NumOps = Node->getNumOperands();
+ // If the input pattern has a chain, then the root of the corresponding
+ // output pattern will get a chain as well. This can happen to be a
+ // REG_SEQUENCE (which is not "guarded" by countOperands/CountResults).
+ if (NumOps && Node->getOperand(NumOps-1).getValueType() == MVT::Other)
+ --NumOps; // Ignore chain if it exists.
+
+ assert((NumOps & 1) == 1 &&
+ "REG_SEQUENCE must have an odd number of operands!");
+ for (unsigned i = 1; i != NumOps; ++i) {
+ SDValue Op = Node->getOperand(i);
+ if ((i & 1) == 0) {
+ RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(i-1));
+ // Skip physical registers as they don't have a vreg to get and we'll
+ // insert copies for them in TwoAddressInstructionPass anyway.
+ if (!R || !R->getReg().isPhysical()) {
+ unsigned SubIdx = cast<ConstantSDNode>(Op)->getZExtValue();
+ unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap);
+ const TargetRegisterClass *TRC = MRI->getRegClass(SubReg);
+ const TargetRegisterClass *SRC =
+ TRI->getMatchingSuperRegClass(RC, TRC, SubIdx);
+ if (SRC && SRC != RC) {
+ MRI->setRegClass(NewVReg, SRC);
+ RC = SRC;
+ }
+ }
+ }
+ AddOperand(MIB, Op, i+1, &II, VRBaseMap, /*IsDebug=*/false,
+ IsClone, IsCloned);
+ }
+
+ MBB->insert(InsertPos, MIB);
+ SDValue Op(Node, 0);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
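+
+// Operand layout reminder (sketch, register names illustrative): after the
+// register-class id operand, a REG_SEQUENCE carries (value, subreg-index)
+// pairs, e.g.
+//   %dst = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
+// which is why the loop above insists on an odd total operand count.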
+
+/// EmitDbgValue - Generate machine instruction for a dbg_value node.
+///
+MachineInstr *
+InstrEmitter::EmitDbgValue(SDDbgValue *SD,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ DebugLoc DL = SD->getDebugLoc();
+ assert(cast<DILocalVariable>(SD->getVariable())
+ ->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+
+ SD->setIsEmitted();
+
+ assert(!SD->getLocationOps().empty() &&
+ "dbg_value with no location operands?");
+
+ if (SD->isInvalidated())
+ return EmitDbgNoLocation(SD);
+
+ // Attempt to produce a DBG_INSTR_REF if we've been asked to.
+ if (EmitDebugInstrRefs)
+ if (auto *InstrRef = EmitDbgInstrRef(SD, VRBaseMap))
+ return InstrRef;
+
+ // Emit variadic dbg_value nodes as DBG_VALUE_LIST if they have not been
+ // emitted as instruction references.
+ if (SD->isVariadic())
+ return EmitDbgValueList(SD, VRBaseMap);
+
+ // Emit single-location dbg_value nodes as DBG_VALUE if they have not been
+ // emitted as instruction references.
+ return EmitDbgValueFromSingleOp(SD, VRBaseMap);
+}
+
+MachineOperand GetMOForConstDbgOp(const SDDbgOperand &Op) {
+ const Value *V = Op.getConst();
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->getBitWidth() > 64)
+ return MachineOperand::CreateCImm(CI);
+ return MachineOperand::CreateImm(CI->getSExtValue());
+ }
+ if (const ConstantFP *CF = dyn_cast<ConstantFP>(V))
+ return MachineOperand::CreateFPImm(CF);
+ // Note: This assumes that all nullptr constants are zero-valued.
+ if (isa<ConstantPointerNull>(V))
+ return MachineOperand::CreateImm(0);
+ // Undef or unhandled value type, so return an undef operand.
+ return MachineOperand::CreateReg(
+ /* Reg */ 0U, /* isDef */ false, /* isImp */ false,
+ /* isKill */ false, /* isDead */ false,
+ /* isUndef */ false, /* isEarlyClobber */ false,
+ /* SubReg */ 0, /* isDebug */ true);
+}
+
+void InstrEmitter::AddDbgValueLocationOps(
+ MachineInstrBuilder &MIB, const MCInstrDesc &DbgValDesc,
+ ArrayRef<SDDbgOperand> LocationOps,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ for (const SDDbgOperand &Op : LocationOps) {
+ switch (Op.getKind()) {
+ case SDDbgOperand::FRAMEIX:
+ MIB.addFrameIndex(Op.getFrameIx());
+ break;
+ case SDDbgOperand::VREG:
+ MIB.addReg(Op.getVReg());
+ break;
+ case SDDbgOperand::SDNODE: {
+ SDValue V = SDValue(Op.getSDNode(), Op.getResNo());
+ // It's possible we replaced this SDNode with other(s) and therefore
+ // didn't generate code for it. It's better to catch these cases where
+ // they happen and transfer the debug info, but trying to guarantee that
+ // in all cases would be very fragile; this is a safeguard for any
+ // that were missed.
+ if (VRBaseMap.count(V) == 0)
+ MIB.addReg(0U); // undef
+ else
+ AddOperand(MIB, V, (*MIB).getNumOperands(), &DbgValDesc, VRBaseMap,
+ /*IsDebug=*/true, /*IsClone=*/false, /*IsCloned=*/false);
+ } break;
+ case SDDbgOperand::CONST:
+ MIB.add(GetMOForConstDbgOp(Op));
+ break;
+ }
+ }
+}
+
+MachineInstr *
+InstrEmitter::EmitDbgInstrRef(SDDbgValue *SD,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ MDNode *Var = SD->getVariable();
+ const DIExpression *Expr = (DIExpression *)SD->getExpression();
+ DebugLoc DL = SD->getDebugLoc();
+ const MCInstrDesc &RefII = TII->get(TargetOpcode::DBG_INSTR_REF);
+
+ // Returns true if the given operand is not a legal debug operand for a
+ // DBG_INSTR_REF.
+ auto IsInvalidOp = [](SDDbgOperand DbgOp) {
+ return DbgOp.getKind() == SDDbgOperand::FRAMEIX;
+ };
+ // Returns true if the given operand is not itself an instruction reference
+ // but is a legal debug operand for a DBG_INSTR_REF.
+ auto IsNonInstrRefOp = [](SDDbgOperand DbgOp) {
+ return DbgOp.getKind() == SDDbgOperand::CONST;
+ };
+
+ // If this variable location does not depend on any instructions or contains
+ // any stack locations, produce it as a standard debug value instead.
+ if (any_of(SD->getLocationOps(), IsInvalidOp) ||
+ all_of(SD->getLocationOps(), IsNonInstrRefOp)) {
+ if (SD->isVariadic())
+ return EmitDbgValueList(SD, VRBaseMap);
+ return EmitDbgValueFromSingleOp(SD, VRBaseMap);
+ }
+
+ // Immediately fold any indirectness from the LLVM-IR intrinsic into the
+ // expression:
+ if (SD->isIndirect())
+ Expr = DIExpression::append(Expr, dwarf::DW_OP_deref);
+ // If this is not already a variadic expression, it must be modified to become
+ // one.
+ if (!SD->isVariadic())
+ Expr = DIExpression::convertToVariadicExpression(Expr);
+
+ SmallVector<MachineOperand> MOs;
+
+ // It may not be immediately possible to identify the MachineInstr that
+ // defines a VReg, it can depend for example on the order blocks are
+ // emitted in. When this happens, or when further analysis is needed later,
+ // produce an instruction like this:
+ //
+ // DBG_INSTR_REF !123, !456, %0:gr64
+ //
+ // i.e., point the instruction at the vreg, and patch it up later in
+ // MachineFunction::finalizeDebugInstrRefs.
+ auto AddVRegOp = [&](unsigned VReg) {
+ MOs.push_back(MachineOperand::CreateReg(
+ /* Reg */ VReg, /* isDef */ false, /* isImp */ false,
+ /* isKill */ false, /* isDead */ false,
+ /* isUndef */ false, /* isEarlyClobber */ false,
+ /* SubReg */ 0, /* isDebug */ true));
+ };
+ unsigned OpCount = SD->getLocationOps().size();
+ for (unsigned OpIdx = 0; OpIdx < OpCount; ++OpIdx) {
+ SDDbgOperand DbgOperand = SD->getLocationOps()[OpIdx];
+
+ // Try to find both the defined register and the instruction defining it.
+ MachineInstr *DefMI = nullptr;
+ unsigned VReg;
+
+ if (DbgOperand.getKind() == SDDbgOperand::VREG) {
+ VReg = DbgOperand.getVReg();
+
+ // No definition means that block hasn't been emitted yet. Leave a vreg
+ // reference to be fixed later.
+ if (!MRI->hasOneDef(VReg)) {
+ AddVRegOp(VReg);
+ continue;
+ }
+
+ DefMI = &*MRI->def_instr_begin(VReg);
+ } else if (DbgOperand.getKind() == SDDbgOperand::SDNODE) {
+ // Look up the corresponding VReg for the given SDNode, if any.
+ SDNode *Node = DbgOperand.getSDNode();
+ SDValue Op = SDValue(Node, DbgOperand.getResNo());
+ DenseMap<SDValue, Register>::iterator I = VRBaseMap.find(Op);
+ // No VReg -> produce a DBG_VALUE $noreg instead.
+ if (I == VRBaseMap.end())
+ break;
+
+ // Try to pick out a defining instruction at this point.
+ VReg = getVR(Op, VRBaseMap);
+
+ // Again, if there's no instruction defining the VReg right now, fix it up
+ // later.
+ if (!MRI->hasOneDef(VReg)) {
+ AddVRegOp(VReg);
+ continue;
+ }
+
+ DefMI = &*MRI->def_instr_begin(VReg);
+ } else {
+ assert(DbgOperand.getKind() == SDDbgOperand::CONST);
+ MOs.push_back(GetMOForConstDbgOp(DbgOperand));
+ continue;
+ }
+
+    // Avoid copy-like instructions: they don't define values, only move them.
+ // Leave a virtual-register reference until it can be fixed up later, to
+ // find the underlying value definition.
+ if (DefMI->isCopyLike() || TII->isCopyInstr(*DefMI)) {
+ AddVRegOp(VReg);
+ continue;
+ }
+
+ // Find the operand number which defines the specified VReg.
+ unsigned OperandIdx = 0;
+ for (const auto &MO : DefMI->operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg() == VReg)
+ break;
+ ++OperandIdx;
+ }
+ assert(OperandIdx < DefMI->getNumOperands());
+
+ // Make the DBG_INSTR_REF refer to that instruction, and that operand.
+ unsigned InstrNum = DefMI->getDebugInstrNum();
+ MOs.push_back(MachineOperand::CreateDbgInstrRef(InstrNum, OperandIdx));
+ }
+
+ // If we haven't created a valid MachineOperand for every DbgOp, abort and
+ // produce an undef DBG_VALUE.
+ if (MOs.size() != OpCount)
+ return EmitDbgNoLocation(SD);
+
+ return BuildMI(*MF, DL, RefII, false, MOs, Var, Expr);
+}
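+
+// Resulting forms (sketch; exact MIR spelling may differ by version): a fully
+// resolved operand becomes an instruction reference such as
+//   DBG_INSTR_REF !var, !expr, dbg-instr-ref(12, 0)
+// while an unresolved operand is left as a plain vreg, e.g. %0:gr64, to be
+// patched up later in MachineFunction::finalizeDebugInstrRefs.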
+
+MachineInstr *InstrEmitter::EmitDbgNoLocation(SDDbgValue *SD) {
+ // An invalidated SDNode must generate an undef DBG_VALUE: although the
+  // original value is no longer computed, the live ranges of earlier
+  // DBG_VALUEs must not leak into later code.
+ DIVariable *Var = SD->getVariable();
+ const DIExpression *Expr =
+ DIExpression::convertToUndefExpression(SD->getExpression());
+ DebugLoc DL = SD->getDebugLoc();
+ const MCInstrDesc &Desc = TII->get(TargetOpcode::DBG_VALUE);
+ return BuildMI(*MF, DL, Desc, false, 0U, Var, Expr);
+}
+
+MachineInstr *
+InstrEmitter::EmitDbgValueList(SDDbgValue *SD,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ MDNode *Var = SD->getVariable();
+ DIExpression *Expr = SD->getExpression();
+ DebugLoc DL = SD->getDebugLoc();
+ // DBG_VALUE_LIST := "DBG_VALUE_LIST" var, expression, loc (, loc)*
+ const MCInstrDesc &DbgValDesc = TII->get(TargetOpcode::DBG_VALUE_LIST);
+ // Build the DBG_VALUE_LIST instruction base.
+ auto MIB = BuildMI(*MF, DL, DbgValDesc);
+ MIB.addMetadata(Var);
+ MIB.addMetadata(Expr);
+ AddDbgValueLocationOps(MIB, DbgValDesc, SD->getLocationOps(), VRBaseMap);
+ return &*MIB;
+}
+
+MachineInstr *
+InstrEmitter::EmitDbgValueFromSingleOp(SDDbgValue *SD,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ MDNode *Var = SD->getVariable();
+ DIExpression *Expr = SD->getExpression();
+ DebugLoc DL = SD->getDebugLoc();
+ const MCInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE);
+
+ assert(SD->getLocationOps().size() == 1 &&
+ "Non variadic dbg_value should have only one location op");
+
+ // See about constant-folding the expression.
+ // Copy the location operand in case we replace it.
+ SmallVector<SDDbgOperand, 1> LocationOps(1, SD->getLocationOps()[0]);
+ if (Expr && LocationOps[0].getKind() == SDDbgOperand::CONST) {
+ const Value *V = LocationOps[0].getConst();
+ if (auto *C = dyn_cast<ConstantInt>(V)) {
+ std::tie(Expr, C) = Expr->constantFold(C);
+ LocationOps[0] = SDDbgOperand::fromConst(C);
+ }
+ }
+
+ // Emit non-variadic dbg_value nodes as DBG_VALUE.
+ // DBG_VALUE := "DBG_VALUE" loc, isIndirect, var, expr
+ auto MIB = BuildMI(*MF, DL, II);
+ AddDbgValueLocationOps(MIB, II, LocationOps, VRBaseMap);
+
+ if (SD->isIndirect())
+ MIB.addImm(0U);
+ else
+ MIB.addReg(0U);
+
+ return MIB.addMetadata(Var).addMetadata(Expr);
+}
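+
+// Emitted shapes (sketch):
+//   DBG_VALUE %loc, $noreg, !var, !expr   (direct location)
+//   DBG_VALUE %loc, 0, !var, !expr        (indirect location)
+// where the second operand encodes the isIndirect flag handled just above.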
+
+MachineInstr *
+InstrEmitter::EmitDbgLabel(SDDbgLabel *SD) {
+ MDNode *Label = SD->getLabel();
+ DebugLoc DL = SD->getDebugLoc();
+ assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+
+ const MCInstrDesc &II = TII->get(TargetOpcode::DBG_LABEL);
+ MachineInstrBuilder MIB = BuildMI(*MF, DL, II);
+ MIB.addMetadata(Label);
+
+ return &*MIB;
+}
+
+/// EmitMachineNode - Generate machine code for a target-specific node and
+/// needed dependencies.
+///
+void InstrEmitter::
+EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ unsigned Opc = Node->getMachineOpcode();
+
+ // Handle subreg insert/extract specially
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ Opc == TargetOpcode::INSERT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG) {
+ EmitSubregNode(Node, VRBaseMap, IsClone, IsCloned);
+ return;
+ }
+
+ // Handle COPY_TO_REGCLASS specially.
+ if (Opc == TargetOpcode::COPY_TO_REGCLASS) {
+ EmitCopyToRegClassNode(Node, VRBaseMap);
+ return;
+ }
+
+ // Handle REG_SEQUENCE specially.
+ if (Opc == TargetOpcode::REG_SEQUENCE) {
+ EmitRegSequence(Node, VRBaseMap, IsClone, IsCloned);
+ return;
+ }
+
+ if (Opc == TargetOpcode::IMPLICIT_DEF)
+ // We want a unique VR for each IMPLICIT_DEF use.
+ return;
+
+ const MCInstrDesc &II = TII->get(Opc);
+ unsigned NumResults = CountResults(Node);
+ unsigned NumDefs = II.getNumDefs();
+ const MCPhysReg *ScratchRegs = nullptr;
+
+ // Handle STACKMAP and PATCHPOINT specially and then use the generic code.
+ if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+ // Stackmaps do not have arguments and do not preserve their calling
+ // convention. However, to simplify runtime support, they clobber the same
+ // scratch registers as AnyRegCC.
+ unsigned CC = CallingConv::AnyReg;
+ if (Opc == TargetOpcode::PATCHPOINT) {
+ CC = Node->getConstantOperandVal(PatchPointOpers::CCPos);
+ NumDefs = NumResults;
+ }
+ ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
+ } else if (Opc == TargetOpcode::STATEPOINT) {
+ NumDefs = NumResults;
+ }
+
+ unsigned NumImpUses = 0;
+ unsigned NodeOperands =
+ countOperands(Node, II.getNumOperands() - NumDefs, NumImpUses);
+ bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() &&
+ II.isVariadic() && II.variadicOpsAreDefs();
+ bool HasPhysRegOuts = NumResults > NumDefs && !II.implicit_defs().empty() &&
+ !HasVRegVariadicDefs;
+#ifndef NDEBUG
+ unsigned NumMIOperands = NodeOperands + NumResults;
+ if (II.isVariadic())
+ assert(NumMIOperands >= II.getNumOperands() &&
+ "Too few operands for a variadic node!");
+ else
+ assert(NumMIOperands >= II.getNumOperands() &&
+ NumMIOperands <=
+ II.getNumOperands() + II.implicit_defs().size() + NumImpUses &&
+ "#operands for dag node doesn't match .td file!");
+#endif
+
+ // Create the new machine instruction.
+ MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II);
+
+ // Add result register values for things that are defined by this
+ // instruction.
+ if (NumResults) {
+ CreateVirtualRegisters(Node, MIB, II, IsClone, IsCloned, VRBaseMap);
+
+ // Transfer any IR flags from the SDNode to the MachineInstr
+ MachineInstr *MI = MIB.getInstr();
+ const SDNodeFlags Flags = Node->getFlags();
+ if (Flags.hasNoSignedZeros())
+ MI->setFlag(MachineInstr::MIFlag::FmNsz);
+
+ if (Flags.hasAllowReciprocal())
+ MI->setFlag(MachineInstr::MIFlag::FmArcp);
+
+ if (Flags.hasNoNaNs())
+ MI->setFlag(MachineInstr::MIFlag::FmNoNans);
+
+ if (Flags.hasNoInfs())
+ MI->setFlag(MachineInstr::MIFlag::FmNoInfs);
+
+ if (Flags.hasAllowContract())
+ MI->setFlag(MachineInstr::MIFlag::FmContract);
+
+ if (Flags.hasApproximateFuncs())
+ MI->setFlag(MachineInstr::MIFlag::FmAfn);
+
+ if (Flags.hasAllowReassociation())
+ MI->setFlag(MachineInstr::MIFlag::FmReassoc);
+
+ if (Flags.hasNoUnsignedWrap())
+ MI->setFlag(MachineInstr::MIFlag::NoUWrap);
+
+ if (Flags.hasNoSignedWrap())
+ MI->setFlag(MachineInstr::MIFlag::NoSWrap);
+
+ if (Flags.hasExact())
+ MI->setFlag(MachineInstr::MIFlag::IsExact);
+
+ if (Flags.hasNoFPExcept())
+ MI->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
+ if (Flags.hasUnpredictable())
+ MI->setFlag(MachineInstr::MIFlag::Unpredictable);
+ }
+
+ // Emit all of the actual operands of this instruction, adding them to the
+ // instruction as appropriate.
+ bool HasOptPRefs = NumDefs > NumResults;
+ assert((!HasOptPRefs || !HasPhysRegOuts) &&
+ "Unable to cope with optional defs and phys regs defs!");
+ unsigned NumSkip = HasOptPRefs ? NumDefs - NumResults : 0;
+ for (unsigned i = NumSkip; i != NodeOperands; ++i)
+ AddOperand(MIB, Node->getOperand(i), i-NumSkip+NumDefs, &II,
+ VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);
+
+ // Add scratch registers as implicit def and early clobber
+ if (ScratchRegs)
+ for (unsigned i = 0; ScratchRegs[i]; ++i)
+ MIB.addReg(ScratchRegs[i], RegState::ImplicitDefine |
+ RegState::EarlyClobber);
+
+ // Set the memory reference descriptions of this instruction now that it is
+ // part of the function.
+ MIB.setMemRefs(cast<MachineSDNode>(Node)->memoperands());
+
+ // Set the CFI type.
+ MIB->setCFIType(*MF, Node->getCFIType());
+
+ // Insert the instruction into position in the block. This needs to
+ // happen before any custom inserter hook is called so that the
+ // hook knows where in the block to insert the replacement code.
+ MBB->insert(InsertPos, MIB);
+
+ // The MachineInstr may also define physregs instead of virtregs. These
+ // physreg values can reach other instructions in different ways:
+ //
+ // 1. When there is a use of a Node value beyond the explicitly defined
+ // virtual registers, we emit a CopyFromReg for one of the implicitly
+ // defined physregs. This only happens when HasPhysRegOuts is true.
+ //
+ // 2. A CopyFromReg reading a physreg may be glued to this instruction.
+ //
+ // 3. A glued instruction may implicitly use a physreg.
+ //
+ // 4. A glued instruction may use a RegisterSDNode operand.
+ //
+ // Collect all the used physreg defs, and make sure that any unused physreg
+ // defs are marked as dead.
+ SmallVector<Register, 8> UsedRegs;
+
+ // Additional results must be physical register defs.
+ if (HasPhysRegOuts) {
+ for (unsigned i = NumDefs; i < NumResults; ++i) {
+ Register Reg = II.implicit_defs()[i - NumDefs];
+ if (!Node->hasAnyUseOfValue(i))
+ continue;
+ // This implicitly defined physreg has a use.
+ UsedRegs.push_back(Reg);
+ EmitCopyFromReg(Node, i, IsClone, Reg, VRBaseMap);
+ }
+ }
+
+ // Scan the glue chain for any used physregs.
+ if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
+ for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
+ if (F->getOpcode() == ISD::CopyFromReg) {
+ UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+ continue;
+ } else if (F->getOpcode() == ISD::CopyToReg) {
+ // Skip CopyToReg nodes that are internal to the glue chain.
+ continue;
+ }
+ // Collect declared implicit uses.
+ const MCInstrDesc &MCID = TII->get(F->getMachineOpcode());
+ append_range(UsedRegs, MCID.implicit_uses());
+ // In addition to declared implicit uses, we must also check for
+ // direct RegisterSDNode operands.
+ for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
+ if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
+ Register Reg = R->getReg();
+ if (Reg.isPhysical())
+ UsedRegs.push_back(Reg);
+ }
+ }
+ }
+
+ // Add rounding control registers as implicit def for function call.
+ if (II.isCall() && MF->getFunction().hasFnAttribute(Attribute::StrictFP)) {
+ ArrayRef<MCPhysReg> RCRegs = TLI->getRoundingControlRegisters();
+ for (MCPhysReg Reg : RCRegs)
+ UsedRegs.push_back(Reg);
+ }
+
+ // Finally mark unused registers as dead.
+ if (!UsedRegs.empty() || !II.implicit_defs().empty() || II.hasOptionalDef())
+ MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
+
+  // STATEPOINT is too 'dynamic' to have a meaningful machine description.
+ // We have to manually tie operands.
+ if (Opc == TargetOpcode::STATEPOINT && NumDefs > 0) {
+ assert(!HasPhysRegOuts && "STATEPOINT mishandled");
+ MachineInstr *MI = MIB;
+ unsigned Def = 0;
+ int First = StatepointOpers(MI).getFirstGCPtrIdx();
+ assert(First > 0 && "Statepoint has Defs but no GC ptr list");
+ unsigned Use = (unsigned)First;
+ while (Def < NumDefs) {
+ if (MI->getOperand(Use).isReg())
+ MI->tieOperands(Def++, Use);
+ Use = StackMaps::getNextMetaArgIdx(MI, Use);
+ }
+ }
+
+ // Run post-isel target hook to adjust this instruction if needed.
+ if (II.hasPostISelHook())
+ TLI->AdjustInstrPostInstrSelection(*MIB, Node);
+}
+
+/// EmitSpecialNode - Generate machine code for a target-independent node and
+/// needed dependencies.
+void InstrEmitter::
+EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ switch (Node->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ Node->dump();
+#endif
+ llvm_unreachable("This target-independent node should have been selected!");
+ case ISD::EntryToken:
+ case ISD::MERGE_VALUES:
+ case ISD::TokenFactor: // fall thru
+ break;
+ case ISD::CopyToReg: {
+ Register DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ SDValue SrcVal = Node->getOperand(2);
+ if (DestReg.isVirtual() && SrcVal.isMachineOpcode() &&
+ SrcVal.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
+      // Instead of building a COPY to that vreg destination, build an
+      // IMPLICIT_DEF instruction.
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), DestReg);
+ break;
+ }
+ Register SrcReg;
+ if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(SrcVal))
+ SrcReg = R->getReg();
+ else
+ SrcReg = getVR(SrcVal, VRBaseMap);
+
+ if (SrcReg == DestReg) // Coalesced away the copy? Ignore.
+ break;
+
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ DestReg).addReg(SrcReg);
+ break;
+ }
+ case ISD::CopyFromReg: {
+ unsigned SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ EmitCopyFromReg(Node, 0, IsClone, SrcReg, VRBaseMap);
+ break;
+ }
+ case ISD::EH_LABEL:
+ case ISD::ANNOTATION_LABEL: {
+ unsigned Opc = (Node->getOpcode() == ISD::EH_LABEL)
+ ? TargetOpcode::EH_LABEL
+ : TargetOpcode::ANNOTATION_LABEL;
+ MCSymbol *S = cast<LabelSDNode>(Node)->getLabel();
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+ TII->get(Opc)).addSym(S);
+ break;
+ }
+
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END: {
+ unsigned TarOp = (Node->getOpcode() == ISD::LIFETIME_START)
+ ? TargetOpcode::LIFETIME_START
+ : TargetOpcode::LIFETIME_END;
+ auto *FI = cast<FrameIndexSDNode>(Node->getOperand(1));
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TarOp))
+ .addFrameIndex(FI->getIndex());
+ break;
+ }
+
+ case ISD::PSEUDO_PROBE: {
+ unsigned TarOp = TargetOpcode::PSEUDO_PROBE;
+ auto Guid = cast<PseudoProbeSDNode>(Node)->getGuid();
+ auto Index = cast<PseudoProbeSDNode>(Node)->getIndex();
+ auto Attr = cast<PseudoProbeSDNode>(Node)->getAttributes();
+
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TarOp))
+ .addImm(Guid)
+ .addImm(Index)
+ .addImm((uint8_t)PseudoProbeType::Block)
+ .addImm(Attr);
+ break;
+ }
+
+ case ISD::INLINEASM:
+ case ISD::INLINEASM_BR: {
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+ --NumOps; // Ignore the glue operand.
+
+ // Create the inline asm machine instruction.
+ unsigned TgtOpc = Node->getOpcode() == ISD::INLINEASM_BR
+ ? TargetOpcode::INLINEASM_BR
+ : TargetOpcode::INLINEASM;
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, Node->getDebugLoc(), TII->get(TgtOpc));
+
+ // Add the asm string as an external symbol operand.
+ SDValue AsmStrV = Node->getOperand(InlineAsm::Op_AsmString);
+ const char *AsmStr = cast<ExternalSymbolSDNode>(AsmStrV)->getSymbol();
+ MIB.addExternalSymbol(AsmStr);
+
+ // Add the HasSideEffect, isAlignStack, AsmDialect, MayLoad and MayStore
+ // bits.
+ int64_t ExtraInfo =
+ cast<ConstantSDNode>(Node->getOperand(InlineAsm::Op_ExtraInfo))->
+ getZExtValue();
+ MIB.addImm(ExtraInfo);
+
+    // Remember the operand index of the group flags.
+ SmallVector<unsigned, 8> GroupIdx;
+
+ // Remember registers that are part of early-clobber defs.
+ SmallVector<unsigned, 8> ECRegs;
+
+ // Add all of the operand registers to the instruction.
+ for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+ unsigned Flags =
+ cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ const unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+ GroupIdx.push_back(MIB->getNumOperands());
+ MIB.addImm(Flags);
+ ++i; // Skip the ID value.
+
+ switch (InlineAsm::getKind(Flags)) {
+ default: llvm_unreachable("Bad flags!");
+ case InlineAsm::Kind_RegDef:
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
+ Register Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ // FIXME: Add dead flags for physical and virtual registers defined.
+ // For now, mark physical register defs as implicit to help fast
+ // regalloc. This makes inline asm look a lot like calls.
+ MIB.addReg(Reg, RegState::Define | getImplRegState(Reg.isPhysical()));
+ }
+ break;
+ case InlineAsm::Kind_RegDefEarlyClobber:
+ case InlineAsm::Kind_Clobber:
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
+ Register Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ MIB.addReg(Reg, RegState::Define | RegState::EarlyClobber |
+ getImplRegState(Reg.isPhysical()));
+ ECRegs.push_back(Reg);
+ }
+ break;
+ case InlineAsm::Kind_RegUse: // Use of register.
+ case InlineAsm::Kind_Imm: // Immediate.
+ case InlineAsm::Kind_Mem: // Non-function addressing mode.
+ // The addressing mode has been selected, just add all of the
+ // operands to the machine instruction.
+ for (unsigned j = 0; j != NumVals; ++j, ++i)
+ AddOperand(MIB, Node->getOperand(i), 0, nullptr, VRBaseMap,
+ /*IsDebug=*/false, IsClone, IsCloned);
+
+ // Manually set isTied bits.
+ if (InlineAsm::getKind(Flags) == InlineAsm::Kind_RegUse) {
+ unsigned DefGroup = 0;
+ if (InlineAsm::isUseOperandTiedToDef(Flags, DefGroup)) {
+ unsigned DefIdx = GroupIdx[DefGroup] + 1;
+ unsigned UseIdx = GroupIdx.back() + 1;
+ for (unsigned j = 0; j != NumVals; ++j)
+ MIB->tieOperands(DefIdx + j, UseIdx + j);
+ }
+ }
+ break;
+ case InlineAsm::Kind_Func: // Function addressing mode.
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
+ SDValue Op = Node->getOperand(i);
+ AddOperand(MIB, Op, 0, nullptr, VRBaseMap,
+ /*IsDebug=*/false, IsClone, IsCloned);
+
+ // Adjust Target Flags for function reference.
+ if (auto *TGA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ unsigned NewFlags =
+ MF->getSubtarget().classifyGlobalFunctionReference(
+ TGA->getGlobal());
+ unsigned LastIdx = MIB.getInstr()->getNumOperands() - 1;
+ MIB.getInstr()->getOperand(LastIdx).setTargetFlags(NewFlags);
+ }
+ }
+ }
+ }
+
+ // GCC inline assembly allows input operands to also be early-clobber
+ // output operands (so long as the operand is written only after it's
+ // used), but this does not match the semantics of our early-clobber flag.
+ // If an early-clobber operand register is also an input operand register,
+ // then remove the early-clobber flag.
+ for (unsigned Reg : ECRegs) {
+ if (MIB->readsRegister(Reg, TRI)) {
+ MachineOperand *MO =
+ MIB->findRegisterDefOperand(Reg, false, false, TRI);
+ assert(MO && "No def operand for clobbered register?");
+ MO->setIsEarlyClobber(false);
+ }
+ }
+
+ // Get the mdnode from the asm if it exists and add it to the instruction.
+ SDValue MDV = Node->getOperand(InlineAsm::Op_MDNode);
+ const MDNode *MD = cast<MDNodeSDNode>(MDV)->getMD();
+ if (MD)
+ MIB.addMetadata(MD);
+
+ MBB->insert(InsertPos, MIB);
+ break;
+ }
+ }
+}
+
+/// InstrEmitter - Construct an InstrEmitter and set it to start inserting
+/// at the given position in the given block.
+InstrEmitter::InstrEmitter(const TargetMachine &TM, MachineBasicBlock *mbb,
+ MachineBasicBlock::iterator insertpos)
+ : MF(mbb->getParent()), MRI(&MF->getRegInfo()),
+ TII(MF->getSubtarget().getInstrInfo()),
+ TRI(MF->getSubtarget().getRegisterInfo()),
+ TLI(MF->getSubtarget().getTargetLowering()), MBB(mbb),
+ InsertPos(insertpos) {
+ EmitDebugInstrRefs = mbb->getParent()->useDebugInstrRef();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
new file mode 100644
index 000000000000..959bce31c8b2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -0,0 +1,169 @@
+//===- InstrEmitter.h - Emit MachineInstrs for the SelectionDAG -*- C++ -*--==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This declares the Emit routines for the SelectionDAG class, which creates
+// MachineInstrs based on the decisions of the SelectionDAG instruction
+// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_INSTREMITTER_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_INSTREMITTER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+class MachineInstrBuilder;
+class MCInstrDesc;
+class SDDbgLabel;
+class SDDbgValue;
+class SDDbgOperand;
+class TargetLowering;
+class TargetMachine;
+
+class LLVM_LIBRARY_VISIBILITY InstrEmitter {
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const TargetLowering *TLI;
+
+ MachineBasicBlock *MBB;
+ MachineBasicBlock::iterator InsertPos;
+
+ /// Should we try to produce DBG_INSTR_REF instructions?
+ bool EmitDebugInstrRefs;
+
+  /// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
+ /// implicit physical register output.
+ void EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone,
+ Register SrcReg, DenseMap<SDValue, Register> &VRBaseMap);
+
+ void CreateVirtualRegisters(SDNode *Node,
+ MachineInstrBuilder &MIB,
+ const MCInstrDesc &II,
+ bool IsClone, bool IsCloned,
+ DenseMap<SDValue, Register> &VRBaseMap);
+
+ /// getVR - Return the virtual register corresponding to the specified result
+ /// of the specified node.
+ Register getVR(SDValue Op,
+ DenseMap<SDValue, Register> &VRBaseMap);
+
+ /// AddRegisterOperand - Add the specified register as an operand to the
+ /// specified machine instr. Insert register copies if the register is
+ /// not in the required register class.
+ void AddRegisterOperand(MachineInstrBuilder &MIB,
+ SDValue Op,
+ unsigned IIOpNum,
+ const MCInstrDesc *II,
+ DenseMap<SDValue, Register> &VRBaseMap,
+ bool IsDebug, bool IsClone, bool IsCloned);
+
+ /// AddOperand - Add the specified operand to the specified machine instr. II
+ /// specifies the instruction information for the node, and IIOpNum is the
+ /// operand number (in the II) that we are adding. IIOpNum and II are used for
+ /// assertions only.
+ void AddOperand(MachineInstrBuilder &MIB,
+ SDValue Op,
+ unsigned IIOpNum,
+ const MCInstrDesc *II,
+ DenseMap<SDValue, Register> &VRBaseMap,
+ bool IsDebug, bool IsClone, bool IsCloned);
+
+ /// ConstrainForSubReg - Try to constrain VReg to a register class that
+ /// supports SubIdx sub-registers. Emit a copy if that isn't possible.
+ /// Return the virtual register to use.
+ Register ConstrainForSubReg(Register VReg, unsigned SubIdx, MVT VT,
+ bool isDivergent, const DebugLoc &DL);
+
+ /// EmitSubregNode - Generate machine code for subreg nodes.
+ ///
+ void EmitSubregNode(SDNode *Node, DenseMap<SDValue, Register> &VRBaseMap,
+ bool IsClone, bool IsCloned);
+
+ /// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+ /// COPY_TO_REGCLASS is just a normal copy, except that the destination
+ /// register is constrained to be in a particular register class.
+ ///
+ void EmitCopyToRegClassNode(SDNode *Node,
+ DenseMap<SDValue, Register> &VRBaseMap);
+
+ /// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
+ ///
+ void EmitRegSequence(SDNode *Node, DenseMap<SDValue, Register> &VRBaseMap,
+ bool IsClone, bool IsCloned);
+public:
+ /// CountResults - The results of target nodes have register or immediate
+ /// operands first, then an optional chain, and optional flag operands
+ /// (which do not go into the machine instrs.)
+ static unsigned CountResults(SDNode *Node);
+
+ void AddDbgValueLocationOps(MachineInstrBuilder &MIB,
+ const MCInstrDesc &DbgValDesc,
+ ArrayRef<SDDbgOperand> Locations,
+ DenseMap<SDValue, Register> &VRBaseMap);
+
+ /// EmitDbgValue - Generate machine instruction for a dbg_value node.
+ ///
+ MachineInstr *EmitDbgValue(SDDbgValue *SD,
+ DenseMap<SDValue, Register> &VRBaseMap);
+
+ /// Emit a dbg_value as a DBG_INSTR_REF. May produce DBG_VALUE $noreg instead
+ /// if there is no variable location; alternatively a half-formed DBG_INSTR_REF
+ /// that refers to a virtual register and is corrected later in isel.
+ MachineInstr *EmitDbgInstrRef(SDDbgValue *SD,
+ DenseMap<SDValue, Register> &VRBaseMap);
+
+ /// Emit a DBG_VALUE $noreg, indicating a variable has no location.
+ MachineInstr *EmitDbgNoLocation(SDDbgValue *SD);
+
+ /// Emit a DBG_VALUE_LIST from the operands to SDDbgValue.
+ MachineInstr *EmitDbgValueList(SDDbgValue *SD,
+ DenseMap<SDValue, Register> &VRBaseMap);
+
+ /// Emit a DBG_VALUE from the operands to SDDbgValue.
+ MachineInstr *EmitDbgValueFromSingleOp(SDDbgValue *SD,
+ DenseMap<SDValue, Register> &VRBaseMap);
+
+ /// Generate machine instruction for a dbg_label node.
+ MachineInstr *EmitDbgLabel(SDDbgLabel *SD);
+
+ /// EmitNode - Generate machine code for a node and needed dependencies.
+ ///
+ void EmitNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, Register> &VRBaseMap) {
+ if (Node->isMachineOpcode())
+ EmitMachineNode(Node, IsClone, IsCloned, VRBaseMap);
+ else
+ EmitSpecialNode(Node, IsClone, IsCloned, VRBaseMap);
+ }
+
+ /// getBlock - Return the current basic block.
+ MachineBasicBlock *getBlock() { return MBB; }
+
+ /// getInsertPos - Return the current insertion position.
+ MachineBasicBlock::iterator getInsertPos() { return InsertPos; }
+
+ /// InstrEmitter - Construct an InstrEmitter and set it to start inserting
+ /// at the given position in the given block.
+ InstrEmitter(const TargetMachine &TM, MachineBasicBlock *mbb,
+ MachineBasicBlock::iterator insertpos);
+
+private:
+ void EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, Register> &VRBaseMap);
+ void EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, Register> &VRBaseMap);
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
new file mode 100644
index 000000000000..61fc31715d71
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -0,0 +1,5533 @@
+//===- LegalizeDAG.cpp - Implement SelectionDAG::Legalize -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::Legalize method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "legalizedag"
+
+namespace {
+
+/// Keeps track of state when getting the sign of a floating-point value as an
+/// integer.
+struct FloatSignAsInt {
+ EVT FloatVT;
+ SDValue Chain;
+ SDValue FloatPtr;
+ SDValue IntPtr;
+ MachinePointerInfo IntPointerInfo;
+ MachinePointerInfo FloatPointerInfo;
+ SDValue IntValue;
+ APInt SignMask;
+ uint8_t SignBit;
+};
+
+//===----------------------------------------------------------------------===//
+/// This takes an arbitrary SelectionDAG as input and
+/// hacks on it until the target machine can handle it. This involves
+/// eliminating value sizes the machine cannot handle (promoting small sizes to
+/// large sizes or splitting up large values into small values) as well as
+/// eliminating operations the machine cannot handle.
+///
+/// This code also does a small amount of optimization and recognition of idioms
+/// as part of its processing. For example, if a target does not support a
+/// 'setcc' instruction efficiently, but does support the 'brcc' instruction,
+/// this will attempt to merge the setcc and branch instructions into brcc's.
+class SelectionDAGLegalize {
+ const TargetMachine &TM;
+ const TargetLowering &TLI;
+ SelectionDAG &DAG;
+
+ /// The set of nodes which have already been legalized. We hold a
+ /// reference to it in order to update as necessary on node deletion.
+ SmallPtrSetImpl<SDNode *> &LegalizedNodes;
+
+ /// A set of all the nodes updated during legalization.
+ SmallSetVector<SDNode *, 16> *UpdatedNodes;
+
+ EVT getSetCCResultType(EVT VT) const {
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ }
+
+ // Libcall insertion helpers.
+
+public:
+ SelectionDAGLegalize(SelectionDAG &DAG,
+ SmallPtrSetImpl<SDNode *> &LegalizedNodes,
+ SmallSetVector<SDNode *, 16> *UpdatedNodes = nullptr)
+ : TM(DAG.getTarget()), TLI(DAG.getTargetLoweringInfo()), DAG(DAG),
+ LegalizedNodes(LegalizedNodes), UpdatedNodes(UpdatedNodes) {}
+
+ /// Legalizes the given operation.
+ void LegalizeOp(SDNode *Node);
+
+private:
+ SDValue OptimizeFloatStore(StoreSDNode *ST);
+
+ void LegalizeLoadOps(SDNode *Node);
+ void LegalizeStoreOps(SDNode *Node);
+
+ /// Some targets cannot handle a variable
+ /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it
+ /// is necessary to spill the vector being inserted into to memory, perform
+ /// the insert there, and then read the result back.
+ SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
+ const SDLoc &dl);
+ SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx,
+ const SDLoc &dl);
+
+ /// Return a vector shuffle operation which
+/// performs the same shuffle in terms of order or result bytes, but on a type
+ /// whose vector element type is narrower than the original shuffle type.
+ /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+ SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, const SDLoc &dl,
+ SDValue N1, SDValue N2,
+ ArrayRef<int> Mask) const;
+
+ std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+ TargetLowering::ArgListTy &&Args, bool isSigned);
+ std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
+
+ void ExpandFrexpLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandFPLibCall(SDNode *Node, RTLIB::Libcall LC,
+ SmallVectorImpl<SDValue> &Results);
+ void ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128,
+ SmallVectorImpl<SDValue> &Results);
+ SDValue ExpandIntLibCall(SDNode *Node, bool isSigned,
+ RTLIB::Libcall Call_I8,
+ RTLIB::Libcall Call_I16,
+ RTLIB::Libcall Call_I32,
+ RTLIB::Libcall Call_I64,
+ RTLIB::Libcall Call_I128);
+ void ExpandArgFPLibCall(SDNode *Node,
+ RTLIB::Libcall Call_F32, RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128,
+ SmallVectorImpl<SDValue> &Results);
+ void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+ SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
+ const SDLoc &dl);
+ SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
+ const SDLoc &dl, SDValue ChainIn);
+ SDValue ExpandBUILD_VECTOR(SDNode *Node);
+ SDValue ExpandSPLAT_VECTOR(SDNode *Node);
+ SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
+ void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results);
+ void getSignAsIntValue(FloatSignAsInt &State, const SDLoc &DL,
+ SDValue Value) const;
+ SDValue modifySignAsInt(const FloatSignAsInt &State, const SDLoc &DL,
+ SDValue NewIntValue) const;
+ SDValue ExpandFCOPYSIGN(SDNode *Node) const;
+ SDValue ExpandFABS(SDNode *Node) const;
+ SDValue ExpandFNEG(SDNode *Node) const;
+ SDValue expandLdexp(SDNode *Node) const;
+ SDValue expandFrexp(SDNode *Node) const;
+
+ SDValue ExpandLegalINT_TO_FP(SDNode *Node, SDValue &Chain);
+ void PromoteLegalINT_TO_FP(SDNode *N, const SDLoc &dl,
+ SmallVectorImpl<SDValue> &Results);
+ void PromoteLegalFP_TO_INT(SDNode *N, const SDLoc &dl,
+ SmallVectorImpl<SDValue> &Results);
+ SDValue PromoteLegalFP_TO_INT_SAT(SDNode *Node, const SDLoc &dl);
+
+ SDValue ExpandPARITY(SDValue Op, const SDLoc &dl);
+
+ SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
+ SDValue ExpandInsertToVectorThroughStack(SDValue Op);
+ SDValue ExpandVectorBuildThroughStack(SDNode* Node);
+
+ SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
+ SDValue ExpandConstant(ConstantSDNode *CP);
+
+ // if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall
+ bool ExpandNode(SDNode *Node);
+ void ConvertNodeToLibcall(SDNode *Node);
+ void PromoteNode(SDNode *Node);
+
+public:
+ // Node replacement helpers
+
+ void ReplacedNode(SDNode *N) {
+ LegalizedNodes.erase(N);
+ if (UpdatedNodes)
+ UpdatedNodes->insert(N);
+ }
+
+ void ReplaceNode(SDNode *Old, SDNode *New) {
+ LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+ dbgs() << " with: "; New->dump(&DAG));
+
+ assert(Old->getNumValues() == New->getNumValues() &&
+ "Replacing one node with another that produces a different number "
+ "of values!");
+ DAG.ReplaceAllUsesWith(Old, New);
+ if (UpdatedNodes)
+ UpdatedNodes->insert(New);
+ ReplacedNode(Old);
+ }
+
+ void ReplaceNode(SDValue Old, SDValue New) {
+ LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+ dbgs() << " with: "; New->dump(&DAG));
+
+ DAG.ReplaceAllUsesWith(Old, New);
+ if (UpdatedNodes)
+ UpdatedNodes->insert(New.getNode());
+ ReplacedNode(Old.getNode());
+ }
+
+ void ReplaceNode(SDNode *Old, const SDValue *New) {
+ LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG));
+
+ DAG.ReplaceAllUsesWith(Old, New);
+ for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
+ LLVM_DEBUG(dbgs() << (i == 0 ? " with: " : " and: ");
+ New[i]->dump(&DAG));
+ if (UpdatedNodes)
+ UpdatedNodes->insert(New[i].getNode());
+ }
+ ReplacedNode(Old);
+ }
+
+ void ReplaceNodeWithValue(SDValue Old, SDValue New) {
+ LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+ dbgs() << " with: "; New->dump(&DAG));
+
+ DAG.ReplaceAllUsesOfValueWith(Old, New);
+ if (UpdatedNodes)
+ UpdatedNodes->insert(New.getNode());
+ ReplacedNode(Old.getNode());
+ }
+};
+
+} // end anonymous namespace
+
+/// Return a vector shuffle operation which
+/// performs the same shuffle in terms of order or result bytes, but on a type
+/// whose vector element type is narrower than the original shuffle type.
+/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+SDValue SelectionDAGLegalize::ShuffleWithNarrowerEltType(
+ EVT NVT, EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
+ ArrayRef<int> Mask) const {
+ unsigned NumMaskElts = VT.getVectorNumElements();
+ unsigned NumDestElts = NVT.getVectorNumElements();
+ unsigned NumEltsGrowth = NumDestElts / NumMaskElts;
+
+ assert(NumEltsGrowth && "Cannot promote to vector type with fewer elts!");
+
+ if (NumEltsGrowth == 1)
+ return DAG.getVectorShuffle(NVT, dl, N1, N2, Mask);
+
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int Idx = Mask[i];
+ for (unsigned j = 0; j != NumEltsGrowth; ++j) {
+ if (Idx < 0)
+ NewMask.push_back(-1);
+ else
+ NewMask.push_back(Idx * NumEltsGrowth + j);
+ }
+ }
+ assert(NewMask.size() == NumDestElts && "Non-integer NumEltsGrowth?");
+ assert(TLI.isShuffleMaskLegal(NewMask, NVT) && "Shuffle not legal?");
+ return DAG.getVectorShuffle(NVT, dl, N1, N2, NewMask);
+}
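A minimal standalone sketch of the mask-widening step above, using plain C++ containers and example values rather than LLVM types (the growth factor and mask below are assumptions for illustration only):

    // Each original mask index Idx expands into NumEltsGrowth consecutive
    // indices Idx*NumEltsGrowth .. Idx*NumEltsGrowth+NumEltsGrowth-1;
    // an undef sentinel of -1 stays -1.
    #include <cassert>
    #include <cstdio>
    #include <vector>

    std::vector<int> widenShuffleMask(const std::vector<int> &Mask,
                                      unsigned NumEltsGrowth) {
      std::vector<int> NewMask;
      for (int Idx : Mask)
        for (unsigned j = 0; j != NumEltsGrowth; ++j)
          NewMask.push_back(Idx < 0 ? -1 : Idx * (int)NumEltsGrowth + (int)j);
      return NewMask;
    }

    int main() {
      // v4i32 mask <0,1,0,1> promoted to v8i16: growth factor 2.
      std::vector<int> NewMask = widenShuffleMask({0, 1, 0, 1}, 2);
      std::vector<int> Expected = {0, 1, 2, 3, 0, 1, 2, 3};
      assert(NewMask == Expected);
      for (int Idx : NewMask)
        std::printf("%d ", Idx); // prints: 0 1 2 3 0 1 2 3
      std::printf("\n");
    }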
+
+/// Expands the ConstantFP node to an integer constant or
+/// a load from the constant pool.
+SDValue
+SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
+ bool Extend = false;
+ SDLoc dl(CFP);
+
+ // If a FP immediate is precise when represented as a float and if the
+ // target can do an extending load from float to double, we put it into
+ // the constant pool as a float, even if it is statically typed as a
+ // double. This shrinks FP constants and canonicalizes them for targets where
+ // an FP extending load is the same cost as a normal load (such as on the x87
+ // fp stack or PPC FP unit).
+ EVT VT = CFP->getValueType(0);
+ ConstantFP *LLVMC = const_cast<ConstantFP*>(CFP->getConstantFPValue());
+ if (!UseCP) {
+ assert((VT == MVT::f64 || VT == MVT::f32) && "Invalid type expansion");
+ return DAG.getConstant(LLVMC->getValueAPF().bitcastToAPInt(), dl,
+ (VT == MVT::f64) ? MVT::i64 : MVT::i32);
+ }
+
+ APFloat APF = CFP->getValueAPF();
+ EVT OrigVT = VT;
+ EVT SVT = VT;
+
+ // We don't want to shrink SNaNs. Converting the SNaN back to its real type
+ // can cause it to be changed into a QNaN on some platforms (e.g. on SystemZ).
+ if (!APF.isSignaling()) {
+ while (SVT != MVT::f32 && SVT != MVT::f16 && SVT != MVT::bf16) {
+ SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
+ if (ConstantFPSDNode::isValueValidForType(SVT, APF) &&
+ // Only do this if the target has a native EXTLOAD instruction from
+ // smaller type.
+ TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) &&
+ TLI.ShouldShrinkFPConstant(OrigVT)) {
+ Type *SType = SVT.getTypeForEVT(*DAG.getContext());
+ LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
+ VT = SVT;
+ Extend = true;
+ }
+ }
+ }
+
+ SDValue CPIdx =
+ DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout()));
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
+ if (Extend) {
+ SDValue Result = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), VT,
+ Alignment);
+ return Result;
+ }
+ SDValue Result = DAG.getLoad(
+ OrigVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+ return Result;
+}
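A rough standalone illustration of the shrink-to-float idea above: a double constant that round-trips exactly through float (and is not a signaling NaN) could be pooled as f32 and widened again with an extending load. Plain C++ only; the target-legality queries (isLoadExtLegal, ShouldShrinkFPConstant) are deliberately left out of this sketch:

    #include <cmath>
    #include <cstdio>

    // True if D could be stored as a float without changing its value.
    // The real code only refuses signaling NaNs; this sketch skips all NaNs.
    bool fitsInFloat(double D) {
      if (std::isnan(D))
        return false;
      float F = static_cast<float>(D);
      return static_cast<double>(F) == D;
    }

    int main() {
      std::printf("1.5 -> %s\n", fitsInFloat(1.5) ? "shrinkable" : "keep f64");
      std::printf("0.1 -> %s\n", fitsInFloat(0.1) ? "shrinkable" : "keep f64");
      // 1.5 is exact in f32; 0.1 is not, so it stays an f64 constant.
    }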
+
+/// Expands the Constant node to a load from the constant pool.
+SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) {
+ SDLoc dl(CP);
+ EVT VT = CP->getValueType(0);
+ SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(),
+ TLI.getPointerTy(DAG.getDataLayout()));
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
+ SDValue Result = DAG.getLoad(
+ VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+ return Result;
+}
+
+/// Some targets cannot handle a variable insertion index for the
+/// INSERT_VECTOR_ELT instruction. In this case, it
+/// is necessary to spill the vector being inserted into to memory, perform
+/// the insert there, and then read the result back.
+SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec,
+ SDValue Val,
+ SDValue Idx,
+ const SDLoc &dl) {
+ SDValue Tmp1 = Vec;
+ SDValue Tmp2 = Val;
+ SDValue Tmp3 = Idx;
+
+ // If the target doesn't support this, we have to spill the input vector
+ // to a temporary stack slot, update the element, then reload it. This is
+ // badness. We could also load the value into a vector register (either
+ // with a "move to register" or an "extload into register" instruction),
+ // then permute it into place, if the idx is a constant and is supported
+ // by the target.
+ EVT VT = Tmp1.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ SDValue StackPtr = DAG.CreateStackTemporary(VT);
+
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+
+ // Store the vector.
+ SDValue Ch = DAG.getStore(
+ DAG.getEntryNode(), dl, Tmp1, StackPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI));
+
+ SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, Tmp3);
+
+ // Store the scalar value.
+ Ch = DAG.getTruncStore(
+ Ch, dl, Tmp2, StackPtr2,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
+ // Load the updated vector.
+ return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), SPFI));
+}
+
+SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
+ SDValue Idx,
+ const SDLoc &dl) {
+ if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
+ // SCALAR_TO_VECTOR requires that the type of the value being inserted
+ // match the element type of the vector being created, except for
+ // integers in which case the inserted value can be over width.
+ EVT EltVT = Vec.getValueType().getVectorElementType();
+ if (Val.getValueType() == EltVT ||
+ (EltVT.isInteger() && Val.getValueType().bitsGE(EltVT))) {
+ SDValue ScVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ Vec.getValueType(), Val);
+
+ unsigned NumElts = Vec.getValueType().getVectorNumElements();
+ // We generate a shuffle of InVec and ScVec, so the shuffle mask
+ // should be 0,1,2,3,4,5... with the appropriate element replaced with
+ // elt 0 of the RHS.
+ SmallVector<int, 8> ShufOps;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShufOps.push_back(i != InsertPos->getZExtValue() ? i : NumElts);
+
+ return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec, ShufOps);
+ }
+ }
+ return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);
+}
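For the constant-index path above, the shuffle mask keeps lane i of the original vector everywhere except the insertion position, which takes lane 0 of the scalar-to-vector operand, i.e. index NumElts in the two-vector mask space. A small plain-C++ sketch with assumed sizes:

    #include <cstdio>
    #include <vector>

    // Build the mask used by the INSERT_VECTOR_ELT -> shuffle expansion.
    std::vector<int> insertEltMask(unsigned NumElts, unsigned InsertPos) {
      std::vector<int> Mask;
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i != InsertPos ? (int)i : (int)NumElts);
      return Mask;
    }

    int main() {
      // Inserting into lane 2 of a 4-element vector gives mask <0,1,4,3>.
      for (int Idx : insertEltMask(4, 2))
        std::printf("%d ", Idx);
      std::printf("\n");
    }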
+
+SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
+ if (!ISD::isNormalStore(ST))
+ return SDValue();
+
+ LLVM_DEBUG(dbgs() << "Optimizing float store operations\n");
+ // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
+ // FIXME: move this to the DAG Combiner! Note that we can't regress due
+ // to phase ordering between legalized code and the dag combiner. This
+ // probably means that we need to integrate dag combiner and legalizer
+ // together.
+ // We generally can't do this one for long doubles.
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ SDValue Value = ST->getValue();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+ SDLoc dl(ST);
+
+ // Don't optimise TargetConstantFP
+ if (Value.getOpcode() == ISD::TargetConstantFP)
+ return SDValue();
+
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Value)) {
+ if (CFP->getValueType(0) == MVT::f32 &&
+ TLI.isTypeLegal(MVT::i32)) {
+ SDValue Con = DAG.getConstant(CFP->getValueAPF().
+ bitcastToAPInt().zextOrTrunc(32),
+ SDLoc(CFP), MVT::i32);
+ return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ }
+
+ if (CFP->getValueType(0) == MVT::f64) {
+ // If this target supports 64-bit registers, do a single 64-bit store.
+ if (TLI.isTypeLegal(MVT::i64)) {
+ SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+ zextOrTrunc(64), SDLoc(CFP), MVT::i64);
+ return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ }
+
+ if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
+ // Otherwise, if the target supports 32-bit registers, use 2 32-bit
+ // stores. If the target supports neither 32- nor 64-bits, this
+ // xform is certainly not worth it.
+ const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt();
+ SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32);
+ SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(4), dl);
+ Hi = DAG.getStore(Chain, dl, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(4),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ }
+ }
+ }
+ return SDValue();
+}
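A standalone sketch of the rewrite above: the FP constant is re-expressed as its integer bit pattern, so 'store float 1.0' becomes a store of i32 0x3F800000, and an f64 constant becomes one i64 store or two i32 stores (low/high placement depending on endianness). Plain C++, with little-endian layout assumed in the comments:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      float F = 1.0f;
      uint32_t Bits32;
      std::memcpy(&Bits32, &F, sizeof(Bits32));
      std::printf("store float 1.0  -> store i32 0x%08X\n", (unsigned)Bits32);

      double D = 1.0;
      uint64_t Bits64;
      std::memcpy(&Bits64, &D, sizeof(Bits64));
      uint32_t Lo = (uint32_t)Bits64;         // stored at +0 on little-endian
      uint32_t Hi = (uint32_t)(Bits64 >> 32); // stored at +4 on little-endian
      std::printf("store double 1.0 -> store i32 0x%08X, i32 0x%08X\n",
                  (unsigned)Lo, (unsigned)Hi);
    }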
+
+void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
+ StoreSDNode *ST = cast<StoreSDNode>(Node);
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ SDLoc dl(Node);
+
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+
+ if (!ST->isTruncatingStore()) {
+ LLVM_DEBUG(dbgs() << "Legalizing store operation\n");
+ if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
+ ReplaceNode(ST, OptStore);
+ return;
+ }
+
+ SDValue Value = ST->getValue();
+ MVT VT = Value.getSimpleValueType();
+ switch (TLI.getOperationAction(ISD::STORE, VT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal: {
+ // If this is an unaligned store and the target doesn't support it,
+ // expand it.
+ EVT MemVT = ST->getMemoryVT();
+ const DataLayout &DL = DAG.getDataLayout();
+ if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT,
+ *ST->getMemOperand())) {
+ LLVM_DEBUG(dbgs() << "Expanding unsupported unaligned store\n");
+ SDValue Result = TLI.expandUnalignedStore(ST, DAG);
+ ReplaceNode(SDValue(ST, 0), Result);
+ } else
+ LLVM_DEBUG(dbgs() << "Legal store\n");
+ break;
+ }
+ case TargetLowering::Custom: {
+ LLVM_DEBUG(dbgs() << "Trying custom lowering\n");
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+ if (Res && Res != SDValue(Node, 0))
+ ReplaceNode(SDValue(Node, 0), Res);
+ return;
+ }
+ case TargetLowering::Promote: {
+ MVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
+ assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
+ "Can only promote stores to same size type");
+ Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
+ SDValue Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ ReplaceNode(SDValue(Node, 0), Result);
+ break;
+ }
+ }
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Legalizing truncating store operations\n");
+ SDValue Value = ST->getValue();
+ EVT StVT = ST->getMemoryVT();
+ TypeSize StWidth = StVT.getSizeInBits();
+ TypeSize StSize = StVT.getStoreSizeInBits();
+ auto &DL = DAG.getDataLayout();
+
+ if (StWidth != StSize) {
+ // Promote to a byte-sized store with upper bits zero if not
+ // storing an integral number of bytes. For example, promote
+ // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(), StSize.getFixedValue());
+ Value = DAG.getZeroExtendInReg(Value, dl, StVT);
+ SDValue Result =
+ DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), NVT,
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ ReplaceNode(SDValue(Node, 0), Result);
+ } else if (!StVT.isVector() && !isPowerOf2_64(StWidth.getFixedValue())) {
+ // If not storing a power-of-2 number of bits, expand as two stores.
+ assert(!StVT.isVector() && "Unsupported truncstore!");
+ unsigned StWidthBits = StWidth.getFixedValue();
+ unsigned LogStWidth = Log2_32(StWidthBits);
+ assert(LogStWidth < 32);
+ unsigned RoundWidth = 1 << LogStWidth;
+ assert(RoundWidth < StWidthBits);
+ unsigned ExtraWidth = StWidthBits - RoundWidth;
+ assert(ExtraWidth < RoundWidth);
+ assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+ "Store size not an integral number of bytes!");
+ EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+ EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+ SDValue Lo, Hi;
+ unsigned IncrementSize;
+
+ if (DL.isLittleEndian()) {
+ // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
+ // Store the bottom RoundWidth bits.
+ Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ RoundVT, ST->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Store the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+ Hi = DAG.getNode(
+ ISD::SRL, dl, Value.getValueType(), Value,
+ DAG.getConstant(RoundWidth, dl,
+ TLI.getShiftAmountTy(Value.getValueType(), DL)));
+ Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, ST->getOriginalAlign(), MMOFlags, AAInfo);
+ } else {
+ // Big endian - avoid unaligned stores.
+ // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
+ // Store the top RoundWidth bits.
+ Hi = DAG.getNode(
+ ISD::SRL, dl, Value.getValueType(), Value,
+ DAG.getConstant(ExtraWidth, dl,
+ TLI.getShiftAmountTy(Value.getValueType(), DL)));
+ Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), RoundVT,
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Store the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl,
+ Ptr.getValueType()));
+ Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, ST->getOriginalAlign(), MMOFlags, AAInfo);
+ }
+
+ // The order of the stores doesn't matter.
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ ReplaceNode(SDValue(Node, 0), Result);
+ } else {
+ switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal: {
+ EVT MemVT = ST->getMemoryVT();
+ // If this is an unaligned store and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT,
+ *ST->getMemOperand())) {
+ SDValue Result = TLI.expandUnalignedStore(ST, DAG);
+ ReplaceNode(SDValue(ST, 0), Result);
+ }
+ break;
+ }
+ case TargetLowering::Custom: {
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+ if (Res && Res != SDValue(Node, 0))
+ ReplaceNode(SDValue(Node, 0), Res);
+ return;
+ }
+ case TargetLowering::Expand:
+ assert(!StVT.isVector() &&
+ "Vector Stores are handled in LegalizeVectorOps");
+
+ SDValue Result;
+
+ // TRUNCSTORE:i16 i32 -> STORE i16
+ if (TLI.isTypeLegal(StVT)) {
+ Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
+ Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ } else {
+ // The in-memory type isn't legal. Truncate to the type it would promote
+ // to, and then do a truncstore.
+ Value = DAG.getNode(ISD::TRUNCATE, dl,
+ TLI.getTypeToTransformTo(*DAG.getContext(), StVT),
+ Value);
+ Result =
+ DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), StVT,
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
+ }
+
+ ReplaceNode(SDValue(Node, 0), Result);
+ break;
+ }
+ }
+}
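The non-power-of-two truncating-store path above splits, for example, a 24-bit store into a 16-bit store of the low bits plus an 8-bit store of the value shifted right by 16 at offset +2 (little-endian case; big-endian stores the top bits first). A plain-C++ sketch of the width arithmetic and resulting byte layout, with an assumed payload value:

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      unsigned StWidth = 24;                       // bits being stored
      unsigned RoundWidth = 1;                     // largest power of 2 below
      while (RoundWidth * 2 <= StWidth)
        RoundWidth *= 2;                           // -> 16
      unsigned ExtraWidth = StWidth - RoundWidth;  // -> 8
      assert(RoundWidth % 8 == 0 && ExtraWidth % 8 == 0);

      uint32_t X = 0xABCDEF;                       // 24-bit payload
      uint16_t Lo = (uint16_t)X;                   // TRUNCSTORE:i16 at Ptr
      uint8_t  Hi = (uint8_t)(X >> RoundWidth);    // TRUNCSTORE:i8  at Ptr+2
      std::printf("lo16=0x%04X at +0, hi8=0x%02X at +%u\n",
                  (unsigned)Lo, (unsigned)Hi, RoundWidth / 8);
      // prints: lo16=0xCDEF at +0, hi8=0xAB at +2
    }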
+
+void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
+ LoadSDNode *LD = cast<LoadSDNode>(Node);
+ SDValue Chain = LD->getChain(); // The chain.
+ SDValue Ptr = LD->getBasePtr(); // The base pointer.
+ SDValue Value; // The value returned by the load op.
+ SDLoc dl(Node);
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD) {
+ LLVM_DEBUG(dbgs() << "Legalizing non-extending load operation\n");
+ MVT VT = Node->getSimpleValueType(0);
+ SDValue RVal = SDValue(Node, 0);
+ SDValue RChain = SDValue(Node, 1);
+
+ switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal: {
+ EVT MemVT = LD->getMemoryVT();
+ const DataLayout &DL = DAG.getDataLayout();
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT,
+ *LD->getMemOperand())) {
+ std::tie(RVal, RChain) = TLI.expandUnalignedLoad(LD, DAG);
+ }
+ break;
+ }
+ case TargetLowering::Custom:
+ if (SDValue Res = TLI.LowerOperation(RVal, DAG)) {
+ RVal = Res;
+ RChain = Res.getValue(1);
+ }
+ break;
+
+ case TargetLowering::Promote: {
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+ assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
+ "Can only promote loads to same size type");
+
+ SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getMemOperand());
+ RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
+ RChain = Res.getValue(1);
+ break;
+ }
+ }
+ if (RChain.getNode() != Node) {
+ assert(RVal.getNode() != Node && "Load must be completely replaced");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain);
+ if (UpdatedNodes) {
+ UpdatedNodes->insert(RVal.getNode());
+ UpdatedNodes->insert(RChain.getNode());
+ }
+ ReplacedNode(Node);
+ }
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Legalizing extending load operation\n");
+ EVT SrcVT = LD->getMemoryVT();
+ TypeSize SrcWidth = SrcVT.getSizeInBits();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ if (SrcWidth != SrcVT.getStoreSizeInBits() &&
+ // Some targets pretend to have an i1 loading operation, and actually
+ // load an i8. This trick is correct for ZEXTLOAD because the top 7
+ // bits are guaranteed to be zero; it helps the optimizers understand
+ // that these bits are zero. It is also useful for EXTLOAD, since it
+ // tells the optimizers that those bits are undefined. It would be
+ // nice to have an effective generic way of getting these benefits...
+ // Until such a way is found, don't insist on promoting i1 here.
+ (SrcVT != MVT::i1 ||
+ TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) ==
+ TargetLowering::Promote)) {
+ // Promote to a byte-sized load if not loading an integral number of
+ // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
+ unsigned NewWidth = SrcVT.getStoreSizeInBits();
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth);
+ SDValue Ch;
+
+ // The extra bits are guaranteed to be zero, since we stored them that
+ // way. A zext load from NVT thus automatically gives zext from SrcVT.
+
+ ISD::LoadExtType NewExtType =
+ ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
+
+ SDValue Result = DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
+ Chain, Ptr, LD->getPointerInfo(), NVT,
+ LD->getOriginalAlign(), MMOFlags, AAInfo);
+
+ Ch = Result.getValue(1); // The chain.
+
+ if (ExtType == ISD::SEXTLOAD)
+ // Having the top bits zero doesn't help when sign extending.
+ Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+ Result.getValueType(),
+ Result, DAG.getValueType(SrcVT));
+ else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType())
+ // All the top bits are guaranteed to be zero - inform the optimizers.
+ Result = DAG.getNode(ISD::AssertZext, dl,
+ Result.getValueType(), Result,
+ DAG.getValueType(SrcVT));
+
+ Value = Result;
+ Chain = Ch;
+ } else if (!isPowerOf2_64(SrcWidth.getKnownMinValue())) {
+ // If not loading a power-of-2 number of bits, expand as two loads.
+ assert(!SrcVT.isVector() && "Unsupported extload!");
+ unsigned SrcWidthBits = SrcWidth.getFixedValue();
+ unsigned LogSrcWidth = Log2_32(SrcWidthBits);
+ assert(LogSrcWidth < 32);
+ unsigned RoundWidth = 1 << LogSrcWidth;
+ assert(RoundWidth < SrcWidthBits);
+ unsigned ExtraWidth = SrcWidthBits - RoundWidth;
+ assert(ExtraWidth < RoundWidth);
+ assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+ "Load size not an integral number of bytes!");
+ EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+ EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+ SDValue Lo, Hi, Ch;
+ unsigned IncrementSize;
+ auto &DL = DAG.getDataLayout();
+
+ if (DL.isLittleEndian()) {
+ // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
+ // Load the bottom RoundWidth bits.
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), RoundVT, LD->getOriginalAlign(),
+ MMOFlags, AAInfo);
+
+ // Load the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+ Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Build a factor node to remember that this load is independent of
+ // the other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Move the top bits to the right place.
+ Hi = DAG.getNode(
+ ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(RoundWidth, dl,
+ TLI.getShiftAmountTy(Hi.getValueType(), DL)));
+
+ // Join the hi and lo parts.
+ Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+ } else {
+ // Big endian - avoid unaligned loads.
+ // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
+ // Load the top RoundWidth bits.
+ Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), RoundVT, LD->getOriginalAlign(),
+ MMOFlags, AAInfo);
+
+ // Load the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Build a factor node to remember that this load is independent of
+ // the other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Move the top bits to the right place.
+ Hi = DAG.getNode(
+ ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(ExtraWidth, dl,
+ TLI.getShiftAmountTy(Hi.getValueType(), DL)));
+
+ // Join the hi and lo parts.
+ Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+ }
+
+ Chain = Ch;
+ } else {
+ bool isCustom = false;
+ switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0),
+ SrcVT.getSimpleVT())) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Custom:
+ isCustom = true;
+ [[fallthrough]];
+ case TargetLowering::Legal:
+ Value = SDValue(Node, 0);
+ Chain = SDValue(Node, 1);
+
+ if (isCustom) {
+ if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
+ Value = Res;
+ Chain = Res.getValue(1);
+ }
+ } else {
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ EVT MemVT = LD->getMemoryVT();
+ const DataLayout &DL = DAG.getDataLayout();
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT,
+ *LD->getMemOperand())) {
+ std::tie(Value, Chain) = TLI.expandUnalignedLoad(LD, DAG);
+ }
+ }
+ break;
+
+ case TargetLowering::Expand: {
+ EVT DestVT = Node->getValueType(0);
+ if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) {
+ // If the source type is not legal, see if there is a legal extload to
+ // an intermediate type that we can then extend further.
+ EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT());
+ if ((LoadVT.isFloatingPoint() == SrcVT.isFloatingPoint()) &&
+ (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT?
+ TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT))) {
+ // If we are loading a legal type, this is a non-extload followed by a
+ // full extend.
+ ISD::LoadExtType MidExtType =
+ (LoadVT == SrcVT) ? ISD::NON_EXTLOAD : ExtType;
+
+ SDValue Load = DAG.getExtLoad(MidExtType, dl, LoadVT, Chain, Ptr,
+ SrcVT, LD->getMemOperand());
+ unsigned ExtendOp =
+ ISD::getExtForLoadExtType(SrcVT.isFloatingPoint(), ExtType);
+ Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
+ Chain = Load.getValue(1);
+ break;
+ }
+
+ // Handle the special case of fp16 extloads. EXTLOAD doesn't have the
+ // normal undefined upper bits behavior to allow using an in-reg extend
+ // with the illegal FP type, so load as an integer and do the
+ // from-integer conversion.
+ if (SrcVT.getScalarType() == MVT::f16) {
+ EVT ISrcVT = SrcVT.changeTypeToInteger();
+ EVT IDestVT = DestVT.changeTypeToInteger();
+ EVT ILoadVT = TLI.getRegisterType(IDestVT.getSimpleVT());
+
+ SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, ILoadVT, Chain,
+ Ptr, ISrcVT, LD->getMemOperand());
+ Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result);
+ Chain = Result.getValue(1);
+ break;
+ }
+ }
+
+ assert(!SrcVT.isVector() &&
+ "Vector Loads are handled in LegalizeVectorOps");
+
+ // FIXME: This does not work for vectors on most targets. Sign-
+ // and zero-extend operations are currently folded into extending
+ // loads, whether they are legal or not, and then we end up here
+ // without any support for legalizing them.
+ assert(ExtType != ISD::EXTLOAD &&
+ "EXTLOAD should always be supported!");
+ // Turn the unsupported load into an EXTLOAD followed by an
+ // explicit zero/sign extend inreg.
+ SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl,
+ Node->getValueType(0),
+ Chain, Ptr, SrcVT,
+ LD->getMemOperand());
+ SDValue ValRes;
+ if (ExtType == ISD::SEXTLOAD)
+ ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+ Result.getValueType(),
+ Result, DAG.getValueType(SrcVT));
+ else
+ ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT);
+ Value = ValRes;
+ Chain = Result.getValue(1);
+ break;
+ }
+ }
+ }
+
+ // Since loads produce two values, make sure to remember that we legalized
+ // both of them.
+ if (Chain.getNode() != Node) {
+ assert(Value.getNode() != Node && "Load must be completely replaced");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+ if (UpdatedNodes) {
+ UpdatedNodes->insert(Value.getNode());
+ UpdatedNodes->insert(Chain.getNode());
+ }
+ ReplacedNode(Node);
+ }
+}
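The non-power-of-two extending-load path above reassembles, for example, a 24-bit value from two partial loads: a zero-extending load of the low 16 bits, an extending load of the high 8 bits at offset +2, then a shift and an OR (little-endian case shown). A plain-C++ sketch with an assumed in-memory value:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint8_t Mem[3] = {0xEF, 0xCD, 0xAB}; // i24 0xABCDEF, little-endian
      uint32_t Lo = (uint32_t)Mem[0] | ((uint32_t)Mem[1] << 8); // ZEXTLOAD:i16
      uint32_t Hi = (uint32_t)Mem[2];                           // EXTLOAD:i8 at +2
      uint32_t Value = Lo | (Hi << 16);                         // shl + or
      std::printf("reloaded i24 = 0x%06X\n", (unsigned)Value);  // 0xABCDEF
      // Any final sign/zero extension of the full 24-bit value is applied
      // on top of this by the surrounding legalization code.
    }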
+
+/// Return a legal replacement for the given operation, with all legal operands.
+void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
+ LLVM_DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));
+
+ // Allow illegal target nodes and illegal registers.
+ if (Node->getOpcode() == ISD::TargetConstant ||
+ Node->getOpcode() == ISD::Register)
+ return;
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+ assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
+ TargetLowering::TypeLegal &&
+ "Unexpected illegal type!");
+
+ for (const SDValue &Op : Node->op_values())
+ assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
+ TargetLowering::TypeLegal ||
+ Op.getOpcode() == ISD::TargetConstant ||
+ Op.getOpcode() == ISD::Register) &&
+ "Unexpected illegal type!");
+#endif
+
+ // Figure out the correct action; the way to query this varies by opcode
+ TargetLowering::LegalizeAction Action = TargetLowering::Legal;
+ bool SimpleFinishLegalizing = true;
+ switch (Node->getOpcode()) {
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ case ISD::STACKSAVE:
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
+ break;
+ case ISD::GET_DYNAMIC_AREA_OFFSET:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
+ break;
+ case ISD::VAARG:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
+ if (Action != TargetLowering::Promote)
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
+ break;
+ case ISD::SET_FPENV:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(1).getValueType());
+ break;
+ case ISD::FP_TO_FP16:
+ case ISD::FP_TO_BF16:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::EXTRACT_VECTOR_ELT:
+ case ISD::LROUND:
+ case ISD::LLROUND:
+ case ISD::LRINT:
+ case ISD::LLRINT:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(0).getValueType());
+ break;
+ case ISD::STRICT_FP_TO_FP16:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT:
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ // These pseudo-ops are the same as the other STRICT_ ops except
+ // they are registered with setOperationAction() using the input type
+ // instead of the output type.
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(1).getValueType());
+ break;
+ case ISD::SIGN_EXTEND_INREG: {
+ EVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
+ break;
+ }
+ case ISD::ATOMIC_STORE:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(2).getValueType());
+ break;
+ case ISD::SELECT_CC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
+ case ISD::SETCC:
+ case ISD::SETCCCARRY:
+ case ISD::VP_SETCC:
+ case ISD::BR_CC: {
+ unsigned Opc = Node->getOpcode();
+ unsigned CCOperand = Opc == ISD::SELECT_CC ? 4
+ : Opc == ISD::STRICT_FSETCC ? 3
+ : Opc == ISD::STRICT_FSETCCS ? 3
+ : Opc == ISD::SETCCCARRY ? 3
+ : (Opc == ISD::SETCC || Opc == ISD::VP_SETCC) ? 2
+ : 1;
+ unsigned CompareOperand = Opc == ISD::BR_CC ? 2
+ : Opc == ISD::STRICT_FSETCC ? 1
+ : Opc == ISD::STRICT_FSETCCS ? 1
+ : 0;
+ MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType();
+ ISD::CondCode CCCode =
+ cast<CondCodeSDNode>(Node->getOperand(CCOperand))->get();
+ Action = TLI.getCondCodeAction(CCCode, OpVT);
+ if (Action == TargetLowering::Legal) {
+ if (Node->getOpcode() == ISD::SELECT_CC)
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
+ else
+ Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
+ }
+ break;
+ }
+ case ISD::LOAD:
+ case ISD::STORE:
+ // FIXME: Model these properly. LOAD and STORE are complicated, and
+ // STORE expects the unlegalized operand in some cases.
+ SimpleFinishLegalizing = false;
+ break;
+ case ISD::CALLSEQ_START:
+ case ISD::CALLSEQ_END:
+ // FIXME: This shouldn't be necessary. These nodes have special properties
+ // dealing with the recursive nature of legalization. Removing this
+ // special case should be done as part of making LegalizeDAG non-recursive.
+ SimpleFinishLegalizing = false;
+ break;
+ case ISD::EXTRACT_ELEMENT:
+ case ISD::GET_ROUNDING:
+ case ISD::MERGE_VALUES:
+ case ISD::EH_RETURN:
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ case ISD::EH_DWARF_CFA:
+ case ISD::EH_SJLJ_SETJMP:
+ case ISD::EH_SJLJ_LONGJMP:
+ case ISD::EH_SJLJ_SETUP_DISPATCH:
+ // These operations lie about being legal: when they claim to be legal,
+ // they should actually be expanded.
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Legal)
+ Action = TargetLowering::Expand;
+ break;
+ case ISD::INIT_TRAMPOLINE:
+ case ISD::ADJUST_TRAMPOLINE:
+ case ISD::FRAMEADDR:
+ case ISD::RETURNADDR:
+ case ISD::ADDROFRETURNADDR:
+ case ISD::SPONENTRY:
+ // These operations lie about being legal: when they claim to be legal,
+ // they should actually be custom-lowered.
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Legal)
+ Action = TargetLowering::Custom;
+ break;
+ case ISD::READCYCLECOUNTER:
+ // READCYCLECOUNTER returns an i64, even if type legalization might have
+ // expanded that to several smaller types.
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::i64);
+ break;
+ case ISD::READ_REGISTER:
+ case ISD::WRITE_REGISTER:
+ // A named register is legal in the DAG, but blocked by register name
+ // selection if not implemented by the target (to choose the correct
+ // register). They'll be converted to Copy(To/From)Reg.
+ Action = TargetLowering::Legal;
+ break;
+ case ISD::UBSANTRAP:
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Expand) {
+ // replace ISD::UBSANTRAP with ISD::TRAP
+ SDValue NewVal;
+ NewVal = DAG.getNode(ISD::TRAP, SDLoc(Node), Node->getVTList(),
+ Node->getOperand(0));
+ ReplaceNode(Node, NewVal.getNode());
+ LegalizeOp(NewVal.getNode());
+ return;
+ }
+ break;
+ case ISD::DEBUGTRAP:
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Expand) {
+ // replace ISD::DEBUGTRAP with ISD::TRAP
+ SDValue NewVal;
+ NewVal = DAG.getNode(ISD::TRAP, SDLoc(Node), Node->getVTList(),
+ Node->getOperand(0));
+ ReplaceNode(Node, NewVal.getNode());
+ LegalizeOp(NewVal.getNode());
+ return;
+ }
+ break;
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ break;
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
+ case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: {
+ unsigned Scale = Node->getConstantOperandVal(2);
+ Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
+ Node->getValueType(0), Scale);
+ break;
+ }
+ case ISD::MSCATTER:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ cast<MaskedScatterSDNode>(Node)->getValue().getValueType());
+ break;
+ case ISD::MSTORE:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ cast<MaskedStoreSDNode>(Node)->getValue().getValueType());
+ break;
+ case ISD::VP_SCATTER:
+ Action = TLI.getOperationAction(
+ Node->getOpcode(),
+ cast<VPScatterSDNode>(Node)->getValue().getValueType());
+ break;
+ case ISD::VP_STORE:
+ Action = TLI.getOperationAction(
+ Node->getOpcode(),
+ cast<VPStoreSDNode>(Node)->getValue().getValueType());
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+ Action = TLI.getOperationAction(
+ Node->getOpcode(),
+ cast<VPStridedStoreSDNode>(Node)->getValue().getValueType());
+ break;
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ case ISD::IS_FPCLASS:
+ Action = TLI.getOperationAction(
+ Node->getOpcode(), Node->getOperand(0).getValueType());
+ break;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_FMAX:
+ case ISD::VP_REDUCE_FMIN:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_SEQ_FMUL:
+ Action = TLI.getOperationAction(
+ Node->getOpcode(), Node->getOperand(1).getValueType());
+ break;
+ default:
+ if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
+ Action = TLI.getCustomOperationAction(*Node);
+ } else {
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ }
+ break;
+ }
+
+ if (SimpleFinishLegalizing) {
+ SDNode *NewNode = Node;
+ switch (Node->getOpcode()) {
+ default: break;
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::ROTL:
+ case ISD::ROTR: {
+ // Legalizing shifts/rotates requires adjusting the shift amount
+ // to the appropriate width.
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ if (!Op1.getValueType().isVector()) {
+ SDValue SAO = DAG.getShiftAmountOperand(Op0.getValueType(), Op1);
+ // The getShiftAmountOperand() may create a new operand node or
+ // return the existing one. If a new operand is created we need
+ // to update the parent node.
+ // Do not try to legalize SAO here! It will be automatically legalized
+ // in the next round.
+ if (SAO != Op1)
+ NewNode = DAG.UpdateNodeOperands(Node, Op0, SAO);
+ }
+ }
+ break;
+ case ISD::FSHL:
+ case ISD::FSHR:
+ case ISD::SRL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SHL_PARTS: {
+ // Legalizing shifts/rotates requires adjusting the shift amount
+ // to the appropriate width.
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue Op2 = Node->getOperand(2);
+ if (!Op2.getValueType().isVector()) {
+ SDValue SAO = DAG.getShiftAmountOperand(Op0.getValueType(), Op2);
+ // The getShiftAmountOperand() may create a new operand node or
+ // return the existing one. If a new operand is created we need
+ // to update the parent node.
+ if (SAO != Op2)
+ NewNode = DAG.UpdateNodeOperands(Node, Op0, Op1, SAO);
+ }
+ break;
+ }
+ }
+
+ if (NewNode != Node) {
+ ReplaceNode(Node, NewNode);
+ Node = NewNode;
+ }
+ switch (Action) {
+ case TargetLowering::Legal:
+ LLVM_DEBUG(dbgs() << "Legal node: nothing to do\n");
+ return;
+ case TargetLowering::Custom:
+ LLVM_DEBUG(dbgs() << "Trying custom legalization\n");
+ // FIXME: The handling for custom lowering with multiple results is
+ // a complete mess.
+ if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
+ if (!(Res.getNode() != Node || Res.getResNo() != 0))
+ return;
+
+ if (Node->getNumValues() == 1) {
+ // Verify the new types match the original. Glue is waived because
+ // ISD::ADDC can be legalized by replacing Glue with an integer type.
+ assert((Res.getValueType() == Node->getValueType(0) ||
+ Node->getValueType(0) == MVT::Glue) &&
+ "Type mismatch for custom legalized operation");
+ LLVM_DEBUG(dbgs() << "Successfully custom legalized node\n");
+ // We can just directly replace this node with the lowered value.
+ ReplaceNode(SDValue(Node, 0), Res);
+ return;
+ }
+
+ SmallVector<SDValue, 8> ResultVals;
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) {
+ // Verify the new types match the original. Glue is waived because
+ // ISD::ADDC can be legalized by replacing Glue with an integer type.
+ assert((Res->getValueType(i) == Node->getValueType(i) ||
+ Node->getValueType(i) == MVT::Glue) &&
+ "Type mismatch for custom legalized operation");
+ ResultVals.push_back(Res.getValue(i));
+ }
+ LLVM_DEBUG(dbgs() << "Successfully custom legalized node\n");
+ ReplaceNode(Node, ResultVals.data());
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "Could not custom legalize node\n");
+ [[fallthrough]];
+ case TargetLowering::Expand:
+ if (ExpandNode(Node))
+ return;
+ [[fallthrough]];
+ case TargetLowering::LibCall:
+ ConvertNodeToLibcall(Node);
+ return;
+ case TargetLowering::Promote:
+ PromoteNode(Node);
+ return;
+ }
+ }
+
+ switch (Node->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "NODE: ";
+ Node->dump( &DAG);
+ dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to legalize this operator!");
+
+ case ISD::CALLSEQ_START:
+ case ISD::CALLSEQ_END:
+ break;
+ case ISD::LOAD:
+ return LegalizeLoadOps(Node);
+ case ISD::STORE:
+ return LegalizeStoreOps(Node);
+ }
+}
+
+SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ SDLoc dl(Op);
+
+ // Before we generate a new store to a temporary stack slot, see if there is
+ // already one that we can use. There often is because when we scalarize
+ // vector operations (using SelectionDAG::UnrollVectorOp for example) a whole
+ // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in
+ // the vector. If all are expanded here, we don't want one store per vector
+ // element.
+
+ // Caches for hasPredecessorHelper
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Visited.insert(Op.getNode());
+ Worklist.push_back(Idx.getNode());
+ SDValue StackPtr, Ch;
+ for (SDNode *User : Vec.getNode()->uses()) {
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(User)) {
+ if (ST->isIndexed() || ST->isTruncatingStore() ||
+ ST->getValue() != Vec)
+ continue;
+
+ // Make sure that nothing else could have stored into the destination of
+ // this store.
+ if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode()))
+ continue;
+
+ // If the index is dependent on the store we will introduce a cycle when
+ // creating the load (the load uses the index, and by replacing the chain
+ // we will make the index dependent on the load). Also, the store might be
+ // dependent on the extractelement and introduce a cycle when creating
+ // the load.
+ if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) ||
+ ST->hasPredecessor(Op.getNode()))
+ continue;
+
+ StackPtr = ST->getBasePtr();
+ Ch = SDValue(ST, 0);
+ break;
+ }
+ }
+
+ EVT VecVT = Vec.getValueType();
+
+ if (!Ch.getNode()) {
+ // Store the value to a temporary stack slot, then LOAD the returned part.
+ StackPtr = DAG.CreateStackTemporary(VecVT);
+ Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
+ MachinePointerInfo());
+ }
+
+ SDValue NewLoad;
+ Align ElementAlignment =
+ std::min(cast<StoreSDNode>(Ch)->getAlign(),
+ DAG.getDataLayout().getPrefTypeAlign(
+ Op.getValueType().getTypeForEVT(*DAG.getContext())));
+
+ if (Op.getValueType().isVector()) {
+ StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT,
+ Op.getValueType(), Idx);
+ NewLoad = DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr,
+ MachinePointerInfo(), ElementAlignment);
+ } else {
+ StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+ NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
+ MachinePointerInfo(), VecVT.getVectorElementType(),
+ ElementAlignment);
+ }
+
+ // Replace the chain going out of the store, by the one out of the load.
+ DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));
+
+ // We introduced a cycle though, so update the load's operands, making sure
+ // to use the original store's chain as an incoming chain.
+ SmallVector<SDValue, 6> NewLoadOperands(NewLoad->op_begin(),
+ NewLoad->op_end());
+ NewLoadOperands[0] = Ch;
+ NewLoad =
+ SDValue(DAG.UpdateNodeOperands(NewLoad.getNode(), NewLoadOperands), 0);
+ return NewLoad;
+}
+
+SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
+ assert(Op.getValueType().isVector() && "Non-vector insert subvector!");
+
+ SDValue Vec = Op.getOperand(0);
+ SDValue Part = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ SDLoc dl(Op);
+
+ // Store the value to a temporary stack slot, then LOAD the returned part.
+ EVT VecVT = Vec.getValueType();
+ EVT SubVecVT = Part.getValueType();
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+ // First store the whole vector.
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
+
+ // Then store the inserted part.
+ SDValue SubStackPtr =
+ TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
+
+ // Store the subvector.
+ Ch = DAG.getStore(
+ Ch, dl, Part, SubStackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+
+ // Finally, load the updated vector.
+ return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo);
+}
+
+SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
+ assert((Node->getOpcode() == ISD::BUILD_VECTOR ||
+ Node->getOpcode() == ISD::CONCAT_VECTORS) &&
+ "Unexpected opcode!");
+
+ // We can't handle this case efficiently. Allocate a sufficiently
+ // aligned object on the stack, store each operand into it, then load
+ // the result as a vector.
+ // Create the stack frame object.
+ EVT VT = Node->getValueType(0);
+ EVT MemVT = isa<BuildVectorSDNode>(Node) ? VT.getVectorElementType()
+ : Node->getOperand(0).getValueType();
+ SDLoc dl(Node);
+ SDValue FIPtr = DAG.CreateStackTemporary(VT);
+ int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+ // Emit a store of each element to the stack slot.
+ SmallVector<SDValue, 8> Stores;
+ unsigned TypeByteSize = MemVT.getSizeInBits() / 8;
+ assert(TypeByteSize > 0 && "Vector element type too small for stack store!");
+
+ // If the destination vector element type of a BUILD_VECTOR is narrower than
+ // the source element type, only store the bits necessary.
+ bool Truncate = isa<BuildVectorSDNode>(Node) &&
+ MemVT.bitsLT(Node->getOperand(0).getValueType());
+
+ // Store (in the right endianness) the elements to memory.
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+ // Ignore undef elements.
+ if (Node->getOperand(i).isUndef()) continue;
+
+ unsigned Offset = TypeByteSize*i;
+
+ SDValue Idx = DAG.getMemBasePlusOffset(FIPtr, TypeSize::Fixed(Offset), dl);
+
+ if (Truncate)
+ Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
+ Node->getOperand(i), Idx,
+ PtrInfo.getWithOffset(Offset), MemVT));
+ else
+ Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i),
+ Idx, PtrInfo.getWithOffset(Offset)));
+ }
+
+ SDValue StoreChain;
+ if (!Stores.empty()) // Not all undef elements?
+ StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ else
+ StoreChain = DAG.getEntryNode();
+
+ // Result is a load from the stack slot.
+ return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo);
+}
+
+ /// Bitcast a floating-point value to an integer value. Only bitcast the part
+ /// containing the sign bit if the target has no integer type capable of
+ /// holding all bits of the floating-point value.
+void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
+ const SDLoc &DL,
+ SDValue Value) const {
+ EVT FloatVT = Value.getValueType();
+ unsigned NumBits = FloatVT.getScalarSizeInBits();
+ State.FloatVT = FloatVT;
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), NumBits);
+ // Convert to an integer of the same size.
+ if (TLI.isTypeLegal(IVT)) {
+ State.IntValue = DAG.getNode(ISD::BITCAST, DL, IVT, Value);
+ State.SignMask = APInt::getSignMask(NumBits);
+ State.SignBit = NumBits - 1;
+ return;
+ }
+
+ auto &DataLayout = DAG.getDataLayout();
+ // Store the float to memory, then load the sign part out as an integer.
+ MVT LoadTy = TLI.getRegisterType(MVT::i8);
+ // First create a temporary that is aligned for both the load and store.
+ SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy);
+ int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ // Then store the float to it.
+ State.FloatPtr = StackPtr;
+ MachineFunction &MF = DAG.getMachineFunction();
+ State.FloatPointerInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ State.Chain = DAG.getStore(DAG.getEntryNode(), DL, Value, State.FloatPtr,
+ State.FloatPointerInfo);
+
+ SDValue IntPtr;
+ if (DataLayout.isBigEndian()) {
+ assert(FloatVT.isByteSized() && "Unsupported floating point type!");
+ // Load out a legal integer with the same sign bit as the float.
+ IntPtr = StackPtr;
+ State.IntPointerInfo = State.FloatPointerInfo;
+ } else {
+ // Advance the pointer so that the loaded byte will contain the sign bit.
+ unsigned ByteOffset = (NumBits / 8) - 1;
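+ // For example, for a little-endian f64 on a target with no legal i64 this is
+ // byte 7, whose bit 7 holds the IEEE sign bit, matching the SignBit and
+ // SignMask values set below.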
+ IntPtr =
+ DAG.getMemBasePlusOffset(StackPtr, TypeSize::Fixed(ByteOffset), DL);
+ State.IntPointerInfo = MachinePointerInfo::getFixedStack(MF, FI,
+ ByteOffset);
+ }
+
+ State.IntPtr = IntPtr;
+ State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, LoadTy, State.Chain, IntPtr,
+ State.IntPointerInfo, MVT::i8);
+ State.SignMask = APInt::getOneBitSet(LoadTy.getScalarSizeInBits(), 7);
+ State.SignBit = 7;
+}
+
+/// Replace the integer value produced by getSignAsIntValue() with a new value
+/// and cast the result back to a floating-point type.
+SDValue SelectionDAGLegalize::modifySignAsInt(const FloatSignAsInt &State,
+ const SDLoc &DL,
+ SDValue NewIntValue) const {
+ if (!State.Chain)
+ return DAG.getNode(ISD::BITCAST, DL, State.FloatVT, NewIntValue);
+
+ // Override the part containing the sign bit in the value stored on the stack.
+ SDValue Chain = DAG.getTruncStore(State.Chain, DL, NewIntValue, State.IntPtr,
+ State.IntPointerInfo, MVT::i8);
+ return DAG.getLoad(State.FloatVT, DL, Chain, State.FloatPtr,
+ State.FloatPointerInfo);
+}
+
+SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const {
+ SDLoc DL(Node);
+ SDValue Mag = Node->getOperand(0);
+ SDValue Sign = Node->getOperand(1);
+
+ // Get sign bit into an integer value.
+ FloatSignAsInt SignAsInt;
+ getSignAsIntValue(SignAsInt, DL, Sign);
+
+ EVT IntVT = SignAsInt.IntValue.getValueType();
+ SDValue SignMask = DAG.getConstant(SignAsInt.SignMask, DL, IntVT);
+ SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, SignAsInt.IntValue,
+ SignMask);
+
+ // If FABS is legal, transform FCOPYSIGN(x, y) => sign(y) ? -FABS(x) : FABS(x)
+ EVT FloatVT = Mag.getValueType();
+ if (TLI.isOperationLegalOrCustom(ISD::FABS, FloatVT) &&
+ TLI.isOperationLegalOrCustom(ISD::FNEG, FloatVT)) {
+ SDValue AbsValue = DAG.getNode(ISD::FABS, DL, FloatVT, Mag);
+ SDValue NegValue = DAG.getNode(ISD::FNEG, DL, FloatVT, AbsValue);
+ SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(IntVT), SignBit,
+ DAG.getConstant(0, DL, IntVT), ISD::SETNE);
+ return DAG.getSelect(DL, FloatVT, Cond, NegValue, AbsValue);
+ }
+
+ // Transform Mag value to integer, and clear the sign bit.
+ FloatSignAsInt MagAsInt;
+ getSignAsIntValue(MagAsInt, DL, Mag);
+ EVT MagVT = MagAsInt.IntValue.getValueType();
+ SDValue ClearSignMask = DAG.getConstant(~MagAsInt.SignMask, DL, MagVT);
+ SDValue ClearedSign = DAG.getNode(ISD::AND, DL, MagVT, MagAsInt.IntValue,
+ ClearSignMask);
+
+ // Get the sign bit at the right position for MagAsInt.
+ int ShiftAmount = SignAsInt.SignBit - MagAsInt.SignBit;
+ EVT ShiftVT = IntVT;
+ if (SignBit.getScalarValueSizeInBits() <
+ ClearedSign.getScalarValueSizeInBits()) {
+ SignBit = DAG.getNode(ISD::ZERO_EXTEND, DL, MagVT, SignBit);
+ ShiftVT = MagVT;
+ }
+ if (ShiftAmount > 0) {
+ SDValue ShiftCnst = DAG.getConstant(ShiftAmount, DL, ShiftVT);
+ SignBit = DAG.getNode(ISD::SRL, DL, ShiftVT, SignBit, ShiftCnst);
+ } else if (ShiftAmount < 0) {
+ SDValue ShiftCnst = DAG.getConstant(-ShiftAmount, DL, ShiftVT);
+ SignBit = DAG.getNode(ISD::SHL, DL, ShiftVT, SignBit, ShiftCnst);
+ }
+ if (SignBit.getScalarValueSizeInBits() >
+ ClearedSign.getScalarValueSizeInBits()) {
+ SignBit = DAG.getNode(ISD::TRUNCATE, DL, MagVT, SignBit);
+ }
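+
+ // For example, for an FCOPYSIGN with an f64 magnitude and an f32 sign (both
+ // with legal integer counterparts), SignAsInt.SignBit is 31 and
+ // MagAsInt.SignBit is 63: the sign bit is zero-extended to i64 and shifted
+ // left by 32 so that the OR below places it in bit 63 of the magnitude.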
+
+ // Store the part with the modified sign and convert back to float.
+ SDValue CopiedSign = DAG.getNode(ISD::OR, DL, MagVT, ClearedSign, SignBit);
+ return modifySignAsInt(MagAsInt, DL, CopiedSign);
+}
+
+SDValue SelectionDAGLegalize::ExpandFNEG(SDNode *Node) const {
+ // Get the sign bit as an integer.
+ SDLoc DL(Node);
+ FloatSignAsInt SignAsInt;
+ getSignAsIntValue(SignAsInt, DL, Node->getOperand(0));
+ EVT IntVT = SignAsInt.IntValue.getValueType();
+
+ // Flip the sign.
+ SDValue SignMask = DAG.getConstant(SignAsInt.SignMask, DL, IntVT);
+ SDValue SignFlip =
+ DAG.getNode(ISD::XOR, DL, IntVT, SignAsInt.IntValue, SignMask);
+
+ // Convert back to float.
+ return modifySignAsInt(SignAsInt, DL, SignFlip);
+}
+
+SDValue SelectionDAGLegalize::ExpandFABS(SDNode *Node) const {
+ SDLoc DL(Node);
+ SDValue Value = Node->getOperand(0);
+
+ // Transform FABS(x) => FCOPYSIGN(x, 0.0) if FCOPYSIGN is legal.
+ EVT FloatVT = Value.getValueType();
+ if (TLI.isOperationLegalOrCustom(ISD::FCOPYSIGN, FloatVT)) {
+ SDValue Zero = DAG.getConstantFP(0.0, DL, FloatVT);
+ return DAG.getNode(ISD::FCOPYSIGN, DL, FloatVT, Value, Zero);
+ }
+
+ // Transform value to integer, clear the sign bit and transform back.
+ FloatSignAsInt ValueAsInt;
+ getSignAsIntValue(ValueAsInt, DL, Value);
+ EVT IntVT = ValueAsInt.IntValue.getValueType();
+ SDValue ClearSignMask = DAG.getConstant(~ValueAsInt.SignMask, DL, IntVT);
+ SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, ValueAsInt.IntValue,
+ ClearSignMask);
+ return modifySignAsInt(ValueAsInt, DL, ClearedSign);
+}
+
+void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
+ SmallVectorImpl<SDValue> &Results) {
+ Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue Tmp1 = SDValue(Node, 0);
+ SDValue Tmp2 = SDValue(Node, 1);
+ SDValue Tmp3 = Node->getOperand(2);
+ SDValue Chain = Tmp1.getOperand(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+ SDValue Size = Tmp2.getOperand(1);
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
+ const TargetFrameLowering *TFL = DAG.getSubtarget().getFrameLowering();
+ unsigned Opc =
+ TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
+ ISD::ADD : ISD::SUB;
+
+ Align StackAlign = TFL->getStackAlign();
+ Tmp1 = DAG.getNode(Opc, dl, VT, SP, Size); // Value
+ if (Alignment > StackAlign)
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+ DAG.getConstant(-Alignment.value(), dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+
+ Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+
+ Results.push_back(Tmp1);
+ Results.push_back(Tmp2);
+}
+
+/// Emit a store/load combination to the stack. This stores
+/// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does
+/// a load from the stack slot to DestVT, extending it if needed.
+/// The resultant code need not be legal.
+SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
+ EVT DestVT, const SDLoc &dl) {
+ return EmitStackConvert(SrcOp, SlotVT, DestVT, dl, DAG.getEntryNode());
+}
+
+SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
+ EVT DestVT, const SDLoc &dl,
+ SDValue Chain) {
+ EVT SrcVT = SrcOp.getValueType();
+ Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
+ Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType);
+
+ // Don't convert with stack if the load/store is expensive.
+ if ((SrcVT.bitsGT(SlotVT) &&
+ !TLI.isTruncStoreLegalOrCustom(SrcOp.getValueType(), SlotVT)) ||
+ (SlotVT.bitsLT(DestVT) &&
+ !TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, DestVT, SlotVT)))
+ return SDValue();
+
+ // Create the stack frame object.
+ Align SrcAlign = DAG.getDataLayout().getPrefTypeAlign(
+ SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
+ SDValue FIPtr = DAG.CreateStackTemporary(SlotVT.getStoreSize(), SrcAlign);
+
+ FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
+ int SPFI = StackPtrFI->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+ // Emit a store to the stack slot. Use a truncstore if the input value is
+ // larger than SlotVT.
+ SDValue Store;
+
+ if (SrcVT.bitsGT(SlotVT))
+ Store = DAG.getTruncStore(Chain, dl, SrcOp, FIPtr, PtrInfo,
+ SlotVT, SrcAlign);
+ else {
+ assert(SrcVT.bitsEq(SlotVT) && "Invalid store");
+ Store = DAG.getStore(Chain, dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
+ }
+
+ // Result is a load from the stack slot.
+ if (SlotVT.bitsEq(DestVT))
+ return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign);
+
+ assert(SlotVT.bitsLT(DestVT) && "Unknown extension!");
+ return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT,
+ DestAlign);
+}
+
+SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
+ SDLoc dl(Node);
+ // Create a vector sized/aligned stack slot, store the value to element #0,
+ // then load the whole vector back out.
+ SDValue StackPtr = DAG.CreateStackTemporary(Node->getValueType(0));
+
+ FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(StackPtr);
+ int SPFI = StackPtrFI->getIndex();
+
+ SDValue Ch = DAG.getTruncStore(
+ DAG.getEntryNode(), dl, Node->getOperand(0), StackPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI),
+ Node->getValueType(0).getVectorElementType());
+ return DAG.getLoad(
+ Node->getValueType(0), dl, Ch, StackPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI));
+}
+
+static bool
+ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDValue &Res) {
+ unsigned NumElems = Node->getNumOperands();
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+
+ // Try to group the scalars into pairs, shuffle the pairs together, then
+ // shuffle the pairs of pairs together, etc. until the vector has
+ // been built. This will work only if all of the necessary shuffle masks
+ // are legal.
+
+ // We do this in two phases; first to check the legality of the shuffles,
+ // and next, assuming that all shuffles are legal, to create the new nodes.
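+ //
+ // For example, with eight defined scalars the second phase builds eight
+ // single-element vectors via SCALAR_TO_VECTOR, concatenates them pairwise
+ // with shuffles into four and then two intermediate vectors, and finally
+ // emits one last shuffle that scatters the elements into their positions.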
+ for (int Phase = 0; Phase < 2; ++Phase) {
+ SmallVector<std::pair<SDValue, SmallVector<int, 16>>, 16> IntermedVals,
+ NewIntermedVals;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue V = Node->getOperand(i);
+ if (V.isUndef())
+ continue;
+
+ SDValue Vec;
+ if (Phase)
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, V);
+ IntermedVals.push_back(std::make_pair(Vec, SmallVector<int, 16>(1, i)));
+ }
+
+ while (IntermedVals.size() > 2) {
+ NewIntermedVals.clear();
+ for (unsigned i = 0, e = (IntermedVals.size() & ~1u); i < e; i += 2) {
+ // This vector and the next vector are shuffled together (simply to
+ // append the one to the other).
+ SmallVector<int, 16> ShuffleVec(NumElems, -1);
+
+ SmallVector<int, 16> FinalIndices;
+ FinalIndices.reserve(IntermedVals[i].second.size() +
+ IntermedVals[i+1].second.size());
+
+ int k = 0;
+ for (unsigned j = 0, f = IntermedVals[i].second.size(); j != f;
+ ++j, ++k) {
+ ShuffleVec[k] = j;
+ FinalIndices.push_back(IntermedVals[i].second[j]);
+ }
+ for (unsigned j = 0, f = IntermedVals[i+1].second.size(); j != f;
+ ++j, ++k) {
+ ShuffleVec[k] = NumElems + j;
+ FinalIndices.push_back(IntermedVals[i+1].second[j]);
+ }
+
+ SDValue Shuffle;
+ if (Phase)
+ Shuffle = DAG.getVectorShuffle(VT, dl, IntermedVals[i].first,
+ IntermedVals[i+1].first,
+ ShuffleVec);
+ else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
+ return false;
+ NewIntermedVals.push_back(
+ std::make_pair(Shuffle, std::move(FinalIndices)));
+ }
+
+ // If we had an odd number of defined values, then append the last
+ // element to the array of new vectors.
+ if ((IntermedVals.size() & 1) != 0)
+ NewIntermedVals.push_back(IntermedVals.back());
+
+ IntermedVals.swap(NewIntermedVals);
+ }
+
+ assert(IntermedVals.size() <= 2 && IntermedVals.size() > 0 &&
+ "Invalid number of intermediate vectors");
+ SDValue Vec1 = IntermedVals[0].first;
+ SDValue Vec2;
+ if (IntermedVals.size() > 1)
+ Vec2 = IntermedVals[1].first;
+ else if (Phase)
+ Vec2 = DAG.getUNDEF(VT);
+
+ SmallVector<int, 16> ShuffleVec(NumElems, -1);
+ for (unsigned i = 0, e = IntermedVals[0].second.size(); i != e; ++i)
+ ShuffleVec[IntermedVals[0].second[i]] = i;
+ for (unsigned i = 0, e = IntermedVals[1].second.size(); i != e; ++i)
+ ShuffleVec[IntermedVals[1].second[i]] = NumElems + i;
+
+ if (Phase)
+ Res = DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
+ else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
+ return false;
+ }
+
+ return true;
+}
+
+/// Expand a BUILD_VECTOR node on targets that don't
+/// support the operation, but do support the resultant vector type.
+SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
+ unsigned NumElems = Node->getNumOperands();
+ SDValue Value1, Value2;
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT OpVT = Node->getOperand(0).getValueType();
+ EVT EltVT = VT.getVectorElementType();
+
+ // If the only non-undef value is the low element, turn this into a
+ // SCALAR_TO_VECTOR node. If this is { X, X, X, X }, determine X.
+ bool isOnlyLowElement = true;
+ bool MoreThanTwoValues = false;
+ bool isConstant = true;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue V = Node->getOperand(i);
+ if (V.isUndef())
+ continue;
+ if (i > 0)
+ isOnlyLowElement = false;
+ if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+ isConstant = false;
+
+ if (!Value1.getNode()) {
+ Value1 = V;
+ } else if (!Value2.getNode()) {
+ if (V != Value1)
+ Value2 = V;
+ } else if (V != Value1 && V != Value2) {
+ MoreThanTwoValues = true;
+ }
+ }
+
+ if (!Value1.getNode())
+ return DAG.getUNDEF(VT);
+
+ if (isOnlyLowElement)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));
+
+ // If all elements are constants, create a load from the constant pool.
+ if (isConstant) {
+ SmallVector<Constant*, 16> CV;
+ for (unsigned i = 0, e = NumElems; i != e; ++i) {
+ if (ConstantFPSDNode *V =
+ dyn_cast<ConstantFPSDNode>(Node->getOperand(i))) {
+ CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+ } else if (ConstantSDNode *V =
+ dyn_cast<ConstantSDNode>(Node->getOperand(i))) {
+ if (OpVT==EltVT)
+ CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+ else {
+ // If OpVT and EltVT don't match, EltVT is not legal and the
+ // element values have been promoted/truncated earlier. Undo this;
+ // we don't want a v16i8 to become a v16i32 for example.
+ const ConstantInt *CI = V->getConstantIntValue();
+ CV.push_back(ConstantInt::get(EltVT.getTypeForEVT(*DAG.getContext()),
+ CI->getZExtValue()));
+ }
+ } else {
+ assert(Node->getOperand(i).isUndef());
+ Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+ CV.push_back(UndefValue::get(OpNTy));
+ }
+ }
+ Constant *CP = ConstantVector::get(CV);
+ SDValue CPIdx =
+ DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
+ return DAG.getLoad(
+ VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ }
+
+ SmallSet<SDValue, 16> DefinedValues;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (Node->getOperand(i).isUndef())
+ continue;
+ DefinedValues.insert(Node->getOperand(i));
+ }
+
+ if (TLI.shouldExpandBuildVectorWithShuffles(VT, DefinedValues.size())) {
+ if (!MoreThanTwoValues) {
+ SmallVector<int, 8> ShuffleVec(NumElems, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue V = Node->getOperand(i);
+ if (V.isUndef())
+ continue;
+ ShuffleVec[i] = V == Value1 ? 0 : NumElems;
+ }
+ if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
+ // Get the splatted value into the low element of a vector register.
+ SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
+ SDValue Vec2;
+ if (Value2.getNode())
+ Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
+ else
+ Vec2 = DAG.getUNDEF(VT);
+
+ // Return shuffle(LowValVec, undef, <0,0,0,0>)
+ return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
+ }
+ } else {
+ SDValue Res;
+ if (ExpandBVWithShuffles(Node, DAG, TLI, Res))
+ return Res;
+ }
+ }
+
+ // Otherwise, we can't handle this case efficiently.
+ return ExpandVectorBuildThroughStack(Node);
+}
+
+SDValue SelectionDAGLegalize::ExpandSPLAT_VECTOR(SDNode *Node) {
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue SplatVal = Node->getOperand(0);
+
+ return DAG.getSplatBuildVector(VT, DL, SplatVal);
+}
+
+ // Expand a node into a call to a libcall, returning the value as the first
+ // result and the chain as the second. If the result value does not fit into a
+ // register, return the lo part and pass the hi part back through a by-ref
+ // argument. If it does fit into a single register, return the result and
+ // leave the Hi part unset.
+std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+ TargetLowering::ArgListTy &&Args,
+ bool isSigned) {
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ EVT RetVT = Node->getValueType(0);
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+ // By default, the input chain to this libcall is the entry node of the
+ // function. If the libcall is going to be emitted as a tail call then
+ // TLI.isUsedByReturnOnly will change it to the right chain if the return
+ // node which is being folded has a non-entry input chain.
+ SDValue InChain = DAG.getEntryNode();
+
+ // isTailCall may be true since the callee does not reference the caller's
+ // stack frame. Check if it's in the right position and that the return types
+ // match.
+ SDValue TCChain = InChain;
+ const Function &F = DAG.getMachineFunction().getFunction();
+ bool isTailCall =
+ TLI.isInTailCallPosition(DAG, Node, TCChain) &&
+ (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
+ if (isTailCall)
+ InChain = TCChain;
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ bool signExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, isSigned);
+ CLI.setDebugLoc(SDLoc(Node))
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setTailCall(isTailCall)
+ .setSExtResult(signExtend)
+ .setZExtResult(!signExtend)
+ .setIsPostTypeLegalization(true);
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ if (!CallInfo.second.getNode()) {
+ LLVM_DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump(&DAG));
+ // It's a tailcall, return the chain (which is the DAG root).
+ return {DAG.getRoot(), DAG.getRoot()};
+ }
+
+ LLVM_DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump(&DAG));
+ return CallInfo;
+}
+
+std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+ bool isSigned) {
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (const SDValue &Op : Node->op_values()) {
+ EVT ArgVT = Op.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Node = Op;
+ Entry.Ty = ArgTy;
+ Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, isSigned);
+ Entry.IsZExt = !Entry.IsSExt;
+ Args.push_back(Entry);
+ }
+
+ return ExpandLibCall(LC, Node, std::move(Args), isSigned);
+}
+
+void SelectionDAGLegalize::ExpandFrexpLibCall(
+ SDNode *Node, SmallVectorImpl<SDValue> &Results) {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT ExpVT = Node->getValueType(1);
+
+ SDValue FPOp = Node->getOperand(0);
+
+ EVT ArgVT = FPOp.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::ArgListEntry FPArgEntry;
+ FPArgEntry.Node = FPOp;
+ FPArgEntry.Ty = ArgTy;
+
+ SDValue StackSlot = DAG.CreateStackTemporary(ExpVT);
+ TargetLowering::ArgListEntry PtrArgEntry;
+ PtrArgEntry.Node = StackSlot;
+ PtrArgEntry.Ty = PointerType::get(*DAG.getContext(),
+ DAG.getDataLayout().getAllocaAddrSpace());
+
+ TargetLowering::ArgListTy Args = {FPArgEntry, PtrArgEntry};
+
+ RTLIB::Libcall LC = RTLIB::getFREXP(VT);
+ auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
+
+ // FIXME: Get type of int for libcall declaration and cast
+
+ int FrameIdx = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ auto PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+
+ SDValue LoadExp = DAG.getLoad(ExpVT, dl, Chain, StackSlot, PtrInfo);
+ SDValue OutputChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ LoadExp.getValue(1), DAG.getRoot());
+ DAG.setRoot(OutputChain);
+
+ Results.push_back(Call);
+ Results.push_back(LoadExp);
+}
+
+void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
+ RTLIB::Libcall LC,
+ SmallVectorImpl<SDValue> &Results) {
+ if (LC == RTLIB::UNKNOWN_LIBCALL)
+ llvm_unreachable("Can't create an unknown libcall!");
+
+ if (Node->isStrictFPOpcode()) {
+ EVT RetVT = Node->getValueType(0);
+ SmallVector<SDValue, 4> Ops(drop_begin(Node->ops()));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ // FIXME: This doesn't support tail calls.
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RetVT,
+ Ops, CallOptions,
+ SDLoc(Node),
+ Node->getOperand(0));
+ Results.push_back(Tmp.first);
+ Results.push_back(Tmp.second);
+ } else {
+ SDValue Tmp = ExpandLibCall(LC, Node, false).first;
+ Results.push_back(Tmp);
+ }
+}
+
+/// Expand the node to a libcall based on the result type.
+void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
+ RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128,
+ SmallVectorImpl<SDValue> &Results) {
+ RTLIB::Libcall LC = RTLIB::getFPLibCall(Node->getSimpleValueType(0),
+ Call_F32, Call_F64, Call_F80,
+ Call_F128, Call_PPCF128);
+ ExpandFPLibCall(Node, LC, Results);
+}
+
+SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
+ RTLIB::Libcall Call_I8,
+ RTLIB::Libcall Call_I16,
+ RTLIB::Libcall Call_I32,
+ RTLIB::Libcall Call_I64,
+ RTLIB::Libcall Call_I128) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::i8: LC = Call_I8; break;
+ case MVT::i16: LC = Call_I16; break;
+ case MVT::i32: LC = Call_I32; break;
+ case MVT::i64: LC = Call_I64; break;
+ case MVT::i128: LC = Call_I128; break;
+ }
+ return ExpandLibCall(LC, Node, isSigned).first;
+}
+
+ /// Expand the node to a libcall based on the first argument type (for
+ /// instance lround and its variants).
+void SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node,
+ RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128,
+ SmallVectorImpl<SDValue> &Results) {
+ EVT InVT = Node->getOperand(Node->isStrictFPOpcode() ? 1 : 0).getValueType();
+ RTLIB::Libcall LC = RTLIB::getFPLibCall(InVT.getSimpleVT(),
+ Call_F32, Call_F64, Call_F80,
+ Call_F128, Call_PPCF128);
+ ExpandFPLibCall(Node, LC, Results);
+}
+
+/// Issue libcalls to __{u}divmod to compute div / rem pairs.
+void
+SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ unsigned Opcode = Node->getOpcode();
+ bool isSigned = Opcode == ISD::SDIVREM;
+
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
+ case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+ case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+ case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+ case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
+ }
+
+ // The input chain to this libcall is the entry node of the function.
+ // Legalizing the call will automatically add a dependence on the previous
+ // call.
+ SDValue InChain = DAG.getEntryNode();
+
+ EVT RetVT = Node->getValueType(0);
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (const SDValue &Op : Node->op_values()) {
+ EVT ArgVT = Op.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Node = Op;
+ Entry.Ty = ArgTy;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+
+ // Also pass the address used to return the remainder.
+ SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
+ Entry.Node = FIPtr;
+ Entry.Ty = RetTy->getPointerTo();
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
+ Args.push_back(Entry);
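+ // The emitted call thus has the usual compiler-rt divmod shape, e.g. for i32
+ // roughly quot = __divmodsi4(a, b, &rem) (the exact libcall name is
+ // target-dependent), with the remainder read back from the stack slot below.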
+
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ SDLoc dl(Node);
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ // Remainder is loaded back from the stack frame.
+ SDValue Rem =
+ DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());
+ Results.push_back(CallInfo.first);
+ Results.push_back(Rem);
+}
+
+/// Return true if sincos libcall is available.
+static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::f32: LC = RTLIB::SINCOS_F32; break;
+ case MVT::f64: LC = RTLIB::SINCOS_F64; break;
+ case MVT::f80: LC = RTLIB::SINCOS_F80; break;
+ case MVT::f128: LC = RTLIB::SINCOS_F128; break;
+ case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
+ }
+ return TLI.getLibcallName(LC) != nullptr;
+}
+
+/// Only issue sincos libcall if both sin and cos are needed.
+static bool useSinCos(SDNode *Node) {
+ unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
+ ? ISD::FCOS : ISD::FSIN;
+
+ SDValue Op0 = Node->getOperand(0);
+ for (const SDNode *User : Op0.getNode()->uses()) {
+ if (User == Node)
+ continue;
+ // The other user might have been turned into sincos already.
+ if (User->getOpcode() == OtherOpcode || User->getOpcode() == ISD::FSINCOS)
+ return true;
+ }
+ return false;
+}
+
+/// Issue libcalls to sincos to compute sin / cos pairs.
+void
+SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::f32: LC = RTLIB::SINCOS_F32; break;
+ case MVT::f64: LC = RTLIB::SINCOS_F64; break;
+ case MVT::f80: LC = RTLIB::SINCOS_F80; break;
+ case MVT::f128: LC = RTLIB::SINCOS_F128; break;
+ case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
+ }
+
+ // The input chain to this libcall is the entry node of the function.
+ // Legalizing the call will automatically add a dependence on the previous
+ // call.
+ SDValue InChain = DAG.getEntryNode();
+
+ EVT RetVT = Node->getValueType(0);
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ // Pass the argument.
+ Entry.Node = Node->getOperand(0);
+ Entry.Ty = RetTy;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Args.push_back(Entry);
+
+ // Pass the address used to return the sin result.
+ SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
+ Entry.Node = SinPtr;
+ Entry.Ty = RetTy->getPointerTo();
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Args.push_back(Entry);
+
+ // Also pass the address used to return the cos result.
+ SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
+ Entry.Node = CosPtr;
+ Entry.Ty = RetTy->getPointerTo();
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Args.push_back(Entry);
+
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ SDLoc dl(Node);
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
+ TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
+ std::move(Args));
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ Results.push_back(
+ DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
+ Results.push_back(
+ DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
+}
+
+SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue X = Node->getOperand(0);
+ SDValue N = Node->getOperand(1);
+ EVT ExpVT = N.getValueType();
+ EVT AsIntVT = VT.changeTypeToInteger();
+ if (AsIntVT == EVT()) // TODO: How to handle f80?
+ return SDValue();
+
+ if (Node->getOpcode() == ISD::STRICT_FLDEXP) // TODO
+ return SDValue();
+
+ SDNodeFlags NSW;
+ NSW.setNoSignedWrap(true);
+ SDNodeFlags NUW_NSW;
+ NUW_NSW.setNoUnsignedWrap(true);
+ NUW_NSW.setNoSignedWrap(true);
+
+ EVT SetCCVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ExpVT);
+ const fltSemantics &FltSem = SelectionDAG::EVTToAPFloatSemantics(VT);
+
+ const APFloat::ExponentType MaxExpVal = APFloat::semanticsMaxExponent(FltSem);
+ const APFloat::ExponentType MinExpVal = APFloat::semanticsMinExponent(FltSem);
+ const int Precision = APFloat::semanticsPrecision(FltSem);
+
+ const SDValue MaxExp = DAG.getConstant(MaxExpVal, dl, ExpVT);
+ const SDValue MinExp = DAG.getConstant(MinExpVal, dl, ExpVT);
+
+ const SDValue DoubleMaxExp = DAG.getConstant(2 * MaxExpVal, dl, ExpVT);
+
+ const APFloat One(FltSem, "1.0");
+ APFloat ScaleUpK = scalbn(One, MaxExpVal, APFloat::rmNearestTiesToEven);
+
+ // Offset by precision to avoid denormal range.
+ APFloat ScaleDownK =
+ scalbn(One, MinExpVal + Precision, APFloat::rmNearestTiesToEven);
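+ // For IEEE single precision, for instance, MaxExpVal = 127, MinExpVal = -126
+ // and Precision = 24, so ScaleUpK = 0x1p+127 and ScaleDownK = 0x1p-102.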
+
+ // TODO: Should really introduce control flow and use a block for the >
+ // MaxExp, < MinExp cases
+
+ // First, handle exponents Exp > MaxExp and scale down.
+ SDValue NGtMaxExp = DAG.getSetCC(dl, SetCCVT, N, MaxExp, ISD::SETGT);
+
+ SDValue DecN0 = DAG.getNode(ISD::SUB, dl, ExpVT, N, MaxExp, NSW);
+ SDValue ClampMaxVal = DAG.getConstant(3 * MaxExpVal, dl, ExpVT);
+ SDValue ClampN_Big = DAG.getNode(ISD::SMIN, dl, ExpVT, N, ClampMaxVal);
+ SDValue DecN1 =
+ DAG.getNode(ISD::SUB, dl, ExpVT, ClampN_Big, DoubleMaxExp, NSW);
+
+ SDValue ScaleUpTwice =
+ DAG.getSetCC(dl, SetCCVT, N, DoubleMaxExp, ISD::SETUGT);
+
+ const SDValue ScaleUpVal = DAG.getConstantFP(ScaleUpK, dl, VT);
+ SDValue ScaleUp0 = DAG.getNode(ISD::FMUL, dl, VT, X, ScaleUpVal);
+ SDValue ScaleUp1 = DAG.getNode(ISD::FMUL, dl, VT, ScaleUp0, ScaleUpVal);
+
+ SDValue SelectN_Big =
+ DAG.getNode(ISD::SELECT, dl, ExpVT, ScaleUpTwice, DecN1, DecN0);
+ SDValue SelectX_Big =
+ DAG.getNode(ISD::SELECT, dl, VT, ScaleUpTwice, ScaleUp1, ScaleUp0);
+
+ // Now handle exponents Exp < MinExp
+ SDValue NLtMinExp = DAG.getSetCC(dl, SetCCVT, N, MinExp, ISD::SETLT);
+
+ SDValue Increment0 = DAG.getConstant(-(MinExpVal + Precision), dl, ExpVT);
+ SDValue Increment1 = DAG.getConstant(-2 * (MinExpVal + Precision), dl, ExpVT);
+
+ SDValue IncN0 = DAG.getNode(ISD::ADD, dl, ExpVT, N, Increment0, NUW_NSW);
+
+ SDValue ClampMinVal =
+ DAG.getConstant(3 * MinExpVal + 2 * Precision, dl, ExpVT);
+ SDValue ClampN_Small = DAG.getNode(ISD::SMAX, dl, ExpVT, N, ClampMinVal);
+ SDValue IncN1 =
+ DAG.getNode(ISD::ADD, dl, ExpVT, ClampN_Small, Increment1, NSW);
+
+ const SDValue ScaleDownVal = DAG.getConstantFP(ScaleDownK, dl, VT);
+ SDValue ScaleDown0 = DAG.getNode(ISD::FMUL, dl, VT, X, ScaleDownVal);
+ SDValue ScaleDown1 = DAG.getNode(ISD::FMUL, dl, VT, ScaleDown0, ScaleDownVal);
+
+ SDValue ScaleDownTwice = DAG.getSetCC(
+ dl, SetCCVT, N, DAG.getConstant(2 * MinExpVal + Precision, dl, ExpVT),
+ ISD::SETULT);
+
+ SDValue SelectN_Small =
+ DAG.getNode(ISD::SELECT, dl, ExpVT, ScaleDownTwice, IncN1, IncN0);
+ SDValue SelectX_Small =
+ DAG.getNode(ISD::SELECT, dl, VT, ScaleDownTwice, ScaleDown1, ScaleDown0);
+
+ // Now combine the two out of range exponent handling cases with the base
+ // case.
+ SDValue NewX = DAG.getNode(
+ ISD::SELECT, dl, VT, NGtMaxExp, SelectX_Big,
+ DAG.getNode(ISD::SELECT, dl, VT, NLtMinExp, SelectX_Small, X));
+
+ SDValue NewN = DAG.getNode(
+ ISD::SELECT, dl, ExpVT, NGtMaxExp, SelectN_Big,
+ DAG.getNode(ISD::SELECT, dl, ExpVT, NLtMinExp, SelectN_Small, N));
+
+ SDValue BiasedN = DAG.getNode(ISD::ADD, dl, ExpVT, NewN, MaxExp, NSW);
+
+ SDValue ExponentShiftAmt =
+ DAG.getShiftAmountConstant(Precision - 1, ExpVT, dl);
+ SDValue CastExpToValTy = DAG.getZExtOrTrunc(BiasedN, dl, AsIntVT);
+
+ SDValue AsInt = DAG.getNode(ISD::SHL, dl, AsIntVT, CastExpToValTy,
+ ExponentShiftAmt, NUW_NSW);
+ SDValue AsFP = DAG.getNode(ISD::BITCAST, dl, VT, AsInt);
+ return DAG.getNode(ISD::FMUL, dl, VT, NewX, AsFP);
+}
+
+SDValue SelectionDAGLegalize::expandFrexp(SDNode *Node) const {
+ SDLoc dl(Node);
+ SDValue Val = Node->getOperand(0);
+ EVT VT = Val.getValueType();
+ EVT ExpVT = Node->getValueType(1);
+ EVT AsIntVT = VT.changeTypeToInteger();
+ if (AsIntVT == EVT()) // TODO: How to handle f80?
+ return SDValue();
+
+ const fltSemantics &FltSem = SelectionDAG::EVTToAPFloatSemantics(VT);
+ const APFloat::ExponentType MinExpVal = APFloat::semanticsMinExponent(FltSem);
+ const unsigned Precision = APFloat::semanticsPrecision(FltSem);
+ const unsigned BitSize = VT.getScalarSizeInBits();
+
+ // TODO: Could introduce control flow and skip over the denormal handling.
+
+ // scale_up = fmul value, scalbn(1.0, precision + 1)
+ // extracted_exp = (bitcast value to uint) >> precision - 1
+ // biased_exp = extracted_exp + min_exp
+ // extracted_fract = (bitcast value to uint) & (fract_mask | sign_mask)
+ //
+ // is_denormal = val < smallest_normalized
+ // computed_fract = is_denormal ? scale_up : extracted_fract
+ // computed_exp = is_denormal ? biased_exp + (-precision - 1) : biased_exp
+ //
+ // result_0 = (!isfinite(val) || iszero(val)) ? val : computed_fract
+ // result_1 = (!isfinite(val) || iszero(val)) ? 0 : computed_exp
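+ //
+ // For IEEE single precision, for instance, Precision = 24 and MinExpVal is
+ // -126, so a denormal input is first multiplied by 0x1p+25 and its reported
+ // exponent is corrected by -25; e.g. frexp(8.0f) yields 0.5f and 4.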
+
+ SDValue NegSmallestNormalizedInt = DAG.getConstant(
+ APFloat::getSmallestNormalized(FltSem, true).bitcastToAPInt(), dl,
+ AsIntVT);
+
+ SDValue SmallestNormalizedInt = DAG.getConstant(
+ APFloat::getSmallestNormalized(FltSem, false).bitcastToAPInt(), dl,
+ AsIntVT);
+
+ // Mask that selects only the exponent bits.
+ SDValue ExpMask =
+ DAG.getConstant(APFloat::getInf(FltSem).bitcastToAPInt(), dl, AsIntVT);
+
+ // Mask out the exponent part of the value.
+ //
+ // e.g., for f32 FractSignMaskVal = 0x807fffff
+ APInt FractSignMaskVal = APInt::getBitsSet(BitSize, 0, Precision - 1);
+ FractSignMaskVal.setBit(BitSize - 1); // Set the sign bit
+
+ APInt SignMaskVal = APInt::getSignedMaxValue(BitSize);
+ SDValue SignMask = DAG.getConstant(SignMaskVal, dl, AsIntVT);
+
+ SDValue FractSignMask = DAG.getConstant(FractSignMaskVal, dl, AsIntVT);
+
+ const APFloat One(FltSem, "1.0");
+ // Scale a possible denormal input.
+ // e.g., for f64, 0x1p+54
+ APFloat ScaleUpKVal =
+ scalbn(One, Precision + 1, APFloat::rmNearestTiesToEven);
+
+ SDValue ScaleUpK = DAG.getConstantFP(ScaleUpKVal, dl, VT);
+ SDValue ScaleUp = DAG.getNode(ISD::FMUL, dl, VT, Val, ScaleUpK);
+
+ EVT SetCCVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ SDValue AsInt = DAG.getNode(ISD::BITCAST, dl, AsIntVT, Val);
+
+ SDValue Abs = DAG.getNode(ISD::AND, dl, AsIntVT, AsInt, SignMask);
+
+ SDValue AddNegSmallestNormal =
+ DAG.getNode(ISD::ADD, dl, AsIntVT, Abs, NegSmallestNormalizedInt);
+ SDValue DenormOrZero = DAG.getSetCC(dl, SetCCVT, AddNegSmallestNormal,
+ NegSmallestNormalizedInt, ISD::SETULE);
+
+ SDValue IsDenormal =
+ DAG.getSetCC(dl, SetCCVT, Abs, SmallestNormalizedInt, ISD::SETULT);
+
+ SDValue MinExp = DAG.getConstant(MinExpVal, dl, ExpVT);
+ SDValue Zero = DAG.getConstant(0, dl, ExpVT);
+
+ SDValue ScaledAsInt = DAG.getNode(ISD::BITCAST, dl, AsIntVT, ScaleUp);
+ SDValue ScaledSelect =
+ DAG.getNode(ISD::SELECT, dl, AsIntVT, IsDenormal, ScaledAsInt, AsInt);
+
+ SDValue ExpMaskScaled =
+ DAG.getNode(ISD::AND, dl, AsIntVT, ScaledAsInt, ExpMask);
+
+ SDValue ScaledValue =
+ DAG.getNode(ISD::SELECT, dl, AsIntVT, IsDenormal, ExpMaskScaled, Abs);
+
+ // Extract the exponent bits.
+ SDValue ExponentShiftAmt =
+ DAG.getShiftAmountConstant(Precision - 1, AsIntVT, dl);
+ SDValue ShiftedExp =
+ DAG.getNode(ISD::SRL, dl, AsIntVT, ScaledValue, ExponentShiftAmt);
+ SDValue Exp = DAG.getSExtOrTrunc(ShiftedExp, dl, ExpVT);
+
+ SDValue NormalBiasedExp = DAG.getNode(ISD::ADD, dl, ExpVT, Exp, MinExp);
+ SDValue DenormalOffset = DAG.getConstant(-Precision - 1, dl, ExpVT);
+ SDValue DenormalExpBias =
+ DAG.getNode(ISD::SELECT, dl, ExpVT, IsDenormal, DenormalOffset, Zero);
+
+ SDValue MaskedFractAsInt =
+ DAG.getNode(ISD::AND, dl, AsIntVT, ScaledSelect, FractSignMask);
+ const APFloat Half(FltSem, "0.5");
+ SDValue FPHalf = DAG.getConstant(Half.bitcastToAPInt(), dl, AsIntVT);
+ SDValue Or = DAG.getNode(ISD::OR, dl, AsIntVT, MaskedFractAsInt, FPHalf);
+ SDValue MaskedFract = DAG.getNode(ISD::BITCAST, dl, VT, Or);
+
+ SDValue ComputedExp =
+ DAG.getNode(ISD::ADD, dl, ExpVT, NormalBiasedExp, DenormalExpBias);
+
+ SDValue Result0 =
+ DAG.getNode(ISD::SELECT, dl, VT, DenormOrZero, Val, MaskedFract);
+
+ SDValue Result1 =
+ DAG.getNode(ISD::SELECT, dl, ExpVT, DenormOrZero, Zero, ComputedExp);
+
+ return DAG.getMergeValues({Result0, Result1}, dl);
+}
+
+/// This function is responsible for legalizing a
+/// INT_TO_FP operation of the specified operand when the target requests that
+/// we expand it. At this point, we know that the result and operand types are
+/// legal for the target.
+SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
+ SDValue &Chain) {
+ bool isSigned = (Node->getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Node->getOpcode() == ISD::SINT_TO_FP);
+ EVT DestVT = Node->getValueType(0);
+ SDLoc dl(Node);
+ unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0;
+ SDValue Op0 = Node->getOperand(OpNo);
+ EVT SrcVT = Op0.getValueType();
+
+ // TODO: Should any fast-math-flags be set for the created nodes?
+ LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n");
+ if (SrcVT == MVT::i32 && TLI.isTypeLegal(MVT::f64) &&
+ (DestVT.bitsLE(MVT::f64) ||
+ TLI.isOperationLegal(Node->isStrictFPOpcode() ? ISD::STRICT_FP_EXTEND
+ : ISD::FP_EXTEND,
+ DestVT))) {
+ LLVM_DEBUG(dbgs() << "32-bit [signed|unsigned] integer to float/double "
+ "expansion\n");
+
+ // Get the stack frame index of an 8-byte buffer.
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
+
+ SDValue Lo = Op0;
+ // If signed, map into the unsigned space.
+ if (isSigned) {
+ // Invert sign bit (signed to unsigned mapping).
+ Lo = DAG.getNode(ISD::XOR, dl, MVT::i32, Lo,
+ DAG.getConstant(0x80000000u, dl, MVT::i32));
+ }
+ // Initial hi portion of constructed double.
+ SDValue Hi = DAG.getConstant(0x43300000u, dl, MVT::i32);
+
+ // If this is a big-endian target, swap the lo and hi data.
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ SDValue MemChain = DAG.getEntryNode();
+
+ // Store the lo of the constructed double.
+ SDValue Store1 = DAG.getStore(MemChain, dl, Lo, StackSlot,
+ MachinePointerInfo());
+ // Store the hi of the constructed double.
+ SDValue HiPtr = DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
+ SDValue Store2 =
+ DAG.getStore(MemChain, dl, Hi, HiPtr, MachinePointerInfo());
+ MemChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+
+ // Load the constructed double.
+ SDValue Load =
+ DAG.getLoad(MVT::f64, dl, MemChain, StackSlot, MachinePointerInfo());
+ // FP constant to bias-correct the final result.
+ SDValue Bias = DAG.getConstantFP(
+ isSigned ? llvm::bit_cast<double>(0x4330000080000000ULL)
+ : llvm::bit_cast<double>(0x4330000000000000ULL),
+ dl, MVT::f64);
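+ // Why this works: with high word 0x43300000 and low word Lo, the buffer holds
+ // the double 2^52 + Lo exactly (any 32-bit value fits in the 52-bit
+ // significand), so subtracting 2^52 (or 2^52 + 2^31 in the sign-flipped
+ // signed case) leaves precisely the original integer value.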
+ // Subtract the bias and get the final result.
+ SDValue Sub;
+ SDValue Result;
+ if (Node->isStrictFPOpcode()) {
+ Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
+ {Node->getOperand(0), Load, Bias});
+ Chain = Sub.getValue(1);
+ if (DestVT != Sub.getValueType()) {
+ std::pair<SDValue, SDValue> ResultPair;
+ ResultPair =
+ DAG.getStrictFPExtendOrRound(Sub, Chain, dl, DestVT);
+ Result = ResultPair.first;
+ Chain = ResultPair.second;
+ }
+ else
+ Result = Sub;
+ } else {
+ Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias);
+ Result = DAG.getFPExtendOrRound(Sub, dl, DestVT);
+ }
+ return Result;
+ }
+
+ if (isSigned)
+ return SDValue();
+
+ // TODO: Generalize this for use with other types.
+ if (((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) ||
+ (SrcVT == MVT::i64 && DestVT == MVT::f64)) {
+ LLVM_DEBUG(dbgs() << "Converting unsigned i32/i64 to f32/f64\n");
+ // For unsigned conversions, convert them to signed conversions using the
+ // algorithm from the x86_64 __floatundisf in compiler_rt. That method
+ // should be valid for i32->f32 as well.
+
+ // More generally this transform should be valid if there are 3 more bits
+ // in the integer type than the significand. Rounding uses the first bit
+ // after the width of the significand and the OR of all bits after that. So
+ // we need to be able to OR the shifted out bit into one of the bits that
+ // participate in the OR.
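+ //
+ // Concretely, for an i64 input with the sign bit set, the code below keeps
+ // the low bit, shifts the value right by one, ORs the two together, performs
+ // a signed conversion of that halved value, and doubles the result with an
+ // fadd; inputs with a clear sign bit take the plain SINT_TO_FP path.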
+
+ // TODO: This really should be implemented using a branch rather than a
+ // select. We happen to get lucky and machinesink does the right
+ // thing most of the time. This would be a good candidate for a
+ // pseudo-op, or, even better, for whole-function isel.
+ EVT SetCCVT = getSetCCResultType(SrcVT);
+
+ SDValue SignBitTest = DAG.getSetCC(
+ dl, SetCCVT, Op0, DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
+
+ EVT ShiftVT = TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout());
+ SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
+ SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, ShiftConst);
+ SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
+ SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Op0, AndConst);
+ SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
+
+ SDValue Slow, Fast;
+ if (Node->isStrictFPOpcode()) {
+ // In strict mode, we must avoid spurious exceptions, and therefore
+ // must make sure to only emit a single STRICT_SINT_TO_FP.
+ SDValue InCvt = DAG.getSelect(dl, SrcVT, SignBitTest, Or, Op0);
+ Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other },
+ { Node->getOperand(0), InCvt });
+ Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other },
+ { Fast.getValue(1), Fast, Fast });
+ Chain = Slow.getValue(1);
+ // The STRICT_SINT_TO_FP inherits the exception mode from the
+ // incoming STRICT_UINT_TO_FP node; the STRICT_FADD node can
+ // never raise any exception.
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(Node->getFlags().hasNoFPExcept());
+ Fast->setFlags(Flags);
+ Flags.setNoFPExcept(true);
+ Slow->setFlags(Flags);
+ } else {
+ SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or);
+ Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt);
+ Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
+ }
+
+ return DAG.getSelect(dl, DestVT, SignBitTest, Slow, Fast);
+ }
+
+ // Don't expand it if there isn't cheap fadd.
+ if (!TLI.isOperationLegalOrCustom(
+ Node->isStrictFPOpcode() ? ISD::STRICT_FADD : ISD::FADD, DestVT))
+ return SDValue();
+
+ // The following optimization is valid only if every value in SrcVT (when
+ // treated as signed) is representable in DestVT. Check that the mantissa
+ // size of DestVT is at least the number of bits in SrcVT minus 1.
+ assert(APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(DestVT)) >=
+ SrcVT.getSizeInBits() - 1 &&
+ "Cannot perform lossless SINT_TO_FP!");
+
+ SDValue Tmp1;
+ if (Node->isStrictFPOpcode()) {
+ Tmp1 = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other },
+ { Node->getOperand(0), Op0 });
+ } else
+ Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
+
+ SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0,
+ DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
+ SDValue Zero = DAG.getIntPtrConstant(0, dl),
+ Four = DAG.getIntPtrConstant(4, dl);
+ SDValue CstOffset = DAG.getSelect(dl, Zero.getValueType(),
+ SignSet, Four, Zero);
+
+ // If the sign bit of the integer is set, the large number will be treated
+ // as a negative number. To counteract this, the dynamic code adds an
+ // offset depending on the data type.
+ uint64_t FF;
+ switch (SrcVT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::i8 : FF = 0x43800000ULL; break; // 2^8 (as a float)
+ case MVT::i16: FF = 0x47800000ULL; break; // 2^16 (as a float)
+ case MVT::i32: FF = 0x4F800000ULL; break; // 2^32 (as a float)
+ case MVT::i64: FF = 0x5F800000ULL; break; // 2^64 (as a float)
+ }
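+ // Each constant above is the IEEE single-precision bit pattern of the
+ // corresponding power of two; e.g. 2^32 has exponent field 32 + 127 = 0x9F
+ // and a zero significand, giving 0x4F800000.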
+ if (DAG.getDataLayout().isLittleEndian())
+ FF <<= 32;
+ Constant *FudgeFactor = ConstantInt::get(
+ Type::getInt64Ty(*DAG.getContext()), FF);
+
+ SDValue CPIdx =
+ DAG.getConstantPool(FudgeFactor, TLI.getPointerTy(DAG.getDataLayout()));
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
+ CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
+ Alignment = commonAlignment(Alignment, 4);
+ SDValue FudgeInReg;
+ if (DestVT == MVT::f32)
+ FudgeInReg = DAG.getLoad(
+ MVT::f32, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ else {
+ SDValue Load = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
+ Alignment);
+ HandleSDNode Handle(Load);
+ LegalizeOp(Load.getNode());
+ FudgeInReg = Handle.getValue();
+ }
+
+ if (Node->isStrictFPOpcode()) {
+ SDValue Result = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other },
+ { Tmp1.getValue(1), Tmp1, FudgeInReg });
+ Chain = Result.getValue(1);
+ return Result;
+ }
+
+ return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
+}
+
+/// This function is responsible for legalizing a
+/// *INT_TO_FP operation of the specified operand when the target requests that
+/// we promote it. At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP
+/// operation that takes a larger input.
+void SelectionDAGLegalize::PromoteLegalINT_TO_FP(
+ SDNode *N, const SDLoc &dl, SmallVectorImpl<SDValue> &Results) {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
+ N->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ EVT DestVT = N->getValueType(0);
+ SDValue LegalOp = N->getOperand(IsStrict ? 1 : 0);
+ unsigned UIntOp = IsStrict ? ISD::STRICT_UINT_TO_FP : ISD::UINT_TO_FP;
+ unsigned SIntOp = IsStrict ? ISD::STRICT_SINT_TO_FP : ISD::SINT_TO_FP;
+
+ // First step, figure out the appropriate *INT_TO_FP operation to use.
+ EVT NewInTy = LegalOp.getValueType();
+
+ unsigned OpToUse = 0;
+
+ // Scan for the appropriate larger type to use.
+ while (true) {
+ NewInTy = (MVT::SimpleValueType)(NewInTy.getSimpleVT().SimpleTy+1);
+ assert(NewInTy.isInteger() && "Ran out of possibilities!");
+
+ // If the target supports SINT_TO_FP of this type, use it.
+ if (TLI.isOperationLegalOrCustom(SIntOp, NewInTy)) {
+ OpToUse = SIntOp;
+ break;
+ }
+ if (IsSigned)
+ continue;
+
+ // If the target supports UINT_TO_FP of this type, use it.
+ if (TLI.isOperationLegalOrCustom(UIntOp, NewInTy)) {
+ OpToUse = UIntOp;
+ break;
+ }
+
+ // Otherwise, try a larger type.
+ }
+
+ // Okay, we found the operation and type to use. Zero extend our input to the
+ // desired type then run the operation on it.
+ if (IsStrict) {
+ SDValue Res =
+ DAG.getNode(OpToUse, dl, {DestVT, MVT::Other},
+ {N->getOperand(0),
+ DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+ dl, NewInTy, LegalOp)});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
+ Results.push_back(
+ DAG.getNode(OpToUse, dl, DestVT,
+ DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+ dl, NewInTy, LegalOp)));
+}
+
+/// This function is responsible for legalizing a
+/// FP_TO_*INT operation of the specified operand when the target requests that
+/// we promote it. At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT
+/// operation that returns a larger result.
+void SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDNode *N, const SDLoc &dl,
+ SmallVectorImpl<SDValue> &Results) {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT;
+ EVT DestVT = N->getValueType(0);
+ SDValue LegalOp = N->getOperand(IsStrict ? 1 : 0);
+ // First step, figure out the appropriate FP_TO*INT operation to use.
+ EVT NewOutTy = DestVT;
+
+ unsigned OpToUse = 0;
+
+ // Scan for the appropriate larger type to use.
+ while (true) {
+ NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT().SimpleTy+1);
+ assert(NewOutTy.isInteger() && "Ran out of possibilities!");
+
+ // A larger signed type can hold all unsigned values of the requested type,
+ // so using FP_TO_SINT is valid.
+ OpToUse = IsStrict ? ISD::STRICT_FP_TO_SINT : ISD::FP_TO_SINT;
+ if (TLI.isOperationLegalOrCustom(OpToUse, NewOutTy))
+ break;
+
+ // However, if the value may be < 0.0, we *must* use some FP_TO_SINT.
+ OpToUse = IsStrict ? ISD::STRICT_FP_TO_UINT : ISD::FP_TO_UINT;
+ if (!IsSigned && TLI.isOperationLegalOrCustom(OpToUse, NewOutTy))
+ break;
+
+ // Otherwise, try a larger type.
+ }
+
+ // Okay, we found the operation and type to use.
+ SDValue Operation;
+ if (IsStrict) {
+ SDVTList VTs = DAG.getVTList(NewOutTy, MVT::Other);
+ Operation = DAG.getNode(OpToUse, dl, VTs, N->getOperand(0), LegalOp);
+ } else
+ Operation = DAG.getNode(OpToUse, dl, NewOutTy, LegalOp);
+
+ // Truncate the result of the extended FP_TO_*INT operation to the desired
+ // size.
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
+ Results.push_back(Trunc);
+ if (IsStrict)
+ Results.push_back(Operation.getValue(1));
+}
+
+/// Promote FP_TO_*INT_SAT operation to a larger result type. At this point
+/// the result and operand types are legal and there must be a legal
+/// FP_TO_*INT_SAT operation for a larger result type.
+SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT_SAT(SDNode *Node,
+ const SDLoc &dl) {
+ unsigned Opcode = Node->getOpcode();
+
+ // Scan for the appropriate larger type to use.
+ EVT NewOutTy = Node->getValueType(0);
+ while (true) {
+ NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT().SimpleTy + 1);
+ assert(NewOutTy.isInteger() && "Ran out of possibilities!");
+
+ if (TLI.isOperationLegalOrCustom(Opcode, NewOutTy))
+ break;
+ }
+
+ // Saturation width is determined by the second operand, so we don't have to
+ // perform any fixup and can directly truncate the result.
+ SDValue Result = DAG.getNode(Opcode, dl, NewOutTy, Node->getOperand(0),
+ Node->getOperand(1));
+ return DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Result);
+}
+
+ /// Open code the operations for PARITY of the specified value.
+SDValue SelectionDAGLegalize::ExpandPARITY(SDValue Op, const SDLoc &dl) {
+ EVT VT = Op.getValueType();
+ EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ unsigned Sz = VT.getScalarSizeInBits();
+
+ // If CTPOP is legal, use it. Otherwise use shifts and xor.
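+ // The fallback folds the value onto itself, e.g. for a 32-bit value v it
+ // emits v ^= v >> 16; v ^= v >> 8; v ^= v >> 4; v ^= v >> 2; v ^= v >> 1;
+ // the final AND with 1 below then extracts the accumulated parity bit.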
+ SDValue Result;
+ if (TLI.isOperationLegalOrPromote(ISD::CTPOP, VT)) {
+ Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
+ } else {
+ Result = Op;
+ for (unsigned i = Log2_32_Ceil(Sz); i != 0;) {
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, VT, Result,
+ DAG.getConstant(1ULL << (--i), dl, ShVT));
+ Result = DAG.getNode(ISD::XOR, dl, VT, Result, Shift);
+ }
+ }
+
+ return DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(1, dl, VT));
+}
+
+bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
+ LLVM_DEBUG(dbgs() << "Trying to expand node\n");
+ SmallVector<SDValue, 8> Results;
+ SDLoc dl(Node);
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+ bool NeedInvert;
+ switch (Node->getOpcode()) {
+ case ISD::ABS:
+ if ((Tmp1 = TLI.expandABS(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::ABDS:
+ case ISD::ABDU:
+ if ((Tmp1 = TLI.expandABD(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::CTPOP:
+ if ((Tmp1 = TLI.expandCTPOP(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ if ((Tmp1 = TLI.expandCTLZ(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ if ((Tmp1 = TLI.expandCTTZ(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::BITREVERSE:
+ if ((Tmp1 = TLI.expandBITREVERSE(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::BSWAP:
+ if ((Tmp1 = TLI.expandBSWAP(Node, DAG)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::PARITY:
+ Results.push_back(ExpandPARITY(Node->getOperand(0), dl));
+ break;
+ case ISD::FRAMEADDR:
+ case ISD::RETURNADDR:
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
+ break;
+ case ISD::EH_DWARF_CFA: {
+ SDValue CfaArg = DAG.getSExtOrTrunc(Node->getOperand(0), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Offset = DAG.getNode(ISD::ADD, dl,
+ CfaArg.getValueType(),
+ DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, dl,
+ CfaArg.getValueType()),
+ CfaArg);
+ SDValue FA = DAG.getNode(
+ ISD::FRAMEADDR, dl, TLI.getPointerTy(DAG.getDataLayout()),
+ DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout())));
+ Results.push_back(DAG.getNode(ISD::ADD, dl, FA.getValueType(),
+ FA, Offset));
+ break;
+ }
+ case ISD::GET_ROUNDING:
+ Results.push_back(DAG.getConstant(1, dl, Node->getValueType(0)));
+ Results.push_back(Node->getOperand(0));
+ break;
+ case ISD::EH_RETURN:
+ case ISD::EH_LABEL:
+ case ISD::PREFETCH:
+ case ISD::VAEND:
+ case ISD::EH_SJLJ_LONGJMP:
+ // If the target didn't expand these, there's nothing to do, so just
+ // preserve the chain and be done.
+ Results.push_back(Node->getOperand(0));
+ break;
+ case ISD::READCYCLECOUNTER:
+ // If the target didn't expand this, just return 'zero' and preserve the
+ // chain.
+ Results.append(Node->getNumValues() - 1,
+ DAG.getConstant(0, dl, Node->getValueType(0)));
+ Results.push_back(Node->getOperand(0));
+ break;
+ case ISD::EH_SJLJ_SETJMP:
+ // If the target didn't expand this, just return 'zero' and preserve the
+ // chain.
+ Results.push_back(DAG.getConstant(0, dl, MVT::i32));
+ Results.push_back(Node->getOperand(0));
+ break;
+ case ISD::ATOMIC_LOAD: {
+ // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP.
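+ // A cmpxchg of (expected = 0, new = 0) returns the current memory value
+ // without ever changing it: if the value is 0 it stores 0 back, otherwise
+ // it stores nothing, so either way the load result is what we want.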
+ SDValue Zero = DAG.getConstant(0, dl, Node->getValueType(0));
+ SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
+ SDValue Swap = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
+ Node->getOperand(0), Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand());
+ Results.push_back(Swap.getValue(0));
+ Results.push_back(Swap.getValue(1));
+ break;
+ }
+ case ISD::ATOMIC_STORE: {
+ // There is no libcall for atomic store; fake it with ATOMIC_SWAP.
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ cast<AtomicSDNode>(Node)->getMemOperand());
+ Results.push_back(Swap.getValue(1));
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ // Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and
+ // splits out the success value as a comparison. Expanding the resulting
+ // ATOMIC_CMP_SWAP will produce a libcall.
+ SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
+ Node->getOperand(0), Node->getOperand(1), Node->getOperand(2),
+ Node->getOperand(3), cast<MemSDNode>(Node)->getMemOperand());
+
+ SDValue ExtRes = Res;
+ SDValue LHS = Res;
+ SDValue RHS = Node->getOperand(1);
+
+ EVT AtomicType = cast<AtomicSDNode>(Node)->getMemoryVT();
+ EVT OuterType = Node->getValueType(0);
+ switch (TLI.getExtendForAtomicOps()) {
+ case ISD::SIGN_EXTEND:
+ LHS = DAG.getNode(ISD::AssertSext, dl, OuterType, Res,
+ DAG.getValueType(AtomicType));
+ RHS = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OuterType,
+ Node->getOperand(2), DAG.getValueType(AtomicType));
+ ExtRes = LHS;
+ break;
+ case ISD::ZERO_EXTEND:
+ LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res,
+ DAG.getValueType(AtomicType));
+ RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
+ ExtRes = LHS;
+ break;
+ case ISD::ANY_EXTEND:
+ LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType);
+ RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
+ break;
+ default:
+ llvm_unreachable("Invalid atomic op extension");
+ }
+
+ SDValue Success =
+ DAG.getSetCC(dl, Node->getValueType(1), LHS, RHS, ISD::SETEQ);
+
+ Results.push_back(ExtRes.getValue(0));
+ Results.push_back(Success);
+ Results.push_back(Res.getValue(1));
+ break;
+ }
+ case ISD::DYNAMIC_STACKALLOC:
+ ExpandDYNAMIC_STACKALLOC(Node, Results);
+ break;
+ case ISD::MERGE_VALUES:
+ for (unsigned i = 0; i < Node->getNumValues(); i++)
+ Results.push_back(Node->getOperand(i));
+ break;
+ case ISD::UNDEF: {
+ EVT VT = Node->getValueType(0);
+ if (VT.isInteger())
+ Results.push_back(DAG.getConstant(0, dl, VT));
+ else {
+ assert(VT.isFloatingPoint() && "Unknown value type!");
+ Results.push_back(DAG.getConstantFP(0, dl, VT));
+ }
+ break;
+ }
+ case ISD::STRICT_FP_ROUND:
+ // When strict mode is enforced we can't do this expansion because it
+ // does not honor the "strict" properties. Only a libcall is allowed.
+ if (TLI.isStrictFPEnabled())
+ break;
+ // We might as well mutate to FP_ROUND when the FP_ROUND operation is
+ // legal, since it is more efficient than going through the stack.
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0))
+ == TargetLowering::Legal)
+ break;
+ // Fall back to a stack-based conversion when the FP_ROUND operation
+ // isn't available.
+ if ((Tmp1 = EmitStackConvert(Node->getOperand(1), Node->getValueType(0),
+ Node->getValueType(0), dl,
+ Node->getOperand(0)))) {
+ ReplaceNode(Node, Tmp1.getNode());
+ LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_ROUND node\n");
+ return true;
+ }
+ break;
+ case ISD::FP_ROUND:
+ case ISD::BITCAST:
+ if ((Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
+ Node->getValueType(0), dl)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::STRICT_FP_EXTEND:
+ // When strict mode is enforced we can't do this expansion because it
+ // does not honor the "strict" properties. Only a libcall is allowed.
+ if (TLI.isStrictFPEnabled())
+ break;
+ // We might as well mutate to FP_EXTEND when the FP_EXTEND operation is
+ // legal, since it is more efficient than going through the stack.
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0))
+ == TargetLowering::Legal)
+ break;
+ // Fall back to a stack-based conversion when the FP_EXTEND operation
+ // isn't available.
+ if ((Tmp1 = EmitStackConvert(
+ Node->getOperand(1), Node->getOperand(1).getValueType(),
+ Node->getValueType(0), dl, Node->getOperand(0)))) {
+ ReplaceNode(Node, Tmp1.getNode());
+ LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_EXTEND node\n");
+ return true;
+ }
+ break;
+ case ISD::FP_EXTEND:
+ if ((Tmp1 = EmitStackConvert(Node->getOperand(0),
+ Node->getOperand(0).getValueType(),
+ Node->getValueType(0), dl)))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::BF16_TO_FP: {
+ // Always expand bf16 to f32 casts, they lower to ext + shift.
+ //
+ // Note that the operand of this node can be bf16, or an integer type if
+ // bf16 is not supported on the target and was softened.
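+ // bf16 is the upper half of an f32 bit pattern, so the expansion below is:
+ // bitcast the bf16 to i16, extend to i32, shift left by 16, and bitcast the
+ // result to f32. For instance, the bf16 pattern 0x3F80 (1.0) becomes the
+ // i32 0x3F800000, which is exactly 1.0f.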
+ SDValue Op = Node->getOperand(0);
+ if (Op.getValueType() == MVT::bf16) {
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::i16, Op));
+ } else {
+ Op = DAG.getAnyExtOrTrunc(Op, dl, MVT::i32);
+ }
+ Op = DAG.getNode(
+ ISD::SHL, dl, MVT::i32, Op,
+ DAG.getConstant(16, dl,
+ TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op);
+ // Add fp_extend in case the output is bigger than f32.
+ if (Node->getValueType(0) != MVT::f32)
+ Op = DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Op);
+ Results.push_back(Op);
+ break;
+ }
+ case ISD::FP_TO_BF16: {
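+ // The reverse direction: a bf16 bit pattern is the top 16 bits of the
+ // corresponding f32, so this expansion simply shifts the f32 bits right by
+ // 16. Note that this simple expansion truncates the discarded mantissa bits
+ // rather than rounding them.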
+ SDValue Op = Node->getOperand(0);
+ if (Op.getValueType() != MVT::f32)
+ Op = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+ Op = DAG.getNode(
+ ISD::SRL, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op),
+ DAG.getConstant(16, dl,
+ TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+ // The result of this node can be bf16 or an integer type in case bf16 is
+ // not supported on the target and was softened to i16 for storage.
+ if (Node->getValueType(0) == MVT::bf16) {
+ Op = DAG.getNode(ISD::BITCAST, dl, MVT::bf16,
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Op));
+ } else {
+ Op = DAG.getAnyExtOrTrunc(Op, dl, Node->getValueType(0));
+ }
+ Results.push_back(Op);
+ break;
+ }
+ case ISD::SIGN_EXTEND_INREG: {
+ EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ EVT VT = Node->getValueType(0);
+
+ // An in-register sign-extend of a boolean is a negation:
+ // 'true' (1) sign-extended is -1.
+ // 'false' (0) sign-extended is 0.
+ // However, we must mask the high bits of the source operand because the
+ // SIGN_EXTEND_INREG does not guarantee that the high bits are already zero.
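+ // In other words, the expansion below computes 0 - (Op & 1), which is a
+ // small sketch of: (Op & 1) == 1 ? -1 : 0.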
+
+ // TODO: Do this for vectors too?
+ if (ExtraVT.isScalarInteger() && ExtraVT.getSizeInBits() == 1) {
+ SDValue One = DAG.getConstant(1, dl, VT);
+ SDValue And = DAG.getNode(ISD::AND, dl, VT, Node->getOperand(0), One);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue Neg = DAG.getNode(ISD::SUB, dl, VT, Zero, And);
+ Results.push_back(Neg);
+ break;
+ }
+
+ // NOTE: we could fall back on load/store here too for targets without
+ // SRA. However, it is doubtful that any exist.
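+ // The general expansion is a shift pair; e.g. sign-extending the low 8 bits
+ // of an i32 becomes (X << 24) >>s 24 (arithmetic shift right).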
+ EVT ShiftAmountTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ unsigned BitsDiff = VT.getScalarSizeInBits() -
+ ExtraVT.getScalarSizeInBits();
+ SDValue ShiftCst = DAG.getConstant(BitsDiff, dl, ShiftAmountTy);
+ Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
+ Node->getOperand(0), ShiftCst);
+ Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ if (TLI.expandUINT_TO_FP(Node, Tmp1, Tmp2, DAG)) {
+ Results.push_back(Tmp1);
+ if (Node->isStrictFPOpcode())
+ Results.push_back(Tmp2);
+ break;
+ }
+ [[fallthrough]];
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ if ((Tmp1 = ExpandLegalINT_TO_FP(Node, Tmp2))) {
+ Results.push_back(Tmp1);
+ if (Node->isStrictFPOpcode())
+ Results.push_back(Tmp2);
+ }
+ break;
+ case ISD::FP_TO_SINT:
+ if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::STRICT_FP_TO_SINT:
+ if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG)) {
+ ReplaceNode(Node, Tmp1.getNode());
+ LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_TO_SINT node\n");
+ return true;
+ }
+ break;
+ case ISD::FP_TO_UINT:
+ if (TLI.expandFP_TO_UINT(Node, Tmp1, Tmp2, DAG))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::STRICT_FP_TO_UINT:
+ if (TLI.expandFP_TO_UINT(Node, Tmp1, Tmp2, DAG)) {
+ // Relink the chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node,1), Tmp2);
+ // Replace the new UINT result.
+ ReplaceNodeWithValue(SDValue(Node, 0), Tmp1);
+ LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_TO_UINT node\n");
+ return true;
+ }
+ break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Results.push_back(TLI.expandFP_TO_INT_SAT(Node, DAG));
+ break;
+ case ISD::VAARG:
+ Results.push_back(DAG.expandVAArg(Node));
+ Results.push_back(Results[0].getValue(1));
+ break;
+ case ISD::VACOPY:
+ Results.push_back(DAG.expandVACopy(Node));
+ break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (Node->getOperand(0).getValueType().getVectorNumElements() == 1)
+ // This must be an access of the only element. Return it.
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0),
+ Node->getOperand(0));
+ else
+ Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0));
+ Results.push_back(Tmp1);
+ break;
+ case ISD::EXTRACT_SUBVECTOR:
+ Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0)));
+ break;
+ case ISD::INSERT_SUBVECTOR:
+ Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
+ break;
+ case ISD::CONCAT_VECTORS:
+ Results.push_back(ExpandVectorBuildThroughStack(Node));
+ break;
+ case ISD::SCALAR_TO_VECTOR:
+ Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
+ break;
+ case ISD::INSERT_VECTOR_ELT:
+ Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0),
+ Node->getOperand(1),
+ Node->getOperand(2), dl));
+ break;
+ case ISD::VECTOR_SHUFFLE: {
+ SmallVector<int, 32> NewMask;
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();
+
+ EVT VT = Node->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ if (!TLI.isTypeLegal(EltVT)) {
+ EVT NewEltVT = TLI.getTypeToTransformTo(*DAG.getContext(), EltVT);
+
+ // BUILD_VECTOR operands are allowed to be wider than the element type.
+ // But if NewEltVT is smaller than EltVT, the BUILD_VECTOR does not
+ // accept it.
+ if (NewEltVT.bitsLT(EltVT)) {
+ // Convert shuffle node.
+ // If original node was v4i64 and the new EltVT is i32,
+ // cast operands to v8i32 and re-build the mask.
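+ // For example, with factor 2 a v4i64 mask of <1, -1, 3, 0> would become
+ // the v8i32 mask <2, 3, -1, -1, 6, 7, 0, 1>.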
+
+ // Calculate the new VT; its size should equal that of the original VT.
+ EVT NewVT =
+ EVT::getVectorVT(*DAG.getContext(), NewEltVT,
+ VT.getSizeInBits() / NewEltVT.getSizeInBits());
+ assert(NewVT.bitsEq(VT));
+
+ // cast operands to new VT
+ Op0 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op1);
+
+ // Convert the shuffle mask
+ unsigned int factor =
+ NewVT.getVectorNumElements()/VT.getVectorNumElements();
+
+ // EltVT gets smaller
+ assert(factor > 0);
+
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+ if (Mask[i] < 0) {
+ for (unsigned fi = 0; fi < factor; ++fi)
+ NewMask.push_back(Mask[i]);
+ }
+ else {
+ for (unsigned fi = 0; fi < factor; ++fi)
+ NewMask.push_back(Mask[i]*factor+fi);
+ }
+ }
+ Mask = NewMask;
+ VT = NewVT;
+ }
+ EltVT = NewEltVT;
+ }
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (Mask[i] < 0) {
+ Ops.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+ unsigned Idx = Mask[i];
+ if (Idx < NumElems)
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getVectorIdxConstant(Idx, dl)));
+ else
+ Ops.push_back(
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op1,
+ DAG.getVectorIdxConstant(Idx - NumElems, dl)));
+ }
+
+ Tmp1 = DAG.getBuildVector(VT, dl, Ops);
+ // We may have changed the BUILD_VECTOR type. Cast it back to the Node type.
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::VECTOR_SPLICE: {
+ Results.push_back(TLI.expandVectorSplice(Node, DAG));
+ break;
+ }
+ case ISD::EXTRACT_ELEMENT: {
+ EVT OpTy = Node->getOperand(0).getValueType();
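+ // EXTRACT_ELEMENT selects the low (index 0) or high (index 1) half of a
+ // double-wide integer; e.g. the high i32 of an i64 X is trunc(X >>u 32).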
+ if (Node->getConstantOperandVal(1)) {
+ // 1 -> Hi
+ Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
+ DAG.getConstant(OpTy.getSizeInBits() / 2, dl,
+ TLI.getShiftAmountTy(
+ Node->getOperand(0).getValueType(),
+ DAG.getDataLayout())));
+ Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
+ } else {
+ // 0 -> Lo
+ Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0),
+ Node->getOperand(0));
+ }
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::STACKSAVE:
+ // Expand to CopyFromReg if the target set
+ // StackPointerRegisterToSaveRestore.
+ if (Register SP = TLI.getStackPointerRegisterToSaveRestore()) {
+ Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, SP,
+ Node->getValueType(0)));
+ Results.push_back(Results[0].getValue(1));
+ } else {
+ Results.push_back(DAG.getUNDEF(Node->getValueType(0)));
+ Results.push_back(Node->getOperand(0));
+ }
+ break;
+ case ISD::STACKRESTORE:
+ // Expand to CopyToReg if the target set
+ // StackPointerRegisterToSaveRestore.
+ if (Register SP = TLI.getStackPointerRegisterToSaveRestore()) {
+ Results.push_back(DAG.getCopyToReg(Node->getOperand(0), dl, SP,
+ Node->getOperand(1)));
+ } else {
+ Results.push_back(Node->getOperand(0));
+ }
+ break;
+ case ISD::GET_DYNAMIC_AREA_OFFSET:
+ Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
+ Results.push_back(Results[0].getValue(0));
+ break;
+ case ISD::FCOPYSIGN:
+ Results.push_back(ExpandFCOPYSIGN(Node));
+ break;
+ case ISD::FNEG:
+ Results.push_back(ExpandFNEG(Node));
+ break;
+ case ISD::FABS:
+ Results.push_back(ExpandFABS(Node));
+ break;
+ case ISD::IS_FPCLASS: {
+ auto CNode = cast<ConstantSDNode>(Node->getOperand(1));
+ auto Test = static_cast<FPClassTest>(CNode->getZExtValue());
+ if (SDValue Expanded =
+ TLI.expandIS_FPCLASS(Node->getValueType(0), Node->getOperand(0),
+ Test, Node->getFlags(), SDLoc(Node), DAG))
+ Results.push_back(Expanded);
+ break;
+ }
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX: {
+ // Expand Y = MAX(A, B) -> Y = (A > B) ? A : B
+ ISD::CondCode Pred;
+ switch (Node->getOpcode()) {
+ default: llvm_unreachable("How did we get here?");
+ case ISD::SMAX: Pred = ISD::SETGT; break;
+ case ISD::SMIN: Pred = ISD::SETLT; break;
+ case ISD::UMAX: Pred = ISD::SETUGT; break;
+ case ISD::UMIN: Pred = ISD::SETULT; break;
+ }
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp1, Tmp2, Pred);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: {
+ if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG))
+ Results.push_back(Expanded);
+ break;
+ }
+ case ISD::FSIN:
+ case ISD::FCOS: {
+ EVT VT = Node->getValueType(0);
+ // Turn fsin / fcos into an ISD::FSINCOS node if there is a pair of fsin /
+ // fcos which share the same operand and both are used.
+ if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) ||
+ isSinCosLibcallAvailable(Node, TLI))
+ && useSinCos(Node)) {
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0));
+ if (Node->getOpcode() == ISD::FCOS)
+ Tmp1 = Tmp1.getValue(1);
+ Results.push_back(Tmp1);
+ }
+ break;
+ }
+ case ISD::FLDEXP:
+ case ISD::STRICT_FLDEXP: {
+ EVT VT = Node->getValueType(0);
+ RTLIB::Libcall LC = RTLIB::getLDEXP(VT);
+ // Use the LibCall instead; it is very likely faster.
+ // FIXME: Use separate LibCall action.
+ if (TLI.getLibcallName(LC))
+ break;
+
+ if (SDValue Expanded = expandLdexp(Node)) {
+ Results.push_back(Expanded);
+ if (Node->getOpcode() == ISD::STRICT_FLDEXP)
+ Results.push_back(Expanded.getValue(1));
+ }
+
+ break;
+ }
+ case ISD::FFREXP: {
+ RTLIB::Libcall LC = RTLIB::getFREXP(Node->getValueType(0));
+ // Use the LibCall instead; it is very likely faster.
+ // FIXME: Use separate LibCall action.
+ if (TLI.getLibcallName(LC))
+ break;
+
+ if (SDValue Expanded = expandFrexp(Node)) {
+ Results.push_back(Expanded);
+ Results.push_back(Expanded.getValue(1));
+ }
+ break;
+ }
+ case ISD::FMAD:
+ llvm_unreachable("Illegal fmad should never be formed");
+
+ case ISD::FP16_TO_FP:
+ if (Node->getValueType(0) != MVT::f32) {
+ // We can extend to types bigger than f32 in two steps without changing
+ // the result. Since "f16 -> f32" is much more commonly available, give
+ // CodeGen the option of emitting that before resorting to a libcall.
+ SDValue Res =
+ DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0));
+ Results.push_back(
+ DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res));
+ }
+ break;
+ case ISD::STRICT_FP16_TO_FP:
+ if (Node->getValueType(0) != MVT::f32) {
+ // We can extend to types bigger than f32 in two steps without changing
+ // the result. Since "f16 -> f32" is much more commonly available, give
+ // CodeGen the option of emitting that before resorting to a libcall.
+ SDValue Res =
+ DAG.getNode(ISD::STRICT_FP16_TO_FP, dl, {MVT::f32, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(1)});
+ Res = DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
+ {Node->getValueType(0), MVT::Other},
+ {Res.getValue(1), Res});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ }
+ break;
+ case ISD::FP_TO_FP16:
+ LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
+ if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
+ SDValue Op = Node->getOperand(0);
+ MVT SVT = Op.getSimpleValueType();
+ if ((SVT == MVT::f64 || SVT == MVT::f80) &&
+ TLI.isOperationLegalOrCustom(ISD::FP_TO_FP16, MVT::f32)) {
+ // Under fastmath, we can expand this node into a fround followed by
+ // a float-half conversion.
+ SDValue FloatVal =
+ DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+ Results.push_back(
+ DAG.getNode(ISD::FP_TO_FP16, dl, Node->getValueType(0), FloatVal));
+ }
+ }
+ break;
+ case ISD::ConstantFP: {
+ ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
+ // Check to see if this FP immediate is already legal.
+ // If this is a legal constant, turn it into a TargetConstantFP node.
+ if (!TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0),
+ DAG.shouldOptForSize()))
+ Results.push_back(ExpandConstantFP(CFP, true));
+ break;
+ }
+ case ISD::Constant: {
+ ConstantSDNode *CP = cast<ConstantSDNode>(Node);
+ Results.push_back(ExpandConstant(CP));
+ break;
+ }
+ case ISD::FSUB: {
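+ // Expand a - b as a + (-b) when both FADD and FNEG are available.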
+ EVT VT = Node->getValueType(0);
+ if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) {
+ const SDNodeFlags Flags = Node->getFlags();
+ Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1));
+ Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags);
+ Results.push_back(Tmp1);
+ }
+ break;
+ }
+ case ISD::SUB: {
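+ // Expand a - b as a + (~b + 1), i.e. add the two's-complement negation of b.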
+ EVT VT = Node->getValueType(0);
+ assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
+ "Don't know how to expand this subtraction!");
+ Tmp1 = DAG.getNOT(dl, Node->getOperand(1), VT);
+ Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, dl, VT));
+ Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
+ break;
+ }
+ case ISD::UREM:
+ case ISD::SREM:
+ if (TLI.expandREM(Node, Tmp1, DAG))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::UDIV:
+ case ISD::SDIV: {
+ bool isSigned = Node->getOpcode() == ISD::SDIV;
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+ EVT VT = Node->getValueType(0);
+ if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1));
+ Results.push_back(Tmp1);
+ }
+ break;
+ }
+ case ISD::MULHU:
+ case ISD::MULHS: {
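+ // MULHU/MULHS produce only the high half of the product, so expand to the
+ // corresponding *MUL_LOHI node and take result value 1 (the high part).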
+ unsigned ExpandOpcode =
+ Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI : ISD::SMUL_LOHI;
+ EVT VT = Node->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, VT);
+
+ Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1));
+ Results.push_back(Tmp1.getValue(1));
+ break;
+ }
+ case ISD::UMUL_LOHI:
+ case ISD::SMUL_LOHI: {
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ MVT VT = LHS.getSimpleValueType();
+ unsigned MULHOpcode =
+ Node->getOpcode() == ISD::UMUL_LOHI ? ISD::MULHU : ISD::MULHS;
+
+ if (TLI.isOperationLegalOrCustom(MULHOpcode, VT)) {
+ Results.push_back(DAG.getNode(ISD::MUL, dl, VT, LHS, RHS));
+ Results.push_back(DAG.getNode(MULHOpcode, dl, VT, LHS, RHS));
+ break;
+ }
+
+ SmallVector<SDValue, 4> Halves;
+ EVT HalfType = EVT(VT).getHalfSizedIntegerVT(*DAG.getContext());
+ assert(TLI.isTypeLegal(HalfType));
+ if (TLI.expandMUL_LOHI(Node->getOpcode(), VT, dl, LHS, RHS, Halves,
+ HalfType, DAG,
+ TargetLowering::MulExpansionKind::Always)) {
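+ // Each full-width result is rebuilt from a (lo, hi) pair of half-width
+ // values: zero-extend the low piece, any-extend the high piece, shift it
+ // up by the half width, and OR the two together.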
+ for (unsigned i = 0; i < 2; ++i) {
+ SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Halves[2 * i]);
+ SDValue Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Halves[2 * i + 1]);
+ SDValue Shift = DAG.getConstant(
+ HalfType.getScalarSizeInBits(), dl,
+ TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
+ Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
+ Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
+ }
+ break;
+ }
+ break;
+ }
+ case ISD::MUL: {
+ EVT VT = Node->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ // See if the multiply can be lowered using a two-result operation.
+ // We just need the low half of the multiply; try both the signed
+ // and unsigned forms. If the target supports both SMUL_LOHI and
+ // UMUL_LOHI, prefer the form whose plain MULH is *not* supported,
+ // since that two-result form is the one that is genuinely needed.
+ bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, VT);
+ bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, VT);
+ bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, VT);
+ bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, VT);
+ unsigned OpToUse = 0;
+ if (HasSMUL_LOHI && !HasMULHS) {
+ OpToUse = ISD::SMUL_LOHI;
+ } else if (HasUMUL_LOHI && !HasMULHU) {
+ OpToUse = ISD::UMUL_LOHI;
+ } else if (HasSMUL_LOHI) {
+ OpToUse = ISD::SMUL_LOHI;
+ } else if (HasUMUL_LOHI) {
+ OpToUse = ISD::UMUL_LOHI;
+ }
+ if (OpToUse) {
+ Results.push_back(DAG.getNode(OpToUse, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1)));
+ break;
+ }
+
+ SDValue Lo, Hi;
+ EVT HalfType = VT.getHalfSizedIntegerVT(*DAG.getContext());
+ if (TLI.isOperationLegalOrCustom(ISD::ZERO_EXTEND, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::OR, VT) &&
+ TLI.expandMUL(Node, Lo, Hi, HalfType, DAG,
+ TargetLowering::MulExpansionKind::OnlyLegalOrCustom)) {
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi);
+ SDValue Shift =
+ DAG.getConstant(HalfType.getSizeInBits(), dl,
+ TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
+ Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
+ Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
+ }
+ break;
+ }
+ case ISD::FSHL:
+ case ISD::FSHR:
+ if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG))
+ Results.push_back(Expanded);
+ break;
+ case ISD::ROTL:
+ case ISD::ROTR:
+ if (SDValue Expanded = TLI.expandROT(Node, true /*AllowVectorOps*/, DAG))
+ Results.push_back(Expanded);
+ break;
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ Results.push_back(TLI.expandAddSubSat(Node, DAG));
+ break;
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
+ Results.push_back(TLI.expandShlSat(Node, DAG));
+ break;
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
+ Results.push_back(TLI.expandFixedPointMul(Node, DAG));
+ break;
+ case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
+ if (SDValue V = TLI.expandFixedPointDiv(Node->getOpcode(), SDLoc(Node),
+ Node->getOperand(0),
+ Node->getOperand(1),
+ Node->getConstantOperandVal(2),
+ DAG)) {
+ Results.push_back(V);
+ break;
+ }
+ // FIXME: We might want to retry here with a wider type if we fail, if that
+ // type is legal.
+ // FIXME: Technically, so long as we only have sdivfixes where BW+Scale is
+ // <= 128 (which is the case for all of the default Embedded-C types),
+ // we will only get here with types and scales that we could always expand
+ // if we were allowed to generate libcalls to division functions of illegal
+ // type. But we cannot do that.
+ llvm_unreachable("Cannot expand DIVFIX!");
+ case ISD::UADDO_CARRY:
+ case ISD::USUBO_CARRY: {
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDValue Carry = Node->getOperand(2);
+
+ bool IsAdd = Node->getOpcode() == ISD::UADDO_CARRY;
+
+ // Initial add of the 2 operands.
+ unsigned Op = IsAdd ? ISD::ADD : ISD::SUB;
+ EVT VT = LHS.getValueType();
+ SDValue Sum = DAG.getNode(Op, dl, VT, LHS, RHS);
+
+ // Initial check for overflow.
+ EVT CarryType = Node->getValueType(1);
+ EVT SetCCType = getSetCCResultType(Node->getValueType(0));
+ ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
+ SDValue Overflow = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);
+
+ // Add of the sum and the carry.
+ SDValue One = DAG.getConstant(1, dl, VT);
+ SDValue CarryExt =
+ DAG.getNode(ISD::AND, dl, VT, DAG.getZExtOrTrunc(Carry, dl, VT), One);
+ SDValue Sum2 = DAG.getNode(Op, dl, VT, Sum, CarryExt);
+
+ // Second check for overflow. If we are adding, we can only overflow if the
+ // initial sum is all 1s and the carry is set, resulting in a new sum of 0.
+ // If we are subtracting, we can only overflow if the initial sum is 0 and
+ // the carry is set, resulting in a new sum of all 1s.
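+ // A small worked example for an i8 add: LHS = 0xFF, RHS = 0x00, Carry = 1
+ // gives Sum = 0xFF (Overflow is false) and Sum2 = 0x00, so Overflow2 and
+ // hence the result carry are set.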
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue Overflow2 =
+ IsAdd ? DAG.getSetCC(dl, SetCCType, Sum2, Zero, ISD::SETEQ)
+ : DAG.getSetCC(dl, SetCCType, Sum, Zero, ISD::SETEQ);
+ Overflow2 = DAG.getNode(ISD::AND, dl, SetCCType, Overflow2,
+ DAG.getZExtOrTrunc(Carry, dl, SetCCType));
+
+ SDValue ResultCarry =
+ DAG.getNode(ISD::OR, dl, SetCCType, Overflow, Overflow2);
+
+ Results.push_back(Sum2);
+ Results.push_back(DAG.getBoolExtOrTrunc(ResultCarry, dl, CarryType, VT));
+ break;
+ }
+ case ISD::SADDO:
+ case ISD::SSUBO: {
+ SDValue Result, Overflow;
+ TLI.expandSADDSUBO(Node, Result, Overflow, DAG);
+ Results.push_back(Result);
+ Results.push_back(Overflow);
+ break;
+ }
+ case ISD::UADDO:
+ case ISD::USUBO: {
+ SDValue Result, Overflow;
+ TLI.expandUADDSUBO(Node, Result, Overflow, DAG);
+ Results.push_back(Result);
+ Results.push_back(Overflow);
+ break;
+ }
+ case ISD::UMULO:
+ case ISD::SMULO: {
+ SDValue Result, Overflow;
+ if (TLI.expandMULO(Node, Result, Overflow, DAG)) {
+ Results.push_back(Result);
+ Results.push_back(Overflow);
+ }
+ break;
+ }
+ case ISD::BUILD_PAIR: {
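+ // Combine the two half-width operands into one value: zero-extend the low
+ // half, any-extend the high half, shift the high half up by half the pair
+ // width, and OR them. E.g. an i64 pair is (zext Lo) | (anyext Hi << 32).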
+ EVT PairTy = Node->getValueType(0);
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
+ Tmp2 = DAG.getNode(
+ ISD::SHL, dl, PairTy, Tmp2,
+ DAG.getConstant(PairTy.getSizeInBits() / 2, dl,
+ TLI.getShiftAmountTy(PairTy, DAG.getDataLayout())));
+ Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
+ break;
+ }
+ case ISD::SELECT:
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ Tmp3 = Node->getOperand(2);
+ if (Tmp1.getOpcode() == ISD::SETCC) {
+ Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
+ Tmp2, Tmp3,
+ cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
+ } else {
+ Tmp1 = DAG.getSelectCC(dl, Tmp1,
+ DAG.getConstant(0, dl, Tmp1.getValueType()),
+ Tmp2, Tmp3, ISD::SETNE);
+ }
+ Tmp1->setFlags(Node->getFlags());
+ Results.push_back(Tmp1);
+ break;
+ case ISD::BR_JT: {
+ SDValue Chain = Node->getOperand(0);
+ SDValue Table = Node->getOperand(1);
+ SDValue Index = Node->getOperand(2);
+
+ const DataLayout &TD = DAG.getDataLayout();
+ EVT PTy = TLI.getPointerTy(TD);
+
+ unsigned EntrySize =
+ DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
+
+ // For power-of-two jumptable entry sizes, convert the multiplication to a
+ // shift. This transformation needs to be done here since otherwise the MIPS
+ // backend will end up emitting a three-instruction multiply sequence
+ // instead of a single shift, and MSP430 will call a runtime function.
+ if (llvm::isPowerOf2_32(EntrySize))
+ Index = DAG.getNode(
+ ISD::SHL, dl, Index.getValueType(), Index,
+ DAG.getConstant(llvm::Log2_32(EntrySize), dl, Index.getValueType()));
+ else
+ Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
+ DAG.getConstant(EntrySize, dl, Index.getValueType()));
+ SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(),
+ Index, Table);
+
+ EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
+ SDValue LD = DAG.getExtLoad(
+ ISD::SEXTLOAD, dl, PTy, Chain, Addr,
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT);
+ Addr = LD;
+ if (TLI.isJumpTableRelative()) {
+ // For PIC, the sequence is:
+ // BRIND(load(Jumptable + index) + RelocBase)
+ // RelocBase can be JumpTable, GOT or some sort of global base.
+ Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr,
+ TLI.getPICJumpTableRelocBase(Table, DAG));
+ }
+
+ Tmp1 = TLI.expandIndirectJTBranch(dl, LD.getValue(1), Addr, DAG);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::BRCOND:
+ // Expand brcond's setcc into its constituent parts and create a BR_CC
+ // Node.
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ if (Tmp2.getOpcode() == ISD::SETCC &&
+ TLI.isOperationLegalOrCustom(ISD::BR_CC,
+ Tmp2.getOperand(0).getValueType())) {
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1, Tmp2.getOperand(2),
+ Tmp2.getOperand(0), Tmp2.getOperand(1),
+ Node->getOperand(2));
+ } else {
+ // We test only the i1 bit. Skip the AND if UNDEF or another AND.
+ if (Tmp2.isUndef() ||
+ (Tmp2.getOpcode() == ISD::AND && isOneConstant(Tmp2.getOperand(1))))
+ Tmp3 = Tmp2;
+ else
+ Tmp3 = DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getConstant(1, dl, Tmp2.getValueType()));
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
+ DAG.getCondCode(ISD::SETNE), Tmp3,
+ DAG.getConstant(0, dl, Tmp3.getValueType()),
+ Node->getOperand(2));
+ }
+ Results.push_back(Tmp1);
+ break;
+ case ISD::SETCC:
+ case ISD::VP_SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: {
+ bool IsVP = Node->getOpcode() == ISD::VP_SETCC;
+ bool IsStrict = Node->getOpcode() == ISD::STRICT_FSETCC ||
+ Node->getOpcode() == ISD::STRICT_FSETCCS;
+ bool IsSignaling = Node->getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
+ unsigned Offset = IsStrict ? 1 : 0;
+ Tmp1 = Node->getOperand(0 + Offset);
+ Tmp2 = Node->getOperand(1 + Offset);
+ Tmp3 = Node->getOperand(2 + Offset);
+ SDValue Mask, EVL;
+ if (IsVP) {
+ Mask = Node->getOperand(3 + Offset);
+ EVL = Node->getOperand(4 + Offset);
+ }
+ bool Legalized = TLI.LegalizeSetCCCondCode(
+ DAG, Node->getValueType(0), Tmp1, Tmp2, Tmp3, Mask, EVL, NeedInvert, dl,
+ Chain, IsSignaling);
+
+ if (Legalized) {
+ // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+ // condition code, create a new SETCC node.
+ if (Tmp3.getNode()) {
+ if (IsStrict) {
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getVTList(),
+ {Chain, Tmp1, Tmp2, Tmp3}, Node->getFlags());
+ Chain = Tmp1.getValue(1);
+ } else if (IsVP) {
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getValueType(0),
+ {Tmp1, Tmp2, Tmp3, Mask, EVL}, Node->getFlags());
+ } else {
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getValueType(0), Tmp1,
+ Tmp2, Tmp3, Node->getFlags());
+ }
+ }
+
+ // If we expanded the SETCC by inverting the condition code, then wrap
+ // the existing SETCC in a NOT to restore the intended condition.
+ if (NeedInvert) {
+ if (!IsVP)
+ Tmp1 = DAG.getLogicalNOT(dl, Tmp1, Tmp1->getValueType(0));
+ else
+ Tmp1 =
+ DAG.getVPLogicalNOT(dl, Tmp1, Mask, EVL, Tmp1->getValueType(0));
+ }
+
+ Results.push_back(Tmp1);
+ if (IsStrict)
+ Results.push_back(Chain);
+
+ break;
+ }
+
+ // FIXME: It seems Legalized is false iff CCCode is Legal. I don't
+ // understand if this code is useful for strict nodes.
+ assert(!IsStrict && "Don't know how to expand for strict nodes.");
+
+ // Otherwise, SETCC for the given comparison type must be completely
+ // illegal; expand it into a SELECT_CC.
+ // FIXME: This drops the mask/evl for VP_SETCC.
+ EVT VT = Node->getValueType(0);
+ EVT Tmp1VT = Tmp1.getValueType();
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
+ DAG.getBoolConstant(true, dl, VT, Tmp1VT),
+ DAG.getBoolConstant(false, dl, VT, Tmp1VT), Tmp3);
+ Tmp1->setFlags(Node->getFlags());
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SELECT_CC: {
+ // TODO: need to add STRICT_SELECT_CC and STRICT_SELECT_CCS
+ Tmp1 = Node->getOperand(0); // LHS
+ Tmp2 = Node->getOperand(1); // RHS
+ Tmp3 = Node->getOperand(2); // True
+ Tmp4 = Node->getOperand(3); // False
+ EVT VT = Node->getValueType(0);
+ SDValue Chain;
+ SDValue CC = Node->getOperand(4);
+ ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();
+
+ if (TLI.isCondCodeLegalOrCustom(CCOp, Tmp1.getSimpleValueType())) {
+ // If the condition code is legal, then we need to expand this
+ // node using SETCC and SELECT.
+ EVT CmpVT = Tmp1.getValueType();
+ assert(!TLI.isOperationExpand(ISD::SELECT, VT) &&
+ "Cannot expand ISD::SELECT_CC when ISD::SELECT also needs to be "
+ "expanded.");
+ EVT CCVT = getSetCCResultType(CmpVT);
+ SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC, Node->getFlags());
+ Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4));
+ break;
+ }
+
+ // SELECT_CC is legal, so the condition code must not be.
+ bool Legalized = false;
+ // Try to legalize by inverting the condition. This is for targets that
+ // might support an ordered version of a condition, but not the unordered
+ // version (or vice versa).
+ ISD::CondCode InvCC = ISD::getSetCCInverse(CCOp, Tmp1.getValueType());
+ if (TLI.isCondCodeLegalOrCustom(InvCC, Tmp1.getSimpleValueType())) {
+ // Use the new condition code and swap true and false
+ Legalized = true;
+ Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
+ Tmp1->setFlags(Node->getFlags());
+ } else {
+ // If the inverse is not legal, then try to swap the arguments using
+ // the swapped inverse condition code.
+ ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC);
+ if (TLI.isCondCodeLegalOrCustom(SwapInvCC, Tmp1.getSimpleValueType())) {
+ // The swapped inverse condition is legal, so swap true and false,
+ // lhs and rhs.
+ Legalized = true;
+ Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
+ Tmp1->setFlags(Node->getFlags());
+ }
+ }
+
+ if (!Legalized) {
+ Legalized = TLI.LegalizeSetCCCondCode(
+ DAG, getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC,
+ /*Mask*/ SDValue(), /*EVL*/ SDValue(), NeedInvert, dl, Chain);
+
+ assert(Legalized && "Can't legalize SELECT_CC with legal condition!");
+
+ // If we expanded the SETCC by inverting the condition code, then swap
+ // the True/False operands to match.
+ if (NeedInvert)
+ std::swap(Tmp3, Tmp4);
+
+ // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+ // condition code, create a new SELECT_CC node.
+ if (CC.getNode()) {
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
+ Tmp1, Tmp2, Tmp3, Tmp4, CC);
+ } else {
+ Tmp2 = DAG.getConstant(0, dl, Tmp1.getValueType());
+ CC = DAG.getCondCode(ISD::SETNE);
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1,
+ Tmp2, Tmp3, Tmp4, CC);
+ }
+ Tmp1->setFlags(Node->getFlags());
+ }
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::BR_CC: {
+ // TODO: need to add STRICT_BR_CC and STRICT_BR_CCS
+ SDValue Chain;
+ Tmp1 = Node->getOperand(0); // Chain
+ Tmp2 = Node->getOperand(2); // LHS
+ Tmp3 = Node->getOperand(3); // RHS
+ Tmp4 = Node->getOperand(1); // CC
+
+ bool Legalized = TLI.LegalizeSetCCCondCode(
+ DAG, getSetCCResultType(Tmp2.getValueType()), Tmp2, Tmp3, Tmp4,
+ /*Mask*/ SDValue(), /*EVL*/ SDValue(), NeedInvert, dl, Chain);
+ (void)Legalized;
+ assert(Legalized && "Can't legalize BR_CC with legal condition!");
+
+ // If we expanded the SETCC by swapping LHS and RHS, create a new BR_CC
+ // node.
+ if (Tmp4.getNode()) {
+ assert(!NeedInvert && "Don't know how to invert BR_CC!");
+
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1,
+ Tmp4, Tmp2, Tmp3, Node->getOperand(4));
+ } else {
+ Tmp3 = DAG.getConstant(0, dl, Tmp2.getValueType());
+ Tmp4 = DAG.getCondCode(NeedInvert ? ISD::SETEQ : ISD::SETNE);
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4,
+ Tmp2, Tmp3, Node->getOperand(4));
+ }
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::BUILD_VECTOR:
+ Results.push_back(ExpandBUILD_VECTOR(Node));
+ break;
+ case ISD::SPLAT_VECTOR:
+ Results.push_back(ExpandSPLAT_VECTOR(Node));
+ break;
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: {
+ // Scalarize vector SRA/SRL/SHL.
+ EVT VT = Node->getValueType(0);
+ assert(VT.isVector() && "Unable to legalize non-vector shift");
+ assert(TLI.isTypeLegal(VT.getScalarType())&& "Element type must be legal");
+ unsigned NumElem = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 8> Scalars;
+ for (unsigned Idx = 0; Idx < NumElem; Idx++) {
+ SDValue Ex =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(),
+ Node->getOperand(0), DAG.getVectorIdxConstant(Idx, dl));
+ SDValue Sh =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(),
+ Node->getOperand(1), DAG.getVectorIdxConstant(Idx, dl));
+ Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
+ VT.getScalarType(), Ex, Sh));
+ }
+
+ SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars);
+ Results.push_back(Result);
+ break;
+ }
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ Results.push_back(TLI.expandVecReduce(Node, DAG));
+ break;
+ case ISD::GLOBAL_OFFSET_TABLE:
+ case ISD::GlobalAddress:
+ case ISD::GlobalTLSAddress:
+ case ISD::ExternalSymbol:
+ case ISD::ConstantPool:
+ case ISD::JumpTable:
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ // FIXME: Custom lowering for these operations shouldn't return null!
+ // Return true so that we don't call ConvertNodeToLibcall which also won't
+ // do anything.
+ return true;
+ }
+
+ if (!TLI.isStrictFPEnabled() && Results.empty() && Node->isStrictFPOpcode()) {
+ // FIXME: We were asked to expand a strict floating-point operation,
+ // but there is currently no expansion implemented that would preserve
+ // the "strict" properties. For now, we just fall back to the non-strict
+ // version if that is legal on the target. The actual mutation of the
+ // operation will happen in SelectionDAGISel::DoInstructionSelection.
+ switch (Node->getOpcode()) {
+ default:
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0))
+ == TargetLowering::Legal)
+ return true;
+ break;
+ case ISD::STRICT_FSUB: {
+ if (TLI.getStrictFPOperationAction(
+ ISD::STRICT_FSUB, Node->getValueType(0)) == TargetLowering::Legal)
+ return true;
+ if (TLI.getStrictFPOperationAction(
+ ISD::STRICT_FADD, Node->getValueType(0)) != TargetLowering::Legal)
+ break;
+
+ EVT VT = Node->getValueType(0);
+ const SDNodeFlags Flags = Node->getFlags();
+ SDValue Neg = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(2), Flags);
+ SDValue Fadd = DAG.getNode(ISD::STRICT_FADD, dl, Node->getVTList(),
+ {Node->getOperand(0), Node->getOperand(1), Neg},
+ Flags);
+
+ Results.push_back(Fadd);
+ Results.push_back(Fadd.getValue(1));
+ break;
+ }
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT:
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ // These are registered by the operand type instead of the value
+ // type. Reflect that here.
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getOperand(1).getValueType())
+ == TargetLowering::Legal)
+ return true;
+ break;
+ }
+ }
+
+ // Replace the original node with the legalized result.
+ if (Results.empty()) {
+ LLVM_DEBUG(dbgs() << "Cannot expand node\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Successfully expanded node\n");
+ ReplaceNode(Node, Results.data());
+ return true;
+}
+
+void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
+ LLVM_DEBUG(dbgs() << "Trying to convert node to libcall\n");
+ SmallVector<SDValue, 8> Results;
+ SDLoc dl(Node);
+ // FIXME: Check flags on the node to see if we can use a finite call.
+ unsigned Opc = Node->getOpcode();
+ switch (Opc) {
+ case ISD::ATOMIC_FENCE: {
+ // If the target didn't lower this, lower it to a '__sync_synchronize()' call.
+ // FIXME: handle "fence singlethread" more efficiently.
+ TargetLowering::ArgListTy Args;
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Node->getOperand(0))
+ .setLibCallee(
+ CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__sync_synchronize",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+ Results.push_back(CallResult.second);
+ break;
+ }
+ // By default, atomic intrinsics are marked Legal and lowered. Targets
+ // which don't support them directly, however, may want libcalls, in which
+ // case they mark them Expand, and we get here.
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_CMP_SWAP: {
+ MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
+ AtomicOrdering Order = cast<AtomicSDNode>(Node)->getMergedOrdering();
+ RTLIB::Libcall LC = RTLIB::getOUTLINE_ATOMIC(Opc, Order, VT);
+ EVT RetVT = Node->getValueType(0);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ SmallVector<SDValue, 4> Ops;
+ if (TLI.getLibcallName(LC)) {
+ // If an outline atomic libcall is available, prepare its arguments and expand.
+ Ops.append(Node->op_begin() + 2, Node->op_end());
+ Ops.push_back(Node->getOperand(1));
+
+ } else {
+ LC = RTLIB::getSYNC(Opc, VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unexpected atomic op or value type!");
+ // Arguments for expansion to sync libcall
+ Ops.append(Node->op_begin() + 1, Node->op_end());
+ }
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RetVT,
+ Ops, CallOptions,
+ SDLoc(Node),
+ Node->getOperand(0));
+ Results.push_back(Tmp.first);
+ Results.push_back(Tmp.second);
+ break;
+ }
+ case ISD::TRAP: {
+ // If this operation is not supported, lower it to an 'abort()' call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Node->getOperand(0))
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(
+ "abort", TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+ Results.push_back(CallResult.second);
+ break;
+ }
+ case ISD::FMINNUM:
+ case ISD::STRICT_FMINNUM:
+ ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64,
+ RTLIB::FMIN_F80, RTLIB::FMIN_F128,
+ RTLIB::FMIN_PPCF128, Results);
+ break;
+ // FIXME: We do not have libcalls for FMAXIMUM and FMINIMUM. So, we cannot use
+ // libcall legalization for these nodes, but there is no default expansion for
+ // these nodes either (see PR63267 for example).
+ case ISD::FMAXNUM:
+ case ISD::STRICT_FMAXNUM:
+ ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64,
+ RTLIB::FMAX_F80, RTLIB::FMAX_F128,
+ RTLIB::FMAX_PPCF128, Results);
+ break;
+ case ISD::FSQRT:
+ case ISD::STRICT_FSQRT:
+ ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80, RTLIB::SQRT_F128,
+ RTLIB::SQRT_PPCF128, Results);
+ break;
+ case ISD::FCBRT:
+ ExpandFPLibCall(Node, RTLIB::CBRT_F32, RTLIB::CBRT_F64,
+ RTLIB::CBRT_F80, RTLIB::CBRT_F128,
+ RTLIB::CBRT_PPCF128, Results);
+ break;
+ case ISD::FSIN:
+ case ISD::STRICT_FSIN:
+ ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64,
+ RTLIB::SIN_F80, RTLIB::SIN_F128,
+ RTLIB::SIN_PPCF128, Results);
+ break;
+ case ISD::FCOS:
+ case ISD::STRICT_FCOS:
+ ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64,
+ RTLIB::COS_F80, RTLIB::COS_F128,
+ RTLIB::COS_PPCF128, Results);
+ break;
+ case ISD::FSINCOS:
+ // Expand into sincos libcall.
+ ExpandSinCosLibCall(Node, Results);
+ break;
+ case ISD::FLOG:
+ case ISD::STRICT_FLOG:
+ ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80,
+ RTLIB::LOG_F128, RTLIB::LOG_PPCF128, Results);
+ break;
+ case ISD::FLOG2:
+ case ISD::STRICT_FLOG2:
+ ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80,
+ RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128, Results);
+ break;
+ case ISD::FLOG10:
+ case ISD::STRICT_FLOG10:
+ ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80,
+ RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128, Results);
+ break;
+ case ISD::FEXP:
+ case ISD::STRICT_FEXP:
+ ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80,
+ RTLIB::EXP_F128, RTLIB::EXP_PPCF128, Results);
+ break;
+ case ISD::FEXP2:
+ case ISD::STRICT_FEXP2:
+ ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80,
+ RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128, Results);
+ break;
+ case ISD::FTRUNC:
+ case ISD::STRICT_FTRUNC:
+ ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80, RTLIB::TRUNC_F128,
+ RTLIB::TRUNC_PPCF128, Results);
+ break;
+ case ISD::FFLOOR:
+ case ISD::STRICT_FFLOOR:
+ ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+ RTLIB::FLOOR_F80, RTLIB::FLOOR_F128,
+ RTLIB::FLOOR_PPCF128, Results);
+ break;
+ case ISD::FCEIL:
+ case ISD::STRICT_FCEIL:
+ ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80, RTLIB::CEIL_F128,
+ RTLIB::CEIL_PPCF128, Results);
+ break;
+ case ISD::FRINT:
+ case ISD::STRICT_FRINT:
+ ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64,
+ RTLIB::RINT_F80, RTLIB::RINT_F128,
+ RTLIB::RINT_PPCF128, Results);
+ break;
+ case ISD::FNEARBYINT:
+ case ISD::STRICT_FNEARBYINT:
+ ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_F128,
+ RTLIB::NEARBYINT_PPCF128, Results);
+ break;
+ case ISD::FROUND:
+ case ISD::STRICT_FROUND:
+ ExpandFPLibCall(Node, RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128, Results);
+ break;
+ case ISD::FROUNDEVEN:
+ case ISD::STRICT_FROUNDEVEN:
+ ExpandFPLibCall(Node, RTLIB::ROUNDEVEN_F32,
+ RTLIB::ROUNDEVEN_F64,
+ RTLIB::ROUNDEVEN_F80,
+ RTLIB::ROUNDEVEN_F128,
+ RTLIB::ROUNDEVEN_PPCF128, Results);
+ break;
+ case ISD::FLDEXP:
+ case ISD::STRICT_FLDEXP:
+ ExpandFPLibCall(Node, RTLIB::LDEXP_F32, RTLIB::LDEXP_F64, RTLIB::LDEXP_F80,
+ RTLIB::LDEXP_F128, RTLIB::LDEXP_PPCF128, Results);
+ break;
+ case ISD::FFREXP: {
+ ExpandFrexpLibCall(Node, Results);
+ break;
+ }
+ case ISD::FPOWI:
+ case ISD::STRICT_FPOWI: {
+ RTLIB::Libcall LC = RTLIB::getPOWI(Node->getSimpleValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fpowi.");
+ if (!TLI.getLibcallName(LC)) {
+ // Some targets don't have a powi libcall; use pow instead.
+ if (Node->isStrictFPOpcode()) {
+ SDValue Exponent =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(Node),
+ {Node->getValueType(0), Node->getValueType(1)},
+ {Node->getOperand(0), Node->getOperand(2)});
+ SDValue FPOW =
+ DAG.getNode(ISD::STRICT_FPOW, SDLoc(Node),
+ {Node->getValueType(0), Node->getValueType(1)},
+ {Exponent.getValue(1), Node->getOperand(1), Exponent});
+ Results.push_back(FPOW);
+ Results.push_back(FPOW.getValue(1));
+ } else {
+ SDValue Exponent =
+ DAG.getNode(ISD::SINT_TO_FP, SDLoc(Node), Node->getValueType(0),
+ Node->getOperand(1));
+ Results.push_back(DAG.getNode(ISD::FPOW, SDLoc(Node),
+ Node->getValueType(0),
+ Node->getOperand(0), Exponent));
+ }
+ break;
+ }
+ unsigned Offset = Node->isStrictFPOpcode() ? 1 : 0;
+ bool ExponentHasSizeOfInt =
+ DAG.getLibInfo().getIntSize() ==
+ Node->getOperand(1 + Offset).getValueType().getSizeInBits();
+ if (!ExponentHasSizeOfInt) {
+ // If the exponent does not match sizeof(int), a libcall to
+ // RTLIB::POWI would use the wrong type for the argument.
+ DAG.getContext()->emitError("POWI exponent does not match sizeof(int)");
+ Results.push_back(DAG.getUNDEF(Node->getValueType(0)));
+ break;
+ }
+ ExpandFPLibCall(Node, LC, Results);
+ break;
+ }
+ case ISD::FPOW:
+ case ISD::STRICT_FPOW:
+ ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80,
+ RTLIB::POW_F128, RTLIB::POW_PPCF128, Results);
+ break;
+ case ISD::LROUND:
+ case ISD::STRICT_LROUND:
+ ExpandArgFPLibCall(Node, RTLIB::LROUND_F32,
+ RTLIB::LROUND_F64, RTLIB::LROUND_F80,
+ RTLIB::LROUND_F128,
+ RTLIB::LROUND_PPCF128, Results);
+ break;
+ case ISD::LLROUND:
+ case ISD::STRICT_LLROUND:
+ ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32,
+ RTLIB::LLROUND_F64, RTLIB::LLROUND_F80,
+ RTLIB::LLROUND_F128,
+ RTLIB::LLROUND_PPCF128, Results);
+ break;
+ case ISD::LRINT:
+ case ISD::STRICT_LRINT:
+ ExpandArgFPLibCall(Node, RTLIB::LRINT_F32,
+ RTLIB::LRINT_F64, RTLIB::LRINT_F80,
+ RTLIB::LRINT_F128,
+ RTLIB::LRINT_PPCF128, Results);
+ break;
+ case ISD::LLRINT:
+ case ISD::STRICT_LLRINT:
+ ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32,
+ RTLIB::LLRINT_F64, RTLIB::LLRINT_F80,
+ RTLIB::LLRINT_F128,
+ RTLIB::LLRINT_PPCF128, Results);
+ break;
+ case ISD::FDIV:
+ case ISD::STRICT_FDIV:
+ ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
+ RTLIB::DIV_F80, RTLIB::DIV_F128,
+ RTLIB::DIV_PPCF128, Results);
+ break;
+ case ISD::FREM:
+ case ISD::STRICT_FREM:
+ ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64,
+ RTLIB::REM_F80, RTLIB::REM_F128,
+ RTLIB::REM_PPCF128, Results);
+ break;
+ case ISD::FMA:
+ case ISD::STRICT_FMA:
+ ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64,
+ RTLIB::FMA_F80, RTLIB::FMA_F128,
+ RTLIB::FMA_PPCF128, Results);
+ break;
+ case ISD::FADD:
+ case ISD::STRICT_FADD:
+ ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
+ RTLIB::ADD_F80, RTLIB::ADD_F128,
+ RTLIB::ADD_PPCF128, Results);
+ break;
+ case ISD::FMUL:
+ case ISD::STRICT_FMUL:
+ ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64,
+ RTLIB::MUL_F80, RTLIB::MUL_F128,
+ RTLIB::MUL_PPCF128, Results);
+ break;
+ case ISD::FP16_TO_FP:
+ if (Node->getValueType(0) == MVT::f32) {
+ Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false).first);
+ }
+ break;
+ case ISD::STRICT_FP16_TO_FP: {
+ if (Node->getValueType(0) == MVT::f32) {
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(
+ DAG, RTLIB::FPEXT_F16_F32, MVT::f32, Node->getOperand(1), CallOptions,
+ SDLoc(Node), Node->getOperand(0));
+ Results.push_back(Tmp.first);
+ Results.push_back(Tmp.second);
+ }
+ break;
+ }
+ case ISD::FP_TO_FP16: {
+ RTLIB::Libcall LC =
+ RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16");
+ Results.push_back(ExpandLibCall(LC, Node, false).first);
+ break;
+ }
+ case ISD::FP_TO_BF16: {
+ RTLIB::Libcall LC =
+ RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::bf16);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_bf16");
+ Results.push_back(ExpandLibCall(LC, Node, false).first);
+ break;
+ }
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: {
+ // TODO - Common the code with DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP
+ bool IsStrict = Node->isStrictFPOpcode();
+ bool Signed = Node->getOpcode() == ISD::SINT_TO_FP ||
+ Node->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ EVT SVT = Node->getOperand(IsStrict ? 1 : 0).getValueType();
+ EVT RVT = Node->getValueType(0);
+ EVT NVT = EVT();
+ SDLoc dl(Node);
+
+ // Even if the input is legal, no libcall may exactly match; e.g. we don't
+ // have i1 -> fp conversions. So it needs to be promoted to a larger type,
+ // e.g. i13 -> fp. Then, look for an appropriate libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE;
+ t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL;
+ ++t) {
+ NVT = (MVT::SimpleValueType)t;
+ // The source type needs to be big enough to hold the operand.
+ if (NVT.bitsGE(SVT))
+ LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT)
+ : RTLIB::getUINTTOFP(NVT, RVT);
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall");
+
+ SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
+ // Sign/zero extend the argument if the libcall takes a larger type.
+ SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ NVT, Node->getOperand(IsStrict ? 1 : 0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(Signed);
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, dl, Chain);
+ Results.push_back(Tmp.first);
+ if (IsStrict)
+ Results.push_back(Tmp.second);
+ break;
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT: {
+ // TODO - Common the code with DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT.
+ bool IsStrict = Node->isStrictFPOpcode();
+ bool Signed = Node->getOpcode() == ISD::FP_TO_SINT ||
+ Node->getOpcode() == ISD::STRICT_FP_TO_SINT;
+
+ SDValue Op = Node->getOperand(IsStrict ? 1 : 0);
+ EVT SVT = Op.getValueType();
+ EVT RVT = Node->getValueType(0);
+ EVT NVT = EVT();
+ SDLoc dl(Node);
+
+ // Even if the result is legal, no libcall may exactly match; e.g. we don't
+ // have fp -> i1 conversions. So the result needs to be promoted to a larger
+ // type, e.g. fp -> i32. Then, look for an appropriate libcall.
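+ // Illustrative example (not part of the original comment): an f32 -> i1
+ // conversion settles on NVT = i32 below, calls the f32 -> i32 libcall, and
+ // truncates the i32 result back to i1 afterwards.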
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
+ IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL;
+ ++IntVT) {
+ NVT = (MVT::SimpleValueType)IntVT;
+ // The type needs to be big enough to hold the result.
+ if (NVT.bitsGE(RVT))
+ LC = Signed ? RTLIB::getFPTOSINT(SVT, NVT)
+ : RTLIB::getFPTOUINT(SVT, NVT);
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall");
+
+ SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl, Chain);
+
+ // Truncate the result if the libcall returns a larger type.
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, RVT, Tmp.first));
+ if (IsStrict)
+ Results.push_back(Tmp.second);
+ break;
+ }
+
+ case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND: {
+ // X = FP_ROUND(Y, TRUNC)
+ // TRUNC is a flag, which is always an integer that is zero or one.
+ // If TRUNC is 0, this is a normal rounding; if it is 1, this FP_ROUND
+ // is known not to change the value of Y.
+ // We can only expand it into a libcall if TRUNC is 0.
+ bool IsStrict = Node->isStrictFPOpcode();
+ SDValue Op = Node->getOperand(IsStrict ? 1 : 0);
+ SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
+ EVT VT = Node->getValueType(0);
+ assert(cast<ConstantSDNode>(Node->getOperand(IsStrict ? 2 : 1))->isZero() &&
+ "Unable to expand as libcall if it is not normal rounding");
+
+ RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall");
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, SDLoc(Node), Chain);
+ Results.push_back(Tmp.first);
+ if (IsStrict)
+ Results.push_back(Tmp.second);
+ break;
+ }
+ case ISD::FP_EXTEND: {
+ Results.push_back(
+ ExpandLibCall(RTLIB::getFPEXT(Node->getOperand(0).getValueType(),
+ Node->getValueType(0)),
+ Node, false).first);
+ break;
+ }
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::STRICT_FP_TO_FP16: {
+ RTLIB::Libcall LC =
+ Node->getOpcode() == ISD::STRICT_FP_TO_FP16
+ ? RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16)
+ : RTLIB::getFPEXT(Node->getOperand(1).getValueType(),
+ Node->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall");
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, Node->getValueType(0), Node->getOperand(1),
+ CallOptions, SDLoc(Node), Node->getOperand(0));
+ Results.push_back(Tmp.first);
+ Results.push_back(Tmp.second);
+ break;
+ }
+ case ISD::FSUB:
+ case ISD::STRICT_FSUB:
+ ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64,
+ RTLIB::SUB_F80, RTLIB::SUB_F128,
+ RTLIB::SUB_PPCF128, Results);
+ break;
+ case ISD::SREM:
+ Results.push_back(ExpandIntLibCall(Node, true,
+ RTLIB::SREM_I8,
+ RTLIB::SREM_I16, RTLIB::SREM_I32,
+ RTLIB::SREM_I64, RTLIB::SREM_I128));
+ break;
+ case ISD::UREM:
+ Results.push_back(ExpandIntLibCall(Node, false,
+ RTLIB::UREM_I8,
+ RTLIB::UREM_I16, RTLIB::UREM_I32,
+ RTLIB::UREM_I64, RTLIB::UREM_I128));
+ break;
+ case ISD::SDIV:
+ Results.push_back(ExpandIntLibCall(Node, true,
+ RTLIB::SDIV_I8,
+ RTLIB::SDIV_I16, RTLIB::SDIV_I32,
+ RTLIB::SDIV_I64, RTLIB::SDIV_I128));
+ break;
+ case ISD::UDIV:
+ Results.push_back(ExpandIntLibCall(Node, false,
+ RTLIB::UDIV_I8,
+ RTLIB::UDIV_I16, RTLIB::UDIV_I32,
+ RTLIB::UDIV_I64, RTLIB::UDIV_I128));
+ break;
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ // Expand into divrem libcall
+ ExpandDivRemLibCall(Node, Results);
+ break;
+ case ISD::MUL:
+ Results.push_back(ExpandIntLibCall(Node, false,
+ RTLIB::MUL_I8,
+ RTLIB::MUL_I16, RTLIB::MUL_I32,
+ RTLIB::MUL_I64, RTLIB::MUL_I128));
+ break;
+ case ISD::CTLZ_ZERO_UNDEF:
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("LibCall explicitly requested, but not available");
+ case MVT::i32:
+ Results.push_back(ExpandLibCall(RTLIB::CTLZ_I32, Node, false).first);
+ break;
+ case MVT::i64:
+ Results.push_back(ExpandLibCall(RTLIB::CTLZ_I64, Node, false).first);
+ break;
+ case MVT::i128:
+ Results.push_back(ExpandLibCall(RTLIB::CTLZ_I128, Node, false).first);
+ break;
+ }
+ break;
+ case ISD::RESET_FPENV: {
+ // It is legalized to a call to 'fesetenv(FE_DFL_ENV)'. In glibc,
+ // FE_DFL_ENV is defined as '((const fenv_t *) -1)' on most targets.
+ SDValue Ptr = DAG.getIntPtrConstant(-1LL, dl);
+ SDValue Chain = Node->getOperand(0);
+ Results.push_back(
+ DAG.makeStateFunctionCall(RTLIB::FESETENV, Ptr, Chain, dl));
+ break;
+ }
+ case ISD::GET_FPENV_MEM: {
+ SDValue Chain = Node->getOperand(0);
+ SDValue EnvPtr = Node->getOperand(1);
+ Results.push_back(
+ DAG.makeStateFunctionCall(RTLIB::FEGETENV, EnvPtr, Chain, dl));
+ break;
+ }
+ case ISD::SET_FPENV_MEM: {
+ SDValue Chain = Node->getOperand(0);
+ SDValue EnvPtr = Node->getOperand(1);
+ Results.push_back(
+ DAG.makeStateFunctionCall(RTLIB::FESETENV, EnvPtr, Chain, dl));
+ break;
+ }
+ }
+
+ // Replace the original node with the legalized result.
+ if (!Results.empty()) {
+ LLVM_DEBUG(dbgs() << "Successfully converted node to libcall\n");
+ ReplaceNode(Node, Results.data());
+ } else
+ LLVM_DEBUG(dbgs() << "Could not convert node to libcall\n");
+}
+
+// Determine the vector type to use in place of an original scalar element when
+// promoting equally sized vectors.
+static MVT getPromotedVectorElementType(const TargetLowering &TLI,
+ MVT EltVT, MVT NewEltVT) {
+ unsigned OldEltsPerNewElt = EltVT.getSizeInBits() / NewEltVT.getSizeInBits();
+ MVT MidVT = MVT::getVectorVT(NewEltVT, OldEltsPerNewElt);
+ assert(TLI.isTypeLegal(MidVT) && "unexpected");
+ return MidVT;
+}
+
+void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
+ LLVM_DEBUG(dbgs() << "Trying to promote node\n");
+ SmallVector<SDValue, 8> Results;
+ MVT OVT = Node->getSimpleValueType(0);
+ if (Node->getOpcode() == ISD::UINT_TO_FP ||
+ Node->getOpcode() == ISD::SINT_TO_FP ||
+ Node->getOpcode() == ISD::SETCC ||
+ Node->getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Node->getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ OVT = Node->getOperand(0).getSimpleValueType();
+ }
+ if (Node->getOpcode() == ISD::STRICT_UINT_TO_FP ||
+ Node->getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Node->getOpcode() == ISD::STRICT_FSETCC ||
+ Node->getOpcode() == ISD::STRICT_FSETCCS)
+ OVT = Node->getOperand(1).getSimpleValueType();
+ if (Node->getOpcode() == ISD::BR_CC ||
+ Node->getOpcode() == ISD::SELECT_CC)
+ OVT = Node->getOperand(2).getSimpleValueType();
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
+ SDLoc dl(Node);
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+ switch (Node->getOpcode()) {
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ // Zero extend the argument unless it's cttz, in which case use any_extend.
+ if (Node->getOpcode() == ISD::CTTZ ||
+ Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
+ Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
+ else
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+
+ if (Node->getOpcode() == ISD::CTTZ) {
+ // The count is the same in the promoted type except if the original
+ // value was zero. This can be handled by setting the bit just off
+ // the top of the original type.
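+ // Illustration: when promoting an i16 cttz to i32, the bit at position 16
+ // is set, so a zero input still produces a count of 16, matching the
+ // semantics of cttz on the original i16 type.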
+ auto TopBit = APInt::getOneBitSet(NVT.getSizeInBits(),
+ OVT.getSizeInBits());
+ Tmp1 = DAG.getNode(ISD::OR, dl, NVT, Tmp1,
+ DAG.getConstant(TopBit, dl, NVT));
+ }
+ // Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
+ // already the correct result.
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
+ if (Node->getOpcode() == ISD::CTLZ ||
+ Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
+ // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
+ Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
+ DAG.getConstant(NVT.getSizeInBits() -
+ OVT.getSizeInBits(), dl, NVT));
+ }
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+ break;
+ case ISD::BITREVERSE:
+ case ISD::BSWAP: {
+ unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
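+ // After the wider bswap/bitreverse, the original bits sit in the high part
+ // of the promoted value, so shift them back down by the size difference
+ // before truncating (e.g. an i16 bswap performed in i32 needs a shift by 16).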
+ Tmp1 = DAG.getNode(
+ ISD::SRL, dl, NVT, Tmp1,
+ DAG.getConstant(DiffBits, dl,
+ TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+ break;
+ }
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ case ISD::STRICT_FP_TO_SINT:
+ PromoteLegalFP_TO_INT(Node, dl, Results);
+ break;
+ case ISD::FP_TO_UINT_SAT:
+ case ISD::FP_TO_SINT_SAT:
+ Results.push_back(PromoteLegalFP_TO_INT_SAT(Node, dl));
+ break;
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ PromoteLegalINT_TO_FP(Node, dl, Results);
+ break;
+ case ISD::VAARG: {
+ SDValue Chain = Node->getOperand(0); // Get the chain.
+ SDValue Ptr = Node->getOperand(1); // Get the pointer.
+
+ unsigned TruncOp;
+ if (OVT.isVector()) {
+ TruncOp = ISD::BITCAST;
+ } else {
+ assert(OVT.isInteger()
+ && "VAARG promotion is supported only for vectors or integer types");
+ TruncOp = ISD::TRUNCATE;
+ }
+
+ // Perform the larger operation, then convert back
+ Tmp1 = DAG.getVAArg(NVT, dl, Chain, Ptr, Node->getOperand(2),
+ Node->getConstantOperandVal(3));
+ Chain = Tmp1.getValue(1);
+
+ Tmp2 = DAG.getNode(TruncOp, dl, OVT, Tmp1);
+
+ // Modified the chain result - switch anything that used the old chain to
+ // use the new one.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+ if (UpdatedNodes) {
+ UpdatedNodes->insert(Tmp2.getNode());
+ UpdatedNodes->insert(Chain.getNode());
+ }
+ ReplacedNode(Node);
+ break;
+ }
+ case ISD::MUL:
+ case ISD::SDIV:
+ case ISD::SREM:
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ unsigned ExtOp, TruncOp;
+ if (OVT.isVector()) {
+ ExtOp = ISD::BITCAST;
+ TruncOp = ISD::BITCAST;
+ } else {
+ assert(OVT.isInteger() && "Cannot promote logic operation");
+
+ switch (Node->getOpcode()) {
+ default:
+ ExtOp = ISD::ANY_EXTEND;
+ break;
+ case ISD::SDIV:
+ case ISD::SREM:
+ ExtOp = ISD::SIGN_EXTEND;
+ break;
+ case ISD::UDIV:
+ case ISD::UREM:
+ ExtOp = ISD::ZERO_EXTEND;
+ break;
+ }
+ TruncOp = ISD::TRUNCATE;
+ }
+ // Promote each of the values to the new type.
+ Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ // Perform the larger operation, then convert back
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
+ Results.push_back(DAG.getNode(TruncOp, dl, OVT, Tmp1));
+ break;
+ }
+ case ISD::UMUL_LOHI:
+ case ISD::SMUL_LOHI: {
+ // Promote to a multiply in a wider integer type.
+ unsigned ExtOp = Node->getOpcode() == ISD::UMUL_LOHI ? ISD::ZERO_EXTEND
+ : ISD::SIGN_EXTEND;
+ Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ Tmp1 = DAG.getNode(ISD::MUL, dl, NVT, Tmp1, Tmp2);
+
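+ // The low half of the result is simply the truncated product; the high half
+ // is the product shifted right by the original bit width, then truncated.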
+ auto &DL = DAG.getDataLayout();
+ unsigned OriginalSize = OVT.getScalarSizeInBits();
+ Tmp2 = DAG.getNode(
+ ISD::SRL, dl, NVT, Tmp1,
+ DAG.getConstant(OriginalSize, dl, TLI.getScalarShiftAmountTy(DL, NVT)));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2));
+ break;
+ }
+ case ISD::SELECT: {
+ unsigned ExtOp, TruncOp;
+ if (Node->getValueType(0).isVector() ||
+ Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) {
+ ExtOp = ISD::BITCAST;
+ TruncOp = ISD::BITCAST;
+ } else if (Node->getValueType(0).isInteger()) {
+ ExtOp = ISD::ANY_EXTEND;
+ TruncOp = ISD::TRUNCATE;
+ } else {
+ ExtOp = ISD::FP_EXTEND;
+ TruncOp = ISD::FP_ROUND;
+ }
+ Tmp1 = Node->getOperand(0);
+ // Promote each of the values to the new type.
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
+ // Perform the larger operation, then round down.
+ Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3);
+ Tmp1->setFlags(Node->getFlags());
+ if (TruncOp != ISD::FP_ROUND)
+ Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
+ else
+ Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();
+
+ // Cast the two input vectors.
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(1));
+
+ // Convert the shuffle mask to the right # elements.
+ Tmp1 = ShuffleWithNarrowerEltType(NVT, OVT, dl, Tmp1, Tmp2, Mask);
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, OVT, Tmp1);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::VECTOR_SPLICE: {
+ Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(1));
+ Tmp3 = DAG.getNode(ISD::VECTOR_SPLICE, dl, NVT, Tmp1, Tmp2,
+ Node->getOperand(2));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp3));
+ break;
+ }
+ case ISD::SELECT_CC: {
+ SDValue Cond = Node->getOperand(4);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Cond)->get();
+ // Type of the comparison operands.
+ MVT CVT = Node->getSimpleValueType(0);
+ assert(CVT == OVT && "not handled");
+
+ unsigned ExtOp = ISD::FP_EXTEND;
+ if (NVT.isInteger()) {
+ ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ }
+
+ // Promote the comparison operands, if needed.
+ if (TLI.isCondCodeLegal(CCCode, CVT)) {
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ } else {
+ Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ }
+ // Cast the true/false operands.
+ Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
+ Tmp4 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(3));
+
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, NVT, {Tmp1, Tmp2, Tmp3, Tmp4, Cond},
+ Node->getFlags());
+
+ // Cast the result back to the original type.
+ if (ExtOp != ISD::FP_EXTEND)
+ Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1);
+ else
+ Tmp1 = DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp1,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: {
+ unsigned ExtOp = ISD::FP_EXTEND;
+ if (NVT.isInteger()) {
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(2))->get();
+ ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ }
+ if (Node->isStrictFPOpcode()) {
+ SDValue InChain = Node->getOperand(0);
+ std::tie(Tmp1, std::ignore) =
+ DAG.getStrictFPExtendOrRound(Node->getOperand(1), InChain, dl, NVT);
+ std::tie(Tmp2, std::ignore) =
+ DAG.getStrictFPExtendOrRound(Node->getOperand(2), InChain, dl, NVT);
+ SmallVector<SDValue, 2> TmpChains = {Tmp1.getValue(1), Tmp2.getValue(1)};
+ SDValue OutChain = DAG.getTokenFactor(dl, TmpChains);
+ SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
+ Results.push_back(DAG.getNode(Node->getOpcode(), dl, VTs,
+ {OutChain, Tmp1, Tmp2, Node->getOperand(3)},
+ Node->getFlags()));
+ Results.push_back(Results.back().getValue(1));
+ break;
+ }
+ Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ Results.push_back(DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), Tmp1,
+ Tmp2, Node->getOperand(2), Node->getFlags()));
+ break;
+ }
+ case ISD::BR_CC: {
+ unsigned ExtOp = ISD::FP_EXTEND;
+ if (NVT.isInteger()) {
+ ISD::CondCode CCCode =
+ cast<CondCodeSDNode>(Node->getOperand(1))->get();
+ ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ }
+ Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(3));
+ Results.push_back(DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0),
+ Node->getOperand(0), Node->getOperand(1),
+ Tmp1, Tmp2, Node->getOperand(4)));
+ break;
+ }
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ case ISD::FPOW:
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
+ Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2,
+ Node->getFlags());
+ Results.push_back(
+ DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));
+ break;
+ case ISD::STRICT_FADD:
+ case ISD::STRICT_FSUB:
+ case ISD::STRICT_FMUL:
+ case ISD::STRICT_FDIV:
+ case ISD::STRICT_FMINNUM:
+ case ISD::STRICT_FMAXNUM:
+ case ISD::STRICT_FREM:
+ case ISD::STRICT_FPOW:
+ Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(1)});
+ Tmp2 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(2)});
+ Tmp3 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Tmp1.getValue(1),
+ Tmp2.getValue(1));
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, {NVT, MVT::Other},
+ {Tmp3, Tmp1, Tmp2});
+ Tmp1 = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {OVT, MVT::Other},
+ {Tmp1.getValue(1), Tmp1, DAG.getIntPtrConstant(0, dl)});
+ Results.push_back(Tmp1);
+ Results.push_back(Tmp1.getValue(1));
+ break;
+ case ISD::FMA:
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
+ Tmp3 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(2));
+ Results.push_back(
+ DAG.getNode(ISD::FP_ROUND, dl, OVT,
+ DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3),
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));
+ break;
+ case ISD::STRICT_FMA:
+ Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(1)});
+ Tmp2 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(2)});
+ Tmp3 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(3)});
+ Tmp4 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Tmp1.getValue(1),
+ Tmp2.getValue(1), Tmp3.getValue(1));
+ Tmp4 = DAG.getNode(Node->getOpcode(), dl, {NVT, MVT::Other},
+ {Tmp4, Tmp1, Tmp2, Tmp3});
+ Tmp4 = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {OVT, MVT::Other},
+ {Tmp4.getValue(1), Tmp4, DAG.getIntPtrConstant(0, dl)});
+ Results.push_back(Tmp4);
+ Results.push_back(Tmp4.getValue(1));
+ break;
+ case ISD::FCOPYSIGN:
+ case ISD::FLDEXP:
+ case ISD::FPOWI: {
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = Node->getOperand(1);
+ Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
+
+ // fcopysign doesn't change anything but the sign bit, so
+ // (fp_round (fcopysign (fpext a), b))
+ // is as precise as
+ // (fp_round (fpext a))
+ // which is a no-op. Mark it as a TRUNCating FP_ROUND.
+ const bool isTrunc = (Node->getOpcode() == ISD::FCOPYSIGN);
+ Results.push_back(
+ DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp3,
+ DAG.getIntPtrConstant(isTrunc, dl, /*isTarget=*/true)));
+ break;
+ }
+ case ISD::STRICT_FPOWI:
+ Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(1)});
+ Tmp2 = DAG.getNode(Node->getOpcode(), dl, {NVT, MVT::Other},
+ {Tmp1.getValue(1), Tmp1, Node->getOperand(2)});
+ Tmp3 = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {OVT, MVT::Other},
+ {Tmp2.getValue(1), Tmp2, DAG.getIntPtrConstant(0, dl)});
+ Results.push_back(Tmp3);
+ Results.push_back(Tmp3.getValue(1));
+ break;
+ case ISD::FFREXP: {
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::FFREXP, dl, {NVT, Node->getValueType(1)}, Tmp1);
+
+ Results.push_back(
+ DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp2,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));
+
+ Results.push_back(Tmp2.getValue(1));
+ break;
+ }
+ case ISD::FFLOOR:
+ case ISD::FCEIL:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FTRUNC:
+ case ISD::FNEG:
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FABS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
+ Results.push_back(
+ DAG.getNode(ISD::FP_ROUND, dl, OVT, Tmp2,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));
+ break;
+ case ISD::STRICT_FFLOOR:
+ case ISD::STRICT_FCEIL:
+ case ISD::STRICT_FRINT:
+ case ISD::STRICT_FNEARBYINT:
+ case ISD::STRICT_FROUND:
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::STRICT_FTRUNC:
+ case ISD::STRICT_FSQRT:
+ case ISD::STRICT_FSIN:
+ case ISD::STRICT_FCOS:
+ case ISD::STRICT_FLOG:
+ case ISD::STRICT_FLOG2:
+ case ISD::STRICT_FLOG10:
+ case ISD::STRICT_FEXP:
+ case ISD::STRICT_FEXP2:
+ Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(1)});
+ Tmp2 = DAG.getNode(Node->getOpcode(), dl, {NVT, MVT::Other},
+ {Tmp1.getValue(1), Tmp1});
+ Tmp3 = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {OVT, MVT::Other},
+ {Tmp2.getValue(1), Tmp2, DAG.getIntPtrConstant(0, dl)});
+ Results.push_back(Tmp3);
+ Results.push_back(Tmp3.getValue(1));
+ break;
+ case ISD::BUILD_VECTOR: {
+ MVT EltVT = OVT.getVectorElementType();
+ MVT NewEltVT = NVT.getVectorElementType();
+
+ // Handle bitcasts to a different vector type with the same total bit size
+ //
+ // e.g. v2i64 = build_vector i64:x, i64:y => v4i32
+ // =>
+ // v4i32 = concat_vectors (v2i32 (bitcast i64:x)), (v2i32 (bitcast i64:y))
+
+ assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
+ "Invalid promote type for build_vector");
+ assert(NewEltVT.bitsLT(EltVT) && "not handled");
+
+ MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
+
+ SmallVector<SDValue, 8> NewOps;
+ for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) {
+ SDValue Op = Node->getOperand(I);
+ NewOps.push_back(DAG.getNode(ISD::BITCAST, SDLoc(Op), MidVT, Op));
+ }
+
+ SDLoc SL(Node);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps);
+ SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
+ Results.push_back(CvtVec);
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ MVT EltVT = OVT.getVectorElementType();
+ MVT NewEltVT = NVT.getVectorElementType();
+
+ // Handle bitcasts to a different vector type with the same total bit size.
+ //
+ // e.g. v2i64 = extract_vector_elt x:v2i64, y:i32
+ // =>
+ // v4i32:castx = bitcast x:v2i64
+ //
+ // i64 = bitcast
+ // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
+ // (i32 (extract_vector_elt castx, (2 * y + 1)))
+ //
+
+ assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
+ "Invalid promote type for extract_vector_elt");
+ assert(NewEltVT.bitsLT(EltVT) && "not handled");
+
+ MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
+ unsigned NewEltsPerOldElt = MidVT.getVectorNumElements();
+
+ SDValue Idx = Node->getOperand(1);
+ EVT IdxVT = Idx.getValueType();
+ SDLoc SL(Node);
+ SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SL, IdxVT);
+ SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor);
+
+ SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0));
+
+ SmallVector<SDValue, 8> NewOps;
+ for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
+ SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT);
+ SDValue TmpIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset);
+
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT,
+ CastVec, TmpIdx);
+ NewOps.push_back(Elt);
+ }
+
+ SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec));
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ MVT EltVT = OVT.getVectorElementType();
+ MVT NewEltVT = NVT.getVectorElementType();
+
+ // Handle bitcasts to a different vector type with the same total bit size
+ //
+ // e.g. v2i64 = insert_vector_elt x:v2i64, y:i64, z:i32
+ // =>
+ // v4i32:castx = bitcast x:v2i64
+ // v2i32:casty = bitcast y:i64
+ //
+ // v2i64 = bitcast
+ // (v4i32 insert_vector_elt
+ // (v4i32 insert_vector_elt v4i32:castx,
+ // (extract_vector_elt casty, 0), 2 * z),
+ // (extract_vector_elt casty, 1), (2 * z + 1))
+
+ assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
+ "Invalid promote type for insert_vector_elt");
+ assert(NewEltVT.bitsLT(EltVT) && "not handled");
+
+ MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
+ unsigned NewEltsPerOldElt = MidVT.getVectorNumElements();
+
+ SDValue Val = Node->getOperand(1);
+ SDValue Idx = Node->getOperand(2);
+ EVT IdxVT = Idx.getValueType();
+ SDLoc SL(Node);
+
+ SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SDLoc(), IdxVT);
+ SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor);
+
+ SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0));
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val);
+
+ SDValue NewVec = CastVec;
+ for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
+ SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT);
+ SDValue InEltIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset);
+
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT,
+ CastVal, IdxOffset);
+
+ NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NVT,
+ NewVec, Elt, InEltIdx);
+ }
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewVec));
+ break;
+ }
+ case ISD::SCALAR_TO_VECTOR: {
+ MVT EltVT = OVT.getVectorElementType();
+ MVT NewEltVT = NVT.getVectorElementType();
+
+ // Handle bitcasts to different vector type with the same total bit size.
+ //
+ // e.g. v2i64 = scalar_to_vector x:i64
+ // =>
+ // concat_vectors (v2i32 bitcast x:i64), (v2i32 undef)
+ //
+
+ MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
+ SDValue Val = Node->getOperand(0);
+ SDLoc SL(Node);
+
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val);
+ SDValue Undef = DAG.getUNDEF(MidVT);
+
+ SmallVector<SDValue, 8> NewElts;
+ NewElts.push_back(CastVal);
+ for (unsigned I = 1, NElts = OVT.getVectorNumElements(); I != NElts; ++I)
+ NewElts.push_back(Undef);
+
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewElts);
+ SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
+ Results.push_back(CvtVec);
+ break;
+ }
+ case ISD::ATOMIC_SWAP: {
+ AtomicSDNode *AM = cast<AtomicSDNode>(Node);
+ SDLoc SL(Node);
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NVT, AM->getVal());
+ assert(NVT.getSizeInBits() == OVT.getSizeInBits() &&
+ "unexpected promotion type");
+ assert(AM->getMemoryVT().getSizeInBits() == NVT.getSizeInBits() &&
+ "unexpected atomic_swap with illegal type");
+
+ SDValue NewAtomic
+ = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, NVT,
+ DAG.getVTList(NVT, MVT::Other),
+ { AM->getChain(), AM->getBasePtr(), CastVal },
+ AM->getMemOperand());
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewAtomic));
+ Results.push_back(NewAtomic.getValue(1));
+ break;
+ }
+ }
+
+ // Replace the original node with the legalized result.
+ if (!Results.empty()) {
+ LLVM_DEBUG(dbgs() << "Successfully promoted node\n");
+ ReplaceNode(Node, Results.data());
+ } else
+ LLVM_DEBUG(dbgs() << "Could not promote node\n");
+}
+
+/// This is the entry point for the file.
+void SelectionDAG::Legalize() {
+ AssignTopologicalOrder();
+
+ SmallPtrSet<SDNode *, 16> LegalizedNodes;
+ // Use a delete listener to remove nodes which were deleted during
+ // legalization from LegalizedNodes. This is needed to handle the situation
+ // where a new node is allocated by the object pool at the same address as a
+ // previously deleted node.
+ DAGNodeDeletedListener DeleteListener(
+ *this,
+ [&LegalizedNodes](SDNode *N, SDNode *E) { LegalizedNodes.erase(N); });
+
+ SelectionDAGLegalize Legalizer(*this, LegalizedNodes);
+
+ // Visit all the nodes. We start in topological order, so that we see
+ // nodes with their original operands intact. Legalization can produce
+ // new nodes which may themselves need to be legalized. Iterate until all
+ // nodes have been legalized.
+ while (true) {
+ bool AnyLegalized = false;
+ for (auto NI = allnodes_end(); NI != allnodes_begin();) {
+ --NI;
+
+ SDNode *N = &*NI;
+ if (N->use_empty() && N != getRoot().getNode()) {
+ ++NI;
+ DeleteNode(N);
+ continue;
+ }
+
+ if (LegalizedNodes.insert(N).second) {
+ AnyLegalized = true;
+ Legalizer.LegalizeOp(N);
+
+ if (N->use_empty() && N != getRoot().getNode()) {
+ ++NI;
+ DeleteNode(N);
+ }
+ }
+ }
+ if (!AnyLegalized)
+ break;
+
+ }
+
+ // Remove dead nodes now.
+ RemoveDeadNodes();
+}
+
+bool SelectionDAG::LegalizeOp(SDNode *N,
+ SmallSetVector<SDNode *, 16> &UpdatedNodes) {
+ SmallPtrSet<SDNode *, 16> LegalizedNodes;
+ SelectionDAGLegalize Legalizer(*this, LegalizedNodes, &UpdatedNodes);
+
+ // Directly insert the node in question, and legalize it. This will recurse
+ // as needed through operands.
+ LegalizedNodes.insert(N);
+ Legalizer.LegalizeOp(N);
+
+ return LegalizedNodes.count(N);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
new file mode 100644
index 000000000000..7e035d21ef71
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -0,0 +1,3207 @@
+//===-------- LegalizeFloatTypes.cpp - Legalization of float types --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements float type expansion and softening for LegalizeTypes.
+// Softening is the act of turning a computation in an illegal floating point
+// type into a computation in an integer type of the same size; also known as
+// "soft float". For example, turning f32 arithmetic into operations using i32.
+// The resulting integer value is the same as what you would get by performing
+// the floating point operation and bitcasting the result to the integer type.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type. For example,
+// implementing ppcf128 arithmetic in two f64 registers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+/// GetFPLibCall - Return the right libcall for the given floating point type.
+/// FIXME: This is a local version of RTLIB::getFPLibCall that should be
+/// refactored away (see RTLIB::getPOWI for an example).
+static RTLIB::Libcall GetFPLibCall(EVT VT,
+ RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128) {
+ return
+ VT == MVT::f32 ? Call_F32 :
+ VT == MVT::f64 ? Call_F64 :
+ VT == MVT::f80 ? Call_F80 :
+ VT == MVT::f128 ? Call_F128 :
+ VT == MVT::ppcf128 ? Call_PPCF128 :
+ RTLIB::UNKNOWN_LIBCALL;
+}
+
+//===----------------------------------------------------------------------===//
+// Convert Float Results to Integer
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue R = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SoftenFloatResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to soften the result of this "
+ "operator!");
+
+ case ISD::ARITH_FENCE: R = SoftenFloatRes_ARITH_FENCE(N); break;
+ case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break;
+ case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break;
+ case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break;
+ case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N, ResNo); break;
+ case ISD::FABS: R = SoftenFloatRes_FABS(N); break;
+ case ISD::STRICT_FMINNUM:
+ case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break;
+ case ISD::STRICT_FMAXNUM:
+ case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break;
+ case ISD::STRICT_FADD:
+ case ISD::FADD: R = SoftenFloatRes_FADD(N); break;
+ case ISD::FCBRT: R = SoftenFloatRes_FCBRT(N); break;
+ case ISD::STRICT_FCEIL:
+ case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break;
+ case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break;
+ case ISD::STRICT_FCOS:
+ case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break;
+ case ISD::STRICT_FDIV:
+ case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break;
+ case ISD::STRICT_FEXP:
+ case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break;
+ case ISD::STRICT_FEXP2:
+ case ISD::FEXP2: R = SoftenFloatRes_FEXP2(N); break;
+ case ISD::STRICT_FFLOOR:
+ case ISD::FFLOOR: R = SoftenFloatRes_FFLOOR(N); break;
+ case ISD::STRICT_FLOG:
+ case ISD::FLOG: R = SoftenFloatRes_FLOG(N); break;
+ case ISD::STRICT_FLOG2:
+ case ISD::FLOG2: R = SoftenFloatRes_FLOG2(N); break;
+ case ISD::STRICT_FLOG10:
+ case ISD::FLOG10: R = SoftenFloatRes_FLOG10(N); break;
+ case ISD::STRICT_FMA:
+ case ISD::FMA: R = SoftenFloatRes_FMA(N); break;
+ case ISD::STRICT_FMUL:
+ case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break;
+ case ISD::STRICT_FNEARBYINT:
+ case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break;
+ case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break;
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break;
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break;
+ case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break;
+ case ISD::BF16_TO_FP: R = SoftenFloatRes_BF16_TO_FP(N); break;
+ case ISD::STRICT_FPOW:
+ case ISD::FPOW: R = SoftenFloatRes_FPOW(N); break;
+ case ISD::STRICT_FPOWI:
+ case ISD::FPOWI:
+ case ISD::FLDEXP:
+ case ISD::STRICT_FLDEXP: R = SoftenFloatRes_ExpOp(N); break;
+ case ISD::FFREXP:
+ R = SoftenFloatRes_FFREXP(N);
+ break;
+ case ISD::STRICT_FREM:
+ case ISD::FREM: R = SoftenFloatRes_FREM(N); break;
+ case ISD::STRICT_FRINT:
+ case ISD::FRINT: R = SoftenFloatRes_FRINT(N); break;
+ case ISD::STRICT_FROUND:
+ case ISD::FROUND: R = SoftenFloatRes_FROUND(N); break;
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::FROUNDEVEN: R = SoftenFloatRes_FROUNDEVEN(N); break;
+ case ISD::STRICT_FSIN:
+ case ISD::FSIN: R = SoftenFloatRes_FSIN(N); break;
+ case ISD::STRICT_FSQRT:
+ case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break;
+ case ISD::STRICT_FSUB:
+ case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break;
+ case ISD::STRICT_FTRUNC:
+ case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break;
+ case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break;
+ case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+ case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break;
+ case ISD::FREEZE: R = SoftenFloatRes_FREEZE(N); break;
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break;
+ case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break;
+ case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break;
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ R = SoftenFloatRes_VECREDUCE(N);
+ break;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ R = SoftenFloatRes_VECREDUCE_SEQ(N);
+ break;
+ }
+
+ // If R is null, the sub-method took care of registering the result.
+ if (R.getNode()) {
+ assert(R.getNode() != N);
+ SetSoftenedFloat(SDValue(N, ResNo), R);
+ }
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC) {
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned Offset = IsStrict ? 1 : 0;
+ assert(N->getNumOperands() == (1 + Offset) &&
+ "Unexpected number of operands!");
+ SDValue Op = GetSoftenedFloat(N->getOperand(0 + Offset));
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpVT = N->getOperand(0 + Offset).getValueType();
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+ CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC) {
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned Offset = IsStrict ? 1 : 0;
+ assert(N->getNumOperands() == (2 + Offset) &&
+ "Unexpected number of operands!");
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0 + Offset)),
+ GetSoftenedFloat(N->getOperand(1 + Offset)) };
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(),
+ N->getOperand(1 + Offset).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops,
+ CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) {
+ return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FREEZE(SDNode *N) {
+ EVT Ty = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getNode(ISD::FREEZE, SDLoc(N), Ty,
+ GetSoftenedFloat(N->getOperand(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_ARITH_FENCE(SDNode *N) {
+ EVT Ty = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue NewFence = DAG.getNode(ISD::ARITH_FENCE, SDLoc(N), Ty,
+ GetSoftenedFloat(N->getOperand(0)));
+ return NewFence;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return BitConvertToInteger(Op);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) {
+ // Convert the inputs to integers, and build a new pair out of them.
+ return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)),
+ BitConvertToInteger(N->getOperand(0)),
+ BitConvertToInteger(N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N) {
+ ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+ // In ppcf128, the high 64 bits are always first in memory regardless
+ // of Endianness. LLVM's APFloat representation is not Endian sensitive,
+ // and so always converts into a 128-bit APInt in a non-Endian-sensitive
+ // way. However, APInts are serialized in an Endian-sensitive fashion,
+ // so on big-Endian targets, the two doubles are output in the wrong
+ // order. Fix this by manually flipping the order of the high 64 bits
+ // and the low 64 bits here.
+ if (DAG.getDataLayout().isBigEndian() &&
+ CN->getValueType(0).getSimpleVT() == llvm::MVT::ppcf128) {
+ uint64_t words[2] = { CN->getValueAPF().bitcastToAPInt().getRawData()[1],
+ CN->getValueAPF().bitcastToAPInt().getRawData()[0] };
+ APInt Val(128, words);
+ return DAG.getConstant(Val, SDLoc(CN),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ CN->getValueType(0)));
+ } else {
+ return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ CN->getValueType(0)));
+ }
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo) {
+ SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ NewOp.getValueType().getVectorElementType(),
+ NewOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned Size = NVT.getSizeInBits();
+
+ // Mask = ~(1 << (Size-1))
+ APInt API = APInt::getAllOnes(Size);
+ API.clearBit(Size - 1);
+ SDValue Mask = DAG.getConstant(API, SDLoc(N), NVT);
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return DAG.getNode(ISD::AND, SDLoc(N), NVT, Op, Mask);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) {
+ if (SDValue SelCC = TLI.createSelectForFMINNUM_FMAXNUM(N, DAG))
+ return SoftenFloatRes_SELECT_CC(SelCC.getNode());
+ return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMIN_F32,
+ RTLIB::FMIN_F64,
+ RTLIB::FMIN_F80,
+ RTLIB::FMIN_F128,
+ RTLIB::FMIN_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) {
+ if (SDValue SelCC = TLI.createSelectForFMINNUM_FMAXNUM(N, DAG))
+ return SoftenFloatRes_SELECT_CC(SelCC.getNode());
+ return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMAX_F32,
+ RTLIB::FMAX_F64,
+ RTLIB::FMAX_F80,
+ RTLIB::FMAX_F128,
+ RTLIB::FMAX_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
+ return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::ADD_F32,
+ RTLIB::ADD_F64,
+ RTLIB::ADD_F80,
+ RTLIB::ADD_F128,
+ RTLIB::ADD_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCBRT(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::CBRT_F32,
+ RTLIB::CBRT_F64,
+ RTLIB::CBRT_F80,
+ RTLIB::CBRT_F128,
+ RTLIB::CBRT_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::CEIL_F32,
+ RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80,
+ RTLIB::CEIL_F128,
+ RTLIB::CEIL_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
+ SDValue LHS = GetSoftenedFloat(N->getOperand(0));
+ SDValue RHS = BitConvertToInteger(N->getOperand(1));
+ SDLoc dl(N);
+
+ EVT LVT = LHS.getValueType();
+ EVT RVT = RHS.getValueType();
+
+ unsigned LSize = LVT.getSizeInBits();
+ unsigned RSize = RVT.getSizeInBits();
+
+ // First get the sign bit of second operand.
+ SDValue SignBit = DAG.getNode(
+ ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT),
+ DAG.getConstant(RSize - 1, dl,
+ TLI.getShiftAmountTy(RVT, DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+ // Shift right or sign-extend it if the two operands have different types.
+ int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+ if (SizeDiff > 0) {
+ SignBit =
+ DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+ DAG.getConstant(SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+ } else if (SizeDiff < 0) {
+ SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+ SignBit =
+ DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+ DAG.getConstant(-SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ }
+
+ // Clear the sign bit of the first operand.
+ SDValue Mask = DAG.getNode(
+ ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT),
+ DAG.getConstant(LSize - 1, dl,
+ TLI.getShiftAmountTy(LVT, DAG.getDataLayout())));
+ Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, dl, LVT));
+ LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::COS_F32,
+ RTLIB::COS_F64,
+ RTLIB::COS_F80,
+ RTLIB::COS_F128,
+ RTLIB::COS_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
+ return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::DIV_F32,
+ RTLIB::DIV_F64,
+ RTLIB::DIV_F80,
+ RTLIB::DIV_F128,
+ RTLIB::DIV_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP_F32,
+ RTLIB::EXP_F64,
+ RTLIB::EXP_F80,
+ RTLIB::EXP_F128,
+ RTLIB::EXP_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP2_F32,
+ RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80,
+ RTLIB::EXP2_F128,
+ RTLIB::EXP2_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::FLOOR_F32,
+ RTLIB::FLOOR_F64,
+ RTLIB::FLOOR_F80,
+ RTLIB::FLOOR_F128,
+ RTLIB::FLOOR_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG_F32,
+ RTLIB::LOG_F64,
+ RTLIB::LOG_F80,
+ RTLIB::LOG_F128,
+ RTLIB::LOG_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG2_F32,
+ RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80,
+ RTLIB::LOG2_F128,
+ RTLIB::LOG2_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG10_F32,
+ RTLIB::LOG10_F64,
+ RTLIB::LOG10_F80,
+ RTLIB::LOG10_F128,
+ RTLIB::LOG10_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned Offset = IsStrict ? 1 : 0;
+ SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0 + Offset)),
+ GetSoftenedFloat(N->getOperand(1 + Offset)),
+ GetSoftenedFloat(N->getOperand(2 + Offset)) };
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[3] = { N->getOperand(0 + Offset).getValueType(),
+ N->getOperand(1 + Offset).getValueType(),
+ N->getOperand(2 + Offset).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG,
+ GetFPLibCall(N->getValueType(0),
+ RTLIB::FMA_F32,
+ RTLIB::FMA_F64,
+ RTLIB::FMA_F80,
+ RTLIB::FMA_F128,
+ RTLIB::FMA_PPCF128),
+ NVT, Ops, CallOptions, SDLoc(N), Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
+ return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::MUL_F32,
+ RTLIB::MUL_F64,
+ RTLIB::MUL_F80,
+ RTLIB::MUL_F128,
+ RTLIB::MUL_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_F128,
+ RTLIB::NEARBYINT_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ // Expand Y = FNEG(X) -> Y = X ^ sign mask
+ APInt SignMask = APInt::getSignMask(NVT.getSizeInBits());
+ return DAG.getNode(ISD::XOR, dl, NVT, GetSoftenedFloat(N->getOperand(0)),
+ DAG.getConstant(SignMask, dl, NVT));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) {
+ Op = GetPromotedFloat(Op);
+ // If the promotion did the FP_EXTEND to the destination type for us,
+ // there's nothing left to do here.
+ if (Op.getValueType() == N->getValueType(0))
+ return BitConvertToInteger(Op);
+ }
+
+ // There's only a libcall for f16 -> f32 and shifting is only valid for bf16
+ // -> f32, so proceed in two stages. Also, it's entirely possible for both
+ // f16 and f32 to be legal, so use the fully hard-float FP_EXTEND rather
+ // than FP16_TO_FP.
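+ // Illustration: softening an f16 -> f64 extend therefore becomes an
+ // f16 -> f32 FP_EXTEND (legalized on its own) followed by the f32 -> f64
+ // extension libcall selected below.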
+ if ((Op.getValueType() == MVT::f16 || Op.getValueType() == MVT::bf16) &&
+ N->getValueType(0) != MVT::f32) {
+ if (IsStrict) {
+ Op = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N),
+ { MVT::f32, MVT::Other }, { Chain, Op });
+ Chain = Op.getValue(1);
+ } else {
+ Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op);
+ }
+ }
+
+ if (Op.getValueType() == MVT::bf16)
+ return SoftenFloatRes_BF16_TO_FP(N);
+
+ RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType();
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+ CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ return Tmp.first;
+}
+
+// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
+// nodes?
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) {
+ EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32);
+ SDValue Op = N->getOperand(0);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op,
+ CallOptions, SDLoc(N)).first;
+ if (N->getValueType(0) == MVT::f32)
+ return Res32;
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+ return TLI.makeLibCall(DAG, LC, NVT, Res32, CallOptions, SDLoc(N)).first;
+}
+
+// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
+// nodes?
+SDValue DAGTypeLegalizer::SoftenFloatRes_BF16_TO_FP(SDNode *N) {
+ assert(N->getValueType(0) == MVT::f32 &&
+ "Can only soften BF16_TO_FP with f32 result");
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32);
+ SDValue Op = N->getOperand(0);
+ SDLoc DL(N);
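+ // bf16 occupies the upper 16 bits of the corresponding f32, so widening the
+ // raw bits and shifting left by 16 reconstructs the f32 bit pattern.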
+ Op = DAG.getNode(ISD::ANY_EXTEND, DL, NVT,
+ DAG.getNode(ISD::BITCAST, DL, MVT::i16, Op));
+ SDValue Res = DAG.getNode(ISD::SHL, DL, NVT, Op,
+ DAG.getShiftAmountConstant(16, NVT, DL));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType();
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+ CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
+ return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::POW_F32,
+ RTLIB::POW_F64,
+ RTLIB::POW_F80,
+ RTLIB::POW_F128,
+ RTLIB::POW_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_ExpOp(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ assert((N->getOperand(1 + Offset).getValueType() == MVT::i16 ||
+ N->getOperand(1 + Offset).getValueType() == MVT::i32) &&
+ "Unsupported power type!");
+ bool IsPowI =
+ N->getOpcode() == ISD::FPOWI || N->getOpcode() == ISD::STRICT_FPOWI;
+
+ RTLIB::Libcall LC = IsPowI ? RTLIB::getPOWI(N->getValueType(0))
+ : RTLIB::getLDEXP(N->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fpowi.");
+ if (!TLI.getLibcallName(LC)) {
+ // Some targets don't have a powi libcall; use pow instead.
+ // FIXME: Implement this if some target needs it.
+ DAG.getContext()->emitError("Don't know how to soften fpowi to fpow");
+ return DAG.getUNDEF(N->getValueType(0));
+ }
+
+ if (DAG.getLibInfo().getIntSize() !=
+ N->getOperand(1 + Offset).getValueType().getSizeInBits()) {
+ // If the exponent does not match sizeof(int), a libcall to RTLIB::POWI
+ // would use the wrong type for the argument.
+ DAG.getContext()->emitError("POWI exponent does not match sizeof(int)");
+ return DAG.getUNDEF(N->getValueType(0));
+ }
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0 + Offset)),
+ N->getOperand(1 + Offset) };
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(),
+ N->getOperand(1 + Offset).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops,
+ CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FFREXP(SDNode *N) {
+ assert(!N->isStrictFPOpcode() && "strictfp not implemented for frexp");
+ EVT VT0 = N->getValueType(0);
+ EVT VT1 = N->getValueType(1);
+ RTLIB::Libcall LC = RTLIB::getFREXP(VT0);
+
+ if (DAG.getLibInfo().getIntSize() != VT1.getSizeInBits()) {
+ // If the exponent does not match sizeof(int), a libcall would use the
+ // wrong type for the argument.
+ // TODO: Should be able to handle mismatches.
+ DAG.getContext()->emitError("ffrexp exponent does not match sizeof(int)");
+ return DAG.getUNDEF(N->getValueType(0));
+ }
+
+ EVT NVT0 = TLI.getTypeToTransformTo(*DAG.getContext(), VT0);
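+ // frexp reports the exponent through its int* out-parameter, so pass a
+ // stack temporary as that pointer and load the exponent back after the call.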
+ SDValue StackSlot = DAG.CreateStackTemporary(VT1);
+
+ SDLoc DL(N);
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ SDValue Ops[2] = {GetSoftenedFloat(N->getOperand(0)), StackSlot};
+ EVT OpsVT[2] = {VT0, StackSlot.getValueType()};
+
+ // TODO: setTypeListBeforeSoften can't properly express multiple return types,
+ // but we only really need to handle the 0th one for softening anyway.
+ CallOptions.setTypeListBeforeSoften({OpsVT}, VT0, true);
+
+ auto [ReturnVal, Chain] = TLI.makeLibCall(DAG, LC, NVT0, Ops, CallOptions, DL,
+ /*Chain=*/SDValue());
+ int FrameIdx = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ auto PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
+
+ SDValue LoadExp = DAG.getLoad(VT1, DL, Chain, StackSlot, PtrInfo);
+
+ ReplaceValueWith(SDValue(N, 1), LoadExp);
+ return ReturnVal;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
+ return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::REM_F32,
+ RTLIB::REM_F64,
+ RTLIB::REM_F80,
+ RTLIB::REM_F128,
+ RTLIB::REM_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::RINT_F32,
+ RTLIB::RINT_F64,
+ RTLIB::RINT_F80,
+ RTLIB::RINT_F128,
+ RTLIB::RINT_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FROUNDEVEN(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUNDEVEN_F32,
+ RTLIB::ROUNDEVEN_F64,
+ RTLIB::ROUNDEVEN_F80,
+ RTLIB::ROUNDEVEN_F128,
+ RTLIB::ROUNDEVEN_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::SIN_F32,
+ RTLIB::SIN_F64,
+ RTLIB::SIN_F80,
+ RTLIB::SIN_F128,
+ RTLIB::SIN_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::SQRT_F32,
+ RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80,
+ RTLIB::SQRT_F128,
+ RTLIB::SQRT_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
+ return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::SUB_F32,
+ RTLIB::SUB_F64,
+ RTLIB::SUB_F80,
+ RTLIB::SUB_F128,
+ RTLIB::SUB_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::TRUNC_F32,
+ RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80,
+ RTLIB::TRUNC_F128,
+ RTLIB::TRUNC_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDLoc dl(N);
+
+ auto MMOFlags =
+ L->getMemOperand()->getFlags() &
+ ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
+ SDValue NewL;
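+ // A non-extending load can simply reload the same bits as the softened
+ // integer type.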
+ if (L->getExtensionType() == ISD::NON_EXTLOAD) {
+ NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), NVT, dl,
+ L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), NVT, L->getOriginalAlign(),
+ MMOFlags, L->getAAInfo());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ return NewL;
+ }
+
+ // Do a non-extending load followed by FP_EXTEND.
+ NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD, L->getMemoryVT(),
+ dl, L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), L->getMemoryVT(),
+ L->getOriginalAlign(), MMOFlags, L->getAAInfo());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL);
+ return BitConvertToInteger(ExtendNode);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
+ SDValue LHS = GetSoftenedFloat(N->getOperand(1));
+ SDValue RHS = GetSoftenedFloat(N->getOperand(2));
+ return DAG.getSelect(SDLoc(N),
+ LHS.getValueType(), N->getOperand(0), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) {
+ SDValue LHS = GetSoftenedFloat(N->getOperand(2));
+ SDValue RHS = GetSoftenedFloat(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
+ LHS.getValueType(), N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) {
+ SDValue Chain = N->getOperand(0); // Get the chain.
+ SDValue Ptr = N->getOperand(1); // Get the pointer.
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDLoc dl(N);
+
+ SDValue NewVAARG;
+ NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2),
+ N->getConstantOperandVal(3));
+
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ if (N != NewVAARG.getValue(1).getNode())
+ ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1));
+ return NewVAARG;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool Signed = N->getOpcode() == ISD::SINT_TO_FP ||
+ N->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ EVT SVT = N->getOperand(IsStrict ? 1 : 0).getValueType();
+ EVT RVT = N->getValueType(0);
+ EVT NVT = EVT();
+ SDLoc dl(N);
+
+ // If the input is not legal, eg: i1 -> fp, then it needs to be promoted to
+ // a larger type, eg: i8 -> fp. Even if it is legal, no libcall may exactly
+ // match. Look for an appropriate libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE;
+ t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; ++t) {
+ NVT = (MVT::SimpleValueType)t;
+ // The source needs to be big enough to hold the operand.
+ if (NVT.bitsGE(SVT))
+ LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT) : RTLIB::getUINTTOFP(NVT, RVT);
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ // Sign/zero extend the argument if the libcall takes a larger type.
+ SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ NVT, N->getOperand(IsStrict ? 1 : 0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(Signed);
+ CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
+ Op, CallOptions, dl, Chain);
+
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_VECREDUCE(SDNode *N) {
+ // Expand and soften recursively.
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG));
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_VECREDUCE_SEQ(SDNode *N) {
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduceSeq(N, DAG));
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Convert Float Operand to Integer
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Soften float operand " << OpNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SoftenFloatOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to soften this operator's operand!");
+
+ case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break;
+ case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break;
+ case ISD::STRICT_FP_TO_FP16:
+ case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes
+ case ISD::FP_TO_BF16:
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break;
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_XINT(N); break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Res = SoftenFloatOp_FP_TO_XINT_SAT(N); break;
+ case ISD::STRICT_LROUND:
+ case ISD::LROUND: Res = SoftenFloatOp_LROUND(N); break;
+ case ISD::STRICT_LLROUND:
+ case ISD::LLROUND: Res = SoftenFloatOp_LLROUND(N); break;
+ case ISD::STRICT_LRINT:
+ case ISD::LRINT: Res = SoftenFloatOp_LRINT(N); break;
+ case ISD::STRICT_LLRINT:
+ case ISD::LLRINT: Res = SoftenFloatOp_LLRINT(N); break;
+ case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break;
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
+ case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break;
+ case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); break;
+ case ISD::FCOPYSIGN: Res = SoftenFloatOp_FCOPYSIGN(N); break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this so it re-analyzes the node.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand softening");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
+ SDValue Op0 = GetSoftenedFloat(N->getOperand(0));
+
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
+ // We actually deal with the partially-softened FP_TO_FP16 node too, which
+ // returns an i16 so doesn't meet the constraints necessary for FP_ROUND.
+ assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16 ||
+ N->getOpcode() == ISD::STRICT_FP_TO_FP16 ||
+ N->getOpcode() == ISD::FP_TO_BF16 ||
+ N->getOpcode() == ISD::STRICT_FP_ROUND);
+
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ EVT SVT = Op.getValueType();
+ EVT RVT = N->getValueType(0);
+ EVT FloatRVT = RVT;
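+ // FP_TO_FP16 and FP_TO_BF16 return an integer, so pick the libcall as if
+ // rounding to the corresponding half-precision FP type.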
+ if (N->getOpcode() == ISD::FP_TO_FP16 ||
+ N->getOpcode() == ISD::STRICT_FP_TO_FP16)
+ FloatRVT = MVT::f16;
+ else if (N->getOpcode() == ISD::FP_TO_BF16)
+ FloatRVT = MVT::bf16;
+
+ RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
+
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ Op = GetSoftenedFloat(Op);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RVT, Op,
+ CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict) {
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ ReplaceValueWith(SDValue(N, 0), Tmp.first);
+ return SDValue();
+ }
+ return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+
+ EVT VT = NewLHS.getValueType();
+ NewLHS = GetSoftenedFloat(NewLHS);
+ NewRHS = GetSoftenedFloat(NewRHS);
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N),
+ N->getOperand(2), N->getOperand(3));
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4)),
+ 0);
+}
+
+// Even if the result type is legal, no libcall may exactly match (e.g. we
+// don't have FP-to-i8 conversions). This helper method looks for an
+// appropriate promoted libcall.
+static RTLIB::Libcall findFPToIntLibcall(EVT SrcVT, EVT RetVT, EVT &Promoted,
+ bool Signed) {
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
+ IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL;
+ ++IntVT) {
+ Promoted = (MVT::SimpleValueType)IntVT;
+ // The type needs to be big enough to hold the result.
+ if (Promoted.bitsGE(RetVT))
+ LC = Signed ? RTLIB::getFPTOSINT(SrcVT, Promoted)
+ : RTLIB::getFPTOUINT(SrcVT, Promoted);
+ }
+ return LC;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool Signed = N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT;
+
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ EVT SVT = Op.getValueType();
+ EVT RVT = N->getValueType(0);
+ EVT NVT = EVT();
+ SDLoc dl(N);
+
+ // If the result is not legal, eg: fp -> i1, then it needs to be promoted to
+ // a larger type, eg: fp -> i32. Even if it is legal, no libcall may exactly
+ // match, eg. we don't have fp -> i8 conversions.
+ // Look for an appropriate libcall.
+ RTLIB::Libcall LC = findFPToIntLibcall(SVT, RVT, NVT, Signed);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && NVT.isSimple() &&
+ "Unsupported FP_TO_XINT!");
+
+ Op = GetSoftenedFloat(Op);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+ CallOptions, dl, Chain);
+
+ // Truncate the result if the libcall returns a larger type.
+ SDValue Res = DAG.getNode(ISD::TRUNCATE, dl, RVT, Tmp.first);
+
+ if (!IsStrict)
+ return Res;
+
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT_SAT(SDNode *N) {
+ SDValue Res = TLI.expandFP_TO_INT_SAT(N, DAG);
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+ EVT VT = NewLHS.getValueType();
+ NewLHS = GetSoftenedFloat(NewLHS);
+ NewRHS = GetSoftenedFloat(NewRHS);
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N),
+ N->getOperand(0), N->getOperand(1));
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode)),
+ 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = N->getOperand(IsStrict ? 2 : 1);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ ISD::CondCode CCCode =
+ cast<CondCodeSDNode>(N->getOperand(IsStrict ? 3 : 2))->get();
+
+ EVT VT = Op0.getValueType();
+ SDValue NewLHS = GetSoftenedFloat(Op0);
+ SDValue NewRHS = GetSoftenedFloat(Op1);
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N), Op0, Op1,
+ Chain, N->getOpcode() == ISD::STRICT_FSETCCS);
+
+ // Update N to have the operands specified.
+ if (NewRHS.getNode()) {
+ if (IsStrict)
+ NewLHS = DAG.getNode(ISD::SETCC, SDLoc(N), N->getValueType(0), NewLHS,
+ NewRHS, DAG.getCondCode(CCCode));
+ else
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ DAG.getCondCode(CCCode)), 0);
+ }
+
+ // Otherwise, softenSetCCOperands returned a scalar, use it.
+ assert((NewRHS.getNode() || NewLHS.getValueType() == N->getValueType(0)) &&
+ "Unexpected setcc expansion!");
+
+ if (IsStrict) {
+ ReplaceValueWith(SDValue(N, 0), NewLHS);
+ ReplaceValueWith(SDValue(N, 1), Chain);
+ return SDValue();
+ }
+ return NewLHS;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only soften the stored value!");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Val = ST->getValue();
+ SDLoc dl(N);
+
+ if (ST->isTruncatingStore())
+ // Do an FP_ROUND followed by a non-truncating store.
+ Val = BitConvertToInteger(
+ DAG.getNode(ISD::FP_ROUND, dl, ST->getMemoryVT(), Val,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)));
+ else
+ Val = GetSoftenedFloat(Val);
+
+ return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(),
+ ST->getMemOperand());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FCOPYSIGN(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = BitConvertToInteger(N->getOperand(1));
+ SDLoc dl(N);
+
+ EVT LVT = LHS.getValueType();
+ EVT ILVT = EVT::getIntegerVT(*DAG.getContext(), LVT.getSizeInBits());
+ EVT RVT = RHS.getValueType();
+
+ unsigned LSize = LVT.getSizeInBits();
+ unsigned RSize = RVT.getSizeInBits();
+
+ // If the two operands have different sizes, line up the sign bit of RHS:
+ // shift right and truncate when RHS is wider, extend and shift left otherwise.
+ int SizeDiff = RSize - LSize;
+ if (SizeDiff > 0) {
+ RHS =
+ DAG.getNode(ISD::SRL, dl, RVT, RHS,
+ DAG.getConstant(SizeDiff, dl,
+ TLI.getShiftAmountTy(RHS.getValueType(),
+ DAG.getDataLayout())));
+ RHS = DAG.getNode(ISD::TRUNCATE, dl, ILVT, RHS);
+ } else if (SizeDiff < 0) {
+ RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, RHS);
+ RHS =
+ DAG.getNode(ISD::SHL, dl, ILVT, RHS,
+ DAG.getConstant(-SizeDiff, dl,
+ TLI.getShiftAmountTy(RHS.getValueType(),
+ DAG.getDataLayout())));
+ }
+
+ RHS = DAG.getBitcast(LVT, RHS);
+ return DAG.getNode(ISD::FCOPYSIGN, dl, LVT, LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_Unary(SDNode *N, RTLIB::Libcall LC) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ bool IsStrict = N->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SDValue Op = GetSoftenedFloat(N->getOperand(0 + Offset));
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpVT = N->getOperand(0 + Offset).getValueType();
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+ CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict) {
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ ReplaceValueWith(SDValue(N, 0), Tmp.first);
+ return SDValue();
+ }
+
+ return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_LROUND(SDNode *N) {
+ EVT OpVT = N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType();
+ return SoftenFloatOp_Unary(N, GetFPLibCall(OpVT,
+ RTLIB::LROUND_F32,
+ RTLIB::LROUND_F64,
+ RTLIB::LROUND_F80,
+ RTLIB::LROUND_F128,
+ RTLIB::LROUND_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_LLROUND(SDNode *N) {
+ EVT OpVT = N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType();
+ return SoftenFloatOp_Unary(N, GetFPLibCall(OpVT,
+ RTLIB::LLROUND_F32,
+ RTLIB::LLROUND_F64,
+ RTLIB::LLROUND_F80,
+ RTLIB::LLROUND_F128,
+ RTLIB::LLROUND_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_LRINT(SDNode *N) {
+ EVT OpVT = N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType();
+ return SoftenFloatOp_Unary(N, GetFPLibCall(OpVT,
+ RTLIB::LRINT_F32,
+ RTLIB::LRINT_F64,
+ RTLIB::LRINT_F80,
+ RTLIB::LRINT_F128,
+ RTLIB::LRINT_PPCF128));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_LLRINT(SDNode *N) {
+ EVT OpVT = N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType();
+ return SoftenFloatOp_Unary(N, GetFPLibCall(OpVT,
+ RTLIB::LLRINT_F32,
+ RTLIB::LLRINT_F64,
+ RTLIB::LLRINT_F80,
+ RTLIB::LLRINT_F128,
+ RTLIB::LLRINT_PPCF128));
+}
+
+//===----------------------------------------------------------------------===//
+// Float Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatResult - This method is called when the specified result of the
+/// specified node is found to need expansion. At this point, the node may also
+/// have invalid operands or may have other results that need promotion; we just
+/// know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Expand float result: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Lo, Hi;
+ Lo = Hi = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ExpandFloatResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to expand the result of this "
+ "operator!");
+
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+
+ case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+ case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break;
+ case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+ case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+ case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
+
+ case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+ case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break;
+ case ISD::STRICT_FMINNUM:
+ case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break;
+ case ISD::STRICT_FMAXNUM:
+ case ISD::FMAXNUM: ExpandFloatRes_FMAXNUM(N, Lo, Hi); break;
+ case ISD::STRICT_FADD:
+ case ISD::FADD: ExpandFloatRes_FADD(N, Lo, Hi); break;
+ case ISD::FCBRT: ExpandFloatRes_FCBRT(N, Lo, Hi); break;
+ case ISD::STRICT_FCEIL:
+ case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break;
+ case ISD::FCOPYSIGN: ExpandFloatRes_FCOPYSIGN(N, Lo, Hi); break;
+ case ISD::STRICT_FCOS:
+ case ISD::FCOS: ExpandFloatRes_FCOS(N, Lo, Hi); break;
+ case ISD::STRICT_FDIV:
+ case ISD::FDIV: ExpandFloatRes_FDIV(N, Lo, Hi); break;
+ case ISD::STRICT_FEXP:
+ case ISD::FEXP: ExpandFloatRes_FEXP(N, Lo, Hi); break;
+ case ISD::STRICT_FEXP2:
+ case ISD::FEXP2: ExpandFloatRes_FEXP2(N, Lo, Hi); break;
+ case ISD::STRICT_FFLOOR:
+ case ISD::FFLOOR: ExpandFloatRes_FFLOOR(N, Lo, Hi); break;
+ case ISD::STRICT_FLOG:
+ case ISD::FLOG: ExpandFloatRes_FLOG(N, Lo, Hi); break;
+ case ISD::STRICT_FLOG2:
+ case ISD::FLOG2: ExpandFloatRes_FLOG2(N, Lo, Hi); break;
+ case ISD::STRICT_FLOG10:
+ case ISD::FLOG10: ExpandFloatRes_FLOG10(N, Lo, Hi); break;
+ case ISD::STRICT_FMA:
+ case ISD::FMA: ExpandFloatRes_FMA(N, Lo, Hi); break;
+ case ISD::STRICT_FMUL:
+ case ISD::FMUL: ExpandFloatRes_FMUL(N, Lo, Hi); break;
+ case ISD::STRICT_FNEARBYINT:
+ case ISD::FNEARBYINT: ExpandFloatRes_FNEARBYINT(N, Lo, Hi); break;
+ case ISD::FNEG: ExpandFloatRes_FNEG(N, Lo, Hi); break;
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND: ExpandFloatRes_FP_EXTEND(N, Lo, Hi); break;
+ case ISD::STRICT_FPOW:
+ case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break;
+ case ISD::STRICT_FPOWI:
+ case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break;
+ case ISD::FLDEXP:
+ case ISD::STRICT_FLDEXP: ExpandFloatRes_FLDEXP(N, Lo, Hi); break;
+ case ISD::FREEZE: ExpandFloatRes_FREEZE(N, Lo, Hi); break;
+ case ISD::STRICT_FRINT:
+ case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break;
+ case ISD::STRICT_FROUND:
+ case ISD::FROUND: ExpandFloatRes_FROUND(N, Lo, Hi); break;
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::FROUNDEVEN: ExpandFloatRes_FROUNDEVEN(N, Lo, Hi); break;
+ case ISD::STRICT_FSIN:
+ case ISD::FSIN: ExpandFloatRes_FSIN(N, Lo, Hi); break;
+ case ISD::STRICT_FSQRT:
+ case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break;
+ case ISD::STRICT_FSUB:
+ case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break;
+ case ISD::STRICT_FTRUNC:
+ case ISD::FTRUNC: ExpandFloatRes_FTRUNC(N, Lo, Hi); break;
+ case ISD::LOAD: ExpandFloatRes_LOAD(N, Lo, Hi); break;
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break;
+ case ISD::STRICT_FREM:
+ case ISD::FREM: ExpandFloatRes_FREM(N, Lo, Hi); break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetExpandedFloat(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ assert(NVT.getSizeInBits() == 64 &&
+ "Do not know how to expand this float constant!");
+ APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
+ SDLoc dl(N);
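+ // A ppcf128 constant is a pair of doubles: the low 64 bits of the bit
+ // pattern hold the high-order double and the upper 64 bits the low-order one.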
+ Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(64, C.getRawData()[1])),
+ dl, NVT);
+ Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(64, C.getRawData()[0])),
+ dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_Unary(SDNode *N, RTLIB::Libcall LC,
+ SDValue &Lo, SDValue &Hi) {
+ bool IsStrict = N->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SDValue Op = N->getOperand(0 + Offset);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, N->getValueType(0),
+ Op, CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ GetPairElements(Tmp.first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_Binary(SDNode *N, RTLIB::Libcall LC,
+ SDValue &Lo, SDValue &Hi) {
+ bool IsStrict = N->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SDValue Ops[] = { N->getOperand(0 + Offset), N->getOperand(1 + Offset) };
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, N->getValueType(0),
+ Ops, CallOptions, SDLoc(N),
+ Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ GetPairElements(Tmp.first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(N->getValueType(0) == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDLoc dl(N);
+ SDValue Tmp;
+ GetExpandedFloat(N->getOperand(0), Lo, Tmp);
+ Hi = DAG.getNode(ISD::FABS, dl, Tmp.getValueType(), Tmp);
+ // Lo = Hi==fabs(Hi) ? Lo : -Lo;
+ Lo = DAG.getSelectCC(dl, Tmp, Hi, Lo,
+ DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo),
+ ISD::SETEQ);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMINNUM(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMIN_F32, RTLIB::FMIN_F64,
+ RTLIB::FMIN_F80, RTLIB::FMIN_F128,
+ RTLIB::FMIN_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMAXNUM(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMAX_F32, RTLIB::FMAX_F64,
+ RTLIB::FMAX_F80, RTLIB::FMAX_F128,
+ RTLIB::FMAX_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::ADD_F32, RTLIB::ADD_F64,
+ RTLIB::ADD_F80, RTLIB::ADD_F128,
+ RTLIB::ADD_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCBRT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32,
+ RTLIB::CBRT_F64, RTLIB::CBRT_F80,
+ RTLIB::CBRT_F128,
+ RTLIB::CBRT_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCEIL(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80, RTLIB::CEIL_F128,
+ RTLIB::CEIL_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOPYSIGN(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::COPYSIGN_F32,
+ RTLIB::COPYSIGN_F64,
+ RTLIB::COPYSIGN_F80,
+ RTLIB::COPYSIGN_F128,
+ RTLIB::COPYSIGN_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::COS_F32, RTLIB::COS_F64,
+ RTLIB::COS_F80, RTLIB::COS_F128,
+ RTLIB::COS_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::DIV_F32,
+ RTLIB::DIV_F64,
+ RTLIB::DIV_F80,
+ RTLIB::DIV_F128,
+ RTLIB::DIV_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP_F32, RTLIB::EXP_F64,
+ RTLIB::EXP_F80, RTLIB::EXP_F128,
+ RTLIB::EXP_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP2(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80, RTLIB::EXP2_F128,
+ RTLIB::EXP2_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FFLOOR(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+ RTLIB::FLOOR_F80, RTLIB::FLOOR_F128,
+ RTLIB::FLOOR_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG_F32, RTLIB::LOG_F64,
+ RTLIB::LOG_F80, RTLIB::LOG_F128,
+ RTLIB::LOG_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG2(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80, RTLIB::LOG2_F128,
+ RTLIB::LOG2_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+ RTLIB::LOG10_F80, RTLIB::LOG10_F128,
+ RTLIB::LOG10_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ bool IsStrict = N->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SDValue Ops[3] = { N->getOperand(0 + Offset), N->getOperand(1 + Offset),
+ N->getOperand(2 + Offset) };
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMA_F32,
+ RTLIB::FMA_F64,
+ RTLIB::FMA_F80,
+ RTLIB::FMA_F128,
+ RTLIB::FMA_PPCF128),
+ N->getValueType(0), Ops, CallOptions,
+ SDLoc(N), Chain);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ GetPairElements(Tmp.first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::MUL_F32,
+ RTLIB::MUL_F64,
+ RTLIB::MUL_F80,
+ RTLIB::MUL_F128,
+ RTLIB::MUL_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEARBYINT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_F128,
+ RTLIB::NEARBYINT_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedFloat(N->getOperand(0), Lo, Hi);
+ Lo = DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ bool IsStrict = N->isStrictFPOpcode();
+
+ SDValue Chain;
+ if (IsStrict) {
+ // If the expanded type is the same as the input type, just bypass the node.
+ if (NVT == N->getOperand(1).getValueType()) {
+ Hi = N->getOperand(1);
+ Chain = N->getOperand(0);
+ } else {
+ // Otherwise we need to extend.
+ Hi = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, { NVT, MVT::Other },
+ { N->getOperand(0), N->getOperand(1) });
+ Chain = Hi.getValue(1);
+ }
+ } else {
+ Hi = DAG.getNode(ISD::FP_EXTEND, dl, NVT, N->getOperand(0));
+ }
+
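+ // The low half of the expanded ppcf128 result is zero; the extended value
+ // fits entirely in the high-order double.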
+ Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(NVT.getSizeInBits(), 0)), dl, NVT);
+
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOW(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::POW_F32, RTLIB::POW_F64,
+ RTLIB::POW_F80, RTLIB::POW_F128,
+ RTLIB::POW_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Binary(N, RTLIB::getPOWI(N->getValueType(0)), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLDEXP(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Binary(N, RTLIB::getLDEXP(N->getValueType(0)), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FREEZE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ assert(N->getValueType(0) == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+
+ SDLoc dl(N);
+ GetExpandedFloat(N->getOperand(0), Lo, Hi);
+ Lo = DAG.getNode(ISD::FREEZE, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::FREEZE, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FREM(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::REM_F32, RTLIB::REM_F64,
+ RTLIB::REM_F80, RTLIB::REM_F128,
+ RTLIB::REM_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::RINT_F32, RTLIB::RINT_F64,
+ RTLIB::RINT_F80, RTLIB::RINT_F128,
+ RTLIB::RINT_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FROUNDEVEN(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUNDEVEN_F32,
+ RTLIB::ROUNDEVEN_F64,
+ RTLIB::ROUNDEVEN_F80,
+ RTLIB::ROUNDEVEN_F128,
+ RTLIB::ROUNDEVEN_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::SIN_F32, RTLIB::SIN_F64,
+ RTLIB::SIN_F80, RTLIB::SIN_F128,
+ RTLIB::SIN_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80, RTLIB::SQRT_F128,
+ RTLIB::SQRT_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::SUB_F32,
+ RTLIB::SUB_F64,
+ RTLIB::SUB_F80,
+ RTLIB::SUB_F128,
+ RTLIB::SUB_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80, RTLIB::TRUNC_F128,
+ RTLIB::TRUNC_PPCF128), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ if (ISD::isNormalLoad(N)) {
+ ExpandRes_NormalLoad(N, Lo, Hi);
+ return;
+ }
+
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+
+ Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr,
+ LD->getMemoryVT(), LD->getMemOperand());
+
+ // Remember the chain.
+ Chain = Hi.getValue(1);
+
+ // The low part is zero.
+ Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(NVT.getSizeInBits(), 0)), dl, NVT);
+
+ // Modified the chain - switch anything that used the old chain to use the
+ // new one.
+ ReplaceValueWith(SDValue(LD, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(N->getValueType(0) == MVT::ppcf128 && "Unsupported XINT_TO_FP!");
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ bool Strict = N->isStrictFPOpcode();
+ SDValue Src = N->getOperand(Strict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+ bool isSigned = N->getOpcode() == ISD::SINT_TO_FP ||
+ N->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ SDLoc dl(N);
+ SDValue Chain = Strict ? N->getOperand(0) : DAG.getEntryNode();
+
+ // TODO: Any other flags to propagate?
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(N->getFlags().hasNoFPExcept());
+
+ // First do an SINT_TO_FP, whether the original was signed or unsigned.
+ // When promoting partial word types to i32 we must honor the signedness,
+ // though.
+ if (SrcVT.bitsLE(MVT::i32)) {
+ // The integer can be represented exactly in an f64.
+ Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(NVT.getSizeInBits(), 0)), dl, NVT);
+ if (Strict) {
+ Hi = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(NVT, MVT::Other),
+ {Chain, Src}, Flags);
+ Chain = Hi.getValue(1);
+ } else
+ Hi = DAG.getNode(N->getOpcode(), dl, NVT, Src);
+ } else {
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (SrcVT.bitsLE(MVT::i64)) {
+ Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ MVT::i64, Src);
+ LC = RTLIB::SINTTOFP_I64_PPCF128;
+ } else if (SrcVT.bitsLE(MVT::i128)) {
+ Src = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i128, Src);
+ LC = RTLIB::SINTTOFP_I128_PPCF128;
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
+ if (Strict)
+ Chain = Tmp.second;
+ GetPairElements(Tmp.first, Lo, Hi);
+ }
+
+ // No need to complement for unsigned 32-bit integers
+ if (isSigned || SrcVT.bitsLE(MVT::i32)) {
+ if (Strict)
+ ReplaceValueWith(SDValue(N, 1), Chain);
+
+ return;
+ }
+
+ // Unsigned - fix up the SINT_TO_FP value just calculated.
+ // FIXME: For unsigned i128 to ppc_fp128 conversion, we need to carefully
+ // keep semantics correctness if the integer is not exactly representable
+ // here. See ExpandLegalINT_TO_FP.
+ Hi = DAG.getNode(ISD::BUILD_PAIR, dl, VT, Lo, Hi);
+ SrcVT = Src.getValueType();
+
+ // x>=0 ? (ppcf128)(iN)x : (ppcf128)(iN)x + 2^N; N=32,64,128.
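+ // Each table below holds the IEEE-754 double bit pattern for 2^N in its
+ // first element; the second element (the low-order double of the ppcf128
+ // pair) is zero.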
+ static const uint64_t TwoE32[] = { 0x41f0000000000000LL, 0 };
+ static const uint64_t TwoE64[] = { 0x43f0000000000000LL, 0 };
+ static const uint64_t TwoE128[] = { 0x47f0000000000000LL, 0 };
+ ArrayRef<uint64_t> Parts;
+
+ switch (SrcVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unsupported UINT_TO_FP!");
+ case MVT::i32:
+ Parts = TwoE32;
+ break;
+ case MVT::i64:
+ Parts = TwoE64;
+ break;
+ case MVT::i128:
+ Parts = TwoE128;
+ break;
+ }
+
+ // TODO: Are there other fast-math-flags to propagate to this FADD?
+ SDValue NewLo = DAG.getConstantFP(
+ APFloat(APFloat::PPCDoubleDouble(), APInt(128, Parts)), dl, MVT::ppcf128);
+ if (Strict) {
+ Lo = DAG.getNode(ISD::STRICT_FADD, dl, DAG.getVTList(VT, MVT::Other),
+ {Chain, Hi, NewLo}, Flags);
+ Chain = Lo.getValue(1);
+ ReplaceValueWith(SDValue(N, 1), Chain);
+ } else
+ Lo = DAG.getNode(ISD::FADD, dl, VT, Hi, NewLo);
+ Lo = DAG.getSelectCC(dl, Src, DAG.getConstant(0, dl, SrcVT),
+ Lo, Hi, ISD::SETLT);
+ GetPairElements(Lo, Lo, Hi);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Float Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatOperand - This method is called when the specified operand of the
+/// specified node is found to need expansion. At this point, all of the result
+/// types of the node are known to be legal, but other operands of the node may
+/// need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Expand float operand: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ExpandFloatOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to expand this operator's operand!");
+
+ case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break;
+ case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break;
+ case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+
+ case ISD::BR_CC: Res = ExpandFloatOp_BR_CC(N); break;
+ case ISD::FCOPYSIGN: Res = ExpandFloatOp_FCOPYSIGN(N); break;
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_ROUND: Res = ExpandFloatOp_FP_ROUND(N); break;
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_XINT(N); break;
+ case ISD::LROUND: Res = ExpandFloatOp_LROUND(N); break;
+ case ISD::LLROUND: Res = ExpandFloatOp_LLROUND(N); break;
+ case ISD::LRINT: Res = ExpandFloatOp_LRINT(N); break;
+ case ISD::LLRINT: Res = ExpandFloatOp_LLRINT(N); break;
+ case ISD::SELECT_CC: Res = ExpandFloatOp_SELECT_CC(N); break;
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
+ case ISD::SETCC: Res = ExpandFloatOp_SETCC(N); break;
+ case ISD::STORE: Res = ExpandFloatOp_STORE(cast<StoreSDNode>(N),
+ OpNo); break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// FloatExpandSetCCOperands - Expand the operands of a comparison. This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
+ SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ const SDLoc &dl, SDValue &Chain,
+ bool IsSignaling) {
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedFloat(NewLHS, LHSLo, LHSHi);
+ GetExpandedFloat(NewRHS, RHSLo, RHSHi);
+
+ assert(NewLHS.getValueType() == MVT::ppcf128 && "Unsupported setcc type!");
+
+ // FIXME: This generated code sucks. We want to generate
+ // FCMPU crN, hi1, hi2
+ // BNE crN, L:
+ // FCMPU crN, lo1, lo2
+ // The following can be improved, but not that much.
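+ // The expansion computes
+ //   (Hi1 == Hi2 && Lo1 <cc> Lo2) || (Hi1 != Hi2 && Hi1 <cc> Hi2):
+ // the low-order doubles decide the comparison only when the high-order
+ // doubles are equal.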
+ SDValue Tmp1, Tmp2, Tmp3, OutputChain;
+ Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi,
+ RHSHi, ISD::SETOEQ, Chain, IsSignaling);
+ OutputChain = Tmp1->getNumValues() > 1 ? Tmp1.getValue(1) : SDValue();
+ Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()), LHSLo,
+ RHSLo, CCCode, OutputChain, IsSignaling);
+ OutputChain = Tmp2->getNumValues() > 1 ? Tmp2.getValue(1) : SDValue();
+ Tmp3 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+ Tmp1 =
+ DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi, RHSHi,
+ ISD::SETUNE, OutputChain, IsSignaling);
+ OutputChain = Tmp1->getNumValues() > 1 ? Tmp1.getValue(1) : SDValue();
+ Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()), LHSHi,
+ RHSHi, CCCode, OutputChain, IsSignaling);
+ OutputChain = Tmp2->getNumValues() > 1 ? Tmp2.getValue(1) : SDValue();
+ Tmp1 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+ NewLHS = DAG.getNode(ISD::OR, dl, Tmp1.getValueType(), Tmp1, Tmp3);
+ NewRHS = SDValue(); // LHS is the result, not a compare.
+ Chain = OutputChain;
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ SDValue Chain;
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N), Chain);
+
+ // If ExpandSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FCOPYSIGN(SDNode *N) {
+ assert(N->getOperand(1).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDValue Lo, Hi;
+ GetExpandedFloat(N->getOperand(1), Lo, Hi);
+ // The ppcf128 value is providing only the sign; take it from the
+ // higher-order double (which must have the larger magnitude).
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N),
+ N->getValueType(0), N->getOperand(0), Hi);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ assert(N->getOperand(IsStrict ? 1 : 0).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDValue Lo, Hi;
+ GetExpandedFloat(N->getOperand(IsStrict ? 1 : 0), Lo, Hi);
+
+ if (!IsStrict)
+ // Round it the rest of the way (e.g. to f32) if needed.
+ return DAG.getNode(ISD::FP_ROUND, SDLoc(N),
+ N->getValueType(0), Hi, N->getOperand(1));
+
+ // Eliminate the node if the input float type is the same as the output float
+ // type.
+ if (Hi.getValueType() == N->getValueType(0)) {
+ // Connect the output chain to the input chain, unlinking the node.
+ ReplaceValueWith(SDValue(N, 1), N->getOperand(0));
+ ReplaceValueWith(SDValue(N, 0), Hi);
+ return SDValue();
+ }
+
+ SDValue Expansion = DAG.getNode(ISD::STRICT_FP_ROUND, SDLoc(N),
+ {N->getValueType(0), MVT::Other},
+ {N->getOperand(0), Hi, N->getOperand(2)});
+ ReplaceValueWith(SDValue(N, 1), Expansion.getValue(1));
+ ReplaceValueWith(SDValue(N, 0), Expansion);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_XINT(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ SDLoc dl(N);
+
+ bool IsStrict = N->isStrictFPOpcode();
+ bool Signed = N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT;
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+
+ EVT NVT;
+ RTLIB::Libcall LC = findFPToIntLibcall(Op.getValueType(), RVT, NVT, Signed);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && NVT.isSimple() &&
+ "Unsupported FP_TO_XINT!");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl, Chain);
+ if (!IsStrict)
+ return Tmp.first;
+
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ ReplaceValueWith(SDValue(N, 0), Tmp.first);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ SDValue Chain;
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N), Chain);
+
+ // If ExpandSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue NewLHS = N->getOperand(IsStrict ? 1 : 0);
+ SDValue NewRHS = N->getOperand(IsStrict ? 2 : 1);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ ISD::CondCode CCCode =
+ cast<CondCodeSDNode>(N->getOperand(IsStrict ? 3 : 2))->get();
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N), Chain,
+ N->getOpcode() == ISD::STRICT_FSETCCS);
+
+ // FloatExpandSetCCOperands always returned a scalar.
+ assert(!NewRHS.getNode() && "Expect to return scalar");
+ assert(NewLHS.getValueType() == N->getValueType(0) &&
+ "Unexpected setcc expansion!");
+ if (Chain) {
+ ReplaceValueWith(SDValue(N, 0), NewLHS);
+ ReplaceValueWith(SDValue(N, 1), Chain);
+ return SDValue();
+ }
+ return NewLHS;
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
+ if (ISD::isNormalStore(N))
+ return ExpandOp_NormalStore(N, OpNo);
+
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
+ ST->getValue().getValueType());
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ assert(ST->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+ (void)NVT;
+
+ SDValue Lo, Hi;
+ GetExpandedOp(ST->getValue(), Lo, Hi);
+
+ return DAG.getTruncStore(Chain, SDLoc(N), Hi, Ptr,
+ ST->getMemoryVT(), ST->getMemOperand());
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_LROUND(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ EVT RetVT = N->getOperand(0).getValueType();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
+ RTLIB::LROUND_F32,
+ RTLIB::LROUND_F64,
+ RTLIB::LROUND_F80,
+ RTLIB::LROUND_F128,
+ RTLIB::LROUND_PPCF128),
+ RVT, N->getOperand(0), CallOptions, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_LLROUND(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ EVT RetVT = N->getOperand(0).getValueType();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
+ RTLIB::LLROUND_F32,
+ RTLIB::LLROUND_F64,
+ RTLIB::LLROUND_F80,
+ RTLIB::LLROUND_F128,
+ RTLIB::LLROUND_PPCF128),
+ RVT, N->getOperand(0), CallOptions, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_LRINT(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ EVT RetVT = N->getOperand(0).getValueType();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
+ RTLIB::LRINT_F32,
+ RTLIB::LRINT_F64,
+ RTLIB::LRINT_F80,
+ RTLIB::LRINT_F128,
+ RTLIB::LRINT_PPCF128),
+ RVT, N->getOperand(0), CallOptions, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_LLRINT(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ EVT RetVT = N->getOperand(0).getValueType();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
+ RTLIB::LLRINT_F32,
+ RTLIB::LLRINT_F64,
+ RTLIB::LLRINT_F80,
+ RTLIB::LLRINT_F128,
+ RTLIB::LLRINT_PPCF128),
+ RVT, N->getOperand(0), CallOptions, SDLoc(N)).first;
+}
+
+//===----------------------------------------------------------------------===//
+// Float Operand Promotion
+//===----------------------------------------------------------------------===//
+//
+
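+// Return the conversion opcode used to move between an f16/bf16 value and
+// its promoted representation.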
+static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f16) {
+ return ISD::FP16_TO_FP;
+ } else if (RetVT == MVT::f16) {
+ return ISD::FP_TO_FP16;
+ } else if (OpVT == MVT::bf16) {
+ return ISD::BF16_TO_FP;
+ } else if (RetVT == MVT::bf16) {
+ return ISD::FP_TO_BF16;
+ }
+
+ report_fatal_error("Attempt at an invalid promotion-related conversion");
+}
+
+bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Promote float operand " << OpNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue R = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n");
+ return false;
+ }
+
+ // Nodes that use a promotion-requiring floating point operand, but don't
+ // produce a promotion-requiring floating point result, need to be legalized
+ // to use the promoted float operand. Nodes that produce at least one
+ // promotion-requiring floating point result have their operands legalized as
+ // a part of PromoteFloatResult.
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ dbgs() << "PromoteFloatOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+ #endif
+ report_fatal_error("Do not know how to promote this operator's operand!");
+
+ case ISD::BITCAST: R = PromoteFloatOp_BITCAST(N, OpNo); break;
+ case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: R = PromoteFloatOp_FP_TO_XINT(N, OpNo); break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break;
+ case ISD::FP_EXTEND: R = PromoteFloatOp_FP_EXTEND(N, OpNo); break;
+ case ISD::SELECT_CC: R = PromoteFloatOp_SELECT_CC(N, OpNo); break;
+ case ISD::SETCC: R = PromoteFloatOp_SETCC(N, OpNo); break;
+ case ISD::STORE: R = PromoteFloatOp_STORE(N, OpNo); break;
+ }
+
+ if (R.getNode())
+ ReplaceValueWith(SDValue(N, 0), R);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo) {
+ SDValue Op = N->getOperand(0);
+ EVT OpVT = Op->getValueType(0);
+
+ SDValue Promoted = GetPromotedFloat(N->getOperand(0));
+ EVT PromotedVT = Promoted->getValueType(0);
+
+ // Convert the promoted float value to the desired IVT.
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
+ SDValue Convert = DAG.getNode(GetPromotionOpcode(PromotedVT, OpVT), SDLoc(N),
+ IVT, Promoted);
+ // The final result type might not be a scalar so we need a bitcast. The
+ // bitcast will be further legalized if needed.
+ return DAG.getBitcast(N->getValueType(0), Convert);
+}
+
+// Promote Operand 1 of FCOPYSIGN. Operand 0 ought to be handled by
+// PromoteFloatRes_FCOPYSIGN.
+SDValue DAGTypeLegalizer::PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "Only Operand 1 must need promotion here");
+ SDValue Op1 = GetPromotedFloat(N->getOperand(1));
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ N->getOperand(0), Op1);
+}
+
+// Convert the promoted float value to the desired integer type
+SDValue DAGTypeLegalizer::PromoteFloatOp_FP_TO_XINT(SDNode *N, unsigned OpNo) {
+ SDValue Op = GetPromotedFloat(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N,
+ unsigned OpNo) {
+ SDValue Op = GetPromotedFloat(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), Op,
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) {
+ SDValue Op = GetPromotedFloat(N->getOperand(0));
+ EVT VT = N->getValueType(0);
+
+ // Desired VT is same as promoted type. Use promoted float directly.
+ if (VT == Op->getValueType(0))
+ return Op;
+
+ // Else, extend the promoted float value to the desired VT.
+ return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Op);
+}
+
+// Promote the float operands used for comparison. The true and false
+// operands have the same type as the result and are promoted, if needed, by
+// PromoteFloatRes_SELECT_CC.
+SDValue DAGTypeLegalizer::PromoteFloatOp_SELECT_CC(SDNode *N, unsigned OpNo) {
+ SDValue LHS = GetPromotedFloat(N->getOperand(0));
+ SDValue RHS = GetPromotedFloat(N->getOperand(1));
+
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
+ LHS, RHS, N->getOperand(2), N->getOperand(3),
+ N->getOperand(4));
+}
+
+// Construct a SETCC that compares the promoted values and sets the conditional
+// code.
+SDValue DAGTypeLegalizer::PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo) {
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+ SDValue Op1 = GetPromotedFloat(N->getOperand(1));
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+ return DAG.getSetCC(SDLoc(N), VT, Op0, Op1, CCCode);
+}
+
+// Lower the promoted float down to the integer value of the same size and
+// construct a STORE of the integer value.
+SDValue DAGTypeLegalizer::PromoteFloatOp_STORE(SDNode *N, unsigned OpNo) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Val = ST->getValue();
+ SDLoc DL(N);
+
+ SDValue Promoted = GetPromotedFloat(Val);
+ EVT VT = ST->getOperand(1).getValueType();
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ SDValue NewVal;
+ NewVal = DAG.getNode(GetPromotionOpcode(Promoted.getValueType(), VT), DL,
+ IVT, Promoted);
+
+ return DAG.getStore(ST->getChain(), DL, NewVal, ST->getBasePtr(),
+ ST->getMemOperand());
+}
+
+//===----------------------------------------------------------------------===//
+// Float Result Promotion
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Promote float result " << ResNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue R = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+ return;
+ }
+
+ switch (N->getOpcode()) {
+ // These opcodes cannot appear if promotion of FP16 is done in the backend
+ // instead of Clang
+ case ISD::FP16_TO_FP:
+ case ISD::FP_TO_FP16:
+ default:
+#ifndef NDEBUG
+ dbgs() << "PromoteFloatResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to promote this operator's result!");
+
+ case ISD::BITCAST: R = PromoteFloatRes_BITCAST(N); break;
+ case ISD::ConstantFP: R = PromoteFloatRes_ConstantFP(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ R = PromoteFloatRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::FCOPYSIGN: R = PromoteFloatRes_FCOPYSIGN(N); break;
+
+ // Unary FP Operations
+ case ISD::FABS:
+ case ISD::FCBRT:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ case ISD::FCANONICALIZE: R = PromoteFloatRes_UnaryOp(N); break;
+
+ // Binary FP Operations
+ case ISD::FADD:
+ case ISD::FDIV:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ case ISD::FMUL:
+ case ISD::FPOW:
+ case ISD::FREM:
+ case ISD::FSUB: R = PromoteFloatRes_BinOp(N); break;
+
+ case ISD::FMA: // FMA is the same as FMAD
+ case ISD::FMAD: R = PromoteFloatRes_FMAD(N); break;
+
+ case ISD::FPOWI:
+ case ISD::FLDEXP: R = PromoteFloatRes_ExpOp(N); break;
+ case ISD::FFREXP: R = PromoteFloatRes_FFREXP(N); break;
+
+ case ISD::FP_ROUND: R = PromoteFloatRes_FP_ROUND(N); break;
+ case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break;
+ case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break;
+
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break;
+ case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break;
+ case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ R = PromoteFloatRes_VECREDUCE(N);
+ break;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ R = PromoteFloatRes_VECREDUCE_SEQ(N);
+ break;
+ }
+
+ if (R.getNode())
+ SetPromotedFloat(SDValue(N, ResNo), R);
+}
+
+// Bitcast from i16 to f16: convert the i16 to an f32 value instead.
+// At this point, it is not possible to determine whether the bitcast value is
+// eventually stored to memory, promoted to f32, or promoted to a
+// floating-point type of higher precision. Some of these cases are handled by
+// the FP_EXTEND and STORE promotion handlers.
+SDValue DAGTypeLegalizer::PromoteFloatRes_BITCAST(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ // Input type isn't guaranteed to be a scalar int so bitcast if not. The
+ // bitcast will be legalized further if necessary.
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(),
+ N->getOperand(0).getValueType().getSizeInBits());
+ SDValue Cast = DAG.getBitcast(IVT, N->getOperand(0));
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, Cast);
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_ConstantFP(SDNode *N) {
+ ConstantFPSDNode *CFPNode = cast<ConstantFPSDNode>(N);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Get the (bit-cast) APInt of the APFloat and build an integer constant
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ SDValue C = DAG.getConstant(CFPNode->getValueAPF().bitcastToAPInt(), DL,
+ IVT);
+
+ // Convert the Constant to the desired FP type
+ // FIXME We might be able to do the conversion during compilation and get rid
+ // of it from the object code
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, C);
+}
+
+// If the Index operand is a constant, try to redirect the extract operation to
+// the correct legalized vector. If not, bit-convert the input vector to the
+// equivalent integer vector, extract the element as a (bit-cast) integer
+// value, and convert it to the promoted type.
+SDValue DAGTypeLegalizer::PromoteFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDLoc DL(N);
+
+ // If the index is constant, try to extract the value from the legalized
+ // vector type.
+ if (isa<ConstantSDNode>(N->getOperand(1))) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ EVT VecVT = Vec->getValueType(0);
+ EVT EltVT = VecVT.getVectorElementType();
+
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ switch (getTypeAction(VecVT)) {
+ default: break;
+ case TargetLowering::TypeScalarizeVector: {
+ SDValue Res = GetScalarizedVector(N->getOperand(0));
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+ }
+ case TargetLowering::TypeWidenVector: {
+ Vec = GetWidenedVector(Vec);
+ SDValue Res = DAG.getNode(N->getOpcode(), DL, EltVT, Vec, Idx);
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+ }
+ case TargetLowering::TypeSplitVector: {
+ SDValue Lo, Hi;
+ GetSplitVector(Vec, Lo, Hi);
+
+ uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+ SDValue Res;
+ if (IdxVal < LoElts)
+ Res = DAG.getNode(N->getOpcode(), DL, EltVT, Lo, Idx);
+ else
+ Res = DAG.getNode(N->getOpcode(), DL, EltVT, Hi,
+ DAG.getConstant(IdxVal - LoElts, DL,
+ Idx.getValueType()));
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+ }
+
+ }
+ }
+
+ // Bit-convert the input vector to the equivalent integer vector
+ SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+ EVT IVT = NewOp.getValueType().getVectorElementType();
+
+ // Extract the element as a (bit-cast) integer value
+ SDValue NewVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IVT,
+ NewOp, N->getOperand(1));
+
+ // Convert the element to the desired FP type
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, NewVal);
+}
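+
+// A minimal standalone sketch (not an SDNode transform) of the constant-index
+// redirection above: once the source vector has been split, a constant index
+// simply selects the low or high half and is rebased for the high half. The
+// helper name is illustrative only.
+static inline float ExtractFromSplitSketch(const float *Lo, const float *Hi,
+ unsigned LoElts, unsigned Idx) {
+ return Idx < LoElts ? Lo[Idx] : Hi[Idx - LoElts];
+}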
+
+// FCOPYSIGN(X, Y) returns the value of X with the sign of Y. If the result
+// needs promotion, so does the argument X. Note that Y, if needed, will be
+// handled during operand promotion.
+SDValue DAGTypeLegalizer::PromoteFloatRes_FCOPYSIGN(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+
+ SDValue Op1 = N->getOperand(1);
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1);
+}
+
+// Unary operation where the result and the operand have PromoteFloat type
+// action. Construct a new SDNode with the promoted float value of the old
+// operand.
+SDValue DAGTypeLegalizer::PromoteFloatRes_UnaryOp(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op = GetPromotedFloat(N->getOperand(0));
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op);
+}
+
+// Binary operations where the result and both operands have PromoteFloat type
+// action. Construct a new SDNode with the promoted float values of the old
+// operands.
+SDValue DAGTypeLegalizer::PromoteFloatRes_BinOp(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+ SDValue Op1 = GetPromotedFloat(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_FMAD(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+ SDValue Op1 = GetPromotedFloat(N->getOperand(1));
+ SDValue Op2 = GetPromotedFloat(N->getOperand(2));
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1, Op2);
+}
+
+// Promote the Float (first) operand and retain the Integer (second) operand
+SDValue DAGTypeLegalizer::PromoteFloatRes_ExpOp(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+ SDValue Op1 = N->getOperand(1);
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1);
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_FFREXP(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op = GetPromotedFloat(N->getOperand(0));
+ SDValue Res =
+ DAG.getNode(N->getOpcode(), SDLoc(N), {NVT, N->getValueType(1)}, Op);
+
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+// Explicit operation to reduce precision. Reduce the value to half precision
+// and promote it back to the legal type.
+SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) {
+ SDLoc DL(N);
+
+ SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = Op->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ // Round promoted float to desired precision
+ SDValue Round = DAG.getNode(GetPromotionOpcode(OpVT, VT), DL, IVT, Op);
+ // Promote it back to the legal output type
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round);
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ EVT VT = N->getValueType(0);
+
+ // Load the value as an integer value with the same number of bits.
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ SDValue newL = DAG.getLoad(
+ L->getAddressingMode(), L->getExtensionType(), IVT, SDLoc(N),
+ L->getChain(), L->getBasePtr(), L->getOffset(), L->getPointerInfo(), IVT,
+ L->getOriginalAlign(), L->getMemOperand()->getFlags(), L->getAAInfo());
+ // Legalize the chain result by replacing uses of the old value chain with the
+ // new one
+ ReplaceValueWith(SDValue(N, 1), newL.getValue(1));
+
+ // Convert the integer value to the desired FP type
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, newL);
+}
+
+// Construct a new SELECT node with the promoted true- and false- values.
+SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) {
+ SDValue TrueVal = GetPromotedFloat(N->getOperand(1));
+ SDValue FalseVal = GetPromotedFloat(N->getOperand(2));
+
+ return DAG.getNode(ISD::SELECT, SDLoc(N), TrueVal->getValueType(0),
+ N->getOperand(0), TrueVal, FalseVal);
+}
+
+// Construct a new SELECT_CC node with the promoted true- and false- values.
+// The operands used for comparison are promoted by PromoteFloatOp_SELECT_CC.
+SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT_CC(SDNode *N) {
+ SDValue TrueVal = GetPromotedFloat(N->getOperand(2));
+ SDValue FalseVal = GetPromotedFloat(N->getOperand(3));
+
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
+ TrueVal.getNode()->getValueType(0), N->getOperand(0),
+ N->getOperand(1), TrueVal, FalseVal, N->getOperand(4));
+}
+
+// Construct an SDNode that converts the SINT or UINT operand to the promoted
+// float type.
+SDValue DAGTypeLegalizer::PromoteFloatRes_XINT_TO_FP(SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue NV = DAG.getNode(N->getOpcode(), DL, NVT, N->getOperand(0));
+ // Round the value to the desired precision (that of the source type).
+ return DAG.getNode(
+ ISD::FP_EXTEND, DL, NVT,
+ DAG.getNode(ISD::FP_ROUND, DL, VT, NV,
+ DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)));
+}
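+
+// A sketch of the round trip above, one step up the precision ladder: the
+// value an i64 -> float conversion would produce, computed entirely in a
+// double "register" by rounding through float and extending back. The helper
+// name is illustrative only.
+static inline double IntToFloatPromotedSketch(long long X) {
+ double Wide = (double)X; // XINT_TO_FP in the promoted type
+ float Narrow = (float)Wide; // FP_ROUND to the original precision
+ return (double)Narrow; // FP_EXTEND back to the promoted type
+}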
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_VECREDUCE(SDNode *N) {
+ // Expand and promote recursively.
+ // TODO: This is non-optimal, but dealing with the concurrently happening
+ // vector-legalization is non-trivial. We could do something similar to
+ // PromoteFloatRes_EXTRACT_VECTOR_ELT here.
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG));
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_VECREDUCE_SEQ(SDNode *N) {
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduceSeq(N, DAG));
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ AtomicSDNode *AM = cast<AtomicSDNode>(N);
+ SDLoc SL(N);
+
+ SDValue CastVal = BitConvertToInteger(AM->getVal());
+ EVT CastVT = CastVal.getValueType();
+
+ SDValue NewAtomic
+ = DAG.getAtomic(ISD::ATOMIC_SWAP, SL, CastVT,
+ DAG.getVTList(CastVT, MVT::Other),
+ { AM->getChain(), AM->getBasePtr(), CastVal },
+ AM->getMemOperand());
+
+ SDValue Result = NewAtomic;
+
+ if (getTypeAction(VT) == TargetLowering::TypePromoteFloat) {
+ EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ Result = DAG.getNode(GetPromotionOpcode(VT, NFPVT), SL, NFPVT,
+ NewAtomic);
+ }
+
+ // Legalize the chain result by replacing uses of the old value chain with the
+ // new one
+ ReplaceValueWith(SDValue(N, 1), NewAtomic.getValue(1));
+
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Half Result Soft Promotion
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Soft promote half result " << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n");
+ SDValue R = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+ return;
+ }
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SoftPromoteHalfResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to soft promote this operator's "
+ "result!");
+
+ case ISD::BITCAST: R = SoftPromoteHalfRes_BITCAST(N); break;
+ case ISD::ConstantFP: R = SoftPromoteHalfRes_ConstantFP(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ R = SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::FCOPYSIGN: R = SoftPromoteHalfRes_FCOPYSIGN(N); break;
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_ROUND: R = SoftPromoteHalfRes_FP_ROUND(N); break;
+
+ // Unary FP Operations
+ case ISD::FABS:
+ case ISD::FCBRT:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FREEZE:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break;
+
+ // Binary FP Operations
+ case ISD::FADD:
+ case ISD::FDIV:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ case ISD::FMUL:
+ case ISD::FPOW:
+ case ISD::FREM:
+ case ISD::FSUB: R = SoftPromoteHalfRes_BinOp(N); break;
+
+ case ISD::FMA: // FMA is the same as FMAD
+ case ISD::FMAD: R = SoftPromoteHalfRes_FMAD(N); break;
+
+ case ISD::FPOWI:
+ case ISD::FLDEXP: R = SoftPromoteHalfRes_ExpOp(N); break;
+
+ case ISD::LOAD: R = SoftPromoteHalfRes_LOAD(N); break;
+ case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = SoftPromoteHalfRes_SELECT_CC(N); break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: R = SoftPromoteHalfRes_XINT_TO_FP(N); break;
+ case ISD::UNDEF: R = SoftPromoteHalfRes_UNDEF(N); break;
+ case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ R = SoftPromoteHalfRes_VECREDUCE(N);
+ break;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ R = SoftPromoteHalfRes_VECREDUCE_SEQ(N);
+ break;
+ }
+
+ if (R.getNode())
+ SetSoftPromotedHalf(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BITCAST(SDNode *N) {
+ return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ConstantFP(SDNode *N) {
+ ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+
+ // Get the (bit-cast) APInt of the APFloat and build an integer constant
+ return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
+ MVT::i16);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ NewOp.getValueType().getVectorElementType(), NewOp,
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FCOPYSIGN(SDNode *N) {
+ SDValue LHS = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue RHS = BitConvertToInteger(N->getOperand(1));
+ SDLoc dl(N);
+
+ EVT LVT = LHS.getValueType();
+ EVT RVT = RHS.getValueType();
+
+ unsigned LSize = LVT.getSizeInBits();
+ unsigned RSize = RVT.getSizeInBits();
+
+ // First get the sign bit of second operand.
+ SDValue SignBit = DAG.getNode(
+ ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT),
+ DAG.getConstant(RSize - 1, dl,
+ TLI.getShiftAmountTy(RVT, DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+ // Shift it right and truncate, or extend it and shift left, if the two
+ // operands have different sizes.
+ int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+ if (SizeDiff > 0) {
+ SignBit =
+ DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+ DAG.getConstant(SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+ } else if (SizeDiff < 0) {
+ SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+ SignBit =
+ DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+ DAG.getConstant(-SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ }
+
+ // Clear the sign bit of the first operand.
+ SDValue Mask = DAG.getNode(
+ ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT),
+ DAG.getConstant(LSize - 1, dl,
+ TLI.getShiftAmountTy(LVT, DAG.getDataLayout())));
+ Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, dl, LVT));
+ LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
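+
+// A minimal sketch of the same mask-and-merge on raw f16 bit patterns for the
+// common case where both operands are 16 bits wide (no size adjustment
+// needed). The helper name is illustrative only.
+static inline uint16_t CopySignHalfBitsSketch(uint16_t Mag, uint16_t Sgn) {
+ const uint16_t SignBit = 1u << 15; // 1 << (LSize - 1)
+ return (uint16_t)((Mag & (SignBit - 1)) | (Sgn & SignBit));
+}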
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ auto PromotionOpcode = GetPromotionOpcode(OVT, NVT);
+ Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
+ Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
+ Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ExpOp(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(GetPromotionOpcode(OVT, NVT), dl, NVT, Op0);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ EVT SVT = N->getOperand(0).getValueType();
+
+ if (N->isStrictFPOpcode()) {
+ assert(RVT == MVT::f16);
+ SDValue Res =
+ DAG.getNode(ISD::STRICT_FP_TO_FP16, SDLoc(N), {MVT::i16, MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+ }
+
+ return DAG.getNode(GetPromotionOpcode(SVT, RVT), SDLoc(N), MVT::i16,
+ N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) {
+ LoadSDNode *L = cast<LoadSDNode>(N);
+
+ // Load the value as an integer value with the same number of bits.
+ assert(L->getExtensionType() == ISD::NON_EXTLOAD && "Unexpected extension!");
+ SDValue NewL =
+ DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), MVT::i16,
+ SDLoc(N), L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), MVT::i16, L->getOriginalAlign(),
+ L->getMemOperand()->getFlags(), L->getAAInfo());
+ // Legalize the chain result by replacing uses of the old value chain with the
+ // new one
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ return NewL;
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ return DAG.getSelect(SDLoc(N), Op1.getValueType(), N->getOperand(0), Op1,
+ Op2);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT_CC(SDNode *N) {
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ SDValue Op3 = GetSoftPromotedHalf(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), Op2.getValueType(),
+ N->getOperand(0), N->getOperand(1), Op2, Op3,
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_XINT_TO_FP(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDLoc dl(N);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+
+ // Round the value to the softened type.
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(MVT::i16);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDValue Op = GetSoftPromotedHalf(N->getOperand(0));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ Op = DAG.getNode(GetPromotionOpcode(OVT, NVT), dl, NVT, Op);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ auto PromotionOpcode = GetPromotionOpcode(OVT, NVT);
+ Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
+ Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
+}
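+
+// A sketch of the promote/operate/demote round trip this lowering performs at
+// run time. HalfBitsToFloatSketch and FloatToHalfBitsSketch are hypothetical
+// stand-ins for the FP16_TO_FP / FP_TO_FP16 conversions selected by
+// GetPromotionOpcode; only their declarations are given here.
+float HalfBitsToFloatSketch(uint16_t Bits);
+uint16_t FloatToHalfBitsSketch(float Val);
+static inline uint16_t SoftFAddHalfSketch(uint16_t A, uint16_t B) {
+ float PromotedA = HalfBitsToFloatSketch(A); // promote operand 0 to f32
+ float PromotedB = HalfBitsToFloatSketch(B); // promote operand 1 to f32
+ float Result = PromotedA + PromotedB; // the FADD in the promoted type
+ return FloatToHalfBitsSketch(Result); // back to f16 bits held in an i16
+}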
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_VECREDUCE(SDNode *N) {
+ // Expand and soften recursively.
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduce(N, DAG));
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_VECREDUCE_SEQ(SDNode *N) {
+ // Expand and soften.
+ ReplaceValueWith(SDValue(N, 0), TLI.expandVecReduceSeq(N, DAG));
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Half Operand Soft Promotion
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Soft promote half operand " << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n");
+ return false;
+ }
+
+ // Nodes that use a promotion-requiring floating point operand, but don't
+ // produce a soft promotion-requiring floating point result, need to be
+ // legalized to use the soft promoted float operand. Nodes that produce at
+ // least one soft promotion-requiring floating point result have their
+ // operands legalized as a part of SoftPromoteHalfResult.
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SoftPromoteHalfOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to soft promote this operator's "
+ "operand!");
+
+ case ISD::BITCAST: Res = SoftPromoteHalfOp_BITCAST(N); break;
+ case ISD::FCOPYSIGN: Res = SoftPromoteHalfOp_FCOPYSIGN(N, OpNo); break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = SoftPromoteHalfOp_FP_TO_XINT(N); break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Res = SoftPromoteHalfOp_FP_TO_XINT_SAT(N); break;
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND: Res = SoftPromoteHalfOp_FP_EXTEND(N); break;
+ case ISD::SELECT_CC: Res = SoftPromoteHalfOp_SELECT_CC(N, OpNo); break;
+ case ISD::SETCC: Res = SoftPromoteHalfOp_SETCC(N); break;
+ case ISD::STORE: Res = SoftPromoteHalfOp_STORE(N, OpNo); break;
+ case ISD::STACKMAP:
+ Res = SoftPromoteHalfOp_STACKMAP(N, OpNo);
+ break;
+ case ISD::PATCHPOINT:
+ Res = SoftPromoteHalfOp_PATCHPOINT(N, OpNo);
+ break;
+ }
+
+ if (!Res.getNode())
+ return false;
+
+ assert(Res.getNode() != N && "Expected a new node!");
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_BITCAST(SDNode *N) {
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FCOPYSIGN(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 1 && "Only operand 1 needs promotion here");
+ SDValue Op1 = N->getOperand(1);
+ EVT RVT = Op1.getValueType();
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op1.getValueType());
+
+ Op1 = GetSoftPromotedHalf(Op1);
+ Op1 = DAG.getNode(GetPromotionOpcode(RVT, NVT), dl, NVT, Op1);
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), N->getOperand(0),
+ Op1);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_EXTEND(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ EVT SVT = Op.getValueType();
+ Op = GetSoftPromotedHalf(N->getOperand(IsStrict ? 1 : 0));
+
+ if (IsStrict) {
+ assert(SVT == MVT::f16);
+ SDValue Res =
+ DAG.getNode(ISD::STRICT_FP16_TO_FP, SDLoc(N),
+ {N->getValueType(0), MVT::Other}, {N->getOperand(0), Op});
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+ }
+
+ return DAG.getNode(GetPromotionOpcode(SVT, RVT), SDLoc(N), RVT, Op);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ EVT SVT = Op.getValueType();
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+
+ Op = GetSoftPromotedHalf(Op);
+
+ SDValue Res = DAG.getNode(GetPromotionOpcode(SVT, RVT), dl, NVT, Op);
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT_SAT(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ EVT SVT = Op.getValueType();
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+
+ Op = GetSoftPromotedHalf(Op);
+
+ SDValue Res = DAG.getNode(GetPromotionOpcode(SVT, RVT), dl, NVT, Op);
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res,
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SELECT_CC(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 0 && "Can only soften the comparison values");
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ EVT SVT = Op0.getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), SVT);
+
+ Op0 = GetSoftPromotedHalf(Op0);
+ Op1 = GetSoftPromotedHalf(Op1);
+
+ // Promote to the larger FP type.
+ auto PromotionOpcode = GetPromotionOpcode(SVT, NVT);
+ Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
+ Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
+
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), Op0, Op1,
+ N->getOperand(2), N->getOperand(3), N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SETCC(SDNode *N) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDLoc dl(N);
+
+ EVT SVT = Op0.getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
+
+ Op0 = GetSoftPromotedHalf(Op0);
+ Op1 = GetSoftPromotedHalf(Op1);
+
+ // Promote to the larger FP type.
+ auto PromotionOpcode = GetPromotionOpcode(SVT, NVT);
+ Op0 = DAG.getNode(PromotionOpcode, dl, NVT, Op0);
+ Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
+
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0), Op0, Op1, CCCode);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "Can only soften the stored value!");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Val = ST->getValue();
+ SDLoc dl(N);
+
+ assert(!ST->isTruncatingStore() && "Unexpected truncating store.");
+ SDValue Promoted = GetSoftPromotedHalf(Val);
+ return DAG.getStore(ST->getChain(), dl, Promoted, ST->getBasePtr(),
+ ST->getMemOperand());
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo) {
+ assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
+ SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end());
+ SDValue Op = N->getOperand(OpNo);
+ NewOps[OpNo] = GetSoftPromotedHalf(Op);
+ SDValue NewNode =
+ DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), NewOps);
+
+ for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++)
+ ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum));
+
+ return SDValue(); // Signal that we replaced the node ourselves.
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_PATCHPOINT(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo >= 7);
+ SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end());
+ SDValue Op = N->getOperand(OpNo);
+ NewOps[OpNo] = GetSoftPromotedHalf(Op);
+ SDValue NewNode =
+ DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), NewOps);
+
+ for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++)
+ ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum));
+
+ return SDValue(); // Signal that we replaced the node ourselves.
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
new file mode 100644
index 000000000000..df5878fcdf2e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -0,0 +1,5977 @@
+//===----- LegalizeIntegerTypes.cpp - Legalization of integer types -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements integer type expansion and promotion for LegalizeTypes.
+// Promotion is the act of changing a computation in an illegal type into a
+// computation in a larger type. For example, implementing i8 arithmetic in an
+// i32 register (often needed on PowerPC).
+// Expansion is the act of changing a computation in an illegal type into a
+// computation in two identical registers of a smaller type. For example,
+// implementing i64 arithmetic in two i32 registers (often needed on 32-bit
+// targets).
+//
+//===----------------------------------------------------------------------===//
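+
+// A minimal sketch of the two strategies on concrete values: an i8 add done in
+// an i32 register (promotion), and an i64 add done as two i32 adds with an
+// explicit carry (expansion). The helper names are illustrative only.
+#include <cstdint>
+static inline uint8_t AddI8PromotedSketch(uint8_t A, uint8_t B) {
+ uint32_t Wide = (uint32_t)A + (uint32_t)B; // compute in the wider type
+ return (uint8_t)Wide; // only the low 8 bits are meaningful
+}
+static inline uint64_t AddI64ExpandedSketch(uint32_t ALo, uint32_t AHi,
+ uint32_t BLo, uint32_t BHi) {
+ uint32_t Lo = ALo + BLo;
+ uint32_t Hi = AHi + BHi + (Lo < ALo); // carry out of the low half
+ return ((uint64_t)Hi << 32) | Lo;
+}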
+
+#include "LegalizeTypes.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+//===----------------------------------------------------------------------===//
+// Integer Result Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerResult - This method is called when a result of a node is
+/// found to be in need of promotion to a larger type. At this point, the node
+/// may also have invalid operands or may have other results that need
+/// expansion; we just know that (at least) one result needs promotion.
+void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+ return;
+ }
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "PromoteIntegerResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to promote this operator!");
+ case ISD::MERGE_VALUES:Res = PromoteIntRes_MERGE_VALUES(N, ResNo); break;
+ case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break;
+ case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break;
+ case ISD::BITCAST: Res = PromoteIntRes_BITCAST(N); break;
+ case ISD::BITREVERSE: Res = PromoteIntRes_BITREVERSE(N); break;
+ case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break;
+ case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break;
+ case ISD::Constant: Res = PromoteIntRes_Constant(N); break;
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break;
+ case ISD::PARITY:
+ case ISD::CTPOP: Res = PromoteIntRes_CTPOP_PARITY(N); break;
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
+ case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
+ break;
+ case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
+ break;
+ case ISD::SELECT:
+ case ISD::VSELECT:
+ case ISD::VP_SELECT:
+ case ISD::VP_MERGE:
+ Res = PromoteIntRes_Select(N);
+ break;
+ case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break;
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
+ case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break;
+ case ISD::SMIN:
+ case ISD::SMAX: Res = PromoteIntRes_SExtIntBinOp(N); break;
+ case ISD::UMIN:
+ case ISD::UMAX: Res = PromoteIntRes_UMINUMAX(N); break;
+
+ case ISD::SHL:
+ case ISD::VP_SHL: Res = PromoteIntRes_SHL(N); break;
+ case ISD::SIGN_EXTEND_INREG:
+ Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
+ case ISD::SRA:
+ case ISD::VP_ASHR: Res = PromoteIntRes_SRA(N); break;
+ case ISD::SRL:
+ case ISD::VP_LSHR: Res = PromoteIntRes_SRL(N); break;
+ case ISD::VP_TRUNCATE:
+ case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break;
+ case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break;
+ case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break;
+ case ISD::VSCALE: Res = PromoteIntRes_VSCALE(N); break;
+
+ case ISD::EXTRACT_SUBVECTOR:
+ Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
+ case ISD::INSERT_SUBVECTOR:
+ Res = PromoteIntRes_INSERT_SUBVECTOR(N); break;
+ case ISD::VECTOR_REVERSE:
+ Res = PromoteIntRes_VECTOR_REVERSE(N); break;
+ case ISD::VECTOR_SHUFFLE:
+ Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
+ case ISD::VECTOR_SPLICE:
+ Res = PromoteIntRes_VECTOR_SPLICE(N); break;
+ case ISD::VECTOR_INTERLEAVE:
+ case ISD::VECTOR_DEINTERLEAVE:
+ Res = PromoteIntRes_VECTOR_INTERLEAVE_DEINTERLEAVE(N);
+ return;
+ case ISD::INSERT_VECTOR_ELT:
+ Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::BUILD_VECTOR:
+ Res = PromoteIntRes_BUILD_VECTOR(N);
+ break;
+ case ISD::SPLAT_VECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ Res = PromoteIntRes_ScalarOp(N);
+ break;
+ case ISD::STEP_VECTOR: Res = PromoteIntRes_STEP_VECTOR(N); break;
+ case ISD::CONCAT_VECTORS:
+ Res = PromoteIntRes_CONCAT_VECTORS(N); break;
+
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break;
+
+ case ISD::SIGN_EXTEND:
+ case ISD::VP_SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::VP_ZERO_EXTEND:
+ case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break;
+
+ case ISD::VP_FP_TO_SINT:
+ case ISD::VP_FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = PromoteIntRes_FP_TO_XINT(N); break;
+
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Res = PromoteIntRes_FP_TO_XINT_SAT(N); break;
+
+ case ISD::FP_TO_BF16:
+ case ISD::FP_TO_FP16:
+ Res = PromoteIntRes_FP_TO_FP16_BF16(N);
+ break;
+
+ case ISD::GET_ROUNDING: Res = PromoteIntRes_GET_ROUNDING(N); break;
+
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::VP_AND:
+ case ISD::VP_OR:
+ case ISD::VP_XOR:
+ case ISD::VP_ADD:
+ case ISD::VP_SUB:
+ case ISD::VP_MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break;
+
+ case ISD::VP_SMIN:
+ case ISD::VP_SMAX:
+ case ISD::SDIV:
+ case ISD::SREM:
+ case ISD::VP_SDIV:
+ case ISD::VP_SREM: Res = PromoteIntRes_SExtIntBinOp(N); break;
+
+ case ISD::VP_UMIN:
+ case ISD::VP_UMAX:
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::VP_UDIV:
+ case ISD::VP_UREM: Res = PromoteIntRes_ZExtIntBinOp(N); break;
+
+ case ISD::SADDO:
+ case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break;
+ case ISD::UADDO:
+ case ISD::USUBO: Res = PromoteIntRes_UADDSUBO(N, ResNo); break;
+ case ISD::SMULO:
+ case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break;
+
+ case ISD::ADDE:
+ case ISD::SUBE:
+ case ISD::UADDO_CARRY:
+ case ISD::USUBO_CARRY: Res = PromoteIntRes_UADDSUBO_CARRY(N, ResNo); break;
+
+ case ISD::SADDO_CARRY:
+ case ISD::SSUBO_CARRY: Res = PromoteIntRes_SADDSUBO_CARRY(N, ResNo); break;
+
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT: Res = PromoteIntRes_ADDSUBSHLSAT(N); break;
+
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: Res = PromoteIntRes_MULFIX(N); break;
+
+ case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: Res = PromoteIntRes_DIVFIX(N); break;
+
+ case ISD::ABS: Res = PromoteIntRes_ABS(N); break;
+
+ case ISD::ATOMIC_LOAD:
+ Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
+
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_SWAP:
+ Res = PromoteIntRes_Atomic1(cast<AtomicSDNode>(N)); break;
+
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ Res = PromoteIntRes_AtomicCmpSwap(cast<AtomicSDNode>(N), ResNo);
+ break;
+
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ Res = PromoteIntRes_VECREDUCE(N);
+ break;
+
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ Res = PromoteIntRes_VP_REDUCE(N);
+ break;
+
+ case ISD::FREEZE:
+ Res = PromoteIntRes_FREEZE(N);
+ break;
+
+ case ISD::ROTL:
+ case ISD::ROTR:
+ Res = PromoteIntRes_Rotate(N);
+ break;
+
+ case ISD::FSHL:
+ case ISD::FSHR:
+ Res = PromoteIntRes_FunnelShift(N);
+ break;
+
+ case ISD::IS_FPCLASS:
+ Res = PromoteIntRes_IS_FPCLASS(N);
+ break;
+ case ISD::FFREXP:
+ Res = PromoteIntRes_FFREXP(N);
+ break;
+ }
+
+ // If the result is null then the sub-method took care of registering it.
+ if (Res.getNode())
+ SetPromotedInteger(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetPromotedInteger(Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
+ // Sign-extend the new bits, and continue the assertion.
+ SDValue Op = SExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::AssertSext, SDLoc(N),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) {
+ // Zero the new bits, and continue the assertion.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::AssertZext, SDLoc(N),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
+ EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N),
+ N->getMemoryVT(), ResVT,
+ N->getChain(), N->getBasePtr(),
+ N->getMemOperand());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
+ SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N),
+ N->getMemoryVT(),
+ N->getChain(), N->getBasePtr(),
+ Op2, N->getMemOperand());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N,
+ unsigned ResNo) {
+ if (ResNo == 1) {
+ assert(N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ EVT SVT = getSetCCResultType(N->getOperand(2).getValueType());
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
+
+ // Only use the result of getSetCCResultType if it is legal,
+ // otherwise just use the promoted result type (NVT).
+ if (!TLI.isTypeLegal(SVT))
+ SVT = NVT;
+
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), SVT, MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, SDLoc(N), N->getMemoryVT(), VTs,
+ N->getChain(), N->getBasePtr(), N->getOperand(2), N->getOperand(3),
+ N->getMemOperand());
+ ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
+ ReplaceValueWith(SDValue(N, 2), Res.getValue(2));
+ return Res.getValue(1);
+ }
+
+ // Op2 is used for the comparison and thus must be extended according to the
+ // target's atomic operations. Op3 is merely stored and so can be left alone.
+ SDValue Op2 = N->getOperand(2);
+ SDValue Op3 = GetPromotedInteger(N->getOperand(3));
+ switch (TLI.getExtendForAtomicCmpSwapArg()) {
+ case ISD::SIGN_EXTEND:
+ Op2 = SExtPromotedInteger(Op2);
+ break;
+ case ISD::ZERO_EXTEND:
+ Op2 = ZExtPromotedInteger(Op2);
+ break;
+ case ISD::ANY_EXTEND:
+ Op2 = GetPromotedInteger(Op2);
+ break;
+ default:
+ llvm_unreachable("Invalid atomic op extension");
+ }
+
+ SDVTList VTs =
+ DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(),
+ N->getBasePtr(), Op2, Op3, N->getMemOperand());
+ // Update the use to N with the newly created Res.
+ for (unsigned i = 1, NumResults = N->getNumValues(); i < NumResults; ++i)
+ ReplaceValueWith(SDValue(N, i), Res.getValue(i));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+ EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ SDLoc dl(N);
+
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypeLegal:
+ break;
+ case TargetLowering::TypePromoteInteger:
+ if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector())
+ // The input promotes to the same size. Convert the promoted value.
+ return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp));
+ break;
+ case TargetLowering::TypeSoftenFloat:
+ // Promote the integer operand by hand.
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
+ case TargetLowering::TypeSoftPromoteHalf:
+ // Promote the integer operand by hand.
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftPromotedHalf(InOp));
+ case TargetLowering::TypePromoteFloat: {
+ // Convert the promoted float by hand.
+ if (!NOutVT.isVector())
+ return DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, GetPromotedFloat(InOp));
+ break;
+ }
+ case TargetLowering::TypeExpandInteger:
+ case TargetLowering::TypeExpandFloat:
+ break;
+ case TargetLowering::TypeScalarizeVector:
+ // Convert the element to an integer and promote it by hand.
+ if (!NOutVT.isVector())
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+ BitConvertToInteger(GetScalarizedVector(InOp)));
+ break;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error("Scalarization of scalable vectors is not supported.");
+ case TargetLowering::TypeSplitVector: {
+ if (!NOutVT.isVector()) {
+ // For example, i32 = BITCAST v2i16 on alpha. Convert the split
+ // pieces of the input into integers and reassemble in the final type.
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ Lo = BitConvertToInteger(Lo);
+ Hi = BitConvertToInteger(Hi);
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ InOp = DAG.getNode(ISD::ANY_EXTEND, dl,
+ EVT::getIntegerVT(*DAG.getContext(),
+ NOutVT.getSizeInBits()),
+ JoinIntegers(Lo, Hi));
+ return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp);
+ }
+ break;
+ }
+ case TargetLowering::TypeWidenVector:
+ // The input is widened to the same size. Convert to the widened value.
+ // Make sure that the outgoing value is not a vector, because this would
+ // make us bitcast between two vectors which are legalized in different ways.
+ if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector()) {
+ SDValue Res =
+ DAG.getNode(ISD::BITCAST, dl, NOutVT, GetWidenedVector(InOp));
+
+ // For big endian targets we need to shift the casted value or the
+ // interesting bits will end up at the wrong place.
+ if (DAG.getDataLayout().isBigEndian()) {
+ unsigned ShiftAmt = NInVT.getSizeInBits() - InVT.getSizeInBits();
+ assert(ShiftAmt < NOutVT.getSizeInBits() && "Too large shift amount!");
+ Res = DAG.getNode(ISD::SRL, dl, NOutVT, Res,
+ DAG.getShiftAmountConstant(ShiftAmt, NOutVT, dl));
+ }
+ return Res;
+ }
+ // If the output type is also a vector and widening it to the same size
+ // as the widened input type would be a legal type, we can widen the bitcast
+ // and handle the promotion after.
+ if (NOutVT.isVector()) {
+ TypeSize WidenInSize = NInVT.getSizeInBits();
+ TypeSize OutSize = OutVT.getSizeInBits();
+ if (WidenInSize.hasKnownScalarFactor(OutSize)) {
+ unsigned Scale = WidenInSize.getKnownScalarFactor(OutSize);
+ EVT WideOutVT =
+ EVT::getVectorVT(*DAG.getContext(), OutVT.getVectorElementType(),
+ OutVT.getVectorElementCount() * Scale);
+ if (isTypeLegal(WideOutVT)) {
+ InOp = DAG.getBitcast(WideOutVT, GetWidenedVector(InOp));
+ InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, InOp,
+ DAG.getVectorIdxConstant(0, dl));
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, InOp);
+ }
+ }
+ }
+ }
+
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+ CreateStackStoreLoad(InOp, OutVT));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FREEZE(SDNode *N) {
+ SDValue V = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::FREEZE, SDLoc(N),
+ V.getValueType(), V);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = Op.getValueType();
+ SDLoc dl(N);
+
+ // If the larger BSWAP isn't supported by the target, try to expand now.
+ // If we expand later we'll end up with more operations since we lost the
+ // original type. We only do this for scalars since we have a shuffle
+ // based lowering for vectors in LegalizeVectorOps.
+ if (!OVT.isVector() &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::BSWAP, NVT)) {
+ if (SDValue Res = TLI.expandBSWAP(N, DAG))
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Res);
+ }
+
+ unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
+ DAG.getShiftAmountConstant(DiffBits, NVT, dl));
+}
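+
+// A sketch of the shift-after-swap trick above, using the GCC/Clang
+// __builtin_bswap32 intrinsic: swap in the promoted i32 type, then shift the
+// interesting bytes back down by DiffBits (here 32 - 16 = 16). The helper
+// name is illustrative only.
+static inline uint16_t BSwap16ViaI32Sketch(uint16_t X) {
+ uint32_t Swapped = __builtin_bswap32((uint32_t)X);
+ return (uint16_t)(Swapped >> 16);
+}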
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = Op.getValueType();
+ SDLoc dl(N);
+
+ // If the larger BITREVERSE isn't supported by the target, try to expand now.
+ // If we expand later we'll end up with more operations since we lost the
+ // original type. We only do this for scalars since we have a shuffle
+ // based lowering for vectors in LegalizeVectorOps.
+ if (!OVT.isVector() && OVT.isSimple() &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::BITREVERSE, NVT)) {
+ if (SDValue Res = TLI.expandBITREVERSE(N, DAG))
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Res);
+ }
+
+ unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ return DAG.getNode(ISD::SRL, dl, NVT,
+ DAG.getNode(ISD::BITREVERSE, dl, NVT, Op),
+ DAG.getShiftAmountConstant(DiffBits, NVT, dl));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
+ // The pair element type may be legal, or may not promote to the same type as
+ // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases.
+ return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)), JoinIntegers(N->getOperand(0),
+ N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ // FIXME there is no actual debug info here
+ SDLoc dl(N);
+ // Zero-extend things like i1 and sign-extend everything else. In theory it
+ // shouldn't matter which one we pick, but this tends to give better code.
+ unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue Result = DAG.getNode(Opc, dl,
+ TLI.getTypeToTransformTo(*DAG.getContext(), VT),
+ SDValue(N, 0));
+ assert(isa<ConstantSDNode>(Result) && "Didn't constant fold ext?");
+ return Result;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDLoc dl(N);
+
+ // If the larger CTLZ isn't supported by the target, try to expand now.
+ // If we expand later we'll end up with more operations since we lost the
+ // original type.
+ if (!OVT.isVector() && TLI.isTypeLegal(NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTLZ, NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTLZ_ZERO_UNDEF, NVT)) {
+ if (SDValue Result = TLI.expandCTLZ(N, DAG)) {
+ Result = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Result);
+ return Result;
+ }
+ }
+
+ // Zero extend to the promoted type and do the count there.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ Op = DAG.getNode(N->getOpcode(), dl, NVT, Op);
+ // Subtract off the extra leading bits in the bigger type.
+ return DAG.getNode(
+ ISD::SUB, dl, NVT, Op,
+ DAG.getConstant(NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl,
+ NVT));
+}
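+
+// A sketch of the subtract-the-extra-bits step above, using the GCC/Clang
+// __builtin_clz intrinsic: count in the promoted i32 type, then subtract the
+// 16 extra leading zeros introduced by zero extension. The helper name is
+// illustrative only.
+static inline unsigned CountLeadingZeros16ViaI32Sketch(uint16_t X) {
+ if (X == 0)
+ return 16; // CTLZ of zero in the original width
+ return (unsigned)__builtin_clz((uint32_t)X) - 16;
+}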
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP_PARITY(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+
+ // If the larger CTPOP isn't supported by the target, try to expand now.
+ // If we expand later we'll end up with more operations since we lost the
+ // original type.
+ // TODO: Expand ISD::PARITY. Need to move ExpandPARITY from LegalizeDAG to
+ // TargetLowering.
+ if (N->getOpcode() == ISD::CTPOP && !OVT.isVector() && TLI.isTypeLegal(NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTPOP, NVT)) {
+ if (SDValue Result = TLI.expandCTPOP(N, DAG)) {
+ Result = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Result);
+ return Result;
+ }
+ }
+
+ // Zero extend to the promoted type and do the count or parity there.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = Op.getValueType();
+ SDLoc dl(N);
+
+ // If the larger CTTZ isn't supported by the target, try to expand now.
+ // If we expand later we'll end up with more operations since we lost the
+ // original type. Don't expand if we can use CTPOP or CTLZ expansion on the
+ // larger type.
+ if (!OVT.isVector() && TLI.isTypeLegal(NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTTZ, NVT) &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::CTTZ_ZERO_UNDEF, NVT) &&
+ !TLI.isOperationLegal(ISD::CTPOP, NVT) &&
+ !TLI.isOperationLegal(ISD::CTLZ, NVT)) {
+ if (SDValue Result = TLI.expandCTTZ(N, DAG)) {
+ Result = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Result);
+ return Result;
+ }
+ }
+
+ if (N->getOpcode() == ISD::CTTZ) {
+ // The count is the same in the promoted type except if the original
+ // value was zero. This can be handled by setting the bit just off
+ // the top of the original type.
+ auto TopBit = APInt::getOneBitSet(NVT.getScalarSizeInBits(),
+ OVT.getScalarSizeInBits());
+ Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, dl, NVT));
+ }
+ return DAG.getNode(N->getOpcode(), dl, NVT, Op);
+}
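+
+// A sketch of the TopBit trick above, using the GCC/Clang __builtin_ctz
+// intrinsic: OR in the bit just above the original width so that a zero input
+// yields 16, then count in the promoted i32 type. The helper name is
+// illustrative only.
+static inline unsigned CountTrailingZeros16ViaI32Sketch(uint16_t X) {
+ return (unsigned)__builtin_ctz((uint32_t)X | 0x10000u);
+}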
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDLoc dl(N);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // If the input also needs to be promoted, do that first so we can get a
+ // good idea of the output type.
+ if (TLI.getTypeAction(*DAG.getContext(), Op0.getValueType())
+ == TargetLowering::TypePromoteInteger) {
+ SDValue In = GetPromotedInteger(Op0);
+
+ // If the new type is larger than NVT, use it. We probably won't need to
+ // promote it again.
+ EVT SVT = In.getValueType().getScalarType();
+ if (SVT.bitsGE(NVT)) {
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, In, Op1);
+ return DAG.getAnyExtOrTrunc(Ext, dl, NVT);
+ }
+ }
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NVT, Op0, Op1);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned NewOpc = N->getOpcode();
+ SDLoc dl(N);
+
+ // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is
+ // not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT
+ // and SINT conversions are Custom, there is no way to tell which is
+ // preferable. We choose SINT because that's the right thing on PPC.)
+ if (N->getOpcode() == ISD::FP_TO_UINT &&
+ !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) &&
+ TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
+ NewOpc = ISD::FP_TO_SINT;
+
+ if (N->getOpcode() == ISD::STRICT_FP_TO_UINT &&
+ !TLI.isOperationLegal(ISD::STRICT_FP_TO_UINT, NVT) &&
+ TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT))
+ NewOpc = ISD::STRICT_FP_TO_SINT;
+
+ if (N->getOpcode() == ISD::VP_FP_TO_UINT &&
+ !TLI.isOperationLegal(ISD::VP_FP_TO_UINT, NVT) &&
+ TLI.isOperationLegalOrCustom(ISD::VP_FP_TO_SINT, NVT))
+ NewOpc = ISD::VP_FP_TO_SINT;
+
+ SDValue Res;
+ if (N->isStrictFPOpcode()) {
+ Res = DAG.getNode(NewOpc, dl, {NVT, MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ } else if (NewOpc == ISD::VP_FP_TO_SINT || NewOpc == ISD::VP_FP_TO_UINT) {
+ Res = DAG.getNode(NewOpc, dl, NVT, {N->getOperand(0), N->getOperand(1),
+ N->getOperand(2)});
+ } else {
+ Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0));
+ }
+
+ // Assert that the converted value fits in the original type. If it doesn't
+ // (eg: because the value being converted is too big), then the result of the
+ // original operation was undefined anyway, so the assert is still correct.
+ //
+ // NOTE: fp-to-uint to fp-to-sint promotion guarantees zero extend. For example:
+ // before legalization: fp-to-uint16, 65534. -> 0xfffe
+ // after legalization: fp-to-sint32, 65534. -> 0x0000fffe
+ return DAG.getNode((N->getOpcode() == ISD::FP_TO_UINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_UINT ||
+ N->getOpcode() == ISD::VP_FP_TO_UINT)
+ ? ISD::AssertZext
+ : ISD::AssertSext,
+ dl, NVT, Res,
+ DAG.getValueType(N->getValueType(0).getScalarType()));
+}
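The AssertZext on the promoted result is justified by the NOTE above: whenever the original FP_TO_UINT was defined, doing the conversion through the wider signed opcode produces the zero-extended value. A standalone sketch with the same concrete numbers, using plain C++ casts in place of the DAG nodes:

#include <cassert>
#include <cstdint>

int main() {
  double V = 65534.0;
  // fp_to_uint i16 of 65534.0 is 0xfffe; doing the conversion as a wider
  // fp_to_sint i32 yields 0x0000fffe, i.e. the same value zero extended.
  int32_t Promoted = static_cast<int32_t>(V);
  assert(Promoted == 0x0000FFFE);
  // Truncating back recovers the original i16 result.
  assert(static_cast<uint16_t>(Promoted) == 0xFFFEu);
  return 0;
}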
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT_SAT(SDNode *N) {
+ // Promote the result type, while keeping the original width in Op1.
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0),
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16_BF16(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_GET_ROUNDING(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ SDValue Res =
+ DAG.getNode(N->getOpcode(), dl, {NVT, MVT::Other}, N->getOperand(0));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ if (getTypeAction(N->getOperand(0).getValueType())
+ == TargetLowering::TypePromoteInteger) {
+ SDValue Res = GetPromotedInteger(N->getOperand(0));
+ assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!");
+
+ // If the result and operand types are the same after promotion, simplify
+ // to an in-register extension, unless this is a VP_*_EXTEND.
+ if (NVT == Res.getValueType() && N->getNumOperands() == 1) {
+ // The high bits are not guaranteed to be anything. Insert an extend.
+ if (N->getOpcode() == ISD::SIGN_EXTEND)
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+ DAG.getValueType(N->getOperand(0).getValueType()));
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ return DAG.getZeroExtendInReg(Res, dl, N->getOperand(0).getValueType());
+ assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!");
+ return Res;
+ }
+ }
+
+ // Otherwise, just extend the original operand all the way to the larger type.
+ if (N->getNumOperands() != 1) {
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0),
+ N->getOperand(1), N->getOperand(2));
+ }
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ ISD::LoadExtType ExtType =
+ ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
+ SDLoc dl(N);
+ SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
+ N->getMemoryVT(), N->getMemOperand());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue ExtPassThru = GetPromotedInteger(N->getPassThru());
+
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ ExtType = ISD::EXTLOAD;
+
+ SDLoc dl(N);
+ SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
+ N->getOffset(), N->getMask(), ExtPassThru,
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getAddressingMode(), ExtType,
+ N->isExpandingLoad());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue ExtPassThru = GetPromotedInteger(N->getPassThru());
+ assert(NVT == ExtPassThru.getValueType() &&
+ "Gather result type and the passThru argument type should be the same");
+
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD)
+ ExtType = ISD::EXTLOAD;
+
+ SDLoc dl(N);
+ SDValue Ops[] = {N->getChain(), ExtPassThru, N->getMask(), N->getBasePtr(),
+ N->getIndex(), N->getScale() };
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand(), N->getIndexType(),
+ ExtType);
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+/// Promote the overflow flag of an overflowing arithmetic node.
+SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
+ // Change the return type of the boolean result while obeying
+ // getSetCCResultType.
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
+ EVT VT = N->getValueType(0);
+ EVT SVT = getSetCCResultType(VT);
+ SDValue Ops[3] = { N->getOperand(0), N->getOperand(1) };
+ unsigned NumOps = N->getNumOperands();
+ assert(NumOps <= 3 && "Too many operands");
+ if (NumOps == 3)
+ Ops[2] = N->getOperand(2);
+
+ SDLoc dl(N);
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(VT, SVT),
+ ArrayRef(Ops, NumOps));
+
+ // Modified the sum result - switch anything that used the old sum to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 0), Res);
+
+ // Convert to the expected type.
+ return DAG.getBoolExtOrTrunc(Res.getValue(1), dl, NVT, VT);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSHLSAT(SDNode *N) {
+ // If the promoted type is legal, we can convert this to:
+ // 1. ANY_EXTEND iN to iM
+ // 2. SHL by M-N
+ // 3. [US][ADD|SUB|SHL]SAT
+ // 4. L/ASHR by M-N
+ // Else it is more efficient to convert this to a min and a max
+ // operation in the higher precision arithmetic.
+ SDLoc dl(N);
+ SDValue Op1 = N->getOperand(0);
+ SDValue Op2 = N->getOperand(1);
+ unsigned OldBits = Op1.getScalarValueSizeInBits();
+
+ unsigned Opcode = N->getOpcode();
+ bool IsShift = Opcode == ISD::USHLSAT || Opcode == ISD::SSHLSAT;
+
+ SDValue Op1Promoted, Op2Promoted;
+ if (IsShift) {
+ Op1Promoted = GetPromotedInteger(Op1);
+ Op2Promoted = ZExtPromotedInteger(Op2);
+ } else if (Opcode == ISD::UADDSAT || Opcode == ISD::USUBSAT) {
+ Op1Promoted = ZExtPromotedInteger(Op1);
+ Op2Promoted = ZExtPromotedInteger(Op2);
+ } else {
+ Op1Promoted = SExtPromotedInteger(Op1);
+ Op2Promoted = SExtPromotedInteger(Op2);
+ }
+ EVT PromotedType = Op1Promoted.getValueType();
+ unsigned NewBits = PromotedType.getScalarSizeInBits();
+
+ if (Opcode == ISD::UADDSAT) {
+ APInt MaxVal = APInt::getAllOnes(OldBits).zext(NewBits);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
+ SDValue Add =
+ DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
+ return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
+ }
+
+ // USUBSAT can always be promoted as long as we have zero-extended the args.
+ if (Opcode == ISD::USUBSAT)
+ return DAG.getNode(ISD::USUBSAT, dl, PromotedType, Op1Promoted,
+ Op2Promoted);
+
+ // Shift cannot use a min/max expansion; we can't detect overflow if all of
+ // the bits have been shifted out.
+ if (IsShift || TLI.isOperationLegal(Opcode, PromotedType)) {
+ unsigned ShiftOp;
+ switch (Opcode) {
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::SSHLSAT:
+ ShiftOp = ISD::SRA;
+ break;
+ case ISD::USHLSAT:
+ ShiftOp = ISD::SRL;
+ break;
+ default:
+ llvm_unreachable("Expected opcode to be signed or unsigned saturation "
+ "addition, subtraction or left shift");
+ }
+
+ unsigned SHLAmount = NewBits - OldBits;
+ SDValue ShiftAmount =
+ DAG.getShiftAmountConstant(SHLAmount, PromotedType, dl);
+ Op1Promoted =
+ DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount);
+ if (!IsShift)
+ Op2Promoted =
+ DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
+
+ SDValue Result =
+ DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
+ return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
+ }
+
+ unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
+ APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits);
+ APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits);
+ SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
+ SDValue Result =
+ DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted);
+ Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax);
+ Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin);
+ return Result;
+}
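The min/max fallback at the end is the easiest path to check in isolation: do the arithmetic at the promoted width, then clamp to the signed range of the original type. A standalone sketch of SADDSAT on i8 carried out in i32, assuming ordinary two's-complement C++ arithmetic in place of the ISD nodes:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Illustrative helper mirroring the SMIN/SMAX clamp path above.
int8_t SAddSat8ViaI32(int8_t A, int8_t B) {
  int32_t Wide = static_cast<int32_t>(A) + static_cast<int32_t>(B); // ADD
  Wide = std::min<int32_t>(Wide, INT8_MAX); // SMIN against SatMax
  Wide = std::max<int32_t>(Wide, INT8_MIN); // SMAX against SatMin
  return static_cast<int8_t>(Wide);
}

int main() {
  assert(SAddSat8ViaI32(100, 100) == 127);    // clamps at the signed maximum
  assert(SAddSat8ViaI32(-100, -100) == -128); // clamps at the signed minimum
  assert(SAddSat8ViaI32(5, -3) == 2);         // in-range sums are untouched
  return 0;
}

The shift-based path taken when the saturating operation is legal at the promoted width avoids these two extra compares by moving the saturation point up instead.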
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
+ // Can just promote the operands and then continue with the operation.
+ SDLoc dl(N);
+ SDValue Op1Promoted, Op2Promoted;
+ bool Signed =
+ N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::SMULFIXSAT;
+ bool Saturating =
+ N->getOpcode() == ISD::SMULFIXSAT || N->getOpcode() == ISD::UMULFIXSAT;
+ if (Signed) {
+ Op1Promoted = SExtPromotedInteger(N->getOperand(0));
+ Op2Promoted = SExtPromotedInteger(N->getOperand(1));
+ } else {
+ Op1Promoted = ZExtPromotedInteger(N->getOperand(0));
+ Op2Promoted = ZExtPromotedInteger(N->getOperand(1));
+ }
+ EVT OldType = N->getOperand(0).getValueType();
+ EVT PromotedType = Op1Promoted.getValueType();
+ unsigned DiffSize =
+ PromotedType.getScalarSizeInBits() - OldType.getScalarSizeInBits();
+
+ if (Saturating) {
+ // Promoting the operand and result values changes the saturation width,
+ // which extends the values that we clamp to on saturation. This could be
+ // resolved by shifting one of the operands the same amount, which would
+ // also shift the result we compare against, then shifting back.
+ Op1Promoted =
+ DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted,
+ DAG.getShiftAmountConstant(DiffSize, PromotedType, dl));
+ SDValue Result = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
+ Op2Promoted, N->getOperand(2));
+ unsigned ShiftOp = Signed ? ISD::SRA : ISD::SRL;
+ return DAG.getNode(ShiftOp, dl, PromotedType, Result,
+ DAG.getShiftAmountConstant(DiffSize, PromotedType, dl));
+ }
+ return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted,
+ N->getOperand(2));
+}
+
+static SDValue SaturateWidenedDIVFIX(SDValue V, SDLoc &dl,
+ unsigned SatW, bool Signed,
+ const TargetLowering &TLI,
+ SelectionDAG &DAG) {
+ EVT VT = V.getValueType();
+ unsigned VTW = VT.getScalarSizeInBits();
+
+ if (!Signed) {
+ // Saturate to the unsigned maximum by getting the minimum of V and the
+ // maximum.
+ return DAG.getNode(ISD::UMIN, dl, VT, V,
+ DAG.getConstant(APInt::getLowBitsSet(VTW, SatW),
+ dl, VT));
+ }
+
+ // Saturate to the signed maximum (the low SatW - 1 bits) by taking the
+ // signed minimum of it and V.
+ V = DAG.getNode(ISD::SMIN, dl, VT, V,
+ DAG.getConstant(APInt::getLowBitsSet(VTW, SatW - 1),
+ dl, VT));
+ // Saturate to the signed minimum (all but the low SatW - 1 bits set) by
+ // taking the signed maximum of it and V.
+ V = DAG.getNode(ISD::SMAX, dl, VT, V,
+ DAG.getConstant(APInt::getHighBitsSet(VTW, VTW - SatW + 1),
+ dl, VT));
+ return V;
+}
+
+static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS,
+ unsigned Scale, const TargetLowering &TLI,
+ SelectionDAG &DAG, unsigned SatW = 0) {
+ EVT VT = LHS.getValueType();
+ unsigned VTSize = VT.getScalarSizeInBits();
+ bool Signed = N->getOpcode() == ISD::SDIVFIX ||
+ N->getOpcode() == ISD::SDIVFIXSAT;
+ bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT ||
+ N->getOpcode() == ISD::UDIVFIXSAT;
+
+ SDLoc dl(N);
+ // Widen the types by a factor of two. This is guaranteed to expand, since it
+ // will always have enough high bits in the LHS to shift into.
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VTSize * 2);
+ if (VT.isVector())
+ WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
+ VT.getVectorElementCount());
+ LHS = DAG.getExtOrTrunc(Signed, LHS, dl, WideVT);
+ RHS = DAG.getExtOrTrunc(Signed, RHS, dl, WideVT);
+ SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale,
+ DAG);
+ assert(Res && "Expanding DIVFIX with wide type failed?");
+ if (Saturating) {
+ // If the caller has told us to saturate at something less, use that width
+ // instead of the type before doubling. However, it cannot be more than
+ // what we just widened!
+ assert(SatW <= VTSize &&
+ "Tried to saturate to more than the original type?");
+ Res = SaturateWidenedDIVFIX(Res, dl, SatW == 0 ? VTSize : SatW, Signed,
+ TLI, DAG);
+ }
+ return DAG.getZExtOrTrunc(Res, dl, VT);
+}
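Widening by a factor of two before calling expandFixedPointDiv works because a fixed-point division is essentially (LHS << Scale) / RHS, and doubling the width guarantees the pre-shift cannot lose bits. A standalone sketch of the unsigned case for i16 with Scale fractional bits, under the simplifying assumption that no saturation is needed (helper name is illustrative only):

#include <cassert>
#include <cstdint>

// Illustrative helper: udivfix on i16 evaluated in i32 so the shifted
// dividend always fits.
uint16_t UDivFix16ViaI32(uint16_t LHS, uint16_t RHS, unsigned Scale) {
  uint32_t Wide = (static_cast<uint32_t>(LHS) << Scale) / RHS;
  return static_cast<uint16_t>(Wide); // truncate back, as getZExtOrTrunc does
}

int main() {
  // In Q8.8 fixed point, 3.0 / 2.0 == 1.5: 0x0300 / 0x0200 -> 0x0180.
  assert(UDivFix16ViaI32(0x0300, 0x0200, 8) == 0x0180);
  return 0;
}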
+
+SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Op1Promoted, Op2Promoted;
+ bool Signed = N->getOpcode() == ISD::SDIVFIX ||
+ N->getOpcode() == ISD::SDIVFIXSAT;
+ bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT ||
+ N->getOpcode() == ISD::UDIVFIXSAT;
+ if (Signed) {
+ Op1Promoted = SExtPromotedInteger(N->getOperand(0));
+ Op2Promoted = SExtPromotedInteger(N->getOperand(1));
+ } else {
+ Op1Promoted = ZExtPromotedInteger(N->getOperand(0));
+ Op2Promoted = ZExtPromotedInteger(N->getOperand(1));
+ }
+ EVT PromotedType = Op1Promoted.getValueType();
+ unsigned Scale = N->getConstantOperandVal(2);
+
+ // If the type is already legal and the operation is legal in that type, we
+ // should not early expand.
+ if (TLI.isTypeLegal(PromotedType)) {
+ TargetLowering::LegalizeAction Action =
+ TLI.getFixedPointOperationAction(N->getOpcode(), PromotedType, Scale);
+ if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) {
+ unsigned Diff = PromotedType.getScalarSizeInBits() -
+ N->getValueType(0).getScalarSizeInBits();
+ if (Saturating)
+ Op1Promoted =
+ DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted,
+ DAG.getShiftAmountConstant(Diff, PromotedType, dl));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
+ Op2Promoted, N->getOperand(2));
+ if (Saturating)
+ Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, dl, PromotedType, Res,
+ DAG.getShiftAmountConstant(Diff, PromotedType, dl));
+ return Res;
+ }
+ }
+
+ // See if we can perform the division in this type without expanding.
+ if (SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, Op1Promoted,
+ Op2Promoted, Scale, DAG)) {
+ if (Saturating)
+ Res = SaturateWidenedDIVFIX(Res, dl,
+ N->getValueType(0).getScalarSizeInBits(),
+ Signed, TLI, DAG);
+ return Res;
+ }
+ // If we cannot, expand it to twice the type width. If we are saturating, give
+ // it the original width as a saturating width so we don't need to emit
+ // two saturations.
+ return earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG,
+ N->getValueType(0).getScalarSizeInBits());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ // The operation overflowed iff the result in the larger type is not the
+ // sign extension of its truncation to the original type.
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+ EVT OVT = N->getOperand(0).getValueType();
+ EVT NVT = LHS.getValueType();
+ SDLoc dl(N);
+
+ // Do the arithmetic in the larger type.
+ unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB;
+ SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+ // Calculate the overflow flag: sign extend the arithmetic result from
+ // the original type.
+ SDValue Ofl = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+ DAG.getValueType(OVT));
+ // Overflowed if and only if this is not equal to Res.
+ Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+
+ return Res;
+}
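The overflow test here has a direct scalar analogue: compute the sum at the wider width and compare it against the sign extension of its truncation back to the original width. A standalone sketch for SADDO on i8, with plain casts standing in for SIGN_EXTEND_INREG and SETNE (two's-complement narrowing assumed, well defined since C++20):

#include <cassert>
#include <cstdint>

// Illustrative helper returning the overflow flag and the wrapped sum.
bool SAddO8ViaI32(int8_t A, int8_t B, int8_t &Sum) {
  int32_t Res = static_cast<int32_t>(A) + static_cast<int32_t>(B); // wide ADD
  int32_t Ofl = static_cast<int8_t>(Res); // SIGN_EXTEND_INREG from i8
  Sum = static_cast<int8_t>(Res);
  return Ofl != Res;                      // SETNE: overflowed iff they differ
}

int main() {
  int8_t S;
  assert(SAddO8ViaI32(100, 100, S) == true); // 200 does not fit in i8
  assert(SAddO8ViaI32(100, 27, S) == false && S == 127);
  return 0;
}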
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Select(SDNode *N) {
+ SDValue Mask = N->getOperand(0);
+
+ SDValue LHS = GetPromotedInteger(N->getOperand(1));
+ SDValue RHS = GetPromotedInteger(N->getOperand(2));
+
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::VP_SELECT || Opcode == ISD::VP_MERGE)
+ return DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS,
+ N->getOperand(3));
+ return DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
+ SDValue LHS = GetPromotedInteger(N->getOperand(2));
+ SDValue RHS = GetPromotedInteger(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
+ LHS.getValueType(), N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
+ unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
+ EVT InVT = N->getOperand(OpNo).getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+
+ EVT SVT = getSetCCResultType(InVT);
+
+ // If we got back a type that needs to be promoted, this likely means the
+ // input type also needs to be promoted. So get the promoted type for
+ // the input and try the query again.
+ if (getTypeAction(SVT) == TargetLowering::TypePromoteInteger) {
+ if (getTypeAction(InVT) == TargetLowering::TypePromoteInteger) {
+ InVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+ SVT = getSetCCResultType(InVT);
+ } else {
+ // Input type isn't promoted, just use the default promoted type.
+ SVT = NVT;
+ }
+ }
+
+ SDLoc dl(N);
+ assert(SVT.isVector() == N->getOperand(OpNo).getValueType().isVector() &&
+ "Vector compare must return a vector result!");
+
+ // Get the SETCC result using the canonical SETCC type.
+ SDValue SetCC;
+ if (N->isStrictFPOpcode()) {
+ SDVTList VTs = DAG.getVTList({SVT, MVT::Other});
+ SDValue Opers[] = {N->getOperand(0), N->getOperand(1),
+ N->getOperand(2), N->getOperand(3)};
+ SetCC = DAG.getNode(N->getOpcode(), dl, VTs, Opers, N->getFlags());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), SetCC.getValue(1));
+ } else
+ SetCC = DAG.getNode(N->getOpcode(), dl, SVT, N->getOperand(0),
+ N->getOperand(1), N->getOperand(2), N->getFlags());
+
+ // Convert to the expected type.
+ return DAG.getSExtOrTrunc(SetCC, dl, NVT);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_IS_FPCLASS(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Arg = N->getOperand(0);
+ SDValue Test = N->getOperand(1);
+ EVT NResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getNode(ISD::IS_FPCLASS, DL, NResVT, Arg, Test);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FFREXP(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
+ EVT VT = N->getValueType(0);
+
+ SDLoc dl(N);
+ SDValue Res =
+ DAG.getNode(N->getOpcode(), dl, DAG.getVTList(VT, NVT), N->getOperand(0));
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return Res.getValue(1);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
+ SDValue LHS = GetPromotedInteger(N->getOperand(0));
+ SDValue RHS = N->getOperand(1);
+ if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ RHS = ZExtPromotedInteger(RHS);
+ if (N->getOpcode() != ISD::VP_SHL)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
+ // The input may have strange things in the top bits of the registers, but
+ // these operations don't care. They may have weird bits going out, but
+ // that too is okay if they are integer operations.
+ SDValue LHS = GetPromotedInteger(N->getOperand(0));
+ SDValue RHS = GetPromotedInteger(N->getOperand(1));
+ if (N->getNumOperands() == 2)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) {
+ // Sign extend the input.
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+ if (N->getNumOperands() == 2)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) {
+ // Zero extend the input.
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+ if (N->getNumOperands() == 2)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
+ // It doesn't matter if we sign extend or zero extend the inputs. So do
+ // whatever is best for the target.
+ SDValue LHS = SExtOrZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = SExtOrZExtPromotedInteger(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
+ // The input value must be properly sign extended.
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = N->getOperand(1);
+ if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ RHS = ZExtPromotedInteger(RHS);
+ if (N->getOpcode() != ISD::VP_ASHR)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
+ // The input value must be properly zero extended.
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = N->getOperand(1);
+ if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ RHS = ZExtPromotedInteger(RHS);
+ if (N->getOpcode() != ISD::VP_LSHR)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
+ N->getOperand(2), N->getOperand(3));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) {
+ // Lower the rotate to shifts and ORs which can be promoted.
+ SDValue Res = TLI.expandROT(N, true /*AllowVectorOps*/, DAG);
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) {
+ SDValue Hi = GetPromotedInteger(N->getOperand(0));
+ SDValue Lo = GetPromotedInteger(N->getOperand(1));
+ SDValue Amt = N->getOperand(2);
+ if (getTypeAction(Amt.getValueType()) == TargetLowering::TypePromoteInteger)
+ Amt = ZExtPromotedInteger(Amt);
+ EVT AmtVT = Amt.getValueType();
+
+ SDLoc DL(N);
+ EVT OldVT = N->getOperand(0).getValueType();
+ EVT VT = Lo.getValueType();
+ unsigned Opcode = N->getOpcode();
+ bool IsFSHR = Opcode == ISD::FSHR;
+ unsigned OldBits = OldVT.getScalarSizeInBits();
+ unsigned NewBits = VT.getScalarSizeInBits();
+
+ // Amount has to be interpreted modulo the old bit width.
+ Amt = DAG.getNode(ISD::UREM, DL, AmtVT, Amt,
+ DAG.getConstant(OldBits, DL, AmtVT));
+
+ // If the promoted type is twice the size (or more), then we use the
+ // traditional funnel 'double' shift codegen. This isn't necessary if the
+ // shift amount is constant.
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z % bw)) >> bw.
+ // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z % bw)).
+ if (NewBits >= (2 * OldBits) && !isa<ConstantSDNode>(Amt) &&
+ !TLI.isOperationLegalOrCustom(Opcode, VT)) {
+ SDValue HiShift = DAG.getConstant(OldBits, DL, VT);
+ Hi = DAG.getNode(ISD::SHL, DL, VT, Hi, HiShift);
+ Lo = DAG.getZeroExtendInReg(Lo, DL, OldVT);
+ SDValue Res = DAG.getNode(ISD::OR, DL, VT, Hi, Lo);
+ Res = DAG.getNode(IsFSHR ? ISD::SRL : ISD::SHL, DL, VT, Res, Amt);
+ if (!IsFSHR)
+ Res = DAG.getNode(ISD::SRL, DL, VT, Res, HiShift);
+ return Res;
+ }
+
+ // Shift Lo up to occupy the upper bits of the promoted type.
+ SDValue ShiftOffset = DAG.getConstant(NewBits - OldBits, DL, AmtVT);
+ Lo = DAG.getNode(ISD::SHL, DL, VT, Lo, ShiftOffset);
+
+ // Increase Amount to shift the result into the lower bits of the promoted
+ // type.
+ if (IsFSHR)
+ Amt = DAG.getNode(ISD::ADD, DL, AmtVT, Amt, ShiftOffset);
+
+ return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amt);
+}
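The 'double shift' form in the first branch is easiest to check with concrete widths: concatenate the two narrow operands in the wide register, shift by the amount modulo the old width, and take the high half. A standalone sketch of FSHL on i8 evaluated in i32, following the fshl formula from the comment above (helper name is illustrative only):

#include <cassert>
#include <cstdint>

// Illustrative helper: fshl(x, y, z) on i8, as ((x:y) << (z % 8)) >> 8 in i32.
uint8_t Fshl8ViaI32(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t Concat = (static_cast<uint32_t>(X) << 8) | Y; // aext(x)<<bw | zext(y)
  return static_cast<uint8_t>((Concat << (Z % 8)) >> 8);
}

int main() {
  assert(Fshl8ViaI32(0x81, 0x40, 1) == 0x02); // (0x81 << 1) | (0x40 >> 7)
  assert(Fshl8ViaI32(0x81, 0x40, 0) == 0x81); // a shift of zero is the identity
  return 0;
}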
+
+SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Res;
+ SDValue InOp = N->getOperand(0);
+ SDLoc dl(N);
+
+ switch (getTypeAction(InOp.getValueType())) {
+ default: llvm_unreachable("Unknown type action!");
+ case TargetLowering::TypeLegal:
+ case TargetLowering::TypeExpandInteger:
+ Res = InOp;
+ break;
+ case TargetLowering::TypePromoteInteger:
+ Res = GetPromotedInteger(InOp);
+ break;
+ case TargetLowering::TypeSplitVector: {
+ EVT InVT = InOp.getValueType();
+ assert(InVT.isVector() && "Cannot split scalar types");
+ ElementCount NumElts = InVT.getVectorElementCount();
+ assert(NumElts == NVT.getVectorElementCount() &&
+ "Dst and Src must have the same number of elements");
+ assert(isPowerOf2_32(NumElts.getKnownMinValue()) &&
+ "Promoted vector type must be a power of two");
+
+ SDValue EOp1, EOp2;
+ GetSplitVector(InOp, EOp1, EOp2);
+
+ EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(),
+ NumElts.divideCoefficientBy(2));
+ if (N->getOpcode() == ISD::TRUNCATE) {
+ EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1);
+ EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2);
+ } else {
+ assert(N->getOpcode() == ISD::VP_TRUNCATE &&
+ "Expected VP_TRUNCATE opcode");
+ SDValue MaskLo, MaskHi, EVLLo, EVLHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1));
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl);
+ EOp1 = DAG.getNode(ISD::VP_TRUNCATE, dl, HalfNVT, EOp1, MaskLo, EVLLo);
+ EOp2 = DAG.getNode(ISD::VP_TRUNCATE, dl, HalfNVT, EOp2, MaskHi, EVLHi);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2);
+ }
+ // TODO: VP_TRUNCATE also needs to handle the TypeWidenVector case for some
+ // targets.
+ case TargetLowering::TypeWidenVector: {
+ SDValue WideInOp = GetWidenedVector(InOp);
+
+ // Truncate widened InOp.
+ unsigned NumElem = WideInOp.getValueType().getVectorNumElements();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(),
+ N->getValueType(0).getScalarType(), NumElem);
+ SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp);
+
+ // Zero extend so that the elements are of the same type as those of NVT.
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(),
+ NumElem);
+ SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc);
+
+ // Extract the low NVT subvector.
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx);
+ }
+ }
+
+ // Truncate to NVT instead of VT
+ if (N->getOpcode() == ISD::VP_TRUNCATE)
+ return DAG.getNode(ISD::VP_TRUNCATE, dl, NVT, Res, N->getOperand(1),
+ N->getOperand(2));
+ return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ // The operation overflowed iff the result in the larger type is not the
+ // zero extension of its truncation to the original type.
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+ EVT OVT = N->getOperand(0).getValueType();
+ EVT NVT = LHS.getValueType();
+ SDLoc dl(N);
+
+ // Do the arithmetic in the larger type.
+ unsigned Opcode = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB;
+ SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+ // Calculate the overflow flag: zero extend the arithmetic result from
+ // the original type.
+ SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+ // Overflowed if and only if this is not equal to Res.
+ Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+
+ return Res;
+}
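The unsigned variant mirrors the signed one above but checks the zero extension instead: any bit set above the original width means the narrow addition wrapped. A standalone sketch for UADDO on i8:

#include <cassert>
#include <cstdint>

// Illustrative helper returning the overflow flag and the wrapped sum.
bool UAddO8ViaI32(uint8_t A, uint8_t B, uint8_t &Sum) {
  uint32_t Res = static_cast<uint32_t>(A) + static_cast<uint32_t>(B); // wide ADD
  Sum = static_cast<uint8_t>(Res);
  return (Res & 0xFFu) != Res; // getZeroExtendInReg + SETNE
}

int main() {
  uint8_t S;
  assert(UAddO8ViaI32(200, 100, S) == true); // 300 wraps in i8
  assert(UAddO8ViaI32(200, 55, S) == false && S == 255);
  return 0;
}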
+
+// Handle promotion for the ADDE/SUBE/UADDO_CARRY/USUBO_CARRY nodes. Notice that
+// the third operand of ADDE/SUBE nodes is a carry flag, which differs from
+// the UADDO_CARRY/USUBO_CARRY nodes whose third operand is a carry Boolean.
+SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO_CARRY(SDNode *N,
+ unsigned ResNo) {
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ // We need to sign-extend the operands so the carry value computed by the
+ // wide operation will be equivalent to the carry value computed by the
+ // narrow operation.
+ // An UADDO_CARRY can generate carry only if any of the operands has its
+ // most significant bit set. Sign extension propagates the most significant
+ // bit into the higher bits which means the extra bit that the narrow
+ // addition would need (i.e. the carry) will be propagated through the higher
+ // bits of the wide addition.
+ // A USUBO_CARRY can generate borrow only if LHS < RHS and this property will
+ // be preserved by sign extension.
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+
+ EVT ValueVTs[] = {LHS.getValueType(), N->getValueType(1)};
+
+ // Do the arithmetic in the wide type.
+ SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), DAG.getVTList(ValueVTs),
+ LHS, RHS, N->getOperand(2));
+
+ // Update the users of the original carry/borrow value.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+
+ return SDValue(Res.getNode(), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO_CARRY(SDNode *N,
+ unsigned ResNo) {
+ assert(ResNo == 1 && "Don't know how to promote other results yet.");
+ return PromoteIntRes_Overflow(N);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_ABS(SDNode *N) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+
+ // If a larger ABS or SMAX isn't supported by the target, try to expand now.
+ // If we expand later we'll end up sign extending more than just the sra input
+ // in the sra+xor+sub expansion.
+ if (!OVT.isVector() &&
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::ABS, NVT) &&
+ !TLI.isOperationLegal(ISD::SMAX, NVT)) {
+ if (SDValue Res = TLI.expandABS(N, DAG))
+ return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Res);
+ }
+
+ SDValue Op0 = SExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::ABS, SDLoc(N), Op0.getValueType(), Op0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
+ // Promote the overflow bit trivially.
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
+ SDLoc DL(N);
+ EVT SmallVT = LHS.getValueType();
+
+ // To determine if the result overflowed in a larger type, we extend the
+ // input to the larger type, do the multiply (checking if it overflows),
+ // then also check the high bits of the result to see if overflow happened
+ // there.
+ if (N->getOpcode() == ISD::SMULO) {
+ LHS = SExtPromotedInteger(LHS);
+ RHS = SExtPromotedInteger(RHS);
+ } else {
+ LHS = ZExtPromotedInteger(LHS);
+ RHS = ZExtPromotedInteger(RHS);
+ }
+ SDVTList VTs = DAG.getVTList(LHS.getValueType(), N->getValueType(1));
+ SDValue Mul = DAG.getNode(N->getOpcode(), DL, VTs, LHS, RHS);
+
+ // Overflow occurred if it occurred in the larger type, or if the high part
+ // of the result does not zero/sign-extend the low part. Check this second
+ // possibility first.
+ SDValue Overflow;
+ if (N->getOpcode() == ISD::UMULO) {
+ // Unsigned overflow occurred if the high part is non-zero.
+ unsigned Shift = SmallVT.getScalarSizeInBits();
+ SDValue Hi =
+ DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul,
+ DAG.getShiftAmountConstant(Shift, Mul.getValueType(), DL));
+ Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi,
+ DAG.getConstant(0, DL, Hi.getValueType()),
+ ISD::SETNE);
+ } else {
+ // Signed overflow occurred if the high part does not sign extend the low.
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Mul.getValueType(),
+ Mul, DAG.getValueType(SmallVT));
+ Overflow = DAG.getSetCC(DL, N->getValueType(1), SExt, Mul, ISD::SETNE);
+ }
+
+ // The only other way for overflow to occur is if the multiplication in the
+ // larger type itself overflowed.
+ Overflow = DAG.getNode(ISD::OR, DL, N->getValueType(1), Overflow,
+ SDValue(Mul.getNode(), 1));
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Overflow);
+ return Mul;
+}
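When the inputs are only half the promoted width, as in this i8-in-i32 sketch, the wide multiply itself can never overflow, so the high-half check is the whole overflow test; the extra OR with the wide multiply's own overflow bit matters only when the promoted type is less than twice the original width. Plain C++ standing in for the SRL/SETCC sequence above:

#include <cassert>
#include <cstdint>

// Illustrative helper: umulo on i8 via a full i32 product.
bool UMulO8ViaI32(uint8_t A, uint8_t B, uint8_t &Prod) {
  uint32_t Mul = static_cast<uint32_t>(A) * static_cast<uint32_t>(B);
  Prod = static_cast<uint8_t>(Mul);
  return (Mul >> 8) != 0; // SRL by the narrow width, then SETNE against zero
}

int main() {
  uint8_t P;
  assert(UMulO8ViaI32(16, 16, P) == true); // 256 does not fit in i8
  assert(UMulO8ViaI32(15, 17, P) == false && P == 255);
  return 0;
}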
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VSCALE(SDNode *N) {
+ EVT VT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+
+ const APInt &MulImm = N->getConstantOperandAPInt(0);
+ return DAG.getVScale(SDLoc(N), VT, MulImm.sext(VT.getSizeInBits()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
+ SDValue Chain = N->getOperand(0); // Get the chain.
+ SDValue Ptr = N->getOperand(1); // Get the pointer.
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ MVT RegVT = TLI.getRegisterType(*DAG.getContext(), VT);
+ unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), VT);
+ // The argument is passed as NumRegs registers of type RegVT.
+
+ SmallVector<SDValue, 8> Parts(NumRegs);
+ for (unsigned i = 0; i < NumRegs; ++i) {
+ Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2),
+ N->getConstantOperandVal(3));
+ Chain = Parts[i].getValue(1);
+ }
+
+ // Handle endianness of the load.
+ if (DAG.getDataLayout().isBigEndian())
+ std::reverse(Parts.begin(), Parts.end());
+
+ // Assemble the parts in the promoted type.
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Res = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[0]);
+ for (unsigned i = 1; i < NumRegs; ++i) {
+ SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]);
+ // Shift it to the right position and "or" it in.
+ Part = DAG.getNode(ISD::SHL, dl, NVT, Part,
+ DAG.getConstant(i * RegVT.getSizeInBits(), dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part);
+ }
+
+ // Modified the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+
+ return Res;
+}
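The reassembly loop above is the usual zero-extend, shift and OR pattern for gluing register-sized parts back into one promoted value, with a reverse for big-endian part order. A standalone sketch for two 16-bit parts combined into a 32-bit value, assuming little-endian part order:

#include <cassert>
#include <cstdint>

// Illustrative helper combining little-endian parts, mirroring the loop above.
uint32_t AssembleParts(uint16_t Part0, uint16_t Part1) {
  uint32_t Res = Part0;                      // ZERO_EXTEND of the first part
  Res |= static_cast<uint32_t>(Part1) << 16; // SHL into position, then OR
  return Res;
}

int main() {
  assert(AssembleParts(0xBEEF, 0xDEAD) == 0xDEADBEEFu);
  return 0;
}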
+
+//===----------------------------------------------------------------------===//
+// Integer Operand Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need promotion. At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n");
+ return false;
+ }
+
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ dbgs() << "PromoteIntegerOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+ #endif
+ report_fatal_error("Do not know how to promote this operator's operand!");
+
+ case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break;
+ case ISD::ATOMIC_STORE:
+ Res = PromoteIntOp_ATOMIC_STORE(cast<AtomicSDNode>(N));
+ break;
+ case ISD::BITCAST: Res = PromoteIntOp_BITCAST(N); break;
+ case ISD::BR_CC: Res = PromoteIntOp_BR_CC(N, OpNo); break;
+ case ISD::BRCOND: Res = PromoteIntOp_BRCOND(N, OpNo); break;
+ case ISD::BUILD_PAIR: Res = PromoteIntOp_BUILD_PAIR(N); break;
+ case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break;
+ case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break;
+ case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::INSERT_VECTOR_ELT:
+ Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);
+ break;
+ case ISD::SPLAT_VECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ Res = PromoteIntOp_ScalarOp(N);
+ break;
+ case ISD::VSELECT:
+ case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break;
+ case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
+ case ISD::VP_SETCC:
+ case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break;
+ case ISD::SIGN_EXTEND: Res = PromoteIntOp_SIGN_EXTEND(N); break;
+ case ISD::VP_SIGN_EXTEND: Res = PromoteIntOp_VP_SIGN_EXTEND(N); break;
+ case ISD::VP_SINT_TO_FP:
+ case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break;
+ case ISD::STRICT_SINT_TO_FP: Res = PromoteIntOp_STRICT_SINT_TO_FP(N); break;
+ case ISD::STORE: Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
+ OpNo); break;
+ case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
+ OpNo); break;
+ case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
+ OpNo); break;
+ case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast<MaskedGatherSDNode>(N),
+ OpNo); break;
+ case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N),
+ OpNo); break;
+ case ISD::VP_TRUNCATE:
+ case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
+ case ISD::BF16_TO_FP:
+ case ISD::FP16_TO_FP:
+ case ISD::VP_UINT_TO_FP:
+ case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break;
+ case ISD::STRICT_UINT_TO_FP: Res = PromoteIntOp_STRICT_UINT_TO_FP(N); break;
+ case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break;
+ case ISD::VP_ZERO_EXTEND: Res = PromoteIntOp_VP_ZERO_EXTEND(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = PromoteIntOp_EXTRACT_SUBVECTOR(N); break;
+ case ISD::INSERT_SUBVECTOR: Res = PromoteIntOp_INSERT_SUBVECTOR(N); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+
+ case ISD::FSHL:
+ case ISD::FSHR: Res = PromoteIntOp_FunnelShift(N); break;
+
+ case ISD::SADDO_CARRY:
+ case ISD::SSUBO_CARRY:
+ case ISD::UADDO_CARRY:
+ case ISD::USUBO_CARRY: Res = PromoteIntOp_ADDSUBO_CARRY(N, OpNo); break;
+
+ case ISD::FRAMEADDR:
+ case ISD::RETURNADDR: Res = PromoteIntOp_FRAMERETURNADDR(N); break;
+
+ case ISD::PREFETCH: Res = PromoteIntOp_PREFETCH(N, OpNo); break;
+
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
+ case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break;
+ case ISD::FPOWI:
+ case ISD::STRICT_FPOWI:
+ case ISD::FLDEXP:
+ case ISD::STRICT_FLDEXP: Res = PromoteIntOp_ExpOp(N); break;
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN: Res = PromoteIntOp_VECREDUCE(N); break;
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ Res = PromoteIntOp_VP_REDUCE(N, OpNo);
+ break;
+
+ case ISD::SET_ROUNDING: Res = PromoteIntOp_SET_ROUNDING(N); break;
+ case ISD::STACKMAP:
+ Res = PromoteIntOp_STACKMAP(N, OpNo);
+ break;
+ case ISD::PATCHPOINT:
+ Res = PromoteIntOp_PATCHPOINT(N, OpNo);
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+ Res = PromoteIntOp_VP_STRIDED(N, OpNo);
+ break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ const bool IsStrictFp = N->isStrictFPOpcode();
+ assert(Res.getValueType() == N->getValueType(0) &&
+ N->getNumValues() == (IsStrictFp ? 2 : 1) &&
+ "Invalid operand expansion");
+ LLVM_DEBUG(dbgs() << "Replacing: "; N->dump(&DAG); dbgs() << " with: ";
+ Res.dump());
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ if (IsStrictFp)
+ ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1));
+
+ return false;
+}
+
+/// PromoteSetCCOperands - Promote the operands of a comparison. This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS,
+ ISD::CondCode CCCode) {
+ // We have to insert explicit sign or zero extends. Note that we could
+ // insert sign extends for ALL conditions. For those operations where either
+ // zero or sign extension would be valid, we ask the target which extension
+ // it would prefer.
+
+ // Signed comparisons always require sign extension.
+ if (ISD::isSignedIntSetCC(CCCode)) {
+ LHS = SExtPromotedInteger(LHS);
+ RHS = SExtPromotedInteger(RHS);
+ return;
+ }
+
+ assert((ISD::isUnsignedIntSetCC(CCCode) || ISD::isIntEqualitySetCC(CCCode)) &&
+ "Unknown integer comparison!");
+
+ SDValue OpL = GetPromotedInteger(LHS);
+ SDValue OpR = GetPromotedInteger(RHS);
+
+ if (TLI.isSExtCheaperThanZExt(LHS.getValueType(), OpL.getValueType())) {
+ // The target would prefer to promote the comparison operand with sign
+ // extension. Honor that unless the promoted values are already zero
+ // extended.
+ unsigned OpLEffectiveBits =
+ DAG.computeKnownBits(OpL).countMaxActiveBits();
+ unsigned OpREffectiveBits =
+ DAG.computeKnownBits(OpR).countMaxActiveBits();
+ if (OpLEffectiveBits <= LHS.getScalarValueSizeInBits() &&
+ OpREffectiveBits <= RHS.getScalarValueSizeInBits()) {
+ LHS = OpL;
+ RHS = OpR;
+ return;
+ }
+
+ // The promoted values aren't zero extended, use a sext_inreg.
+ LHS = SExtPromotedInteger(LHS);
+ RHS = SExtPromotedInteger(RHS);
+ return;
+ }
+
+ // Prefer to promote the comparison operand with zero extension.
+
+ // If the width of OpL/OpR excluding the duplicated sign bits is no greater
+ // than the width of LHS/RHS, we can avoid inserting a zext_inreg operation
+ // that we might not be able to remove.
+ unsigned OpLEffectiveBits = DAG.ComputeMaxSignificantBits(OpL);
+ unsigned OpREffectiveBits = DAG.ComputeMaxSignificantBits(OpR);
+ if (OpLEffectiveBits <= LHS.getScalarValueSizeInBits() &&
+ OpREffectiveBits <= RHS.getScalarValueSizeInBits()) {
+ LHS = OpL;
+ RHS = OpR;
+ return;
+ }
+
+ // Otherwise, use zext_inreg.
+ LHS = ZExtPromotedInteger(LHS);
+ RHS = ZExtPromotedInteger(RHS);
+}
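The invariant this routine preserves is that both comparison operands are extended the same way; for unsigned and equality compares either extension can be made to work, but mixing them miscompares values with the sign bit set. A standalone sketch of the failure mode being avoided (two's-complement narrowing assumed):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t A = 0xFF, B = 0xFF;  // equal i8 values with the top bit set
  uint32_t ZextA = A;                                             // 0x000000ff
  uint32_t SextB = static_cast<uint32_t>(static_cast<int8_t>(B)); // 0xffffffff
  assert(A == B);         // equal in the original type
  assert(ZextA != SextB); // mixed extensions would report "not equal"
  // Consistent sign extension (or consistent zero extension) is fine.
  uint32_t SextA = static_cast<uint32_t>(static_cast<int8_t>(A));
  assert(SextA == SextB);
  return 0;
}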
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
+ SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ return DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(),
+ N->getChain(), N->getBasePtr(), Op2, N->getMemOperand());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
+ // This should only occur in unusual situations like bitcasting to an
+ // x86_fp80, so just turn it into a store+load
+ return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 2 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(2);
+ SDValue RHS = N->getOperand(3);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(1))->get());
+
+ // The chain (Op#0), CC (#1) and basic block destination (Op#4) are always
+ // legal types.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4)),
+ 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "only know how to promote condition");
+
+ // Promote all the way up to the canonical SetCC type.
+ SDValue Cond = PromoteTargetBoolean(N->getOperand(1), MVT::Other);
+
+ // The chain (Op#0) and basic block destination (Op#2) are always legal types.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond,
+ N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
+ // Since the result type is legal, the operands must promote to it.
+ EVT OVT = N->getOperand(0).getValueType();
+ SDValue Lo = ZExtPromotedInteger(N->getOperand(0));
+ SDValue Hi = GetPromotedInteger(N->getOperand(1));
+ assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?");
+ SDLoc dl(N);
+
+ Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi,
+ DAG.getConstant(OVT.getSizeInBits(), dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) {
+ // The vector type is legal but the element type is not. This implies
+ // that the vector is a power-of-two in length and that the element
+ // type does not have a strange size (eg: it is not i1).
+ EVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) &&
+ "Legal vector of one illegal element?");
+
+ // Promote the inserted value. The type does not need to match the
+ // vector element type. Check that any extra bits introduced will be
+ // truncated away.
+ assert(N->getOperand(0).getValueSizeInBits() >=
+ N->getValueType(0).getScalarSizeInBits() &&
+ "Type of inserted value narrower than vector element type!");
+
+ SmallVector<SDValue, 16> NewOps;
+ for (unsigned i = 0; i < NumElts; ++i)
+ NewOps.push_back(GetPromotedInteger(N->getOperand(i)));
+
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N,
+ unsigned OpNo) {
+ if (OpNo == 1) {
+ // Promote the inserted value. This is valid because the type does not
+ // have to match the vector element type.
+
+ // Check that any extra bits introduced will be truncated away.
+ assert(N->getOperand(1).getValueSizeInBits() >=
+ N->getValueType(0).getScalarSizeInBits() &&
+ "Type of inserted value narrower than vector element type!");
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ GetPromotedInteger(N->getOperand(1)),
+ N->getOperand(2)),
+ 0);
+ }
+
+ assert(OpNo == 2 && "Different operand and result vector types?");
+
+ // Promote the index.
+ SDValue Idx = DAG.getZExtOrTrunc(N->getOperand(2), SDLoc(N),
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ N->getOperand(1), Idx), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ScalarOp(SDNode *N) {
+ // Integer SPLAT_VECTOR/SCALAR_TO_VECTOR operands are implicitly truncated,
+ // so just promote the operand in place.
+ return SDValue(DAG.UpdateNodeOperands(N,
+ GetPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Only know how to promote the condition!");
+ SDValue Cond = N->getOperand(0);
+ EVT OpTy = N->getOperand(1).getValueType();
+
+ if (N->getOpcode() == ISD::VSELECT)
+ if (SDValue Res = WidenVSELECTMask(N))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ Res, N->getOperand(1), N->getOperand(2));
+
+ // Promote all the way up to the canonical SetCC type.
+ EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
+ Cond = PromoteTargetBoolean(Cond, OpVT);
+
+ return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1),
+ N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(4))->get());
+
+ // The CC (#4) and the possible return values (#2 and #3) have legal types.
+ return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2),
+ N->getOperand(3), N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(2))->get());
+
+ // The CC (#2) is always legal.
+ if (N->getOpcode() == ISD::SETCC)
+ return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0);
+
+ assert(N->getOpcode() == ISD::VP_SETCC && "Expected VP_SETCC opcode");
+
+ return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2),
+ N->getOperand(3), N->getOperand(4)),
+ 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ ZExtPromotedInteger(N->getOperand(1))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_FunnelShift(SDNode *N) {
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1),
+ ZExtPromotedInteger(N->getOperand(2))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ SDLoc dl(N);
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(),
+ Op, DAG.getValueType(N->getOperand(0).getValueType()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_SIGN_EXTEND(SDNode *N) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ // FIXME: There is no VP_ANY_EXTEND yet.
+ Op = DAG.getNode(ISD::VP_ZERO_EXTEND, dl, VT, Op, N->getOperand(1),
+ N->getOperand(2));
+ unsigned Diff =
+ VT.getScalarSizeInBits() - N->getOperand(0).getScalarValueSizeInBits();
+ SDValue ShAmt = DAG.getShiftAmountConstant(Diff, VT, dl);
+ // FIXME: There is no VP_SIGN_EXTEND_INREG so use a pair of shifts.
+ SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShAmt, N->getOperand(1),
+ N->getOperand(2));
+ return DAG.getNode(ISD::VP_ASHR, dl, VT, Shl, ShAmt, N->getOperand(1),
+ N->getOperand(2));
+}
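The shl/ashr pair at the end is the classic substitute for a sign-extend-in-register when no dedicated node exists. A standalone sketch of sign-extending the low 8 bits of a 32-bit register with the same two shifts (C++20 semantics assumed for the signed conversion and arithmetic right shift):

#include <cassert>
#include <cstdint>

// Illustrative helper: SIGN_EXTEND_INREG from i8 within an i32, as SHL + SRA.
int32_t SextInRegI8(uint32_t X) {
  return static_cast<int32_t>(X << 24) >> 24; // SHL by Diff, then arithmetic SRA
}

int main() {
  assert(SextInRegI8(0x000000FFu) == -1);  // 0xff is -1 once sign extended
  assert(SextInRegI8(0x0000017Fu) == 127); // bits above the low 8 are discarded
  return 0;
}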
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
+ if (N->getOpcode() == ISD::VP_SINT_TO_FP)
+ return SDValue(DAG.UpdateNodeOperands(N,
+ SExtPromotedInteger(N->getOperand(0)),
+ N->getOperand(1), N->getOperand(2)),
+ 0);
+ return SDValue(DAG.UpdateNodeOperands(N,
+ SExtPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_STRICT_SINT_TO_FP(SDNode *N) {
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ SExtPromotedInteger(N->getOperand(1))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
+ SDLoc dl(N);
+
+ SDValue Val = GetPromotedInteger(N->getValue()); // Get promoted value.
+
+ // Truncate the value and store the result.
+ return DAG.getTruncStore(Ch, dl, Val, Ptr,
+ N->getMemoryVT(), N->getMemOperand());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
+ unsigned OpNo) {
+ SDValue DataOp = N->getValue();
+ SDValue Mask = N->getMask();
+
+ if (OpNo == 4) {
+ // The Mask. Update in place.
+ EVT DataVT = DataOp.getValueType();
+ Mask = PromoteTargetBoolean(Mask, DataVT);
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Mask;
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+ }
+
+ assert(OpNo == 1 && "Unexpected operand for promotion");
+ DataOp = GetPromotedInteger(DataOp);
+
+ return DAG.getMaskedStore(N->getChain(), SDLoc(N), DataOp, N->getBasePtr(),
+ N->getOffset(), Mask, N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(),
+ /*IsTruncating*/ true, N->isCompressingStore());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 3 && "Only know how to promote the mask!");
+ EVT DataVT = N->getValueType(0);
+ SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+ NewOps[OpNo] = Mask;
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N)
+ return SDValue(Res, 0);
+
+ // Update triggered CSE, do our own replacement since caller can't.
+ ReplaceValueWith(SDValue(N, 0), SDValue(Res, 0));
+ ReplaceValueWith(SDValue(N, 1), SDValue(Res, 1));
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N,
+ unsigned OpNo) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+
+ if (OpNo == 2) {
+ // The Mask
+ EVT DataVT = N->getValueType(0);
+ NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+ } else if (OpNo == 4) {
+ // The Index
+ if (N->isIndexSigned())
+ // Need to sign extend the index since the bits will likely be used.
+ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+ else
+ NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo));
+ } else
+ NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N)
+ return SDValue(Res, 0);
+
+ // Update triggered CSE, do our own replacement since caller can't.
+ ReplaceValueWith(SDValue(N, 0), SDValue(Res, 0));
+ ReplaceValueWith(SDValue(N, 1), SDValue(Res, 1));
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
+ unsigned OpNo) {
+ bool TruncateStore = N->isTruncatingStore();
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+
+ if (OpNo == 2) {
+ // The Mask
+ EVT DataVT = N->getValue().getValueType();
+ NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+ } else if (OpNo == 4) {
+ // The Index
+ if (N->isIndexSigned())
+ // Need to sign extend the index since the bits will likely be used.
+ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+ else
+ NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo));
+ } else {
+ NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+ TruncateStore = true;
+ }
+
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(),
+ SDLoc(N), NewOps, N->getMemOperand(),
+ N->getIndexType(), TruncateStore);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ if (N->getOpcode() == ISD::VP_TRUNCATE)
+ return DAG.getNode(ISD::VP_TRUNCATE, SDLoc(N), N->getValueType(0), Op,
+ N->getOperand(1), N->getOperand(2));
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) {
+ if (N->getOpcode() == ISD::VP_UINT_TO_FP)
+ return SDValue(DAG.UpdateNodeOperands(N,
+ ZExtPromotedInteger(N->getOperand(0)),
+ N->getOperand(1), N->getOperand(2)),
+ 0);
+ return SDValue(DAG.UpdateNodeOperands(N,
+ ZExtPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_STRICT_UINT_TO_FP(SDNode *N) {
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ ZExtPromotedInteger(N->getOperand(1))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+ return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_ZERO_EXTEND(SDNode *N) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ // FIXME: There is no VP_ANY_EXTEND yet.
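+  // Zero-extending the any-extended promoted value may leave stale bits
+  // between the original and the promoted width, so the VP_AND below masks the
+  // result back down to the original operand width.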
+ Op = DAG.getNode(ISD::VP_ZERO_EXTEND, dl, VT, Op, N->getOperand(1),
+ N->getOperand(2));
+ APInt Imm = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
+ N->getOperand(0).getScalarValueSizeInBits());
+ return DAG.getNode(ISD::VP_AND, dl, VT, Op, DAG.getConstant(Imm, dl, VT),
+ N->getOperand(1), N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBO_CARRY(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 2 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = N->getOperand(2);
+ SDLoc DL(N);
+
+ Carry = PromoteTargetBoolean(Carry, LHS.getValueType());
+
+ return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_FIX(SDNode *N) {
+ SDValue Op2 = ZExtPromotedInteger(N->getOperand(2));
+ return SDValue(
+ DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), Op2), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_FRAMERETURNADDR(SDNode *N) {
+ // Promote the RETURNADDR/FRAMEADDR argument to a supported integer width.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ return SDValue(DAG.UpdateNodeOperands(N, Op), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo) {
+ assert(OpNo > 1 && "Don't know how to promote this operand!");
+ // Promote the rw, locality, and cache type arguments to a supported integer
+ // width.
+ SDValue Op2 = ZExtPromotedInteger(N->getOperand(2));
+ SDValue Op3 = ZExtPromotedInteger(N->getOperand(3));
+ SDValue Op4 = ZExtPromotedInteger(N->getOperand(4));
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1),
+ Op2, Op3, Op4),
+ 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ExpOp(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+
+ bool IsPowI =
+ N->getOpcode() == ISD::FPOWI || N->getOpcode() == ISD::STRICT_FPOWI;
+
+  // The integer operand is the last operand in FPOWI (or FLDEXP), so the result
+  // and floating-point operand are already type legalized.
+ RTLIB::Libcall LC = IsPowI ? RTLIB::getPOWI(N->getValueType(0))
+ : RTLIB::getLDEXP(N->getValueType(0));
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) {
+ SDValue Op = SExtPromotedInteger(N->getOperand(1));
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0);
+ }
+
+  // We can't just promote the exponent type in FPOWI, since we want to lower
+  // the node to a libcall and, if we promote to a type larger than
+  // sizeof(int), the libcall might not match the target's ABI. Instead we
+  // rewrite to a libcall here directly, letting makeLibCall handle promotion
+  // if the target accepts it according to shouldSignExtendTypeInLibCall.
+
+ unsigned OpOffset = IsStrict ? 1 : 0;
+ // The exponent should fit in a sizeof(int) type for the libcall to be valid.
+ assert(DAG.getLibInfo().getIntSize() ==
+ N->getOperand(1 + OpOffset).getValueType().getSizeInBits() &&
+ "POWI exponent should match with sizeof(int) when doing the libcall.");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SDValue Ops[2] = {N->getOperand(0 + OpOffset), N->getOperand(1 + OpOffset)};
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(
+ DAG, LC, N->getValueType(0), Ops, CallOptions, SDLoc(N), Chain);
+ ReplaceValueWith(SDValue(N, 0), Tmp.first);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ return SDValue();
+}
+
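+/// Return the extension kind that preserves the semantics of an integer vector
+/// reduction when its element type is promoted: additive and bitwise
+/// reductions only depend on the low bits (any extend), while signed and
+/// unsigned min/max reductions need sign and zero extension respectively to
+/// preserve the ordering.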
+static unsigned getExtendForIntVecReduction(SDNode *N) {
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Expected integer vector reduction");
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ return ISD::ANY_EXTEND;
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ return ISD::SIGN_EXTEND;
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ return ISD::ZERO_EXTEND;
+ }
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOpVectorReduction(SDNode *N, SDValue V) {
+ switch (getExtendForIntVecReduction(N)) {
+ default:
+ llvm_unreachable("Impossible extension kind for integer reduction");
+ case ISD::ANY_EXTEND:
+ return GetPromotedInteger(V);
+ case ISD::SIGN_EXTEND:
+ return SExtPromotedInteger(V);
+ case ISD::ZERO_EXTEND:
+ return ZExtPromotedInteger(V);
+ }
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Op = PromoteIntOpVectorReduction(N, N->getOperand(0));
+
+ EVT OrigEltVT = N->getOperand(0).getValueType().getVectorElementType();
+ EVT InVT = Op.getValueType();
+ EVT EltVT = InVT.getVectorElementType();
+ EVT ResVT = N->getValueType(0);
+ unsigned Opcode = N->getOpcode();
+
+  // An i1 vecreduce_xor is equivalent to vecreduce_add; use that instead if
+  // vecreduce_xor is not legal.
+ if (Opcode == ISD::VECREDUCE_XOR && OrigEltVT == MVT::i1 &&
+ !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_XOR, InVT) &&
+ TLI.isOperationLegalOrCustom(ISD::VECREDUCE_ADD, InVT))
+ Opcode = ISD::VECREDUCE_ADD;
+
+  // An i1 vecreduce_or is equivalent to vecreduce_umax; use that instead if
+  // vecreduce_or is not legal.
+ else if (Opcode == ISD::VECREDUCE_OR && OrigEltVT == MVT::i1 &&
+ !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_OR, InVT) &&
+ TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMAX, InVT)) {
+ Opcode = ISD::VECREDUCE_UMAX;
+ // Can't use promoteTargetBoolean here because we still need
+ // to either sign_ext or zero_ext in the undefined case.
+ switch (TLI.getBooleanContents(InVT)) {
+ case TargetLoweringBase::UndefinedBooleanContent:
+ case TargetLoweringBase::ZeroOrOneBooleanContent:
+ Op = ZExtPromotedInteger(N->getOperand(0));
+ break;
+ case TargetLoweringBase::ZeroOrNegativeOneBooleanContent:
+ Op = SExtPromotedInteger(N->getOperand(0));
+ break;
+ }
+ }
+
+  // An i1 vecreduce_and is equivalent to vecreduce_umin; use that instead if
+  // vecreduce_and is not legal.
+ else if (Opcode == ISD::VECREDUCE_AND && OrigEltVT == MVT::i1 &&
+ !TLI.isOperationLegalOrCustom(ISD::VECREDUCE_AND, InVT) &&
+ TLI.isOperationLegalOrCustom(ISD::VECREDUCE_UMIN, InVT)) {
+ Opcode = ISD::VECREDUCE_UMIN;
+ // Can't use promoteTargetBoolean here because we still need
+ // to either sign_ext or zero_ext in the undefined case.
+ switch (TLI.getBooleanContents(InVT)) {
+ case TargetLoweringBase::UndefinedBooleanContent:
+ case TargetLoweringBase::ZeroOrOneBooleanContent:
+ Op = ZExtPromotedInteger(N->getOperand(0));
+ break;
+ case TargetLoweringBase::ZeroOrNegativeOneBooleanContent:
+ Op = SExtPromotedInteger(N->getOperand(0));
+ break;
+ }
+ }
+
+ if (ResVT.bitsGE(EltVT))
+ return DAG.getNode(Opcode, SDLoc(N), ResVT, Op);
+
+ // Result size must be >= element size. If this is not the case after
+ // promotion, also promote the result type and then truncate.
+ SDValue Reduce = DAG.getNode(Opcode, dl, EltVT, Op);
+ return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Reduce);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(OpNo);
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+
+ if (OpNo == 2) { // Mask
+ // Update in place.
+ NewOps[2] = PromoteTargetBoolean(Op, N->getOperand(1).getValueType());
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+ }
+
+ assert(OpNo == 1 && "Unexpected operand for promotion");
+
+ Op = PromoteIntOpVectorReduction(N, Op);
+
+ NewOps[OpNo] = Op;
+
+ EVT VT = N->getValueType(0);
+ EVT EltVT = Op.getValueType().getScalarType();
+
+ if (VT.bitsGE(EltVT))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, NewOps);
+
+ // Result size must be >= element/start-value size. If this is not the case
+ // after promotion, also promote both the start value and result type and
+ // then truncate.
+ NewOps[0] =
+ DAG.getNode(getExtendForIntVecReduction(N), DL, EltVT, N->getOperand(0));
+ SDValue Reduce = DAG.getNode(N->getOpcode(), DL, EltVT, NewOps);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Reduce);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) {
+ SDValue Op = ZExtPromotedInteger(N->getOperand(1));
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
+ assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
+ SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end());
+ SDValue Operand = N->getOperand(OpNo);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
+ NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
+ assert(OpNo >= 7);
+ SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end());
+ SDValue Operand = N->getOperand(OpNo);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
+ NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
+ assert((N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD && OpNo == 3) ||
+ (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE && OpNo == 4));
+
+ SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
+ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerResult - This method is called when the specified result of the
+/// specified node is found to need expansion. At this point, the node may also
+/// have invalid operands or may have other results that need promotion; we just
+/// know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Lo, Hi;
+ Lo = Hi = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ExpandIntegerResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to expand the result of this "
+ "operator!");
+
+ case ISD::ARITH_FENCE: SplitRes_ARITH_FENCE(N, Lo, Hi); break;
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+ case ISD::FREEZE: SplitRes_FREEZE(N, Lo, Hi); break;
+
+ case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break;
+ case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+ case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+ case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
+
+ case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break;
+ case ISD::AssertSext: ExpandIntRes_AssertSext(N, Lo, Hi); break;
+ case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break;
+ case ISD::BITREVERSE: ExpandIntRes_BITREVERSE(N, Lo, Hi); break;
+ case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break;
+ case ISD::PARITY: ExpandIntRes_PARITY(N, Lo, Hi); break;
+ case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break;
+ case ISD::ABS: ExpandIntRes_ABS(N, Lo, Hi); break;
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break;
+ case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break;
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break;
+ case ISD::GET_ROUNDING:ExpandIntRes_GET_ROUNDING(N, Lo, Hi); break;
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_SINT: ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break;
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT: ExpandIntRes_FP_TO_XINT_SAT(N, Lo, Hi); break;
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LRINT:
+ case ISD::LROUND:
+ case ISD::LRINT:
+ case ISD::STRICT_LLROUND:
+ case ISD::STRICT_LLRINT:
+ case ISD::LLROUND:
+ case ISD::LLRINT: ExpandIntRes_XROUND_XRINT(N, Lo, Hi); break;
+ case ISD::LOAD: ExpandIntRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break;
+ case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break;
+ case ISD::READCYCLECOUNTER: ExpandIntRes_READCYCLECOUNTER(N, Lo, Hi); break;
+ case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break;
+ case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
+ case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
+ case ISD::SREM: ExpandIntRes_SREM(N, Lo, Hi); break;
+ case ISD::TRUNCATE: ExpandIntRes_TRUNCATE(N, Lo, Hi); break;
+ case ISD::UDIV: ExpandIntRes_UDIV(N, Lo, Hi); break;
+ case ISD::UREM: ExpandIntRes_UREM(N, Lo, Hi); break;
+ case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD: ExpandIntRes_ATOMIC_LOAD(N, Lo, Hi); break;
+
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_CMP_SWAP: {
+ std::pair<SDValue, SDValue> Tmp = ExpandAtomic(N);
+ SplitInteger(Tmp.first, Lo, Hi);
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other);
+ SDValue Tmp = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, SDLoc(N), AN->getMemoryVT(), VTs,
+ N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3),
+ AN->getMemOperand());
+
+ // Expanding to the strong ATOMIC_CMP_SWAP node means we can determine
+ // success simply by comparing the loaded value against the ingoing
+ // comparison.
+ SDValue Success = DAG.getSetCC(SDLoc(N), N->getValueType(1), Tmp,
+ N->getOperand(2), ISD::SETEQ);
+
+ SplitInteger(Tmp, Lo, Hi);
+ ReplaceValueWith(SDValue(N, 1), Success);
+ ReplaceValueWith(SDValue(N, 2), Tmp.getValue(1));
+ break;
+ }
+
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break;
+
+ case ISD::UMAX:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::SMIN: ExpandIntRes_MINMAX(N, Lo, Hi); break;
+
+ case ISD::ADD:
+ case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break;
+
+ case ISD::ADDC:
+ case ISD::SUBC: ExpandIntRes_ADDSUBC(N, Lo, Hi); break;
+
+ case ISD::ADDE:
+ case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break;
+
+ case ISD::UADDO_CARRY:
+ case ISD::USUBO_CARRY: ExpandIntRes_UADDSUBO_CARRY(N, Lo, Hi); break;
+
+ case ISD::SADDO_CARRY:
+ case ISD::SSUBO_CARRY: ExpandIntRes_SADDSUBO_CARRY(N, Lo, Hi); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break;
+
+ case ISD::SADDO:
+ case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break;
+ case ISD::UADDO:
+ case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break;
+ case ISD::UMULO:
+ case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break;
+
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break;
+
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT: ExpandIntRes_SHLSAT(N, Lo, Hi); break;
+
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break;
+
+ case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: ExpandIntRes_DIVFIX(N, Lo, Hi); break;
+
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN: ExpandIntRes_VECREDUCE(N, Lo, Hi); break;
+
+ case ISD::ROTL:
+ case ISD::ROTR:
+ ExpandIntRes_Rotate(N, Lo, Hi);
+ break;
+
+ case ISD::FSHL:
+ case ISD::FSHR:
+ ExpandIntRes_FunnelShift(N, Lo, Hi);
+ break;
+
+ case ISD::VSCALE:
+ ExpandIntRes_VSCALE(N, Lo, Hi);
+ break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetExpandedInteger(SDValue(N, ResNo), Lo, Hi);
+}
+
+/// Lower an atomic node to the appropriate builtin call.
+std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
+ unsigned Opc = Node->getOpcode();
+ MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
+ AtomicOrdering order = cast<AtomicSDNode>(Node)->getMergedOrdering();
+  // Lower to an outline atomic libcall if outline atomics are enabled, or to a
+  // sync libcall otherwise.
+ RTLIB::Libcall LC = RTLIB::getOUTLINE_ATOMIC(Opc, order, VT);
+ EVT RetVT = Node->getValueType(0);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ SmallVector<SDValue, 4> Ops;
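+  // Note the operand order: the outline atomic helpers take the value
+  // operand(s) first and the pointer last, whereas the sync libcalls take the
+  // pointer first.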
+ if (TLI.getLibcallName(LC)) {
+ Ops.append(Node->op_begin() + 2, Node->op_end());
+ Ops.push_back(Node->getOperand(1));
+ } else {
+ LC = RTLIB::getSYNC(Opc, VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unexpected atomic op or value type!");
+ Ops.append(Node->op_begin() + 1, Node->op_end());
+ }
+ return TLI.makeLibCall(DAG, LC, RetVT, Ops, CallOptions, SDLoc(Node),
+ Node->getOperand(0));
+}
+
+/// N is a shift by a value that needs to be expanded,
+/// and the shift amount is a constant 'Amt'. Expand the operation.
+void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc DL(N);
+  // Expand the incoming operand to be shifted, so that we have its parts.
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+  // Though Amt shouldn't usually be 0, it's possible. E.g. when legalization
+  // split a vector shift, like this: <op1, op2> SHL <0, 2>.
+ if (!Amt) {
+ Lo = InL;
+ Hi = InH;
+ return;
+ }
+
+ EVT NVT = InL.getValueType();
+ unsigned VTBits = N->getValueType(0).getSizeInBits();
+ unsigned NVTBits = NVT.getSizeInBits();
+ EVT ShTy = N->getOperand(1).getValueType();
+
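+  // For amounts below NVTBits the result is assembled from both halves; e.g.
+  // an i64 SHL by 8 with i32 parts gives Lo = InL << 8 and
+  // Hi = (InH << 8) | (InL >> 24).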
+ if (N->getOpcode() == ISD::SHL) {
+ if (Amt.uge(VTBits)) {
+ Lo = Hi = DAG.getConstant(0, DL, NVT);
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = DAG.getConstant(0, DL, NVT);
+ Hi = DAG.getNode(ISD::SHL, DL,
+ NVT, InL, DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ } else if (Amt == NVTBits) {
+ Lo = DAG.getConstant(0, DL, NVT);
+ Hi = InL;
+ } else {
+ Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy));
+ Hi = DAG.getNode(ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getConstant(Amt, DL, ShTy)),
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
+ }
+ return;
+ }
+
+ if (N->getOpcode() == ISD::SRL) {
+ if (Amt.uge(VTBits)) {
+ Lo = Hi = DAG.getConstant(0, DL, NVT);
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = DAG.getNode(ISD::SRL, DL,
+ NVT, InH, DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ Hi = DAG.getConstant(0, DL, NVT);
+ } else if (Amt == NVTBits) {
+ Lo = InH;
+ Hi = DAG.getConstant(0, DL, NVT);
+ } else {
+ Lo = DAG.getNode(ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getConstant(Amt, DL, ShTy)),
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
+ Hi = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy));
+ }
+ return;
+ }
+
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ if (Amt.uge(VTBits)) {
+ Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getConstant(NVTBits - 1, DL, ShTy));
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getConstant(NVTBits - 1, DL, ShTy));
+ } else if (Amt == NVTBits) {
+ Lo = InH;
+ Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getConstant(NVTBits - 1, DL, ShTy));
+ } else {
+ Lo = DAG.getNode(ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getConstant(Amt, DL, ShTy)),
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
+ Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy));
+ }
+}
+
+/// ExpandShiftWithKnownAmountBit - Try to determine whether we can simplify
+/// this shift based on knowledge of the high bit of the shift amount. If we
+/// can tell this, we know whether the amount is >= 32 or < 32, without knowing
+/// its actual value.
+bool DAGTypeLegalizer::
+ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue Amt = N->getOperand(1);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT ShTy = Amt.getValueType();
+ unsigned ShBits = ShTy.getScalarSizeInBits();
+ unsigned NVTBits = NVT.getScalarSizeInBits();
+ assert(isPowerOf2_32(NVTBits) &&
+ "Expanded integer type size not a power of two!");
+ SDLoc dl(N);
+
+ APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits));
+ KnownBits Known = DAG.computeKnownBits(N->getOperand(1));
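+  // For NVTBits == 32, HighBitMask covers every bit of the shift amount except
+  // the low five: if any of those bits is known to be one the amount is >= 32,
+  // and if they are all known to be zero it is < 32.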
+
+ // If we don't know anything about the high bits, exit.
+ if (((Known.Zero|Known.One) & HighBitMask) == 0)
+ return false;
+
+ // Get the incoming operand to be shifted.
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+ // If we know that any of the high bits of the shift amount are one, then we
+ // can do this as a couple of simple shifts.
+ if (Known.One.intersects(HighBitMask)) {
+ // Mask out the high bit, which we know is set.
+ Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt,
+ DAG.getConstant(~HighBitMask, dl, ShTy));
+
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unknown shift");
+ case ISD::SHL:
+ Lo = DAG.getConstant(0, dl, NVT); // Low part is zero.
+ Hi = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part.
+ return true;
+ case ISD::SRL:
+ Hi = DAG.getConstant(0, dl, NVT); // Hi part is zero.
+ Lo = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part.
+ return true;
+ case ISD::SRA:
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign extend high part.
+ DAG.getConstant(NVTBits - 1, dl, ShTy));
+ Lo = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part.
+ return true;
+ }
+ }
+
+ // If we know that all of the high bits of the shift amount are zero, then we
+ // can do this as a couple of simple shifts.
+ if (HighBitMask.isSubsetOf(Known.Zero)) {
+ // Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined
+ // shift if x is zero. We can use XOR here because x is known to be smaller
+ // than 32.
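+    // E.g. for NVTBits == 32 and a shift amount of 5, Amt2 == 5 ^ 31 == 26 ==
+    // 31 - 5, so the one-bit shift plus the Amt2 shift below total 32 - 5 bits.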
+ SDValue Amt2 = DAG.getNode(ISD::XOR, dl, ShTy, Amt,
+ DAG.getConstant(NVTBits - 1, dl, ShTy));
+
+ unsigned Op1, Op2;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unknown shift");
+ case ISD::SHL: Op1 = ISD::SHL; Op2 = ISD::SRL; break;
+ case ISD::SRL:
+ case ISD::SRA: Op1 = ISD::SRL; Op2 = ISD::SHL; break;
+ }
+
+    // When shifting right, the arithmetic for Lo and Hi is swapped.
+ if (N->getOpcode() != ISD::SHL)
+ std::swap(InL, InH);
+
+ // Use a little trick to get the bits that move from Lo to Hi. First
+ // shift by one bit.
+ SDValue Sh1 = DAG.getNode(Op2, dl, NVT, InL, DAG.getConstant(1, dl, ShTy));
+ // Then compute the remaining shift with amount-1.
+ SDValue Sh2 = DAG.getNode(Op2, dl, NVT, Sh1, Amt2);
+
+ Lo = DAG.getNode(N->getOpcode(), dl, NVT, InL, Amt);
+ Hi = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(Op1, dl, NVT, InH, Amt),Sh2);
+
+ if (N->getOpcode() != ISD::SHL)
+ std::swap(Hi, Lo);
+ return true;
+ }
+
+ return false;
+}
+
+/// ExpandShiftWithUnknownAmountBit - Fully general expansion of integer shift
+/// of any size.
+bool DAGTypeLegalizer::
+ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue Amt = N->getOperand(1);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT ShTy = Amt.getValueType();
+ unsigned NVTBits = NVT.getSizeInBits();
+ assert(isPowerOf2_32(NVTBits) &&
+ "Expanded integer type size not a power of two!");
+ SDLoc dl(N);
+
+ // Get the incoming operand to be shifted.
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+ SDValue NVBitsNode = DAG.getConstant(NVTBits, dl, ShTy);
+ SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode);
+ SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
+ SDValue isShort = DAG.getSetCC(dl, getSetCCResultType(ShTy),
+ Amt, NVBitsNode, ISD::SETULT);
+ SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(ShTy),
+ Amt, DAG.getConstant(0, dl, ShTy),
+ ISD::SETEQ);
+
+ SDValue LoS, HiS, LoL, HiL;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unknown shift");
+ case ISD::SHL:
+ // Short: ShAmt < NVTBits
+ LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
+ HiS = DAG.getNode(ISD::OR, dl, NVT,
+ DAG.getNode(ISD::SHL, dl, NVT, InH, Amt),
+ DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack));
+
+ // Long: ShAmt >= NVTBits
+ LoL = DAG.getConstant(0, dl, NVT); // Lo part is zero.
+ HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part.
+
+ Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL);
+ Hi = DAG.getSelect(dl, NVT, isZero, InH,
+ DAG.getSelect(dl, NVT, isShort, HiS, HiL));
+ return true;
+ case ISD::SRL:
+ // Short: ShAmt < NVTBits
+ HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
+    // FIXME: If Amt is zero, the SHL of InH by AmtLack below generates an
+    // undefined result on some architectures.
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+ // Long: ShAmt >= NVTBits
+ HiL = DAG.getConstant(0, dl, NVT); // Hi part is zero.
+ LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+ Lo = DAG.getSelect(dl, NVT, isZero, InL,
+ DAG.getSelect(dl, NVT, isShort, LoS, LoL));
+ Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL);
+ return true;
+ case ISD::SRA:
+ // Short: ShAmt < NVTBits
+ HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
+ LoS = DAG.getNode(ISD::OR, dl, NVT,
+ DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+ DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+ // Long: ShAmt >= NVTBits
+ HiL = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign of Hi part.
+ DAG.getConstant(NVTBits - 1, dl, ShTy));
+ LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+ Lo = DAG.getSelect(dl, NVT, isZero, InL,
+ DAG.getSelect(dl, NVT, isShort, LoS, LoL));
+ Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL);
+ return true;
+ }
+}
+
+static std::pair<ISD::CondCode, ISD::NodeType> getExpandedMinMaxOps(int Op) {
+
+ switch (Op) {
+ default: llvm_unreachable("invalid min/max opcode");
+ case ISD::SMAX:
+ return std::make_pair(ISD::SETGT, ISD::UMAX);
+ case ISD::UMAX:
+ return std::make_pair(ISD::SETUGT, ISD::UMAX);
+ case ISD::SMIN:
+ return std::make_pair(ISD::SETLT, ISD::UMIN);
+ case ISD::UMIN:
+ return std::make_pair(ISD::SETULT, ISD::UMIN);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc DL(N);
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // If the upper halves are all sign bits, then we can perform the MINMAX on
+ // the lower half and sign-extend the result to the upper half.
+ unsigned NumBits = N->getValueType(0).getScalarSizeInBits();
+ unsigned NumHalfBits = NumBits / 2;
+ if (DAG.ComputeNumSignBits(LHS) > NumHalfBits &&
+ DAG.ComputeNumSignBits(RHS) > NumHalfBits) {
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(LHS, LHSL, LHSH);
+ GetExpandedInteger(RHS, RHSL, RHSH);
+ EVT NVT = LHSL.getValueType();
+
+ Lo = DAG.getNode(N->getOpcode(), DL, NVT, LHSL, RHSL);
+ Hi = DAG.getNode(ISD::SRA, DL, NVT, Lo,
+ DAG.getShiftAmountConstant(NumHalfBits - 1, NVT, DL));
+ return;
+ }
+
+ // The Lo of smin(X, -1) is LHSL if X is negative. Otherwise it's -1.
+ // The Lo of smax(X, 0) is 0 if X is negative. Otherwise it's LHSL.
+ if ((N->getOpcode() == ISD::SMAX && isNullConstant(RHS)) ||
+ (N->getOpcode() == ISD::SMIN && isAllOnesConstant(RHS))) {
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(LHS, LHSL, LHSH);
+ GetExpandedInteger(RHS, RHSL, RHSH);
+ EVT NVT = LHSL.getValueType();
+ EVT CCT = getSetCCResultType(NVT);
+
+ SDValue HiNeg =
+ DAG.getSetCC(DL, CCT, LHSH, DAG.getConstant(0, DL, NVT), ISD::SETLT);
+ if (N->getOpcode() == ISD::SMIN) {
+ Lo = DAG.getSelect(DL, NVT, HiNeg, LHSL, DAG.getConstant(-1, DL, NVT));
+ } else {
+ Lo = DAG.getSelect(DL, NVT, HiNeg, DAG.getConstant(0, DL, NVT), LHSL);
+ }
+ Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH});
+ return;
+ }
+
+ const APInt *RHSVal = nullptr;
+ if (auto *RHSConst = dyn_cast<ConstantSDNode>(RHS))
+ RHSVal = &RHSConst->getAPIntValue();
+
+  // The high half of MIN/MAX is always just the MIN/MAX of the
+  // high halves of the operands. Expand this way if it appears profitable.
+ if (RHSVal && (N->getOpcode() == ISD::UMIN || N->getOpcode() == ISD::UMAX) &&
+ (RHSVal->countLeadingOnes() >= NumHalfBits ||
+ RHSVal->countLeadingZeros() >= NumHalfBits)) {
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(LHS, LHSL, LHSH);
+ GetExpandedInteger(RHS, RHSL, RHSH);
+ EVT NVT = LHSL.getValueType();
+ EVT CCT = getSetCCResultType(NVT);
+
+ ISD::NodeType LoOpc;
+ ISD::CondCode CondC;
+ std::tie(CondC, LoOpc) = getExpandedMinMaxOps(N->getOpcode());
+
+ Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH});
+    // We need to know whether to select the Lo part that corresponds to the
+    // 'winning' Hi part, or whether the Hi parts are equal.
+ SDValue IsHiLeft = DAG.getSetCC(DL, CCT, LHSH, RHSH, CondC);
+ SDValue IsHiEq = DAG.getSetCC(DL, CCT, LHSH, RHSH, ISD::SETEQ);
+
+ // Lo part corresponding to the 'winning' Hi part
+ SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL);
+
+    // Recursed MIN/MAX of the Lo parts, used when the Hi parts are equal; the
+    // Lo halves must be compared as unsigned regardless of the original
+    // signedness.
+ SDValue LoMinMax = DAG.getNode(LoOpc, DL, NVT, {LHSL, RHSL});
+
+ Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp);
+ return;
+ }
+
+ // Expand to "a < b ? a : b" etc. Prefer ge/le if that simplifies
+ // the compare.
+ ISD::CondCode Pred;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("How did we get here?");
+ case ISD::SMAX:
+ if (RHSVal && RHSVal->countTrailingZeros() >= NumHalfBits)
+ Pred = ISD::SETGE;
+ else
+ Pred = ISD::SETGT;
+ break;
+ case ISD::SMIN:
+ if (RHSVal && RHSVal->countTrailingOnes() >= NumHalfBits)
+ Pred = ISD::SETLE;
+ else
+ Pred = ISD::SETLT;
+ break;
+ case ISD::UMAX:
+ if (RHSVal && RHSVal->countTrailingZeros() >= NumHalfBits)
+ Pred = ISD::SETUGE;
+ else
+ Pred = ISD::SETUGT;
+ break;
+ case ISD::UMIN:
+ if (RHSVal && RHSVal->countTrailingOnes() >= NumHalfBits)
+ Pred = ISD::SETULE;
+ else
+ Pred = ISD::SETULT;
+ break;
+ }
+ EVT VT = N->getValueType(0);
+ EVT CCT = getSetCCResultType(VT);
+ SDValue Cond = DAG.getSetCC(DL, CCT, LHS, RHS, Pred);
+ SDValue Result = DAG.getSelect(DL, VT, Cond, LHS, RHS);
+ SplitInteger(Result, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+
+ EVT NVT = LHSL.getValueType();
+ SDValue LoOps[2] = { LHSL, RHSL };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ bool HasOpCarry = TLI.isOperationLegalOrCustom(
+ N->getOpcode() == ISD::ADD ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
+ TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+ if (HasOpCarry) {
+ SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT));
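+    // If the carry/borrow out of the low half is provably zero, the high half
+    // can use a plain UADDO/USUBO instead of threading the carry through.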
+ if (N->getOpcode() == ISD::ADD) {
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.computeKnownBits(HiOps[2]).isZero()
+ ? DAG.getNode(ISD::UADDO, dl, VTList, ArrayRef(HiOps, 2))
+ : DAG.getNode(ISD::UADDO_CARRY, dl, VTList, HiOps);
+ } else {
+ Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.computeKnownBits(HiOps[2]).isZero()
+ ? DAG.getNode(ISD::USUBO, dl, VTList, ArrayRef(HiOps, 2))
+ : DAG.getNode(ISD::USUBO_CARRY, dl, VTList, HiOps);
+ }
+ return;
+ }
+
+ // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support
+ // them. TODO: Teach operation legalization how to expand unsupported
+ // ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate
+ // a carry of type MVT::Glue, but there doesn't seem to be any way to
+ // generate a value of this type in the expanded code sequence.
+ bool hasCarry =
+ TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ?
+ ISD::ADDC : ISD::SUBC,
+ TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+
+ if (hasCarry) {
+ SDVTList VTList = DAG.getVTList(NVT, MVT::Glue);
+ if (N->getOpcode() == ISD::ADD) {
+ Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps);
+ } else {
+ Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps);
+ }
+ return;
+ }
+
+ bool hasOVF =
+ TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ?
+ ISD::UADDO : ISD::USUBO,
+ TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+ TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT);
+
+ if (hasOVF) {
+ EVT OvfVT = getSetCCResultType(NVT);
+ SDVTList VTList = DAG.getVTList(NVT, OvfVT);
+ int RevOpc;
+ if (N->getOpcode() == ISD::ADD) {
+ RevOpc = ISD::SUB;
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2));
+ } else {
+ RevOpc = ISD::ADD;
+ Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2));
+ }
+ SDValue OVF = Lo.getValue(1);
+
+ switch (BoolType) {
+ case TargetLoweringBase::UndefinedBooleanContent:
+ OVF = DAG.getNode(ISD::AND, dl, OvfVT, DAG.getConstant(1, dl, OvfVT), OVF);
+ [[fallthrough]];
+ case TargetLoweringBase::ZeroOrOneBooleanContent:
+ OVF = DAG.getZExtOrTrunc(OVF, dl, NVT);
+ Hi = DAG.getNode(N->getOpcode(), dl, NVT, Hi, OVF);
+ break;
+ case TargetLoweringBase::ZeroOrNegativeOneBooleanContent:
+ OVF = DAG.getSExtOrTrunc(OVF, dl, NVT);
+ Hi = DAG.getNode(RevOpc, dl, NVT, Hi, OVF);
+ }
+ return;
+ }
+
+ if (N->getOpcode() == ISD::ADD) {
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2));
+ SDValue Cmp;
+ // Special case: X+1 has a carry out if X+1==0. This may reduce the live
+ // range of X. We assume comparing with 0 is cheap.
+ if (isOneConstant(LoOps[1]))
+ Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo,
+ DAG.getConstant(0, dl, NVT), ISD::SETEQ);
+ else if (isAllOnesConstant(LoOps[1])) {
+ if (isAllOnesConstant(HiOps[1]))
+ Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), LoOps[0],
+ DAG.getConstant(0, dl, NVT), ISD::SETEQ);
+ else
+ Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), LoOps[0],
+ DAG.getConstant(0, dl, NVT), ISD::SETNE);
+ } else
+ Cmp = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0],
+ ISD::SETULT);
+
+ SDValue Carry;
+ if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent)
+ Carry = DAG.getZExtOrTrunc(Cmp, dl, NVT);
+ else
+ Carry = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT),
+ DAG.getConstant(0, dl, NVT));
+
+ if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1]))
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps[0], Carry);
+ else
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry);
+ } else {
+ Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps);
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2));
+ SDValue Cmp =
+ DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()),
+ LoOps[0], LoOps[1], ISD::SETULT);
+
+ SDValue Borrow;
+ if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent)
+ Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT);
+ else
+ Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT),
+ DAG.getConstant(0, dl, NVT));
+
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue);
+ SDValue LoOps[2] = { LHSL, RHSL };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ if (N->getOpcode() == ISD::ADDC) {
+ Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps);
+ } else {
+ Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps);
+ }
+
+ // Legalized the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue);
+ SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps);
+
+ // Legalized the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDLoc dl(N);
+
+ SDValue Ovf;
+
+ unsigned CarryOp, NoCarryOp;
+ ISD::CondCode Cond;
+ switch(N->getOpcode()) {
+ case ISD::UADDO:
+ CarryOp = ISD::UADDO_CARRY;
+ NoCarryOp = ISD::ADD;
+ Cond = ISD::SETULT;
+ break;
+ case ISD::USUBO:
+ CarryOp = ISD::USUBO_CARRY;
+ NoCarryOp = ISD::SUB;
+ Cond = ISD::SETUGT;
+ break;
+ default:
+ llvm_unreachable("Node has unexpected Opcode");
+ }
+
+ bool HasCarryOp = TLI.isOperationLegalOrCustom(
+ CarryOp, TLI.getTypeToExpandTo(*DAG.getContext(), LHS.getValueType()));
+
+ if (HasCarryOp) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(LHS, LHSL, LHSH);
+ GetExpandedInteger(RHS, RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1));
+ SDValue LoOps[2] = { LHSL, RHSL };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(CarryOp, dl, VTList, HiOps);
+
+ Ovf = Hi.getValue(1);
+ } else {
+ // Expand the result by simply replacing it with the equivalent
+ // non-overflow-checking operation.
+ SDValue Sum = DAG.getNode(NoCarryOp, dl, LHS.getValueType(), LHS, RHS);
+ SplitInteger(Sum, Lo, Hi);
+
+ if (N->getOpcode() == ISD::UADDO && isOneConstant(RHS)) {
+ // Special case: uaddo X, 1 overflowed if X+1 == 0. We can detect this
+ // with (Lo | Hi) == 0.
+ SDValue Or = DAG.getNode(ISD::OR, dl, Lo.getValueType(), Lo, Hi);
+ Ovf = DAG.getSetCC(dl, N->getValueType(1), Or,
+ DAG.getConstant(0, dl, Lo.getValueType()), ISD::SETEQ);
+ } else if (N->getOpcode() == ISD::UADDO && isAllOnesConstant(RHS)) {
+ // Special case: uaddo X, -1 overflows if X == 0.
+ Ovf =
+ DAG.getSetCC(dl, N->getValueType(1), LHS,
+ DAG.getConstant(0, dl, LHS.getValueType()), ISD::SETNE);
+ } else {
+ // Calculate the overflow: addition overflows iff a + b < a, and
+ // subtraction overflows iff a - b > a.
+ Ovf = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, Cond);
+ }
+ }
+
+ // Legalized the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Ovf);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UADDSUBO_CARRY(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1));
+ SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) };
+ SDValue HiOps[3] = { LHSH, RHSH, SDValue() };
+
+ Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps);
+
+ // Legalized the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SADDSUBO_CARRY(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1));
+
+ // We need to use an unsigned carry op for the lo part.
+ unsigned CarryOp =
+ N->getOpcode() == ISD::SADDO_CARRY ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
+ Lo = DAG.getNode(CarryOp, dl, VTList, { LHSL, RHSL, N->getOperand(2) });
+ Hi = DAG.getNode(N->getOpcode(), dl, VTList, { LHSH, RHSH, Lo.getValue(1) });
+
+ // Legalized the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is any extension of the input (which degenerates to a copy).
+ Lo = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Op);
+ Hi = DAG.getUNDEF(NVT); // The high part is undefined.
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) ==
+ TargetLowering::TypePromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ unsigned NVTBits = NVT.getSizeInBits();
+ unsigned EVTBits = EVT.getSizeInBits();
+
+ if (NVTBits < EVTBits) {
+ Hi = DAG.getNode(ISD::AssertSext, dl, NVT, Hi,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+ EVTBits - NVTBits)));
+ } else {
+ Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT));
+    // The high part replicates the sign bit of Lo; make it explicit.
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(NVTBits - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ unsigned NVTBits = NVT.getSizeInBits();
+ unsigned EVTBits = EVT.getSizeInBits();
+
+ if (NVTBits < EVTBits) {
+ Hi = DAG.getNode(ISD::AssertZext, dl, NVT, Hi,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+ EVTBits - NVTBits)));
+ } else {
+ Lo = DAG.getNode(ISD::AssertZext, dl, NVT, Lo, DAG.getValueType(EVT));
+    // The high part must be zero; make it explicit.
+ Hi = DAG.getConstant(0, dl, NVT);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_BITREVERSE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands.
+ Lo = DAG.getNode(ISD::BITREVERSE, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::BITREVERSE, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands.
+ Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_PARITY(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ // parity(HiLo) -> parity(Lo^Hi)
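+  // Folding the halves with XOR first is safe because parity is just the XOR
+  // of all the bits.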
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ Lo =
+ DAG.getNode(ISD::PARITY, dl, NVT, DAG.getNode(ISD::XOR, dl, NVT, Lo, Hi));
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned NBitWidth = NVT.getSizeInBits();
+ auto Constant = cast<ConstantSDNode>(N);
+ const APInt &Cst = Constant->getAPIntValue();
+ bool IsTarget = Constant->isTargetOpcode();
+ bool IsOpaque = Constant->isOpaque();
+ SDLoc dl(N);
+ Lo = DAG.getConstant(Cst.trunc(NBitWidth), dl, NVT, IsTarget, IsOpaque);
+ Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), dl, NVT, IsTarget,
+ IsOpaque);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ABS(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+
+ SDValue N0 = N->getOperand(0);
+ GetExpandedInteger(N0, Lo, Hi);
+ EVT NVT = Lo.getValueType();
+
+ // If the upper half is all sign bits, then we can perform the ABS on the
+ // lower half and zero-extend.
+ if (DAG.ComputeNumSignBits(N0) > NVT.getScalarSizeInBits()) {
+ Lo = DAG.getNode(ISD::ABS, dl, NVT, Lo);
+ Hi = DAG.getConstant(0, dl, NVT);
+ return;
+ }
+
+ // If we have USUBO_CARRY, use the expanded form of the sra+xor+sub sequence
+ // we use in LegalizeDAG. The SUB part of the expansion is based on
+ // ExpandIntRes_ADDSUB which also uses USUBO_CARRY/USUBO after checking that
+ // USUBO_CARRY is LegalOrCustom. Each of the pieces here can be further
+ // expanded if needed. Shift expansion has a special case for filling with
+ // sign bits so that we will only end up with one SRA.
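+  // This computes abs(x) as (x ^ sign) - sign, where sign == Hi >> (bits - 1)
+  // is either all zeros or all ones; the XOR and subtraction are applied
+  // across the Lo/Hi pair with the borrow threaded through USUBO/USUBO_CARRY.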
+ bool HasSubCarry = TLI.isOperationLegalOrCustom(
+ ISD::USUBO_CARRY, TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+ if (HasSubCarry) {
+ SDValue Sign = DAG.getNode(
+ ISD::SRA, dl, NVT, Hi,
+ DAG.getShiftAmountConstant(NVT.getSizeInBits() - 1, NVT, dl));
+ SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT));
+ Lo = DAG.getNode(ISD::XOR, dl, NVT, Lo, Sign);
+ Hi = DAG.getNode(ISD::XOR, dl, NVT, Hi, Sign);
+ Lo = DAG.getNode(ISD::USUBO, dl, VTList, Lo, Sign);
+ Hi = DAG.getNode(ISD::USUBO_CARRY, dl, VTList, Hi, Sign, Lo.getValue(1));
+ return;
+ }
+
+ // abs(HiLo) -> (Hi < 0 ? -HiLo : HiLo)
+ EVT VT = N->getValueType(0);
+ SDValue Neg = DAG.getNode(ISD::SUB, dl, VT,
+ DAG.getConstant(0, dl, VT), N0);
+ SDValue NegLo, NegHi;
+ SplitInteger(Neg, NegLo, NegHi);
+
+ SDValue HiIsNeg = DAG.getSetCC(dl, getSetCCResultType(NVT), Hi,
+ DAG.getConstant(0, dl, NVT), ISD::SETLT);
+ Lo = DAG.getSelect(dl, NVT, HiIsNeg, NegLo, Lo);
+ Hi = DAG.getSelect(dl, NVT, HiIsNeg, NegHi, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // ctlz (HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+32)
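+  // E.g. with i32 halves, ctlz of the i64 value 0xFFFF takes the Lo branch
+  // (Hi is zero) and yields 16 + 32 == 48.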
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+
+ SDValue HiNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Hi,
+ DAG.getConstant(0, dl, NVT), ISD::SETNE);
+
+ SDValue LoLZ = DAG.getNode(N->getOpcode(), dl, NVT, Lo);
+ SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Hi);
+
+ Lo = DAG.getSelect(dl, NVT, HiNotZero, HiLZ,
+ DAG.getNode(ISD::ADD, dl, NVT, LoLZ,
+ DAG.getConstant(NVT.getSizeInBits(), dl,
+ NVT)));
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo),
+ DAG.getNode(ISD::CTPOP, dl, NVT, Hi));
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // cttz (HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+32)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+
+ SDValue LoNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo,
+ DAG.getConstant(0, dl, NVT), ISD::SETNE);
+
+ SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, NVT, Lo);
+ SDValue HiLZ = DAG.getNode(N->getOpcode(), dl, NVT, Hi);
+
+ Lo = DAG.getSelect(dl, NVT, LoNotZero, LoLZ,
+ DAG.getNode(ISD::ADD, dl, NVT, HiLZ,
+ DAG.getConstant(NVT.getSizeInBits(), dl,
+ NVT)));
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_GET_ROUNDING(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned NBitWidth = NVT.getSizeInBits();
+
+ Lo = DAG.getNode(ISD::GET_ROUNDING, dl, {NVT, MVT::Other}, N->getOperand(0));
+ SDValue Chain = Lo.getValue(1);
+  // The high part is the sign of Lo, as -1 is a valid value for GET_ROUNDING.
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+ DAG.getShiftAmountConstant(NBitWidth - 1, NVT, dl));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
+ Op = GetPromotedFloat(Op);
+
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftPromoteHalf) {
+ EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+ Op = GetSoftPromotedHalf(Op);
+ Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+ Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+ SplitInteger(Op, Lo, Hi);
+ return;
+ }
+
+ RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, VT, Op,
+ CallOptions, dl, Chain);
+ SplitInteger(Tmp.first, Lo, Hi);
+
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
+ Op = GetPromotedFloat(Op);
+
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftPromoteHalf) {
+ EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+ Op = GetSoftPromotedHalf(Op);
+ Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+ Op = DAG.getNode(ISD::FP_TO_UINT, dl, VT, Op);
+ SplitInteger(Op, Lo, Hi);
+ return;
+ }
+
+ RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, VT, Op,
+ CallOptions, dl, Chain);
+ SplitInteger(Tmp.first, Lo, Hi);
+
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Res = TLI.expandFP_TO_INT_SAT(N, DAG);
+ SplitInteger(Res, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_XROUND_XRINT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+
+ assert(getTypeAction(Op.getValueType()) != TargetLowering::TypePromoteFloat &&
+ "Input type needs to be promoted!");
+
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::f16) {
+ VT = MVT::f32;
+ // Extend to f32.
+ if (IsStrict) {
+ Op = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, { VT, MVT::Other }, {Chain, Op});
+ Chain = Op.getValue(1);
+ } else {
+ Op = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op);
+ }
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (N->getOpcode() == ISD::LROUND ||
+ N->getOpcode() == ISD::STRICT_LROUND) {
+ if (VT == MVT::f32)
+ LC = RTLIB::LROUND_F32;
+ else if (VT == MVT::f64)
+ LC = RTLIB::LROUND_F64;
+ else if (VT == MVT::f80)
+ LC = RTLIB::LROUND_F80;
+ else if (VT == MVT::f128)
+ LC = RTLIB::LROUND_F128;
+ else if (VT == MVT::ppcf128)
+ LC = RTLIB::LROUND_PPCF128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected lround input type!");
+ } else if (N->getOpcode() == ISD::LRINT ||
+ N->getOpcode() == ISD::STRICT_LRINT) {
+ if (VT == MVT::f32)
+ LC = RTLIB::LRINT_F32;
+ else if (VT == MVT::f64)
+ LC = RTLIB::LRINT_F64;
+ else if (VT == MVT::f80)
+ LC = RTLIB::LRINT_F80;
+ else if (VT == MVT::f128)
+ LC = RTLIB::LRINT_F128;
+ else if (VT == MVT::ppcf128)
+ LC = RTLIB::LRINT_PPCF128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected lrint input type!");
+ } else if (N->getOpcode() == ISD::LLROUND ||
+ N->getOpcode() == ISD::STRICT_LLROUND) {
+ if (VT == MVT::f32)
+ LC = RTLIB::LLROUND_F32;
+ else if (VT == MVT::f64)
+ LC = RTLIB::LLROUND_F64;
+ else if (VT == MVT::f80)
+ LC = RTLIB::LLROUND_F80;
+ else if (VT == MVT::f128)
+ LC = RTLIB::LLROUND_F128;
+ else if (VT == MVT::ppcf128)
+ LC = RTLIB::LLROUND_PPCF128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected llround input type!");
+ } else if (N->getOpcode() == ISD::LLRINT ||
+ N->getOpcode() == ISD::STRICT_LLRINT) {
+ if (VT == MVT::f32)
+ LC = RTLIB::LLRINT_F32;
+ else if (VT == MVT::f64)
+ LC = RTLIB::LLRINT_F64;
+ else if (VT == MVT::f80)
+ LC = RTLIB::LLRINT_F80;
+ else if (VT == MVT::f128)
+ LC = RTLIB::LLRINT_F128;
+ else if (VT == MVT::ppcf128)
+ LC = RTLIB::LLRINT_PPCF128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected llrint input type!");
+ } else
+ llvm_unreachable("Unexpected opcode!");
+
+ EVT RetVT = N->getValueType(0);
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RetVT,
+ Op, CallOptions, dl,
+ Chain);
+ SplitInteger(Tmp.first, Lo, Hi);
+
+ if (N->isStrictFPOpcode())
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ if (N->isAtomic()) {
+    // Targets typically support compare-and-swap at wider widths than plain
+    // atomic loads, so expand this load via ATOMIC_CMP_SWAP_WITH_SUCCESS.
+ SDLoc dl(N);
+ EVT VT = N->getMemoryVT();
+ SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue Swap = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
+ VT, VTs, N->getOperand(0),
+ N->getOperand(1), Zero, Zero, N->getMemOperand());
+ ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
+ ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
+ return;
+ }
+
+ if (ISD::isNormalLoad(N)) {
+ ExpandRes_NormalLoad(N, Lo, Hi);
+ return;
+ }
+
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = N->getAAInfo();
+ SDLoc dl(N);
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ if (N->getMemoryVT().bitsLE(NVT)) {
+ EVT MemVT = N->getMemoryVT();
+
+ Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), MemVT,
+ N->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Remember the chain.
+ Ch = Lo.getValue(1);
+
+ if (ExtType == ISD::SEXTLOAD) {
+ // The high part is obtained by SRA'ing all but one of the bits of the
+ // lo part.
+ unsigned LoSize = Lo.getValueSizeInBits();
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ } else if (ExtType == ISD::ZEXTLOAD) {
+ // The high part is just a zero.
+ Hi = DAG.getConstant(0, dl, NVT);
+ } else {
+ assert(ExtType == ISD::EXTLOAD && "Unknown extload!");
+ // The high part is undefined.
+ Hi = DAG.getUNDEF(NVT);
+ }
+ } else if (DAG.getDataLayout().isLittleEndian()) {
+ // Little-endian - low bits are at low addresses.
+ Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
+ N->getOriginalAlign(), MMOFlags, AAInfo);
+
+ unsigned ExcessBits =
+ N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+ EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+ Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
+ N->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+ } else {
+ // Big-endian - high bits are at low addresses. Favor aligned loads at
+ // the cost of some bit-fiddling.
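+ // For example (illustrative): expanding an extending load of a 48-bit
+ // memory type with NVT = i32 gives EBytes = 6, IncrementSize = 4 and
+ // ExcessBits = 16. Hi loads the 32 high-order bits at Ptr, Lo
+ // zero-extend-loads the remaining 16 bits at Ptr + 4, then the low 16 bits
+ // of Hi are OR'd into the top of Lo and Hi is shifted right by 16.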
+ EVT MemVT = N->getMemoryVT();
+ unsigned EBytes = MemVT.getStoreSize();
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ unsigned ExcessBits = (EBytes - IncrementSize)*8;
+
+ // Load both the high bits and maybe some of the low bits.
+ Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
+ EVT::getIntegerVT(*DAG.getContext(),
+ MemVT.getSizeInBits() - ExcessBits),
+ N->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Increment the pointer to the other half.
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+ // Load the rest of the low bits.
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize),
+ EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
+ N->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ if (ExcessBits < NVT.getSizeInBits()) {
+ // Transfer low bits from the bottom of Hi to the top of Lo.
+ Lo = DAG.getNode(
+ ISD::OR, dl, NVT, Lo,
+ DAG.getNode(ISD::SHL, dl, NVT, Hi,
+ DAG.getConstant(ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout()))));
+ // Move high bits to the right position in Hi.
+ Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl, NVT,
+ Hi,
+ DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ }
+ }
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Ch);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ SDValue LL, LH, RL, RH;
+ GetExpandedInteger(N->getOperand(0), LL, LH);
+ GetExpandedInteger(N->getOperand(1), RL, RH);
+ Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LL, RL);
+ Hi = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LH, RH);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDLoc dl(N);
+
+ SDValue LL, LH, RL, RH;
+ GetExpandedInteger(N->getOperand(0), LL, LH);
+ GetExpandedInteger(N->getOperand(1), RL, RH);
+
+ if (TLI.expandMUL(N, Lo, Hi, NVT, DAG,
+ TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
+ LL, LH, RL, RH))
+ return;
+
+ // If nothing else, we can make a libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::MUL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::MUL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::MUL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::MUL_I128;
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC)) {
+ // We'll expand the multiplication by brute force because we have no other
+ // options. This is a trivially-generalized version of the code from
+ // Hacker's Delight (itself derived from Knuth's Algorithm M from section
+ // 4.3.1).
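+ //
+ // Illustrative algebra (with k = HalfBits and the NVT halves split into
+ // k-bit half-words):
+ //   LL = LLH*2^k + LLL, RL = RLH*2^k + RLL
+ //   LL*RL = LLH*RLH*2^(2k) + (LLH*RLL + LLL*RLH)*2^k + LLL*RLL
+ // T, U, V and W below accumulate these partial products so that Lo ends up
+ // as the low NVT bits of LL*RL, and Hi as the high NVT bits of LL*RL plus
+ // the cross terms RH*LL and RL*LH of the full VT*VT product.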
+ unsigned Bits = NVT.getSizeInBits();
+ unsigned HalfBits = Bits >> 1;
+ SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl,
+ NVT);
+ SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
+ SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
+
+ SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
+ SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
+
+ SDValue Shift = DAG.getShiftAmountConstant(HalfBits, NVT, dl);
+ SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
+ SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
+ SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
+
+ SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH);
+ SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
+ SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
+
+ SDValue V = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LLL, RLH), UL);
+ SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
+
+ SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH),
+ DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, TL,
+ DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
+
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
+ DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
+ DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
+ return;
+ }
+
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first,
+ Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc DL(N);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDVTList VTs = DAG.getVTList(NVT, NVT, MVT::Other);
+ SDValue R = DAG.getNode(N->getOpcode(), DL, VTs, N->getOperand(0));
+ Lo = R.getValue(0);
+ Hi = R.getValue(1);
+ ReplaceValueWith(SDValue(N, 1), R.getValue(2));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBSAT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Result = TLI.expandAddSubSat(N, DAG);
+ SplitInteger(Result, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SHLSAT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Result = TLI.expandShlSat(N, DAG);
+ SplitInteger(Result, Lo, Hi);
+}
+
+/// This performs an expansion of the integer result for a fixed point
+/// multiplication. The default expansion performs rounding down towards
+/// negative infinity, though targets that do care about rounding should specify
+/// a target hook for rounding and provide their own expansion or lowering of
+/// fixed point multiplication to be consistent with rounding.
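+/// For a nonzero scale the expansion below computes the double-width product
+/// and funnel-shifts it right by Scale bits, discarding the low Scale
+/// fractional bits (hence rounding toward negative infinity).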
+void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ unsigned VTSize = VT.getScalarSizeInBits();
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ uint64_t Scale = N->getConstantOperandVal(2);
+ bool Saturating = (N->getOpcode() == ISD::SMULFIXSAT ||
+ N->getOpcode() == ISD::UMULFIXSAT);
+ bool Signed = (N->getOpcode() == ISD::SMULFIX ||
+ N->getOpcode() == ISD::SMULFIXSAT);
+
+ // Handle special case when scale is equal to zero.
+ if (!Scale) {
+ SDValue Result;
+ if (!Saturating) {
+ Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ } else {
+ EVT BoolVT = getSetCCResultType(VT);
+ unsigned MulOp = Signed ? ISD::SMULO : ISD::UMULO;
+ Result = DAG.getNode(MulOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ SDValue Product = Result.getValue(0);
+ SDValue Overflow = Result.getValue(1);
+ if (Signed) {
+ APInt MinVal = APInt::getSignedMinValue(VTSize);
+ APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+ SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ // Xor the inputs, if resulting sign bit is 0 the product will be
+ // positive, else negative.
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, LHS, RHS);
+ SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Xor, Zero, ISD::SETLT);
+ Result = DAG.getSelect(dl, VT, ProdNeg, SatMin, SatMax);
+ Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+ } else {
+ // For unsigned multiplication, we only need to check the max since we
+ // can't really overflow towards zero.
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ Result = DAG.getSelect(dl, VT, Overflow, SatMax, Product);
+ }
+ }
+ SplitInteger(Result, Lo, Hi);
+ return;
+ }
+
+ // For SMULFIX[SAT] we only expect to find Scale<VTSize, but this assert will
+ // cover for unhandled cases below, while still being valid for UMULFIX[SAT].
+ assert(Scale <= VTSize && "Scale can't be larger than the value type size.");
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue LL, LH, RL, RH;
+ GetExpandedInteger(LHS, LL, LH);
+ GetExpandedInteger(RHS, RL, RH);
+ SmallVector<SDValue, 4> Result;
+
+ unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
+ if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG,
+ TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
+ LL, LH, RL, RH)) {
+ report_fatal_error("Unable to expand MUL_FIX using MUL_LOHI.");
+ return;
+ }
+
+ unsigned NVTSize = NVT.getScalarSizeInBits();
+ assert((VTSize == NVTSize * 2) && "Expected the new value type to be half "
+ "the size of the current value type");
+
+ // After getting the multiplication result in 4 parts, we need to perform a
+ // shift right by the amount of the scale to get the result in that scale.
+ //
+ // Let's say we multiply 2 64 bit numbers. The resulting value can be held in
+ // 128 bits that are cut into 4 32-bit parts:
+ //
+ // HH HL LH LL
+ // |---32---|---32---|---32---|---32---|
+ // 128 96 64 32 0
+ //
+ // |------VTSize-----|
+ //
+ // |NVTSize-|
+ //
+ // The resulting Lo and Hi would normally be in LL and LH after the shift. But
+ // to avoid unnecessary shifting of all 4 parts, we can adjust the shift
+ // amount and get Lo and Hi using two funnel shifts. Or for the special case
+ // when Scale is a multiple of NVTSize we can just pick the result without
+ // shifting.
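+ //
+ // For example (illustrative), with VTSize = 64, NVTSize = 32 and Scale = 5:
+ // Part0 = 0, Lo = fshr(Result[1], Result[0], 5) and
+ // Hi = fshr(Result[2], Result[1], 5). With Scale = 32 instead: Part0 = 1,
+ // Lo = Result[1] and Hi = Result[2], with no shifting at all.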
+ uint64_t Part0 = Scale / NVTSize; // Part holding lowest bit needed.
+ if (Scale % NVTSize) {
+ SDValue ShiftAmount = DAG.getShiftAmountConstant(Scale % NVTSize, NVT, dl);
+ Lo = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 1], Result[Part0],
+ ShiftAmount);
+ Hi = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 2], Result[Part0 + 1],
+ ShiftAmount);
+ } else {
+ Lo = Result[Part0];
+ Hi = Result[Part0 + 1];
+ }
+
+ // Unless saturation is requested we are done. The result is in <Hi,Lo>.
+ if (!Saturating)
+ return;
+
+ // Can not overflow when there is no integer part.
+ if (Scale == VTSize)
+ return;
+
+ // To handle saturation we must check for overflow in the multiplication.
+ //
+ // Unsigned overflow happened if the upper (VTSize - Scale) bits (of Result)
+ // aren't all zeroes.
+ //
+ // Signed overflow happened if the upper (VTSize - Scale + 1) bits (of Result)
+ // aren't all ones or all zeroes.
+ //
+ // We cannot overflow past HH when multiplying 2 ints of size VTSize, so the
+ // highest bit of HH determines saturation direction in the event of signed
+ // saturation.
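+ //
+ // For example (illustrative), with VTSize = 64 and Scale = 5 the result
+ // occupies bits [5, 69) of the 128-bit product: unsigned saturation
+ // triggers if any of bits 69..127 are set, and signed saturation triggers
+ // if bits 68..127 are not all zeroes or all ones.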
+
+ SDValue ResultHL = Result[2];
+ SDValue ResultHH = Result[3];
+
+ SDValue SatMax, SatMin;
+ SDValue NVTZero = DAG.getConstant(0, dl, NVT);
+ SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT);
+ EVT BoolNVT = getSetCCResultType(NVT);
+
+ if (!Signed) {
+ if (Scale < NVTSize) {
+ // Overflow happened if ((HH | (HL >> Scale)) != 0).
+ SDValue HLAdjusted =
+ DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
+ DAG.getShiftAmountConstant(Scale, NVT, dl));
+ SDValue Tmp = DAG.getNode(ISD::OR, dl, NVT, HLAdjusted, ResultHH);
+ SatMax = DAG.getSetCC(dl, BoolNVT, Tmp, NVTZero, ISD::SETNE);
+ } else if (Scale == NVTSize) {
+ // Overflow happened if (HH != 0).
+ SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETNE);
+ } else if (Scale < VTSize) {
+ // Overflow happened if ((HH >> (Scale - NVTSize)) != 0).
+ SDValue HLAdjusted =
+ DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
+ DAG.getShiftAmountConstant(Scale - NVTSize, NVT, dl));
+ SatMax = DAG.getSetCC(dl, BoolNVT, HLAdjusted, NVTZero, ISD::SETNE);
+ } else
+ llvm_unreachable("Scale must be less or equal to VTSize for UMULFIXSAT"
+ "(and saturation can't happen with Scale==VTSize).");
+
+ Hi = DAG.getSelect(dl, NVT, SatMax, NVTNeg1, Hi);
+ Lo = DAG.getSelect(dl, NVT, SatMax, NVTNeg1, Lo);
+ return;
+ }
+
+ if (Scale < NVTSize) {
+ // The number of overflow bits we can check are VTSize - Scale + 1 (we
+ // include the sign bit). If these top bits are > 0, then we overflowed past
+ // the max value. If these top bits are < -1, then we overflowed past the
+ // min value. Otherwise, we did not overflow.
+ unsigned OverflowBits = VTSize - Scale + 1;
+ assert(OverflowBits <= VTSize && OverflowBits > NVTSize &&
+ "Extent of overflow bits must start within HL");
+ SDValue HLHiMask = DAG.getConstant(
+ APInt::getHighBitsSet(NVTSize, OverflowBits - NVTSize), dl, NVT);
+ SDValue HLLoMask = DAG.getConstant(
+ APInt::getLowBitsSet(NVTSize, VTSize - OverflowBits), dl, NVT);
+ // We overflow max if HH > 0 or (HH == 0 && HL > HLLoMask).
+ SDValue HHGT0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+ SDValue HHEQ0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
+ SDValue HLUGT = DAG.getSetCC(dl, BoolNVT, ResultHL, HLLoMask, ISD::SETUGT);
+ SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHGT0,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ0, HLUGT));
+ // We overflow min if HH < -1 or (HH == -1 && HL < HLHiMask).
+ SDValue HHLT = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+ SDValue HHEQ = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+ SDValue HLULT = DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT);
+ SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHLT,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ, HLULT));
+ } else if (Scale == NVTSize) {
+ // We overflow max if HH > 0 or (HH == 0 && HL sign bit is 1).
+ SDValue HHGT0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+ SDValue HHEQ0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
+ SDValue HLNeg = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT);
+ SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHGT0,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ0, HLNeg));
+ // We overflow min if HH < -1 or (HH == -1 && HL sign bit is 0).
+ SDValue HHLT = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+ SDValue HHEQ = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+ SDValue HLPos = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGE);
+ SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHLT,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ, HLPos));
+ } else if (Scale < VTSize) {
+ // This is similar to the case when we saturate if Scale < NVTSize, but we
+ // only need to check HH.
+ unsigned OverflowBits = VTSize - Scale + 1;
+ SDValue HHHiMask = DAG.getConstant(
+ APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT);
+ SDValue HHLoMask = DAG.getConstant(
+ APInt::getLowBitsSet(NVTSize, NVTSize - OverflowBits), dl, NVT);
+ SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask, ISD::SETGT);
+ SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT);
+ } else
+ llvm_unreachable("Illegal scale for signed fixed point mul.");
+
+ // Saturate to signed maximum.
+ APInt MaxHi = APInt::getSignedMaxValue(NVTSize);
+ APInt MaxLo = APInt::getAllOnes(NVTSize);
+ Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(MaxHi, dl, NVT), Hi);
+ Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(MaxLo, dl, NVT), Lo);
+ // Saturate to signed minimum.
+ APInt MinHi = APInt::getSignedMinValue(NVTSize);
+ Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(MinHi, dl, NVT), Hi);
+ Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_DIVFIX(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ // Try expanding in the existing type first.
+ SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, N->getOperand(0),
+ N->getOperand(1),
+ N->getConstantOperandVal(2), DAG);
+
+ if (!Res)
+ Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1),
+ N->getConstantOperandVal(2), TLI, DAG);
+ SplitInteger(Res, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
+ SDValue &Lo, SDValue &Hi) {
+ assert((Node->getOpcode() == ISD::SADDO || Node->getOpcode() == ISD::SSUBO) &&
+ "Node has unexpected Opcode");
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDLoc dl(Node);
+
+ SDValue Ovf;
+
+ bool IsAdd = Node->getOpcode() == ISD::SADDO;
+ unsigned CarryOp = IsAdd ? ISD::SADDO_CARRY : ISD::SSUBO_CARRY;
+
+ bool HasCarryOp = TLI.isOperationLegalOrCustom(
+ CarryOp, TLI.getTypeToExpandTo(*DAG.getContext(), LHS.getValueType()));
+
+ if (HasCarryOp) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(LHS, LHSL, LHSH);
+ GetExpandedInteger(RHS, RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), Node->getValueType(1));
+
+ Lo = DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, dl, VTList, {LHSL, RHSL});
+ Hi = DAG.getNode(CarryOp, dl, VTList, { LHSH, RHSH, Lo.getValue(1) });
+
+ Ovf = Hi.getValue(1);
+ } else {
+ // Expand the result by simply replacing it with the equivalent
+ // non-overflow-checking operation.
+ SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
+ ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+ LHS, RHS);
+ SplitInteger(Sum, Lo, Hi);
+
+ // Compute the overflow.
+ //
+ // LHSSign -> LHS < 0
+ // RHSSign -> RHS < 0
+ // SumSign -> Sum < 0
+ //
+ // Add:
+ // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+ // Sub:
+ // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+ //
+ // To get better codegen we can rewrite this by doing bitwise math on
+ // the integers and extract the final sign bit at the end. So the
+ // above becomes:
+ //
+ // Add:
+ // Overflow -> (~(LHS ^ RHS) & (LHS ^ Sum)) < 0
+ // Sub:
+ // Overflow -> ((LHS ^ RHS) & (LHS ^ Sum)) < 0
+ //
+ // NOTE: This is different from the expansion we do in expandSADDSUBO
+ // because it is more costly to determine that the RHS is > 0 for SSUBO
+ // with the integers split.
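+ //
+ // For example (illustrated with i8 values for brevity): 100 + 100 gives
+ // Sum = -56. LHS ^ RHS has a clear sign bit (the signs match), so
+ // ~(LHS ^ RHS) has its sign bit set; LHS ^ Sum also has its sign bit set,
+ // so the AND is negative and the SETLT reports overflow.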
+ EVT VT = LHS.getValueType();
+ SDValue SignsMatch = DAG.getNode(ISD::XOR, dl, VT, LHS, RHS);
+ if (IsAdd)
+ SignsMatch = DAG.getNOT(dl, SignsMatch, VT);
+
+ SDValue SumSignNE = DAG.getNode(ISD::XOR, dl, VT, LHS, Sum);
+ Ovf = DAG.getNode(ISD::AND, dl, VT, SignsMatch, SumSignNE);
+ EVT OType = Node->getValueType(1);
+ Ovf = DAG.getSetCC(dl, OType, Ovf, DAG.getConstant(0, dl, VT), ISD::SETLT);
+ }
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(Node, 1), Ovf);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+
+ if (TLI.getOperationAction(ISD::SDIVREM, VT) == TargetLowering::Custom) {
+ SDValue Res = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), Ops);
+ SplitInteger(Res.getValue(0), Lo, Hi);
+ return;
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::SDIV_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SDIV_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SDIV_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SDIV_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ShiftThroughStack(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ SDValue Shiftee = N->getOperand(0);
+ EVT VT = Shiftee.getValueType();
+ SDValue ShAmt = N->getOperand(1);
+ EVT ShAmtVT = ShAmt.getValueType();
+
+ // This legalization is optimal when the shift is by a multiple of the byte
+ // width, %x * 8 <-> %x << 3, so the 3 low bits should be known zero.
+ bool ShiftByByteMultiple =
+ DAG.computeKnownBits(ShAmt).countMinTrailingZeros() >= 3;
+
+ // If we can't do it as one step, we'll have two uses of shift amount,
+ // and thus must freeze it.
+ if (!ShiftByByteMultiple)
+ ShAmt = DAG.getFreeze(ShAmt);
+
+ unsigned VTBitWidth = VT.getScalarSizeInBits();
+ assert(VTBitWidth % 8 == 0 && "Shifting a not byte multiple value?");
+ unsigned VTByteWidth = VTBitWidth / 8;
+ assert(isPowerOf2_32(VTByteWidth) &&
+ "Shiftee type size is not a power of two!");
+ unsigned StackSlotByteWidth = 2 * VTByteWidth;
+ unsigned StackSlotBitWidth = 8 * StackSlotByteWidth;
+ EVT StackSlotVT = EVT::getIntegerVT(*DAG.getContext(), StackSlotBitWidth);
+
+ // Get a temporary stack slot 2x the width of our VT.
+ // FIXME: reuse stack slots?
+ // FIXME: should we be more picky about alignment?
+ Align StackSlotAlignment(1);
+ SDValue StackPtr = DAG.CreateStackTemporary(
+ TypeSize::getFixed(StackSlotByteWidth), StackSlotAlignment);
+ EVT PtrTy = StackPtr.getValueType();
+ SDValue Ch = DAG.getEntryNode();
+
+ MachinePointerInfo StackPtrInfo = MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
+
+ // Extend the value being shifted to the entire stack slot's width.
+ SDValue Init;
+ if (N->getOpcode() != ISD::SHL) {
+ unsigned WideningOpc =
+ N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Init = DAG.getNode(WideningOpc, dl, StackSlotVT, Shiftee);
+ } else {
+ // For left-shifts, pad the Shiftee with zeros below its LSB to twice its width.
+ SDValue AllZeros = DAG.getConstant(0, dl, VT);
+ Init = DAG.getNode(ISD::BUILD_PAIR, dl, StackSlotVT, AllZeros, Shiftee);
+ }
+ // And spill it into the stack slot.
+ Ch = DAG.getStore(Ch, dl, Init, StackPtr, StackPtrInfo, StackSlotAlignment);
+
+ // Now, compute the full-byte offset into the stack slot from which we can
+ // load. The shift amount is in bits; to get the byte offset, divide it by
+ // CHAR_BIT.
+ SDNodeFlags Flags;
+ if (ShiftByByteMultiple)
+ Flags.setExact(true);
+ SDValue ByteOffset = DAG.getNode(ISD::SRL, dl, ShAmtVT, ShAmt,
+ DAG.getConstant(3, dl, ShAmtVT), Flags);
+ // And clamp it, because an out-of-bounds load is immediate UB,
+ // while a shift overflow would have *just* been poison.
+ ByteOffset = DAG.getNode(ISD::AND, dl, ShAmtVT, ByteOffset,
+ DAG.getConstant(VTByteWidth - 1, dl, ShAmtVT));
+ // We have exactly two strategies for indexing into the stack slot here:
+ // 1. upwards starting from the beginning of the slot
+ // 2. downwards starting from the middle of the slot
+ // On a little-endian machine we pick 1. for right shifts and 2. for left
+ // shifts, and vice versa on a big-endian machine.
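+ //
+ // For example (illustrative): an i128 SRL by 40 on a little-endian target
+ // zero-extends the value into a 32-byte slot, computes
+ // ByteOffset = 40 / 8 = 5 (clamped with & 15), and loads the i128 result
+ // at slot + 5; since 40 is a multiple of CHAR_BIT, no residual bit-shift
+ // is needed.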
+ bool WillIndexUpwards = N->getOpcode() != ISD::SHL;
+ if (DAG.getDataLayout().isBigEndian())
+ WillIndexUpwards = !WillIndexUpwards;
+
+ SDValue AdjStackPtr;
+ if (WillIndexUpwards) {
+ AdjStackPtr = StackPtr;
+ } else {
+ AdjStackPtr = DAG.getMemBasePlusOffset(
+ StackPtr, DAG.getConstant(VTByteWidth, dl, PtrTy), dl);
+ ByteOffset = DAG.getNegative(ByteOffset, dl, ShAmtVT);
+ }
+
+ // Get the pointer somewhere into the stack slot from which we need to load.
+ ByteOffset = DAG.getSExtOrTrunc(ByteOffset, dl, PtrTy);
+ AdjStackPtr = DAG.getMemBasePlusOffset(AdjStackPtr, ByteOffset, dl);
+
+ // And load it! While the load is not legal, legalizing it is obvious.
+ SDValue Res = DAG.getLoad(
+ VT, dl, Ch, AdjStackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), Align(1));
+ // We've now performed the shift by CHAR_BIT * floor(ShAmt / CHAR_BIT).
+
+ // If we may still have a sub-CHAR_BIT amount to shift by, do so now.
+ if (!ShiftByByteMultiple) {
+ SDValue ShAmtRem = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
+ DAG.getConstant(7, dl, ShAmtVT));
+ Res = DAG.getNode(N->getOpcode(), dl, VT, Res, ShAmtRem);
+ }
+
+ // Finally, split the computed value.
+ SplitInteger(Res, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // If we can emit an efficient shift operation, do so now. Check to see if
+ // the RHS is a constant.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return ExpandShiftByConstant(N, CN->getAPIntValue(), Lo, Hi);
+
+ // If we can determine that the high bit of the shift is zero or one, even if
+ // the low bits are variable, emit this shift in an optimized form.
+ if (ExpandShiftWithKnownAmountBit(N, Lo, Hi))
+ return;
+
+ // If this target supports shift_PARTS, use it. First, map to the _PARTS opc.
+ unsigned PartsOpc;
+ if (N->getOpcode() == ISD::SHL) {
+ PartsOpc = ISD::SHL_PARTS;
+ } else if (N->getOpcode() == ISD::SRL) {
+ PartsOpc = ISD::SRL_PARTS;
+ } else {
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ PartsOpc = ISD::SRA_PARTS;
+ }
+
+ // Next check to see if the target supports this SHL_PARTS operation or if it
+ // will custom expand it. Don't lower this to SHL_PARTS when we optimise for
+ // size, but create a libcall instead.
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT);
+ const bool LegalOrCustom =
+ (Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
+ Action == TargetLowering::Custom;
+
+ unsigned ExpansionFactor = 1;
+ // That VT->NVT expansion is one step. But will we re-expand NVT?
+ for (EVT TmpVT = NVT;;) {
+ EVT NewTMPVT = TLI.getTypeToTransformTo(*DAG.getContext(), TmpVT);
+ if (NewTMPVT == TmpVT)
+ break;
+ TmpVT = NewTMPVT;
+ ++ExpansionFactor;
+ }
+
+ TargetLowering::ShiftLegalizationStrategy S =
+ TLI.preferredShiftLegalizationStrategy(DAG, N, ExpansionFactor);
+
+ if (S == TargetLowering::ShiftLegalizationStrategy::ExpandThroughStack)
+ return ExpandIntRes_ShiftThroughStack(N, Lo, Hi);
+
+ if (LegalOrCustom &&
+ S != TargetLowering::ShiftLegalizationStrategy::LowerToLibcall) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH;
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ EVT VT = LHSL.getValueType();
+
+ // If the shift amount operand is coming from a vector legalization it may
+ // have an illegal type. Fix that first by casting the operand, otherwise
+ // the new SHL_PARTS operation would need further legalization.
+ SDValue ShiftOp = N->getOperand(1);
+ EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ if (ShiftOp.getValueType() != ShiftTy)
+ ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy);
+
+ SDValue Ops[] = { LHSL, LHSH, ShiftOp };
+ Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops);
+ Hi = Lo.getValue(1);
+ return;
+ }
+
+ // Otherwise, emit a libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ bool isSigned;
+ if (N->getOpcode() == ISD::SHL) {
+ isSigned = false; /*sign irrelevant*/
+ if (VT == MVT::i16)
+ LC = RTLIB::SHL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SHL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SHL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SHL_I128;
+ } else if (N->getOpcode() == ISD::SRL) {
+ isSigned = false;
+ if (VT == MVT::i16)
+ LC = RTLIB::SRL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SRL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SRL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SRL_I128;
+ } else {
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ isSigned = true;
+ if (VT == MVT::i16)
+ LC = RTLIB::SRA_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SRA_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SRA_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SRA_I128;
+ }
+
+ if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
+ EVT ShAmtTy =
+ EVT::getIntegerVT(*DAG.getContext(), DAG.getLibInfo().getIntSize());
+ SDValue ShAmt = DAG.getZExtOrTrunc(N->getOperand(1), dl, ShAmtTy);
+ SDValue Ops[2] = {N->getOperand(0), ShAmt};
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(isSigned);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
+ return;
+ }
+
+ if (!ExpandShiftWithUnknownAmountBit(N, Lo, Hi))
+ llvm_unreachable("Unsupported shift!");
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is sign extension of the input (degenerates to a copy).
+ Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0));
+ // The high part is obtained by SRA'ing all but one of the bits of low part.
+ unsigned LoSize = NVT.getSizeInBits();
+ Hi = DAG.getNode(
+ ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout())));
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) ==
+ TargetLowering::TypePromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ unsigned ExcessBits = Op.getValueSizeInBits() - NVT.getSizeInBits();
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+ ExcessBits)));
+ }
+}
+
+void DAGTypeLegalizer::
+ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+ if (EVT.bitsLE(Lo.getValueType())) {
+ // sext_inreg the low part if needed.
+ Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Lo.getValueType(), Lo,
+ N->getOperand(1));
+
+ // The high part gets the sign extension from the lo-part. This handles
+ // things like sextinreg V:i64 from i8.
+ Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo,
+ DAG.getConstant(Hi.getValueSizeInBits() - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ } else {
+ // For example, extension of an i48 to an i64. Leave the low part alone,
+ // sext_inreg the high part.
+ unsigned ExcessBits = EVT.getSizeInBits() - Lo.getValueSizeInBits();
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+ ExcessBits)));
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+
+ if (TLI.getOperationAction(ISD::SDIVREM, VT) == TargetLowering::Custom) {
+ SDValue Res = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), Ops);
+ SplitInteger(Res.getValue(1), Lo, Hi);
+ return;
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::SREM_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SREM_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SREM_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SREM_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0));
+ Hi = DAG.getNode(ISD::SRL, dl, N->getOperand(0).getValueType(),
+ N->getOperand(0),
+ DAG.getConstant(NVT.getSizeInBits(), dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ if (N->getOpcode() == ISD::UMULO) {
+ // This section expands the operation into the following sequence of
+ // instructions. `iNh` here refers to a type which has half the bit width of
+ // the type the original operation operated on.
+ //
+ // %0 = %LHS.HI != 0 && %RHS.HI != 0
+ // %1 = { iNh, i1 } @umul.with.overflow.iNh(iNh %LHS.HI, iNh %RHS.LO)
+ // %2 = { iNh, i1 } @umul.with.overflow.iNh(iNh %RHS.HI, iNh %LHS.LO)
+ // %3 = mul nuw iN (%LHS.LOW as iN), (%RHS.LOW as iN)
+ // %4 = add iNh %1.0, %2.0 as iN
+ // %5 = { iNh, i1 } @uadd.with.overflow.iNh(iNh %4, iNh %3.HIGH)
+ //
+ // %lo = %3.LO
+ // %hi = %5.0
+ // %ovf = %0 || %1.1 || %2.1 || %5.1
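+ //
+ // Illustrative algebra behind this (with h = the half bit width):
+ //   LHS*RHS = LHS.HI*RHS.HI*2^(2h)
+ //           + (LHS.HI*RHS.LO + LHS.LO*RHS.HI)*2^h
+ //           + LHS.LO*RHS.LO
+ // The 2^(2h) term overflows iN whenever both high halves are nonzero (%0),
+ // each cross product overflows if it does not fit in h bits (%1.1, %2.1),
+ // and adding the cross products to the high half of LHS.LO*RHS.LO may
+ // carry out of the result (%5.1).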
+ SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
+ SDValue LHSHigh, LHSLow, RHSHigh, RHSLow;
+ GetExpandedInteger(LHS, LHSLow, LHSHigh);
+ GetExpandedInteger(RHS, RHSLow, RHSHigh);
+ EVT HalfVT = LHSLow.getValueType();
+ EVT BitVT = N->getValueType(1);
+ SDVTList VTHalfWithO = DAG.getVTList(HalfVT, BitVT);
+
+ SDValue HalfZero = DAG.getConstant(0, dl, HalfVT);
+ SDValue Overflow = DAG.getNode(ISD::AND, dl, BitVT,
+ DAG.getSetCC(dl, BitVT, LHSHigh, HalfZero, ISD::SETNE),
+ DAG.getSetCC(dl, BitVT, RHSHigh, HalfZero, ISD::SETNE));
+
+ SDValue One = DAG.getNode(ISD::UMULO, dl, VTHalfWithO, LHSHigh, RHSLow);
+ Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, One.getValue(1));
+
+ SDValue Two = DAG.getNode(ISD::UMULO, dl, VTHalfWithO, RHSHigh, LHSLow);
+ Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, Two.getValue(1));
+
+ SDValue HighSum = DAG.getNode(ISD::ADD, dl, HalfVT, One, Two);
+
+ // Cannot use `UMUL_LOHI` directly, because some 32-bit targets (ARM) do not
+ // know how to expand `i64,i64 = umul_lohi a, b` and abort (why isn't this
+ // operation recursively legalized?).
+ //
+ // Many backends understand this pattern and will convert into LOHI
+ // themselves, if applicable.
+ SDValue Three = DAG.getNode(ISD::MUL, dl, VT,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, VT, LHSLow),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, VT, RHSLow));
+ SplitInteger(Three, Lo, Hi);
+
+ Hi = DAG.getNode(ISD::UADDO, dl, VTHalfWithO, Hi, HighSum);
+ Overflow = DAG.getNode(ISD::OR, dl, BitVT, Overflow, Hi.getValue(1));
+ ReplaceValueWith(SDValue(N, 1), Overflow);
+ return;
+ }
+
+ Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
+ EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
+
+ // Replace this with a libcall that will check overflow.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i32)
+ LC = RTLIB::MULO_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::MULO_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::MULO_I128;
+
+ // If we don't have the libcall or if the function we are compiling is the
+ // implementation of the expected libcall (avoid inf-loop), expand inline.
+ if (LC == RTLIB::UNKNOWN_LIBCALL || !TLI.getLibcallName(LC) ||
+ TLI.getLibcallName(LC) == DAG.getMachineFunction().getName()) {
+ // FIXME: This is not an optimal expansion, but better than crashing.
+ EVT WideVT =
+ EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+ SDValue LHS = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, N->getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, N->getOperand(1));
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
+ SDValue MulLo, MulHi;
+ SplitInteger(Mul, MulLo, MulHi);
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, dl, VT, MulLo,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT));
+ SDValue Overflow =
+ DAG.getSetCC(dl, N->getValueType(1), MulHi, SRA, ISD::SETNE);
+ SplitInteger(MulLo, Lo, Hi);
+ ReplaceValueWith(SDValue(N, 1), Overflow);
+ return;
+ }
+
+ SDValue Temp = DAG.CreateStackTemporary(PtrVT);
+ // Temporary for the overflow value, default it to zero.
+ SDValue Chain =
+ DAG.getStore(DAG.getEntryNode(), dl, DAG.getConstant(0, dl, PtrVT), Temp,
+ MachinePointerInfo());
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (const SDValue &Op : N->op_values()) {
+ EVT ArgVT = Op.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Node = Op;
+ Entry.Ty = ArgTy;
+ Entry.IsSExt = true;
+ Entry.IsZExt = false;
+ Args.push_back(Entry);
+ }
+
+ // Also pass the address of the overflow check.
+ Entry.Node = Temp;
+ Entry.Ty = PtrTy->getPointerTo();
+ Entry.IsSExt = true;
+ Entry.IsZExt = false;
+ Args.push_back(Entry);
+
+ SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
+ .setSExtResult();
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ SplitInteger(CallInfo.first, Lo, Hi);
+ SDValue Temp2 =
+ DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, MachinePointerInfo());
+ SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2,
+ DAG.getConstant(0, dl, PtrVT),
+ ISD::SETNE);
+ // Use the overflow from the libcall everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+
+ if (TLI.getOperationAction(ISD::UDIVREM, VT) == TargetLowering::Custom) {
+ SDValue Res = DAG.getNode(ISD::UDIVREM, dl, DAG.getVTList(VT, VT), Ops);
+ SplitInteger(Res.getValue(0), Lo, Hi);
+ return;
+ }
+
+ // Try to expand UDIV by constant.
+ if (isa<ConstantSDNode>(N->getOperand(1))) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ // Only if the new type is legal.
+ if (isTypeLegal(NVT)) {
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+ SmallVector<SDValue> Result;
+ if (TLI.expandDIVREMByConstant(N, Result, NVT, DAG, InL, InH)) {
+ Lo = Result[0];
+ Hi = Result[1];
+ return;
+ }
+ }
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::UDIV_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::UDIV_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::UDIV_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::UDIV_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+
+ if (TLI.getOperationAction(ISD::UDIVREM, VT) == TargetLowering::Custom) {
+ SDValue Res = DAG.getNode(ISD::UDIVREM, dl, DAG.getVTList(VT, VT), Ops);
+ SplitInteger(Res.getValue(1), Lo, Hi);
+ return;
+ }
+
+ // Try to expand UREM by constant.
+ if (isa<ConstantSDNode>(N->getOperand(1))) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ // Only if the new type is legal.
+ if (isTypeLegal(NVT)) {
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+ SmallVector<SDValue> Result;
+ if (TLI.expandDIVREMByConstant(N, Result, NVT, DAG, InL, InH)) {
+ Lo = Result[0];
+ Hi = Result[1];
+ return;
+ }
+ }
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::UREM_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::UREM_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::UREM_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::UREM_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
+
+ TargetLowering::MakeLibCallOptions CallOptions;
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is zero extension of the input (degenerates to a copy).
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0));
+ Hi = DAG.getConstant(0, dl, NVT); // The high part is just a zero.
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) ==
+ TargetLowering::TypePromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ unsigned ExcessBits = Op.getValueSizeInBits() - NVT.getSizeInBits();
+ Hi = DAG.getZeroExtendInReg(Hi, dl,
+ EVT::getIntegerVT(*DAG.getContext(),
+ ExcessBits));
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ EVT VT = cast<AtomicSDNode>(N)->getMemoryVT();
+ SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue Swap = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
+ cast<AtomicSDNode>(N)->getMemoryVT(), VTs, N->getOperand(0),
+ N->getOperand(1), Zero, Zero, cast<AtomicSDNode>(N)->getMemOperand());
+
+ ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
+ ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_VECREDUCE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // TODO For VECREDUCE_(AND|OR|XOR) we could split the vector and calculate
+ // both halves independently.
+ SDValue Res = TLI.expandVecReduce(N, DAG);
+ SplitInteger(Res, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Rotate(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Delegate to funnel-shift expansion.
+ SDLoc DL(N);
+ unsigned Opcode = N->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
+ SDValue Res = DAG.getNode(Opcode, DL, N->getValueType(0), N->getOperand(0),
+ N->getOperand(0), N->getOperand(1));
+ SplitInteger(Res, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Values numbered from least significant to most significant.
+ SDValue In1, In2, In3, In4;
+ GetExpandedInteger(N->getOperand(0), In3, In4);
+ GetExpandedInteger(N->getOperand(1), In1, In2);
+ EVT HalfVT = In1.getValueType();
+
+ SDLoc DL(N);
+ unsigned Opc = N->getOpcode();
+ SDValue ShAmt = N->getOperand(2);
+ EVT ShAmtVT = ShAmt.getValueType();
+ EVT ShAmtCCVT = getSetCCResultType(ShAmtVT);
+
+ // If the shift amount is at least half the bitwidth, swap the inputs.
+ unsigned HalfVTBits = HalfVT.getScalarSizeInBits();
+ SDValue AndNode = DAG.getNode(ISD::AND, DL, ShAmtVT, ShAmt,
+ DAG.getConstant(HalfVTBits, DL, ShAmtVT));
+ SDValue Cond =
+ DAG.getSetCC(DL, ShAmtCCVT, AndNode, DAG.getConstant(0, DL, ShAmtVT),
+ Opc == ISD::FSHL ? ISD::SETNE : ISD::SETEQ);
+
+ // Expand to a pair of funnel shifts.
+ EVT NewShAmtVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
+ SDValue NewShAmt = DAG.getAnyExtOrTrunc(ShAmt, DL, NewShAmtVT);
+
+ SDValue Select1 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In1, In2);
+ SDValue Select2 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In2, In3);
+ SDValue Select3 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In3, In4);
+ Lo = DAG.getNode(Opc, DL, HalfVT, Select2, Select1, NewShAmt);
+ Hi = DAG.getNode(Opc, DL, HalfVT, Select3, Select2, NewShAmt);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ EVT HalfVT =
+ EVT::getIntegerVT(*DAG.getContext(), N->getValueSizeInBits(0) / 2);
+ SDLoc dl(N);
+
+ // We assume VSCALE(1) fits into a legal integer.
+ APInt One(HalfVT.getSizeInBits(), 1);
+ SDValue VScaleBase = DAG.getVScale(dl, HalfVT, One);
+ VScaleBase = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, VScaleBase);
+ SDValue Res = DAG.getNode(ISD::MUL, dl, VT, VScaleBase, N->getOperand(0));
+ SplitInteger(Res, Lo, Hi);
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need expansion. At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ dbgs() << "ExpandIntegerOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+ #endif
+ report_fatal_error("Do not know how to expand this operator's operand!");
+
+ case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break;
+ case ISD::BR_CC: Res = ExpandIntOp_BR_CC(N); break;
+ case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break;
+ case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+ case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break;
+ case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break;
+ case ISD::SPLAT_VECTOR: Res = ExpandIntOp_SPLAT_VECTOR(N); break;
+ case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break;
+ case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break;
+ case ISD::SETCCCARRY: Res = ExpandIntOp_SETCCCARRY(N); break;
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break;
+ case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break;
+ case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break;
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::UINT_TO_FP: Res = ExpandIntOp_UINT_TO_FP(N); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR: Res = ExpandIntOp_Shift(N); break;
+ case ISD::RETURNADDR:
+ case ISD::FRAMEADDR: Res = ExpandIntOp_RETURNADDR(N); break;
+
+ case ISD::ATOMIC_STORE: Res = ExpandIntOp_ATOMIC_STORE(N); break;
+ case ISD::STACKMAP:
+ Res = ExpandIntOp_STACKMAP(N, OpNo);
+ break;
+ case ISD::PATCHPOINT:
+ Res = ExpandIntOp_PATCHPOINT(N, OpNo);
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+ Res = ExpandIntOp_VP_STRIDED(N, OpNo);
+ break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// IntegerExpandSetCCOperands - Expand the operands of a comparison. This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
+ SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ const SDLoc &dl) {
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedInteger(NewLHS, LHSLo, LHSHi);
+ GetExpandedInteger(NewRHS, RHSLo, RHSHi);
+
+ if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) {
+ if (RHSLo == RHSHi) {
+ if (ConstantSDNode *RHSCST = dyn_cast<ConstantSDNode>(RHSLo)) {
+ if (RHSCST->isAllOnes()) {
+ // Equality comparison to -1.
+ NewLHS = DAG.getNode(ISD::AND, dl,
+ LHSLo.getValueType(), LHSLo, LHSHi);
+ NewRHS = RHSLo;
+ return;
+ }
+ }
+ }
+
+ NewLHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSLo, RHSLo);
+ NewRHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSHi, RHSHi);
+ NewLHS = DAG.getNode(ISD::OR, dl, NewLHS.getValueType(), NewLHS, NewRHS);
+ NewRHS = DAG.getConstant(0, dl, NewLHS.getValueType());
+ return;
+ }
+
+ // If this is a comparison of the sign bit, just look at the top part.
+ // X > -1, x < 0
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(NewRHS))
+ if ((CCCode == ISD::SETLT && CST->isZero()) || // X < 0
+ (CCCode == ISD::SETGT && CST->isAllOnes())) { // X > -1
+ NewLHS = LHSHi;
+ NewRHS = RHSHi;
+ return;
+ }
+
+ // FIXME: This generated code sucks.
+ ISD::CondCode LowCC;
+ switch (CCCode) {
+ default: llvm_unreachable("Unknown integer setcc!");
+ case ISD::SETLT:
+ case ISD::SETULT: LowCC = ISD::SETULT; break;
+ case ISD::SETGT:
+ case ISD::SETUGT: LowCC = ISD::SETUGT; break;
+ case ISD::SETLE:
+ case ISD::SETULE: LowCC = ISD::SETULE; break;
+ case ISD::SETGE:
+ case ISD::SETUGE: LowCC = ISD::SETUGE; break;
+ }
+
+ // LoCmp = lo(op1) < lo(op2) // Always unsigned comparison
+ // HiCmp = hi(op1) < hi(op2) // Signedness depends on operands
+ // dest = hi(op1) == hi(op2) ? LoCmp : HiCmp;
+
+ // NOTE: on targets without efficient SELECT of bools, we can always use
+ // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3)
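+ //
+ // For example (illustrative), a signed i64 '<' split into i32 halves
+ // becomes: (LHSHi == RHSHi) ? (LHSLo setult RHSLo) : (LHSHi setlt RHSHi).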
+ TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true,
+ nullptr);
+ SDValue LoCmp, HiCmp;
+ if (TLI.isTypeLegal(LHSLo.getValueType()) &&
+ TLI.isTypeLegal(RHSLo.getValueType()))
+ LoCmp = TLI.SimplifySetCC(getSetCCResultType(LHSLo.getValueType()), LHSLo,
+ RHSLo, LowCC, false, DagCombineInfo, dl);
+ if (!LoCmp.getNode())
+ LoCmp = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()), LHSLo,
+ RHSLo, LowCC);
+ if (TLI.isTypeLegal(LHSHi.getValueType()) &&
+ TLI.isTypeLegal(RHSHi.getValueType()))
+ HiCmp = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()), LHSHi,
+ RHSHi, CCCode, false, DagCombineInfo, dl);
+ if (!HiCmp.getNode())
+ HiCmp =
+ DAG.getNode(ISD::SETCC, dl, getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, DAG.getCondCode(CCCode));
+
+ ConstantSDNode *LoCmpC = dyn_cast<ConstantSDNode>(LoCmp.getNode());
+ ConstantSDNode *HiCmpC = dyn_cast<ConstantSDNode>(HiCmp.getNode());
+
+ bool EqAllowed = ISD::isTrueWhenEqual(CCCode);
+
+ // FIXME: Is the HiCmpC->isOne() here correct for
+ // ZeroOrNegativeOneBooleanContent?
+ if ((EqAllowed && (HiCmpC && HiCmpC->isZero())) ||
+ (!EqAllowed &&
+ ((HiCmpC && HiCmpC->isOne()) || (LoCmpC && LoCmpC->isZero())))) {
+ // For LE / GE: if the high part is known false, ignore the low part.
+ // For LT / GT: if the low part is known false, return the high part;
+ // if the high part is known true, ignore the low part.
+ NewLHS = HiCmp;
+ NewRHS = SDValue();
+ return;
+ }
+
+ if (LHSHi == RHSHi) {
+ // Comparing the low bits is enough.
+ NewLHS = LoCmp;
+ NewRHS = SDValue();
+ return;
+ }
+
+ // Lower with SETCCCARRY if the target supports it.
+ EVT HiVT = LHSHi.getValueType();
+ EVT ExpandVT = TLI.getTypeToExpandTo(*DAG.getContext(), HiVT);
+ bool HasSETCCCARRY = TLI.isOperationLegalOrCustom(ISD::SETCCCARRY, ExpandVT);
+
+ // FIXME: Make all targets support this, then remove the other lowering.
+ if (HasSETCCCARRY) {
+ // SETCCCARRY can detect < and >= directly. For > and <=, flip
+ // operands and condition code.
+ bool FlipOperands = false;
+ switch (CCCode) {
+ case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break;
+ case ISD::SETUGT: CCCode = ISD::SETULT; FlipOperands = true; break;
+ case ISD::SETLE: CCCode = ISD::SETGE; FlipOperands = true; break;
+ case ISD::SETULE: CCCode = ISD::SETUGE; FlipOperands = true; break;
+ default: break;
+ }
+ if (FlipOperands) {
+ std::swap(LHSLo, RHSLo);
+ std::swap(LHSHi, RHSHi);
+ }
+ // Perform a wide subtraction, feeding the carry from the low part into
+ // SETCCCARRY. The SETCCCARRY operation is essentially looking at the high
+ // part of the result of LHS - RHS. It is negative iff LHS < RHS. It is
+ // zero or positive iff LHS >= RHS.
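+ //
+ // For example (illustrative), an unsigned i64 '<' split into i32 halves:
+ // USUBO of the low halves yields the borrow, and SETCCCARRY of the high
+ // halves with that borrow reports whether the full 64-bit subtraction
+ // borrows, i.e. whether LHS < RHS.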
+ EVT LoVT = LHSLo.getValueType();
+ SDVTList VTList = DAG.getVTList(LoVT, getSetCCResultType(LoVT));
+ SDValue LowCmp = DAG.getNode(ISD::USUBO, dl, VTList, LHSLo, RHSLo);
+ SDValue Res = DAG.getNode(ISD::SETCCCARRY, dl, getSetCCResultType(HiVT),
+ LHSHi, RHSHi, LowCmp.getValue(1),
+ DAG.getCondCode(CCCode));
+ NewLHS = Res;
+ NewRHS = SDValue();
+ return;
+ }
+
+ NewLHS = TLI.SimplifySetCC(getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ,
+ false, DagCombineInfo, dl);
+ if (!NewLHS.getNode())
+ NewLHS =
+ DAG.getSetCC(dl, getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ);
+ NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), NewLHS, LoCmp, HiCmp);
+ NewRHS = SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If ExpandSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If ExpandSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If ExpandSetCCOperands returned a scalar, use it.
+ if (!NewRHS.getNode()) {
+ assert(NewLHS.getValueType() == N->getValueType(0) &&
+ "Unexpected setcc expansion!");
+ return NewLHS;
+ }
+
+ // Otherwise, update N to have the operands specified.
+ return SDValue(
+ DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SETCCCARRY(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = N->getOperand(2);
+ SDValue Cond = N->getOperand(3);
+ SDLoc dl = SDLoc(N);
+
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedInteger(LHS, LHSLo, LHSHi);
+ GetExpandedInteger(RHS, RHSLo, RHSHi);
+
+ // Expand to a USUBO_CARRY for the low part and a SETCCCARRY for the high.
+ SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), Carry.getValueType());
+ SDValue LowCmp =
+ DAG.getNode(ISD::USUBO_CARRY, dl, VTList, LHSLo, RHSLo, Carry);
+ return DAG.getNode(ISD::SETCCCARRY, dl, N->getValueType(0), LHSHi, RHSHi,
+ LowCmp.getValue(1), Cond);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SPLAT_VECTOR(SDNode *N) {
+ // Split the operand and replace with SPLAT_VECTOR_PARTS.
+ SDValue Lo, Hi;
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ return DAG.getNode(ISD::SPLAT_VECTOR_PARTS, SDLoc(N), N->getValueType(0), Lo,
+ Hi);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) {
+ // The value being shifted is legal, but the shift amount is too big.
+ // It follows that either the result of the shift is undefined, or the
+ // upper half of the shift amount is zero. Just use the lower half.
+ SDValue Lo, Hi;
+ GetExpandedInteger(N->getOperand(1), Lo, Hi);
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) {
+  // The argument of the RETURNADDR / FRAMEADDR builtins is a 32-bit constant,
+  // which is wider than the legal integer type on 8/16-bit targets. Just
+  // truncate the constant to a valid type.
+ SDValue Lo, Hi;
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ return SDValue(DAG.UpdateNodeOperands(N, Lo), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ EVT DstVT = N->getValueType(0);
+ RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Don't know how to expand this SINT_TO_FP!");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, DstVT, Op, CallOptions, SDLoc(N), Chain);
+
+ if (!IsStrict)
+ return Tmp.first;
+
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ ReplaceValueWith(SDValue(N, 0), Tmp.first);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
+ if (N->isAtomic()) {
+    // Targets typically support atomic swap/CAS at widths larger than their
+    // plain atomic store instructions, so lower the wide store to ATOMIC_SWAP.
+ SDLoc dl(N);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ N->getMemoryVT(),
+ N->getOperand(0), N->getOperand(2),
+ N->getOperand(1),
+ N->getMemOperand());
+ return Swap.getValue(1);
+ }
+ if (ISD::isNormalStore(N))
+ return ExpandOp_NormalStore(N, OpNo);
+
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+
+ EVT VT = N->getOperand(1).getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = N->getAAInfo();
+ SDLoc dl(N);
+ SDValue Lo, Hi;
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ if (N->getMemoryVT().bitsLE(NVT)) {
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+ return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
+ N->getMemoryVT(), N->getOriginalAlign(), MMOFlags,
+ AAInfo);
+ }
+
+ if (DAG.getDataLayout().isLittleEndian()) {
+ // Little-endian - low bits are at low addresses.
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+
+ Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
+ N->getOriginalAlign(), MMOFlags, AAInfo);
+
+ unsigned ExcessBits =
+ N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+ EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize));
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize),
+ NEVT, N->getOriginalAlign(), MMOFlags, AAInfo);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ }
+
+ // Big-endian - high bits are at low addresses. Favor aligned stores at
+ // the cost of some bit-fiddling.
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+
+ EVT ExtVT = N->getMemoryVT();
+ unsigned EBytes = ExtVT.getStoreSize();
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ unsigned ExcessBits = (EBytes - IncrementSize)*8;
+ EVT HiVT = EVT::getIntegerVT(*DAG.getContext(),
+ ExtVT.getSizeInBits() - ExcessBits);
+
+ if (ExcessBits < NVT.getSizeInBits()) {
+ // Transfer high bits from the top of Lo to the bottom of Hi.
+ Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi,
+ DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Hi = DAG.getNode(
+ ISD::OR, dl, NVT, Hi,
+ DAG.getNode(ISD::SRL, dl, NVT, Lo,
+ DAG.getConstant(ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout()))));
+ }
+
+ // Store both the high bits and maybe some of the low bits.
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT,
+ N->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Increment the pointer to the other half.
+ Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize));
+ // Store the lowest ExcessBits bits in the second half.
+ Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize),
+ EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
+ N->getOriginalAlign(), MMOFlags, AAInfo);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
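+
+// Worked example (illustrative, not upstream code): a truncating store with
+// memory type i48 whose value was expanded into two i32 halves (NVT = i32) on
+// a big-endian target gives EBytes = 6, IncrementSize = 4, ExcessBits = 16 and
+// HiVT = i32. Hi is shifted up by 16 and the top 16 bits of Lo are merged in
+// below it, that word is stored at the base address, and the remaining low 16
+// bits of Lo are stored as a truncating i16 store at offset 4.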
+
+SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) {
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+ // Just truncate the low part of the source.
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), InL);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+ EVT DstVT = N->getValueType(0);
+ RTLIB::Libcall LC = RTLIB::getUINTTOFP(Op.getValueType(), DstVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Don't know how to expand this UINT_TO_FP!");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, DstVT, Op, CallOptions, SDLoc(N), Chain);
+
+ if (!IsStrict)
+ return Tmp.first;
+
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ ReplaceValueWith(SDValue(N, 0), Tmp.first);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(N)->getMemoryVT(),
+ N->getOperand(0),
+ N->getOperand(1), N->getOperand(2),
+ cast<AtomicSDNode>(N)->getMemOperand());
+ return Swap.getValue(1);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
+ assert((N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD && OpNo == 3) ||
+ (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE && OpNo == 4));
+
+  SDValue Hi; // The upper half is dropped.
+ SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
+ GetExpandedInteger(NewOps[OpNo], NewOps[OpNo], Hi);
+
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
+ SDLoc dl(N);
+
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ SDValue V1 = GetPromotedInteger(N->getOperand(1));
+ EVT OutVT = V0.getValueType();
+
+ return DAG.getNode(ISD::VECTOR_SPLICE, dl, OutVT, V0, V1, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_INTERLEAVE_DEINTERLEAVE(SDNode *N) {
+ SDLoc dl(N);
+
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ SDValue V1 = GetPromotedInteger(N->getOperand(1));
+ EVT ResVT = V0.getValueType();
+ SDValue Res = DAG.getNode(N->getOpcode(), dl,
+ DAG.getVTList(ResVT, ResVT), V0, V1);
+ SetPromotedInteger(SDValue(N, 0), Res.getValue(0));
+ SetPromotedInteger(SDValue(N, 1), Res.getValue(1));
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
+
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+ EVT NOutVTElem = NOutVT.getVectorElementType();
+
+ SDLoc dl(N);
+ SDValue BaseIdx = N->getOperand(1);
+
+ // TODO: We may be able to use this for types other than scalable
+ // vectors and fix those tests that expect BUILD_VECTOR to be used
+ if (OutVT.isScalableVector()) {
+ SDValue InOp0 = N->getOperand(0);
+ EVT InVT = InOp0.getValueType();
+
+ // Try and extract from a smaller type so that it eventually falls
+ // into the promotion code below.
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector ||
+ getTypeAction(InVT) == TargetLowering::TypeLegal) {
+ EVT NInVT = InVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ unsigned NElts = NInVT.getVectorMinNumElements();
+ uint64_t IdxVal = cast<ConstantSDNode>(BaseIdx)->getZExtValue();
+
+ SDValue Step1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NInVT, InOp0,
+ DAG.getConstant(alignDown(IdxVal, NElts), dl,
+ BaseIdx.getValueType()));
+ SDValue Step2 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, OutVT, Step1,
+ DAG.getConstant(IdxVal % NElts, dl, BaseIdx.getValueType()));
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, Step2);
+ }
+
+ // Try and extract from a widened type.
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ SDValue Ops[] = {GetWidenedVector(InOp0), BaseIdx};
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), OutVT, Ops);
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, Ext);
+ }
+
+    // Promote the operands and see if this is handled by target lowering;
+    // otherwise, use the BUILD_VECTOR approach below.
+ if (getTypeAction(InVT) == TargetLowering::TypePromoteInteger) {
+ // Collect the (promoted) operands
+ SDValue Ops[] = { GetPromotedInteger(InOp0), BaseIdx };
+
+ EVT PromEltVT = Ops[0].getValueType().getVectorElementType();
+ assert(PromEltVT.bitsLE(NOutVTElem) &&
+ "Promoted operand has an element type greater than result");
+
+ EVT ExtVT = NOutVT.changeVectorElementType(PromEltVT);
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), ExtVT, Ops);
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, Ext);
+ }
+ }
+
+ if (OutVT.isScalableVector())
+ report_fatal_error("Unable to promote scalable types using BUILD_VECTOR");
+
+ SDValue InOp0 = N->getOperand(0);
+ if (getTypeAction(InOp0.getValueType()) == TargetLowering::TypePromoteInteger)
+ InOp0 = GetPromotedInteger(N->getOperand(0));
+
+ EVT InVT = InOp0.getValueType();
+
+ unsigned OutNumElems = OutVT.getVectorNumElements();
+ SmallVector<SDValue, 8> Ops;
+ Ops.reserve(OutNumElems);
+ for (unsigned i = 0; i != OutNumElems; ++i) {
+
+ // Extract the element from the original vector.
+ SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(),
+ BaseIdx, DAG.getConstant(i, dl, BaseIdx.getValueType()));
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ InVT.getVectorElementType(), N->getOperand(0), Index);
+
+ SDValue Op = DAG.getAnyExtOrTrunc(Ext, dl, NOutVTElem);
+ // Insert the converted element to the new vector.
+ Ops.push_back(Op);
+ }
+
+ return DAG.getBuildVector(NOutVT, dl, Ops);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_SUBVECTOR(SDNode *N) {
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+ SDLoc dl(N);
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+
+ EVT SubVecVT = SubVec.getValueType();
+ EVT NSubVT =
+ EVT::getVectorVT(*DAG.getContext(), NOutVT.getVectorElementType(),
+ SubVecVT.getVectorElementCount());
+
+ Vec = GetPromotedInteger(Vec);
+ SubVec = DAG.getNode(ISD::ANY_EXTEND, dl, NSubVT, SubVec);
+
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NOutVT, Vec, SubVec, Idx);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_REVERSE(SDNode *N) {
+ SDLoc dl(N);
+
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ EVT OutVT = V0.getValueType();
+
+ return DAG.getNode(ISD::VECTOR_REVERSE, dl, OutVT, V0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
+ ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ ArrayRef<int> NewMask = SV->getMask().slice(0, VT.getVectorNumElements());
+
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ SDValue V1 = GetPromotedInteger(N->getOperand(1));
+ EVT OutVT = V0.getValueType();
+
+ return DAG.getVectorShuffle(OutVT, dl, V0, V1, NewMask);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+ unsigned NumElems = N->getNumOperands();
+ EVT NOutVTElem = NOutVT.getVectorElementType();
+ TargetLoweringBase::BooleanContent NOutBoolType = TLI.getBooleanContents(NOutVT);
+ unsigned NOutExtOpc = TargetLowering::getExtendForContent(NOutBoolType);
+ SDLoc dl(N);
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.reserve(NumElems);
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Op = N->getOperand(i);
+ EVT OpVT = Op.getValueType();
+ // BUILD_VECTOR integer operand types are allowed to be larger than the
+ // result's element type. This may still be true after the promotion. For
+ // example, we might be promoting (<v?i1> = BV <i32>, <i32>, ...) to
+ // (v?i16 = BV <i32>, <i32>, ...), and we can't any_extend <i32> to <i16>.
+ if (OpVT.bitsLT(NOutVTElem)) {
+ unsigned ExtOpc = ISD::ANY_EXTEND;
+ // Attempt to extend constant bool vectors to match target's BooleanContent.
+ // While not necessary, this improves chances of the constant correctly
+ // folding with compare results (e.g. for NOT patterns).
+ if (OpVT == MVT::i1 && Op.getOpcode() == ISD::Constant)
+ ExtOpc = NOutExtOpc;
+ Op = DAG.getNode(ExtOpc, dl, NOutVTElem, Op);
+ }
+ Ops.push_back(Op);
+ }
+
+ return DAG.getBuildVector(NOutVT, dl, Ops);
+}
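+
+// Illustrative example (not upstream code): on a hypothetical target where
+// v4i1 promotes to v4i8 and whose BooleanContent is ZeroOrNegativeOne, a
+// BUILD_VECTOR whose operands are i1 constants sign-extends a true operand to
+// 0xFF rather than any-extending it, so the constant matches the all-ones
+// form that compares produce and folds with them (e.g. in NOT patterns).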
+
+SDValue DAGTypeLegalizer::PromoteIntRes_ScalarOp(SDNode *N) {
+
+ SDLoc dl(N);
+
+ assert(!N->getOperand(0).getValueType().isVector() &&
+ "Input must be a scalar");
+
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+ EVT NOutElemVT = NOutVT.getVectorElementType();
+
+ SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutElemVT, N->getOperand(0));
+
+ return DAG.getNode(N->getOpcode(), dl, NOutVT, Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_STEP_VECTOR(SDNode *N) {
+ SDLoc dl(N);
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isScalableVector() &&
+ "Type must be promoted to a scalable vector type");
+ const APInt &StepVal = N->getConstantOperandAPInt(0);
+ return DAG.getStepVector(dl, NOutVT,
+ StepVal.sext(NOutVT.getScalarSizeInBits()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
+ SDLoc dl(N);
+
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+ unsigned NumOperands = N->getNumOperands();
+ unsigned NumOutElem = NOutVT.getVectorMinNumElements();
+ EVT OutElemTy = NOutVT.getVectorElementType();
+ if (OutVT.isScalableVector()) {
+ // Find the largest promoted element type for each of the operands.
+ SDUse *MaxSizedValue = std::max_element(
+ N->op_begin(), N->op_end(), [](const SDValue &A, const SDValue &B) {
+ EVT AVT = A.getValueType().getVectorElementType();
+ EVT BVT = B.getValueType().getVectorElementType();
+ return AVT.getScalarSizeInBits() < BVT.getScalarSizeInBits();
+ });
+ EVT MaxElementVT = MaxSizedValue->getValueType().getVectorElementType();
+
+ // Then promote all vectors to the largest element type.
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned I = 0; I < NumOperands; ++I) {
+ SDValue Op = N->getOperand(I);
+ EVT OpVT = Op.getValueType();
+ if (getTypeAction(OpVT) == TargetLowering::TypePromoteInteger)
+ Op = GetPromotedInteger(Op);
+ else
+ assert(getTypeAction(OpVT) == TargetLowering::TypeLegal &&
+ "Unhandled legalization type");
+
+ if (OpVT.getVectorElementType().getScalarSizeInBits() <
+ MaxElementVT.getScalarSizeInBits())
+ Op = DAG.getAnyExtOrTrunc(Op, dl,
+ OpVT.changeVectorElementType(MaxElementVT));
+ Ops.push_back(Op);
+ }
+
+ // Do the CONCAT on the promoted type and finally truncate to (the promoted)
+ // NOutVT.
+ return DAG.getAnyExtOrTrunc(
+ DAG.getNode(ISD::CONCAT_VECTORS, dl,
+ OutVT.changeVectorElementType(MaxElementVT), Ops),
+ dl, NOutVT);
+ }
+
+ unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements();
+ assert(NumElem * NumOperands == NumOutElem &&
+ "Unexpected number of elements");
+
+ // Take the elements from the first vector.
+ SmallVector<SDValue, 8> Ops(NumOutElem);
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ SDValue Op = N->getOperand(i);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteInteger)
+ Op = GetPromotedInteger(Op);
+ EVT SclrTy = Op.getValueType().getVectorElementType();
+ assert(NumElem == Op.getValueType().getVectorNumElements() &&
+ "Unexpected number of elements");
+
+ for (unsigned j = 0; j < NumElem; ++j) {
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Op,
+ DAG.getVectorIdxConstant(j, dl));
+ Ops[i * NumElem + j] = DAG.getAnyExtOrTrunc(Ext, dl, OutElemTy);
+ }
+ }
+
+ return DAG.getBuildVector(NOutVT, dl, Ops);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ assert(NVT.isVector() && "This type must be promoted to a vector type");
+
+ SDLoc dl(N);
+
+  // For operands whose TypeAction is to promote, re-extend the promoted
+  // operand to match this node's extension kind (SIGN_EXTEND, ZERO_EXTEND or
+  // ANY_EXTEND) and then build a new *_EXTEND_VECTOR_INREG node of the
+  // promote-to type.
+ if (getTypeAction(N->getOperand(0).getValueType())
+ == TargetLowering::TypePromoteInteger) {
+ SDValue Promoted;
+
+ switch(N->getOpcode()) {
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ Promoted = SExtPromotedInteger(N->getOperand(0));
+ break;
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Promoted = ZExtPromotedInteger(N->getOperand(0));
+ break;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ Promoted = GetPromotedInteger(N->getOperand(0));
+ break;
+ default:
+ llvm_unreachable("Node has unexpected Opcode");
+ }
+ return DAG.getNode(N->getOpcode(), dl, NVT, Promoted);
+ }
+
+ // Directly extend to the appropriate transform-to type.
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+ EVT NOutVTElem = NOutVT.getVectorElementType();
+
+ SDLoc dl(N);
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+
+ SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl,
+ NOutVTElem, N->getOperand(1));
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NOutVT,
+ V0, ConvElem, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECREDUCE(SDNode *N) {
+ // The VECREDUCE result size may be larger than the element size, so
+ // we can simply change the result type.
+ SDLoc dl(N);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->ops());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VP_REDUCE(SDNode *N) {
+ // The VP_REDUCE result size may be larger than the element size, so we can
+ // simply change the result type. However the start value and result must be
+ // the same.
+ SDLoc DL(N);
+ SDValue Start = PromoteIntOpVectorReduction(N, N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), DL, Start.getValueType(), Start,
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDLoc dl(N);
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ SDValue V1 = DAG.getZExtOrTrunc(N->getOperand(1), dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ V0->getValueType(0).getScalarType(), V0, V1);
+
+  // EXTRACT_VECTOR_ELT can return types which are wider than the incoming
+  // element types. If this is the case then we need to extend the outgoing
+  // value and not truncate it.
+ return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_SUBVECTOR(SDNode *N) {
+ SDLoc dl(N);
+ // The result type is equal to the first input operand's type, so the
+ // type that needs promoting must be the second source vector.
+ SDValue V0 = N->getOperand(0);
+ SDValue V1 = GetPromotedInteger(N->getOperand(1));
+ SDValue Idx = N->getOperand(2);
+ EVT PromVT = EVT::getVectorVT(*DAG.getContext(),
+ V1.getValueType().getVectorElementType(),
+ V0.getValueType().getVectorElementCount());
+ V0 = DAG.getAnyExtOrTrunc(V0, dl, PromVT);
+ SDValue Ext = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, PromVT, V0, V1, Idx);
+ return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) {
+ SDLoc dl(N);
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ MVT InVT = V0.getValueType().getSimpleVT();
+ MVT OutVT = MVT::getVectorVT(InVT.getVectorElementType(),
+ N->getValueType(0).getVectorNumElements());
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, V0, N->getOperand(1));
+ return DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), Ext);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
+ SDLoc dl(N);
+
+ EVT ResVT = N->getValueType(0);
+ unsigned NumElems = N->getNumOperands();
+
+ if (ResVT.isScalableVector()) {
+ SDValue ResVec = DAG.getUNDEF(ResVT);
+
+ for (unsigned OpIdx = 0; OpIdx < NumElems; ++OpIdx) {
+ SDValue Op = N->getOperand(OpIdx);
+ unsigned OpNumElts = Op.getValueType().getVectorMinNumElements();
+ ResVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ResVec, Op,
+ DAG.getIntPtrConstant(OpIdx * OpNumElts, dl));
+ }
+
+ return ResVec;
+ }
+
+ EVT RetSclrTy = N->getValueType(0).getVectorElementType();
+
+ SmallVector<SDValue, 8> NewOps;
+ NewOps.reserve(NumElems);
+
+ // For each incoming vector
+ for (unsigned VecIdx = 0; VecIdx != NumElems; ++VecIdx) {
+ SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx));
+ EVT SclrTy = Incoming->getValueType(0).getVectorElementType();
+ unsigned NumElem = Incoming->getValueType(0).getVectorNumElements();
+
+ for (unsigned i=0; i<NumElem; ++i) {
+ // Extract element from incoming vector
+ SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Incoming,
+ DAG.getVectorIdxConstant(i, dl));
+ SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex);
+ NewOps.push_back(Tr);
+ }
+ }
+
+ return DAG.getBuildVector(N->getValueType(0), dl, NewOps);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
+ assert(OpNo > 1);
+ SDValue Op = N->getOperand(OpNo);
+
+ // FIXME: Non-constant operands are not yet handled:
+ // - https://github.com/llvm/llvm-project/issues/26431
+ // - https://github.com/llvm/llvm-project/issues/55957
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op);
+ if (!CN)
+ return SDValue();
+
+ // Copy operands before the one being expanded.
+ SmallVector<SDValue> NewOps;
+ for (unsigned I = 0; I < OpNo; I++)
+ NewOps.push_back(N->getOperand(I));
+
+ EVT Ty = Op.getValueType();
+ SDLoc DL = SDLoc(N);
+ if (CN->getConstantIntValue()->getValue().getActiveBits() < 64) {
+ NewOps.push_back(
+ DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
+ NewOps.push_back(DAG.getTargetConstant(CN->getZExtValue(), DL, Ty));
+ } else {
+ // FIXME: https://github.com/llvm/llvm-project/issues/55609
+ return SDValue();
+ }
+
+ // Copy remaining operands.
+ for (unsigned I = OpNo + 1; I < N->getNumOperands(); I++)
+ NewOps.push_back(N->getOperand(I));
+
+ SDValue NewNode = DAG.getNode(N->getOpcode(), DL, N->getVTList(), NewOps);
+
+ for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++)
+ ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum));
+
+ return SDValue(); // Signal that we have replaced the node already.
+}
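+
+// Encoding note (illustrative, not upstream code): an expanded-integer
+// stackmap operand that is a small constant, e.g. an i128 with value 7, is
+// rewritten into the operand pair (StackMaps::ConstantOp, 7) instead of being
+// split into halves; constants with 64 or more active bits and non-constant
+// operands are not yet handled (see the FIXMEs above).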
+
+SDValue DAGTypeLegalizer::ExpandIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
+ assert(OpNo >= 7);
+ SDValue Op = N->getOperand(OpNo);
+
+ // FIXME: Non-constant operands are not yet handled:
+ // - https://github.com/llvm/llvm-project/issues/26431
+ // - https://github.com/llvm/llvm-project/issues/55957
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op);
+ if (!CN)
+ return SDValue();
+
+ // Copy operands before the one being expanded.
+ SmallVector<SDValue> NewOps;
+ for (unsigned I = 0; I < OpNo; I++)
+ NewOps.push_back(N->getOperand(I));
+
+ EVT Ty = Op.getValueType();
+ SDLoc DL = SDLoc(N);
+ if (CN->getConstantIntValue()->getValue().getActiveBits() < 64) {
+ NewOps.push_back(
+ DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
+ NewOps.push_back(DAG.getTargetConstant(CN->getZExtValue(), DL, Ty));
+ } else {
+ // FIXME: https://github.com/llvm/llvm-project/issues/55609
+ return SDValue();
+ }
+
+ // Copy remaining operands.
+ for (unsigned I = OpNo + 1; I < N->getNumOperands(); I++)
+ NewOps.push_back(N->getOperand(I));
+
+ SDValue NewNode = DAG.getNode(N->getOpcode(), DL, N->getVTList(), NewOps);
+
+ for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++)
+ ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum));
+
+ return SDValue(); // Signal that we have replaced the node already.
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
new file mode 100644
index 000000000000..328939e44dcb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -0,0 +1,1060 @@
+//===-- LegalizeTypes.cpp - Common code for DAG type legalizer ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeTypes method. It transforms
+// an arbitrary well-formed SelectionDAG to only consist of legal types. This
+// is common code shared among the LegalizeTypes*.cpp files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+static cl::opt<bool>
+EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden);
+
+/// Do extensive, expensive, basic correctness checking.
+void DAGTypeLegalizer::PerformExpensiveChecks() {
+ // If a node is not processed, then none of its values should be mapped by any
+ // of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+
+ // If a node is processed, then each value with an illegal type must be mapped
+ // by exactly one of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+ // Values with a legal type may be mapped by ReplacedValues, but not by any of
+ // the other maps.
+
+ // Note that these invariants may not hold momentarily when processing a node:
+ // the node being processed may be put in a map before being marked Processed.
+
+ // Note that it is possible to have nodes marked NewNode in the DAG. This can
+ // occur in two ways. Firstly, a node may be created during legalization but
+ // never passed to the legalization core. This is usually due to the implicit
+ // folding that occurs when using the DAG.getNode operators. Secondly, a new
+ // node may be passed to the legalization core, but when analyzed may morph
+ // into a different node, leaving the original node as a NewNode in the DAG.
+ // A node may morph if one of its operands changes during analysis. Whether
+ // it actually morphs or not depends on whether, after updating its operands,
+ // it is equivalent to an existing node: if so, it morphs into that existing
+ // node (CSE). An operand can change during analysis if the operand is a new
+ // node that morphs, or it is a processed value that was mapped to some other
+ // value (as recorded in ReplacedValues) in which case the operand is turned
+ // into that other value. If a node morphs then the node it morphed into will
+ // be used instead of it for legalization, however the original node continues
+ // to live on in the DAG.
+ // The conclusion is that though there may be nodes marked NewNode in the DAG,
+ // all uses of such nodes are also marked NewNode: the result is a fungus of
+ // NewNodes growing on top of the useful nodes, and perhaps using them, but
+ // not used by them.
+
+ // If a value is mapped by ReplacedValues, then it must have no uses, except
+ // by nodes marked NewNode (see above).
+
+ // The final node obtained by mapping by ReplacedValues is not marked NewNode.
+ // Note that ReplacedValues should be applied iteratively.
+
+ // Note that the ReplacedValues map may also map deleted nodes (by iterating
+ // over the DAG we never dereference deleted nodes). This means that it may
+ // also map nodes marked NewNode if the deallocated memory was reallocated as
+ // another node, and that new node was not seen by the LegalizeTypes machinery
+ // (for example because it was created but not used). In general, we cannot
+ // distinguish between new nodes and deleted nodes.
+ SmallVector<SDNode*, 16> NewNodes;
+ for (SDNode &Node : DAG.allnodes()) {
+ // Remember nodes marked NewNode - they are subject to extra checking below.
+ if (Node.getNodeId() == NewNode)
+ NewNodes.push_back(&Node);
+
+ for (unsigned i = 0, e = Node.getNumValues(); i != e; ++i) {
+ SDValue Res(&Node, i);
+ bool Failed = false;
+ // Don't create a value in map.
+ auto ResId = ValueToIdMap.lookup(Res);
+
+ unsigned Mapped = 0;
+ if (ResId) {
+ auto I = ReplacedValues.find(ResId);
+ if (I != ReplacedValues.end()) {
+ Mapped |= 1;
+ // Check that remapped values are only used by nodes marked NewNode.
+ for (SDNode::use_iterator UI = Node.use_begin(), UE = Node.use_end();
+ UI != UE; ++UI)
+ if (UI.getUse().getResNo() == i)
+ assert(UI->getNodeId() == NewNode &&
+ "Remapped value has non-trivial use!");
+
+ // Check that the final result of applying ReplacedValues is not
+ // marked NewNode.
+ auto NewValId = I->second;
+ I = ReplacedValues.find(NewValId);
+ while (I != ReplacedValues.end()) {
+ NewValId = I->second;
+ I = ReplacedValues.find(NewValId);
+ }
+ SDValue NewVal = getSDValue(NewValId);
+ (void)NewVal;
+ assert(NewVal.getNode()->getNodeId() != NewNode &&
+ "ReplacedValues maps to a new node!");
+ }
+ if (PromotedIntegers.count(ResId))
+ Mapped |= 2;
+ if (SoftenedFloats.count(ResId))
+ Mapped |= 4;
+ if (ScalarizedVectors.count(ResId))
+ Mapped |= 8;
+ if (ExpandedIntegers.count(ResId))
+ Mapped |= 16;
+ if (ExpandedFloats.count(ResId))
+ Mapped |= 32;
+ if (SplitVectors.count(ResId))
+ Mapped |= 64;
+ if (WidenedVectors.count(ResId))
+ Mapped |= 128;
+ if (PromotedFloats.count(ResId))
+ Mapped |= 256;
+ if (SoftPromotedHalfs.count(ResId))
+ Mapped |= 512;
+ }
+
+ if (Node.getNodeId() != Processed) {
+ // Since we allow ReplacedValues to map deleted nodes, it may map nodes
+ // marked NewNode too, since a deleted node may have been reallocated as
+ // another node that has not been seen by the LegalizeTypes machinery.
+ if ((Node.getNodeId() == NewNode && Mapped > 1) ||
+ (Node.getNodeId() != NewNode && Mapped != 0)) {
+ dbgs() << "Unprocessed value in a map!";
+ Failed = true;
+ }
+ } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(&Node)) {
+ if (Mapped > 1) {
+ dbgs() << "Value with legal type was transformed!";
+ Failed = true;
+ }
+ } else {
+ if (Mapped == 0) {
+ SDValue NodeById = IdToValueMap.lookup(ResId);
+ // It is possible the node has been remapped to another node and had
+ // its Id updated in the Value to Id table. The node it remapped to
+ // may not have been processed yet. Look up the Id in the Id to Value
+ // table and re-check the Processed state. If the node hasn't been
+ // remapped we'll get the same state as we got earlier.
+ if (NodeById->getNodeId() == Processed) {
+ dbgs() << "Processed value not in any map!";
+ Failed = true;
+ }
+ } else if (Mapped & (Mapped - 1)) {
+ dbgs() << "Value in multiple maps!";
+ Failed = true;
+ }
+ }
+
+ if (Failed) {
+ if (Mapped & 1)
+ dbgs() << " ReplacedValues";
+ if (Mapped & 2)
+ dbgs() << " PromotedIntegers";
+ if (Mapped & 4)
+ dbgs() << " SoftenedFloats";
+ if (Mapped & 8)
+ dbgs() << " ScalarizedVectors";
+ if (Mapped & 16)
+ dbgs() << " ExpandedIntegers";
+ if (Mapped & 32)
+ dbgs() << " ExpandedFloats";
+ if (Mapped & 64)
+ dbgs() << " SplitVectors";
+ if (Mapped & 128)
+ dbgs() << " WidenedVectors";
+ if (Mapped & 256)
+ dbgs() << " PromotedFloats";
+ if (Mapped & 512)
+ dbgs() << " SoftPromoteHalfs";
+ dbgs() << "\n";
+ llvm_unreachable(nullptr);
+ }
+ }
+ }
+
+#ifndef NDEBUG
+  // Check that NewNodes are only used by other NewNodes.
+ for (unsigned i = 0, e = NewNodes.size(); i != e; ++i) {
+ SDNode *N = NewNodes[i];
+ for (SDNode *U : N->uses())
+ assert(U->getNodeId() == NewNode && "NewNode used by non-NewNode!");
+ }
+#endif
+}
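+
+// Illustrative example (not upstream code): a processed value with an illegal
+// type that is recorded in both PromotedIntegers (bit value 2) and
+// ExpandedIntegers (bit value 16) produces Mapped == 18; since more than one
+// bit is set, the "Value in multiple maps!" diagnostic above fires and the
+// names of the offending maps are printed.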
+
+/// This is the main entry point for the type legalizer. This does a top-down
+/// traversal of the dag, legalizing types as it goes. Returns "true" if it made
+/// any changes.
+bool DAGTypeLegalizer::run() {
+ bool Changed = false;
+
+ // Create a dummy node (which is not added to allnodes), that adds a reference
+ // to the root node, preventing it from being deleted, and tracking any
+ // changes of the root.
+ HandleSDNode Dummy(DAG.getRoot());
+ Dummy.setNodeId(Unanalyzed);
+
+ // The root of the dag may dangle to deleted nodes until the type legalizer is
+ // done. Set it to null to avoid confusion.
+ DAG.setRoot(SDValue());
+
+ // Walk all nodes in the graph, assigning them a NodeId of 'ReadyToProcess'
+ // (and remembering them) if they are leaves and assigning 'Unanalyzed' if
+ // non-leaves.
+ for (SDNode &Node : DAG.allnodes()) {
+ if (Node.getNumOperands() == 0) {
+ Node.setNodeId(ReadyToProcess);
+ Worklist.push_back(&Node);
+ } else {
+ Node.setNodeId(Unanalyzed);
+ }
+ }
+
+ // Now that we have a set of nodes to process, handle them all.
+ while (!Worklist.empty()) {
+#ifndef EXPENSIVE_CHECKS
+ if (EnableExpensiveChecks)
+#endif
+ PerformExpensiveChecks();
+
+ SDNode *N = Worklist.pop_back_val();
+ assert(N->getNodeId() == ReadyToProcess &&
+ "Node should be ready if on worklist!");
+
+ LLVM_DEBUG(dbgs() << "Legalizing node: "; N->dump(&DAG));
+ if (IgnoreNodeResults(N)) {
+ LLVM_DEBUG(dbgs() << "Ignoring node results\n");
+ goto ScanOperands;
+ }
+
+ // Scan the values produced by the node, checking to see if any result
+ // types are illegal.
+ for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) {
+ EVT ResultVT = N->getValueType(i);
+ LLVM_DEBUG(dbgs() << "Analyzing result type: " << ResultVT << "\n");
+ switch (getTypeAction(ResultVT)) {
+ case TargetLowering::TypeLegal:
+ LLVM_DEBUG(dbgs() << "Legal result type\n");
+ break;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error(
+ "Scalarization of scalable vectors is not supported.");
+ // The following calls must take care of *all* of the node's results,
+ // not just the illegal result they were passed (this includes results
+ // with a legal type). Results can be remapped using ReplaceValueWith,
+ // or their promoted/expanded/etc values registered in PromotedIntegers,
+ // ExpandedIntegers etc.
+ case TargetLowering::TypePromoteInteger:
+ PromoteIntegerResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeExpandInteger:
+ ExpandIntegerResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeSoftenFloat:
+ SoftenFloatResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeExpandFloat:
+ ExpandFloatResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeScalarizeVector:
+ ScalarizeVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeSplitVector:
+ SplitVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeWidenVector:
+ WidenVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypePromoteFloat:
+ PromoteFloatResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeSoftPromoteHalf:
+ SoftPromoteHalfResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ }
+ }
+
+ScanOperands:
+ // Scan the operand list for the node, handling any nodes with operands that
+ // are illegal.
+ {
+ unsigned NumOperands = N->getNumOperands();
+ bool NeedsReanalyzing = false;
+ unsigned i;
+ for (i = 0; i != NumOperands; ++i) {
+ if (IgnoreNodeResults(N->getOperand(i).getNode()))
+ continue;
+
+ const auto &Op = N->getOperand(i);
+ LLVM_DEBUG(dbgs() << "Analyzing operand: "; Op.dump(&DAG));
+ EVT OpVT = Op.getValueType();
+ switch (getTypeAction(OpVT)) {
+ case TargetLowering::TypeLegal:
+ LLVM_DEBUG(dbgs() << "Legal operand\n");
+ continue;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error(
+ "Scalarization of scalable vectors is not supported.");
+ // The following calls must either replace all of the node's results
+ // using ReplaceValueWith, and return "false"; or update the node's
+ // operands in place, and return "true".
+ case TargetLowering::TypePromoteInteger:
+ NeedsReanalyzing = PromoteIntegerOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeExpandInteger:
+ NeedsReanalyzing = ExpandIntegerOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeSoftenFloat:
+ NeedsReanalyzing = SoftenFloatOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeExpandFloat:
+ NeedsReanalyzing = ExpandFloatOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeScalarizeVector:
+ NeedsReanalyzing = ScalarizeVectorOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeSplitVector:
+ NeedsReanalyzing = SplitVectorOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeWidenVector:
+ NeedsReanalyzing = WidenVectorOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypePromoteFloat:
+ NeedsReanalyzing = PromoteFloatOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeSoftPromoteHalf:
+ NeedsReanalyzing = SoftPromoteHalfOperand(N, i);
+ Changed = true;
+ break;
+ }
+ break;
+ }
+
+ // The sub-method updated N in place. Check to see if any operands are new,
+ // and if so, mark them. If the node needs revisiting, don't add all users
+ // to the worklist etc.
+ if (NeedsReanalyzing) {
+ assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+
+ N->setNodeId(NewNode);
+ // Recompute the NodeId and correct processed operands, adding the node to
+ // the worklist if ready.
+ SDNode *M = AnalyzeNewNode(N);
+ if (M == N)
+ // The node didn't morph - nothing special to do, it will be revisited.
+ continue;
+
+ // The node morphed - this is equivalent to legalizing by replacing every
+ // value of N with the corresponding value of M. So do that now.
+ assert(N->getNumValues() == M->getNumValues() &&
+ "Node morphing changed the number of results!");
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+ // Replacing the value takes care of remapping the new value.
+ ReplaceValueWith(SDValue(N, i), SDValue(M, i));
+ assert(N->getNodeId() == NewNode && "Unexpected node state!");
+ // The node continues to live on as part of the NewNode fungus that
+ // grows on top of the useful nodes. Nothing more needs to be done
+ // with it - move on to the next node.
+ continue;
+ }
+
+ if (i == NumOperands) {
+ LLVM_DEBUG(dbgs() << "Legally typed node: "; N->dump(&DAG);
+ dbgs() << "\n");
+ }
+ }
+NodeDone:
+
+ // If we reach here, the node was processed, potentially creating new nodes.
+ // Mark it as processed and add its users to the worklist as appropriate.
+ assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+ N->setNodeId(Processed);
+
+ for (SDNode *User : N->uses()) {
+ int NodeId = User->getNodeId();
+
+ // This node has two options: it can either be a new node or its Node ID
+ // may be a count of the number of operands it has that are not ready.
+ if (NodeId > 0) {
+ User->setNodeId(NodeId-1);
+
+ // If this was the last use it was waiting on, add it to the ready list.
+ if (NodeId-1 == ReadyToProcess)
+ Worklist.push_back(User);
+ continue;
+ }
+
+ // If this is an unreachable new node, then ignore it. If it ever becomes
+ // reachable by being used by a newly created node then it will be handled
+ // by AnalyzeNewNode.
+ if (NodeId == NewNode)
+ continue;
+
+      // Otherwise, this node is new: this is the first of its operands to
+      // become ready. Its new NodeId is the number of operands it has minus 1
+      // (since the operand that was just processed no longer counts).
+ assert(NodeId == Unanalyzed && "Unknown node ID!");
+ User->setNodeId(User->getNumOperands() - 1);
+
+ // If the node only has a single operand, it is now ready.
+ if (User->getNumOperands() == 1)
+ Worklist.push_back(User);
+ }
+ }
+
+#ifndef EXPENSIVE_CHECKS
+ if (EnableExpensiveChecks)
+#endif
+ PerformExpensiveChecks();
+
+ // If the root changed (e.g. it was a dead load) update the root.
+ DAG.setRoot(Dummy.getValue());
+
+ // Remove dead nodes. This is important to do for cleanliness but also before
+ // the checking loop below. Implicit folding by the DAG.getNode operators and
+ // node morphing can cause unreachable nodes to be around with their flags set
+ // to new.
+ DAG.RemoveDeadNodes();
+
+ // In a debug build, scan all the nodes to make sure we found them all. This
+ // ensures that there are no cycles and that everything got processed.
+#ifndef NDEBUG
+ for (SDNode &Node : DAG.allnodes()) {
+ bool Failed = false;
+
+ // Check that all result types are legal.
+ if (!IgnoreNodeResults(&Node))
+ for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i)
+ if (!isTypeLegal(Node.getValueType(i))) {
+ dbgs() << "Result type " << i << " illegal: ";
+ Node.dump(&DAG);
+ Failed = true;
+ }
+
+ // Check that all operand types are legal.
+ for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i)
+ if (!IgnoreNodeResults(Node.getOperand(i).getNode()) &&
+ !isTypeLegal(Node.getOperand(i).getValueType())) {
+ dbgs() << "Operand type " << i << " illegal: ";
+ Node.getOperand(i).dump(&DAG);
+ Failed = true;
+ }
+
+ if (Node.getNodeId() != Processed) {
+ if (Node.getNodeId() == NewNode)
+ dbgs() << "New node not analyzed?\n";
+ else if (Node.getNodeId() == Unanalyzed)
+ dbgs() << "Unanalyzed node not noticed?\n";
+ else if (Node.getNodeId() > 0)
+ dbgs() << "Operand not processed?\n";
+ else if (Node.getNodeId() == ReadyToProcess)
+ dbgs() << "Not added to worklist?\n";
+ Failed = true;
+ }
+
+ if (Failed) {
+ Node.dump(&DAG); dbgs() << "\n";
+ llvm_unreachable(nullptr);
+ }
+ }
+#endif
+
+ return Changed;
+}
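+
+// Scheduling sketch (illustrative, not upstream code): a node's positive
+// NodeId counts how many of its operands are still unprocessed. For a node
+// %c = (add i64 %a, %b), the id starts as Unanalyzed; when %a is processed it
+// becomes 1, and when %b is processed it drops to 0 (ReadyToProcess) and %c is
+// pushed onto the worklist.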
+
+/// The specified node is the root of a subtree of potentially new nodes.
+/// Correct any processed operands (this may change the node) and calculate the
+/// NodeId. If the node itself changes to a processed node, it is not remapped -
+/// the caller needs to take care of this. Returns the potentially changed node.
+SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
+ // If this was an existing node that is already done, we're done.
+ if (N->getNodeId() != NewNode && N->getNodeId() != Unanalyzed)
+ return N;
+
+ // Okay, we know that this node is new. Recursively walk all of its operands
+ // to see if they are new also. The depth of this walk is bounded by the size
+ // of the new tree that was constructed (usually 2-3 nodes), so we don't worry
+ // about revisiting of nodes.
+ //
+ // As we walk the operands, keep track of the number of nodes that are
+ // processed. If non-zero, this will become the new nodeid of this node.
+ // Operands may morph when they are analyzed. If so, the node will be
+ // updated after all operands have been analyzed. Since this is rare,
+ // the code tries to minimize overhead in the non-morphing case.
+
+ std::vector<SDValue> NewOps;
+ unsigned NumProcessed = 0;
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDValue OrigOp = N->getOperand(i);
+ SDValue Op = OrigOp;
+
+ AnalyzeNewValue(Op); // Op may morph.
+
+ if (Op.getNode()->getNodeId() == Processed)
+ ++NumProcessed;
+
+ if (!NewOps.empty()) {
+ // Some previous operand changed. Add this one to the list.
+ NewOps.push_back(Op);
+ } else if (Op != OrigOp) {
+ // This is the first operand to change - add all operands so far.
+ NewOps.insert(NewOps.end(), N->op_begin(), N->op_begin() + i);
+ NewOps.push_back(Op);
+ }
+ }
+
+ // Some operands changed - update the node.
+ if (!NewOps.empty()) {
+ SDNode *M = DAG.UpdateNodeOperands(N, NewOps);
+ if (M != N) {
+ // The node morphed into a different node. Normally for this to happen
+ // the original node would have to be marked NewNode. However this can
+ // in theory momentarily not be the case while ReplaceValueWith is doing
+ // its stuff. Mark the original node NewNode to help basic correctness
+ // checking.
+ N->setNodeId(NewNode);
+ if (M->getNodeId() != NewNode && M->getNodeId() != Unanalyzed)
+ // It morphed into a previously analyzed node - nothing more to do.
+ return M;
+
+ // It morphed into a different new node. Do the equivalent of passing
+ // it to AnalyzeNewNode: expunge it and calculate the NodeId. No need
+ // to remap the operands, since they are the same as the operands we
+ // remapped above.
+ N = M;
+ }
+ }
+
+ // Calculate the NodeId.
+ N->setNodeId(N->getNumOperands() - NumProcessed);
+ if (N->getNodeId() == ReadyToProcess)
+ Worklist.push_back(N);
+
+ return N;
+}
+
+/// Call AnalyzeNewNode, updating the node in Val if needed.
+/// If the node changes to a processed node, then remap it.
+void DAGTypeLegalizer::AnalyzeNewValue(SDValue &Val) {
+ Val.setNode(AnalyzeNewNode(Val.getNode()));
+ if (Val.getNode()->getNodeId() == Processed)
+ // We were passed a processed node, or it morphed into one - remap it.
+ RemapValue(Val);
+}
+
+/// If the specified value was already legalized to another value,
+/// replace it by that value.
+void DAGTypeLegalizer::RemapValue(SDValue &V) {
+ auto Id = getTableId(V);
+ V = getSDValue(Id);
+}
+
+void DAGTypeLegalizer::RemapId(TableId &Id) {
+ auto I = ReplacedValues.find(Id);
+ if (I != ReplacedValues.end()) {
+ assert(Id != I->second && "Id is mapped to itself.");
+ // Use path compression to speed up future lookups if values get multiply
+ // replaced with other values.
+ RemapId(I->second);
+ Id = I->second;
+
+    // Note that for N = IdToValueMap[Id] it is possible to have
+    // N.getNode()->getNodeId() == NewNode at this point, because a node may be
+    // put in the map before it is processed.
+ }
+}
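+
+// Path-compression sketch (illustrative, not upstream code): if ReplacedValues
+// contains the chain A -> B and B -> C, the first RemapId(A) recurses through
+// B to reach C and rewrites both entries to point directly at C, so later
+// lookups of either id resolve in a single step.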
+
+namespace {
+ /// This class is a DAGUpdateListener that listens for updates to nodes and
+ /// recomputes their ready state.
+ class NodeUpdateListener : public SelectionDAG::DAGUpdateListener {
+ DAGTypeLegalizer &DTL;
+ SmallSetVector<SDNode*, 16> &NodesToAnalyze;
+ public:
+ explicit NodeUpdateListener(DAGTypeLegalizer &dtl,
+ SmallSetVector<SDNode*, 16> &nta)
+ : SelectionDAG::DAGUpdateListener(dtl.getDAG()),
+ DTL(dtl), NodesToAnalyze(nta) {}
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+ N->getNodeId() != DAGTypeLegalizer::Processed &&
+ "Invalid node ID for RAUW deletion!");
+ // It is possible, though rare, for the deleted node N to occur as a
+ // target in a map, so note the replacement N -> E in ReplacedValues.
+ assert(E && "Node not replaced?");
+ DTL.NoteDeletion(N, E);
+
+ // In theory the deleted node could also have been scheduled for analysis.
+ // So remove it from the set of nodes which will be analyzed.
+ NodesToAnalyze.remove(N);
+
+ // In general nothing needs to be done for E, since it didn't change but
+ // only gained new uses. However N -> E was just added to ReplacedValues,
+ // and the result of a ReplacedValues mapping is not allowed to be marked
+ // NewNode. So if E is marked NewNode, then it needs to be analyzed.
+ if (E->getNodeId() == DAGTypeLegalizer::NewNode)
+ NodesToAnalyze.insert(E);
+ }
+
+ void NodeUpdated(SDNode *N) override {
+      // Node updates can mean pretty much anything. It is possible that an
+      // operand was set to something already processed (for example), in which
+      // case this node could become ready. Recompute its flags.
+ assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+ N->getNodeId() != DAGTypeLegalizer::Processed &&
+ "Invalid node ID for RAUW deletion!");
+ N->setNodeId(DAGTypeLegalizer::NewNode);
+ NodesToAnalyze.insert(N);
+ }
+ };
+}
+
+
+/// The specified value was legalized to the specified other value.
+/// Update the DAG and NodeIds replacing any uses of From to use To instead.
+void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
+ assert(From.getNode() != To.getNode() && "Potential legalization loop!");
+
+ // If expansion produced new nodes, make sure they are properly marked.
+ AnalyzeNewValue(To);
+
+ // Anything that used the old node should now use the new one. Note that this
+ // can potentially cause recursive merging.
+ SmallSetVector<SDNode*, 16> NodesToAnalyze;
+ NodeUpdateListener NUL(*this, NodesToAnalyze);
+ do {
+
+ // The old node may be present in a map like ExpandedIntegers or
+ // PromotedIntegers. Inform maps about the replacement.
+ auto FromId = getTableId(From);
+ auto ToId = getTableId(To);
+
+ if (FromId != ToId)
+ ReplacedValues[FromId] = ToId;
+ DAG.ReplaceAllUsesOfValueWith(From, To);
+
+ // Process the list of nodes that need to be reanalyzed.
+ while (!NodesToAnalyze.empty()) {
+ SDNode *N = NodesToAnalyze.pop_back_val();
+ if (N->getNodeId() != DAGTypeLegalizer::NewNode)
+ // The node was analyzed while reanalyzing an earlier node - it is safe
+ // to skip. Note that this is not a morphing node - otherwise it would
+ // still be marked NewNode.
+ continue;
+
+ // Analyze the node's operands and recalculate the node ID.
+ SDNode *M = AnalyzeNewNode(N);
+ if (M != N) {
+ // The node morphed into a different node. Make everyone use the new
+ // node instead.
+ assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!");
+ assert(N->getNumValues() == M->getNumValues() &&
+ "Node morphing changed the number of results!");
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ SDValue OldVal(N, i);
+ SDValue NewVal(M, i);
+ if (M->getNodeId() == Processed)
+ RemapValue(NewVal);
+ // OldVal may be a target of the ReplacedValues map which was marked
+ // NewNode to force reanalysis because it was updated. Ensure that
+ // anything that ReplacedValues mapped to OldVal will now be mapped
+ // all the way to NewVal.
+ auto OldValId = getTableId(OldVal);
+ auto NewValId = getTableId(NewVal);
+ DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal);
+ if (OldValId != NewValId)
+ ReplacedValues[OldValId] = NewValId;
+ }
+ // The original node continues to exist in the DAG, marked NewNode.
+ }
+ }
+    // When recursively updating nodes with new nodes, it is possible to have
+    // new uses of From due to CSE. If this happens, replace the new uses of
+    // From with To.
+ } while (!From.use_empty());
+}
+
+void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ "Invalid type for promoted integer");
+ AnalyzeNewValue(Result);
+
+ auto &OpIdEntry = PromotedIntegers[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already promoted!");
+ OpIdEntry = getTableId(Result);
+
+ DAG.transferDbgValues(Op, Result);
+}
+
+void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
+#ifndef NDEBUG
+ EVT VT = Result.getValueType();
+ LLVMContext &Ctx = *DAG.getContext();
+ assert((VT == EVT::getIntegerVT(Ctx, 80) ||
+ VT == TLI.getTypeToTransformTo(Ctx, Op.getValueType())) &&
+ "Invalid type for softened float");
+#endif
+ AnalyzeNewValue(Result);
+
+ auto &OpIdEntry = SoftenedFloats[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already converted to integer!");
+ OpIdEntry = getTableId(Result);
+}
+
+void DAGTypeLegalizer::SetPromotedFloat(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ "Invalid type for promoted float");
+ AnalyzeNewValue(Result);
+
+ auto &OpIdEntry = PromotedFloats[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already promoted!");
+ OpIdEntry = getTableId(Result);
+}
+
+void DAGTypeLegalizer::SetSoftPromotedHalf(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() == MVT::i16 &&
+ "Invalid type for soft-promoted half");
+ AnalyzeNewValue(Result);
+
+ auto &OpIdEntry = SoftPromotedHalfs[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already promoted!");
+ OpIdEntry = getTableId(Result);
+}
+
+void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
+ // Note that in some cases vector operation operands may be greater than
+ // the vector element type. For example BUILD_VECTOR of type <1 x i1> with
+ // a constant i8 operand.
+
+ // We don't currently support the scalarization of scalable vector types.
+ assert(Result.getValueSizeInBits().getFixedValue() >=
+ Op.getScalarValueSizeInBits() &&
+ "Invalid type for scalarized vector");
+ AnalyzeNewValue(Result);
+
+ auto &OpIdEntry = ScalarizedVectors[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already scalarized!");
+ OpIdEntry = getTableId(Result);
+}
+
+void DAGTypeLegalizer::GetExpandedInteger(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<TableId, TableId> &Entry = ExpandedIntegers[getTableId(Op)];
+ assert((Entry.first != 0) && "Operand isn't expanded");
+ Lo = getSDValue(Entry.first);
+ Hi = getSDValue(Entry.second);
+}
+
+void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+ assert(Lo.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ Hi.getValueType() == Lo.getValueType() &&
+ "Invalid type for expanded integer");
+ // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ // Transfer debug values. Don't invalidate the source debug value until it's
+ // been transferred to the high and low bits.
+ if (DAG.getDataLayout().isBigEndian()) {
+ DAG.transferDbgValues(Op, Hi, 0, Hi.getValueSizeInBits(), false);
+ DAG.transferDbgValues(Op, Lo, Hi.getValueSizeInBits(),
+ Lo.getValueSizeInBits());
+ } else {
+ DAG.transferDbgValues(Op, Lo, 0, Lo.getValueSizeInBits(), false);
+ DAG.transferDbgValues(Op, Hi, Lo.getValueSizeInBits(),
+ Hi.getValueSizeInBits());
+ }
+
+ // Remember that this is the result of the node.
+ std::pair<TableId, TableId> &Entry = ExpandedIntegers[getTableId(Op)];
+ assert((Entry.first == 0) && "Node already expanded");
+ Entry.first = getTableId(Lo);
+ Entry.second = getTableId(Hi);
+}
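+
+// Debug-transfer note (illustrative, not upstream code): for an i64 value
+// expanded into two i32 halves, the dbg.value fragment at bit offset 0 is
+// attached to Lo and the fragment at offset 32 to Hi on a little-endian
+// target; on a big-endian target Hi takes offset 0 and Lo offset 32. In both
+// cases the source debug value is invalidated only by the second transfer,
+// once both halves carry their fragments.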
+
+void DAGTypeLegalizer::GetExpandedFloat(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<TableId, TableId> &Entry = ExpandedFloats[getTableId(Op)];
+ assert((Entry.first != 0) && "Operand isn't expanded");
+ Lo = getSDValue(Entry.first);
+ Hi = getSDValue(Entry.second);
+}
+
+void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+ assert(Lo.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ Hi.getValueType() == Lo.getValueType() &&
+ "Invalid type for expanded float");
+ // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ std::pair<TableId, TableId> &Entry = ExpandedFloats[getTableId(Op)];
+ assert((Entry.first == 0) && "Node already expanded");
+ Entry.first = getTableId(Lo);
+ Entry.second = getTableId(Hi);
+}
+
+void DAGTypeLegalizer::GetSplitVector(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<TableId, TableId> &Entry = SplitVectors[getTableId(Op)];
+ Lo = getSDValue(Entry.first);
+ Hi = getSDValue(Entry.second);
+  assert(Lo.getNode() && "Operand isn't split");
+}
+
+void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+ assert(Lo.getValueType().getVectorElementType() ==
+ Op.getValueType().getVectorElementType() &&
+ Lo.getValueType().getVectorElementCount() * 2 ==
+ Op.getValueType().getVectorElementCount() &&
+ Hi.getValueType() == Lo.getValueType() &&
+ "Invalid type for split vector");
+ // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ // Remember that this is the result of the node.
+ std::pair<TableId, TableId> &Entry = SplitVectors[getTableId(Op)];
+ assert((Entry.first == 0) && "Node already split");
+ Entry.first = getTableId(Lo);
+ Entry.second = getTableId(Hi);
+}
+
+void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ "Invalid type for widened vector");
+ AnalyzeNewValue(Result);
+
+ auto &OpIdEntry = WidenedVectors[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node already widened!");
+ OpIdEntry = getTableId(Result);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Utilities.
+//===----------------------------------------------------------------------===//
+
+/// Convert to an integer of the same size.
+SDValue DAGTypeLegalizer::BitConvertToInteger(SDValue Op) {
+ unsigned BitWidth = Op.getValueSizeInBits();
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op),
+ EVT::getIntegerVT(*DAG.getContext(), BitWidth), Op);
+}
+
+/// Convert to a vector of integers of the same size.
+SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) {
+ assert(Op.getValueType().isVector() && "Only applies to vectors!");
+ unsigned EltWidth = Op.getScalarValueSizeInBits();
+ EVT EltNVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
+ auto EltCnt = Op.getValueType().getVectorElementCount();
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op),
+ EVT::getVectorVT(*DAG.getContext(), EltNVT, EltCnt), Op);
+}
+
+SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
+ EVT DestVT) {
+ SDLoc dl(Op);
+ // Create the stack frame object. Make sure it is aligned for both
+ // the source and destination types.
+
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align DestAlign = DAG.getReducedAlign(DestVT, /*UseABI=*/false);
+ Align OpAlign = DAG.getReducedAlign(Op.getValueType(), /*UseABI=*/false);
+ Align Align = std::max(DestAlign, OpAlign);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(Op.getValueType().getStoreSize(), Align);
+ // Emit a store to the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr,
+ MachinePointerInfo(), Align);
+ // Result is a load from the stack slot.
+ return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
+}
+
+/// Replace the node's results with custom code provided by the target and
+/// return "true", or do nothing and return "false".
+/// If LegalizeResult is false, we are dealing with a node whose result types
+/// are legal but which has an illegal operand, and VT is the type of that
+/// illegal operand.
+/// If LegalizeResult is true, we are dealing with a node with illegal result
+/// types, and VT is the type of the illegal result.
+bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
+ // See if the target wants to custom lower this node.
+ if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+ return false;
+
+ SmallVector<SDValue, 8> Results;
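+ // Illegal result types are custom-lowered via ReplaceNodeResults; illegal
+ // operand types via LowerOperationWrapper.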
+ if (LegalizeResult)
+ TLI.ReplaceNodeResults(N, Results, DAG);
+ else
+ TLI.LowerOperationWrapper(N, Results, DAG);
+
+ if (Results.empty())
+ // The target didn't want to custom lower it after all.
+ return false;
+
+ // Make everything that once used N's values now use those in Results instead.
+ assert(Results.size() == N->getNumValues() &&
+ "Custom lowering returned the wrong number of results!");
+ for (unsigned i = 0, e = Results.size(); i != e; ++i) {
+ ReplaceValueWith(SDValue(N, i), Results[i]);
+ }
+ return true;
+}
+
+
+/// Widen the node's results with custom code provided by the target and return
+/// "true", or do nothing and return "false".
+bool DAGTypeLegalizer::CustomWidenLowerNode(SDNode *N, EVT VT) {
+ // See if the target wants to custom lower this node.
+ if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+ return false;
+
+ SmallVector<SDValue, 8> Results;
+ TLI.ReplaceNodeResults(N, Results, DAG);
+
+ if (Results.empty())
+ // The target didn't want to custom widen lower its result after all.
+ return false;
+
+ // Update the widening map.
+ assert(Results.size() == N->getNumValues() &&
+ "Custom lowering returned the wrong number of results!");
+ for (unsigned i = 0, e = Results.size(); i != e; ++i) {
+ // If this is a chain output or already widened just replace it.
+ bool WasWidened = SDValue(N, i).getValueType() != Results[i].getValueType();
+ if (WasWidened)
+ SetWidenedVector(SDValue(N, i), Results[i]);
+ else
+ ReplaceValueWith(SDValue(N, i), Results[i]);
+ }
+ return true;
+}
+
+SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+ if (i != ResNo)
+ ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+ return SDValue(N->getOperand(ResNo));
+}
+
+/// Use ISD::EXTRACT_ELEMENT nodes to extract the low and high parts of the
+/// given value.
+void DAGTypeLegalizer::GetPairElements(SDValue Pair,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(Pair);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Pair.getValueType());
+ std::tie(Lo, Hi) = DAG.SplitScalar(Pair, dl, NVT, NVT);
+}
+
+/// Build an integer with low bits Lo and high bits Hi.
+SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
+ // Arbitrarily use dlHi for result SDLoc
+ SDLoc dlHi(Hi);
+ SDLoc dlLo(Lo);
+ EVT LVT = Lo.getValueType();
+ EVT HVT = Hi.getValueType();
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
+ LVT.getSizeInBits() + HVT.getSizeInBits());
+
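+ // E.g. joining two i32 halves produces an i64:
+ //   Result = zext(Lo) | (anyext(Hi) << 32)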
+ EVT ShiftAmtVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi);
+ Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi,
+ DAG.getConstant(LVT.getSizeInBits(), dlHi, ShiftAmtVT));
+ return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
+}
+
+/// Promote the given target boolean to a target boolean of the given type.
+/// A target boolean is an integer value, not necessarily of type i1, the bits
+/// of which conform to getBooleanContents.
+///
+/// ValVT is the type of values that produced the boolean.
+SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) {
+ return TLI.promoteTargetBoolean(DAG, Bool, ValVT);
+}
+
+/// Return the lower LoVT bits of Op in Lo and the upper HiVT bits in Hi.
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+ EVT LoVT, EVT HiVT,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(Op);
+ assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
+ Op.getValueSizeInBits() && "Invalid integer splitting!");
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op);
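+ // The target's preferred shift-amount type may be too narrow to represent
+ // the shift when splitting very wide integers; if so, use the next
+ // power-of-2 sized integer type that can hold it.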
+ unsigned ReqShiftAmountInBits =
+ Log2_32_Ceil(Op.getValueType().getSizeInBits());
+ MVT ShiftAmountTy =
+ TLI.getScalarShiftAmountTy(DAG.getDataLayout(), Op.getValueType());
+ if (ReqShiftAmountInBits > ShiftAmountTy.getSizeInBits())
+ ShiftAmountTy = MVT::getIntegerVT(NextPowerOf2(ReqShiftAmountInBits));
+ Hi = DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op,
+ DAG.getConstant(LoVT.getSizeInBits(), dl, ShiftAmountTy));
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
+}
+
+/// Return the lower and upper halves of Op's bits in a value type half the
+/// size of Op's.
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+ SDValue &Lo, SDValue &Hi) {
+ EVT HalfVT =
+ EVT::getIntegerVT(*DAG.getContext(), Op.getValueSizeInBits() / 2);
+ SplitInteger(Op, HalfVT, HalfVT, Lo, Hi);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Entry Point
+//===----------------------------------------------------------------------===//
+
+/// This transforms the SelectionDAG into a SelectionDAG that only uses types
+/// natively supported by the target. Returns "true" if it made any changes.
+///
+/// Note that this is an involved process that may invalidate pointers into
+/// the graph.
+bool SelectionDAG::LegalizeTypes() {
+ return DAGTypeLegalizer(*this).run();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
new file mode 100644
index 000000000000..db8f61eee606
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -0,0 +1,1137 @@
+//===-- LegalizeTypes.h - DAG Type Legalizer class definition ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DAGTypeLegalizer class. This is a private interface
+// shared by the code that implements the SelectionDAG::LegalizeTypes
+// method.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+/// This takes an arbitrary SelectionDAG as input and hacks on it until only
+/// value types the target machine can handle are left. This involves promoting
+/// small sizes to large sizes or splitting up large values into small values.
+///
+class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
+ const TargetLowering &TLI;
+ SelectionDAG &DAG;
+public:
+ /// This pass uses the NodeId on the SDNodes to hold information about the
+ /// state of the node. This enum lists the possible states.
+ enum NodeIdFlags {
+ /// All operands have been processed, so this node is ready to be handled.
+ ReadyToProcess = 0,
+
+ /// This is a new node, not before seen, that was created in the process of
+ /// legalizing some other node.
+ NewNode = -1,
+
+ /// This node's ID needs to be set to the number of its unprocessed
+ /// operands.
+ Unanalyzed = -2,
+
+ /// This is a node that has already been processed.
+ Processed = -3
+
+ // 1+ - This is a node which has this many unprocessed operands.
+ };
+private:
+
+ /// This is a bitvector that contains two bits for each simple value type,
+ /// where the two bits correspond to the LegalizeAction enum from
+ /// TargetLowering. This can be queried with "getTypeAction(VT)".
+ TargetLowering::ValueTypeActionImpl ValueTypeActions;
+
+ /// Return how we should legalize values of this type.
+ TargetLowering::LegalizeTypeAction getTypeAction(EVT VT) const {
+ return TLI.getTypeAction(*DAG.getContext(), VT);
+ }
+
+ /// Return true if this type is legal on this target.
+ bool isTypeLegal(EVT VT) const {
+ return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal;
+ }
+
+ /// Return true if this is a simple legal type.
+ bool isSimpleLegalType(EVT VT) const {
+ return VT.isSimple() && TLI.isTypeLegal(VT);
+ }
+
+ EVT getSetCCResultType(EVT VT) const {
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ }
+
+ /// Pretend all of this node's results are legal.
+ bool IgnoreNodeResults(SDNode *N) const {
+ return N->getOpcode() == ISD::TargetConstant ||
+ N->getOpcode() == ISD::Register;
+ }
+
+ // Bijection from SDValue to a unique id. As each created node gets a
+ // new id we do not need to worry about expunging ids for reuse. Should we
+ // run out of ids, we can do a one-time expensive compactification.
+ typedef unsigned TableId;
+
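+ // Ids start at 1; 0 is reserved to mean "no entry", since the maps below
+ // default-construct missing values to 0.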
+ TableId NextValueId = 1;
+
+ SmallDenseMap<SDValue, TableId, 8> ValueToIdMap;
+ SmallDenseMap<TableId, SDValue, 8> IdToValueMap;
+
+ /// For integer nodes that are below legal width, this map indicates what
+ /// promoted value to use.
+ SmallDenseMap<TableId, TableId, 8> PromotedIntegers;
+
+ /// For integer nodes that need to be expanded this map indicates which
+ /// operands are the expanded version of the input.
+ SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> ExpandedIntegers;
+
+ /// For floating-point nodes converted to integers of the same size, this map
+ /// indicates the converted value to use.
+ SmallDenseMap<TableId, TableId, 8> SoftenedFloats;
+
+ /// For floating-point nodes that have a smaller precision than the smallest
+ /// supported precision, this map indicates what promoted value to use.
+ SmallDenseMap<TableId, TableId, 8> PromotedFloats;
+
+ /// For floating-point nodes that have a smaller precision than the smallest
+ /// supported precision, this map indicates the converted value to use.
+ SmallDenseMap<TableId, TableId, 8> SoftPromotedHalfs;
+
+ /// For float nodes that need to be expanded this map indicates which operands
+ /// are the expanded version of the input.
+ SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> ExpandedFloats;
+
+ /// For nodes that are <1 x ty>, this map indicates the scalar value of type
+ /// 'ty' to use.
+ SmallDenseMap<TableId, TableId, 8> ScalarizedVectors;
+
+ /// For nodes that need to be split this map indicates which operands are the
+ /// expanded version of the input.
+ SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> SplitVectors;
+
+ /// For vector nodes that need to be widened, indicates the widened value to
+ /// use.
+ SmallDenseMap<TableId, TableId, 8> WidenedVectors;
+
+ /// For values that have been replaced with another, indicates the replacement
+ /// value to use.
+ SmallDenseMap<TableId, TableId, 8> ReplacedValues;
+
+ /// This defines a worklist of nodes to process. In order to be pushed onto
+ /// this worklist, all operands of a node must have already been processed.
+ SmallVector<SDNode*, 128> Worklist;
+
+ TableId getTableId(SDValue V) {
+ assert(V.getNode() && "Getting TableId on SDValue()");
+
+ auto I = ValueToIdMap.find(V);
+ if (I != ValueToIdMap.end()) {
+ // Remap the id in case the value has been replaced.
+ RemapId(I->second);
+ assert(I->second && "All Ids should be nonzero");
+ return I->second;
+ }
+ // Add if it's not there.
+ ValueToIdMap.insert(std::make_pair(V, NextValueId));
+ IdToValueMap.insert(std::make_pair(NextValueId, V));
+ ++NextValueId;
+ assert(NextValueId != 0 &&
+ "Ran out of Ids. Increase id type size or add compactification");
+ return NextValueId - 1;
+ }
+
+ const SDValue &getSDValue(TableId &Id) {
+ RemapId(Id);
+ assert(Id && "TableId should be non-zero");
+ auto I = IdToValueMap.find(Id);
+ assert(I != IdToValueMap.end() && "cannot find Id in map");
+ return I->second;
+ }
+
+public:
+ explicit DAGTypeLegalizer(SelectionDAG &dag)
+ : TLI(dag.getTargetLoweringInfo()), DAG(dag),
+ ValueTypeActions(TLI.getValueTypeActions()) {
+ static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
+ "Too many value types for ValueTypeActions to hold!");
+ }
+
+ /// This is the main entry point for the type legalizer. This does a
+ /// top-down traversal of the dag, legalizing types as it goes. Returns
+ /// "true" if it made any changes.
+ bool run();
+
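+ /// Record that node Old has been replaced by node New: map each of Old's
+ /// values to the corresponding value of New and drop Old's entries from the
+ /// per-kind maps.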
+ void NoteDeletion(SDNode *Old, SDNode *New) {
+ assert(Old != New && "node replaced with self");
+ for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
+ TableId NewId = getTableId(SDValue(New, i));
+ TableId OldId = getTableId(SDValue(Old, i));
+
+ if (OldId != NewId) {
+ ReplacedValues[OldId] = NewId;
+
+ // Delete Node from tables. We cannot do this when OldId == NewId,
+ // because NewId can still have table references to it in
+ // ReplacedValues.
+ IdToValueMap.erase(OldId);
+ PromotedIntegers.erase(OldId);
+ ExpandedIntegers.erase(OldId);
+ SoftenedFloats.erase(OldId);
+ PromotedFloats.erase(OldId);
+ SoftPromotedHalfs.erase(OldId);
+ ExpandedFloats.erase(OldId);
+ ScalarizedVectors.erase(OldId);
+ SplitVectors.erase(OldId);
+ WidenedVectors.erase(OldId);
+ }
+
+ ValueToIdMap.erase(SDValue(Old, i));
+ }
+ }
+
+ SelectionDAG &getDAG() const { return DAG; }
+
+private:
+ SDNode *AnalyzeNewNode(SDNode *N);
+ void AnalyzeNewValue(SDValue &Val);
+ void PerformExpensiveChecks();
+ void RemapId(TableId &Id);
+ void RemapValue(SDValue &V);
+
+ // Common routines.
+ SDValue BitConvertToInteger(SDValue Op);
+ SDValue BitConvertVectorToIntegerVector(SDValue Op);
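+ /// Create a store of Op to a new stack temporary, then reload the slot as
+ /// type DestVT.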
+ SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT);
+ bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
+ bool CustomWidenLowerNode(SDNode *N, EVT VT);
+
+ /// Replace each result of the given MERGE_VALUES node with the corresponding
+ /// input operand, except for the result 'ResNo', for which the corresponding
+ /// input operand is returned.
+ SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo);
+
+ SDValue JoinIntegers(SDValue Lo, SDValue Hi);
+
+ std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
+
+ SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT);
+
+ void ReplaceValueWith(SDValue From, SDValue To);
+ void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
+ SDValue &Lo, SDValue &Hi);
+
+ //===--------------------------------------------------------------------===//
+ // Integer Promotion Support: LegalizeIntegerTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed operand Op which was promoted to a larger integer type,
+ /// this returns the promoted value. The low bits of the promoted value
+ /// corresponding to the original type are exactly equal to Op.
+ /// The extra bits contain rubbish, so the promoted value may need to be zero-
+ /// or sign-extended from the original type before it is usable (the helpers
+ /// SExtPromotedInteger and ZExtPromotedInteger can do this for you).
+ /// For example, if Op is an i16 and was promoted to an i32, then this method
+ /// returns an i32, the lower 16 bits of which coincide with Op, and the upper
+ /// 16 bits of which contain rubbish.
+ SDValue GetPromotedInteger(SDValue Op) {
+ TableId &PromotedId = PromotedIntegers[getTableId(Op)];
+ SDValue PromotedOp = getSDValue(PromotedId);
+ assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+ return PromotedOp;
+ }
+ void SetPromotedInteger(SDValue Op, SDValue Result);
+
+ /// Get a promoted operand and sign extend it to the final size.
+ SDValue SExtPromotedInteger(SDValue Op) {
+ EVT OldVT = Op.getValueType();
+ SDLoc dl(Op);
+ Op = GetPromotedInteger(Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op,
+ DAG.getValueType(OldVT));
+ }
+
+ /// Get a promoted operand and zero extend it to the final size.
+ SDValue ZExtPromotedInteger(SDValue Op) {
+ EVT OldVT = Op.getValueType();
+ SDLoc dl(Op);
+ Op = GetPromotedInteger(Op);
+ return DAG.getZeroExtendInReg(Op, dl, OldVT);
+ }
+
+ // Get a promoted operand and sign or zero extend it to the final size
+ // (depending on TargetLoweringInfo::isSExtCheaperThanZExt). For a given
+ // subtarget and type, the choice of sign or zero-extension will be
+ // consistent.
+ SDValue SExtOrZExtPromotedInteger(SDValue Op) {
+ EVT OldVT = Op.getValueType();
+ SDLoc DL(Op);
+ Op = GetPromotedInteger(Op);
+ if (TLI.isSExtCheaperThanZExt(OldVT, Op.getValueType()))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(), Op,
+ DAG.getValueType(OldVT));
+ return DAG.getZeroExtendInReg(Op, DL, OldVT);
+ }
+
+ // Promote the given operand V (vector or scalar) according to N's specific
+ // reduction kind, using the extension (ISD::(ANY|ZERO|SIGN)_EXTEND)
+ // appropriate for that kind. N must be an integer VECREDUCE_* or
+ // VP_REDUCE_*. Returns the promoted value.
+ SDValue PromoteIntOpVectorReduction(SDNode *N, SDValue V);
+
+ // Integer Result Promotion.
+ void PromoteIntegerResult(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_AssertSext(SDNode *N);
+ SDValue PromoteIntRes_AssertZext(SDNode *N);
+ SDValue PromoteIntRes_Atomic0(AtomicSDNode *N);
+ SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
+ SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue PromoteIntRes_INSERT_SUBVECTOR(SDNode *N);
+ SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N);
+ SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
+ SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N);
+ SDValue PromoteIntRes_VECTOR_INTERLEAVE_DEINTERLEAVE(SDNode *N);
+ SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
+ SDValue PromoteIntRes_ScalarOp(SDNode *N);
+ SDValue PromoteIntRes_STEP_VECTOR(SDNode *N);
+ SDValue PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N);
+ SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N);
+ SDValue PromoteIntRes_BITCAST(SDNode *N);
+ SDValue PromoteIntRes_BSWAP(SDNode *N);
+ SDValue PromoteIntRes_BITREVERSE(SDNode *N);
+ SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
+ SDValue PromoteIntRes_Constant(SDNode *N);
+ SDValue PromoteIntRes_CTLZ(SDNode *N);
+ SDValue PromoteIntRes_CTPOP_PARITY(SDNode *N);
+ SDValue PromoteIntRes_CTTZ(SDNode *N);
+ SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
+ SDValue PromoteIntRes_FP_TO_XINT_SAT(SDNode *N);
+ SDValue PromoteIntRes_FP_TO_FP16_BF16(SDNode *N);
+ SDValue PromoteIntRes_FREEZE(SDNode *N);
+ SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
+ SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+ SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
+ SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
+ SDValue PromoteIntRes_Overflow(SDNode *N);
+ SDValue PromoteIntRes_FFREXP(SDNode *N);
+ SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_Select(SDNode *N);
+ SDValue PromoteIntRes_SELECT_CC(SDNode *N);
+ SDValue PromoteIntRes_SETCC(SDNode *N);
+ SDValue PromoteIntRes_SHL(SDNode *N);
+ SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_SExtIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_UMINUMAX(SDNode *N);
+ SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N);
+ SDValue PromoteIntRes_SRA(SDNode *N);
+ SDValue PromoteIntRes_SRL(SDNode *N);
+ SDValue PromoteIntRes_TRUNCATE(SDNode *N);
+ SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_UADDSUBO_CARRY(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_SADDSUBO_CARRY(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_UNDEF(SDNode *N);
+ SDValue PromoteIntRes_VAARG(SDNode *N);
+ SDValue PromoteIntRes_VSCALE(SDNode *N);
+ SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_ADDSUBSHLSAT(SDNode *N);
+ SDValue PromoteIntRes_MULFIX(SDNode *N);
+ SDValue PromoteIntRes_DIVFIX(SDNode *N);
+ SDValue PromoteIntRes_GET_ROUNDING(SDNode *N);
+ SDValue PromoteIntRes_VECREDUCE(SDNode *N);
+ SDValue PromoteIntRes_VP_REDUCE(SDNode *N);
+ SDValue PromoteIntRes_ABS(SDNode *N);
+ SDValue PromoteIntRes_Rotate(SDNode *N);
+ SDValue PromoteIntRes_FunnelShift(SDNode *N);
+ SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
+
+ // Integer Operand Promotion.
+ bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N);
+ SDValue PromoteIntOp_BITCAST(SDNode *N);
+ SDValue PromoteIntOp_BUILD_PAIR(SDNode *N);
+ SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N);
+ SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue PromoteIntOp_INSERT_SUBVECTOR(SDNode *N);
+ SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
+ SDValue PromoteIntOp_ScalarOp(SDNode *N);
+ SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_Shift(SDNode *N);
+ SDValue PromoteIntOp_FunnelShift(SDNode *N);
+ SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_VP_SIGN_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_SINT_TO_FP(SDNode *N);
+ SDValue PromoteIntOp_STRICT_SINT_TO_FP(SDNode *N);
+ SDValue PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_TRUNCATE(SDNode *N);
+ SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
+ SDValue PromoteIntOp_STRICT_UINT_TO_FP(SDNode *N);
+ SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_VP_ZERO_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_ADDSUBO_CARRY(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N);
+ SDValue PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_FIX(SDNode *N);
+ SDValue PromoteIntOp_ExpOp(SDNode *N);
+ SDValue PromoteIntOp_VECREDUCE(SDNode *N);
+ SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_SET_ROUNDING(SDNode *N);
+ SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
+
+ void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
+
+ //===--------------------------------------------------------------------===//
+ // Integer Expansion Support: LegalizeIntegerTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed operand Op which was expanded into two integers of half
+ /// the size, this returns the two halves. The low bits of Op are exactly
+ /// equal to the bits of Lo; the high bits exactly equal Hi.
+ /// For example, if Op is an i64 which was expanded into two i32's, then this
+ /// method returns the two i32's, with Lo being equal to the lower 32 bits of
+ /// Op, and Hi being equal to the upper 32 bits.
+ void GetExpandedInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetExpandedInteger(SDValue Op, SDValue Lo, SDValue Hi);
+
+ // Integer Result Expansion.
+ void ExpandIntegerResult(SDNode *N, unsigned ResNo);
+ void ExpandIntRes_ANY_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_AssertSext (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_Constant (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ABS (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTLZ (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_READCYCLECOUNTER (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SIGN_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_TRUNCATE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ZERO_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_GET_ROUNDING (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FP_TO_SINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FP_TO_UINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FP_TO_XINT_SAT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_XROUND_XRINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_Logical (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UADDSUBO_CARRY (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SADDSUBO_CARRY (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_PARITY (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SREM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ShiftThroughStack (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_MINMAX (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_SADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUBSAT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SHLSAT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_MULFIX (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_DIVFIX (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_VECREDUCE (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_Rotate (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FunnelShift (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_VSCALE (SDNode *N, SDValue &Lo, SDValue &Hi);
+
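+ // Helpers for expanding shifts of integers that have been expanded into
+ // Lo/Hi halves.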
+ void ExpandShiftByConstant(SDNode *N, const APInt &Amt,
+ SDValue &Lo, SDValue &Hi);
+ bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+ bool ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Integer Operand Expansion.
+ bool ExpandIntegerOperand(SDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_BR_CC(SDNode *N);
+ SDValue ExpandIntOp_SELECT_CC(SDNode *N);
+ SDValue ExpandIntOp_SETCC(SDNode *N);
+ SDValue ExpandIntOp_SETCCCARRY(SDNode *N);
+ SDValue ExpandIntOp_Shift(SDNode *N);
+ SDValue ExpandIntOp_SINT_TO_FP(SDNode *N);
+ SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_TRUNCATE(SDNode *N);
+ SDValue ExpandIntOp_UINT_TO_FP(SDNode *N);
+ SDValue ExpandIntOp_RETURNADDR(SDNode *N);
+ SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N);
+ SDValue ExpandIntOp_SPLAT_VECTOR(SDNode *N);
+ SDValue ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
+
+ void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode, const SDLoc &dl);
+
+ //===--------------------------------------------------------------------===//
+ // Float to Integer Conversion Support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed operand Op which was converted to an integer of the
+ /// same size, this returns the integer. The integer contains exactly the
+ /// same bits as Op - only the type changed. For example, if Op is an f32
+ /// which was softened to an i32, then this method returns an i32, the bits
+ /// of which coincide with those of Op.
+ SDValue GetSoftenedFloat(SDValue Op) {
+ TableId Id = getTableId(Op);
+ auto Iter = SoftenedFloats.find(Id);
+ if (Iter == SoftenedFloats.end()) {
+ assert(isSimpleLegalType(Op.getValueType()) &&
+ "Operand wasn't converted to integer?");
+ return Op;
+ }
+ SDValue SoftenedOp = getSDValue(Iter->second);
+ assert(SoftenedOp.getNode() && "Unconverted op in SoftenedFloats?");
+ return SoftenedOp;
+ }
+ void SetSoftenedFloat(SDValue Op, SDValue Result);
+
+ // Convert Float Results to Integer.
+ void SoftenFloatResult(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC);
+ SDValue SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC);
+ SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_ARITH_FENCE(SDNode *N);
+ SDValue SoftenFloatRes_BITCAST(SDNode *N);
+ SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
+ SDValue SoftenFloatRes_ConstantFP(SDNode *N);
+ SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_FABS(SDNode *N);
+ SDValue SoftenFloatRes_FMINNUM(SDNode *N);
+ SDValue SoftenFloatRes_FMAXNUM(SDNode *N);
+ SDValue SoftenFloatRes_FADD(SDNode *N);
+ SDValue SoftenFloatRes_FCBRT(SDNode *N);
+ SDValue SoftenFloatRes_FCEIL(SDNode *N);
+ SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N);
+ SDValue SoftenFloatRes_FCOS(SDNode *N);
+ SDValue SoftenFloatRes_FDIV(SDNode *N);
+ SDValue SoftenFloatRes_FEXP(SDNode *N);
+ SDValue SoftenFloatRes_FEXP2(SDNode *N);
+ SDValue SoftenFloatRes_FFLOOR(SDNode *N);
+ SDValue SoftenFloatRes_FLOG(SDNode *N);
+ SDValue SoftenFloatRes_FLOG2(SDNode *N);
+ SDValue SoftenFloatRes_FLOG10(SDNode *N);
+ SDValue SoftenFloatRes_FMA(SDNode *N);
+ SDValue SoftenFloatRes_FMUL(SDNode *N);
+ SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
+ SDValue SoftenFloatRes_FNEG(SDNode *N);
+ SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
+ SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N);
+ SDValue SoftenFloatRes_BF16_TO_FP(SDNode *N);
+ SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
+ SDValue SoftenFloatRes_FPOW(SDNode *N);
+ SDValue SoftenFloatRes_ExpOp(SDNode *N);
+ SDValue SoftenFloatRes_FFREXP(SDNode *N);
+ SDValue SoftenFloatRes_FREEZE(SDNode *N);
+ SDValue SoftenFloatRes_FREM(SDNode *N);
+ SDValue SoftenFloatRes_FRINT(SDNode *N);
+ SDValue SoftenFloatRes_FROUND(SDNode *N);
+ SDValue SoftenFloatRes_FROUNDEVEN(SDNode *N);
+ SDValue SoftenFloatRes_FSIN(SDNode *N);
+ SDValue SoftenFloatRes_FSQRT(SDNode *N);
+ SDValue SoftenFloatRes_FSUB(SDNode *N);
+ SDValue SoftenFloatRes_FTRUNC(SDNode *N);
+ SDValue SoftenFloatRes_LOAD(SDNode *N);
+ SDValue SoftenFloatRes_SELECT(SDNode *N);
+ SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
+ SDValue SoftenFloatRes_UNDEF(SDNode *N);
+ SDValue SoftenFloatRes_VAARG(SDNode *N);
+ SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
+ SDValue SoftenFloatRes_VECREDUCE(SDNode *N);
+ SDValue SoftenFloatRes_VECREDUCE_SEQ(SDNode *N);
+
+ // Convert Float Operand to Integer.
+ bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatOp_Unary(SDNode *N, RTLIB::Libcall LC);
+ SDValue SoftenFloatOp_BITCAST(SDNode *N);
+ SDValue SoftenFloatOp_BR_CC(SDNode *N);
+ SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
+ SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N);
+ SDValue SoftenFloatOp_FP_TO_XINT_SAT(SDNode *N);
+ SDValue SoftenFloatOp_LROUND(SDNode *N);
+ SDValue SoftenFloatOp_LLROUND(SDNode *N);
+ SDValue SoftenFloatOp_LRINT(SDNode *N);
+ SDValue SoftenFloatOp_LLRINT(SDNode *N);
+ SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
+ SDValue SoftenFloatOp_SETCC(SDNode *N);
+ SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N);
+
+ //===--------------------------------------------------------------------===//
+ // Float Expansion Support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed operand Op which was expanded into two floating-point
+ /// values of half the size, this returns the two halves.
+ /// The low bits of Op are exactly equal to the bits of Lo; the high bits
+ /// exactly equal Hi. For example, if Op is a ppcf128 which was expanded
+ /// into two f64's, then this method returns the two f64's, with Lo being
+ /// equal to the lower 64 bits of Op, and Hi to the upper 64 bits.
+ void GetExpandedFloat(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetExpandedFloat(SDValue Op, SDValue Lo, SDValue Hi);
+
+ // Float Result Expansion.
+ void ExpandFloatResult(SDNode *N, unsigned ResNo);
+ void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_Unary(SDNode *N, RTLIB::Libcall LC,
+ SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_Binary(SDNode *N, RTLIB::Libcall LC,
+ SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FADD (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCBRT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCEIL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCOPYSIGN (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCOS (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FEXP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FEXP2 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FFLOOR (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG2 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG10 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMA (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMUL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FNEG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FPOW (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FPOWI (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLDEXP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FREM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FRINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FROUND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FROUNDEVEN(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FTRUNC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Float Operand Expansion.
+ bool ExpandFloatOperand(SDNode *N, unsigned OpNo);
+ SDValue ExpandFloatOp_BR_CC(SDNode *N);
+ SDValue ExpandFloatOp_FCOPYSIGN(SDNode *N);
+ SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
+ SDValue ExpandFloatOp_FP_TO_XINT(SDNode *N);
+ SDValue ExpandFloatOp_LROUND(SDNode *N);
+ SDValue ExpandFloatOp_LLROUND(SDNode *N);
+ SDValue ExpandFloatOp_LRINT(SDNode *N);
+ SDValue ExpandFloatOp_LLRINT(SDNode *N);
+ SDValue ExpandFloatOp_SELECT_CC(SDNode *N);
+ SDValue ExpandFloatOp_SETCC(SDNode *N);
+ SDValue ExpandFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+ void FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode, const SDLoc &dl,
+ SDValue &Chain, bool IsSignaling = false);
+
+ //===--------------------------------------------------------------------===//
+ // Float promotion support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ SDValue GetPromotedFloat(SDValue Op) {
+ TableId &PromotedId = PromotedFloats[getTableId(Op)];
+ SDValue PromotedOp = getSDValue(PromotedId);
+ assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+ return PromotedOp;
+ }
+ void SetPromotedFloat(SDValue Op, SDValue Result);
+
+ void PromoteFloatResult(SDNode *N, unsigned ResNo);
+ SDValue PromoteFloatRes_BITCAST(SDNode *N);
+ SDValue PromoteFloatRes_BinOp(SDNode *N);
+ SDValue PromoteFloatRes_ConstantFP(SDNode *N);
+ SDValue PromoteFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteFloatRes_FCOPYSIGN(SDNode *N);
+ SDValue PromoteFloatRes_FMAD(SDNode *N);
+ SDValue PromoteFloatRes_ExpOp(SDNode *N);
+ SDValue PromoteFloatRes_FFREXP(SDNode *N);
+ SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
+ SDValue PromoteFloatRes_LOAD(SDNode *N);
+ SDValue PromoteFloatRes_SELECT(SDNode *N);
+ SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
+ SDValue PromoteFloatRes_UnaryOp(SDNode *N);
+ SDValue PromoteFloatRes_UNDEF(SDNode *N);
+ SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N);
+ SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
+ SDValue PromoteFloatRes_VECREDUCE(SDNode *N);
+ SDValue PromoteFloatRes_VECREDUCE_SEQ(SDNode *N);
+
+ bool PromoteFloatOperand(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_FP_TO_XINT(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_SELECT_CC(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo);
+
+ //===--------------------------------------------------------------------===//
+ // Half soft promotion support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ SDValue GetSoftPromotedHalf(SDValue Op) {
+ TableId &PromotedId = SoftPromotedHalfs[getTableId(Op)];
+ SDValue PromotedOp = getSDValue(PromotedId);
+ assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+ return PromotedOp;
+ }
+ void SetSoftPromotedHalf(SDValue Op, SDValue Result);
+
+ void SoftPromoteHalfResult(SDNode *N, unsigned ResNo);
+ SDValue SoftPromoteHalfRes_BinOp(SDNode *N);
+ SDValue SoftPromoteHalfRes_BITCAST(SDNode *N);
+ SDValue SoftPromoteHalfRes_ConstantFP(SDNode *N);
+ SDValue SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SoftPromoteHalfRes_FCOPYSIGN(SDNode *N);
+ SDValue SoftPromoteHalfRes_FMAD(SDNode *N);
+ SDValue SoftPromoteHalfRes_ExpOp(SDNode *N);
+ SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N);
+ SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
+ SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
+ SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
+ SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
+ SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N);
+ SDValue SoftPromoteHalfRes_UNDEF(SDNode *N);
+ SDValue SoftPromoteHalfRes_VECREDUCE(SDNode *N);
+ SDValue SoftPromoteHalfRes_VECREDUCE_SEQ(SDNode *N);
+
+ bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_BITCAST(SDNode *N);
+ SDValue SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_FP_EXTEND(SDNode *N);
+ SDValue SoftPromoteHalfOp_FP_TO_XINT(SDNode *N);
+ SDValue SoftPromoteHalfOp_FP_TO_XINT_SAT(SDNode *N);
+ SDValue SoftPromoteHalfOp_SETCC(SDNode *N);
+ SDValue SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_PATCHPOINT(SDNode *N, unsigned OpNo);
+
+ //===--------------------------------------------------------------------===//
+ // Scalarization Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed one-element vector Op which was scalarized to its
+ /// element type, this returns the element. For example, if Op is a v1i32,
+ /// Op = < i32 val >, this method returns val, an i32.
+ SDValue GetScalarizedVector(SDValue Op) {
+ TableId &ScalarizedId = ScalarizedVectors[getTableId(Op)];
+ SDValue ScalarizedOp = getSDValue(ScalarizedId);
+ assert(ScalarizedOp.getNode() && "Operand wasn't scalarized?");
+ return ScalarizedOp;
+ }
+ void SetScalarizedVector(SDValue Op, SDValue Result);
+
+ // Vector Result Scalarization: <1 x ty> -> ty.
+ void ScalarizeVectorResult(SDNode *N, unsigned ResNo);
+ SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+ SDValue ScalarizeVecRes_BinOp(SDNode *N);
+ SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
+ SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
+ SDValue ScalarizeVecRes_StrictFPOp(SDNode *N);
+ SDValue ScalarizeVecRes_OverflowOp(SDNode *N, unsigned ResNo);
+ SDValue ScalarizeVecRes_InregOp(SDNode *N);
+ SDValue ScalarizeVecRes_VecInregOp(SDNode *N);
+
+ SDValue ScalarizeVecRes_BITCAST(SDNode *N);
+ SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
+ SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue ScalarizeVecRes_FP_ROUND(SDNode *N);
+ SDValue ScalarizeVecRes_ExpOp(SDNode *N);
+ SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
+ SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
+ SDValue ScalarizeVecRes_VSELECT(SDNode *N);
+ SDValue ScalarizeVecRes_SELECT(SDNode *N);
+ SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
+ SDValue ScalarizeVecRes_SETCC(SDNode *N);
+ SDValue ScalarizeVecRes_UNDEF(SDNode *N);
+ SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N);
+ SDValue ScalarizeVecRes_FP_TO_XINT_SAT(SDNode *N);
+ SDValue ScalarizeVecRes_IS_FPCLASS(SDNode *N);
+
+ SDValue ScalarizeVecRes_FIX(SDNode *N);
+ SDValue ScalarizeVecRes_FFREXP(SDNode *N, unsigned ResNo);
+
+ // Vector Operand Scalarization: <1 x ty> -> ty.
+ bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecOp_BITCAST(SDNode *N);
+ SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
+ SDValue ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N);
+ SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue ScalarizeVecOp_VSELECT(SDNode *N);
+ SDValue ScalarizeVecOp_VSETCC(SDNode *N);
+ SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecOp_STRICT_FP_ROUND(SDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecOp_FP_EXTEND(SDNode *N);
+ SDValue ScalarizeVecOp_STRICT_FP_EXTEND(SDNode *N);
+ SDValue ScalarizeVecOp_VECREDUCE(SDNode *N);
+ SDValue ScalarizeVecOp_VECREDUCE_SEQ(SDNode *N);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Splitting Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed vector Op which was split into vectors of half the size,
+ /// this method returns the halves. The first elements of Op coincide with the
+ /// elements of Lo; the remaining elements of Op coincide with the elements of
+ /// Hi: Op is what you would get by concatenating Lo and Hi.
+ /// For example, if Op is a v8i32 that was split into two v4i32's, then this
+ /// method returns the two v4i32's, with Lo corresponding to the first 4
+ /// elements of Op, and Hi to the last 4 elements.
+ void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
+
+ /// Split the mask operand of a VP intrinsic.
+ std::pair<SDValue, SDValue> SplitMask(SDValue Mask);
+
+ /// Split the mask operand of a VP intrinsic at the given location.
+ std::pair<SDValue, SDValue> SplitMask(SDValue Mask, const SDLoc &DL);
+
+ // Helper function for incrementing the pointer when splitting
+ // memory operations.
+ void IncrementPointer(MemSDNode *N, EVT MemVT, MachinePointerInfo &MPI,
+ SDValue &Ptr, uint64_t *ScaledOffset = nullptr);
+
+ // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
+ void SplitVectorResult(SDNode *N, unsigned ResNo);
+ void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_FFREXP(SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
+
+ void SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD, SDValue &Lo,
+ SDValue &Hi);
+ void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi,
+ bool SplitSETCC = false);
+ void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
+ SDValue &Hi);
+ void SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N);
+ void SplitVecRes_VECTOR_INTERLEAVE(SDNode *N);
+ void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
+ bool SplitVectorOperand(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VECREDUCE_SEQ(SDNode *N);
+ SDValue SplitVecOp_VP_REDUCE(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_UnaryOp(SDNode *N);
+ SDValue SplitVecOp_TruncateHelper(SDNode *N);
+
+ SDValue SplitVecOp_BITCAST(SDNode *N);
+ SDValue SplitVecOp_INSERT_SUBVECTOR(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);
+ SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_Gather(MemSDNode *MGT, unsigned OpNo);
+ SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue SplitVecOp_VSETCC(SDNode *N);
+ SDValue SplitVecOp_FP_ROUND(SDNode *N);
+ SDValue SplitVecOp_FPOpDifferentTypes(SDNode *N);
+ SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Widening Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed vector Op which was widened into a larger vector, this
+ /// method returns the larger vector. The elements of the returned vector
+ /// consist of the elements of Op followed by elements containing rubbish.
+ /// For example, if Op is a v2i32 that was widened to a v4i32, then this
+ /// method returns a v4i32 for which the first two elements are the same as
+ /// those of Op, while the last two elements contain rubbish.
+ SDValue GetWidenedVector(SDValue Op) {
+ TableId &WidenedId = WidenedVectors[getTableId(Op)];
+ SDValue WidenedOp = getSDValue(WidenedId);
+ assert(WidenedOp.getNode() && "Operand wasn't widened?");
+ return WidenedOp;
+ }
+ void SetWidenedVector(SDValue Op, SDValue Result);
+
+ /// Given a mask Mask, returns the larger vector into which Mask was widened.
+ SDValue GetWidenedMask(SDValue Mask, ElementCount EC) {
+ // For VP operations, we must also widen the mask. Note that the mask type
+ // may not actually need widening, leading it to be split along with the VP
+ // operation.
+ // FIXME: This could lead to an infinite split/widen loop. We only handle
+ // the case where the mask needs widening to an identically-sized type as
+ // the vector inputs.
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen binary VP op");
+ Mask = GetWidenedVector(Mask);
+ assert(Mask.getValueType().getVectorElementCount() == EC &&
+ "Unable to widen binary VP op");
+ return Mask;
+ }
+
+ // Widen Vector Result Promotion.
+ void WidenVectorResult(SDNode *N, unsigned ResNo);
+ SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo);
+ SDValue WidenVecRes_AssertZext(SDNode* N);
+ SDValue WidenVecRes_BITCAST(SDNode* N);
+ SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
+ SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
+ SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
+ SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
+ SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N);
+ SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
+ SDValue WidenVecRes_LOAD(SDNode* N);
+ SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
+ SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N);
+ SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
+ SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
+ SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N);
+ SDValue WidenVecRes_ScalarOp(SDNode* N);
+ SDValue WidenVecRes_Select(SDNode *N);
+ SDValue WidenVSELECTMask(SDNode *N);
+ SDValue WidenVecRes_SELECT_CC(SDNode* N);
+ SDValue WidenVecRes_SETCC(SDNode* N);
+ SDValue WidenVecRes_STRICT_FSETCC(SDNode* N);
+ SDValue WidenVecRes_UNDEF(SDNode *N);
+ SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
+ SDValue WidenVecRes_VECTOR_REVERSE(SDNode *N);
+
+ SDValue WidenVecRes_Ternary(SDNode *N);
+ SDValue WidenVecRes_Binary(SDNode *N);
+ SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
+ SDValue WidenVecRes_BinaryWithExtraScalarOp(SDNode *N);
+ SDValue WidenVecRes_StrictFP(SDNode *N);
+ SDValue WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo);
+ SDValue WidenVecRes_Convert(SDNode *N);
+ SDValue WidenVecRes_Convert_StrictFP(SDNode *N);
+ SDValue WidenVecRes_FP_TO_XINT_SAT(SDNode *N);
+ SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
+ SDValue WidenVecRes_IS_FPCLASS(SDNode *N);
+ SDValue WidenVecRes_ExpOp(SDNode *N);
+ SDValue WidenVecRes_Unary(SDNode *N);
+ SDValue WidenVecRes_InregOp(SDNode *N);
+
+ // Widen Vector Operand.
+ bool WidenVectorOperand(SDNode *N, unsigned OpNo);
+ SDValue WidenVecOp_BITCAST(SDNode *N);
+ SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue WidenVecOp_EXTEND(SDNode *N);
+ SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue WidenVecOp_INSERT_SUBVECTOR(SDNode *N);
+ SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue WidenVecOp_STORE(SDNode* N);
+ SDValue WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo);
+ SDValue WidenVecOp_VP_STRIDED_STORE(SDNode *N, unsigned OpNo);
+ SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
+ SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo);
+ SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
+ SDValue WidenVecOp_VP_SCATTER(SDNode* N, unsigned OpNo);
+ SDValue WidenVecOp_SETCC(SDNode* N);
+ SDValue WidenVecOp_STRICT_FSETCC(SDNode* N);
+ SDValue WidenVecOp_VSELECT(SDNode *N);
+
+ SDValue WidenVecOp_Convert(SDNode *N);
+ SDValue WidenVecOp_FP_TO_XINT_SAT(SDNode *N);
+ SDValue WidenVecOp_UnrollVectorOp(SDNode *N);
+ SDValue WidenVecOp_IS_FPCLASS(SDNode *N);
+ SDValue WidenVecOp_VECREDUCE(SDNode *N);
+ SDValue WidenVecOp_VECREDUCE_SEQ(SDNode *N);
+ SDValue WidenVecOp_VP_REDUCE(SDNode *N);
+ SDValue WidenVecOp_ExpOp(SDNode *N);
+
+ /// Helper function to generate a set of operations to perform
+ /// a vector operation for a wider type.
+ SDValue UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Helper function to generate a set of loads to load a vector with a
+ /// resulting wider type. It takes:
+ /// LdChain: list of chains for the loads to be generated.
+ /// Ld: the load to widen.
+ SDValue GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
+ LoadSDNode *LD);
+
+  /// Helper function to generate a set of extension loads to load a vector
+  /// with a resulting wider type. It takes:
+  ///   LdChain: list of chains for the loads that are generated.
+  ///   LD:      the load to widen.
+  ///   ExtType: extension element type.
+ SDValue GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
+ LoadSDNode *LD, ISD::LoadExtType ExtType);
+
+  /// Helper function to generate a set of stores to store a widened vector
+  /// into non-widened memory. Returns true if successful, false otherwise.
+  ///   StChain: list of chains for the stores we have generated
+  ///   ST:      store of a widened value
+ bool GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST);
+
+  /// Modifies a vector input (widens or narrows) to a vector of NVT. The
+  /// input vector must have the same element type as NVT.
+  /// When FillWithZeroes is "on", the vector will be widened with zeroes.
+  /// By default, the vector will be widened with undefined values.
+ SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);
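+  // Illustrative note (not part of the upstream sources): for example, with
+  // NVT = v4i32,
+  //   ModifyToType(<2 x i32> <a,b>, v4i32)       --> <a, b, undef, undef>
+  //   ModifyToType(<2 x i32> <a,b>, v4i32, true) --> <a, b, 0, 0>
+  //   ModifyToType(<8 x i32> <a,...,h>, v4i32)   --> <a, b, c, d>
+  // i.e. widening pads the tail with undef (or zeroes), while narrowing keeps
+  // the leading elements.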
+
+ /// Return a mask of vector type MaskVT to replace InMask. Also adjust
+ /// MaskVT to ToMaskVT if needed with vector extension or truncation.
+ SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT);
+
+ //===--------------------------------------------------------------------===//
+ // Generic Splitting: LegalizeTypesGeneric.cpp
+ //===--------------------------------------------------------------------===//
+
+  // Legalization methods which only rely on the fact that the illegal type is
+  // split into two not necessarily identical types. As such they can be used
+  // for splitting vectors and expanding integers and floats.
+
+ void GetSplitOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+ if (Op.getValueType().isVector())
+ GetSplitVector(Op, Lo, Hi);
+ else if (Op.getValueType().isInteger())
+ GetExpandedInteger(Op, Lo, Hi);
+ else
+ GetExpandedFloat(Op, Lo, Hi);
+ }
+
+ /// Use ISD::EXTRACT_ELEMENT nodes to extract the low and high parts of the
+ /// given value.
+ void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
+
+ // Generic Result Splitting.
+ void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_Select (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ //===--------------------------------------------------------------------===//
+ // Generic Expansion: LegalizeTypesGeneric.cpp
+ //===--------------------------------------------------------------------===//
+
+  // Legalization methods which only rely on the fact that the illegal type is
+  // split into two identical types of half the size, and that the Lo/Hi part
+  // is stored first in memory on little/big-endian machines, followed by the
+  // Hi/Lo part. As such they can be used for expanding integers and floats.
+
+ void GetExpandedOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+ if (Op.getValueType().isInteger())
+ GetExpandedInteger(Op, Lo, Hi);
+ else
+ GetExpandedFloat(Op, Lo, Hi);
+ }
+
+
+ /// This function will split the integer \p Op into \p NumElements
+ /// operations of type \p EltVT and store them in \p Ops.
+ void IntegerToVector(SDValue Op, unsigned NumElements,
+ SmallVectorImpl<SDValue> &Ops, EVT EltVT);
+
+ // Generic Result Expansion.
+ void ExpandRes_MERGE_VALUES (SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
+ void ExpandRes_BITCAST (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Generic Operand Expansion.
+ SDValue ExpandOp_BITCAST (SDNode *N);
+ SDValue ExpandOp_BUILD_VECTOR (SDNode *N);
+ SDValue ExpandOp_EXTRACT_ELEMENT (SDNode *N);
+ SDValue ExpandOp_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue ExpandOp_SCALAR_TO_VECTOR (SDNode *N);
+ SDValue ExpandOp_NormalStore (SDNode *N, unsigned OpNo);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
new file mode 100644
index 000000000000..296242c00401
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -0,0 +1,601 @@
+//===-------- LegalizeTypesGeneric.cpp - Generic type legalization --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements generic type expansion and splitting for LegalizeTypes.
+// The routines here perform legalization when the details of the type (such as
+// whether it is an integer or a float) do not matter.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type. The Lo/Hi part
+// is required to be stored first in memory on little/big-endian machines.
+// Splitting is the act of changing a computation in an illegal type to be a
+// computation in two not necessarily identical registers of a smaller type.
+// There are no requirements on how the type is represented in memory.
+//
+//===----------------------------------------------------------------------===//
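+// Illustrative sketch (not part of the upstream sources): on a 32-bit
+// little-endian target, expanding an illegal i64 value X yields two i32
+// halves with a fixed memory layout,
+//   Lo = (i32)(X & 0xffffffff);  // stored at the lower address
+//   Hi = (i32)(X >> 32);         // stored at the higher address
+// whereas splitting an illegal vector such as v8i32 simply yields two v4i32
+// halves, with no requirement on how those halves are laid out in memory.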
+
+#include "LegalizeTypes.h"
+#include "llvm/IR/DataLayout.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+//===----------------------------------------------------------------------===//
+// Generic Result Expansion.
+//===----------------------------------------------------------------------===//
+
+// These routines assume that the Lo/Hi part is stored first in memory on
+// little/big-endian machines, followed by the Hi/Lo part. This means that
+// they cannot be used as is on vectors, for which Lo is always stored first.
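+//
+// Illustrative example (not part of the upstream sources): expanding the i64
+// constant 0x0011223344556677 gives Lo = 0x44556677 and Hi = 0x00112233; a
+// little-endian machine stores Lo first and a big-endian machine stores Hi
+// first. A v2i32, by contrast, always stores element 0 first, which is why
+// these routines must not be used directly on vectors.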
+void DAGTypeLegalizer::ExpandRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ GetExpandedOp(Op, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+ SDLoc dl(N);
+
+ // Handle some special cases efficiently.
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypeLegal:
+ case TargetLowering::TypePromoteInteger:
+ break;
+ case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
+    llvm_unreachable("Bitcast of a promotion-needing float should never need "
+                     "expansion");
+ case TargetLowering::TypeSoftenFloat:
+ SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ case TargetLowering::TypeExpandInteger:
+ case TargetLowering::TypeExpandFloat: {
+ auto &DL = DAG.getDataLayout();
+ // Convert the expanded pieces of the input.
+ GetExpandedOp(InOp, Lo, Hi);
+ if (TLI.hasBigEndianPartOrdering(InVT, DL) !=
+ TLI.hasBigEndianPartOrdering(OutVT, DL))
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ }
+ case TargetLowering::TypeSplitVector:
+ GetSplitVector(InOp, Lo, Hi);
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ case TargetLowering::TypeScalarizeVector:
+ // Convert the element instead.
+ SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error("Scalarization of scalable vectors is not supported.");
+ case TargetLowering::TypeWidenVector: {
+ assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
+ InOp = GetWidenedVector(InOp);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
+ std::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ }
+ }
+
+ if (InVT.isVector() && OutVT.isInteger()) {
+ // Handle cases like i64 = BITCAST v1i64 on x86, where the operand
+ // is legal but the result is not.
+ unsigned NumElems = 2;
+ EVT ElemVT = NOutVT;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+
+ // If <ElemVT * N> is not a legal type, try <ElemVT/2 * (N*2)>.
+ while (!isTypeLegal(NVT)) {
+ unsigned NewSizeInBits = ElemVT.getSizeInBits() / 2;
+ // If the element size is smaller than byte, bail.
+ if (NewSizeInBits < 8)
+ break;
+ NumElems *= 2;
+ ElemVT = EVT::getIntegerVT(*DAG.getContext(), NewSizeInBits);
+ NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+ }
+
+ if (isTypeLegal(NVT)) {
+ SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);
+
+ SmallVector<SDValue, 8> Vals;
+ for (unsigned i = 0; i < NumElems; ++i)
+ Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
+ CastInOp, DAG.getVectorIdxConstant(i, dl)));
+
+ // Build Lo, Hi pair by pairing extracted elements if needed.
+ unsigned Slot = 0;
+ for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
+ // Each iteration will BUILD_PAIR two nodes and append the result until
+ // there are only two nodes left, i.e. Lo and Hi.
+ SDValue LHS = Vals[Slot];
+ SDValue RHS = Vals[Slot + 1];
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(LHS, RHS);
+
+ Vals.push_back(DAG.getNode(
+ ISD::BUILD_PAIR, dl,
+ EVT::getIntegerVT(*DAG.getContext(), LHS.getValueSizeInBits() << 1),
+ LHS, RHS));
+ }
+ Lo = Vals[Slot++];
+ Hi = Vals[Slot++];
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ return;
+ }
+ }
+
+ // Lower the bit-convert to a store/load from the stack.
+ assert(NOutVT.isByteSized() && "Expanded type not byte sized!");
+
+ // Create the stack frame object. Make sure it is aligned for both
+ // the source and expanded destination types.
+
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align InAlign = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+ Align NOutAlign = DAG.getReducedAlign(NOutVT, /*UseABI=*/false);
+ Align Align = std::max(InAlign, NOutAlign);
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Align);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+ // Emit a store to the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo);
+
+ // Load the first half from the stack slot.
+ Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, NOutAlign);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
+ StackPtr =
+ DAG.getMemBasePlusOffset(StackPtr, TypeSize::Fixed(IncrementSize), dl);
+
+ // Load the second half from the stack slot.
+ Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
+ PtrInfo.getWithOffset(IncrementSize), NOutAlign);
+
+ // Handle endianness of the load.
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_BUILD_PAIR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Return the operands.
+ Lo = N->getOperand(0);
+ Hi = N->getOperand(1);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_ELEMENT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ GetExpandedOp(N->getOperand(0), Lo, Hi);
+ SDValue Part = N->getConstantOperandVal(1) ? Hi : Lo;
+
+ assert(Part.getValueType() == N->getValueType(0) &&
+ "Type twice as big as expanded type not itself expanded!");
+
+ GetPairElements(Part, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue OldVec = N->getOperand(0);
+ ElementCount OldEltCount = OldVec.getValueType().getVectorElementCount();
+ EVT OldEltVT = OldVec.getValueType().getVectorElementType();
+ SDLoc dl(N);
+
+ // Convert to a vector of the expanded element type, for example
+ // <3 x i64> -> <6 x i32>.
+ EVT OldVT = N->getValueType(0);
+ EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+
+ if (OldVT != OldEltVT) {
+ // The result of EXTRACT_VECTOR_ELT may be larger than the element type of
+ // the input vector. If so, extend the elements of the input vector to the
+ // same bitwidth as the result before expanding.
+    assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!");
+ EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldEltCount);
+ OldVec = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0));
+ }
+
+ SDValue NewVec = DAG.getNode(
+ ISD::BITCAST, dl,
+ EVT::getVectorVT(*DAG.getContext(), NewVT, OldEltCount * 2), OldVec);
+
+ // Extract the elements at 2 * Idx and 2 * Idx + 1 from the new vector.
+ SDValue Idx = N->getOperand(1);
+
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+ Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+ DAG.getConstant(1, dl, Idx.getValueType()));
+ Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
+ SDLoc dl(N);
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ assert(!LD->isAtomic() && "Atomics can not be split");
+ EVT ValueVT = LD->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(),
+ AAInfo);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits() / 8;
+ Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(IncrementSize), dl);
+ Hi = DAG.getLoad(
+ NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize),
+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), AAInfo);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Handle endianness of the load.
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDValue Chain = N->getOperand(0);
+ SDValue Ptr = N->getOperand(1);
+ SDLoc dl(N);
+ const unsigned Align = N->getConstantOperandVal(3);
+
+ Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align);
+ Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);
+ Chain = Hi.getValue(1);
+
+ // Handle endianness of the load.
+ if (TLI.hasBigEndianPartOrdering(OVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
+
+//===--------------------------------------------------------------------===//
+// Generic Operand Expansion.
+//===--------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::IntegerToVector(SDValue Op, unsigned NumElements,
+ SmallVectorImpl<SDValue> &Ops,
+ EVT EltVT) {
+ assert(Op.getValueType().isInteger());
+ SDLoc DL(Op);
+ SDValue Parts[2];
+
+ if (NumElements > 1) {
+ NumElements >>= 1;
+ SplitInteger(Op, Parts[0], Parts[1]);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Parts[0], Parts[1]);
+ IntegerToVector(Parts[0], NumElements, Ops, EltVT);
+ IntegerToVector(Parts[1], NumElements, Ops, EltVT);
+ } else {
+ Ops.push_back(DAG.getNode(ISD::BITCAST, DL, EltVT, Op));
+ }
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
+ SDLoc dl(N);
+ if (N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isInteger()) {
+ // An illegal expanding type is being converted to a legal vector type.
+ // Make a two element vector out of the expanded parts and convert that
+ // instead, but only if the new vector type is legal (otherwise there
+ // is no point, and it might create expansion loops). For example, on
+ // x86 this turns v1i64 = BITCAST i64 into v1i64 = BITCAST v2i32.
+ //
+ // FIXME: I'm not sure why we are first trying to split the input into
+ // a 2 element vector, so I'm leaving it here to maintain the current
+ // behavior.
+ unsigned NumElts = 2;
+ EVT OVT = N->getOperand(0).getValueType();
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(),
+ TLI.getTypeToTransformTo(*DAG.getContext(), OVT),
+ NumElts);
+ if (!isTypeLegal(NVT)) {
+ // If we can't find a legal type by splitting the integer in half,
+ // then we can use the node's value type.
+ NumElts = N->getValueType(0).getVectorNumElements();
+ NVT = N->getValueType(0);
+ }
+
+ SmallVector<SDValue, 8> Ops;
+ IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType());
+
+ SDValue Vec = DAG.getBuildVector(NVT, dl, ArrayRef(Ops.data(), NumElts));
+ return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
+ }
+
+ // Otherwise, store to a temporary and load out again as the new type.
+ return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
+ // The vector type is legal but the element type needs expansion.
+ EVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ EVT OldVT = N->getOperand(0).getValueType();
+ EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+ SDLoc dl(N);
+
+ assert(OldVT == VecVT.getVectorElementType() &&
+ "BUILD_VECTOR operand type doesn't match vector element type!");
+
+ // Build a vector of twice the length out of the expanded elements.
+ // For example <3 x i64> -> <6 x i32>.
+ SmallVector<SDValue, 16> NewElts;
+ NewElts.reserve(NumElts*2);
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Lo, Hi;
+ GetExpandedOp(N->getOperand(i), Lo, Hi);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+ NewElts.push_back(Lo);
+ NewElts.push_back(Hi);
+ }
+
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NewElts.size());
+ SDValue NewVec = DAG.getBuildVector(NewVecVT, dl, NewElts);
+
+ // Convert the new vector to the old vector type.
+ return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_EXTRACT_ELEMENT(SDNode *N) {
+ SDValue Lo, Hi;
+ GetExpandedOp(N->getOperand(0), Lo, Hi);
+ return N->getConstantOperandVal(1) ? Hi : Lo;
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
+ // The vector type is legal but the element type needs expansion.
+ EVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDLoc dl(N);
+
+ SDValue Val = N->getOperand(1);
+ EVT OldEVT = Val.getValueType();
+ EVT NewEVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldEVT);
+
+ assert(OldEVT == VecVT.getVectorElementType() &&
+ "Inserted element type doesn't match vector element type!");
+
+ // Bitconvert to a vector of twice the length with elements of the expanded
+ // type, insert the expanded vector elements, and then convert back.
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEVT, NumElts*2);
+ SDValue NewVec = DAG.getNode(ISD::BITCAST, dl,
+ NewVecVT, N->getOperand(0));
+
+ SDValue Lo, Hi;
+ GetExpandedOp(Val, Lo, Hi);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ SDValue Idx = N->getOperand(2);
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+ NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
+ Idx = DAG.getNode(ISD::ADD, dl,
+ Idx.getValueType(), Idx,
+ DAG.getConstant(1, dl, Idx.getValueType()));
+ NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);
+
+ // Convert the new vector to the old vector type.
+ return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ assert(VT.getVectorElementType() == N->getOperand(0).getValueType() &&
+ "SCALAR_TO_VECTOR operand type doesn't match vector element type!");
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(NumElts);
+ Ops[0] = N->getOperand(0);
+ SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType());
+ for (unsigned i = 1; i < NumElts; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getBuildVector(VT, dl, Ops);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
+ assert(ISD::isNormalStore(N) && "This routine only for normal stores!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+ SDLoc dl(N);
+
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ assert(!St->isAtomic() && "Atomics can not be split");
+ EVT ValueVT = St->getValue().getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
+ SDValue Chain = St->getChain();
+ SDValue Ptr = St->getBasePtr();
+ AAMDNodes AAInfo = St->getAAInfo();
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ unsigned IncrementSize = NVT.getSizeInBits() / 8;
+
+ SDValue Lo, Hi;
+ GetExpandedOp(St->getValue(), Lo, Hi);
+
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
+ St->getOriginalAlign(), St->getMemOperand()->getFlags(),
+ AAInfo);
+
+ Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize));
+ Hi = DAG.getStore(
+ Chain, dl, Hi, Ptr, St->getPointerInfo().getWithOffset(IncrementSize),
+ St->getOriginalAlign(), St->getMemOperand()->getFlags(), AAInfo);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
+
+
+//===--------------------------------------------------------------------===//
+// Generic Result Splitting.
+//===--------------------------------------------------------------------===//
+
+// Be careful to make no assumptions about which of Lo/Hi is stored first in
+// memory (for vectors it is always Lo first followed by Hi in the following
+// bytes; for integers and floats it is Lo first if and only if the machine is
+// little-endian).
+
+void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ GetSplitOp(Op, Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue LL, LH, RL, RH, CL, CH;
+ SDLoc dl(N);
+ unsigned Opcode = N->getOpcode();
+ GetSplitOp(N->getOperand(1), LL, LH);
+ GetSplitOp(N->getOperand(2), RL, RH);
+
+ SDValue Cond = N->getOperand(0);
+ CL = CH = Cond;
+ if (Cond.getValueType().isVector()) {
+ if (SDValue Res = WidenVSELECTMask(N))
+ std::tie(CL, CH) = DAG.SplitVector(Res, dl);
+    // Check if there are already split versions of the vector available and
+    // use those instead of splitting the mask operand again.
+ else if (getTypeAction(Cond.getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(Cond, CL, CH);
+ // It seems to improve code to generate two narrow SETCCs as opposed to
+ // splitting a wide result vector.
+ else if (Cond.getOpcode() == ISD::SETCC) {
+ // If the condition is a vXi1 vector, and the LHS of the setcc is a legal
+ // type and the setcc result type is the same vXi1, then leave the setcc
+ // alone.
+ EVT CondLHSVT = Cond.getOperand(0).getValueType();
+ if (Cond.getValueType().getVectorElementType() == MVT::i1 &&
+ isTypeLegal(CondLHSVT) &&
+ getSetCCResultType(CondLHSVT) == Cond.getValueType())
+ std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
+ else
+ SplitVecRes_SETCC(Cond.getNode(), CL, CH);
+ } else
+ std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
+ }
+
+ if (Opcode != ISD::VP_SELECT && Opcode != ISD::VP_MERGE) {
+ Lo = DAG.getNode(Opcode, dl, LL.getValueType(), CL, LL, RL);
+ Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH);
+ return;
+ }
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(3), N->getValueType(0), dl);
+
+ Lo = DAG.getNode(Opcode, dl, LL.getValueType(), CL, LL, RL, EVLLo);
+ Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LL, LH, RL, RH;
+ SDLoc dl(N);
+ GetSplitOp(N->getOperand(2), LL, LH);
+ GetSplitOp(N->getOperand(3), RL, RH);
+
+ Lo = DAG.getNode(ISD::SELECT_CC, dl, LL.getValueType(), N->getOperand(0),
+ N->getOperand(1), LL, RL, N->getOperand(4));
+ Hi = DAG.getNode(ISD::SELECT_CC, dl, LH.getValueType(), N->getOperand(0),
+ N->getOperand(1), LH, RH, N->getOperand(4));
+}
+
+void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ Lo = DAG.getUNDEF(LoVT);
+ Hi = DAG.getUNDEF(HiVT);
+}
+
+void DAGTypeLegalizer::SplitVecRes_AssertZext(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue L, H;
+ SDLoc dl(N);
+ GetSplitOp(N->getOperand(0), L, H);
+
+ Lo = DAG.getNode(ISD::AssertZext, dl, L.getValueType(), L, N->getOperand(1));
+ Hi = DAG.getNode(ISD::AssertZext, dl, H.getValueType(), H, N->getOperand(1));
+}
+
+void DAGTypeLegalizer::SplitRes_FREEZE(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue L, H;
+ SDLoc dl(N);
+ GetSplitOp(N->getOperand(0), L, H);
+
+ Lo = DAG.getNode(ISD::FREEZE, dl, L.getValueType(), L);
+ Hi = DAG.getNode(ISD::FREEZE, dl, H.getValueType(), H);
+}
+
+void DAGTypeLegalizer::SplitRes_ARITH_FENCE(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue L, H;
+ SDLoc DL(N);
+ GetSplitOp(N->getOperand(0), L, H);
+
+ Lo = DAG.getNode(ISD::ARITH_FENCE, DL, L.getValueType(), L);
+ Hi = DAG.getNode(ISD::ARITH_FENCE, DL, H.getValueType(), H);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
new file mode 100644
index 000000000000..3862fd241897
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -0,0 +1,1771 @@
+//===- LegalizeVectorOps.cpp - Implement SelectionDAG::LegalizeVectors ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeVectors method.
+//
+// The vector legalizer looks for vector operations which might need to be
+// scalarized and legalizes them. This is a separate step from Legalize because
+// scalarizing can introduce illegal types. For example, suppose we have an
+// ISD::SDIV of type v2i64 on x86-32. The type is legal (for example, addition
+// on a v2i64 is legal), but ISD::SDIV isn't legal, so we have to unroll the
+// operation, which introduces nodes with the illegal type i64 which must be
+// expanded. Similarly, suppose we have an ISD::SRA of type v16i8 on PowerPC;
+// the operation must be unrolled, which introduces nodes with the illegal
+// type i8 which must be promoted.
+//
+// This does not legalize vector manipulations like ISD::BUILD_VECTOR,
+// or operations that happen to take a vector which are custom-lowered;
+// the legalization for such operations never produces nodes
+// with illegal types, so it's okay to put off legalizing them until
+// SelectionDAG::Legalize runs.
+//
+//===----------------------------------------------------------------------===//
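+//
+// Illustrative sketch (not part of the upstream sources): unrolling an
+// ISD::SDIV of type v2i64 conceptually produces, for operands A and B,
+//   a0 = EXTRACT_VECTOR_ELT A, 0     a1 = EXTRACT_VECTOR_ELT A, 1
+//   b0 = EXTRACT_VECTOR_ELT B, 0     b1 = EXTRACT_VECTOR_ELT B, 1
+//   r  = BUILD_VECTOR (SDIV a0, b0), (SDIV a1, b1)
+// and it is these scalar i64 SDIV nodes that the type legalizer must later
+// expand on a 32-bit target.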
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "legalizevectorops"
+
+namespace {
+
+class VectorLegalizer {
+ SelectionDAG& DAG;
+ const TargetLowering &TLI;
+ bool Changed = false; // Keep track of whether anything changed
+
+ /// For nodes that are of legal width, and that have more than one use, this
+ /// map indicates what regularized operand to use. This allows us to avoid
+ /// legalizing the same thing more than once.
+ SmallDenseMap<SDValue, SDValue, 64> LegalizedNodes;
+
+ /// Adds a node to the translation cache.
+ void AddLegalizedOperand(SDValue From, SDValue To) {
+ LegalizedNodes.insert(std::make_pair(From, To));
+ // If someone requests legalization of the new node, return itself.
+ if (From != To)
+ LegalizedNodes.insert(std::make_pair(To, To));
+ }
+
+ /// Legalizes the given node.
+ SDValue LegalizeOp(SDValue Op);
+
+ /// Assuming the node is legal, "legalize" the results.
+ SDValue TranslateLegalizeResults(SDValue Op, SDNode *Result);
+
+ /// Make sure Results are legal and update the translation cache.
+ SDValue RecursivelyLegalizeResults(SDValue Op,
+ MutableArrayRef<SDValue> Results);
+
+ /// Wrapper to interface LowerOperation with a vector of Results.
+ /// Returns false if the target wants to use default expansion. Otherwise
+ /// returns true. If return is true and the Results are empty, then the
+ /// target wants to keep the input node as is.
+ bool LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results);
+
+ /// Implements unrolling a VSETCC.
+ SDValue UnrollVSETCC(SDNode *Node);
+
+ /// Implement expand-based legalization of vector operations.
+ ///
+ /// This is just a high-level routine to dispatch to specific code paths for
+ /// operations to legalize them.
+ void Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+ /// Implements expansion for FP_TO_UINT; falls back to UnrollVectorOp if
+ /// FP_TO_SINT isn't legal.
+ void ExpandFP_TO_UINT(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+  /// Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
+  /// SINT_TO_FLOAT and SHR on vectors aren't legal.
+ void ExpandUINT_TO_FLOAT(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+ /// Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
+ SDValue ExpandSEXTINREG(SDNode *Node);
+
+ /// Implement expansion for ANY_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place and bitcasts to the proper
+ /// type. The contents of the bits in the extended part of each element are
+ /// undef.
+ SDValue ExpandANY_EXTEND_VECTOR_INREG(SDNode *Node);
+
+ /// Implement expansion for SIGN_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place, bitcasts to the proper
+ /// type, then shifts left and arithmetic shifts right to introduce a sign
+ /// extension.
+ SDValue ExpandSIGN_EXTEND_VECTOR_INREG(SDNode *Node);
+
+ /// Implement expansion for ZERO_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place and blends zeros into
+ /// the remaining lanes, finally bitcasting to the proper type.
+ SDValue ExpandZERO_EXTEND_VECTOR_INREG(SDNode *Node);
+
+ /// Expand bswap of vectors into a shuffle if legal.
+ SDValue ExpandBSWAP(SDNode *Node);
+
+ /// Implement vselect in terms of XOR, AND, OR when blend is not
+ /// supported by the target.
+ SDValue ExpandVSELECT(SDNode *Node);
+ SDValue ExpandVP_SELECT(SDNode *Node);
+ SDValue ExpandVP_MERGE(SDNode *Node);
+ SDValue ExpandVP_REM(SDNode *Node);
+ SDValue ExpandSELECT(SDNode *Node);
+ std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
+ SDValue ExpandStore(SDNode *N);
+ SDValue ExpandFNEG(SDNode *Node);
+ void ExpandFSUB(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandSETCC(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandBITREVERSE(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandUADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandSADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandMULO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandFixedPointDiv(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandStrictFPOp(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandREM(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+ void UnrollStrictFPOp(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+ /// Implements vector promotion.
+ ///
+ /// This is essentially just bitcasting the operands to a different type and
+ /// bitcasting the result back to the original type.
+ void Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+ /// Implements [SU]INT_TO_FP vector promotion.
+ ///
+ /// This is a [zs]ext of the input operand to a larger integer type.
+ void PromoteINT_TO_FP(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+ /// Implements FP_TO_[SU]INT vector promotion of the result type.
+ ///
+ /// It is promoted to a larger integer type. The result is then
+ /// truncated back to the original type.
+ void PromoteFP_TO_INT(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+public:
+ VectorLegalizer(SelectionDAG& dag) :
+ DAG(dag), TLI(dag.getTargetLoweringInfo()) {}
+
+  /// Begin legalizing the vector operations in the DAG.
+ bool Run();
+};
+
+} // end anonymous namespace
+
+bool VectorLegalizer::Run() {
+ // Before we start legalizing vector nodes, check if there are any vectors.
+ bool HasVectors = false;
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = std::prev(DAG.allnodes_end()); I != std::next(E); ++I) {
+ // Check if the values of the nodes contain vectors. We don't need to check
+ // the operands because we are going to check their values at some point.
+ HasVectors = llvm::any_of(I->values(), [](EVT T) { return T.isVector(); });
+
+ // If we found a vector node we can start the legalization.
+ if (HasVectors)
+ break;
+ }
+
+ // If this basic block has no vectors then no need to legalize vectors.
+ if (!HasVectors)
+ return false;
+
+ // The legalize process is inherently a bottom-up recursive process (users
+ // legalize their uses before themselves). Given infinite stack space, we
+ // could just start legalizing on the root and traverse the whole graph. In
+ // practice however, this causes us to run out of stack space on large basic
+ // blocks. To avoid this problem, compute an ordering of the nodes where each
+ // node is only legalized after all of its operands are legalized.
+ DAG.AssignTopologicalOrder();
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = std::prev(DAG.allnodes_end()); I != std::next(E); ++I)
+ LegalizeOp(SDValue(&*I, 0));
+
+ // Finally, it's possible the root changed. Get the new root.
+ SDValue OldRoot = DAG.getRoot();
+ assert(LegalizedNodes.count(OldRoot) && "Root didn't get legalized?");
+ DAG.setRoot(LegalizedNodes[OldRoot]);
+
+ LegalizedNodes.clear();
+
+ // Remove dead nodes now.
+ DAG.RemoveDeadNodes();
+
+ return Changed;
+}
+
+SDValue VectorLegalizer::TranslateLegalizeResults(SDValue Op, SDNode *Result) {
+ assert(Op->getNumValues() == Result->getNumValues() &&
+ "Unexpected number of results");
+ // Generic legalization: just pass the operand through.
+ for (unsigned i = 0, e = Op->getNumValues(); i != e; ++i)
+ AddLegalizedOperand(Op.getValue(i), SDValue(Result, i));
+ return SDValue(Result, Op.getResNo());
+}
+
+SDValue
+VectorLegalizer::RecursivelyLegalizeResults(SDValue Op,
+ MutableArrayRef<SDValue> Results) {
+ assert(Results.size() == Op->getNumValues() &&
+ "Unexpected number of results");
+ // Make sure that the generated code is itself legal.
+ for (unsigned i = 0, e = Results.size(); i != e; ++i) {
+ Results[i] = LegalizeOp(Results[i]);
+ AddLegalizedOperand(Op.getValue(i), Results[i]);
+ }
+
+ return Results[Op.getResNo()];
+}
+
+SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
+ // Note that LegalizeOp may be reentered even from single-use nodes, which
+ // means that we always must cache transformed nodes.
+ DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+ if (I != LegalizedNodes.end()) return I->second;
+
+ // Legalize the operands
+ SmallVector<SDValue, 8> Ops;
+ for (const SDValue &Oper : Op->op_values())
+ Ops.push_back(LegalizeOp(Oper));
+
+ SDNode *Node = DAG.UpdateNodeOperands(Op.getNode(), Ops);
+
+ bool HasVectorValueOrOp =
+ llvm::any_of(Node->values(), [](EVT T) { return T.isVector(); }) ||
+ llvm::any_of(Node->op_values(),
+ [](SDValue O) { return O.getValueType().isVector(); });
+ if (!HasVectorValueOrOp)
+ return TranslateLegalizeResults(Op, Node);
+
+ TargetLowering::LegalizeAction Action = TargetLowering::Legal;
+ EVT ValVT;
+ switch (Op.getOpcode()) {
+ default:
+ return TranslateLegalizeResults(Op, Node);
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Node);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ EVT LoadedVT = LD->getMemoryVT();
+ if (LoadedVT.isVector() && ExtType != ISD::NON_EXTLOAD)
+ Action = TLI.getLoadExtAction(ExtType, LD->getValueType(0), LoadedVT);
+ break;
+ }
+ case ISD::STORE: {
+ StoreSDNode *ST = cast<StoreSDNode>(Node);
+ EVT StVT = ST->getMemoryVT();
+ MVT ValVT = ST->getValue().getSimpleValueType();
+ if (StVT.isVector() && ST->isTruncatingStore())
+ Action = TLI.getTruncStoreAction(ValVT, StVT);
+ break;
+ }
+ case ISD::MERGE_VALUES:
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ // This operation lies about being legal: when it claims to be legal,
+ // it should actually be expanded.
+ if (Action == TargetLowering::Legal)
+ Action = TargetLowering::Expand;
+ break;
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case ISD::STRICT_##DAGN:
+#include "llvm/IR/ConstrainedOps.def"
+ ValVT = Node->getValueType(0);
+ if (Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_UINT_TO_FP)
+ ValVT = Node->getOperand(1).getValueType();
+ if (Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS) {
+ MVT OpVT = Node->getOperand(1).getSimpleValueType();
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(3))->get();
+ Action = TLI.getCondCodeAction(CCCode, OpVT);
+ if (Action == TargetLowering::Legal)
+ Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
+ } else {
+ Action = TLI.getOperationAction(Node->getOpcode(), ValVT);
+ }
+ // If we're asked to expand a strict vector floating-point operation,
+ // by default we're going to simply unroll it. That is usually the
+ // best approach, except in the case where the resulting strict (scalar)
+ // operations would themselves use the fallback mutation to non-strict.
+ // In that specific case, just do the fallback on the vector op.
+ if (Action == TargetLowering::Expand && !TLI.isStrictFPEnabled() &&
+ TLI.getStrictFPOperationAction(Node->getOpcode(), ValVT) ==
+ TargetLowering::Legal) {
+ EVT EltVT = ValVT.getVectorElementType();
+ if (TLI.getOperationAction(Node->getOpcode(), EltVT)
+ == TargetLowering::Expand &&
+ TLI.getStrictFPOperationAction(Node->getOpcode(), EltVT)
+ == TargetLowering::Legal)
+ Action = TargetLowering::Legal;
+ }
+ break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::FSHL:
+ case ISD::FSHR:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::ABS:
+ case ISD::BSWAP:
+ case ISD::BITREVERSE:
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ case ISD::SELECT:
+ case ISD::VSELECT:
+ case ISD::SELECT_CC:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::TRUNCATE:
+ case ISD::SIGN_EXTEND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::FNEG:
+ case ISD::FABS:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ case ISD::FCOPYSIGN:
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FLDEXP:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FFLOOR:
+ case ISD::FP_ROUND:
+ case ISD::FP_EXTEND:
+ case ISD::FMA:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI:
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ case ISD::FCANONICALIZE:
+ case ISD::FFREXP:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ case ISD::MGATHER:
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ break;
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
+ case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: {
+ unsigned Scale = Node->getConstantOperandVal(2);
+ Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
+ Node->getValueType(0), Scale);
+ break;
+ }
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(0).getValueType());
+ break;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(1).getValueType());
+ break;
+ case ISD::SETCC: {
+ MVT OpVT = Node->getOperand(0).getSimpleValueType();
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(2))->get();
+ Action = TLI.getCondCodeAction(CCCode, OpVT);
+ if (Action == TargetLowering::Legal)
+ Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
+ break;
+ }
+
+#define BEGIN_REGISTER_VP_SDNODE(VPID, LEGALPOS, ...) \
+ case ISD::VPID: { \
+ EVT LegalizeVT = LEGALPOS < 0 ? Node->getValueType(-(1 + LEGALPOS)) \
+ : Node->getOperand(LEGALPOS).getValueType(); \
+ if (ISD::VPID == ISD::VP_SETCC) { \
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(2))->get(); \
+ Action = TLI.getCondCodeAction(CCCode, LegalizeVT.getSimpleVT()); \
+ if (Action != TargetLowering::Legal) \
+ break; \
+ } \
+ Action = TLI.getOperationAction(Node->getOpcode(), LegalizeVT); \
+ } break;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+
+ LLVM_DEBUG(dbgs() << "\nLegalizing vector op: "; Node->dump(&DAG));
+
+ SmallVector<SDValue, 8> ResultVals;
+ switch (Action) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Promote:
+ assert((Op.getOpcode() != ISD::LOAD && Op.getOpcode() != ISD::STORE) &&
+ "This action is not supported yet!");
+ LLVM_DEBUG(dbgs() << "Promoting\n");
+ Promote(Node, ResultVals);
+ assert(!ResultVals.empty() && "No results for promotion?");
+ break;
+ case TargetLowering::Legal:
+ LLVM_DEBUG(dbgs() << "Legal node: nothing to do\n");
+ break;
+ case TargetLowering::Custom:
+ LLVM_DEBUG(dbgs() << "Trying custom legalization\n");
+ if (LowerOperationWrapper(Node, ResultVals))
+ break;
+ LLVM_DEBUG(dbgs() << "Could not custom legalize node\n");
+ [[fallthrough]];
+ case TargetLowering::Expand:
+ LLVM_DEBUG(dbgs() << "Expanding\n");
+ Expand(Node, ResultVals);
+ break;
+ }
+
+ if (ResultVals.empty())
+ return TranslateLegalizeResults(Op, Node);
+
+ Changed = true;
+ return RecursivelyLegalizeResults(Op, ResultVals);
+}
+
+// FIXME: This is very similar to TargetLowering::LowerOperationWrapper. Can we
+// merge them somehow?
+bool VectorLegalizer::LowerOperationWrapper(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+
+ if (!Res.getNode())
+ return false;
+
+ if (Res == SDValue(Node, 0))
+ return true;
+
+ // If the original node has one result, take the return value from
+ // LowerOperation as is. It might not be result number 0.
+ if (Node->getNumValues() == 1) {
+ Results.push_back(Res);
+ return true;
+ }
+
+ // If the original node has multiple results, then the return node should
+ // have the same number of results.
+ assert((Node->getNumValues() == Res->getNumValues()) &&
+ "Lowering returned the wrong number of results!");
+
+  // Place the new result values based on the result number of N.
+ for (unsigned I = 0, E = Node->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+
+ return true;
+}
+
+void VectorLegalizer::Promote(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
+ // For a few operations there is a specific concept for promotion based on
+ // the operand's type.
+ switch (Node->getOpcode()) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ // "Promote" the operation by extending the operand.
+ PromoteINT_TO_FP(Node, Results);
+ return;
+ case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ // Promote the operation by extending the operand.
+ PromoteFP_TO_INT(Node, Results);
+ return;
+ case ISD::FP_ROUND:
+ case ISD::FP_EXTEND:
+ // These operations are used to do promotion so they can't be promoted
+ // themselves.
+ llvm_unreachable("Don't know how to promote this operation!");
+ }
+
+ // There are currently two cases of vector promotion:
+ // 1) Bitcasting a vector of integers to a different type to a vector of the
+ // same overall length. For example, x86 promotes ISD::AND v2i32 to v1i64.
+ // 2) Extending a vector of floats to a vector of the same number of larger
+ // floats. For example, AArch64 promotes ISD::FADD on v4f16 to v4f32.
+ assert(Node->getNumValues() == 1 &&
+ "Can't promote a vector with multiple results!");
+ MVT VT = Node->getSimpleValueType(0);
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+ SDLoc dl(Node);
+ SmallVector<SDValue, 4> Operands(Node->getNumOperands());
+
+ for (unsigned j = 0; j != Node->getNumOperands(); ++j) {
+ if (Node->getOperand(j).getValueType().isVector())
+ if (Node->getOperand(j)
+ .getValueType()
+ .getVectorElementType()
+ .isFloatingPoint() &&
+ NVT.isVector() && NVT.getVectorElementType().isFloatingPoint())
+ Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(j));
+ else
+ Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(j));
+ else
+ Operands[j] = Node->getOperand(j);
+ }
+
+ SDValue Res =
+ DAG.getNode(Node->getOpcode(), dl, NVT, Operands, Node->getFlags());
+
+ if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) ||
+ (VT.isVector() && VT.getVectorElementType().isFloatingPoint() &&
+ NVT.isVector() && NVT.getVectorElementType().isFloatingPoint()))
+ Res = DAG.getNode(ISD::FP_ROUND, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
+ else
+ Res = DAG.getNode(ISD::BITCAST, dl, VT, Res);
+
+ Results.push_back(Res);
+}
+
+void VectorLegalizer::PromoteINT_TO_FP(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ // INT_TO_FP operations may require the input operand be promoted even
+ // when the type is otherwise legal.
+ bool IsStrict = Node->isStrictFPOpcode();
+ MVT VT = Node->getOperand(IsStrict ? 1 : 0).getSimpleValueType();
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+ assert(NVT.getVectorNumElements() == VT.getVectorNumElements() &&
+ "Vectors have different number of elements!");
+
+ SDLoc dl(Node);
+ SmallVector<SDValue, 4> Operands(Node->getNumOperands());
+
+ unsigned Opc = (Node->getOpcode() == ISD::UINT_TO_FP ||
+ Node->getOpcode() == ISD::STRICT_UINT_TO_FP)
+ ? ISD::ZERO_EXTEND
+ : ISD::SIGN_EXTEND;
+ for (unsigned j = 0; j != Node->getNumOperands(); ++j) {
+ if (Node->getOperand(j).getValueType().isVector())
+ Operands[j] = DAG.getNode(Opc, dl, NVT, Node->getOperand(j));
+ else
+ Operands[j] = Node->getOperand(j);
+ }
+
+ if (IsStrict) {
+ SDValue Res = DAG.getNode(Node->getOpcode(), dl,
+ {Node->getValueType(0), MVT::Other}, Operands);
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
+ SDValue Res =
+ DAG.getNode(Node->getOpcode(), dl, Node->getValueType(0), Operands);
+ Results.push_back(Res);
+}
+
+// For FP_TO_INT we promote the result type to a vector type with wider
+// elements and then truncate the result. This is different from the default
+// PromoteVector, which uses bitcast to promote and thus assumes that the
+// promoted vector type has the same overall size.
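+//
+// Illustrative example (not part of the upstream sources): FP_TO_SINT from
+// v4f32 to an illegal v4i8 with NVT = v4i32 becomes
+//   t = FP_TO_SINT v4f32 -> v4i32
+//   t = AssertSext t, i8
+//   r = TRUNCATE t -> v4i8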
+void VectorLegalizer::PromoteFP_TO_INT(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ MVT VT = Node->getSimpleValueType(0);
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+ bool IsStrict = Node->isStrictFPOpcode();
+ assert(NVT.getVectorNumElements() == VT.getVectorNumElements() &&
+ "Vectors have different number of elements!");
+
+ unsigned NewOpc = Node->getOpcode();
+ // Change FP_TO_UINT to FP_TO_SINT if possible.
+ // TODO: Should we only do this if FP_TO_UINT itself isn't legal?
+ if (NewOpc == ISD::FP_TO_UINT &&
+ TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
+ NewOpc = ISD::FP_TO_SINT;
+
+ if (NewOpc == ISD::STRICT_FP_TO_UINT &&
+ TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT))
+ NewOpc = ISD::STRICT_FP_TO_SINT;
+
+ SDLoc dl(Node);
+ SDValue Promoted, Chain;
+ if (IsStrict) {
+ Promoted = DAG.getNode(NewOpc, dl, {NVT, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(1)});
+ Chain = Promoted.getValue(1);
+ } else
+ Promoted = DAG.getNode(NewOpc, dl, NVT, Node->getOperand(0));
+
+ // Assert that the converted value fits in the original type. If it doesn't
+ // (eg: because the value being converted is too big), then the result of the
+ // original operation was undefined anyway, so the assert is still correct.
+ if (Node->getOpcode() == ISD::FP_TO_UINT ||
+ Node->getOpcode() == ISD::STRICT_FP_TO_UINT)
+ NewOpc = ISD::AssertZext;
+ else
+ NewOpc = ISD::AssertSext;
+
+ Promoted = DAG.getNode(NewOpc, dl, NVT, Promoted,
+ DAG.getValueType(VT.getScalarType()));
+ Promoted = DAG.getNode(ISD::TRUNCATE, dl, VT, Promoted);
+ Results.push_back(Promoted);
+ if (IsStrict)
+ Results.push_back(Chain);
+}
+
+std::pair<SDValue, SDValue> VectorLegalizer::ExpandLoad(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ return TLI.scalarizeVectorLoad(LD, DAG);
+}
+
+SDValue VectorLegalizer::ExpandStore(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue TF = TLI.scalarizeVectorStore(ST, DAG);
+ return TF;
+}
+
+void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
+ switch (Node->getOpcode()) {
+ case ISD::LOAD: {
+ std::pair<SDValue, SDValue> Tmp = ExpandLoad(Node);
+ Results.push_back(Tmp.first);
+ Results.push_back(Tmp.second);
+ return;
+ }
+ case ISD::STORE:
+ Results.push_back(ExpandStore(Node));
+ return;
+ case ISD::MERGE_VALUES:
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+ Results.push_back(Node->getOperand(i));
+ return;
+ case ISD::SIGN_EXTEND_INREG:
+ Results.push_back(ExpandSEXTINREG(Node));
+ return;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ Results.push_back(ExpandANY_EXTEND_VECTOR_INREG(Node));
+ return;
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ Results.push_back(ExpandSIGN_EXTEND_VECTOR_INREG(Node));
+ return;
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Results.push_back(ExpandZERO_EXTEND_VECTOR_INREG(Node));
+ return;
+ case ISD::BSWAP:
+ Results.push_back(ExpandBSWAP(Node));
+ return;
+ case ISD::VP_BSWAP:
+ Results.push_back(TLI.expandVPBSWAP(Node, DAG));
+ return;
+ case ISD::VSELECT:
+ Results.push_back(ExpandVSELECT(Node));
+ return;
+ case ISD::VP_SELECT:
+ Results.push_back(ExpandVP_SELECT(Node));
+ return;
+ case ISD::VP_SREM:
+ case ISD::VP_UREM:
+ if (SDValue Expanded = ExpandVP_REM(Node)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::SELECT:
+ Results.push_back(ExpandSELECT(Node));
+ return;
+ case ISD::SELECT_CC: {
+ if (Node->getValueType(0).isScalableVector()) {
+ EVT CondVT = TLI.getSetCCResultType(
+ DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0));
+ SDValue SetCC =
+ DAG.getNode(ISD::SETCC, SDLoc(Node), CondVT, Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(4));
+ Results.push_back(DAG.getSelect(SDLoc(Node), Node->getValueType(0), SetCC,
+ Node->getOperand(2),
+ Node->getOperand(3)));
+ return;
+ }
+ break;
+ }
+ case ISD::FP_TO_UINT:
+ ExpandFP_TO_UINT(Node, Results);
+ return;
+ case ISD::UINT_TO_FP:
+ ExpandUINT_TO_FLOAT(Node, Results);
+ return;
+ case ISD::FNEG:
+ Results.push_back(ExpandFNEG(Node));
+ return;
+ case ISD::FSUB:
+ ExpandFSUB(Node, Results);
+ return;
+ case ISD::SETCC:
+ case ISD::VP_SETCC:
+ ExpandSETCC(Node, Results);
+ return;
+ case ISD::ABS:
+ if (SDValue Expanded = TLI.expandABS(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::ABDS:
+ case ISD::ABDU:
+ if (SDValue Expanded = TLI.expandABD(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::BITREVERSE:
+ ExpandBITREVERSE(Node, Results);
+ return;
+ case ISD::VP_BITREVERSE:
+ if (SDValue Expanded = TLI.expandVPBITREVERSE(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::CTPOP:
+ if (SDValue Expanded = TLI.expandCTPOP(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::VP_CTPOP:
+ if (SDValue Expanded = TLI.expandVPCTPOP(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ if (SDValue Expanded = TLI.expandCTLZ(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::VP_CTLZ:
+ case ISD::VP_CTLZ_ZERO_UNDEF:
+ if (SDValue Expanded = TLI.expandVPCTLZ(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ if (SDValue Expanded = TLI.expandCTTZ(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::VP_CTTZ:
+ case ISD::VP_CTTZ_ZERO_UNDEF:
+ if (SDValue Expanded = TLI.expandVPCTTZ(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::FSHL:
+ case ISD::VP_FSHL:
+ case ISD::FSHR:
+ case ISD::VP_FSHR:
+ if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::ROTL:
+ case ISD::ROTR:
+ if (SDValue Expanded = TLI.expandROT(Node, false /*AllowVectorOps*/, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ if (SDValue Expanded = TLI.expandIntMINMAX(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::UADDO:
+ case ISD::USUBO:
+ ExpandUADDSUBO(Node, Results);
+ return;
+ case ISD::SADDO:
+ case ISD::SSUBO:
+ ExpandSADDSUBO(Node, Results);
+ return;
+ case ISD::UMULO:
+ case ISD::SMULO:
+ ExpandMULO(Node, Results);
+ return;
+ case ISD::USUBSAT:
+ case ISD::SSUBSAT:
+ case ISD::UADDSAT:
+ case ISD::SADDSAT:
+ if (SDValue Expanded = TLI.expandAddSubSat(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::USHLSAT:
+ case ISD::SSHLSAT:
+ if (SDValue Expanded = TLI.expandShlSat(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+    // Expand the FP_TO_*INT_SAT if it is scalable to prevent it from
+    // unrolling below.
+ if (Node->getValueType(0).isScalableVector()) {
+ if (SDValue Expanded = TLI.expandFP_TO_INT_SAT(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ }
+ break;
+ case ISD::SMULFIX:
+ case ISD::UMULFIX:
+ if (SDValue Expanded = TLI.expandFixedPointMul(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIXSAT:
+    // FIXME: We do not expand SMULFIXSAT/UMULFIXSAT here yet; it is not clear
+    // exactly why. Maybe it results in worse codegen than the unroll for some
+    // targets? This should be investigated, and if we still prefer to unroll,
+    // an explanation would be helpful.
+ break;
+ case ISD::SDIVFIX:
+ case ISD::UDIVFIX:
+ ExpandFixedPointDiv(Node, Results);
+ return;
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIXSAT:
+ break;
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case ISD::STRICT_##DAGN:
+#include "llvm/IR/ConstrainedOps.def"
+ ExpandStrictFPOp(Node, Results);
+ return;
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ Results.push_back(TLI.expandVecReduce(Node, DAG));
+ return;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ Results.push_back(TLI.expandVecReduceSeq(Node, DAG));
+ return;
+ case ISD::SREM:
+ case ISD::UREM:
+ ExpandREM(Node, Results);
+ return;
+ case ISD::VP_MERGE:
+ Results.push_back(ExpandVP_MERGE(Node));
+ return;
+ }
+
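+  // No dedicated expansion matched (or the expansion above declined), so
+  // fall back to unrolling the operation into scalar pieces.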
+ SDValue Unrolled = DAG.UnrollVectorOp(Node);
+ for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I)
+ Results.push_back(Unrolled.getValue(I));
+}
+
+SDValue VectorLegalizer::ExpandSELECT(SDNode *Node) {
+  // Lower a select instruction where the condition is a scalar and the
+  // operands are vectors. Lower this select to VSELECT and implement it
+  // using XOR, AND and OR. The selector bit is broadcast to every lane.
+ EVT VT = Node->getValueType(0);
+ SDLoc DL(Node);
+
+ SDValue Mask = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue Op2 = Node->getOperand(2);
+
+ assert(VT.isVector() && !Mask.getValueType().isVector()
+ && Op1.getValueType() == Op2.getValueType() && "Invalid type");
+
+ // If we can't even use the basic vector operations of
+ // AND,OR,XOR, we will have to scalarize the op.
+  // Notice that the operation may be 'promoted', which means that it is
+  // 'bitcasted' to another type that is then handled.
+ // Also, we need to be able to construct a splat vector using either
+ // BUILD_VECTOR or SPLAT_VECTOR.
+ // FIXME: Should we also permit fixed-length SPLAT_VECTOR as a fallback to
+ // BUILD_VECTOR?
+ if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(VT.isFixedLengthVector() ? ISD::BUILD_VECTOR
+ : ISD::SPLAT_VECTOR,
+ VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Node);
+
+ // Generate a mask operand.
+ EVT MaskTy = VT.changeVectorElementTypeToInteger();
+
+  // The scalar type of each element in the vector mask.
+ EVT BitTy = MaskTy.getScalarType();
+
+ Mask = DAG.getSelect(DL, BitTy, Mask, DAG.getAllOnesConstant(DL, BitTy),
+ DAG.getConstant(0, DL, BitTy));
+
+ // Broadcast the mask so that the entire vector is all one or all zero.
+ Mask = DAG.getSplat(MaskTy, DL, Mask);
+
+ // Bitcast the operands to be the same type as the mask.
+ // This is needed when we select between FP types because
+ // the mask is a vector of integers.
+ Op1 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op1);
+ Op2 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op2);
+
+ SDValue NotMask = DAG.getNOT(DL, Mask, MaskTy);
+
+ Op1 = DAG.getNode(ISD::AND, DL, MaskTy, Op1, Mask);
+ Op2 = DAG.getNode(ISD::AND, DL, MaskTy, Op2, NotMask);
+ SDValue Val = DAG.getNode(ISD::OR, DL, MaskTy, Op1, Op2);
+ return DAG.getNode(ISD::BITCAST, DL, Node->getValueType(0), Val);
+}
+
+SDValue VectorLegalizer::ExpandSEXTINREG(SDNode *Node) {
+ EVT VT = Node->getValueType(0);
+
+ // Make sure that the SRA and SHL instructions are available.
+ if (TLI.getOperationAction(ISD::SRA, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::SHL, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Node);
+
+ SDLoc DL(Node);
+ EVT OrigTy = cast<VTSDNode>(Node->getOperand(1))->getVT();
+
+ unsigned BW = VT.getScalarSizeInBits();
+ unsigned OrigBW = OrigTy.getScalarSizeInBits();
+ SDValue ShiftSz = DAG.getConstant(BW - OrigBW, DL, VT);
+
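+  // Shift the value left so the sign bit of the narrow type lands in the
+  // element's sign bit, then arithmetic-shift right to replicate it.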
+ SDValue Op = DAG.getNode(ISD::SHL, DL, VT, Node->getOperand(0), ShiftSz);
+ return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz);
+}
+
+// Generically expand a vector anyext in register to a shuffle of the relevant
+// lanes into the appropriate locations, with other lanes left undef.
+SDValue VectorLegalizer::ExpandANY_EXTEND_VECTOR_INREG(SDNode *Node) {
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ int NumElements = VT.getVectorNumElements();
+ SDValue Src = Node->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ int NumSrcElements = SrcVT.getVectorNumElements();
+
+ // *_EXTEND_VECTOR_INREG SrcVT can be smaller than VT - so insert the vector
+ // into a larger vector type.
+ if (SrcVT.bitsLE(VT)) {
+ assert((VT.getSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
+ "ANY_EXTEND_VECTOR_INREG vector size mismatch");
+ NumSrcElements = VT.getSizeInBits() / SrcVT.getScalarSizeInBits();
+ SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+ NumSrcElements);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcVT, DAG.getUNDEF(SrcVT),
+ Src, DAG.getVectorIdxConstant(0, DL));
+ }
+
+ // Build a base mask of undef shuffles.
+ SmallVector<int, 16> ShuffleMask;
+ ShuffleMask.resize(NumSrcElements, -1);
+
+ // Place the extended lanes into the correct locations.
+ int ExtLaneScale = NumSrcElements / NumElements;
+ int EndianOffset = DAG.getDataLayout().isBigEndian() ? ExtLaneScale - 1 : 0;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask[i * ExtLaneScale + EndianOffset] = i;
+
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getVectorShuffle(SrcVT, DL, Src, DAG.getUNDEF(SrcVT), ShuffleMask));
+}
+
+SDValue VectorLegalizer::ExpandSIGN_EXTEND_VECTOR_INREG(SDNode *Node) {
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue Src = Node->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // First build an any-extend node which can be legalized above when we
+ // recurse through it.
+ SDValue Op = DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Src);
+
+  // Now we need to sign extend. Do this by shifting the elements. Even if
+  // these aren't legal operations, they have a better chance of being
+  // legalized without full scalarization than the sign extension does.
+ unsigned EltWidth = VT.getScalarSizeInBits();
+ unsigned SrcEltWidth = SrcVT.getScalarSizeInBits();
+ SDValue ShiftAmount = DAG.getConstant(EltWidth - SrcEltWidth, DL, VT);
+ return DAG.getNode(ISD::SRA, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, Op, ShiftAmount),
+ ShiftAmount);
+}
+
+// Generically expand a vector zext in register to a shuffle of the relevant
+// lanes into the appropriate locations, a blend of zero into the high bits,
+// and a bitcast to the wider element type.
+SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDNode *Node) {
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ int NumElements = VT.getVectorNumElements();
+ SDValue Src = Node->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ int NumSrcElements = SrcVT.getVectorNumElements();
+
+ // *_EXTEND_VECTOR_INREG SrcVT can be smaller than VT - so insert the vector
+ // into a larger vector type.
+ if (SrcVT.bitsLE(VT)) {
+ assert((VT.getSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
+ "ZERO_EXTEND_VECTOR_INREG vector size mismatch");
+ NumSrcElements = VT.getSizeInBits() / SrcVT.getScalarSizeInBits();
+ SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
+ NumSrcElements);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcVT, DAG.getUNDEF(SrcVT),
+ Src, DAG.getVectorIdxConstant(0, DL));
+ }
+
+ // Build up a zero vector to blend into this one.
+ SDValue Zero = DAG.getConstant(0, DL, SrcVT);
+
+ // Shuffle the incoming lanes into the correct position, and pull all other
+ // lanes from the zero vector.
+ auto ShuffleMask = llvm::to_vector<16>(llvm::seq<int>(0, NumSrcElements));
+
+ int ExtLaneScale = NumSrcElements / NumElements;
+ int EndianOffset = DAG.getDataLayout().isBigEndian() ? ExtLaneScale - 1 : 0;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask[i * ExtLaneScale + EndianOffset] = NumSrcElements + i;
+
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getVectorShuffle(SrcVT, DL, Zero, Src, ShuffleMask));
+}
+
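+// Build a byte-level shuffle mask that reverses the bytes within each vector
+// element, i.e. BSWAP expressed as a shuffle of an i8 vector.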
+static void createBSWAPShuffleMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+ for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
+ for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
+ ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
+}
+
+SDValue VectorLegalizer::ExpandBSWAP(SDNode *Node) {
+ EVT VT = Node->getValueType(0);
+
+ // Scalable vectors can't use shuffle expansion.
+ if (VT.isScalableVector())
+ return TLI.expandBSWAP(Node, DAG);
+
+ // Generate a byte wise shuffle mask for the BSWAP.
+ SmallVector<int, 16> ShuffleMask;
+ createBSWAPShuffleMask(VT, ShuffleMask);
+ EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, ShuffleMask.size());
+
+ // Only emit a shuffle if the mask is legal.
+ if (TLI.isShuffleMaskLegal(ShuffleMask, ByteVT)) {
+ SDLoc DL(Node);
+ SDValue Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Node->getOperand(0));
+    Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
+                              ShuffleMask);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ }
+
+ // If we have the appropriate vector bit operations, it is better to use them
+ // than unrolling and expanding each component.
+ if (TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT))
+ return TLI.expandBSWAP(Node, DAG);
+
+ // Otherwise unroll.
+ return DAG.UnrollVectorOp(Node);
+}
+
+void VectorLegalizer::ExpandBITREVERSE(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ EVT VT = Node->getValueType(0);
+
+ // We can't unroll or use shuffles for scalable vectors.
+ if (VT.isScalableVector()) {
+ Results.push_back(TLI.expandBITREVERSE(Node, DAG));
+ return;
+ }
+
+ // If we have the scalar operation, it's probably cheaper to unroll it.
+ if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType())) {
+ SDValue Tmp = DAG.UnrollVectorOp(Node);
+ Results.push_back(Tmp);
+ return;
+ }
+
+  // If the vector element width is a whole number of bytes, test if it's
+  // legal to BSWAP shuffle the bytes and then perform the BITREVERSE on the
+  // byte vector. This greatly reduces the number of bit shifts necessary.
+ unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+ if (ScalarSizeInBits > 8 && (ScalarSizeInBits % 8) == 0) {
+ SmallVector<int, 16> BSWAPMask;
+ createBSWAPShuffleMask(VT, BSWAPMask);
+
+ EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, BSWAPMask.size());
+ if (TLI.isShuffleMaskLegal(BSWAPMask, ByteVT) &&
+ (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, ByteVT) ||
+ (TLI.isOperationLegalOrCustom(ISD::SHL, ByteVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SRL, ByteVT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::AND, ByteVT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::OR, ByteVT)))) {
+ SDLoc DL(Node);
+ SDValue Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Node->getOperand(0));
+ Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
+ BSWAPMask);
+ Op = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Op);
+ Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ Results.push_back(Op);
+ return;
+ }
+ }
+
+ // If we have the appropriate vector bit operations, it is better to use them
+ // than unrolling and expanding each component.
+ if (TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT)) {
+ Results.push_back(TLI.expandBITREVERSE(Node, DAG));
+ return;
+ }
+
+ // Otherwise unroll.
+ SDValue Tmp = DAG.UnrollVectorOp(Node);
+ Results.push_back(Tmp);
+}
+
+SDValue VectorLegalizer::ExpandVSELECT(SDNode *Node) {
+ // Implement VSELECT in terms of XOR, AND, OR
+ // on platforms which do not support blend natively.
+ SDLoc DL(Node);
+
+ SDValue Mask = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue Op2 = Node->getOperand(2);
+
+ EVT VT = Mask.getValueType();
+
+ // If we can't even use the basic vector operations of
+ // AND,OR,XOR, we will have to scalarize the op.
+  // Notice that the operation may be 'promoted', which means that it is
+  // 'bitcasted' to another type that is then handled.
+ if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Node);
+
+ // This operation also isn't safe with AND, OR, XOR when the boolean type is
+ // 0/1 and the select operands aren't also booleans, as we need an all-ones
+ // vector constant to mask with.
+ // FIXME: Sign extend 1 to all ones if that's legal on the target.
+ auto BoolContents = TLI.getBooleanContents(Op1.getValueType());
+ if (BoolContents != TargetLowering::ZeroOrNegativeOneBooleanContent &&
+ !(BoolContents == TargetLowering::ZeroOrOneBooleanContent &&
+ Op1.getValueType().getVectorElementType() == MVT::i1))
+ return DAG.UnrollVectorOp(Node);
+
+ // If the mask and the type are different sizes, unroll the vector op. This
+ // can occur when getSetCCResultType returns something that is different in
+ // size from the operand types. For example, v4i8 = select v4i32, v4i8, v4i8.
+ if (VT.getSizeInBits() != Op1.getValueSizeInBits())
+ return DAG.UnrollVectorOp(Node);
+
+ // Bitcast the operands to be the same type as the mask.
+ // This is needed when we select between FP types because
+ // the mask is a vector of integers.
+ Op1 = DAG.getNode(ISD::BITCAST, DL, VT, Op1);
+ Op2 = DAG.getNode(ISD::BITCAST, DL, VT, Op2);
+
+ SDValue NotMask = DAG.getNOT(DL, Mask, VT);
+
+ Op1 = DAG.getNode(ISD::AND, DL, VT, Op1, Mask);
+ Op2 = DAG.getNode(ISD::AND, DL, VT, Op2, NotMask);
+ SDValue Val = DAG.getNode(ISD::OR, DL, VT, Op1, Op2);
+ return DAG.getNode(ISD::BITCAST, DL, Node->getValueType(0), Val);
+}
+
+SDValue VectorLegalizer::ExpandVP_SELECT(SDNode *Node) {
+ // Implement VP_SELECT in terms of VP_XOR, VP_AND and VP_OR on platforms which
+ // do not support it natively.
+ SDLoc DL(Node);
+
+ SDValue Mask = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue Op2 = Node->getOperand(2);
+ SDValue EVL = Node->getOperand(3);
+
+ EVT VT = Mask.getValueType();
+
+ // If we can't even use the basic vector operations of
+ // VP_AND,VP_OR,VP_XOR, we will have to scalarize the op.
+ if (TLI.getOperationAction(ISD::VP_AND, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::VP_XOR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::VP_OR, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Node);
+
+ // This operation also isn't safe when the operands aren't also booleans.
+ if (Op1.getValueType().getVectorElementType() != MVT::i1)
+ return DAG.UnrollVectorOp(Node);
+
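+  // NOT(Mask) is Mask XOR all-ones; the trailing Ones and EVL operands are
+  // the VP mask and explicit vector length.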
+ SDValue Ones = DAG.getAllOnesConstant(DL, VT);
+ SDValue NotMask = DAG.getNode(ISD::VP_XOR, DL, VT, Mask, Ones, Ones, EVL);
+
+ Op1 = DAG.getNode(ISD::VP_AND, DL, VT, Op1, Mask, Ones, EVL);
+ Op2 = DAG.getNode(ISD::VP_AND, DL, VT, Op2, NotMask, Ones, EVL);
+ return DAG.getNode(ISD::VP_OR, DL, VT, Op1, Op2, Ones, EVL);
+}
+
+SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) {
+ // Implement VP_MERGE in terms of VSELECT. Construct a mask where vector
+ // indices less than the EVL/pivot are true. Combine that with the original
+ // mask for a full-length mask. Use a full-length VSELECT to select between
+ // the true and false values.
+ SDLoc DL(Node);
+
+ SDValue Mask = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue Op2 = Node->getOperand(2);
+ SDValue EVL = Node->getOperand(3);
+
+ EVT MaskVT = Mask.getValueType();
+ bool IsFixedLen = MaskVT.isFixedLengthVector();
+
+ EVT EVLVecVT = EVT::getVectorVT(*DAG.getContext(), EVL.getValueType(),
+ MaskVT.getVectorElementCount());
+
+ // If we can't construct the EVL mask efficiently, it's better to unroll.
+ if ((IsFixedLen &&
+ !TLI.isOperationLegalOrCustom(ISD::BUILD_VECTOR, EVLVecVT)) ||
+ (!IsFixedLen &&
+ (!TLI.isOperationLegalOrCustom(ISD::STEP_VECTOR, EVLVecVT) ||
+ !TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR, EVLVecVT))))
+ return DAG.UnrollVectorOp(Node);
+
+ // If using a SETCC would result in a different type than the mask type,
+ // unroll.
+ if (TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ EVLVecVT) != MaskVT)
+ return DAG.UnrollVectorOp(Node);
+
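+  // Compare a step vector against the splatted EVL: lanes whose index is
+  // below EVL become true.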
+ SDValue StepVec = DAG.getStepVector(DL, EVLVecVT);
+ SDValue SplatEVL = DAG.getSplat(EVLVecVT, DL, EVL);
+ SDValue EVLMask =
+ DAG.getSetCC(DL, MaskVT, StepVec, SplatEVL, ISD::CondCode::SETULT);
+
+ SDValue FullMask = DAG.getNode(ISD::AND, DL, MaskVT, Mask, EVLMask);
+ return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2);
+}
+
+SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) {
+ // Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB.
+ EVT VT = Node->getValueType(0);
+
+  unsigned DivOpc =
+      Node->getOpcode() == ISD::VP_SREM ? ISD::VP_SDIV : ISD::VP_UDIV;
+
+ if (!TLI.isOperationLegalOrCustom(DivOpc, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::VP_MUL, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::VP_SUB, VT))
+ return SDValue();
+
+ SDLoc DL(Node);
+
+ SDValue Dividend = Node->getOperand(0);
+ SDValue Divisor = Node->getOperand(1);
+ SDValue Mask = Node->getOperand(2);
+ SDValue EVL = Node->getOperand(3);
+
+ // X % Y -> X-X/Y*Y
+ SDValue Div = DAG.getNode(DivOpc, DL, VT, Dividend, Divisor, Mask, EVL);
+ SDValue Mul = DAG.getNode(ISD::VP_MUL, DL, VT, Divisor, Div, Mask, EVL);
+ return DAG.getNode(ISD::VP_SUB, DL, VT, Dividend, Mul, Mask, EVL);
+}
+
+void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ // Attempt to expand using TargetLowering.
+ SDValue Result, Chain;
+ if (TLI.expandFP_TO_UINT(Node, Result, Chain, DAG)) {
+ Results.push_back(Result);
+ if (Node->isStrictFPOpcode())
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Otherwise go ahead and unroll.
+ if (Node->isStrictFPOpcode()) {
+ UnrollStrictFPOp(Node, Results);
+ return;
+ }
+
+ Results.push_back(DAG.UnrollVectorOp(Node));
+}
+
+void VectorLegalizer::ExpandUINT_TO_FLOAT(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ bool IsStrict = Node->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Node->getOperand(OpNo);
+ EVT VT = Src.getValueType();
+ SDLoc DL(Node);
+
+ // Attempt to expand using TargetLowering.
+ SDValue Result;
+ SDValue Chain;
+ if (TLI.expandUINT_TO_FP(Node, Result, Chain, DAG)) {
+ Results.push_back(Result);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Make sure that the SINT_TO_FP and SRL instructions are available.
+ if (((!IsStrict && TLI.getOperationAction(ISD::SINT_TO_FP, VT) ==
+ TargetLowering::Expand) ||
+ (IsStrict && TLI.getOperationAction(ISD::STRICT_SINT_TO_FP, VT) ==
+ TargetLowering::Expand)) ||
+ TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand) {
+ if (IsStrict) {
+ UnrollStrictFPOp(Node, Results);
+ return;
+ }
+
+ Results.push_back(DAG.UnrollVectorOp(Node));
+ return;
+ }
+
+ unsigned BW = VT.getScalarSizeInBits();
+ assert((BW == 64 || BW == 32) &&
+ "Elements in vector-UINT_TO_FP must be 32 or 64 bits wide");
+
+ SDValue HalfWord = DAG.getConstant(BW / 2, DL, VT);
+
+ // Constants to clear the upper part of the word.
+ // Notice that we can also use SHL+SHR, but using a constant is slightly
+ // faster on x86.
+ uint64_t HWMask = (BW == 64) ? 0x00000000FFFFFFFF : 0x0000FFFF;
+ SDValue HalfWordMask = DAG.getConstant(HWMask, DL, VT);
+
+ // Two to the power of half-word-size.
+ SDValue TWOHW =
+ DAG.getConstantFP(1ULL << (BW / 2), DL, Node->getValueType(0));
+
+  // Shift down the upper half to form HI and mask it away to form LO.
+ SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Src, HalfWord);
+ SDValue LO = DAG.getNode(ISD::AND, DL, VT, Src, HalfWordMask);
+
+ if (IsStrict) {
+ // Convert hi and lo to floats
+ // Convert the hi part back to the upper values
+ // TODO: Can any fast-math-flags be set on these nodes?
+ SDValue fHI = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL,
+ {Node->getValueType(0), MVT::Other},
+ {Node->getOperand(0), HI});
+ fHI = DAG.getNode(ISD::STRICT_FMUL, DL, {Node->getValueType(0), MVT::Other},
+ {fHI.getValue(1), fHI, TWOHW});
+ SDValue fLO = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL,
+ {Node->getValueType(0), MVT::Other},
+ {Node->getOperand(0), LO});
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, fHI.getValue(1),
+ fLO.getValue(1));
+
+ // Add the two halves
+ SDValue Result =
+ DAG.getNode(ISD::STRICT_FADD, DL, {Node->getValueType(0), MVT::Other},
+ {TF, fHI, fLO});
+
+ Results.push_back(Result);
+ Results.push_back(Result.getValue(1));
+ return;
+ }
+
+ // Convert hi and lo to floats
+ // Convert the hi part back to the upper values
+ // TODO: Can any fast-math-flags be set on these nodes?
+ SDValue fHI = DAG.getNode(ISD::SINT_TO_FP, DL, Node->getValueType(0), HI);
+ fHI = DAG.getNode(ISD::FMUL, DL, Node->getValueType(0), fHI, TWOHW);
+ SDValue fLO = DAG.getNode(ISD::SINT_TO_FP, DL, Node->getValueType(0), LO);
+
+ // Add the two halves
+ Results.push_back(
+ DAG.getNode(ISD::FADD, DL, Node->getValueType(0), fHI, fLO));
+}
+
+SDValue VectorLegalizer::ExpandFNEG(SDNode *Node) {
+ if (TLI.isOperationLegalOrCustom(ISD::FSUB, Node->getValueType(0))) {
+ SDLoc DL(Node);
+ SDValue Zero = DAG.getConstantFP(-0.0, DL, Node->getValueType(0));
+ // TODO: If FNEG had fast-math-flags, they'd get propagated to this FSUB.
+ return DAG.getNode(ISD::FSUB, DL, Node->getValueType(0), Zero,
+ Node->getOperand(0));
+ }
+ return DAG.UnrollVectorOp(Node);
+}
+
+void VectorLegalizer::ExpandFSUB(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+  // For floating-point values, (a-b) is the same as a+(-b). If FNEG and FADD
+  // are legal, we can defer this to operation legalization, where it will be
+  // lowered as a+(-b).
+ EVT VT = Node->getValueType(0);
+ if (TLI.isOperationLegalOrCustom(ISD::FNEG, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FADD, VT))
+ return; // Defer to LegalizeDAG
+
+ SDValue Tmp = DAG.UnrollVectorOp(Node);
+ Results.push_back(Tmp);
+}
+
+void VectorLegalizer::ExpandSETCC(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ bool NeedInvert = false;
+ bool IsVP = Node->getOpcode() == ISD::VP_SETCC;
+ bool IsStrict = Node->getOpcode() == ISD::STRICT_FSETCC ||
+ Node->getOpcode() == ISD::STRICT_FSETCCS;
+ bool IsSignaling = Node->getOpcode() == ISD::STRICT_FSETCCS;
+ unsigned Offset = IsStrict ? 1 : 0;
+
+ SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
+ SDValue LHS = Node->getOperand(0 + Offset);
+ SDValue RHS = Node->getOperand(1 + Offset);
+ SDValue CC = Node->getOperand(2 + Offset);
+
+ MVT OpVT = LHS.getSimpleValueType();
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+
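+  // Only condition codes marked Expand are legalized below; otherwise fall
+  // back to unrolling the compare.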
+ if (TLI.getCondCodeAction(CCCode, OpVT) != TargetLowering::Expand) {
+ if (IsStrict) {
+ UnrollStrictFPOp(Node, Results);
+ return;
+ }
+ Results.push_back(UnrollVSETCC(Node));
+ return;
+ }
+
+ SDValue Mask, EVL;
+ if (IsVP) {
+ Mask = Node->getOperand(3 + Offset);
+ EVL = Node->getOperand(4 + Offset);
+ }
+
+ SDLoc dl(Node);
+ bool Legalized =
+ TLI.LegalizeSetCCCondCode(DAG, Node->getValueType(0), LHS, RHS, CC, Mask,
+ EVL, NeedInvert, dl, Chain, IsSignaling);
+
+ if (Legalized) {
+ // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+ // condition code, create a new SETCC node.
+ if (CC.getNode()) {
+ if (IsStrict) {
+ LHS = DAG.getNode(Node->getOpcode(), dl, Node->getVTList(),
+ {Chain, LHS, RHS, CC}, Node->getFlags());
+ Chain = LHS.getValue(1);
+ } else if (IsVP) {
+ LHS = DAG.getNode(ISD::VP_SETCC, dl, Node->getValueType(0),
+ {LHS, RHS, CC, Mask, EVL}, Node->getFlags());
+ } else {
+ LHS = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), LHS, RHS, CC,
+ Node->getFlags());
+ }
+ }
+
+ // If we expanded the SETCC by inverting the condition code, then wrap
+ // the existing SETCC in a NOT to restore the intended condition.
+ if (NeedInvert) {
+ if (!IsVP)
+ LHS = DAG.getLogicalNOT(dl, LHS, LHS->getValueType(0));
+ else
+ LHS = DAG.getVPLogicalNOT(dl, LHS, Mask, EVL, LHS->getValueType(0));
+ }
+ } else {
+ assert(!IsStrict && "Don't know how to expand for strict nodes.");
+
+ // Otherwise, SETCC for the given comparison type must be completely
+ // illegal; expand it into a SELECT_CC.
+ EVT VT = Node->getValueType(0);
+ LHS =
+ DAG.getNode(ISD::SELECT_CC, dl, VT, LHS, RHS,
+ DAG.getBoolConstant(true, dl, VT, LHS.getValueType()),
+ DAG.getBoolConstant(false, dl, VT, LHS.getValueType()), CC);
+ LHS->setFlags(Node->getFlags());
+ }
+
+ Results.push_back(LHS);
+ if (IsStrict)
+ Results.push_back(Chain);
+}
+
+void VectorLegalizer::ExpandUADDSUBO(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Result, Overflow;
+ TLI.expandUADDSUBO(Node, Result, Overflow, DAG);
+ Results.push_back(Result);
+ Results.push_back(Overflow);
+}
+
+void VectorLegalizer::ExpandSADDSUBO(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Result, Overflow;
+ TLI.expandSADDSUBO(Node, Result, Overflow, DAG);
+ Results.push_back(Result);
+ Results.push_back(Overflow);
+}
+
+void VectorLegalizer::ExpandMULO(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Result, Overflow;
+ if (!TLI.expandMULO(Node, Result, Overflow, DAG))
+ std::tie(Result, Overflow) = DAG.UnrollVectorOverflowOp(Node);
+
+ Results.push_back(Result);
+ Results.push_back(Overflow);
+}
+
+void VectorLegalizer::ExpandFixedPointDiv(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ SDNode *N = Node;
+ if (SDValue Expanded = TLI.expandFixedPointDiv(N->getOpcode(), SDLoc(N),
+ N->getOperand(0), N->getOperand(1), N->getConstantOperandVal(2), DAG))
+ Results.push_back(Expanded);
+}
+
+void VectorLegalizer::ExpandStrictFPOp(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ if (Node->getOpcode() == ISD::STRICT_UINT_TO_FP) {
+ ExpandUINT_TO_FLOAT(Node, Results);
+ return;
+ }
+ if (Node->getOpcode() == ISD::STRICT_FP_TO_UINT) {
+ ExpandFP_TO_UINT(Node, Results);
+ return;
+ }
+
+ if (Node->getOpcode() == ISD::STRICT_FSETCC ||
+ Node->getOpcode() == ISD::STRICT_FSETCCS) {
+ ExpandSETCC(Node, Results);
+ return;
+ }
+
+ UnrollStrictFPOp(Node, Results);
+}
+
+void VectorLegalizer::ExpandREM(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ assert((Node->getOpcode() == ISD::SREM || Node->getOpcode() == ISD::UREM) &&
+ "Expected REM node");
+
+ SDValue Result;
+ if (!TLI.expandREM(Node, Result, DAG))
+ Result = DAG.UnrollVectorOp(Node);
+ Results.push_back(Result);
+}
+
+void VectorLegalizer::UnrollStrictFPOp(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ EVT VT = Node->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned NumOpers = Node->getNumOperands();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ EVT TmpEltVT = EltVT;
+ if (Node->getOpcode() == ISD::STRICT_FSETCC ||
+ Node->getOpcode() == ISD::STRICT_FSETCCS)
+ TmpEltVT = TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TmpEltVT);
+
+ EVT ValueVTs[] = {TmpEltVT, MVT::Other};
+ SDValue Chain = Node->getOperand(0);
+ SDLoc dl(Node);
+
+ SmallVector<SDValue, 32> OpValues;
+ SmallVector<SDValue, 32> OpChains;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SmallVector<SDValue, 4> Opers;
+ SDValue Idx = DAG.getVectorIdxConstant(i, dl);
+
+ // The Chain is the first operand.
+ Opers.push_back(Chain);
+
+ // Now process the remaining operands.
+ for (unsigned j = 1; j < NumOpers; ++j) {
+ SDValue Oper = Node->getOperand(j);
+ EVT OperVT = Oper.getValueType();
+
+ if (OperVT.isVector())
+ Oper = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ OperVT.getVectorElementType(), Oper, Idx);
+
+ Opers.push_back(Oper);
+ }
+
+ SDValue ScalarOp = DAG.getNode(Node->getOpcode(), dl, ValueVTs, Opers);
+ SDValue ScalarResult = ScalarOp.getValue(0);
+ SDValue ScalarChain = ScalarOp.getValue(1);
+
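+    // Strict FSETCC yields a boolean; convert it to the element type's
+    // 0 / all-ones representation before rebuilding the vector.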
+ if (Node->getOpcode() == ISD::STRICT_FSETCC ||
+ Node->getOpcode() == ISD::STRICT_FSETCCS)
+ ScalarResult = DAG.getSelect(dl, EltVT, ScalarResult,
+ DAG.getAllOnesConstant(dl, EltVT),
+ DAG.getConstant(0, dl, EltVT));
+
+ OpValues.push_back(ScalarResult);
+ OpChains.push_back(ScalarChain);
+ }
+
+ SDValue Result = DAG.getBuildVector(VT, dl, OpValues);
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OpChains);
+
+ Results.push_back(Result);
+ Results.push_back(NewChain);
+}
+
+SDValue VectorLegalizer::UnrollVSETCC(SDNode *Node) {
+ EVT VT = Node->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDValue CC = Node->getOperand(2);
+ EVT TmpEltVT = LHS.getValueType().getVectorElementType();
+ SDLoc dl(Node);
+ SmallVector<SDValue, 8> Ops(NumElems);
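+  // Compare each pair of elements with a scalar SETCC, then materialize the
+  // boolean as 0 / all-ones in the result element type.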
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getVectorIdxConstant(i, dl));
+ SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getVectorIdxConstant(i, dl));
+ Ops[i] = DAG.getNode(ISD::SETCC, dl,
+ TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TmpEltVT),
+ LHSElem, RHSElem, CC);
+ Ops[i] = DAG.getSelect(dl, EltVT, Ops[i], DAG.getAllOnesConstant(dl, EltVT),
+ DAG.getConstant(0, dl, EltVT));
+ }
+ return DAG.getBuildVector(VT, dl, Ops);
+}
+
+bool SelectionDAG::LegalizeVectors() {
+ return VectorLegalizer(*this).Run();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
new file mode 100644
index 000000000000..8c117c1c74dc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -0,0 +1,7262 @@
+//===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file performs vector type splitting and scalarization for LegalizeTypes.
+// Scalarization is the act of changing a computation in an illegal one-element
+// vector type to be a computation in its scalar element type. For example,
+// implementing <1 x f32> arithmetic in a scalar f32 register. This is needed
+// as a base case when scalarizing vector arithmetic like <4 x f32>, which
+// eventually decomposes to scalars if the target doesn't support v4f32 or v2f32
+// types.
+// Splitting is the act of changing a computation in an invalid vector type to
+// be a computation in two vectors of half the size. For example, implementing
+// <128 x f32> operations in terms of two <64 x f32> operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TypeSize.h"
+#include "llvm/Support/raw_ostream.h"
+#include <numeric>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+//===----------------------------------------------------------------------===//
+// Result Vector Scalarization: <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Scalarize node result " << ResNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue R = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ScalarizeVectorResult #" << ResNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to scalarize the result of this "
+ "operator!\n");
+
+ case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break;
+ case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break;
+ case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break;
+ case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
+ case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break;
+ case ISD::FPOWI: R = ScalarizeVecRes_ExpOp(N); break;
+ case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
+ case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
+ case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
+ case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break;
+ case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break;
+ case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break;
+ case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
+ case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
+ case ISD::IS_FPCLASS: R = ScalarizeVecRes_IS_FPCLASS(N); break;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ R = ScalarizeVecRes_VecInregOp(N);
+ break;
+ case ISD::ABS:
+ case ISD::ANY_EXTEND:
+ case ISD::BITREVERSE:
+ case ISD::BSWAP:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::FABS:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG10:
+ case ISD::FLOG2:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
+ case ISD::FP_EXTEND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ case ISD::SIGN_EXTEND:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP:
+ case ISD::ZERO_EXTEND:
+ case ISD::FCANONICALIZE:
+ R = ScalarizeVecRes_UnaryOp(N);
+ break;
+ case ISD::FFREXP:
+ R = ScalarizeVecRes_FFREXP(N, ResNo);
+ break;
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::FADD:
+ case ISD::FCOPYSIGN:
+ case ISD::FDIV:
+ case ISD::FMUL:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ case ISD::FLDEXP:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
+
+ case ISD::FPOW:
+ case ISD::FREM:
+ case ISD::FSUB:
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::OR:
+ case ISD::SDIV:
+ case ISD::SREM:
+ case ISD::SUB:
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ R = ScalarizeVecRes_BinOp(N);
+ break;
+ case ISD::FMA:
+ case ISD::FSHL:
+ case ISD::FSHR:
+ R = ScalarizeVecRes_TernaryOp(N);
+ break;
+
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case ISD::STRICT_##DAGN:
+#include "llvm/IR/ConstrainedOps.def"
+ R = ScalarizeVecRes_StrictFPOp(N);
+ break;
+
+ case ISD::FP_TO_UINT_SAT:
+ case ISD::FP_TO_SINT_SAT:
+ R = ScalarizeVecRes_FP_TO_XINT_SAT(N);
+ break;
+
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ case ISD::UMULO:
+ case ISD::SMULO:
+ R = ScalarizeVecRes_OverflowOp(N, ResNo);
+ break;
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
+ case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
+ R = ScalarizeVecRes_FIX(N);
+ break;
+ }
+
+ // If R is null, the sub-method took care of registering the result.
+ if (R.getNode())
+ SetScalarizedVector(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ SDValue RHS = GetScalarizedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ LHS.getValueType(), LHS, RHS, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
+ SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+ SDValue Op1 = GetScalarizedVector(N->getOperand(1));
+ SDValue Op2 = GetScalarizedVector(N->getOperand(2));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
+ Op2, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FIX(SDNode *N) {
+ SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+ SDValue Op1 = GetScalarizedVector(N->getOperand(1));
+ SDValue Op2 = N->getOperand(2);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
+ Op2, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FFREXP(SDNode *N, unsigned ResNo) {
+ assert(N->getValueType(0).getVectorNumElements() == 1 &&
+ "Unexpected vector type!");
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+
+ EVT VT0 = N->getValueType(0);
+ EVT VT1 = N->getValueType(1);
+ SDLoc dl(N);
+
+ SDNode *ScalarNode =
+ DAG.getNode(N->getOpcode(), dl,
+ {VT0.getScalarType(), VT1.getScalarType()}, Elt)
+ .getNode();
+
+ // Replace the other vector result not being explicitly scalarized here.
+ unsigned OtherNo = 1 - ResNo;
+ EVT OtherVT = N->getValueType(OtherNo);
+ if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) {
+ SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo));
+ } else {
+ SDValue OtherVal = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, OtherVT,
+ SDValue(ScalarNode, OtherNo));
+ ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+ }
+
+ return SDValue(ScalarNode, ResNo);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) {
+ EVT VT = N->getValueType(0).getVectorElementType();
+ unsigned NumOpers = N->getNumOperands();
+ SDValue Chain = N->getOperand(0);
+ EVT ValueVTs[] = {VT, MVT::Other};
+ SDLoc dl(N);
+
+ SmallVector<SDValue, 4> Opers(NumOpers);
+
+ // The Chain is the first operand.
+ Opers[0] = Chain;
+
+ // Now process the remaining operands.
+ for (unsigned i = 1; i < NumOpers; ++i) {
+ SDValue Oper = N->getOperand(i);
+ EVT OperVT = Oper.getValueType();
+
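+    // Use the scalarized form of a vector operand if one exists; otherwise
+    // extract element 0.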
+ if (OperVT.isVector()) {
+ if (getTypeAction(OperVT) == TargetLowering::TypeScalarizeVector)
+ Oper = GetScalarizedVector(Oper);
+ else
+ Oper = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ OperVT.getVectorElementType(), Oper,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+
+ Opers[i] = Oper;
+ }
+
+ SDValue Result = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(ValueVTs),
+ Opers, N->getFlags());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+ return Result;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
+ unsigned ResNo) {
+ SDLoc DL(N);
+ EVT ResVT = N->getValueType(0);
+ EVT OvVT = N->getValueType(1);
+
+ SDValue ScalarLHS, ScalarRHS;
+ if (getTypeAction(ResVT) == TargetLowering::TypeScalarizeVector) {
+ ScalarLHS = GetScalarizedVector(N->getOperand(0));
+ ScalarRHS = GetScalarizedVector(N->getOperand(1));
+ } else {
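+    // The result type was not scalarized, so extract the first element of
+    // each operand directly.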
+ SmallVector<SDValue, 1> ElemsLHS, ElemsRHS;
+ DAG.ExtractVectorElements(N->getOperand(0), ElemsLHS);
+ DAG.ExtractVectorElements(N->getOperand(1), ElemsRHS);
+ ScalarLHS = ElemsLHS[0];
+ ScalarRHS = ElemsRHS[0];
+ }
+
+ SDVTList ScalarVTs = DAG.getVTList(
+ ResVT.getVectorElementType(), OvVT.getVectorElementType());
+ SDNode *ScalarNode = DAG.getNode(
+ N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
+ ScalarNode->setFlags(N->getFlags());
+
+ // Replace the other vector result not being explicitly scalarized here.
+ unsigned OtherNo = 1 - ResNo;
+ EVT OtherVT = N->getValueType(OtherNo);
+ if (getTypeAction(OtherVT) == TargetLowering::TypeScalarizeVector) {
+ SetScalarizedVector(SDValue(N, OtherNo), SDValue(ScalarNode, OtherNo));
+ } else {
+ SDValue OtherVal = DAG.getNode(
+ ISD::SCALAR_TO_VECTOR, DL, OtherVT, SDValue(ScalarNode, OtherNo));
+ ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+ }
+
+ return SDValue(ScalarNode, ResNo);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetScalarizedVector(Op);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
+ SDValue Op = N->getOperand(0);
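+  // If the source is itself an illegal single-element vector, scalarize it
+  // first so the bitcast is done from its scalar element.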
+ if (Op.getValueType().isVector()
+ && Op.getValueType().getVectorNumElements() == 1
+ && !isSimpleLegalType(Op.getValueType()))
+ Op = GetScalarizedVector(Op);
+ EVT NewVT = N->getValueType(0).getVectorElementType();
+ return DAG.getNode(ISD::BITCAST, SDLoc(N),
+ NewVT, Op);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ SDValue InOp = N->getOperand(0);
+ // The BUILD_VECTOR operands may be of wider element types and
+ // we may need to truncate them back to the requested return type.
+ if (EltVT.isInteger())
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
+ return InOp;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ N->getValueType(0).getVectorElementType(),
+ N->getOperand(0), N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(0);
+ EVT OpVT = Op.getValueType();
+ // The result needs scalarizing, but it's not a given that the source does.
+ // See similar logic in ScalarizeVecRes_UnaryOp.
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Op = GetScalarizedVector(Op);
+ } else {
+ EVT VT = OpVT.getVectorElementType();
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+ return DAG.getNode(ISD::FP_ROUND, DL,
+ N->getValueType(0).getVectorElementType(), Op,
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_ExpOp(SDNode *N) {
+ SDValue Op = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), Op.getValueType(), Op,
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+ // The value to insert may have a wider type than the vector element type,
+ // so be sure to truncate it to the element type if necessary.
+ SDValue Op = N->getOperand(1);
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ if (Op.getValueType() != EltVT)
+ // FIXME: Can this happen for floating point types?
+ Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Op);
+ return Op;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
+ assert(N->isUnindexed() && "Indexed vector load?");
+
+ SDValue Result = DAG.getLoad(
+ ISD::UNINDEXED, N->getExtensionType(),
+ N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(),
+ N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()),
+ N->getPointerInfo(), N->getMemoryVT().getVectorElementType(),
+ N->getOriginalAlign(), N->getMemOperand()->getFlags(), N->getAAInfo());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+ return Result;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
+ // Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
+ EVT DestVT = N->getValueType(0).getVectorElementType();
+ SDValue Op = N->getOperand(0);
+ EVT OpVT = Op.getValueType();
+ SDLoc DL(N);
+ // The result needs scalarizing, but it's not a given that the source does.
+ // This is a workaround for targets where it's impossible to scalarize the
+ // result of a conversion, because the source type is legal.
+  // For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32}
+  // are widened to v8i8, v4i16, and v2i32, which are legal, because v1i64 is
+  // legal and was not scalarized.
+  // See the similar logic in ScalarizeVecRes_SETCC.
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Op = GetScalarizedVector(Op);
+ } else {
+ EVT VT = OpVT.getVectorElementType();
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+ return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ EVT ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT().getVectorElementType();
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), EltVT,
+ LHS, DAG.getValueType(ExtVT));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(0);
+
+ EVT OpVT = Op.getValueType();
+ EVT OpEltVT = OpVT.getVectorElementType();
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Op = GetScalarizedVector(Op);
+ } else {
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+
+ switch (N->getOpcode()) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op);
+ }
+
+ llvm_unreachable("Illegal extend_vector_inreg opcode");
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+ // If the operand is wider than the vector element type then it is implicitly
+ // truncated. Make that explicit here.
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ SDValue InOp = N->getOperand(0);
+ if (InOp.getValueType() != EltVT)
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
+ return InOp;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
+ SDValue Cond = N->getOperand(0);
+ EVT OpVT = Cond.getValueType();
+ SDLoc DL(N);
+  // The vselect result and true/false operands need scalarizing, but it's
+  // not a given that the Cond does. For instance, in AVX512 v1i1 is legal.
+  // See the similar logic in ScalarizeVecRes_SETCC.
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Cond = GetScalarizedVector(Cond);
+ } else {
+ EVT VT = OpVT.getVectorElementType();
+ Cond = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+
+ SDValue LHS = GetScalarizedVector(N->getOperand(1));
+ TargetLowering::BooleanContent ScalarBool =
+ TLI.getBooleanContents(false, false);
+ TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false);
+
+ // If integer and float booleans have different contents then we can't
+ // reliably optimize in all cases. There is a full explanation for this in
+ // DAGCombiner::visitSELECT() where the same issue affects folding
+ // (select C, 0, 1) to (xor C, 1).
+ if (TLI.getBooleanContents(false, false) !=
+ TLI.getBooleanContents(false, true)) {
+ // At least try the common case where the boolean is generated by a
+ // comparison.
+ if (Cond->getOpcode() == ISD::SETCC) {
+ EVT OpVT = Cond->getOperand(0).getValueType();
+ ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());
+ VecBool = TLI.getBooleanContents(OpVT);
+ } else
+ ScalarBool = TargetLowering::UndefinedBooleanContent;
+ }
+
+ EVT CondVT = Cond.getValueType();
+ if (ScalarBool != VecBool) {
+ switch (ScalarBool) {
+ case TargetLowering::UndefinedBooleanContent:
+ break;
+ case TargetLowering::ZeroOrOneBooleanContent:
+ assert(VecBool == TargetLowering::UndefinedBooleanContent ||
+ VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent);
+      // The vector boolean is all ones for true, but the scalar expects a
+      // single 1, so mask with 1.
+ Cond = DAG.getNode(ISD::AND, SDLoc(N), CondVT,
+ Cond, DAG.getConstant(1, SDLoc(N), CondVT));
+ break;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ assert(VecBool == TargetLowering::UndefinedBooleanContent ||
+ VecBool == TargetLowering::ZeroOrOneBooleanContent);
+      // The vector boolean is a 0/1 value, but the scalar expects all ones
+      // for true, so sign extend from the low bit.
+ Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), CondVT,
+ Cond, DAG.getValueType(MVT::i1));
+ break;
+ }
+ }
+
+ // Truncate the condition if needed
+ auto BoolVT = getSetCCResultType(CondVT);
+ if (BoolVT.bitsLT(CondVT))
+ Cond = DAG.getNode(ISD::TRUNCATE, SDLoc(N), BoolVT, Cond);
+
+ return DAG.getSelect(SDLoc(N),
+ LHS.getValueType(), Cond, LHS,
+ GetScalarizedVector(N->getOperand(2)));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(1));
+ return DAG.getSelect(SDLoc(N),
+ LHS.getValueType(), N->getOperand(0), LHS,
+ GetScalarizedVector(N->getOperand(2)));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(2));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(),
+ N->getOperand(0), N->getOperand(1),
+ LHS, GetScalarizedVector(N->getOperand(3)),
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
+ // Figure out if the scalar is the LHS or RHS and return it.
+ SDValue Arg = N->getOperand(2).getOperand(0);
+ if (Arg.isUndef())
+ return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
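+  // A shuffle index of zero selects the first operand; any other index
+  // selects the second.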
+ unsigned Op = !cast<ConstantSDNode>(Arg)->isZero();
+ return GetScalarizedVector(N->getOperand(Op));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_TO_XINT_SAT(SDNode *N) {
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ SDLoc dl(N);
+
+ // Handle case where result is scalarized but operand is not
+ if (getTypeAction(SrcVT) == TargetLowering::TypeScalarizeVector)
+ Src = GetScalarizedVector(Src);
+ else
+ Src = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, SrcVT.getVectorElementType(), Src,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+ EVT DstVT = N->getValueType(0).getVectorElementType();
+ return DAG.getNode(N->getOpcode(), dl, DstVT, Src, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT OpVT = LHS.getValueType();
+ EVT NVT = N->getValueType(0).getVectorElementType();
+ SDLoc DL(N);
+
+ // The result needs scalarizing, but it's not a given that the source does.
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ LHS = GetScalarizedVector(LHS);
+ RHS = GetScalarizedVector(RHS);
+ } else {
+ EVT VT = OpVT.getVectorElementType();
+ LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS,
+ DAG.getVectorIdxConstant(0, DL));
+ RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+
+ // Turn it into a scalar SETCC.
+ SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
+ N->getOperand(2));
+  // Vectors may have different boolean contents than scalars. Promote the
+  // value appropriately.
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+ return DAG.getNode(ExtendCode, DL, NVT, Res);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_IS_FPCLASS(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Arg = N->getOperand(0);
+ SDValue Test = N->getOperand(1);
+ EVT ArgVT = Arg.getValueType();
+ EVT ResultVT = N->getValueType(0).getVectorElementType();
+
+ if (getTypeAction(ArgVT) == TargetLowering::TypeScalarizeVector) {
+ Arg = GetScalarizedVector(Arg);
+ } else {
+ EVT VT = ArgVT.getVectorElementType();
+ Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Arg,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+
+ SDValue Res =
+ DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, {Arg, Test}, N->getFlags());
+  // Vectors may have different boolean contents than scalars. Promote the
+  // value appropriately.
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(ArgVT));
+ return DAG.getNode(ExtendCode, DL, ResultVT, Res);
+}
+
+//===----------------------------------------------------------------------===//
+// Operand Vector Scalarization <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to scalarize this operator's "
+ "operand!\n");
+ case ISD::BITCAST:
+ Res = ScalarizeVecOp_BITCAST(N);
+ break;
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::TRUNCATE:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ Res = ScalarizeVecOp_UnaryOp(N);
+ break;
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ Res = ScalarizeVecOp_UnaryOp_StrictFP(N);
+ break;
+ case ISD::CONCAT_VECTORS:
+ Res = ScalarizeVecOp_CONCAT_VECTORS(N);
+ break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N);
+ break;
+ case ISD::VSELECT:
+ Res = ScalarizeVecOp_VSELECT(N);
+ break;
+ case ISD::SETCC:
+ Res = ScalarizeVecOp_VSETCC(N);
+ break;
+ case ISD::STORE:
+ Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo);
+ break;
+ case ISD::STRICT_FP_ROUND:
+ Res = ScalarizeVecOp_STRICT_FP_ROUND(N, OpNo);
+ break;
+ case ISD::FP_ROUND:
+ Res = ScalarizeVecOp_FP_ROUND(N, OpNo);
+ break;
+ case ISD::STRICT_FP_EXTEND:
+ Res = ScalarizeVecOp_STRICT_FP_EXTEND(N);
+ break;
+ case ISD::FP_EXTEND:
+ Res = ScalarizeVecOp_FP_EXTEND(N);
+ break;
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ Res = ScalarizeVecOp_VECREDUCE(N);
+ break;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ Res = ScalarizeVecOp_VECREDUCE_SEQ(N);
+ break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// If the value to convert is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(ISD::BITCAST, SDLoc(N),
+ N->getValueType(0), Elt);
+}
+
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
+/// Do the operation on the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
+ assert(N->getValueType(0).getVectorNumElements() == 1 &&
+ "Unexpected vector type!");
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+ SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N),
+ N->getValueType(0).getScalarType(), Elt);
+ // Revectorize the result so the types line up with what the uses of this
+ // expression expect.
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op);
+}
+
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
+/// Do the strict FP operation on the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N) {
+ assert(N->getValueType(0).getVectorNumElements() == 1 &&
+ "Unexpected vector type!");
+ SDValue Elt = GetScalarizedVector(N->getOperand(1));
+ SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N),
+ { N->getValueType(0).getScalarType(), MVT::Other },
+ { N->getOperand(0), Elt });
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ // Revectorize the result so the types line up with what the uses of this
+ // expression expect.
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
+
+ // Do our own replacement and return SDValue() to tell the caller that we
+ // handled all replacements since the caller can only handle a single result.
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+}
+
+/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
+ SmallVector<SDValue, 8> Ops(N->getNumOperands());
+ for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
+ Ops[i] = GetScalarizedVector(N->getOperand(i));
+ return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops);
+}
+
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
+/// so just return the element, ignoring the index.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDValue Res = GetScalarizedVector(N->getOperand(0));
+ if (Res.getValueType() != VT)
+ Res = VT.isFloatingPoint()
+ ? DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res)
+ : DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res);
+ return Res;
+}
+
+/// If the input condition is a vector that needs to be scalarized, it must be
+/// <1 x i1>, so just convert to a normal ISD::SELECT
+/// (still with vector output type since that was acceptable if we got here).
+SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) {
+ SDValue ScalarCond = GetScalarizedVector(N->getOperand(0));
+ EVT VT = N->getValueType(0);
+
+ return DAG.getNode(ISD::SELECT, SDLoc(N), VT, ScalarCond, N->getOperand(1),
+ N->getOperand(2));
+}
+
+/// If the operand is a vector that needs to be scalarized then the result
+/// must be v1i1, so just convert to a scalar SETCC and wrap it with a
+/// SCALAR_TO_VECTOR, since the result type is legal if we got here.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+ assert(N->getValueType(0) == MVT::v1i1 && "Expected v1i1 type");
+
+ EVT VT = N->getValueType(0);
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ SDValue RHS = GetScalarizedVector(N->getOperand(1));
+
+ EVT OpVT = N->getOperand(0).getValueType();
+ EVT NVT = VT.getVectorElementType();
+ SDLoc DL(N);
+ // Turn it into a scalar SETCC.
+ SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
+ N->getOperand(2));
+
+ // Vectors may have different boolean contents than scalars. Promote the
+ // value appropriately.
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+
+ Res = DAG.getNode(ExtendCode, DL, NVT, Res);
+
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Res);
+}
+
+/// If the value to store is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Just store the element.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed store of one-element vector?");
+ assert(OpNo == 1 && "Do not know how to scalarize this operand!");
+ SDLoc dl(N);
+
+ if (N->isTruncatingStore())
+ return DAG.getTruncStore(
+ N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
+ N->getBasePtr(), N->getPointerInfo(),
+ N->getMemoryVT().getVectorElementType(), N->getOriginalAlign(),
+ N->getMemOperand()->getFlags(), N->getAAInfo());
+
+ return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
+ N->getBasePtr(), N->getPointerInfo(),
+ N->getOriginalAlign(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
+}
+
+/// If the value to round is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Wrong operand for scalarization!");
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+ SDValue Res = DAG.getNode(ISD::FP_ROUND, SDLoc(N),
+ N->getValueType(0).getVectorElementType(), Elt,
+ N->getOperand(1));
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
+}
+
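+/// Strict variant of ScalarizeVecOp_FP_ROUND: round the single element of the
+/// <1 x ty> operand, thread the chain through the new node, and re-wrap the
+/// result with SCALAR_TO_VECTOR.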
+SDValue DAGTypeLegalizer::ScalarizeVecOp_STRICT_FP_ROUND(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 1 && "Wrong operand for scalarization!");
+ SDValue Elt = GetScalarizedVector(N->getOperand(1));
+ SDValue Res = DAG.getNode(ISD::STRICT_FP_ROUND, SDLoc(N),
+ { N->getValueType(0).getVectorElementType(),
+ MVT::Other },
+ { N->getOperand(0), Elt, N->getOperand(2) });
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
+
+ // Do our own replacement and return SDValue() to tell the caller that we
+ // handled all replacements since the caller can only handle a single result.
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+}
+
+/// If the value to extend is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_EXTEND(SDNode *N) {
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+ SDValue Res = DAG.getNode(ISD::FP_EXTEND, SDLoc(N),
+ N->getValueType(0).getVectorElementType(), Elt);
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
+}
+
+/// If the value to extend is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_STRICT_FP_EXTEND(SDNode *N) {
+ SDValue Elt = GetScalarizedVector(N->getOperand(1));
+ SDValue Res =
+ DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N),
+ {N->getValueType(0).getVectorElementType(), MVT::Other},
+ {N->getOperand(0), Elt});
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
+
+ // Do our own replacement and return SDValue() to tell the caller that we
+ // handled all replacements since the caller can only handle a single result.
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+}
+
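+/// The operand is a <1 x ty> vector, so the reduction result is simply that
+/// single element, extended if the result type is wider than the element type.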
+SDValue DAGTypeLegalizer::ScalarizeVecOp_VECREDUCE(SDNode *N) {
+ SDValue Res = GetScalarizedVector(N->getOperand(0));
+ // Result type may be wider than element type.
+ if (Res.getValueType() != N->getValueType(0))
+ Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Res);
+ return Res;
+}
+
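+/// A sequential reduction over a <1 x ty> vector degenerates to one scalar
+/// operation between the accumulator and the vector's only element.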
+SDValue DAGTypeLegalizer::ScalarizeVecOp_VECREDUCE_SEQ(SDNode *N) {
+ SDValue AccOp = N->getOperand(0);
+ SDValue VecOp = N->getOperand(1);
+
+ unsigned BaseOpc = ISD::getVecReduceBaseOpcode(N->getOpcode());
+
+ SDValue Op = GetScalarizedVector(VecOp);
+ return DAG.getNode(BaseOpc, SDLoc(N), N->getValueType(0),
+ AccOp, Op, N->getFlags());
+}
+
+//===----------------------------------------------------------------------===//
+// Result Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// This method is called when the specified result of the specified node is
+/// found to need vector splitting. At this point, the node may also have
+/// invalid operands or may have other results that need legalization; we just
+/// know that (at least) one result needs vector splitting.
+void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Split node result: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Lo, Hi;
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SplitVectorResult #" << ResNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to split the result of this "
+ "operator!\n");
+
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+ case ISD::AssertZext: SplitVecRes_AssertZext(N, Lo, Hi); break;
+ case ISD::VSELECT:
+ case ISD::SELECT:
+ case ISD::VP_MERGE:
+ case ISD::VP_SELECT: SplitRes_Select(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+ case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break;
+ case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
+ case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
+ case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
+ case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break;
+ case ISD::FPOWI:
+ case ISD::FLDEXP:
+ case ISD::FCOPYSIGN: SplitVecRes_FPOp_MultiType(N, Lo, Hi); break;
+ case ISD::IS_FPCLASS: SplitVecRes_IS_FPCLASS(N, Lo, Hi); break;
+ case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::SPLAT_VECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ SplitVecRes_ScalarOp(N, Lo, Hi);
+ break;
+ case ISD::STEP_VECTOR:
+ SplitVecRes_STEP_VECTOR(N, Lo, Hi);
+ break;
+ case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::LOAD:
+ SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
+ break;
+ case ISD::VP_LOAD:
+ SplitVecRes_VP_LOAD(cast<VPLoadSDNode>(N), Lo, Hi);
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
+ SplitVecRes_VP_STRIDED_LOAD(cast<VPStridedLoadSDNode>(N), Lo, Hi);
+ break;
+ case ISD::MLOAD:
+ SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
+ break;
+ case ISD::MGATHER:
+ case ISD::VP_GATHER:
+ SplitVecRes_Gather(cast<MemSDNode>(N), Lo, Hi, /*SplitSETCC*/ true);
+ break;
+ case ISD::SETCC:
+ case ISD::VP_SETCC:
+ SplitVecRes_SETCC(N, Lo, Hi);
+ break;
+ case ISD::VECTOR_REVERSE:
+ SplitVecRes_VECTOR_REVERSE(N, Lo, Hi);
+ break;
+ case ISD::VECTOR_SHUFFLE:
+ SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
+ break;
+ case ISD::VECTOR_SPLICE:
+ SplitVecRes_VECTOR_SPLICE(N, Lo, Hi);
+ break;
+ case ISD::VECTOR_DEINTERLEAVE:
+ SplitVecRes_VECTOR_DEINTERLEAVE(N);
+ return;
+ case ISD::VECTOR_INTERLEAVE:
+ SplitVecRes_VECTOR_INTERLEAVE(N);
+ return;
+ case ISD::VAARG:
+ SplitVecRes_VAARG(N, Lo, Hi);
+ break;
+
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
+ break;
+
+ case ISD::ABS:
+ case ISD::VP_ABS:
+ case ISD::BITREVERSE:
+ case ISD::VP_BITREVERSE:
+ case ISD::BSWAP:
+ case ISD::VP_BSWAP:
+ case ISD::CTLZ:
+ case ISD::VP_CTLZ:
+ case ISD::CTTZ:
+ case ISD::VP_CTTZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::VP_CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::VP_CTTZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ case ISD::VP_CTPOP:
+ case ISD::FABS: case ISD::VP_FABS:
+ case ISD::FCEIL:
+ case ISD::VP_FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::VP_FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG10:
+ case ISD::FLOG2:
+ case ISD::FNEARBYINT:
+ case ISD::VP_FNEARBYINT:
+ case ISD::FNEG: case ISD::VP_FNEG:
+ case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
+ case ISD::FP_EXTEND:
+ case ISD::VP_FP_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::VP_FP_ROUND:
+ case ISD::FP_TO_SINT:
+ case ISD::VP_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::VP_FP_TO_UINT:
+ case ISD::FRINT:
+ case ISD::VP_FRINT:
+ case ISD::FROUND:
+ case ISD::VP_FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::VP_FROUNDEVEN:
+ case ISD::FSIN:
+ case ISD::FSQRT: case ISD::VP_SQRT:
+ case ISD::FTRUNC:
+ case ISD::VP_FROUNDTOZERO:
+ case ISD::SINT_TO_FP:
+ case ISD::VP_SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::VP_TRUNCATE:
+ case ISD::UINT_TO_FP:
+ case ISD::VP_UINT_TO_FP:
+ case ISD::FCANONICALIZE:
+ SplitVecRes_UnaryOp(N, Lo, Hi);
+ break;
+ case ISD::FFREXP:
+ SplitVecRes_FFREXP(N, ResNo, Lo, Hi);
+ break;
+
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::VP_SIGN_EXTEND:
+ case ISD::VP_ZERO_EXTEND:
+ SplitVecRes_ExtendOp(N, Lo, Hi);
+ break;
+
+ case ISD::ADD: case ISD::VP_ADD:
+ case ISD::SUB: case ISD::VP_SUB:
+ case ISD::MUL: case ISD::VP_MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::FADD: case ISD::VP_FADD:
+ case ISD::FSUB: case ISD::VP_FSUB:
+ case ISD::FMUL: case ISD::VP_FMUL:
+ case ISD::FMINNUM: case ISD::VP_FMINNUM:
+ case ISD::FMAXNUM: case ISD::VP_FMAXNUM:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ case ISD::SDIV: case ISD::VP_SDIV:
+ case ISD::UDIV: case ISD::VP_UDIV:
+ case ISD::FDIV: case ISD::VP_FDIV:
+ case ISD::FPOW:
+ case ISD::AND: case ISD::VP_AND:
+ case ISD::OR: case ISD::VP_OR:
+ case ISD::XOR: case ISD::VP_XOR:
+ case ISD::SHL: case ISD::VP_SHL:
+ case ISD::SRA: case ISD::VP_ASHR:
+ case ISD::SRL: case ISD::VP_LSHR:
+ case ISD::UREM: case ISD::VP_UREM:
+ case ISD::SREM: case ISD::VP_SREM:
+ case ISD::FREM: case ISD::VP_FREM:
+ case ISD::SMIN: case ISD::VP_SMIN:
+ case ISD::SMAX: case ISD::VP_SMAX:
+ case ISD::UMIN: case ISD::VP_UMIN:
+ case ISD::UMAX: case ISD::VP_UMAX:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::VP_FCOPYSIGN:
+ SplitVecRes_BinOp(N, Lo, Hi);
+ break;
+ case ISD::FMA: case ISD::VP_FMA:
+ case ISD::FSHL:
+ case ISD::VP_FSHL:
+ case ISD::FSHR:
+ case ISD::VP_FSHR:
+ SplitVecRes_TernaryOp(N, Lo, Hi);
+ break;
+
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case ISD::STRICT_##DAGN:
+#include "llvm/IR/ConstrainedOps.def"
+ SplitVecRes_StrictFPOp(N, Lo, Hi);
+ break;
+
+ case ISD::FP_TO_UINT_SAT:
+ case ISD::FP_TO_SINT_SAT:
+ SplitVecRes_FP_TO_XINT_SAT(N, Lo, Hi);
+ break;
+
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ case ISD::UMULO:
+ case ISD::SMULO:
+ SplitVecRes_OverflowOp(N, ResNo, Lo, Hi);
+ break;
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
+ case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
+ SplitVecRes_FIX(N, Lo, Hi);
+ break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetSplitVector(SDValue(N, ResNo), Lo, Hi);
+}
+
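+/// Advance \p Ptr past the low half of a split memory access and update
+/// \p MPI to match. For scalable memory types the increment is scaled by
+/// vscale; for fixed types it is a plain byte offset.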
+void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT,
+ MachinePointerInfo &MPI, SDValue &Ptr,
+ uint64_t *ScaledOffset) {
+ SDLoc DL(N);
+ unsigned IncrementSize = MemVT.getSizeInBits().getKnownMinValue() / 8;
+
+ if (MemVT.isScalableVector()) {
+ SDNodeFlags Flags;
+ SDValue BytesIncrement = DAG.getVScale(
+ DL, Ptr.getValueType(),
+ APInt(Ptr.getValueSizeInBits().getFixedValue(), IncrementSize));
+ MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace());
+ Flags.setNoUnsignedWrap(true);
+ if (ScaledOffset)
+ *ScaledOffset += IncrementSize;
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement,
+ Flags);
+ } else {
+ MPI = N->getPointerInfo().getWithOffset(IncrementSize);
+ // Increment the pointer to the other half.
+ Ptr = DAG.getObjectPtrOffset(DL, Ptr, TypeSize::Fixed(IncrementSize));
+ }
+}
+
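+/// Split a mask operand into low and high halves, reusing an existing split
+/// if the mask's type is itself being split by the legalizer.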
+std::pair<SDValue, SDValue> DAGTypeLegalizer::SplitMask(SDValue Mask) {
+ return SplitMask(Mask, SDLoc(Mask));
+}
+
+std::pair<SDValue, SDValue> DAGTypeLegalizer::SplitMask(SDValue Mask,
+ const SDLoc &DL) {
+ SDValue MaskLo, MaskHi;
+ EVT MaskVT = Mask.getValueType();
+ if (getTypeAction(MaskVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ return std::make_pair(MaskLo, MaskHi);
+}
+
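+/// Split a binary operation. VP opcodes carry two extra operands (mask and
+/// EVL), which are split as well.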
+void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue LHSLo, LHSHi;
+ GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+ SDValue RHSLo, RHSHi;
+ GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
+ SDLoc dl(N);
+
+ const SDNodeFlags Flags = N->getFlags();
+ unsigned Opcode = N->getOpcode();
+ if (N->getNumOperands() == 2) {
+ Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
+ Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
+ return;
+ }
+
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDValue MaskLo, MaskHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(2));
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(3), N->getValueType(0), dl);
+
+ Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(),
+ {LHSLo, RHSLo, MaskLo, EVLLo}, Flags);
+ Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(),
+ {LHSHi, RHSHi, MaskHi, EVLHi}, Flags);
+}
+
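+/// Split a ternary operation such as FMA. VP opcodes additionally split the
+/// trailing mask and EVL operands.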
+void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Op0Lo, Op0Hi;
+ GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
+ SDValue Op1Lo, Op1Hi;
+ GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
+ SDValue Op2Lo, Op2Hi;
+ GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
+ SDLoc dl(N);
+
+ const SDNodeFlags Flags = N->getFlags();
+ unsigned Opcode = N->getOpcode();
+ if (N->getNumOperands() == 3) {
+ Lo = DAG.getNode(Opcode, dl, Op0Lo.getValueType(), Op0Lo, Op1Lo, Op2Lo, Flags);
+ Hi = DAG.getNode(Opcode, dl, Op0Hi.getValueType(), Op0Hi, Op1Hi, Op2Hi, Flags);
+ return;
+ }
+
+ assert(N->getNumOperands() == 5 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDValue MaskLo, MaskHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(3));
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(4), N->getValueType(0), dl);
+
+ Lo = DAG.getNode(Opcode, dl, Op0Lo.getValueType(),
+ {Op0Lo, Op1Lo, Op2Lo, MaskLo, EVLLo}, Flags);
+ Hi = DAG.getNode(Opcode, dl, Op0Hi.getValueType(),
+ {Op0Hi, Op1Hi, Op2Hi, MaskHi, EVLHi}, Flags);
+}
+
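+/// Split a fixed-point operation. The scale operand is a scalar and is passed
+/// unchanged to both halves.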
+void DAGTypeLegalizer::SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue LHSLo, LHSHi;
+ GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+ SDValue RHSLo, RHSHi;
+ GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
+ SDLoc dl(N);
+ SDValue Op2 = N->getOperand(2);
+
+ unsigned Opcode = N->getOpcode();
+ Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2,
+ N->getFlags());
+ Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2,
+ N->getFlags());
+}
+
+void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // We know the result is a vector. The input may be either a vector or a
+ // scalar value.
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ SDLoc dl(N);
+
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+
+ // Handle some special cases efficiently.
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypeLegal:
+ case TargetLowering::TypePromoteInteger:
+ case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
+ case TargetLowering::TypeSoftenFloat:
+ case TargetLowering::TypeScalarizeVector:
+ case TargetLowering::TypeWidenVector:
+ break;
+ case TargetLowering::TypeExpandInteger:
+ case TargetLowering::TypeExpandFloat:
+ // A scalar to vector conversion, where the scalar needs expansion.
+ // If the vector is being split in two then we can just convert the
+ // expanded pieces.
+ if (LoVT == HiVT) {
+ GetExpandedOp(InOp, Lo, Hi);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+ return;
+ }
+ break;
+ case TargetLowering::TypeSplitVector:
+ // If the input is a vector that needs to be split, convert each split
+ // piece of the input now.
+ GetSplitVector(InOp, Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+ return;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error("Scalarization of scalable vectors is not supported.");
+ }
+
+ // In the general case, convert the input to an integer and split it by hand.
+ EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+ EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(LoIntVT, HiIntVT);
+
+ SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ unsigned LoNumElts = LoVT.getVectorNumElements();
+ SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
+ Lo = DAG.getBuildVector(LoVT, dl, LoOps);
+
+ SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
+ Hi = DAG.getBuildVector(HiVT, dl, HiOps);
+}
+
+void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
+ SDLoc dl(N);
+ unsigned NumSubvectors = N->getNumOperands() / 2;
+ if (NumSubvectors == 1) {
+ Lo = N->getOperand(0);
+ Hi = N->getOperand(1);
+ return;
+ }
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
+ Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, LoOps);
+
+ SmallVector<SDValue, 8> HiOps(N->op_begin()+NumSubvectors, N->op_end());
+ Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, HiOps);
+}
+
+void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ SDLoc dl(N);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ Hi = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
+ DAG.getVectorIdxConstant(IdxVal + LoVT.getVectorMinNumElements(), dl));
+}
+
+void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ SDLoc dl(N);
+ GetSplitVector(Vec, Lo, Hi);
+
+ EVT VecVT = Vec.getValueType();
+ EVT LoVT = Lo.getValueType();
+ EVT SubVecVT = SubVec.getValueType();
+ unsigned VecElems = VecVT.getVectorMinNumElements();
+ unsigned SubElems = SubVecVT.getVectorMinNumElements();
+ unsigned LoElems = LoVT.getVectorMinNumElements();
+
+ // If we know the index is in the first half, and we know the subvector
+ // doesn't cross the boundary between the halves, we can avoid spilling the
+ // vector, and insert into the lower half of the split vector directly.
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (IdxVal + SubElems <= LoElems) {
+ Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
+ return;
+ }
+ // Similarly, insert into the high half if the subvector lies entirely within
+ // it. Note that we can't tell whether a fixed-length subvector is fully
+ // within the high half of a scalable vector.
+ if (VecVT.isScalableVector() == SubVecVT.isScalableVector() &&
+ IdxVal >= LoElems && IdxVal + SubElems <= VecElems) {
+ Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, Hi.getValueType(), Hi, SubVec,
+ DAG.getVectorIdxConstant(IdxVal - LoElems, dl));
+ return;
+ }
+
+ // Spill the vector to the stack.
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
+
+ // Store the new subvector into the specified index.
+ SDValue SubVecPtr =
+ TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVecVT, Idx);
+ Store = DAG.getStore(Store, dl, SubVec, SubVecPtr,
+ MachinePointerInfo::getUnknownStack(MF));
+
+ // Load the Lo part from the stack slot.
+ Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo,
+ SmallestAlign);
+
+ // Increment the pointer to the other part.
+ auto *Load = cast<LoadSDNode>(Lo);
+ MachinePointerInfo MPI = Load->getPointerInfo();
+ IncrementPointer(Load, LoVT, MPI, StackPtr);
+
+ // Load the Hi part from the stack slot.
+ Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MPI, SmallestAlign);
+}
+
+// Handle splitting an FP operation where the second operand does not match
+// the first operand's type. The second operand may be a scalar, or a vector
+// that has exactly as many elements as the first operand.
+void DAGTypeLegalizer::SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LHSLo, LHSHi;
+ GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+ SDLoc DL(N);
+
+ SDValue RHSLo, RHSHi;
+ SDValue RHS = N->getOperand(1);
+ EVT RHSVT = RHS.getValueType();
+ if (RHSVT.isVector()) {
+ if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(RHS, RHSLo, RHSHi);
+ else
+ std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS));
+
+ Lo = DAG.getNode(N->getOpcode(), DL, LHSLo.getValueType(), LHSLo, RHSLo);
+ Hi = DAG.getNode(N->getOpcode(), DL, LHSHi.getValueType(), LHSHi, RHSHi);
+ } else {
+ Lo = DAG.getNode(N->getOpcode(), DL, LHSLo.getValueType(), LHSLo, RHS);
+ Hi = DAG.getNode(N->getOpcode(), DL, LHSHi.getValueType(), LHSHi, RHS);
+ }
+}
+
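+/// Split IS_FPCLASS by splitting the floating-point operand and testing each
+/// half against the same class mask.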
+void DAGTypeLegalizer::SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc DL(N);
+ SDValue ArgLo, ArgHi;
+ SDValue Test = N->getOperand(1);
+ SDValue FpValue = N->getOperand(0);
+ if (getTypeAction(FpValue.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(FpValue, ArgLo, ArgHi);
+ else
+ std::tie(ArgLo, ArgHi) = DAG.SplitVector(FpValue, SDLoc(FpValue));
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ Lo = DAG.getNode(ISD::IS_FPCLASS, DL, LoVT, ArgLo, Test, N->getFlags());
+ Hi = DAG.getNode(ISD::IS_FPCLASS, DL, HiVT, ArgHi, Test, N->getFlags());
+}
+
+void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LHSLo, LHSHi;
+ GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+ SDLoc dl(N);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) =
+ DAG.GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT());
+
+ Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
+ DAG.getValueType(LoVT));
+ Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
+ DAG.getValueType(HiVT));
+}
+
+void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+
+ SDLoc dl(N);
+ SDValue InLo, InHi;
+
+ if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(N0, InLo, InHi);
+ else
+ std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0);
+
+ EVT InLoVT = InLo.getValueType();
+ unsigned InNumElements = InLoVT.getVectorNumElements();
+
+ EVT OutLoVT, OutHiVT;
+ std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ unsigned OutNumElements = OutLoVT.getVectorNumElements();
+ assert((2 * OutNumElements) <= InNumElements &&
+ "Illegal extend vector in reg split");
+
+ // *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the
+ // input vector (i.e. we only use InLo):
+ // OutLo will extend the first OutNumElements from InLo.
+ // OutHi will extend the next OutNumElements from InLo.
+
+ // Shuffle the elements from InLo for OutHi into the bottom elements to
+ // create a 'fake' InHi.
+ SmallVector<int, 8> SplitHi(InNumElements, -1);
+ for (unsigned i = 0; i != OutNumElements; ++i)
+ SplitHi[i] = i + OutNumElements;
+ InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);
+
+ Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
+ Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
+}
+
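+/// Split a constrained FP operation. Each half receives the original chain,
+/// and a TokenFactor joins the two new chains afterwards.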
+void DAGTypeLegalizer::SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ unsigned NumOps = N->getNumOperands();
+ SDValue Chain = N->getOperand(0);
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ SmallVector<SDValue, 4> OpsLo(NumOps);
+ SmallVector<SDValue, 4> OpsHi(NumOps);
+
+ // The Chain is the first operand.
+ OpsLo[0] = Chain;
+ OpsHi[0] = Chain;
+
+ // Now process the remaining operands.
+ for (unsigned i = 1; i < NumOps; ++i) {
+ SDValue Op = N->getOperand(i);
+ SDValue OpLo = Op;
+ SDValue OpHi = Op;
+
+ EVT InVT = Op.getValueType();
+ if (InVT.isVector()) {
+ // If the input also splits, handle it directly for a
+ // compile time speedup. Otherwise split it by hand.
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Op, OpLo, OpHi);
+ else
+ std::tie(OpLo, OpHi) = DAG.SplitVectorOperand(N, i);
+ }
+
+ OpsLo[i] = OpLo;
+ OpsHi[i] = OpHi;
+ }
+
+ EVT LoValueVTs[] = {LoVT, MVT::Other};
+ EVT HiValueVTs[] = {HiVT, MVT::Other};
+ Lo = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(LoValueVTs), OpsLo,
+ N->getFlags());
+ Hi = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(HiValueVTs), OpsHi,
+ N->getFlags());
+
+ // Build a factor node to remember that this Op is independent of the
+ // other one.
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Lo.getValue(1), Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
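+/// Unroll a strict FP vector operation into scalar operations, padding the
+/// result with UNDEF up to \p ResNE elements and rejoining the per-element
+/// chains with a TokenFactor.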
+SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) {
+ SDValue Chain = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ unsigned NE = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SDLoc dl(N);
+
+ SmallVector<SDValue, 8> Scalars;
+ SmallVector<SDValue, 4> Operands(N->getNumOperands());
+
+ // If ResNE is 0, fully unroll the vector op.
+ if (ResNE == 0)
+ ResNE = NE;
+ else if (NE > ResNE)
+ NE = ResNE;
+
+ // The results of each unrolled operation, including the chain.
+ EVT ChainVTs[] = {EltVT, MVT::Other};
+ SmallVector<SDValue, 8> Chains;
+
+ unsigned i;
+ for (i = 0; i != NE; ++i) {
+ Operands[0] = Chain;
+ for (unsigned j = 1, e = N->getNumOperands(); j != e; ++j) {
+ SDValue Operand = N->getOperand(j);
+ EVT OperandVT = Operand.getValueType();
+ if (OperandVT.isVector()) {
+ EVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT,
+ Operand, DAG.getVectorIdxConstant(i, dl));
+ } else {
+ Operands[j] = Operand;
+ }
+ }
+ SDValue Scalar = DAG.getNode(N->getOpcode(), dl, ChainVTs, Operands);
+ Scalar.getNode()->setFlags(N->getFlags());
+
+ // Add in the scalar as well as its chain value to the
+ // result vectors.
+ Scalars.push_back(Scalar);
+ Chains.push_back(Scalar.getValue(1));
+ }
+
+ for (; i < ResNE; ++i)
+ Scalars.push_back(DAG.getUNDEF(EltVT));
+
+ // Build a new factor node to connect the chain back together.
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ ReplaceValueWith(SDValue(N, 1), Chain);
+
+ // Create a new BUILD_VECTOR node
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, ResNE);
+ return DAG.getBuildVector(VecVT, dl, Scalars);
+}
+
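+/// Split an overflow-producing operation. The vector result not being split
+/// here is either split as well or reassembled with CONCAT_VECTORS.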
+void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ EVT ResVT = N->getValueType(0);
+ EVT OvVT = N->getValueType(1);
+ EVT LoResVT, HiResVT, LoOvVT, HiOvVT;
+ std::tie(LoResVT, HiResVT) = DAG.GetSplitDestVTs(ResVT);
+ std::tie(LoOvVT, HiOvVT) = DAG.GetSplitDestVTs(OvVT);
+
+ SDValue LoLHS, HiLHS, LoRHS, HiRHS;
+ if (getTypeAction(ResVT) == TargetLowering::TypeSplitVector) {
+ GetSplitVector(N->getOperand(0), LoLHS, HiLHS);
+ GetSplitVector(N->getOperand(1), LoRHS, HiRHS);
+ } else {
+ std::tie(LoLHS, HiLHS) = DAG.SplitVectorOperand(N, 0);
+ std::tie(LoRHS, HiRHS) = DAG.SplitVectorOperand(N, 1);
+ }
+
+ unsigned Opcode = N->getOpcode();
+ SDVTList LoVTs = DAG.getVTList(LoResVT, LoOvVT);
+ SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
+ SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
+ SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
+ LoNode->setFlags(N->getFlags());
+ HiNode->setFlags(N->getFlags());
+
+ Lo = SDValue(LoNode, ResNo);
+ Hi = SDValue(HiNode, ResNo);
+
+ // Replace the other vector result not being explicitly split here.
+ unsigned OtherNo = 1 - ResNo;
+ EVT OtherVT = N->getValueType(OtherNo);
+ if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) {
+ SetSplitVector(SDValue(N, OtherNo),
+ SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+ } else {
+ SDValue OtherVal = DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, OtherVT,
+ SDValue(LoNode, OtherNo), SDValue(HiNode, OtherNo));
+ ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+ }
+}
+
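+/// Split INSERT_VECTOR_ELT. A constant index known to land in one half
+/// inserts directly into that half; otherwise the vector is spilled to a
+/// stack slot, the element is stored into it, and the two halves are
+/// reloaded.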
+void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Elt = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ SDLoc dl(N);
+ GetSplitVector(Vec, Lo, Hi);
+
+ if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = CIdx->getZExtValue();
+ unsigned LoNumElts = Lo.getValueType().getVectorMinNumElements();
+ if (IdxVal < LoNumElts) {
+ Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ Lo.getValueType(), Lo, Elt, Idx);
+ return;
+ } else if (!Vec.getValueType().isScalableVector()) {
+ Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
+ DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl));
+ return;
+ }
+ }
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(0), true))
+ return;
+
+ // Make the vector elements byte-addressable if they aren't already.
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ if (VecVT.getScalarSizeInBits() < 8) {
+ EltVT = MVT::i8;
+ VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ VecVT.getVectorElementCount());
+ Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
+ // Extend the element type to match if needed.
+ if (EltVT.bitsGT(Elt.getValueType()))
+ Elt = DAG.getNode(ISD::ANY_EXTEND, dl, EltVT, Elt);
+ }
+
+ // Spill the vector to the stack.
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
+
+ // Store the new element. This may be larger than the vector element type,
+ // so use a truncating store.
+ SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+ Store = DAG.getTruncStore(
+ Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
+ commonAlignment(SmallestAlign,
+ EltVT.getFixedSizeInBits() / 8));
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+
+ // Load the Lo part from the stack slot.
+ Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
+
+ // Increment the pointer to the other part.
+ auto Load = cast<LoadSDNode>(Lo);
+ MachinePointerInfo MPI = Load->getPointerInfo();
+ IncrementPointer(Load, LoVT, MPI, StackPtr);
+
+ Hi = DAG.getLoad(HiVT, dl, Store, StackPtr, MPI, SmallestAlign);
+
+ // If we adjusted the original type, we need to truncate the results.
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ if (LoVT != Lo.getValueType())
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Lo);
+ if (HiVT != Hi.getValueType())
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
+}
+
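+/// Split STEP_VECTOR: the low half keeps the original step, and the high half
+/// is the same step vector offset by Step * (number of elements in the low
+/// half).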
+void DAGTypeLegalizer::SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ assert(N->getValueType(0).isScalableVector() &&
+ "Only scalable vectors are supported for STEP_VECTOR");
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ SDValue Step = N->getOperand(0);
+
+ Lo = DAG.getNode(ISD::STEP_VECTOR, dl, LoVT, Step);
+
+ // Hi = Lo + (EltCnt * Step)
+ EVT EltVT = Step.getValueType();
+ APInt StepVal = cast<ConstantSDNode>(Step)->getAPIntValue();
+ SDValue StartOfHi =
+ DAG.getVScale(dl, EltVT, StepVal * LoVT.getVectorMinNumElements());
+ StartOfHi = DAG.getSExtOrTrunc(StartOfHi, dl, HiVT.getVectorElementType());
+ StartOfHi = DAG.getNode(ISD::SPLAT_VECTOR, dl, HiVT, StartOfHi);
+
+ Hi = DAG.getNode(ISD::STEP_VECTOR, dl, HiVT, Step);
+ Hi = DAG.getNode(ISD::ADD, dl, HiVT, Hi, StartOfHi);
+}
+
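+/// Split SCALAR_TO_VECTOR or SPLAT_VECTOR. The low half holds the scalar;
+/// the high half is UNDEF for SCALAR_TO_VECTOR and a repeat of the splat for
+/// SPLAT_VECTOR.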
+void DAGTypeLegalizer::SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, N->getOperand(0));
+ if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ Hi = DAG.getUNDEF(HiVT);
+ } else {
+ assert(N->getOpcode() == ISD::SPLAT_VECTOR && "Unexpected opcode");
+ Hi = Lo;
+ }
+}
+
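+/// Split an ordinary (unindexed) load into two loads of the half-sized types,
+/// joined by a TokenFactor. Halves that are not byte-sized fall back to
+/// scalarizing the whole load.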
+void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
+ EVT LoVT, HiVT;
+ SDLoc dl(LD);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+ EVT MemoryVT = LD->getMemoryVT();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized()) {
+ SDValue Value, NewChain;
+ std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
+ std::tie(Lo, Hi) = DAG.SplitVector(Value, dl);
+ ReplaceValueWith(SDValue(LD, 1), NewChain);
+ return;
+ }
+
+ Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
+ LD->getPointerInfo(), LoMemVT, LD->getOriginalAlign(),
+ MMOFlags, AAInfo);
+
+ MachinePointerInfo MPI;
+ IncrementPointer(LD, LoMemVT, MPI, Ptr);
+
+ Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, MPI,
+ HiMemVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(LD->isUnindexed() && "Indexed VP load during type legalization!");
+ EVT LoVT, HiVT;
+ SDLoc dl(LD);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ SDValue Offset = LD->getOffset();
+ assert(Offset.isUndef() && "Unexpected indexed variable-length load offset");
+ Align Alignment = LD->getOriginalAlign();
+ SDValue Mask = LD->getMask();
+ SDValue EVL = LD->getVectorLength();
+ EVT MemoryVT = LD->getMemoryVT();
+
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty);
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ }
+
+ // Split EVL operand
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) = DAG.SplitEVL(EVL, LD->getValueType(0), dl);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ LD->getPointerInfo(), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, Alignment, LD->getAAInfo(), LD->getRanges());
+
+ Lo =
+ DAG.getLoadVP(LD->getAddressingMode(), ExtType, LoVT, dl, Ch, Ptr, Offset,
+ MaskLo, EVLLo, LoMemVT, MMO, LD->isExpandingLoad());
+
+ if (HiIsEmpty) {
+ // The hi vp_load has zero storage size. We therefore simply set it to
+ // the low vp_load and rely on subsequent removal from the chain.
+ Hi = Lo;
+ } else {
+ // Generate hi vp_load.
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
+ LD->isExpandingLoad());
+
+ MachinePointerInfo MPI;
+ if (LoMemVT.isScalableVector())
+ MPI = MachinePointerInfo(LD->getPointerInfo().getAddrSpace());
+ else
+ MPI = LD->getPointerInfo().getWithOffset(
+ LoMemVT.getStoreSize().getFixedValue());
+
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
+ LD->getAAInfo(), LD->getRanges());
+
+ Hi = DAG.getLoadVP(LD->getAddressingMode(), ExtType, HiVT, dl, Ch, Ptr,
+ Offset, MaskHi, EVLHi, HiMemVT, MMO,
+ LD->isExpandingLoad());
+ }
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *SLD,
+ SDValue &Lo, SDValue &Hi) {
+ assert(SLD->isUnindexed() &&
+ "Indexed VP strided load during type legalization!");
+ assert(SLD->getOffset().isUndef() &&
+ "Unexpected indexed variable-length load offset");
+
+ SDLoc DL(SLD);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(SLD->getValueType(0));
+
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(SLD->getMemoryVT(), LoVT, &HiIsEmpty);
+
+ SDValue Mask = SLD->getMask();
+ SDValue LoMask, HiMask;
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), LoMask, HiMask);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, LoMask, HiMask);
+ else
+ std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
+ }
+
+ SDValue LoEVL, HiEVL;
+ std::tie(LoEVL, HiEVL) =
+ DAG.SplitEVL(SLD->getVectorLength(), SLD->getValueType(0), DL);
+
+ // Generate the low vp_strided_load
+ Lo = DAG.getStridedLoadVP(
+ SLD->getAddressingMode(), SLD->getExtensionType(), LoVT, DL,
+ SLD->getChain(), SLD->getBasePtr(), SLD->getOffset(), SLD->getStride(),
+ LoMask, LoEVL, LoMemVT, SLD->getMemOperand(), SLD->isExpandingLoad());
+
+ if (HiIsEmpty) {
+ // The high vp_strided_load has zero storage size. We therefore simply set
+ // it to the low vp_strided_load and rely on subsequent removal from the
+ // chain.
+ Hi = Lo;
+ } else {
+ // Generate the high vp_strided_load.
+ // To calculate the high base address, add to the low base address the
+ // stride multiplied by the number of elements already loaded by the low
+ // part, that is: Ptr = Ptr + (LoEVL * Stride)
+ EVT PtrVT = SLD->getBasePtr().getValueType();
+ SDValue Increment =
+ DAG.getNode(ISD::MUL, DL, PtrVT, LoEVL,
+ DAG.getSExtOrTrunc(SLD->getStride(), DL, PtrVT));
+ SDValue Ptr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, SLD->getBasePtr(), Increment);
+
+ Align Alignment = SLD->getOriginalAlign();
+ if (LoMemVT.isScalableVector())
+ Alignment = commonAlignment(
+ Alignment, LoMemVT.getSizeInBits().getKnownMinValue() / 8);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(SLD->getPointerInfo().getAddrSpace()),
+ MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
+ SLD->getAAInfo(), SLD->getRanges());
+
+ Hi = DAG.getStridedLoadVP(SLD->getAddressingMode(), SLD->getExtensionType(),
+ HiVT, DL, SLD->getChain(), Ptr, SLD->getOffset(),
+ SLD->getStride(), HiMask, HiEVL, HiMemVT, MMO,
+ SLD->isExpandingLoad());
+ }
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ SDValue Ch = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(SLD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
+ SDValue &Lo, SDValue &Hi) {
+ assert(MLD->isUnindexed() && "Indexed masked load during type legalization!");
+ EVT LoVT, HiVT;
+ SDLoc dl(MLD);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+ SDValue Ch = MLD->getChain();
+ SDValue Ptr = MLD->getBasePtr();
+ SDValue Offset = MLD->getOffset();
+ assert(Offset.isUndef() && "Unexpected indexed masked load offset");
+ SDValue Mask = MLD->getMask();
+ SDValue PassThru = MLD->getPassThru();
+ Align Alignment = MLD->getOriginalAlign();
+ ISD::LoadExtType ExtType = MLD->getExtensionType();
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ }
+
+ EVT MemoryVT = MLD->getMemoryVT();
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty);
+
+ SDValue PassThruLo, PassThruHi;
+ if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(PassThru, PassThruLo, PassThruHi);
+ else
+ std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MLD->getPointerInfo(), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, Alignment, MLD->getAAInfo(),
+ MLD->getRanges());
+
+ Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT,
+ MMO, MLD->getAddressingMode(), ExtType,
+ MLD->isExpandingLoad());
+
+ if (HiIsEmpty) {
+ // The hi masked load has zero storage size. We therefore simply set it to
+ // the low masked load and rely on subsequent removal from the chain.
+ Hi = Lo;
+ } else {
+ // Generate hi masked load.
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
+ MLD->isExpandingLoad());
+
+ MachinePointerInfo MPI;
+ if (LoMemVT.isScalableVector())
+ MPI = MachinePointerInfo(MLD->getPointerInfo().getAddrSpace());
+ else
+ MPI = MLD->getPointerInfo().getWithOffset(
+ LoMemVT.getStoreSize().getFixedValue());
+
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
+ MLD->getAAInfo(), MLD->getRanges());
+
+ Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi,
+ HiMemVT, MMO, MLD->getAddressingMode(), ExtType,
+ MLD->isExpandingLoad());
+ }
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(MLD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
+ SDValue &Hi, bool SplitSETCC) {
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ struct Operands {
+ SDValue Mask;
+ SDValue Index;
+ SDValue Scale;
+ } Ops = [&]() -> Operands {
+ if (auto *MSC = dyn_cast<MaskedGatherSDNode>(N)) {
+ return {MSC->getMask(), MSC->getIndex(), MSC->getScale()};
+ }
+ auto *VPSC = cast<VPGatherSDNode>(N);
+ return {VPSC->getMask(), VPSC->getIndex(), VPSC->getScale()};
+ }();
+
+ EVT MemoryVT = N->getMemoryVT();
+ Align Alignment = N->getOriginalAlign();
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (SplitSETCC && Ops.Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Ops.Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ std::tie(MaskLo, MaskHi) = SplitMask(Ops.Mask, dl);
+ }
+
+ EVT LoMemVT, HiMemVT;
+ // Split MemoryVT
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue IndexHi, IndexLo;
+ if (getTypeAction(Ops.Index.getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(Ops.Index, IndexLo, IndexHi);
+ else
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, dl);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ N->getPointerInfo(), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+
+ if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) {
+ SDValue PassThru = MGT->getPassThru();
+ SDValue PassThruLo, PassThruHi;
+ if (getTypeAction(PassThru.getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(PassThru, PassThruLo, PassThruHi);
+ else
+ std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
+
+ ISD::LoadExtType ExtType = MGT->getExtensionType();
+ ISD::MemIndexType IndexTy = MGT->getIndexType();
+
+ SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Ops.Scale};
+ Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
+ OpsLo, MMO, IndexTy, ExtType);
+
+ SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Ops.Scale};
+ Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
+ OpsHi, MMO, IndexTy, ExtType);
+ } else {
+ auto *VPGT = cast<VPGatherSDNode>(N);
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(VPGT->getVectorLength(), MemoryVT, dl);
+
+ SDValue OpsLo[] = {Ch, Ptr, IndexLo, Ops.Scale, MaskLo, EVLLo};
+ Lo = DAG.getGatherVP(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
+ MMO, VPGT->getIndexType());
+
+ SDValue OpsHi[] = {Ch, Ptr, IndexHi, Ops.Scale, MaskHi, EVLHi};
+ Hi = DAG.getGatherVP(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
+ MMO, VPGT->getIndexType());
+ }
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+
+ EVT LoVT, HiVT;
+ SDLoc DL(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // If the input also splits, handle it directly. Otherwise split it by hand.
+ SDValue LL, LH, RL, RH;
+ if (getTypeAction(N->getOperand(0).getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(N->getOperand(0), LL, LH);
+ else
+ std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+
+ if (getTypeAction(N->getOperand(1).getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(N->getOperand(1), RL, RH);
+ else
+ std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
+
+ if (N->getOpcode() == ISD::SETCC) {
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+ } else {
+ assert(N->getOpcode() == ISD::VP_SETCC && "Expected VP_SETCC opcode");
+ SDValue MaskLo, MaskHi, EVLLo, EVLHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(3));
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(4), N->getValueType(0), DL);
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2), MaskLo,
+ EVLLo);
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2), MaskHi,
+ EVLHi);
+ }
+}
+
+void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Get the dest types - they may not match the input types, e.g. int_to_fp.
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // If the input also splits, handle it directly for a compile time speedup.
+ // Otherwise split it by hand.
+ EVT InVT = N->getOperand(0).getValueType();
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ else
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+
+ const SDNodeFlags Flags = N->getFlags();
+ unsigned Opcode = N->getOpcode();
+ if (N->getNumOperands() <= 2) {
+ if (Opcode == ISD::FP_ROUND) {
+ Lo = DAG.getNode(Opcode, dl, LoVT, Lo, N->getOperand(1), Flags);
+ Hi = DAG.getNode(Opcode, dl, HiVT, Hi, N->getOperand(1), Flags);
+ } else {
+ Lo = DAG.getNode(Opcode, dl, LoVT, Lo, Flags);
+ Hi = DAG.getNode(Opcode, dl, HiVT, Hi, Flags);
+ }
+ return;
+ }
+
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDValue MaskLo, MaskHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1));
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl);
+
+ Lo = DAG.getNode(Opcode, dl, LoVT, {Lo, MaskLo, EVLLo}, Flags);
+ Hi = DAG.getNode(Opcode, dl, HiVT, {Hi, MaskHi, EVLHi}, Flags);
+}
+
+void DAGTypeLegalizer::SplitVecRes_FFREXP(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ auto [LoVT, HiVT] = DAG.GetSplitDestVTs(N->getValueType(0));
+ auto [LoVT1, HiVT1] = DAG.GetSplitDestVTs(N->getValueType(1));
+
+ // If the input also splits, handle it directly for a compile time speedup.
+ // Otherwise split it by hand.
+ EVT InVT = N->getOperand(0).getValueType();
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ else
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+
+ Lo = DAG.getNode(N->getOpcode(), dl, {LoVT, LoVT1}, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, {HiVT, HiVT1}, Hi);
+ Lo->setFlags(N->getFlags());
+ Hi->setFlags(N->getFlags());
+
+ SDNode *HiNode = Hi.getNode();
+ SDNode *LoNode = Lo.getNode();
+
+ // Replace the other vector result not being explicitly split here.
+ unsigned OtherNo = 1 - ResNo;
+ EVT OtherVT = N->getValueType(OtherNo);
+ if (getTypeAction(OtherVT) == TargetLowering::TypeSplitVector) {
+ SetSplitVector(SDValue(N, OtherNo), SDValue(LoNode, OtherNo),
+ SDValue(HiNode, OtherNo));
+ } else {
+ SDValue OtherVal =
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, OtherVT, SDValue(LoNode, OtherNo),
+ SDValue(HiNode, OtherNo));
+ ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+ }
+}
+
+void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT SrcVT = N->getOperand(0).getValueType();
+ EVT DestVT = N->getValueType(0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT);
+
+ // We can do better than a generic split operation if the extend is doing
+ // more than just doubling the width of the elements and the following are
+ // true:
+ // - The number of vector elements is even,
+ // - the source type is legal,
+ // - the type of a split source is illegal,
+ // - the type of an extended (by doubling element size) source is legal, and
+ // - the type of that extended source when split is legal.
+ //
+ // This won't necessarily completely legalize the operation, but it will
+ // more effectively move in the right direction and prevent falling down
+ // to scalarization in many cases due to the input vector being split too
+ // far.
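+ //
+ // Illustrative example (hypothetical type legality): suppose v8i8 and v8i16
+ // are legal, v4i8 is not, and v4i16 is. Then for
+ //   %res = v8i64 sign_extend v8i8 %src
+ // we extend one step and split the intermediate vector:
+ //   %wide = v8i16 sign_extend v8i8 %src
+ //   %lo   = v4i16 extract_subvector %wide, 0
+ //   %hi   = v4i16 extract_subvector %wide, 4
+ //   Lo    = v4i64 sign_extend v4i16 %lo
+ //   Hi    = v4i64 sign_extend v4i16 %hi
+ // rather than splitting %src directly down to the illegal v4i8.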
+ if (SrcVT.getVectorElementCount().isKnownEven() &&
+ SrcVT.getScalarSizeInBits() * 2 < DestVT.getScalarSizeInBits()) {
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx);
+ EVT SplitSrcVT = SrcVT.getHalfNumVectorElementsVT(Ctx);
+
+ EVT SplitLoVT, SplitHiVT;
+ std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
+ if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
+ TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
+ LLVM_DEBUG(dbgs() << "Split vector extend via incremental extend:";
+ N->dump(&DAG); dbgs() << "\n");
+ if (!N->isVPOpcode()) {
+ // Extend the source vector by one step.
+ SDValue NewSrc =
+ DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
+ // Get the low and high halves of the new one-step-extended vector.
+ std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
+ // Extend those vector halves the rest of the way.
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+ return;
+ }
+
+ // Extend the source vector by one step.
+ SDValue NewSrc =
+ DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0),
+ N->getOperand(1), N->getOperand(2));
+ // Get the low and high halves of the new one-step-extended vector.
+ std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
+
+ SDValue MaskLo, MaskHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1));
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl);
+ // Extend those vector halves the rest of the way.
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, {Lo, MaskLo, EVLLo});
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, {Hi, MaskHi, EVLHi});
+ return;
+ }
+ }
+ // Fall back to the generic unary operator splitting otherwise.
+ SplitVecRes_UnaryOp(N, Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // The low and high parts of the original input give four input vectors.
+ SDValue Inputs[4];
+ SDLoc DL(N);
+ GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
+ GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
+ EVT NewVT = Inputs[0].getValueType();
+ unsigned NewElts = NewVT.getVectorNumElements();
+
+ auto &&IsConstant = [](const SDValue &N) {
+ APInt SplatValue;
+ return N.getResNo() == 0 &&
+ (ISD::isConstantSplatVector(N.getNode(), SplatValue) ||
+ ISD::isBuildVectorOfConstantSDNodes(N.getNode()));
+ };
+ auto &&BuildVector = [NewElts, &DAG = DAG, NewVT, &DL](SDValue &Input1,
+ SDValue &Input2,
+ ArrayRef<int> Mask) {
+ assert(Input1->getOpcode() == ISD::BUILD_VECTOR &&
+ Input2->getOpcode() == ISD::BUILD_VECTOR &&
+ "Expected build vector node.");
+ EVT EltVT = NewVT.getVectorElementType();
+ SmallVector<SDValue> Ops(NewElts, DAG.getUNDEF(EltVT));
+ for (unsigned I = 0; I < NewElts; ++I) {
+ if (Mask[I] == PoisonMaskElem)
+ continue;
+ unsigned Idx = Mask[I];
+ if (Idx >= NewElts)
+ Ops[I] = Input2.getOperand(Idx - NewElts);
+ else
+ Ops[I] = Input1.getOperand(Idx);
+ // Truncate any wider elements down to the vector's element type.
+ if (Ops[I].getValueType().bitsGT(EltVT))
+ Ops[I] = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Ops[I]);
+ }
+ return DAG.getBuildVector(NewVT, DL, Ops);
+ };
+
+ // If Lo or Hi uses elements from at most two of the four input vectors, then
+ // express it as a vector shuffle of those two inputs. Otherwise extract the
+ // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
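+ //
+ // For illustration (types and mask chosen arbitrarily): when splitting
+ //   %r = v8i32 vector_shuffle v8i32 %a, v8i32 %b, <0, 9, 1, 10, 4, 13, 5, 14>
+ // the four inputs are %a.lo, %a.hi, %b.lo and %b.hi (each v4i32). The Lo
+ // half of %r only needs %a.lo and %b.lo, so it becomes a two-input shuffle
+ // of those; a half that pulled elements from three or four of the inputs
+ // would instead be assembled element by element with a BUILD_VECTOR.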
+ SmallVector<int> OrigMask(N->getMask());
+ // Try to pack incoming shuffles/inputs.
+ auto &&TryPeekThroughShufflesInputs = [&Inputs, &NewVT, this, NewElts,
+ &DL](SmallVectorImpl<int> &Mask) {
+ // Check if all inputs are shuffles of the same operands or non-shuffles.
+ MapVector<std::pair<SDValue, SDValue>, SmallVector<unsigned>> ShufflesIdxs;
+ for (unsigned Idx = 0; Idx < std::size(Inputs); ++Idx) {
+ SDValue Input = Inputs[Idx];
+ auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Input.getNode());
+ if (!Shuffle ||
+ Input.getOperand(0).getValueType() != Input.getValueType())
+ continue;
+ ShufflesIdxs[std::make_pair(Input.getOperand(0), Input.getOperand(1))]
+ .push_back(Idx);
+ ShufflesIdxs[std::make_pair(Input.getOperand(1), Input.getOperand(0))]
+ .push_back(Idx);
+ }
+ for (auto &P : ShufflesIdxs) {
+ if (P.second.size() < 2)
+ continue;
+ // Use the shuffle operands instead of the shuffles themselves.
+ // 1. Adjust mask.
+ for (int &Idx : Mask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (Inputs[SrcRegIdx].isUndef()) {
+ Idx = PoisonMaskElem;
+ continue;
+ }
+ auto *Shuffle =
+ dyn_cast<ShuffleVectorSDNode>(Inputs[SrcRegIdx].getNode());
+ if (!Shuffle || !is_contained(P.second, SrcRegIdx))
+ continue;
+ int MaskElt = Shuffle->getMaskElt(Idx % NewElts);
+ if (MaskElt == PoisonMaskElem) {
+ Idx = PoisonMaskElem;
+ continue;
+ }
+ Idx = MaskElt % NewElts +
+ P.second[Shuffle->getOperand(MaskElt / NewElts) == P.first.first
+ ? 0
+ : 1] *
+ NewElts;
+ }
+ // 2. Update inputs.
+ Inputs[P.second[0]] = P.first.first;
+ Inputs[P.second[1]] = P.first.second;
+ // Clear the pair data.
+ P.second.clear();
+ ShufflesIdxs[std::make_pair(P.first.second, P.first.first)].clear();
+ }
+ // Check if any concat_vectors can be simplified.
+ SmallBitVector UsedSubVector(2 * std::size(Inputs));
+ for (int &Idx : Mask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (Inputs[SrcRegIdx].isUndef()) {
+ Idx = PoisonMaskElem;
+ continue;
+ }
+ TargetLowering::LegalizeTypeAction TypeAction =
+ getTypeAction(Inputs[SrcRegIdx].getValueType());
+ if (Inputs[SrcRegIdx].getOpcode() == ISD::CONCAT_VECTORS &&
+ Inputs[SrcRegIdx].getNumOperands() == 2 &&
+ !Inputs[SrcRegIdx].getOperand(1).isUndef() &&
+ (TypeAction == TargetLowering::TypeLegal ||
+ TypeAction == TargetLowering::TypeWidenVector))
+ UsedSubVector.set(2 * SrcRegIdx + (Idx % NewElts) / (NewElts / 2));
+ }
+ if (UsedSubVector.count() > 1) {
+ SmallVector<SmallVector<std::pair<unsigned, int>, 2>> Pairs;
+ for (unsigned I = 0; I < std::size(Inputs); ++I) {
+ if (UsedSubVector.test(2 * I) == UsedSubVector.test(2 * I + 1))
+ continue;
+ if (Pairs.empty() || Pairs.back().size() == 2)
+ Pairs.emplace_back();
+ if (UsedSubVector.test(2 * I)) {
+ Pairs.back().emplace_back(I, 0);
+ } else {
+ assert(UsedSubVector.test(2 * I + 1) &&
+ "Expected to be used one of the subvectors.");
+ Pairs.back().emplace_back(I, 1);
+ }
+ }
+ if (!Pairs.empty() && Pairs.front().size() > 1) {
+ // Adjust mask.
+ for (int &Idx : Mask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ auto *It = find_if(
+ Pairs, [SrcRegIdx](ArrayRef<std::pair<unsigned, int>> Idxs) {
+ return Idxs.front().first == SrcRegIdx ||
+ Idxs.back().first == SrcRegIdx;
+ });
+ if (It == Pairs.end())
+ continue;
+ Idx = It->front().first * NewElts + (Idx % NewElts) % (NewElts / 2) +
+ (SrcRegIdx == It->front().first ? 0 : (NewElts / 2));
+ }
+ // Adjust inputs.
+ for (ArrayRef<std::pair<unsigned, int>> Idxs : Pairs) {
+ Inputs[Idxs.front().first] = DAG.getNode(
+ ISD::CONCAT_VECTORS, DL,
+ Inputs[Idxs.front().first].getValueType(),
+ Inputs[Idxs.front().first].getOperand(Idxs.front().second),
+ Inputs[Idxs.back().first].getOperand(Idxs.back().second));
+ }
+ }
+ }
+ bool Changed;
+ do {
+ // Try to remove extra shuffles (except broadcasts) and shuffles with
+ // reused operands.
+ Changed = false;
+ for (unsigned I = 0; I < std::size(Inputs); ++I) {
+ auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Inputs[I].getNode());
+ if (!Shuffle)
+ continue;
+ if (Shuffle->getOperand(0).getValueType() != NewVT)
+ continue;
+ int Op = -1;
+ if (!Inputs[I].hasOneUse() && Shuffle->getOperand(1).isUndef() &&
+ !Shuffle->isSplat()) {
+ Op = 0;
+ } else if (!Inputs[I].hasOneUse() &&
+ !Shuffle->getOperand(1).isUndef()) {
+ // Find the only used operand, if possible.
+ for (int &Idx : Mask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (SrcRegIdx != I)
+ continue;
+ int MaskElt = Shuffle->getMaskElt(Idx % NewElts);
+ if (MaskElt == PoisonMaskElem) {
+ Idx = PoisonMaskElem;
+ continue;
+ }
+ int OpIdx = MaskElt / NewElts;
+ if (Op == -1) {
+ Op = OpIdx;
+ continue;
+ }
+ if (Op != OpIdx) {
+ Op = -1;
+ break;
+ }
+ }
+ }
+ if (Op < 0) {
+ // Try to check if one of the shuffle operands is used already.
+ for (int OpIdx = 0; OpIdx < 2; ++OpIdx) {
+ if (Shuffle->getOperand(OpIdx).isUndef())
+ continue;
+ auto *It = find(Inputs, Shuffle->getOperand(OpIdx));
+ if (It == std::end(Inputs))
+ continue;
+ int FoundOp = std::distance(std::begin(Inputs), It);
+ // Found that operand is used already.
+ // 1. Fix the mask for the reused operand.
+ for (int &Idx : Mask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (SrcRegIdx != I)
+ continue;
+ int MaskElt = Shuffle->getMaskElt(Idx % NewElts);
+ if (MaskElt == PoisonMaskElem) {
+ Idx = PoisonMaskElem;
+ continue;
+ }
+ int MaskIdx = MaskElt / NewElts;
+ if (OpIdx == MaskIdx)
+ Idx = MaskElt % NewElts + FoundOp * NewElts;
+ }
+ // 2. Set Op to the unused OpIdx.
+ Op = (OpIdx + 1) % 2;
+ break;
+ }
+ }
+ if (Op >= 0) {
+ Changed = true;
+ Inputs[I] = Shuffle->getOperand(Op);
+ // Adjust mask.
+ for (int &Idx : Mask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (SrcRegIdx != I)
+ continue;
+ int MaskElt = Shuffle->getMaskElt(Idx % NewElts);
+ int OpIdx = MaskElt / NewElts;
+ if (OpIdx != Op)
+ continue;
+ Idx = MaskElt % NewElts + SrcRegIdx * NewElts;
+ }
+ }
+ }
+ } while (Changed);
+ };
+ TryPeekThroughShufflesInputs(OrigMask);
+ // Process unique inputs.
+ auto &&MakeUniqueInputs = [&Inputs, &IsConstant,
+ NewElts](SmallVectorImpl<int> &Mask) {
+ SetVector<SDValue> UniqueInputs;
+ SetVector<SDValue> UniqueConstantInputs;
+ for (const auto &I : Inputs) {
+ if (IsConstant(I))
+ UniqueConstantInputs.insert(I);
+ else if (!I.isUndef())
+ UniqueInputs.insert(I);
+ }
+ // Adjust the mask in case of reused inputs. Constant inputs must also be
+ // inserted first, otherwise the final outcome is affected.
+ if (UniqueInputs.size() != std::size(Inputs)) {
+ auto &&UniqueVec = UniqueInputs.takeVector();
+ auto &&UniqueConstantVec = UniqueConstantInputs.takeVector();
+ unsigned ConstNum = UniqueConstantVec.size();
+ for (int &Idx : Mask) {
+ if (Idx == PoisonMaskElem)
+ continue;
+ unsigned SrcRegIdx = Idx / NewElts;
+ if (Inputs[SrcRegIdx].isUndef()) {
+ Idx = PoisonMaskElem;
+ continue;
+ }
+ const auto It = find(UniqueConstantVec, Inputs[SrcRegIdx]);
+ if (It != UniqueConstantVec.end()) {
+ Idx = (Idx % NewElts) +
+ NewElts * std::distance(UniqueConstantVec.begin(), It);
+ assert(Idx >= 0 && "Expected defined mask idx.");
+ continue;
+ }
+ const auto RegIt = find(UniqueVec, Inputs[SrcRegIdx]);
+ assert(RegIt != UniqueVec.end() && "Cannot find non-const value.");
+ Idx = (Idx % NewElts) +
+ NewElts * (std::distance(UniqueVec.begin(), RegIt) + ConstNum);
+ assert(Idx >= 0 && "Expected defined mask idx.");
+ }
+ copy(UniqueConstantVec, std::begin(Inputs));
+ copy(UniqueVec, std::next(std::begin(Inputs), ConstNum));
+ }
+ };
+ MakeUniqueInputs(OrigMask);
+ SDValue OrigInputs[4];
+ copy(Inputs, std::begin(OrigInputs));
+ for (unsigned High = 0; High < 2; ++High) {
+ SDValue &Output = High ? Hi : Lo;
+
+ // Build a shuffle mask for the output, discovering on the fly which
+ // input vectors to use as shuffle operands.
+ unsigned FirstMaskIdx = High * NewElts;
+ SmallVector<int> Mask(NewElts * std::size(Inputs), PoisonMaskElem);
+ copy(ArrayRef(OrigMask).slice(FirstMaskIdx, NewElts), Mask.begin());
+ assert(!Output && "Expected default initialized initial value.");
+ TryPeekThroughShufflesInputs(Mask);
+ MakeUniqueInputs(Mask);
+ SDValue TmpInputs[4];
+ copy(Inputs, std::begin(TmpInputs));
+ // Track changes in the output registers.
+ int UsedIdx = -1;
+ bool SecondIteration = false;
+ auto &&AccumulateResults = [&UsedIdx, &SecondIteration](unsigned Idx) {
+ if (UsedIdx < 0) {
+ UsedIdx = Idx;
+ return false;
+ }
+ if (UsedIdx >= 0 && static_cast<unsigned>(UsedIdx) == Idx)
+ SecondIteration = true;
+ return SecondIteration;
+ };
+ processShuffleMasks(
+ Mask, std::size(Inputs), std::size(Inputs),
+ /*NumOfUsedRegs=*/1,
+ [&Output, &DAG = DAG, NewVT]() { Output = DAG.getUNDEF(NewVT); },
+ [&Output, &DAG = DAG, NewVT, &DL, &Inputs,
+ &BuildVector](ArrayRef<int> Mask, unsigned Idx, unsigned /*Unused*/) {
+ if (Inputs[Idx]->getOpcode() == ISD::BUILD_VECTOR)
+ Output = BuildVector(Inputs[Idx], Inputs[Idx], Mask);
+ else
+ Output = DAG.getVectorShuffle(NewVT, DL, Inputs[Idx],
+ DAG.getUNDEF(NewVT), Mask);
+ Inputs[Idx] = Output;
+ },
+ [&AccumulateResults, &Output, &DAG = DAG, NewVT, &DL, &Inputs,
+ &TmpInputs,
+ &BuildVector](ArrayRef<int> Mask, unsigned Idx1, unsigned Idx2) {
+ if (AccumulateResults(Idx1)) {
+ if (Inputs[Idx1]->getOpcode() == ISD::BUILD_VECTOR &&
+ Inputs[Idx2]->getOpcode() == ISD::BUILD_VECTOR)
+ Output = BuildVector(Inputs[Idx1], Inputs[Idx2], Mask);
+ else
+ Output = DAG.getVectorShuffle(NewVT, DL, Inputs[Idx1],
+ Inputs[Idx2], Mask);
+ } else {
+ if (TmpInputs[Idx1]->getOpcode() == ISD::BUILD_VECTOR &&
+ TmpInputs[Idx2]->getOpcode() == ISD::BUILD_VECTOR)
+ Output = BuildVector(TmpInputs[Idx1], TmpInputs[Idx2], Mask);
+ else
+ Output = DAG.getVectorShuffle(NewVT, DL, TmpInputs[Idx1],
+ TmpInputs[Idx2], Mask);
+ }
+ Inputs[Idx1] = Output;
+ });
+ copy(OrigInputs, std::begin(Inputs));
+ }
+}
+
+void DAGTypeLegalizer::SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = OVT.getHalfNumVectorElementsVT(*DAG.getContext());
+ SDValue Chain = N->getOperand(0);
+ SDValue Ptr = N->getOperand(1);
+ SDValue SV = N->getOperand(2);
+ SDLoc dl(N);
+
+ const Align Alignment =
+ DAG.getDataLayout().getABITypeAlign(NVT.getTypeForEVT(*DAG.getContext()));
+
+ Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, SV, Alignment.value());
+ Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, SV, Alignment.value());
+ Chain = Hi.getValue(1);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
+void DAGTypeLegalizer::SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT DstVTLo, DstVTHi;
+ std::tie(DstVTLo, DstVTHi) = DAG.GetSplitDestVTs(N->getValueType(0));
+ SDLoc dl(N);
+
+ SDValue SrcLo, SrcHi;
+ EVT SrcVT = N->getOperand(0).getValueType();
+ if (getTypeAction(SrcVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(N->getOperand(0), SrcLo, SrcHi);
+ else
+ std::tie(SrcLo, SrcHi) = DAG.SplitVectorOperand(N, 0);
+
+ Lo = DAG.getNode(N->getOpcode(), dl, DstVTLo, SrcLo, N->getOperand(1));
+ Hi = DAG.getNode(N->getOpcode(), dl, DstVTHi, SrcHi, N->getOperand(1));
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue InLo, InHi;
+ GetSplitVector(N->getOperand(0), InLo, InHi);
+ SDLoc DL(N);
+
+ Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, InHi.getValueType(), InHi);
+ Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, InLo.getValueType(), InLo);
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ SDValue Expanded = TLI.expandVectorSplice(N, DAG);
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, Expanded,
+ DAG.getVectorIdxConstant(0, DL));
+ Hi =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, Expanded,
+ DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) {
+ SDValue Op0Lo, Op0Hi, Op1Lo, Op1Hi;
+ GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
+ GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
+ EVT VT = Op0Lo.getValueType();
+ SDLoc DL(N);
+ SDValue ResLo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
+ DAG.getVTList(VT, VT), Op0Lo, Op0Hi);
+ SDValue ResHi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
+ DAG.getVTList(VT, VT), Op1Lo, Op1Hi);
+
+ SetSplitVector(SDValue(N, 0), ResLo.getValue(0), ResHi.getValue(0));
+ SetSplitVector(SDValue(N, 1), ResLo.getValue(1), ResHi.getValue(1));
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_INTERLEAVE(SDNode *N) {
+ SDValue Op0Lo, Op0Hi, Op1Lo, Op1Hi;
+ GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
+ GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
+ EVT VT = Op0Lo.getValueType();
+ SDLoc DL(N);
+ SDValue Res[] = {DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+ DAG.getVTList(VT, VT), Op0Lo, Op1Lo),
+ DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+ DAG.getVTList(VT, VT), Op0Hi, Op1Hi)};
+
+ SetSplitVector(SDValue(N, 0), Res[0].getValue(0), Res[0].getValue(1));
+ SetSplitVector(SDValue(N, 1), Res[1].getValue(0), Res[1].getValue(1));
+}
+
+//===----------------------------------------------------------------------===//
+// Operand Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// This method is called when the specified operand of the specified node is
+/// found to need vector splitting. At this point, all of the result types of
+/// the node are known to be legal, but other operands of the node may need
+/// legalization as well as the specified one.
+bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Split node operand: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom split this node.
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SplitVectorOperand Op #" << OpNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to split this operator's "
+ "operand!\n");
+
+ case ISD::VP_SETCC:
+ case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break;
+ case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
+ case ISD::INSERT_SUBVECTOR: Res = SplitVecOp_INSERT_SUBVECTOR(N, OpNo); break;
+ case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
+ case ISD::VP_TRUNCATE:
+ case ISD::TRUNCATE:
+ Res = SplitVecOp_TruncateHelper(N);
+ break;
+ case ISD::STRICT_FP_ROUND:
+ case ISD::VP_FP_ROUND:
+ case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break;
+ case ISD::FCOPYSIGN: Res = SplitVecOp_FPOpDifferentTypes(N); break;
+ case ISD::STORE:
+ Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
+ break;
+ case ISD::VP_STORE:
+ Res = SplitVecOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo);
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+ Res = SplitVecOp_VP_STRIDED_STORE(cast<VPStridedStoreSDNode>(N), OpNo);
+ break;
+ case ISD::MSTORE:
+ Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
+ break;
+ case ISD::MSCATTER:
+ case ISD::VP_SCATTER:
+ Res = SplitVecOp_Scatter(cast<MemSDNode>(N), OpNo);
+ break;
+ case ISD::MGATHER:
+ case ISD::VP_GATHER:
+ Res = SplitVecOp_Gather(cast<MemSDNode>(N), OpNo);
+ break;
+ case ISD::VSELECT:
+ Res = SplitVecOp_VSELECT(N, OpNo);
+ break;
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::VP_SINT_TO_FP:
+ case ISD::VP_UINT_TO_FP:
+ if (N->getValueType(0).bitsLT(
+ N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType()))
+ Res = SplitVecOp_TruncateHelper(N);
+ else
+ Res = SplitVecOp_UnaryOp(N);
+ break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Res = SplitVecOp_FP_TO_XINT_SAT(N);
+ break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::VP_FP_TO_SINT:
+ case ISD::VP_FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::FTRUNC:
+ Res = SplitVecOp_UnaryOp(N);
+ break;
+ case ISD::FLDEXP:
+ Res = SplitVecOp_FPOpDifferentTypes(N);
+ break;
+
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Res = SplitVecOp_ExtVecInRegOp(N);
+ break;
+
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ Res = SplitVecOp_VECREDUCE(N, OpNo);
+ break;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ Res = SplitVecOp_VECREDUCE_SEQ(N);
+ break;
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_FMAX:
+ case ISD::VP_REDUCE_FMIN:
+ Res = SplitVecOp_VP_REDUCE(N, OpNo);
+ break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ if (N->isStrictFPOpcode())
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 2 &&
+ "Invalid operand expansion");
+ else
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
+ // The only possibility for an illegal operand is the mask, since result type
+ // legalization would have handled this node already otherwise.
+ assert(OpNo == 0 && "Illegal operand must be mask");
+
+ SDValue Mask = N->getOperand(0);
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ EVT Src0VT = Src0.getValueType();
+ SDLoc DL(N);
+ assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?");
+
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ assert(Lo.getValueType() == Hi.getValueType() &&
+ "Lo and Hi have differing types");
+
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT);
+ assert(LoOpVT == HiOpVT && "Asymmetric vector split?");
+
+ SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask;
+ std::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL);
+ std::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL);
+ std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
+
+ SDValue LoSelect =
+ DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
+ SDValue HiSelect =
+ DAG.getNode(ISD::VSELECT, DL, HiOpVT, HiMask, HiOp0, HiOp1);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue VecOp = N->getOperand(OpNo);
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
+
+ // Apply the base (binary) form of the reduction opcode to the two split
+ // halves, then reduce the resulting half-width vector.
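+ //
+ // For example (illustrative types): a VECREDUCE_ADD of v8i32 whose operand
+ // was split into two v4i32 halves becomes
+ //   %partial = v4i32 add %lo, %hi
+ //   %res     = i32 vecreduce_add %partial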
+ unsigned CombineOpc = ISD::getVecReduceBaseOpcode(N->getOpcode());
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi, N->getFlags());
+ return DAG.getNode(N->getOpcode(), dl, ResVT, Partial, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE_SEQ(SDNode *N) {
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue AccOp = N->getOperand(0);
+ SDValue VecOp = N->getOperand(1);
+ SDNodeFlags Flags = N->getFlags();
+
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
+
+ // Reduce low half.
+ SDValue Partial = DAG.getNode(N->getOpcode(), dl, ResVT, AccOp, Lo, Flags);
+
+ // Reduce high half, using low half result as initial value.
+ return DAG.getNode(N->getOpcode(), dl, ResVT, Partial, Hi, Flags);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VP_REDUCE(SDNode *N, unsigned OpNo) {
+ assert(N->isVPOpcode() && "Expected VP opcode");
+ assert(OpNo == 1 && "Can only split reduce vector operand");
+
+ unsigned Opc = N->getOpcode();
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue VecOp = N->getOperand(OpNo);
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+
+ SDValue MaskLo, MaskHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(2));
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) = DAG.SplitEVL(N->getOperand(3), VecVT, dl);
+
+ const SDNodeFlags Flags = N->getFlags();
+
+ SDValue ResLo =
+ DAG.getNode(Opc, dl, ResVT, {N->getOperand(0), Lo, MaskLo, EVLLo}, Flags);
+ return DAG.getNode(Opc, dl, ResVT, {ResLo, Hi, MaskHi, EVLHi}, Flags);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
+ // The result has a legal vector type, but the input needs splitting.
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+ GetSplitVector(N->getOperand(N->isStrictFPOpcode() ? 1 : 0), Lo, Hi);
+ EVT InVT = Lo.getValueType();
+
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
+ InVT.getVectorElementCount());
+
+ if (N->isStrictFPOpcode()) {
+ Lo = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other },
+ { N->getOperand(0), Lo });
+ Hi = DAG.getNode(N->getOpcode(), dl, { OutVT, MVT::Other },
+ { N->getOperand(0), Hi });
+
+ // Build a factor node to remember that this operation is independent
+ // of the other one.
+ SDValue Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Ch);
+ } else if (N->getNumOperands() == 3) {
+ assert(N->isVPOpcode() && "Expected VP opcode");
+ SDValue MaskLo, MaskHi, EVLLo, EVLHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1));
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(2), N->getValueType(0), dl);
+ Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo, MaskLo, EVLLo);
+ Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi, MaskHi, EVLHi);
+ } else {
+ Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
+ // For example, i64 = BITCAST v4i16 on alpha. Typically the vector will
+ // end up being split all the way down to individual components. Convert the
+ // split pieces into integers and reassemble.
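+ //
+ // Sketch of the effect for the example above: the v4i16 is split into two
+ // v2i16 halves, each half is bitcast to an i32, the two i32 values are
+ // joined into an i64 (after swapping halves on big-endian targets), and the
+ // result is bitcast to the original destination type.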
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ Lo = BitConvertToInteger(Lo);
+ Hi = BitConvertToInteger(Hi);
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
+ JoinIntegers(Lo, Hi));
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_INSERT_SUBVECTOR(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 1 && "Invalid OpNo; can only split SubVec.");
+ // We know that the result type is legal.
+ EVT ResVT = N->getValueType(0);
+
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ SDLoc dl(N);
+
+ SDValue Lo, Hi;
+ GetSplitVector(SubVec, Lo, Hi);
+
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ uint64_t LoElts = Lo.getValueType().getVectorMinNumElements();
+
+ SDValue FirstInsertion =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Lo, Idx);
+ SDValue SecondInsertion =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, FirstInsertion, Hi,
+ DAG.getVectorIdxConstant(IdxVal + LoElts, dl));
+
+ return SecondInsertion;
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+ // We know that the extracted result type is legal.
+ EVT SubVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ SDLoc dl(N);
+ SDValue Lo, Hi;
+
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+
+ uint64_t LoEltsMin = Lo.getValueType().getVectorMinNumElements();
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ if (IdxVal < LoEltsMin) {
+ assert(IdxVal + SubVT.getVectorMinNumElements() <= LoEltsMin &&
+ "Extracted subvector crosses vector split!");
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
+ } else if (SubVT.isScalableVector() ==
+ N->getOperand(0).getValueType().isScalableVector())
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
+ DAG.getVectorIdxConstant(IdxVal - LoEltsMin, dl));
+
+ // After this point the DAG node only permits extracting fixed-width
+ // subvectors from scalable vectors.
+ assert(SubVT.isFixedLengthVector() &&
+ "Extracting scalable subvector from fixed-width unsupported");
+
+ // If the element type is i1 and we're not promoting the result, then we may
+ // end up loading the wrong data since the bits are packed tightly into
+ // bytes. For example, if we extract a v4i1 (legal) from a nxv4i1 (legal)
+ // type at index 4, then we will load a byte starting at index 0.
+ if (SubVT.getScalarType() == MVT::i1)
+ report_fatal_error("Don't know how to extract fixed-width predicate "
+ "subvector from a scalable predicate vector");
+
+ // Spill the vector to the stack. We should use the alignment for
+ // the smallest part.
+ SDValue Vec = N->getOperand(0);
+ EVT VecVT = Vec.getValueType();
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
+
+ // Extract the subvector by loading the correct part.
+ StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT, SubVT, Idx);
+
+ return DAG.getLoad(
+ SubVT, dl, Store, StackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ EVT VecVT = Vec.getValueType();
+
+ if (const ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Idx)) {
+ uint64_t IdxVal = Index->getZExtValue();
+
+ SDValue Lo, Hi;
+ GetSplitVector(Vec, Lo, Hi);
+
+ uint64_t LoElts = Lo.getValueType().getVectorMinNumElements();
+
+ if (IdxVal < LoElts)
+ return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
+ else if (!Vec.getValueType().isScalableVector())
+ return SDValue(DAG.UpdateNodeOperands(N, Hi,
+ DAG.getConstant(IdxVal - LoElts, SDLoc(N),
+ Idx.getValueType())), 0);
+ }
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(0), true))
+ return SDValue();
+
+ // Make the vector elements byte-addressable if they aren't already.
+ SDLoc dl(N);
+ EVT EltVT = VecVT.getVectorElementType();
+ if (VecVT.getScalarSizeInBits() < 8) {
+ EltVT = MVT::i8;
+ VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ VecVT.getVectorElementCount());
+ Vec = DAG.getNode(ISD::ANY_EXTEND, dl, VecVT, Vec);
+ }
+
+ // Store the vector to the stack.
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
+
+ // Load back the required element.
+ StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
+
+ // FIXME: This is to handle i1 vectors with elements promoted to i8.
+ // i1 vector handling needs general improvement.
+ if (N->getValueType(0).bitsLT(EltVT)) {
+ SDValue Load = DAG.getLoad(EltVT, dl, Store, StackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+ return DAG.getZExtOrTrunc(Load, dl, N->getValueType(0));
+ }
+
+ return DAG.getExtLoad(
+ ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT,
+ commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8));
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
+ SDValue Lo, Hi;
+
+ // *_EXTEND_VECTOR_INREG only reference the lower half of the input, so
+ // splitting the result has the same effect as splitting the input operand.
+ SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_Gather(MemSDNode *N, unsigned OpNo) {
+ (void)OpNo;
+ SDValue Lo, Hi;
+ SplitVecRes_Gather(N, Lo, Hi);
+
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, N, N->getValueType(0), Lo, Hi);
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed vp_store of vector?");
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ SDValue Offset = N->getOffset();
+ assert(Offset.isUndef() && "Unexpected VP store offset");
+ SDValue Mask = N->getMask();
+ SDValue EVL = N->getVectorLength();
+ SDValue Data = N->getValue();
+ Align Alignment = N->getOriginalAlign();
+ SDLoc DL(N);
+
+ SDValue DataLo, DataHi;
+ if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Data operand
+ GetSplitVector(Data, DataLo, DataHi);
+ else
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ }
+
+ EVT MemoryVT = N->getMemoryVT();
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty);
+
+ // Split EVL
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) = DAG.SplitEVL(EVL, Data.getValueType(), DL);
+
+ SDValue Lo, Hi;
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ N->getPointerInfo(), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+
+ Lo = DAG.getStoreVP(Ch, DL, DataLo, Ptr, Offset, MaskLo, EVLLo, LoMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
+
+ // If the hi vp_store has zero storage size, only the lo vp_store is needed.
+ if (HiIsEmpty)
+ return Lo;
+
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
+ N->isCompressingStore());
+
+ MachinePointerInfo MPI;
+ if (LoMemVT.isScalableVector()) {
+ Alignment = commonAlignment(Alignment,
+ LoMemVT.getSizeInBits().getKnownMinValue() / 8);
+ MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace());
+ } else
+ MPI = N->getPointerInfo().getWithOffset(
+ LoMemVT.getStoreSize().getFixedValue());
+
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
+ N->getAAInfo(), N->getRanges());
+
+ Hi = DAG.getStoreVP(Ch, DL, DataHi, Ptr, Offset, MaskHi, EVLHi, HiMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
+
+ // Build a factor node to remember that this store is independent of the
+ // other one.
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N,
+ unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed vp_strided_store of a vector?");
+ assert(N->getOffset().isUndef() && "Unexpected VP strided store offset");
+
+ SDLoc DL(N);
+
+ SDValue Data = N->getValue();
+ SDValue LoData, HiData;
+ if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Data, LoData, HiData);
+ else
+ std::tie(LoData, HiData) = DAG.SplitVector(Data, DL);
+
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetDependentSplitDestVTs(
+ N->getMemoryVT(), LoData.getValueType(), &HiIsEmpty);
+
+ SDValue Mask = N->getMask();
+ SDValue LoMask, HiMask;
+ if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC)
+ SplitVecRes_SETCC(Mask.getNode(), LoMask, HiMask);
+ else if (getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, LoMask, HiMask);
+ else
+ std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
+
+ SDValue LoEVL, HiEVL;
+ std::tie(LoEVL, HiEVL) =
+ DAG.SplitEVL(N->getVectorLength(), Data.getValueType(), DL);
+
+ // Generate the low vp_strided_store
+ SDValue Lo = DAG.getStridedStoreVP(
+ N->getChain(), DL, LoData, N->getBasePtr(), N->getOffset(),
+ N->getStride(), LoMask, LoEVL, LoMemVT, N->getMemOperand(),
+ N->getAddressingMode(), N->isTruncatingStore(), N->isCompressingStore());
+
+ // If the high vp_strided_store has zero storage size, only the low
+ // vp_strided_store is needed.
+ if (HiIsEmpty)
+ return Lo;
+
+ // Generate the high vp_strided_store.
+ // To compute the high base address, advance the low base address by the
+ // stride multiplied by the number of elements already stored by the low
+ // part, that is: Ptr = Ptr + (LoEVL * Stride)
+ EVT PtrVT = N->getBasePtr().getValueType();
+ SDValue Increment =
+ DAG.getNode(ISD::MUL, DL, PtrVT, LoEVL,
+ DAG.getSExtOrTrunc(N->getStride(), DL, PtrVT));
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, N->getBasePtr(), Increment);
+
+ Align Alignment = N->getOriginalAlign();
+ if (LoMemVT.isScalableVector())
+ Alignment = commonAlignment(Alignment,
+ LoMemVT.getSizeInBits().getKnownMinValue() / 8);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(N->getPointerInfo().getAddrSpace()),
+ MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
+ N->getAAInfo(), N->getRanges());
+
+ SDValue Hi = DAG.getStridedStoreVP(
+ N->getChain(), DL, HiData, Ptr, N->getOffset(), N->getStride(), HiMask,
+ HiEVL, HiMemVT, MMO, N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
+
+ // Build a factor node to remember that this store is independent of the
+ // other one.
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
+ unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed masked store of vector?");
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ SDValue Offset = N->getOffset();
+ assert(Offset.isUndef() && "Unexpected indexed masked store offset");
+ SDValue Mask = N->getMask();
+ SDValue Data = N->getValue();
+ Align Alignment = N->getOriginalAlign();
+ SDLoc DL(N);
+
+ SDValue DataLo, DataHi;
+ if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Data operand
+ GetSplitVector(Data, DataLo, DataHi);
+ else
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ }
+
+ EVT MemoryVT = N->getMemoryVT();
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty);
+
+ SDValue Lo, Hi, Res;
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ N->getPointerInfo(), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+
+ Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
+
+ if (HiIsEmpty) {
+ // The hi masked store has zero storage size.
+ // Only the lo masked store is needed.
+ Res = Lo;
+ } else {
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
+ N->isCompressingStore());
+
+ MachinePointerInfo MPI;
+ if (LoMemVT.isScalableVector()) {
+ Alignment = commonAlignment(
+ Alignment, LoMemVT.getSizeInBits().getKnownMinValue() / 8);
+ MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace());
+ } else
+ MPI = N->getPointerInfo().getWithOffset(
+ LoMemVT.getStoreSize().getFixedValue());
+
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
+ N->getAAInfo(), N->getRanges());
+
+ Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
+
+ // Build a factor node to remember that this store is independent of the
+ // other one.
+ Res = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+ }
+
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) {
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ EVT MemoryVT = N->getMemoryVT();
+ Align Alignment = N->getOriginalAlign();
+ SDLoc DL(N);
+ struct Operands {
+ SDValue Mask;
+ SDValue Index;
+ SDValue Scale;
+ SDValue Data;
+ } Ops = [&]() -> Operands {
+ if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
+ return {MSC->getMask(), MSC->getIndex(), MSC->getScale(),
+ MSC->getValue()};
+ }
+ auto *VPSC = cast<VPScatterSDNode>(N);
+ return {VPSC->getMask(), VPSC->getIndex(), VPSC->getScale(),
+ VPSC->getValue()};
+ }();
+ // Split all operands
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue DataLo, DataHi;
+ if (getTypeAction(Ops.Data.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Data operand
+ GetSplitVector(Ops.Data, DataLo, DataHi);
+ else
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Ops.Data, DL);
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (OpNo == 1 && Ops.Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Ops.Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ std::tie(MaskLo, MaskHi) = SplitMask(Ops.Mask, DL);
+ }
+
+ SDValue IndexHi, IndexLo;
+ if (getTypeAction(Ops.Index.getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(Ops.Index, IndexLo, IndexHi);
+ else
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, DL);
+
+ SDValue Lo;
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ N->getPointerInfo(), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+
+ if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
+ SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale};
+ Lo =
+ DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO,
+ MSC->getIndexType(), MSC->isTruncatingStore());
+
+ // The order of the scatter operations after the split is well defined: the
+ // "Hi" part comes after the "Lo" part, so the two operations are chained
+ // one after the other.
+ SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Ops.Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi,
+ MMO, MSC->getIndexType(),
+ MSC->isTruncatingStore());
+ }
+ auto *VPSC = cast<VPScatterSDNode>(N);
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(VPSC->getVectorLength(), Ops.Data.getValueType(), DL);
+
+ SDValue OpsLo[] = {Ch, DataLo, Ptr, IndexLo, Ops.Scale, MaskLo, EVLLo};
+ Lo = DAG.getScatterVP(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO,
+ VPSC->getIndexType());
+
+ // The order of the scatter operations after the split is well defined: the
+ // "Hi" part comes after the "Lo" part, so the two operations are chained
+ // one after the other.
+ SDValue OpsHi[] = {Lo, DataHi, Ptr, IndexHi, Ops.Scale, MaskHi, EVLHi};
+ return DAG.getScatterVP(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi, MMO,
+ VPSC->getIndexType());
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed store of vector?");
+ assert(OpNo == 1 && "Can only split the stored value");
+ SDLoc DL(N);
+
+ bool isTruncating = N->isTruncatingStore();
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ EVT MemoryVT = N->getMemoryVT();
+ Align Alignment = N->getOriginalAlign();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = N->getAAInfo();
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(1), Lo, Hi);
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ // Scalarize if the split halves are not byte-sized.
+ if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized())
+ return TLI.scalarizeVectorStore(N, DAG);
+
+ if (isTruncating)
+ Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT,
+ Alignment, MMOFlags, AAInfo);
+ else
+ Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
+ AAInfo);
+
+ MachinePointerInfo MPI;
+ IncrementPointer(N, LoMemVT, MPI, Ptr);
+
+ if (isTruncating)
+ Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr, MPI,
+ HiMemVT, Alignment, MMOFlags, AAInfo);
+ else
+ Hi = DAG.getStore(Ch, DL, Hi, Ptr, MPI, Alignment, MMOFlags, AAInfo);
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
+ SDLoc DL(N);
+
+ // The input operands all must have the same type, and we know the result
+ // type is valid. Convert this to a buildvector which extracts all the
+ // input elements.
+ // TODO: If the input elements are power-two vectors, we could convert this to
+ // a new CONCAT_VECTORS node with elements that are half-wide.
+ SmallVector<SDValue, 32> Elts;
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ for (const SDValue &Op : N->op_values()) {
+ for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
+ i != e; ++i) {
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
+ DAG.getVectorIdxConstant(i, DL)));
+ }
+ }
+
+ return DAG.getBuildVector(N->getValueType(0), DL, Elts);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
+ // The result type is legal, but the input type is illegal. If splitting
+ // ends up with the result type of each half still being legal, just
+ // do that. If, however, that would result in an illegal result type,
+ // we can try to get more clever with power-two vectors. Specifically,
+ // split the input type, but also widen the result element size, then
+ // concatenate the halves and truncate again. For example, consider a target
+ // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
+ // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
+ // %inlo = v4i32 extract_subvector %in, 0
+ // %inhi = v4i32 extract_subvector %in, 4
+ // %lo16 = v4i16 trunc v4i32 %inlo
+ // %hi16 = v4i16 trunc v4i32 %inhi
+ // %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
+ // %res = v8i8 trunc v8i16 %in16
+ //
+ // Without this transform, the original truncate would end up being
+ // scalarized, which is pretty much always a last resort.
+ unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
+ SDValue InVec = N->getOperand(OpNo);
+ EVT InVT = InVec->getValueType(0);
+ EVT OutVT = N->getValueType(0);
+ ElementCount NumElements = OutVT.getVectorElementCount();
+ bool IsFloat = OutVT.isFloatingPoint();
+
+ unsigned InElementSize = InVT.getScalarSizeInBits();
+ unsigned OutElementSize = OutVT.getScalarSizeInBits();
+
+ // Determine the split output VT. If it's legal we can just split directly.
+ EVT LoOutVT, HiOutVT;
+ std::tie(LoOutVT, HiOutVT) = DAG.GetSplitDestVTs(OutVT);
+ assert(LoOutVT == HiOutVT && "Unequal split?");
+
+ // If the input elements are only 1/2 the width of the result elements,
+ // just use the normal splitting. Our trick only works if there's room
+ // to split more than once.
+ if (isTypeLegal(LoOutVT) ||
+ InElementSize <= OutElementSize * 2)
+ return SplitVecOp_UnaryOp(N);
+ SDLoc DL(N);
+
+ // Don't touch if this will be scalarized.
+ EVT FinalVT = InVT;
+ while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
+ FinalVT = FinalVT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+ if (getTypeAction(FinalVT) == TargetLowering::TypeScalarizeVector)
+ return SplitVecOp_UnaryOp(N);
+
+ // Get the split input vector.
+ SDValue InLoVec, InHiVec;
+ GetSplitVector(InVec, InLoVec, InHiVec);
+
+ // Truncate them to 1/2 the element size.
+ //
+ // This assumes the number of elements is a power of two; any vector that
+ // isn't should be widened, not split.
+ EVT HalfElementVT = IsFloat ?
+ EVT::getFloatingPointVT(InElementSize/2) :
+ EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
+ NumElements.divideCoefficientBy(2));
+
+ SDValue HalfLo;
+ SDValue HalfHi;
+ SDValue Chain;
+ if (N->isStrictFPOpcode()) {
+ HalfLo = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other},
+ {N->getOperand(0), InLoVec});
+ HalfHi = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other},
+ {N->getOperand(0), InHiVec});
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfLo.getValue(1),
+ HalfHi.getValue(1));
+ } else {
+ HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec);
+ HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec);
+ }
+
+ // Concatenate them to get the full intermediate truncation result.
+ EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
+ SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
+ HalfHi);
+ // Now finish up by truncating all the way down to the original result
+ // type. This should normally be something that ends up being legal directly,
+ // but in theory if a target has very wide vectors and an annoyingly
+ // restricted set of legal types, this split can chain to build things up.
+
+ if (N->isStrictFPOpcode()) {
+ SDValue Res = DAG.getNode(
+ ISD::STRICT_FP_ROUND, DL, {OutVT, MVT::Other},
+ {Chain, InterVec,
+ DAG.getTargetConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()))});
+ // Relink the chain
+ ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1));
+ return Res;
+ }
+
+ return IsFloat
+ ? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec,
+ DAG.getTargetConstant(
+ 0, DL, TLI.getPointerTy(DAG.getDataLayout())))
+ : DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+ // The result has a legal vector type, but the input needs splitting.
+ SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
+ SDLoc DL(N);
+ GetSplitVector(N->getOperand(0), Lo0, Hi0);
+ GetSplitVector(N->getOperand(1), Lo1, Hi1);
+ auto PartEltCnt = Lo0.getValueType().getVectorElementCount();
+
+ LLVMContext &Context = *DAG.getContext();
+ EVT PartResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt);
+ EVT WideResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt*2);
+
+ if (N->getOpcode() == ISD::SETCC) {
+ LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
+ HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
+ } else {
+ assert(N->getOpcode() == ISD::VP_SETCC && "Expected VP_SETCC opcode");
+ SDValue MaskLo, MaskHi, EVLLo, EVLHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(3));
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(4), N->getValueType(0), DL);
+ LoRes = DAG.getNode(ISD::VP_SETCC, DL, PartResVT, Lo0, Lo1,
+ N->getOperand(2), MaskLo, EVLLo);
+ HiRes = DAG.getNode(ISD::VP_SETCC, DL, PartResVT, Hi0, Hi1,
+ N->getOperand(2), MaskHi, EVLHi);
+ }
+ SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes);
+
+ EVT OpVT = N->getOperand(0).getValueType();
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+ return DAG.getNode(ExtendCode, DL, N->getValueType(0), Con);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
+ // The result has a legal vector type, but the input needs splitting.
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc DL(N);
+ GetSplitVector(N->getOperand(N->isStrictFPOpcode() ? 1 : 0), Lo, Hi);
+ EVT InVT = Lo.getValueType();
+
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
+ InVT.getVectorElementCount());
+
+ if (N->isStrictFPOpcode()) {
+ Lo = DAG.getNode(N->getOpcode(), DL, { OutVT, MVT::Other },
+ { N->getOperand(0), Lo, N->getOperand(2) });
+ Hi = DAG.getNode(N->getOpcode(), DL, { OutVT, MVT::Other },
+ { N->getOperand(0), Hi, N->getOperand(2) });
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ Lo.getValue(1), Hi.getValue(1));
+ ReplaceValueWith(SDValue(N, 1), NewChain);
+ } else if (N->getOpcode() == ISD::VP_FP_ROUND) {
+ SDValue MaskLo, MaskHi, EVLLo, EVLHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(1));
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(2), N->getValueType(0), DL);
+ Lo = DAG.getNode(ISD::VP_FP_ROUND, DL, OutVT, Lo, MaskLo, EVLLo);
+ Hi = DAG.getNode(ISD::VP_FP_ROUND, DL, OutVT, Hi, MaskHi, EVLHi);
+ } else {
+ Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1));
+ Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1));
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+}
+
+// Split a vector type in an FP binary operation where the second operand has a
+// different type from the first.
+//
+// The result (and the first input) has a legal vector type, but the second
+// input needs splitting.
+SDValue DAGTypeLegalizer::SplitVecOp_FPOpDifferentTypes(SDNode *N) {
+ SDLoc DL(N);
+
+ EVT LHSLoVT, LHSHiVT;
+ std::tie(LHSLoVT, LHSHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ if (!isTypeLegal(LHSLoVT) || !isTypeLegal(LHSHiVT))
+ return DAG.UnrollVectorOp(N, N->getValueType(0).getVectorNumElements());
+
+ SDValue LHSLo, LHSHi;
+ std::tie(LHSLo, LHSHi) =
+ DAG.SplitVector(N->getOperand(0), DL, LHSLoVT, LHSHiVT);
+
+ SDValue RHSLo, RHSHi;
+ std::tie(RHSLo, RHSHi) = DAG.SplitVector(N->getOperand(1), DL);
+
+ SDValue Lo = DAG.getNode(N->getOpcode(), DL, LHSLoVT, LHSLo, RHSLo);
+ SDValue Hi = DAG.getNode(N->getOpcode(), DL, LHSHiVT, LHSHi, RHSHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_FP_TO_XINT_SAT(SDNode *N) {
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ EVT InVT = Lo.getValueType();
+
+ EVT NewResVT =
+ EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
+ InVT.getVectorElementCount());
+
+ Lo = DAG.getNode(N->getOpcode(), dl, NewResVT, Lo, N->getOperand(1));
+ Hi = DAG.getNode(N->getOpcode(), dl, NewResVT, Hi, N->getOperand(1));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+}
+
+//===----------------------------------------------------------------------===//
+// Result Vector Widening
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Widen node result " << ResNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+
+ // See if the target wants to custom widen this node.
+ if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
+ return;
+
+ SDValue Res = SDValue();
+
+ auto unrollExpandedOp = [&]() {
+ // We're going to widen this vector op to a legal type by padding with undef
+ // elements. If the wide vector op is eventually going to be expanded to
+ // scalar libcalls, then unroll into scalar ops now to avoid unnecessary
+ // libcalls on the undef elements.
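+ // For example (illustrative): an FSIN on v3f32 that would be widened to
+ // v4f32 could otherwise end up as four sinf libcalls, one of them on an
+ // undef lane; unrolling here emits only the three scalar ops that are
+ // actually needed.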
+ EVT VT = N->getValueType(0);
+ EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ if (!TLI.isOperationLegalOrCustom(N->getOpcode(), WideVecVT) &&
+ TLI.isOperationExpand(N->getOpcode(), VT.getScalarType())) {
+ Res = DAG.UnrollVectorOp(N, WideVecVT.getVectorNumElements());
+ return true;
+ }
+ return false;
+ };
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "WidenVectorResult #" << ResNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to widen the result of this operator!");
+
+ case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
+ case ISD::AssertZext: Res = WidenVecRes_AssertZext(N); break;
+ case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break;
+ case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
+ case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
+ case ISD::INSERT_SUBVECTOR:
+ Res = WidenVecRes_INSERT_SUBVECTOR(N);
+ break;
+ case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
+ case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
+ case ISD::STEP_VECTOR:
+ case ISD::SPLAT_VECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ Res = WidenVecRes_ScalarOp(N);
+ break;
+ case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
+ case ISD::VSELECT:
+ case ISD::SELECT:
+ case ISD::VP_SELECT:
+ case ISD::VP_MERGE:
+ Res = WidenVecRes_Select(N);
+ break;
+ case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break;
+ case ISD::VP_SETCC:
+ case ISD::SETCC: Res = WidenVecRes_SETCC(N); break;
+ case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break;
+ case ISD::VECTOR_SHUFFLE:
+ Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
+ break;
+ case ISD::VP_LOAD:
+ Res = WidenVecRes_VP_LOAD(cast<VPLoadSDNode>(N));
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
+ Res = WidenVecRes_VP_STRIDED_LOAD(cast<VPStridedLoadSDNode>(N));
+ break;
+ case ISD::MLOAD:
+ Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
+ break;
+ case ISD::MGATHER:
+ Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
+ break;
+ case ISD::VP_GATHER:
+ Res = WidenVecRes_VP_GATHER(cast<VPGatherSDNode>(N));
+ break;
+ case ISD::VECTOR_REVERSE:
+ Res = WidenVecRes_VECTOR_REVERSE(N);
+ break;
+
+ case ISD::ADD: case ISD::VP_ADD:
+ case ISD::AND: case ISD::VP_AND:
+ case ISD::MUL: case ISD::VP_MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::OR: case ISD::VP_OR:
+ case ISD::SUB: case ISD::VP_SUB:
+ case ISD::XOR: case ISD::VP_XOR:
+ case ISD::SHL: case ISD::VP_SHL:
+ case ISD::SRA: case ISD::VP_ASHR:
+ case ISD::SRL: case ISD::VP_LSHR:
+ case ISD::FMINNUM: case ISD::VP_FMINNUM:
+ case ISD::FMAXNUM: case ISD::VP_FMAXNUM:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ case ISD::SMIN: case ISD::VP_SMIN:
+ case ISD::SMAX: case ISD::VP_SMAX:
+ case ISD::UMIN: case ISD::VP_UMIN:
+ case ISD::UMAX: case ISD::VP_UMAX:
+ case ISD::UADDSAT:
+ case ISD::SADDSAT:
+ case ISD::USUBSAT:
+ case ISD::SSUBSAT:
+ case ISD::SSHLSAT:
+ case ISD::USHLSAT:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::AVGFLOORS:
+ case ISD::AVGFLOORU:
+ case ISD::AVGCEILS:
+ case ISD::AVGCEILU:
+ // Vector-predicated binary op widening. Note that -- unlike the
+ // unpredicated versions -- we don't have to worry about trapping on
+ // operations like UDIV, FADD, etc., as we pass on the original vector
+ // length parameter. This means the widened elements containing garbage
+ // aren't active.
+ case ISD::VP_SDIV:
+ case ISD::VP_UDIV:
+ case ISD::VP_SREM:
+ case ISD::VP_UREM:
+ case ISD::VP_FADD:
+ case ISD::VP_FSUB:
+ case ISD::VP_FMUL:
+ case ISD::VP_FDIV:
+ case ISD::VP_FREM:
+ case ISD::VP_FCOPYSIGN:
+ Res = WidenVecRes_Binary(N);
+ break;
+
+ case ISD::FPOW:
+ case ISD::FREM:
+ if (unrollExpandedOp())
+ break;
+ // If the target has custom/legal support for the scalar FP intrinsic ops
+ // (they are probably not destined to become libcalls), then widen those
+ // like any other binary ops.
+ [[fallthrough]];
+
+ case ISD::FADD:
+ case ISD::FMUL:
+ case ISD::FSUB:
+ case ISD::FDIV:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ Res = WidenVecRes_BinaryCanTrap(N);
+ break;
+
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
+ // These are binary operations, but with an extra operand that shouldn't
+ // be widened (the scale).
+ Res = WidenVecRes_BinaryWithExtraScalarOp(N);
+ break;
+
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case ISD::STRICT_##DAGN:
+#include "llvm/IR/ConstrainedOps.def"
+ Res = WidenVecRes_StrictFP(N);
+ break;
+
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ case ISD::UMULO:
+ case ISD::SMULO:
+ Res = WidenVecRes_OverflowOp(N, ResNo);
+ break;
+
+ case ISD::FCOPYSIGN:
+ Res = WidenVecRes_FCOPYSIGN(N);
+ break;
+
+ case ISD::IS_FPCLASS:
+ Res = WidenVecRes_IS_FPCLASS(N);
+ break;
+
+ case ISD::FLDEXP:
+ case ISD::FPOWI:
+ if (!unrollExpandedOp())
+ Res = WidenVecRes_ExpOp(N);
+ break;
+
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Res = WidenVecRes_EXTEND_VECTOR_INREG(N);
+ break;
+
+ case ISD::ANY_EXTEND:
+ case ISD::FP_EXTEND:
+ case ISD::VP_FP_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::VP_FP_ROUND:
+ case ISD::FP_TO_SINT:
+ case ISD::VP_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::VP_FP_TO_UINT:
+ case ISD::SIGN_EXTEND:
+ case ISD::VP_SIGN_EXTEND:
+ case ISD::SINT_TO_FP:
+ case ISD::VP_SINT_TO_FP:
+ case ISD::VP_TRUNCATE:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP:
+ case ISD::VP_UINT_TO_FP:
+ case ISD::ZERO_EXTEND:
+ case ISD::VP_ZERO_EXTEND:
+ Res = WidenVecRes_Convert(N);
+ break;
+
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Res = WidenVecRes_FP_TO_XINT_SAT(N);
+ break;
+
+ case ISD::FABS:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG10:
+ case ISD::FLOG2:
+ case ISD::FNEARBYINT:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ if (unrollExpandedOp())
+ break;
+ // If the target has custom/legal support for the scalar FP intrinsic ops
+ // (they are probably not destined to become libcalls), then widen those
+ // like any other unary ops.
+ [[fallthrough]];
+
+ case ISD::ABS:
+ case ISD::VP_ABS:
+ case ISD::BITREVERSE:
+ case ISD::VP_BITREVERSE:
+ case ISD::BSWAP:
+ case ISD::VP_BSWAP:
+ case ISD::CTLZ:
+ case ISD::VP_CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::VP_CTLZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ case ISD::VP_CTPOP:
+ case ISD::CTTZ:
+ case ISD::VP_CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::VP_CTTZ_ZERO_UNDEF:
+ case ISD::FNEG: case ISD::VP_FNEG:
+ case ISD::VP_FABS:
+ case ISD::VP_SQRT:
+ case ISD::VP_FCEIL:
+ case ISD::VP_FFLOOR:
+ case ISD::VP_FRINT:
+ case ISD::VP_FNEARBYINT:
+ case ISD::VP_FROUND:
+ case ISD::VP_FROUNDEVEN:
+ case ISD::VP_FROUNDTOZERO:
+ case ISD::FREEZE:
+ case ISD::ARITH_FENCE:
+ case ISD::FCANONICALIZE:
+ Res = WidenVecRes_Unary(N);
+ break;
+ case ISD::FMA: case ISD::VP_FMA:
+ case ISD::FSHL:
+ case ISD::VP_FSHL:
+ case ISD::FSHR:
+ case ISD::VP_FSHR:
+ Res = WidenVecRes_Ternary(N);
+ break;
+ }
+
+ // If Res is null, the sub-method took care of registering the result.
+ if (Res.getNode())
+ SetWidenedVector(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
+ // Ternary op widening.
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp3 = GetWidenedVector(N->getOperand(2));
+ if (N->getNumOperands() == 3)
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
+
+ assert(N->getNumOperands() == 5 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDValue Mask =
+ GetWidenedMask(N->getOperand(3), WidenVT.getVectorElementCount());
+ return DAG.getNode(N->getOpcode(), dl, WidenVT,
+ {InOp1, InOp2, InOp3, Mask, N->getOperand(4)});
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
+ // Binary op widening.
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ if (N->getNumOperands() == 2)
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2,
+ N->getFlags());
+
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDValue Mask =
+ GetWidenedMask(N->getOperand(2), WidenVT.getVectorElementCount());
+ return DAG.getNode(N->getOpcode(), dl, WidenVT,
+ {InOp1, InOp2, Mask, N->getOperand(3)}, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BinaryWithExtraScalarOp(SDNode *N) {
+ // Binary op widening, but with an extra operand that shouldn't be widened.
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp3 = N->getOperand(2);
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3,
+ N->getFlags());
+}
+
+// Given the pieces that a vector operation has been broken up into while
+// widening, see if we can collect them together into the next widest legal
+// VT. This implementation is trap-safe.
+static SDValue CollectOpsToWiden(SelectionDAG &DAG, const TargetLowering &TLI,
+ SmallVectorImpl<SDValue> &ConcatOps,
+ unsigned ConcatEnd, EVT VT, EVT MaxVT,
+ EVT WidenVT) {
+ // Check to see if we have a single operation with the widen type.
+ if (ConcatEnd == 1) {
+ VT = ConcatOps[0].getValueType();
+ if (VT == WidenVT)
+ return ConcatOps[0];
+ }
+
+ SDLoc dl(ConcatOps[0]);
+ EVT WidenEltVT = WidenVT.getVectorElementType();
+
+ // while (Some element of ConcatOps is not of type MaxVT) {
+ // From the end of ConcatOps, collect elements of the same type and put
+ // them into an op of the next larger supported type
+ // }
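+ // Illustrative example (assuming v2f32 and v4f32 are legal, MaxVT == v2f32
+ // and WidenVT == v4f32): ConcatOps == [v2f32, f32, f32] first has its two
+ // trailing scalars inserted into an undef v2f32, giving [v2f32, v2f32],
+ // which is then concatenated into the final v4f32 result.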
+ while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) {
+ int Idx = ConcatEnd - 1;
+ VT = ConcatOps[Idx--].getValueType();
+ while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT)
+ Idx--;
+
+ int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1;
+ EVT NextVT;
+ do {
+ NextSize *= 2;
+ NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize);
+ } while (!TLI.isTypeLegal(NextVT));
+
+ if (!VT.isVector()) {
+ // Scalar type, create an INSERT_VECTOR_ELT of type NextVT
+ SDValue VecOp = DAG.getUNDEF(NextVT);
+ unsigned NumToInsert = ConcatEnd - Idx - 1;
+ for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
+ VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp,
+ ConcatOps[OpIdx], DAG.getVectorIdxConstant(i, dl));
+ }
+ ConcatOps[Idx+1] = VecOp;
+ ConcatEnd = Idx + 2;
+ } else {
+ // Vector type, create a CONCAT_VECTORS of type NextVT
+ SDValue undefVec = DAG.getUNDEF(VT);
+ unsigned OpsToConcat = NextSize/VT.getVectorNumElements();
+ SmallVector<SDValue, 16> SubConcatOps(OpsToConcat);
+ unsigned RealVals = ConcatEnd - Idx - 1;
+ unsigned SubConcatEnd = 0;
+ unsigned SubConcatIdx = Idx + 1;
+ while (SubConcatEnd < RealVals)
+ SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx];
+ while (SubConcatEnd < OpsToConcat)
+ SubConcatOps[SubConcatEnd++] = undefVec;
+ ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl,
+ NextVT, SubConcatOps);
+ ConcatEnd = SubConcatIdx + 1;
+ }
+ }
+
+ // Check to see if we have a single operation with the widen type.
+ if (ConcatEnd == 1) {
+ VT = ConcatOps[0].getValueType();
+ if (VT == WidenVT)
+ return ConcatOps[0];
+ }
+
+ // Add undefs of size MaxVT until ConcatOps grows to the length of WidenVT.
+ unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements();
+ if (NumOps != ConcatEnd ) {
+ SDValue UndefVal = DAG.getUNDEF(MaxVT);
+ for (unsigned j = ConcatEnd; j < NumOps; ++j)
+ ConcatOps[j] = UndefVal;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
+ ArrayRef(ConcatOps.data(), NumOps));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
+ // Binary op widening for operations that can trap.
+ unsigned Opcode = N->getOpcode();
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT WidenEltVT = WidenVT.getVectorElementType();
+ EVT VT = WidenVT;
+ unsigned NumElts = VT.getVectorMinNumElements();
+ const SDNodeFlags Flags = N->getFlags();
+ while (!TLI.isTypeLegal(VT) && NumElts != 1) {
+ NumElts = NumElts / 2;
+ VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+ }
+
+ if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) {
+ // Operation doesn't trap so just widen as normal.
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags);
+ }
+
+ // FIXME: Improve support for scalable vectors.
+ assert(!VT.isScalableVector() && "Scalable vectors not handled yet.");
+
+ // No legal vector version so unroll the vector operation and then widen.
+ if (NumElts == 1)
+ return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
+
+ // Since the operation can trap, apply operation on the original vector.
+ EVT MaxVT = VT;
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
+
+ SmallVector<SDValue, 16> ConcatOps(CurNumElts);
+ unsigned ConcatEnd = 0; // Current ConcatOps index.
+ int Idx = 0; // Current Idx into input vectors.
+
+ // NumElts := greatest legal vector size (at most WidenVT)
+ // while (orig. vector has unhandled elements) {
+ // take munches of size NumElts from the beginning and add to ConcatOps
+ // NumElts := next smaller supported vector size or 1
+ // }
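+ // Illustrative example: widening an SDIV on v3i32 to v4i32 directly could
+ // trap on the garbage fourth lane, so (assuming v2i32 is also legal)
+ // elements 0-1 are divided as a v2i32, element 2 as a scalar, and the
+ // pieces are reassembled by CollectOpsToWiden.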
+ while (CurNumElts != 0) {
+ while (CurNumElts >= NumElts) {
+ SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
+ DAG.getVectorIdxConstant(Idx, dl));
+ SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
+ DAG.getVectorIdxConstant(Idx, dl));
+ ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags);
+ Idx += NumElts;
+ CurNumElts -= NumElts;
+ }
+ do {
+ NumElts = NumElts / 2;
+ VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+ } while (!TLI.isTypeLegal(VT) && NumElts != 1);
+
+ if (NumElts == 1) {
+ for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
+ SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
+ InOp1, DAG.getVectorIdxConstant(Idx, dl));
+ SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
+ InOp2, DAG.getVectorIdxConstant(Idx, dl));
+ ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
+ EOp1, EOp2, Flags);
+ }
+ CurNumElts = 0;
+ }
+ }
+
+ return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) {
+ switch (N->getOpcode()) {
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
+ return WidenVecRes_STRICT_FSETCC(N);
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::STRICT_FP_ROUND:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ return WidenVecRes_Convert_StrictFP(N);
+ default:
+ break;
+ }
+
+ // StrictFP op widening for operations that can trap.
+ unsigned NumOpers = N->getNumOperands();
+ unsigned Opcode = N->getOpcode();
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT WidenEltVT = WidenVT.getVectorElementType();
+ EVT VT = WidenVT;
+ unsigned NumElts = VT.getVectorNumElements();
+ while (!TLI.isTypeLegal(VT) && NumElts != 1) {
+ NumElts = NumElts / 2;
+ VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+ }
+
+ // No legal vector version so unroll the vector operation and then widen.
+ if (NumElts == 1)
+ return UnrollVectorOp_StrictFP(N, WidenVT.getVectorNumElements());
+
+ // Since the operation can trap, apply operation on the original vector.
+ EVT MaxVT = VT;
+ SmallVector<SDValue, 4> InOps;
+ unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
+
+ SmallVector<SDValue, 16> ConcatOps(CurNumElts);
+ SmallVector<SDValue, 16> Chains;
+ unsigned ConcatEnd = 0; // Current ConcatOps index.
+ int Idx = 0; // Current Idx into input vectors.
+
+ // The Chain is the first operand.
+ InOps.push_back(N->getOperand(0));
+
+ // Now process the remaining operands.
+ for (unsigned i = 1; i < NumOpers; ++i) {
+ SDValue Oper = N->getOperand(i);
+
+ EVT OpVT = Oper.getValueType();
+ if (OpVT.isVector()) {
+ if (getTypeAction(OpVT) == TargetLowering::TypeWidenVector)
+ Oper = GetWidenedVector(Oper);
+ else {
+ EVT WideOpVT =
+ EVT::getVectorVT(*DAG.getContext(), OpVT.getVectorElementType(),
+ WidenVT.getVectorElementCount());
+ Oper = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ DAG.getUNDEF(WideOpVT), Oper,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+ }
+
+ InOps.push_back(Oper);
+ }
+
+ // NumElts := greatest legal vector size (at most WidenVT)
+ // while (orig. vector has unhandled elements) {
+ // take munches of size NumElts from the beginning and add to ConcatOps
+ // NumElts := next smaller supported vector size or 1
+ // }
+ while (CurNumElts != 0) {
+ while (CurNumElts >= NumElts) {
+ SmallVector<SDValue, 4> EOps;
+
+ for (unsigned i = 0; i < NumOpers; ++i) {
+ SDValue Op = InOps[i];
+
+ EVT OpVT = Op.getValueType();
+ if (OpVT.isVector()) {
+ EVT OpExtractVT =
+ EVT::getVectorVT(*DAG.getContext(), OpVT.getVectorElementType(),
+ VT.getVectorElementCount());
+ Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpExtractVT, Op,
+ DAG.getVectorIdxConstant(Idx, dl));
+ }
+
+ EOps.push_back(Op);
+ }
+
+ EVT OperVT[] = {VT, MVT::Other};
+ SDValue Oper = DAG.getNode(Opcode, dl, OperVT, EOps);
+ ConcatOps[ConcatEnd++] = Oper;
+ Chains.push_back(Oper.getValue(1));
+ Idx += NumElts;
+ CurNumElts -= NumElts;
+ }
+ do {
+ NumElts = NumElts / 2;
+ VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+ } while (!TLI.isTypeLegal(VT) && NumElts != 1);
+
+ if (NumElts == 1) {
+ for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
+ SmallVector<SDValue, 4> EOps;
+
+ for (unsigned i = 0; i < NumOpers; ++i) {
+ SDValue Op = InOps[i];
+
+ EVT OpVT = Op.getValueType();
+ if (OpVT.isVector())
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ OpVT.getVectorElementType(), Op,
+ DAG.getVectorIdxConstant(Idx, dl));
+
+ EOps.push_back(Op);
+ }
+
+ EVT WidenVT[] = {WidenEltVT, MVT::Other};
+ SDValue Oper = DAG.getNode(Opcode, dl, WidenVT, EOps);
+ ConcatOps[ConcatEnd++] = Oper;
+ Chains.push_back(Oper.getValue(1));
+ }
+ CurNumElts = 0;
+ }
+ }
+
+ // Build a factor node to remember all the Ops that have been created.
+ SDValue NewChain;
+ if (Chains.size() == 1)
+ NewChain = Chains[0];
+ else
+ NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ ReplaceValueWith(SDValue(N, 1), NewChain);
+
+ return CollectOpsToWiden(DAG, TLI, ConcatOps, ConcatEnd, VT, MaxVT, WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
+ SDLoc DL(N);
+ EVT ResVT = N->getValueType(0);
+ EVT OvVT = N->getValueType(1);
+ EVT WideResVT, WideOvVT;
+ SDValue WideLHS, WideRHS;
+
+ // TODO: This might result in a widen/split loop.
+ if (ResNo == 0) {
+ WideResVT = TLI.getTypeToTransformTo(*DAG.getContext(), ResVT);
+ WideOvVT = EVT::getVectorVT(
+ *DAG.getContext(), OvVT.getVectorElementType(),
+ WideResVT.getVectorNumElements());
+
+ WideLHS = GetWidenedVector(N->getOperand(0));
+ WideRHS = GetWidenedVector(N->getOperand(1));
+ } else {
+ WideOvVT = TLI.getTypeToTransformTo(*DAG.getContext(), OvVT);
+ WideResVT = EVT::getVectorVT(
+ *DAG.getContext(), ResVT.getVectorElementType(),
+ WideOvVT.getVectorNumElements());
+
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+ WideLHS = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
+ N->getOperand(0), Zero);
+ WideRHS = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
+ N->getOperand(1), Zero);
+ }
+
+ SDVTList WideVTs = DAG.getVTList(WideResVT, WideOvVT);
+ SDNode *WideNode = DAG.getNode(
+ N->getOpcode(), DL, WideVTs, WideLHS, WideRHS).getNode();
+
+ // Replace the other vector result not being explicitly widened here.
+ unsigned OtherNo = 1 - ResNo;
+ EVT OtherVT = N->getValueType(OtherNo);
+ if (getTypeAction(OtherVT) == TargetLowering::TypeWidenVector) {
+ SetWidenedVector(SDValue(N, OtherNo), SDValue(WideNode, OtherNo));
+ } else {
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+ SDValue OtherVal = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, OtherVT, SDValue(WideNode, OtherNo), Zero);
+ ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
+ }
+
+ return SDValue(WideNode, ResNo);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
+ LLVMContext &Ctx = *DAG.getContext();
+ SDValue InOp = N->getOperand(0);
+ SDLoc DL(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(Ctx, N->getValueType(0));
+ ElementCount WidenEC = WidenVT.getVectorElementCount();
+
+ EVT InVT = InOp.getValueType();
+
+ unsigned Opcode = N->getOpcode();
+ const SDNodeFlags Flags = N->getFlags();
+
+ // Handle the case of ZERO_EXTEND where the promoted InVT element size does
+ // not equal that of WidenVT.
+ if (N->getOpcode() == ISD::ZERO_EXTEND &&
+ getTypeAction(InVT) == TargetLowering::TypePromoteInteger &&
+ TLI.getTypeToTransformTo(Ctx, InVT).getScalarSizeInBits() !=
+ WidenVT.getScalarSizeInBits()) {
+ InOp = ZExtPromotedInteger(InOp);
+ InVT = InOp.getValueType();
+ if (WidenVT.getScalarSizeInBits() < InVT.getScalarSizeInBits())
+ Opcode = ISD::TRUNCATE;
+ }
+
+ EVT InEltVT = InVT.getVectorElementType();
+ EVT InWidenVT = EVT::getVectorVT(Ctx, InEltVT, WidenEC);
+ ElementCount InVTEC = InVT.getVectorElementCount();
+
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ InOp = GetWidenedVector(N->getOperand(0));
+ InVT = InOp.getValueType();
+ InVTEC = InVT.getVectorElementCount();
+ if (InVTEC == WidenEC) {
+ if (N->getNumOperands() == 1)
+ return DAG.getNode(Opcode, DL, WidenVT, InOp);
+ if (N->getNumOperands() == 3) {
+ assert(N->isVPOpcode() && "Expected VP opcode");
+ SDValue Mask =
+ GetWidenedMask(N->getOperand(1), WidenVT.getVectorElementCount());
+ return DAG.getNode(Opcode, DL, WidenVT, InOp, Mask, N->getOperand(2));
+ }
+ return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
+ }
+ if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) {
+ // If both the input and result vector types are of the same width, extend
+ // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
+ // accept fewer elements in the result than in the input.
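+ // For example, sign-extending v2i16 to v2i32 on a target where the input
+ // widens to v8i16 and the result widens to v4i32 (both 128 bits) becomes a
+ // single SIGN_EXTEND_VECTOR_INREG of the widened v8i16.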
+ if (Opcode == ISD::ANY_EXTEND)
+ return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
+ if (Opcode == ISD::SIGN_EXTEND)
+ return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
+ if (Opcode == ISD::ZERO_EXTEND)
+ return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
+ }
+ }
+
+ if (TLI.isTypeLegal(InWidenVT)) {
+ // Because the result and the input are different vector types, widening
+ // the result could create a legal type but widening the input might make
+ // it an illegal type that might lead to repeatedly splitting the input
+ // and then widening it. To avoid this, we widen the input only if
+ // it results in a legal type.
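+ // For example (illustrative, on a target where v2i32 is legal but v2f32
+ // widens to v4f32): a SINT_TO_FP from v2i32 to v2f32 concatenates the input
+ // with one undef v2i32 and emits a single v4i32 -> v4f32 convert.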
+ if (WidenEC.isKnownMultipleOf(InVTEC.getKnownMinValue())) {
+ // Widen the input and call convert on the widened input vector.
+ unsigned NumConcat =
+ WidenEC.getKnownMinValue() / InVTEC.getKnownMinValue();
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
+ Ops[0] = InOp;
+ SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
+ if (N->getNumOperands() == 1)
+ return DAG.getNode(Opcode, DL, WidenVT, InVec);
+ return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags);
+ }
+
+ if (InVTEC.isKnownMultipleOf(WidenEC.getKnownMinValue())) {
+ SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
+ DAG.getVectorIdxConstant(0, DL));
+ // Extract the input and convert the shortened input vector.
+ if (N->getNumOperands() == 1)
+ return DAG.getNode(Opcode, DL, WidenVT, InVal);
+ return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags);
+ }
+ }
+
+ // Otherwise unroll into some nasty scalar code and rebuild the vector.
+ EVT EltVT = WidenVT.getVectorElementType();
+ SmallVector<SDValue, 16> Ops(WidenEC.getFixedValue(), DAG.getUNDEF(EltVT));
+ // Use the original element count so we don't do more scalar ops than
+ // necessary.
+ unsigned MinElts = N->getValueType(0).getVectorNumElements();
+ for (unsigned i=0; i < MinElts; ++i) {
+ SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
+ DAG.getVectorIdxConstant(i, DL));
+ if (N->getNumOperands() == 1)
+ Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
+ else
+ Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags);
+ }
+
+ return DAG.getBuildVector(WidenVT, DL, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_FP_TO_XINT_SAT(SDNode *N) {
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ ElementCount WidenNumElts = WidenVT.getVectorElementCount();
+
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // Also widen the input.
+ if (getTypeAction(SrcVT) == TargetLowering::TypeWidenVector) {
+ Src = GetWidenedVector(Src);
+ SrcVT = Src.getValueType();
+ }
+
+ // Input and output not widened to the same size, give up.
+ if (WidenNumElts != SrcVT.getVectorElementCount())
+ return DAG.UnrollVectorOp(N, WidenNumElts.getKnownMinValue());
+
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, Src, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Convert_StrictFP(SDNode *N) {
+ SDValue InOp = N->getOperand(1);
+ SDLoc DL(N);
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ EVT InVT = InOp.getValueType();
+ EVT InEltVT = InVT.getVectorElementType();
+
+ unsigned Opcode = N->getOpcode();
+
+ // FIXME: Optimizations need to be implemented here.
+
+ // Otherwise unroll into some nasty scalar code and rebuild the vector.
+ EVT EltVT = WidenVT.getVectorElementType();
+ std::array<EVT, 2> EltVTs = {{EltVT, MVT::Other}};
+ SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
+ SmallVector<SDValue, 32> OpChains;
+ // Use the original element count so we don't do more scalar ops than
+ // necessary.
+ unsigned MinElts = N->getValueType(0).getVectorNumElements();
+ for (unsigned i=0; i < MinElts; ++i) {
+ NewOps[1] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
+ DAG.getVectorIdxConstant(i, DL));
+ Ops[i] = DAG.getNode(Opcode, DL, EltVTs, NewOps);
+ OpChains.push_back(Ops[i].getValue(1));
+ }
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OpChains);
+ ReplaceValueWith(SDValue(N, 1), NewChain);
+
+ return DAG.getBuildVector(WidenVT, DL, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue InOp = N->getOperand(0);
+ SDLoc DL(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT WidenSVT = WidenVT.getVectorElementType();
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ EVT InVT = InOp.getValueType();
+ EVT InSVT = InVT.getVectorElementType();
+ unsigned InVTNumElts = InVT.getVectorNumElements();
+
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ InOp = GetWidenedVector(InOp);
+ InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return DAG.getNode(Opcode, DL, WidenVT, InOp);
+ }
+ }
+ }
+
+ // Unroll, extend the scalars and rebuild the vector.
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
+ SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp,
+ DAG.getVectorIdxConstant(i, DL));
+ switch (Opcode) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
+ break;
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val);
+ break;
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val);
+ break;
+ default:
+ llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected");
+ }
+ Ops.push_back(Val);
+ }
+
+ while (Ops.size() != WidenNumElts)
+ Ops.push_back(DAG.getUNDEF(WidenSVT));
+
+ return DAG.getBuildVector(WidenVT, DL, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
+ // If this is an FCOPYSIGN with same input types, we can treat it as a
+ // normal (can trap) binary op.
+ if (N->getOperand(0).getValueType() == N->getOperand(1).getValueType())
+ return WidenVecRes_BinaryCanTrap(N);
+
+ // If the types are different, fall back to unrolling.
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_IS_FPCLASS(SDNode *N) {
+ SDValue FpValue = N->getOperand(0);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ if (getTypeAction(FpValue.getValueType()) != TargetLowering::TypeWidenVector)
+ return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
+ SDValue Arg = GetWidenedVector(FpValue);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, {Arg, N->getOperand(1)},
+ N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_ExpOp(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ SDValue RHS = N->getOperand(1);
+ SDValue ExpOp = RHS.getValueType().isVector() ? GetWidenedVector(RHS) : RHS;
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ExpOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
+ // Unary op widening.
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ if (N->getNumOperands() == 1)
+ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, N->getFlags());
+
+ assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDValue Mask =
+ GetWidenedMask(N->getOperand(1), WidenVT.getVectorElementCount());
+ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT,
+ {InOp, Mask, N->getOperand(2)});
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+ cast<VTSDNode>(N->getOperand(1))->getVT()
+ .getVectorElementType(),
+ WidenVT.getVectorNumElements());
+ SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ WidenVT, WidenLHS, DAG.getValueType(ExtVT));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
+ SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetWidenedVector(WidenVec);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+ EVT VT = N->getValueType(0);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDLoc dl(N);
+
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypeLegal:
+ break;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error("Scalarization of scalable vectors is not supported.");
+ case TargetLowering::TypePromoteInteger: {
+ // If the incoming type is a vector that is being promoted, then
+ // we know that the elements are arranged differently and that we
+ // must perform the conversion using a stack slot.
+ if (InVT.isVector())
+ break;
+
+ // If the InOp is promoted to the same size, convert it. Otherwise,
+ // fall out of the switch and widen the promoted input.
+ SDValue NInOp = GetPromotedInteger(InOp);
+ EVT NInVT = NInOp.getValueType();
+ if (WidenVT.bitsEq(NInVT)) {
+ // For big endian targets we need to shift the input integer or the
+ // interesting bits will end up at the wrong place.
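+ // For example (illustrative): bitcasting i16 to v2i8, where i16 promotes to
+ // i32 and v2i8 widens to v4i8, needs a left shift by 16 so that the
+ // original bytes land in elements 0 and 1 of the widened big-endian vector.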
+ if (DAG.getDataLayout().isBigEndian()) {
+ unsigned ShiftAmt = NInVT.getSizeInBits() - InVT.getSizeInBits();
+ EVT ShiftAmtTy = TLI.getShiftAmountTy(NInVT, DAG.getDataLayout());
+ assert(ShiftAmt < WidenVT.getSizeInBits() && "Too large shift amount!");
+ NInOp = DAG.getNode(ISD::SHL, dl, NInVT, NInOp,
+ DAG.getConstant(ShiftAmt, dl, ShiftAmtTy));
+ }
+ return DAG.getNode(ISD::BITCAST, dl, WidenVT, NInOp);
+ }
+ InOp = NInOp;
+ InVT = NInVT;
+ break;
+ }
+ case TargetLowering::TypeSoftenFloat:
+ case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
+ case TargetLowering::TypeExpandInteger:
+ case TargetLowering::TypeExpandFloat:
+ case TargetLowering::TypeScalarizeVector:
+ case TargetLowering::TypeSplitVector:
+ break;
+ case TargetLowering::TypeWidenVector:
+ // If the InOp is widened to the same size, convert it. Otherwise, fall
+ // out of the switch and widen the widened input.
+ InOp = GetWidenedVector(InOp);
+ InVT = InOp.getValueType();
+ if (WidenVT.bitsEq(InVT))
+ // The input widens to the same size. Convert to the widened value.
+ return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
+ break;
+ }
+
+ unsigned WidenSize = WidenVT.getSizeInBits();
+ unsigned InSize = InVT.getSizeInBits();
+ unsigned InScalarSize = InVT.getScalarSizeInBits();
+ // x86mmx is not an acceptable vector element type, so don't try.
+ if (WidenSize % InScalarSize == 0 && InVT != MVT::x86mmx) {
+ // Determine the new input vector type. The new input vector type will use
+ // the same element type (if it's a vector) or use the input type itself as
+ // the element type. It is the same size as the type to widen to.
+ EVT NewInVT;
+ unsigned NewNumParts = WidenSize / InSize;
+ if (InVT.isVector()) {
+ EVT InEltVT = InVT.getVectorElementType();
+ NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
+ WidenSize / InEltVT.getSizeInBits());
+ } else {
+ // For big endian systems, using the promoted input scalar type
+ // to produce the scalar_to_vector would put the desired bits into
+ // the least significant byte(s) of the wider element zero. This
+ // will mean that the users of the result vector are using incorrect
+ // bits. Use the original input type instead. Although either input
+ // type can be used on little endian systems, for consistency we
+ // use the original type there as well.
+ EVT OrigInVT = N->getOperand(0).getValueType();
+ NewNumParts = WidenSize / OrigInVT.getSizeInBits();
+ NewInVT = EVT::getVectorVT(*DAG.getContext(), OrigInVT, NewNumParts);
+ }
+
+ if (TLI.isTypeLegal(NewInVT)) {
+ SDValue NewVec;
+ if (InVT.isVector()) {
+ // Because the result and the input are different vector types, widening
+ // the result could create a legal type but widening the input might
+ // make it an illegal type that might lead to repeatedly splitting the
+ // input and then widening it. To avoid this, we widen the input only if
+ // it results in a legal type.
+ if (WidenSize % InSize == 0) {
+ SmallVector<SDValue, 16> Ops(NewNumParts, DAG.getUNDEF(InVT));
+ Ops[0] = InOp;
+
+ NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
+ } else {
+ SmallVector<SDValue, 16> Ops;
+ DAG.ExtractVectorElements(InOp, Ops);
+ Ops.append(WidenSize / InScalarSize - Ops.size(),
+ DAG.getUNDEF(InVT.getVectorElementType()));
+
+ NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops);
+ }
+ } else {
+ NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp);
+ }
+ return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
+ }
+ }
+
+ return CreateStackStoreLoad(InOp, WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
+ SDLoc dl(N);
+ // Build a vector with undefined for the new nodes.
+ EVT VT = N->getValueType(0);
+
+ // Integer BUILD_VECTOR operands may be larger than the node's vector element
+ // type. The UNDEFs need to have the same type as the existing operands.
+ EVT EltVT = N->getOperand(0).getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
+ assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+ NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
+
+ return DAG.getBuildVector(WidenVT, dl, NewOps);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
+ EVT InVT = N->getOperand(0).getValueType();
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ unsigned NumOperands = N->getNumOperands();
+
+ bool InputWidened = false; // Indicates we need to widen the input.
+ if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
+ unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
+ unsigned NumInElts = InVT.getVectorMinNumElements();
+ if (WidenNumElts % NumInElts == 0) {
+ // Add undef vectors to widen to correct length.
+ unsigned NumConcat = WidenNumElts / NumInElts;
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ for (unsigned i=0; i < NumOperands; ++i)
+ Ops[i] = N->getOperand(i);
+ for (unsigned i = NumOperands; i != NumConcat; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Ops);
+ }
+ } else {
+ InputWidened = true;
+ if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
+ // The inputs and the result are widened to the same type.
+ unsigned i;
+ for (i=1; i < NumOperands; ++i)
+ if (!N->getOperand(i).isUndef())
+ break;
+
+ if (i == NumOperands)
+ // Everything but the first operand is an UNDEF so just return the
+ // widened first operand.
+ return GetWidenedVector(N->getOperand(0));
+
+ if (NumOperands == 2) {
+ assert(!WidenVT.isScalableVector() &&
+ "Cannot use vector shuffles to widen CONCAT_VECTOR result");
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumInElts = InVT.getVectorNumElements();
+
+ // Replace concat of two operands with a shuffle.
+ SmallVector<int, 16> MaskOps(WidenNumElts, -1);
+ for (unsigned i = 0; i < NumInElts; ++i) {
+ MaskOps[i] = i;
+ MaskOps[i + NumInElts] = i + WidenNumElts;
+ }
+ return DAG.getVectorShuffle(WidenVT, dl,
+ GetWidenedVector(N->getOperand(0)),
+ GetWidenedVector(N->getOperand(1)),
+ MaskOps);
+ }
+ }
+ }
+
+ assert(!WidenVT.isScalableVector() &&
+ "Cannot use build vectors to widen CONCAT_VECTOR result");
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumInElts = InVT.getVectorNumElements();
+
+ // Fall back to use extracts and build vector.
+ EVT EltVT = WidenVT.getVectorElementType();
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ unsigned Idx = 0;
+ for (unsigned i=0; i < NumOperands; ++i) {
+ SDValue InOp = N->getOperand(i);
+ if (InputWidened)
+ InOp = GetWidenedVector(InOp);
+ for (unsigned j = 0; j < NumInElts; ++j)
+ Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getVectorIdxConstant(j, dl));
+ }
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; Idx < WidenNumElts; ++Idx)
+ Ops[Idx] = UndefVal;
+ return DAG.getBuildVector(WidenVT, dl, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_SUBVECTOR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ SDLoc dl(N);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WidenVT, InOp1, InOp2, Idx);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue InOp = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ SDLoc dl(N);
+
+ auto InOpTypeAction = getTypeAction(InOp.getValueType());
+ if (InOpTypeAction == TargetLowering::TypeWidenVector)
+ InOp = GetWidenedVector(InOp);
+
+ EVT InVT = InOp.getValueType();
+
+ // Check if we can just return the input vector after widening.
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (IdxVal == 0 && InVT == WidenVT)
+ return InOp;
+
+ // Check if we can extract from the vector.
+ unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
+ unsigned InNumElts = InVT.getVectorMinNumElements();
+ unsigned VTNumElts = VT.getVectorMinNumElements();
+ assert(IdxVal % VTNumElts == 0 &&
+ "Expected Idx to be a multiple of subvector minimum vector length");
+ if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
+
+ if (VT.isScalableVector()) {
+ // Try to split the operation up into smaller extracts and concat the
+ // results together, e.g.
+ // nxv6i64 extract_subvector(nxv12i64, 6)
+ // <->
+ // nxv8i64 concat(
+ // nxv2i64 extract_subvector(nxv16i64, 6)
+ // nxv2i64 extract_subvector(nxv16i64, 8)
+ // nxv2i64 extract_subvector(nxv16i64, 10)
+ // undef)
+ unsigned GCD = std::gcd(VTNumElts, WidenNumElts);
+ assert((IdxVal % GCD) == 0 && "Expected Idx to be a multiple of the broken "
+ "down type's element count");
+ EVT PartVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ ElementCount::getScalable(GCD));
+ // Avoid recursion around e.g. nxv1i8.
+ if (getTypeAction(PartVT) != TargetLowering::TypeWidenVector) {
+ SmallVector<SDValue> Parts;
+ unsigned I = 0;
+ for (; I < VTNumElts / GCD; ++I)
+ Parts.push_back(
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, PartVT, InOp,
+ DAG.getVectorIdxConstant(IdxVal + I * GCD, dl)));
+ for (; I < WidenNumElts / GCD; ++I)
+ Parts.push_back(DAG.getUNDEF(PartVT));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
+ }
+
+ report_fatal_error("Don't know how to widen the result of "
+ "EXTRACT_SUBVECTOR for scalable vectors");
+ }
+
+ // We could try widening the input to the right length but for now, extract
+ // the original elements, fill the rest with undefs and build a vector.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ unsigned i;
+ for (i = 0; i < VTNumElts; ++i)
+ Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getVectorIdxConstant(IdxVal + i, dl));
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i < WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getBuildVector(WidenVT, dl, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_AssertZext(SDNode *N) {
+ SDValue InOp = ModifyToType(
+ N->getOperand(0),
+ TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), true);
+ return DAG.getNode(ISD::AssertZext, SDLoc(N), InOp.getValueType(), InOp,
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N),
+ InOp.getValueType(), InOp,
+ N->getOperand(1), N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+
+ // A vector must always be stored in memory as-is, i.e. without any padding
+ // between the elements, since various code depends on it, e.g. in the
+ // handling of a bitcast of a vector type to int, which may be done with a
+ // vector store followed by an integer load. A vector that does not have
+ // elements that are byte-sized must therefore be stored as an integer
+ // built out of the extracted vector elements.
+ if (!LD->getMemoryVT().isByteSized()) {
+ SDValue Value, NewChain;
+ std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
+ ReplaceValueWith(SDValue(LD, 0), Value);
+ ReplaceValueWith(SDValue(LD, 1), NewChain);
+ return SDValue();
+ }
+
+ // Generate a vector-predicated load if it is custom/legal on the target. To
+ // avoid possible recursion, only do this if the widened mask type is legal.
+ // FIXME: Not all targets may support EVL in VP_LOAD. These will have been
+ // removed from the IR by the ExpandVectorPredication pass but we're
+ // reintroducing them here.
+ EVT LdVT = LD->getMemoryVT();
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT);
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WideVT.getVectorElementCount());
+ if (ExtType == ISD::NON_EXTLOAD &&
+ TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) &&
+ TLI.isTypeLegal(WideMaskVT)) {
+ SDLoc DL(N);
+ SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
+ SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(),
+ LdVT.getVectorElementCount());
+ const auto *MMO = LD->getMemOperand();
+ SDValue NewLoad =
+ DAG.getLoadVP(WideVT, DL, LD->getChain(), LD->getBasePtr(), Mask, EVL,
+ MMO->getPointerInfo(), MMO->getAlign(), MMO->getFlags(),
+ MMO->getAAInfo());
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1));
+
+ return NewLoad;
+ }
+
+ SDValue Result;
+ SmallVector<SDValue, 16> LdChain; // Chain for the series of loads
+ if (ExtType != ISD::NON_EXTLOAD)
+ Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
+ else
+ Result = GenWidenVectorLoads(LdChain, LD);
+
+ if (Result) {
+ // If we generate a single load, we can use that for the chain. Otherwise,
+ // build a factor node to remember the multiple loads are independent and
+ // chain to that.
+ SDValue NewChain;
+ if (LdChain.size() == 1)
+ NewChain = LdChain[0];
+ else
+ NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), NewChain);
+
+ return Result;
+ }
+
+ report_fatal_error("Unable to widen vector load");
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD(VPLoadSDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Mask = N->getMask();
+ SDValue EVL = N->getVectorLength();
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ SDLoc dl(N);
+
+ // The mask should be widened as well
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen binary VP op");
+ Mask = GetWidenedVector(Mask);
+ assert(Mask.getValueType().getVectorElementCount() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType())
+ .getVectorElementCount() &&
+ "Unable to widen vector load");
+
+ SDValue Res =
+ DAG.getLoadVP(N->getAddressingMode(), ExtType, WidenVT, dl, N->getChain(),
+ N->getBasePtr(), N->getOffset(), Mask, EVL,
+ N->getMemoryVT(), N->getMemOperand(), N->isExpandingLoad());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) {
+ SDLoc DL(N);
+
+ // The mask should be widened as well
+ SDValue Mask = N->getMask();
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen VP strided load");
+ Mask = GetWidenedVector(Mask);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ assert(Mask.getValueType().getVectorElementCount() ==
+ WidenVT.getVectorElementCount() &&
+ "Data and mask vectors should have the same number of elements");
+
+ SDValue Res = DAG.getStridedLoadVP(
+ N->getAddressingMode(), N->getExtensionType(), WidenVT, DL, N->getChain(),
+ N->getBasePtr(), N->getOffset(), N->getStride(), Mask,
+ N->getVectorLength(), N->getMemoryVT(), N->getMemOperand(),
+ N->isExpandingLoad());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
+ SDValue Mask = N->getMask();
+ EVT MaskVT = Mask.getValueType();
+ SDValue PassThru = GetWidenedVector(N->getPassThru());
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ SDLoc dl(N);
+
+ // The mask should be widened as well
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+ MaskVT.getVectorElementType(),
+ WidenVT.getVectorNumElements());
+ Mask = ModifyToType(Mask, WideMaskVT, true);
+
+ SDValue Res = DAG.getMaskedLoad(
+ WidenVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ ExtType, N->isExpandingLoad());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
+
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Mask = N->getMask();
+ EVT MaskVT = Mask.getValueType();
+ SDValue PassThru = GetWidenedVector(N->getPassThru());
+ SDValue Scale = N->getScale();
+ unsigned NumElts = WideVT.getVectorNumElements();
+ SDLoc dl(N);
+
+ // The mask should be widened as well
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+ MaskVT.getVectorElementType(),
+ WideVT.getVectorNumElements());
+ Mask = ModifyToType(Mask, WideMaskVT, true);
+
+ // Widen the Index operand
+ SDValue Index = N->getIndex();
+ EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ Index.getValueType().getScalarType(),
+ NumElts);
+ Index = ModifyToType(Index, WideIndexVT);
+ SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
+ Scale };
+
+ // Widen the MemoryType
+ EVT WideMemVT = EVT::getVectorVT(*DAG.getContext(),
+ N->getMemoryVT().getScalarType(), NumElts);
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
+ WideMemVT, dl, Ops, N->getMemOperand(),
+ N->getIndexType(), N->getExtensionType());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VP_GATHER(VPGatherSDNode *N) {
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Mask = N->getMask();
+ SDValue Scale = N->getScale();
+ ElementCount WideEC = WideVT.getVectorElementCount();
+ SDLoc dl(N);
+
+ SDValue Index = GetWidenedVector(N->getIndex());
+ EVT WideMemVT = EVT::getVectorVT(*DAG.getContext(),
+ N->getMemoryVT().getScalarType(), WideEC);
+ Mask = GetWidenedMask(Mask, WideEC);
+
+ SDValue Ops[] = {N->getChain(), N->getBasePtr(), Index, Scale,
+ Mask, N->getVectorLength()};
+ SDValue Res = DAG.getGatherVP(DAG.getVTList(WideVT, MVT::Other), WideMemVT,
+ dl, Ops, N->getMemOperand(), N->getIndexType());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_ScalarOp(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, N->getOperand(0));
+}
+
+// Return true if this is a SETCC node or a strict version of it.
+static inline bool isSETCCOp(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
+ return true;
+ }
+ return false;
+}
+
+// Return true if this is a node that could have two SETCCs as operands.
+static inline bool isLogicalMaskOp(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return true;
+ }
+ return false;
+}
+
+// If N is a SETCC or a strict variant of it, return the type
+// of the compare operands.
+static inline EVT getSETCCOperandType(SDValue N) {
+ unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
+ return N->getOperand(OpNo).getValueType();
+}
+
+// This is used just for the assert in convertMask(). Check that this is
+// either a SETCC or a SETCC already handled by convertMask().
+#ifndef NDEBUG
+static inline bool isSETCCorConvertedSETCC(SDValue N) {
+ if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ N = N.getOperand(0);
+ else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
+ for (unsigned i = 1; i < N->getNumOperands(); ++i)
+ if (!N->getOperand(i)->isUndef())
+ return false;
+ N = N.getOperand(0);
+ }
+
+ if (N.getOpcode() == ISD::TRUNCATE)
+ N = N.getOperand(0);
+ else if (N.getOpcode() == ISD::SIGN_EXTEND)
+ N = N.getOperand(0);
+
+ if (isLogicalMaskOp(N.getOpcode()))
+ return isSETCCorConvertedSETCC(N.getOperand(0)) &&
+ isSETCCorConvertedSETCC(N.getOperand(1));
+
+ return (isSETCCOp(N.getOpcode()) ||
+ ISD::isBuildVectorOfConstantSDNodes(N.getNode()));
+}
+#endif
+
+// Return a mask of vector type MaskVT to replace InMask. Also adjust the new
+// mask to ToMaskVT if needed with vector extension or truncation.
+SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
+ EVT ToMaskVT) {
+ // Currently only a SETCC or an AND/OR/XOR with two SETCCs is handled.
+ // FIXME: This code seems to be too restrictive, we might consider
+ // generalizing it or dropping it.
+ assert(isSETCCorConvertedSETCC(InMask) && "Unexpected mask argument.");
+
+ // Make a new Mask node, with a legal result VT.
+ SDValue Mask;
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = InMask->getNumOperands(); i < e; ++i)
+ Ops.push_back(InMask->getOperand(i));
+ if (InMask->isStrictFPOpcode()) {
+ Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask),
+ { MaskVT, MVT::Other }, Ops);
+ ReplaceValueWith(InMask.getValue(1), Mask.getValue(1));
+ }
+ else
+ Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask), MaskVT, Ops);
+
+ // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
+ // extend or truncate is needed.
+ LLVMContext &Ctx = *DAG.getContext();
+ unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
+ unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
+ if (MaskScalarBits < ToMaskScalBits) {
+ EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+ MaskVT.getVectorNumElements());
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
+ } else if (MaskScalarBits > ToMaskScalBits) {
+ EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+ MaskVT.getVectorNumElements());
+ Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
+ }
+
+ assert(Mask->getValueType(0).getScalarSizeInBits() ==
+ ToMaskVT.getScalarSizeInBits() &&
+ "Mask should have the right element size by now.");
+
+ // Adjust Mask to the right number of elements.
+ unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
+ if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(Mask));
+ Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
+ ZeroIdx);
+ } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
+ unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
+ EVT SubVT = Mask->getValueType(0);
+ SmallVector<SDValue, 16> SubOps(NumSubVecs, DAG.getUNDEF(SubVT));
+ SubOps[0] = Mask;
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubOps);
+ }
+
+ assert((Mask->getValueType(0) == ToMaskVT) &&
+ "A mask of ToMaskVT should have been produced by now.");
+
+ return Mask;
+}
+
+// This method tries to handle some special cases for the vselect mask,
+// adjusting the mask vector type to match that of the VSELECT if needed.
+// Without it, many cases end up scalarizing the SETCC, with many
+// unnecessary instructions.
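+// For example (illustrative): on a target where vector compares produce
+// integer element masks, a VSELECT of v3f32 values being widened to v4f32
+// with a SETCC condition gets a v4i32 mask here instead of having the
+// SETCC scalarized.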
+SDValue DAGTypeLegalizer::WidenVSELECTMask(SDNode *N) {
+ LLVMContext &Ctx = *DAG.getContext();
+ SDValue Cond = N->getOperand(0);
+
+ if (N->getOpcode() != ISD::VSELECT)
+ return SDValue();
+
+ if (!isSETCCOp(Cond->getOpcode()) && !isLogicalMaskOp(Cond->getOpcode()))
+ return SDValue();
+
+ // If this is a split VSELECT that was previously handled, do nothing.
+ EVT CondVT = Cond->getValueType(0);
+ if (CondVT.getScalarSizeInBits() != 1)
+ return SDValue();
+
+ EVT VSelVT = N->getValueType(0);
+
+ // This method can't handle scalable vector types.
+ // FIXME: This support could be added in the future.
+ if (VSelVT.isScalableVector())
+ return SDValue();
+
+ // Only handle vector types whose size in bits is a power of 2.
+ if (!isPowerOf2_64(VSelVT.getSizeInBits()))
+ return SDValue();
+
+ // Don't touch if this will be scalarized.
+ EVT FinalVT = VSelVT;
+ while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
+ FinalVT = FinalVT.getHalfNumVectorElementsVT(Ctx);
+
+ if (FinalVT.getVectorNumElements() == 1)
+ return SDValue();
+
+ // If there is support for an i1 vector mask, don't touch.
+ if (isSETCCOp(Cond.getOpcode())) {
+ EVT SetCCOpVT = getSETCCOperandType(Cond);
+ while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
+ SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
+ EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
+ if (SetCCResVT.getScalarSizeInBits() == 1)
+ return SDValue();
+ } else if (CondVT.getScalarType() == MVT::i1) {
+ // If there is support for an i1 vector mask (or only scalar i1 conditions),
+ // don't touch.
+ while (TLI.getTypeAction(Ctx, CondVT) != TargetLowering::TypeLegal)
+ CondVT = TLI.getTypeToTransformTo(Ctx, CondVT);
+
+ if (CondVT.getScalarType() == MVT::i1)
+ return SDValue();
+ }
+
+ // Widen the vselect result type if needed.
+ if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector)
+ VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
+
+ // The mask of the VSELECT should have integer elements.
+ EVT ToMaskVT = VSelVT;
+ if (!ToMaskVT.getScalarType().isInteger())
+ ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();
+
+ SDValue Mask;
+ if (isSETCCOp(Cond->getOpcode())) {
+ EVT MaskVT = getSetCCResultType(getSETCCOperandType(Cond));
+ Mask = convertMask(Cond, MaskVT, ToMaskVT);
+ } else if (isLogicalMaskOp(Cond->getOpcode()) &&
+ isSETCCOp(Cond->getOperand(0).getOpcode()) &&
+ isSETCCOp(Cond->getOperand(1).getOpcode())) {
+ // Cond is (AND/OR/XOR (SETCC, SETCC))
+ SDValue SETCC0 = Cond->getOperand(0);
+ SDValue SETCC1 = Cond->getOperand(1);
+ EVT VT0 = getSetCCResultType(getSETCCOperandType(SETCC0));
+ EVT VT1 = getSetCCResultType(getSETCCOperandType(SETCC1));
+ unsigned ScalarBits0 = VT0.getScalarSizeInBits();
+ unsigned ScalarBits1 = VT1.getScalarSizeInBits();
+ unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
+ EVT MaskVT;
+ // If the two SETCCs have different VTs, either extend/truncate one of
+ // them to the other "towards" ToMaskVT, or truncate one and extend the
+ // other to ToMaskVT.
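+ // For example (illustrative): with VT0 = v4i16, VT1 = v4i32 and
+ // ToMaskVT = v4i64, MaskVT becomes the wider v4i32; the v4i16 SETCC is
+ // sign-extended to v4i32, the logical op is done at v4i32, and the result
+ // is then extended to v4i64.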
+ if (ScalarBits0 != ScalarBits1) {
+ EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
+ EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
+ if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
+ MaskVT = WideVT;
+ else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
+ MaskVT = NarrowVT;
+ else
+ MaskVT = ToMaskVT;
+ } else
+ // If the two SETCCs have the same VT, don't change it.
+ MaskVT = VT0;
+
+ // Make new SETCCs and logical nodes.
+ SETCC0 = convertMask(SETCC0, VT0, MaskVT);
+ SETCC1 = convertMask(SETCC1, VT1, MaskVT);
+ Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);
+
+ // Convert the logical op for VSELECT if needed.
+ Mask = convertMask(Cond, MaskVT, ToMaskVT);
+ } else
+ return SDValue();
+
+ return Mask;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Select(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ ElementCount WidenEC = WidenVT.getVectorElementCount();
+
+ SDValue Cond1 = N->getOperand(0);
+ EVT CondVT = Cond1.getValueType();
+ unsigned Opcode = N->getOpcode();
+ if (CondVT.isVector()) {
+ if (SDValue WideCond = WidenVSELECTMask(N)) {
+ SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(2));
+ assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
+ return DAG.getNode(Opcode, SDLoc(N), WidenVT, WideCond, InOp1, InOp2);
+ }
+
+ EVT CondEltVT = CondVT.getVectorElementType();
+ EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(), CondEltVT, WidenEC);
+ if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
+ Cond1 = GetWidenedVector(Cond1);
+
+ // If we have to split the condition, there is no point in widening the
+ // select. This would result in a cycle of widening the select ->
+ // widening the condition operand -> splitting the condition operand ->
+ // splitting the select -> widening the select. Instead split this select
+ // further and widen the resulting type.
+ if (getTypeAction(CondVT) == TargetLowering::TypeSplitVector) {
+ SDValue SplitSelect = SplitVecOp_VSELECT(N, 0);
+ SDValue Res = ModifyToType(SplitSelect, WidenVT);
+ return Res;
+ }
+
+ if (Cond1.getValueType() != CondWidenVT)
+ Cond1 = ModifyToType(Cond1, CondWidenVT);
+ }
+
+ SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(2));
+ assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
+ if (Opcode == ISD::VP_SELECT || Opcode == ISD::VP_MERGE)
+ return DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2,
+ N->getOperand(3));
+ return DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
+ SDValue InOp1 = GetWidenedVector(N->getOperand(2));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
+ InOp1.getValueType(), N->getOperand(0),
+ N->getOperand(1), InOp1, InOp2, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getUNDEF(WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+ // Adjust mask based on new input vector length.
+ SmallVector<int, 16> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = N->getMaskElt(i);
+ if (Idx < (int)NumElts)
+ NewMask.push_back(Idx);
+ else
+ NewMask.push_back(Idx - NumElts + WidenNumElts);
+ }
+ for (unsigned i = NumElts; i != WidenNumElts; ++i)
+ NewMask.push_back(-1);
+ return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, NewMask);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_REVERSE(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDLoc dl(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue OpValue = GetWidenedVector(N->getOperand(0));
+ assert(WidenVT == OpValue.getValueType() && "Unexpected widened vector type");
+
+ SDValue ReverseVal = DAG.getNode(ISD::VECTOR_REVERSE, dl, WidenVT, OpValue);
+ unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
+ unsigned VTNumElts = VT.getVectorMinNumElements();
+ unsigned IdxVal = WidenNumElts - VTNumElts;
+
+ if (VT.isScalableVector()) {
+ // Try to split the 'Widen ReverseVal' into smaller extracts and concat the
+ // results together, e.g. (nxv6i64 -> nxv8i64)
+ // nxv8i64 vector_reverse
+ // <->
+ // nxv8i64 concat(
+ // nxv2i64 extract_subvector(nxv8i64, 2)
+ // nxv2i64 extract_subvector(nxv8i64, 4)
+ // nxv2i64 extract_subvector(nxv8i64, 6)
+ // nxv2i64 undef)
+
+ unsigned GCD = std::gcd(VTNumElts, WidenNumElts);
+ EVT PartVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ ElementCount::getScalable(GCD));
+ assert((IdxVal % GCD) == 0 && "Expected Idx to be a multiple of the broken "
+ "down type's element count");
+ SmallVector<SDValue> Parts;
+ unsigned i = 0;
+ for (; i < VTNumElts / GCD; ++i)
+ Parts.push_back(
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, PartVT, ReverseVal,
+ DAG.getVectorIdxConstant(IdxVal + i * GCD, dl)));
+ for (; i < WidenNumElts / GCD; ++i)
+ Parts.push_back(DAG.getUNDEF(PartVT));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
+ }
+
+ // For fixed-length vectors, use a VECTOR_SHUFFLE to build the result from
+ // 'ReverseVal'.
+ SmallVector<int, 16> Mask;
+ for (unsigned i = 0; i != VTNumElts; ++i) {
+ Mask.push_back(IdxVal + i);
+ }
+ for (unsigned i = VTNumElts; i != WidenNumElts; ++i)
+ Mask.push_back(-1);
+
+ return DAG.getVectorShuffle(WidenVT, dl, ReverseVal, DAG.getUNDEF(WidenVT),
+ Mask);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operands must be vectors");
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ ElementCount WidenEC = WidenVT.getVectorElementCount();
+
+ SDValue InOp1 = N->getOperand(0);
+ EVT InVT = InOp1.getValueType();
+ assert(InVT.isVector() && "can not widen non-vector type");
+ EVT WidenInVT =
+ EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), WidenEC);
+
+ // The input and output types often differ here, and it could be that while
+ // we'd prefer to widen the result type, the input operands have been split.
+ // In this case, we need to split the result of this node as well.
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
+ SDValue SplitVSetCC = SplitVecOp_VSETCC(N);
+ SDValue Res = ModifyToType(SplitVSetCC, WidenVT);
+ return Res;
+ }
+
+ // If the inputs also widen, handle them directly. Otherwise widen by hand.
+ SDValue InOp2 = N->getOperand(1);
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ InOp1 = GetWidenedVector(InOp1);
+ InOp2 = GetWidenedVector(InOp2);
+ } else {
+ InOp1 = DAG.WidenVector(InOp1, SDLoc(N));
+ InOp2 = DAG.WidenVector(InOp2, SDLoc(N));
+ }
+
+ // Assume that the input and output will be widened appropriately. If not,
+ // we will have to unroll it at some point.
+ assert(InOp1.getValueType() == WidenInVT &&
+ InOp2.getValueType() == WidenInVT &&
+ "Input not widened to expected type!");
+ (void)WidenInVT;
+ if (N->getOpcode() == ISD::VP_SETCC) {
+ SDValue Mask =
+ GetWidenedMask(N->getOperand(3), WidenVT.getVectorElementCount());
+ return DAG.getNode(ISD::VP_SETCC, SDLoc(N), WidenVT, InOp1, InOp2,
+ N->getOperand(2), Mask, N->getOperand(4));
+ }
+ return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT, InOp1, InOp2,
+ N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_STRICT_FSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(1).getValueType().isVector() &&
+ "Operands must be vectors");
+ EVT VT = N->getValueType(0);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+
+ SDLoc dl(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ SDValue CC = N->getOperand(3);
+ EVT TmpEltVT = LHS.getValueType().getVectorElementType();
+
+ // Fully unroll and reassemble.
+ SmallVector<SDValue, 8> Scalars(WidenNumElts, DAG.getUNDEF(EltVT));
+ SmallVector<SDValue, 8> Chains(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getVectorIdxConstant(i, dl));
+ SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getVectorIdxConstant(i, dl));
+
+ Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other},
+ {Chain, LHSElem, RHSElem, CC});
+ Chains[i] = Scalars[i].getValue(1);
+ Scalars[i] = DAG.getSelect(dl, EltVT, Scalars[i],
+ DAG.getBoolConstant(true, dl, EltVT, VT),
+ DAG.getBoolConstant(false, dl, EltVT, VT));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ ReplaceValueWith(SDValue(N, 1), NewChain);
+
+ return DAG.getBuildVector(WidenVT, dl, Scalars);
+}
+
+//===----------------------------------------------------------------------===//
+// Widen Vector Operand
+//===----------------------------------------------------------------------===//
+bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Widen node operand " << OpNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom widen this node.
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "WidenVectorOperand op #" << OpNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to widen this operator's operand!");
+
+ case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break;
+ case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break;
+ case ISD::INSERT_SUBVECTOR: Res = WidenVecOp_INSERT_SUBVECTOR(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
+ case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::STORE: Res = WidenVecOp_STORE(N); break;
+ case ISD::VP_STORE: Res = WidenVecOp_VP_STORE(N, OpNo); break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+ Res = WidenVecOp_VP_STRIDED_STORE(N, OpNo);
+ break;
+ case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
+ case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break;
+ case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
+ case ISD::VP_SCATTER: Res = WidenVecOp_VP_SCATTER(N, OpNo); break;
+ case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: Res = WidenVecOp_STRICT_FSETCC(N); break;
+ case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break;
+ case ISD::FLDEXP:
+ case ISD::FCOPYSIGN: Res = WidenVecOp_UnrollVectorOp(N); break;
+ case ISD::IS_FPCLASS: Res = WidenVecOp_IS_FPCLASS(N); break;
+
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ Res = WidenVecOp_EXTEND(N);
+ break;
+
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_TO_SINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::TRUNCATE:
+ Res = WidenVecOp_Convert(N);
+ break;
+
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ Res = WidenVecOp_FP_TO_XINT_SAT(N);
+ break;
+
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAXIMUM:
+ case ISD::VECREDUCE_FMINIMUM:
+ Res = WidenVecOp_VECREDUCE(N);
+ break;
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ Res = WidenVecOp_VECREDUCE_SEQ(N);
+ break;
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_FMAX:
+ case ISD::VP_REDUCE_FMIN:
+ Res = WidenVecOp_VP_REDUCE(N);
+ break;
+ }
+
+ // If Res is null, the sub-method took care of registering the result.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ if (N->isStrictFPOpcode())
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 2 &&
+ "Invalid operand expansion");
+ else
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ SDValue InOp = N->getOperand(0);
+ assert(getTypeAction(InOp.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unexpected type action");
+ InOp = GetWidenedVector(InOp);
+ assert(VT.getVectorNumElements() <
+ InOp.getValueType().getVectorNumElements() &&
+ "Input wasn't widened!");
+
+ // We may need to further widen the operand until it has the same total
+ // vector size as the result.
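+ // For example (illustrative, assuming v16i8 is legal on the target): for a
+ // v4i32 result extended from v4i8 where the operand was widened to v8i8
+ // (64 bits), a 128-bit v16i8 is found, the v8i8 is inserted into it, and
+ // the low lanes are then extended with *_EXTEND_VECTOR_INREG below.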
+ EVT InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() != VT.getSizeInBits()) {
+ EVT InEltVT = InVT.getVectorElementType();
+ for (EVT FixedVT : MVT::vector_valuetypes()) {
+ EVT FixedEltVT = FixedVT.getVectorElementType();
+ if (TLI.isTypeLegal(FixedVT) &&
+ FixedVT.getSizeInBits() == VT.getSizeInBits() &&
+ FixedEltVT == InEltVT) {
+ assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() &&
+ "Not enough elements in the fixed type for the operand!");
+ assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
+ "We can't have the same type as we started with!");
+ if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
+ InOp = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FixedVT,
+ DAG.getUNDEF(FixedVT), InOp,
+ DAG.getVectorIdxConstant(0, DL));
+ else
+ InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
+ DAG.getVectorIdxConstant(0, DL));
+ break;
+ }
+ }
+ InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() != VT.getSizeInBits())
+ // We couldn't find a legal vector type that was a widening of the input
+ // and could be extended in-register to the result type, so we have to
+ // scalarize.
+ return WidenVecOp_Convert(N);
+ }
+
+ // Use special DAG nodes to represent the operation of extending the
+ // low lanes.
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Extend legalization on extend operation!");
+ case ISD::ANY_EXTEND:
+ return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, InOp);
+ case ISD::SIGN_EXTEND:
+ return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, InOp);
+ case ISD::ZERO_EXTEND:
+ return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, InOp);
+ }
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_UnrollVectorOp(SDNode *N) {
+ // The result (and first input) is legal, but the second input is illegal.
+ // We can't do much to fix that, so just unroll and let the extracts off of
+ // the second input be widened as needed later.
+ return DAG.UnrollVectorOp(N);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_IS_FPCLASS(SDNode *N) {
+ SDLoc DL(N);
+ EVT ResultVT = N->getValueType(0);
+ SDValue Test = N->getOperand(1);
+ SDValue WideArg = GetWidenedVector(N->getOperand(0));
+
+ // Process this node similarly to SETCC.
+ EVT WideResultVT = getSetCCResultType(WideArg.getValueType());
+ if (ResultVT.getScalarType() == MVT::i1)
+ WideResultVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WideResultVT.getVectorNumElements());
+
+ SDValue WideNode = DAG.getNode(ISD::IS_FPCLASS, DL, WideResultVT,
+ {WideArg, Test}, N->getFlags());
+
+ // Extract the needed results from the result vector.
+ EVT ResVT =
+ EVT::getVectorVT(*DAG.getContext(), WideResultVT.getVectorElementType(),
+ ResultVT.getVectorNumElements());
+ SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, WideNode,
+ DAG.getVectorIdxConstant(0, DL));
+
+ EVT OpVT = N->getOperand(0).getValueType();
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+ return DAG.getNode(ExtendCode, DL, ResultVT, CC);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
+ // The result type is legal, but the input type needs to be widened.
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDLoc dl(N);
+ SDValue InOp = N->getOperand(N->isStrictFPOpcode() ? 1 : 0);
+ assert(getTypeAction(InOp.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unexpected type action");
+ InOp = GetWidenedVector(InOp);
+ EVT InVT = InOp.getValueType();
+ unsigned Opcode = N->getOpcode();
+
+ // See if a widened result type would be legal, if so widen the node.
+ // FIXME: This isn't safe for StrictFP. Other optimization here is needed.
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ InVT.getVectorElementCount());
+ if (TLI.isTypeLegal(WideVT) && !N->isStrictFPOpcode()) {
+ SDValue Res;
+ if (N->isStrictFPOpcode()) {
+ if (Opcode == ISD::STRICT_FP_ROUND)
+ Res = DAG.getNode(Opcode, dl, { WideVT, MVT::Other },
+ { N->getOperand(0), InOp, N->getOperand(2) });
+ else
+ Res = DAG.getNode(Opcode, dl, { WideVT, MVT::Other },
+ { N->getOperand(0), InOp });
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ } else {
+ if (Opcode == ISD::FP_ROUND)
+ Res = DAG.getNode(Opcode, dl, WideVT, InOp, N->getOperand(1));
+ else
+ Res = DAG.getNode(Opcode, dl, WideVT, InOp);
+ }
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+
+ EVT InEltVT = InVT.getVectorElementType();
+
+ // Unroll the convert into some scalar code and create a nasty build vector.
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(NumElts);
+ if (N->isStrictFPOpcode()) {
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+ SmallVector<SDValue, 32> OpChains;
+ for (unsigned i=0; i < NumElts; ++i) {
+ NewOps[1] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getVectorIdxConstant(i, dl));
+ Ops[i] = DAG.getNode(Opcode, dl, { EltVT, MVT::Other }, NewOps);
+ OpChains.push_back(Ops[i].getValue(1));
+ }
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OpChains);
+ ReplaceValueWith(SDValue(N, 1), NewChain);
+ } else {
+ for (unsigned i = 0; i < NumElts; ++i)
+ Ops[i] = DAG.getNode(Opcode, dl, EltVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT,
+ InOp, DAG.getVectorIdxConstant(i, dl)));
+ }
+
+ return DAG.getBuildVector(VT, dl, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_FP_TO_XINT_SAT(SDNode *N) {
+ EVT DstVT = N->getValueType(0);
+ SDValue Src = GetWidenedVector(N->getOperand(0));
+ EVT SrcVT = Src.getValueType();
+ ElementCount WideNumElts = SrcVT.getVectorElementCount();
+ SDLoc dl(N);
+
+ // See if a widened result type would be legal, if so widen the node.
+ EVT WideDstVT = EVT::getVectorVT(*DAG.getContext(),
+ DstVT.getVectorElementType(), WideNumElts);
+ if (TLI.isTypeLegal(WideDstVT)) {
+ SDValue Res =
+ DAG.getNode(N->getOpcode(), dl, WideDstVT, Src, N->getOperand(1));
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
+ // Give up and unroll.
+ return DAG.UnrollVectorOp(N);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ EVT InWidenVT = InOp.getValueType();
+ SDLoc dl(N);
+
+ // Check if we can convert between two legal vector types and extract.
+ TypeSize InWidenSize = InWidenVT.getSizeInBits();
+ TypeSize Size = VT.getSizeInBits();
+ // x86mmx is not an acceptable vector element type, so don't try.
+ if (!VT.isVector() && VT != MVT::x86mmx &&
+ InWidenSize.hasKnownScalarFactor(Size)) {
+ unsigned NewNumElts = InWidenSize.getKnownScalarFactor(Size);
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
+ if (TLI.isTypeLegal(NewVT)) {
+ SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+ }
+
+ // Handle a case like bitcast v12i8 -> v3i32. Normally that would get widened
+ // to v16i8 -> v4i32, but for a target where v3i32 is legal but v12i8 is not,
+ // we end up here. Handling the case here with EXTRACT_SUBVECTOR avoids
+ // having to copy via memory.
+ if (VT.isVector()) {
+ EVT EltVT = VT.getVectorElementType();
+ unsigned EltSize = EltVT.getFixedSizeInBits();
+ if (InWidenSize.isKnownMultipleOf(EltSize)) {
+ ElementCount NewNumElts =
+ (InWidenVT.getVectorElementCount() * InWidenVT.getScalarSizeInBits())
+ .divideCoefficientBy(EltSize);
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NewNumElts);
+ if (TLI.isTypeLegal(NewVT)) {
+ SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, BitOp,
+ DAG.getVectorIdxConstant(0, dl));
+ }
+ }
+ }
+
+ return CreateStackStoreLoad(InOp, VT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ EVT InVT = N->getOperand(0).getValueType();
+ SDLoc dl(N);
+
+ // If the widen width for this operand is the same as the width of the concat
+ // and all but the first operand are undef, just use the widened operand.
+ unsigned NumOperands = N->getNumOperands();
+ if (VT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
+ unsigned i;
+ for (i = 1; i < NumOperands; ++i)
+ if (!N->getOperand(i).isUndef())
+ break;
+
+ if (i == NumOperands)
+ return GetWidenedVector(N->getOperand(0));
+ }
+
+ // Otherwise, fall back to a nasty build vector.
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(NumElts);
+
+ unsigned NumInElts = InVT.getVectorNumElements();
+
+ unsigned Idx = 0;
+ for (unsigned i=0; i < NumOperands; ++i) {
+ SDValue InOp = N->getOperand(i);
+ assert(getTypeAction(InOp.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unexpected type action");
+ InOp = GetWidenedVector(InOp);
+ for (unsigned j = 0; j < NumInElts; ++j)
+ Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getVectorIdxConstant(j, dl));
+ }
+ return DAG.getBuildVector(VT, dl, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue InVec = N->getOperand(0);
+
+ if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector)
+ SubVec = GetWidenedVector(SubVec);
+
+ if (SubVec.getValueType().knownBitsLE(VT) && InVec.isUndef() &&
+ N->getConstantOperandVal(2) == 0)
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, InVec, SubVec,
+ N->getOperand(2));
+
+ report_fatal_error("Don't know how to widen the operands for "
+ "INSERT_SUBVECTOR");
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N),
+ N->getValueType(0), InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ N->getValueType(0), InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
+ // We have to widen the value, but we want only to store the original
+ // vector type.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+
+ if (!ST->getMemoryVT().getScalarType().isByteSized())
+ return TLI.scalarizeVectorStore(ST, DAG);
+
+ if (ST->isTruncatingStore())
+ return TLI.scalarizeVectorStore(ST, DAG);
+
+ // Generate a vector-predicated store if it is custom/legal on the target.
+ // To avoid possible recursion, only do this if the widened mask type is
+ // legal.
+ // FIXME: Not all targets may support EVL in VP_STORE. These will have been
+ // removed from the IR by the ExpandVectorPredication pass but we're
+ // reintroducing them here.
+ SDValue StVal = ST->getValue();
+ EVT StVT = StVal.getValueType();
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT);
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ WideVT.getVectorElementCount());
+
+ if (TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) &&
+ TLI.isTypeLegal(WideMaskVT)) {
+ // Widen the value.
+ SDLoc DL(N);
+ StVal = GetWidenedVector(StVal);
+ SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
+ SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(),
+ StVT.getVectorElementCount());
+ return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(),
+ DAG.getUNDEF(ST->getBasePtr().getValueType()), Mask,
+ EVL, StVT, ST->getMemOperand(),
+ ST->getAddressingMode());
+ }
+
+ SmallVector<SDValue, 16> StChain;
+ if (GenWidenVectorStores(StChain, ST)) {
+ if (StChain.size() == 1)
+ return StChain[0];
+
+ return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
+ }
+
+ report_fatal_error("Unable to widen vector store");
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo) {
+ assert((OpNo == 1 || OpNo == 3) &&
+ "Can widen only data or mask operand of vp_store");
+ VPStoreSDNode *ST = cast<VPStoreSDNode>(N);
+ SDValue Mask = ST->getMask();
+ SDValue StVal = ST->getValue();
+ SDLoc dl(N);
+
+ if (OpNo == 1) {
+ // Widen the value.
+ StVal = GetWidenedVector(StVal);
+
+ // We only handle the case where the mask needs widening to an
+ // identically-sized type as the vector inputs.
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen VP store");
+ Mask = GetWidenedVector(Mask);
+ } else {
+ Mask = GetWidenedVector(Mask);
+
+ // We only handle the case where the stored value needs widening to an
+ // identically-sized type as the mask.
+ assert(getTypeAction(StVal.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen VP store");
+ StVal = GetWidenedVector(StVal);
+ }
+
+ assert(Mask.getValueType().getVectorElementCount() ==
+ StVal.getValueType().getVectorElementCount() &&
+ "Mask and data vectors should have the same number of elements");
+ return DAG.getStoreVP(ST->getChain(), dl, StVal, ST->getBasePtr(),
+ ST->getOffset(), Mask, ST->getVectorLength(),
+ ST->getMemoryVT(), ST->getMemOperand(),
+ ST->getAddressingMode(), ST->isTruncatingStore(),
+ ST->isCompressingStore());
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_VP_STRIDED_STORE(SDNode *N,
+ unsigned OpNo) {
+ assert((OpNo == 1 || OpNo == 4) &&
+ "Can widen only data or mask operand of vp_strided_store");
+ VPStridedStoreSDNode *SST = cast<VPStridedStoreSDNode>(N);
+ SDValue Mask = SST->getMask();
+ SDValue StVal = SST->getValue();
+ SDLoc DL(N);
+
+ if (OpNo == 1)
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen VP strided store");
+ else
+ assert(getTypeAction(StVal.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen VP strided store");
+
+ StVal = GetWidenedVector(StVal);
+ Mask = GetWidenedVector(Mask);
+
+ assert(StVal.getValueType().getVectorElementCount() ==
+ Mask.getValueType().getVectorElementCount() &&
+ "Data and mask vectors should have the same number of elements");
+
+ return DAG.getStridedStoreVP(
+ SST->getChain(), DL, StVal, SST->getBasePtr(), SST->getOffset(),
+ SST->getStride(), Mask, SST->getVectorLength(), SST->getMemoryVT(),
+ SST->getMemOperand(), SST->getAddressingMode(), SST->isTruncatingStore(),
+ SST->isCompressingStore());
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
+ assert((OpNo == 1 || OpNo == 3) &&
+ "Can widen only data or mask operand of mstore");
+ MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+ SDValue Mask = MST->getMask();
+ EVT MaskVT = Mask.getValueType();
+ SDValue StVal = MST->getValue();
+ SDLoc dl(N);
+
+ if (OpNo == 1) {
+ // Widen the value.
+ StVal = GetWidenedVector(StVal);
+
+ // The mask should be widened as well.
+ EVT WideVT = StVal.getValueType();
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+ MaskVT.getVectorElementType(),
+ WideVT.getVectorNumElements());
+ Mask = ModifyToType(Mask, WideMaskVT, true);
+ } else {
+ // Widen the mask.
+ EVT WideMaskVT = TLI.getTypeToTransformTo(*DAG.getContext(), MaskVT);
+ Mask = ModifyToType(Mask, WideMaskVT, true);
+
+ EVT ValueVT = StVal.getValueType();
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
+ ValueVT.getVectorElementType(),
+ WideMaskVT.getVectorNumElements());
+ StVal = ModifyToType(StVal, WideVT);
+ }
+
+ assert(Mask.getValueType().getVectorNumElements() ==
+ StVal.getValueType().getVectorNumElements() &&
+ "Mask and data vectors should have the same number of elements");
+ return DAG.getMaskedStore(MST->getChain(), dl, StVal, MST->getBasePtr(),
+ MST->getOffset(), Mask, MST->getMemoryVT(),
+ MST->getMemOperand(), MST->getAddressingMode(),
+ false, MST->isCompressingStore());
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_MGATHER(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 4 && "Can widen only the index of mgather");
+ auto *MG = cast<MaskedGatherSDNode>(N);
+ SDValue DataOp = MG->getPassThru();
+ SDValue Mask = MG->getMask();
+ SDValue Scale = MG->getScale();
+
+ // Just widen the index. It's allowed to have extra elements.
+ SDValue Index = GetWidenedVector(MG->getIndex());
+
+ SDLoc dl(N);
+ SDValue Ops[] = {MG->getChain(), DataOp, Mask, MG->getBasePtr(), Index,
+ Scale};
+ SDValue Res = DAG.getMaskedGather(MG->getVTList(), MG->getMemoryVT(), dl, Ops,
+ MG->getMemOperand(), MG->getIndexType(),
+ MG->getExtensionType());
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
+ MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+ SDValue DataOp = MSC->getValue();
+ SDValue Mask = MSC->getMask();
+ SDValue Index = MSC->getIndex();
+ SDValue Scale = MSC->getScale();
+ EVT WideMemVT = MSC->getMemoryVT();
+
+ if (OpNo == 1) {
+ DataOp = GetWidenedVector(DataOp);
+ unsigned NumElts = DataOp.getValueType().getVectorNumElements();
+
+ // Widen index.
+ EVT IndexVT = Index.getValueType();
+ EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ IndexVT.getVectorElementType(), NumElts);
+ Index = ModifyToType(Index, WideIndexVT);
+
+ // The mask should be widened as well.
+ EVT MaskVT = Mask.getValueType();
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
+ MaskVT.getVectorElementType(), NumElts);
+ Mask = ModifyToType(Mask, WideMaskVT, true);
+
+ // Widen the MemoryType
+ WideMemVT = EVT::getVectorVT(*DAG.getContext(),
+ MSC->getMemoryVT().getScalarType(), NumElts);
+ } else if (OpNo == 4) {
+ // Just widen the index. It's allowed to have extra elements.
+ Index = GetWidenedVector(Index);
+ } else
+ llvm_unreachable("Can't widen this operand of mscatter");
+
+ SDValue Ops[] = {MSC->getChain(), DataOp, Mask, MSC->getBasePtr(), Index,
+ Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), WideMemVT, SDLoc(N),
+ Ops, MSC->getMemOperand(), MSC->getIndexType(),
+ MSC->isTruncatingStore());
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_VP_SCATTER(SDNode *N, unsigned OpNo) {
+ VPScatterSDNode *VPSC = cast<VPScatterSDNode>(N);
+ SDValue DataOp = VPSC->getValue();
+ SDValue Mask = VPSC->getMask();
+ SDValue Index = VPSC->getIndex();
+ SDValue Scale = VPSC->getScale();
+ EVT WideMemVT = VPSC->getMemoryVT();
+
+ if (OpNo == 1) {
+ DataOp = GetWidenedVector(DataOp);
+ Index = GetWidenedVector(Index);
+ const auto WideEC = DataOp.getValueType().getVectorElementCount();
+ Mask = GetWidenedMask(Mask, WideEC);
+ WideMemVT = EVT::getVectorVT(*DAG.getContext(),
+ VPSC->getMemoryVT().getScalarType(), WideEC);
+ } else if (OpNo == 3) {
+ // Just widen the index. It's allowed to have extra elements.
+ Index = GetWidenedVector(Index);
+ } else
+ llvm_unreachable("Can't widen this operand of VP_SCATTER");
+
+ SDValue Ops[] = {
+ VPSC->getChain(), DataOp, VPSC->getBasePtr(), Index, Scale, Mask,
+ VPSC->getVectorLength()};
+ return DAG.getScatterVP(DAG.getVTList(MVT::Other), WideMemVT, SDLoc(N), Ops,
+ VPSC->getMemOperand(), VPSC->getIndexType());
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
+ SDValue InOp0 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ // WARNING: In this code we widen the compare instruction with garbage.
+ // This garbage may contain denormal floats which may be slow. Is this a real
+ // concern? Should we zero the unused lanes if this is a float compare?
+
+ // Get a new SETCC node to compare the newly widened operands.
+ // Only some of the compared elements are legal.
+ EVT SVT = getSetCCResultType(InOp0.getValueType());
+ // The result type is legal; if it is vXi1, keep vXi1 for the new SETCC.
+ if (VT.getScalarType() == MVT::i1)
+ SVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ SVT.getVectorElementCount());
+
+ SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N),
+ SVT, InOp0, InOp1, N->getOperand(2));
+
+ // Extract the needed results from the result vector.
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
+ SVT.getVectorElementType(),
+ VT.getVectorElementCount());
+ SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
+ DAG.getVectorIdxConstant(0, dl));
+
+ EVT OpVT = N->getOperand(0).getValueType();
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+ return DAG.getNode(ExtendCode, dl, VT, CC);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_STRICT_FSETCC(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue LHS = GetWidenedVector(N->getOperand(1));
+ SDValue RHS = GetWidenedVector(N->getOperand(2));
+ SDValue CC = N->getOperand(3);
+ SDLoc dl(N);
+
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ EVT TmpEltVT = LHS.getValueType().getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Unroll into a build vector.
+ SmallVector<SDValue, 8> Scalars(NumElts);
+ SmallVector<SDValue, 8> Chains(NumElts);
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getVectorIdxConstant(i, dl));
+ SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getVectorIdxConstant(i, dl));
+
+ Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other},
+ {Chain, LHSElem, RHSElem, CC});
+ Chains[i] = Scalars[i].getValue(1);
+ Scalars[i] = DAG.getSelect(dl, EltVT, Scalars[i],
+ DAG.getBoolConstant(true, dl, EltVT, VT),
+ DAG.getBoolConstant(false, dl, EltVT, VT));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ ReplaceValueWith(SDValue(N, 1), NewChain);
+
+ return DAG.getBuildVector(VT, dl, Scalars);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Op = GetWidenedVector(N->getOperand(0));
+ EVT OrigVT = N->getOperand(0).getValueType();
+ EVT WideVT = Op.getValueType();
+ EVT ElemVT = OrigVT.getVectorElementType();
+ SDNodeFlags Flags = N->getFlags();
+
+ unsigned Opc = N->getOpcode();
+ unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Opc);
+ SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, dl, ElemVT, Flags);
+ assert(NeutralElem && "Neutral element must exist");
+
+ // Pad the vector with the neutral element.
+ unsigned OrigElts = OrigVT.getVectorMinNumElements();
+ unsigned WideElts = WideVT.getVectorMinNumElements();
+
+ if (WideVT.isScalableVector()) {
+ unsigned GCD = std::gcd(OrigElts, WideElts);
+ EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+ ElementCount::getScalable(GCD));
+ SDValue SplatNeutral = DAG.getSplatVector(SplatVT, dl, NeutralElem);
+ for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD)
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral,
+ DAG.getVectorIdxConstant(Idx, dl));
+ return DAG.getNode(Opc, dl, N->getValueType(0), Op, Flags);
+ }
+
+ for (unsigned Idx = OrigElts; Idx < WideElts; Idx++)
+ Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem,
+ DAG.getVectorIdxConstant(Idx, dl));
+
+ return DAG.getNode(Opc, dl, N->getValueType(0), Op, Flags);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) {
+ SDLoc dl(N);
+ SDValue AccOp = N->getOperand(0);
+ SDValue VecOp = N->getOperand(1);
+ SDValue Op = GetWidenedVector(VecOp);
+
+ EVT OrigVT = VecOp.getValueType();
+ EVT WideVT = Op.getValueType();
+ EVT ElemVT = OrigVT.getVectorElementType();
+ SDNodeFlags Flags = N->getFlags();
+
+ unsigned Opc = N->getOpcode();
+ unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Opc);
+ SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, dl, ElemVT, Flags);
+
+ // Pad the vector with the neutral element.
+ unsigned OrigElts = OrigVT.getVectorMinNumElements();
+ unsigned WideElts = WideVT.getVectorMinNumElements();
+
+ if (WideVT.isScalableVector()) {
+ unsigned GCD = std::gcd(OrigElts, WideElts);
+ EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
+ ElementCount::getScalable(GCD));
+ SDValue SplatNeutral = DAG.getSplatVector(SplatVT, dl, NeutralElem);
+ for (unsigned Idx = OrigElts; Idx < WideElts; Idx = Idx + GCD)
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Op, SplatNeutral,
+ DAG.getVectorIdxConstant(Idx, dl));
+ return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags);
+ }
+
+ for (unsigned Idx = OrigElts; Idx < WideElts; Idx++)
+ Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem,
+ DAG.getVectorIdxConstant(Idx, dl));
+
+ return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_VP_REDUCE(SDNode *N) {
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDLoc dl(N);
+ SDValue Op = GetWidenedVector(N->getOperand(1));
+ SDValue Mask = GetWidenedMask(N->getOperand(2),
+ Op.getValueType().getVectorElementCount());
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0),
+ {N->getOperand(0), Op, Mask, N->getOperand(3)},
+ N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) {
+ // This only gets called in the case that the left and right inputs and
+ // result are of a legal odd vector type, and the condition is illegal i1 of
+ // the same odd width that needs widening.
+ EVT VT = N->getValueType(0);
+ assert(VT.isVector() && !VT.isPow2VectorType() && isTypeLegal(VT));
+
+ SDValue Cond = GetWidenedVector(N->getOperand(0));
+ SDValue LeftIn = DAG.WidenVector(N->getOperand(1), SDLoc(N));
+ SDValue RightIn = DAG.WidenVector(N->getOperand(2), SDLoc(N));
+ SDLoc DL(N);
+
+ SDValue Select = DAG.getNode(N->getOpcode(), DL, LeftIn.getValueType(), Cond,
+ LeftIn, RightIn);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Select,
+ DAG.getVectorIdxConstant(0, DL));
+}
+
+//===----------------------------------------------------------------------===//
+// Vector Widening Utilities
+//===----------------------------------------------------------------------===//
+
+// Utility function to find the type to chop up a widened vector for load/store.
+// TLI: Target lowering used to determine legal types.
+// Width: Width left to load/store.
+// WidenVT: The widened vector type to load to/store from.
+// Align: If 0, don't allow use of a wider type.
+// WidenEx: If Align is not 0, the additional amount we can load/store from.
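+// For example (illustrative): asked for the 96 bits of a v3i32 widened to
+// v4i32, this may return a 64-bit type (a legal i64 or v2i32, depending on
+// the target); the caller then asks again for the remaining 32 bits.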
+
+static std::optional<EVT> findMemType(SelectionDAG &DAG,
+ const TargetLowering &TLI, unsigned Width,
+ EVT WidenVT, unsigned Align = 0,
+ unsigned WidenEx = 0) {
+ EVT WidenEltVT = WidenVT.getVectorElementType();
+ const bool Scalable = WidenVT.isScalableVector();
+ unsigned WidenWidth = WidenVT.getSizeInBits().getKnownMinValue();
+ unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
+ unsigned AlignInBits = Align*8;
+
+ // If we have one element to load/store, return it.
+ EVT RetVT = WidenEltVT;
+ if (!Scalable && Width == WidenEltWidth)
+ return RetVT;
+
+ // Don't bother looking for an integer type if the vector is scalable; skip
+ // to vector types.
+ if (!Scalable) {
+ // See if there is a larger legal integer than the element type to load/store.
+ for (EVT MemVT : reverse(MVT::integer_valuetypes())) {
+ unsigned MemVTWidth = MemVT.getSizeInBits();
+ if (MemVT.getSizeInBits() <= WidenEltWidth)
+ break;
+ auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
+ if ((Action == TargetLowering::TypeLegal ||
+ Action == TargetLowering::TypePromoteInteger) &&
+ (WidenWidth % MemVTWidth) == 0 &&
+ isPowerOf2_32(WidenWidth / MemVTWidth) &&
+ (MemVTWidth <= Width ||
+ (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+ if (MemVTWidth == WidenWidth)
+ return MemVT;
+ RetVT = MemVT;
+ break;
+ }
+ }
+ }
+
+ // See if there is a larger vector type to load/store that has the same vector
+ // element type and whose width evenly divides that of WidenVT.
+ for (EVT MemVT : reverse(MVT::vector_valuetypes())) {
+ // Skip vector MVTs which don't match the scalable property of WidenVT.
+ if (Scalable != MemVT.isScalableVector())
+ continue;
+ unsigned MemVTWidth = MemVT.getSizeInBits().getKnownMinValue();
+ auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
+ if ((Action == TargetLowering::TypeLegal ||
+ Action == TargetLowering::TypePromoteInteger) &&
+ WidenEltVT == MemVT.getVectorElementType() &&
+ (WidenWidth % MemVTWidth) == 0 &&
+ isPowerOf2_32(WidenWidth / MemVTWidth) &&
+ (MemVTWidth <= Width ||
+ (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+ if (RetVT.getFixedSizeInBits() < MemVTWidth || MemVT == WidenVT)
+ return MemVT;
+ }
+ }
+
+ // Using element-wise loads and stores for widening operations is not
+ // supported for scalable vectors.
+ if (Scalable)
+ return std::nullopt;
+
+ return RetVT;
+}
+
+// Builds a vector from scalar loads.
+// VecTy: Resulting vector type.
+// LdOps: Scalar loads used to build the vector.
+// [Start,End): The range of loads to use.
+static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
+ SmallVectorImpl<SDValue> &LdOps,
+ unsigned Start, unsigned End) {
+ SDLoc dl(LdOps[Start]);
+ EVT LdTy = LdOps[Start].getValueType();
+ unsigned Width = VecTy.getSizeInBits();
+ unsigned NumElts = Width / LdTy.getSizeInBits();
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
+
+ unsigned Idx = 1;
+ SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
+
+ for (unsigned i = Start + 1; i != End; ++i) {
+ EVT NewLdTy = LdOps[i].getValueType();
+ if (NewLdTy != LdTy) {
+ NumElts = Width / NewLdTy.getSizeInBits();
+ NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
+ VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp);
+ // Readjust the insert position based on the new load type.
+ Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
+ LdTy = NewLdTy;
+ }
+ VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
+ DAG.getVectorIdxConstant(Idx++, dl));
+ }
+ return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
+}
+
+SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
+ LoadSDNode *LD) {
+ // The strategy assumes that we can efficiently load power-of-two widths.
+ // The routine chops the vector into the largest vector loads with the same
+ // element type, or scalar loads, and then recombines them into the widened
+ // vector type.
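+ // For example (illustrative): a v7i16 load widened to v8i16 might become a
+ // 64-bit load, a 32-bit load and a 16-bit load (112 bits total), which are
+ // then recombined into the widened vector below.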
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+ EVT LdVT = LD->getMemoryVT();
+ SDLoc dl(LD);
+ assert(LdVT.isVector() && WidenVT.isVector());
+ assert(LdVT.isScalableVector() == WidenVT.isScalableVector());
+ assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
+
+ // Load information
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ TypeSize LdWidth = LdVT.getSizeInBits();
+ TypeSize WidenWidth = WidenVT.getSizeInBits();
+ TypeSize WidthDiff = WidenWidth - LdWidth;
+ // Allow wider loads if they are sufficiently aligned to avoid memory faults
+ // and if the original load is simple.
+ unsigned LdAlign =
+ (!LD->isSimple() || LdVT.isScalableVector()) ? 0 : LD->getAlign().value();
+
+ // Find the vector type that can load from.
+ std::optional<EVT> FirstVT =
+ findMemType(DAG, TLI, LdWidth.getKnownMinValue(), WidenVT, LdAlign,
+ WidthDiff.getKnownMinValue());
+
+ if (!FirstVT)
+ return SDValue();
+
+ SmallVector<EVT, 8> MemVTs;
+ TypeSize FirstVTWidth = FirstVT->getSizeInBits();
+
+ // Unless we're able to load in one instruction we must work out how to load
+ // the remainder.
+ if (!TypeSize::isKnownLE(LdWidth, FirstVTWidth)) {
+ std::optional<EVT> NewVT = FirstVT;
+ TypeSize RemainingWidth = LdWidth;
+ TypeSize NewVTWidth = FirstVTWidth;
+ do {
+ RemainingWidth -= NewVTWidth;
+ if (TypeSize::isKnownLT(RemainingWidth, NewVTWidth)) {
+ // The current type we are using is too large. Find a better size.
+ NewVT = findMemType(DAG, TLI, RemainingWidth.getKnownMinValue(),
+ WidenVT, LdAlign, WidthDiff.getKnownMinValue());
+ if (!NewVT)
+ return SDValue();
+ NewVTWidth = NewVT->getSizeInBits();
+ }
+ MemVTs.push_back(*NewVT);
+ } while (TypeSize::isKnownGT(RemainingWidth, NewVTWidth));
+ }
+
+ SDValue LdOp = DAG.getLoad(*FirstVT, dl, Chain, BasePtr, LD->getPointerInfo(),
+ LD->getOriginalAlign(), MMOFlags, AAInfo);
+ LdChain.push_back(LdOp.getValue(1));
+
+ // Check if we can load the element with one instruction.
+ if (MemVTs.empty()) {
+ assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth));
+ if (!FirstVT->isVector()) {
+ unsigned NumElts =
+ WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue();
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), *FirstVT, NumElts);
+ SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+ return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
+ }
+ if (FirstVT == WidenVT)
+ return LdOp;
+
+ // TODO: We don't currently have any tests that exercise this code path.
+ assert(WidenWidth.getFixedValue() % FirstVTWidth.getFixedValue() == 0);
+ unsigned NumConcat =
+ WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue();
+ SmallVector<SDValue, 16> ConcatOps(NumConcat);
+ SDValue UndefVal = DAG.getUNDEF(*FirstVT);
+ ConcatOps[0] = LdOp;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ ConcatOps[i] = UndefVal;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
+ }
+
+ // Load vector by using multiple loads from largest vector to scalar.
+ SmallVector<SDValue, 16> LdOps;
+ LdOps.push_back(LdOp);
+
+ uint64_t ScaledOffset = 0;
+ MachinePointerInfo MPI = LD->getPointerInfo();
+
+ // First increment past the first load.
+ IncrementPointer(cast<LoadSDNode>(LdOp), *FirstVT, MPI, BasePtr,
+ &ScaledOffset);
+
+ for (EVT MemVT : MemVTs) {
+ Align NewAlign = ScaledOffset == 0
+ ? LD->getOriginalAlign()
+ : commonAlignment(LD->getAlign(), ScaledOffset);
+ SDValue L =
+ DAG.getLoad(MemVT, dl, Chain, BasePtr, MPI, NewAlign, MMOFlags, AAInfo);
+
+ LdOps.push_back(L);
+ LdChain.push_back(L.getValue(1));
+ IncrementPointer(cast<LoadSDNode>(L), MemVT, MPI, BasePtr, &ScaledOffset);
+ }
+
+ // Build the vector from the load operations.
+ unsigned End = LdOps.size();
+ if (!LdOps[0].getValueType().isVector())
+ // All the loads are scalar loads.
+ return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
+
+ // If the loads contain vectors, build the result using CONCAT_VECTORS.
+ // All of the vector loads have power-of-two widths, and the scalar loads
+ // can be combined to make a power-of-two vector.
+ SmallVector<SDValue, 16> ConcatOps(End);
+ int i = End - 1;
+ int Idx = End;
+ EVT LdTy = LdOps[i].getValueType();
+ // First, combine the scalar loads to a vector.
+ if (!LdTy.isVector()) {
+ for (--i; i >= 0; --i) {
+ LdTy = LdOps[i].getValueType();
+ if (LdTy.isVector())
+ break;
+ }
+ ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End);
+ }
+
+ ConcatOps[--Idx] = LdOps[i];
+ for (--i; i >= 0; --i) {
+ EVT NewLdTy = LdOps[i].getValueType();
+ if (NewLdTy != LdTy) {
+ // Create a larger vector.
+ TypeSize LdTySize = LdTy.getSizeInBits();
+ TypeSize NewLdTySize = NewLdTy.getSizeInBits();
+ assert(NewLdTySize.isScalable() == LdTySize.isScalable() &&
+ NewLdTySize.isKnownMultipleOf(LdTySize.getKnownMinValue()));
+ unsigned NumOps =
+ NewLdTySize.getKnownMinValue() / LdTySize.getKnownMinValue();
+ SmallVector<SDValue, 16> WidenOps(NumOps);
+ unsigned j = 0;
+ for (; j != End-Idx; ++j)
+ WidenOps[j] = ConcatOps[Idx+j];
+ for (; j != NumOps; ++j)
+ WidenOps[j] = DAG.getUNDEF(LdTy);
+
+ ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
+ WidenOps);
+ Idx = End - 1;
+ LdTy = NewLdTy;
+ }
+ ConcatOps[--Idx] = LdOps[i];
+ }
+
+ if (WidenWidth == LdTy.getSizeInBits() * (End - Idx))
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
+ ArrayRef(&ConcatOps[Idx], End - Idx));
+
+ // We need to fill the rest with undefs to build the vector.
+ unsigned NumOps =
+ WidenWidth.getKnownMinValue() / LdTy.getSizeInBits().getKnownMinValue();
+ SmallVector<SDValue, 16> WidenOps(NumOps);
+ SDValue UndefVal = DAG.getUNDEF(LdTy);
+ {
+ unsigned i = 0;
+ for (; i != End-Idx; ++i)
+ WidenOps[i] = ConcatOps[Idx+i];
+ for (; i != NumOps; ++i)
+ WidenOps[i] = UndefVal;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, WidenOps);
+}
+
+SDValue
+DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
+ LoadSDNode *LD,
+ ISD::LoadExtType ExtType) {
+ // For extension loads, it may not be more efficient to chop up the vector
+ // and then extend it. Instead, we unroll the load and build a new vector.
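+ // For example (illustrative): a sextload of v3i8 to v3i32, widened to
+ // v4i32, becomes three scalar extending loads of i8 plus an undef element.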
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+ EVT LdVT = LD->getMemoryVT();
+ SDLoc dl(LD);
+ assert(LdVT.isVector() && WidenVT.isVector());
+ assert(LdVT.isScalableVector() == WidenVT.isScalableVector());
+
+ // Load information
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ if (LdVT.isScalableVector())
+ report_fatal_error("Generating widen scalable extending vector loads is "
+ "not yet supported");
+
+ EVT EltVT = WidenVT.getVectorElementType();
+ EVT LdEltVT = LdVT.getVectorElementType();
+ unsigned NumElts = LdVT.getVectorNumElements();
+
+ // Load each element and widen.
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ unsigned Increment = LdEltVT.getSizeInBits() / 8;
+ Ops[0] =
+ DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(),
+ LdEltVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
+ LdChain.push_back(Ops[0].getValue(1));
+ unsigned i = 0, Offset = Increment;
+ for (i=1; i < NumElts; ++i, Offset += Increment) {
+ SDValue NewBasePtr =
+ DAG.getObjectPtrOffset(dl, BasePtr, TypeSize::Fixed(Offset));
+ Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
+ LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
+ LD->getOriginalAlign(), MMOFlags, AAInfo);
+ LdChain.push_back(Ops[i].getValue(1));
+ }
+
+ // Fill the rest with undefs.
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i != WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ return DAG.getBuildVector(WidenVT, dl, Ops);
+}
+
+bool DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
+ StoreSDNode *ST) {
+ // The strategy assumes that we can efficiently store power-of-two widths.
+ // The routine chops the vector into the largest vector stores with the same
+ // element type or scalar stores.
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+ SDValue ValOp = GetWidenedVector(ST->getValue());
+ SDLoc dl(ST);
+
+ EVT StVT = ST->getMemoryVT();
+ TypeSize StWidth = StVT.getSizeInBits();
+ EVT ValVT = ValOp.getValueType();
+ TypeSize ValWidth = ValVT.getSizeInBits();
+ EVT ValEltVT = ValVT.getVectorElementType();
+ unsigned ValEltWidth = ValEltVT.getFixedSizeInBits();
+ assert(StVT.getVectorElementType() == ValEltVT);
+ assert(StVT.isScalableVector() == ValVT.isScalableVector() &&
+ "Mismatch between store and value types");
+
+ int Idx = 0; // current index to store
+
+ MachinePointerInfo MPI = ST->getPointerInfo();
+ uint64_t ScaledOffset = 0;
+
+ // A breakdown of how to widen this vector store. Each element of the vector
+ // is a memory VT together with the number of times it is to be stored,
+ // e.g., v5i32 -> {{v2i32,2},{i32,1}}
+ SmallVector<std::pair<EVT, unsigned>, 4> MemVTs;
+
+ while (StWidth.isNonZero()) {
+ // Find the largest vector type we can store with.
+ std::optional<EVT> NewVT =
+ findMemType(DAG, TLI, StWidth.getKnownMinValue(), ValVT);
+ if (!NewVT)
+ return false;
+ MemVTs.push_back({*NewVT, 0});
+ TypeSize NewVTWidth = NewVT->getSizeInBits();
+
+ do {
+ StWidth -= NewVTWidth;
+ MemVTs.back().second++;
+ } while (StWidth.isNonZero() && TypeSize::isKnownGE(StWidth, NewVTWidth));
+ }
+
+ for (const auto &Pair : MemVTs) {
+ EVT NewVT = Pair.first;
+ unsigned Count = Pair.second;
+ TypeSize NewVTWidth = NewVT.getSizeInBits();
+
+ if (NewVT.isVector()) {
+ unsigned NumVTElts = NewVT.getVectorMinNumElements();
+ do {
+ Align NewAlign = ScaledOffset == 0
+ ? ST->getOriginalAlign()
+ : commonAlignment(ST->getAlign(), ScaledOffset);
+ SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
+ DAG.getVectorIdxConstant(Idx, dl));
+ SDValue PartStore = DAG.getStore(Chain, dl, EOp, BasePtr, MPI, NewAlign,
+ MMOFlags, AAInfo);
+ StChain.push_back(PartStore);
+
+ Idx += NumVTElts;
+ IncrementPointer(cast<StoreSDNode>(PartStore), NewVT, MPI, BasePtr,
+ &ScaledOffset);
+ } while (--Count);
+ } else {
+ // Bitcast the vector to a vector of the scalar type we can store.
+ unsigned NumElts = ValWidth.getFixedValue() / NewVTWidth.getFixedValue();
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+ SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp);
+ // Readjust index position based on new vector type.
+ Idx = Idx * ValEltWidth / NewVTWidth.getFixedValue();
+ do {
+ SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
+ DAG.getVectorIdxConstant(Idx++, dl));
+ SDValue PartStore =
+ DAG.getStore(Chain, dl, EOp, BasePtr, MPI, ST->getOriginalAlign(),
+ MMOFlags, AAInfo);
+ StChain.push_back(PartStore);
+
+ IncrementPointer(cast<StoreSDNode>(PartStore), NewVT, MPI, BasePtr);
+ } while (--Count);
+ // Restore the index so it is relative to the original widened element type.
+ Idx = Idx * NewVTWidth.getFixedValue() / ValEltWidth;
+ }
+ }
+
+ return true;
+}
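+
+// Editorial sketch (not part of the original file): a stand-alone C++ toy,
+// using only the standard library, that mimics the MemVTs breakdown above.
+// The fixed table of "legal" widths stands in for findMemType; for a v5i32
+// store (160 bits) it prints the {{v2i32,2},{i32,1}} split mentioned in the
+// comment inside GenWidenVectorStores.
+//
+//   #include <cstdio>
+//   #include <utility>
+//   #include <vector>
+//
+//   int main() {
+//     unsigned StWidth = 5 * 32;                         // v5i32 in bits
+//     std::vector<std::pair<unsigned, unsigned>> MemVTs; // {width, count}
+//     while (StWidth != 0) {
+//       // Largest "legal" store width that still fits (stand-in for
+//       // findMemType).
+//       unsigned NewVTWidth = StWidth >= 64 ? 64 : 32;
+//       MemVTs.push_back({NewVTWidth, 0});
+//       do {
+//         StWidth -= NewVTWidth;
+//         MemVTs.back().second++;
+//       } while (StWidth != 0 && StWidth >= NewVTWidth);
+//     }
+//     for (const auto &P : MemVTs)         // prints "64 x 2" then "32 x 1"
+//       std::printf("%u x %u\n", P.first, P.second);
+//     return 0;
+//   }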
+
+/// Modifies a vector input (widens or narrows) to a vector of NVT. The
+/// input vector must have the same element type as NVT.
+/// FillWithZeroes specifies that the vector should be widened with zeroes.
+SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
+ bool FillWithZeroes) {
+ // Note that InOp might have been widened so it might already have
+ // the right width or it might need to be narrowed.
+ EVT InVT = InOp.getValueType();
+ assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+ "input and widen element type must match");
+ assert(InVT.isScalableVector() == NVT.isScalableVector() &&
+ "cannot modify scalable vectors in this way");
+ SDLoc dl(InOp);
+
+ // Check if InOp already has the right width.
+ if (InVT == NVT)
+ return InOp;
+
+ ElementCount InEC = InVT.getVectorElementCount();
+ ElementCount WidenEC = NVT.getVectorElementCount();
+ if (WidenEC.hasKnownScalarFactor(InEC)) {
+ unsigned NumConcat = WidenEC.getKnownScalarFactor(InEC);
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) :
+ DAG.getUNDEF(InVT);
+ Ops[0] = InOp;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = FillVal;
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
+ }
+
+ if (InEC.hasKnownScalarFactor(WidenEC))
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+ DAG.getVectorIdxConstant(0, dl));
+
+ assert(!InVT.isScalableVector() && !NVT.isScalableVector() &&
+ "Scalable vectors should have been handled already.");
+
+ unsigned InNumElts = InEC.getFixedValue();
+ unsigned WidenNumElts = WidenEC.getFixedValue();
+
+ // Fall back to extract and build (+ mask, if padding with zeros).
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ EVT EltVT = NVT.getVectorElementType();
+ unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
+ unsigned Idx;
+ for (Idx = 0; Idx < MinNumElts; ++Idx)
+ Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getVectorIdxConstant(Idx, dl));
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; Idx < WidenNumElts; ++Idx)
+ Ops[Idx] = UndefVal;
+
+ SDValue Widened = DAG.getBuildVector(NVT, dl, Ops);
+ if (!FillWithZeroes)
+ return Widened;
+
+ assert(NVT.isInteger() &&
+ "We expect to never want to FillWithZeroes for non-integral types.");
+
+ SmallVector<SDValue, 16> MaskOps;
+ MaskOps.append(MinNumElts, DAG.getAllOnesConstant(dl, EltVT));
+ MaskOps.append(WidenNumElts - MinNumElts, DAG.getConstant(0, dl, EltVT));
+
+ return DAG.getNode(ISD::AND, dl, NVT, Widened,
+ DAG.getBuildVector(NVT, dl, MaskOps));
+}
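+
+// Illustrative note (editorial): ModifyToType(v3i32 input, NVT = v8i32,
+// FillWithZeroes = true) takes the extract-and-build path above, since 8 is
+// not a multiple of 3: the three input elements are extracted, five undef
+// elements are appended, and the result is ANDed with the build_vector mask
+// <-1,-1,-1,0,0,0,0,0> so the padding lanes read as zero.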
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
new file mode 100644
index 000000000000..e0e8d503ca92
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -0,0 +1,624 @@
+//===- ResourcePriorityQueue.cpp - A DFA-oriented priority queue -*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ResourcePriorityQueue class, which is a
+// SchedulingPriorityQueue that prioritizes instructions using DFA state to
+// reduce the length of the critical path through the basic block
+// on VLIW platforms.
+// The scheduler is basically a top-down adaptable list scheduler with DFA
+// resource tracking added to the cost function.
+// DFA is queried as a state machine to model "packets/bundles" during
+// schedule. Currently packets/bundles are discarded at the end of
+// scheduling, affecting only order of instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scheduler"
+
+static cl::opt<bool>
+ DisableDFASched("disable-dfa-sched", cl::Hidden,
+ cl::desc("Disable use of DFA during scheduling"));
+
+static cl::opt<int> RegPressureThreshold(
+ "dfa-sched-reg-pressure-threshold", cl::Hidden, cl::init(5),
+ cl::desc("Track reg pressure and switch priority to in-depth"));
+
+ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)
+ : Picker(this), InstrItins(IS->MF->getSubtarget().getInstrItineraryData()) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ TRI = STI.getRegisterInfo();
+ TLI = IS->TLI;
+ TII = STI.getInstrInfo();
+ ResourcesModel.reset(TII->CreateTargetScheduleState(STI));
+ // This hard requirement could be relaxed, but for now
+ // do not let it proceed.
+ assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+ unsigned NumRC = TRI->getNumRegClasses();
+ RegLimit.resize(NumRC);
+ RegPressure.resize(NumRC);
+ std::fill(RegLimit.begin(), RegLimit.end(), 0);
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+ for (const TargetRegisterClass *RC : TRI->regclasses())
+ RegLimit[RC->getID()] = TRI->getRegPressureLimit(RC, *IS->MF);
+
+ ParallelLiveRanges = 0;
+ HorizontalVerticalBalance = 0;
+}
+
+unsigned
+ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) {
+ unsigned NumberDeps = 0;
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+
+ SUnit *PredSU = Pred.getSUnit();
+ const SDNode *ScegN = PredSU->getNode();
+
+ if (!ScegN)
+ continue;
+
+ // If value is passed to CopyToReg, it is probably
+ // live outside BB.
+ switch (ScegN->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor: break;
+ case ISD::CopyFromReg: NumberDeps++; break;
+ case ISD::CopyToReg: break;
+ case ISD::INLINEASM: break;
+ case ISD::INLINEASM_BR: break;
+ }
+ if (!ScegN->isMachineOpcode())
+ continue;
+
+ for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) {
+ MVT VT = ScegN->getSimpleValueType(i);
+ if (TLI->isTypeLegal(VT)
+ && (TLI->getRegClassFor(VT)->getID() == RCId)) {
+ NumberDeps++;
+ break;
+ }
+ }
+ }
+ return NumberDeps;
+}
+
+unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU,
+ unsigned RCId) {
+ unsigned NumberDeps = 0;
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
+ continue;
+
+ SUnit *SuccSU = Succ.getSUnit();
+ const SDNode *ScegN = SuccSU->getNode();
+ if (!ScegN)
+ continue;
+
+ // If value is passed to CopyToReg, it is probably
+ // live outside BB.
+ switch (ScegN->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor: break;
+ case ISD::CopyFromReg: break;
+ case ISD::CopyToReg: NumberDeps++; break;
+ case ISD::INLINEASM: break;
+ case ISD::INLINEASM_BR: break;
+ }
+ if (!ScegN->isMachineOpcode())
+ continue;
+
+ for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) {
+ const SDValue &Op = ScegN->getOperand(i);
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+ if (TLI->isTypeLegal(VT)
+ && (TLI->getRegClassFor(VT)->getID() == RCId)) {
+ NumberDeps++;
+ break;
+ }
+ }
+ }
+ return NumberDeps;
+}
+
+static unsigned numberCtrlDepsInSU(SUnit *SU) {
+ unsigned NumberDeps = 0;
+ for (const SDep &Succ : SU->Succs)
+ if (Succ.isCtrl())
+ NumberDeps++;
+
+ return NumberDeps;
+}
+
+static unsigned numberCtrlPredInSU(SUnit *SU) {
+ unsigned NumberDeps = 0;
+ for (SDep &Pred : SU->Preds)
+ if (Pred.isCtrl())
+ NumberDeps++;
+
+ return NumberDeps;
+}
+
+///
+/// Initialize nodes.
+///
+void ResourcePriorityQueue::initNodes(std::vector<SUnit> &sunits) {
+ SUnits = &sunits;
+ NumNodesSolelyBlocking.resize(SUnits->size(), 0);
+
+ for (SUnit &SU : *SUnits) {
+ initNumRegDefsLeft(&SU);
+ SU.NodeQueueId = 0;
+ }
+}
+
+/// This heuristic is used if DFA scheduling is not desired
+/// for some VLIW platform.
+bool resource_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+ // The isScheduleHigh flag allows nodes with wraparound dependencies that
+ // cannot easily be modeled as edges with latencies to be scheduled as
+ // soon as possible in a top-down schedule.
+ if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+ return false;
+
+ if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+ return true;
+
+ unsigned LHSNum = LHS->NodeNum;
+ unsigned RHSNum = RHS->NodeNum;
+
+ // The most important heuristic is scheduling the critical path.
+ unsigned LHSLatency = PQ->getLatency(LHSNum);
+ unsigned RHSLatency = PQ->getLatency(RHSNum);
+ if (LHSLatency < RHSLatency) return true;
+ if (LHSLatency > RHSLatency) return false;
+
+ // After that, if two nodes have identical latencies, look to see if one will
+ // unblock more other nodes than the other.
+ unsigned LHSBlocked = PQ->getNumSolelyBlockNodes(LHSNum);
+ unsigned RHSBlocked = PQ->getNumSolelyBlockNodes(RHSNum);
+ if (LHSBlocked < RHSBlocked) return true;
+ if (LHSBlocked > RHSBlocked) return false;
+
+ // Finally, just to provide a stable ordering, use the node number as a
+ // deciding factor.
+ return LHSNum < RHSNum;
+}
+
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+SUnit *ResourcePriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
+ SUnit *OnlyAvailablePred = nullptr;
+ for (const SDep &Pred : SU->Preds) {
+ SUnit &PredSU = *Pred.getSUnit();
+ if (!PredSU.isScheduled) {
+ // We found an available, but not scheduled, predecessor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailablePred && OnlyAvailablePred != &PredSU)
+ return nullptr;
+ OnlyAvailablePred = &PredSU;
+ }
+ }
+ return OnlyAvailablePred;
+}
+
+void ResourcePriorityQueue::push(SUnit *SU) {
+ // Look at all of the successors of this node. Count the number of nodes that
+ // this node is the sole unscheduled predecessor for.
+ unsigned NumNodesBlocking = 0;
+ for (const SDep &Succ : SU->Succs)
+ if (getSingleUnscheduledPred(Succ.getSUnit()) == SU)
+ ++NumNodesBlocking;
+
+ NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
+ Queue.push_back(SU);
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) {
+ if (!SU || !SU->getNode())
+ return false;
+
+ // If this is a compound instruction,
+ // it is likely to be a call. Do not delay it.
+ if (SU->getNode()->getGluedNode())
+ return true;
+
+ // First see if the pipeline could receive this instruction
+ // in the current cycle.
+ if (SU->getNode()->isMachineOpcode())
+ switch (SU->getNode()->getMachineOpcode()) {
+ default:
+ if (!ResourcesModel->canReserveResources(&TII->get(
+ SU->getNode()->getMachineOpcode())))
+ return false;
+ break;
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ break;
+ }
+
+ // Now make sure there are no other dependencies
+ // on instructions already in the packet.
+ for (const SUnit *S : Packet)
+ for (const SDep &Succ : S->Succs) {
+ // Since we do not add pseudos to packets, might as well
+ // ignore order deps.
+ if (Succ.isCtrl())
+ continue;
+
+ if (Succ.getSUnit() == SU)
+ return false;
+ }
+
+ return true;
+}
+
+/// Keep track of available resources.
+void ResourcePriorityQueue::reserveResources(SUnit *SU) {
+ // If this SU does not fit in the packet
+ // start a new one.
+ if (!isResourceAvailable(SU) || SU->getNode()->getGluedNode()) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ }
+
+ if (SU->getNode() && SU->getNode()->isMachineOpcode()) {
+ switch (SU->getNode()->getMachineOpcode()) {
+ default:
+ ResourcesModel->reserveResources(&TII->get(
+ SU->getNode()->getMachineOpcode()));
+ break;
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ break;
+ }
+ Packet.push_back(SU);
+ }
+ // Forcefully end packet for PseudoOps.
+ else {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ }
+
+ // If packet is now full, reset the state so in the next cycle
+ // we start fresh.
+ if (Packet.size() >= InstrItins->SchedModel.IssueWidth) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ }
+}
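+
+// Editorial sketch (not part of the original file): a toy stand-in for the
+// packet-formation pattern used by isResourceAvailable/reserveResources
+// above. ToyDFA is hypothetical and only models a fixed two-slot issue
+// width, not LLVM's DFAPacketizer; the point is the try-reserve /
+// clear-on-failure shape of the loop.
+//
+//   #include <cstdio>
+//
+//   struct ToyDFA {
+//     unsigned Used = 0;
+//     bool canReserveResources() const { return Used < 2; }
+//     void reserveResources() { ++Used; }
+//     void clearResources() { Used = 0; }
+//   };
+//
+//   int main() {
+//     ToyDFA Model;
+//     for (unsigned Instr = 0; Instr != 5; ++Instr) {
+//       if (!Model.canReserveResources()) { // packet is full: start fresh
+//         std::puts("---- end of packet ----");
+//         Model.clearResources();
+//       }
+//       Model.reserveResources();
+//       std::printf("issue instruction %u\n", Instr);
+//     }
+//     return 0;
+//   }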
+
+int ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) {
+ int RegBalance = 0;
+
+ if (!SU || !SU->getNode() || !SU->getNode()->isMachineOpcode())
+ return RegBalance;
+
+ // Gen estimate.
+ for (unsigned i = 0, e = SU->getNode()->getNumValues(); i != e; ++i) {
+ MVT VT = SU->getNode()->getSimpleValueType(i);
+ if (TLI->isTypeLegal(VT)
+ && TLI->getRegClassFor(VT)
+ && TLI->getRegClassFor(VT)->getID() == RCId)
+ RegBalance += numberRCValSuccInSU(SU, RCId);
+ }
+ // Kill estimate.
+ for (unsigned i = 0, e = SU->getNode()->getNumOperands(); i != e; ++i) {
+ const SDValue &Op = SU->getNode()->getOperand(i);
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+ if (isa<ConstantSDNode>(Op.getNode()))
+ continue;
+
+ if (TLI->isTypeLegal(VT) && TLI->getRegClassFor(VT)
+ && TLI->getRegClassFor(VT)->getID() == RCId)
+ RegBalance -= numberRCValPredInSU(SU, RCId);
+ }
+ return RegBalance;
+}
+
+/// Estimates change in reg pressure from this SU.
+/// It is achieved by trivial tracking of defined
+/// and used vregs in dependent instructions.
+/// The RawPressure flag makes this function ignore
+/// existing reg file sizes and report the raw def/use
+/// balance.
+int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
+ int RegBalance = 0;
+
+ if (!SU || !SU->getNode() || !SU->getNode()->isMachineOpcode())
+ return RegBalance;
+
+ if (RawPressure) {
+ for (const TargetRegisterClass *RC : TRI->regclasses())
+ RegBalance += rawRegPressureDelta(SU, RC->getID());
+ }
+ else {
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
+ if ((RegPressure[RC->getID()] +
+ rawRegPressureDelta(SU, RC->getID()) > 0) &&
+ (RegPressure[RC->getID()] +
+ rawRegPressureDelta(SU, RC->getID()) >= RegLimit[RC->getID()]))
+ RegBalance += rawRegPressureDelta(SU, RC->getID());
+ }
+ }
+
+ return RegBalance;
+}
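+
+// Illustrative note (editorial): with RawPressure == false, a class only
+// contributes when it ends up at or over its limit after this SU. For
+// example, assuming RegLimit = 8 and RegPressure = 7 for some class, a raw
+// delta of +2 contributes +2 (7 + 2 >= 8), while a raw delta of -1
+// contributes nothing (7 - 1 < 8).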
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 50;
+static const unsigned PriorityThree = 15;
+static const unsigned PriorityFour = 5;
+static const unsigned ScaleOne = 20;
+static const unsigned ScaleTwo = 10;
+static const unsigned ScaleThree = 5;
+static const unsigned FactorOne = 2;
+
+/// Returns a single number reflecting the benefit of scheduling SU
+/// in the current cycle.
+int ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
+ // Initial trivial priority.
+ int ResCount = 1;
+
+ // Do not waste time on a node that is already scheduled.
+ if (SU->isScheduled)
+ return ResCount;
+
+ // Forced priority is high.
+ if (SU->isScheduleHigh)
+ ResCount += PriorityOne;
+
+ // Adaptable scheduling: a small, but very parallel,
+ // region where reg pressure is an issue.
+ if (HorizontalVerticalBalance > RegPressureThreshold) {
+ // Critical path first
+ ResCount += (SU->getHeight() * ScaleTwo);
+ // If resources are available for it, multiply the
+ // chance of scheduling.
+ if (isResourceAvailable(SU))
+ ResCount <<= FactorOne;
+
+ // Consider change to reg pressure from scheduling
+ // this SU.
+ ResCount -= (regPressureDelta(SU,true) * ScaleOne);
+ }
+ // Default heuristic: greedy and
+ // critical path driven.
+ else {
+ // Critical path first.
+ ResCount += (SU->getHeight() * ScaleTwo);
+ // Now see how many instructions are blocked by this SU.
+ ResCount += (NumNodesSolelyBlocking[SU->NodeNum] * ScaleTwo);
+ // If resources are available for it, multiply the
+ // chance of scheduling.
+ if (isResourceAvailable(SU))
+ ResCount <<= FactorOne;
+
+ ResCount -= (regPressureDelta(SU) * ScaleTwo);
+ }
+
+ // These are platform-specific things.
+ // They will need to go into the back end
+ // and be accessed from here via a hook.
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
+ if (N->isMachineOpcode()) {
+ const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
+ if (TID.isCall())
+ ResCount += (PriorityTwo + (ScaleThree*N->getNumValues()));
+ }
+ else
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor:
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ ResCount += PriorityFour;
+ break;
+
+ case ISD::INLINEASM:
+ case ISD::INLINEASM_BR:
+ ResCount += PriorityThree;
+ break;
+ }
+ }
+ return ResCount;
+}
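+
+// Worked example (editorial): for an unscheduled SU on the default path with
+// isScheduleHigh == false, height 3, NumNodesSolelyBlocking == 2, resources
+// available, and regPressureDelta(SU) == 1, the cost above is
+//   ResCount = 1 + 3*ScaleTwo + 2*ScaleTwo = 51,
+//   ResCount <<= FactorOne        -> 51 << 2 = 204,
+//   ResCount -= 1*ScaleTwo        -> 194.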
+
+
+/// Main resource tracking point.
+void ResourcePriorityQueue::scheduledNode(SUnit *SU) {
+ // Use NULL entry as an event marker to reset
+ // the DFA state.
+ if (!SU) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ return;
+ }
+
+ const SDNode *ScegN = SU->getNode();
+ // Update reg pressure tracking.
+ // First update current node.
+ if (ScegN->isMachineOpcode()) {
+ // Estimate generated regs.
+ for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) {
+ MVT VT = ScegN->getSimpleValueType(i);
+
+ if (TLI->isTypeLegal(VT)) {
+ const TargetRegisterClass *RC = TLI->getRegClassFor(VT);
+ if (RC)
+ RegPressure[RC->getID()] += numberRCValSuccInSU(SU, RC->getID());
+ }
+ }
+ // Estimate killed regs.
+ for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) {
+ const SDValue &Op = ScegN->getOperand(i);
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+
+ if (TLI->isTypeLegal(VT)) {
+ const TargetRegisterClass *RC = TLI->getRegClassFor(VT);
+ if (RC) {
+ if (RegPressure[RC->getID()] >
+ (numberRCValPredInSU(SU, RC->getID())))
+ RegPressure[RC->getID()] -= numberRCValPredInSU(SU, RC->getID());
+ else RegPressure[RC->getID()] = 0;
+ }
+ }
+ }
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl() || (Pred.getSUnit()->NumRegDefsLeft == 0))
+ continue;
+ --Pred.getSUnit()->NumRegDefsLeft;
+ }
+ }
+
+ // Reserve resources for this SU.
+ reserveResources(SU);
+
+ // Adjust number of parallel live ranges.
+ // The heuristic is simple: a node with no data successors reduces
+ // the number of live ranges; all others increase it.
+ unsigned NumberNonControlDeps = 0;
+
+ for (const SDep &Succ : SU->Succs) {
+ adjustPriorityOfUnscheduledPreds(Succ.getSUnit());
+ if (!Succ.isCtrl())
+ NumberNonControlDeps++;
+ }
+
+ if (!NumberNonControlDeps) {
+ if (ParallelLiveRanges >= SU->NumPreds)
+ ParallelLiveRanges -= SU->NumPreds;
+ else
+ ParallelLiveRanges = 0;
+
+ }
+ else
+ ParallelLiveRanges += SU->NumRegDefsLeft;
+
+ // Track parallel live chains.
+ HorizontalVerticalBalance += (SU->Succs.size() - numberCtrlDepsInSU(SU));
+ HorizontalVerticalBalance -= (SU->Preds.size() - numberCtrlPredInSU(SU));
+}
+
+void ResourcePriorityQueue::initNumRegDefsLeft(SUnit *SU) {
+ unsigned NodeNumDefs = 0;
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+ if (N->isMachineOpcode()) {
+ const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
+ // No register need be allocated for this.
+ if (N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
+ NodeNumDefs = 0;
+ break;
+ }
+ NodeNumDefs = std::min(N->getNumValues(), TID.getNumDefs());
+ }
+ else
+ switch(N->getOpcode()) {
+ default: break;
+ case ISD::CopyFromReg:
+ NodeNumDefs++;
+ break;
+ case ISD::INLINEASM:
+ case ISD::INLINEASM_BR:
+ NodeNumDefs++;
+ break;
+ }
+
+ SU->NumRegDefsLeft = NodeNumDefs;
+}
+
+/// adjustPriorityOfUnscheduledPreds - One of the predecessors of SU was just
+/// scheduled. If SU is not itself available, then there is at least one
+/// predecessor node that has not been scheduled yet. If SU has exactly ONE
+/// unscheduled predecessor, we want to increase its priority: it getting
+/// scheduled will make this node available, so it is better than some other
+/// node of the same priority that will not make a node available.
+void ResourcePriorityQueue::adjustPriorityOfUnscheduledPreds(SUnit *SU) {
+ if (SU->isAvailable) return; // All preds scheduled.
+
+ SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU);
+ if (!OnlyAvailablePred || !OnlyAvailablePred->isAvailable)
+ return;
+
+ // Okay, we found a single predecessor that is available, but not scheduled.
+ // Since it is available, it must be in the priority queue. First remove it.
+ remove(OnlyAvailablePred);
+
+ // Reinsert the node into the priority queue, which recomputes its
+ // NumNodesSolelyBlocking value.
+ push(OnlyAvailablePred);
+}
+
+
+/// Main access point - returns the next instruction
+/// to be placed in the scheduling sequence.
+SUnit *ResourcePriorityQueue::pop() {
+ if (empty())
+ return nullptr;
+
+ std::vector<SUnit *>::iterator Best = Queue.begin();
+ if (!DisableDFASched) {
+ int BestCost = SUSchedulingCost(*Best);
+ for (auto I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I) {
+
+ if (SUSchedulingCost(*I) > BestCost) {
+ BestCost = SUSchedulingCost(*I);
+ Best = I;
+ }
+ }
+ }
+ // Use default TD scheduling mechanism.
+ else {
+ for (auto I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I)
+ if (Picker(*Best, *I))
+ Best = I;
+ }
+
+ SUnit *V = *Best;
+ if (Best != std::prev(Queue.end()))
+ std::swap(*Best, Queue.back());
+
+ Queue.pop_back();
+
+ return V;
+}
+
+
+void ResourcePriorityQueue::remove(SUnit *SU) {
+ assert(!Queue.empty() && "Queue is empty!");
+ std::vector<SUnit *>::iterator I = find(Queue, SU);
+ if (I != std::prev(Queue.end()))
+ std::swap(*I, Queue.back());
+
+ Queue.pop_back();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
new file mode 100644
index 000000000000..c31b971e7fc3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -0,0 +1,264 @@
+//===-- llvm/CodeGen/SDNodeDbgValue.h - SelectionDAG dbg_value --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SDDbgValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
+
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DataTypes.h"
+#include <utility>
+
+namespace llvm {
+
+class DIVariable;
+class DIExpression;
+class SDNode;
+class Value;
+class raw_ostream;
+
+/// Holds the information for a single machine location through SDISel; either
+/// an SDNode, a constant, a stack location, or a virtual register.
+class SDDbgOperand {
+public:
+ enum Kind {
+ SDNODE = 0, ///< Value is the result of an expression.
+ CONST = 1, ///< Value is a constant.
+ FRAMEIX = 2, ///< Value is contents of a stack location.
+ VREG = 3 ///< Value is a virtual register.
+ };
+ Kind getKind() const { return kind; }
+
+ /// Returns the SDNode* for a register ref
+ SDNode *getSDNode() const {
+ assert(kind == SDNODE);
+ return u.s.Node;
+ }
+
+ /// Returns the ResNo for a register ref
+ unsigned getResNo() const {
+ assert(kind == SDNODE);
+ return u.s.ResNo;
+ }
+
+ /// Returns the Value* for a constant
+ const Value *getConst() const {
+ assert(kind == CONST);
+ return u.Const;
+ }
+
+ /// Returns the FrameIx for a stack object
+ unsigned getFrameIx() const {
+ assert(kind == FRAMEIX);
+ return u.FrameIx;
+ }
+
+ /// Returns the Virtual Register for a VReg
+ unsigned getVReg() const {
+ assert(kind == VREG);
+ return u.VReg;
+ }
+
+ static SDDbgOperand fromNode(SDNode *Node, unsigned ResNo) {
+ return SDDbgOperand(Node, ResNo);
+ }
+ static SDDbgOperand fromFrameIdx(unsigned FrameIdx) {
+ return SDDbgOperand(FrameIdx, FRAMEIX);
+ }
+ static SDDbgOperand fromVReg(unsigned VReg) {
+ return SDDbgOperand(VReg, VREG);
+ }
+ static SDDbgOperand fromConst(const Value *Const) {
+ return SDDbgOperand(Const);
+ }
+
+ bool operator!=(const SDDbgOperand &Other) const { return !(*this == Other); }
+ bool operator==(const SDDbgOperand &Other) const {
+ if (kind != Other.kind)
+ return false;
+ switch (kind) {
+ case SDNODE:
+ return getSDNode() == Other.getSDNode() && getResNo() == Other.getResNo();
+ case CONST:
+ return getConst() == Other.getConst();
+ case VREG:
+ return getVReg() == Other.getVReg();
+ case FRAMEIX:
+ return getFrameIx() == Other.getFrameIx();
+ }
+ return false;
+ }
+
+private:
+ Kind kind;
+ union {
+ struct {
+ SDNode *Node; ///< Valid for expressions.
+ unsigned ResNo; ///< Valid for expressions.
+ } s;
+ const Value *Const; ///< Valid for constants.
+ unsigned FrameIx; ///< Valid for stack objects.
+ unsigned VReg; ///< Valid for registers.
+ } u;
+
+ /// Constructor for non-constants.
+ SDDbgOperand(SDNode *N, unsigned R) : kind(SDNODE) {
+ u.s.Node = N;
+ u.s.ResNo = R;
+ }
+ /// Constructor for constants.
+ SDDbgOperand(const Value *C) : kind(CONST) { u.Const = C; }
+ /// Constructor for virtual registers and frame indices.
+ SDDbgOperand(unsigned VRegOrFrameIdx, Kind Kind) : kind(Kind) {
+ assert((Kind == VREG || Kind == FRAMEIX) &&
+ "Invalid SDDbgValue constructor");
+ if (kind == VREG)
+ u.VReg = VRegOrFrameIdx;
+ else
+ u.FrameIx = VRegOrFrameIdx;
+ }
+};
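+
+// Usage sketch (editorial): clients are expected to build operands through
+// the factory functions above rather than the private constructors, e.g.
+//
+//   SDDbgOperand Loc = SDDbgOperand::fromFrameIdx(FI);    // stack slot FI
+//   SDDbgOperand Val = SDDbgOperand::fromNode(N, ResNo);  // result ResNo of N
+//   if (Loc != Val) { /* ... */ }
+//
+// where FI, N and ResNo are assumed to come from the surrounding SDISel code.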
+
+/// Holds the information from a dbg_value node through SDISel.
+/// We do not use SDValue here to avoid including its header.
+class SDDbgValue {
+public:
+
+private:
+ // SDDbgValues are allocated by a BumpPtrAllocator, which means the destructor
+ // may not be called; therefore all member arrays must also be allocated by
+ // that BumpPtrAllocator, to ensure that they are correctly freed.
+ size_t NumLocationOps;
+ SDDbgOperand *LocationOps;
+ // SDNode dependencies will be calculated as SDNodes that appear in
+ // LocationOps plus these AdditionalDependencies.
+ size_t NumAdditionalDependencies;
+ SDNode **AdditionalDependencies;
+ DIVariable *Var;
+ DIExpression *Expr;
+ DebugLoc DL;
+ unsigned Order;
+ bool IsIndirect;
+ bool IsVariadic;
+ bool Invalid = false;
+ bool Emitted = false;
+
+public:
+ SDDbgValue(BumpPtrAllocator &Alloc, DIVariable *Var, DIExpression *Expr,
+ ArrayRef<SDDbgOperand> L, ArrayRef<SDNode *> Dependencies,
+ bool IsIndirect, DebugLoc DL, unsigned O, bool IsVariadic)
+ : NumLocationOps(L.size()),
+ LocationOps(Alloc.Allocate<SDDbgOperand>(L.size())),
+ NumAdditionalDependencies(Dependencies.size()),
+ AdditionalDependencies(Alloc.Allocate<SDNode *>(Dependencies.size())),
+ Var(Var), Expr(Expr), DL(DL), Order(O), IsIndirect(IsIndirect),
+ IsVariadic(IsVariadic) {
+ assert(IsVariadic || L.size() == 1);
+ assert(!(IsVariadic && IsIndirect));
+ std::copy(L.begin(), L.end(), LocationOps);
+ std::copy(Dependencies.begin(), Dependencies.end(), AdditionalDependencies);
+ }
+
+ // We allocate arrays with the BumpPtrAllocator and never free or copy them,
+ // for LocationOps and AdditionalDependencies, as we never expect to copy or
+ // destroy an SDDbgValue. If we ever start copying or destroying instances, we
+ // should manage the allocated memory appropriately.
+ SDDbgValue(const SDDbgValue &Other) = delete;
+ SDDbgValue &operator=(const SDDbgValue &Other) = delete;
+ ~SDDbgValue() = delete;
+
+ /// Returns the DIVariable pointer for the variable.
+ DIVariable *getVariable() const { return Var; }
+
+ /// Returns the DIExpression pointer for the expression.
+ DIExpression *getExpression() const { return Expr; }
+
+ ArrayRef<SDDbgOperand> getLocationOps() const {
+ return ArrayRef<SDDbgOperand>(LocationOps, NumLocationOps);
+ }
+
+ SmallVector<SDDbgOperand> copyLocationOps() const {
+ return SmallVector<SDDbgOperand>(LocationOps, LocationOps + NumLocationOps);
+ }
+
+ // Returns the SDNodes which this SDDbgValue depends on.
+ SmallVector<SDNode *> getSDNodes() const {
+ SmallVector<SDNode *> Dependencies;
+ for (const SDDbgOperand &DbgOp : getLocationOps())
+ if (DbgOp.getKind() == SDDbgOperand::SDNODE)
+ Dependencies.push_back(DbgOp.getSDNode());
+ for (SDNode *Node : getAdditionalDependencies())
+ Dependencies.push_back(Node);
+ return Dependencies;
+ }
+
+ ArrayRef<SDNode *> getAdditionalDependencies() const {
+ return ArrayRef<SDNode *>(AdditionalDependencies,
+ NumAdditionalDependencies);
+ }
+
+ /// Returns whether this is an indirect value.
+ bool isIndirect() const { return IsIndirect; }
+
+ bool isVariadic() const { return IsVariadic; }
+
+ /// Returns the DebugLoc.
+ const DebugLoc &getDebugLoc() const { return DL; }
+
+ /// Returns the SDNodeOrder. This is the order of the preceding node in the
+ /// input.
+ unsigned getOrder() const { return Order; }
+
+ /// setIsInvalidated / isInvalidated - Setter / getter of the "Invalidated"
+ /// property. A SDDbgValue is invalid if the SDNode that produces the value is
+ /// deleted.
+ void setIsInvalidated() { Invalid = true; }
+ bool isInvalidated() const { return Invalid; }
+
+ /// setIsEmitted / isEmitted - Getter/Setter for flag indicating that this
+ /// SDDbgValue has been emitted to an MBB.
+ void setIsEmitted() { Emitted = true; }
+ bool isEmitted() const { return Emitted; }
+
+ /// clearIsEmitted - Reset Emitted flag, for certain special cases where
+ /// SDDbgValue is emitted twice. DBG_INSTR_REF depends on this behaviour.
+ void clearIsEmitted() { Emitted = false; }
+
+ LLVM_DUMP_METHOD void dump() const;
+ LLVM_DUMP_METHOD void print(raw_ostream &OS) const;
+};
+
+/// Holds the information from a dbg_label node through SDISel.
+/// We do not use SDValue here to avoid including its header.
+class SDDbgLabel {
+ MDNode *Label;
+ DebugLoc DL;
+ unsigned Order;
+
+public:
+ SDDbgLabel(MDNode *Label, DebugLoc dl, unsigned O)
+ : Label(Label), DL(std::move(dl)), Order(O) {}
+
+ /// Returns the MDNode pointer for the label.
+ MDNode *getLabel() const { return Label; }
+
+ /// Returns the DebugLoc.
+ const DebugLoc &getDebugLoc() const { return DL; }
+
+ /// Returns the SDNodeOrder. This is the order of the preceding node in the
+ /// input.
+ unsigned getOrder() const { return Order; }
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
new file mode 100644
index 000000000000..5b01743d23e0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -0,0 +1,819 @@
+//===----- ScheduleDAGFast.cpp - Fast poor list scheduler -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a fast scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstrEmitter.h"
+#include "SDNodeDbgValue.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(NumUnfolds, "Number of nodes unfolded");
+STATISTIC(NumDups, "Number of duplicated nodes");
+STATISTIC(NumPRCopies, "Number of physical copies");
+
+static RegisterScheduler
+ fastDAGScheduler("fast", "Fast suboptimal list scheduling",
+ createFastDAGScheduler);
+static RegisterScheduler
+ linearizeDAGScheduler("linearize", "Linearize DAG, no scheduling",
+ createDAGLinearizer);
+
+
+namespace {
+ /// FastPriorityQueue - A degenerate priority queue that considers
+ /// all nodes to have the same priority.
+ ///
+ struct FastPriorityQueue {
+ SmallVector<SUnit *, 16> Queue;
+
+ bool empty() const { return Queue.empty(); }
+
+ void push(SUnit *U) {
+ Queue.push_back(U);
+ }
+
+ SUnit *pop() {
+ if (empty()) return nullptr;
+ return Queue.pop_back_val();
+ }
+ };
+
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGFast - The actual "fast" list scheduler implementation.
+///
+class ScheduleDAGFast : public ScheduleDAGSDNodes {
+private:
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ FastPriorityQueue AvailableQueue;
+
+ /// LiveRegDefs - A set of physical registers and their definitions
+ /// that are "live". These nodes must be scheduled before any other nodes that
+ /// modify the registers can be scheduled.
+ unsigned NumLiveRegs = 0u;
+ std::vector<SUnit*> LiveRegDefs;
+ std::vector<unsigned> LiveRegCycles;
+
+public:
+ ScheduleDAGFast(MachineFunction &mf)
+ : ScheduleDAGSDNodes(mf) {}
+
+ void Schedule() override;
+
+ /// AddPred - adds a predecessor edge to SUnit SU.
+ void AddPred(SUnit *SU, const SDep &D) {
+ SU->addPred(D);
+ }
+
+ /// RemovePred - removes a predecessor edge from SUnit SU.
+ void RemovePred(SUnit *SU, const SDep &D) {
+ SU->removePred(D);
+ }
+
+private:
+ void ReleasePred(SUnit *SU, SDep *PredEdge);
+ void ReleasePredecessors(SUnit *SU, unsigned CurCycle);
+ void ScheduleNodeBottomUp(SUnit*, unsigned);
+ SUnit *CopyAndMoveSuccessors(SUnit*);
+ void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+ const TargetRegisterClass*,
+ const TargetRegisterClass*,
+ SmallVectorImpl<SUnit*>&);
+ bool DelayForLiveRegsBottomUp(SUnit*, SmallVectorImpl<unsigned>&);
+ void ListScheduleBottomUp();
+
+ /// forceUnitLatencies - The fast scheduler doesn't care about real latencies.
+ bool forceUnitLatencies() const override { return true; }
+};
+} // end anonymous namespace
+
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGFast::Schedule() {
+ LLVM_DEBUG(dbgs() << "********** List Scheduling **********\n");
+
+ NumLiveRegs = 0;
+ LiveRegDefs.resize(TRI->getNumRegs(), nullptr);
+ LiveRegCycles.resize(TRI->getNumRegs(), 0);
+
+ // Build the scheduling graph.
+ BuildSchedGraph(nullptr);
+
+ LLVM_DEBUG(dump());
+
+ // Execute the actual scheduling loop.
+ ListScheduleBottomUp();
+}
+
+//===----------------------------------------------------------------------===//
+// Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+ if (PredSU->NumSuccsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(*PredSU);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ --PredSU->NumSuccsLeft;
+
+ // If all the node's successors are scheduled, this node is ready
+ // to be scheduled. Ignore the special EntrySU node.
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+ PredSU->isAvailable = true;
+ AvailableQueue.push(PredSU);
+ }
+}
+
+void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
+ // Bottom up: release predecessors
+ for (SDep &Pred : SU->Preds) {
+ ReleasePred(SU, &Pred);
+ if (Pred.isAssignedRegDep()) {
+ // This is a physical register dependency and it's impossible or
+ // expensive to copy the register. Make sure nothing that can
+ // clobber the register is scheduled between the predecessor and
+ // this node.
+ if (!LiveRegDefs[Pred.getReg()]) {
+ ++NumLiveRegs;
+ LiveRegDefs[Pred.getReg()] = Pred.getSUnit();
+ LiveRegCycles[Pred.getReg()] = CurCycle;
+ }
+ }
+ }
+}
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
+ LLVM_DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ LLVM_DEBUG(dumpNode(*SU));
+
+ assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
+ SU->setHeightToAtLeast(CurCycle);
+ Sequence.push_back(SU);
+
+ ReleasePredecessors(SU, CurCycle);
+
+ // Release all the implicit physical register defs that are live.
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isAssignedRegDep()) {
+ if (LiveRegCycles[Succ.getReg()] == Succ.getSUnit()->getHeight()) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ assert(LiveRegDefs[Succ.getReg()] == SU &&
+ "Physical register dependency violated?");
+ --NumLiveRegs;
+ LiveRegDefs[Succ.getReg()] = nullptr;
+ LiveRegCycles[Succ.getReg()] = 0;
+ }
+ }
+ }
+
+ SU->isScheduled = true;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
+ if (SU->getNode()->getGluedNode())
+ return nullptr;
+
+ SDNode *N = SU->getNode();
+ if (!N)
+ return nullptr;
+
+ SUnit *NewSU;
+ bool TryUnfold = false;
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (VT == MVT::Glue)
+ return nullptr;
+ else if (VT == MVT::Other)
+ TryUnfold = true;
+ }
+ for (const SDValue &Op : N->op_values()) {
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+ if (VT == MVT::Glue)
+ return nullptr;
+ }
+
+ if (TryUnfold) {
+ SmallVector<SDNode*, 2> NewNodes;
+ if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n");
+ assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+ N = NewNodes[1];
+ SDNode *LoadNode = NewNodes[0];
+ unsigned NumVals = N->getNumValues();
+ unsigned OldNumVals = SU->getNode()->getNumValues();
+ for (unsigned i = 0; i != NumVals; ++i)
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+ SDValue(LoadNode, 1));
+
+ SUnit *NewSU = newSUnit(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NewSU->NodeNum);
+
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+ if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+ NewSU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (MCID.isCommutable())
+ NewSU->isCommutable = true;
+
+ // LoadNode may already exist. This can happen when there is another
+ // load from the same location that produces the same type of value
+ // but has different alignment or volatility.
+ bool isNewLoad = true;
+ SUnit *LoadSU;
+ if (LoadNode->getNodeId() != -1) {
+ LoadSU = &SUnits[LoadNode->getNodeId()];
+ isNewLoad = false;
+ } else {
+ LoadSU = newSUnit(LoadNode);
+ LoadNode->setNodeId(LoadSU->NodeNum);
+ }
+
+ SDep ChainPred;
+ SmallVector<SDep, 4> ChainSuccs;
+ SmallVector<SDep, 4> LoadPreds;
+ SmallVector<SDep, 4> NodePreds;
+ SmallVector<SDep, 4> NodeSuccs;
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ ChainPred = Pred;
+ else if (Pred.getSUnit()->getNode() &&
+ Pred.getSUnit()->getNode()->isOperandOf(LoadNode))
+ LoadPreds.push_back(Pred);
+ else
+ NodePreds.push_back(Pred);
+ }
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
+ ChainSuccs.push_back(Succ);
+ else
+ NodeSuccs.push_back(Succ);
+ }
+
+ if (ChainPred.getSUnit()) {
+ RemovePred(SU, ChainPred);
+ if (isNewLoad)
+ AddPred(LoadSU, ChainPred);
+ }
+ for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
+ const SDep &Pred = LoadPreds[i];
+ RemovePred(SU, Pred);
+ if (isNewLoad) {
+ AddPred(LoadSU, Pred);
+ }
+ }
+ for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
+ const SDep &Pred = NodePreds[i];
+ RemovePred(SU, Pred);
+ AddPred(NewSU, Pred);
+ }
+ for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
+ SDep D = NodeSuccs[i];
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ D.setSUnit(NewSU);
+ AddPred(SuccDep, D);
+ }
+ for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
+ SDep D = ChainSuccs[i];
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ if (isNewLoad) {
+ D.setSUnit(LoadSU);
+ AddPred(SuccDep, D);
+ }
+ }
+ if (isNewLoad) {
+ SDep D(LoadSU, SDep::Barrier);
+ D.setLatency(LoadSU->Latency);
+ AddPred(NewSU, D);
+ }
+
+ ++NumUnfolds;
+
+ if (NewSU->NumSuccsLeft == 0) {
+ NewSU->isAvailable = true;
+ return NewSU;
+ }
+ SU = NewSU;
+ }
+
+ LLVM_DEBUG(dbgs() << "Duplicating SU # " << SU->NodeNum << "\n");
+ NewSU = Clone(SU);
+
+ // New SUnit has the exact same predecessors.
+ for (SDep &Pred : SU->Preds)
+ if (!Pred.isArtificial())
+ AddPred(NewSU, Pred);
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = Succ;
+ D.setSUnit(NewSU);
+ AddPred(SuccSU, D);
+ D.setSUnit(SU);
+ DelDeps.push_back(std::make_pair(SuccSU, D));
+ }
+ }
+ for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+ RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+ ++NumDups;
+ return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC,
+ SmallVectorImpl<SUnit*> &Copies) {
+ SUnit *CopyFromSU = newSUnit(static_cast<SDNode *>(nullptr));
+ CopyFromSU->CopySrcRC = SrcRC;
+ CopyFromSU->CopyDstRC = DestRC;
+
+ SUnit *CopyToSU = newSUnit(static_cast<SDNode *>(nullptr));
+ CopyToSU->CopySrcRC = DestRC;
+ CopyToSU->CopyDstRC = SrcRC;
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = Succ;
+ D.setSUnit(CopyToSU);
+ AddPred(SuccSU, D);
+ DelDeps.push_back(std::make_pair(SuccSU, Succ));
+ }
+ }
+ for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) {
+ RemovePred(DelDeps[i].first, DelDeps[i].second);
+ }
+ SDep FromDep(SU, SDep::Data, Reg);
+ FromDep.setLatency(SU->Latency);
+ AddPred(CopyFromSU, FromDep);
+ SDep ToDep(CopyFromSU, SDep::Data, 0);
+ ToDep.setLatency(CopyFromSU->Latency);
+ AddPred(CopyToSU, ToDep);
+
+ Copies.push_back(CopyFromSU);
+ Copies.push_back(CopyToSU);
+
+ ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+ const TargetInstrInfo *TII) {
+ unsigned NumRes;
+ if (N->getOpcode() == ISD::CopyFromReg) {
+ // CopyFromReg has: "chain, Val, glue" so operand 1 gives the type.
+ NumRes = 1;
+ } else {
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ assert(!MCID.implicit_defs().empty() &&
+ "Physical reg def must be in implicit def list!");
+ NumRes = MCID.getNumDefs();
+ for (MCPhysReg ImpDef : MCID.implicit_defs()) {
+ if (Reg == ImpDef)
+ break;
+ ++NumRes;
+ }
+ }
+ return N->getSimpleValueType(NumRes);
+}
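+
+// Illustrative note (editorial, hypothetical instruction): for a machine node
+// with two explicit defs whose implicit-def list is [EFLAGS, EAX], a query
+// for EAX walks past EFLAGS once, so NumRes becomes 2 + 1 = 3 and the type of
+// result number 3 is returned.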
+
+/// CheckForLiveRegDef - Return true and update live register vector if the
+/// specified register def of the specified SUnit clobbers any "live" registers.
+static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg,
+ std::vector<SUnit *> &LiveRegDefs,
+ SmallSet<unsigned, 4> &RegAdded,
+ SmallVectorImpl<unsigned> &LRegs,
+ const TargetRegisterInfo *TRI,
+ const SDNode *Node = nullptr) {
+ bool Added = false;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ // Check if Ref is live.
+ if (!LiveRegDefs[*AI])
+ continue;
+
+ // Allow multiple uses of the same def.
+ if (LiveRegDefs[*AI] == SU)
+ continue;
+
+ // Allow multiple uses of the same def.
+ if (Node && LiveRegDefs[*AI]->getNode() == Node)
+ continue;
+
+ // Add Reg to the set of interfering live regs.
+ if (RegAdded.insert(*AI).second) {
+ LRegs.push_back(*AI);
+ Added = true;
+ }
+ }
+ return Added;
+}
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specific node is the last one that's available to schedule, do
+/// whatever is necessary (i.e. backtracking or cloning) to make it possible.
+bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
+ SmallVectorImpl<unsigned> &LRegs){
+ if (NumLiveRegs == 0)
+ return false;
+
+ SmallSet<unsigned, 4> RegAdded;
+ // If this node would clobber any "live" register, then it's not ready.
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isAssignedRegDep()) {
+ CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs,
+ RegAdded, LRegs, TRI);
+ }
+ }
+
+ for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) {
+ if (Node->getOpcode() == ISD::INLINEASM ||
+ Node->getOpcode() == ISD::INLINEASM_BR) {
+ // Inline asm can clobber physical defs.
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+ --NumOps; // Ignore the glue operand.
+
+ for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+ unsigned Flags =
+ cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+ ++i; // Skip the ID value.
+ if (InlineAsm::isRegDefKind(Flags) ||
+ InlineAsm::isRegDefEarlyClobberKind(Flags) ||
+ InlineAsm::isClobberKind(Flags)) {
+ // Check for def of register or earlyclobber register.
+ for (; NumVals; --NumVals, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ if (Register::isPhysicalRegister(Reg))
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+ }
+ } else
+ i += NumVals;
+ }
+ continue;
+ }
+
+ if (Node->getOpcode() == ISD::CopyToReg) {
+ Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ if (Reg.isPhysical()) {
+ SDNode *SrcNode = Node->getOperand(2).getNode();
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI, SrcNode);
+ }
+ }
+
+ if (!Node->isMachineOpcode())
+ continue;
+ const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
+ for (MCPhysReg Reg : MCID.implicit_defs())
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+ }
+ return !LRegs.empty();
+}
+
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGFast::ListScheduleBottomUp() {
+ unsigned CurCycle = 0;
+
+ // Release any predecessors of the special Exit node.
+ ReleasePredecessors(&ExitSU, CurCycle);
+
+ // Add root to Available queue.
+ if (!SUnits.empty()) {
+ SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+ assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+ RootSU->isAvailable = true;
+ AvailableQueue.push(RootSU);
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ SmallVector<SUnit*, 4> NotReady;
+ DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMap;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue.empty()) {
+ bool Delayed = false;
+ LRegsMap.clear();
+ SUnit *CurSU = AvailableQueue.pop();
+ while (CurSU) {
+ SmallVector<unsigned, 4> LRegs;
+ if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+ break;
+ Delayed = true;
+ LRegsMap.insert(std::make_pair(CurSU, LRegs));
+
+ CurSU->isPending = true; // This SU is not in AvailableQueue right now.
+ NotReady.push_back(CurSU);
+ CurSU = AvailableQueue.pop();
+ }
+
+ // All candidates are delayed due to live physical reg dependencies.
+ // Try code duplication or inserting cross class copies
+ // to resolve it.
+ if (Delayed && !CurSU) {
+ if (!CurSU) {
+ // Try duplicating the nodes that produce these
+ // "expensive to copy" values to break the dependency. In case even
+ // that doesn't work, insert cross class copies.
+ SUnit *TrySU = NotReady[0];
+ SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
+ assert(LRegs.size() == 1 && "Can't handle this yet!");
+ unsigned Reg = LRegs[0];
+ SUnit *LRDef = LiveRegDefs[Reg];
+ MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, VT);
+ const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+ // If the cross copy register class is the same as RC, then it must be
+ // possible to copy the value directly. Do not try to duplicate the def.
+ // If the cross copy register class is not the same as RC, then it's
+ // possible to copy the value but it requires cross register class copies
+ // and it is expensive.
+ // If the cross copy register class is null, then it's not possible to copy
+ // the value at all.
+ SUnit *NewDef = nullptr;
+ if (DestRC != RC) {
+ NewDef = CopyAndMoveSuccessors(LRDef);
+ if (!DestRC && !NewDef)
+ report_fatal_error("Can't handle live physical "
+ "register dependency!");
+ }
+ if (!NewDef) {
+ // Issue copies, these can be expensive cross register class copies.
+ SmallVector<SUnit*, 2> Copies;
+ InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+ LLVM_DEBUG(dbgs() << "Adding an edge from SU # " << TrySU->NodeNum
+ << " to SU #" << Copies.front()->NodeNum << "\n");
+ AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
+ NewDef = Copies.back();
+ }
+
+ LLVM_DEBUG(dbgs() << "Adding an edge from SU # " << NewDef->NodeNum
+ << " to SU #" << TrySU->NodeNum << "\n");
+ LiveRegDefs[Reg] = NewDef;
+ AddPred(NewDef, SDep(TrySU, SDep::Artificial));
+ TrySU->isAvailable = false;
+ CurSU = NewDef;
+ }
+
+ if (!CurSU) {
+ llvm_unreachable("Unable to resolve live physical register dependencies!");
+ }
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+ NotReady[i]->isPending = false;
+ // May no longer be available due to backtracking.
+ if (NotReady[i]->isAvailable)
+ AvailableQueue.push(NotReady[i]);
+ }
+ NotReady.clear();
+
+ if (CurSU)
+ ScheduleNodeBottomUp(CurSU, CurCycle);
+ ++CurCycle;
+ }
+
+ // Reverse the order since it is bottom up.
+ std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+ VerifyScheduledSequence(/*isBottomUp=*/true);
+#endif
+}
+
+
+namespace {
+//===----------------------------------------------------------------------===//
+// ScheduleDAGLinearize - No-scheduling scheduler; it simply linearizes the
+// DAG in topological order.
+// IMPORTANT: this may not work for targets with physreg dependencies.
+//
+class ScheduleDAGLinearize : public ScheduleDAGSDNodes {
+public:
+ ScheduleDAGLinearize(MachineFunction &mf) : ScheduleDAGSDNodes(mf) {}
+
+ void Schedule() override;
+
+ MachineBasicBlock *
+ EmitSchedule(MachineBasicBlock::iterator &InsertPos) override;
+
+private:
+ std::vector<SDNode*> Sequence;
+ DenseMap<SDNode*, SDNode*> GluedMap; // Cache glue to its user
+
+ void ScheduleNode(SDNode *N);
+};
+} // end anonymous namespace
+
+void ScheduleDAGLinearize::ScheduleNode(SDNode *N) {
+ if (N->getNodeId() != 0)
+ llvm_unreachable(nullptr);
+
+ if (!N->isMachineOpcode() &&
+ (N->getOpcode() == ISD::EntryToken || isPassiveNode(N)))
+ // These nodes do not need to be translated into MIs.
+ return;
+
+ LLVM_DEBUG(dbgs() << "\n*** Scheduling: ");
+ LLVM_DEBUG(N->dump(DAG));
+ Sequence.push_back(N);
+
+ unsigned NumOps = N->getNumOperands();
+ if (unsigned NumLeft = NumOps) {
+ SDNode *GluedOpN = nullptr;
+ do {
+ const SDValue &Op = N->getOperand(NumLeft-1);
+ SDNode *OpN = Op.getNode();
+
+ if (NumLeft == NumOps && Op.getValueType() == MVT::Glue) {
+ // Schedule glue operand right above N.
+ GluedOpN = OpN;
+ assert(OpN->getNodeId() != 0 && "Glue operand not ready?");
+ OpN->setNodeId(0);
+ ScheduleNode(OpN);
+ continue;
+ }
+
+ if (OpN == GluedOpN)
+ // Glue operand is already scheduled.
+ continue;
+
+ DenseMap<SDNode*, SDNode*>::iterator DI = GluedMap.find(OpN);
+ if (DI != GluedMap.end() && DI->second != N)
+ // Users of glues are counted against the glued users.
+ OpN = DI->second;
+
+ unsigned Degree = OpN->getNodeId();
+ assert(Degree > 0 && "Predecessor over-released!");
+ OpN->setNodeId(--Degree);
+ if (Degree == 0)
+ ScheduleNode(OpN);
+ } while (--NumLeft);
+ }
+}
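+
+// Editorial sketch (not part of the original file): a tiny stand-alone model
+// of the same degree-counting walk, on a toy graph instead of SDNodes. Each
+// node's counter starts at its number of users and it is emitted once its
+// last user has been emitted; reversing the emission order afterwards yields
+// a topological order, as EmitSchedule below does for the real Sequence.
+//
+//   #include <cstdio>
+//   #include <vector>
+//
+//   struct ToyNode { std::vector<int> Ops; int Degree = 0; };
+//
+//   static void scheduleNode(std::vector<ToyNode> &G, int N,
+//                            std::vector<int> &Seq) {
+//     Seq.push_back(N);                // all users of N are already emitted
+//     for (int Op : G[N].Ops)
+//       if (--G[Op].Degree == 0)       // N was the last unemitted user of Op
+//         scheduleNode(G, Op, Seq);
+//   }
+//
+//   int main() {
+//     // Node 2 is the root; it uses nodes 1 and 0, and node 1 uses node 0.
+//     std::vector<ToyNode> G(3);
+//     G[1].Ops = {0};
+//     G[2].Ops = {1, 0};
+//     for (const ToyNode &N : G)
+//       for (int Op : N.Ops)
+//         ++G[Op].Degree;              // use counts play the role of node ids
+//     std::vector<int> Seq;
+//     scheduleNode(G, 2, Seq);
+//     for (int I = (int)Seq.size() - 1; I >= 0; --I)
+//       std::printf("%d ", Seq[I]);    // prints "0 1 2"
+//     std::puts("");
+//     return 0;
+//   }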
+
+/// findGluedUser - Find the representative use of a glue value by walking
+/// the use chain.
+static SDNode *findGluedUser(SDNode *N) {
+ while (SDNode *Glued = N->getGluedUser())
+ N = Glued;
+ return N;
+}
+
+void ScheduleDAGLinearize::Schedule() {
+ LLVM_DEBUG(dbgs() << "********** DAG Linearization **********\n");
+
+ SmallVector<SDNode*, 8> Glues;
+ unsigned DAGSize = 0;
+ for (SDNode &Node : DAG->allnodes()) {
+ SDNode *N = &Node;
+
+ // Use node id to record degree.
+ unsigned Degree = N->use_size();
+ N->setNodeId(Degree);
+ unsigned NumVals = N->getNumValues();
+ if (NumVals && N->getValueType(NumVals-1) == MVT::Glue &&
+ N->hasAnyUseOfValue(NumVals-1)) {
+ SDNode *User = findGluedUser(N);
+ if (User) {
+ Glues.push_back(N);
+ GluedMap.insert(std::make_pair(N, User));
+ }
+ }
+
+ if (N->isMachineOpcode() ||
+ (N->getOpcode() != ISD::EntryToken && !isPassiveNode(N)))
+ ++DAGSize;
+ }
+
+ for (unsigned i = 0, e = Glues.size(); i != e; ++i) {
+ SDNode *Glue = Glues[i];
+ SDNode *GUser = GluedMap[Glue];
+ unsigned Degree = Glue->getNodeId();
+ unsigned UDegree = GUser->getNodeId();
+
+ // Glue user must be scheduled together with the glue operand. So other
+ // users of the glue operand must be treated as its users.
+ SDNode *ImmGUser = Glue->getGluedUser();
+ for (const SDNode *U : Glue->uses())
+ if (U == ImmGUser)
+ --Degree;
+ GUser->setNodeId(UDegree + Degree);
+ Glue->setNodeId(1);
+ }
+
+ Sequence.reserve(DAGSize);
+ ScheduleNode(DAG->getRoot().getNode());
+}
+
+MachineBasicBlock*
+ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
+ InstrEmitter Emitter(DAG->getTarget(), BB, InsertPos);
+ DenseMap<SDValue, Register> VRBaseMap;
+
+ LLVM_DEBUG({ dbgs() << "\n*** Final schedule ***\n"; });
+
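+  // The sequence was built bottom-up, so emit it in reverse order.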
+ unsigned NumNodes = Sequence.size();
+ MachineBasicBlock *BB = Emitter.getBlock();
+ for (unsigned i = 0; i != NumNodes; ++i) {
+ SDNode *N = Sequence[NumNodes-i-1];
+ LLVM_DEBUG(N->dump(DAG));
+ Emitter.EmitNode(N, false, false, VRBaseMap);
+
+ // Emit any debug values associated with the node.
+ if (N->getHasDebugValue()) {
+ MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
+ for (auto *DV : DAG->GetDbgValues(N)) {
+ if (!DV->isEmitted())
+ if (auto *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap))
+ BB->insert(InsertPos, DbgMI);
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << '\n');
+
+ InsertPos = Emitter.getInsertPos();
+ return Emitter.getBlock();
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+llvm::ScheduleDAGSDNodes *
+llvm::createFastDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGFast(*IS->MF);
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createDAGLinearizer(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGLinearize(*IS->MF);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
new file mode 100644
index 000000000000..458f50c54824
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -0,0 +1,3210 @@
+//===- ScheduleDAGRRList.cpp - Reg pressure reduction list scheduler ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements bottom-up and top-down register pressure reduction list
+// schedulers, using standard algorithms. The basic approach uses a priority
+// queue of available nodes to schedule. One at a time, nodes are taken from
+// the priority queue (thus in priority order), checked for legality to
+// schedule, and emitted if legal.
+//
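+// In rough outline (see ListScheduleBottomUp below):
+//
+//   while (available or interfering nodes remain) {
+//     SU = PickNodeToScheduleBottomUp(); // ready, hazard-free, may backtrack
+//     AdvancePastStalls(SU);
+//     ScheduleNodeBottomUp(SU);          // releases SU's predecessors
+//   }
+//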
+//===----------------------------------------------------------------------===//
+
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(NumBacktracks, "Number of times scheduler backtracked");
+STATISTIC(NumUnfolds, "Number of nodes unfolded");
+STATISTIC(NumDups, "Number of duplicated nodes");
+STATISTIC(NumPRCopies, "Number of physical register copies");
+
+static RegisterScheduler
+ burrListDAGScheduler("list-burr",
+ "Bottom-up register reduction list scheduling",
+ createBURRListDAGScheduler);
+
+static RegisterScheduler
+ sourceListDAGScheduler("source",
+ "Similar to list-burr but schedules in source "
+ "order when possible",
+ createSourceListDAGScheduler);
+
+static RegisterScheduler
+ hybridListDAGScheduler("list-hybrid",
+ "Bottom-up register pressure aware list scheduling "
+ "which tries to balance latency and register pressure",
+ createHybridListDAGScheduler);
+
+static RegisterScheduler
+ ILPListDAGScheduler("list-ilp",
+ "Bottom-up register pressure aware list scheduling "
+ "which tries to balance ILP and register pressure",
+ createILPListDAGScheduler);
+
+static cl::opt<bool> DisableSchedCycles(
+ "disable-sched-cycles", cl::Hidden, cl::init(false),
+ cl::desc("Disable cycle-level precision during preRA scheduling"));
+
+// Temporary sched=list-ilp flags until the heuristics are robust.
+// Some options are also available under sched=list-hybrid.
+static cl::opt<bool> DisableSchedRegPressure(
+ "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
+ cl::desc("Disable regpressure priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedLiveUses(
+ "disable-sched-live-uses", cl::Hidden, cl::init(true),
+ cl::desc("Disable live use priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedVRegCycle(
+ "disable-sched-vrcycle", cl::Hidden, cl::init(false),
+ cl::desc("Disable virtual register cycle interference checks"));
+static cl::opt<bool> DisableSchedPhysRegJoin(
+ "disable-sched-physreg-join", cl::Hidden, cl::init(false),
+ cl::desc("Disable physreg def-use affinity"));
+static cl::opt<bool> DisableSchedStalls(
+ "disable-sched-stalls", cl::Hidden, cl::init(true),
+ cl::desc("Disable no-stall priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedCriticalPath(
+ "disable-sched-critical-path", cl::Hidden, cl::init(false),
+ cl::desc("Disable critical path priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedHeight(
+ "disable-sched-height", cl::Hidden, cl::init(false),
+ cl::desc("Disable scheduled-height priority in sched=list-ilp"));
+static cl::opt<bool> Disable2AddrHack(
+ "disable-2addr-hack", cl::Hidden, cl::init(true),
+ cl::desc("Disable scheduler's two-address hack"));
+
+static cl::opt<int> MaxReorderWindow(
+ "max-sched-reorder", cl::Hidden, cl::init(6),
+ cl::desc("Number of instructions to allow ahead of the critical path "
+ "in sched=list-ilp"));
+
+static cl::opt<unsigned> AvgIPC(
+ "sched-avg-ipc", cl::Hidden, cl::init(1),
+ cl::desc("Average inst/cycle whan no target itinerary exists."));
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGRRList - The actual register reduction list scheduler
+/// implementation. This supports both top-down and bottom-up scheduling.
+///
+class ScheduleDAGRRList : public ScheduleDAGSDNodes {
+private:
+ /// NeedLatency - True if the scheduler will make use of latency information.
+ bool NeedLatency;
+
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ SchedulingPriorityQueue *AvailableQueue;
+
+ /// PendingQueue - This contains all of the instructions whose operands have
+ /// been issued, but their results are not ready yet (due to the latency of
+  /// the operation). Once the operands become available, the instruction is
+ /// added to the AvailableQueue.
+ std::vector<SUnit *> PendingQueue;
+
+ /// HazardRec - The hazard recognizer to use.
+ ScheduleHazardRecognizer *HazardRec;
+
+ /// CurCycle - The current scheduler state corresponds to this cycle.
+ unsigned CurCycle = 0;
+
+ /// MinAvailableCycle - Cycle of the soonest available instruction.
+ unsigned MinAvailableCycle = ~0u;
+
+  /// IssueCount - Count of instructions issued in this cycle.
+  /// Currently valid only for bottom-up scheduling.
+ unsigned IssueCount = 0u;
+
+  /// LiveRegDefs - A set of physical registers and their definitions
+  /// that are "live". These nodes must be scheduled before any other nodes that
+  /// modify the registers can be scheduled.
+ unsigned NumLiveRegs = 0u;
+ std::unique_ptr<SUnit*[]> LiveRegDefs;
+ std::unique_ptr<SUnit*[]> LiveRegGens;
+
+ // Collect interferences between physical register use/defs.
+ // Each interference is an SUnit and set of physical registers.
+ SmallVector<SUnit*, 4> Interferences;
+
+ using LRegsMapT = DenseMap<SUnit *, SmallVector<unsigned, 4>>;
+
+ LRegsMapT LRegsMap;
+
+ /// Topo - A topological ordering for SUnits which permits fast IsReachable
+ /// and similar queries.
+ ScheduleDAGTopologicalSort Topo;
+
+ // Hack to keep track of the inverse of FindCallSeqStart without more crazy
+ // DAG crawling.
+ DenseMap<SUnit*, SUnit*> CallSeqEndForStart;
+
+public:
+ ScheduleDAGRRList(MachineFunction &mf, bool needlatency,
+ SchedulingPriorityQueue *availqueue,
+ CodeGenOpt::Level OptLevel)
+ : ScheduleDAGSDNodes(mf),
+ NeedLatency(needlatency), AvailableQueue(availqueue),
+ Topo(SUnits, nullptr) {
+ const TargetSubtargetInfo &STI = mf.getSubtarget();
+ if (DisableSchedCycles || !NeedLatency)
+ HazardRec = new ScheduleHazardRecognizer();
+ else
+ HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this);
+ }
+
+ ~ScheduleDAGRRList() override {
+ delete HazardRec;
+ delete AvailableQueue;
+ }
+
+ void Schedule() override;
+
+ ScheduleHazardRecognizer *getHazardRec() { return HazardRec; }
+
+ /// IsReachable - Checks if SU is reachable from TargetSU.
+ bool IsReachable(const SUnit *SU, const SUnit *TargetSU) {
+ return Topo.IsReachable(SU, TargetSU);
+ }
+
+ /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
+ /// create a cycle.
+ bool WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
+ return Topo.WillCreateCycle(SU, TargetSU);
+ }
+
+  /// AddPredQueued - Queues an update to add a predecessor edge to SUnit SU.
+  /// Does *NOT* update the topological ordering! It just queues an update.
+ void AddPredQueued(SUnit *SU, const SDep &D) {
+ Topo.AddPredQueued(SU, D.getSUnit());
+ SU->addPred(D);
+ }
+
+  /// AddPred - Adds a predecessor edge to SUnit SU.
+  /// Updates the topological ordering if required.
+ void AddPred(SUnit *SU, const SDep &D) {
+ Topo.AddPred(SU, D.getSUnit());
+ SU->addPred(D);
+ }
+
+  /// RemovePred - Removes a predecessor edge from SUnit SU.
+  /// Updates the topological ordering if required.
+ void RemovePred(SUnit *SU, const SDep &D) {
+ Topo.RemovePred(SU, D.getSUnit());
+ SU->removePred(D);
+ }
+
+private:
+ bool isReady(SUnit *SU) {
+ return DisableSchedCycles || !AvailableQueue->hasReadyFilter() ||
+ AvailableQueue->isReady(SU);
+ }
+
+ void ReleasePred(SUnit *SU, const SDep *PredEdge);
+ void ReleasePredecessors(SUnit *SU);
+ void ReleasePending();
+ void AdvanceToCycle(unsigned NextCycle);
+ void AdvancePastStalls(SUnit *SU);
+ void EmitNode(SUnit *SU);
+ void ScheduleNodeBottomUp(SUnit*);
+ void CapturePred(SDep *PredEdge);
+ void UnscheduleNodeBottomUp(SUnit*);
+ void RestoreHazardCheckerBottomUp();
+ void BacktrackBottomUp(SUnit*, SUnit*);
+ SUnit *TryUnfoldSU(SUnit *);
+ SUnit *CopyAndMoveSuccessors(SUnit*);
+ void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+ const TargetRegisterClass*,
+ const TargetRegisterClass*,
+ SmallVectorImpl<SUnit*>&);
+ bool DelayForLiveRegsBottomUp(SUnit*, SmallVectorImpl<unsigned>&);
+
+ void releaseInterferences(unsigned Reg = 0);
+
+ SUnit *PickNodeToScheduleBottomUp();
+ void ListScheduleBottomUp();
+
+ /// CreateNewSUnit - Creates a new SUnit and returns a pointer to it.
+ SUnit *CreateNewSUnit(SDNode *N) {
+ unsigned NumSUnits = SUnits.size();
+ SUnit *NewNode = newSUnit(N);
+ // Update the topological ordering.
+ if (NewNode->NodeNum >= NumSUnits)
+ Topo.AddSUnitWithoutPredecessors(NewNode);
+ return NewNode;
+ }
+
+ /// CreateClone - Creates a new SUnit from an existing one.
+ SUnit *CreateClone(SUnit *N) {
+ unsigned NumSUnits = SUnits.size();
+ SUnit *NewNode = Clone(N);
+ // Update the topological ordering.
+ if (NewNode->NodeNum >= NumSUnits)
+ Topo.AddSUnitWithoutPredecessors(NewNode);
+ return NewNode;
+ }
+
+ /// forceUnitLatencies - Register-pressure-reducing scheduling doesn't
+ /// need actual latency information but the hybrid scheduler does.
+ bool forceUnitLatencies() const override {
+ return !NeedLatency;
+ }
+};
+
+} // end anonymous namespace
+
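+/// Unit cost charged for a REG_SEQUENCE def in GetCostForDef below.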
+static constexpr unsigned RegSequenceCost = 1;
+
+/// GetCostForDef - Looks up the register class and cost for a given definition.
+/// Typically this just means looking up the representative register class,
+/// but for untyped values (MVT::Untyped) it means inspecting the node's
+/// opcode to determine what register class is being generated.
+static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
+ const TargetLowering *TLI,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ unsigned &RegClass, unsigned &Cost,
+ const MachineFunction &MF) {
+ MVT VT = RegDefPos.GetValue();
+
+ // Special handling for untyped values. These values can only come from
+ // the expansion of custom DAG-to-DAG patterns.
+ if (VT == MVT::Untyped) {
+ const SDNode *Node = RegDefPos.GetNode();
+
+ // Special handling for CopyFromReg of untyped values.
+ if (!Node->isMachineOpcode() && Node->getOpcode() == ISD::CopyFromReg) {
+ Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg);
+ RegClass = RC->getID();
+ Cost = 1;
+ return;
+ }
+
+ unsigned Opcode = Node->getMachineOpcode();
+ if (Opcode == TargetOpcode::REG_SEQUENCE) {
+ unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
+ RegClass = RC->getID();
+ Cost = RegSequenceCost;
+ return;
+ }
+
+ unsigned Idx = RegDefPos.GetIdx();
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI, MF);
+ assert(RC && "Not a valid register class");
+ RegClass = RC->getID();
+ // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a
+ // better way to determine it.
+ Cost = 1;
+ } else {
+ RegClass = TLI->getRepRegClassFor(VT)->getID();
+ Cost = TLI->getRepRegClassCostFor(VT);
+ }
+}
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGRRList::Schedule() {
+ LLVM_DEBUG(dbgs() << "********** List Scheduling " << printMBBReference(*BB)
+ << " '" << BB->getName() << "' **********\n");
+
+ CurCycle = 0;
+ IssueCount = 0;
+ MinAvailableCycle =
+ DisableSchedCycles ? 0 : std::numeric_limits<unsigned>::max();
+ NumLiveRegs = 0;
+ // Allocate slots for each physical register, plus one for a special register
+ // to track the virtual resource of a calling sequence.
+ LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]());
+ LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]());
+ CallSeqEndForStart.clear();
+ assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences");
+
+ // Build the scheduling graph.
+ BuildSchedGraph(nullptr);
+
+ LLVM_DEBUG(dump());
+ Topo.MarkDirty();
+
+ AvailableQueue->initNodes(SUnits);
+
+ HazardRec->Reset();
+
+ // Execute the actual scheduling loop.
+ ListScheduleBottomUp();
+
+ AvailableQueue->releaseState();
+
+ LLVM_DEBUG({
+ dbgs() << "*** Final schedule ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
+
+//===----------------------------------------------------------------------===//
+// Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+ if (PredSU->NumSuccsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(*PredSU);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ --PredSU->NumSuccsLeft;
+
+ if (!forceUnitLatencies()) {
+ // Updating predecessor's height. This is now the cycle when the
+ // predecessor can be scheduled without causing a pipeline stall.
+ PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge->getLatency());
+ }
+
+ // If all the node's successors are scheduled, this node is ready
+ // to be scheduled. Ignore the special EntrySU node.
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+ PredSU->isAvailable = true;
+
+ unsigned Height = PredSU->getHeight();
+ if (Height < MinAvailableCycle)
+ MinAvailableCycle = Height;
+
+ if (isReady(PredSU)) {
+ AvailableQueue->push(PredSU);
+ }
+ // CapturePred and others may have left the node in the pending queue, avoid
+ // adding it twice.
+ else if (!PredSU->isPending) {
+ PredSU->isPending = true;
+ PendingQueue.push_back(PredSU);
+ }
+ }
+}
+
+/// IsChainDependent - Test if Outer is reachable from Inner through
+/// chain dependencies.
+static bool IsChainDependent(SDNode *Outer, SDNode *Inner,
+ unsigned NestLevel,
+ const TargetInstrInfo *TII) {
+ SDNode *N = Outer;
+ while (true) {
+ if (N == Inner)
+ return true;
+ // For a TokenFactor, examine each operand. There may be multiple ways
+ // to get to the CALLSEQ_BEGIN, but we need to find the path with the
+ // most nesting in order to ensure that we find the corresponding match.
+ if (N->getOpcode() == ISD::TokenFactor) {
+ for (const SDValue &Op : N->op_values())
+ if (IsChainDependent(Op.getNode(), Inner, NestLevel, TII))
+ return true;
+ return false;
+ }
+ // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
+ if (N->isMachineOpcode()) {
+ if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
+ ++NestLevel;
+ } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
+ if (NestLevel == 0)
+ return false;
+ --NestLevel;
+ }
+ }
+ // Otherwise, find the chain and continue climbing.
+ for (const SDValue &Op : N->op_values())
+ if (Op.getValueType() == MVT::Other) {
+ N = Op.getNode();
+ goto found_chain_operand;
+ }
+ return false;
+ found_chain_operand:;
+ if (N->getOpcode() == ISD::EntryToken)
+ return false;
+ }
+}
+
+/// FindCallSeqStart - Starting from the (lowered) CALLSEQ_END node, locate
+/// the corresponding (lowered) CALLSEQ_BEGIN node.
+///
+/// NestLevel and MaxNest are used in recursion to indicate the current level
+/// of nesting of CALLSEQ_BEGIN and CALLSEQ_END pairs, as well as the maximum
+/// level seen so far.
+///
+/// TODO: It would be better to give CALLSEQ_END an explicit operand to point
+/// to the corresponding CALLSEQ_BEGIN to avoid needing to search for it.
+static SDNode *
+FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
+ const TargetInstrInfo *TII) {
+ while (true) {
+ // For a TokenFactor, examine each operand. There may be multiple ways
+ // to get to the CALLSEQ_BEGIN, but we need to find the path with the
+ // most nesting in order to ensure that we find the corresponding match.
+ if (N->getOpcode() == ISD::TokenFactor) {
+ SDNode *Best = nullptr;
+ unsigned BestMaxNest = MaxNest;
+ for (const SDValue &Op : N->op_values()) {
+ unsigned MyNestLevel = NestLevel;
+ unsigned MyMaxNest = MaxNest;
+ if (SDNode *New = FindCallSeqStart(Op.getNode(),
+ MyNestLevel, MyMaxNest, TII))
+ if (!Best || (MyMaxNest > BestMaxNest)) {
+ Best = New;
+ BestMaxNest = MyMaxNest;
+ }
+ }
+ assert(Best);
+ MaxNest = BestMaxNest;
+ return Best;
+ }
+ // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
+ if (N->isMachineOpcode()) {
+ if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
+ ++NestLevel;
+ MaxNest = std::max(MaxNest, NestLevel);
+ } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
+ assert(NestLevel != 0);
+ --NestLevel;
+ if (NestLevel == 0)
+ return N;
+ }
+ }
+ // Otherwise, find the chain and continue climbing.
+ for (const SDValue &Op : N->op_values())
+ if (Op.getValueType() == MVT::Other) {
+ N = Op.getNode();
+ goto found_chain_operand;
+ }
+ return nullptr;
+ found_chain_operand:;
+ if (N->getOpcode() == ISD::EntryToken)
+ return nullptr;
+ }
+}
+
+/// Call ReleasePred for each predecessor, then update register live def/gen.
+/// Always update LiveRegDefs for a register dependence even if the current SU
+/// also defines the register. This effectively creates one large live range
+/// across a sequence of two-address nodes. This is important because the
+/// entire chain must be scheduled together. Example:
+///
+/// flags = (3) add
+/// flags = (2) addc flags
+/// flags = (1) addc flags
+///
+/// results in
+///
+/// LiveRegDefs[flags] = 3
+/// LiveRegGens[flags] = 1
+///
+/// If (2) addc is unscheduled, then (1) addc must also be unscheduled to avoid
+/// interference on flags.
+void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
+ // Bottom up: release predecessors
+ for (SDep &Pred : SU->Preds) {
+ ReleasePred(SU, &Pred);
+ if (Pred.isAssignedRegDep()) {
+ // This is a physical register dependency and it's impossible or
+ // expensive to copy the register. Make sure nothing that can
+ // clobber the register is scheduled between the predecessor and
+ // this node.
+ SUnit *RegDef = LiveRegDefs[Pred.getReg()]; (void)RegDef;
+ assert((!RegDef || RegDef == SU || RegDef == Pred.getSUnit()) &&
+ "interference on register dependence");
+ LiveRegDefs[Pred.getReg()] = Pred.getSUnit();
+ if (!LiveRegGens[Pred.getReg()]) {
+ ++NumLiveRegs;
+ LiveRegGens[Pred.getReg()] = SU;
+ }
+ }
+ }
+
+ // If we're scheduling a lowered CALLSEQ_END, find the corresponding
+ // CALLSEQ_BEGIN. Inject an artificial physical register dependence between
+ // these nodes, to prevent other calls from being interscheduled with them.
+ unsigned CallResource = TRI->getNumRegs();
+ if (!LiveRegDefs[CallResource])
+ for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())
+ if (Node->isMachineOpcode() &&
+ Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
+ unsigned NestLevel = 0;
+ unsigned MaxNest = 0;
+ SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII);
+ assert(N && "Must find call sequence start");
+
+ SUnit *Def = &SUnits[N->getNodeId()];
+ CallSeqEndForStart[Def] = SU;
+
+ ++NumLiveRegs;
+ LiveRegDefs[CallResource] = Def;
+ LiveRegGens[CallResource] = SU;
+ break;
+ }
+}
+
+/// Check to see if any of the pending instructions are ready to issue. If
+/// so, add them to the available queue.
+void ScheduleDAGRRList::ReleasePending() {
+ if (DisableSchedCycles) {
+ assert(PendingQueue.empty() && "pending instrs not allowed in this mode");
+ return;
+ }
+
+ // If the available queue is empty, it is safe to reset MinAvailableCycle.
+ if (AvailableQueue->empty())
+ MinAvailableCycle = std::numeric_limits<unsigned>::max();
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+ unsigned ReadyCycle = PendingQueue[i]->getHeight();
+ if (ReadyCycle < MinAvailableCycle)
+ MinAvailableCycle = ReadyCycle;
+
+ if (PendingQueue[i]->isAvailable) {
+ if (!isReady(PendingQueue[i]))
+ continue;
+ AvailableQueue->push(PendingQueue[i]);
+ }
+ PendingQueue[i]->isPending = false;
+ PendingQueue[i] = PendingQueue.back();
+ PendingQueue.pop_back();
+ --i; --e;
+ }
+}
+
+/// Move the scheduler state forward by the specified number of Cycles.
+void ScheduleDAGRRList::AdvanceToCycle(unsigned NextCycle) {
+ if (NextCycle <= CurCycle)
+ return;
+
+ IssueCount = 0;
+ AvailableQueue->setCurCycle(NextCycle);
+ if (!HazardRec->isEnabled()) {
+ // Bypass lots of virtual calls in case of long latency.
+ CurCycle = NextCycle;
+ }
+ else {
+ for (; CurCycle != NextCycle; ++CurCycle) {
+ HazardRec->RecedeCycle();
+ }
+ }
+ // FIXME: Instead of visiting the pending Q each time, set a dirty flag on the
+ // available Q to release pending nodes at least once before popping.
+ ReleasePending();
+}
+
+/// Move the scheduler state forward until the specified node's dependents are
+/// ready and can be scheduled with no resource conflicts.
+void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) {
+ if (DisableSchedCycles)
+ return;
+
+ // FIXME: Nodes such as CopyFromReg probably should not advance the current
+ // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node
+ // has predecessors the cycle will be advanced when they are scheduled.
+  // But given the crude nature of modeling latency through such nodes, we
+ // currently need to treat these nodes like real instructions.
+ // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return;
+
+ unsigned ReadyCycle = SU->getHeight();
+
+ // Bump CurCycle to account for latency. We assume the latency of other
+ // available instructions may be hidden by the stall (not a full pipe stall).
+ // This updates the hazard recognizer's cycle before reserving resources for
+ // this instruction.
+ AdvanceToCycle(ReadyCycle);
+
+ // Calls are scheduled in their preceding cycle, so don't conflict with
+ // hazards from instructions after the call. EmitNode will reset the
+ // scoreboard state before emitting the call.
+ if (SU->isCall)
+ return;
+
+ // FIXME: For resource conflicts in very long non-pipelined stages, we
+ // should probably skip ahead here to avoid useless scoreboard checks.
+ int Stalls = 0;
+ while (true) {
+ ScheduleHazardRecognizer::HazardType HT =
+ HazardRec->getHazardType(SU, -Stalls);
+
+ if (HT == ScheduleHazardRecognizer::NoHazard)
+ break;
+
+ ++Stalls;
+ }
+ AdvanceToCycle(CurCycle + Stalls);
+}
+
+/// Record this SUnit in the HazardRecognizer.
+/// Does not update CurCycle.
+void ScheduleDAGRRList::EmitNode(SUnit *SU) {
+ if (!HazardRec->isEnabled())
+ return;
+
+ // Check for phys reg copy.
+ if (!SU->getNode())
+ return;
+
+ switch (SU->getNode()->getOpcode()) {
+ default:
+ assert(SU->getNode()->isMachineOpcode() &&
+ "This target-independent node should not be scheduled.");
+ break;
+ case ISD::MERGE_VALUES:
+ case ISD::TokenFactor:
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
+ case ISD::CopyToReg:
+ case ISD::CopyFromReg:
+ case ISD::EH_LABEL:
+ // Noops don't affect the scoreboard state. Copies are likely to be
+ // removed.
+ return;
+ case ISD::INLINEASM:
+ case ISD::INLINEASM_BR:
+ // For inline asm, clear the pipeline state.
+ HazardRec->Reset();
+ return;
+ }
+ if (SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+
+ HazardRec->EmitInstruction(SU);
+}
+
+static void resetVRegCycle(SUnit *SU);
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
+ LLVM_DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
+ LLVM_DEBUG(dumpNode(*SU));
+
+#ifndef NDEBUG
+ if (CurCycle < SU->getHeight())
+ LLVM_DEBUG(dbgs() << " Height [" << SU->getHeight()
+ << "] pipeline stall!\n");
+#endif
+
+ // FIXME: Do not modify node height. It may interfere with
+ // backtracking. Instead add a "ready cycle" to SUnit. Before scheduling the
+ // node its ready cycle can aid heuristics, and after scheduling it can
+ // indicate the scheduled cycle.
+ SU->setHeightToAtLeast(CurCycle);
+
+ // Reserve resources for the scheduled instruction.
+ EmitNode(SU);
+
+ Sequence.push_back(SU);
+
+ AvailableQueue->scheduledNode(SU);
+
+ // If HazardRec is disabled, and each inst counts as one cycle, then
+ // advance CurCycle before ReleasePredecessors to avoid useless pushes to
+ // PendingQueue for schedulers that implement HasReadyFilter.
+ if (!HazardRec->isEnabled() && AvgIPC < 2)
+ AdvanceToCycle(CurCycle + 1);
+
+ // Update liveness of predecessors before successors to avoid treating a
+ // two-address node as a live range def.
+ ReleasePredecessors(SU);
+
+ // Release all the implicit physical register defs that are live.
+ for (SDep &Succ : SU->Succs) {
+    // LiveRegDefs[Succ.getReg()] != SU when SU is a two-address node.
+ if (Succ.isAssignedRegDep() && LiveRegDefs[Succ.getReg()] == SU) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ --NumLiveRegs;
+ LiveRegDefs[Succ.getReg()] = nullptr;
+ LiveRegGens[Succ.getReg()] = nullptr;
+ releaseInterferences(Succ.getReg());
+ }
+ }
+ // Release the special call resource dependence, if this is the beginning
+ // of a call.
+ unsigned CallResource = TRI->getNumRegs();
+ if (LiveRegDefs[CallResource] == SU)
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->isMachineOpcode() &&
+ SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ --NumLiveRegs;
+ LiveRegDefs[CallResource] = nullptr;
+ LiveRegGens[CallResource] = nullptr;
+ releaseInterferences(CallResource);
+ }
+ }
+
+ resetVRegCycle(SU);
+
+ SU->isScheduled = true;
+
+ // Conditions under which the scheduler should eagerly advance the cycle:
+ // (1) No available instructions
+ // (2) All pipelines full, so available instructions must have hazards.
+ //
+ // If HazardRec is disabled, the cycle was pre-advanced before calling
+ // ReleasePredecessors. In that case, IssueCount should remain 0.
+ //
+ // Check AvailableQueue after ReleasePredecessors in case of zero latency.
+ if (HazardRec->isEnabled() || AvgIPC > 1) {
+ if (SU->getNode() && SU->getNode()->isMachineOpcode())
+ ++IssueCount;
+ if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
+ || (!HazardRec->isEnabled() && IssueCount == AvgIPC))
+ AdvanceToCycle(CurCycle + 1);
+ }
+}
+
+/// CapturePred - This does the opposite of ReleasePred. Since SU is being
+/// unscheduled, increase the succ left count of its predecessors. Remove
+/// them from AvailableQueue if necessary.
+void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+ if (PredSU->isAvailable) {
+ PredSU->isAvailable = false;
+ if (!PredSU->isPending)
+ AvailableQueue->remove(PredSU);
+ }
+
+ assert(PredSU->NumSuccsLeft < std::numeric_limits<unsigned>::max() &&
+ "NumSuccsLeft will overflow!");
+ ++PredSU->NumSuccsLeft;
+}
+
+/// UnscheduleNodeBottomUp - Remove the node from the schedule, and update its
+/// and its predecessors' states to reflect the change.
+void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
+ LLVM_DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: ");
+ LLVM_DEBUG(dumpNode(*SU));
+
+ for (SDep &Pred : SU->Preds) {
+ CapturePred(&Pred);
+ if (Pred.isAssignedRegDep() && SU == LiveRegGens[Pred.getReg()]){
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ assert(LiveRegDefs[Pred.getReg()] == Pred.getSUnit() &&
+ "Physical register dependency violated?");
+ --NumLiveRegs;
+ LiveRegDefs[Pred.getReg()] = nullptr;
+ LiveRegGens[Pred.getReg()] = nullptr;
+ releaseInterferences(Pred.getReg());
+ }
+ }
+
+ // Reclaim the special call resource dependence, if this is the beginning
+ // of a call.
+ unsigned CallResource = TRI->getNumRegs();
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->isMachineOpcode() &&
+ SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
+ SUnit *SeqEnd = CallSeqEndForStart[SU];
+ assert(SeqEnd && "Call sequence start/end must be known");
+ assert(!LiveRegDefs[CallResource]);
+ assert(!LiveRegGens[CallResource]);
+ ++NumLiveRegs;
+ LiveRegDefs[CallResource] = SU;
+ LiveRegGens[CallResource] = SeqEnd;
+ }
+ }
+
+ // Release the special call resource dependence, if this is the end
+ // of a call.
+ if (LiveRegGens[CallResource] == SU)
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->isMachineOpcode() &&
+ SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ assert(LiveRegDefs[CallResource]);
+ assert(LiveRegGens[CallResource]);
+ --NumLiveRegs;
+ LiveRegDefs[CallResource] = nullptr;
+ LiveRegGens[CallResource] = nullptr;
+ releaseInterferences(CallResource);
+ }
+ }
+
+ for (auto &Succ : SU->Succs) {
+ if (Succ.isAssignedRegDep()) {
+ auto Reg = Succ.getReg();
+ if (!LiveRegDefs[Reg])
+ ++NumLiveRegs;
+ // This becomes the nearest def. Note that an earlier def may still be
+ // pending if this is a two-address node.
+ LiveRegDefs[Reg] = SU;
+
+      // Update LiveRegGens only if it was empty before this unscheduling,
+      // to avoid incorrectly overwriting an entry set in a previous run.
+ if (!LiveRegGens[Reg]) {
+ // Find the successor with the lowest height.
+ LiveRegGens[Reg] = Succ.getSUnit();
+ for (auto &Succ2 : SU->Succs) {
+ if (Succ2.isAssignedRegDep() && Succ2.getReg() == Reg &&
+ Succ2.getSUnit()->getHeight() < LiveRegGens[Reg]->getHeight())
+ LiveRegGens[Reg] = Succ2.getSUnit();
+ }
+ }
+ }
+ }
+ if (SU->getHeight() < MinAvailableCycle)
+ MinAvailableCycle = SU->getHeight();
+
+ SU->setHeightDirty();
+ SU->isScheduled = false;
+ SU->isAvailable = true;
+ if (!DisableSchedCycles && AvailableQueue->hasReadyFilter()) {
+ // Don't make available until backtracking is complete.
+ SU->isPending = true;
+ PendingQueue.push_back(SU);
+ }
+ else {
+ AvailableQueue->push(SU);
+ }
+ AvailableQueue->unscheduledNode(SU);
+}
+
+/// After backtracking, the hazard checker needs to be restored to a state
+/// corresponding to the current cycle.
+void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() {
+ HazardRec->Reset();
+
+ unsigned LookAhead = std::min((unsigned)Sequence.size(),
+ HazardRec->getMaxLookAhead());
+ if (LookAhead == 0)
+ return;
+
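+  // Replay the last LookAhead scheduled nodes into the freshly reset hazard
+  // recognizer, receding its cycle as needed, so its lookahead window again
+  // matches the tail of the shortened Sequence.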
+ std::vector<SUnit *>::const_iterator I = (Sequence.end() - LookAhead);
+ unsigned HazardCycle = (*I)->getHeight();
+ for (auto E = Sequence.end(); I != E; ++I) {
+ SUnit *SU = *I;
+ for (; SU->getHeight() > HazardCycle; ++HazardCycle) {
+ HazardRec->RecedeCycle();
+ }
+ EmitNode(SU);
+ }
+}
+
+/// BacktrackBottomUp - Unschedule nodes back to and including BtSU so that a
+/// specific node (SU) can be scheduled.
+void ScheduleDAGRRList::BacktrackBottomUp(SUnit *SU, SUnit *BtSU) {
+ SUnit *OldSU = Sequence.back();
+ while (true) {
+ Sequence.pop_back();
+ // FIXME: use ready cycle instead of height
+ CurCycle = OldSU->getHeight();
+ UnscheduleNodeBottomUp(OldSU);
+ AvailableQueue->setCurCycle(CurCycle);
+ if (OldSU == BtSU)
+ break;
+ OldSU = Sequence.back();
+ }
+
+ assert(!SU->isSucc(OldSU) && "Something is wrong!");
+
+ RestoreHazardCheckerBottomUp();
+
+ ReleasePending();
+
+ ++NumBacktracks;
+}
+
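+/// isOperandOf - Return true if SU's node, or any node glued to it, is an
+/// operand of N.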
+static bool isOperandOf(const SUnit *SU, SDNode *N) {
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->isOperandOf(N))
+ return true;
+ }
+ return false;
+}
+
+/// TryUnfoldSU - Attempt to unfold SU's node into a separate load and use.
+/// Returns the SUnit to schedule on success, or null if unfolding is not
+/// possible.
+SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) {
+ SDNode *N = SU->getNode();
+ // Use while over if to ease fall through.
+ SmallVector<SDNode *, 2> NewNodes;
+ if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+ return nullptr;
+
+  // Unfolding an x86 DEC64m operation results in a store, a dec, and a load,
+  // which can't be handled here, so quit.
+ if (NewNodes.size() == 3)
+ return nullptr;
+
+ assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+ N = NewNodes[1];
+ SDNode *LoadNode = NewNodes[0];
+ unsigned NumVals = N->getNumValues();
+ unsigned OldNumVals = SU->getNode()->getNumValues();
+
+ // LoadNode may already exist. This can happen when there is another
+  // load from the same location that produces the same type of value,
+  // but with different alignment or volatility.
+ bool isNewLoad = true;
+ SUnit *LoadSU;
+ if (LoadNode->getNodeId() != -1) {
+ LoadSU = &SUnits[LoadNode->getNodeId()];
+ // If LoadSU has already been scheduled, we should clone it but
+ // this would negate the benefit to unfolding so just return SU.
+ if (LoadSU->isScheduled)
+ return SU;
+ isNewLoad = false;
+ } else {
+ LoadSU = CreateNewSUnit(LoadNode);
+ LoadNode->setNodeId(LoadSU->NodeNum);
+
+ InitNumRegDefsLeft(LoadSU);
+ computeLatency(LoadSU);
+ }
+
+ bool isNewN = true;
+ SUnit *NewSU;
+ // This can only happen when isNewLoad is false.
+ if (N->getNodeId() != -1) {
+ NewSU = &SUnits[N->getNodeId()];
+ // If NewSU has already been scheduled, we need to clone it, but this
+ // negates the benefit to unfolding so just return SU.
+ if (NewSU->isScheduled) {
+ return SU;
+ }
+ isNewN = false;
+ } else {
+ NewSU = CreateNewSUnit(N);
+ N->setNodeId(NewSU->NodeNum);
+
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+ if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+ NewSU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (MCID.isCommutable())
+ NewSU->isCommutable = true;
+
+ InitNumRegDefsLeft(NewSU);
+ computeLatency(NewSU);
+ }
+
+ LLVM_DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
+
+ // Now that we are committed to unfolding replace DAG Uses.
+ for (unsigned i = 0; i != NumVals; ++i)
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals - 1),
+ SDValue(LoadNode, 1));
+
+ // Record all the edges to and from the old SU, by category.
+ SmallVector<SDep, 4> ChainPreds;
+ SmallVector<SDep, 4> ChainSuccs;
+ SmallVector<SDep, 4> LoadPreds;
+ SmallVector<SDep, 4> NodePreds;
+ SmallVector<SDep, 4> NodeSuccs;
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ ChainPreds.push_back(Pred);
+ else if (isOperandOf(Pred.getSUnit(), LoadNode))
+ LoadPreds.push_back(Pred);
+ else
+ NodePreds.push_back(Pred);
+ }
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
+ ChainSuccs.push_back(Succ);
+ else
+ NodeSuccs.push_back(Succ);
+ }
+
+ // Now assign edges to the newly-created nodes.
+ for (const SDep &Pred : ChainPreds) {
+ RemovePred(SU, Pred);
+ if (isNewLoad)
+ AddPredQueued(LoadSU, Pred);
+ }
+ for (const SDep &Pred : LoadPreds) {
+ RemovePred(SU, Pred);
+ if (isNewLoad)
+ AddPredQueued(LoadSU, Pred);
+ }
+ for (const SDep &Pred : NodePreds) {
+ RemovePred(SU, Pred);
+ AddPredQueued(NewSU, Pred);
+ }
+ for (SDep &D : NodeSuccs) {
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ D.setSUnit(NewSU);
+ AddPredQueued(SuccDep, D);
+ // Balance register pressure.
+ if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled &&
+ !D.isCtrl() && NewSU->NumRegDefsLeft > 0)
+ --NewSU->NumRegDefsLeft;
+ }
+ for (SDep &D : ChainSuccs) {
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ if (isNewLoad) {
+ D.setSUnit(LoadSU);
+ AddPredQueued(SuccDep, D);
+ }
+ }
+
+ // Add a data dependency to reflect that NewSU reads the value defined
+ // by LoadSU.
+ SDep D(LoadSU, SDep::Data, 0);
+ D.setLatency(LoadSU->Latency);
+ AddPredQueued(NewSU, D);
+
+ if (isNewLoad)
+ AvailableQueue->addNode(LoadSU);
+ if (isNewN)
+ AvailableQueue->addNode(NewSU);
+
+ ++NumUnfolds;
+
+ if (NewSU->NumSuccsLeft == 0)
+ NewSU->isAvailable = true;
+
+ return NewSU;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
+ SDNode *N = SU->getNode();
+ if (!N)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "Considering duplicating the SU\n");
+ LLVM_DEBUG(dumpNode(*SU));
+
+ if (N->getGluedNode() &&
+ !TII->canCopyGluedNodeDuringSchedule(N)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Giving up because it has incoming glue and the target does not "
+ "want to copy it\n");
+ return nullptr;
+ }
+
+ SUnit *NewSU;
+ bool TryUnfold = false;
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (VT == MVT::Glue) {
+ LLVM_DEBUG(dbgs() << "Giving up because it has outgoing glue\n");
+ return nullptr;
+ } else if (VT == MVT::Other)
+ TryUnfold = true;
+ }
+ for (const SDValue &Op : N->op_values()) {
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+ if (VT == MVT::Glue && !TII->canCopyGluedNodeDuringSchedule(N)) {
+ LLVM_DEBUG(
+ dbgs() << "Giving up because it one of the operands is glue and "
+ "the target does not want to copy it\n");
+ return nullptr;
+ }
+ }
+
+ // If possible unfold instruction.
+ if (TryUnfold) {
+ SUnit *UnfoldSU = TryUnfoldSU(SU);
+ if (!UnfoldSU)
+ return nullptr;
+ SU = UnfoldSU;
+ N = SU->getNode();
+ // If this can be scheduled don't bother duplicating and just return
+ if (SU->NumSuccsLeft == 0)
+ return SU;
+ }
+
+ LLVM_DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n");
+ NewSU = CreateClone(SU);
+
+ // New SUnit has the exact same predecessors.
+ for (SDep &Pred : SU->Preds)
+ if (!Pred.isArtificial())
+ AddPredQueued(NewSU, Pred);
+
+ // Make sure the clone comes after the original. (InstrEmitter assumes
+ // this ordering.)
+ AddPredQueued(NewSU, SDep(SU, SDep::Artificial));
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = Succ;
+ D.setSUnit(NewSU);
+ AddPredQueued(SuccSU, D);
+ D.setSUnit(SU);
+ DelDeps.emplace_back(SuccSU, D);
+ }
+ }
+ for (const auto &[DelSU, DelD] : DelDeps)
+ RemovePred(DelSU, DelD);
+
+ AvailableQueue->updateNode(SU);
+ AvailableQueue->addNode(NewSU);
+
+ ++NumDups;
+ return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC,
+ SmallVectorImpl<SUnit*> &Copies) {
+ SUnit *CopyFromSU = CreateNewSUnit(nullptr);
+ CopyFromSU->CopySrcRC = SrcRC;
+ CopyFromSU->CopyDstRC = DestRC;
+
+ SUnit *CopyToSU = CreateNewSUnit(nullptr);
+ CopyToSU->CopySrcRC = DestRC;
+ CopyToSU->CopyDstRC = SrcRC;
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = Succ;
+ D.setSUnit(CopyToSU);
+ AddPredQueued(SuccSU, D);
+ DelDeps.emplace_back(SuccSU, Succ);
+ }
+ else {
+ // Avoid scheduling the def-side copy before other successors. Otherwise,
+ // we could introduce another physreg interference on the copy and
+ // continue inserting copies indefinitely.
+ AddPredQueued(SuccSU, SDep(CopyFromSU, SDep::Artificial));
+ }
+ }
+ for (const auto &[DelSU, DelD] : DelDeps)
+ RemovePred(DelSU, DelD);
+
+ SDep FromDep(SU, SDep::Data, Reg);
+ FromDep.setLatency(SU->Latency);
+ AddPredQueued(CopyFromSU, FromDep);
+ SDep ToDep(CopyFromSU, SDep::Data, 0);
+ ToDep.setLatency(CopyFromSU->Latency);
+ AddPredQueued(CopyToSU, ToDep);
+
+ AvailableQueue->updateNode(SU);
+ AvailableQueue->addNode(CopyFromSU);
+ AvailableQueue->addNode(CopyToSU);
+ Copies.push_back(CopyFromSU);
+ Copies.push_back(CopyToSU);
+
+ ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+ const TargetInstrInfo *TII) {
+ unsigned NumRes;
+ if (N->getOpcode() == ISD::CopyFromReg) {
+ // CopyFromReg has: "chain, Val, glue" so operand 1 gives the type.
+ NumRes = 1;
+ } else {
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ assert(!MCID.implicit_defs().empty() &&
+ "Physical reg def must be in implicit def list!");
+ NumRes = MCID.getNumDefs();
+ for (MCPhysReg ImpDef : MCID.implicit_defs()) {
+ if (Reg == ImpDef)
+ break;
+ ++NumRes;
+ }
+ }
+ return N->getSimpleValueType(NumRes);
+}
+
+/// CheckForLiveRegDef - Record in RegAdded/LRegs any "live" registers that the
+/// specified register def of the specified SUnit would clobber.
+static void CheckForLiveRegDef(SUnit *SU, unsigned Reg, SUnit **LiveRegDefs,
+ SmallSet<unsigned, 4> &RegAdded,
+ SmallVectorImpl<unsigned> &LRegs,
+ const TargetRegisterInfo *TRI,
+ const SDNode *Node = nullptr) {
+ for (MCRegAliasIterator AliasI(Reg, TRI, true); AliasI.isValid(); ++AliasI) {
+
+ // Check if Ref is live.
+ if (!LiveRegDefs[*AliasI]) continue;
+
+ // Allow multiple uses of the same def.
+ if (LiveRegDefs[*AliasI] == SU) continue;
+
+    // Also allow the use when the live def comes from the given Node.
+ if (Node && LiveRegDefs[*AliasI]->getNode() == Node)
+ continue;
+
+ // Add Reg to the set of interfering live regs.
+ if (RegAdded.insert(*AliasI).second) {
+ LRegs.push_back(*AliasI);
+ }
+ }
+}
+
+/// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered
+/// by RegMask, and add them to LRegs.
+static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask,
+ ArrayRef<SUnit*> LiveRegDefs,
+ SmallSet<unsigned, 4> &RegAdded,
+ SmallVectorImpl<unsigned> &LRegs) {
+ // Look at all live registers. Skip Reg0 and the special CallResource.
+ for (unsigned i = 1, e = LiveRegDefs.size()-1; i != e; ++i) {
+ if (!LiveRegDefs[i]) continue;
+ if (LiveRegDefs[i] == SU) continue;
+ if (!MachineOperand::clobbersPhysReg(RegMask, i)) continue;
+ if (RegAdded.insert(i).second)
+ LRegs.push_back(i);
+ }
+}
+
+/// getNodeRegMask - Returns the register mask attached to an SDNode, if any.
+static const uint32_t *getNodeRegMask(const SDNode *N) {
+ for (const SDValue &Op : N->op_values())
+ if (const auto *RegOp = dyn_cast<RegisterMaskSDNode>(Op.getNode()))
+ return RegOp->getRegMask();
+ return nullptr;
+}
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specific node is the last one that's available to schedule, do
+/// whatever is necessary (i.e. backtracking or cloning) to make it possible.
+bool ScheduleDAGRRList::
+DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
+ if (NumLiveRegs == 0)
+ return false;
+
+ SmallSet<unsigned, 4> RegAdded;
+ // If this node would clobber any "live" register, then it's not ready.
+ //
+ // If SU is the currently live definition of the same register that it uses,
+ // then we are free to schedule it.
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isAssignedRegDep() && LiveRegDefs[Pred.getReg()] != SU)
+ CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs.get(),
+ RegAdded, LRegs, TRI);
+ }
+
+ for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) {
+ if (Node->getOpcode() == ISD::INLINEASM ||
+ Node->getOpcode() == ISD::INLINEASM_BR) {
+ // Inline asm can clobber physical defs.
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+ --NumOps; // Ignore the glue operand.
+
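+      // Walk the operand groups: each group starts with a flag word that
+      // encodes its kind and the number of register operands that follow.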
+ for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+ unsigned Flags =
+ cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+ ++i; // Skip the ID value.
+ if (InlineAsm::isRegDefKind(Flags) ||
+ InlineAsm::isRegDefEarlyClobberKind(Flags) ||
+ InlineAsm::isClobberKind(Flags)) {
+ // Check for def of register or earlyclobber register.
+ for (; NumVals; --NumVals, ++i) {
+ Register Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ if (Reg.isPhysical())
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
+ }
+ } else
+ i += NumVals;
+ }
+ continue;
+ }
+
+ if (Node->getOpcode() == ISD::CopyToReg) {
+ Register Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ if (Reg.isPhysical()) {
+ SDNode *SrcNode = Node->getOperand(2).getNode();
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI,
+ SrcNode);
+ }
+ }
+
+ if (!Node->isMachineOpcode())
+ continue;
+ // If we're in the middle of scheduling a call, don't begin scheduling
+ // another call. Also, don't allow any physical registers to be live across
+ // the call.
+ if (Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
+ // Check the special calling-sequence resource.
+ unsigned CallResource = TRI->getNumRegs();
+ if (LiveRegDefs[CallResource]) {
+ SDNode *Gen = LiveRegGens[CallResource]->getNode();
+ while (SDNode *Glued = Gen->getGluedNode())
+ Gen = Glued;
+ if (!IsChainDependent(Gen, Node, 0, TII) &&
+ RegAdded.insert(CallResource).second)
+ LRegs.push_back(CallResource);
+ }
+ }
+ if (const uint32_t *RegMask = getNodeRegMask(Node))
+ CheckForLiveRegDefMasked(SU, RegMask,
+ ArrayRef(LiveRegDefs.get(), TRI->getNumRegs()),
+ RegAdded, LRegs);
+
+ const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
+ if (MCID.hasOptionalDef()) {
+ // Most ARM instructions have an OptionalDef for CPSR, to model the S-bit.
+ // This operand can be either a def of CPSR, if the S bit is set; or a use
+ // of %noreg. When the OptionalDef is set to a valid register, we need to
+ // handle it in the same way as an ImplicitDef.
+ for (unsigned i = 0; i < MCID.getNumDefs(); ++i)
+ if (MCID.operands()[i].isOptionalDef()) {
+ const SDValue &OptionalDef = Node->getOperand(i - Node->getNumValues());
+ Register Reg = cast<RegisterSDNode>(OptionalDef)->getReg();
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
+ }
+ }
+ for (MCPhysReg Reg : MCID.implicit_defs())
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
+ }
+
+ return !LRegs.empty();
+}
+
+void ScheduleDAGRRList::releaseInterferences(unsigned Reg) {
+ // Add the nodes that aren't ready back onto the available list.
+ for (unsigned i = Interferences.size(); i > 0; --i) {
+ SUnit *SU = Interferences[i-1];
+ LRegsMapT::iterator LRegsPos = LRegsMap.find(SU);
+ if (Reg) {
+ SmallVectorImpl<unsigned> &LRegs = LRegsPos->second;
+ if (!is_contained(LRegs, Reg))
+ continue;
+ }
+ SU->isPending = false;
+ // The interfering node may no longer be available due to backtracking.
+ // Furthermore, it may have been made available again, in which case it is
+ // now already in the AvailableQueue.
+ if (SU->isAvailable && !SU->NodeQueueId) {
+ LLVM_DEBUG(dbgs() << " Repushing SU #" << SU->NodeNum << '\n');
+ AvailableQueue->push(SU);
+ }
+ if (i < Interferences.size())
+ Interferences[i-1] = Interferences.back();
+ Interferences.pop_back();
+ LRegsMap.erase(LRegsPos);
+ }
+}
+
+/// Return a node that can be scheduled in this cycle. Requirements:
+/// (1) Ready: latency has been satisfied
+/// (2) No Hazards: resources are available
+/// (3) No Interferences: may unschedule to break register interferences.
+SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
+ SUnit *CurSU = AvailableQueue->empty() ? nullptr : AvailableQueue->pop();
+ auto FindAvailableNode = [&]() {
+ while (CurSU) {
+ SmallVector<unsigned, 4> LRegs;
+ if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+ break;
+ LLVM_DEBUG(dbgs() << " Interfering reg ";
+ if (LRegs[0] == TRI->getNumRegs()) dbgs() << "CallResource";
+ else dbgs() << printReg(LRegs[0], TRI);
+ dbgs() << " SU #" << CurSU->NodeNum << '\n');
+ auto [LRegsIter, LRegsInserted] = LRegsMap.try_emplace(CurSU, LRegs);
+ if (LRegsInserted) {
+ CurSU->isPending = true; // This SU is not in AvailableQueue right now.
+ Interferences.push_back(CurSU);
+ }
+ else {
+ assert(CurSU->isPending && "Interferences are pending");
+ // Update the interference with current live regs.
+ LRegsIter->second = LRegs;
+ }
+ CurSU = AvailableQueue->pop();
+ }
+ };
+ FindAvailableNode();
+ if (CurSU)
+ return CurSU;
+
+ // We query the topological order in the loop body, so make sure outstanding
+ // updates are applied before entering it (we only enter the loop if there
+ // are some interferences). If we make changes to the ordering, we exit
+ // the loop.
+
+ // All candidates are delayed due to live physical reg dependencies.
+ // Try backtracking, code duplication, or inserting cross class copies
+ // to resolve it.
+ for (SUnit *TrySU : Interferences) {
+ SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
+
+ // Try unscheduling up to the point where it's safe to schedule
+ // this node.
+ SUnit *BtSU = nullptr;
+ unsigned LiveCycle = std::numeric_limits<unsigned>::max();
+ for (unsigned Reg : LRegs) {
+ if (LiveRegGens[Reg]->getHeight() < LiveCycle) {
+ BtSU = LiveRegGens[Reg];
+ LiveCycle = BtSU->getHeight();
+ }
+ }
+ if (!WillCreateCycle(TrySU, BtSU)) {
+ // BacktrackBottomUp mutates Interferences!
+ BacktrackBottomUp(TrySU, BtSU);
+
+ // Force the current node to be scheduled before the node that
+ // requires the physical reg dep.
+ if (BtSU->isAvailable) {
+ BtSU->isAvailable = false;
+ if (!BtSU->isPending)
+ AvailableQueue->remove(BtSU);
+ }
+ LLVM_DEBUG(dbgs() << "ARTIFICIAL edge from SU(" << BtSU->NodeNum
+ << ") to SU(" << TrySU->NodeNum << ")\n");
+ AddPredQueued(TrySU, SDep(BtSU, SDep::Artificial));
+
+ // If one or more successors has been unscheduled, then the current
+ // node is no longer available.
+ if (!TrySU->isAvailable || !TrySU->NodeQueueId) {
+ LLVM_DEBUG(dbgs() << "TrySU not available; choosing node from queue\n");
+ CurSU = AvailableQueue->pop();
+ } else {
+ LLVM_DEBUG(dbgs() << "TrySU available\n");
+ // Available and in AvailableQueue
+ AvailableQueue->remove(TrySU);
+ CurSU = TrySU;
+ }
+ FindAvailableNode();
+ // Interferences has been mutated. We must break.
+ break;
+ }
+ }
+
+ if (!CurSU) {
+    // Can't backtrack. If it's too expensive to copy the value, then try to
+    // duplicate the nodes that produce these "too expensive to copy"
+    // values to break the dependency. In case even that doesn't work,
+ // insert cross class copies.
+ // If it's not too expensive, i.e. cost != -1, issue copies.
+ SUnit *TrySU = Interferences[0];
+ SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
+ assert(LRegs.size() == 1 && "Can't handle this yet!");
+ unsigned Reg = LRegs[0];
+ SUnit *LRDef = LiveRegDefs[Reg];
+ MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, VT);
+ const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+    // If the cross copy register class is the same as RC, then it must be
+    // possible to copy the value directly. Do not try to duplicate the def.
+    // If the cross copy register class is not the same as RC, then it's
+    // possible to copy the value but it requires cross register class copies
+    // and is expensive.
+ // If cross copy register class is null, then it's not possible to copy
+ // the value at all.
+ SUnit *NewDef = nullptr;
+ if (DestRC != RC) {
+ NewDef = CopyAndMoveSuccessors(LRDef);
+ if (!DestRC && !NewDef)
+ report_fatal_error("Can't handle live physical register dependency!");
+ }
+ if (!NewDef) {
+ // Issue copies, these can be expensive cross register class copies.
+ SmallVector<SUnit*, 2> Copies;
+ InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+ LLVM_DEBUG(dbgs() << " Adding an edge from SU #" << TrySU->NodeNum
+ << " to SU #" << Copies.front()->NodeNum << "\n");
+ AddPredQueued(TrySU, SDep(Copies.front(), SDep::Artificial));
+ NewDef = Copies.back();
+ }
+
+ LLVM_DEBUG(dbgs() << " Adding an edge from SU #" << NewDef->NodeNum
+ << " to SU #" << TrySU->NodeNum << "\n");
+ LiveRegDefs[Reg] = NewDef;
+ AddPredQueued(NewDef, SDep(TrySU, SDep::Artificial));
+ TrySU->isAvailable = false;
+ CurSU = NewDef;
+ }
+ assert(CurSU && "Unable to resolve live physical register dependencies!");
+ return CurSU;
+}
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGRRList::ListScheduleBottomUp() {
+ // Release any predecessors of the special Exit node.
+ ReleasePredecessors(&ExitSU);
+
+ // Add root to Available queue.
+ if (!SUnits.empty()) {
+ SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+ assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+ RootSU->isAvailable = true;
+ AvailableQueue->push(RootSU);
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue->empty() || !Interferences.empty()) {
+ LLVM_DEBUG(dbgs() << "\nExamining Available:\n";
+ AvailableQueue->dump(this));
+
+ // Pick the best node to schedule taking all constraints into
+ // consideration.
+ SUnit *SU = PickNodeToScheduleBottomUp();
+
+ AdvancePastStalls(SU);
+
+ ScheduleNodeBottomUp(SU);
+
+ while (AvailableQueue->empty() && !PendingQueue.empty()) {
+ // Advance the cycle to free resources. Skip ahead to the next ready SU.
+ assert(MinAvailableCycle < std::numeric_limits<unsigned>::max() &&
+ "MinAvailableCycle uninitialized");
+ AdvanceToCycle(std::max(CurCycle + 1, MinAvailableCycle));
+ }
+ }
+
+ // Reverse the order since this is bottom-up scheduling.
+ std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+ VerifyScheduledSequence(/*isBottomUp=*/true);
+#endif
+}
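+
+// Editor's illustrative sketch for ListScheduleBottomUp above (not part of
+// the original source): for a tiny DAG
+//   a = load; b = load; c = add a, b; d = store c
+// the bottom-up pass schedules d first, then c, then a and b (order chosen
+// by the priority queue), e.g. Sequence = [d, c, b, a]; the std::reverse at
+// the end turns that into the emitted order [a, b, c, d].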
+
+namespace {
+
+class RegReductionPQBase;
+
+struct queue_sort {
+ bool isReady(SUnit* SU, unsigned CurCycle) const { return true; }
+};
+
+#ifndef NDEBUG
+template<class SF>
+struct reverse_sort : public queue_sort {
+ SF &SortFunc;
+
+ reverse_sort(SF &sf) : SortFunc(sf) {}
+
+ bool operator()(SUnit* left, SUnit* right) const {
+ // reverse left/right rather than simply !SortFunc(left, right)
+ // to expose different paths in the comparison logic.
+ return SortFunc(right, left);
+ }
+};
+#endif // NDEBUG
+
+/// bu_ls_rr_sort - Priority function for bottom up register pressure
+/// reduction scheduler.
+struct bu_ls_rr_sort : public queue_sort {
+ enum {
+ IsBottomUp = true,
+ HasReadyFilter = false
+ };
+
+ RegReductionPQBase *SPQ;
+
+ bu_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}
+
+ bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// src_ls_rr_sort - Priority function for source order scheduler.
+struct src_ls_rr_sort : public queue_sort {
+ enum {
+ IsBottomUp = true,
+ HasReadyFilter = false
+ };
+
+ RegReductionPQBase *SPQ;
+
+ src_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}
+
+ bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// hybrid_ls_rr_sort - Priority function for hybrid scheduler.
+struct hybrid_ls_rr_sort : public queue_sort {
+ enum {
+ IsBottomUp = true,
+ HasReadyFilter = false
+ };
+
+ RegReductionPQBase *SPQ;
+
+ hybrid_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}
+
+ bool isReady(SUnit *SU, unsigned CurCycle) const;
+
+ bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// ilp_ls_rr_sort - Priority function for ILP (instruction level parallelism)
+// scheduler.
+struct ilp_ls_rr_sort : public queue_sort {
+ enum {
+ IsBottomUp = true,
+ HasReadyFilter = false
+ };
+
+ RegReductionPQBase *SPQ;
+
+ ilp_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}
+
+ bool isReady(SUnit *SU, unsigned CurCycle) const;
+
+ bool operator()(SUnit* left, SUnit* right) const;
+};
+
+class RegReductionPQBase : public SchedulingPriorityQueue {
+protected:
+ std::vector<SUnit *> Queue;
+ unsigned CurQueueId = 0;
+ bool TracksRegPressure;
+ bool SrcOrder;
+
+ // SUnits - The SUnits for the current graph.
+ std::vector<SUnit> *SUnits = nullptr;
+
+ MachineFunction &MF;
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetLowering *TLI = nullptr;
+ ScheduleDAGRRList *scheduleDAG = nullptr;
+
+ // SethiUllmanNumbers - The SethiUllman number for each node.
+ std::vector<unsigned> SethiUllmanNumbers;
+
+ /// RegPressure - Tracking current reg pressure per register class.
+ std::vector<unsigned> RegPressure;
+
+ /// RegLimit - Tracking the number of allocatable registers per register
+ /// class.
+ std::vector<unsigned> RegLimit;
+
+public:
+ RegReductionPQBase(MachineFunction &mf,
+ bool hasReadyFilter,
+ bool tracksrp,
+ bool srcorder,
+ const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri,
+ const TargetLowering *tli)
+ : SchedulingPriorityQueue(hasReadyFilter), TracksRegPressure(tracksrp),
+ SrcOrder(srcorder), MF(mf), TII(tii), TRI(tri), TLI(tli) {
+ if (TracksRegPressure) {
+ unsigned NumRC = TRI->getNumRegClasses();
+ RegLimit.resize(NumRC);
+ RegPressure.resize(NumRC);
+ std::fill(RegLimit.begin(), RegLimit.end(), 0);
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+ for (const TargetRegisterClass *RC : TRI->regclasses())
+ RegLimit[RC->getID()] = tri->getRegPressureLimit(RC, MF);
+ }
+ }
+
+ void setScheduleDAG(ScheduleDAGRRList *scheduleDag) {
+ scheduleDAG = scheduleDag;
+ }
+
+ ScheduleHazardRecognizer* getHazardRec() {
+ return scheduleDAG->getHazardRec();
+ }
+
+ void initNodes(std::vector<SUnit> &sunits) override;
+
+ void addNode(const SUnit *SU) override;
+
+ void updateNode(const SUnit *SU) override;
+
+ void releaseState() override {
+ SUnits = nullptr;
+ SethiUllmanNumbers.clear();
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+ }
+
+ unsigned getNodePriority(const SUnit *SU) const;
+
+ unsigned getNodeOrdering(const SUnit *SU) const {
+ if (!SU->getNode()) return 0;
+
+ return SU->getNode()->getIROrder();
+ }
+
+ bool empty() const override { return Queue.empty(); }
+
+ void push(SUnit *U) override {
+ assert(!U->NodeQueueId && "Node in the queue already");
+ U->NodeQueueId = ++CurQueueId;
+ Queue.push_back(U);
+ }
+
+ void remove(SUnit *SU) override {
+ assert(!Queue.empty() && "Queue is empty!");
+ assert(SU->NodeQueueId != 0 && "Not in queue!");
+ std::vector<SUnit *>::iterator I = llvm::find(Queue, SU);
+ if (I != std::prev(Queue.end()))
+ std::swap(*I, Queue.back());
+ Queue.pop_back();
+ SU->NodeQueueId = 0;
+ }
+
+ bool tracksRegPressure() const override { return TracksRegPressure; }
+
+ void dumpRegPressure() const;
+
+ bool HighRegPressure(const SUnit *SU) const;
+
+ bool MayReduceRegPressure(SUnit *SU) const;
+
+ int RegPressureDiff(SUnit *SU, unsigned &LiveUses) const;
+
+ void scheduledNode(SUnit *SU) override;
+
+ void unscheduledNode(SUnit *SU) override;
+
+protected:
+ bool canClobber(const SUnit *SU, const SUnit *Op);
+ void AddPseudoTwoAddrDeps();
+ void PrescheduleNodesWithMultipleUses();
+ void CalculateSethiUllmanNumbers();
+};
+
+template<class SF>
+static SUnit *popFromQueueImpl(std::vector<SUnit *> &Q, SF &Picker) {
+ unsigned BestIdx = 0;
+ // Only compute the cost for the first 1000 items in the queue, to avoid
+ // excessive compile-times for very large queues.
+ for (unsigned I = 1, E = std::min(Q.size(), (decltype(Q.size()))1000); I != E;
+ I++)
+ if (Picker(Q[BestIdx], Q[I]))
+ BestIdx = I;
+ SUnit *V = Q[BestIdx];
+ if (BestIdx + 1 != Q.size())
+ std::swap(Q[BestIdx], Q.back());
+ Q.pop_back();
+ return V;
+}
+
+template<class SF>
+SUnit *popFromQueue(std::vector<SUnit *> &Q, SF &Picker, ScheduleDAG *DAG) {
+#ifndef NDEBUG
+ if (DAG->StressSched) {
+ reverse_sort<SF> RPicker(Picker);
+ return popFromQueueImpl(Q, RPicker);
+ }
+#endif
+ (void)DAG;
+ return popFromQueueImpl(Q, Picker);
+}
+
+//===----------------------------------------------------------------------===//
+// RegReductionPriorityQueue Definition
+//===----------------------------------------------------------------------===//
+//
+// This is a SchedulingPriorityQueue that schedules using Sethi Ullman numbers
+// to reduce register pressure.
+//
+template<class SF>
+class RegReductionPriorityQueue : public RegReductionPQBase {
+ SF Picker;
+
+public:
+ RegReductionPriorityQueue(MachineFunction &mf,
+ bool tracksrp,
+ bool srcorder,
+ const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri,
+ const TargetLowering *tli)
+ : RegReductionPQBase(mf, SF::HasReadyFilter, tracksrp, srcorder,
+ tii, tri, tli),
+ Picker(this) {}
+
+ bool isBottomUp() const override { return SF::IsBottomUp; }
+
+ bool isReady(SUnit *U) const override {
+ return Picker.HasReadyFilter && Picker.isReady(U, getCurCycle());
+ }
+
+ SUnit *pop() override {
+ if (Queue.empty()) return nullptr;
+
+ SUnit *V = popFromQueue(Queue, Picker, scheduleDAG);
+ V->NodeQueueId = 0;
+ return V;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override {
+ // Emulate pop() without clobbering NodeQueueIds.
+ std::vector<SUnit *> DumpQueue = Queue;
+ SF DumpPicker = Picker;
+ while (!DumpQueue.empty()) {
+ SUnit *SU = popFromQueue(DumpQueue, DumpPicker, scheduleDAG);
+ dbgs() << "Height " << SU->getHeight() << ": ";
+ DAG->dumpNode(*SU);
+ }
+ }
+#endif
+};
+
+using BURegReductionPriorityQueue = RegReductionPriorityQueue<bu_ls_rr_sort>;
+using SrcRegReductionPriorityQueue = RegReductionPriorityQueue<src_ls_rr_sort>;
+using HybridBURRPriorityQueue = RegReductionPriorityQueue<hybrid_ls_rr_sort>;
+using ILPBURRPriorityQueue = RegReductionPriorityQueue<ilp_ls_rr_sort>;
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Static Node Priority for Register Pressure Reduction
+//===----------------------------------------------------------------------===//
+
+// Check for special nodes that bypass scheduling heuristics.
+// Currently this pushes TokenFactor nodes down, but may be used for other
+// pseudo-ops as well.
+//
+// Return -1 to schedule right above left, 1 for left above right.
+// Return 0 if no bias exists.
+static int checkSpecialNodes(const SUnit *left, const SUnit *right) {
+ bool LSchedLow = left->isScheduleLow;
+ bool RSchedLow = right->isScheduleLow;
+ if (LSchedLow != RSchedLow)
+ return LSchedLow < RSchedLow ? 1 : -1;
+ return 0;
+}
+
+/// CalcNodeSethiUllmanNumber - Compute the Sethi-Ullman number.
+/// A smaller number means a higher priority.
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+ if (SUNumbers[SU->NodeNum] != 0)
+ return SUNumbers[SU->NodeNum];
+
+ // Use WorkList to avoid stack overflow on excessively large IRs.
+ struct WorkState {
+ WorkState(const SUnit *SU) : SU(SU) {}
+ const SUnit *SU;
+ unsigned PredsProcessed = 0;
+ };
+
+ SmallVector<WorkState, 16> WorkList;
+ WorkList.push_back(SU);
+ while (!WorkList.empty()) {
+ auto &Temp = WorkList.back();
+ auto *TempSU = Temp.SU;
+ bool AllPredsKnown = true;
+ // Try to find a non-evaluated pred and push it into the processing stack.
+ for (unsigned P = Temp.PredsProcessed; P < TempSU->Preds.size(); ++P) {
+ auto &Pred = TempSU->Preds[P];
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ if (SUNumbers[PredSU->NodeNum] == 0) {
+#ifndef NDEBUG
+ // In debug mode, check that we don't have such element in the stack.
+ for (auto It : WorkList)
+ assert(It.SU != PredSU && "Trying to push an element twice?");
+#endif
+ // Next time start processing this one starting from the next pred.
+ Temp.PredsProcessed = P + 1;
+ WorkList.push_back(PredSU);
+ AllPredsKnown = false;
+ break;
+ }
+ }
+
+ if (!AllPredsKnown)
+ continue;
+
+ // Once all preds are known, we can calculate the answer for this one.
+ unsigned SethiUllmanNumber = 0;
+ unsigned Extra = 0;
+ for (const SDep &Pred : TempSU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ unsigned PredSethiUllman = SUNumbers[PredSU->NodeNum];
+ assert(PredSethiUllman > 0 && "We should have evaluated this pred!");
+ if (PredSethiUllman > SethiUllmanNumber) {
+ SethiUllmanNumber = PredSethiUllman;
+ Extra = 0;
+ } else if (PredSethiUllman == SethiUllmanNumber)
+ ++Extra;
+ }
+
+ SethiUllmanNumber += Extra;
+ if (SethiUllmanNumber == 0)
+ SethiUllmanNumber = 1;
+ SUNumbers[TempSU->NodeNum] = SethiUllmanNumber;
+ WorkList.pop_back();
+ }
+
+ assert(SUNumbers[SU->NodeNum] > 0 && "SethiUllman should never be zero!");
+ return SUNumbers[SU->NodeNum];
+}
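+
+// Editor's worked example for CalcNodeSethiUllmanNumber above (illustrative,
+// not from the original source): leaves with no non-chain predecessors are
+// clamped to 1. For c = add a, b with leaf operands a and b, both preds tie
+// at 1, so Extra becomes 1 and c gets 1 + 1 = 2. For d = add c, e with a
+// leaf e, only c holds the maximum, so d keeps 2. These numbers feed
+// RegReductionPQBase::getNodePriority below.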
+
+/// CalculateSethiUllmanNumbers - Calculate Sethi-Ullman numbers of all
+/// scheduling units.
+void RegReductionPQBase::CalculateSethiUllmanNumbers() {
+ SethiUllmanNumbers.assign(SUnits->size(), 0);
+
+ for (const SUnit &SU : *SUnits)
+ CalcNodeSethiUllmanNumber(&SU, SethiUllmanNumbers);
+}
+
+void RegReductionPQBase::addNode(const SUnit *SU) {
+ unsigned SUSize = SethiUllmanNumbers.size();
+ if (SUnits->size() > SUSize)
+ SethiUllmanNumbers.resize(SUSize*2, 0);
+ CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+}
+
+void RegReductionPQBase::updateNode(const SUnit *SU) {
+ SethiUllmanNumbers[SU->NodeNum] = 0;
+ CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+}
+
+// Lower priority means schedule further down. For bottom-up scheduling, lower
+// priority SUs are scheduled before higher priority SUs.
+unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
+ assert(SU->NodeNum < SethiUllmanNumbers.size());
+ unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
+ if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
+ // CopyToReg should be close to its uses to facilitate coalescing and
+ // avoid spilling.
+ return 0;
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG ||
+ Opc == TargetOpcode::INSERT_SUBREG)
+ // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
+ // close to their uses to facilitate coalescing.
+ return 0;
+ if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+ // If SU does not have a register use, i.e. it doesn't produce a value
+ // that would be consumed (e.g. store), then it terminates a chain of
+ // computation. Give it a large SethiUllman number so it will be
+ // scheduled right before its predecessors, so that it doesn't lengthen
+ // their live ranges.
+ return 0xffff;
+ if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+ // If SU does not have a register def, schedule it close to its uses
+ // because it does not lengthen any live ranges.
+ return 0;
+#if 1
+ return SethiUllmanNumbers[SU->NodeNum];
+#else
+ unsigned Priority = SethiUllmanNumbers[SU->NodeNum];
+ if (SU->isCallOp) {
+ // FIXME: This assumes all of the defs are used as call operands.
+ int NP = (int)Priority - SU->getNode()->getNumValues();
+ return (NP > 0) ? NP : 0;
+ }
+ return Priority;
+#endif
+}
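+
+// Editor's summary of getNodePriority above (not from the original source):
+// TokenFactor, CopyToReg, and the subreg pseudo-ops return 0 so they stay
+// next to their uses; a chain-terminating node with data predecessors but no
+// data successors (e.g. a store) returns 0xffff; a node with data successors
+// but no data predecessors returns 0; everything else returns its
+// Sethi-Ullman number.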
+
+//===----------------------------------------------------------------------===//
+// Register Pressure Tracking
+//===----------------------------------------------------------------------===//
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const {
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
+ unsigned Id = RC->getID();
+ unsigned RP = RegPressure[Id];
+ if (!RP) continue;
+ LLVM_DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "
+ << RegLimit[Id] << '\n');
+ }
+}
+#endif
+
+bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
+ if (!TLI)
+ return false;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+ SUnit *PredSU = Pred.getSUnit();
+ // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+ // to cover the number of registers defined (they are all live).
+ if (PredSU->NumRegDefsLeft == 0) {
+ continue;
+ }
+ for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+ RegDefPos.IsValid(); RegDefPos.Advance()) {
+ unsigned RCId, Cost;
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+
+ if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
+ return true;
+ }
+ }
+ return false;
+}
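+
+// Editor's note on HighRegPressure above (illustrative, not from the original
+// source): it returns true as soon as any not-yet-live def of a data
+// predecessor would push its register class to or past RegLimit; pressure is
+// checked per class, never summed across classes.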
+
+bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const {
+ const SDNode *N = SU->getNode();
+
+ if (!N->isMachineOpcode() || !SU->NumSuccs)
+ return false;
+
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ for (unsigned i = 0; i != NumDefs; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ if (RegPressure[RCId] >= RegLimit[RCId])
+ return true;
+ }
+ return false;
+}
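+
+// Editor's note on MayReduceRegPressure above (illustrative, not from the
+// original source): it returns true if any used result of this machine node
+// belongs to a register class that is already at or above its limit,
+// presumably because scheduling SU in the bottom-up order retires those defs
+// and relieves that pressure.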
+
+// Compute the register pressure contribution of this instruction by counting
+// up for uses that are not live and down for defs. Only count register
+// classes that are already under high pressure. As a side effect, compute
+// the number of uses of registers that are already live.
+//
+// FIXME: This encompasses the logic in HighRegPressure and MayReduceRegPressure
+// so could probably be factored.
+int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const {
+ LiveUses = 0;
+ int PDiff = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+ SUnit *PredSU = Pred.getSUnit();
+ // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+ // to cover the number of registers defined (they are all live).
+ if (PredSU->NumRegDefsLeft == 0) {
+ if (PredSU->getNode()->isMachineOpcode())
+ ++LiveUses;
+ continue;
+ }
+ for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+ RegDefPos.IsValid(); RegDefPos.Advance()) {
+ MVT VT = RegDefPos.GetValue();
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ if (RegPressure[RCId] >= RegLimit[RCId])
+ ++PDiff;
+ }
+ }
+ const SDNode *N = SU->getNode();
+
+ if (!N || !N->isMachineOpcode() || !SU->NumSuccs)
+ return PDiff;
+
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ for (unsigned i = 0; i != NumDefs; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ if (RegPressure[RCId] >= RegLimit[RCId])
+ --PDiff;
+ }
+ return PDiff;
+}
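+
+// Editor's note on RegPressureDiff above (illustrative, not from the original
+// source): the sign convention is roughly "positive means scheduling SU is
+// expected to raise pressure": each use that would become live in an
+// already-pressured class counts +1, each def SU provides in such a class
+// counts -1, and LiveUses separately counts uses whose defs are already
+// fully live.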
+
+void RegReductionPQBase::scheduledNode(SUnit *SU) {
+ if (!TracksRegPressure)
+ return;
+
+ if (!SU->getNode())
+ return;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+ SUnit *PredSU = Pred.getSUnit();
+ // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+ // to cover the number of registers defined (they are all live).
+ if (PredSU->NumRegDefsLeft == 0) {
+ continue;
+ }
+ // FIXME: The ScheduleDAG currently loses information about which of a
+ // node's values is consumed by each dependence. Consequently, if the node
+ // defines multiple register classes, we don't know which to pressurize
+ // here. Instead the following loop consumes the register defs in an
+ // arbitrary order. At least it handles the common case of clustered loads
+ // to the same class. For precise liveness, each SDep needs to indicate the
+ // result number. But that tightly couples the ScheduleDAG with the
+ // SelectionDAG making updates tricky. A simpler hack would be to attach a
+ // value type or register class to SDep.
+ //
+ // The most important aspect of register tracking is balancing the increase
+ // here with the reduction further below. Note that this SU may use multiple
+ // defs in PredSU. They can't be determined here, but we've already
+ // compensated by reducing NumRegDefsLeft in PredSU during
+ // ScheduleDAGSDNodes::AddSchedEdges.
+ --PredSU->NumRegDefsLeft;
+ unsigned SkipRegDefs = PredSU->NumRegDefsLeft;
+ for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+ RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
+ if (SkipRegDefs)
+ continue;
+
+ unsigned RCId, Cost;
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+ RegPressure[RCId] += Cost;
+ break;
+ }
+ }
+
+ // We should have this assert, but there may be dead SDNodes that never
+ // materialize as SUnits, so they don't appear to generate liveness.
+ //assert(SU->NumRegDefsLeft == 0 && "not all regdefs have scheduled uses");
+ int SkipRegDefs = (int)SU->NumRegDefsLeft;
+ for (ScheduleDAGSDNodes::RegDefIter RegDefPos(SU, scheduleDAG);
+ RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
+ if (SkipRegDefs > 0)
+ continue;
+ unsigned RCId, Cost;
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+ if (RegPressure[RCId] < Cost) {
+ // Register pressure tracking is imprecise. This can happen. But we try
+ // hard not to let it happen because it likely results in poor scheduling.
+ LLVM_DEBUG(dbgs() << " SU(" << SU->NodeNum
+ << ") has too many regdefs\n");
+ RegPressure[RCId] = 0;
+ }
+ else {
+ RegPressure[RCId] -= Cost;
+ }
+ }
+ LLVM_DEBUG(dumpRegPressure());
+}
+
+void RegReductionPQBase::unscheduledNode(SUnit *SU) {
+ if (!TracksRegPressure)
+ return;
+
+ const SDNode *N = SU->getNode();
+ if (!N) return;
+
+ if (!N->isMachineOpcode()) {
+ if (N->getOpcode() != ISD::CopyToReg)
+ return;
+ } else {
+ unsigned Opc = N->getMachineOpcode();
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ Opc == TargetOpcode::INSERT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG ||
+ Opc == TargetOpcode::REG_SEQUENCE ||
+ Opc == TargetOpcode::IMPLICIT_DEF)
+ return;
+ }
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+ SUnit *PredSU = Pred.getSUnit();
+ // NumSuccsLeft counts all deps. Don't compare it with NumSuccs which only
+ // counts data deps.
+ if (PredSU->NumSuccsLeft != PredSU->Succs.size())
+ continue;
+ const SDNode *PN = PredSU->getNode();
+ if (!PN->isMachineOpcode()) {
+ if (PN->getOpcode() == ISD::CopyFromReg) {
+ MVT VT = PN->getSimpleValueType(0);
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+ }
+ continue;
+ }
+ unsigned POpc = PN->getMachineOpcode();
+ if (POpc == TargetOpcode::IMPLICIT_DEF)
+ continue;
+ if (POpc == TargetOpcode::EXTRACT_SUBREG ||
+ POpc == TargetOpcode::INSERT_SUBREG ||
+ POpc == TargetOpcode::SUBREG_TO_REG) {
+ MVT VT = PN->getSimpleValueType(0);
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+ continue;
+ }
+ if (POpc == TargetOpcode::REG_SEQUENCE) {
+ unsigned DstRCIdx =
+ cast<ConstantSDNode>(PN->getOperand(0))->getZExtValue();
+ const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
+ unsigned RCId = RC->getID();
+ // REG_SEQUENCE is untyped, so getRepRegClassCostFor could not be used
+ // here. Instead use the same constant as in GetCostForDef.
+ RegPressure[RCId] += RegSequenceCost;
+ continue;
+ }
+ unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs();
+ for (unsigned i = 0; i != NumDefs; ++i) {
+ MVT VT = PN->getSimpleValueType(i);
+ if (!PN->hasAnyUseOfValue(i))
+ continue;
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT))
+ // Register pressure tracking is imprecise. This can happen.
+ RegPressure[RCId] = 0;
+ else
+ RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
+ }
+ }
+
+ // Check for isMachineOpcode() as PrescheduleNodesWithMultipleUses()
+ // may transfer data dependencies to CopyToReg.
+ if (SU->NumSuccs && N->isMachineOpcode()) {
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (VT == MVT::Glue || VT == MVT::Other)
+ continue;
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+ }
+ }
+
+ LLVM_DEBUG(dumpRegPressure());
+}
+
+//===----------------------------------------------------------------------===//
+// Dynamic Node Priority for Register Pressure Reduction
+//===----------------------------------------------------------------------===//
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+ unsigned MaxHeight = 0;
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl()) continue; // ignore chain succs
+ unsigned Height = Succ.getSUnit()->getHeight();
+ // If there are a bunch of CopyToRegs stacked up, they should be considered
+ // to be at the same position.
+ if (Succ.getSUnit()->getNode() &&
+ Succ.getSUnit()->getNode()->getOpcode() == ISD::CopyToReg)
+ Height = closestSucc(Succ.getSUnit())+1;
+ if (Height > MaxHeight)
+ MaxHeight = Height;
+ }
+ return MaxHeight;
+}
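+
+// Editor's illustrative example for closestSucc above (not from the original
+// source): if SU's only data successor is a CopyToReg whose own closest
+// successor sits at height 5, closestSucc(SU) reports 6, so a stack of
+// copies is treated as one position next to the real consumer.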
+
+/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
+/// for scratch registers, i.e. the number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+ unsigned Scratches = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ Scratches++;
+ }
+ return Scratches;
+}
+
+/// hasOnlyLiveInOpers - Return true if SU has only value predecessors that are
+/// CopyFromReg from a virtual register.
+static bool hasOnlyLiveInOpers(const SUnit *SU) {
+ bool RetVal = false;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue;
+ const SUnit *PredSU = Pred.getSUnit();
+ if (PredSU->getNode() &&
+ PredSU->getNode()->getOpcode() == ISD::CopyFromReg) {
+ Register Reg =
+ cast<RegisterSDNode>(PredSU->getNode()->getOperand(1))->getReg();
+ if (Reg.isVirtual()) {
+ RetVal = true;
+ continue;
+ }
+ }
+ return false;
+ }
+ return RetVal;
+}
+
+/// hasOnlyLiveOutUses - Return true if SU has only value successors that are
+/// CopyToReg to a virtual register. This SU def is probably a liveout and
+/// it has no other use. It should be scheduled closer to the terminator.
+static bool hasOnlyLiveOutUses(const SUnit *SU) {
+ bool RetVal = false;
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl()) continue;
+ const SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) {
+ Register Reg =
+ cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg();
+ if (Reg.isVirtual()) {
+ RetVal = true;
+ continue;
+ }
+ }
+ return false;
+ }
+ return RetVal;
+}
+
+// Set isVRegCycle for a node with only live-in operands and live-out uses.
+// Also set isVRegCycle for its CopyFromReg operands.
+//
+// This is only relevant for single-block loops, in which case the VRegCycle
+// node is likely an induction variable in which the operand and target virtual
+// registers should be coalesced (e.g. pre/post increment values). Setting the
+// isVRegCycle flag helps the scheduler prioritize other uses of the same
+// CopyFromReg so that this node becomes the virtual register "kill". This
+// avoids interference between the values live in and out of the block and
+// eliminates a copy inside the loop.
+static void initVRegCycle(SUnit *SU) {
+ if (DisableSchedVRegCycle)
+ return;
+
+ if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU))
+ return;
+
+ LLVM_DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");
+
+ SU->isVRegCycle = true;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue;
+ Pred.getSUnit()->isVRegCycle = true;
+ }
+}
+
+// After scheduling the definition of a VRegCycle, clear the isVRegCycle flag of
+// CopyFromReg operands. We should no longer penalize other uses of this VReg.
+static void resetVRegCycle(SUnit *SU) {
+ if (!SU->isVRegCycle)
+ return;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ if (PredSU->isVRegCycle) {
+ assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg &&
+ "VRegCycle def must be CopyFromReg");
+ Pred.getSUnit()->isVRegCycle = false;
+ }
+ }
+}
+
+// Return true if this SUnit uses a CopyFromReg node marked as a VRegCycle. This
+// means a node that defines the VRegCycle has not been scheduled yet.
+static bool hasVRegCycleUse(const SUnit *SU) {
+ // If this SU also defines the VReg, don't hoist it as a "use".
+ if (SU->isVRegCycle)
+ return false;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ if (Pred.getSUnit()->isVRegCycle &&
+ Pred.getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
+ LLVM_DEBUG(dbgs() << " VReg cycle use: SU (" << SU->NodeNum << ")\n");
+ return true;
+ }
+ }
+ return false;
+}
+
+// Check for either a dependence (latency) or resource (hazard) stall.
+//
+// Note: The ScheduleHazardRecognizer interface requires a non-const SU.
+static bool BUHasStall(SUnit *SU, int Height, RegReductionPQBase *SPQ) {
+ if ((int)SPQ->getCurCycle() < Height) return true;
+ if (SPQ->getHazardRec()->getHazardType(SU, 0)
+ != ScheduleHazardRecognizer::NoHazard)
+ return true;
+ return false;
+}
+
+// Return -1 if left has higher priority, 1 if right has higher priority.
+// Return 0 if latency-based priority is equivalent.
+static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
+ RegReductionPQBase *SPQ) {
+ // Scheduling an instruction that uses a VReg whose postincrement has not yet
+ // been scheduled will induce a copy. Model this as an extra cycle of latency.
+ int LPenalty = hasVRegCycleUse(left) ? 1 : 0;
+ int RPenalty = hasVRegCycleUse(right) ? 1 : 0;
+ int LHeight = (int)left->getHeight() + LPenalty;
+ int RHeight = (int)right->getHeight() + RPenalty;
+
+ bool LStall = (!checkPref || left->SchedulingPref == Sched::ILP) &&
+ BUHasStall(left, LHeight, SPQ);
+ bool RStall = (!checkPref || right->SchedulingPref == Sched::ILP) &&
+ BUHasStall(right, RHeight, SPQ);
+
+ // If scheduling one of the nodes will cause a pipeline stall, delay it.
+ // If scheduling both of them will cause pipeline stalls, sort them
+ // according to their height.
+ if (LStall) {
+ if (!RStall)
+ return 1;
+ if (LHeight != RHeight)
+ return LHeight > RHeight ? 1 : -1;
+ } else if (RStall)
+ return -1;
+
+ // If either node is scheduling for latency, sort them by height/depth
+ // and latency.
+ if (!checkPref || (left->SchedulingPref == Sched::ILP ||
+ right->SchedulingPref == Sched::ILP)) {
+ // If neither instruction stalls (!LStall && !RStall) and the
+ // HazardRecognizer is enabled (grouping instructions by cycle), then
+ // height is already covered, so only depth matters. We also reach this
+ // point if both stall but have the same height.
+ if (!SPQ->getHazardRec()->isEnabled()) {
+ if (LHeight != RHeight)
+ return LHeight > RHeight ? 1 : -1;
+ }
+ int LDepth = left->getDepth() - LPenalty;
+ int RDepth = right->getDepth() - RPenalty;
+ if (LDepth != RDepth) {
+ LLVM_DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
+ return LDepth < RDepth ? 1 : -1;
+ }
+ if (left->Latency != right->Latency)
+ return left->Latency > right->Latency ? 1 : -1;
+ }
+ return 0;
+}
+
+static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
+ // Schedule physical register definitions close to their use. This is
+ // motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
+ // long as shortening physreg live ranges is generally good, we can defer
+ // creating a subtarget hook.
+ if (!DisableSchedPhysRegJoin) {
+ bool LHasPhysReg = left->hasPhysRegDefs;
+ bool RHasPhysReg = right->hasPhysRegDefs;
+ if (LHasPhysReg != RHasPhysReg) {
+ #ifndef NDEBUG
+ static const char *const PhysRegMsg[] = { " has no physreg",
+ " defines a physreg" };
+ #endif
+ LLVM_DEBUG(dbgs() << " SU (" << left->NodeNum << ") "
+ << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum
+ << ") " << PhysRegMsg[RHasPhysReg] << "\n");
+ return LHasPhysReg < RHasPhysReg;
+ }
+ }
+
+ // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
+ unsigned LPriority = SPQ->getNodePriority(left);
+ unsigned RPriority = SPQ->getNodePriority(right);
+
+ // Be really careful about hoisting call operands above previous calls.
+ // Only allow it if it would reduce register pressure.
+ if (left->isCall && right->isCallOp) {
+ unsigned RNumVals = right->getNode()->getNumValues();
+ RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
+ }
+ if (right->isCall && left->isCallOp) {
+ unsigned LNumVals = left->getNode()->getNumValues();
+ LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
+ }
+
+ if (LPriority != RPriority)
+ return LPriority > RPriority;
+
+ // If one or both of the nodes are calls and their Sethi-Ullman numbers are
+ // the same, then keep source order.
+ if (left->isCall || right->isCall) {
+ unsigned LOrder = SPQ->getNodeOrdering(left);
+ unsigned ROrder = SPQ->getNodeOrdering(right);
+
+ // Prefer an ordering where the lower the non-zero order number, the higher
+ // the preference.
+ if ((LOrder || ROrder) && LOrder != ROrder)
+ return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+ }
+
+ // Try schedule def + use closer when Sethi-Ullman numbers are the same.
+ // e.g.
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // and the following instructions are both ready.
+ // t2 = op c3
+ // t4 = op c4
+ //
+ // Then schedule t2 = op first.
+ // i.e.
+ // t4 = op c4
+ // t2 = op c3
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // This creates more short live intervals.
+ unsigned LDist = closestSucc(left);
+ unsigned RDist = closestSucc(right);
+ if (LDist != RDist)
+ return LDist < RDist;
+
+ // How many registers become live when the node is scheduled.
+ unsigned LScratch = calcMaxScratches(left);
+ unsigned RScratch = calcMaxScratches(right);
+ if (LScratch != RScratch)
+ return LScratch > RScratch;
+
+ // Comparing latency against a call makes little sense unless the node
+ // is register pressure-neutral.
+ if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0))
+ return (left->NodeQueueId > right->NodeQueueId);
+
+ // Do not compare latencies when one or both of the nodes are calls.
+ if (!DisableSchedCycles &&
+ !(left->isCall || right->isCall)) {
+ int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ);
+ if (result != 0)
+ return result > 0;
+ }
+ else {
+ if (left->getHeight() != right->getHeight())
+ return left->getHeight() > right->getHeight();
+
+ if (left->getDepth() != right->getDepth())
+ return left->getDepth() < right->getDepth();
+ }
+
+ assert(left->NodeQueueId && right->NodeQueueId &&
+ "NodeQueueId cannot be zero");
+ return (left->NodeQueueId > right->NodeQueueId);
+}
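+
+// Editor's summary of the BURRSort tie-break ladder above (not from the
+// original source), roughly: physreg defs first, then call-adjusted
+// Sethi-Ullman priority, then source order when calls are involved, then
+// distance to the closest successor, then scratch-register count, then
+// latency (only when neither node is a call), and finally NodeQueueId as a
+// deterministic fallback.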
+
+// Bottom up
+bool bu_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
+ return BURRSort(left, right, SPQ);
+}
+
+// Source order, otherwise bottom up.
+bool src_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
+ unsigned LOrder = SPQ->getNodeOrdering(left);
+ unsigned ROrder = SPQ->getNodeOrdering(right);
+
+ // Prefer an ordering where the lower the non-zero order number, the higher
+ // the preference.
+ if ((LOrder || ROrder) && LOrder != ROrder)
+ return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+
+ return BURRSort(left, right, SPQ);
+}
+
+// If the time between now and when the instruction will be ready can cover
+// the spill code, then avoid adding it to the ready queue. This gives long
+// stalls highest priority and allows hoisting across calls. It should also
+// speed up processing the available queue.
+bool hybrid_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
+ static const unsigned ReadyDelay = 3;
+
+ if (SPQ->MayReduceRegPressure(SU)) return true;
+
+ if (SU->getHeight() > (CurCycle + ReadyDelay)) return false;
+
+ if (SPQ->getHazardRec()->getHazardType(SU, -ReadyDelay)
+ != ScheduleHazardRecognizer::NoHazard)
+ return false;
+
+ return true;
+}
+
+// Return true if right should be scheduled with higher priority than left.
+bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
+ if (left->isCall || right->isCall)
+ // No way to compute latency of calls.
+ return BURRSort(left, right, SPQ);
+
+ bool LHigh = SPQ->HighRegPressure(left);
+ bool RHigh = SPQ->HighRegPressure(right);
+ // Avoid causing spills. If register pressure is high, schedule for
+ // register pressure reduction.
+ if (LHigh && !RHigh) {
+ LLVM_DEBUG(dbgs() << " pressure SU(" << left->NodeNum << ") > SU("
+ << right->NodeNum << ")\n");
+ return true;
+ }
+ else if (!LHigh && RHigh) {
+ LLVM_DEBUG(dbgs() << " pressure SU(" << right->NodeNum << ") > SU("
+ << left->NodeNum << ")\n");
+ return false;
+ }
+ if (!LHigh && !RHigh) {
+ int result = BUCompareLatency(left, right, true /*checkPref*/, SPQ);
+ if (result != 0)
+ return result > 0;
+ }
+ return BURRSort(left, right, SPQ);
+}
+
+// Schedule as many instructions in each cycle as possible. So don't make an
+// instruction available unless it is ready in the current cycle.
+bool ilp_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
+ if (SU->getHeight() > CurCycle) return false;
+
+ if (SPQ->getHazardRec()->getHazardType(SU, 0)
+ != ScheduleHazardRecognizer::NoHazard)
+ return false;
+
+ return true;
+}
+
+static bool canEnableCoalescing(SUnit *SU) {
+ unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
+ if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
+ // CopyToReg should be close to its uses to facilitate coalescing and
+ // avoid spilling.
+ return true;
+
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG ||
+ Opc == TargetOpcode::INSERT_SUBREG)
+ // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
+ // close to their uses to facilitate coalescing.
+ return true;
+
+ if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+ // If SU does not have a register def, schedule it close to its uses
+ // because it does not lengthen any live ranges.
+ return true;
+
+ return false;
+}
+
+// list-ilp is currently an experimental scheduler that allows various
+// heuristics to be enabled prior to the normal register reduction logic.
+bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
+ if (left->isCall || right->isCall)
+ // No way to compute latency of calls.
+ return BURRSort(left, right, SPQ);
+
+ unsigned LLiveUses = 0, RLiveUses = 0;
+ int LPDiff = 0, RPDiff = 0;
+ if (!DisableSchedRegPressure || !DisableSchedLiveUses) {
+ LPDiff = SPQ->RegPressureDiff(left, LLiveUses);
+ RPDiff = SPQ->RegPressureDiff(right, RLiveUses);
+ }
+ if (!DisableSchedRegPressure && LPDiff != RPDiff) {
+ LLVM_DEBUG(dbgs() << "RegPressureDiff SU(" << left->NodeNum
+ << "): " << LPDiff << " != SU(" << right->NodeNum
+ << "): " << RPDiff << "\n");
+ return LPDiff > RPDiff;
+ }
+
+ if (!DisableSchedRegPressure && (LPDiff > 0 || RPDiff > 0)) {
+ bool LReduce = canEnableCoalescing(left);
+ bool RReduce = canEnableCoalescing(right);
+ if (LReduce && !RReduce) return false;
+ if (RReduce && !LReduce) return true;
+ }
+
+ if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) {
+ LLVM_DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses
+ << " != SU(" << right->NodeNum << "): " << RLiveUses
+ << "\n");
+ return LLiveUses < RLiveUses;
+ }
+
+ if (!DisableSchedStalls) {
+ bool LStall = BUHasStall(left, left->getHeight(), SPQ);
+ bool RStall = BUHasStall(right, right->getHeight(), SPQ);
+ if (LStall != RStall)
+ return left->getHeight() > right->getHeight();
+ }
+
+ if (!DisableSchedCriticalPath) {
+ int spread = (int)left->getDepth() - (int)right->getDepth();
+ if (std::abs(spread) > MaxReorderWindow) {
+ LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+ << left->getDepth() << " != SU(" << right->NodeNum
+ << "): " << right->getDepth() << "\n");
+ return left->getDepth() < right->getDepth();
+ }
+ }
+
+ if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
+ int spread = (int)left->getHeight() - (int)right->getHeight();
+ if (std::abs(spread) > MaxReorderWindow)
+ return left->getHeight() > right->getHeight();
+ }
+
+ return BURRSort(left, right, SPQ);
+}
+
+void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
+ SUnits = &sunits;
+ // Add pseudo dependency edges for two-address nodes.
+ if (!Disable2AddrHack)
+ AddPseudoTwoAddrDeps();
+ // Reroute edges to nodes with multiple uses.
+ if (!TracksRegPressure && !SrcOrder)
+ PrescheduleNodesWithMultipleUses();
+ // Calculate node priorities.
+ CalculateSethiUllmanNumbers();
+
+ // For single block loops, mark nodes that look like canonical IV increments.
+ if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB))
+ for (SUnit &SU : sunits)
+ initVRegCycle(&SU);
+}
+
+//===----------------------------------------------------------------------===//
+// Preschedule for Register Pressure
+//===----------------------------------------------------------------------===//
+
+bool RegReductionPQBase::canClobber(const SUnit *SU, const SUnit *Op) {
+ if (SU->isTwoAddress) {
+ unsigned Opc = SU->getNode()->getMachineOpcode();
+ const MCInstrDesc &MCID = TII->get(Opc);
+ unsigned NumRes = MCID.getNumDefs();
+ unsigned NumOps = MCID.getNumOperands() - NumRes;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ if (MCID.getOperandConstraint(i+NumRes, MCOI::TIED_TO) != -1) {
+ SDNode *DU = SU->getNode()->getOperand(i).getNode();
+ if (DU->getNodeId() != -1 &&
+ Op->OrigNode == &(*SUnits)[DU->getNodeId()])
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/// canClobberReachingPhysRegUse - True if SU would clobber one of its
+/// successor's explicit physregs whose definition can reach DepSU.
+/// i.e. DepSU should not be scheduled above SU.
+static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU,
+ ScheduleDAGRRList *scheduleDAG,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ ArrayRef<MCPhysReg> ImpDefs =
+ TII->get(SU->getNode()->getMachineOpcode()).implicit_defs();
+ const uint32_t *RegMask = getNodeRegMask(SU->getNode());
+ if (ImpDefs.empty() && !RegMask)
+ return false;
+
+ for (const SDep &Succ : SU->Succs) {
+ SUnit *SuccSU = Succ.getSUnit();
+ for (const SDep &SuccPred : SuccSU->Preds) {
+ if (!SuccPred.isAssignedRegDep())
+ continue;
+
+ if (RegMask &&
+ MachineOperand::clobbersPhysReg(RegMask, SuccPred.getReg()) &&
+ scheduleDAG->IsReachable(DepSU, SuccPred.getSUnit()))
+ return true;
+
+ for (MCPhysReg ImpDef : ImpDefs) {
+ // Return true if SU clobbers this physical register use and the
+ // definition of the register reaches from DepSU. IsReachable queries
+ // a topological forward sort of the DAG (following the successors).
+ if (TRI->regsOverlap(ImpDef, SuccPred.getReg()) &&
+ scheduleDAG->IsReachable(DepSU, SuccPred.getSUnit()))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
+/// physical register defs.
+static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ SDNode *N = SuccSU->getNode();
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ ArrayRef<MCPhysReg> ImpDefs = TII->get(N->getMachineOpcode()).implicit_defs();
+ assert(!ImpDefs.empty() && "Caller should check hasPhysRegDefs");
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (!SUNode->isMachineOpcode())
+ continue;
+ ArrayRef<MCPhysReg> SUImpDefs =
+ TII->get(SUNode->getMachineOpcode()).implicit_defs();
+ const uint32_t *SURegMask = getNodeRegMask(SUNode);
+ if (SUImpDefs.empty() && !SURegMask)
+ continue;
+ for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (VT == MVT::Glue || VT == MVT::Other)
+ continue;
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ MCPhysReg Reg = ImpDefs[i - NumDefs];
+ if (SURegMask && MachineOperand::clobbersPhysReg(SURegMask, Reg))
+ return true;
+ for (MCPhysReg SUReg : SUImpDefs) {
+ if (TRI->regsOverlap(Reg, SUReg))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/// PrescheduleNodesWithMultipleUses - Nodes with multiple uses
+/// are not handled well by the general register pressure reduction
+/// heuristics. When presented with code like this:
+///
+/// N
+/// / |
+/// / |
+/// U store
+/// |
+/// ...
+///
+/// the heuristics tend to push the store up, but since the
+/// operand of the store has another use (U), this would increase
+/// the length of that other use (the U->N edge).
+///
+/// This function transforms code like the above to route U's
+/// dependence through the store when possible, like this:
+///
+/// N
+/// ||
+/// ||
+/// store
+/// |
+/// U
+/// |
+/// ...
+///
+/// This results in the store being scheduled immediately
+/// after N, which shortens the U->N live range, reducing
+/// register pressure.
+void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
+ // Visit all the nodes in topological order, working top-down.
+ for (SUnit &SU : *SUnits) {
+ // For now, only look at nodes with no data successors, such as stores.
+ // These are especially important, due to the heuristics in
+ // getNodePriority for nodes with no data successors.
+ if (SU.NumSuccs != 0)
+ continue;
+ // For now, only look at nodes with exactly one data predecessor.
+ if (SU.NumPreds != 1)
+ continue;
+ // Avoid prescheduling copies to virtual registers, which don't behave
+ // like other nodes from the perspective of scheduling heuristics.
+ if (SDNode *N = SU.getNode())
+ if (N->getOpcode() == ISD::CopyToReg &&
+ cast<RegisterSDNode>(N->getOperand(1))->getReg().isVirtual())
+ continue;
+
+ SDNode *PredFrameSetup = nullptr;
+ for (const SDep &Pred : SU.Preds)
+ if (Pred.isCtrl() && Pred.getSUnit()) {
+ // Find a predecessor that is not a data dependence.
+ SDNode *PredND = Pred.getSUnit()->getNode();
+
+ // If PredND is FrameSetup, we should not pre-schedule the node.
+ // Otherwise, when scheduling bottom-up, ADJCALLSTACKDOWN and
+ // ADJCALLSTACKUP may hold the CallResource too long and prevent
+ // other calls from being scheduled. If there's no other available
+ // node to schedule, the scheduler will try to rename the register
+ // by creating a copy to avoid the conflict, which will fail because
+ // CallResource is not a real physical register.
+ if (PredND && PredND->isMachineOpcode() &&
+ (PredND->getMachineOpcode() == TII->getCallFrameSetupOpcode())) {
+ PredFrameSetup = PredND;
+ break;
+ }
+ }
+ // Skip nodes that have a FrameSetup predecessor.
+ if (PredFrameSetup != nullptr)
+ continue;
+
+ // Locate the single data predecessor.
+ SUnit *PredSU = nullptr;
+ for (const SDep &Pred : SU.Preds)
+ if (!Pred.isCtrl()) {
+ PredSU = Pred.getSUnit();
+ break;
+ }
+ assert(PredSU);
+
+ // Don't rewrite edges that carry physregs, because that requires additional
+ // support infrastructure.
+ if (PredSU->hasPhysRegDefs)
+ continue;
+ // Short-circuit the case where SU is PredSU's only data successor.
+ if (PredSU->NumSuccs == 1)
+ continue;
+ // Avoid prescheduling to copies from virtual registers, which don't behave
+ // like other nodes from the perspective of scheduling heuristics.
+ if (SDNode *N = SU.getNode())
+ if (N->getOpcode() == ISD::CopyFromReg &&
+ cast<RegisterSDNode>(N->getOperand(1))->getReg().isVirtual())
+ continue;
+
+ // Perform checks on the successors of PredSU.
+ for (const SDep &PredSucc : PredSU->Succs) {
+ SUnit *PredSuccSU = PredSucc.getSUnit();
+ if (PredSuccSU == &SU) continue;
+ // If PredSU has another successor with no data successors, for
+ // now don't attempt to choose either over the other.
+ if (PredSuccSU->NumSuccs == 0)
+ goto outer_loop_continue;
+ // Don't break physical register dependencies.
+ if (SU.hasPhysRegClobbers && PredSuccSU->hasPhysRegDefs)
+ if (canClobberPhysRegDefs(PredSuccSU, &SU, TII, TRI))
+ goto outer_loop_continue;
+ // Don't introduce graph cycles.
+ if (scheduleDAG->IsReachable(&SU, PredSuccSU))
+ goto outer_loop_continue;
+ }
+
+ // Ok, the transformation is safe and the heuristics suggest it is
+ // profitable. Update the graph.
+ LLVM_DEBUG(
+ dbgs() << " Prescheduling SU #" << SU.NodeNum << " next to PredSU #"
+ << PredSU->NodeNum
+ << " to guide scheduling in the presence of multiple uses\n");
+ for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
+ SDep Edge = PredSU->Succs[i];
+ assert(!Edge.isAssignedRegDep());
+ SUnit *SuccSU = Edge.getSUnit();
+ if (SuccSU != &SU) {
+ Edge.setSUnit(PredSU);
+ scheduleDAG->RemovePred(SuccSU, Edge);
+ scheduleDAG->AddPredQueued(&SU, Edge);
+ Edge.setSUnit(&SU);
+ scheduleDAG->AddPredQueued(SuccSU, Edge);
+ --i;
+ }
+ }
+ outer_loop_continue:;
+ }
+}
+
+/// AddPseudoTwoAddrDeps - If two nodes share an operand and one of them uses
+/// it as a def&use operand, add a pseudo control edge from it to the other
+/// node (if it won't create a cycle) so the two-address one will be scheduled
+/// first (lower in the schedule). If both nodes are two-address, favor the
+/// one that has a CopyToReg use (more likely to be a loop induction update).
+/// If both are two-address, but one is commutable while the other is not
+/// commutable, favor the one that's not commutable.
+void RegReductionPQBase::AddPseudoTwoAddrDeps() {
+ for (SUnit &SU : *SUnits) {
+ if (!SU.isTwoAddress)
+ continue;
+
+ SDNode *Node = SU.getNode();
+ if (!Node || !Node->isMachineOpcode() || SU.getNode()->getGluedNode())
+ continue;
+
+ bool isLiveOut = hasOnlyLiveOutUses(&SU);
+ unsigned Opc = Node->getMachineOpcode();
+ const MCInstrDesc &MCID = TII->get(Opc);
+ unsigned NumRes = MCID.getNumDefs();
+ unsigned NumOps = MCID.getNumOperands() - NumRes;
+ for (unsigned j = 0; j != NumOps; ++j) {
+ if (MCID.getOperandConstraint(j+NumRes, MCOI::TIED_TO) == -1)
+ continue;
+ SDNode *DU = SU.getNode()->getOperand(j).getNode();
+ if (DU->getNodeId() == -1)
+ continue;
+ const SUnit *DUSU = &(*SUnits)[DU->getNodeId()];
+ if (!DUSU)
+ continue;
+ for (const SDep &Succ : DUSU->Succs) {
+ if (Succ.isCtrl())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU == &SU)
+ continue;
+ // Be conservative. Ignore if nodes aren't at roughly the same
+ // depth and height.
+ if (SuccSU->getHeight() < SU.getHeight() &&
+ (SU.getHeight() - SuccSU->getHeight()) > 1)
+ continue;
+ // Skip past COPY_TO_REGCLASS nodes, so that the pseudo edge
+ // constrains whatever is using the copy, instead of the copy
+ // itself. In the case that the copy is coalesced, this
+ // preserves the intent of the pseudo two-address heuristics.
+ while (SuccSU->Succs.size() == 1 &&
+ SuccSU->getNode()->isMachineOpcode() &&
+ SuccSU->getNode()->getMachineOpcode() ==
+ TargetOpcode::COPY_TO_REGCLASS)
+ SuccSU = SuccSU->Succs.front().getSUnit();
+ // Don't constrain non-instruction nodes.
+ if (!SuccSU->getNode() || !SuccSU->getNode()->isMachineOpcode())
+ continue;
+ // Don't constrain nodes with physical register defs if the
+ // predecessor can clobber them.
+ if (SuccSU->hasPhysRegDefs && SU.hasPhysRegClobbers) {
+ if (canClobberPhysRegDefs(SuccSU, &SU, TII, TRI))
+ continue;
+ }
+ // Don't constrain EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG;
+ // these may be coalesced away. We want them close to their uses.
+ unsigned SuccOpc = SuccSU->getNode()->getMachineOpcode();
+ if (SuccOpc == TargetOpcode::EXTRACT_SUBREG ||
+ SuccOpc == TargetOpcode::INSERT_SUBREG ||
+ SuccOpc == TargetOpcode::SUBREG_TO_REG)
+ continue;
+ if (!canClobberReachingPhysRegUse(SuccSU, &SU, scheduleDAG, TII, TRI) &&
+ (!canClobber(SuccSU, DUSU) ||
+ (isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
+ (!SU.isCommutable && SuccSU->isCommutable)) &&
+ !scheduleDAG->IsReachable(SuccSU, &SU)) {
+ LLVM_DEBUG(dbgs()
+ << " Adding a pseudo-two-addr edge from SU #"
+ << SU.NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
+ scheduleDAG->AddPredQueued(&SU, SDep(SuccSU, SDep::Artificial));
+ }
+ }
+ }
+ }
+}
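+
+// Editor's illustrative example for AddPseudoTwoAddrDeps above (not from the
+// original source): given a two-address node "t2 = add t1, t3" whose result
+// is tied to t1, any other reader of t1 is made an artificial predecessor of
+// the add, so in the emitted order that reader consumes t1 before the add
+// overwrites it, avoiding an extra copy of t1.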
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+ScheduleDAGSDNodes *
+llvm::createBURRListDAGScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+ BURegReductionPriorityQueue *PQ =
+ new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, nullptr);
+ ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
+
+ScheduleDAGSDNodes *
+llvm::createSourceListDAGScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+ SrcRegReductionPriorityQueue *PQ =
+ new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, nullptr);
+ ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
+
+ScheduleDAGSDNodes *
+llvm::createHybridListDAGScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetLowering *TLI = IS->TLI;
+
+ HybridBURRPriorityQueue *PQ =
+ new HybridBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
+
+ ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
+
+ScheduleDAGSDNodes *
+llvm::createILPListDAGScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetLowering *TLI = IS->TLI;
+
+ ILPBURRPriorityQueue *PQ =
+ new ILPBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
+ ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
new file mode 100644
index 000000000000..0579c1664d5c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -0,0 +1,1086 @@
+//===--- ScheduleDAGSDNodes.cpp - Implement the ScheduleDAGSDNodes class --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG class, which is a base class used by
+// scheduling implementation classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScheduleDAGSDNodes.h"
+#include "InstrEmitter.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(LoadsClustered, "Number of loads clustered together");
+
+// This allows the latency-based scheduler to notice high latency instructions
+// without a target itinerary. The choice of number here has more to do with
+// balancing scheduler heuristics than with the actual machine latency.
+static cl::opt<int> HighLatencyCycles(
+ "sched-high-latency-cycles", cl::Hidden, cl::init(10),
+ cl::desc("Roughly estimate the number of cycles that 'long latency'"
+ "instructions take for targets with no itinerary"));
+
+ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
+ : ScheduleDAG(mf), InstrItins(mf.getSubtarget().getInstrItineraryData()) {}
+
+/// Run - perform scheduling.
+///
+void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb) {
+ BB = bb;
+ DAG = dag;
+
+ // Clear the scheduler's SUnit DAG.
+ ScheduleDAG::clearDAG();
+ Sequence.clear();
+
+ // Invoke the target's selection of scheduler.
+ Schedule();
+}
+
+/// NewSUnit - Creates a new SUnit and returns a pointer to it.
+///
+SUnit *ScheduleDAGSDNodes::newSUnit(SDNode *N) {
+#ifndef NDEBUG
+ const SUnit *Addr = nullptr;
+ if (!SUnits.empty())
+ Addr = &SUnits[0];
+#endif
+ SUnits.emplace_back(N, (unsigned)SUnits.size());
+ assert((Addr == nullptr || Addr == &SUnits[0]) &&
+ "SUnits std::vector reallocated on the fly!");
+ SUnits.back().OrigNode = &SUnits.back();
+ SUnit *SU = &SUnits.back();
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+ if (!N ||
+ (N->isMachineOpcode() &&
+ N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF))
+ SU->SchedulingPref = Sched::None;
+ else
+ SU->SchedulingPref = TLI.getSchedulingPreference(N);
+ return SU;
+}
+
+SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
+ SUnit *SU = newSUnit(Old->getNode());
+ SU->OrigNode = Old->OrigNode;
+ SU->Latency = Old->Latency;
+ SU->isVRegCycle = Old->isVRegCycle;
+ SU->isCall = Old->isCall;
+ SU->isCallOp = Old->isCallOp;
+ SU->isTwoAddress = Old->isTwoAddress;
+ SU->isCommutable = Old->isCommutable;
+ SU->hasPhysRegDefs = Old->hasPhysRegDefs;
+ SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
+ SU->isScheduleHigh = Old->isScheduleHigh;
+ SU->isScheduleLow = Old->isScheduleLow;
+ SU->SchedulingPref = Old->SchedulingPref;
+ Old->isCloned = true;
+ return SU;
+}
+
+/// CheckForPhysRegDependency - Check if the dependency between def and use of
+/// a specified operand is a physical register dependency. If so, returns the
+/// register and the cost of copying the register.
+static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
+ const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII,
+ const TargetLowering &TLI,
+ unsigned &PhysReg, int &Cost) {
+ if (Op != 2 || User->getOpcode() != ISD::CopyToReg)
+ return;
+
+ unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TLI.checkForPhysRegDependency(Def, User, Op, TRI, TII, PhysReg, Cost))
+ return;
+
+ if (Register::isVirtualRegister(Reg))
+ return;
+
+ unsigned ResNo = User->getOperand(2).getResNo();
+ if (Def->getOpcode() == ISD::CopyFromReg &&
+ cast<RegisterSDNode>(Def->getOperand(1))->getReg() == Reg) {
+ PhysReg = Reg;
+ } else if (Def->isMachineOpcode()) {
+ const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
+ if (ResNo >= II.getNumDefs() && II.hasImplicitDefOfPhysReg(Reg))
+ PhysReg = Reg;
+ }
+
+ if (PhysReg != 0) {
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, Def->getSimpleValueType(ResNo));
+ Cost = RC->getCopyCost();
+ }
+}
+
+// Helper for AddGlue to clone node operands.
+static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, ArrayRef<EVT> VTs,
+ SDValue ExtraOper = SDValue()) {
+ SmallVector<SDValue, 8> Ops(N->op_begin(), N->op_end());
+ if (ExtraOper.getNode())
+ Ops.push_back(ExtraOper);
+
+ SDVTList VTList = DAG->getVTList(VTs);
+ MachineSDNode *MN = dyn_cast<MachineSDNode>(N);
+
+ // Store memory references.
+ SmallVector<MachineMemOperand *, 2> MMOs;
+ if (MN)
+ MMOs.assign(MN->memoperands_begin(), MN->memoperands_end());
+
+ DAG->MorphNodeTo(N, N->getOpcode(), VTList, Ops);
+
+ // Reset the memory references
+ if (MN)
+ DAG->setNodeMemRefs(MN, MMOs);
+}
+
+static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) {
+ SDNode *GlueDestNode = Glue.getNode();
+
+ // Don't add glue from a node to itself.
+ if (GlueDestNode == N) return false;
+
+ // Don't add a glue operand to something that already uses glue.
+ if (GlueDestNode &&
+ N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
+ return false;
+ }
+ // Don't add glue to something that already has a glue value.
+ if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return false;
+
+ SmallVector<EVT, 4> VTs(N->values());
+ if (AddGlue)
+ VTs.push_back(MVT::Glue);
+
+ CloneNodeWithValues(N, DAG, VTs, Glue);
+
+ return true;
+}
+
+// Cleanup after unsuccessful AddGlue. Use the standard method of morphing the
+// node even though simply shrinking the value list is sufficient.
+static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
+ assert((N->getValueType(N->getNumValues() - 1) == MVT::Glue &&
+ !N->hasAnyUseOfValue(N->getNumValues() - 1)) &&
+ "expected an unused glue value");
+
+ CloneNodeWithValues(N, DAG,
+ ArrayRef(N->value_begin(), N->getNumValues() - 1));
+}
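+
+// The two invariants AddGlue() relies on, restated over a simplified node
+// shape: a node may consume glue only through its last operand and produce
+// glue only through its last result. VT, SimpleNode and canTakeGlueOperand
+// are illustrative stand-ins, not LLVM types; a minimal sketch kept out of
+// the build.
+#if 0
+#include <vector>
+
+enum class VT { Normal, Other, Glue };
+
+struct SimpleNode {
+  std::vector<VT> OperandTypes; // last slot is where a glue input would go
+  std::vector<VT> ResultTypes;  // last slot is where a glue output would go
+};
+
+static bool canTakeGlueOperand(const SimpleNode &N) {
+  // Mirrors the early-outs in AddGlue(): refuse nodes that already consume
+  // glue (last operand) or already produce glue (last result).
+  return (N.OperandTypes.empty() || N.OperandTypes.back() != VT::Glue) &&
+         (N.ResultTypes.empty() || N.ResultTypes.back() != VT::Glue);
+}
+#endif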
+
+/// ClusterNeighboringLoads - Force nearby loads together by "gluing" them.
+/// This function finds loads of the same base and different offsets. If the
+/// offsets are not far apart (target specific), it adds MVT::Glue inputs and
+/// outputs to ensure they are scheduled together and in order. This
+/// optimization may benefit some targets by improving cache locality.
+void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
+ SDValue Chain;
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
+ Chain = Node->getOperand(NumOps-1);
+ if (!Chain)
+ return;
+
+ // Skip any load instruction that has a tied input. There may be an additional
+ // dependency requiring a different order than by increasing offsets, and the
+ // added glue may introduce a cycle.
+ auto hasTiedInput = [this](const SDNode *N) {
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ for (unsigned I = 0; I != MCID.getNumOperands(); ++I) {
+ if (MCID.getOperandConstraint(I, MCOI::TIED_TO) != -1)
+ return true;
+ }
+
+ return false;
+ };
+
+ // Look for other loads of the same chain. Find loads that are loading from
+ // the same base pointer and different offsets.
+ SmallPtrSet<SDNode*, 16> Visited;
+ SmallVector<int64_t, 4> Offsets;
+ DenseMap<long long, SDNode*> O2SMap; // Map from offset to SDNode.
+ bool Cluster = false;
+ SDNode *Base = Node;
+
+ if (hasTiedInput(Base))
+ return;
+
+ // This algorithm requires a reasonably low use count before finding a match
+ // to avoid uselessly blowing up compile time in large blocks.
+ unsigned UseCount = 0;
+ for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
+ I != E && UseCount < 100; ++I, ++UseCount) {
+ if (I.getUse().getResNo() != Chain.getResNo())
+ continue;
+
+ SDNode *User = *I;
+ if (User == Node || !Visited.insert(User).second)
+ continue;
+ int64_t Offset1, Offset2;
+ if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
+ Offset1 == Offset2 ||
+ hasTiedInput(User)) {
+      // FIXME: Should be ok if their addresses are identical. But earlier
+ // optimizations really should have eliminated one of the loads.
+ continue;
+ }
+ if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
+ Offsets.push_back(Offset1);
+ O2SMap.insert(std::make_pair(Offset2, User));
+ Offsets.push_back(Offset2);
+ if (Offset2 < Offset1)
+ Base = User;
+ Cluster = true;
+ // Reset UseCount to allow more matches.
+ UseCount = 0;
+ }
+
+ if (!Cluster)
+ return;
+
+ // Sort them in increasing order.
+ llvm::sort(Offsets);
+
+ // Check if the loads are close enough.
+ SmallVector<SDNode*, 4> Loads;
+ unsigned NumLoads = 0;
+ int64_t BaseOff = Offsets[0];
+ SDNode *BaseLoad = O2SMap[BaseOff];
+ Loads.push_back(BaseLoad);
+ for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
+ int64_t Offset = Offsets[i];
+ SDNode *Load = O2SMap[Offset];
+ if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))
+ break; // Stop right here. Ignore loads that are further away.
+ Loads.push_back(Load);
+ ++NumLoads;
+ }
+
+ if (NumLoads == 0)
+ return;
+
+ // Cluster loads by adding MVT::Glue outputs and inputs. This also
+  // ensures they are scheduled in order of increasing addresses.
+ SDNode *Lead = Loads[0];
+ SDValue InGlue;
+ if (AddGlue(Lead, InGlue, true, DAG))
+ InGlue = SDValue(Lead, Lead->getNumValues() - 1);
+ for (unsigned I = 1, E = Loads.size(); I != E; ++I) {
+ bool OutGlue = I < E - 1;
+ SDNode *Load = Loads[I];
+
+    // If AddGlue fails, we could leave an unused glue value dangling on
+    // InGlue. This should not cause any problems; such a value is cleaned up
+    // below by RemoveUnusedGlue.
+ if (AddGlue(Load, InGlue, OutGlue, DAG)) {
+ if (OutGlue)
+ InGlue = SDValue(Load, Load->getNumValues() - 1);
+
+ ++LoadsClustered;
+ }
+ else if (!OutGlue && InGlue.getNode())
+ RemoveUnusedGlue(InGlue.getNode(), DAG);
+ }
+}
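+
+// A self-contained sketch of the clustering policy above, assuming plain
+// integer offsets and a fixed byte window standing in for the target hook
+// shouldScheduleLoadsNear(); nearOffsets() is an illustrative helper, not an
+// LLVM API, and the block is kept out of the build.
+#if 0
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+// Return the offsets considered "near" the smallest offset, in increasing
+// order, mirroring the BaseOff walk in ClusterNeighboringLoads().
+static std::vector<int64_t> nearOffsets(std::vector<int64_t> Offsets,
+                                        int64_t WindowBytes) {
+  std::vector<int64_t> Near;
+  if (Offsets.empty())
+    return Near;
+  std::sort(Offsets.begin(), Offsets.end());
+  int64_t Base = Offsets.front();
+  for (int64_t Off : Offsets) {
+    if (Off - Base > WindowBytes)
+      break; // Stop right here; ignore offsets that are further away.
+    Near.push_back(Off);
+  }
+  return Near;
+}
+#endif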
+
+/// ClusterNodes - Cluster certain nodes which should be scheduled together.
+///
+void ScheduleDAGSDNodes::ClusterNodes() {
+ for (SDNode &NI : DAG->allnodes()) {
+ SDNode *Node = &NI;
+ if (!Node || !Node->isMachineOpcode())
+ continue;
+
+ unsigned Opc = Node->getMachineOpcode();
+ const MCInstrDesc &MCID = TII->get(Opc);
+ if (MCID.mayLoad())
+ // Cluster loads from "near" addresses into combined SUnits.
+ ClusterNeighboringLoads(Node);
+ }
+}
+
+void ScheduleDAGSDNodes::BuildSchedUnits() {
+ // During scheduling, the NodeId field of SDNode is used to map SDNodes
+ // to their associated SUnits by holding SUnits table indices. A value
+ // of -1 means the SDNode does not yet have an associated SUnit.
+ unsigned NumNodes = 0;
+ for (SDNode &NI : DAG->allnodes()) {
+ NI.setNodeId(-1);
+ ++NumNodes;
+ }
+
+ // Reserve entries in the vector for each of the SUnits we are creating. This
+  // ensures that reallocation of the vector won't happen, so SUnit*'s won't get
+ // invalidated.
+ // FIXME: Multiply by 2 because we may clone nodes during scheduling.
+ // This is a temporary workaround.
+ SUnits.reserve(NumNodes * 2);
+
+ // Add all nodes in depth first order.
+ SmallVector<SDNode*, 64> Worklist;
+ SmallPtrSet<SDNode*, 32> Visited;
+ Worklist.push_back(DAG->getRoot().getNode());
+ Visited.insert(DAG->getRoot().getNode());
+
+ SmallVector<SUnit*, 8> CallSUnits;
+ while (!Worklist.empty()) {
+ SDNode *NI = Worklist.pop_back_val();
+
+ // Add all operands to the worklist unless they've already been added.
+ for (const SDValue &Op : NI->op_values())
+ if (Visited.insert(Op.getNode()).second)
+ Worklist.push_back(Op.getNode());
+
+ if (isPassiveNode(NI)) // Leaf node, e.g. a TargetImmediate.
+ continue;
+
+ // If this node has already been processed, stop now.
+ if (NI->getNodeId() != -1) continue;
+
+ SUnit *NodeSUnit = newSUnit(NI);
+
+ // See if anything is glued to this node, if so, add them to glued
+ // nodes. Nodes can have at most one glue input and one glue output. Glue
+ // is required to be the last operand and result of a node.
+
+ // Scan up to find glued preds.
+ SDNode *N = NI;
+ while (N->getNumOperands() &&
+ N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
+ N = N->getOperand(N->getNumOperands()-1).getNode();
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+ if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+ NodeSUnit->isCall = true;
+ }
+
+ // Scan down to find any glued succs.
+ N = NI;
+ while (N->getValueType(N->getNumValues()-1) == MVT::Glue) {
+ SDValue GlueVal(N, N->getNumValues()-1);
+
+ // There are either zero or one users of the Glue result.
+ bool HasGlueUse = false;
+ for (SDNode *U : N->uses())
+ if (GlueVal.isOperandOf(U)) {
+ HasGlueUse = true;
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+ N = U;
+ if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+ NodeSUnit->isCall = true;
+ break;
+ }
+ if (!HasGlueUse) break;
+ }
+
+ if (NodeSUnit->isCall)
+ CallSUnits.push_back(NodeSUnit);
+
+ // Schedule zero-latency TokenFactor below any nodes that may increase the
+ // schedule height. Otherwise, ancestors of the TokenFactor may appear to
+ // have false stalls.
+ if (NI->getOpcode() == ISD::TokenFactor)
+ NodeSUnit->isScheduleLow = true;
+
+ // If there are glue operands involved, N is now the bottom-most node
+ // of the sequence of nodes that are glued together.
+ // Update the SUnit.
+ NodeSUnit->setNode(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+
+ // Compute NumRegDefsLeft. This must be done before AddSchedEdges.
+ InitNumRegDefsLeft(NodeSUnit);
+
+ // Assign the Latency field of NodeSUnit using target-provided information.
+ computeLatency(NodeSUnit);
+ }
+
+ // Find all call operands.
+ while (!CallSUnits.empty()) {
+ SUnit *SU = CallSUnits.pop_back_val();
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->getOpcode() != ISD::CopyToReg)
+ continue;
+ SDNode *SrcN = SUNode->getOperand(2).getNode();
+ if (isPassiveNode(SrcN)) continue; // Not scheduled.
+ SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
+ SrcSU->isCallOp = true;
+ }
+ }
+}
+
+void ScheduleDAGSDNodes::AddSchedEdges() {
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+
+ // Check to see if the scheduler cares about latencies.
+ bool UnitLatencies = forceUnitLatencies();
+
+ // Pass 2: add the preds, succs, etc.
+ for (SUnit &SU : SUnits) {
+ SDNode *MainNode = SU.getNode();
+
+ if (MainNode->isMachineOpcode()) {
+ unsigned Opc = MainNode->getMachineOpcode();
+ const MCInstrDesc &MCID = TII->get(Opc);
+ for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+ if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+ SU.isTwoAddress = true;
+ break;
+ }
+ }
+ if (MCID.isCommutable())
+ SU.isCommutable = true;
+ }
+
+ // Find all predecessors and successors of the group.
+ for (SDNode *N = SU.getNode(); N; N = N->getGluedNode()) {
+ if (N->isMachineOpcode() &&
+ !TII->get(N->getMachineOpcode()).implicit_defs().empty()) {
+ SU.hasPhysRegClobbers = true;
+ unsigned NumUsed = InstrEmitter::CountResults(N);
+ while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
+ --NumUsed; // Skip over unused values at the end.
+ if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
+ SU.hasPhysRegDefs = true;
+ }
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDNode *OpN = N->getOperand(i).getNode();
+ unsigned DefIdx = N->getOperand(i).getResNo();
+ if (isPassiveNode(OpN)) continue; // Not scheduled.
+ SUnit *OpSU = &SUnits[OpN->getNodeId()];
+ assert(OpSU && "Node has no SUnit!");
+ if (OpSU == &SU)
+ continue; // In the same group.
+
+ EVT OpVT = N->getOperand(i).getValueType();
+ assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
+ bool isChain = OpVT == MVT::Other;
+
+ unsigned PhysReg = 0;
+ int Cost = 1;
+ // Determine if this is a physical register dependency.
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+ CheckForPhysRegDependency(OpN, N, i, TRI, TII, TLI, PhysReg, Cost);
+ assert((PhysReg == 0 || !isChain) &&
+ "Chain dependence via physreg data?");
+ // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
+ // emits a copy from the physical register to a virtual register unless
+ // it requires a cross class copy (cost < 0). That means we are only
+ // treating "expensive to copy" register dependency as physical register
+ // dependency. This may change in the future though.
+ if (Cost >= 0 && !StressSched)
+ PhysReg = 0;
+
+ // If this is a ctrl dep, latency is 1.
+ unsigned OpLatency = isChain ? 1 : OpSU->Latency;
+ // Special-case TokenFactor chains as zero-latency.
+ if(isChain && OpN->getOpcode() == ISD::TokenFactor)
+ OpLatency = 0;
+
+ SDep Dep = isChain ? SDep(OpSU, SDep::Barrier)
+ : SDep(OpSU, SDep::Data, PhysReg);
+ Dep.setLatency(OpLatency);
+ if (!isChain && !UnitLatencies) {
+ computeOperandLatency(OpN, N, i, Dep);
+ ST.adjustSchedDependency(OpSU, DefIdx, &SU, i, Dep);
+ }
+
+ if (!SU.addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
+ // Multiple register uses are combined in the same SUnit. For example,
+ // we could have a set of glued nodes with all their defs consumed by
+ // another set of glued nodes. Register pressure tracking sees this as
+ // a single use, so to keep pressure balanced we reduce the defs.
+ //
+ // We can't tell (without more book-keeping) if this results from
+ // glued nodes or duplicate operands. As long as we don't reduce
+ // NumRegDefsLeft to zero, we handle the common cases well.
+ --OpSU->NumRegDefsLeft;
+ }
+ }
+ }
+ }
+}
+
+/// BuildSchedGraph - Build the SUnit graph from the SelectionDAG we are
+/// given as input. This SUnit graph is similar to the SelectionDAG, but
+/// excludes nodes that aren't interesting to scheduling, and represents
+/// glued together nodes with a single SUnit.
+void ScheduleDAGSDNodes::BuildSchedGraph(AAResults *AA) {
+ // Cluster certain nodes which should be scheduled together.
+ ClusterNodes();
+ // Populate the SUnits array.
+ BuildSchedUnits();
+ // Compute all the scheduling dependencies between nodes.
+ AddSchedEdges();
+}
+
+// Initialize NumNodeDefs for the current Node's opcode.
+void ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs() {
+ // Check for phys reg copy.
+ if (!Node)
+ return;
+
+ if (!Node->isMachineOpcode()) {
+ if (Node->getOpcode() == ISD::CopyFromReg)
+ NodeNumDefs = 1;
+ else
+ NodeNumDefs = 0;
+ return;
+ }
+ unsigned POpc = Node->getMachineOpcode();
+ if (POpc == TargetOpcode::IMPLICIT_DEF) {
+ // No register need be allocated for this.
+ NodeNumDefs = 0;
+ return;
+ }
+ if (POpc == TargetOpcode::PATCHPOINT &&
+ Node->getValueType(0) == MVT::Other) {
+ // PATCHPOINT is defined to have one result, but it might really have none
+ // if we're not using CallingConv::AnyReg. Don't mistake the chain for a
+ // real definition.
+ NodeNumDefs = 0;
+ return;
+ }
+ unsigned NRegDefs = SchedDAG->TII->get(Node->getMachineOpcode()).getNumDefs();
+ // Some instructions define regs that are not represented in the selection DAG
+ // (e.g. unused flags). See tMOVi8. Make sure we don't access past NumValues.
+ NodeNumDefs = std::min(Node->getNumValues(), NRegDefs);
+ DefIdx = 0;
+}
+
+// Construct a RegDefIter for this SUnit and find the first valid value.
+ScheduleDAGSDNodes::RegDefIter::RegDefIter(const SUnit *SU,
+ const ScheduleDAGSDNodes *SD)
+ : SchedDAG(SD), Node(SU->getNode()) {
+ InitNodeNumDefs();
+ Advance();
+}
+
+// Advance to the next valid value defined by the SUnit.
+void ScheduleDAGSDNodes::RegDefIter::Advance() {
+ for (;Node;) { // Visit all glued nodes.
+ for (;DefIdx < NodeNumDefs; ++DefIdx) {
+ if (!Node->hasAnyUseOfValue(DefIdx))
+ continue;
+ ValueType = Node->getSimpleValueType(DefIdx);
+ ++DefIdx;
+ return; // Found a normal regdef.
+ }
+ Node = Node->getGluedNode();
+ if (!Node) {
+ return; // No values left to visit.
+ }
+ InitNodeNumDefs();
+ }
+}
+
+void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
+ assert(SU->NumRegDefsLeft == 0 && "expect a new node");
+ for (RegDefIter I(SU, this); I.IsValid(); I.Advance()) {
+ assert(SU->NumRegDefsLeft < USHRT_MAX && "overflow is ok but unexpected");
+ ++SU->NumRegDefsLeft;
+ }
+}
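+
+// RegDefIter and InitNumRegDefsLeft() in one picture: walk every node glued
+// into the SUnit and count each defined value that actually has a use. A
+// flattened sketch over plain containers; FakeNode and countUsedDefs are
+// illustrative names, not LLVM types, and the block is kept out of the build.
+#if 0
+#include <vector>
+
+struct FakeNode {
+  std::vector<bool> ValueHasUse; // one flag per register-defining value
+  FakeNode *Glued = nullptr;     // next node glued below this one
+};
+
+static unsigned countUsedDefs(const FakeNode *N) {
+  unsigned Count = 0;
+  for (; N; N = N->Glued)
+    for (bool Used : N->ValueHasUse)
+      if (Used)
+        ++Count; // Corresponds to one ++SU->NumRegDefsLeft above.
+  return Count;
+}
+#endif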
+
+void ScheduleDAGSDNodes::computeLatency(SUnit *SU) {
+ SDNode *N = SU->getNode();
+
+ // TokenFactor operands are considered zero latency, and some schedulers
+ // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero
+ // whenever node latency is nonzero.
+ if (N && N->getOpcode() == ISD::TokenFactor) {
+ SU->Latency = 0;
+ return;
+ }
+
+ // Check to see if the scheduler cares about latencies.
+ if (forceUnitLatencies()) {
+ SU->Latency = 1;
+ return;
+ }
+
+ if (!InstrItins || InstrItins->isEmpty()) {
+ if (N && N->isMachineOpcode() &&
+ TII->isHighLatencyDef(N->getMachineOpcode()))
+ SU->Latency = HighLatencyCycles;
+ else
+ SU->Latency = 1;
+ return;
+ }
+
+ // Compute the latency for the node. We use the sum of the latencies for
+ // all nodes glued together into this SUnit.
+ SU->Latency = 0;
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+ if (N->isMachineOpcode())
+ SU->Latency += TII->getInstrLatency(InstrItins, N);
+}
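+
+// The latency policy above, restated as a small pure function over flags.
+// latencyFor() is an illustrative helper, not an LLVM API; the constant 10 is
+// the default of the sched-high-latency-cycles option. Kept out of the build.
+#if 0
+static unsigned latencyFor(bool IsTokenFactor, bool ForceUnitLatency,
+                           bool HasItinerary, bool IsHighLatencyDef,
+                           unsigned SumOfGluedItineraryLatencies) {
+  if (IsTokenFactor)
+    return 0; // TokenFactor is always zero latency.
+  if (ForceUnitLatency)
+    return 1; // The scheduler ignores latencies.
+  if (!HasItinerary)
+    return IsHighLatencyDef ? 10 : 1; // HighLatencyCycles fallback.
+  return SumOfGluedItineraryLatencies; // Sum over all glued machine nodes.
+}
+#endif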
+
+void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
+ unsigned OpIdx, SDep& dep) const{
+ // Check to see if the scheduler cares about latencies.
+ if (forceUnitLatencies())
+ return;
+
+ if (dep.getKind() != SDep::Data)
+ return;
+
+ unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
+ if (Use->isMachineOpcode())
+ // Adjust the use operand index by num of defs.
+ OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
+ int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
+ if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
+ !BB->succ_empty()) {
+ unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+ if (Register::isVirtualRegister(Reg))
+ // This copy is a liveout value. It is likely coalesced, so reduce the
+      // latency so as not to penalize the def.
+ // FIXME: need target specific adjustment here?
+ Latency = Latency - 1;
+ }
+ if (Latency >= 0)
+ dep.setLatency(Latency);
+}
+
+void ScheduleDAGSDNodes::dumpNode(const SUnit &SU) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dumpNodeName(SU);
+ dbgs() << ": ";
+
+ if (!SU.getNode()) {
+ dbgs() << "PHYS REG COPY\n";
+ return;
+ }
+
+ SU.getNode()->dump(DAG);
+ dbgs() << "\n";
+ SmallVector<SDNode *, 4> GluedNodes;
+ for (SDNode *N = SU.getNode()->getGluedNode(); N; N = N->getGluedNode())
+ GluedNodes.push_back(N);
+ while (!GluedNodes.empty()) {
+ dbgs() << " ";
+ GluedNodes.back()->dump(DAG);
+ dbgs() << "\n";
+ GluedNodes.pop_back();
+ }
+#endif
+}
+
+void ScheduleDAGSDNodes::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ if (EntrySU.getNode() != nullptr)
+ dumpNodeAll(EntrySU);
+ for (const SUnit &SU : SUnits)
+ dumpNodeAll(SU);
+ if (ExitSU.getNode() != nullptr)
+ dumpNodeAll(ExitSU);
+#endif
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ScheduleDAGSDNodes::dumpSchedule() const {
+ for (const SUnit *SU : Sequence) {
+ if (SU)
+ dumpNode(*SU);
+ else
+ dbgs() << "**** NOOP ****\n";
+ }
+}
+#endif
+
+#ifndef NDEBUG
+/// VerifyScheduledSequence - Verify that all SUnits were scheduled and that
+/// their state is consistent with the nodes listed in Sequence.
+///
+void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
+ unsigned ScheduledNodes = ScheduleDAG::VerifyScheduledDAG(isBottomUp);
+ unsigned Noops = llvm::count(Sequence, nullptr);
+ assert(Sequence.size() - Noops == ScheduledNodes &&
+ "The number of nodes scheduled doesn't match the expected number!");
+}
+#endif // NDEBUG
+
+/// ProcessSDDbgValues - Process SDDbgValues associated with this node.
+static void
+ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
+ SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
+ DenseMap<SDValue, Register> &VRBaseMap, unsigned Order) {
+ if (!N->getHasDebugValue())
+ return;
+
+ /// Returns true if \p DV has any VReg operand locations which don't exist in
+ /// VRBaseMap.
+ auto HasUnknownVReg = [&VRBaseMap](SDDbgValue *DV) {
+ for (const SDDbgOperand &L : DV->getLocationOps()) {
+ if (L.getKind() == SDDbgOperand::SDNODE &&
+ VRBaseMap.count({L.getSDNode(), L.getResNo()}) == 0)
+ return true;
+ }
+ return false;
+ };
+
+ // Opportunistically insert immediate dbg_value uses, i.e. those with the same
+ // source order number as N.
+ MachineBasicBlock *BB = Emitter.getBlock();
+ MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
+ for (auto *DV : DAG->GetDbgValues(N)) {
+ if (DV->isEmitted())
+ continue;
+ unsigned DVOrder = DV->getOrder();
+ if (Order != 0 && DVOrder != Order)
+ continue;
+ // If DV has any VReg location operands which haven't been mapped then
+ // either that node is no longer available or we just haven't visited the
+ // node yet. In the former case we should emit an undef dbg_value, but we
+ // can do it later. And for the latter we'll want to wait until all
+ // dependent nodes have been visited.
+ if (!DV->isInvalidated() && HasUnknownVReg(DV))
+ continue;
+ MachineInstr *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap);
+ if (!DbgMI)
+ continue;
+ Orders.push_back({DVOrder, DbgMI});
+ BB->insert(InsertPos, DbgMI);
+ }
+}
+
+// ProcessSourceNode - Process nodes with source order numbers. These are added
+// to a vector which EmitSchedule uses to determine how to insert dbg_value
+// instructions in the right order.
+static void
+ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
+ DenseMap<SDValue, Register> &VRBaseMap,
+ SmallVectorImpl<std::pair<unsigned, MachineInstr *>> &Orders,
+ SmallSet<Register, 8> &Seen, MachineInstr *NewInsn) {
+ unsigned Order = N->getIROrder();
+ if (!Order || Seen.count(Order)) {
+ // Process any valid SDDbgValues even if node does not have any order
+ // assigned.
+ ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, 0);
+ return;
+ }
+
+ // If a new instruction was generated for this Order number, record it.
+ // Otherwise, leave this order number unseen: we will either find later
+ // instructions for it, or leave it unseen if there were no instructions at
+ // all.
+ if (NewInsn) {
+ Seen.insert(Order);
+ Orders.push_back({Order, NewInsn});
+ }
+
+ // Even if no instruction was generated, a Value may have become defined via
+ // earlier nodes. Try to process them now.
+ ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, Order);
+}
+
+void ScheduleDAGSDNodes::
+EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, Register> &VRBaseMap,
+ MachineBasicBlock::iterator InsertPos) {
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue; // ignore chain preds
+ if (Pred.getSUnit()->CopyDstRC) {
+ // Copy to physical register.
+ DenseMap<SUnit *, Register>::iterator VRI =
+ VRBaseMap.find(Pred.getSUnit());
+ assert(VRI != VRBaseMap.end() && "Node emitted out of order - late");
+ // Find the destination physical register.
+ Register Reg;
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
+          continue; // ignore chain succs
+ if (Succ.getReg()) {
+ Reg = Succ.getReg();
+ break;
+ }
+ }
+ BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg)
+ .addReg(VRI->second);
+ } else {
+ // Copy from physical register.
+ assert(Pred.getReg() && "Unknown physical register!");
+ Register VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
+ bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase)
+ .addReg(Pred.getReg());
+ }
+ break;
+ }
+}
+
+/// EmitSchedule - Emit the machine code in scheduled order. Return the new
+/// InsertPos and MachineBasicBlock that contains this insertion
+/// point. ScheduleDAGSDNodes holds a BB pointer for convenience, but this does
+/// not necessarily refer to the returned BB. The emitter may split blocks.
+MachineBasicBlock *ScheduleDAGSDNodes::
+EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
+ InstrEmitter Emitter(DAG->getTarget(), BB, InsertPos);
+ DenseMap<SDValue, Register> VRBaseMap;
+ DenseMap<SUnit*, Register> CopyVRBaseMap;
+ SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders;
+ SmallSet<Register, 8> Seen;
+ bool HasDbg = DAG->hasDebugValues();
+
+ // Emit a node, and determine where its first instruction is for debuginfo.
+ // Zero, one, or multiple instructions can be created when emitting a node.
+ auto EmitNode =
+ [&](SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, Register> &VRBaseMap) -> MachineInstr * {
+    // Fetch instruction prior to this, or end() if nonexistent.
+ auto GetPrevInsn = [&](MachineBasicBlock::iterator I) {
+ if (I == BB->begin())
+ return BB->end();
+ else
+ return std::prev(Emitter.getInsertPos());
+ };
+
+ MachineBasicBlock::iterator Before = GetPrevInsn(Emitter.getInsertPos());
+ Emitter.EmitNode(Node, IsClone, IsCloned, VRBaseMap);
+ MachineBasicBlock::iterator After = GetPrevInsn(Emitter.getInsertPos());
+
+ // If the iterator did not change, no instructions were inserted.
+ if (Before == After)
+ return nullptr;
+
+ MachineInstr *MI;
+ if (Before == BB->end()) {
+ // There were no prior instructions; the new ones must start at the
+ // beginning of the block.
+ MI = &Emitter.getBlock()->instr_front();
+ } else {
+ // Return first instruction after the pre-existing instructions.
+ MI = &*std::next(Before);
+ }
+
+ if (MI->isCandidateForCallSiteEntry() &&
+ DAG->getTarget().Options.EmitCallSiteInfo)
+ MF.addCallArgsForwardingRegs(MI, DAG->getCallSiteInfo(Node));
+
+ if (DAG->getNoMergeSiteInfo(Node)) {
+ MI->setFlag(MachineInstr::MIFlag::NoMerge);
+ }
+
+ if (MDNode *MD = DAG->getPCSections(Node))
+ MI->setPCSections(MF, MD);
+
+ return MI;
+ };
+
+ // If this is the first BB, emit byval parameter dbg_value's.
+ if (HasDbg && BB->getParent()->begin() == MachineFunction::iterator(BB)) {
+ SDDbgInfo::DbgIterator PDI = DAG->ByvalParmDbgBegin();
+ SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd();
+ for (; PDI != PDE; ++PDI) {
+ MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap);
+ if (DbgMI) {
+ BB->insert(InsertPos, DbgMI);
+ // We re-emit the dbg_value closer to its use, too, after instructions
+ // are emitted to the BB.
+ (*PDI)->clearIsEmitted();
+ }
+ }
+ }
+
+ for (SUnit *SU : Sequence) {
+ if (!SU) {
+ // Null SUnit* is a noop.
+ TII->insertNoop(*Emitter.getBlock(), InsertPos);
+ continue;
+ }
+
+ // For pre-regalloc scheduling, create instructions corresponding to the
+ // SDNode and any glued SDNodes and append them to the block.
+ if (!SU->getNode()) {
+ // Emit a copy.
+ EmitPhysRegCopy(SU, CopyVRBaseMap, InsertPos);
+ continue;
+ }
+
+ SmallVector<SDNode *, 4> GluedNodes;
+ for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode())
+ GluedNodes.push_back(N);
+ while (!GluedNodes.empty()) {
+ SDNode *N = GluedNodes.back();
+ auto NewInsn = EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap);
+ // Remember the source order of the inserted instruction.
+ if (HasDbg)
+ ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen, NewInsn);
+
+ if (MDNode *MD = DAG->getHeapAllocSite(N))
+ if (NewInsn && NewInsn->isCall())
+ NewInsn->setHeapAllocMarker(MF, MD);
+
+ GluedNodes.pop_back();
+ }
+ auto NewInsn =
+ EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned, VRBaseMap);
+ // Remember the source order of the inserted instruction.
+ if (HasDbg)
+ ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders, Seen,
+ NewInsn);
+
+ if (MDNode *MD = DAG->getHeapAllocSite(SU->getNode())) {
+ if (NewInsn && NewInsn->isCall())
+ NewInsn->setHeapAllocMarker(MF, MD);
+ }
+ }
+
+ // Insert all the dbg_values which have not already been inserted in source
+ // order sequence.
+ if (HasDbg) {
+ MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI();
+
+ // Sort the source order instructions and use the order to insert debug
+ // values. Use stable_sort so that DBG_VALUEs are inserted in the same order
+    // regardless of the host's implementation of std::sort.
+ llvm::stable_sort(Orders, less_first());
+ std::stable_sort(DAG->DbgBegin(), DAG->DbgEnd(),
+ [](const SDDbgValue *LHS, const SDDbgValue *RHS) {
+ return LHS->getOrder() < RHS->getOrder();
+ });
+
+ SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
+ SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
+ // Now emit the rest according to source order.
+ unsigned LastOrder = 0;
+ for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) {
+ unsigned Order = Orders[i].first;
+ MachineInstr *MI = Orders[i].second;
+ // Insert all SDDbgValue's whose order(s) are before "Order".
+ assert(MI);
+ for (; DI != DE; ++DI) {
+ if ((*DI)->getOrder() < LastOrder || (*DI)->getOrder() >= Order)
+ break;
+ if ((*DI)->isEmitted())
+ continue;
+
+ MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap);
+ if (DbgMI) {
+ if (!LastOrder)
+ // Insert to start of the BB (after PHIs).
+ BB->insert(BBBegin, DbgMI);
+ else {
+ // Insert at the instruction, which may be in a different
+ // block, if the block was split by a custom inserter.
+ MachineBasicBlock::iterator Pos = MI;
+ MI->getParent()->insert(Pos, DbgMI);
+ }
+ }
+ }
+ LastOrder = Order;
+ }
+ // Add trailing DbgValue's before the terminator. FIXME: May want to add
+ // some of them before one or more conditional branches?
+ SmallVector<MachineInstr*, 8> DbgMIs;
+ for (; DI != DE; ++DI) {
+ if ((*DI)->isEmitted())
+ continue;
+ assert((*DI)->getOrder() >= LastOrder &&
+ "emitting DBG_VALUE out of order");
+ if (MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap))
+ DbgMIs.push_back(DbgMI);
+ }
+
+ MachineBasicBlock *InsertBB = Emitter.getBlock();
+ MachineBasicBlock::iterator Pos = InsertBB->getFirstTerminator();
+ InsertBB->insert(Pos, DbgMIs.begin(), DbgMIs.end());
+
+ SDDbgInfo::DbgLabelIterator DLI = DAG->DbgLabelBegin();
+ SDDbgInfo::DbgLabelIterator DLE = DAG->DbgLabelEnd();
+ // Now emit the rest according to source order.
+ LastOrder = 0;
+ for (const auto &InstrOrder : Orders) {
+ unsigned Order = InstrOrder.first;
+ MachineInstr *MI = InstrOrder.second;
+ if (!MI)
+ continue;
+
+ // Insert all SDDbgLabel's whose order(s) are before "Order".
+ for (; DLI != DLE &&
+ (*DLI)->getOrder() >= LastOrder && (*DLI)->getOrder() < Order;
+ ++DLI) {
+ MachineInstr *DbgMI = Emitter.EmitDbgLabel(*DLI);
+ if (DbgMI) {
+ if (!LastOrder)
+ // Insert to start of the BB (after PHIs).
+ BB->insert(BBBegin, DbgMI);
+ else {
+ // Insert at the instruction, which may be in a different
+ // block, if the block was split by a custom inserter.
+ MachineBasicBlock::iterator Pos = MI;
+ MI->getParent()->insert(Pos, DbgMI);
+ }
+ }
+ }
+ if (DLI == DLE)
+ break;
+
+ LastOrder = Order;
+ }
+ }
+
+ InsertPos = Emitter.getInsertPos();
+ // In some cases, DBG_VALUEs might be inserted after the first terminator,
+ // which results in an invalid MBB. If that happens, move the DBG_VALUEs
+ // before the first terminator.
+ MachineBasicBlock *InsertBB = Emitter.getBlock();
+ auto FirstTerm = InsertBB->getFirstTerminator();
+ if (FirstTerm != InsertBB->end()) {
+ assert(!FirstTerm->isDebugValue() &&
+ "first terminator cannot be a debug value");
+ for (MachineInstr &MI : make_early_inc_range(
+ make_range(std::next(FirstTerm), InsertBB->end()))) {
+ // Only scan up to insertion point.
+ if (&MI == InsertPos)
+ break;
+
+ if (!MI.isDebugValue())
+ continue;
+
+ // The DBG_VALUE was referencing a value produced by a terminator. By
+ // moving the DBG_VALUE, the referenced value also needs invalidating.
+ MI.getOperand(0).ChangeToRegister(0, false);
+ MI.moveBefore(&*FirstTerm);
+ }
+ }
+ return InsertBB;
+}
+
+/// Return the DAG name, derived from the basic block label.
+std::string ScheduleDAGSDNodes::getDAGName() const {
+ return "sunit-dag." + BB->getFullName();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
new file mode 100644
index 000000000000..439ccfdc3275
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -0,0 +1,193 @@
+//===---- ScheduleDAGSDNodes.h - SDNode Scheduling --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScheduleDAGSDNodes class, which implements
+// scheduling for an SDNode-based dependency graph.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H
+
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+class AAResults;
+class InstrItineraryData;
+
+ /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs.
+ ///
+ /// Edges between SUnits are initially based on edges in the SelectionDAG,
+ /// and additional edges can be added by the schedulers as heuristics.
+ /// SDNodes such as Constants, Registers, and a few others that are not
+ /// interesting to schedulers are not allocated SUnits.
+ ///
+  /// SDNodes with MVT::Glue operands are grouped along with the glued
+ /// nodes into a single SUnit so that they are scheduled together.
+ ///
+ /// SDNode-based scheduling graphs do not use SDep::Anti or SDep::Output
+ /// edges. Physical register dependence information is not carried in
+ /// the DAG and must be handled explicitly by schedulers.
+ ///
+ class ScheduleDAGSDNodes : public ScheduleDAG {
+ public:
+ MachineBasicBlock *BB = nullptr;
+ SelectionDAG *DAG = nullptr; // DAG of the current basic block
+ const InstrItineraryData *InstrItins;
+
+ /// The schedule. Null SUnit*'s represent noop instructions.
+ std::vector<SUnit*> Sequence;
+
+ explicit ScheduleDAGSDNodes(MachineFunction &mf);
+
+ ~ScheduleDAGSDNodes() override = default;
+
+ /// Run - perform scheduling.
+ ///
+ void Run(SelectionDAG *dag, MachineBasicBlock *bb);
+
+ /// isPassiveNode - Return true if the node is a non-scheduled leaf.
+ ///
+ static bool isPassiveNode(SDNode *Node) {
+ if (isa<ConstantSDNode>(Node)) return true;
+ if (isa<ConstantFPSDNode>(Node)) return true;
+ if (isa<RegisterSDNode>(Node)) return true;
+ if (isa<RegisterMaskSDNode>(Node)) return true;
+ if (isa<GlobalAddressSDNode>(Node)) return true;
+ if (isa<BasicBlockSDNode>(Node)) return true;
+ if (isa<FrameIndexSDNode>(Node)) return true;
+ if (isa<ConstantPoolSDNode>(Node)) return true;
+ if (isa<TargetIndexSDNode>(Node)) return true;
+ if (isa<JumpTableSDNode>(Node)) return true;
+ if (isa<ExternalSymbolSDNode>(Node)) return true;
+ if (isa<MCSymbolSDNode>(Node)) return true;
+ if (isa<BlockAddressSDNode>(Node)) return true;
+ if (Node->getOpcode() == ISD::EntryToken ||
+ isa<MDNodeSDNode>(Node)) return true;
+ return false;
+ }
+
+    /// NewSUnit - Creates a new SUnit and returns a pointer to it.
+ ///
+ SUnit *newSUnit(SDNode *N);
+
+ /// Clone - Creates a clone of the specified SUnit. It does not copy the
+ /// predecessors / successors info nor the temporary scheduling states.
+ ///
+ SUnit *Clone(SUnit *Old);
+
+    /// BuildSchedGraph - Build the SUnit graph from the SelectionDAG we are
+    /// given as input. This SUnit graph is similar to the SelectionDAG, but
+    /// excludes nodes that aren't interesting to scheduling, and represents
+    /// glued together nodes with a single SUnit.
+ void BuildSchedGraph(AAResults *AA);
+
+ /// InitNumRegDefsLeft - Determine the # of regs defined by this node.
+ ///
+ void InitNumRegDefsLeft(SUnit *SU);
+
+ /// computeLatency - Compute node latency.
+ ///
+ virtual void computeLatency(SUnit *SU);
+
+ virtual void computeOperandLatency(SDNode *Def, SDNode *Use,
+ unsigned OpIdx, SDep& dep) const;
+
+ /// Schedule - Order nodes according to selected style, filling
+ /// in the Sequence member.
+ ///
+ virtual void Schedule() = 0;
+
+ /// VerifyScheduledSequence - Verify that all SUnits are scheduled and
+ /// consistent with the Sequence of scheduled instructions.
+ void VerifyScheduledSequence(bool isBottomUp);
+
+ /// EmitSchedule - Insert MachineInstrs into the MachineBasicBlock
+ /// according to the order specified in Sequence.
+ ///
+ virtual MachineBasicBlock*
+ EmitSchedule(MachineBasicBlock::iterator &InsertPos);
+
+ void dumpNode(const SUnit &SU) const override;
+ void dump() const override;
+ void dumpSchedule() const;
+
+ std::string getGraphNodeLabel(const SUnit *SU) const override;
+
+ std::string getDAGName() const override;
+
+ virtual void getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const;
+
+ /// RegDefIter - In place iteration over the values defined by an
+ /// SUnit. This does not need copies of the iterator or any other STLisms.
+ /// The iterator creates itself, rather than being provided by the SchedDAG.
+ class RegDefIter {
+ const ScheduleDAGSDNodes *SchedDAG;
+ const SDNode *Node;
+ unsigned DefIdx = 0;
+ unsigned NodeNumDefs = 0;
+ MVT ValueType;
+
+ public:
+ RegDefIter(const SUnit *SU, const ScheduleDAGSDNodes *SD);
+
+ bool IsValid() const { return Node != nullptr; }
+
+ MVT GetValue() const {
+ assert(IsValid() && "bad iterator");
+ return ValueType;
+ }
+
+ const SDNode *GetNode() const {
+ return Node;
+ }
+
+ unsigned GetIdx() const {
+ return DefIdx-1;
+ }
+
+ void Advance();
+
+ private:
+ void InitNodeNumDefs();
+ };
+
+ protected:
+ /// ForceUnitLatencies - Return true if all scheduling edges should be given
+ /// a latency value of one. The default is to return false; schedulers may
+ /// override this as needed.
+ virtual bool forceUnitLatencies() const { return false; }
+
+ private:
+ /// ClusterNeighboringLoads - Cluster loads from "near" addresses into
+ /// combined SUnits.
+ void ClusterNeighboringLoads(SDNode *Node);
+ /// ClusterNodes - Cluster certain nodes which should be scheduled together.
+ ///
+ void ClusterNodes();
+
+ /// BuildSchedUnits, AddSchedEdges - Helper functions for BuildSchedGraph.
+ void BuildSchedUnits();
+ void AddSchedEdges();
+
+ void EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, Register> &VRBaseMap,
+ MachineBasicBlock::iterator InsertPos);
+ };
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
new file mode 100644
index 000000000000..1ba1fd65b8c9
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -0,0 +1,271 @@
+//===- ScheduleDAGVLIW.cpp - SelectionDAG list scheduler for VLIW -*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(NumNoops , "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+
+static RegisterScheduler
+ VLIWScheduler("vliw-td", "VLIW scheduler",
+ createVLIWDAGScheduler);
+
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGVLIW - The actual DFA list scheduler implementation. This
+/// supports top-down scheduling.
+///
+class ScheduleDAGVLIW : public ScheduleDAGSDNodes {
+private:
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ ///
+ SchedulingPriorityQueue *AvailableQueue;
+
+ /// PendingQueue - This contains all of the instructions whose operands have
+ /// been issued, but their results are not ready yet (due to the latency of
+ /// the operation). Once the operands become available, the instruction is
+ /// added to the AvailableQueue.
+ std::vector<SUnit*> PendingQueue;
+
+ /// HazardRec - The hazard recognizer to use.
+ ScheduleHazardRecognizer *HazardRec;
+
+ /// AA - AAResults for making memory reference queries.
+ AAResults *AA;
+
+public:
+ ScheduleDAGVLIW(MachineFunction &mf, AAResults *aa,
+ SchedulingPriorityQueue *availqueue)
+ : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) {
+ const TargetSubtargetInfo &STI = mf.getSubtarget();
+ HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this);
+ }
+
+ ~ScheduleDAGVLIW() override {
+ delete HazardRec;
+ delete AvailableQueue;
+ }
+
+ void Schedule() override;
+
+private:
+ void releaseSucc(SUnit *SU, const SDep &D);
+ void releaseSuccessors(SUnit *SU);
+ void scheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+ void listScheduleTopDown();
+};
+} // end anonymous namespace
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGVLIW::Schedule() {
+ LLVM_DEBUG(dbgs() << "********** List Scheduling " << printMBBReference(*BB)
+ << " '" << BB->getName() << "' **********\n");
+
+ // Build the scheduling graph.
+ BuildSchedGraph(AA);
+
+ AvailableQueue->initNodes(SUnits);
+
+ listScheduleTopDown();
+
+ AvailableQueue->releaseState();
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// releaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGVLIW::releaseSucc(SUnit *SU, const SDep &D) {
+ SUnit *SuccSU = D.getSUnit();
+
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ dumpNode(*SuccSU);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ assert(!D.isWeak() && "unexpected artificial DAG edge");
+
+ --SuccSU->NumPredsLeft;
+
+ SuccSU->setDepthToAtLeast(SU->getDepth() + D.getLatency());
+
+ // If all the node's predecessors are scheduled, this node is ready
+ // to be scheduled. Ignore the special ExitSU node.
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) {
+ PendingQueue.push_back(SuccSU);
+ }
+}
+
+void ScheduleDAGVLIW::releaseSuccessors(SUnit *SU) {
+ // Top down: release successors.
+ for (SDep &Succ : SU->Succs) {
+ assert(!Succ.isAssignedRegDep() &&
+ "The list-td scheduler doesn't yet support physreg dependencies!");
+
+ releaseSucc(SU, Succ);
+ }
+}
+
+/// scheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGVLIW::scheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+ LLVM_DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ LLVM_DEBUG(dumpNode(*SU));
+
+ Sequence.push_back(SU);
+ assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
+ SU->setDepthToAtLeast(CurCycle);
+
+ releaseSuccessors(SU);
+ SU->isScheduled = true;
+ AvailableQueue->scheduledNode(SU);
+}
+
+/// listScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void ScheduleDAGVLIW::listScheduleTopDown() {
+ unsigned CurCycle = 0;
+
+ // Release any successors of the special Entry node.
+ releaseSuccessors(&EntrySU);
+
+  // Add all leaves to the AvailableQueue.
+ for (SUnit &SU : SUnits) {
+ // It is available if it has no predecessors.
+ if (SU.Preds.empty()) {
+ AvailableQueue->push(&SU);
+ SU.isAvailable = true;
+ }
+ }
+
+ // While AvailableQueue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ std::vector<SUnit*> NotReady;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue->empty() || !PendingQueue.empty()) {
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+ if (PendingQueue[i]->getDepth() == CurCycle) {
+ AvailableQueue->push(PendingQueue[i]);
+ PendingQueue[i]->isAvailable = true;
+ PendingQueue[i] = PendingQueue.back();
+ PendingQueue.pop_back();
+ --i; --e;
+ }
+ else {
+ assert(PendingQueue[i]->getDepth() > CurCycle && "Negative latency?");
+ }
+ }
+
+ // If there are no instructions available, don't try to issue anything, and
+ // don't advance the hazard recognizer.
+ if (AvailableQueue->empty()) {
+ // Reset DFA state.
+ AvailableQueue->scheduledNode(nullptr);
+ ++CurCycle;
+ continue;
+ }
+
+ SUnit *FoundSUnit = nullptr;
+
+ bool HasNoopHazards = false;
+ while (!AvailableQueue->empty()) {
+ SUnit *CurSUnit = AvailableQueue->pop();
+
+ ScheduleHazardRecognizer::HazardType HT =
+ HazardRec->getHazardType(CurSUnit, 0/*no stalls*/);
+ if (HT == ScheduleHazardRecognizer::NoHazard) {
+ FoundSUnit = CurSUnit;
+ break;
+ }
+
+ // Remember if this is a noop hazard.
+ HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+ NotReady.push_back(CurSUnit);
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ if (!NotReady.empty()) {
+ AvailableQueue->push_all(NotReady);
+ NotReady.clear();
+ }
+
+ // If we found a node to schedule, do it now.
+ if (FoundSUnit) {
+ scheduleNodeTopDown(FoundSUnit, CurCycle);
+ HazardRec->EmitInstruction(FoundSUnit);
+
+ // If this is a pseudo-op node, we don't want to increment the current
+ // cycle.
+ if (FoundSUnit->Latency) // Don't increment CurCycle for pseudo-ops!
+ ++CurCycle;
+ } else if (!HasNoopHazards) {
+ // Otherwise, we have a pipeline stall, but no other problem, just advance
+ // the current cycle and try again.
+ LLVM_DEBUG(dbgs() << "*** Advancing cycle, no work to do\n");
+ HazardRec->AdvanceCycle();
+ ++NumStalls;
+ ++CurCycle;
+ } else {
+ // Otherwise, we have no instructions to issue and we have instructions
+ // that will fault if we don't do this right. This is the case for
+ // processors without pipeline interlocks and other cases.
+ LLVM_DEBUG(dbgs() << "*** Emitting noop\n");
+ HazardRec->EmitNoop();
+ Sequence.push_back(nullptr); // NULL here means noop
+ ++NumNoops;
+ ++CurCycle;
+ }
+ }
+
+#ifndef NDEBUG
+ VerifyScheduledSequence(/*isBottomUp=*/false);
+#endif
+}
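+
+// The per-iteration decision in listScheduleTopDown(), summarized as a tiny
+// state function. CycleAction and decide() are illustrative names, not LLVM
+// APIs; note that even in the Schedule case the cycle only advances when the
+// issued node has nonzero latency. Kept out of the build.
+#if 0
+enum class CycleAction { Schedule, AdvanceCycle, EmitNoop };
+
+static CycleAction decide(bool FoundReadySUnit, bool HasNoopHazards) {
+  if (FoundReadySUnit)
+    return CycleAction::Schedule;     // Issue the node this cycle.
+  if (!HasNoopHazards)
+    return CycleAction::AdvanceCycle; // Plain pipeline stall.
+  return CycleAction::EmitNoop;       // Fill the slot with an explicit noop.
+}
+#endif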
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+/// createVLIWDAGScheduler - This creates a top-down list scheduler.
+ScheduleDAGSDNodes *
+llvm::createVLIWDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGVLIW(*IS->MF, IS->AA, new ResourcePriorityQueue(IS));
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
new file mode 100644
index 000000000000..30d202494320
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -0,0 +1,12710 @@
+//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+/// makeVTList - Return an instance of the SDVTList struct initialized with the
+/// specified members.
+static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
+ SDVTList Res = {VTs, NumVTs};
+ return Res;
+}
+
+// Default null implementations of the callbacks.
+void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {}
+void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
+void SelectionDAG::DAGUpdateListener::NodeInserted(SDNode *) {}
+
+void SelectionDAG::DAGNodeDeletedListener::anchor() {}
+void SelectionDAG::DAGNodeInsertedListener::anchor() {}
+
+#define DEBUG_TYPE "selectiondag"
+
+static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt",
+ cl::Hidden, cl::init(true),
+ cl::desc("Gang up loads and stores generated by inlining of memcpy"));
+
+static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
+ cl::desc("Number limit for gluing ld/st of memcpy."),
+ cl::Hidden, cl::init(0));
+
+static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
+ LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
+}
+
+//===----------------------------------------------------------------------===//
+// ConstantFPSDNode Class
+//===----------------------------------------------------------------------===//
+
+/// isExactlyValue - We don't rely on operator== working on double values, as
+/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
+/// As such, this method can be used to do an exact bit-for-bit comparison of
+/// two floating point values.
+bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
+ return getValueAPF().bitwiseIsEqual(V);
+}
+
+bool ConstantFPSDNode::isValueValidForType(EVT VT,
+ const APFloat& Val) {
+ assert(VT.isFloatingPoint() && "Can only convert between FP types");
+
+ // convert modifies in place, so make a copy.
+ APFloat Val2 = APFloat(Val);
+ bool losesInfo;
+ (void) Val2.convert(SelectionDAG::EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ return !losesInfo;
+}
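+
+// Example of the losesInfo check above, specialized to a single -> half
+// conversion: 1.0f converts exactly, while 1.0e5f exceeds half's range and
+// loses information. fitsInHalf() is an illustrative helper, not an LLVM API;
+// kept out of the build.
+#if 0
+static bool fitsInHalf(float F) {
+  APFloat V(F);
+  bool LosesInfo = false;
+  V.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
+  return !LosesInfo; // fitsInHalf(1.0f) == true, fitsInHalf(1.0e5f) == false
+}
+#endif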
+
+//===----------------------------------------------------------------------===//
+// ISD Namespace
+//===----------------------------------------------------------------------===//
+
+bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
+ if (N->getOpcode() == ISD::SPLAT_VECTOR) {
+ unsigned EltSize =
+ N->getValueType(0).getVectorElementType().getSizeInBits();
+ if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ SplatVal = Op0->getAPIntValue().trunc(EltSize);
+ return true;
+ }
+ if (auto *Op0 = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
+ SplatVal = Op0->getValueAPF().bitcastToAPInt().trunc(EltSize);
+ return true;
+ }
+ }
+
+ auto *BV = dyn_cast<BuildVectorSDNode>(N);
+ if (!BV)
+ return false;
+
+ APInt SplatUndef;
+ unsigned SplatBitSize;
+ bool HasUndefs;
+ unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
+ return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs,
+ EltSize) &&
+ EltSize == SplatBitSize;
+}
+
+// FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
+// specializations of the more general isConstantSplatVector()?
+
+bool ISD::isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly) {
+ // Look through a bit convert.
+ while (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ if (!BuildVectorOnly && N->getOpcode() == ISD::SPLAT_VECTOR) {
+ APInt SplatVal;
+ return isConstantSplatVector(N, SplatVal) && SplatVal.isAllOnes();
+ }
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+ unsigned i = 0, e = N->getNumOperands();
+
+ // Skip over all of the undef values.
+ while (i != e && N->getOperand(i).isUndef())
+ ++i;
+
+ // Do not accept an all-undef vector.
+ if (i == e) return false;
+
+ // Do not accept build_vectors that aren't all constants or which have non-~0
+ // elements. We have to be a bit careful here, as the type of the constant
+ // may not be the same as the type of the vector elements due to type
+ // legalization (the elements are promoted to a legal type for the target and
+ // a vector of a type may be legal when the base element type is not).
+ // We only want to check enough bits to cover the vector elements, because
+ // we care if the resultant vector is all ones, not whether the individual
+ // constants are.
+ SDValue NotZero = N->getOperand(i);
+ unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) {
+ if (CN->getAPIntValue().countr_one() < EltSize)
+ return false;
+ } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) {
+ if (CFPN->getValueAPF().bitcastToAPInt().countr_one() < EltSize)
+ return false;
+ } else
+ return false;
+
+ // Okay, we have at least one ~0 value, check to see if the rest match or are
+ // undefs. Even with the above element type twiddling, this should be OK, as
+ // the same type legalization should have applied to all the elements.
+ for (++i; i != e; ++i)
+ if (N->getOperand(i) != NotZero && !N->getOperand(i).isUndef())
+ return false;
+ return true;
+}
+
+bool ISD::isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly) {
+ // Look through a bit convert.
+ while (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ if (!BuildVectorOnly && N->getOpcode() == ISD::SPLAT_VECTOR) {
+ APInt SplatVal;
+ return isConstantSplatVector(N, SplatVal) && SplatVal.isZero();
+ }
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+ bool IsAllUndef = true;
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.isUndef())
+ continue;
+ IsAllUndef = false;
+ // Do not accept build_vectors that aren't all constants or which have non-0
+ // elements. We have to be a bit careful here, as the type of the constant
+ // may not be the same as the type of the vector elements due to type
+ // legalization (the elements are promoted to a legal type for the target
+ // and a vector of a type may be legal when the base element type is not).
+ // We only want to check enough bits to cover the vector elements, because
+ // we care if the resultant vector is all zeros, not whether the individual
+ // constants are.
+ unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+ if (CN->getAPIntValue().countr_zero() < EltSize)
+ return false;
+ } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) {
+ if (CFPN->getValueAPF().bitcastToAPInt().countr_zero() < EltSize)
+ return false;
+ } else
+ return false;
+ }
+
+ // Do not accept an all-undef vector.
+ if (IsAllUndef)
+ return false;
+ return true;
+}
+
+bool ISD::isBuildVectorAllOnes(const SDNode *N) {
+ return isConstantSplatVectorAllOnes(N, /*BuildVectorOnly*/ true);
+}
+
+bool ISD::isBuildVectorAllZeros(const SDNode *N) {
+ return isConstantSplatVectorAllZeros(N, /*BuildVectorOnly*/ true);
+}
+
+bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.isUndef())
+ continue;
+ if (!isa<ConstantSDNode>(Op))
+ return false;
+ }
+ return true;
+}
+
+bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.isUndef())
+ continue;
+ if (!isa<ConstantFPSDNode>(Op))
+ return false;
+ }
+ return true;
+}
+
+bool ISD::isVectorShrinkable(const SDNode *N, unsigned NewEltSize,
+ bool Signed) {
+ assert(N->getValueType(0).isVector() && "Expected a vector!");
+
+ unsigned EltSize = N->getValueType(0).getScalarSizeInBits();
+ if (EltSize <= NewEltSize)
+ return false;
+
+ if (N->getOpcode() == ISD::ZERO_EXTEND) {
+ return (N->getOperand(0).getValueType().getScalarSizeInBits() <=
+ NewEltSize) &&
+ !Signed;
+ }
+ if (N->getOpcode() == ISD::SIGN_EXTEND) {
+ return (N->getOperand(0).getValueType().getScalarSizeInBits() <=
+ NewEltSize) &&
+ Signed;
+ }
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.isUndef())
+ continue;
+ if (!isa<ConstantSDNode>(Op))
+ return false;
+
+ APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().trunc(EltSize);
+ if (Signed && C.trunc(NewEltSize).sext(EltSize) != C)
+ return false;
+ if (!Signed && C.trunc(NewEltSize).zext(EltSize) != C)
+ return false;
+ }
+
+ return true;
+}
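+
+// Example (illustrative): a v4i32 BUILD_VECTOR of the constants 1, 2, 3 and 4
+// is shrinkable to NewEltSize == 8 for both signed and unsigned queries, since
+// every element round-trips through an 8-bit truncation; replacing one element
+// with 300 blocks the shrink, as 300 does not survive truncation to 8 bits.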
+
+bool ISD::allOperandsUndef(const SDNode *N) {
+ // Return false if the node has no operands.
+ // This is "logically inconsistent" with the definition of "all" but
+ // is probably the desired behavior.
+ if (N->getNumOperands() == 0)
+ return false;
+ return all_of(N->op_values(), [](SDValue Op) { return Op.isUndef(); });
+}
+
+bool ISD::isFreezeUndef(const SDNode *N) {
+ return N->getOpcode() == ISD::FREEZE && N->getOperand(0).isUndef();
+}
+
+bool ISD::matchUnaryPredicate(SDValue Op,
+ std::function<bool(ConstantSDNode *)> Match,
+ bool AllowUndefs) {
+ // FIXME: Add support for scalar UNDEF cases?
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
+ return Match(Cst);
+
+ // FIXME: Add support for vector UNDEF cases?
+ if (ISD::BUILD_VECTOR != Op.getOpcode() &&
+ ISD::SPLAT_VECTOR != Op.getOpcode())
+ return false;
+
+ EVT SVT = Op.getValueType().getScalarType();
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ if (AllowUndefs && Op.getOperand(i).isUndef()) {
+ if (!Match(nullptr))
+ return false;
+ continue;
+ }
+
+ auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
+ if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
+ return false;
+ }
+ return true;
+}
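+
+// Example (illustrative sketch; Op is assumed to be an SDValue in scope):
+//   bool AllLowByteMasks = ISD::matchUnaryPredicate(
+//       Op, [](ConstantSDNode *C) { return C->getAPIntValue().isMask(8); });
+// This returns true for a scalar i32 constant 0xFF as well as for a
+// BUILD_VECTOR or SPLAT_VECTOR whose elements are all 0xFF.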
+
+bool ISD::matchBinaryPredicate(
+ SDValue LHS, SDValue RHS,
+ std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match,
+ bool AllowUndefs, bool AllowTypeMismatch) {
+ if (!AllowTypeMismatch && LHS.getValueType() != RHS.getValueType())
+ return false;
+
+ // TODO: Add support for scalar UNDEF cases?
+ if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
+ if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
+ return Match(LHSCst, RHSCst);
+
+ // TODO: Add support for vector UNDEF cases?
+ if (LHS.getOpcode() != RHS.getOpcode() ||
+ (LHS.getOpcode() != ISD::BUILD_VECTOR &&
+ LHS.getOpcode() != ISD::SPLAT_VECTOR))
+ return false;
+
+ EVT SVT = LHS.getValueType().getScalarType();
+ for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
+ SDValue LHSOp = LHS.getOperand(i);
+ SDValue RHSOp = RHS.getOperand(i);
+ bool LHSUndef = AllowUndefs && LHSOp.isUndef();
+ bool RHSUndef = AllowUndefs && RHSOp.isUndef();
+ auto *LHSCst = dyn_cast<ConstantSDNode>(LHSOp);
+ auto *RHSCst = dyn_cast<ConstantSDNode>(RHSOp);
+ if ((!LHSCst && !LHSUndef) || (!RHSCst && !RHSUndef))
+ return false;
+ if (!AllowTypeMismatch && (LHSOp.getValueType() != SVT ||
+ LHSOp.getValueType() != RHSOp.getValueType()))
+ return false;
+ if (!Match(LHSCst, RHSCst))
+ return false;
+ }
+ return true;
+}
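+
+// Example (illustrative sketch; LHS and RHS are assumed to be SDValues in
+// scope):
+//   bool LHSAlwaysSmaller = ISD::matchBinaryPredicate(
+//       LHS, RHS, [](ConstantSDNode *L, ConstantSDNode *R) {
+//         return L->getAPIntValue().ult(R->getAPIntValue());
+//       });
+// The predicate is applied to a pair of scalar constants, or element-wise to
+// two BUILD_VECTOR/SPLAT_VECTOR nodes of the same opcode and type.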
+
+ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) {
+ switch (VecReduceOpcode) {
+ default:
+ llvm_unreachable("Expected VECREDUCE opcode");
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ return ISD::FADD;
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_SEQ_FMUL:
+ return ISD::FMUL;
+ case ISD::VECREDUCE_ADD:
+ case ISD::VP_REDUCE_ADD:
+ return ISD::ADD;
+ case ISD::VECREDUCE_MUL:
+ case ISD::VP_REDUCE_MUL:
+ return ISD::MUL;
+ case ISD::VECREDUCE_AND:
+ case ISD::VP_REDUCE_AND:
+ return ISD::AND;
+ case ISD::VECREDUCE_OR:
+ case ISD::VP_REDUCE_OR:
+ return ISD::OR;
+ case ISD::VECREDUCE_XOR:
+ case ISD::VP_REDUCE_XOR:
+ return ISD::XOR;
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VP_REDUCE_SMAX:
+ return ISD::SMAX;
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VP_REDUCE_SMIN:
+ return ISD::SMIN;
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VP_REDUCE_UMAX:
+ return ISD::UMAX;
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VP_REDUCE_UMIN:
+ return ISD::UMIN;
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VP_REDUCE_FMAX:
+ return ISD::FMAXNUM;
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VP_REDUCE_FMIN:
+ return ISD::FMINNUM;
+ case ISD::VECREDUCE_FMAXIMUM:
+ return ISD::FMAXIMUM;
+ case ISD::VECREDUCE_FMINIMUM:
+ return ISD::FMINIMUM;
+ }
+}
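+
+// Example (illustrative): getVecReduceBaseOpcode(ISD::VECREDUCE_ADD) and
+// getVecReduceBaseOpcode(ISD::VP_REDUCE_ADD) both return ISD::ADD, the scalar
+// operation that the reduction applies across the vector lanes.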
+
+bool ISD::isVPOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) \
+ case ISD::VPSD: \
+ return true;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+bool ISD::isVPBinaryOp(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ break;
+#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) case ISD::VPSD:
+#define VP_PROPERTY_BINARYOP return true;
+#define END_REGISTER_VP_SDNODE(VPSD) break;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+ return false;
+}
+
+bool ISD::isVPReduction(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ break;
+#define BEGIN_REGISTER_VP_SDNODE(VPSD, ...) case ISD::VPSD:
+#define VP_PROPERTY_REDUCTION(STARTPOS, ...) return true;
+#define END_REGISTER_VP_SDNODE(VPSD) break;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+ return false;
+}
+
+/// The operand position of the vector mask.
+std::optional<unsigned> ISD::getVPMaskIdx(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return std::nullopt;
+#define BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, TDNAME, MASKPOS, ...) \
+ case ISD::VPSD: \
+ return MASKPOS;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+/// The operand position of the explicit vector length parameter.
+std::optional<unsigned> ISD::getVPExplicitVectorLengthIdx(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return std::nullopt;
+#define BEGIN_REGISTER_VP_SDNODE(VPSD, LEGALPOS, TDNAME, MASKPOS, EVLPOS) \
+ case ISD::VPSD: \
+ return EVLPOS;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+std::optional<unsigned> ISD::getBaseOpcodeForVP(unsigned VPOpcode,
+ bool hasFPExcept) {
+ // FIXME: Return strict opcodes in case of fp exceptions.
+ switch (VPOpcode) {
+ default:
+ return std::nullopt;
+#define BEGIN_REGISTER_VP_SDNODE(VPOPC, ...) case ISD::VPOPC:
+#define VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) return ISD::SDOPC;
+#define END_REGISTER_VP_SDNODE(VPOPC) break;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+ return std::nullopt;
+}
+
+unsigned ISD::getVPForBaseOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("can not translate this Opcode to VP.");
+#define BEGIN_REGISTER_VP_SDNODE(VPOPC, ...) break;
+#define VP_PROPERTY_FUNCTIONAL_SDOPC(SDOPC) case ISD::SDOPC:
+#define END_REGISTER_VP_SDNODE(VPOPC) return ISD::VPOPC;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
+ switch (ExtType) {
+ case ISD::EXTLOAD:
+ return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
+ case ISD::SEXTLOAD:
+ return ISD::SIGN_EXTEND;
+ case ISD::ZEXTLOAD:
+ return ISD::ZERO_EXTEND;
+ default:
+ break;
+ }
+
+ llvm_unreachable("Invalid LoadExtType");
+}
+
+ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
+ // To perform this operation, we just need to swap the L and G bits of the
+ // operation.
+ unsigned OldL = (Operation >> 2) & 1;
+ unsigned OldG = (Operation >> 1) & 1;
+ return ISD::CondCode((Operation & ~6) | // Keep the N, U, E bits
+ (OldL << 1) | // New G bit
+ (OldG << 2)); // New L bit.
+}
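+
+// Example (illustrative): swapping the operands of (setcc a, b, SETLT) gives
+// (setcc b, a, SETGT), so getSetCCSwappedOperands(ISD::SETLT) == ISD::SETGT;
+// likewise SETULE maps to SETUGE, while SETEQ and SETNE map to themselves.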
+
+static ISD::CondCode getSetCCInverseImpl(ISD::CondCode Op, bool isIntegerLike) {
+ unsigned Operation = Op;
+ if (isIntegerLike)
+ Operation ^= 7; // Flip L, G, E bits, but not U.
+ else
+ Operation ^= 15; // Flip all of the condition bits.
+
+ if (Operation > ISD::SETTRUE2)
+ Operation &= ~8; // Don't let N and U bits get set.
+
+ return ISD::CondCode(Operation);
+}
+
+ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, EVT Type) {
+ return getSetCCInverseImpl(Op, Type.isInteger());
+}
+
+ISD::CondCode ISD::GlobalISel::getSetCCInverse(ISD::CondCode Op,
+ bool isIntegerLike) {
+ return getSetCCInverseImpl(Op, isIntegerLike);
+}
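+
+// Example (illustrative): for integer types getSetCCInverse turns ISD::SETEQ
+// into ISD::SETNE and ISD::SETLT into ISD::SETGE; for floating-point types the
+// orderedness also flips, so ISD::SETOLT inverts to ISD::SETUGE.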
+
+/// For an integer comparison, return 1 if the comparison is a signed operation
+/// and 2 if it is an unsigned comparison. Return zero if the operation does
+/// not depend on the sign of the input (setne and seteq).
+static int isSignedOp(ISD::CondCode Opcode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Illegal integer setcc operation!");
+ case ISD::SETEQ:
+ case ISD::SETNE: return 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETGT:
+ case ISD::SETGE: return 1;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ case ISD::SETUGE: return 2;
+ }
+}
+
+ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+ EVT Type) {
+ bool IsInteger = Type.isInteger();
+ if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ // Cannot fold a signed integer setcc with an unsigned integer setcc.
+ return ISD::SETCC_INVALID;
+
+ unsigned Op = Op1 | Op2; // Combine all of the condition bits.
+
+ // If the N and U bits get set, then the resultant comparison DOES suddenly
+ // care about orderedness, and it is true when ordered.
+ if (Op > ISD::SETTRUE2)
+ Op &= ~16; // Clear the U bit if the N bit is set.
+
+ // Canonicalize illegal integer setcc's.
+ if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
+ Op = ISD::SETNE;
+
+ return ISD::CondCode(Op);
+}
+
+ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+ EVT Type) {
+ bool IsInteger = Type.isInteger();
+ if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ // Cannot fold a signed setcc with an unsigned setcc.
+ return ISD::SETCC_INVALID;
+
+ // Combine all of the condition bits.
+ ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
+
+ // Canonicalize illegal integer setcc's.
+ if (IsInteger) {
+ switch (Result) {
+ default: break;
+ case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT
+ case ISD::SETOEQ: // SETEQ & SETU[LG]E
+ case ISD::SETUEQ: Result = ISD::SETEQ ; break; // SETUGE & SETULE
+ case ISD::SETOLT: Result = ISD::SETULT ; break; // SETULT & SETNE
+ case ISD::SETOGT: Result = ISD::SETUGT ; break; // SETUGT & SETNE
+ }
+ }
+
+ return Result;
+}
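+
+// Example (illustrative; VT names some integer EVT in scope):
+// getSetCCOrOperation(ISD::SETUGT, ISD::SETULT, VT) yields ISD::SETNE, and
+// getSetCCAndOperation(ISD::SETLE, ISD::SETGE, VT) yields ISD::SETEQ;
+// combining a signed with an unsigned comparison returns ISD::SETCC_INVALID
+// in either routine.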
+
+//===----------------------------------------------------------------------===//
+// SDNode Profile Support
+//===----------------------------------------------------------------------===//
+
+/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
+static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
+ ID.AddInteger(OpC);
+}
+
+/// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
+/// solely with their pointer.
+static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
+ ID.AddPointer(VTList.VTs);
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+ ArrayRef<SDValue> Ops) {
+ for (const auto &Op : Ops) {
+ ID.AddPointer(Op.getNode());
+ ID.AddInteger(Op.getResNo());
+ }
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+ ArrayRef<SDUse> Ops) {
+ for (const auto &Op : Ops) {
+ ID.AddPointer(Op.getNode());
+ ID.AddInteger(Op.getResNo());
+ }
+}
+
+static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned OpC,
+ SDVTList VTList, ArrayRef<SDValue> OpList) {
+ AddNodeIDOpcode(ID, OpC);
+ AddNodeIDValueTypes(ID, VTList);
+ AddNodeIDOperands(ID, OpList);
+}
+
+/// If this is an SDNode with special info, add this info to the NodeID data.
+static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
+ switch (N->getOpcode()) {
+ case ISD::TargetExternalSymbol:
+ case ISD::ExternalSymbol:
+ case ISD::MCSymbol:
+ llvm_unreachable("Should only be used on nodes with operands");
+ default: break; // Normal nodes don't need extra info.
+ case ISD::TargetConstant:
+ case ISD::Constant: {
+ const ConstantSDNode *C = cast<ConstantSDNode>(N);
+ ID.AddPointer(C->getConstantIntValue());
+ ID.AddBoolean(C->isOpaque());
+ break;
+ }
+ case ISD::TargetConstantFP:
+ case ISD::ConstantFP:
+ ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
+ break;
+ case ISD::TargetGlobalAddress:
+ case ISD::GlobalAddress:
+ case ISD::TargetGlobalTLSAddress:
+ case ISD::GlobalTLSAddress: {
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+ ID.AddPointer(GA->getGlobal());
+ ID.AddInteger(GA->getOffset());
+ ID.AddInteger(GA->getTargetFlags());
+ break;
+ }
+ case ISD::BasicBlock:
+ ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
+ break;
+ case ISD::Register:
+ ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
+ break;
+ case ISD::RegisterMask:
+ ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
+ break;
+ case ISD::SRCVALUE:
+ ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
+ break;
+ case ISD::FrameIndex:
+ case ISD::TargetFrameIndex:
+ ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
+ break;
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
+ if (cast<LifetimeSDNode>(N)->hasOffset()) {
+ ID.AddInteger(cast<LifetimeSDNode>(N)->getSize());
+ ID.AddInteger(cast<LifetimeSDNode>(N)->getOffset());
+ }
+ break;
+ case ISD::PSEUDO_PROBE:
+ ID.AddInteger(cast<PseudoProbeSDNode>(N)->getGuid());
+ ID.AddInteger(cast<PseudoProbeSDNode>(N)->getIndex());
+ ID.AddInteger(cast<PseudoProbeSDNode>(N)->getAttributes());
+ break;
+ case ISD::JumpTable:
+ case ISD::TargetJumpTable:
+ ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
+ ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
+ break;
+ case ISD::ConstantPool:
+ case ISD::TargetConstantPool: {
+ const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+ ID.AddInteger(CP->getAlign().value());
+ ID.AddInteger(CP->getOffset());
+ if (CP->isMachineConstantPoolEntry())
+ CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
+ else
+ ID.AddPointer(CP->getConstVal());
+ ID.AddInteger(CP->getTargetFlags());
+ break;
+ }
+ case ISD::TargetIndex: {
+ const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
+ ID.AddInteger(TI->getIndex());
+ ID.AddInteger(TI->getOffset());
+ ID.AddInteger(TI->getTargetFlags());
+ break;
+ }
+ case ISD::LOAD: {
+ const LoadSDNode *LD = cast<LoadSDNode>(N);
+ ID.AddInteger(LD->getMemoryVT().getRawBits());
+ ID.AddInteger(LD->getRawSubclassData());
+ ID.AddInteger(LD->getPointerInfo().getAddrSpace());
+ ID.AddInteger(LD->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::STORE: {
+ const StoreSDNode *ST = cast<StoreSDNode>(N);
+ ID.AddInteger(ST->getMemoryVT().getRawBits());
+ ID.AddInteger(ST->getRawSubclassData());
+ ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+ ID.AddInteger(ST->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::VP_LOAD: {
+ const VPLoadSDNode *ELD = cast<VPLoadSDNode>(N);
+ ID.AddInteger(ELD->getMemoryVT().getRawBits());
+ ID.AddInteger(ELD->getRawSubclassData());
+ ID.AddInteger(ELD->getPointerInfo().getAddrSpace());
+ ID.AddInteger(ELD->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::VP_STORE: {
+ const VPStoreSDNode *EST = cast<VPStoreSDNode>(N);
+ ID.AddInteger(EST->getMemoryVT().getRawBits());
+ ID.AddInteger(EST->getRawSubclassData());
+ ID.AddInteger(EST->getPointerInfo().getAddrSpace());
+ ID.AddInteger(EST->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: {
+ const VPStridedLoadSDNode *SLD = cast<VPStridedLoadSDNode>(N);
+ ID.AddInteger(SLD->getMemoryVT().getRawBits());
+ ID.AddInteger(SLD->getRawSubclassData());
+ ID.AddInteger(SLD->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE: {
+ const VPStridedStoreSDNode *SST = cast<VPStridedStoreSDNode>(N);
+ ID.AddInteger(SST->getMemoryVT().getRawBits());
+ ID.AddInteger(SST->getRawSubclassData());
+ ID.AddInteger(SST->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::VP_GATHER: {
+ const VPGatherSDNode *EG = cast<VPGatherSDNode>(N);
+ ID.AddInteger(EG->getMemoryVT().getRawBits());
+ ID.AddInteger(EG->getRawSubclassData());
+ ID.AddInteger(EG->getPointerInfo().getAddrSpace());
+ ID.AddInteger(EG->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::VP_SCATTER: {
+ const VPScatterSDNode *ES = cast<VPScatterSDNode>(N);
+ ID.AddInteger(ES->getMemoryVT().getRawBits());
+ ID.AddInteger(ES->getRawSubclassData());
+ ID.AddInteger(ES->getPointerInfo().getAddrSpace());
+ ID.AddInteger(ES->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::MLOAD: {
+ const MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
+ ID.AddInteger(MLD->getMemoryVT().getRawBits());
+ ID.AddInteger(MLD->getRawSubclassData());
+ ID.AddInteger(MLD->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MLD->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::MSTORE: {
+ const MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+ ID.AddInteger(MST->getMemoryVT().getRawBits());
+ ID.AddInteger(MST->getRawSubclassData());
+ ID.AddInteger(MST->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MST->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::MGATHER: {
+ const MaskedGatherSDNode *MG = cast<MaskedGatherSDNode>(N);
+ ID.AddInteger(MG->getMemoryVT().getRawBits());
+ ID.AddInteger(MG->getRawSubclassData());
+ ID.AddInteger(MG->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MG->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::MSCATTER: {
+ const MaskedScatterSDNode *MS = cast<MaskedScatterSDNode>(N);
+ ID.AddInteger(MS->getMemoryVT().getRawBits());
+ ID.AddInteger(MS->getRawSubclassData());
+ ID.AddInteger(MS->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MS->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: {
+ const AtomicSDNode *AT = cast<AtomicSDNode>(N);
+ ID.AddInteger(AT->getMemoryVT().getRawBits());
+ ID.AddInteger(AT->getRawSubclassData());
+ ID.AddInteger(AT->getPointerInfo().getAddrSpace());
+ ID.AddInteger(AT->getMemOperand()->getFlags());
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+ for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
+ i != e; ++i)
+ ID.AddInteger(SVN->getMaskElt(i));
+ break;
+ }
+ case ISD::TargetBlockAddress:
+ case ISD::BlockAddress: {
+ const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
+ ID.AddPointer(BA->getBlockAddress());
+ ID.AddInteger(BA->getOffset());
+ ID.AddInteger(BA->getTargetFlags());
+ break;
+ }
+ case ISD::AssertAlign:
+ ID.AddInteger(cast<AssertAlignSDNode>(N)->getAlign().value());
+ break;
+ case ISD::PREFETCH:
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN:
+ // Handled by MemIntrinsicSDNode check after the switch.
+ break;
+ } // end switch (N->getOpcode())
+
+ // MemIntrinsic nodes could also have subclass data, address spaces, and flags
+ // to check.
+ if (auto *MN = dyn_cast<MemIntrinsicSDNode>(N)) {
+ ID.AddInteger(MN->getRawSubclassData());
+ ID.AddInteger(MN->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MN->getMemOperand()->getFlags());
+ ID.AddInteger(MN->getMemoryVT().getRawBits());
+ }
+}
+
+/// AddNodeIDNode - Generic routine for adding a node's info to the NodeID
+/// data.
+static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
+ AddNodeIDOpcode(ID, N->getOpcode());
+ // Add the return value info.
+ AddNodeIDValueTypes(ID, N->getVTList());
+ // Add the operand info.
+ AddNodeIDOperands(ID, N->ops());
+
+ // Handle SDNode leaves with special info.
+ AddNodeIDCustom(ID, N);
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAG Class
+//===----------------------------------------------------------------------===//
+
+/// doNotCSE - Return true if CSE should not be performed for this node.
+static bool doNotCSE(SDNode *N) {
+ if (N->getValueType(0) == MVT::Glue)
+ return true; // Never CSE anything that produces a flag.
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::HANDLENODE:
+ case ISD::EH_LABEL:
+ return true; // Never CSE these nodes.
+ }
+
+ // Check that remaining values produced are not flags.
+ for (unsigned i = 1, e = N->getNumValues(); i != e; ++i)
+ if (N->getValueType(i) == MVT::Glue)
+ return true; // Never CSE anything that produces a flag.
+
+ return false;
+}
+
+/// RemoveDeadNodes - This method deletes all unreachable nodes in the
+/// SelectionDAG.
+void SelectionDAG::RemoveDeadNodes() {
+ // Create a dummy node (which is not added to allnodes) that adds a reference
+ // to the root node, preventing it from being deleted.
+ HandleSDNode Dummy(getRoot());
+
+ SmallVector<SDNode*, 128> DeadNodes;
+
+ // Add all obviously-dead nodes to the DeadNodes worklist.
+ for (SDNode &Node : allnodes())
+ if (Node.use_empty())
+ DeadNodes.push_back(&Node);
+
+ RemoveDeadNodes(DeadNodes);
+
+ // If the root changed (e.g. it was a dead load), update the root.
+ setRoot(Dummy.getValue());
+}
+
+/// RemoveDeadNodes - This method deletes the unreachable nodes in the
+/// given list, and any nodes that become unreachable as a result.
+void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
+
+ // Process the worklist, deleting the nodes and adding their uses to the
+ // worklist.
+ while (!DeadNodes.empty()) {
+ SDNode *N = DeadNodes.pop_back_val();
+ // Skip to the next node if we've already managed to delete this one. This
+ // could happen if replacing a node causes a node previously added to the
+ // worklist to be deleted.
+ if (N->getOpcode() == ISD::DELETED_NODE)
+ continue;
+
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeDeleted(N, nullptr);
+
+ // Take the node out of the appropriate CSE map.
+ RemoveNodeFromCSEMaps(N);
+
+ // Next, brutally remove the operand list. This is safe to do, as there are
+ // no cycles in the graph.
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ SDNode *Operand = Use.getNode();
+ Use.set(SDValue());
+
+ // Now that we removed this operand, see if there are no uses of it left.
+ if (Operand->use_empty())
+ DeadNodes.push_back(Operand);
+ }
+
+ DeallocateNode(N);
+ }
+}
+
+void SelectionDAG::RemoveDeadNode(SDNode *N) {
+ SmallVector<SDNode*, 16> DeadNodes(1, N);
+
+ // Create a dummy node that adds a reference to the root node, preventing
+ // it from being deleted. (This matters if the root is an operand of the
+ // dead node.)
+ HandleSDNode Dummy(getRoot());
+
+ RemoveDeadNodes(DeadNodes);
+}
+
+void SelectionDAG::DeleteNode(SDNode *N) {
+ // First take this out of the appropriate CSE map.
+ RemoveNodeFromCSEMaps(N);
+
+ // Finally, remove uses due to operands of this node, remove from the
+ // AllNodes list, and delete the node.
+ DeleteNodeNotInCSEMaps(N);
+}
+
+void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
+ assert(N->getIterator() != AllNodes.begin() &&
+ "Cannot delete the entry node!");
+ assert(N->use_empty() && "Cannot delete a node that is not dead!");
+
+ // Drop all of the operands and decrement used node's use counts.
+ N->DropOperands();
+
+ DeallocateNode(N);
+}
+
+void SDDbgInfo::add(SDDbgValue *V, bool isParameter) {
+ assert(!(V->isVariadic() && isParameter));
+ if (isParameter)
+ ByvalParmDbgValues.push_back(V);
+ else
+ DbgValues.push_back(V);
+ for (const SDNode *Node : V->getSDNodes())
+ if (Node)
+ DbgValMap[Node].push_back(V);
+}
+
+void SDDbgInfo::erase(const SDNode *Node) {
+ DbgValMapType::iterator I = DbgValMap.find(Node);
+ if (I == DbgValMap.end())
+ return;
+ for (auto &Val: I->second)
+ Val->setIsInvalidated();
+ DbgValMap.erase(I);
+}
+
+void SelectionDAG::DeallocateNode(SDNode *N) {
+ // If we have operands, deallocate them.
+ removeOperands(N);
+
+ NodeAllocator.Deallocate(AllNodes.remove(N));
+
+ // Set the opcode to DELETED_NODE to help catch bugs when node
+ // memory is reallocated.
+ // FIXME: There are places in SDag that have grown a dependency on the opcode
+ // value in the released node.
+ __asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType));
+ N->NodeType = ISD::DELETED_NODE;
+
+ // If any of the SDDbgValue nodes refer to this SDNode, invalidate
+ // them and forget about that node.
+ DbgInfo->erase(N);
+
+ // Invalidate extra info.
+ SDEI.erase(N);
+}
+
+#ifndef NDEBUG
+/// VerifySDNode - Check the given SDNode. Aborts if it is invalid.
+static void VerifySDNode(SDNode *N) {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::BUILD_PAIR: {
+ EVT VT = N->getValueType(0);
+ assert(N->getNumValues() == 1 && "Too many results!");
+ assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
+ "Wrong return type!");
+ assert(N->getNumOperands() == 2 && "Wrong number of operands!");
+ assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+ "Mismatched operand types!");
+ assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
+ "Wrong operand type!");
+ assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
+ "Wrong return type size");
+ break;
+ }
+ case ISD::BUILD_VECTOR: {
+ assert(N->getNumValues() == 1 && "Too many results!");
+ assert(N->getValueType(0).isVector() && "Wrong return type!");
+ assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
+ "Wrong number of operands!");
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ for (const SDUse &Op : N->ops()) {
+ assert((Op.getValueType() == EltVT ||
+ (EltVT.isInteger() && Op.getValueType().isInteger() &&
+ EltVT.bitsLE(Op.getValueType()))) &&
+ "Wrong operand type!");
+ assert(Op.getValueType() == N->getOperand(0).getValueType() &&
+ "Operands must all have the same type");
+ }
+ break;
+ }
+ }
+}
+#endif // NDEBUG
+
+/// Insert a newly allocated node into the DAG.
+///
+/// Handles insertion into the all nodes list and CSE map, as well as
+/// verification and other common operations when a new node is allocated.
+void SelectionDAG::InsertNode(SDNode *N) {
+ AllNodes.push_back(N);
+#ifndef NDEBUG
+ N->PersistentId = NextPersistentId++;
+ VerifySDNode(N);
+#endif
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeInserted(N);
+}
+
+/// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
+/// corresponds to it. This is useful when we're about to delete or repurpose
+/// the node. We don't want future requests for structurally identical nodes
+/// to return N anymore.
+bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
+ bool Erased = false;
+ switch (N->getOpcode()) {
+ case ISD::HANDLENODE: return false; // noop.
+ case ISD::CONDCODE:
+ assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
+ "Cond code doesn't exist!");
+ Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != nullptr;
+ CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = nullptr;
+ break;
+ case ISD::ExternalSymbol:
+ Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
+ break;
+ case ISD::TargetExternalSymbol: {
+ ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
+ Erased = TargetExternalSymbols.erase(std::pair<std::string, unsigned>(
+ ESN->getSymbol(), ESN->getTargetFlags()));
+ break;
+ }
+ case ISD::MCSymbol: {
+ auto *MCSN = cast<MCSymbolSDNode>(N);
+ Erased = MCSymbols.erase(MCSN->getMCSymbol());
+ break;
+ }
+ case ISD::VALUETYPE: {
+ EVT VT = cast<VTSDNode>(N)->getVT();
+ if (VT.isExtended()) {
+ Erased = ExtendedValueTypeNodes.erase(VT);
+ } else {
+ Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr;
+ ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr;
+ }
+ break;
+ }
+ default:
+ // Remove it from the CSE Map.
+ assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
+ assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
+ Erased = CSEMap.RemoveNode(N);
+ break;
+ }
+#ifndef NDEBUG
+ // Verify that the node was actually in one of the CSE maps, unless it has a
+ // flag result (which cannot be CSE'd) or is one of the special cases that are
+ // not subject to CSE.
+ if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
+ !N->isMachineOpcode() && !doNotCSE(N)) {
+ N->dump(this);
+ dbgs() << "\n";
+ llvm_unreachable("Node is not in map!");
+ }
+#endif
+ return Erased;
+}
+
+/// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
+/// maps and modified in place. Add it back to the CSE maps, unless an identical
+/// node already exists, in which case transfer all its users to the existing
+/// node. This transfer can potentially trigger recursive merging.
+void
+SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
+ // For node types that aren't CSE'd, just act as if no identical node
+ // already exists.
+ if (!doNotCSE(N)) {
+ SDNode *Existing = CSEMap.GetOrInsertNode(N);
+ if (Existing != N) {
+ // If there was already an existing matching node, use ReplaceAllUsesWith
+ // to replace the dead one with the existing one. This can cause
+ // recursive merging of other unrelated nodes down the line.
+ ReplaceAllUsesWith(N, Existing);
+
+ // N is now dead. Inform the listeners and delete it.
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeDeleted(N, Existing);
+ DeleteNodeNotInCSEMaps(N);
+ return;
+ }
+ }
+
+ // If the node doesn't already exist, we updated it. Inform listeners.
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeUpdated(N);
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return nullptr;
+
+ SDValue Ops[] = { Op };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
+ AddNodeIDCustom(ID, N);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
+ if (Node)
+ Node->intersectFlagsWith(N->getFlags());
+ return Node;
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+ SDValue Op1, SDValue Op2,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return nullptr;
+
+ SDValue Ops[] = { Op1, Op2 };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
+ AddNodeIDCustom(ID, N);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
+ if (Node)
+ Node->intersectFlagsWith(N->getFlags());
+ return Node;
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return nullptr;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
+ AddNodeIDCustom(ID, N);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
+ if (Node)
+ Node->intersectFlagsWith(N->getFlags());
+ return Node;
+}
+
+Align SelectionDAG::getEVTAlign(EVT VT) const {
+ Type *Ty = VT == MVT::iPTR ?
+ PointerType::get(Type::getInt8Ty(*getContext()), 0) :
+ VT.getTypeForEVT(*getContext());
+
+ return getDataLayout().getABITypeAlign(Ty);
+}
+
+// EntryNode could meaningfully have debug info if we can find it...
+SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
+ : TM(tm), OptLevel(OL),
+ EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other, MVT::Glue)),
+ Root(getEntryNode()) {
+ InsertNode(&EntryNode);
+ DbgInfo = new SDDbgInfo();
+}
+
+void SelectionDAG::init(MachineFunction &NewMF,
+ OptimizationRemarkEmitter &NewORE, Pass *PassPtr,
+ const TargetLibraryInfo *LibraryInfo,
+ UniformityInfo *NewUA, ProfileSummaryInfo *PSIin,
+ BlockFrequencyInfo *BFIin,
+ FunctionVarLocs const *VarLocs) {
+ MF = &NewMF;
+ SDAGISelPass = PassPtr;
+ ORE = &NewORE;
+ TLI = getSubtarget().getTargetLowering();
+ TSI = getSubtarget().getSelectionDAGInfo();
+ LibInfo = LibraryInfo;
+ Context = &MF->getFunction().getContext();
+ UA = NewUA;
+ PSI = PSIin;
+ BFI = BFIin;
+ FnVarLocs = VarLocs;
+}
+
+SelectionDAG::~SelectionDAG() {
+ assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
+ allnodes_clear();
+ OperandRecycler.clear(OperandAllocator);
+ delete DbgInfo;
+}
+
+bool SelectionDAG::shouldOptForSize() const {
+ return MF->getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(FLI->MBB->getBasicBlock(), PSI, BFI);
+}
+
+void SelectionDAG::allnodes_clear() {
+ assert(&*AllNodes.begin() == &EntryNode);
+ AllNodes.remove(AllNodes.begin());
+ while (!AllNodes.empty())
+ DeallocateNode(&AllNodes.front());
+#ifndef NDEBUG
+ NextPersistentId = 0;
+#endif
+}
+
+SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+ void *&InsertPos) {
+ SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+ if (N) {
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::Constant:
+ case ISD::ConstantFP:
+ llvm_unreachable("Querying for Constant and ConstantFP nodes requires "
+ "debug location. Use another overload.");
+ }
+ }
+ return N;
+}
+
+SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+ const SDLoc &DL, void *&InsertPos) {
+ SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+ if (N) {
+ switch (N->getOpcode()) {
+ case ISD::Constant:
+ case ISD::ConstantFP:
+ // Erase debug location from the node if the node is used at several
+ // different places. Do not propagate one location to all uses as it
+ // will cause a worse single stepping debugging experience.
+ if (N->getDebugLoc() != DL.getDebugLoc())
+ N->setDebugLoc(DebugLoc());
+ break;
+ default:
+ // When the node's point of use is located earlier in the instruction
+ // sequence than its prior point of use, update its debug info to the
+ // earlier location.
+ if (DL.getIROrder() && DL.getIROrder() < N->getIROrder())
+ N->setDebugLoc(DL.getDebugLoc());
+ break;
+ }
+ }
+ return N;
+}
+
+void SelectionDAG::clear() {
+ allnodes_clear();
+ OperandRecycler.clear(OperandAllocator);
+ OperandAllocator.Reset();
+ CSEMap.clear();
+
+ ExtendedValueTypeNodes.clear();
+ ExternalSymbols.clear();
+ TargetExternalSymbols.clear();
+ MCSymbols.clear();
+ SDEI.clear();
+ std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
+ static_cast<CondCodeSDNode*>(nullptr));
+ std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
+ static_cast<SDNode*>(nullptr));
+
+ EntryNode.UseList = nullptr;
+ InsertNode(&EntryNode);
+ Root = getEntryNode();
+ DbgInfo->clear();
+}
+
+SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType())
+ ? getNode(ISD::FP_EXTEND, DL, VT, Op)
+ : getNode(ISD::FP_ROUND, DL, VT, Op,
+ getIntPtrConstant(0, DL, /*isTarget=*/true));
+}
+
+std::pair<SDValue, SDValue>
+SelectionDAG::getStrictFPExtendOrRound(SDValue Op, SDValue Chain,
+ const SDLoc &DL, EVT VT) {
+ assert(!VT.bitsEq(Op.getValueType()) &&
+ "Strict no-op FP extend/round not allowed.");
+ SDValue Res =
+ VT.bitsGT(Op.getValueType())
+ ? getNode(ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other}, {Chain, Op})
+ : getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
+ {Chain, Op, getIntPtrConstant(0, DL)});
+
+ return std::pair<SDValue, SDValue>(Res, SDValue(Res.getNode(), 1));
+}
+
+SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType()) ?
+ getNode(ISD::ANY_EXTEND, DL, VT, Op) :
+ getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType()) ?
+ getNode(ISD::SIGN_EXTEND, DL, VT, Op) :
+ getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType()) ?
+ getNode(ISD::ZERO_EXTEND, DL, VT, Op) :
+ getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
+ EVT OpVT) {
+ if (VT.bitsLE(Op.getValueType()))
+ return getNode(ISD::TRUNCATE, SL, VT, Op);
+
+ TargetLowering::BooleanContent BType = TLI->getBooleanContents(OpVT);
+ return getNode(TLI->getExtendForContent(BType), SL, VT, Op);
+}
+
+SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
+ EVT OpVT = Op.getValueType();
+ assert(VT.isInteger() && OpVT.isInteger() &&
+ "Cannot getZeroExtendInReg FP types");
+ assert(VT.isVector() == OpVT.isVector() &&
+ "getZeroExtendInReg type should be vector iff the operand "
+ "type is vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorElementCount() == OpVT.getVectorElementCount()) &&
+ "Vector element counts must match in getZeroExtendInReg");
+ assert(VT.bitsLE(OpVT) && "Not extending!");
+ if (OpVT == VT)
+ return Op;
+ APInt Imm = APInt::getLowBitsSet(OpVT.getScalarSizeInBits(),
+ VT.getScalarSizeInBits());
+ return getNode(ISD::AND, DL, OpVT, Op, getConstant(Imm, DL, OpVT));
+}
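+
+// Example (illustrative; DL is assumed to be an SDLoc in scope): for an i32
+// operand Op, getZeroExtendInReg(Op, DL, MVT::i8) emits (and Op, 0xFF),
+// clearing every bit above the low 8 while keeping the i32 result type.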
+
+SDValue SelectionDAG::getPtrExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+ // Only unsigned pointer semantics are supported right now. In the future this
+ // might delegate to TLI to check pointer signedness.
+ return getZExtOrTrunc(Op, DL, VT);
+}
+
+SDValue SelectionDAG::getPtrExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
+ // Only unsigned pointer semantics are supported right now. In the future this
+ // might delegate to TLI to check pointer signedness.
+ return getZeroExtendInReg(Op, DL, VT);
+}
+
+SDValue SelectionDAG::getNegative(SDValue Val, const SDLoc &DL, EVT VT) {
+ return getNode(ISD::SUB, DL, VT, getConstant(0, DL, VT), Val);
+}
+
+/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
+SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
+ return getNode(ISD::XOR, DL, VT, Val, getAllOnesConstant(DL, VT));
+}
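+
+// Example (illustrative; DL and X assumed in scope): getNOT(DL, X, MVT::i32)
+// produces (xor X, 0xFFFFFFFF), the bitwise complement of X, since the
+// all-ones constant has every bit of the destination type set.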
+
+SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
+ SDValue TrueValue = getBoolConstant(true, DL, VT, VT);
+ return getNode(ISD::XOR, DL, VT, Val, TrueValue);
+}
+
+SDValue SelectionDAG::getVPLogicalNOT(const SDLoc &DL, SDValue Val,
+ SDValue Mask, SDValue EVL, EVT VT) {
+ SDValue TrueValue = getBoolConstant(true, DL, VT, VT);
+ return getNode(ISD::VP_XOR, DL, VT, Val, TrueValue, Mask, EVL);
+}
+
+SDValue SelectionDAG::getVPPtrExtOrTrunc(const SDLoc &DL, EVT VT, SDValue Op,
+ SDValue Mask, SDValue EVL) {
+ return getVPZExtOrTrunc(DL, VT, Op, Mask, EVL);
+}
+
+SDValue SelectionDAG::getVPZExtOrTrunc(const SDLoc &DL, EVT VT, SDValue Op,
+ SDValue Mask, SDValue EVL) {
+ if (VT.bitsGT(Op.getValueType()))
+ return getNode(ISD::VP_ZERO_EXTEND, DL, VT, Op, Mask, EVL);
+ if (VT.bitsLT(Op.getValueType()))
+ return getNode(ISD::VP_TRUNCATE, DL, VT, Op, Mask, EVL);
+ return Op;
+}
+
+SDValue SelectionDAG::getBoolConstant(bool V, const SDLoc &DL, EVT VT,
+ EVT OpVT) {
+ if (!V)
+ return getConstant(0, DL, VT);
+
+ switch (TLI->getBooleanContents(OpVT)) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ case TargetLowering::UndefinedBooleanContent:
+ return getConstant(1, DL, VT);
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return getAllOnesConstant(DL, VT);
+ }
+ llvm_unreachable("Unexpected boolean content enum!");
+}
+
+SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
+ bool isT, bool isO) {
+ EVT EltVT = VT.getScalarType();
+ assert((EltVT.getSizeInBits() >= 64 ||
+ (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
+ "getConstant with a uint64_t value that doesn't fit in the type!");
+ return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO);
+}
+
+SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
+ bool isT, bool isO) {
+ return getConstant(*ConstantInt::get(*Context, Val), DL, VT, isT, isO);
+}
+
+SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
+ EVT VT, bool isT, bool isO) {
+ assert(VT.isInteger() && "Cannot create FP integer constant!");
+
+ EVT EltVT = VT.getScalarType();
+ const ConstantInt *Elt = &Val;
+
+ // In some cases the vector type is legal but the element type is illegal and
+ // needs to be promoted, for example v8i8 on ARM. In this case, promote the
+ // inserted value (the type does not need to match the vector element type).
+ // Any extra bits introduced will be truncated away.
+ if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
+ TargetLowering::TypePromoteInteger) {
+ EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+ APInt NewVal = Elt->getValue().zextOrTrunc(EltVT.getSizeInBits());
+ Elt = ConstantInt::get(*getContext(), NewVal);
+ }
+ // In other cases the element type is illegal and needs to be expanded, for
+ // example v2i64 on MIPS32. In this case, find the nearest legal type, split
+ // the value into n parts and use a vector type with n-times the elements.
+ // Then bitcast to the type requested.
+ // Legalizing constants too early makes the DAGCombiner's job harder so we
+ // only legalize if the DAG tells us we must produce legal types.
+ else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
+ TLI->getTypeAction(*getContext(), EltVT) ==
+ TargetLowering::TypeExpandInteger) {
+ const APInt &NewVal = Elt->getValue();
+ EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+ unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
+
+ // For scalable vectors, try to use a SPLAT_VECTOR_PARTS node.
+ if (VT.isScalableVector()) {
+ assert(EltVT.getSizeInBits() % ViaEltSizeInBits == 0 &&
+ "Can only handle an even split!");
+ unsigned Parts = EltVT.getSizeInBits() / ViaEltSizeInBits;
+
+ SmallVector<SDValue, 2> ScalarParts;
+ for (unsigned i = 0; i != Parts; ++i)
+ ScalarParts.push_back(getConstant(
+ NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
+ ViaEltVT, isT, isO));
+
+ return getNode(ISD::SPLAT_VECTOR_PARTS, DL, VT, ScalarParts);
+ }
+
+ unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
+ EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
+
+ // Check the temporary vector is the correct size. If this fails then
+ // getTypeToTransformTo() probably returned a type whose size (in bits)
+ // isn't a power-of-2 factor of the requested type size.
+ assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SmallVector<SDValue, 2> EltParts;
+ for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i)
+ EltParts.push_back(getConstant(
+ NewVal.extractBits(ViaEltSizeInBits, i * ViaEltSizeInBits), DL,
+ ViaEltVT, isT, isO));
+
+ // EltParts is currently in little endian order. If we actually want
+ // big-endian order then reverse it now.
+ if (getDataLayout().isBigEndian())
+ std::reverse(EltParts.begin(), EltParts.end());
+
+ // The elements must be reversed when the element order is different
+ // to the endianness of the elements (because the BITCAST is itself a
+ // vector shuffle in this situation). However, we do not need any code to
+ // perform this reversal because getConstant() is producing a vector
+ // splat.
+ // This situation occurs in MIPS MSA.
+
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+ llvm::append_range(Ops, EltParts);
+
+ SDValue V =
+ getNode(ISD::BITCAST, DL, VT, getBuildVector(ViaVecVT, DL, Ops));
+ return V;
+ }
+
+ assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
+ "APInt size does not match type size!");
+ unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(EltVT), std::nullopt);
+ ID.AddPointer(Elt);
+ ID.AddBoolean(isO);
+ void *IP = nullptr;
+ SDNode *N = nullptr;
+ if ((N = FindNodeOrInsertPos(ID, DL, IP)))
+ if (!VT.isVector())
+ return SDValue(N, 0);
+
+ if (!N) {
+ N = newSDNode<ConstantSDNode>(isT, isO, Elt, EltVT);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ NewSDValueDbgMsg(SDValue(N, 0), "Creating constant: ", this);
+ }
+
+ SDValue Result(N, 0);
+ if (VT.isVector())
+ Result = getSplat(VT, DL, Result);
+ return Result;
+}
+
+SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
+ bool isTarget) {
+ return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget);
+}
+
+SDValue SelectionDAG::getShiftAmountConstant(uint64_t Val, EVT VT,
+ const SDLoc &DL, bool LegalTypes) {
+ assert(VT.isInteger() && "Shift amount is not an integer type!");
+ EVT ShiftVT = TLI->getShiftAmountTy(VT, getDataLayout(), LegalTypes);
+ return getConstant(Val, DL, ShiftVT);
+}
+
+SDValue SelectionDAG::getVectorIdxConstant(uint64_t Val, const SDLoc &DL,
+ bool isTarget) {
+ return getConstant(Val, DL, TLI->getVectorIdxTy(getDataLayout()), isTarget);
+}
+
+SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
+ bool isTarget) {
+ return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget);
+}
+
+SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
+ EVT VT, bool isTarget) {
+ assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");
+
+ EVT EltVT = VT.getScalarType();
+
+ // Do the map lookup using the actual bit pattern for the floating point
+ // value, so that we don't have problems with 0.0 comparing equal to -0.0, and
+ // we don't have issues with SNANs.
+ unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(EltVT), std::nullopt);
+ ID.AddPointer(&V);
+ void *IP = nullptr;
+ SDNode *N = nullptr;
+ if ((N = FindNodeOrInsertPos(ID, DL, IP)))
+ if (!VT.isVector())
+ return SDValue(N, 0);
+
+ if (!N) {
+ N = newSDNode<ConstantFPSDNode>(isTarget, &V, EltVT);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ }
+
+ SDValue Result(N, 0);
+ if (VT.isVector())
+ Result = getSplat(VT, DL, Result);
+ NewSDValueDbgMsg(Result, "Creating fp constant: ", this);
+ return Result;
+}
+
+SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
+ bool isTarget) {
+ EVT EltVT = VT.getScalarType();
+ if (EltVT == MVT::f32)
+ return getConstantFP(APFloat((float)Val), DL, VT, isTarget);
+ if (EltVT == MVT::f64)
+ return getConstantFP(APFloat(Val), DL, VT, isTarget);
+ if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
+ EltVT == MVT::f16 || EltVT == MVT::bf16) {
+ bool Ignored;
+ APFloat APF = APFloat(Val);
+ APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
+ &Ignored);
+ return getConstantFP(APF, DL, VT, isTarget);
+ }
+ llvm_unreachable("Unsupported type in getConstantFP");
+}
+
+SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
+ EVT VT, int64_t Offset, bool isTargetGA,
+ unsigned TargetFlags) {
+ assert((TargetFlags == 0 || isTargetGA) &&
+ "Cannot set target flags on target-independent globals");
+
+ // Truncate (with sign-extension) the offset value to the pointer size.
+ unsigned BitWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
+ if (BitWidth < 64)
+ Offset = SignExtend64(Offset, BitWidth);
+
+ unsigned Opc;
+ if (GV->isThreadLocal())
+ Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
+ else
+ Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+ ID.AddPointer(GV);
+ ID.AddInteger(Offset);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<GlobalAddressSDNode>(
+ Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
+ unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+ ID.AddInteger(FI);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
+ unsigned TargetFlags) {
+ assert((TargetFlags == 0 || isTarget) &&
+ "Cannot set target flags on target-independent jump tables");
+ unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+ ID.AddInteger(JTI);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
+ MaybeAlign Alignment, int Offset,
+ bool isTarget, unsigned TargetFlags) {
+ assert((TargetFlags == 0 || isTarget) &&
+ "Cannot set target flags on target-independent globals");
+ if (!Alignment)
+ Alignment = shouldOptForSize()
+ ? getDataLayout().getABITypeAlign(C->getType())
+ : getDataLayout().getPrefTypeAlign(C->getType());
+ unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+ ID.AddInteger(Alignment->value());
+ ID.AddInteger(Offset);
+ ID.AddPointer(C);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, *Alignment,
+ TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V = SDValue(N, 0);
+ NewSDValueDbgMsg(V, "Creating new constant pool: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
+ MaybeAlign Alignment, int Offset,
+ bool isTarget, unsigned TargetFlags) {
+ assert((TargetFlags == 0 || isTarget) &&
+ "Cannot set target flags on target-independent globals");
+ if (!Alignment)
+ Alignment = getDataLayout().getPrefTypeAlign(C->getType());
+ unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+ ID.AddInteger(Alignment->value());
+ ID.AddInteger(Offset);
+ C->addSelectionDAGCSEId(ID);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, *Alignment,
+ TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
+ unsigned TargetFlags) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), std::nullopt);
+ ID.AddInteger(Index);
+ ID.AddInteger(Offset);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<TargetIndexSDNode>(Index, VT, Offset, TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), std::nullopt);
+ ID.AddPointer(MBB);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<BasicBlockSDNode>(MBB);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getValueType(EVT VT) {
+ if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >=
+ ValueTypeNodes.size())
+ ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1);
+
+ SDNode *&N = VT.isExtended() ?
+ ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy];
+
+ if (N) return SDValue(N, 0);
+ N = newSDNode<VTSDNode>(VT);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
+ SDNode *&N = ExternalSymbols[Sym];
+ if (N) return SDValue(N, 0);
+ N = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
+ SDNode *&N = MCSymbols[Sym];
+ if (N)
+ return SDValue(N, 0);
+ N = newSDNode<MCSymbolSDNode>(Sym, VT);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
+ unsigned TargetFlags) {
+ SDNode *&N =
+ TargetExternalSymbols[std::pair<std::string, unsigned>(Sym, TargetFlags)];
+ if (N) return SDValue(N, 0);
+ N = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
+ if ((unsigned)Cond >= CondCodeNodes.size())
+ CondCodeNodes.resize(Cond+1);
+
+ if (!CondCodeNodes[Cond]) {
+ auto *N = newSDNode<CondCodeSDNode>(Cond);
+ CondCodeNodes[Cond] = N;
+ InsertNode(N);
+ }
+
+ return SDValue(CondCodeNodes[Cond], 0);
+}
+
+SDValue SelectionDAG::getVScale(const SDLoc &DL, EVT VT, APInt MulImm,
+ bool ConstantFold) {
+ assert(MulImm.getBitWidth() == VT.getSizeInBits() &&
+ "APInt size does not match type size!");
+
+ if (MulImm == 0)
+ return getConstant(0, DL, VT);
+
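+ // If the function's vscale_range attribute pins vscale to a single value,
+ // fold vscale * MulImm down to a plain constant.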
+ if (ConstantFold) {
+ const MachineFunction &MF = getMachineFunction();
+ auto Attr = MF.getFunction().getFnAttribute(Attribute::VScaleRange);
+ if (Attr.isValid()) {
+ unsigned VScaleMin = Attr.getVScaleRangeMin();
+ if (std::optional<unsigned> VScaleMax = Attr.getVScaleRangeMax())
+ if (*VScaleMax == VScaleMin)
+ return getConstant(MulImm * VScaleMin, DL, VT);
+ }
+ }
+
+ return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT));
+}
+
+SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
+ bool ConstantFold) {
+ if (EC.isScalable())
+ return getVScale(DL, VT,
+ APInt(VT.getSizeInBits(), EC.getKnownMinValue()));
+
+ return getConstant(EC.getKnownMinValue(), DL, VT);
+}
+
+SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
+ APInt One(ResVT.getScalarSizeInBits(), 1);
+ return getStepVector(DL, ResVT, One);
+}
+
+SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT, APInt StepVal) {
+ assert(ResVT.getScalarSizeInBits() == StepVal.getBitWidth());
+ if (ResVT.isScalableVector())
+ return getNode(
+ ISD::STEP_VECTOR, DL, ResVT,
+ getTargetConstant(StepVal, DL, ResVT.getVectorElementType()));
+
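+ // For fixed-length vectors, materialize the sequence <0, Step, 2*Step, ...>
+ // directly as a BUILD_VECTOR of constants.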
+ SmallVector<SDValue, 16> OpsStepConstants;
+ for (uint64_t i = 0; i < ResVT.getVectorNumElements(); i++)
+ OpsStepConstants.push_back(
+ getConstant(StepVal * i, DL, ResVT.getVectorElementType()));
+ return getBuildVector(ResVT, DL, OpsStepConstants);
+}
+
+/// Swaps the values of N1 and N2. Swaps all indices in the shuffle mask M that
+/// point at N1 to point at N2 and indices that point at N2 to point at N1.
+static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
+ std::swap(N1, N2);
+ ShuffleVectorSDNode::commuteMask(M);
+}
+
+SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
+ SDValue N2, ArrayRef<int> Mask) {
+ assert(VT.getVectorNumElements() == Mask.size() &&
+ "Must have the same number of vector elements as mask elements!");
+ assert(VT == N1.getValueType() && VT == N2.getValueType() &&
+ "Invalid VECTOR_SHUFFLE");
+
+ // Canonicalize shuffle undef, undef -> undef
+ if (N1.isUndef() && N2.isUndef())
+ return getUNDEF(VT);
+
+ // Validate that all indices in Mask are within the range of the elements
+ // input to the shuffle.
+ int NElts = Mask.size();
+ assert(llvm::all_of(Mask,
+ [&](int M) { return M < (NElts * 2) && M >= -1; }) &&
+ "Index out of range");
+
+ // Copy the mask so we can do any needed cleanup.
+ SmallVector<int, 8> MaskVec(Mask);
+
+ // Canonicalize shuffle v, v -> v, undef
+ if (N1 == N2) {
+ N2 = getUNDEF(VT);
+ for (int i = 0; i != NElts; ++i)
+ if (MaskVec[i] >= NElts) MaskVec[i] -= NElts;
+ }
+
+ // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
+ if (N1.isUndef())
+ commuteShuffle(N1, N2, MaskVec);
+
+ if (TLI->hasVectorBlend()) {
+ // If shuffling a splat, try to blend the splat instead. We do this here so
+ // that even when this arises during lowering we don't have to re-handle it.
+ auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
+ BitVector UndefElements;
+ SDValue Splat = BV->getSplatValue(&UndefElements);
+ if (!Splat)
+ return;
+
+ for (int i = 0; i < NElts; ++i) {
+ if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
+ continue;
+
+ // If this input comes from undef, mark it as such.
+ if (UndefElements[MaskVec[i] - Offset]) {
+ MaskVec[i] = -1;
+ continue;
+ }
+
+ // If we can blend a non-undef lane, use that instead.
+ if (!UndefElements[i])
+ MaskVec[i] = i + Offset;
+ }
+ };
+ if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+ BlendSplat(N1BV, 0);
+ if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
+ BlendSplat(N2BV, NElts);
+ }
+
+ // Canonicalize all indices into lhs -> shuffle lhs, undef
+ // Canonicalize all indices into rhs -> shuffle rhs, undef
+ bool AllLHS = true, AllRHS = true;
+ bool N2Undef = N2.isUndef();
+ for (int i = 0; i != NElts; ++i) {
+ if (MaskVec[i] >= NElts) {
+ if (N2Undef)
+ MaskVec[i] = -1;
+ else
+ AllLHS = false;
+ } else if (MaskVec[i] >= 0) {
+ AllRHS = false;
+ }
+ }
+ if (AllLHS && AllRHS)
+ return getUNDEF(VT);
+ if (AllLHS && !N2Undef)
+ N2 = getUNDEF(VT);
+ if (AllRHS) {
+ N1 = getUNDEF(VT);
+ commuteShuffle(N1, N2, MaskVec);
+ }
+ // Reset our undef status after accounting for the mask.
+ N2Undef = N2.isUndef();
+ // Re-check whether both sides ended up undef.
+ if (N1.isUndef() && N2Undef)
+ return getUNDEF(VT);
+
+ // If this is an identity shuffle, return that node.
+ bool Identity = true, AllSame = true;
+ for (int i = 0; i != NElts; ++i) {
+ if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false;
+ if (MaskVec[i] != MaskVec[0]) AllSame = false;
+ }
+ if (Identity && NElts)
+ return N1;
+
+ // Shuffling a constant splat doesn't change the result.
+ if (N2Undef) {
+ SDValue V = N1;
+
+ // Look through any bitcasts. We check that these don't change the number
+ // (and size) of elements and just change their types.
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V->getOperand(0);
+
+ // A splat should always show up as a build vector node.
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
+ BitVector UndefElements;
+ SDValue Splat = BV->getSplatValue(&UndefElements);
+ // If this is a splat of an undef, shuffling it is also undef.
+ if (Splat && Splat.isUndef())
+ return getUNDEF(VT);
+
+ bool SameNumElts =
+ V.getValueType().getVectorNumElements() == VT.getVectorNumElements();
+
+ // We only have a splat which can skip shuffles if there is a splatted
+ // value and no undef lanes rearranged by the shuffle.
+ if (Splat && UndefElements.none()) {
+ // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
+ // number of elements matches or the splatted value is a zero constant.
+ if (SameNumElts)
+ return N1;
+ if (auto *C = dyn_cast<ConstantSDNode>(Splat))
+ if (C->isZero())
+ return N1;
+ }
+
+ // If the shuffle itself creates a splat, build the vector directly.
+ if (AllSame && SameNumElts) {
+ EVT BuildVT = BV->getValueType(0);
+ const SDValue &Splatted = BV->getOperand(MaskVec[0]);
+ SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted);
+
+ // We may have jumped through bitcasts, so the type of the
+ // BUILD_VECTOR may not match the type of the shuffle.
+ if (BuildVT != VT)
+ NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
+ return NewBV;
+ }
+ }
+ }
+
+ FoldingSetNodeID ID;
+ SDValue Ops[2] = { N1, N2 };
+ AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
+ for (int i = 0; i != NElts; ++i)
+ ID.AddInteger(MaskVec[i]);
+
+ void* IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ // Allocate the mask array for the node out of the BumpPtrAllocator, since
+ // SDNode doesn't have access to it. This memory will be "leaked" when
+ // the node is deallocated, but recovered when the NodeAllocator is released.
+ int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
+ llvm::copy(MaskVec, MaskAlloc);
+
+ auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
+ dl.getDebugLoc(), MaskAlloc);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V = SDValue(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
+ EVT VT = SV.getValueType(0);
+ SmallVector<int, 8> MaskVec(SV.getMask());
+ ShuffleVectorSDNode::commuteMask(MaskVec);
+
+ SDValue Op0 = SV.getOperand(0);
+ SDValue Op1 = SV.getOperand(1);
+ return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec);
+}
+
+SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::Register, getVTList(VT), std::nullopt);
+ ID.AddInteger(RegNo);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<RegisterSDNode>(RegNo, VT);
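+ // Ask the target whether this register is a source of divergence before the
+ // node is inserted.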
+ N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), std::nullopt);
+ ID.AddPointer(RegMask);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<RegisterMaskSDNode>(RegMask);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root,
+ MCSymbol *Label) {
+ return getLabelNode(ISD::EH_LABEL, dl, Root, Label);
+}
+
+SDValue SelectionDAG::getLabelNode(unsigned Opcode, const SDLoc &dl,
+ SDValue Root, MCSymbol *Label) {
+ FoldingSetNodeID ID;
+ SDValue Ops[] = { Root };
+ AddNodeIDNode(ID, Opcode, getVTList(MVT::Other), Ops);
+ ID.AddPointer(Label);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N =
+ newSDNode<LabelSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(), Label);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
+ int64_t Offset, bool isTarget,
+ unsigned TargetFlags) {
+ unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), std::nullopt);
+ ID.AddPointer(BA);
+ ID.AddInteger(Offset);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<BlockAddressSDNode>(Opc, VT, BA, Offset, TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getSrcValue(const Value *V) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), std::nullopt);
+ ID.AddPointer(V);
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<SrcValueSDNode>(V);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMDNode(const MDNode *MD) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), std::nullopt);
+ ID.AddPointer(MD);
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<MDNodeSDNode>(MD);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
+ if (VT == V.getValueType())
+ return V;
+
+ return getNode(ISD::BITCAST, SDLoc(V), VT, V);
+}
+
+SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
+ unsigned SrcAS, unsigned DestAS) {
+ SDValue Ops[] = {Ptr};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops);
+ ID.AddInteger(SrcAS);
+ ID.AddInteger(DestAS);
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<AddrSpaceCastSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VT, SrcAS, DestAS);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getFreeze(SDValue V) {
+ return getNode(ISD::FREEZE, SDLoc(V), V.getValueType(), V);
+}
+
+/// getShiftAmountOperand - Return the specified value cast to
+/// the target's desired shift amount type.
+SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
+ EVT OpTy = Op.getValueType();
+ EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout());
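+ // Vector shift amounts are used as-is; scalar amounts are zero-extended or
+ // truncated to the target's shift amount type.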
+ if (OpTy == ShTy || OpTy.isVector()) return Op;
+
+ return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
+}
+
+SDValue SelectionDAG::expandVAArg(SDNode *Node) {
+ SDLoc dl(Node);
+ const TargetLowering &TLI = getTargetLoweringInfo();
+ const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ EVT VT = Node->getValueType(0);
+ SDValue Tmp1 = Node->getOperand(0);
+ SDValue Tmp2 = Node->getOperand(1);
+ const MaybeAlign MA(Node->getConstantOperandVal(3));
+
+ SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1,
+ Tmp2, MachinePointerInfo(V));
+ SDValue VAList = VAListLoad;
+
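+ // If the argument needs more than the minimum stack argument alignment,
+ // round the va_list pointer up: add (align - 1) and mask off the low bits.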
+ if (MA && *MA > TLI.getMinStackArgumentAlignment()) {
+ VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
+ getConstant(MA->value() - 1, dl, VAList.getValueType()));
+
+ VAList =
+ getNode(ISD::AND, dl, VAList.getValueType(), VAList,
+ getConstant(-(int64_t)MA->value(), dl, VAList.getValueType()));
+ }
+
+ // Increment the pointer, VAList, to the next vaarg
+ Tmp1 = getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
+ getConstant(getDataLayout().getTypeAllocSize(
+ VT.getTypeForEVT(*getContext())),
+ dl, VAList.getValueType()));
+ // Store the incremented VAList to the legalized pointer
+ Tmp1 =
+ getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2, MachinePointerInfo(V));
+ // Load the actual argument out of the pointer VAList
+ return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo());
+}
+
+SDValue SelectionDAG::expandVACopy(SDNode *Node) {
+ SDLoc dl(Node);
+ const TargetLowering &TLI = getTargetLoweringInfo();
+ // This defaults to loading a pointer from the input and storing it to the
+ // output, returning the chain.
+ const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
+ const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
+ SDValue Tmp1 =
+ getLoad(TLI.getPointerTy(getDataLayout()), dl, Node->getOperand(0),
+ Node->getOperand(2), MachinePointerInfo(VS));
+ return getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1),
+ MachinePointerInfo(VD));
+}
+
+Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
+ const DataLayout &DL = getDataLayout();
+ Type *Ty = VT.getTypeForEVT(*getContext());
+ Align RedAlign = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+
+ if (TLI->isTypeLegal(VT) || !VT.isVector())
+ return RedAlign;
+
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ const Align StackAlign = TFI->getStackAlign();
+
+ // See if we can choose a smaller ABI alignment in cases where it's an
+ // illegal vector type that will get broken down.
+ if (RedAlign > StackAlign) {
+ EVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ TLI->getVectorTypeBreakdown(*getContext(), VT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ Ty = IntermediateVT.getTypeForEVT(*getContext());
+ Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+ if (RedAlign2 < RedAlign)
+ RedAlign = RedAlign2;
+ }
+
+ return RedAlign;
+}
+
+SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ int StackID = 0;
+ if (Bytes.isScalable())
+ StackID = TFI->getStackIDForScalableVectors();
+ // The stack id gives an indication of whether the object is scalable or
+ // not, so it's safe to pass in the minimum size here.
+ int FrameIdx = MFI.CreateStackObject(Bytes.getKnownMinValue(), Alignment,
+ false, nullptr, StackID);
+ return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
+}
+
+SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
+ Type *Ty = VT.getTypeForEVT(*getContext());
+ Align StackAlign =
+ std::max(getDataLayout().getPrefTypeAlign(Ty), Align(minAlign));
+ return CreateStackTemporary(VT.getStoreSize(), StackAlign);
+}
+
+SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
+ TypeSize VT1Size = VT1.getStoreSize();
+ TypeSize VT2Size = VT2.getStoreSize();
+ assert(VT1Size.isScalable() == VT2Size.isScalable() &&
+ "Don't know how to choose the maximum size when creating a stack "
+ "temporary");
+ TypeSize Bytes = VT1Size.getKnownMinValue() > VT2Size.getKnownMinValue()
+ ? VT1Size
+ : VT2Size;
+
+ Type *Ty1 = VT1.getTypeForEVT(*getContext());
+ Type *Ty2 = VT2.getTypeForEVT(*getContext());
+ const DataLayout &DL = getDataLayout();
+ Align Align = std::max(DL.getPrefTypeAlign(Ty1), DL.getPrefTypeAlign(Ty2));
+ return CreateStackTemporary(Bytes, Align);
+}
+
+SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
+ ISD::CondCode Cond, const SDLoc &dl) {
+ EVT OpVT = N1.getValueType();
+
+ auto GetUndefBooleanConstant = [&]() {
+ if (VT.getScalarType() == MVT::i1 ||
+ TLI->getBooleanContents(OpVT) ==
+ TargetLowering::UndefinedBooleanContent)
+ return getUNDEF(VT);
+ // ZeroOrOne / ZeroOrNegative require specific values for the high bits,
+ // so we cannot use getUNDEF(). Return zero instead.
+ return getConstant(0, dl, VT);
+ };
+
+ // These setcc operations always fold.
+ switch (Cond) {
+ default: break;
+ case ISD::SETFALSE:
+ case ISD::SETFALSE2: return getBoolConstant(false, dl, VT, OpVT);
+ case ISD::SETTRUE:
+ case ISD::SETTRUE2: return getBoolConstant(true, dl, VT, OpVT);
+
+ case ISD::SETOEQ:
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETONE:
+ case ISD::SETO:
+ case ISD::SETUO:
+ case ISD::SETUEQ:
+ case ISD::SETUNE:
+ assert(!OpVT.isInteger() && "Illegal setcc for integer!");
+ break;
+ }
+
+ if (OpVT.isInteger()) {
+ // For EQ and NE, we can always pick a value for the undef to make the
+ // predicate pass or fail, so we can return undef.
+ // Matches behavior in llvm::ConstantFoldCompareInstruction.
+ // icmp eq/ne X, undef -> undef.
+ if ((N1.isUndef() || N2.isUndef()) &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE))
+ return GetUndefBooleanConstant();
+
+ // If both operands are undef, we can return undef for int comparison.
+ // icmp undef, undef -> undef.
+ if (N1.isUndef() && N2.isUndef())
+ return GetUndefBooleanConstant();
+
+ // icmp X, X -> true/false
+ // icmp X, undef -> true/false because undef could be X.
+ if (N1 == N2)
+ return getBoolConstant(ISD::isTrueWhenEqual(Cond), dl, VT, OpVT);
+ }
+
+ if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
+ const APInt &C2 = N2C->getAPIntValue();
+ if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
+ const APInt &C1 = N1C->getAPIntValue();
+
+ return getBoolConstant(ICmpInst::compare(C1, C2, getICmpCondCode(Cond)),
+ dl, VT, OpVT);
+ }
+ }
+
+ auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ auto *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+
+ if (N1CFP && N2CFP) {
+ APFloat::cmpResult R = N1CFP->getValueAPF().compare(N2CFP->getValueAPF());
+ switch (Cond) {
+ default: break;
+ case ISD::SETEQ: if (R==APFloat::cmpUnordered)
+ return GetUndefBooleanConstant();
+ [[fallthrough]];
+ case ISD::SETOEQ: return getBoolConstant(R==APFloat::cmpEqual, dl, VT,
+ OpVT);
+ case ISD::SETNE: if (R==APFloat::cmpUnordered)
+ return GetUndefBooleanConstant();
+ [[fallthrough]];
+ case ISD::SETONE: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpLessThan, dl, VT,
+ OpVT);
+ case ISD::SETLT: if (R==APFloat::cmpUnordered)
+ return GetUndefBooleanConstant();
+ [[fallthrough]];
+ case ISD::SETOLT: return getBoolConstant(R==APFloat::cmpLessThan, dl, VT,
+ OpVT);
+ case ISD::SETGT: if (R==APFloat::cmpUnordered)
+ return GetUndefBooleanConstant();
+ [[fallthrough]];
+ case ISD::SETOGT: return getBoolConstant(R==APFloat::cmpGreaterThan, dl,
+ VT, OpVT);
+ case ISD::SETLE: if (R==APFloat::cmpUnordered)
+ return GetUndefBooleanConstant();
+ [[fallthrough]];
+ case ISD::SETOLE: return getBoolConstant(R==APFloat::cmpLessThan ||
+ R==APFloat::cmpEqual, dl, VT,
+ OpVT);
+ case ISD::SETGE: if (R==APFloat::cmpUnordered)
+ return GetUndefBooleanConstant();
+ [[fallthrough]];
+ case ISD::SETOGE: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpEqual, dl, VT, OpVT);
+ case ISD::SETO: return getBoolConstant(R!=APFloat::cmpUnordered, dl, VT,
+ OpVT);
+ case ISD::SETUO: return getBoolConstant(R==APFloat::cmpUnordered, dl, VT,
+ OpVT);
+ case ISD::SETUEQ: return getBoolConstant(R==APFloat::cmpUnordered ||
+ R==APFloat::cmpEqual, dl, VT,
+ OpVT);
+ case ISD::SETUNE: return getBoolConstant(R!=APFloat::cmpEqual, dl, VT,
+ OpVT);
+ case ISD::SETULT: return getBoolConstant(R==APFloat::cmpUnordered ||
+ R==APFloat::cmpLessThan, dl, VT,
+ OpVT);
+ case ISD::SETUGT: return getBoolConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpUnordered, dl, VT,
+ OpVT);
+ case ISD::SETULE: return getBoolConstant(R!=APFloat::cmpGreaterThan, dl,
+ VT, OpVT);
+ case ISD::SETUGE: return getBoolConstant(R!=APFloat::cmpLessThan, dl, VT,
+ OpVT);
+ }
+ } else if (N1CFP && OpVT.isSimple() && !N2.isUndef()) {
+ // Ensure that the constant occurs on the RHS.
+ ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
+ if (!TLI->isCondCodeLegal(SwappedCond, OpVT.getSimpleVT()))
+ return SDValue();
+ return getSetCC(dl, VT, N2, N1, SwappedCond);
+ } else if ((N2CFP && N2CFP->getValueAPF().isNaN()) ||
+ (OpVT.isFloatingPoint() && (N1.isUndef() || N2.isUndef()))) {
+ // If an operand is known to be a nan (or undef that could be a nan), we can
+ // fold it.
+ // Choosing NaN for the undef will always make unordered comparisons succeed
+ // and ordered comparisons fail.
+ // Matches behavior in llvm::ConstantFoldCompareInstruction.
+ switch (ISD::getUnorderedFlavor(Cond)) {
+ default:
+ llvm_unreachable("Unknown flavor!");
+ case 0: // Known false.
+ return getBoolConstant(false, dl, VT, OpVT);
+ case 1: // Known true.
+ return getBoolConstant(true, dl, VT, OpVT);
+ case 2: // Undefined.
+ return GetUndefBooleanConstant();
+ }
+ }
+
+ // Could not fold it.
+ return SDValue();
+}
+
+/// SignBitIsZero - Return true if the sign bit of Op is known to be zero. We
+/// use this predicate to simplify operations downstream.
+bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
+ unsigned BitWidth = Op.getScalarValueSizeInBits();
+ return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
+}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
+/// this predicate to simplify operations downstream. Mask is known to be zero
+/// for bits that V cannot have.
+bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
+ unsigned Depth) const {
+ return Mask.isSubsetOf(computeKnownBits(V, Depth).Zero);
+}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero in
+/// DemandedElts. We use this predicate to simplify operations downstream.
+/// Mask is known to be zero for bits that V cannot have.
+bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
+ const APInt &DemandedElts,
+ unsigned Depth) const {
+ return Mask.isSubsetOf(computeKnownBits(V, DemandedElts, Depth).Zero);
+}
+
+/// MaskedVectorIsZero - Return true if 'Op' is known to be zero in
+/// DemandedElts. We use this predicate to simplify operations downstream.
+bool SelectionDAG::MaskedVectorIsZero(SDValue V, const APInt &DemandedElts,
+ unsigned Depth /* = 0 */) const {
+ return computeKnownBits(V, DemandedElts, Depth).isZero();
+}
+
+/// MaskedValueIsAllOnes - Return true if '(Op & Mask) == Mask'.
+bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask,
+ unsigned Depth) const {
+ return Mask.isSubsetOf(computeKnownBits(V, Depth).One);
+}
+
+APInt SelectionDAG::computeVectorKnownZeroElements(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isVector() && !VT.isScalableVector() && "Only for fixed vectors!");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(DemandedElts.getBitWidth() == NumElts && "Unexpected demanded mask.");
+
+ APInt KnownZeroElements = APInt::getZero(NumElts);
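+ // Test each demanded element in isolation; a lane is known zero only if all
+ // of its bits are known zero.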
+ for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
+ if (!DemandedElts[EltIdx])
+ continue; // Don't query elements that are not demanded.
+ APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
+ if (MaskedVectorIsZero(Op, Mask, Depth))
+ KnownZeroElements.setBit(EltIdx);
+ }
+ return KnownZeroElements;
+}
+
+/// isSplatValue - Return true if the vector V has the same value
+/// across all DemandedElts. For scalable vectors, we don't know the
+/// number of lanes at compile time. Instead, we use a 1 bit APInt
+/// to represent a conservative value for all lanes; that is, that
+/// one bit value is implicitly splatted across all lanes.
+bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
+ APInt &UndefElts, unsigned Depth) const {
+ unsigned Opcode = V.getOpcode();
+ EVT VT = V.getValueType();
+ assert(VT.isVector() && "Vector type expected");
+ assert((!VT.isScalableVector() || DemandedElts.getBitWidth() == 1) &&
+ "scalable demanded bits are ignored");
+
+ if (!DemandedElts)
+ return false; // No demanded elts, better to assume we don't know anything.
+
+ if (Depth >= MaxRecursionDepth)
+ return false; // Limit search depth.
+
+ // Deal with some common cases here that work for both fixed and scalable
+ // vector types.
+ switch (Opcode) {
+ case ISD::SPLAT_VECTOR:
+ UndefElts = V.getOperand(0).isUndef()
+ ? APInt::getAllOnes(DemandedElts.getBitWidth())
+ : APInt(DemandedElts.getBitWidth(), 0);
+ return true;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ APInt UndefLHS, UndefRHS;
+ SDValue LHS = V.getOperand(0);
+ SDValue RHS = V.getOperand(1);
+ if (isSplatValue(LHS, DemandedElts, UndefLHS, Depth + 1) &&
+ isSplatValue(RHS, DemandedElts, UndefRHS, Depth + 1)) {
+ UndefElts = UndefLHS | UndefRHS;
+ return true;
+ }
+ return false;
+ }
+ case ISD::ABS:
+ case ISD::TRUNCATE:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ return isSplatValue(V.getOperand(0), DemandedElts, UndefElts, Depth + 1);
+ default:
+ if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
+ Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::INTRINSIC_VOID)
+ return TLI->isSplatValueForTargetNode(V, DemandedElts, UndefElts, *this,
+ Depth);
+ break;
+ }
+
+ // We don't support other cases than those above for scalable vectors at
+ // the moment.
+ if (VT.isScalableVector())
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch");
+ UndefElts = APInt::getZero(NumElts);
+
+ switch (Opcode) {
+ case ISD::BUILD_VECTOR: {
+ SDValue Scl;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue Op = V.getOperand(i);
+ if (Op.isUndef()) {
+ UndefElts.setBit(i);
+ continue;
+ }
+ if (!DemandedElts[i])
+ continue;
+ if (Scl && Scl != Op)
+ return false;
+ Scl = Op;
+ }
+ return true;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ // Check if this is a shuffle node doing a splat or a shuffle of a splat.
+ APInt DemandedLHS = APInt::getZero(NumElts);
+ APInt DemandedRHS = APInt::getZero(NumElts);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(V)->getMask();
+ for (int i = 0; i != (int)NumElts; ++i) {
+ int M = Mask[i];
+ if (M < 0) {
+ UndefElts.setBit(i);
+ continue;
+ }
+ if (!DemandedElts[i])
+ continue;
+ if (M < (int)NumElts)
+ DemandedLHS.setBit(M);
+ else
+ DemandedRHS.setBit(M - NumElts);
+ }
+
+ // If we aren't demanding either op, assume there's no splat.
+ // If we are demanding both ops, assume there's no splat.
+ if ((DemandedLHS.isZero() && DemandedRHS.isZero()) ||
+ (!DemandedLHS.isZero() && !DemandedRHS.isZero()))
+ return false;
+
+ // See if the demanded elts of the source op form a splat, or we only demand
+ // one element, which should always be a splat.
+ // TODO: Handle source ops splats with undefs.
+ auto CheckSplatSrc = [&](SDValue Src, const APInt &SrcElts) {
+ APInt SrcUndefs;
+ return (SrcElts.popcount() == 1) ||
+ (isSplatValue(Src, SrcElts, SrcUndefs, Depth + 1) &&
+ (SrcElts & SrcUndefs).isZero());
+ };
+ if (!DemandedLHS.isZero())
+ return CheckSplatSrc(V.getOperand(0), DemandedLHS);
+ return CheckSplatSrc(V.getOperand(1), DemandedRHS);
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ // Offset the demanded elts by the subvector index.
+ SDValue Src = V.getOperand(0);
+ // We don't support scalable vectors at the moment.
+ if (Src.getValueType().isScalableVector())
+ return false;
+ uint64_t Idx = V.getConstantOperandVal(1);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt UndefSrcElts;
+ APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
+ if (isSplatValue(Src, DemandedSrcElts, UndefSrcElts, Depth + 1)) {
+ UndefElts = UndefSrcElts.extractBits(NumElts, Idx);
+ return true;
+ }
+ break;
+ }
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG: {
+ // Widen the demanded elts by the src element count.
+ SDValue Src = V.getOperand(0);
+ // We don't support scalable vectors at the moment.
+ if (Src.getValueType().isScalableVector())
+ return false;
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt UndefSrcElts;
+ APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts);
+ if (isSplatValue(Src, DemandedSrcElts, UndefSrcElts, Depth + 1)) {
+ UndefElts = UndefSrcElts.trunc(NumElts);
+ return true;
+ }
+ break;
+ }
+ case ISD::BITCAST: {
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ unsigned SrcBitWidth = SrcVT.getScalarSizeInBits();
+ unsigned BitWidth = VT.getScalarSizeInBits();
+
+ // Ignore bitcasts from unsupported types.
+ // TODO: Add fp support?
+ if (!SrcVT.isVector() || !SrcVT.isInteger() || !VT.isInteger())
+ break;
+
+ // Bitcast 'small element' vector to 'large element' vector.
+ if ((BitWidth % SrcBitWidth) == 0) {
+ // See if each sub element is a splat.
+ unsigned Scale = BitWidth / SrcBitWidth;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ APInt ScaledDemandedElts =
+ APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ for (unsigned I = 0; I != Scale; ++I) {
+ APInt SubUndefElts;
+ APInt SubDemandedElt = APInt::getOneBitSet(Scale, I);
+ APInt SubDemandedElts = APInt::getSplat(NumSrcElts, SubDemandedElt);
+ SubDemandedElts &= ScaledDemandedElts;
+ if (!isSplatValue(Src, SubDemandedElts, SubUndefElts, Depth + 1))
+ return false;
+ // TODO: Add support for merging sub undef elements.
+ if (!SubUndefElts.isZero())
+ return false;
+ }
+ return true;
+ }
+ break;
+ }
+ }
+
+ return false;
+}
+
+/// Helper wrapper for the main isSplatValue function.
+bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) const {
+ EVT VT = V.getValueType();
+ assert(VT.isVector() && "Vector type expected");
+
+ APInt UndefElts;
+ // Since the number of lanes in a scalable vector is unknown at compile time,
+ // we track one bit which is implicitly broadcast to all lanes. This means
+ // that all lanes in a scalable vector are considered demanded.
+ APInt DemandedElts
+ = APInt::getAllOnes(VT.isScalableVector() ? 1 : VT.getVectorNumElements());
+ return isSplatValue(V, DemandedElts, UndefElts) &&
+ (AllowUndefs || !UndefElts);
+}
+
+SDValue SelectionDAG::getSplatSourceVector(SDValue V, int &SplatIdx) {
+ V = peekThroughExtractSubvectors(V);
+
+ EVT VT = V.getValueType();
+ unsigned Opcode = V.getOpcode();
+ switch (Opcode) {
+ default: {
+ APInt UndefElts;
+ // Since the number of lanes in a scalable vector is unknown at compile time,
+ // we track one bit which is implicitly broadcast to all lanes. This means
+ // that all lanes in a scalable vector are considered demanded.
+ APInt DemandedElts
+ = APInt::getAllOnes(VT.isScalableVector() ? 1 : VT.getVectorNumElements());
+
+ if (isSplatValue(V, DemandedElts, UndefElts)) {
+ if (VT.isScalableVector()) {
+ // DemandedElts and UndefElts are ignored for scalable vectors, since
+ // the only supported cases are SPLAT_VECTOR nodes.
+ SplatIdx = 0;
+ } else {
+ // Handle case where all demanded elements are UNDEF.
+ if (DemandedElts.isSubsetOf(UndefElts)) {
+ SplatIdx = 0;
+ return getUNDEF(VT);
+ }
+ SplatIdx = (UndefElts & DemandedElts).countr_one();
+ }
+ return V;
+ }
+ break;
+ }
+ case ISD::SPLAT_VECTOR:
+ SplatIdx = 0;
+ return V;
+ case ISD::VECTOR_SHUFFLE: {
+ assert(!VT.isScalableVector());
+ // Check if this is a shuffle node doing a splat.
+ // TODO - remove this and rely purely on SelectionDAG::isSplatValue,
+ // getTargetVShiftNode currently struggles without the splat source.
+ auto *SVN = cast<ShuffleVectorSDNode>(V);
+ if (!SVN->isSplat())
+ break;
+ int Idx = SVN->getSplatIndex();
+ int NumElts = V.getValueType().getVectorNumElements();
+ SplatIdx = Idx % NumElts;
+ return V.getOperand(Idx / NumElts);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue SelectionDAG::getSplatValue(SDValue V, bool LegalTypes) {
+ int SplatIdx;
+ if (SDValue SrcVector = getSplatSourceVector(V, SplatIdx)) {
+ EVT SVT = SrcVector.getValueType().getScalarType();
+ EVT LegalSVT = SVT;
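+ // If a legal type is required, only integer scalars can be promoted; give up
+ // if the transformed type would be narrower than the splat element.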
+ if (LegalTypes && !TLI->isTypeLegal(SVT)) {
+ if (!SVT.isInteger())
+ return SDValue();
+ LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
+ if (LegalSVT.bitsLT(SVT))
+ return SDValue();
+ }
+ return getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), LegalSVT, SrcVector,
+ getVectorIdxConstant(SplatIdx, SDLoc(V)));
+ }
+ return SDValue();
+}
+
+const APInt *
+SelectionDAG::getValidShiftAmountConstant(SDValue V,
+ const APInt &DemandedElts) const {
+ assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
+ V.getOpcode() == ISD::SRA) &&
+ "Unknown shift node");
+ unsigned BitWidth = V.getScalarValueSizeInBits();
+ if (ConstantSDNode *SA = isConstOrConstSplat(V.getOperand(1), DemandedElts)) {
+ // Shifting more than the bitwidth is not valid.
+ const APInt &ShAmt = SA->getAPIntValue();
+ if (ShAmt.ult(BitWidth))
+ return &ShAmt;
+ }
+ return nullptr;
+}
+
+const APInt *SelectionDAG::getValidMinimumShiftAmountConstant(
+ SDValue V, const APInt &DemandedElts) const {
+ assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
+ V.getOpcode() == ISD::SRA) &&
+ "Unknown shift node");
+ if (const APInt *ValidAmt = getValidShiftAmountConstant(V, DemandedElts))
+ return ValidAmt;
+ unsigned BitWidth = V.getScalarValueSizeInBits();
+ auto *BV = dyn_cast<BuildVectorSDNode>(V.getOperand(1));
+ if (!BV)
+ return nullptr;
+ const APInt *MinShAmt = nullptr;
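+ // No constant splat: scan the per-lane shift amounts and keep the smallest
+ // in-range constant, bailing out on any non-constant or out-of-range lane.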
+ for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ auto *SA = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!SA)
+ return nullptr;
+ // Shifting more than the bitwidth is not valid.
+ const APInt &ShAmt = SA->getAPIntValue();
+ if (ShAmt.uge(BitWidth))
+ return nullptr;
+ if (MinShAmt && MinShAmt->ule(ShAmt))
+ continue;
+ MinShAmt = &ShAmt;
+ }
+ return MinShAmt;
+}
+
+const APInt *SelectionDAG::getValidMaximumShiftAmountConstant(
+ SDValue V, const APInt &DemandedElts) const {
+ assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
+ V.getOpcode() == ISD::SRA) &&
+ "Unknown shift node");
+ if (const APInt *ValidAmt = getValidShiftAmountConstant(V, DemandedElts))
+ return ValidAmt;
+ unsigned BitWidth = V.getScalarValueSizeInBits();
+ auto *BV = dyn_cast<BuildVectorSDNode>(V.getOperand(1));
+ if (!BV)
+ return nullptr;
+ const APInt *MaxShAmt = nullptr;
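+ // As above, but track the largest in-range per-lane shift amount.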
+ for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ auto *SA = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!SA)
+ return nullptr;
+ // Shifting more than the bitwidth is not valid.
+ const APInt &ShAmt = SA->getAPIntValue();
+ if (ShAmt.uge(BitWidth))
+ return nullptr;
+ if (MaxShAmt && MaxShAmt->uge(ShAmt))
+ continue;
+ MaxShAmt = &ShAmt;
+ }
+ return MaxShAmt;
+}
+
+/// Determine which bits of Op are known to be either zero or one and return
+/// them in Known. For vectors, the known bits are those that are shared by
+/// every vector element.
+KnownBits SelectionDAG::computeKnownBits(SDValue Op, unsigned Depth) const {
+ EVT VT = Op.getValueType();
+
+ // Since the number of lanes in a scalable vector is unknown at compile time,
+ // we track one bit which is implicitly broadcast to all lanes. This means
+ // that all lanes in a scalable vector are considered demanded.
+ APInt DemandedElts = VT.isFixedLengthVector()
+ ? APInt::getAllOnes(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return computeKnownBits(Op, DemandedElts, Depth);
+}
+
+/// Determine which bits of Op are known to be either zero or one and return
+/// them in Known. The DemandedElts argument allows us to only collect the known
+/// bits that are shared by the requested vector elements.
+KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
+ unsigned Depth) const {
+ unsigned BitWidth = Op.getScalarValueSizeInBits();
+
+ KnownBits Known(BitWidth); // Don't know anything.
+
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ // We know all of the bits for a constant!
+ return KnownBits::makeConstant(C->getAPIntValue());
+ }
+ if (auto *C = dyn_cast<ConstantFPSDNode>(Op)) {
+ // We know all of the bits for a constant fp!
+ return KnownBits::makeConstant(C->getValueAPF().bitcastToAPInt());
+ }
+
+ if (Depth >= MaxRecursionDepth)
+ return Known; // Limit search depth.
+
+ KnownBits Known2;
+ unsigned NumElts = DemandedElts.getBitWidth();
+ assert((!Op.getValueType().isFixedLengthVector() ||
+ NumElts == Op.getValueType().getVectorNumElements()) &&
+ "Unexpected vector size");
+
+ if (!DemandedElts)
+ return Known; // No demanded elts, better to assume we don't know anything.
+
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case ISD::MERGE_VALUES:
+ return computeKnownBits(Op.getOperand(Op.getResNo()), DemandedElts,
+ Depth + 1);
+ case ISD::SPLAT_VECTOR: {
+ SDValue SrcOp = Op.getOperand(0);
+ assert(SrcOp.getValueSizeInBits() >= BitWidth &&
+ "Expected SPLAT_VECTOR implicit truncation");
+ // Implicitly truncate the bits to match the official semantics of
+ // SPLAT_VECTOR.
+ Known = computeKnownBits(SrcOp, Depth + 1).trunc(BitWidth);
+ break;
+ }
+ case ISD::BUILD_VECTOR:
+ assert(!Op.getValueType().isScalableVector());
+ // Collect the known bits that are shared by every demanded vector element.
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ if (!DemandedElts[i])
+ continue;
+
+ SDValue SrcOp = Op.getOperand(i);
+ Known2 = computeKnownBits(SrcOp, Depth + 1);
+
+ // BUILD_VECTOR can implicitly truncate sources; we must handle this.
+ if (SrcOp.getValueSizeInBits() != BitWidth) {
+ assert(SrcOp.getValueSizeInBits() > BitWidth &&
+ "Expected BUILD_VECTOR implicit truncation");
+ Known2 = Known2.trunc(BitWidth);
+ }
+
+ // Known bits are the values that are shared by every demanded element.
+ Known = Known.intersectWith(Known2);
+
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ }
+ break;
+ case ISD::VECTOR_SHUFFLE: {
+ assert(!Op.getValueType().isScalableVector());
+ // Collect the known bits that are shared by every vector element referenced
+ // by the shuffle.
+ APInt DemandedLHS, DemandedRHS;
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+ assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+ if (!getShuffleDemandedElts(NumElts, SVN->getMask(), DemandedElts,
+ DemandedLHS, DemandedRHS))
+ break;
+
+ // Known bits are the values that are shared by every demanded element.
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ if (!!DemandedLHS) {
+ SDValue LHS = Op.getOperand(0);
+ Known2 = computeKnownBits(LHS, DemandedLHS, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ }
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ if (!!DemandedRHS) {
+ SDValue RHS = Op.getOperand(1);
+ Known2 = computeKnownBits(RHS, DemandedRHS, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ }
+ break;
+ }
+ case ISD::VSCALE: {
+ const Function &F = getMachineFunction().getFunction();
+ const APInt &Multiplier = Op.getConstantOperandAPInt(0);
+ Known = getVScaleRange(&F, BitWidth).multiply(Multiplier).toKnownBits();
+ break;
+ }
+ case ISD::CONCAT_VECTORS: {
+ if (Op.getValueType().isScalableVector())
+ break;
+ // Split DemandedElts and test each of the demanded subvectors.
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ EVT SubVectorVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
+ unsigned NumSubVectors = Op.getNumOperands();
+ for (unsigned i = 0; i != NumSubVectors; ++i) {
+ APInt DemandedSub =
+ DemandedElts.extractBits(NumSubVectorElts, i * NumSubVectorElts);
+ if (!!DemandedSub) {
+ SDValue Sub = Op.getOperand(i);
+ Known2 = computeKnownBits(Sub, DemandedSub, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ }
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ }
+ break;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ if (Op.getValueType().isScalableVector())
+ break;
+ // Demand any elements from the subvector and the remainder from the src it's
+ // inserted into.
+ SDValue Src = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx);
+
+ Known.One.setAllBits();
+ Known.Zero.setAllBits();
+ if (!!DemandedSubElts) {
+ Known = computeKnownBits(Sub, DemandedSubElts, Depth + 1);
+ if (Known.isUnknown())
+ break; // early-out.
+ }
+ if (!!DemandedSrcElts) {
+ Known2 = computeKnownBits(Src, DemandedSrcElts, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ }
+ break;
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ // Offset the demanded elts by the subvector index.
+ SDValue Src = Op.getOperand(0);
+ // Bail until we can represent demanded elements for scalable vectors.
+ if (Op.getValueType().isScalableVector() || Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
+ Known = computeKnownBits(Src, DemandedSrcElts, Depth + 1);
+ break;
+ }
+ case ISD::SCALAR_TO_VECTOR: {
+ if (Op.getValueType().isScalableVector())
+ break;
+ // We know as much about scalar_to_vector as we know about its source, which
+ // becomes the first element of an otherwise unknown vector.
+ if (DemandedElts != 1)
+ break;
+
+ SDValue N0 = Op.getOperand(0);
+ Known = computeKnownBits(N0, Depth + 1);
+ if (N0.getValueSizeInBits() != BitWidth)
+ Known = Known.trunc(BitWidth);
+
+ break;
+ }
+ case ISD::BITCAST: {
+ if (Op.getValueType().isScalableVector())
+ break;
+
+ SDValue N0 = Op.getOperand(0);
+ EVT SubVT = N0.getValueType();
+ unsigned SubBitWidth = SubVT.getScalarSizeInBits();
+
+ // Ignore bitcasts from unsupported types.
+ if (!(SubVT.isInteger() || SubVT.isFloatingPoint()))
+ break;
+
+ // Fast handling of 'identity' bitcasts.
+ if (BitWidth == SubBitWidth) {
+ Known = computeKnownBits(N0, DemandedElts, Depth + 1);
+ break;
+ }
+
+ bool IsLE = getDataLayout().isLittleEndian();
+
+ // Bitcast 'small element' vector to 'large element' scalar/vector.
+ if ((BitWidth % SubBitWidth) == 0) {
+ assert(N0.getValueType().isVector() && "Expected bitcast from vector");
+
+ // Collect known bits for the (larger) output by collecting the known
+ // bits from each set of sub elements and shift these into place.
+ // We need to separately call computeKnownBits for each set of
+ // sub elements as the knownbits for each is likely to be different.
+ unsigned SubScale = BitWidth / SubBitWidth;
+ APInt SubDemandedElts(NumElts * SubScale, 0);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i])
+ SubDemandedElts.setBit(i * SubScale);
+
+ for (unsigned i = 0; i != SubScale; ++i) {
+ Known2 = computeKnownBits(N0, SubDemandedElts.shl(i),
+ Depth + 1);
+ unsigned Shifts = IsLE ? i : SubScale - 1 - i;
+ Known.insertBits(Known2, SubBitWidth * Shifts);
+ }
+ }
+
+ // Bitcast 'large element' scalar/vector to 'small element' vector.
+ if ((SubBitWidth % BitWidth) == 0) {
+ assert(Op.getValueType().isVector() && "Expected bitcast to vector");
+
+ // Collect known bits for the (smaller) output by collecting the known
+ // bits from the overlapping larger input elements and extracting the
+ // sub sections we actually care about.
+ unsigned SubScale = SubBitWidth / BitWidth;
+ APInt SubDemandedElts =
+ APIntOps::ScaleBitMask(DemandedElts, NumElts / SubScale);
+ Known2 = computeKnownBits(N0, SubDemandedElts, Depth + 1);
+
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ unsigned Shifts = IsLE ? i : NumElts - 1 - i;
+ unsigned Offset = (Shifts % SubScale) * BitWidth;
+ Known = Known.intersectWith(Known2.extractBits(BitWidth, Offset));
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ }
+ }
+ break;
+ }
+ case ISD::AND:
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ Known &= Known2;
+ break;
+ case ISD::OR:
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ Known |= Known2;
+ break;
+ case ISD::XOR:
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ Known ^= Known2;
+ break;
+ case ISD::MUL: {
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
+ // TODO: SelfMultiply can be poison, but not undef.
+ if (SelfMultiply)
+ SelfMultiply &= isGuaranteedNotToBeUndefOrPoison(
+ Op.getOperand(0), DemandedElts, false, Depth + 1);
+ Known = KnownBits::mul(Known, Known2, SelfMultiply);
+
+ // If the multiplication is known not to overflow, the product of a number
+ // with itself is non-negative. Only do this if we didn't already compute
+ // the opposite value for the sign bit.
+ if (Op->getFlags().hasNoSignedWrap() &&
+ Op.getOperand(0) == Op.getOperand(1) &&
+ !Known.isNegative())
+ Known.makeNonNegative();
+ break;
+ }
+ case ISD::MULHU: {
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = KnownBits::mulhu(Known, Known2);
+ break;
+ }
+ case ISD::MULHS: {
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = KnownBits::mulhs(Known, Known2);
+ break;
+ }
+ case ISD::UMUL_LOHI: {
+ assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result");
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
+ if (Op.getResNo() == 0)
+ Known = KnownBits::mul(Known, Known2, SelfMultiply);
+ else
+ Known = KnownBits::mulhu(Known, Known2);
+ break;
+ }
+ case ISD::SMUL_LOHI: {
+ assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result");
+ Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
+ if (Op.getResNo() == 0)
+ Known = KnownBits::mul(Known, Known2, SelfMultiply);
+ else
+ Known = KnownBits::mulhs(Known, Known2);
+ break;
+ }
+ case ISD::AVGCEILU: {
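+ // avgceilu(X, Y) = (X + Y + 1) >> 1: widen both operands by one bit, add
+ // with a carry-in of one, then drop the low bit of the result.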
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = Known.zext(BitWidth + 1);
+ Known2 = Known2.zext(BitWidth + 1);
+ KnownBits One = KnownBits::makeConstant(APInt(1, 1));
+ Known = KnownBits::computeForAddCarry(Known, Known2, One);
+ Known = Known.extractBits(BitWidth, 1);
+ break;
+ }
+ case ISD::SELECT:
+ case ISD::VSELECT:
+ Known = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth+1);
+
+ // Only known if known in both the LHS and RHS.
+ Known = Known.intersectWith(Known2);
+ break;
+ case ISD::SELECT_CC:
+ Known = computeKnownBits(Op.getOperand(3), DemandedElts, Depth+1);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ Known2 = computeKnownBits(Op.getOperand(2), DemandedElts, Depth+1);
+
+ // Only known if known in both the LHS and RHS.
+ Known = Known.intersectWith(Known2);
+ break;
+ case ISD::SMULO:
+ case ISD::UMULO:
+ if (Op.getResNo() != 1)
+ break;
+ // The boolean result conforms to getBooleanContents.
+ // If we know the result of a setcc has the top bits zero, use this info.
+ // We know that we have an integer-based boolean since these operations
+ // are only available for integers.
+ if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ case ISD::SETCC:
+ case ISD::SETCCCARRY:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: {
+ unsigned OpNo = Op->isStrictFPOpcode() ? 1 : 0;
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI->getBooleanContents(Op.getOperand(OpNo).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ case ISD::SHL:
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::shl(Known, Known2);
+
+ // Minimum shift low bits are known zero.
+ if (const APInt *ShMinAmt =
+ getValidMinimumShiftAmountConstant(Op, DemandedElts))
+ Known.Zero.setLowBits(ShMinAmt->getZExtValue());
+ break;
+ case ISD::SRL:
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::lshr(Known, Known2);
+
+ // Minimum shift high bits are known zero.
+ if (const APInt *ShMinAmt =
+ getValidMinimumShiftAmountConstant(Op, DemandedElts))
+ Known.Zero.setHighBits(ShMinAmt->getZExtValue());
+ break;
+ case ISD::SRA:
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::ashr(Known, Known2);
+ break;
+ case ISD::FSHL:
+ case ISD::FSHR:
+ if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(2), DemandedElts)) {
+ unsigned Amt = C->getAPIntValue().urem(BitWidth);
+
+ // For fshl, 0-shift returns the 1st arg.
+ // For fshr, 0-shift returns the 2nd arg.
+ if (Amt == 0) {
+ Known = computeKnownBits(Op.getOperand(Opcode == ISD::FSHL ? 0 : 1),
+ DemandedElts, Depth + 1);
+ break;
+ }
+
+ // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+ // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ if (Opcode == ISD::FSHL) {
+ Known.One <<= Amt;
+ Known.Zero <<= Amt;
+ Known2.One.lshrInPlace(BitWidth - Amt);
+ Known2.Zero.lshrInPlace(BitWidth - Amt);
+ } else {
+ Known.One <<= BitWidth - Amt;
+ Known.Zero <<= BitWidth - Amt;
+ Known2.One.lshrInPlace(Amt);
+ Known2.Zero.lshrInPlace(Amt);
+ }
+ Known = Known.unionWith(Known2);
+ }
+ break;
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: {
+ assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result");
+
+ // Collect lo/hi source values and concatenate.
+ unsigned LoBits = Op.getOperand(0).getScalarValueSizeInBits();
+ unsigned HiBits = Op.getOperand(1).getScalarValueSizeInBits();
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = Known2.concat(Known);
+
+ // Collect shift amount.
+ Known2 = computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
+
+ if (Opcode == ISD::SHL_PARTS)
+ Known = KnownBits::shl(Known, Known2);
+ else if (Opcode == ISD::SRA_PARTS)
+ Known = KnownBits::ashr(Known, Known2);
+ else // if (Opcode == ISD::SRL_PARTS)
+ Known = KnownBits::lshr(Known, Known2);
+
+ // TODO: Minimum shift low/high bits are known zero.
+
+ if (Op.getResNo() == 0)
+ Known = Known.extractBits(LoBits, 0);
+ else
+ Known = Known.extractBits(HiBits, LoBits);
+ break;
+ }
+ case ISD::SIGN_EXTEND_INREG: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ Known = Known.sextInReg(EVT.getScalarSizeInBits());
+ break;
+ }
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF: {
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleTZ = Known2.countMaxTrailingZeros();
+ unsigned LowBits = llvm::bit_width(PossibleTZ);
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF: {
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleLZ = Known2.countMaxLeadingZeros();
+ unsigned LowBits = llvm::bit_width(PossibleLZ);
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
+ case ISD::CTPOP: {
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // If we know some of the bits are zero, they can't be one.
+ unsigned PossibleOnes = Known2.countMaxPopulation();
+ Known.Zero.setBitsFrom(llvm::bit_width(PossibleOnes));
+ break;
+ }
+ case ISD::PARITY: {
+ // Parity returns 0 everywhere but the LSB.
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ const Constant *Cst = TLI->getTargetConstantFromLoad(LD);
+ if (ISD::isNON_EXTLoad(LD) && Cst) {
+ // Determine any common known bits from the loaded constant pool value.
+ Type *CstTy = Cst->getType();
+ if ((NumElts * BitWidth) == CstTy->getPrimitiveSizeInBits() &&
+ !Op.getValueType().isScalableVector()) {
+ // If it's a vector splat, then we can (quickly) reuse the scalar path.
+ // NOTE: We assume all elements match and none are UNDEF.
+ if (CstTy->isVectorTy()) {
+ if (const Constant *Splat = Cst->getSplatValue()) {
+ Cst = Splat;
+ CstTy = Cst->getType();
+ }
+ }
+ // TODO - do we need to handle different bitwidths?
+ if (CstTy->isVectorTy() && BitWidth == CstTy->getScalarSizeInBits()) {
+ // Iterate across all vector elements finding common known bits.
+ Known.One.setAllBits();
+ Known.Zero.setAllBits();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ if (Constant *Elt = Cst->getAggregateElement(i)) {
+ if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
+ const APInt &Value = CInt->getValue();
+ Known.One &= Value;
+ Known.Zero &= ~Value;
+ continue;
+ }
+ if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
+ APInt Value = CFP->getValueAPF().bitcastToAPInt();
+ Known.One &= Value;
+ Known.Zero &= ~Value;
+ continue;
+ }
+ }
+ Known.One.clearAllBits();
+ Known.Zero.clearAllBits();
+ break;
+ }
+ } else if (BitWidth == CstTy->getPrimitiveSizeInBits()) {
+ if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
+ Known = KnownBits::makeConstant(CInt->getValue());
+ } else if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
+ Known =
+ KnownBits::makeConstant(CFP->getValueAPF().bitcastToAPInt());
+ }
+ }
+ }
+ } else if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
+ // If this is a ZEXTLoad and we are looking at the loaded value.
+ EVT VT = LD->getMemoryVT();
+ unsigned MemBits = VT.getScalarSizeInBits();
+ Known.Zero.setBitsFrom(MemBits);
+ } else if (const MDNode *Ranges = LD->getRanges()) {
+ EVT VT = LD->getValueType(0);
+
+ // TODO: Handle extending loads
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
+ if (VT.isVector()) {
+ // Handle truncation to the first demanded element.
+ // TODO: Figure out which demanded elements are covered
+ if (DemandedElts != 1 || !getDataLayout().isLittleEndian())
+ break;
+
+ // Handle the case where a load has a vector type, but scalar memory
+ // with an attached range.
+ EVT MemVT = LD->getMemoryVT();
+ KnownBits KnownFull(MemVT.getSizeInBits());
+
+ computeKnownBitsFromRangeMetadata(*Ranges, KnownFull);
+ Known = KnownFull.trunc(BitWidth);
+ } else
+ computeKnownBitsFromRangeMetadata(*Ranges, Known);
+ }
+ }
+ break;
+ }
+ case ISD::ZERO_EXTEND_VECTOR_INREG: {
+ if (Op.getValueType().isScalableVector())
+ break;
+ EVT InVT = Op.getOperand(0).getValueType();
+ APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements());
+ Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
+ Known = Known.zext(BitWidth);
+ break;
+ }
+ case ISD::ZERO_EXTEND: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known.zext(BitWidth);
+ break;
+ }
+ case ISD::SIGN_EXTEND_VECTOR_INREG: {
+ if (Op.getValueType().isScalableVector())
+ break;
+ EVT InVT = Op.getOperand(0).getValueType();
+ APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements());
+ Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
+ // If the sign bit is known to be zero or one, then sext will extend
+ // it to the top bits, else it will just zext.
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ case ISD::SIGN_EXTEND: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // If the sign bit is known to be zero or one, then sext will extend
+ // it to the top bits, else it will just zext.
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ if (Op.getValueType().isScalableVector())
+ break;
+ EVT InVT = Op.getOperand(0).getValueType();
+ APInt InDemandedElts = DemandedElts.zext(InVT.getVectorNumElements());
+ Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
+ Known = Known.anyext(BitWidth);
+ break;
+ }
+ case ISD::ANY_EXTEND: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known.anyext(BitWidth);
+ break;
+ }
+ case ISD::TRUNCATE: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known.trunc(BitWidth);
+ break;
+ }
+ case ISD::AssertZext: {
+ EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
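+ // Bits above the asserted width are known zero; drop any One bits that
+ // would now conflict with them.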
+ Known.Zero |= (~InMask);
+ Known.One &= (~Known.Zero);
+ break;
+ }
+ case ISD::AssertAlign: {
+ unsigned LogOfAlign = Log2(cast<AssertAlignSDNode>(Op)->getAlign());
+ assert(LogOfAlign != 0);
+
+ // TODO: Should use maximum with source
+ // If a node is guaranteed to be aligned, set low zero bits accordingly as
+ // well as clearing one bits.
+ Known.Zero.setLowBits(LogOfAlign);
+ Known.One.clearLowBits(LogOfAlign);
+ break;
+ }
+ case ISD::FGETSIGN:
+ // All bits are zero except the low bit.
+ Known.Zero.setBitsFrom(1);
+ break;
+ case ISD::ADD:
+ case ISD::SUB: {
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::computeForAddSub(Op.getOpcode() == ISD::ADD,
+ Flags.hasNoSignedWrap(), Known, Known2);
+ break;
+ }
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ case ISD::USUBO_CARRY:
+ case ISD::SSUBO_CARRY:
+ if (Op.getResNo() == 1) {
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ [[fallthrough]];
+ case ISD::SUBC: {
+ assert(Op.getResNo() == 0 &&
+ "We only compute knownbits for the difference here.");
+
+ // TODO: Compute influence of the carry operand.
+ if (Opcode == ISD::USUBO_CARRY || Opcode == ISD::SSUBO_CARRY)
+ break;
+
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::computeForAddSub(/* Add */ false, /* NSW */ false,
+ Known, Known2);
+ break;
+ }
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::UADDO_CARRY:
+ case ISD::SADDO_CARRY:
+ if (Op.getResNo() == 1) {
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ [[fallthrough]];
+ case ISD::ADDC:
+ case ISD::ADDE: {
+ assert(Op.getResNo() == 0 && "We only compute knownbits for the sum here.");
+
+ // With ADDE and UADDO_CARRY, a carry bit may be added in.
+ KnownBits Carry(1);
+ if (Opcode == ISD::ADDE)
+ // Can't track carry from glue, set carry to unknown.
+ Carry.resetAll();
+ else if (Opcode == ISD::UADDO_CARRY || Opcode == ISD::SADDO_CARRY)
+ // TODO: Compute known bits for the carry operand. Not sure if it is worth
+ // the trouble (how often will we find a known carry bit). And I haven't
+ // tested this very much yet, but something like this might work:
+ // Carry = computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
+ // Carry = Carry.zextOrTrunc(1, false);
+ Carry.resetAll();
+ else
+ Carry.setAllZero();
+
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
+ break;
+ }
+ case ISD::UDIV: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::udiv(Known, Known2, Op->getFlags().hasExact());
+ break;
+ }
+ case ISD::SDIV: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::sdiv(Known, Known2, Op->getFlags().hasExact());
+ break;
+ }
+ case ISD::SREM: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::srem(Known, Known2);
+ break;
+ }
+ case ISD::UREM: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::urem(Known, Known2);
+ break;
+ }
+ case ISD::EXTRACT_ELEMENT: {
+ Known = computeKnownBits(Op.getOperand(0), Depth+1);
+ const unsigned Index = Op.getConstantOperandVal(1);
+ const unsigned EltBitWidth = Op.getValueSizeInBits();
+
+ // Remove low part of known bits mask
+ Known.Zero = Known.Zero.getHiBits(Known.getBitWidth() - Index * EltBitWidth);
+ Known.One = Known.One.getHiBits(Known.getBitWidth() - Index * EltBitWidth);
+
+ // Remove high part of known bit mask
+ Known = Known.trunc(EltBitWidth);
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ SDValue InVec = Op.getOperand(0);
+ SDValue EltNo = Op.getOperand(1);
+ EVT VecVT = InVec.getValueType();
+ // computeKnownBits not yet implemented for scalable vectors.
+ if (VecVT.isScalableVector())
+ break;
+ const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
+ const unsigned NumSrcElts = VecVT.getVectorNumElements();
+
+ // If BitWidth > EltBitWidth the value is any-extended, so we do not know
+ // anything about the extended bits.
+ if (BitWidth > EltBitWidth)
+ Known = Known.trunc(EltBitWidth);
+
+ // If we know the element index, just demand that vector element, else for
+ // an unknown element index, ignore DemandedElts and demand them all.
+ APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts);
+ auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
+ DemandedSrcElts =
+ APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+
+ Known = computeKnownBits(InVec, DemandedSrcElts, Depth + 1);
+ if (BitWidth > EltBitWidth)
+ Known = Known.anyext(BitWidth);
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ if (Op.getValueType().isScalableVector())
+ break;
+
+ // If we know the element index, split the demand between the
+ // source vector and the inserted element, otherwise assume we need
+ // the original demanded vector elements and the value.
+ SDValue InVec = Op.getOperand(0);
+ SDValue InVal = Op.getOperand(1);
+ SDValue EltNo = Op.getOperand(2);
+ bool DemandedVal = true;
+ APInt DemandedVecElts = DemandedElts;
+ auto *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
+ unsigned EltIdx = CEltNo->getZExtValue();
+ DemandedVal = !!DemandedElts[EltIdx];
+ DemandedVecElts.clearBit(EltIdx);
+ }
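+ // Start from the all-known state (Zero and One both all-ones), which is
+ // the identity for the intersectWith calls below.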
+ Known.One.setAllBits();
+ Known.Zero.setAllBits();
+ if (DemandedVal) {
+ Known2 = computeKnownBits(InVal, Depth + 1);
+ Known = Known.intersectWith(Known2.zextOrTrunc(BitWidth));
+ }
+ if (!!DemandedVecElts) {
+ Known2 = computeKnownBits(InVec, DemandedVecElts, Depth + 1);
+ Known = Known.intersectWith(Known2);
+ }
+ break;
+ }
+ case ISD::BITREVERSE: {
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known2.reverseBits();
+ break;
+ }
+ case ISD::BSWAP: {
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known2.byteSwap();
+ break;
+ }
+ case ISD::ABS: {
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known2.abs();
+ break;
+ }
+ case ISD::USUBSAT: {
+ // The result of usubsat will never be larger than the LHS.
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known.Zero.setHighBits(Known2.countMinLeadingZeros());
+ break;
+ }
+ case ISD::UMIN: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::umin(Known, Known2);
+ break;
+ }
+ case ISD::UMAX: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::umax(Known, Known2);
+ break;
+ }
+ case ISD::SMIN:
+ case ISD::SMAX: {
+ // If we have a clamp pattern, we know that the number of sign bits will be
+ // the minimum of the clamp min/max range.
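+ // E.g. smax(smin(X, 100), 10) clamps X to [10, 100]; with both bounds
+ // non-negative, the high zero bits common to 10 and 100 are known zero.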
+ bool IsMax = (Opcode == ISD::SMAX);
+ ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr;
+ if ((CstLow = isConstOrConstSplat(Op.getOperand(1), DemandedElts)))
+ if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
+ CstHigh =
+ isConstOrConstSplat(Op.getOperand(0).getOperand(1), DemandedElts);
+ if (CstLow && CstHigh) {
+ if (!IsMax)
+ std::swap(CstLow, CstHigh);
+
+ const APInt &ValueLow = CstLow->getAPIntValue();
+ const APInt &ValueHigh = CstHigh->getAPIntValue();
+ if (ValueLow.sle(ValueHigh)) {
+ unsigned LowSignBits = ValueLow.getNumSignBits();
+ unsigned HighSignBits = ValueHigh.getNumSignBits();
+ unsigned MinSignBits = std::min(LowSignBits, HighSignBits);
+ if (ValueLow.isNegative() && ValueHigh.isNegative()) {
+ Known.One.setHighBits(MinSignBits);
+ break;
+ }
+ if (ValueLow.isNonNegative() && ValueHigh.isNonNegative()) {
+ Known.Zero.setHighBits(MinSignBits);
+ break;
+ }
+ }
+ }
+
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ if (IsMax)
+ Known = KnownBits::smax(Known, Known2);
+ else
+ Known = KnownBits::smin(Known, Known2);
+
+ // For SMAX, if CstLow is non-negative we know the result will be
+ // non-negative and thus all sign bits are 0.
+ // TODO: There's an equivalent of this for smin with negative constant for
+ // known ones.
+ if (IsMax && CstLow) {
+ const APInt &ValueLow = CstLow->getAPIntValue();
+ if (ValueLow.isNonNegative()) {
+ unsigned SignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ Known.Zero.setHighBits(std::min(SignBits, ValueLow.getNumSignBits()));
+ }
+ }
+
+ break;
+ }
+ case ISD::FP_TO_UINT_SAT: {
+ // FP_TO_UINT_SAT produces an unsigned value that fits in the saturating VT.
+ EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ Known.Zero |= APInt::getBitsSetFrom(BitWidth, VT.getScalarSizeInBits());
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ if (Op.getResNo() == 1) {
+ // The boolean result conforms to getBooleanContents.
+ // If we know the result of a setcc has the top bits zero, use this info.
+ // We know that we have an integer-based boolean since these operations
+ // are only available for integer.
+ if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ [[fallthrough]];
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD: {
+ unsigned MemBits =
+ cast<AtomicSDNode>(Op)->getMemoryVT().getScalarSizeInBits();
+ // If we are looking at the loaded value.
+ if (Op.getResNo() == 0) {
+ if (TLI->getExtendForAtomicOps() == ISD::ZERO_EXTEND)
+ Known.Zero.setBitsFrom(MemBits);
+ }
+ break;
+ }
+ case ISD::FrameIndex:
+ case ISD::TargetFrameIndex:
+ TLI->computeKnownBitsForFrameIndex(cast<FrameIndexSDNode>(Op)->getIndex(),
+ Known, getMachineFunction());
+ break;
+
+ default:
+ if (Opcode < ISD::BUILTIN_OP_END)
+ break;
+ [[fallthrough]];
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ // TODO: Probably okay to remove after audit; here to reduce change size
+ // in initial enablement patch for scalable vectors
+ if (Op.getValueType().isScalableVector())
+ break;
+
+ // Allow the target to implement this method for its nodes.
+ TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth);
+ break;
+ }
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ return Known;
+}
+
+/// Convert ConstantRange OverflowResult into SelectionDAG::OverflowKind.
+static SelectionDAG::OverflowKind
+mapOverflowResult(ConstantRange::OverflowResult OR) {
+ switch (OR) {
+ case ConstantRange::OverflowResult::MayOverflow:
+ return SelectionDAG::OFK_Sometime;
+ case ConstantRange::OverflowResult::AlwaysOverflowsLow:
+ case ConstantRange::OverflowResult::AlwaysOverflowsHigh:
+ return SelectionDAG::OFK_Always;
+ case ConstantRange::OverflowResult::NeverOverflows:
+ return SelectionDAG::OFK_Never;
+ }
+ llvm_unreachable("Unknown OverflowResult");
+}
+
+SelectionDAG::OverflowKind
+SelectionDAG::computeOverflowForSignedAdd(SDValue N0, SDValue N1) const {
+ // X + 0 never overflows.
+ if (isNullConstant(N1))
+ return OFK_Never;
+
+ // If both operands each have at least two sign bits, the addition
+ // cannot overflow.
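+ // (Each value then fits in a signed integer one bit narrower than the
+ // result, so their sum stays within the signed range.)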
+ if (ComputeNumSignBits(N0) > 1 && ComputeNumSignBits(N1) > 1)
+ return OFK_Never;
+
+ // TODO: Add ConstantRange::signedAddMayOverflow handling.
+ return OFK_Sometime;
+}
+
+SelectionDAG::OverflowKind
+SelectionDAG::computeOverflowForUnsignedAdd(SDValue N0, SDValue N1) const {
+ // X + 0 never overflows.
+ if (isNullConstant(N1))
+ return OFK_Never;
+
+ // mulhi + 1 never overflows.
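+ // (The high half of a full BW x BW unsigned multiply is at most 2^BW - 2,
+ // so adding a value known to be at most 1 cannot wrap.)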
+ KnownBits N1Known = computeKnownBits(N1);
+ if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 &&
+ N1Known.getMaxValue().ult(2))
+ return OFK_Never;
+
+ KnownBits N0Known = computeKnownBits(N0);
+ if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1 &&
+ N0Known.getMaxValue().ult(2))
+ return OFK_Never;
+
+ // Fallback to ConstantRange::unsignedAddMayOverflow handling.
+ ConstantRange N0Range = ConstantRange::fromKnownBits(N0Known, false);
+ ConstantRange N1Range = ConstantRange::fromKnownBits(N1Known, false);
+ return mapOverflowResult(N0Range.unsignedAddMayOverflow(N1Range));
+}
+
+SelectionDAG::OverflowKind
+SelectionDAG::computeOverflowForSignedSub(SDValue N0, SDValue N1) const {
+ // X - 0 never overflows.
+ if (isNullConstant(N1))
+ return OFK_Never;
+
+ // If both operands each have at least two sign bits, the subtraction
+ // cannot overflow.
+ if (ComputeNumSignBits(N0) > 1 && ComputeNumSignBits(N1) > 1)
+ return OFK_Never;
+
+ // TODO: Add ConstantRange::signedSubMayOverflow handling.
+ return OFK_Sometime;
+}
+
+SelectionDAG::OverflowKind
+SelectionDAG::computeOverflowForUnsignedSub(SDValue N0, SDValue N1) const {
+ // X - 0 never overflows.
+ if (isNullConstant(N1))
+ return OFK_Never;
+
+ // TODO: Add ConstantRange::unsignedSubMayOverflow handling.
+ return OFK_Sometime;
+}
+
+bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val, unsigned Depth) const {
+ if (Depth >= MaxRecursionDepth)
+ return false; // Limit search depth.
+
+ EVT OpVT = Val.getValueType();
+ unsigned BitWidth = OpVT.getScalarSizeInBits();
+
+ // Is the constant a known power of 2?
+ if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val))
+ return Const->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();
+
+ // A left-shift of a constant one will have exactly one bit set because
+ // shifting the bit off the end is undefined.
+ if (Val.getOpcode() == ISD::SHL) {
+ auto *C = isConstOrConstSplat(Val.getOperand(0));
+ if (C && C->getAPIntValue() == 1)
+ return true;
+ }
+
+ // Similarly, a logical right-shift of a constant sign-bit will have exactly
+ // one bit set.
+ if (Val.getOpcode() == ISD::SRL) {
+ auto *C = isConstOrConstSplat(Val.getOperand(0));
+ if (C && C->getAPIntValue().isSignMask())
+ return true;
+ }
+
+ // Are all operands of a build vector constant powers of two?
+ if (Val.getOpcode() == ISD::BUILD_VECTOR)
+ if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(E))
+ return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();
+ return false;
+ }))
+ return true;
+
+ // Is the operand of a splat vector a constant power of two?
+ if (Val.getOpcode() == ISD::SPLAT_VECTOR)
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val->getOperand(0)))
+ if (C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2())
+ return true;
+
+ // vscale(power-of-two) is a power-of-two for some targets
+ if (Val.getOpcode() == ISD::VSCALE &&
+ getTargetLoweringInfo().isVScaleKnownToBeAPowerOfTwo() &&
+ isKnownToBeAPowerOfTwo(Val.getOperand(0), Depth + 1))
+ return true;
+
+ // More could be done here, though the above checks are enough
+ // to handle some common cases.
+ return false;
+}
+
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
+ EVT VT = Op.getValueType();
+
+ // Since the number of lanes in a scalable vector is unknown at compile time,
+ // we track one bit which is implicitly broadcast to all lanes. This means
+ // that all lanes in a scalable vector are considered demanded.
+ APInt DemandedElts = VT.isFixedLengthVector()
+ ? APInt::getAllOnes(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return ComputeNumSignBits(Op, DemandedElts, Depth);
+}
+
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ assert((VT.isInteger() || VT.isFloatingPoint()) && "Invalid VT!");
+ unsigned VTBits = VT.getScalarSizeInBits();
+ unsigned NumElts = DemandedElts.getBitWidth();
+ unsigned Tmp, Tmp2;
+ unsigned FirstAnswer = 1;
+
+ if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ const APInt &Val = C->getAPIntValue();
+ return Val.getNumSignBits();
+ }
+
+ if (Depth >= MaxRecursionDepth)
+ return 1; // Limit search depth.
+
+ if (!DemandedElts)
+ return 1; // No demanded elts, better to assume we don't know anything.
+
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ default: break;
+ case ISD::AssertSext:
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+ return VTBits-Tmp+1;
+ case ISD::AssertZext:
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+ return VTBits-Tmp;
+ case ISD::MERGE_VALUES:
+ return ComputeNumSignBits(Op.getOperand(Op.getResNo()), DemandedElts,
+ Depth + 1);
+ case ISD::SPLAT_VECTOR: {
+ // Check if the sign bits of source go down as far as the truncated value.
+ unsigned NumSrcBits = Op.getOperand(0).getValueSizeInBits();
+ unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (NumSrcSignBits > (NumSrcBits - VTBits))
+ return NumSrcSignBits - (NumSrcBits - VTBits);
+ break;
+ }
+ case ISD::BUILD_VECTOR:
+ assert(!VT.isScalableVector());
+ Tmp = VTBits;
+ for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
+ if (!DemandedElts[i])
+ continue;
+
+ SDValue SrcOp = Op.getOperand(i);
+ // BUILD_VECTOR can implicitly truncate sources, we handle this specially
+ // for constant nodes to ensure we only look at the sign bits.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SrcOp)) {
+ APInt T = C->getAPIntValue().trunc(VTBits);
+ Tmp2 = T.getNumSignBits();
+ } else {
+ Tmp2 = ComputeNumSignBits(SrcOp, Depth + 1);
+
+ if (SrcOp.getValueSizeInBits() != VTBits) {
+ assert(SrcOp.getValueSizeInBits() > VTBits &&
+ "Expected BUILD_VECTOR implicit truncation");
+ unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
+ Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
+ }
+ }
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ return Tmp;
+
+ case ISD::VECTOR_SHUFFLE: {
+ // Collect the minimum number of sign bits that are shared by every vector
+ // element referenced by the shuffle.
+ APInt DemandedLHS, DemandedRHS;
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+ assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+ if (!getShuffleDemandedElts(NumElts, SVN->getMask(), DemandedElts,
+ DemandedLHS, DemandedRHS))
+ return 1;
+
+ Tmp = std::numeric_limits<unsigned>::max();
+ if (!!DemandedLHS)
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+ if (!!DemandedRHS) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ // If we don't know anything, early out and try computeKnownBits fall-back.
+ if (Tmp == 1)
+ break;
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
+
+ case ISD::BITCAST: {
+ if (VT.isScalableVector())
+ break;
+ SDValue N0 = Op.getOperand(0);
+ EVT SrcVT = N0.getValueType();
+ unsigned SrcBits = SrcVT.getScalarSizeInBits();
+
+ // Ignore bitcasts from unsupported types.
+ if (!(SrcVT.isInteger() || SrcVT.isFloatingPoint()))
+ break;
+
+ // Fast handling of 'identity' bitcasts.
+ if (VTBits == SrcBits)
+ return ComputeNumSignBits(N0, DemandedElts, Depth + 1);
+
+ bool IsLE = getDataLayout().isLittleEndian();
+
+ // Bitcast 'large element' scalar/vector to 'small element' vector.
+ if ((SrcBits % VTBits) == 0) {
+ assert(VT.isVector() && "Expected bitcast to vector");
+
+ unsigned Scale = SrcBits / VTBits;
+ APInt SrcDemandedElts =
+ APIntOps::ScaleBitMask(DemandedElts, NumElts / Scale);
+
+ // Fast case - sign splat can be simply split across the small elements.
+ Tmp = ComputeNumSignBits(N0, SrcDemandedElts, Depth + 1);
+ if (Tmp == SrcBits)
+ return VTBits;
+
+ // Slow case - determine how far the sign extends into each sub-element.
+ Tmp2 = VTBits;
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ unsigned SubOffset = i % Scale;
+ SubOffset = (IsLE ? ((Scale - 1) - SubOffset) : SubOffset);
+ SubOffset = SubOffset * VTBits;
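+ // SubOffset is now the bit distance from the top of the wide source
+ // element down to the top of this sub-element; only source sign bits
+ // extending past it cover this sub-element's sign bit.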
+ if (Tmp <= SubOffset)
+ return 1;
+ Tmp2 = std::min(Tmp2, Tmp - SubOffset);
+ }
+ return Tmp2;
+ }
+ break;
+ }
+
+ case ISD::FP_TO_SINT_SAT:
+ // FP_TO_SINT_SAT produces a signed value that fits in the saturating VT.
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits();
+ return VTBits - Tmp + 1;
+ case ISD::SIGN_EXTEND:
+ Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
+ return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1) + Tmp;
+ case ISD::SIGN_EXTEND_INREG:
+ // Max of the input and what this extends.
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits();
+ Tmp = VTBits-Tmp+1;
+ Tmp2 = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
+ return std::max(Tmp, Tmp2);
+ case ISD::SIGN_EXTEND_VECTOR_INREG: {
+ if (VT.isScalableVector())
+ break;
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ APInt DemandedSrcElts = DemandedElts.zext(SrcVT.getVectorNumElements());
+ Tmp = VTBits - SrcVT.getScalarSizeInBits();
+ return ComputeNumSignBits(Src, DemandedSrcElts, Depth+1) + Tmp;
+ }
+ case ISD::SRA:
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ // SRA X, C -> adds C sign bits.
+ if (const APInt *ShAmt =
+ getValidMinimumShiftAmountConstant(Op, DemandedElts))
+ Tmp = std::min<uint64_t>(Tmp + ShAmt->getZExtValue(), VTBits);
+ return Tmp;
+ case ISD::SHL:
+ if (const APInt *ShAmt =
+ getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
+ // shl destroys sign bits, ensure it doesn't shift out all sign bits.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (ShAmt->ult(Tmp))
+ return Tmp - ShAmt->getZExtValue();
+ }
+ break;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: // NOT is handled here.
+ // Logical binary ops preserve the number of sign bits at the worst.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
+ if (Tmp != 1) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
+ FirstAnswer = std::min(Tmp, Tmp2);
+ // We computed what we know about the sign bits as our first
+ // answer. Now proceed to the generic code that uses
+ // computeKnownBits, and pick whichever answer is better.
+ }
+ break;
+
+ case ISD::SELECT:
+ case ISD::VSELECT:
+ Tmp = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
+ return std::min(Tmp, Tmp2);
+ case ISD::SELECT_CC:
+ Tmp = ComputeNumSignBits(Op.getOperand(2), DemandedElts, Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(3), DemandedElts, Depth+1);
+ return std::min(Tmp, Tmp2);
+
+ case ISD::SMIN:
+ case ISD::SMAX: {
+ // If we have a clamp pattern, we know that the number of sign bits will be
+ // the minimum of the clamp min/max range.
+ bool IsMax = (Opcode == ISD::SMAX);
+ ConstantSDNode *CstLow = nullptr, *CstHigh = nullptr;
+ if ((CstLow = isConstOrConstSplat(Op.getOperand(1), DemandedElts)))
+ if (Op.getOperand(0).getOpcode() == (IsMax ? ISD::SMIN : ISD::SMAX))
+ CstHigh =
+ isConstOrConstSplat(Op.getOperand(0).getOperand(1), DemandedElts);
+ if (CstLow && CstHigh) {
+ if (!IsMax)
+ std::swap(CstLow, CstHigh);
+ if (CstLow->getAPIntValue().sle(CstHigh->getAPIntValue())) {
+ Tmp = CstLow->getAPIntValue().getNumSignBits();
+ Tmp2 = CstHigh->getAPIntValue().getNumSignBits();
+ return std::min(Tmp, Tmp2);
+ }
+ }
+
+ // Fallback - just get the minimum number of sign bits of the operands.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp == 1)
+ return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ return std::min(Tmp, Tmp2);
+ }
+ case ISD::UMIN:
+ case ISD::UMAX:
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp == 1)
+ return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ return std::min(Tmp, Tmp2);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SADDO_CARRY:
+ case ISD::UADDO_CARRY:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SSUBO_CARRY:
+ case ISD::USUBO_CARRY:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ if (Op.getResNo() != 1)
+ break;
+ // The boolean result conforms to getBooleanContents. Fall through.
+ // If setcc returns 0/-1, all bits are sign bits.
+ // We know that we have an integer-based boolean since these operations
+ // are only available for integer.
+ if (TLI->getBooleanContents(VT.isVector(), false) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return VTBits;
+ break;
+ case ISD::SETCC:
+ case ISD::SETCCCARRY:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: {
+ unsigned OpNo = Op->isStrictFPOpcode() ? 1 : 0;
+ // If setcc returns 0/-1, all bits are sign bits.
+ if (TLI->getBooleanContents(Op.getOperand(OpNo).getValueType()) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return VTBits;
+ break;
+ }
+ case ISD::ROTL:
+ case ISD::ROTR:
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // If we're rotating a 0/-1 value, then it stays a 0/-1 value.
+ if (Tmp == VTBits)
+ return VTBits;
+
+ if (ConstantSDNode *C =
+ isConstOrConstSplat(Op.getOperand(1), DemandedElts)) {
+ unsigned RotAmt = C->getAPIntValue().urem(VTBits);
+
+ // Handle rotate right by N like a rotate left by 32-N.
+ if (Opcode == ISD::ROTR)
+ RotAmt = (VTBits - RotAmt) % VTBits;
+
+ // If we aren't rotating out all of the known-in sign bits, return the
+ // number that are left. This handles rotl(sext(x), 1) for example.
+ if (Tmp > (RotAmt + 1)) return (Tmp - RotAmt);
+ }
+ break;
+ case ISD::ADD:
+ case ISD::ADDC:
+ // Add can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp == 1) return 1; // Early out.
+
+ // Special case decrementing a value (ADD X, -1):
+ if (ConstantSDNode *CRHS =
+ isConstOrConstSplat(Op.getOperand(1), DemandedElts))
+ if (CRHS->isAllOnes()) {
+ KnownBits Known =
+ computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((Known.Zero | 1).isAllOnes())
+ return VTBits;
+
+ // If we are subtracting one from a positive number, there is no carry
+ // out of the result.
+ if (Known.isNonNegative())
+ return Tmp;
+ }
+
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ if (Tmp2 == 1) return 1; // Early out.
+ return std::min(Tmp, Tmp2) - 1;
+ case ISD::SUB:
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ if (Tmp2 == 1) return 1; // Early out.
+
+ // Handle NEG.
+ if (ConstantSDNode *CLHS =
+ isConstOrConstSplat(Op.getOperand(0), DemandedElts))
+ if (CLHS->isZero()) {
+ KnownBits Known =
+ computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((Known.Zero | 1).isAllOnes())
+ return VTBits;
+
+ // If the input is known to be positive (the sign bit is known clear),
+ // the output of the NEG has the same number of sign bits as the input.
+ if (Known.isNonNegative())
+ return Tmp2;
+
+ // Otherwise, we treat this like a SUB.
+ }
+
+ // Sub can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp == 1) return 1; // Early out.
+ return std::min(Tmp, Tmp2) - 1;
+ case ISD::MUL: {
+ // The output of the Mul can be at most twice the valid bits in the inputs.
+ unsigned SignBitsOp0 = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (SignBitsOp0 == 1)
+ break;
+ unsigned SignBitsOp1 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ if (SignBitsOp1 == 1)
+ break;
+ unsigned OutValidBits =
+ (VTBits - SignBitsOp0 + 1) + (VTBits - SignBitsOp1 + 1);
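+ // E.g. multiplying two i32 values with 20 and 24 sign bits gives
+ // (32 - 20 + 1) + (32 - 24 + 1) = 22 valid bits, hence at least
+ // 32 - 22 + 1 = 11 sign bits in the product.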
+ return OutValidBits > VTBits ? 1 : VTBits - OutValidBits + 1;
+ }
+ case ISD::SREM:
+ // The sign bit is the LHS's sign bit, except when the result of the
+ // remainder is zero. The magnitude of the result should be less than or
+ // equal to the magnitude of the LHS. Therefore, the result should have
+ // at least as many sign bits as the left hand side.
+ return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ case ISD::TRUNCATE: {
+ // Check if the sign bits of source go down as far as the truncated value.
+ unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
+ unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (NumSrcSignBits > (NumSrcBits - VTBits))
+ return NumSrcSignBits - (NumSrcBits - VTBits);
+ break;
+ }
+ case ISD::EXTRACT_ELEMENT: {
+ if (VT.isScalableVector())
+ break;
+ const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ const int BitWidth = Op.getValueSizeInBits();
+ const int Items = Op.getOperand(0).getValueSizeInBits() / BitWidth;
+
+ // Get reverse index (starting from 1), Op1 value indexes elements from
+ // little end. Sign starts at big end.
+ const int rIndex = Items - 1 - Op.getConstantOperandVal(1);
+
+ // If the sign portion ends in our element the subtraction gives correct
+ // result. Otherwise it gives either negative or > bitwidth result
+ return std::clamp(KnownSign - rIndex * BitWidth, 0, BitWidth);
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ if (VT.isScalableVector())
+ break;
+ // If we know the element index, split the demand between the
+ // source vector and the inserted element, otherwise assume we need
+ // the original demanded vector elements and the value.
+ SDValue InVec = Op.getOperand(0);
+ SDValue InVal = Op.getOperand(1);
+ SDValue EltNo = Op.getOperand(2);
+ bool DemandedVal = true;
+ APInt DemandedVecElts = DemandedElts;
+ auto *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
+ unsigned EltIdx = CEltNo->getZExtValue();
+ DemandedVal = !!DemandedElts[EltIdx];
+ DemandedVecElts.clearBit(EltIdx);
+ }
+ Tmp = std::numeric_limits<unsigned>::max();
+ if (DemandedVal) {
+ // TODO - handle implicit truncation of inserted elements.
+ if (InVal.getScalarValueSizeInBits() != VTBits)
+ break;
+ Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ if (!!DemandedVecElts) {
+ Tmp2 = ComputeNumSignBits(InVec, DemandedVecElts, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ assert(!VT.isScalableVector());
+ SDValue InVec = Op.getOperand(0);
+ SDValue EltNo = Op.getOperand(1);
+ EVT VecVT = InVec.getValueType();
+ // ComputeNumSignBits not yet implemented for scalable vectors.
+ if (VecVT.isScalableVector())
+ break;
+ const unsigned BitWidth = Op.getValueSizeInBits();
+ const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
+ const unsigned NumSrcElts = VecVT.getVectorNumElements();
+
+ // If BitWidth > EltBitWidth the value is any-extended, and we do not know
+ // anything about sign bits. But if the sizes match we can derive knowledge
+ // about sign bits from the vector operand.
+ if (BitWidth != EltBitWidth)
+ break;
+
+ // If we know the element index, just demand that vector element, else for
+ // an unknown element index, ignore DemandedElts and demand them all.
+ APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts);
+ auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
+ DemandedSrcElts =
+ APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+
+ return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ // Offset the demanded elts by the subvector index.
+ SDValue Src = Op.getOperand(0);
+ // Bail until we can represent demanded elements for scalable vectors.
+ if (Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
+ return ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1);
+ }
+ case ISD::CONCAT_VECTORS: {
+ if (VT.isScalableVector())
+ break;
+ // Determine the minimum number of sign bits across all demanded
+ // elts of the input vectors. Early out if the result is already 1.
+ Tmp = std::numeric_limits<unsigned>::max();
+ EVT SubVectorVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
+ unsigned NumSubVectors = Op.getNumOperands();
+ for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
+ APInt DemandedSub =
+ DemandedElts.extractBits(NumSubVectorElts, i * NumSubVectorElts);
+ if (!DemandedSub)
+ continue;
+ Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ if (VT.isScalableVector())
+ break;
+ // Demand any elements from the subvector and the remainder from the src
+ // it's inserted into.
+ SDValue Src = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx);
+
+ Tmp = std::numeric_limits<unsigned>::max();
+ if (!!DemandedSubElts) {
+ Tmp = ComputeNumSignBits(Sub, DemandedSubElts, Depth + 1);
+ if (Tmp == 1)
+ return 1; // early-out
+ }
+ if (!!DemandedSrcElts) {
+ Tmp2 = ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ if (const MDNode *Ranges = LD->getRanges()) {
+ if (DemandedElts != 1)
+ break;
+
+ ConstantRange CR = getConstantRangeFromMetadata(*Ranges);
+ if (VTBits > CR.getBitWidth()) {
+ switch (LD->getExtensionType()) {
+ case ISD::SEXTLOAD:
+ CR = CR.signExtend(VTBits);
+ break;
+ case ISD::ZEXTLOAD:
+ CR = CR.zeroExtend(VTBits);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (VTBits != CR.getBitWidth())
+ break;
+ return std::min(CR.getSignedMin().getNumSignBits(),
+ CR.getSignedMax().getNumSignBits());
+ }
+
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_CLR:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD: {
+ Tmp = cast<AtomicSDNode>(Op)->getMemoryVT().getScalarSizeInBits();
+ // If we are looking at the loaded value.
+ if (Op.getResNo() == 0) {
+ if (Tmp == VTBits)
+ return 1; // early-out
+ if (TLI->getExtendForAtomicOps() == ISD::SIGN_EXTEND)
+ return VTBits - Tmp + 1;
+ if (TLI->getExtendForAtomicOps() == ISD::ZERO_EXTEND)
+ return VTBits - Tmp;
+ }
+ break;
+ }
+ }
+
+ // If we are looking at the loaded value of the SDNode.
+ if (Op.getResNo() == 0) {
+ // Handle LOADX separately here. The EXTLOAD case will fall through.
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
+ unsigned ExtType = LD->getExtensionType();
+ switch (ExtType) {
+ default: break;
+ case ISD::SEXTLOAD: // e.g. i16->i32 = '17' bits known.
+ Tmp = LD->getMemoryVT().getScalarSizeInBits();
+ return VTBits - Tmp + 1;
+ case ISD::ZEXTLOAD: // e.g. i16->i32 = '16' bits known.
+ Tmp = LD->getMemoryVT().getScalarSizeInBits();
+ return VTBits - Tmp;
+ case ISD::NON_EXTLOAD:
+ if (const Constant *Cst = TLI->getTargetConstantFromLoad(LD)) {
+ // We only need to handle vectors - computeKnownBits should handle
+ // scalar cases.
+ Type *CstTy = Cst->getType();
+ if (CstTy->isVectorTy() && !VT.isScalableVector() &&
+ (NumElts * VTBits) == CstTy->getPrimitiveSizeInBits() &&
+ VTBits == CstTy->getScalarSizeInBits()) {
+ Tmp = VTBits;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ if (Constant *Elt = Cst->getAggregateElement(i)) {
+ if (auto *CInt = dyn_cast<ConstantInt>(Elt)) {
+ const APInt &Value = CInt->getValue();
+ Tmp = std::min(Tmp, Value.getNumSignBits());
+ continue;
+ }
+ if (auto *CFP = dyn_cast<ConstantFP>(Elt)) {
+ APInt Value = CFP->getValueAPF().bitcastToAPInt();
+ Tmp = std::min(Tmp, Value.getNumSignBits());
+ continue;
+ }
+ }
+ // Unknown type. Conservatively assume no bits match sign bit.
+ return 1;
+ }
+ return Tmp;
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ // Allow the target to implement this method for its nodes.
+ if (Opcode >= ISD::BUILTIN_OP_END ||
+ Opcode == ISD::INTRINSIC_WO_CHAIN ||
+ Opcode == ISD::INTRINSIC_W_CHAIN ||
+ Opcode == ISD::INTRINSIC_VOID) {
+ // TODO: This can probably be removed once target code is audited. This
+ // is here purely to reduce patch size and review complexity.
+ if (!VT.isScalableVector()) {
+ unsigned NumBits =
+ TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
+ if (NumBits > 1)
+ FirstAnswer = std::max(FirstAnswer, NumBits);
+ }
+ }
+
+ // Finally, if we can prove that the top bits of the result are 0's or 1's,
+ // use this information.
+ KnownBits Known = computeKnownBits(Op, DemandedElts, Depth);
+ return std::max(FirstAnswer, Known.countMinSignBits());
+}
+
+unsigned SelectionDAG::ComputeMaxSignificantBits(SDValue Op,
+ unsigned Depth) const {
+ unsigned SignBits = ComputeNumSignBits(Op, Depth);
+ return Op.getScalarValueSizeInBits() - SignBits + 1;
+}
+
+unsigned SelectionDAG::ComputeMaxSignificantBits(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned Depth) const {
+ unsigned SignBits = ComputeNumSignBits(Op, DemandedElts, Depth);
+ return Op.getScalarValueSizeInBits() - SignBits + 1;
+}
+
+bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly,
+ unsigned Depth) const {
+ // Early out for FREEZE.
+ if (Op.getOpcode() == ISD::FREEZE)
+ return true;
+
+ // TODO: Assume we don't know anything for now.
+ EVT VT = Op.getValueType();
+ if (VT.isScalableVector())
+ return false;
+
+ APInt DemandedElts = VT.isVector()
+ ? APInt::getAllOnes(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return isGuaranteedNotToBeUndefOrPoison(Op, DemandedElts, PoisonOnly, Depth);
+}
+
+bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
+ const APInt &DemandedElts,
+ bool PoisonOnly,
+ unsigned Depth) const {
+ unsigned Opcode = Op.getOpcode();
+
+ // Early out for FREEZE.
+ if (Opcode == ISD::FREEZE)
+ return true;
+
+ if (Depth >= MaxRecursionDepth)
+ return false; // Limit search depth.
+
+ if (isIntOrFPConstant(Op))
+ return true;
+
+ switch (Opcode) {
+ case ISD::VALUETYPE:
+ case ISD::FrameIndex:
+ case ISD::TargetFrameIndex:
+ return true;
+
+ case ISD::UNDEF:
+ return PoisonOnly;
+
+ case ISD::BUILD_VECTOR:
+ // NOTE: BUILD_VECTOR has implicit truncation of wider scalar elements -
+ // this shouldn't affect the result.
+ for (unsigned i = 0, e = Op.getNumOperands(); i < e; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ if (!isGuaranteedNotToBeUndefOrPoison(Op.getOperand(i), PoisonOnly,
+ Depth + 1))
+ return false;
+ }
+ return true;
+
+ // TODO: Search for noundef attributes from library functions.
+
+ // TODO: Pointers dereferenced by ISD::LOAD/STORE ops are noundef.
+
+ default:
+ // Allow the target to implement this method for its nodes.
+ if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
+ Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::INTRINSIC_VOID)
+ return TLI->isGuaranteedNotToBeUndefOrPoisonForTargetNode(
+ Op, DemandedElts, *this, PoisonOnly, Depth);
+ break;
+ }
+
+ // If Op can't create undef/poison and none of its operands are undef/poison
+ // then Op is never undef/poison.
+ // NOTE: TargetNodes should handle this in themselves in
+ // isGuaranteedNotToBeUndefOrPoisonForTargetNode.
+ return !canCreateUndefOrPoison(Op, PoisonOnly, /*ConsiderFlags*/ true,
+ Depth) &&
+ all_of(Op->ops(), [&](SDValue V) {
+ return isGuaranteedNotToBeUndefOrPoison(V, PoisonOnly, Depth + 1);
+ });
+}
+
+bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, bool PoisonOnly,
+ bool ConsiderFlags,
+ unsigned Depth) const {
+ // TODO: Assume we don't know anything for now.
+ EVT VT = Op.getValueType();
+ if (VT.isScalableVector())
+ return true;
+
+ APInt DemandedElts = VT.isVector()
+ ? APInt::getAllOnes(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly, ConsiderFlags,
+ Depth);
+}
+
+bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
+ bool PoisonOnly, bool ConsiderFlags,
+ unsigned Depth) const {
+ // TODO: Assume we don't know anything for now.
+ EVT VT = Op.getValueType();
+ if (VT.isScalableVector())
+ return true;
+
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case ISD::AssertSext:
+ case ISD::AssertZext:
+ case ISD::FREEZE:
+ case ISD::CONCAT_VECTORS:
+ case ISD::INSERT_SUBVECTOR:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::FSHL:
+ case ISD::FSHR:
+ case ISD::BSWAP:
+ case ISD::CTPOP:
+ case ISD::BITREVERSE:
+ case ISD::PARITY:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::TRUNCATE:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::BITCAST:
+ case ISD::BUILD_VECTOR:
+ case ISD::BUILD_PAIR:
+ return false;
+
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ // Matches hasPoisonGeneratingFlags().
+ return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() ||
+ Op->getFlags().hasNoUnsignedWrap());
+
+ case ISD::SHL:
+ // If the max shift amount isn't in range, then the shift can create poison.
+ if (!getValidMaximumShiftAmountConstant(Op, DemandedElts))
+ return true;
+
+ // Matches hasPoisonGeneratingFlags().
+ return ConsiderFlags && (Op->getFlags().hasNoSignedWrap() ||
+ Op->getFlags().hasNoUnsignedWrap());
+
+ case ISD::INSERT_VECTOR_ELT: {
+ // Ensure that the element index is in bounds.
+ EVT VecVT = Op.getOperand(0).getValueType();
+ KnownBits KnownIdx = computeKnownBits(Op.getOperand(2), Depth + 1);
+ return KnownIdx.getMaxValue().uge(VecVT.getVectorMinNumElements());
+ }
+
+ default:
+ // Allow the target to implement this method for its nodes.
+ if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN ||
+ Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::INTRINSIC_VOID)
+ return TLI->canCreateUndefOrPoisonForTargetNode(
+ Op, DemandedElts, *this, PoisonOnly, ConsiderFlags, Depth);
+ break;
+ }
+
+ // Be conservative and return true.
+ return true;
+}
+
+bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
+ if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) ||
+ !isa<ConstantSDNode>(Op.getOperand(1)))
+ return false;
+
+ if (Op.getOpcode() == ISD::OR &&
+ !MaskedValueIsZero(Op.getOperand(0), Op.getConstantOperandAPInt(1)))
+ return false;
+
+ return true;
+}
+
+bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const {
+ // If we're told that NaNs won't happen, assume they won't.
+ if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs())
+ return true;
+
+ if (Depth >= MaxRecursionDepth)
+ return false; // Limit search depth.
+
+ // If the value is a constant, we can obviously see if it is a NaN or not.
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
+ return !C->getValueAPF().isNaN() ||
+ (SNaN && !C->getValueAPF().isSignaling());
+ }
+
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FMA:
+ case ISD::FMAD: {
+ if (SNaN)
+ return true;
+ // TODO: Need isKnownNeverInfinity
+ return false;
+ }
+ case ISD::FCANONICALIZE:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FTRUNC:
+ case ISD::FFLOOR:
+ case ISD::FCEIL:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FLDEXP: {
+ if (SNaN)
+ return true;
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+ }
+ case ISD::FABS:
+ case ISD::FNEG:
+ case ISD::FCOPYSIGN: {
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+ }
+ case ISD::SELECT:
+ return isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+ isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
+ case ISD::FP_EXTEND:
+ case ISD::FP_ROUND: {
+ if (SNaN)
+ return true;
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+ }
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ return true;
+ case ISD::FSQRT: // Needs the operand known to be positive.
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FPOWI:
+ case ISD::FPOW: {
+ if (SNaN)
+ return true;
+ // TODO: Refine on operand
+ return false;
+ }
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: {
+ // Only one needs to be known not-nan, since it will be returned if the
+ // other ends up being one.
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) ||
+ isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+ }
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE: {
+ if (SNaN)
+ return true;
+ // This can return a NaN if either operand is an sNaN, or if both operands
+ // are NaN.
+ return (isKnownNeverNaN(Op.getOperand(0), false, Depth + 1) &&
+ isKnownNeverSNaN(Op.getOperand(1), Depth + 1)) ||
+ (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) &&
+ isKnownNeverSNaN(Op.getOperand(0), Depth + 1));
+ }
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM: {
+ // TODO: Does this quiet or return the original NaN as-is?
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+ isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+ }
+ case ISD::BUILD_VECTOR: {
+ for (const SDValue &Opnd : Op->ops())
+ if (!isKnownNeverNaN(Opnd, SNaN, Depth + 1))
+ return false;
+ return true;
+ }
+ default:
+ if (Opcode >= ISD::BUILTIN_OP_END ||
+ Opcode == ISD::INTRINSIC_WO_CHAIN ||
+ Opcode == ISD::INTRINSIC_W_CHAIN ||
+ Opcode == ISD::INTRINSIC_VOID) {
+ return TLI->isKnownNeverNaNForTargetNode(Op, *this, SNaN, Depth);
+ }
+
+ return false;
+ }
+}
+
+bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
+ assert(Op.getValueType().isFloatingPoint() &&
+ "Floating point type expected");
+
+ // If the value is a constant, we can obviously see if it is a zero or not.
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+ return !C->isZero();
+
+ // Return false if we find any zero in a vector.
+ if (Op->getOpcode() == ISD::BUILD_VECTOR ||
+ Op->getOpcode() == ISD::SPLAT_VECTOR) {
+ for (const SDValue &OpVal : Op->op_values()) {
+ if (OpVal.isUndef())
+ return false;
+ if (auto *C = dyn_cast<ConstantFPSDNode>(OpVal))
+ if (C->isZero())
+ return false;
+ }
+ return true;
+ }
+ return false;
+}
+
+bool SelectionDAG::isKnownNeverZero(SDValue Op, unsigned Depth) const {
+ if (Depth >= MaxRecursionDepth)
+ return false; // Limit search depth.
+
+ assert(!Op.getValueType().isFloatingPoint() &&
+ "Floating point types unsupported - use isKnownNeverZeroFloat");
+
+ // If the value is a constant, we can obviously see if it is a zero or not.
+ if (ISD::matchUnaryPredicate(Op,
+ [](ConstantSDNode *C) { return !C->isZero(); }))
+ return true;
+
+ // TODO: Recognize more cases here. Most of the cases are also incomplete to
+ // some degree.
+ switch (Op.getOpcode()) {
+ default:
+ break;
+
+ case ISD::OR:
+ return isKnownNeverZero(Op.getOperand(1), Depth + 1) ||
+ isKnownNeverZero(Op.getOperand(0), Depth + 1);
+
+ case ISD::VSELECT:
+ case ISD::SELECT:
+ return isKnownNeverZero(Op.getOperand(1), Depth + 1) &&
+ isKnownNeverZero(Op.getOperand(2), Depth + 1);
+
+ case ISD::SHL:
+ if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap())
+ return isKnownNeverZero(Op.getOperand(0), Depth + 1);
+
+ // 1 << X is never zero. TODO: This can be expanded if we can bound X.
+ // The expression is really !Known.One[BitWidth-MaxLog2(Known):0].isZero()
+ if (computeKnownBits(Op.getOperand(0), Depth + 1).One[0])
+ return true;
+ break;
+
+ case ISD::UADDSAT:
+ case ISD::UMAX:
+ return isKnownNeverZero(Op.getOperand(1), Depth + 1) ||
+ isKnownNeverZero(Op.getOperand(0), Depth + 1);
+
+ case ISD::UMIN:
+ return isKnownNeverZero(Op.getOperand(1), Depth + 1) &&
+ isKnownNeverZero(Op.getOperand(0), Depth + 1);
+
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::BITREVERSE:
+ case ISD::BSWAP:
+ case ISD::CTPOP:
+ case ISD::ABS:
+ return isKnownNeverZero(Op.getOperand(0), Depth + 1);
+
+ case ISD::SRA:
+ case ISD::SRL:
+ if (Op->getFlags().hasExact())
+ return isKnownNeverZero(Op.getOperand(0), Depth + 1);
+ // Signed >> X is never zero. TODO: This can be expanded if we can bound X.
+ // The expression is really
+ // !Known.One[SignBit:SignBit-(BitWidth-MaxLog2(Known))].isZero()
+ if (computeKnownBits(Op.getOperand(0), Depth + 1).isNegative())
+ return true;
+ break;
+
+ case ISD::UDIV:
+ case ISD::SDIV:
+ // div exact can only produce a zero if the dividend is zero.
+ // TODO: For udiv this is also true if Op1 u<= Op0
+ if (Op->getFlags().hasExact())
+ return isKnownNeverZero(Op.getOperand(0), Depth + 1);
+ break;
+
+ case ISD::ADD:
+ if (Op->getFlags().hasNoUnsignedWrap())
+ if (isKnownNeverZero(Op.getOperand(1), Depth + 1) ||
+ isKnownNeverZero(Op.getOperand(0), Depth + 1))
+ return true;
+ // TODO: There are a lot more cases we can prove for add.
+ break;
+
+ case ISD::SUB: {
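+ // 0 - X is nonzero exactly when X is nonzero; otherwise try to prove
+ // that the two operands can never be equal.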
+ if (isNullConstant(Op.getOperand(0)))
+ return isKnownNeverZero(Op.getOperand(1), Depth + 1);
+
+ std::optional<bool> ne =
+ KnownBits::ne(computeKnownBits(Op.getOperand(0), Depth + 1),
+ computeKnownBits(Op.getOperand(1), Depth + 1));
+ return ne && *ne;
+ }
+
+ case ISD::MUL:
+ if (Op->getFlags().hasNoSignedWrap() || Op->getFlags().hasNoUnsignedWrap())
+ if (isKnownNeverZero(Op.getOperand(1), Depth + 1) &&
+ isKnownNeverZero(Op.getOperand(0), Depth + 1))
+ return true;
+ break;
+
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ return isKnownNeverZero(Op.getOperand(0), Depth + 1);
+ }
+
+ return computeKnownBits(Op, Depth).isNonZero();
+}
+
+bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
+ // Check the obvious case.
+ if (A == B) return true;
+
+ // For negative and positive zero.
+ if (const ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A))
+ if (const ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B))
+ if (CA->isZero() && CB->isZero()) return true;
+
+ // Otherwise they may not be equal.
+ return false;
+}
+
+// Only bits set in Mask must be negated; other bits may be arbitrary.
+SDValue llvm::getBitwiseNotOperand(SDValue V, SDValue Mask, bool AllowUndefs) {
+ if (isBitwiseNot(V, AllowUndefs))
+ return V.getOperand(0);
+
+ // Handle any_extend (not (truncate X)) pattern, where Mask only sets
+ // bits in the non-extended part.
+ ConstantSDNode *MaskC = isConstOrConstSplat(Mask);
+ if (!MaskC || V.getOpcode() != ISD::ANY_EXTEND)
+ return SDValue();
+ SDValue ExtArg = V.getOperand(0);
+ if (ExtArg.getScalarValueSizeInBits() >=
+ MaskC->getAPIntValue().getActiveBits() &&
+ isBitwiseNot(ExtArg, AllowUndefs) &&
+ ExtArg.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+ ExtArg.getOperand(0).getOperand(0).getValueType() == V.getValueType())
+ return ExtArg.getOperand(0).getOperand(0);
+ return SDValue();
+}
+
+static bool haveNoCommonBitsSetCommutative(SDValue A, SDValue B) {
+ // Match masked merge pattern (X & ~M) op (Y & M)
+ // Including degenerate case (X & ~M) op M
+ auto MatchNoCommonBitsPattern = [&](SDValue Not, SDValue Mask,
+ SDValue Other) {
+ if (SDValue NotOperand =
+ getBitwiseNotOperand(Not, Mask, /* AllowUndefs */ true)) {
+ if (NotOperand->getOpcode() == ISD::ZERO_EXTEND ||
+ NotOperand->getOpcode() == ISD::TRUNCATE)
+ NotOperand = NotOperand->getOperand(0);
+
+ if (Other == NotOperand)
+ return true;
+ if (Other->getOpcode() == ISD::AND)
+ return NotOperand == Other->getOperand(0) ||
+ NotOperand == Other->getOperand(1);
+ }
+ return false;
+ };
+
+ if (A->getOpcode() == ISD::ZERO_EXTEND || A->getOpcode() == ISD::TRUNCATE)
+ A = A->getOperand(0);
+
+ if (B->getOpcode() == ISD::ZERO_EXTEND || B->getOpcode() == ISD::TRUNCATE)
+ B = B->getOperand(0);
+
+ if (A->getOpcode() == ISD::AND)
+ return MatchNoCommonBitsPattern(A->getOperand(0), A->getOperand(1), B) ||
+ MatchNoCommonBitsPattern(A->getOperand(1), A->getOperand(0), B);
+ return false;
+}
+
+// FIXME: unify with llvm::haveNoCommonBitsSet.
+bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
+ assert(A.getValueType() == B.getValueType() &&
+ "Values must have the same type");
+ if (haveNoCommonBitsSetCommutative(A, B) ||
+ haveNoCommonBitsSetCommutative(B, A))
+ return true;
+ return KnownBits::haveNoCommonBitsSet(computeKnownBits(A),
+ computeKnownBits(B));
+}
+
+static SDValue FoldSTEP_VECTOR(const SDLoc &DL, EVT VT, SDValue Step,
+ SelectionDAG &DAG) {
+ if (cast<ConstantSDNode>(Step)->isZero())
+ return DAG.getConstant(0, DL, VT);
+
+ return SDValue();
+}
+
+static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
+ ArrayRef<SDValue> Ops,
+ SelectionDAG &DAG) {
+ int NumOps = Ops.size();
+ assert(NumOps != 0 && "Can't build an empty vector!");
+ assert(!VT.isScalableVector() &&
+ "BUILD_VECTOR cannot be used with scalable types");
+ assert(VT.getVectorNumElements() == (unsigned)NumOps &&
+ "Incorrect element count in BUILD_VECTOR!");
+
+ // BUILD_VECTOR of UNDEFs is UNDEF.
+ if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+  // A BUILD_VECTOR of sequential EXTRACT_VECTOR_ELTs from a single source
+  // vector with the same type as the result is just that source vector.
+ SDValue IdentitySrc;
+ bool IsIdentity = true;
+ for (int i = 0; i != NumOps; ++i) {
+ if (Ops[i].getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Ops[i].getOperand(0).getValueType() != VT ||
+ (IdentitySrc && Ops[i].getOperand(0) != IdentitySrc) ||
+ !isa<ConstantSDNode>(Ops[i].getOperand(1)) ||
+ cast<ConstantSDNode>(Ops[i].getOperand(1))->getAPIntValue() != i) {
+ IsIdentity = false;
+ break;
+ }
+ IdentitySrc = Ops[i].getOperand(0);
+ }
+ if (IsIdentity)
+ return IdentitySrc;
+
+ return SDValue();
+}
+
+/// Try to simplify vector concatenation to an input value, undef, or build
+/// vector.
+static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
+ ArrayRef<SDValue> Ops,
+ SelectionDAG &DAG) {
+ assert(!Ops.empty() && "Can't concatenate an empty list of vectors!");
+ assert(llvm::all_of(Ops,
+ [Ops](SDValue Op) {
+ return Ops[0].getValueType() == Op.getValueType();
+ }) &&
+ "Concatenation of vectors with inconsistent value types!");
+ assert((Ops[0].getValueType().getVectorElementCount() * Ops.size()) ==
+ VT.getVectorElementCount() &&
+ "Incorrect element count in vector concatenation!");
+
+ if (Ops.size() == 1)
+ return Ops[0];
+
+ // Concat of UNDEFs is UNDEF.
+ if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+ // Scan the operands and look for extract operations from a single source
+ // that correspond to insertion at the same location via this concatenation:
+ // concat (extract X, 0*subvec_elts), (extract X, 1*subvec_elts), ...
+ SDValue IdentitySrc;
+ bool IsIdentity = true;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ SDValue Op = Ops[i];
+ unsigned IdentityIndex = i * Op.getValueType().getVectorMinNumElements();
+ if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Op.getOperand(0).getValueType() != VT ||
+ (IdentitySrc && Op.getOperand(0) != IdentitySrc) ||
+ Op.getConstantOperandVal(1) != IdentityIndex) {
+ IsIdentity = false;
+ break;
+ }
+ assert((!IdentitySrc || IdentitySrc == Op.getOperand(0)) &&
+ "Unexpected identity source vector for concat of extracts");
+ IdentitySrc = Op.getOperand(0);
+ }
+ if (IsIdentity) {
+ assert(IdentitySrc && "Failed to set source vector of extracts");
+ return IdentitySrc;
+ }
+
+ // The code below this point is only designed to work for fixed width
+ // vectors, so we bail out for now.
+ if (VT.isScalableVector())
+ return SDValue();
+
+ // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
+ // simplified to one big BUILD_VECTOR.
+ // FIXME: Add support for SCALAR_TO_VECTOR as well.
+ EVT SVT = VT.getScalarType();
+ SmallVector<SDValue, 16> Elts;
+ for (SDValue Op : Ops) {
+ EVT OpVT = Op.getValueType();
+ if (Op.isUndef())
+ Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT));
+ else if (Op.getOpcode() == ISD::BUILD_VECTOR)
+ Elts.append(Op->op_begin(), Op->op_end());
+ else
+ return SDValue();
+ }
+
+ // BUILD_VECTOR requires all inputs to be of the same type, find the
+ // maximum type and extend them all.
+ for (SDValue Op : Elts)
+ SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+
+ if (SVT.bitsGT(VT.getScalarType())) {
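+    // Widen every element: undefs become undef of the wider type; other
+    // elements are zero-extended when the target considers that free and
+    // sign-extended otherwise.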
+ for (SDValue &Op : Elts) {
+ if (Op.isUndef())
+ Op = DAG.getUNDEF(SVT);
+ else
+ Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT)
+ ? DAG.getZExtOrTrunc(Op, DL, SVT)
+ : DAG.getSExtOrTrunc(Op, DL, SVT);
+ }
+ }
+
+ SDValue V = DAG.getBuildVector(VT, DL, Elts);
+ NewSDValueDbgMsg(V, "New node fold concat vectors: ", &DAG);
+ return V;
+}
+
+/// Gets or creates the specified node.
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, getVTList(VT), std::nullopt);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+ getVTList(VT));
+ CSEMap.InsertNode(N, IP);
+
+ InsertNode(N);
+ SDValue V = SDValue(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1) {
+ SDNodeFlags Flags;
+ if (Inserter)
+ Flags = Inserter->getFlags();
+ return getNode(Opcode, DL, VT, N1, Flags);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, const SDNodeFlags Flags) {
+ assert(N1.getOpcode() != ISD::DELETED_NODE && "Operand is DELETED_NODE!");
+  // Constant fold unary operations with an integer constant operand. Even
+  // opaque constants are folded, because folding a unary operation
+  // doesn't create new constants with different values. Nevertheless, the
+ // opaque flag is preserved during folding to prevent future folding with
+ // other constants.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+ const APInt &Val = C->getAPIntValue();
+ switch (Opcode) {
+ default: break;
+ case ISD::SIGN_EXTEND:
+ return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
+ C->isTargetOpcode(), C->isOpaque());
+ case ISD::TRUNCATE:
+ if (C->isOpaque())
+ break;
+ [[fallthrough]];
+ case ISD::ZERO_EXTEND:
+ return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
+ C->isTargetOpcode(), C->isOpaque());
+ case ISD::ANY_EXTEND:
+      // Some targets, such as RISC-V, prefer to sign-extend some types.
+ if (TLI->isSExtCheaperThanZExt(N1.getValueType(), VT))
+ return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
+ C->isTargetOpcode(), C->isOpaque());
+ return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
+ C->isTargetOpcode(), C->isOpaque());
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP: {
+ APFloat apf(EVTToAPFloatSemantics(VT),
+ APInt::getZero(VT.getSizeInBits()));
+ (void)apf.convertFromAPInt(Val,
+ Opcode==ISD::SINT_TO_FP,
+ APFloat::rmNearestTiesToEven);
+ return getConstantFP(apf, DL, VT);
+ }
+ case ISD::BITCAST:
+ if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
+ return getConstantFP(APFloat(APFloat::IEEEhalf(), Val), DL, VT);
+ if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
+ return getConstantFP(APFloat(APFloat::IEEEsingle(), Val), DL, VT);
+ if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
+ return getConstantFP(APFloat(APFloat::IEEEdouble(), Val), DL, VT);
+ if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
+ return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
+ break;
+ case ISD::ABS:
+ return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::BITREVERSE:
+ return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::BSWAP:
+ return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::CTPOP:
+ return getConstant(Val.popcount(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ return getConstant(Val.countl_zero(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ return getConstant(Val.countr_zero(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::FP16_TO_FP:
+ case ISD::BF16_TO_FP: {
+ bool Ignored;
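+      // The integer operand may be wider than 16 bits (e.g. promoted to i32);
+      // only the low 16 bits hold the half/bfloat bit pattern.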
+ APFloat FPV(Opcode == ISD::FP16_TO_FP ? APFloat::IEEEhalf()
+ : APFloat::BFloat(),
+ (Val.getBitWidth() == 16) ? Val : Val.trunc(16));
+
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)FPV.convert(EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven, &Ignored);
+ return getConstantFP(FPV, DL, VT);
+ }
+ case ISD::STEP_VECTOR: {
+ if (SDValue V = FoldSTEP_VECTOR(DL, VT, N1, *this))
+ return V;
+ break;
+ }
+ }
+ }
+
+ // Constant fold unary operations with a floating point constant operand.
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N1)) {
+ APFloat V = C->getValueAPF(); // make copy
+ switch (Opcode) {
+ case ISD::FNEG:
+ V.changeSign();
+ return getConstantFP(V, DL, VT);
+ case ISD::FABS:
+ V.clearSign();
+ return getConstantFP(V, DL, VT);
+ case ISD::FCEIL: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, DL, VT);
+ break;
+ }
+ case ISD::FTRUNC: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, DL, VT);
+ break;
+ }
+ case ISD::FFLOOR: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, DL, VT);
+ break;
+ }
+ case ISD::FP_EXTEND: {
+ bool ignored;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)V.convert(EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven, &ignored);
+ return getConstantFP(V, DL, VT);
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ bool ignored;
+ APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
+ // FIXME need to be more flexible about rounding mode.
+ APFloat::opStatus s =
+ V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
+ if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
+ break;
+ return getConstant(IntVal, DL, VT);
+ }
+ case ISD::BITCAST:
+ if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
+ return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
+ if (VT == MVT::i16 && C->getValueType(0) == MVT::bf16)
+ return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
+ if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+ return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
+ if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
+ return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
+ break;
+ case ISD::FP_TO_FP16:
+ case ISD::FP_TO_BF16: {
+ bool Ignored;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)V.convert(Opcode == ISD::FP_TO_FP16 ? APFloat::IEEEhalf()
+ : APFloat::BFloat(),
+ APFloat::rmNearestTiesToEven, &Ignored);
+ return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
+ }
+ }
+ }
+
+ // Constant fold unary operations with a vector integer or float operand.
+ switch (Opcode) {
+ default:
+ // FIXME: Entirely reasonable to perform folding of other unary
+ // operations here as the need arises.
+ break;
+ case ISD::FNEG:
+ case ISD::FABS:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FFLOOR:
+ case ISD::FP_EXTEND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::TRUNCATE:
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::ABS:
+ case ISD::BITREVERSE:
+ case ISD::BSWAP:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTPOP: {
+    SDValue Ops[] = {N1};
+ if (SDValue Fold = FoldConstantArithmetic(Opcode, DL, VT, Ops))
+ return Fold;
+ }
+ }
+
+ unsigned OpOpcode = N1.getNode()->getOpcode();
+ switch (Opcode) {
+ case ISD::STEP_VECTOR:
+ assert(VT.isScalableVector() &&
+ "STEP_VECTOR can only be used with scalable types");
+ assert(OpOpcode == ISD::TargetConstant &&
+ VT.getVectorElementType() == N1.getValueType() &&
+ "Unexpected step operand");
+ break;
+ case ISD::FREEZE:
+ assert(VT == N1.getValueType() && "Unexpected VT!");
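+    // freeze(x) -> x when x is already known to be neither undef nor poison.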
+ if (isGuaranteedNotToBeUndefOrPoison(N1, /*PoisonOnly*/ false,
+ /*Depth*/ 1))
+ return N1;
+ break;
+ case ISD::TokenFactor:
+ case ISD::MERGE_VALUES:
+ case ISD::CONCAT_VECTORS:
+ return N1; // Factor, merge or concat of one node? No need.
+ case ISD::BUILD_VECTOR: {
+ // Attempt to simplify BUILD_VECTOR.
+ SDValue Ops[] = {N1};
+ if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
+ case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
+ case ISD::FP_EXTEND:
+ assert(VT.isFloatingPoint() && N1.getValueType().isFloatingPoint() &&
+ "Invalid FP cast!");
+ if (N1.getValueType() == VT) return N1; // noop conversion.
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
+ "Vector element count mismatch!");
+ assert(N1.getValueType().bitsLT(VT) && "Invalid fpext node, dst < src!");
+ if (N1.isUndef())
+ return getUNDEF(VT);
+ break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ if (N1.isUndef())
+ return getUNDEF(VT);
+ break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ // [us]itofp(undef) = 0, because the result value is bounded.
+ if (N1.isUndef())
+ return getConstantFP(0.0, DL, VT);
+ break;
+ case ISD::SIGN_EXTEND:
+ assert(VT.isInteger() && N1.getValueType().isInteger() &&
+ "Invalid SIGN_EXTEND!");
+ assert(VT.isVector() == N1.getValueType().isVector() &&
+ "SIGN_EXTEND result type type should be vector iff the operand "
+ "type is vector!");
+ if (N1.getValueType() == VT) return N1; // noop extension
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
+ "Vector element count mismatch!");
+ assert(N1.getValueType().bitsLT(VT) && "Invalid sext node, dst < src!");
+ if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
+ return getNode(OpOpcode, DL, VT, N1.getOperand(0));
+ if (OpOpcode == ISD::UNDEF)
+ // sext(undef) = 0, because the top bits will all be the same.
+ return getConstant(0, DL, VT);
+ break;
+ case ISD::ZERO_EXTEND:
+ assert(VT.isInteger() && N1.getValueType().isInteger() &&
+ "Invalid ZERO_EXTEND!");
+ assert(VT.isVector() == N1.getValueType().isVector() &&
+ "ZERO_EXTEND result type type should be vector iff the operand "
+ "type is vector!");
+ if (N1.getValueType() == VT) return N1; // noop extension
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
+ "Vector element count mismatch!");
+ assert(N1.getValueType().bitsLT(VT) && "Invalid zext node, dst < src!");
+ if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
+ return getNode(ISD::ZERO_EXTEND, DL, VT, N1.getOperand(0));
+ if (OpOpcode == ISD::UNDEF)
+ // zext(undef) = 0, because the top bits will be zero.
+ return getConstant(0, DL, VT);
+ break;
+ case ISD::ANY_EXTEND:
+ assert(VT.isInteger() && N1.getValueType().isInteger() &&
+ "Invalid ANY_EXTEND!");
+ assert(VT.isVector() == N1.getValueType().isVector() &&
+ "ANY_EXTEND result type type should be vector iff the operand "
+ "type is vector!");
+ if (N1.getValueType() == VT) return N1; // noop extension
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
+ "Vector element count mismatch!");
+ assert(N1.getValueType().bitsLT(VT) && "Invalid anyext node, dst < src!");
+
+ if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+ OpOpcode == ISD::ANY_EXTEND)
+ // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
+ return getNode(OpOpcode, DL, VT, N1.getOperand(0));
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+
+ // (ext (trunc x)) -> x
+ if (OpOpcode == ISD::TRUNCATE) {
+ SDValue OpOp = N1.getOperand(0);
+ if (OpOp.getValueType() == VT) {
+ transferDbgValues(N1, OpOp);
+ return OpOp;
+ }
+ }
+ break;
+ case ISD::TRUNCATE:
+ assert(VT.isInteger() && N1.getValueType().isInteger() &&
+ "Invalid TRUNCATE!");
+ assert(VT.isVector() == N1.getValueType().isVector() &&
+ "TRUNCATE result type type should be vector iff the operand "
+ "type is vector!");
+ if (N1.getValueType() == VT) return N1; // noop truncate
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
+ "Vector element count mismatch!");
+ assert(N1.getValueType().bitsGT(VT) && "Invalid truncate node, src < dst!");
+ if (OpOpcode == ISD::TRUNCATE)
+ return getNode(ISD::TRUNCATE, DL, VT, N1.getOperand(0));
+ if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+ OpOpcode == ISD::ANY_EXTEND) {
+ // If the source is smaller than the dest, we still need an extend.
+ if (N1.getOperand(0).getValueType().getScalarType().bitsLT(
+ VT.getScalarType()))
+ return getNode(OpOpcode, DL, VT, N1.getOperand(0));
+ if (N1.getOperand(0).getValueType().bitsGT(VT))
+ return getNode(ISD::TRUNCATE, DL, VT, N1.getOperand(0));
+ return N1.getOperand(0);
+ }
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
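+    // (trunc (vscale C)) -> (vscale (trunc C)), but only while new nodes are
+    // not required to have legal types.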
+ if (OpOpcode == ISD::VSCALE && !NewNodesMustHaveLegalTypes)
+ return getVScale(DL, VT,
+ N1.getConstantOperandAPInt(0).trunc(VT.getSizeInBits()));
+ break;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ assert(VT.isVector() && "This DAG node is restricted to vector types.");
+ assert(N1.getValueType().bitsLE(VT) &&
+ "The input must be the same size or smaller than the result.");
+ assert(VT.getVectorMinNumElements() <
+ N1.getValueType().getVectorMinNumElements() &&
+ "The destination vector type must have fewer lanes than the input.");
+ break;
+ case ISD::ABS:
+ assert(VT.isInteger() && VT == N1.getValueType() && "Invalid ABS!");
+ if (OpOpcode == ISD::UNDEF)
+ return getConstant(0, DL, VT);
+ break;
+ case ISD::BSWAP:
+ assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BSWAP!");
+ assert((VT.getScalarSizeInBits() % 16 == 0) &&
+ "BSWAP types must be a multiple of 16 bits!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ // bswap(bswap(X)) -> X.
+ if (OpOpcode == ISD::BSWAP)
+ return N1.getOperand(0);
+ break;
+ case ISD::BITREVERSE:
+ assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BITREVERSE!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
+ case ISD::BITCAST:
+ assert(VT.getSizeInBits() == N1.getValueSizeInBits() &&
+ "Cannot BITCAST between types of different sizes!");
+ if (VT == N1.getValueType()) return N1; // noop conversion.
+ if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x)
+ return getNode(ISD::BITCAST, DL, VT, N1.getOperand(0));
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
+ case ISD::SCALAR_TO_VECTOR:
+ assert(VT.isVector() && !N1.getValueType().isVector() &&
+ (VT.getVectorElementType() == N1.getValueType() ||
+ (VT.getVectorElementType().isInteger() &&
+ N1.getValueType().isInteger() &&
+ VT.getVectorElementType().bitsLE(N1.getValueType()))) &&
+ "Illegal SCALAR_TO_VECTOR node!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
+ if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(N1.getOperand(1)) &&
+ N1.getConstantOperandVal(1) == 0 &&
+ N1.getOperand(0).getValueType() == VT)
+ return N1.getOperand(0);
+ break;
+ case ISD::FNEG:
+ // Negation of an unknown bag of bits is still completely undefined.
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+
+ if (OpOpcode == ISD::FNEG) // --X -> X
+ return N1.getOperand(0);
+ break;
+ case ISD::FABS:
+ if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
+ return getNode(ISD::FABS, DL, VT, N1.getOperand(0));
+ break;
+ case ISD::VSCALE:
+ assert(VT == N1.getValueType() && "Unexpected VT!");
+ break;
+ case ISD::CTPOP:
+ if (N1.getValueType().getScalarType() == MVT::i1)
+ return N1;
+ break;
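+  // For an i1 value, ctlz/cttz is 1 when the input is 0 and 0 when it is 1,
+  // i.e. a logical NOT.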
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ if (N1.getValueType().getScalarType() == MVT::i1)
+ return getNOT(DL, N1, N1.getValueType());
+ break;
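+  // For i1 vectors the reductions collapse to bit operations: ADD is XOR
+  // (parity), SMIN/UMAX are OR (any element set), and SMAX/UMIN are AND (all
+  // elements set).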
+ case ISD::VECREDUCE_ADD:
+ if (N1.getValueType().getScalarType() == MVT::i1)
+ return getNode(ISD::VECREDUCE_XOR, DL, VT, N1);
+ break;
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ if (N1.getValueType().getScalarType() == MVT::i1)
+ return getNode(ISD::VECREDUCE_OR, DL, VT, N1);
+ break;
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_UMIN:
+ if (N1.getValueType().getScalarType() == MVT::i1)
+ return getNode(ISD::VECREDUCE_AND, DL, VT, N1);
+ break;
+ }
+
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = {N1};
+ if (VT != MVT::Glue) { // Don't CSE flag producing nodes
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ E->intersectFlagsWith(Flags);
+ return SDValue(E, 0);
+ }
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ N->setFlags(Flags);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ }
+
+ InsertNode(N);
+ SDValue V = SDValue(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+static std::optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
+ const APInt &C2) {
+ switch (Opcode) {
+ case ISD::ADD: return C1 + C2;
+ case ISD::SUB: return C1 - C2;
+ case ISD::MUL: return C1 * C2;
+ case ISD::AND: return C1 & C2;
+ case ISD::OR: return C1 | C2;
+ case ISD::XOR: return C1 ^ C2;
+ case ISD::SHL: return C1 << C2;
+ case ISD::SRL: return C1.lshr(C2);
+ case ISD::SRA: return C1.ashr(C2);
+ case ISD::ROTL: return C1.rotl(C2);
+ case ISD::ROTR: return C1.rotr(C2);
+ case ISD::SMIN: return C1.sle(C2) ? C1 : C2;
+ case ISD::SMAX: return C1.sge(C2) ? C1 : C2;
+ case ISD::UMIN: return C1.ule(C2) ? C1 : C2;
+ case ISD::UMAX: return C1.uge(C2) ? C1 : C2;
+ case ISD::SADDSAT: return C1.sadd_sat(C2);
+ case ISD::UADDSAT: return C1.uadd_sat(C2);
+ case ISD::SSUBSAT: return C1.ssub_sat(C2);
+ case ISD::USUBSAT: return C1.usub_sat(C2);
+ case ISD::SSHLSAT: return C1.sshl_sat(C2);
+ case ISD::USHLSAT: return C1.ushl_sat(C2);
+ case ISD::UDIV:
+ if (!C2.getBoolValue())
+ break;
+ return C1.udiv(C2);
+ case ISD::UREM:
+ if (!C2.getBoolValue())
+ break;
+ return C1.urem(C2);
+ case ISD::SDIV:
+ if (!C2.getBoolValue())
+ break;
+ return C1.sdiv(C2);
+ case ISD::SREM:
+ if (!C2.getBoolValue())
+ break;
+ return C1.srem(C2);
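+  // MULHS/MULHU return the high half of the double-width product: extend both
+  // operands to twice the width, multiply, and take the upper BitWidth bits.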
+ case ISD::MULHS: {
+ unsigned FullWidth = C1.getBitWidth() * 2;
+ APInt C1Ext = C1.sext(FullWidth);
+ APInt C2Ext = C2.sext(FullWidth);
+ return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
+ }
+ case ISD::MULHU: {
+ unsigned FullWidth = C1.getBitWidth() * 2;
+ APInt C1Ext = C1.zext(FullWidth);
+ APInt C2Ext = C2.zext(FullWidth);
+ return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
+ }
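+  // The averaging ops compute (C1 + C2 [+ 1]) >> 1 in a type one bit wider so
+  // the intermediate sum cannot overflow; extractBits(BitWidth, 1) performs
+  // the final shift right by one.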
+ case ISD::AVGFLOORS: {
+ unsigned FullWidth = C1.getBitWidth() + 1;
+ APInt C1Ext = C1.sext(FullWidth);
+ APInt C2Ext = C2.sext(FullWidth);
+ return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+ }
+ case ISD::AVGFLOORU: {
+ unsigned FullWidth = C1.getBitWidth() + 1;
+ APInt C1Ext = C1.zext(FullWidth);
+ APInt C2Ext = C2.zext(FullWidth);
+ return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+ }
+ case ISD::AVGCEILS: {
+ unsigned FullWidth = C1.getBitWidth() + 1;
+ APInt C1Ext = C1.sext(FullWidth);
+ APInt C2Ext = C2.sext(FullWidth);
+ return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+ }
+ case ISD::AVGCEILU: {
+ unsigned FullWidth = C1.getBitWidth() + 1;
+ APInt C1Ext = C1.zext(FullWidth);
+ APInt C2Ext = C2.zext(FullWidth);
+ return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+ }
+ case ISD::ABDS:
+ return APIntOps::smax(C1, C2) - APIntOps::smin(C1, C2);
+ case ISD::ABDU:
+ return APIntOps::umax(C1, C2) - APIntOps::umin(C1, C2);
+ }
+ return std::nullopt;
+}
+
+// Handle constant folding with UNDEF.
+// TODO: Handle more cases.
+static std::optional<APInt> FoldValueWithUndef(unsigned Opcode, const APInt &C1,
+ bool IsUndef1, const APInt &C2,
+ bool IsUndef2) {
+ if (!(IsUndef1 || IsUndef2))
+ return FoldValue(Opcode, C1, C2);
+
+ // Fold and(x, undef) -> 0
+ // Fold mul(x, undef) -> 0
+ if (Opcode == ISD::AND || Opcode == ISD::MUL)
+ return APInt::getZero(C1.getBitWidth());
+
+ return std::nullopt;
+}
+
+SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
+ const GlobalAddressSDNode *GA,
+ const SDNode *N2) {
+ if (GA->getOpcode() != ISD::GlobalAddress)
+ return SDValue();
+ if (!TLI->isOffsetFoldingLegal(GA))
+ return SDValue();
+ auto *C2 = dyn_cast<ConstantSDNode>(N2);
+ if (!C2)
+ return SDValue();
+ int64_t Offset = C2->getSExtValue();
+ switch (Opcode) {
+ case ISD::ADD: break;
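+  // Negate via uint64_t so that Offset == INT64_MIN does not invoke
+  // signed-overflow UB.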
+ case ISD::SUB: Offset = -uint64_t(Offset); break;
+ default: return SDValue();
+ }
+ return getGlobalAddress(GA->getGlobal(), SDLoc(C2), VT,
+ GA->getOffset() + uint64_t(Offset));
+}
+
+bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
+ switch (Opcode) {
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM: {
+ // If a divisor is zero/undef or any element of a divisor vector is
+ // zero/undef, the whole op is undef.
+ assert(Ops.size() == 2 && "Div/rem should have 2 operands");
+ SDValue Divisor = Ops[1];
+ if (Divisor.isUndef() || isNullConstant(Divisor))
+ return true;
+
+    return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
+           llvm::any_of(Divisor->op_values(), [](SDValue V) {
+             return V.isUndef() || isNullConstant(V);
+           });
+ // TODO: Handle signed overflow.
+ }
+ // TODO: Handle oversized shifts.
+ default:
+ return false;
+ }
+}
+
+SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
+ EVT VT, ArrayRef<SDValue> Ops) {
+ // If the opcode is a target-specific ISD node, there's nothing we can
+ // do here and the operand rules may not line up with the below, so
+ // bail early.
+ // We can't create a scalar CONCAT_VECTORS so skip it. It will break
+ // for concats involving SPLAT_VECTOR. Concats of BUILD_VECTORS are handled by
+ // foldCONCAT_VECTORS in getNode before this is called.
+ if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS)
+ return SDValue();
+
+ unsigned NumOps = Ops.size();
+ if (NumOps == 0)
+ return SDValue();
+
+ if (isUndef(Opcode, Ops))
+ return getUNDEF(VT);
+
+ // Handle binops special cases.
+ if (NumOps == 2) {
+ if (SDValue CFP = foldConstantFPMath(Opcode, DL, VT, Ops[0], Ops[1]))
+ return CFP;
+
+ if (auto *C1 = dyn_cast<ConstantSDNode>(Ops[0])) {
+ if (auto *C2 = dyn_cast<ConstantSDNode>(Ops[1])) {
+ if (C1->isOpaque() || C2->isOpaque())
+ return SDValue();
+
+ std::optional<APInt> FoldAttempt =
+ FoldValue(Opcode, C1->getAPIntValue(), C2->getAPIntValue());
+ if (!FoldAttempt)
+ return SDValue();
+
+ SDValue Folded = getConstant(*FoldAttempt, DL, VT);
+ assert((!Folded || !VT.isVector()) &&
+ "Can't fold vectors ops with scalar operands");
+ return Folded;
+ }
+ }
+
+ // fold (add Sym, c) -> Sym+c
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ops[0]))
+ return FoldSymbolOffset(Opcode, VT, GA, Ops[1].getNode());
+ if (TLI->isCommutativeBinOp(Opcode))
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ops[1]))
+ return FoldSymbolOffset(Opcode, VT, GA, Ops[0].getNode());
+ }
+
+ // This is for vector folding only from here on.
+ if (!VT.isVector())
+ return SDValue();
+
+ ElementCount NumElts = VT.getVectorElementCount();
+
+ // See if we can fold through bitcasted integer ops.
+ if (NumOps == 2 && VT.isFixedLengthVector() && VT.isInteger() &&
+ Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
+ Ops[0].getOpcode() == ISD::BITCAST &&
+ Ops[1].getOpcode() == ISD::BITCAST) {
+ SDValue N1 = peekThroughBitcasts(Ops[0]);
+ SDValue N2 = peekThroughBitcasts(Ops[1]);
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+ auto *BV2 = dyn_cast<BuildVectorSDNode>(N2);
+ EVT BVVT = N1.getValueType();
+ if (BV1 && BV2 && BVVT.isInteger() && BVVT == N2.getValueType()) {
+ bool IsLE = getDataLayout().isLittleEndian();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ SmallVector<APInt> RawBits1, RawBits2;
+ BitVector UndefElts1, UndefElts2;
+ if (BV1->getConstantRawBits(IsLE, EltBits, RawBits1, UndefElts1) &&
+ BV2->getConstantRawBits(IsLE, EltBits, RawBits2, UndefElts2)) {
+ SmallVector<APInt> RawBits;
+ for (unsigned I = 0, E = NumElts.getFixedValue(); I != E; ++I) {
+ std::optional<APInt> Fold = FoldValueWithUndef(
+ Opcode, RawBits1[I], UndefElts1[I], RawBits2[I], UndefElts2[I]);
+ if (!Fold)
+ break;
+ RawBits.push_back(*Fold);
+ }
+ if (RawBits.size() == NumElts.getFixedValue()) {
+ // We have constant folded, but we need to cast this again back to
+ // the original (possibly legalized) type.
+ SmallVector<APInt> DstBits;
+ BitVector DstUndefs;
+ BuildVectorSDNode::recastRawBits(IsLE, BVVT.getScalarSizeInBits(),
+ DstBits, RawBits, DstUndefs,
+ BitVector(RawBits.size(), false));
+ EVT BVEltVT = BV1->getOperand(0).getValueType();
+ unsigned BVEltBits = BVEltVT.getSizeInBits();
+ SmallVector<SDValue> Ops(DstBits.size(), getUNDEF(BVEltVT));
+ for (unsigned I = 0, E = DstBits.size(); I != E; ++I) {
+ if (DstUndefs[I])
+ continue;
+ Ops[I] = getConstant(DstBits[I].sext(BVEltBits), DL, BVEltVT);
+ }
+ return getBitcast(VT, getBuildVector(BVVT, DL, Ops));
+ }
+ }
+ }
+ }
+
+ // Fold (mul step_vector(C0), C1) to (step_vector(C0 * C1)).
+ // (shl step_vector(C0), C1) -> (step_vector(C0 << C1))
+ if ((Opcode == ISD::MUL || Opcode == ISD::SHL) &&
+ Ops[0].getOpcode() == ISD::STEP_VECTOR) {
+ APInt RHSVal;
+ if (ISD::isConstantSplatVector(Ops[1].getNode(), RHSVal)) {
+ APInt NewStep = Opcode == ISD::MUL
+ ? Ops[0].getConstantOperandAPInt(0) * RHSVal
+ : Ops[0].getConstantOperandAPInt(0) << RHSVal;
+ return getStepVector(DL, VT, NewStep);
+ }
+ }
+
+ auto IsScalarOrSameVectorSize = [NumElts](const SDValue &Op) {
+ return !Op.getValueType().isVector() ||
+ Op.getValueType().getVectorElementCount() == NumElts;
+ };
+
+ auto IsBuildVectorSplatVectorOrUndef = [](const SDValue &Op) {
+ return Op.isUndef() || Op.getOpcode() == ISD::CONDCODE ||
+ Op.getOpcode() == ISD::BUILD_VECTOR ||
+ Op.getOpcode() == ISD::SPLAT_VECTOR;
+ };
+
+  // Every operand must be either a scalar (e.g. a CONDCODE) or a vector with
+  // the same element count as the result, and each operand must be UNDEF, a
+  // CONDCODE, a BUILD_VECTOR, or a SPLAT_VECTOR.
+ if (!llvm::all_of(Ops, IsBuildVectorSplatVectorOrUndef) ||
+ !llvm::all_of(Ops, IsScalarOrSameVectorSize))
+ return SDValue();
+
+  // If we are comparing vectors, then the result needs to be an i1 boolean that
+ // is then extended back to the legal result type depending on how booleans
+ // are represented.
+ EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType());
+ ISD::NodeType ExtendCode =
+ (Opcode == ISD::SETCC && SVT != VT.getScalarType())
+ ? TargetLowering::getExtendForContent(TLI->getBooleanContents(VT))
+ : ISD::SIGN_EXTEND;
+
+  // Find a legal integer scalar type for constant promotion and ensure that
+  // its scalar size is at least as large as the source scalar type.
+ EVT LegalSVT = VT.getScalarType();
+ if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
+ LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
+ if (LegalSVT.bitsLT(VT.getScalarType()))
+ return SDValue();
+ }
+
+ // For scalable vector types we know we're dealing with SPLAT_VECTORs. We
+ // only have one operand to check. For fixed-length vector types we may have
+ // a combination of BUILD_VECTOR and SPLAT_VECTOR.
+ unsigned NumVectorElts = NumElts.isScalable() ? 1 : NumElts.getFixedValue();
+
+ // Constant fold each scalar lane separately.
+ SmallVector<SDValue, 4> ScalarResults;
+ for (unsigned I = 0; I != NumVectorElts; I++) {
+ SmallVector<SDValue, 4> ScalarOps;
+ for (SDValue Op : Ops) {
+ EVT InSVT = Op.getValueType().getScalarType();
+ if (Op.getOpcode() != ISD::BUILD_VECTOR &&
+ Op.getOpcode() != ISD::SPLAT_VECTOR) {
+ if (Op.isUndef())
+ ScalarOps.push_back(getUNDEF(InSVT));
+ else
+ ScalarOps.push_back(Op);
+ continue;
+ }
+
+ SDValue ScalarOp =
+ Op.getOperand(Op.getOpcode() == ISD::SPLAT_VECTOR ? 0 : I);
+ EVT ScalarVT = ScalarOp.getValueType();
+
+ // Build vector (integer) scalar operands may need implicit
+ // truncation - do this before constant folding.
+ if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT)) {
+ // Don't create illegally-typed nodes unless they're constants or undef
+ // - if we fail to constant fold we can't guarantee the (dead) nodes
+ // we're creating will be cleaned up before being visited for
+ // legalization.
+ if (NewNodesMustHaveLegalTypes && !ScalarOp.isUndef() &&
+ !isa<ConstantSDNode>(ScalarOp) &&
+ TLI->getTypeAction(*getContext(), InSVT) !=
+ TargetLowering::TypeLegal)
+ return SDValue();
+ ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp);
+ }
+
+ ScalarOps.push_back(ScalarOp);
+ }
+
+ // Constant fold the scalar operands.
+ SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps);
+
+ // Legalize the (integer) scalar constant if necessary.
+ if (LegalSVT != SVT)
+ ScalarResult = getNode(ExtendCode, DL, LegalSVT, ScalarResult);
+
+ // Scalar folding only succeeded if the result is a constant or UNDEF.
+ if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
+ ScalarResult.getOpcode() != ISD::ConstantFP)
+ return SDValue();
+ ScalarResults.push_back(ScalarResult);
+ }
+
+ SDValue V = NumElts.isScalable() ? getSplatVector(VT, DL, ScalarResults[0])
+ : getBuildVector(VT, DL, ScalarResults);
+ NewSDValueDbgMsg(V, "New node fold constant vector: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL,
+ EVT VT, SDValue N1, SDValue N2) {
+ // TODO: We don't do any constant folding for strict FP opcodes here, but we
+ // should. That will require dealing with a potentially non-default
+ // rounding mode, checking the "opStatus" return value from the APFloat
+ // math calculations, and possibly other variations.
+ ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, /*AllowUndefs*/ false);
+ ConstantFPSDNode *N2CFP = isConstOrConstSplatFP(N2, /*AllowUndefs*/ false);
+ if (N1CFP && N2CFP) {
+ APFloat C1 = N1CFP->getValueAPF(); // make copy
+ const APFloat &C2 = N2CFP->getValueAPF();
+ switch (Opcode) {
+ case ISD::FADD:
+ C1.add(C2, APFloat::rmNearestTiesToEven);
+ return getConstantFP(C1, DL, VT);
+ case ISD::FSUB:
+ C1.subtract(C2, APFloat::rmNearestTiesToEven);
+ return getConstantFP(C1, DL, VT);
+ case ISD::FMUL:
+ C1.multiply(C2, APFloat::rmNearestTiesToEven);
+ return getConstantFP(C1, DL, VT);
+ case ISD::FDIV:
+ C1.divide(C2, APFloat::rmNearestTiesToEven);
+ return getConstantFP(C1, DL, VT);
+ case ISD::FREM:
+ C1.mod(C2);
+ return getConstantFP(C1, DL, VT);
+ case ISD::FCOPYSIGN:
+ C1.copySign(C2);
+ return getConstantFP(C1, DL, VT);
+ case ISD::FMINNUM:
+ return getConstantFP(minnum(C1, C2), DL, VT);
+ case ISD::FMAXNUM:
+ return getConstantFP(maxnum(C1, C2), DL, VT);
+ case ISD::FMINIMUM:
+ return getConstantFP(minimum(C1, C2), DL, VT);
+ case ISD::FMAXIMUM:
+ return getConstantFP(maximum(C1, C2), DL, VT);
+ default: break;
+ }
+ }
+ if (N1CFP && Opcode == ISD::FP_ROUND) {
+ APFloat C1 = N1CFP->getValueAPF(); // make copy
+ bool Unused;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void) C1.convert(EVTToAPFloatSemantics(VT), APFloat::rmNearestTiesToEven,
+ &Unused);
+ return getConstantFP(C1, DL, VT);
+ }
+
+ switch (Opcode) {
+ case ISD::FSUB:
+ // -0.0 - undef --> undef (consistent with "fneg undef")
+ if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, /*AllowUndefs*/ true))
+ if (N1C && N1C->getValueAPF().isNegZero() && N2.isUndef())
+ return getUNDEF(VT);
+ [[fallthrough]];
+
+ case ISD::FADD:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+    // If both operands are undef, the result is undef. If one operand is undef,
+ // the result is NaN. This should match the behavior of the IR optimizer.
+ if (N1.isUndef() && N2.isUndef())
+ return getUNDEF(VT);
+ if (N1.isUndef() || N2.isUndef())
+ return getConstantFP(APFloat::getNaN(EVTToAPFloatSemantics(VT)), DL, VT);
+ }
+ return SDValue();
+}
+
+SDValue SelectionDAG::getAssertAlign(const SDLoc &DL, SDValue Val, Align A) {
+ assert(Val.getValueType().isInteger() && "Invalid AssertAlign!");
+
+ // There's no need to assert on a byte-aligned pointer. All pointers are at
+ // least byte aligned.
+ if (A == Align(1))
+ return Val;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::AssertAlign, getVTList(Val.getValueType()), {Val});
+ ID.AddInteger(A.value());
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<AssertAlignSDNode>(DL.getIROrder(), DL.getDebugLoc(),
+ Val.getValueType(), A);
+ createOperands(N, {Val});
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2) {
+ SDNodeFlags Flags;
+ if (Inserter)
+ Flags = Inserter->getFlags();
+ return getNode(Opcode, DL, VT, N1, N2, Flags);
+}
+
+void SelectionDAG::canonicalizeCommutativeBinop(unsigned Opcode, SDValue &N1,
+ SDValue &N2) const {
+ if (!TLI->isCommutativeBinOp(Opcode))
+ return;
+
+ // Canonicalize:
+ // binop(const, nonconst) -> binop(nonconst, const)
+ SDNode *N1C = isConstantIntBuildVectorOrConstantInt(N1);
+ SDNode *N2C = isConstantIntBuildVectorOrConstantInt(N2);
+ SDNode *N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
+ SDNode *N2CFP = isConstantFPBuildVectorOrConstantFP(N2);
+ if ((N1C && !N2C) || (N1CFP && !N2CFP))
+ std::swap(N1, N2);
+
+ // Canonicalize:
+ // binop(splat(x), step_vector) -> binop(step_vector, splat(x))
+ else if (N1.getOpcode() == ISD::SPLAT_VECTOR &&
+ N2.getOpcode() == ISD::STEP_VECTOR)
+ std::swap(N1, N2);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, const SDNodeFlags Flags) {
+ assert(N1.getOpcode() != ISD::DELETED_NODE &&
+ N2.getOpcode() != ISD::DELETED_NODE &&
+ "Operand is DELETED_NODE!");
+
+ canonicalizeCommutativeBinop(Opcode, N1, N2);
+
+ auto *N1C = dyn_cast<ConstantSDNode>(N1);
+ auto *N2C = dyn_cast<ConstantSDNode>(N2);
+
+ // Don't allow undefs in vector splats - we might be returning N2 when folding
+ // to zero etc.
+ ConstantSDNode *N2CV =
+ isConstOrConstSplat(N2, /*AllowUndefs*/ false, /*AllowTruncation*/ true);
+
+ switch (Opcode) {
+ default: break;
+ case ISD::TokenFactor:
+ assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
+ N2.getValueType() == MVT::Other && "Invalid token factor!");
+ // Fold trivial token factors.
+ if (N1.getOpcode() == ISD::EntryToken) return N2;
+ if (N2.getOpcode() == ISD::EntryToken) return N1;
+ if (N1 == N2) return N1;
+ break;
+ case ISD::BUILD_VECTOR: {
+ // Attempt to simplify BUILD_VECTOR.
+ SDValue Ops[] = {N1, N2};
+ if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
+ case ISD::CONCAT_VECTORS: {
+ SDValue Ops[] = {N1, N2};
+ if (SDValue V = foldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
+ case ISD::AND:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ // (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
+ // worth handling here.
+ if (N2CV && N2CV->isZero())
+ return N2;
+ if (N2CV && N2CV->isAllOnes()) // X & -1 -> X
+ return N1;
+ break;
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::SUB:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ // (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so
+ // it's worth handling here.
+ if (N2CV && N2CV->isZero())
+ return N1;
+ if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1)
+ return getNode(ISD::XOR, DL, VT, N1, N2);
+ break;
+ case ISD::MUL:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+ return getNode(ISD::AND, DL, VT, N1, N2);
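+    // (mul (vscale C0), C1) -> (vscale (C0 * C1)) when the multiply is known
+    // not to wrap in the signed sense.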
+ if (N2C && (N1.getOpcode() == ISD::VSCALE) && Flags.hasNoSignedWrap()) {
+ const APInt &MulImm = N1->getConstantOperandAPInt(0);
+ const APInt &N2CImm = N2C->getAPIntValue();
+ return getVScale(DL, VT, MulImm * N2CImm);
+ }
+ break;
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::MULHU:
+ case ISD::MULHS:
+ case ISD::SDIV:
+ case ISD::SREM:
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ case ISD::UADDSAT:
+ case ISD::USUBSAT:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
+ // fold (add_sat x, y) -> (or x, y) for bool types.
+ if (Opcode == ISD::SADDSAT || Opcode == ISD::UADDSAT)
+ return getNode(ISD::OR, DL, VT, N1, N2);
+ // fold (sub_sat x, y) -> (and x, ~y) for bool types.
+ if (Opcode == ISD::SSUBSAT || Opcode == ISD::USUBSAT)
+ return getNode(ISD::AND, DL, VT, N1, getNOT(DL, N2, VT));
+ }
+ break;
+ case ISD::ABDS:
+ case ISD::ABDU:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ break;
+ case ISD::SMIN:
+ case ISD::UMAX:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+ return getNode(ISD::OR, DL, VT, N1, N2);
+ break;
+ case ISD::SMAX:
+ case ISD::UMIN:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+ return getNode(ISD::AND, DL, VT, N1, N2);
+ break;
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ if (SDValue V = simplifyFPBinop(Opcode, N1, N2, Flags))
+ return V;
+ break;
+ case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match.
+ assert(N1.getValueType() == VT &&
+ N1.getValueType().isFloatingPoint() &&
+ N2.getValueType().isFloatingPoint() &&
+ "Invalid FCOPYSIGN!");
+ break;
+ case ISD::SHL:
+ if (N2C && (N1.getOpcode() == ISD::VSCALE) && Flags.hasNoSignedWrap()) {
+ const APInt &MulImm = N1->getConstantOperandAPInt(0);
+ const APInt &ShiftImm = N2C->getAPIntValue();
+ return getVScale(DL, VT, MulImm << ShiftImm);
+ }
+ [[fallthrough]];
+ case ISD::SRA:
+ case ISD::SRL:
+ if (SDValue V = simplifyShift(N1, N2))
+ return V;
+ [[fallthrough]];
+ case ISD::ROTL:
+ case ISD::ROTR:
+    assert(VT == N1.getValueType() &&
+           "Shift return type must match the type of its first operand");
+ assert(VT.isInteger() && N2.getValueType().isInteger() &&
+ "Shifts only work on integers");
+    assert((!VT.isVector() || VT == N2.getValueType()) &&
+           "Vector shift amounts must match the type of their first operand");
+ // Verify that the shift amount VT is big enough to hold valid shift
+ // amounts. This catches things like trying to shift an i1024 value by an
+ // i8, which is easy to fall into in generic code that uses
+ // TLI.getShiftAmount().
+ assert(N2.getValueType().getScalarSizeInBits() >=
+ Log2_32_Ceil(VT.getScalarSizeInBits()) &&
+ "Invalid use of small shift amount with oversized value!");
+
+ // Always fold shifts of i1 values so the code generator doesn't need to
+ // handle them. Since we know the size of the shift has to be less than the
+ // size of the value, the shift/rotate count is guaranteed to be zero.
+ if (VT == MVT::i1)
+ return N1;
+ if (N2CV && N2CV->isZero())
+ return N1;
+ break;
+ case ISD::FP_ROUND:
+ assert(VT.isFloatingPoint() &&
+ N1.getValueType().isFloatingPoint() &&
+ VT.bitsLE(N1.getValueType()) &&
+ N2C && (N2C->getZExtValue() == 0 || N2C->getZExtValue() == 1) &&
+ "Invalid FP_ROUND!");
+ if (N1.getValueType() == VT) return N1; // noop conversion.
+ break;
+ case ISD::AssertSext:
+ case ISD::AssertZext: {
+ EVT EVT = cast<VTSDNode>(N2)->getVT();
+ assert(VT == N1.getValueType() && "Not an inreg extend!");
+ assert(VT.isInteger() && EVT.isInteger() &&
+ "Cannot *_EXTEND_INREG FP types");
+ assert(!EVT.isVector() &&
+ "AssertSExt/AssertZExt type should be the vector element type "
+ "rather than the vector type!");
+ assert(EVT.bitsLE(VT.getScalarType()) && "Not extending!");
+ if (VT.getScalarType() == EVT) return N1; // noop assertion.
+ break;
+ }
+ case ISD::SIGN_EXTEND_INREG: {
+ EVT EVT = cast<VTSDNode>(N2)->getVT();
+ assert(VT == N1.getValueType() && "Not an inreg extend!");
+ assert(VT.isInteger() && EVT.isInteger() &&
+ "Cannot *_EXTEND_INREG FP types");
+ assert(EVT.isVector() == VT.isVector() &&
+ "SIGN_EXTEND_INREG type should be vector iff the operand "
+ "type is vector!");
+ assert((!EVT.isVector() ||
+ EVT.getVectorElementCount() == VT.getVectorElementCount()) &&
+ "Vector element counts must match in SIGN_EXTEND_INREG");
+ assert(EVT.bitsLE(VT) && "Not extending!");
+ if (EVT == VT) return N1; // Not actually extending
+
+ auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
+ unsigned FromBits = EVT.getScalarSizeInBits();
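+      // Shift the narrow value into the top bits, then arithmetic-shift it back
+      // down so its sign bit is replicated across the full width.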
+ Val <<= Val.getBitWidth() - FromBits;
+ Val.ashrInPlace(Val.getBitWidth() - FromBits);
+ return getConstant(Val, DL, ConstantVT);
+ };
+
+ if (N1C) {
+ const APInt &Val = N1C->getAPIntValue();
+ return SignExtendInReg(Val, VT);
+ }
+
+ if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
+ SmallVector<SDValue, 8> Ops;
+ llvm::EVT OpVT = N1.getOperand(0).getValueType();
+ for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ SDValue Op = N1.getOperand(i);
+ if (Op.isUndef()) {
+ Ops.push_back(getUNDEF(OpVT));
+ continue;
+ }
+ ConstantSDNode *C = cast<ConstantSDNode>(Op);
+ APInt Val = C->getAPIntValue();
+ Ops.push_back(SignExtendInReg(Val, OpVT));
+ }
+ return getBuildVector(VT, DL, Ops);
+ }
+ break;
+ }
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT: {
+ assert(VT.isInteger() && cast<VTSDNode>(N2)->getVT().isInteger() &&
+ N1.getValueType().isFloatingPoint() && "Invalid FP_TO_*INT_SAT");
+ assert(N1.getValueType().isVector() == VT.isVector() &&
+ "FP_TO_*INT_SAT type should be vector iff the operand type is "
+ "vector!");
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
+ "Vector element counts must match in FP_TO_*INT_SAT");
+ assert(!cast<VTSDNode>(N2)->getVT().isVector() &&
+ "Type to saturate to must be a scalar.");
+ assert(cast<VTSDNode>(N2)->getVT().bitsLE(VT.getScalarType()) &&
+ "Not extending!");
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT:
+ assert(VT.getSizeInBits() >= N1.getValueType().getScalarSizeInBits() &&
+ "The result of EXTRACT_VECTOR_ELT must be at least as wide as the \
+ element type of the vector.");
+
+ // Extract from an undefined value or using an undefined index is undefined.
+ if (N1.isUndef() || N2.isUndef())
+ return getUNDEF(VT);
+
+ // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF for fixed length
+ // vectors. For scalable vectors we will provide appropriate support for
+ // dealing with arbitrary indices.
+ if (N2C && N1.getValueType().isFixedLengthVector() &&
+ N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
+ return getUNDEF(VT);
+
+ // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
+ // expanding copies of large vectors from registers. This only works for
+ // fixed length vectors, since we need to know the exact number of
+ // elements.
+ if (N2C && N1.getOperand(0).getValueType().isFixedLengthVector() &&
+ N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) {
+ unsigned Factor =
+ N1.getOperand(0).getValueType().getVectorNumElements();
+ return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ N1.getOperand(N2C->getZExtValue() / Factor),
+ getVectorIdxConstant(N2C->getZExtValue() % Factor, DL));
+ }
+
+ // EXTRACT_VECTOR_ELT of BUILD_VECTOR or SPLAT_VECTOR is often formed while
+ // lowering is expanding large vector constants.
+ if (N2C && (N1.getOpcode() == ISD::BUILD_VECTOR ||
+ N1.getOpcode() == ISD::SPLAT_VECTOR)) {
+ assert((N1.getOpcode() != ISD::BUILD_VECTOR ||
+ N1.getValueType().isFixedLengthVector()) &&
+ "BUILD_VECTOR used for scalable vectors");
+ unsigned Index =
+ N1.getOpcode() == ISD::BUILD_VECTOR ? N2C->getZExtValue() : 0;
+ SDValue Elt = N1.getOperand(Index);
+
+ if (VT != Elt.getValueType())
+ // If the vector element type is not legal, the BUILD_VECTOR operands
+ // are promoted and implicitly truncated, and the result implicitly
+ // extended. Make that explicit here.
+ Elt = getAnyExtOrTrunc(Elt, DL, VT);
+
+ return Elt;
+ }
+
+ // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
+ // operations are lowered to scalars.
+ if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+      // If the indices are the same, return the inserted element; if they are
+      // known to be different, extract the element from the original vector.
+ SDValue N1Op2 = N1.getOperand(2);
+ ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2);
+
+ if (N1Op2C && N2C) {
+ if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
+ if (VT == N1.getOperand(1).getValueType())
+ return N1.getOperand(1);
+ if (VT.isFloatingPoint()) {
+ assert(VT.getSizeInBits() > N1.getOperand(1).getValueType().getSizeInBits());
+ return getFPExtendOrRound(N1.getOperand(1), DL, VT);
+ }
+ return getSExtOrTrunc(N1.getOperand(1), DL, VT);
+ }
+ return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
+ }
+ }
+
+ // EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
+ // when vector types are scalarized and v1iX is legal.
+ // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx).
+ // Here we are completely ignoring the extract element index (N2),
+ // which is fine for fixed width vectors, since any index other than 0
+ // is undefined anyway. However, this cannot be ignored for scalable
+ // vectors - in theory we could support this, but we don't want to do this
+ // without a profitability check.
+ if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getValueType().isFixedLengthVector() &&
+ N1.getValueType().getVectorNumElements() == 1) {
+ return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
+ N1.getOperand(1));
+ }
+ break;
+ case ISD::EXTRACT_ELEMENT:
+ assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
+ assert(!N1.getValueType().isVector() && !VT.isVector() &&
+ (N1.getValueType().isInteger() == VT.isInteger()) &&
+ N1.getValueType() != VT &&
+ "Wrong types for EXTRACT_ELEMENT!");
+
+ // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
+ // 64-bit integers into 32-bit parts. Instead of building the extract of
+ // the BUILD_PAIR, only to have legalize rip it apart, just do it now.
+ if (N1.getOpcode() == ISD::BUILD_PAIR)
+ return N1.getOperand(N2C->getZExtValue());
+
+ // EXTRACT_ELEMENT of a constant int is also very common.
+ if (N1C) {
+ unsigned ElementSize = VT.getSizeInBits();
+ unsigned Shift = ElementSize * N2C->getZExtValue();
+ const APInt &Val = N1C->getAPIntValue();
+ return getConstant(Val.extractBits(ElementSize, Shift), DL, VT);
+ }
+ break;
+ case ISD::EXTRACT_SUBVECTOR: {
+ EVT N1VT = N1.getValueType();
+ assert(VT.isVector() && N1VT.isVector() &&
+ "Extract subvector VTs must be vectors!");
+ assert(VT.getVectorElementType() == N1VT.getVectorElementType() &&
+ "Extract subvector VTs must have the same element type!");
+ assert((VT.isFixedLengthVector() || N1VT.isScalableVector()) &&
+ "Cannot extract a scalable vector from a fixed length vector!");
+ assert((VT.isScalableVector() != N1VT.isScalableVector() ||
+ VT.getVectorMinNumElements() <= N1VT.getVectorMinNumElements()) &&
+ "Extract subvector must be from larger vector to smaller vector!");
+ assert(N2C && "Extract subvector index must be a constant");
+ assert((VT.isScalableVector() != N1VT.isScalableVector() ||
+ (VT.getVectorMinNumElements() + N2C->getZExtValue()) <=
+ N1VT.getVectorMinNumElements()) &&
+ "Extract subvector overflow!");
+ assert(N2C->getAPIntValue().getBitWidth() ==
+ TLI->getVectorIdxTy(getDataLayout()).getFixedSizeInBits() &&
+ "Constant index for EXTRACT_SUBVECTOR has an invalid size");
+
+ // Trivial extraction.
+ if (VT == N1VT)
+ return N1;
+
+ // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
+ if (N1.isUndef())
+ return getUNDEF(VT);
+
+    // EXTRACT_SUBVECTOR of CONCAT_VECTORS can be simplified if the pieces of
+ // the concat have the same type as the extract.
+ if (N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0 &&
+ VT == N1.getOperand(0).getValueType()) {
+ unsigned Factor = VT.getVectorMinNumElements();
+ return N1.getOperand(N2C->getZExtValue() / Factor);
+ }
+
+ // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
+ // during shuffle legalization.
+ if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
+ VT == N1.getOperand(1).getValueType())
+ return N1.getOperand(1);
+ break;
+ }
+ }
+
+ // Perform trivial constant folding.
+ if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}))
+ return SV;
+
+ // Canonicalize an UNDEF to the RHS, even over a constant.
+ if (N1.isUndef()) {
+ if (TLI->isCommutativeBinOp(Opcode)) {
+ std::swap(N1, N2);
+ } else {
+ switch (Opcode) {
+ case ISD::SUB:
+ return getUNDEF(VT); // fold op(undef, arg2) -> undef
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
+ }
+ }
+ }
+
+ // Fold a bunch of operators when the RHS is undef.
+ if (N2.isUndef()) {
+ switch (Opcode) {
+ case ISD::XOR:
+ if (N1.isUndef())
+ // Handle undef ^ undef -> 0 special case. This is a common
+ // idiom (misuse).
+ return getConstant(0, DL, VT);
+ [[fallthrough]];
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ return getUNDEF(VT); // fold op(arg1, undef) -> undef
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::SSUBSAT:
+ case ISD::USUBSAT:
+ return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0
+ case ISD::OR:
+ case ISD::SADDSAT:
+ case ISD::UADDSAT:
+ return getAllOnesConstant(DL, VT);
+ }
+ }
+
+ // Memoize this node if possible.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = {N1, N2};
+ if (VT != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ E->intersectFlagsWith(Flags);
+ return SDValue(E, 0);
+ }
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ N->setFlags(Flags);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ }
+
+ InsertNode(N);
+ SDValue V = SDValue(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3) {
+ SDNodeFlags Flags;
+ if (Inserter)
+ Flags = Inserter->getFlags();
+ return getNode(Opcode, DL, VT, N1, N2, N3, Flags);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3,
+ const SDNodeFlags Flags) {
+ assert(N1.getOpcode() != ISD::DELETED_NODE &&
+ N2.getOpcode() != ISD::DELETED_NODE &&
+ N3.getOpcode() != ISD::DELETED_NODE &&
+ "Operand is DELETED_NODE!");
+ // Perform various simplifications.
+ switch (Opcode) {
+ case ISD::FMA: {
+ assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
+ assert(N1.getValueType() == VT && N2.getValueType() == VT &&
+ N3.getValueType() == VT && "FMA types must match!");
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+ ConstantFPSDNode *N3CFP = dyn_cast<ConstantFPSDNode>(N3);
+ if (N1CFP && N2CFP && N3CFP) {
+ APFloat V1 = N1CFP->getValueAPF();
+ const APFloat &V2 = N2CFP->getValueAPF();
+ const APFloat &V3 = N3CFP->getValueAPF();
+ V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
+ return getConstantFP(V1, DL, VT);
+ }
+ break;
+ }
+ case ISD::BUILD_VECTOR: {
+ // Attempt to simplify BUILD_VECTOR.
+ SDValue Ops[] = {N1, N2, N3};
+ if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
+ case ISD::CONCAT_VECTORS: {
+ SDValue Ops[] = {N1, N2, N3};
+ if (SDValue V = foldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
+ case ISD::SETCC: {
+ assert(VT.isInteger() && "SETCC result type must be an integer!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ "SETCC operands must have the same type!");
+ assert(VT.isVector() == N1.getValueType().isVector() &&
+ "SETCC type should be vector iff the operand type is vector!");
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
+ "SETCC vector element counts must match!");
+ // Use FoldSetCC to simplify SETCC's.
+ if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
+ return V;
+ // Vector constant folding.
+ SDValue Ops[] = {N1, N2, N3};
+ if (SDValue V = FoldConstantArithmetic(Opcode, DL, VT, Ops)) {
+ NewSDValueDbgMsg(V, "New node vector constant folding: ", this);
+ return V;
+ }
+ break;
+ }
+ case ISD::SELECT:
+ case ISD::VSELECT:
+ if (SDValue V = simplifySelect(N1, N2, N3))
+ return V;
+ break;
+ case ISD::VECTOR_SHUFFLE:
+ llvm_unreachable("should use getVectorShuffle constructor!");
+ case ISD::VECTOR_SPLICE: {
+ if (cast<ConstantSDNode>(N3)->isZero())
+ return N1;
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
+ // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
+ // for scalable vectors where we will generate appropriate code to
+ // deal with out-of-bounds cases correctly.
+ if (N3C && N1.getValueType().isFixedLengthVector() &&
+ N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+ return getUNDEF(VT);
+
+ // Undefined index can be assumed out-of-bounds, so that's UNDEF too.
+ if (N3.isUndef())
+ return getUNDEF(VT);
+
+ // If the inserted element is an UNDEF, just use the input vector.
+ if (N2.isUndef())
+ return N1;
+
+ break;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ // Inserting undef into undef is still undef.
+ if (N1.isUndef() && N2.isUndef())
+ return getUNDEF(VT);
+
+ EVT N2VT = N2.getValueType();
+ assert(VT == N1.getValueType() &&
+ "Dest and insert subvector source types must match!");
+ assert(VT.isVector() && N2VT.isVector() &&
+ "Insert subvector VTs must be vectors!");
+ assert(VT.getVectorElementType() == N2VT.getVectorElementType() &&
+ "Insert subvector VTs must have the same element type!");
+ assert((VT.isScalableVector() || N2VT.isFixedLengthVector()) &&
+ "Cannot insert a scalable vector into a fixed length vector!");
+ assert((VT.isScalableVector() != N2VT.isScalableVector() ||
+ VT.getVectorMinNumElements() >= N2VT.getVectorMinNumElements()) &&
+ "Insert subvector must be from smaller vector to larger vector!");
+ assert(isa<ConstantSDNode>(N3) &&
+ "Insert subvector index must be constant");
+ assert((VT.isScalableVector() != N2VT.isScalableVector() ||
+ (N2VT.getVectorMinNumElements() +
+ cast<ConstantSDNode>(N3)->getZExtValue()) <=
+ VT.getVectorMinNumElements()) &&
+ "Insert subvector overflow!");
+ assert(cast<ConstantSDNode>(N3)->getAPIntValue().getBitWidth() ==
+ TLI->getVectorIdxTy(getDataLayout()).getFixedSizeInBits() &&
+ "Constant index for INSERT_SUBVECTOR has an invalid size");
+
+ // Trivial insertion.
+ if (VT == N2VT)
+ return N2;
+
+ // If this is an insert of an extracted vector into an undef vector, we
+ // can just use the input to the extract.
+ if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
+ return N2.getOperand(0);
+ break;
+ }
+ case ISD::BITCAST:
+ // Fold bit_convert nodes from a type to themselves.
+ if (N1.getValueType() == VT)
+ return N1;
+ break;
+ case ISD::VP_TRUNCATE:
+ case ISD::VP_SIGN_EXTEND:
+ case ISD::VP_ZERO_EXTEND:
+ // Don't create noop casts.
+ if (N1.getValueType() == VT)
+ return N1;
+ break;
+ }
+
+ // Memoize node if it doesn't produce a flag.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = {N1, N2, N3};
+ if (VT != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ E->intersectFlagsWith(Flags);
+ return SDValue(E, 0);
+ }
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ N->setFlags(Flags);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ }
+
+ InsertNode(N);
+ SDValue V = SDValue(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
+ SDValue Ops[] = { N1, N2, N3, N4 };
+ return getNode(Opcode, DL, VT, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4,
+ SDValue N5) {
+ SDValue Ops[] = { N1, N2, N3, N4, N5 };
+ return getNode(Opcode, DL, VT, Ops);
+}
+
+/// getStackArgumentTokenFactor - Compute a TokenFactor to force all
+/// the incoming stack arguments to be loaded from the stack.
+SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
+ SmallVector<SDValue, 8> ArgChains;
+
+ // Include the original chain at the beginning of the list. When this is
+ // used by target LowerCall hooks, this helps legalize find the
+ // CALLSEQ_BEGIN node.
+ ArgChains.push_back(Chain);
+
+ // Add a chain value for each stack argument.
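+ // Incoming stack arguments live in fixed frame objects, which have
+ // negative frame indices.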
+ for (SDNode *U : getEntryNode().getNode()->uses())
+ if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+ if (FI->getIndex() < 0)
+ ArgChains.push_back(SDValue(L, 1));
+
+ // Build a tokenfactor for all the chains.
+ return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
+/// getMemsetValue - Vectorized representation of the memset value
+/// operand.
+static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(!Value.isUndef());
+
+ unsigned NumBits = VT.getScalarSizeInBits();
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
+ assert(C->getAPIntValue().getBitWidth() == 8);
+ APInt Val = APInt::getSplat(NumBits, C->getAPIntValue());
+ if (VT.isInteger()) {
+ bool IsOpaque = VT.getSizeInBits() > 64 ||
+ !DAG.getTargetLoweringInfo().isLegalStoreImmediate(C->getSExtValue());
+ return DAG.getConstant(Val, dl, VT, false, IsOpaque);
+ }
+ return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), dl,
+ VT);
+ }
+
+ assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?");
+ EVT IntVT = VT.getScalarType();
+ if (!IntVT.isInteger())
+ IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());
+
+ Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
+ if (NumBits > 8) {
+ // Use a multiplication with 0x010101... to extend the input to the
+ // required length.
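+ // For example, splatting the byte value 0xAB to 32 bits computes
+ // 0xAB * 0x01010101 = 0xABABABAB.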
+ APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
+ Value = DAG.getNode(ISD::MUL, dl, IntVT, Value,
+ DAG.getConstant(Magic, dl, IntVT));
+ }
+
+ if (VT != Value.getValueType() && !VT.isInteger())
+ Value = DAG.getBitcast(VT.getScalarType(), Value);
+ if (VT != Value.getValueType())
+ Value = DAG.getSplatBuildVector(VT, dl, Value);
+
+ return Value;
+}
+
+/// getMemsetStringVal - Similar to getMemsetValue. Except this is only
+/// used when a memcpy is turned into a memset when the source is a constant
+/// string ptr.
+static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ const ConstantDataArraySlice &Slice) {
+ // Handle vector with all elements zero.
+ if (Slice.Array == nullptr) {
+ if (VT.isInteger())
+ return DAG.getConstant(0, dl, VT);
+ if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
+ return DAG.getConstantFP(0.0, dl, VT);
+ if (VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getConstant(0, dl,
+ EVT::getVectorVT(*DAG.getContext(),
+ EltVT, NumElts)));
+ }
+ llvm_unreachable("Expected type!");
+ }
+
+ assert(!VT.isVector() && "Can't handle vector type here!");
+ unsigned NumVTBits = VT.getSizeInBits();
+ unsigned NumVTBytes = NumVTBits / 8;
+ unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length));
+
+ APInt Val(NumVTBits, 0);
+ if (DAG.getDataLayout().isLittleEndian()) {
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Val |= (uint64_t)(unsigned char)Slice[i] << i*8;
+ } else {
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8;
+ }
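+ // For example, the bytes "abcd" pack into 0x64636261 on a little-endian
+ // target and 0x61626364 on a big-endian target.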
+
+ // If the "cost" of materializing the integer immediate is less than the cost
+ // of a load, then it is cost effective to turn the load into the immediate.
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty))
+ return DAG.getConstant(Val, dl, VT);
+ return SDValue();
+}
+
+SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset,
+ const SDLoc &DL,
+ const SDNodeFlags Flags) {
+ EVT VT = Base.getValueType();
+ SDValue Index;
+
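+ // A scalable offset is expressed as vscale * KnownMinValue; a fixed offset
+ // becomes a plain constant addend.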
+ if (Offset.isScalable())
+ Index = getVScale(DL, Base.getValueType(),
+ APInt(Base.getValueSizeInBits().getFixedValue(),
+ Offset.getKnownMinValue()));
+ else
+ Index = getConstant(Offset.getFixedValue(), DL, VT);
+
+ return getMemBasePlusOffset(Base, Index, DL, Flags);
+}
+
+SDValue SelectionDAG::getMemBasePlusOffset(SDValue Ptr, SDValue Offset,
+ const SDLoc &DL,
+ const SDNodeFlags Flags) {
+ assert(Offset.getValueType().isInteger());
+ EVT BasePtrVT = Ptr.getValueType();
+ return getNode(ISD::ADD, DL, BasePtrVT, Ptr, Offset, Flags);
+}
+
+/// Returns true if memcpy source is constant data.
+static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
+ uint64_t SrcDelta = 0;
+ GlobalAddressSDNode *G = nullptr;
+ if (Src.getOpcode() == ISD::GlobalAddress)
+ G = cast<GlobalAddressSDNode>(Src);
+ else if (Src.getOpcode() == ISD::ADD &&
+ Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
+ Src.getOperand(1).getOpcode() == ISD::Constant) {
+ G = cast<GlobalAddressSDNode>(Src.getOperand(0));
+ SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
+ }
+ if (!G)
+ return false;
+
+ return getConstantDataArrayInfo(G->getGlobal(), Slice, 8,
+ SrcDelta + G->getOffset());
+}
+
+static bool shouldLowerMemFuncForSize(const MachineFunction &MF,
+ SelectionDAG &DAG) {
+ // On Darwin, -Os means optimize for size without hurting performance, so
+ // only really optimize for size when -Oz (MinSize) is used.
+ if (MF.getTarget().getTargetTriple().isOSDarwin())
+ return MF.getFunction().hasMinSize();
+ return DAG.shouldOptForSize();
+}
+
+static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+ SmallVector<SDValue, 32> &OutChains, unsigned From,
+ unsigned To, SmallVector<SDValue, 16> &OutLoadChains,
+ SmallVector<SDValue, 16> &OutStoreChains) {
+ assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
+ assert(OutStoreChains.size() && "Missing stores in memcpy inlining");
+ SmallVector<SDValue, 16> GluedLoadChains;
+ for (unsigned i = From; i < To; ++i) {
+ OutChains.push_back(OutLoadChains[i]);
+ GluedLoadChains.push_back(OutLoadChains[i]);
+ }
+
+ // Chain for all loads.
+ SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ GluedLoadChains);
+
+ for (unsigned i = From; i < To; ++i) {
+ StoreSDNode *ST = dyn_cast<StoreSDNode>(OutStoreChains[i]);
+ SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
+ ST->getBasePtr(), ST->getMemoryVT(),
+ ST->getMemOperand());
+ OutChains.push_back(NewStore);
+ }
+}
+
+static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, Align Alignment,
+ bool isVol, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo,
+ const AAMDNodes &AAInfo, AAResults *AA) {
+ // Turn a memcpy of undef to nop.
+ // FIXME: We need to honor volatile even if Src is undef.
+ if (Src.isUndef())
+ return Chain;
+
+ // Expand memcpy to a series of load and store ops if the size operand falls
+ // below a certain threshold.
+ // TODO: In the AlwaysInline case, if the size is big then generate a loop
+ // rather than a potentially humongous number of loads and stores.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &C = *DAG.getContext();
+ std::vector<EVT> MemOps;
+ bool DstAlignCanChange = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF, DAG);
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+ if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
+ DstAlignCanChange = true;
+ MaybeAlign SrcAlign = DAG.InferPtrAlign(Src);
+ if (!SrcAlign || Alignment > *SrcAlign)
+ SrcAlign = Alignment;
+ assert(SrcAlign && "SrcAlign must be set");
+ ConstantDataArraySlice Slice;
+ // If marked as volatile, perform a copy even when marked as constant.
+ bool CopyFromConstant = !isVol && isMemSrcFromConstant(Src, Slice);
+ bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
+ unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
+ const MemOp Op = isZeroConstant
+ ? MemOp::Set(Size, DstAlignCanChange, Alignment,
+ /*IsZeroMemset*/ true, isVol)
+ : MemOp::Copy(Size, DstAlignCanChange, Alignment,
+ *SrcAlign, isVol, CopyFromConstant);
+ if (!TLI.findOptimalMemOpLowering(
+ MemOps, Limit, Op, DstPtrInfo.getAddrSpace(),
+ SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes()))
+ return SDValue();
+
+ if (DstAlignCanChange) {
+ Type *Ty = MemOps[0].getTypeForEVT(C);
+ Align NewAlign = DL.getABITypeAlign(Ty);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment which may conflict with optimizations such as tail call
+ // optimization.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign.previous();
+
+ if (NewAlign > Alignment) {
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI->getIndex()) < NewAlign)
+ MFI.setObjectAlignment(FI->getIndex(), NewAlign);
+ Alignment = NewAlign;
+ }
+ }
+
+ // Prepare AAInfo for loads/stores after lowering this memcpy.
+ AAMDNodes NewAAInfo = AAInfo;
+ NewAAInfo.TBAA = NewAAInfo.TBAAStruct = nullptr;
+
+ const Value *SrcVal = dyn_cast_if_present<const Value *>(SrcPtrInfo.V);
+ bool isConstant =
+ AA && SrcVal &&
+ AA->pointsToConstantMemory(MemoryLocation(SrcVal, Size, AAInfo));
+
+ MachineMemOperand::Flags MMOFlags =
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+ SmallVector<SDValue, 16> OutLoadChains;
+ SmallVector<SDValue, 16> OutStoreChains;
+ SmallVector<SDValue, 32> OutChains;
+ unsigned NumMemOps = MemOps.size();
+ uint64_t SrcOff = 0, DstOff = 0;
+ for (unsigned i = 0; i != NumMemOps; ++i) {
+ EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Value, Store;
+
+ if (VTSize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
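+ // For example, a 6-byte copy lowered as two 4-byte operations backs the
+ // second pair up by 2 bytes so it covers bytes 2..5.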
+ assert(i == NumMemOps-1 && i != 0);
+ SrcOff -= VTSize - Size;
+ DstOff -= VTSize - Size;
+ }
+
+ if (CopyFromConstant &&
+ (isZeroConstant || (VT.isInteger() && !VT.isVector()))) {
+ // It's unlikely a store of a vector immediate can be done in a single
+ // instruction. It would require a load from a constantpool first.
+ // We only handle zero vectors here.
+ // FIXME: Handle other cases where store of vector immediate is done in
+ // a single instruction.
+ ConstantDataArraySlice SubSlice;
+ if (SrcOff < Slice.Length) {
+ SubSlice = Slice;
+ SubSlice.move(SrcOff);
+ } else {
+ // This is an out-of-bounds access and hence UB. Pretend we read zero.
+ SubSlice.Array = nullptr;
+ SubSlice.Offset = 0;
+ SubSlice.Length = VTSize;
+ }
+ Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
+ if (Value.getNode()) {
+ Store = DAG.getStore(
+ Chain, dl, Value,
+ DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl),
+ DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
+ OutChains.push_back(Store);
+ }
+ }
+
+ if (!Store.getNode()) {
+ // The type might not be legal for the target. This should only happen
+ // if the type is smaller than a legal type, as on PPC, so the right
+ // thing to do is generate a LoadExt/StoreTrunc pair. These simplify
+ // to Load/Store if NVT==VT.
+ // FIXME: Does the case above also need this?
+ EVT NVT = TLI.getTypeToTransformTo(C, VT);
+ assert(NVT.bitsGE(VT));
+
+ bool isDereferenceable =
+ SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+ MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
+ if (isDereferenceable)
+ SrcMMOFlags |= MachineMemOperand::MODereferenceable;
+ if (isConstant)
+ SrcMMOFlags |= MachineMemOperand::MOInvariant;
+
+ Value = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, NVT, Chain,
+ DAG.getMemBasePlusOffset(Src, TypeSize::Fixed(SrcOff), dl),
+ SrcPtrInfo.getWithOffset(SrcOff), VT,
+ commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo);
+ OutLoadChains.push_back(Value.getValue(1));
+
+ Store = DAG.getTruncStore(
+ Chain, dl, Value,
+ DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl),
+ DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo);
+ OutStoreChains.push_back(Store);
+ }
+ SrcOff += VTSize;
+ DstOff += VTSize;
+ Size -= VTSize;
+ }
+
+ unsigned GluedLdStLimit = MaxLdStGlue == 0 ?
+ TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue;
+ unsigned NumLdStInMemcpy = OutStoreChains.size();
+
+ if (NumLdStInMemcpy) {
+ // The memcpy may have been converted to a memset when the source is
+ // constant, in which case there are only stores and no loads. In the
+ // absence of loads, there is nothing to gang up.
+ if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) {
+ // If the target does not care, just leave the chains as they are.
+ for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
+ OutChains.push_back(OutLoadChains[i]);
+ OutChains.push_back(OutStoreChains[i]);
+ }
+ } else {
+ // Ld/St less than/equal limit set by target.
+ if (NumLdStInMemcpy <= GluedLdStLimit) {
+ chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+ NumLdStInMemcpy, OutLoadChains,
+ OutStoreChains);
+ } else {
+ unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit;
+ unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
+ unsigned GlueIter = 0;
+
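+ // For example, with 10 load/store pairs and a glue limit of 4, this glues
+ // [6,10) and then [2,6); the remaining pairs [0,2) are handled below.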
+ for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
+ unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
+ unsigned IndexTo = NumLdStInMemcpy - GlueIter;
+
+ chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
+ OutLoadChains, OutStoreChains);
+ GlueIter += GluedLdStLimit;
+ }
+
+ // Residual ld/st.
+ if (RemainingLdStInMemcpy) {
+ chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+ RemainingLdStInMemcpy, OutLoadChains,
+ OutStoreChains);
+ }
+ }
+ }
+ }
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, Align Alignment,
+ bool isVol, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo,
+ const AAMDNodes &AAInfo) {
+ // Turn a memmove of undef to nop.
+ // FIXME: We need to honor volatile even if Src is undef.
+ if (Src.isUndef())
+ return Chain;
+
+ // Expand memmove to a series of load and store ops if the size operand falls
+ // below a certain threshold.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &C = *DAG.getContext();
+ std::vector<EVT> MemOps;
+ bool DstAlignCanChange = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF, DAG);
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+ if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
+ DstAlignCanChange = true;
+ MaybeAlign SrcAlign = DAG.InferPtrAlign(Src);
+ if (!SrcAlign || Alignment > *SrcAlign)
+ SrcAlign = Alignment;
+ assert(SrcAlign && "SrcAlign must be set");
+ unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
+ if (!TLI.findOptimalMemOpLowering(
+ MemOps, Limit,
+ MemOp::Copy(Size, DstAlignCanChange, Alignment, *SrcAlign,
+ /*IsVolatile*/ true),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes()))
+ return SDValue();
+
+ if (DstAlignCanChange) {
+ Type *Ty = MemOps[0].getTypeForEVT(C);
+ Align NewAlign = DL.getABITypeAlign(Ty);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment which may conflict with optimizations such as tail call
+ // optimization.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign.previous();
+
+ if (NewAlign > Alignment) {
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI->getIndex()) < NewAlign)
+ MFI.setObjectAlignment(FI->getIndex(), NewAlign);
+ Alignment = NewAlign;
+ }
+ }
+
+ // Prepare AAInfo for loads/stores after lowering this memmove.
+ AAMDNodes NewAAInfo = AAInfo;
+ NewAAInfo.TBAA = NewAAInfo.TBAAStruct = nullptr;
+
+ MachineMemOperand::Flags MMOFlags =
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+ uint64_t SrcOff = 0, DstOff = 0;
+ SmallVector<SDValue, 8> LoadValues;
+ SmallVector<SDValue, 8> LoadChains;
+ SmallVector<SDValue, 8> OutChains;
+ unsigned NumMemOps = MemOps.size();
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Value;
+
+ bool isDereferenceable =
+ SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+ MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
+ if (isDereferenceable)
+ SrcMMOFlags |= MachineMemOperand::MODereferenceable;
+
+ Value = DAG.getLoad(
+ VT, dl, Chain,
+ DAG.getMemBasePlusOffset(Src, TypeSize::Fixed(SrcOff), dl),
+ SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo);
+ LoadValues.push_back(Value);
+ LoadChains.push_back(Value.getValue(1));
+ SrcOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+ OutChains.clear();
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Store;
+
+ Store = DAG.getStore(
+ Chain, dl, LoadValues[i],
+ DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl),
+ DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
+ OutChains.push_back(Store);
+ DstOff += VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+/// Lower the call to 'memset' intrinsic function into a series of store
+/// operations.
+///
+/// \param DAG Selection DAG where lowered code is placed.
+/// \param dl Link to corresponding IR location.
+/// \param Chain Control flow dependency.
+/// \param Dst Pointer to destination memory location.
+/// \param Src Value of byte to write into the memory.
+/// \param Size Number of bytes to write.
+/// \param Alignment Alignment of the destination in bytes.
+/// \param isVol True if destination is volatile.
+/// \param AlwaysInline Makes sure no function call is generated.
+/// \param DstPtrInfo IR information on the memory pointer.
+/// \returns New head in the control flow, if lowering was successful, empty
+/// SDValue otherwise.
+///
+/// The function tries to replace 'llvm.memset' intrinsic with several store
+/// operations and value calculation code. This is usually profitable for small
+/// memory size or when the semantic requires inlining.
+static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, Align Alignment, bool isVol,
+ bool AlwaysInline, MachinePointerInfo DstPtrInfo,
+ const AAMDNodes &AAInfo) {
+ // Turn a memset of undef to nop.
+ // FIXME: We need to honor volatile even if Src is undef.
+ if (Src.isUndef())
+ return Chain;
+
+ // Expand memset to a series of load/store ops if the size operand
+ // falls below a certain threshold.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ std::vector<EVT> MemOps;
+ bool DstAlignCanChange = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF, DAG);
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+ if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
+ DstAlignCanChange = true;
+ bool IsZeroVal = isNullConstant(Src);
+ unsigned Limit = AlwaysInline ? ~0 : TLI.getMaxStoresPerMemset(OptSize);
+
+ if (!TLI.findOptimalMemOpLowering(
+ MemOps, Limit,
+ MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol),
+ DstPtrInfo.getAddrSpace(), ~0u, MF.getFunction().getAttributes()))
+ return SDValue();
+
+ if (DstAlignCanChange) {
+ Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+ const DataLayout &DL = DAG.getDataLayout();
+ Align NewAlign = DL.getABITypeAlign(Ty);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment which may conflict with optimizations such as tail call
+ // optimization.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->hasStackRealignment(MF))
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign.previous();
+
+ if (NewAlign > Alignment) {
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlign(FI->getIndex()) < NewAlign)
+ MFI.setObjectAlignment(FI->getIndex(), NewAlign);
+ Alignment = NewAlign;
+ }
+ }
+
+ SmallVector<SDValue, 8> OutChains;
+ uint64_t DstOff = 0;
+ unsigned NumMemOps = MemOps.size();
+
+ // Find the largest store and generate the bit pattern for it.
+ EVT LargestVT = MemOps[0];
+ for (unsigned i = 1; i < NumMemOps; i++)
+ if (MemOps[i].bitsGT(LargestVT))
+ LargestVT = MemOps[i];
+ SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);
+
+ // Prepare AAInfo for loads/stores after lowering this memset.
+ AAMDNodes NewAAInfo = AAInfo;
+ NewAAInfo.TBAA = NewAAInfo.TBAAStruct = nullptr;
+
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ if (VTSize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ assert(i == NumMemOps-1 && i != 0);
+ DstOff -= VTSize - Size;
+ }
+
+ // If this store is smaller than the largest store, see whether we can get
+ // the smaller value for free with a truncate.
+ SDValue Value = MemSetValue;
+ if (VT.bitsLT(LargestVT)) {
+ if (!LargestVT.isVector() && !VT.isVector() &&
+ TLI.isTruncateFree(LargestVT, VT))
+ Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
+ else
+ Value = getMemsetValue(Src, VT, DAG, dl);
+ }
+ assert(Value.getValueType() == VT && "Value with wrong type.");
+ SDValue Store = DAG.getStore(
+ Chain, dl, Value,
+ DAG.getMemBasePlusOffset(Dst, TypeSize::Fixed(DstOff), dl),
+ DstPtrInfo.getWithOffset(DstOff), Alignment,
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone,
+ NewAAInfo);
+ OutChains.push_back(Store);
+ DstOff += VT.getSizeInBits() / 8;
+ Size -= VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
+ unsigned AS) {
+ // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all
+ // pointer operands can be losslessly bitcasted to pointers of address space 0
+ if (AS != 0 && !TLI->getTargetMachine().isNoopAddrSpaceCast(AS, 0)) {
+ report_fatal_error("cannot lower memory intrinsic in address space " +
+ Twine(AS));
+ }
+}
+
+SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, Align Alignment,
+ bool isVol, bool AlwaysInline, bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo,
+ const AAMDNodes &AAInfo, AAResults *AA) {
+ // Check to see if we should lower the memcpy to loads and stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memcpy with size zero? Just return the original chain.
+ if (ConstantSize->isZero())
+ return Chain;
+
+ SDValue Result = getMemcpyLoadsAndStores(
+ *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
+ isVol, false, DstPtrInfo, SrcPtrInfo, AAInfo, AA);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memcpy with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ if (TSI) {
+ SDValue Result = TSI->EmitTargetCodeForMemcpy(
+ *this, dl, Chain, Dst, Src, Size, Alignment, isVol, AlwaysInline,
+ DstPtrInfo, SrcPtrInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // If we really need inline code and the target declined to provide it,
+ // use a (potentially long) sequence of loads and stores.
+ if (AlwaysInline) {
+ assert(ConstantSize && "AlwaysInline requires a constant size!");
+ return getMemcpyLoadsAndStores(
+ *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
+ isVol, true, DstPtrInfo, SrcPtrInfo, AAInfo, AA);
+ }
+
+ checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+ checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());
+
+ // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
+ // memcpy is not guaranteed to be safe. libc memcpys aren't required to
+ // respect volatile, so they may do things like read or write memory
+ // beyond the given memory regions. But fixing this isn't easy, and most
+ // people don't care.
+
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = Type::getInt8PtrTy(*getContext());
+ Entry.Node = Dst; Args.push_back(Entry);
+ Entry.Node = Src; Args.push_back(Entry);
+
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Size; Args.push_back(Entry);
+ // FIXME: pass in SDLoc
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl,
+ SDValue Dst, SDValue Src, SDValue Size,
+ Type *SizeTy, unsigned ElemSz,
+ bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+
+ Entry.Node = Src;
+ Args.push_back(Entry);
+
+ Entry.Ty = SizeTy;
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ RTLIB::Libcall LibraryCall =
+ RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+ if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported element size");
+
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(LibraryCall),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, Align Alignment,
+ bool isVol, bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo,
+ const AAMDNodes &AAInfo, AAResults *AA) {
+ // Check to see if we should lower the memmove to loads and stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memmove with size zero? Just return the original chain.
+ if (ConstantSize->isZero())
+ return Chain;
+
+ SDValue Result = getMemmoveLoadsAndStores(
+ *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
+ isVol, false, DstPtrInfo, SrcPtrInfo, AAInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memmove with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ if (TSI) {
+ SDValue Result =
+ TSI->EmitTargetCodeForMemmove(*this, dl, Chain, Dst, Src, Size,
+ Alignment, isVol, DstPtrInfo, SrcPtrInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+ checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());
+
+ // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
+ // not be safe. See memcpy above for more details.
+
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = Type::getInt8PtrTy(*getContext());
+ Entry.Node = Dst; Args.push_back(Entry);
+ Entry.Node = Src; Args.push_back(Entry);
+
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Size; Args.push_back(Entry);
+ // FIXME: pass in SDLoc
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl,
+ SDValue Dst, SDValue Src, SDValue Size,
+ Type *SizeTy, unsigned ElemSz,
+ bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+
+ Entry.Node = Src;
+ Args.push_back(Entry);
+
+ Entry.Ty = SizeTy;
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ RTLIB::Libcall LibraryCall =
+ RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+ if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported element size");
+
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(LibraryCall),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, Align Alignment,
+ bool isVol, bool AlwaysInline, bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ const AAMDNodes &AAInfo) {
+ // Check to see if we should lower the memset to stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memset with size zero? Just return the original chain.
+ if (ConstantSize->isZero())
+ return Chain;
+
+ SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(), Alignment,
+ isVol, false, DstPtrInfo, AAInfo);
+
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memset with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ if (TSI) {
+ SDValue Result = TSI->EmitTargetCodeForMemset(
+ *this, dl, Chain, Dst, Src, Size, Alignment, isVol, AlwaysInline, DstPtrInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // If we really need inline code and the target declined to provide it,
+ // use a (potentially long) sequence of loads and stores.
+ if (AlwaysInline) {
+ assert(ConstantSize && "AlwaysInline requires a constant size!");
+ SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(), Alignment,
+ isVol, true, DstPtrInfo, AAInfo);
+ assert(Result &&
+ "getMemsetStores must return a valid sequence when AlwaysInline");
+ return Result;
+ }
+
+ checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+
+ // Emit a library call.
+ auto &Ctx = *getContext();
+ const auto& DL = getDataLayout();
+
+ TargetLowering::CallLoweringInfo CLI(*this);
+ // FIXME: pass in SDLoc
+ CLI.setDebugLoc(dl).setChain(Chain);
+
+ ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src);
+ const bool SrcIsZero = ConstantSrc && ConstantSrc->isZero();
+ const char *BzeroName = getTargetLoweringInfo().getLibcallName(RTLIB::BZERO);
+
+ // Helper function to create an Entry from Node and Type.
+ const auto CreateEntry = [](SDValue Node, Type *Ty) {
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Node;
+ Entry.Ty = Ty;
+ return Entry;
+ };
+
+ // If zeroing out and bzero is present, use it.
+ if (SrcIsZero && BzeroName) {
+ TargetLowering::ArgListTy Args;
+ Args.push_back(CreateEntry(Dst, Type::getInt8PtrTy(Ctx)));
+ Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx)));
+ CLI.setLibCallee(
+ TLI->getLibcallCallingConv(RTLIB::BZERO), Type::getVoidTy(Ctx),
+ getExternalSymbol(BzeroName, TLI->getPointerTy(DL)), std::move(Args));
+ } else {
+ TargetLowering::ArgListTy Args;
+ Args.push_back(CreateEntry(Dst, Type::getInt8PtrTy(Ctx)));
+ Args.push_back(CreateEntry(Src, Src.getValueType().getTypeForEVT(Ctx)));
+ Args.push_back(CreateEntry(Size, DL.getIntPtrType(Ctx)));
+ CLI.setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
+ Dst.getValueType().getTypeForEVT(Ctx),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
+ TLI->getPointerTy(DL)),
+ std::move(Args));
+ }
+
+ CLI.setDiscardResult().setTailCall(isTailCall);
+
+ std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getAtomicMemset(SDValue Chain, const SDLoc &dl,
+ SDValue Dst, SDValue Value, SDValue Size,
+ Type *SizeTy, unsigned ElemSz,
+ bool isTailCall,
+ MachinePointerInfo DstPtrInfo) {
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+
+ Entry.Ty = Type::getInt8Ty(*getContext());
+ Entry.Node = Value;
+ Args.push_back(Entry);
+
+ Entry.Ty = SizeTy;
+ Entry.Node = Size;
+ Args.push_back(Entry);
+
+ RTLIB::Libcall LibraryCall =
+ RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElemSz);
+ if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported element size");
+
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI->getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(LibraryCall),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue, SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ SDVTList VTList, ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(MemVT.getRawBits());
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void* IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<AtomicSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N = newSDNode<AtomicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl,
+ EVT MemVT, SDVTList VTs, SDValue Chain,
+ SDValue Ptr, SDValue Cmp, SDValue Swp,
+ MachineMemOperand *MMO) {
+ assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
+ Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+
+ SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ SDValue Chain, SDValue Ptr, SDValue Val,
+ MachineMemOperand *MMO) {
+ assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
+ Opcode == ISD::ATOMIC_LOAD_SUB ||
+ Opcode == ISD::ATOMIC_LOAD_AND ||
+ Opcode == ISD::ATOMIC_LOAD_CLR ||
+ Opcode == ISD::ATOMIC_LOAD_OR ||
+ Opcode == ISD::ATOMIC_LOAD_XOR ||
+ Opcode == ISD::ATOMIC_LOAD_NAND ||
+ Opcode == ISD::ATOMIC_LOAD_MIN ||
+ Opcode == ISD::ATOMIC_LOAD_MAX ||
+ Opcode == ISD::ATOMIC_LOAD_UMIN ||
+ Opcode == ISD::ATOMIC_LOAD_UMAX ||
+ Opcode == ISD::ATOMIC_LOAD_FADD ||
+ Opcode == ISD::ATOMIC_LOAD_FSUB ||
+ Opcode == ISD::ATOMIC_LOAD_FMAX ||
+ Opcode == ISD::ATOMIC_LOAD_FMIN ||
+ Opcode == ISD::ATOMIC_LOAD_UINC_WRAP ||
+ Opcode == ISD::ATOMIC_LOAD_UDEC_WRAP ||
+ Opcode == ISD::ATOMIC_SWAP ||
+ Opcode == ISD::ATOMIC_STORE) &&
+ "Invalid Atomic Op");
+
+ EVT VT = Val.getValueType();
+
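+ // An atomic store produces only a chain; read-modify-write operations also
+ // return the previous value.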
+ SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
+ getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Chain, Ptr, Val};
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ MachineMemOperand *MMO) {
+ assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
+
+ SDVTList VTs = getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Chain, Ptr};
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO);
+}
+
+/// getMergeValues - Create a MERGE_VALUES node from the given operands.
+SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
+ if (Ops.size() == 1)
+ return Ops[0];
+
+ SmallVector<EVT, 4> VTs;
+ VTs.reserve(Ops.size());
+ for (const SDValue &Op : Ops)
+ VTs.push_back(Op.getValueType());
+ return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops);
+}
+
+SDValue SelectionDAG::getMemIntrinsicNode(
+ unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
+ EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
+ MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
+ if (!Size && MemVT.isScalableVector())
+ Size = MemoryLocation::UnknownSize;
+ else if (!Size)
+ Size = MemVT.getStoreSize();
+
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, Flags, Size, Alignment, AAInfo);
+
+ return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
+}
+
+SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
+ SDVTList VTList,
+ ArrayRef<SDValue> Ops, EVT MemVT,
+ MachineMemOperand *MMO) {
+ assert((Opcode == ISD::INTRINSIC_VOID ||
+ Opcode == ISD::INTRINSIC_W_CHAIN ||
+ Opcode == ISD::PREFETCH ||
+ (Opcode <= (unsigned)std::numeric_limits<int>::max() &&
+ (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
+ "Opcode is not a memory-accessing opcode!");
+
+ // Memoize the node unless it returns a flag.
+ MemIntrinsicSDNode *N;
+ if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ ID.AddInteger(getSyntheticNodeSubclassData<MemIntrinsicSDNode>(
+ Opcode, dl.getIROrder(), VTList, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ ID.AddInteger(MemVT.getRawBits());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO);
+ createOperands(N, Ops);
+ }
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
+ SDValue Chain, int FrameIndex,
+ int64_t Size, int64_t Offset) {
+ const unsigned Opcode = IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END;
+ const auto VTs = getVTList(MVT::Other);
+ SDValue Ops[2] = {
+ Chain,
+ getFrameIndex(FrameIndex,
+ getTargetLoweringInfo().getFrameIndexTy(getDataLayout()),
+ true)};
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ ID.AddInteger(FrameIndex);
+ ID.AddInteger(Size);
+ ID.AddInteger(Offset);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ LifetimeSDNode *N = newSDNode<LifetimeSDNode>(
+ Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, Size, Offset);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getPseudoProbeNode(const SDLoc &Dl, SDValue Chain,
+ uint64_t Guid, uint64_t Index,
+ uint32_t Attr) {
+ const unsigned Opcode = ISD::PSEUDO_PROBE;
+ const auto VTs = getVTList(MVT::Other);
+ SDValue Ops[] = {Chain};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ ID.AddInteger(Guid);
+ ID.AddInteger(Index);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, Dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<PseudoProbeSDNode>(
+ Opcode, Dl.getIROrder(), Dl.getDebugLoc(), VTs, Guid, Index, Attr);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
+/// MachinePointerInfo record from it. This is particularly useful because the
+/// code generator has many cases where it doesn't bother passing in a
+/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
+static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
+ SelectionDAG &DAG, SDValue Ptr,
+ int64_t Offset = 0) {
+ // If this is FI+Offset, we can model it.
+ if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr))
+ return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
+ FI->getIndex(), Offset);
+
+ // If this is (FI+Offset1)+Offset2, we can model it.
+ if (Ptr.getOpcode() != ISD::ADD ||
+ !isa<ConstantSDNode>(Ptr.getOperand(1)) ||
+ !isa<FrameIndexSDNode>(Ptr.getOperand(0)))
+ return Info;
+
+ int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ return MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI,
+ Offset + cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue());
+}
+
+/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
+/// MachinePointerInfo record from it. This is particularly useful because the
+/// code generator has many cases where it doesn't bother passing in a
+/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
+static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
+ SelectionDAG &DAG, SDValue Ptr,
+ SDValue OffsetOp) {
+ // If the 'Offset' value isn't a constant, we can't handle this.
+ if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp))
+ return InferPointerInfo(Info, DAG, Ptr, OffsetNode->getSExtValue());
+ if (OffsetOp.isUndef())
+ return InferPointerInfo(Info, DAG, Ptr);
+ return Info;
+}
+
+SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+ EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Offset,
+ MachinePointerInfo PtrInfo, EVT MemVT,
+ Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges) {
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOLoad;
+ assert((MMOFlags & MachineMemOperand::MOStore) == 0);
+ // If we don't have a PtrInfo, infer the trivial frame index case to simplify
+ // clients.
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
+
+ uint64_t Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
+ Alignment, AAInfo, Ranges);
+ return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
+}
+
+SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+ EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Offset, EVT MemVT,
+ MachineMemOperand *MMO) {
+ if (VT == MemVT) {
+ ExtType = ISD::NON_EXTLOAD;
+ } else if (ExtType == ISD::NON_EXTLOAD) {
+ assert(VT == MemVT && "Non-extending load from different memory type!");
+ } else {
+ // Extending load.
+ assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
+ "Should only be an extending load, not truncating!");
+ assert(VT.isInteger() == MemVT.isInteger() &&
+ "Cannot convert from FP to Int or Int -> FP!");
+ assert(VT.isVector() == MemVT.isVector() &&
+ "Cannot use an ext load to convert to or from a vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorElementCount() == MemVT.getVectorElementCount()) &&
+ "Cannot use an ext load to change the number of vector elements!");
+ }
+
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
+
+ SDVTList VTs = Indexed ?
+ getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
+ SDValue Ops[] = { Chain, Ptr, Offset };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::LOAD, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<LoadSDNode>(
+ dl.getIROrder(), VTs, AM, ExtType, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<LoadSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<LoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ ExtType, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, MachinePointerInfo PtrInfo,
+ MaybeAlign Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges);
+}
+
+SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, MachineMemOperand *MMO) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ VT, MMO);
+}
+
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ MachinePointerInfo PtrInfo, EVT MemVT,
+ MaybeAlign Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, PtrInfo,
+ MemVT, Alignment, MMOFlags, AAInfo);
+}
+
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT,
+ MachineMemOperand *MMO) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
+ MemVT, MMO);
+}
+
+SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
+ assert(LD->getOffset().isUndef() && "Load is already an indexed load!");
+ // Don't propagate the invariant or dereferenceable flags.
+ auto MMOFlags =
+ LD->getMemOperand()->getFlags() &
+ ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
+ return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
+ LD->getChain(), Base, Offset, LD->getPointerInfo(),
+ LD->getMemoryVT(), LD->getAlign(), MMOFlags, LD->getAAInfo());
+}
+
+SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, MachinePointerInfo PtrInfo,
+ Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
+
+ MachineFunction &MF = getMachineFunction();
+ uint64_t Size =
+ MemoryLocation::getSizeOrUnknown(Val.getValueType().getStoreSize());
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo);
+ return getStore(Chain, dl, Val, Ptr, MMO);
+}
+
+SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, MachineMemOperand *MMO) {
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+ EVT VT = Val.getValueType();
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
+ dl.getIROrder(), VTs, ISD::UNINDEXED, false, VT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<StoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, false, VT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, MachinePointerInfo PtrInfo,
+ EVT SVT, Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
+
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, MemoryLocation::getSizeOrUnknown(SVT.getStoreSize()),
+ Alignment, AAInfo);
+ return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
+}
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, EVT SVT,
+ MachineMemOperand *MMO) {
+ EVT VT = Val.getValueType();
+
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+ if (VT == SVT)
+ return getStore(Chain, dl, Val, Ptr, MMO);
+
+ assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+ "Should only be a truncating store, not extending!");
+ assert(VT.isInteger() == SVT.isInteger() &&
+ "Can't do FP-INT conversion!");
+ assert(VT.isVector() == SVT.isVector() &&
+ "Cannot use trunc store to convert to or from a vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorElementCount() == SVT.getVectorElementCount()) &&
+ "Cannot use trunc store to change the number of vector elements!");
+
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+ ID.AddInteger(SVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<StoreSDNode>(
+ dl.getIROrder(), VTs, ISD::UNINDEXED, true, SVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<StoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, true, SVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
+ assert(ST->getOffset().isUndef() && "Store is already an indexed store!");
+ SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
+ SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+ ID.AddInteger(ST->getMemoryVT().getRawBits());
+ ID.AddInteger(ST->getRawSubclassData());
+ ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+ ID.AddInteger(ST->getMemOperand()->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ ST->isTruncatingStore(), ST->getMemoryVT(),
+ ST->getMemOperand());
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getLoadVP(
+ ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &dl,
+ SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo, EVT MemVT, Align Alignment,
+ MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
+ const MDNode *Ranges, bool IsExpanding) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOLoad;
+ assert((MMOFlags & MachineMemOperand::MOStore) == 0);
+ // If we don't have a PtrInfo, infer the trivial frame index case to simplify
+ // clients.
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
+
+ uint64_t Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
+ Alignment, AAInfo, Ranges);
+ return getLoadVP(AM, ExtType, VT, dl, Chain, Ptr, Offset, Mask, EVL, MemVT,
+ MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getLoadVP(ISD::MemIndexedMode AM,
+ ISD::LoadExtType ExtType, EVT VT,
+ const SDLoc &dl, SDValue Chain, SDValue Ptr,
+ SDValue Offset, SDValue Mask, SDValue EVL,
+ EVT MemVT, MachineMemOperand *MMO,
+ bool IsExpanding) {
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
+
+ SDVTList VTs = Indexed ? getVTList(VT, Ptr.getValueType(), MVT::Other)
+ : getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Chain, Ptr, Offset, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_LOAD, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPLoadSDNode>(
+ dl.getIROrder(), VTs, AM, ExtType, IsExpanding, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPLoadSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<VPLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ ExtType, IsExpanding, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getLoadVP(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo,
+ MaybeAlign Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges,
+ bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ Mask, EVL, PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges,
+ IsExpanding);
+}
+
+SDValue SelectionDAG::getLoadVP(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Mask, SDValue EVL,
+ MachineMemOperand *MMO, bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ Mask, EVL, VT, MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getExtLoadVP(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo, EVT MemVT,
+ MaybeAlign Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoadVP(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, Mask,
+ EVL, PtrInfo, MemVT, Alignment, MMOFlags, AAInfo, nullptr,
+ IsExpanding);
+}
+
+SDValue SelectionDAG::getExtLoadVP(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ SDValue Mask, SDValue EVL, EVT MemVT,
+ MachineMemOperand *MMO, bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoadVP(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, Mask,
+ EVL, MemVT, MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getIndexedLoadVP(SDValue OrigLoad, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ auto *LD = cast<VPLoadSDNode>(OrigLoad);
+ assert(LD->getOffset().isUndef() && "Load is already an indexed load!");
+ // Don't propagate the invariant or dereferenceable flags.
+ auto MMOFlags =
+ LD->getMemOperand()->getFlags() &
+ ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
+ return getLoadVP(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
+ LD->getChain(), Base, Offset, LD->getMask(),
+ LD->getVectorLength(), LD->getPointerInfo(),
+ LD->getMemoryVT(), LD->getAlign(), MMOFlags, LD->getAAInfo(),
+ nullptr, LD->isExpandingLoad());
+}
+
+SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, SDValue Offset, SDValue Mask,
+ SDValue EVL, EVT MemVT, MachineMemOperand *MMO,
+ ISD::MemIndexedMode AM, bool IsTruncating,
+ bool IsCompressing) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) && "Unindexed vp_store with an offset!");
+ SDVTList VTs = Indexed ? getVTList(Ptr.getValueType(), MVT::Other)
+ : getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Val, Ptr, Offset, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPStoreSDNode>(
+ dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPStoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ IsTruncating, IsCompressing, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getTruncStoreVP(SDValue Chain, const SDLoc &dl,
+ SDValue Val, SDValue Ptr, SDValue Mask,
+ SDValue EVL, MachinePointerInfo PtrInfo,
+ EVT SVT, Align Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo,
+ bool IsCompressing) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
+
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, MemoryLocation::getSizeOrUnknown(SVT.getStoreSize()),
+ Alignment, AAInfo);
+ return getTruncStoreVP(Chain, dl, Val, Ptr, Mask, EVL, SVT, MMO,
+ IsCompressing);
+}
+
+SDValue SelectionDAG::getTruncStoreVP(SDValue Chain, const SDLoc &dl,
+ SDValue Val, SDValue Ptr, SDValue Mask,
+ SDValue EVL, EVT SVT,
+ MachineMemOperand *MMO,
+ bool IsCompressing) {
+ EVT VT = Val.getValueType();
+
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ if (VT == SVT)
+ return getStoreVP(Chain, dl, Val, Ptr, getUNDEF(Ptr.getValueType()), Mask,
+ EVL, VT, MMO, ISD::UNINDEXED,
+ /*IsTruncating*/ false, IsCompressing);
+
+ assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+ "Should only be a truncating store, not extending!");
+ assert(VT.isInteger() == SVT.isInteger() && "Can't do FP-INT conversion!");
+ assert(VT.isVector() == SVT.isVector() &&
+ "Cannot use trunc store to convert to or from a vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorElementCount() == SVT.getVectorElementCount()) &&
+ "Cannot use trunc store to change the number of vector elements!");
+
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = {Chain, Val, Ptr, Undef, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
+ ID.AddInteger(SVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPStoreSDNode>(
+ dl.getIROrder(), VTs, ISD::UNINDEXED, true, IsCompressing, SVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPStoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N =
+ newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, true, IsCompressing, SVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getIndexedStoreVP(SDValue OrigStore, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ auto *ST = cast<VPStoreSDNode>(OrigStore);
+ assert(ST->getOffset().isUndef() && "Store is already an indexed store!");
+ SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
+ SDValue Ops[] = {ST->getChain(), ST->getValue(), Base,
+ Offset, ST->getMask(), ST->getVectorLength()};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
+ ID.AddInteger(ST->getMemoryVT().getRawBits());
+ ID.AddInteger(ST->getRawSubclassData());
+ ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+ ID.AddInteger(ST->getMemOperand()->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<VPStoreSDNode>(
+ dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ST->isTruncatingStore(),
+ ST->isCompressingStore(), ST->getMemoryVT(), ST->getMemOperand());
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getStridedLoadVP(
+ ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL,
+ SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask,
+ SDValue EVL, MachinePointerInfo PtrInfo, EVT MemVT, Align Alignment,
+ MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
+ const MDNode *Ranges, bool IsExpanding) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOLoad;
+ assert((MMOFlags & MachineMemOperand::MOStore) == 0);
+ // If we don't have a PtrInfo, infer the trivial frame index case to simplify
+ // clients.
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
+
+ uint64_t Size = MemoryLocation::UnknownSize;
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
+ Alignment, AAInfo, Ranges);
+ return getStridedLoadVP(AM, ExtType, VT, DL, Chain, Ptr, Offset, Stride, Mask,
+ EVL, MemVT, MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getStridedLoadVP(
+ ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, EVT VT, const SDLoc &DL,
+ SDValue Chain, SDValue Ptr, SDValue Offset, SDValue Stride, SDValue Mask,
+ SDValue EVL, EVT MemVT, MachineMemOperand *MMO, bool IsExpanding) {
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
+
+ SDValue Ops[] = {Chain, Ptr, Offset, Stride, Mask, EVL};
+ SDVTList VTs = Indexed ? getVTList(VT, Ptr.getValueType(), MVT::Other)
+ : getVTList(VT, MVT::Other);
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::EXPERIMENTAL_VP_STRIDED_LOAD, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPStridedLoadSDNode>(
+ DL.getIROrder(), VTs, AM, ExtType, IsExpanding, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ cast<VPStridedLoadSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N =
+ newSDNode<VPStridedLoadSDNode>(DL.getIROrder(), DL.getDebugLoc(), VTs, AM,
+ ExtType, IsExpanding, MemVT, MMO);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getStridedLoadVP(
+ EVT VT, const SDLoc &DL, SDValue Chain, SDValue Ptr, SDValue Stride,
+ SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, MaybeAlign Alignment,
+ MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
+ const MDNode *Ranges, bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getStridedLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, DL, Chain, Ptr,
+ Undef, Stride, Mask, EVL, PtrInfo, VT, Alignment,
+ MMOFlags, AAInfo, Ranges, IsExpanding);
+}
+
+SDValue SelectionDAG::getStridedLoadVP(EVT VT, const SDLoc &DL, SDValue Chain,
+ SDValue Ptr, SDValue Stride,
+ SDValue Mask, SDValue EVL,
+ MachineMemOperand *MMO,
+ bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getStridedLoadVP(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, DL, Chain, Ptr,
+ Undef, Stride, Mask, EVL, VT, MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getExtStridedLoadVP(
+ ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain,
+ SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL,
+ MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment,
+ MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
+ bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getStridedLoadVP(ISD::UNINDEXED, ExtType, VT, DL, Chain, Ptr, Undef,
+ Stride, Mask, EVL, PtrInfo, MemVT, Alignment,
+ MMOFlags, AAInfo, nullptr, IsExpanding);
+}
+
+SDValue SelectionDAG::getExtStridedLoadVP(
+ ISD::LoadExtType ExtType, const SDLoc &DL, EVT VT, SDValue Chain,
+ SDValue Ptr, SDValue Stride, SDValue Mask, SDValue EVL, EVT MemVT,
+ MachineMemOperand *MMO, bool IsExpanding) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getStridedLoadVP(ISD::UNINDEXED, ExtType, VT, DL, Chain, Ptr, Undef,
+ Stride, Mask, EVL, MemVT, MMO, IsExpanding);
+}
+
+SDValue SelectionDAG::getIndexedStridedLoadVP(SDValue OrigLoad, const SDLoc &DL,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ auto *SLD = cast<VPStridedLoadSDNode>(OrigLoad);
+ assert(SLD->getOffset().isUndef() &&
+ "Strided load is already a indexed load!");
+ // Don't propagate the invariant or dereferenceable flags.
+ auto MMOFlags =
+ SLD->getMemOperand()->getFlags() &
+ ~(MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
+ return getStridedLoadVP(
+ AM, SLD->getExtensionType(), OrigLoad.getValueType(), DL, SLD->getChain(),
+ Base, Offset, SLD->getStride(), SLD->getMask(), SLD->getVectorLength(),
+ SLD->getPointerInfo(), SLD->getMemoryVT(), SLD->getAlign(), MMOFlags,
+ SLD->getAAInfo(), nullptr, SLD->isExpandingLoad());
+}
+
+SDValue SelectionDAG::getStridedStoreVP(SDValue Chain, const SDLoc &DL,
+ SDValue Val, SDValue Ptr,
+ SDValue Offset, SDValue Stride,
+ SDValue Mask, SDValue EVL, EVT MemVT,
+ MachineMemOperand *MMO,
+ ISD::MemIndexedMode AM,
+ bool IsTruncating, bool IsCompressing) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) && "Unindexed vp_store with an offset!");
+ SDVTList VTs = Indexed ? getVTList(Ptr.getValueType(), MVT::Other)
+ : getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Val, Ptr, Offset, Stride, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::EXPERIMENTAL_VP_STRIDED_STORE, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPStridedStoreSDNode>(
+ DL.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ cast<VPStridedStoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<VPStridedStoreSDNode>(DL.getIROrder(), DL.getDebugLoc(),
+ VTs, AM, IsTruncating,
+ IsCompressing, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getTruncStridedStoreVP(
+ SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Stride,
+ SDValue Mask, SDValue EVL, MachinePointerInfo PtrInfo, EVT SVT,
+ Align Alignment, MachineMemOperand::Flags MMOFlags, const AAMDNodes &AAInfo,
+ bool IsCompressing) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
+
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, MemoryLocation::UnknownSize, Alignment, AAInfo);
+ return getTruncStridedStoreVP(Chain, DL, Val, Ptr, Stride, Mask, EVL, SVT,
+ MMO, IsCompressing);
+}
+
+SDValue SelectionDAG::getTruncStridedStoreVP(SDValue Chain, const SDLoc &DL,
+ SDValue Val, SDValue Ptr,
+ SDValue Stride, SDValue Mask,
+ SDValue EVL, EVT SVT,
+ MachineMemOperand *MMO,
+ bool IsCompressing) {
+ EVT VT = Val.getValueType();
+
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ if (VT == SVT)
+ return getStridedStoreVP(Chain, DL, Val, Ptr, getUNDEF(Ptr.getValueType()),
+ Stride, Mask, EVL, VT, MMO, ISD::UNINDEXED,
+ /*IsTruncating*/ false, IsCompressing);
+
+ assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+ "Should only be a truncating store, not extending!");
+ assert(VT.isInteger() == SVT.isInteger() && "Can't do FP-INT conversion!");
+ assert(VT.isVector() == SVT.isVector() &&
+ "Cannot use trunc store to convert to or from a vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorElementCount() == SVT.getVectorElementCount()) &&
+ "Cannot use trunc store to change the number of vector elements!");
+
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = {Chain, Val, Ptr, Undef, Stride, Mask, EVL};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::EXPERIMENTAL_VP_STRIDED_STORE, VTs, Ops);
+ ID.AddInteger(SVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPStridedStoreSDNode>(
+ DL.getIROrder(), VTs, ISD::UNINDEXED, true, IsCompressing, SVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ cast<VPStridedStoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<VPStridedStoreSDNode>(DL.getIROrder(), DL.getDebugLoc(),
+ VTs, ISD::UNINDEXED, true,
+ IsCompressing, SVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getIndexedStridedStoreVP(SDValue OrigStore,
+ const SDLoc &DL, SDValue Base,
+ SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ auto *SST = cast<VPStridedStoreSDNode>(OrigStore);
+ assert(SST->getOffset().isUndef() &&
+ "Strided store is already an indexed store!");
+ SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
+ SDValue Ops[] = {
+ SST->getChain(), SST->getValue(), Base, Offset, SST->getStride(),
+ SST->getMask(), SST->getVectorLength()};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::EXPERIMENTAL_VP_STRIDED_STORE, VTs, Ops);
+ ID.AddInteger(SST->getMemoryVT().getRawBits());
+ ID.AddInteger(SST->getRawSubclassData());
+ ID.AddInteger(SST->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<VPStridedStoreSDNode>(
+ DL.getIROrder(), DL.getDebugLoc(), VTs, AM, SST->isTruncatingStore(),
+ SST->isCompressingStore(), SST->getMemoryVT(), SST->getMemOperand());
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getGatherVP(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType) {
+ assert(Ops.size() == 6 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_GATHER, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPGatherSDNode>(
+ dl.getIROrder(), VTs, VT, MMO, IndexType));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPGatherSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N = newSDNode<VPGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ VT, MMO, IndexType);
+ createOperands(N, Ops);
+
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getValueType(0).getVectorElementCount() &&
+ "Vector width mismatch between mask and data");
+ assert(N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+ N->getValueType(0).getVectorElementCount().isScalable() &&
+ "Scalable flags of index and data do not match");
+ assert(ElementCount::isKnownGE(
+ N->getIndex().getValueType().getVectorElementCount(),
+ N->getValueType(0).getVectorElementCount()) &&
+ "Vector width mismatch between index and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+ "Scale should be a constant power of 2");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getScatterVP(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType) {
+ assert(Ops.size() == 7 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::VP_SCATTER, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<VPScatterSDNode>(
+ dl.getIROrder(), VTs, VT, MMO, IndexType));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<VPScatterSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<VPScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ VT, MMO, IndexType);
+ createOperands(N, Ops);
+
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getValue().getValueType().getVectorElementCount() &&
+ "Vector width mismatch between mask and data");
+ assert(
+ N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+ N->getValue().getValueType().getVectorElementCount().isScalable() &&
+ "Scalable flags of index and data do not match");
+ assert(ElementCount::isKnownGE(
+ N->getIndex().getValueType().getVectorElementCount(),
+ N->getValue().getValueType().getVectorElementCount()) &&
+ "Vector width mismatch between index and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+ "Scale should be a constant power of 2");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Base, SDValue Offset, SDValue Mask,
+ SDValue PassThru, EVT MemVT,
+ MachineMemOperand *MMO,
+ ISD::MemIndexedMode AM,
+ ISD::LoadExtType ExtTy, bool isExpanding) {
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) &&
+ "Unindexed masked load with an offset!");
+ SDVTList VTs = Indexed ? getVTList(VT, Base.getValueType(), MVT::Other)
+ : getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Chain, Base, Offset, Mask, PassThru};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<MaskedLoadSDNode>(
+ dl.getIROrder(), VTs, AM, ExtTy, isExpanding, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MaskedLoadSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ AM, ExtTy, isExpanding, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getIndexedMaskedLoad(SDValue OrigLoad, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ MaskedLoadSDNode *LD = cast<MaskedLoadSDNode>(OrigLoad);
+ assert(LD->getOffset().isUndef() && "Masked load is already an indexed load!");
+ return getMaskedLoad(OrigLoad.getValueType(), dl, LD->getChain(), Base,
+ Offset, LD->getMask(), LD->getPassThru(),
+ LD->getMemoryVT(), LD->getMemOperand(), AM,
+ LD->getExtensionType(), LD->isExpandingLoad());
+}
+
+SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
+ SDValue Val, SDValue Base, SDValue Offset,
+ SDValue Mask, EVT MemVT,
+ MachineMemOperand *MMO,
+ ISD::MemIndexedMode AM, bool IsTruncating,
+ bool IsCompressing) {
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) &&
+ "Unindexed masked store with an offset!");
+ SDVTList VTs = Indexed ? getVTList(Base.getValueType(), MVT::Other)
+ : getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Val, Base, Offset, Mask};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>(
+ dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N =
+ newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ IsTruncating, IsCompressing, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getIndexedMaskedStore(SDValue OrigStore, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ MaskedStoreSDNode *ST = cast<MaskedStoreSDNode>(OrigStore);
+ assert(ST->getOffset().isUndef() &&
+ "Masked store is already a indexed store!");
+ return getMaskedStore(ST->getChain(), dl, ST->getValue(), Base, Offset,
+ ST->getMask(), ST->getMemoryVT(), ST->getMemOperand(),
+ AM, ST->isTruncatingStore(), ST->isCompressingStore());
+}
+
+SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType,
+ ISD::LoadExtType ExtTy) {
+ assert(Ops.size() == 6 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<MaskedGatherSDNode>(
+ dl.getIROrder(), VTs, MemVT, MMO, IndexType, ExtTy));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MaskedGatherSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VTs, MemVT, MMO, IndexType, ExtTy);
+ createOperands(N, Ops);
+
+ assert(N->getPassThru().getValueType() == N->getValueType(0) &&
+ "Incompatible type of the PassThru value in MaskedGatherSDNode");
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getValueType(0).getVectorElementCount() &&
+ "Vector width mismatch between mask and data");
+ assert(N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+ N->getValueType(0).getVectorElementCount().isScalable() &&
+ "Scalable flags of index and data do not match");
+ assert(ElementCount::isKnownGE(
+ N->getIndex().getValueType().getVectorElementCount(),
+ N->getValueType(0).getVectorElementCount()) &&
+ "Vector width mismatch between index and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+ "Scale should be a constant power of 2");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType,
+ bool IsTrunc) {
+ assert(Ops.size() == 6 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<MaskedScatterSDNode>(
+ dl.getIROrder(), VTs, MemVT, MMO, IndexType, IsTrunc));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MaskedScatterSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VTs, MemVT, MMO, IndexType, IsTrunc);
+ createOperands(N, Ops);
+
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getValue().getValueType().getVectorElementCount() &&
+ "Vector width mismatch between mask and data");
+ assert(
+ N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+ N->getValue().getValueType().getVectorElementCount().isScalable() &&
+ "Scalable flags of index and data do not match");
+ assert(ElementCount::isKnownGE(
+ N->getIndex().getValueType().getVectorElementCount(),
+ N->getValue().getValueType().getVectorElementCount()) &&
+ "Vector width mismatch between index and data");
+ assert(isa<ConstantSDNode>(N->getScale()) &&
+ cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
+ "Scale should be a constant power of 2");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
+ EVT MemVT, MachineMemOperand *MMO) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Ptr};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::GET_FPENV_MEM, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<FPStateAccessSDNode>(
+ ISD::GET_FPENV_MEM, dl.getIROrder(), VTs, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<FPStateAccessSDNode>(ISD::GET_FPENV_MEM, dl.getIROrder(),
+ dl.getDebugLoc(), VTs, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getSetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
+ EVT MemVT, MachineMemOperand *MMO) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Ptr};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::SET_FPENV_MEM, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(getSyntheticNodeSubclassData<FPStateAccessSDNode>(
+ ISD::SET_FPENV_MEM, dl.getIROrder(), VTs, MemVT, MMO));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ ID.AddInteger(MMO->getFlags());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<FPStateAccessSDNode>(ISD::SET_FPENV_MEM, dl.getIROrder(),
+ dl.getDebugLoc(), VTs, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::simplifySelect(SDValue Cond, SDValue T, SDValue F) {
+ // select undef, T, F --> T (if T is a constant), otherwise F
+ // select ?, undef, F --> F
+ // select ?, T, undef --> T
+ if (Cond.isUndef())
+ return isConstantValueOfAnyType(T) ? T : F;
+ if (T.isUndef())
+ return F;
+ if (F.isUndef())
+ return T;
+
+ // select true, T, F --> T
+ // select false, T, F --> F
+ if (auto *CondC = dyn_cast<ConstantSDNode>(Cond))
+ return CondC->isZero() ? F : T;
+
+ // TODO: This should simplify VSELECT with non-zero constant condition using
+ // something like this (but check boolean contents to be complete?):
+ if (ConstantSDNode *CondC = isConstOrConstSplat(Cond, /*AllowUndefs*/ false,
+ /*AllowTruncation*/ true))
+ if (CondC->isZero())
+ return F;
+
+ // select ?, T, T --> T
+ if (T == F)
+ return T;
+
+ return SDValue();
+}
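+// Editorial sketch (not part of the upstream file): how a caller might hit the
+// constant-condition fold above; DAG and DL are assumed to be an in-scope
+// SelectionDAG and SDLoc.
+//
+//   SDValue Cond = DAG.getConstant(1, DL, MVT::i1);   // "true"
+//   SDValue T = DAG.getConstant(42, DL, MVT::i32);
+//   SDValue F = DAG.getConstant(7, DL, MVT::i32);
+//   SDValue Folded = DAG.simplifySelect(Cond, T, F);  // returns T
+//   // An empty SDValue() is returned when no fold applies.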
+
+SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
+ // shift undef, Y --> 0 (can always assume that the undef value is 0)
+ if (X.isUndef())
+ return getConstant(0, SDLoc(X.getNode()), X.getValueType());
+ // shift X, undef --> undef (because it may shift by the bitwidth)
+ if (Y.isUndef())
+ return getUNDEF(X.getValueType());
+
+ // shift 0, Y --> 0
+ // shift X, 0 --> X
+ if (isNullOrNullSplat(X) || isNullOrNullSplat(Y))
+ return X;
+
+ // shift X, C >= bitwidth(X) --> undef
+ // All vector elements must be too big (or undef) to avoid partial undefs.
+ auto isShiftTooBig = [X](ConstantSDNode *Val) {
+ return !Val || Val->getAPIntValue().uge(X.getScalarValueSizeInBits());
+ };
+ if (ISD::matchUnaryPredicate(Y, isShiftTooBig, true))
+ return getUNDEF(X.getValueType());
+
+ return SDValue();
+}
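+// Editorial sketch (not part of the upstream file): the over-wide shift fold
+// above in caller terms, assuming DAG and DL are an in-scope SelectionDAG and
+// SDLoc.
+//
+//   SDValue X = DAG.getConstant(5, DL, MVT::i32);
+//   SDValue BigAmt = DAG.getConstant(32, DL, MVT::i32);  // >= bitwidth of X
+//   SDValue Folded = DAG.simplifyShift(X, BigAmt);       // UNDEF of i32
+//   // shift X, 0 and shift 0, Y are likewise folded to X and 0 respectively.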
+
+SDValue SelectionDAG::simplifyFPBinop(unsigned Opcode, SDValue X, SDValue Y,
+ SDNodeFlags Flags) {
+ // If this operation has 'nnan' or 'ninf' and at least one disallowed operand
+ // (an undef operand can be chosen to be NaN/Inf), then the result of this
+ // operation is poison. That result can be relaxed to undef.
+ ConstantFPSDNode *XC = isConstOrConstSplatFP(X, /* AllowUndefs */ true);
+ ConstantFPSDNode *YC = isConstOrConstSplatFP(Y, /* AllowUndefs */ true);
+ bool HasNan = (XC && XC->getValueAPF().isNaN()) ||
+ (YC && YC->getValueAPF().isNaN());
+ bool HasInf = (XC && XC->getValueAPF().isInfinity()) ||
+ (YC && YC->getValueAPF().isInfinity());
+
+ if (Flags.hasNoNaNs() && (HasNan || X.isUndef() || Y.isUndef()))
+ return getUNDEF(X.getValueType());
+
+ if (Flags.hasNoInfs() && (HasInf || X.isUndef() || Y.isUndef()))
+ return getUNDEF(X.getValueType());
+
+ if (!YC)
+ return SDValue();
+
+ // X + -0.0 --> X
+ if (Opcode == ISD::FADD)
+ if (YC->getValueAPF().isNegZero())
+ return X;
+
+ // X - +0.0 --> X
+ if (Opcode == ISD::FSUB)
+ if (YC->getValueAPF().isPosZero())
+ return X;
+
+ // X * 1.0 --> X
+ // X / 1.0 --> X
+ if (Opcode == ISD::FMUL || Opcode == ISD::FDIV)
+ if (YC->getValueAPF().isExactlyValue(1.0))
+ return X;
+
+ // X * 0.0 --> 0.0
+ if (Opcode == ISD::FMUL && Flags.hasNoNaNs() && Flags.hasNoSignedZeros())
+ if (YC->getValueAPF().isZero())
+ return getConstantFP(0.0, SDLoc(Y), Y.getValueType());
+
+ return SDValue();
+}
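+// Editorial sketch (not part of the upstream file): the identity folds above,
+// assuming DAG and DL are an in-scope SelectionDAG and SDLoc.
+//
+//   SDValue X = DAG.getConstantFP(2.5, DL, MVT::f32);
+//   SDValue NegZero = DAG.getConstantFP(-0.0, DL, MVT::f32);
+//   SDNodeFlags Flags;                                   // no fast-math flags
+//   // X + -0.0 --> X, independent of fast-math flags.
+//   SDValue Folded = DAG.simplifyFPBinop(ISD::FADD, X, NegZero, Flags);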
+
+SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue SV, unsigned Align) {
+ SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) };
+ return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ ArrayRef<SDUse> Ops) {
+ switch (Ops.size()) {
+ case 0: return getNode(Opcode, DL, VT);
+ case 1: return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
+ case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+ case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+ default: break;
+ }
+
+ // Copy from an SDUse array into an SDValue array for use with
+ // the regular getNode logic.
+ SmallVector<SDValue, 8> NewOps(Ops.begin(), Ops.end());
+ return getNode(Opcode, DL, VT, NewOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ ArrayRef<SDValue> Ops) {
+ SDNodeFlags Flags;
+ if (Inserter)
+ Flags = Inserter->getFlags();
+ return getNode(Opcode, DL, VT, Ops, Flags);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
+ unsigned NumOps = Ops.size();
+ switch (NumOps) {
+ case 0: return getNode(Opcode, DL, VT);
+ case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
+ case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
+ case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2], Flags);
+ default: break;
+ }
+
+#ifndef NDEBUG
+ for (const auto &Op : Ops)
+ assert(Op.getOpcode() != ISD::DELETED_NODE &&
+ "Operand is DELETED_NODE!");
+#endif
+
+ switch (Opcode) {
+ default: break;
+ case ISD::BUILD_VECTOR:
+ // Attempt to simplify BUILD_VECTOR.
+ if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+ return V;
+ break;
+ case ISD::CONCAT_VECTORS:
+ if (SDValue V = foldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
+ break;
+ case ISD::SELECT_CC:
+ assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
+ assert(Ops[0].getValueType() == Ops[1].getValueType() &&
+ "LHS and RHS of condition must have same type!");
+ assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+ "True and False arms of SelectCC must have same type!");
+ assert(Ops[2].getValueType() == VT &&
+ "select_cc node must be of same type as true and false value!");
+ assert((!Ops[0].getValueType().isVector() ||
+ Ops[0].getValueType().getVectorElementCount() ==
+ VT.getVectorElementCount()) &&
+ "Expected select_cc with vector result to have the same sized "
+ "comparison type!");
+ break;
+ case ISD::BR_CC:
+ assert(NumOps == 5 && "BR_CC takes 5 operands!");
+ assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+ "LHS/RHS of comparison should match types!");
+ break;
+ case ISD::VP_ADD:
+ case ISD::VP_SUB:
+ // If it is a VP_ADD/VP_SUB mask operation, turn it into VP_XOR.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+ Opcode = ISD::VP_XOR;
+ break;
+ case ISD::VP_MUL:
+ // If it is a VP_MUL mask operation, turn it into VP_AND.
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+ Opcode = ISD::VP_AND;
+ break;
+ case ISD::VP_REDUCE_MUL:
+ // If it is a VP_REDUCE_MUL mask operation, turn it into VP_REDUCE_AND.
+ if (VT == MVT::i1)
+ Opcode = ISD::VP_REDUCE_AND;
+ break;
+ case ISD::VP_REDUCE_ADD:
+ // If it is a VP_REDUCE_ADD mask operation, turn it into VP_REDUCE_XOR.
+ if (VT == MVT::i1)
+ Opcode = ISD::VP_REDUCE_XOR;
+ break;
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_UMIN:
+ // If it is a VP_REDUCE_SMAX/VP_REDUCE_UMIN mask operation, turn it into
+ // VP_REDUCE_AND.
+ if (VT == MVT::i1)
+ Opcode = ISD::VP_REDUCE_AND;
+ break;
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ // If it is a VP_REDUCE_SMIN/VP_REDUCE_UMAX mask operation, turn it into
+ // VP_REDUCE_OR.
+ if (VT == MVT::i1)
+ Opcode = ISD::VP_REDUCE_OR;
+ break;
+ }
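+ // Editorial note (not part of the upstream file): the i1 canonicalizations in
+ // the switch above follow from mod-2 / boolean arithmetic. For i1 values,
+ // a + b == a - b == a ^ b (VP_ADD/VP_SUB -> VP_XOR) and a * b == a & b
+ // (VP_MUL -> VP_AND); an smax/umin reduction is true only when all lanes are
+ // true (-> VP_REDUCE_AND), and an smin/umax reduction is true when any lane
+ // is true (-> VP_REDUCE_OR).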
+
+ // Memoize nodes.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+
+ if (VT != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ void *IP = nullptr;
+
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ }
+
+ N->setFlags(Flags);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+ ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
+ return getNode(Opcode, DL, getVTList(ResultTys), Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ ArrayRef<SDValue> Ops) {
+ SDNodeFlags Flags;
+ if (Inserter)
+ Flags = Inserter->getFlags();
+ return getNode(Opcode, DL, VTList, Ops, Flags);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
+ if (VTList.NumVTs == 1)
+ return getNode(Opcode, DL, VTList.VTs[0], Ops, Flags);
+
+#ifndef NDEBUG
+ for (const auto &Op : Ops)
+ assert(Op.getOpcode() != ISD::DELETED_NODE &&
+ "Operand is DELETED_NODE!");
+#endif
+
+ switch (Opcode) {
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO: {
+ assert(VTList.NumVTs == 2 && Ops.size() == 2 &&
+ "Invalid add/sub overflow op!");
+ assert(VTList.VTs[0].isInteger() && VTList.VTs[1].isInteger() &&
+ Ops[0].getValueType() == Ops[1].getValueType() &&
+ Ops[0].getValueType() == VTList.VTs[0] &&
+ "Binary operator types must match!");
+ SDValue N1 = Ops[0], N2 = Ops[1];
+ canonicalizeCommutativeBinop(Opcode, N1, N2);
+
+ // (X +- 0) -> X with zero-overflow.
+ ConstantSDNode *N2CV = isConstOrConstSplat(N2, /*AllowUndefs*/ false,
+ /*AllowTruncation*/ true);
+ if (N2CV && N2CV->isZero()) {
+ SDValue ZeroOverFlow = getConstant(0, DL, VTList.VTs[1]);
+ return getNode(ISD::MERGE_VALUES, DL, VTList, {N1, ZeroOverFlow}, Flags);
+ }
+ break;
+ }
+ case ISD::SMUL_LOHI:
+ case ISD::UMUL_LOHI: {
+ assert(VTList.NumVTs == 2 && Ops.size() == 2 && "Invalid mul lo/hi op!");
+ assert(VTList.VTs[0].isInteger() && VTList.VTs[0] == VTList.VTs[1] &&
+ VTList.VTs[0] == Ops[0].getValueType() &&
+ VTList.VTs[0] == Ops[1].getValueType() &&
+ "Binary operator types must match!");
+ break;
+ }
+ case ISD::FFREXP: {
+ assert(VTList.NumVTs == 2 && Ops.size() == 1 && "Invalid ffrexp op!");
+ assert(VTList.VTs[0].isFloatingPoint() && VTList.VTs[1].isInteger() &&
+ VTList.VTs[0] == Ops[0].getValueType() && "frexp type mismatch");
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Ops[0])) {
+ int FrexpExp;
+ APFloat FrexpMant =
+ frexp(C->getValueAPF(), FrexpExp, APFloat::rmNearestTiesToEven);
+ SDValue Result0 = getConstantFP(FrexpMant, DL, VTList.VTs[0]);
+ SDValue Result1 =
+ getConstant(FrexpMant.isFinite() ? FrexpExp : 0, DL, VTList.VTs[1]);
+ return getNode(ISD::MERGE_VALUES, DL, VTList, {Result0, Result1}, Flags);
+ }
+
+ break;
+ }
+ case ISD::STRICT_FP_EXTEND:
+ assert(VTList.NumVTs == 2 && Ops.size() == 2 &&
+ "Invalid STRICT_FP_EXTEND!");
+ assert(VTList.VTs[0].isFloatingPoint() &&
+ Ops[1].getValueType().isFloatingPoint() && "Invalid FP cast!");
+ assert(VTList.VTs[0].isVector() == Ops[1].getValueType().isVector() &&
+ "STRICT_FP_EXTEND result type should be vector iff the operand "
+ "type is vector!");
+ assert((!VTList.VTs[0].isVector() ||
+ VTList.VTs[0].getVectorElementCount() ==
+ Ops[1].getValueType().getVectorElementCount()) &&
+ "Vector element count mismatch!");
+ assert(Ops[1].getValueType().bitsLT(VTList.VTs[0]) &&
+ "Invalid fpext node, dst <= src!");
+ break;
+ case ISD::STRICT_FP_ROUND:
+ assert(VTList.NumVTs == 2 && Ops.size() == 3 && "Invalid STRICT_FP_ROUND!");
+ assert(VTList.VTs[0].isVector() == Ops[1].getValueType().isVector() &&
+ "STRICT_FP_ROUND result type should be vector iff the operand "
+ "type is vector!");
+ assert((!VTList.VTs[0].isVector() ||
+ VTList.VTs[0].getVectorElementCount() ==
+ Ops[1].getValueType().getVectorElementCount()) &&
+ "Vector element count mismatch!");
+ assert(VTList.VTs[0].isFloatingPoint() &&
+ Ops[1].getValueType().isFloatingPoint() &&
+ VTList.VTs[0].bitsLT(Ops[1].getValueType()) &&
+ isa<ConstantSDNode>(Ops[2]) &&
+ (cast<ConstantSDNode>(Ops[2])->getZExtValue() == 0 ||
+ cast<ConstantSDNode>(Ops[2])->getZExtValue() == 1) &&
+ "Invalid STRICT_FP_ROUND!");
+ break;
+#if 0
+ // FIXME: figure out how to safely handle things like
+ // int foo(int x) { return 1 << (x & 255); }
+ // int bar() { return foo(256); }
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS:
+ case ISD::SHL_PARTS:
+ if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
+ return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+ else if (N3.getOpcode() == ISD::AND)
+ if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
+ // If the and is only masking out bits that cannot affect the shift,
+ // eliminate the and.
+ unsigned NumBits = VT.getScalarSizeInBits()*2;
+ if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
+ return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+ }
+ break;
+#endif
+ }
+
+ // Memoize the node unless it returns a flag.
+ SDNode *N;
+ if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
+ createOperands(N, Ops);
+ }
+
+ N->setFlags(Flags);
+ InsertNode(N);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTList) {
+ return getNode(Opcode, DL, VTList, std::nullopt);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1) {
+ SDValue Ops[] = { N1 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2) {
+ SDValue Ops[] = { N1, N2 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3) {
+ SDValue Ops[] = { N1, N2, N3 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
+ SDValue Ops[] = { N1, N2, N3, N4 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4,
+ SDValue N5) {
+ SDValue Ops[] = { N1, N2, N3, N4, N5 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT) {
+ return makeVTList(SDNode::getValueTypeList(VT), 1);
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(2U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+
+ void *IP = nullptr;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (!Result) {
+ EVT *Array = Allocator.Allocate<EVT>(2);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(3U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+ ID.AddInteger(VT3.getRawBits());
+
+ void *IP = nullptr;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (!Result) {
+ EVT *Array = Allocator.Allocate<EVT>(3);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Array[2] = VT3;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(4U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+ ID.AddInteger(VT3.getRawBits());
+ ID.AddInteger(VT4.getRawBits());
+
+ void *IP = nullptr;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (!Result) {
+ EVT *Array = Allocator.Allocate<EVT>(4);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Array[2] = VT3;
+ Array[3] = VT4;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
+}
+
+SDVTList SelectionDAG::getVTList(ArrayRef<EVT> VTs) {
+ unsigned NumVTs = VTs.size();
+ FoldingSetNodeID ID;
+ ID.AddInteger(NumVTs);
+ for (unsigned index = 0; index < NumVTs; index++) {
+ ID.AddInteger(VTs[index].getRawBits());
+ }
+
+ void *IP = nullptr;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (!Result) {
+ EVT *Array = Allocator.Allocate<EVT>(NumVTs);
+ llvm::copy(VTs, Array);
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
+}
+
+
+/// UpdateNodeOperands - *Mutate* the specified node in-place to have the
+/// specified operands. If the resultant node already exists in the DAG,
+/// this does not modify the specified node, instead it returns the node that
+/// already exists. If the resultant node does not exist in the DAG, the
+/// input node is returned. As a degenerate case, if you specify the same
+/// input operands as the node already has, the input node is returned.
+SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
+ assert(N->getNumOperands() == 1 && "Update with wrong number of operands");
+
+ // Check to see if there is no change.
+ if (Op == N->getOperand(0)) return N;
+
+ // See if the modified node already exists.
+ void *InsertPos = nullptr;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos))
+ return Existing;
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = nullptr;
+
+ // Now we update the operands.
+ N->OperandList[0].set(Op);
+
+ updateDivergence(N);
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return N;
+}
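+// Editorial sketch (not part of the upstream file): because UpdateNodeOperands
+// may CSE into a pre-existing node, callers should use its return value rather
+// than assume N was mutated in place; N and NewChain are hypothetical values.
+//
+//   SDNode *Updated = DAG.UpdateNodeOperands(N, NewChain);
+//   if (Updated != N)
+//     ; // an equivalent node already existed and N itself was left unchanged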
+
+SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
+ assert(N->getNumOperands() == 2 && "Update with wrong number of operands");
+
+ // Check to see if there is no change.
+ if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
+ return N; // No operands changed, just return the input node.
+
+ // See if the modified node already exists.
+ void *InsertPos = nullptr;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
+ return Existing;
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = nullptr;
+
+ // Now we update the operands.
+ if (N->OperandList[0] != Op1)
+ N->OperandList[0].set(Op1);
+ if (N->OperandList[1] != Op2)
+ N->OperandList[1].set(Op2);
+
+ updateDivergence(N);
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return N;
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) {
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return UpdateNodeOperands(N, Ops);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+ SDValue Op3, SDValue Op4) {
+ SDValue Ops[] = { Op1, Op2, Op3, Op4 };
+ return UpdateNodeOperands(N, Ops);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+ SDValue Op3, SDValue Op4, SDValue Op5) {
+ SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 };
+ return UpdateNodeOperands(N, Ops);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) {
+ unsigned NumOps = Ops.size();
+ assert(N->getNumOperands() == NumOps &&
+ "Update with wrong number of operands");
+
+ // If no operands changed just return the input node.
+ if (std::equal(Ops.begin(), Ops.end(), N->op_begin()))
+ return N;
+
+ // See if the modified node already exists.
+ void *InsertPos = nullptr;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, InsertPos))
+ return Existing;
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = nullptr;
+
+ // Now we update the operands.
+ for (unsigned i = 0; i != NumOps; ++i)
+ if (N->OperandList[i] != Ops[i])
+ N->OperandList[i].set(Ops[i]);
+
+ updateDivergence(N);
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return N;
+}
+
+/// DropOperands - Release the operands and set this node to have
+/// zero operands.
+void SDNode::DropOperands() {
+ // Unlike the code in MorphNodeTo that does this, we don't need to
+ // watch for dead nodes here.
+ for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ Use.set(SDValue());
+ }
+}
+
+void SelectionDAG::setNodeMemRefs(MachineSDNode *N,
+ ArrayRef<MachineMemOperand *> NewMemRefs) {
+ if (NewMemRefs.empty()) {
+ N->clearMemRefs();
+ return;
+ }
+
+ // Check if we can avoid allocating by storing a single reference directly.
+ if (NewMemRefs.size() == 1) {
+ N->MemRefs = NewMemRefs[0];
+ N->NumMemRefs = 1;
+ return;
+ }
+
+ MachineMemOperand **MemRefsBuffer =
+ Allocator.template Allocate<MachineMemOperand *>(NewMemRefs.size());
+ llvm::copy(NewMemRefs, MemRefsBuffer);
+ N->MemRefs = MemRefsBuffer;
+ N->NumMemRefs = static_cast<int>(NewMemRefs.size());
+}
+
+/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
+/// machine opcode.
+///
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT) {
+ SDVTList VTs = getVTList(VT);
+ return SelectNodeTo(N, MachineOpc, VTs, std::nullopt);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT, SDValue Op1) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT, SDValue Op1,
+ SDValue Op2) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT, SDValue Op1,
+ SDValue Op2, SDValue Op3) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT, ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2, ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return SelectNodeTo(N, MachineOpc, VTs, std::nullopt);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2, EVT VT3,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2,
+ SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ SDVTList VTs, ArrayRef<SDValue> Ops) {
+ SDNode *New = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
+ // Reset the NodeID to -1.
+ New->setNodeId(-1);
+ if (New != N) {
+ ReplaceAllUsesWith(N, New);
+ RemoveDeadNode(N);
+ }
+ return New;
+}
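+
+// Illustrative sketch, not part of the upstream source: during instruction
+// selection a target typically morphs an ISD node into a node carrying a
+// machine opcode, in place. 'TargetOpc' stands for some target-defined
+// machine opcode, and N is assumed to be a two-operand node; the helper
+// itself is hypothetical.
+[[maybe_unused]] static SDNode *exampleSelectBinOp(SelectionDAG &DAG,
+ SDNode *N, unsigned TargetOpc) {
+ // Reuses N's storage when possible; otherwise an equivalent pre-existing
+ // node is returned and N is removed from the DAG.
+ return DAG.SelectNodeTo(N, TargetOpc, N->getValueType(0), N->getOperand(0),
+ N->getOperand(1));
+}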
+
+/// UpdateSDLocOnMergeSDNode - If the opt level is -O0 then it throws away
+/// the line number information on the merged node since it is not possible to
+/// preserve the information that the operation is associated with multiple
+/// lines. This will make the debugger work better at -O0, where there is a
+/// higher probability of having other instructions associated with that line.
+///
+/// For IROrder, we keep the smaller of the two
+SDNode *SelectionDAG::UpdateSDLocOnMergeSDNode(SDNode *N, const SDLoc &OLoc) {
+ DebugLoc NLoc = N->getDebugLoc();
+ if (NLoc && OptLevel == CodeGenOpt::None && OLoc.getDebugLoc() != NLoc) {
+ N->setDebugLoc(DebugLoc());
+ }
+ unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder());
+ N->setIROrder(Order);
+ return N;
+}
+
+/// MorphNodeTo - This *mutates* the specified node to have the specified
+/// return type, opcode, and operands.
+///
+/// Note that MorphNodeTo returns the resultant node. If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one. Note that the SDLoc need not be the same.
+///
+/// Using MorphNodeTo is faster than creating a new node and swapping it in
+/// with ReplaceAllUsesWith both because it often avoids allocating a new
+/// node, and because it doesn't require CSE recalculation for any of
+/// the node's users.
+///
+/// However, note that MorphNodeTo recursively deletes dead nodes from the DAG.
+/// As a consequence it isn't appropriate to use from within the DAG combiner or
+/// the legalizer which maintain worklists that would need to be updated when
+/// deleting things.
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ SDVTList VTs, ArrayRef<SDValue> Ops) {
+ // If an identical node already exists, use it.
+ void *IP = nullptr;
+ if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, VTs, Ops);
+ if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP))
+ return UpdateSDLocOnMergeSDNode(ON, SDLoc(N));
+ }
+
+ if (!RemoveNodeFromCSEMaps(N))
+ IP = nullptr;
+
+ // Start the morphing.
+ N->NodeType = Opc;
+ N->ValueList = VTs.VTs;
+ N->NumValues = VTs.NumVTs;
+
+ // Clear the operands list, updating used nodes to remove this from their
+ // use list. Keep track of any operands that become dead as a result.
+ SmallPtrSet<SDNode*, 16> DeadNodeSet;
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ SDNode *Used = Use.getNode();
+ Use.set(SDValue());
+ if (Used->use_empty())
+ DeadNodeSet.insert(Used);
+ }
+
+ // For MachineNode, initialize the memory references information.
+ if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N))
+ MN->clearMemRefs();
+
+ // Swap for an appropriately sized array from the recycler.
+ removeOperands(N);
+ createOperands(N, Ops);
+
+ // Delete any nodes that are still dead after adding the uses for the
+ // new operands.
+ if (!DeadNodeSet.empty()) {
+ SmallVector<SDNode *, 16> DeadNodes;
+ for (SDNode *N : DeadNodeSet)
+ if (N->use_empty())
+ DeadNodes.push_back(N);
+ RemoveDeadNodes(DeadNodes);
+ }
+
+ if (IP)
+ CSEMap.InsertNode(N, IP); // Memoize the new node.
+ return N;
+}
+
+SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
+ unsigned OrigOpc = Node->getOpcode();
+ unsigned NewOpc;
+ switch (OrigOpc) {
+ default:
+ llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case ISD::STRICT_##DAGN: NewOpc = ISD::DAGN; break;
+#define CMP_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case ISD::STRICT_##DAGN: NewOpc = ISD::SETCC; break;
+#include "llvm/IR/ConstrainedOps.def"
+ }
+
+ assert(Node->getNumValues() == 2 && "Unexpected number of results!");
+
+ // We're taking this node out of the chain, so we need to re-link things.
+ SDValue InputChain = Node->getOperand(0);
+ SDValue OutputChain = SDValue(Node, 1);
+ ReplaceAllUsesOfValueWith(OutputChain, InputChain);
+
+ SmallVector<SDValue, 3> Ops;
+ for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
+ Ops.push_back(Node->getOperand(i));
+
+ SDVTList VTs = getVTList(Node->getValueType(0));
+ SDNode *Res = MorphNodeTo(Node, NewOpc, VTs, Ops);
+
+ // MorphNodeTo can operate in two ways: if an existing node with the
+ // specified operands exists, it can just return it. Otherwise, it
+ // updates the node in place to have the requested operands.
+ if (Res == Node) {
+ // If we updated the node in place, reset the node ID. To the isel,
+ // this should be just like a newly allocated machine node.
+ Res->setNodeId(-1);
+ } else {
+ ReplaceAllUsesWith(Node, Res);
+ RemoveDeadNode(Node);
+ }
+
+ return Res;
+}
+
+/// getMachineNode - These are used for target selectors to create a new node
+/// with specified return type(s), MachineInstr opcode, and operands.
+///
+/// Note that getMachineNode returns the resultant node. If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one.
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT) {
+ SDVTList VTs = getVTList(VT);
+ return getMachineNode(Opcode, dl, VTs, std::nullopt);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, SDValue Op1,
+ SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, SDValue Op1,
+ SDValue Op2, SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ SDValue Ops[] = { Op1, Op2 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ ArrayRef<EVT> ResultTys,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(ResultTys);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTs,
+ ArrayRef<SDValue> Ops) {
+ bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue;
+ MachineSDNode *N;
+ void *IP = nullptr;
+
+ if (DoCSE) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ~Opcode, VTs, Ops);
+ IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ return cast<MachineSDNode>(UpdateSDLocOnMergeSDNode(E, DL));
+ }
+ }
+
+ // Allocate a new MachineSDNode.
+ N = newSDNode<MachineSDNode>(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+
+ if (DoCSE)
+ CSEMap.InsertNode(N, IP);
+
+ InsertNode(N);
+ NewSDValueDbgMsg(SDValue(N, 0), "Creating new machine node: ", this);
+ return N;
+}
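+
+// Illustrative sketch, not part of the upstream source: building a machine
+// node directly, here a 32-bit IMPLICIT_DEF. Targets usually do this from
+// custom lowering or selection code; the helper name is hypothetical.
+[[maybe_unused]] static SDValue exampleImplicitDef(SelectionDAG &DAG,
+ const SDLoc &DL) {
+ MachineSDNode *Def =
+ DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32);
+ return SDValue(Def, 0);
+}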
+
+/// getTargetExtractSubreg - A convenience function for creating
+/// TargetOpcode::EXTRACT_SUBREG nodes.
+SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+ SDValue Operand) {
+ SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32);
+ SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ VT, Operand, SRIdxVal);
+ return SDValue(Subreg, 0);
+}
+
+/// getTargetInsertSubreg - A convenience function for creating
+/// TargetOpcode::INSERT_SUBREG nodes.
+SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+ SDValue Operand, SDValue Subreg) {
+ SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32);
+ SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+ VT, Operand, Subreg, SRIdxVal);
+ return SDValue(Result, 0);
+}
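+
+// Illustrative sketch, not part of the upstream source: extracting a
+// subregister value and inserting it back into a wider value. 'SubRegIdx'
+// stands for a target-defined subregister index (for example a "low 32 bits"
+// index); the concrete index and the helper are hypothetical.
+[[maybe_unused]] static SDValue exampleRoundTripSubreg(SelectionDAG &DAG,
+ const SDLoc &DL,
+ SDValue Wide64,
+ int SubRegIdx) {
+ SDValue Low32 = DAG.getTargetExtractSubreg(SubRegIdx, DL, MVT::i32, Wide64);
+ return DAG.getTargetInsertSubreg(SubRegIdx, DL, MVT::i64, Wide64, Low32);
+}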
+
+/// getNodeIfExists - Get the specified node if it's already available, or
+/// else return NULL.
+SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
+ ArrayRef<SDValue> Ops) {
+ SDNodeFlags Flags;
+ if (Inserter)
+ Flags = Inserter->getFlags();
+ return getNodeIfExists(Opcode, VTList, Ops, Flags);
+}
+
+SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
+ ArrayRef<SDValue> Ops,
+ const SDNodeFlags Flags) {
+ if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) {
+ E->intersectFlagsWith(Flags);
+ return E;
+ }
+ }
+ return nullptr;
+}
+
+/// doesNodeExist - Check if a node exists without modifying its flags.
+bool SelectionDAG::doesNodeExist(unsigned Opcode, SDVTList VTList,
+ ArrayRef<SDValue> Ops) {
+ if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ void *IP = nullptr;
+ if (FindNodeOrInsertPos(ID, SDLoc(), IP))
+ return true;
+ }
+ return false;
+}
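+
+// Illustrative sketch, not part of the upstream source: a combine can probe
+// the CSE maps before committing to a transform, for example only
+// reassociating when the intermediate add already exists in the DAG.
+[[maybe_unused]] static bool exampleWouldReuseAdd(SelectionDAG &DAG, EVT VT,
+ SDValue A, SDValue B) {
+ SDValue Ops[] = {A, B};
+ return DAG.doesNodeExist(ISD::ADD, DAG.getVTList(VT), Ops);
+}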
+
+/// getDbgValue - Creates an SDDbgValue node.
+///
+/// SDNode
+SDDbgValue *SelectionDAG::getDbgValue(DIVariable *Var, DIExpression *Expr,
+ SDNode *N, unsigned R, bool IsIndirect,
+ const DebugLoc &DL, unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc())
+ SDDbgValue(DbgInfo->getAlloc(), Var, Expr, SDDbgOperand::fromNode(N, R),
+ {}, IsIndirect, DL, O,
+ /*IsVariadic=*/false);
+}
+
+/// Constant
+SDDbgValue *SelectionDAG::getConstantDbgValue(DIVariable *Var,
+ DIExpression *Expr,
+ const Value *C,
+ const DebugLoc &DL, unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc())
+ SDDbgValue(DbgInfo->getAlloc(), Var, Expr, SDDbgOperand::fromConst(C), {},
+ /*IsIndirect=*/false, DL, O,
+ /*IsVariadic=*/false);
+}
+
+/// FrameIndex
+SDDbgValue *SelectionDAG::getFrameIndexDbgValue(DIVariable *Var,
+ DIExpression *Expr, unsigned FI,
+ bool IsIndirect,
+ const DebugLoc &DL,
+ unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return getFrameIndexDbgValue(Var, Expr, FI, {}, IsIndirect, DL, O);
+}
+
+/// FrameIndex with dependencies
+SDDbgValue *SelectionDAG::getFrameIndexDbgValue(DIVariable *Var,
+ DIExpression *Expr, unsigned FI,
+ ArrayRef<SDNode *> Dependencies,
+ bool IsIndirect,
+ const DebugLoc &DL,
+ unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc())
+ SDDbgValue(DbgInfo->getAlloc(), Var, Expr, SDDbgOperand::fromFrameIdx(FI),
+ Dependencies, IsIndirect, DL, O,
+ /*IsVariadic=*/false);
+}
+
+/// VReg
+SDDbgValue *SelectionDAG::getVRegDbgValue(DIVariable *Var, DIExpression *Expr,
+ unsigned VReg, bool IsIndirect,
+ const DebugLoc &DL, unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc())
+ SDDbgValue(DbgInfo->getAlloc(), Var, Expr, SDDbgOperand::fromVReg(VReg),
+ {}, IsIndirect, DL, O,
+ /*IsVariadic=*/false);
+}
+
+SDDbgValue *SelectionDAG::getDbgValueList(DIVariable *Var, DIExpression *Expr,
+ ArrayRef<SDDbgOperand> Locs,
+ ArrayRef<SDNode *> Dependencies,
+ bool IsIndirect, const DebugLoc &DL,
+ unsigned O, bool IsVariadic) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc())
+ SDDbgValue(DbgInfo->getAlloc(), Var, Expr, Locs, Dependencies, IsIndirect,
+ DL, O, IsVariadic);
+}
+
+void SelectionDAG::transferDbgValues(SDValue From, SDValue To,
+ unsigned OffsetInBits, unsigned SizeInBits,
+ bool InvalidateDbg) {
+ SDNode *FromNode = From.getNode();
+ SDNode *ToNode = To.getNode();
+ assert(FromNode && ToNode && "Can't modify dbg values");
+
+ // PR35338
+ // TODO: assert(From != To && "Redundant dbg value transfer");
+ // TODO: assert(FromNode != ToNode && "Intranode dbg value transfer");
+ if (From == To || FromNode == ToNode)
+ return;
+
+ if (!FromNode->getHasDebugValue())
+ return;
+
+ SDDbgOperand FromLocOp =
+ SDDbgOperand::fromNode(From.getNode(), From.getResNo());
+ SDDbgOperand ToLocOp = SDDbgOperand::fromNode(To.getNode(), To.getResNo());
+
+ SmallVector<SDDbgValue *, 2> ClonedDVs;
+ for (SDDbgValue *Dbg : GetDbgValues(FromNode)) {
+ if (Dbg->isInvalidated())
+ continue;
+
+ // TODO: assert(!Dbg->isInvalidated() && "Transfer of invalid dbg value");
+
+ // Create a new location ops vector that is equal to the old vector, but
+ // with each instance of FromLocOp replaced with ToLocOp.
+ bool Changed = false;
+ auto NewLocOps = Dbg->copyLocationOps();
+ std::replace_if(
+ NewLocOps.begin(), NewLocOps.end(),
+ [&Changed, FromLocOp](const SDDbgOperand &Op) {
+ bool Match = Op == FromLocOp;
+ Changed |= Match;
+ return Match;
+ },
+ ToLocOp);
+ // Ignore this SDDbgValue if we didn't find a matching location.
+ if (!Changed)
+ continue;
+
+ DIVariable *Var = Dbg->getVariable();
+ auto *Expr = Dbg->getExpression();
+ // If a fragment is requested, update the expression.
+ if (SizeInBits) {
+ // When splitting a larger (e.g., sign-extended) value whose
+ // lower bits are described with an SDDbgValue, do not attempt
+ // to transfer the SDDbgValue to the upper bits.
+ if (auto FI = Expr->getFragmentInfo())
+ if (OffsetInBits + SizeInBits > FI->SizeInBits)
+ continue;
+ auto Fragment = DIExpression::createFragmentExpression(Expr, OffsetInBits,
+ SizeInBits);
+ if (!Fragment)
+ continue;
+ Expr = *Fragment;
+ }
+
+ auto AdditionalDependencies = Dbg->getAdditionalDependencies();
+ // Clone the SDDbgValue and move it to To.
+ SDDbgValue *Clone = getDbgValueList(
+ Var, Expr, NewLocOps, AdditionalDependencies, Dbg->isIndirect(),
+ Dbg->getDebugLoc(), std::max(ToNode->getIROrder(), Dbg->getOrder()),
+ Dbg->isVariadic());
+ ClonedDVs.push_back(Clone);
+
+ if (InvalidateDbg) {
+ // Invalidate value and indicate the SDDbgValue should not be emitted.
+ Dbg->setIsInvalidated();
+ Dbg->setIsEmitted();
+ }
+ }
+
+ for (SDDbgValue *Dbg : ClonedDVs) {
+ assert(is_contained(Dbg->getSDNodes(), ToNode) &&
+ "Transferred DbgValues should depend on the new SDNode");
+ AddDbgValue(Dbg, false);
+ }
+}
+
+void SelectionDAG::salvageDebugInfo(SDNode &N) {
+ if (!N.getHasDebugValue())
+ return;
+
+ SmallVector<SDDbgValue *, 2> ClonedDVs;
+ for (auto *DV : GetDbgValues(&N)) {
+ if (DV->isInvalidated())
+ continue;
+ switch (N.getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ if (!isa<ConstantSDNode>(N0) && isa<ConstantSDNode>(N1)) {
+ uint64_t Offset = N.getConstantOperandVal(1);
+
+ // Rewrite an ADD constant node into a DIExpression. Since we are
+ // performing arithmetic to compute the variable's *value* in the
+ // DIExpression, we need to mark the expression with a
+ // DW_OP_stack_value.
+ auto *DIExpr = DV->getExpression();
+ auto NewLocOps = DV->copyLocationOps();
+ bool Changed = false;
+ for (size_t i = 0; i < NewLocOps.size(); ++i) {
+ // We're not given a ResNo to compare against because the whole
+ // node is going away. We know that any ISD::ADD only has one
+ // result, so we can assume any node match is using the result.
+ if (NewLocOps[i].getKind() != SDDbgOperand::SDNODE ||
+ NewLocOps[i].getSDNode() != &N)
+ continue;
+ NewLocOps[i] = SDDbgOperand::fromNode(N0.getNode(), N0.getResNo());
+ SmallVector<uint64_t, 3> ExprOps;
+ DIExpression::appendOffset(ExprOps, Offset);
+ DIExpr = DIExpression::appendOpsToArg(DIExpr, ExprOps, i, true);
+ Changed = true;
+ }
+ (void)Changed;
+ assert(Changed && "Salvage target doesn't use N");
+
+ auto AdditionalDependencies = DV->getAdditionalDependencies();
+ SDDbgValue *Clone = getDbgValueList(DV->getVariable(), DIExpr,
+ NewLocOps, AdditionalDependencies,
+ DV->isIndirect(), DV->getDebugLoc(),
+ DV->getOrder(), DV->isVariadic());
+ ClonedDVs.push_back(Clone);
+ DV->setIsInvalidated();
+ DV->setIsEmitted();
+ LLVM_DEBUG(dbgs() << "SALVAGE: Rewriting";
+ N0.getNode()->dumprFull(this);
+ dbgs() << " into " << *DIExpr << '\n');
+ }
+ }
+ }
+
+ for (SDDbgValue *Dbg : ClonedDVs) {
+ assert(!Dbg->getSDNodes().empty() &&
+ "Salvaged DbgValue should depend on a new SDNode");
+ AddDbgValue(Dbg, false);
+ }
+}
+
+/// Creates a SDDbgLabel node.
+SDDbgLabel *SelectionDAG::getDbgLabel(DILabel *Label,
+ const DebugLoc &DL, unsigned O) {
+ assert(cast<DILabel>(Label)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc()) SDDbgLabel(Label, DL, O);
+}
+
+namespace {
+
+/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
+/// pointed to by a use iterator is deleted, increment the use iterator
+/// so that it doesn't dangle.
+///
+class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
+ SDNode::use_iterator &UI;
+ SDNode::use_iterator &UE;
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ // Increment the iterator as needed.
+ while (UI != UE && N == *UI)
+ ++UI;
+ }
+
+public:
+ RAUWUpdateListener(SelectionDAG &d,
+ SDNode::use_iterator &ui,
+ SDNode::use_iterator &ue)
+ : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
+};
+
+} // end anonymous namespace
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes From has a single result value.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
+ SDNode *From = FromN.getNode();
+ assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
+ "Cannot replace with this method!");
+ assert(From != To.getNode() && "Cannot replace uses of with self");
+
+ // Preserve Debug Values
+ transferDbgValues(FromN, To);
+ // Preserve extra info.
+ copyExtraInfo(From, To.getNode());
+
+ // Iterate over all the existing uses of From. New uses will be added
+ // to the beginning of the use list, which we avoid visiting.
+ // This specifically avoids visiting uses of From that arise while the
+ // replacement is happening, because any such uses would be the result
+ // of CSE: If an existing node looks like From after one of its operands
+ // is replaced by To, we don't want to replace all of its users with To
+ // as well. See PR3018 for more info.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ RAUWUpdateListener Listener(*this, UI, UE);
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+ ++UI;
+ Use.set(To);
+ if (To->isDivergent() != From->isDivergent())
+ updateDivergence(User);
+ } while (UI != UE && *UI == User);
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+
+ // If we just RAUW'd the root, take note.
+ if (FromN == getRoot())
+ setRoot(To);
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes that for each value of From, there is a
+/// corresponding value in To in the same position with the same type.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
+#ifndef NDEBUG
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+ assert((!From->hasAnyUseOfValue(i) ||
+ From->getValueType(i) == To->getValueType(i)) &&
+ "Cannot use this version of ReplaceAllUsesWith!");
+#endif
+
+ // Handle the trivial case.
+ if (From == To)
+ return;
+
+ // Preserve Debug Info. Only do this if there's a use.
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+ if (From->hasAnyUseOfValue(i)) {
+ assert((i < To->getNumValues()) && "Invalid To location");
+ transferDbgValues(SDValue(From, i), SDValue(To, i));
+ }
+ // Preserve extra info.
+ copyExtraInfo(From, To);
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ RAUWUpdateListener Listener(*this, UI, UE);
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+ ++UI;
+ Use.setNode(To);
+ if (To->isDivergent() != From->isDivergent())
+ updateDivergence(User);
+ } while (UI != UE && *UI == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+
+ // If we just RAUW'd the root, take note.
+ if (From == getRoot().getNode())
+ setRoot(SDValue(To, getRoot().getResNo()));
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version can replace From with any result values. To must match the
+/// number and types of values returned by From.
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
+ if (From->getNumValues() == 1) // Handle the simple case efficiently.
+ return ReplaceAllUsesWith(SDValue(From, 0), To[0]);
+
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i) {
+ // Preserve Debug Info.
+ transferDbgValues(SDValue(From, i), To[i]);
+ // Preserve extra info.
+ copyExtraInfo(From, To[i].getNode());
+ }
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ RAUWUpdateListener Listener(*this, UI, UE);
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this happens the
+ // uses are usually next to each other in the list. To help reduce the
+ // number of CSE and divergence recomputations, process all the uses of this
+ // user that we can find this way.
+ bool To_IsDivergent = false;
+ do {
+ SDUse &Use = UI.getUse();
+ const SDValue &ToOp = To[Use.getResNo()];
+ ++UI;
+ Use.set(ToOp);
+ To_IsDivergent |= ToOp->isDivergent();
+ } while (UI != UE && *UI == User);
+
+ if (To_IsDivergent != From->isDivergent())
+ updateDivergence(User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+
+ // If we just RAUW'd the root, take note.
+ if (From == getRoot().getNode())
+ setRoot(SDValue(To[getRoot().getResNo()]));
+}
+
+/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone. The Deleted
+/// vector is handled the same way as for ReplaceAllUsesWith.
+void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
+ // Handle the really simple, really trivial case efficiently.
+ if (From == To) return;
+
+ // Handle the simple, trivial, case efficiently.
+ if (From.getNode()->getNumValues() == 1) {
+ ReplaceAllUsesWith(From, To);
+ return;
+ }
+
+ // Preserve Debug Info.
+ transferDbgValues(From, To);
+ copyExtraInfo(From.getNode(), To.getNode());
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From.getNode()->use_begin(),
+ UE = From.getNode()->use_end();
+ RAUWUpdateListener Listener(*this, UI, UE);
+ while (UI != UE) {
+ SDNode *User = *UI;
+ bool UserRemovedFromCSEMaps = false;
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+
+ // Skip uses of different values from the same node.
+ if (Use.getResNo() != From.getResNo()) {
+ ++UI;
+ continue;
+ }
+
+ // If this node hasn't been modified yet, it's still in the CSE maps,
+ // so remove its old self from the CSE maps.
+ if (!UserRemovedFromCSEMaps) {
+ RemoveNodeFromCSEMaps(User);
+ UserRemovedFromCSEMaps = true;
+ }
+
+ ++UI;
+ Use.set(To);
+ if (To->isDivergent() != From->isDivergent())
+ updateDivergence(User);
+ } while (UI != UE && *UI == User);
+ // We are iterating over all uses of the From node, so if a use
+ // doesn't use the specific value, no changes are made.
+ if (!UserRemovedFromCSEMaps)
+ continue;
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+
+ // If we just RAUW'd the root, take note.
+ if (From == getRoot())
+ setRoot(To);
+}
+
+namespace {
+
+/// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith
+/// to record information about a use.
+struct UseMemo {
+ SDNode *User;
+ unsigned Index;
+ SDUse *Use;
+};
+
+/// operator< - Sort Memos by User.
+bool operator<(const UseMemo &L, const UseMemo &R) {
+ return (intptr_t)L.User < (intptr_t)R.User;
+}
+
+/// RAUOVWUpdateListener - Helper for ReplaceAllUsesOfValuesWith - When the node
+/// pointed to by a UseMemo is deleted, set the User to nullptr to indicate that
+/// the node has already been taken care of recursively.
+class RAUOVWUpdateListener : public SelectionDAG::DAGUpdateListener {
+ SmallVector<UseMemo, 4> &Uses;
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ for (UseMemo &Memo : Uses)
+ if (Memo.User == N)
+ Memo.User = nullptr;
+ }
+
+public:
+ RAUOVWUpdateListener(SelectionDAG &d, SmallVector<UseMemo, 4> &uses)
+ : SelectionDAG::DAGUpdateListener(d), Uses(uses) {}
+};
+
+} // end anonymous namespace
+
+bool SelectionDAG::calculateDivergence(SDNode *N) {
+ if (TLI->isSDNodeAlwaysUniform(N)) {
+ assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) &&
+ "Conflicting divergence information!");
+ return false;
+ }
+ if (TLI->isSDNodeSourceOfDivergence(N, FLI, UA))
+ return true;
+ for (const auto &Op : N->ops()) {
+ if (Op.Val.getValueType() != MVT::Other && Op.getNode()->isDivergent())
+ return true;
+ }
+ return false;
+}
+
+void SelectionDAG::updateDivergence(SDNode *N) {
+ SmallVector<SDNode *, 16> Worklist(1, N);
+ do {
+ N = Worklist.pop_back_val();
+ bool IsDivergent = calculateDivergence(N);
+ if (N->SDNodeBits.IsDivergent != IsDivergent) {
+ N->SDNodeBits.IsDivergent = IsDivergent;
+ llvm::append_range(Worklist, N->uses());
+ }
+ } while (!Worklist.empty());
+}
+
+void SelectionDAG::CreateTopologicalOrder(std::vector<SDNode *> &Order) {
+ DenseMap<SDNode *, unsigned> Degree;
+ Order.reserve(AllNodes.size());
+ for (auto &N : allnodes()) {
+ unsigned NOps = N.getNumOperands();
+ Degree[&N] = NOps;
+ if (0 == NOps)
+ Order.push_back(&N);
+ }
+ for (size_t I = 0; I != Order.size(); ++I) {
+ SDNode *N = Order[I];
+ for (auto *U : N->uses()) {
+ unsigned &UnsortedOps = Degree[U];
+ if (0 == --UnsortedOps)
+ Order.push_back(U);
+ }
+ }
+}
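+
+// Illustrative sketch, not part of the upstream source: consumers that need
+// to visit every node only after all of its operands can request such an
+// order from the DAG (the routine above is a standard Kahn-style sort).
+[[maybe_unused]] static void exampleVisitTopologically(SelectionDAG &DAG) {
+ std::vector<SDNode *> Order;
+ DAG.CreateTopologicalOrder(Order);
+ for (SDNode *N : Order)
+ (void)N; // Visit N here; each operand of N appears earlier in Order.
+}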
+
+#ifndef NDEBUG
+void SelectionDAG::VerifyDAGDivergence() {
+ std::vector<SDNode *> TopoOrder;
+ CreateTopologicalOrder(TopoOrder);
+ for (auto *N : TopoOrder) {
+ assert(calculateDivergence(N) == N->isDivergent() &&
+ "Divergence bit inconsistency detected");
+ }
+}
+#endif
+
+/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone. The same value
+/// may appear in both the From and To list. The Deleted vector is
+/// handled the same way as for ReplaceAllUsesWith.
+void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
+ const SDValue *To,
+ unsigned Num){
+ // Handle the simple, trivial case efficiently.
+ if (Num == 1)
+ return ReplaceAllUsesOfValueWith(*From, *To);
+
+ transferDbgValues(*From, *To);
+ copyExtraInfo(From->getNode(), To->getNode());
+
+ // Record all the uses up front. This helps us process new uses that are
+ // introduced during the replacement process.
+ SmallVector<UseMemo, 4> Uses;
+ for (unsigned i = 0; i != Num; ++i) {
+ unsigned FromResNo = From[i].getResNo();
+ SDNode *FromNode = From[i].getNode();
+ for (SDNode::use_iterator UI = FromNode->use_begin(),
+ E = FromNode->use_end(); UI != E; ++UI) {
+ SDUse &Use = UI.getUse();
+ if (Use.getResNo() == FromResNo) {
+ UseMemo Memo = { *UI, i, &Use };
+ Uses.push_back(Memo);
+ }
+ }
+ }
+
+ // Sort the uses, so that all the uses from a given User are together.
+ llvm::sort(Uses);
+ RAUOVWUpdateListener Listener(*this, Uses);
+
+ for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
+ UseIndex != UseIndexEnd; ) {
+ // We know that this user uses some value of From. If it is the right
+ // value, update it.
+ SDNode *User = Uses[UseIndex].User;
+ // If the node has been deleted by recursive CSE updates when updating
+ // another node, then just skip this entry.
+ if (User == nullptr) {
+ ++UseIndex;
+ continue;
+ }
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // The Uses array is sorted, so all the uses for a given User
+ // are next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ unsigned i = Uses[UseIndex].Index;
+ SDUse &Use = *Uses[UseIndex].Use;
+ ++UseIndex;
+
+ Use.set(To[i]);
+ } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+}
+
+/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
+/// based on their topological order. It returns the number of nodes and
+/// leaves the node list itself sorted into that order.
+unsigned SelectionDAG::AssignTopologicalOrder() {
+ unsigned DAGSize = 0;
+
+ // SortedPos tracks the progress of the algorithm. Nodes before it are
+ // sorted, nodes after it are unsorted. When the algorithm completes
+ // it is at the end of the list.
+ allnodes_iterator SortedPos = allnodes_begin();
+
+ // Visit all the nodes. Move nodes with no operands to the front of
+ // the list immediately. Annotate nodes that do have operands with their
+ // operand count. Before we do this, the Node Id fields of the nodes
+ // may contain arbitrary values. After, the Node Id fields for nodes
+ // before SortedPos will contain the topological sort index, and the
+ // Node Id fields for nodes At SortedPos and after will contain the
+ // count of outstanding operands.
+ for (SDNode &N : llvm::make_early_inc_range(allnodes())) {
+ checkForCycles(&N, this);
+ unsigned Degree = N.getNumOperands();
+ if (Degree == 0) {
+ // A node with no operands, add it to the result array immediately.
+ N.setNodeId(DAGSize++);
+ allnodes_iterator Q(&N);
+ if (Q != SortedPos)
+ SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
+ assert(SortedPos != AllNodes.end() && "Overran node list");
+ ++SortedPos;
+ } else {
+ // Temporarily use the Node Id as scratch space for the degree count.
+ N.setNodeId(Degree);
+ }
+ }
+
+ // Visit all the nodes. As we iterate, move nodes into sorted order,
+ // such that by the time the end is reached all nodes will be sorted.
+ for (SDNode &Node : allnodes()) {
+ SDNode *N = &Node;
+ checkForCycles(N, this);
+ // N is in sorted position, so all its uses have one less operand
+ // that needs to be sorted.
+ for (SDNode *P : N->uses()) {
+ unsigned Degree = P->getNodeId();
+ assert(Degree != 0 && "Invalid node degree");
+ --Degree;
+ if (Degree == 0) {
+ // All of P's operands are sorted, so P may be sorted now.
+ P->setNodeId(DAGSize++);
+ if (P->getIterator() != SortedPos)
+ SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
+ assert(SortedPos != AllNodes.end() && "Overran node list");
+ ++SortedPos;
+ } else {
+ // Update P's outstanding operand count.
+ P->setNodeId(Degree);
+ }
+ }
+ if (Node.getIterator() == SortedPos) {
+#ifndef NDEBUG
+ allnodes_iterator I(N);
+ SDNode *S = &*++I;
+ dbgs() << "Overran sorted position:\n";
+ S->dumprFull(this); dbgs() << "\n";
+ dbgs() << "Checking if this is due to cycles\n";
+ checkForCycles(this, true);
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+
+ assert(SortedPos == AllNodes.end() &&
+ "Topological sort incomplete!");
+ assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
+ "First node in topological sort is not the entry token!");
+ assert(AllNodes.front().getNodeId() == 0 &&
+ "First node in topological sort has non-zero id!");
+ assert(AllNodes.front().getNumOperands() == 0 &&
+ "First node in topological sort has operands!");
+ assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
+ "Last node in topologic sort has unexpected id!");
+ assert(AllNodes.back().use_empty() &&
+ "Last node in topologic sort has users!");
+ assert(DAGSize == allnodes_size() && "Node count mismatch!");
+ return DAGSize;
+}
+
+/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
+/// value is produced by SD.
+void SelectionDAG::AddDbgValue(SDDbgValue *DB, bool isParameter) {
+ for (SDNode *SD : DB->getSDNodes()) {
+ if (!SD)
+ continue;
+ assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue());
+ SD->setHasDebugValue(true);
+ }
+ DbgInfo->add(DB, isParameter);
+}
+
+void SelectionDAG::AddDbgLabel(SDDbgLabel *DB) { DbgInfo->add(DB); }
+
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
+ SDValue NewMemOpChain) {
+ assert(isa<MemSDNode>(NewMemOpChain) && "Expected a memop node");
+ assert(NewMemOpChain.getValueType() == MVT::Other && "Expected a token VT");
+ // The new memory operation must have the same position as the old load in
+ // terms of memory dependency. Create a TokenFactor for the old load and new
+ // memory operation and update uses of the old load's output chain to use that
+ // TokenFactor.
+ if (OldChain == NewMemOpChain || OldChain.use_empty())
+ return NewMemOpChain;
+
+ SDValue TokenFactor = getNode(ISD::TokenFactor, SDLoc(OldChain), MVT::Other,
+ OldChain, NewMemOpChain);
+ ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
+ UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewMemOpChain);
+ return TokenFactor;
+}
+
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+ SDValue NewMemOp) {
+ assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
+ SDValue OldChain = SDValue(OldLoad, 1);
+ SDValue NewMemOpChain = NewMemOp.getValue(1);
+ return makeEquivalentMemoryOrdering(OldChain, NewMemOpChain);
+}
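+
+// Illustrative sketch, not part of the upstream source: after a combine
+// replaces OldLoad with a new memory operation NewMemOp, the old load's
+// chain users must be rewired so the memory ordering is preserved, and the
+// loaded value itself is replaced separately.
+[[maybe_unused]] static void exampleReplaceLoad(SelectionDAG &DAG,
+ LoadSDNode *OldLoad,
+ SDValue NewMemOp) {
+ SDValue NewChain = DAG.makeEquivalentMemoryOrdering(OldLoad, NewMemOp);
+ (void)NewChain;
+ DAG.ReplaceAllUsesOfValueWith(SDValue(OldLoad, 0), NewMemOp);
+}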
+
+SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op,
+ Function **OutFunction) {
+ assert(isa<ExternalSymbolSDNode>(Op) && "Node should be an ExternalSymbol");
+
+ auto *Symbol = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+ auto *Module = MF->getFunction().getParent();
+ auto *Function = Module->getFunction(Symbol);
+
+ if (OutFunction != nullptr)
+ *OutFunction = Function;
+
+ if (Function != nullptr) {
+ auto PtrTy = TLI->getPointerTy(getDataLayout(), Function->getAddressSpace());
+ return getGlobalAddress(Function, SDLoc(Op), PtrTy);
+ }
+
+ std::string ErrorStr;
+ raw_string_ostream ErrorFormatter(ErrorStr);
+ ErrorFormatter << "Undefined external symbol ";
+ ErrorFormatter << '"' << Symbol << '"';
+ report_fatal_error(Twine(ErrorFormatter.str()));
+}
+
+//===----------------------------------------------------------------------===//
+// SDNode Class
+//===----------------------------------------------------------------------===//
+
+bool llvm::isNullConstant(SDValue V) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isZero();
+}
+
+bool llvm::isNullFPConstant(SDValue V) {
+ ConstantFPSDNode *Const = dyn_cast<ConstantFPSDNode>(V);
+ return Const != nullptr && Const->isZero() && !Const->isNegative();
+}
+
+bool llvm::isAllOnesConstant(SDValue V) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isAllOnes();
+}
+
+bool llvm::isOneConstant(SDValue V) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isOne();
+}
+
+bool llvm::isMinSignedConstant(SDValue V) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isMinSignedValue();
+}
+
+bool llvm::isNeutralConstant(unsigned Opcode, SDNodeFlags Flags, SDValue V,
+ unsigned OperandNo) {
+ // NOTE: The cases should match with IR's ConstantExpr::getBinOpIdentity().
+ // TODO: Target-specific opcodes could be added.
+ if (auto *Const = isConstOrConstSplat(V)) {
+ switch (Opcode) {
+ case ISD::ADD:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::UMAX:
+ return Const->isZero();
+ case ISD::MUL:
+ return Const->isOne();
+ case ISD::AND:
+ case ISD::UMIN:
+ return Const->isAllOnes();
+ case ISD::SMAX:
+ return Const->isMinSignedValue();
+ case ISD::SMIN:
+ return Const->isMaxSignedValue();
+ case ISD::SUB:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ return OperandNo == 1 && Const->isZero();
+ case ISD::UDIV:
+ case ISD::SDIV:
+ return OperandNo == 1 && Const->isOne();
+ }
+ } else if (auto *ConstFP = isConstOrConstSplatFP(V)) {
+ switch (Opcode) {
+ case ISD::FADD:
+ return ConstFP->isZero() &&
+ (Flags.hasNoSignedZeros() || ConstFP->isNegative());
+ case ISD::FSUB:
+ return OperandNo == 1 && ConstFP->isZero() &&
+ (Flags.hasNoSignedZeros() || !ConstFP->isNegative());
+ case ISD::FMUL:
+ return ConstFP->isExactlyValue(1.0);
+ case ISD::FDIV:
+ return OperandNo == 1 && ConstFP->isExactlyValue(1.0);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: {
+ // Neutral element for fminnum is NaN, Inf or FLT_MAX, depending on FMF.
+ EVT VT = V.getValueType();
+ const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
+ APFloat NeutralAF = !Flags.hasNoNaNs()
+ ? APFloat::getQNaN(Semantics)
+ : !Flags.hasNoInfs()
+ ? APFloat::getInf(Semantics)
+ : APFloat::getLargest(Semantics);
+ if (Opcode == ISD::FMAXNUM)
+ NeutralAF.changeSign();
+
+ return ConstFP->isExactlyValue(NeutralAF);
+ }
+ }
+ }
+ return false;
+}
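+
+// Illustrative sketch, not part of the upstream source: a generic combine can
+// drop an operation whose second operand is the identity for that opcode,
+// i.e. fold 'x op identity' to 'x'. The helper is hypothetical and assumes
+// N is a binary operation; callers would check the opcode first.
+[[maybe_unused]] static SDValue exampleFoldIdentity(SDNode *N) {
+ SDValue X = N->getOperand(0), C = N->getOperand(1);
+ if (isNeutralConstant(N->getOpcode(), N->getFlags(), C, /*OperandNo=*/1))
+ return X;
+ return SDValue();
+}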
+
+SDValue llvm::peekThroughBitcasts(SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+ return V;
+}
+
+SDValue llvm::peekThroughOneUseBitcasts(SDValue V) {
+ while (V.getOpcode() == ISD::BITCAST && V.getOperand(0).hasOneUse())
+ V = V.getOperand(0);
+ return V;
+}
+
+SDValue llvm::peekThroughExtractSubvectors(SDValue V) {
+ while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ V = V.getOperand(0);
+ return V;
+}
+
+SDValue llvm::peekThroughTruncates(SDValue V) {
+ while (V.getOpcode() == ISD::TRUNCATE)
+ V = V.getOperand(0);
+ return V;
+}
+
+bool llvm::isBitwiseNot(SDValue V, bool AllowUndefs) {
+ if (V.getOpcode() != ISD::XOR)
+ return false;
+ V = peekThroughBitcasts(V.getOperand(1));
+ unsigned NumBits = V.getScalarValueSizeInBits();
+ ConstantSDNode *C =
+ isConstOrConstSplat(V, AllowUndefs, /*AllowTruncation*/ true);
+ return C && (C->getAPIntValue().countr_one() >= NumBits);
+}
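+
+// Illustrative sketch, not part of the upstream source: the predicate above
+// is typically used to recognize 'not X', i.e. (xor X, all-ones), when
+// matching and-not or or-not style patterns in combines.
+[[maybe_unused]] static bool exampleIsNotOf(SDValue V, SDValue X) {
+ return isBitwiseNot(V, /*AllowUndefs=*/false) && V.getOperand(0) == X;
+}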
+
+ConstantSDNode *llvm::isConstOrConstSplat(SDValue N, bool AllowUndefs,
+ bool AllowTruncation) {
+ EVT VT = N.getValueType();
+ APInt DemandedElts = VT.isFixedLengthVector()
+ ? APInt::getAllOnes(VT.getVectorMinNumElements())
+ : APInt(1, 1);
+ return isConstOrConstSplat(N, DemandedElts, AllowUndefs, AllowTruncation);
+}
+
+ConstantSDNode *llvm::isConstOrConstSplat(SDValue N, const APInt &DemandedElts,
+ bool AllowUndefs,
+ bool AllowTruncation) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N))
+ return CN;
+
+ // SplatVectors can truncate their operands. Ignore that case here unless
+ // AllowTruncation is set.
+ if (N->getOpcode() == ISD::SPLAT_VECTOR) {
+ EVT VecEltVT = N->getValueType(0).getVectorElementType();
+ if (auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+ EVT CVT = CN->getValueType(0);
+ assert(CVT.bitsGE(VecEltVT) && "Illegal splat_vector element extension");
+ if (AllowTruncation || CVT == VecEltVT)
+ return CN;
+ }
+ }
+
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
+ BitVector UndefElements;
+ ConstantSDNode *CN = BV->getConstantSplatNode(DemandedElts, &UndefElements);
+
+ // BuildVectors can truncate their operands. Ignore that case here unless
+ // AllowTruncation is set.
+ // TODO: Look into whether we should allow UndefElements in non-DemandedElts
+ if (CN && (UndefElements.none() || AllowUndefs)) {
+ EVT CVT = CN->getValueType(0);
+ EVT NSVT = N.getValueType().getScalarType();
+ assert(CVT.bitsGE(NSVT) && "Illegal build vector element extension");
+ if (AllowTruncation || (CVT == NSVT))
+ return CN;
+ }
+ }
+
+ return nullptr;
+}
+
+ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N, bool AllowUndefs) {
+ EVT VT = N.getValueType();
+ APInt DemandedElts = VT.isFixedLengthVector()
+ ? APInt::getAllOnes(VT.getVectorMinNumElements())
+ : APInt(1, 1);
+ return isConstOrConstSplatFP(N, DemandedElts, AllowUndefs);
+}
+
+ConstantFPSDNode *llvm::isConstOrConstSplatFP(SDValue N,
+ const APInt &DemandedElts,
+ bool AllowUndefs) {
+ if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
+ return CN;
+
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
+ BitVector UndefElements;
+ ConstantFPSDNode *CN =
+ BV->getConstantFPSplatNode(DemandedElts, &UndefElements);
+ // TODO: Look into whether we should allow UndefElements in non-DemandedElts
+ if (CN && (UndefElements.none() || AllowUndefs))
+ return CN;
+ }
+
+ if (N.getOpcode() == ISD::SPLAT_VECTOR)
+ if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N.getOperand(0)))
+ return CN;
+
+ return nullptr;
+}
+
+bool llvm::isNullOrNullSplat(SDValue N, bool AllowUndefs) {
+ // TODO: may want to use peekThroughBitcast() here.
+ ConstantSDNode *C =
+ isConstOrConstSplat(N, AllowUndefs, /*AllowTruncation=*/true);
+ return C && C->isZero();
+}
+
+bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) {
+ ConstantSDNode *C =
+ isConstOrConstSplat(N, AllowUndefs, /*AllowTruncation*/ true);
+ return C && C->isOne();
+}
+
+bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) {
+ N = peekThroughBitcasts(N);
+ unsigned BitWidth = N.getScalarValueSizeInBits();
+ ConstantSDNode *C = isConstOrConstSplat(N, AllowUndefs);
+ return C && C->isAllOnes() && C->getValueSizeInBits(0) == BitWidth;
+}
+
+HandleSDNode::~HandleSDNode() {
+ DropOperands();
+}
+
+GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
+ const DebugLoc &DL,
+ const GlobalValue *GA, EVT VT,
+ int64_t o, unsigned TF)
+ : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
+ TheGlobal = GA;
+}
+
+AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
+ EVT VT, unsigned SrcAS,
+ unsigned DestAS)
+ : SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
+ SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
+
+MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
+ : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
+ MemSDNodeBits.IsVolatile = MMO->isVolatile();
+ MemSDNodeBits.IsNonTemporal = MMO->isNonTemporal();
+ MemSDNodeBits.IsDereferenceable = MMO->isDereferenceable();
+ MemSDNodeBits.IsInvariant = MMO->isInvariant();
+
+ // We check here that the size of the memory operand fits within the size of
+ // the MMO. This is because the MMO might indicate only a possible address
+ // range instead of specifying the affected memory addresses precisely.
+ // TODO: Make MachineMemOperands aware of scalable vectors.
+ assert(memvt.getStoreSize().getKnownMinValue() <= MMO->getSize() &&
+ "Size mismatch!");
+}
+
+/// Profile - Gather unique data for the node.
+///
+void SDNode::Profile(FoldingSetNodeID &ID) const {
+ AddNodeIDNode(ID, this);
+}
+
+namespace {
+
+ struct EVTArray {
+ std::vector<EVT> VTs;
+
+ EVTArray() {
+ VTs.reserve(MVT::VALUETYPE_SIZE);
+ for (unsigned i = 0; i < MVT::VALUETYPE_SIZE; ++i)
+ VTs.push_back(MVT((MVT::SimpleValueType)i));
+ }
+ };
+
+} // end anonymous namespace
+
+/// getValueTypeList - Return a pointer to the specified value type.
+///
+const EVT *SDNode::getValueTypeList(EVT VT) {
+ static std::set<EVT, EVT::compareRawBits> EVTs;
+ static EVTArray SimpleVTArray;
+ static sys::SmartMutex<true> VTMutex;
+
+ if (VT.isExtended()) {
+ sys::SmartScopedLock<true> Lock(VTMutex);
+ return &(*EVTs.insert(VT).first);
+ }
+ assert(VT.getSimpleVT() < MVT::VALUETYPE_SIZE && "Value type out of range!");
+ return &SimpleVTArray.VTs[VT.getSimpleVT().SimpleTy];
+}
+
+/// hasNUsesOfValue - Return true if there are exactly NUSES uses of the
+/// indicated value. This method ignores uses of other values defined by this
+/// operation.
+bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
+ assert(Value < getNumValues() && "Bad value!");
+
+ // TODO: Only iterate over uses of a given value of the node
+ for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+ if (UI.getUse().getResNo() == Value) {
+ if (NUses == 0)
+ return false;
+ --NUses;
+ }
+ }
+
+ // Found exactly the right number of uses?
+ return NUses == 0;
+}
+
+/// hasAnyUseOfValue - Return true if there are any use of the indicated
+/// value. This method ignores uses of other values defined by this operation.
+bool SDNode::hasAnyUseOfValue(unsigned Value) const {
+ assert(Value < getNumValues() && "Bad value!");
+
+ for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI)
+ if (UI.getUse().getResNo() == Value)
+ return true;
+
+ return false;
+}
+
+/// isOnlyUserOf - Return true if this node is the only use of N.
+bool SDNode::isOnlyUserOf(const SDNode *N) const {
+ bool Seen = false;
+ for (const SDNode *User : N->uses()) {
+ if (User == this)
+ Seen = true;
+ else
+ return false;
+ }
+
+ return Seen;
+}
+
+/// Return true if the only users of N are contained in Nodes.
+bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
+ bool Seen = false;
+ for (const SDNode *User : N->uses()) {
+ if (llvm::is_contained(Nodes, User))
+ Seen = true;
+ else
+ return false;
+ }
+
+ return Seen;
+}
+
+/// isOperand - Return true if this node is an operand of N.
+bool SDValue::isOperandOf(const SDNode *N) const {
+ return is_contained(N->op_values(), *this);
+}
+
+bool SDNode::isOperandOf(const SDNode *N) const {
+ return any_of(N->op_values(),
+ [this](SDValue Op) { return this == Op.getNode(); });
+}
+
+/// reachesChainWithoutSideEffects - Return true if this operand (which must
+/// be a chain) reaches the specified operand without crossing any
+/// side-effecting instructions on any chain path. In practice, this looks
+/// through token factors and non-volatile loads. In order to remain efficient,
+/// this only looks a couple of nodes in; it does not do an exhaustive search.
+///
+/// Note that we only need to examine chains when we're searching for
+/// side-effects; SelectionDAG requires that all side-effects are represented
+/// by chains, even if another operand would force a specific ordering. This
+/// constraint is necessary to allow transformations like splitting loads.
+bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
+ unsigned Depth) const {
+ if (*this == Dest) return true;
+
+ // Don't search too deeply, we just want to be able to see through
+ // TokenFactor's etc.
+ if (Depth == 0) return false;
+
+ // If this is a token factor, all inputs to the TF happen in parallel.
+ if (getOpcode() == ISD::TokenFactor) {
+ // First, try a shallow search.
+ if (is_contained((*this)->ops(), Dest)) {
+ // We found the chain we want as an operand of this TokenFactor.
+ // Essentially, we reach the chain without side-effects if we could
+ // serialize the TokenFactor into a simple chain of operations with
+ // Dest as the last operation. This is automatically true if the
+ // chain has one use: there are no other ordering constraints.
+ // If the chain has more than one use, we give up: some other
+ // use of Dest might force a side-effect between Dest and the current
+ // node.
+ if (Dest.hasOneUse())
+ return true;
+ }
+ // Next, try a deep search: check whether every operand of the TokenFactor
+ // reaches Dest.
+ return llvm::all_of((*this)->ops(), [=](SDValue Op) {
+ return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
+ });
+ }
+
+ // Loads don't have side effects, look through them.
+ if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
+ if (Ld->isUnordered())
+ return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
+ }
+ return false;
+}
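+
+// Illustrative sketch, not part of the upstream source: a combine that wants
+// to fold a load into another operation can use the query above to make sure
+// no side-effecting operation may be ordered between the load's chain result
+// and the candidate chain. The default search depth keeps this cheap.
+[[maybe_unused]] static bool exampleSafeToFoldLoad(SDValue CandidateChain,
+ LoadSDNode *Load) {
+ return CandidateChain.reachesChainWithoutSideEffects(SDValue(Load, 1));
+}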
+
+bool SDNode::hasPredecessor(const SDNode *N) const {
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(this);
+ return hasPredecessorHelper(N, Visited, Worklist);
+}
+
+void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
+ this->Flags.intersectWith(Flags);
+}
+
+SDValue
+SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
+ ArrayRef<ISD::NodeType> CandidateBinOps,
+ bool AllowPartials) {
+ // The pattern must end in an extract from index 0.
+ if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isNullConstant(Extract->getOperand(1)))
+ return SDValue();
+
+ // Match against one of the candidate binary ops.
+ SDValue Op = Extract->getOperand(0);
+ if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
+ return Op.getOpcode() == unsigned(BinOp);
+ }))
+ return SDValue();
+
+ // Floating-point reductions may require relaxed constraints on the final step
+ // of the reduction because they may reorder intermediate operations.
+ unsigned CandidateBinOp = Op.getOpcode();
+ if (Op.getValueType().isFloatingPoint()) {
+ SDNodeFlags Flags = Op->getFlags();
+ switch (CandidateBinOp) {
+ case ISD::FADD:
+ if (!Flags.hasNoSignedZeros() || !Flags.hasAllowReassociation())
+ return SDValue();
+ break;
+ default:
+ llvm_unreachable("Unhandled FP opcode for binop reduction");
+ }
+ }
+
+ // Matching failed - attempt to see if we did enough stages that a partial
+ // reduction from a subvector is possible.
+ auto PartialReduction = [&](SDValue Op, unsigned NumSubElts) {
+ if (!AllowPartials || !Op)
+ return SDValue();
+ EVT OpVT = Op.getValueType();
+ EVT OpSVT = OpVT.getScalarType();
+ EVT SubVT = EVT::getVectorVT(*getContext(), OpSVT, NumSubElts);
+ if (!TLI->isExtractSubvectorCheap(SubVT, OpVT, 0))
+ return SDValue();
+ BinOp = (ISD::NodeType)CandidateBinOp;
+ return getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Op), SubVT, Op,
+ getVectorIdxConstant(0, SDLoc(Op)));
+ };
+
+ // At each stage, we're looking for something that looks like:
+ // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
+ // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
+ // i32 undef, i32 undef, i32 undef, i32 undef>
+ // %a = binop <8 x i32> %op, %s
+ // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
+ // we expect something like:
+ // <4,5,6,7,u,u,u,u>
+ // <2,3,u,u,u,u,u,u>
+ // <1,u,u,u,u,u,u,u>
+ // While a partial reduction match would be:
+ // <2,3,u,u,u,u,u,u>
+ // <1,u,u,u,u,u,u,u>
+ unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
+ SDValue PrevOp;
+ for (unsigned i = 0; i < Stages; ++i) {
+ unsigned MaskEnd = (1 << i);
+
+ if (Op.getOpcode() != CandidateBinOp)
+ return PartialReduction(PrevOp, MaskEnd);
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Op0);
+ if (Shuffle) {
+ Op = Op1;
+ } else {
+ Shuffle = dyn_cast<ShuffleVectorSDNode>(Op1);
+ Op = Op0;
+ }
+
+ // The first operand of the shuffle should be the same as the other operand
+ // of the binop.
+ if (!Shuffle || Shuffle->getOperand(0) != Op)
+ return PartialReduction(PrevOp, MaskEnd);
+
+ // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+ for (int Index = 0; Index < (int)MaskEnd; ++Index)
+ if (Shuffle->getMaskElt(Index) != (int)(MaskEnd + Index))
+ return PartialReduction(PrevOp, MaskEnd);
+
+ PrevOp = Op;
+ }
+
+ // Handle subvector reductions, which tend to appear after the shuffle
+ // reduction stages.
+ while (Op.getOpcode() == CandidateBinOp) {
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Op0.getOperand(0) != Op1.getOperand(0))
+ break;
+ SDValue Src = Op0.getOperand(0);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ if (NumSrcElts != (2 * NumElts))
+ break;
+ if (!(Op0.getConstantOperandAPInt(1) == 0 &&
+ Op1.getConstantOperandAPInt(1) == NumElts) &&
+ !(Op1.getConstantOperandAPInt(1) == 0 &&
+ Op0.getConstantOperandAPInt(1) == NumElts))
+ break;
+ Op = Src;
+ }
+
+ BinOp = (ISD::NodeType)CandidateBinOp;
+ return Op;
+}
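+
+// Illustrative sketch, not part of the upstream source: target lowering code
+// that recognizes a horizontal integer-add reduction feeding an
+// EXTRACT_VECTOR_ELT of element 0. 'Extract' is assumed to be that extract.
+[[maybe_unused]] static SDValue exampleMatchAddReduction(SelectionDAG &DAG,
+ SDNode *Extract) {
+ ISD::NodeType BinOp;
+ // On success the result is the vector being reduced and BinOp the matched
+ // opcode; on failure an empty SDValue is returned.
+ return DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD},
+ /*AllowPartials=*/false);
+}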
+
+SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NE = VT.getVectorNumElements();
+
+ SDLoc dl(N);
+
+ // If ResNE is 0, fully unroll the vector op.
+ if (ResNE == 0)
+ ResNE = NE;
+ else if (NE > ResNE)
+ NE = ResNE;
+
+ if (N->getNumValues() == 2) {
+ SmallVector<SDValue, 8> Scalars0, Scalars1;
+ SmallVector<SDValue, 4> Operands(N->getNumOperands());
+ EVT VT1 = N->getValueType(1);
+ EVT EltVT1 = VT1.getVectorElementType();
+
+ unsigned i;
+ for (i = 0; i != NE; ++i) {
+ for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) {
+ SDValue Operand = N->getOperand(j);
+ EVT OperandVT = Operand.getValueType();
+
+ // A vector operand; extract a single element.
+ EVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT,
+ Operand, getVectorIdxConstant(i, dl));
+ }
+
+ SDValue EltOp = getNode(N->getOpcode(), dl, {EltVT, EltVT1}, Operands);
+ Scalars0.push_back(EltOp);
+ Scalars1.push_back(EltOp.getValue(1));
+ }
+
+ SDValue Vec0 = getBuildVector(VT, dl, Scalars0);
+ SDValue Vec1 = getBuildVector(VT1, dl, Scalars1);
+ return getMergeValues({Vec0, Vec1}, dl);
+ }
+
+ assert(N->getNumValues() == 1 &&
+ "Can't unroll a vector with multiple results!");
+
+ SmallVector<SDValue, 8> Scalars;
+ SmallVector<SDValue, 4> Operands(N->getNumOperands());
+
+ unsigned i;
+  for (i = 0; i != NE; ++i) {
+ for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) {
+ SDValue Operand = N->getOperand(j);
+ EVT OperandVT = Operand.getValueType();
+ if (OperandVT.isVector()) {
+ // A vector operand; extract a single element.
+ EVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT,
+ Operand, getVectorIdxConstant(i, dl));
+ } else {
+ // A scalar operand; just use it as is.
+ Operands[j] = Operand;
+ }
+ }
+
+ switch (N->getOpcode()) {
+ default: {
+ Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands,
+ N->getFlags()));
+ break;
+ }
+ case ISD::VSELECT:
+ Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands));
+ break;
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
+ getShiftAmountOperand(Operands[0].getValueType(),
+ Operands[1])));
+ break;
+ case ISD::SIGN_EXTEND_INREG: {
+ EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
+ Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
+ Operands[0],
+ getValueType(ExtVT)));
+ }
+ }
+ }
+
+ for (; i < ResNE; ++i)
+ Scalars.push_back(getUNDEF(EltVT));
+
+ EVT VecVT = EVT::getVectorVT(*getContext(), EltVT, ResNE);
+ return getBuildVector(VecVT, dl, Scalars);
+}
+
+std::pair<SDValue, SDValue> SelectionDAG::UnrollVectorOverflowOp(
+ SDNode *N, unsigned ResNE) {
+ unsigned Opcode = N->getOpcode();
+ assert((Opcode == ISD::UADDO || Opcode == ISD::SADDO ||
+ Opcode == ISD::USUBO || Opcode == ISD::SSUBO ||
+ Opcode == ISD::UMULO || Opcode == ISD::SMULO) &&
+ "Expected an overflow opcode");
+
+ EVT ResVT = N->getValueType(0);
+ EVT OvVT = N->getValueType(1);
+ EVT ResEltVT = ResVT.getVectorElementType();
+ EVT OvEltVT = OvVT.getVectorElementType();
+ SDLoc dl(N);
+
+ // If ResNE is 0, fully unroll the vector op.
+ unsigned NE = ResVT.getVectorNumElements();
+ if (ResNE == 0)
+ ResNE = NE;
+ else if (NE > ResNE)
+ NE = ResNE;
+
+ SmallVector<SDValue, 8> LHSScalars;
+ SmallVector<SDValue, 8> RHSScalars;
+ ExtractVectorElements(N->getOperand(0), LHSScalars, 0, NE);
+ ExtractVectorElements(N->getOperand(1), RHSScalars, 0, NE);
+
+ EVT SVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), ResEltVT);
+ SDVTList VTs = getVTList(ResEltVT, SVT);
+ SmallVector<SDValue, 8> ResScalars;
+ SmallVector<SDValue, 8> OvScalars;
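+  // The scalar overflow flag is produced in the target's setcc result type
+  // (SVT), which may differ from OvEltVT; the select below normalizes it to
+  // the boolean "true" value expected for the overflow vector (1 or all-ones,
+  // per the target's boolean contents) or zero.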
+ for (unsigned i = 0; i < NE; ++i) {
+ SDValue Res = getNode(Opcode, dl, VTs, LHSScalars[i], RHSScalars[i]);
+ SDValue Ov =
+ getSelect(dl, OvEltVT, Res.getValue(1),
+ getBoolConstant(true, dl, OvEltVT, ResVT),
+ getConstant(0, dl, OvEltVT));
+
+ ResScalars.push_back(Res);
+ OvScalars.push_back(Ov);
+ }
+
+ ResScalars.append(ResNE - NE, getUNDEF(ResEltVT));
+ OvScalars.append(ResNE - NE, getUNDEF(OvEltVT));
+
+ EVT NewResVT = EVT::getVectorVT(*getContext(), ResEltVT, ResNE);
+ EVT NewOvVT = EVT::getVectorVT(*getContext(), OvEltVT, ResNE);
+ return std::make_pair(getBuildVector(NewResVT, dl, ResScalars),
+ getBuildVector(NewOvVT, dl, OvScalars));
+}
+
+bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
+ LoadSDNode *Base,
+ unsigned Bytes,
+ int Dist) const {
+ if (LD->isVolatile() || Base->isVolatile())
+ return false;
+ // TODO: probably too restrictive for atomics, revisit
+ if (!LD->isSimple())
+ return false;
+ if (LD->isIndexed() || Base->isIndexed())
+ return false;
+ if (LD->getChain() != Base->getChain())
+ return false;
+ EVT VT = LD->getMemoryVT();
+ if (VT.getSizeInBits() / 8 != Bytes)
+ return false;
+
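+  // Dist is measured in units of Bytes: e.g. with 4-byte loads, Dist == 1
+  // means LD must start exactly 4 bytes after Base.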
+ auto BaseLocDecomp = BaseIndexOffset::match(Base, *this);
+ auto LocDecomp = BaseIndexOffset::match(LD, *this);
+
+ int64_t Offset = 0;
+ if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
+ return (Dist * (int64_t)Bytes == Offset);
+ return false;
+}
+
+/// InferPtrAlign - Infer alignment of a load / store address. Return
+/// std::nullopt if it cannot be inferred.
+MaybeAlign SelectionDAG::InferPtrAlign(SDValue Ptr) const {
+ // If this is a GlobalAddress + cst, return the alignment.
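+  // E.g. if the global is known to be at least 16-byte aligned (four trailing
+  // zero bits in its address) and GVOffset is 4, the inferred alignment is 4.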
+ const GlobalValue *GV = nullptr;
+ int64_t GVOffset = 0;
+ if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
+ unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
+ KnownBits Known(PtrWidth);
+ llvm::computeKnownBits(GV, Known, getDataLayout());
+ unsigned AlignBits = Known.countMinTrailingZeros();
+ if (AlignBits)
+ return commonAlignment(Align(1ull << std::min(31U, AlignBits)), GVOffset);
+ }
+
+ // If this is a direct reference to a stack slot, use information about the
+ // stack slot's alignment.
+ int FrameIdx = INT_MIN;
+ int64_t FrameOffset = 0;
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
+ FrameIdx = FI->getIndex();
+ } else if (isBaseWithConstantOffset(Ptr) &&
+ isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+ // Handle FI+Cst
+ FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ FrameOffset = Ptr.getConstantOperandVal(1);
+ }
+
+ if (FrameIdx != INT_MIN) {
+ const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
+ return commonAlignment(MFI.getObjectAlign(FrameIdx), FrameOffset);
+ }
+
+ return std::nullopt;
+}
+
+/// Split the scalar node with EXTRACT_ELEMENT using the provided
+/// VTs and return the low/high part.
+std::pair<SDValue, SDValue> SelectionDAG::SplitScalar(const SDValue &N,
+ const SDLoc &DL,
+ const EVT &LoVT,
+ const EVT &HiVT) {
+ assert(!LoVT.isVector() && !HiVT.isVector() && !N.getValueType().isVector() &&
+ "Split node must be a scalar type");
+ SDValue Lo =
+ getNode(ISD::EXTRACT_ELEMENT, DL, LoVT, N, getIntPtrConstant(0, DL));
+ SDValue Hi =
+ getNode(ISD::EXTRACT_ELEMENT, DL, HiVT, N, getIntPtrConstant(1, DL));
+ return std::make_pair(Lo, Hi);
+}
+
+/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+/// which is split (or expanded) into two not necessarily identical pieces.
+std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
+ // Currently all types are split in half.
+ EVT LoVT, HiVT;
+ if (!VT.isVector())
+ LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
+ else
+ LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext());
+
+ return std::make_pair(LoVT, HiVT);
+}
+
+/// GetDependentSplitDestVTs - Compute the VTs needed for the low/hi parts of a
+/// type, dependent on an enveloping VT that has been split into two identical
+/// pieces. Sets the HiIsEmpty flag when hi type has zero storage size.
+std::pair<EVT, EVT>
+SelectionDAG::GetDependentSplitDestVTs(const EVT &VT, const EVT &EnvVT,
+ bool *HiIsEmpty) const {
+ EVT EltTp = VT.getVectorElementType();
+ // Examples:
+ // custom VL=8 with enveloping VL=8/8 yields 8/0 (hi empty)
+ // custom VL=9 with enveloping VL=8/8 yields 8/1
+ // custom VL=10 with enveloping VL=8/8 yields 8/2
+ // etc.
+ ElementCount VTNumElts = VT.getVectorElementCount();
+ ElementCount EnvNumElts = EnvVT.getVectorElementCount();
+ assert(VTNumElts.isScalable() == EnvNumElts.isScalable() &&
+ "Mixing fixed width and scalable vectors when enveloping a type");
+ EVT LoVT, HiVT;
+ if (VTNumElts.getKnownMinValue() > EnvNumElts.getKnownMinValue()) {
+ LoVT = EVT::getVectorVT(*getContext(), EltTp, EnvNumElts);
+ HiVT = EVT::getVectorVT(*getContext(), EltTp, VTNumElts - EnvNumElts);
+ *HiIsEmpty = false;
+ } else {
+    // Flag that hi type has zero storage size, but return split envelope type
+ // (this would be easier if vector types with zero elements were allowed).
+ LoVT = EVT::getVectorVT(*getContext(), EltTp, VTNumElts);
+ HiVT = EVT::getVectorVT(*getContext(), EltTp, EnvNumElts);
+ *HiIsEmpty = true;
+ }
+ return std::make_pair(LoVT, HiVT);
+}
+
+/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
+/// low/high part.
+std::pair<SDValue, SDValue>
+SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
+ const EVT &HiVT) {
+ assert(LoVT.isScalableVector() == HiVT.isScalableVector() &&
+ LoVT.isScalableVector() == N.getValueType().isScalableVector() &&
+ "Splitting vector with an invalid mixture of fixed and scalable "
+ "vector types");
+ assert(LoVT.getVectorMinNumElements() + HiVT.getVectorMinNumElements() <=
+ N.getValueType().getVectorMinNumElements() &&
+ "More vector elements requested than available!");
+ SDValue Lo, Hi;
+ Lo =
+ getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, getVectorIdxConstant(0, DL));
+ // For scalable vectors it is safe to use LoVT.getVectorMinNumElements()
+ // (rather than having to use ElementCount), because EXTRACT_SUBVECTOR scales
+ // IDX with the runtime scaling factor of the result vector type. For
+ // fixed-width result vectors, that runtime scaling factor is 1.
+ Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
+ getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
+ return std::make_pair(Lo, Hi);
+}
+
+std::pair<SDValue, SDValue> SelectionDAG::SplitEVL(SDValue N, EVT VecVT,
+ const SDLoc &DL) {
+ // Split the vector length parameter.
+  // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl, %halfnumelts).
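+  // E.g. splitting the EVL for a v8i32 vector with %evl == 5 gives a low EVL
+  // of umin(5, 4) == 4 and a high EVL of usubsat(5, 4) == 1.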
+ EVT VT = N.getValueType();
+ assert(VecVT.getVectorElementCount().isKnownEven() &&
+ "Expecting the mask to be an evenly-sized vector");
+ unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2;
+ SDValue HalfNumElts =
+ VecVT.isFixedLengthVector()
+ ? getConstant(HalfMinNumElts, DL, VT)
+ : getVScale(DL, VT, APInt(VT.getScalarSizeInBits(), HalfMinNumElts));
+ SDValue Lo = getNode(ISD::UMIN, DL, VT, N, HalfNumElts);
+ SDValue Hi = getNode(ISD::USUBSAT, DL, VT, N, HalfNumElts);
+ return std::make_pair(Lo, Hi);
+}
+
+/// Widen the vector up to the next power of two using INSERT_SUBVECTOR.
+SDValue SelectionDAG::WidenVector(const SDValue &N, const SDLoc &DL) {
+ EVT VT = N.getValueType();
+ EVT WideVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(),
+ NextPowerOf2(VT.getVectorNumElements()));
+ return getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, getUNDEF(WideVT), N,
+ getVectorIdxConstant(0, DL));
+}
+
+void SelectionDAG::ExtractVectorElements(SDValue Op,
+ SmallVectorImpl<SDValue> &Args,
+ unsigned Start, unsigned Count,
+ EVT EltVT) {
+ EVT VT = Op.getValueType();
+ if (Count == 0)
+ Count = VT.getVectorNumElements();
+ if (EltVT == EVT())
+ EltVT = VT.getVectorElementType();
+ SDLoc SL(Op);
+ for (unsigned i = Start, e = Start + Count; i != e; ++i) {
+ Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Op,
+ getVectorIdxConstant(i, SL)));
+ }
+}
+
+// getAddressSpace - Return the address space this GlobalAddress belongs to.
+unsigned GlobalAddressSDNode::getAddressSpace() const {
+ return getGlobal()->getType()->getAddressSpace();
+}
+
+Type *ConstantPoolSDNode::getType() const {
+ if (isMachineConstantPoolEntry())
+ return Val.MachineCPVal->getType();
+ return Val.ConstVal->getType();
+}
+
+bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
+ unsigned &SplatBitSize,
+ bool &HasAnyUndefs,
+ unsigned MinSplatBits,
+ bool IsBigEndian) const {
+ EVT VT = getValueType(0);
+ assert(VT.isVector() && "Expected a vector type");
+ unsigned VecWidth = VT.getSizeInBits();
+ if (MinSplatBits > VecWidth)
+ return false;
+
+ // FIXME: The widths are based on this node's type, but build vectors can
+ // truncate their operands.
+ SplatValue = APInt(VecWidth, 0);
+ SplatUndef = APInt(VecWidth, 0);
+
+ // Get the bits. Bits with undefined values (when the corresponding element
+ // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
+ // in SplatValue. If any of the values are not constant, give up and return
+ // false.
+ unsigned int NumOps = getNumOperands();
+ assert(NumOps > 0 && "isConstantSplat has 0-size build vector");
+ unsigned EltWidth = VT.getScalarSizeInBits();
+
+ for (unsigned j = 0; j < NumOps; ++j) {
+ unsigned i = IsBigEndian ? NumOps - 1 - j : j;
+ SDValue OpVal = getOperand(i);
+ unsigned BitPos = j * EltWidth;
+
+ if (OpVal.isUndef())
+ SplatUndef.setBits(BitPos, BitPos + EltWidth);
+ else if (auto *CN = dyn_cast<ConstantSDNode>(OpVal))
+ SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
+ else if (auto *CN = dyn_cast<ConstantFPSDNode>(OpVal))
+ SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos);
+ else
+ return false;
+ }
+
+ // The build_vector is all constants or undefs. Find the smallest element
+ // size that splats the vector.
+ HasAnyUndefs = (SplatUndef != 0);
+
+ // FIXME: This does not work for vectors with elements less than 8 bits.
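+  // E.g. a v4i8 build vector of <1, 1, 1, 1> starts with a 32-bit splat value
+  // of 0x01010101; both 16-bit halves match, as do the 8-bit halves of those,
+  // so the reported SplatBitSize is 8.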
+ while (VecWidth > 8) {
+ unsigned HalfSize = VecWidth / 2;
+ APInt HighValue = SplatValue.extractBits(HalfSize, HalfSize);
+ APInt LowValue = SplatValue.extractBits(HalfSize, 0);
+ APInt HighUndef = SplatUndef.extractBits(HalfSize, HalfSize);
+ APInt LowUndef = SplatUndef.extractBits(HalfSize, 0);
+
+ // If the two halves do not match (ignoring undef bits), stop here.
+ if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
+ MinSplatBits > HalfSize)
+ break;
+
+ SplatValue = HighValue | LowValue;
+ SplatUndef = HighUndef & LowUndef;
+
+ VecWidth = HalfSize;
+ }
+
+ SplatBitSize = VecWidth;
+ return true;
+}
+
+SDValue BuildVectorSDNode::getSplatValue(const APInt &DemandedElts,
+ BitVector *UndefElements) const {
+ unsigned NumOps = getNumOperands();
+ if (UndefElements) {
+ UndefElements->clear();
+ UndefElements->resize(NumOps);
+ }
+ assert(NumOps == DemandedElts.getBitWidth() && "Unexpected vector size");
+ if (!DemandedElts)
+ return SDValue();
+ SDValue Splatted;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ SDValue Op = getOperand(i);
+ if (Op.isUndef()) {
+ if (UndefElements)
+ (*UndefElements)[i] = true;
+ } else if (!Splatted) {
+ Splatted = Op;
+ } else if (Splatted != Op) {
+ return SDValue();
+ }
+ }
+
+ if (!Splatted) {
+ unsigned FirstDemandedIdx = DemandedElts.countr_zero();
+ assert(getOperand(FirstDemandedIdx).isUndef() &&
+ "Can only have a splat without a constant for all undefs.");
+ return getOperand(FirstDemandedIdx);
+ }
+
+ return Splatted;
+}
+
+SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
+ APInt DemandedElts = APInt::getAllOnes(getNumOperands());
+ return getSplatValue(DemandedElts, UndefElements);
+}
+
+bool BuildVectorSDNode::getRepeatedSequence(const APInt &DemandedElts,
+ SmallVectorImpl<SDValue> &Sequence,
+ BitVector *UndefElements) const {
+ unsigned NumOps = getNumOperands();
+ Sequence.clear();
+ if (UndefElements) {
+ UndefElements->clear();
+ UndefElements->resize(NumOps);
+ }
+ assert(NumOps == DemandedElts.getBitWidth() && "Unexpected vector size");
+ if (!DemandedElts || NumOps < 2 || !isPowerOf2_32(NumOps))
+ return false;
+
+ // Set the undefs even if we don't find a sequence (like getSplatValue).
+ if (UndefElements)
+ for (unsigned I = 0; I != NumOps; ++I)
+ if (DemandedElts[I] && getOperand(I).isUndef())
+ (*UndefElements)[I] = true;
+
+ // Iteratively widen the sequence length looking for repetitions.
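+  // E.g. a build vector of <a, b, a, b> fails at SeqLen == 1 (a != b) but
+  // succeeds at SeqLen == 2, returning the sequence {a, b}.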
+ for (unsigned SeqLen = 1; SeqLen < NumOps; SeqLen *= 2) {
+ Sequence.append(SeqLen, SDValue());
+ for (unsigned I = 0; I != NumOps; ++I) {
+ if (!DemandedElts[I])
+ continue;
+ SDValue &SeqOp = Sequence[I % SeqLen];
+ SDValue Op = getOperand(I);
+ if (Op.isUndef()) {
+ if (!SeqOp)
+ SeqOp = Op;
+ continue;
+ }
+ if (SeqOp && !SeqOp.isUndef() && SeqOp != Op) {
+ Sequence.clear();
+ break;
+ }
+ SeqOp = Op;
+ }
+ if (!Sequence.empty())
+ return true;
+ }
+
+ assert(Sequence.empty() && "Failed to empty non-repeating sequence pattern");
+ return false;
+}
+
+bool BuildVectorSDNode::getRepeatedSequence(SmallVectorImpl<SDValue> &Sequence,
+ BitVector *UndefElements) const {
+ APInt DemandedElts = APInt::getAllOnes(getNumOperands());
+ return getRepeatedSequence(DemandedElts, Sequence, UndefElements);
+}
+
+ConstantSDNode *
+BuildVectorSDNode::getConstantSplatNode(const APInt &DemandedElts,
+ BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantSDNode>(
+ getSplatValue(DemandedElts, UndefElements));
+}
+
+ConstantSDNode *
+BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantSDNode>(getSplatValue(UndefElements));
+}
+
+ConstantFPSDNode *
+BuildVectorSDNode::getConstantFPSplatNode(const APInt &DemandedElts,
+ BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantFPSDNode>(
+ getSplatValue(DemandedElts, UndefElements));
+}
+
+ConstantFPSDNode *
+BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements));
+}
+
+int32_t
+BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
+ uint32_t BitWidth) const {
+ if (ConstantFPSDNode *CN =
+ dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) {
+ bool IsExact;
+ APSInt IntVal(BitWidth);
+ const APFloat &APF = CN->getValueAPF();
+ if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
+ APFloat::opOK ||
+ !IsExact)
+ return -1;
+
+ return IntVal.exactLogBase2();
+ }
+ return -1;
+}
+
+bool BuildVectorSDNode::getConstantRawBits(
+ bool IsLittleEndian, unsigned DstEltSizeInBits,
+ SmallVectorImpl<APInt> &RawBitElements, BitVector &UndefElements) const {
+ // Early-out if this contains anything but Undef/Constant/ConstantFP.
+ if (!isConstant())
+ return false;
+
+ unsigned NumSrcOps = getNumOperands();
+ unsigned SrcEltSizeInBits = getValueType(0).getScalarSizeInBits();
+ assert(((NumSrcOps * SrcEltSizeInBits) % DstEltSizeInBits) == 0 &&
+ "Invalid bitcast scale");
+
+ // Extract raw src bits.
+ SmallVector<APInt> SrcBitElements(NumSrcOps,
+ APInt::getZero(SrcEltSizeInBits));
+  BitVector SrcUndefElements(NumSrcOps, false);
+
+ for (unsigned I = 0; I != NumSrcOps; ++I) {
+ SDValue Op = getOperand(I);
+ if (Op.isUndef()) {
+      SrcUndefElements.set(I);
+ continue;
+ }
+ auto *CInt = dyn_cast<ConstantSDNode>(Op);
+ auto *CFP = dyn_cast<ConstantFPSDNode>(Op);
+ assert((CInt || CFP) && "Unknown constant");
+ SrcBitElements[I] = CInt ? CInt->getAPIntValue().trunc(SrcEltSizeInBits)
+ : CFP->getValueAPF().bitcastToAPInt();
+ }
+
+ // Recast to dst width.
+ recastRawBits(IsLittleEndian, DstEltSizeInBits, RawBitElements,
+                SrcBitElements, UndefElements, SrcUndefElements);
+ return true;
+}
+
+void BuildVectorSDNode::recastRawBits(bool IsLittleEndian,
+ unsigned DstEltSizeInBits,
+ SmallVectorImpl<APInt> &DstBitElements,
+ ArrayRef<APInt> SrcBitElements,
+ BitVector &DstUndefElements,
+ const BitVector &SrcUndefElements) {
+ unsigned NumSrcOps = SrcBitElements.size();
+ unsigned SrcEltSizeInBits = SrcBitElements[0].getBitWidth();
+ assert(((NumSrcOps * SrcEltSizeInBits) % DstEltSizeInBits) == 0 &&
+ "Invalid bitcast scale");
+ assert(NumSrcOps == SrcUndefElements.size() &&
+ "Vector size mismatch");
+
+ unsigned NumDstOps = (NumSrcOps * SrcEltSizeInBits) / DstEltSizeInBits;
+ DstUndefElements.clear();
+ DstUndefElements.resize(NumDstOps, false);
+ DstBitElements.assign(NumDstOps, APInt::getZero(DstEltSizeInBits));
+
+ // Concatenate src elements constant bits together into dst element.
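+  // E.g. recasting four 8-bit elements into one 32-bit element on a
+  // little-endian target places element 0 in bits [0,8) and element 3 in
+  // bits [24,32); the destination is undef only if all four sources are.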
+ if (SrcEltSizeInBits <= DstEltSizeInBits) {
+ unsigned Scale = DstEltSizeInBits / SrcEltSizeInBits;
+ for (unsigned I = 0; I != NumDstOps; ++I) {
+ DstUndefElements.set(I);
+ APInt &DstBits = DstBitElements[I];
+ for (unsigned J = 0; J != Scale; ++J) {
+ unsigned Idx = (I * Scale) + (IsLittleEndian ? J : (Scale - J - 1));
+ if (SrcUndefElements[Idx])
+ continue;
+ DstUndefElements.reset(I);
+ const APInt &SrcBits = SrcBitElements[Idx];
+ assert(SrcBits.getBitWidth() == SrcEltSizeInBits &&
+ "Illegal constant bitwidths");
+ DstBits.insertBits(SrcBits, J * SrcEltSizeInBits);
+ }
+ }
+ return;
+ }
+
+ // Split src element constant bits into dst elements.
+ unsigned Scale = SrcEltSizeInBits / DstEltSizeInBits;
+ for (unsigned I = 0; I != NumSrcOps; ++I) {
+ if (SrcUndefElements[I]) {
+ DstUndefElements.set(I * Scale, (I + 1) * Scale);
+ continue;
+ }
+ const APInt &SrcBits = SrcBitElements[I];
+ for (unsigned J = 0; J != Scale; ++J) {
+ unsigned Idx = (I * Scale) + (IsLittleEndian ? J : (Scale - J - 1));
+ APInt &DstBits = DstBitElements[Idx];
+ DstBits = SrcBits.extractBits(DstEltSizeInBits, J * DstEltSizeInBits);
+ }
+ }
+}
+
+bool BuildVectorSDNode::isConstant() const {
+ for (const SDValue &Op : op_values()) {
+ unsigned Opc = Op.getOpcode();
+ if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP)
+ return false;
+ }
+ return true;
+}
+
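+// Returns the (Start, Stride) pair if this BUILD_VECTOR is a constant
+// arithmetic sequence with a non-zero stride, e.g.
+// <i32 4, i32 6, i32 8, i32 10> yields Start = 4, Stride = 2.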
+std::optional<std::pair<APInt, APInt>>
+BuildVectorSDNode::isConstantSequence() const {
+ unsigned NumOps = getNumOperands();
+ if (NumOps < 2)
+ return std::nullopt;
+
+ if (!isa<ConstantSDNode>(getOperand(0)) ||
+ !isa<ConstantSDNode>(getOperand(1)))
+ return std::nullopt;
+
+ unsigned EltSize = getValueType(0).getScalarSizeInBits();
+ APInt Start = getConstantOperandAPInt(0).trunc(EltSize);
+ APInt Stride = getConstantOperandAPInt(1).trunc(EltSize) - Start;
+
+ if (Stride.isZero())
+ return std::nullopt;
+
+ for (unsigned i = 2; i < NumOps; ++i) {
+ if (!isa<ConstantSDNode>(getOperand(i)))
+ return std::nullopt;
+
+ APInt Val = getConstantOperandAPInt(i).trunc(EltSize);
+ if (Val != (Start + (Stride * i)))
+ return std::nullopt;
+ }
+
+ return std::make_pair(Start, Stride);
+}
+
+bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
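+  // E.g. the mask <-1, 2, -1, 2> is a splat (of element 2), while
+  // <0, 2, 2, 2> is not, because element 0 selects a different input element.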
+ // Find the first non-undef value in the shuffle mask.
+ unsigned i, e;
+ for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i)
+ /* search */;
+
+ // If all elements are undefined, this shuffle can be considered a splat
+ // (although it should eventually get simplified away completely).
+ if (i == e)
+ return true;
+
+ // Make sure all remaining elements are either undef or the same as the first
+ // non-undef value.
+ for (int Idx = Mask[i]; i != e; ++i)
+ if (Mask[i] >= 0 && Mask[i] != Idx)
+ return false;
+ return true;
+}
+
+// Returns the SDNode if it is a constant integer BuildVector
+// or constant integer.
+SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) const {
+ if (isa<ConstantSDNode>(N))
+ return N.getNode();
+ if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
+ return N.getNode();
+ // Treat a GlobalAddress supporting constant offset folding as a
+ // constant integer.
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N))
+ if (GA->getOpcode() == ISD::GlobalAddress &&
+ TLI->isOffsetFoldingLegal(GA))
+ return GA;
+ if ((N.getOpcode() == ISD::SPLAT_VECTOR) &&
+ isa<ConstantSDNode>(N.getOperand(0)))
+ return N.getNode();
+ return nullptr;
+}
+
+// Returns the SDNode if it is a constant float BuildVector
+// or constant float.
+SDNode *SelectionDAG::isConstantFPBuildVectorOrConstantFP(SDValue N) const {
+ if (isa<ConstantFPSDNode>(N))
+ return N.getNode();
+
+ if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
+ return N.getNode();
+
+ if ((N.getOpcode() == ISD::SPLAT_VECTOR) &&
+ isa<ConstantFPSDNode>(N.getOperand(0)))
+ return N.getNode();
+
+ return nullptr;
+}
+
+void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
+ assert(!Node->OperandList && "Node already has operands");
+ assert(SDNode::getMaxNumOperands() >= Vals.size() &&
+ "too many operands to fit into SDNode");
+ SDUse *Ops = OperandRecycler.allocate(
+ ArrayRecycler<SDUse>::Capacity::get(Vals.size()), OperandAllocator);
+
+ bool IsDivergent = false;
+ for (unsigned I = 0; I != Vals.size(); ++I) {
+ Ops[I].setUser(Node);
+ Ops[I].setInitial(Vals[I]);
+    // Skip Chain. It does not carry divergence.
+    if (Ops[I].Val.getValueType() != MVT::Other)
+ IsDivergent |= Ops[I].getNode()->isDivergent();
+ }
+ Node->NumOperands = Vals.size();
+ Node->OperandList = Ops;
+ if (!TLI->isSDNodeAlwaysUniform(Node)) {
+ IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA);
+ Node->SDNodeBits.IsDivergent = IsDivergent;
+ }
+ checkForCycles(Node);
+}
+
+SDValue SelectionDAG::getTokenFactor(const SDLoc &DL,
+ SmallVectorImpl<SDValue> &Vals) {
+ size_t Limit = SDNode::getMaxNumOperands();
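+  // If there are more operands than a single node can hold, repeatedly fold
+  // the trailing Limit values into an intermediate TokenFactor until the
+  // remainder fits in one node.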
+ while (Vals.size() > Limit) {
+ unsigned SliceIdx = Vals.size() - Limit;
+ auto ExtractedTFs = ArrayRef<SDValue>(Vals).slice(SliceIdx, Limit);
+ SDValue NewTF = getNode(ISD::TokenFactor, DL, MVT::Other, ExtractedTFs);
+ Vals.erase(Vals.begin() + SliceIdx, Vals.end());
+ Vals.emplace_back(NewTF);
+ }
+ return getNode(ISD::TokenFactor, DL, MVT::Other, Vals);
+}
+
+SDValue SelectionDAG::getNeutralElement(unsigned Opcode, const SDLoc &DL,
+ EVT VT, SDNodeFlags Flags) {
+ switch (Opcode) {
+ default:
+ return SDValue();
+ case ISD::ADD:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::UMAX:
+ return getConstant(0, DL, VT);
+ case ISD::MUL:
+ return getConstant(1, DL, VT);
+ case ISD::AND:
+ case ISD::UMIN:
+ return getAllOnesConstant(DL, VT);
+ case ISD::SMAX:
+ return getConstant(APInt::getSignedMinValue(VT.getSizeInBits()), DL, VT);
+ case ISD::SMIN:
+ return getConstant(APInt::getSignedMaxValue(VT.getSizeInBits()), DL, VT);
+ case ISD::FADD:
+ return getConstantFP(-0.0, DL, VT);
+ case ISD::FMUL:
+ return getConstantFP(1.0, DL, VT);
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM: {
+ // Neutral element for fminnum is NaN, Inf or FLT_MAX, depending on FMF.
+ const fltSemantics &Semantics = EVTToAPFloatSemantics(VT);
+ APFloat NeutralAF = !Flags.hasNoNaNs() ? APFloat::getQNaN(Semantics) :
+ !Flags.hasNoInfs() ? APFloat::getInf(Semantics) :
+ APFloat::getLargest(Semantics);
+ if (Opcode == ISD::FMAXNUM)
+ NeutralAF.changeSign();
+
+ return getConstantFP(NeutralAF, DL, VT);
+ }
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM: {
+ // Neutral element for fminimum is Inf or FLT_MAX, depending on FMF.
+ const fltSemantics &Semantics = EVTToAPFloatSemantics(VT);
+ APFloat NeutralAF = !Flags.hasNoInfs() ? APFloat::getInf(Semantics)
+ : APFloat::getLargest(Semantics);
+ if (Opcode == ISD::FMAXIMUM)
+ NeutralAF.changeSign();
+
+ return getConstantFP(NeutralAF, DL, VT);
+ }
+  }
+}
+
+/// Helper used to make a call to a library function that has one argument of
+/// pointer type.
+///
+/// Such functions include 'fegetmode', 'fesetenv' and some others, which are
+/// used to get or set floating-point state. They have one argument of pointer
+/// type, which points to the memory region containing bits of the
+/// floating-point state. The value returned by such function is ignored in the
+/// created call.
+///
+/// \param LibFunc Reference to library function (value of RTLIB::Libcall).
+/// \param Ptr Pointer used to save/load state.
+/// \param InChain Ingoing token chain.
+/// \returns Outgoing chain token.
+SDValue SelectionDAG::makeStateFunctionCall(unsigned LibFunc, SDValue Ptr,
+ SDValue InChain,
+ const SDLoc &DLoc) {
+ assert(InChain.getValueType() == MVT::Other && "Expected token chain");
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Ptr;
+ Entry.Ty = Ptr.getValueType().getTypeForEVT(*getContext());
+ Args.push_back(Entry);
+ RTLIB::Libcall LC = static_cast<RTLIB::Libcall>(LibFunc);
+ SDValue Callee = getExternalSymbol(TLI->getLibcallName(LC),
+ TLI->getPointerTy(getDataLayout()));
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(DLoc).setChain(InChain).setLibCallee(
+ TLI->getLibcallCallingConv(LC), Type::getVoidTy(*getContext()), Callee,
+ std::move(Args));
+ return TLI->LowerCallTo(CLI).second;
+}
+
+void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) {
+ assert(From && To && "Invalid SDNode; empty source SDValue?");
+ auto I = SDEI.find(From);
+ if (I == SDEI.end())
+ return;
+
+ // Use of operator[] on the DenseMap may cause an insertion, which invalidates
+ // the iterator, hence the need to make a copy to prevent a use-after-free.
+ NodeExtraInfo NEI = I->second;
+ if (LLVM_LIKELY(!NEI.PCSections)) {
+ // No deep copy required for the types of extra info set.
+ //
+ // FIXME: Investigate if other types of extra info also need deep copy. This
+ // depends on the types of nodes they can be attached to: if some extra info
+ // is only ever attached to nodes where a replacement To node is always the
+ // node where later use and propagation of the extra info has the intended
+ // semantics, no deep copy is required.
+ SDEI[To] = std::move(NEI);
+ return;
+ }
+
+ // We need to copy NodeExtraInfo to all _new_ nodes that are being introduced
+ // through the replacement of From with To. Otherwise, replacements of a node
+ // (From) with more complex nodes (To and its operands) may result in lost
+ // extra info where the root node (To) is insignificant in further propagating
+ // and using extra info when further lowering to MIR.
+ //
+ // In the first step pre-populate the visited set with the nodes reachable
+ // from the old From node. This avoids copying NodeExtraInfo to parts of the
+ // DAG that is not new and should be left untouched.
+ SmallVector<const SDNode *> Leafs{From}; // Leafs reachable with VisitFrom.
+ DenseSet<const SDNode *> FromReach; // The set of nodes reachable from From.
+ auto VisitFrom = [&](auto &&Self, const SDNode *N, int MaxDepth) {
+ if (MaxDepth == 0) {
+ // Remember this node in case we need to increase MaxDepth and continue
+ // populating FromReach from this node.
+ Leafs.emplace_back(N);
+ return;
+ }
+ if (!FromReach.insert(N).second)
+ return;
+ for (const SDValue &Op : N->op_values())
+ Self(Self, Op.getNode(), MaxDepth - 1);
+ };
+
+ // Copy extra info to To and all its transitive operands (that are new).
+ SmallPtrSet<const SDNode *, 8> Visited;
+ auto DeepCopyTo = [&](auto &&Self, const SDNode *N) {
+ if (FromReach.contains(N))
+ return true;
+ if (!Visited.insert(N).second)
+ return true;
+ if (getEntryNode().getNode() == N)
+ return false;
+ for (const SDValue &Op : N->op_values()) {
+ if (!Self(Self, Op.getNode()))
+ return false;
+ }
+ // Copy only if entry node was not reached.
+ SDEI[N] = NEI;
+ return true;
+ };
+
+ // We first try with a lower MaxDepth, assuming that the path to common
+ // operands between From and To is relatively short. This significantly
+ // improves performance in the common case. The initial MaxDepth is big
+ // enough to avoid retry in the common case; the last MaxDepth is large
+ // enough to avoid having to use the fallback below (and protects from
+ // potential stack exhaustion from recursion).
+ for (int PrevDepth = 0, MaxDepth = 16; MaxDepth <= 1024;
+ PrevDepth = MaxDepth, MaxDepth *= 2, Visited.clear()) {
+ // StartFrom is the previous (or initial) set of leafs reachable at the
+ // previous maximum depth.
+ SmallVector<const SDNode *> StartFrom;
+ std::swap(StartFrom, Leafs);
+ for (const SDNode *N : StartFrom)
+ VisitFrom(VisitFrom, N, MaxDepth - PrevDepth);
+ if (LLVM_LIKELY(DeepCopyTo(DeepCopyTo, To)))
+ return;
+ // This should happen very rarely (reached the entry node).
+ LLVM_DEBUG(dbgs() << __func__ << ": MaxDepth=" << MaxDepth << " too low\n");
+ assert(!Leafs.empty());
+ }
+
+ // This should not happen - but if it did, that means the subgraph reachable
+ // from From has depth greater or equal to maximum MaxDepth, and VisitFrom()
+ // could not visit all reachable common operands. Consequently, we were able
+ // to reach the entry node.
+ errs() << "warning: incomplete propagation of SelectionDAG::NodeExtraInfo\n";
+ assert(false && "From subgraph too complex - increase max. MaxDepth?");
+ // Best-effort fallback if assertions disabled.
+ SDEI[To] = std::move(NEI);
+}
+
+#ifndef NDEBUG
+static void checkForCyclesHelper(const SDNode *N,
+ SmallPtrSetImpl<const SDNode*> &Visited,
+ SmallPtrSetImpl<const SDNode*> &Checked,
+ const llvm::SelectionDAG *DAG) {
+ // If this node has already been checked, don't check it again.
+ if (Checked.count(N))
+ return;
+
+ // If a node has already been visited on this depth-first walk, reject it as
+ // a cycle.
+ if (!Visited.insert(N).second) {
+ errs() << "Detected cycle in SelectionDAG\n";
+ dbgs() << "Offending node:\n";
+ N->dumprFull(DAG); dbgs() << "\n";
+ abort();
+ }
+
+ for (const SDValue &Op : N->op_values())
+ checkForCyclesHelper(Op.getNode(), Visited, Checked, DAG);
+
+ Checked.insert(N);
+ Visited.erase(N);
+}
+#endif
+
+void llvm::checkForCycles(const llvm::SDNode *N,
+ const llvm::SelectionDAG *DAG,
+ bool force) {
+#ifndef NDEBUG
+ bool check = force;
+#ifdef EXPENSIVE_CHECKS
+ check = true;
+#endif // EXPENSIVE_CHECKS
+ if (check) {
+ assert(N && "Checking nonexistent SDNode");
+ SmallPtrSet<const SDNode*, 32> visited;
+ SmallPtrSet<const SDNode*, 32> checked;
+ checkForCyclesHelper(N, visited, checked, DAG);
+ }
+#endif // !NDEBUG
+}
+
+void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) {
+ checkForCycles(DAG->getRoot().getNode(), DAG, force);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
new file mode 100644
index 000000000000..a432d8e92bca
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -0,0 +1,324 @@
+//==- llvm/CodeGen/SelectionDAGAddressAnalysis.cpp - DAG Address Analysis --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include <cstdint>
+
+using namespace llvm;
+
+bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other,
+ const SelectionDAG &DAG,
+ int64_t &Off) const {
+  // Conservatively fail if either match failed.
+ if (!Base.getNode() || !Other.Base.getNode())
+ return false;
+ if (!hasValidOffset() || !Other.hasValidOffset())
+ return false;
+ // Initial Offset difference.
+ Off = *Other.Offset - *Offset;
+
+ if ((Other.Index == Index) && (Other.IsIndexSignExt == IsIndexSignExt)) {
+ // Trivial match.
+ if (Other.Base == Base)
+ return true;
+
+ // Match GlobalAddresses
+ if (auto *A = dyn_cast<GlobalAddressSDNode>(Base))
+ if (auto *B = dyn_cast<GlobalAddressSDNode>(Other.Base))
+ if (A->getGlobal() == B->getGlobal()) {
+ Off += B->getOffset() - A->getOffset();
+ return true;
+ }
+
+ // Match Constants
+ if (auto *A = dyn_cast<ConstantPoolSDNode>(Base))
+ if (auto *B = dyn_cast<ConstantPoolSDNode>(Other.Base)) {
+ bool IsMatch =
+ A->isMachineConstantPoolEntry() == B->isMachineConstantPoolEntry();
+ if (IsMatch) {
+ if (A->isMachineConstantPoolEntry())
+ IsMatch = A->getMachineCPVal() == B->getMachineCPVal();
+ else
+ IsMatch = A->getConstVal() == B->getConstVal();
+ }
+ if (IsMatch) {
+ Off += B->getOffset() - A->getOffset();
+ return true;
+ }
+ }
+
+ const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+ // Match FrameIndexes.
+ if (auto *A = dyn_cast<FrameIndexSDNode>(Base))
+ if (auto *B = dyn_cast<FrameIndexSDNode>(Other.Base)) {
+ // Equal FrameIndexes - offsets are directly comparable.
+ if (A->getIndex() == B->getIndex())
+ return true;
+ // Non-equal FrameIndexes - If both frame indices are fixed
+ // we know their relative offsets and can compare them. Otherwise
+ // we must be conservative.
+ if (MFI.isFixedObjectIndex(A->getIndex()) &&
+ MFI.isFixedObjectIndex(B->getIndex())) {
+ Off += MFI.getObjectOffset(B->getIndex()) -
+ MFI.getObjectOffset(A->getIndex());
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool BaseIndexOffset::computeAliasing(const SDNode *Op0,
+ const std::optional<int64_t> NumBytes0,
+ const SDNode *Op1,
+ const std::optional<int64_t> NumBytes1,
+ const SelectionDAG &DAG, bool &IsAlias) {
+
+ BaseIndexOffset BasePtr0 = match(Op0, DAG);
+ BaseIndexOffset BasePtr1 = match(Op1, DAG);
+
+ if (!(BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()))
+ return false;
+ int64_t PtrDiff;
+ if (NumBytes0 && NumBytes1 &&
+ BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) {
+    // If the size of a memory access is unknown, do not use it in the
+    // analysis. One example of an unknown-size memory access is a load/store
+    // of a scalable vector object on the stack.
+ // BasePtr1 is PtrDiff away from BasePtr0. They alias if none of the
+ // following situations arise:
+ if (PtrDiff >= 0 &&
+ *NumBytes0 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+ // [----BasePtr0----]
+ // [---BasePtr1--]
+ // ========PtrDiff========>
+ IsAlias = !(*NumBytes0 <= PtrDiff);
+ return true;
+ }
+ if (PtrDiff < 0 &&
+ *NumBytes1 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+ // [----BasePtr0----]
+ // [---BasePtr1--]
+ // =====(-PtrDiff)====>
+ IsAlias = !((PtrDiff + *NumBytes1) <= 0);
+ return true;
+ }
+ return false;
+ }
+ // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
+ // able to calculate their relative offset if at least one arises
+ // from an alloca. However, these allocas cannot overlap and we
+ // can infer there is no alias.
+ if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
+ if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+      // The frame indices differ and at least one is not a fixed object, so
+      // we could not compute a relative offset; distinct stack objects cannot
+      // overlap, so report no alias.
+ if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
+ !MFI.isFixedObjectIndex(B->getIndex()))) {
+ IsAlias = false;
+ return true;
+ }
+ }
+
+ bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
+ bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
+ bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
+ bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
+ bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
+ bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
+
+ if ((IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1)) {
+    // We can derive NoAlias in case of mismatched base types.
+ if (IsFI0 != IsFI1 || IsGV0 != IsGV1 || IsCV0 != IsCV1) {
+ IsAlias = false;
+ return true;
+ }
+ if (IsGV0 && IsGV1) {
+ auto *GV0 = cast<GlobalAddressSDNode>(BasePtr0.getBase())->getGlobal();
+ auto *GV1 = cast<GlobalAddressSDNode>(BasePtr1.getBase())->getGlobal();
+      // It doesn't make sense to access one global value using another
+      // global's address, so we can assume that there is no aliasing in case
+      // of two different globals (unless we have symbols that may indirectly
+      // point to each other).
+      // FIXME: This is perhaps a bit too defensive. We could try to follow
+      // the chain with aliasee information for GlobalAlias variables to find
+      // out whether the indirect symbols may alias or not.
+ if (GV0 != GV1 && !isa<GlobalAlias>(GV0) && !isa<GlobalAlias>(GV1)) {
+ IsAlias = false;
+ return true;
+ }
+ }
+ }
+ return false; // Cannot determine whether the pointers alias.
+}
+
+bool BaseIndexOffset::contains(const SelectionDAG &DAG, int64_t BitSize,
+ const BaseIndexOffset &Other,
+ int64_t OtherBitSize, int64_t &BitOffset) const {
+ int64_t Offset;
+ if (!equalBaseIndex(Other, DAG, Offset))
+ return false;
+ if (Offset >= 0) {
+ // Other is after *this:
+ // [-------*this---------]
+ // [---Other--]
+ // ==Offset==>
+ BitOffset = 8 * Offset;
+ return BitOffset + OtherBitSize <= BitSize;
+ }
+ // Other starts strictly before *this, it cannot be fully contained.
+ // [-------*this---------]
+ // [--Other--]
+ return false;
+}
+
+/// Parses tree in Ptr for base, index, offset addresses.
+static BaseIndexOffset matchLSNode(const LSBaseSDNode *N,
+ const SelectionDAG &DAG) {
+ SDValue Ptr = N->getBasePtr();
+
+ // (((B + I*M) + c)) + c ...
+ SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
+ SDValue Index = SDValue();
+ int64_t Offset = 0;
+ bool IsIndexSignExt = false;
+
+ // pre-inc/pre-dec ops are components of EA.
+ if (N->getAddressingMode() == ISD::PRE_INC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+ Offset += C->getSExtValue();
+ else // If unknown, give up now.
+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+ Offset -= C->getSExtValue();
+ else // If unknown, give up now.
+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ }
+
+ // Consume constant adds & ors with appropriate masking.
+ while (true) {
+ switch (Base->getOpcode()) {
+ case ISD::OR:
+ // Only consider ORs which act as adds.
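+      // E.g. (or (shl %x, 4), 3) is equivalent to (add (shl %x, 4), 3)
+      // because the low bits of the shifted value are known to be zero.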
+ if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1)))
+ if (DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue())) {
+ Offset += C->getSExtValue();
+ Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0));
+ continue;
+ }
+ break;
+ case ISD::ADD:
+ if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
+ Offset += C->getSExtValue();
+ Base = DAG.getTargetLoweringInfo().unwrapAddress(Base->getOperand(0));
+ continue;
+ }
+ break;
+ case ISD::LOAD:
+ case ISD::STORE: {
+ auto *LSBase = cast<LSBaseSDNode>(Base.getNode());
+ unsigned int IndexResNo = (Base->getOpcode() == ISD::LOAD) ? 1 : 0;
+ if (LSBase->isIndexed() && Base.getResNo() == IndexResNo)
+ if (auto *C = dyn_cast<ConstantSDNode>(LSBase->getOffset())) {
+ auto Off = C->getSExtValue();
+ if (LSBase->getAddressingMode() == ISD::PRE_DEC ||
+ LSBase->getAddressingMode() == ISD::POST_DEC)
+ Offset -= Off;
+ else
+ Offset += Off;
+ Base = DAG.getTargetLoweringInfo().unwrapAddress(LSBase->getBasePtr());
+ continue;
+ }
+ break;
+ }
+ }
+    // If we get here, break out of the loop.
+ break;
+ }
+
+ if (Base->getOpcode() == ISD::ADD) {
+ // TODO: The following code appears to be needless as it just
+ // bails on some Ptrs early, reducing the cases where we
+ // find equivalence. We should be able to remove this.
+ // Inside a loop the current BASE pointer is calculated using an ADD and a
+ // MUL instruction. In this case Base is the actual BASE pointer.
+ // (i64 add (i64 %array_ptr)
+ // (i64 mul (i64 %induction_var)
+ // (i64 %element_size)))
+ if (Base->getOperand(1)->getOpcode() == ISD::MUL)
+ return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt);
+
+ // Look at Base + Index + Offset cases.
+ Index = Base->getOperand(1);
+ SDValue PotentialBase = Base->getOperand(0);
+
+ // Skip signextends.
+ if (Index->getOpcode() == ISD::SIGN_EXTEND) {
+ Index = Index->getOperand(0);
+ IsIndexSignExt = true;
+ }
+
+    // Check for an (Index + Offset) pattern.
+ if (Index->getOpcode() != ISD::ADD ||
+ !isa<ConstantSDNode>(Index->getOperand(1)))
+ return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt);
+
+ Offset += cast<ConstantSDNode>(Index->getOperand(1))->getSExtValue();
+ Index = Index->getOperand(0);
+ if (Index->getOpcode() == ISD::SIGN_EXTEND) {
+ Index = Index->getOperand(0);
+ IsIndexSignExt = true;
+ } else
+ IsIndexSignExt = false;
+ Base = PotentialBase;
+ }
+ return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt);
+}
+
+BaseIndexOffset BaseIndexOffset::match(const SDNode *N,
+ const SelectionDAG &DAG) {
+ if (const auto *LS0 = dyn_cast<LSBaseSDNode>(N))
+ return matchLSNode(LS0, DAG);
+ if (const auto *LN = dyn_cast<LifetimeSDNode>(N)) {
+ if (LN->hasOffset())
+ return BaseIndexOffset(LN->getOperand(1), SDValue(), LN->getOffset(),
+ false);
+ return BaseIndexOffset(LN->getOperand(1), SDValue(), false);
+ }
+ return BaseIndexOffset();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+LLVM_DUMP_METHOD void BaseIndexOffset::dump() const {
+ print(dbgs());
+}
+
+void BaseIndexOffset::print(raw_ostream& OS) const {
+ OS << "BaseIndexOffset base=[";
+ Base->print(OS);
+ OS << "] index=[";
+ if (Index)
+ Index->print(OS);
+ OS << "] offset=" << Offset;
+}
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
new file mode 100644
index 000000000000..20c37eb4cb11
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -0,0 +1,11977 @@
+//===- SelectionDAGBuilder.cpp - Selection-DAG building -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SelectionDAGBuilder.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/AssignmentTrackingAnalysis.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundleIterator.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/SwiftErrorValueTracking.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cstddef>
+#include <iterator>
+#include <limits>
+#include <optional>
+#include <tuple>
+
+using namespace llvm;
+using namespace PatternMatch;
+using namespace SwitchCG;
+
+#define DEBUG_TYPE "isel"
+
+/// LimitFloatPrecision - Generate low-precision inline sequences for
+/// some float libcalls (6, 8 or 12 bits).
+static unsigned LimitFloatPrecision;
+
+static cl::opt<bool>
+ InsertAssertAlign("insert-assert-align", cl::init(true),
+ cl::desc("Insert the experimental `assertalign` node."),
+ cl::ReallyHidden);
+
+static cl::opt<unsigned, true>
+ LimitFPPrecision("limit-float-precision",
+ cl::desc("Generate low-precision inline sequences "
+ "for some float libcalls"),
+ cl::location(LimitFloatPrecision), cl::Hidden,
+ cl::init(0));
+
+static cl::opt<unsigned> SwitchPeelThreshold(
+ "switch-peel-threshold", cl::Hidden, cl::init(66),
+ cl::desc("Set the case probability threshold for peeling the case from a "
+ "switch statement. A value greater than 100 will void this "
+ "optimization"));
+
+// Limit the width of DAG chains. This is important in general to prevent
+// DAG-based analysis from blowing up. For example, alias analysis and
+// load clustering may not complete in reasonable time. It is difficult to
+// recognize and avoid this situation within each individual analysis, and
+// future analyses are likely to have the same behavior. Limiting DAG width is
+// the safe approach and will be especially important with global DAGs.
+//
+// MaxParallelChains default is arbitrarily high to avoid affecting
+// optimization, but could be lowered to improve compile time. Any ld-ld-st-st
+// sequence over this should have been converted to llvm.memcpy by the
+// frontend. It is easy to induce this behavior with .ll code such as:
+// %buffer = alloca [4096 x i8]
+// %data = load [4096 x i8]* %argPtr
+// store [4096 x i8] %data, [4096 x i8]* %buffer
+static const unsigned MaxParallelChains = 64;
+
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, const Value *V,
+ std::optional<CallingConv::ID> CC);
+
+/// getCopyFromParts - Create a value that contains the specified legal parts
+/// combined into the value they represent. If the parts combine to a type
+/// larger than ValueVT then AssertOp can be used to specify whether the extra
+/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
+/// (ISD::AssertSext).
+static SDValue
+getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts,
+ unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V,
+ std::optional<CallingConv::ID> CC = std::nullopt,
+ std::optional<ISD::NodeType> AssertOp = std::nullopt) {
+ // Let the target assemble the parts if it wants to
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (SDValue Val = TLI.joinRegisterPartsIntoValue(DAG, DL, Parts, NumParts,
+ PartVT, ValueVT, CC))
+ return Val;
+
+ if (ValueVT.isVector())
+ return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V,
+ CC);
+
+ assert(NumParts > 0 && "No parts to assemble!");
+ SDValue Val = Parts[0];
+
+ if (NumParts > 1) {
+ // Assemble the value from multiple parts.
+ if (ValueVT.isInteger()) {
+ unsigned PartBits = PartVT.getSizeInBits();
+ unsigned ValueBits = ValueVT.getSizeInBits();
+
+ // Assemble the power of 2 part.
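+      // E.g. assembling an i96 value from three i32 parts first combines the
+      // first two parts into an i64 (the power-of-2 portion), then merges in
+      // the remaining i32 part via the SHL/OR sequence below.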
+ unsigned RoundParts = llvm::bit_floor(NumParts);
+ unsigned RoundBits = PartBits * RoundParts;
+ EVT RoundVT = RoundBits == ValueBits ?
+ ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits);
+ SDValue Lo, Hi;
+
+ EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);
+
+ if (RoundParts > 2) {
+ Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
+ PartVT, HalfVT, V);
+ Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
+ RoundParts / 2, PartVT, HalfVT, V);
+ } else {
+ Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
+ Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
+ }
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi);
+
+ if (RoundParts < NumParts) {
+ // Assemble the trailing non-power-of-2 part.
+ unsigned OddParts = NumParts - RoundParts;
+ EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
+ Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT,
+ OddVT, V, CC);
+
+ // Combine the round and odd parts.
+ Lo = Val;
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+ EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, TotalVT, Hi,
+ DAG.getConstant(Lo.getValueSizeInBits(), DL,
+ TLI.getShiftAmountTy(
+ TotalVT, DAG.getDataLayout())));
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo);
+ Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi);
+ }
+ } else if (PartVT.isFloatingPoint()) {
+ // FP split into multiple FP parts (for ppcf128)
+ assert(ValueVT == EVT(MVT::ppcf128) && PartVT == MVT::f64 &&
+ "Unexpected split");
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
+ Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+ Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
+ } else {
+ // FP split into integer parts (soft fp)
+ assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
+ !PartVT.isVector() && "Unexpected split");
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+ Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, CC);
+ }
+ }
+
+ // There is now one part, held in Val. Correct it to match ValueVT.
+ // PartEVT is the type of the register class that holds the value.
+ // ValueVT is the type of the inline asm operation.
+ EVT PartEVT = Val.getValueType();
+
+ if (PartEVT == ValueVT)
+ return Val;
+
+ if (PartEVT.isInteger() && ValueVT.isFloatingPoint() &&
+ ValueVT.bitsLT(PartEVT)) {
+ // For an FP value in an integer part, we need to truncate to the right
+ // width first.
+ PartEVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+ Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val);
+ }
+
+ // Handle types that have the same size.
+ if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+ // Handle types with different sizes.
+ if (PartEVT.isInteger() && ValueVT.isInteger()) {
+ if (ValueVT.bitsLT(PartEVT)) {
+ // For a truncate, see if we have any information to
+ // indicate whether the truncated bits will always be
+ // zero or sign-extension.
+ if (AssertOp)
+ Val = DAG.getNode(*AssertOp, DL, PartEVT, Val,
+ DAG.getValueType(ValueVT));
+ return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+ }
+ return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
+ }
+
+ if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+ // FP_ROUND's are always exact here.
+ if (ValueVT.bitsLT(Val.getValueType()))
+ return DAG.getNode(
+ ISD::FP_ROUND, DL, ValueVT, Val,
+ DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())));
+
+ return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
+ }
+
+ // Handle MMX to a narrower integer type by bitcasting MMX to integer and
+ // then truncating.
+ if (PartEVT == MVT::x86mmx && ValueVT.isInteger() &&
+ ValueVT.bitsLT(PartEVT)) {
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Val);
+ return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+ }
+
+ report_fatal_error("Unknown mismatch in getCopyFromParts!");
+}
+
+static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
+ const Twine &ErrMsg) {
+ const Instruction *I = dyn_cast_or_null<Instruction>(V);
+ if (!V)
+ return Ctx.emitError(ErrMsg);
+
+ const char *AsmError = ", possible invalid constraint for vector type";
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (CI->isInlineAsm())
+ return Ctx.emitError(I, ErrMsg + AsmError);
+
+ return Ctx.emitError(I, ErrMsg);
+}
+
+/// getCopyFromPartsVector - Create a value that contains the specified legal
+/// parts combined into the value they represent. If the parts combine to a
+/// type larger than ValueVT then AssertOp can be used to specify whether the
+/// extra bits are known to be zero (ISD::AssertZext) or sign extended from
+/// ValueVT (ISD::AssertSext).
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, const Value *V,
+ std::optional<CallingConv::ID> CallConv) {
+ assert(ValueVT.isVector() && "Not a vector value");
+ assert(NumParts > 0 && "No parts to assemble!");
+ const bool IsABIRegCopy = CallConv.has_value();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Val = Parts[0];
+
+ // Handle a multi-element vector.
+ if (NumParts > 1) {
+ EVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ unsigned NumRegs;
+
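+ // For an ABI register copy, the breakdown must follow the register types
+ // mangled for the calling convention rather than the generic
+ // type-legalization breakdown.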
+ if (IsABIRegCopy) {
+ NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
+ *DAG.getContext(), *CallConv, ValueVT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ } else {
+ NumRegs =
+ TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ }
+
+ assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+ NumParts = NumRegs; // Silence a compiler warning.
+ assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+ assert(RegisterVT.getSizeInBits() ==
+ Parts[0].getSimpleValueType().getSizeInBits() &&
+ "Part type sizes don't match!");
+
+ // Assemble the parts into intermediate operands.
+ SmallVector<SDValue, 8> Ops(NumIntermediates);
+ if (NumIntermediates == NumParts) {
+ // If the register was not expanded, truncate or copy the value,
+ // as appropriate.
+ for (unsigned i = 0; i != NumParts; ++i)
+ Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
+ PartVT, IntermediateVT, V, CallConv);
+ } else if (NumParts > 0) {
+ // If the intermediate type was expanded, build the intermediate
+ // operands from the parts.
+ assert(NumParts % NumIntermediates == 0 &&
+ "Must expand into a divisible number of parts!");
+ unsigned Factor = NumParts / NumIntermediates;
+ for (unsigned i = 0; i != NumIntermediates; ++i)
+ Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
+ PartVT, IntermediateVT, V, CallConv);
+ }
+
+ // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
+ // intermediate operands.
+ EVT BuiltVectorTy =
+ IntermediateVT.isVector()
+ ? EVT::getVectorVT(
+ *DAG.getContext(), IntermediateVT.getScalarType(),
+ IntermediateVT.getVectorElementCount() * NumParts)
+ : EVT::getVectorVT(*DAG.getContext(),
+ IntermediateVT.getScalarType(),
+ NumIntermediates);
+ Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS
+ : ISD::BUILD_VECTOR,
+ DL, BuiltVectorTy, Ops);
+ }
+
+ // There is now one part, held in Val. Correct it to match ValueVT.
+ EVT PartEVT = Val.getValueType();
+
+ if (PartEVT == ValueVT)
+ return Val;
+
+ if (PartEVT.isVector()) {
+ // Vector/Vector bitcast.
+ if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+ // If the parts vector has more elements than the value vector, then we
+ // have a vector widening case (e.g. <2 x float> -> <4 x float>).
+ // Extract the elements we want.
+ if (PartEVT.getVectorElementCount() != ValueVT.getVectorElementCount()) {
+ assert((PartEVT.getVectorElementCount().getKnownMinValue() >
+ ValueVT.getVectorElementCount().getKnownMinValue()) &&
+ (PartEVT.getVectorElementCount().isScalable() ==
+ ValueVT.getVectorElementCount().isScalable()) &&
+ "Cannot narrow, it would be a lossy transformation");
+ PartEVT =
+ EVT::getVectorVT(*DAG.getContext(), PartEVT.getVectorElementType(),
+ ValueVT.getVectorElementCount());
+ Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, PartEVT, Val,
+ DAG.getVectorIdxConstant(0, DL));
+ if (PartEVT == ValueVT)
+ return Val;
+ if (PartEVT.isInteger() && ValueVT.isFloatingPoint())
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+ // Vector/Vector bitcast (e.g. <2 x bfloat> -> <2 x half>).
+ if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+ }
+
+ // Promoted vector extract
+ return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
+ }
+
+ // Trivial bitcast if the types are the same size and the destination
+ // vector type is legal.
+ if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits() &&
+ TLI.isTypeLegal(ValueVT))
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+ if (ValueVT.getVectorNumElements() != 1) {
+ // Certain ABIs require that vectors are passed as integers. If the
+ // vectors are the same size, this is an obvious bitcast.
+ if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) {
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+ } else if (ValueVT.bitsLT(PartEVT)) {
+ const uint64_t ValueSize = ValueVT.getFixedSizeInBits();
+ EVT IntermediateType = EVT::getIntegerVT(*DAG.getContext(), ValueSize);
+ // Drop the extra bits.
+ Val = DAG.getNode(ISD::TRUNCATE, DL, IntermediateType, Val);
+ return DAG.getBitcast(ValueVT, Val);
+ }
+
+ diagnosePossiblyInvalidConstraint(
+ *DAG.getContext(), V, "non-trivial scalar-to-vector conversion");
+ return DAG.getUNDEF(ValueVT);
+ }
+
+ // Handle cases such as i8 -> <1 x i1>
+ EVT ValueSVT = ValueVT.getVectorElementType();
+ if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) {
+ unsigned ValueSize = ValueSVT.getSizeInBits();
+ if (ValueSize == PartEVT.getSizeInBits()) {
+ Val = DAG.getNode(ISD::BITCAST, DL, ValueSVT, Val);
+ } else if (ValueSVT.isFloatingPoint() && PartEVT.isInteger()) {
+ // It's possible a scalar floating point type gets softened to integer and
+ // then promoted to a larger integer. If PartEVT is the larger integer
+ // we need to truncate it and then bitcast to the FP type.
+ assert(ValueSVT.bitsLT(PartEVT) && "Unexpected types");
+ EVT IntermediateType = EVT::getIntegerVT(*DAG.getContext(), ValueSize);
+ Val = DAG.getNode(ISD::TRUNCATE, DL, IntermediateType, Val);
+ Val = DAG.getBitcast(ValueSVT, Val);
+ } else {
+ Val = ValueVT.isFloatingPoint()
+ ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
+ : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+ }
+ }
+
+ return DAG.getBuildVector(ValueVT, DL, Val);
+}
+
+static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Val, SDValue *Parts, unsigned NumParts,
+ MVT PartVT, const Value *V,
+ std::optional<CallingConv::ID> CallConv);
+
+/// getCopyToParts - Create a series of nodes that contain the specified value
+/// split into legal parts. If the parts contain more bits than Val, then, for
+/// integers, ExtendKind can be used to specify how to generate the extra bits.
+static void
+getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+ unsigned NumParts, MVT PartVT, const Value *V,
+ std::optional<CallingConv::ID> CallConv = std::nullopt,
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+ // Let the target split the parts if it wants to
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.splitValueIntoRegisterParts(DAG, DL, Val, Parts, NumParts, PartVT,
+ CallConv))
+ return;
+ EVT ValueVT = Val.getValueType();
+
+ // Handle the vector case separately.
+ if (ValueVT.isVector())
+ return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V,
+ CallConv);
+
+ unsigned OrigNumParts = NumParts;
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) &&
+ "Copying to an illegal type!");
+
+ if (NumParts == 0)
+ return;
+
+ assert(!ValueVT.isVector() && "Vector case handled elsewhere");
+ EVT PartEVT = PartVT;
+ if (PartEVT == ValueVT) {
+ assert(NumParts == 1 && "No-op copy with multiple parts!");
+ Parts[0] = Val;
+ return;
+ }
+
+ unsigned PartBits = PartVT.getSizeInBits();
+ if (NumParts * PartBits > ValueVT.getSizeInBits()) {
+ // If the parts cover more bits than the value has, promote the value.
+ if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+ assert(NumParts == 1 && "Do not know what to promote to!");
+ Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
+ } else {
+ if (ValueVT.isFloatingPoint()) {
+ // FP values need to be bitcast, then extended if they are being put
+ // into a larger container.
+ ValueVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+ Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+ }
+ assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
+ ValueVT.isInteger() &&
+ "Unknown mismatch!");
+ ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+ Val = DAG.getNode(ExtendKind, DL, ValueVT, Val);
+ if (PartVT == MVT::x86mmx)
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ }
+ } else if (PartBits == ValueVT.getSizeInBits()) {
+ // Different types of the same size.
+ assert(NumParts == 1 && PartEVT != ValueVT);
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ } else if (NumParts * PartBits < ValueVT.getSizeInBits()) {
+ // If the parts cover fewer bits than the value has, truncate the value.
+ assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
+ ValueVT.isInteger() &&
+ "Unknown mismatch!");
+ ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+ if (PartVT == MVT::x86mmx)
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ }
+
+ // The value may have changed - recompute ValueVT.
+ ValueVT = Val.getValueType();
+ assert(NumParts * PartBits == ValueVT.getSizeInBits() &&
+ "Failed to tile the value with PartVT!");
+
+ if (NumParts == 1) {
+ if (PartEVT != ValueVT) {
+ diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
+ "scalar-to-vector conversion failed");
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ }
+
+ Parts[0] = Val;
+ return;
+ }
+
+ // Expand the value into multiple parts.
+ if (NumParts & (NumParts - 1)) {
+ // The number of parts is not a power of 2. Split off and copy the tail.
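+ // For example, an i96 copied into three i32 parts first peels off the
+ // high i32 as the odd part, then recurses on the remaining i64.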
+ assert(PartVT.isInteger() && ValueVT.isInteger() &&
+ "Do not know what to expand to!");
+ unsigned RoundParts = llvm::bit_floor(NumParts);
+ unsigned RoundBits = RoundParts * PartBits;
+ unsigned OddParts = NumParts - RoundParts;
+ SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val,
+ DAG.getShiftAmountConstant(RoundBits, ValueVT, DL));
+
+ getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V,
+ CallConv);
+
+ if (DAG.getDataLayout().isBigEndian())
+ // The odd parts were reversed by getCopyToParts - unreverse them.
+ std::reverse(Parts + RoundParts, Parts + NumParts);
+
+ NumParts = RoundParts;
+ ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+ }
+
+ // The number of parts is a power of 2. Repeatedly bisect the value using
+ // EXTRACT_ELEMENT.
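+ // For example, four i32 parts of an i128 are produced by first extracting
+ // the two i64 halves and then splitting each half into two i32s.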
+ Parts[0] = DAG.getNode(ISD::BITCAST, DL,
+ EVT::getIntegerVT(*DAG.getContext(),
+ ValueVT.getSizeInBits()),
+ Val);
+
+ for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) {
+ for (unsigned i = 0; i < NumParts; i += StepSize) {
+ unsigned ThisBits = StepSize * PartBits / 2;
+ EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits);
+ SDValue &Part0 = Parts[i];
+ SDValue &Part1 = Parts[i+StepSize/2];
+
+ Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
+ ThisVT, Part0, DAG.getIntPtrConstant(1, DL));
+ Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
+ ThisVT, Part0, DAG.getIntPtrConstant(0, DL));
+
+ if (ThisBits == PartBits && ThisVT != PartVT) {
+ Part0 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part0);
+ Part1 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part1);
+ }
+ }
+ }
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::reverse(Parts, Parts + OrigNumParts);
+}
+
+static SDValue widenVectorToPartType(SelectionDAG &DAG, SDValue Val,
+ const SDLoc &DL, EVT PartVT) {
+ if (!PartVT.isVector())
+ return SDValue();
+
+ EVT ValueVT = Val.getValueType();
+ EVT PartEVT = PartVT.getVectorElementType();
+ EVT ValueEVT = ValueVT.getVectorElementType();
+ ElementCount PartNumElts = PartVT.getVectorElementCount();
+ ElementCount ValueNumElts = ValueVT.getVectorElementCount();
+
+ // We only support widening vectors with equivalent element types and
+ // fixed/scalable properties. If a target needs to widen a fixed-length type
+ // to a scalable one, it should be possible to use INSERT_SUBVECTOR below.
+ if (ElementCount::isKnownLE(PartNumElts, ValueNumElts) ||
+ PartNumElts.isScalable() != ValueNumElts.isScalable())
+ return SDValue();
+
+ // Also handle bf16, since some targets give it the same ABI as fp16.
+ if (ValueEVT == MVT::bf16 && PartEVT == MVT::f16) {
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) &&
+ "Cannot widen to illegal type");
+ Val = DAG.getNode(ISD::BITCAST, DL,
+ ValueVT.changeVectorElementType(MVT::f16), Val);
+ } else if (PartEVT != ValueEVT) {
+ return SDValue();
+ }
+
+ // Widening a scalable vector to another scalable vector is done by inserting
+ // the vector into a larger undef one.
+ if (PartNumElts.isScalable())
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT),
+ Val, DAG.getVectorIdxConstant(0, DL));
+
+ // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in
+ // undef elements.
+ SmallVector<SDValue, 16> Ops;
+ DAG.ExtractVectorElements(Val, Ops);
+ SDValue EltUndef = DAG.getUNDEF(PartEVT);
+ Ops.append((PartNumElts - ValueNumElts).getFixedValue(), EltUndef);
+
+ // FIXME: Use CONCAT for 2x -> 4x.
+ return DAG.getBuildVector(PartVT, DL, Ops);
+}
+
+/// getCopyToPartsVector - Create a series of nodes that contain the specified
+/// value split into legal parts.
+static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Val, SDValue *Parts, unsigned NumParts,
+ MVT PartVT, const Value *V,
+ std::optional<CallingConv::ID> CallConv) {
+ EVT ValueVT = Val.getValueType();
+ assert(ValueVT.isVector() && "Not a vector");
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const bool IsABIRegCopy = CallConv.has_value();
+
+ if (NumParts == 1) {
+ EVT PartEVT = PartVT;
+ if (PartEVT == ValueVT) {
+ // Nothing to do.
+ } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
+ // Bitconvert vector->vector case.
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ } else if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, PartVT)) {
+ Val = Widened;
+ } else if (PartVT.isVector() &&
+ PartEVT.getVectorElementType().bitsGE(
+ ValueVT.getVectorElementType()) &&
+ PartEVT.getVectorElementCount() ==
+ ValueVT.getVectorElementCount()) {
+
+ // Promoted vector extract
+ Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+ } else if (PartEVT.isVector() &&
+ PartEVT.getVectorElementType() !=
+ ValueVT.getVectorElementType() &&
+ TLI.getTypeAction(*DAG.getContext(), ValueVT) ==
+ TargetLowering::TypeWidenVector) {
+ // Combination of widening and promotion.
+ EVT WidenVT =
+ EVT::getVectorVT(*DAG.getContext(), ValueVT.getVectorElementType(),
+ PartVT.getVectorElementCount());
+ SDValue Widened = widenVectorToPartType(DAG, Val, DL, WidenVT);
+ Val = DAG.getAnyExtOrTrunc(Widened, DL, PartVT);
+ } else {
+ // Don't extract an integer from a float vector. This can happen if the
+ // FP type gets softened to integer and then promoted. The promotion
+ // prevents it from being picked up by the earlier bitcast case.
+ if (ValueVT.getVectorElementCount().isScalar() &&
+ (!ValueVT.isFloatingPoint() || !PartVT.isInteger())) {
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
+ DAG.getVectorIdxConstant(0, DL));
+ } else {
+ uint64_t ValueSize = ValueVT.getFixedSizeInBits();
+ assert(PartVT.getFixedSizeInBits() > ValueSize &&
+ "lossy conversion of vector to scalar type");
+ EVT IntermediateType = EVT::getIntegerVT(*DAG.getContext(), ValueSize);
+ Val = DAG.getBitcast(IntermediateType, Val);
+ Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+ }
+ }
+
+ assert(Val.getValueType() == PartVT && "Unexpected vector part value type");
+ Parts[0] = Val;
+ return;
+ }
+
+ // Handle a multi-element vector.
+ EVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ unsigned NumRegs;
+ if (IsABIRegCopy) {
+ NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
+ *DAG.getContext(), *CallConv, ValueVT, IntermediateVT, NumIntermediates,
+ RegisterVT);
+ } else {
+ NumRegs =
+ TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ }
+
+ assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+ NumParts = NumRegs; // Silence a compiler warning.
+ assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+
+ assert(IntermediateVT.isScalableVector() == ValueVT.isScalableVector() &&
+ "Mixing scalable and fixed vectors when copying in parts");
+
+ std::optional<ElementCount> DestEltCnt;
+
+ if (IntermediateVT.isVector())
+ DestEltCnt = IntermediateVT.getVectorElementCount() * NumIntermediates;
+ else
+ DestEltCnt = ElementCount::getFixed(NumIntermediates);
+
+ EVT BuiltVectorTy = EVT::getVectorVT(
+ *DAG.getContext(), IntermediateVT.getScalarType(), *DestEltCnt);
+
+ if (ValueVT == BuiltVectorTy) {
+ // Nothing to do.
+ } else if (ValueVT.getSizeInBits() == BuiltVectorTy.getSizeInBits()) {
+ // Bitconvert vector->vector case.
+ Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
+ } else {
+ if (BuiltVectorTy.getVectorElementType().bitsGT(
+ ValueVT.getVectorElementType())) {
+ // Integer promotion.
+ ValueVT = EVT::getVectorVT(*DAG.getContext(),
+ BuiltVectorTy.getVectorElementType(),
+ ValueVT.getVectorElementCount());
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
+ }
+
+ if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy)) {
+ Val = Widened;
+ }
+ }
+
+ assert(Val.getValueType() == BuiltVectorTy && "Unexpected vector value type");
+
+ // Split the vector into intermediate operands.
+ SmallVector<SDValue, 8> Ops(NumIntermediates);
+ for (unsigned i = 0; i != NumIntermediates; ++i) {
+ if (IntermediateVT.isVector()) {
+ // This does something sensible for scalable vectors - see the
+ // definition of EXTRACT_SUBVECTOR for further details.
+ unsigned IntermediateNumElts = IntermediateVT.getVectorMinNumElements();
+ Ops[i] =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
+ DAG.getVectorIdxConstant(i * IntermediateNumElts, DL));
+ } else {
+ Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
+ DAG.getVectorIdxConstant(i, DL));
+ }
+ }
+
+ // Split the intermediate operands into legal parts.
+ if (NumParts == NumIntermediates) {
+ // If the register was not expanded, promote or copy the value,
+ // as appropriate.
+ for (unsigned i = 0; i != NumParts; ++i)
+ getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V, CallConv);
+ } else if (NumParts > 0) {
+ // If the intermediate type was expanded, split each value into legal
+ // parts.
+ assert(NumIntermediates != 0 && "division by zero");
+ assert(NumParts % NumIntermediates == 0 &&
+ "Must expand into a divisible number of parts!");
+ unsigned Factor = NumParts / NumIntermediates;
+ for (unsigned i = 0; i != NumIntermediates; ++i)
+ getCopyToParts(DAG, DL, Ops[i], &Parts[i * Factor], Factor, PartVT, V,
+ CallConv);
+ }
+}
+
+RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
+ EVT valuevt, std::optional<CallingConv::ID> CC)
+ : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs),
+ RegCount(1, regs.size()), CallConv(CC) {}
+
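+ // Compute the legal value types for Ty and, for each of them, the number
+ // and type of registers required, assigning consecutive register numbers
+ // starting at Reg.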
+RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
+ const DataLayout &DL, unsigned Reg, Type *Ty,
+ std::optional<CallingConv::ID> CC) {
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs);
+
+ CallConv = CC;
+
+ for (EVT ValueVT : ValueVTs) {
+ unsigned NumRegs =
+ isABIMangled()
+ ? TLI.getNumRegistersForCallingConv(Context, *CC, ValueVT)
+ : TLI.getNumRegisters(Context, ValueVT);
+ MVT RegisterVT =
+ isABIMangled()
+ ? TLI.getRegisterTypeForCallingConv(Context, *CC, ValueVT)
+ : TLI.getRegisterType(Context, ValueVT);
+ for (unsigned i = 0; i != NumRegs; ++i)
+ Regs.push_back(Reg + i);
+ RegVTs.push_back(RegisterVT);
+ RegCount.push_back(NumRegs);
+ Reg += NumRegs;
+ }
+}
+
+SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
+ FunctionLoweringInfo &FuncInfo,
+ const SDLoc &dl, SDValue &Chain,
+ SDValue *Glue, const Value *V) const {
+ // A Value with type {} or [0 x %t] needs no registers.
+ if (ValueVTs.empty())
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Assemble the legal parts into the final values.
+ SmallVector<SDValue, 4> Values(ValueVTs.size());
+ SmallVector<SDValue, 8> Parts;
+ for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ // Copy the legal parts from the registers.
+ EVT ValueVT = ValueVTs[Value];
+ unsigned NumRegs = RegCount[Value];
+ MVT RegisterVT = isABIMangled()
+ ? TLI.getRegisterTypeForCallingConv(
+ *DAG.getContext(), *CallConv, RegVTs[Value])
+ : RegVTs[Value];
+
+ Parts.resize(NumRegs);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ SDValue P;
+ if (!Glue) {
+ P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT);
+ } else {
+ P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Glue);
+ *Glue = P.getValue(2);
+ }
+
+ Chain = P.getValue(1);
+ Parts[i] = P;
+
+ // If the source register was virtual and if we know something about it,
+ // add an assert node.
+ if (!Register::isVirtualRegister(Regs[Part + i]) ||
+ !RegisterVT.isInteger())
+ continue;
+
+ const FunctionLoweringInfo::LiveOutInfo *LOI =
+ FuncInfo.GetLiveOutRegInfo(Regs[Part+i]);
+ if (!LOI)
+ continue;
+
+ unsigned RegSize = RegisterVT.getScalarSizeInBits();
+ unsigned NumSignBits = LOI->NumSignBits;
+ unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();
+
+ if (NumZeroBits == RegSize) {
+ // The current value is a zero.
+ // Explicitly express that as it would be easier for
+ // optimizations to kick in.
+ Parts[i] = DAG.getConstant(0, dl, RegisterVT);
+ continue;
+ }
+
+ // FIXME: We capture more information than the dag can represent. For
+ // now, just use the tightest assertzext/assertsext possible.
+ bool isSExt;
+ EVT FromVT(MVT::Other);
+ if (NumZeroBits) {
+ FromVT = EVT::getIntegerVT(*DAG.getContext(), RegSize - NumZeroBits);
+ isSExt = false;
+ } else if (NumSignBits > 1) {
+ FromVT =
+ EVT::getIntegerVT(*DAG.getContext(), RegSize - NumSignBits + 1);
+ isSExt = true;
+ } else {
+ continue;
+ }
+ // Add an assertion node.
+ assert(FromVT != MVT::Other);
+ Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
+ RegisterVT, P, DAG.getValueType(FromVT));
+ }
+
+ Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs,
+ RegisterVT, ValueVT, V, CallConv);
+ Part += NumRegs;
+ Parts.clear();
+ }
+
+ return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);
+}
+
+void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue &Chain, SDValue *Glue,
+ const Value *V,
+ ISD::NodeType PreferredExtendType) const {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ ISD::NodeType ExtendKind = PreferredExtendType;
+
+ // Get the list of the values's legal parts.
+ unsigned NumRegs = Regs.size();
+ SmallVector<SDValue, 8> Parts(NumRegs);
+ for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ unsigned NumParts = RegCount[Value];
+
+ MVT RegisterVT = isABIMangled()
+ ? TLI.getRegisterTypeForCallingConv(
+ *DAG.getContext(), *CallConv, RegVTs[Value])
+ : RegVTs[Value];
+
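+ // If the target considers a zero-extend of this value free, prefer it over
+ // the any-extend: it defines the high bits instead of leaving them
+ // unspecified.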
+ if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value), &Parts[Part],
+ NumParts, RegisterVT, V, CallConv, ExtendKind);
+ Part += NumParts;
+ }
+
+ // Copy the parts into the registers.
+ SmallVector<SDValue, 8> Chains(NumRegs);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ SDValue Part;
+ if (!Glue) {
+ Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]);
+ } else {
+ Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Glue);
+ *Glue = Part.getValue(1);
+ }
+
+ Chains[i] = Part.getValue(0);
+ }
+
+ if (NumRegs == 1 || Glue)
+ // If NumRegs > 1 && Glue is used then the use of the last CopyToReg is
+ // flagged to it. That is, the CopyToReg nodes and the user are considered
+ // a single scheduling unit. If we create a TokenFactor and return it as
+ // chain, then the TokenFactor is both a predecessor (operand) of the
+ // user as well as a successor (the TF operands are flagged to the user).
+ // c1, f1 = CopyToReg
+ // c2, f2 = CopyToReg
+ // c3 = TokenFactor c1, c2
+ // ...
+ // = op c3, ..., f2
+ Chain = Chains[NumRegs-1];
+ else
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+}
+
+void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
+ unsigned MatchingIdx, const SDLoc &dl,
+ SelectionDAG &DAG,
+ std::vector<SDValue> &Ops) const {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
+ if (HasMatching)
+ Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
+ else if (!Regs.empty() && Register::isVirtualRegister(Regs.front())) {
+ // Put the register class of the virtual registers in the flag word. That
+ // way, later passes can recompute register class constraints for inline
+ // assembly as well as normal instructions.
+ // Don't do this for tied operands that can use the regclass information
+ // from the def.
+ const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
+ }
+
+ SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32);
+ Ops.push_back(Res);
+
+ if (Code == InlineAsm::Kind_Clobber) {
+ // Clobbers should always have a 1:1 mapping with registers, and may
+ // reference registers that have illegal (e.g. vector) types. Hence, we
+ // shouldn't try to apply any sort of splitting logic to them.
+ assert(Regs.size() == RegVTs.size() && Regs.size() == ValueVTs.size() &&
+ "No 1:1 mapping from clobbers to regs?");
+ Register SP = TLI.getStackPointerRegisterToSaveRestore();
+ (void)SP;
+ for (unsigned I = 0, E = ValueVTs.size(); I != E; ++I) {
+ Ops.push_back(DAG.getRegister(Regs[I], RegVTs[I]));
+ assert(
+ (Regs[I] != SP ||
+ DAG.getMachineFunction().getFrameInfo().hasOpaqueSPAdjustment()) &&
+ "If we clobbered the stack pointer, MFI should know about it.");
+ }
+ return;
+ }
+
+ for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ MVT RegisterVT = RegVTs[Value];
+ unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value],
+ RegisterVT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ assert(Reg < Regs.size() && "Mismatch in # registers expected");
+ unsigned TheReg = Regs[Reg++];
+ Ops.push_back(DAG.getRegister(TheReg, RegisterVT));
+ }
+ }
+}
+
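+ // Flatten the (register count, register type) pairs into one
+ // (register, size in bits) entry per register.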
+SmallVector<std::pair<unsigned, TypeSize>, 4>
+RegsForValue::getRegsAndSizes() const {
+ SmallVector<std::pair<unsigned, TypeSize>, 4> OutVec;
+ unsigned I = 0;
+ for (auto CountAndVT : zip_first(RegCount, RegVTs)) {
+ unsigned RegCount = std::get<0>(CountAndVT);
+ MVT RegisterVT = std::get<1>(CountAndVT);
+ TypeSize RegisterSize = RegisterVT.getSizeInBits();
+ for (unsigned E = I + RegCount; I != E; ++I)
+ OutVec.push_back(std::make_pair(Regs[I], RegisterSize));
+ }
+ return OutVec;
+}
+
+void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa,
+ AssumptionCache *ac,
+ const TargetLibraryInfo *li) {
+ AA = aa;
+ AC = ac;
+ GFI = gfi;
+ LibInfo = li;
+ Context = DAG.getContext();
+ LPadToCallSiteMap.clear();
+ SL->init(DAG.getTargetLoweringInfo(), TM, DAG.getDataLayout());
+ AssignmentTrackingEnabled = isAssignmentTrackingEnabled(
+ *DAG.getMachineFunction().getFunction().getParent());
+}
+
+void SelectionDAGBuilder::clear() {
+ NodeMap.clear();
+ UnusedArgNodeMap.clear();
+ PendingLoads.clear();
+ PendingExports.clear();
+ PendingConstrainedFP.clear();
+ PendingConstrainedFPStrict.clear();
+ CurInst = nullptr;
+ HasTailCall = false;
+ SDNodeOrder = LowestSDNodeOrder;
+ StatepointLowering.clear();
+}
+
+void SelectionDAGBuilder::clearDanglingDebugInfo() {
+ DanglingDebugInfoMap.clear();
+}
+
+// Update DAG root to include dependencies on Pending chains.
+SDValue SelectionDAGBuilder::updateRoot(SmallVectorImpl<SDValue> &Pending) {
+ SDValue Root = DAG.getRoot();
+
+ if (Pending.empty())
+ return Root;
+
+ // Add current root to PendingChains, unless we already indirectly
+ // depend on it.
+ if (Root.getOpcode() != ISD::EntryToken) {
+ unsigned i = 0, e = Pending.size();
+ for (; i != e; ++i) {
+ assert(Pending[i].getNode()->getNumOperands() > 1);
+ if (Pending[i].getNode()->getOperand(0) == Root)
+ break; // Don't add the root if we already indirectly depend on it.
+ }
+
+ if (i == e)
+ Pending.push_back(Root);
+ }
+
+ if (Pending.size() == 1)
+ Root = Pending[0];
+ else
+ Root = DAG.getTokenFactor(getCurSDLoc(), Pending);
+
+ DAG.setRoot(Root);
+ Pending.clear();
+ return Root;
+}
+
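+ // Return the current root, updated to depend on all pending loads, so that
+ // a new memory operation can be chained after them.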
+SDValue SelectionDAGBuilder::getMemoryRoot() {
+ return updateRoot(PendingLoads);
+}
+
+SDValue SelectionDAGBuilder::getRoot() {
+ // Chain up all pending constrained intrinsics together with all
+ // pending loads, by simply appending them to PendingLoads and
+ // then calling getMemoryRoot().
+ PendingLoads.reserve(PendingLoads.size() +
+ PendingConstrainedFP.size() +
+ PendingConstrainedFPStrict.size());
+ PendingLoads.append(PendingConstrainedFP.begin(),
+ PendingConstrainedFP.end());
+ PendingLoads.append(PendingConstrainedFPStrict.begin(),
+ PendingConstrainedFPStrict.end());
+ PendingConstrainedFP.clear();
+ PendingConstrainedFPStrict.clear();
+ return getMemoryRoot();
+}
+
+SDValue SelectionDAGBuilder::getControlRoot() {
+ // We need to emit pending fpexcept.strict constrained intrinsics,
+ // so append them to the PendingExports list.
+ PendingExports.append(PendingConstrainedFPStrict.begin(),
+ PendingConstrainedFPStrict.end());
+ PendingConstrainedFPStrict.clear();
+ return updateRoot(PendingExports);
+}
+
+void SelectionDAGBuilder::visit(const Instruction &I) {
+ // Set up outgoing PHI node register values before emitting the terminator.
+ if (I.isTerminator()) {
+ HandlePHINodesInSuccessorBlocks(I.getParent());
+ }
+
+ // Add SDDbgValue nodes for any var locs here. Do so before updating
+ // SDNodeOrder, as this mapping is {Inst -> Locs BEFORE Inst}.
+ if (FunctionVarLocs const *FnVarLocs = DAG.getFunctionVarLocs()) {
+ for (auto It = FnVarLocs->locs_begin(&I), End = FnVarLocs->locs_end(&I);
+ It != End; ++It) {
+ auto *Var = FnVarLocs->getDILocalVariable(It->VariableID);
+ dropDanglingDebugInfo(Var, It->Expr);
+ if (It->Values.isKillLocation(It->Expr)) {
+ handleKillDebugValue(Var, It->Expr, It->DL, SDNodeOrder);
+ continue;
+ }
+ SmallVector<Value *> Values(It->Values.location_ops());
+ if (!handleDebugValue(Values, Var, It->Expr, It->DL, SDNodeOrder,
+ It->Values.hasArgList()))
+ addDanglingDebugInfo(It, SDNodeOrder);
+ }
+ }
+
+ // Increase the SDNodeOrder if dealing with a non-debug instruction.
+ if (!isa<DbgInfoIntrinsic>(I))
+ ++SDNodeOrder;
+
+ CurInst = &I;
+
+ // Set inserted listener only if required.
+ bool NodeInserted = false;
+ std::unique_ptr<SelectionDAG::DAGNodeInsertedListener> InsertedListener;
+ MDNode *PCSectionsMD = I.getMetadata(LLVMContext::MD_pcsections);
+ if (PCSectionsMD) {
+ InsertedListener = std::make_unique<SelectionDAG::DAGNodeInsertedListener>(
+ DAG, [&](SDNode *) { NodeInserted = true; });
+ }
+
+ visit(I.getOpcode(), I);
+
+ if (!I.isTerminator() && !HasTailCall &&
+ !isa<GCStatepointInst>(I)) // statepoints handle their exports internally
+ CopyToExportRegsIfNeeded(&I);
+
+ // Handle metadata.
+ if (PCSectionsMD) {
+ auto It = NodeMap.find(&I);
+ if (It != NodeMap.end()) {
+ DAG.addPCSections(It->second.getNode(), PCSectionsMD);
+ } else if (NodeInserted) {
+ // This should not happen; if it does, don't let it go unnoticed so we can
+ // fix it. Relevant visit*() function is probably missing a setValue().
+ errs() << "warning: loosing !pcsections metadata ["
+ << I.getModule()->getName() << "]\n";
+ LLVM_DEBUG(I.dump());
+ assert(false);
+ }
+ }
+
+ CurInst = nullptr;
+}
+
+void SelectionDAGBuilder::visitPHI(const PHINode &) {
+ llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!");
+}
+
+void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
+ // Note: this doesn't use InstVisitor, because it has to work with
+ // ConstantExpr's in addition to instructions.
+ switch (Opcode) {
+ default: llvm_unreachable("Unknown instruction type encountered!");
+ // Build the switch statement using the Instruction.def file.
+#define HANDLE_INST(NUM, OPCODE, CLASS) \
+ case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break;
+#include "llvm/IR/Instruction.def"
+ }
+}
+
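+ // If the dbg_value is variadic (it carries a DIArgList), lower it now as an
+ // undef location list rather than letting it dangle, and return true.
+ // Returns false for non-variadic dbg_values, which the caller handles.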
+static bool handleDanglingVariadicDebugInfo(SelectionDAG &DAG,
+ DILocalVariable *Variable,
+ DebugLoc DL, unsigned Order,
+ RawLocationWrapper Values,
+ DIExpression *Expression) {
+ if (!Values.hasArgList())
+ return false;
+ // For variadic dbg_values we will now insert an undef.
+ // FIXME: We can potentially recover these!
+ SmallVector<SDDbgOperand, 2> Locs;
+ for (const Value *V : Values.location_ops()) {
+ auto *Undef = UndefValue::get(V->getType());
+ Locs.push_back(SDDbgOperand::fromConst(Undef));
+ }
+ SDDbgValue *SDV = DAG.getDbgValueList(Variable, Expression, Locs, {},
+ /*IsIndirect=*/false, DL, Order,
+ /*IsVariadic=*/true);
+ DAG.AddDbgValue(SDV, /*isParameter=*/false);
+ return true;
+}
+
+void SelectionDAGBuilder::addDanglingDebugInfo(const VarLocInfo *VarLoc,
+ unsigned Order) {
+ if (!handleDanglingVariadicDebugInfo(
+ DAG,
+ const_cast<DILocalVariable *>(DAG.getFunctionVarLocs()
+ ->getVariable(VarLoc->VariableID)
+ .getVariable()),
+ VarLoc->DL, Order, VarLoc->Values, VarLoc->Expr)) {
+ DanglingDebugInfoMap[VarLoc->Values.getVariableLocationOp(0)].emplace_back(
+ VarLoc, Order);
+ }
+}
+
+void SelectionDAGBuilder::addDanglingDebugInfo(const DbgValueInst *DI,
+ unsigned Order) {
+ // We treat variadic dbg_values differently at this stage.
+ if (!handleDanglingVariadicDebugInfo(
+ DAG, DI->getVariable(), DI->getDebugLoc(), Order,
+ DI->getWrappedLocation(), DI->getExpression())) {
+ // TODO: Dangling debug info will eventually either be resolved or produce
+ // an Undef DBG_VALUE. However in the resolution case, a gap may appear
+ // between the original dbg.value location and its resolved DBG_VALUE,
+ // which we should ideally fill with an extra Undef DBG_VALUE.
+ assert(DI->getNumVariableLocationOps() == 1 &&
+ "DbgValueInst without an ArgList should have a single location "
+ "operand.");
+ DanglingDebugInfoMap[DI->getValue(0)].emplace_back(DI, Order);
+ }
+}
+
+void SelectionDAGBuilder::dropDanglingDebugInfo(const DILocalVariable *Variable,
+ const DIExpression *Expr) {
+ auto isMatchingDbgValue = [&](DanglingDebugInfo &DDI) {
+ DIVariable *DanglingVariable = DDI.getVariable(DAG.getFunctionVarLocs());
+ DIExpression *DanglingExpr = DDI.getExpression();
+ if (DanglingVariable == Variable && Expr->fragmentsOverlap(DanglingExpr)) {
+ LLVM_DEBUG(dbgs() << "Dropping dangling debug info for " << printDDI(DDI)
+ << "\n");
+ return true;
+ }
+ return false;
+ };
+
+ for (auto &DDIMI : DanglingDebugInfoMap) {
+ DanglingDebugInfoVector &DDIV = DDIMI.second;
+
+ // If debug info is to be dropped, run it through final checks to see
+ // whether it can be salvaged.
+ for (auto &DDI : DDIV)
+ if (isMatchingDbgValue(DDI))
+ salvageUnresolvedDbgValue(DDI);
+
+ erase_if(DDIV, isMatchingDbgValue);
+ }
+}
+
+// resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
+// generate the debug data structures now that we've seen its definition.
+void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
+ SDValue Val) {
+ auto DanglingDbgInfoIt = DanglingDebugInfoMap.find(V);
+ if (DanglingDbgInfoIt == DanglingDebugInfoMap.end())
+ return;
+
+ DanglingDebugInfoVector &DDIV = DanglingDbgInfoIt->second;
+ for (auto &DDI : DDIV) {
+ DebugLoc DL = DDI.getDebugLoc();
+ unsigned ValSDNodeOrder = Val.getNode()->getIROrder();
+ unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
+ DILocalVariable *Variable = DDI.getVariable(DAG.getFunctionVarLocs());
+ DIExpression *Expr = DDI.getExpression();
+ assert(Variable->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ SDDbgValue *SDV;
+ if (Val.getNode()) {
+ // FIXME: I doubt that it is correct to resolve a dangling DbgValue as a
+ // FuncArgumentDbgValue (it would be hoisted to the function entry, and if
+ // we couldn't resolve it directly when examining the DbgValue intrinsic
+ // in the first place we should not be more successful here). Unless we
+ // have a test case that proves this to be correct, we should avoid
+ // calling EmitFuncArgumentDbgValue here.
+ if (!EmitFuncArgumentDbgValue(V, Variable, Expr, DL,
+ FuncArgumentDbgValueKind::Value, Val)) {
+ LLVM_DEBUG(dbgs() << "Resolve dangling debug info for " << printDDI(DDI)
+ << "\n");
+ LLVM_DEBUG(dbgs() << " By mapping to:\n "; Val.dump());
+ // Increase the SDNodeOrder for the DbgValue here to make sure it is
+ // inserted after the definition of Val when emitting the instructions
+ // after ISel. An alternative could be to teach
+ // ScheduleDAGSDNodes::EmitSchedule to delay the insertion properly.
+ LLVM_DEBUG(if (ValSDNodeOrder > DbgSDNodeOrder) dbgs()
+ << "changing SDNodeOrder from " << DbgSDNodeOrder << " to "
+ << ValSDNodeOrder << "\n");
+ SDV = getDbgValue(Val, Variable, Expr, DL,
+ std::max(DbgSDNodeOrder, ValSDNodeOrder));
+ DAG.AddDbgValue(SDV, false);
+ } else
+ LLVM_DEBUG(dbgs() << "Resolved dangling debug info for "
+ << printDDI(DDI) << " in EmitFuncArgumentDbgValue\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << printDDI(DDI) << "\n");
+ auto Undef = UndefValue::get(V->getType());
+ auto SDV =
+ DAG.getConstantDbgValue(Variable, Expr, Undef, DL, DbgSDNodeOrder);
+ DAG.AddDbgValue(SDV, false);
+ }
+ }
+ DDIV.clear();
+}
+
+void SelectionDAGBuilder::salvageUnresolvedDbgValue(DanglingDebugInfo &DDI) {
+ // TODO: For the variadic implementation, instead of only checking the fail
+ // state of `handleDebugValue`, we need to know specifically which values were
+ // invalid, so that we attempt to salvage only those values when processing
+ // a DIArgList.
+ Value *V = DDI.getVariableLocationOp(0);
+ Value *OrigV = V;
+ DILocalVariable *Var = DDI.getVariable(DAG.getFunctionVarLocs());
+ DIExpression *Expr = DDI.getExpression();
+ DebugLoc DL = DDI.getDebugLoc();
+ unsigned SDOrder = DDI.getSDNodeOrder();
+
+ // Currently we consider only dbg.value intrinsics -- we tell the salvager
+ // that DW_OP_stack_value is desired.
+ bool StackValue = true;
+
+ // Can this Value be encoded without any further work?
+ if (handleDebugValue(V, Var, Expr, DL, SDOrder, /*IsVariadic=*/false))
+ return;
+
+ // Attempt to salvage back through as many instructions as possible. Bail if
+ // a non-instruction is seen, such as a constant expression or global
+ // variable. FIXME: Further work could recover those too.
+ while (isa<Instruction>(V)) {
+ Instruction &VAsInst = *cast<Instruction>(V);
+ // Temporary "0", awaiting real implementation.
+ SmallVector<uint64_t, 16> Ops;
+ SmallVector<Value *, 4> AdditionalValues;
+ V = salvageDebugInfoImpl(VAsInst, Expr->getNumLocationOperands(), Ops,
+ AdditionalValues);
+ // If we cannot salvage any further, and haven't yet found a suitable debug
+ // expression, bail out.
+ if (!V)
+ break;
+
+ // TODO: If AdditionalValues isn't empty, then the salvage can only be
+ // represented with a DBG_VALUE_LIST, so we give up. When we have support
+ // here for variadic dbg_values, remove that condition.
+ if (!AdditionalValues.empty())
+ break;
+
+ // New value and expr now represent this debuginfo.
+ Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, StackValue);
+
+ // Some kind of simplification occurred: check whether the operand of the
+ // salvaged debug expression can be encoded in this DAG.
+ if (handleDebugValue(V, Var, Expr, DL, SDOrder, /*IsVariadic=*/false)) {
+ LLVM_DEBUG(
+ dbgs() << "Salvaged debug location info for:\n " << *Var << "\n"
+ << *OrigV << "\nBy stripping back to:\n " << *V << "\n");
+ return;
+ }
+ }
+
+ // This was the final opportunity to salvage this debug information, and it
+ // couldn't be done. Place an undef DBG_VALUE at this location to terminate
+ // any earlier variable location.
+ assert(OrigV && "V shouldn't be null");
+ auto *Undef = UndefValue::get(OrigV->getType());
+ auto *SDV = DAG.getConstantDbgValue(Var, Expr, Undef, DL, SDNodeOrder);
+ DAG.AddDbgValue(SDV, false);
+ LLVM_DEBUG(dbgs() << "Dropping debug value info for:\n " << printDDI(DDI)
+ << "\n");
+}
+
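+ // A kill location carries no recoverable value: emit a poison operand with
+ // the expression converted to an undef expression, marking the variable as
+ // unavailable from this point.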
+void SelectionDAGBuilder::handleKillDebugValue(DILocalVariable *Var,
+ DIExpression *Expr,
+ DebugLoc DbgLoc,
+ unsigned Order) {
+ Value *Poison = PoisonValue::get(Type::getInt1Ty(*Context));
+ DIExpression *NewExpr =
+ const_cast<DIExpression *>(DIExpression::convertToUndefExpression(Expr));
+ handleDebugValue(Poison, Var, NewExpr, DbgLoc, Order,
+ /*IsVariadic*/ false);
+}
+
+bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values,
+ DILocalVariable *Var,
+ DIExpression *Expr, DebugLoc DbgLoc,
+ unsigned Order, bool IsVariadic) {
+ if (Values.empty())
+ return true;
+ SmallVector<SDDbgOperand> LocationOps;
+ SmallVector<SDNode *> Dependencies;
+ for (const Value *V : Values) {
+ // Constant value.
+ if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V) ||
+ isa<ConstantPointerNull>(V)) {
+ LocationOps.emplace_back(SDDbgOperand::fromConst(V));
+ continue;
+ }
+
+ // Look through IntToPtr constants.
+ if (auto *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::IntToPtr) {
+ LocationOps.emplace_back(SDDbgOperand::fromConst(CE->getOperand(0)));
+ continue;
+ }
+
+ // If the Value is a frame index, we can create a FrameIndex debug value
+ // without relying on the DAG at all.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+ auto SI = FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ LocationOps.emplace_back(SDDbgOperand::fromFrameIdx(SI->second));
+ continue;
+ }
+ }
+
+ // Do not use getValue() in here; we don't want to generate code at
+ // this point if it hasn't been done yet.
+ SDValue N = NodeMap[V];
+ if (!N.getNode() && isa<Argument>(V)) // Check unused arguments map.
+ N = UnusedArgNodeMap[V];
+ if (N.getNode()) {
+ // Only emit func arg dbg value for non-variadic dbg.values for now.
+ if (!IsVariadic &&
+ EmitFuncArgumentDbgValue(V, Var, Expr, DbgLoc,
+ FuncArgumentDbgValueKind::Value, N))
+ return true;
+ if (auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode())) {
+ // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can
+ // describe stack slot locations.
+ //
+ // Consider "int x = 0; int *px = &x;". There are two kinds of
+ // interesting debug values here after optimization:
+ //
+ // dbg.value(i32* %px, !"int *px", !DIExpression()), and
+ // dbg.value(i32* %px, !"int x", !DIExpression(DW_OP_deref))
+ //
+ // Both describe the direct values of their associated variables.
+ Dependencies.push_back(N.getNode());
+ LocationOps.emplace_back(SDDbgOperand::fromFrameIdx(FISDN->getIndex()));
+ continue;
+ }
+ LocationOps.emplace_back(
+ SDDbgOperand::fromNode(N.getNode(), N.getResNo()));
+ continue;
+ }
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Special rules apply for the first dbg.values of parameter variables in a
+ // function. Identify them by the fact that they reference Argument Values,
+ // that the variable is a parameter, and that the location is not inlined-at
+ // (i.e. the parameter belongs to the current function). We
+ // need to let them dangle until they get an SDNode.
+ bool IsParamOfFunc =
+ isa<Argument>(V) && Var->isParameter() && !DbgLoc.getInlinedAt();
+ if (IsParamOfFunc)
+ return false;
+
+ // The value is not used in this block yet (or it would have an SDNode).
+ // We still want the value to appear for the user if possible -- if it has
+ // an associated VReg, we can refer to that instead.
+ auto VMI = FuncInfo.ValueMap.find(V);
+ if (VMI != FuncInfo.ValueMap.end()) {
+ unsigned Reg = VMI->second;
+ // If this is a PHI node, it may be split up into several MI PHI nodes
+ // (in FunctionLoweringInfo::set).
+ RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
+ V->getType(), std::nullopt);
+ if (RFV.occupiesMultipleRegs()) {
+ // FIXME: We could potentially support variadic dbg_values here.
+ if (IsVariadic)
+ return false;
+ unsigned Offset = 0;
+ unsigned BitsToDescribe = 0;
+ if (auto VarSize = Var->getSizeInBits())
+ BitsToDescribe = *VarSize;
+ if (auto Fragment = Expr->getFragmentInfo())
+ BitsToDescribe = Fragment->SizeInBits;
+ for (const auto &RegAndSize : RFV.getRegsAndSizes()) {
+ // Bail out if all bits are described already.
+ if (Offset >= BitsToDescribe)
+ break;
+ // TODO: handle scalable vectors.
+ unsigned RegisterSize = RegAndSize.second;
+ unsigned FragmentSize = (Offset + RegisterSize > BitsToDescribe)
+ ? BitsToDescribe - Offset
+ : RegisterSize;
+ auto FragmentExpr = DIExpression::createFragmentExpression(
+ Expr, Offset, FragmentSize);
+ if (!FragmentExpr)
+ continue;
+ SDDbgValue *SDV = DAG.getVRegDbgValue(
+ Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, SDNodeOrder);
+ DAG.AddDbgValue(SDV, false);
+ Offset += RegisterSize;
+ }
+ return true;
+ }
+ // We can use simple vreg locations for variadic dbg_values as well.
+ LocationOps.emplace_back(SDDbgOperand::fromVReg(Reg));
+ continue;
+ }
+ // We failed to create a SDDbgOperand for V.
+ return false;
+ }
+
+ // We have created a SDDbgOperand for each Value in Values.
+ // Should use Order instead of SDNodeOrder?
+ assert(!LocationOps.empty());
+ SDDbgValue *SDV = DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies,
+ /*IsIndirect=*/false, DbgLoc,
+ SDNodeOrder, IsVariadic);
+ DAG.AddDbgValue(SDV, /*isParameter=*/false);
+ return true;
+}
+
+void SelectionDAGBuilder::resolveOrClearDbgInfo() {
+ // Try to fixup any remaining dangling debug info -- and drop it if we can't.
+ for (auto &Pair : DanglingDebugInfoMap)
+ for (auto &DDI : Pair.second)
+ salvageUnresolvedDbgValue(DDI);
+ clearDanglingDebugInfo();
+}
+
+/// getCopyFromRegs - If there was virtual register allocated for the value V
+/// emit CopyFromReg of the specified type Ty. Return empty SDValue() otherwise.
+SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
+ DenseMap<const Value *, Register>::iterator It = FuncInfo.ValueMap.find(V);
+ SDValue Result;
+
+ if (It != FuncInfo.ValueMap.end()) {
+ Register InReg = It->second;
+
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), InReg, Ty,
+ std::nullopt); // This is not an ABI copy.
+ SDValue Chain = DAG.getEntryNode();
+ Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
+ V);
+ resolveDanglingDebugInfo(V, Result);
+ }
+
+ return Result;
+}
+
+/// getValue - Return an SDValue for the given Value.
+SDValue SelectionDAGBuilder::getValue(const Value *V) {
+ // If we already have an SDValue for this value, use it. It's important
+ // to do this first, so that we don't create a CopyFromReg if we already
+ // have a regular SDValue.
+ SDValue &N = NodeMap[V];
+ if (N.getNode()) return N;
+
+ // If there's a virtual register allocated and initialized for this
+ // value, use it.
+ if (SDValue copyFromReg = getCopyFromRegs(V, V->getType()))
+ return copyFromReg;
+
+ // Otherwise create a new SDValue and remember it.
+ SDValue Val = getValueImpl(V);
+ NodeMap[V] = Val;
+ resolveDanglingDebugInfo(V, Val);
+ return Val;
+}
+
+/// getNonRegisterValue - Return an SDValue for the given Value, but
+/// don't look in FuncInfo.ValueMap for a virtual register.
+SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
+ // If we already have an SDValue for this value, use it.
+ SDValue &N = NodeMap[V];
+ if (N.getNode()) {
+ if (isIntOrFPConstant(N)) {
+ // Remove the debug location from the node as the node is about to be used
+ // in a location which may differ from the original debug location. This
+ // is relevant to Constant and ConstantFP nodes because they can appear
+ // as constant expressions inside PHI nodes.
+ N->setDebugLoc(DebugLoc());
+ }
+ return N;
+ }
+
+ // Otherwise create a new SDValue and remember it.
+ SDValue Val = getValueImpl(V);
+ NodeMap[V] = Val;
+ resolveDanglingDebugInfo(V, Val);
+ return Val;
+}
+
+/// getValueImpl - Helper function for getValue and getNonRegisterValue.
+/// Create an SDValue for the given value.
+SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (const Constant *C = dyn_cast<Constant>(V)) {
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
+ return DAG.getConstant(*CI, getCurSDLoc(), VT);
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
+
+ if (isa<ConstantPointerNull>(C)) {
+ unsigned AS = V->getType()->getPointerAddressSpace();
+ return DAG.getConstant(0, getCurSDLoc(),
+ TLI.getPointerTy(DAG.getDataLayout(), AS));
+ }
+
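+ // A vscale constant is lowered to a VSCALE node with a multiplier of one.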
+ if (match(C, m_VScale()))
+ return DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1));
+
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return DAG.getConstantFP(*CFP, getCurSDLoc(), VT);
+
+ if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
+ return DAG.getUNDEF(VT);
+
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ visit(CE->getOpcode(), *CE);
+ SDValue N1 = NodeMap[V];
+ assert(N1.getNode() && "visit didn't populate the NodeMap!");
+ return N1;
+ }
+
+ if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
+ SmallVector<SDValue, 4> Constants;
+ for (const Use &U : C->operands()) {
+ SDNode *Val = getValue(U).getNode();
+ // If the operand is an empty aggregate, there are no values.
+ if (!Val) continue;
+ // Add each leaf value from the operand to the Constants list
+ // to form a flattened list of all the values.
+ for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
+ Constants.push_back(SDValue(Val, i));
+ }
+
+ return DAG.getMergeValues(Constants, getCurSDLoc());
+ }
+
+ if (const ConstantDataSequential *CDS =
+ dyn_cast<ConstantDataSequential>(C)) {
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+ SDNode *Val = getValue(CDS->getElementAsConstant(i)).getNode();
+ // Add each leaf value from the operand to the Constants list
+ // to form a flattened list of all the values.
+ for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
+ Ops.push_back(SDValue(Val, i));
+ }
+
+ if (isa<ArrayType>(CDS->getType()))
+ return DAG.getMergeValues(Ops, getCurSDLoc());
+ return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
+ }
+
+ if (C->getType()->isStructTy() || C->getType()->isArrayTy()) {
+ assert((isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) &&
+ "Unknown struct or array constant!");
+
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), C->getType(), ValueVTs);
+ unsigned NumElts = ValueVTs.size();
+ if (NumElts == 0)
+ return SDValue(); // empty struct
+ SmallVector<SDValue, 4> Constants(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ EVT EltVT = ValueVTs[i];
+ if (isa<UndefValue>(C))
+ Constants[i] = DAG.getUNDEF(EltVT);
+ else if (EltVT.isFloatingPoint())
+ Constants[i] = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
+ else
+ Constants[i] = DAG.getConstant(0, getCurSDLoc(), EltVT);
+ }
+
+ return DAG.getMergeValues(Constants, getCurSDLoc());
+ }
+
+ if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
+ return DAG.getBlockAddress(BA, VT);
+
+ if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(C))
+ return getValue(Equiv->getGlobalValue());
+
+ if (const auto *NC = dyn_cast<NoCFIValue>(C))
+ return getValue(NC->getGlobalValue());
+
+ VectorType *VecTy = cast<VectorType>(V->getType());
+
+ // Now that we know the number and type of the elements, get that number of
+ // elements into the Ops array based on what kind of constant it is.
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
+ SmallVector<SDValue, 16> Ops;
+ unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+ for (unsigned i = 0; i != NumElements; ++i)
+ Ops.push_back(getValue(CV->getOperand(i)));
+
+ return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
+ }
+
+ if (isa<ConstantAggregateZero>(C)) {
+ EVT EltVT =
+ TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType());
+
+ SDValue Op;
+ if (EltVT.isFloatingPoint())
+ Op = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
+ else
+ Op = DAG.getConstant(0, getCurSDLoc(), EltVT);
+
+ return NodeMap[V] = DAG.getSplat(VT, getCurSDLoc(), Op);
+ }
+
+ llvm_unreachable("Unknown vector constant");
+ }
+
+ // If this is a static alloca, generate it as the frameindex instead of
+ // computation.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end())
+ return DAG.getFrameIndex(
+ SI->second, TLI.getValueType(DAG.getDataLayout(), AI->getType()));
+ }
+
+ // If this is an instruction which fast-isel has deferred, select it now.
+ if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+ Register InReg = FuncInfo.InitializeRegForValue(Inst);
+
+ RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
+ Inst->getType(), std::nullopt);
+ SDValue Chain = DAG.getEntryNode();
+ return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
+ }
+
+ if (const MetadataAsValue *MD = dyn_cast<MetadataAsValue>(V))
+ return DAG.getMDNode(cast<MDNode>(MD->getMetadata()));
+
+ if (const auto *BB = dyn_cast<BasicBlock>(V))
+ return DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
+
+ llvm_unreachable("Can't get register for value!");
+}
+
+void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) {
+ auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+ bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX;
+ bool IsCoreCLR = Pers == EHPersonality::CoreCLR;
+ bool IsSEH = isAsynchronousEHPersonality(Pers);
+ MachineBasicBlock *CatchPadMBB = FuncInfo.MBB;
+ if (!IsSEH)
+ CatchPadMBB->setIsEHScopeEntry();
+ // In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues.
+ if (IsMSVCCXX || IsCoreCLR)
+ CatchPadMBB->setIsEHFuncletEntry();
+}
+
+void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
+ // Update machine-CFG edge.
+ MachineBasicBlock *TargetMBB = FuncInfo.MBBMap[I.getSuccessor()];
+ FuncInfo.MBB->addSuccessor(TargetMBB);
+ TargetMBB->setIsEHCatchretTarget(true);
+ DAG.getMachineFunction().setHasEHCatchret(true);
+
+ auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+ bool IsSEH = isAsynchronousEHPersonality(Pers);
+ if (IsSEH) {
+ // If this is not a fall-through branch or optimizations are switched off,
+ // emit the branch.
+ if (TargetMBB != NextBlock(FuncInfo.MBB) ||
+ TM.getOptLevel() == CodeGenOpt::None)
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
+ getControlRoot(), DAG.getBasicBlock(TargetMBB)));
+ return;
+ }
+
+ // Figure out the funclet membership for the catchret's successor.
+ // This will be used by the FuncletLayout pass to determine how to order the
+ // BB's.
+ // A 'catchret' returns to the outer scope's color.
+ Value *ParentPad = I.getCatchSwitchParentPad();
+ const BasicBlock *SuccessorColor;
+ if (isa<ConstantTokenNone>(ParentPad))
+ SuccessorColor = &FuncInfo.Fn->getEntryBlock();
+ else
+ SuccessorColor = cast<Instruction>(ParentPad)->getParent();
+ assert(SuccessorColor && "No parent funclet for catchret!");
+ MachineBasicBlock *SuccessorColorMBB = FuncInfo.MBBMap[SuccessorColor];
+ assert(SuccessorColorMBB && "No MBB for SuccessorColor!");
+
+ // Create the terminator node.
+ SDValue Ret = DAG.getNode(ISD::CATCHRET, getCurSDLoc(), MVT::Other,
+ getControlRoot(), DAG.getBasicBlock(TargetMBB),
+ DAG.getBasicBlock(SuccessorColorMBB));
+ DAG.setRoot(Ret);
+}
+
+void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) {
+ // Don't emit any special code for the cleanuppad instruction. It just marks
+ // the start of an EH scope/funclet.
+ FuncInfo.MBB->setIsEHScopeEntry();
+ auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+ if (Pers != EHPersonality::Wasm_CXX) {
+ FuncInfo.MBB->setIsEHFuncletEntry();
+ FuncInfo.MBB->setIsCleanupFuncletEntry();
+ }
+}
+
+// In wasm EH, even though a catchpad may not catch an exception if a tag does
+// not match, it is OK to add only the first unwind destination catchpad to the
+// successors, because there will be at least one invoke instruction within the
+// catch scope that points to the next unwind destination, if one exists, so
+ // CFGSort cannot mess up the BB sorting order.
+// (All catchpads with 'catch (type)' clauses have a 'llvm.rethrow' intrinsic
+// call within them, and catchpads only consisting of 'catch (...)' have a
+// '__cxa_end_catch' call within them, both of which generate invokes in case
+// the next unwind destination exists, i.e., the next unwind destination is not
+// the caller.)
+//
+// Having at most one EH pad successor is also simpler and helps later
+// transformations.
+//
+// For example,
+// current:
+// invoke void @foo to ... unwind label %catch.dispatch
+// catch.dispatch:
+// %0 = catchswitch within ... [label %catch.start] unwind label %next
+// catch.start:
+// ...
+// ... in this BB or some other child BB dominated by this BB there will be an
+// invoke that points to 'next' BB as an unwind destination
+//
+// next: ; We don't need to add this to 'current' BB's successor
+// ...
+static void findWasmUnwindDestinations(
+ FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB,
+ BranchProbability Prob,
+ SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
+ &UnwindDests) {
+ while (EHPadBB) {
+ const Instruction *Pad = EHPadBB->getFirstNonPHI();
+ if (isa<CleanupPadInst>(Pad)) {
+ // Stop on cleanup pads.
+ UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
+ UnwindDests.back().first->setIsEHScopeEntry();
+ break;
+ } else if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
+ // Add the catchpad handlers to the possible destinations. We don't
+ // continue to the unwind destination of the catchswitch for wasm.
+ for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
+ UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
+ UnwindDests.back().first->setIsEHScopeEntry();
+ }
+ break;
+ } else {
+ continue;
+ }
+ }
+}
+
+/// When an invoke or a cleanupret unwinds to the next EH pad, there are
+/// many places it could ultimately go. In the IR, we have a single unwind
+/// destination, but in the machine CFG, we enumerate all the possible blocks.
+/// This function skips over imaginary basic blocks that hold catchswitch
+/// instructions, and finds all the "real" machine
+/// basic block destinations. As those destinations may not be successors of
+/// EHPadBB, here we also calculate the edge probability to those destinations.
+/// The passed-in Prob is the edge probability to EHPadBB.
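+/// For example (illustrative numbers only): if the edge into EHPadBB has
+/// probability 1/2 and the catchswitch there unwinds onward with probability
+/// 2/5, destinations reached through that second hop are recorded with
+/// probability 1/2 * 2/5 = 1/5.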
+static void findUnwindDestinations(
+ FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB,
+ BranchProbability Prob,
+ SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
+ &UnwindDests) {
+ EHPersonality Personality =
+ classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+ bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX;
+ bool IsCoreCLR = Personality == EHPersonality::CoreCLR;
+ bool IsWasmCXX = Personality == EHPersonality::Wasm_CXX;
+ bool IsSEH = isAsynchronousEHPersonality(Personality);
+
+ if (IsWasmCXX) {
+ findWasmUnwindDestinations(FuncInfo, EHPadBB, Prob, UnwindDests);
+ assert(UnwindDests.size() <= 1 &&
+ "There should be at most one unwind destination for wasm");
+ return;
+ }
+
+ while (EHPadBB) {
+ const Instruction *Pad = EHPadBB->getFirstNonPHI();
+ BasicBlock *NewEHPadBB = nullptr;
+ if (isa<LandingPadInst>(Pad)) {
+ // Stop on landingpads. They are not funclets.
+ UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
+ break;
+ } else if (isa<CleanupPadInst>(Pad)) {
+ // Stop on cleanup pads. Cleanups are always funclet entries for all known
+ // personalities.
+ UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
+ UnwindDests.back().first->setIsEHScopeEntry();
+ UnwindDests.back().first->setIsEHFuncletEntry();
+ break;
+ } else if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
+ // Add the catchpad handlers to the possible destinations.
+ for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
+ UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
+ // For MSVC++ and the CLR, catchblocks are funclets and need prologues.
+ if (IsMSVCCXX || IsCoreCLR)
+ UnwindDests.back().first->setIsEHFuncletEntry();
+ if (!IsSEH)
+ UnwindDests.back().first->setIsEHScopeEntry();
+ }
+ NewEHPadBB = CatchSwitch->getUnwindDest();
+ } else {
+ continue;
+ }
+
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ if (BPI && NewEHPadBB)
+ Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB);
+ EHPadBB = NewEHPadBB;
+ }
+}
+
+void SelectionDAGBuilder::visitCleanupRet(const CleanupReturnInst &I) {
+ // Update successor info.
+ SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
+ auto UnwindDest = I.getUnwindDest();
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ BranchProbability UnwindDestProb =
+ (BPI && UnwindDest)
+ ? BPI->getEdgeProbability(FuncInfo.MBB->getBasicBlock(), UnwindDest)
+ : BranchProbability::getZero();
+ findUnwindDestinations(FuncInfo, UnwindDest, UnwindDestProb, UnwindDests);
+ for (auto &UnwindDest : UnwindDests) {
+ UnwindDest.first->setIsEHPad();
+ addSuccessorWithProb(FuncInfo.MBB, UnwindDest.first, UnwindDest.second);
+ }
+ FuncInfo.MBB->normalizeSuccProbs();
+
+ // Create the terminator node.
+ SDValue Ret =
+ DAG.getNode(ISD::CLEANUPRET, getCurSDLoc(), MVT::Other, getControlRoot());
+ DAG.setRoot(Ret);
+}
+
+void SelectionDAGBuilder::visitCatchSwitch(const CatchSwitchInst &CSI) {
+ report_fatal_error("visitCatchSwitch not yet implemented!");
+}
+
+void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto &DL = DAG.getDataLayout();
+ SDValue Chain = getControlRoot();
+ SmallVector<ISD::OutputArg, 8> Outs;
+ SmallVector<SDValue, 8> OutVals;
+
+ // Calls to @llvm.experimental.deoptimize don't generate a return value, so
+ // lower
+ //
+ // %val = call <ty> @llvm.experimental.deoptimize()
+ // ret <ty> %val
+ //
+ // differently.
+ if (I.getParent()->getTerminatingDeoptimizeCall()) {
+ LowerDeoptimizingReturn();
+ return;
+ }
+
+ if (!FuncInfo.CanLowerReturn) {
+ unsigned DemoteReg = FuncInfo.DemoteRegister;
+ const Function *F = I.getParent()->getParent();
+
+ // Emit a store of the return value through the virtual register.
+ // Leave Outs empty so that LowerReturn won't try to load return
+ // registers the usual way.
+ SmallVector<EVT, 1> PtrValueVTs;
+ ComputeValueVTs(TLI, DL,
+ PointerType::get(F->getContext(),
+ DAG.getDataLayout().getAllocaAddrSpace()),
+ PtrValueVTs);
+
+ SDValue RetPtr =
+ DAG.getCopyFromReg(Chain, getCurSDLoc(), DemoteReg, PtrValueVTs[0]);
+ SDValue RetOp = getValue(I.getOperand(0));
+
+ SmallVector<EVT, 4> ValueVTs, MemVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &MemVTs,
+ &Offsets, 0);
+ unsigned NumValues = ValueVTs.size();
+
+ SmallVector<SDValue, 4> Chains(NumValues);
+ Align BaseAlign = DL.getPrefTypeAlign(I.getOperand(0)->getType());
+ for (unsigned i = 0; i != NumValues; ++i) {
+ // An aggregate return value cannot wrap around the address space, so
+ // offsets to its parts don't wrap either.
+ SDValue Ptr = DAG.getObjectPtrOffset(getCurSDLoc(), RetPtr,
+ TypeSize::Fixed(Offsets[i]));
+
+ SDValue Val = RetOp.getValue(RetOp.getResNo() + i);
+ if (MemVTs[i] != ValueVTs[i])
+ Val = DAG.getPtrExtOrTrunc(Val, getCurSDLoc(), MemVTs[i]);
+ Chains[i] = DAG.getStore(
+ Chain, getCurSDLoc(), Val,
+ // FIXME: better loc info would be nice.
+ Ptr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+ commonAlignment(BaseAlign, Offsets[i]));
+ }
+
+ Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
+ MVT::Other, Chains);
+ } else if (I.getNumOperands() != 0) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues) {
+ SDValue RetOp = getValue(I.getOperand(0));
+
+ const Function *F = I.getParent()->getParent();
+
+ bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
+ I.getOperand(0)->getType(), F->getCallingConv(),
+ /*IsVarArg*/ false, DL);
+
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+ if (F->getAttributes().hasRetAttr(Attribute::SExt))
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (F->getAttributes().hasRetAttr(Attribute::ZExt))
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ LLVMContext &Context = F->getContext();
+ bool RetInReg = F->getAttributes().hasRetAttr(Attribute::InReg);
+
+ for (unsigned j = 0; j != NumValues; ++j) {
+ EVT VT = ValueVTs[j];
+
+ if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
+ VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);
+
+ CallingConv::ID CC = F->getCallingConv();
+
+ unsigned NumParts = TLI.getNumRegistersForCallingConv(Context, CC, VT);
+ MVT PartVT = TLI.getRegisterTypeForCallingConv(Context, CC, VT);
+ SmallVector<SDValue, 4> Parts(NumParts);
+ getCopyToParts(DAG, getCurSDLoc(),
+ SDValue(RetOp.getNode(), RetOp.getResNo() + j),
+ &Parts[0], NumParts, PartVT, &I, CC, ExtendKind);
+
+ // 'inreg' on function refers to return value
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ if (RetInReg)
+ Flags.setInReg();
+
+ if (I.getOperand(0)->getType()->isPointerTy()) {
+ Flags.setPointer();
+ Flags.setPointerAddrSpace(
+ cast<PointerType>(I.getOperand(0)->getType())->getAddressSpace());
+ }
+
+ if (NeedsRegBlock) {
+ Flags.setInConsecutiveRegs();
+ if (j == NumValues - 1)
+ Flags.setInConsecutiveRegsLast();
+ }
+
+ // Propagate extension type if any
+ if (ExtendKind == ISD::SIGN_EXTEND)
+ Flags.setSExt();
+ else if (ExtendKind == ISD::ZERO_EXTEND)
+ Flags.setZExt();
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Outs.push_back(ISD::OutputArg(Flags,
+ Parts[i].getValueType().getSimpleVT(),
+ VT, /*isfixed=*/true, 0, 0));
+ OutVals.push_back(Parts[i]);
+ }
+ }
+ }
+ }
+
+ // Push the swifterror virtual register as the last element of Outs. This
+ // makes sure the swifterror virtual register will be returned in the
+ // swifterror physical register.
+ const Function *F = I.getParent()->getParent();
+ if (TLI.supportSwiftError() &&
+ F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) {
+ assert(SwiftError.getFunctionArg() && "Need a swift error argument");
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ Flags.setSwiftError();
+ Outs.push_back(ISD::OutputArg(
+ Flags, /*vt=*/TLI.getPointerTy(DL), /*argvt=*/EVT(TLI.getPointerTy(DL)),
+ /*isfixed=*/true, /*origidx=*/1, /*partOffs=*/0));
+ // Create SDNode for the swifterror virtual register.
+ OutVals.push_back(
+ DAG.getRegister(SwiftError.getOrCreateVRegUseAt(
+ &I, FuncInfo.MBB, SwiftError.getFunctionArg()),
+ EVT(TLI.getPointerTy(DL))));
+ }
+
+ bool isVarArg = DAG.getMachineFunction().getFunction().isVarArg();
+ CallingConv::ID CallConv =
+ DAG.getMachineFunction().getFunction().getCallingConv();
+ Chain = DAG.getTargetLoweringInfo().LowerReturn(
+ Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG);
+
+ // Verify that the target's LowerReturn behaved as expected.
+ assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
+ "LowerReturn didn't return a valid chain!");
+
+ // Update the DAG with the new chain value resulting from return lowering.
+ DAG.setRoot(Chain);
+}
+
+/// CopyToExportRegsIfNeeded - If the given value has virtual registers
+/// created for it, emit nodes to copy the value into the virtual
+/// registers.
+void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
+ // Skip empty types
+ if (V->getType()->isEmptyTy())
+ return;
+
+ DenseMap<const Value *, Register>::iterator VMI = FuncInfo.ValueMap.find(V);
+ if (VMI != FuncInfo.ValueMap.end()) {
+ assert((!V->use_empty() || isa<CallBrInst>(V)) &&
+ "Unused value assigned virtual registers!");
+ CopyValueToVirtualRegister(V, VMI->second);
+ }
+}
+
+/// ExportFromCurrentBlock - If this condition isn't known to be exported from
+/// the current basic block, add it to ValueMap now so that we'll get a
+/// CopyTo/FromReg.
+void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) {
+ // No need to export constants.
+ if (!isa<Instruction>(V) && !isa<Argument>(V)) return;
+
+ // Already exported?
+ if (FuncInfo.isExportedInst(V)) return;
+
+ Register Reg = FuncInfo.InitializeRegForValue(V);
+ CopyValueToVirtualRegister(V, Reg);
+}
+
+bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
+ const BasicBlock *FromBB) {
+ // The operands of the setcc have to be in this block. We don't know
+ // how to export them from some other block.
+ if (const Instruction *VI = dyn_cast<Instruction>(V)) {
+ // Can export from current BB.
+ if (VI->getParent() == FromBB)
+ return true;
+
+ // Is already exported, noop.
+ return FuncInfo.isExportedInst(V);
+ }
+
+ // If this is an argument, we can export it if the BB is the entry block or
+ // if it is already exported.
+ if (isa<Argument>(V)) {
+ if (FromBB->isEntryBlock())
+ return true;
+
+ // Otherwise, can only export this if it is already exported.
+ return FuncInfo.isExportedInst(V);
+ }
+
+ // Otherwise, constants can always be exported.
+ return true;
+}
+
+/// Return branch probability calculated by BranchProbabilityInfo for IR blocks.
+BranchProbability
+SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) const {
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ const BasicBlock *SrcBB = Src->getBasicBlock();
+ const BasicBlock *DstBB = Dst->getBasicBlock();
+ if (!BPI) {
+ // If BPI is not available, set the default probability as 1 / N, where N is
+ // the number of successors.
+ auto SuccSize = std::max<uint32_t>(succ_size(SrcBB), 1);
+ return BranchProbability(1, SuccSize);
+ }
+ return BPI->getEdgeProbability(SrcBB, DstBB);
+}
+
+void SelectionDAGBuilder::addSuccessorWithProb(MachineBasicBlock *Src,
+ MachineBasicBlock *Dst,
+ BranchProbability Prob) {
+ if (!FuncInfo.BPI)
+ Src->addSuccessorWithoutProb(Dst);
+ else {
+ if (Prob.isUnknown())
+ Prob = getEdgeProbability(Src, Dst);
+ Src->addSuccessor(Dst, Prob);
+ }
+}
+
+static bool InBlock(const Value *V, const BasicBlock *BB) {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() == BB;
+ return true;
+}
+
+/// EmitBranchForMergedCondition - Helper method for FindMergedConditions.
+/// This function emits a branch and is used at the leaves of an OR or an
+/// AND operator tree.
+void
+SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB,
+ MachineBasicBlock *SwitchBB,
+ BranchProbability TProb,
+ BranchProbability FProb,
+ bool InvertCond) {
+ const BasicBlock *BB = CurBB->getBasicBlock();
+
+ // If the leaf of the tree is a comparison, merge the condition into
+ // the caseblock.
+ if (const CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
+ // The operands of the cmp have to be in this block. We don't know
+ // how to export them from some other block. If this is the first block
+ // of the sequence, no exporting is needed.
+ if (CurBB == SwitchBB ||
+ (isExportableFromCurrentBlock(BOp->getOperand(0), BB) &&
+ isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
+ ISD::CondCode Condition;
+ if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
+ ICmpInst::Predicate Pred =
+ InvertCond ? IC->getInversePredicate() : IC->getPredicate();
+ Condition = getICmpCondCode(Pred);
+ } else {
+ const FCmpInst *FC = cast<FCmpInst>(Cond);
+ FCmpInst::Predicate Pred =
+ InvertCond ? FC->getInversePredicate() : FC->getPredicate();
+ Condition = getFCmpCondCode(Pred);
+ if (TM.Options.NoNaNsFPMath)
+ Condition = getFCmpCodeWithoutNaN(Condition);
+ }
+
+ CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr,
+ TBB, FBB, CurBB, getCurSDLoc(), TProb, FProb);
+ SL->SwitchCases.push_back(CB);
+ return;
+ }
+ }
+
+ // Create a CaseBlock record representing this branch.
+ ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ;
+ CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()),
+ nullptr, TBB, FBB, CurBB, getCurSDLoc(), TProb, FProb);
+ SL->SwitchCases.push_back(CB);
+}
+
+void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB,
+ MachineBasicBlock *SwitchBB,
+ Instruction::BinaryOps Opc,
+ BranchProbability TProb,
+ BranchProbability FProb,
+ bool InvertCond) {
+ // If the condition is a one-use NOT, skip over it and remember to invert
+ // the op and operands at the next level.
+ Value *NotCond;
+ if (match(Cond, m_OneUse(m_Not(m_Value(NotCond)))) &&
+ InBlock(NotCond, CurBB->getBasicBlock())) {
+ FindMergedConditions(NotCond, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
+ !InvertCond);
+ return;
+ }
+
+ const Instruction *BOp = dyn_cast<Instruction>(Cond);
+ const Value *BOpOp0, *BOpOp1;
+ // Compute the effective opcode for Cond, taking into account whether it needs
+ // to be inverted, e.g.
+ // and (not (or A, B)), C
+ // gets lowered as
+ // and (and (not A, not B), C)
+ Instruction::BinaryOps BOpc = (Instruction::BinaryOps)0;
+ if (BOp) {
+ BOpc = match(BOp, m_LogicalAnd(m_Value(BOpOp0), m_Value(BOpOp1)))
+ ? Instruction::And
+ : (match(BOp, m_LogicalOr(m_Value(BOpOp0), m_Value(BOpOp1)))
+ ? Instruction::Or
+ : (Instruction::BinaryOps)0);
+ if (InvertCond) {
+ if (BOpc == Instruction::And)
+ BOpc = Instruction::Or;
+ else if (BOpc == Instruction::Or)
+ BOpc = Instruction::And;
+ }
+ }
+
+ // If this node is not part of the or/and tree, emit it as a branch.
+ // Note that all nodes in the tree should have the same opcode.
+ bool BOpIsInOrAndTree = BOpc && BOpc == Opc && BOp->hasOneUse();
+ if (!BOpIsInOrAndTree || BOp->getParent() != CurBB->getBasicBlock() ||
+ !InBlock(BOpOp0, CurBB->getBasicBlock()) ||
+ !InBlock(BOpOp1, CurBB->getBasicBlock())) {
+ EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB,
+ TProb, FProb, InvertCond);
+ return;
+ }
+
+ // Create TmpBB after CurBB.
+ MachineFunction::iterator BBI(CurBB);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock());
+ CurBB->getParent()->insert(++BBI, TmpBB);
+
+ if (Opc == Instruction::Or) {
+ // Codegen X | Y as:
+ // BB1:
+ // jmp_if_X TBB
+ // jmp TmpBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
+ // = TrueProb for original BB.
+ // Assuming the original probabilities are A and B, one choice is to set
+ // BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to
+ // A/(1+B) and 2B/(1+B). This choice assumes that
+ // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
+ // Another choice is to assume TrueProb for BB1 equals TrueProb for
+ // TmpBB, but the math is more complicated.
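+ // As a worked illustration (numbers chosen here for exposition): if A = 3/5
+ // and B = 2/5, BB1 gets TrueProb = 3/10 and FalseProb = 3/10 + 2/5 = 7/10,
+ // and TmpBB gets TrueProb = (3/5)/(1+2/5) = 3/7 and FalseProb = 4/7.
+ // Check: 3/10 + 7/10 * 3/7 = 3/5, the original TrueProb.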
+
+ auto NewTrueProb = TProb / 2;
+ auto NewFalseProb = TProb / 2 + FProb;
+ // Emit the LHS condition.
+ FindMergedConditions(BOpOp0, TBB, TmpBB, CurBB, SwitchBB, Opc, NewTrueProb,
+ NewFalseProb, InvertCond);
+
+ // Normalize A/2 and B to get A/(1+B) and 2B/(1+B).
+ SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};
+ BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
+ // Emit the RHS condition into TmpBB.
+ FindMergedConditions(BOpOp1, TBB, FBB, TmpBB, SwitchBB, Opc, Probs[0],
+ Probs[1], InvertCond);
+ } else {
+ assert(Opc == Instruction::And && "Unknown merge op!");
+ // Codegen X & Y as:
+ // BB1:
+ // jmp_if_X TmpBB
+ // jmp FBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+ // This requires creation of TmpBB after CurBB.
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
+ // = FalseProb for original BB.
+ // Assuming the original probabilities are A and B, one choice is to set
+ // BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to
+ // 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 ==
+ // TrueProb for BB1 * FalseProb for TmpBB.
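+ // As a worked illustration (numbers chosen here for exposition): if A = 3/5
+ // and B = 2/5, BB1 gets TrueProb = 3/5 + 1/5 = 4/5 and FalseProb = 1/5,
+ // and TmpBB gets TrueProb = (2*3/5)/(1+3/5) = 3/4 and FalseProb = 1/4.
+ // Check: 1/5 + 4/5 * 1/4 = 2/5, the original FalseProb.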
+
+ auto NewTrueProb = TProb + FProb / 2;
+ auto NewFalseProb = FProb / 2;
+ // Emit the LHS condition.
+ FindMergedConditions(BOpOp0, TmpBB, FBB, CurBB, SwitchBB, Opc, NewTrueProb,
+ NewFalseProb, InvertCond);
+
+ // Normalize A and B/2 to get 2A/(1+A) and B/(1+A).
+ SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};
+ BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
+ // Emit the RHS condition into TmpBB.
+ FindMergedConditions(BOpOp1, TBB, FBB, TmpBB, SwitchBB, Opc, Probs[0],
+ Probs[1], InvertCond);
+ }
+}
+
+/// If the set of cases should be emitted as a series of branches, return true.
+/// If we should emit this as a bunch of and/or'd together conditions, return
+/// false.
+bool
+SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases) {
+ if (Cases.size() != 2) return true;
+
+ // If this is two comparisons of the same values or'd or and'd together, they
+ // will get folded into a single comparison, so don't emit two blocks.
+ if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
+ Cases[0].CmpRHS == Cases[1].CmpRHS) ||
+ (Cases[0].CmpRHS == Cases[1].CmpLHS &&
+ Cases[0].CmpLHS == Cases[1].CmpRHS)) {
+ return false;
+ }
+
+ // Handle: (X != null) | (Y != null) --> (X|Y) != 0
+ // Handle: (X == null) & (Y == null) --> (X|Y) == 0
+ if (Cases[0].CmpRHS == Cases[1].CmpRHS &&
+ Cases[0].CC == Cases[1].CC &&
+ isa<Constant>(Cases[0].CmpRHS) &&
+ cast<Constant>(Cases[0].CmpRHS)->isNullValue()) {
+ if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB)
+ return false;
+ if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB)
+ return false;
+ }
+
+ return true;
+}
+
+void SelectionDAGBuilder::visitBr(const BranchInst &I) {
+ MachineBasicBlock *BrMBB = FuncInfo.MBB;
+
+ // Update machine-CFG edges.
+ MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)];
+
+ if (I.isUnconditional()) {
+ // Update machine-CFG edges.
+ BrMBB->addSuccessor(Succ0MBB);
+
+ // If this is not a fall-through branch or optimizations are switched off,
+ // emit the branch.
+ if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None) {
+ auto Br = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
+ getControlRoot(), DAG.getBasicBlock(Succ0MBB));
+ setValue(&I, Br);
+ DAG.setRoot(Br);
+ }
+
+ return;
+ }
+
+ // If this condition is one of the special cases we handle, do special stuff
+ // now.
+ const Value *CondVal = I.getCondition();
+ MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)];
+
+ // If this is a series of conditions that are or'd or and'd together, emit
+ // this as a sequence of branches instead of setcc's with and/or operations.
+ // As long as jumps are not expensive (exceptions for multi-use logic ops,
+ // unpredictable branches, and vector extracts because those jumps are likely
+ // expensive for any target), this should improve performance.
+ // For example, instead of something like:
+ // cmp A, B
+ // C = seteq
+ // cmp D, E
+ // F = setle
+ // or C, F
+ // jnz foo
+ // Emit:
+ // cmp A, B
+ // je foo
+ // cmp D, E
+ // jle foo
+ const Instruction *BOp = dyn_cast<Instruction>(CondVal);
+ if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp &&
+ BOp->hasOneUse() && !I.hasMetadata(LLVMContext::MD_unpredictable)) {
+ Value *Vec;
+ const Value *BOp0, *BOp1;
+ Instruction::BinaryOps Opcode = (Instruction::BinaryOps)0;
+ if (match(BOp, m_LogicalAnd(m_Value(BOp0), m_Value(BOp1))))
+ Opcode = Instruction::And;
+ else if (match(BOp, m_LogicalOr(m_Value(BOp0), m_Value(BOp1))))
+ Opcode = Instruction::Or;
+
+ if (Opcode && !(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
+ match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value())))) {
+ FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB, Opcode,
+ getEdgeProbability(BrMBB, Succ0MBB),
+ getEdgeProbability(BrMBB, Succ1MBB),
+ /*InvertCond=*/false);
+ // If the compares in later blocks need to use values not currently
+ // exported from this block, export them now. This block should always
+ // be the first entry.
+ assert(SL->SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!");
+
+ // Allow some cases to be rejected.
+ if (ShouldEmitAsBranches(SL->SwitchCases)) {
+ for (unsigned i = 1, e = SL->SwitchCases.size(); i != e; ++i) {
+ ExportFromCurrentBlock(SL->SwitchCases[i].CmpLHS);
+ ExportFromCurrentBlock(SL->SwitchCases[i].CmpRHS);
+ }
+
+ // Emit the branch for this block.
+ visitSwitchCase(SL->SwitchCases[0], BrMBB);
+ SL->SwitchCases.erase(SL->SwitchCases.begin());
+ return;
+ }
+
+ // Okay, we decided not to do this, remove any inserted MBB's and clear
+ // SwitchCases.
+ for (unsigned i = 1, e = SL->SwitchCases.size(); i != e; ++i)
+ FuncInfo.MF->erase(SL->SwitchCases[i].ThisBB);
+
+ SL->SwitchCases.clear();
+ }
+ }
+
+ // Create a CaseBlock record representing this branch.
+ CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()),
+ nullptr, Succ0MBB, Succ1MBB, BrMBB, getCurSDLoc());
+
+ // Use visitSwitchCase to actually insert the fast branch sequence for this
+ // cond branch.
+ visitSwitchCase(CB, BrMBB);
+}
+
+/// visitSwitchCase - Emits the necessary code to represent a single node in
+/// the binary search tree resulting from lowering a switch instruction.
+void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
+ MachineBasicBlock *SwitchBB) {
+ SDValue Cond;
+ SDValue CondLHS = getValue(CB.CmpLHS);
+ SDLoc dl = CB.DL;
+
+ if (CB.CC == ISD::SETTRUE) {
+ // Branch or fall through to TrueBB.
+ addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb);
+ SwitchBB->normalizeSuccProbs();
+ if (CB.TrueBB != NextBlock(SwitchBB)) {
+ DAG.setRoot(DAG.getNode(ISD::BR, dl, MVT::Other, getControlRoot(),
+ DAG.getBasicBlock(CB.TrueBB)));
+ }
+ return;
+ }
+
+ auto &TLI = DAG.getTargetLoweringInfo();
+ EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), CB.CmpLHS->getType());
+
+ // Build the setcc now.
+ if (!CB.CmpMHS) {
+ // Fold "(X == true)" to X and "(X == false)" to !X to
+ // handle common cases produced by branch lowering.
+ if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) &&
+ CB.CC == ISD::SETEQ)
+ Cond = CondLHS;
+ else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) &&
+ CB.CC == ISD::SETEQ) {
+ SDValue True = DAG.getConstant(1, dl, CondLHS.getValueType());
+ Cond = DAG.getNode(ISD::XOR, dl, CondLHS.getValueType(), CondLHS, True);
+ } else {
+ SDValue CondRHS = getValue(CB.CmpRHS);
+
+ // If a pointer's DAG type is larger than its memory type then the DAG
+ // values are zero-extended. This breaks signed comparisons so truncate
+ // back to the underlying type before doing the compare.
+ if (CondLHS.getValueType() != MemVT) {
+ CondLHS = DAG.getPtrExtOrTrunc(CondLHS, getCurSDLoc(), MemVT);
+ CondRHS = DAG.getPtrExtOrTrunc(CondRHS, getCurSDLoc(), MemVT);
+ }
+ Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, CondRHS, CB.CC);
+ }
+ } else {
+ assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");
+
+ const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
+ const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue();
+
+ SDValue CmpOp = getValue(CB.CmpMHS);
+ EVT VT = CmpOp.getValueType();
+
+ if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
+ Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, dl, VT),
+ ISD::SETLE);
+ } else {
+ SDValue SUB = DAG.getNode(ISD::SUB, dl,
+ VT, CmpOp, DAG.getConstant(Low, dl, VT));
+ Cond = DAG.getSetCC(dl, MVT::i1, SUB,
+ DAG.getConstant(High-Low, dl, VT), ISD::SETULE);
+ }
+ }
+
+ // Update successor info
+ addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb);
+ // TrueBB and FalseBB are always different unless the incoming IR is
+ // degenerate. This only happens when running llc on weird IR.
+ if (CB.TrueBB != CB.FalseBB)
+ addSuccessorWithProb(SwitchBB, CB.FalseBB, CB.FalseProb);
+ SwitchBB->normalizeSuccProbs();
+
+ // If the lhs block is the next block, invert the condition so that we can
+ // fall through to the lhs instead of the rhs block.
+ if (CB.TrueBB == NextBlock(SwitchBB)) {
+ std::swap(CB.TrueBB, CB.FalseBB);
+ SDValue True = DAG.getConstant(1, dl, Cond.getValueType());
+ Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, True);
+ }
+
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, getControlRoot(), Cond,
+ DAG.getBasicBlock(CB.TrueBB));
+
+ setValue(CurInst, BrCond);
+
+ // Insert the false branch. Do this even if it's a fall through branch,
+ // this makes it easier to do DAG optimizations which require inverting
+ // the branch condition.
+ BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond,
+ DAG.getBasicBlock(CB.FalseBB));
+
+ DAG.setRoot(BrCond);
+}
+
+/// visitJumpTable - Emit JumpTable node in the current MBB
+void SelectionDAGBuilder::visitJumpTable(SwitchCG::JumpTable &JT) {
+ // Emit the code for the jump table
+ assert(JT.Reg != -1U && "Should lower JT Header first!");
+ EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
+ JT.Reg, PTy);
+ SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
+ SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, getCurSDLoc(),
+ MVT::Other, Index.getValue(1),
+ Table, Index);
+ DAG.setRoot(BrJumpTable);
+}
+
+/// visitJumpTableHeader - This function emits the code needed to produce the
+/// index into the jump table from the switch case value.
+void SelectionDAGBuilder::visitJumpTableHeader(SwitchCG::JumpTable &JT,
+ JumpTableHeader &JTH,
+ MachineBasicBlock *SwitchBB) {
+ SDLoc dl = getCurSDLoc();
+
+ // Subtract the lowest switch case value from the value being switched on.
+ SDValue SwitchOp = getValue(JTH.SValue);
+ EVT VT = SwitchOp.getValueType();
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SwitchOp,
+ DAG.getConstant(JTH.First, dl, VT));
+
+ // The SDNode we just created, which holds the value being switched on minus
+ // the smallest case value, needs to be copied to a virtual register so it
+ // can be used as an index into the jump table in a subsequent basic block.
+ // This value may be smaller or larger than the target's pointer type, and
+ // may therefore require extension or truncation.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getPointerTy(DAG.getDataLayout()));
+
+ unsigned JumpTableReg =
+ FuncInfo.CreateReg(TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl,
+ JumpTableReg, SwitchOp);
+ JT.Reg = JumpTableReg;
+
+ if (!JTH.FallthroughUnreachable) {
+ // Emit the range check for the jump table, and branch to the default block
+ // for the switch statement if the value being switched on exceeds the
+ // largest case in the switch.
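+ // For instance (illustrative values only): with cases covering [10, 20], a
+ // switched-on value of 7 gives Sub = 7 - 10, which wraps to a huge unsigned
+ // value; the unsigned-greater-than (20 - 10) compare below then routes it
+ // to the default block, while any value in [10, 20] maps Sub into [0, 10]
+ // and falls through to the jump table.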
+ SDValue CMP = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ Sub.getValueType()),
+ Sub, DAG.getConstant(JTH.Last - JTH.First, dl, VT), ISD::SETUGT);
+
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, CopyTo, CMP,
+ DAG.getBasicBlock(JT.Default));
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (JT.MBB != NextBlock(SwitchBB))
+ BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond,
+ DAG.getBasicBlock(JT.MBB));
+
+ DAG.setRoot(BrCond);
+ } else {
+ // Avoid emitting unnecessary branches to the next block.
+ if (JT.MBB != NextBlock(SwitchBB))
+ DAG.setRoot(DAG.getNode(ISD::BR, dl, MVT::Other, CopyTo,
+ DAG.getBasicBlock(JT.MBB)));
+ else
+ DAG.setRoot(CopyTo);
+ }
+}
+
+/// Create a LOAD_STACK_GUARD node, and let it carry the target-specific global
+/// variable if one exists.
+static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue &Chain) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ EVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout());
+ MachineFunction &MF = DAG.getMachineFunction();
+ Value *Global = TLI.getSDagStackGuard(*MF.getFunction().getParent());
+ MachineSDNode *Node =
+ DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, DL, PtrTy, Chain);
+ if (Global) {
+ MachinePointerInfo MPInfo(Global);
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable;
+ MachineMemOperand *MemRef = MF.getMachineMemOperand(
+ MPInfo, Flags, PtrTy.getSizeInBits() / 8, DAG.getEVTAlign(PtrTy));
+ DAG.setNodeMemRefs(Node, {MemRef});
+ }
+ if (PtrTy != PtrMemTy)
+ return DAG.getPtrExtOrTrunc(SDValue(Node, 0), DL, PtrMemTy);
+ return SDValue(Node, 0);
+}
+
+/// Codegen a new tail for a stack protector check ParentMBB which has had its
+/// tail spliced into a stack protector check success bb.
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB) {
+
+ // First create the loads to the guard/stack slot for the comparison.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ EVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout());
+
+ MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
+ int FI = MFI.getStackProtectorIndex();
+
+ SDValue Guard;
+ SDLoc dl = getCurSDLoc();
+ SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
+ const Module &M = *ParentBB->getParent()->getFunction().getParent();
+ Align Align =
+ DAG.getDataLayout().getPrefTypeAlign(Type::getInt8PtrTy(M.getContext()));
+
+ // Generate code to load the content of the guard slot.
+ SDValue GuardVal = DAG.getLoad(
+ PtrMemTy, dl, DAG.getEntryNode(), StackSlotPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), Align,
+ MachineMemOperand::MOVolatile);
+
+ if (TLI.useStackGuardXorFP())
+ GuardVal = TLI.emitStackGuardXorFP(DAG, GuardVal, dl);
+
+ // Retrieve guard check function, nullptr if instrumentation is inlined.
+ if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) {
+ // The target provides a guard check function to validate the guard value.
+ // Generate a call to that function with the content of the guard slot as
+ // argument.
+ FunctionType *FnTy = GuardCheckFn->getFunctionType();
+ assert(FnTy->getNumParams() == 1 && "Invalid function signature");
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = GuardVal;
+ Entry.Ty = FnTy->getParamType(0);
+ if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg))
+ Entry.IsInReg = true;
+ Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(DAG.getEntryNode())
+ .setCallee(GuardCheckFn->getCallingConv(), FnTy->getReturnType(),
+ getValue(GuardCheckFn), std::move(Args));
+
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+ DAG.setRoot(Result.second);
+ return;
+ }
+
+ // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
+ // Otherwise, emit a volatile load to retrieve the stack guard value.
+ SDValue Chain = DAG.getEntryNode();
+ if (TLI.useLoadStackGuardNode()) {
+ Guard = getLoadStackGuard(DAG, dl, Chain);
+ } else {
+ const Value *IRGuard = TLI.getSDagStackGuard(M);
+ SDValue GuardPtr = getValue(IRGuard);
+
+ Guard = DAG.getLoad(PtrMemTy, dl, Chain, GuardPtr,
+ MachinePointerInfo(IRGuard, 0), Align,
+ MachineMemOperand::MOVolatile);
+ }
+
+ // Perform the comparison via a getsetcc.
+ SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(),
+ Guard.getValueType()),
+ Guard, GuardVal, ISD::SETNE);
+
+ // If the guard and stack slot values are not equal, branch to the failure MBB.
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, GuardVal.getOperand(0),
+ Cmp, DAG.getBasicBlock(SPD.getFailureMBB()));
+ // Otherwise branch to success MBB.
+ SDValue Br = DAG.getNode(ISD::BR, dl,
+ MVT::Other, BrCond,
+ DAG.getBasicBlock(SPD.getSuccessMBB()));
+
+ DAG.setRoot(Br);
+}
+
+/// Codegen the failure basic block for a stack protector check.
+///
+/// A failure stack protector machine basic block consists simply of a call to
+/// __stack_chk_fail().
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void
+SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setDiscardResult(true);
+ SDValue Chain =
+ TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid,
+ std::nullopt, CallOptions, getCurSDLoc())
+ .second;
+ // On PS4/PS5, the "return address" must still be within the calling
+ // function, even if it's at the very end, so emit an explicit TRAP here.
+ // Passing 'true' for doesNotReturn above won't generate the trap for us.
+ if (TM.getTargetTriple().isPS())
+ Chain = DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, Chain);
+ // WebAssembly needs an unreachable instruction after a non-returning call,
+ // because the function return type can be different from __stack_chk_fail's
+ // return type (void).
+ if (TM.getTargetTriple().isWasm())
+ Chain = DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, Chain);
+
+ DAG.setRoot(Chain);
+}
+
+/// visitBitTestHeader - This function emits the code needed to produce a value
+/// suitable for "bit tests"
+void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
+ MachineBasicBlock *SwitchBB) {
+ SDLoc dl = getCurSDLoc();
+
+ // Subtract the minimum value.
+ SDValue SwitchOp = getValue(B.SValue);
+ EVT VT = SwitchOp.getValueType();
+ SDValue RangeSub =
+ DAG.getNode(ISD::SUB, dl, VT, SwitchOp, DAG.getConstant(B.First, dl, VT));
+
+ // Determine the type of the test operands.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool UsePtrType = false;
+ if (!TLI.isTypeLegal(VT)) {
+ UsePtrType = true;
+ } else {
+ for (unsigned i = 0, e = B.Cases.size(); i != e; ++i)
+ if (!isUIntN(VT.getSizeInBits(), B.Cases[i].Mask)) {
+ // Switch table case ranges are encoded into a series of masks.
+ // Just use the pointer type; it's guaranteed to fit.
+ UsePtrType = true;
+ break;
+ }
+ }
+ SDValue Sub = RangeSub;
+ if (UsePtrType) {
+ VT = TLI.getPointerTy(DAG.getDataLayout());
+ Sub = DAG.getZExtOrTrunc(Sub, dl, VT);
+ }
+
+ B.RegVT = VT.getSimpleVT();
+ B.Reg = FuncInfo.CreateReg(B.RegVT);
+ SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl, B.Reg, Sub);
+
+ MachineBasicBlock* MBB = B.Cases[0].ThisBB;
+
+ if (!B.FallthroughUnreachable)
+ addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
+ addSuccessorWithProb(SwitchBB, MBB, B.Prob);
+ SwitchBB->normalizeSuccProbs();
+
+ SDValue Root = CopyTo;
+ if (!B.FallthroughUnreachable) {
+ // Conditional branch to the default block.
+ SDValue RangeCmp = DAG.getSetCC(dl,
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ RangeSub.getValueType()),
+ RangeSub, DAG.getConstant(B.Range, dl, RangeSub.getValueType()),
+ ISD::SETUGT);
+
+ Root = DAG.getNode(ISD::BRCOND, dl, MVT::Other, Root, RangeCmp,
+ DAG.getBasicBlock(B.Default));
+ }
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (MBB != NextBlock(SwitchBB))
+ Root = DAG.getNode(ISD::BR, dl, MVT::Other, Root, DAG.getBasicBlock(MBB));
+
+ DAG.setRoot(Root);
+}
+
+/// visitBitTestCase - This function produces one "bit test"
+void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
+ MachineBasicBlock* NextMBB,
+ BranchProbability BranchProbToNext,
+ unsigned Reg,
+ BitTestCase &B,
+ MachineBasicBlock *SwitchBB) {
+ SDLoc dl = getCurSDLoc();
+ MVT VT = BB.RegVT;
+ SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), dl, Reg, VT);
+ SDValue Cmp;
+ unsigned PopCount = llvm::popcount(B.Mask);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (PopCount == 1) {
+ // Testing for a single bit; just compare the shift count with what it
+ // would need to be to shift a 1 bit in that position.
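+ // E.g. (illustrative mask): for B.Mask == 0x8, whose only set bit is at
+ // position 3, the compare below reduces to "ShiftOp == 3".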
+ Cmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ ShiftOp, DAG.getConstant(llvm::countr_zero(B.Mask), dl, VT),
+ ISD::SETEQ);
+ } else if (PopCount == BB.Range) {
+ // There is only one zero bit in the range, test for it directly.
+ Cmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ ShiftOp, DAG.getConstant(llvm::countr_one(B.Mask), dl, VT), ISD::SETNE);
+ } else {
+ // Make desired shift
+ SDValue SwitchVal = DAG.getNode(ISD::SHL, dl, VT,
+ DAG.getConstant(1, dl, VT), ShiftOp);
+
+ // Emit bit tests and jumps
+ SDValue AndOp = DAG.getNode(ISD::AND, dl,
+ VT, SwitchVal, DAG.getConstant(B.Mask, dl, VT));
+ Cmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ AndOp, DAG.getConstant(0, dl, VT), ISD::SETNE);
+ }
+
+ // The branch probability from SwitchBB to B.TargetBB is B.ExtraProb.
+ addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb);
+ // The branch probability from SwitchBB to NextMBB is BranchProbToNext.
+ addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext);
+ // It is not guaranteed that the sum of B.ExtraProb and BranchProbToNext is
+ // one as they are relative probabilities (and thus work more like weights),
+ // and hence we need to normalize them so that their sum becomes one.
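+ // E.g. (illustrative weights): relative probabilities of 3/4 and 1/2 would
+ // be rescaled to 3/5 and 2/5 so that they sum to one.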
+ SwitchBB->normalizeSuccProbs();
+
+ SDValue BrAnd = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, getControlRoot(),
+ Cmp, DAG.getBasicBlock(B.TargetBB));
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (NextMBB != NextBlock(SwitchBB))
+ BrAnd = DAG.getNode(ISD::BR, dl, MVT::Other, BrAnd,
+ DAG.getBasicBlock(NextMBB));
+
+ DAG.setRoot(BrAnd);
+}
+
+void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
+ MachineBasicBlock *InvokeMBB = FuncInfo.MBB;
+
+ // Retrieve successors. Look through artificial IR level blocks like
+ // catchswitch for successors.
+ MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)];
+ const BasicBlock *EHPadBB = I.getSuccessor(1);
+ MachineBasicBlock *EHPadMBB = FuncInfo.MBBMap[EHPadBB];
+
+ // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
+ // have to do anything here to lower funclet bundles.
+ assert(!I.hasOperandBundlesOtherThan(
+ {LLVMContext::OB_deopt, LLVMContext::OB_gc_transition,
+ LLVMContext::OB_gc_live, LLVMContext::OB_funclet,
+ LLVMContext::OB_cfguardtarget,
+ LLVMContext::OB_clang_arc_attachedcall}) &&
+ "Cannot lower invokes with arbitrary operand bundles yet!");
+
+ const Value *Callee(I.getCalledOperand());
+ const Function *Fn = dyn_cast<Function>(Callee);
+ if (isa<InlineAsm>(Callee))
+ visitInlineAsm(I, EHPadBB);
+ else if (Fn && Fn->isIntrinsic()) {
+ switch (Fn->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Cannot invoke this intrinsic");
+ case Intrinsic::donothing:
+ // Ignore invokes to @llvm.donothing: jump directly to the next BB.
+ case Intrinsic::seh_try_begin:
+ case Intrinsic::seh_scope_begin:
+ case Intrinsic::seh_try_end:
+ case Intrinsic::seh_scope_end:
+ if (EHPadMBB)
+ // This block is referenced by the EH table; mark its address as taken
+ // so the destructor funclet is not removed by optimizations.
+ EHPadMBB->setMachineBlockAddressTaken();
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ visitPatchpoint(I, EHPadBB);
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ LowerStatepoint(cast<GCStatepointInst>(I), EHPadBB);
+ break;
+ case Intrinsic::wasm_rethrow: {
+ // This is usually done in visitTargetIntrinsic, but this intrinsic is
+ // special because it can be invoked, so we manually lower it to a DAG
+ // node here.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(getRoot()); // inchain
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ Ops.push_back(
+ DAG.getTargetConstant(Intrinsic::wasm_rethrow, getCurSDLoc(),
+ TLI.getPointerTy(DAG.getDataLayout())));
+ SDVTList VTs = DAG.getVTList(ArrayRef<EVT>({MVT::Other})); // outchain
+ DAG.setRoot(DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops));
+ break;
+ }
+ }
+ } else if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) {
+ // Currently we do not lower any intrinsic calls with deopt operand bundles.
+ // Eventually we will support lowering the @llvm.experimental.deoptimize
+ // intrinsic, and right now there are no plans to support other intrinsics
+ // with deopt state.
+ LowerCallSiteWithDeoptBundle(&I, getValue(Callee), EHPadBB);
+ } else {
+ LowerCallTo(I, getValue(Callee), false, false, EHPadBB);
+ }
+
+ // If the value of the invoke is used outside of its defining block, make it
+ // available as a virtual register.
+ // We already took care of the exported value for the statepoint instruction
+ // during call to the LowerStatepoint.
+ if (!isa<GCStatepointInst>(I)) {
+ CopyToExportRegsIfNeeded(&I);
+ }
+
+ SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ BranchProbability EHPadBBProb =
+ BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB)
+ : BranchProbability::getZero();
+ findUnwindDestinations(FuncInfo, EHPadBB, EHPadBBProb, UnwindDests);
+
+ // Update successor info.
+ addSuccessorWithProb(InvokeMBB, Return);
+ for (auto &UnwindDest : UnwindDests) {
+ UnwindDest.first->setIsEHPad();
+ addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second);
+ }
+ InvokeMBB->normalizeSuccProbs();
+
+ // Drop into normal successor.
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, getControlRoot(),
+ DAG.getBasicBlock(Return)));
+}
+
+void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) {
+ MachineBasicBlock *CallBrMBB = FuncInfo.MBB;
+
+ // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
+ // have to do anything here to lower funclet bundles.
+ assert(!I.hasOperandBundlesOtherThan(
+ {LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
+ "Cannot lower callbrs with arbitrary operand bundles yet!");
+
+ assert(I.isInlineAsm() && "Only know how to handle inlineasm callbr");
+ visitInlineAsm(I);
+ CopyToExportRegsIfNeeded(&I);
+
+ // Retrieve successors.
+ SmallPtrSet<BasicBlock *, 8> Dests;
+ Dests.insert(I.getDefaultDest());
+ MachineBasicBlock *Return = FuncInfo.MBBMap[I.getDefaultDest()];
+
+ // Update successor info.
+ addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne());
+ for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) {
+ BasicBlock *Dest = I.getIndirectDest(i);
+ MachineBasicBlock *Target = FuncInfo.MBBMap[Dest];
+ Target->setIsInlineAsmBrIndirectTarget();
+ Target->setMachineBlockAddressTaken();
+ Target->setLabelMustBeEmitted();
+ // Don't add duplicate machine successors.
+ if (Dests.insert(Dest).second)
+ addSuccessorWithProb(CallBrMBB, Target, BranchProbability::getZero());
+ }
+ CallBrMBB->normalizeSuccProbs();
+
+ // Drop into default successor.
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
+ MVT::Other, getControlRoot(),
+ DAG.getBasicBlock(Return)));
+}
+
+void SelectionDAGBuilder::visitResume(const ResumeInst &RI) {
+ llvm_unreachable("SelectionDAGBuilder shouldn't visit resume instructions!");
+}
+
+void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
+ assert(FuncInfo.MBB->isEHPad() &&
+ "Call to landingpad not in landing pad!");
+
+ // If there aren't registers to copy the values into (e.g., during SjLj
+ // exceptions), then don't bother to create these DAG nodes.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const Constant *PersonalityFn = FuncInfo.Fn->getPersonalityFn();
+ if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
+ TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
+ return;
+
+ // If landingpad's return type is token type, we don't create DAG nodes
+ // for its exception pointer and selector value. The extraction of exception
+ // pointer or selector value from token type landingpads is not currently
+ // supported.
+ if (LP.getType()->isTokenTy())
+ return;
+
+ SmallVector<EVT, 2> ValueVTs;
+ SDLoc dl = getCurSDLoc();
+ ComputeValueVTs(TLI, DAG.getDataLayout(), LP.getType(), ValueVTs);
+ assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported");
+
+ // Get the two live-in registers as SDValues. The physregs have already been
+ // copied into virtual registers.
+ SDValue Ops[2];
+ if (FuncInfo.ExceptionPointerVirtReg) {
+ Ops[0] = DAG.getZExtOrTrunc(
+ DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+ FuncInfo.ExceptionPointerVirtReg,
+ TLI.getPointerTy(DAG.getDataLayout())),
+ dl, ValueVTs[0]);
+ } else {
+ Ops[0] = DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+ Ops[1] = DAG.getZExtOrTrunc(
+ DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+ FuncInfo.ExceptionSelectorVirtReg,
+ TLI.getPointerTy(DAG.getDataLayout())),
+ dl, ValueVTs[1]);
+
+ // Merge into one.
+ SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl,
+ DAG.getVTList(ValueVTs), Ops);
+ setValue(&LP, Res);
+}
+
+void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First,
+ MachineBasicBlock *Last) {
+ // Update JTCases.
+ for (JumpTableBlock &JTB : SL->JTCases)
+ if (JTB.first.HeaderBB == First)
+ JTB.first.HeaderBB = Last;
+
+ // Update BitTestCases.
+ for (BitTestBlock &BTB : SL->BitTestCases)
+ if (BTB.Parent == First)
+ BTB.Parent = Last;
+}
+
+void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
+ MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;
+
+ // Update machine-CFG edges with unique successors.
+ SmallSet<BasicBlock*, 32> Done;
+ for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
+ BasicBlock *BB = I.getSuccessor(i);
+ bool Inserted = Done.insert(BB).second;
+ if (!Inserted)
+ continue;
+
+ MachineBasicBlock *Succ = FuncInfo.MBBMap[BB];
+ addSuccessorWithProb(IndirectBrMBB, Succ);
+ }
+ IndirectBrMBB->normalizeSuccProbs();
+
+ DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(),
+ MVT::Other, getControlRoot(),
+ getValue(I.getAddress())));
+}
+
+void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) {
+ if (!DAG.getTarget().Options.TrapUnreachable)
+ return;
+
+ // We may be able to ignore unreachable behind a noreturn call.
+ if (DAG.getTarget().Options.NoTrapAfterNoreturn) {
+ const BasicBlock &BB = *I.getParent();
+ if (&I != &BB.front()) {
+ BasicBlock::const_iterator PredI =
+ std::prev(BasicBlock::const_iterator(&I));
+ if (const CallInst *Call = dyn_cast<CallInst>(&*PredI)) {
+ if (Call->doesNotReturn())
+ return;
+ }
+ }
+ }
+
+ DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
+}
+
+void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) {
+ SDNodeFlags Flags;
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*FPOp);
+
+ SDValue Op = getValue(I.getOperand(0));
+ SDValue UnNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op.getValueType(),
+ Op, Flags);
+ setValue(&I, UnNodeValue);
+}
+
+void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
+ SDNodeFlags Flags;
+ if (auto *OFBinOp = dyn_cast<OverflowingBinaryOperator>(&I)) {
+ Flags.setNoSignedWrap(OFBinOp->hasNoSignedWrap());
+ Flags.setNoUnsignedWrap(OFBinOp->hasNoUnsignedWrap());
+ }
+ if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
+ Flags.setExact(ExactOp->isExact());
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*FPOp);
+
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ SDValue BinNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(),
+ Op1, Op2, Flags);
+ setValue(&I, BinNodeValue);
+}
+
+void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+
+ EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(
+ Op1.getValueType(), DAG.getDataLayout());
+
+ // Coerce the shift amount to the right type if we can. This exposes the
+ // truncate or zext to optimization early.
+ if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
+ assert(ShiftTy.getSizeInBits() >= Log2_32_Ceil(Op1.getValueSizeInBits()) &&
+ "Unexpected shift type");
+ Op2 = DAG.getZExtOrTrunc(Op2, getCurSDLoc(), ShiftTy);
+ }
+
+ bool nuw = false;
+ bool nsw = false;
+ bool exact = false;
+
+ if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) {
+
+ if (const OverflowingBinaryOperator *OFBinOp =
+ dyn_cast<const OverflowingBinaryOperator>(&I)) {
+ nuw = OFBinOp->hasNoUnsignedWrap();
+ nsw = OFBinOp->hasNoSignedWrap();
+ }
+ if (const PossiblyExactOperator *ExactOp =
+ dyn_cast<const PossiblyExactOperator>(&I))
+ exact = ExactOp->isExact();
+ }
+ SDNodeFlags Flags;
+ Flags.setExact(exact);
+ Flags.setNoSignedWrap(nsw);
+ Flags.setNoUnsignedWrap(nuw);
+ SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
+ Flags);
+ setValue(&I, Res);
+}
+
+void SelectionDAGBuilder::visitSDiv(const User &I) {
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+
+ SDNodeFlags Flags;
+ Flags.setExact(isa<PossiblyExactOperator>(&I) &&
+ cast<PossiblyExactOperator>(&I)->isExact());
+ setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1,
+ Op2, Flags));
+}
+
+void SelectionDAGBuilder::visitICmp(const User &I) {
+ ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
+ if (const ICmpInst *IC = dyn_cast<ICmpInst>(&I))
+ predicate = IC->getPredicate();
+ else if (const ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
+ predicate = ICmpInst::Predicate(IC->getPredicate());
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ ISD::CondCode Opcode = getICmpCondCode(predicate);
+
+ auto &TLI = DAG.getTargetLoweringInfo();
+ EVT MemVT =
+ TLI.getMemValueType(DAG.getDataLayout(), I.getOperand(0)->getType());
+
+ // If a pointer's DAG type is larger than its memory type then the DAG values
+ // are zero-extended. This breaks signed comparisons so truncate back to the
+ // underlying type before doing the compare.
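+ // E.g. (illustrative widths): if 32-bit pointers are carried as
+ // zero-extended 64-bit DAG values, a signed compare of 0x80000000 and 0x1
+ // sees a large positive LHS rather than a negative one; truncating back to
+ // 32 bits first preserves the i32 signed ordering.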
+ if (Op1.getValueType() != MemVT) {
+ Op1 = DAG.getPtrExtOrTrunc(Op1, getCurSDLoc(), MemVT);
+ Op2 = DAG.getPtrExtOrTrunc(Op2, getCurSDLoc(), MemVT);
+ }
+
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
+}
+
+void SelectionDAGBuilder::visitFCmp(const User &I) {
+ FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
+ if (const FCmpInst *FC = dyn_cast<FCmpInst>(&I))
+ predicate = FC->getPredicate();
+ else if (const ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
+ predicate = FCmpInst::Predicate(FC->getPredicate());
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+
+ ISD::CondCode Condition = getFCmpCondCode(predicate);
+ auto *FPMO = cast<FPMathOperator>(&I);
+ if (FPMO->hasNoNaNs() || TM.Options.NoNaNsFPMath)
+ Condition = getFCmpCodeWithoutNaN(Condition);
+
+ SDNodeFlags Flags;
+ Flags.copyFMF(*FPMO);
+ SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
+
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
+}
+
+// Check that every user of the select's condition is itself a select, i.e.
+// the underlying comparison is not used by any other kind of instruction.
+static bool hasOnlySelectUsers(const Value *Cond) {
+ return llvm::all_of(Cond->users(), [](const Value *V) {
+ return isa<SelectInst>(V);
+ });
+}
+
+void SelectionDAGBuilder::visitSelect(const User &I) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
+ ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0) return;
+
+ SmallVector<SDValue, 4> Values(NumValues);
+ SDValue Cond = getValue(I.getOperand(0));
+ SDValue LHSVal = getValue(I.getOperand(1));
+ SDValue RHSVal = getValue(I.getOperand(2));
+ SmallVector<SDValue, 1> BaseOps(1, Cond);
+ ISD::NodeType OpCode =
+ Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT;
+
+ bool IsUnaryAbs = false;
+ bool Negate = false;
+
+ SDNodeFlags Flags;
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*FPOp);
+
+ Flags.setUnpredictable(
+ cast<SelectInst>(I).getMetadata(LLVMContext::MD_unpredictable));
+
+ // Min/max matching is only viable if all output VTs are the same.
+ if (all_equal(ValueVTs)) {
+ EVT VT = ValueVTs[0];
+ LLVMContext &Ctx = *DAG.getContext();
+ auto &TLI = DAG.getTargetLoweringInfo();
+
+ // We care about the legality of the operation after it has been type
+ // legalized.
+ while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal)
+ VT = TLI.getTypeToTransformTo(Ctx, VT);
+
+ // If the vselect is legal, assume we want to leave this as a vector setcc +
+ // vselect. Otherwise, if this is going to be scalarized, we want to see if
+ // min/max is legal on the scalar type.
+ bool UseScalarMinMax = VT.isVector() &&
+ !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT);
+
+ // ValueTracking's select pattern matching does not account for -0.0,
+ // so we can't lower to FMINIMUM/FMAXIMUM because those nodes specify that
+ // -0.0 is less than +0.0.
+ Value *LHS, *RHS;
+ auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS);
+ ISD::NodeType Opc = ISD::DELETED_NODE;
+ switch (SPR.Flavor) {
+ case SPF_UMAX: Opc = ISD::UMAX; break;
+ case SPF_UMIN: Opc = ISD::UMIN; break;
+ case SPF_SMAX: Opc = ISD::SMAX; break;
+ case SPF_SMIN: Opc = ISD::SMIN; break;
+ case SPF_FMINNUM:
+ switch (SPR.NaNBehavior) {
+ case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
+ case SPNB_RETURNS_NAN: break;
+ case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break;
+ case SPNB_RETURNS_ANY:
+ if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT) ||
+ (UseScalarMinMax &&
+ TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType())))
+ Opc = ISD::FMINNUM;
+ break;
+ }
+ break;
+ case SPF_FMAXNUM:
+ switch (SPR.NaNBehavior) {
+ case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
+ case SPNB_RETURNS_NAN: break;
+ case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break;
+ case SPNB_RETURNS_ANY:
+ if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT) ||
+ (UseScalarMinMax &&
+ TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType())))
+ Opc = ISD::FMAXNUM;
+ break;
+ }
+ break;
+ case SPF_NABS:
+ Negate = true;
+ [[fallthrough]];
+ case SPF_ABS:
+ IsUnaryAbs = true;
+ Opc = ISD::ABS;
+ break;
+ default: break;
+ }
+
+ if (!IsUnaryAbs && Opc != ISD::DELETED_NODE &&
+ (TLI.isOperationLegalOrCustom(Opc, VT) ||
+ (UseScalarMinMax &&
+ TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) &&
+ // If the underlying comparison instruction is used by any other
+ // instruction, the consumed instructions won't be destroyed, so it is
+ // not profitable to convert to a min/max.
+ hasOnlySelectUsers(cast<SelectInst>(I).getCondition())) {
+ OpCode = Opc;
+ LHSVal = getValue(LHS);
+ RHSVal = getValue(RHS);
+ BaseOps.clear();
+ }
+
+ if (IsUnaryAbs) {
+ OpCode = Opc;
+ LHSVal = getValue(LHS);
+ BaseOps.clear();
+ }
+ }
+
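+ // For abs/nabs the select collapses to a single unary ABS per aggregate
+ // value; nabs additionally negates the result.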
+ if (IsUnaryAbs) {
+ for (unsigned i = 0; i != NumValues; ++i) {
+ SDLoc dl = getCurSDLoc();
+ EVT VT = LHSVal.getNode()->getValueType(LHSVal.getResNo() + i);
+ Values[i] =
+ DAG.getNode(OpCode, dl, VT, LHSVal.getValue(LHSVal.getResNo() + i));
+ if (Negate)
+ Values[i] = DAG.getNegative(Values[i], dl, VT);
+ }
+ } else {
+ for (unsigned i = 0; i != NumValues; ++i) {
+ SmallVector<SDValue, 3> Ops(BaseOps.begin(), BaseOps.end());
+ Ops.push_back(SDValue(LHSVal.getNode(), LHSVal.getResNo() + i));
+ Ops.push_back(SDValue(RHSVal.getNode(), RHSVal.getResNo() + i));
+ Values[i] = DAG.getNode(
+ OpCode, getCurSDLoc(),
+ LHSVal.getNode()->getValueType(LHSVal.getResNo() + i), Ops, Flags);
+ }
+ }
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(ValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitTrunc(const User &I) {
+ // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitZExt(const User &I) {
+ // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+ // ZExt also can't be a cast to bool for the same reason, so there is nothing else to do.
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitSExt(const User &I) {
+ // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+ // SExt also can't be a cast to bool for the same reason, so there is nothing else to do.
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPTrunc(const User &I) {
+ // FPTrunc is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ SDLoc dl = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
+ DAG.getTargetConstant(
+ 0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
+}
+
+void SelectionDAGBuilder::visitFPExt(const User &I) {
+ // FPExt is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPToUI(const User &I) {
+ // FPToUI is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPToSI(const User &I) {
+ // FPToSI is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitUIToFP(const User &I) {
+ // UIToFP is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitSIToFP(const User &I) {
+ // SIToFP is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitPtrToInt(const User &I) {
+ // What to do depends on the size of the integer and the size of the pointer.
+ // We can either truncate, zero extend, or no-op, accordingly.
+ SDValue N = getValue(I.getOperand(0));
+ auto &TLI = DAG.getTargetLoweringInfo();
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ EVT PtrMemVT =
+ TLI.getMemValueType(DAG.getDataLayout(), I.getOperand(0)->getType());
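+ // First normalize the pointer value to its in-memory width, then zero-extend
+ // or truncate to the destination integer type.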
+ N = DAG.getPtrExtOrTrunc(N, getCurSDLoc(), PtrMemVT);
+ N = DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT);
+ setValue(&I, N);
+}
+
+void SelectionDAGBuilder::visitIntToPtr(const User &I) {
+ // What to do depends on the size of the integer and the size of the pointer.
+ // We can either truncate, zero extend, or no-op, accordingly.
+ SDValue N = getValue(I.getOperand(0));
+ auto &TLI = DAG.getTargetLoweringInfo();
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ EVT PtrMemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getType());
+ N = DAG.getZExtOrTrunc(N, getCurSDLoc(), PtrMemVT);
+ N = DAG.getPtrExtOrTrunc(N, getCurSDLoc(), DestVT);
+ setValue(&I, N);
+}
+
+void SelectionDAGBuilder::visitBitCast(const User &I) {
+ SDValue N = getValue(I.getOperand(0));
+ SDLoc dl = getCurSDLoc();
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+
+ // BitCast assures us that source and destination are the same size so this is
+ // either a BITCAST or a no-op.
+ if (DestVT != N.getValueType())
+ setValue(&I, DAG.getNode(ISD::BITCAST, dl,
+ DestVT, N)); // convert types.
+ // Check if the original LLVM IR Operand was a ConstantInt, because getValue()
+ // might fold any kind of constant expression to an integer constant and that
+ // is not what we are looking for. Only recognize a bitcast of a genuine
+ // constant integer as an opaque constant.
+ else if (ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))
+ setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false,
+ /*isOpaque=*/true));
+ else
+ setValue(&I, N); // noop cast.
+}
+
+void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const Value *SV = I.getOperand(0);
+ SDValue N = getValue(SV);
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ unsigned SrcAS = SV->getType()->getPointerAddressSpace();
+ unsigned DestAS = I.getType()->getPointerAddressSpace();
+
+ if (!TM.isNoopAddrSpaceCast(SrcAS, DestAS))
+ N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS);
+
+ setValue(&I, N);
+}
+
+void SelectionDAGBuilder::visitInsertElement(const User &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue InVec = getValue(I.getOperand(0));
+ SDValue InVal = getValue(I.getOperand(1));
+ SDValue InIdx = DAG.getZExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(),
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+ setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(),
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ InVec, InVal, InIdx));
+}
+
+void SelectionDAGBuilder::visitExtractElement(const User &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue InVec = getValue(I.getOperand(0));
+ SDValue InIdx = DAG.getZExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(),
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+ setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ InVec, InIdx));
+}
+
+void SelectionDAGBuilder::visitShuffleVector(const User &I) {
+ SDValue Src1 = getValue(I.getOperand(0));
+ SDValue Src2 = getValue(I.getOperand(1));
+ ArrayRef<int> Mask;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+ Mask = SVI->getShuffleMask();
+ else
+ Mask = cast<ConstantExpr>(I).getShuffleMask();
+ SDLoc DL = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ EVT SrcVT = Src1.getValueType();
+
+ if (all_of(Mask, [](int Elem) { return Elem == 0; }) &&
+ VT.isScalableVector()) {
+ // Canonical splat form of first element of first input vector.
+ SDValue FirstElt =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT.getScalarType(), Src1,
+ DAG.getVectorIdxConstant(0, DL));
+ setValue(&I, DAG.getNode(ISD::SPLAT_VECTOR, DL, VT, FirstElt));
+ return;
+ }
+
+ // For now, we only handle splats for scalable vectors.
+ // The DAGCombiner will perform a BUILD_VECTOR -> SPLAT_VECTOR transformation
+ // for targets that support a SPLAT_VECTOR for non-scalable vector types.
+ assert(!VT.isScalableVector() && "Unsupported scalable vector shuffle");
+
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
+ unsigned MaskNumElts = Mask.size();
+
+ if (SrcNumElts == MaskNumElts) {
+ setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, Mask));
+ return;
+ }
+
+ // Normalize the shuffle vector since mask and vector length don't match.
+ if (SrcNumElts < MaskNumElts) {
+ // The mask is longer than the source vectors. Use vector concatenation to
+ // make the mask and vector lengths match.
+
+ if (MaskNumElts % SrcNumElts == 0) {
+ // Mask length is a multiple of the source vector length.
+ // Check if the shuffle is some kind of concatenation of the input
+ // vectors.
+ unsigned NumConcat = MaskNumElts / SrcNumElts;
+ bool IsConcat = true;
+ SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
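+ // ConcatSrcs[j] records which source vector (0 or 1) supplies the j-th
+ // SrcVT-sized piece of the result; -1 means the piece is entirely undef.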
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ if (Idx < 0)
+ continue;
+ // Ensure the indices in each SrcVT sized piece are sequential and that
+ // the same source is used for the whole piece.
+ if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
+ (ConcatSrcs[i / SrcNumElts] >= 0 &&
+ ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) {
+ IsConcat = false;
+ break;
+ }
+ // Remember which source this index came from.
+ ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
+ }
+
+ // The shuffle is concatenating multiple vectors together. Just emit
+ // a CONCAT_VECTORS operation.
+ if (IsConcat) {
+ SmallVector<SDValue, 8> ConcatOps;
+ for (auto Src : ConcatSrcs) {
+ if (Src < 0)
+ ConcatOps.push_back(DAG.getUNDEF(SrcVT));
+ else if (Src == 0)
+ ConcatOps.push_back(Src1);
+ else
+ ConcatOps.push_back(Src2);
+ }
+ setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps));
+ return;
+ }
+ }
+
+ unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
+ unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
+ EVT PaddedVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+ PaddedMaskNumElts);
+
+ // Pad both vectors with undefs to make them the same length as the mask.
+ SDValue UndefVal = DAG.getUNDEF(SrcVT);
+
+ SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
+ SmallVector<SDValue, 8> MOps2(NumConcat, UndefVal);
+ MOps1[0] = Src1;
+ MOps2[0] = Src2;
+
+ Src1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps1);
+ Src2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps2);
+
+ // Readjust mask for new input vector length.
+ SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ if (Idx >= (int)SrcNumElts)
+ Idx -= SrcNumElts - PaddedMaskNumElts;
+ MappedOps[i] = Idx;
+ }
+
+ SDValue Result = DAG.getVectorShuffle(PaddedVT, DL, Src1, Src2, MappedOps);
+
+ // If the concatenated vector was padded, extract a subvector with the
+ // correct number of elements.
+ if (MaskNumElts != PaddedMaskNumElts)
+ Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
+ DAG.getVectorIdxConstant(0, DL));
+
+ setValue(&I, Result);
+ return;
+ }
+
+ if (SrcNumElts > MaskNumElts) {
+ // Analyze the access pattern of the vector to see if we can extract
+ // two subvectors and do the shuffle.
+ int StartIdx[2] = { -1, -1 }; // StartIdx to extract from
+ bool CanExtract = true;
+ for (int Idx : Mask) {
+ unsigned Input = 0;
+ if (Idx < 0)
+ continue;
+
+ if (Idx >= (int)SrcNumElts) {
+ Input = 1;
+ Idx -= SrcNumElts;
+ }
+
+ // If all the indices come from the same MaskNumElts sized portion of
+ // the sources we can use extract. Also make sure the extract wouldn't
+ // extract past the end of the source.
+ int NewStartIdx = alignDown(Idx, MaskNumElts);
+ if (NewStartIdx + MaskNumElts > SrcNumElts ||
+ (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx))
+ CanExtract = false;
+ // Make sure we always update StartIdx as we use it to track if all
+ // elements are undef.
+ StartIdx[Input] = NewStartIdx;
+ }
+
+ if (StartIdx[0] < 0 && StartIdx[1] < 0) {
+ setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
+ return;
+ }
+ if (CanExtract) {
+ // Extract appropriate subvector and generate a vector shuffle
+ for (unsigned Input = 0; Input < 2; ++Input) {
+ SDValue &Src = Input == 0 ? Src1 : Src2;
+ if (StartIdx[Input] < 0)
+ Src = DAG.getUNDEF(VT);
+ else {
+ Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
+ DAG.getVectorIdxConstant(StartIdx[Input], DL));
+ }
+ }
+
+ // Calculate new mask.
+ SmallVector<int, 8> MappedOps(Mask);
+ for (int &Idx : MappedOps) {
+ if (Idx >= (int)SrcNumElts)
+ Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
+ else if (Idx >= 0)
+ Idx -= StartIdx[0];
+ }
+
+ setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps));
+ return;
+ }
+ }
+
+ // We can't use either concat vectors or extract subvectors, so fall back to
+ // replacing the shuffle with per-element extracts and a build vector.
+ EVT EltVT = VT.getVectorElementType();
+ SmallVector<SDValue,8> Ops;
+ for (int Idx : Mask) {
+ SDValue Res;
+
+ if (Idx < 0) {
+ Res = DAG.getUNDEF(EltVT);
+ } else {
+ SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2;
+ if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts;
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src,
+ DAG.getVectorIdxConstant(Idx, DL));
+ }
+
+ Ops.push_back(Res);
+ }
+
+ setValue(&I, DAG.getBuildVector(VT, DL, Ops));
+}
+
+void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
+ ArrayRef<unsigned> Indices = I.getIndices();
+ const Value *Op0 = I.getOperand(0);
+ const Value *Op1 = I.getOperand(1);
+ Type *AggTy = I.getType();
+ Type *ValTy = Op1->getType();
+ bool IntoUndef = isa<UndefValue>(Op0);
+ bool FromUndef = isa<UndefValue>(Op1);
+
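+ // LinearIndex is the position of the inserted value within the flattened
+ // list of the aggregate's leaf values.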
+ unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SmallVector<EVT, 4> AggValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), AggTy, AggValueVTs);
+ SmallVector<EVT, 4> ValValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);
+
+ unsigned NumAggValues = AggValueVTs.size();
+ unsigned NumValValues = ValValueVTs.size();
+ SmallVector<SDValue, 4> Values(NumAggValues);
+
+ // Ignore an insertvalue that produces an empty object
+ if (!NumAggValues) {
+ setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+ return;
+ }
+
+ SDValue Agg = getValue(Op0);
+ unsigned i = 0;
+ // Copy the beginning value(s) from the original aggregate.
+ for (; i != LinearIndex; ++i)
+ Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+ // Copy values from the inserted value(s).
+ if (NumValValues) {
+ SDValue Val = getValue(Op1);
+ for (; i != LinearIndex + NumValValues; ++i)
+ Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+ }
+ // Copy remaining value(s) from the original aggregate.
+ for (; i != NumAggValues; ++i)
+ Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(AggValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
+ ArrayRef<unsigned> Indices = I.getIndices();
+ const Value *Op0 = I.getOperand(0);
+ Type *AggTy = Op0->getType();
+ Type *ValTy = I.getType();
+ bool OutOfUndef = isa<UndefValue>(Op0);
+
+ unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SmallVector<EVT, 4> ValValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);
+
+ unsigned NumValValues = ValValueVTs.size();
+
+ // Ignore an extractvalue that produces an empty object
+ if (!NumValValues) {
+ setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+ return;
+ }
+
+ SmallVector<SDValue, 4> Values(NumValValues);
+
+ SDValue Agg = getValue(Op0);
+ // Copy out the selected value(s).
+ for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i)
+ Values[i - LinearIndex] =
+ OutOfUndef ?
+ DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(ValValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
+ Value *Op0 = I.getOperand(0);
+ // Note that the pointer operand may be a vector of pointers. Take the scalar
+ // element which holds a pointer.
+ unsigned AS = Op0->getType()->getScalarType()->getPointerAddressSpace();
+ SDValue N = getValue(Op0);
+ SDLoc dl = getCurSDLoc();
+ auto &TLI = DAG.getTargetLoweringInfo();
+
+ // Normalize a vector GEP: all scalar operands must be converted to splat
+ // vectors.
+ bool IsVectorGEP = I.getType()->isVectorTy();
+ ElementCount VectorElementCount =
+ IsVectorGEP ? cast<VectorType>(I.getType())->getElementCount()
+ : ElementCount::getFixed(0);
+
+ if (IsVectorGEP && !N.getValueType().isVector()) {
+ LLVMContext &Context = *DAG.getContext();
+ EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorElementCount);
+ N = DAG.getSplat(VT, dl, N);
+ }
+
+ for (gep_type_iterator GTI = gep_type_begin(&I), E = gep_type_end(&I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (StructType *StTy = GTI.getStructTypeOrNull()) {
+ unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
+ if (Field) {
+ // N = N + Offset
+ uint64_t Offset =
+ DAG.getDataLayout().getStructLayout(StTy)->getElementOffset(Field);
+
+ // In an inbounds GEP with an offset that is nonnegative even when
+ // interpreted as signed, assume there is no unsigned overflow.
+ SDNodeFlags Flags;
+ if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
+ Flags.setNoUnsignedWrap(true);
+
+ N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
+ DAG.getConstant(Offset, dl, N.getValueType()), Flags);
+ }
+ } else {
+ // IdxSize is the width of the arithmetic according to IR semantics.
+ // In SelectionDAG, we may prefer to do arithmetic in a wider bitwidth
+ // (and fix up the result later).
+ unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS);
+ MVT IdxTy = MVT::getIntegerVT(IdxSize);
+ TypeSize ElementSize =
+ DAG.getDataLayout().getTypeAllocSize(GTI.getIndexedType());
+ // We intentionally mask away the high bits here; ElementSize may not
+ // fit in IdxTy.
+ APInt ElementMul(IdxSize, ElementSize.getKnownMinValue());
+ bool ElementScalable = ElementSize.isScalable();
+
+ // If this is a scalar constant or a splat vector of constants,
+ // handle it quickly.
+ const auto *C = dyn_cast<Constant>(Idx);
+ if (C && isa<VectorType>(C->getType()))
+ C = C->getSplatValue();
+
+ const auto *CI = dyn_cast_or_null<ConstantInt>(C);
+ if (CI && CI->isZero())
+ continue;
+ if (CI && !ElementScalable) {
+ APInt Offs = ElementMul * CI->getValue().sextOrTrunc(IdxSize);
+ LLVMContext &Context = *DAG.getContext();
+ SDValue OffsVal;
+ if (IsVectorGEP)
+ OffsVal = DAG.getConstant(
+ Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorElementCount));
+ else
+ OffsVal = DAG.getConstant(Offs, dl, IdxTy);
+
+ // In an inbounds GEP with an offset that is nonnegative even when
+ // interpreted as signed, assume there is no unsigned overflow.
+ SDNodeFlags Flags;
+ if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
+ Flags.setNoUnsignedWrap(true);
+
+ OffsVal = DAG.getSExtOrTrunc(OffsVal, dl, N.getValueType());
+
+ N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags);
+ continue;
+ }
+
+ // N = N + Idx * ElementMul;
+ SDValue IdxN = getValue(Idx);
+
+ if (!IdxN.getValueType().isVector() && IsVectorGEP) {
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), IdxN.getValueType(),
+ VectorElementCount);
+ IdxN = DAG.getSplat(VT, dl, IdxN);
+ }
+
+ // If the index is smaller or larger than intptr_t, truncate or extend
+ // it.
+ IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType());
+
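+ // For scalable element types the byte offset is Idx * (ElementMul * vscale),
+ // so the multiplier is materialized with an ISD::VSCALE node.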
+ if (ElementScalable) {
+ EVT VScaleTy = N.getValueType().getScalarType();
+ SDValue VScale = DAG.getNode(
+ ISD::VSCALE, dl, VScaleTy,
+ DAG.getConstant(ElementMul.getZExtValue(), dl, VScaleTy));
+ if (IsVectorGEP)
+ VScale = DAG.getSplatVector(N.getValueType(), dl, VScale);
+ IdxN = DAG.getNode(ISD::MUL, dl, N.getValueType(), IdxN, VScale);
+ } else {
+ // If this is a multiply by a power of two, turn it into a shl
+ // immediately. This is a very common case.
+ if (ElementMul != 1) {
+ if (ElementMul.isPowerOf2()) {
+ unsigned Amt = ElementMul.logBase2();
+ IdxN = DAG.getNode(ISD::SHL, dl,
+ N.getValueType(), IdxN,
+ DAG.getConstant(Amt, dl, IdxN.getValueType()));
+ } else {
+ SDValue Scale = DAG.getConstant(ElementMul.getZExtValue(), dl,
+ IdxN.getValueType());
+ IdxN = DAG.getNode(ISD::MUL, dl,
+ N.getValueType(), IdxN, Scale);
+ }
+ }
+ }
+
+ N = DAG.getNode(ISD::ADD, dl,
+ N.getValueType(), N, IdxN);
+ }
+ }
+
+ MVT PtrTy = TLI.getPointerTy(DAG.getDataLayout(), AS);
+ MVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout(), AS);
+ if (IsVectorGEP) {
+ PtrTy = MVT::getVectorVT(PtrTy, VectorElementCount);
+ PtrMemTy = MVT::getVectorVT(PtrMemTy, VectorElementCount);
+ }
+
+ if (PtrMemTy != PtrTy && !cast<GEPOperator>(I).isInBounds())
+ N = DAG.getPtrExtendInReg(N, dl, PtrMemTy);
+
+ setValue(&I, N);
+}
+
+void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
+ // If this is a fixed sized alloca in the entry block of the function,
+ // allocate it statically on the stack.
+ if (FuncInfo.StaticAllocaMap.count(&I))
+ return; // getValue will auto-populate this.
+
+ SDLoc dl = getCurSDLoc();
+ Type *Ty = I.getAllocatedType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto &DL = DAG.getDataLayout();
+ TypeSize TySize = DL.getTypeAllocSize(Ty);
+ MaybeAlign Alignment = std::max(DL.getPrefTypeAlign(Ty), I.getAlign());
+
+ SDValue AllocSize = getValue(I.getArraySize());
+
+ EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout(), I.getAddressSpace());
+ if (AllocSize.getValueType() != IntPtr)
+ AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr);
+
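+ // For scalable types the allocation size is ArraySize * (vscale *
+ // known-min type size); otherwise it is simply ArraySize * fixed type size.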
+ if (TySize.isScalable())
+ AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize,
+ DAG.getVScale(dl, IntPtr,
+ APInt(IntPtr.getScalarSizeInBits(),
+ TySize.getKnownMinValue())));
+ else
+ AllocSize =
+ DAG.getNode(ISD::MUL, dl, IntPtr, AllocSize,
+ DAG.getConstant(TySize.getFixedValue(), dl, IntPtr));
+
+ // Handle alignment. If the requested alignment is less than or equal to
+ // the stack alignment, ignore it. If it is greater than the stack
+ // alignment, we record it in the DYNAMIC_STACKALLOC node.
+ Align StackAlign = DAG.getSubtarget().getFrameLowering()->getStackAlign();
+ if (*Alignment <= StackAlign)
+ Alignment = std::nullopt;
+
+ const uint64_t StackAlignMask = StackAlign.value() - 1U;
+ // Round the size of the allocation up to the stack alignment size
+ // by adding SA-1 to the size. This doesn't overflow because we're computing
+ // an address inside an alloca.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+ AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize,
+ DAG.getConstant(StackAlignMask, dl, IntPtr), Flags);
+
+ // Mask out the low bits for alignment purposes.
+ AllocSize = DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize,
+ DAG.getConstant(~StackAlignMask, dl, IntPtr));
+
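+ // DYNAMIC_STACKALLOC operands: chain, rounded size, and the requested
+ // alignment (0 means the default stack alignment is sufficient).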
+ SDValue Ops[] = {
+ getRoot(), AllocSize,
+ DAG.getConstant(Alignment ? Alignment->value() : 0, dl, IntPtr)};
+ SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
+ SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops);
+ setValue(&I, DSA);
+ DAG.setRoot(DSA.getValue(1));
+
+ assert(FuncInfo.MF->getFrameInfo().hasVarSizedObjects());
+}
+
+static const MDNode *getRangeMetadata(const Instruction &I) {
+ // If !noundef is not present, then !range violation results in a poison
+ // value rather than immediate undefined behavior. In theory, transferring
+ // these annotations to SDAG is fine, but in practice there are key SDAG
+ // transforms that are known not to be poison-safe, such as folding logical
+ // and/or to bitwise and/or. For now, only transfer !range if !noundef is
+ // also present.
+ if (!I.hasMetadata(LLVMContext::MD_noundef))
+ return nullptr;
+ return I.getMetadata(LLVMContext::MD_range);
+}
+
+void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
+ if (I.isAtomic())
+ return visitAtomicLoad(I);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const Value *SV = I.getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return visitLoadFromSwiftError(I);
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return visitLoadFromSwiftError(I);
+ }
+ }
+
+ SDValue Ptr = getValue(SV);
+
+ Type *Ty = I.getType();
+ SmallVector<EVT, 4> ValueVTs, MemVTs;
+ SmallVector<TypeSize, 4> Offsets;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets, 0);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ return;
+
+ Align Alignment = I.getAlign();
+ AAMDNodes AAInfo = I.getAAMetadata();
+ const MDNode *Ranges = getRangeMetadata(I);
+ bool isVolatile = I.isVolatile();
+ MachineMemOperand::Flags MMOFlags =
+ TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+
+ SDValue Root;
+ bool ConstantMemory = false;
+ if (isVolatile)
+ // Serialize volatile loads with other side effects.
+ Root = getRoot();
+ else if (NumValues > MaxParallelChains)
+ Root = getMemoryRoot();
+ else if (AA &&
+ AA->pointsToConstantMemory(MemoryLocation(
+ SV,
+ LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)),
+ AAInfo))) {
+ // Do not serialize (non-volatile) loads of constant memory with anything.
+ Root = DAG.getEntryNode();
+ ConstantMemory = true;
+ MMOFlags |= MachineMemOperand::MOInvariant;
+ } else {
+ // Do not serialize non-volatile loads against each other.
+ Root = DAG.getRoot();
+ }
+
+ SDLoc dl = getCurSDLoc();
+
+ if (isVolatile)
+ Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG);
+
+ SmallVector<SDValue, 4> Values(NumValues);
+ SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
+
+ unsigned ChainI = 0;
+ for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
+ // Serializing loads here may result in excessive register pressure, and
+ // TokenFactor places arbitrary choke points on the scheduler. SD scheduling
+ // could recover a bit by hoisting nodes upward in the chain by recognizing
+ // they are side-effect free or do not alias. The optimizer should really
+ // avoid this case by converting large object/array copies to llvm.memcpy
+ // (MaxParallelChains should always remain as a failsafe).
+ if (ChainI == MaxParallelChains) {
+ assert(PendingLoads.empty() && "PendingLoads must be serialized first");
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(Chains.data(), ChainI));
+ Root = Chain;
+ ChainI = 0;
+ }
+
+ // TODO: MachinePointerInfo only supports a fixed length offset.
+ MachinePointerInfo PtrInfo =
+ !Offsets[i].isScalable() || Offsets[i].isZero()
+ ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
+ : MachinePointerInfo();
+
+ SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
+ SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A, PtrInfo, Alignment,
+ MMOFlags, AAInfo, Ranges);
+ Chains[ChainI] = L.getValue(1);
+
+ if (MemVTs[i] != ValueVTs[i])
+ L = DAG.getPtrExtOrTrunc(L, dl, ValueVTs[i]);
+
+ Values[i] = L;
+ }
+
+ if (!ConstantMemory) {
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(Chains.data(), ChainI));
+ if (isVolatile)
+ DAG.setRoot(Chain);
+ else
+ PendingLoads.push_back(Chain);
+ }
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, dl,
+ DAG.getVTList(ValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
+ assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
+ "call visitStoreToSwiftError when backend supports swifterror");
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ const Value *SrcV = I.getOperand(0);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
+ SrcV->getType(), ValueVTs, &Offsets, 0);
+ assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
+ "expect a single EVT for swifterror");
+
+ SDValue Src = getValue(SrcV);
+ // Create a virtual register, then update the virtual register.
+ Register VReg =
+ SwiftError.getOrCreateVRegDefAt(&I, FuncInfo.MBB, I.getPointerOperand());
+ // Chain, DL, Reg, N or Chain, DL, Reg, N, Glue
+ // Chain can be getRoot or getControlRoot.
+ SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg,
+ SDValue(Src.getNode(), Src.getResNo()));
+ DAG.setRoot(CopyNode);
+}
+
+void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
+ assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
+ "call visitLoadFromSwiftError when backend supports swifterror");
+
+ assert(!I.isVolatile() &&
+ !I.hasMetadata(LLVMContext::MD_nontemporal) &&
+ !I.hasMetadata(LLVMContext::MD_invariant_load) &&
+ "Support volatile, non temporal, invariant for load_from_swift_error");
+
+ const Value *SV = I.getOperand(0);
+ Type *Ty = I.getType();
+ assert(
+ (!AA ||
+ !AA->pointsToConstantMemory(MemoryLocation(
+ SV, LocationSize::precise(DAG.getDataLayout().getTypeStoreSize(Ty)),
+ I.getAAMetadata()))) &&
+ "load_from_swift_error should not be constant memory");
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty,
+ ValueVTs, &Offsets, 0);
+ assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
+ "expect a single EVT for swifterror");
+
+ // Chain, DL, Reg, VT, Glue or Chain, DL, Reg, VT
+ SDValue L = DAG.getCopyFromReg(
+ getRoot(), getCurSDLoc(),
+ SwiftError.getOrCreateVRegUseAt(&I, FuncInfo.MBB, SV), ValueVTs[0]);
+
+ setValue(&I, L);
+}
+
+void SelectionDAGBuilder::visitStore(const StoreInst &I) {
+ if (I.isAtomic())
+ return visitAtomicStore(I);
+
+ const Value *SrcV = I.getOperand(0);
+ const Value *PtrV = I.getOperand(1);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return visitStoreToSwiftError(I);
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return visitStoreToSwiftError(I);
+ }
+ }
+
+ SmallVector<EVT, 4> ValueVTs, MemVTs;
+ SmallVector<TypeSize, 4> Offsets;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
+ SrcV->getType(), ValueVTs, &MemVTs, &Offsets, 0);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ return;
+
+ // Get the lowered operands. Note that we do this after
+ // checking if NumValues is zero, because with zero results
+ // the operands won't have values in the map.
+ SDValue Src = getValue(SrcV);
+ SDValue Ptr = getValue(PtrV);
+
+ SDValue Root = I.isVolatile() ? getRoot() : getMemoryRoot();
+ SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
+ SDLoc dl = getCurSDLoc();
+ Align Alignment = I.getAlign();
+ AAMDNodes AAInfo = I.getAAMetadata();
+
+ auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+
+ unsigned ChainI = 0;
+ for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
+ // See visitLoad comments.
+ if (ChainI == MaxParallelChains) {
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(Chains.data(), ChainI));
+ Root = Chain;
+ ChainI = 0;
+ }
+
+ // TODO: MachinePointerInfo only supports a fixed length offset.
+ MachinePointerInfo PtrInfo =
+ !Offsets[i].isScalable() || Offsets[i].isZero()
+ ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
+ : MachinePointerInfo();
+
+ SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
+ SDValue Val = SDValue(Src.getNode(), Src.getResNo() + i);
+ if (MemVTs[i] != ValueVTs[i])
+ Val = DAG.getPtrExtOrTrunc(Val, dl, MemVTs[i]);
+ SDValue St =
+ DAG.getStore(Root, dl, Val, Add, PtrInfo, Alignment, MMOFlags, AAInfo);
+ Chains[ChainI] = St;
+ }
+
+ SDValue StoreNode = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ ArrayRef(Chains.data(), ChainI));
+ setValue(&I, StoreNode);
+ DAG.setRoot(StoreNode);
+}
+
+void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
+ bool IsCompressing) {
+ SDLoc sdl = getCurSDLoc();
+
+ auto getMaskedStoreOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0,
+ MaybeAlign &Alignment) {
+ // llvm.masked.store.*(Src0, Ptr, alignment, Mask)
+ Src0 = I.getArgOperand(0);
+ Ptr = I.getArgOperand(1);
+ Alignment = cast<ConstantInt>(I.getArgOperand(2))->getMaybeAlignValue();
+ Mask = I.getArgOperand(3);
+ };
+ auto getCompressingStoreOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0,
+ MaybeAlign &Alignment) {
+ // llvm.masked.compressstore.*(Src0, Ptr, Mask)
+ Src0 = I.getArgOperand(0);
+ Ptr = I.getArgOperand(1);
+ Mask = I.getArgOperand(2);
+ Alignment = std::nullopt;
+ };
+
+ Value *PtrOperand, *MaskOperand, *Src0Operand;
+ MaybeAlign Alignment;
+ if (IsCompressing)
+ getCompressingStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
+ else
+ getMaskedStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
+
+ SDValue Ptr = getValue(PtrOperand);
+ SDValue Src0 = getValue(Src0Operand);
+ SDValue Mask = getValue(MaskOperand);
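+ // The store is unindexed (ISD::UNINDEXED), so the offset operand is unused
+ // and left undef.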
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+
+ EVT VT = Src0.getValueType();
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, *Alignment, I.getAAMetadata());
+ SDValue StoreNode =
+ DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
+ ISD::UNINDEXED, false /* Truncating */, IsCompressing);
+ DAG.setRoot(StoreNode);
+ setValue(&I, StoreNode);
+}
+
+// Get a uniform base for the Gather/Scatter intrinsic.
+// The first argument of the Gather/Scatter intrinsic is a vector of pointers.
+// We try to represent it as a base pointer + vector of indices.
+// Usually, the vector of pointers comes from a 'getelementptr' instruction.
+// The first operand of the GEP may be a single pointer or a vector of pointers
+// Example:
+// %gep.ptr = getelementptr i32, <8 x i32*> %vptr, <8 x i32> %ind
+// or
+// %gep.ptr = getelementptr i32, i32* %ptr, <8 x i32> %ind
+// %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.ptr, ..
+//
+// When the first GEP operand is a single pointer, it is the uniform base we
+// are looking for. If the first operand of the GEP is a splat vector, we
+// extract the splat value and use it as the uniform base.
+// In all other cases the function returns 'false'.
+static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
+ ISD::MemIndexType &IndexType, SDValue &Scale,
+ SelectionDAGBuilder *SDB, const BasicBlock *CurBB,
+ uint64_t ElemSize) {
+ SelectionDAG& DAG = SDB->DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+
+ assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+ // Handle splat constant pointer.
+ if (auto *C = dyn_cast<Constant>(Ptr)) {
+ C = C->getSplatValue();
+ if (!C)
+ return false;
+
+ Base = SDB->getValue(C);
+
+ ElementCount NumElts = cast<VectorType>(Ptr->getType())->getElementCount();
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), TLI.getPointerTy(DL), NumElts);
+ Index = DAG.getConstant(0, SDB->getCurSDLoc(), VT);
+ IndexType = ISD::SIGNED_SCALED;
+ Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
+ return true;
+ }
+
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP || GEP->getParent() != CurBB)
+ return false;
+
+ if (GEP->getNumOperands() != 2)
+ return false;
+
+ const Value *BasePtr = GEP->getPointerOperand();
+ const Value *IndexVal = GEP->getOperand(GEP->getNumOperands() - 1);
+
+ // Make sure the base is scalar and the index is a vector.
+ if (BasePtr->getType()->isVectorTy() || !IndexVal->getType()->isVectorTy())
+ return false;
+
+ TypeSize ScaleVal = DL.getTypeAllocSize(GEP->getResultElementType());
+ if (ScaleVal.isScalable())
+ return false;
+
+ // Target may not support the required addressing mode.
+ if (ScaleVal != 1 &&
+ !TLI.isLegalScaleForGatherScatter(ScaleVal.getFixedValue(), ElemSize))
+ return false;
+
+ Base = SDB->getValue(BasePtr);
+ Index = SDB->getValue(IndexVal);
+ IndexType = ISD::SIGNED_SCALED;
+
+ Scale =
+ DAG.getTargetConstant(ScaleVal, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
+ return true;
+}
+
+void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
+ SDLoc sdl = getCurSDLoc();
+
+ // llvm.masked.scatter.*(Src0, Ptrs, alignment, Mask)
+ const Value *Ptr = I.getArgOperand(1);
+ SDValue Src0 = getValue(I.getArgOperand(0));
+ SDValue Mask = getValue(I.getArgOperand(3));
+ EVT VT = Src0.getValueType();
+ Align Alignment = cast<ConstantInt>(I.getArgOperand(2))
+ ->getMaybeAlignValue()
+ .value_or(DAG.getEVTAlign(VT.getScalarType()));
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ SDValue Base;
+ SDValue Index;
+ ISD::MemIndexType IndexType;
+ SDValue Scale;
+ bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+ I.getParent(), VT.getScalarStoreSize());
+
+ unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS), MachineMemOperand::MOStore,
+ // TODO: Make MachineMemOperands aware of scalable
+ // vectors.
+ MemoryLocation::UnknownSize, Alignment, I.getAAMetadata());
+ if (!UniformBase) {
+ Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(Ptr);
+ IndexType = ISD::SIGNED_SCALED;
+ Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+
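+ // The target may request a wider element type for the index vector (EltTy is
+ // an out parameter of shouldExtendGSIndex); sign-extend the indices if so.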
+ EVT IdxVT = Index.getValueType();
+ EVT EltTy = IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+ EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+ Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index);
+ }
+
+ SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale };
+ SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
+ Ops, MMO, IndexType, false);
+ DAG.setRoot(Scatter);
+ setValue(&I, Scatter);
+}
+
+void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
+ SDLoc sdl = getCurSDLoc();
+
+ auto getMaskedLoadOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0,
+ MaybeAlign &Alignment) {
+ // @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
+ Ptr = I.getArgOperand(0);
+ Alignment = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
+ Mask = I.getArgOperand(2);
+ Src0 = I.getArgOperand(3);
+ };
+ auto getExpandingLoadOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0,
+ MaybeAlign &Alignment) {
+ // @llvm.masked.expandload.*(Ptr, Mask, Src0)
+ Ptr = I.getArgOperand(0);
+ Alignment = std::nullopt;
+ Mask = I.getArgOperand(1);
+ Src0 = I.getArgOperand(2);
+ };
+
+ Value *PtrOperand, *MaskOperand, *Src0Operand;
+ MaybeAlign Alignment;
+ if (IsExpanding)
+ getExpandingLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
+ else
+ getMaskedLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
+
+ SDValue Ptr = getValue(PtrOperand);
+ SDValue Src0 = getValue(Src0Operand);
+ SDValue Mask = getValue(MaskOperand);
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+
+ EVT VT = Src0.getValueType();
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT);
+
+ AAMDNodes AAInfo = I.getAAMetadata();
+ const MDNode *Ranges = getRangeMetadata(I);
+
+ // Do not serialize masked loads of constant memory with anything.
+ MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo);
+ bool AddToChain = !AA || !AA->pointsToConstantMemory(ML);
+
+ SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+
+ SDValue Load =
+ DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO,
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding);
+ if (AddToChain)
+ PendingLoads.push_back(Load.getValue(1));
+ setValue(&I, Load);
+}
+
+void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
+ SDLoc sdl = getCurSDLoc();
+
+ // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
+ const Value *Ptr = I.getArgOperand(0);
+ SDValue Src0 = getValue(I.getArgOperand(3));
+ SDValue Mask = getValue(I.getArgOperand(2));
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ Align Alignment = cast<ConstantInt>(I.getArgOperand(1))
+ ->getMaybeAlignValue()
+ .value_or(DAG.getEVTAlign(VT.getScalarType()));
+
+ const MDNode *Ranges = getRangeMetadata(I);
+
+ SDValue Root = DAG.getRoot();
+ SDValue Base;
+ SDValue Index;
+ ISD::MemIndexType IndexType;
+ SDValue Scale;
+ bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+ I.getParent(), VT.getScalarStoreSize());
+ unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS), MachineMemOperand::MOLoad,
+ // TODO: Make MachineMemOperands aware of scalable
+ // vectors.
+ MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges);
+
+ if (!UniformBase) {
+ Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(Ptr);
+ IndexType = ISD::SIGNED_SCALED;
+ Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+
+ EVT IdxVT = Index.getValueType();
+ EVT EltTy = IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+ EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+ Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index);
+ }
+
+ SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
+ SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
+ Ops, MMO, IndexType, ISD::NON_EXTLOAD);
+
+ PendingLoads.push_back(Gather.getValue(1));
+ setValue(&I, Gather);
+}
+
+void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
+ SDLoc dl = getCurSDLoc();
+ AtomicOrdering SuccessOrdering = I.getSuccessOrdering();
+ AtomicOrdering FailureOrdering = I.getFailureOrdering();
+ SyncScope::ID SSID = I.getSyncScopeID();
+
+ SDValue InChain = getRoot();
+
+ MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
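+ // The node produces the loaded value, an i1 success flag, and the out chain.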
+ SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto Flags = TLI.getAtomicMemOperandFlags(I, DAG.getDataLayout());
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
+ DAG.getEVTAlign(MemVT), AAMDNodes(), nullptr, SSID, SuccessOrdering,
+ FailureOrdering);
+
+ SDValue L = DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
+ dl, MemVT, VTs, InChain,
+ getValue(I.getPointerOperand()),
+ getValue(I.getCompareOperand()),
+ getValue(I.getNewValOperand()), MMO);
+
+ SDValue OutChain = L.getValue(2);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
+ SDLoc dl = getCurSDLoc();
+ ISD::NodeType NT;
+ switch (I.getOperation()) {
+ default: llvm_unreachable("Unknown atomicrmw operation");
+ case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break;
+ case AtomicRMWInst::Add: NT = ISD::ATOMIC_LOAD_ADD; break;
+ case AtomicRMWInst::Sub: NT = ISD::ATOMIC_LOAD_SUB; break;
+ case AtomicRMWInst::And: NT = ISD::ATOMIC_LOAD_AND; break;
+ case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break;
+ case AtomicRMWInst::Or: NT = ISD::ATOMIC_LOAD_OR; break;
+ case AtomicRMWInst::Xor: NT = ISD::ATOMIC_LOAD_XOR; break;
+ case AtomicRMWInst::Max: NT = ISD::ATOMIC_LOAD_MAX; break;
+ case AtomicRMWInst::Min: NT = ISD::ATOMIC_LOAD_MIN; break;
+ case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break;
+ case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break;
+ case AtomicRMWInst::FAdd: NT = ISD::ATOMIC_LOAD_FADD; break;
+ case AtomicRMWInst::FSub: NT = ISD::ATOMIC_LOAD_FSUB; break;
+ case AtomicRMWInst::FMax: NT = ISD::ATOMIC_LOAD_FMAX; break;
+ case AtomicRMWInst::FMin: NT = ISD::ATOMIC_LOAD_FMIN; break;
+ case AtomicRMWInst::UIncWrap:
+ NT = ISD::ATOMIC_LOAD_UINC_WRAP;
+ break;
+ case AtomicRMWInst::UDecWrap:
+ NT = ISD::ATOMIC_LOAD_UDEC_WRAP;
+ break;
+ }
+ AtomicOrdering Ordering = I.getOrdering();
+ SyncScope::ID SSID = I.getSyncScopeID();
+
+ SDValue InChain = getRoot();
+
+ auto MemVT = getValue(I.getValOperand()).getSimpleValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto Flags = TLI.getAtomicMemOperandFlags(I, DAG.getDataLayout());
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
+ DAG.getEVTAlign(MemVT), AAMDNodes(), nullptr, SSID, Ordering);
+
+ SDValue L =
+ DAG.getAtomic(NT, dl, MemVT, InChain,
+ getValue(I.getPointerOperand()), getValue(I.getValOperand()),
+ MMO);
+
+ SDValue OutChain = L.getValue(1);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitFence(const FenceInst &I) {
+ SDLoc dl = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Ops[3];
+ Ops[0] = getRoot();
+ Ops[1] = DAG.getTargetConstant((unsigned)I.getOrdering(), dl,
+ TLI.getFenceOperandTy(DAG.getDataLayout()));
+ Ops[2] = DAG.getTargetConstant(I.getSyncScopeID(), dl,
+ TLI.getFenceOperandTy(DAG.getDataLayout()));
+ SDValue N = DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops);
+ setValue(&I, N);
+ DAG.setRoot(N);
+}
+
+void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
+ SDLoc dl = getCurSDLoc();
+ AtomicOrdering Order = I.getOrdering();
+ SyncScope::ID SSID = I.getSyncScopeID();
+
+ SDValue InChain = getRoot();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getType());
+
+ if (!TLI.supportsUnalignedAtomics() &&
+ I.getAlign().value() < MemVT.getSizeInBits() / 8)
+ report_fatal_error("Cannot generate unaligned atomic load");
+
+ auto Flags = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
+ I.getAlign(), AAMDNodes(), nullptr, SSID, Order);
+
+ InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
+
+ SDValue Ptr = getValue(I.getPointerOperand());
+
+ if (TLI.lowerAtomicLoadAsLoadSDNode(I)) {
+ // TODO: Once this is better exercised by tests, it should be merged with
+ // the normal path for loads to prevent future divergence.
+ SDValue L = DAG.getLoad(MemVT, dl, InChain, Ptr, MMO);
+ if (MemVT != VT)
+ L = DAG.getPtrExtOrTrunc(L, dl, VT);
+
+ setValue(&I, L);
+ SDValue OutChain = L.getValue(1);
+ if (!I.isUnordered())
+ DAG.setRoot(OutChain);
+ else
+ PendingLoads.push_back(OutChain);
+ return;
+ }
+
+ SDValue L = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, MemVT, MemVT, InChain,
+ Ptr, MMO);
+
+ SDValue OutChain = L.getValue(1);
+ if (MemVT != VT)
+ L = DAG.getPtrExtOrTrunc(L, dl, VT);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
+ SDLoc dl = getCurSDLoc();
+
+ AtomicOrdering Ordering = I.getOrdering();
+ SyncScope::ID SSID = I.getSyncScopeID();
+
+ SDValue InChain = getRoot();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT MemVT =
+ TLI.getMemValueType(DAG.getDataLayout(), I.getValueOperand()->getType());
+
+ if (!TLI.supportsUnalignedAtomics() &&
+ I.getAlign().value() < MemVT.getSizeInBits() / 8)
+ report_fatal_error("Cannot generate unaligned atomic store");
+
+ auto Flags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
+ I.getAlign(), AAMDNodes(), nullptr, SSID, Ordering);
+
+ SDValue Val = getValue(I.getValueOperand());
+ if (Val.getValueType() != MemVT)
+ Val = DAG.getPtrExtOrTrunc(Val, dl, MemVT);
+ SDValue Ptr = getValue(I.getPointerOperand());
+
+ if (TLI.lowerAtomicStoreAsStoreSDNode(I)) {
+ // TODO: Once this is better exercised by tests, it should be merged with
+ // the normal path for stores to prevent future divergence.
+ SDValue S = DAG.getStore(InChain, dl, Val, Ptr, MMO);
+ setValue(&I, S);
+ DAG.setRoot(S);
+ return;
+ }
+ SDValue OutChain = DAG.getAtomic(ISD::ATOMIC_STORE, dl, MemVT, InChain,
+ Ptr, Val, MMO);
+
+ setValue(&I, OutChain);
+ DAG.setRoot(OutChain);
+}
+
+/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
+/// node.
+void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
+ unsigned Intrinsic) {
+ // Ignore the callsite's attributes. A specific call site may be marked with
+ // readnone, but the lowering code will expect the chain based on the
+ // definition.
+ const Function *F = I.getCalledFunction();
+ bool HasChain = !F->doesNotAccessMemory();
+ bool OnlyLoad = HasChain && F->onlyReadsMemory();
+
+ // Build the operand list.
+ SmallVector<SDValue, 8> Ops;
+ if (HasChain) { // If this intrinsic has side-effects, chainify it.
+ if (OnlyLoad) {
+ // We don't need to serialize loads against other loads.
+ Ops.push_back(DAG.getRoot());
+ } else {
+ Ops.push_back(getRoot());
+ }
+ }
+
+ // Info is set by getTgtMemIntrinsic
+ TargetLowering::IntrinsicInfo Info;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I,
+ DAG.getMachineFunction(),
+ Intrinsic);
+
+ // Add the intrinsic ID as an integer operand unless this is a target memory
+ // intrinsic lowered to a target-specific opcode; generic INTRINSIC_VOID and
+ // INTRINSIC_W_CHAIN nodes still carry the ID.
+ if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
+ Info.opc == ISD::INTRINSIC_W_CHAIN)
+ Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
+ TLI.getPointerTy(DAG.getDataLayout())));
+
+ // Add all operands of the call to the operand list.
+ for (unsigned i = 0, e = I.arg_size(); i != e; ++i) {
+ const Value *Arg = I.getArgOperand(i);
+ if (!I.paramHasAttr(i, Attribute::ImmArg)) {
+ Ops.push_back(getValue(Arg));
+ continue;
+ }
+
+ // Use TargetConstant instead of a regular constant for immarg.
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), Arg->getType(), true);
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Arg)) {
+ assert(CI->getBitWidth() <= 64 &&
+ "large intrinsic immediates not handled");
+ Ops.push_back(DAG.getTargetConstant(*CI, SDLoc(), VT));
+ } else {
+ Ops.push_back(
+ DAG.getTargetConstantFP(*cast<ConstantFP>(Arg), SDLoc(), VT));
+ }
+ }
+
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
+
+ if (HasChain)
+ ValueVTs.push_back(MVT::Other);
+
+ SDVTList VTs = DAG.getVTList(ValueVTs);
+
+ // Propagate fast-math-flags from IR to node(s).
+ SDNodeFlags Flags;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*FPMO);
+ SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
+
+ // Create the node.
+ SDValue Result;
+ // In some cases, custom collection of operands from CallInst I may be needed.
+ TLI.CollectTargetIntrinsicOperands(I, Ops, DAG);
+ if (IsTgtIntrinsic) {
+ // This is a target intrinsic that touches memory.
+ //
+ // TODO: We currently just fall back to address space 0 if getTgtMemIntrinsic
+ // didn't yield anything useful.
+ MachinePointerInfo MPI;
+ if (Info.ptrVal)
+ MPI = MachinePointerInfo(Info.ptrVal, Info.offset);
+ else if (Info.fallbackAddressSpace)
+ MPI = MachinePointerInfo(*Info.fallbackAddressSpace);
+ Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops,
+ Info.memVT, MPI, Info.align, Info.flags,
+ Info.size, I.getAAMetadata());
+ } else if (!HasChain) {
+ Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
+ } else if (!I.getType()->isVoidTy()) {
+ Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
+ } else {
+ Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
+ }
+
+ if (HasChain) {
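+ // The chain is always the node's last result.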
+ SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
+ if (OnlyLoad)
+ PendingLoads.push_back(Chain);
+ else
+ DAG.setRoot(Chain);
+ }
+
+ if (!I.getType()->isVoidTy()) {
+ if (!isa<VectorType>(I.getType()))
+ Result = lowerRangeToAssertZExt(DAG, I, Result);
+
+ MaybeAlign Alignment = I.getRetAlign();
+
+ // Insert `assertalign` node if there's an alignment.
+ if (InsertAssertAlign && Alignment) {
+ Result =
+ DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
+ }
+
+ setValue(&I, Result);
+ }
+}
+
+/// GetSignificand - Get the significand and build it into a floating-point
+/// number with an exponent of 1:
+///
+/// Op = (Op & 0x007fffff) | 0x3f800000;
+///
+/// where Op is the i32 bit pattern of a floating-point value.
+static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, const SDLoc &dl) {
+ SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+ DAG.getConstant(0x007fffff, dl, MVT::i32));
+ SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1,
+ DAG.getConstant(0x3f800000, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::f32, t2);
+}
+
+/// GetExponent - Get the exponent:
+///
+/// (float)(int)(((Op & 0x7f800000) >> 23) - 127);
+///
+/// where Op is the i32 bit pattern of a floating-point value.
+static SDValue GetExponent(SelectionDAG &DAG, SDValue Op,
+ const TargetLowering &TLI, const SDLoc &dl) {
+ SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+ DAG.getConstant(0x7f800000, dl, MVT::i32));
+ SDValue t1 = DAG.getNode(
+ ISD::SRL, dl, MVT::i32, t0,
+ DAG.getConstant(23, dl,
+ TLI.getShiftAmountTy(MVT::i32, DAG.getDataLayout())));
+ SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1,
+ DAG.getConstant(127, dl, MVT::i32));
+ return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2);
+}
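+
+// Worked example for the two helpers above (illustrative only): for
+// Op = 8.0f, whose IEEE-754 bits are 0x41000000, GetExponent computes
+// ((0x41000000 & 0x7f800000) >> 23) - 127 = 130 - 127 = 3, and
+// GetSignificand produces (0x41000000 & 0x007fffff) | 0x3f800000 =
+// 0x3f800000 = 1.0f, so in the log expansions below log2(8.0f)
+// decomposes as 3 + log2(1.0f).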
+
+/// getF32Constant - Get 32-bit floating point constant.
+static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt,
+ const SDLoc &dl) {
+ return DAG.getConstantFP(APFloat(APFloat::IEEEsingle(), APInt(32, Flt)), dl,
+ MVT::f32);
+}
+
+static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ // TODO: What fast-math-flags should be set on the floating-point nodes?
+
+  // IntegerPartOfX = (int32_t)t0;
+ SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);
+
+ // FractionalPartOfX = t0 - (float)IntegerPartOfX;
+ SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+ SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);
+
+ // IntegerPartOfX <<= 23;
+ IntegerPartOfX =
+ DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+ DAG.getConstant(23, dl,
+ DAG.getTargetLoweringInfo().getShiftAmountTy(
+ MVT::i32, DAG.getDataLayout())));
+
+ SDValue TwoToFractionalPartOfX;
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.997535578f +
+ // (0.735607626f + 0.252464424f * x) * x;
+ //
+ // error 0.0144103317, which is 6 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3e814304, dl));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f3c50c8, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f7f5e7e, dl));
+ } else if (LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999892986f +
+ // (0.696457318f +
+ // (0.224338339f + 0.792043434e-1f * x) * x) * x;
+ //
+ // error 0.000107046256, which is 13 to 14 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3da235e3, dl));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3e65b8f3, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f324b07, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3f7ff8fd, dl));
+ } else { // LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999999982f +
+ // (0.693148872f +
+ // (0.240227044f +
+ // (0.554906021e-1f +
+ // (0.961591928e-2f +
+ // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+ // error 2.47208000*10^(-7), which is better than 18 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3924b03e, dl));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3ab24b87, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3c1d8c17, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3d634a1d, dl));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x3e75fe14, dl));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x3f317234, dl));
+ SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+ TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+ getF32Constant(DAG, 0x3f800000, dl));
+ }
+
+  // Add the exponent into the result in the integer domain.
+ SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFractionalPartOfX);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, t13, IntegerPartOfX));
+}
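+
+// Illustration of the decomposition used above (not generated code): for
+// t0 = 3.25, IntegerPartOfX is 3 and the fractional part X is 0.25. The
+// polynomial approximates 2^0.25 ~= 1.189, and adding (3 << 23) to that
+// value's bit pattern scales it by 2^3, giving ~9.514 ~= 2^3.25.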
+
+/// expandExp - Lower an exp intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDNodeFlags Flags) {
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+
+ // Put the exponent in the right bit position for later addition to the
+ // final result:
+ //
+ // t0 = Op * log2(e)
+
+ // TODO: What fast-math-flags should be set here?
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op,
+ DAG.getConstantFP(numbers::log2ef, dl, MVT::f32));
+ return getLimitedPrecisionExp2(t0, dl, DAG);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op, Flags);
+}
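+
+// For example, exp(1.0f) is computed above as 2^(1.0 * log2(e)) =
+// 2^1.442695 ~= 2.71828 via the limited-precision exp2 expansion.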
+
+/// expandLog - Lower a log intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDNodeFlags Flags) {
+ // TODO: What fast-math-flags should be set on the floating-point nodes?
+
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+ // Scale the exponent by log(2).
+ SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+ SDValue LogOfExponent =
+ DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+ DAG.getConstantFP(numbers::ln2f, dl, MVT::f32));
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
+
+ SDValue LogOfMantissa;
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // LogofMantissa =
+ // -1.1609546f +
+ // (1.4034025f - 0.23903021f * x) * x;
+ //
+ // error 0.0034276066, which is better than 8 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbe74c456, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3fb3a2b1, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f949a29, dl));
+ } else if (LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // LogOfMantissa =
+ // -1.7417939f +
+ // (2.8212026f +
+ // (-1.4699568f +
+ // (0.44717955f - 0.56570851e-1f * x) * x) * x) * x;
+ //
+ // error 0.000061011436, which is 14 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbd67b6d6, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3ee4f4b8, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fbc278b, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40348e95, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3fdef31a, dl));
+ } else { // LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // LogOfMantissa =
+ // -2.1072184f +
+ // (4.2372794f +
+ // (-3.7029485f +
+ // (2.2781945f +
+ // (-0.87823314f +
+ // (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x;
+ //
+ // error 0.0000023660568, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbc91e5ac, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e4350aa, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f60d3e3, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x4011cdf0, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x406cfd1c, dl));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x408797cb, dl));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x4006dcab, dl));
+ }
+
+ return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, LogOfMantissa);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op, Flags);
+}
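+
+// For example, log(8.0f) is computed above as 3 * ln(2) + log(1.0f) =
+// 2.07944 + 0, using the exponent/significand split from GetExponent and
+// GetSignificand.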
+
+/// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDNodeFlags Flags) {
+ // TODO: What fast-math-flags should be set on the floating-point nodes?
+
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+ // Get the exponent.
+ SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl);
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
+
+    // Different possible minimax approximations of the significand in
+    // floating-point for various degrees of accuracy over [1,2].
+ SDValue Log2ofMantissa;
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x;
+ //
+ // error 0.0049451742, which is more than 7 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbeb08fe0, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x40019463, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fd6633d, dl));
+ } else if (LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // Log2ofMantissa =
+ // -2.51285454f +
+ // (4.07009056f +
+ // (-2.12067489f +
+ // (.645142248f - 0.816157886e-1f * x) * x) * x) * x;
+ //
+ // error 0.0000876136000, which is better than 13 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbda7262e, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3f25280b, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x4007b923, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40823e2f, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x4020d29c, dl));
+ } else { // LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // Log2ofMantissa =
+ // -3.0400495f +
+ // (6.1129976f +
+ // (-5.3420409f +
+ // (3.2865683f +
+ // (-1.2669343f +
+ // (0.27515199f -
+ // 0.25691327e-1f * x) * x) * x) * x) * x) * x;
+ //
+ // error 0.0000018516, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbcd2769e, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e8ce0b9, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fa22ae7, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40525723, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x40aaf200, dl));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x40c39dad, dl));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x4042902c, dl));
+ }
+
+ return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log2ofMantissa);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op, Flags);
+}
+
+/// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDNodeFlags Flags) {
+ // TODO: What fast-math-flags should be set on the floating-point nodes?
+
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+ // Scale the exponent by log10(2) [0.30102999f].
+ SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+ SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+ getF32Constant(DAG, 0x3e9a209a, dl));
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
+
+ SDValue Log10ofMantissa;
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // Log10ofMantissa =
+ // -0.50419619f +
+ // (0.60948995f - 0.10380950f * x) * x;
+ //
+ // error 0.0014886165, which is 6 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbdd49a13, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3f1c0789, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f011300, dl));
+ } else if (LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // Log10ofMantissa =
+ // -0.64831180f +
+ // (0.91751397f +
+ // (-0.31664806f + 0.47637168e-1f * x) * x) * x;
+ //
+ // error 0.00019228036, which is better than 12 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3d431f31, dl));
+ SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3ea21fb2, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f6ae232, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f25f7c3, dl));
+ } else { // LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // Log10ofMantissa =
+ // -0.84299375f +
+ // (1.5327582f +
+ // (-1.0688956f +
+ // (0.49102474f +
+ // (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x;
+ //
+ // error 0.0000037995730, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3c5d51ce, dl));
+ SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e00685a, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3efb6798, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f88d192, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3fc4316c, dl));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x3f57ce70, dl));
+ }
+
+ return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log10ofMantissa);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op, Flags);
+}
+
+/// expandExp2 - Lower an exp2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandExp2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDNodeFlags Flags) {
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18)
+ return getLimitedPrecisionExp2(Op, dl, DAG);
+
+ // No special expansion.
+ return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op, Flags);
+}
+
+/// expandPow - Lower a pow intrinsic. Handles the special sequence for
+/// limited-precision mode when the base is exactly 10.0f.
+static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG, const TargetLowering &TLI,
+ SDNodeFlags Flags) {
+ bool IsExp10 = false;
+ if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ if (ConstantFPSDNode *LHSC = dyn_cast<ConstantFPSDNode>(LHS)) {
+ APFloat Ten(10.0f);
+ IsExp10 = LHSC->isExactlyValue(Ten);
+ }
+ }
+
+ // TODO: What fast-math-flags should be set on the FMUL node?
+ if (IsExp10) {
+ // Put the exponent in the right bit position for later addition to the
+ // final result:
+ //
+ // #define LOG2OF10 3.3219281f
+ // t0 = Op * LOG2OF10;
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, RHS,
+ getF32Constant(DAG, 0x40549a78, dl));
+ return getLimitedPrecisionExp2(t0, dl, DAG);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS, Flags);
+}
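+
+// For example, pow(10.0f, 2.0f) is rewritten above as 2^(2.0 * log2(10)) =
+// 2^6.643856 ~= 100, reusing the limited-precision exp2 expansion.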
+
+/// ExpandPowI - Expand a llvm.powi intrinsic.
+static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG) {
+ // If RHS is a constant, we can expand this out to a multiplication tree if
+ // it's beneficial on the target, otherwise we end up lowering to a call to
+ // __powidf2 (for example).
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ unsigned Val = RHSC->getSExtValue();
+
+ // powi(x, 0) -> 1.0
+ if (Val == 0)
+ return DAG.getConstantFP(1.0, DL, LHS.getValueType());
+
+ if (DAG.getTargetLoweringInfo().isBeneficialToExpandPowI(
+ Val, DAG.shouldOptForSize())) {
+ // Get the exponent as a positive value.
+ if ((int)Val < 0)
+ Val = -Val;
+ // We use the simple binary decomposition method to generate the multiply
+ // sequence. There are more optimal ways to do this (for example,
+ // powi(x,15) generates one more multiply than it should), but this has
+ // the benefit of being both really simple and much better than a libcall.
+ SDValue Res; // Logically starts equal to 1.0
+ SDValue CurSquare = LHS;
+ // TODO: Intrinsics should have fast-math-flags that propagate to these
+ // nodes.
+ while (Val) {
+ if (Val & 1) {
+ if (Res.getNode())
+ Res =
+ DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare);
+ else
+ Res = CurSquare; // 1.0*CurSquare.
+ }
+
+ CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
+ CurSquare, CurSquare);
+ Val >>= 1;
+ }
+
+ // If the original was negative, invert the result, producing 1/(x*x*x).
+ if (RHSC->getSExtValue() < 0)
+ Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
+ DAG.getConstantFP(1.0, DL, LHS.getValueType()), Res);
+ return Res;
+ }
+ }
+
+ // Otherwise, expand to a libcall.
+ return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
+}
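+
+// Example of the binary decomposition above: for powi(x, 13), 13 = 0b1101, so
+// the loop builds successive squarings (x^2, x^4, x^8, ...) and combines the
+// set bits into Res = x * x^4 * x^8 = x^13; powi(x, -13) then becomes
+// 1.0 / x^13.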
+
+static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
+ SDValue LHS, SDValue RHS, SDValue Scale,
+ SelectionDAG &DAG, const TargetLowering &TLI) {
+ EVT VT = LHS.getValueType();
+ bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT;
+ bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT;
+ LLVMContext &Ctx = *DAG.getContext();
+
+ // If the type is legal but the operation isn't, this node might survive all
+ // the way to operation legalization. If we end up there and we do not have
+ // the ability to widen the type (if VT*2 is not legal), we cannot expand the
+ // node.
+
+ // Coax the legalizer into expanding the node during type legalization instead
+ // by bumping the size by one bit. This will force it to Promote, enabling the
+ // early expansion and avoiding the need to expand later.
+
+ // We don't have to do this if Scale is 0; that can always be expanded, unless
+ // it's a saturating signed operation. Those can experience true integer
+ // division overflow, a case which we must avoid.
+
+ // FIXME: We wouldn't have to do this (or any of the early
+ // expansion/promotion) if it was possible to expand a libcall of an
+ // illegal type during operation legalization. But it's not, so things
+ // get a bit hacky.
+ unsigned ScaleInt = cast<ConstantSDNode>(Scale)->getZExtValue();
+ if ((ScaleInt > 0 || (Saturating && Signed)) &&
+ (TLI.isTypeLegal(VT) ||
+ (VT.isVector() && TLI.isTypeLegal(VT.getVectorElementType())))) {
+ TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction(
+ Opcode, VT, ScaleInt);
+ if (Action != TargetLowering::Legal && Action != TargetLowering::Custom) {
+ EVT PromVT;
+ if (VT.isScalarInteger())
+ PromVT = EVT::getIntegerVT(Ctx, VT.getSizeInBits() + 1);
+ else if (VT.isVector()) {
+ PromVT = VT.getVectorElementType();
+ PromVT = EVT::getIntegerVT(Ctx, PromVT.getSizeInBits() + 1);
+ PromVT = EVT::getVectorVT(Ctx, PromVT, VT.getVectorElementCount());
+ } else
+ llvm_unreachable("Wrong VT for DIVFIX?");
+ LHS = DAG.getExtOrTrunc(Signed, LHS, DL, PromVT);
+ RHS = DAG.getExtOrTrunc(Signed, RHS, DL, PromVT);
+ EVT ShiftTy = TLI.getShiftAmountTy(PromVT, DAG.getDataLayout());
+ // For saturating operations, we need to shift up the LHS to get the
+ // proper saturation width, and then shift down again afterwards.
+ if (Saturating)
+ LHS = DAG.getNode(ISD::SHL, DL, PromVT, LHS,
+ DAG.getConstant(1, DL, ShiftTy));
+ SDValue Res = DAG.getNode(Opcode, DL, PromVT, LHS, RHS, Scale);
+ if (Saturating)
+ Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, PromVT, Res,
+ DAG.getConstant(1, DL, ShiftTy));
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+ }
+
+ return DAG.getNode(Opcode, DL, VT, LHS, RHS, Scale);
+}
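+
+// Rough intuition for the saturating case above: widening the type by one bit
+// doubles the saturation bounds. Pre-shifting LHS left by 1 doubles the exact
+// quotient as well, so after the arithmetic shift right by 1 the result
+// saturates at the bounds of the original, narrower type.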
+
+// getUnderlyingArgRegs - Find the underlying registers used for a truncated,
+// bitcasted, or split argument. Returns a list of <Register, size in bits>
+// pairs.
+static void
+getUnderlyingArgRegs(SmallVectorImpl<std::pair<unsigned, TypeSize>> &Regs,
+ const SDValue &N) {
+ switch (N.getOpcode()) {
+ case ISD::CopyFromReg: {
+ SDValue Op = N.getOperand(1);
+ Regs.emplace_back(cast<RegisterSDNode>(Op)->getReg(),
+ Op.getValueType().getSizeInBits());
+ return;
+ }
+ case ISD::BITCAST:
+ case ISD::AssertZext:
+ case ISD::AssertSext:
+ case ISD::TRUNCATE:
+ getUnderlyingArgRegs(Regs, N.getOperand(0));
+ return;
+ case ISD::BUILD_PAIR:
+ case ISD::BUILD_VECTOR:
+ case ISD::CONCAT_VECTORS:
+ for (SDValue Op : N->op_values())
+ getUnderlyingArgRegs(Regs, Op);
+ return;
+ default:
+ return;
+ }
+}
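+
+// For example, an i64 argument that was split into two i32 parts and rebuilt
+// with an ISD::BUILD_PAIR of two CopyFromReg nodes yields both underlying
+// registers here, each reported with a 32-bit size.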
+
+/// If the DbgValueInst is a dbg_value of a function argument, create the
+/// corresponding DBG_VALUE machine instruction for it now. At the end of
+/// instruction selection, these instructions will be inserted into the entry
+/// BB. We don't currently support this for variadic dbg_values, as they
+/// shouldn't appear for function arguments or in the prologue.
+bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
+ const Value *V, DILocalVariable *Variable, DIExpression *Expr,
+ DILocation *DL, FuncArgumentDbgValueKind Kind, const SDValue &N) {
+ const Argument *Arg = dyn_cast<Argument>(V);
+ if (!Arg)
+ return false;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
+
+ // Helper to create DBG_INSTR_REFs or DBG_VALUEs, depending on what kind
+ // we've been asked to pursue.
+ auto MakeVRegDbgValue = [&](Register Reg, DIExpression *FragExpr,
+ bool Indirect) {
+ if (Reg.isVirtual() && MF.useDebugInstrRef()) {
+ // For VRegs, in instruction referencing mode, create a DBG_INSTR_REF
+ // pointing at the VReg, which will be patched up later.
+ auto &Inst = TII->get(TargetOpcode::DBG_INSTR_REF);
+ SmallVector<MachineOperand, 1> MOs({MachineOperand::CreateReg(
+ /* Reg */ Reg, /* isDef */ false, /* isImp */ false,
+ /* isKill */ false, /* isDead */ false,
+ /* isUndef */ false, /* isEarlyClobber */ false,
+ /* SubReg */ 0, /* isDebug */ true)});
+
+ auto *NewDIExpr = FragExpr;
+      // DBG_INSTR_REF has no "Indirect" field, so fold the indirection into
+      // the DIExpression.
+ if (Indirect)
+ NewDIExpr = DIExpression::prepend(FragExpr, DIExpression::DerefBefore);
+ SmallVector<uint64_t, 2> Ops({dwarf::DW_OP_LLVM_arg, 0});
+ NewDIExpr = DIExpression::prependOpcodes(NewDIExpr, Ops);
+ return BuildMI(MF, DL, Inst, false, MOs, Variable, NewDIExpr);
+ } else {
+ // Create a completely standard DBG_VALUE.
+ auto &Inst = TII->get(TargetOpcode::DBG_VALUE);
+ return BuildMI(MF, DL, Inst, Indirect, Reg, Variable, FragExpr);
+ }
+ };
+
+ if (Kind == FuncArgumentDbgValueKind::Value) {
+ // ArgDbgValues are hoisted to the beginning of the entry block. So we
+ // should only emit as ArgDbgValue if the dbg.value intrinsic is found in
+ // the entry block.
+ bool IsInEntryBlock = FuncInfo.MBB == &FuncInfo.MF->front();
+ if (!IsInEntryBlock)
+ return false;
+
+    // ArgDbgValues are hoisted to the beginning of the entry block. So we
+    // should only emit as ArgDbgValue if the dbg.value intrinsic describes a
+    // variable that is also a parameter of the current function.
+    //
+    // However, if we are already at the top of the entry block, we can still
+    // emit using ArgDbgValue. This might catch situations where the dbg.value
+    // refers to an argument that isn't used in the entry block, so any
+    // CopyToReg node would be optimized out and the only way to express this
+    // DBG_VALUE is by using the physical reg (or FI) as done in this method.
+ bool VariableIsFunctionInputArg = Variable->isParameter() &&
+ !DL->getInlinedAt();
+ bool IsInPrologue = SDNodeOrder == LowestSDNodeOrder;
+ if (!IsInPrologue && !VariableIsFunctionInputArg)
+ return false;
+
+    // Here we assume that a function argument at the IR level can only be
+    // used to describe one input parameter at the source level. If we, for
+    // example, have source code like this
+ //
+ // struct A { long x, y; };
+ // void foo(struct A a, long b) {
+ // ...
+ // b = a.x;
+ // ...
+ // }
+ //
+ // and IR like this
+ //
+ // define void @foo(i32 %a1, i32 %a2, i32 %b) {
+ // entry:
+ // call void @llvm.dbg.value(metadata i32 %a1, "a", DW_OP_LLVM_fragment
+ // call void @llvm.dbg.value(metadata i32 %a2, "a", DW_OP_LLVM_fragment
+ // call void @llvm.dbg.value(metadata i32 %b, "b",
+ // ...
+ // call void @llvm.dbg.value(metadata i32 %a1, "b"
+ // ...
+ //
+    // then the last dbg.value describes the parameter "b" using a value that
+    // is an argument. But since we have already used %a1 to describe a
+    // parameter, we should not handle that last dbg.value here (that would
+    // result in an incorrect hoisting of the DBG_VALUE to the function
+    // entry). Notice that we allow one dbg.value per IR-level argument, to
+    // accommodate the situation with fragments above.
+ if (VariableIsFunctionInputArg) {
+ unsigned ArgNo = Arg->getArgNo();
+ if (ArgNo >= FuncInfo.DescribedArgs.size())
+ FuncInfo.DescribedArgs.resize(ArgNo + 1, false);
+ else if (!IsInPrologue && FuncInfo.DescribedArgs.test(ArgNo))
+ return false;
+ FuncInfo.DescribedArgs.set(ArgNo);
+ }
+ }
+
+ bool IsIndirect = false;
+ std::optional<MachineOperand> Op;
+ // Some arguments' frame index is recorded during argument lowering.
+ int FI = FuncInfo.getArgumentFrameIndex(Arg);
+ if (FI != std::numeric_limits<int>::max())
+ Op = MachineOperand::CreateFI(FI);
+
+ SmallVector<std::pair<unsigned, TypeSize>, 8> ArgRegsAndSizes;
+ if (!Op && N.getNode()) {
+ getUnderlyingArgRegs(ArgRegsAndSizes, N);
+ Register Reg;
+ if (ArgRegsAndSizes.size() == 1)
+ Reg = ArgRegsAndSizes.front().first;
+
+ if (Reg && Reg.isVirtual()) {
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ Register PR = RegInfo.getLiveInPhysReg(Reg);
+ if (PR)
+ Reg = PR;
+ }
+ if (Reg) {
+ Op = MachineOperand::CreateReg(Reg, false);
+ IsIndirect = Kind != FuncArgumentDbgValueKind::Value;
+ }
+ }
+
+ if (!Op && N.getNode()) {
+ // Check if frame index is available.
+ SDValue LCandidate = peekThroughBitcasts(N);
+ if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(LCandidate.getNode()))
+ if (FrameIndexSDNode *FINode =
+ dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
+ Op = MachineOperand::CreateFI(FINode->getIndex());
+ }
+
+ if (!Op) {
+    // Emit a DBG_VALUE for each of the registers the value was split into.
+ auto splitMultiRegDbgValue = [&](ArrayRef<std::pair<unsigned, TypeSize>>
+ SplitRegs) {
+ unsigned Offset = 0;
+ for (const auto &RegAndSize : SplitRegs) {
+ // If the expression is already a fragment, the current register
+ // offset+size might extend beyond the fragment. In this case, only
+ // the register bits that are inside the fragment are relevant.
+ int RegFragmentSizeInBits = RegAndSize.second;
+ if (auto ExprFragmentInfo = Expr->getFragmentInfo()) {
+ uint64_t ExprFragmentSizeInBits = ExprFragmentInfo->SizeInBits;
+ // The register is entirely outside the expression fragment,
+ // so is irrelevant for debug info.
+ if (Offset >= ExprFragmentSizeInBits)
+ break;
+ // The register is partially outside the expression fragment, only
+ // the low bits within the fragment are relevant for debug info.
+ if (Offset + RegFragmentSizeInBits > ExprFragmentSizeInBits) {
+ RegFragmentSizeInBits = ExprFragmentSizeInBits - Offset;
+ }
+ }
+
+ auto FragmentExpr = DIExpression::createFragmentExpression(
+ Expr, Offset, RegFragmentSizeInBits);
+ Offset += RegAndSize.second;
+ // If a valid fragment expression cannot be created, the variable's
+ // correct value cannot be determined and so it is set as Undef.
+ if (!FragmentExpr) {
+ SDDbgValue *SDV = DAG.getConstantDbgValue(
+ Variable, Expr, UndefValue::get(V->getType()), DL, SDNodeOrder);
+ DAG.AddDbgValue(SDV, false);
+ continue;
+ }
+ MachineInstr *NewMI =
+ MakeVRegDbgValue(RegAndSize.first, *FragmentExpr,
+ Kind != FuncArgumentDbgValueKind::Value);
+ FuncInfo.ArgDbgValues.push_back(NewMI);
+ }
+ };
+
+ // Check if ValueMap has reg number.
+ DenseMap<const Value *, Register>::const_iterator
+ VMI = FuncInfo.ValueMap.find(V);
+ if (VMI != FuncInfo.ValueMap.end()) {
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second,
+ V->getType(), std::nullopt);
+ if (RFV.occupiesMultipleRegs()) {
+ splitMultiRegDbgValue(RFV.getRegsAndSizes());
+ return true;
+ }
+
+ Op = MachineOperand::CreateReg(VMI->second, false);
+ IsIndirect = Kind != FuncArgumentDbgValueKind::Value;
+ } else if (ArgRegsAndSizes.size() > 1) {
+ // This was split due to the calling convention, and no virtual register
+ // mapping exists for the value.
+ splitMultiRegDbgValue(ArgRegsAndSizes);
+ return true;
+ }
+ }
+
+ if (!Op)
+ return false;
+
+ // If the expression refers to the entry value of an Argument, use the
+ // corresponding livein physical register. As per the Verifier, this is only
+ // allowed for swiftasync Arguments.
+ if (Op->isReg() && Expr->isEntryValue()) {
+ assert(Arg->hasAttribute(Attribute::AttrKind::SwiftAsync));
+ auto OpReg = Op->getReg();
+ for (auto [PhysReg, VirtReg] : FuncInfo.RegInfo->liveins())
+ if (OpReg == VirtReg || OpReg == PhysReg) {
+ SDDbgValue *SDV = DAG.getVRegDbgValue(
+ Variable, Expr, PhysReg,
+ Kind != FuncArgumentDbgValueKind::Value /*is indirect*/, DL,
+ SDNodeOrder);
+ DAG.AddDbgValue(SDV, false /*treat as dbg.declare byval parameter*/);
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << "Dropping dbg.value: expression is entry_value but "
+ "couldn't find a physical register\n");
+ return true;
+ }
+
+ assert(Variable->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ MachineInstr *NewMI = nullptr;
+
+ if (Op->isReg())
+ NewMI = MakeVRegDbgValue(Op->getReg(), Expr, IsIndirect);
+ else
+ NewMI = BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), true, *Op,
+ Variable, Expr);
+
+  // Record the new instruction; ArgDbgValues are inserted into the entry
+  // block at the end of instruction selection.
+ FuncInfo.ArgDbgValues.push_back(NewMI);
+ return true;
+}
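+
+// As an illustration of the splitting above: a struct argument occupying two
+// 32-bit registers is described by two DBG_VALUEs (or DBG_INSTR_REFs), one
+// per register, whose expressions carry DW_OP_LLVM_fragment pieces covering
+// bits 0-31 and 32-63 of the variable respectively.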
+
+/// Return the appropriate SDDbgValue based on N.
+SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
+ DILocalVariable *Variable,
+ DIExpression *Expr,
+ const DebugLoc &dl,
+ unsigned DbgSDNodeOrder) {
+ if (auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode())) {
+ // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe
+ // stack slot locations.
+ //
+ // Consider "int x = 0; int *px = &x;". There are two kinds of interesting
+ // debug values here after optimization:
+ //
+ // dbg.value(i32* %px, !"int *px", !DIExpression()), and
+ // dbg.value(i32* %px, !"int x", !DIExpression(DW_OP_deref))
+ //
+ // Both describe the direct values of their associated variables.
+ return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(),
+ /*IsIndirect*/ false, dl, DbgSDNodeOrder);
+ }
+ return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(),
+ /*IsIndirect*/ false, dl, DbgSDNodeOrder);
+}
+
+static unsigned FixedPointIntrinsicToOpcode(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ case Intrinsic::smul_fix:
+ return ISD::SMULFIX;
+ case Intrinsic::umul_fix:
+ return ISD::UMULFIX;
+ case Intrinsic::smul_fix_sat:
+ return ISD::SMULFIXSAT;
+ case Intrinsic::umul_fix_sat:
+ return ISD::UMULFIXSAT;
+ case Intrinsic::sdiv_fix:
+ return ISD::SDIVFIX;
+ case Intrinsic::udiv_fix:
+ return ISD::UDIVFIX;
+ case Intrinsic::sdiv_fix_sat:
+ return ISD::SDIVFIXSAT;
+ case Intrinsic::udiv_fix_sat:
+ return ISD::UDIVFIXSAT;
+ default:
+ llvm_unreachable("Unhandled fixed point intrinsic");
+ }
+}
+
+void SelectionDAGBuilder::lowerCallToExternalSymbol(const CallInst &I,
+ const char *FunctionName) {
+ assert(FunctionName && "FunctionName must not be nullptr");
+ SDValue Callee = DAG.getExternalSymbol(
+ FunctionName,
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
+ LowerCallTo(I, Callee, I.isTailCall(), I.isMustTailCall());
+}
+
+/// Given a @llvm.call.preallocated.setup, return the corresponding
+/// preallocated call.
+static const CallBase *FindPreallocatedCall(const Value *PreallocatedSetup) {
+ assert(cast<CallBase>(PreallocatedSetup)
+ ->getCalledFunction()
+ ->getIntrinsicID() == Intrinsic::call_preallocated_setup &&
+ "expected call_preallocated_setup Value");
+ for (const auto *U : PreallocatedSetup->users()) {
+ auto *UseCall = cast<CallBase>(U);
+ const Function *Fn = UseCall->getCalledFunction();
+ if (!Fn || Fn->getIntrinsicID() != Intrinsic::call_preallocated_arg) {
+ return UseCall;
+ }
+ }
+ llvm_unreachable("expected corresponding call to preallocated setup/arg");
+}
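+
+// Roughly, the IR shape this walks (illustrative): the token produced by
+// @llvm.call.preallocated.setup is used both by @llvm.call.preallocated.arg
+// calls and, via a "preallocated" operand bundle, by the eventual call site;
+// the loop above skips the .arg users and returns that call site.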
+
+/// Lower the call to the specified intrinsic function.
+void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
+ unsigned Intrinsic) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc sdl = getCurSDLoc();
+ DebugLoc dl = getCurDebugLoc();
+ SDValue Res;
+
+ SDNodeFlags Flags;
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&I))
+ Flags.copyFMF(*FPOp);
+
+ switch (Intrinsic) {
+ default:
+ // By default, turn this into a target intrinsic node.
+ visitTargetIntrinsic(I, Intrinsic);
+ return;
+ case Intrinsic::vscale: {
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I, DAG.getVScale(sdl, VT, APInt(VT.getSizeInBits(), 1)));
+ return;
+ }
+ case Intrinsic::vastart: visitVAStart(I); return;
+ case Intrinsic::vaend: visitVAEnd(I); return;
+ case Intrinsic::vacopy: visitVACopy(I); return;
+ case Intrinsic::returnaddress:
+ setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl,
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ getValue(I.getArgOperand(0))));
+ return;
+ case Intrinsic::addressofreturnaddress:
+ setValue(&I,
+ DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
+ TLI.getValueType(DAG.getDataLayout(), I.getType())));
+ return;
+ case Intrinsic::sponentry:
+ setValue(&I,
+ DAG.getNode(ISD::SPONENTRY, sdl,
+ TLI.getValueType(DAG.getDataLayout(), I.getType())));
+ return;
+ case Intrinsic::frameaddress:
+ setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
+ TLI.getFrameIndexTy(DAG.getDataLayout()),
+ getValue(I.getArgOperand(0))));
+ return;
+ case Intrinsic::read_volatile_register:
+ case Intrinsic::read_register: {
+ Value *Reg = I.getArgOperand(0);
+ SDValue Chain = getRoot();
+ SDValue RegName =
+ DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ Res = DAG.getNode(ISD::READ_REGISTER, sdl,
+ DAG.getVTList(VT, MVT::Other), Chain, RegName);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return;
+ }
+ case Intrinsic::write_register: {
+ Value *Reg = I.getArgOperand(0);
+ Value *RegValue = I.getArgOperand(1);
+ SDValue Chain = getRoot();
+ SDValue RegName =
+ DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
+ DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain,
+ RegName, getValue(RegValue)));
+ return;
+ }
+ case Intrinsic::memcpy: {
+ const auto &MCI = cast<MemCpyInst>(I);
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ // @llvm.memcpy defines 0 and 1 to both mean no alignment.
+ Align DstAlign = MCI.getDestAlign().valueOrOne();
+ Align SrcAlign = MCI.getSourceAlign().valueOrOne();
+ Align Alignment = std::min(DstAlign, SrcAlign);
+ bool isVol = MCI.isVolatile();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ // FIXME: Support passing different dest/src alignments to the memcpy DAG
+ // node.
+ SDValue Root = isVol ? getRoot() : getMemoryRoot();
+ SDValue MC = DAG.getMemcpy(
+ Root, sdl, Op1, Op2, Op3, Alignment, isVol,
+ /* AlwaysInline */ false, isTC, MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)), I.getAAMetadata(), AA);
+ updateDAGForMaybeTailCall(MC);
+ return;
+ }
+ case Intrinsic::memcpy_inline: {
+ const auto &MCI = cast<MemCpyInlineInst>(I);
+ SDValue Dst = getValue(I.getArgOperand(0));
+ SDValue Src = getValue(I.getArgOperand(1));
+ SDValue Size = getValue(I.getArgOperand(2));
+ assert(isa<ConstantSDNode>(Size) && "memcpy_inline needs constant size");
+ // @llvm.memcpy.inline defines 0 and 1 to both mean no alignment.
+ Align DstAlign = MCI.getDestAlign().valueOrOne();
+ Align SrcAlign = MCI.getSourceAlign().valueOrOne();
+ Align Alignment = std::min(DstAlign, SrcAlign);
+ bool isVol = MCI.isVolatile();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ // FIXME: Support passing different dest/src alignments to the memcpy DAG
+ // node.
+ SDValue MC = DAG.getMemcpy(
+ getRoot(), sdl, Dst, Src, Size, Alignment, isVol,
+ /* AlwaysInline */ true, isTC, MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)), I.getAAMetadata(), AA);
+ updateDAGForMaybeTailCall(MC);
+ return;
+ }
+ case Intrinsic::memset: {
+ const auto &MSI = cast<MemSetInst>(I);
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ // @llvm.memset defines 0 and 1 to both mean no alignment.
+ Align Alignment = MSI.getDestAlign().valueOrOne();
+ bool isVol = MSI.isVolatile();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ SDValue Root = isVol ? getRoot() : getMemoryRoot();
+ SDValue MS = DAG.getMemset(
+ Root, sdl, Op1, Op2, Op3, Alignment, isVol, /* AlwaysInline */ false,
+ isTC, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata());
+ updateDAGForMaybeTailCall(MS);
+ return;
+ }
+ case Intrinsic::memset_inline: {
+ const auto &MSII = cast<MemSetInlineInst>(I);
+ SDValue Dst = getValue(I.getArgOperand(0));
+ SDValue Value = getValue(I.getArgOperand(1));
+ SDValue Size = getValue(I.getArgOperand(2));
+ assert(isa<ConstantSDNode>(Size) && "memset_inline needs constant size");
+ // @llvm.memset defines 0 and 1 to both mean no alignment.
+ Align DstAlign = MSII.getDestAlign().valueOrOne();
+ bool isVol = MSII.isVolatile();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ SDValue Root = isVol ? getRoot() : getMemoryRoot();
+ SDValue MC = DAG.getMemset(Root, sdl, Dst, Value, Size, DstAlign, isVol,
+ /* AlwaysInline */ true, isTC,
+ MachinePointerInfo(I.getArgOperand(0)),
+ I.getAAMetadata());
+ updateDAGForMaybeTailCall(MC);
+ return;
+ }
+ case Intrinsic::memmove: {
+ const auto &MMI = cast<MemMoveInst>(I);
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ // @llvm.memmove defines 0 and 1 to both mean no alignment.
+ Align DstAlign = MMI.getDestAlign().valueOrOne();
+ Align SrcAlign = MMI.getSourceAlign().valueOrOne();
+ Align Alignment = std::min(DstAlign, SrcAlign);
+ bool isVol = MMI.isVolatile();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ // FIXME: Support passing different dest/src alignments to the memmove DAG
+ // node.
+ SDValue Root = isVol ? getRoot() : getMemoryRoot();
+ SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Alignment, isVol,
+ isTC, MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)),
+ I.getAAMetadata(), AA);
+ updateDAGForMaybeTailCall(MM);
+ return;
+ }
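+
+  // Note on the alignment handling in the memcpy/memcpy.inline/memmove cases
+  // above: until the FIXMEs are addressed, a copy with, say, a 16-byte
+  // aligned destination but only a 4-byte aligned source is given the single
+  // conservative alignment of 4 on the DAG node.
+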
+ case Intrinsic::memcpy_element_unordered_atomic: {
+ const AtomicMemCpyInst &MI = cast<AtomicMemCpyInst>(I);
+ SDValue Dst = getValue(MI.getRawDest());
+ SDValue Src = getValue(MI.getRawSource());
+ SDValue Length = getValue(MI.getLength());
+
+ Type *LengthTy = MI.getLength()->getType();
+ unsigned ElemSz = MI.getElementSizeInBytes();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ SDValue MC =
+ DAG.getAtomicMemcpy(getRoot(), sdl, Dst, Src, Length, LengthTy, ElemSz,
+ isTC, MachinePointerInfo(MI.getRawDest()),
+ MachinePointerInfo(MI.getRawSource()));
+ updateDAGForMaybeTailCall(MC);
+ return;
+ }
+ case Intrinsic::memmove_element_unordered_atomic: {
+ auto &MI = cast<AtomicMemMoveInst>(I);
+ SDValue Dst = getValue(MI.getRawDest());
+ SDValue Src = getValue(MI.getRawSource());
+ SDValue Length = getValue(MI.getLength());
+
+ Type *LengthTy = MI.getLength()->getType();
+ unsigned ElemSz = MI.getElementSizeInBytes();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ SDValue MC =
+ DAG.getAtomicMemmove(getRoot(), sdl, Dst, Src, Length, LengthTy, ElemSz,
+ isTC, MachinePointerInfo(MI.getRawDest()),
+ MachinePointerInfo(MI.getRawSource()));
+ updateDAGForMaybeTailCall(MC);
+ return;
+ }
+ case Intrinsic::memset_element_unordered_atomic: {
+ auto &MI = cast<AtomicMemSetInst>(I);
+ SDValue Dst = getValue(MI.getRawDest());
+ SDValue Val = getValue(MI.getValue());
+ SDValue Length = getValue(MI.getLength());
+
+ Type *LengthTy = MI.getLength()->getType();
+ unsigned ElemSz = MI.getElementSizeInBytes();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ SDValue MC =
+ DAG.getAtomicMemset(getRoot(), sdl, Dst, Val, Length, LengthTy, ElemSz,
+ isTC, MachinePointerInfo(MI.getRawDest()));
+ updateDAGForMaybeTailCall(MC);
+ return;
+ }
+ case Intrinsic::call_preallocated_setup: {
+ const CallBase *PreallocatedCall = FindPreallocatedCall(&I);
+ SDValue SrcValue = DAG.getSrcValue(PreallocatedCall);
+ SDValue Res = DAG.getNode(ISD::PREALLOCATED_SETUP, sdl, MVT::Other,
+ getRoot(), SrcValue);
+ setValue(&I, Res);
+ DAG.setRoot(Res);
+ return;
+ }
+ case Intrinsic::call_preallocated_arg: {
+ const CallBase *PreallocatedCall = FindPreallocatedCall(I.getOperand(0));
+ SDValue SrcValue = DAG.getSrcValue(PreallocatedCall);
+ SDValue Ops[3];
+ Ops[0] = getRoot();
+ Ops[1] = SrcValue;
+ Ops[2] = DAG.getTargetConstant(*cast<ConstantInt>(I.getArgOperand(1)), sdl,
+ MVT::i32); // arg index
+ SDValue Res = DAG.getNode(
+ ISD::PREALLOCATED_ARG, sdl,
+ DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Ops);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return;
+ }
+ case Intrinsic::dbg_declare: {
+ const auto &DI = cast<DbgDeclareInst>(I);
+ // Debug intrinsics are handled separately in assignment tracking mode.
+    // Some dbg.declare intrinsics are handled right after Argument lowering.
+ if (AssignmentTrackingEnabled ||
+ FuncInfo.PreprocessedDbgDeclares.count(&DI))
+ return;
+    // Assume dbg.declare cannot currently use DIArgList, i.e.
+ // it is non-variadic.
+ assert(!DI.hasArgList() && "Only dbg.value should currently use DIArgList");
+ DILocalVariable *Variable = DI.getVariable();
+ DIExpression *Expression = DI.getExpression();
+ dropDanglingDebugInfo(Variable, Expression);
+ assert(Variable && "Missing variable");
+ LLVM_DEBUG(dbgs() << "SelectionDAG visiting debug intrinsic: " << DI
+ << "\n");
+ // Check if address has undef value.
+ const Value *Address = DI.getVariableLocationOp(0);
+ if (!Address || isa<UndefValue>(Address) ||
+ (Address->use_empty() && !isa<Argument>(Address))) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI
+ << " (bad/undef/unused-arg address)\n");
+ return;
+ }
+
+ bool isParameter = Variable->isParameter() || isa<Argument>(Address);
+
+ SDValue &N = NodeMap[Address];
+ if (!N.getNode() && isa<Argument>(Address))
+ // Check unused arguments map.
+ N = UnusedArgNodeMap[Address];
+ SDDbgValue *SDV;
+ if (N.getNode()) {
+ if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
+ Address = BCI->getOperand(0);
+ // Parameters are handled specially.
+ auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
+ if (isParameter && FINode) {
+ // Byval parameter. We have a frame index at this point.
+ SDV =
+ DAG.getFrameIndexDbgValue(Variable, Expression, FINode->getIndex(),
+ /*IsIndirect*/ true, dl, SDNodeOrder);
+ } else if (isa<Argument>(Address)) {
+ // Address is an argument, so try to emit its dbg value using
+ // virtual register info from the FuncInfo.ValueMap.
+ EmitFuncArgumentDbgValue(Address, Variable, Expression, dl,
+ FuncArgumentDbgValueKind::Declare, N);
+ return;
+ } else {
+ SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
+ true, dl, SDNodeOrder);
+ }
+ DAG.AddDbgValue(SDV, isParameter);
+ } else {
+ // If Address is an argument then try to emit its dbg value using
+ // virtual register info from the FuncInfo.ValueMap.
+ if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl,
+ FuncArgumentDbgValueKind::Declare, N)) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI
+ << " (could not emit func-arg dbg_value)\n");
+ }
+ }
+ return;
+ }
+ case Intrinsic::dbg_label: {
+ const DbgLabelInst &DI = cast<DbgLabelInst>(I);
+ DILabel *Label = DI.getLabel();
+ assert(Label && "Missing label");
+
+ SDDbgLabel *SDV;
+ SDV = DAG.getDbgLabel(Label, dl, SDNodeOrder);
+ DAG.AddDbgLabel(SDV);
+ return;
+ }
+ case Intrinsic::dbg_assign: {
+    // Debug intrinsics are handled separately in assignment tracking mode.
+ if (AssignmentTrackingEnabled)
+ return;
+ // If assignment tracking hasn't been enabled then fall through and treat
+ // the dbg.assign as a dbg.value.
+ [[fallthrough]];
+ }
+ case Intrinsic::dbg_value: {
+    // Debug intrinsics are handled separately in assignment tracking mode.
+ if (AssignmentTrackingEnabled)
+ return;
+ const DbgValueInst &DI = cast<DbgValueInst>(I);
+ assert(DI.getVariable() && "Missing variable");
+
+ DILocalVariable *Variable = DI.getVariable();
+ DIExpression *Expression = DI.getExpression();
+ dropDanglingDebugInfo(Variable, Expression);
+
+ if (DI.isKillLocation()) {
+ handleKillDebugValue(Variable, Expression, DI.getDebugLoc(), SDNodeOrder);
+ return;
+ }
+
+ SmallVector<Value *, 4> Values(DI.getValues());
+ if (Values.empty())
+ return;
+
+ bool IsVariadic = DI.hasArgList();
+ if (!handleDebugValue(Values, Variable, Expression, DI.getDebugLoc(),
+ SDNodeOrder, IsVariadic))
+ addDanglingDebugInfo(&DI, SDNodeOrder);
+ return;
+ }
+
+ case Intrinsic::eh_typeid_for: {
+ // Find the type id for the given typeinfo.
+ GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0));
+ unsigned TypeID = DAG.getMachineFunction().getTypeIDFor(GV);
+ Res = DAG.getConstant(TypeID, sdl, MVT::i32);
+ setValue(&I, Res);
+ return;
+ }
+
+ case Intrinsic::eh_return_i32:
+ case Intrinsic::eh_return_i64:
+ DAG.getMachineFunction().setCallsEHReturn(true);
+ DAG.setRoot(DAG.getNode(ISD::EH_RETURN, sdl,
+ MVT::Other,
+ getControlRoot(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1))));
+ return;
+ case Intrinsic::eh_unwind_init:
+ DAG.getMachineFunction().setCallsUnwindInit(true);
+ return;
+ case Intrinsic::eh_dwarf_cfa:
+ setValue(&I, DAG.getNode(ISD::EH_DWARF_CFA, sdl,
+ TLI.getPointerTy(DAG.getDataLayout()),
+ getValue(I.getArgOperand(0))));
+ return;
+ case Intrinsic::eh_sjlj_callsite: {
+ MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+ ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(0));
+ assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!");
+
+ MMI.setCurrentCallSite(CI->getZExtValue());
+ return;
+ }
+ case Intrinsic::eh_sjlj_functioncontext: {
+ // Get and store the index of the function context.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ AllocaInst *FnCtx =
+ cast<AllocaInst>(I.getArgOperand(0)->stripPointerCasts());
+ int FI = FuncInfo.StaticAllocaMap[FnCtx];
+ MFI.setFunctionContextIndex(FI);
+ return;
+ }
+ case Intrinsic::eh_sjlj_setjmp: {
+ SDValue Ops[2];
+ Ops[0] = getRoot();
+ Ops[1] = getValue(I.getArgOperand(0));
+ SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl,
+ DAG.getVTList(MVT::i32, MVT::Other), Ops);
+ setValue(&I, Op.getValue(0));
+ DAG.setRoot(Op.getValue(1));
+ return;
+ }
+ case Intrinsic::eh_sjlj_longjmp:
+ DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other,
+ getRoot(), getValue(I.getArgOperand(0))));
+ return;
+ case Intrinsic::eh_sjlj_setup_dispatch:
+ DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_SETUP_DISPATCH, sdl, MVT::Other,
+ getRoot()));
+ return;
+ case Intrinsic::masked_gather:
+ visitMaskedGather(I);
+ return;
+ case Intrinsic::masked_load:
+ visitMaskedLoad(I);
+ return;
+ case Intrinsic::masked_scatter:
+ visitMaskedScatter(I);
+ return;
+ case Intrinsic::masked_store:
+ visitMaskedStore(I);
+ return;
+ case Intrinsic::masked_expandload:
+ visitMaskedLoad(I, true /* IsExpanding */);
+ return;
+ case Intrinsic::masked_compressstore:
+ visitMaskedStore(I, true /* IsCompressing */);
+ return;
+ case Intrinsic::powi:
+ setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), DAG));
+ return;
+ case Intrinsic::log:
+ setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags));
+ return;
+ case Intrinsic::log2:
+ setValue(&I,
+ expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags));
+ return;
+ case Intrinsic::log10:
+ setValue(&I,
+ expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags));
+ return;
+ case Intrinsic::exp:
+ setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags));
+ return;
+ case Intrinsic::exp2:
+ setValue(&I,
+ expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI, Flags));
+ return;
+ case Intrinsic::pow:
+ setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), DAG, TLI, Flags));
+ return;
+ case Intrinsic::sqrt:
+ case Intrinsic::fabs:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ case Intrinsic::canonicalize: {
+ unsigned Opcode;
+ switch (Intrinsic) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+ case Intrinsic::fabs: Opcode = ISD::FABS; break;
+ case Intrinsic::sin: Opcode = ISD::FSIN; break;
+ case Intrinsic::cos: Opcode = ISD::FCOS; break;
+ case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+ case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+ case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+ case Intrinsic::rint: Opcode = ISD::FRINT; break;
+ case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::roundeven: Opcode = ISD::FROUNDEVEN; break;
+ case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
+ }
+
+ setValue(&I, DAG.getNode(Opcode, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)), Flags));
+ return;
+ }
+ case Intrinsic::lround:
+ case Intrinsic::llround:
+ case Intrinsic::lrint:
+ case Intrinsic::llrint: {
+ unsigned Opcode;
+ switch (Intrinsic) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::lround: Opcode = ISD::LROUND; break;
+ case Intrinsic::llround: Opcode = ISD::LLROUND; break;
+ case Intrinsic::lrint: Opcode = ISD::LRINT; break;
+ case Intrinsic::llrint: Opcode = ISD::LLRINT; break;
+ }
+
+ EVT RetVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I, DAG.getNode(Opcode, sdl, RetVT,
+ getValue(I.getArgOperand(0))));
+ return;
+ }
+ case Intrinsic::minnum:
+ setValue(&I, DAG.getNode(ISD::FMINNUM, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), Flags));
+ return;
+ case Intrinsic::maxnum:
+ setValue(&I, DAG.getNode(ISD::FMAXNUM, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), Flags));
+ return;
+ case Intrinsic::minimum:
+ setValue(&I, DAG.getNode(ISD::FMINIMUM, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), Flags));
+ return;
+ case Intrinsic::maximum:
+ setValue(&I, DAG.getNode(ISD::FMAXIMUM, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), Flags));
+ return;
+ case Intrinsic::copysign:
+ setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), Flags));
+ return;
+ case Intrinsic::ldexp:
+ setValue(&I, DAG.getNode(ISD::FLDEXP, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), Flags));
+ return;
+ case Intrinsic::frexp: {
+ SmallVector<EVT, 2> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
+ SDVTList VTs = DAG.getVTList(ValueVTs);
+ setValue(&I,
+ DAG.getNode(ISD::FFREXP, sdl, VTs, getValue(I.getArgOperand(0))));
+ return;
+ }
+ case Intrinsic::arithmetic_fence: {
+ setValue(&I, DAG.getNode(ISD::ARITH_FENCE, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)), Flags));
+ return;
+ }
+ case Intrinsic::fma:
+ setValue(&I, DAG.getNode(
+ ISD::FMA, sdl, getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2)), Flags));
+ return;
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
+ case Intrinsic::INTRINSIC:
+#include "llvm/IR/ConstrainedOps.def"
+ visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
+ return;
+#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID:
+#include "llvm/IR/VPIntrinsics.def"
+ visitVectorPredicationIntrinsic(cast<VPIntrinsic>(I));
+ return;
+ case Intrinsic::fptrunc_round: {
+    // Get the last argument of the call, the rounding-mode metadata, and
+    // convert it to an integer.
+ Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(1))->getMetadata();
+ std::optional<RoundingMode> RoundMode =
+ convertStrToRoundingMode(cast<MDString>(MD)->getString());
+
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ // Propagate fast-math-flags from IR to node(s).
+ SDNodeFlags Flags;
+ Flags.copyFMF(*cast<FPMathOperator>(&I));
+ SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
+
+ SDValue Result;
+ Result = DAG.getNode(
+ ISD::FPTRUNC_ROUND, sdl, VT, getValue(I.getArgOperand(0)),
+ DAG.getTargetConstant((int)*RoundMode, sdl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ setValue(&I, Result);
+
+ return;
+ }
+ case Intrinsic::fmuladd: {
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
+ TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
+ setValue(&I, DAG.getNode(ISD::FMA, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2)), Flags));
+ } else {
+ // TODO: Intrinsic calls should have fast-math-flags.
+ SDValue Mul = DAG.getNode(
+ ISD::FMUL, sdl, getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), Flags);
+ SDValue Add = DAG.getNode(ISD::FADD, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ Mul, getValue(I.getArgOperand(2)), Flags);
+ setValue(&I, Add);
+ }
+ return;
+ }
+ case Intrinsic::convert_to_fp16:
+ setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16,
+ DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16,
+ getValue(I.getArgOperand(0)),
+ DAG.getTargetConstant(0, sdl,
+ MVT::i32))));
+ return;
+ case Intrinsic::convert_from_fp16:
+ setValue(&I, DAG.getNode(ISD::FP_EXTEND, sdl,
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
+ getValue(I.getArgOperand(0)))));
+ return;
+ case Intrinsic::fptosi_sat: {
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_TO_SINT_SAT, sdl, VT,
+ getValue(I.getArgOperand(0)),
+ DAG.getValueType(VT.getScalarType())));
+ return;
+ }
+ case Intrinsic::fptoui_sat: {
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_TO_UINT_SAT, sdl, VT,
+ getValue(I.getArgOperand(0)),
+ DAG.getValueType(VT.getScalarType())));
+ return;
+ }
+ case Intrinsic::set_rounding:
+ Res = DAG.getNode(ISD::SET_ROUNDING, sdl, MVT::Other,
+ {getRoot(), getValue(I.getArgOperand(0))});
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(0));
+ return;
+ case Intrinsic::is_fpclass: {
+ const DataLayout DLayout = DAG.getDataLayout();
+ EVT DestVT = TLI.getValueType(DLayout, I.getType());
+ EVT ArgVT = TLI.getValueType(DLayout, I.getArgOperand(0)->getType());
+ FPClassTest Test = static_cast<FPClassTest>(
+ cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function &F = MF.getFunction();
+ SDValue Op = getValue(I.getArgOperand(0));
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(
+ !F.getAttributes().hasFnAttr(llvm::Attribute::StrictFP));
+ // If ISD::IS_FPCLASS should be expanded, do it right now, because the
+    // expansion can use illegal types. Expanding early allows these types to
+    // be legalized prior to selection.
+ if (!TLI.isOperationLegalOrCustom(ISD::IS_FPCLASS, ArgVT)) {
+ SDValue Result = TLI.expandIS_FPCLASS(DestVT, Op, Test, Flags, sdl, DAG);
+ setValue(&I, Result);
+ return;
+ }
+
+ SDValue Check = DAG.getTargetConstant(Test, sdl, MVT::i32);
+ SDValue V = DAG.getNode(ISD::IS_FPCLASS, sdl, DestVT, {Op, Check}, Flags);
+ setValue(&I, V);
+ return;
+ }
+ case Intrinsic::get_fpenv: {
+ const DataLayout DLayout = DAG.getDataLayout();
+ EVT EnvVT = TLI.getValueType(DLayout, I.getType());
+ Align TempAlign = DAG.getEVTAlign(EnvVT);
+ SDValue Chain = getRoot();
+    // Use GET_FPENV if it is legal or custom. Otherwise use a memory-based
+    // node with temporary storage on the stack.
+ if (TLI.isOperationLegalOrCustom(ISD::GET_FPENV, EnvVT)) {
+ Res = DAG.getNode(
+ ISD::GET_FPENV, sdl,
+ DAG.getVTList(TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ MVT::Other),
+ Chain);
+ } else {
+ SDValue Temp = DAG.CreateStackTemporary(EnvVT, TempAlign.value());
+ int SPFI = cast<FrameIndexSDNode>(Temp.getNode())->getIndex();
+ auto MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize,
+ TempAlign);
+ Chain = DAG.getGetFPEnv(Chain, sdl, Temp, EnvVT, MMO);
+ Res = DAG.getLoad(EnvVT, sdl, Chain, Temp, MPI);
+ }
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return;
+ }
+ case Intrinsic::set_fpenv: {
+ const DataLayout DLayout = DAG.getDataLayout();
+ SDValue Env = getValue(I.getArgOperand(0));
+ EVT EnvVT = Env.getValueType();
+ Align TempAlign = DAG.getEVTAlign(EnvVT);
+ SDValue Chain = getRoot();
+    // If SET_FPENV is legal or custom, use it. Otherwise load the
+    // environment from memory.
+ if (TLI.isOperationLegalOrCustom(ISD::SET_FPENV, EnvVT)) {
+ Chain = DAG.getNode(ISD::SET_FPENV, sdl, MVT::Other, Chain, Env);
+ } else {
+      // Allocate stack space, copy the environment bits into it, and use
+      // that memory in SET_FPENV_MEM.
+ SDValue Temp = DAG.CreateStackTemporary(EnvVT, TempAlign.value());
+ int SPFI = cast<FrameIndexSDNode>(Temp.getNode())->getIndex();
+ auto MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ Chain = DAG.getStore(Chain, sdl, Env, Temp, MPI, TempAlign,
+ MachineMemOperand::MOStore);
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize,
+ TempAlign);
+ Chain = DAG.getSetFPEnv(Chain, sdl, Temp, EnvVT, MMO);
+ }
+ DAG.setRoot(Chain);
+ return;
+ }
+ case Intrinsic::reset_fpenv:
+ DAG.setRoot(DAG.getNode(ISD::RESET_FPENV, sdl, MVT::Other, getRoot()));
+ return;
+ case Intrinsic::pcmarker: {
+ SDValue Tmp = getValue(I.getArgOperand(0));
+ DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp));
+ return;
+ }
+ case Intrinsic::readcyclecounter: {
+ SDValue Op = getRoot();
+ Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl,
+ DAG.getVTList(MVT::i64, MVT::Other), Op);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return;
+ }
+ case Intrinsic::bitreverse:
+ setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0))));
+ return;
+ case Intrinsic::bswap:
+ setValue(&I, DAG.getNode(ISD::BSWAP, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0))));
+ return;
+ case Intrinsic::cttz: {
+ SDValue Arg = getValue(I.getArgOperand(0));
+ ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
+ EVT Ty = Arg.getValueType();
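+    // The second operand states whether a zero input is poison; when it is
+    // not, use the plain CTTZ node, which is fully defined at zero.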
+ setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF,
+ sdl, Ty, Arg));
+ return;
+ }
+ case Intrinsic::ctlz: {
+ SDValue Arg = getValue(I.getArgOperand(0));
+ ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
+ EVT Ty = Arg.getValueType();
+ setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF,
+ sdl, Ty, Arg));
+ return;
+ }
+ case Intrinsic::ctpop: {
+ SDValue Arg = getValue(I.getArgOperand(0));
+ EVT Ty = Arg.getValueType();
+ setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg));
+ return;
+ }
+ case Intrinsic::fshl:
+ case Intrinsic::fshr: {
+ bool IsFSHL = Intrinsic == Intrinsic::fshl;
+ SDValue X = getValue(I.getArgOperand(0));
+ SDValue Y = getValue(I.getArgOperand(1));
+ SDValue Z = getValue(I.getArgOperand(2));
+ EVT VT = X.getValueType();
+
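+    // A funnel shift of a value with itself is simply a rotate.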
+ if (X == Y) {
+ auto RotateOpcode = IsFSHL ? ISD::ROTL : ISD::ROTR;
+ setValue(&I, DAG.getNode(RotateOpcode, sdl, VT, X, Z));
+ } else {
+ auto FunnelOpcode = IsFSHL ? ISD::FSHL : ISD::FSHR;
+ setValue(&I, DAG.getNode(FunnelOpcode, sdl, VT, X, Y, Z));
+ }
+ return;
+ }
+ case Intrinsic::sadd_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::SADDSAT, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::uadd_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::UADDSAT, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::ssub_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::SSUBSAT, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::usub_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::sshl_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::SSHLSAT, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::ushl_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::USHLSAT, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::smul_fix:
+ case Intrinsic::umul_fix:
+ case Intrinsic::smul_fix_sat:
+ case Intrinsic::umul_fix_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ setValue(&I, DAG.getNode(FixedPointIntrinsicToOpcode(Intrinsic), sdl,
+ Op1.getValueType(), Op1, Op2, Op3));
+ return;
+ }
+ case Intrinsic::sdiv_fix:
+ case Intrinsic::udiv_fix:
+ case Intrinsic::sdiv_fix_sat:
+ case Intrinsic::udiv_fix_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ setValue(&I, expandDivFix(FixedPointIntrinsicToOpcode(Intrinsic), sdl,
+ Op1, Op2, Op3, DAG, TLI));
+ return;
+ }
+ case Intrinsic::smax: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::SMAX, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::smin: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::SMIN, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::umax: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::UMAX, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::umin: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::UMIN, sdl, Op1.getValueType(), Op1, Op2));
+ return;
+ }
+ case Intrinsic::abs: {
+ // TODO: Preserve "int min is poison" arg in SDAG?
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ setValue(&I, DAG.getNode(ISD::ABS, sdl, Op1.getValueType(), Op1));
+ return;
+ }
+ case Intrinsic::stacksave: {
+ SDValue Op = getRoot();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ Res = DAG.getNode(ISD::STACKSAVE, sdl, DAG.getVTList(VT, MVT::Other), Op);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return;
+ }
+ case Intrinsic::stackrestore:
+ Res = getValue(I.getArgOperand(0));
+ DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res));
+ return;
+ case Intrinsic::get_dynamic_area_offset: {
+ SDValue Op = getRoot();
+ EVT PtrTy = TLI.getFrameIndexTy(DAG.getDataLayout());
+ EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    // The result type of @llvm.get.dynamic.area.offset should match the
+    // target's PtrTy.
+ if (PtrTy.getFixedSizeInBits() < ResTy.getFixedSizeInBits())
+ report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset"
+ " intrinsic!");
+ Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy),
+ Op);
+ DAG.setRoot(Op);
+ setValue(&I, Res);
+ return;
+ }
+ case Intrinsic::stackguard: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Module &M = *MF.getFunction().getParent();
+ SDValue Chain = getRoot();
+ if (TLI.useLoadStackGuardNode()) {
+ Res = getLoadStackGuard(DAG, sdl, Chain);
+ } else {
+ EVT PtrTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ const Value *Global = TLI.getSDagStackGuard(M);
+ Align Align = DAG.getDataLayout().getPrefTypeAlign(Global->getType());
+ Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
+ MachinePointerInfo(Global, 0), Align,
+ MachineMemOperand::MOVolatile);
+ }
+ if (TLI.useStackGuardXorFP())
+ Res = TLI.emitStackGuardXorFP(DAG, Res, sdl);
+ DAG.setRoot(Chain);
+ setValue(&I, Res);
+ return;
+ }
+ case Intrinsic::stackprotector: {
+ // Emit code into the DAG to store the stack guard onto the stack.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ SDValue Src, Chain = getRoot();
+
+ if (TLI.useLoadStackGuardNode())
+ Src = getLoadStackGuard(DAG, sdl, Chain);
+ else
+ Src = getValue(I.getArgOperand(0)); // The guard's value.
+
+ AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
+
+ int FI = FuncInfo.StaticAllocaMap[Slot];
+ MFI.setStackProtectorIndex(FI);
+ EVT PtrTy = TLI.getFrameIndexTy(DAG.getDataLayout());
+
+ SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+
+ // Store the stack protector onto the stack.
+ Res = DAG.getStore(
+ Chain, sdl, Src, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ MaybeAlign(), MachineMemOperand::MOVolatile);
+ setValue(&I, Res);
+ DAG.setRoot(Res);
+ return;
+ }
+ case Intrinsic::objectsize:
+ llvm_unreachable("llvm.objectsize.* should have been lowered already");
+
+ case Intrinsic::is_constant:
+ llvm_unreachable("llvm.is.constant.* should have been lowered already");
+
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ // Drop the intrinsic, but forward the value
+ setValue(&I, getValue(I.getOperand(0)));
+ return;
+
+ case Intrinsic::assume:
+ case Intrinsic::experimental_noalias_scope_decl:
+ case Intrinsic::var_annotation:
+ case Intrinsic::sideeffect:
+ // Discard annotate attributes, noalias scope declarations, assumptions, and
+ // artificial side-effects.
+ return;
+
+ case Intrinsic::codeview_annotation: {
+ // Emit a label associated with this metadata.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MCSymbol *Label =
+ MF.getMMI().getContext().createTempSymbol("annotation", true);
+ Metadata *MD = cast<MetadataAsValue>(I.getArgOperand(0))->getMetadata();
+ MF.addCodeViewAnnotation(Label, cast<MDNode>(MD));
+ Res = DAG.getLabelNode(ISD::ANNOTATION_LABEL, sdl, getRoot(), Label);
+ DAG.setRoot(Res);
+ return;
+ }
+
+ case Intrinsic::init_trampoline: {
+ const Function *F = cast<Function>(I.getArgOperand(1)->stripPointerCasts());
+
+ SDValue Ops[6];
+ Ops[0] = getRoot();
+ Ops[1] = getValue(I.getArgOperand(0));
+ Ops[2] = getValue(I.getArgOperand(1));
+ Ops[3] = getValue(I.getArgOperand(2));
+ Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
+ Ops[5] = DAG.getSrcValue(F);
+
+ Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops);
+
+ DAG.setRoot(Res);
+ return;
+ }
+ case Intrinsic::adjust_trampoline:
+ setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl,
+ TLI.getPointerTy(DAG.getDataLayout()),
+ getValue(I.getArgOperand(0))));
+ return;
+ case Intrinsic::gcroot: {
+ assert(DAG.getMachineFunction().getFunction().hasGC() &&
+ "only valid in functions with gc specified, enforced by Verifier");
+ assert(GFI && "implied by previous");
+ const Value *Alloca = I.getArgOperand(0)->stripPointerCasts();
+ const Constant *TypeMap = cast<Constant>(I.getArgOperand(1));
+
+ FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
+ GFI->addStackRoot(FI->getIndex(), TypeMap);
+ return;
+ }
+ case Intrinsic::gcread:
+ case Intrinsic::gcwrite:
+ llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
+ case Intrinsic::get_rounding:
+ Res = DAG.getNode(ISD::GET_ROUNDING, sdl, {MVT::i32, MVT::Other}, getRoot());
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return;
+
+ case Intrinsic::expect:
+ // Just replace __builtin_expect(exp, c) with EXP.
+ setValue(&I, getValue(I.getArgOperand(0)));
+ return;
+
+ case Intrinsic::ubsantrap:
+ case Intrinsic::debugtrap:
+ case Intrinsic::trap: {
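+    // Without a "trap-func-name" attribute, emit the corresponding trap node;
+    // otherwise lower the trap to a call to the named function.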
+ StringRef TrapFuncName =
+ I.getAttributes().getFnAttr("trap-func-name").getValueAsString();
+ if (TrapFuncName.empty()) {
+ switch (Intrinsic) {
+ case Intrinsic::trap:
+ DAG.setRoot(DAG.getNode(ISD::TRAP, sdl, MVT::Other, getRoot()));
+ break;
+ case Intrinsic::debugtrap:
+ DAG.setRoot(DAG.getNode(ISD::DEBUGTRAP, sdl, MVT::Other, getRoot()));
+ break;
+ case Intrinsic::ubsantrap:
+ DAG.setRoot(DAG.getNode(
+ ISD::UBSANTRAP, sdl, MVT::Other, getRoot(),
+ DAG.getTargetConstant(
+ cast<ConstantInt>(I.getArgOperand(0))->getZExtValue(), sdl,
+ MVT::i32)));
+ break;
+ default: llvm_unreachable("unknown trap intrinsic");
+ }
+ return;
+ }
+ TargetLowering::ArgListTy Args;
+ if (Intrinsic == Intrinsic::ubsantrap) {
+ Args.push_back(TargetLoweringBase::ArgListEntry());
+ Args[0].Val = I.getArgOperand(0);
+ Args[0].Node = getValue(Args[0].Val);
+ Args[0].Ty = Args[0].Val->getType();
+ }
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
+ CallingConv::C, I.getType(),
+ DAG.getExternalSymbol(TrapFuncName.data(),
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+ DAG.setRoot(Result.second);
+ return;
+ }
+
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow: {
+ ISD::NodeType Op;
+ switch (Intrinsic) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break;
+ case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break;
+ case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break;
+ case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break;
+ case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break;
+ case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break;
+ }
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+
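+    // The arithmetic result keeps the operand type; the overflow flag is i1,
+    // or a vector of i1 with a matching element count for vector operands.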
+ EVT ResultVT = Op1.getValueType();
+ EVT OverflowVT = MVT::i1;
+ if (ResultVT.isVector())
+ OverflowVT = EVT::getVectorVT(
+ *Context, OverflowVT, ResultVT.getVectorElementCount());
+
+ SDVTList VTs = DAG.getVTList(ResultVT, OverflowVT);
+ setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
+ return;
+ }
+ case Intrinsic::prefetch: {
+ SDValue Ops[5];
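+    // Operand 1 is the read/write specifier: 0 requests a read prefetch,
+    // modelled as a load, and 1 a write prefetch, modelled as a store.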
+ unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+    auto Flags =
+        rw == 0 ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore;
+ Ops[0] = DAG.getRoot();
+ Ops[1] = getValue(I.getArgOperand(0));
+ Ops[2] = getValue(I.getArgOperand(1));
+ Ops[3] = getValue(I.getArgOperand(2));
+ Ops[4] = getValue(I.getArgOperand(3));
+ SDValue Result = DAG.getMemIntrinsicNode(
+ ISD::PREFETCH, sdl, DAG.getVTList(MVT::Other), Ops,
+ EVT::getIntegerVT(*Context, 8), MachinePointerInfo(I.getArgOperand(0)),
+ /* align */ std::nullopt, Flags);
+
+    // Chain the prefetch in parallel with any pending loads, to stay out of
+ // the way of later optimizations.
+ PendingLoads.push_back(Result);
+ Result = getRoot();
+ DAG.setRoot(Result);
+ return;
+ }
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end: {
+ bool IsStart = (Intrinsic == Intrinsic::lifetime_start);
+    // Stack coloring is not enabled at -O0, so discard region information.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return;
+
+ const int64_t ObjectSize =
+ cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
+ Value *const ObjectPtr = I.getArgOperand(1);
+ SmallVector<const Value *, 4> Allocas;
+ getUnderlyingObjects(ObjectPtr, Allocas);
+
+ for (const Value *Alloca : Allocas) {
+ const AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(Alloca);
+
+ // Could not find an Alloca.
+ if (!LifetimeObject)
+ continue;
+
+ // First check that the Alloca is static, otherwise it won't have a
+ // valid frame index.
+ auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
+ if (SI == FuncInfo.StaticAllocaMap.end())
+ return;
+
+ const int FrameIndex = SI->second;
+ int64_t Offset;
+ if (GetPointerBaseWithConstantOffset(
+ ObjectPtr, Offset, DAG.getDataLayout()) != LifetimeObject)
+ Offset = -1; // Cannot determine offset from alloca to lifetime object.
+ Res = DAG.getLifetimeNode(IsStart, sdl, getRoot(), FrameIndex, ObjectSize,
+ Offset);
+ DAG.setRoot(Res);
+ }
+ return;
+ }
+ case Intrinsic::pseudoprobe: {
+ auto Guid = cast<ConstantInt>(I.getArgOperand(0))->getZExtValue();
+ auto Index = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ auto Attr = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+ Res = DAG.getPseudoProbeNode(sdl, getRoot(), Guid, Index, Attr);
+ DAG.setRoot(Res);
+ return;
+ }
+ case Intrinsic::invariant_start:
+ // Discard region information.
+ setValue(&I,
+ DAG.getUNDEF(TLI.getValueType(DAG.getDataLayout(), I.getType())));
+ return;
+ case Intrinsic::invariant_end:
+ // Discard region information.
+ return;
+ case Intrinsic::clear_cache:
+    // FunctionName may be null.
+ if (const char *FunctionName = TLI.getClearCacheBuiltinName())
+ lowerCallToExternalSymbol(I, FunctionName);
+ return;
+ case Intrinsic::donothing:
+ case Intrinsic::seh_try_begin:
+ case Intrinsic::seh_scope_begin:
+ case Intrinsic::seh_try_end:
+ case Intrinsic::seh_scope_end:
+ // ignore
+ return;
+ case Intrinsic::experimental_stackmap:
+ visitStackmap(I);
+ return;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ visitPatchpoint(I);
+ return;
+ case Intrinsic::experimental_gc_statepoint:
+ LowerStatepoint(cast<GCStatepointInst>(I));
+ return;
+ case Intrinsic::experimental_gc_result:
+ visitGCResult(cast<GCResultInst>(I));
+ return;
+ case Intrinsic::experimental_gc_relocate:
+ visitGCRelocate(cast<GCRelocateInst>(I));
+ return;
+ case Intrinsic::instrprof_cover:
+ llvm_unreachable("instrprof failed to lower a cover");
+ case Intrinsic::instrprof_increment:
+ llvm_unreachable("instrprof failed to lower an increment");
+ case Intrinsic::instrprof_timestamp:
+ llvm_unreachable("instrprof failed to lower a timestamp");
+ case Intrinsic::instrprof_value_profile:
+ llvm_unreachable("instrprof failed to lower a value profiling call");
+ case Intrinsic::localescape: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
+
+ // Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
+ // is the same on all targets.
+ for (unsigned Idx = 0, E = I.arg_size(); Idx < E; ++Idx) {
+ Value *Arg = I.getArgOperand(Idx)->stripPointerCasts();
+ if (isa<ConstantPointerNull>(Arg))
+ continue; // Skip null pointers. They represent a hole in index space.
+ AllocaInst *Slot = cast<AllocaInst>(Arg);
+ assert(FuncInfo.StaticAllocaMap.count(Slot) &&
+ "can only escape static allocas");
+ int FI = FuncInfo.StaticAllocaMap[Slot];
+ MCSymbol *FrameAllocSym =
+ MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
+ GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
+ TII->get(TargetOpcode::LOCAL_ESCAPE))
+ .addSym(FrameAllocSym)
+ .addFrameIndex(FI);
+ }
+
+ return;
+ }
+
+ case Intrinsic::localrecover: {
+ // i8* @llvm.localrecover(i8* %fn, i8* %fp, i32 %idx)
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Get the symbol that defines the frame offset.
+ auto *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
+ auto *Idx = cast<ConstantInt>(I.getArgOperand(2));
+ unsigned IdxVal =
+ unsigned(Idx->getLimitedValue(std::numeric_limits<int>::max()));
+ MCSymbol *FrameAllocSym =
+ MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
+
+ Value *FP = I.getArgOperand(1);
+ SDValue FPVal = getValue(FP);
+ EVT PtrVT = FPVal.getValueType();
+
+    // Create an MCSymbol for the label to avoid any target lowering
+ // that would make this PC relative.
+ SDValue OffsetSym = DAG.getMCSymbol(FrameAllocSym, PtrVT);
+ SDValue OffsetVal =
+ DAG.getNode(ISD::LOCAL_RECOVER, sdl, PtrVT, OffsetSym);
+
+ // Add the offset to the FP.
+ SDValue Add = DAG.getMemBasePlusOffset(FPVal, OffsetVal, sdl);
+ setValue(&I, Add);
+
+ return;
+ }
+
+ case Intrinsic::eh_exceptionpointer:
+ case Intrinsic::eh_exceptioncode: {
+ // Get the exception pointer vreg, copy from it, and resize it to fit.
+ const auto *CPI = cast<CatchPadInst>(I.getArgOperand(0));
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT);
+ unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC);
+ SDValue N = DAG.getCopyFromReg(DAG.getEntryNode(), sdl, VReg, PtrVT);
+ if (Intrinsic == Intrinsic::eh_exceptioncode)
+ N = DAG.getZExtOrTrunc(N, sdl, MVT::i32);
+ setValue(&I, N);
+ return;
+ }
+ case Intrinsic::xray_customevent: {
+ // Here we want to make sure that the intrinsic behaves as if it has a
+ // specific calling convention.
+ const auto &Triple = DAG.getTarget().getTargetTriple();
+ if (!Triple.isAArch64(64) && Triple.getArch() != Triple::x86_64)
+ return;
+
+ SmallVector<SDValue, 8> Ops;
+
+ // We want to say that we always want the arguments in registers.
+ SDValue LogEntryVal = getValue(I.getArgOperand(0));
+ SDValue StrSizeVal = getValue(I.getArgOperand(1));
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Chain = getRoot();
+ Ops.push_back(LogEntryVal);
+ Ops.push_back(StrSizeVal);
+ Ops.push_back(Chain);
+
+    // We need to enforce the calling convention for the callsite so that
+    // argument ordering is enforced correctly and register allocation can see
+    // that some registers are assumed clobbered and must be preserved across
+    // calls to the intrinsic.
+ MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL,
+ sdl, NodeTys, Ops);
+ SDValue patchableNode = SDValue(MN, 0);
+ DAG.setRoot(patchableNode);
+ setValue(&I, patchableNode);
+ return;
+ }
+ case Intrinsic::xray_typedevent: {
+ // Here we want to make sure that the intrinsic behaves as if it has a
+ // specific calling convention.
+ const auto &Triple = DAG.getTarget().getTargetTriple();
+ if (!Triple.isAArch64(64) && Triple.getArch() != Triple::x86_64)
+ return;
+
+ SmallVector<SDValue, 8> Ops;
+
+ // We want to say that we always want the arguments in registers.
+ // It's unclear to me how manipulating the selection DAG here forces callers
+ // to provide arguments in registers instead of on the stack.
+ SDValue LogTypeId = getValue(I.getArgOperand(0));
+ SDValue LogEntryVal = getValue(I.getArgOperand(1));
+ SDValue StrSizeVal = getValue(I.getArgOperand(2));
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Chain = getRoot();
+ Ops.push_back(LogTypeId);
+ Ops.push_back(LogEntryVal);
+ Ops.push_back(StrSizeVal);
+ Ops.push_back(Chain);
+
+    // We need to enforce the calling convention for the callsite so that
+    // argument ordering is enforced correctly and register allocation can see
+    // that some registers are assumed clobbered and must be preserved across
+    // calls to the intrinsic.
+ MachineSDNode *MN = DAG.getMachineNode(
+ TargetOpcode::PATCHABLE_TYPED_EVENT_CALL, sdl, NodeTys, Ops);
+ SDValue patchableNode = SDValue(MN, 0);
+ DAG.setRoot(patchableNode);
+ setValue(&I, patchableNode);
+ return;
+ }
+ case Intrinsic::experimental_deoptimize:
+ LowerDeoptimizeCall(&I);
+ return;
+ case Intrinsic::experimental_stepvector:
+ visitStepVector(I);
+ return;
+ case Intrinsic::vector_reduce_fadd:
+ case Intrinsic::vector_reduce_fmul:
+ case Intrinsic::vector_reduce_add:
+ case Intrinsic::vector_reduce_mul:
+ case Intrinsic::vector_reduce_and:
+ case Intrinsic::vector_reduce_or:
+ case Intrinsic::vector_reduce_xor:
+ case Intrinsic::vector_reduce_smax:
+ case Intrinsic::vector_reduce_smin:
+ case Intrinsic::vector_reduce_umax:
+ case Intrinsic::vector_reduce_umin:
+ case Intrinsic::vector_reduce_fmax:
+ case Intrinsic::vector_reduce_fmin:
+ case Intrinsic::vector_reduce_fmaximum:
+ case Intrinsic::vector_reduce_fminimum:
+ visitVectorReduce(I, Intrinsic);
+ return;
+
+ case Intrinsic::icall_branch_funnel: {
+ SmallVector<SDValue, 16> Ops;
+ Ops.push_back(getValue(I.getArgOperand(0)));
+
+ int64_t Offset;
+ auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
+ I.getArgOperand(1), Offset, DAG.getDataLayout()));
+ if (!Base)
+ report_fatal_error(
+ "llvm.icall.branch.funnel operand must be a GlobalValue");
+ Ops.push_back(DAG.getTargetGlobalAddress(Base, sdl, MVT::i64, 0));
+
+ struct BranchFunnelTarget {
+ int64_t Offset;
+ SDValue Target;
+ };
+ SmallVector<BranchFunnelTarget, 8> Targets;
+
+ for (unsigned Op = 1, N = I.arg_size(); Op != N; Op += 2) {
+ auto *ElemBase = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
+ I.getArgOperand(Op), Offset, DAG.getDataLayout()));
+ if (ElemBase != Base)
+ report_fatal_error("all llvm.icall.branch.funnel operands must refer "
+ "to the same GlobalValue");
+
+ SDValue Val = getValue(I.getArgOperand(Op + 1));
+ auto *GA = dyn_cast<GlobalAddressSDNode>(Val);
+ if (!GA)
+ report_fatal_error(
+ "llvm.icall.branch.funnel operand must be a GlobalValue");
+ Targets.push_back({Offset, DAG.getTargetGlobalAddress(
+ GA->getGlobal(), sdl, Val.getValueType(),
+ GA->getOffset())});
+ }
+ llvm::sort(Targets,
+ [](const BranchFunnelTarget &T1, const BranchFunnelTarget &T2) {
+ return T1.Offset < T2.Offset;
+ });
+
+ for (auto &T : Targets) {
+ Ops.push_back(DAG.getTargetConstant(T.Offset, sdl, MVT::i32));
+ Ops.push_back(T.Target);
+ }
+
+ Ops.push_back(DAG.getRoot()); // Chain
+ SDValue N(DAG.getMachineNode(TargetOpcode::ICALL_BRANCH_FUNNEL, sdl,
+ MVT::Other, Ops),
+ 0);
+ DAG.setRoot(N);
+ setValue(&I, N);
+ HasTailCall = true;
+ return;
+ }
+
+ case Intrinsic::wasm_landingpad_index:
+    // The information this intrinsic contained has been transferred to the
+    // MachineFunction in SelectionDAGISel::PrepareEHLandingPad, so we can
+    // safely drop it now.
+ return;
+
+ case Intrinsic::aarch64_settag:
+ case Intrinsic::aarch64_settag_zero: {
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ bool ZeroMemory = Intrinsic == Intrinsic::aarch64_settag_zero;
+ SDValue Val = TSI.EmitTargetCodeForSetTag(
+ DAG, sdl, getRoot(), getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), MachinePointerInfo(I.getArgOperand(0)),
+ ZeroMemory);
+ DAG.setRoot(Val);
+ setValue(&I, Val);
+ return;
+ }
+ case Intrinsic::ptrmask: {
+ SDValue Ptr = getValue(I.getOperand(0));
+ SDValue Const = getValue(I.getOperand(1));
+
+ EVT PtrVT = Ptr.getValueType();
+ setValue(&I, DAG.getNode(ISD::AND, sdl, PtrVT, Ptr,
+ DAG.getZExtOrTrunc(Const, sdl, PtrVT)));
+ return;
+ }
+ case Intrinsic::threadlocal_address: {
+ setValue(&I, getValue(I.getOperand(0)));
+ return;
+ }
+ case Intrinsic::get_active_lane_mask: {
+ EVT CCVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ SDValue Index = getValue(I.getOperand(0));
+ EVT ElementVT = Index.getValueType();
+
+ if (!TLI.shouldExpandGetActiveLaneMask(CCVT, ElementVT)) {
+ visitTargetIntrinsic(I, Intrinsic);
+ return;
+ }
+
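+    // Expand the mask manually: lane i is active iff Index + i is below the
+    // trip count, using a saturating add so the induction cannot wrap.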
+ SDValue TripCount = getValue(I.getOperand(1));
+ EVT VecTy = EVT::getVectorVT(*DAG.getContext(), ElementVT,
+ CCVT.getVectorElementCount());
+
+ SDValue VectorIndex = DAG.getSplat(VecTy, sdl, Index);
+ SDValue VectorTripCount = DAG.getSplat(VecTy, sdl, TripCount);
+ SDValue VectorStep = DAG.getStepVector(sdl, VecTy);
+ SDValue VectorInduction = DAG.getNode(
+ ISD::UADDSAT, sdl, VecTy, VectorIndex, VectorStep);
+ SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction,
+ VectorTripCount, ISD::CondCode::SETULT);
+ setValue(&I, SetCC);
+ return;
+ }
+ case Intrinsic::experimental_get_vector_length: {
+ assert(cast<ConstantInt>(I.getOperand(1))->getSExtValue() > 0 &&
+ "Expected positive VF");
+ unsigned VF = cast<ConstantInt>(I.getOperand(1))->getZExtValue();
+ bool IsScalable = cast<ConstantInt>(I.getOperand(2))->isOne();
+
+ SDValue Count = getValue(I.getOperand(0));
+ EVT CountVT = Count.getValueType();
+
+ if (!TLI.shouldExpandGetVectorLength(CountVT, VF, IsScalable)) {
+ visitTargetIntrinsic(I, Intrinsic);
+ return;
+ }
+
+    // Expand to a umin between the trip count and the maximum number of
+    // elements the type can hold.
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ // Extend the trip count to at least the result VT.
+ if (CountVT.bitsLT(VT)) {
+ Count = DAG.getNode(ISD::ZERO_EXTEND, sdl, VT, Count);
+ CountVT = VT;
+ }
+
+ SDValue MaxEVL = DAG.getElementCount(sdl, CountVT,
+ ElementCount::get(VF, IsScalable));
+
+ SDValue UMin = DAG.getNode(ISD::UMIN, sdl, CountVT, Count, MaxEVL);
+ // Clip to the result type if needed.
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, sdl, VT, UMin);
+
+ setValue(&I, Trunc);
+ return;
+ }
+ case Intrinsic::vector_insert: {
+ SDValue Vec = getValue(I.getOperand(0));
+ SDValue SubVec = getValue(I.getOperand(1));
+ SDValue Index = getValue(I.getOperand(2));
+
+ // The intrinsic's index type is i64, but the SDNode requires an index type
+ // suitable for the target. Convert the index as required.
+ MVT VectorIdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ if (Index.getValueType() != VectorIdxTy)
+ Index = DAG.getVectorIdxConstant(
+ cast<ConstantSDNode>(Index)->getZExtValue(), sdl);
+
+ EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I, DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, ResultVT, Vec, SubVec,
+ Index));
+ return;
+ }
+ case Intrinsic::vector_extract: {
+ SDValue Vec = getValue(I.getOperand(0));
+ SDValue Index = getValue(I.getOperand(1));
+ EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ // The intrinsic's index type is i64, but the SDNode requires an index type
+ // suitable for the target. Convert the index as required.
+ MVT VectorIdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ if (Index.getValueType() != VectorIdxTy)
+ Index = DAG.getVectorIdxConstant(
+ cast<ConstantSDNode>(Index)->getZExtValue(), sdl);
+
+ setValue(&I,
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, ResultVT, Vec, Index));
+ return;
+ }
+ case Intrinsic::experimental_vector_reverse:
+ visitVectorReverse(I);
+ return;
+ case Intrinsic::experimental_vector_splice:
+ visitVectorSplice(I);
+ return;
+ case Intrinsic::callbr_landingpad:
+ visitCallBrLandingPad(I);
+ return;
+ case Intrinsic::experimental_vector_interleave2:
+ visitVectorInterleave(I);
+ return;
+ case Intrinsic::experimental_vector_deinterleave2:
+ visitVectorDeinterleave(I);
+ return;
+ }
+}
+
+void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
+ const ConstrainedFPIntrinsic &FPI) {
+ SDLoc sdl = getCurSDLoc();
+
+ // We do not need to serialize constrained FP intrinsics against
+ // each other or against (nonvolatile) loads, so they can be
+ // chained like loads.
+ SDValue Chain = DAG.getRoot();
+ SmallVector<SDValue, 4> Opers;
+ Opers.push_back(Chain);
+ if (FPI.isUnaryOp()) {
+ Opers.push_back(getValue(FPI.getArgOperand(0)));
+ } else if (FPI.isTernaryOp()) {
+ Opers.push_back(getValue(FPI.getArgOperand(0)));
+ Opers.push_back(getValue(FPI.getArgOperand(1)));
+ Opers.push_back(getValue(FPI.getArgOperand(2)));
+ } else {
+ Opers.push_back(getValue(FPI.getArgOperand(0)));
+ Opers.push_back(getValue(FPI.getArgOperand(1)));
+ }
+
+ auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) {
+ assert(Result.getNode()->getNumValues() == 2);
+
+ // Push node to the appropriate list so that future instructions can be
+ // chained up correctly.
+ SDValue OutChain = Result.getValue(1);
+ switch (EB) {
+ case fp::ExceptionBehavior::ebIgnore:
+ // The only reason why ebIgnore nodes still need to be chained is that
+ // they might depend on the current rounding mode, and therefore must
+      // not be moved across instructions that may change that mode.
+ [[fallthrough]];
+ case fp::ExceptionBehavior::ebMayTrap:
+ // These must not be moved across calls or instructions that may change
+ // floating-point exception masks.
+ PendingConstrainedFP.push_back(OutChain);
+ break;
+ case fp::ExceptionBehavior::ebStrict:
+ // These must not be moved across calls or instructions that may change
+ // floating-point exception masks or read floating-point exception flags.
+ // In addition, they cannot be optimized out even if unused.
+ PendingConstrainedFPStrict.push_back(OutChain);
+ break;
+ }
+ };
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), FPI.getType());
+ SDVTList VTs = DAG.getVTList(VT, MVT::Other);
+ fp::ExceptionBehavior EB = *FPI.getExceptionBehavior();
+
+ SDNodeFlags Flags;
+ if (EB == fp::ExceptionBehavior::ebIgnore)
+ Flags.setNoFPExcept(true);
+
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&FPI))
+ Flags.copyFMF(*FPOp);
+
+ unsigned Opcode;
+ switch (FPI.getIntrinsicID()) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case Intrinsic::INTRINSIC: \
+ Opcode = ISD::STRICT_##DAGN; \
+ break;
+#include "llvm/IR/ConstrainedOps.def"
+ case Intrinsic::experimental_constrained_fmuladd: {
+ Opcode = ISD::STRICT_FMA;
+ // Break fmuladd into fmul and fadd.
+ if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict ||
+ !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
+ Opers.pop_back();
+ SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers, Flags);
+ pushOutChain(Mul, EB);
+ Opcode = ISD::STRICT_FADD;
+ Opers.clear();
+ Opers.push_back(Mul.getValue(1));
+ Opers.push_back(Mul.getValue(0));
+ Opers.push_back(getValue(FPI.getArgOperand(2)));
+ }
+ break;
+ }
+ }
+
+ // A few strict DAG nodes carry additional operands that are not
+ // set up by the default code above.
+ switch (Opcode) {
+ default: break;
+ case ISD::STRICT_FP_ROUND:
+ Opers.push_back(
+ DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())));
+ break;
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: {
+ auto *FPCmp = dyn_cast<ConstrainedFPCmpIntrinsic>(&FPI);
+ ISD::CondCode Condition = getFCmpCondCode(FPCmp->getPredicate());
+ if (TM.Options.NoNaNsFPMath)
+ Condition = getFCmpCodeWithoutNaN(Condition);
+ Opers.push_back(DAG.getCondCode(Condition));
+ break;
+ }
+ }
+
+ SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers, Flags);
+ pushOutChain(Result, EB);
+
+ SDValue FPResult = Result.getValue(0);
+ setValue(&FPI, FPResult);
+}
+
+static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) {
+ std::optional<unsigned> ResOPC;
+ switch (VPIntrin.getIntrinsicID()) {
+ case Intrinsic::vp_ctlz: {
+ bool IsZeroUndef = cast<ConstantInt>(VPIntrin.getArgOperand(1))->isOne();
+ ResOPC = IsZeroUndef ? ISD::VP_CTLZ_ZERO_UNDEF : ISD::VP_CTLZ;
+ break;
+ }
+ case Intrinsic::vp_cttz: {
+ bool IsZeroUndef = cast<ConstantInt>(VPIntrin.getArgOperand(1))->isOne();
+ ResOPC = IsZeroUndef ? ISD::VP_CTTZ_ZERO_UNDEF : ISD::VP_CTTZ;
+ break;
+ }
+#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
+ case Intrinsic::VPID: \
+ ResOPC = ISD::VPSD; \
+ break;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+
+ if (!ResOPC)
+ llvm_unreachable(
+ "Inconsistency: no SDNode available for this VPIntrinsic!");
+
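+  // A sequential (ordered) FP reduction may be relaxed to the unordered form
+  // when the intrinsic permits reassociation.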
+ if (*ResOPC == ISD::VP_REDUCE_SEQ_FADD ||
+ *ResOPC == ISD::VP_REDUCE_SEQ_FMUL) {
+ if (VPIntrin.getFastMathFlags().allowReassoc())
+ return *ResOPC == ISD::VP_REDUCE_SEQ_FADD ? ISD::VP_REDUCE_FADD
+ : ISD::VP_REDUCE_FMUL;
+ }
+
+ return *ResOPC;
+}
+
+void SelectionDAGBuilder::visitVPLoad(
+ const VPIntrinsic &VPIntrin, EVT VT,
+ const SmallVectorImpl<SDValue> &OpValues) {
+ SDLoc DL = getCurSDLoc();
+ Value *PtrOperand = VPIntrin.getArgOperand(0);
+ MaybeAlign Alignment = VPIntrin.getPointerAlignment();
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ const MDNode *Ranges = getRangeMetadata(VPIntrin);
+ SDValue LD;
+ // Do not serialize variable-length loads of constant memory with
+ // anything.
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT);
+ MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo);
+ bool AddToChain = !AA || !AA->pointsToConstantMemory(ML);
+ SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+ LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
+ MMO, false /*IsExpanding */);
+ if (AddToChain)
+ PendingLoads.push_back(LD.getValue(1));
+ setValue(&VPIntrin, LD);
+}
+
+void SelectionDAGBuilder::visitVPGather(
+ const VPIntrinsic &VPIntrin, EVT VT,
+ const SmallVectorImpl<SDValue> &OpValues) {
+ SDLoc DL = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ Value *PtrOperand = VPIntrin.getArgOperand(0);
+ MaybeAlign Alignment = VPIntrin.getPointerAlignment();
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ const MDNode *Ranges = getRangeMetadata(VPIntrin);
+ SDValue LD;
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT.getScalarType());
+ unsigned AS =
+ PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+ SDValue Base, Index, Scale;
+ ISD::MemIndexType IndexType;
+ bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
+ this, VPIntrin.getParent(),
+ VT.getScalarStoreSize());
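+  // If no single scalar base could be split out of the pointer vector, gather
+  // from base zero using the full pointer vector as the index with a scale of
+  // one.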
+ if (!UniformBase) {
+ Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(PtrOperand);
+ IndexType = ISD::SIGNED_SCALED;
+ Scale = DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+ EVT IdxVT = Index.getValueType();
+ EVT EltTy = IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+ EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+ Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
+ }
+ LD = DAG.getGatherVP(
+ DAG.getVTList(VT, MVT::Other), VT, DL,
+ {DAG.getRoot(), Base, Index, Scale, OpValues[1], OpValues[2]}, MMO,
+ IndexType);
+ PendingLoads.push_back(LD.getValue(1));
+ setValue(&VPIntrin, LD);
+}
+
+void SelectionDAGBuilder::visitVPStore(
+ const VPIntrinsic &VPIntrin, const SmallVectorImpl<SDValue> &OpValues) {
+ SDLoc DL = getCurSDLoc();
+ Value *PtrOperand = VPIntrin.getArgOperand(1);
+ EVT VT = OpValues[0].getValueType();
+ MaybeAlign Alignment = VPIntrin.getPointerAlignment();
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ SDValue ST;
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT);
+ SDValue Ptr = OpValues[1];
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo);
+ ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset,
+ OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED,
+ /* IsTruncating */ false, /*IsCompressing*/ false);
+ DAG.setRoot(ST);
+ setValue(&VPIntrin, ST);
+}
+
+void SelectionDAGBuilder::visitVPScatter(
+ const VPIntrinsic &VPIntrin, const SmallVectorImpl<SDValue> &OpValues) {
+ SDLoc DL = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ Value *PtrOperand = VPIntrin.getArgOperand(1);
+ EVT VT = OpValues[0].getValueType();
+ MaybeAlign Alignment = VPIntrin.getPointerAlignment();
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ SDValue ST;
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT.getScalarType());
+ unsigned AS =
+ PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo);
+ SDValue Base, Index, Scale;
+ ISD::MemIndexType IndexType;
+ bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
+ this, VPIntrin.getParent(),
+ VT.getScalarStoreSize());
+ if (!UniformBase) {
+ Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(PtrOperand);
+ IndexType = ISD::SIGNED_SCALED;
+ Scale =
+ DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+ EVT IdxVT = Index.getValueType();
+ EVT EltTy = IdxVT.getVectorElementType();
+ if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) {
+ EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy);
+ Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
+ }
+ ST = DAG.getScatterVP(DAG.getVTList(MVT::Other), VT, DL,
+ {getMemoryRoot(), OpValues[0], Base, Index, Scale,
+ OpValues[2], OpValues[3]},
+ MMO, IndexType);
+ DAG.setRoot(ST);
+ setValue(&VPIntrin, ST);
+}
+
+void SelectionDAGBuilder::visitVPStridedLoad(
+ const VPIntrinsic &VPIntrin, EVT VT,
+ const SmallVectorImpl<SDValue> &OpValues) {
+ SDLoc DL = getCurSDLoc();
+ Value *PtrOperand = VPIntrin.getArgOperand(0);
+ MaybeAlign Alignment = VPIntrin.getPointerAlignment();
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT.getScalarType());
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ const MDNode *Ranges = getRangeMetadata(VPIntrin);
+ MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo);
+ bool AddToChain = !AA || !AA->pointsToConstantMemory(ML);
+ SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges);
+
+ SDValue LD = DAG.getStridedLoadVP(VT, DL, InChain, OpValues[0], OpValues[1],
+ OpValues[2], OpValues[3], MMO,
+ false /*IsExpanding*/);
+
+ if (AddToChain)
+ PendingLoads.push_back(LD.getValue(1));
+ setValue(&VPIntrin, LD);
+}
+
+void SelectionDAGBuilder::visitVPStridedStore(
+ const VPIntrinsic &VPIntrin, const SmallVectorImpl<SDValue> &OpValues) {
+ SDLoc DL = getCurSDLoc();
+ Value *PtrOperand = VPIntrin.getArgOperand(1);
+ EVT VT = OpValues[0].getValueType();
+ MaybeAlign Alignment = VPIntrin.getPointerAlignment();
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT.getScalarType());
+ AAMDNodes AAInfo = VPIntrin.getAAMetadata();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, *Alignment, AAInfo);
+
+ SDValue ST = DAG.getStridedStoreVP(
+ getMemoryRoot(), DL, OpValues[0], OpValues[1],
+ DAG.getUNDEF(OpValues[1].getValueType()), OpValues[2], OpValues[3],
+ OpValues[4], VT, MMO, ISD::UNINDEXED, /*IsTruncating*/ false,
+ /*IsCompressing*/ false);
+
+ DAG.setRoot(ST);
+ setValue(&VPIntrin, ST);
+}
+
+void SelectionDAGBuilder::visitVPCmp(const VPCmpIntrinsic &VPIntrin) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc DL = getCurSDLoc();
+
+ ISD::CondCode Condition;
+ CmpInst::Predicate CondCode = VPIntrin.getPredicate();
+ bool IsFP = VPIntrin.getOperand(0)->getType()->isFPOrFPVectorTy();
+ if (IsFP) {
+ // FIXME: Regular fcmps are FPMathOperators which may have fast-math (nnan)
+ // flags, but calls that don't return floating-point types can't be
+ // FPMathOperators, like vp.fcmp. This affects constrained fcmp too.
+ Condition = getFCmpCondCode(CondCode);
+ if (TM.Options.NoNaNsFPMath)
+ Condition = getFCmpCodeWithoutNaN(Condition);
+ } else {
+ Condition = getICmpCondCode(CondCode);
+ }
+
+ SDValue Op1 = getValue(VPIntrin.getOperand(0));
+ SDValue Op2 = getValue(VPIntrin.getOperand(1));
+ // #2 is the condition code
+ SDValue MaskOp = getValue(VPIntrin.getOperand(3));
+ SDValue EVL = getValue(VPIntrin.getOperand(4));
+ MVT EVLParamVT = TLI.getVPExplicitVectorLengthTy();
+ assert(EVLParamVT.isScalarInteger() && EVLParamVT.bitsGE(MVT::i32) &&
+ "Unexpected target EVL type");
+ EVL = DAG.getNode(ISD::ZERO_EXTEND, DL, EVLParamVT, EVL);
+
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ VPIntrin.getType());
+ setValue(&VPIntrin,
+ DAG.getSetCCVP(DL, DestVT, Op1, Op2, Condition, MaskOp, EVL));
+}
+
+void SelectionDAGBuilder::visitVectorPredicationIntrinsic(
+ const VPIntrinsic &VPIntrin) {
+ SDLoc DL = getCurSDLoc();
+ unsigned Opcode = getISDForVPIntrinsic(VPIntrin);
+
+ auto IID = VPIntrin.getIntrinsicID();
+
+ if (const auto *CmpI = dyn_cast<VPCmpIntrinsic>(&VPIntrin))
+ return visitVPCmp(*CmpI);
+
+ SmallVector<EVT, 4> ValueVTs;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ ComputeValueVTs(TLI, DAG.getDataLayout(), VPIntrin.getType(), ValueVTs);
+ SDVTList VTs = DAG.getVTList(ValueVTs);
+
+ auto EVLParamPos = VPIntrinsic::getVectorLengthParamPos(IID);
+
+ MVT EVLParamVT = TLI.getVPExplicitVectorLengthTy();
+ assert(EVLParamVT.isScalarInteger() && EVLParamVT.bitsGE(MVT::i32) &&
+ "Unexpected target EVL type");
+
+ // Request operands.
+ SmallVector<SDValue, 7> OpValues;
+ for (unsigned I = 0; I < VPIntrin.arg_size(); ++I) {
+ auto Op = getValue(VPIntrin.getArgOperand(I));
+ if (I == EVLParamPos)
+ Op = DAG.getNode(ISD::ZERO_EXTEND, DL, EVLParamVT, Op);
+ OpValues.push_back(Op);
+ }
+
+ switch (Opcode) {
+ default: {
+ SDNodeFlags SDFlags;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&VPIntrin))
+ SDFlags.copyFMF(*FPMO);
+ SDValue Result = DAG.getNode(Opcode, DL, VTs, OpValues, SDFlags);
+ setValue(&VPIntrin, Result);
+ break;
+ }
+ case ISD::VP_LOAD:
+ visitVPLoad(VPIntrin, ValueVTs[0], OpValues);
+ break;
+ case ISD::VP_GATHER:
+ visitVPGather(VPIntrin, ValueVTs[0], OpValues);
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
+ visitVPStridedLoad(VPIntrin, ValueVTs[0], OpValues);
+ break;
+ case ISD::VP_STORE:
+ visitVPStore(VPIntrin, OpValues);
+ break;
+ case ISD::VP_SCATTER:
+ visitVPScatter(VPIntrin, OpValues);
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+ visitVPStridedStore(VPIntrin, OpValues);
+ break;
+ case ISD::VP_FMULADD: {
+ assert(OpValues.size() == 5 && "Unexpected number of operands");
+ SDNodeFlags SDFlags;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&VPIntrin))
+ SDFlags.copyFMF(*FPMO);
+ if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
+ TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), ValueVTs[0])) {
+ setValue(&VPIntrin, DAG.getNode(ISD::VP_FMA, DL, VTs, OpValues, SDFlags));
+ } else {
+ SDValue Mul = DAG.getNode(
+ ISD::VP_FMUL, DL, VTs,
+ {OpValues[0], OpValues[1], OpValues[3], OpValues[4]}, SDFlags);
+ SDValue Add =
+ DAG.getNode(ISD::VP_FADD, DL, VTs,
+ {Mul, OpValues[2], OpValues[3], OpValues[4]}, SDFlags);
+ setValue(&VPIntrin, Add);
+ }
+ break;
+ }
+ case ISD::VP_INTTOPTR: {
+ SDValue N = OpValues[0];
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), VPIntrin.getType());
+ EVT PtrMemVT = TLI.getMemValueType(DAG.getDataLayout(), VPIntrin.getType());
+ N = DAG.getVPPtrExtOrTrunc(getCurSDLoc(), DestVT, N, OpValues[1],
+ OpValues[2]);
+ N = DAG.getVPZExtOrTrunc(getCurSDLoc(), PtrMemVT, N, OpValues[1],
+ OpValues[2]);
+ setValue(&VPIntrin, N);
+ break;
+ }
+ case ISD::VP_PTRTOINT: {
+ SDValue N = OpValues[0];
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ VPIntrin.getType());
+ EVT PtrMemVT = TLI.getMemValueType(DAG.getDataLayout(),
+ VPIntrin.getOperand(0)->getType());
+ N = DAG.getVPPtrExtOrTrunc(getCurSDLoc(), PtrMemVT, N, OpValues[1],
+ OpValues[2]);
+ N = DAG.getVPZExtOrTrunc(getCurSDLoc(), DestVT, N, OpValues[1],
+ OpValues[2]);
+ setValue(&VPIntrin, N);
+ break;
+ }
+ case ISD::VP_ABS:
+ case ISD::VP_CTLZ:
+ case ISD::VP_CTLZ_ZERO_UNDEF:
+ case ISD::VP_CTTZ:
+ case ISD::VP_CTTZ_ZERO_UNDEF: {
+ SDValue Result =
+ DAG.getNode(Opcode, DL, VTs, {OpValues[0], OpValues[2], OpValues[3]});
+ setValue(&VPIntrin, Result);
+ break;
+ }
+ }
+}
+
+SDValue SelectionDAGBuilder::lowerStartEH(SDValue Chain,
+ const BasicBlock *EHPadBB,
+ MCSymbol *&BeginLabel) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineModuleInfo &MMI = MF.getMMI();
+
+ // Insert a label before the invoke call to mark the try range. This can be
+ // used to detect deletion of the invoke via the MachineModuleInfo.
+ BeginLabel = MMI.getContext().createTempSymbol();
+
+ // For SjLj, keep track of which landing pads go with which invokes
+ // so as to maintain the ordering of pads in the LSDA.
+ unsigned CallSiteIndex = MMI.getCurrentCallSite();
+ if (CallSiteIndex) {
+ MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
+ LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex);
+
+ // Now that the call site is handled, stop tracking it.
+ MMI.setCurrentCallSite(0);
+ }
+
+ return DAG.getEHLabel(getCurSDLoc(), Chain, BeginLabel);
+}
+
+SDValue SelectionDAGBuilder::lowerEndEH(SDValue Chain, const InvokeInst *II,
+ const BasicBlock *EHPadBB,
+ MCSymbol *BeginLabel) {
+ assert(BeginLabel && "BeginLabel should've been set");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineModuleInfo &MMI = MF.getMMI();
+
+ // Insert a label at the end of the invoke call to mark the try range. This
+ // can be used to detect deletion of the invoke via the MachineModuleInfo.
+ MCSymbol *EndLabel = MMI.getContext().createTempSymbol();
+ Chain = DAG.getEHLabel(getCurSDLoc(), Chain, EndLabel);
+
+ // Inform MachineModuleInfo of range.
+ auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+  // Some platforms (e.g. wasm) use funclet-style IR but do not actually use
+  // outlined funclets or their LSDA info style.
+ if (MF.hasEHFunclets() && isFuncletEHPersonality(Pers)) {
+ assert(II && "II should've been set");
+ WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
+ EHInfo->addIPToStateRange(II, BeginLabel, EndLabel);
+ } else if (!isScopedEHPersonality(Pers)) {
+ assert(EHPadBB);
+ MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
+ }
+
+ return Chain;
+}
+
+std::pair<SDValue, SDValue>
+SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
+ const BasicBlock *EHPadBB) {
+ MCSymbol *BeginLabel = nullptr;
+
+ if (EHPadBB) {
+ // Both PendingLoads and PendingExports must be flushed here;
+ // this call might not return.
+ (void)getRoot();
+ DAG.setRoot(lowerStartEH(getControlRoot(), EHPadBB, BeginLabel));
+ CLI.setChain(getRoot());
+ }
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+
+ assert((CLI.IsTailCall || Result.second.getNode()) &&
+ "Non-null chain expected with non-tail call!");
+ assert((Result.second.getNode() || !Result.first.getNode()) &&
+ "Null value expected with tail call!");
+
+ if (!Result.second.getNode()) {
+ // As a special case, a null chain means that a tail call has been emitted
+ // and the DAG root is already updated.
+ HasTailCall = true;
+
+ // Since there's no actual continuation from this block, nothing can be
+ // relying on us setting vregs for them.
+ PendingExports.clear();
+ } else {
+ DAG.setRoot(Result.second);
+ }
+
+ if (EHPadBB) {
+ DAG.setRoot(lowerEndEH(getRoot(), cast_or_null<InvokeInst>(CLI.CB), EHPadBB,
+ BeginLabel));
+ }
+
+ return Result;
+}
+
+void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
+ bool isTailCall,
+ bool isMustTailCall,
+ const BasicBlock *EHPadBB) {
+ auto &DL = DAG.getDataLayout();
+ FunctionType *FTy = CB.getFunctionType();
+ Type *RetTy = CB.getType();
+
+ TargetLowering::ArgListTy Args;
+ Args.reserve(CB.arg_size());
+
+ const Value *SwiftErrorVal = nullptr;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (isTailCall) {
+ // Avoid emitting tail calls in functions with the disable-tail-calls
+ // attribute.
+ auto *Caller = CB.getParent()->getParent();
+ if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() ==
+ "true" && !isMustTailCall)
+ isTailCall = false;
+
+ // We can't tail call inside a function with a swifterror argument. Lowering
+    // does not support this yet; the value would have to be moved into the
+    // swifterror register before the call.
+ if (TLI.supportSwiftError() &&
+ Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ isTailCall = false;
+ }
+
+ for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) {
+ TargetLowering::ArgListEntry Entry;
+ const Value *V = *I;
+
+ // Skip empty types
+ if (V->getType()->isEmptyTy())
+ continue;
+
+ SDValue ArgNode = getValue(V);
+ Entry.Node = ArgNode; Entry.Ty = V->getType();
+
+ Entry.setAttributes(&CB, I - CB.arg_begin());
+
+ // Use swifterror virtual register as input to the call.
+ if (Entry.IsSwiftError && TLI.supportSwiftError()) {
+ SwiftErrorVal = V;
+      // Find the virtual register for the actual swifterror argument and use
+      // it instead of the Value.
+ Entry.Node =
+ DAG.getRegister(SwiftError.getOrCreateVRegUseAt(&CB, FuncInfo.MBB, V),
+ EVT(TLI.getPointerTy(DL)));
+ }
+
+ Args.push_back(Entry);
+
+    // If we have an explicit sret argument that is an Instruction (i.e., it
+    // might point to function-local memory), we can't meaningfully tail-call.
+ if (Entry.IsSRet && isa<Instruction>(V))
+ isTailCall = false;
+ }
+
+ // If call site has a cfguardtarget operand bundle, create and add an
+ // additional ArgListEntry.
+ if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_cfguardtarget)) {
+ TargetLowering::ArgListEntry Entry;
+ Value *V = Bundle->Inputs[0];
+ SDValue ArgNode = getValue(V);
+ Entry.Node = ArgNode;
+ Entry.Ty = V->getType();
+ Entry.IsCFGuardTarget = true;
+ Args.push_back(Entry);
+ }
+
+ // Check if target-independent constraints permit a tail call here.
+ // Target-dependent constraints are checked within TLI->LowerCallTo.
+ if (isTailCall && !isInTailCallPosition(CB, DAG.getTarget()))
+ isTailCall = false;
+
+  // Disable tail calls if there is a swifterror argument. Targets have not
+ // been updated to support tail calls.
+ if (TLI.supportSwiftError() && SwiftErrorVal)
+ isTailCall = false;
+
+ ConstantInt *CFIType = nullptr;
+ if (CB.isIndirectCall()) {
+ if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_kcfi)) {
+ if (!TLI.supportKCFIBundles())
+ report_fatal_error(
+ "Target doesn't support calls with kcfi operand bundles.");
+ CFIType = cast<ConstantInt>(Bundle->Inputs[0]);
+ assert(CFIType->getType()->isIntegerTy(32) && "Invalid CFI type");
+ }
+ }
+
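+ // Populate the CallLoweringInfo and hand the call off to lowerInvokable,
+ // which performs the actual lowering and the EH-label bookkeeping when an
+ // EH pad block is present.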
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(getRoot())
+ .setCallee(RetTy, FTy, Callee, std::move(Args), CB)
+ .setTailCall(isTailCall)
+ .setConvergent(CB.isConvergent())
+ .setIsPreallocated(
+ CB.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
+ .setCFIType(CFIType);
+ std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
+
+ if (Result.first.getNode()) {
+ Result.first = lowerRangeToAssertZExt(DAG, CB, Result.first);
+ setValue(&CB, Result.first);
+ }
+
+ // The last element of CLI.InVals has the SDValue for swifterror return.
+ // Here we copy it to a virtual register and update SwiftErrorMap for
+ // book-keeping.
+ if (SwiftErrorVal && TLI.supportSwiftError()) {
+ // Get the last element of InVals.
+ SDValue Src = CLI.InVals.back();
+ Register VReg =
+ SwiftError.getOrCreateVRegDefAt(&CB, FuncInfo.MBB, SwiftErrorVal);
+ SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src);
+ DAG.setRoot(CopyNode);
+ }
+}
+
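+/// Load a value of type \p LoadVT from \p PtrVal for use in an expanded
+/// memcmp, folding the load to a constant when possible and avoiding chain
+/// serialization for loads from constant memory.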
+static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
+ SelectionDAGBuilder &Builder) {
+ // Check to see if this load can be trivially constant folded, e.g. if the
+ // input is from a string literal.
+ if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
+ // Cast pointer to the type we really want to load.
+ Type *LoadTy =
+ Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+ if (LoadVT.isVector())
+ LoadTy = FixedVectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
+ LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
+ PointerType::getUnqual(LoadTy));
+
+ if (const Constant *LoadCst =
+ ConstantFoldLoadFromConstPtr(const_cast<Constant *>(LoadInput),
+ LoadTy, Builder.DAG.getDataLayout()))
+ return Builder.getValue(LoadCst);
+ }
+
+ // Otherwise, we have to emit the load. If the pointer is to unfoldable but
+ // still constant memory, the input chain can be the entry node.
+ SDValue Root;
+ bool ConstantMemory = false;
+
+ // Do not serialize (non-volatile) loads of constant memory with anything.
+ if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) {
+ Root = Builder.DAG.getEntryNode();
+ ConstantMemory = true;
+ } else {
+ // Do not serialize non-volatile loads against each other.
+ Root = Builder.DAG.getRoot();
+ }
+
+ SDValue Ptr = Builder.getValue(PtrVal);
+ SDValue LoadVal =
+ Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root, Ptr,
+ MachinePointerInfo(PtrVal), Align(1));
+
+ if (!ConstantMemory)
+ Builder.PendingLoads.push_back(LoadVal.getValue(1));
+ return LoadVal;
+}
+
+/// Record the value for an instruction that produces an integer result,
+/// converting the type where necessary.
+void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
+ SDValue Value,
+ bool IsSigned) {
+ EVT VT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType(), true);
+ Value = DAG.getExtOrTrunc(IsSigned, Value, getCurSDLoc(), VT);
+ setValue(&I, Value);
+}
+
+/// See if we can lower a memcmp/bcmp call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitMemCmpBCmpCall(const CallInst &I) {
+ const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
+ const Value *Size = I.getArgOperand(2);
+ const ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(getValue(Size));
+ if (CSize && CSize->getZExtValue() == 0) {
+ EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType(), true);
+ setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
+ return true;
+ }
+
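+ // Give the target a chance to emit its own optimized memcmp sequence first.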
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp(
+ DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS),
+ getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, true);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
+ // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
+ if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I))
+ return false;
+
+ // If the target has a fast compare for the given size, it will return a
+ // preferred load type for that size. Require that the load VT is legal and
+ // that the target supports unaligned loads of that type. Otherwise, return
+ // INVALID.
+ auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+ if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
+ // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+ // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+ // TODO: Check alignment of src and dest ptrs.
+ unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+ unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+ if (!TLI.isTypeLegal(LVT) ||
+ !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+ !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+ LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ }
+
+ return LVT;
+ };
+
+ // This turns into unaligned loads. We only do this if the target natively
+ // supports the MVT we'll be loading or if it is small enough (<= 4) that
+ // we'll only produce a small number of byte loads.
+ MVT LoadVT;
+ unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+ switch (NumBitsToCompare) {
+ default:
+ return false;
+ case 16:
+ LoadVT = MVT::i16;
+ break;
+ case 32:
+ LoadVT = MVT::i32;
+ break;
+ case 64:
+ case 128:
+ case 256:
+ LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
+ break;
+ }
+
+ if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return false;
+
+ SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+ SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+ // Bitcast to a wide integer type if the loads are vectors.
+ if (LoadVT.isVector()) {
+ EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+ LoadL = DAG.getBitcast(CmpVT, LoadL);
+ LoadR = DAG.getBitcast(CmpVT, LoadR);
+ }
+
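+ // A single wide inequality compare now implements the zero-equality
+ // memcmp: the i1 result is zero-extended to the call's return type.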
+ SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+ processIntegerCallValue(I, Cmp, false);
+ return true;
+}
+
+/// See if we can lower a memchr call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
+ const Value *Src = I.getArgOperand(0);
+ const Value *Char = I.getArgOperand(1);
+ const Value *Length = I.getArgOperand(2);
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Src), getValue(Char), getValue(Length),
+ MachinePointerInfo(Src));
+ if (Res.first.getNode()) {
+ setValue(&I, Res.first);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// See if we can lower a mempcpy call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
+ SDValue Dst = getValue(I.getArgOperand(0));
+ SDValue Src = getValue(I.getArgOperand(1));
+ SDValue Size = getValue(I.getArgOperand(2));
+
+ Align DstAlign = DAG.InferPtrAlign(Dst).valueOrOne();
+ Align SrcAlign = DAG.InferPtrAlign(Src).valueOrOne();
+ // DAG::getMemcpy needs Alignment to be defined.
+ Align Alignment = std::min(DstAlign, SrcAlign);
+
+ SDLoc sdl = getCurSDLoc();
+
+ // In the mempcpy context we need to pass in a false value for isTailCall
+ // because the return pointer needs to be adjusted by the size of
+ // the copied memory.
+ SDValue Root = getMemoryRoot();
+ SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Alignment, false, false,
+ /*isTailCall=*/false,
+ MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)),
+ I.getAAMetadata());
+ assert(MC.getNode() != nullptr &&
+ "** memcpy should not be lowered as TailCall in mempcpy context **");
+ DAG.setRoot(MC);
+
+ // Check if Size needs to be truncated or extended.
+ Size = DAG.getSExtOrTrunc(Size, sdl, Dst.getValueType());
+
+ // Adjust return pointer to point just past the last dst byte.
+ SDValue DstPlusSize = DAG.getNode(ISD::ADD, sdl, Dst.getValueType(),
+ Dst, Size);
+ setValue(&I, DstPlusSize);
+ return true;
+}
+
+/// See if we can lower a strcpy call into an optimized form. If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0),
+ MachinePointerInfo(Arg1), isStpcpy);
+ if (Res.first.getNode()) {
+ setValue(&I, Res.first);
+ DAG.setRoot(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// See if we can lower a strcmp call into an optimized form. If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0),
+ MachinePointerInfo(Arg1));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, true);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// See if we can lower a strlen call into an optimized form. If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
+ const Value *Arg0 = I.getArgOperand(0);
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), MachinePointerInfo(Arg0));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, false);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// See if we can lower a strnlen call into an optimized form. If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, false);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// See if we can lower a unary floating-point operation into an SDNode with
+/// the specified Opcode. If so, return true and lower it, otherwise return
+/// false and it will be lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
+ unsigned Opcode) {
+ // We already checked this call's prototype; verify it doesn't modify errno.
+ if (!I.onlyReadsMemory())
+ return false;
+
+ SDNodeFlags Flags;
+ Flags.copyFMF(cast<FPMathOperator>(I));
+
+ SDValue Tmp = getValue(I.getArgOperand(0));
+ setValue(&I,
+ DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp, Flags));
+ return true;
+}
+
+/// See if we can lower a binary floating-point operation into an SDNode with
+/// the specified Opcode. If so, return true and lower it. Otherwise return
+/// false, and it will be lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
+bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
+ unsigned Opcode) {
+ // We already checked this call's prototype; verify it doesn't modify errno.
+ if (!I.onlyReadsMemory())
+ return false;
+
+ SDNodeFlags Flags;
+ Flags.copyFMF(cast<FPMathOperator>(I));
+
+ SDValue Tmp0 = getValue(I.getArgOperand(0));
+ SDValue Tmp1 = getValue(I.getArgOperand(1));
+ EVT VT = Tmp0.getValueType();
+ setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1, Flags));
+ return true;
+}
+
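+/// Lower a call instruction: inline asm is handled separately, intrinsic
+/// calls are dispatched to visitIntrinsicCall, recognized libc/libm calls get
+/// the optimized lowerings above, deopt-bundle calls go through
+/// LowerCallSiteWithDeoptBundle, and everything else is lowered via
+/// LowerCallTo.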
+void SelectionDAGBuilder::visitCall(const CallInst &I) {
+ // Handle inline assembly differently.
+ if (I.isInlineAsm()) {
+ visitInlineAsm(I);
+ return;
+ }
+
+ diagnoseDontCall(I);
+
+ if (Function *F = I.getCalledFunction()) {
+ if (F->isDeclaration()) {
+ // Is this an LLVM intrinsic or a target-specific intrinsic?
+ unsigned IID = F->getIntrinsicID();
+ if (!IID)
+ if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo())
+ IID = II->getIntrinsicID(F);
+
+ if (IID) {
+ visitIntrinsicCall(I, IID);
+ return;
+ }
+ }
+
+ // Check for well-known libc/libm calls. If the function is internal, it
+ // can't be a library call. Don't do the check if marked as nobuiltin for
+ // some reason or the call site requires strict floating point semantics.
+ LibFunc Func;
+ if (!I.isNoBuiltin() && !I.isStrictFP() && !F->hasLocalLinkage() &&
+ F->hasName() && LibInfo->getLibFunc(*F, Func) &&
+ LibInfo->hasOptimizedCodeGen(Func)) {
+ switch (Func) {
+ default: break;
+ case LibFunc_bcmp:
+ if (visitMemCmpBCmpCall(I))
+ return;
+ break;
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ case LibFunc_copysignl:
+ // We already checked this call's prototype; verify it doesn't modify
+ // errno.
+ if (I.onlyReadsMemory()) {
+ SDValue LHS = getValue(I.getArgOperand(0));
+ SDValue RHS = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
+ LHS.getValueType(), LHS, RHS));
+ return;
+ }
+ break;
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
+ if (visitUnaryFloatCall(I, ISD::FABS))
+ return;
+ break;
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
+ if (visitBinaryFloatCall(I, ISD::FMINNUM))
+ return;
+ break;
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
+ if (visitBinaryFloatCall(I, ISD::FMAXNUM))
+ return;
+ break;
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinl:
+ if (visitUnaryFloatCall(I, ISD::FSIN))
+ return;
+ break;
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosl:
+ if (visitUnaryFloatCall(I, ISD::FCOS))
+ return;
+ break;
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
+ case LibFunc_sqrt_finite:
+ case LibFunc_sqrtf_finite:
+ case LibFunc_sqrtl_finite:
+ if (visitUnaryFloatCall(I, ISD::FSQRT))
+ return;
+ break;
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
+ if (visitUnaryFloatCall(I, ISD::FFLOOR))
+ return;
+ break;
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
+ if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
+ return;
+ break;
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
+ if (visitUnaryFloatCall(I, ISD::FCEIL))
+ return;
+ break;
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
+ if (visitUnaryFloatCall(I, ISD::FRINT))
+ return;
+ break;
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
+ if (visitUnaryFloatCall(I, ISD::FROUND))
+ return;
+ break;
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
+ if (visitUnaryFloatCall(I, ISD::FTRUNC))
+ return;
+ break;
+ case LibFunc_log2:
+ case LibFunc_log2f:
+ case LibFunc_log2l:
+ if (visitUnaryFloatCall(I, ISD::FLOG2))
+ return;
+ break;
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
+ if (visitUnaryFloatCall(I, ISD::FEXP2))
+ return;
+ break;
+ case LibFunc_ldexp:
+ case LibFunc_ldexpf:
+ case LibFunc_ldexpl:
+ if (visitBinaryFloatCall(I, ISD::FLDEXP))
+ return;
+ break;
+ case LibFunc_memcmp:
+ if (visitMemCmpBCmpCall(I))
+ return;
+ break;
+ case LibFunc_mempcpy:
+ if (visitMemPCpyCall(I))
+ return;
+ break;
+ case LibFunc_memchr:
+ if (visitMemChrCall(I))
+ return;
+ break;
+ case LibFunc_strcpy:
+ if (visitStrCpyCall(I, false))
+ return;
+ break;
+ case LibFunc_stpcpy:
+ if (visitStrCpyCall(I, true))
+ return;
+ break;
+ case LibFunc_strcmp:
+ if (visitStrCmpCall(I))
+ return;
+ break;
+ case LibFunc_strlen:
+ if (visitStrLenCall(I))
+ return;
+ break;
+ case LibFunc_strnlen:
+ if (visitStrNLenCall(I))
+ return;
+ break;
+ }
+ }
+ }
+
+ // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
+ // have to do anything here to lower funclet bundles.
+ // CFGuardTarget bundles are lowered in LowerCallTo.
+ assert(!I.hasOperandBundlesOtherThan(
+ {LLVMContext::OB_deopt, LLVMContext::OB_funclet,
+ LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated,
+ LLVMContext::OB_clang_arc_attachedcall, LLVMContext::OB_kcfi}) &&
+ "Cannot lower calls with arbitrary operand bundles!");
+
+ SDValue Callee = getValue(I.getCalledOperand());
+
+ if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
+ LowerCallSiteWithDeoptBundle(&I, Callee, nullptr);
+ else
+ // Check if we can potentially perform a tail call. More detailed checking
+ // is done within LowerCallTo, after more information about the call is
+ // known.
+ LowerCallTo(I, Callee, I.isTailCall(), I.isMustTailCall());
+}
+
+namespace {
+
+/// AsmOperandInfo - This contains information for each constraint that we are
+/// lowering.
+class SDISelAsmOperandInfo : public TargetLowering::AsmOperandInfo {
+public:
+ /// CallOperand - If this is the result output operand or a clobber
+ /// this is null, otherwise it is the incoming operand to the CallInst.
+ /// This gets modified as the asm is processed.
+ SDValue CallOperand;
+
+ /// AssignedRegs - If this is a register or register class operand, this
+ /// contains the set of registers corresponding to the operand.
+ RegsForValue AssignedRegs;
+
+ explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info)
+ : TargetLowering::AsmOperandInfo(info), CallOperand(nullptr, 0) {
+ }
+
+ /// Whether or not this operand accesses memory
+ bool hasMemory(const TargetLowering &TLI) const {
+ // Indirect operand accesses access memory.
+ if (isIndirect)
+ return true;
+
+ for (const auto &Code : Codes)
+ if (TLI.getConstraintType(Code) == TargetLowering::C_Memory)
+ return true;
+
+ return false;
+ }
+};
+
+
+} // end anonymous namespace
+
+/// Make sure that the output operand \p OpInfo and its corresponding input
+/// operand \p MatchingOpInfo have compatible constraint types (otherwise error
+/// out).
+static void patchMatchingInput(const SDISelAsmOperandInfo &OpInfo,
+ SDISelAsmOperandInfo &MatchingOpInfo,
+ SelectionDAG &DAG) {
+ if (OpInfo.ConstraintVT == MatchingOpInfo.ConstraintVT)
+ return;
+
+ const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ const auto &TLI = DAG.getTargetLoweringInfo();
+
+ std::pair<unsigned, const TargetRegisterClass *> MatchRC =
+ TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
+ std::pair<unsigned, const TargetRegisterClass *> InputRC =
+ TLI.getRegForInlineAsmConstraint(TRI, MatchingOpInfo.ConstraintCode,
+ MatchingOpInfo.ConstraintVT);
+ if ((OpInfo.ConstraintVT.isInteger() !=
+ MatchingOpInfo.ConstraintVT.isInteger()) ||
+ (MatchRC.second != InputRC.second)) {
+ // FIXME: error out in a more elegant fashion
+ report_fatal_error("Unsupported asm: input constraint"
+ " with a matching output constraint of"
+ " incompatible type!");
+ }
+ MatchingOpInfo.ConstraintVT = OpInfo.ConstraintVT;
+}
+
+/// Get a direct memory input to behave well as an indirect operand.
+/// This may introduce stores, hence the need for a \p Chain.
+/// \return The (possibly updated) chain.
+static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
+ SDISelAsmOperandInfo &OpInfo,
+ SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // If we don't have an indirect input, put it in the constpool if we can,
+ // otherwise spill it to a stack slot.
+ // TODO: This isn't quite right. We need to handle these according to
+ // the addressing mode that the constraint wants. Also, this may take
+ // an additional register for the computation and we don't want that
+ // either.
+
+ // If the operand is a float, integer, or vector constant, spill to a
+ // constant pool entry to get its address.
+ const Value *OpVal = OpInfo.CallOperandVal;
+ if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
+ isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) {
+ OpInfo.CallOperand = DAG.getConstantPool(
+ cast<Constant>(OpVal), TLI.getPointerTy(DAG.getDataLayout()));
+ return Chain;
+ }
+
+ // Otherwise, create a stack slot and emit a store to it before the asm.
+ Type *Ty = OpVal->getType();
+ auto &DL = DAG.getDataLayout();
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo().CreateStackObject(
+ TySize, DL.getPrefTypeAlign(Ty), false);
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL));
+ Chain = DAG.getTruncStore(Chain, Location, OpInfo.CallOperand, StackSlot,
+ MachinePointerInfo::getFixedStack(MF, SSFI),
+ TLI.getMemValueType(DL, Ty));
+ OpInfo.CallOperand = StackSlot;
+
+ return Chain;
+}
+
+/// GetRegistersForValue - Assign registers (virtual or physical) for the
+/// specified operand. We prefer to assign virtual registers, to allow the
+/// register allocator to handle the assignment process. However, if the asm
+/// uses features that we can't model on machineinstrs, we have SDISel do the
+/// allocation. This produces generally horrible, but correct, code.
+///
+/// OpInfo describes the operand
+/// RefOpInfo describes the matching operand if any, the operand otherwise
+static std::optional<unsigned>
+getRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
+ SDISelAsmOperandInfo &OpInfo,
+ SDISelAsmOperandInfo &RefOpInfo) {
+ LLVMContext &Context = *DAG.getContext();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ SmallVector<unsigned, 4> Regs;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ // No work to do for memory/address operands.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
+ OpInfo.ConstraintType == TargetLowering::C_Address)
+ return std::nullopt;
+
+ // If this is a constraint for a single physreg, or a constraint for a
+ // register class, find it.
+ unsigned AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = TLI.getRegForInlineAsmConstraint(
+ &TRI, RefOpInfo.ConstraintCode, RefOpInfo.ConstraintVT);
+ // RC is unset only on failure. Return immediately.
+ if (!RC)
+ return std::nullopt;
+
+ // Get the actual register value type. This is important, because the user
+ // may have asked for (e.g.) the AX register in i32 type. We need to
+ // remember that AX is actually i16 to get the right extension.
+ const MVT RegVT = *TRI.legalclasstypes_begin(*RC);
+
+ if (OpInfo.ConstraintVT != MVT::Other && RegVT != MVT::Untyped) {
+ // If this is an FP operand in an integer register (or vice versa), or more
+ // generally if the operand value disagrees with the register class we plan
+ // to stick it in, fix the operand type.
+ //
+ // If this is an input value, the bitcast to the new type is done now.
+ // Bitcast for output value is done at the end of visitInlineAsm().
+ if ((OpInfo.Type == InlineAsm::isOutput ||
+ OpInfo.Type == InlineAsm::isInput) &&
+ !TRI.isTypeLegalForClass(*RC, OpInfo.ConstraintVT)) {
+ // Try to convert to the first EVT that the reg class contains. If the
+ // types are identical size, use a bitcast to convert (e.g. two differing
+ // vector types). Note: output bitcast is done at the end of
+ // visitInlineAsm().
+ if (RegVT.getSizeInBits() == OpInfo.ConstraintVT.getSizeInBits()) {
+ // Exclude indirect inputs while they are unsupported because the code
+ // to perform the load is missing and thus OpInfo.CallOperand still
+ // refers to the input address rather than the pointed-to value.
+ if (OpInfo.Type == InlineAsm::isInput && !OpInfo.isIndirect)
+ OpInfo.CallOperand =
+ DAG.getNode(ISD::BITCAST, DL, RegVT, OpInfo.CallOperand);
+ OpInfo.ConstraintVT = RegVT;
+ // If the operand is an FP value and we want it in integer registers,
+ // use the corresponding integer type. This turns an f64 value into
+ // i64, which can be passed with two i32 values on a 32-bit machine.
+ } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
+ MVT VT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits());
+ if (OpInfo.Type == InlineAsm::isInput)
+ OpInfo.CallOperand =
+ DAG.getNode(ISD::BITCAST, DL, VT, OpInfo.CallOperand);
+ OpInfo.ConstraintVT = VT;
+ }
+ }
+ }
+
+ // No need to allocate a matching input constraint since the constraint it's
+ // matching to has already been allocated.
+ if (OpInfo.isMatchingInputConstraint())
+ return std::nullopt;
+
+ EVT ValueVT = OpInfo.ConstraintVT;
+ if (OpInfo.ConstraintVT == MVT::Other)
+ ValueVT = RegVT;
+
+ // Initialize NumRegs.
+ unsigned NumRegs = 1;
+ if (OpInfo.ConstraintVT != MVT::Other)
+ NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT, RegVT);
+
+ // If this is a constraint for a specific physical register, like {r17},
+ // assign it now.
+
+ // If this is associated with a specific register, initialize the iterator
+ // to the correct place. If virtual, make sure we have enough registers.
+
+ // Initialize iterator if necessary
+ TargetRegisterClass::iterator I = RC->begin();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ // Do not check for single registers.
+ if (AssignedReg) {
+ I = std::find(I, RC->end(), AssignedReg);
+ if (I == RC->end()) {
+ // RC does not contain the selected register, which indicates a
+ // mismatch between the register and the required type/bitwidth.
+ return {AssignedReg};
+ }
+ }
+
+ for (; NumRegs; --NumRegs, ++I) {
+ assert(I != RC->end() && "Ran out of registers to allocate!");
+ Register R = AssignedReg ? Register(*I) : RegInfo.createVirtualRegister(RC);
+ Regs.push_back(R);
+ }
+
+ OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
+ return std::nullopt;
+}
+
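+/// Walk the INLINEASM operands already emitted and return the index of the
+/// flag operand that describes output operand \p OperandNo, so a tied input
+/// can locate and reuse its registers.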
+static unsigned
+findMatchingInlineAsmOperand(unsigned OperandNo,
+ const std::vector<SDValue> &AsmNodeOperands) {
+ // Scan until we find the definition we already emitted of this operand.
+ unsigned CurOp = InlineAsm::Op_FirstOperand;
+ for (; OperandNo; --OperandNo) {
+ // Advance to the next operand.
+ unsigned OpFlag =
+ cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
+ assert((InlineAsm::isRegDefKind(OpFlag) ||
+ InlineAsm::isRegDefEarlyClobberKind(OpFlag) ||
+ InlineAsm::isMemKind(OpFlag)) &&
+ "Skipped past definitions?");
+ CurOp += InlineAsm::getNumOperandRegisters(OpFlag) + 1;
+ }
+ return CurOp;
+}
+
+namespace {
+
+class ExtraFlags {
+ unsigned Flags = 0;
+
+public:
+ explicit ExtraFlags(const CallBase &Call) {
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
+ if (IA->hasSideEffects())
+ Flags |= InlineAsm::Extra_HasSideEffects;
+ if (IA->isAlignStack())
+ Flags |= InlineAsm::Extra_IsAlignStack;
+ if (Call.isConvergent())
+ Flags |= InlineAsm::Extra_IsConvergent;
+ Flags |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
+ }
+
+ void update(const TargetLowering::AsmOperandInfo &OpInfo) {
+ // Ideally, we would only check against memory constraints. However, the
+ // meaning of an Other constraint can be target-specific and we can't easily
+ // reason about it. Therefore, be conservative and set MayLoad/MayStore
+ // for Other constraints as well.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
+ if (OpInfo.Type == InlineAsm::isInput)
+ Flags |= InlineAsm::Extra_MayLoad;
+ else if (OpInfo.Type == InlineAsm::isOutput)
+ Flags |= InlineAsm::Extra_MayStore;
+ else if (OpInfo.Type == InlineAsm::isClobber)
+ Flags |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore);
+ }
+ }
+
+ unsigned get() const { return Flags; }
+};
+
+} // end anonymous namespace
+
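+/// Return true if \p Op directly references a function that is not marked
+/// dllimport; such operands may be treated as branch targets below.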
+static bool isFunction(SDValue Op) {
+ if (Op && Op.getOpcode() == ISD::GlobalAddress) {
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ auto Fn = dyn_cast_or_null<Function>(GA->getGlobal());
+
+ // A normal "call dllimport_func" instruction (non-inline-asm) forces
+ // indirect access by choosing the call opcode, and the asm printer then
+ // emits the indirect-symbol form (e.g. "*") based on that opcode. Inline
+ // asm cannot do this today (it behaves more like a data access), so
+ // dllimport functions are ignored here.
+ if (Fn && !Fn->hasDLLImportStorageClass())
+ return true;
+ }
+ }
+ return false;
+}
+
+/// visitInlineAsm - Handle a call to an InlineAsm object.
+void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
+ const BasicBlock *EHPadBB) {
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
+
+ /// ConstraintOperands - Information about all of the constraints.
+ SmallVector<SDISelAsmOperandInfo, 16> ConstraintOperands;
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(
+ DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), Call);
+
+ // First Pass: Calculate HasSideEffects and ExtraFlags (AlignStack,
+ // AsmDialect, MayLoad, MayStore).
+ bool HasSideEffect = IA->hasSideEffects();
+ ExtraFlags ExtraInfo(Call);
+
+ for (auto &T : TargetConstraints) {
+ ConstraintOperands.push_back(SDISelAsmOperandInfo(T));
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+ if (OpInfo.CallOperandVal)
+ OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
+
+ if (!HasSideEffect)
+ HasSideEffect = OpInfo.hasMemory(TLI);
+
+ // Determine if this InlineAsm MayLoad or MayStore based on the constraints.
+ // FIXME: Could we compute this on OpInfo rather than T?
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(T, SDValue());
+
+ if (T.ConstraintType == TargetLowering::C_Immediate &&
+ OpInfo.CallOperand && !isa<ConstantSDNode>(OpInfo.CallOperand))
+ // We've delayed emitting this diagnostic (as for the "n" constraint)
+ // because inlining could still cause an integer constant to show up.
+ return emitInlineAsmError(Call, "constraint '" + Twine(T.ConstraintCode) +
+ "' expects an integer constant "
+ "expression");
+
+ ExtraInfo.update(T);
+ }
+
+ // We won't need to flush pending loads if this asm doesn't touch
+ // memory and is nonvolatile.
+ SDValue Glue, Chain = (HasSideEffect) ? getRoot() : DAG.getRoot();
+
+ bool EmitEHLabels = isa<InvokeInst>(Call);
+ if (EmitEHLabels) {
+ assert(EHPadBB && "InvokeInst must have an EHPadBB");
+ }
+ bool IsCallBr = isa<CallBrInst>(Call);
+
+ if (IsCallBr || EmitEHLabels) {
+ // If this is a callbr or invoke we need to flush pending exports since
+ // inlineasm_br and invoke are terminators.
+ // We need to do this before nodes are glued to the inlineasm_br node.
+ Chain = getControlRoot();
+ }
+
+ MCSymbol *BeginLabel = nullptr;
+ if (EmitEHLabels) {
+ Chain = lowerStartEH(Chain, EHPadBB, BeginLabel);
+ }
+
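+ // Collect the asm template strings; targets consult them below to
+ // recognize operands that are used as branch targets.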
+ int OpNo = -1;
+ SmallVector<StringRef> AsmStrs;
+ IA->collectAsmStrs(AsmStrs);
+
+ // Second pass over the constraints: compute which constraint option to use.
+ for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
+ if (OpInfo.hasArg() || OpInfo.Type == InlineAsm::isOutput)
+ OpNo++;
+
+ // If this is an output operand with a matching input operand, look up the
+ // matching input. If their types mismatch, e.g. one is an integer, the
+ // other is floating point, or their sizes are different, flag it as an
+ // error.
+ if (OpInfo.hasMatchingInput()) {
+ SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+ patchMatchingInput(OpInfo, Input, DAG);
+ }
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
+
+ if ((OpInfo.ConstraintType == TargetLowering::C_Memory &&
+ OpInfo.Type == InlineAsm::isClobber) ||
+ OpInfo.ConstraintType == TargetLowering::C_Address)
+ continue;
+
+ // In the Linux PIC model, there are four cases of value/label addressing:
+ //
+ // 1: Function call or Label jmp inside the module.
+ // 2: Data access (such as global variable, static variable) inside module.
+ // 3: Function call or Label jmp outside the module.
+ // 4: Data access (such as global variable) outside the module.
+ //
+ // Because the current LLVM inline asm design does not "recognize" the asm
+ // text, it is hard to treat memory addressing differently for the same
+ // value/address used in different instructions. For example, in the PIC
+ // model a call may go through the PLT or be directly PC-relative, while a
+ // lea/mov of a function address may go through the GOT.
+ //
+ // Here we try to recognize function calls for cases 1 and 3 in inline asm
+ // and adjust their constraints accordingly.
+ //
+ // TODO: Because inline asm discourages jumping to labels outside the
+ // module, jumps to function labels are not handled here yet; this should
+ // be enhanced (especially for the PIC model) if meaningful requirements
+ // arise.
+ if (OpInfo.isIndirect && isFunction(OpInfo.CallOperand) &&
+ TLI.isInlineAsmTargetBranch(AsmStrs, OpNo) &&
+ TM.getCodeModel() != CodeModel::Large) {
+ OpInfo.isIndirect = false;
+ OpInfo.ConstraintType = TargetLowering::C_Address;
+ }
+
+ // If this is a memory input, and if the operand is not indirect, do what we
+ // need to provide an address for the memory input.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+ !OpInfo.isIndirect) {
+ assert((OpInfo.isMultipleAlternative ||
+ (OpInfo.Type == InlineAsm::isInput)) &&
+ "Can only indirectify direct input operands!");
+
+ // Memory operands really want the address of the value.
+ Chain = getAddressForMemoryInput(Chain, getCurSDLoc(), OpInfo, DAG);
+
+ // There is no longer a Value* corresponding to this operand.
+ OpInfo.CallOperandVal = nullptr;
+
+ // It is now an indirect operand.
+ OpInfo.isIndirect = true;
+ }
+ }
+
+ // AsmNodeOperands - The operands for the ISD::INLINEASM node.
+ std::vector<SDValue> AsmNodeOperands;
+ AsmNodeOperands.push_back(SDValue()); // reserve space for input chain
+ AsmNodeOperands.push_back(DAG.getTargetExternalSymbol(
+ IA->getAsmString().c_str(), TLI.getProgramPointerTy(DAG.getDataLayout())));
+
+ // If we have a !srcloc metadata node associated with it, we want to attach
+ // this to the ultimately generated inline asm machineinstr. To do this, we
+ // pass in the third operand as this (potentially null) inline asm MDNode.
+ const MDNode *SrcLoc = Call.getMetadata("srcloc");
+ AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));
+
+ // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
+ // bits as operand 3.
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ ExtraInfo.get(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
+
+ // Third pass: Loop over operands to prepare DAG-level operands. As part of
+ // this, assign virtual and physical registers for inputs and outputs.
+ for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
+ // Assign Registers.
+ SDISelAsmOperandInfo &RefOpInfo =
+ OpInfo.isMatchingInputConstraint()
+ ? ConstraintOperands[OpInfo.getMatchedOperand()]
+ : OpInfo;
+ const auto RegError =
+ getRegistersForValue(DAG, getCurSDLoc(), OpInfo, RefOpInfo);
+ if (RegError) {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const char *RegName = TRI.getName(*RegError);
+ emitInlineAsmError(Call, "register '" + Twine(RegName) +
+ "' allocated for constraint '" +
+ Twine(OpInfo.ConstraintCode) +
+ "' does not match required type");
+ return;
+ }
+
+ auto DetectWriteToReservedRegister = [&]() {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ for (unsigned Reg : OpInfo.AssignedRegs.Regs) {
+ if (Register::isPhysicalRegister(Reg) &&
+ TRI.isInlineAsmReadOnlyReg(MF, Reg)) {
+ const char *RegName = TRI.getName(Reg);
+ emitInlineAsmError(Call, "write to reserved register '" +
+ Twine(RegName) + "'");
+ return true;
+ }
+ }
+ return false;
+ };
+ assert((OpInfo.ConstraintType != TargetLowering::C_Address ||
+ (OpInfo.Type == InlineAsm::isInput &&
+ !OpInfo.isMatchingInputConstraint())) &&
+ "Only address as input operand is allowed.");
+
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+ unsigned ConstraintID =
+ TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ assert(ConstraintID != InlineAsm::Constraint_Unknown &&
+ "Failed to convert memory constraint code to constraint id.");
+
+ // Add information to the INLINEASM node to know about this output.
+ unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+ OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
+ AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags, getCurSDLoc(),
+ MVT::i32));
+ AsmNodeOperands.push_back(OpInfo.CallOperand);
+ } else {
+ // Otherwise, this outputs to a register (directly for C_Register /
+ // C_RegisterClass, and a target-defined fashion for
+ // C_Immediate/C_Other). Find a register that we can use.
+ if (OpInfo.AssignedRegs.Regs.empty()) {
+ emitInlineAsmError(
+ Call, "couldn't allocate output register for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ if (DetectWriteToReservedRegister())
+ return;
+
+ // Add information to the INLINEASM node to know that this register is
+ // set.
+ OpInfo.AssignedRegs.AddInlineAsmOperands(
+ OpInfo.isEarlyClobber ? InlineAsm::Kind_RegDefEarlyClobber
+ : InlineAsm::Kind_RegDef,
+ false, 0, getCurSDLoc(), DAG, AsmNodeOperands);
+ }
+ break;
+
+ case InlineAsm::isInput:
+ case InlineAsm::isLabel: {
+ SDValue InOperandVal = OpInfo.CallOperand;
+
+ if (OpInfo.isMatchingInputConstraint()) {
+ // If this is required to match an output register we have already set,
+ // just use its register.
+ auto CurOp = findMatchingInlineAsmOperand(OpInfo.getMatchedOperand(),
+ AsmNodeOperands);
+ unsigned OpFlag =
+ cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
+ if (InlineAsm::isRegDefKind(OpFlag) ||
+ InlineAsm::isRegDefEarlyClobberKind(OpFlag)) {
+ // Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
+ if (OpInfo.isIndirect) {
+ // This happens on gcc/testsuite/gcc.dg/pr8788-1.c
+ emitInlineAsmError(Call, "inline asm not supported yet: "
+ "don't know how to handle tied "
+ "indirect register inputs");
+ return;
+ }
+
+ SmallVector<unsigned, 4> Regs;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ auto *R = cast<RegisterSDNode>(AsmNodeOperands[CurOp+1]);
+ Register TiedReg = R->getReg();
+ MVT RegVT = R->getSimpleValueType(0);
+ const TargetRegisterClass *RC =
+ TiedReg.isVirtual() ? MRI.getRegClass(TiedReg)
+ : RegVT != MVT::Untyped ? TLI.getRegClassFor(RegVT)
+ : TRI.getMinimalPhysRegClass(TiedReg);
+ unsigned NumRegs = InlineAsm::getNumOperandRegisters(OpFlag);
+ for (unsigned i = 0; i != NumRegs; ++i)
+ Regs.push_back(MRI.createVirtualRegister(RC));
+
+ RegsForValue MatchedRegs(Regs, RegVT, InOperandVal.getValueType());
+
+ SDLoc dl = getCurSDLoc();
+ // Use the produced MatchedRegs object to copy the input value into the
+ // tied registers and record them on the INLINEASM node.
+ MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Glue, &Call);
+ MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
+ true, OpInfo.getMatchedOperand(), dl,
+ DAG, AsmNodeOperands);
+ break;
+ }
+
+ assert(InlineAsm::isMemKind(OpFlag) && "Unknown matching constraint!");
+ assert(InlineAsm::getNumOperandRegisters(OpFlag) == 1 &&
+ "Unexpected number of operands");
+ // Add information to the INLINEASM node to know about this input.
+ // See InlineAsm.h isUseOperandTiedToDef.
+ OpFlag = InlineAsm::convertMemFlagWordToMatchingFlagWord(OpFlag);
+ OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
+ OpInfo.getMatchedOperand());
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ OpFlag, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
+ AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
+ break;
+ }
+
+ // Treat indirect 'X' constraint as memory.
+ if (OpInfo.ConstraintType == TargetLowering::C_Other &&
+ OpInfo.isIndirect)
+ OpInfo.ConstraintType = TargetLowering::C_Memory;
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
+ std::vector<SDValue> Ops;
+ TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
+ Ops, DAG);
+ if (Ops.empty()) {
+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate)
+ if (isa<ConstantSDNode>(InOperandVal)) {
+ emitInlineAsmError(Call, "value out of range for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ emitInlineAsmError(Call,
+ "invalid operand for inline asm constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ // Add information to the INLINEASM node to know about this input.
+ unsigned ResOpType =
+ InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ ResOpType, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
+ llvm::append_range(AsmNodeOperands, Ops);
+ break;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+ assert((OpInfo.isIndirect ||
+ OpInfo.ConstraintType != TargetLowering::C_Memory) &&
+ "Operand must be indirect to be a mem!");
+ assert(InOperandVal.getValueType() ==
+ TLI.getPointerTy(DAG.getDataLayout()) &&
+ "Memory operands expect pointer values");
+
+ unsigned ConstraintID =
+ TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ assert(ConstraintID != InlineAsm::Constraint_Unknown &&
+ "Failed to convert memory constraint code to constraint id.");
+
+ // Add information to the INLINEASM node to know about this input.
+ unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+ ResOpType = InlineAsm::getFlagWordForMem(ResOpType, ConstraintID);
+ AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
+ getCurSDLoc(),
+ MVT::i32));
+ AsmNodeOperands.push_back(InOperandVal);
+ break;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Address) {
+ unsigned ConstraintID =
+ TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ assert(ConstraintID != InlineAsm::Constraint_Unknown &&
+ "Failed to convert memory constraint code to constraint id.");
+
+ unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+
+ SDValue AsmOp = InOperandVal;
+ if (isFunction(InOperandVal)) {
+ auto *GA = cast<GlobalAddressSDNode>(InOperandVal);
+ ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Func, 1);
+ AsmOp = DAG.getTargetGlobalAddress(GA->getGlobal(), getCurSDLoc(),
+ InOperandVal.getValueType(),
+ GA->getOffset());
+ }
+
+ // Add information to the INLINEASM node to know about this input.
+ ResOpType = InlineAsm::getFlagWordForMem(ResOpType, ConstraintID);
+
+ AsmNodeOperands.push_back(
+ DAG.getTargetConstant(ResOpType, getCurSDLoc(), MVT::i32));
+
+ AsmNodeOperands.push_back(AsmOp);
+ break;
+ }
+
+ assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
+ OpInfo.ConstraintType == TargetLowering::C_Register) &&
+ "Unknown constraint type!");
+
+ // TODO: Support this.
+ if (OpInfo.isIndirect) {
+ emitInlineAsmError(
+ Call, "Don't know how to handle indirect register inputs yet "
+ "for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ // Copy the input into the appropriate registers.
+ if (OpInfo.AssignedRegs.Regs.empty()) {
+ emitInlineAsmError(Call,
+ "couldn't allocate input reg for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ if (DetectWriteToReservedRegister())
+ return;
+
+ SDLoc dl = getCurSDLoc();
+
+ OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Glue,
+ &Call);
+
+ OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
+ dl, DAG, AsmNodeOperands);
+ break;
+ }
+ case InlineAsm::isClobber:
+ // Add the clobbered value to the operand list, so that the register
+ // allocator is aware that the physreg got clobbered.
+ if (!OpInfo.AssignedRegs.Regs.empty())
+ OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_Clobber,
+ false, 0, getCurSDLoc(), DAG,
+ AsmNodeOperands);
+ break;
+ }
+ }
+
+ // Finish up input operands. Set the input chain and add the flag last.
+ AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+ if (Glue.getNode()) AsmNodeOperands.push_back(Glue);
+
+ unsigned ISDOpc = IsCallBr ? ISD::INLINEASM_BR : ISD::INLINEASM;
+ Chain = DAG.getNode(ISDOpc, getCurSDLoc(),
+ DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
+ Glue = Chain.getValue(1);
+
+ // Do additional work to generate outputs.
+
+ SmallVector<EVT, 1> ResultVTs;
+ SmallVector<SDValue, 1> ResultValues;
+ SmallVector<SDValue, 8> OutChains;
+
+ llvm::Type *CallResultType = Call.getType();
+ ArrayRef<Type *> ResultTypes;
+ if (StructType *StructResult = dyn_cast<StructType>(CallResultType))
+ ResultTypes = StructResult->elements();
+ else if (!CallResultType->isVoidTy())
+ ResultTypes = ArrayRef(CallResultType);
+
+ auto CurResultType = ResultTypes.begin();
+ auto handleRegAssign = [&](SDValue V) {
+ assert(CurResultType != ResultTypes.end() && "Unexpected value");
+ assert((*CurResultType)->isSized() && "Unexpected unsized type");
+ EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), *CurResultType);
+ ++CurResultType;
+ // If the type of the inline asm call site return value is different but
+ // has the same size as the type of the asm output, bitcast it. One example
+ // is for vectors with different width / number of elements. This can
+ // happen for register classes that can contain multiple different value
+ // types. The preg or vreg allocated may not have the same VT as was
+ // expected.
+ //
+ // This can also happen for a return value that disagrees with the register
+ // class it is put in, eg. a double in a general-purpose register on a
+ // 32-bit machine.
+ if (ResultVT != V.getValueType() &&
+ ResultVT.getSizeInBits() == V.getValueSizeInBits())
+ V = DAG.getNode(ISD::BITCAST, getCurSDLoc(), ResultVT, V);
+ else if (ResultVT != V.getValueType() && ResultVT.isInteger() &&
+ V.getValueType().isInteger()) {
+ // If a result value was tied to an input value, the computed result
+ // may have a wider width than the expected result. Extract the
+ // relevant portion.
+ V = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultVT, V);
+ }
+ assert(ResultVT == V.getValueType() && "Asm result value mismatch!");
+ ResultVTs.push_back(ResultVT);
+ ResultValues.push_back(V);
+ };
+
+ // Deal with output operands.
+ for (SDISelAsmOperandInfo &OpInfo : ConstraintOperands) {
+ if (OpInfo.Type == InlineAsm::isOutput) {
+ SDValue Val;
+ // Skip trivial output operands.
+ if (OpInfo.AssignedRegs.Regs.empty())
+ continue;
+
+ switch (OpInfo.ConstraintType) {
+ case TargetLowering::C_Register:
+ case TargetLowering::C_RegisterClass:
+ Val = OpInfo.AssignedRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
+ Chain, &Glue, &Call);
+ break;
+ case TargetLowering::C_Immediate:
+ case TargetLowering::C_Other:
+ Val = TLI.LowerAsmOutputForConstraint(Chain, Glue, getCurSDLoc(),
+ OpInfo, DAG);
+ break;
+ case TargetLowering::C_Memory:
+ break; // Already handled.
+ case TargetLowering::C_Address:
+ break; // Silence warning.
+ case TargetLowering::C_Unknown:
+ assert(false && "Unexpected unknown constraint");
+ }
+
+ // Indirect output manifest as stores. Record output chains.
+ if (OpInfo.isIndirect) {
+ const Value *Ptr = OpInfo.CallOperandVal;
+ assert(Ptr && "Expected value CallOperandVal for indirect asm operand");
+ SDValue Store = DAG.getStore(Chain, getCurSDLoc(), Val, getValue(Ptr),
+ MachinePointerInfo(Ptr));
+ OutChains.push_back(Store);
+ } else {
+ // Non-indirect outputs become part of the asm's result value(s).
+ assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
+ if (Val.getOpcode() == ISD::MERGE_VALUES) {
+ for (const SDValue &V : Val->op_values())
+ handleRegAssign(V);
+ } else
+ handleRegAssign(Val);
+ }
+ }
+ }
+
+ // Set results.
+ if (!ResultValues.empty()) {
+ assert(CurResultType == ResultTypes.end() &&
+ "Mismatch in number of ResultTypes");
+ assert(ResultValues.size() == ResultTypes.size() &&
+ "Mismatch in number of output operands in asm result");
+
+ SDValue V = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(ResultVTs), ResultValues);
+ setValue(&Call, V);
+ }
+
+ // Collect store chains.
+ if (!OutChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, OutChains);
+
+ if (EmitEHLabels) {
+ Chain = lowerEndEH(Chain, cast<InvokeInst>(&Call), EHPadBB, BeginLabel);
+ }
+
+ // Only Update Root if inline assembly has a memory effect.
+ if (ResultValues.empty() || HasSideEffect || !OutChains.empty() || IsCallBr ||
+ EmitEHLabels)
+ DAG.setRoot(Chain);
+}
+
+void SelectionDAGBuilder::emitInlineAsmError(const CallBase &Call,
+ const Twine &Message) {
+ LLVMContext &Ctx = *DAG.getContext();
+ Ctx.emitError(&Call, Message);
+
+ // Make sure we leave the DAG in a valid state
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), Call.getType(), ValueVTs);
+
+ if (ValueVTs.empty())
+ return;
+
+ SmallVector<SDValue, 1> Ops;
+ for (unsigned i = 0, e = ValueVTs.size(); i != e; ++i)
+ Ops.push_back(DAG.getUNDEF(ValueVTs[i]));
+
+ setValue(&Call, DAG.getMergeValues(Ops, getCurSDLoc()));
+}
+
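+// The va_start/va_arg/va_end/va_copy visitors below lower directly to their
+// corresponding ISD nodes, passing the source pointer both as an SDValue and
+// as a SrcValue node.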
+void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getArgOperand(0)),
+ DAG.getSrcValue(I.getArgOperand(0))));
+}
+
+void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+ SDValue V = DAG.getVAArg(
+ TLI.getMemValueType(DAG.getDataLayout(), I.getType()), getCurSDLoc(),
+ getRoot(), getValue(I.getOperand(0)), DAG.getSrcValue(I.getOperand(0)),
+ DL.getABITypeAlign(I.getType()).value());
+ DAG.setRoot(V.getValue(1));
+
+ if (I.getType()->isPointerTy())
+ V = DAG.getPtrExtOrTrunc(
+ V, getCurSDLoc(), TLI.getValueType(DAG.getDataLayout(), I.getType()));
+ setValue(&I, V);
+}
+
+void SelectionDAGBuilder::visitVAEnd(const CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VAEND, getCurSDLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getArgOperand(0)),
+ DAG.getSrcValue(I.getArgOperand(0))));
+}
+
+void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurSDLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ DAG.getSrcValue(I.getArgOperand(0)),
+ DAG.getSrcValue(I.getArgOperand(1))));
+}
+
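+/// If \p I carries !range metadata proving the result lies in a zero-based
+/// unsigned range, wrap \p Op in an AssertZext of the narrowest integer type
+/// covering that range so later nodes can rely on the known zero bits.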
+SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
+ const Instruction &I,
+ SDValue Op) {
+ const MDNode *Range = getRangeMetadata(I);
+ if (!Range)
+ return Op;
+
+ ConstantRange CR = getConstantRangeFromMetadata(*Range);
+ if (CR.isFullSet() || CR.isEmptySet() || CR.isUpperWrapped())
+ return Op;
+
+ APInt Lo = CR.getUnsignedMin();
+ if (!Lo.isMinValue())
+ return Op;
+
+ APInt Hi = CR.getUnsignedMax();
+ unsigned Bits = std::max(Hi.getActiveBits(),
+ static_cast<unsigned>(IntegerType::MIN_INT_BITS));
+
+ EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+
+ SDLoc SL = getCurSDLoc();
+
+ SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), Op,
+ DAG.getValueType(SmallVT));
+ unsigned NumVals = Op.getNode()->getNumValues();
+ if (NumVals == 1)
+ return ZExt;
+
+ SmallVector<SDValue, 4> Ops;
+
+ Ops.push_back(ZExt);
+ for (unsigned I = 1; I != NumVals; ++I)
+ Ops.push_back(Op.getValue(I));
+
+ return DAG.getMergeValues(Ops, SL);
+}
+
+/// Populate a CallLoweringInfo (into \p CLI) based on the properties of
+/// the call being lowered.
+///
+/// This is a helper for lowering intrinsics that follow a target calling
+/// convention or require stack pointer adjustment. Only a subset of the
+/// intrinsic's operands need to participate in the calling convention.
+void SelectionDAGBuilder::populateCallLoweringInfo(
+ TargetLowering::CallLoweringInfo &CLI, const CallBase *Call,
+ unsigned ArgIdx, unsigned NumArgs, SDValue Callee, Type *ReturnTy,
+ bool IsPatchPoint) {
+ TargetLowering::ArgListTy Args;
+ Args.reserve(NumArgs);
+
+ // Populate the argument list.
+ // Attributes for args start at offset 1, after the return attribute.
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs;
+ ArgI != ArgE; ++ArgI) {
+ const Value *V = Call->getOperand(ArgI);
+
+ assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = getValue(V);
+ Entry.Ty = V->getType();
+ Entry.setAttributes(Call, ArgI);
+ Args.push_back(Entry);
+ }
+
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(getRoot())
+ .setCallee(Call->getCallingConv(), ReturnTy, Callee, std::move(Args))
+ .setDiscardResult(Call->use_empty())
+ .setIsPatchPoint(IsPatchPoint)
+ .setIsPreallocated(
+ Call->countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0);
+}
+
+/// Add a stack map intrinsic call's live variable operands to a stackmap
+/// or patchpoint target node's operand list.
+///
+/// Constants are converted to TargetConstants purely as an optimization to
+/// avoid constant materialization and register allocation.
+///
+/// FrameIndex operands are converted to TargetFrameIndex so that ISEL does not
+/// generate address computation nodes, and so FinalizeISel can convert the
+/// TargetFrameIndex into a DirectMemRefOp StackMap location. This avoids
+/// address materialization and register allocation, but may also be required
+/// for correctness. If a StackMap (or PatchPoint) intrinsic directly uses an
+/// alloca in the entry block, then the runtime may assume that the alloca's
+/// StackMap location can be read immediately after compilation and that the
+/// location is valid at any point during execution (this is similar to the
+/// assumption made by the llvm.gcroot intrinsic). If the alloca's location were
+/// only available in a register, then the runtime would need to trap when
+/// execution reaches the StackMap in order to read the alloca's location.
+static void addStackMapLiveVars(const CallBase &Call, unsigned StartIdx,
+ const SDLoc &DL, SmallVectorImpl<SDValue> &Ops,
+ SelectionDAGBuilder &Builder) {
+ SelectionDAG &DAG = Builder.DAG;
+ for (unsigned I = StartIdx; I < Call.arg_size(); I++) {
+ SDValue Op = Builder.getValue(Call.getArgOperand(I));
+
+ // Things on the stack are pointer-typed, meaning that they are already
+ // legal and can be emitted directly to target nodes.
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) {
+ Ops.push_back(DAG.getTargetFrameIndex(FI->getIndex(), Op.getValueType()));
+ } else {
+ // Otherwise emit a target independent node to be legalised.
+ Ops.push_back(Builder.getValue(Call.getArgOperand(I)));
+ }
+ }
+}
+
+/// Lower llvm.experimental.stackmap.
+void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
+ // void @llvm.experimental.stackmap(i64 <id>, i32 <numShadowBytes>,
+ // [live variables...])
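+  //
+  // For example (illustrative only):
+  //   call void (i64, i32, ...) @llvm.experimental.stackmap(i64 42, i32 4, i32 %x)
+  // records %x as a live value and reserves a 4-byte nop shadow.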
+
+ assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value.");
+
+ SDValue Chain, InGlue, Callee;
+ SmallVector<SDValue, 32> Ops;
+
+ SDLoc DL = getCurSDLoc();
+ Callee = getValue(CI.getCalledOperand());
+
+  // The stackmap intrinsic only records the live variables (the arguments
+  // passed to it) and emits NOPs (if requested). Unlike the patchpoint
+  // intrinsic, this won't be lowered to a function call, so we don't have to
+  // worry about calling conventions or target-specific lowering code.
+  // Instead we perform the call lowering right here.
+ //
+ // chain, flag = CALLSEQ_START(chain, 0, 0)
+ // chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
+ // chain, flag = CALLSEQ_END(chain, 0, 0, flag)
+ //
+ Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
+ InGlue = Chain.getValue(1);
+
+ // Add the STACKMAP operands, starting with DAG house-keeping.
+ Ops.push_back(Chain);
+ Ops.push_back(InGlue);
+
+ // Add the <id>, <numShadowBytes> operands.
+ //
+ // These do not require legalisation, and can be emitted directly to target
+ // constant nodes.
+ SDValue ID = getValue(CI.getArgOperand(0));
+ assert(ID.getValueType() == MVT::i64);
+ SDValue IDConst = DAG.getTargetConstant(
+ cast<ConstantSDNode>(ID)->getZExtValue(), DL, ID.getValueType());
+ Ops.push_back(IDConst);
+
+ SDValue Shad = getValue(CI.getArgOperand(1));
+ assert(Shad.getValueType() == MVT::i32);
+ SDValue ShadConst = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Shad)->getZExtValue(), DL, Shad.getValueType());
+ Ops.push_back(ShadConst);
+
+ // Add the live variables.
+ addStackMapLiveVars(CI, 2, DL, Ops, *this);
+
+ // Create the STACKMAP node.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(ISD::STACKMAP, DL, NodeTys, Ops);
+ InGlue = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
+
+ // Stackmaps don't generate values, so nothing goes into the NodeMap.
+
+ // Set the root to the target-lowered call chain.
+ DAG.setRoot(Chain);
+
+ // Inform the Frame Information that we have a stackmap in this function.
+ FuncInfo.MF->getFrameInfo().setHasStackMap();
+}
+
+/// Lower llvm.experimental.patchpoint directly to its target opcode.
+void SelectionDAGBuilder::visitPatchpoint(const CallBase &CB,
+ const BasicBlock *EHPadBB) {
+ // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
+ // i32 <numBytes>,
+ // i8* <target>,
+ // i32 <numArgs>,
+ // [Args...],
+ // [live variables...])
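+  //
+  // For example (illustrative only):
+  //   %r = call i64 (i64, i32, ptr, i32, ...)
+  //            @llvm.experimental.patchpoint.i64(i64 2, i32 12, ptr %f,
+  //                                              i32 2, i64 %a, i64 %b)
+  // reserves 12 bytes, calls %f with the two call arguments %a and %b, and
+  // records any remaining operands as live values for the stack map.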
+
+ CallingConv::ID CC = CB.getCallingConv();
+ bool IsAnyRegCC = CC == CallingConv::AnyReg;
+ bool HasDef = !CB.getType()->isVoidTy();
+ SDLoc dl = getCurSDLoc();
+ SDValue Callee = getValue(CB.getArgOperand(PatchPointOpers::TargetPos));
+
+ // Handle immediate and symbolic callees.
+ if (auto* ConstCallee = dyn_cast<ConstantSDNode>(Callee))
+ Callee = DAG.getIntPtrConstant(ConstCallee->getZExtValue(), dl,
+ /*isTarget=*/true);
+ else if (auto* SymbolicCallee = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(SymbolicCallee->getGlobal(),
+ SDLoc(SymbolicCallee),
+ SymbolicCallee->getValueType(0));
+
+ // Get the real number of arguments participating in the call <numArgs>
+ SDValue NArgVal = getValue(CB.getArgOperand(PatchPointOpers::NArgPos));
+ unsigned NumArgs = cast<ConstantSDNode>(NArgVal)->getZExtValue();
+
+ // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
+ // Intrinsics include all meta-operands up to but not including CC.
+ unsigned NumMetaOpers = PatchPointOpers::CCPos;
+ assert(CB.arg_size() >= NumMetaOpers + NumArgs &&
+ "Not enough arguments provided to the patchpoint intrinsic");
+
+ // For AnyRegCC the arguments are lowered later on manually.
+ unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
+ Type *ReturnTy =
+ IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CB.getType();
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ populateCallLoweringInfo(CLI, &CB, NumMetaOpers, NumCallArgs, Callee,
+ ReturnTy, true);
+ std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
+
+ SDNode *CallEnd = Result.second.getNode();
+ if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
+ CallEnd = CallEnd->getOperand(0).getNode();
+
+  // Get a call instruction from the call sequence chain.
+  // Tail calls are not allowed.
+ assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+ "Expected a callseq node.");
+ SDNode *Call = CallEnd->getOperand(0).getNode();
+ bool HasGlue = Call->getGluedNode();
+
+ // Replace the target specific call node with the patchable intrinsic.
+ SmallVector<SDValue, 8> Ops;
+
+ // Push the chain.
+ Ops.push_back(*(Call->op_begin()));
+
+ // Optionally, push the glue (if any).
+ if (HasGlue)
+ Ops.push_back(*(Call->op_end() - 1));
+
+ // Push the register mask info.
+ if (HasGlue)
+ Ops.push_back(*(Call->op_end() - 2));
+ else
+ Ops.push_back(*(Call->op_end() - 1));
+
+ // Add the <id> and <numBytes> constants.
+ SDValue IDVal = getValue(CB.getArgOperand(PatchPointOpers::IDPos));
+ Ops.push_back(DAG.getTargetConstant(
+ cast<ConstantSDNode>(IDVal)->getZExtValue(), dl, MVT::i64));
+ SDValue NBytesVal = getValue(CB.getArgOperand(PatchPointOpers::NBytesPos));
+ Ops.push_back(DAG.getTargetConstant(
+ cast<ConstantSDNode>(NBytesVal)->getZExtValue(), dl,
+ MVT::i32));
+
+ // Add the callee.
+ Ops.push_back(Callee);
+
+ // Adjust <numArgs> to account for any arguments that have been passed on the
+ // stack instead.
+ // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+ unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3);
+ NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs;
+ Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, dl, MVT::i32));
+
+ // Add the calling convention
+ Ops.push_back(DAG.getTargetConstant((unsigned)CC, dl, MVT::i32));
+
+ // Add the arguments we omitted previously. The register allocator should
+ // place these in any free register.
+ if (IsAnyRegCC)
+ for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i)
+ Ops.push_back(getValue(CB.getArgOperand(i)));
+
+ // Push the arguments from the call instruction.
+ SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
+ Ops.append(Call->op_begin() + 2, e);
+
+ // Push live variables for the stack map.
+ addStackMapLiveVars(CB, NumMetaOpers + NumArgs, dl, Ops, *this);
+
+ SDVTList NodeTys;
+ if (IsAnyRegCC && HasDef) {
+ // Create the return types based on the intrinsic definition
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SmallVector<EVT, 3> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), CB.getType(), ValueVTs);
+ assert(ValueVTs.size() == 1 && "Expected only one return value type.");
+
+ // There is always a chain and a glue type at the end
+ ValueVTs.push_back(MVT::Other);
+ ValueVTs.push_back(MVT::Glue);
+ NodeTys = DAG.getVTList(ValueVTs);
+ } else
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // Replace the target specific call node with a PATCHPOINT node.
+ SDValue PPV = DAG.getNode(ISD::PATCHPOINT, dl, NodeTys, Ops);
+
+ // Update the NodeMap.
+ if (HasDef) {
+ if (IsAnyRegCC)
+ setValue(&CB, SDValue(PPV.getNode(), 0));
+ else
+ setValue(&CB, Result.first);
+ }
+
+  // Fix up the consumers of the intrinsic. The chain and glue may be used in
+  // the call sequence. Furthermore, the location of the chain and glue can
+  // change when the AnyReg calling convention is used and the intrinsic
+  // returns a value.
+ if (IsAnyRegCC && HasDef) {
+ SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
+ SDValue To[] = {PPV.getValue(1), PPV.getValue(2)};
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+ } else
+ DAG.ReplaceAllUsesWith(Call, PPV.getNode());
+ DAG.DeleteNode(Call);
+
+ // Inform the Frame Information that we have a patchpoint in this function.
+ FuncInfo.MF->getFrameInfo().setHasPatchPoint();
+}
+
+void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
+ unsigned Intrinsic) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2;
+ if (I.arg_size() > 1)
+ Op2 = getValue(I.getArgOperand(1));
+ SDLoc dl = getCurSDLoc();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ SDValue Res;
+ SDNodeFlags SDFlags;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
+ SDFlags.copyFMF(*FPMO);
+
+ switch (Intrinsic) {
+ case Intrinsic::vector_reduce_fadd:
+ if (SDFlags.hasAllowReassociation())
+ Res = DAG.getNode(ISD::FADD, dl, VT, Op1,
+ DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2, SDFlags),
+ SDFlags);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_SEQ_FADD, dl, VT, Op1, Op2, SDFlags);
+ break;
+ case Intrinsic::vector_reduce_fmul:
+ if (SDFlags.hasAllowReassociation())
+ Res = DAG.getNode(ISD::FMUL, dl, VT, Op1,
+ DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2, SDFlags),
+ SDFlags);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_SEQ_FMUL, dl, VT, Op1, Op2, SDFlags);
+ break;
+ case Intrinsic::vector_reduce_add:
+ Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_mul:
+ Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_and:
+ Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_or:
+ Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_xor:
+ Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_smax:
+ Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_smin:
+ Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_umax:
+ Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_umin:
+ Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::vector_reduce_fmax:
+ Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
+ break;
+ case Intrinsic::vector_reduce_fmin:
+ Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
+ break;
+ case Intrinsic::vector_reduce_fmaximum:
+ Res = DAG.getNode(ISD::VECREDUCE_FMAXIMUM, dl, VT, Op1, SDFlags);
+ break;
+ case Intrinsic::vector_reduce_fminimum:
+ Res = DAG.getNode(ISD::VECREDUCE_FMINIMUM, dl, VT, Op1, SDFlags);
+ break;
+ default:
+ llvm_unreachable("Unhandled vector reduce intrinsic");
+ }
+ setValue(&I, Res);
+}
+
+/// Returns an AttributeList representing the attributes applied to the return
+/// value of the given call.
+static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
+ SmallVector<Attribute::AttrKind, 2> Attrs;
+ if (CLI.RetSExt)
+ Attrs.push_back(Attribute::SExt);
+ if (CLI.RetZExt)
+ Attrs.push_back(Attribute::ZExt);
+ if (CLI.IsInReg)
+ Attrs.push_back(Attribute::InReg);
+
+ return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex,
+ Attrs);
+}
+
+/// TargetLowering::LowerCallTo - This is the default LowerCallTo
+/// implementation, which just calls LowerCall.
+/// FIXME: When all targets are
+/// migrated to using LowerCall, this hook should be integrated into SDISel.
+std::pair<SDValue, SDValue>
+TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
+ // Handle the incoming return values from the call.
+ CLI.Ins.clear();
+ Type *OrigRetTy = CLI.RetTy;
+ SmallVector<EVT, 4> RetTys;
+ SmallVector<uint64_t, 4> Offsets;
+ auto &DL = CLI.DAG.getDataLayout();
+ ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets, 0);
+
+ if (CLI.IsPostTypeLegalization) {
+ // If we are lowering a libcall after legalization, split the return type.
+ SmallVector<EVT, 4> OldRetTys;
+ SmallVector<uint64_t, 4> OldOffsets;
+ RetTys.swap(OldRetTys);
+ Offsets.swap(OldOffsets);
+
+ for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) {
+ EVT RetVT = OldRetTys[i];
+ uint64_t Offset = OldOffsets[i];
+ MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT);
+ unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT);
+ unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8;
+ RetTys.append(NumRegs, RegisterVT);
+ for (unsigned j = 0; j != NumRegs; ++j)
+ Offsets.push_back(Offset + j * RegisterVTByteSZ);
+ }
+ }
+
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(CLI.CallConv, CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL);
+
+ bool CanLowerReturn =
+ this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(),
+ CLI.IsVarArg, Outs, CLI.RetTy->getContext());
+
+ SDValue DemoteStackSlot;
+ int DemoteStackIdx = -100;
+ if (!CanLowerReturn) {
+ // FIXME: equivalent assert?
+ // assert(!CS.hasInAllocaArgument() &&
+ // "sret demotion is incompatible with inalloca");
+ uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy);
+ Align Alignment = DL.getPrefTypeAlign(CLI.RetTy);
+ MachineFunction &MF = CLI.DAG.getMachineFunction();
+ DemoteStackIdx =
+ MF.getFrameInfo().CreateStackObject(TySize, Alignment, false);
+ Type *StackSlotPtrType = PointerType::get(CLI.RetTy,
+ DL.getAllocaAddrSpace());
+
+ DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL));
+ ArgListEntry Entry;
+ Entry.Node = DemoteStackSlot;
+ Entry.Ty = StackSlotPtrType;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Entry.IsInReg = false;
+ Entry.IsSRet = true;
+ Entry.IsNest = false;
+ Entry.IsByVal = false;
+ Entry.IsByRef = false;
+ Entry.IsReturned = false;
+ Entry.IsSwiftSelf = false;
+ Entry.IsSwiftAsync = false;
+ Entry.IsSwiftError = false;
+ Entry.IsCFGuardTarget = false;
+ Entry.Alignment = Alignment;
+ CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
+ CLI.NumFixedArgs += 1;
+ CLI.getArgs()[0].IndirectType = CLI.RetTy;
+ CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
+
+ // sret demotion isn't compatible with tail-calls, since the sret argument
+ // points into the callers stack frame.
+ CLI.IsTailCall = false;
+ } else {
+ bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
+ CLI.RetTy, CLI.CallConv, CLI.IsVarArg, DL);
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ ISD::ArgFlagsTy Flags;
+ if (NeedsRegBlock) {
+ Flags.setInConsecutiveRegs();
+ if (I == RetTys.size() - 1)
+ Flags.setInConsecutiveRegsLast();
+ }
+ EVT VT = RetTys[I];
+ MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
+ CLI.CallConv, VT);
+ unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
+ CLI.CallConv, VT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ ISD::InputArg MyFlags;
+ MyFlags.Flags = Flags;
+ MyFlags.VT = RegisterVT;
+ MyFlags.ArgVT = VT;
+ MyFlags.Used = CLI.IsReturnValueUsed;
+ if (CLI.RetTy->isPointerTy()) {
+ MyFlags.Flags.setPointer();
+ MyFlags.Flags.setPointerAddrSpace(
+ cast<PointerType>(CLI.RetTy)->getAddressSpace());
+ }
+ if (CLI.RetSExt)
+ MyFlags.Flags.setSExt();
+ if (CLI.RetZExt)
+ MyFlags.Flags.setZExt();
+ if (CLI.IsInReg)
+ MyFlags.Flags.setInReg();
+ CLI.Ins.push_back(MyFlags);
+ }
+ }
+ }
+
+ // We push in swifterror return as the last element of CLI.Ins.
+ ArgListTy &Args = CLI.getArgs();
+ if (supportSwiftError()) {
+ for (const ArgListEntry &Arg : Args) {
+ if (Arg.IsSwiftError) {
+ ISD::InputArg MyFlags;
+ MyFlags.VT = getPointerTy(DL);
+ MyFlags.ArgVT = EVT(getPointerTy(DL));
+ MyFlags.Flags.setSwiftError();
+ CLI.Ins.push_back(MyFlags);
+ }
+ }
+ }
+
+ // Handle all of the outgoing arguments.
+ CLI.Outs.clear();
+ CLI.OutVals.clear();
+ for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
+ // FIXME: Split arguments if CLI.IsPostTypeLegalization
+ Type *FinalType = Args[i].Ty;
+ if (Args[i].IsByVal)
+ FinalType = Args[i].IndirectType;
+ bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
+ FinalType, CLI.CallConv, CLI.IsVarArg, DL);
+ for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues;
+ ++Value) {
+ EVT VT = ValueVTs[Value];
+ Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
+ SDValue Op = SDValue(Args[i].Node.getNode(),
+ Args[i].Node.getResNo() + Value);
+ ISD::ArgFlagsTy Flags;
+
+ // Certain targets (such as MIPS), may have a different ABI alignment
+ // for a type depending on the context. Give the target a chance to
+ // specify the alignment it wants.
+ const Align OriginalAlignment(getABIAlignmentForCallingConv(ArgTy, DL));
+ Flags.setOrigAlign(OriginalAlignment);
+
+ if (Args[i].Ty->isPointerTy()) {
+ Flags.setPointer();
+ Flags.setPointerAddrSpace(
+ cast<PointerType>(Args[i].Ty)->getAddressSpace());
+ }
+ if (Args[i].IsZExt)
+ Flags.setZExt();
+ if (Args[i].IsSExt)
+ Flags.setSExt();
+ if (Args[i].IsInReg) {
+        // If we are using the vectorcall calling convention, a structure that
+        // is passed InReg must be an HVA (Homogeneous Vector Aggregate).
+        if (CLI.CallConv == CallingConv::X86_VectorCall &&
+            isa<StructType>(FinalType)) {
+          // The first value of a structure is marked as the start of the HVA.
+ if (0 == Value)
+ Flags.setHvaStart();
+ Flags.setHva();
+ }
+ // Set InReg Flag
+ Flags.setInReg();
+ }
+ if (Args[i].IsSRet)
+ Flags.setSRet();
+ if (Args[i].IsSwiftSelf)
+ Flags.setSwiftSelf();
+ if (Args[i].IsSwiftAsync)
+ Flags.setSwiftAsync();
+ if (Args[i].IsSwiftError)
+ Flags.setSwiftError();
+ if (Args[i].IsCFGuardTarget)
+ Flags.setCFGuardTarget();
+ if (Args[i].IsByVal)
+ Flags.setByVal();
+ if (Args[i].IsByRef)
+ Flags.setByRef();
+ if (Args[i].IsPreallocated) {
+ Flags.setPreallocated();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // preallocated. This way we can know how many bytes we should've
+ // allocated and how many bytes a callee cleanup function will pop. If
+ // we port preallocated to more targets, we'll have to add custom
+ // preallocated handling in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ if (Args[i].IsInAlloca) {
+ Flags.setInAlloca();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // inalloca. This way we can know how many bytes we should've allocated
+ // and how many bytes a callee cleanup function will pop. If we port
+ // inalloca to more targets, we'll have to add custom inalloca handling
+ // in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ Align MemAlign;
+ if (Args[i].IsByVal || Args[i].IsInAlloca || Args[i].IsPreallocated) {
+ unsigned FrameSize = DL.getTypeAllocSize(Args[i].IndirectType);
+ Flags.setByValSize(FrameSize);
+
+        // For in-memory arguments, size and alignment should be passed from
+        // FE. BE will guess if this info is not there but there are cases it
+        // cannot get right.
+ if (auto MA = Args[i].Alignment)
+ MemAlign = *MA;
+ else
+ MemAlign = Align(getByValTypeAlignment(Args[i].IndirectType, DL));
+ } else if (auto MA = Args[i].Alignment) {
+ MemAlign = *MA;
+ } else {
+ MemAlign = OriginalAlignment;
+ }
+ Flags.setMemAlign(MemAlign);
+ if (Args[i].IsNest)
+ Flags.setNest();
+ if (NeedsRegBlock)
+ Flags.setInConsecutiveRegs();
+
+ MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
+ CLI.CallConv, VT);
+ unsigned NumParts = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
+ CLI.CallConv, VT);
+ SmallVector<SDValue, 4> Parts(NumParts);
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+ if (Args[i].IsSExt)
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (Args[i].IsZExt)
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ // Conservatively only handle 'returned' on non-vectors that can be lowered,
+ // for now.
+ if (Args[i].IsReturned && !Op.getValueType().isVector() &&
+ CanLowerReturn) {
+ assert((CLI.RetTy == Args[i].Ty ||
+ (CLI.RetTy->isPointerTy() && Args[i].Ty->isPointerTy() &&
+ CLI.RetTy->getPointerAddressSpace() ==
+ Args[i].Ty->getPointerAddressSpace())) &&
+ RetTys.size() == NumValues && "unexpected use of 'returned'");
+ // Before passing 'returned' to the target lowering code, ensure that
+ // either the register MVT and the actual EVT are the same size or that
+ // the return value and argument are extended in the same way; in these
+ // cases it's safe to pass the argument register value unchanged as the
+ // return register value (although it's at the target's option whether
+ // to do so)
+ // TODO: allow code generation to take advantage of partially preserved
+ // registers rather than clobbering the entire register when the
+ // parameter extension method is not compatible with the return
+ // extension method
+ if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) ||
+ (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt &&
+ CLI.RetZExt == Args[i].IsZExt))
+ Flags.setReturned();
+ }
+
+ getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, CLI.CB,
+ CLI.CallConv, ExtendKind);
+
+ for (unsigned j = 0; j != NumParts; ++j) {
+        // If this isn't the first piece, the alignment must be 1.
+        // For scalable vectors the scalable part is currently handled by
+        // individual targets, so we just use the known minimum size here.
+ ISD::OutputArg MyFlags(
+ Flags, Parts[j].getValueType().getSimpleVT(), VT,
+ i < CLI.NumFixedArgs, i,
+ j * Parts[j].getValueType().getStoreSize().getKnownMinValue());
+ if (NumParts > 1 && j == 0)
+ MyFlags.Flags.setSplit();
+ else if (j != 0) {
+ MyFlags.Flags.setOrigAlign(Align(1));
+ if (j == NumParts - 1)
+ MyFlags.Flags.setSplitEnd();
+ }
+
+ CLI.Outs.push_back(MyFlags);
+ CLI.OutVals.push_back(Parts[j]);
+ }
+
+ if (NeedsRegBlock && Value == NumValues - 1)
+ CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
+ }
+ }
+
+ SmallVector<SDValue, 4> InVals;
+ CLI.Chain = LowerCall(CLI, InVals);
+
+ // Update CLI.InVals to use outside of this function.
+ CLI.InVals = InVals;
+
+ // Verify that the target's LowerCall behaved as expected.
+ assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other &&
+ "LowerCall didn't return a valid chain!");
+ assert((!CLI.IsTailCall || InVals.empty()) &&
+ "LowerCall emitted a return value for a tail call!");
+ assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) &&
+ "LowerCall didn't emit the correct number of values!");
+
+ // For a tail call, the return value is merely live-out and there aren't
+ // any nodes in the DAG representing it. Return a special value to
+ // indicate that a tail call has been emitted and no more Instructions
+ // should be processed in the current block.
+ if (CLI.IsTailCall) {
+ CLI.DAG.setRoot(CLI.Chain);
+ return std::make_pair(SDValue(), SDValue());
+ }
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
+ assert(InVals[i].getNode() && "LowerCall emitted a null value!");
+ assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
+ "LowerCall emitted a value with the wrong type!");
+ }
+#endif
+
+ SmallVector<SDValue, 4> ReturnValues;
+ if (!CanLowerReturn) {
+ // The instruction result is the result of loading from the
+ // hidden sret parameter.
+ SmallVector<EVT, 1> PVTs;
+ Type *PtrRetTy =
+ PointerType::get(OrigRetTy->getContext(), DL.getAllocaAddrSpace());
+
+ ComputeValueVTs(*this, DL, PtrRetTy, PVTs);
+ assert(PVTs.size() == 1 && "Pointers should fit in one register");
+ EVT PtrVT = PVTs[0];
+
+ unsigned NumValues = RetTys.size();
+ ReturnValues.resize(NumValues);
+ SmallVector<SDValue, 4> Chains(NumValues);
+
+ // An aggregate return value cannot wrap around the address space, so
+ // offsets to its parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
+ MachineFunction &MF = CLI.DAG.getMachineFunction();
+ Align HiddenSRetAlign = MF.getFrameInfo().getObjectAlign(DemoteStackIdx);
+ for (unsigned i = 0; i < NumValues; ++i) {
+ SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
+ CLI.DAG.getConstant(Offsets[i], CLI.DL,
+ PtrVT), Flags);
+ SDValue L = CLI.DAG.getLoad(
+ RetTys[i], CLI.DL, CLI.Chain, Add,
+ MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
+ DemoteStackIdx, Offsets[i]),
+ HiddenSRetAlign);
+ ReturnValues[i] = L;
+ Chains[i] = L.getValue(1);
+ }
+
+ CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains);
+ } else {
+ // Collect the legal value parts into potentially illegal values
+ // that correspond to the original function's return values.
+ std::optional<ISD::NodeType> AssertOp;
+ if (CLI.RetSExt)
+ AssertOp = ISD::AssertSext;
+ else if (CLI.RetZExt)
+ AssertOp = ISD::AssertZext;
+ unsigned CurReg = 0;
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ EVT VT = RetTys[I];
+ MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
+ CLI.CallConv, VT);
+ unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(),
+ CLI.CallConv, VT);
+
+ ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
+ NumRegs, RegisterVT, VT, nullptr,
+ CLI.CallConv, AssertOp));
+ CurReg += NumRegs;
+ }
+
+ // For a function returning void, there is no return value. We can't create
+ // such a node, so we just return a null return value in that case. In
+ // that case, nothing will actually look at the value.
+ if (ReturnValues.empty())
+ return std::make_pair(SDValue(), CLI.Chain);
+ }
+
+ SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
+ CLI.DAG.getVTList(RetTys), ReturnValues);
+ return std::make_pair(Res, CLI.Chain);
+}
+
+/// Places new result values for the node in Results (their number
+/// and types must exactly match those of the original return values of
+/// the node), or leaves Results empty, which indicates that the node is not
+/// to be custom lowered after all.
+void TargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+ if (!Res.getNode())
+ return;
+
+ // If the original node has one result, take the return value from
+ // LowerOperation as is. It might not be result number 0.
+ if (N->getNumValues() == 1) {
+ Results.push_back(Res);
+ return;
+ }
+
+ // If the original node has multiple results, then the return node should
+ // have the same number of results.
+ assert((N->getNumValues() == Res->getNumValues()) &&
+ "Lowering returned the wrong number of results!");
+
+  // Place the new result values based on the result numbering of N.
+ for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+}
+
+SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ llvm_unreachable("LowerOperation not implemented for this target!");
+}
+
+void SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V,
+ unsigned Reg,
+ ISD::NodeType ExtendType) {
+ SDValue Op = getNonRegisterValue(V);
+ assert((Op.getOpcode() != ISD::CopyFromReg ||
+ cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
+ "Copy from a reg to the same reg!");
+ assert(!Register::isPhysicalRegister(Reg) && "Is a physreg");
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // If this is an InlineAsm we have to match the registers required, not the
+ // notional registers required by the type.
+
+ RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg, V->getType(),
+ std::nullopt); // This is not an ABI copy.
+ SDValue Chain = DAG.getEntryNode();
+
+ if (ExtendType == ISD::ANY_EXTEND) {
+ auto PreferredExtendIt = FuncInfo.PreferredExtendType.find(V);
+ if (PreferredExtendIt != FuncInfo.PreferredExtendType.end())
+ ExtendType = PreferredExtendIt->second;
+ }
+ RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType);
+ PendingExports.push_back(Chain);
+}
+
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
+/// entry block, return true. Uses by a switch count as uses outside the entry
+/// block, since the switch may be expanded into multiple basic blocks.
+static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
+ // With FastISel active, we may be splitting blocks, so force creation
+ // of virtual registers for all non-dead arguments.
+ if (FastISel)
+ return A->use_empty();
+
+ const BasicBlock &Entry = A->getParent()->front();
+ for (const User *U : A->users())
+ if (cast<Instruction>(U)->getParent() != &Entry || isa<SwitchInst>(U))
+ return false; // Use not in entry block.
+
+ return true;
+}
+
+using ArgCopyElisionMapTy =
+ DenseMap<const Argument *,
+ std::pair<const AllocaInst *, const StoreInst *>>;
+
+/// Scan the entry block of the function in FuncInfo for arguments that look
+/// like copies into a local alloca. Record any copied arguments in
+/// ArgCopyElisionCandidates.
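+///
+/// A typical candidate is an entry-block pattern such as (illustrative IR):
+///   %x.addr = alloca i32
+///   store i32 %x, ptr %x.addr
+/// where %x is an argument, the store fully initializes the alloca, and no
+/// other instruction clobbers or escapes it.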
+static void
+findArgumentCopyElisionCandidates(const DataLayout &DL,
+ FunctionLoweringInfo *FuncInfo,
+ ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
+ // Record the state of every static alloca used in the entry block. Argument
+ // allocas are all used in the entry block, so we need approximately as many
+ // entries as we have arguments.
+ enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
+ SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
+ unsigned NumArgs = FuncInfo->Fn->arg_size();
+ StaticAllocas.reserve(NumArgs * 2);
+
+ auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
+ if (!V)
+ return nullptr;
+ V = V->stripPointerCasts();
+ const auto *AI = dyn_cast<AllocaInst>(V);
+ if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI))
+ return nullptr;
+ auto Iter = StaticAllocas.insert({AI, Unknown});
+ return &Iter.first->second;
+ };
+
+ // Look for stores of arguments to static allocas. Look through bitcasts and
+ // GEPs to handle type coercions, as long as the alloca is fully initialized
+ // by the store. Any non-store use of an alloca escapes it and any subsequent
+ // unanalyzed store might write it.
+ // FIXME: Handle structs initialized with multiple stores.
+ for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
+ // Look for stores, and handle non-store uses conservatively.
+ const auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI) {
+ // We will look through cast uses, so ignore them completely.
+ if (I.isCast())
+ continue;
+ // Ignore debug info and pseudo op intrinsics, they don't escape or store
+ // to allocas.
+ if (I.isDebugOrPseudoInst())
+ continue;
+ // This is an unknown instruction. Assume it escapes or writes to all
+ // static alloca operands.
+ for (const Use &U : I.operands()) {
+ if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U))
+ *Info = StaticAllocaInfo::Clobbered;
+ }
+ continue;
+ }
+
+ // If the stored value is a static alloca, mark it as escaped.
+ if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand()))
+ *Info = StaticAllocaInfo::Clobbered;
+
+ // Check if the destination is a static alloca.
+ const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
+ StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst);
+ if (!Info)
+ continue;
+ const AllocaInst *AI = cast<AllocaInst>(Dst);
+
+ // Skip allocas that have been initialized or clobbered.
+ if (*Info != StaticAllocaInfo::Unknown)
+ continue;
+
+ // Check if the stored value is an argument, and that this store fully
+ // initializes the alloca.
+ // If the argument type has padding bits we can't directly forward a pointer
+ // as the upper bits may contain garbage.
+ // Don't elide copies from the same argument twice.
+ const Value *Val = SI->getValueOperand()->stripPointerCasts();
+ const auto *Arg = dyn_cast<Argument>(Val);
+ if (!Arg || Arg->hasPassPointeeByValueCopyAttr() ||
+ Arg->getType()->isEmptyTy() ||
+ DL.getTypeStoreSize(Arg->getType()) !=
+ DL.getTypeAllocSize(AI->getAllocatedType()) ||
+ !DL.typeSizeEqualsStoreSize(Arg->getType()) ||
+ ArgCopyElisionCandidates.count(Arg)) {
+ *Info = StaticAllocaInfo::Clobbered;
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI
+ << '\n');
+
+ // Mark this alloca and store for argument copy elision.
+ *Info = StaticAllocaInfo::Elidable;
+ ArgCopyElisionCandidates.insert({Arg, {AI, SI}});
+
+ // Stop scanning if we've seen all arguments. This will happen early in -O0
+ // builds, which is useful, because -O0 builds have large entry blocks and
+ // many allocas.
+ if (ArgCopyElisionCandidates.size() == NumArgs)
+ break;
+ }
+}
+
+/// Try to elide argument copies from memory into a local alloca. Succeeds if
+/// ArgVal is a load from a suitable fixed stack object.
+static void tryToElideArgumentCopy(
+ FunctionLoweringInfo &FuncInfo, SmallVectorImpl<SDValue> &Chains,
+ DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
+ SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
+ ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
+ ArrayRef<SDValue> ArgVals, bool &ArgHasUses) {
+ // Check if this is a load from a fixed stack object.
+ auto *LNode = dyn_cast<LoadSDNode>(ArgVals[0]);
+ if (!LNode)
+ return;
+ auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode());
+ if (!FINode)
+ return;
+
+ // Check that the fixed stack object is the right size and alignment.
+ // Look at the alignment that the user wrote on the alloca instead of looking
+ // at the stack object.
+ auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg);
+ assert(ArgCopyIter != ArgCopyElisionCandidates.end());
+ const AllocaInst *AI = ArgCopyIter->second.first;
+ int FixedIndex = FINode->getIndex();
+ int &AllocaIndex = FuncInfo.StaticAllocaMap[AI];
+ int OldIndex = AllocaIndex;
+ MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
+ if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
+ LLVM_DEBUG(
+ dbgs() << " argument copy elision failed due to bad fixed stack "
+ "object size\n");
+ return;
+ }
+ Align RequiredAlignment = AI->getAlign();
+ if (MFI.getObjectAlign(FixedIndex) < RequiredAlignment) {
+ LLVM_DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
+ "greater than stack argument alignment ("
+ << DebugStr(RequiredAlignment) << " vs "
+ << DebugStr(MFI.getObjectAlign(FixedIndex)) << ")\n");
+ return;
+ }
+
+ // Perform the elision. Delete the old stack object and replace its only use
+ // in the variable info map. Mark the stack object as mutable.
+ LLVM_DEBUG({
+ dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
+ << " Replacing frame index " << OldIndex << " with " << FixedIndex
+ << '\n';
+ });
+ MFI.RemoveStackObject(OldIndex);
+ MFI.setIsImmutableObjectIndex(FixedIndex, false);
+ AllocaIndex = FixedIndex;
+ ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
+ for (SDValue ArgVal : ArgVals)
+ Chains.push_back(ArgVal.getValue(1));
+
+ // Avoid emitting code for the store implementing the copy.
+ const StoreInst *SI = ArgCopyIter->second.second;
+ ElidedArgCopyInstrs.insert(SI);
+
+ // Check for uses of the argument again so that we can avoid exporting ArgVal
+  // if it isn't used by anything other than the store.
+ for (const Value *U : Arg.users()) {
+ if (U != SI) {
+ ArgHasUses = true;
+ break;
+ }
+ }
+}
+
+void SelectionDAGISel::LowerArguments(const Function &F) {
+ SelectionDAG &DAG = SDB->DAG;
+ SDLoc dl = SDB->getCurSDLoc();
+ const DataLayout &DL = DAG.getDataLayout();
+ SmallVector<ISD::InputArg, 16> Ins;
+
+ // In Naked functions we aren't going to save any registers.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return;
+
+ if (!FuncInfo->CanLowerReturn) {
+ // Put in an sret pointer parameter before all the other parameters.
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(*TLI, DAG.getDataLayout(),
+ PointerType::get(F.getContext(),
+ DAG.getDataLayout().getAllocaAddrSpace()),
+ ValueVTs);
+
+ // NOTE: Assuming that a pointer will never break down to more than one VT
+ // or one register.
+ ISD::ArgFlagsTy Flags;
+ Flags.setSRet();
+ MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
+ ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
+ ISD::InputArg::NoArgIndex, 0);
+ Ins.push_back(RetArg);
+ }
+
+ // Look for stores of arguments to static allocas. Mark such arguments with a
+ // flag to ask the target to give us the memory location of that argument if
+ // available.
+ ArgCopyElisionMapTy ArgCopyElisionCandidates;
+ findArgumentCopyElisionCandidates(DL, FuncInfo.get(),
+ ArgCopyElisionCandidates);
+
+ // Set up the incoming argument description vector.
+ for (const Argument &Arg : F.args()) {
+ unsigned ArgNo = Arg.getArgNo();
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
+ bool isArgValueUsed = !Arg.use_empty();
+ unsigned PartBase = 0;
+ Type *FinalType = Arg.getType();
+ if (Arg.hasAttribute(Attribute::ByVal))
+ FinalType = Arg.getParamByValType();
+ bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
+ FinalType, F.getCallingConv(), F.isVarArg(), DL);
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ EVT VT = ValueVTs[Value];
+ Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+ ISD::ArgFlagsTy Flags;
+
+ if (Arg.getType()->isPointerTy()) {
+ Flags.setPointer();
+ Flags.setPointerAddrSpace(
+ cast<PointerType>(Arg.getType())->getAddressSpace());
+ }
+ if (Arg.hasAttribute(Attribute::ZExt))
+ Flags.setZExt();
+ if (Arg.hasAttribute(Attribute::SExt))
+ Flags.setSExt();
+ if (Arg.hasAttribute(Attribute::InReg)) {
+        // If we are using the vectorcall calling convention, a structure that
+        // is passed InReg must be an HVA (Homogeneous Vector Aggregate).
+        if (F.getCallingConv() == CallingConv::X86_VectorCall &&
+            isa<StructType>(Arg.getType())) {
+          // The first value of a structure is marked as the start of the HVA.
+ if (0 == Value)
+ Flags.setHvaStart();
+ Flags.setHva();
+ }
+ // Set InReg Flag
+ Flags.setInReg();
+ }
+ if (Arg.hasAttribute(Attribute::StructRet))
+ Flags.setSRet();
+ if (Arg.hasAttribute(Attribute::SwiftSelf))
+ Flags.setSwiftSelf();
+ if (Arg.hasAttribute(Attribute::SwiftAsync))
+ Flags.setSwiftAsync();
+ if (Arg.hasAttribute(Attribute::SwiftError))
+ Flags.setSwiftError();
+ if (Arg.hasAttribute(Attribute::ByVal))
+ Flags.setByVal();
+ if (Arg.hasAttribute(Attribute::ByRef))
+ Flags.setByRef();
+ if (Arg.hasAttribute(Attribute::InAlloca)) {
+ Flags.setInAlloca();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // inalloca. This way we can know how many bytes we should've allocated
+ // and how many bytes a callee cleanup function will pop. If we port
+ // inalloca to more targets, we'll have to add custom inalloca handling
+ // in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ if (Arg.hasAttribute(Attribute::Preallocated)) {
+ Flags.setPreallocated();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // preallocated. This way we can know how many bytes we should've
+ // allocated and how many bytes a callee cleanup function will pop. If
+ // we port preallocated to more targets, we'll have to add custom
+ // preallocated handling in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+
+ // Certain targets (such as MIPS), may have a different ABI alignment
+ // for a type depending on the context. Give the target a chance to
+ // specify the alignment it wants.
+ const Align OriginalAlignment(
+ TLI->getABIAlignmentForCallingConv(ArgTy, DL));
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Align MemAlign;
+ Type *ArgMemTy = nullptr;
+ if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated() ||
+ Flags.isByRef()) {
+ if (!ArgMemTy)
+ ArgMemTy = Arg.getPointeeInMemoryValueType();
+
+ uint64_t MemSize = DL.getTypeAllocSize(ArgMemTy);
+
+ // For in-memory arguments, size and alignment should be passed from FE.
+ // BE will guess if this info is not there but there are cases it cannot
+ // get right.
+ if (auto ParamAlign = Arg.getParamStackAlign())
+ MemAlign = *ParamAlign;
+ else if ((ParamAlign = Arg.getParamAlign()))
+ MemAlign = *ParamAlign;
+ else
+ MemAlign = Align(TLI->getByValTypeAlignment(ArgMemTy, DL));
+ if (Flags.isByRef())
+ Flags.setByRefSize(MemSize);
+ else
+ Flags.setByValSize(MemSize);
+ } else if (auto ParamAlign = Arg.getParamStackAlign()) {
+ MemAlign = *ParamAlign;
+ } else {
+ MemAlign = OriginalAlignment;
+ }
+ Flags.setMemAlign(MemAlign);
+
+ if (Arg.hasAttribute(Attribute::Nest))
+ Flags.setNest();
+ if (NeedsRegBlock)
+ Flags.setInConsecutiveRegs();
+ if (ArgCopyElisionCandidates.count(&Arg))
+ Flags.setCopyElisionCandidate();
+ if (Arg.hasAttribute(Attribute::Returned))
+ Flags.setReturned();
+
+ MVT RegisterVT = TLI->getRegisterTypeForCallingConv(
+ *CurDAG->getContext(), F.getCallingConv(), VT);
+ unsigned NumRegs = TLI->getNumRegistersForCallingConv(
+ *CurDAG->getContext(), F.getCallingConv(), VT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ // For scalable vectors, use the minimum size; individual targets
+ // are responsible for handling scalable vector arguments and
+ // return values.
+ ISD::InputArg MyFlags(
+ Flags, RegisterVT, VT, isArgValueUsed, ArgNo,
+ PartBase + i * RegisterVT.getStoreSize().getKnownMinValue());
+ if (NumRegs > 1 && i == 0)
+ MyFlags.Flags.setSplit();
+        // If this isn't the first piece, the alignment must be 1.
+ else if (i > 0) {
+ MyFlags.Flags.setOrigAlign(Align(1));
+ if (i == NumRegs - 1)
+ MyFlags.Flags.setSplitEnd();
+ }
+ Ins.push_back(MyFlags);
+ }
+ if (NeedsRegBlock && Value == NumValues - 1)
+ Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
+ PartBase += VT.getStoreSize().getKnownMinValue();
+ }
+ }
+
+ // Call the target to set up the argument values.
+ SmallVector<SDValue, 8> InVals;
+ SDValue NewRoot = TLI->LowerFormalArguments(
+ DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals);
+
+ // Verify that the target's LowerFormalArguments behaved as expected.
+ assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
+ "LowerFormalArguments didn't return a valid chain!");
+ assert(InVals.size() == Ins.size() &&
+ "LowerFormalArguments didn't emit the correct number of values!");
+ LLVM_DEBUG({
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ assert(InVals[i].getNode() &&
+ "LowerFormalArguments emitted a null value!");
+ assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
+ "LowerFormalArguments emitted a value with the wrong type!");
+ }
+ });
+
+ // Update the DAG with the new chain value resulting from argument lowering.
+ DAG.setRoot(NewRoot);
+
+ // Set up the argument values.
+ unsigned i = 0;
+ if (!FuncInfo->CanLowerReturn) {
+ // Create a virtual register for the sret pointer, and put in a copy
+ // from the sret argument into it.
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(*TLI, DAG.getDataLayout(),
+ PointerType::get(F.getContext(),
+ DAG.getDataLayout().getAllocaAddrSpace()),
+ ValueVTs);
+ MVT VT = ValueVTs[0].getSimpleVT();
+ MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
+ std::optional<ISD::NodeType> AssertOp;
+ SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT,
+ nullptr, F.getCallingConv(), AssertOp);
+
+ MachineFunction& MF = SDB->DAG.getMachineFunction();
+ MachineRegisterInfo& RegInfo = MF.getRegInfo();
+ Register SRetReg =
+ RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
+ FuncInfo->DemoteRegister = SRetReg;
+ NewRoot =
+ SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue);
+ DAG.setRoot(NewRoot);
+
+ // i indexes lowered arguments. Bump it past the hidden sret argument.
+ ++i;
+ }
+
+ SmallVector<SDValue, 4> Chains;
+ DenseMap<int, int> ArgCopyElisionFrameIndexMap;
+ for (const Argument &Arg : F.args()) {
+ SmallVector<SDValue, 4> ArgValues;
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ continue;
+
+ bool ArgHasUses = !Arg.use_empty();
+
+ // Elide the copying store if the target loaded this argument from a
+ // suitable fixed stack object.
+ if (Ins[i].Flags.isCopyElisionCandidate()) {
+ unsigned NumParts = 0;
+ for (EVT VT : ValueVTs)
+ NumParts += TLI->getNumRegistersForCallingConv(*CurDAG->getContext(),
+ F.getCallingConv(), VT);
+
+ tryToElideArgumentCopy(*FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
+ ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
+ ArrayRef(&InVals[i], NumParts), ArgHasUses);
+ }
+
+    // If this argument is unused, remember its value; it is used to generate
+    // debugging information.
+ bool isSwiftErrorArg =
+ TLI->supportSwiftError() &&
+ Arg.hasAttribute(Attribute::SwiftError);
+ if (!ArgHasUses && !isSwiftErrorArg) {
+ SDB->setUnusedArgValue(&Arg, InVals[i]);
+
+ // Also remember any frame index for use in FastISel.
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
+ }
+
+ for (unsigned Val = 0; Val != NumValues; ++Val) {
+ EVT VT = ValueVTs[Val];
+ MVT PartVT = TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(),
+ F.getCallingConv(), VT);
+ unsigned NumParts = TLI->getNumRegistersForCallingConv(
+ *CurDAG->getContext(), F.getCallingConv(), VT);
+
+ // Even an apparent 'unused' swifterror argument needs to be returned. So
+ // we do generate a copy for it that can be used on return from the
+ // function.
+ if (ArgHasUses || isSwiftErrorArg) {
+ std::optional<ISD::NodeType> AssertOp;
+ if (Arg.hasAttribute(Attribute::SExt))
+ AssertOp = ISD::AssertSext;
+ else if (Arg.hasAttribute(Attribute::ZExt))
+ AssertOp = ISD::AssertZext;
+
+ ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
+ PartVT, VT, nullptr,
+ F.getCallingConv(), AssertOp));
+ }
+
+ i += NumParts;
+ }
+
+ // We don't need to do anything else for unused arguments.
+ if (ArgValues.empty())
+ continue;
+
+ // Note down frame index.
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
+
+ SDValue Res = DAG.getMergeValues(ArrayRef(ArgValues.data(), NumValues),
+ SDB->getCurSDLoc());
+
+ SDB->setValue(&Arg, Res);
+ if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
+      // We want to associate the argument with the frame index of whichever
+      // involved operand corresponds to the lowest address. The
+      // getCopyFromParts function, called earlier, may swap the order of the
+      // operands to BUILD_PAIR depending on endianness. After that swapping,
+      // the least significant bits of the argument are in the first operand
+      // of the BUILD_PAIR node, and the most significant bits are in the
+      // second operand.
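+      // For example (illustrative), an i64 argument split into two i32 parts
+      // on a 32-bit big-endian target has its most significant half at the
+      // lower address; that half is operand 1 of the BUILD_PAIR, so the
+      // low-address operand is operand 1 there and operand 0 on little-endian.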
+ unsigned LowAddressOp = DAG.getDataLayout().isBigEndian() ? 1 : 0;
+ if (LoadSDNode *LNode =
+ dyn_cast<LoadSDNode>(Res.getOperand(LowAddressOp).getNode()))
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
+ }
+
+ // Analyses past this point are naive and don't expect an assertion.
+ if (Res.getOpcode() == ISD::AssertZext)
+ Res = Res.getOperand(0);
+
+ // Update the SwiftErrorVRegDefMap.
+ if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) {
+ unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+ if (Register::isVirtualRegister(Reg))
+ SwiftError->setCurrentVReg(FuncInfo->MBB, SwiftError->getFunctionArg(),
+ Reg);
+ }
+
+ // If this argument is live outside of the entry block, insert a copy from
+ // wherever we got it to the vreg that other BB's will reference it as.
+ if (Res.getOpcode() == ISD::CopyFromReg) {
+ // If we can, though, try to skip creating an unnecessary vreg.
+ // FIXME: This isn't very clean... it would be nice to make this more
+ // general.
+ unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+ if (Register::isVirtualRegister(Reg)) {
+ FuncInfo->ValueMap[&Arg] = Reg;
+ continue;
+ }
+ }
+ if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) {
+ FuncInfo->InitializeRegForValue(&Arg);
+ SDB->CopyToExportRegsIfNeeded(&Arg);
+ }
+ }
+
+ if (!Chains.empty()) {
+ Chains.push_back(NewRoot);
+ NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ }
+
+ DAG.setRoot(NewRoot);
+
+ assert(i == InVals.size() && "Argument register count mismatch!");
+
+ // If any argument copy elisions occurred and we have debug info, update the
+ // stale frame indices used in the dbg.declare variable info table.
+ if (!ArgCopyElisionFrameIndexMap.empty()) {
+ for (MachineFunction::VariableDbgInfo &VI :
+ MF->getInStackSlotVariableDbgInfo()) {
+ auto I = ArgCopyElisionFrameIndexMap.find(VI.getStackSlot());
+ if (I != ArgCopyElisionFrameIndexMap.end())
+ VI.updateStackSlot(I->second);
+ }
+ }
+
+ // Finally, if the target has anything special to do, allow it to do so.
+ emitFunctionEntryCode();
+}
+
+/// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to
+/// ensure constants are generated when needed. Remember the virtual registers
+/// that need to be added to the Machine PHI nodes as input. We cannot just
+/// directly add them, because expansion might result in multiple MBB's for one
+/// BB. As such, the start of the BB might correspond to a different MBB than
+/// the end.
+void
+SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+
+ // Check PHI nodes in successors that expect a value to be available from this
+ // block.
+ for (const BasicBlock *SuccBB : successors(LLVMBB->getTerminator())) {
+ if (!isa<PHINode>(SuccBB->begin())) continue;
+ MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
+
+ // If this terminator has multiple identical successors (common for
+ // switches), only handle each succ once.
+ if (!SuccsHandled.insert(SuccMBB).second)
+ continue;
+
+ MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+
+ // At this point we know that there is a 1-1 correspondence between LLVM PHI
+ // nodes and Machine PHI nodes, but the incoming operands have not been
+ // emitted yet.
+ for (const PHINode &PN : SuccBB->phis()) {
+      // Ignore dead PHIs.
+ if (PN.use_empty())
+ continue;
+
+ // Skip empty types
+ if (PN.getType()->isEmptyTy())
+ continue;
+
+ unsigned Reg;
+ const Value *PHIOp = PN.getIncomingValueForBlock(LLVMBB);
+
+ if (const auto *C = dyn_cast<Constant>(PHIOp)) {
+ unsigned &RegOut = ConstantsOut[C];
+ if (RegOut == 0) {
+ RegOut = FuncInfo.CreateRegs(C);
+ // We need to zero/sign extend ConstantInt phi operands to match
+ // assumptions in FunctionLoweringInfo::ComputePHILiveOutRegInfo.
+ ISD::NodeType ExtendType = ISD::ANY_EXTEND;
+ if (auto *CI = dyn_cast<ConstantInt>(C))
+ ExtendType = TLI.signExtendConstant(CI) ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
+ CopyValueToVirtualRegister(C, RegOut, ExtendType);
+ }
+ Reg = RegOut;
+ } else {
+ DenseMap<const Value *, Register>::iterator I =
+ FuncInfo.ValueMap.find(PHIOp);
+ if (I != FuncInfo.ValueMap.end())
+ Reg = I->second;
+ else {
+ assert(isa<AllocaInst>(PHIOp) &&
+ FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
+ "Didn't codegen value into a register!??");
+ Reg = FuncInfo.CreateRegs(PHIOp);
+ CopyValueToVirtualRegister(PHIOp, Reg);
+ }
+ }
+
+      // Remember that this register needs to be added to the machine PHI node
+      // as the input for this MBB.
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), PN.getType(), ValueVTs);
+ for (EVT VT : ValueVTs) {
+ const unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
+ for (unsigned i = 0; i != NumRegisters; ++i)
+ FuncInfo.PHINodesToUpdate.push_back(
+ std::make_pair(&*MBBI++, Reg + i));
+ Reg += NumRegisters;
+ }
+ }
+ }
+
+ ConstantsOut.clear();
+}
+
+MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) {
+ MachineFunction::iterator I(MBB);
+ if (++I == FuncInfo.MF->end())
+ return nullptr;
+ return &*I;
+}
+
+/// During lowering new call nodes can be created (such as memset, etc.).
+/// Those will become new roots of the current DAG, but complications arise
+/// when they are tail calls. In such cases, the call lowering will update
+/// the root, but the builder still needs to know that a tail call has been
+/// lowered in order to avoid generating an additional return.
+void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {
+ // If the node is null, we do have a tail call.
+ if (MaybeTC.getNode() != nullptr)
+ DAG.setRoot(MaybeTC);
+ else
+ HasTailCall = true;
+}
+
+void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
+ MachineBasicBlock *SwitchMBB,
+ MachineBasicBlock *DefaultMBB) {
+ MachineFunction *CurMF = FuncInfo.MF;
+ MachineBasicBlock *NextMBB = nullptr;
+ MachineFunction::iterator BBI(W.MBB);
+ if (++BBI != FuncInfo.MF->end())
+ NextMBB = &*BBI;
+
+ unsigned Size = W.LastCluster - W.FirstCluster + 1;
+
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+
+ if (Size == 2 && W.MBB == SwitchMBB) {
+    // If any two of the cases have the same destination, and if the two case
+    // values differ in exactly one bit (the bit is set in one value and unset
+    // in the other), use bit manipulation to do two compares at once.
+    // For example: "if (X == 6 || X == 4)" -> "if ((X|2) == 6)"
+ // TODO: This could be extended to merge any 2 cases in switches with 3
+ // cases.
+ // TODO: Handle cases where W.CaseBB != SwitchBB.
+ CaseCluster &Small = *W.FirstCluster;
+ CaseCluster &Big = *W.LastCluster;
+
+ if (Small.Low == Small.High && Big.Low == Big.High &&
+ Small.MBB == Big.MBB) {
+ const APInt &SmallValue = Small.Low->getValue();
+ const APInt &BigValue = Big.Low->getValue();
+
+ // Check that there is only one bit different.
+ APInt CommonBit = BigValue ^ SmallValue;
+ if (CommonBit.isPowerOf2()) {
+ SDValue CondLHS = getValue(Cond);
+ EVT VT = CondLHS.getValueType();
+ SDLoc DL = getCurSDLoc();
+
+ SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS,
+ DAG.getConstant(CommonBit, DL, VT));
+ SDValue Cond = DAG.getSetCC(
+ DL, MVT::i1, Or, DAG.getConstant(BigValue | SmallValue, DL, VT),
+ ISD::SETEQ);
+
+ // Update successor info.
+ // Both Small and Big will jump to Small.BB, so we sum up the
+ // probabilities.
+ addSuccessorWithProb(SwitchMBB, Small.MBB, Small.Prob + Big.Prob);
+ if (BPI)
+ addSuccessorWithProb(
+ SwitchMBB, DefaultMBB,
+ // The default destination is the first successor in IR.
+ BPI->getEdgeProbability(SwitchMBB->getBasicBlock(), (unsigned)0));
+ else
+ addSuccessorWithProb(SwitchMBB, DefaultMBB);
+
+ // Insert the true branch.
+ SDValue BrCond =
+ DAG.getNode(ISD::BRCOND, DL, MVT::Other, getControlRoot(), Cond,
+ DAG.getBasicBlock(Small.MBB));
+ // Insert the false branch.
+ BrCond = DAG.getNode(ISD::BR, DL, MVT::Other, BrCond,
+ DAG.getBasicBlock(DefaultMBB));
+
+ DAG.setRoot(BrCond);
+ return;
+ }
+ }
+ }
+
+ if (TM.getOptLevel() != CodeGenOpt::None) {
+ // Here, we order cases by probability so the most likely case will be
+ // checked first. However, two clusters can have the same probability in
+ // which case their relative ordering is non-deterministic. So we use Low
+ // as a tie-breaker as clusters are guaranteed to never overlap.
+ llvm::sort(W.FirstCluster, W.LastCluster + 1,
+ [](const CaseCluster &a, const CaseCluster &b) {
+ return a.Prob != b.Prob ?
+ a.Prob > b.Prob :
+ a.Low->getValue().slt(b.Low->getValue());
+ });
+
+ // Rearrange the case blocks so that the last one falls through if possible
+ // without changing the order of probabilities.
+ for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) {
+ --I;
+ if (I->Prob > W.LastCluster->Prob)
+ break;
+ if (I->Kind == CC_Range && I->MBB == NextMBB) {
+ std::swap(*I, *W.LastCluster);
+ break;
+ }
+ }
+ }
+
+ // Compute total probability.
+ BranchProbability DefaultProb = W.DefaultProb;
+ BranchProbability UnhandledProbs = DefaultProb;
+ for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I)
+ UnhandledProbs += I->Prob;
+
+ MachineBasicBlock *CurMBB = W.MBB;
+ for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) {
+ bool FallthroughUnreachable = false;
+ MachineBasicBlock *Fallthrough;
+ if (I == W.LastCluster) {
+ // For the last cluster, fall through to the default destination.
+ Fallthrough = DefaultMBB;
+ FallthroughUnreachable = isa<UnreachableInst>(
+ DefaultMBB->getBasicBlock()->getFirstNonPHIOrDbg());
+ } else {
+ Fallthrough = CurMF->CreateMachineBasicBlock(CurMBB->getBasicBlock());
+ CurMF->insert(BBI, Fallthrough);
+ // Put Cond in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(Cond);
+ }
+ UnhandledProbs -= I->Prob;
+
+ switch (I->Kind) {
+ case CC_JumpTable: {
+ // FIXME: Optimize away range check based on pivot comparisons.
+ JumpTableHeader *JTH = &SL->JTCases[I->JTCasesIndex].first;
+ SwitchCG::JumpTable *JT = &SL->JTCases[I->JTCasesIndex].second;
+
+ // The jump block hasn't been inserted yet; insert it here.
+ MachineBasicBlock *JumpMBB = JT->MBB;
+ CurMF->insert(BBI, JumpMBB);
+
+ auto JumpProb = I->Prob;
+ auto FallthroughProb = UnhandledProbs;
+
+ // If the default statement is a target of the jump table, we evenly
+ // distribute the default probability to successors of CurMBB. Also
+ // update the probability on the edge from JumpMBB to Fallthrough.
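+      // E.g. (illustrative) with DefaultProb = 1/4: 1/8 is added to the
+      // CurMBB->JumpMBB edge, 1/8 is taken from the CurMBB->Fallthrough edge,
+      // and the JumpMBB->DefaultMBB edge probability is set to 1/8.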
+ for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(),
+ SE = JumpMBB->succ_end();
+ SI != SE; ++SI) {
+ if (*SI == DefaultMBB) {
+ JumpProb += DefaultProb / 2;
+ FallthroughProb -= DefaultProb / 2;
+ JumpMBB->setSuccProbability(SI, DefaultProb / 2);
+ JumpMBB->normalizeSuccProbs();
+ break;
+ }
+ }
+
+ // If the default clause is unreachable, propagate that knowledge into
+ // JTH->FallthroughUnreachable which will use it to suppress the range
+ // check.
+ //
+ // However, don't do this if we're doing branch target enforcement,
+ // because a table branch _without_ a range check can be a tempting JOP
+ // gadget - out-of-bounds inputs that are impossible in correct
+ // execution become possible again if an attacker can influence the
+ // control flow. So if an attacker doesn't already have a BTI bypass
+ // available, we don't want them to be able to get one out of this
+ // table branch.
+ if (FallthroughUnreachable) {
+ Function &CurFunc = CurMF->getFunction();
+ bool HasBranchTargetEnforcement = false;
+ if (CurFunc.hasFnAttribute("branch-target-enforcement")) {
+ HasBranchTargetEnforcement =
+ CurFunc.getFnAttribute("branch-target-enforcement")
+ .getValueAsBool();
+ } else {
+ HasBranchTargetEnforcement =
+ CurMF->getMMI().getModule()->getModuleFlag(
+ "branch-target-enforcement");
+ }
+ if (!HasBranchTargetEnforcement)
+ JTH->FallthroughUnreachable = true;
+ }
+
+ if (!JTH->FallthroughUnreachable)
+ addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb);
+ addSuccessorWithProb(CurMBB, JumpMBB, JumpProb);
+ CurMBB->normalizeSuccProbs();
+
+ // The jump table header will be inserted in our current block, do the
+ // range check, and fall through to our fallthrough block.
+ JTH->HeaderBB = CurMBB;
+ JT->Default = Fallthrough; // FIXME: Move Default to JumpTableHeader.
+
+ // If we're in the right place, emit the jump table header right now.
+ if (CurMBB == SwitchMBB) {
+ visitJumpTableHeader(*JT, *JTH, SwitchMBB);
+ JTH->Emitted = true;
+ }
+ break;
+ }
+ case CC_BitTests: {
+ // FIXME: Optimize away range check based on pivot comparisons.
+ BitTestBlock *BTB = &SL->BitTestCases[I->BTCasesIndex];
+
+ // The bit test blocks haven't been inserted yet; insert them here.
+ for (BitTestCase &BTC : BTB->Cases)
+ CurMF->insert(BBI, BTC.ThisBB);
+
+ // Fill in fields of the BitTestBlock.
+ BTB->Parent = CurMBB;
+ BTB->Default = Fallthrough;
+
+ BTB->DefaultProb = UnhandledProbs;
+ // If the cases in bit test don't form a contiguous range, we evenly
+ // distribute the probability on the edge to Fallthrough to two
+ // successors of CurMBB.
+ if (!BTB->ContiguousRange) {
+ BTB->Prob += DefaultProb / 2;
+ BTB->DefaultProb -= DefaultProb / 2;
+ }
+
+ if (FallthroughUnreachable)
+ BTB->FallthroughUnreachable = true;
+
+ // If we're in the right place, emit the bit test header right now.
+ if (CurMBB == SwitchMBB) {
+ visitBitTestHeader(*BTB, SwitchMBB);
+ BTB->Emitted = true;
+ }
+ break;
+ }
+ case CC_Range: {
+ const Value *RHS, *LHS, *MHS;
+ ISD::CondCode CC;
+ if (I->Low == I->High) {
+ // Check Cond == I->Low.
+ CC = ISD::SETEQ;
+ LHS = Cond;
+      RHS = I->Low;
+ MHS = nullptr;
+ } else {
+ // Check I->Low <= Cond <= I->High.
+ CC = ISD::SETLE;
+ LHS = I->Low;
+ MHS = Cond;
+ RHS = I->High;
+ }
+
+ // If Fallthrough is unreachable, fold away the comparison.
+ if (FallthroughUnreachable)
+ CC = ISD::SETTRUE;
+
+ // The false probability is the sum of all unhandled cases.
+ CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB,
+ getCurSDLoc(), I->Prob, UnhandledProbs);
+
+ if (CurMBB == SwitchMBB)
+ visitSwitchCase(CB, SwitchMBB);
+ else
+ SL->SwitchCases.push_back(CB);
+
+ break;
+ }
+ }
+ CurMBB = Fallthrough;
+ }
+}
+
+unsigned SelectionDAGBuilder::caseClusterRank(const CaseCluster &CC,
+ CaseClusterIt First,
+ CaseClusterIt Last) {
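+  // The rank is the number of clusters in [First, Last] that would be ordered
+  // ahead of CC: higher probability first, ties broken by the smaller case
+  // value. Illustrative example (probabilities assumed): among clusters with
+  // probabilities {4/9, 2/9, 2/9, 1/9}, the second 2/9 cluster has rank 2.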
+ return std::count_if(First, Last + 1, [&](const CaseCluster &X) {
+ if (X.Prob != CC.Prob)
+ return X.Prob > CC.Prob;
+
+ // Ties are broken by comparing the case value.
+ return X.Low->getValue().slt(CC.Low->getValue());
+ });
+}
+
+void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList,
+ const SwitchWorkListItem &W,
+ Value *Cond,
+ MachineBasicBlock *SwitchMBB) {
+ assert(W.FirstCluster->Low->getValue().slt(W.LastCluster->Low->getValue()) &&
+ "Clusters not sorted?");
+
+ assert(W.LastCluster - W.FirstCluster + 1 >= 2 && "Too small to split!");
+
+ // Balance the tree based on branch probabilities to create a near-optimal (in
+ // terms of search time given key frequency) binary search tree. See e.g. Kurt
+ // Mehlhorn "Nearly Optimal Binary Search Trees" (1975).
+ CaseClusterIt LastLeft = W.FirstCluster;
+ CaseClusterIt FirstRight = W.LastCluster;
+ auto LeftProb = LastLeft->Prob + W.DefaultProb / 2;
+ auto RightProb = FirstRight->Prob + W.DefaultProb / 2;
+
+ // Move LastLeft and FirstRight towards each other from opposite directions to
+ // find a partitioning of the clusters which balances the probability on both
+ // sides. If LeftProb and RightProb are equal, alternate which side is
+ // taken to ensure 0-probability nodes are distributed evenly.
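+  //
+  // Small worked example (probabilities assumed for illustration): for cluster
+  // probabilities {0.4, 0.1, 0.1, 0.4} and a zero DefaultProb, the loop first
+  // grows the right side to {0.1, 0.4} (alternating on the tie), then the left
+  // side to {0.4, 0.1}, leaving a balanced 0.5 / 0.5 split.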
+ unsigned I = 0;
+ while (LastLeft + 1 < FirstRight) {
+ if (LeftProb < RightProb || (LeftProb == RightProb && (I & 1)))
+ LeftProb += (++LastLeft)->Prob;
+ else
+ RightProb += (--FirstRight)->Prob;
+ I++;
+ }
+
+ while (true) {
+ // Our binary search tree differs from a typical BST in that ours can have up
+ // to three values in each leaf. The pivot selection above doesn't take that
+ // into account, which means the tree might require more nodes and be less
+ // efficient. We compensate for this here.
+
+ unsigned NumLeft = LastLeft - W.FirstCluster + 1;
+ unsigned NumRight = W.LastCluster - FirstRight + 1;
+
+ if (std::min(NumLeft, NumRight) < 3 && std::max(NumLeft, NumRight) > 3) {
+ // If one side has less than 3 clusters, and the other has more than 3,
+ // consider taking a cluster from the other side.
+
+ if (NumLeft < NumRight) {
+ // Consider moving the first cluster on the right to the left side.
+ CaseCluster &CC = *FirstRight;
+ unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
+ unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
+ if (LeftSideRank <= RightSideRank) {
+ // Moving the cluster to the left does not demote it.
+ ++LastLeft;
+ ++FirstRight;
+ continue;
+ }
+ } else {
+ assert(NumRight < NumLeft);
+ // Consider moving the last element on the left to the right side.
+ CaseCluster &CC = *LastLeft;
+ unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
+ unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
+ if (RightSideRank <= LeftSideRank) {
+          // Moving the cluster to the right does not demote it.
+ --LastLeft;
+ --FirstRight;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+
+ assert(LastLeft + 1 == FirstRight);
+ assert(LastLeft >= W.FirstCluster);
+ assert(FirstRight <= W.LastCluster);
+
+ // Use the first element on the right as pivot since we will make less-than
+ // comparisons against it.
+ CaseClusterIt PivotCluster = FirstRight;
+ assert(PivotCluster > W.FirstCluster);
+ assert(PivotCluster <= W.LastCluster);
+
+ CaseClusterIt FirstLeft = W.FirstCluster;
+ CaseClusterIt LastRight = W.LastCluster;
+
+ const ConstantInt *Pivot = PivotCluster->Low;
+
+ // New blocks will be inserted immediately after the current one.
+ MachineFunction::iterator BBI(W.MBB);
+ ++BBI;
+
+ // We will branch to the LHS if Value < Pivot. If LHS is a single cluster,
+ // we can branch to its destination directly if it's squeezed exactly in
+ // between the known lower bound and Pivot - 1.
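+  // For instance (bounds assumed): if the known lower bound W.GE is 10, the
+  // pivot is 20 and the only left-hand cluster is the range [10, 19], there is
+  // nothing left to check and we branch straight to that cluster's block.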
+ MachineBasicBlock *LeftMBB;
+ if (FirstLeft == LastLeft && FirstLeft->Kind == CC_Range &&
+ FirstLeft->Low == W.GE &&
+ (FirstLeft->High->getValue() + 1LL) == Pivot->getValue()) {
+ LeftMBB = FirstLeft->MBB;
+ } else {
+ LeftMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
+ FuncInfo.MF->insert(BBI, LeftMBB);
+ WorkList.push_back(
+ {LeftMBB, FirstLeft, LastLeft, W.GE, Pivot, W.DefaultProb / 2});
+ // Put Cond in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(Cond);
+ }
+
+ // Similarly, we will branch to the RHS if Value >= Pivot. If RHS is a
+ // single cluster, RHS.Low == Pivot, and we can branch to its destination
+ // directly if RHS.High equals the current upper bound.
+ MachineBasicBlock *RightMBB;
+ if (FirstRight == LastRight && FirstRight->Kind == CC_Range &&
+ W.LT && (FirstRight->High->getValue() + 1ULL) == W.LT->getValue()) {
+ RightMBB = FirstRight->MBB;
+ } else {
+ RightMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
+ FuncInfo.MF->insert(BBI, RightMBB);
+ WorkList.push_back(
+ {RightMBB, FirstRight, LastRight, Pivot, W.LT, W.DefaultProb / 2});
+ // Put Cond in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(Cond);
+ }
+
+ // Create the CaseBlock record that will be used to lower the branch.
+ CaseBlock CB(ISD::SETLT, Cond, Pivot, nullptr, LeftMBB, RightMBB, W.MBB,
+ getCurSDLoc(), LeftProb, RightProb);
+
+ if (W.MBB == SwitchMBB)
+ visitSwitchCase(CB, SwitchMBB);
+ else
+ SL->SwitchCases.push_back(CB);
+}
+
+// Scale CaseProb after peeling a case with the probability of PeeledCaseProb
+// from the switch statement.
+static BranchProbability scaleCaseProbality(BranchProbability CaseProb,
+ BranchProbability PeeledCaseProb) {
+ if (PeeledCaseProb == BranchProbability::getOne())
+ return BranchProbability::getZero();
+ BranchProbability SwitchProb = PeeledCaseProb.getCompl();
+
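+  // The result is roughly CaseProb / SwitchProb, i.e. CaseProb rescaled to the
+  // probability mass remaining after the peel, clamped to at most one.
+  // Worked example (numbers assumed): peeling a 3/4 case leaves
+  // SwitchProb == 1/4, so a case that had probability 1/8 scales to
+  // (1/8) / (1/4) == 1/2.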
+ uint32_t Numerator = CaseProb.getNumerator();
+ uint32_t Denominator = SwitchProb.scale(CaseProb.getDenominator());
+ return BranchProbability(Numerator, std::max(Numerator, Denominator));
+}
+
+// Try to peel the top probability case if it exceeds the threshold.
+// Return current MachineBasicBlock for the switch statement if the peeling
+// does not occur.
+// If the peeling is performed, return the newly created MachineBasicBlock
+// for the peeled switch statement. Also update Clusters to remove the peeled
+// case. PeeledCaseProb is the BranchProbability for the peeled case.
+MachineBasicBlock *SelectionDAGBuilder::peelDominantCaseCluster(
+ const SwitchInst &SI, CaseClusterVector &Clusters,
+ BranchProbability &PeeledCaseProb) {
+ MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
+ // Don't perform if there is only one cluster or optimizing for size.
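+  // Note that SwitchPeelThreshold is interpreted as a percentage; a value
+  // above 100 disables peeling entirely (first condition below).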
+ if (SwitchPeelThreshold > 100 || !FuncInfo.BPI || Clusters.size() < 2 ||
+ TM.getOptLevel() == CodeGenOpt::None ||
+ SwitchMBB->getParent()->getFunction().hasMinSize())
+ return SwitchMBB;
+
+ BranchProbability TopCaseProb = BranchProbability(SwitchPeelThreshold, 100);
+ unsigned PeeledCaseIndex = 0;
+ bool SwitchPeeled = false;
+ for (unsigned Index = 0; Index < Clusters.size(); ++Index) {
+ CaseCluster &CC = Clusters[Index];
+ if (CC.Prob < TopCaseProb)
+ continue;
+ TopCaseProb = CC.Prob;
+ PeeledCaseIndex = Index;
+ SwitchPeeled = true;
+ }
+ if (!SwitchPeeled)
+ return SwitchMBB;
+
+ LLVM_DEBUG(dbgs() << "Peeled one top case in switch stmt, prob: "
+ << TopCaseProb << "\n");
+
+ // Record the MBB for the peeled switch statement.
+ MachineFunction::iterator BBI(SwitchMBB);
+ ++BBI;
+ MachineBasicBlock *PeeledSwitchMBB =
+ FuncInfo.MF->CreateMachineBasicBlock(SwitchMBB->getBasicBlock());
+ FuncInfo.MF->insert(BBI, PeeledSwitchMBB);
+
+ ExportFromCurrentBlock(SI.getCondition());
+ auto PeeledCaseIt = Clusters.begin() + PeeledCaseIndex;
+ SwitchWorkListItem W = {SwitchMBB, PeeledCaseIt, PeeledCaseIt,
+ nullptr, nullptr, TopCaseProb.getCompl()};
+ lowerWorkItem(W, SI.getCondition(), SwitchMBB, PeeledSwitchMBB);
+
+ Clusters.erase(PeeledCaseIt);
+ for (CaseCluster &CC : Clusters) {
+ LLVM_DEBUG(
+        dbgs() << "Scale the probability for one cluster, before scaling: "
+ << CC.Prob << "\n");
+ CC.Prob = scaleCaseProbality(CC.Prob, TopCaseProb);
+ LLVM_DEBUG(dbgs() << "After scaling: " << CC.Prob << "\n");
+ }
+ PeeledCaseProb = TopCaseProb;
+ return PeeledSwitchMBB;
+}
+
+void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
+ // Extract cases from the switch.
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ CaseClusterVector Clusters;
+ Clusters.reserve(SI.getNumCases());
+ for (auto I : SI.cases()) {
+ MachineBasicBlock *Succ = FuncInfo.MBBMap[I.getCaseSuccessor()];
+ const ConstantInt *CaseVal = I.getCaseValue();
+ BranchProbability Prob =
+ BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex())
+ : BranchProbability(1, SI.getNumCases() + 1);
+ Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob));
+ }
+
+ MachineBasicBlock *DefaultMBB = FuncInfo.MBBMap[SI.getDefaultDest()];
+
+ // Cluster adjacent cases with the same destination. We do this at all
+ // optimization levels because it's cheap to do and will make codegen faster
+ // if there are many clusters.
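+  // For example, cases 1, 2 and 3 (values chosen purely for illustration) that
+  // all branch to the same successor become one CC_Range cluster [1, 3].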
+ sortAndRangeify(Clusters);
+
+  // The branch probability of the peeled case.
+ BranchProbability PeeledCaseProb = BranchProbability::getZero();
+ MachineBasicBlock *PeeledSwitchMBB =
+ peelDominantCaseCluster(SI, Clusters, PeeledCaseProb);
+
+ // If there is only the default destination, jump there directly.
+ MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
+ if (Clusters.empty()) {
+ assert(PeeledSwitchMBB == SwitchMBB);
+ SwitchMBB->addSuccessor(DefaultMBB);
+ if (DefaultMBB != NextBlock(SwitchMBB)) {
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
+ getControlRoot(), DAG.getBasicBlock(DefaultMBB)));
+ }
+ return;
+ }
+
+ SL->findJumpTables(Clusters, &SI, DefaultMBB, DAG.getPSI(), DAG.getBFI());
+ SL->findBitTestClusters(Clusters, &SI);
+
+ LLVM_DEBUG({
+ dbgs() << "Case clusters: ";
+ for (const CaseCluster &C : Clusters) {
+ if (C.Kind == CC_JumpTable)
+ dbgs() << "JT:";
+ if (C.Kind == CC_BitTests)
+ dbgs() << "BT:";
+
+ C.Low->getValue().print(dbgs(), true);
+ if (C.Low != C.High) {
+ dbgs() << '-';
+ C.High->getValue().print(dbgs(), true);
+ }
+ dbgs() << ' ';
+ }
+ dbgs() << '\n';
+ });
+
+ assert(!Clusters.empty());
+ SwitchWorkList WorkList;
+ CaseClusterIt First = Clusters.begin();
+ CaseClusterIt Last = Clusters.end() - 1;
+ auto DefaultProb = getEdgeProbability(PeeledSwitchMBB, DefaultMBB);
+  // Scale the branch probability for DefaultMBB if peeling occurs and
+ // DefaultMBB is not replaced.
+ if (PeeledCaseProb != BranchProbability::getZero() &&
+ DefaultMBB == FuncInfo.MBBMap[SI.getDefaultDest()])
+ DefaultProb = scaleCaseProbality(DefaultProb, PeeledCaseProb);
+ WorkList.push_back(
+ {PeeledSwitchMBB, First, Last, nullptr, nullptr, DefaultProb});
+
+ while (!WorkList.empty()) {
+ SwitchWorkListItem W = WorkList.pop_back_val();
+ unsigned NumClusters = W.LastCluster - W.FirstCluster + 1;
+
+ if (NumClusters > 3 && TM.getOptLevel() != CodeGenOpt::None &&
+ !DefaultMBB->getParent()->getFunction().hasMinSize()) {
+ // For optimized builds, lower large range as a balanced binary tree.
+ splitWorkItem(WorkList, W, SI.getCondition(), SwitchMBB);
+ continue;
+ }
+
+ lowerWorkItem(W, SI.getCondition(), SwitchMBB, DefaultMBB);
+ }
+}
+
+void SelectionDAGBuilder::visitStepVector(const CallInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto DL = getCurSDLoc();
+ EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I, DAG.getStepVector(DL, ResultVT));
+}
+
+void SelectionDAGBuilder::visitVectorReverse(const CallInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ SDLoc DL = getCurSDLoc();
+ SDValue V = getValue(I.getOperand(0));
+ assert(VT == V.getValueType() && "Malformed vector.reverse!");
+
+ if (VT.isScalableVector()) {
+ setValue(&I, DAG.getNode(ISD::VECTOR_REVERSE, DL, VT, V));
+ return;
+ }
+
+ // Use VECTOR_SHUFFLE for the fixed-length vector
+ // to maintain existing behavior.
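+  // E.g. for a 4-element vector the reversal mask built below is <3, 2, 1, 0>.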
+ SmallVector<int, 8> Mask;
+ unsigned NumElts = VT.getVectorMinNumElements();
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(NumElts - 1 - i);
+
+ setValue(&I, DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask));
+}
+
+void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I) {
+ auto DL = getCurSDLoc();
+ SDValue InVec = getValue(I.getOperand(0));
+ EVT OutVT =
+ InVec.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
+
+ unsigned OutNumElts = OutVT.getVectorMinNumElements();
+
+ // ISD Node needs the input vectors split into two equal parts
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec,
+ DAG.getVectorIdxConstant(OutNumElts, DL));
+
+ // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing
+ // legalisation and combines.
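+  // Assuming createStrideMask(Start, Stride, VF) yields <Start, Start+Stride,
+  // ...>, a 4-element result would use <0, 2, 4, 6> for the even lanes and
+  // <1, 3, 5, 7> for the odd lanes of the concatenated Lo/Hi pair.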
+ if (OutVT.isFixedLengthVector()) {
+ SDValue Even = DAG.getVectorShuffle(OutVT, DL, Lo, Hi,
+ createStrideMask(0, 2, OutNumElts));
+ SDValue Odd = DAG.getVectorShuffle(OutVT, DL, Lo, Hi,
+ createStrideMask(1, 2, OutNumElts));
+ SDValue Res = DAG.getMergeValues({Even, Odd}, getCurSDLoc());
+ setValue(&I, Res);
+ return;
+ }
+
+ SDValue Res = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
+ DAG.getVTList(OutVT, OutVT), Lo, Hi);
+ setValue(&I, Res);
+}
+
+void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I) {
+ auto DL = getCurSDLoc();
+ EVT InVT = getValue(I.getOperand(0)).getValueType();
+ SDValue InVec0 = getValue(I.getOperand(0));
+ SDValue InVec1 = getValue(I.getOperand(1));
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT OutVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing
+ // legalisation and combines.
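+  // Assuming createInterleaveMask(VF, 2) yields <0, VF, 1, VF+1, ...>, two
+  // 4-element inputs are concatenated and shuffled with the mask
+  // <0, 4, 1, 5, 2, 6, 3, 7>.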
+ if (OutVT.isFixedLengthVector()) {
+ unsigned NumElts = InVT.getVectorMinNumElements();
+ SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVec0, InVec1);
+ setValue(&I, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT),
+ createInterleaveMask(NumElts, 2)));
+ return;
+ }
+
+ SDValue Res = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
+ DAG.getVTList(InVT, InVT), InVec0, InVec1);
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Res.getValue(0),
+ Res.getValue(1));
+ setValue(&I, Res);
+}
+
+void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
+ ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0) return;
+
+ SmallVector<SDValue, 4> Values(NumValues);
+ SDValue Op = getValue(I.getOperand(0));
+
+ for (unsigned i = 0; i != NumValues; ++i)
+ Values[i] = DAG.getNode(ISD::FREEZE, getCurSDLoc(), ValueVTs[i],
+ SDValue(Op.getNode(), Op.getResNo() + i));
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(ValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ SDLoc DL = getCurSDLoc();
+ SDValue V1 = getValue(I.getOperand(0));
+ SDValue V2 = getValue(I.getOperand(1));
+ int64_t Imm = cast<ConstantInt>(I.getOperand(2))->getSExtValue();
+
+ // VECTOR_SHUFFLE doesn't support a scalable mask so use a dedicated node.
+ if (VT.isScalableVector()) {
+ MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2,
+ DAG.getConstant(Imm, DL, IdxVT)));
+ return;
+ }
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ uint64_t Idx = (NumElts + Imm) % NumElts;
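+  // E.g. with NumElts == 4 and Imm == -1, Idx is 3 and the mask built below is
+  // <3, 4, 5, 6>: the last element of V1 followed by the first three elements
+  // of V2.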
+
+ // Use VECTOR_SHUFFLE to maintain original behaviour for fixed-length vectors.
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i < NumElts; ++i)
+ Mask.push_back(Idx + i);
+ setValue(&I, DAG.getVectorShuffle(VT, DL, V1, V2, Mask));
+}
+
+// Consider the following MIR after SelectionDAG, which produces output in
+// physregs in the first case or virtregs in the second case.
+//
+// INLINEASM_BR ..., implicit-def $ebx, ..., implicit-def $edx
+// %5:gr32 = COPY $ebx
+// %6:gr32 = COPY $edx
+// %1:gr32 = COPY %6:gr32
+// %0:gr32 = COPY %5:gr32
+//
+// INLINEASM_BR ..., def %5:gr32, ..., def %6:gr32
+// %1:gr32 = COPY %6:gr32
+// %0:gr32 = COPY %5:gr32
+//
+// Given %0, we'd like to return $ebx in the first case and %5 in the second.
+// Given %1, we'd like to return $edx in the first case and %6 in the second.
+//
+// If a callbr has outputs, it will have a single mapping in FuncInfo.ValueMap
+// to a single virtreg (such as %0). The remaining outputs monotonically
+// increase in virtreg number from there. If a callbr has no outputs, then it
+// should not have a corresponding callbr landingpad; in fact, the callbr
+// landingpad would not even be able to refer to such a callbr.
+static Register FollowCopyChain(MachineRegisterInfo &MRI, Register Reg) {
+ MachineInstr *MI = MRI.def_begin(Reg)->getParent();
+ // There is definitely at least one copy.
+ assert(MI->getOpcode() == TargetOpcode::COPY &&
+ "start of copy chain MUST be COPY");
+ Reg = MI->getOperand(1).getReg();
+ MI = MRI.def_begin(Reg)->getParent();
+ // There may be an optional second copy.
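+  // In the physreg flavour of the example above, this peels %5 = COPY $ebx so
+  // that we return the physical register defined by the INLINEASM_BR.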
+ if (MI->getOpcode() == TargetOpcode::COPY) {
+ assert(Reg.isVirtual() && "expected COPY of virtual register");
+ Reg = MI->getOperand(1).getReg();
+ assert(Reg.isPhysical() && "expected COPY of physical register");
+ MI = MRI.def_begin(Reg)->getParent();
+ }
+ // The start of the chain must be an INLINEASM_BR.
+ assert(MI->getOpcode() == TargetOpcode::INLINEASM_BR &&
+ "end of copy chain MUST be INLINEASM_BR");
+ return Reg;
+}
+
+// We must do this walk rather than the simpler
+// setValue(&I, getCopyFromRegs(CBR, CBR->getType()));
+// otherwise we will end up with copies of virtregs only valid along direct
+// edges.
+void SelectionDAGBuilder::visitCallBrLandingPad(const CallInst &I) {
+ SmallVector<EVT, 8> ResultVTs;
+ SmallVector<SDValue, 8> ResultValues;
+ const auto *CBR =
+ cast<CallBrInst>(I.getParent()->getUniquePredecessor()->getTerminator());
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+
+ unsigned InitialDef = FuncInfo.ValueMap[CBR];
+ SDValue Chain = DAG.getRoot();
+
+ // Re-parse the asm constraints string.
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI.ParseConstraints(DAG.getDataLayout(), TRI, *CBR);
+ for (auto &T : TargetConstraints) {
+ SDISelAsmOperandInfo OpInfo(T);
+ if (OpInfo.Type != InlineAsm::isOutput)
+ continue;
+
+ // Pencil in OpInfo.ConstraintType and OpInfo.ConstraintVT based on the
+ // individual constraint.
+ TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
+
+ switch (OpInfo.ConstraintType) {
+ case TargetLowering::C_Register:
+ case TargetLowering::C_RegisterClass: {
+ // Fill in OpInfo.AssignedRegs.Regs.
+ getRegistersForValue(DAG, getCurSDLoc(), OpInfo, OpInfo);
+
+      // getRegistersForValue may produce one or more registers depending on
+      // whether OpInfo.ConstraintVT is legal on the target.
+ for (size_t i = 0, e = OpInfo.AssignedRegs.Regs.size(); i != e; ++i) {
+ Register OriginalDef = FollowCopyChain(MRI, InitialDef++);
+ if (Register::isPhysicalRegister(OriginalDef))
+ FuncInfo.MBB->addLiveIn(OriginalDef);
+ // Update the assigned registers to use the original defs.
+ OpInfo.AssignedRegs.Regs[i] = OriginalDef;
+ }
+
+ SDValue V = OpInfo.AssignedRegs.getCopyFromRegs(
+ DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, CBR);
+ ResultValues.push_back(V);
+ ResultVTs.push_back(OpInfo.ConstraintVT);
+ break;
+ }
+ case TargetLowering::C_Other: {
+ SDValue Flag;
+ SDValue V = TLI.LowerAsmOutputForConstraint(Chain, Flag, getCurSDLoc(),
+ OpInfo, DAG);
+ ++InitialDef;
+ ResultValues.push_back(V);
+ ResultVTs.push_back(OpInfo.ConstraintVT);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ SDValue V = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(ResultVTs), ResultValues);
+ setValue(&I, V);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
new file mode 100644
index 000000000000..f2496f24973a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -0,0 +1,803 @@
+//===- SelectionDAGBuilder.h - Selection-DAG building -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
+
+#include "StatepointLowering.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/AssignmentTrackingAnalysis.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/SwitchLoweringUtils.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <optional>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+class AAResults;
+class AllocaInst;
+class AtomicCmpXchgInst;
+class AtomicRMWInst;
+class AssumptionCache;
+class BasicBlock;
+class BranchInst;
+class CallInst;
+class CallBrInst;
+class CatchPadInst;
+class CatchReturnInst;
+class CatchSwitchInst;
+class CleanupPadInst;
+class CleanupReturnInst;
+class Constant;
+class ConstrainedFPIntrinsic;
+class DbgValueInst;
+class DataLayout;
+class DIExpression;
+class DILocalVariable;
+class DILocation;
+class FenceInst;
+class FunctionLoweringInfo;
+class GCFunctionInfo;
+class GCRelocateInst;
+class GCResultInst;
+class GCStatepointInst;
+class IndirectBrInst;
+class InvokeInst;
+class LandingPadInst;
+class LLVMContext;
+class LoadInst;
+class MachineBasicBlock;
+class PHINode;
+class ResumeInst;
+class ReturnInst;
+class SDDbgValue;
+class SelectionDAG;
+class StoreInst;
+class SwiftErrorValueTracking;
+class SwitchInst;
+class TargetLibraryInfo;
+class TargetMachine;
+class Type;
+class VAArgInst;
+class UnreachableInst;
+class Use;
+class User;
+class Value;
+
+//===----------------------------------------------------------------------===//
+/// SelectionDAGBuilder - This is the common target-independent lowering
+/// implementation that is parameterized by a TargetLowering object.
+///
+class SelectionDAGBuilder {
+ /// The current instruction being visited.
+ const Instruction *CurInst = nullptr;
+
+ DenseMap<const Value*, SDValue> NodeMap;
+
+ /// Maps argument value for unused arguments. This is used
+  /// Maps argument values for unused arguments. This is used
+ DenseMap<const Value*, SDValue> UnusedArgNodeMap;
+
+ /// Helper type for DanglingDebugInfoMap.
+ class DanglingDebugInfo {
+ using DbgValTy = const DbgValueInst *;
+ using VarLocTy = const VarLocInfo *;
+ PointerUnion<DbgValTy, VarLocTy> Info;
+ unsigned SDNodeOrder = 0;
+
+ public:
+ DanglingDebugInfo() = default;
+ DanglingDebugInfo(const DbgValueInst *DI, unsigned SDNO)
+ : Info(DI), SDNodeOrder(SDNO) {}
+ DanglingDebugInfo(const VarLocInfo *VarLoc, unsigned SDNO)
+ : Info(VarLoc), SDNodeOrder(SDNO) {}
+
+ DILocalVariable *getVariable(const FunctionVarLocs *Locs) const {
+ if (isa<VarLocTy>(Info))
+ return Locs->getDILocalVariable(cast<VarLocTy>(Info)->VariableID);
+ return cast<DbgValTy>(Info)->getVariable();
+ }
+ DIExpression *getExpression() const {
+ if (isa<VarLocTy>(Info))
+ return cast<VarLocTy>(Info)->Expr;
+ return cast<DbgValTy>(Info)->getExpression();
+ }
+ Value *getVariableLocationOp(unsigned Idx) const {
+ assert(Idx == 0 && "Dangling variadic debug values not supported yet");
+ if (isa<VarLocTy>(Info))
+ return cast<VarLocTy>(Info)->Values.getVariableLocationOp(Idx);
+ return cast<DbgValTy>(Info)->getVariableLocationOp(Idx);
+ }
+ DebugLoc getDebugLoc() const {
+ if (isa<VarLocTy>(Info))
+ return cast<VarLocTy>(Info)->DL;
+ return cast<DbgValTy>(Info)->getDebugLoc();
+ }
+ unsigned getSDNodeOrder() const { return SDNodeOrder; }
+
+ /// Helper for printing DanglingDebugInfo. This hoop-jumping is to
+ /// accommodate the fact that an argument is required for getVariable.
+ /// Call SelectionDAGBuilder::printDDI instead of using directly.
+ struct Print {
+ Print(const DanglingDebugInfo &DDI, const FunctionVarLocs *VarLocs)
+ : DDI(DDI), VarLocs(VarLocs) {}
+ const DanglingDebugInfo &DDI;
+ const FunctionVarLocs *VarLocs;
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const DanglingDebugInfo::Print &P) {
+ OS << "DDI(var=" << *P.DDI.getVariable(P.VarLocs)
+ << ", val= " << *P.DDI.getVariableLocationOp(0)
+ << ", expr=" << *P.DDI.getExpression()
+ << ", order=" << P.DDI.getSDNodeOrder()
+ << ", loc=" << P.DDI.getDebugLoc() << ")";
+ return OS;
+ }
+ };
+ };
+
+ /// Returns an object that defines `raw_ostream &operator<<` for printing.
+ /// Usage example:
+ //// errs() << printDDI(MyDanglingInfo) << " is dangling\n";
+ DanglingDebugInfo::Print printDDI(const DanglingDebugInfo &DDI) {
+ return DanglingDebugInfo::Print(DDI, DAG.getFunctionVarLocs());
+ }
+
+ /// Helper type for DanglingDebugInfoMap.
+ typedef std::vector<DanglingDebugInfo> DanglingDebugInfoVector;
+
+ /// Keeps track of dbg_values for which we have not yet seen the referent.
+ /// We defer handling these until we do see it.
+ MapVector<const Value*, DanglingDebugInfoVector> DanglingDebugInfoMap;
+
+ /// Cache the module flag for whether we should use debug-info assignment
+ /// tracking.
+ bool AssignmentTrackingEnabled = false;
+
+public:
+ /// Loads are not emitted to the program immediately. We bunch them up and
+ /// then emit token factor nodes when possible. This allows us to get simple
+ /// disambiguation between loads without worrying about alias analysis.
+ SmallVector<SDValue, 8> PendingLoads;
+
+ /// State used while lowering a statepoint sequence (gc_statepoint,
+ /// gc_relocate, and gc_result). See StatepointLowering.hpp/cpp for details.
+ StatepointLoweringState StatepointLowering;
+
+private:
+ /// CopyToReg nodes that copy values to virtual registers for export to other
+ /// blocks need to be emitted before any terminator instruction, but they have
+  /// no other ordering requirements. We bunch them up and then emit a single
+ /// tokenfactor for them just before terminator instructions.
+ SmallVector<SDValue, 8> PendingExports;
+
+ /// Similar to loads, nodes corresponding to constrained FP intrinsics are
+ /// bunched up and emitted when necessary. These can be moved across each
+ /// other and any (normal) memory operation (load or store), but not across
+ /// calls or instructions having unspecified side effects. As a special
+ /// case, constrained FP intrinsics using fpexcept.strict may not be deleted
+ /// even if otherwise unused, so they need to be chained before any
+ /// terminator instruction (like PendingExports). We track the latter
+ /// set of nodes in a separate list.
+ SmallVector<SDValue, 8> PendingConstrainedFP;
+ SmallVector<SDValue, 8> PendingConstrainedFPStrict;
+
+ /// Update root to include all chains from the Pending list.
+ SDValue updateRoot(SmallVectorImpl<SDValue> &Pending);
+
+ /// A unique monotonically increasing number used to order the SDNodes we
+ /// create.
+ unsigned SDNodeOrder;
+
+ /// Determine the rank by weight of CC in [First,Last]. If CC has more weight
+ /// than each cluster in the range, its rank is 0.
+ unsigned caseClusterRank(const SwitchCG::CaseCluster &CC,
+ SwitchCG::CaseClusterIt First,
+ SwitchCG::CaseClusterIt Last);
+
+ /// Emit comparison and split W into two subtrees.
+ void splitWorkItem(SwitchCG::SwitchWorkList &WorkList,
+ const SwitchCG::SwitchWorkListItem &W, Value *Cond,
+ MachineBasicBlock *SwitchMBB);
+
+ /// Lower W.
+ void lowerWorkItem(SwitchCG::SwitchWorkListItem W, Value *Cond,
+ MachineBasicBlock *SwitchMBB,
+ MachineBasicBlock *DefaultMBB);
+
+ /// Peel the top probability case if it exceeds the threshold
+ MachineBasicBlock *
+ peelDominantCaseCluster(const SwitchInst &SI,
+ SwitchCG::CaseClusterVector &Clusters,
+ BranchProbability &PeeledCaseProb);
+
+private:
+ const TargetMachine &TM;
+
+public:
+ /// Lowest valid SDNodeOrder. The special case 0 is reserved for scheduling
+ /// nodes without a corresponding SDNode.
+ static const unsigned LowestSDNodeOrder = 1;
+
+ SelectionDAG &DAG;
+ AAResults *AA = nullptr;
+ AssumptionCache *AC = nullptr;
+ const TargetLibraryInfo *LibInfo = nullptr;
+
+ class SDAGSwitchLowering : public SwitchCG::SwitchLowering {
+ public:
+ SDAGSwitchLowering(SelectionDAGBuilder *sdb, FunctionLoweringInfo &funcinfo)
+ : SwitchCG::SwitchLowering(funcinfo), SDB(sdb) {}
+
+ void addSuccessorWithProb(
+ MachineBasicBlock *Src, MachineBasicBlock *Dst,
+ BranchProbability Prob = BranchProbability::getUnknown()) override {
+ SDB->addSuccessorWithProb(Src, Dst, Prob);
+ }
+
+ private:
+ SelectionDAGBuilder *SDB = nullptr;
+ };
+
+ // Data related to deferred switch lowerings. Used to construct additional
+ // Basic Blocks in SelectionDAGISel::FinishBasicBlock.
+ std::unique_ptr<SDAGSwitchLowering> SL;
+
+ /// A StackProtectorDescriptor structure used to communicate stack protector
+ /// information in between SelectBasicBlock and FinishBasicBlock.
+ StackProtectorDescriptor SPDescriptor;
+
+ // Emit PHI-node-operand constants only once even if used by multiple
+ // PHI nodes.
+ DenseMap<const Constant *, unsigned> ConstantsOut;
+
+ /// Information about the function as a whole.
+ FunctionLoweringInfo &FuncInfo;
+
+ /// Information about the swifterror values used throughout the function.
+ SwiftErrorValueTracking &SwiftError;
+
+ /// Garbage collection metadata for the function.
+ GCFunctionInfo *GFI = nullptr;
+
+ /// Map a landing pad to the call site indexes.
+ DenseMap<MachineBasicBlock *, SmallVector<unsigned, 4>> LPadToCallSiteMap;
+
+ /// This is set to true if a call in the current block has been translated as
+ /// a tail call. In this case, no subsequent DAG nodes should be created.
+ bool HasTailCall = false;
+
+ LLVMContext *Context = nullptr;
+
+ SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
+ SwiftErrorValueTracking &swifterror, CodeGenOpt::Level ol)
+ : SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), DAG(dag),
+ SL(std::make_unique<SDAGSwitchLowering>(this, funcinfo)), FuncInfo(funcinfo),
+ SwiftError(swifterror) {}
+
+ void init(GCFunctionInfo *gfi, AAResults *AA, AssumptionCache *AC,
+ const TargetLibraryInfo *li);
+
+ /// Clear out the current SelectionDAG and the associated state and prepare
+ /// this SelectionDAGBuilder object to be used for a new block. This doesn't
+ /// clear out information about additional blocks that are needed to complete
+ /// switch lowering or PHI node updating; that information is cleared out as
+ /// it is consumed.
+ void clear();
+
+ /// Clear the dangling debug information map. This function is separated from
+ /// the clear so that debug information that is dangling in a basic block can
+ /// be properly resolved in a different basic block. This allows the
+ /// SelectionDAG to resolve dangling debug information attached to PHI nodes.
+ void clearDanglingDebugInfo();
+
+ /// Return the current virtual root of the Selection DAG, flushing any
+ /// PendingLoad items. This must be done before emitting a store or any other
+ /// memory node that may need to be ordered after any prior load instructions.
+ SDValue getMemoryRoot();
+
+ /// Similar to getMemoryRoot, but also flushes PendingConstrainedFP(Strict)
+  /// items. This must be done before emitting any call or any other node
+ /// that may need to be ordered after FP instructions due to other side
+ /// effects.
+ SDValue getRoot();
+
+ /// Similar to getRoot, but instead of flushing all the PendingLoad items,
+ /// flush all the PendingExports (and PendingConstrainedFPStrict) items.
+ /// It is necessary to do this before emitting a terminator instruction.
+ SDValue getControlRoot();
+
+ SDLoc getCurSDLoc() const {
+ return SDLoc(CurInst, SDNodeOrder);
+ }
+
+ DebugLoc getCurDebugLoc() const {
+ return CurInst ? CurInst->getDebugLoc() : DebugLoc();
+ }
+
+ void CopyValueToVirtualRegister(const Value *V, unsigned Reg,
+ ISD::NodeType ExtendType = ISD::ANY_EXTEND);
+
+ void visit(const Instruction &I);
+
+ void visit(unsigned Opcode, const User &I);
+
+  /// If a virtual register has been allocated for the value V, emit a
+  /// CopyFromReg of the specified type Ty. Otherwise return an empty SDValue().
+ SDValue getCopyFromRegs(const Value *V, Type *Ty);
+
+ /// Register a dbg_value which relies on a Value which we have not yet seen.
+ void addDanglingDebugInfo(const DbgValueInst *DI, unsigned Order);
+ void addDanglingDebugInfo(const VarLocInfo *VarLoc, unsigned Order);
+
+ /// If we have dangling debug info that describes \p Variable, or an
+ /// overlapping part of variable considering the \p Expr, then this method
+ /// will drop that debug info as it isn't valid any longer.
+ void dropDanglingDebugInfo(const DILocalVariable *Variable,
+ const DIExpression *Expr);
+
+ /// If we saw an earlier dbg_value referring to V, generate the debug data
+ /// structures now that we've seen its definition.
+ void resolveDanglingDebugInfo(const Value *V, SDValue Val);
+
+ /// For the given dangling debuginfo record, perform last-ditch efforts to
+ /// resolve the debuginfo to something that is represented in this DAG. If
+ /// this cannot be done, produce an Undef debug value record.
+ void salvageUnresolvedDbgValue(DanglingDebugInfo &DDI);
+
+ /// For a given list of Values, attempt to create and record a SDDbgValue in
+ /// the SelectionDAG.
+ bool handleDebugValue(ArrayRef<const Value *> Values, DILocalVariable *Var,
+ DIExpression *Expr, DebugLoc DbgLoc, unsigned Order,
+ bool IsVariadic);
+
+ /// Create a record for a kill location debug intrinsic.
+ void handleKillDebugValue(DILocalVariable *Var, DIExpression *Expr,
+ DebugLoc DbgLoc, unsigned Order);
+
+ /// Evict any dangling debug information, attempting to salvage it first.
+ void resolveOrClearDbgInfo();
+
+ SDValue getValue(const Value *V);
+
+ SDValue getNonRegisterValue(const Value *V);
+ SDValue getValueImpl(const Value *V);
+
+ void setValue(const Value *V, SDValue NewN) {
+ SDValue &N = NodeMap[V];
+ assert(!N.getNode() && "Already set a value for this node!");
+ N = NewN;
+ }
+
+ void setUnusedArgValue(const Value *V, SDValue NewN) {
+ SDValue &N = UnusedArgNodeMap[V];
+ assert(!N.getNode() && "Already set a value for this node!");
+ N = NewN;
+ }
+
+ void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
+ MachineBasicBlock *SwitchBB,
+ Instruction::BinaryOps Opc, BranchProbability TProb,
+ BranchProbability FProb, bool InvertCond);
+ void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB,
+ MachineBasicBlock *SwitchBB,
+ BranchProbability TProb, BranchProbability FProb,
+ bool InvertCond);
+ bool ShouldEmitAsBranches(const std::vector<SwitchCG::CaseBlock> &Cases);
+ bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);
+ void CopyToExportRegsIfNeeded(const Value *V);
+ void ExportFromCurrentBlock(const Value *V);
+ void LowerCallTo(const CallBase &CB, SDValue Callee, bool IsTailCall,
+ bool IsMustTailCall, const BasicBlock *EHPadBB = nullptr);
+
+  // Lower range metadata from 0 to N to an assert zext to an integer whose
+  // width is the nearest floor power of two.
+ SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
+ SDValue Op);
+
+ void populateCallLoweringInfo(TargetLowering::CallLoweringInfo &CLI,
+ const CallBase *Call, unsigned ArgIdx,
+ unsigned NumArgs, SDValue Callee,
+ Type *ReturnTy, bool IsPatchPoint);
+
+ std::pair<SDValue, SDValue>
+ lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
+ const BasicBlock *EHPadBB = nullptr);
+
+ /// When an MBB was split during scheduling, update the
+ /// references that need to refer to the last resulting block.
+ void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
+
+ /// Describes a gc.statepoint or a gc.statepoint like thing for the purposes
+ /// of lowering into a STATEPOINT node.
+ struct StatepointLoweringInfo {
+ /// Bases[i] is the base pointer for Ptrs[i]. Together they denote the set
+ /// of gc pointers this STATEPOINT has to relocate.
+ SmallVector<const Value *, 16> Bases;
+ SmallVector<const Value *, 16> Ptrs;
+
+ /// The set of gc.relocate calls associated with this gc.statepoint.
+ SmallVector<const GCRelocateInst *, 16> GCRelocates;
+
+ /// The full list of gc arguments to the gc.statepoint being lowered.
+ ArrayRef<const Use> GCArgs;
+
+ /// The gc.statepoint instruction.
+ const Instruction *StatepointInstr = nullptr;
+
+ /// The list of gc transition arguments present in the gc.statepoint being
+ /// lowered.
+ ArrayRef<const Use> GCTransitionArgs;
+
+ /// The ID that the resulting STATEPOINT instruction has to report.
+ unsigned ID = -1;
+
+ /// Information regarding the underlying call instruction.
+ TargetLowering::CallLoweringInfo CLI;
+
+ /// The deoptimization state associated with this gc.statepoint call, if
+ /// any.
+ ArrayRef<const Use> DeoptState;
+
+ /// Flags associated with the meta arguments being lowered.
+ uint64_t StatepointFlags = -1;
+
+ /// The number of patchable bytes the call needs to get lowered into.
+ unsigned NumPatchBytes = -1;
+
+ /// The exception handling unwind destination, in case this represents an
+ /// invoke of gc.statepoint.
+ const BasicBlock *EHPadBB = nullptr;
+
+ explicit StatepointLoweringInfo(SelectionDAG &DAG) : CLI(DAG) {}
+ };
+
+ /// Lower \p SLI into a STATEPOINT instruction.
+ SDValue LowerAsSTATEPOINT(StatepointLoweringInfo &SI);
+
+ // This function is responsible for the whole statepoint lowering process.
+ // It uniformly handles invoke and call statepoints.
+ void LowerStatepoint(const GCStatepointInst &I,
+ const BasicBlock *EHPadBB = nullptr);
+
+ void LowerCallSiteWithDeoptBundle(const CallBase *Call, SDValue Callee,
+ const BasicBlock *EHPadBB);
+
+ void LowerDeoptimizeCall(const CallInst *CI);
+ void LowerDeoptimizingReturn();
+
+ void LowerCallSiteWithDeoptBundleImpl(const CallBase *Call, SDValue Callee,
+ const BasicBlock *EHPadBB,
+ bool VarArgDisallowed,
+ bool ForceVoidReturnTy);
+
+ /// Returns the type of FrameIndex and TargetFrameIndex nodes.
+ MVT getFrameIndexTy() {
+ return DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout());
+ }
+
+private:
+ // Terminator instructions.
+ void visitRet(const ReturnInst &I);
+ void visitBr(const BranchInst &I);
+ void visitSwitch(const SwitchInst &I);
+ void visitIndirectBr(const IndirectBrInst &I);
+ void visitUnreachable(const UnreachableInst &I);
+ void visitCleanupRet(const CleanupReturnInst &I);
+ void visitCatchSwitch(const CatchSwitchInst &I);
+ void visitCatchRet(const CatchReturnInst &I);
+ void visitCatchPad(const CatchPadInst &I);
+ void visitCleanupPad(const CleanupPadInst &CPI);
+
+ BranchProbability getEdgeProbability(const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) const;
+ void addSuccessorWithProb(
+ MachineBasicBlock *Src, MachineBasicBlock *Dst,
+ BranchProbability Prob = BranchProbability::getUnknown());
+
+public:
+ void visitSwitchCase(SwitchCG::CaseBlock &CB, MachineBasicBlock *SwitchBB);
+ void visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB);
+ void visitSPDescriptorFailure(StackProtectorDescriptor &SPD);
+ void visitBitTestHeader(SwitchCG::BitTestBlock &B,
+ MachineBasicBlock *SwitchBB);
+ void visitBitTestCase(SwitchCG::BitTestBlock &BB, MachineBasicBlock *NextMBB,
+ BranchProbability BranchProbToNext, unsigned Reg,
+ SwitchCG::BitTestCase &B, MachineBasicBlock *SwitchBB);
+ void visitJumpTable(SwitchCG::JumpTable &JT);
+ void visitJumpTableHeader(SwitchCG::JumpTable &JT,
+ SwitchCG::JumpTableHeader &JTH,
+ MachineBasicBlock *SwitchBB);
+
+private:
+ // These all get lowered before this pass.
+ void visitInvoke(const InvokeInst &I);
+ void visitCallBr(const CallBrInst &I);
+ void visitCallBrLandingPad(const CallInst &I);
+ void visitResume(const ResumeInst &I);
+
+ void visitUnary(const User &I, unsigned Opcode);
+ void visitFNeg(const User &I) { visitUnary(I, ISD::FNEG); }
+
+ void visitBinary(const User &I, unsigned Opcode);
+ void visitShift(const User &I, unsigned Opcode);
+ void visitAdd(const User &I) { visitBinary(I, ISD::ADD); }
+ void visitFAdd(const User &I) { visitBinary(I, ISD::FADD); }
+ void visitSub(const User &I) { visitBinary(I, ISD::SUB); }
+ void visitFSub(const User &I) { visitBinary(I, ISD::FSUB); }
+ void visitMul(const User &I) { visitBinary(I, ISD::MUL); }
+ void visitFMul(const User &I) { visitBinary(I, ISD::FMUL); }
+ void visitURem(const User &I) { visitBinary(I, ISD::UREM); }
+ void visitSRem(const User &I) { visitBinary(I, ISD::SREM); }
+ void visitFRem(const User &I) { visitBinary(I, ISD::FREM); }
+ void visitUDiv(const User &I) { visitBinary(I, ISD::UDIV); }
+ void visitSDiv(const User &I);
+ void visitFDiv(const User &I) { visitBinary(I, ISD::FDIV); }
+ void visitAnd (const User &I) { visitBinary(I, ISD::AND); }
+ void visitOr (const User &I) { visitBinary(I, ISD::OR); }
+ void visitXor (const User &I) { visitBinary(I, ISD::XOR); }
+ void visitShl (const User &I) { visitShift(I, ISD::SHL); }
+ void visitLShr(const User &I) { visitShift(I, ISD::SRL); }
+ void visitAShr(const User &I) { visitShift(I, ISD::SRA); }
+ void visitICmp(const User &I);
+ void visitFCmp(const User &I);
+ // Visit the conversion instructions
+ void visitTrunc(const User &I);
+ void visitZExt(const User &I);
+ void visitSExt(const User &I);
+ void visitFPTrunc(const User &I);
+ void visitFPExt(const User &I);
+ void visitFPToUI(const User &I);
+ void visitFPToSI(const User &I);
+ void visitUIToFP(const User &I);
+ void visitSIToFP(const User &I);
+ void visitPtrToInt(const User &I);
+ void visitIntToPtr(const User &I);
+ void visitBitCast(const User &I);
+ void visitAddrSpaceCast(const User &I);
+
+ void visitExtractElement(const User &I);
+ void visitInsertElement(const User &I);
+ void visitShuffleVector(const User &I);
+
+ void visitExtractValue(const ExtractValueInst &I);
+ void visitInsertValue(const InsertValueInst &I);
+ void visitLandingPad(const LandingPadInst &LP);
+
+ void visitGetElementPtr(const User &I);
+ void visitSelect(const User &I);
+
+ void visitAlloca(const AllocaInst &I);
+ void visitLoad(const LoadInst &I);
+ void visitStore(const StoreInst &I);
+ void visitMaskedLoad(const CallInst &I, bool IsExpanding = false);
+ void visitMaskedStore(const CallInst &I, bool IsCompressing = false);
+ void visitMaskedGather(const CallInst &I);
+ void visitMaskedScatter(const CallInst &I);
+ void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
+ void visitAtomicRMW(const AtomicRMWInst &I);
+ void visitFence(const FenceInst &I);
+ void visitPHI(const PHINode &I);
+ void visitCall(const CallInst &I);
+ bool visitMemCmpBCmpCall(const CallInst &I);
+ bool visitMemPCpyCall(const CallInst &I);
+ bool visitMemChrCall(const CallInst &I);
+ bool visitStrCpyCall(const CallInst &I, bool isStpcpy);
+ bool visitStrCmpCall(const CallInst &I);
+ bool visitStrLenCall(const CallInst &I);
+ bool visitStrNLenCall(const CallInst &I);
+ bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
+ bool visitBinaryFloatCall(const CallInst &I, unsigned Opcode);
+ void visitAtomicLoad(const LoadInst &I);
+ void visitAtomicStore(const StoreInst &I);
+ void visitLoadFromSwiftError(const LoadInst &I);
+ void visitStoreToSwiftError(const StoreInst &I);
+ void visitFreeze(const FreezeInst &I);
+
+ void visitInlineAsm(const CallBase &Call,
+ const BasicBlock *EHPadBB = nullptr);
+ void visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
+ void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
+ void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
+ void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
+ const SmallVectorImpl<SDValue> &OpValues);
+ void visitVPStore(const VPIntrinsic &VPIntrin,
+ const SmallVectorImpl<SDValue> &OpValues);
+ void visitVPGather(const VPIntrinsic &VPIntrin, EVT VT,
+ const SmallVectorImpl<SDValue> &OpValues);
+ void visitVPScatter(const VPIntrinsic &VPIntrin,
+ const SmallVectorImpl<SDValue> &OpValues);
+ void visitVPStridedLoad(const VPIntrinsic &VPIntrin, EVT VT,
+ const SmallVectorImpl<SDValue> &OpValues);
+ void visitVPStridedStore(const VPIntrinsic &VPIntrin,
+ const SmallVectorImpl<SDValue> &OpValues);
+ void visitVPCmp(const VPCmpIntrinsic &VPIntrin);
+ void visitVectorPredicationIntrinsic(const VPIntrinsic &VPIntrin);
+
+ void visitVAStart(const CallInst &I);
+ void visitVAArg(const VAArgInst &I);
+ void visitVAEnd(const CallInst &I);
+ void visitVACopy(const CallInst &I);
+ void visitStackmap(const CallInst &I);
+ void visitPatchpoint(const CallBase &CB, const BasicBlock *EHPadBB = nullptr);
+
+ // These two are implemented in StatepointLowering.cpp
+ void visitGCRelocate(const GCRelocateInst &Relocate);
+ void visitGCResult(const GCResultInst &I);
+
+ void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
+ void visitVectorReverse(const CallInst &I);
+ void visitVectorSplice(const CallInst &I);
+ void visitVectorInterleave(const CallInst &I);
+ void visitVectorDeinterleave(const CallInst &I);
+ void visitStepVector(const CallInst &I);
+
+ void visitUserOp1(const Instruction &I) {
+ llvm_unreachable("UserOp1 should not exist at instruction selection time!");
+ }
+ void visitUserOp2(const Instruction &I) {
+ llvm_unreachable("UserOp2 should not exist at instruction selection time!");
+ }
+
+ void processIntegerCallValue(const Instruction &I,
+ SDValue Value, bool IsSigned);
+
+ void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
+
+ void emitInlineAsmError(const CallBase &Call, const Twine &Message);
+
+  /// An enum that records, when emitting a function argument dbg value, the
+  /// kind of intrinsic it originally came from. This controls the internal
+  /// behavior of EmitFuncArgumentDbgValue.
+ enum class FuncArgumentDbgValueKind {
+ Value, // This was originally a llvm.dbg.value.
+ Declare, // This was originally a llvm.dbg.declare.
+ };
+
+  /// If V is a function argument, create the corresponding DBG_VALUE machine
+  /// instruction for it now. At the end of instruction selection, these will
+  /// be inserted into the entry BB.
+ bool EmitFuncArgumentDbgValue(const Value *V, DILocalVariable *Variable,
+ DIExpression *Expr, DILocation *DL,
+ FuncArgumentDbgValueKind Kind,
+ const SDValue &N);
+
+ /// Return the next block after MBB, or nullptr if there is none.
+ MachineBasicBlock *NextBlock(MachineBasicBlock *MBB);
+
+ /// Update the DAG and DAG builder with the relevant information after
+ /// a new root node has been created which could be a tail call.
+ void updateDAGForMaybeTailCall(SDValue MaybeTC);
+
+ /// Return the appropriate SDDbgValue based on N.
+ SDDbgValue *getDbgValue(SDValue N, DILocalVariable *Variable,
+ DIExpression *Expr, const DebugLoc &dl,
+ unsigned DbgSDNodeOrder);
+
+ /// Lowers CallInst to an external symbol.
+ void lowerCallToExternalSymbol(const CallInst &I, const char *FunctionName);
+
+ SDValue lowerStartEH(SDValue Chain, const BasicBlock *EHPadBB,
+ MCSymbol *&BeginLabel);
+ SDValue lowerEndEH(SDValue Chain, const InvokeInst *II,
+ const BasicBlock *EHPadBB, MCSymbol *BeginLabel);
+};
+
+/// This struct represents the registers (physical or virtual)
+/// that a particular set of values is assigned, and the type information about
+/// the value. The most common situation is to represent one value at a time,
+/// but struct or array values are handled element-wise as multiple values. The
+/// splitting of aggregates is performed recursively, so that we never have
+/// aggregate-typed registers. The values at this point do not necessarily have
+/// legal types, so each value may require one or more registers of some legal
+/// type.
+///
+struct RegsForValue {
+ /// The value types of the values, which may not be legal, and
+  /// may need to be promoted or synthesized from one or more registers.
+ SmallVector<EVT, 4> ValueVTs;
+
+ /// The value types of the registers. This is the same size as ValueVTs and it
+  /// records, for each value, the type of the assigned register or
+  /// registers. (Individual values are never synthesized from more than one
+ /// type of register.)
+ ///
+  /// With virtual registers, the contents of RegVTs are redundant with TLI's
+  /// getRegisterType member function; however, with physical registers
+ /// it is necessary to have a separate record of the types.
+ SmallVector<MVT, 4> RegVTs;
+
+ /// This list holds the registers assigned to the values.
+ /// Each legal or promoted value requires one register, and each
+ /// expanded value requires multiple registers.
+ SmallVector<unsigned, 4> Regs;
+
+ /// This list holds the number of registers for each value.
+ SmallVector<unsigned, 4> RegCount;
+
+  /// Records whether this value needs to be treated in an ABI-dependent
+  /// manner, different from normal type legalization.
+ std::optional<CallingConv::ID> CallConv;
+
+ RegsForValue() = default;
+ RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt, EVT valuevt,
+ std::optional<CallingConv::ID> CC = std::nullopt);
+ RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
+ const DataLayout &DL, unsigned Reg, Type *Ty,
+ std::optional<CallingConv::ID> CC);
+
+ bool isABIMangled() const { return CallConv.has_value(); }
+
+ /// Add the specified values to this one.
+ void append(const RegsForValue &RHS) {
+ ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
+ RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
+ Regs.append(RHS.Regs.begin(), RHS.Regs.end());
+ RegCount.push_back(RHS.Regs.size());
+ }
+
+  /// Emit a series of CopyFromReg nodes that copy from this value and return
+  /// the result as a ValueVTs value. This uses Chain/Glue as the input and
+  /// updates them for the output Chain/Glue. If the Glue pointer is nullptr,
+  /// no glue is used.
+ SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
+ const SDLoc &dl, SDValue &Chain, SDValue *Glue,
+ const Value *V = nullptr) const;
+
+  /// Emit a series of CopyToReg nodes that copy the specified value into the
+  /// registers specified by this object. This uses Chain/Glue as the input and
+  /// updates them for the output Chain/Glue. If the Glue pointer is nullptr,
+  /// no glue is used. If V is not nullptr, it is used to print better
+  /// diagnostic messages on error.
+ void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl,
+ SDValue &Chain, SDValue *Glue, const Value *V = nullptr,
+ ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const;
+
+ /// Add this value to the specified inlineasm node operand list. This adds the
+ /// code marker, matching input operand index (if applicable), and includes
+ /// the number of values added into it.
+ void AddInlineAsmOperands(unsigned Code, bool HasMatching,
+ unsigned MatchingIdx, const SDLoc &dl,
+ SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
+
+ /// Check if the total RegCount is greater than one.
+ bool occupiesMultipleRegs() const {
+ return std::accumulate(RegCount.begin(), RegCount.end(), 0) > 1;
+ }
+
+ /// Return a list of registers and their sizes.
+ SmallVector<std::pair<unsigned, TypeSize>, 4> getRegsAndSizes() const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
new file mode 100644
index 000000000000..03a1ead5bbb4
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -0,0 +1,1096 @@
+//===- SelectionDAGDumper.cpp - Implement SelectionDAG::dump() ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG::dump method and friends.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Printable.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+
+static cl::opt<bool>
+VerboseDAGDumping("dag-dump-verbose", cl::Hidden,
+ cl::desc("Display more information when dumping selection "
+ "DAG nodes."));
+
+std::string SDNode::getOperationName(const SelectionDAG *G) const {
+ switch (getOpcode()) {
+ default:
+ if (getOpcode() < ISD::BUILTIN_OP_END)
+ return "<<Unknown DAG Node>>";
+ if (isMachineOpcode()) {
+ if (G)
+ if (const TargetInstrInfo *TII = G->getSubtarget().getInstrInfo())
+ if (getMachineOpcode() < TII->getNumOpcodes())
+ return std::string(TII->getName(getMachineOpcode()));
+ return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>";
+ }
+ if (G) {
+ const TargetLowering &TLI = G->getTargetLoweringInfo();
+ const char *Name = TLI.getTargetNodeName(getOpcode());
+ if (Name) return Name;
+ return "<<Unknown Target Node #" + utostr(getOpcode()) + ">>";
+ }
+ return "<<Unknown Node #" + utostr(getOpcode()) + ">>";
+
+#ifndef NDEBUG
+ case ISD::DELETED_NODE: return "<<Deleted Node!>>";
+#endif
+ case ISD::PREFETCH: return "Prefetch";
+ case ISD::MEMBARRIER: return "MemBarrier";
+ case ISD::ATOMIC_FENCE: return "AtomicFence";
+ case ISD::ATOMIC_CMP_SWAP: return "AtomicCmpSwap";
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return "AtomicCmpSwapWithSuccess";
+ case ISD::ATOMIC_SWAP: return "AtomicSwap";
+ case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd";
+ case ISD::ATOMIC_LOAD_SUB: return "AtomicLoadSub";
+ case ISD::ATOMIC_LOAD_AND: return "AtomicLoadAnd";
+ case ISD::ATOMIC_LOAD_CLR: return "AtomicLoadClr";
+ case ISD::ATOMIC_LOAD_OR: return "AtomicLoadOr";
+ case ISD::ATOMIC_LOAD_XOR: return "AtomicLoadXor";
+ case ISD::ATOMIC_LOAD_NAND: return "AtomicLoadNand";
+ case ISD::ATOMIC_LOAD_MIN: return "AtomicLoadMin";
+ case ISD::ATOMIC_LOAD_MAX: return "AtomicLoadMax";
+ case ISD::ATOMIC_LOAD_UMIN: return "AtomicLoadUMin";
+ case ISD::ATOMIC_LOAD_UMAX: return "AtomicLoadUMax";
+ case ISD::ATOMIC_LOAD_FADD: return "AtomicLoadFAdd";
+ case ISD::ATOMIC_LOAD_UINC_WRAP:
+ return "AtomicLoadUIncWrap";
+ case ISD::ATOMIC_LOAD_UDEC_WRAP:
+ return "AtomicLoadUDecWrap";
+ case ISD::ATOMIC_LOAD: return "AtomicLoad";
+ case ISD::ATOMIC_STORE: return "AtomicStore";
+ case ISD::PCMARKER: return "PCMarker";
+ case ISD::READCYCLECOUNTER: return "ReadCycleCounter";
+ case ISD::SRCVALUE: return "SrcValue";
+ case ISD::MDNODE_SDNODE: return "MDNode";
+ case ISD::EntryToken: return "EntryToken";
+ case ISD::TokenFactor: return "TokenFactor";
+ case ISD::AssertSext: return "AssertSext";
+ case ISD::AssertZext: return "AssertZext";
+ case ISD::AssertAlign: return "AssertAlign";
+
+ case ISD::BasicBlock: return "BasicBlock";
+ case ISD::VALUETYPE: return "ValueType";
+ case ISD::Register: return "Register";
+ case ISD::RegisterMask: return "RegisterMask";
+ case ISD::Constant:
+ if (cast<ConstantSDNode>(this)->isOpaque())
+ return "OpaqueConstant";
+ return "Constant";
+ case ISD::ConstantFP: return "ConstantFP";
+ case ISD::GlobalAddress: return "GlobalAddress";
+ case ISD::GlobalTLSAddress: return "GlobalTLSAddress";
+ case ISD::FrameIndex: return "FrameIndex";
+ case ISD::JumpTable: return "JumpTable";
+ case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE";
+ case ISD::RETURNADDR: return "RETURNADDR";
+ case ISD::ADDROFRETURNADDR: return "ADDROFRETURNADDR";
+ case ISD::FRAMEADDR: return "FRAMEADDR";
+ case ISD::SPONENTRY: return "SPONENTRY";
+ case ISD::LOCAL_RECOVER: return "LOCAL_RECOVER";
+ case ISD::READ_REGISTER: return "READ_REGISTER";
+ case ISD::WRITE_REGISTER: return "WRITE_REGISTER";
+ case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET";
+ case ISD::EH_DWARF_CFA: return "EH_DWARF_CFA";
+ case ISD::EH_RETURN: return "EH_RETURN";
+ case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP";
+ case ISD::EH_SJLJ_LONGJMP: return "EH_SJLJ_LONGJMP";
+ case ISD::EH_SJLJ_SETUP_DISPATCH: return "EH_SJLJ_SETUP_DISPATCH";
+ case ISD::ConstantPool: return "ConstantPool";
+ case ISD::TargetIndex: return "TargetIndex";
+ case ISD::ExternalSymbol: return "ExternalSymbol";
+ case ISD::BlockAddress: return "BlockAddress";
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned OpNo = getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 0 : 1;
+ unsigned IID = cast<ConstantSDNode>(getOperand(OpNo))->getZExtValue();
+ if (IID < Intrinsic::num_intrinsics)
+ return Intrinsic::getBaseName((Intrinsic::ID)IID).str();
+ if (!G)
+ return "Unknown intrinsic";
+ if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo())
+ return TII->getName(IID);
+ llvm_unreachable("Invalid intrinsic ID");
+ }
+
+ case ISD::BUILD_VECTOR: return "BUILD_VECTOR";
+ case ISD::TargetConstant:
+ if (cast<ConstantSDNode>(this)->isOpaque())
+ return "OpaqueTargetConstant";
+ return "TargetConstant";
+ case ISD::TargetConstantFP: return "TargetConstantFP";
+ case ISD::TargetGlobalAddress: return "TargetGlobalAddress";
+ case ISD::TargetGlobalTLSAddress: return "TargetGlobalTLSAddress";
+ case ISD::TargetFrameIndex: return "TargetFrameIndex";
+ case ISD::TargetJumpTable: return "TargetJumpTable";
+ case ISD::TargetConstantPool: return "TargetConstantPool";
+ case ISD::TargetExternalSymbol: return "TargetExternalSymbol";
+ case ISD::MCSymbol: return "MCSymbol";
+ case ISD::TargetBlockAddress: return "TargetBlockAddress";
+
+ case ISD::CopyToReg: return "CopyToReg";
+ case ISD::CopyFromReg: return "CopyFromReg";
+ case ISD::UNDEF: return "undef";
+ case ISD::VSCALE: return "vscale";
+ case ISD::MERGE_VALUES: return "merge_values";
+ case ISD::INLINEASM: return "inlineasm";
+ case ISD::INLINEASM_BR: return "inlineasm_br";
+ case ISD::EH_LABEL: return "eh_label";
+ case ISD::ANNOTATION_LABEL: return "annotation_label";
+ case ISD::HANDLENODE: return "handlenode";
+
+ // Unary operators
+ case ISD::FABS: return "fabs";
+ case ISD::FMINNUM: return "fminnum";
+ case ISD::STRICT_FMINNUM: return "strict_fminnum";
+ case ISD::FMAXNUM: return "fmaxnum";
+ case ISD::STRICT_FMAXNUM: return "strict_fmaxnum";
+ case ISD::FMINNUM_IEEE: return "fminnum_ieee";
+ case ISD::FMAXNUM_IEEE: return "fmaxnum_ieee";
+ case ISD::FMINIMUM: return "fminimum";
+ case ISD::STRICT_FMINIMUM: return "strict_fminimum";
+ case ISD::FMAXIMUM: return "fmaximum";
+ case ISD::STRICT_FMAXIMUM: return "strict_fmaximum";
+ case ISD::FNEG: return "fneg";
+ case ISD::FSQRT: return "fsqrt";
+ case ISD::STRICT_FSQRT: return "strict_fsqrt";
+ case ISD::FCBRT: return "fcbrt";
+ case ISD::FSIN: return "fsin";
+ case ISD::STRICT_FSIN: return "strict_fsin";
+ case ISD::FCOS: return "fcos";
+ case ISD::STRICT_FCOS: return "strict_fcos";
+ case ISD::FSINCOS: return "fsincos";
+ case ISD::FTRUNC: return "ftrunc";
+ case ISD::STRICT_FTRUNC: return "strict_ftrunc";
+ case ISD::FFLOOR: return "ffloor";
+ case ISD::STRICT_FFLOOR: return "strict_ffloor";
+ case ISD::FCEIL: return "fceil";
+ case ISD::STRICT_FCEIL: return "strict_fceil";
+ case ISD::FRINT: return "frint";
+ case ISD::STRICT_FRINT: return "strict_frint";
+ case ISD::FNEARBYINT: return "fnearbyint";
+ case ISD::STRICT_FNEARBYINT: return "strict_fnearbyint";
+ case ISD::FROUND: return "fround";
+ case ISD::STRICT_FROUND: return "strict_fround";
+ case ISD::FROUNDEVEN: return "froundeven";
+ case ISD::STRICT_FROUNDEVEN: return "strict_froundeven";
+ case ISD::FEXP: return "fexp";
+ case ISD::STRICT_FEXP: return "strict_fexp";
+ case ISD::FEXP2: return "fexp2";
+ case ISD::STRICT_FEXP2: return "strict_fexp2";
+ case ISD::FLOG: return "flog";
+ case ISD::STRICT_FLOG: return "strict_flog";
+ case ISD::FLOG2: return "flog2";
+ case ISD::STRICT_FLOG2: return "strict_flog2";
+ case ISD::FLOG10: return "flog10";
+ case ISD::STRICT_FLOG10: return "strict_flog10";
+
+ // Binary operators
+ case ISD::ADD: return "add";
+ case ISD::SUB: return "sub";
+ case ISD::MUL: return "mul";
+ case ISD::MULHU: return "mulhu";
+ case ISD::MULHS: return "mulhs";
+ case ISD::AVGFLOORU: return "avgflooru";
+ case ISD::AVGFLOORS: return "avgfloors";
+ case ISD::AVGCEILU: return "avgceilu";
+ case ISD::AVGCEILS: return "avgceils";
+ case ISD::ABDS: return "abds";
+ case ISD::ABDU: return "abdu";
+ case ISD::SDIV: return "sdiv";
+ case ISD::UDIV: return "udiv";
+ case ISD::SREM: return "srem";
+ case ISD::UREM: return "urem";
+ case ISD::SMUL_LOHI: return "smul_lohi";
+ case ISD::UMUL_LOHI: return "umul_lohi";
+ case ISD::SDIVREM: return "sdivrem";
+ case ISD::UDIVREM: return "udivrem";
+ case ISD::AND: return "and";
+ case ISD::OR: return "or";
+ case ISD::XOR: return "xor";
+ case ISD::SHL: return "shl";
+ case ISD::SRA: return "sra";
+ case ISD::SRL: return "srl";
+ case ISD::ROTL: return "rotl";
+ case ISD::ROTR: return "rotr";
+ case ISD::FSHL: return "fshl";
+ case ISD::FSHR: return "fshr";
+ case ISD::FADD: return "fadd";
+ case ISD::STRICT_FADD: return "strict_fadd";
+ case ISD::FSUB: return "fsub";
+ case ISD::STRICT_FSUB: return "strict_fsub";
+ case ISD::FMUL: return "fmul";
+ case ISD::STRICT_FMUL: return "strict_fmul";
+ case ISD::FDIV: return "fdiv";
+ case ISD::STRICT_FDIV: return "strict_fdiv";
+ case ISD::FMA: return "fma";
+ case ISD::STRICT_FMA: return "strict_fma";
+ case ISD::FMAD: return "fmad";
+ case ISD::FREM: return "frem";
+ case ISD::STRICT_FREM: return "strict_frem";
+ case ISD::FCOPYSIGN: return "fcopysign";
+ case ISD::FGETSIGN: return "fgetsign";
+ case ISD::FCANONICALIZE: return "fcanonicalize";
+ case ISD::IS_FPCLASS: return "is_fpclass";
+ case ISD::FPOW: return "fpow";
+ case ISD::STRICT_FPOW: return "strict_fpow";
+ case ISD::SMIN: return "smin";
+ case ISD::SMAX: return "smax";
+ case ISD::UMIN: return "umin";
+ case ISD::UMAX: return "umax";
+
+ case ISD::FLDEXP: return "fldexp";
+ case ISD::STRICT_FLDEXP: return "strict_fldexp";
+ case ISD::FFREXP: return "ffrexp";
+ case ISD::FPOWI: return "fpowi";
+ case ISD::STRICT_FPOWI: return "strict_fpowi";
+ case ISD::SETCC: return "setcc";
+ case ISD::SETCCCARRY: return "setcccarry";
+ case ISD::STRICT_FSETCC: return "strict_fsetcc";
+ case ISD::STRICT_FSETCCS: return "strict_fsetccs";
+ case ISD::SELECT: return "select";
+ case ISD::VSELECT: return "vselect";
+ case ISD::SELECT_CC: return "select_cc";
+ case ISD::INSERT_VECTOR_ELT: return "insert_vector_elt";
+ case ISD::EXTRACT_VECTOR_ELT: return "extract_vector_elt";
+ case ISD::CONCAT_VECTORS: return "concat_vectors";
+ case ISD::INSERT_SUBVECTOR: return "insert_subvector";
+ case ISD::EXTRACT_SUBVECTOR: return "extract_subvector";
+ case ISD::VECTOR_DEINTERLEAVE: return "vector_deinterleave";
+ case ISD::VECTOR_INTERLEAVE: return "vector_interleave";
+ case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector";
+ case ISD::VECTOR_SHUFFLE: return "vector_shuffle";
+ case ISD::VECTOR_SPLICE: return "vector_splice";
+ case ISD::SPLAT_VECTOR: return "splat_vector";
+ case ISD::SPLAT_VECTOR_PARTS: return "splat_vector_parts";
+ case ISD::VECTOR_REVERSE: return "vector_reverse";
+ case ISD::STEP_VECTOR: return "step_vector";
+ case ISD::CARRY_FALSE: return "carry_false";
+ case ISD::ADDC: return "addc";
+ case ISD::ADDE: return "adde";
+ case ISD::UADDO_CARRY: return "uaddo_carry";
+ case ISD::SADDO_CARRY: return "saddo_carry";
+ case ISD::SADDO: return "saddo";
+ case ISD::UADDO: return "uaddo";
+ case ISD::SSUBO: return "ssubo";
+ case ISD::USUBO: return "usubo";
+ case ISD::SMULO: return "smulo";
+ case ISD::UMULO: return "umulo";
+ case ISD::SUBC: return "subc";
+ case ISD::SUBE: return "sube";
+ case ISD::USUBO_CARRY: return "usubo_carry";
+ case ISD::SSUBO_CARRY: return "ssubo_carry";
+ case ISD::SHL_PARTS: return "shl_parts";
+ case ISD::SRA_PARTS: return "sra_parts";
+ case ISD::SRL_PARTS: return "srl_parts";
+
+ case ISD::SADDSAT: return "saddsat";
+ case ISD::UADDSAT: return "uaddsat";
+ case ISD::SSUBSAT: return "ssubsat";
+ case ISD::USUBSAT: return "usubsat";
+ case ISD::SSHLSAT: return "sshlsat";
+ case ISD::USHLSAT: return "ushlsat";
+
+ case ISD::SMULFIX: return "smulfix";
+ case ISD::SMULFIXSAT: return "smulfixsat";
+ case ISD::UMULFIX: return "umulfix";
+ case ISD::UMULFIXSAT: return "umulfixsat";
+
+ case ISD::SDIVFIX: return "sdivfix";
+ case ISD::SDIVFIXSAT: return "sdivfixsat";
+ case ISD::UDIVFIX: return "udivfix";
+ case ISD::UDIVFIXSAT: return "udivfixsat";
+
+ // Conversion operators.
+ case ISD::SIGN_EXTEND: return "sign_extend";
+ case ISD::ZERO_EXTEND: return "zero_extend";
+ case ISD::ANY_EXTEND: return "any_extend";
+ case ISD::SIGN_EXTEND_INREG: return "sign_extend_inreg";
+ case ISD::ANY_EXTEND_VECTOR_INREG: return "any_extend_vector_inreg";
+ case ISD::SIGN_EXTEND_VECTOR_INREG: return "sign_extend_vector_inreg";
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return "zero_extend_vector_inreg";
+ case ISD::TRUNCATE: return "truncate";
+ case ISD::FP_ROUND: return "fp_round";
+ case ISD::STRICT_FP_ROUND: return "strict_fp_round";
+ case ISD::FP_EXTEND: return "fp_extend";
+ case ISD::STRICT_FP_EXTEND: return "strict_fp_extend";
+
+ case ISD::SINT_TO_FP: return "sint_to_fp";
+ case ISD::STRICT_SINT_TO_FP: return "strict_sint_to_fp";
+ case ISD::UINT_TO_FP: return "uint_to_fp";
+ case ISD::STRICT_UINT_TO_FP: return "strict_uint_to_fp";
+ case ISD::FP_TO_SINT: return "fp_to_sint";
+ case ISD::STRICT_FP_TO_SINT: return "strict_fp_to_sint";
+ case ISD::FP_TO_UINT: return "fp_to_uint";
+ case ISD::STRICT_FP_TO_UINT: return "strict_fp_to_uint";
+ case ISD::FP_TO_SINT_SAT: return "fp_to_sint_sat";
+ case ISD::FP_TO_UINT_SAT: return "fp_to_uint_sat";
+ case ISD::BITCAST: return "bitcast";
+ case ISD::ADDRSPACECAST: return "addrspacecast";
+ case ISD::FP16_TO_FP: return "fp16_to_fp";
+ case ISD::STRICT_FP16_TO_FP: return "strict_fp16_to_fp";
+ case ISD::FP_TO_FP16: return "fp_to_fp16";
+ case ISD::STRICT_FP_TO_FP16: return "strict_fp_to_fp16";
+ case ISD::BF16_TO_FP: return "bf16_to_fp";
+ case ISD::FP_TO_BF16: return "fp_to_bf16";
+ case ISD::LROUND: return "lround";
+ case ISD::STRICT_LROUND: return "strict_lround";
+ case ISD::LLROUND: return "llround";
+ case ISD::STRICT_LLROUND: return "strict_llround";
+ case ISD::LRINT: return "lrint";
+ case ISD::STRICT_LRINT: return "strict_lrint";
+ case ISD::LLRINT: return "llrint";
+ case ISD::STRICT_LLRINT: return "strict_llrint";
+
+ // Control flow instructions
+ case ISD::BR: return "br";
+ case ISD::BRIND: return "brind";
+ case ISD::BR_JT: return "br_jt";
+ case ISD::BRCOND: return "brcond";
+ case ISD::BR_CC: return "br_cc";
+ case ISD::CALLSEQ_START: return "callseq_start";
+ case ISD::CALLSEQ_END: return "callseq_end";
+
+ // EH instructions
+ case ISD::CATCHRET: return "catchret";
+ case ISD::CLEANUPRET: return "cleanupret";
+
+ // Other operators
+ case ISD::LOAD: return "load";
+ case ISD::STORE: return "store";
+ case ISD::MLOAD: return "masked_load";
+ case ISD::MSTORE: return "masked_store";
+ case ISD::MGATHER: return "masked_gather";
+ case ISD::MSCATTER: return "masked_scatter";
+ case ISD::VAARG: return "vaarg";
+ case ISD::VACOPY: return "vacopy";
+ case ISD::VAEND: return "vaend";
+ case ISD::VASTART: return "vastart";
+ case ISD::DYNAMIC_STACKALLOC: return "dynamic_stackalloc";
+ case ISD::EXTRACT_ELEMENT: return "extract_element";
+ case ISD::BUILD_PAIR: return "build_pair";
+ case ISD::STACKSAVE: return "stacksave";
+ case ISD::STACKRESTORE: return "stackrestore";
+ case ISD::TRAP: return "trap";
+ case ISD::DEBUGTRAP: return "debugtrap";
+ case ISD::UBSANTRAP: return "ubsantrap";
+ case ISD::LIFETIME_START: return "lifetime.start";
+ case ISD::LIFETIME_END: return "lifetime.end";
+ case ISD::PSEUDO_PROBE:
+ return "pseudoprobe";
+ case ISD::GC_TRANSITION_START: return "gc_transition.start";
+ case ISD::GC_TRANSITION_END: return "gc_transition.end";
+ case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset";
+ case ISD::FREEZE: return "freeze";
+ case ISD::PREALLOCATED_SETUP:
+ return "call_setup";
+ case ISD::PREALLOCATED_ARG:
+ return "call_alloc";
+
+ // Floating point environment manipulation
+ case ISD::GET_ROUNDING: return "get_rounding";
+ case ISD::SET_ROUNDING: return "set_rounding";
+ case ISD::GET_FPENV: return "get_fpenv";
+ case ISD::SET_FPENV: return "set_fpenv";
+ case ISD::RESET_FPENV: return "reset_fpenv";
+ case ISD::GET_FPENV_MEM: return "get_fpenv_mem";
+ case ISD::SET_FPENV_MEM: return "set_fpenv_mem";
+
+ // Bit manipulation
+ case ISD::ABS: return "abs";
+ case ISD::BITREVERSE: return "bitreverse";
+ case ISD::BSWAP: return "bswap";
+ case ISD::CTPOP: return "ctpop";
+ case ISD::CTTZ: return "cttz";
+ case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef";
+ case ISD::CTLZ: return "ctlz";
+ case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef";
+ case ISD::PARITY: return "parity";
+
+ // Trampolines
+ case ISD::INIT_TRAMPOLINE: return "init_trampoline";
+ case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline";
+
+ case ISD::CONDCODE:
+ switch (cast<CondCodeSDNode>(this)->get()) {
+ default: llvm_unreachable("Unknown setcc condition!");
+ case ISD::SETOEQ: return "setoeq";
+ case ISD::SETOGT: return "setogt";
+ case ISD::SETOGE: return "setoge";
+ case ISD::SETOLT: return "setolt";
+ case ISD::SETOLE: return "setole";
+ case ISD::SETONE: return "setone";
+
+ case ISD::SETO: return "seto";
+ case ISD::SETUO: return "setuo";
+ case ISD::SETUEQ: return "setueq";
+ case ISD::SETUGT: return "setugt";
+ case ISD::SETUGE: return "setuge";
+ case ISD::SETULT: return "setult";
+ case ISD::SETULE: return "setule";
+ case ISD::SETUNE: return "setune";
+
+ case ISD::SETEQ: return "seteq";
+ case ISD::SETGT: return "setgt";
+ case ISD::SETGE: return "setge";
+ case ISD::SETLT: return "setlt";
+ case ISD::SETLE: return "setle";
+ case ISD::SETNE: return "setne";
+
+ case ISD::SETTRUE: return "settrue";
+ case ISD::SETTRUE2: return "settrue2";
+ case ISD::SETFALSE: return "setfalse";
+ case ISD::SETFALSE2: return "setfalse2";
+ }
+ case ISD::VECREDUCE_FADD: return "vecreduce_fadd";
+ case ISD::VECREDUCE_SEQ_FADD: return "vecreduce_seq_fadd";
+ case ISD::VECREDUCE_FMUL: return "vecreduce_fmul";
+ case ISD::VECREDUCE_SEQ_FMUL: return "vecreduce_seq_fmul";
+ case ISD::VECREDUCE_ADD: return "vecreduce_add";
+ case ISD::VECREDUCE_MUL: return "vecreduce_mul";
+ case ISD::VECREDUCE_AND: return "vecreduce_and";
+ case ISD::VECREDUCE_OR: return "vecreduce_or";
+ case ISD::VECREDUCE_XOR: return "vecreduce_xor";
+ case ISD::VECREDUCE_SMAX: return "vecreduce_smax";
+ case ISD::VECREDUCE_SMIN: return "vecreduce_smin";
+ case ISD::VECREDUCE_UMAX: return "vecreduce_umax";
+ case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
+ case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
+ case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
+ case ISD::VECREDUCE_FMAXIMUM: return "vecreduce_fmaximum";
+ case ISD::VECREDUCE_FMINIMUM: return "vecreduce_fminimum";
+ case ISD::STACKMAP:
+ return "stackmap";
+ case ISD::PATCHPOINT:
+ return "patchpoint";
+
+ // Vector Predication
+#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
+ case ISD::SDID: \
+ return #NAME;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) {
+ switch (AM) {
+ default: return "";
+ case ISD::PRE_INC: return "<pre-inc>";
+ case ISD::PRE_DEC: return "<pre-dec>";
+ case ISD::POST_INC: return "<post-inc>";
+ case ISD::POST_DEC: return "<post-dec>";
+ }
+}
+
+static Printable PrintNodeId(const SDNode &Node) {
+ return Printable([&Node](raw_ostream &OS) {
+#ifndef NDEBUG
+ OS << 't' << Node.PersistentId;
+#else
+ OS << (const void*)&Node;
+#endif
+ });
+}
+
+// Print the MMO with more information from the SelectionDAG.
+static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO,
+ const MachineFunction *MF, const Module *M,
+ const MachineFrameInfo *MFI,
+ const TargetInstrInfo *TII, LLVMContext &Ctx) {
+ ModuleSlotTracker MST(M);
+ if (MF)
+ MST.incorporateFunction(MF->getFunction());
+ SmallVector<StringRef, 0> SSNs;
+ MMO.print(OS, MST, SSNs, Ctx, MFI, TII);
+}
+
+static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO,
+ const SelectionDAG *G) {
+ if (G) {
+ const MachineFunction *MF = &G->getMachineFunction();
+ return printMemOperand(OS, MMO, MF, MF->getFunction().getParent(),
+ &MF->getFrameInfo(),
+ G->getSubtarget().getInstrInfo(), *G->getContext());
+ }
+
+ LLVMContext Ctx;
+ return printMemOperand(OS, MMO, /*MF=*/nullptr, /*M=*/nullptr,
+ /*MFI=*/nullptr, /*TII=*/nullptr, Ctx);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); }
+
+LLVM_DUMP_METHOD void SDNode::dump(const SelectionDAG *G) const {
+ print(dbgs(), G);
+ dbgs() << '\n';
+}
+#endif
+
+void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
+ for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
+ if (i) OS << ",";
+ if (getValueType(i) == MVT::Other)
+ OS << "ch";
+ else
+ OS << getValueType(i).getEVTString();
+ }
+}
+
+void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
+ if (getFlags().hasNoUnsignedWrap())
+ OS << " nuw";
+
+ if (getFlags().hasNoSignedWrap())
+ OS << " nsw";
+
+ if (getFlags().hasExact())
+ OS << " exact";
+
+ if (getFlags().hasNoNaNs())
+ OS << " nnan";
+
+ if (getFlags().hasNoInfs())
+ OS << " ninf";
+
+ if (getFlags().hasNoSignedZeros())
+ OS << " nsz";
+
+ if (getFlags().hasAllowReciprocal())
+ OS << " arcp";
+
+ if (getFlags().hasAllowContract())
+ OS << " contract";
+
+ if (getFlags().hasApproximateFuncs())
+ OS << " afn";
+
+ if (getFlags().hasAllowReassociation())
+ OS << " reassoc";
+
+ if (getFlags().hasNoFPExcept())
+ OS << " nofpexcept";
+
+ if (const MachineSDNode *MN = dyn_cast<MachineSDNode>(this)) {
+ if (!MN->memoperands_empty()) {
+ OS << "<";
+ OS << "Mem:";
+ for (MachineSDNode::mmo_iterator i = MN->memoperands_begin(),
+ e = MN->memoperands_end(); i != e; ++i) {
+ printMemOperand(OS, **i, G);
+ if (std::next(i) != e)
+ OS << " ";
+ }
+ OS << ">";
+ }
+ } else if (const ShuffleVectorSDNode *SVN =
+ dyn_cast<ShuffleVectorSDNode>(this)) {
+ OS << "<";
+ for (unsigned i = 0, e = ValueList[0].getVectorNumElements(); i != e; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (i) OS << ",";
+ if (Idx < 0)
+ OS << "u";
+ else
+ OS << Idx;
+ }
+ OS << ">";
+ } else if (const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(this)) {
+ OS << '<' << CSDN->getAPIntValue() << '>';
+ } else if (const ConstantFPSDNode *CSDN = dyn_cast<ConstantFPSDNode>(this)) {
+ if (&CSDN->getValueAPF().getSemantics() == &APFloat::IEEEsingle())
+ OS << '<' << CSDN->getValueAPF().convertToFloat() << '>';
+ else if (&CSDN->getValueAPF().getSemantics() == &APFloat::IEEEdouble())
+ OS << '<' << CSDN->getValueAPF().convertToDouble() << '>';
+ else {
+ OS << "<APFloat(";
+ CSDN->getValueAPF().bitcastToAPInt().print(OS, false);
+ OS << ")>";
+ }
+ } else if (const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(this)) {
+ int64_t offset = GADN->getOffset();
+ OS << '<';
+ GADN->getGlobal()->printAsOperand(OS);
+ OS << '>';
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
+ if (unsigned int TF = GADN->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const FrameIndexSDNode *FIDN = dyn_cast<FrameIndexSDNode>(this)) {
+ OS << "<" << FIDN->getIndex() << ">";
+ } else if (const JumpTableSDNode *JTDN = dyn_cast<JumpTableSDNode>(this)) {
+ OS << "<" << JTDN->getIndex() << ">";
+ if (unsigned int TF = JTDN->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(this)){
+ int offset = CP->getOffset();
+ if (CP->isMachineConstantPoolEntry())
+ OS << "<" << *CP->getMachineCPVal() << ">";
+ else
+ OS << "<" << *CP->getConstVal() << ">";
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
+ if (unsigned int TF = CP->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(this)) {
+ OS << "<" << TI->getIndex() << '+' << TI->getOffset() << ">";
+ if (unsigned TF = TI->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) {
+ OS << "<";
+ const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock();
+ if (LBB)
+ OS << LBB->getName() << " ";
+ OS << (const void*)BBDN->getBasicBlock() << ">";
+ } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) {
+ OS << ' ' << printReg(R->getReg(),
+ G ? G->getSubtarget().getRegisterInfo() : nullptr);
+ } else if (const ExternalSymbolSDNode *ES =
+ dyn_cast<ExternalSymbolSDNode>(this)) {
+ OS << "'" << ES->getSymbol() << "'";
+ if (unsigned int TF = ES->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const SrcValueSDNode *M = dyn_cast<SrcValueSDNode>(this)) {
+ if (M->getValue())
+ OS << "<" << M->getValue() << ">";
+ else
+ OS << "<null>";
+ } else if (const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(this)) {
+ if (MD->getMD())
+ OS << "<" << MD->getMD() << ">";
+ else
+ OS << "<null>";
+ } else if (const VTSDNode *N = dyn_cast<VTSDNode>(this)) {
+ OS << ":" << N->getVT();
+ } else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) {
+ OS << "<";
+
+ printMemOperand(OS, *LD->getMemOperand(), G);
+
+ bool doExt = true;
+ switch (LD->getExtensionType()) {
+ default: doExt = false; break;
+ case ISD::EXTLOAD: OS << ", anyext"; break;
+ case ISD::SEXTLOAD: OS << ", sext"; break;
+ case ISD::ZEXTLOAD: OS << ", zext"; break;
+ }
+ if (doExt)
+ OS << " from " << LD->getMemoryVT();
+
+ const char *AM = getIndexedModeName(LD->getAddressingMode());
+ if (*AM)
+ OS << ", " << AM;
+
+ OS << ">";
+ } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(this)) {
+ OS << "<";
+ printMemOperand(OS, *ST->getMemOperand(), G);
+
+ if (ST->isTruncatingStore())
+ OS << ", trunc to " << ST->getMemoryVT();
+
+ const char *AM = getIndexedModeName(ST->getAddressingMode());
+ if (*AM)
+ OS << ", " << AM;
+
+ OS << ">";
+ } else if (const MaskedLoadSDNode *MLd = dyn_cast<MaskedLoadSDNode>(this)) {
+ OS << "<";
+
+ printMemOperand(OS, *MLd->getMemOperand(), G);
+
+ bool doExt = true;
+ switch (MLd->getExtensionType()) {
+ default: doExt = false; break;
+ case ISD::EXTLOAD: OS << ", anyext"; break;
+ case ISD::SEXTLOAD: OS << ", sext"; break;
+ case ISD::ZEXTLOAD: OS << ", zext"; break;
+ }
+ if (doExt)
+ OS << " from " << MLd->getMemoryVT();
+
+ const char *AM = getIndexedModeName(MLd->getAddressingMode());
+ if (*AM)
+ OS << ", " << AM;
+
+ if (MLd->isExpandingLoad())
+ OS << ", expanding";
+
+ OS << ">";
+ } else if (const MaskedStoreSDNode *MSt = dyn_cast<MaskedStoreSDNode>(this)) {
+ OS << "<";
+ printMemOperand(OS, *MSt->getMemOperand(), G);
+
+ if (MSt->isTruncatingStore())
+ OS << ", trunc to " << MSt->getMemoryVT();
+
+ const char *AM = getIndexedModeName(MSt->getAddressingMode());
+ if (*AM)
+ OS << ", " << AM;
+
+ if (MSt->isCompressingStore())
+ OS << ", compressing";
+
+ OS << ">";
+ } else if (const auto *MGather = dyn_cast<MaskedGatherSDNode>(this)) {
+ OS << "<";
+ printMemOperand(OS, *MGather->getMemOperand(), G);
+
+ bool doExt = true;
+ switch (MGather->getExtensionType()) {
+ default: doExt = false; break;
+ case ISD::EXTLOAD: OS << ", anyext"; break;
+ case ISD::SEXTLOAD: OS << ", sext"; break;
+ case ISD::ZEXTLOAD: OS << ", zext"; break;
+ }
+ if (doExt)
+ OS << " from " << MGather->getMemoryVT();
+
+ auto Signed = MGather->isIndexSigned() ? "signed" : "unsigned";
+ auto Scaled = MGather->isIndexScaled() ? "scaled" : "unscaled";
+ OS << ", " << Signed << " " << Scaled << " offset";
+
+ OS << ">";
+ } else if (const auto *MScatter = dyn_cast<MaskedScatterSDNode>(this)) {
+ OS << "<";
+ printMemOperand(OS, *MScatter->getMemOperand(), G);
+
+ if (MScatter->isTruncatingStore())
+ OS << ", trunc to " << MScatter->getMemoryVT();
+
+ auto Signed = MScatter->isIndexSigned() ? "signed" : "unsigned";
+ auto Scaled = MScatter->isIndexScaled() ? "scaled" : "unscaled";
+ OS << ", " << Signed << " " << Scaled << " offset";
+
+ OS << ">";
+ } else if (const MemSDNode *M = dyn_cast<MemSDNode>(this)) {
+ OS << "<";
+ printMemOperand(OS, *M->getMemOperand(), G);
+ OS << ">";
+ } else if (const BlockAddressSDNode *BA =
+ dyn_cast<BlockAddressSDNode>(this)) {
+ int64_t offset = BA->getOffset();
+ OS << "<";
+ BA->getBlockAddress()->getFunction()->printAsOperand(OS, false);
+ OS << ", ";
+ BA->getBlockAddress()->getBasicBlock()->printAsOperand(OS, false);
+ OS << ">";
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
+ if (unsigned int TF = BA->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const AddrSpaceCastSDNode *ASC =
+ dyn_cast<AddrSpaceCastSDNode>(this)) {
+ OS << '['
+ << ASC->getSrcAddressSpace()
+ << " -> "
+ << ASC->getDestAddressSpace()
+ << ']';
+ } else if (const LifetimeSDNode *LN = dyn_cast<LifetimeSDNode>(this)) {
+ if (LN->hasOffset())
+ OS << "<" << LN->getOffset() << " to " << LN->getOffset() + LN->getSize() << ">";
+ } else if (const auto *AA = dyn_cast<AssertAlignSDNode>(this)) {
+ OS << '<' << AA->getAlign().value() << '>';
+ }
+
+ if (VerboseDAGDumping) {
+ if (unsigned Order = getIROrder())
+ OS << " [ORD=" << Order << ']';
+
+ if (getNodeId() != -1)
+ OS << " [ID=" << getNodeId() << ']';
+ if (!(isa<ConstantSDNode>(this) || (isa<ConstantFPSDNode>(this))))
+ OS << " # D:" << isDivergent();
+
+ if (G && !G->GetDbgValues(this).empty()) {
+ OS << " [NoOfDbgValues=" << G->GetDbgValues(this).size() << ']';
+ for (SDDbgValue *Dbg : G->GetDbgValues(this))
+ if (!Dbg->isInvalidated())
+ Dbg->print(OS);
+ } else if (getHasDebugValue())
+ OS << " [NoOfDbgValues>0]";
+
+ if (const auto *MD = G ? G->getPCSections(this) : nullptr) {
+ OS << " [pcsections ";
+ MD->printAsOperand(OS, G->getMachineFunction().getFunction().getParent());
+ OS << ']';
+ }
+ }
+}
+
+LLVM_DUMP_METHOD void SDDbgValue::print(raw_ostream &OS) const {
+ OS << " DbgVal(Order=" << getOrder() << ')';
+ if (isInvalidated())
+ OS << "(Invalidated)";
+ if (isEmitted())
+ OS << "(Emitted)";
+ OS << "(";
+ bool Comma = false;
+ for (const SDDbgOperand &Op : getLocationOps()) {
+ if (Comma)
+ OS << ", ";
+ switch (Op.getKind()) {
+ case SDDbgOperand::SDNODE:
+ if (Op.getSDNode())
+ OS << "SDNODE=" << PrintNodeId(*Op.getSDNode()) << ':' << Op.getResNo();
+ else
+ OS << "SDNODE";
+ break;
+ case SDDbgOperand::CONST:
+ OS << "CONST";
+ break;
+ case SDDbgOperand::FRAMEIX:
+ OS << "FRAMEIX=" << Op.getFrameIx();
+ break;
+ case SDDbgOperand::VREG:
+ OS << "VREG=" << Op.getVReg();
+ break;
+ }
+ Comma = true;
+ }
+ OS << ")";
+ if (isIndirect()) OS << "(Indirect)";
+ if (isVariadic())
+ OS << "(Variadic)";
+ OS << ":\"" << Var->getName() << '"';
+#ifndef NDEBUG
+ if (Expr->getNumElements())
+ Expr->dump();
+#endif
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SDDbgValue::dump() const {
+ if (isInvalidated())
+ return;
+ print(dbgs());
+ dbgs() << "\n";
+}
+#endif
+
+/// Return true if this node is so simple that we should just print it inline
+/// if it appears as an operand.
+static bool shouldPrintInline(const SDNode &Node, const SelectionDAG *G) {
+ // Avoid a lot of clutter when inline-printing nodes with associated
+ // DbgValues in verbose mode.
+ if (VerboseDAGDumping && G && !G->GetDbgValues(&Node).empty())
+ return false;
+ if (Node.getOpcode() == ISD::EntryToken)
+ return false;
+ return Node.getNumOperands() == 0;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
+ for (const SDValue &Op : N->op_values()) {
+ if (shouldPrintInline(*Op.getNode(), G))
+ continue;
+ if (Op.getNode()->hasOneUse())
+ DumpNodes(Op.getNode(), indent+2, G);
+ }
+
+ dbgs().indent(indent);
+ N->dump(G);
+}
+
+LLVM_DUMP_METHOD void SelectionDAG::dump() const {
+ dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:\n";
+
+ for (const SDNode &N : allnodes()) {
+ if (!N.hasOneUse() && &N != getRoot().getNode() &&
+ (!shouldPrintInline(N, this) || N.use_empty()))
+ DumpNodes(&N, 2, this);
+ }
+
+ if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);
+ dbgs() << "\n";
+
+ if (VerboseDAGDumping) {
+ if (DbgBegin() != DbgEnd())
+ dbgs() << "SDDbgValues:\n";
+ for (auto *Dbg : make_range(DbgBegin(), DbgEnd()))
+ Dbg->dump();
+ if (ByvalParmDbgBegin() != ByvalParmDbgEnd())
+ dbgs() << "Byval SDDbgValues:\n";
+ for (auto *Dbg : make_range(ByvalParmDbgBegin(), ByvalParmDbgEnd()))
+ Dbg->dump();
+ }
+ dbgs() << "\n";
+}
+#endif
+
+void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
+ OS << PrintNodeId(*this) << ": ";
+ print_types(OS, G);
+ OS << " = " << getOperationName(G);
+ print_details(OS, G);
+}
+
+static bool printOperand(raw_ostream &OS, const SelectionDAG *G,
+ const SDValue Value) {
+ if (!Value.getNode()) {
+ OS << "<null>";
+ return false;
+ }
+
+ if (shouldPrintInline(*Value.getNode(), G)) {
+ OS << Value->getOperationName(G) << ':';
+ Value->print_types(OS, G);
+ Value->print_details(OS, G);
+ return true;
+ }
+
+ OS << PrintNodeId(*Value.getNode());
+ if (unsigned RN = Value.getResNo())
+ OS << ':' << RN;
+ return false;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+using VisitedSDNodeSet = SmallPtrSet<const SDNode *, 32>;
+
+static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
+ const SelectionDAG *G, VisitedSDNodeSet &once) {
+ if (!once.insert(N).second) // If we've been here before, return now.
+ return;
+
+ // Dump the current SDNode, but don't end the line yet.
+ OS.indent(indent);
+ N->printr(OS, G);
+
+ // Having printed this SDNode, walk the children:
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (i) OS << ",";
+ OS << " ";
+
+ const SDValue Op = N->getOperand(i);
+ bool printedInline = printOperand(OS, G, Op);
+ if (printedInline)
+ once.insert(Op.getNode());
+ }
+
+ OS << "\n";
+
+ // Dump children that have grandchildren on their own line(s).
+ for (const SDValue &Op : N->op_values())
+ DumpNodesr(OS, Op.getNode(), indent+2, G, once);
+}
+
+LLVM_DUMP_METHOD void SDNode::dumpr() const {
+ VisitedSDNodeSet once;
+ DumpNodesr(dbgs(), this, 0, nullptr, once);
+}
+
+LLVM_DUMP_METHOD void SDNode::dumpr(const SelectionDAG *G) const {
+ VisitedSDNodeSet once;
+ DumpNodesr(dbgs(), this, 0, G, once);
+}
+#endif
+
+static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,
+ const SelectionDAG *G, unsigned depth,
+ unsigned indent) {
+ if (depth == 0)
+ return;
+
+ OS.indent(indent);
+
+ N->print(OS, G);
+
+ for (const SDValue &Op : N->op_values()) {
+ // Don't follow chain operands.
+ if (Op.getValueType() == MVT::Other)
+ continue;
+ OS << '\n';
+ printrWithDepthHelper(OS, Op.getNode(), G, depth - 1, indent + 2);
+ }
+}
+
+void SDNode::printrWithDepth(raw_ostream &OS, const SelectionDAG *G,
+ unsigned depth) const {
+ printrWithDepthHelper(OS, this, G, depth, 0);
+}
+
+void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const {
+ // Don't print impossibly deep things.
+ printrWithDepth(OS, G, 10);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const {
+ printrWithDepth(dbgs(), G, depth);
+}
+
+LLVM_DUMP_METHOD void SDNode::dumprFull(const SelectionDAG *G) const {
+ // Don't print impossibly deep things.
+ dumprWithDepth(G, 10);
+}
+#endif
+
+void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
+ printr(OS, G);
+ // Under VerboseDAGDumping, divergence is always printed.
+ if (isDivergent() && !VerboseDAGDumping)
+ OS << " # D:1";
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ if (i) OS << ", "; else OS << " ";
+ printOperand(OS, G, getOperand(i));
+ }
+ if (DebugLoc DL = getDebugLoc()) {
+ OS << ", ";
+ DL.print(OS);
+ }
+}
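For orientation, the printing routines above compose into one line per node of the form "tN: <result types> = <operation name><details> <operands>", with operand-less nodes printed inline by printOperand. A hypothetical two-line excerpt (node ids and constant values are invented for illustration) would look like:

    t5: i32,ch = CopyFromReg t0, Register:i32 %0
    t7: i32 = add nsw t5, Constant:i32<1>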
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
new file mode 100644
index 000000000000..35abd990f968
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -0,0 +1,3894 @@
+//===- SelectionDAGISel.cpp - Implement the SelectionDAGISel class --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAGISel class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "ScheduleDAGSDNodes.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/CodeGen/AssignmentTrackingAnalysis.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/SwiftErrorValueTracking.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "isel"
+
+STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
+STATISTIC(NumFastIselSuccess, "Number of instructions fast isel selected");
+STATISTIC(NumFastIselBlocks, "Number of blocks selected entirely by fast isel");
+STATISTIC(NumDAGBlocks, "Number of blocks selected using DAG");
+STATISTIC(NumDAGIselRetries,"Number of times dag isel has to try another path");
+STATISTIC(NumEntryBlocks, "Number of entry blocks encountered");
+STATISTIC(NumFastIselFailLowerArguments,
+ "Number of entry blocks where fast isel failed to lower arguments");
+
+static cl::opt<int> EnableFastISelAbort(
+ "fast-isel-abort", cl::Hidden,
+ cl::desc("Enable abort calls when \"fast\" instruction selection "
+ "fails to lower an instruction: 0 disable the abort, 1 will "
+ "abort but for args, calls and terminators, 2 will also "
+ "abort for argument lowering, and 3 will never fallback "
+ "to SelectionDAG."));
+
+static cl::opt<bool> EnableFastISelFallbackReport(
+ "fast-isel-report-on-fallback", cl::Hidden,
+ cl::desc("Emit a diagnostic when \"fast\" instruction selection "
+ "falls back to SelectionDAG."));
+
+static cl::opt<bool>
+UseMBPI("use-mbpi",
+ cl::desc("use Machine Branch Probability Info"),
+ cl::init(true), cl::Hidden);
+
+#ifndef NDEBUG
+static cl::opt<std::string>
+FilterDAGBasicBlockName("filter-view-dags", cl::Hidden,
+ cl::desc("Only display the basic block whose name "
+ "matches this for all view-*-dags options"));
+static cl::opt<bool>
+ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the first "
+ "dag combine pass"));
+static cl::opt<bool>
+ViewLegalizeTypesDAGs("view-legalize-types-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before legalize types"));
+static cl::opt<bool>
+ ViewDAGCombineLT("view-dag-combine-lt-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the post "
+ "legalize types dag combine pass"));
+static cl::opt<bool>
+ ViewLegalizeDAGs("view-legalize-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before legalize"));
+static cl::opt<bool>
+ViewDAGCombine2("view-dag-combine2-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the second "
+ "dag combine pass"));
+static cl::opt<bool>
+ViewISelDAGs("view-isel-dags", cl::Hidden,
+ cl::desc("Pop up a window to show isel dags as they are selected"));
+static cl::opt<bool>
+ViewSchedDAGs("view-sched-dags", cl::Hidden,
+ cl::desc("Pop up a window to show sched dags as they are processed"));
+static cl::opt<bool>
+ViewSUnitDAGs("view-sunit-dags", cl::Hidden,
+ cl::desc("Pop up a window to show SUnit dags after they are processed"));
+#else
+static const bool ViewDAGCombine1 = false, ViewLegalizeTypesDAGs = false,
+ ViewDAGCombineLT = false, ViewLegalizeDAGs = false,
+ ViewDAGCombine2 = false, ViewISelDAGs = false,
+ ViewSchedDAGs = false, ViewSUnitDAGs = false;
+#endif
+
+//===---------------------------------------------------------------------===//
+///
+/// RegisterScheduler class - Track the registration of instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+MachinePassRegistry<RegisterScheduler::FunctionPassCtor>
+ RegisterScheduler::Registry;
+
+//===---------------------------------------------------------------------===//
+///
+/// ISHeuristic command line option for instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+static cl::opt<RegisterScheduler::FunctionPassCtor, false,
+ RegisterPassParser<RegisterScheduler>>
+ISHeuristic("pre-RA-sched",
+ cl::init(&createDefaultScheduler), cl::Hidden,
+ cl::desc("Instruction schedulers available (before register"
+ " allocation):"));
+
+static RegisterScheduler
+defaultListDAGScheduler("default", "Best scheduler for the target",
+ createDefaultScheduler);
+
+namespace llvm {
+
+ //===--------------------------------------------------------------------===//
+ /// This class is used by SelectionDAGISel to temporarily override
+ /// the optimization level on a per-function basis.
+ class OptLevelChanger {
+ SelectionDAGISel &IS;
+ CodeGenOpt::Level SavedOptLevel;
+ bool SavedFastISel;
+
+ public:
+ OptLevelChanger(SelectionDAGISel &ISel,
+ CodeGenOpt::Level NewOptLevel) : IS(ISel) {
+ SavedOptLevel = IS.OptLevel;
+ SavedFastISel = IS.TM.Options.EnableFastISel;
+ if (NewOptLevel == SavedOptLevel)
+ return;
+ IS.OptLevel = NewOptLevel;
+ IS.TM.setOptLevel(NewOptLevel);
+ LLVM_DEBUG(dbgs() << "\nChanging optimization level for Function "
+ << IS.MF->getFunction().getName() << "\n");
+ LLVM_DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel << " ; After: -O"
+ << NewOptLevel << "\n");
+ if (NewOptLevel == CodeGenOpt::None) {
+ IS.TM.setFastISel(IS.TM.getO0WantsFastISel());
+ LLVM_DEBUG(
+ dbgs() << "\tFastISel is "
+ << (IS.TM.Options.EnableFastISel ? "enabled" : "disabled")
+ << "\n");
+ }
+ }
+
+ ~OptLevelChanger() {
+ if (IS.OptLevel == SavedOptLevel)
+ return;
+ LLVM_DEBUG(dbgs() << "\nRestoring optimization level for Function "
+ << IS.MF->getFunction().getName() << "\n");
+ LLVM_DEBUG(dbgs() << "\tBefore: -O" << IS.OptLevel << " ; After: -O"
+ << SavedOptLevel << "\n");
+ IS.OptLevel = SavedOptLevel;
+ IS.TM.setOptLevel(SavedOptLevel);
+ IS.TM.setFastISel(SavedFastISel);
+ }
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// createDefaultScheduler - This creates an instruction scheduler appropriate
+ /// for the target.
+ ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetLowering *TLI = IS->TLI;
+ const TargetSubtargetInfo &ST = IS->MF->getSubtarget();
+
+ // Try first to see if the Target has its own way of selecting a scheduler
+ if (auto *SchedulerCtor = ST.getDAGScheduler(OptLevel)) {
+ return SchedulerCtor(IS, OptLevel);
+ }
+
+ if (OptLevel == CodeGenOpt::None ||
+ (ST.enableMachineScheduler() && ST.enableMachineSchedDefaultSched()) ||
+ TLI->getSchedulingPreference() == Sched::Source)
+ return createSourceListDAGScheduler(IS, OptLevel);
+ if (TLI->getSchedulingPreference() == Sched::RegPressure)
+ return createBURRListDAGScheduler(IS, OptLevel);
+ if (TLI->getSchedulingPreference() == Sched::Hybrid)
+ return createHybridListDAGScheduler(IS, OptLevel);
+ if (TLI->getSchedulingPreference() == Sched::VLIW)
+ return createVLIWDAGScheduler(IS, OptLevel);
+ if (TLI->getSchedulingPreference() == Sched::Fast)
+ return createFastDAGScheduler(IS, OptLevel);
+ if (TLI->getSchedulingPreference() == Sched::Linearize)
+ return createDAGLinearizer(IS, OptLevel);
+ assert(TLI->getSchedulingPreference() == Sched::ILP &&
+ "Unknown sched type!");
+ return createILPListDAGScheduler(IS, OptLevel);
+ }
+
+} // end namespace llvm
+
+// EmitInstrWithCustomInserter - This method should be implemented by targets
+// that mark instructions with the 'usesCustomInserter' flag. These
+// instructions are special in various ways, which require special support to
+// insert. The specified MachineInstr is created but not inserted into any
+// basic blocks, and this method is called to expand it into a sequence of
+// instructions, potentially also creating new basic blocks and control flow.
+// When new basic blocks are inserted and the edges from MBB to its successors
+// are modified, the method must keep the successor lists and any PHI nodes in
+// the successors up to date.
+MachineBasicBlock *
+TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+#ifndef NDEBUG
+ dbgs() << "If a target marks an instruction with "
+ "'usesCustomInserter', it must implement "
+ "TargetLowering::EmitInstrWithCustomInserter!\n";
+#endif
+ llvm_unreachable(nullptr);
+}
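Because the default implementation above is only a diagnostic stub, a hedged sketch of what a target override of this hook commonly looks like may help. MyTargetLowering, MyTarget::SELECT_PSEUDO and emitSelectPseudo are hypothetical names used for illustration; they are not part of this patch.

    // Illustrative override for a hypothetical target: expand a pseudo that was
    // marked 'usesCustomInserter' into real instructions, possibly creating new
    // blocks and rewiring MBB's successors and PHIs.
    MachineBasicBlock *
    MyTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                  MachineBasicBlock *MBB) const {
      switch (MI.getOpcode()) {
      default:
        llvm_unreachable("unexpected instruction with custom inserter");
      case MyTarget::SELECT_PSEUDO:
        // Typically builds a small diamond of blocks, emits the real compare
        // and branch, inserts a PHI for the result, then erases the pseudo.
        return emitSelectPseudo(MI, MBB); // hypothetical helper
      }
    }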
+
+void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const {
+ assert(!MI.hasPostISelHook() &&
+ "If a target marks an instruction with 'hasPostISelHook', "
+ "it must implement TargetLowering::AdjustInstrPostInstrSelection!");
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAGISel code
+//===----------------------------------------------------------------------===//
+
+SelectionDAGISel::SelectionDAGISel(char &ID, TargetMachine &tm,
+ CodeGenOpt::Level OL)
+ : MachineFunctionPass(ID), TM(tm), FuncInfo(new FunctionLoweringInfo()),
+ SwiftError(new SwiftErrorValueTracking()),
+ CurDAG(new SelectionDAG(tm, OL)),
+ SDB(std::make_unique<SelectionDAGBuilder>(*CurDAG, *FuncInfo, *SwiftError,
+ OL)),
+ OptLevel(OL) {
+ initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
+ initializeBranchProbabilityInfoWrapperPassPass(
+ *PassRegistry::getPassRegistry());
+ initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry());
+ initializeTargetLibraryInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+SelectionDAGISel::~SelectionDAGISel() {
+ delete CurDAG;
+ delete SwiftError;
+}
+
+void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+ if (OptLevel != CodeGenOpt::None)
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<GCModuleInfo>();
+ AU.addRequired<StackProtector>();
+ AU.addPreserved<GCModuleInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ if (UseMBPI && OptLevel != CodeGenOpt::None)
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ // AssignmentTrackingAnalysis only runs if assignment tracking is enabled for
+ // the module.
+ AU.addRequired<AssignmentTrackingAnalysis>();
+ AU.addPreserved<AssignmentTrackingAnalysis>();
+ if (OptLevel != CodeGenOpt::None)
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static void computeUsesMSVCFloatingPoint(const Triple &TT, const Function &F,
+ MachineModuleInfo &MMI) {
+ // Only needed for MSVC
+ if (!TT.isWindowsMSVCEnvironment())
+ return;
+
+ // If it's already set, nothing to do.
+ if (MMI.usesMSVCFloatingPoint())
+ return;
+
+ for (const Instruction &I : instructions(F)) {
+ if (I.getType()->isFPOrFPVectorTy()) {
+ MMI.setUsesMSVCFloatingPoint(true);
+ return;
+ }
+ for (const auto &Op : I.operands()) {
+ if (Op->getType()->isFPOrFPVectorTy()) {
+ MMI.setUsesMSVCFloatingPoint(true);
+ return;
+ }
+ }
+ }
+}
+
+bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
+ // If we already selected that function, we do not need to run SDISel.
+ if (mf.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected))
+ return false;
+ // Do some sanity-checking on the command-line options.
+ assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&
+ "-fast-isel-abort > 0 requires -fast-isel");
+
+ const Function &Fn = mf.getFunction();
+ MF = &mf;
+
+ // Decide what flavour of variable location debug-info will be used, before
+ // we change the optimisation level.
+ bool InstrRef = mf.shouldUseDebugInstrRef();
+ mf.setUseDebugInstrRef(InstrRef);
+
+ // Reset the target options before resetting the optimization
+ // level below.
+ // FIXME: This is a horrible hack and should be processed via
+ // codegen looking at the optimization level explicitly when
+ // it wants to look at it.
+ TM.resetTargetOptions(Fn);
+ // Reset OptLevel to None for optnone functions.
+ CodeGenOpt::Level NewOptLevel = OptLevel;
+ if (OptLevel != CodeGenOpt::None && skipFunction(Fn))
+ NewOptLevel = CodeGenOpt::None;
+ OptLevelChanger OLC(*this, NewOptLevel);
+
+ TII = MF->getSubtarget().getInstrInfo();
+ TLI = MF->getSubtarget().getTargetLowering();
+ RegInfo = &MF->getRegInfo();
+ LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(Fn);
+ GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
+ ORE = std::make_unique<OptimizationRemarkEmitter>(&Fn);
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(mf.getFunction());
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ BlockFrequencyInfo *BFI = nullptr;
+ if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOpt::None)
+ BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
+
+ FunctionVarLocs const *FnVarLocs = nullptr;
+ if (isAssignmentTrackingEnabled(*Fn.getParent()))
+ FnVarLocs = getAnalysis<AssignmentTrackingAnalysis>().getResults();
+
+ LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
+
+ UniformityInfo *UA = nullptr;
+ if (auto *UAPass = getAnalysisIfAvailable<UniformityInfoWrapperPass>())
+ UA = &UAPass->getUniformityInfo();
+ CurDAG->init(*MF, *ORE, this, LibInfo, UA, PSI, BFI, FnVarLocs);
+ FuncInfo->set(Fn, *MF, CurDAG);
+ SwiftError->setFunction(*MF);
+
+ // Now get the optional analyses if we want to.
+ // This is based on the possibly changed OptLevel (after optnone is taken
+ // into account). That's unfortunate but OK because it just means we won't
+ // ask for passes that have been required anyway.
+
+ if (UseMBPI && OptLevel != CodeGenOpt::None)
+ FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ else
+ FuncInfo->BPI = nullptr;
+
+ if (OptLevel != CodeGenOpt::None)
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ else
+ AA = nullptr;
+
+ SDB->init(GFI, AA, AC, LibInfo);
+
+ MF->setHasInlineAsm(false);
+
+ FuncInfo->SplitCSR = false;
+
+ // We split CSR if the target supports it for the given function
+ // and the function has only return exits.
+ if (OptLevel != CodeGenOpt::None && TLI->supportSplitCSR(MF)) {
+ FuncInfo->SplitCSR = true;
+
+ // Collect all the return blocks.
+ for (const BasicBlock &BB : Fn) {
+ if (!succ_empty(&BB))
+ continue;
+
+ const Instruction *Term = BB.getTerminator();
+ if (isa<UnreachableInst>(Term) || isa<ReturnInst>(Term))
+ continue;
+
+ // Bail out if the exit block is neither a Return nor an Unreachable.
+ FuncInfo->SplitCSR = false;
+ break;
+ }
+ }
+
+ MachineBasicBlock *EntryMBB = &MF->front();
+ if (FuncInfo->SplitCSR)
+ // This performs initialization so lowering for SplitCSR will be correct.
+ TLI->initializeSplitCSR(EntryMBB);
+
+ SelectAllBasicBlocks(Fn);
+ if (FastISelFailed && EnableFastISelFallbackReport) {
+ DiagnosticInfoISelFallback DiagFallback(Fn);
+ Fn.getContext().diagnose(DiagFallback);
+ }
+
+ // Replace forward-declared registers with the registers containing
+ // the desired value.
+ // Note: it is important that this happens **before** the call to
+ // EmitLiveInCopies, since implementations can skip copies of unused
+ // registers. If we don't apply the reg fixups before, some registers may
+ // appear as unused and will be skipped, resulting in bad MI.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ for (DenseMap<Register, Register>::iterator I = FuncInfo->RegFixups.begin(),
+ E = FuncInfo->RegFixups.end();
+ I != E; ++I) {
+ Register From = I->first;
+ Register To = I->second;
+ // If To is also scheduled to be replaced, find what its ultimate
+ // replacement is.
+ while (true) {
+ DenseMap<Register, Register>::iterator J = FuncInfo->RegFixups.find(To);
+ if (J == E)
+ break;
+ To = J->second;
+ }
+ // Make sure the new register has a sufficiently constrained register class.
+ if (From.isVirtual() && To.isVirtual())
+ MRI.constrainRegClass(To, MRI.getRegClass(From));
+ // Replace it.
+
+ // Replacing one register with another won't touch the kill flags.
+ // We need to conservatively clear the kill flags as a kill on the old
+ // register might dominate existing uses of the new register.
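+ // For instance (hypothetical): if From carried a kill flag at some
+ // instruction I but To is still read after I, keeping that flag after the
+ // replacement would wrongly end the merged register's live range.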
+ if (!MRI.use_empty(To))
+ MRI.clearKillFlags(From);
+ MRI.replaceRegWith(From, To);
+ }
+
+ // If the first basic block in the function has live ins that need to be
+ // copied into vregs, emit the copies into the top of the block before
+ // emitting the code for the block.
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ RegInfo->EmitLiveInCopies(EntryMBB, TRI, *TII);
+
+ // Insert copies in the entry block and the return blocks.
+ if (FuncInfo->SplitCSR) {
+ SmallVector<MachineBasicBlock*, 4> Returns;
+ // Collect all the return blocks.
+ for (MachineBasicBlock &MBB : mf) {
+ if (!MBB.succ_empty())
+ continue;
+
+ MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
+ if (Term != MBB.end() && Term->isReturn()) {
+ Returns.push_back(&MBB);
+ continue;
+ }
+ }
+ TLI->insertCopiesSplitCSR(EntryMBB, Returns);
+ }
+
+ DenseMap<unsigned, unsigned> LiveInMap;
+ if (!FuncInfo->ArgDbgValues.empty())
+ for (std::pair<unsigned, unsigned> LI : RegInfo->liveins())
+ if (LI.second)
+ LiveInMap.insert(LI);
+
+ // Insert DBG_VALUE instructions for function arguments to the entry block.
+ for (unsigned i = 0, e = FuncInfo->ArgDbgValues.size(); i != e; ++i) {
+ MachineInstr *MI = FuncInfo->ArgDbgValues[e - i - 1];
+ assert(MI->getOpcode() != TargetOpcode::DBG_VALUE_LIST &&
+ "Function parameters should not be described by DBG_VALUE_LIST.");
+ bool hasFI = MI->getDebugOperand(0).isFI();
+ Register Reg =
+ hasFI ? TRI.getFrameRegister(*MF) : MI->getDebugOperand(0).getReg();
+ if (Reg.isPhysical())
+ EntryMBB->insert(EntryMBB->begin(), MI);
+ else {
+ MachineInstr *Def = RegInfo->getVRegDef(Reg);
+ if (Def) {
+ MachineBasicBlock::iterator InsertPos = Def;
+ // FIXME: VR def may not be in entry block.
+ Def->getParent()->insert(std::next(InsertPos), MI);
+ } else
+ LLVM_DEBUG(dbgs() << "Dropping debug info for dead vreg"
+ << Register::virtReg2Index(Reg) << "\n");
+ }
+
+ // Don't try and extend through copies in instruction referencing mode.
+ if (InstrRef)
+ continue;
+
+ // If Reg is live-in then update debug info to track its copy in a vreg.
+ DenseMap<unsigned, unsigned>::iterator LDI = LiveInMap.find(Reg);
+ if (LDI != LiveInMap.end()) {
+ assert(!hasFI && "There's no handling of frame pointer updating here yet "
+ "- add if needed");
+ MachineInstr *Def = RegInfo->getVRegDef(LDI->second);
+ MachineBasicBlock::iterator InsertPos = Def;
+ const MDNode *Variable = MI->getDebugVariable();
+ const MDNode *Expr = MI->getDebugExpression();
+ DebugLoc DL = MI->getDebugLoc();
+ bool IsIndirect = MI->isIndirectDebugValue();
+ if (IsIndirect)
+ assert(MI->getDebugOffset().getImm() == 0 &&
+ "DBG_VALUE with nonzero offset");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ assert(MI->getOpcode() != TargetOpcode::DBG_VALUE_LIST &&
+ "Didn't expect to see a DBG_VALUE_LIST here");
+ // Def is never a terminator here, so it is ok to increment InsertPos.
+ BuildMI(*EntryMBB, ++InsertPos, DL, TII->get(TargetOpcode::DBG_VALUE),
+ IsIndirect, LDI->second, Variable, Expr);
+
+ // If this vreg is directly copied into an exported register then
+ // that COPY instruction also needs a DBG_VALUE, if it is the only
+ // user of LDI->second.
+ MachineInstr *CopyUseMI = nullptr;
+ for (MachineRegisterInfo::use_instr_iterator
+ UI = RegInfo->use_instr_begin(LDI->second),
+ E = RegInfo->use_instr_end(); UI != E; ) {
+ MachineInstr *UseMI = &*(UI++);
+ if (UseMI->isDebugValue()) continue;
+ if (UseMI->isCopy() && !CopyUseMI && UseMI->getParent() == EntryMBB) {
+ CopyUseMI = UseMI; continue;
+ }
+ // Otherwise this is another use or second copy use.
+ CopyUseMI = nullptr; break;
+ }
+ if (CopyUseMI &&
+ TRI.getRegSizeInBits(LDI->second, MRI) ==
+ TRI.getRegSizeInBits(CopyUseMI->getOperand(0).getReg(), MRI)) {
+ // Use MI's debug location, which describes where Variable was
+ // declared, rather than whatever is attached to CopyUseMI.
+ MachineInstr *NewMI =
+ BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
+ CopyUseMI->getOperand(0).getReg(), Variable, Expr);
+ MachineBasicBlock::iterator Pos = CopyUseMI;
+ EntryMBB->insertAfter(Pos, NewMI);
+ }
+ }
+ }
+
+ // For debug-info, in instruction referencing mode, we need to perform some
+ // post-isel maintenance.
+ if (MF->useDebugInstrRef())
+ MF->finalizeDebugInstrRefs();
+
+ // Determine if there are any calls in this machine function.
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ for (const auto &MBB : *MF) {
+ if (MFI.hasCalls() && MF->hasInlineAsm())
+ break;
+
+ for (const auto &MI : MBB) {
+ const MCInstrDesc &MCID = TII->get(MI.getOpcode());
+ if ((MCID.isCall() && !MCID.isReturn()) ||
+ MI.isStackAligningInlineAsm()) {
+ MFI.setHasCalls(true);
+ }
+ if (MI.isInlineAsm()) {
+ MF->setHasInlineAsm(true);
+ }
+ }
+ }
+
+ // Determine if there is a call to setjmp in the machine function.
+ MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice());
+
+ // Determine if floating point is used for msvc
+ computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, MF->getMMI());
+
+ // Release function-specific state. SDB and CurDAG are already cleared
+ // at this point.
+ FuncInfo->clear();
+
+ LLVM_DEBUG(dbgs() << "*** MachineFunction at end of ISel ***\n");
+ LLVM_DEBUG(MF->print(dbgs()));
+
+ return true;
+}
+
+static void reportFastISelFailure(MachineFunction &MF,
+ OptimizationRemarkEmitter &ORE,
+ OptimizationRemarkMissed &R,
+ bool ShouldAbort) {
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || ShouldAbort)
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (ShouldAbort)
+ report_fatal_error(Twine(R.getMsg()));
+
+ ORE.emit(R);
+ LLVM_DEBUG(dbgs() << R.getMsg() << "\n");
+}
+
+void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
+ BasicBlock::const_iterator End,
+ bool &HadTailCall) {
+ // Allow creating illegal types during DAG building for the basic block.
+ CurDAG->NewNodesMustHaveLegalTypes = false;
+
+ // Lower the instructions. If a call is emitted as a tail call, cease emitting
+ // nodes for this block.
+ for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) {
+ if (!ElidedArgCopyInstrs.count(&*I))
+ SDB->visit(*I);
+ }
+
+ // Make sure the root of the DAG is up-to-date.
+ CurDAG->setRoot(SDB->getControlRoot());
+ HadTailCall = SDB->HasTailCall;
+ SDB->resolveOrClearDbgInfo();
+ SDB->clear();
+
+ // Final step, emit the lowered DAG as machine code.
+ CodeGenAndEmitDAG();
+}
+
+void SelectionDAGISel::ComputeLiveOutVRegInfo() {
+ SmallPtrSet<SDNode *, 16> Added;
+ SmallVector<SDNode*, 128> Worklist;
+
+ Worklist.push_back(CurDAG->getRoot().getNode());
+ Added.insert(CurDAG->getRoot().getNode());
+
+ KnownBits Known;
+
+ do {
+ SDNode *N = Worklist.pop_back_val();
+
+ // Add all chain operands to the worklist.
+ for (const SDValue &Op : N->op_values())
+ if (Op.getValueType() == MVT::Other && Added.insert(Op.getNode()).second)
+ Worklist.push_back(Op.getNode());
+
+ // If this is a CopyToReg with a vreg dest, process it.
+ if (N->getOpcode() != ISD::CopyToReg)
+ continue;
+
+ unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+ if (!Register::isVirtualRegister(DestReg))
+ continue;
+
+ // Ignore non-integer values.
+ SDValue Src = N->getOperand(2);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isInteger())
+ continue;
+
+ unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src);
+ Known = CurDAG->computeKnownBits(Src);
+ FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, Known);
+ } while (!Worklist.empty());
+}
+
+void SelectionDAGISel::CodeGenAndEmitDAG() {
+ StringRef GroupName = "sdag";
+ StringRef GroupDescription = "Instruction Selection and Scheduling";
+ std::string BlockName;
+ bool MatchFilterBB = false; (void)MatchFilterBB;
+#ifndef NDEBUG
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*FuncInfo->Fn);
+#endif
+
+ // Pre-type legalization allows creation of any node types.
+ CurDAG->NewNodesMustHaveLegalTypes = false;
+
+#ifndef NDEBUG
+ MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
+ FilterDAGBasicBlockName ==
+ FuncInfo->MBB->getBasicBlock()->getName());
+#endif
+#ifdef NDEBUG
+ if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewDAGCombineLT ||
+ ViewLegalizeDAGs || ViewDAGCombine2 || ViewISelDAGs || ViewSchedDAGs ||
+ ViewSUnitDAGs)
+#endif
+ {
+ BlockName =
+ (MF->getName() + ":" + FuncInfo->MBB->getBasicBlock()->getName()).str();
+ }
+ LLVM_DEBUG(dbgs() << "Initial selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+
+ if (ViewDAGCombine1 && MatchFilterBB)
+ CurDAG->viewGraph("dag-combine1 input for " + BlockName);
+
+ // Run the DAG combiner in pre-legalize mode.
+ {
+ NamedRegionTimer T("combine1", "DAG Combining 1", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel);
+ }
+
+ LLVM_DEBUG(dbgs() << "Optimized lowered selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+
+ // Second step, hack on the DAG until it only uses operations and types that
+ // the target supports.
+ if (ViewLegalizeTypesDAGs && MatchFilterBB)
+ CurDAG->viewGraph("legalize-types input for " + BlockName);
+
+ bool Changed;
+ {
+ NamedRegionTimer T("legalize_types", "Type Legalization", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ Changed = CurDAG->LegalizeTypes();
+ }
+
+ LLVM_DEBUG(dbgs() << "Type-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+
+ // Only allow creation of legal node types.
+ CurDAG->NewNodesMustHaveLegalTypes = true;
+
+ if (Changed) {
+ if (ViewDAGCombineLT && MatchFilterBB)
+ CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
+
+ // Run the DAG combiner in post-type-legalize mode.
+ {
+ NamedRegionTimer T("combine_lt", "DAG Combining after legalize types",
+ GroupName, GroupDescription, TimePassesIsEnabled);
+ CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel);
+ }
+
+ LLVM_DEBUG(dbgs() << "Optimized type-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+ }
+
+ {
+ NamedRegionTimer T("legalize_vec", "Vector Legalization", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ Changed = CurDAG->LegalizeVectors();
+ }
+
+ if (Changed) {
+ LLVM_DEBUG(dbgs() << "Vector-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+
+ {
+ NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ CurDAG->LegalizeTypes();
+ }
+
+ LLVM_DEBUG(dbgs() << "Vector/type-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+
+ if (ViewDAGCombineLT && MatchFilterBB)
+ CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
+
+ // Run the DAG combiner in post-type-legalize mode.
+ {
+ NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors",
+ GroupName, GroupDescription, TimePassesIsEnabled);
+ CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel);
+ }
+
+ LLVM_DEBUG(dbgs() << "Optimized vector-legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+ }
+
+ if (ViewLegalizeDAGs && MatchFilterBB)
+ CurDAG->viewGraph("legalize input for " + BlockName);
+
+ {
+ NamedRegionTimer T("legalize", "DAG Legalization", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ CurDAG->Legalize();
+ }
+
+ LLVM_DEBUG(dbgs() << "Legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+
+ if (ViewDAGCombine2 && MatchFilterBB)
+ CurDAG->viewGraph("dag-combine2 input for " + BlockName);
+
+ // Run the DAG combiner in post-legalize mode.
+ {
+ NamedRegionTimer T("combine2", "DAG Combining 2", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel);
+ }
+
+ LLVM_DEBUG(dbgs() << "Optimized legalized selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+#ifndef NDEBUG
+ if (TTI.hasBranchDivergence())
+ CurDAG->VerifyDAGDivergence();
+#endif
+
+ if (OptLevel != CodeGenOpt::None)
+ ComputeLiveOutVRegInfo();
+
+ if (ViewISelDAGs && MatchFilterBB)
+ CurDAG->viewGraph("isel input for " + BlockName);
+
+ // Third, instruction select all of the operations to machine code, adding the
+ // code to the MachineBasicBlock.
+ {
+ NamedRegionTimer T("isel", "Instruction Selection", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ DoInstructionSelection();
+ }
+
+ LLVM_DEBUG(dbgs() << "Selected selection DAG: "
+ << printMBBReference(*FuncInfo->MBB) << " '" << BlockName
+ << "'\n";
+ CurDAG->dump());
+
+ if (ViewSchedDAGs && MatchFilterBB)
+ CurDAG->viewGraph("scheduler input for " + BlockName);
+
+ // Schedule machine code.
+ ScheduleDAGSDNodes *Scheduler = CreateScheduler();
+ {
+ NamedRegionTimer T("sched", "Instruction Scheduling", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ Scheduler->Run(CurDAG, FuncInfo->MBB);
+ }
+
+ if (ViewSUnitDAGs && MatchFilterBB)
+ Scheduler->viewGraph();
+
+ // Emit machine code to BB. This can change 'BB' to the last block being
+ // inserted into.
+ MachineBasicBlock *FirstMBB = FuncInfo->MBB, *LastMBB;
+ {
+ NamedRegionTimer T("emit", "Instruction Creation", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+
+ // FuncInfo->InsertPt is passed by reference and set to the end of the
+ // scheduled instructions.
+ LastMBB = FuncInfo->MBB = Scheduler->EmitSchedule(FuncInfo->InsertPt);
+ }
+
+ // If the block was split, make sure we update any references that are used to
+ // update PHI nodes later on.
+ if (FirstMBB != LastMBB)
+ SDB->UpdateSplitBlock(FirstMBB, LastMBB);
+
+ // Free the scheduler state.
+ {
+ NamedRegionTimer T("cleanup", "Instruction Scheduling Cleanup", GroupName,
+ GroupDescription, TimePassesIsEnabled);
+ delete Scheduler;
+ }
+
+ // Free the SelectionDAG state, now that we're finished with it.
+ CurDAG->clear();
+}
+
+namespace {
+
+/// ISelUpdater - helper class to handle updates of the instruction selection
+/// graph.
+class ISelUpdater : public SelectionDAG::DAGUpdateListener {
+ SelectionDAG::allnodes_iterator &ISelPosition;
+
+public:
+ ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)
+ : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {}
+
+ /// NodeDeleted - Handle nodes deleted from the graph. If the node being
+ /// deleted is the current ISelPosition node, update ISelPosition.
+ ///
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ if (ISelPosition == SelectionDAG::allnodes_iterator(N))
+ ++ISelPosition;
+ }
+
+ /// NodeInserted - Handle new nodes inserted into the graph: propagate
+ /// metadata from root nodes that also applies to new nodes, in case the root
+ /// is later deleted.
+ void NodeInserted(SDNode *N) override {
+ SDNode *CurNode = &*ISelPosition;
+ if (MDNode *MD = DAG.getPCSections(CurNode))
+ DAG.addPCSections(N, MD);
+ }
+};
+
+} // end anonymous namespace
+
+// This function is used to enforce the topological node id property
+// leveraged during instruction selection. Before the selection process all
+// nodes are given a non-negative id such that all nodes have a greater id than
+// their operands. As this holds transitively we can prune checks that a node N
+// is a predecessor of M another by not recursively checking through M's
+// operands if N's ID is larger than M's ID. This significantly improves
+// performance of various legality checks (e.g. IsLegalToFold / UpdateChains).
+
+// However, when we fuse multiple nodes into a single node during the
+// selection we may induce a predecessor relationship between inputs and
+// outputs of distinct nodes being merged, violating the topological property.
+// Should a fused node have a successor which has yet to be selected,
+// our legality checks would be incorrect. To avoid this we mark all unselected
+// successor nodes, i.e. id != -1, as invalid for pruning by bit-negating (x =>
+// (-(x+1))) the ids and modify our pruning check to ignore negative Ids of M.
+// We use bit-negation to more clearly enforce that node id -1 can only be
+// achieved by selected nodes. As the conversion is reversible to the original
+// Id, topological pruning can still be leveraged when looking for unselected
+// nodes. This method is called internally in all ISel replacement related
+// functions.
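+// As a small worked example of the encoding: a node with id 7 that becomes a
+// successor of a fused node is stored as -(7 + 1) = -8; being < -1 it cannot
+// be mistaken for a selected node (id == -1), and the original id is recovered
+// as -(-8 + 1) = 7 (see InvalidateNodeId and getUninvalidatedNodeId below).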
+void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) {
+ SmallVector<SDNode *, 4> Nodes;
+ Nodes.push_back(Node);
+
+ while (!Nodes.empty()) {
+ SDNode *N = Nodes.pop_back_val();
+ for (auto *U : N->uses()) {
+ auto UId = U->getNodeId();
+ if (UId > 0) {
+ InvalidateNodeId(U);
+ Nodes.push_back(U);
+ }
+ }
+ }
+}
+
+// InvalidateNodeId - As explained in EnforceNodeIdInvariant, mark a
+// NodeId with the equivalent node id which is invalid for topological
+// pruning.
+void SelectionDAGISel::InvalidateNodeId(SDNode *N) {
+ int InvalidId = -(N->getNodeId() + 1);
+ N->setNodeId(InvalidId);
+}
+
+// getUninvalidatedNodeId - get original uninvalidated node id.
+int SelectionDAGISel::getUninvalidatedNodeId(SDNode *N) {
+ int Id = N->getNodeId();
+ if (Id < -1)
+ return -(Id + 1);
+ return Id;
+}
+
+void SelectionDAGISel::DoInstructionSelection() {
+ LLVM_DEBUG(dbgs() << "===== Instruction selection begins: "
+ << printMBBReference(*FuncInfo->MBB) << " '"
+ << FuncInfo->MBB->getName() << "'\n");
+
+ PreprocessISelDAG();
+
+ // Select target instructions for the DAG.
+ {
+ // Number all nodes with a topological order and set DAGSize.
+ DAGSize = CurDAG->AssignTopologicalOrder();
+
+ // Create a dummy node (which is not added to allnodes), that adds
+ // a reference to the root node, preventing it from being deleted,
+ // and tracking any changes of the root.
+ HandleSDNode Dummy(CurDAG->getRoot());
+ SelectionDAG::allnodes_iterator ISelPosition (CurDAG->getRoot().getNode());
+ ++ISelPosition;
+
+ // Make sure that ISelPosition gets properly updated when nodes are deleted
+ // in calls made from this function. New nodes inherit relevant metadata.
+ ISelUpdater ISU(*CurDAG, ISelPosition);
+
+ // The AllNodes list is now topological-sorted. Visit the
+ // nodes by starting at the end of the list (the root of the
+ // graph) and proceeding back toward the beginning (the entry
+ // node).
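+ // Selecting users before their operands is what allows operand folding:
+ // e.g. a load feeding an add is still unselected when the add is visited,
+ // so the add's pattern may fold the load into a single instruction.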
+ while (ISelPosition != CurDAG->allnodes_begin()) {
+ SDNode *Node = &*--ISelPosition;
+ // Skip dead nodes. DAGCombiner is expected to eliminate all dead nodes,
+ // but there are currently some corner cases that it misses. Also, this
+ // makes it theoretically possible to disable the DAGCombiner.
+ if (Node->use_empty())
+ continue;
+
+#ifndef NDEBUG
+ SmallVector<SDNode *, 4> Nodes;
+ Nodes.push_back(Node);
+
+ while (!Nodes.empty()) {
+ auto N = Nodes.pop_back_val();
+ if (N->getOpcode() == ISD::TokenFactor || N->getNodeId() < 0)
+ continue;
+ for (const SDValue &Op : N->op_values()) {
+ if (Op->getOpcode() == ISD::TokenFactor)
+ Nodes.push_back(Op.getNode());
+ else {
+ // We rely on topological ordering of node ids for checking for
+ // cycles when fusing nodes during selection. All unselected
+ // successors of an already selected node should have a negative id.
+ // This assertion will catch such cases. If this assertion triggers,
+ // it is likely you are using DAG-level Value/Node replacement functions
+ // (versus equivalent ISEL replacement) in backend-specific
+ // selections. See comment in EnforceNodeIdInvariant for more
+ // details.
+ assert(Op->getNodeId() != -1 &&
+ "Node has already selected predecessor node");
+ }
+ }
+ }
+#endif
+
+ // When we are using non-default rounding modes or FP exception behavior
+ // FP operations are represented by StrictFP pseudo-operations. For
+ // targets that do not (yet) understand strict FP operations directly,
+ // we convert them to normal FP opcodes instead at this point. This
+ // will allow them to be handled by existing target-specific instruction
+ // selectors.
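+ // For example (illustrative): if a target reports ISD::STRICT_FADD as
+ // Expand for the relevant type, the node is mutated below into a plain
+ // ISD::FADD so that the existing FADD patterns can select it.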
+ if (!TLI->isStrictFPEnabled() && Node->isStrictFPOpcode()) {
+ // For some opcodes, we need to call TLI->getOperationAction using
+ // the first operand type instead of the result type. Note that this
+ // must match what SelectionDAGLegalize::LegalizeOp is doing.
+ EVT ActionVT;
+ switch (Node->getOpcode()) {
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT:
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
+ ActionVT = Node->getOperand(1).getValueType();
+ break;
+ default:
+ ActionVT = Node->getValueType(0);
+ break;
+ }
+ if (TLI->getOperationAction(Node->getOpcode(), ActionVT)
+ == TargetLowering::Expand)
+ Node = CurDAG->mutateStrictFPToFP(Node);
+ }
+
+ LLVM_DEBUG(dbgs() << "\nISEL: Starting selection on root node: ";
+ Node->dump(CurDAG));
+
+ Select(Node);
+ }
+
+ CurDAG->setRoot(Dummy.getValue());
+ }
+
+ LLVM_DEBUG(dbgs() << "\n===== Instruction selection ends:\n");
+
+ PostprocessISelDAG();
+}
+
+static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) {
+ for (const User *U : CPI->users()) {
+ if (const IntrinsicInst *EHPtrCall = dyn_cast<IntrinsicInst>(U)) {
+ Intrinsic::ID IID = EHPtrCall->getIntrinsicID();
+ if (IID == Intrinsic::eh_exceptionpointer ||
+ IID == Intrinsic::eh_exceptioncode)
+ return true;
+ }
+ }
+ return false;
+}
+
+// The wasm.landingpad.index intrinsic associates a landing pad index number
+// with a catchpad instruction. Retrieve the landing pad index from the
+// intrinsic and store the mapping in the function.
+static void mapWasmLandingPadIndex(MachineBasicBlock *MBB,
+ const CatchPadInst *CPI) {
+ MachineFunction *MF = MBB->getParent();
+ // In case of single catch (...), we don't emit LSDA, so we don't need
+ // this information.
+ bool IsSingleCatchAllClause =
+ CPI->arg_size() == 1 &&
+ cast<Constant>(CPI->getArgOperand(0))->isNullValue();
+ // Catchpads for longjmp use an empty type list, e.g. catchpad within %0 [],
+ // and they don't need LSDA info.
+ bool IsCatchLongjmp = CPI->arg_size() == 0;
+ if (!IsSingleCatchAllClause && !IsCatchLongjmp) {
+ // Create a mapping from landing pad label to landing pad index.
+ bool IntrFound = false;
+ for (const User *U : CPI->users()) {
+ if (const auto *Call = dyn_cast<IntrinsicInst>(U)) {
+ Intrinsic::ID IID = Call->getIntrinsicID();
+ if (IID == Intrinsic::wasm_landingpad_index) {
+ Value *IndexArg = Call->getArgOperand(1);
+ int Index = cast<ConstantInt>(IndexArg)->getZExtValue();
+ MF->setWasmLandingPadIndex(MBB, Index);
+ IntrFound = true;
+ break;
+ }
+ }
+ }
+ assert(IntrFound && "wasm.landingpad.index intrinsic not found!");
+ (void)IntrFound;
+ }
+}
+
+/// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and
+/// do other setup for EH landing-pad blocks.
+bool SelectionDAGISel::PrepareEHLandingPad() {
+ MachineBasicBlock *MBB = FuncInfo->MBB;
+ const Constant *PersonalityFn = FuncInfo->Fn->getPersonalityFn();
+ const BasicBlock *LLVMBB = MBB->getBasicBlock();
+ const TargetRegisterClass *PtrRC =
+ TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout()));
+
+ auto Pers = classifyEHPersonality(PersonalityFn);
+
+ // Catchpads have one live-in register, which typically holds the exception
+ // pointer or code.
+ if (isFuncletEHPersonality(Pers)) {
+ if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
+ if (hasExceptionPointerOrCodeUser(CPI)) {
+ // Get or create the virtual register to hold the pointer or code. Mark
+ // the live in physreg and copy into the vreg.
+ MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
+ assert(EHPhysReg && "target lacks exception pointer register");
+ MBB->addLiveIn(EHPhysReg);
+ unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
+ BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
+ TII->get(TargetOpcode::COPY), VReg)
+ .addReg(EHPhysReg, RegState::Kill);
+ }
+ }
+ return true;
+ }
+
+ // Add a label to mark the beginning of the landing pad. Deletion of the
+ // landing pad can thus be detected via the MachineModuleInfo.
+ MCSymbol *Label = MF->addLandingPad(MBB);
+
+ const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL);
+ BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
+ .addSym(Label);
+
+ // If the unwinder does not preserve all registers, ensure that the
+ // function marks the clobbered registers as used.
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ if (auto *RegMask = TRI.getCustomEHPadPreservedMask(*MF))
+ MF->getRegInfo().addPhysRegsUsedFromRegMask(RegMask);
+
+ if (Pers == EHPersonality::Wasm_CXX) {
+ if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI()))
+ mapWasmLandingPadIndex(MBB, CPI);
+ } else {
+ // Assign the call site to the landing pad's begin label.
+ MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
+ // Mark exception register as live in.
+ if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
+ FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
+ // Mark exception selector register as live in.
+ if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
+ FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
+ }
+
+ return true;
+}
+
+// Mark and Report IPToState for each Block under IsEHa
+void SelectionDAGISel::reportIPToStateForBlocks(MachineFunction *MF) {
+ MachineModuleInfo &MMI = MF->getMMI();
+ llvm::WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo();
+ if (!EHInfo)
+ return;
+ for (auto MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = &*MBBI;
+ const BasicBlock *BB = MBB->getBasicBlock();
+ int State = EHInfo->BlockToStateMap[BB];
+ if (BB->getFirstMayFaultInst()) {
+ // Report the IP range only for blocks containing a may-fault instruction.
+ auto MBBb = MBB->getFirstNonPHI();
+ MachineInstr *MIb = &*MBBb;
+ if (MIb->isTerminator())
+ continue;
+
+ // Insert EH Labels
+ MCSymbol *BeginLabel = MMI.getContext().createTempSymbol();
+ MCSymbol *EndLabel = MMI.getContext().createTempSymbol();
+ EHInfo->addIPToStateRange(State, BeginLabel, EndLabel);
+ BuildMI(*MBB, MBBb, SDB->getCurDebugLoc(),
+ TII->get(TargetOpcode::EH_LABEL))
+ .addSym(BeginLabel);
+ auto MBBe = MBB->instr_end();
+ MachineInstr *MIe = &*(--MBBe);
+ // Insert before the (possibly multiple) terminators.
+ while (MIe->isTerminator())
+ MIe = &*(--MBBe);
+ ++MBBe;
+ BuildMI(*MBB, MBBe, SDB->getCurDebugLoc(),
+ TII->get(TargetOpcode::EH_LABEL))
+ .addSym(EndLabel);
+ }
+ }
+}
+
+/// isFoldedOrDeadInstruction - Return true if the specified instruction is
+/// side-effect free and is either dead or folded into a generated instruction.
+/// Return false if it needs to be emitted.
+static bool isFoldedOrDeadInstruction(const Instruction *I,
+ const FunctionLoweringInfo &FuncInfo) {
+ return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded.
+ !I->isTerminator() && // Terminators aren't folded.
+ !isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded.
+ !I->isEHPad() && // EH pad instructions aren't folded.
+ !FuncInfo.isExportedInst(I); // Exported instrs must be computed.
+}
+
+static bool processIfEntryValueDbgDeclare(FunctionLoweringInfo &FuncInfo,
+ const Value *Arg, DIExpression *Expr,
+ DILocalVariable *Var,
+ DebugLoc DbgLoc) {
+ if (!Expr->isEntryValue() || !isa<Argument>(Arg))
+ return false;
+
+ auto ArgIt = FuncInfo.ValueMap.find(Arg);
+ if (ArgIt == FuncInfo.ValueMap.end())
+ return false;
+ Register ArgVReg = ArgIt->getSecond();
+
+ // Find the corresponding livein physical register to this argument.
+ for (auto [PhysReg, VirtReg] : FuncInfo.RegInfo->liveins())
+ if (VirtReg == ArgVReg) {
+ FuncInfo.MF->setVariableDbgInfo(Var, Expr, PhysReg, DbgLoc);
+ LLVM_DEBUG(dbgs() << "processDbgDeclare: setVariableDbgInfo Var=" << *Var
+ << ", Expr=" << *Expr << ", MCRegister=" << PhysReg
+ << ", DbgLoc=" << DbgLoc << "\n");
+ return true;
+ }
+ return false;
+}
+
+static bool processDbgDeclare(FunctionLoweringInfo &FuncInfo,
+ const Value *Address, DIExpression *Expr,
+ DILocalVariable *Var, DebugLoc DbgLoc) {
+ if (!Address) {
+ LLVM_DEBUG(dbgs() << "processDbgDeclares skipping " << *Var
+ << " (bad address)\n");
+ return false;
+ }
+
+ if (processIfEntryValueDbgDeclare(FuncInfo, Address, Expr, Var, DbgLoc))
+ return true;
+
+ MachineFunction *MF = FuncInfo.MF;
+ const DataLayout &DL = MF->getDataLayout();
+
+ assert(Var && "Missing variable");
+ assert(DbgLoc && "Missing location");
+
+ // Look through casts and constant offset GEPs. These mostly come from
+ // inalloca.
+ APInt Offset(DL.getTypeSizeInBits(Address->getType()), 0);
+ Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+ // Check if the variable is a static alloca or a byval or inalloca
+ // argument passed in memory. If it is not, then we will ignore this
+ // intrinsic and handle this during isel like dbg.value.
+ int FI = std::numeric_limits<int>::max();
+ if (const auto *AI = dyn_cast<AllocaInst>(Address)) {
+ auto SI = FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end())
+ FI = SI->second;
+ } else if (const auto *Arg = dyn_cast<Argument>(Address))
+ FI = FuncInfo.getArgumentFrameIndex(Arg);
+
+ if (FI == std::numeric_limits<int>::max())
+ return false;
+
+ if (Offset.getBoolValue())
+ Expr = DIExpression::prepend(Expr, DIExpression::ApplyOffset,
+ Offset.getZExtValue());
+
+ LLVM_DEBUG(dbgs() << "processDbgDeclare: setVariableDbgInfo Var=" << *Var
+ << ", Expr=" << *Expr << ", FI=" << FI
+ << ", DbgLoc=" << DbgLoc << "\n");
+ MF->setVariableDbgInfo(Var, Expr, FI, DbgLoc);
+ return true;
+}
+
+/// Collect llvm.dbg.declare information. This is done after argument lowering
+/// in case the declarations refer to arguments.
+static void processDbgDeclares(FunctionLoweringInfo &FuncInfo) {
+ for (const auto &I : instructions(*FuncInfo.Fn)) {
+ const auto *DI = dyn_cast<DbgDeclareInst>(&I);
+ if (DI && processDbgDeclare(FuncInfo, DI->getAddress(), DI->getExpression(),
+ DI->getVariable(), DI->getDebugLoc()))
+ FuncInfo.PreprocessedDbgDeclares.insert(DI);
+ }
+}
+
+/// Collect single location variable information generated with assignment
+/// tracking. This is done after argument lowering in case the declarations
+/// refer to arguments.
+static void processSingleLocVars(FunctionLoweringInfo &FuncInfo,
+ FunctionVarLocs const *FnVarLocs) {
+ for (auto It = FnVarLocs->single_locs_begin(),
+ End = FnVarLocs->single_locs_end();
+ It != End; ++It) {
+ assert(!It->Values.hasArgList() && "Single loc variadic ops not supported");
+ processDbgDeclare(FuncInfo, It->Values.getVariableLocationOp(0), It->Expr,
+ FnVarLocs->getDILocalVariable(It->VariableID), It->DL);
+ }
+}
+
+void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
+ FastISelFailed = false;
+ // Initialize the Fast-ISel state, if needed.
+ FastISel *FastIS = nullptr;
+ if (TM.Options.EnableFastISel) {
+ LLVM_DEBUG(dbgs() << "Enabling fast-isel\n");
+ FastIS = TLI->createFastISel(*FuncInfo, LibInfo);
+ }
+
+ ReversePostOrderTraversal<const Function*> RPOT(&Fn);
+
+ // Lower arguments up front. An RPO iteration always visits the entry block
+ // first.
+ assert(*RPOT.begin() == &Fn.getEntryBlock());
+ ++NumEntryBlocks;
+
+ // Set up FuncInfo for ISel. Entry blocks never have PHIs.
+ FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()];
+ FuncInfo->InsertPt = FuncInfo->MBB->begin();
+
+ CurDAG->setFunctionLoweringInfo(FuncInfo.get());
+
+ if (!FastIS) {
+ LowerArguments(Fn);
+ } else {
+ // See if fast isel can lower the arguments.
+ FastIS->startNewBlock();
+ if (!FastIS->lowerArguments()) {
+ FastISelFailed = true;
+ // Fast isel failed to lower these arguments
+ ++NumFastIselFailLowerArguments;
+
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Fn.getSubprogram(),
+ &Fn.getEntryBlock());
+ R << "FastISel didn't lower all arguments: "
+ << ore::NV("Prototype", Fn.getFunctionType());
+ reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1);
+
+ // Use SelectionDAG argument lowering
+ LowerArguments(Fn);
+ CurDAG->setRoot(SDB->getControlRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // If we inserted any instructions at the beginning, make a note of
+ // where they are, so we can be sure to emit subsequent instructions
+ // after them.
+ if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
+ FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+ else
+ FastIS->setLastLocalValue(nullptr);
+ }
+
+ bool Inserted = SwiftError->createEntriesInEntryBlock(SDB->getCurDebugLoc());
+
+ if (FastIS && Inserted)
+ FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+
+ if (isAssignmentTrackingEnabled(*Fn.getParent())) {
+ assert(CurDAG->getFunctionVarLocs() &&
+ "expected AssignmentTrackingAnalysis pass results");
+ processSingleLocVars(*FuncInfo, CurDAG->getFunctionVarLocs());
+ } else {
+ processDbgDeclares(*FuncInfo);
+ }
+
+ // Iterate over all basic blocks in the function.
+ StackProtector &SP = getAnalysis<StackProtector>();
+ for (const BasicBlock *LLVMBB : RPOT) {
+ if (OptLevel != CodeGenOpt::None) {
+ bool AllPredsVisited = true;
+ for (const BasicBlock *Pred : predecessors(LLVMBB)) {
+ if (!FuncInfo->VisitedBBs.count(Pred)) {
+ AllPredsVisited = false;
+ break;
+ }
+ }
+
+ if (AllPredsVisited) {
+ for (const PHINode &PN : LLVMBB->phis())
+ FuncInfo->ComputePHILiveOutRegInfo(&PN);
+ } else {
+ for (const PHINode &PN : LLVMBB->phis())
+ FuncInfo->InvalidatePHILiveOutRegInfo(&PN);
+ }
+
+ FuncInfo->VisitedBBs.insert(LLVMBB);
+ }
+
+ BasicBlock::const_iterator const Begin =
+ LLVMBB->getFirstNonPHI()->getIterator();
+ BasicBlock::const_iterator const End = LLVMBB->end();
+ BasicBlock::const_iterator BI = End;
+
+ FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB];
+ if (!FuncInfo->MBB)
+ continue; // Some blocks like catchpads have no code or MBB.
+
+ // Insert new instructions after any phi or argument setup code.
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+
+ // Setup an EH landing-pad block.
+ FuncInfo->ExceptionPointerVirtReg = 0;
+ FuncInfo->ExceptionSelectorVirtReg = 0;
+ if (LLVMBB->isEHPad())
+ if (!PrepareEHLandingPad())
+ continue;
+
+ // Before doing SelectionDAG ISel, see if FastISel has been requested.
+ if (FastIS) {
+ if (LLVMBB != &Fn.getEntryBlock())
+ FastIS->startNewBlock();
+
+ unsigned NumFastIselRemaining = std::distance(Begin, End);
+
+ // Pre-assign swifterror vregs.
+ SwiftError->preassignVRegs(FuncInfo->MBB, Begin, End);
+
+ // Do FastISel on as many instructions as possible.
+ for (; BI != Begin; --BI) {
+ const Instruction *Inst = &*std::prev(BI);
+
+ // If we no longer require this instruction, skip it.
+ if (isFoldedOrDeadInstruction(Inst, *FuncInfo) ||
+ ElidedArgCopyInstrs.count(Inst)) {
+ --NumFastIselRemaining;
+ continue;
+ }
+
+ // Bottom-up: reset the insert pos at the top, after any local-value
+ // instructions.
+ FastIS->recomputeInsertPt();
+
+ // Try to select the instruction with FastISel.
+ if (FastIS->selectInstruction(Inst)) {
+ --NumFastIselRemaining;
+ ++NumFastIselSuccess;
+ // If fast isel succeeded, skip over all the folded instructions, and
+ // then see if there is a load right before the selected instructions.
+ // Try to fold the load if so.
+ const Instruction *BeforeInst = Inst;
+ while (BeforeInst != &*Begin) {
+ BeforeInst = &*std::prev(BasicBlock::const_iterator(BeforeInst));
+ if (!isFoldedOrDeadInstruction(BeforeInst, *FuncInfo))
+ break;
+ }
+ if (BeforeInst != Inst && isa<LoadInst>(BeforeInst) &&
+ BeforeInst->hasOneUse() &&
+ FastIS->tryToFoldLoad(cast<LoadInst>(BeforeInst), Inst)) {
+ // If we succeeded, don't re-select the load.
+ LLVM_DEBUG(dbgs()
+ << "FastISel folded load: " << *BeforeInst << "\n");
+ BI = std::next(BasicBlock::const_iterator(BeforeInst));
+ --NumFastIselRemaining;
+ ++NumFastIselSuccess;
+ }
+ continue;
+ }
+
+ FastISelFailed = true;
+
+ // Then handle certain instructions as single-LLVM-Instruction blocks.
+ // We cannot separate out GCrelocates to their own blocks since we need
+ // to keep track of gc-relocates for a particular gc-statepoint. This is
+ // done by SelectionDAGBuilder::LowerAsSTATEPOINT, called before
+ // visitGCRelocate.
+ if (isa<CallInst>(Inst) && !isa<GCStatepointInst>(Inst) &&
+ !isa<GCRelocateInst>(Inst) && !isa<GCResultInst>(Inst)) {
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Inst->getDebugLoc(), LLVMBB);
+
+ R << "FastISel missed call";
+
+ if (R.isEnabled() || EnableFastISelAbort) {
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << *Inst;
+
+ R << ": " << InstStr.str();
+ }
+
+ reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2);
+
+ if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() &&
+ !Inst->use_empty()) {
+ Register &R = FuncInfo->ValueMap[Inst];
+ if (!R)
+ R = FuncInfo->CreateRegs(Inst);
+ }
+
+ bool HadTailCall = false;
+ MachineBasicBlock::iterator SavedInsertPt = FuncInfo->InsertPt;
+ SelectBasicBlock(Inst->getIterator(), BI, HadTailCall);
+
+ // If the call was emitted as a tail call, we're done with the block.
+ // We also need to delete any previously emitted instructions.
+ if (HadTailCall) {
+ FastIS->removeDeadCode(SavedInsertPt, FuncInfo->MBB->end());
+ --BI;
+ break;
+ }
+
+ // Recompute NumFastIselRemaining as Selection DAG instruction
+ // selection may have handled the call, input args, etc.
+ unsigned RemainingNow = std::distance(Begin, BI);
+ NumFastIselFailures += NumFastIselRemaining - RemainingNow;
+ NumFastIselRemaining = RemainingNow;
+ continue;
+ }
+
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Inst->getDebugLoc(), LLVMBB);
+
+ bool ShouldAbort = EnableFastISelAbort;
+ if (Inst->isTerminator()) {
+ // Use a different message for terminator misses.
+ R << "FastISel missed terminator";
+ // Don't abort for terminator unless the level is really high
+ ShouldAbort = (EnableFastISelAbort > 2);
+ } else {
+ R << "FastISel missed";
+ }
+
+ if (R.isEnabled() || EnableFastISelAbort) {
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << *Inst;
+ R << ": " << InstStr.str();
+ }
+
+ reportFastISelFailure(*MF, *ORE, R, ShouldAbort);
+
+ NumFastIselFailures += NumFastIselRemaining;
+ break;
+ }
+
+ FastIS->recomputeInsertPt();
+ }
+
+ if (SP.shouldEmitSDCheck(*LLVMBB)) {
+ bool FunctionBasedInstrumentation =
+ TLI->getSSPStackGuardCheck(*Fn.getParent());
+ SDB->SPDescriptor.initialize(LLVMBB, FuncInfo->MBBMap[LLVMBB],
+ FunctionBasedInstrumentation);
+ }
+
+ if (Begin != BI)
+ ++NumDAGBlocks;
+ else
+ ++NumFastIselBlocks;
+
+ if (Begin != BI) {
+ // Run SelectionDAG instruction selection on the remainder of the block
+ // not handled by FastISel. If FastISel is not run, this is the entire
+ // block.
+ bool HadTailCall;
+ SelectBasicBlock(Begin, BI, HadTailCall);
+
+ // But if FastISel was run, we already selected some of the block.
+ // If we emitted a tail-call, we need to delete any previously emitted
+ // instruction that follows it.
+ if (FastIS && HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end())
+ FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end());
+ }
+
+ if (FastIS)
+ FastIS->finishBasicBlock();
+ FinishBasicBlock();
+ FuncInfo->PHINodesToUpdate.clear();
+ ElidedArgCopyInstrs.clear();
+ }
+
+ // AsynchEH: Report Block State under -AsynchEH
+ if (Fn.getParent()->getModuleFlag("eh-asynch"))
+ reportIPToStateForBlocks(MF);
+
+ SP.copyToMachineFrameInfo(MF->getFrameInfo());
+
+ SwiftError->propagateVRegs();
+
+ delete FastIS;
+ SDB->clearDanglingDebugInfo();
+ SDB->SPDescriptor.resetPerFunctionState();
+}
+
+void
+SelectionDAGISel::FinishBasicBlock() {
+ LLVM_DEBUG(dbgs() << "Total amount of phi nodes to update: "
+ << FuncInfo->PHINodesToUpdate.size() << "\n";
+ for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e;
+ ++i) dbgs()
+ << "Node " << i << " : (" << FuncInfo->PHINodesToUpdate[i].first
+ << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
+
+ // Next, now that we know which MBB the LLVM BB expanded into last, update
+ // PHI nodes in successors.
+ for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
+ MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
+ assert(PHI->isPHI() &&
+ "This is not a machine PHI node that we are updating!");
+ if (!FuncInfo->MBB->isSuccessor(PHI->getParent()))
+ continue;
+ PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
+ }
+
+ // Handle stack protector.
+ if (SDB->SPDescriptor.shouldEmitFunctionBasedCheckStackProtector()) {
+ // The target provides a guard check function. There is no need to
+ // generate error handling code or to split the current basic block.
+ MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
+
+ // Add load and check to the basicblock.
+ FuncInfo->MBB = ParentMBB;
+ FuncInfo->InsertPt =
+ findSplitPointForStackProtector(ParentMBB, *TII);
+ SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // Clear the Per-BB State.
+ SDB->SPDescriptor.resetPerBBState();
+ } else if (SDB->SPDescriptor.shouldEmitStackProtector()) {
+ MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
+ MachineBasicBlock *SuccessMBB = SDB->SPDescriptor.getSuccessMBB();
+
+ // Find the split point to split the parent mbb. At the same time copy all
+ // physical registers used in the tail of parent mbb into virtual registers
+ // before the split point and back into physical registers after the split
+ // point. This prevents us needing to deal with Live-ins and many other
+ // register allocation issues caused by us splitting the parent mbb. The
+ // register allocator will clean up said virtual copies later on.
+ MachineBasicBlock::iterator SplitPoint =
+ findSplitPointForStackProtector(ParentMBB, *TII);
+
+ // Splice the terminator of ParentMBB into SuccessMBB.
+ SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
+ SplitPoint,
+ ParentMBB->end());
+
+ // Add compare/jump on neq/jump to the parent BB.
+ FuncInfo->MBB = ParentMBB;
+ FuncInfo->InsertPt = ParentMBB->end();
+ SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // CodeGen Failure MBB if we have not codegened it yet.
+ MachineBasicBlock *FailureMBB = SDB->SPDescriptor.getFailureMBB();
+ if (FailureMBB->empty()) {
+ FuncInfo->MBB = FailureMBB;
+ FuncInfo->InsertPt = FailureMBB->end();
+ SDB->visitSPDescriptorFailure(SDB->SPDescriptor);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // Clear the Per-BB State.
+ SDB->SPDescriptor.resetPerBBState();
+ }
+
+ // Lower each BitTestBlock.
+ for (auto &BTB : SDB->SL->BitTestCases) {
+ // Lower header first, if it wasn't already lowered
+ if (!BTB.Emitted) {
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = BTB.Parent;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // Emit the code
+ SDB->visitBitTestHeader(BTB, FuncInfo->MBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ BranchProbability UnhandledProb = BTB.Prob;
+ for (unsigned j = 0, ej = BTB.Cases.size(); j != ej; ++j) {
+ UnhandledProb -= BTB.Cases[j].ExtraProb;
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = BTB.Cases[j].ThisBB;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // Emit the code
+
+ // If all cases cover a contiguous range, it is not necessary to jump to
+ // the default block after the last bit test fails. This is because the
+ // range check during bit test header creation has guaranteed that every
+ // case here doesn't go outside the range. In this case, there is no need
+ // to perform the last bit test, as it will always be true. Instead, make
+ // the second-to-last bit-test fall through to the target of the last bit
+ // test, and delete the last bit test.
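+ // Illustrative example: if contiguous cases were lowered to two bit tests,
+ // the header's range check already guarantees one of them matches, so when
+ // the first test fails the second must succeed; the final test is dropped
+ // and its predecessor falls through to that test's target instead.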
+
+ MachineBasicBlock *NextMBB;
+ if ((BTB.ContiguousRange || BTB.FallthroughUnreachable) && j + 2 == ej) {
+ // Second-to-last bit-test with contiguous range or omitted range
+ // check: fall through to the target of the final bit test.
+ NextMBB = BTB.Cases[j + 1].TargetBB;
+ } else if (j + 1 == ej) {
+ // For the last bit test, fall through to Default.
+ NextMBB = BTB.Default;
+ } else {
+ // Otherwise, fall through to the next bit test.
+ NextMBB = BTB.Cases[j + 1].ThisBB;
+ }
+
+ SDB->visitBitTestCase(BTB, NextMBB, UnhandledProb, BTB.Reg, BTB.Cases[j],
+ FuncInfo->MBB);
+
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ if ((BTB.ContiguousRange || BTB.FallthroughUnreachable) && j + 2 == ej) {
+ // Since we're not going to use the final bit test, remove it.
+ BTB.Cases.pop_back();
+ break;
+ }
+ }
+
+ // Update PHI Nodes
+ for (const std::pair<MachineInstr *, unsigned> &P :
+ FuncInfo->PHINodesToUpdate) {
+ MachineInstrBuilder PHI(*MF, P.first);
+ MachineBasicBlock *PHIBB = PHI->getParent();
+ assert(PHI->isPHI() &&
+ "This is not a machine PHI node that we are updating!");
+ // This is "default" BB. We have two jumps to it. From "header" BB and
+ // from last "case" BB, unless the latter was skipped.
+ if (PHIBB == BTB.Default) {
+ PHI.addReg(P.second).addMBB(BTB.Parent);
+ if (!BTB.ContiguousRange) {
+ PHI.addReg(P.second).addMBB(BTB.Cases.back().ThisBB);
+ }
+ }
+ // One of "cases" BB.
+ for (const SwitchCG::BitTestCase &BT : BTB.Cases) {
+ MachineBasicBlock* cBB = BT.ThisBB;
+ if (cBB->isSuccessor(PHIBB))
+ PHI.addReg(P.second).addMBB(cBB);
+ }
+ }
+ }
+ SDB->SL->BitTestCases.clear();
+
+ // If the JumpTable record is filled in, then we need to emit a jump table.
+ // Updating the PHI nodes is tricky in this case, since we need to determine
+ // whether the PHI is a successor of the range check MBB or the jump table MBB
+ for (unsigned i = 0, e = SDB->SL->JTCases.size(); i != e; ++i) {
+ // Lower header first, if it wasn't already lowered
+ if (!SDB->SL->JTCases[i].first.Emitted) {
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = SDB->SL->JTCases[i].first.HeaderBB;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // Emit the code
+ SDB->visitJumpTableHeader(SDB->SL->JTCases[i].second,
+ SDB->SL->JTCases[i].first, FuncInfo->MBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = SDB->SL->JTCases[i].second.MBB;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // Emit the code
+ SDB->visitJumpTable(SDB->SL->JTCases[i].second);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // Update PHI Nodes
+ for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size();
+ pi != pe; ++pi) {
+ MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first);
+ MachineBasicBlock *PHIBB = PHI->getParent();
+ assert(PHI->isPHI() &&
+ "This is not a machine PHI node that we are updating!");
+ // "default" BB. We can go there only from header BB.
+ if (PHIBB == SDB->SL->JTCases[i].second.Default)
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second)
+ .addMBB(SDB->SL->JTCases[i].first.HeaderBB);
+ // JT BB. Just iterate over successors here
+ if (FuncInfo->MBB->isSuccessor(PHIBB))
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(FuncInfo->MBB);
+ }
+ }
+ SDB->SL->JTCases.clear();
+
+ // If we generated any switch lowering information, build and codegen any
+ // additional DAGs necessary.
+ for (unsigned i = 0, e = SDB->SL->SwitchCases.size(); i != e; ++i) {
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = SDB->SL->SwitchCases[i].ThisBB;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+
+ // Determine the unique successors.
+ SmallVector<MachineBasicBlock *, 2> Succs;
+ Succs.push_back(SDB->SL->SwitchCases[i].TrueBB);
+ if (SDB->SL->SwitchCases[i].TrueBB != SDB->SL->SwitchCases[i].FalseBB)
+ Succs.push_back(SDB->SL->SwitchCases[i].FalseBB);
+
+ // Emit the code. Note that this could result in FuncInfo->MBB being split.
+ SDB->visitSwitchCase(SDB->SL->SwitchCases[i], FuncInfo->MBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // Remember the last block, now that any splitting is done, for use in
+ // populating PHI nodes in successors.
+ MachineBasicBlock *ThisBB = FuncInfo->MBB;
+
+ // Handle any PHI nodes in successors of this chunk, as if we were coming
+ // from the original BB before switch expansion. Note that PHI nodes can
+ // occur multiple times in PHINodesToUpdate. We have to be very careful to
+ // handle them the right number of times.
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+ FuncInfo->MBB = Succs[i];
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // FuncInfo->MBB may have been removed from the CFG if a branch was
+ // constant folded.
+ if (ThisBB->isSuccessor(FuncInfo->MBB)) {
+ for (MachineBasicBlock::iterator
+ MBBI = FuncInfo->MBB->begin(), MBBE = FuncInfo->MBB->end();
+ MBBI != MBBE && MBBI->isPHI(); ++MBBI) {
+ MachineInstrBuilder PHI(*MF, MBBI);
+ // This value for this PHI node is recorded in PHINodesToUpdate.
+ for (unsigned pn = 0; ; ++pn) {
+ assert(pn != FuncInfo->PHINodesToUpdate.size() &&
+ "Didn't find PHI entry!");
+ if (FuncInfo->PHINodesToUpdate[pn].first == PHI) {
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pn].second).addMBB(ThisBB);
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ SDB->SL->SwitchCases.clear();
+}
+
+/// Create the scheduler. If a specific scheduler was specified
+/// via the SchedulerRegistry, use it, otherwise select the
+/// one preferred by the target.
+///
+ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() {
+ return ISHeuristic(this, OptLevel);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions used by the generated instruction selector.
+//===----------------------------------------------------------------------===//
+// Calls to these methods are generated by tblgen.
+
+/// CheckAndMask - The isel is trying to match something like (and X, 255). If
+/// the dag combiner simplified the 255, we still want to match. RHS is the
+/// actual value in the DAG on the RHS of an AND, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
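+/// For example, if the pattern asks for (and X, 255) but the combiner shrank
+/// the constant to (and X, 15) because bits 4-7 of X are already known zero,
+/// the narrower mask still matches: the bits missing from the actual mask are
+/// known-zero inputs anyway.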
+bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS,
+ int64_t DesiredMaskS) const {
+ const APInt &ActualMask = RHS->getAPIntValue();
+ const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+ // If the actual mask exactly matches, success!
+ if (ActualMask == DesiredMask)
+ return true;
+
+ // If the actual AND mask allows bits that the desired mask does not,
+ // this doesn't match.
+ if (!ActualMask.isSubsetOf(DesiredMask))
+ return false;
+
+ // Otherwise, the DAG Combiner may have proven that the value coming in is
+ // either already zero or is not demanded. Check for known zero input bits.
+ APInt NeededMask = DesiredMask & ~ActualMask;
+ if (CurDAG->MaskedValueIsZero(LHS, NeededMask))
+ return true;
+
+ // TODO: check to see if missing bits are just not demanded.
+
+ // Otherwise, this pattern doesn't match.
+ return false;
+}
+
+/// CheckOrMask - The isel is trying to match something like (or X, 255). If
+/// the dag combiner simplified the 255, we still want to match. RHS is the
+/// actual value in the DAG on the RHS of an OR, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
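+/// For example, if the pattern asks for (or X, 255) but the combiner shrank
+/// the constant to (or X, 240) because bits 0-3 of X are already known one,
+/// the narrower mask still matches: the bits missing from the actual mask are
+/// known-one inputs anyway.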
+bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS,
+ int64_t DesiredMaskS) const {
+ const APInt &ActualMask = RHS->getAPIntValue();
+ const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+ // If the actual mask exactly matches, success!
+ if (ActualMask == DesiredMask)
+ return true;
+
+ // If the actual OR mask sets bits that the desired mask does not,
+ // this doesn't match.
+ if (!ActualMask.isSubsetOf(DesiredMask))
+ return false;
+
+ // Otherwise, the DAG Combiner may have proven that the value coming in is
+ // either already one or is not demanded. Check for known one input bits.
+ APInt NeededMask = DesiredMask & ~ActualMask;
+ KnownBits Known = CurDAG->computeKnownBits(LHS);
+
+ // If all the missing bits in the or are already known to be set, match!
+ if (NeededMask.isSubsetOf(Known.One))
+ return true;
+
+ // TODO: check to see if missing bits are just not demanded.
+
+ // Otherwise, this pattern doesn't match.
+ return false;
+}
+
+/// SelectInlineAsmMemoryOperands - Calls to this are automatically generated
+/// by tblgen. Others should not call it.
+void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops,
+ const SDLoc &DL) {
+ std::vector<SDValue> InOps;
+ std::swap(InOps, Ops);
+
+ Ops.push_back(InOps[InlineAsm::Op_InputChain]); // 0
+ Ops.push_back(InOps[InlineAsm::Op_AsmString]); // 1
+ Ops.push_back(InOps[InlineAsm::Op_MDNode]); // 2, !srcloc
+ Ops.push_back(InOps[InlineAsm::Op_ExtraInfo]); // 3 (SideEffect, AlignStack)
+
+ unsigned i = InlineAsm::Op_FirstOperand, e = InOps.size();
+ if (InOps[e-1].getValueType() == MVT::Glue)
+ --e; // Don't process a glue operand if it is here.
+
+ while (i != e) {
+ unsigned Flags = cast<ConstantSDNode>(InOps[i])->getZExtValue();
+ if (!InlineAsm::isMemKind(Flags) && !InlineAsm::isFuncKind(Flags)) {
+ // Just skip over this operand, copying the operands verbatim.
+ Ops.insert(Ops.end(), InOps.begin()+i,
+ InOps.begin()+i+InlineAsm::getNumOperandRegisters(Flags) + 1);
+ i += InlineAsm::getNumOperandRegisters(Flags) + 1;
+ } else {
+ assert(InlineAsm::getNumOperandRegisters(Flags) == 1 &&
+ "Memory operand with multiple values?");
+
+ unsigned TiedToOperand;
+ if (InlineAsm::isUseOperandTiedToDef(Flags, TiedToOperand)) {
+ // We need the constraint ID from the operand this is tied to.
+ unsigned CurOp = InlineAsm::Op_FirstOperand;
+ Flags = cast<ConstantSDNode>(InOps[CurOp])->getZExtValue();
+ for (; TiedToOperand; --TiedToOperand) {
+ CurOp += InlineAsm::getNumOperandRegisters(Flags)+1;
+ Flags = cast<ConstantSDNode>(InOps[CurOp])->getZExtValue();
+ }
+ }
+
+ // Otherwise, this is a memory operand. Ask the target to select it.
+ std::vector<SDValue> SelOps;
+ unsigned ConstraintID = InlineAsm::getMemoryConstraintID(Flags);
+ if (SelectInlineAsmMemoryOperand(InOps[i+1], ConstraintID, SelOps))
+ report_fatal_error("Could not match memory address. Inline asm"
+ " failure!");
+
+ // Add this to the output node.
+ unsigned NewFlags =
+ InlineAsm::isMemKind(Flags)
+ ? InlineAsm::getFlagWord(InlineAsm::Kind_Mem, SelOps.size())
+ : InlineAsm::getFlagWord(InlineAsm::Kind_Func, SelOps.size());
+ NewFlags = InlineAsm::getFlagWordForMem(NewFlags, ConstraintID);
+ Ops.push_back(CurDAG->getTargetConstant(NewFlags, DL, MVT::i32));
+ llvm::append_range(Ops, SelOps);
+ i += 2;
+ }
+ }
+
+ // Add the glue input back if present.
+ if (e != InOps.size())
+ Ops.push_back(InOps.back());
+}
+
+/// findGlueUse - Return use of MVT::Glue value produced by the specified
+/// SDNode.
+///
+static SDNode *findGlueUse(SDNode *N) {
+ unsigned FlagResNo = N->getNumValues()-1;
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDUse &Use = I.getUse();
+ if (Use.getResNo() == FlagResNo)
+ return Use.getUser();
+ }
+ return nullptr;
+}
+
+ /// findNonImmUse - Return true if "Def" is a predecessor of "Root" via a path
+ /// that does not go through "ImmedUse". We may ignore chains as they are
+ /// checked separately.
+static bool findNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
+ bool IgnoreChains) {
+ SmallPtrSet<const SDNode *, 16> Visited;
+ SmallVector<const SDNode *, 16> WorkList;
+ // Only check if we have non-immediate uses of Def.
+ if (ImmedUse->isOnlyUserOf(Def))
+ return false;
+
+ // We don't care about paths to Def that go through ImmedUse so mark it
+ // visited and mark non-def operands as used.
+ Visited.insert(ImmedUse);
+ for (const SDValue &Op : ImmedUse->op_values()) {
+ SDNode *N = Op.getNode();
+ // Ignore chain deps (they are validated by
+ // HandleMergeInputChains) and immediate uses
+ if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
+ continue;
+ if (!Visited.insert(N).second)
+ continue;
+ WorkList.push_back(N);
+ }
+
+ // Initialize worklist to operands of Root.
+ if (Root != ImmedUse) {
+ for (const SDValue &Op : Root->op_values()) {
+ SDNode *N = Op.getNode();
+ // Ignore chains (they are validated by HandleMergeInputChains)
+ if ((Op.getValueType() == MVT::Other && IgnoreChains) || N == Def)
+ continue;
+ if (!Visited.insert(N).second)
+ continue;
+ WorkList.push_back(N);
+ }
+ }
+
+ return SDNode::hasPredecessorHelper(Def, Visited, WorkList, 0, true);
+}
+
+/// IsProfitableToFold - Returns true if it's profitable to fold the specific
+/// operand node N of U during instruction selection that starts at Root.
+bool SelectionDAGISel::IsProfitableToFold(SDValue N, SDNode *U,
+ SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+ return N.hasOneUse();
+}
+
+/// IsLegalToFold - Returns true if the specific operand node N of
+/// U can be folded during instruction selection that starts at Root.
+bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
+ CodeGenOpt::Level OptLevel,
+ bool IgnoreChains) {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+ // If Root can somehow reach N through a path that doesn't contain U, then
+ // folding N would create a cycle. For example, in the following
+ // diagram, Root can reach N through X. If N is folded into Root, then
+ // X is both a predecessor and a successor of U.
+ //
+ // [N*] //
+ // ^ ^ //
+ // / \ //
+ // [U*] [X]? //
+ // ^ ^ //
+ // \ / //
+ // \ / //
+ // [Root*] //
+ //
+ // * indicates nodes to be folded together.
+ //
+ // If Root produces glue, then it gets (even more) interesting. Since it
+ // will be "glued" together with its glue use in the scheduler, we need to
+ // check if it might reach N.
+ //
+ // [N*] //
+ // ^ ^ //
+ // / \ //
+ // [U*] [X]? //
+ // ^ ^ //
+ // \ \ //
+ // \ | //
+ // [Root*] | //
+ // ^ | //
+ // f | //
+ // | / //
+ // [Y] / //
+ // ^ / //
+ // f / //
+ // | / //
+ // [GU] //
+ //
+ // If GU (glue use) indirectly reaches N (the load), and Root folds N
+ // (call it Fold), then X is a predecessor of GU and a successor of
+ // Fold. But since Fold and GU are glued together, this will create
+ // a cycle in the scheduling graph.
+
+ // If the node has glue, walk down the graph to the "lowest" node in the
+ // glued set.
+ EVT VT = Root->getValueType(Root->getNumValues()-1);
+ while (VT == MVT::Glue) {
+ SDNode *GU = findGlueUse(Root);
+ if (!GU)
+ break;
+ Root = GU;
+ VT = Root->getValueType(Root->getNumValues()-1);
+
+ // If our query node has a glue result with a use, we've walked up it. If
+ // the user (which has already been selected) has a chain or indirectly uses
+ // the chain, HandleMergeInputChains will not consider it. Because of
+ // this, we cannot ignore chains in this predicate.
+ IgnoreChains = false;
+ }
+
+ return !findNonImmUse(Root, N.getNode(), U, IgnoreChains);
+}
+
+void SelectionDAGISel::Select_INLINEASM(SDNode *N) {
+ SDLoc DL(N);
+
+ std::vector<SDValue> Ops(N->op_begin(), N->op_end());
+ SelectInlineAsmMemoryOperands(Ops, DL);
+
+ const EVT VTs[] = {MVT::Other, MVT::Glue};
+ SDValue New = CurDAG->getNode(N->getOpcode(), DL, VTs, Ops);
+ New->setNodeId(-1);
+ ReplaceUses(N, New.getNode());
+ CurDAG->RemoveDeadNode(N);
+}
+
+void SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) {
+ SDLoc dl(Op);
+ MDNodeSDNode *MD = cast<MDNodeSDNode>(Op->getOperand(1));
+ const MDString *RegStr = cast<MDString>(MD->getMD()->getOperand(0));
+
+ EVT VT = Op->getValueType(0);
+ LLT Ty = VT.isSimple() ? getLLTForMVT(VT.getSimpleVT()) : LLT();
+ Register Reg =
+ TLI->getRegisterByName(RegStr->getString().data(), Ty,
+ CurDAG->getMachineFunction());
+ SDValue New = CurDAG->getCopyFromReg(
+ Op->getOperand(0), dl, Reg, Op->getValueType(0));
+ New->setNodeId(-1);
+ ReplaceUses(Op, New.getNode());
+ CurDAG->RemoveDeadNode(Op);
+}
+
+void SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) {
+ SDLoc dl(Op);
+ MDNodeSDNode *MD = cast<MDNodeSDNode>(Op->getOperand(1));
+ const MDString *RegStr = cast<MDString>(MD->getMD()->getOperand(0));
+
+ EVT VT = Op->getOperand(2).getValueType();
+ LLT Ty = VT.isSimple() ? getLLTForMVT(VT.getSimpleVT()) : LLT();
+
+ Register Reg = TLI->getRegisterByName(RegStr->getString().data(), Ty,
+ CurDAG->getMachineFunction());
+ SDValue New = CurDAG->getCopyToReg(
+ Op->getOperand(0), dl, Reg, Op->getOperand(2));
+ New->setNodeId(-1);
+ ReplaceUses(Op, New.getNode());
+ CurDAG->RemoveDeadNode(Op);
+}
+
+void SelectionDAGISel::Select_UNDEF(SDNode *N) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
+}
+
+void SelectionDAGISel::Select_FREEZE(SDNode *N) {
+ // TODO: There is no FREEZE pseudo-instruction at the MachineInstr level yet.
+ // If one is added later, the code below must be changed as well.
+ CurDAG->SelectNodeTo(N, TargetOpcode::COPY, N->getValueType(0),
+ N->getOperand(0));
+}
+
+void SelectionDAGISel::Select_ARITH_FENCE(SDNode *N) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::ARITH_FENCE, N->getValueType(0),
+ N->getOperand(0));
+}
+
+void SelectionDAGISel::Select_MEMBARRIER(SDNode *N) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::MEMBARRIER, N->getValueType(0),
+ N->getOperand(0));
+}
+
+void SelectionDAGISel::pushStackMapLiveVariable(SmallVectorImpl<SDValue> &Ops,
+ SDValue OpVal, SDLoc DL) {
+ SDNode *OpNode = OpVal.getNode();
+
+ // FrameIndex nodes should have been directly emitted to TargetFrameIndex
+ // nodes at DAG-construction time.
+ assert(OpNode->getOpcode() != ISD::FrameIndex);
+
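+ // Constants are emitted as a StackMaps::ConstantOp marker followed by the
+ // constant value rather than being passed through as a plain operand.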
+ if (OpNode->getOpcode() == ISD::Constant) {
+ Ops.push_back(
+ CurDAG->getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
+ Ops.push_back(
+ CurDAG->getTargetConstant(cast<ConstantSDNode>(OpNode)->getZExtValue(),
+ DL, OpVal.getValueType()));
+ } else {
+ Ops.push_back(OpVal);
+ }
+}
+
+void SelectionDAGISel::Select_STACKMAP(SDNode *N) {
+ SmallVector<SDValue, 32> Ops;
+ auto *It = N->op_begin();
+ SDLoc DL(N);
+
+ // Stash the chain and glue operands so we can move them to the end.
+ SDValue Chain = *It++;
+ SDValue InGlue = *It++;
+
+ // <id> operand.
+ SDValue ID = *It++;
+ assert(ID.getValueType() == MVT::i64);
+ Ops.push_back(ID);
+
+ // <numShadowBytes> operand.
+ SDValue Shad = *It++;
+ assert(Shad.getValueType() == MVT::i32);
+ Ops.push_back(Shad);
+
+ // Live variable operands.
+ for (; It != N->op_end(); It++)
+ pushStackMapLiveVariable(Ops, *It, DL);
+
+ Ops.push_back(Chain);
+ Ops.push_back(InGlue);
+
+ SDVTList NodeTys = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ CurDAG->SelectNodeTo(N, TargetOpcode::STACKMAP, NodeTys, Ops);
+}
+
+void SelectionDAGISel::Select_PATCHPOINT(SDNode *N) {
+ SmallVector<SDValue, 32> Ops;
+ auto *It = N->op_begin();
+ SDLoc DL(N);
+
+ // Cache arguments that will be moved to the end in the target node.
+ SDValue Chain = *It++;
+ std::optional<SDValue> Glue;
+ if (It->getValueType() == MVT::Glue)
+ Glue = *It++;
+ SDValue RegMask = *It++;
+
+ // <id> operand.
+ SDValue ID = *It++;
+ assert(ID.getValueType() == MVT::i64);
+ Ops.push_back(ID);
+
+ // <numShadowBytes> operand.
+ SDValue Shad = *It++;
+ assert(Shad.getValueType() == MVT::i32);
+ Ops.push_back(Shad);
+
+ // Add the callee.
+ Ops.push_back(*It++);
+
+ // Add <numArgs>.
+ SDValue NumArgs = *It++;
+ assert(NumArgs.getValueType() == MVT::i32);
+ Ops.push_back(NumArgs);
+
+ // Calling convention.
+ Ops.push_back(*It++);
+
+ // Push the args for the call.
+ for (uint64_t I = cast<ConstantSDNode>(NumArgs)->getZExtValue(); I != 0; I--)
+ Ops.push_back(*It++);
+
+ // Now push the live variables.
+ for (; It != N->op_end(); It++)
+ pushStackMapLiveVariable(Ops, *It, DL);
+
+ // Finally, the regmask, chain and (if present) glue are moved to the end.
+ Ops.push_back(RegMask);
+ Ops.push_back(Chain);
+ if (Glue.has_value())
+ Ops.push_back(*Glue);
+
+ SDVTList NodeTys = N->getVTList();
+ CurDAG->SelectNodeTo(N, TargetOpcode::PATCHPOINT, NodeTys, Ops);
+}
+
+ /// GetVBR - Decode a VBR-encoded value whose top bit is set.
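+ /// Each byte supplies seven payload bits; a set high bit means another byte
+ /// follows.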
+LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t
+GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
+ assert(Val >= 128 && "Not a VBR");
+ Val &= 127; // Remove first vbr bit.
+
+ unsigned Shift = 7;
+ uint64_t NextBits;
+ do {
+ NextBits = MatcherTable[Idx++];
+ Val |= (NextBits&127) << Shift;
+ Shift += 7;
+ } while (NextBits & 128);
+
+ return Val;
+}
+
+/// When a match is complete, this method updates uses of interior chain results
+/// to use the new results.
+void SelectionDAGISel::UpdateChains(
+ SDNode *NodeToMatch, SDValue InputChain,
+ SmallVectorImpl<SDNode *> &ChainNodesMatched, bool isMorphNodeTo) {
+ SmallVector<SDNode*, 4> NowDeadNodes;
+
+ // Now that all the normal results are replaced, we replace the chain and
+ // glue results if present.
+ if (!ChainNodesMatched.empty()) {
+ assert(InputChain.getNode() &&
+ "Matched input chains but didn't produce a chain");
+ // Loop over all of the nodes we matched that produced a chain result.
+ // Replace all the chain results with the final chain we ended up with.
+ for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+ SDNode *ChainNode = ChainNodesMatched[i];
+ // If ChainNode is null, it's because we replaced it on a previous
+ // iteration and we cleared it out of the map. Just skip it.
+ if (!ChainNode)
+ continue;
+
+ assert(ChainNode->getOpcode() != ISD::DELETED_NODE &&
+ "Deleted node left in chain");
+
+ // Don't replace the results of the root node if we're doing a
+ // MorphNodeTo.
+ if (ChainNode == NodeToMatch && isMorphNodeTo)
+ continue;
+
+ SDValue ChainVal = SDValue(ChainNode, ChainNode->getNumValues()-1);
+ if (ChainVal.getValueType() == MVT::Glue)
+ ChainVal = ChainVal.getValue(ChainVal->getNumValues()-2);
+ assert(ChainVal.getValueType() == MVT::Other && "Not a chain?");
+ SelectionDAG::DAGNodeDeletedListener NDL(
+ *CurDAG, [&](SDNode *N, SDNode *E) {
+ std::replace(ChainNodesMatched.begin(), ChainNodesMatched.end(), N,
+ static_cast<SDNode *>(nullptr));
+ });
+ if (ChainNode->getOpcode() != ISD::TokenFactor)
+ ReplaceUses(ChainVal, InputChain);
+
+ // If the node became dead and we haven't already seen it, delete it.
+ if (ChainNode != NodeToMatch && ChainNode->use_empty() &&
+ !llvm::is_contained(NowDeadNodes, ChainNode))
+ NowDeadNodes.push_back(ChainNode);
+ }
+ }
+
+ if (!NowDeadNodes.empty())
+ CurDAG->RemoveDeadNodes(NowDeadNodes);
+
+ LLVM_DEBUG(dbgs() << "ISEL: Match complete!\n");
+}
+
+/// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains
+ /// operation for when the pattern matched at least one node with a chain. The
+/// input vector contains a list of all of the chained nodes that we match. We
+/// must determine if this is a valid thing to cover (i.e. matching it won't
+ /// induce cycles in the DAG) and, if so, create a TokenFactor node that will
+ /// be used as the input node chain for the generated nodes.
+static SDValue
+HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
+ SelectionDAG *CurDAG) {
+
+ SmallPtrSet<const SDNode *, 16> Visited;
+ SmallVector<const SDNode *, 8> Worklist;
+ SmallVector<SDValue, 3> InputChains;
+ unsigned int Max = 8192;
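+ // Max bounds the number of nodes the cycle check below may visit.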
+
+ // Quick exit on trivial merge.
+ if (ChainNodesMatched.size() == 1)
+ return ChainNodesMatched[0]->getOperand(0);
+
+ // Add chains that aren't already added (internal). Peek through
+ // token factors.
+ std::function<void(const SDValue)> AddChains = [&](const SDValue V) {
+ if (V.getValueType() != MVT::Other)
+ return;
+ if (V->getOpcode() == ISD::EntryToken)
+ return;
+ if (!Visited.insert(V.getNode()).second)
+ return;
+ if (V->getOpcode() == ISD::TokenFactor) {
+ for (const SDValue &Op : V->op_values())
+ AddChains(Op);
+ } else
+ InputChains.push_back(V);
+ };
+
+ for (auto *N : ChainNodesMatched) {
+ Worklist.push_back(N);
+ Visited.insert(N);
+ }
+
+ while (!Worklist.empty())
+ AddChains(Worklist.pop_back_val()->getOperand(0));
+
+ // Skip the search if there are no chain dependencies.
+ if (InputChains.size() == 0)
+ return CurDAG->getEntryNode();
+
+ // If any of the to-be-merged nodes is a predecessor of one of these input
+ // chains, then some node would be both a predecessor and a successor of the
+ // to-be-merged nodes, creating a cycle. Fail.
+ Visited.clear();
+ for (SDValue V : InputChains)
+ Worklist.push_back(V.getNode());
+
+ for (auto *N : ChainNodesMatched)
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist, Max, true))
+ return SDValue();
+
+ // Return merged chain.
+ if (InputChains.size() == 1)
+ return InputChains[0];
+ return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
+ MVT::Other, InputChains);
+}
+
+/// MorphNode - Handle morphing a node in place for the selector.
+SDNode *SelectionDAGISel::
+MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
+ ArrayRef<SDValue> Ops, unsigned EmitNodeInfo) {
+ // It is possible we're using MorphNodeTo to replace a node with no
+ // normal results with one that has a normal result (or we could be
+ // adding a chain) and the input could have glue and chains as well.
+ // In this case we need to shift the operands down.
+ // FIXME: This is a horrible hack and broken in obscure cases, no worse
+ // than the old isel though.
+ int OldGlueResultNo = -1, OldChainResultNo = -1;
+
+ unsigned NTMNumResults = Node->getNumValues();
+ if (Node->getValueType(NTMNumResults-1) == MVT::Glue) {
+ OldGlueResultNo = NTMNumResults-1;
+ if (NTMNumResults != 1 &&
+ Node->getValueType(NTMNumResults-2) == MVT::Other)
+ OldChainResultNo = NTMNumResults-2;
+ } else if (Node->getValueType(NTMNumResults-1) == MVT::Other)
+ OldChainResultNo = NTMNumResults-1;
+
+ // Call the underlying SelectionDAG routine to do the transmogrification. Note
+ // that this deletes operands of the old node that become dead.
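+ // Machine opcodes are stored bit-inverted in the SDNode opcode field, hence
+ // the ~TargetOpc here.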
+ SDNode *Res = CurDAG->MorphNodeTo(Node, ~TargetOpc, VTList, Ops);
+
+ // MorphNodeTo can operate in two ways: if an existing node with the
+ // specified operands exists, it can just return it. Otherwise, it
+ // updates the node in place to have the requested operands.
+ if (Res == Node) {
+ // If we updated the node in place, reset the node ID. To the isel,
+ // this should be just like a newly allocated machine node.
+ Res->setNodeId(-1);
+ }
+
+ unsigned ResNumResults = Res->getNumValues();
+ // Move the glue if needed.
+ if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 &&
+ (unsigned)OldGlueResultNo != ResNumResults-1)
+ ReplaceUses(SDValue(Node, OldGlueResultNo),
+ SDValue(Res, ResNumResults - 1));
+
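+ // The glue result, if any, is always last; skip past it so that the chain
+ // result is found at ResNumResults-1 below.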
+ if ((EmitNodeInfo & OPFL_GlueOutput) != 0)
+ --ResNumResults;
+
+ // Move the chain reference if needed.
+ if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
+ (unsigned)OldChainResultNo != ResNumResults-1)
+ ReplaceUses(SDValue(Node, OldChainResultNo),
+ SDValue(Res, ResNumResults - 1));
+
+ // If an equivalent node already existed (so MorphNodeTo returned a different
+ // node rather than updating in place), replace uses of the old node with it.
+ if (Res != Node) {
+ ReplaceNode(Node, Res);
+ } else {
+ EnforceNodeIdInvariant(Res);
+ }
+
+ return Res;
+}
+
+/// CheckSame - Implements OP_CheckSame.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N,
+ const SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes) {
+ // Accept if it is exactly the same as a previously recorded node.
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+ return N == RecordedNodes[RecNo].first;
+}
+
+/// CheckChildSame - Implements OP_CheckChildXSame.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool CheckChildSame(
+ const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N,
+ const SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes,
+ unsigned ChildNo) {
+ if (ChildNo >= N.getNumOperands())
+ return false; // Match fails if out of range child #.
+ return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
+ RecordedNodes);
+}
+
+/// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ const SelectionDAGISel &SDISel) {
+ return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]);
+}
+
+/// CheckNodePredicate - Implements OP_CheckNodePredicate.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ const SelectionDAGISel &SDISel, SDNode *N) {
+ return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDNode *N) {
+ uint16_t Opc = MatcherTable[MatcherIndex++];
+ Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+ return N->getOpcode() == Opc;
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N,
+ const TargetLowering *TLI, const DataLayout &DL) {
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ if (N.getValueType() == VT) return true;
+
+ // Handle the case when VT is iPTR.
+ return VT == MVT::iPTR && N.getValueType() == TLI->getPointerTy(DL);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, const TargetLowering *TLI, const DataLayout &DL,
+ unsigned ChildNo) {
+ if (ChildNo >= N.getNumOperands())
+ return false; // Match fails if out of range child #.
+ return ::CheckType(MatcherTable, MatcherIndex, N.getOperand(ChildNo), TLI,
+ DL);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N) {
+ return cast<CondCodeSDNode>(N)->get() ==
+ (ISD::CondCode)MatcherTable[MatcherIndex++];
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckChild2CondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N) {
+ if (2 >= N.getNumOperands())
+ return false;
+ return ::CheckCondCode(MatcherTable, MatcherIndex, N.getOperand(2));
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, const TargetLowering *TLI, const DataLayout &DL) {
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ if (cast<VTSDNode>(N)->getVT() == VT)
+ return true;
+
+ // Handle the case when VT is iPTR.
+ return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI->getPointerTy(DL);
+}
+
+// Bit 0 stores the sign of the immediate. The upper bits contain the magnitude
+// shifted left by 1.
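+ // For example, 4 (0b100) decodes to 2, 5 (0b101) decodes to -2, and the
+ // encoding 1 decodes to INT64_MIN.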
+static uint64_t decodeSignRotatedValue(uint64_t V) {
+ if ((V & 1) == 0)
+ return V >> 1;
+ if (V != 1)
+ return -(V >> 1);
+ // There is no such thing as -0 with integers. "-0" really means MININT.
+ return 1ULL << 63;
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N) {
+ int64_t Val = MatcherTable[MatcherIndex++];
+ if (Val & 128)
+ Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+ Val = decodeSignRotatedValue(Val);
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
+ return C && C->getSExtValue() == Val;
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckChildInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, unsigned ChildNo) {
+ if (ChildNo >= N.getNumOperands())
+ return false; // Match fails if out of range child #.
+ return ::CheckInteger(MatcherTable, MatcherIndex, N.getOperand(ChildNo));
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, const SelectionDAGISel &SDISel) {
+ int64_t Val = MatcherTable[MatcherIndex++];
+ if (Val & 128)
+ Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+ if (N->getOpcode() != ISD::AND) return false;
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ return C && SDISel.CheckAndMask(N.getOperand(0), C, Val);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N,
+ const SelectionDAGISel &SDISel) {
+ int64_t Val = MatcherTable[MatcherIndex++];
+ if (Val & 128)
+ Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+ if (N->getOpcode() != ISD::OR) return false;
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ return C && SDISel.CheckOrMask(N.getOperand(0), C, Val);
+}
+
+ /// IsPredicateKnownToFail - If we know how to evaluate the current predicate
+ /// without pushing a scope, do so. If the predicate is known to fail, set
+ /// Result=true (the return value is then irrelevant). If the predicate is
+ /// known to pass or cannot be evaluated here, set Result=false and return the
+ /// MatcherIndex to continue with.
+static unsigned IsPredicateKnownToFail(const unsigned char *Table,
+ unsigned Index, SDValue N,
+ bool &Result,
+ const SelectionDAGISel &SDISel,
+ SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {
+ switch (Table[Index++]) {
+ default:
+ Result = false;
+ return Index-1; // Could not evaluate this predicate.
+ case SelectionDAGISel::OPC_CheckSame:
+ Result = !::CheckSame(Table, Index, N, RecordedNodes);
+ return Index;
+ case SelectionDAGISel::OPC_CheckChild0Same:
+ case SelectionDAGISel::OPC_CheckChild1Same:
+ case SelectionDAGISel::OPC_CheckChild2Same:
+ case SelectionDAGISel::OPC_CheckChild3Same:
+ Result = !::CheckChildSame(Table, Index, N, RecordedNodes,
+ Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Same);
+ return Index;
+ case SelectionDAGISel::OPC_CheckPatternPredicate:
+ Result = !::CheckPatternPredicate(Table, Index, SDISel);
+ return Index;
+ case SelectionDAGISel::OPC_CheckPredicate:
+ Result = !::CheckNodePredicate(Table, Index, SDISel, N.getNode());
+ return Index;
+ case SelectionDAGISel::OPC_CheckOpcode:
+ Result = !::CheckOpcode(Table, Index, N.getNode());
+ return Index;
+ case SelectionDAGISel::OPC_CheckType:
+ Result = !::CheckType(Table, Index, N, SDISel.TLI,
+ SDISel.CurDAG->getDataLayout());
+ return Index;
+ case SelectionDAGISel::OPC_CheckTypeRes: {
+ unsigned Res = Table[Index++];
+ Result = !::CheckType(Table, Index, N.getValue(Res), SDISel.TLI,
+ SDISel.CurDAG->getDataLayout());
+ return Index;
+ }
+ case SelectionDAGISel::OPC_CheckChild0Type:
+ case SelectionDAGISel::OPC_CheckChild1Type:
+ case SelectionDAGISel::OPC_CheckChild2Type:
+ case SelectionDAGISel::OPC_CheckChild3Type:
+ case SelectionDAGISel::OPC_CheckChild4Type:
+ case SelectionDAGISel::OPC_CheckChild5Type:
+ case SelectionDAGISel::OPC_CheckChild6Type:
+ case SelectionDAGISel::OPC_CheckChild7Type:
+ Result = !::CheckChildType(
+ Table, Index, N, SDISel.TLI, SDISel.CurDAG->getDataLayout(),
+ Table[Index - 1] - SelectionDAGISel::OPC_CheckChild0Type);
+ return Index;
+ case SelectionDAGISel::OPC_CheckCondCode:
+ Result = !::CheckCondCode(Table, Index, N);
+ return Index;
+ case SelectionDAGISel::OPC_CheckChild2CondCode:
+ Result = !::CheckChild2CondCode(Table, Index, N);
+ return Index;
+ case SelectionDAGISel::OPC_CheckValueType:
+ Result = !::CheckValueType(Table, Index, N, SDISel.TLI,
+ SDISel.CurDAG->getDataLayout());
+ return Index;
+ case SelectionDAGISel::OPC_CheckInteger:
+ Result = !::CheckInteger(Table, Index, N);
+ return Index;
+ case SelectionDAGISel::OPC_CheckChild0Integer:
+ case SelectionDAGISel::OPC_CheckChild1Integer:
+ case SelectionDAGISel::OPC_CheckChild2Integer:
+ case SelectionDAGISel::OPC_CheckChild3Integer:
+ case SelectionDAGISel::OPC_CheckChild4Integer:
+ Result = !::CheckChildInteger(Table, Index, N,
+ Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Integer);
+ return Index;
+ case SelectionDAGISel::OPC_CheckAndImm:
+ Result = !::CheckAndImm(Table, Index, N, SDISel);
+ return Index;
+ case SelectionDAGISel::OPC_CheckOrImm:
+ Result = !::CheckOrImm(Table, Index, N, SDISel);
+ return Index;
+ }
+}
+
+namespace {
+
+struct MatchScope {
+ /// FailIndex - If this match fails, this is the index to continue with.
+ unsigned FailIndex;
+
+ /// NodeStack - The node stack when the scope was formed.
+ SmallVector<SDValue, 4> NodeStack;
+
+ /// NumRecordedNodes - The number of recorded nodes when the scope was formed.
+ unsigned NumRecordedNodes;
+
+ /// NumMatchedMemRefs - The number of matched memref entries.
+ unsigned NumMatchedMemRefs;
+
+ /// InputChain/InputGlue - The current chain/glue
+ SDValue InputChain, InputGlue;
+
+ /// HasChainNodesMatched - True if the ChainNodesMatched list is non-empty.
+ bool HasChainNodesMatched;
+};
+
+ /// A DAG update listener to keep the matching state (i.e. RecordedNodes and
+ /// MatchScopes) up to date if the target is allowed to change the DAG while
+ /// matching. The X86 addressing mode matcher is an example of this.
+class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
+{
+ SDNode **NodeToMatch;
+ SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes;
+ SmallVectorImpl<MatchScope> &MatchScopes;
+
+public:
+ MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch,
+ SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN,
+ SmallVectorImpl<MatchScope> &MS)
+ : SelectionDAG::DAGUpdateListener(DAG), NodeToMatch(NodeToMatch),
+ RecordedNodes(RN), MatchScopes(MS) {}
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ // Some early-returns here to avoid the search if we deleted the node or
+ // if the update comes from MorphNodeTo (MorphNodeTo is the last thing we
+ // do, so it's unnecessary to update matching state at that point).
+ // Neither of these can occur currently because we only install this
+ // update listener while matching a complex pattern.
+ if (!E || E->isMachineOpcode())
+ return;
+ // Check if NodeToMatch was updated.
+ if (N == *NodeToMatch)
+ *NodeToMatch = E;
+ // Performing linear search here does not matter because we almost never
+ // run this code. You'd have to have a CSE during complex pattern
+ // matching.
+ for (auto &I : RecordedNodes)
+ if (I.first.getNode() == N)
+ I.first.setNode(E);
+
+ for (auto &I : MatchScopes)
+ for (auto &J : I.NodeStack)
+ if (J.getNode() == N)
+ J.setNode(E);
+ }
+};
+
+} // end anonymous namespace
+
+void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
+ const unsigned char *MatcherTable,
+ unsigned TableSize) {
+ // FIXME: Should these even be selected? Handle these cases in the caller?
+ switch (NodeToMatch->getOpcode()) {
+ default:
+ break;
+ case ISD::EntryToken: // These nodes remain the same.
+ case ISD::BasicBlock:
+ case ISD::Register:
+ case ISD::RegisterMask:
+ case ISD::HANDLENODE:
+ case ISD::MDNODE_SDNODE:
+ case ISD::TargetConstant:
+ case ISD::TargetConstantFP:
+ case ISD::TargetConstantPool:
+ case ISD::TargetFrameIndex:
+ case ISD::TargetExternalSymbol:
+ case ISD::MCSymbol:
+ case ISD::TargetBlockAddress:
+ case ISD::TargetJumpTable:
+ case ISD::TargetGlobalTLSAddress:
+ case ISD::TargetGlobalAddress:
+ case ISD::TokenFactor:
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ case ISD::EH_LABEL:
+ case ISD::ANNOTATION_LABEL:
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
+ case ISD::PSEUDO_PROBE:
+ NodeToMatch->setNodeId(-1); // Mark selected.
+ return;
+ case ISD::AssertSext:
+ case ISD::AssertZext:
+ case ISD::AssertAlign:
+ ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0));
+ CurDAG->RemoveDeadNode(NodeToMatch);
+ return;
+ case ISD::INLINEASM:
+ case ISD::INLINEASM_BR:
+ Select_INLINEASM(NodeToMatch);
+ return;
+ case ISD::READ_REGISTER:
+ Select_READ_REGISTER(NodeToMatch);
+ return;
+ case ISD::WRITE_REGISTER:
+ Select_WRITE_REGISTER(NodeToMatch);
+ return;
+ case ISD::UNDEF:
+ Select_UNDEF(NodeToMatch);
+ return;
+ case ISD::FREEZE:
+ Select_FREEZE(NodeToMatch);
+ return;
+ case ISD::ARITH_FENCE:
+ Select_ARITH_FENCE(NodeToMatch);
+ return;
+ case ISD::MEMBARRIER:
+ Select_MEMBARRIER(NodeToMatch);
+ return;
+ case ISD::STACKMAP:
+ Select_STACKMAP(NodeToMatch);
+ return;
+ case ISD::PATCHPOINT:
+ Select_PATCHPOINT(NodeToMatch);
+ return;
+ }
+
+ assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
+
+ // Set up the node stack with NodeToMatch as the only node on the stack.
+ SmallVector<SDValue, 8> NodeStack;
+ SDValue N = SDValue(NodeToMatch, 0);
+ NodeStack.push_back(N);
+
+ // MatchScopes - Scopes used when matching; if a match failure happens, this
+ // indicates where to continue checking.
+ SmallVector<MatchScope, 8> MatchScopes;
+
+ // RecordedNodes - This is the set of nodes that have been recorded by the
+ // state machine. The second value is the parent of the node, or null if the
+ // root is recorded.
+ SmallVector<std::pair<SDValue, SDNode*>, 8> RecordedNodes;
+
+ // MatchedMemRefs - This is the set of MemRef's we've seen in the input
+ // pattern.
+ SmallVector<MachineMemOperand*, 2> MatchedMemRefs;
+
+ // These are the current input chain and glue for use when generating nodes.
+ // Various Emit operations change these. For example, emitting a copytoreg
+ // uses and updates these.
+ SDValue InputChain, InputGlue;
+
+ // ChainNodesMatched - If a pattern matches nodes that have input/output
+ // chains, the OPC_EmitMergeInputChains operation is emitted which indicates
+ // which ones they are. The result is captured into this list so that we can
+ // update the chain results when the pattern is complete.
+ SmallVector<SDNode*, 3> ChainNodesMatched;
+
+ LLVM_DEBUG(dbgs() << "ISEL: Starting pattern match\n");
+
+ // Determine where to start the interpreter. Normally we start at opcode #0,
+ // but if the state machine starts with an OPC_SwitchOpcode, then we
+ // accelerate the first lookup (which is guaranteed to be hot) with the
+ // OpcodeOffset table.
+ unsigned MatcherIndex = 0;
+
+ if (!OpcodeOffset.empty()) {
+ // Already computed the OpcodeOffset table, just index into it.
+ if (N.getOpcode() < OpcodeOffset.size())
+ MatcherIndex = OpcodeOffset[N.getOpcode()];
+ LLVM_DEBUG(dbgs() << " Initial Opcode index to " << MatcherIndex << "\n");
+
+ } else if (MatcherTable[0] == OPC_SwitchOpcode) {
+ // Otherwise, the table isn't computed, but the state machine does start
+ // with an OPC_SwitchOpcode instruction. Populate the table now, since this
+ // is the first time we're selecting an instruction.
+ unsigned Idx = 1;
+ while (true) {
+ // Get the size of this case.
+ unsigned CaseSize = MatcherTable[Idx++];
+ if (CaseSize & 128)
+ CaseSize = GetVBR(CaseSize, MatcherTable, Idx);
+ if (CaseSize == 0) break;
+
+ // Get the opcode, add the index to the table.
+ uint16_t Opc = MatcherTable[Idx++];
+ Opc |= (unsigned short)MatcherTable[Idx++] << 8;
+ if (Opc >= OpcodeOffset.size())
+ OpcodeOffset.resize((Opc+1)*2);
+ OpcodeOffset[Opc] = Idx;
+ Idx += CaseSize;
+ }
+
+ // Okay, do the lookup for the first opcode.
+ if (N.getOpcode() < OpcodeOffset.size())
+ MatcherIndex = OpcodeOffset[N.getOpcode()];
+ }
+
+ while (true) {
+ assert(MatcherIndex < TableSize && "Invalid index");
+#ifndef NDEBUG
+ unsigned CurrentOpcodeIndex = MatcherIndex;
+#endif
+ BuiltinOpcodes Opcode = (BuiltinOpcodes)MatcherTable[MatcherIndex++];
+ switch (Opcode) {
+ case OPC_Scope: {
+ // Okay, the semantics of this operation are that we should push a scope
+ // then evaluate the first child. However, pushing a scope only to have
+ // the first check fail (which then pops it) is inefficient. If we can
+ // determine immediately that the first check (or the first several) will
+ // fail, don't even bother pushing a scope for them.
+ unsigned FailIndex;
+
+ while (true) {
+ unsigned NumToSkip = MatcherTable[MatcherIndex++];
+ if (NumToSkip & 128)
+ NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
+ // Found the end of the scope with no match.
+ if (NumToSkip == 0) {
+ FailIndex = 0;
+ break;
+ }
+
+ FailIndex = MatcherIndex+NumToSkip;
+
+ unsigned MatcherIndexOfPredicate = MatcherIndex;
+ (void)MatcherIndexOfPredicate; // silence warning.
+
+ // If we can't evaluate this predicate without pushing a scope (e.g. if
+ // it is a 'MoveParent') or if the predicate succeeds on this node, we
+ // push the scope and evaluate the full predicate chain.
+ bool Result;
+ MatcherIndex = IsPredicateKnownToFail(MatcherTable, MatcherIndex, N,
+ Result, *this, RecordedNodes);
+ if (!Result)
+ break;
+
+ LLVM_DEBUG(
+ dbgs() << " Skipped scope entry (due to false predicate) at "
+ << "index " << MatcherIndexOfPredicate << ", continuing at "
+ << FailIndex << "\n");
+ ++NumDAGIselRetries;
+
+ // Otherwise, we know that this case of the Scope is guaranteed to fail, so
+ // move to the next case.
+ MatcherIndex = FailIndex;
+ }
+
+ // If the whole scope failed to match, bail.
+ if (FailIndex == 0) break;
+
+ // Push a MatchScope which indicates where to go if the first child fails
+ // to match.
+ MatchScope NewEntry;
+ NewEntry.FailIndex = FailIndex;
+ NewEntry.NodeStack.append(NodeStack.begin(), NodeStack.end());
+ NewEntry.NumRecordedNodes = RecordedNodes.size();
+ NewEntry.NumMatchedMemRefs = MatchedMemRefs.size();
+ NewEntry.InputChain = InputChain;
+ NewEntry.InputGlue = InputGlue;
+ NewEntry.HasChainNodesMatched = !ChainNodesMatched.empty();
+ MatchScopes.push_back(NewEntry);
+ continue;
+ }
+ case OPC_RecordNode: {
+ // Remember this node, it may end up being an operand in the pattern.
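+ // The top of NodeStack is N itself; the entry below it, if any, is N's
+ // parent in the pattern currently being matched.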
+ SDNode *Parent = nullptr;
+ if (NodeStack.size() > 1)
+ Parent = NodeStack[NodeStack.size()-2].getNode();
+ RecordedNodes.push_back(std::make_pair(N, Parent));
+ continue;
+ }
+
+ case OPC_RecordChild0: case OPC_RecordChild1:
+ case OPC_RecordChild2: case OPC_RecordChild3:
+ case OPC_RecordChild4: case OPC_RecordChild5:
+ case OPC_RecordChild6: case OPC_RecordChild7: {
+ unsigned ChildNo = Opcode-OPC_RecordChild0;
+ if (ChildNo >= N.getNumOperands())
+ break; // Match fails if out of range child #.
+
+ RecordedNodes.push_back(std::make_pair(N->getOperand(ChildNo),
+ N.getNode()));
+ continue;
+ }
+ case OPC_RecordMemRef:
+ if (auto *MN = dyn_cast<MemSDNode>(N))
+ MatchedMemRefs.push_back(MN->getMemOperand());
+ else {
+ LLVM_DEBUG(dbgs() << "Expected MemSDNode "; N->dump(CurDAG);
+ dbgs() << '\n');
+ }
+
+ continue;
+
+ case OPC_CaptureGlueInput:
+ // If the current node has an input glue, capture it in InputGlue.
+ if (N->getNumOperands() != 0 &&
+ N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue)
+ InputGlue = N->getOperand(N->getNumOperands()-1);
+ continue;
+
+ case OPC_MoveChild: {
+ unsigned ChildNo = MatcherTable[MatcherIndex++];
+ if (ChildNo >= N.getNumOperands())
+ break; // Match fails if out of range child #.
+ N = N.getOperand(ChildNo);
+ NodeStack.push_back(N);
+ continue;
+ }
+
+ case OPC_MoveChild0: case OPC_MoveChild1:
+ case OPC_MoveChild2: case OPC_MoveChild3:
+ case OPC_MoveChild4: case OPC_MoveChild5:
+ case OPC_MoveChild6: case OPC_MoveChild7: {
+ unsigned ChildNo = Opcode-OPC_MoveChild0;
+ if (ChildNo >= N.getNumOperands())
+ break; // Match fails if out of range child #.
+ N = N.getOperand(ChildNo);
+ NodeStack.push_back(N);
+ continue;
+ }
+
+ case OPC_MoveParent:
+ // Pop the current node off the NodeStack.
+ NodeStack.pop_back();
+ assert(!NodeStack.empty() && "Node stack imbalance!");
+ N = NodeStack.back();
+ continue;
+
+ case OPC_CheckSame:
+ if (!::CheckSame(MatcherTable, MatcherIndex, N, RecordedNodes)) break;
+ continue;
+
+ case OPC_CheckChild0Same: case OPC_CheckChild1Same:
+ case OPC_CheckChild2Same: case OPC_CheckChild3Same:
+ if (!::CheckChildSame(MatcherTable, MatcherIndex, N, RecordedNodes,
+ Opcode-OPC_CheckChild0Same))
+ break;
+ continue;
+
+ case OPC_CheckPatternPredicate:
+ if (!::CheckPatternPredicate(MatcherTable, MatcherIndex, *this)) break;
+ continue;
+ case OPC_CheckPredicate:
+ if (!::CheckNodePredicate(MatcherTable, MatcherIndex, *this,
+ N.getNode()))
+ break;
+ continue;
+ case OPC_CheckPredicateWithOperands: {
+ unsigned OpNum = MatcherTable[MatcherIndex++];
+ SmallVector<SDValue, 8> Operands;
+
+ for (unsigned i = 0; i < OpNum; ++i)
+ Operands.push_back(RecordedNodes[MatcherTable[MatcherIndex++]].first);
+
+ unsigned PredNo = MatcherTable[MatcherIndex++];
+ if (!CheckNodePredicateWithOperands(N.getNode(), PredNo, Operands))
+ break;
+ continue;
+ }
+ case OPC_CheckComplexPat: {
+ unsigned CPNum = MatcherTable[MatcherIndex++];
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid CheckComplexPat");
+
+ // If target can modify DAG during matching, keep the matching state
+ // consistent.
+ std::unique_ptr<MatchStateUpdater> MSU;
+ if (ComplexPatternFuncMutatesDAG())
+ MSU.reset(new MatchStateUpdater(*CurDAG, &NodeToMatch, RecordedNodes,
+ MatchScopes));
+
+ if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo].second,
+ RecordedNodes[RecNo].first, CPNum,
+ RecordedNodes))
+ break;
+ continue;
+ }
+ case OPC_CheckOpcode:
+ if (!::CheckOpcode(MatcherTable, MatcherIndex, N.getNode())) break;
+ continue;
+
+ case OPC_CheckType:
+ if (!::CheckType(MatcherTable, MatcherIndex, N, TLI,
+ CurDAG->getDataLayout()))
+ break;
+ continue;
+
+ case OPC_CheckTypeRes: {
+ unsigned Res = MatcherTable[MatcherIndex++];
+ if (!::CheckType(MatcherTable, MatcherIndex, N.getValue(Res), TLI,
+ CurDAG->getDataLayout()))
+ break;
+ continue;
+ }
+
+ case OPC_SwitchOpcode: {
+ unsigned CurNodeOpcode = N.getOpcode();
+ unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
+ unsigned CaseSize;
+ while (true) {
+ // Get the size of this case.
+ CaseSize = MatcherTable[MatcherIndex++];
+ if (CaseSize & 128)
+ CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
+ if (CaseSize == 0) break;
+
+ uint16_t Opc = MatcherTable[MatcherIndex++];
+ Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+
+ // If the opcode matches, then we will execute this case.
+ if (CurNodeOpcode == Opc)
+ break;
+
+ // Otherwise, skip over this case.
+ MatcherIndex += CaseSize;
+ }
+
+ // If no cases matched, bail out.
+ if (CaseSize == 0) break;
+
+ // Otherwise, execute the case we found.
+ LLVM_DEBUG(dbgs() << " OpcodeSwitch from " << SwitchStart << " to "
+ << MatcherIndex << "\n");
+ continue;
+ }
+
+ case OPC_SwitchType: {
+ MVT CurNodeVT = N.getSimpleValueType();
+ unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
+ unsigned CaseSize;
+ while (true) {
+ // Get the size of this case.
+ CaseSize = MatcherTable[MatcherIndex++];
+ if (CaseSize & 128)
+ CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
+ if (CaseSize == 0) break;
+
+ MVT CaseVT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ if (CaseVT == MVT::iPTR)
+ CaseVT = TLI->getPointerTy(CurDAG->getDataLayout());
+
+ // If the VT matches, then we will execute this case.
+ if (CurNodeVT == CaseVT)
+ break;
+
+ // Otherwise, skip over this case.
+ MatcherIndex += CaseSize;
+ }
+
+ // If no cases matched, bail out.
+ if (CaseSize == 0) break;
+
+ // Otherwise, execute the case we found.
+ LLVM_DEBUG(dbgs() << " TypeSwitch[" << CurNodeVT
+ << "] from " << SwitchStart << " to " << MatcherIndex
+ << '\n');
+ continue;
+ }
+ case OPC_CheckChild0Type: case OPC_CheckChild1Type:
+ case OPC_CheckChild2Type: case OPC_CheckChild3Type:
+ case OPC_CheckChild4Type: case OPC_CheckChild5Type:
+ case OPC_CheckChild6Type: case OPC_CheckChild7Type:
+ if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI,
+ CurDAG->getDataLayout(),
+ Opcode - OPC_CheckChild0Type))
+ break;
+ continue;
+ case OPC_CheckCondCode:
+ if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break;
+ continue;
+ case OPC_CheckChild2CondCode:
+ if (!::CheckChild2CondCode(MatcherTable, MatcherIndex, N)) break;
+ continue;
+ case OPC_CheckValueType:
+ if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI,
+ CurDAG->getDataLayout()))
+ break;
+ continue;
+ case OPC_CheckInteger:
+ if (!::CheckInteger(MatcherTable, MatcherIndex, N)) break;
+ continue;
+ case OPC_CheckChild0Integer: case OPC_CheckChild1Integer:
+ case OPC_CheckChild2Integer: case OPC_CheckChild3Integer:
+ case OPC_CheckChild4Integer:
+ if (!::CheckChildInteger(MatcherTable, MatcherIndex, N,
+ Opcode-OPC_CheckChild0Integer)) break;
+ continue;
+ case OPC_CheckAndImm:
+ if (!::CheckAndImm(MatcherTable, MatcherIndex, N, *this)) break;
+ continue;
+ case OPC_CheckOrImm:
+ if (!::CheckOrImm(MatcherTable, MatcherIndex, N, *this)) break;
+ continue;
+ case OPC_CheckImmAllOnesV:
+ if (!ISD::isConstantSplatVectorAllOnes(N.getNode()))
+ break;
+ continue;
+ case OPC_CheckImmAllZerosV:
+ if (!ISD::isConstantSplatVectorAllZeros(N.getNode()))
+ break;
+ continue;
+
+ case OPC_CheckFoldableChainNode: {
+ assert(NodeStack.size() != 1 && "No parent node");
+ // Verify that all intermediate nodes between the root and this one have
+ // a single use (ignoring chains, which are handled in UpdateChains).
+ bool HasMultipleUses = false;
+ for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i) {
+ unsigned NNonChainUses = 0;
+ SDNode *NS = NodeStack[i].getNode();
+ for (auto UI = NS->use_begin(), UE = NS->use_end(); UI != UE; ++UI)
+ if (UI.getUse().getValueType() != MVT::Other)
+ if (++NNonChainUses > 1) {
+ HasMultipleUses = true;
+ break;
+ }
+ if (HasMultipleUses) break;
+ }
+ if (HasMultipleUses) break;
+
+ // Check to see that the target thinks this is profitable to fold and that
+ // we can fold it without inducing cycles in the graph.
+ if (!IsProfitableToFold(N, NodeStack[NodeStack.size()-2].getNode(),
+ NodeToMatch) ||
+ !IsLegalToFold(N, NodeStack[NodeStack.size()-2].getNode(),
+ NodeToMatch, OptLevel,
+ true/*We validate our own chains*/))
+ break;
+
+ continue;
+ }
+ case OPC_EmitInteger:
+ case OPC_EmitStringInteger: {
+ MVT::SimpleValueType VT =
+ (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ int64_t Val = MatcherTable[MatcherIndex++];
+ if (Val & 128)
+ Val = GetVBR(Val, MatcherTable, MatcherIndex);
+ if (Opcode == OPC_EmitInteger)
+ Val = decodeSignRotatedValue(Val);
+ RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+ CurDAG->getTargetConstant(Val, SDLoc(NodeToMatch),
+ VT), nullptr));
+ continue;
+ }
+ case OPC_EmitRegister: {
+ MVT::SimpleValueType VT =
+ (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ unsigned RegNo = MatcherTable[MatcherIndex++];
+ RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+ CurDAG->getRegister(RegNo, VT), nullptr));
+ continue;
+ }
+ case OPC_EmitRegister2: {
+ // For targets with more than 256 register names, the register enum
+ // values are stored in two bytes in the matcher table (just like
+ // opcodes).
+ MVT::SimpleValueType VT =
+ (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ unsigned RegNo = MatcherTable[MatcherIndex++];
+ RegNo |= MatcherTable[MatcherIndex++] << 8;
+ RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+ CurDAG->getRegister(RegNo, VT), nullptr));
+ continue;
+ }
+
+ case OPC_EmitConvertToTarget: {
+ // Convert from IMM/FPIMM to target version.
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitConvertToTarget");
+ SDValue Imm = RecordedNodes[RecNo].first;
+
+ if (Imm->getOpcode() == ISD::Constant) {
+ const ConstantInt *Val=cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(NodeToMatch),
+ Imm.getValueType());
+ } else if (Imm->getOpcode() == ISD::ConstantFP) {
+ const ConstantFP *Val=cast<ConstantFPSDNode>(Imm)->getConstantFPValue();
+ Imm = CurDAG->getTargetConstantFP(*Val, SDLoc(NodeToMatch),
+ Imm.getValueType());
+ }
+
+ RecordedNodes.push_back(std::make_pair(Imm, RecordedNodes[RecNo].second));
+ continue;
+ }
+
+ case OPC_EmitMergeInputChains1_0: // OPC_EmitMergeInputChains, 1, 0
+ case OPC_EmitMergeInputChains1_1: // OPC_EmitMergeInputChains, 1, 1
+ case OPC_EmitMergeInputChains1_2: { // OPC_EmitMergeInputChains, 1, 2
+ // These are space-optimized forms of OPC_EmitMergeInputChains.
+ assert(!InputChain.getNode() &&
+ "EmitMergeInputChains should be the first chain producing node");
+ assert(ChainNodesMatched.empty() &&
+ "Should only have one EmitMergeInputChains per match");
+
+ // Read all of the chained nodes.
+ unsigned RecNo = Opcode - OPC_EmitMergeInputChains1_0;
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
+ ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
+
+ // If the chained node is not the root, we can't fold it if it has
+ // multiple uses.
+ // FIXME: What if other value results of the node have uses not matched
+ // by this pattern?
+ if (ChainNodesMatched.back() != NodeToMatch &&
+ !RecordedNodes[RecNo].first.hasOneUse()) {
+ ChainNodesMatched.clear();
+ break;
+ }
+
+ // Merge the input chains if they are not intra-pattern references.
+ InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);
+
+ if (!InputChain.getNode())
+ break; // Failed to merge.
+ continue;
+ }
+
+ case OPC_EmitMergeInputChains: {
+ assert(!InputChain.getNode() &&
+ "EmitMergeInputChains should be the first chain producing node");
+ // This node gets a list of nodes we matched in the input that have
+ // chains. We want to token factor all of the input chains to these nodes
+ // together. However, if any of the input chains is actually one of the
+ // nodes matched in this pattern, then we have an intra-match reference.
+ // Ignore these because the newly token factored chain should not refer to
+ // the old nodes.
+ unsigned NumChains = MatcherTable[MatcherIndex++];
+ assert(NumChains != 0 && "Can't TF zero chains");
+
+ assert(ChainNodesMatched.empty() &&
+ "Should only have one EmitMergeInputChains per match");
+
+ // Read all of the chained nodes.
+ for (unsigned i = 0; i != NumChains; ++i) {
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
+ ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
+
+ // If the chained node is not the root, we can't fold it if it has
+ // multiple uses.
+ // FIXME: What if other value results of the node have uses not matched
+ // by this pattern?
+ if (ChainNodesMatched.back() != NodeToMatch &&
+ !RecordedNodes[RecNo].first.hasOneUse()) {
+ ChainNodesMatched.clear();
+ break;
+ }
+ }
+
+ // If the inner loop broke out, the match fails.
+ if (ChainNodesMatched.empty())
+ break;
+
+ // Merge the input chains if they are not intra-pattern references.
+ InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);
+
+ if (!InputChain.getNode())
+ break; // Failed to merge.
+
+ continue;
+ }
+
+ case OPC_EmitCopyToReg:
+ case OPC_EmitCopyToReg2: {
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg");
+ unsigned DestPhysReg = MatcherTable[MatcherIndex++];
+ if (Opcode == OPC_EmitCopyToReg2)
+ DestPhysReg |= MatcherTable[MatcherIndex++] << 8;
+
+ if (!InputChain.getNode())
+ InputChain = CurDAG->getEntryNode();
+
+ InputChain = CurDAG->getCopyToReg(InputChain, SDLoc(NodeToMatch),
+ DestPhysReg, RecordedNodes[RecNo].first,
+ InputGlue);
+
+ InputGlue = InputChain.getValue(1);
+ continue;
+ }
+
+ case OPC_EmitNodeXForm: {
+ unsigned XFormNo = MatcherTable[MatcherIndex++];
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitNodeXForm");
+ SDValue Res = RunSDNodeXForm(RecordedNodes[RecNo].first, XFormNo);
+ RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr));
+ continue;
+ }
+ case OPC_Coverage: {
+ // This is emitted right before MorphNode/EmitNode, so it should be safe
+ // to assume that this node has been selected.
+ unsigned index = MatcherTable[MatcherIndex++];
+ index |= (MatcherTable[MatcherIndex++] << 8);
+ dbgs() << "COVERED: " << getPatternForIndex(index) << "\n";
+ dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n";
+ continue;
+ }
+
+ case OPC_EmitNode: case OPC_MorphNodeTo:
+ case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2:
+ case OPC_MorphNodeTo0: case OPC_MorphNodeTo1: case OPC_MorphNodeTo2: {
+ uint16_t TargetOpc = MatcherTable[MatcherIndex++];
+ TargetOpc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+ unsigned EmitNodeInfo = MatcherTable[MatcherIndex++];
+ // Get the result VT list.
+ unsigned NumVTs;
+ // If this is one of the compressed forms, get the number of VTs based
+ // on the Opcode. Otherwise read the next byte from the table.
+ if (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2)
+ NumVTs = Opcode - OPC_MorphNodeTo0;
+ else if (Opcode >= OPC_EmitNode0 && Opcode <= OPC_EmitNode2)
+ NumVTs = Opcode - OPC_EmitNode0;
+ else
+ NumVTs = MatcherTable[MatcherIndex++];
+ SmallVector<EVT, 4> VTs;
+ for (unsigned i = 0; i != NumVTs; ++i) {
+ MVT::SimpleValueType VT =
+ (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ if (VT == MVT::iPTR)
+ VT = TLI->getPointerTy(CurDAG->getDataLayout()).SimpleTy;
+ VTs.push_back(VT);
+ }
+
+ if (EmitNodeInfo & OPFL_Chain)
+ VTs.push_back(MVT::Other);
+ if (EmitNodeInfo & OPFL_GlueOutput)
+ VTs.push_back(MVT::Glue);
+
+ // This is hot code, so optimize the two most common cases of 1 and 2
+ // results.
+ SDVTList VTList;
+ if (VTs.size() == 1)
+ VTList = CurDAG->getVTList(VTs[0]);
+ else if (VTs.size() == 2)
+ VTList = CurDAG->getVTList(VTs[0], VTs[1]);
+ else
+ VTList = CurDAG->getVTList(VTs);
+
+ // Get the operand list.
+ unsigned NumOps = MatcherTable[MatcherIndex++];
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ if (RecNo & 128)
+ RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
+
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitNode");
+ Ops.push_back(RecordedNodes[RecNo].first);
+ }
+
+ // If there are variadic operands to add, handle them now.
+ if (EmitNodeInfo & OPFL_VariadicInfo) {
+ // Determine the start index to copy from.
+ unsigned FirstOpToCopy = getNumFixedFromVariadicInfo(EmitNodeInfo);
+ FirstOpToCopy += (EmitNodeInfo & OPFL_Chain) ? 1 : 0;
+ assert(NodeToMatch->getNumOperands() >= FirstOpToCopy &&
+ "Invalid variadic node");
+ // Copy all of the variadic operands, not including a potential glue
+ // input.
+ for (unsigned i = FirstOpToCopy, e = NodeToMatch->getNumOperands();
+ i != e; ++i) {
+ SDValue V = NodeToMatch->getOperand(i);
+ if (V.getValueType() == MVT::Glue) break;
+ Ops.push_back(V);
+ }
+ }
+
+ // If this has chain/glue inputs, add them.
+ if (EmitNodeInfo & OPFL_Chain)
+ Ops.push_back(InputChain);
+ if ((EmitNodeInfo & OPFL_GlueInput) && InputGlue.getNode() != nullptr)
+ Ops.push_back(InputGlue);
+
+ // Check whether any matched node could raise an FP exception. Since all
+ // such nodes must have a chain, it suffices to check ChainNodesMatched.
+ // We need to perform this check before potentially modifying one of the
+ // nodes via MorphNode.
+ bool MayRaiseFPException =
+ llvm::any_of(ChainNodesMatched, [this](SDNode *N) {
+ return mayRaiseFPException(N) && !N->getFlags().hasNoFPExcept();
+ });
+
+ // Create the node.
+ MachineSDNode *Res = nullptr;
+ bool IsMorphNodeTo = Opcode == OPC_MorphNodeTo ||
+ (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2);
+ if (!IsMorphNodeTo) {
+ // If this is a normal EmitNode command, just create the new node and
+ // add the results to the RecordedNodes list.
+ Res = CurDAG->getMachineNode(TargetOpc, SDLoc(NodeToMatch),
+ VTList, Ops);
+
+ // Add all the non-glue/non-chain results to the RecordedNodes list.
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ if (VTs[i] == MVT::Other || VTs[i] == MVT::Glue) break;
+ RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i),
+ nullptr));
+ }
+ } else {
+ assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE &&
+ "NodeToMatch was removed partway through selection");
+ SelectionDAG::DAGNodeDeletedListener NDL(*CurDAG, [&](SDNode *N,
+ SDNode *E) {
+ CurDAG->salvageDebugInfo(*N);
+ auto &Chain = ChainNodesMatched;
+ assert((!E || !is_contained(Chain, N)) &&
+ "Chain node replaced during MorphNode");
+ llvm::erase_value(Chain, N);
+ });
+ Res = cast<MachineSDNode>(MorphNode(NodeToMatch, TargetOpc, VTList,
+ Ops, EmitNodeInfo));
+ }
+
+ // Set the NoFPExcept flag when no original matched node could
+ // raise an FP exception, but the new node potentially might.
+ if (!MayRaiseFPException && mayRaiseFPException(Res)) {
+ SDNodeFlags Flags = Res->getFlags();
+ Flags.setNoFPExcept(true);
+ Res->setFlags(Flags);
+ }
+
+ // If the node had chain/glue results, update our notion of the current
+ // chain and glue.
+ if (EmitNodeInfo & OPFL_GlueOutput) {
+ InputGlue = SDValue(Res, VTs.size()-1);
+ if (EmitNodeInfo & OPFL_Chain)
+ InputChain = SDValue(Res, VTs.size()-2);
+ } else if (EmitNodeInfo & OPFL_Chain)
+ InputChain = SDValue(Res, VTs.size()-1);
+
+ // If the OPFL_MemRefs flag is set on this node, slap all of the
+ // accumulated memrefs onto it.
+ //
+ // FIXME: This is vastly incorrect for patterns with multiple output
+ // instructions that access memory and for ComplexPatterns that match
+ // loads.
+ if (EmitNodeInfo & OPFL_MemRefs) {
+ // Only attach load or store memory operands if the generated
+ // instruction may load or store.
+ const MCInstrDesc &MCID = TII->get(TargetOpc);
+ bool mayLoad = MCID.mayLoad();
+ bool mayStore = MCID.mayStore();
+
+ // We expect to have relatively few of these so just filter them into a
+ // temporary buffer so that we can easily add them to the instruction.
+ SmallVector<MachineMemOperand *, 4> FilteredMemRefs;
+ for (MachineMemOperand *MMO : MatchedMemRefs) {
+ if (MMO->isLoad()) {
+ if (mayLoad)
+ FilteredMemRefs.push_back(MMO);
+ } else if (MMO->isStore()) {
+ if (mayStore)
+ FilteredMemRefs.push_back(MMO);
+ } else {
+ FilteredMemRefs.push_back(MMO);
+ }
+ }
+
+ CurDAG->setNodeMemRefs(Res, FilteredMemRefs);
+ }
+
+ LLVM_DEBUG(if (!MatchedMemRefs.empty() && Res->memoperands_empty()) dbgs()
+ << " Dropping mem operands\n";
+ dbgs() << " " << (IsMorphNodeTo ? "Morphed" : "Created")
+ << " node: ";
+ Res->dump(CurDAG););
+
+ // If this was a MorphNodeTo then we're completely done!
+ if (IsMorphNodeTo) {
+ // Update chain uses.
+ UpdateChains(Res, InputChain, ChainNodesMatched, true);
+ return;
+ }
+ continue;
+ }
+
+ case OPC_CompleteMatch: {
+ // The match has been completed and any new nodes have been created. Patch
+ // up references to the matched DAG to use the newly created nodes.
+ unsigned NumResults = MatcherTable[MatcherIndex++];
+
+ for (unsigned i = 0; i != NumResults; ++i) {
+ unsigned ResSlot = MatcherTable[MatcherIndex++];
+ if (ResSlot & 128)
+ ResSlot = GetVBR(ResSlot, MatcherTable, MatcherIndex);
+
+ assert(ResSlot < RecordedNodes.size() && "Invalid CompleteMatch");
+ SDValue Res = RecordedNodes[ResSlot].first;
+
+ assert(i < NodeToMatch->getNumValues() &&
+ NodeToMatch->getValueType(i) != MVT::Other &&
+ NodeToMatch->getValueType(i) != MVT::Glue &&
+ "Invalid number of results to complete!");
+ assert((NodeToMatch->getValueType(i) == Res.getValueType() ||
+ NodeToMatch->getValueType(i) == MVT::iPTR ||
+ Res.getValueType() == MVT::iPTR ||
+ NodeToMatch->getValueType(i).getSizeInBits() ==
+ Res.getValueSizeInBits()) &&
+ "invalid replacement");
+ ReplaceUses(SDValue(NodeToMatch, i), Res);
+ }
+
+ // Update chain uses.
+ UpdateChains(NodeToMatch, InputChain, ChainNodesMatched, false);
+
+ // If the root node defines glue, we need to update it to the glue result.
+ // TODO: This never happens in our tests and I think it can be removed /
+ // replaced with an assert, but doing it this way keeps the change NFC.
+ if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) ==
+ MVT::Glue &&
+ InputGlue.getNode())
+ ReplaceUses(SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1),
+ InputGlue);
+
+ assert(NodeToMatch->use_empty() &&
+ "Didn't replace all uses of the node?");
+ CurDAG->RemoveDeadNode(NodeToMatch);
+
+ return;
+ }
+ }
+
+ // If the code reached this point, then the match failed. See if there is
+ // another child to try in the current 'Scope', otherwise pop it until we
+ // find a case to check.
+ LLVM_DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex
+ << "\n");
+ ++NumDAGIselRetries;
+ while (true) {
+ if (MatchScopes.empty()) {
+ CannotYetSelect(NodeToMatch);
+ return;
+ }
+
+ // Restore the interpreter state back to the point where the scope was
+ // formed.
+ MatchScope &LastScope = MatchScopes.back();
+ RecordedNodes.resize(LastScope.NumRecordedNodes);
+ NodeStack.clear();
+ NodeStack.append(LastScope.NodeStack.begin(), LastScope.NodeStack.end());
+ N = NodeStack.back();
+
+ if (LastScope.NumMatchedMemRefs != MatchedMemRefs.size())
+ MatchedMemRefs.resize(LastScope.NumMatchedMemRefs);
+ MatcherIndex = LastScope.FailIndex;
+
+ LLVM_DEBUG(dbgs() << " Continuing at " << MatcherIndex << "\n");
+
+ InputChain = LastScope.InputChain;
+ InputGlue = LastScope.InputGlue;
+ if (!LastScope.HasChainNodesMatched)
+ ChainNodesMatched.clear();
+
+ // Check to see what the offset is at the new MatcherIndex. If it is zero
+ // we have reached the end of this scope, otherwise we have another child
+ // in the current scope to try.
+ unsigned NumToSkip = MatcherTable[MatcherIndex++];
+ if (NumToSkip & 128)
+ NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
+
+ // If we have another child in this scope to match, update FailIndex and
+ // try it.
+ if (NumToSkip != 0) {
+ LastScope.FailIndex = MatcherIndex+NumToSkip;
+ break;
+ }
+
+ // End of this scope, pop it and try the next child in the containing
+ // scope.
+ MatchScopes.pop_back();
+ }
+ }
+}
+
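The ResSlot and NumToSkip reads above follow the matcher table's variable-width integer convention: a byte with bit 7 set carries the low seven bits of a larger value, and continuation bytes follow until one without bit 7. A minimal standalone sketch of that decoding, with made-up names rather than the real GetVBR helper:

    #include <cassert>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Illustrative decoder: FirstByte has already been read from the table and
    // has bit 7 set; keep consuming bytes, seven payload bits at a time, until
    // a byte without the continuation bit is seen.
    static uint64_t decodeVBR(uint64_t FirstByte,
                              const std::vector<uint8_t> &Table, size_t &Idx) {
      assert((FirstByte & 128) && "only continued values are decoded here");
      uint64_t Val = FirstByte & 127;
      unsigned Shift = 7;
      uint8_t Next;
      do {
        Next = Table[Idx++];
        Val |= uint64_t(Next & 127) << Shift;
        Shift += 7;
      } while (Next & 128);
      return Val;
    }

    int main() {
      // 300 encodes as {0xAC, 0x02}: low seven bits plus continuation, then 2.
      std::vector<uint8_t> Table = {0xAC, 0x02};
      size_t Idx = 0;
      uint64_t V = Table[Idx++];
      if (V & 128)
        V = decodeVBR(V, Table, Idx);
      std::cout << V << "\n"; // prints 300
    }
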
+/// Return whether the node may raise an FP exception.
+bool SelectionDAGISel::mayRaiseFPException(SDNode *N) const {
+ // For machine opcodes, consult the MCID flag.
+ if (N->isMachineOpcode()) {
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ return MCID.mayRaiseFPException();
+ }
+
+ // For ISD opcodes, only StrictFP opcodes may raise an FP
+ // exception.
+ if (N->isTargetOpcode())
+ return N->isTargetStrictFPOpcode();
+ return N->isStrictFPOpcode();
+}
+
+bool SelectionDAGISel::isOrEquivalentToAdd(const SDNode *N) const {
+ assert(N->getOpcode() == ISD::OR && "Unexpected opcode");
+ auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return false;
+
+ // Detect when "or" is used to add an offset to a stack object.
+ if (auto *FN = dyn_cast<FrameIndexSDNode>(N->getOperand(0))) {
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ Align A = MFI.getObjectAlign(FN->getIndex());
+ int32_t Off = C->getSExtValue();
+ // If the alleged offset fits in the zero bits guaranteed by
+ // the alignment, then this or is really an add.
+ return (Off >= 0) && (((A.value() - 1) & Off) == unsigned(Off));
+ }
+ return false;
+}
+
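A small self-contained illustration of the check in isOrEquivalentToAdd above, using plain integers rather than LLVM types: when the base is known to be aligned to A bytes, an OR with a non-negative offset that fits entirely in the guaranteed-zero low bits behaves exactly like an ADD.

    #include <cstdint>
    #include <iostream>

    // Returns true when OR-ing Off into a pointer aligned to Alignment bytes is
    // equivalent to adding it: Off must be non-negative and touch only the low
    // bits that the alignment guarantees to be zero.
    static bool orActsLikeAdd(uint64_t Alignment, int64_t Off) {
      return Off >= 0 && ((Alignment - 1) & uint64_t(Off)) == uint64_t(Off);
    }

    int main() {
      std::cout << orActsLikeAdd(16, 12) << "\n"; // 1: 12 fits in the four zero bits
      std::cout << orActsLikeAdd(16, 20) << "\n"; // 0: bit 4 of the base may be set
    }
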
+void SelectionDAGISel::CannotYetSelect(SDNode *N) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "Cannot select: ";
+
+ if (N->getOpcode() != ISD::INTRINSIC_W_CHAIN &&
+ N->getOpcode() != ISD::INTRINSIC_WO_CHAIN &&
+ N->getOpcode() != ISD::INTRINSIC_VOID) {
+ N->printrFull(Msg, CurDAG);
+ Msg << "\nIn function: " << MF->getName();
+ } else {
+ bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other;
+ unsigned iid =
+ cast<ConstantSDNode>(N->getOperand(HasInputChain))->getZExtValue();
+ if (iid < Intrinsic::num_intrinsics)
+ Msg << "intrinsic %" << Intrinsic::getBaseName((Intrinsic::ID)iid);
+ else if (const TargetIntrinsicInfo *TII = TM.getIntrinsicInfo())
+ Msg << "target intrinsic %" << TII->getName(iid);
+ else
+ Msg << "unknown intrinsic #" << iid;
+ }
+ report_fatal_error(Twine(Msg.str()));
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
new file mode 100644
index 000000000000..b66eeb6d2bb1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -0,0 +1,314 @@
+//===-- SelectionDAGPrinter.cpp - Implement SelectionDAG::viewGraph() -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dag-printer"
+
+namespace llvm {
+ template<>
+ struct DOTGraphTraits<SelectionDAG*> : public DefaultDOTGraphTraits {
+
+ explicit DOTGraphTraits(bool isSimple=false) :
+ DefaultDOTGraphTraits(isSimple) {}
+
+ static bool hasEdgeDestLabels() {
+ return true;
+ }
+
+ static unsigned numEdgeDestLabels(const void *Node) {
+ return ((const SDNode *) Node)->getNumValues();
+ }
+
+ static std::string getEdgeDestLabel(const void *Node, unsigned i) {
+ return ((const SDNode *) Node)->getValueType(i).getEVTString();
+ }
+
+ template<typename EdgeIter>
+ static std::string getEdgeSourceLabel(const void *Node, EdgeIter I) {
+ return itostr(I - SDNodeIterator::begin((const SDNode *) Node));
+ }
+
+ /// edgeTargetsEdgeSource - This method returns true if this outgoing edge
+ /// should actually target another edge source, not a node. If this method
+ /// is implemented, getEdgeTarget should be implemented.
+ template<typename EdgeIter>
+ static bool edgeTargetsEdgeSource(const void *Node, EdgeIter I) {
+ return true;
+ }
+
+ /// getEdgeTarget - If edgeTargetsEdgeSource returns true, this method is
+ /// called to determine which outgoing edge of Node is the target of this
+ /// edge.
+ template<typename EdgeIter>
+ static EdgeIter getEdgeTarget(const void *Node, EdgeIter I) {
+ SDNode *TargetNode = *I;
+ SDNodeIterator NI = SDNodeIterator::begin(TargetNode);
+ std::advance(NI, I.getNode()->getOperand(I.getOperand()).getResNo());
+ return NI;
+ }
+
+ static std::string getGraphName(const SelectionDAG *G) {
+ return std::string(G->getMachineFunction().getName());
+ }
+
+ static bool renderGraphFromBottomUp() {
+ return true;
+ }
+
+ static std::string getNodeIdentifierLabel(const SDNode *Node,
+ const SelectionDAG *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+#ifndef NDEBUG
+ OS << 't' << Node->PersistentId;
+#else
+ OS << static_cast<const void *>(Node);
+#endif
+ return R;
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ template<typename EdgeIter>
+ static std::string getEdgeAttributes(const void *Node, EdgeIter EI,
+ const SelectionDAG *Graph) {
+ SDValue Op = EI.getNode()->getOperand(EI.getOperand());
+ EVT VT = Op.getValueType();
+ if (VT == MVT::Glue)
+ return "color=red,style=bold";
+ else if (VT == MVT::Other)
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+
+ static std::string getSimpleNodeLabel(const SDNode *Node,
+ const SelectionDAG *G) {
+ std::string Result = Node->getOperationName(G);
+ {
+ raw_string_ostream OS(Result);
+ Node->print_details(OS, G);
+ }
+ return Result;
+ }
+ std::string getNodeLabel(const SDNode *Node, const SelectionDAG *Graph);
+ static std::string getNodeAttributes(const SDNode *N,
+ const SelectionDAG *Graph) {
+#ifndef NDEBUG
+ const std::string &Attrs = Graph->getGraphAttrs(N);
+ if (!Attrs.empty()) {
+ if (Attrs.find("shape=") == std::string::npos)
+ return std::string("shape=Mrecord,") + Attrs;
+ else
+ return Attrs;
+ }
+#endif
+ return "shape=Mrecord";
+ }
+
+ static void addCustomGraphFeatures(SelectionDAG *G,
+ GraphWriter<SelectionDAG*> &GW) {
+ GW.emitSimpleNode(nullptr, "plaintext=circle", "GraphRoot");
+ if (G->getRoot().getNode())
+ GW.emitEdge(nullptr, -1, G->getRoot().getNode(), G->getRoot().getResNo(),
+ "color=blue,style=dashed");
+ }
+ };
+}
+
+std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
+ const SelectionDAG *G) {
+ return DOTGraphTraits<SelectionDAG*>::getSimpleNodeLabel(Node, G);
+}
+
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void SelectionDAG::viewGraph(const std::string &Title) {
+// This code is only for debugging!
+#ifndef NDEBUG
+ ViewGraph(this, "dag." + getMachineFunction().getName(),
+ false, Title);
+#else
+ errs() << "SelectionDAG::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+// This overload is defined out-of-line here instead of just using a
+// default parameter because this is easiest for gdb to call.
+void SelectionDAG::viewGraph() {
+ viewGraph("");
+}
+
+/// Just dump the dot graph to a user-provided path with the given title.
+/// This doesn't open the dot viewer program, which helps visualization
+/// outside of a debugging session. FileName expects an absolute path; if it
+/// is provided without any path separators, the file will be created in the
+/// current directory. An error will be emitted if the path is insane.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SelectionDAG::dumpDotGraph(const Twine &FileName,
+ const Twine &Title) {
+ dumpDotGraphToFile(this, FileName, Title);
+}
+#endif
+
+/// clearGraphAttrs - Clear all previously defined node graph attributes.
+/// Intended to be used from a debugging tool (eg. gdb).
+void SelectionDAG::clearGraphAttrs() {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ NodeGraphAttrs.clear();
+#else
+ errs() << "SelectionDAG::clearGraphAttrs is only available in builds with "
+ << "ABI breaking checks enabled on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// setGraphAttrs - Set graph attributes for a node. (eg. "color=red".)
+///
+void SelectionDAG::setGraphAttrs(const SDNode *N, const char *Attrs) {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ NodeGraphAttrs[N] = Attrs;
+#else
+ errs() << "SelectionDAG::setGraphAttrs is only available in builds with "
+ << "ABI breaking checks enabled on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// getGraphAttrs - Get graph attributes for a node. (eg. "color=red".)
+/// Used from getNodeAttributes.
+std::string SelectionDAG::getGraphAttrs(const SDNode *N) const {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ std::map<const SDNode *, std::string>::const_iterator I =
+ NodeGraphAttrs.find(N);
+
+ if (I != NodeGraphAttrs.end())
+ return I->second;
+ else
+ return "";
+#else
+ errs() << "SelectionDAG::getGraphAttrs is only available in builds with "
+ << "ABI breaking checks enabled on systems with Graphviz or gv!\n";
+ return std::string();
+#endif
+}
+
+/// setGraphColor - Convenience for setting node color attribute.
+///
+void SelectionDAG::setGraphColor(const SDNode *N, const char *Color) {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+ NodeGraphAttrs[N] = std::string("color=") + Color;
+#else
+ errs() << "SelectionDAG::setGraphColor is only available in builds with "
+ << "ABI breaking checks enabled on systems with Graphviz or gv!\n";
+#endif
+}
+
+/// setSubgraphColorHelper - Implement setSubgraphColor. Return
+/// whether we truncated the search.
+///
+bool SelectionDAG::setSubgraphColorHelper(SDNode *N, const char *Color, DenseSet<SDNode *> &visited,
+ int level, bool &printed) {
+ bool hit_limit = false;
+
+#ifndef NDEBUG
+ if (level >= 20) {
+ if (!printed) {
+ printed = true;
+ LLVM_DEBUG(dbgs() << "setSubgraphColor hit max level\n");
+ }
+ return true;
+ }
+
+ unsigned oldSize = visited.size();
+ visited.insert(N);
+ if (visited.size() != oldSize) {
+ setGraphColor(N, Color);
+ for(SDNodeIterator i = SDNodeIterator::begin(N), iend = SDNodeIterator::end(N);
+ i != iend;
+ ++i) {
+ hit_limit = setSubgraphColorHelper(*i, Color, visited, level+1, printed) || hit_limit;
+ }
+ }
+#else
+ errs() << "SelectionDAG::setSubgraphColor is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+ return hit_limit;
+}
+
+/// setSubgraphColor - Convenience for setting subgraph color attribute.
+///
+void SelectionDAG::setSubgraphColor(SDNode *N, const char *Color) {
+#ifndef NDEBUG
+ DenseSet<SDNode *> visited;
+ bool printed = false;
+ if (setSubgraphColorHelper(N, Color, visited, 0, printed)) {
+ // Visually mark that we hit the limit
+ if (strcmp(Color, "red") == 0) {
+ setSubgraphColorHelper(N, "blue", visited, 0, printed);
+ } else if (strcmp(Color, "yellow") == 0) {
+ setSubgraphColorHelper(N, "green", visited, 0, printed);
+ }
+ }
+
+#else
+ errs() << "SelectionDAG::setSubgraphColor is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
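The helper above is a depth-limited DFS with a visited set; a generic standalone sketch of the same shape follows (the Node type and the depth cap are illustrative, not LLVM's).

    #include <set>
    #include <vector>

    struct Node {
      std::vector<Node *> Operands;
    };

    // Visit N and everything reachable from it, refusing to descend more than
    // 20 levels. Returns true if the walk was truncated so the caller can
    // react (e.g. re-color with a different color, as setSubgraphColor does).
    static bool colorSubgraph(Node *N, std::set<Node *> &Visited, int Level) {
      if (Level >= 20)
        return true;
      if (!Visited.insert(N).second)
        return false; // already handled
      // ... apply the color to N here ...
      bool Truncated = false;
      for (Node *Op : N->Operands)
        Truncated |= colorSubgraph(Op, Visited, Level + 1);
      return Truncated;
    }

    int main() {
      Node A, B;
      A.Operands = {&B};
      std::set<Node *> Visited;
      return colorSubgraph(&A, Visited, 0) ? 1 : 0;
    }
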
+std::string ScheduleDAGSDNodes::getGraphNodeLabel(const SUnit *SU) const {
+ std::string s;
+ raw_string_ostream O(s);
+ O << "SU(" << SU->NodeNum << "): ";
+ if (SU->getNode()) {
+ SmallVector<SDNode *, 4> GluedNodes;
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+ GluedNodes.push_back(N);
+ while (!GluedNodes.empty()) {
+ O << DOTGraphTraits<SelectionDAG*>
+ ::getSimpleNodeLabel(GluedNodes.back(), DAG);
+ GluedNodes.pop_back();
+ if (!GluedNodes.empty())
+ O << "\n ";
+ }
+ } else {
+ O << "CROSS RC COPY";
+ }
+ return O.str();
+}
+
+void ScheduleDAGSDNodes::getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const {
+ if (DAG) {
+ // Draw a special "GraphRoot" node to indicate the root of the graph.
+ GW.emitSimpleNode(nullptr, "plaintext=circle", "GraphRoot");
+ const SDNode *N = DAG->getRoot().getNode();
+ if (N && N->getNodeId() != -1)
+ GW.emitEdge(nullptr, -1, &SUnits[N->getNodeId()], -1,
+ "color=blue,style=dashed");
+ }
+}
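For reference, a tiny standalone program that emits a Graphviz file following the same visual conventions as the traits above: Mrecord-shaped nodes, chain (MVT::Other) edges drawn blue and dashed, glue (MVT::Glue) edges red and bold, and a plaintext GraphRoot node pointing at the root. The node names and output file name are invented for illustration.

    #include <fstream>

    int main() {
      std::ofstream OS("dag-sketch.dot");
      OS << "digraph dag {\n"
         << "  node [shape=Mrecord];\n"
         << "  t0 [label=\"CopyToReg\"];\n"
         << "  t1 [label=\"ADD\"];\n"
         << "  t2 [label=\"EntryToken\"];\n"
         << "  t0 -> t1;\n"                           // ordinary value edge
         << "  t0 -> t2 [color=blue,style=dashed];\n" // chain edge
         << "  GraphRoot [shape=plaintext];\n"
         << "  GraphRoot -> t0 [color=blue,style=dashed];\n"
         << "}\n";
      return 0;
    }
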
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
new file mode 100644
index 000000000000..3a2df6f60593
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
@@ -0,0 +1,17 @@
+//===- SelectionDAGTargetInfo.cpp - SelectionDAG Info ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAGTargetInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+
+using namespace llvm;
+
+SelectionDAGTargetInfo::~SelectionDAGTargetInfo() = default;
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
new file mode 100644
index 000000000000..5afd05648772
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -0,0 +1,1313 @@
+//===- StatepointLowering.cpp - SDAGBuilder's statepoint code -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code use by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "StatepointLowering.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GCStrategy.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "statepoint-lowering"
+
+STATISTIC(NumSlotsAllocatedForStatepoints,
+ "Number of stack slots allocated for statepoints");
+STATISTIC(NumOfStatepoints, "Number of statepoint nodes encountered");
+STATISTIC(StatepointMaxSlotsRequired,
+ "Maximum number of stack slots required for a singe statepoint");
+
+cl::opt<bool> UseRegistersForDeoptValues(
+ "use-registers-for-deopt-values", cl::Hidden, cl::init(false),
+ cl::desc("Allow using registers for non pointer deopt args"));
+
+cl::opt<bool> UseRegistersForGCPointersInLandingPad(
+ "use-registers-for-gc-values-in-landing-pad", cl::Hidden, cl::init(false),
+ cl::desc("Allow using registers for gc pointer in landing pad"));
+
+cl::opt<unsigned> MaxRegistersForGCPointers(
+ "max-registers-for-gc-values", cl::Hidden, cl::init(0),
+ cl::desc("Max number of VRegs allowed to pass GC pointer meta args in"));
+
+typedef FunctionLoweringInfo::StatepointRelocationRecord RecordType;
+
+static void pushStackMapConstant(SmallVectorImpl<SDValue>& Ops,
+ SelectionDAGBuilder &Builder, uint64_t Value) {
+ SDLoc L = Builder.getCurSDLoc();
+ Ops.push_back(Builder.DAG.getTargetConstant(StackMaps::ConstantOp, L,
+ MVT::i64));
+ Ops.push_back(Builder.DAG.getTargetConstant(Value, L, MVT::i64));
+}
+
+void StatepointLoweringState::startNewStatepoint(SelectionDAGBuilder &Builder) {
+ // Consistency check
+ assert(PendingGCRelocateCalls.empty() &&
+ "Trying to visit statepoint before finished processing previous one");
+ Locations.clear();
+ NextSlotToAllocate = 0;
+ // We need to resize this on each safepoint: the two must stay in sync, and
+ // SelectionDAGBuilder's clearing patterns bear no relation to
+ // FunctionLoweringInfo's. We also need to ensure used bits get cleared.
+ AllocatedStackSlots.clear();
+ AllocatedStackSlots.resize(Builder.FuncInfo.StatepointStackSlots.size());
+}
+
+void StatepointLoweringState::clear() {
+ Locations.clear();
+ AllocatedStackSlots.clear();
+ assert(PendingGCRelocateCalls.empty() &&
+ "cleared before statepoint sequence completed");
+}
+
+SDValue
+StatepointLoweringState::allocateStackSlot(EVT ValueType,
+ SelectionDAGBuilder &Builder) {
+ NumSlotsAllocatedForStatepoints++;
+ MachineFrameInfo &MFI = Builder.DAG.getMachineFunction().getFrameInfo();
+
+ unsigned SpillSize = ValueType.getStoreSize();
+ assert((SpillSize * 8) ==
+ (-8u & (7 + ValueType.getSizeInBits())) && // Round up modulo 8.
+ "Size not in bytes?");
+
+ // First look for a previously created stack slot which is not in use
+ // (accounting for the fact that arbitrary slots may already be reserved);
+ // if none is found, create a new stack slot and use it.
+
+ const size_t NumSlots = AllocatedStackSlots.size();
+ assert(NextSlotToAllocate <= NumSlots && "Broken invariant");
+
+ assert(AllocatedStackSlots.size() ==
+ Builder.FuncInfo.StatepointStackSlots.size() &&
+ "Broken invariant");
+
+ for (; NextSlotToAllocate < NumSlots; NextSlotToAllocate++) {
+ if (!AllocatedStackSlots.test(NextSlotToAllocate)) {
+ const int FI = Builder.FuncInfo.StatepointStackSlots[NextSlotToAllocate];
+ if (MFI.getObjectSize(FI) == SpillSize) {
+ AllocatedStackSlots.set(NextSlotToAllocate);
+ // TODO: Is ValueType the right thing to use here?
+ return Builder.DAG.getFrameIndex(FI, ValueType);
+ }
+ }
+ }
+
+ // Couldn't find a free slot, so create a new one:
+
+ SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType);
+ const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ MFI.markAsStatepointSpillSlotObjectIndex(FI);
+
+ Builder.FuncInfo.StatepointStackSlots.push_back(FI);
+ AllocatedStackSlots.resize(AllocatedStackSlots.size()+1, true);
+ assert(AllocatedStackSlots.size() ==
+ Builder.FuncInfo.StatepointStackSlots.size() &&
+ "Broken invariant");
+
+ StatepointMaxSlotsRequired.updateMax(
+ Builder.FuncInfo.StatepointStackSlots.size());
+
+ return SpillSlot;
+}
+
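A standalone caricature of the reuse policy in allocateStackSlot: scan forward over previously created slots, take the first free one whose size matches, and otherwise append a brand new slot. The SlotPool type is invented for illustration and is not an LLVM data structure.

    #include <cstddef>
    #include <vector>

    struct SlotPool {
      std::vector<bool> Allocated;  // plays the role of AllocatedStackSlots
      std::vector<unsigned> Sizes;  // plays the role of the frame objects' sizes
      size_t NextToTry = 0;

      int allocate(unsigned Size) {
        for (; NextToTry < Allocated.size(); ++NextToTry) {
          if (!Allocated[NextToTry] && Sizes[NextToTry] == Size) {
            Allocated[NextToTry] = true; // reuse an existing slot of this size
            return static_cast<int>(NextToTry);
          }
        }
        Allocated.push_back(true);       // nothing reusable: create a new slot
        Sizes.push_back(Size);
        return static_cast<int>(Sizes.size() - 1);
      }
    };

    int main() {
      SlotPool Pool;
      int A = Pool.allocate(8);   // creates slot 0
      (void)A;
      Pool.Allocated[0] = false;  // simulate starting a new statepoint
      Pool.NextToTry = 0;
      int B = Pool.allocate(8);   // reuses slot 0
      return B;
    }
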
+/// Utility function for reservePreviousStackSlotForValue. Tries to find the
+/// stack slot index to which we have spilled a value for previous
+/// statepoints. LookUpDepth specifies the maximum DFS depth this function is
+/// allowed to search.
+static std::optional<int> findPreviousSpillSlot(const Value *Val,
+ SelectionDAGBuilder &Builder,
+ int LookUpDepth) {
+ // Can not look any further - give up now
+ if (LookUpDepth <= 0)
+ return std::nullopt;
+
+ // Spill location is known for gc relocates
+ if (const auto *Relocate = dyn_cast<GCRelocateInst>(Val)) {
+ const Value *Statepoint = Relocate->getStatepoint();
+ assert((isa<GCStatepointInst>(Statepoint) || isa<UndefValue>(Statepoint)) &&
+ "GetStatepoint must return one of two types");
+ if (isa<UndefValue>(Statepoint))
+ return std::nullopt;
+
+ const auto &RelocationMap = Builder.FuncInfo.StatepointRelocationMaps
+ [cast<GCStatepointInst>(Statepoint)];
+
+ auto It = RelocationMap.find(Relocate);
+ if (It == RelocationMap.end())
+ return std::nullopt;
+
+ auto &Record = It->second;
+ if (Record.type != RecordType::Spill)
+ return std::nullopt;
+
+ return Record.payload.FI;
+ }
+
+ // Look through bitcast instructions.
+ if (const BitCastInst *Cast = dyn_cast<BitCastInst>(Val))
+ return findPreviousSpillSlot(Cast->getOperand(0), Builder, LookUpDepth - 1);
+
+ // Look through phi nodes
+ // All incoming values should have same known stack slot, otherwise result
+ // is unknown.
+ if (const PHINode *Phi = dyn_cast<PHINode>(Val)) {
+ std::optional<int> MergedResult;
+
+ for (const auto &IncomingValue : Phi->incoming_values()) {
+ std::optional<int> SpillSlot =
+ findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth - 1);
+ if (!SpillSlot)
+ return std::nullopt;
+
+ if (MergedResult && *MergedResult != *SpillSlot)
+ return std::nullopt;
+
+ MergedResult = SpillSlot;
+ }
+ return MergedResult;
+ }
+
+ // TODO: We can do better for PHI nodes. In cases like this:
+ // ptr = phi(relocated_pointer, not_relocated_pointer)
+ // statepoint(ptr)
+ // We will return that stack slot for ptr is unknown. And later we might
+ // assign different stack slots for ptr and relocated_pointer. This limits
+ // llvm's ability to remove redundant stores.
+ // Unfortunately this is hard to accomplish in the current infrastructure.
+ // We use this function to eliminate the spill store completely, while in
+ // the example above we would still need to emit the store, just to a
+ // special "preferred" location instead of an arbitrary one.
+
+ // TODO: handle simple updates. If a value is modified and the original
+ // value is no longer live, it would be nice to put the modified value in the
+ // same slot. This allows folding of the memory accesses for some
+ // instructions types (like an increment).
+ // statepoint (i)
+ // i1 = i+1
+ // statepoint (i1)
+ // However we need to be careful for cases like this:
+ // statepoint(i)
+ // i1 = i+1
+ // statepoint(i, i1)
+ // Here we want to reserve spill slot for 'i', but not for 'i+1'. If we just
+ // put handling of simple modifications in this function like it's done
+ // for bitcasts we might end up reserving i's slot for 'i+1' because order in
+ // which we visit values is unspecified.
+
+ // Don't know any information about this instruction
+ return std::nullopt;
+}
+
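The PHI handling above is an "all inputs must agree" merge over optional results; the same logic in standalone form, with plain integers standing in for frame indices:

    #include <iostream>
    #include <optional>
    #include <vector>

    static std::optional<int>
    mergeKnownSlots(const std::vector<std::optional<int>> &Incoming) {
      std::optional<int> Merged;
      for (const auto &Slot : Incoming) {
        if (!Slot)
          return std::nullopt; // one unknown input makes the result unknown
        if (Merged && *Merged != *Slot)
          return std::nullopt; // conflicting slots: give up
        Merged = Slot;
      }
      return Merged;
    }

    int main() {
      std::cout << mergeKnownSlots({3, 3, 3}).value_or(-1) << "\n"; // 3
      std::cout << mergeKnownSlots({3, 4}).value_or(-1) << "\n";    // -1 (conflict)
    }
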
+/// Return true if-and-only-if the given SDValue can be lowered as either a
+/// constant argument or a stack reference. The key point is that the value
+/// doesn't need to be spilled or tracked as a vreg use.
+static bool willLowerDirectly(SDValue Incoming) {
+ // We are making an unchecked assumption that the frame size <= 2^16 as that
+ // is the largest offset which can be encoded in the stackmap format.
+ if (isa<FrameIndexSDNode>(Incoming))
+ return true;
+
+ // The largest constant describable in the StackMap format is 64 bits.
+ // Potential optimization: constant values are sign extended by the consumer,
+ // and thus there are many constants of static type > 64 bits whose value
+ // happens to be sext(Con64) and could thus be lowered directly.
+ if (Incoming.getValueType().getSizeInBits() > 64)
+ return false;
+
+ return isIntOrFPConstant(Incoming) || Incoming.isUndef();
+}
+
+/// Try to find existing copies of the incoming values in stack slots used for
+/// statepoint spilling. If we can find a spill slot for the incoming value,
+/// mark that slot as allocated, and reuse the same slot for this safepoint.
+/// This helps to avoid series of loads and stores that only serve to reshuffle
+/// values on the stack between calls.
+static void reservePreviousStackSlotForValue(const Value *IncomingValue,
+ SelectionDAGBuilder &Builder) {
+ SDValue Incoming = Builder.getValue(IncomingValue);
+
+ // If we won't spill this, we don't need to check for previously allocated
+ // stack slots.
+ if (willLowerDirectly(Incoming))
+ return;
+
+ SDValue OldLocation = Builder.StatepointLowering.getLocation(Incoming);
+ if (OldLocation.getNode())
+ // Duplicates in input
+ return;
+
+ const int LookUpDepth = 6;
+ std::optional<int> Index =
+ findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth);
+ if (!Index)
+ return;
+
+ const auto &StatepointSlots = Builder.FuncInfo.StatepointStackSlots;
+
+ auto SlotIt = find(StatepointSlots, *Index);
+ assert(SlotIt != StatepointSlots.end() &&
+ "Value spilled to the unknown stack slot");
+
+ // This is one of our dedicated lowering slots
+ const int Offset = std::distance(StatepointSlots.begin(), SlotIt);
+ if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
+ // stack slot already assigned to someone else, can't use it!
+ // TODO: currently we reserve space for gc arguments after doing
+ // normal allocation for deopt arguments. We should reserve for
+ // _all_ deopt and gc arguments, then start allocating. This
+ // will prevent some moves being inserted when vm state changes,
+ // but gc state doesn't between two calls.
+ return;
+ }
+ // Reserve this stack slot
+ Builder.StatepointLowering.reserveStackSlot(Offset);
+
+ // Cache this slot so we find it when going through the normal
+ // assignment loop.
+ SDValue Loc =
+ Builder.DAG.getTargetFrameIndex(*Index, Builder.getFrameIndexTy());
+ Builder.StatepointLowering.setLocation(Incoming, Loc);
+}
+
+/// Extract call from statepoint, lower it and return pointer to the
+/// call node. Also update NodeMap so that getValue(statepoint) will
+/// reference lowered call result
+static std::pair<SDValue, SDNode *> lowerCallFromStatepointLoweringInfo(
+ SelectionDAGBuilder::StatepointLoweringInfo &SI,
+ SelectionDAGBuilder &Builder) {
+ SDValue ReturnValue, CallEndVal;
+ std::tie(ReturnValue, CallEndVal) =
+ Builder.lowerInvokable(SI.CLI, SI.EHPadBB);
+ SDNode *CallEnd = CallEndVal.getNode();
+
+ // Get a call instruction from the call sequence chain. Tail calls are not
+ // allowed. The following code is essentially reverse engineering X86's
+ // LowerCallTo.
+ //
+ // We are expecting DAG to have the following form:
+ //
+ // ch = eh_label (only in case of invoke statepoint)
+ // ch, glue = callseq_start ch
+ // ch, glue = X86::Call ch, glue
+ // ch, glue = callseq_end ch, glue
+ // get_return_value ch, glue
+ //
+ // get_return_value can either be a sequence of CopyFromReg instructions
+ // to grab the return value from the return register(s), or it can be a LOAD
+ // to load a value returned by reference via a stack slot.
+
+ bool HasDef = !SI.CLI.RetTy->isVoidTy();
+ if (HasDef) {
+ if (CallEnd->getOpcode() == ISD::LOAD)
+ CallEnd = CallEnd->getOperand(0).getNode();
+ else
+ while (CallEnd->getOpcode() == ISD::CopyFromReg)
+ CallEnd = CallEnd->getOperand(0).getNode();
+ }
+
+ assert(CallEnd->getOpcode() == ISD::CALLSEQ_END && "expected!");
+ return std::make_pair(ReturnValue, CallEnd->getOperand(0).getNode());
+}
+
+static MachineMemOperand* getMachineMemOperand(MachineFunction &MF,
+ FrameIndexSDNode &FI) {
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FI.getIndex());
+ auto MMOFlags = MachineMemOperand::MOStore |
+ MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
+ auto &MFI = MF.getFrameInfo();
+ return MF.getMachineMemOperand(PtrInfo, MMOFlags,
+ MFI.getObjectSize(FI.getIndex()),
+ MFI.getObjectAlign(FI.getIndex()));
+}
+
+/// Spill a value incoming to the statepoint. It might be part of either the
+/// vmstate or the gcstate. In both cases unconditionally spill it on the
+/// stack unless it is a null constant. Return a tuple whose elements are the
+/// frame index containing the saved value, the outgoing chain from the
+/// emitted store, and the corresponding memory operand.
+static std::tuple<SDValue, SDValue, MachineMemOperand*>
+spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
+ SelectionDAGBuilder &Builder) {
+ SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
+ MachineMemOperand* MMO = nullptr;
+
+ // Emit new store if we didn't do it for this ptr before
+ if (!Loc.getNode()) {
+ Loc = Builder.StatepointLowering.allocateStackSlot(Incoming.getValueType(),
+ Builder);
+ int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
+ // We use TargetFrameIndex so that isel will not select it into LEA
+ Loc = Builder.DAG.getTargetFrameIndex(Index, Builder.getFrameIndexTy());
+
+ // Right now we always allocate spill slots that are of the same
+ // size as the value we're about to spill (the size of spillee can
+ // vary since we spill vectors of pointers too). At some point we
+ // can consider allowing spills of smaller values to larger slots
+ // (i.e. change the '==' in the assert below to a '>=').
+ MachineFrameInfo &MFI = Builder.DAG.getMachineFunction().getFrameInfo();
+ assert((MFI.getObjectSize(Index) * 8) ==
+ (-8 & (7 + // Round up modulo 8.
+ (int64_t)Incoming.getValueSizeInBits())) &&
+ "Bad spill: stack slot does not match!");
+
+ // Note: Using the alignment of the spill slot (rather than the abi or
+ // preferred alignment) is required for correctness when dealing with spill
+ // slots with preferred alignments larger than the frame alignment.
+ auto &MF = Builder.DAG.getMachineFunction();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, Index);
+ auto *StoreMMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(Index),
+ MFI.getObjectAlign(Index));
+ Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
+ StoreMMO);
+
+ MMO = getMachineMemOperand(MF, *cast<FrameIndexSDNode>(Loc));
+
+ Builder.StatepointLowering.setLocation(Incoming, Loc);
+ }
+
+ assert(Loc.getNode());
+ return std::make_tuple(Loc, Chain, MMO);
+}
+
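The assertions in allocateStackSlot and spillIncomingStatepointValue both rely on the identity that -8 & (7 + Bits) rounds a bit count up to the next multiple of eight; a quick standalone check of that expression:

    #include <cstdint>
    #include <iostream>

    static uint64_t roundBitsUpToByteMultiple(uint64_t Bits) {
      return uint64_t(-8) & (7 + Bits); // clears the low three bits after adding 7
    }

    int main() {
      std::cout << roundBitsUpToByteMultiple(1) << " "    // 8
                << roundBitsUpToByteMultiple(8) << " "    // 8
                << roundBitsUpToByteMultiple(33) << "\n"; // 40
    }
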
+/// Lower a single value incoming to a statepoint node. This value can be
+/// either a deopt value or a gc value, the handling is the same. We special
+/// case constants and allocas, then fall back to spilling if required.
+static void
+lowerIncomingStatepointValue(SDValue Incoming, bool RequireSpillSlot,
+ SmallVectorImpl<SDValue> &Ops,
+ SmallVectorImpl<MachineMemOperand *> &MemRefs,
+ SelectionDAGBuilder &Builder) {
+
+ if (willLowerDirectly(Incoming)) {
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
+ // This handles allocas as arguments to the statepoint (this is only
+ // really meaningful for a deopt value. For GC, we'd be trying to
+ // relocate the address of the alloca itself?)
+ assert(Incoming.getValueType() == Builder.getFrameIndexTy() &&
+ "Incoming value is a frame index!");
+ Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(),
+ Builder.getFrameIndexTy()));
+
+ auto &MF = Builder.DAG.getMachineFunction();
+ auto *MMO = getMachineMemOperand(MF, *FI);
+ MemRefs.push_back(MMO);
+ return;
+ }
+
+ assert(Incoming.getValueType().getSizeInBits() <= 64);
+
+ if (Incoming.isUndef()) {
+ // Put in an easily recognized constant that's unlikely to be a valid
+ // value so that uses of undef by the consumer of the stackmap are
+ // easily recognized. This is legal since the compiler is always
+ // allowed to choose an arbitrary value for undef.
+ pushStackMapConstant(Ops, Builder, 0xFEFEFEFE);
+ return;
+ }
+
+ // If the original value was a constant, make sure it gets recorded as
+ // such in the stackmap. This is required so that the consumer can
+ // parse any internal format to the deopt state. It also handles null
+ // pointers and other constant pointers in GC states.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Incoming)) {
+ pushStackMapConstant(Ops, Builder, C->getSExtValue());
+ return;
+ } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Incoming)) {
+ pushStackMapConstant(Ops, Builder,
+ C->getValueAPF().bitcastToAPInt().getZExtValue());
+ return;
+ }
+
+ llvm_unreachable("unhandled direct lowering case");
+ }
+
+
+
+ if (!RequireSpillSlot) {
+ // If this value is live in (not live-on-return, or live-through), we can
+ // treat it the same way patchpoint treats its "live in" values. We'll
+ // end up folding some of these into stack references, but they'll be
+ // handled by the register allocator. Note that we do not have the notion
+ // of a late use so these values might be placed in registers which are
+ // clobbered by the call. This is fine for live-in. For live-through
+ // values, a fix-up pass should be executed to force spilling of such
+ // registers.
+ Ops.push_back(Incoming);
+ } else {
+ // Otherwise, locate a spill slot and explicitly spill it so it can be
+ // found by the runtime later. Note: We know all of these spills are
+ // independent, but don't bother to exploit that chain wise. DAGCombine
+ // will happily do so as needed, so doing it here would be a small compile
+ // time win at most.
+ SDValue Chain = Builder.getRoot();
+ auto Res = spillIncomingStatepointValue(Incoming, Chain, Builder);
+ Ops.push_back(std::get<0>(Res));
+ if (auto *MMO = std::get<2>(Res))
+ MemRefs.push_back(MMO);
+ Chain = std::get<1>(Res);
+ Builder.DAG.setRoot(Chain);
+ }
+
+}
+
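To summarize the branches above, here is a hypothetical (non-LLVM) enumeration of the three outcomes for an incoming statepoint value: constants, undef, and frame indices are recorded directly; values that don't require a spill slot are passed as live-in virtual-register operands; everything else is stored to a dedicated spill slot.

    // Exposition only; these names do not exist in LLVM.
    enum class StatepointOperandKind { Direct, LiveInVReg, SpillSlot };

    static StatepointOperandKind
    classifyIncoming(bool WillLowerDirectly, bool RequireSpillSlot) {
      if (WillLowerDirectly)
        return StatepointOperandKind::Direct;     // constant / undef / frame index
      if (!RequireSpillSlot)
        return StatepointOperandKind::LiveInVReg; // left to the register allocator
      return StatepointOperandKind::SpillSlot;    // explicitly spilled to the stack
    }

    int main() {
      return classifyIncoming(false, true) == StatepointOperandKind::SpillSlot ? 0 : 1;
    }
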
+/// Return true if value V represents a GC value. The behavior is
+/// conservative: if it cannot be determined that the value is not a GC
+/// pointer, the function returns true.
+static bool isGCValue(const Value *V, SelectionDAGBuilder &Builder) {
+ auto *Ty = V->getType();
+ if (!Ty->isPtrOrPtrVectorTy())
+ return false;
+ if (auto *GFI = Builder.GFI)
+ if (auto IsManaged = GFI->getStrategy().isGCManagedPointer(Ty))
+ return *IsManaged;
+ return true; // conservative
+}
+
+/// Lower deopt state and gc pointer arguments of the statepoint. The actual
+/// lowering is described in lowerIncomingStatepointValue. This function is
+/// responsible for lowering everything in the right position and playing some
+/// tricks to avoid redundant stack manipulation where possible. On
+/// completion, 'Ops' will contain ready to use operands for machine code
+/// statepoint. The chain nodes will have already been created and the DAG root
+/// will be set to the last value spilled (if any were).
+static void
+lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
+ SmallVectorImpl<MachineMemOperand *> &MemRefs,
+ SmallVectorImpl<SDValue> &GCPtrs,
+ DenseMap<SDValue, int> &LowerAsVReg,
+ SelectionDAGBuilder::StatepointLoweringInfo &SI,
+ SelectionDAGBuilder &Builder) {
+ // Lower the deopt and gc arguments for this statepoint. Layout will be:
+ // deopt argument length, deopt arguments.., gc arguments...
+
+ // Figure out what lowering strategy we're going to use for each part
+ // Note: It is conservatively correct to lower both "live-in" and "live-out"
+ // as "live-through". A "live-through" variable is one which is "live-in",
+ // "live-out", and live throughout the lifetime of the call (i.e. we can find
+ // it from any PC within the transitive callee of the statepoint). In
+ // particular, if the callee spills callee preserved registers we may not
+ // be able to find a value placed in that register during the call. This is
+ // fine for live-out, but not for live-through. If we were willing to make
+ // assumptions about the code generator producing the callee, we could
+ // potentially allow live-through values in callee saved registers.
+ const bool LiveInDeopt =
+ SI.StatepointFlags & (uint64_t)StatepointFlags::DeoptLiveIn;
+
+ // Decide which derived pointers will go on VRegs.
+ unsigned MaxVRegPtrs = MaxRegistersForGCPointers.getValue();
+
+ // Pointers used on the exceptional path of an invoke statepoint.
+ // We cannot assign them to VRegs.
+ SmallSet<SDValue, 8> LPadPointers;
+ if (!UseRegistersForGCPointersInLandingPad)
+ if (const auto *StInvoke =
+ dyn_cast_or_null<InvokeInst>(SI.StatepointInstr)) {
+ LandingPadInst *LPI = StInvoke->getLandingPadInst();
+ for (const auto *Relocate : SI.GCRelocates)
+ if (Relocate->getOperand(0) == LPI) {
+ LPadPointers.insert(Builder.getValue(Relocate->getBasePtr()));
+ LPadPointers.insert(Builder.getValue(Relocate->getDerivedPtr()));
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Deciding how to lower GC Pointers:\n");
+
+ // List of unique lowered GC Pointer values.
+ SmallSetVector<SDValue, 16> LoweredGCPtrs;
+ // Map lowered GC Pointer value to the index in above vector
+ DenseMap<SDValue, unsigned> GCPtrIndexMap;
+
+ unsigned CurNumVRegs = 0;
+
+ auto canPassGCPtrOnVReg = [&](SDValue SD) {
+ if (SD.getValueType().isVector())
+ return false;
+ if (LPadPointers.count(SD))
+ return false;
+ return !willLowerDirectly(SD);
+ };
+
+ auto processGCPtr = [&](const Value *V) {
+ SDValue PtrSD = Builder.getValue(V);
+ if (!LoweredGCPtrs.insert(PtrSD))
+ return; // skip duplicates
+ GCPtrIndexMap[PtrSD] = LoweredGCPtrs.size() - 1;
+
+ assert(!LowerAsVReg.count(PtrSD) && "must not have been seen");
+ if (LowerAsVReg.size() == MaxVRegPtrs)
+ return;
+ assert(V->getType()->isVectorTy() == PtrSD.getValueType().isVector() &&
+ "IR and SD types disagree");
+ if (!canPassGCPtrOnVReg(PtrSD)) {
+ LLVM_DEBUG(dbgs() << "direct/spill "; PtrSD.dump(&Builder.DAG));
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "vreg "; PtrSD.dump(&Builder.DAG));
+ LowerAsVReg[PtrSD] = CurNumVRegs++;
+ };
+
+ // Process derived pointers first to give them more chance to go on VReg.
+ for (const Value *V : SI.Ptrs)
+ processGCPtr(V);
+ for (const Value *V : SI.Bases)
+ processGCPtr(V);
+
+ LLVM_DEBUG(dbgs() << LowerAsVReg.size() << " pointers will go in vregs\n");
+
+ auto requireSpillSlot = [&](const Value *V) {
+ if (!Builder.DAG.getTargetLoweringInfo().isTypeLegal(
+ Builder.getValue(V).getValueType()))
+ return true;
+ if (isGCValue(V, Builder))
+ return !LowerAsVReg.count(Builder.getValue(V));
+ return !(LiveInDeopt || UseRegistersForDeoptValues);
+ };
+
+ // Before we actually start lowering (and allocating spill slots for values),
+ // reserve any stack slots which we judge to be profitable to reuse for a
+ // particular value. This is purely an optimization over the code below and
+ // doesn't change semantics at all. It is important for performance that we
+ // reserve slots for both deopt and gc values before lowering either.
+ for (const Value *V : SI.DeoptState) {
+ if (requireSpillSlot(V))
+ reservePreviousStackSlotForValue(V, Builder);
+ }
+
+ for (const Value *V : SI.Ptrs) {
+ SDValue SDV = Builder.getValue(V);
+ if (!LowerAsVReg.count(SDV))
+ reservePreviousStackSlotForValue(V, Builder);
+ }
+
+ for (const Value *V : SI.Bases) {
+ SDValue SDV = Builder.getValue(V);
+ if (!LowerAsVReg.count(SDV))
+ reservePreviousStackSlotForValue(V, Builder);
+ }
+
+ // First, prefix the list with the number of unique values to be
+ // lowered. Note that this is the number of *Values* not the
+ // number of SDValues required to lower them.
+ const int NumVMSArgs = SI.DeoptState.size();
+ pushStackMapConstant(Ops, Builder, NumVMSArgs);
+
+ // The vm state arguments are lowered in an opaque manner. We do not know
+ // what type of values are contained within.
+ LLVM_DEBUG(dbgs() << "Lowering deopt state\n");
+ for (const Value *V : SI.DeoptState) {
+ SDValue Incoming;
+ // If this is a function argument at a static frame index, generate it as
+ // the frame index.
+ if (const Argument *Arg = dyn_cast<Argument>(V)) {
+ int FI = Builder.FuncInfo.getArgumentFrameIndex(Arg);
+ if (FI != INT_MAX)
+ Incoming = Builder.DAG.getFrameIndex(FI, Builder.getFrameIndexTy());
+ }
+ if (!Incoming.getNode())
+ Incoming = Builder.getValue(V);
+ LLVM_DEBUG(dbgs() << "Value " << *V
+ << " requireSpillSlot = " << requireSpillSlot(V) << "\n");
+ lowerIncomingStatepointValue(Incoming, requireSpillSlot(V), Ops, MemRefs,
+ Builder);
+ }
+
+ // Finally, go ahead and lower all the gc arguments.
+ pushStackMapConstant(Ops, Builder, LoweredGCPtrs.size());
+ for (SDValue SDV : LoweredGCPtrs)
+ lowerIncomingStatepointValue(SDV, !LowerAsVReg.count(SDV), Ops, MemRefs,
+ Builder);
+
+ // Copy to out vector. LoweredGCPtrs will be empty after this point.
+ GCPtrs = LoweredGCPtrs.takeVector();
+
+ // If there are any explicit spill slots passed to the statepoint, record
+ // them, but otherwise do not do anything special. These are user provided
+ // allocas and give control over placement to the consumer. In this case,
+ // it is the contents of the slot which may get updated, not the pointer to
+ // the alloca
+ SmallVector<SDValue, 4> Allocas;
+ for (Value *V : SI.GCArgs) {
+ SDValue Incoming = Builder.getValue(V);
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
+ // This handles allocas as arguments to the statepoint
+ assert(Incoming.getValueType() == Builder.getFrameIndexTy() &&
+ "Incoming value is a frame index!");
+ Allocas.push_back(Builder.DAG.getTargetFrameIndex(
+ FI->getIndex(), Builder.getFrameIndexTy()));
+
+ auto &MF = Builder.DAG.getMachineFunction();
+ auto *MMO = getMachineMemOperand(MF, *FI);
+ MemRefs.push_back(MMO);
+ }
+ }
+ pushStackMapConstant(Ops, Builder, Allocas.size());
+ Ops.append(Allocas.begin(), Allocas.end());
+
+ // Now construct the GC base/derived map.
+ pushStackMapConstant(Ops, Builder, SI.Ptrs.size());
+ SDLoc L = Builder.getCurSDLoc();
+ for (unsigned i = 0; i < SI.Ptrs.size(); ++i) {
+ SDValue Base = Builder.getValue(SI.Bases[i]);
+ assert(GCPtrIndexMap.count(Base) && "base not found in index map");
+ Ops.push_back(
+ Builder.DAG.getTargetConstant(GCPtrIndexMap[Base], L, MVT::i64));
+ SDValue Derived = Builder.getValue(SI.Ptrs[i]);
+ assert(GCPtrIndexMap.count(Derived) && "derived not found in index map");
+ Ops.push_back(
+ Builder.DAG.getTargetConstant(GCPtrIndexMap[Derived], L, MVT::i64));
+ }
+}
+
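Putting the pieces of lowerStatepointMetaArgs together, the meta-argument section it appends to Ops has a count-prefixed layout; the struct below is purely descriptive shorthand for that order and is not an LLVM type.

    #include <cstdint>
    #include <utility>
    #include <vector>

    struct StatepointMetaArgsLayout {
      uint64_t NumDeoptArgs;        // pushed first, then the deopt values themselves
      uint64_t NumGCPointers;       // then the unique lowered GC pointers
      uint64_t NumAllocas;          // then any explicit alloca frame indices
      uint64_t NumBaseDerivedPairs; // finally, for each derived pointer, a
      // (base index, derived index) pair referring back into the GC pointer
      // list above.
      std::vector<std::pair<uint64_t, uint64_t>> BaseDerivedIndexPairs;
    };

    int main() { return 0; }
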
+SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
+ SelectionDAGBuilder::StatepointLoweringInfo &SI) {
+ // The basic scheme here is that information about both the original call and
+ // the safepoint is encoded in the CallInst. We create a temporary call and
+ // lower it, then reverse engineer the calling sequence.
+
+ NumOfStatepoints++;
+ // Clear state
+ StatepointLowering.startNewStatepoint(*this);
+ assert(SI.Bases.size() == SI.Ptrs.size() && "Pointer without base!");
+ assert((GFI || SI.Bases.empty()) &&
+ "No gc specified, so cannot relocate pointers!");
+
+ LLVM_DEBUG(dbgs() << "Lowering statepoint " << *SI.StatepointInstr << "\n");
+#ifndef NDEBUG
+ for (const auto *Reloc : SI.GCRelocates)
+ if (Reloc->getParent() == SI.StatepointInstr->getParent())
+ StatepointLowering.scheduleRelocCall(*Reloc);
+#endif
+
+ // Lower statepoint vmstate and gcstate arguments
+
+ // All lowered meta args.
+ SmallVector<SDValue, 10> LoweredMetaArgs;
+ // Lowered GC pointers (subset of above).
+ SmallVector<SDValue, 16> LoweredGCArgs;
+ SmallVector<MachineMemOperand*, 16> MemRefs;
+ // Maps derived pointer SDValue to statepoint result of relocated pointer.
+ DenseMap<SDValue, int> LowerAsVReg;
+ lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, LoweredGCArgs, LowerAsVReg,
+ SI, *this);
+
+ // Now that we've emitted the spills, we need to update the root so that the
+ // call sequence is ordered correctly.
+ SI.CLI.setChain(getRoot());
+
+ // Get call node, we will replace it later with statepoint
+ SDValue ReturnVal;
+ SDNode *CallNode;
+ std::tie(ReturnVal, CallNode) = lowerCallFromStatepointLoweringInfo(SI, *this);
+
+ // Construct the actual GC_TRANSITION_START, STATEPOINT, and GC_TRANSITION_END
+ // nodes with all the appropriate arguments and return values.
+
+ // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+ SDValue Chain = CallNode->getOperand(0);
+
+ SDValue Glue;
+ bool CallHasIncomingGlue = CallNode->getGluedNode();
+ if (CallHasIncomingGlue) {
+ // Glue is always last operand
+ Glue = CallNode->getOperand(CallNode->getNumOperands() - 1);
+ }
+
+ // Build the GC_TRANSITION_START node if necessary.
+ //
+ // The operands to the GC_TRANSITION_{START,END} nodes are laid out in the
+ // order in which they appear in the call to the statepoint intrinsic. If
+ // any of the operands is a pointer-typed, that operand is immediately
+ // followed by a SRCVALUE for the pointer that may be used during lowering
+ // (e.g. to form MachinePointerInfo values for loads/stores).
+ const bool IsGCTransition =
+ (SI.StatepointFlags & (uint64_t)StatepointFlags::GCTransition) ==
+ (uint64_t)StatepointFlags::GCTransition;
+ if (IsGCTransition) {
+ SmallVector<SDValue, 8> TSOps;
+
+ // Add chain
+ TSOps.push_back(Chain);
+
+ // Add GC transition arguments
+ for (const Value *V : SI.GCTransitionArgs) {
+ TSOps.push_back(getValue(V));
+ if (V->getType()->isPointerTy())
+ TSOps.push_back(DAG.getSrcValue(V));
+ }
+
+ // Add glue if necessary
+ if (CallHasIncomingGlue)
+ TSOps.push_back(Glue);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ SDValue GCTransitionStart =
+ DAG.getNode(ISD::GC_TRANSITION_START, getCurSDLoc(), NodeTys, TSOps);
+
+ Chain = GCTransitionStart.getValue(0);
+ Glue = GCTransitionStart.getValue(1);
+ }
+
+ // TODO: Currently, all of these operands are being marked as read/write in
+ // PrologEpilogInserter.cpp; we should special case the VMState arguments
+ // and flags to be read-only.
+ SmallVector<SDValue, 40> Ops;
+
+ // Add the <id> and <numBytes> constants.
+ Ops.push_back(DAG.getTargetConstant(SI.ID, getCurSDLoc(), MVT::i64));
+ Ops.push_back(
+ DAG.getTargetConstant(SI.NumPatchBytes, getCurSDLoc(), MVT::i32));
+
+ // Calculate and push starting position of vmstate arguments
+ // Get number of arguments incoming directly into call node
+ unsigned NumCallRegArgs =
+ CallNode->getNumOperands() - (CallHasIncomingGlue ? 4 : 3);
+ Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, getCurSDLoc(), MVT::i32));
+
+ // Add call target
+ SDValue CallTarget = SDValue(CallNode->getOperand(1).getNode(), 0);
+ Ops.push_back(CallTarget);
+
+ // Add call arguments
+ // Get position of register mask in the call
+ SDNode::op_iterator RegMaskIt;
+ if (CallHasIncomingGlue)
+ RegMaskIt = CallNode->op_end() - 2;
+ else
+ RegMaskIt = CallNode->op_end() - 1;
+ Ops.insert(Ops.end(), CallNode->op_begin() + 2, RegMaskIt);
+
+ // Add a constant argument for the calling convention
+ pushStackMapConstant(Ops, *this, SI.CLI.CallConv);
+
+ // Add a constant argument for the flags
+ uint64_t Flags = SI.StatepointFlags;
+ assert(((Flags & ~(uint64_t)StatepointFlags::MaskAll) == 0) &&
+ "Unknown flag used");
+ pushStackMapConstant(Ops, *this, Flags);
+
+ // Insert all vmstate and gcstate arguments
+ llvm::append_range(Ops, LoweredMetaArgs);
+
+ // Add register mask from call node
+ Ops.push_back(*RegMaskIt);
+
+ // Add chain
+ Ops.push_back(Chain);
+
+ // Same for the glue, but we add it only if original call had it
+ if (Glue.getNode())
+ Ops.push_back(Glue);
+
+ // Compute return values. Provide a glue output since we consume one as
+ // input. This allows someone else to chain off us as needed.
+ SmallVector<EVT, 8> NodeTys;
+ for (auto SD : LoweredGCArgs) {
+ if (!LowerAsVReg.count(SD))
+ continue;
+ NodeTys.push_back(SD.getValueType());
+ }
+ LLVM_DEBUG(dbgs() << "Statepoint has " << NodeTys.size() << " results\n");
+ assert(NodeTys.size() == LowerAsVReg.size() && "Inconsistent GC Ptr lowering");
+ NodeTys.push_back(MVT::Other);
+ NodeTys.push_back(MVT::Glue);
+
+ unsigned NumResults = NodeTys.size();
+ MachineSDNode *StatepointMCNode =
+ DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops);
+ DAG.setNodeMemRefs(StatepointMCNode, MemRefs);
+
+ // For values lowered to tied-defs, create the virtual registers if used
+ // in other blocks. For local gc.relocate record appropriate statepoint
+ // result in StatepointLoweringState.
+ DenseMap<SDValue, Register> VirtRegs;
+ for (const auto *Relocate : SI.GCRelocates) {
+ Value *Derived = Relocate->getDerivedPtr();
+ SDValue SD = getValue(Derived);
+ if (!LowerAsVReg.count(SD))
+ continue;
+
+ SDValue Relocated = SDValue(StatepointMCNode, LowerAsVReg[SD]);
+
+ // Handle local relocate. Note that different relocates might
+ // map to the same SDValue.
+ if (SI.StatepointInstr->getParent() == Relocate->getParent()) {
+ SDValue Res = StatepointLowering.getLocation(SD);
+ if (Res)
+ assert(Res == Relocated);
+ else
+ StatepointLowering.setLocation(SD, Relocated);
+ continue;
+ }
+
+ // Handle multiple gc.relocates of the same input efficiently.
+ if (VirtRegs.count(SD))
+ continue;
+
+ auto *RetTy = Relocate->getType();
+ Register Reg = FuncInfo.CreateRegs(RetTy);
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), Reg, RetTy, std::nullopt);
+ SDValue Chain = DAG.getRoot();
+ RFV.getCopyToRegs(Relocated, DAG, getCurSDLoc(), Chain, nullptr);
+ PendingExports.push_back(Chain);
+
+ VirtRegs[SD] = Reg;
+ }
+
+ // Record for later use how each relocation was lowered. This is needed to
+ // allow later gc.relocates to mirror the lowering chosen.
+ const Instruction *StatepointInstr = SI.StatepointInstr;
+ auto &RelocationMap = FuncInfo.StatepointRelocationMaps[StatepointInstr];
+ for (const GCRelocateInst *Relocate : SI.GCRelocates) {
+ const Value *V = Relocate->getDerivedPtr();
+ SDValue SDV = getValue(V);
+ SDValue Loc = StatepointLowering.getLocation(SDV);
+
+ bool IsLocal = (Relocate->getParent() == StatepointInstr->getParent());
+
+ RecordType Record;
+ if (IsLocal && LowerAsVReg.count(SDV)) {
+ // Result is already stored in StatepointLowering
+ Record.type = RecordType::SDValueNode;
+ } else if (LowerAsVReg.count(SDV)) {
+ Record.type = RecordType::VReg;
+ assert(VirtRegs.count(SDV));
+ Record.payload.Reg = VirtRegs[SDV];
+ } else if (Loc.getNode()) {
+ Record.type = RecordType::Spill;
+ Record.payload.FI = cast<FrameIndexSDNode>(Loc)->getIndex();
+ } else {
+ Record.type = RecordType::NoRelocate;
+ // If we didn't relocate a value, we'll essentially end up inserting an
+ // additional use of the original value when lowering the gc.relocate.
+ // We need to make sure the value is available at the new use, which
+ // might be in another block.
+ if (Relocate->getParent() != StatepointInstr->getParent())
+ ExportFromCurrentBlock(V);
+ }
+ RelocationMap[Relocate] = Record;
+ }
+
+
+
+ SDNode *SinkNode = StatepointMCNode;
+
+ // Build the GC_TRANSITION_END node if necessary.
+ //
+ // See the comment above regarding GC_TRANSITION_START for the layout of
+ // the operands to the GC_TRANSITION_END node.
+ if (IsGCTransition) {
+ SmallVector<SDValue, 8> TEOps;
+
+ // Add chain
+ TEOps.push_back(SDValue(StatepointMCNode, NumResults - 2));
+
+ // Add GC transition arguments
+ for (const Value *V : SI.GCTransitionArgs) {
+ TEOps.push_back(getValue(V));
+ if (V->getType()->isPointerTy())
+ TEOps.push_back(DAG.getSrcValue(V));
+ }
+
+ // Add glue
+ TEOps.push_back(SDValue(StatepointMCNode, NumResults - 1));
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ SDValue GCTransitionEnd =
+ DAG.getNode(ISD::GC_TRANSITION_END, getCurSDLoc(), NodeTys, TEOps);
+
+ SinkNode = GCTransitionEnd.getNode();
+ }
+
+ // Replace original call
+ // Call: ch,glue = CALL ...
+ // Statepoint: [gc relocates],ch,glue = STATEPOINT ...
+ unsigned NumSinkValues = SinkNode->getNumValues();
+ SDValue StatepointValues[2] = {SDValue(SinkNode, NumSinkValues - 2),
+ SDValue(SinkNode, NumSinkValues - 1)};
+ DAG.ReplaceAllUsesWith(CallNode, StatepointValues);
+ // Remove original call node
+ DAG.DeleteNode(CallNode);
+
+ // Since we always emit CopyToRegs (even for local relocates), we must
+ // update root, so that they are emitted before any local uses.
+ (void)getControlRoot();
+
+ // TODO: A better future implementation would be to emit a single variable
+ // argument, variable return value STATEPOINT node here and then hookup the
+ // return value of each gc.relocate to the respective output of the
+ // previously emitted STATEPOINT value. Unfortunately, this doesn't appear
+ // to actually be possible today.
+
+ return ReturnVal;
+}
+
+/// Return up to two gc.results if present. The first result is a block-local
+/// gc.result, the second a non-block-local gc.result. The corresponding
+/// entry will be nullptr if not present.
+static std::pair<const GCResultInst*, const GCResultInst*>
+getGCResultLocality(const GCStatepointInst &S) {
+ std::pair<const GCResultInst *, const GCResultInst*> Res(nullptr, nullptr);
+ for (const auto *U : S.users()) {
+ auto *GRI = dyn_cast<GCResultInst>(U);
+ if (!GRI)
+ continue;
+ if (GRI->getParent() == S.getParent())
+ Res.first = GRI;
+ else
+ Res.second = GRI;
+ }
+ return Res;
+}
+
+void
+SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
+ const BasicBlock *EHPadBB /*= nullptr*/) {
+ assert(I.getCallingConv() != CallingConv::AnyReg &&
+ "anyregcc is not supported on statepoints!");
+
+#ifndef NDEBUG
+ // Check that the associated GCStrategy expects to encounter statepoints.
+ assert(GFI->getStrategy().useStatepoints() &&
+ "GCStrategy does not expect to encounter statepoints");
+#endif
+
+ SDValue ActualCallee;
+ SDValue Callee = getValue(I.getActualCalledOperand());
+
+ if (I.getNumPatchBytes() > 0) {
+ // If we've been asked to emit a nop sequence instead of a call instruction
+ // for this statepoint then don't lower the call target, but use a constant
+ // `undef` instead. Not lowering the call target lets statepoint clients
+ // get away without providing a physical address for the symbolic call
+ // target at link time.
+ ActualCallee = DAG.getUNDEF(Callee.getValueType());
+ } else {
+ ActualCallee = Callee;
+ }
+
+ StatepointLoweringInfo SI(DAG);
+ populateCallLoweringInfo(SI.CLI, &I, GCStatepointInst::CallArgsBeginPos,
+ I.getNumCallArgs(), ActualCallee,
+ I.getActualReturnType(), false /* IsPatchPoint */);
+
+ // There may be duplication in the gc.relocate list; such as two copies of
+ // each relocation on normal and exceptional path for an invoke. We only
+ // need to spill once and record one copy in the stackmap, but we need to
+ // reload once per gc.relocate. (Dedupping gc.relocates is trickier and best
+ // handled as a CSE problem elsewhere.)
+ // TODO: There are a couple of major stackmap size optimizations we could do
+ // here if we wished.
+ // 1) If we've encountered a derived pair {B, D}, we don't need to actually
+ // record {B,B} if it's seen later.
+ // 2) Due to rematerialization, actual derived pointers are somewhat rare;
+ // given that, we could change the format to record base pointer relocations
+ // separately with half the space. This would require a format rev and a
+ // fairly major rework of the STATEPOINT node though.
+ SmallSet<SDValue, 8> Seen;
+ for (const GCRelocateInst *Relocate : I.getGCRelocates()) {
+ SI.GCRelocates.push_back(Relocate);
+
+ SDValue DerivedSD = getValue(Relocate->getDerivedPtr());
+ if (Seen.insert(DerivedSD).second) {
+ SI.Bases.push_back(Relocate->getBasePtr());
+ SI.Ptrs.push_back(Relocate->getDerivedPtr());
+ }
+ }
+
+ // If we find a deopt value which isn't explicitly added, we need to
+ // ensure it gets lowered such that gc cycles occurring before the
+ // deoptimization event during the lifetime of the call don't invalidate
+ // the pointer we're deopting with. Note that we assume that all
+ // pointers passed to deopt are base pointers; relaxing that assumption
+ // would require relatively large changes to how we represent relocations.
+ for (Value *V : I.deopt_operands()) {
+ if (!isGCValue(V, *this))
+ continue;
+ if (Seen.insert(getValue(V)).second) {
+ SI.Bases.push_back(V);
+ SI.Ptrs.push_back(V);
+ }
+ }
+
+ SI.GCArgs = ArrayRef<const Use>(I.gc_args_begin(), I.gc_args_end());
+ SI.StatepointInstr = &I;
+ SI.ID = I.getID();
+
+ SI.DeoptState = ArrayRef<const Use>(I.deopt_begin(), I.deopt_end());
+ SI.GCTransitionArgs = ArrayRef<const Use>(I.gc_transition_args_begin(),
+ I.gc_transition_args_end());
+
+ SI.StatepointFlags = I.getFlags();
+ SI.NumPatchBytes = I.getNumPatchBytes();
+ SI.EHPadBB = EHPadBB;
+
+ SDValue ReturnValue = LowerAsSTATEPOINT(SI);
+
+ // Export the result value if needed
+ const auto GCResultLocality = getGCResultLocality(I);
+
+ if (!GCResultLocality.first && !GCResultLocality.second) {
+ // The return value is not needed, just generate a poison value.
+ // Note: This covers the void return case.
+ setValue(&I, DAG.getIntPtrConstant(-1, getCurSDLoc()));
+ return;
+ }
+
+ if (GCResultLocality.first) {
+ // The result value will be used in the same basic block. Don't export it or
+ // perform any explicit register copies; the gc_result will simply grab
+ // this value.
+ setValue(&I, ReturnValue);
+ }
+
+ if (!GCResultLocality.second)
+ return;
+ // The result value will be used in a different basic block, so we need to
+ // export it now. The default exporting mechanism will not work here because
+ // the statepoint call has a different type than the actual call, which means
+ // that by default llvm would create an export register of the wrong type
+ // (always i32 in our case). Instead we need to create the export register
+ // with the correct type manually.
+ // TODO: To eliminate this problem we could remove the gc.result intrinsic
+ // completely and make the statepoint call return a tuple.
+ Type *RetTy = GCResultLocality.second->getType();
+ Register Reg = FuncInfo.CreateRegs(RetTy);
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), Reg, RetTy,
+ I.getCallingConv());
+ SDValue Chain = DAG.getEntryNode();
+
+ RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
+ PendingExports.push_back(Chain);
+ FuncInfo.ValueMap[&I] = Reg;
+}
+
+void SelectionDAGBuilder::LowerCallSiteWithDeoptBundleImpl(
+ const CallBase *Call, SDValue Callee, const BasicBlock *EHPadBB,
+ bool VarArgDisallowed, bool ForceVoidReturnTy) {
+ StatepointLoweringInfo SI(DAG);
+ unsigned ArgBeginIndex = Call->arg_begin() - Call->op_begin();
+ populateCallLoweringInfo(
+ SI.CLI, Call, ArgBeginIndex, Call->arg_size(), Callee,
+ ForceVoidReturnTy ? Type::getVoidTy(*DAG.getContext()) : Call->getType(),
+ false);
+ if (!VarArgDisallowed)
+ SI.CLI.IsVarArg = Call->getFunctionType()->isVarArg();
+
+ auto DeoptBundle = *Call->getOperandBundle(LLVMContext::OB_deopt);
+
+ unsigned DefaultID = StatepointDirectives::DeoptBundleStatepointID;
+
+ auto SD = parseStatepointDirectivesFromAttrs(Call->getAttributes());
+ SI.ID = SD.StatepointID.value_or(DefaultID);
+ SI.NumPatchBytes = SD.NumPatchBytes.value_or(0);
+
+ SI.DeoptState =
+ ArrayRef<const Use>(DeoptBundle.Inputs.begin(), DeoptBundle.Inputs.end());
+ SI.StatepointFlags = static_cast<uint64_t>(StatepointFlags::None);
+ SI.EHPadBB = EHPadBB;
+
+ // NB! The GC arguments are deliberately left empty.
+
+ if (SDValue ReturnVal = LowerAsSTATEPOINT(SI)) {
+ ReturnVal = lowerRangeToAssertZExt(DAG, *Call, ReturnVal);
+ setValue(Call, ReturnVal);
+ }
+}
+
+void SelectionDAGBuilder::LowerCallSiteWithDeoptBundle(
+ const CallBase *Call, SDValue Callee, const BasicBlock *EHPadBB) {
+ LowerCallSiteWithDeoptBundleImpl(Call, Callee, EHPadBB,
+ /* VarArgDisallowed = */ false,
+ /* ForceVoidReturnTy = */ false);
+}
+
+void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
+ // The result value of the gc_result is simply the result of the actual
+ // call. We've already emitted this, so just grab the value.
+ const Value *SI = CI.getStatepoint();
+ assert((isa<GCStatepointInst>(SI) || isa<UndefValue>(SI)) &&
+ "GetStatepoint must return one of two types");
+ if (isa<UndefValue>(SI))
+ return;
+
+ if (cast<GCStatepointInst>(SI)->getParent() == CI.getParent()) {
+ setValue(&CI, getValue(SI));
+ return;
+ }
+ // The statepoint is in a different basic block, so we should have stored the
+ // call result in a virtual register.
+ // We cannot use the default getValue() functionality to copy the value from
+ // this register because the statepoint and actual call return types can be
+ // different, and getValue() would use CopyFromReg of the wrong type,
+ // which is always i32 in our case.
+ Type *RetTy = CI.getType();
+ SDValue CopyFromReg = getCopyFromRegs(SI, RetTy);
+
+ assert(CopyFromReg.getNode());
+ setValue(&CI, CopyFromReg);
+}
+
+void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
+ const Value *Statepoint = Relocate.getStatepoint();
+#ifndef NDEBUG
+ // Consistency check
+ // We skip this check for relocates not in the same basic block as their
+ // statepoint. It would be too expensive to preserve validation info through
+ // different basic blocks.
+ assert((isa<GCStatepointInst>(Statepoint) || isa<UndefValue>(Statepoint)) &&
+ "GetStatepoint must return one of two types");
+ if (isa<UndefValue>(Statepoint))
+ return;
+
+ if (cast<GCStatepointInst>(Statepoint)->getParent() == Relocate.getParent())
+ StatepointLowering.relocCallVisited(Relocate);
+#endif
+
+ const Value *DerivedPtr = Relocate.getDerivedPtr();
+ auto &RelocationMap =
+ FuncInfo.StatepointRelocationMaps[cast<GCStatepointInst>(Statepoint)];
+ auto SlotIt = RelocationMap.find(&Relocate);
+ assert(SlotIt != RelocationMap.end() && "Relocating not lowered gc value");
+ const RecordType &Record = SlotIt->second;
+
+ // If the relocation was done via a virtual register...
+ if (Record.type == RecordType::SDValueNode) {
+ assert(cast<GCStatepointInst>(Statepoint)->getParent() ==
+ Relocate.getParent() &&
+ "Nonlocal gc.relocate mapped via SDValue");
+ SDValue SDV = StatepointLowering.getLocation(getValue(DerivedPtr));
+ assert(SDV.getNode() && "empty SDValue");
+ setValue(&Relocate, SDV);
+ return;
+ }
+ if (Record.type == RecordType::VReg) {
+ Register InReg = Record.payload.Reg;
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), InReg, Relocate.getType(),
+ std::nullopt); // This is not an ABI copy.
+ // We generate copy to/from regs even for local uses, hence we must
+ // chain with current root to ensure proper ordering of copies w.r.t.
+ // statepoint.
+ SDValue Chain = DAG.getRoot();
+ SDValue Relocation = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
+ Chain, nullptr, nullptr);
+ setValue(&Relocate, Relocation);
+ return;
+ }
+
+ if (Record.type == RecordType::Spill) {
+ unsigned Index = Record.payload.FI;
+ SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy());
+
+ // All the reloads are independent and are reading memory only modified by
+ // statepoints (i.e. no other aliasing stores); informing SelectionDAG of
+ // this lets CSE kick in for free and allows reordering of
+ // instructions if possible. The lowering for statepoint sets the root,
+ // so this is ordering all reloads with either
+ // a) the statepoint node itself, or
+ // b) the entry of the current block for an invoke statepoint.
+ const SDValue Chain = DAG.getRoot(); // != Builder.getRoot()
+
+ auto &MF = DAG.getMachineFunction();
+ auto &MFI = MF.getFrameInfo();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, Index);
+ auto *LoadMMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ MFI.getObjectSize(Index),
+ MFI.getObjectAlign(Index));
+
+ auto LoadVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ Relocate.getType());
+
+ SDValue SpillLoad =
+ DAG.getLoad(LoadVT, getCurSDLoc(), Chain, SpillSlot, LoadMMO);
+ PendingLoads.push_back(SpillLoad.getValue(1));
+
+ assert(SpillLoad.getNode());
+ setValue(&Relocate, SpillLoad);
+ return;
+ }
+
+ assert(Record.type == RecordType::NoRelocate);
+ SDValue SD = getValue(DerivedPtr);
+
+ if (SD.isUndef() && SD.getValueType().getSizeInBits() <= 64) {
+ // Lower relocate(undef) to an arbitrary constant. The current constant value
+ // is chosen such that it's unlikely to be a valid pointer.
+ setValue(&Relocate, DAG.getTargetConstant(0xFEFEFEFE, SDLoc(SD), MVT::i64));
+ return;
+ }
+
+ // We didn't need to spill these special cases (constants and allocas).
+ // See the handling in spillIncomingValueForStatepoint for details.
+ setValue(&Relocate, SD);
+}
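+
+// Note on the record kinds handled above: SDValueNode is used for values
+// relocated within the statepoint's own block, VReg for values exported via a
+// virtual register, Spill for values reloaded from a statepoint stack slot,
+// and NoRelocate for values that never needed relocation (e.g. constants and
+// allocas).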
+
+void SelectionDAGBuilder::LowerDeoptimizeCall(const CallInst *CI) {
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::DEOPTIMIZE),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ // We don't lower calls to __llvm_deoptimize as varargs, but as a regular
+ // call. We also do not lower the return value to any virtual register, and
+ // change the immediately following return to a trap instruction.
+ LowerCallSiteWithDeoptBundleImpl(CI, Callee, /* EHPadBB = */ nullptr,
+ /* VarArgDisallowed = */ true,
+ /* ForceVoidReturnTy = */ true);
+}
+
+void SelectionDAGBuilder::LowerDeoptimizingReturn() {
+ // We do not lower the return value from llvm.deoptimize to any virtual
+ // register, and change the immediately following return to a trap
+ // instruction.
+ if (DAG.getTarget().Options.TrapUnreachable)
+ DAG.setRoot(
+ DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
+}
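+
+// Illustrative example (simplified IR) of the deoptimization lowering above:
+//   %r = call i32 (...) @llvm.experimental.deoptimize.i32(i32 %x) [ "deopt"(i32 %s) ]
+//   ret i32 %r
+// is emitted as a statepoint-wrapped call to the __llvm_deoptimize symbol with
+// a void return type, and the return that follows is turned into an ISD::TRAP
+// node when TargetOptions::TrapUnreachable is set.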
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
new file mode 100644
index 000000000000..addc0a7eef3a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -0,0 +1,126 @@
+//===- StatepointLowering.h - SDAGBuilder's statepoint code ---*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code used by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include <cassert>
+
+namespace llvm {
+
+class SelectionDAGBuilder;
+
+/// This class tracks both per-statepoint and per-selectiondag information.
+/// For each statepoint it tracks the locations of its gc values (incoming and
+/// relocated) and the list of gc.relocate calls scheduled for visiting (the
+/// latter is used for a debug mode consistency check only). The spill slot
+/// tracking works in concert with information in FunctionLoweringInfo.
+class StatepointLoweringState {
+public:
+ StatepointLoweringState() = default;
+
+ /// Reset all state tracking for a newly encountered safepoint. Also
+ /// performs some consistency checking.
+ void startNewStatepoint(SelectionDAGBuilder &Builder);
+
+ /// Clear the memory usage of this object. This is called from
+ /// SelectionDAGBuilder::clear. We require this is never called in the
+ /// midst of processing a statepoint sequence.
+ void clear();
+
+ /// Returns the spill location of a value incoming to the current
+ /// statepoint. Will return SDValue() if this value hasn't been
+ /// spilled. Otherwise, the value has already been spilled and no
+ /// further action is required by the caller.
+ SDValue getLocation(SDValue Val) {
+ auto I = Locations.find(Val);
+ if (I == Locations.end())
+ return SDValue();
+ return I->second;
+ }
+
+ void setLocation(SDValue Val, SDValue Location) {
+ assert(!Locations.count(Val) &&
+ "Trying to allocate already allocated location");
+ Locations[Val] = Location;
+ }
+
+ /// Record the fact that we expect to encounter a given gc_relocate
+ /// before the next statepoint. If we don't see it, we'll trigger an
+ /// assertion failure.
+ void scheduleRelocCall(const GCRelocateInst &RelocCall) {
+ // We are not interested in lowering dead instructions.
+ if (!RelocCall.use_empty())
+ PendingGCRelocateCalls.push_back(&RelocCall);
+ }
+
+ /// Remove this gc_relocate from the list we're expecting to see
+ /// before the next statepoint. If we weren't expecting to see
+ /// it, we'll trigger an assertion failure.
+ void relocCallVisited(const GCRelocateInst &RelocCall) {
+ // We are not interested in lowering dead instructions.
+ if (RelocCall.use_empty())
+ return;
+ auto I = llvm::find(PendingGCRelocateCalls, &RelocCall);
+ assert(I != PendingGCRelocateCalls.end() &&
+ "Visited unexpected gcrelocate call");
+ PendingGCRelocateCalls.erase(I);
+ }
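+
+ // Typical flow (informal sketch): LowerAsSTATEPOINT calls
+ // startNewStatepoint and schedules each same-block gc.relocate via
+ // scheduleRelocCall; visitGCRelocate then reports each of them through
+ // relocCallVisited, and the consistency checking verifies the pending list
+ // has drained before the next statepoint is processed.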
+
+ // TODO: Should add consistency tracking to ensure we encounter
+ // expected gc_result calls too.
+
+ /// Get a stack slot we can use to store a value of type ValueType. This
+ /// will hopefully be a recycled slot from another statepoint.
+ SDValue allocateStackSlot(EVT ValueType, SelectionDAGBuilder &Builder);
+
+ void reserveStackSlot(int Offset) {
+ assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
+ "out of bounds");
+ assert(!AllocatedStackSlots.test(Offset) && "already reserved!");
+ assert(NextSlotToAllocate <= (unsigned)Offset && "consistency!");
+ AllocatedStackSlots.set(Offset);
+ }
+
+ bool isStackSlotAllocated(int Offset) {
+ assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
+ "out of bounds");
+ return AllocatedStackSlots.test(Offset);
+ }
+
+private:
+ /// Maps a pre-relocation value (a gc pointer directly incoming into the
+ /// statepoint) to its location (currently only stack slots).
+ DenseMap<SDValue, SDValue> Locations;
+
+ /// A boolean indicator for each slot listed in the FunctionInfo as to
+ /// whether it has been used in the current statepoint. Since we try to
+ /// preserve stack slots across safepoints, there can be gaps in which
+ /// slots have been allocated.
+ SmallBitVector AllocatedStackSlots;
+
+ /// Points just beyond the last slot known to have been allocated
+ unsigned NextSlotToAllocate = 0;
+
+ /// Keep track of pending gcrelocate calls for consistency check
+ SmallVector<const GCRelocateInst *, 10> PendingGCRelocateCalls;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
new file mode 100644
index 000000000000..a84d35a6ea4e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -0,0 +1,10800 @@
+//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/CodeGenCommonISel.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DivisionByConstantInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cctype>
+using namespace llvm;
+
+/// NOTE: The TargetMachine owns TLOF.
+TargetLowering::TargetLowering(const TargetMachine &tm)
+ : TargetLoweringBase(tm) {}
+
+const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ return nullptr;
+}
+
+bool TargetLowering::isPositionIndependent() const {
+ return getTargetMachine().isPositionIndependent();
+}
+
+/// Check whether a given call node is in tail position within its function. If
+/// so, it sets Chain to the input chain of the tail call.
+bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
+ SDValue &Chain) const {
+ const Function &F = DAG.getMachineFunction().getFunction();
+
+ // First, check if tail calls have been disabled in this function.
+ if (F.getFnAttribute("disable-tail-calls").getValueAsBool())
+ return false;
+
+ // Conservatively require the attributes of the call to match those of
+ // the return. Ignore following attributes because they don't affect the
+ // call sequence.
+ AttrBuilder CallerAttrs(F.getContext(), F.getAttributes().getRetAttrs());
+ for (const auto &Attr : {Attribute::Alignment, Attribute::Dereferenceable,
+ Attribute::DereferenceableOrNull, Attribute::NoAlias,
+ Attribute::NonNull, Attribute::NoUndef})
+ CallerAttrs.removeAttribute(Attr);
+
+ if (CallerAttrs.hasAttributes())
+ return false;
+
+ // It's not safe to eliminate the sign / zero extension of the return value.
+ if (CallerAttrs.contains(Attribute::ZExt) ||
+ CallerAttrs.contains(Attribute::SExt))
+ return false;
+
+ // Check if the only use is a function return node.
+ return isUsedByReturnOnly(Node, Chain);
+}
+
+bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
+ const uint32_t *CallerPreservedMask,
+ const SmallVectorImpl<CCValAssign> &ArgLocs,
+ const SmallVectorImpl<SDValue> &OutVals) const {
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ const CCValAssign &ArgLoc = ArgLocs[I];
+ if (!ArgLoc.isRegLoc())
+ continue;
+ MCRegister Reg = ArgLoc.getLocReg();
+ // Only look at callee saved registers.
+ if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
+ continue;
+ // Check that we pass the value used for the caller.
+ // (We look for a CopyFromReg reading a virtual register that is used
+ // for the function live-in value of register Reg)
+ SDValue Value = OutVals[I];
+ if (Value->getOpcode() == ISD::AssertZext)
+ Value = Value.getOperand(0);
+ if (Value->getOpcode() != ISD::CopyFromReg)
+ return false;
+ Register ArgReg = cast<RegisterSDNode>(Value->getOperand(1))->getReg();
+ if (MRI.getLiveInPhysReg(ArgReg) != Reg)
+ return false;
+ }
+ return true;
+}
+
+/// Set CallLoweringInfo attribute flags based on a call instruction
+/// and called function attributes.
+void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call,
+ unsigned ArgIdx) {
+ IsSExt = Call->paramHasAttr(ArgIdx, Attribute::SExt);
+ IsZExt = Call->paramHasAttr(ArgIdx, Attribute::ZExt);
+ IsInReg = Call->paramHasAttr(ArgIdx, Attribute::InReg);
+ IsSRet = Call->paramHasAttr(ArgIdx, Attribute::StructRet);
+ IsNest = Call->paramHasAttr(ArgIdx, Attribute::Nest);
+ IsByVal = Call->paramHasAttr(ArgIdx, Attribute::ByVal);
+ IsPreallocated = Call->paramHasAttr(ArgIdx, Attribute::Preallocated);
+ IsInAlloca = Call->paramHasAttr(ArgIdx, Attribute::InAlloca);
+ IsReturned = Call->paramHasAttr(ArgIdx, Attribute::Returned);
+ IsSwiftSelf = Call->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
+ IsSwiftAsync = Call->paramHasAttr(ArgIdx, Attribute::SwiftAsync);
+ IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError);
+ Alignment = Call->getParamStackAlign(ArgIdx);
+ IndirectType = nullptr;
+ assert(IsByVal + IsPreallocated + IsInAlloca + IsSRet <= 1 &&
+ "multiple ABI attributes?");
+ if (IsByVal) {
+ IndirectType = Call->getParamByValType(ArgIdx);
+ if (!Alignment)
+ Alignment = Call->getParamAlign(ArgIdx);
+ }
+ if (IsPreallocated)
+ IndirectType = Call->getParamPreallocatedType(ArgIdx);
+ if (IsInAlloca)
+ IndirectType = Call->getParamInAllocaType(ArgIdx);
+ if (IsSRet)
+ IndirectType = Call->getParamStructRetType(ArgIdx);
+}
+
+/// Generate a libcall taking the given operands as arguments and returning a
+/// result of type RetVT.
+std::pair<SDValue, SDValue>
+TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
+ ArrayRef<SDValue> Ops,
+ MakeLibCallOptions CallOptions,
+ const SDLoc &dl,
+ SDValue InChain) const {
+ if (!InChain)
+ InChain = DAG.getEntryNode();
+
+ TargetLowering::ArgListTy Args;
+ Args.reserve(Ops.size());
+
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0; i < Ops.size(); ++i) {
+ SDValue NewOp = Ops[i];
+ Entry.Node = NewOp;
+ Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+ Entry.IsSExt = shouldSignExtendTypeInLibCall(NewOp.getValueType(),
+ CallOptions.IsSExt);
+ Entry.IsZExt = !Entry.IsSExt;
+
+ if (CallOptions.IsSoften &&
+ !shouldExtendTypeInLibCall(CallOptions.OpsVTBeforeSoften[i])) {
+ Entry.IsSExt = Entry.IsZExt = false;
+ }
+ Args.push_back(Entry);
+ }
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported library call operation!");
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ bool signExtend = shouldSignExtendTypeInLibCall(RetVT, CallOptions.IsSExt);
+ bool zeroExtend = !signExtend;
+
+ if (CallOptions.IsSoften &&
+ !shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) {
+ signExtend = zeroExtend = false;
+ }
+
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setNoReturn(CallOptions.DoesNotReturn)
+ .setDiscardResult(!CallOptions.IsReturnValueUsed)
+ .setIsPostTypeLegalization(CallOptions.IsPostTypeLegalization)
+ .setSExtResult(signExtend)
+ .setZExtResult(zeroExtend);
+ return LowerCallTo(CLI);
+}
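+
+// Example usage (illustrative): a target lowering an f64 remainder to a
+// libcall might do something like
+//   SDValue Ops[2] = {LHS, RHS};
+//   MakeLibCallOptions CallOptions;
+//   std::pair<SDValue, SDValue> R =
+//       makeLibCall(DAG, RTLIB::REM_F64, MVT::f64, Ops, CallOptions, dl, Chain);
+// where R.first is the call result and R.second is the output chain.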
+
+bool TargetLowering::findOptimalMemOpLowering(
+ std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
+ unsigned SrcAS, const AttributeList &FuncAttributes) const {
+ if (Limit != ~unsigned(0) && Op.isMemcpyWithFixedDstAlign() &&
+ Op.getSrcAlign() < Op.getDstAlign())
+ return false;
+
+ EVT VT = getOptimalMemOpType(Op, FuncAttributes);
+
+ if (VT == MVT::Other) {
+ // Use the largest integer type whose alignment constraints are satisfied.
+ // We only need to check DstAlign here as SrcAlign is always greater than or
+ // equal to DstAlign (or zero).
+ VT = MVT::i64;
+ if (Op.isFixedDstAlign())
+ while (Op.getDstAlign() < (VT.getSizeInBits() / 8) &&
+ !allowsMisalignedMemoryAccesses(VT, DstAS, Op.getDstAlign()))
+ VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+ assert(VT.isInteger());
+
+ // Find the largest legal integer type.
+ MVT LVT = MVT::i64;
+ while (!isTypeLegal(LVT))
+ LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
+ assert(LVT.isInteger());
+
+ // If the type we've chosen is larger than the largest legal integer type
+ // then use that instead.
+ if (VT.bitsGT(LVT))
+ VT = LVT;
+ }
+
+ unsigned NumMemOps = 0;
+ uint64_t Size = Op.size();
+ while (Size) {
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ while (VTSize > Size) {
+ // For now, only use non-vector loads / stores for the left-over pieces.
+ EVT NewVT = VT;
+ unsigned NewVTSize;
+
+ bool Found = false;
+ if (VT.isVector() || VT.isFloatingPoint()) {
+ NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
+ if (isOperationLegalOrCustom(ISD::STORE, NewVT) &&
+ isSafeMemOpType(NewVT.getSimpleVT()))
+ Found = true;
+ else if (NewVT == MVT::i64 &&
+ isOperationLegalOrCustom(ISD::STORE, MVT::f64) &&
+ isSafeMemOpType(MVT::f64)) {
+ // i64 is usually not legal on 32-bit targets, but f64 may be.
+ NewVT = MVT::f64;
+ Found = true;
+ }
+ }
+
+ if (!Found) {
+ do {
+ NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
+ if (NewVT == MVT::i8)
+ break;
+ } while (!isSafeMemOpType(NewVT.getSimpleVT()));
+ }
+ NewVTSize = NewVT.getSizeInBits() / 8;
+
+ // If the new VT cannot cover all of the remaining bits, then consider
+ // issuing a (or a pair of) unaligned and overlapping load / store.
+ unsigned Fast;
+ if (NumMemOps && Op.allowOverlap() && NewVTSize < Size &&
+ allowsMisalignedMemoryAccesses(
+ VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
+ MachineMemOperand::MONone, &Fast) &&
+ Fast)
+ VTSize = Size;
+ else {
+ VT = NewVT;
+ VTSize = NewVTSize;
+ }
+ }
+
+ if (++NumMemOps > Limit)
+ return false;
+
+ MemOps.push_back(VT);
+ Size -= VTSize;
+ }
+
+ return true;
+}
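+
+// Illustrative example: for a 15-byte memcpy where getOptimalMemOpType
+// returns MVT::Other and i64 is legal, MemOps typically ends up as
+// {i64, i32, i16, i8}; if Op.allowOverlap() and fast misaligned accesses are
+// available, it may instead be two overlapping i64 accesses.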
+
+/// Soften the operands of a comparison. This code is shared among BR_CC,
+/// SELECT_CC, and SETCC handlers.
+void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
+ SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ const SDLoc &dl, const SDValue OldLHS,
+ const SDValue OldRHS) const {
+ SDValue Chain;
+ return softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, dl, OldLHS,
+ OldRHS, Chain);
+}
+
+void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
+ SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ const SDLoc &dl, const SDValue OldLHS,
+ const SDValue OldRHS,
+ SDValue &Chain,
+ bool IsSignaling) const {
+ // FIXME: Currently we cannot really respect all IEEE predicates due to libgcc
+ // not supporting it. We can update this code when libgcc provides such
+ // functions.
+
+ assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 || VT == MVT::ppcf128)
+ && "Unsupported setcc type!");
+
+ // Expand into one or more soft-fp libcall(s).
+ RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL;
+ bool ShouldInvertCC = false;
+ switch (CCCode) {
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 :
+ (VT == MVT::f64) ? RTLIB::OEQ_F64 :
+ (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128;
+ break;
+ case ISD::SETNE:
+ case ISD::SETUNE:
+ LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 :
+ (VT == MVT::f64) ? RTLIB::UNE_F64 :
+ (VT == MVT::f128) ? RTLIB::UNE_F128 : RTLIB::UNE_PPCF128;
+ break;
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 :
+ (VT == MVT::f64) ? RTLIB::OGE_F64 :
+ (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128;
+ break;
+ case ISD::SETLT:
+ case ISD::SETOLT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
+ (VT == MVT::f64) ? RTLIB::OLT_F64 :
+ (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128;
+ break;
+ case ISD::SETLE:
+ case ISD::SETOLE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 :
+ (VT == MVT::f64) ? RTLIB::OLE_F64 :
+ (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128;
+ break;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
+ (VT == MVT::f64) ? RTLIB::OGT_F64 :
+ (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128;
+ break;
+ case ISD::SETO:
+ ShouldInvertCC = true;
+ [[fallthrough]];
+ case ISD::SETUO:
+ LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 :
+ (VT == MVT::f64) ? RTLIB::UO_F64 :
+ (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128;
+ break;
+ case ISD::SETONE:
+ // SETONE = O && UNE
+ ShouldInvertCC = true;
+ [[fallthrough]];
+ case ISD::SETUEQ:
+ LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 :
+ (VT == MVT::f64) ? RTLIB::UO_F64 :
+ (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128;
+ LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 :
+ (VT == MVT::f64) ? RTLIB::OEQ_F64 :
+ (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128;
+ break;
+ default:
+ // Invert CC for unordered comparisons
+ ShouldInvertCC = true;
+ switch (CCCode) {
+ case ISD::SETULT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 :
+ (VT == MVT::f64) ? RTLIB::OGE_F64 :
+ (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128;
+ break;
+ case ISD::SETULE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
+ (VT == MVT::f64) ? RTLIB::OGT_F64 :
+ (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128;
+ break;
+ case ISD::SETUGT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 :
+ (VT == MVT::f64) ? RTLIB::OLE_F64 :
+ (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128;
+ break;
+ case ISD::SETUGE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
+ (VT == MVT::f64) ? RTLIB::OLT_F64 :
+ (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128;
+ break;
+ default: llvm_unreachable("Do not know how to soften this setcc!");
+ }
+ }
+
+ // Use the target specific return value for comparison lib calls.
+ EVT RetVT = getCmpLibcallReturnType();
+ SDValue Ops[2] = {NewLHS, NewRHS};
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { OldLHS.getValueType(),
+ OldRHS.getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, RetVT, true);
+ auto Call = makeLibCall(DAG, LC1, RetVT, Ops, CallOptions, dl, Chain);
+ NewLHS = Call.first;
+ NewRHS = DAG.getConstant(0, dl, RetVT);
+
+ CCCode = getCmpLibcallCC(LC1);
+ if (ShouldInvertCC) {
+ assert(RetVT.isInteger());
+ CCCode = getSetCCInverse(CCCode, RetVT);
+ }
+
+ if (LC2 == RTLIB::UNKNOWN_LIBCALL) {
+ // Update Chain.
+ Chain = Call.second;
+ } else {
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT);
+ SDValue Tmp = DAG.getSetCC(dl, SetCCVT, NewLHS, NewRHS, CCCode);
+ auto Call2 = makeLibCall(DAG, LC2, RetVT, Ops, CallOptions, dl, Chain);
+ CCCode = getCmpLibcallCC(LC2);
+ if (ShouldInvertCC)
+ CCCode = getSetCCInverse(CCCode, RetVT);
+ NewLHS = DAG.getSetCC(dl, SetCCVT, Call2.first, NewRHS, CCCode);
+ if (Chain)
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Call.second,
+ Call2.second);
+ NewLHS = DAG.getNode(ShouldInvertCC ? ISD::AND : ISD::OR, dl,
+ Tmp.getValueType(), Tmp, NewLHS);
+ NewRHS = SDValue();
+ }
+}
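+
+// Illustrative example: softening (setole f32 %a, %b) emits a call to the
+// OLE_F32 comparison libcall (usually __lesf2) and rewrites the comparison to
+// test the libcall's integer result against zero with the condition code
+// reported by getCmpLibcallCC (SETLE by default for this libcall).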
+
+/// Return the entry encoding for a jump table in the current function. The
+/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
+unsigned TargetLowering::getJumpTableEncoding() const {
+ // In non-pic modes, just use the address of a block.
+ if (!isPositionIndependent())
+ return MachineJumpTableInfo::EK_BlockAddress;
+
+ // In PIC mode, if the target supports a GPRel32 directive, use it.
+ if (getTargetMachine().getMCAsmInfo()->getGPRel32Directive() != nullptr)
+ return MachineJumpTableInfo::EK_GPRel32BlockAddress;
+
+ // Otherwise, use a label difference.
+ return MachineJumpTableInfo::EK_LabelDifference32;
+}
+
+SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ // If our PIC model is GP relative, use the global offset table as the base.
+ unsigned JTEncoding = getJumpTableEncoding();
+
+ if ((JTEncoding == MachineJumpTableInfo::EK_GPRel64BlockAddress) ||
+ (JTEncoding == MachineJumpTableInfo::EK_GPRel32BlockAddress))
+ return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy(DAG.getDataLayout()));
+
+ return Table;
+}
+
+/// This returns the relocation base for the given PIC jumptable, the same as
+/// getPICJumpTableRelocBase, but as an MCExpr.
+const MCExpr *
+TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI,MCContext &Ctx) const{
+ // The normal PIC reloc base is the label at the start of the jump table.
+ return MCSymbolRefExpr::create(MF->getJTISymbol(JTI, Ctx), Ctx);
+}
+
+bool
+TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ const TargetMachine &TM = getTargetMachine();
+ const GlobalValue *GV = GA->getGlobal();
+
+ // If the address is not even local to this DSO we will have to load it from
+ // a got and then add the offset.
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return false;
+
+ // If the code is position independent we will have to add a base register.
+ if (isPositionIndependent())
+ return false;
+
+ // Otherwise we can do it.
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Optimization Methods
+//===----------------------------------------------------------------------===//
+
+/// If the specified instruction has a constant integer operand and there are
+/// bits set in that constant that are not demanded, then clear those bits and
+/// return true.
+bool TargetLowering::ShrinkDemandedConstant(SDValue Op,
+ const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
+ SDLoc DL(Op);
+ unsigned Opcode = Op.getOpcode();
+
+ // Early-out if we've ended up calling an undemanded node; leave this to
+ // constant folding.
+ if (DemandedBits.isZero() || DemandedElts.isZero())
+ return false;
+
+ // Do target-specific constant optimization.
+ if (targetShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
+ return TLO.New.getNode();
+
+ // FIXME: ISD::SELECT, ISD::SELECT_CC
+ switch (Opcode) {
+ default:
+ break;
+ case ISD::XOR:
+ case ISD::AND:
+ case ISD::OR: {
+ auto *Op1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!Op1C || Op1C->isOpaque())
+ return false;
+
+ // If this is a 'not' op, don't touch it because that's a canonical form.
+ const APInt &C = Op1C->getAPIntValue();
+ if (Opcode == ISD::XOR && DemandedBits.isSubsetOf(C))
+ return false;
+
+ if (!C.isSubsetOf(DemandedBits)) {
+ EVT VT = Op.getValueType();
+ SDValue NewC = TLO.DAG.getConstant(DemandedBits & C, DL, VT);
+ SDValue NewOp = TLO.DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+ }
+
+ break;
+ }
+ }
+
+ return false;
+}
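+
+// Illustrative example: for (and X, 0xFF) where only the low four bits are
+// demanded, the constant is not a subset of the demanded bits, so it is
+// shrunk and the node is rewritten as (and X, 0x0F).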
+
+bool TargetLowering::ShrinkDemandedConstant(SDValue Op,
+ const APInt &DemandedBits,
+ TargetLoweringOpt &TLO) const {
+ EVT VT = Op.getValueType();
+ APInt DemandedElts = VT.isVector()
+ ? APInt::getAllOnes(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO);
+}
+
+/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.
+/// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
+/// generalized for targets with other types of implicit widening casts.
+bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
+ const APInt &DemandedBits,
+ TargetLoweringOpt &TLO) const {
+ assert(Op.getNumOperands() == 2 &&
+ "ShrinkDemandedOp only supports binary operators!");
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "ShrinkDemandedOp only supports nodes with one result!");
+
+ EVT VT = Op.getValueType();
+ SelectionDAG &DAG = TLO.DAG;
+ SDLoc dl(Op);
+
+ // Early return, as this function cannot handle vector types.
+ if (VT.isVector())
+ return false;
+
+ // Don't do this if the node has another user, which may require the
+ // full value.
+ if (!Op.getNode()->hasOneUse())
+ return false;
+
+ // Search for the smallest integer type with free casts to and from
+ // Op's type. For expedience, just check power-of-2 integer types.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned DemandedSize = DemandedBits.getActiveBits();
+ for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
+ SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+ EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), SmallVTBits);
+ if (TLI.isTruncateFree(VT, SmallVT) && TLI.isZExtFree(SmallVT, VT)) {
+ // We found a type with free casts.
+ SDValue X = DAG.getNode(
+ Op.getOpcode(), dl, SmallVT,
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1)));
+ assert(DemandedSize <= SmallVTBits && "Narrowed below demanded bits?");
+ SDValue Z = DAG.getNode(ISD::ANY_EXTEND, dl, VT, X);
+ return TLO.CombineTo(Op, Z);
+ }
+ }
+ return false;
+}
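+
+// Illustrative example: for an i64 add whose single user only demands the low
+// 8 bits, on a target where truncation to and zero extension from a narrower
+// type are free, the add is rewritten as
+//   (any_extend (add (truncate x), (truncate y)))
+// using the smallest power-of-2 integer type with such free casts.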
+
+bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ KnownBits Known;
+
+ bool Simplified = SimplifyDemandedBits(Op, DemandedBits, Known, TLO);
+ if (Simplified) {
+ DCI.AddToWorklist(Op.getNode());
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ return Simplified;
+}
+
+bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ KnownBits Known;
+
+ bool Simplified =
+ SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO);
+ if (Simplified) {
+ DCI.AddToWorklist(Op.getNode());
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+ return Simplified;
+}
+
+bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth,
+ bool AssumeSingleUse) const {
+ EVT VT = Op.getValueType();
+
+ // Since the number of lanes in a scalable vector is unknown at compile time,
+ // we track one bit which is implicitly broadcast to all lanes. This means
+ // that all lanes in a scalable vector are considered demanded.
+ APInt DemandedElts = VT.isFixedLengthVector()
+ ? APInt::getAllOnes(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, Depth,
+ AssumeSingleUse);
+}
+
+// TODO: Under what circumstances can we create nodes? Constant folding?
+SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const {
+ EVT VT = Op.getValueType();
+
+ // Limit search depth.
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ // Ignore UNDEFs.
+ if (Op.isUndef())
+ return SDValue();
+
+ // Not demanding any bits/elts from Op.
+ if (DemandedBits == 0 || DemandedElts == 0)
+ return DAG.getUNDEF(VT);
+
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ unsigned NumElts = DemandedElts.getBitWidth();
+ unsigned BitWidth = DemandedBits.getBitWidth();
+ KnownBits LHSKnown, RHSKnown;
+ switch (Op.getOpcode()) {
+ case ISD::BITCAST: {
+ if (VT.isScalableVector())
+ return SDValue();
+
+ SDValue Src = peekThroughBitcasts(Op.getOperand(0));
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Op.getValueType();
+ if (SrcVT == DstVT)
+ return Src;
+
+ unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
+ unsigned NumDstEltBits = DstVT.getScalarSizeInBits();
+ if (NumSrcEltBits == NumDstEltBits)
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Src, DemandedBits, DemandedElts, DAG, Depth + 1))
+ return DAG.getBitcast(DstVT, V);
+
+ if (SrcVT.isVector() && (NumDstEltBits % NumSrcEltBits) == 0) {
+ unsigned Scale = NumDstEltBits / NumSrcEltBits;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
+ for (unsigned i = 0; i != Scale; ++i) {
+ unsigned EltOffset = IsLE ? i : (Scale - 1 - i);
+ unsigned BitOffset = EltOffset * NumSrcEltBits;
+ APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset);
+ if (!Sub.isZero()) {
+ DemandedSrcBits |= Sub;
+ for (unsigned j = 0; j != NumElts; ++j)
+ if (DemandedElts[j])
+ DemandedSrcElts.setBit((j * Scale) + i);
+ }
+ }
+
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1))
+ return DAG.getBitcast(DstVT, V);
+ }
+
+ // TODO - bigendian once we have test coverage.
+ if (IsLE && (NumSrcEltBits % NumDstEltBits) == 0) {
+ unsigned Scale = NumSrcEltBits / NumDstEltBits;
+ unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
+ APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ unsigned Offset = (i % Scale) * NumDstEltBits;
+ DemandedSrcBits.insertBits(DemandedBits, Offset);
+ DemandedSrcElts.setBit(i / Scale);
+ }
+
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1))
+ return DAG.getBitcast(DstVT, V);
+ }
+
+ break;
+ }
+ case ISD::AND: {
+ LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if (DemandedBits.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return Op.getOperand(0);
+ if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return Op.getOperand(1);
+ break;
+ }
+ case ISD::OR: {
+ LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if (DemandedBits.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return Op.getOperand(0);
+ if (DemandedBits.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return Op.getOperand(1);
+ break;
+ }
+ case ISD::XOR: {
+ LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other.
+ if (DemandedBits.isSubsetOf(RHSKnown.Zero))
+ return Op.getOperand(0);
+ if (DemandedBits.isSubsetOf(LHSKnown.Zero))
+ return Op.getOperand(1);
+ break;
+ }
+ case ISD::SHL: {
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ if (const APInt *MaxSA =
+ DAG.getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ShAmt = MaxSA->getZExtValue();
+ unsigned NumSignBits =
+ DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits))
+ return Op0;
+ }
+ break;
+ }
+ case ISD::SETCC: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ // If (1) we only need the sign-bit, (2) the setcc operands are the same
+ // width as the setcc result, and (3) the result of a setcc conforms to 0 or
+ // -1, we may be able to bypass the setcc.
+ if (DemandedBits.isSignMask() &&
+ Op0.getScalarValueSizeInBits() == BitWidth &&
+ getBooleanContents(Op0.getValueType()) ==
+ BooleanContent::ZeroOrNegativeOneBooleanContent) {
+ // If we're testing X < 0, then this compare isn't needed - just use X!
+ // FIXME: We're limiting to integer types here, but this should also work
+ // if we don't care about FP signed-zero. The use of SETLT with FP means
+ // that we don't care about NaNs.
+ if (CC == ISD::SETLT && Op1.getValueType().isInteger() &&
+ (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode())))
+ return Op0;
+ }
+ break;
+ }
+ case ISD::SIGN_EXTEND_INREG: {
+ // If none of the extended bits are demanded, eliminate the sextinreg.
+ SDValue Op0 = Op.getOperand(0);
+ EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ unsigned ExBits = ExVT.getScalarSizeInBits();
+ if (DemandedBits.getActiveBits() <= ExBits &&
+ shouldRemoveRedundantExtend(Op))
+ return Op0;
+ // If the input is already sign extended, just drop the extension.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ if (NumSignBits >= (BitWidth - ExBits + 1))
+ return Op0;
+ break;
+ }
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG: {
+ if (VT.isScalableVector())
+ return SDValue();
+
+ // If we only want the lowest element and none of the extended bits, then we can
+ // return the bitcasted source vector.
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Op.getValueType();
+ if (IsLE && DemandedElts == 1 &&
+ DstVT.getSizeInBits() == SrcVT.getSizeInBits() &&
+ DemandedBits.getActiveBits() <= SrcVT.getScalarSizeInBits()) {
+ return DAG.getBitcast(DstVT, Src);
+ }
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ if (VT.isScalableVector())
+ return SDValue();
+
+ // If we don't demand the inserted element, return the base vector.
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ EVT VecVT = Vec.getValueType();
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
+ !DemandedElts[CIdx->getZExtValue()])
+ return Vec;
+ break;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ if (VT.isScalableVector())
+ return SDValue();
+
+ SDValue Vec = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ // If we don't demand the inserted subvector, return the base vector.
+ if (DemandedSubElts == 0)
+ return Vec;
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ assert(!VT.isScalableVector());
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+ // If all the demanded elts are from one operand and are inline,
+ // then we can use the operand directly.
+ bool AllUndef = true, IdentityLHS = true, IdentityRHS = true;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (M < 0 || !DemandedElts[i])
+ continue;
+ AllUndef = false;
+ IdentityLHS &= (M == (int)i);
+ IdentityRHS &= ((M - NumElts) == i);
+ }
+
+ if (AllUndef)
+ return DAG.getUNDEF(Op.getValueType());
+ if (IdentityLHS)
+ return Op.getOperand(0);
+ if (IdentityRHS)
+ return Op.getOperand(1);
+ break;
+ }
+ default:
+ // TODO: Probably okay to remove after audit; here to reduce change size
+ // in initial enablement patch for scalable vectors
+ if (VT.isScalableVector())
+ return SDValue();
+
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
+ if (SDValue V = SimplifyMultipleUseDemandedBitsForTargetNode(
+ Op, DemandedBits, DemandedElts, DAG, Depth))
+ return V;
+ break;
+ }
+ return SDValue();
+}
+
+SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
+ SDValue Op, const APInt &DemandedBits, SelectionDAG &DAG,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ // Since the number of lanes in a scalable vector is unknown at compile time,
+ // we track one bit which is implicitly broadcast to all lanes. This means
+ // that all lanes in a scalable vector are considered demanded.
+ APInt DemandedElts = VT.isFixedLengthVector()
+ ? APInt::getAllOnes(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts, DAG,
+ Depth);
+}
+
+SDValue TargetLowering::SimplifyMultipleUseDemandedVectorElts(
+ SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG,
+ unsigned Depth) const {
+ APInt DemandedBits = APInt::getAllOnes(Op.getScalarValueSizeInBits());
+ return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts, DAG,
+ Depth);
+}
+
+// Attempt to form ext(avgfloor(A, B)) from shr(add(ext(A), ext(B)), 1),
+// or to form ext(avgceil(A, B)) from shr(add(ext(A), ext(B), 1), 1).
+static SDValue combineShiftToAVG(SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ unsigned Depth) {
+ assert((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) &&
+ "SRL or SRA node is required here!");
+ // Is the right shift using an immediate value of 1?
+ ConstantSDNode *N1C = isConstOrConstSplat(Op.getOperand(1), DemandedElts);
+ if (!N1C || !N1C->isOne())
+ return SDValue();
+
+ // We are looking for an avgfloor
+ // add(ext, ext)
+ // or one of these as an avgceil
+ // add(add(ext, ext), 1)
+ // add(add(ext, 1), ext)
+ // add(ext, add(ext, 1))
+ SDValue Add = Op.getOperand(0);
+ if (Add.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ SDValue ExtOpA = Add.getOperand(0);
+ SDValue ExtOpB = Add.getOperand(1);
+ SDValue Add2;
+ auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3, SDValue A) {
+ ConstantSDNode *ConstOp;
+ if ((ConstOp = isConstOrConstSplat(Op2, DemandedElts)) &&
+ ConstOp->isOne()) {
+ ExtOpA = Op1;
+ ExtOpB = Op3;
+ Add2 = A;
+ return true;
+ }
+ if ((ConstOp = isConstOrConstSplat(Op3, DemandedElts)) &&
+ ConstOp->isOne()) {
+ ExtOpA = Op1;
+ ExtOpB = Op2;
+ Add2 = A;
+ return true;
+ }
+ return false;
+ };
+ bool IsCeil =
+ (ExtOpA.getOpcode() == ISD::ADD &&
+ MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB, ExtOpA)) ||
+ (ExtOpB.getOpcode() == ISD::ADD &&
+ MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA, ExtOpB));
+
+ // If the shift is signed (sra):
+ // - Needs >= 2 sign bits for both operands.
+ // - Needs >= 2 zero bits.
+ // If the shift is unsigned (srl):
+ // - Needs >= 1 zero bit for both operands.
+ // - Needs 1 demanded bit zero and >= 2 sign bits.
+ unsigned ShiftOpc = Op.getOpcode();
+ bool IsSigned = false;
+ unsigned KnownBits;
+ unsigned NumSignedA = DAG.ComputeNumSignBits(ExtOpA, DemandedElts, Depth);
+ unsigned NumSignedB = DAG.ComputeNumSignBits(ExtOpB, DemandedElts, Depth);
+ unsigned NumSigned = std::min(NumSignedA, NumSignedB) - 1;
+ unsigned NumZeroA =
+ DAG.computeKnownBits(ExtOpA, DemandedElts, Depth).countMinLeadingZeros();
+ unsigned NumZeroB =
+ DAG.computeKnownBits(ExtOpB, DemandedElts, Depth).countMinLeadingZeros();
+ unsigned NumZero = std::min(NumZeroA, NumZeroB);
+
+ switch (ShiftOpc) {
+ default:
+ llvm_unreachable("Unexpected ShiftOpc in combineShiftToAVG");
+ case ISD::SRA: {
+ if (NumZero >= 2 && NumSigned < NumZero) {
+ IsSigned = false;
+ KnownBits = NumZero;
+ break;
+ }
+ if (NumSigned >= 1) {
+ IsSigned = true;
+ KnownBits = NumSigned;
+ break;
+ }
+ return SDValue();
+ }
+ case ISD::SRL: {
+ if (NumZero >= 1 && NumSigned < NumZero) {
+ IsSigned = false;
+ KnownBits = NumZero;
+ break;
+ }
+ if (NumSigned >= 1 && DemandedBits.isSignBitClear()) {
+ IsSigned = true;
+ KnownBits = NumSigned;
+ break;
+ }
+ return SDValue();
+ }
+ }
+
+ unsigned AVGOpc = IsCeil ? (IsSigned ? ISD::AVGCEILS : ISD::AVGCEILU)
+ : (IsSigned ? ISD::AVGFLOORS : ISD::AVGFLOORU);
+
+ // Find the smallest power-of-2 type that is legal for this vector size and
+ // operation, given the original type size and the number of known sign/zero
+ // bits.
+ EVT VT = Op.getValueType();
+ unsigned MinWidth =
+ std::max<unsigned>(VT.getScalarSizeInBits() - KnownBits, 8);
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_ceil(MinWidth));
+ if (VT.isVector())
+ NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
+ if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT)) {
+ // If the AVG operation is not legal at the narrower type, but (both) adds
+ // are known never to overflow, we can use the larger type size to do the
+ // transform.
+ if (!TLI.isOperationLegalOrCustom(AVGOpc, VT))
+ return SDValue();
+
+ if (DAG.computeOverflowForAdd(IsSigned, Add.getOperand(0),
+ Add.getOperand(1)) ==
+ SelectionDAG::OFK_Never &&
+ (!Add2 || DAG.computeOverflowForAdd(IsSigned, Add2.getOperand(0),
+ Add2.getOperand(1)) ==
+ SelectionDAG::OFK_Never))
+ NVT = VT;
+ else
+ return SDValue();
+ }
+
+ SDLoc DL(Op);
+ SDValue ResultAVG =
+ DAG.getNode(AVGOpc, DL, NVT, DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpA),
+ DAG.getNode(ISD::TRUNCATE, DL, NVT, ExtOpB));
+ return DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT,
+ ResultAVG);
+}
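+
+// Illustrative example: with i8 values zero-extended to i32,
+//   (srl (add (zext %a), (zext %b)), 1)
+// is recognized as an unsigned floor average and rewritten as a
+// zero-extended ISD::AVGFLOORU on a narrower legal type, with the operands
+// truncated to that type.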
+
+/// Look at Op. At this point, we know that only the OriginalDemandedBits of the
+/// result of Op are ever used downstream. If we can use this information to
+/// simplify Op, create a new simplified DAG node and return true, returning the
+/// original and new nodes in Old and New. Otherwise, analyze the expression and
+/// return a mask of Known bits for the expression (used to simplify the
+/// caller). The Known bits may only be accurate for those bits in the
+/// OriginalDemandedBits and OriginalDemandedElts.
+bool TargetLowering::SimplifyDemandedBits(
+ SDValue Op, const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+ unsigned Depth, bool AssumeSingleUse) const {
+ unsigned BitWidth = OriginalDemandedBits.getBitWidth();
+ assert(Op.getScalarValueSizeInBits() == BitWidth &&
+ "Mask size mismatches value type size!");
+
+ // Don't know anything.
+ Known = KnownBits(BitWidth);
+
+ EVT VT = Op.getValueType();
+ bool IsLE = TLO.DAG.getDataLayout().isLittleEndian();
+ unsigned NumElts = OriginalDemandedElts.getBitWidth();
+ assert((!VT.isFixedLengthVector() || NumElts == VT.getVectorNumElements()) &&
+ "Unexpected vector size");
+
+ APInt DemandedBits = OriginalDemandedBits;
+ APInt DemandedElts = OriginalDemandedElts;
+ SDLoc dl(Op);
+ auto &DL = TLO.DAG.getDataLayout();
+
+ // Undef operand.
+ if (Op.isUndef())
+ return false;
+
+ // We can't simplify target constants.
+ if (Op.getOpcode() == ISD::TargetConstant)
+ return false;
+
+ if (Op.getOpcode() == ISD::Constant) {
+ // We know all of the bits for a constant!
+ Known = KnownBits::makeConstant(cast<ConstantSDNode>(Op)->getAPIntValue());
+ return false;
+ }
+
+ if (Op.getOpcode() == ISD::ConstantFP) {
+ // We know all of the bits for a floating point constant!
+ Known = KnownBits::makeConstant(
+ cast<ConstantFPSDNode>(Op)->getValueAPF().bitcastToAPInt());
+ return false;
+ }
+
+ // Other users may use these bits.
+ bool HasMultiUse = false;
+ if (!AssumeSingleUse && !Op.getNode()->hasOneUse()) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth) {
+ // Limit search depth.
+ return false;
+ }
+ // Allow multiple uses, just set the DemandedBits/Elts to all bits.
+ DemandedBits = APInt::getAllOnes(BitWidth);
+ DemandedElts = APInt::getAllOnes(NumElts);
+ HasMultiUse = true;
+ } else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) {
+ // Not demanding any bits/elts from Op.
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ } else if (Depth >= SelectionDAG::MaxRecursionDepth) {
+ // Limit search depth.
+ return false;
+ }
+
+ KnownBits Known2;
+ switch (Op.getOpcode()) {
+ case ISD::SCALAR_TO_VECTOR: {
+ if (VT.isScalableVector())
+ return false;
+ if (!DemandedElts[0])
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+
+ KnownBits SrcKnown;
+ SDValue Src = Op.getOperand(0);
+ unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
+ APInt SrcDemandedBits = DemandedBits.zext(SrcBitWidth);
+ if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcKnown, TLO, Depth + 1))
+ return true;
+
+ // Upper elements are undef, so only get the knownbits if we just demand
+ // the bottom element.
+ if (DemandedElts == 1)
+ Known = SrcKnown.anyextOrTrunc(BitWidth);
+ break;
+ }
+ case ISD::BUILD_VECTOR:
+ // Collect the known bits that are shared by every demanded element.
+ // TODO: Call SimplifyDemandedBits for non-constant demanded elements.
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ return false; // Don't fall through, will infinitely loop.
+ case ISD::LOAD: {
+ auto *LD = cast<LoadSDNode>(Op);
+ if (getTargetConstantFromLoad(LD)) {
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ return false; // Don't fall through, will infinitely loop.
+ }
+ if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
+ // If this is a ZEXTLoad and we are looking at the loaded value.
+ EVT MemVT = LD->getMemoryVT();
+ unsigned MemBits = MemVT.getScalarSizeInBits();
+ Known.Zero.setBitsFrom(MemBits);
+ return false; // Don't fall through, will infinitely loop.
+ }
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ if (VT.isScalableVector())
+ return false;
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ EVT VecVT = Vec.getValueType();
+
+ // If index isn't constant, assume we need all vector elements AND the
+ // inserted element.
+ APInt DemandedVecElts(DemandedElts);
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
+ unsigned Idx = CIdx->getZExtValue();
+ DemandedVecElts.clearBit(Idx);
+
+ // Inserted element is not required.
+ if (!DemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+ }
+
+ KnownBits KnownScl;
+ unsigned NumSclBits = Scl.getScalarValueSizeInBits();
+ APInt DemandedSclBits = DemandedBits.zextOrTrunc(NumSclBits);
+ if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
+ return true;
+
+ Known = KnownScl.anyextOrTrunc(BitWidth);
+
+ KnownBits KnownVec;
+ if (SimplifyDemandedBits(Vec, DemandedBits, DemandedVecElts, KnownVec, TLO,
+ Depth + 1))
+ return true;
+
+ if (!!DemandedVecElts)
+ Known = Known.intersectWith(KnownVec);
+
+ return false;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+ if (VT.isScalableVector())
+ return false;
+ // Demand any elements from the subvector and the remainder from the src it
+ // is inserted into.
+ SDValue Src = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx);
+
+ KnownBits KnownSub, KnownSrc;
+ if (SimplifyDemandedBits(Sub, DemandedBits, DemandedSubElts, KnownSub, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Src, DemandedBits, DemandedSrcElts, KnownSrc, TLO,
+ Depth + 1))
+ return true;
+
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
+ if (!!DemandedSubElts)
+ Known = Known.intersectWith(KnownSub);
+ if (!!DemandedSrcElts)
+ Known = Known.intersectWith(KnownSrc);
+
+ // Attempt to avoid multi-use src if we don't need anything from it.
+ if (!DemandedBits.isAllOnes() || !DemandedSubElts.isAllOnes() ||
+ !DemandedSrcElts.isAllOnes()) {
+ SDValue NewSub = SimplifyMultipleUseDemandedBits(
+ Sub, DemandedBits, DemandedSubElts, TLO.DAG, Depth + 1);
+ SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedBits, DemandedSrcElts, TLO.DAG, Depth + 1);
+ if (NewSub || NewSrc) {
+ NewSub = NewSub ? NewSub : Sub;
+ NewSrc = NewSrc ? NewSrc : Src;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc, NewSub,
+ Op.getOperand(2));
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ break;
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ if (VT.isScalableVector())
+ return false;
+ // Offset the demanded elts by the subvector index.
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
+
+ if (SimplifyDemandedBits(Src, DemandedBits, DemandedSrcElts, Known, TLO,
+ Depth + 1))
+ return true;
+
+ // Attempt to avoid multi-use src if we don't need anything from it.
+ if (!DemandedBits.isAllOnes() || !DemandedSrcElts.isAllOnes()) {
+ SDValue DemandedSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedBits, DemandedSrcElts, TLO.DAG, Depth + 1);
+ if (DemandedSrc) {
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, DemandedSrc,
+ Op.getOperand(1));
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ break;
+ }
+ case ISD::CONCAT_VECTORS: {
+ if (VT.isScalableVector())
+ return false;
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
+ EVT SubVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVecs = Op.getNumOperands();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ for (unsigned i = 0; i != NumSubVecs; ++i) {
+ APInt DemandedSubElts =
+ DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+ if (SimplifyDemandedBits(Op.getOperand(i), DemandedBits, DemandedSubElts,
+ Known2, TLO, Depth + 1))
+ return true;
+ // Known bits are shared by every demanded subvector element.
+ if (!!DemandedSubElts)
+ Known = Known.intersectWith(Known2);
+ }
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ assert(!VT.isScalableVector());
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+ // Collect demanded elements from the shuffle operands.
+ APInt DemandedLHS, DemandedRHS;
+ if (!getShuffleDemandedElts(NumElts, ShuffleMask, DemandedElts, DemandedLHS,
+ DemandedRHS))
+ break;
+
+ if (!!DemandedLHS || !!DemandedRHS) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
+ if (!!DemandedLHS) {
+ if (SimplifyDemandedBits(Op0, DemandedBits, DemandedLHS, Known2, TLO,
+ Depth + 1))
+ return true;
+ Known = Known.intersectWith(Known2);
+ }
+ if (!!DemandedRHS) {
+ if (SimplifyDemandedBits(Op1, DemandedBits, DemandedRHS, Known2, TLO,
+ Depth + 1))
+ return true;
+ Known = Known.intersectWith(Known2);
+ }
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, DemandedBits, DemandedLHS, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, DemandedBits, DemandedRHS, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getVectorShuffle(VT, dl, Op0, Op1, ShuffleMask);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ break;
+ }
+ case ISD::AND: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // If the RHS is a constant, check to see if the LHS would be zero without
+ // using the bits from the RHS. Below, we use knowledge about the RHS to
+ // simplify the LHS; here we're using information from the LHS to simplify
+ // the RHS.
+ if (ConstantSDNode *RHSC = isConstOrConstSplat(Op1)) {
+ // Do not increment Depth here; that can cause an infinite loop.
+ KnownBits LHSKnown = TLO.DAG.computeKnownBits(Op0, DemandedElts, Depth);
+ // If the LHS already has zeros where RHSC does, this 'and' is dead.
+ if ((LHSKnown.Zero & DemandedBits) ==
+ (~RHSC->getAPIntValue() & DemandedBits))
+ return TLO.CombineTo(Op, Op0);
+
+ // If any of the set bits in the RHS are known zero on the LHS, shrink
+ // the constant.
+ if (ShrinkDemandedConstant(Op, ~LHSKnown.Zero & DemandedBits,
+ DemandedElts, TLO))
+ return true;
+
+ // Bitwise-not (xor X, -1) is a special case: we don't usually shrink its
+ // constant, but if this 'and' is only clearing bits that were just set by
+ // the xor, then this 'and' can be eliminated by shrinking the mask of
+ // the xor. For example, for a 32-bit X:
+ // and (xor (srl X, 31), -1), 1 --> xor (srl X, 31), 1
+ if (isBitwiseNot(Op0) && Op0.hasOneUse() &&
+ LHSKnown.One == ~RHSC->getAPIntValue()) {
+ SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, VT, Op0.getOperand(0), Op1);
+ return TLO.CombineTo(Op, Xor);
+ }
+ }
+
+ // AND(INSERT_SUBVECTOR(C,X,I),M) -> INSERT_SUBVECTOR(AND(C,M),X,I)
+ // iff 'C' is Undef/Constant and AND(X,M) == X (for DemandedBits).
+ if (Op0.getOpcode() == ISD::INSERT_SUBVECTOR && !VT.isScalableVector() &&
+ (Op0.getOperand(0).isUndef() ||
+ ISD::isBuildVectorOfConstantSDNodes(Op0.getOperand(0).getNode())) &&
+ Op0->hasOneUse()) {
+ unsigned NumSubElts =
+ Op0.getOperand(1).getValueType().getVectorNumElements();
+ unsigned SubIdx = Op0.getConstantOperandVal(2);
+ APInt DemandedSub =
+ APInt::getBitsSet(NumElts, SubIdx, SubIdx + NumSubElts);
+ KnownBits KnownSubMask =
+ TLO.DAG.computeKnownBits(Op1, DemandedSub & DemandedElts, Depth + 1);
+ if (DemandedBits.isSubsetOf(KnownSubMask.One)) {
+ SDValue NewAnd =
+ TLO.DAG.getNode(ISD::AND, dl, VT, Op0.getOperand(0), Op1);
+ SDValue NewInsert =
+ TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, NewAnd,
+ Op0.getOperand(1), Op0.getOperand(2));
+ return TLO.CombineTo(Op, NewInsert);
+ }
+ }
+
+ if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op0, ~Known.Zero & DemandedBits, DemandedElts,
+ Known2, TLO, Depth + 1))
+ return true;
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known one on one side, return the other.
+ // These bits cannot contribute to the result of the 'and'.
+ if (DemandedBits.isSubsetOf(Known2.Zero | Known.One))
+ return TLO.CombineTo(Op, Op0);
+ if (DemandedBits.isSubsetOf(Known.Zero | Known2.One))
+ return TLO.CombineTo(Op, Op1);
+ // If all of the demanded bits in the inputs are known zeros, return zero.
+ if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT));
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(Op, ~Known2.Zero & DemandedBits, DemandedElts,
+ TLO))
+ return true;
+ // If the operation can be done in a smaller type, do so.
+ if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
+ return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
+ Known &= Known2;
+ break;
+ }
+ case ISD::OR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op0, ~Known.One & DemandedBits, DemandedElts,
+ Known2, TLO, Depth + 1))
+ return true;
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'or'.
+ if (DemandedBits.isSubsetOf(Known2.One | Known.Zero))
+ return TLO.CombineTo(Op, Op0);
+ if (DemandedBits.isSubsetOf(Known.One | Known2.Zero))
+ return TLO.CombineTo(Op, Op1);
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
+ return true;
+ // If the operation can be done in a smaller type, do so.
+ if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
+ return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
+ // (or (and X, C1), (and (or X, Y), C2)) -> (or (and X, C1|C2), (and Y, C2))
+ // TODO: Use SimplifyMultipleUseDemandedBits to peek through masks.
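+ // For example, with C1 = 0x0F and C2 = 0xF0 (illustrative constants):
+ // (or (and X, 0x0F), (and (or X, Y), 0xF0))
+ //   -> (or (and X, 0xFF), (and Y, 0xF0))
+ // since ((X | Y) & C2) == (X & C2) | (Y & C2) and the two X masks merge.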
+ if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::AND &&
+ Op0->hasOneUse() && Op1->hasOneUse()) {
+ // Attempt to match all commutations - m_c_Or would've been useful!
+ for (int I = 0; I != 2; ++I) {
+ SDValue X = Op.getOperand(I).getOperand(0);
+ SDValue C1 = Op.getOperand(I).getOperand(1);
+ SDValue Alt = Op.getOperand(1 - I).getOperand(0);
+ SDValue C2 = Op.getOperand(1 - I).getOperand(1);
+ if (Alt.getOpcode() == ISD::OR) {
+ for (int J = 0; J != 2; ++J) {
+ if (X == Alt.getOperand(J)) {
+ SDValue Y = Alt.getOperand(1 - J);
+ if (SDValue C12 = TLO.DAG.FoldConstantArithmetic(ISD::OR, dl, VT,
+ {C1, C2})) {
+ SDValue MaskX = TLO.DAG.getNode(ISD::AND, dl, VT, X, C12);
+ SDValue MaskY = TLO.DAG.getNode(ISD::AND, dl, VT, Y, C2);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::OR, dl, VT, MaskX, MaskY));
+ }
+ }
+ }
+ }
+ }
+ }
+
+ Known |= Known2;
+ break;
+ }
+ case ISD::XOR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ if (SimplifyDemandedBits(Op1, DemandedBits, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'xor'.
+ if (DemandedBits.isSubsetOf(Known.Zero))
+ return TLO.CombineTo(Op, Op0);
+ if (DemandedBits.isSubsetOf(Known2.Zero))
+ return TLO.CombineTo(Op, Op1);
+ // If the operation can be done in a smaller type, do so.
+ if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
+ return true;
+
+ // If all of the unknown bits are known to be zero on one side or the other,
+ // turn this into an *inclusive* or.
+ // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+ if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT, Op0, Op1));
+
+ ConstantSDNode *C = isConstOrConstSplat(Op1, DemandedElts);
+ if (C) {
+ // If one side is a constant, and all of the set bits in the constant are
+ // also known set on the other side, turn this into an AND, as we know
+ // the bits will be cleared.
+ // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+ // NB: it is okay if more bits are known than are requested
+ if (C->getAPIntValue() == Known2.One) {
+ SDValue ANDC =
+ TLO.DAG.getConstant(~C->getAPIntValue() & DemandedBits, dl, VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT, Op0, ANDC));
+ }
+
+ // If the RHS is a constant, see if we can change it. Don't alter a -1
+ // constant because that's a 'not' op, and that is better for combining
+ // and codegen.
+ if (!C->isAllOnes() && DemandedBits.isSubsetOf(C->getAPIntValue())) {
+ // We're flipping all demanded bits. Flip the undemanded bits too.
+ SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
+ return TLO.CombineTo(Op, New);
+ }
+
+ unsigned Op0Opcode = Op0.getOpcode();
+ if ((Op0Opcode == ISD::SRL || Op0Opcode == ISD::SHL) && Op0.hasOneUse()) {
+ if (ConstantSDNode *ShiftC =
+ isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
+ // Don't crash on an oversized shift. We cannot guarantee that a
+ // bogus shift has been simplified to undef.
+ if (ShiftC->getAPIntValue().ult(BitWidth)) {
+ uint64_t ShiftAmt = ShiftC->getZExtValue();
+ APInt Ones = APInt::getAllOnes(BitWidth);
+ Ones = Op0Opcode == ISD::SHL ? Ones.shl(ShiftAmt)
+ : Ones.lshr(ShiftAmt);
+ const TargetLowering &TLI = TLO.DAG.getTargetLoweringInfo();
+ if ((DemandedBits & C->getAPIntValue()) == (DemandedBits & Ones) &&
+ TLI.isDesirableToCommuteXorWithShift(Op.getNode())) {
+ // If the xor constant is a demanded mask, do a 'not' before the
+ // shift:
+ // xor (X << ShiftC), XorC --> (not X) << ShiftC
+ // xor (X >> ShiftC), XorC --> (not X) >> ShiftC
+ SDValue Not = TLO.DAG.getNOT(dl, Op0.getOperand(0), VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Op0Opcode, dl, VT, Not,
+ Op0.getOperand(1)));
+ }
+ }
+ }
+ }
+ }
+
+ // If we can't turn this into a 'not', try to shrink the constant.
+ if (!C || !C->isAllOnes())
+ if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
+ return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedBits.isAllOnes() || !DemandedElts.isAllOnes()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
+ Known ^= Known2;
+ break;
+ }
+ case ISD::SELECT:
+ if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, Known, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, Known2, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
+ return true;
+
+ // Only known if known in both the LHS and RHS.
+ Known = Known.intersectWith(Known2);
+ break;
+ case ISD::VSELECT:
+ if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, DemandedElts,
+ Known, TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedElts,
+ Known2, TLO, Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+
+ // Only known if known in both the LHS and RHS.
+ Known = Known.intersectWith(Known2);
+ break;
+ case ISD::SELECT_CC:
+ if (SimplifyDemandedBits(Op.getOperand(3), DemandedBits, Known, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(2), DemandedBits, Known2, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
+ return true;
+
+ // Only known if known in both the LHS and RHS.
+ Known = Known.intersectWith(Known2);
+ break;
+ case ISD::SETCC: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ // If (1) we only need the sign-bit, (2) the setcc operands are the same
+ // width as the setcc result, and (3) the result of a setcc conforms to 0 or
+ // -1, we may be able to bypass the setcc.
+ if (DemandedBits.isSignMask() &&
+ Op0.getScalarValueSizeInBits() == BitWidth &&
+ getBooleanContents(Op0.getValueType()) ==
+ BooleanContent::ZeroOrNegativeOneBooleanContent) {
+ // If we're testing X < 0, then this compare isn't needed - just use X!
+ // FIXME: We're limiting to integer types here, but this should also work
+ // if we don't care about FP signed-zero. The use of SETLT with FP means
+ // that we don't care about NaNs.
+ if (CC == ISD::SETLT && Op1.getValueType().isInteger() &&
+ (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode())))
+ return TLO.CombineTo(Op, Op0);
+
+ // TODO: Should we check for other forms of sign-bit comparisons?
+ // Examples: X <= -1, X >= 0
+ }
+ if (getBooleanContents(Op0.getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ case ISD::SHL: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ EVT ShiftVT = Op1.getValueType();
+
+ if (const APInt *SA =
+ TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
+ unsigned ShAmt = SA->getZExtValue();
+ if (ShAmt == 0)
+ return TLO.CombineTo(Op, Op0);
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
+ // TODO - support non-uniform vector amounts.
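+ // For example (i32, illustrative): if bits 0-4 of the result are not
+ // demanded, ((X >>u 3) << 5) can become (X << 2); the two only differ in
+ // the undemanded low bits.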
+ if (Op0.getOpcode() == ISD::SRL) {
+ if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) {
+ if (const APInt *SA2 =
+ TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) {
+ unsigned C1 = SA2->getZExtValue();
+ unsigned Opc = ISD::SHL;
+ int Diff = ShAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ Opc = ISD::SRL;
+ }
+ SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
+ }
+ }
+ }
+
+ // Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
+ // are not demanded. This will likely allow the anyext to be folded away.
+ // TODO - support non-uniform vector amounts.
+ if (Op0.getOpcode() == ISD::ANY_EXTEND) {
+ SDValue InnerOp = Op0.getOperand(0);
+ EVT InnerVT = InnerOp.getValueType();
+ unsigned InnerBits = InnerVT.getScalarSizeInBits();
+ if (ShAmt < InnerBits && DemandedBits.getActiveBits() <= InnerBits &&
+ isTypeDesirableForOp(ISD::SHL, InnerVT)) {
+ SDValue NarrowShl = TLO.DAG.getNode(
+ ISD::SHL, dl, InnerVT, InnerOp,
+ TLO.DAG.getShiftAmountConstant(ShAmt, InnerVT, dl));
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
+ }
+
+ // Repeat the SHL optimization above in cases where an extension
+ // intervenes: (shl (anyext (shr x, c1)), c2) to
+ // (shl (anyext x), c2-c1). This requires that the bottom c1 bits
+ // aren't demanded (as above) and that the shifted upper c1 bits of
+ // x aren't demanded.
+ // TODO - support non-uniform vector amounts.
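+ // For example (illustrative widths): with x:i16 extended to i32,
+ // (shl (anyext (srl x, 2)), 5) -> (shl (anyext x), 3), which is valid when
+ // only bits [5,19) of the result are demanded.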
+ if (InnerOp.getOpcode() == ISD::SRL && Op0.hasOneUse() &&
+ InnerOp.hasOneUse()) {
+ if (const APInt *SA2 =
+ TLO.DAG.getValidShiftAmountConstant(InnerOp, DemandedElts)) {
+ unsigned InnerShAmt = SA2->getZExtValue();
+ if (InnerShAmt < ShAmt && InnerShAmt < InnerBits &&
+ DemandedBits.getActiveBits() <=
+ (InnerBits - InnerShAmt + ShAmt) &&
+ DemandedBits.countr_zero() >= ShAmt) {
+ SDValue NewSA =
+ TLO.DAG.getConstant(ShAmt - InnerShAmt, dl, ShiftVT);
+ SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
+ InnerOp.getOperand(0));
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::SHL, dl, VT, NewExt, NewSA));
+ }
+ }
+ }
+ }
+
+ APInt InDemandedMask = DemandedBits.lshr(ShAmt);
+ if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+ // low bits known zero.
+ Known.Zero.setLowBits(ShAmt);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!InDemandedMask.isAllOnes() || !DemandedElts.isAllOnes()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0) {
+ SDValue NewOp = TLO.DAG.getNode(ISD::SHL, dl, VT, DemandedOp0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
+ // Try shrinking the operation as long as the shift amount will still be
+ // in range.
+ if ((ShAmt < DemandedBits.getActiveBits()) &&
+ ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
+ return true;
+ } else {
+ // This is a variable shift, so we can't shift the demand mask by a known
+ // amount. But if we are not demanding high bits, then we are not
+ // demanding those bits from the pre-shifted operand either.
+ if (unsigned CTLZ = DemandedBits.countl_zero()) {
+ APInt DemandedFromOp(APInt::getLowBitsSet(BitWidth, BitWidth - CTLZ));
+ if (SimplifyDemandedBits(Op0, DemandedFromOp, DemandedElts, Known, TLO,
+ Depth + 1)) {
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+ if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
+ // Disable the nsw and nuw flags. We can no longer guarantee that we
+ // won't wrap after simplification.
+ Flags.setNoSignedWrap(false);
+ Flags.setNoUnsignedWrap(false);
+ Op->setFlags(Flags);
+ }
+ return true;
+ }
+ Known.resetAll();
+ }
+ }
+
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
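+ // For example (illustrative): if Op0:i32 has 24 sign bits and we shift
+ // left by at most 4 while demanding only bits 16-31, every demanded bit of
+ // both Op0 and (Op0 << ShAmt) is a copy of the sign bit, so Op0 can be
+ // used directly.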
+ if (const APInt *MaxSA =
+ TLO.DAG.getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
+ unsigned ShAmt = MaxSA->getZExtValue();
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits))
+ return TLO.CombineTo(Op, Op0);
+ }
+ break;
+ }
+ case ISD::SRL: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ EVT ShiftVT = Op1.getValueType();
+
+ // Try to match AVG patterns.
+ if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+ DemandedElts, Depth + 1))
+ return TLO.CombineTo(Op, AVG);
+
+ if (const APInt *SA =
+ TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
+ unsigned ShAmt = SA->getZExtValue();
+ if (ShAmt == 0)
+ return TLO.CombineTo(Op, Op0);
+
+ // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the top bits (which are shifted out)
+ // are never demanded.
+ // TODO - support non-uniform vector amounts.
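+ // For example (i32, illustrative): if the top 5 bits of the result are not
+ // demanded, ((X << 3) >>u 5) can become (X >>u 2).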
+ if (Op0.getOpcode() == ISD::SHL) {
+ if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) {
+ if (const APInt *SA2 =
+ TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) {
+ unsigned C1 = SA2->getZExtValue();
+ unsigned Opc = ISD::SRL;
+ int Diff = ShAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ Opc = ISD::SHL;
+ }
+ SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
+ }
+ }
+ }
+
+ APInt InDemandedMask = (DemandedBits << ShAmt);
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (Op->getFlags().hasExact())
+ InDemandedMask.setLowBits(ShAmt);
+
+ // Narrow shift to lower half - similar to ShrinkDemandedOp.
+ // (srl i64:x, K) -> (i64 zero_extend (srl (i32 (trunc i64:x)), K))
+ if ((BitWidth % 2) == 0 && !VT.isVector() &&
+ ((InDemandedMask.countLeadingZeros() >= (BitWidth / 2)) ||
+ TLO.DAG.MaskedValueIsZero(
+ Op0, APInt::getHighBitsSet(BitWidth, BitWidth / 2)))) {
+ EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2);
+ if (isNarrowingProfitable(VT, HalfVT) &&
+ isTypeDesirableForOp(ISD::SRL, HalfVT) &&
+ isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) &&
+ (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT))) {
+ SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0);
+ SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant(
+ ShAmt, HalfVT, dl, TLO.LegalTypes());
+ SDValue NewShift =
+ TLO.DAG.getNode(ISD::SRL, dl, HalfVT, NewOp, NewShiftAmt);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, NewShift));
+ }
+ }
+
+ // Compute the new bits that are at the top now.
+ if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+ // High bits known zero.
+ Known.Zero.setHighBits(ShAmt);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!InDemandedMask.isAllOnes() || !DemandedElts.isAllOnes()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0) {
+ SDValue NewOp = TLO.DAG.getNode(ISD::SRL, dl, VT, DemandedOp0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ } else {
+ // Use generic knownbits computation as it has support for non-uniform
+ // shift amounts.
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ }
+ break;
+ }
+ case ISD::SRA: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ EVT ShiftVT = Op1.getValueType();
+
+ // If we only want bits that already match the signbit then we don't need
+ // to shift.
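+ // For example (illustrative): if Op0:i32 has 25 sign bits and only bits
+ // 8-31 are demanded, every demanded bit of both Op0 and the shift result
+ // is a copy of the sign bit, so Op0 can be used directly.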
+ unsigned NumHiDemandedBits = BitWidth - DemandedBits.countr_zero();
+ if (TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1) >=
+ NumHiDemandedBits)
+ return TLO.CombineTo(Op, Op0);
+
+ // If this is an arithmetic shift right and only the low bit is demanded, we
+ // always convert this into a logical shr, even if the shift amount is
+ // variable. The low bit of the shift cannot be an input sign bit unless
+ // the shift amount is >= the size of the datatype, which is undefined.
+ if (DemandedBits.isOne())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
+
+ // Try to match AVG patterns.
+ if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+ DemandedElts, Depth + 1))
+ return TLO.CombineTo(Op, AVG);
+
+ if (const APInt *SA =
+ TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
+ unsigned ShAmt = SA->getZExtValue();
+ if (ShAmt == 0)
+ return TLO.CombineTo(Op, Op0);
+
+ APInt InDemandedMask = (DemandedBits << ShAmt);
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (Op->getFlags().hasExact())
+ InDemandedMask.setLowBits(ShAmt);
+
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ if (DemandedBits.countl_zero() < ShAmt)
+ InDemandedMask.setSignBit();
+
+ if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (Known.Zero[BitWidth - ShAmt - 1] ||
+ DemandedBits.countl_zero() >= ShAmt) {
+ SDNodeFlags Flags;
+ Flags.setExact(Op->getFlags().hasExact());
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1, Flags));
+ }
+
+ int Log2 = DemandedBits.exactLogBase2();
+ if (Log2 >= 0) {
+ // The bit must come from the sign.
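+ // For example (i32, illustrative): if only bit 30 of (X >>s 8) is demanded,
+ // that bit is pure sign extension (it would be bit 38 of X), so it equals
+ // the sign bit and (X >>u 1) produces the same demanded bit.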
+ SDValue NewSA = TLO.DAG.getConstant(BitWidth - 1 - Log2, dl, ShiftVT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, NewSA));
+ }
+
+ if (Known.One[BitWidth - ShAmt - 1])
+ // New bits are known one.
+ Known.One.setHighBits(ShAmt);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!InDemandedMask.isAllOnes() || !DemandedElts.isAllOnes()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0) {
+ SDValue NewOp = TLO.DAG.getNode(ISD::SRA, dl, VT, DemandedOp0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ }
+ break;
+ }
+ case ISD::FSHL:
+ case ISD::FSHR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ bool IsFSHL = (Op.getOpcode() == ISD::FSHL);
+
+ if (ConstantSDNode *SA = isConstOrConstSplat(Op2, DemandedElts)) {
+ unsigned Amt = SA->getAPIntValue().urem(BitWidth);
+
+ // For fshl, 0-shift returns the 1st arg.
+ // For fshr, 0-shift returns the 2nd arg.
+ if (Amt == 0) {
+ if (SimplifyDemandedBits(IsFSHL ? Op0 : Op1, DemandedBits, DemandedElts,
+ Known, TLO, Depth + 1))
+ return true;
+ break;
+ }
+
+ // fshl: (Op0 << Amt) | (Op1 >> (BW - Amt))
+ // fshr: (Op0 << (BW - Amt)) | (Op1 >> Amt)
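+ // For example (i8 fshl with Amt = 3, illustrative): the result is
+ // (Op0 << 3) | (Op1 >> 5), so demanding only result bits 0-2 demands no
+ // bits of Op0 and only the top 3 bits of Op1.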
+ APInt Demanded0 = DemandedBits.lshr(IsFSHL ? Amt : (BitWidth - Amt));
+ APInt Demanded1 = DemandedBits << (IsFSHL ? (BitWidth - Amt) : Amt);
+ if (SimplifyDemandedBits(Op0, Demanded0, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op1, Demanded1, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+
+ Known2.One <<= (IsFSHL ? Amt : (BitWidth - Amt));
+ Known2.Zero <<= (IsFSHL ? Amt : (BitWidth - Amt));
+ Known.One.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
+ Known.Zero.lshrInPlace(IsFSHL ? (BitWidth - Amt) : Amt);
+ Known = Known.unionWith(Known2);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!Demanded0.isAllOnes() || !Demanded1.isAllOnes() ||
+ !DemandedElts.isAllOnes()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, Demanded0, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, Demanded1, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ DemandedOp0 = DemandedOp0 ? DemandedOp0 : Op0;
+ DemandedOp1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, DemandedOp0,
+ DemandedOp1, Op2);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ }
+
+ // For pow-2 bitwidths the shift amount is used modulo the bitwidth, so we
+ // only demand the bottom log2(BitWidth) bits of the amount.
+ if (isPowerOf2_32(BitWidth)) {
+ APInt DemandedAmtBits(Op2.getScalarValueSizeInBits(), BitWidth - 1);
+ if (SimplifyDemandedBits(Op2, DemandedAmtBits, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+ }
+ break;
+ }
+ case ISD::ROTL:
+ case ISD::ROTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ bool IsROTL = (Op.getOpcode() == ISD::ROTL);
+
+ // If we're rotating a 0/-1 value, then it stays a 0/-1 value.
+ if (BitWidth == TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1))
+ return TLO.CombineTo(Op, Op0);
+
+ if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
+ unsigned Amt = SA->getAPIntValue().urem(BitWidth);
+ unsigned RevAmt = BitWidth - Amt;
+
+ // rotl: (Op0 << Amt) | (Op0 >> (BW - Amt))
+ // rotr: (Op0 << (BW - Amt)) | (Op0 >> Amt)
+ APInt Demanded0 = DemandedBits.rotr(IsROTL ? Amt : RevAmt);
+ if (SimplifyDemandedBits(Op0, Demanded0, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+
+ // rot*(x, 0) --> x
+ if (Amt == 0)
+ return TLO.CombineTo(Op, Op0);
+
+ // See if we don't demand either half of the rotated bits.
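+ // For example (i8, illustrative): rotl(X, 3) == (X << 3) | (X >>u 5); if
+ // bits 0-2 are not demanded, the wrapped-around half is irrelevant and a
+ // plain (X << 3) suffices.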
+ if ((!TLO.LegalOperations() || isOperationLegal(ISD::SHL, VT)) &&
+ DemandedBits.countr_zero() >= (IsROTL ? Amt : RevAmt)) {
+ Op1 = TLO.DAG.getConstant(IsROTL ? Amt : RevAmt, dl, Op1.getValueType());
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, Op1));
+ }
+ if ((!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT)) &&
+ DemandedBits.countl_zero() >= (IsROTL ? RevAmt : Amt)) {
+ Op1 = TLO.DAG.getConstant(IsROTL ? RevAmt : Amt, dl, Op1.getValueType());
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
+ }
+ }
+
+ // For pow-2 bitwidths the shift amount is used modulo the bitwidth, so we
+ // only demand the bottom log2(BitWidth) bits of the amount.
+ if (isPowerOf2_32(BitWidth)) {
+ APInt DemandedAmtBits(Op1.getScalarValueSizeInBits(), BitWidth - 1);
+ if (SimplifyDemandedBits(Op1, DemandedAmtBits, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+ }
+ break;
+ }
+ case ISD::UMIN: {
+ // Check if one arg is always less than (or equal) to the other arg.
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ KnownBits Known0 = TLO.DAG.computeKnownBits(Op0, DemandedElts, Depth + 1);
+ KnownBits Known1 = TLO.DAG.computeKnownBits(Op1, DemandedElts, Depth + 1);
+ Known = KnownBits::umin(Known0, Known1);
+ if (std::optional<bool> IsULE = KnownBits::ule(Known0, Known1))
+ return TLO.CombineTo(Op, *IsULE ? Op0 : Op1);
+ if (std::optional<bool> IsULT = KnownBits::ult(Known0, Known1))
+ return TLO.CombineTo(Op, *IsULT ? Op0 : Op1);
+ break;
+ }
+ case ISD::UMAX: {
+ // Check if one arg is always greater than (or equal) to the other arg.
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ KnownBits Known0 = TLO.DAG.computeKnownBits(Op0, DemandedElts, Depth + 1);
+ KnownBits Known1 = TLO.DAG.computeKnownBits(Op1, DemandedElts, Depth + 1);
+ Known = KnownBits::umax(Known0, Known1);
+ if (std::optional<bool> IsUGE = KnownBits::uge(Known0, Known1))
+ return TLO.CombineTo(Op, *IsUGE ? Op0 : Op1);
+ if (std::optional<bool> IsUGT = KnownBits::ugt(Known0, Known1))
+ return TLO.CombineTo(Op, *IsUGT ? Op0 : Op1);
+ break;
+ }
+ case ISD::BITREVERSE: {
+ SDValue Src = Op.getOperand(0);
+ APInt DemandedSrcBits = DemandedBits.reverseBits();
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+ Known.One = Known2.One.reverseBits();
+ Known.Zero = Known2.Zero.reverseBits();
+ break;
+ }
+ case ISD::BSWAP: {
+ SDValue Src = Op.getOperand(0);
+
+ // If the only bits demanded come from one byte of the bswap result,
+ // just shift the input byte into position to eliminate the bswap.
+ unsigned NLZ = DemandedBits.countl_zero();
+ unsigned NTZ = DemandedBits.countr_zero();
+
+ // Round NTZ down to the next byte. If we have 11 trailing zeros, then
+ // we need all the bits down to bit 8. Likewise, round NLZ. If we
+ // have 14 leading zeros, round to 8.
+ NLZ = alignDown(NLZ, 8);
+ NTZ = alignDown(NTZ, 8);
+ // If we need exactly one byte, we can do this transformation.
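+ // For example (i32, illustrative): if only bits 8-15 of bswap(X) are
+ // demanded, they come from bits 16-23 of X (NLZ = 16, NTZ = 8), so
+ // (X >>u 8) produces the same demanded byte.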
+ if (BitWidth - NLZ - NTZ == 8) {
+ // Replace this with either a left or right shift to get the byte into
+ // the right place.
+ unsigned ShiftOpcode = NLZ > NTZ ? ISD::SRL : ISD::SHL;
+ if (!TLO.LegalOperations() || isOperationLegal(ShiftOpcode, VT)) {
+ EVT ShiftAmtTy = getShiftAmountTy(VT, DL);
+ unsigned ShiftAmount = NLZ > NTZ ? NLZ - NTZ : NTZ - NLZ;
+ SDValue ShAmt = TLO.DAG.getConstant(ShiftAmount, dl, ShiftAmtTy);
+ SDValue NewOp = TLO.DAG.getNode(ShiftOpcode, dl, VT, Src, ShAmt);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
+ APInt DemandedSrcBits = DemandedBits.byteSwap();
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+ Known.One = Known2.One.byteSwap();
+ Known.Zero = Known2.Zero.byteSwap();
+ break;
+ }
+ case ISD::CTPOP: {
+ // If only 1 bit is demanded, replace with PARITY as long as we're before
+ // op legalization.
+ // FIXME: Limit to scalars for now.
+ if (DemandedBits.isOne() && !TLO.LegalOps && !VT.isVector())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::PARITY, dl, VT,
+ Op.getOperand(0)));
+
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ break;
+ }
+ case ISD::SIGN_EXTEND_INREG: {
+ SDValue Op0 = Op.getOperand(0);
+ EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ unsigned ExVTBits = ExVT.getScalarSizeInBits();
+
+ // If we only care about the highest bit, don't bother shifting right.
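+ // For example (illustrative): for (sext_inreg X, i8) on i32 with only bit
+ // 31 demanded, the result's sign bit is bit 7 of X, so (shl X, 24) yields
+ // the same demanded bit.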
+ if (DemandedBits.isSignMask()) {
+ unsigned MinSignedBits =
+ TLO.DAG.ComputeMaxSignificantBits(Op0, DemandedElts, Depth + 1);
+ bool AlreadySignExtended = ExVTBits >= MinSignedBits;
+ // However if the input is already sign extended we expect the sign
+ // extension to be dropped altogether later and do not simplify.
+ if (!AlreadySignExtended) {
+ // Compute the correct shift amount type, which must be getShiftAmountTy
+ // for scalar types after legalization.
+ SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ExVTBits, dl,
+ getShiftAmountTy(VT, DL));
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, ShiftAmt));
+ }
+ }
+
+ // If none of the extended bits are demanded, eliminate the sextinreg.
+ if (DemandedBits.getActiveBits() <= ExVTBits)
+ return TLO.CombineTo(Op, Op0);
+
+ APInt InputDemandedBits = DemandedBits.getLoBits(ExVTBits);
+
+ // Since the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ InputDemandedBits.setBit(ExVTBits - 1);
+
+ if (SimplifyDemandedBits(Op0, InputDemandedBits, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+
+ // If the input sign bit is known zero, convert this into a zero extension.
+ if (Known.Zero[ExVTBits - 1])
+ return TLO.CombineTo(Op, TLO.DAG.getZeroExtendInReg(Op0, dl, ExVT));
+
+ APInt Mask = APInt::getLowBitsSet(BitWidth, ExVTBits);
+ if (Known.One[ExVTBits - 1]) { // Input sign bit known set
+ Known.One.setBitsFrom(ExVTBits);
+ Known.Zero &= Mask;
+ } else { // Input sign bit unknown
+ Known.Zero &= Mask;
+ Known.One &= Mask;
+ }
+ break;
+ }
+ case ISD::BUILD_PAIR: {
+ EVT HalfVT = Op.getOperand(0).getValueType();
+ unsigned HalfBitWidth = HalfVT.getScalarSizeInBits();
+
+ APInt MaskLo = DemandedBits.getLoBits(HalfBitWidth).trunc(HalfBitWidth);
+ APInt MaskHi = DemandedBits.getHiBits(HalfBitWidth).trunc(HalfBitWidth);
+
+ KnownBits KnownLo, KnownHi;
+
+ if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownLo, TLO, Depth + 1))
+ return true;
+
+ if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownHi, TLO, Depth + 1))
+ return true;
+
+ Known = KnownHi.concat(KnownLo);
+ break;
+ }
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ if (VT.isScalableVector())
+ return false;
+ [[fallthrough]];
+ case ISD::ZERO_EXTEND: {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ unsigned InBits = SrcVT.getScalarSizeInBits();
+ unsigned InElts = SrcVT.isFixedLengthVector() ? SrcVT.getVectorNumElements() : 1;
+ bool IsVecInReg = Op.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG;
+
+ // If none of the top bits are demanded, convert this into an any_extend.
+ if (DemandedBits.getActiveBits() <= InBits) {
+ // If we only need the non-extended bits of the bottom element
+ // then we can just bitcast to the result.
+ if (IsLE && IsVecInReg && DemandedElts == 1 &&
+ VT.getSizeInBits() == SrcVT.getSizeInBits())
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
+
+ unsigned Opc =
+ IsVecInReg ? ISD::ANY_EXTEND_VECTOR_INREG : ISD::ANY_EXTEND;
+ if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
+ }
+
+ APInt InDemandedBits = DemandedBits.trunc(InBits);
+ APInt InDemandedElts = DemandedElts.zext(InElts);
+ if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ assert(Known.getBitWidth() == InBits && "Src width has changed?");
+ Known = Known.zext(BitWidth);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
+ break;
+ }
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ if (VT.isScalableVector())
+ return false;
+ [[fallthrough]];
+ case ISD::SIGN_EXTEND: {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ unsigned InBits = SrcVT.getScalarSizeInBits();
+ unsigned InElts = SrcVT.isFixedLengthVector() ? SrcVT.getVectorNumElements() : 1;
+ bool IsVecInReg = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
+
+ // If none of the top bits are demanded, convert this into an any_extend.
+ if (DemandedBits.getActiveBits() <= InBits) {
+ // If we only need the non-extended bits of the bottom element
+ // then we can just bitcast to the result.
+ if (IsLE && IsVecInReg && DemandedElts == 1 &&
+ VT.getSizeInBits() == SrcVT.getSizeInBits())
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
+
+ unsigned Opc =
+ IsVecInReg ? ISD::ANY_EXTEND_VECTOR_INREG : ISD::ANY_EXTEND;
+ if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
+ }
+
+ APInt InDemandedBits = DemandedBits.trunc(InBits);
+ APInt InDemandedElts = DemandedElts.zext(InElts);
+
+ // Since some of the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ InDemandedBits.setBit(InBits - 1);
+
+ if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ assert(Known.getBitWidth() == InBits && "Src width has changed?");
+
+ // If the sign bit is known one, the top bits match.
+ Known = Known.sext(BitWidth);
+
+ // If the sign bit is known zero, convert this to a zero extend.
+ if (Known.isNonNegative()) {
+ unsigned Opc =
+ IsVecInReg ? ISD::ZERO_EXTEND_VECTOR_INREG : ISD::ZERO_EXTEND;
+ if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
+ }
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
+ break;
+ }
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ if (VT.isScalableVector())
+ return false;
+ [[fallthrough]];
+ case ISD::ANY_EXTEND: {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ unsigned InBits = SrcVT.getScalarSizeInBits();
+ unsigned InElts = SrcVT.isFixedLengthVector() ? SrcVT.getVectorNumElements() : 1;
+ bool IsVecInReg = Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG;
+
+ // If we only need the bottom element then we can just bitcast.
+ // TODO: Handle ANY_EXTEND?
+ if (IsLE && IsVecInReg && DemandedElts == 1 &&
+ VT.getSizeInBits() == SrcVT.getSizeInBits())
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
+
+ APInt InDemandedBits = DemandedBits.trunc(InBits);
+ APInt InDemandedElts = DemandedElts.zext(InElts);
+ if (SimplifyDemandedBits(Src, InDemandedBits, InDemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ assert(Known.getBitWidth() == InBits && "Src width has changed?");
+ Known = Known.anyext(BitWidth);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
+ break;
+ }
+ case ISD::TRUNCATE: {
+ SDValue Src = Op.getOperand(0);
+
+ // Simplify the input, using demanded bit information, and compute the known
+ // zero/one bits live out.
+ unsigned OperandBitWidth = Src.getScalarValueSizeInBits();
+ APInt TruncMask = DemandedBits.zext(OperandBitWidth);
+ if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ Known = Known.trunc(BitWidth);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, TruncMask, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::TRUNCATE, dl, VT, NewSrc));
+
+ // If the input is only used by this truncate, see if we can shrink it based
+ // on the known demanded bits.
+ switch (Src.getOpcode()) {
+ default:
+ break;
+ case ISD::SRL:
+ // Shrink SRL by a constant if none of the high bits shifted in are
+ // demanded.
+ if (TLO.LegalTypes() && !isTypeDesirableForOp(ISD::SRL, VT))
+ // Do not turn (vt1 truncate (vt2 srl)) into (vt1 srl) if vt1 is
+ // undesirable.
+ break;
+
+ if (Src.getNode()->hasOneUse()) {
+ const APInt *ShAmtC =
+ TLO.DAG.getValidShiftAmountConstant(Src, DemandedElts);
+ if (!ShAmtC || ShAmtC->uge(BitWidth))
+ break;
+ uint64_t ShVal = ShAmtC->getZExtValue();
+
+ APInt HighBits =
+ APInt::getHighBitsSet(OperandBitWidth, OperandBitWidth - BitWidth);
+ HighBits.lshrInPlace(ShVal);
+ HighBits = HighBits.trunc(BitWidth);
+
+ if (!(HighBits & DemandedBits)) {
+ // None of the shifted in bits are needed. Add a truncate of the
+ // shift input, then shift it.
+ SDValue NewShAmt = TLO.DAG.getConstant(
+ ShVal, dl, getShiftAmountTy(VT, DL, TLO.LegalTypes()));
+ SDValue NewTrunc =
+ TLO.DAG.getNode(ISD::TRUNCATE, dl, VT, Src.getOperand(0));
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::SRL, dl, VT, NewTrunc, NewShAmt));
+ }
+ }
+ break;
+ }
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
+ case ISD::AssertZext: {
+ // AssertZext demands all of the high bits, plus any of the low bits
+ // demanded by its users.
+ EVT ZVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ APInt InMask = APInt::getLowBitsSet(BitWidth, ZVT.getSizeInBits());
+ if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | DemandedBits, Known,
+ TLO, Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+
+ Known.Zero |= ~InMask;
+ Known.One &= (~Known.Zero);
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ SDValue Src = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ ElementCount SrcEltCnt = Src.getValueType().getVectorElementCount();
+ unsigned EltBitWidth = Src.getScalarValueSizeInBits();
+
+ if (SrcEltCnt.isScalable())
+ return false;
+
+ // Demand the bits from every vector element without a constant index.
+ unsigned NumSrcElts = SrcEltCnt.getFixedValue();
+ APInt DemandedSrcElts = APInt::getAllOnes(NumSrcElts);
+ if (auto *CIdx = dyn_cast<ConstantSDNode>(Idx))
+ if (CIdx->getAPIntValue().ult(NumSrcElts))
+ DemandedSrcElts = APInt::getOneBitSet(NumSrcElts, CIdx->getZExtValue());
+
+ // If BitWidth > EltBitWidth the value is any-extended, so we do not know
+ // anything about the extended bits.
+ APInt DemandedSrcBits = DemandedBits;
+ if (BitWidth > EltBitWidth)
+ DemandedSrcBits = DemandedSrcBits.trunc(EltBitWidth);
+
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts, Known2, TLO,
+ Depth + 1))
+ return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedSrcBits.isAllOnes() || !DemandedSrcElts.isAllOnes()) {
+ if (SDValue DemandedSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedSrcElts, TLO.DAG, Depth + 1)) {
+ SDValue NewOp =
+ TLO.DAG.getNode(Op.getOpcode(), dl, VT, DemandedSrc, Idx);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
+ Known = Known2;
+ if (BitWidth > EltBitWidth)
+ Known = Known.anyext(BitWidth);
+ break;
+ }
+ case ISD::BITCAST: {
+ if (VT.isScalableVector())
+ return false;
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
+
+ // If this is an FP->Int bitcast and if the sign bit is the only
+ // thing demanded, turn this into a FGETSIGN.
+ if (!TLO.LegalOperations() && !VT.isVector() && !SrcVT.isVector() &&
+ DemandedBits == APInt::getSignMask(Op.getValueSizeInBits()) &&
+ SrcVT.isFloatingPoint()) {
+ bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, VT);
+ bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
+ if ((OpVTLegal || i32Legal) && VT.isSimple() && SrcVT != MVT::f16 &&
+ SrcVT != MVT::f128) {
+ // Cannot eliminate/lower SHL for f128 yet.
+ EVT Ty = OpVTLegal ? VT : MVT::i32;
+ // Make a FGETSIGN + SHL to move the sign bit into the appropriate
+ // place. We expect the SHL to be eliminated by other optimizations.
+ SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Src);
+ unsigned OpVTSizeInBits = Op.getValueSizeInBits();
+ if (!OpVTLegal && OpVTSizeInBits > 32)
+ Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Sign);
+ unsigned ShVal = Op.getValueSizeInBits() - 1;
+ SDValue ShAmt = TLO.DAG.getConstant(ShVal, dl, VT);
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(ISD::SHL, dl, VT, Sign, ShAmt));
+ }
+ }
+
+ // Bitcast from a vector using SimplifyDemanded Bits/VectorElts.
+ // Demand the elt/bit if any of the original elts/bits are demanded.
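+ // For example (little-endian, illustrative): for (i64 bitcast (v2i32 X)),
+ // demanding bits 32-47 of the i64 demands bits 0-15 of element 1 of X.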
+ if (SrcVT.isVector() && (BitWidth % NumSrcEltBits) == 0) {
+ unsigned Scale = BitWidth / NumSrcEltBits;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
+ for (unsigned i = 0; i != Scale; ++i) {
+ unsigned EltOffset = IsLE ? i : (Scale - 1 - i);
+ unsigned BitOffset = EltOffset * NumSrcEltBits;
+ APInt Sub = DemandedBits.extractBits(NumSrcEltBits, BitOffset);
+ if (!Sub.isZero()) {
+ DemandedSrcBits |= Sub;
+ for (unsigned j = 0; j != NumElts; ++j)
+ if (DemandedElts[j])
+ DemandedSrcElts.setBit((j * Scale) + i);
+ }
+ }
+
+ APInt KnownSrcUndef, KnownSrcZero;
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
+ KnownSrcZero, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownSrcBits;
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
+ KnownSrcBits, TLO, Depth + 1))
+ return true;
+ } else if (IsLE && (NumSrcEltBits % BitWidth) == 0) {
+ // TODO - bigendian once we have test coverage.
+ unsigned Scale = NumSrcEltBits / BitWidth;
+ unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
+ APInt DemandedSrcBits = APInt::getZero(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getZero(NumSrcElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ unsigned Offset = (i % Scale) * BitWidth;
+ DemandedSrcBits.insertBits(DemandedBits, Offset);
+ DemandedSrcElts.setBit(i / Scale);
+ }
+
+ if (SrcVT.isVector()) {
+ APInt KnownSrcUndef, KnownSrcZero;
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownSrcUndef,
+ KnownSrcZero, TLO, Depth + 1))
+ return true;
+ }
+
+ KnownBits KnownSrcBits;
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedSrcElts,
+ KnownSrcBits, TLO, Depth + 1))
+ return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedSrcBits.isAllOnes() || !DemandedSrcElts.isAllOnes()) {
+ if (SDValue DemandedSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedSrcElts, TLO.DAG, Depth + 1)) {
+ SDValue NewOp = TLO.DAG.getBitcast(VT, DemandedSrc);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ }
+
+ // If this is a bitcast, let computeKnownBits handle it. Only do this on a
+ // recursive call where Known may be useful to the caller.
+ if (Depth > 0) {
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ return false;
+ }
+ break;
+ }
+ case ISD::MUL:
+ if (DemandedBits.isPowerOf2()) {
+ // The LSB of X*Y is set only if (X & 1) == 1 and (Y & 1) == 1.
+ // If we demand exactly one bit N and we have "X * (C' << N)" where C' is
+ // odd (has LSB set), then the left-shifted low bit of X is the answer.
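+ // For example (illustrative): demanding only bit 4 of (X * 48) qualifies,
+ // since 48 == 3 << 4 with 3 odd; bit 4 of the product is bit 0 of X, so
+ // (X << 4) is equivalent for that bit.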
+ unsigned CTZ = DemandedBits.countr_zero();
+ ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1), DemandedElts);
+ if (C && C->getAPIntValue().countr_zero() == CTZ) {
+ EVT ShiftAmtTy = getShiftAmountTy(VT, TLO.DAG.getDataLayout());
+ SDValue AmtC = TLO.DAG.getConstant(CTZ, dl, ShiftAmtTy);
+ SDValue Shl = TLO.DAG.getNode(ISD::SHL, dl, VT, Op.getOperand(0), AmtC);
+ return TLO.CombineTo(Op, Shl);
+ }
+ }
+ // For a squared value "X * X", the bottom 2 bits are known: bit 0 equals
+ // X[0] and bit 1 is always zero, since writing X = 2a + X[0] gives
+ // X * X = 4*(a*a + a*X[0]) + X[0].
+ if (Op.getOperand(0) == Op.getOperand(1) && DemandedBits.ult(4)) {
+ SDValue One = TLO.DAG.getConstant(1, dl, VT);
+ SDValue And1 = TLO.DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), One);
+ return TLO.CombineTo(Op, And1);
+ }
+ [[fallthrough]];
+ case ISD::ADD:
+ case ISD::SUB: {
+ // Add, Sub, and Mul don't demand any bits in positions beyond that
+ // of the highest bit demanded of them.
+ SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+ unsigned DemandedBitsLZ = DemandedBits.countl_zero();
+ APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+ KnownBits KnownOp0, KnownOp1;
+ if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, KnownOp0, TLO,
+ Depth + 1) ||
+ SimplifyDemandedBits(Op1, LoMask, DemandedElts, KnownOp1, TLO,
+ Depth + 1) ||
+ // See if the operation should be performed at a smaller bit width.
+ ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
+ if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
+ // Disable the nsw and nuw flags. We can no longer guarantee that we
+ // won't wrap after simplification.
+ Flags.setNoSignedWrap(false);
+ Flags.setNoUnsignedWrap(false);
+ Op->setFlags(Flags);
+ }
+ return true;
+ }
+
+ // neg x with only low bit demanded is simply x.
+ if (Op.getOpcode() == ISD::SUB && DemandedBits.isOne() &&
+ isa<ConstantSDNode>(Op0) && cast<ConstantSDNode>(Op0)->isZero())
+ return TLO.CombineTo(Op, Op1);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!LoMask.isAllOnes() || !DemandedElts.isAllOnes()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, LoMask, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, LoMask, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Flags.setNoSignedWrap(false);
+ Flags.setNoUnsignedWrap(false);
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp =
+ TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Flags);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
+ // If we have a constant operand, we may be able to turn it into -1 if we
+ // do not demand the high bits. This can make the constant smaller to
+ // encode, allow more general folding, or match specialized instruction
+ // patterns (e.g., 'blsr' on x86). Don't bother changing 1 to -1 because that
+ // is probably not useful (and could be detrimental).
+ ConstantSDNode *C = isConstOrConstSplat(Op1);
+ APInt HighMask = APInt::getHighBitsSet(BitWidth, DemandedBitsLZ);
+ if (C && !C->isAllOnes() && !C->isOne() &&
+ (C->getAPIntValue() | HighMask).isAllOnes()) {
+ SDValue Neg1 = TLO.DAG.getAllOnesConstant(dl, VT);
+ // Disable the nsw and nuw flags. We can no longer guarantee that we
+ // won't wrap after simplification.
+ Flags.setNoSignedWrap(false);
+ Flags.setNoUnsignedWrap(false);
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Neg1, Flags);
+ return TLO.CombineTo(Op, NewOp);
+ }
+
+ // Match a multiply with a disguised negated-power-of-2 and convert to an
+ // equivalent shift-left amount.
+ // Example: (X * MulC) + Op1 --> Op1 - (X << log2(-MulC))
+ auto getShiftLeftAmt = [&HighMask](SDValue Mul) -> unsigned {
+ if (Mul.getOpcode() != ISD::MUL || !Mul.hasOneUse())
+ return 0;
+
+ // Don't touch opaque constants. Also, ignore zero and power-of-2
+ // multiplies. Those will get folded later.
+ ConstantSDNode *MulC = isConstOrConstSplat(Mul.getOperand(1));
+ if (MulC && !MulC->isOpaque() && !MulC->isZero() &&
+ !MulC->getAPIntValue().isPowerOf2()) {
+ APInt UnmaskedC = MulC->getAPIntValue() | HighMask;
+ if (UnmaskedC.isNegatedPowerOf2())
+ return (-UnmaskedC).logBase2();
+ }
+ return 0;
+ };
+
+ auto foldMul = [&](ISD::NodeType NT, SDValue X, SDValue Y, unsigned ShlAmt) {
+ EVT ShiftAmtTy = getShiftAmountTy(VT, TLO.DAG.getDataLayout());
+ SDValue ShlAmtC = TLO.DAG.getConstant(ShlAmt, dl, ShiftAmtTy);
+ SDValue Shl = TLO.DAG.getNode(ISD::SHL, dl, VT, X, ShlAmtC);
+ SDValue Res = TLO.DAG.getNode(NT, dl, VT, Y, Shl);
+ return TLO.CombineTo(Op, Res);
+ };
+
+ if (isOperationLegalOrCustom(ISD::SHL, VT)) {
+ if (Op.getOpcode() == ISD::ADD) {
+ // (X * MulC) + Op1 --> Op1 - (X << log2(-MulC))
+ if (unsigned ShAmt = getShiftLeftAmt(Op0))
+ return foldMul(ISD::SUB, Op0.getOperand(0), Op1, ShAmt);
+ // Op0 + (X * MulC) --> Op0 - (X << log2(-MulC))
+ if (unsigned ShAmt = getShiftLeftAmt(Op1))
+ return foldMul(ISD::SUB, Op1.getOperand(0), Op0, ShAmt);
+ }
+ if (Op.getOpcode() == ISD::SUB) {
+ // Op0 - (X * MulC) --> Op0 + (X << log2(-MulC))
+ if (unsigned ShAmt = getShiftLeftAmt(Op1))
+ return foldMul(ISD::ADD, Op1.getOperand(0), Op0, ShAmt);
+ }
+ }
+
+ if (Op.getOpcode() == ISD::MUL) {
+ Known = KnownBits::mul(KnownOp0, KnownOp1);
+ } else { // Op.getOpcode() is either ISD::ADD or ISD::SUB.
+ Known = KnownBits::computeForAddSub(Op.getOpcode() == ISD::ADD,
+ Flags.hasNoSignedWrap(), KnownOp0,
+ KnownOp1);
+ }
+ break;
+ }
+ default:
+ // We also ask the target about intrinsics (which could be specific to it).
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+ // TODO: Probably okay to remove after audit; here to reduce change size
+ // in initial enablement patch for scalable vectors
+ if (Op.getValueType().isScalableVector())
+ break;
+ if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, DemandedElts,
+ Known, TLO, Depth))
+ return true;
+ break;
+ }
+
+ // Just use computeKnownBits to compute output bits.
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ break;
+ }
+
+ // If we know the value of all of the demanded bits, return this as a
+ // constant.
+ if (!isTargetCanonicalConstantNode(Op) &&
+ DemandedBits.isSubsetOf(Known.Zero | Known.One)) {
+ // Avoid folding to a constant if any OpaqueConstant is involved.
+ const SDNode *N = Op.getNode();
+ for (SDNode *Op :
+ llvm::make_range(SDNodeIterator::begin(N), SDNodeIterator::end(N))) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (C->isOpaque())
+ return false;
+ }
+ if (VT.isInteger())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(Known.One, dl, VT));
+ if (VT.isFloatingPoint())
+ return TLO.CombineTo(
+ Op,
+ TLO.DAG.getConstantFP(
+ APFloat(TLO.DAG.EVTToAPFloatSemantics(VT), Known.One), dl, VT));
+ }
+
+  // A multi-use 'all demanded elts' simplify failed to find any known bits.
+ // Try again just for the original demanded elts.
+ // Ensure we do this AFTER constant folding above.
+ if (HasMultiUse && Known.isUnknown() && !OriginalDemandedElts.isAllOnes())
+ Known = TLO.DAG.computeKnownBits(Op, OriginalDemandedElts, Depth);
+
+ return false;
+}
+
+bool TargetLowering::SimplifyDemandedVectorElts(SDValue Op,
+ const APInt &DemandedElts,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+
+ APInt KnownUndef, KnownZero;
+ bool Simplified =
+ SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, TLO);
+ if (Simplified) {
+ DCI.AddToWorklist(Op.getNode());
+ DCI.CommitTargetLoweringOpt(TLO);
+ }
+
+ return Simplified;
+}
+
+/// Given a vector binary operation and known undefined elements for each input
+/// operand, compute whether each element of the output is undefined.
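+/// For example, a lane whose two inputs are both undef typically
+/// constant-folds to undef, so that output lane is marked as known-undef.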
+static APInt getKnownUndefForVectorBinop(SDValue BO, SelectionDAG &DAG,
+ const APInt &UndefOp0,
+ const APInt &UndefOp1) {
+ EVT VT = BO.getValueType();
+ assert(DAG.getTargetLoweringInfo().isBinOp(BO.getOpcode()) && VT.isVector() &&
+ "Vector binop only");
+
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.isFixedLengthVector() ? VT.getVectorNumElements() : 1;
+ assert(UndefOp0.getBitWidth() == NumElts &&
+ UndefOp1.getBitWidth() == NumElts && "Bad type for undef analysis");
+
+ auto getUndefOrConstantElt = [&](SDValue V, unsigned Index,
+ const APInt &UndefVals) {
+ if (UndefVals[Index])
+ return DAG.getUNDEF(EltVT);
+
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
+ // Try hard to make sure that the getNode() call is not creating temporary
+ // nodes. Ignore opaque integers because they do not constant fold.
+ SDValue Elt = BV->getOperand(Index);
+ auto *C = dyn_cast<ConstantSDNode>(Elt);
+ if (isa<ConstantFPSDNode>(Elt) || Elt.isUndef() || (C && !C->isOpaque()))
+ return Elt;
+ }
+
+ return SDValue();
+ };
+
+ APInt KnownUndef = APInt::getZero(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ // If both inputs for this element are either constant or undef and match
+ // the element type, compute the constant/undef result for this element of
+ // the vector.
+ // TODO: Ideally we would use FoldConstantArithmetic() here, but that does
+ // not handle FP constants. The code within getNode() should be refactored
+ // to avoid the danger of creating a bogus temporary node here.
+ SDValue C0 = getUndefOrConstantElt(BO.getOperand(0), i, UndefOp0);
+ SDValue C1 = getUndefOrConstantElt(BO.getOperand(1), i, UndefOp1);
+ if (C0 && C1 && C0.getValueType() == EltVT && C1.getValueType() == EltVT)
+ if (DAG.getNode(BO.getOpcode(), SDLoc(BO), EltVT, C0, C1).isUndef())
+ KnownUndef.setBit(i);
+ }
+ return KnownUndef;
+}
+
+bool TargetLowering::SimplifyDemandedVectorElts(
+ SDValue Op, const APInt &OriginalDemandedElts, APInt &KnownUndef,
+ APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
+ bool AssumeSingleUse) const {
+ EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ APInt DemandedElts = OriginalDemandedElts;
+ unsigned NumElts = DemandedElts.getBitWidth();
+ assert(VT.isVector() && "Expected vector op");
+
+ KnownUndef = KnownZero = APInt::getZero(NumElts);
+
+ const TargetLowering &TLI = TLO.DAG.getTargetLoweringInfo();
+ if (!TLI.shouldSimplifyDemandedVectorElts(Op, TLO))
+ return false;
+
+ // TODO: For now we assume we know nothing about scalable vectors.
+ if (VT.isScalableVector())
+ return false;
+
+ assert(VT.getVectorNumElements() == NumElts &&
+ "Mask size mismatches value type element count!");
+
+ // Undef operand.
+ if (Op.isUndef()) {
+ KnownUndef.setAllBits();
+ return false;
+ }
+
+ // If Op has other users, assume that all elements are needed.
+ if (!AssumeSingleUse && !Op.getNode()->hasOneUse())
+ DemandedElts.setAllBits();
+
+ // Not demanding any elements from Op.
+ if (DemandedElts == 0) {
+ KnownUndef.setAllBits();
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ }
+
+ // Limit search depth.
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return false;
+
+ SDLoc DL(Op);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ bool IsLE = TLO.DAG.getDataLayout().isLittleEndian();
+
+ // Helper for demanding the specified elements and all the bits of both binary
+ // operands.
+ auto SimplifyDemandedVectorEltsBinOp = [&](SDValue Op0, SDValue Op1) {
+ SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(Op0, DemandedElts,
+ TLO.DAG, Depth + 1);
+ SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedElts,
+ TLO.DAG, Depth + 1);
+ if (NewOp0 || NewOp1) {
+ SDValue NewOp = TLO.DAG.getNode(
+ Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ return false;
+ };
+
+ switch (Opcode) {
+ case ISD::SCALAR_TO_VECTOR: {
+ if (!DemandedElts[0]) {
+ KnownUndef.setAllBits();
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+ }
+ SDValue ScalarSrc = Op.getOperand(0);
+ if (ScalarSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue Src = ScalarSrc.getOperand(0);
+ SDValue Idx = ScalarSrc.getOperand(1);
+ EVT SrcVT = Src.getValueType();
+
+ ElementCount SrcEltCnt = SrcVT.getVectorElementCount();
+
+ if (SrcEltCnt.isScalable())
+ return false;
+
+ unsigned NumSrcElts = SrcEltCnt.getFixedValue();
+ if (isNullConstant(Idx)) {
+ APInt SrcDemandedElts = APInt::getOneBitSet(NumSrcElts, 0);
+ APInt SrcUndef = KnownUndef.zextOrTrunc(NumSrcElts);
+ APInt SrcZero = KnownZero.zextOrTrunc(NumSrcElts);
+ if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+ TLO, Depth + 1))
+ return true;
+ }
+ }
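+    // Only lane 0 of a SCALAR_TO_VECTOR is defined; all other lanes are undef.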
+ KnownUndef.setHighBits(NumElts - 1);
+ break;
+ }
+ case ISD::BITCAST: {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // We only handle vectors here.
+ // TODO - investigate calling SimplifyDemandedBits/ComputeKnownBits?
+ if (!SrcVT.isVector())
+ break;
+
+ // Fast handling of 'identity' bitcasts.
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumSrcElts == NumElts)
+ return SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth + 1);
+
+ APInt SrcDemandedElts, SrcZero, SrcUndef;
+
+    // Bitcast from 'large element' src vector to 'small element' vector: we
+    // must demand a source element if any DemandedElt maps to it.
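+    // e.g. for a v2i64 -> v4i32 bitcast, demanding i32 element 2 or 3 demands
+    // i64 source element 1.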
+ if ((NumElts % NumSrcElts) == 0) {
+ unsigned Scale = NumElts / NumSrcElts;
+ SrcDemandedElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+ TLO, Depth + 1))
+ return true;
+
+ // Try calling SimplifyDemandedBits, converting demanded elts to the bits
+ // of the large element.
+ // TODO - bigendian once we have test coverage.
+ if (IsLE) {
+ unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits();
+ APInt SrcDemandedBits = APInt::getZero(SrcEltSizeInBits);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ unsigned Ofs = (i % Scale) * EltSizeInBits;
+ SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits);
+ }
+
+ KnownBits Known;
+ if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
+
+ // The bitcast has split each wide element into a number of
+ // narrow subelements. We have just computed the Known bits
+ // for wide elements. See if element splitting results in
+ // some subelements being zero. Only for demanded elements!
+ for (unsigned SubElt = 0; SubElt != Scale; ++SubElt) {
+ if (!Known.Zero.extractBits(EltSizeInBits, SubElt * EltSizeInBits)
+ .isAllOnes())
+ continue;
+ for (unsigned SrcElt = 0; SrcElt != NumSrcElts; ++SrcElt) {
+ unsigned Elt = Scale * SrcElt + SubElt;
+ if (DemandedElts[Elt])
+ KnownZero.setBit(Elt);
+ }
+ }
+ }
+
+      // If the src element is zero/undef then the output elements it covers
+      // will be too - only demanded elements are guaranteed to be correct.
+ for (unsigned i = 0; i != NumSrcElts; ++i) {
+ if (SrcDemandedElts[i]) {
+ if (SrcZero[i])
+ KnownZero.setBits(i * Scale, (i + 1) * Scale);
+ if (SrcUndef[i])
+ KnownUndef.setBits(i * Scale, (i + 1) * Scale);
+ }
+ }
+ }
+
+    // Bitcast from 'small element' src vector to 'large element' vector: we
+    // demand all smaller source elements covered by the larger demanded
+    // element of this vector.
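+    // e.g. for a v4i32 -> v2i64 bitcast, demanding i64 element 0 demands i32
+    // source elements 0 and 1.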
+ if ((NumSrcElts % NumElts) == 0) {
+ unsigned Scale = NumSrcElts / NumElts;
+ SrcDemandedElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
+ if (SimplifyDemandedVectorElts(Src, SrcDemandedElts, SrcUndef, SrcZero,
+ TLO, Depth + 1))
+ return true;
+
+ // If all the src elements covering an output element are zero/undef, then
+ // the output element will be as well, assuming it was demanded.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (DemandedElts[i]) {
+ if (SrcZero.extractBits(Scale, i * Scale).isAllOnes())
+ KnownZero.setBit(i);
+ if (SrcUndef.extractBits(Scale, i * Scale).isAllOnes())
+ KnownUndef.setBit(i);
+ }
+ }
+ }
+ break;
+ }
+ case ISD::BUILD_VECTOR: {
+ // Check all elements and simplify any unused elements with UNDEF.
+ if (!DemandedElts.isAllOnes()) {
+ // Don't simplify BROADCASTS.
+ if (llvm::any_of(Op->op_values(),
+ [&](SDValue Elt) { return Op.getOperand(0) != Elt; })) {
+ SmallVector<SDValue, 32> Ops(Op->op_begin(), Op->op_end());
+ bool Updated = false;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i] && !Ops[i].isUndef()) {
+ Ops[i] = TLO.DAG.getUNDEF(Ops[0].getValueType());
+ KnownUndef.setBit(i);
+ Updated = true;
+ }
+ }
+ if (Updated)
+ return TLO.CombineTo(Op, TLO.DAG.getBuildVector(VT, DL, Ops));
+ }
+ }
+ for (unsigned i = 0; i != NumElts; ++i) {
+ SDValue SrcOp = Op.getOperand(i);
+ if (SrcOp.isUndef()) {
+ KnownUndef.setBit(i);
+ } else if (EltSizeInBits == SrcOp.getScalarValueSizeInBits() &&
+ (isNullConstant(SrcOp) || isNullFPConstant(SrcOp))) {
+ KnownZero.setBit(i);
+ }
+ }
+ break;
+ }
+ case ISD::CONCAT_VECTORS: {
+ EVT SubVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVecs = Op.getNumOperands();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ for (unsigned i = 0; i != NumSubVecs; ++i) {
+ SDValue SubOp = Op.getOperand(i);
+ APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+ APInt SubUndef, SubZero;
+ if (SimplifyDemandedVectorElts(SubOp, SubElts, SubUndef, SubZero, TLO,
+ Depth + 1))
+ return true;
+ KnownUndef.insertBits(SubUndef, i * NumSubElts);
+ KnownZero.insertBits(SubZero, i * NumSubElts);
+ }
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedElts.isAllOnes()) {
+ bool FoundNewSub = false;
+ SmallVector<SDValue, 2> DemandedSubOps;
+ for (unsigned i = 0; i != NumSubVecs; ++i) {
+ SDValue SubOp = Op.getOperand(i);
+ APInt SubElts = DemandedElts.extractBits(NumSubElts, i * NumSubElts);
+ SDValue NewSubOp = SimplifyMultipleUseDemandedVectorElts(
+ SubOp, SubElts, TLO.DAG, Depth + 1);
+ DemandedSubOps.push_back(NewSubOp ? NewSubOp : SubOp);
+ FoundNewSub = NewSubOp ? true : FoundNewSub;
+ }
+ if (FoundNewSub) {
+ SDValue NewOp =
+ TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedSubOps);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ break;
+ }
+ case ISD::INSERT_SUBVECTOR: {
+    // Demand any elements from the subvector and the remainder from the src
+    // it is inserted into.
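+    // e.g. inserting a v2i32 subvector at index 2 into a v8i32 source: result
+    // elements [2,3] come from the subvector, the rest from the source.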
+ SDValue Src = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.insertBits(APInt::getZero(NumSubElts), Idx);
+
+ APInt SubUndef, SubZero;
+ if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO,
+ Depth + 1))
+ return true;
+
+ // If none of the src operand elements are demanded, replace it with undef.
+ if (!DemandedSrcElts && !Src.isUndef())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ TLO.DAG.getUNDEF(VT), Sub,
+ Op.getOperand(2)));
+
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownUndef, KnownZero,
+ TLO, Depth + 1))
+ return true;
+ KnownUndef.insertBits(SubUndef, Idx);
+ KnownZero.insertBits(SubZero, Idx);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedSrcElts.isAllOnes() || !DemandedSubElts.isAllOnes()) {
+ SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, DemandedSrcElts, TLO.DAG, Depth + 1);
+ SDValue NewSub = SimplifyMultipleUseDemandedVectorElts(
+ Sub, DemandedSubElts, TLO.DAG, Depth + 1);
+ if (NewSrc || NewSub) {
+ NewSrc = NewSrc ? NewSrc : Src;
+ NewSub = NewSub ? NewSub : Sub;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, NewSrc,
+ NewSub, Op.getOperand(2));
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ break;
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ // Offset the demanded elts by the subvector index.
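+    // e.g. extracting a v4i32 subvector at index 4 from a v8i32 source:
+    // demanding result element 1 demands source element 5.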
+ SDValue Src = Op.getOperand(0);
+ if (Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts).shl(Idx);
+
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ KnownUndef = SrcUndef.extractBits(NumElts, Idx);
+ KnownZero = SrcZero.extractBits(NumElts, Idx);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedElts.isAllOnes()) {
+ SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, DemandedSrcElts, TLO.DAG, Depth + 1);
+ if (NewSrc) {
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, NewSrc,
+ Op.getOperand(1));
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+
+ // For a legal, constant insertion index, if we don't need this insertion
+ // then strip it, else remove it from the demanded elts.
+ if (CIdx && CIdx->getAPIntValue().ult(NumElts)) {
+ unsigned Idx = CIdx->getZExtValue();
+ if (!DemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+
+ APInt DemandedVecElts(DemandedElts);
+ DemandedVecElts.clearBit(Idx);
+ if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+
+ KnownUndef.setBitVal(Idx, Scl.isUndef());
+
+ KnownZero.setBitVal(Idx, isNullConstant(Scl) || isNullFPConstant(Scl));
+ break;
+ }
+
+ APInt VecUndef, VecZero;
+ if (SimplifyDemandedVectorElts(Vec, DemandedElts, VecUndef, VecZero, TLO,
+ Depth + 1))
+ return true;
+ // Without knowing the insertion index we can't set KnownUndef/KnownZero.
+ break;
+ }
+ case ISD::VSELECT: {
+ SDValue Sel = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+
+ // Try to transform the select condition based on the current demanded
+ // elements.
+ APInt UndefSel, UndefZero;
+ if (SimplifyDemandedVectorElts(Sel, DemandedElts, UndefSel, UndefZero, TLO,
+ Depth + 1))
+ return true;
+
+ // See if we can simplify either vselect operand.
+ APInt DemandedLHS(DemandedElts);
+ APInt DemandedRHS(DemandedElts);
+ APInt UndefLHS, ZeroLHS;
+ APInt UndefRHS, ZeroRHS;
+ if (SimplifyDemandedVectorElts(LHS, DemandedLHS, UndefLHS, ZeroLHS, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandedRHS, UndefRHS, ZeroRHS, TLO,
+ Depth + 1))
+ return true;
+
+ KnownUndef = UndefLHS & UndefRHS;
+ KnownZero = ZeroLHS & ZeroRHS;
+
+ // If we know that the selected element is always zero, we don't need the
+ // select value element.
+ APInt DemandedSel = DemandedElts & ~KnownZero;
+ if (DemandedSel != DemandedElts)
+ if (SimplifyDemandedVectorElts(Sel, DemandedSel, UndefSel, UndefZero, TLO,
+ Depth + 1))
+ return true;
+
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+    // Collect demanded elements from shuffle operands.
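+    // Mask values in [0, NumElts) select element M from LHS; values in
+    // [NumElts, 2*NumElts) select element (M - NumElts) from RHS.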
+ APInt DemandedLHS(NumElts, 0);
+ APInt DemandedRHS(NumElts, 0);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (M < 0 || !DemandedElts[i])
+ continue;
+ assert(0 <= M && M < (int)(2 * NumElts) && "Shuffle index out of range");
+ if (M < (int)NumElts)
+ DemandedLHS.setBit(M);
+ else
+ DemandedRHS.setBit(M - NumElts);
+ }
+
+ // See if we can simplify either shuffle operand.
+ APInt UndefLHS, ZeroLHS;
+ APInt UndefRHS, ZeroRHS;
+ if (SimplifyDemandedVectorElts(LHS, DemandedLHS, UndefLHS, ZeroLHS, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandedRHS, UndefRHS, ZeroRHS, TLO,
+ Depth + 1))
+ return true;
+
+ // Simplify mask using undef elements from LHS/RHS.
+ bool Updated = false;
+ bool IdentityLHS = true, IdentityRHS = true;
+ SmallVector<int, 32> NewMask(ShuffleMask);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int &M = NewMask[i];
+ if (M < 0)
+ continue;
+ if (!DemandedElts[i] || (M < (int)NumElts && UndefLHS[M]) ||
+ (M >= (int)NumElts && UndefRHS[M - NumElts])) {
+ Updated = true;
+ M = -1;
+ }
+ IdentityLHS &= (M < 0) || (M == (int)i);
+ IdentityRHS &= (M < 0) || ((M - NumElts) == i);
+ }
+
+    // Update legal shuffle masks based on demanded elements if it won't reduce
+    // to Identity, which can cause premature removal of the shuffle mask.
+ if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps) {
+ SDValue LegalShuffle =
+ buildLegalVectorShuffle(VT, DL, LHS, RHS, NewMask, TLO.DAG);
+ if (LegalShuffle)
+ return TLO.CombineTo(Op, LegalShuffle);
+ }
+
+ // Propagate undef/zero elements from LHS/RHS.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (M < 0) {
+ KnownUndef.setBit(i);
+ } else if (M < (int)NumElts) {
+ if (UndefLHS[M])
+ KnownUndef.setBit(i);
+ if (ZeroLHS[M])
+ KnownZero.setBit(i);
+ } else {
+ if (UndefRHS[M - NumElts])
+ KnownUndef.setBit(i);
+ if (ZeroRHS[M - NumElts])
+ KnownZero.setBit(i);
+ }
+ }
+ break;
+ }
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG: {
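+    // *_EXTEND_VECTOR_INREG only uses the low NumElts elements of the source;
+    // e.g. a v8i16 -> v4i32 sign_extend_vector_inreg reads only source
+    // elements 0-3.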
+ APInt SrcUndef, SrcZero;
+ SDValue Src = Op.getOperand(0);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt DemandedSrcElts = DemandedElts.zext(NumSrcElts);
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ KnownZero = SrcZero.zextOrTrunc(NumElts);
+ KnownUndef = SrcUndef.zextOrTrunc(NumElts);
+
+ if (IsLE && Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG &&
+ Op.getValueSizeInBits() == Src.getValueSizeInBits() &&
+ DemandedSrcElts == 1) {
+ // aext - if we just need the bottom element then we can bitcast.
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Src));
+ }
+
+ if (Op.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) {
+ // zext(undef) upper bits are guaranteed to be zero.
+ if (DemandedElts.isSubsetOf(KnownUndef))
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+ KnownUndef.clearAllBits();
+
+ // zext - if we just need the bottom element then we can mask:
+ // zext(and(x,c)) -> and(x,c') iff the zext is the only user of the and.
+ if (IsLE && DemandedSrcElts == 1 && Src.getOpcode() == ISD::AND &&
+ Op->isOnlyUserOf(Src.getNode()) &&
+ Op.getValueSizeInBits() == Src.getValueSizeInBits()) {
+ SDLoc DL(Op);
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getScalarType();
+ SmallVector<SDValue> MaskElts;
+ MaskElts.push_back(TLO.DAG.getAllOnesConstant(DL, SrcSVT));
+ MaskElts.append(NumSrcElts - 1, TLO.DAG.getConstant(0, DL, SrcSVT));
+ SDValue Mask = TLO.DAG.getBuildVector(SrcVT, DL, MaskElts);
+ if (SDValue Fold = TLO.DAG.FoldConstantArithmetic(
+ ISD::AND, DL, SrcVT, {Src.getOperand(1), Mask})) {
+ Fold = TLO.DAG.getNode(ISD::AND, DL, SrcVT, Src.getOperand(0), Fold);
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Fold));
+ }
+ }
+ }
+ break;
+ }
+
+ // TODO: There are more binop opcodes that could be handled here - MIN,
+ // MAX, saturated math, etc.
+ case ISD::ADD: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0 == Op1 && Op->isOnlyUserOf(Op0.getNode())) {
+ APInt UndefLHS, ZeroLHS;
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
+ Depth + 1, /*AssumeSingleUse*/ true))
+ return true;
+ }
+ [[fallthrough]];
+ }
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SUB:
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ APInt UndefRHS, ZeroRHS;
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO,
+ Depth + 1))
+ return true;
+ APInt UndefLHS, ZeroLHS;
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
+ Depth + 1))
+ return true;
+
+ KnownZero = ZeroLHS & ZeroRHS;
+ KnownUndef = getKnownUndefForVectorBinop(Op, TLO.DAG, UndefLHS, UndefRHS);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ // TODO - use KnownUndef to relax the demandedelts?
+ if (!DemandedElts.isAllOnes())
+ if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+ return true;
+ break;
+ }
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::ROTL:
+ case ISD::ROTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ APInt UndefRHS, ZeroRHS;
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO,
+ Depth + 1))
+ return true;
+ APInt UndefLHS, ZeroLHS;
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
+ Depth + 1))
+ return true;
+
+ KnownZero = ZeroLHS;
+ KnownUndef = UndefLHS & UndefRHS; // TODO: use getKnownUndefForVectorBinop?
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ // TODO - use KnownUndef to relax the demandedelts?
+ if (!DemandedElts.isAllOnes())
+ if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+ return true;
+ break;
+ }
+ case ISD::MUL:
+ case ISD::MULHU:
+ case ISD::MULHS:
+ case ISD::AND: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+    // If we know that a demanded element was zero in Op1 we don't need to
+    // demand it in Op0 - it's guaranteed to be zero.
+ APInt DemandedElts0 = DemandedElts & ~SrcZero;
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts0, KnownUndef, KnownZero,
+ TLO, Depth + 1))
+ return true;
+
+ KnownUndef &= DemandedElts0;
+ KnownZero &= DemandedElts0;
+
+ // If every element pair has a zero/undef then just fold to zero.
+ // fold (and x, undef) -> 0 / (and x, 0) -> 0
+ // fold (mul x, undef) -> 0 / (mul x, 0) -> 0
+ if (DemandedElts.isSubsetOf(SrcZero | KnownZero | SrcUndef | KnownUndef))
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ // If either side has a zero element, then the result element is zero, even
+ // if the other is an UNDEF.
+ // TODO: Extend getKnownUndefForVectorBinop to also deal with known zeros
+ // and then handle 'and' nodes with the rest of the binop opcodes.
+ KnownZero |= SrcZero;
+ KnownUndef &= SrcUndef;
+ KnownUndef &= ~KnownZero;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedElts.isAllOnes())
+ if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+ return true;
+ break;
+ }
+ case ISD::TRUNCATE:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth + 1))
+ return true;
+
+ if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+ // zext(undef) upper bits are guaranteed to be zero.
+ if (DemandedElts.isSubsetOf(KnownUndef))
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+ KnownUndef.clearAllBits();
+ }
+ break;
+ default: {
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
+ if (SimplifyDemandedVectorEltsForTargetNode(Op, DemandedElts, KnownUndef,
+ KnownZero, TLO, Depth))
+ return true;
+ } else {
+ KnownBits Known;
+ APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
+ if (SimplifyDemandedBits(Op, DemandedBits, OriginalDemandedElts, Known,
+ TLO, Depth, AssumeSingleUse))
+ return true;
+ }
+ break;
+ }
+ }
+ assert((KnownUndef & KnownZero) == 0 && "Elements flagged as undef AND zero");
+
+ // Constant fold all undef cases.
+ // TODO: Handle zero cases as well.
+ if (DemandedElts.isSubsetOf(KnownUndef))
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+
+ return false;
+}
+
+/// Determine which of the bits specified in Mask are known to be either zero or
+/// one and return them in the Known.
+void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+ Known.resetAll();
+}
+
+void TargetLowering::computeKnownBitsForTargetInstr(
+ GISelKnownBits &Analysis, Register R, KnownBits &Known,
+ const APInt &DemandedElts, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ Known.resetAll();
+}
+
+void TargetLowering::computeKnownBitsForFrameIndex(
+ const int FrameIdx, KnownBits &Known, const MachineFunction &MF) const {
+ // The low bits are known zero if the pointer is aligned.
+ Known.Zero.setLowBits(Log2(MF.getFrameInfo().getObjectAlign(FrameIdx)));
+}
+
+Align TargetLowering::computeKnownAlignForTargetInstr(
+ GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ return Align(1);
+}
+
+/// This method can be implemented by targets that want to expose additional
+/// information about sign bits to the DAG Combiner.
+unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &,
+ const SelectionDAG &,
+ unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use ComputeNumSignBits if you don't know whether Op"
+ " is a target node!");
+ return 1;
+}
+
+unsigned TargetLowering::computeNumSignBitsForTargetInstr(
+ GISelKnownBits &Analysis, Register R, const APInt &DemandedElts,
+ const MachineRegisterInfo &MRI, unsigned Depth) const {
+ return 1;
+}
+
+bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
+ TargetLoweringOpt &TLO, unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use SimplifyDemandedVectorElts if you don't know whether Op"
+ " is a target node!");
+ return false;
+}
+
+bool TargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use SimplifyDemandedBits if you don't know whether Op"
+ " is a target node!");
+ computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
+ return false;
+}
+
+SDValue TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const {
+ assert(
+ (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use SimplifyMultipleUseDemandedBits if you don't know whether Op"
+ " is a target node!");
+ return SDValue();
+}
+
+SDValue
+TargetLowering::buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0,
+ SDValue N1, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG) const {
+ bool LegalMask = isShuffleMaskLegal(Mask, VT);
+ if (!LegalMask) {
+ std::swap(N0, N1);
+ ShuffleVectorSDNode::commuteMask(Mask);
+ LegalMask = isShuffleMaskLegal(Mask, VT);
+ }
+
+ if (!LegalMask)
+ return SDValue();
+
+ return DAG.getVectorShuffle(VT, DL, N0, N1, Mask);
+}
+
+const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode*) const {
+ return nullptr;
+}
+
+bool TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ bool PoisonOnly, unsigned Depth) const {
+ assert(
+ (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use isGuaranteedNotToBeUndefOrPoison if you don't know whether Op"
+ " is a target node!");
+ return false;
+}
+
+bool TargetLowering::canCreateUndefOrPoisonForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use canCreateUndefOrPoison if you don't know whether Op"
+ " is a target node!");
+ // Be conservative and return true.
+ return true;
+}
+
+bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+ const SelectionDAG &DAG,
+ bool SNaN,
+ unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use isKnownNeverNaN if you don't know whether Op"
+ " is a target node!");
+ return false;
+}
+
+bool TargetLowering::isSplatValueForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ APInt &UndefElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use isSplatValue if you don't know whether Op"
+ " is a target node!");
+ return false;
+}
+
+// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must
+// work with truncating build vectors and vectors with elements of less than
+// 8 bits.
+bool TargetLowering::isConstTrueVal(SDValue N) const {
+ if (!N)
+ return false;
+
+ unsigned EltWidth;
+ APInt CVal;
+ if (ConstantSDNode *CN = isConstOrConstSplat(N, /*AllowUndefs=*/false,
+ /*AllowTruncation=*/true)) {
+ CVal = CN->getAPIntValue();
+ EltWidth = N.getValueType().getScalarSizeInBits();
+ } else
+ return false;
+
+ // If this is a truncating splat, truncate the splat value.
+ // Otherwise, we may fail to match the expected values below.
+ if (EltWidth < CVal.getBitWidth())
+ CVal = CVal.trunc(EltWidth);
+
+ switch (getBooleanContents(N.getValueType())) {
+ case UndefinedBooleanContent:
+ return CVal[0];
+ case ZeroOrOneBooleanContent:
+ return CVal.isOne();
+ case ZeroOrNegativeOneBooleanContent:
+ return CVal.isAllOnes();
+ }
+
+ llvm_unreachable("Invalid boolean contents");
+}
+
+bool TargetLowering::isConstFalseVal(SDValue N) const {
+ if (!N)
+ return false;
+
+ const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+ if (!CN) {
+ const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
+ if (!BV)
+ return false;
+
+    // Only interested in constant splats; we don't care about undef
+    // elements in identifying boolean constants, and getConstantSplatNode
+    // returns NULL if all ops are undef.
+ CN = BV->getConstantSplatNode();
+ if (!CN)
+ return false;
+ }
+
+ if (getBooleanContents(N->getValueType(0)) == UndefinedBooleanContent)
+ return !CN->getAPIntValue()[0];
+
+ return CN->isZero();
+}
+
+bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT,
+ bool SExt) const {
+ if (VT == MVT::i1)
+ return N->isOne();
+
+ TargetLowering::BooleanContent Cnt = getBooleanContents(VT);
+ switch (Cnt) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ // An extended value of 1 is always true, unless its original type is i1,
+ // in which case it will be sign extended to -1.
+ return (N->isOne() && !SExt) || (SExt && (N->getValueType(0) != MVT::i1));
+ case TargetLowering::UndefinedBooleanContent:
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return N->isAllOnes() && SExt;
+ }
+ llvm_unreachable("Unexpected enumeration.");
+}
+
+/// This helper function of SimplifySetCC tries to optimize the comparison when
+/// either operand of the SetCC node is a bitwise-and instruction.
+SDValue TargetLowering::foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, const SDLoc &DL,
+ DAGCombinerInfo &DCI) const {
+ if (N1.getOpcode() == ISD::AND && N0.getOpcode() != ISD::AND)
+ std::swap(N0, N1);
+
+ SelectionDAG &DAG = DCI.DAG;
+ EVT OpVT = N0.getValueType();
+ if (N0.getOpcode() != ISD::AND || !OpVT.isInteger() ||
+ (Cond != ISD::SETEQ && Cond != ISD::SETNE))
+ return SDValue();
+
+ // (X & Y) != 0 --> zextOrTrunc(X & Y)
+ // iff everything but LSB is known zero:
+ if (Cond == ISD::SETNE && isNullConstant(N1) &&
+ (getBooleanContents(OpVT) == TargetLowering::UndefinedBooleanContent ||
+ getBooleanContents(OpVT) == TargetLowering::ZeroOrOneBooleanContent)) {
+ unsigned NumEltBits = OpVT.getScalarSizeInBits();
+ APInt UpperBits = APInt::getHighBitsSet(NumEltBits, NumEltBits - 1);
+ if (DAG.MaskedValueIsZero(N0, UpperBits))
+ return DAG.getBoolExtOrTrunc(N0, DL, VT, OpVT);
+ }
+
+ // Try to eliminate a power-of-2 mask constant by converting to a signbit
+ // test in a narrow type that we can truncate to with no cost. Examples:
+ // (i32 X & 32768) == 0 --> (trunc X to i16) >= 0
+ // (i32 X & 32768) != 0 --> (trunc X to i16) < 0
+ // TODO: This conservatively checks for type legality on the source and
+ // destination types. That may inhibit optimizations, but it also
+ // allows setcc->shift transforms that may be more beneficial.
+ auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (AndC && isNullConstant(N1) && AndC->getAPIntValue().isPowerOf2() &&
+ isTypeLegal(OpVT) && N0.hasOneUse()) {
+ EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(),
+ AndC->getAPIntValue().getActiveBits());
+ if (isTruncateFree(OpVT, NarrowVT) && isTypeLegal(NarrowVT)) {
+ SDValue Trunc = DAG.getZExtOrTrunc(N0.getOperand(0), DL, NarrowVT);
+ SDValue Zero = DAG.getConstant(0, DL, NarrowVT);
+ return DAG.getSetCC(DL, VT, Trunc, Zero,
+ Cond == ISD::SETEQ ? ISD::SETGE : ISD::SETLT);
+ }
+ }
+
+ // Match these patterns in any of their permutations:
+ // (X & Y) == Y
+ // (X & Y) != Y
+ SDValue X, Y;
+ if (N0.getOperand(0) == N1) {
+ X = N0.getOperand(1);
+ Y = N0.getOperand(0);
+ } else if (N0.getOperand(1) == N1) {
+ X = N0.getOperand(0);
+ Y = N0.getOperand(1);
+ } else {
+ return SDValue();
+ }
+
+ SDValue Zero = DAG.getConstant(0, DL, OpVT);
+ if (DAG.isKnownToBeAPowerOfTwo(Y)) {
+ // Simplify X & Y == Y to X & Y != 0 if Y has exactly one bit set.
+ // Note that where Y is variable and is known to have at most one bit set
+ // (for example, if it is Z & 1) we cannot do this; the expressions are not
+ // equivalent when Y == 0.
+ assert(OpVT.isInteger());
+ Cond = ISD::getSetCCInverse(Cond, OpVT);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(Cond, N0.getSimpleValueType()))
+ return DAG.getSetCC(DL, VT, N0, Zero, Cond);
+ } else if (N0.hasOneUse() && hasAndNotCompare(Y)) {
+ // If the target supports an 'and-not' or 'and-complement' logic operation,
+ // try to use that to make a comparison operation more efficient.
+ // But don't do this transform if the mask is a single bit because there are
+ // more efficient ways to deal with that case (for example, 'bt' on x86 or
+ // 'rlwinm' on PPC).
+
+ // Bail out if the compare operand that we want to turn into a zero is
+ // already a zero (otherwise, infinite loop).
+ auto *YConst = dyn_cast<ConstantSDNode>(Y);
+ if (YConst && YConst->isZero())
+ return SDValue();
+
+ // Transform this into: ~X & Y == 0.
+ SDValue NotX = DAG.getNOT(SDLoc(X), X, OpVT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, NotX, Y);
+ return DAG.getSetCC(DL, VT, NewAnd, Zero, Cond);
+ }
+
+ return SDValue();
+}
+
+/// There are multiple IR patterns that could be checking whether a certain
+/// truncation of a signed number would be lossy or not. The pattern that is
+/// best at the IR level may not lower optimally. Thus, we want to unfold it.
+/// We are looking for the following pattern: (KeptBits is a constant)
+/// (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits)
+/// KeptBits won't be bitwidth(x); that would be constant-folded to true/false.
+/// KeptBits also can't be 1; that would have been folded to %x dstcond 0.
+/// We will unfold it into the natural trunc+sext pattern:
+/// ((%x << C) a>> C) dstcond %x
+/// Where C = bitwidth(x) - KeptBits and C u< bitwidth(x)
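+/// e.g. for i16 %x with KeptBits == 8:
+///   (add %x, 128) u< 256  -->  ((%x << 8) a>> 8) == %x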
+SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck(
+ EVT SCCVT, SDValue N0, SDValue N1, ISD::CondCode Cond, DAGCombinerInfo &DCI,
+ const SDLoc &DL) const {
+ // We must be comparing with a constant.
+ ConstantSDNode *C1;
+ if (!(C1 = dyn_cast<ConstantSDNode>(N1)))
+ return SDValue();
+
+ // N0 should be: add %x, (1 << (KeptBits-1))
+ if (N0->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // And we must be 'add'ing a constant.
+ ConstantSDNode *C01;
+ if (!(C01 = dyn_cast<ConstantSDNode>(N0->getOperand(1))))
+ return SDValue();
+
+ SDValue X = N0->getOperand(0);
+ EVT XVT = X.getValueType();
+
+ // Validate constants ...
+
+ APInt I1 = C1->getAPIntValue();
+
+ ISD::CondCode NewCond;
+ if (Cond == ISD::CondCode::SETULT) {
+ NewCond = ISD::CondCode::SETEQ;
+ } else if (Cond == ISD::CondCode::SETULE) {
+ NewCond = ISD::CondCode::SETEQ;
+ // But need to 'canonicalize' the constant.
+ I1 += 1;
+ } else if (Cond == ISD::CondCode::SETUGT) {
+ NewCond = ISD::CondCode::SETNE;
+ // But need to 'canonicalize' the constant.
+ I1 += 1;
+ } else if (Cond == ISD::CondCode::SETUGE) {
+ NewCond = ISD::CondCode::SETNE;
+ } else
+ return SDValue();
+
+ APInt I01 = C01->getAPIntValue();
+
+ auto checkConstants = [&I1, &I01]() -> bool {
+    // Both of them must be powers of two, and the constant from the setcc
+    // must be bigger.
+ return I1.ugt(I01) && I1.isPowerOf2() && I01.isPowerOf2();
+ };
+
+ if (checkConstants()) {
+ // Great, e.g. got icmp ult i16 (add i16 %x, 128), 256
+ } else {
+ // What if we invert constants? (and the target predicate)
+ I1.negate();
+ I01.negate();
+ assert(XVT.isInteger());
+ NewCond = getSetCCInverse(NewCond, XVT);
+ if (!checkConstants())
+ return SDValue();
+ // Great, e.g. got icmp uge i16 (add i16 %x, -128), -256
+ }
+
+ // They are power-of-two, so which bit is set?
+ const unsigned KeptBits = I1.logBase2();
+ const unsigned KeptBitsMinusOne = I01.logBase2();
+
+ // Magic!
+ if (KeptBits != (KeptBitsMinusOne + 1))
+ return SDValue();
+ assert(KeptBits > 0 && KeptBits < XVT.getSizeInBits() && "unreachable");
+
+ // We don't want to do this in every single case.
+ SelectionDAG &DAG = DCI.DAG;
+ if (!DAG.getTargetLoweringInfo().shouldTransformSignedTruncationCheck(
+ XVT, KeptBits))
+ return SDValue();
+
+ const unsigned MaskedBits = XVT.getSizeInBits() - KeptBits;
+ assert(MaskedBits > 0 && MaskedBits < XVT.getSizeInBits() && "unreachable");
+
+ // Unfold into: ((%x << C) a>> C) cond %x
+ // Where 'cond' will be either 'eq' or 'ne'.
+ SDValue ShiftAmt = DAG.getConstant(MaskedBits, DL, XVT);
+ SDValue T0 = DAG.getNode(ISD::SHL, DL, XVT, X, ShiftAmt);
+ SDValue T1 = DAG.getNode(ISD::SRA, DL, XVT, T0, ShiftAmt);
+ SDValue T2 = DAG.getSetCC(DL, SCCVT, T1, X, NewCond);
+
+ return T2;
+}
+
+// (X & (C l>>/<< Y)) ==/!= 0 --> ((X <</l>> Y) & C) ==/!= 0
+SDValue TargetLowering::optimizeSetCCByHoistingAndByConstFromLogicalShift(
+ EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond,
+ DAGCombinerInfo &DCI, const SDLoc &DL) const {
+ assert(isConstOrConstSplat(N1C) && isConstOrConstSplat(N1C)->isZero() &&
+ "Should be a comparison with 0.");
+ assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ "Valid only for [in]equality comparisons.");
+
+ unsigned NewShiftOpcode;
+ SDValue X, C, Y;
+
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Look for '(C l>>/<< Y)'.
+ auto Match = [&NewShiftOpcode, &X, &C, &Y, &TLI, &DAG](SDValue V) {
+ // The shift should be one-use.
+ if (!V.hasOneUse())
+ return false;
+ unsigned OldShiftOpcode = V.getOpcode();
+ switch (OldShiftOpcode) {
+ case ISD::SHL:
+ NewShiftOpcode = ISD::SRL;
+ break;
+ case ISD::SRL:
+ NewShiftOpcode = ISD::SHL;
+ break;
+ default:
+ return false; // must be a logical shift.
+ }
+ // We should be shifting a constant.
+ // FIXME: best to use isConstantOrConstantVector().
+ C = V.getOperand(0);
+ ConstantSDNode *CC =
+ isConstOrConstSplat(C, /*AllowUndefs=*/true, /*AllowTruncation=*/true);
+ if (!CC)
+ return false;
+ Y = V.getOperand(1);
+
+ ConstantSDNode *XC =
+ isConstOrConstSplat(X, /*AllowUndefs=*/true, /*AllowTruncation=*/true);
+ return TLI.shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG);
+ };
+
+  // LHS of comparison should be a one-use 'and'.
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+ return SDValue();
+
+ X = N0.getOperand(0);
+ SDValue Mask = N0.getOperand(1);
+
+ // 'and' is commutative!
+ if (!Match(Mask)) {
+ std::swap(X, Mask);
+ if (!Match(Mask))
+ return SDValue();
+ }
+
+ EVT VT = X.getValueType();
+
+ // Produce:
+ // ((X 'OppositeShiftOpcode' Y) & C) Cond 0
+ SDValue T0 = DAG.getNode(NewShiftOpcode, DL, VT, X, Y);
+ SDValue T1 = DAG.getNode(ISD::AND, DL, VT, T0, C);
+ SDValue T2 = DAG.getSetCC(DL, SCCVT, T1, N1C, Cond);
+ return T2;
+}
+
+/// Try to fold an equality comparison with a {add/sub/xor} binary operation as
+/// the 1st operand (N0). Callers are expected to swap the N0/N1 parameters to
+/// handle the commuted versions of these patterns.
+SDValue TargetLowering::foldSetCCWithBinOp(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, const SDLoc &DL,
+ DAGCombinerInfo &DCI) const {
+ unsigned BOpcode = N0.getOpcode();
+ assert((BOpcode == ISD::ADD || BOpcode == ISD::SUB || BOpcode == ISD::XOR) &&
+ "Unexpected binop");
+ assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) && "Unexpected condcode");
+
+ // (X + Y) == X --> Y == 0
+ // (X - Y) == X --> Y == 0
+ // (X ^ Y) == X --> Y == 0
+ SelectionDAG &DAG = DCI.DAG;
+ EVT OpVT = N0.getValueType();
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+ if (X == N1)
+ return DAG.getSetCC(DL, VT, Y, DAG.getConstant(0, DL, OpVT), Cond);
+
+ if (Y != N1)
+ return SDValue();
+
+ // (X + Y) == Y --> X == 0
+ // (X ^ Y) == Y --> X == 0
+ if (BOpcode == ISD::ADD || BOpcode == ISD::XOR)
+ return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, OpVT), Cond);
+
+ // The shift would not be valid if the operands are boolean (i1).
+ if (!N0.hasOneUse() || OpVT.getScalarSizeInBits() == 1)
+ return SDValue();
+
+ // (X - Y) == Y --> X == Y << 1
+ EVT ShiftVT = getShiftAmountTy(OpVT, DAG.getDataLayout(),
+ !DCI.isBeforeLegalize());
+ SDValue One = DAG.getConstant(1, DL, ShiftVT);
+ SDValue YShl1 = DAG.getNode(ISD::SHL, DL, N1.getValueType(), Y, One);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(YShl1.getNode());
+ return DAG.getSetCC(DL, VT, X, YShl1, Cond);
+}
+
+static SDValue simplifySetCCWithCTPOP(const TargetLowering &TLI, EVT VT,
+ SDValue N0, const APInt &C1,
+ ISD::CondCode Cond, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ // Look through truncs that don't change the value of a ctpop.
+ // FIXME: Add vector support? Need to be careful with setcc result type below.
+ SDValue CTPOP = N0;
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && !VT.isVector() &&
+ N0.getScalarValueSizeInBits() > Log2_32(N0.getOperand(0).getScalarValueSizeInBits()))
+ CTPOP = N0.getOperand(0);
+
+ if (CTPOP.getOpcode() != ISD::CTPOP || !CTPOP.hasOneUse())
+ return SDValue();
+
+ EVT CTVT = CTPOP.getValueType();
+ SDValue CTOp = CTPOP.getOperand(0);
+
+ // Expand a power-of-2-or-zero comparison based on ctpop:
+ // (ctpop x) u< 2 -> (x & x-1) == 0
+ // (ctpop x) u> 1 -> (x & x-1) != 0
+ if (Cond == ISD::SETULT || Cond == ISD::SETUGT) {
+ // Keep the CTPOP if it is a legal vector op.
+ if (CTVT.isVector() && TLI.isOperationLegal(ISD::CTPOP, CTVT))
+ return SDValue();
+
+ unsigned CostLimit = TLI.getCustomCtpopCost(CTVT, Cond);
+ if (C1.ugt(CostLimit + (Cond == ISD::SETULT)))
+ return SDValue();
+ if (C1 == 0 && (Cond == ISD::SETULT))
+ return SDValue(); // This is handled elsewhere.
+
+ unsigned Passes = C1.getLimitedValue() - (Cond == ISD::SETULT);
+
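+    // Repeatedly clear the lowest set bit: after 'Passes' iterations the
+    // result is zero iff ctpop(CTOp) was at most Passes.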
+ SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
+ SDValue Result = CTOp;
+ for (unsigned i = 0; i < Passes; i++) {
+ SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, Result, NegOne);
+ Result = DAG.getNode(ISD::AND, dl, CTVT, Result, Add);
+ }
+ ISD::CondCode CC = Cond == ISD::SETULT ? ISD::SETEQ : ISD::SETNE;
+ return DAG.getSetCC(dl, VT, Result, DAG.getConstant(0, dl, CTVT), CC);
+ }
+
+ // Expand a power-of-2 comparison based on ctpop:
+ // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0)
+ // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0)
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && C1 == 1) {
+ // Keep the CTPOP if it is legal.
+ if (TLI.isOperationLegal(ISD::CTPOP, CTVT))
+ return SDValue();
+
+ SDValue Zero = DAG.getConstant(0, dl, CTVT);
+ SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
+ assert(CTVT.isInteger());
+ ISD::CondCode InvCond = ISD::getSetCCInverse(Cond, CTVT);
+ SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne);
+ SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
+ SDValue RHS = DAG.getSetCC(dl, VT, And, Zero, Cond);
+    // It's not uncommon for known-never-zero X to exist in (ctpop X) eq/ne 1,
+    // so check before emitting a potentially unnecessary op.
+ if (DAG.isKnownNeverZero(CTOp))
+ return RHS;
+ SDValue LHS = DAG.getSetCC(dl, VT, CTOp, Zero, InvCond);
+ unsigned LogicOpcode = Cond == ISD::SETEQ ? ISD::AND : ISD::OR;
+ return DAG.getNode(LogicOpcode, dl, VT, LHS, RHS);
+ }
+
+ return SDValue();
+}
+
+static SDValue foldSetCCWithRotate(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ if (Cond != ISD::SETEQ && Cond != ISD::SETNE)
+ return SDValue();
+
+ auto *C1 = isConstOrConstSplat(N1, /* AllowUndefs */ true);
+ if (!C1 || !(C1->isZero() || C1->isAllOnes()))
+ return SDValue();
+
+ auto getRotateSource = [](SDValue X) {
+ if (X.getOpcode() == ISD::ROTL || X.getOpcode() == ISD::ROTR)
+ return X.getOperand(0);
+ return SDValue();
+ };
+
+ // Peek through a rotated value compared against 0 or -1:
+ // (rot X, Y) == 0/-1 --> X == 0/-1
+ // (rot X, Y) != 0/-1 --> X != 0/-1
+ if (SDValue R = getRotateSource(N0))
+ return DAG.getSetCC(dl, VT, R, N1, Cond);
+
+ // Peek through an 'or' of a rotated value compared against 0:
+ // or (rot X, Y), Z ==/!= 0 --> (or X, Z) ==/!= 0
+ // or Z, (rot X, Y) ==/!= 0 --> (or X, Z) ==/!= 0
+ //
+ // TODO: Add the 'and' with -1 sibling.
+ // TODO: Recurse through a series of 'or' ops to find the rotate.
+ EVT OpVT = N0.getValueType();
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::OR && C1->isZero()) {
+ if (SDValue R = getRotateSource(N0.getOperand(0))) {
+ SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, R, N0.getOperand(1));
+ return DAG.getSetCC(dl, VT, NewOr, N1, Cond);
+ }
+ if (SDValue R = getRotateSource(N0.getOperand(1))) {
+ SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, R, N0.getOperand(0));
+ return DAG.getSetCC(dl, VT, NewOr, N1, Cond);
+ }
+ }
+
+ return SDValue();
+}
+
+static SDValue foldSetCCWithFunnelShift(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ // If we are testing for all-bits-clear, we might be able to do that with
+ // less shifting since bit-order does not matter.
+ if (Cond != ISD::SETEQ && Cond != ISD::SETNE)
+ return SDValue();
+
+ auto *C1 = isConstOrConstSplat(N1, /* AllowUndefs */ true);
+ if (!C1 || !C1->isZero())
+ return SDValue();
+
+ if (!N0.hasOneUse() ||
+ (N0.getOpcode() != ISD::FSHL && N0.getOpcode() != ISD::FSHR))
+ return SDValue();
+
+ unsigned BitWidth = N0.getScalarValueSizeInBits();
+ auto *ShAmtC = isConstOrConstSplat(N0.getOperand(2));
+ if (!ShAmtC || ShAmtC->getAPIntValue().uge(BitWidth))
+ return SDValue();
+
+ // Canonicalize fshr as fshl to reduce pattern-matching.
+ unsigned ShAmt = ShAmtC->getZExtValue();
+ if (N0.getOpcode() == ISD::FSHR)
+ ShAmt = BitWidth - ShAmt;
+
+ // Match an 'or' with a specific operand 'Other' in either commuted variant.
+ SDValue X, Y;
+ auto matchOr = [&X, &Y](SDValue Or, SDValue Other) {
+ if (Or.getOpcode() != ISD::OR || !Or.hasOneUse())
+ return false;
+ if (Or.getOperand(0) == Other) {
+ X = Or.getOperand(0);
+ Y = Or.getOperand(1);
+ return true;
+ }
+ if (Or.getOperand(1) == Other) {
+ X = Or.getOperand(1);
+ Y = Or.getOperand(0);
+ return true;
+ }
+ return false;
+ };
+
+ EVT OpVT = N0.getValueType();
+ EVT ShAmtVT = N0.getOperand(2).getValueType();
+ SDValue F0 = N0.getOperand(0);
+ SDValue F1 = N0.getOperand(1);
+ if (matchOr(F0, F1)) {
+ // fshl (or X, Y), X, C ==/!= 0 --> or (shl Y, C), X ==/!= 0
+ SDValue NewShAmt = DAG.getConstant(ShAmt, dl, ShAmtVT);
+ SDValue Shift = DAG.getNode(ISD::SHL, dl, OpVT, Y, NewShAmt);
+ SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, Shift, X);
+ return DAG.getSetCC(dl, VT, NewOr, N1, Cond);
+ }
+ if (matchOr(F1, F0)) {
+ // fshl X, (or X, Y), C ==/!= 0 --> or (srl Y, BW-C), X ==/!= 0
+ SDValue NewShAmt = DAG.getConstant(BitWidth - ShAmt, dl, ShAmtVT);
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, OpVT, Y, NewShAmt);
+ SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, Shift, X);
+ return DAG.getSetCC(dl, VT, NewOr, N1, Cond);
+ }
+
+ return SDValue();
+}
+
+/// Try to simplify a setcc built with the specified operands and cc. If it is
+/// unable to simplify it, return a null SDValue.
+SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, bool foldBooleans,
+ DAGCombinerInfo &DCI,
+ const SDLoc &dl) const {
+ SelectionDAG &DAG = DCI.DAG;
+ const DataLayout &Layout = DAG.getDataLayout();
+ EVT OpVT = N0.getValueType();
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+
+ // Constant fold or commute setcc.
+ if (SDValue Fold = DAG.FoldSetCC(VT, N0, N1, Cond, dl))
+ return Fold;
+
+ bool N0ConstOrSplat =
+ isConstOrConstSplat(N0, /*AllowUndefs*/ false, /*AllowTruncate*/ true);
+ bool N1ConstOrSplat =
+ isConstOrConstSplat(N1, /*AllowUndefs*/ false, /*AllowTruncate*/ true);
+
+ // Canonicalize toward having the constant on the RHS.
+ // TODO: Handle non-splat vector constants. All undef causes trouble.
+ // FIXME: We can't yet fold constant scalable vector splats, so avoid an
+ // infinite loop here when we encounter one.
+ ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
+ if (N0ConstOrSplat && !N1ConstOrSplat &&
+ (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
+ return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
+
+ // If we have a subtract with the same 2 non-constant operands as this setcc
+ // -- but in reverse order -- then try to commute the operands of this setcc
+ // to match. A matching pair of setcc (cmp) and sub may be combined into 1
+ // instruction on some targets.
+ if (!N0ConstOrSplat && !N1ConstOrSplat &&
+ (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(SwappedCC, N0.getSimpleValueType())) &&
+ DAG.doesNodeExist(ISD::SUB, DAG.getVTList(OpVT), {N1, N0}) &&
+ !DAG.doesNodeExist(ISD::SUB, DAG.getVTList(OpVT), {N0, N1}))
+ return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
+
+ if (SDValue V = foldSetCCWithRotate(VT, N0, N1, Cond, dl, DAG))
+ return V;
+
+ if (SDValue V = foldSetCCWithFunnelShift(VT, N0, N1, Cond, dl, DAG))
+ return V;
+
+ if (auto *N1C = isConstOrConstSplat(N1)) {
+ const APInt &C1 = N1C->getAPIntValue();
+
+ // Optimize some CTPOP cases.
+ if (SDValue V = simplifySetCCWithCTPOP(*this, VT, N0, C1, Cond, dl, DAG))
+ return V;
+
+ // For equality to 0 of a no-wrap multiply, decompose and test each op:
+ // X * Y == 0 --> (X == 0) || (Y == 0)
+ // X * Y != 0 --> (X != 0) && (Y != 0)
+ // TODO: This bails out if minsize is set, but if the target doesn't have a
+ // single instruction multiply for this type, it would likely be
+ // smaller to decompose.
+ if (C1.isZero() && (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getOpcode() == ISD::MUL && N0.hasOneUse() &&
+ (N0->getFlags().hasNoUnsignedWrap() ||
+ N0->getFlags().hasNoSignedWrap()) &&
+ !Attr.hasFnAttr(Attribute::MinSize)) {
+ SDValue IsXZero = DAG.getSetCC(dl, VT, N0.getOperand(0), N1, Cond);
+ SDValue IsYZero = DAG.getSetCC(dl, VT, N0.getOperand(1), N1, Cond);
+ unsigned LogicOp = Cond == ISD::SETEQ ? ISD::OR : ISD::AND;
+ return DAG.getNode(LogicOp, dl, VT, IsXZero, IsYZero);
+ }
+
+ // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
+ // equality comparison, then we're just comparing whether X itself is
+ // zero.
+ if (N0.getOpcode() == ISD::SRL && (C1.isZero() || C1.isOne()) &&
+ N0.getOperand(0).getOpcode() == ISD::CTLZ &&
+ llvm::has_single_bit<uint32_t>(N0.getScalarValueSizeInBits())) {
+ if (ConstantSDNode *ShAmt = isConstOrConstSplat(N0.getOperand(1))) {
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ ShAmt->getAPIntValue() == Log2_32(N0.getScalarValueSizeInBits())) {
+ if ((C1 == 0) == (Cond == ISD::SETEQ)) {
+ // (srl (ctlz x), 5) == 0 -> X != 0
+ // (srl (ctlz x), 5) != 1 -> X != 0
+ Cond = ISD::SETNE;
+ } else {
+ // (srl (ctlz x), 5) != 0 -> X == 0
+ // (srl (ctlz x), 5) == 1 -> X == 0
+ Cond = ISD::SETEQ;
+ }
+ SDValue Zero = DAG.getConstant(0, dl, N0.getValueType());
+ return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0), Zero,
+ Cond);
+ }
+ }
+ }
+ }
+
+ // FIXME: Support vectors.
+ if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+ const APInt &C1 = N1C->getAPIntValue();
+
+ // (zext x) == C --> x == (trunc C)
+ // (sext x) == C --> x == (trunc C)
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ DCI.isBeforeLegalize() && N0->hasOneUse()) {
+ unsigned MinBits = N0.getValueSizeInBits();
+ SDValue PreExt;
+ bool Signed = false;
+ if (N0->getOpcode() == ISD::ZERO_EXTEND) {
+ // ZExt
+ MinBits = N0->getOperand(0).getValueSizeInBits();
+ PreExt = N0->getOperand(0);
+ } else if (N0->getOpcode() == ISD::AND) {
+ // DAGCombine turns costly ZExts into ANDs
+ if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
+ if ((C->getAPIntValue()+1).isPowerOf2()) {
+ MinBits = C->getAPIntValue().countr_one();
+ PreExt = N0->getOperand(0);
+ }
+ } else if (N0->getOpcode() == ISD::SIGN_EXTEND) {
+ // SExt
+ MinBits = N0->getOperand(0).getValueSizeInBits();
+ PreExt = N0->getOperand(0);
+ Signed = true;
+ } else if (auto *LN0 = dyn_cast<LoadSDNode>(N0)) {
+ // ZEXTLOAD / SEXTLOAD
+ if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
+ MinBits = LN0->getMemoryVT().getSizeInBits();
+ PreExt = N0;
+ } else if (LN0->getExtensionType() == ISD::SEXTLOAD) {
+ Signed = true;
+ MinBits = LN0->getMemoryVT().getSizeInBits();
+ PreExt = N0;
+ }
+ }
+
+ // Figure out how many bits we need to preserve this constant.
+ unsigned ReqdBits = Signed ? C1.getSignificantBits() : C1.getActiveBits();
+
+ // Make sure we're not losing bits from the constant.
+ if (MinBits > 0 &&
+ MinBits < C1.getBitWidth() &&
+ MinBits >= ReqdBits) {
+ EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
+ if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
+ // Will get folded away.
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreExt);
+ if (MinBits == 1 && C1 == 1)
+ // Invert the condition.
+ return DAG.getSetCC(dl, VT, Trunc, DAG.getConstant(0, dl, MVT::i1),
+ Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+ SDValue C = DAG.getConstant(C1.trunc(MinBits), dl, MinVT);
+ return DAG.getSetCC(dl, VT, Trunc, C, Cond);
+ }
+
+ // If truncating the setcc operands is not desirable, we can still
+ // simplify the expression in some cases:
+ // setcc ([sz]ext (setcc x, y, cc)), 0, setne) -> setcc (x, y, cc)
+ // setcc ([sz]ext (setcc x, y, cc)), 0, seteq) -> setcc (x, y, inv(cc))
+ // setcc (zext (setcc x, y, cc)), 1, setne) -> setcc (x, y, inv(cc))
+ // setcc (zext (setcc x, y, cc)), 1, seteq) -> setcc (x, y, cc)
+ // setcc (sext (setcc x, y, cc)), -1, setne) -> setcc (x, y, inv(cc))
+ // setcc (sext (setcc x, y, cc)), -1, seteq) -> setcc (x, y, cc)
+ SDValue TopSetCC = N0->getOperand(0);
+ unsigned N0Opc = N0->getOpcode();
+ bool SExt = (N0Opc == ISD::SIGN_EXTEND);
+ if (TopSetCC.getValueType() == MVT::i1 && VT == MVT::i1 &&
+ TopSetCC.getOpcode() == ISD::SETCC &&
+ (N0Opc == ISD::ZERO_EXTEND || N0Opc == ISD::SIGN_EXTEND) &&
+ (isConstFalseVal(N1) ||
+ isExtendedTrueVal(N1C, N0->getValueType(0), SExt))) {
+
+ bool Inverse = (N1C->isZero() && Cond == ISD::SETEQ) ||
+ (!N1C->isZero() && Cond == ISD::SETNE);
+
+ if (!Inverse)
+ return TopSetCC;
+
+ ISD::CondCode InvCond = ISD::getSetCCInverse(
+ cast<CondCodeSDNode>(TopSetCC.getOperand(2))->get(),
+ TopSetCC.getOperand(0).getValueType());
+ return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0),
+ TopSetCC.getOperand(1),
+ InvCond);
+ }
+ }
+ }
+
+ // If the LHS is '(and load, const)', the RHS is 0, the test is for
+ // equality or unsigned, and all 1 bits of the const are in the same
+ // partial word, see if we can shorten the load.
+ if (DCI.isBeforeLegalize() &&
+ !ISD::isSignedIntSetCC(Cond) &&
+ N0.getOpcode() == ISD::AND && C1 == 0 &&
+ N0.getNode()->hasOneUse() &&
+ isa<LoadSDNode>(N0.getOperand(0)) &&
+ N0.getOperand(0).getNode()->hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
+ APInt bestMask;
+ unsigned bestWidth = 0, bestOffset = 0;
+ if (Lod->isSimple() && Lod->isUnindexed()) {
+ unsigned origWidth = N0.getValueSizeInBits();
+ unsigned maskWidth = origWidth;
+        // We can narrow (e.g.) 16-bit extending loads on a 32-bit target to
+        // 8 bits, but we have to be careful...
+ if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
+ origWidth = Lod->getMemoryVT().getSizeInBits();
+ const APInt &Mask = N0.getConstantOperandAPInt(1);
+ for (unsigned width = origWidth / 2; width>=8; width /= 2) {
+ APInt newMask = APInt::getLowBitsSet(maskWidth, width);
+ for (unsigned offset=0; offset<origWidth/width; offset++) {
+ if (Mask.isSubsetOf(newMask)) {
+ if (Layout.isLittleEndian())
+ bestOffset = (uint64_t)offset * (width/8);
+ else
+ bestOffset = (origWidth/width - offset - 1) * (width/8);
+ bestMask = Mask.lshr(offset * (width/8) * 8);
+ bestWidth = width;
+ break;
+ }
+ newMask <<= width;
+ }
+ }
+ }
+ if (bestWidth) {
+ EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
+ if (newVT.isRound() &&
+ shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
+ SDValue Ptr = Lod->getBasePtr();
+ if (bestOffset != 0)
+ Ptr =
+ DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(bestOffset), dl);
+ SDValue NewLoad =
+ DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
+ Lod->getPointerInfo().getWithOffset(bestOffset),
+ Lod->getOriginalAlign());
+ return DAG.getSetCC(dl, VT,
+ DAG.getNode(ISD::AND, dl, newVT, NewLoad,
+ DAG.getConstant(bestMask.trunc(bestWidth),
+ dl, newVT)),
+ DAG.getConstant(0LL, dl, newVT), Cond);
+ }
+ }
+ }
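+
+    // Worked example of the load-narrowing fold above (illustrative
+    // annotation; assumes a little-endian target and an i32 load):
+    //   (and (load i32 %p), 0xFF00) == 0
+    // The width scan tries 16 and then 8 bits; the narrowest match wins with
+    // width = 8, offset = 1 and mask = 0xFF, so (if profitable for the
+    // target) the compare becomes
+    //   (and (load i8 (%p + 1)), 0xFF) == 0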
+
+ // If the LHS is a ZERO_EXTEND, perform the comparison on the input.
+ if (N0.getOpcode() == ISD::ZERO_EXTEND) {
+ unsigned InSize = N0.getOperand(0).getValueSizeInBits();
+
+ // If the comparison constant has bits in the upper part, the
+ // zero-extended value could never match.
+ if (C1.intersects(APInt::getHighBitsSet(C1.getBitWidth(),
+ C1.getBitWidth() - InSize))) {
+ switch (Cond) {
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETEQ:
+ return DAG.getConstant(0, dl, VT);
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETNE:
+ return DAG.getConstant(1, dl, VT);
+ case ISD::SETGT:
+ case ISD::SETGE:
+ // True if the sign bit of C1 is set.
+ return DAG.getConstant(C1.isNegative(), dl, VT);
+ case ISD::SETLT:
+ case ISD::SETLE:
+ // True if the sign bit of C1 isn't set.
+ return DAG.getConstant(C1.isNonNegative(), dl, VT);
+ default:
+ break;
+ }
+ }
+
+ // Otherwise, we can perform the comparison with the low bits.
+ switch (Cond) {
+ case ISD::SETEQ:
+ case ISD::SETNE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETULT:
+ case ISD::SETULE: {
+ EVT newVT = N0.getOperand(0).getValueType();
+ if (DCI.isBeforeLegalizeOps() ||
+ (isOperationLegal(ISD::SETCC, newVT) &&
+ isCondCodeLegal(Cond, newVT.getSimpleVT()))) {
+ EVT NewSetCCVT = getSetCCResultType(Layout, *DAG.getContext(), newVT);
+ SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
+
+ SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0),
+ NewConst, Cond);
+ return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT, N0.getValueType());
+ }
+ break;
+ }
+ default:
+        break; // TODO: Be more careful with signed comparisons.
+ }
+ } else if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ !isSExtCheaperThanZExt(cast<VTSDNode>(N0.getOperand(1))->getVT(),
+ OpVT)) {
+ EVT ExtSrcTy = cast<VTSDNode>(N0.getOperand(1))->getVT();
+ unsigned ExtSrcTyBits = ExtSrcTy.getSizeInBits();
+ EVT ExtDstTy = N0.getValueType();
+ unsigned ExtDstTyBits = ExtDstTy.getSizeInBits();
+
+ // If the constant doesn't fit into the number of bits for the source of
+ // the sign extension, it is impossible for both sides to be equal.
+ if (C1.getSignificantBits() > ExtSrcTyBits)
+ return DAG.getBoolConstant(Cond == ISD::SETNE, dl, VT, OpVT);
+
+ assert(ExtDstTy == N0.getOperand(0).getValueType() &&
+ ExtDstTy != ExtSrcTy && "Unexpected types!");
+ APInt Imm = APInt::getLowBitsSet(ExtDstTyBits, ExtSrcTyBits);
+ SDValue ZextOp = DAG.getNode(ISD::AND, dl, ExtDstTy, N0.getOperand(0),
+ DAG.getConstant(Imm, dl, ExtDstTy));
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(ZextOp.getNode());
+ // Otherwise, make this a use of a zext.
+ return DAG.getSetCC(dl, VT, ZextOp,
+ DAG.getConstant(C1 & Imm, dl, ExtDstTy), Cond);
+ } else if ((N1C->isZero() || N1C->isOne()) &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+ // SETCC (SETCC), [0|1], [EQ|NE] -> SETCC
+ if (N0.getOpcode() == ISD::SETCC &&
+ isTypeLegal(VT) && VT.bitsLE(N0.getValueType()) &&
+ (N0.getValueType() == MVT::i1 ||
+ getBooleanContents(N0.getOperand(0).getValueType()) ==
+ ZeroOrOneBooleanContent)) {
+ bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (!N1C->isOne());
+ if (TrueWhenTrue)
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+ // Invert the condition.
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC, N0.getOperand(0).getValueType());
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+ }
+
+ if ((N0.getOpcode() == ISD::XOR ||
+ (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::XOR &&
+ N0.getOperand(1) == N0.getOperand(0).getOperand(1))) &&
+ isOneConstant(N0.getOperand(1))) {
+ // If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We
+ // can only do this if the top bits are known zero.
+ unsigned BitWidth = N0.getValueSizeInBits();
+ if (DAG.MaskedValueIsZero(N0,
+ APInt::getHighBitsSet(BitWidth,
+ BitWidth-1))) {
+ // Okay, get the un-inverted input value.
+ SDValue Val;
+ if (N0.getOpcode() == ISD::XOR) {
+ Val = N0.getOperand(0);
+ } else {
+ assert(N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::XOR);
+ // ((X^1)&1)^1 -> X & 1
+ Val = DAG.getNode(ISD::AND, dl, N0.getValueType(),
+ N0.getOperand(0).getOperand(0),
+ N0.getOperand(1));
+ }
+
+ return DAG.getSetCC(dl, VT, Val, N1,
+ Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+ }
+ } else if (N1C->isOne()) {
+ SDValue Op0 = N0;
+ if (Op0.getOpcode() == ISD::TRUNCATE)
+ Op0 = Op0.getOperand(0);
+
+ if ((Op0.getOpcode() == ISD::XOR) &&
+ Op0.getOperand(0).getOpcode() == ISD::SETCC &&
+ Op0.getOperand(1).getOpcode() == ISD::SETCC) {
+ SDValue XorLHS = Op0.getOperand(0);
+ SDValue XorRHS = Op0.getOperand(1);
+ // Ensure that the input setccs return an i1 type or 0/1 value.
+ if (Op0.getValueType() == MVT::i1 ||
+ (getBooleanContents(XorLHS.getOperand(0).getValueType()) ==
+ ZeroOrOneBooleanContent &&
+ getBooleanContents(XorRHS.getOperand(0).getValueType()) ==
+ ZeroOrOneBooleanContent)) {
+ // (xor (setcc), (setcc)) == / != 1 -> (setcc) != / == (setcc)
+ Cond = (Cond == ISD::SETEQ) ? ISD::SETNE : ISD::SETEQ;
+ return DAG.getSetCC(dl, VT, XorLHS, XorRHS, Cond);
+ }
+ }
+ if (Op0.getOpcode() == ISD::AND && isOneConstant(Op0.getOperand(1))) {
+ // If this is (X&1) == / != 1, normalize it to (X&1) != / == 0.
+ if (Op0.getValueType().bitsGT(VT))
+ Op0 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::TRUNCATE, dl, VT, Op0.getOperand(0)),
+ DAG.getConstant(1, dl, VT));
+ else if (Op0.getValueType().bitsLT(VT))
+ Op0 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op0.getOperand(0)),
+ DAG.getConstant(1, dl, VT));
+
+ return DAG.getSetCC(dl, VT, Op0,
+ DAG.getConstant(0, dl, Op0.getValueType()),
+ Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+ }
+ if (Op0.getOpcode() == ISD::AssertZext &&
+ cast<VTSDNode>(Op0.getOperand(1))->getVT() == MVT::i1)
+ return DAG.getSetCC(dl, VT, Op0,
+ DAG.getConstant(0, dl, Op0.getValueType()),
+ Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+ }
+ }
+
+ // Given:
+ // icmp eq/ne (urem %x, %y), 0
+ // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
+ // icmp eq/ne %x, 0
+ if (N0.getOpcode() == ISD::UREM && N1C->isZero() &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+ KnownBits XKnown = DAG.computeKnownBits(N0.getOperand(0));
+ KnownBits YKnown = DAG.computeKnownBits(N0.getOperand(1));
+ if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2)
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N1, Cond);
+ }
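+
+    // Illustrative example for the urem fold above (annotation only): if %x
+    // is known to have at most one bit set (zero or a power of two) and
+    // %y = 12 has two bits set, then no power of two is a nonzero multiple
+    // of 12, so (urem %x, 12) == 0 holds exactly when %x == 0.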
+
+ // Fold set_cc seteq (ashr X, BW-1), -1 -> set_cc setlt X, 0
+ // and set_cc setne (ashr X, BW-1), -1 -> set_cc setge X, 0
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getOpcode() == ISD::SRA && isa<ConstantSDNode>(N0.getOperand(1)) &&
+ N0.getConstantOperandAPInt(1) == OpVT.getScalarSizeInBits() - 1 &&
+ N1C && N1C->isAllOnes()) {
+ return DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(0, dl, OpVT),
+ Cond == ISD::SETEQ ? ISD::SETLT : ISD::SETGE);
+ }
+
+ if (SDValue V =
+ optimizeSetCCOfSignedTruncationCheck(VT, N0, N1, Cond, DCI, dl))
+ return V;
+ }
+
+ // These simplifications apply to splat vectors as well.
+ // TODO: Handle more splat vector cases.
+ if (auto *N1C = isConstOrConstSplat(N1)) {
+ const APInt &C1 = N1C->getAPIntValue();
+
+ APInt MinVal, MaxVal;
+ unsigned OperandBitSize = N1C->getValueType(0).getScalarSizeInBits();
+ if (ISD::isSignedIntSetCC(Cond)) {
+ MinVal = APInt::getSignedMinValue(OperandBitSize);
+ MaxVal = APInt::getSignedMaxValue(OperandBitSize);
+ } else {
+ MinVal = APInt::getMinValue(OperandBitSize);
+ MaxVal = APInt::getMaxValue(OperandBitSize);
+ }
+
+ // Canonicalize GE/LE comparisons to use GT/LT comparisons.
+ if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
+ // X >= MIN --> true
+ if (C1 == MinVal)
+ return DAG.getBoolConstant(true, dl, VT, OpVT);
+
+ if (!VT.isVector()) { // TODO: Support this for vectors.
+ // X >= C0 --> X > (C0 - 1)
+ APInt C = C1 - 1;
+ ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT;
+ if ((DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
+ (!N1C->isOpaque() || (C.getBitWidth() <= 64 &&
+ isLegalICmpImmediate(C.getSExtValue())))) {
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(C, dl, N1.getValueType()),
+ NewCC);
+ }
+ }
+ }
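+
+    // Examples of the canonicalization above (illustrative annotation):
+    //   setge  X, 5 --> setgt  X, 4
+    //   setuge X, 1 --> setugt X, 0
+    // while X >= MIN folds directly to true.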
+
+ if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
+ // X <= MAX --> true
+ if (C1 == MaxVal)
+ return DAG.getBoolConstant(true, dl, VT, OpVT);
+
+ // X <= C0 --> X < (C0 + 1)
+ if (!VT.isVector()) { // TODO: Support this for vectors.
+ APInt C = C1 + 1;
+ ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT;
+ if ((DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
+ (!N1C->isOpaque() || (C.getBitWidth() <= 64 &&
+ isLegalICmpImmediate(C.getSExtValue())))) {
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(C, dl, N1.getValueType()),
+ NewCC);
+ }
+ }
+ }
+
+ if (Cond == ISD::SETLT || Cond == ISD::SETULT) {
+ if (C1 == MinVal)
+ return DAG.getBoolConstant(false, dl, VT, OpVT); // X < MIN --> false
+
+ // TODO: Support this for vectors after legalize ops.
+ if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
+ // Canonicalize setlt X, Max --> setne X, Max
+ if (C1 == MaxVal)
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+
+ // If we have setult X, 1, turn it into seteq X, 0
+ if (C1 == MinVal+1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(MinVal, dl, N0.getValueType()),
+ ISD::SETEQ);
+ }
+ }
+
+ if (Cond == ISD::SETGT || Cond == ISD::SETUGT) {
+ if (C1 == MaxVal)
+ return DAG.getBoolConstant(false, dl, VT, OpVT); // X > MAX --> false
+
+ // TODO: Support this for vectors after legalize ops.
+ if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
+ // Canonicalize setgt X, Min --> setne X, Min
+ if (C1 == MinVal)
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+
+ // If we have setugt X, Max-1, turn it into seteq X, Max
+ if (C1 == MaxVal-1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(MaxVal, dl, N0.getValueType()),
+ ISD::SETEQ);
+ }
+ }
+
+ if (Cond == ISD::SETEQ || Cond == ISD::SETNE) {
+ // (X & (C l>>/<< Y)) ==/!= 0 --> ((X <</l>> Y) & C) ==/!= 0
+ if (C1.isZero())
+ if (SDValue CC = optimizeSetCCByHoistingAndByConstFromLogicalShift(
+ VT, N0, N1, Cond, DCI, dl))
+ return CC;
+
+ // For all/any comparisons, replace or(x,shl(y,bw/2)) with and/or(x,y).
+ // For example, when high 32-bits of i64 X are known clear:
+ // all bits clear: (X | (Y<<32)) == 0 --> (X | Y) == 0
+ // all bits set: (X | (Y<<32)) == -1 --> (X & Y) == -1
+ bool CmpZero = N1C->isZero();
+ bool CmpNegOne = N1C->isAllOnes();
+ if ((CmpZero || CmpNegOne) && N0.hasOneUse()) {
+ // Match or(lo,shl(hi,bw/2)) pattern.
+ auto IsConcat = [&](SDValue V, SDValue &Lo, SDValue &Hi) {
+ unsigned EltBits = V.getScalarValueSizeInBits();
+ if (V.getOpcode() != ISD::OR || (EltBits % 2) != 0)
+ return false;
+ SDValue LHS = V.getOperand(0);
+ SDValue RHS = V.getOperand(1);
+ APInt HiBits = APInt::getHighBitsSet(EltBits, EltBits / 2);
+          // Unshifted element must have zero upper bits.
+ if (RHS.getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(RHS.getOperand(1)) &&
+ RHS.getConstantOperandAPInt(1) == (EltBits / 2) &&
+ DAG.MaskedValueIsZero(LHS, HiBits)) {
+ Lo = LHS;
+ Hi = RHS.getOperand(0);
+ return true;
+ }
+ if (LHS.getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ LHS.getConstantOperandAPInt(1) == (EltBits / 2) &&
+ DAG.MaskedValueIsZero(RHS, HiBits)) {
+ Lo = RHS;
+ Hi = LHS.getOperand(0);
+ return true;
+ }
+ return false;
+ };
+
+ auto MergeConcat = [&](SDValue Lo, SDValue Hi) {
+ unsigned EltBits = N0.getScalarValueSizeInBits();
+ unsigned HalfBits = EltBits / 2;
+ APInt HiBits = APInt::getHighBitsSet(EltBits, HalfBits);
+ SDValue LoBits = DAG.getConstant(~HiBits, dl, OpVT);
+ SDValue HiMask = DAG.getNode(ISD::AND, dl, OpVT, Hi, LoBits);
+ SDValue NewN0 =
+ DAG.getNode(CmpZero ? ISD::OR : ISD::AND, dl, OpVT, Lo, HiMask);
+ SDValue NewN1 = CmpZero ? DAG.getConstant(0, dl, OpVT) : LoBits;
+ return DAG.getSetCC(dl, VT, NewN0, NewN1, Cond);
+ };
+
+ SDValue Lo, Hi;
+ if (IsConcat(N0, Lo, Hi))
+ return MergeConcat(Lo, Hi);
+
+ if (N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR) {
+ SDValue Lo0, Lo1, Hi0, Hi1;
+ if (IsConcat(N0.getOperand(0), Lo0, Hi0) &&
+ IsConcat(N0.getOperand(1), Lo1, Hi1)) {
+ return MergeConcat(DAG.getNode(N0.getOpcode(), dl, OpVT, Lo0, Lo1),
+ DAG.getNode(N0.getOpcode(), dl, OpVT, Hi0, Hi1));
+ }
+ }
+ }
+ }
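+
+    // Example of the concat fold above (illustrative annotation): for an i64
+    // value N0 = (or X, (shl Y, 32)) where the high 32 bits of X are known
+    // zero,
+    //   N0 ==  0 --> (or  X, (and Y, 0xffffffff)) == 0
+    //   N0 == -1 --> (and X, (and Y, 0xffffffff)) == 0xffffffff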
+
+ // If we have "setcc X, C0", check to see if we can shrink the immediate
+ // by changing cc.
+ // TODO: Support this for vectors after legalize ops.
+ if (!VT.isVector() || DCI.isBeforeLegalizeOps()) {
+ // SETUGT X, SINTMAX -> SETLT X, 0
+ // SETUGE X, SINTMIN -> SETLT X, 0
+ if ((Cond == ISD::SETUGT && C1.isMaxSignedValue()) ||
+ (Cond == ISD::SETUGE && C1.isMinSignedValue()))
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(0, dl, N1.getValueType()),
+ ISD::SETLT);
+
+ // SETULT X, SINTMIN -> SETGT X, -1
+ // SETULE X, SINTMAX -> SETGT X, -1
+ if ((Cond == ISD::SETULT && C1.isMinSignedValue()) ||
+ (Cond == ISD::SETULE && C1.isMaxSignedValue()))
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getAllOnesConstant(dl, N1.getValueType()),
+ ISD::SETGT);
+ }
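+
+    // Example (illustrative annotation, i32 operands): SETUGT X, 0x7fffffff
+    // asks whether the sign bit of X is set and becomes SETLT X, 0, while
+    // SETULT X, 0x80000000 becomes SETGT X, -1; both replace a large
+    // unsigned immediate with 0 or -1.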
+ }
+
+ // Back to non-vector simplifications.
+ // TODO: Can we do these for vector splats?
+ if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const APInt &C1 = N1C->getAPIntValue();
+ EVT ShValTy = N0.getValueType();
+
+ // Fold bit comparisons when we can. This will result in an
+ // incorrect value when boolean false is negative one, unless
+ // the bitsize is 1 in which case the false value is the same
+ // in practice regardless of the representation.
+ if ((VT.getSizeInBits() == 1 ||
+ getBooleanContents(N0.getValueType()) == ZeroOrOneBooleanContent) &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ (VT == ShValTy || (isTypeLegal(VT) && VT.bitsLE(ShValTy))) &&
+ N0.getOpcode() == ISD::AND) {
+ if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ EVT ShiftTy =
+ getShiftAmountTy(ShValTy, Layout, !DCI.isBeforeLegalize());
+ if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3
+ // Perform the xform if the AND RHS is a single bit.
+ unsigned ShCt = AndRHS->getAPIntValue().logBase2();
+ if (AndRHS->getAPIntValue().isPowerOf2() &&
+ !TLI.shouldAvoidTransformToShift(ShValTy, ShCt)) {
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SRL, dl, ShValTy, N0,
+ DAG.getConstant(ShCt, dl, ShiftTy)));
+ }
+ } else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) {
+ // (X & 8) == 8 --> (X & 8) >> 3
+ // Perform the xform if C1 is a single bit.
+ unsigned ShCt = C1.logBase2();
+ if (C1.isPowerOf2() &&
+ !TLI.shouldAvoidTransformToShift(ShValTy, ShCt)) {
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SRL, dl, ShValTy, N0,
+ DAG.getConstant(ShCt, dl, ShiftTy)));
+ }
+ }
+ }
+ }
+
+ if (C1.getSignificantBits() <= 64 &&
+ !isLegalICmpImmediate(C1.getSExtValue())) {
+ EVT ShiftTy = getShiftAmountTy(ShValTy, Layout, !DCI.isBeforeLegalize());
+ // (X & -256) == 256 -> (X >> 8) == 1
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+ if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ const APInt &AndRHSC = AndRHS->getAPIntValue();
+ if (AndRHSC.isNegatedPowerOf2() && (AndRHSC & C1) == C1) {
+ unsigned ShiftBits = AndRHSC.countr_zero();
+ if (!TLI.shouldAvoidTransformToShift(ShValTy, ShiftBits)) {
+ SDValue Shift =
+ DAG.getNode(ISD::SRL, dl, ShValTy, N0.getOperand(0),
+ DAG.getConstant(ShiftBits, dl, ShiftTy));
+ SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), dl, ShValTy);
+ return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
+ }
+ }
+ }
+ } else if (Cond == ISD::SETULT || Cond == ISD::SETUGE ||
+ Cond == ISD::SETULE || Cond == ISD::SETUGT) {
+ bool AdjOne = (Cond == ISD::SETULE || Cond == ISD::SETUGT);
+ // X < 0x100000000 -> (X >> 32) < 1
+ // X >= 0x100000000 -> (X >> 32) >= 1
+ // X <= 0x0ffffffff -> (X >> 32) < 1
+ // X > 0x0ffffffff -> (X >> 32) >= 1
+ unsigned ShiftBits;
+ APInt NewC = C1;
+ ISD::CondCode NewCond = Cond;
+ if (AdjOne) {
+ ShiftBits = C1.countr_one();
+ NewC = NewC + 1;
+ NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+ } else {
+ ShiftBits = C1.countr_zero();
+ }
+ NewC.lshrInPlace(ShiftBits);
+ if (ShiftBits && NewC.getSignificantBits() <= 64 &&
+ isLegalICmpImmediate(NewC.getSExtValue()) &&
+ !TLI.shouldAvoidTransformToShift(ShValTy, ShiftBits)) {
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, ShValTy, N0,
+ DAG.getConstant(ShiftBits, dl, ShiftTy));
+ SDValue CmpRHS = DAG.getConstant(NewC, dl, ShValTy);
+ return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond);
+ }
+ }
+ }
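+
+    // Worked example of the immediate-shrinking folds above (illustrative
+    // annotation): when 256 is not a legal compare immediate,
+    //   (X & 0xffffff00) == 256 --> (X >> 8) == 1
+    // and an i64 unsigned range check such as X u< 0x100000000 becomes
+    //   (X >> 32) u< 1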
+ }
+
+ if (!isa<ConstantFPSDNode>(N0) && isa<ConstantFPSDNode>(N1)) {
+ auto *CFP = cast<ConstantFPSDNode>(N1);
+ assert(!CFP->getValueAPF().isNaN() && "Unexpected NaN value");
+
+ // Otherwise, we know the RHS is not a NaN. Simplify the node to drop the
+ // constant if knowing that the operand is non-nan is enough. We prefer to
+ // have SETO(x,x) instead of SETO(x, 0.0) because this avoids having to
+ // materialize 0.0.
+ if (Cond == ISD::SETO || Cond == ISD::SETUO)
+ return DAG.getSetCC(dl, VT, N0, N0, Cond);
+
+ // setcc (fneg x), C -> setcc swap(pred) x, -C
+ if (N0.getOpcode() == ISD::FNEG) {
+ ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(SwapCond, N0.getSimpleValueType())) {
+ SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1);
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond);
+ }
+ }
+
+ // If the condition is not legal, see if we can find an equivalent one
+ // which is legal.
+ if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) {
+ // If the comparison was an awkward floating-point == or != and one of
+ // the comparison operands is infinity or negative infinity, convert the
+ // condition to a less-awkward <= or >=.
+ if (CFP->getValueAPF().isInfinity()) {
+ bool IsNegInf = CFP->getValueAPF().isNegative();
+ ISD::CondCode NewCond = ISD::SETCC_INVALID;
+ switch (Cond) {
+ case ISD::SETOEQ: NewCond = IsNegInf ? ISD::SETOLE : ISD::SETOGE; break;
+ case ISD::SETUEQ: NewCond = IsNegInf ? ISD::SETULE : ISD::SETUGE; break;
+ case ISD::SETUNE: NewCond = IsNegInf ? ISD::SETUGT : ISD::SETULT; break;
+ case ISD::SETONE: NewCond = IsNegInf ? ISD::SETOGT : ISD::SETOLT; break;
+ default: break;
+ }
+ if (NewCond != ISD::SETCC_INVALID &&
+ isCondCodeLegal(NewCond, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, NewCond);
+ }
+ }
+ }
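+
+  // Illustrative example for the infinity rewrite above (annotation only):
+  // when SETOEQ is awkward for the target, X == +inf can be tested as
+  // X >= +inf (SETOGE) and X == -inf as X <= -inf (SETOLE), since no ordered
+  // value lies beyond the infinities.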
+
+ if (N0 == N1) {
+ // The sext(setcc()) => setcc() optimization relies on the appropriate
+ // constant being emitted.
+ assert(!N0.getValueType().isInteger() &&
+ "Integer types should be handled by FoldSetCC");
+
+ bool EqTrue = ISD::isTrueWhenEqual(Cond);
+ unsigned UOF = ISD::getUnorderedFlavor(Cond);
+ if (UOF == 2) // FP operators that are undefined on NaNs.
+ return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
+ if (UOF == unsigned(EqTrue))
+ return DAG.getBoolConstant(EqTrue, dl, VT, OpVT);
+ // Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO
+ // if it is not already.
+ ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
+ if (NewCond != Cond &&
+ (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(NewCond, N0.getSimpleValueType())))
+ return DAG.getSetCC(dl, VT, N0, N1, NewCond);
+ }
+
+ // ~X > ~Y --> Y > X
+ // ~X < ~Y --> Y < X
+ // ~X < C --> X > ~C
+ // ~X > C --> X < ~C
+ if ((isSignedIntSetCC(Cond) || isUnsignedIntSetCC(Cond)) &&
+ N0.getValueType().isInteger()) {
+ if (isBitwiseNot(N0)) {
+ if (isBitwiseNot(N1))
+ return DAG.getSetCC(dl, VT, N1.getOperand(0), N0.getOperand(0), Cond);
+
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+ SDValue Not = DAG.getNOT(dl, N1, OpVT);
+ return DAG.getSetCC(dl, VT, Not, N0.getOperand(0), Cond);
+ }
+ }
+ }
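+
+  // Example for the bitwise-not folds above (illustrative annotation):
+  // ~X == -X - 1 reverses both the signed and the unsigned order, so e.g.
+  // (~X <s ~Y) is equivalent to (Y <s X) and (~X <u C) to (X >u ~C).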
+
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getValueType().isInteger()) {
+ if (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB ||
+ N0.getOpcode() == ISD::XOR) {
+ // Simplify (X+Y) == (X+Z) --> Y == Z
+ if (N0.getOpcode() == N1.getOpcode()) {
+ if (N0.getOperand(0) == N1.getOperand(0))
+ return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond);
+ if (N0.getOperand(1) == N1.getOperand(1))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond);
+ if (isCommutativeBinOp(N0.getOpcode())) {
+ // If X op Y == Y op X, try other combinations.
+ if (N0.getOperand(0) == N1.getOperand(1))
+ return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0),
+ Cond);
+ if (N0.getOperand(1) == N1.getOperand(0))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(1),
+ Cond);
+ }
+ }
+
+ // If RHS is a legal immediate value for a compare instruction, we need
+ // to be careful about increasing register pressure needlessly.
+ bool LegalRHSImm = false;
+
+ if (auto *RHSC = dyn_cast<ConstantSDNode>(N1)) {
+ if (auto *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ // Turn (X+C1) == C2 --> X == C2-C1
+ if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse())
+ return DAG.getSetCC(
+ dl, VT, N0.getOperand(0),
+ DAG.getConstant(RHSC->getAPIntValue() - LHSR->getAPIntValue(),
+ dl, N0.getValueType()),
+ Cond);
+
+ // Turn (X^C1) == C2 --> X == C1^C2
+ if (N0.getOpcode() == ISD::XOR && N0.getNode()->hasOneUse())
+ return DAG.getSetCC(
+ dl, VT, N0.getOperand(0),
+ DAG.getConstant(LHSR->getAPIntValue() ^ RHSC->getAPIntValue(),
+ dl, N0.getValueType()),
+ Cond);
+ }
+
+ // Turn (C1-X) == C2 --> X == C1-C2
+ if (auto *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0)))
+ if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse())
+ return DAG.getSetCC(
+ dl, VT, N0.getOperand(1),
+ DAG.getConstant(SUBC->getAPIntValue() - RHSC->getAPIntValue(),
+ dl, N0.getValueType()),
+ Cond);
+
+ // Could RHSC fold directly into a compare?
+ if (RHSC->getValueType(0).getSizeInBits() <= 64)
+ LegalRHSImm = isLegalICmpImmediate(RHSC->getSExtValue());
+ }
+
+ // (X+Y) == X --> Y == 0 and similar folds.
+ // Don't do this if X is an immediate that can fold into a cmp
+ // instruction and X+Y has other uses. It could be an induction variable
+ // chain, and the transform would increase register pressure.
+ if (!LegalRHSImm || N0.hasOneUse())
+ if (SDValue V = foldSetCCWithBinOp(VT, N0, N1, Cond, dl, DCI))
+ return V;
+ }
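+
+    // Examples of the constant re-association above (illustrative
+    // annotation):
+    //   (X + 3) == 10 --> X == 7
+    //   (X ^ 5) == 12 --> X == 9     (5 ^ 12 == 9)
+    //   (7 - X) ==  2 --> X == 5     (7 - 2 == 5)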
+
+ if (N1.getOpcode() == ISD::ADD || N1.getOpcode() == ISD::SUB ||
+ N1.getOpcode() == ISD::XOR)
+ if (SDValue V = foldSetCCWithBinOp(VT, N1, N0, Cond, dl, DCI))
+ return V;
+
+ if (SDValue V = foldSetCCWithAnd(VT, N0, N1, Cond, dl, DCI))
+ return V;
+ }
+
+ // Fold remainder of division by a constant.
+ if ((N0.getOpcode() == ISD::UREM || N0.getOpcode() == ISD::SREM) &&
+ N0.hasOneUse() && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+ // When division is cheap or optimizing for minimum size,
+ // fall through to DIVREM creation by skipping this fold.
+ if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttr(Attribute::MinSize)) {
+ if (N0.getOpcode() == ISD::UREM) {
+ if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
+ return Folded;
+ } else if (N0.getOpcode() == ISD::SREM) {
+ if (SDValue Folded = buildSREMEqFold(VT, N0, N1, Cond, DCI, dl))
+ return Folded;
+ }
+ }
+ }
+
+ // Fold away ALL boolean setcc's.
+ if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) {
+ SDValue Temp;
+ switch (Cond) {
+ default: llvm_unreachable("Unknown integer setcc!");
+ case ISD::SETEQ: // X == Y -> ~(X^Y)
+ Temp = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1);
+ N0 = DAG.getNOT(dl, Temp, OpVT);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETNE: // X != Y --> (X^Y)
+ N0 = DAG.getNode(ISD::XOR, dl, OpVT, N0, N1);
+ break;
+ case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y
+ case ISD::SETULT: // X <u Y --> X == 0 & Y == 1 --> ~X & Y
+ Temp = DAG.getNOT(dl, N0, OpVT);
+ N0 = DAG.getNode(ISD::AND, dl, OpVT, N1, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETLT: // X <s Y --> X == 1 & Y == 0 --> ~Y & X
+ case ISD::SETUGT: // X >u Y --> X == 1 & Y == 0 --> ~Y & X
+ Temp = DAG.getNOT(dl, N1, OpVT);
+ N0 = DAG.getNode(ISD::AND, dl, OpVT, N0, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETULE: // X <=u Y --> X == 0 | Y == 1 --> ~X | Y
+ case ISD::SETGE: // X >=s Y --> X == 0 | Y == 1 --> ~X | Y
+ Temp = DAG.getNOT(dl, N0, OpVT);
+ N0 = DAG.getNode(ISD::OR, dl, OpVT, N1, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETUGE: // X >=u Y --> X == 1 | Y == 0 --> ~Y | X
+ case ISD::SETLE: // X <=s Y --> X == 1 | Y == 0 --> ~Y | X
+ Temp = DAG.getNOT(dl, N1, OpVT);
+ N0 = DAG.getNode(ISD::OR, dl, OpVT, N0, Temp);
+ break;
+ }
+ if (VT.getScalarType() != MVT::i1) {
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(N0.getNode());
+ // FIXME: If running after legalize, we probably can't do this.
+ ISD::NodeType ExtendCode = getExtendForContent(getBooleanContents(OpVT));
+ N0 = DAG.getNode(ExtendCode, dl, VT, N0);
+ }
+ return N0;
+ }
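+
+  // Example of the boolean expansion above (illustrative annotation): for i1
+  // operands, X <u Y is satisfied only by X == 0, Y == 1, which is exactly
+  // the value of (~X & Y).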
+
+ // Could not fold it.
+ return SDValue();
+}
+
+/// Returns true (and the GlobalValue and the offset) if the node is a
+/// GlobalAddress + offset.
+bool TargetLowering::isGAPlusOffset(SDNode *WN, const GlobalValue *&GA,
+ int64_t &Offset) const {
+
+ SDNode *N = unwrapAddress(SDValue(WN, 0)).getNode();
+
+ if (auto *GASD = dyn_cast<GlobalAddressSDNode>(N)) {
+ GA = GASD->getGlobal();
+ Offset += GASD->getOffset();
+ return true;
+ }
+
+ if (N->getOpcode() == ISD::ADD) {
+ SDValue N1 = N->getOperand(0);
+ SDValue N2 = N->getOperand(1);
+ if (isGAPlusOffset(N1.getNode(), GA, Offset)) {
+ if (auto *V = dyn_cast<ConstantSDNode>(N2)) {
+ Offset += V->getSExtValue();
+ return true;
+ }
+ } else if (isGAPlusOffset(N2.getNode(), GA, Offset)) {
+ if (auto *V = dyn_cast<ConstantSDNode>(N1)) {
+ Offset += V->getSExtValue();
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
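+
+// Illustrative note for isGAPlusOffset (annotation, not part of the code
+// above): for a node such as
+//   (add (add (GlobalAddress @g, 8), (Constant 16)), (Constant 4))
+// the recursion walks through the ISD::ADD operands and reports GA == @g
+// with Offset increased by 8 + 16 + 4.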
+
+SDValue TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ // Default implementation: no optimization.
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembler Implementation Methods
+//===----------------------------------------------------------------------===//
+
+TargetLowering::ConstraintType
+TargetLowering::getConstraintType(StringRef Constraint) const {
+ unsigned S = Constraint.size();
+
+ if (S == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'r':
+ return C_RegisterClass;
+ case 'm': // memory
+ case 'o': // offsetable
+ case 'V': // not offsetable
+ return C_Memory;
+ case 'p': // Address.
+ return C_Address;
+ case 'n': // Simple Integer
+ case 'E': // Floating Point Constant
+ case 'F': // Floating Point Constant
+ return C_Immediate;
+ case 'i': // Simple Integer or Relocatable Constant
+ case 's': // Relocatable Constant
+ case 'X': // Allow ANY value.
+ case 'I': // Target registers.
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ case '<':
+ case '>':
+ return C_Other;
+ }
+ }
+
+ if (S > 1 && Constraint[0] == '{' && Constraint[S - 1] == '}') {
+ if (S == 8 && Constraint.substr(1, 6) == "memory") // "{memory}"
+ return C_Memory;
+ return C_Register;
+ }
+ return C_Unknown;
+}
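+
+// Examples (illustrative annotation): getConstraintType("r") yields
+// C_RegisterClass, "m" yields C_Memory, "i" yields C_Other, "{eax}" yields
+// C_Register, and "{memory}" is special-cased to C_Memory.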
+
+/// Try to replace an X constraint, which matches anything, with another that
+/// has more specific requirements based on the type of the corresponding
+/// operand.
+const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
+ if (ConstraintVT.isInteger())
+ return "r";
+ if (ConstraintVT.isFloatingPoint())
+ return "f"; // works for many targets
+ return nullptr;
+}
+
+SDValue TargetLowering::LowerAsmOutputForConstraint(
+ SDValue &Chain, SDValue &Glue, const SDLoc &DL,
+ const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
+ return SDValue();
+}
+
+/// Lower the specified operand into the Ops vector.
+/// If it is invalid, don't add anything to Ops.
+void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+
+ if (Constraint.length() > 1) return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default: break;
+ case 'X': // Allows any operand
+ case 'i': // Simple Integer or Relocatable Constant
+ case 'n': // Simple Integer
+ case 's': { // Relocatable Constant
+
+ ConstantSDNode *C;
+ uint64_t Offset = 0;
+
+    // Match (GA) or (C) or (GA+C) or (GA-C) or ((GA+C)+C) or (((GA+C)+C)+C),
+    // etc., since getelementptr is variadic. We can't use
+    // SelectionDAG::FoldSymbolOffset because it expects the GA to be
+    // accessible, while in this case the GA may be furthest from the root
+    // node, which is likely an ISD::ADD.
+ while (true) {
+ if ((C = dyn_cast<ConstantSDNode>(Op)) && ConstraintLetter != 's') {
+ // gcc prints these as sign extended. Sign extend value to 64 bits
+ // now; without this it would get ZExt'd later in
+ // ScheduleDAGSDNodes::EmitNode, which is very generic.
+ bool IsBool = C->getConstantIntValue()->getBitWidth() == 1;
+ BooleanContent BCont = getBooleanContents(MVT::i64);
+ ISD::NodeType ExtOpc =
+ IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND;
+ int64_t ExtVal =
+ ExtOpc == ISD::ZERO_EXTEND ? C->getZExtValue() : C->getSExtValue();
+ Ops.push_back(
+ DAG.getTargetConstant(Offset + ExtVal, SDLoc(C), MVT::i64));
+ return;
+ }
+ if (ConstraintLetter != 'n') {
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
+ GA->getValueType(0),
+ Offset + GA->getOffset()));
+ return;
+ }
+ if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
+ Ops.push_back(DAG.getTargetBlockAddress(
+ BA->getBlockAddress(), BA->getValueType(0),
+ Offset + BA->getOffset(), BA->getTargetFlags()));
+ return;
+ }
+ if (isa<BasicBlockSDNode>(Op)) {
+ Ops.push_back(Op);
+ return;
+ }
+ }
+ const unsigned OpCode = Op.getOpcode();
+ if (OpCode == ISD::ADD || OpCode == ISD::SUB) {
+ if ((C = dyn_cast<ConstantSDNode>(Op.getOperand(0))))
+ Op = Op.getOperand(1);
+ // Subtraction is not commutative.
+ else if (OpCode == ISD::ADD &&
+ (C = dyn_cast<ConstantSDNode>(Op.getOperand(1))))
+ Op = Op.getOperand(0);
+ else
+ return;
+ Offset += (OpCode == ISD::ADD ? 1 : -1) * C->getSExtValue();
+ continue;
+ }
+ return;
+ }
+ break;
+ }
+ }
+}
+
+void TargetLowering::CollectTargetIntrinsicOperands(
+ const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
+ StringRef Constraint,
+ MVT VT) const {
+ if (Constraint.empty() || Constraint[0] != '{')
+ return std::make_pair(0u, static_cast<TargetRegisterClass *>(nullptr));
+ assert(*(Constraint.end() - 1) == '}' && "Not a brace enclosed constraint?");
+
+ // Remove the braces from around the name.
+ StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
+
+ std::pair<unsigned, const TargetRegisterClass *> R =
+ std::make_pair(0u, static_cast<const TargetRegisterClass *>(nullptr));
+
+ // Figure out which register class contains this reg.
+ for (const TargetRegisterClass *RC : RI->regclasses()) {
+ // If none of the value types for this register class are valid, we
+ // can't use it. For example, 64-bit reg classes on 32-bit targets.
+ if (!isLegalRC(*RI, *RC))
+ continue;
+
+ for (const MCPhysReg &PR : *RC) {
+ if (RegName.equals_insensitive(RI->getRegAsmName(PR))) {
+ std::pair<unsigned, const TargetRegisterClass *> S =
+ std::make_pair(PR, RC);
+
+ // If this register class has the requested value type, return it,
+ // otherwise keep searching and return the first class found
+ // if no other is found which explicitly has the requested type.
+ if (RI->isTypeLegalForClass(*RC, VT))
+ return S;
+ if (!R.second)
+ R = S;
+ }
+ }
+ }
+
+ return R;
+}
+
+//===----------------------------------------------------------------------===//
+// Constraint Selection.
+
+/// Return true if this is an input operand that is a matching constraint
+/// like "4".
+bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const {
+ assert(!ConstraintCode.empty() && "No known constraint!");
+ return isdigit(static_cast<unsigned char>(ConstraintCode[0]));
+}
+
+/// If this is an input matching constraint, this method returns the output
+/// operand it matches.
+unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
+ assert(!ConstraintCode.empty() && "No known constraint!");
+ return atoi(ConstraintCode.c_str());
+}
+
+/// Split up the constraint string from the inline assembly value into the
+/// specific constraints and their prefixes, and also tie in the associated
+/// operand values.
+/// If this returns an empty vector, and if the constraint string itself
+/// isn't empty, there was an error parsing.
+TargetLowering::AsmOperandInfoVector
+TargetLowering::ParseConstraints(const DataLayout &DL,
+ const TargetRegisterInfo *TRI,
+ const CallBase &Call) const {
+ /// Information about all of the constraints.
+ AsmOperandInfoVector ConstraintOperands;
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
+ unsigned maCount = 0; // Largest number of multiple alternative constraints.
+
+ // Do a prepass over the constraints, canonicalizing them, and building up the
+ // ConstraintOperands list.
+ unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
+ unsigned ResNo = 0; // ResNo - The result number of the next output.
+ unsigned LabelNo = 0; // LabelNo - CallBr indirect dest number.
+
+ for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+ ConstraintOperands.emplace_back(std::move(CI));
+ AsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+ // Update multiple alternative constraint count.
+ if (OpInfo.multipleAlternatives.size() > maCount)
+ maCount = OpInfo.multipleAlternatives.size();
+
+ OpInfo.ConstraintVT = MVT::Other;
+
+ // Compute the value type for each operand.
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ // Indirect outputs just consume an argument.
+ if (OpInfo.isIndirect) {
+ OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
+ break;
+ }
+
+ // The return value of the call is this value. As such, there is no
+ // corresponding argument.
+ assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
+ if (StructType *STy = dyn_cast<StructType>(Call.getType())) {
+ OpInfo.ConstraintVT =
+ getSimpleValueType(DL, STy->getElementType(ResNo));
+ } else {
+ assert(ResNo == 0 && "Asm only has one result!");
+ OpInfo.ConstraintVT =
+ getAsmOperandValueType(DL, Call.getType()).getSimpleVT();
+ }
+ ++ResNo;
+ break;
+ case InlineAsm::isInput:
+ OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
+ break;
+ case InlineAsm::isLabel:
+ OpInfo.CallOperandVal = cast<CallBrInst>(&Call)->getIndirectDest(LabelNo);
+ ++LabelNo;
+ continue;
+ case InlineAsm::isClobber:
+ // Nothing to do.
+ break;
+ }
+
+ if (OpInfo.CallOperandVal) {
+ llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
+ if (OpInfo.isIndirect) {
+ OpTy = Call.getParamElementType(ArgNo);
+ assert(OpTy && "Indirect operand must have elementtype attribute");
+ }
+
+      // Look for a vector wrapped in a struct, e.g. { <16 x i8> }.
+ if (StructType *STy = dyn_cast<StructType>(OpTy))
+ if (STy->getNumElements() == 1)
+ OpTy = STy->getElementType(0);
+
+ // If OpTy is not a single value, it may be a struct/union that we
+ // can tile with integers.
+ if (!OpTy->isSingleValueType() && OpTy->isSized()) {
+ unsigned BitSize = DL.getTypeSizeInBits(OpTy);
+ switch (BitSize) {
+ default: break;
+ case 1:
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ case 128:
+ OpTy = IntegerType::get(OpTy->getContext(), BitSize);
+ break;
+ }
+ }
+
+ EVT VT = getAsmOperandValueType(DL, OpTy, true);
+ OpInfo.ConstraintVT = VT.isSimple() ? VT.getSimpleVT() : MVT::Other;
+ ArgNo++;
+ }
+ }
+
+ // If we have multiple alternative constraints, select the best alternative.
+ if (!ConstraintOperands.empty()) {
+ if (maCount) {
+ unsigned bestMAIndex = 0;
+ int bestWeight = -1;
+ // weight: -1 = invalid match, and 0 = so-so match to 5 = good match.
+ int weight = -1;
+ unsigned maIndex;
+ // Compute the sums of the weights for each alternative, keeping track
+ // of the best (highest weight) one so far.
+ for (maIndex = 0; maIndex < maCount; ++maIndex) {
+ int weightSum = 0;
+ for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
+ cIndex != eIndex; ++cIndex) {
+ AsmOperandInfo &OpInfo = ConstraintOperands[cIndex];
+ if (OpInfo.Type == InlineAsm::isClobber)
+ continue;
+
+ // If this is an output operand with a matching input operand,
+ // look up the matching input. If their types mismatch, e.g. one
+ // is an integer, the other is floating point, or their sizes are
+ // different, flag it as an maCantMatch.
+ if (OpInfo.hasMatchingInput()) {
+ AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+ if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+ if ((OpInfo.ConstraintVT.isInteger() !=
+ Input.ConstraintVT.isInteger()) ||
+ (OpInfo.ConstraintVT.getSizeInBits() !=
+ Input.ConstraintVT.getSizeInBits())) {
+ weightSum = -1; // Can't match.
+ break;
+ }
+ }
+ }
+ weight = getMultipleConstraintMatchWeight(OpInfo, maIndex);
+ if (weight == -1) {
+ weightSum = -1;
+ break;
+ }
+ weightSum += weight;
+ }
+ // Update best.
+ if (weightSum > bestWeight) {
+ bestWeight = weightSum;
+ bestMAIndex = maIndex;
+ }
+ }
+
+ // Now select chosen alternative in each constraint.
+ for (AsmOperandInfo &cInfo : ConstraintOperands)
+ if (cInfo.Type != InlineAsm::isClobber)
+ cInfo.selectAlternative(bestMAIndex);
+ }
+ }
+
+ // Check and hook up tied operands, choose constraint code to use.
+ for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
+ cIndex != eIndex; ++cIndex) {
+ AsmOperandInfo &OpInfo = ConstraintOperands[cIndex];
+
+ // If this is an output operand with a matching input operand, look up the
+ // matching input. If their types mismatch, e.g. one is an integer, the
+ // other is floating point, or their sizes are different, flag it as an
+ // error.
+ if (OpInfo.hasMatchingInput()) {
+ AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+
+ if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+ std::pair<unsigned, const TargetRegisterClass *> MatchRC =
+ getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
+ std::pair<unsigned, const TargetRegisterClass *> InputRC =
+ getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
+ Input.ConstraintVT);
+ if ((OpInfo.ConstraintVT.isInteger() !=
+ Input.ConstraintVT.isInteger()) ||
+ (MatchRC.second != InputRC.second)) {
+ report_fatal_error("Unsupported asm: input constraint"
+ " with a matching output constraint of"
+ " incompatible type!");
+ }
+ }
+ }
+ }
+
+ return ConstraintOperands;
+}
+
+/// Return an integer indicating how general CT is.
+static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
+ switch (CT) {
+ case TargetLowering::C_Immediate:
+ case TargetLowering::C_Other:
+ case TargetLowering::C_Unknown:
+ return 0;
+ case TargetLowering::C_Register:
+ return 1;
+ case TargetLowering::C_RegisterClass:
+ return 2;
+ case TargetLowering::C_Memory:
+ case TargetLowering::C_Address:
+ return 3;
+ }
+ llvm_unreachable("Invalid constraint type");
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ TargetLowering::getMultipleConstraintMatchWeight(
+ AsmOperandInfo &info, int maIndex) const {
+ InlineAsm::ConstraintCodeVector *rCodes;
+ if (maIndex >= (int)info.multipleAlternatives.size())
+ rCodes = &info.Codes;
+ else
+ rCodes = &info.multipleAlternatives[maIndex].Codes;
+ ConstraintWeight BestWeight = CW_Invalid;
+
+ // Loop over the options, keeping track of the most general one.
+ for (const std::string &rCode : *rCodes) {
+ ConstraintWeight weight =
+ getSingleConstraintMatchWeight(info, rCode.c_str());
+ if (weight > BestWeight)
+ BestWeight = weight;
+ }
+
+ return BestWeight;
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ TargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (!CallOperandVal)
+ return CW_Default;
+ // Look at the constraint type.
+ switch (*constraint) {
+ case 'i': // immediate integer.
+ case 'n': // immediate integer with a known value.
+ if (isa<ConstantInt>(CallOperandVal))
+ weight = CW_Constant;
+ break;
+  case 's': // non-explicit integral immediate.
+ if (isa<GlobalValue>(CallOperandVal))
+ weight = CW_Constant;
+ break;
+ case 'E': // immediate float if host format.
+ case 'F': // immediate float.
+ if (isa<ConstantFP>(CallOperandVal))
+ weight = CW_Constant;
+ break;
+ case '<': // memory operand with autodecrement.
+ case '>': // memory operand with autoincrement.
+ case 'm': // memory operand.
+ case 'o': // offsettable memory operand
+ case 'V': // non-offsettable memory operand
+ weight = CW_Memory;
+ break;
+ case 'r': // general register.
+ case 'g': // general register, memory operand or immediate integer.
+ // note: Clang converts "g" to "imr".
+ if (CallOperandVal->getType()->isIntegerTy())
+ weight = CW_Register;
+ break;
+ case 'X': // any operand.
+ default:
+ weight = CW_Default;
+ break;
+ }
+ return weight;
+}
+
+/// If there are multiple different constraints that we could pick for this
+/// operand (e.g. "imr") try to pick the 'best' one.
+/// This is somewhat tricky: constraints fall into four classes:
+/// Other -> immediates and magic values
+/// Register -> one specific register
+/// RegisterClass -> a group of regs
+/// Memory -> memory
+/// Ideally, we would pick the most specific constraint possible: if we have
+/// something that fits into a register, we would pick it. The problem here
+/// is that if we have something that could either be in a register or in
+/// memory, then using the register could cause selection of *other*
+/// operands to fail: they might only succeed if we pick memory. Because of
+/// this, the heuristic we use is:
+///
+/// 1) If there is an 'other' constraint, and if the operand is valid for
+/// that constraint, use it. This makes us take advantage of 'i'
+/// constraints when available.
+/// 2) Otherwise, pick the most general constraint present. This prefers
+/// 'm' over 'r', for example.
+///
+static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
+ const TargetLowering &TLI,
+ SDValue Op, SelectionDAG *DAG) {
+ assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
+ unsigned BestIdx = 0;
+ TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
+ int BestGenerality = -1;
+
+ // Loop over the options, keeping track of the most general one.
+ for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
+ TargetLowering::ConstraintType CType =
+ TLI.getConstraintType(OpInfo.Codes[i]);
+
+ // Indirect 'other' or 'immediate' constraints are not allowed.
+ if (OpInfo.isIndirect && !(CType == TargetLowering::C_Memory ||
+ CType == TargetLowering::C_Register ||
+ CType == TargetLowering::C_RegisterClass))
+ continue;
+
+ // If this is an 'other' or 'immediate' constraint, see if the operand is
+ // valid for it. For example, on X86 we might have an 'rI' constraint. If
+ // the operand is an integer in the range [0..31] we want to use I (saving a
+ // load of a register), otherwise we must use 'r'.
+ if ((CType == TargetLowering::C_Other ||
+ CType == TargetLowering::C_Immediate) && Op.getNode()) {
+ assert(OpInfo.Codes[i].size() == 1 &&
+ "Unhandled multi-letter 'other' constraint");
+ std::vector<SDValue> ResultOps;
+ TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i],
+ ResultOps, *DAG);
+ if (!ResultOps.empty()) {
+ BestType = CType;
+ BestIdx = i;
+ break;
+ }
+ }
+
+ // Things with matching constraints can only be registers, per gcc
+ // documentation. This mainly affects "g" constraints.
+ if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput())
+ continue;
+
+ // This constraint letter is more general than the previous one, use it.
+ int Generality = getConstraintGenerality(CType);
+ if (Generality > BestGenerality) {
+ BestType = CType;
+ BestIdx = i;
+ BestGenerality = Generality;
+ }
+ }
+
+ OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
+ OpInfo.ConstraintType = BestType;
+}
+
+/// Determines the constraint code and constraint type to use for the specific
+/// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
+void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
+ SDValue Op,
+ SelectionDAG *DAG) const {
+ assert(!OpInfo.Codes.empty() && "Must have at least one constraint");
+
+ // Single-letter constraints ('r') are very common.
+ if (OpInfo.Codes.size() == 1) {
+ OpInfo.ConstraintCode = OpInfo.Codes[0];
+ OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+ } else {
+ ChooseConstraint(OpInfo, *this, Op, DAG);
+ }
+
+ // 'X' matches anything.
+ if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) {
+ // Constants are handled elsewhere. For Functions, the type here is the
+ // type of the result, which is not what we want to look at; leave them
+ // alone.
+ Value *v = OpInfo.CallOperandVal;
+ if (isa<ConstantInt>(v) || isa<Function>(v)) {
+ return;
+ }
+
+ if (isa<BasicBlock>(v) || isa<BlockAddress>(v)) {
+ OpInfo.ConstraintCode = "i";
+ return;
+ }
+
+ // Otherwise, try to resolve it to something we know about by looking at
+ // the actual operand type.
+ if (const char *Repl = LowerXConstraint(OpInfo.ConstraintVT)) {
+ OpInfo.ConstraintCode = Repl;
+ OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+ }
+ }
+}
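+
+// Illustrative example (annotation): for a multi-option constraint such as
+// "imr" with an integer-constant operand, ChooseConstraint tries the
+// 'other'/'immediate' codes first via LowerAsmOperandForConstraint and keeps
+// 'i' if the constant lowers successfully; otherwise it falls back to the
+// most general remaining option, preferring 'm' over 'r'.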
+
+/// Given an exact SDIV by a constant, create a multiplication
+/// with the multiplicative inverse of the constant.
+static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ EVT ShSVT = ShVT.getScalarType();
+
+ bool UseSRA = false;
+ SmallVector<SDValue, 16> Shifts, Factors;
+
+ auto BuildSDIVPattern = [&](ConstantSDNode *C) {
+ if (C->isZero())
+ return false;
+ APInt Divisor = C->getAPIntValue();
+ unsigned Shift = Divisor.countr_zero();
+ if (Shift) {
+ Divisor.ashrInPlace(Shift);
+ UseSRA = true;
+ }
+ // Calculate the multiplicative inverse, using Newton's method.
+ APInt t;
+ APInt Factor = Divisor;
+ while ((t = Divisor * Factor) != 1)
+ Factor *= APInt(Divisor.getBitWidth(), 2) - t;
+ Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
+ Factors.push_back(DAG.getConstant(Factor, dl, SVT));
+ return true;
+ };
+
+ // Collect all magic values from the build vector.
+ if (!ISD::matchUnaryPredicate(Op1, BuildSDIVPattern))
+ return SDValue();
+
+ SDValue Shift, Factor;
+ if (Op1.getOpcode() == ISD::BUILD_VECTOR) {
+ Shift = DAG.getBuildVector(ShVT, dl, Shifts);
+ Factor = DAG.getBuildVector(VT, dl, Factors);
+ } else if (Op1.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(Shifts.size() == 1 && Factors.size() == 1 &&
+ "Expected matchUnaryPredicate to return one element for scalable "
+ "vectors");
+ Shift = DAG.getSplatVector(ShVT, dl, Shifts[0]);
+ Factor = DAG.getSplatVector(VT, dl, Factors[0]);
+ } else {
+ assert(isa<ConstantSDNode>(Op1) && "Expected a constant");
+ Shift = Shifts[0];
+ Factor = Factors[0];
+ }
+
+ SDValue Res = Op0;
+
+ // Shift the value upfront if it is even, so the LSB is one.
+ if (UseSRA) {
+ // TODO: For UDIV use SRL instead of SRA.
+ SDNodeFlags Flags;
+ Flags.setExact(true);
+ Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags);
+ Created.push_back(Res.getNode());
+ }
+
+ return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
+}
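+
+// Worked example for BuildExactSDIV (illustrative annotation): for an exact
+// sdiv i32 %x, 6 the divisor splits into 2 * 3, so Shift = 1 and Factor is
+// the multiplicative inverse of 3 modulo 2^32, 0xAAAAAAAB (3 * 0xAAAAAAAB
+// wraps to 1 in 32 bits). The emitted sequence is roughly
+//   %t = sra exact %x, 1
+//   %q = mul %t, 0xAAAAAAAB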
+
+SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N, 0); // Lower SDIV as SDIV
+ return SDValue();
+}
+
+SDValue
+TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N, 0); // Lower SREM as SREM
+ return SDValue();
+}
+
+/// Given an ISD::SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
+ bool IsAfterLegalization,
+ SmallVectorImpl<SDNode *> &Created) const {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ EVT ShSVT = ShVT.getScalarType();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ EVT MulVT;
+
+ // Check to see if we can do this.
+ // FIXME: We should be more aggressive here.
+ if (!isTypeLegal(VT)) {
+ // Limit this to simple scalars for now.
+ if (VT.isVector() || !VT.isSimple())
+ return SDValue();
+
+ // If this type will be promoted to a large enough type with a legal
+ // multiply operation, we can go ahead and do this transform.
+ if (getTypeAction(VT.getSimpleVT()) != TypePromoteInteger)
+ return SDValue();
+
+ MulVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ if (MulVT.getSizeInBits() < (2 * EltBits) ||
+ !isOperationLegal(ISD::MUL, MulVT))
+ return SDValue();
+ }
+
+ // If the sdiv has an 'exact' bit we can use a simpler lowering.
+ if (N->getFlags().hasExact())
+ return BuildExactSDIV(*this, N, dl, DAG, Created);
+
+ SmallVector<SDValue, 16> MagicFactors, Factors, Shifts, ShiftMasks;
+
+ auto BuildSDIVPattern = [&](ConstantSDNode *C) {
+ if (C->isZero())
+ return false;
+
+ const APInt &Divisor = C->getAPIntValue();
+    SignedDivisionByConstantInfo magics =
+        SignedDivisionByConstantInfo::get(Divisor);
+ int NumeratorFactor = 0;
+ int ShiftMask = -1;
+
+ if (Divisor.isOne() || Divisor.isAllOnes()) {
+ // If d is +1/-1, we just multiply the numerator by +1/-1.
+ NumeratorFactor = Divisor.getSExtValue();
+ magics.Magic = 0;
+ magics.ShiftAmount = 0;
+ ShiftMask = 0;
+ } else if (Divisor.isStrictlyPositive() && magics.Magic.isNegative()) {
+ // If d > 0 and m < 0, add the numerator.
+ NumeratorFactor = 1;
+ } else if (Divisor.isNegative() && magics.Magic.isStrictlyPositive()) {
+ // If d < 0 and m > 0, subtract the numerator.
+ NumeratorFactor = -1;
+ }
+
+ MagicFactors.push_back(DAG.getConstant(magics.Magic, dl, SVT));
+ Factors.push_back(DAG.getConstant(NumeratorFactor, dl, SVT));
+ Shifts.push_back(DAG.getConstant(magics.ShiftAmount, dl, ShSVT));
+ ShiftMasks.push_back(DAG.getConstant(ShiftMask, dl, SVT));
+ return true;
+ };
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Collect the shifts / magic values from each element.
+ if (!ISD::matchUnaryPredicate(N1, BuildSDIVPattern))
+ return SDValue();
+
+ SDValue MagicFactor, Factor, Shift, ShiftMask;
+ if (N1.getOpcode() == ISD::BUILD_VECTOR) {
+ MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
+ Factor = DAG.getBuildVector(VT, dl, Factors);
+ Shift = DAG.getBuildVector(ShVT, dl, Shifts);
+ ShiftMask = DAG.getBuildVector(VT, dl, ShiftMasks);
+ } else if (N1.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(MagicFactors.size() == 1 && Factors.size() == 1 &&
+ Shifts.size() == 1 && ShiftMasks.size() == 1 &&
+ "Expected matchUnaryPredicate to return one element for scalable "
+ "vectors");
+ MagicFactor = DAG.getSplatVector(VT, dl, MagicFactors[0]);
+ Factor = DAG.getSplatVector(VT, dl, Factors[0]);
+ Shift = DAG.getSplatVector(ShVT, dl, Shifts[0]);
+ ShiftMask = DAG.getSplatVector(VT, dl, ShiftMasks[0]);
+ } else {
+ assert(isa<ConstantSDNode>(N1) && "Expected a constant");
+ MagicFactor = MagicFactors[0];
+ Factor = Factors[0];
+ Shift = Shifts[0];
+ ShiftMask = ShiftMasks[0];
+ }
+
+ // Multiply the numerator (operand 0) by the magic value.
+ // FIXME: We should support doing a MUL in a wider type.
+ auto GetMULHS = [&](SDValue X, SDValue Y) {
+ // If the type isn't legal, use a wider mul of the type calculated
+ // earlier.
+ if (!isTypeLegal(VT)) {
+ X = DAG.getNode(ISD::SIGN_EXTEND, dl, MulVT, X);
+ Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MulVT, Y);
+ Y = DAG.getNode(ISD::MUL, dl, MulVT, X, Y);
+ Y = DAG.getNode(ISD::SRL, dl, MulVT, Y,
+ DAG.getShiftAmountConstant(EltBits, MulVT, dl));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
+ }
+
+ if (isOperationLegalOrCustom(ISD::MULHS, VT, IsAfterLegalization))
+ return DAG.getNode(ISD::MULHS, dl, VT, X, Y);
+ if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT, IsAfterLegalization)) {
+ SDValue LoHi =
+ DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
+ return SDValue(LoHi.getNode(), 1);
+ }
+ // If a type twice as wide is legal, widen and use a mul plus a shift.
+ unsigned Size = VT.getScalarSizeInBits();
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2);
+ if (VT.isVector())
+ WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
+ VT.getVectorElementCount());
+ if (isOperationLegalOrCustom(ISD::MUL, WideVT)) {
+ X = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, X);
+ Y = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Y);
+ Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y);
+ Y = DAG.getNode(ISD::SRL, dl, WideVT, Y,
+ DAG.getShiftAmountConstant(EltBits, WideVT, dl));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
+ }
+ return SDValue();
+ };
+
+ SDValue Q = GetMULHS(N0, MagicFactor);
+ if (!Q)
+ return SDValue();
+
+ Created.push_back(Q.getNode());
+
+ // (Optionally) Add/subtract the numerator using Factor.
+ Factor = DAG.getNode(ISD::MUL, dl, VT, N0, Factor);
+ Created.push_back(Factor.getNode());
+ Q = DAG.getNode(ISD::ADD, dl, VT, Q, Factor);
+ Created.push_back(Q.getNode());
+
+ // Shift right algebraic by shift value.
+ Q = DAG.getNode(ISD::SRA, dl, VT, Q, Shift);
+ Created.push_back(Q.getNode());
+
+ // Extract the sign bit, mask it and add it to the quotient.
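+ // (For a negative quotient this adds one, converting the SRA's rounding
+ // toward negative infinity into the round-toward-zero result sdiv requires;
+ // ShiftMask is zero for +1/-1 divisor lanes, where no fixup is wanted.)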
+ SDValue SignShift = DAG.getConstant(EltBits - 1, dl, ShVT);
+ SDValue T = DAG.getNode(ISD::SRL, dl, VT, Q, SignShift);
+ Created.push_back(T.getNode());
+ T = DAG.getNode(ISD::AND, dl, VT, T, ShiftMask);
+ Created.push_back(T.getNode());
+ return DAG.getNode(ISD::ADD, dl, VT, Q, T);
+}
+
+/// Given an ISD::UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
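+///
+/// Roughly, for a magic constant m with pre-shift p and post-shift s, the
+/// sequence built below is (sketch):
+///   q = mulhu(n >> p, m);
+///   if the magic requires the "add" fixup:
+///     q = q + ((n - q) >> 1);
+///   q = q >> s;
+/// followed by a select that returns n unchanged for a divisor of 1.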
+SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
+ bool IsAfterLegalization,
+ SmallVectorImpl<SDNode *> &Created) const {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ EVT ShSVT = ShVT.getScalarType();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ EVT MulVT;
+
+ // Check to see if we can do this.
+ // FIXME: We should be more aggressive here.
+ if (!isTypeLegal(VT)) {
+ // Limit this to simple scalars for now.
+ if (VT.isVector() || !VT.isSimple())
+ return SDValue();
+
+ // If this type will be promoted to a large enough type with a legal
+ // multiply operation, we can go ahead and do this transform.
+ if (getTypeAction(VT.getSimpleVT()) != TypePromoteInteger)
+ return SDValue();
+
+ MulVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ if (MulVT.getSizeInBits() < (2 * EltBits) ||
+ !isOperationLegal(ISD::MUL, MulVT))
+ return SDValue();
+ }
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Try to use leading zeros of the dividend to reduce the multiplier and
+ // avoid expensive fixups.
+ // TODO: Support vectors.
+ unsigned LeadingZeros = 0;
+ if (!VT.isVector() && isa<ConstantSDNode>(N1)) {
+ assert(!isOneConstant(N1) && "Unexpected divisor");
+ LeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
+ // UnsignedDivisionByConstantInfo doesn't work correctly if the leading zeros
+ // in the dividend exceed the leading zeros of the divisor.
+ LeadingZeros = std::min(
+ LeadingZeros, cast<ConstantSDNode>(N1)->getAPIntValue().countl_zero());
+ }
+
+ bool UseNPQ = false, UsePreShift = false, UsePostShift = false;
+ SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+
+ auto BuildUDIVPattern = [&](ConstantSDNode *C) {
+ if (C->isZero())
+ return false;
+ const APInt &Divisor = C->getAPIntValue();
+
+ SDValue PreShift, MagicFactor, NPQFactor, PostShift;
+
+ // Magic algorithm doesn't work for division by 1. We need to emit a select
+ // at the end.
+ if (Divisor.isOne()) {
+ PreShift = PostShift = DAG.getUNDEF(ShSVT);
+ MagicFactor = NPQFactor = DAG.getUNDEF(SVT);
+ } else {
+ UnsignedDivisionByConstantInfo magics =
+ UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
+
+ MagicFactor = DAG.getConstant(magics.Magic, dl, SVT);
+
+ assert(magics.PreShift < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ assert(magics.PostShift < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ assert((!magics.IsAdd || magics.PreShift == 0) &&
+ "Unexpected pre-shift");
+ PreShift = DAG.getConstant(magics.PreShift, dl, ShSVT);
+ PostShift = DAG.getConstant(magics.PostShift, dl, ShSVT);
+ NPQFactor = DAG.getConstant(
+ magics.IsAdd ? APInt::getOneBitSet(EltBits, EltBits - 1)
+ : APInt::getZero(EltBits),
+ dl, SVT);
+ UseNPQ |= magics.IsAdd;
+ UsePreShift |= magics.PreShift != 0;
+ UsePostShift |= magics.PostShift != 0;
+ }
+
+ PreShifts.push_back(PreShift);
+ MagicFactors.push_back(MagicFactor);
+ NPQFactors.push_back(NPQFactor);
+ PostShifts.push_back(PostShift);
+ return true;
+ };
+
+ // Collect the shifts/magic values from each element.
+ if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern))
+ return SDValue();
+
+ SDValue PreShift, PostShift, MagicFactor, NPQFactor;
+ if (N1.getOpcode() == ISD::BUILD_VECTOR) {
+ PreShift = DAG.getBuildVector(ShVT, dl, PreShifts);
+ MagicFactor = DAG.getBuildVector(VT, dl, MagicFactors);
+ NPQFactor = DAG.getBuildVector(VT, dl, NPQFactors);
+ PostShift = DAG.getBuildVector(ShVT, dl, PostShifts);
+ } else if (N1.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(PreShifts.size() == 1 && MagicFactors.size() == 1 &&
+ NPQFactors.size() == 1 && PostShifts.size() == 1 &&
+ "Expected matchUnaryPredicate to return one for scalable vectors");
+ PreShift = DAG.getSplatVector(ShVT, dl, PreShifts[0]);
+ MagicFactor = DAG.getSplatVector(VT, dl, MagicFactors[0]);
+ NPQFactor = DAG.getSplatVector(VT, dl, NPQFactors[0]);
+ PostShift = DAG.getSplatVector(ShVT, dl, PostShifts[0]);
+ } else {
+ assert(isa<ConstantSDNode>(N1) && "Expected a constant");
+ PreShift = PreShifts[0];
+ MagicFactor = MagicFactors[0];
+ PostShift = PostShifts[0];
+ }
+
+ SDValue Q = N0;
+ if (UsePreShift) {
+ Q = DAG.getNode(ISD::SRL, dl, VT, Q, PreShift);
+ Created.push_back(Q.getNode());
+ }
+
+ // FIXME: We should support doing a MUL in a wider type.
+ auto GetMULHU = [&](SDValue X, SDValue Y) {
+ // If the type isn't legal, use a wider mul of the type calculated
+ // earlier.
+ if (!isTypeLegal(VT)) {
+ X = DAG.getNode(ISD::ZERO_EXTEND, dl, MulVT, X);
+ Y = DAG.getNode(ISD::ZERO_EXTEND, dl, MulVT, Y);
+ Y = DAG.getNode(ISD::MUL, dl, MulVT, X, Y);
+ Y = DAG.getNode(ISD::SRL, dl, MulVT, Y,
+ DAG.getShiftAmountConstant(EltBits, MulVT, dl));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
+ }
+
+ if (isOperationLegalOrCustom(ISD::MULHU, VT, IsAfterLegalization))
+ return DAG.getNode(ISD::MULHU, dl, VT, X, Y);
+ if (isOperationLegalOrCustom(ISD::UMUL_LOHI, VT, IsAfterLegalization)) {
+ SDValue LoHi =
+ DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
+ return SDValue(LoHi.getNode(), 1);
+ }
+ // If a type twice as wide is legal, widen and use a mul plus a shift.
+ unsigned Size = VT.getScalarSizeInBits();
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), Size * 2);
+ if (VT.isVector())
+ WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
+ VT.getVectorElementCount());
+ if (isOperationLegalOrCustom(ISD::MUL, WideVT)) {
+ X = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X);
+ Y = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y);
+ Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y);
+ Y = DAG.getNode(ISD::SRL, dl, WideVT, Y,
+ DAG.getShiftAmountConstant(EltBits, WideVT, dl));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
+ }
+ return SDValue(); // No mulhu or equivalent
+ };
+
+ // Multiply the numerator (operand 0) by the magic value.
+ Q = GetMULHU(Q, MagicFactor);
+ if (!Q)
+ return SDValue();
+
+ Created.push_back(Q.getNode());
+
+ if (UseNPQ) {
+ SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N0, Q);
+ Created.push_back(NPQ.getNode());
+
+ // For vectors we might have a mix of non-NPQ/NPQ paths, so use
+ // MULHU to act as a SRL-by-1 for NPQ, else multiply by zero.
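+ // Note that ((N - Q) >> 1) + Q equals (N + Q) >> 1 but cannot overflow the
+ // intermediate sum, which is why the fixup is expressed this way.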
+ if (VT.isVector())
+ NPQ = GetMULHU(NPQ, NPQFactor);
+ else
+ NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ, DAG.getConstant(1, dl, ShVT));
+
+ Created.push_back(NPQ.getNode());
+
+ Q = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
+ Created.push_back(Q.getNode());
+ }
+
+ if (UsePostShift) {
+ Q = DAG.getNode(ISD::SRL, dl, VT, Q, PostShift);
+ Created.push_back(Q.getNode());
+ }
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ SDValue One = DAG.getConstant(1, dl, VT);
+ SDValue IsOne = DAG.getSetCC(dl, SetCCVT, N1, One, ISD::SETEQ);
+ return DAG.getSelect(dl, VT, IsOne, N0, Q);
+}
+
+/// If all values in Values that *don't* match the predicate are the same
+/// 'splat' value, then replace all values with that splat value.
+/// Else, if AlternativeReplacement was provided, then replace all values that
+/// do match the predicate with the AlternativeReplacement value.
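+/// E.g. with Predicate = isNullConstant, {0, 5, 0, 5} becomes {5, 5, 5, 5},
+/// while for {0, 5, 0, 7} no splat is found, so the zeros are only replaced
+/// if an AlternativeReplacement was provided.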
+static void
+turnVectorIntoSplatVector(MutableArrayRef<SDValue> Values,
+ std::function<bool(SDValue)> Predicate,
+ SDValue AlternativeReplacement = SDValue()) {
+ SDValue Replacement;
+ // Is there a value for which the Predicate does *NOT* match? What is it?
+ auto SplatValue = llvm::find_if_not(Values, Predicate);
+ if (SplatValue != Values.end()) {
+ // Does Values consist only of SplatValue and values matching the Predicate?
+ if (llvm::all_of(Values, [Predicate, SplatValue](SDValue Value) {
+ return Value == *SplatValue || Predicate(Value);
+ })) // Then we shall replace values matching predicate with SplatValue.
+ Replacement = *SplatValue;
+ }
+ if (!Replacement) {
+ // Oops, we did not find the "baseline" splat value.
+ if (!AlternativeReplacement)
+ return; // Nothing to do.
+ // Let's replace with provided value then.
+ Replacement = AlternativeReplacement;
+ }
+ std::replace_if(Values.begin(), Values.end(), Predicate, Replacement);
+}
+
+/// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE
+/// where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue TargetLowering::buildUREMEqFold(EVT SETCCVT, SDValue REMNode,
+ SDValue CompTargetNode,
+ ISD::CondCode Cond,
+ DAGCombinerInfo &DCI,
+ const SDLoc &DL) const {
+ SmallVector<SDNode *, 5> Built;
+ if (SDValue Folded = prepareUREMEqFold(SETCCVT, REMNode, CompTargetNode, Cond,
+ DCI, DL, Built)) {
+ for (SDNode *N : Built)
+ DCI.AddToWorklist(N);
+ return Folded;
+ }
+
+ return SDValue();
+}
+
+SDValue
+TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
+ SDValue CompTargetNode, ISD::CondCode Cond,
+ DAGCombinerInfo &DCI, const SDLoc &DL,
+ SmallVectorImpl<SDNode *> &Created) const {
+ // fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q)
+ // - D must be constant, with D = D0 * 2^K where D0 is odd
+ // - P is the multiplicative inverse of D0 modulo 2^W
+ // - Q = floor(((2^W) - 1) / D)
+ // where W is the width of the common type of N and D.
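+ // Small worked example (illustrative, W = 8): for D = 6 we get D0 = 3,
+ // K = 1, P = 171 (since 3 * 171 == 1 (mod 256)) and Q = floor(255 / 6) = 42,
+ // so (N u% 6 == 0) becomes (rotr(N * 171, 1) u<= 42).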
+ assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ "Only applicable for (in)equality comparisons.");
+
+ SelectionDAG &DAG = DCI.DAG;
+
+ EVT VT = REMNode.getValueType();
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout(), !DCI.isBeforeLegalize());
+ EVT ShSVT = ShVT.getScalarType();
+
+ // If MUL is unavailable, we cannot proceed in any case.
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT))
+ return SDValue();
+
+ bool ComparingWithAllZeros = true;
+ bool AllComparisonsWithNonZerosAreTautological = true;
+ bool HadTautologicalLanes = false;
+ bool AllLanesAreTautological = true;
+ bool HadEvenDivisor = false;
+ bool AllDivisorsArePowerOfTwo = true;
+ bool HadTautologicalInvertedLanes = false;
+ SmallVector<SDValue, 16> PAmts, KAmts, QAmts, IAmts;
+
+ auto BuildUREMPattern = [&](ConstantSDNode *CDiv, ConstantSDNode *CCmp) {
+ // Division by 0 is UB. Leave it to be constant-folded elsewhere.
+ if (CDiv->isZero())
+ return false;
+
+ const APInt &D = CDiv->getAPIntValue();
+ const APInt &Cmp = CCmp->getAPIntValue();
+
+ ComparingWithAllZeros &= Cmp.isZero();
+
+ // `x u% C1` is *always* less than C1. So given `x u% C1 == C2`,
+ // if C2 is not less than C1, the comparison is always false.
+ // But we will only be able to produce the comparison that will give the
+ // opposite tautological answer. So this lane would need to be fixed up.
+ bool TautologicalInvertedLane = D.ule(Cmp);
+ HadTautologicalInvertedLanes |= TautologicalInvertedLane;
+
+ // If all lanes are tautological (either all divisors are ones, or divisor
+ // is not greater than the constant we are comparing with),
+ // we will prefer to avoid the fold.
+ bool TautologicalLane = D.isOne() || TautologicalInvertedLane;
+ HadTautologicalLanes |= TautologicalLane;
+ AllLanesAreTautological &= TautologicalLane;
+
+ // If we are comparing with non-zero, we'll need to subtract said
+ // comparison value from the LHS. But there is no point in doing that if
+ // every lane where we are comparing with non-zero is tautological.
+ if (!Cmp.isZero())
+ AllComparisonsWithNonZerosAreTautological &= TautologicalLane;
+
+ // Decompose D into D0 * 2^K
+ unsigned K = D.countr_zero();
+ assert((!D.isOne() || (K == 0)) && "For divisor '1' we won't rotate.");
+ APInt D0 = D.lshr(K);
+
+ // D is even if it has trailing zeros.
+ HadEvenDivisor |= (K != 0);
+ // D is a power-of-two if D0 is one.
+ // If all divisors are power-of-two, we will prefer to avoid the fold.
+ AllDivisorsArePowerOfTwo &= D0.isOne();
+
+ // P = inv(D0, 2^W)
+ // 2^W requires W + 1 bits, so we have to extend and then truncate.
+ unsigned W = D.getBitWidth();
+ APInt P = D0.zext(W + 1)
+ .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
+ .trunc(W);
+ assert(!P.isZero() && "No multiplicative inverse!"); // unreachable
+ assert((D0 * P).isOne() && "Multiplicative inverse basic check failed.");
+
+ // Q = floor((2^W - 1) u/ D)
+ // R = ((2^W - 1) u% D)
+ APInt Q, R;
+ APInt::udivrem(APInt::getAllOnes(W), D, Q, R);
+
+ // If we are comparing with zero, then the comparison constant Q is okay as
+ // is; otherwise, if Cmp > R, it needs to be one less.
+ if (Cmp.ugt(R))
+ Q -= 1;
+
+ assert(APInt::getAllOnes(ShSVT.getSizeInBits()).ugt(K) &&
+ "We are expecting that K is always less than all-ones for ShSVT");
+
+ // If the lane is tautological the result can be constant-folded.
+ if (TautologicalLane) {
+ // Set P and K to bogus values so we can try to splat them.
+ P = 0;
+ K = -1;
+ // And ensure that the comparison constant is tautological,
+ // i.e. it will always compare true/false.
+ Q = -1;
+ }
+
+ PAmts.push_back(DAG.getConstant(P, DL, SVT));
+ KAmts.push_back(
+ DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT));
+ QAmts.push_back(DAG.getConstant(Q, DL, SVT));
+ return true;
+ };
+
+ SDValue N = REMNode.getOperand(0);
+ SDValue D = REMNode.getOperand(1);
+
+ // Collect the values from each element.
+ if (!ISD::matchBinaryPredicate(D, CompTargetNode, BuildUREMPattern))
+ return SDValue();
+
+ // If all lanes are tautological, the result can be constant-folded.
+ if (AllLanesAreTautological)
+ return SDValue();
+
+ // If this is a urem by a power-of-two, avoid the fold since it can be
+ // best implemented as a bit test.
+ if (AllDivisorsArePowerOfTwo)
+ return SDValue();
+
+ SDValue PVal, KVal, QVal;
+ if (D.getOpcode() == ISD::BUILD_VECTOR) {
+ if (HadTautologicalLanes) {
+ // Try to turn PAmts into a splat, since we don't care about the values
+ // that are currently '0'. If we can't, just keep '0's.
+ turnVectorIntoSplatVector(PAmts, isNullConstant);
+ // Try to turn KAmts into a splat, since we don't care about the values
+ // that are currently '-1'. If we can't, change them to '0's.
+ turnVectorIntoSplatVector(KAmts, isAllOnesConstant,
+ DAG.getConstant(0, DL, ShSVT));
+ }
+
+ PVal = DAG.getBuildVector(VT, DL, PAmts);
+ KVal = DAG.getBuildVector(ShVT, DL, KAmts);
+ QVal = DAG.getBuildVector(VT, DL, QAmts);
+ } else if (D.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(PAmts.size() == 1 && KAmts.size() == 1 && QAmts.size() == 1 &&
+ "Expected matchBinaryPredicate to return one element for "
+ "SPLAT_VECTORs");
+ PVal = DAG.getSplatVector(VT, DL, PAmts[0]);
+ KVal = DAG.getSplatVector(ShVT, DL, KAmts[0]);
+ QVal = DAG.getSplatVector(VT, DL, QAmts[0]);
+ } else {
+ PVal = PAmts[0];
+ KVal = KAmts[0];
+ QVal = QAmts[0];
+ }
+
+ if (!ComparingWithAllZeros && !AllComparisonsWithNonZerosAreTautological) {
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::SUB, VT))
+ return SDValue(); // FIXME: Could/should use `ISD::ADD`?
+ assert(CompTargetNode.getValueType() == N.getValueType() &&
+ "Expecting that the types on LHS and RHS of comparisons match.");
+ N = DAG.getNode(ISD::SUB, DL, VT, N, CompTargetNode);
+ }
+
+ // (mul N, P)
+ SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal);
+ Created.push_back(Op0.getNode());
+
+ // Rotate right only if any divisor was even. We avoid rotates for all-odd
+ // divisors as a performance improvement, since rotating by 0 is a no-op.
+ if (HadEvenDivisor) {
+ // We need ROTR to do this.
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT))
+ return SDValue();
+ // UREM: (rotr (mul N, P), K)
+ Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal);
+ Created.push_back(Op0.getNode());
+ }
+
+ // UREM: (setule/setugt (rotr (mul N, P), K), Q)
+ SDValue NewCC =
+ DAG.getSetCC(DL, SETCCVT, Op0, QVal,
+ ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT));
+ if (!HadTautologicalInvertedLanes)
+ return NewCC;
+
+ // If any lanes previously compared always-false, the NewCC will give
+ // an always-true result for them, so we need to fix up those lanes.
+ // Or the other way around for the inequality predicate.
+ assert(VT.isVector() && "Can/should only get here for vectors.");
+ Created.push_back(NewCC.getNode());
+
+ // `x u% C1` is *always* less than C1. So given `x u% C1 == C2`,
+ // if C2 is not less than C1, the comparison is always false.
+ // But we have produced the comparison that will give the
+ // opposite tautological answer. So these lanes would need to be fixed up.
+ SDValue TautologicalInvertedChannels =
+ DAG.getSetCC(DL, SETCCVT, D, CompTargetNode, ISD::SETULE);
+ Created.push_back(TautologicalInvertedChannels.getNode());
+
+ // NOTE: we avoid letting illegal types through even if we're before legalize
+ // ops; legalization has a hard time producing good code for this.
+ if (isOperationLegalOrCustom(ISD::VSELECT, SETCCVT)) {
+ // If we have a vector select, let's replace the comparison results in the
+ // affected lanes with the correct tautological result.
+ SDValue Replacement = DAG.getBoolConstant(Cond == ISD::SETEQ ? false : true,
+ DL, SETCCVT, SETCCVT);
+ return DAG.getNode(ISD::VSELECT, DL, SETCCVT, TautologicalInvertedChannels,
+ Replacement, NewCC);
+ }
+
+ // Else, we can just invert the comparison result in the appropriate lanes.
+ //
+ // NOTE: see the note above the VSELECT check.
+ if (isOperationLegalOrCustom(ISD::XOR, SETCCVT))
+ return DAG.getNode(ISD::XOR, DL, SETCCVT, NewCC,
+ TautologicalInvertedChannels);
+
+ return SDValue(); // Don't know how to lower.
+}
+
+/// Given an ISD::SREM used only by an ISD::SETEQ or ISD::SETNE
+/// where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue TargetLowering::buildSREMEqFold(EVT SETCCVT, SDValue REMNode,
+ SDValue CompTargetNode,
+ ISD::CondCode Cond,
+ DAGCombinerInfo &DCI,
+ const SDLoc &DL) const {
+ SmallVector<SDNode *, 7> Built;
+ if (SDValue Folded = prepareSREMEqFold(SETCCVT, REMNode, CompTargetNode, Cond,
+ DCI, DL, Built)) {
+ assert(Built.size() <= 7 && "Max size prediction failed.");
+ for (SDNode *N : Built)
+ DCI.AddToWorklist(N);
+ return Folded;
+ }
+
+ return SDValue();
+}
+
+SDValue
+TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
+ SDValue CompTargetNode, ISD::CondCode Cond,
+ DAGCombinerInfo &DCI, const SDLoc &DL,
+ SmallVectorImpl<SDNode *> &Created) const {
+ // Fold:
+ // (seteq/ne (srem N, D), 0)
+ // To:
+ // (setule/ugt (rotr (add (mul N, P), A), K), Q)
+ //
+ // - D must be constant, with D = D0 * 2^K where D0 is odd
+ // - P is the multiplicative inverse of D0 modulo 2^W
+ // - A = bitwiseand(floor((2^(W - 1) - 1) / D0), (-(2^K)))
+ // - Q = floor((2 * A) / (2^K))
+ // where W is the width of the common type of N and D.
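+ // Small worked example (illustrative, W = 8): for D = 6 we get D0 = 3,
+ // K = 1, P = 171 (since 3 * 171 == 1 (mod 256)), A = floor(127 / 3) & -2 = 42
+ // and Q = (2 * 42) / 2 = 42, so (N s% 6 == 0) becomes
+ // (rotr(N * 171 + 42, 1) u<= 42).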
+ assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ "Only applicable for (in)equality comparisons.");
+
+ SelectionDAG &DAG = DCI.DAG;
+
+ EVT VT = REMNode.getValueType();
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout(), !DCI.isBeforeLegalize());
+ EVT ShSVT = ShVT.getScalarType();
+
+ // If we are after ops legalization, and MUL is unavailable, we cannot
+ // proceed.
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT))
+ return SDValue();
+
+ // TODO: Could support comparing with non-zero too.
+ ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode);
+ if (!CompTarget || !CompTarget->isZero())
+ return SDValue();
+
+ bool HadIntMinDivisor = false;
+ bool HadOneDivisor = false;
+ bool AllDivisorsAreOnes = true;
+ bool HadEvenDivisor = false;
+ bool NeedToApplyOffset = false;
+ bool AllDivisorsArePowerOfTwo = true;
+ SmallVector<SDValue, 16> PAmts, AAmts, KAmts, QAmts;
+
+ auto BuildSREMPattern = [&](ConstantSDNode *C) {
+ // Division by 0 is UB. Leave it to be constant-folded elsewhere.
+ if (C->isZero())
+ return false;
+
+ // FIXME: we don't fold `rem %X, -C` to `rem %X, C` in DAGCombine.
+
+ // WARNING: this fold is only valid for positive divisors!
+ APInt D = C->getAPIntValue();
+ if (D.isNegative())
+ D.negate(); // `rem %X, -C` is equivalent to `rem %X, C`
+
+ HadIntMinDivisor |= D.isMinSignedValue();
+
+ // If all divisors are ones, we will prefer to avoid the fold.
+ HadOneDivisor |= D.isOne();
+ AllDivisorsAreOnes &= D.isOne();
+
+ // Decompose D into D0 * 2^K
+ unsigned K = D.countr_zero();
+ assert((!D.isOne() || (K == 0)) && "For divisor '1' we won't rotate.");
+ APInt D0 = D.lshr(K);
+
+ if (!D.isMinSignedValue()) {
+ // D is even if it has trailing zeros; unless it's INT_MIN, in which case
+ // we don't care about this lane in this fold; we'll special-handle it.
+ HadEvenDivisor |= (K != 0);
+ }
+
+ // D is a power-of-two if D0 is one. This includes INT_MIN.
+ // If all divisors are power-of-two, we will prefer to avoid the fold.
+ AllDivisorsArePowerOfTwo &= D0.isOne();
+
+ // P = inv(D0, 2^W)
+ // 2^W requires W + 1 bits, so we have to extend and then truncate.
+ unsigned W = D.getBitWidth();
+ APInt P = D0.zext(W + 1)
+ .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
+ .trunc(W);
+ assert(!P.isZero() && "No multiplicative inverse!"); // unreachable
+ assert((D0 * P).isOne() && "Multiplicative inverse basic check failed.");
+
+ // A = floor((2^(W - 1) - 1) / D0) & -2^K
+ APInt A = APInt::getSignedMaxValue(W).udiv(D0);
+ A.clearLowBits(K);
+
+ if (!D.isMinSignedValue()) {
+ // If the divisor is INT_MIN, then we don't care about this lane in this
+ // fold; we'll special-handle it.
+ NeedToApplyOffset |= A != 0;
+ }
+
+ // Q = floor((2 * A) / (2^K))
+ APInt Q = (2 * A).udiv(APInt::getOneBitSet(W, K));
+
+ assert(APInt::getAllOnes(SVT.getSizeInBits()).ugt(A) &&
+ "We are expecting that A is always less than all-ones for SVT");
+ assert(APInt::getAllOnes(ShSVT.getSizeInBits()).ugt(K) &&
+ "We are expecting that K is always less than all-ones for ShSVT");
+
+ // If the divisor is 1 the result can be constant-folded. Likewise, we
+ // don't care about INT_MIN lanes; those can be set to undef if appropriate.
+ if (D.isOne()) {
+ // Set P, A and K to bogus values so we can try to splat them.
+ P = 0;
+ A = -1;
+ K = -1;
+
+ // x ?% 1 == 0 <--> true <--> x u<= -1
+ Q = -1;
+ }
+
+ PAmts.push_back(DAG.getConstant(P, DL, SVT));
+ AAmts.push_back(DAG.getConstant(A, DL, SVT));
+ KAmts.push_back(
+ DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT));
+ QAmts.push_back(DAG.getConstant(Q, DL, SVT));
+ return true;
+ };
+
+ SDValue N = REMNode.getOperand(0);
+ SDValue D = REMNode.getOperand(1);
+
+ // Collect the values from each element.
+ if (!ISD::matchUnaryPredicate(D, BuildSREMPattern))
+ return SDValue();
+
+ // If this is a srem by one, avoid the fold since it can be constant-folded.
+ if (AllDivisorsAreOnes)
+ return SDValue();
+
+ // If this is a srem by a power-of-two (including INT_MIN), avoid the fold
+ // since it can be best implemented as a bit test.
+ if (AllDivisorsArePowerOfTwo)
+ return SDValue();
+
+ SDValue PVal, AVal, KVal, QVal;
+ if (D.getOpcode() == ISD::BUILD_VECTOR) {
+ if (HadOneDivisor) {
+ // Try to turn PAmts into a splat, since we don't care about the values
+ // that are currently '0'. If we can't, just keep '0's.
+ turnVectorIntoSplatVector(PAmts, isNullConstant);
+ // Try to turn AAmts into a splat, since we don't care about the
+ // values that are currently '-1'. If we can't, change them to '0's.
+ turnVectorIntoSplatVector(AAmts, isAllOnesConstant,
+ DAG.getConstant(0, DL, SVT));
+ // Try to turn KAmts into a splat, since we don't care about the values
+ // that are currently '-1'. If we can't, change them to '0's.
+ turnVectorIntoSplatVector(KAmts, isAllOnesConstant,
+ DAG.getConstant(0, DL, ShSVT));
+ }
+
+ PVal = DAG.getBuildVector(VT, DL, PAmts);
+ AVal = DAG.getBuildVector(VT, DL, AAmts);
+ KVal = DAG.getBuildVector(ShVT, DL, KAmts);
+ QVal = DAG.getBuildVector(VT, DL, QAmts);
+ } else if (D.getOpcode() == ISD::SPLAT_VECTOR) {
+ assert(PAmts.size() == 1 && AAmts.size() == 1 && KAmts.size() == 1 &&
+ QAmts.size() == 1 &&
+ "Expected matchUnaryPredicate to return one element for scalable "
+ "vectors");
+ PVal = DAG.getSplatVector(VT, DL, PAmts[0]);
+ AVal = DAG.getSplatVector(VT, DL, AAmts[0]);
+ KVal = DAG.getSplatVector(ShVT, DL, KAmts[0]);
+ QVal = DAG.getSplatVector(VT, DL, QAmts[0]);
+ } else {
+ assert(isa<ConstantSDNode>(D) && "Expected a constant");
+ PVal = PAmts[0];
+ AVal = AAmts[0];
+ KVal = KAmts[0];
+ QVal = QAmts[0];
+ }
+
+ // (mul N, P)
+ SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal);
+ Created.push_back(Op0.getNode());
+
+ if (NeedToApplyOffset) {
+ // We need ADD to do this.
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ADD, VT))
+ return SDValue();
+
+ // (add (mul N, P), A)
+ Op0 = DAG.getNode(ISD::ADD, DL, VT, Op0, AVal);
+ Created.push_back(Op0.getNode());
+ }
+
+ // Rotate right only if any divisor was even. We avoid rotates for all-odd
+ // divisors as a performance improvement, since rotating by 0 is a no-op.
+ if (HadEvenDivisor) {
+ // We need ROTR to do this.
+ if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT))
+ return SDValue();
+ // SREM: (rotr (add (mul N, P), A), K)
+ Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal);
+ Created.push_back(Op0.getNode());
+ }
+
+ // SREM: (setule/setugt (rotr (add (mul N, P), A), K), Q)
+ SDValue Fold =
+ DAG.getSetCC(DL, SETCCVT, Op0, QVal,
+ ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT));
+
+ // If we didn't have lanes with INT_MIN divisor, then we're done.
+ if (!HadIntMinDivisor)
+ return Fold;
+
+ // That fold is only valid for positive divisors, which effectively means it
+ // is invalid for INT_MIN divisors. So if we have such a lane,
+ // we must fix up the results for said lanes.
+ assert(VT.isVector() && "Can/should only get here for vectors.");
+
+ // NOTE: we avoid letting illegal types through even if we're before legalize
+ // ops; legalization has a hard time producing good code for the code that
+ // follows.
+ if (!isOperationLegalOrCustom(ISD::SETCC, SETCCVT) ||
+ !isOperationLegalOrCustom(ISD::AND, VT) ||
+ !isCondCodeLegalOrCustom(Cond, VT.getSimpleVT()) ||
+ !isOperationLegalOrCustom(ISD::VSELECT, SETCCVT))
+ return SDValue();
+
+ Created.push_back(Fold.getNode());
+
+ SDValue IntMin = DAG.getConstant(
+ APInt::getSignedMinValue(SVT.getScalarSizeInBits()), DL, VT);
+ SDValue IntMax = DAG.getConstant(
+ APInt::getSignedMaxValue(SVT.getScalarSizeInBits()), DL, VT);
+ SDValue Zero =
+ DAG.getConstant(APInt::getZero(SVT.getScalarSizeInBits()), DL, VT);
+
+ // Which lanes had INT_MIN divisors? Divisor is constant, so const-folded.
+ SDValue DivisorIsIntMin = DAG.getSetCC(DL, SETCCVT, D, IntMin, ISD::SETEQ);
+ Created.push_back(DivisorIsIntMin.getNode());
+
+ // (N s% INT_MIN) ==/!= 0 <--> (N & INT_MAX) ==/!= 0
+ SDValue Masked = DAG.getNode(ISD::AND, DL, VT, N, IntMax);
+ Created.push_back(Masked.getNode());
+ SDValue MaskedIsZero = DAG.getSetCC(DL, SETCCVT, Masked, Zero, Cond);
+ Created.push_back(MaskedIsZero.getNode());
+
+ // To produce the final result we need to blend 2 vectors: 'Fold' and
+ // 'MaskedIsZero'. If the divisor for a channel was *NOT* INT_MIN, we pick
+ // from 'Fold', else we pick from 'MaskedIsZero'. Since 'DivisorIsIntMin' is
+ // constant-folded, the select can get lowered to a shuffle with a constant
+ // mask.
+ SDValue Blended = DAG.getNode(ISD::VSELECT, DL, SETCCVT, DivisorIsIntMin,
+ MaskedIsZero, Fold);
+
+ return Blended;
+}
+
+bool TargetLowering::
+verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
+ if (!isa<ConstantSDNode>(Op.getOperand(0))) {
+ DAG.getContext()->emitError("argument to '__builtin_return_address' must "
+ "be a constant integer");
+ return true;
+ }
+
+ return false;
+}
+
+SDValue TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
+ const DenormalMode &Mode) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
+
+ // This is specifically a check for the handling of denormal inputs, not the
+ // result.
+ if (Mode.Input == DenormalMode::PreserveSign ||
+ Mode.Input == DenormalMode::PositiveZero) {
+ // Test = X == 0.0
+ return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
+ }
+
+ // Test for a denormal input, to avoid using a wrong estimate.
+ //
+ // Test = fabs(X) < SmallestNormal
+ const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
+ APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
+ SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
+ SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
+ return DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
+}
+
+SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOps, bool OptForSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const {
+ // fneg is removable even if it has multiple uses.
+ if (Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::VP_FNEG) {
+ Cost = NegatibleCost::Cheaper;
+ return Op.getOperand(0);
+ }
+
+ // Don't recurse exponentially.
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ // Pre-increment recursion depth for use in recursive calls.
+ ++Depth;
+ const SDNodeFlags Flags = Op->getFlags();
+ const TargetOptions &Options = DAG.getTarget().Options;
+ EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+
+ // Don't allow anything with multiple uses unless we know it is free.
+ if (!Op.hasOneUse() && Opcode != ISD::ConstantFP) {
+ bool IsFreeExtend = Opcode == ISD::FP_EXTEND &&
+ isFPExtFree(VT, Op.getOperand(0).getValueType());
+ if (!IsFreeExtend)
+ return SDValue();
+ }
+
+ auto RemoveDeadNode = [&](SDValue N) {
+ if (N && N.getNode()->use_empty())
+ DAG.RemoveDeadNode(N.getNode());
+ };
+
+ SDLoc DL(Op);
+
+ // Because getNegatedExpression can delete nodes we need a handle to keep
+ // temporary nodes alive in case the recursion manages to create an identical
+ // node.
+ std::list<HandleSDNode> Handles;
+
+ switch (Opcode) {
+ case ISD::ConstantFP: {
+ // Don't invert constant FP values after legalization unless the target says
+ // the negated constant is legal.
+ bool IsOpLegal =
+ isOperationLegal(ISD::ConstantFP, VT) ||
+ isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT,
+ OptForSize);
+
+ if (LegalOps && !IsOpLegal)
+ break;
+
+ APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
+ V.changeSign();
+ SDValue CFP = DAG.getConstantFP(V, DL, VT);
+
+ // If we already have the use of the negated floating constant, it is free
+ // to negate it even if it has multiple uses.
+ if (!Op.hasOneUse() && CFP.use_empty())
+ break;
+ Cost = NegatibleCost::Neutral;
+ return CFP;
+ }
+ case ISD::BUILD_VECTOR: {
+ // Only permit BUILD_VECTOR of constants.
+ if (llvm::any_of(Op->op_values(), [&](SDValue N) {
+ return !N.isUndef() && !isa<ConstantFPSDNode>(N);
+ }))
+ break;
+
+ bool IsOpLegal =
+ (isOperationLegal(ISD::ConstantFP, VT) &&
+ isOperationLegal(ISD::BUILD_VECTOR, VT)) ||
+ llvm::all_of(Op->op_values(), [&](SDValue N) {
+ return N.isUndef() ||
+ isFPImmLegal(neg(cast<ConstantFPSDNode>(N)->getValueAPF()), VT,
+ OptForSize);
+ });
+
+ if (LegalOps && !IsOpLegal)
+ break;
+
+ SmallVector<SDValue, 4> Ops;
+ for (SDValue C : Op->op_values()) {
+ if (C.isUndef()) {
+ Ops.push_back(C);
+ continue;
+ }
+ APFloat V = cast<ConstantFPSDNode>(C)->getValueAPF();
+ V.changeSign();
+ Ops.push_back(DAG.getConstantFP(V, DL, C.getValueType()));
+ }
+ Cost = NegatibleCost::Neutral;
+ return DAG.getBuildVector(VT, DL, Ops);
+ }
+ case ISD::FADD: {
+ if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ break;
+
+ // After operation legalization, it might not be legal to create new FSUBs.
+ if (LegalOps && !isOperationLegalOrCustom(ISD::FSUB, VT))
+ break;
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+
+ // fold (fneg (fadd X, Y)) -> (fsub (fneg X), Y)
+ NegatibleCost CostX = NegatibleCost::Expensive;
+ SDValue NegX =
+ getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
+ // Prevent this node from being deleted by the next call.
+ if (NegX)
+ Handles.emplace_back(NegX);
+
+ // fold (fneg (fadd X, Y)) -> (fsub (fneg Y), X)
+ NegatibleCost CostY = NegatibleCost::Expensive;
+ SDValue NegY =
+ getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
+
+ // We're done with the handles.
+ Handles.clear();
+
+ // Negate the X if its cost is less than or equal to the cost of Y.
+ if (NegX && (CostX <= CostY)) {
+ Cost = CostX;
+ SDValue N = DAG.getNode(ISD::FSUB, DL, VT, NegX, Y, Flags);
+ if (NegY != N)
+ RemoveDeadNode(NegY);
+ return N;
+ }
+
+ // Negate the Y if it is not expensive.
+ if (NegY) {
+ Cost = CostY;
+ SDValue N = DAG.getNode(ISD::FSUB, DL, VT, NegY, X, Flags);
+ if (NegX != N)
+ RemoveDeadNode(NegX);
+ return N;
+ }
+ break;
+ }
+ case ISD::FSUB: {
+ // We can't turn -(A-B) into B-A when we honor signed zeros.
+ if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ break;
+
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+ // fold (fneg (fsub 0, Y)) -> Y
+ if (ConstantFPSDNode *C = isConstOrConstSplatFP(X, /*AllowUndefs*/ true))
+ if (C->isZero()) {
+ Cost = NegatibleCost::Cheaper;
+ return Y;
+ }
+
+ // fold (fneg (fsub X, Y)) -> (fsub Y, X)
+ Cost = NegatibleCost::Neutral;
+ return DAG.getNode(ISD::FSUB, DL, VT, Y, X, Flags);
+ }
+ case ISD::FMUL:
+ case ISD::FDIV: {
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+
+ // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
+ NegatibleCost CostX = NegatibleCost::Expensive;
+ SDValue NegX =
+ getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
+ // Prevent this node from being deleted by the next call.
+ if (NegX)
+ Handles.emplace_back(NegX);
+
+ // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
+ NegatibleCost CostY = NegatibleCost::Expensive;
+ SDValue NegY =
+ getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
+
+ // We're done with the handles.
+ Handles.clear();
+
+ // Negate the X if its cost is less than or equal to the cost of Y.
+ if (NegX && (CostX <= CostY)) {
+ Cost = CostX;
+ SDValue N = DAG.getNode(Opcode, DL, VT, NegX, Y, Flags);
+ if (NegY != N)
+ RemoveDeadNode(NegY);
+ return N;
+ }
+
+ // Ignore X * 2.0 because that is expected to be canonicalized to X + X.
+ if (auto *C = isConstOrConstSplatFP(Op.getOperand(1)))
+ if (C->isExactlyValue(2.0) && Op.getOpcode() == ISD::FMUL)
+ break;
+
+ // Negate the Y if it is not expensive.
+ if (NegY) {
+ Cost = CostY;
+ SDValue N = DAG.getNode(Opcode, DL, VT, X, NegY, Flags);
+ if (NegX != N)
+ RemoveDeadNode(NegX);
+ return N;
+ }
+ break;
+ }
+ case ISD::FMA:
+ case ISD::FMAD: {
+ if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ break;
+
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2);
+ NegatibleCost CostZ = NegatibleCost::Expensive;
+ SDValue NegZ =
+ getNegatedExpression(Z, DAG, LegalOps, OptForSize, CostZ, Depth);
+ // Give up if we fail to negate the Z.
+ if (!NegZ)
+ break;
+
+ // Prevent this node from being deleted by the next two calls.
+ Handles.emplace_back(NegZ);
+
+ // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z))
+ NegatibleCost CostX = NegatibleCost::Expensive;
+ SDValue NegX =
+ getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
+ // Prevent this node from being deleted by the next call.
+ if (NegX)
+ Handles.emplace_back(NegX);
+
+ // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z))
+ NegatibleCost CostY = NegatibleCost::Expensive;
+ SDValue NegY =
+ getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
+
+ // We're done with the handles.
+ Handles.clear();
+
+ // Negate the X if its cost is less than or equal to the cost of Y.
+ if (NegX && (CostX <= CostY)) {
+ Cost = std::min(CostX, CostZ);
+ SDValue N = DAG.getNode(Opcode, DL, VT, NegX, Y, NegZ, Flags);
+ if (NegY != N)
+ RemoveDeadNode(NegY);
+ return N;
+ }
+
+ // Negate the Y if it is not expensive.
+ if (NegY) {
+ Cost = std::min(CostY, CostZ);
+ SDValue N = DAG.getNode(Opcode, DL, VT, X, NegY, NegZ, Flags);
+ if (NegX != N)
+ RemoveDeadNode(NegX);
+ return N;
+ }
+ break;
+ }
+
+ case ISD::FP_EXTEND:
+ case ISD::FSIN:
+ if (SDValue NegV = getNegatedExpression(Op.getOperand(0), DAG, LegalOps,
+ OptForSize, Cost, Depth))
+ return DAG.getNode(Opcode, DL, VT, NegV);
+ break;
+ case ISD::FP_ROUND:
+ if (SDValue NegV = getNegatedExpression(Op.getOperand(0), DAG, LegalOps,
+ OptForSize, Cost, Depth))
+ return DAG.getNode(ISD::FP_ROUND, DL, VT, NegV, Op.getOperand(1));
+ break;
+ case ISD::SELECT:
+ case ISD::VSELECT: {
+ // fold (fneg (select C, LHS, RHS)) -> (select C, (fneg LHS), (fneg RHS))
+ // iff at least one cost is cheaper and the other is neutral/cheaper
+ SDValue LHS = Op.getOperand(1);
+ NegatibleCost CostLHS = NegatibleCost::Expensive;
+ SDValue NegLHS =
+ getNegatedExpression(LHS, DAG, LegalOps, OptForSize, CostLHS, Depth);
+ if (!NegLHS || CostLHS > NegatibleCost::Neutral) {
+ RemoveDeadNode(NegLHS);
+ break;
+ }
+
+ // Prevent this node from being deleted by the next call.
+ Handles.emplace_back(NegLHS);
+
+ SDValue RHS = Op.getOperand(2);
+ NegatibleCost CostRHS = NegatibleCost::Expensive;
+ SDValue NegRHS =
+ getNegatedExpression(RHS, DAG, LegalOps, OptForSize, CostRHS, Depth);
+
+ // We're done with the handles.
+ Handles.clear();
+
+ if (!NegRHS || CostRHS > NegatibleCost::Neutral ||
+ (CostLHS != NegatibleCost::Cheaper &&
+ CostRHS != NegatibleCost::Cheaper)) {
+ RemoveDeadNode(NegLHS);
+ RemoveDeadNode(NegRHS);
+ break;
+ }
+
+ Cost = std::min(CostLHS, CostRHS);
+ return DAG.getSelect(DL, VT, Op.getOperand(0), NegLHS, NegRHS);
+ }
+ }
+
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Legalization Utilities
+//===----------------------------------------------------------------------===//
+
+bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, const SDLoc &dl,
+ SDValue LHS, SDValue RHS,
+ SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG,
+ MulExpansionKind Kind, SDValue LL,
+ SDValue LH, SDValue RL, SDValue RH) const {
+ assert(Opcode == ISD::MUL || Opcode == ISD::UMUL_LOHI ||
+ Opcode == ISD::SMUL_LOHI);
+
+ bool HasMULHS = (Kind == MulExpansionKind::Always) ||
+ isOperationLegalOrCustom(ISD::MULHS, HiLoVT);
+ bool HasMULHU = (Kind == MulExpansionKind::Always) ||
+ isOperationLegalOrCustom(ISD::MULHU, HiLoVT);
+ bool HasSMUL_LOHI = (Kind == MulExpansionKind::Always) ||
+ isOperationLegalOrCustom(ISD::SMUL_LOHI, HiLoVT);
+ bool HasUMUL_LOHI = (Kind == MulExpansionKind::Always) ||
+ isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT);
+
+ if (!HasMULHU && !HasMULHS && !HasUMUL_LOHI && !HasSMUL_LOHI)
+ return false;
+
+ unsigned OuterBitSize = VT.getScalarSizeInBits();
+ unsigned InnerBitSize = HiLoVT.getScalarSizeInBits();
+
+ // LL, LH, RL, and RH must be either all NULL or all set to a value.
+ assert((LL.getNode() && LH.getNode() && RL.getNode() && RH.getNode()) ||
+ (!LL.getNode() && !LH.getNode() && !RL.getNode() && !RH.getNode()));
+
+ SDVTList VTs = DAG.getVTList(HiLoVT, HiLoVT);
+ auto MakeMUL_LOHI = [&](SDValue L, SDValue R, SDValue &Lo, SDValue &Hi,
+ bool Signed) -> bool {
+ if ((Signed && HasSMUL_LOHI) || (!Signed && HasUMUL_LOHI)) {
+ Lo = DAG.getNode(Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI, dl, VTs, L, R);
+ Hi = SDValue(Lo.getNode(), 1);
+ return true;
+ }
+ if ((Signed && HasMULHS) || (!Signed && HasMULHU)) {
+ Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, L, R);
+ Hi = DAG.getNode(Signed ? ISD::MULHS : ISD::MULHU, dl, HiLoVT, L, R);
+ return true;
+ }
+ return false;
+ };
+
+ SDValue Lo, Hi;
+
+ if (!LL.getNode() && !RL.getNode() &&
+ isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
+ LL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LHS);
+ RL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RHS);
+ }
+
+ if (!LL.getNode())
+ return false;
+
+ APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
+ if (DAG.MaskedValueIsZero(LHS, HighMask) &&
+ DAG.MaskedValueIsZero(RHS, HighMask)) {
+ // The inputs are both zero-extended.
+ if (MakeMUL_LOHI(LL, RL, Lo, Hi, false)) {
+ Result.push_back(Lo);
+ Result.push_back(Hi);
+ if (Opcode != ISD::MUL) {
+ SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
+ Result.push_back(Zero);
+ Result.push_back(Zero);
+ }
+ return true;
+ }
+ }
+
+ if (!VT.isVector() && Opcode == ISD::MUL &&
+ DAG.ComputeMaxSignificantBits(LHS) <= InnerBitSize &&
+ DAG.ComputeMaxSignificantBits(RHS) <= InnerBitSize) {
+ // The input values are both sign-extended.
+ // TODO non-MUL case?
+ if (MakeMUL_LOHI(LL, RL, Lo, Hi, true)) {
+ Result.push_back(Lo);
+ Result.push_back(Hi);
+ return true;
+ }
+ }
+
+ unsigned ShiftAmount = OuterBitSize - InnerBitSize;
+ SDValue Shift = DAG.getShiftAmountConstant(ShiftAmount, VT, dl);
+
+ if (!LH.getNode() && !RH.getNode() &&
+ isOperationLegalOrCustom(ISD::SRL, VT) &&
+ isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
+ LH = DAG.getNode(ISD::SRL, dl, VT, LHS, Shift);
+ LH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LH);
+ RH = DAG.getNode(ISD::SRL, dl, VT, RHS, Shift);
+ RH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RH);
+ }
+
+ if (!LH.getNode())
+ return false;
+
+ if (!MakeMUL_LOHI(LL, RL, Lo, Hi, false))
+ return false;
+
+ Result.push_back(Lo);
+
+ if (Opcode == ISD::MUL) {
+ RH = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RH);
+ LH = DAG.getNode(ISD::MUL, dl, HiLoVT, LH, RL);
+ Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, RH);
+ Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, LH);
+ Result.push_back(Hi);
+ return true;
+ }
+
+ // Compute the full width result.
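+ // With I = InnerBitSize, this is the usual schoolbook decomposition:
+ //   (LH * 2^I + LL) * (RH * 2^I + RL)
+ //     = LL*RL + (LL*RH + LH*RL) * 2^I + LH*RH * 2^(2*I)
+ // with the cross terms accumulated below using explicit carry handling.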
+ auto Merge = [&](SDValue Lo, SDValue Hi) -> SDValue {
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Hi);
+ Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
+ return DAG.getNode(ISD::OR, dl, VT, Lo, Hi);
+ };
+
+ SDValue Next = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Hi);
+ if (!MakeMUL_LOHI(LL, RH, Lo, Hi, false))
+ return false;
+
+ // This is effectively the add part of a multiply-add of half-sized operands,
+ // so it cannot overflow.
+ Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));
+
+ if (!MakeMUL_LOHI(LH, RL, Lo, Hi, false))
+ return false;
+
+ SDValue Zero = DAG.getConstant(0, dl, HiLoVT);
+ EVT BoolType = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ bool UseGlue = (isOperationLegalOrCustom(ISD::ADDC, VT) &&
+ isOperationLegalOrCustom(ISD::ADDE, VT));
+ if (UseGlue)
+ Next = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), Next,
+ Merge(Lo, Hi));
+ else
+ Next = DAG.getNode(ISD::UADDO_CARRY, dl, DAG.getVTList(VT, BoolType), Next,
+ Merge(Lo, Hi), DAG.getConstant(0, dl, BoolType));
+
+ SDValue Carry = Next.getValue(1);
+ Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
+ Next = DAG.getNode(ISD::SRL, dl, VT, Next, Shift);
+
+ if (!MakeMUL_LOHI(LH, RH, Lo, Hi, Opcode == ISD::SMUL_LOHI))
+ return false;
+
+ if (UseGlue)
+ Hi = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(HiLoVT, MVT::Glue), Hi, Zero,
+ Carry);
+ else
+ Hi = DAG.getNode(ISD::UADDO_CARRY, dl, DAG.getVTList(HiLoVT, BoolType), Hi,
+ Zero, Carry);
+
+ Next = DAG.getNode(ISD::ADD, dl, VT, Next, Merge(Lo, Hi));
+
+ if (Opcode == ISD::SMUL_LOHI) {
+ SDValue NextSub = DAG.getNode(ISD::SUB, dl, VT, Next,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, VT, RL));
+ Next = DAG.getSelectCC(dl, LH, Zero, NextSub, Next, ISD::SETLT);
+
+ NextSub = DAG.getNode(ISD::SUB, dl, VT, Next,
+ DAG.getNode(ISD::ZERO_EXTEND, dl, VT, LL));
+ Next = DAG.getSelectCC(dl, RH, Zero, NextSub, Next, ISD::SETLT);
+ }
+
+ Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
+ Next = DAG.getNode(ISD::SRL, dl, VT, Next, Shift);
+ Result.push_back(DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, Next));
+ return true;
+}
+
+bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
+ SelectionDAG &DAG, MulExpansionKind Kind,
+ SDValue LL, SDValue LH, SDValue RL,
+ SDValue RH) const {
+ SmallVector<SDValue, 2> Result;
+ bool Ok = expandMUL_LOHI(N->getOpcode(), N->getValueType(0), SDLoc(N),
+ N->getOperand(0), N->getOperand(1), Result, HiLoVT,
+ DAG, Kind, LL, LH, RL, RH);
+ if (Ok) {
+ assert(Result.size() == 2);
+ Lo = Result[0];
+ Hi = Result[1];
+ }
+ return Ok;
+}
+
+// Optimize unsigned division or remainder by constants for types twice as large
+// as a legal VT.
+//
+// If (1 << (BitWidth / 2)) % Constant == 1, then the remainder
+// can be computed as:
+//   Sum += __builtin_uadd_overflow(Lo, High, &Sum);
+//   Remainder = Sum % Constant
+// This is based on "Remainder by Summing Digits" from Hacker's Delight.
+//
+// For division, we can compute the remainder using the algorithm described
+// above, subtract it from the dividend to get an exact multiple of Constant.
+// Then multiply that exact multiple by the multiplicative inverse modulo
+// (1 << BitWidth) to get the quotient.
+
+// If Constant is even, we can shift right the dividend and the divisor by the
+// number of trailing zeros in Constant before applying the remainder algorithm.
+// If we're after the quotient, we can subtract this value from the shifted
+// dividend and multiply by the multiplicative inverse of the shifted divisor.
+// If we want the remainder, we shift the value left by the number of trailing
+// zeros and add the bits that were shifted out of the dividend.
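+//
+// For example (illustrative): for a 64-bit udiv/urem by 3 with a legal i32,
+// 2^32 % 3 == 1, so Remainder = (Lo + Hi + carry) % 3 can be computed in
+// 32 bits, and for the quotient (N - Remainder) is multiplied by the inverse
+// of 3 modulo 2^64.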
+bool TargetLowering::expandDIVREMByConstant(SDNode *N,
+ SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG,
+ SDValue LL, SDValue LH) const {
+ unsigned Opcode = N->getOpcode();
+ EVT VT = N->getValueType(0);
+
+ // TODO: Support signed division/remainder.
+ if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
+ return false;
+ assert(
+ (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
+ "Unexpected opcode");
+
+ auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!CN)
+ return false;
+
+ APInt Divisor = CN->getAPIntValue();
+ unsigned BitWidth = Divisor.getBitWidth();
+ unsigned HBitWidth = BitWidth / 2;
+ assert(VT.getScalarSizeInBits() == BitWidth &&
+ HiLoVT.getScalarSizeInBits() == HBitWidth && "Unexpected VTs");
+
+ // Divisor needs to be less than (1 << HBitWidth).
+ APInt HalfMaxPlus1 = APInt::getOneBitSet(BitWidth, HBitWidth);
+ if (Divisor.uge(HalfMaxPlus1))
+ return false;
+
+ // We depend on the UREM-by-constant optimization in DAGCombiner, which
+ // requires a high multiply.
+ if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
+ !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
+ return false;
+
+ // Don't expand if optimizing for size.
+ if (DAG.shouldOptForSize())
+ return false;
+
+ // Early out for 0 or 1 divisors.
+ if (Divisor.ule(1))
+ return false;
+
+ // If the divisor is even, shift it until it becomes odd.
+ unsigned TrailingZeros = 0;
+ if (!Divisor[0]) {
+ TrailingZeros = Divisor.countr_zero();
+ Divisor.lshrInPlace(TrailingZeros);
+ }
+
+ SDLoc dl(N);
+ SDValue Sum;
+ SDValue PartialRem;
+
+ // If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
+ // then add in the carry.
+ // TODO: If we can't split it in half, we might be able to split into 3 or
+ // more pieces using a smaller bit width.
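+ // Adding the carry back in keeps the sum congruent to the original value
+ // modulo the divisor: N = Hi * 2^HBitWidth + Lo and 2^HBitWidth == 1
+ // (mod divisor), so N == Hi + Lo (mod divisor), and a wrap-around bit is
+ // itself worth 2^HBitWidth == 1.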
+ if (HalfMaxPlus1.urem(Divisor).isOne()) {
+ assert(!LL == !LH && "Expected both input halves or no input halves!");
+ if (!LL)
+ std::tie(LL, LH) = DAG.SplitScalar(N->getOperand(0), dl, HiLoVT, HiLoVT);
+
+ // Shift the input by the number of TrailingZeros in the divisor. The
+ // shifted out bits will be added to the remainder later.
+ if (TrailingZeros) {
+ // Save the shifted off bits if we need the remainder.
+ if (Opcode != ISD::UDIV) {
+ APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+ PartialRem = DAG.getNode(ISD::AND, dl, HiLoVT, LL,
+ DAG.getConstant(Mask, dl, HiLoVT));
+ }
+
+ LL = DAG.getNode(
+ ISD::OR, dl, HiLoVT,
+ DAG.getNode(ISD::SRL, dl, HiLoVT, LL,
+ DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
+ DAG.getNode(ISD::SHL, dl, HiLoVT, LH,
+ DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
+ HiLoVT, dl)));
+ LH = DAG.getNode(ISD::SRL, dl, HiLoVT, LH,
+ DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+ }
+
+ // Use uaddo_carry if we can, otherwise use a compare to detect overflow.
+ EVT SetCCType =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
+ if (isOperationLegalOrCustom(ISD::UADDO_CARRY, HiLoVT)) {
+ SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
+ Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
+ Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, Sum,
+ DAG.getConstant(0, dl, HiLoVT), Sum.getValue(1));
+ } else {
+ Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, LL, LH);
+ SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
+ // If the boolean for the target is 0 or 1, we can add the setcc result
+ // directly.
+ if (getBooleanContents(HiLoVT) ==
+ TargetLoweringBase::ZeroOrOneBooleanContent)
+ Carry = DAG.getZExtOrTrunc(Carry, dl, HiLoVT);
+ else
+ Carry = DAG.getSelect(dl, HiLoVT, Carry, DAG.getConstant(1, dl, HiLoVT),
+ DAG.getConstant(0, dl, HiLoVT));
+ Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry);
+ }
+ }
+
+ // If we didn't find a sum, we can't do the expansion.
+ if (!Sum)
+ return false;
+
+ // Perform a HiLoVT urem on the Sum using truncated divisor.
+ SDValue RemL =
+ DAG.getNode(ISD::UREM, dl, HiLoVT, Sum,
+ DAG.getConstant(Divisor.trunc(HBitWidth), dl, HiLoVT));
+ SDValue RemH = DAG.getConstant(0, dl, HiLoVT);
+
+ if (Opcode != ISD::UREM) {
+ // Subtract the remainder from the shifted dividend.
+ SDValue Dividend = DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH);
+ SDValue Rem = DAG.getNode(ISD::BUILD_PAIR, dl, VT, RemL, RemH);
+
+ Dividend = DAG.getNode(ISD::SUB, dl, VT, Dividend, Rem);
+
+ // Multiply by the multiplicative inverse of the divisor modulo
+ // (1 << BitWidth).
+ APInt Mod = APInt::getSignedMinValue(BitWidth + 1);
+ APInt MulFactor = Divisor.zext(BitWidth + 1);
+ MulFactor = MulFactor.multiplicativeInverse(Mod);
+ MulFactor = MulFactor.trunc(BitWidth);
+
+ SDValue Quotient = DAG.getNode(ISD::MUL, dl, VT, Dividend,
+ DAG.getConstant(MulFactor, dl, VT));
+
+ // Split the quotient into low and high parts.
+ SDValue QuotL, QuotH;
+ std::tie(QuotL, QuotH) = DAG.SplitScalar(Quotient, dl, HiLoVT, HiLoVT);
+ Result.push_back(QuotL);
+ Result.push_back(QuotH);
+ }
+
+ if (Opcode != ISD::UDIV) {
+ // If we shifted the input, shift the remainder left and add the bits we
+ // shifted off the input.
+ if (TrailingZeros) {
+ APInt Mask = APInt::getLowBitsSet(HBitWidth, TrailingZeros);
+ RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
+ DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+ RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRem);
+ }
+ Result.push_back(RemL);
+ Result.push_back(DAG.getConstant(0, dl, HiLoVT));
+ }
+
+ return true;
+}
+
+// Check that (every element of) Z is undef or not an exact multiple of BW.
+static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
+ return ISD::matchUnaryPredicate(
+ Z,
+ [=](ConstantSDNode *C) { return !C || C->getAPIntValue().urem(BW) != 0; },
+ true);
+}
+
+static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) {
+ EVT VT = Node->getValueType(0);
+ SDValue ShX, ShY;
+ SDValue ShAmt, InvShAmt;
+ SDValue X = Node->getOperand(0);
+ SDValue Y = Node->getOperand(1);
+ SDValue Z = Node->getOperand(2);
+ SDValue Mask = Node->getOperand(3);
+ SDValue VL = Node->getOperand(4);
+
+ unsigned BW = VT.getScalarSizeInBits();
+ bool IsFSHL = Node->getOpcode() == ISD::VP_FSHL;
+ SDLoc DL(SDValue(Node, 0));
+
+ EVT ShVT = Z.getValueType();
+ if (isNonZeroModBitWidthOrUndef(Z, BW)) {
+ // fshl: X << C | Y >> (BW - C)
+ // fshr: X << (BW - C) | Y >> C
+ // where C = Z % BW is not zero
+ SDValue BitWidthC = DAG.getConstant(BW, DL, ShVT);
+ ShAmt = DAG.getNode(ISD::VP_UREM, DL, ShVT, Z, BitWidthC, Mask, VL);
+ InvShAmt = DAG.getNode(ISD::VP_SUB, DL, ShVT, BitWidthC, ShAmt, Mask, VL);
+ ShX = DAG.getNode(ISD::VP_SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt, Mask,
+ VL);
+ ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt, Mask,
+ VL);
+ } else {
+ // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+ // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
+ SDValue BitMask = DAG.getConstant(BW - 1, DL, ShVT);
+ if (isPowerOf2_32(BW)) {
+ // Z % BW -> Z & (BW - 1)
+ ShAmt = DAG.getNode(ISD::VP_AND, DL, ShVT, Z, BitMask, Mask, VL);
+ // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
+ SDValue NotZ = DAG.getNode(ISD::VP_XOR, DL, ShVT, Z,
+ DAG.getAllOnesConstant(DL, ShVT), Mask, VL);
+ InvShAmt = DAG.getNode(ISD::VP_AND, DL, ShVT, NotZ, BitMask, Mask, VL);
+ } else {
+ SDValue BitWidthC = DAG.getConstant(BW, DL, ShVT);
+ ShAmt = DAG.getNode(ISD::VP_UREM, DL, ShVT, Z, BitWidthC, Mask, VL);
+ InvShAmt = DAG.getNode(ISD::VP_SUB, DL, ShVT, BitMask, ShAmt, Mask, VL);
+ }
+
+ SDValue One = DAG.getConstant(1, DL, ShVT);
+ if (IsFSHL) {
+ ShX = DAG.getNode(ISD::VP_SHL, DL, VT, X, ShAmt, Mask, VL);
+ SDValue ShY1 = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, One, Mask, VL);
+ ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, ShY1, InvShAmt, Mask, VL);
+ } else {
+ SDValue ShX1 = DAG.getNode(ISD::VP_SHL, DL, VT, X, One, Mask, VL);
+ ShX = DAG.getNode(ISD::VP_SHL, DL, VT, ShX1, InvShAmt, Mask, VL);
+ ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, ShAmt, Mask, VL);
+ }
+ }
+ return DAG.getNode(ISD::VP_OR, DL, VT, ShX, ShY, Mask, VL);
+}
+
+SDValue TargetLowering::expandFunnelShift(SDNode *Node,
+ SelectionDAG &DAG) const {
+ if (Node->isVPOpcode())
+ return expandVPFunnelShift(Node, DAG);
+
+ EVT VT = Node->getValueType(0);
+
+ if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) ||
+ !isOperationLegalOrCustom(ISD::SRL, VT) ||
+ !isOperationLegalOrCustom(ISD::SUB, VT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
+ return SDValue();
+
+ SDValue X = Node->getOperand(0);
+ SDValue Y = Node->getOperand(1);
+ SDValue Z = Node->getOperand(2);
+
+ unsigned BW = VT.getScalarSizeInBits();
+ bool IsFSHL = Node->getOpcode() == ISD::FSHL;
+ SDLoc DL(SDValue(Node, 0));
+
+ EVT ShVT = Z.getValueType();
+
+ // If a funnel shift in the other direction is better supported, use it.
+ unsigned RevOpcode = IsFSHL ? ISD::FSHR : ISD::FSHL;
+ if (!isOperationLegalOrCustom(Node->getOpcode(), VT) &&
+ isOperationLegalOrCustom(RevOpcode, VT) && isPowerOf2_32(BW)) {
+ if (isNonZeroModBitWidthOrUndef(Z, BW)) {
+ // fshl X, Y, Z -> fshr X, Y, -Z
+ // fshr X, Y, Z -> fshl X, Y, -Z
+ SDValue Zero = DAG.getConstant(0, DL, ShVT);
+ Z = DAG.getNode(ISD::SUB, DL, VT, Zero, Z);
+ } else {
+ // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
+ // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
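+ // Sketch of why this holds (illustrative; BW is a power of two here, so
+ // ~Z is congruent to BW - 1 - Z modulo BW): fshr(X, Y, 1) is the low half of
+ // the concatenation (X:Y) shifted right by 1, so feeding (srl X, 1) and
+ // fshr(X, Y, 1) into an fshr by ~Z shifts (X:Y) right by a total of
+ // 1 + (BW - 1 - Z) = BW - Z, which matches fshl X, Y, Z for every Z,
+ // including Z == 0.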
+ SDValue One = DAG.getConstant(1, DL, ShVT);
+ if (IsFSHL) {
+ Y = DAG.getNode(RevOpcode, DL, VT, X, Y, One);
+ X = DAG.getNode(ISD::SRL, DL, VT, X, One);
+ } else {
+ X = DAG.getNode(RevOpcode, DL, VT, X, Y, One);
+ Y = DAG.getNode(ISD::SHL, DL, VT, Y, One);
+ }
+ Z = DAG.getNOT(DL, Z, ShVT);
+ }
+ return DAG.getNode(RevOpcode, DL, VT, X, Y, Z);
+ }
+
+ SDValue ShX, ShY;
+ SDValue ShAmt, InvShAmt;
+ if (isNonZeroModBitWidthOrUndef(Z, BW)) {
+ // fshl: X << C | Y >> (BW - C)
+ // fshr: X << (BW - C) | Y >> C
+ // where C = Z % BW is not zero
+ SDValue BitWidthC = DAG.getConstant(BW, DL, ShVT);
+ ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
+ InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
+ ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
+ } else {
+ // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+ // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
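+ // The extra ">> 1" / "<< 1" keeps every individual shift amount within
+ // [0, BW - 1] even when Z % BW == 0. For example (illustrative, BW == 32,
+ // Z == 0): fshl yields X | (Y >> 1 >> 31) == X | 0 == X, the required
+ // result, without ever shifting by 32.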
+ SDValue Mask = DAG.getConstant(BW - 1, DL, ShVT);
+ if (isPowerOf2_32(BW)) {
+ // Z % BW -> Z & (BW - 1)
+ ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
+ // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
+ InvShAmt = DAG.getNode(ISD::AND, DL, ShVT, DAG.getNOT(DL, Z, ShVT), Mask);
+ } else {
+ SDValue BitWidthC = DAG.getConstant(BW, DL, ShVT);
+ ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
+ InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, Mask, ShAmt);
+ }
+
+ SDValue One = DAG.getConstant(1, DL, ShVT);
+ if (IsFSHL) {
+ ShX = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt);
+ SDValue ShY1 = DAG.getNode(ISD::SRL, DL, VT, Y, One);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, ShY1, InvShAmt);
+ } else {
+ SDValue ShX1 = DAG.getNode(ISD::SHL, DL, VT, X, One);
+ ShX = DAG.getNode(ISD::SHL, DL, VT, ShX1, InvShAmt);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt);
+ }
+ }
+ return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
+}
+
+// TODO: Merge with expandFunnelShift.
+SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps,
+ SelectionDAG &DAG) const {
+ EVT VT = Node->getValueType(0);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ bool IsLeft = Node->getOpcode() == ISD::ROTL;
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDLoc DL(SDValue(Node, 0));
+
+ EVT ShVT = Op1.getValueType();
+ SDValue Zero = DAG.getConstant(0, DL, ShVT);
+
+ // If a rotate in the other direction is better supported, use it.
+ unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL;
+ if (!isOperationLegalOrCustom(Node->getOpcode(), VT) &&
+ isOperationLegalOrCustom(RevRot, VT) && isPowerOf2_32(EltSizeInBits)) {
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1);
+ return DAG.getNode(RevRot, DL, VT, Op0, Sub);
+ }
+
+ if (!AllowVectorOps && VT.isVector() &&
+ (!isOperationLegalOrCustom(ISD::SHL, VT) ||
+ !isOperationLegalOrCustom(ISD::SRL, VT) ||
+ !isOperationLegalOrCustom(ISD::SUB, VT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::OR, VT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
+ return SDValue();
+
+ unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL;
+ unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL;
+ SDValue BitWidthMinusOneC = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
+ SDValue ShVal;
+ SDValue HsVal;
+ if (isPowerOf2_32(EltSizeInBits)) {
+ // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
+ // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
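+ // For example (illustrative, w == 32): c == 8 gives x << 8 | x >> 24, and
+ // c == 0 gives x << 0 | x >> 0 == x, since -0 & 31 == 0; no shift amount
+ // ever reaches w.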
+ SDValue NegOp1 = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1);
+ SDValue ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Op1, BitWidthMinusOneC);
+ ShVal = DAG.getNode(ShOpc, DL, VT, Op0, ShAmt);
+ SDValue HsAmt = DAG.getNode(ISD::AND, DL, ShVT, NegOp1, BitWidthMinusOneC);
+ HsVal = DAG.getNode(HsOpc, DL, VT, Op0, HsAmt);
+ } else {
+ // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
+ // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
+ SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
+ SDValue ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Op1, BitWidthC);
+ ShVal = DAG.getNode(ShOpc, DL, VT, Op0, ShAmt);
+ SDValue HsAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthMinusOneC, ShAmt);
+ SDValue One = DAG.getConstant(1, DL, ShVT);
+ HsVal =
+ DAG.getNode(HsOpc, DL, VT, DAG.getNode(HsOpc, DL, VT, Op0, One), HsAmt);
+ }
+ return DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal);
+}
+
+void TargetLowering::expandShiftParts(SDNode *Node, SDValue &Lo, SDValue &Hi,
+ SelectionDAG &DAG) const {
+ assert(Node->getNumOperands() == 3 && "Not a double-shift!");
+ EVT VT = Node->getValueType(0);
+ unsigned VTBits = VT.getScalarSizeInBits();
+ assert(isPowerOf2_32(VTBits) && "Power-of-two integer type expected");
+
+ bool IsSHL = Node->getOpcode() == ISD::SHL_PARTS;
+ bool IsSRA = Node->getOpcode() == ISD::SRA_PARTS;
+ SDValue ShOpLo = Node->getOperand(0);
+ SDValue ShOpHi = Node->getOperand(1);
+ SDValue ShAmt = Node->getOperand(2);
+ EVT ShAmtVT = ShAmt.getValueType();
+ EVT ShAmtCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShAmtVT);
+ SDLoc dl(Node);
+
+ // ISD::FSHL and ISD::FSHR have defined behavior for out-of-range shift
+ // amounts, but ISD::SHL, ISD::SRL and ISD::SRA do not. Insert an AND to be
+ // safe; it is usually optimized away during isel.
+ SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
+ DAG.getConstant(VTBits - 1, dl, ShAmtVT));
+ SDValue Tmp1 = IsSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+ DAG.getConstant(VTBits - 1, dl, ShAmtVT))
+ : DAG.getConstant(0, dl, VT);
+
+ SDValue Tmp2, Tmp3;
+ if (IsSHL) {
+ Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
+ } else {
+ Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
+ }
+
+ // If the shift amount is larger than or equal to the width of a part, we
+ // use the result from the FSHL/FSHR. Insert a test and select the appropriate
+ // values for large shift amounts.
+ SDValue AndNode = DAG.getNode(ISD::AND, dl, ShAmtVT, ShAmt,
+ DAG.getConstant(VTBits, dl, ShAmtVT));
+ SDValue Cond = DAG.getSetCC(dl, ShAmtCCVT, AndNode,
+ DAG.getConstant(0, dl, ShAmtVT), ISD::SETNE);
+
+ if (IsSHL) {
+ Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
+ Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
+ } else {
+ Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
+ Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
+ }
+}
+
+bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
+ SelectionDAG &DAG) const {
+ unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0;
+ SDValue Src = Node->getOperand(OpNo);
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Node->getValueType(0);
+ SDLoc dl(SDValue(Node, 0));
+
+ // FIXME: Only f32 to i64 conversions are supported.
+ if (SrcVT != MVT::f32 || DstVT != MVT::i64)
+ return false;
+
+ if (Node->isStrictFPOpcode())
+ // When a NaN is converted to an integer a trap is allowed. We can't
+ // use this expansion here because it would eliminate that trap. Other
+ // traps are also allowed and cannot be eliminated. See
+ // IEEE 754-2008 sec 5.8.
+ return false;
+
+ // Expand f32 -> i64 conversion
+ // This algorithm comes from compiler-rt's implementation of fixsfdi:
+ // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
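+ // Worked example (illustrative only): 5.5f has bit pattern 0x40B00000, so
+ // ExponentBits = 129, Exponent = 129 - 127 = 2, Sign = 0, and
+ // R = 0x00300000 | 0x00800000 = 0x00B00000. Since Exponent < 23, R is
+ // shifted right by 23 - 2 = 21, giving 5, and the sign/negative-exponent
+ // fixups leave the result at 5.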
+ unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
+ EVT IntVT = SrcVT.changeTypeToInteger();
+ EVT IntShVT = getShiftAmountTy(IntVT, DAG.getDataLayout());
+
+ SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT);
+ SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT);
+ SDValue Bias = DAG.getConstant(127, dl, IntVT);
+ SDValue SignMask = DAG.getConstant(APInt::getSignMask(SrcEltBits), dl, IntVT);
+ SDValue SignLowBit = DAG.getConstant(SrcEltBits - 1, dl, IntVT);
+ SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT);
+
+ SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Src);
+
+ SDValue ExponentBits = DAG.getNode(
+ ISD::SRL, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
+ DAG.getZExtOrTrunc(ExponentLoBit, dl, IntShVT));
+ SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
+
+ SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
+ DAG.getZExtOrTrunc(SignLowBit, dl, IntShVT));
+ Sign = DAG.getSExtOrTrunc(Sign, dl, DstVT);
+
+ SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
+ DAG.getConstant(0x00800000, dl, IntVT));
+
+ R = DAG.getZExtOrTrunc(R, dl, DstVT);
+
+ R = DAG.getSelectCC(
+ dl, Exponent, ExponentLoBit,
+ DAG.getNode(ISD::SHL, dl, DstVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
+ dl, IntShVT)),
+ DAG.getNode(ISD::SRL, dl, DstVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
+ dl, IntShVT)),
+ ISD::SETGT);
+
+ SDValue Ret = DAG.getNode(ISD::SUB, dl, DstVT,
+ DAG.getNode(ISD::XOR, dl, DstVT, R, Sign), Sign);
+
+ Result = DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, dl, IntVT),
+ DAG.getConstant(0, dl, DstVT), Ret, ISD::SETLT);
+ return true;
+}
+
+bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
+ SDValue &Chain,
+ SelectionDAG &DAG) const {
+ SDLoc dl(SDValue(Node, 0));
+ unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0;
+ SDValue Src = Node->getOperand(OpNo);
+
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Node->getValueType(0);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+ EVT DstSetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), DstVT);
+
+ // Only expand vector types if we have the appropriate vector bit operations.
+ unsigned SIntOpcode = Node->isStrictFPOpcode() ? ISD::STRICT_FP_TO_SINT :
+ ISD::FP_TO_SINT;
+ if (DstVT.isVector() && (!isOperationLegalOrCustom(SIntOpcode, DstVT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT)))
+ return false;
+
+ // If the maximum float value is smaller than the signed integer range,
+ // the destination signmask can't be represented by the float, so we can
+ // just use FP_TO_SINT directly.
+ const fltSemantics &APFSem = DAG.EVTToAPFloatSemantics(SrcVT);
+ APFloat APF(APFSem, APInt::getZero(SrcVT.getScalarSizeInBits()));
+ APInt SignMask = APInt::getSignMask(DstVT.getScalarSizeInBits());
+ if (APFloat::opOverflow &
+ APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven)) {
+ if (Node->isStrictFPOpcode()) {
+ Result = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other },
+ { Node->getOperand(0), Src });
+ Chain = Result.getValue(1);
+ } else
+ Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
+ return true;
+ }
+
+ // Don't expand it if there isn't a cheap fsub instruction.
+ if (!isOperationLegalOrCustom(
+ Node->isStrictFPOpcode() ? ISD::STRICT_FSUB : ISD::FSUB, SrcVT))
+ return false;
+
+ SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
+ SDValue Sel;
+
+ if (Node->isStrictFPOpcode()) {
+ Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT,
+ Node->getOperand(0), /*IsSignaling*/ true);
+ Chain = Sel.getValue(1);
+ } else {
+ Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT);
+ }
+
+ bool Strict = Node->isStrictFPOpcode() ||
+ shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false);
+
+ if (Strict) {
+ // Expand based on the maximum range of FP_TO_SINT: if the value exceeds the
+ // signmask, offset it first (the result of which should be fully
+ // representable).
+ // Sel = Src < 0x8000000000000000
+ // FltOfs = select Sel, 0, 0x8000000000000000
+ // IntOfs = select Sel, 0, 0x8000000000000000
+ // Result = fp_to_sint(Src - FltOfs) ^ IntOfs
+
+ // TODO: Should any fast-math-flags be set for the FSUB?
+ SDValue FltOfs = DAG.getSelect(dl, SrcVT, Sel,
+ DAG.getConstantFP(0.0, dl, SrcVT), Cst);
+ Sel = DAG.getBoolExtOrTrunc(Sel, dl, DstSetCCVT, DstVT);
+ SDValue IntOfs = DAG.getSelect(dl, DstVT, Sel,
+ DAG.getConstant(0, dl, DstVT),
+ DAG.getConstant(SignMask, dl, DstVT));
+ SDValue SInt;
+ if (Node->isStrictFPOpcode()) {
+ SDValue Val = DAG.getNode(ISD::STRICT_FSUB, dl, { SrcVT, MVT::Other },
+ { Chain, Src, FltOfs });
+ SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other },
+ { Val.getValue(1), Val });
+ Chain = SInt.getValue(1);
+ } else {
+ SDValue Val = DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FltOfs);
+ SInt = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val);
+ }
+ Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, IntOfs);
+ } else {
+ // Expand based on maximum range of FP_TO_SINT:
+ // True = fp_to_sint(Src)
+ // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
+ // Result = select (Src < 0x8000000000000000), True, False
+
+ SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
+ // TODO: Should any fast-math-flags be set for the FSUB?
+ SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
+ DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
+ False = DAG.getNode(ISD::XOR, dl, DstVT, False,
+ DAG.getConstant(SignMask, dl, DstVT));
+ Sel = DAG.getBoolExtOrTrunc(Sel, dl, DstSetCCVT, DstVT);
+ Result = DAG.getSelect(dl, DstVT, Sel, True, False);
+ }
+ return true;
+}
+
+bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
+ SDValue &Chain,
+ SelectionDAG &DAG) const {
+ // This transform is not correct for converting 0 when the rounding mode is
+ // set to round toward negative infinity, as it would produce -0.0. So
+ // disable it under strictfp.
+ if (Node->isStrictFPOpcode())
+ return false;
+
+ SDValue Src = Node->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Node->getValueType(0);
+
+ if (SrcVT.getScalarType() != MVT::i64 || DstVT.getScalarType() != MVT::f64)
+ return false;
+
+ // Only expand vector types if we have the appropriate vector bit operations.
+ if (SrcVT.isVector() && (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
+ !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
+ !isOperationLegalOrCustom(ISD::FSUB, DstVT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
+ return false;
+
+ SDLoc dl(SDValue(Node, 0));
+ EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());
+
+ // Implementation of unsigned i64 to f64 following the algorithm in
+ // __floatundidf in compiler_rt. This implementation performs rounding
+ // correctly in all rounding modes with the exception of converting 0
+ // when rounding toward negative infinity. In that case the fsub will produce
+ // -0.0. This will be added to +0.0 and produce -0.0 which is incorrect.
+ SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
+ SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
+ llvm::bit_cast<double>(UINT64_C(0x4530000000100000)), dl, DstVT);
+ SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
+ SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
+ SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
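+ // Why this works (illustrative sketch): the bit pattern 0x4330... is the
+ // double 2^52, and OR'ing Lo into its low mantissa bits yields exactly
+ // 2^52 + Lo; likewise 0x4530... is 2^84, whose mantissa LSB has weight 2^32,
+ // so OR'ing Hi in yields exactly 2^84 + Hi * 2^32. The computation below is
+ // (2^84 + Hi * 2^32 - (2^84 + 2^52)) + (2^52 + Lo) = Hi * 2^32 + Lo, with
+ // the only rounding happening in the final FADD.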
+
+ SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
+ SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
+ SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
+ SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
+ SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
+ SDValue HiSub =
+ DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
+ Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
+ return true;
+}
+
+SDValue
+TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
+ SelectionDAG &DAG) const {
+ unsigned Opcode = Node->getOpcode();
+ assert((Opcode == ISD::FMINNUM || Opcode == ISD::FMAXNUM ||
+ Opcode == ISD::STRICT_FMINNUM || Opcode == ISD::STRICT_FMAXNUM) &&
+ "Wrong opcode");
+
+ if (Node->getFlags().hasNoNaNs()) {
+ ISD::CondCode Pred = Opcode == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT;
+ SDValue Op1 = Node->getOperand(0);
+ SDValue Op2 = Node->getOperand(1);
+ SDValue SelCC = DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred);
+ // Copy FMF flags, but always set the no-signed-zeros flag
+ // as this is implied by the FMINNUM/FMAXNUM semantics.
+ SDNodeFlags Flags = Node->getFlags();
+ Flags.setNoSignedZeros(true);
+ SelCC->setFlags(Flags);
+ return SelCC;
+ }
+
+ return SDValue();
+}
+
+SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ?
+ ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+ EVT VT = Node->getValueType(0);
+
+ if (VT.isScalableVector())
+ report_fatal_error(
+ "Expanding fminnum/fmaxnum for scalable vectors is undefined.");
+
+ if (isOperationLegalOrCustom(NewOp, VT)) {
+ SDValue Quiet0 = Node->getOperand(0);
+ SDValue Quiet1 = Node->getOperand(1);
+
+ if (!Node->getFlags().hasNoNaNs()) {
+ // Insert canonicalizes if it's possible we need to quiet to get correct
+ // sNaN behavior.
+ if (!DAG.isKnownNeverSNaN(Quiet0)) {
+ Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0,
+ Node->getFlags());
+ }
+ if (!DAG.isKnownNeverSNaN(Quiet1)) {
+ Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1,
+ Node->getFlags());
+ }
+ }
+
+ return DAG.getNode(NewOp, dl, VT, Quiet0, Quiet1, Node->getFlags());
+ }
+
+ // If the target has FMINIMUM/FMAXIMUM but not FMINNUM/FMAXNUM use that
+ // instead if there are no NaNs and there can't be an incompatible zero
+ // compare: at least one operand isn't +/-0, or there are no signed-zeros.
+ if ((Node->getFlags().hasNoNaNs() ||
+ (DAG.isKnownNeverNaN(Node->getOperand(0)) &&
+ DAG.isKnownNeverNaN(Node->getOperand(1)))) &&
+ (Node->getFlags().hasNoSignedZeros() ||
+ DAG.isKnownNeverZeroFloat(Node->getOperand(0)) ||
+ DAG.isKnownNeverZeroFloat(Node->getOperand(1)))) {
+ unsigned IEEE2018Op =
+ Node->getOpcode() == ISD::FMINNUM ? ISD::FMINIMUM : ISD::FMAXIMUM;
+ if (isOperationLegalOrCustom(IEEE2018Op, VT))
+ return DAG.getNode(IEEE2018Op, dl, VT, Node->getOperand(0),
+ Node->getOperand(1), Node->getFlags());
+ }
+
+ if (SDValue SelCC = createSelectForFMINNUM_FMAXNUM(Node, DAG))
+ return SelCC;
+
+ return SDValue();
+}
+
+/// Returns a true value if this FPClassTest can be performed with an ordered
+/// fcmp to 0, and a false value if it's an unordered fcmp to 0. Returns
+/// std::nullopt if it cannot be performed as a compare with 0.
+static std::optional<bool> isFCmpEqualZero(FPClassTest Test,
+ const fltSemantics &Semantics,
+ const MachineFunction &MF) {
+ FPClassTest OrderedMask = Test & ~fcNan;
+ FPClassTest NanTest = Test & fcNan;
+ bool IsOrdered = NanTest == fcNone;
+ bool IsUnordered = NanTest == fcNan;
+
+ // Skip cases that are testing for only a qnan or snan.
+ if (!IsOrdered && !IsUnordered)
+ return std::nullopt;
+
+ if (OrderedMask == fcZero &&
+ MF.getDenormalMode(Semantics).Input == DenormalMode::IEEE)
+ return IsOrdered;
+ if (OrderedMask == (fcZero | fcSubnormal) &&
+ MF.getDenormalMode(Semantics).inputsAreZero())
+ return IsOrdered;
+ return std::nullopt;
+}
+
+SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op,
+ FPClassTest Test, SDNodeFlags Flags,
+ const SDLoc &DL,
+ SelectionDAG &DAG) const {
+ EVT OperandVT = Op.getValueType();
+ assert(OperandVT.isFloatingPoint());
+
+ // Degenerate cases.
+ if (Test == fcNone)
+ return DAG.getBoolConstant(false, DL, ResultVT, OperandVT);
+ if ((Test & fcAllFlags) == fcAllFlags)
+ return DAG.getBoolConstant(true, DL, ResultVT, OperandVT);
+
+ // PPC double double is a pair of doubles, of which the higher part determines
+ // the value class.
+ if (OperandVT == MVT::ppcf128) {
+ Op = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::f64, Op,
+ DAG.getConstant(1, DL, MVT::i32));
+ OperandVT = MVT::f64;
+ }
+
+ // Some checks may be represented as the inversion of a simpler check, for example
+ // "inf|normal|subnormal|zero" => !"nan".
+ bool IsInverted = false;
+ if (FPClassTest InvertedCheck = invertFPClassTestIfSimpler(Test)) {
+ IsInverted = true;
+ Test = InvertedCheck;
+ }
+
+ // Floating-point type properties.
+ EVT ScalarFloatVT = OperandVT.getScalarType();
+ const Type *FloatTy = ScalarFloatVT.getTypeForEVT(*DAG.getContext());
+ const llvm::fltSemantics &Semantics = FloatTy->getFltSemantics();
+ bool IsF80 = (ScalarFloatVT == MVT::f80);
+
+ // Some checks can be implemented using float comparisons, if floating point
+ // exceptions are ignored.
+ if (Flags.hasNoFPExcept() &&
+ isOperationLegalOrCustom(ISD::SETCC, OperandVT.getScalarType())) {
+ ISD::CondCode OrderedCmpOpcode = IsInverted ? ISD::SETUNE : ISD::SETOEQ;
+ ISD::CondCode UnorderedCmpOpcode = IsInverted ? ISD::SETONE : ISD::SETUEQ;
+
+ if (std::optional<bool> IsCmp0 =
+ isFCmpEqualZero(Test, Semantics, DAG.getMachineFunction());
+ IsCmp0 && (isCondCodeLegalOrCustom(
+ *IsCmp0 ? OrderedCmpOpcode : UnorderedCmpOpcode,
+ OperandVT.getScalarType().getSimpleVT()))) {
+
+ // If denormals could be implicitly treated as 0, this is not equivalent
+ // to a compare with 0 since it will also be true for denormals.
+ return DAG.getSetCC(DL, ResultVT, Op,
+ DAG.getConstantFP(0.0, DL, OperandVT),
+ *IsCmp0 ? OrderedCmpOpcode : UnorderedCmpOpcode);
+ }
+
+ if (Test == fcNan &&
+ isCondCodeLegalOrCustom(IsInverted ? ISD::SETO : ISD::SETUO,
+ OperandVT.getScalarType().getSimpleVT())) {
+ return DAG.getSetCC(DL, ResultVT, Op, Op,
+ IsInverted ? ISD::SETO : ISD::SETUO);
+ }
+
+ if (Test == fcInf &&
+ isCondCodeLegalOrCustom(IsInverted ? ISD::SETUNE : ISD::SETOEQ,
+ OperandVT.getScalarType().getSimpleVT()) &&
+ isOperationLegalOrCustom(ISD::FABS, OperandVT.getScalarType())) {
+ // isinf(x) --> fabs(x) == inf
+ SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op);
+ SDValue Inf =
+ DAG.getConstantFP(APFloat::getInf(Semantics), DL, OperandVT);
+ return DAG.getSetCC(DL, ResultVT, Abs, Inf,
+ IsInverted ? ISD::SETUNE : ISD::SETOEQ);
+ }
+ }
+
+ // In the general case use integer operations.
+ unsigned BitSize = OperandVT.getScalarSizeInBits();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), BitSize);
+ if (OperandVT.isVector())
+ IntVT = EVT::getVectorVT(*DAG.getContext(), IntVT,
+ OperandVT.getVectorElementCount());
+ SDValue OpAsInt = DAG.getBitcast(IntVT, Op);
+
+ // Various masks.
+ APInt SignBit = APInt::getSignMask(BitSize);
+ APInt ValueMask = APInt::getSignedMaxValue(BitSize); // All bits but sign.
+ APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
+ const unsigned ExplicitIntBitInF80 = 63;
+ APInt ExpMask = Inf;
+ if (IsF80)
+ ExpMask.clearBit(ExplicitIntBitInF80);
+ APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
+ APInt QNaNBitMask =
+ APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
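+ // For example (illustrative, f32): SignBit = 0x80000000,
+ // ValueMask = 0x7FFFFFFF, Inf = ExpMask = 0x7F800000,
+ // AllOneMantissa = 0x007FFFFF and QNaNBitMask = 0x00400000.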
+ APInt InvertionMask = APInt::getAllOnes(ResultVT.getScalarSizeInBits());
+
+ SDValue ValueMaskV = DAG.getConstant(ValueMask, DL, IntVT);
+ SDValue SignBitV = DAG.getConstant(SignBit, DL, IntVT);
+ SDValue ExpMaskV = DAG.getConstant(ExpMask, DL, IntVT);
+ SDValue ZeroV = DAG.getConstant(0, DL, IntVT);
+ SDValue InfV = DAG.getConstant(Inf, DL, IntVT);
+ SDValue ResultInvertionMask = DAG.getConstant(InvertionMask, DL, ResultVT);
+
+ SDValue Res;
+ const auto appendResult = [&](SDValue PartialRes) {
+ if (PartialRes) {
+ if (Res)
+ Res = DAG.getNode(ISD::OR, DL, ResultVT, Res, PartialRes);
+ else
+ Res = PartialRes;
+ }
+ };
+
+ SDValue IntBitIsSetV; // Explicit integer bit in f80 mantissa is set.
+ const auto getIntBitIsSet = [&]() -> SDValue {
+ if (!IntBitIsSetV) {
+ APInt IntBitMask(BitSize, 0);
+ IntBitMask.setBit(ExplicitIntBitInF80);
+ SDValue IntBitMaskV = DAG.getConstant(IntBitMask, DL, IntVT);
+ SDValue IntBitV = DAG.getNode(ISD::AND, DL, IntVT, OpAsInt, IntBitMaskV);
+ IntBitIsSetV = DAG.getSetCC(DL, ResultVT, IntBitV, ZeroV, ISD::SETNE);
+ }
+ return IntBitIsSetV;
+ };
+
+ // Split the value into sign bit and absolute value.
+ SDValue AbsV = DAG.getNode(ISD::AND, DL, IntVT, OpAsInt, ValueMaskV);
+ SDValue SignV = DAG.getSetCC(DL, ResultVT, OpAsInt,
+ DAG.getConstant(0, DL, IntVT), ISD::SETLT);
+
+ // Tests that involve more than one class should be processed first.
+ SDValue PartialRes;
+
+ if (IsF80)
+ ; // Detect finite numbers of f80 by checking individual classes because
+ // they have different settings of the explicit integer bit.
+ else if ((Test & fcFinite) == fcFinite) {
+ // finite(V) ==> abs(V) < exp_mask
+ PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, ExpMaskV, ISD::SETLT);
+ Test &= ~fcFinite;
+ } else if ((Test & fcFinite) == fcPosFinite) {
+ // finite(V) && V > 0 ==> V < exp_mask
+ PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, ExpMaskV, ISD::SETULT);
+ Test &= ~fcPosFinite;
+ } else if ((Test & fcFinite) == fcNegFinite) {
+ // finite(V) && V < 0 ==> abs(V) < exp_mask && signbit == 1
+ PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, ExpMaskV, ISD::SETLT);
+ PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV);
+ Test &= ~fcNegFinite;
+ }
+ appendResult(PartialRes);
+
+ if (FPClassTest PartialCheck = Test & (fcZero | fcSubnormal)) {
+ // fcZero | fcSubnormal => test all exponent bits are 0
+ // TODO: Handle sign bit specific cases
+ if (PartialCheck == (fcZero | fcSubnormal)) {
+ SDValue ExpBits = DAG.getNode(ISD::AND, DL, IntVT, OpAsInt, ExpMaskV);
+ SDValue ExpIsZero =
+ DAG.getSetCC(DL, ResultVT, ExpBits, ZeroV, ISD::SETEQ);
+ appendResult(ExpIsZero);
+ Test &= ~PartialCheck & fcAllFlags;
+ }
+ }
+
+ // Check for individual classes.
+
+ if (unsigned PartialCheck = Test & fcZero) {
+ if (PartialCheck == fcPosZero)
+ PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, ZeroV, ISD::SETEQ);
+ else if (PartialCheck == fcZero)
+ PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, ZeroV, ISD::SETEQ);
+ else // ISD::fcNegZero
+ PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, SignBitV, ISD::SETEQ);
+ appendResult(PartialRes);
+ }
+
+ if (unsigned PartialCheck = Test & fcSubnormal) {
+ // issubnormal(V) ==> unsigned(abs(V) - 1) < (all mantissa bits set)
+ // issubnormal(V) && V>0 ==> unsigned(V - 1) < (all mantissa bits set)
+ SDValue V = (PartialCheck == fcPosSubnormal) ? OpAsInt : AbsV;
+ SDValue MantissaV = DAG.getConstant(AllOneMantissa, DL, IntVT);
+ SDValue VMinusOneV =
+ DAG.getNode(ISD::SUB, DL, IntVT, V, DAG.getConstant(1, DL, IntVT));
+ PartialRes = DAG.getSetCC(DL, ResultVT, VMinusOneV, MantissaV, ISD::SETULT);
+ if (PartialCheck == fcNegSubnormal)
+ PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV);
+ appendResult(PartialRes);
+ }
+
+ if (unsigned PartialCheck = Test & fcInf) {
+ if (PartialCheck == fcPosInf)
+ PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, InfV, ISD::SETEQ);
+ else if (PartialCheck == fcInf)
+ PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, InfV, ISD::SETEQ);
+ else { // ISD::fcNegInf
+ APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
+ SDValue NegInfV = DAG.getConstant(NegInf, DL, IntVT);
+ PartialRes = DAG.getSetCC(DL, ResultVT, OpAsInt, NegInfV, ISD::SETEQ);
+ }
+ appendResult(PartialRes);
+ }
+
+ if (unsigned PartialCheck = Test & fcNan) {
+ APInt InfWithQnanBit = Inf | QNaNBitMask;
+ SDValue InfWithQnanBitV = DAG.getConstant(InfWithQnanBit, DL, IntVT);
+ if (PartialCheck == fcNan) {
+ // isnan(V) ==> abs(V) > int(inf)
+ PartialRes = DAG.getSetCC(DL, ResultVT, AbsV, InfV, ISD::SETGT);
+ if (IsF80) {
+ // Recognize unsupported values as NaNs for compatibility with glibc.
+ // For such values, (exp(V) == 0) == int_bit.
+ SDValue ExpBits = DAG.getNode(ISD::AND, DL, IntVT, AbsV, ExpMaskV);
+ SDValue ExpIsZero =
+ DAG.getSetCC(DL, ResultVT, ExpBits, ZeroV, ISD::SETEQ);
+ SDValue IsPseudo =
+ DAG.getSetCC(DL, ResultVT, getIntBitIsSet(), ExpIsZero, ISD::SETEQ);
+ PartialRes = DAG.getNode(ISD::OR, DL, ResultVT, PartialRes, IsPseudo);
+ }
+ } else if (PartialCheck == fcQNan) {
+ // isquiet(V) ==> abs(V) >= (unsigned(Inf) | quiet_bit)
+ PartialRes =
+ DAG.getSetCC(DL, ResultVT, AbsV, InfWithQnanBitV, ISD::SETGE);
+ } else { // ISD::fcSNan
+ // issignaling(V) ==> abs(V) > unsigned(Inf) &&
+ // abs(V) < (unsigned(Inf) | quiet_bit)
+ SDValue IsNan = DAG.getSetCC(DL, ResultVT, AbsV, InfV, ISD::SETGT);
+ SDValue IsNotQnan =
+ DAG.getSetCC(DL, ResultVT, AbsV, InfWithQnanBitV, ISD::SETLT);
+ PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, IsNan, IsNotQnan);
+ }
+ appendResult(PartialRes);
+ }
+
+ if (unsigned PartialCheck = Test & fcNormal) {
+ // isnormal(V) ==> (0 < exp < max_exp) ==> (unsigned(exp-1) < (max_exp-1))
+ APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
+ SDValue ExpLSBV = DAG.getConstant(ExpLSB, DL, IntVT);
+ SDValue ExpMinus1 = DAG.getNode(ISD::SUB, DL, IntVT, AbsV, ExpLSBV);
+ APInt ExpLimit = ExpMask - ExpLSB;
+ SDValue ExpLimitV = DAG.getConstant(ExpLimit, DL, IntVT);
+ PartialRes = DAG.getSetCC(DL, ResultVT, ExpMinus1, ExpLimitV, ISD::SETULT);
+ if (PartialCheck == fcNegNormal)
+ PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, SignV);
+ else if (PartialCheck == fcPosNormal) {
+ SDValue PosSignV =
+ DAG.getNode(ISD::XOR, DL, ResultVT, SignV, ResultInvertionMask);
+ PartialRes = DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, PosSignV);
+ }
+ if (IsF80)
+ PartialRes =
+ DAG.getNode(ISD::AND, DL, ResultVT, PartialRes, getIntBitIsSet());
+ appendResult(PartialRes);
+ }
+
+ if (!Res)
+ return DAG.getConstant(IsInverted, DL, ResultVT);
+ if (IsInverted)
+ Res = DAG.getNode(ISD::XOR, DL, ResultVT, Res, ResultInvertionMask);
+ return Res;
+}
+
+// Only expand vector types if we have the appropriate vector bit operations.
+static bool canExpandVectorCTPOP(const TargetLowering &TLI, EVT VT) {
+ assert(VT.isVector() && "Expected vector type");
+ unsigned Len = VT.getScalarSizeInBits();
+ return TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SUB, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
+ (Len == 8 || TLI.isOperationLegalOrCustom(ISD::MUL, VT)) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT);
+}
+
+SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Op = Node->getOperand(0);
+ unsigned Len = VT.getScalarSizeInBits();
+ assert(VT.isInteger() && "CTPOP not implemented for this type.");
+
+ // TODO: Add support for irregular type lengths.
+ if (!(Len <= 128 && Len % 8 == 0))
+ return SDValue();
+
+ // Only expand vector types if we have the appropriate vector bit operations.
+ if (VT.isVector() && !canExpandVectorCTPOP(*this, VT))
+ return SDValue();
+
+ // This is the "best" algorithm from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
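+ // Worked example (illustrative, 8-bit v = 0b11011010, popcount 5):
+ // step 1 gives 0b10010101 (per-2-bit counts 2,1,1,1), step 2 gives
+ // 0b00110010 (per-nibble counts 3,2), and step 3 gives 0b00000101 = 5.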
+ SDValue Mask55 =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
+ SDValue Mask33 =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT);
+ SDValue Mask0F =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT);
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ Op = DAG.getNode(ISD::SUB, dl, VT, Op,
+ DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(1, dl, ShVT)),
+ Mask55));
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ Op = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
+ DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(2, dl, ShVT)),
+ Mask33));
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ Op = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(4, dl, ShVT))),
+ Mask0F);
+
+ if (Len <= 8)
+ return Op;
+
+ // Avoid the multiply if we only have 2 bytes to add.
+ // TODO: Only doing this for scalars because vectors weren't as obviously
+ // improved.
+ if (Len == 16 && !VT.isVector()) {
+ // v = (v + (v >> 8)) & 0x00FF;
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(8, dl, ShVT))),
+ DAG.getConstant(0xFF, dl, VT));
+ }
+
+ // v = (v * 0x01010101...) >> (Len - 8)
+ SDValue Mask01 =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
+ return DAG.getNode(ISD::SRL, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+ DAG.getConstant(Len - 8, dl, ShVT));
+}
+
+SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Op = Node->getOperand(0);
+ SDValue Mask = Node->getOperand(1);
+ SDValue VL = Node->getOperand(2);
+ unsigned Len = VT.getScalarSizeInBits();
+ assert(VT.isInteger() && "VP_CTPOP not implemented for this type.");
+
+ // TODO: Add support for irregular type lengths.
+ if (!(Len <= 128 && Len % 8 == 0))
+ return SDValue();
+
+ // This is the same algorithm as in expandCTPOP, from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ SDValue Mask55 =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
+ SDValue Mask33 =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT);
+ SDValue Mask0F =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT);
+
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5;
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ Tmp1 = DAG.getNode(ISD::VP_AND, dl, VT,
+ DAG.getNode(ISD::VP_LSHR, dl, VT, Op,
+ DAG.getConstant(1, dl, ShVT), Mask, VL),
+ Mask55, Mask, VL);
+ Op = DAG.getNode(ISD::VP_SUB, dl, VT, Op, Tmp1, Mask, VL);
+
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Op, Mask33, Mask, VL);
+ Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT,
+ DAG.getNode(ISD::VP_LSHR, dl, VT, Op,
+ DAG.getConstant(2, dl, ShVT), Mask, VL),
+ Mask33, Mask, VL);
+ Op = DAG.getNode(ISD::VP_ADD, dl, VT, Tmp2, Tmp3, Mask, VL);
+
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ Tmp4 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(4, dl, ShVT),
+ Mask, VL);
+ Tmp5 = DAG.getNode(ISD::VP_ADD, dl, VT, Op, Tmp4, Mask, VL);
+ Op = DAG.getNode(ISD::VP_AND, dl, VT, Tmp5, Mask0F, Mask, VL);
+
+ if (Len <= 8)
+ return Op;
+
+ // v = (v * 0x01010101...) >> (Len - 8)
+ SDValue Mask01 =
+ DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
+ return DAG.getNode(ISD::VP_LSHR, dl, VT,
+ DAG.getNode(ISD::VP_MUL, dl, VT, Op, Mask01, Mask, VL),
+ DAG.getConstant(Len - 8, dl, ShVT), Mask, VL);
+}
+
+SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Op = Node->getOperand(0);
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+ // If the non-ZERO_UNDEF version is supported we can use that instead.
+ if (Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
+ isOperationLegalOrCustom(ISD::CTLZ, VT))
+ return DAG.getNode(ISD::CTLZ, dl, VT, Op);
+
+ // If the ZERO_UNDEF version is supported use that and handle the zero case.
+ if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
+ return DAG.getSelect(dl, VT, SrcIsZero,
+ DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
+ }
+
+ // Only expand vector types if we have the appropriate vector bit operations.
+ // This includes the operations needed to expand CTPOP if it isn't supported.
+ if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
+ (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+ !canExpandVectorCTPOP(*this, VT)) ||
+ !isOperationLegalOrCustom(ISD::SRL, VT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
+ return SDValue();
+
+ // for now, we do this:
+ // x = x | (x >> 1);
+ // x = x | (x >> 2);
+ // ...
+ // x = x | (x >>16);
+ // x = x | (x >>32); // for 64-bit input
+ // return popcount(~x);
+ //
+ // Ref: "Hacker's Delight" by Henry Warren
+ for (unsigned i = 0; (1U << i) < NumBitsPerElt; ++i) {
+ SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT);
+ Op = DAG.getNode(ISD::OR, dl, VT, Op,
+ DAG.getNode(ISD::SRL, dl, VT, Op, Tmp));
+ }
+ Op = DAG.getNOT(dl, Op, VT);
+ return DAG.getNode(ISD::CTPOP, dl, VT, Op);
+}
+
+SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Op = Node->getOperand(0);
+ SDValue Mask = Node->getOperand(1);
+ SDValue VL = Node->getOperand(2);
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+ // do this:
+ // x = x | (x >> 1);
+ // x = x | (x >> 2);
+ // ...
+ // x = x | (x >>16);
+ // x = x | (x >>32); // for 64-bit input
+ // return popcount(~x);
+ for (unsigned i = 0; (1U << i) < NumBitsPerElt; ++i) {
+ SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT);
+ Op = DAG.getNode(ISD::VP_OR, dl, VT, Op,
+ DAG.getNode(ISD::VP_LSHR, dl, VT, Op, Tmp, Mask, VL), Mask,
+ VL);
+ }
+ Op = DAG.getNode(ISD::VP_XOR, dl, VT, Op, DAG.getConstant(-1, dl, VT), Mask,
+ VL);
+ return DAG.getNode(ISD::VP_CTPOP, dl, VT, Op, Mask, VL);
+}
+
+SDValue TargetLowering::CTTZTableLookup(SDNode *Node, SelectionDAG &DAG,
+ const SDLoc &DL, EVT VT, SDValue Op,
+ unsigned BitWidth) const {
+ if (BitWidth != 32 && BitWidth != 64)
+ return SDValue();
+ APInt DeBruijn = BitWidth == 32 ? APInt(32, 0x077CB531U)
+ : APInt(64, 0x0218A392CD3D5DBFULL);
+ const DataLayout &TD = DAG.getDataLayout();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ unsigned ShiftAmt = BitWidth - Log2_32(BitWidth);
+ SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
+ SDValue Lookup = DAG.getNode(
+ ISD::SRL, DL, VT,
+ DAG.getNode(ISD::MUL, DL, VT, DAG.getNode(ISD::AND, DL, VT, Op, Neg),
+ DAG.getConstant(DeBruijn, DL, VT)),
+ DAG.getConstant(ShiftAmt, DL, VT));
+ Lookup = DAG.getSExtOrTrunc(Lookup, DL, getPointerTy(TD));
+
+ SmallVector<uint8_t> Table(BitWidth, 0);
+ for (unsigned i = 0; i < BitWidth; i++) {
+ APInt Shl = DeBruijn.shl(i);
+ APInt Lshr = Shl.lshr(ShiftAmt);
+ Table[Lshr.getZExtValue()] = i;
+ }
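+ // How the lookup works (illustrative): Op & -Op isolates the lowest set
+ // bit, 2^t with t = cttz(Op), so the multiply is a left shift of the
+ // De Bruijn constant by t; because every Log2(BitWidth)-bit window of that
+ // constant is unique, the top bits kept by the SRL identify t via the table.
+ // E.g. for 32 bits and Op = 8 (t = 3), (0x077CB531 << 3) >> 27 == 7 and
+ // Table[7] == 3.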
+
+ // Create a ConstantDataArray in the constant pool.
+ auto *CA = ConstantDataArray::get(*DAG.getContext(), Table);
+ SDValue CPIdx = DAG.getConstantPool(CA, getPointerTy(TD),
+ TD.getPrefTypeAlign(CA->getType()));
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getEntryNode(),
+ DAG.getMemBasePlusOffset(CPIdx, Lookup, DL),
+ PtrInfo, MVT::i8);
+ if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
+ return ExtLoad;
+
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue SrcIsZero = DAG.getSetCC(DL, SetCCVT, Op, Zero, ISD::SETEQ);
+ return DAG.getSelect(DL, VT, SrcIsZero,
+ DAG.getConstant(BitWidth, DL, VT), ExtLoad);
+}
+
+SDValue TargetLowering::expandCTTZ(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue Op = Node->getOperand(0);
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+ // If the non-ZERO_UNDEF version is supported we can use that instead.
+ if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
+ isOperationLegalOrCustom(ISD::CTTZ, VT))
+ return DAG.getNode(ISD::CTTZ, dl, VT, Op);
+
+ // If the ZERO_UNDEF version is supported use that and handle the zero case.
+ if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
+ return DAG.getSelect(dl, VT, SrcIsZero,
+ DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
+ }
+
+ // Only expand vector types if we have the appropriate vector bit operations.
+ // This includes the operations needed to expand CTPOP if it isn't supported.
+ if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
+ (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+ !isOperationLegalOrCustom(ISD::CTLZ, VT) &&
+ !canExpandVectorCTPOP(*this, VT)) ||
+ !isOperationLegalOrCustom(ISD::SUB, VT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
+ return SDValue();
+
+ // Emit Table Lookup if ISD::CTLZ and ISD::CTPOP are not legal.
+ if (!VT.isVector() && isOperationExpand(ISD::CTPOP, VT) &&
+ !isOperationLegal(ISD::CTLZ, VT))
+ if (SDValue V = CTTZTableLookup(Node, DAG, dl, VT, Op, NumBitsPerElt))
+ return V;
+
+ // for now, we use: { return popcount(~x & (x - 1)); }
+ // unless the target has ctlz but not ctpop, in which case we use:
+ // { return 32 - nlz(~x & (x-1)); }
+ // Ref: "Hacker's Delight" by Henry Warren
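+ // For example (illustrative, 8-bit x = 0b01101000): x - 1 = 0b01100111 and
+ // ~x = 0b10010111, so ~x & (x - 1) = 0b00000111 and popcount gives 3, the
+ // number of trailing zeros.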
+ SDValue Tmp = DAG.getNode(
+ ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT),
+ DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT)));
+
+ // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
+ if (isOperationLegal(ISD::CTLZ, VT) && !isOperationLegal(ISD::CTPOP, VT)) {
+ return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(NumBitsPerElt, dl, VT),
+ DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
+ }
+
+ return DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
+}
+
+SDValue TargetLowering::expandVPCTTZ(SDNode *Node, SelectionDAG &DAG) const {
+ SDValue Op = Node->getOperand(0);
+ SDValue Mask = Node->getOperand(1);
+ SDValue VL = Node->getOperand(2);
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+
+ // Same as the vector part of expandCTTZ, use: popcount(~x & (x - 1))
+ SDValue Not = DAG.getNode(ISD::VP_XOR, dl, VT, Op,
+ DAG.getConstant(-1, dl, VT), Mask, VL);
+ SDValue MinusOne = DAG.getNode(ISD::VP_SUB, dl, VT, Op,
+ DAG.getConstant(1, dl, VT), Mask, VL);
+ SDValue Tmp = DAG.getNode(ISD::VP_AND, dl, VT, Not, MinusOne, Mask, VL);
+ return DAG.getNode(ISD::VP_CTPOP, dl, VT, Tmp, Mask, VL);
+}
+
+SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
+ bool IsNegative) const {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Op = N->getOperand(0);
+
+ // abs(x) -> smax(x,sub(0,x))
+ if (!IsNegative && isOperationLegal(ISD::SUB, VT) &&
+ isOperationLegal(ISD::SMAX, VT)) {
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ return DAG.getNode(ISD::SMAX, dl, VT, Op,
+ DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
+ }
+
+ // abs(x) -> umin(x,sub(0,x))
+ if (!IsNegative && isOperationLegal(ISD::SUB, VT) &&
+ isOperationLegal(ISD::UMIN, VT)) {
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ Op = DAG.getFreeze(Op);
+ return DAG.getNode(ISD::UMIN, dl, VT, Op,
+ DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
+ }
+
+ // 0 - abs(x) -> smin(x, sub(0,x))
+ if (IsNegative && isOperationLegal(ISD::SUB, VT) &&
+ isOperationLegal(ISD::SMIN, VT)) {
+ Op = DAG.getFreeze(Op);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ return DAG.getNode(ISD::SMIN, dl, VT, Op,
+ DAG.getNode(ISD::SUB, dl, VT, Zero, Op));
+ }
+
+ // Only expand vector types if we have the appropriate vector operations.
+ if (VT.isVector() &&
+ (!isOperationLegalOrCustom(ISD::SRA, VT) ||
+ (!IsNegative && !isOperationLegalOrCustom(ISD::ADD, VT)) ||
+ (IsNegative && !isOperationLegalOrCustom(ISD::SUB, VT)) ||
+ !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
+ return SDValue();
+
+ Op = DAG.getFreeze(Op);
+ SDValue Shift =
+ DAG.getNode(ISD::SRA, dl, VT, Op,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, ShVT));
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Op, Shift);
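+ // For example (illustrative, 8-bit x = -5): Shift = 0xFF, Xor = 0x04, and
+ // Xor - Shift = 4 - (-1) = 5 = abs(-5); for non-negative x, Shift is 0 and
+ // both values pass through unchanged.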
+
+ // abs(x) -> Y = sra (X, size(X)-1); sub (xor (X, Y), Y)
+ if (!IsNegative)
+ return DAG.getNode(ISD::SUB, dl, VT, Xor, Shift);
+
+ // 0 - abs(x) -> Y = sra (X, size(X)-1); sub (Y, xor (X, Y))
+ return DAG.getNode(ISD::SUB, dl, VT, Shift, Xor);
+}
+
+SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue LHS = DAG.getFreeze(N->getOperand(0));
+ SDValue RHS = DAG.getFreeze(N->getOperand(1));
+ bool IsSigned = N->getOpcode() == ISD::ABDS;
+
+ // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
+ // abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs))
+ unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
+ unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
+ if (isOperationLegal(MaxOpc, VT) && isOperationLegal(MinOpc, VT)) {
+ SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
+ SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
+ return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
+ }
+
+ // abdu(lhs, rhs) -> or(usubsat(lhs,rhs), usubsat(rhs,lhs))
+ if (!IsSigned && isOperationLegal(ISD::USUBSAT, VT))
+ return DAG.getNode(ISD::OR, dl, VT,
+ DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS),
+ DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS));
+
+ // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+ // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
+ SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
+ return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
+ DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
+}
+
+SDValue TargetLowering::expandBSWAP(SDNode *N, SelectionDAG &DAG) const {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+
+ if (!VT.isSimple())
+ return SDValue();
+
+ EVT SHVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
+ switch (VT.getSimpleVT().getScalarType().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::i16:
+ // Use a rotate by 8. This can be further expanded if necessary.
+ return DAG.getNode(ISD::ROTL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ case MVT::i32:
+ Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Op,
+ DAG.getConstant(0xFF00, dl, VT));
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(8, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(0xFF00, dl, VT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+ Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+ return DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+ case MVT::i64:
+ Tmp8 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
+ Tmp7 = DAG.getNode(ISD::AND, dl, VT, Op,
+ DAG.getConstant(255ULL<<8, dl, VT));
+ Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Tmp7, DAG.getConstant(40, dl, SHVT));
+ Tmp6 = DAG.getNode(ISD::AND, dl, VT, Op,
+ DAG.getConstant(255ULL<<16, dl, VT));
+ Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Tmp6, DAG.getConstant(24, dl, SHVT));
+ Tmp5 = DAG.getNode(ISD::AND, dl, VT, Op,
+ DAG.getConstant(255ULL<<24, dl, VT));
+ Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Tmp5, DAG.getConstant(8, dl, SHVT));
+ Tmp4 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp4 = DAG.getNode(ISD::AND, dl, VT, Tmp4,
+ DAG.getConstant(255ULL<<24, dl, VT));
+ Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3,
+ DAG.getConstant(255ULL<<16, dl, VT));
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2,
+ DAG.getConstant(255ULL<<8, dl, VT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
+ Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp7);
+ Tmp6 = DAG.getNode(ISD::OR, dl, VT, Tmp6, Tmp5);
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+ Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+ Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp6);
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+ return DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp4);
+ }
+}
+
+SDValue TargetLowering::expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+ SDValue EVL = N->getOperand(2);
+
+ if (!VT.isSimple())
+ return SDValue();
+
+ EVT SHVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
+ switch (VT.getSimpleVT().getScalarType().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::i16:
+ Tmp1 = DAG.getNode(ISD::VP_SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Mask, EVL);
+ return DAG.getNode(ISD::VP_OR, dl, VT, Tmp1, Tmp2, Mask, EVL);
+ case MVT::i32:
+ Tmp4 = DAG.getNode(ISD::VP_SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+ Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Op, DAG.getConstant(0xFF00, dl, VT),
+ Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp3, DAG.getConstant(8, dl, SHVT),
+ Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
+ DAG.getConstant(0xFF00, dl, VT), Mask, EVL);
+ Tmp1 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+ Mask, EVL);
+ Tmp4 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp3, Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp1, Mask, EVL);
+ return DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp2, Mask, EVL);
+ case MVT::i64:
+ Tmp8 = DAG.getNode(ISD::VP_SHL, dl, VT, Op, DAG.getConstant(56, dl, SHVT),
+ Mask, EVL);
+ Tmp7 = DAG.getNode(ISD::VP_AND, dl, VT, Op,
+ DAG.getConstant(255ULL << 8, dl, VT), Mask, EVL);
+ Tmp7 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp7, DAG.getConstant(40, dl, SHVT),
+ Mask, EVL);
+ Tmp6 = DAG.getNode(ISD::VP_AND, dl, VT, Op,
+ DAG.getConstant(255ULL << 16, dl, VT), Mask, EVL);
+ Tmp6 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp6, DAG.getConstant(24, dl, SHVT),
+ Mask, EVL);
+ Tmp5 = DAG.getNode(ISD::VP_AND, dl, VT, Op,
+ DAG.getConstant(255ULL << 24, dl, VT), Mask, EVL);
+ Tmp5 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp5, DAG.getConstant(8, dl, SHVT),
+ Mask, EVL);
+ Tmp4 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Mask, EVL);
+ Tmp4 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp4,
+ DAG.getConstant(255ULL << 24, dl, VT), Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+ Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp3,
+ DAG.getConstant(255ULL << 16, dl, VT), Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(40, dl, SHVT),
+ Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
+ DAG.getConstant(255ULL << 8, dl, VT), Mask, EVL);
+ Tmp1 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(56, dl, SHVT),
+ Mask, EVL);
+ Tmp8 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp8, Tmp7, Mask, EVL);
+ Tmp6 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp6, Tmp5, Mask, EVL);
+ Tmp4 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp3, Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp1, Mask, EVL);
+ Tmp8 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp8, Tmp6, Mask, EVL);
+ Tmp4 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp2, Mask, EVL);
+ return DAG.getNode(ISD::VP_OR, dl, VT, Tmp8, Tmp4, Mask, EVL);
+ }
+}
+
+SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ EVT SHVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ unsigned Sz = VT.getScalarSizeInBits();
+
+ SDValue Tmp, Tmp2, Tmp3;
+
+ // If we can, perform BSWAP first, then mask+swap the i4 pairs, then the i2
+ // pairs, and finally the i1 pairs.
+ // TODO: We can easily support i4/i2 legal types if any target ever does.
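+ // For example (illustrative, Sz == 8, no BSWAP needed): 0b10110001 becomes
+ // 0b00011011 after the i4 swap, 0b01001110 after the i2 swap, and
+ // 0b10001101 after the i1 swap, which is the bit reversal of the input.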
+ if (Sz >= 8 && isPowerOf2_32(Sz)) {
+ // Create the masks - repeating the pattern every byte.
+ APInt Mask4 = APInt::getSplat(Sz, APInt(8, 0x0F));
+ APInt Mask2 = APInt::getSplat(Sz, APInt(8, 0x33));
+ APInt Mask1 = APInt::getSplat(Sz, APInt(8, 0x55));
+
+ // BSWAP if the type is wider than a single byte.
+ Tmp = (Sz > 8 ? DAG.getNode(ISD::BSWAP, dl, VT, Op) : Op);
+
+ // swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask4, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask4, dl, VT));
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
+ Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+
+ // swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask2, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask2, dl, VT));
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
+ Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+
+ // swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Mask1, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(Mask1, dl, VT));
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
+ Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+ return Tmp;
+ }
+
+ Tmp = DAG.getConstant(0, dl, VT);
+ for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) {
+ if (I < J)
+ Tmp2 =
+ DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT));
+ else
+ Tmp2 =
+ DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT));
+
+ APInt Shift = APInt::getOneBitSet(Sz, J);
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT));
+ Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2);
+ }
+
+ return Tmp;
+}
+
+SDValue TargetLowering::expandVPBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
+ assert(N->getOpcode() == ISD::VP_BITREVERSE);
+
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+ SDValue EVL = N->getOperand(2);
+ EVT SHVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ unsigned Sz = VT.getScalarSizeInBits();
+
+ SDValue Tmp, Tmp2, Tmp3;
+
+ // If we can, perform BSWAP first, then mask+swap the i4 pairs, then the i2
+ // pairs, and finally the i1 pairs.
+ // TODO: We can easily support i4/i2 legal types if any target ever does.
+ if (Sz >= 8 && isPowerOf2_32(Sz)) {
+ // Create the masks - repeating the pattern every byte.
+ APInt Mask4 = APInt::getSplat(Sz, APInt(8, 0x0F));
+ APInt Mask2 = APInt::getSplat(Sz, APInt(8, 0x33));
+ APInt Mask1 = APInt::getSplat(Sz, APInt(8, 0x55));
+
+ // BSWAP if the type is wider than a single byte.
+ Tmp = (Sz > 8 ? DAG.getNode(ISD::VP_BSWAP, dl, VT, Op, Mask, EVL) : Op);
+
+ // swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
+ Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT),
+ Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
+ DAG.getConstant(Mask4, dl, VT), Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp, DAG.getConstant(Mask4, dl, VT),
+ Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT),
+ Mask, EVL);
+ Tmp = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp3, Mask, EVL);
+
+ // swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
+ Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT),
+ Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
+ DAG.getConstant(Mask2, dl, VT), Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp, DAG.getConstant(Mask2, dl, VT),
+ Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT),
+ Mask, EVL);
+ Tmp = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp3, Mask, EVL);
+
+ // swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
+ Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT),
+ Mask, EVL);
+ Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
+ DAG.getConstant(Mask1, dl, VT), Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp, DAG.getConstant(Mask1, dl, VT),
+ Mask, EVL);
+ Tmp3 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT),
+ Mask, EVL);
+ Tmp = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp3, Mask, EVL);
+ return Tmp;
+ }
+ return SDValue();
+}
+
+std::pair<SDValue, SDValue>
+TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
+ SelectionDAG &DAG) const {
+ SDLoc SL(LD);
+ SDValue Chain = LD->getChain();
+ SDValue BasePTR = LD->getBasePtr();
+ EVT SrcVT = LD->getMemoryVT();
+ EVT DstVT = LD->getValueType(0);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+
+ if (SrcVT.isScalableVector())
+ report_fatal_error("Cannot scalarize scalable vector loads");
+
+ unsigned NumElem = SrcVT.getVectorNumElements();
+
+ EVT SrcEltVT = SrcVT.getScalarType();
+ EVT DstEltVT = DstVT.getScalarType();
+
+ // A vector must always be stored in memory as-is, i.e. without any padding
+ // between the elements, since various code depends on it, e.g. in the
+ // handling of a bitcast of a vector type to int, which may be done with a
+ // vector store followed by an integer load. A vector that does not have
+ // elements that are byte-sized must therefore be stored as an integer
+ // built out of the extracted vector elements.
+ if (!SrcEltVT.isByteSized()) {
+ unsigned NumLoadBits = SrcVT.getStoreSizeInBits();
+ EVT LoadVT = EVT::getIntegerVT(*DAG.getContext(), NumLoadBits);
+
+ unsigned NumSrcBits = SrcVT.getSizeInBits();
+ EVT SrcIntVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcBits);
+
+ unsigned SrcEltBits = SrcEltVT.getSizeInBits();
+ SDValue SrcEltBitMask = DAG.getConstant(
+ APInt::getLowBitsSet(NumLoadBits, SrcEltBits), SL, LoadVT);
+
+ // Load the whole vector and avoid masking off the top bits as it makes
+ // the codegen worse.
+ SDValue Load =
+ DAG.getExtLoad(ISD::EXTLOAD, SL, LoadVT, Chain, BasePTR,
+ LD->getPointerInfo(), SrcIntVT, LD->getOriginalAlign(),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+
+ SmallVector<SDValue, 8> Vals;
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
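+ // Shift element Idx down from its bit position within the loaded integer
+ // (mirrored on big-endian targets), mask off the neighbouring bits, and
+ // truncate to the source element type.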
+ unsigned ShiftIntoIdx =
+ (DAG.getDataLayout().isBigEndian() ? (NumElem - 1) - Idx : Idx);
+ SDValue ShiftAmount =
+ DAG.getShiftAmountConstant(ShiftIntoIdx * SrcEltVT.getSizeInBits(),
+ LoadVT, SL, /*LegalTypes=*/false);
+ SDValue ShiftedElt = DAG.getNode(ISD::SRL, SL, LoadVT, Load, ShiftAmount);
+ SDValue Elt =
+ DAG.getNode(ISD::AND, SL, LoadVT, ShiftedElt, SrcEltBitMask);
+ SDValue Scalar = DAG.getNode(ISD::TRUNCATE, SL, SrcEltVT, Elt);
+
+ if (ExtType != ISD::NON_EXTLOAD) {
+ unsigned ExtendOp = ISD::getExtForLoadExtType(false, ExtType);
+ Scalar = DAG.getNode(ExtendOp, SL, DstEltVT, Scalar);
+ }
+
+ Vals.push_back(Scalar);
+ }
+
+ SDValue Value = DAG.getBuildVector(DstVT, SL, Vals);
+ return std::make_pair(Value, Load.getValue(1));
+ }
+
+ unsigned Stride = SrcEltVT.getSizeInBits() / 8;
+ assert(SrcEltVT.isByteSized());
+
+ SmallVector<SDValue, 8> Vals;
+ SmallVector<SDValue, 8> LoadChains;
+
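+ // Byte-sized elements: emit one scalar (ext)load per element, advancing
+ // the base pointer by the element's store size each iteration.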
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ SDValue ScalarLoad =
+ DAG.getExtLoad(ExtType, SL, DstEltVT, Chain, BasePTR,
+ LD->getPointerInfo().getWithOffset(Idx * Stride),
+ SrcEltVT, LD->getOriginalAlign(),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+
+ BasePTR = DAG.getObjectPtrOffset(SL, BasePTR, TypeSize::Fixed(Stride));
+
+ Vals.push_back(ScalarLoad.getValue(0));
+ LoadChains.push_back(ScalarLoad.getValue(1));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoadChains);
+ SDValue Value = DAG.getBuildVector(DstVT, SL, Vals);
+
+ return std::make_pair(Value, NewChain);
+}
+
+SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
+ SelectionDAG &DAG) const {
+ SDLoc SL(ST);
+
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ SDValue Value = ST->getValue();
+ EVT StVT = ST->getMemoryVT();
+
+ if (StVT.isScalableVector())
+ report_fatal_error("Cannot scalarize scalable vector stores");
+
+ // The type of the data we want to save
+ EVT RegVT = Value.getValueType();
+ EVT RegSclVT = RegVT.getScalarType();
+
+ // The type of data as saved in memory.
+ EVT MemSclVT = StVT.getScalarType();
+
+ unsigned NumElem = StVT.getVectorNumElements();
+
+ // A vector must always be stored in memory as-is, i.e. without any padding
+ // between the elements, since various code depends on it, e.g. in the
+ // handling of a bitcast of a vector type to int, which may be done with a
+ // vector store followed by an integer load. A vector that does not have
+ // elements that are byte-sized must therefore be stored as an integer
+ // built out of the extracted vector elements.
+ if (!MemSclVT.isByteSized()) {
+ unsigned NumBits = StVT.getSizeInBits();
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumBits);
+
+ SDValue CurrVal = DAG.getConstant(0, SL, IntVT);
+
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegSclVT, Value,
+ DAG.getVectorIdxConstant(Idx, SL));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MemSclVT, Elt);
+ SDValue ExtElt = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Trunc);
+ unsigned ShiftIntoIdx =
+ (DAG.getDataLayout().isBigEndian() ? (NumElem - 1) - Idx : Idx);
+ SDValue ShiftAmount =
+ DAG.getConstant(ShiftIntoIdx * MemSclVT.getSizeInBits(), SL, IntVT);
+ SDValue ShiftedElt =
+ DAG.getNode(ISD::SHL, SL, IntVT, ExtElt, ShiftAmount);
+ CurrVal = DAG.getNode(ISD::OR, SL, IntVT, CurrVal, ShiftedElt);
+ }
+
+ return DAG.getStore(Chain, SL, CurrVal, BasePtr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), ST->getMemOperand()->getFlags(),
+ ST->getAAInfo());
+ }
+
+ // Store Stride in bytes
+ unsigned Stride = MemSclVT.getSizeInBits() / 8;
+ assert(Stride && "Zero stride!");
+ // Extract each of the elements from the original vector and save them into
+ // memory individually.
+ SmallVector<SDValue, 8> Stores;
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegSclVT, Value,
+ DAG.getVectorIdxConstant(Idx, SL));
+
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Idx * Stride));
+
+ // This scalar TruncStore may be illegal, but we legalize it later.
+ SDValue Store = DAG.getTruncStore(
+ Chain, SL, Elt, Ptr, ST->getPointerInfo().getWithOffset(Idx * Stride),
+ MemSclVT, ST->getOriginalAlign(), ST->getMemOperand()->getFlags(),
+ ST->getAAInfo());
+
+ Stores.push_back(Store);
+ }
+
+ return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Stores);
+}
+
+std::pair<SDValue, SDValue>
+TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
+ assert(LD->getAddressingMode() == ISD::UNINDEXED &&
+ "unaligned indexed loads not implemented!");
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ EVT VT = LD->getValueType(0);
+ EVT LoadedVT = LD->getMemoryVT();
+ SDLoc dl(LD);
+ auto &MF = DAG.getMachineFunction();
+
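+ // Floating-point and vector loads are expanded via a same-width integer
+ // load plus a bitcast when that integer load is available (scalarizing
+ // illegal vector loads), and otherwise by copying through an aligned
+ // stack slot below.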
+ if (VT.isFloatingPoint() || VT.isVector()) {
+ EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
+ if (isTypeLegal(intVT) && isTypeLegal(LoadedVT)) {
+ if (!isOperationLegalOrCustom(ISD::LOAD, intVT) &&
+ LoadedVT.isVector()) {
+ // Scalarize the load and let the individual components be handled.
+ return scalarizeVectorLoad(LD, DAG);
+ }
+
+ // Expand to a (misaligned) integer load of the same size,
+ // then bitconvert to floating point or vector.
+ SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
+ LD->getMemOperand());
+ SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
+ if (LoadedVT != VT)
+ Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
+ ISD::ANY_EXTEND, dl, VT, Result);
+
+ return std::make_pair(Result, newLoad.getValue(1));
+ }
+
+ // Copy the value to an (aligned) stack slot using (unaligned) integer
+ // loads and stores, then do an (aligned) load from the stack slot.
+ MVT RegVT = getRegisterType(*DAG.getContext(), intVT);
+ unsigned LoadedBytes = LoadedVT.getStoreSize();
+ unsigned RegBytes = RegVT.getSizeInBits() / 8;
+ unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;
+
+ // Make sure the stack slot is also aligned for the register type.
+ SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
+ auto FrameIndex = cast<FrameIndexSDNode>(StackBase.getNode())->getIndex();
+ SmallVector<SDValue, 8> Stores;
+ SDValue StackPtr = StackBase;
+ unsigned Offset = 0;
+
+ EVT PtrVT = Ptr.getValueType();
+ EVT StackPtrVT = StackPtr.getValueType();
+
+ SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
+ SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
+
+ // Do all but one copy using the full register width.
+ for (unsigned i = 1; i < NumRegs; i++) {
+ // Load one integer register's worth from the original location.
+ SDValue Load = DAG.getLoad(
+ RegVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset),
+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ // Follow the load with a store to the stack slot. Remember the store.
+ Stores.push_back(DAG.getStore(
+ Load.getValue(1), dl, Load, StackPtr,
+ MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset)));
+ // Increment the pointers.
+ Offset += RegBytes;
+
+ Ptr = DAG.getObjectPtrOffset(dl, Ptr, PtrIncrement);
+ StackPtr = DAG.getObjectPtrOffset(dl, StackPtr, StackPtrIncrement);
+ }
+
+ // The last copy may be partial. Do an extending load.
+ EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+ 8 * (LoadedBytes - Offset));
+ SDValue Load =
+ DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(Offset), MemVT,
+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ // Follow the load with a store to the stack slot. Remember the store.
+ // On big-endian machines this requires a truncating store to ensure
+ // that the bits end up in the right place.
+ Stores.push_back(DAG.getTruncStore(
+ Load.getValue(1), dl, Load, StackPtr,
+ MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), MemVT));
+
+ // The order of the stores doesn't matter - say it with a TokenFactor.
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+ // Finally, perform the original load only redirected to the stack slot.
+ Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
+ MachinePointerInfo::getFixedStack(MF, FrameIndex, 0),
+ LoadedVT);
+
+ // Callers expect a MERGE_VALUES node.
+ return std::make_pair(Load, TF);
+ }
+
+ assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
+ "Unaligned load of unsupported type.");
+
+ // Compute the new VT that is half the size of the old one. This is an
+ // integer MVT.
+ unsigned NumBits = LoadedVT.getSizeInBits();
+ EVT NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits / 2);
+ NumBits >>= 1;
+
+ Align Alignment = LD->getOriginalAlign();
+ unsigned IncrementSize = NumBits / 8;
+ ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+ // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+ if (HiExtType == ISD::NON_EXTLOAD)
+ HiExtType = ISD::ZEXTLOAD;
+
+ // Load the value in two parts
+ SDValue Lo, Hi;
+ if (DAG.getDataLayout().isLittleEndian()) {
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
+ NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+
+ Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize));
+ Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ } else {
+ Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
+ NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+
+ Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize));
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ }
+
+ // Aggregate the two parts.
+ SDValue ShiftAmount =
+ DAG.getConstant(NumBits, dl, getShiftAmountTy(Hi.getValueType(),
+ DAG.getDataLayout()));
+ SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
+ Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ return std::make_pair(Result, TF);
+}
+
+SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
+ SelectionDAG &DAG) const {
+ assert(ST->getAddressingMode() == ISD::UNINDEXED &&
+ "unaligned indexed stores not implemented!");
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ SDValue Val = ST->getValue();
+ EVT VT = Val.getValueType();
+ Align Alignment = ST->getOriginalAlign();
+ auto &MF = DAG.getMachineFunction();
+ EVT StoreMemVT = ST->getMemoryVT();
+
+ SDLoc dl(ST);
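+ // Floating-point and vector stores are expanded via a bitcast to a
+ // same-width integer plus a plain store when that is available
+ // (scalarizing illegal vector stores), and otherwise by spilling to an
+ // aligned stack slot and copying out with integer loads and stores.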
+ if (StoreMemVT.isFloatingPoint() || StoreMemVT.isVector()) {
+ EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ if (isTypeLegal(intVT)) {
+ if (!isOperationLegalOrCustom(ISD::STORE, intVT) &&
+ StoreMemVT.isVector()) {
+ // Scalarize the store and let the individual components be handled.
+ SDValue Result = scalarizeVectorStore(ST, DAG);
+ return Result;
+ }
+ // Expand to a bitconvert of the value to the integer type of the
+ // same size, then a (misaligned) int store.
+ // FIXME: Does not handle truncating floating point stores!
+ SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
+ Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
+ Alignment, ST->getMemOperand()->getFlags());
+ return Result;
+ }
+ // Do an (aligned) store to a stack slot, then copy from the stack slot
+ // to the final destination using (unaligned) integer loads and stores.
+ MVT RegVT = getRegisterType(
+ *DAG.getContext(),
+ EVT::getIntegerVT(*DAG.getContext(), StoreMemVT.getSizeInBits()));
+ EVT PtrVT = Ptr.getValueType();
+ unsigned StoredBytes = StoreMemVT.getStoreSize();
+ unsigned RegBytes = RegVT.getSizeInBits() / 8;
+ unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
+
+ // Make sure the stack slot is also aligned for the register type.
+ SDValue StackPtr = DAG.CreateStackTemporary(StoreMemVT, RegVT);
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+
+ // Perform the original store, only redirected to the stack slot.
+ SDValue Store = DAG.getTruncStore(
+ Chain, dl, Val, StackPtr,
+ MachinePointerInfo::getFixedStack(MF, FrameIndex, 0), StoreMemVT);
+
+ EVT StackPtrVT = StackPtr.getValueType();
+
+ SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
+ SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
+ SmallVector<SDValue, 8> Stores;
+ unsigned Offset = 0;
+
+ // Do all but one copy using the full register width.
+ for (unsigned i = 1; i < NumRegs; i++) {
+ // Load one integer register's worth from the stack slot.
+ SDValue Load = DAG.getLoad(
+ RegVT, dl, Store, StackPtr,
+ MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset));
+ // Store it to the final location. Remember the store.
+ Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
+ ST->getPointerInfo().getWithOffset(Offset),
+ ST->getOriginalAlign(),
+ ST->getMemOperand()->getFlags()));
+ // Increment the pointers.
+ Offset += RegBytes;
+ StackPtr = DAG.getObjectPtrOffset(dl, StackPtr, StackPtrIncrement);
+ Ptr = DAG.getObjectPtrOffset(dl, Ptr, PtrIncrement);
+ }
+
+ // The last store may be partial. Do a truncating store. On big-endian
+ // machines this requires an extending load from the stack slot to ensure
+ // that the bits are in the right place.
+ EVT LoadMemVT =
+ EVT::getIntegerVT(*DAG.getContext(), 8 * (StoredBytes - Offset));
+
+ // Load from the stack slot.
+ SDValue Load = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
+ MachinePointerInfo::getFixedStack(MF, FrameIndex, Offset), LoadMemVT);
+
+ Stores.push_back(
+ DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
+ ST->getPointerInfo().getWithOffset(Offset), LoadMemVT,
+ ST->getOriginalAlign(),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo()));
+ // The order of the stores doesn't matter - say it with a TokenFactor.
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ return Result;
+ }
+
+ assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
+ "Unaligned store of unknown type.");
+ // Get the half-size VT
+ EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
+ unsigned NumBits = NewStoredVT.getFixedSizeInBits();
+ unsigned IncrementSize = NumBits / 8;
+
+ // Divide the stored value in two parts.
+ SDValue ShiftAmount = DAG.getConstant(
+ NumBits, dl, getShiftAmountTy(Val.getValueType(), DAG.getDataLayout()));
+ SDValue Lo = Val;
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
+
+ // Store the two parts
+ SDValue Store1, Store2;
+ Store1 = DAG.getTruncStore(Chain, dl,
+ DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
+ Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
+ ST->getMemOperand()->getFlags());
+
+ Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::Fixed(IncrementSize));
+ Store2 = DAG.getTruncStore(
+ Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+
+ SDValue Result =
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ return Result;
+}
+
+SDValue
+TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
+ const SDLoc &DL, EVT DataVT,
+ SelectionDAG &DAG,
+ bool IsCompressedMemory) const {
+ SDValue Increment;
+ EVT AddrVT = Addr.getValueType();
+ EVT MaskVT = Mask.getValueType();
+ assert(DataVT.getVectorElementCount() == MaskVT.getVectorElementCount() &&
+ "Incompatible types of Data and Mask");
+ if (IsCompressedMemory) {
+ if (DataVT.isScalableVector())
+ report_fatal_error(
+ "Cannot currently handle compressed memory with scalable vectors");
+ // Increment the pointer according to the number of '1's in the mask.
+ EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
+ SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
+ if (MaskIntVT.getSizeInBits() < 32) {
+ MaskInIntReg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
+ MaskIntVT = MVT::i32;
+ }
+
+ // Count '1's with POPCNT.
+ Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
+ Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
+ // Scale is an element size in bytes.
+ SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL,
+ AddrVT);
+ Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale);
+ } else if (DataVT.isScalableVector()) {
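+ // The store size of a scalable vector is only known as a multiple of
+ // vscale, so scale the known minimum size by vscale at run time.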
+ Increment = DAG.getVScale(DL, AddrVT,
+ APInt(AddrVT.getFixedSizeInBits(),
+ DataVT.getStoreSize().getKnownMinValue()));
+ } else
+ Increment = DAG.getConstant(DataVT.getStoreSize(), DL, AddrVT);
+
+ return DAG.getNode(ISD::ADD, DL, AddrVT, Addr, Increment);
+}
+
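+ // Clamp Idx so that an access of SubEC elements starting at that index
+ // stays within the bounds of a VecVT vector.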
+static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx,
+ EVT VecVT, const SDLoc &dl,
+ ElementCount SubEC) {
+ assert(!(SubEC.isScalable() && VecVT.isFixedLengthVector()) &&
+ "Cannot index a scalable vector within a fixed-width vector");
+
+ unsigned NElts = VecVT.getVectorMinNumElements();
+ unsigned NumSubElts = SubEC.getKnownMinValue();
+ EVT IdxVT = Idx.getValueType();
+
+ if (VecVT.isScalableVector() && !SubEC.isScalable()) {
+ // If this is a constant index and we know the value plus the number of
+ // elements in the subvector minus one is less than the minimum number of
+ // elements then it's safe to return Idx.
+ if (auto *IdxCst = dyn_cast<ConstantSDNode>(Idx))
+ if (IdxCst->getZExtValue() + (NumSubElts - 1) < NElts)
+ return Idx;
+ SDValue VS =
+ DAG.getVScale(dl, IdxVT, APInt(IdxVT.getFixedSizeInBits(), NElts));
+ unsigned SubOpcode = NumSubElts <= NElts ? ISD::SUB : ISD::USUBSAT;
+ SDValue Sub = DAG.getNode(SubOpcode, dl, IdxVT, VS,
+ DAG.getConstant(NumSubElts, dl, IdxVT));
+ return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx, Sub);
+ }
+ if (isPowerOf2_32(NElts) && NumSubElts == 1) {
+ APInt Imm = APInt::getLowBitsSet(IdxVT.getSizeInBits(), Log2_32(NElts));
+ return DAG.getNode(ISD::AND, dl, IdxVT, Idx,
+ DAG.getConstant(Imm, dl, IdxVT));
+ }
+ unsigned MaxIndex = NumSubElts < NElts ? NElts - NumSubElts : 0;
+ return DAG.getNode(ISD::UMIN, dl, IdxVT, Idx,
+ DAG.getConstant(MaxIndex, dl, IdxVT));
+}
+
+SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG,
+ SDValue VecPtr, EVT VecVT,
+ SDValue Index) const {
+ return getVectorSubVecPointer(
+ DAG, VecPtr, VecVT,
+ EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1),
+ Index);
+}
+
+SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG,
+ SDValue VecPtr, EVT VecVT,
+ EVT SubVecVT,
+ SDValue Index) const {
+ SDLoc dl(Index);
+ // Make sure the index type is big enough to compute in.
+ Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType());
+
+ EVT EltVT = VecVT.getVectorElementType();
+
+ // Calculate the element offset and add it to the pointer.
+ unsigned EltSize = EltVT.getFixedSizeInBits() / 8; // FIXME: should be ABI size.
+ assert(EltSize * 8 == EltVT.getFixedSizeInBits() &&
+ "Converting bits to bytes lost precision");
+ assert(SubVecVT.getVectorElementType() == EltVT &&
+ "Sub-vector must be a vector with matching element type");
+ Index = clampDynamicVectorIndex(DAG, Index, VecVT, dl,
+ SubVecVT.getVectorElementCount());
+
+ EVT IdxVT = Index.getValueType();
+ if (SubVecVT.isScalableVector())
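+ // A scalable sub-vector index is implicitly scaled by vscale, so make
+ // that scaling explicit before converting the index to a byte offset.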
+ Index =
+ DAG.getNode(ISD::MUL, dl, IdxVT, Index,
+ DAG.getVScale(dl, IdxVT, APInt(IdxVT.getSizeInBits(), 1)));
+
+ Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index,
+ DAG.getConstant(EltSize, dl, IdxVT));
+ return DAG.getMemBasePlusOffset(VecPtr, Index, dl);
+}
+
+//===----------------------------------------------------------------------===//
+// Implementation of Emulated TLS Model
+//===----------------------------------------------------------------------===//
+
+SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const {
+ // Access to the address of TLS variable xyz is lowered to a function call:
+ // __emutls_get_address( address of global variable named "__emutls_v.xyz" )
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ PointerType *VoidPtrType = Type::getInt8PtrTy(*DAG.getContext());
+ SDLoc dl(GA);
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+ std::string NameString = ("__emutls_v." + GA->getGlobal()->getName()).str();
+ Module *VariableModule = const_cast<Module*>(GA->getGlobal()->getParent());
+ StringRef EmuTlsVarName(NameString);
+ GlobalVariable *EmuTlsVar = VariableModule->getNamedGlobal(EmuTlsVarName);
+ assert(EmuTlsVar && "Cannot find EmuTlsVar ");
+ Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT);
+ Entry.Ty = VoidPtrType;
+ Args.push_back(Entry);
+
+ SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode());
+ CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+ // TLSADDR will be codegen'ed as a call. Inform MFI that the function has
+ // calls. At least for X86 targets, maybe good for other targets too?
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setAdjustsStack(true); // Is this only for X86 target?
+ MFI.setHasCalls(true);
+
+ assert((GA->getOffset() == 0) &&
+ "Emulated TLS must have zero offset in GlobalAddressSDNode");
+ return CallResult.first;
+}
+
+SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert((Op->getOpcode() == ISD::SETCC) && "Input has to be a SETCC node.");
+ if (!isCtlzFast())
+ return SDValue();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDLoc dl(Op);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (C->isZero() && CC == ISD::SETEQ) {
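+ // (X == 0) is equivalent to (CTLZ(X) >> log2(bitwidth)): the shift
+ // yields 1 exactly when X is zero and 0 otherwise.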
+ EVT VT = Op.getOperand(0).getValueType();
+ SDValue Zext = Op.getOperand(0);
+ if (VT.bitsLT(MVT::i32)) {
+ VT = MVT::i32;
+ Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0));
+ }
+ unsigned Log2b = Log2_32(VT.getSizeInBits());
+ SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
+ SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
+ DAG.getConstant(Log2b, dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
+ }
+ }
+ return SDValue();
+}
+
+SDValue TargetLowering::expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const {
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ EVT VT = Op0.getValueType();
+ EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ unsigned Opcode = Node->getOpcode();
+ SDLoc DL(Node);
+
+ // umax(x,1) --> sub(x,cmpeq(x,0)) iff cmp result is allbits
+ if (Opcode == ISD::UMAX && llvm::isOneOrOneSplat(Op1, true) && BoolVT == VT &&
+ getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
+ Op0 = DAG.getFreeze(Op0);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ return DAG.getNode(ISD::SUB, DL, VT, Op0,
+ DAG.getSetCC(DL, VT, Op0, Zero, ISD::SETEQ));
+ }
+
+ // umin(x,y) -> sub(x,usubsat(x,y))
+ // TODO: Missing freeze(Op0)?
+ if (Opcode == ISD::UMIN && isOperationLegal(ISD::SUB, VT) &&
+ isOperationLegal(ISD::USUBSAT, VT)) {
+ return DAG.getNode(ISD::SUB, DL, VT, Op0,
+ DAG.getNode(ISD::USUBSAT, DL, VT, Op0, Op1));
+ }
+
+ // umax(x,y) -> add(x,usubsat(y,x))
+ // TODO: Missing freeze(Op0)?
+ if (Opcode == ISD::UMAX && isOperationLegal(ISD::ADD, VT) &&
+ isOperationLegal(ISD::USUBSAT, VT)) {
+ return DAG.getNode(ISD::ADD, DL, VT, Op0,
+ DAG.getNode(ISD::USUBSAT, DL, VT, Op1, Op0));
+ }
+
+ // FIXME: Should really try to split the vector in case it's legal on a
+ // subvector.
+ if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return DAG.UnrollVectorOp(Node);
+
+ // Attempt to find an existing SETCC node that we can reuse.
+ // TODO: Do we need a generic doesSETCCNodeExist?
+ // TODO: Missing freeze(Op0)/freeze(Op1)?
+ auto buildMinMax = [&](ISD::CondCode PrefCC, ISD::CondCode AltCC,
+ ISD::CondCode PrefCommuteCC,
+ ISD::CondCode AltCommuteCC) {
+ SDVTList BoolVTList = DAG.getVTList(BoolVT);
+ for (ISD::CondCode CC : {PrefCC, AltCC}) {
+ if (DAG.doesNodeExist(ISD::SETCC, BoolVTList,
+ {Op0, Op1, DAG.getCondCode(CC)})) {
+ SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, CC);
+ return DAG.getSelect(DL, VT, Cond, Op0, Op1);
+ }
+ }
+ for (ISD::CondCode CC : {PrefCommuteCC, AltCommuteCC}) {
+ if (DAG.doesNodeExist(ISD::SETCC, BoolVTList,
+ {Op0, Op1, DAG.getCondCode(CC)})) {
+ SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, CC);
+ return DAG.getSelect(DL, VT, Cond, Op1, Op0);
+ }
+ }
+ SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, PrefCC);
+ return DAG.getSelect(DL, VT, Cond, Op0, Op1);
+ };
+
+ // Expand Y = MAX(A, B) -> Y = (A > B) ? A : B
+ // -> Y = (A < B) ? B : A
+ // -> Y = (A >= B) ? A : B
+ // -> Y = (A <= B) ? B : A
+ switch (Opcode) {
+ case ISD::SMAX:
+ return buildMinMax(ISD::SETGT, ISD::SETGE, ISD::SETLT, ISD::SETLE);
+ case ISD::SMIN:
+ return buildMinMax(ISD::SETLT, ISD::SETLE, ISD::SETGT, ISD::SETGE);
+ case ISD::UMAX:
+ return buildMinMax(ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE);
+ case ISD::UMIN:
+ return buildMinMax(ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE);
+ }
+
+ llvm_unreachable("How did we get here?");
+}
+
+SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
+ unsigned Opcode = Node->getOpcode();
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ EVT VT = LHS.getValueType();
+ SDLoc dl(Node);
+
+ assert(VT == RHS.getValueType() && "Expected operands to be the same type");
+ assert(VT.isInteger() && "Expected operands to be integers");
+
+ // usub.sat(a, b) -> umax(a, b) - b
+ if (Opcode == ISD::USUBSAT && isOperationLegal(ISD::UMAX, VT)) {
+ SDValue Max = DAG.getNode(ISD::UMAX, dl, VT, LHS, RHS);
+ return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
+ }
+
+ // uadd.sat(a, b) -> umin(a, ~b) + b
+ if (Opcode == ISD::UADDSAT && isOperationLegal(ISD::UMIN, VT)) {
+ SDValue InvRHS = DAG.getNOT(dl, RHS, VT);
+ SDValue Min = DAG.getNode(ISD::UMIN, dl, VT, LHS, InvRHS);
+ return DAG.getNode(ISD::ADD, dl, VT, Min, RHS);
+ }
+
+ unsigned OverflowOp;
+ switch (Opcode) {
+ case ISD::SADDSAT:
+ OverflowOp = ISD::SADDO;
+ break;
+ case ISD::UADDSAT:
+ OverflowOp = ISD::UADDO;
+ break;
+ case ISD::SSUBSAT:
+ OverflowOp = ISD::SSUBO;
+ break;
+ case ISD::USUBSAT:
+ OverflowOp = ISD::USUBO;
+ break;
+ default:
+ llvm_unreachable("Expected method to receive signed or unsigned saturation "
+ "addition or subtraction node.");
+ }
+
+ // FIXME: Should really try to split the vector in case it's legal on a
+ // subvector.
+ if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return DAG.UnrollVectorOp(Node);
+
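+ // Compute the raw sum/difference and an overflow flag with the matching
+ // [US]{ADD,SUB}O node; the saturated value is selected below whenever the
+ // overflow flag is set.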
+ unsigned BitWidth = LHS.getScalarValueSizeInBits();
+ EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue Result = DAG.getNode(OverflowOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ SDValue SumDiff = Result.getValue(0);
+ SDValue Overflow = Result.getValue(1);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
+
+ if (Opcode == ISD::UADDSAT) {
+ if (getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
+ // (LHS + RHS) | OverflowMask
+ SDValue OverflowMask = DAG.getSExtOrTrunc(Overflow, dl, VT);
+ return DAG.getNode(ISD::OR, dl, VT, SumDiff, OverflowMask);
+ }
+ // Overflow ? 0xffff.... : (LHS + RHS)
+ return DAG.getSelect(dl, VT, Overflow, AllOnes, SumDiff);
+ }
+
+ if (Opcode == ISD::USUBSAT) {
+ if (getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
+ // (LHS - RHS) & ~OverflowMask
+ SDValue OverflowMask = DAG.getSExtOrTrunc(Overflow, dl, VT);
+ SDValue Not = DAG.getNOT(dl, OverflowMask, VT);
+ return DAG.getNode(ISD::AND, dl, VT, SumDiff, Not);
+ }
+ // Overflow ? 0 : (LHS - RHS)
+ return DAG.getSelect(dl, VT, Overflow, Zero, SumDiff);
+ }
+
+ if (Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) {
+ APInt MinVal = APInt::getSignedMinValue(BitWidth);
+ APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
+
+ KnownBits KnownLHS = DAG.computeKnownBits(LHS);
+ KnownBits KnownRHS = DAG.computeKnownBits(RHS);
+
+ // If either of the operand signs are known, then they are guaranteed to
+ // only saturate in one direction. If non-negative they will saturate
+ // towards SIGNED_MAX, if negative they will saturate towards SIGNED_MIN.
+ //
+ // In the case of ISD::SSUBSAT, 'x - y' is equivalent to 'x + (-y)', so the
+ // sign of 'y' has to be flipped.
+
+ bool LHSIsNonNegative = KnownLHS.isNonNegative();
+ bool RHSIsNonNegative = Opcode == ISD::SADDSAT ? KnownRHS.isNonNegative()
+ : KnownRHS.isNegative();
+ if (LHSIsNonNegative || RHSIsNonNegative) {
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ return DAG.getSelect(dl, VT, Overflow, SatMax, SumDiff);
+ }
+
+ bool LHSIsNegative = KnownLHS.isNegative();
+ bool RHSIsNegative = Opcode == ISD::SADDSAT ? KnownRHS.isNegative()
+ : KnownRHS.isNonNegative();
+ if (LHSIsNegative || RHSIsNegative) {
+ SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+ return DAG.getSelect(dl, VT, Overflow, SatMin, SumDiff);
+ }
+ }
+
+ // Overflow ? (SumDiff >> BW) ^ MinVal : SumDiff
+ APInt MinVal = APInt::getSignedMinValue(BitWidth);
+ SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+ SDValue Shift = DAG.getNode(ISD::SRA, dl, VT, SumDiff,
+ DAG.getConstant(BitWidth - 1, dl, VT));
+ Result = DAG.getNode(ISD::XOR, dl, VT, Shift, SatMin);
+ return DAG.getSelect(dl, VT, Overflow, Result, SumDiff);
+}
+
+SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {
+ unsigned Opcode = Node->getOpcode();
+ bool IsSigned = Opcode == ISD::SSHLSAT;
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ EVT VT = LHS.getValueType();
+ SDLoc dl(Node);
+
+ assert((Node->getOpcode() == ISD::SSHLSAT ||
+ Node->getOpcode() == ISD::USHLSAT) &&
+ "Expected a SHLSAT opcode");
+ assert(VT == RHS.getValueType() && "Expected operands to be the same type");
+ assert(VT.isInteger() && "Expected operands to be integers");
+
+ if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return DAG.UnrollVectorOp(Node);
+
+ // If LHS != (LHS << RHS) >> RHS, we have overflow and must saturate.
+
+ unsigned BW = VT.getScalarSizeInBits();
+ EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue Result = DAG.getNode(ISD::SHL, dl, VT, LHS, RHS);
+ SDValue Orig =
+ DAG.getNode(IsSigned ? ISD::SRA : ISD::SRL, dl, VT, Result, RHS);
+
+ SDValue SatVal;
+ if (IsSigned) {
+ SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(BW), dl, VT);
+ SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(BW), dl, VT);
+ SDValue Cond =
+ DAG.getSetCC(dl, BoolVT, LHS, DAG.getConstant(0, dl, VT), ISD::SETLT);
+ SatVal = DAG.getSelect(dl, VT, Cond, SatMin, SatMax);
+ } else {
+ SatVal = DAG.getConstant(APInt::getMaxValue(BW), dl, VT);
+ }
+ SDValue Cond = DAG.getSetCC(dl, BoolVT, LHS, Orig, ISD::SETNE);
+ return DAG.getSelect(dl, VT, Cond, SatVal, Result);
+}
+
+SDValue
+TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
+ assert((Node->getOpcode() == ISD::SMULFIX ||
+ Node->getOpcode() == ISD::UMULFIX ||
+ Node->getOpcode() == ISD::SMULFIXSAT ||
+ Node->getOpcode() == ISD::UMULFIXSAT) &&
+ "Expected a fixed point multiplication opcode");
+
+ SDLoc dl(Node);
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ EVT VT = LHS.getValueType();
+ unsigned Scale = Node->getConstantOperandVal(2);
+ bool Saturating = (Node->getOpcode() == ISD::SMULFIXSAT ||
+ Node->getOpcode() == ISD::UMULFIXSAT);
+ bool Signed = (Node->getOpcode() == ISD::SMULFIX ||
+ Node->getOpcode() == ISD::SMULFIXSAT);
+ EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ unsigned VTSize = VT.getScalarSizeInBits();
+
+ if (!Scale) {
+ // [us]mul.fix(a, b, 0) -> mul(a, b)
+ if (!Saturating) {
+ if (isOperationLegalOrCustom(ISD::MUL, VT))
+ return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ } else if (Signed && isOperationLegalOrCustom(ISD::SMULO, VT)) {
+ SDValue Result =
+ DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ SDValue Product = Result.getValue(0);
+ SDValue Overflow = Result.getValue(1);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+
+ APInt MinVal = APInt::getSignedMinValue(VTSize);
+ APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+ SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ // Xor the inputs, if resulting sign bit is 0 the product will be
+ // positive, else negative.
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, LHS, RHS);
+ SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Xor, Zero, ISD::SETLT);
+ Result = DAG.getSelect(dl, VT, ProdNeg, SatMin, SatMax);
+ return DAG.getSelect(dl, VT, Overflow, Result, Product);
+ } else if (!Signed && isOperationLegalOrCustom(ISD::UMULO, VT)) {
+ SDValue Result =
+ DAG.getNode(ISD::UMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ SDValue Product = Result.getValue(0);
+ SDValue Overflow = Result.getValue(1);
+
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ return DAG.getSelect(dl, VT, Overflow, SatMax, Product);
+ }
+ }
+
+ assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) &&
+ "Expected scale to be less than the number of bits if signed or at "
+ "most the number of bits if unsigned.");
+ assert(LHS.getValueType() == RHS.getValueType() &&
+ "Expected both operands to be the same type");
+
+ // Get the upper and lower bits of the result.
+ SDValue Lo, Hi;
+ unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
+ unsigned HiOp = Signed ? ISD::MULHS : ISD::MULHU;
+ if (isOperationLegalOrCustom(LoHiOp, VT)) {
+ SDValue Result = DAG.getNode(LoHiOp, dl, DAG.getVTList(VT, VT), LHS, RHS);
+ Lo = Result.getValue(0);
+ Hi = Result.getValue(1);
+ } else if (isOperationLegalOrCustom(HiOp, VT)) {
+ Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ Hi = DAG.getNode(HiOp, dl, VT, LHS, RHS);
+ } else if (VT.isVector()) {
+ return SDValue();
+ } else {
+ report_fatal_error("Unable to expand fixed point multiplication.");
+ }
+
+ if (Scale == VTSize)
+ // Result is just the top half since we'd be shifting by the width of the
+ // operand. Overflow is impossible, so this works for both UMULFIX and
+ // UMULFIXSAT.
+ return Hi;
+
+ // The result will need to be shifted right by the scale since both operands
+ // are scaled. The result is given to us in 2 halves, so we only want part of
+ // both in the result.
+ EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Result = DAG.getNode(ISD::FSHR, dl, VT, Hi, Lo,
+ DAG.getConstant(Scale, dl, ShiftTy));
+ if (!Saturating)
+ return Result;
+
+ if (!Signed) {
+ // Unsigned overflow happened if the upper (VTSize - Scale) bits (of the
+ // widened multiplication) aren't all zeroes.
+
+ // Saturate to max if ((Hi >> Scale) != 0),
+ // which is the same as if (Hi > ((1 << Scale) - 1))
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, LowMask,
+ DAG.getConstant(MaxVal, dl, VT), Result,
+ ISD::SETUGT);
+
+ return Result;
+ }
+
+ // Signed overflow happened if the upper (VTSize - Scale + 1) bits (of the
+ // widened multiplication) aren't all ones or all zeroes.
+
+ SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(VTSize), dl, VT);
+ SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(VTSize), dl, VT);
+
+ if (Scale == 0) {
+ SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, Lo,
+ DAG.getConstant(VTSize - 1, dl, ShiftTy));
+ SDValue Overflow = DAG.getSetCC(dl, BoolVT, Hi, Sign, ISD::SETNE);
+ // Saturate to SatMin if the wide product is negative, and SatMax if the wide
+ // product is positive ...
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue ResultIfOverflow = DAG.getSelectCC(dl, Hi, Zero, SatMin, SatMax,
+ ISD::SETLT);
+ // ... but only if we overflowed.
+ return DAG.getSelect(dl, VT, Overflow, ResultIfOverflow, Result);
+ }
+
+ // We handled Scale==0 above so all the bits to examine are in Hi.
+
+ // Saturate to max if ((Hi >> (Scale - 1)) > 0),
+ // which is the same as if (Hi > (1 << (Scale - 1)) - 1)
+ SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale - 1),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, LowMask, SatMax, Result, ISD::SETGT);
+ // Saturate to min if ((Hi >> (Scale - 1)) < -1),
+ // which is the same as if (Hi < (-1 << (Scale - 1)))
+ SDValue HighMask =
+ DAG.getConstant(APInt::getHighBitsSet(VTSize, VTSize - Scale + 1),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, HighMask, SatMin, Result, ISD::SETLT);
+ return Result;
+}
+
+SDValue
+TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
+ SDValue LHS, SDValue RHS,
+ unsigned Scale, SelectionDAG &DAG) const {
+ assert((Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT ||
+ Opcode == ISD::UDIVFIX || Opcode == ISD::UDIVFIXSAT) &&
+ "Expected a fixed point division opcode");
+
+ EVT VT = LHS.getValueType();
+ bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT;
+ bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT;
+ EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ // If there is enough room in the type to upscale the LHS or downscale the
+ // RHS before the division, we can perform it in this type without having to
+ // resize. For signed operations, the LHS headroom is the number of
+ // redundant sign bits, and for unsigned ones it is the number of zeroes.
+ // The headroom for the RHS is the number of trailing zeroes.
+ unsigned LHSLead = Signed ? DAG.ComputeNumSignBits(LHS) - 1
+ : DAG.computeKnownBits(LHS).countMinLeadingZeros();
+ unsigned RHSTrail = DAG.computeKnownBits(RHS).countMinTrailingZeros();
+
+ // For signed saturating operations, we need to be able to detect true integer
+ // division overflow; that is, when you have MIN / -EPS. However, this
+ // is undefined behavior and if we emit divisions that could take such
+ // values it may cause undesired behavior (arithmetic exceptions on x86, for
+ // example).
+ // Avoid this by requiring an extra bit so that we never get this case.
+ // FIXME: This is a bit unfortunate as it means that for an 8-bit 7-scale
+ // signed saturating division, we need to emit a whopping 32-bit division.
+ if (LHSLead + RHSTrail < Scale + (unsigned)(Saturating && Signed))
+ return SDValue();
+
+ unsigned LHSShift = std::min(LHSLead, Scale);
+ unsigned RHSShift = Scale - LHSShift;
+
+ // At this point, we know that if we shift the LHS up by LHSShift and the
+ // RHS down by RHSShift, we can emit a regular division with a final scaling
+ // factor of Scale.
+
+ EVT ShiftTy = getShiftAmountTy(VT, DAG.getDataLayout());
+ if (LHSShift)
+ LHS = DAG.getNode(ISD::SHL, dl, VT, LHS,
+ DAG.getConstant(LHSShift, dl, ShiftTy));
+ if (RHSShift)
+ RHS = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, dl, VT, RHS,
+ DAG.getConstant(RHSShift, dl, ShiftTy));
+
+ SDValue Quot;
+ if (Signed) {
+ // For signed operations, if the resulting quotient is negative and the
+ // remainder is nonzero, subtract 1 from the quotient to round towards
+ // negative infinity.
+ SDValue Rem;
+ // FIXME: Ideally we would always produce an SDIVREM here, but if the
+ // type isn't legal, SDIVREM cannot be expanded. There is no reason why
+ // we couldn't just form a libcall, but the type legalizer doesn't do it.
+ if (isTypeLegal(VT) &&
+ isOperationLegalOrCustom(ISD::SDIVREM, VT)) {
+ Quot = DAG.getNode(ISD::SDIVREM, dl,
+ DAG.getVTList(VT, VT),
+ LHS, RHS);
+ Rem = Quot.getValue(1);
+ Quot = Quot.getValue(0);
+ } else {
+ Quot = DAG.getNode(ISD::SDIV, dl, VT,
+ LHS, RHS);
+ Rem = DAG.getNode(ISD::SREM, dl, VT,
+ LHS, RHS);
+ }
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue RemNonZero = DAG.getSetCC(dl, BoolVT, Rem, Zero, ISD::SETNE);
+ SDValue LHSNeg = DAG.getSetCC(dl, BoolVT, LHS, Zero, ISD::SETLT);
+ SDValue RHSNeg = DAG.getSetCC(dl, BoolVT, RHS, Zero, ISD::SETLT);
+ SDValue QuotNeg = DAG.getNode(ISD::XOR, dl, BoolVT, LHSNeg, RHSNeg);
+ SDValue Sub1 = DAG.getNode(ISD::SUB, dl, VT, Quot,
+ DAG.getConstant(1, dl, VT));
+ Quot = DAG.getSelect(dl, VT,
+ DAG.getNode(ISD::AND, dl, BoolVT, RemNonZero, QuotNeg),
+ Sub1, Quot);
+ } else
+ Quot = DAG.getNode(ISD::UDIV, dl, VT,
+ LHS, RHS);
+
+ return Quot;
+}
+
+void TargetLowering::expandUADDSUBO(
+ SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ bool IsAdd = Node->getOpcode() == ISD::UADDO;
+
+ // If UADDO_CARRY/USUBO_CARRY is legal, use that instead.
+ unsigned OpcCarry = IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
+ if (isOperationLegalOrCustom(OpcCarry, Node->getValueType(0))) {
+ SDValue CarryIn = DAG.getConstant(0, dl, Node->getValueType(1));
+ SDValue NodeCarry = DAG.getNode(OpcCarry, dl, Node->getVTList(),
+ { LHS, RHS, CarryIn });
+ Result = SDValue(NodeCarry.getNode(), 0);
+ Overflow = SDValue(NodeCarry.getNode(), 1);
+ return;
+ }
+
+ Result = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl,
+ LHS.getValueType(), LHS, RHS);
+
+ EVT ResultType = Node->getValueType(1);
+ EVT SetCCType = getSetCCResultType(
+ DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0));
+ SDValue SetCC;
+ if (IsAdd && isOneConstant(RHS)) {
+ // Special case: uaddo X, 1 overflows if X+1 is 0. This potentially reduces
+ // the live range of X. We assume comparing with 0 is cheap.
+ // The general case (X + C) < C is not necessarily beneficial. Although we
+ // reduce the live range of X, we may introduce the materialization of
+ // constant C.
+ SetCC =
+ DAG.getSetCC(dl, SetCCType, Result,
+ DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETEQ);
+ } else if (IsAdd && isAllOnesConstant(RHS)) {
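+ // Pack the elements into one wide integer: truncate each element to its
+ // in-memory width, zero-extend into the wide integer, and shift it into
+ // its bit position (mirrored on big-endian targets) before OR'ing it in.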
+ // Special case: uaddo X, -1 overflows if X != 0.
+ SetCC =
+ DAG.getSetCC(dl, SetCCType, LHS,
+ DAG.getConstant(0, dl, Node->getValueType(0)), ISD::SETNE);
+ } else {
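+ // General case: unsigned overflow occurred iff the result wrapped
+ // around, i.e. the sum is less than LHS for an add, or greater than LHS
+ // for a sub.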
+ ISD::CondCode CC = IsAdd ? ISD::SETULT : ISD::SETUGT;
+ SetCC = DAG.getSetCC(dl, SetCCType, Result, LHS, CC);
+ }
+ Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
+}
+
+void TargetLowering::expandSADDSUBO(
+ SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ bool IsAdd = Node->getOpcode() == ISD::SADDO;
+
+ Result = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, dl,
+ LHS.getValueType(), LHS, RHS);
+
+ EVT ResultType = Node->getValueType(1);
+ EVT OType = getSetCCResultType(
+ DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0));
+
+ // If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
+ unsigned OpcSat = IsAdd ? ISD::SADDSAT : ISD::SSUBSAT;
+ if (isOperationLegal(OpcSat, LHS.getValueType())) {
+ SDValue Sat = DAG.getNode(OpcSat, dl, LHS.getValueType(), LHS, RHS);
+ SDValue SetCC = DAG.getSetCC(dl, OType, Result, Sat, ISD::SETNE);
+ Overflow = DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType);
+ return;
+ }
+
+ SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
+
+ // For an addition, the result should be less than one of the operands (LHS)
+ // if and only if the other operand (RHS) is negative, otherwise there will
+ // be overflow.
+ // For a subtraction, the result should be less than one of the operands
+ // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
+ // otherwise there will be overflow.
+ SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
+ SDValue ConditionRHS =
+ DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT);
+
+ Overflow = DAG.getBoolExtOrTrunc(
+ DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
+ ResultType, ResultType);
+}
+
+bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
+ SDValue &Overflow, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ bool isSigned = Node->getOpcode() == ISD::SMULO;
+
+ // For power-of-two multiplications we can use a simpler shift expansion.
+ if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
+ const APInt &C = RHSC->getAPIntValue();
+ // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
+ if (C.isPowerOf2()) {
+ // smulo(x, signed_min) is the same as umulo(x, signed_min).
+ bool UseArithShift = isSigned && !C.isMinSignedValue();
+ EVT ShiftAmtTy = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue ShiftAmt = DAG.getConstant(C.logBase2(), dl, ShiftAmtTy);
+ Result = DAG.getNode(ISD::SHL, dl, VT, LHS, ShiftAmt);
+ Overflow = DAG.getSetCC(dl, SetCCVT,
+ DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
+ dl, VT, Result, ShiftAmt),
+ LHS, ISD::SETNE);
+ return true;
+ }
+ }
+
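+ // General case: compute the full double-width product and check that the
+ // top half is the sign-extension (signed) or zero (unsigned) of the
+ // bottom half.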
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+ if (VT.isVector())
+ WideVT =
+ EVT::getVectorVT(*DAG.getContext(), WideVT, VT.getVectorElementCount());
+
+ SDValue BottomHalf;
+ SDValue TopHalf;
+ static const unsigned Ops[2][3] =
+ { { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
+ { ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
+ if (isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
+ BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
+ } else if (isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
+ BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS,
+ RHS);
+ TopHalf = BottomHalf.getValue(1);
+ } else if (isTypeLegal(WideVT)) {
+ LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
+ RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
+ BottomHalf = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+ SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits(), dl,
+ getShiftAmountTy(WideVT, DAG.getDataLayout()));
+ TopHalf = DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SRL, dl, WideVT, Mul, ShiftAmt));
+ } else {
+ if (VT.isVector())
+ return false;
+
+ // We can fall back to a libcall with an illegal type for the MUL if we
+ // have a libcall big enough.
+ // Also, we can fall back to a division in some cases, but that's a big
+ // performance hit in the general case.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (WideVT == MVT::i16)
+ LC = RTLIB::MUL_I16;
+ else if (WideVT == MVT::i32)
+ LC = RTLIB::MUL_I32;
+ else if (WideVT == MVT::i64)
+ LC = RTLIB::MUL_I64;
+ else if (WideVT == MVT::i128)
+ LC = RTLIB::MUL_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
+
+ SDValue HiLHS;
+ SDValue HiRHS;
+ if (isSigned) {
+ // The high part is obtained by SRA'ing all but one of the bits of the low
+ // part.
+ unsigned LoSize = VT.getFixedSizeInBits();
+ HiLHS =
+ DAG.getNode(ISD::SRA, dl, VT, LHS,
+ DAG.getConstant(LoSize - 1, dl,
+ getPointerTy(DAG.getDataLayout())));
+ HiRHS =
+ DAG.getNode(ISD::SRA, dl, VT, RHS,
+ DAG.getConstant(LoSize - 1, dl,
+ getPointerTy(DAG.getDataLayout())));
+ } else {
+ HiLHS = DAG.getConstant(0, dl, VT);
+ HiRHS = DAG.getConstant(0, dl, VT);
+ }
+
+ // Here we're passing the 2 arguments explicitly as 4 arguments that are
+ // pre-lowered to the correct types. This all depends upon WideVT not
+ // being a legal type for the architecture and thus having to be split
+ // into two arguments.
+ SDValue Ret;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(isSigned);
+ CallOptions.setIsPostTypeLegalization(true);
+ if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) {
+ // Halves of WideVT are packed into registers in different order
+ // depending on platform endianness. This is usually handled by
+ // the C calling convention, but we can't defer to it in
+ // the legalizer.
+ SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
+ Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
+ } else {
+ SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
+ Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
+ }
+ assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
+ "Ret value is a collection of constituent nodes holding result.");
+ if (DAG.getDataLayout().isLittleEndian()) {
+ // Same as above.
+ BottomHalf = Ret.getOperand(0);
+ TopHalf = Ret.getOperand(1);
+ } else {
+ BottomHalf = Ret.getOperand(1);
+ TopHalf = Ret.getOperand(0);
+ }
+ }
+
+ Result = BottomHalf;
+ if (isSigned) {
+ SDValue ShiftAmt = DAG.getConstant(
+ VT.getScalarSizeInBits() - 1, dl,
+ getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout()));
+ SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, ShiftAmt);
+ Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf, Sign, ISD::SETNE);
+ } else {
+ Overflow = DAG.getSetCC(dl, SetCCVT, TopHalf,
+ DAG.getConstant(0, dl, VT), ISD::SETNE);
+ }
+
+ // Truncate the result if SetCC returns a larger type than needed.
+ EVT RType = Node->getValueType(1);
+ if (RType.bitsLT(Overflow.getValueType()))
+ Overflow = DAG.getNode(ISD::TRUNCATE, dl, RType, Overflow);
+
+ assert(RType.getSizeInBits() == Overflow.getValueSizeInBits() &&
+ "Unexpected result type for S/UMULO legalization");
+ return true;
+}
+
+SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ unsigned BaseOpcode = ISD::getVecReduceBaseOpcode(Node->getOpcode());
+ SDValue Op = Node->getOperand(0);
+ EVT VT = Op.getValueType();
+
+ if (VT.isScalableVector())
+ report_fatal_error(
+ "Expanding reductions for scalable vectors is undefined.");
+
+ // Try to use a shuffle reduction for power of two vectors.
+ if (VT.isPow2VectorType()) {
+ while (VT.getVectorNumElements() > 1) {
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ if (!isOperationLegalOrCustom(BaseOpcode, HalfVT))
+ break;
+
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Op, dl);
+ Op = DAG.getNode(BaseOpcode, dl, HalfVT, Lo, Hi);
+ VT = HalfVT;
+ }
+ }
+
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+
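+ // Fall back to full scalarization: extract every element and fold them
+ // together with the scalar base opcode.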
+ SmallVector<SDValue, 8> Ops;
+ DAG.ExtractVectorElements(Op, Ops, 0, NumElts);
+
+ SDValue Res = Ops[0];
+ for (unsigned i = 1; i < NumElts; i++)
+ Res = DAG.getNode(BaseOpcode, dl, EltVT, Res, Ops[i], Node->getFlags());
+
+ // Result type may be wider than element type.
+ if (EltVT != Node->getValueType(0))
+ Res = DAG.getNode(ISD::ANY_EXTEND, dl, Node->getValueType(0), Res);
+ return Res;
+}
+
+SDValue TargetLowering::expandVecReduceSeq(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc dl(Node);
+ SDValue AccOp = Node->getOperand(0);
+ SDValue VecOp = Node->getOperand(1);
+ SDNodeFlags Flags = Node->getFlags();
+
+ EVT VT = VecOp.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+
+ if (VT.isScalableVector())
+ report_fatal_error(
+ "Expanding reductions for scalable vectors is undefined.");
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 8> Ops;
+ DAG.ExtractVectorElements(VecOp, Ops, 0, NumElts);
+
+ unsigned BaseOpcode = ISD::getVecReduceBaseOpcode(Node->getOpcode());
+
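+ // A sequential reduction must fold the elements strictly in order,
+ // starting from the accumulator operand.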
+ SDValue Res = AccOp;
+ for (unsigned i = 0; i < NumElts; i++)
+ Res = DAG.getNode(BaseOpcode, dl, EltVT, Res, Ops[i], Flags);
+
+ return Res;
+}
+
+bool TargetLowering::expandREM(SDNode *Node, SDValue &Result,
+ SelectionDAG &DAG) const {
+ EVT VT = Node->getValueType(0);
+ SDLoc dl(Node);
+ bool isSigned = Node->getOpcode() == ISD::SREM;
+ unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+ SDValue Dividend = Node->getOperand(0);
+ SDValue Divisor = Node->getOperand(1);
+ if (isOperationLegalOrCustom(DivRemOpc, VT)) {
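+ // Prefer a combined DIVREM node and take its remainder result.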
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ Result = DAG.getNode(DivRemOpc, dl, VTs, Dividend, Divisor).getValue(1);
+ return true;
+ }
+ if (isOperationLegalOrCustom(DivOpc, VT)) {
+ // X % Y -> X-X/Y*Y
+ SDValue Divide = DAG.getNode(DivOpc, dl, VT, Dividend, Divisor);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Divide, Divisor);
+ Result = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
+ return true;
+ }
+ return false;
+}
+
+SDValue TargetLowering::expandFP_TO_INT_SAT(SDNode *Node,
+ SelectionDAG &DAG) const {
+ bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
+ SDLoc dl(SDValue(Node, 0));
+ SDValue Src = Node->getOperand(0);
+
+ // DstVT is the result type, while SatVT is the size to which we saturate
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Node->getValueType(0);
+
+ EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ unsigned SatWidth = SatVT.getScalarSizeInBits();
+ unsigned DstWidth = DstVT.getScalarSizeInBits();
+ assert(SatWidth <= DstWidth &&
+ "Expected saturation width smaller than result width");
+
+ // Determine minimum and maximum integer values and their corresponding
+ // floating-point values.
+ APInt MinInt, MaxInt;
+ if (IsSigned) {
+ MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
+ MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
+ } else {
+ MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
+ MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
+ }
+
+ // We cannot risk emitting FP_TO_XINT nodes with a source VT of f16, as
+ // libcall emission cannot handle this. Large result types will fail.
+ if (SrcVT == MVT::f16) {
+ Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Src);
+ SrcVT = Src.getValueType();
+ }
+
+ APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
+ APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
+
+ APFloat::opStatus MinStatus =
+ MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero);
+ APFloat::opStatus MaxStatus =
+ MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero);
+ bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
+ !(MaxStatus & APFloat::opStatus::opInexact);
+
+ SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
+ SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
+
+ // If the integer bounds are exactly representable as floats and min/max are
+ // legal, emit a min+max+fptoi sequence. Otherwise we have to use a sequence
+ // of comparisons and selects.
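+  // For example, when saturating f32 to i8 both -128 and 127 are exactly
+  // representable, so the min+max path can be used; when saturating f32 to
+  // i64, 2^63-1 is not exactly representable (the conversion above reports
+  // opInexact) and we must fall back to the compare+select sequence.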
+ bool MinMaxLegal = isOperationLegal(ISD::FMINNUM, SrcVT) &&
+ isOperationLegal(ISD::FMAXNUM, SrcVT);
+ if (AreExactFloatBounds && MinMaxLegal) {
+ SDValue Clamped = Src;
+
+ // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
+ Clamped = DAG.getNode(ISD::FMAXNUM, dl, SrcVT, Clamped, MinFloatNode);
+ // Clamp by MaxFloat from above. NaN cannot occur.
+ Clamped = DAG.getNode(ISD::FMINNUM, dl, SrcVT, Clamped, MaxFloatNode);
+ // Convert clamped value to integer.
+ SDValue FpToInt = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT,
+ dl, DstVT, Clamped);
+
+ // In the unsigned case we're done, because we mapped NaN to MinFloat,
+ // which will cast to zero.
+ if (!IsSigned)
+ return FpToInt;
+
+ // Otherwise, select 0 if Src is NaN.
+ SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+ SDValue IsNan = DAG.getSetCC(dl, SetCCVT, Src, Src, ISD::CondCode::SETUO);
+ return DAG.getSelect(dl, DstVT, IsNan, ZeroInt, FpToInt);
+ }
+
+ SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
+ SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
+
+ // Result of direct conversion. The assumption here is that the operation is
+ // non-trapping and it's fine to apply it to an out-of-range value if we
+ // select it away later.
+ SDValue FpToInt =
+ DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, DstVT, Src);
+
+ SDValue Select = FpToInt;
+
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+
+ // If Src ULT MinFloat, select MinInt. In particular, this also selects
+ // MinInt if Src is NaN.
+ SDValue ULT = DAG.getSetCC(dl, SetCCVT, Src, MinFloatNode, ISD::SETULT);
+ Select = DAG.getSelect(dl, DstVT, ULT, MinIntNode, Select);
+ // If Src OGT MaxFloat, select MaxInt.
+ SDValue OGT = DAG.getSetCC(dl, SetCCVT, Src, MaxFloatNode, ISD::SETOGT);
+ Select = DAG.getSelect(dl, DstVT, OGT, MaxIntNode, Select);
+
+ // In the unsigned case we are done, because we mapped NaN to MinInt, which
+ // is already zero.
+ if (!IsSigned)
+ return Select;
+
+ // Otherwise, select 0 if Src is NaN.
+ SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
+ SDValue IsNan = DAG.getSetCC(dl, SetCCVT, Src, Src, ISD::CondCode::SETUO);
+ return DAG.getSelect(dl, DstVT, IsNan, ZeroInt, Select);
+}
+
+SDValue TargetLowering::expandVectorSplice(SDNode *Node,
+ SelectionDAG &DAG) const {
+ assert(Node->getOpcode() == ISD::VECTOR_SPLICE && "Unexpected opcode!");
+ assert(Node->getValueType(0).isScalableVector() &&
+ "Fixed length vector types expected to use SHUFFLE_VECTOR!");
+
+ EVT VT = Node->getValueType(0);
+ SDValue V1 = Node->getOperand(0);
+ SDValue V2 = Node->getOperand(1);
+ int64_t Imm = cast<ConstantSDNode>(Node->getOperand(2))->getSExtValue();
+ SDLoc DL(Node);
+
+ // Expand through memory thusly:
+ // Alloca CONCAT_VECTORS_TYPES(V1, V2) Ptr
+ // Store V1, Ptr
+ // Store V2, Ptr + sizeof(V1)
+ // If (Imm < 0)
+ // TrailingElts = -Imm
+ // Ptr = Ptr + sizeof(V1) - (TrailingElts * sizeof(VT.Elt))
+ // else
+ // Ptr = Ptr + (Imm * sizeof(VT.Elt))
+ // Res = Load Ptr
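+  //
+  // Illustratively, splicing two <vscale x 4 x i32> vectors with Imm = -2
+  // loads from two elements before the end of V1, so the result is the last
+  // two elements of V1 followed by the leading elements of V2.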
+
+ Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+
+ EVT MemVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorElementCount() * 2);
+ SDValue StackPtr = DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
+ EVT PtrVT = StackPtr.getValueType();
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ // Store the lo part of CONCAT_VECTORS(V1, V2)
+ SDValue StoreV1 = DAG.getStore(DAG.getEntryNode(), DL, V1, StackPtr, PtrInfo);
+ // Store the hi part of CONCAT_VECTORS(V1, V2)
+ SDValue OffsetToV2 = DAG.getVScale(
+ DL, PtrVT,
+ APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinValue()));
+ SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, OffsetToV2);
+ SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, StackPtr2, PtrInfo);
+
+ if (Imm >= 0) {
+ // Load back the required element. getVectorElementPointer takes care of
+ // clamping the index if it's out-of-bounds.
+ StackPtr = getVectorElementPointer(DAG, StackPtr, VT, Node->getOperand(2));
+ // Load the spliced result
+ return DAG.getLoad(VT, DL, StoreV2, StackPtr,
+ MachinePointerInfo::getUnknownStack(MF));
+ }
+
+ uint64_t TrailingElts = -Imm;
+
+ // NOTE: TrailingElts must be clamped so as not to read outside of V1:V2.
+ TypeSize EltByteSize = VT.getVectorElementType().getStoreSize();
+ SDValue TrailingBytes =
+ DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT);
+
+ if (TrailingElts > VT.getVectorMinNumElements()) {
+ SDValue VLBytes =
+ DAG.getVScale(DL, PtrVT,
+ APInt(PtrVT.getFixedSizeInBits(),
+ VT.getStoreSize().getKnownMinValue()));
+ TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes);
+ }
+
+ // Calculate the start address of the spliced result.
+ StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes);
+
+ // Load the spliced result
+ return DAG.getLoad(VT, DL, StoreV2, StackPtr2,
+ MachinePointerInfo::getUnknownStack(MF));
+}
+
+bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT,
+ SDValue &LHS, SDValue &RHS,
+ SDValue &CC, SDValue Mask,
+ SDValue EVL, bool &NeedInvert,
+ const SDLoc &dl, SDValue &Chain,
+ bool IsSignaling) const {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT OpVT = LHS.getSimpleValueType();
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+ NeedInvert = false;
+ assert(!EVL == !Mask && "VP Mask and EVL must either both be set or unset");
+ bool IsNonVP = !EVL;
+ switch (TLI.getCondCodeAction(CCCode, OpVT)) {
+ default:
+ llvm_unreachable("Unknown condition code action!");
+ case TargetLowering::Legal:
+ // Nothing to do.
+ break;
+ case TargetLowering::Expand: {
+ ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode);
+ if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(InvCC);
+ return true;
+ }
+ // Swapping operands didn't work. Try inverting the condition.
+ bool NeedSwap = false;
+ InvCC = getSetCCInverse(CCCode, OpVT);
+ if (!TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
+ // If inverting the condition is not enough, try swapping operands
+ // on top of it.
+ InvCC = ISD::getSetCCSwappedOperands(InvCC);
+ NeedSwap = true;
+ }
+ if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
+ CC = DAG.getCondCode(InvCC);
+ NeedInvert = true;
+ if (NeedSwap)
+ std::swap(LHS, RHS);
+ return true;
+ }
+
+ ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
+ unsigned Opc = 0;
+ switch (CCCode) {
+ default:
+ llvm_unreachable("Don't know how to expand this condition!");
+ case ISD::SETUO:
+ if (TLI.isCondCodeLegal(ISD::SETUNE, OpVT)) {
+ CC1 = ISD::SETUNE;
+ CC2 = ISD::SETUNE;
+ Opc = ISD::OR;
+ break;
+ }
+ assert(TLI.isCondCodeLegal(ISD::SETOEQ, OpVT) &&
+ "If SETUE is expanded, SETOEQ or SETUNE must be legal!");
+ NeedInvert = true;
+ [[fallthrough]];
+ case ISD::SETO:
+ assert(TLI.isCondCodeLegal(ISD::SETOEQ, OpVT) &&
+ "If SETO is expanded, SETOEQ must be legal!");
+ CC1 = ISD::SETOEQ;
+ CC2 = ISD::SETOEQ;
+ Opc = ISD::AND;
+ break;
+ case ISD::SETONE:
+ case ISD::SETUEQ:
+ // If the SETUO or SETO CC isn't legal, we might be able to use
+ // SETOGT || SETOLT, inverting the result for SETUEQ. We only need one
+ // of SETOGT/SETOLT to be legal, the other can be emulated by swapping
+ // the operands.
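+      // Illustratively: SETONE x, y  ==>  (x SETOGT y) OR (x SETOLT y),
+      // and SETUEQ is the logical inverse of that result.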
+ CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO;
+ if (!TLI.isCondCodeLegal(CC2, OpVT) &&
+ (TLI.isCondCodeLegal(ISD::SETOGT, OpVT) ||
+ TLI.isCondCodeLegal(ISD::SETOLT, OpVT))) {
+ CC1 = ISD::SETOGT;
+ CC2 = ISD::SETOLT;
+ Opc = ISD::OR;
+ NeedInvert = ((unsigned)CCCode & 0x8U);
+ break;
+ }
+ [[fallthrough]];
+ case ISD::SETOEQ:
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETUNE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ // If we are floating point, assign and break, otherwise fall through.
+ if (!OpVT.isInteger()) {
+ // We can use the 4th bit to tell if we are the unordered
+ // or ordered version of the opcode.
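+        // Illustratively: SETUGT  ==>  (LHS SETGT RHS) OR  (LHS SETUO RHS),
+        //                 SETOGT  ==>  (LHS SETGT RHS) AND (LHS SETO  RHS).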
+ CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO;
+ Opc = ((unsigned)CCCode & 0x8U) ? ISD::OR : ISD::AND;
+ CC1 = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10);
+ break;
+ }
+      // Fall through if this is an unsigned integer comparison.
+ [[fallthrough]];
+ case ISD::SETLE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETLT:
+ case ISD::SETNE:
+ case ISD::SETEQ:
+ // If all combinations of inverting the condition and swapping operands
+ // didn't work then we have no means to expand the condition.
+ llvm_unreachable("Don't know how to expand this condition!");
+ }
+
+ SDValue SetCC1, SetCC2;
+ if (CCCode != ISD::SETO && CCCode != ISD::SETUO) {
+      // If we aren't the ordered or unordered operation,
+ // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS).
+ if (IsNonVP) {
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1, Chain, IsSignaling);
+ SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2, Chain, IsSignaling);
+ } else {
+ SetCC1 = DAG.getSetCCVP(dl, VT, LHS, RHS, CC1, Mask, EVL);
+ SetCC2 = DAG.getSetCCVP(dl, VT, LHS, RHS, CC2, Mask, EVL);
+ }
+ } else {
+ // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS)
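+      // For example, when SETUNE is legal, SETUO x, y becomes
+      //   (x SETUNE x) OR (y SETUNE y), i.e. "x is NaN or y is NaN".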
+ if (IsNonVP) {
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1, Chain, IsSignaling);
+ SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2, Chain, IsSignaling);
+ } else {
+ SetCC1 = DAG.getSetCCVP(dl, VT, LHS, LHS, CC1, Mask, EVL);
+ SetCC2 = DAG.getSetCCVP(dl, VT, RHS, RHS, CC2, Mask, EVL);
+ }
+ }
+ if (Chain)
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, SetCC1.getValue(1),
+ SetCC2.getValue(1));
+ if (IsNonVP)
+ LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
+ else {
+ // Transform the binary opcode to the VP equivalent.
+ assert((Opc == ISD::OR || Opc == ISD::AND) && "Unexpected opcode");
+ Opc = Opc == ISD::OR ? ISD::VP_OR : ISD::VP_AND;
+ LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2, Mask, EVL);
+ }
+ RHS = SDValue();
+ CC = SDValue();
+ return true;
+ }
+ }
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ShadowStackGCLowering.cpp
new file mode 100644
index 000000000000..153fe77b8b4a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ShadowStackGCLowering.cpp
@@ -0,0 +1,386 @@
+//===- ShadowStackGCLowering.cpp - Custom lowering for shadow-stack gc ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom lowering code required by the shadow-stack GC
+// strategy.
+//
+// This pass implements the code transformation described in this paper:
+// "Accurate Garbage Collection in an Uncooperative Environment"
+// Fergus Henderson, ISMM, 2002
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include <cassert>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "shadow-stack-gc-lowering"
+
+namespace {
+
+class ShadowStackGCLowering : public FunctionPass {
+  /// Head - The global linked-list head ("llvm_gc_root_chain") that contains
+  /// the chain of GC roots.
+ GlobalVariable *Head = nullptr;
+
+ /// StackEntryTy - Abstract type of a link in the shadow stack.
+ StructType *StackEntryTy = nullptr;
+ StructType *FrameMapTy = nullptr;
+
+ /// Roots - GC roots in the current function. Each is a pair of the
+ /// intrinsic call and its corresponding alloca.
+ std::vector<std::pair<CallInst *, AllocaInst *>> Roots;
+
+public:
+ static char ID;
+
+ ShadowStackGCLowering();
+
+ bool doInitialization(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+
+private:
+ bool IsNullValue(Value *V);
+ Constant *GetFrameMap(Function &F);
+ Type *GetConcreteStackEntryType(Function &F);
+ void CollectRoots(Function &F);
+
+ static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B,
+ Type *Ty, Value *BasePtr, int Idx1,
+ const char *Name);
+ static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B,
+ Type *Ty, Value *BasePtr, int Idx1, int Idx2,
+ const char *Name);
+};
+
+} // end anonymous namespace
+
+char ShadowStackGCLowering::ID = 0;
+char &llvm::ShadowStackGCLoweringID = ShadowStackGCLowering::ID;
+
+INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, DEBUG_TYPE,
+ "Shadow Stack GC Lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ShadowStackGCLowering, DEBUG_TYPE,
+ "Shadow Stack GC Lowering", false, false)
+
+FunctionPass *llvm::createShadowStackGCLoweringPass() {
+  return new ShadowStackGCLowering();
+}
+
+ShadowStackGCLowering::ShadowStackGCLowering() : FunctionPass(ID) {
+ initializeShadowStackGCLoweringPass(*PassRegistry::getPassRegistry());
+}
+
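+/// GetFrameMap - Build the constant frame-map descriptor for function F.
+/// Conceptually the emitted internal global, named "__gc_<function>", has the
+/// layout
+///   { { i32 NumRoots, i32 NumMeta }, [NumMeta x i8*] Metadata }
+/// and the returned constant is a pointer to its first field.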
+Constant *ShadowStackGCLowering::GetFrameMap(Function &F) {
+ // doInitialization creates the abstract type of this value.
+ Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
+
+ // Truncate the ShadowStackDescriptor if some metadata is null.
+ unsigned NumMeta = 0;
+ SmallVector<Constant *, 16> Metadata;
+ for (unsigned I = 0; I != Roots.size(); ++I) {
+ Constant *C = cast<Constant>(Roots[I].first->getArgOperand(1));
+ if (!C->isNullValue())
+ NumMeta = I + 1;
+ Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr));
+ }
+ Metadata.resize(NumMeta);
+
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
+
+ Constant *BaseElts[] = {
+ ConstantInt::get(Int32Ty, Roots.size(), false),
+ ConstantInt::get(Int32Ty, NumMeta, false),
+ };
+
+ Constant *DescriptorElts[] = {
+ ConstantStruct::get(FrameMapTy, BaseElts),
+ ConstantArray::get(ArrayType::get(VoidPtr, NumMeta), Metadata)};
+
+ Type *EltTys[] = {DescriptorElts[0]->getType(), DescriptorElts[1]->getType()};
+ StructType *STy = StructType::create(EltTys, "gc_map." + utostr(NumMeta));
+
+ Constant *FrameMap = ConstantStruct::get(STy, DescriptorElts);
+
+ // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims? Seems
+ // that, short of multithreaded LLVM, it should be safe; all that is
+ // necessary is that a simple Module::iterator loop not be invalidated.
+ // Appending to the GlobalVariable list is safe in that sense.
+ //
+ // All of the output passes emit globals last. The ExecutionEngine
+ // explicitly supports adding globals to the module after
+ // initialization.
+ //
+ // Still, if it isn't deemed acceptable, then this transformation needs
+ // to be a ModulePass (which means it cannot be in the 'llc' pipeline
+ // (which uses a FunctionPassManager (which segfaults (not asserts) if
+ // provided a ModulePass))).
+ Constant *GV = new GlobalVariable(*F.getParent(), FrameMap->getType(), true,
+ GlobalVariable::InternalLinkage, FrameMap,
+ "__gc_" + F.getName());
+
+ Constant *GEPIndices[2] = {
+ ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
+ ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)};
+ return ConstantExpr::getGetElementPtr(FrameMap->getType(), GV, GEPIndices);
+}
+
+Type *ShadowStackGCLowering::GetConcreteStackEntryType(Function &F) {
+ // doInitialization creates the generic version of this type.
+ std::vector<Type *> EltTys;
+ EltTys.push_back(StackEntryTy);
+ for (const std::pair<CallInst *, AllocaInst *> &Root : Roots)
+ EltTys.push_back(Root.second->getAllocatedType());
+
+ return StructType::create(EltTys, ("gc_stackentry." + F.getName()).str());
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now. If
+/// not, exit fast.
+bool ShadowStackGCLowering::doInitialization(Module &M) {
+ bool Active = false;
+ for (Function &F : M) {
+ if (F.hasGC() && F.getGC() == std::string("shadow-stack")) {
+ Active = true;
+ break;
+ }
+ }
+ if (!Active)
+ return false;
+
+ // struct FrameMap {
+ // int32_t NumRoots; // Number of roots in stack frame.
+ // int32_t NumMeta; // Number of metadata descriptors. May be < NumRoots.
+ // void *Meta[]; // May be absent for roots without metadata.
+ // };
+ std::vector<Type *> EltTys;
+ // 32 bits is ok up to a 32GB stack frame. :)
+ EltTys.push_back(Type::getInt32Ty(M.getContext()));
+ // Specifies length of variable length array.
+ EltTys.push_back(Type::getInt32Ty(M.getContext()));
+ FrameMapTy = StructType::create(EltTys, "gc_map");
+ PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
+
+ // struct StackEntry {
+ // ShadowStackEntry *Next; // Caller's stack entry.
+ // FrameMap *Map; // Pointer to constant FrameMap.
+ // void *Roots[]; // Stack roots (in-place array, so we pretend).
+ // };
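+  //
+  // Each function with roots later gets a concrete entry type (see
+  // GetConcreteStackEntryType) that appends its root slots to this header,
+  // e.g. for a function foo with two i8* roots:
+  //   %gc_stackentry.foo = type { %gc_stackentry, i8*, i8* }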
+
+ StackEntryTy = StructType::create(M.getContext(), "gc_stackentry");
+
+ EltTys.clear();
+ EltTys.push_back(PointerType::getUnqual(StackEntryTy));
+ EltTys.push_back(FrameMapPtrTy);
+ StackEntryTy->setBody(EltTys);
+ PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
+
+ // Get the root chain if it already exists.
+ Head = M.getGlobalVariable("llvm_gc_root_chain");
+ if (!Head) {
+ // If the root chain does not exist, insert a new one with linkonce
+ // linkage!
+ Head = new GlobalVariable(
+ M, StackEntryPtrTy, false, GlobalValue::LinkOnceAnyLinkage,
+ Constant::getNullValue(StackEntryPtrTy), "llvm_gc_root_chain");
+ } else if (Head->hasExternalLinkage() && Head->isDeclaration()) {
+ Head->setInitializer(Constant::getNullValue(StackEntryPtrTy));
+ Head->setLinkage(GlobalValue::LinkOnceAnyLinkage);
+ }
+
+ return true;
+}
+
+bool ShadowStackGCLowering::IsNullValue(Value *V) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ return C->isNullValue();
+ return false;
+}
+
+void ShadowStackGCLowering::CollectRoots(Function &F) {
+ // FIXME: Account for original alignment. Could fragment the root array.
+ // Approach 1: Null initialize empty slots at runtime. Yuck.
+ // Approach 2: Emit a map of the array instead of just a count.
+
+ assert(Roots.empty() && "Not cleaned up?");
+
+ SmallVector<std::pair<CallInst *, AllocaInst *>, 16> MetaRoots;
+
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(&I))
+ if (Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::gcroot) {
+ std::pair<CallInst *, AllocaInst *> Pair = std::make_pair(
+ CI,
+ cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+ if (IsNullValue(CI->getArgOperand(1)))
+ Roots.push_back(Pair);
+ else
+ MetaRoots.push_back(Pair);
+ }
+
+ // Number roots with metadata (usually empty) at the beginning, so that the
+ // FrameMap::Meta array can be elided.
+ Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end());
+}
+
+GetElementPtrInst *ShadowStackGCLowering::CreateGEP(LLVMContext &Context,
+ IRBuilder<> &B, Type *Ty,
+ Value *BasePtr, int Idx,
+ int Idx2,
+ const char *Name) {
+ Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0),
+ ConstantInt::get(Type::getInt32Ty(Context), Idx),
+ ConstantInt::get(Type::getInt32Ty(Context), Idx2)};
+ Value *Val = B.CreateGEP(Ty, BasePtr, Indices, Name);
+
+ assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+ return dyn_cast<GetElementPtrInst>(Val);
+}
+
+GetElementPtrInst *ShadowStackGCLowering::CreateGEP(LLVMContext &Context,
+ IRBuilder<> &B, Type *Ty, Value *BasePtr,
+ int Idx, const char *Name) {
+ Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0),
+ ConstantInt::get(Type::getInt32Ty(Context), Idx)};
+ Value *Val = B.CreateGEP(Ty, BasePtr, Indices, Name);
+
+ assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+ return dyn_cast<GetElementPtrInst>(Val);
+}
+
+void ShadowStackGCLowering::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+/// runOnFunction - Insert code to maintain the shadow stack.
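+/// Conceptually the inserted code looks like (illustrative pseudo-IR):
+///   entry:
+///     %gc_frame = alloca %gc_stackentry.fn       ; header plus root slots
+///     ...store the frame map and the current @llvm_gc_root_chain head
+///        into %gc_frame...
+///     store %gc_frame, @llvm_gc_root_chain       ; push
+///   every return / unwind path:
+///     ...restore the saved link into @llvm_gc_root_chain...   ; pop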
+bool ShadowStackGCLowering::runOnFunction(Function &F) {
+ // Quick exit for functions that do not use the shadow stack GC.
+ if (!F.hasGC() ||
+ F.getGC() != std::string("shadow-stack"))
+ return false;
+
+ LLVMContext &Context = F.getContext();
+
+ // Find calls to llvm.gcroot.
+ CollectRoots(F);
+
+ // If there are no roots in this function, then there is no need to add a
+ // stack map entry for it.
+ if (Roots.empty())
+ return false;
+
+ std::optional<DomTreeUpdater> DTU;
+ if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DTU.emplace(DTWP->getDomTree(), DomTreeUpdater::UpdateStrategy::Lazy);
+
+ // Build the constant map and figure the type of the shadow stack entry.
+ Value *FrameMap = GetFrameMap(F);
+ Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
+
+ // Build the shadow stack entry at the very start of the function.
+ BasicBlock::iterator IP = F.getEntryBlock().begin();
+ IRBuilder<> AtEntry(IP->getParent(), IP);
+
+ Instruction *StackEntry =
+ AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr, "gc_frame");
+
+ AtEntry.SetInsertPointPastAllocas(&F);
+ IP = AtEntry.GetInsertPoint();
+
+ // Initialize the map pointer and load the current head of the shadow stack.
+ Instruction *CurrentHead =
+ AtEntry.CreateLoad(StackEntryTy->getPointerTo(), Head, "gc_currhead");
+ Instruction *EntryMapPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy,
+ StackEntry, 0, 1, "gc_frame.map");
+ AtEntry.CreateStore(FrameMap, EntryMapPtr);
+
+ // After all the allocas...
+ for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+ // For each root, find the corresponding slot in the aggregate...
+ Value *SlotPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy,
+ StackEntry, 1 + I, "gc_root");
+
+ // And use it in lieu of the alloca.
+ AllocaInst *OriginalAlloca = Roots[I].second;
+ SlotPtr->takeName(OriginalAlloca);
+ OriginalAlloca->replaceAllUsesWith(SlotPtr);
+ }
+
+ // Move past the original stores inserted by GCStrategy::InitRoots. This isn't
+ // really necessary (the collector would never see the intermediate state at
+ // runtime), but it's nicer not to push the half-initialized entry onto the
+ // shadow stack.
+ while (isa<StoreInst>(IP))
+ ++IP;
+ AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+ // Push the entry onto the shadow stack.
+ Instruction *EntryNextPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy,
+ StackEntry, 0, 0, "gc_frame.next");
+ Instruction *NewHeadVal = CreateGEP(Context, AtEntry, ConcreteStackEntryTy,
+ StackEntry, 0, "gc_newhead");
+ AtEntry.CreateStore(CurrentHead, EntryNextPtr);
+ AtEntry.CreateStore(NewHeadVal, Head);
+
+ // For each instruction that escapes...
+ EscapeEnumerator EE(F, "gc_cleanup", /*HandleExceptions=*/true,
+ DTU ? &*DTU : nullptr);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ // Pop the entry from the shadow stack. Don't reuse CurrentHead from
+ // AtEntry, since that would make the value live for the entire function.
+ Instruction *EntryNextPtr2 =
+ CreateGEP(Context, *AtExit, ConcreteStackEntryTy, StackEntry, 0, 0,
+ "gc_frame.next");
+ Value *SavedHead = AtExit->CreateLoad(StackEntryTy->getPointerTo(),
+ EntryNextPtr2, "gc_savedhead");
+ AtExit->CreateStore(SavedHead, Head);
+ }
+
+ // Delete the original allocas (which are no longer used) and the intrinsic
+ // calls (which are no longer valid). Doing this last avoids invalidating
+ // iterators.
+ for (std::pair<CallInst *, AllocaInst *> &Root : Roots) {
+ Root.first->eraseFromParent();
+ Root.second->eraseFromParent();
+ }
+
+ Roots.clear();
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ShrinkWrap.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ShrinkWrap.cpp
new file mode 100644
index 000000000000..4b1d3637a746
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -0,0 +1,997 @@
+//===- ShrinkWrap.cpp - Compute safe point for prolog/epilog insertion ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for safe points where the prologue and epilogue can be
+// inserted.
+// The safe point for the prologue (resp. epilogue) is called Save
+// (resp. Restore).
+// A point is safe for prologue (resp. epilogue) if and only if
+// it 1) dominates (resp. post-dominates) all the frame related operations and
+// between 2) two executions of the Save (resp. Restore) point there is an
+// execution of the Restore (resp. Save) point.
+//
+// For instance, the following points are safe:
+// for (int i = 0; i < 10; ++i) {
+// Save
+// ...
+// Restore
+// }
+// Indeed, the execution looks like Save -> Restore -> Save -> Restore ...
+// And the following points are not:
+// for (int i = 0; i < 10; ++i) {
+// Save
+// ...
+// }
+// for (int i = 0; i < 10; ++i) {
+// ...
+// Restore
+// }
+// Indeed, the execution looks like Save -> Save -> ... -> Restore -> Restore.
+//
+// This pass also ensures that the safe points are 3) cheaper than the regular
+// entry and exits blocks.
+//
+// Property #1 is ensured via the use of MachineDominatorTree and
+// MachinePostDominatorTree.
+// Property #2 is ensured via property #1 and MachineLoopInfo, i.e., both
+// points must be in the same loop.
+// Property #3 is ensured via the MachineBlockFrequencyInfo.
+//
+// If this pass found points matching all these properties, then
+// MachineFrameInfo is updated with this information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "shrink-wrap"
+
+STATISTIC(NumFunc, "Number of functions");
+STATISTIC(NumCandidates, "Number of shrink-wrapping candidates");
+STATISTIC(NumCandidatesDropped,
+ "Number of shrink-wrapping candidates dropped because of frequency");
+
+static cl::opt<cl::boolOrDefault>
+EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden,
+ cl::desc("enable the shrink-wrapping pass"));
+static cl::opt<bool> EnablePostShrinkWrapOpt(
+ "enable-shrink-wrap-region-split", cl::init(true), cl::Hidden,
+ cl::desc("enable splitting of the restore block if possible"));
+
+namespace {
+
+/// Class to determine where the safe points to insert the
+/// prologue and epilogue are.
+/// Unlike the paper from Fred C. Chow, PLDI'88, that introduced the
+/// shrink-wrapping term for prologue/epilogue placement, this pass
+/// does not rely on expensive data-flow analysis. Instead we use the
+/// dominance properties and loop information to decide which points
+/// are safe for such insertion.
+class ShrinkWrap : public MachineFunctionPass {
+ /// Hold callee-saved information.
+ RegisterClassInfo RCI;
+ MachineDominatorTree *MDT = nullptr;
+ MachinePostDominatorTree *MPDT = nullptr;
+
+ /// Current safe point found for the prologue.
+ /// The prologue will be inserted before the first instruction
+ /// in this basic block.
+ MachineBasicBlock *Save = nullptr;
+
+ /// Current safe point found for the epilogue.
+ /// The epilogue will be inserted before the first terminator instruction
+ /// in this basic block.
+ MachineBasicBlock *Restore = nullptr;
+
+ /// Hold the information of the basic block frequency.
+ /// Use to check the profitability of the new points.
+ MachineBlockFrequencyInfo *MBFI = nullptr;
+
+ /// Hold the loop information. Used to determine if Save and Restore
+ /// are in the same loop.
+ MachineLoopInfo *MLI = nullptr;
+
+ // Emit remarks.
+ MachineOptimizationRemarkEmitter *ORE = nullptr;
+
+ /// Frequency of the Entry block.
+ uint64_t EntryFreq = 0;
+
+ /// Current opcode for frame setup.
+ unsigned FrameSetupOpcode = ~0u;
+
+ /// Current opcode for frame destroy.
+ unsigned FrameDestroyOpcode = ~0u;
+
+ /// Stack pointer register, used by llvm.{savestack,restorestack}
+ Register SP;
+
+ /// Entry block.
+ const MachineBasicBlock *Entry = nullptr;
+
+ using SetOfRegs = SmallSetVector<unsigned, 16>;
+
+ /// Registers that need to be saved for the current function.
+ mutable SetOfRegs CurrentCSRs;
+
+ /// Current MachineFunction.
+ MachineFunction *MachineFunc = nullptr;
+
+  /// Is `true` for block numbers where a stack access or a computation of a
+  /// stack-relative address may occur on some CFG path reaching the block,
+  /// including in the block itself.
+ BitVector StackAddressUsedBlockInfo;
+
+ /// Check if \p MI uses or defines a callee-saved register or
+ /// a frame index. If this is the case, this means \p MI must happen
+ /// after Save and before Restore.
+ bool useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS,
+ bool StackAddressUsed) const;
+
+ const SetOfRegs &getCurrentCSRs(RegScavenger *RS) const {
+ if (CurrentCSRs.empty()) {
+ BitVector SavedRegs;
+ const TargetFrameLowering *TFI =
+ MachineFunc->getSubtarget().getFrameLowering();
+
+ TFI->determineCalleeSaves(*MachineFunc, SavedRegs, RS);
+
+ for (int Reg = SavedRegs.find_first(); Reg != -1;
+ Reg = SavedRegs.find_next(Reg))
+ CurrentCSRs.insert((unsigned)Reg);
+ }
+ return CurrentCSRs;
+ }
+
+ /// Update the Save and Restore points such that \p MBB is in
+ /// the region that is dominated by Save and post-dominated by Restore
+ /// and Save and Restore still match the safe point definition.
+ /// Such point may not exist and Save and/or Restore may be null after
+ /// this call.
+ void updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS);
+
+ // Try to find safe point based on dominance and block frequency without
+ // any change in IR.
+ bool performShrinkWrapping(
+ const ReversePostOrderTraversal<MachineBasicBlock *> &RPOT,
+ RegScavenger *RS);
+
+ /// This function tries to split the restore point if doing so can shrink the
+ /// save point further. \return True if restore point is split.
+ bool postShrinkWrapping(bool HasCandidate, MachineFunction &MF,
+ RegScavenger *RS);
+
+ /// This function analyzes if the restore point can split to create a new
+ /// restore point. This function collects
+ /// 1. Any preds of current restore that are reachable by callee save/FI
+ /// blocks
+ /// - indicated by DirtyPreds
+ /// 2. Any preds of current restore that are not DirtyPreds - indicated by
+ /// CleanPreds
+ /// Both sets should be non-empty for considering restore point split.
+ bool checkIfRestoreSplittable(
+ const MachineBasicBlock *CurRestore,
+ const DenseSet<const MachineBasicBlock *> &ReachableByDirty,
+ SmallVectorImpl<MachineBasicBlock *> &DirtyPreds,
+ SmallVectorImpl<MachineBasicBlock *> &CleanPreds,
+ const TargetInstrInfo *TII, RegScavenger *RS);
+
+ /// Initialize the pass for \p MF.
+ void init(MachineFunction &MF) {
+ RCI.runOnMachineFunction(MF);
+ MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ Save = nullptr;
+ Restore = nullptr;
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
+ EntryFreq = MBFI->getEntryFreq();
+ const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+ SP = Subtarget.getTargetLowering()->getStackPointerRegisterToSaveRestore();
+ Entry = &MF.front();
+ CurrentCSRs.clear();
+ MachineFunc = &MF;
+
+ ++NumFunc;
+ }
+
+ /// Check whether or not Save and Restore points are still interesting for
+ /// shrink-wrapping.
+ bool ArePointsInteresting() const { return Save != Entry && Save && Restore; }
+
+ /// Check if shrink wrapping is enabled for this target and function.
+ static bool isShrinkWrapEnabled(const MachineFunction &MF);
+
+public:
+ static char ID;
+
+ ShrinkWrap() : MachineFunctionPass(ID) {
+ initializeShrinkWrapPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ StringRef getPassName() const override { return "Shrink Wrapping analysis"; }
+
+ /// Perform the shrink-wrapping analysis and update
+ /// the MachineFrameInfo attached to \p MF with the results.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char ShrinkWrap::ID = 0;
+
+char &llvm::ShrinkWrapID = ShrinkWrap::ID;
+
+INITIALIZE_PASS_BEGIN(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
+INITIALIZE_PASS_END(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false)
+
+bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS,
+ bool StackAddressUsed) const {
+  /// Check if \p Op is known to access an address not on the function's stack.
+ /// At the moment, accesses where the underlying object is a global, function
+ /// argument, or jump table are considered non-stack accesses. Note that the
+ /// caller's stack may get accessed when passing an argument via the stack,
+ /// but not the stack of the current function.
+ ///
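+  /// For example, a load from a global or through a plain pointer argument
+  /// (one without a byval-style copy) is treated as not touching this
+  /// function's stack, while a load through a pointer of unknown provenance
+  /// is conservatively treated as a possible stack access.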
+ auto IsKnownNonStackPtr = [](MachineMemOperand *Op) {
+ if (Op->getValue()) {
+ const Value *UO = getUnderlyingObject(Op->getValue());
+ if (!UO)
+ return false;
+ if (auto *Arg = dyn_cast<Argument>(UO))
+ return !Arg->hasPassPointeeByValueCopyAttr();
+ return isa<GlobalValue>(UO);
+ }
+ if (const PseudoSourceValue *PSV = Op->getPseudoValue())
+ return PSV->isJumpTable();
+ return false;
+ };
+ // Load/store operations may access the stack indirectly when we previously
+ // computed an address to a stack location.
+ if (StackAddressUsed && MI.mayLoadOrStore() &&
+ (MI.isCall() || MI.hasUnmodeledSideEffects() || MI.memoperands_empty() ||
+ !all_of(MI.memoperands(), IsKnownNonStackPtr)))
+ return true;
+
+ if (MI.getOpcode() == FrameSetupOpcode ||
+ MI.getOpcode() == FrameDestroyOpcode) {
+ LLVM_DEBUG(dbgs() << "Frame instruction: " << MI << '\n');
+ return true;
+ }
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ for (const MachineOperand &MO : MI.operands()) {
+ bool UseOrDefCSR = false;
+ if (MO.isReg()) {
+ // Ignore instructions like DBG_VALUE which don't read/def the register.
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+ Register PhysReg = MO.getReg();
+ if (!PhysReg)
+ continue;
+ assert(PhysReg.isPhysical() && "Unallocated register?!");
+ // The stack pointer is not normally described as a callee-saved register
+ // in calling convention definitions, so we need to watch for it
+ // separately. An SP mentioned by a call instruction, we can ignore,
+ // though, as it's harmless and we do not want to effectively disable tail
+ // calls by forcing the restore point to post-dominate them.
+ // PPC's LR is also not normally described as a callee-saved register in
+ // calling convention definitions, so we need to watch for it, too. An LR
+ // mentioned implicitly by a return (or "branch to link register")
+ // instruction we can ignore, otherwise we may pessimize shrinkwrapping.
+ UseOrDefCSR =
+ (!MI.isCall() && PhysReg == SP) ||
+ RCI.getLastCalleeSavedAlias(PhysReg) ||
+ (!MI.isReturn() && TRI->isNonallocatableRegisterCalleeSave(PhysReg));
+ } else if (MO.isRegMask()) {
+ // Check if this regmask clobbers any of the CSRs.
+ for (unsigned Reg : getCurrentCSRs(RS)) {
+ if (MO.clobbersPhysReg(Reg)) {
+ UseOrDefCSR = true;
+ break;
+ }
+ }
+ }
+ // Skip FrameIndex operands in DBG_VALUE instructions.
+ if (UseOrDefCSR || (MO.isFI() && !MI.isDebugValue())) {
+ LLVM_DEBUG(dbgs() << "Use or define CSR(" << UseOrDefCSR << ") or FI("
+ << MO.isFI() << "): " << MI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Helper function to find the immediate (post) dominator.
+template <typename ListOfBBs, typename DominanceAnalysis>
+static MachineBasicBlock *FindIDom(MachineBasicBlock &Block, ListOfBBs BBs,
+ DominanceAnalysis &Dom, bool Strict = true) {
+ MachineBasicBlock *IDom = &Block;
+ for (MachineBasicBlock *BB : BBs) {
+ IDom = Dom.findNearestCommonDominator(IDom, BB);
+ if (!IDom)
+ break;
+ }
+ if (Strict && IDom == &Block)
+ return nullptr;
+ return IDom;
+}
+
+static bool isAnalyzableBB(const TargetInstrInfo &TII,
+ MachineBasicBlock &Entry) {
+ // Check if the block is analyzable.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ return !TII.analyzeBranch(Entry, TBB, FBB, Cond);
+}
+
+/// Determines if any predecessor of MBB is on a path from a block that has a
+/// use or def of CSRs/FI to MBB.
+/// ReachableByDirty: All blocks reachable from a block that has a use or def
+/// of CSR/FI.
+static bool
+hasDirtyPred(const DenseSet<const MachineBasicBlock *> &ReachableByDirty,
+ const MachineBasicBlock &MBB) {
+ for (const MachineBasicBlock *PredBB : MBB.predecessors())
+ if (ReachableByDirty.count(PredBB))
+ return true;
+ return false;
+}
+
+/// Derives the list of all the basic blocks reachable from MBB.
+static void markAllReachable(DenseSet<const MachineBasicBlock *> &Visited,
+ const MachineBasicBlock &MBB) {
+ SmallVector<MachineBasicBlock *, 4> Worklist(MBB.succ_begin(),
+ MBB.succ_end());
+ Visited.insert(&MBB);
+ while (!Worklist.empty()) {
+ MachineBasicBlock *SuccMBB = Worklist.pop_back_val();
+ if (!Visited.insert(SuccMBB).second)
+ continue;
+ Worklist.append(SuccMBB->succ_begin(), SuccMBB->succ_end());
+ }
+}
+
+/// Collect blocks reachable by use or def of CSRs/FI.
+static void collectBlocksReachableByDirty(
+ const DenseSet<const MachineBasicBlock *> &DirtyBBs,
+ DenseSet<const MachineBasicBlock *> &ReachableByDirty) {
+ for (const MachineBasicBlock *MBB : DirtyBBs) {
+ if (ReachableByDirty.count(MBB))
+ continue;
+    // Mark all of its descendants as reachable.
+ markAllReachable(ReachableByDirty, *MBB);
+ }
+}
+
+/// \return true if there is a clean path from SavePoint to the original
+/// Restore.
+static bool
+isSaveReachableThroughClean(const MachineBasicBlock *SavePoint,
+ ArrayRef<MachineBasicBlock *> CleanPreds) {
+ DenseSet<const MachineBasicBlock *> Visited;
+ SmallVector<MachineBasicBlock *, 4> Worklist(CleanPreds.begin(),
+ CleanPreds.end());
+ while (!Worklist.empty()) {
+ MachineBasicBlock *CleanBB = Worklist.pop_back_val();
+ if (CleanBB == SavePoint)
+ return true;
+ if (!Visited.insert(CleanBB).second || !CleanBB->pred_size())
+ continue;
+ Worklist.append(CleanBB->pred_begin(), CleanBB->pred_end());
+ }
+ return false;
+}
+
+/// This function updates the branches post restore point split.
+///
+/// Restore point has been split.
+/// Old restore point: MBB
+/// New restore point: NMBB
+/// Any basic block (say BBToUpdate) which previously fell through to MBB
+/// should
+/// 1. Fall through to NMBB iff NMBB is inserted immediately above MBB in the
+/// block layout OR
+/// 2. Branch unconditionally to NMBB iff NMBB is inserted at any other place.
+static void updateTerminator(MachineBasicBlock *BBToUpdate,
+ MachineBasicBlock *NMBB,
+ const TargetInstrInfo *TII) {
+ DebugLoc DL = BBToUpdate->findBranchDebugLoc();
+  // If NMBB isn't the new layout successor for BBToUpdate, insert an
+  // unconditional branch to it.
+ if (!BBToUpdate->isLayoutSuccessor(NMBB))
+ TII->insertUnconditionalBranch(*BBToUpdate, NMBB, DL);
+}
+
+/// This function splits the restore point and returns new restore point/BB.
+///
+/// DirtyPreds: Predecessors of \p MBB that are ReachableByDirty
+///
+/// Decision has been made to split the restore point.
+/// old restore point: \p MBB
+/// new restore point: \p NMBB
+/// This function makes the necessary block layout changes so that
+/// 1. \p NMBB points to \p MBB unconditionally
+/// 2. All dirtyPreds that previously pointed to \p MBB point to \p NMBB
+static MachineBasicBlock *
+tryToSplitRestore(MachineBasicBlock *MBB,
+ ArrayRef<MachineBasicBlock *> DirtyPreds,
+ const TargetInstrInfo *TII) {
+ MachineFunction *MF = MBB->getParent();
+
+  // Get the list of DirtyPreds which have a fallthrough to MBB
+  // before the block layout change. This is just to ensure that if NMBB is
+  // inserted after MBB, then we create an unconditional branch from
+  // DirtyPred/CleanPred to NMBB.
+ SmallPtrSet<MachineBasicBlock *, 8> MBBFallthrough;
+ for (MachineBasicBlock *BB : DirtyPreds)
+ if (BB->getFallThrough(false) == MBB)
+ MBBFallthrough.insert(BB);
+
+ MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
+ // Insert this block at the end of the function. Inserting in between may
+ // interfere with control flow optimizer decisions.
+ MF->insert(MF->end(), NMBB);
+
+ for (const MachineBasicBlock::RegisterMaskPair &LI : MBB->liveins())
+ NMBB->addLiveIn(LI.PhysReg);
+
+ TII->insertUnconditionalBranch(*NMBB, MBB, DebugLoc());
+
+ // After splitting, all predecessors of the restore point should be dirty
+ // blocks.
+ for (MachineBasicBlock *SuccBB : DirtyPreds)
+ SuccBB->ReplaceUsesOfBlockWith(MBB, NMBB);
+
+ NMBB->addSuccessor(MBB);
+
+ for (MachineBasicBlock *BBToUpdate : MBBFallthrough)
+ updateTerminator(BBToUpdate, NMBB, TII);
+
+ return NMBB;
+}
+
+/// This function undoes the restore point split done earlier.
+///
+/// DirtyPreds: All predecessors of \p NMBB that are ReachableByDirty.
+///
+/// Restore point was split and the change needs to be unrolled. Make necessary
+/// changes to reset restore point from \p NMBB to \p MBB.
+static void rollbackRestoreSplit(MachineFunction &MF, MachineBasicBlock *NMBB,
+ MachineBasicBlock *MBB,
+ ArrayRef<MachineBasicBlock *> DirtyPreds,
+ const TargetInstrInfo *TII) {
+  // For a BB, if NMBB is its fallthrough in the current layout, then in the
+  // new layout (a) BB should fall through to MBB, OR (b) BB should branch
+  // unconditionally to MBB.
+ SmallPtrSet<MachineBasicBlock *, 8> NMBBFallthrough;
+ for (MachineBasicBlock *BB : DirtyPreds)
+ if (BB->getFallThrough(false) == NMBB)
+ NMBBFallthrough.insert(BB);
+
+ NMBB->removeSuccessor(MBB);
+ for (MachineBasicBlock *SuccBB : DirtyPreds)
+ SuccBB->ReplaceUsesOfBlockWith(NMBB, MBB);
+
+ NMBB->erase(NMBB->begin(), NMBB->end());
+ NMBB->eraseFromParent();
+
+ for (MachineBasicBlock *BBToUpdate : NMBBFallthrough)
+ updateTerminator(BBToUpdate, MBB, TII);
+}
+
+// A block is deemed fit for a restore point split iff there exist both
+// 1. DirtyPreds - preds of CurRestore reachable from a use or def of CSR/FI
+// 2. CleanPreds - preds of CurRestore that aren't DirtyPreds
+bool ShrinkWrap::checkIfRestoreSplittable(
+ const MachineBasicBlock *CurRestore,
+ const DenseSet<const MachineBasicBlock *> &ReachableByDirty,
+ SmallVectorImpl<MachineBasicBlock *> &DirtyPreds,
+ SmallVectorImpl<MachineBasicBlock *> &CleanPreds,
+ const TargetInstrInfo *TII, RegScavenger *RS) {
+ for (const MachineInstr &MI : *CurRestore)
+ if (useOrDefCSROrFI(MI, RS, /*StackAddressUsed=*/true))
+ return false;
+
+ for (MachineBasicBlock *PredBB : CurRestore->predecessors()) {
+ if (!isAnalyzableBB(*TII, *PredBB))
+ return false;
+
+ if (ReachableByDirty.count(PredBB))
+ DirtyPreds.push_back(PredBB);
+ else
+ CleanPreds.push_back(PredBB);
+ }
+
+ return !(CleanPreds.empty() || DirtyPreds.empty());
+}
+
+bool ShrinkWrap::postShrinkWrapping(bool HasCandidate, MachineFunction &MF,
+ RegScavenger *RS) {
+ if (!EnablePostShrinkWrapOpt)
+ return false;
+
+ MachineBasicBlock *InitSave = nullptr;
+ MachineBasicBlock *InitRestore = nullptr;
+
+ if (HasCandidate) {
+ InitSave = Save;
+ InitRestore = Restore;
+ } else {
+ InitRestore = nullptr;
+ InitSave = &MF.front();
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.isEHFuncletEntry())
+ return false;
+ if (MBB.isReturnBlock()) {
+ // Do not support multiple restore points.
+ if (InitRestore)
+ return false;
+ InitRestore = &MBB;
+ }
+ }
+ }
+
+ if (!InitSave || !InitRestore || InitRestore == InitSave ||
+ !MDT->dominates(InitSave, InitRestore) ||
+ !MPDT->dominates(InitRestore, InitSave))
+ return false;
+
+  // Bail out of the optimization if any basic block is the target of an
+  // INLINEASM_BR instruction.
+ for (MachineBasicBlock &MBB : MF)
+ if (MBB.isInlineAsmBrIndirectTarget())
+ return false;
+
+ DenseSet<const MachineBasicBlock *> DirtyBBs;
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.isEHPad()) {
+ DirtyBBs.insert(&MBB);
+ continue;
+ }
+ for (const MachineInstr &MI : MBB)
+ if (useOrDefCSROrFI(MI, RS, /*StackAddressUsed=*/true)) {
+ DirtyBBs.insert(&MBB);
+ break;
+ }
+ }
+
+ // Find blocks reachable from the use or def of CSRs/FI.
+ DenseSet<const MachineBasicBlock *> ReachableByDirty;
+ collectBlocksReachableByDirty(DirtyBBs, ReachableByDirty);
+
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SmallVector<MachineBasicBlock *, 2> DirtyPreds;
+ SmallVector<MachineBasicBlock *, 2> CleanPreds;
+ if (!checkIfRestoreSplittable(InitRestore, ReachableByDirty, DirtyPreds,
+ CleanPreds, TII, RS))
+ return false;
+
+  // Try to find a new save point which dominates all dirty blocks.
+ MachineBasicBlock *NewSave =
+ FindIDom<>(**DirtyPreds.begin(), DirtyPreds, *MDT, false);
+
+  while (NewSave && (hasDirtyPred(ReachableByDirty, *NewSave) ||
+                     EntryFreq < MBFI->getBlockFreq(NewSave).getFrequency() ||
+                     /* The entry block's frequency has been observed to be
+                        higher than a loop block's in some cases, so also
+                        reject any block inside a loop explicitly. */
+                     MLI->getLoopFor(NewSave)))
+ NewSave = FindIDom<>(**NewSave->pred_begin(), NewSave->predecessors(), *MDT,
+ false);
+
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ if (!NewSave || NewSave == InitSave ||
+ isSaveReachableThroughClean(NewSave, CleanPreds) ||
+ !TFI->canUseAsPrologue(*NewSave))
+ return false;
+
+ // Now we know that splitting a restore point can isolate the restore point
+ // from clean blocks and doing so can shrink the save point.
+ MachineBasicBlock *NewRestore =
+ tryToSplitRestore(InitRestore, DirtyPreds, TII);
+
+  // Make sure the new restore point is valid as an epilogue for the target.
+ if (!TFI->canUseAsEpilogue(*NewRestore)) {
+ rollbackRestoreSplit(MF, NewRestore, InitRestore, DirtyPreds, TII);
+ return false;
+ }
+
+ Save = NewSave;
+ Restore = NewRestore;
+
+ MDT->runOnMachineFunction(MF);
+ MPDT->runOnMachineFunction(MF);
+
+ assert((MDT->dominates(Save, Restore) && MPDT->dominates(Restore, Save)) &&
+ "Incorrect save or restore point due to dominance relations");
+ assert((!MLI->getLoopFor(Save) && !MLI->getLoopFor(Restore)) &&
+ "Unexpected save or restore point in a loop");
+ assert((EntryFreq >= MBFI->getBlockFreq(Save).getFrequency() &&
+ EntryFreq >= MBFI->getBlockFreq(Restore).getFrequency()) &&
+ "Incorrect save or restore point based on block frequency");
+ return true;
+}
+
+void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
+ RegScavenger *RS) {
+ // Get rid of the easy cases first.
+ if (!Save)
+ Save = &MBB;
+ else
+ Save = MDT->findNearestCommonDominator(Save, &MBB);
+ assert(Save);
+
+ if (!Restore)
+ Restore = &MBB;
+ else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it
+ // means the block never returns. If that's the
+ // case, we don't want to call
+ // `findNearestCommonDominator`, which will
+ // return `Restore`.
+ Restore = MPDT->findNearestCommonDominator(Restore, &MBB);
+ else
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
+
+ // Make sure we would be able to insert the restore code before the
+ // terminator.
+ if (Restore == &MBB) {
+ for (const MachineInstr &Terminator : MBB.terminators()) {
+ if (!useOrDefCSROrFI(Terminator, RS, /*StackAddressUsed=*/true))
+ continue;
+      // One of the terminators needs to happen before the restore point.
+ if (MBB.succ_empty()) {
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
+ break;
+ }
+ // Look for a restore point that post-dominates all the successors.
+ // The immediate post-dominator is what we are looking for.
+ Restore = FindIDom<>(*Restore, Restore->successors(), *MPDT);
+ break;
+ }
+ }
+
+ if (!Restore) {
+ LLVM_DEBUG(
+ dbgs() << "Restore point needs to be spanned on several blocks\n");
+ return;
+ }
+
+ // Make sure Save and Restore are suitable for shrink-wrapping:
+  // 1. all paths from Save need to lead to Restore before exiting.
+  // 2. all paths to Restore need to go through Save from Entry.
+ // We achieve that by making sure that:
+ // A. Save dominates Restore.
+ // B. Restore post-dominates Save.
+ // C. Save and Restore are in the same loop.
+ bool SaveDominatesRestore = false;
+ bool RestorePostDominatesSave = false;
+ while (Restore &&
+ (!(SaveDominatesRestore = MDT->dominates(Save, Restore)) ||
+ !(RestorePostDominatesSave = MPDT->dominates(Restore, Save)) ||
+ // Post-dominance is not enough in loops to ensure that all uses/defs
+ // are after the prologue and before the epilogue at runtime.
+ // E.g.,
+ // while(1) {
+ // Save
+ // Restore
+ // if (...)
+ // break;
+ // use/def CSRs
+ // }
+ // All the uses/defs of CSRs are dominated by Save and post-dominated
+ // by Restore. However, the CSRs uses are still reachable after
+ // Restore and before Save are executed.
+ //
+ // For now, just push the restore/save points outside of loops.
+ // FIXME: Refine the criteria to still find interesting cases
+ // for loops.
+ MLI->getLoopFor(Save) || MLI->getLoopFor(Restore))) {
+ // Fix (A).
+ if (!SaveDominatesRestore) {
+ Save = MDT->findNearestCommonDominator(Save, Restore);
+ continue;
+ }
+ // Fix (B).
+ if (!RestorePostDominatesSave)
+ Restore = MPDT->findNearestCommonDominator(Restore, Save);
+
+ // Fix (C).
+ if (Restore && (MLI->getLoopFor(Save) || MLI->getLoopFor(Restore))) {
+ if (MLI->getLoopDepth(Save) > MLI->getLoopDepth(Restore)) {
+ // Push Save outside of this loop if immediate dominator is different
+ // from save block. If immediate dominator is not different, bail out.
+ Save = FindIDom<>(*Save, Save->predecessors(), *MDT);
+ if (!Save)
+ break;
+ } else {
+ // If the loop does not exit, there is no point in looking
+ // for a post-dominator outside the loop.
+ SmallVector<MachineBasicBlock*, 4> ExitBlocks;
+ MLI->getLoopFor(Restore)->getExitingBlocks(ExitBlocks);
+ // Push Restore outside of this loop.
+ // Look for the immediate post-dominator of the loop exits.
+ MachineBasicBlock *IPdom = Restore;
+ for (MachineBasicBlock *LoopExitBB: ExitBlocks) {
+ IPdom = FindIDom<>(*IPdom, LoopExitBB->successors(), *MPDT);
+ if (!IPdom)
+ break;
+ }
+ // If the immediate post-dominator is not in a less nested loop,
+ // then we are stuck in a program with an infinite loop.
+ // In that case, we will not find a safe point, hence, bail out.
+ if (IPdom && MLI->getLoopDepth(IPdom) < MLI->getLoopDepth(Restore))
+ Restore = IPdom;
+ else {
+ Restore = nullptr;
+ break;
+ }
+ }
+ }
+ }
+}
+
+static bool giveUpWithRemarks(MachineOptimizationRemarkEmitter *ORE,
+ StringRef RemarkName, StringRef RemarkMessage,
+ const DiagnosticLocation &Loc,
+ const MachineBasicBlock *MBB) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(DEBUG_TYPE, RemarkName, Loc, MBB)
+ << RemarkMessage;
+ });
+
+ LLVM_DEBUG(dbgs() << RemarkMessage << '\n');
+ return false;
+}
+
+bool ShrinkWrap::performShrinkWrapping(
+ const ReversePostOrderTraversal<MachineBasicBlock *> &RPOT,
+ RegScavenger *RS) {
+ for (MachineBasicBlock *MBB : RPOT) {
+ LLVM_DEBUG(dbgs() << "Look into: " << printMBBReference(*MBB) << '\n');
+
+ if (MBB->isEHFuncletEntry())
+ return giveUpWithRemarks(ORE, "UnsupportedEHFunclets",
+ "EH Funclets are not supported yet.",
+ MBB->front().getDebugLoc(), MBB);
+
+ if (MBB->isEHPad() || MBB->isInlineAsmBrIndirectTarget()) {
+ // Push the prologue and epilogue outside of the region that may throw (or
+ // jump out via inlineasm_br), by making sure that all the landing pads
+ // are at least at the boundary of the save and restore points. The
+ // problem is that a basic block can jump out from the middle in these
+ // cases, which we do not handle.
+ updateSaveRestorePoints(*MBB, RS);
+ if (!ArePointsInteresting()) {
+ LLVM_DEBUG(dbgs() << "EHPad/inlineasm_br prevents shrink-wrapping\n");
+ return false;
+ }
+ continue;
+ }
+
+ bool StackAddressUsed = false;
+ // Check if we found any stack accesses in the predecessors. We are not
+ // doing a full dataflow analysis here to keep things simple but just
+    // rely on a reverse post-order traversal (RPOT) to guarantee predecessors
+ // are already processed except for loops (and accept the conservative
+ // result for loops).
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (StackAddressUsedBlockInfo.test(Pred->getNumber())) {
+ StackAddressUsed = true;
+ break;
+ }
+ }
+
+ for (const MachineInstr &MI : *MBB) {
+ if (useOrDefCSROrFI(MI, RS, StackAddressUsed)) {
+ // Save (resp. restore) point must dominate (resp. post dominate)
+ // MI. Look for the proper basic block for those.
+ updateSaveRestorePoints(*MBB, RS);
+ // If we are at a point where we cannot improve the placement of
+ // save/restore instructions, just give up.
+ if (!ArePointsInteresting()) {
+ LLVM_DEBUG(dbgs() << "No Shrink wrap candidate found\n");
+ return false;
+ }
+ // No need to look for other instructions, this basic block
+ // will already be part of the handled region.
+ StackAddressUsed = true;
+ break;
+ }
+ }
+ StackAddressUsedBlockInfo[MBB->getNumber()] = StackAddressUsed;
+ }
+ if (!ArePointsInteresting()) {
+ // If the points are not interesting at this point, then they must be null
+ // because it means we did not encounter any frame/CSR related code.
+ // Otherwise, we would have returned from the previous loop.
+ assert(!Save && !Restore && "We miss a shrink-wrap opportunity?!");
+ LLVM_DEBUG(dbgs() << "Nothing to shrink-wrap\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "\n ** Results **\nFrequency of the Entry: " << EntryFreq
+ << '\n');
+
+ const TargetFrameLowering *TFI =
+ MachineFunc->getSubtarget().getFrameLowering();
+ do {
+ LLVM_DEBUG(dbgs() << "Shrink wrap candidates (#, Name, Freq):\nSave: "
+ << printMBBReference(*Save) << ' '
+ << MBFI->getBlockFreq(Save).getFrequency()
+ << "\nRestore: " << printMBBReference(*Restore) << ' '
+ << MBFI->getBlockFreq(Restore).getFrequency() << '\n');
+
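+ // Keep the current candidates only if both points are at most as frequent as
+ // the entry block and the target can emit the prologue in Save and the
+ // epilogue in Restore; otherwise hoist the offending point and try again.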
+ bool IsSaveCheap, TargetCanUseSaveAsPrologue = false;
+ if (((IsSaveCheap = EntryFreq >= MBFI->getBlockFreq(Save).getFrequency()) &&
+ EntryFreq >= MBFI->getBlockFreq(Restore).getFrequency()) &&
+ ((TargetCanUseSaveAsPrologue = TFI->canUseAsPrologue(*Save)) &&
+ TFI->canUseAsEpilogue(*Restore)))
+ break;
+ LLVM_DEBUG(
+ dbgs() << "New points are too expensive or invalid for the target\n");
+ MachineBasicBlock *NewBB;
+ if (!IsSaveCheap || !TargetCanUseSaveAsPrologue) {
+ Save = FindIDom<>(*Save, Save->predecessors(), *MDT);
+ if (!Save)
+ break;
+ NewBB = Save;
+ } else {
+ // Restore is expensive.
+ Restore = FindIDom<>(*Restore, Restore->successors(), *MPDT);
+ if (!Restore)
+ break;
+ NewBB = Restore;
+ }
+ updateSaveRestorePoints(*NewBB, RS);
+ } while (Save && Restore);
+
+ if (!ArePointsInteresting()) {
+ ++NumCandidatesDropped;
+ return false;
+ }
+ return true;
+}
+
+bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+ init(MF);
+
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+ if (containsIrreducibleCFG<MachineBasicBlock *>(RPOT, *MLI)) {
+ // If MF is irreducible, a block may be in a loop without
+ // MachineLoopInfo reporting it. I.e., we may use the
+ // post-dominance property in loops, which leads to incorrect
+ // results. Moreover, we may miss that the prologue and
+ // epilogue are not in the same loop, leading to unbalanced
+ // construction/deconstruction of the stack frame.
+ return giveUpWithRemarks(ORE, "UnsupportedIrreducibleCFG",
+ "Irreducible CFGs are not supported yet.",
+ MF.getFunction().getSubprogram(), &MF.front());
+ }
+
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ std::unique_ptr<RegScavenger> RS(
+ TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr);
+
+ bool Changed = false;
+
+ StackAddressUsedBlockInfo.resize(MF.getNumBlockIDs(), true);
+ bool HasCandidate = performShrinkWrapping(RPOT, RS.get());
+ StackAddressUsedBlockInfo.clear();
+ Changed = postShrinkWrapping(HasCandidate, MF, RS.get());
+ if (!HasCandidate && !Changed)
+ return false;
+ if (!ArePointsInteresting())
+ return Changed;
+
+ LLVM_DEBUG(dbgs() << "Final shrink wrap candidates:\nSave: "
+ << printMBBReference(*Save) << ' '
+ << "\nRestore: " << printMBBReference(*Restore) << '\n');
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setSavePoint(Save);
+ MFI.setRestorePoint(Restore);
+ ++NumCandidates;
+ return Changed;
+}
+
+bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+
+ switch (EnableShrinkWrapOpt) {
+ case cl::BOU_UNSET:
+ return TFI->enableShrinkWrapping(MF) &&
+ // Windows with CFI has some limitations that make it impossible
+ // to use shrink-wrapping.
+ !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+ // Sanitizers look at the value of the stack at the location
+ // of the crash. Since a crash can happen anywhere, the
+ // frame must be lowered before anything else happen for the
+ // sanitizers to be able to get a correct stack frame.
+ !(MF.getFunction().hasFnAttribute(Attribute::SanitizeAddress) ||
+ MF.getFunction().hasFnAttribute(Attribute::SanitizeThread) ||
+ MF.getFunction().hasFnAttribute(Attribute::SanitizeMemory) ||
+ MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress));
+ // If EnableShrinkWrap is set, it takes precedence over whatever the
+ // target sets. The rationale is that we assume we want to test
+ // something related to shrink-wrapping.
+ case cl::BOU_TRUE:
+ return true;
+ case cl::BOU_FALSE:
+ return false;
+ }
+ llvm_unreachable("Invalid shrink-wrapping state");
+}
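For context, the typical beneficiary of this pass is a function with a cheap early exit: the fast path touches neither the frame nor a callee-saved register, so the save point can sink into the slow path and the restore point to its post-dominator. A minimal sketch of such a source function (hypothetical example, not part of this change):

    // Only the slow path needs a stack frame; shrink-wrapping lets the early
    // return skip the prologue/epilogue entirely.
    int getOrCompute(int Key, int *Cache) {
      if (Cache[Key] != 0)
        return Cache[Key];        // fast path: no frame or CSR use
      int Tmp[32];                // slow path: stack use forces a frame
      for (int I = 0; I != 32; ++I)
        Tmp[I] = Key + I;
      return Cache[Key] = Tmp[Key & 31];
    }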
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SjLjEHPrepare.cpp
new file mode 100644
index 000000000000..d09953e76a80
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -0,0 +1,507 @@
+//===- SjLjEHPrepare.cpp - Eliminate Invoke & Unwind instructions ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which use SjLj
+// based exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sjljehprepare"
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+STATISTIC(NumSpilled, "Number of registers live across unwind edges");
+
+namespace {
+class SjLjEHPrepare : public FunctionPass {
+ IntegerType *DataTy = nullptr;
+ Type *doubleUnderDataTy = nullptr;
+ Type *doubleUnderJBufTy = nullptr;
+ Type *FunctionContextTy = nullptr;
+ FunctionCallee RegisterFn;
+ FunctionCallee UnregisterFn;
+ Function *BuiltinSetupDispatchFn = nullptr;
+ Function *FrameAddrFn = nullptr;
+ Function *StackAddrFn = nullptr;
+ Function *StackRestoreFn = nullptr;
+ Function *LSDAAddrFn = nullptr;
+ Function *CallSiteFn = nullptr;
+ Function *FuncCtxFn = nullptr;
+ AllocaInst *FuncCtx = nullptr;
+ const TargetMachine *TM = nullptr;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit SjLjEHPrepare(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {}
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {}
+ StringRef getPassName() const override {
+ return "SJLJ Exception Handling preparation";
+ }
+
+private:
+ bool setupEntryBlockAndCallSites(Function &F);
+ void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
+ Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
+ void lowerIncomingArguments(Function &F);
+ void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst *> Invokes);
+ void insertCallSiteStore(Instruction *I, int Number);
+};
+} // end anonymous namespace
+
+char SjLjEHPrepare::ID = 0;
+INITIALIZE_PASS(SjLjEHPrepare, DEBUG_TYPE, "Prepare SjLj exceptions",
+ false, false)
+
+// Public Interface To the SjLjEHPrepare pass.
+FunctionPass *llvm::createSjLjEHPreparePass(const TargetMachine *TM) {
+ return new SjLjEHPrepare(TM);
+}
+
+// doInitialization - Set up declarations and types needed to process
+// exceptions.
+bool SjLjEHPrepare::doInitialization(Module &M) {
+ // Build the function context structure.
+ // builtin_setjmp uses a five word jbuf
+ Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
+ unsigned DataBits =
+ TM ? TM->getSjLjDataSize() : TargetMachine::DefaultSjLjDataSize;
+ DataTy = Type::getIntNTy(M.getContext(), DataBits);
+ doubleUnderDataTy = ArrayType::get(DataTy, 4);
+ doubleUnderJBufTy = ArrayType::get(VoidPtrTy, 5);
+ FunctionContextTy = StructType::get(VoidPtrTy, // __prev
+ DataTy, // call_site
+ doubleUnderDataTy, // __data
+ VoidPtrTy, // __personality
+ VoidPtrTy, // __lsda
+ doubleUnderJBufTy // __jbuf
+ );
+
+ return true;
+}
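+// For reference, the layout built above corresponds roughly to the C struct
+// that the SjLj runtime walks (the field names below are illustrative, not
+// taken from this change):
+//
+//   struct FunctionContext {
+//     void *prev;          // __prev: chain of registered contexts
+//     uintN_t call_site;   // index of the active call site (N = SjLj data size)
+//     uintN_t data[4];     // __data: exception pointer and selector
+//     void *personality;   // __personality
+//     void *lsda;          // __lsda
+//     void *jbuf[5];       // __jbuf: fp stored in [0], sp in [2], the rest is
+//                          // filled in by the target's setup_dispatch lowering
+//   };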
+
+/// insertCallSiteStore - Insert a store of the call-site value to the
+/// function context
+void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) {
+ IRBuilder<> Builder(I);
+
+ // Get a reference to the call_site field.
+ Type *Int32Ty = Type::getInt32Ty(I->getContext());
+ Value *Zero = ConstantInt::get(Int32Ty, 0);
+ Value *One = ConstantInt::get(Int32Ty, 1);
+ Value *Idxs[2] = { Zero, One };
+ Value *CallSite =
+ Builder.CreateGEP(FunctionContextTy, FuncCtx, Idxs, "call_site");
+
+ // Insert a store of the call-site number
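+ // The store is volatile so later passes can neither delete it nor move it
+ // across the call; the unwinder reads this slot at runtime to select the
+ // landing pad.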
+ ConstantInt *CallSiteNoC = ConstantInt::get(DataTy, Number);
+ Builder.CreateStore(CallSiteNoC, CallSite, true /*volatile*/);
+}
+
+/// MarkBlocksLiveIn - Insert BB and all of its predecessors into LiveBBs until
+/// we reach blocks we've already seen.
+static void MarkBlocksLiveIn(BasicBlock *BB,
+ SmallPtrSetImpl<BasicBlock *> &LiveBBs) {
+ if (!LiveBBs.insert(BB).second)
+ return; // already been here.
+
+ df_iterator_default_set<BasicBlock*> Visited;
+
+ for (BasicBlock *B : inverse_depth_first_ext(BB, Visited))
+ LiveBBs.insert(B);
+}
+
+/// substituteLPadValues - Substitute the values returned by the landingpad
+/// instruction with those returned by the personality function.
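+/// For a landingpad of type { i8*, i32 }, an extractvalue of index 0 is
+/// rewritten to the reloaded exception pointer and index 1 to the truncated
+/// selector; any remaining uses get a rebuilt { ExnVal, SelVal } aggregate.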
+void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
+ Value *SelVal) {
+ SmallVector<Value *, 8> UseWorkList(LPI->users());
+ while (!UseWorkList.empty()) {
+ Value *Val = UseWorkList.pop_back_val();
+ auto *EVI = dyn_cast<ExtractValueInst>(Val);
+ if (!EVI)
+ continue;
+ if (EVI->getNumIndices() != 1)
+ continue;
+ if (*EVI->idx_begin() == 0)
+ EVI->replaceAllUsesWith(ExnVal);
+ else if (*EVI->idx_begin() == 1)
+ EVI->replaceAllUsesWith(SelVal);
+ if (EVI->use_empty())
+ EVI->eraseFromParent();
+ }
+
+ if (LPI->use_empty())
+ return;
+
+ // There are still some uses of LPI. Construct an aggregate with the exception
+ // values and replace the LPI with that aggregate.
+ Type *LPadType = LPI->getType();
+ Value *LPadVal = PoisonValue::get(LPadType);
+ auto *SelI = cast<Instruction>(SelVal);
+ IRBuilder<> Builder(SelI->getParent(), std::next(SelI->getIterator()));
+ LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val");
+ LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val");
+
+ LPI->replaceAllUsesWith(LPadVal);
+}
+
+/// setupFunctionContext - Allocate the function context on the stack and fill
+/// it with all of the data that we know at this point.
+Value *SjLjEHPrepare::setupFunctionContext(Function &F,
+ ArrayRef<LandingPadInst *> LPads) {
+ BasicBlock *EntryBB = &F.front();
+
+ // Create an alloca for the incoming jump buffer ptr and the new jump buffer
+ // that needs to be restored on all exits from the function. This is an alloca
+ // because the value needs to be added to the global context list.
+ auto &DL = F.getParent()->getDataLayout();
+ const Align Alignment = DL.getPrefTypeAlign(FunctionContextTy);
+ FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(), nullptr,
+ Alignment, "fn_context", &EntryBB->front());
+
+ // Fill in the function context structure.
+ for (LandingPadInst *LPI : LPads) {
+ IRBuilder<> Builder(LPI->getParent(),
+ LPI->getParent()->getFirstInsertionPt());
+
+ // Reference the __data field.
+ Value *FCData =
+ Builder.CreateConstGEP2_32(FunctionContextTy, FuncCtx, 0, 2, "__data");
+
+ // The exception values come back in context->__data[0].
+ Value *ExceptionAddr = Builder.CreateConstGEP2_32(doubleUnderDataTy, FCData,
+ 0, 0, "exception_gep");
+ Value *ExnVal = Builder.CreateLoad(DataTy, ExceptionAddr, true, "exn_val");
+ ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
+
+ Value *SelectorAddr = Builder.CreateConstGEP2_32(doubleUnderDataTy, FCData,
+ 0, 1, "exn_selector_gep");
+ Value *SelVal =
+ Builder.CreateLoad(DataTy, SelectorAddr, true, "exn_selector_val");
+
+ // SelVal must be Int32Ty, so trunc it
+ SelVal = Builder.CreateTrunc(SelVal, Type::getInt32Ty(F.getContext()));
+
+ substituteLPadValues(LPI, ExnVal, SelVal);
+ }
+
+ // Personality function
+ IRBuilder<> Builder(EntryBB->getTerminator());
+ Value *PersonalityFn = F.getPersonalityFn();
+ Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(
+ FunctionContextTy, FuncCtx, 0, 3, "pers_fn_gep");
+ Builder.CreateStore(
+ Builder.CreateBitCast(PersonalityFn, Builder.getInt8PtrTy()),
+ PersonalityFieldPtr, /*isVolatile=*/true);
+
+ // LSDA address
+ Value *LSDA = Builder.CreateCall(LSDAAddrFn, {}, "lsda_addr");
+ Value *LSDAFieldPtr =
+ Builder.CreateConstGEP2_32(FunctionContextTy, FuncCtx, 0, 4, "lsda_gep");
+ Builder.CreateStore(LSDA, LSDAFieldPtr, /*isVolatile=*/true);
+
+ return FuncCtx;
+}
+
+/// lowerIncomingArguments - To avoid having to handle incoming arguments
+/// specially, we lower each arg to a copy instruction in the entry block. This
+/// ensures that the argument value itself cannot be live out of the entry
+/// block.
+void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
+ BasicBlock::iterator AfterAllocaInsPt = F.begin()->begin();
+ while (isa<AllocaInst>(AfterAllocaInsPt) &&
+ cast<AllocaInst>(AfterAllocaInsPt)->isStaticAlloca())
+ ++AfterAllocaInsPt;
+ assert(AfterAllocaInsPt != F.front().end());
+
+ for (auto &AI : F.args()) {
+ // Swift error really is a register that we model as memory -- instruction
+ // selection will perform mem-to-reg for us and spill/reload appropriately
+ // around calls that clobber it. There is no need to spill this
+ // value to the stack and doing so would not be allowed.
+ if (AI.isSwiftError())
+ continue;
+
+ Type *Ty = AI.getType();
+
+ // Use 'select i1 true, %arg, undef' to simulate a 'no-op' instruction.
+ Value *TrueValue = ConstantInt::getTrue(F.getContext());
+ Value *UndefValue = UndefValue::get(Ty);
+ Instruction *SI = SelectInst::Create(
+ TrueValue, &AI, UndefValue, AI.getName() + ".tmp", &*AfterAllocaInsPt);
+ AI.replaceAllUsesWith(SI);
+
+ // Reset the operand, because it was clobbered by the RAUW above.
+ SI->setOperand(1, &AI);
+ }
+}
+
+/// lowerAcrossUnwindEdges - Find all variables which are alive across an unwind
+/// edge and spill them.
+void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
+ ArrayRef<InvokeInst *> Invokes) {
+ // Finally, scan the code looking for instructions with bad live ranges.
+ for (BasicBlock &BB : F) {
+ for (Instruction &Inst : BB) {
+ // Ignore obvious cases we don't have to handle. In particular, most
+ // instructions either have no uses or only have a single use inside the
+ // current block. Ignore them quickly.
+ if (Inst.use_empty())
+ continue;
+ if (Inst.hasOneUse() &&
+ cast<Instruction>(Inst.user_back())->getParent() == &BB &&
+ !isa<PHINode>(Inst.user_back()))
+ continue;
+
+ // If this is an alloca in the entry block, it's not a real register
+ // value.
+ if (auto *AI = dyn_cast<AllocaInst>(&Inst))
+ if (AI->isStaticAlloca())
+ continue;
+
+ // Avoid iterator invalidation by copying users to a temporary vector.
+ SmallVector<Instruction *, 16> Users;
+ for (User *U : Inst.users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != &BB || isa<PHINode>(UI))
+ Users.push_back(UI);
+ }
+
+ // Find all of the blocks that this value is live in.
+ SmallPtrSet<BasicBlock *, 32> LiveBBs;
+ LiveBBs.insert(&BB);
+ while (!Users.empty()) {
+ Instruction *U = Users.pop_back_val();
+
+ if (!isa<PHINode>(U)) {
+ MarkBlocksLiveIn(U->getParent(), LiveBBs);
+ } else {
+ // Uses for a PHI node occur in their predecessor block.
+ PHINode *PN = cast<PHINode>(U);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == &Inst)
+ MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
+ }
+ }
+
+ // Now that we know all of the blocks that this thing is live in, see if
+ // it includes any of the unwind locations.
+ bool NeedsSpill = false;
+ for (InvokeInst *Invoke : Invokes) {
+ BasicBlock *UnwindBlock = Invoke->getUnwindDest();
+ if (UnwindBlock != &BB && LiveBBs.count(UnwindBlock)) {
+ LLVM_DEBUG(dbgs() << "SJLJ Spill: " << Inst << " around "
+ << UnwindBlock->getName() << "\n");
+ NeedsSpill = true;
+ break;
+ }
+ }
+
+ // If we decided we need a spill, do it.
+ // FIXME: Spilling this way is overkill, as it forces all uses of
+ // the value to be reloaded from the stack slot, even those that aren't
+ // in the unwind blocks. We should be more selective.
+ if (NeedsSpill) {
+ DemoteRegToStack(Inst, true);
+ ++NumSpilled;
+ }
+ }
+ }
+
+ // Go through the landing pads and remove any PHIs there.
+ for (InvokeInst *Invoke : Invokes) {
+ BasicBlock *UnwindBlock = Invoke->getUnwindDest();
+ LandingPadInst *LPI = UnwindBlock->getLandingPadInst();
+
+ // Place PHIs into a set to avoid invalidating the iterator.
+ SmallPtrSet<PHINode *, 8> PHIsToDemote;
+ for (BasicBlock::iterator PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
+ PHIsToDemote.insert(cast<PHINode>(PN));
+ if (PHIsToDemote.empty())
+ continue;
+
+ // Demote the PHIs to the stack.
+ for (PHINode *PN : PHIsToDemote)
+ DemotePHIToStack(PN);
+
+ // Move the landingpad instruction back to the top of the landing pad block.
+ LPI->moveBefore(&UnwindBlock->front());
+ }
+}
+
+/// setupEntryBlockAndCallSites - Setup the entry block by creating and filling
+/// the function context and marking the call sites with the appropriate
+/// values. These values are used by the DWARF EH emitter.
+bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
+ SmallVector<ReturnInst *, 16> Returns;
+ SmallVector<InvokeInst *, 16> Invokes;
+ SmallSetVector<LandingPadInst *, 16> LPads;
+
+ // Look through the terminators of the basic blocks to find invokes.
+ for (BasicBlock &BB : F)
+ if (auto *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
+ if (Function *Callee = II->getCalledFunction())
+ if (Callee->getIntrinsicID() == Intrinsic::donothing) {
+ // Remove the NOP invoke.
+ BranchInst::Create(II->getNormalDest(), II);
+ II->eraseFromParent();
+ continue;
+ }
+
+ Invokes.push_back(II);
+ LPads.insert(II->getUnwindDest()->getLandingPadInst());
+ } else if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ Returns.push_back(RI);
+ }
+
+ if (Invokes.empty())
+ return false;
+
+ NumInvokes += Invokes.size();
+
+ lowerIncomingArguments(F);
+ lowerAcrossUnwindEdges(F, Invokes);
+
+ Value *FuncCtx =
+ setupFunctionContext(F, ArrayRef(LPads.begin(), LPads.end()));
+ BasicBlock *EntryBB = &F.front();
+ IRBuilder<> Builder(EntryBB->getTerminator());
+
+ // Get a reference to the jump buffer.
+ Value *JBufPtr =
+ Builder.CreateConstGEP2_32(FunctionContextTy, FuncCtx, 0, 5, "jbuf_gep");
+
+ // Save the frame pointer.
+ Value *FramePtr = Builder.CreateConstGEP2_32(doubleUnderJBufTy, JBufPtr, 0, 0,
+ "jbuf_fp_gep");
+
+ Value *Val = Builder.CreateCall(FrameAddrFn, Builder.getInt32(0), "fp");
+ Builder.CreateStore(Val, FramePtr, /*isVolatile=*/true);
+
+ // Save the stack pointer.
+ Value *StackPtr = Builder.CreateConstGEP2_32(doubleUnderJBufTy, JBufPtr, 0, 2,
+ "jbuf_sp_gep");
+
+ Val = Builder.CreateCall(StackAddrFn, {}, "sp");
+ Builder.CreateStore(Val, StackPtr, /*isVolatile=*/true);
+
+ // Call the setup_dispatch intrinsic. It fills in the rest of the jmpbuf.
+ Builder.CreateCall(BuiltinSetupDispatchFn, {});
+
+ // Store a pointer to the function context so that the back-end will know
+ // where to look for it.
+ Value *FuncCtxArg = Builder.CreateBitCast(FuncCtx, Builder.getInt8PtrTy());
+ Builder.CreateCall(FuncCtxFn, FuncCtxArg);
+
+ // At this point, we are all set up, update the invoke instructions to mark
+ // their call_site values.
+ for (unsigned I = 0, E = Invokes.size(); I != E; ++I) {
+ insertCallSiteStore(Invokes[I], I + 1);
+
+ ConstantInt *CallSiteNum =
+ ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
+
+ // Record the call site value for the back end so it stays associated with
+ // the invoke.
+ CallInst::Create(CallSiteFn, CallSiteNum, "", Invokes[I]);
+ }
+
+ // Mark call instructions that aren't nounwind as no-action (call_site ==
+ // -1). Skip the entry block, as prior to then, no function context has been
+ // created for this function and any unexpected exceptions thrown will go
+ // directly to the caller's context, which is what we want anyway, so no need
+ // to do anything here.
+ for (BasicBlock &BB : F) {
+ if (&BB == &F.front())
+ continue;
+ for (Instruction &I : BB)
+ if (I.mayThrow())
+ insertCallSiteStore(&I, -1);
+ }
+
+ // Register the function context and make sure it's known to not throw
+ CallInst *Register =
+ CallInst::Create(RegisterFn, FuncCtx, "", EntryBB->getTerminator());
+ Register->setDoesNotThrow();
+
+ // Following any allocas not in the entry block, update the saved SP in the
+ // jmpbuf to the new value.
+ for (BasicBlock &BB : F) {
+ if (&BB == &F.front())
+ continue;
+ for (Instruction &I : BB) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ if (CI->getCalledFunction() != StackRestoreFn)
+ continue;
+ } else if (!isa<AllocaInst>(&I)) {
+ continue;
+ }
+ Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp");
+ StackAddr->insertAfter(&I);
+ new StoreInst(StackAddr, StackPtr, true, StackAddr->getNextNode());
+ }
+ }
+
+ // Finally, for any returns from this function, if this function contains an
+ // invoke, add a call to unregister the function context.
+ for (ReturnInst *Return : Returns) {
+ Instruction *InsertPoint = Return;
+ if (CallInst *CI = Return->getParent()->getTerminatingMustTailCall())
+ InsertPoint = CI;
+ CallInst::Create(UnregisterFn, FuncCtx, "", InsertPoint);
+ }
+
+ return true;
+}
+
+bool SjLjEHPrepare::runOnFunction(Function &F) {
+ Module &M = *F.getParent();
+ RegisterFn = M.getOrInsertFunction(
+ "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(FunctionContextTy));
+ UnregisterFn = M.getOrInsertFunction(
+ "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(FunctionContextTy));
+ FrameAddrFn = Intrinsic::getDeclaration(
+ &M, Intrinsic::frameaddress,
+ {Type::getInt8PtrTy(M.getContext(),
+ M.getDataLayout().getAllocaAddrSpace())});
+ StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
+ StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
+ BuiltinSetupDispatchFn =
+ Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch);
+ LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda);
+ CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite);
+ FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
+
+ bool Res = setupEntryBlockAndCallSites(F);
+ return Res;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SlotIndexes.cpp
new file mode 100644
index 000000000000..47ee36971d0e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -0,0 +1,272 @@
+//===-- SlotIndexes.cpp - Slot Indexes Pass ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "slotindexes"
+
+char SlotIndexes::ID = 0;
+
+SlotIndexes::SlotIndexes() : MachineFunctionPass(ID) {
+ initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+}
+
+SlotIndexes::~SlotIndexes() {
+ // The indexList's nodes are all allocated in the BumpPtrAllocator.
+ indexList.clearAndLeakNodesUnsafely();
+}
+
+INITIALIZE_PASS(SlotIndexes, DEBUG_TYPE,
+ "Slot index numbering", false, false)
+
+STATISTIC(NumLocalRenum, "Number of local renumberings");
+
+void SlotIndexes::getAnalysisUsage(AnalysisUsage &au) const {
+ au.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(au);
+}
+
+void SlotIndexes::releaseMemory() {
+ mi2iMap.clear();
+ MBBRanges.clear();
+ idx2MBBMap.clear();
+ indexList.clear();
+ ileAllocator.Reset();
+}
+
+bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
+
+ // Compute numbering as follows:
+ // Grab an iterator to the start of the index list.
+ // Iterate over all MBBs, and within each MBB all MIs, keeping the MI
+ // iterator in lock-step (though skipping it over indexes which have
+ // null pointers in the instruction field).
+ // At each iteration assert that the instruction pointed to in the index
+ // is the same one pointed to by the MI iterator. This
+
+ // FIXME: This can be simplified. The mi2iMap_, Idx2MBBMap, etc. should
+ // only need to be set up once after the first numbering is computed.
+
+ mf = &fn;
+
+ // Check that the list contains only the sentinel.
+ assert(indexList.empty() && "Index list non-empty at initial numbering?");
+ assert(idx2MBBMap.empty() &&
+ "Index -> MBB mapping non-empty at initial numbering?");
+ assert(MBBRanges.empty() &&
+ "MBB -> Index mapping non-empty at initial numbering?");
+ assert(mi2iMap.empty() &&
+ "MachineInstr -> Index mapping non-empty at initial numbering?");
+
+ unsigned index = 0;
+ MBBRanges.resize(mf->getNumBlockIDs());
+ idx2MBBMap.reserve(mf->size());
+
+ indexList.push_back(createEntry(nullptr, index));
+
+ // Iterate over the function.
+ for (MachineBasicBlock &MBB : *mf) {
+ // Insert an index for the MBB start.
+ SlotIndex blockStartIndex(&indexList.back(), SlotIndex::Slot_Block);
+
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugOrPseudoInstr())
+ continue;
+
+ // Insert a store index for the instr.
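+ // Consecutive instructions are numbered SlotIndex::InstrDist apart, leaving
+ // gaps so that later insertions can usually be numbered locally without
+ // renumbering the rest of the list.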
+ indexList.push_back(createEntry(&MI, index += SlotIndex::InstrDist));
+
+ // Save this base index in the maps.
+ mi2iMap.insert(std::make_pair(
+ &MI, SlotIndex(&indexList.back(), SlotIndex::Slot_Block)));
+ }
+
+ // We insert one blank instruction between basic blocks.
+ indexList.push_back(createEntry(nullptr, index += SlotIndex::InstrDist));
+
+ MBBRanges[MBB.getNumber()].first = blockStartIndex;
+ MBBRanges[MBB.getNumber()].second = SlotIndex(&indexList.back(),
+ SlotIndex::Slot_Block);
+ idx2MBBMap.push_back(IdxMBBPair(blockStartIndex, &MBB));
+ }
+
+ // Sort the Idx2MBBMap
+ llvm::sort(idx2MBBMap, less_first());
+
+ LLVM_DEBUG(mf->print(dbgs(), this));
+
+ // And we're done!
+ return false;
+}
+
+void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI,
+ bool AllowBundled) {
+ assert((AllowBundled || !MI.isBundledWithPred()) &&
+ "Use removeSingleMachineInstrFromMaps() instead");
+ Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+ if (mi2iItr == mi2iMap.end())
+ return;
+
+ SlotIndex MIIndex = mi2iItr->second;
+ IndexListEntry &MIEntry = *MIIndex.listEntry();
+ assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+ mi2iMap.erase(mi2iItr);
+ // FIXME: Eventually we want to actually delete these indexes.
+ MIEntry.setInstr(nullptr);
+}
+
+void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) {
+ Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+ if (mi2iItr == mi2iMap.end())
+ return;
+
+ SlotIndex MIIndex = mi2iItr->second;
+ IndexListEntry &MIEntry = *MIIndex.listEntry();
+ assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+ mi2iMap.erase(mi2iItr);
+
+ // When removing the first instruction of a bundle, update the mapping to the
+ // next instruction.
+ if (MI.isBundledWithSucc()) {
+ // Only the first instruction of a bundle should have an index assigned.
+ assert(!MI.isBundledWithPred() && "Should be first bundle instruction");
+
+ MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator());
+ MachineInstr &NextMI = *Next;
+ MIEntry.setInstr(&NextMI);
+ mi2iMap.insert(std::make_pair(&NextMI, MIIndex));
+ return;
+ } else {
+ // FIXME: Eventually we want to actually delete these indexes.
+ MIEntry.setInstr(nullptr);
+ }
+}
+
+// Renumber indexes locally after curItr was inserted, but failed to get a new
+// index.
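+// Bumping each following entry by half the normal spacing means the loop stops
+// as soon as the pre-existing indexes are large enough again, so an insertion
+// only renumbers a local window instead of the whole function.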
+void SlotIndexes::renumberIndexes(IndexList::iterator curItr) {
+ // Number indexes with half the default spacing so we can catch up quickly.
+ const unsigned Space = SlotIndex::InstrDist/2;
+ static_assert((Space & 3) == 0, "InstrDist must be a multiple of 2*NUM");
+
+ IndexList::iterator startItr = std::prev(curItr);
+ unsigned index = startItr->getIndex();
+ do {
+ curItr->setIndex(index += Space);
+ ++curItr;
+ // If the next index is bigger, we have caught up.
+ } while (curItr != indexList.end() && curItr->getIndex() <= index);
+
+ LLVM_DEBUG(dbgs() << "\n*** Renumbered SlotIndexes " << startItr->getIndex()
+ << '-' << index << " ***\n");
+ ++NumLocalRenum;
+}
+
+// Repair indexes after adding and removing instructions.
+void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) {
+ bool includeStart = (Begin == MBB->begin());
+ SlotIndex startIdx;
+ if (includeStart)
+ startIdx = getMBBStartIdx(MBB);
+ else
+ startIdx = getInstructionIndex(*--Begin);
+
+ SlotIndex endIdx;
+ if (End == MBB->end())
+ endIdx = getMBBEndIdx(MBB);
+ else
+ endIdx = getInstructionIndex(*End);
+
+ // FIXME: Conceptually, this code is implementing an iterator on MBB that
+ // optionally includes an additional position prior to MBB->begin(), indicated
+ // by the includeStart flag. This is done so that we can iterate MIs in a MBB
+ // in parallel with SlotIndexes, but there should be a better way to do this.
+ IndexList::iterator ListB = startIdx.listEntry()->getIterator();
+ IndexList::iterator ListI = endIdx.listEntry()->getIterator();
+ MachineBasicBlock::iterator MBBI = End;
+ bool pastStart = false;
+ while (ListI != ListB || MBBI != Begin || (includeStart && !pastStart)) {
+ assert(ListI->getIndex() >= startIdx.getIndex() &&
+ (includeStart || !pastStart) &&
+ "Decremented past the beginning of region to repair.");
+
+ MachineInstr *SlotMI = ListI->getInstr();
+ MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? &*MBBI : nullptr;
+ bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart);
+
+ if (SlotMI == MI && !MBBIAtBegin) {
+ --ListI;
+ if (MBBI != Begin)
+ --MBBI;
+ else
+ pastStart = true;
+ } else if (MI && !mi2iMap.contains(MI)) {
+ if (MBBI != Begin)
+ --MBBI;
+ else
+ pastStart = true;
+ } else {
+ --ListI;
+ if (SlotMI)
+ removeMachineInstrFromMaps(*SlotMI);
+ }
+ }
+
+ // In theory this could be combined with the previous loop, but it is tricky
+ // to update the IndexList while we are iterating it.
+ for (MachineBasicBlock::iterator I = End; I != Begin;) {
+ --I;
+ MachineInstr &MI = *I;
+ if (!MI.isDebugOrPseudoInstr() && !mi2iMap.contains(&MI))
+ insertMachineInstrInMaps(MI);
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SlotIndexes::dump() const {
+ for (const IndexListEntry &ILE : indexList) {
+ dbgs() << ILE.getIndex() << " ";
+
+ if (ILE.getInstr()) {
+ dbgs() << *ILE.getInstr();
+ } else {
+ dbgs() << "\n";
+ }
+ }
+
+ for (unsigned i = 0, e = MBBRanges.size(); i != e; ++i)
+ dbgs() << "%bb." << i << "\t[" << MBBRanges[i].first << ';'
+ << MBBRanges[i].second << ")\n";
+}
+#endif
+
+// Print a SlotIndex to a raw_ostream.
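+// The trailing letter encodes the sub-slot: 'B' = Slot_Block,
+// 'e' = Slot_EarlyClobber, 'r' = Slot_Register, 'd' = Slot_Dead.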
+void SlotIndex::print(raw_ostream &os) const {
+ if (isValid())
+ os << listEntry()->getIndex() << "Berd"[getSlot()];
+ else
+ os << "invalid";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Dump a SlotIndex to stderr.
+LLVM_DUMP_METHOD void SlotIndex::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SpillPlacement.cpp
new file mode 100644
index 000000000000..91da5e49713c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -0,0 +1,398 @@
+//===- SpillPlacement.cpp - Optimal Spill Code Placement ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the spill code placement analysis.
+//
+// Each edge bundle corresponds to a node in a Hopfield network. Constraints on
+// basic blocks are weighted by the block frequency and added to become the node
+// bias.
+//
+// Transparent basic blocks have the variable live through, but don't care if it
+// is spilled or in a register. These blocks become connections in the Hopfield
+// network, again weighted by block frequency.
+//
+// The Hopfield network minimizes (possibly locally) its energy function:
+//
+// E = -sum_n V_n * ( B_n + sum_{n, m linked by b} V_m * F_b )
+//
+// The energy function represents the expected spill code execution frequency,
+// or the cost of spilling. This is a Lyapunov function which never increases
+// when a node is updated. It is guaranteed to converge to a local minimum.
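+// Here V_n is the output value of node n (one of -1, 0, +1), B_n is its net
+// bias (the register-preferring frequencies minus the spill-preferring ones),
+// and F_b is the frequency of the transparent block b linking nodes n and m.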
+//
+//===----------------------------------------------------------------------===//
+
+#include "SpillPlacement.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spill-code-placement"
+
+char SpillPlacement::ID = 0;
+
+char &llvm::SpillPlacementID = SpillPlacement::ID;
+
+INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE,
+ "Spill Code Placement Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE,
+ "Spill Code Placement Analysis", true, true)
+
+void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequiredTransitive<EdgeBundles>();
+ AU.addRequiredTransitive<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Node - Each edge bundle corresponds to a Hopfield node.
+///
+/// The node contains precomputed frequency data that only depends on the CFG,
+/// but Bias and Links are computed each time placeSpills is called.
+///
+/// The node Value is positive when the variable should be in a register. The
+/// value can change when linked nodes change, but convergence is very fast
+/// because all weights are positive.
+struct SpillPlacement::Node {
+ /// BiasN - Sum of blocks that prefer a spill.
+ BlockFrequency BiasN;
+
+ /// BiasP - Sum of blocks that prefer a register.
+ BlockFrequency BiasP;
+
+ /// Value - Output value of this node computed from the Bias and links.
+ /// This is always one of the values {-1, 0, 1}. A positive number means the
+ /// variable should go in a register through this bundle.
+ int Value;
+
+ using LinkVector = SmallVector<std::pair<BlockFrequency, unsigned>, 4>;
+
+ /// Links - (Weight, BundleNo) for all transparent blocks connecting to other
+ /// bundles. The weights are all positive block frequencies.
+ LinkVector Links;
+
+ /// SumLinkWeights - Cached sum of the weights of all links + Threshold.
+ BlockFrequency SumLinkWeights;
+
+ /// preferReg - Return true when this node prefers to be in a register.
+ bool preferReg() const {
+ // Undecided nodes (Value==0) go on the stack.
+ return Value > 0;
+ }
+
+ /// mustSpill - Return True if this node is so biased that it must spill.
+ bool mustSpill() const {
+ // We must spill if Bias < -sum(weights) or the MustSpill flag was set.
+ // BiasN is saturated when MustSpill is set; make sure this still returns
+ // true when the RHS saturates. Note that SumLinkWeights includes Threshold.
+ return BiasN >= BiasP + SumLinkWeights;
+ }
+
+ /// clear - Reset per-query data, but preserve frequencies that only depend on
+ /// the CFG.
+ void clear(const BlockFrequency &Threshold) {
+ BiasN = BiasP = Value = 0;
+ SumLinkWeights = Threshold;
+ Links.clear();
+ }
+
+ /// addLink - Add a link to bundle b with weight w.
+ void addLink(unsigned b, BlockFrequency w) {
+ // Update cached sum.
+ SumLinkWeights += w;
+
+ // There can be multiple links to the same bundle, add them up.
+ for (std::pair<BlockFrequency, unsigned> &L : Links)
+ if (L.second == b) {
+ L.first += w;
+ return;
+ }
+ // This must be the first link to b.
+ Links.push_back(std::make_pair(w, b));
+ }
+
+ /// addBias - Bias this node.
+ void addBias(BlockFrequency freq, BorderConstraint direction) {
+ switch (direction) {
+ default:
+ break;
+ case PrefReg:
+ BiasP += freq;
+ break;
+ case PrefSpill:
+ BiasN += freq;
+ break;
+ case MustSpill:
+ BiasN = BlockFrequency::getMaxFrequency();
+ break;
+ }
+ }
+
+ /// update - Recompute Value from Bias and Links. Return true when node
+ /// preference changes.
+ bool update(const Node nodes[], const BlockFrequency &Threshold) {
+ // Compute the weighted sum of inputs.
+ BlockFrequency SumN = BiasN;
+ BlockFrequency SumP = BiasP;
+ for (std::pair<BlockFrequency, unsigned> &L : Links) {
+ if (nodes[L.second].Value == -1)
+ SumN += L.first;
+ else if (nodes[L.second].Value == 1)
+ SumP += L.first;
+ }
+
+ // Each weighted sum is going to be less than the total frequency of the
+ // bundle. Ideally, we should simply set Value = sign(SumP - SumN), but we
+ // will add a dead zone around 0 for two reasons:
+ //
+ // 1. It avoids arbitrary bias when all links are 0 as is possible during
+ // initial iterations.
+ // 2. It helps tame rounding errors when the links nominally sum to 0.
+ //
+ bool Before = preferReg();
+ if (SumN >= SumP + Threshold)
+ Value = -1;
+ else if (SumP >= SumN + Threshold)
+ Value = 1;
+ else
+ Value = 0;
+ return Before != preferReg();
+ }
+
+ void getDissentingNeighbors(SparseSet<unsigned> &List,
+ const Node nodes[]) const {
+ for (const auto &Elt : Links) {
+ unsigned n = Elt.second;
+ // Neighbors that already have the same value are not going to
+ // change because of this node changing.
+ if (Value != nodes[n].Value)
+ List.insert(n);
+ }
+ }
+};
+
+bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ bundles = &getAnalysis<EdgeBundles>();
+ loops = &getAnalysis<MachineLoopInfo>();
+
+ assert(!nodes && "Leaking node array");
+ nodes = new Node[bundles->getNumBundles()];
+ TodoList.clear();
+ TodoList.setUniverse(bundles->getNumBundles());
+
+ // Compute total ingoing and outgoing block frequencies for all bundles.
+ BlockFrequencies.resize(mf.getNumBlockIDs());
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ setThreshold(MBFI->getEntryFreq());
+ for (auto &I : mf) {
+ unsigned Num = I.getNumber();
+ BlockFrequencies[Num] = MBFI->getBlockFreq(&I);
+ }
+
+ // We never change the function.
+ return false;
+}
+
+void SpillPlacement::releaseMemory() {
+ delete[] nodes;
+ nodes = nullptr;
+ TodoList.clear();
+}
+
+/// activate - mark node n as active if it wasn't already.
+void SpillPlacement::activate(unsigned n) {
+ TodoList.insert(n);
+ if (ActiveNodes->test(n))
+ return;
+ ActiveNodes->set(n);
+ nodes[n].clear(Threshold);
+
+ // Very large bundles usually come from big switches, indirect branches,
+ // landing pads, or loops with many 'continue' statements. It is difficult to
+ // allocate registers when so many different blocks are involved.
+ //
+ // Give a small negative bias to large bundles such that a substantial
+ // fraction of the connected blocks need to be interested before we consider
+ // expanding the region through the bundle. This helps compile time by
+ // limiting the number of blocks visited and the number of links in the
+ // Hopfield network.
+ if (bundles->getBlocks(n).size() > 100) {
+ nodes[n].BiasP = 0;
+ nodes[n].BiasN = (MBFI->getEntryFreq() / 16);
+ }
+}
+
+/// Set the threshold for a given entry frequency.
+///
+/// Set the threshold relative to \c Entry. Since the threshold is used as a
+/// bound on the open interval (-Threshold;Threshold), 1 is the minimum
+/// threshold.
+void SpillPlacement::setThreshold(const BlockFrequency &Entry) {
+ // Apparently 2 is a good threshold when Entry==2^14, but we need to scale
+ // it. Divide by 2^13, rounding as appropriate.
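+ // For example, Entry == 2^14 gives (2^14 >> 13) == 2 with the rounding bit
+ // clear, i.e. Threshold == 2, while any Entry below 2^12 falls back to the
+ // minimum threshold of 1.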
+ uint64_t Freq = Entry.getFrequency();
+ uint64_t Scaled = (Freq >> 13) + bool(Freq & (1 << 12));
+ Threshold = std::max(UINT64_C(1), Scaled);
+}
+
+/// addConstraints - Compute node biases and weights from a set of constraints.
+/// Set a bit in NodeMask for each active node.
+void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) {
+ for (const BlockConstraint &LB : LiveBlocks) {
+ BlockFrequency Freq = BlockFrequencies[LB.Number];
+
+ // Live-in to block?
+ if (LB.Entry != DontCare) {
+ unsigned ib = bundles->getBundle(LB.Number, false);
+ activate(ib);
+ nodes[ib].addBias(Freq, LB.Entry);
+ }
+
+ // Live-out from block?
+ if (LB.Exit != DontCare) {
+ unsigned ob = bundles->getBundle(LB.Number, true);
+ activate(ob);
+ nodes[ob].addBias(Freq, LB.Exit);
+ }
+ }
+}
+
+/// addPrefSpill - Same as addConstraints(PrefSpill)
+void SpillPlacement::addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong) {
+ for (unsigned B : Blocks) {
+ BlockFrequency Freq = BlockFrequencies[B];
+ if (Strong)
+ Freq += Freq;
+ unsigned ib = bundles->getBundle(B, false);
+ unsigned ob = bundles->getBundle(B, true);
+ activate(ib);
+ activate(ob);
+ nodes[ib].addBias(Freq, PrefSpill);
+ nodes[ob].addBias(Freq, PrefSpill);
+ }
+}
+
+void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
+ for (unsigned Number : Links) {
+ unsigned ib = bundles->getBundle(Number, false);
+ unsigned ob = bundles->getBundle(Number, true);
+
+ // Ignore self-loops.
+ if (ib == ob)
+ continue;
+ activate(ib);
+ activate(ob);
+ BlockFrequency Freq = BlockFrequencies[Number];
+ nodes[ib].addLink(ob, Freq);
+ nodes[ob].addLink(ib, Freq);
+ }
+}
+
+bool SpillPlacement::scanActiveBundles() {
+ RecentPositive.clear();
+ for (unsigned n : ActiveNodes->set_bits()) {
+ update(n);
+ // A node that must spill, or a node without any links is not going to
+ // change its value ever again, so exclude it from iterations.
+ if (nodes[n].mustSpill())
+ continue;
+ if (nodes[n].preferReg())
+ RecentPositive.push_back(n);
+ }
+ return !RecentPositive.empty();
+}
+
+bool SpillPlacement::update(unsigned n) {
+ if (!nodes[n].update(nodes, Threshold))
+ return false;
+ nodes[n].getDissentingNeighbors(TodoList, nodes);
+ return true;
+}
+
+/// iterate - Repeatedly update the Hopfield nodes until stability or the
+/// maximum number of iterations is reached.
+void SpillPlacement::iterate() {
+ // We do not need to push those nodes in the todolist.
+ // They have already been processed as part of the previous iteration.
+ RecentPositive.clear();
+
+ // Since the last iteration, the todolist has been augmented by calls
+ // to addConstraints, addLinks, and co.
+ // Update the network energy starting at this new frontier.
+ // The call to ::update will add the nodes that changed into the todolist.
+ unsigned Limit = bundles->getNumBundles() * 10;
+ while(Limit-- > 0 && !TodoList.empty()) {
+ unsigned n = TodoList.pop_back_val();
+ if (!update(n))
+ continue;
+ if (nodes[n].preferReg())
+ RecentPositive.push_back(n);
+ }
+}
+
+void SpillPlacement::prepare(BitVector &RegBundles) {
+ RecentPositive.clear();
+ TodoList.clear();
+ // Reuse RegBundles as our ActiveNodes vector.
+ ActiveNodes = &RegBundles;
+ ActiveNodes->clear();
+ ActiveNodes->resize(bundles->getNumBundles());
+}
+
+bool
+SpillPlacement::finish() {
+ assert(ActiveNodes && "Call prepare() first");
+
+ // Write preferences back to ActiveNodes.
+ bool Perfect = true;
+ for (unsigned n : ActiveNodes->set_bits())
+ if (!nodes[n].preferReg()) {
+ ActiveNodes->reset(n);
+ Perfect = false;
+ }
+ ActiveNodes = nullptr;
+ return Perfect;
+}
+
+void SpillPlacement::BlockConstraint::print(raw_ostream &OS) const {
+ auto toString = [](BorderConstraint C) -> StringRef {
+ switch(C) {
+ case DontCare: return "DontCare";
+ case PrefReg: return "PrefReg";
+ case PrefSpill: return "PrefSpill";
+ case PrefBoth: return "PrefBoth";
+ case MustSpill: return "MustSpill";
+ };
+ llvm_unreachable("uncovered switch");
+ };
+
+ dbgs() << "{" << Number << ", "
+ << toString(Entry) << ", "
+ << toString(Exit) << ", "
+ << (ChangesValue ? "changes" : "no change") << "}";
+}
+
+void SpillPlacement::BlockConstraint::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SpillPlacement.h b/contrib/llvm-project/llvm/lib/CodeGen/SpillPlacement.h
new file mode 100644
index 000000000000..bd37d85c6c0d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SpillPlacement.h
@@ -0,0 +1,172 @@
+//===- SpillPlacement.h - Optimal Spill Code Placement ---------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This analysis computes the optimal spill code placement between basic blocks.
+//
+// The runOnMachineFunction() method only precomputes some profiling information
+// about the CFG. The real work is done by prepare(), addConstraints(), and
+// finish() which are called by the register allocator.
+//
+// Given a variable that is live across multiple basic blocks, and given
+// constraints on the basic blocks where the variable is live, determine which
+// edge bundles should have the variable in a register and which edge bundles
+// should have the variable in a stack slot.
+//
+// The returned bit vector can be used to place optimal spill code at basic
+// block entries and exits. Spill code placement inside a basic block is not
+// considered.
+//
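+// A typical query from the register allocator looks roughly like this (a
+// sketch; the constraint and block arrays are assumed to be precomputed):
+//
+//   BitVector RegBundles;
+//   SpillPlacement &SP = getAnalysis<SpillPlacement>();
+//   SP.prepare(RegBundles);
+//   SP.addConstraints(LiveBlocks);  // entry/exit preference per block
+//   SP.addLinks(ThroughBlocks);     // transparent (live-through) blocks
+//   SP.scanActiveBundles();
+//   SP.iterate();
+//   bool Perfect = SP.finish();     // RegBundles now marks in-register bundles
+//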
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SPILLPLACEMENT_H
+#define LLVM_LIB_CODEGEN_SPILLPLACEMENT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/BlockFrequency.h"
+
+namespace llvm {
+
+class BitVector;
+class EdgeBundles;
+class MachineBlockFrequencyInfo;
+class MachineFunction;
+class MachineLoopInfo;
+
+class SpillPlacement : public MachineFunctionPass {
+ struct Node;
+ const MachineFunction *MF = nullptr;
+ const EdgeBundles *bundles = nullptr;
+ const MachineLoopInfo *loops = nullptr;
+ const MachineBlockFrequencyInfo *MBFI = nullptr;
+ Node *nodes = nullptr;
+
+ // Nodes that are active in the current computation. Owned by the prepare()
+ // caller.
+ BitVector *ActiveNodes = nullptr;
+
+ // Nodes with active links. Populated by scanActiveBundles.
+ SmallVector<unsigned, 8> Linked;
+
+ // Nodes that went positive during the last call to scanActiveBundles or
+ // iterate.
+ SmallVector<unsigned, 8> RecentPositive;
+
+ // Block frequencies are computed once. Indexed by block number.
+ SmallVector<BlockFrequency, 8> BlockFrequencies;
+
+ /// Decision threshold. A node gets the output value 0 if the weighted sum of
+ /// its inputs falls in the open interval (-Threshold;Threshold).
+ BlockFrequency Threshold;
+
+ /// List of nodes that need to be updated in ::iterate.
+ SparseSet<unsigned> TodoList;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ SpillPlacement() : MachineFunctionPass(ID) {}
+ ~SpillPlacement() override { releaseMemory(); }
+
+ /// BorderConstraint - A basic block has separate constraints for entry and
+ /// exit.
+ enum BorderConstraint {
+ DontCare, ///< Block doesn't care / variable not live.
+ PrefReg, ///< Block entry/exit prefers a register.
+ PrefSpill, ///< Block entry/exit prefers a stack slot.
+ PrefBoth, ///< Block entry prefers both register and stack.
+ MustSpill ///< A register is impossible, variable must be spilled.
+ };
+
+ /// BlockConstraint - Entry and exit constraints for a basic block.
+ struct BlockConstraint {
+ unsigned Number; ///< Basic block number (from MBB::getNumber()).
+ BorderConstraint Entry : 8; ///< Constraint on block entry.
+ BorderConstraint Exit : 8; ///< Constraint on block exit.
+
+ /// True when this block changes the value of the live range. This means
+ /// the block has a non-PHI def. When this is false, a live-in value on
+ /// the stack can be live-out on the stack without inserting a spill.
+ bool ChangesValue;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+ };
+
+ /// prepare - Reset state and prepare for a new spill placement computation.
+ /// @param RegBundles Bit vector to receive the edge bundles where the
+ /// variable should be kept in a register. Each bit
+ /// corresponds to an edge bundle, a set bit means the
+ /// variable should be kept in a register through the
+ /// bundle. A clear bit means the variable should be
+ /// spilled. This vector is retained.
+ void prepare(BitVector &RegBundles);
+
+ /// addConstraints - Add constraints and biases. This method may be called
+ /// more than once to accumulate constraints.
+ /// @param LiveBlocks Constraints for blocks that have the variable live in or
+ /// live out.
+ void addConstraints(ArrayRef<BlockConstraint> LiveBlocks);
+
+ /// addPrefSpill - Add PrefSpill constraints to all blocks listed. This is
+ /// equivalent to calling addConstraints with identical BlockConstraints with
+ /// Entry = Exit = PrefSpill, and ChangesValue = false.
+ ///
+ /// @param Blocks Array of block numbers that prefer to spill in and out.
+ /// @param Strong When true, double the negative bias for these blocks.
+ void addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong);
+
+ /// addLinks - Add transparent blocks with the given numbers.
+ void addLinks(ArrayRef<unsigned> Links);
+
+ /// scanActiveBundles - Perform an initial scan of all bundles activated by
+ /// addConstraints and addLinks, updating their state. Add all the bundles
+ /// that now prefer a register to RecentPositive.
+ /// Prepare internal data structures for iterate.
+ /// Return true if there are any positive nodes.
+ bool scanActiveBundles();
+
+ /// iterate - Update the network iteratively until convergence, or new bundles
+ /// are found.
+ void iterate();
+
+ /// getRecentPositive - Return an array of bundles that became positive during
+ /// the previous call to scanActiveBundles or iterate.
+ ArrayRef<unsigned> getRecentPositive() { return RecentPositive; }
+
+ /// finish - Compute the optimal spill code placement given the
+ /// constraints. No MustSpill constraints will be violated, and the smallest
+ /// possible number of PrefX constraints will be violated, weighted by
+ /// expected execution frequencies.
+ /// The selected bundles are returned in the bitvector passed to prepare().
+ /// @return True if a perfect solution was found, allowing the variable to be
+ /// in a register through all relevant bundles.
+ bool finish();
+
+ /// getBlockFrequency - Return the estimated block execution frequency per
+ /// function invocation.
+ BlockFrequency getBlockFrequency(unsigned Number) const {
+ return BlockFrequencies[Number];
+ }
+
+private:
+ bool runOnMachineFunction(MachineFunction &mf) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void releaseMemory() override;
+
+ void activate(unsigned n);
+ void setThreshold(const BlockFrequency &Entry);
+
+ bool update(unsigned n);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SPILLPLACEMENT_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SplitKit.cpp
new file mode 100644
index 000000000000..eee54f09fbad
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SplitKit.cpp
@@ -0,0 +1,1888 @@
+//===- SplitKit.cpp - Toolkit for splitting live ranges -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SplitKit.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumFinished, "Number of splits finished");
+STATISTIC(NumSimple, "Number of splits that were simple");
+STATISTIC(NumCopies, "Number of copies inserted for splitting");
+STATISTIC(NumRemats, "Number of rematerialized defs for splitting");
+
+//===----------------------------------------------------------------------===//
+// Last Insert Point Analysis
+//===----------------------------------------------------------------------===//
+
+InsertPointAnalysis::InsertPointAnalysis(const LiveIntervals &lis,
+ unsigned BBNum)
+ : LIS(lis), LastInsertPoint(BBNum) {}
+
+SlotIndex
+InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB) {
+ unsigned Num = MBB.getNumber();
+ std::pair<SlotIndex, SlotIndex> &LIP = LastInsertPoint[Num];
+ SlotIndex MBBEnd = LIS.getMBBEndIdx(&MBB);
+
+ SmallVector<const MachineBasicBlock *, 1> ExceptionalSuccessors;
+ bool EHPadSuccessor = false;
+ for (const MachineBasicBlock *SMBB : MBB.successors()) {
+ if (SMBB->isEHPad()) {
+ ExceptionalSuccessors.push_back(SMBB);
+ EHPadSuccessor = true;
+ } else if (SMBB->isInlineAsmBrIndirectTarget())
+ ExceptionalSuccessors.push_back(SMBB);
+ }
+
+ // Compute insert points on the first call. The pair is independent of the
+ // current live interval.
+ if (!LIP.first.isValid()) {
+ MachineBasicBlock::const_iterator FirstTerm = MBB.getFirstTerminator();
+ if (FirstTerm == MBB.end())
+ LIP.first = MBBEnd;
+ else
+ LIP.first = LIS.getInstructionIndex(*FirstTerm);
+
+ // If there is a landing pad or inlineasm_br successor, also find the
+ // instruction. If there is no such instruction, we don't need to do
+ // anything special. We assume there cannot be multiple instructions that
+ // are Calls with EHPad successors or INLINEASM_BR in a block. Further, we
+ // assume that if there are any, they will be after any other call
+ // instructions in the block.
+ if (ExceptionalSuccessors.empty())
+ return LIP.first;
+ for (const MachineInstr &MI : llvm::reverse(MBB)) {
+ if ((EHPadSuccessor && MI.isCall()) ||
+ MI.getOpcode() == TargetOpcode::INLINEASM_BR) {
+ LIP.second = LIS.getInstructionIndex(MI);
+ break;
+ }
+ }
+ }
+
+ // If CurLI is live into a landing pad successor, move the last insert point
+ // back to the call that may throw.
+ if (!LIP.second)
+ return LIP.first;
+
+ if (none_of(ExceptionalSuccessors, [&](const MachineBasicBlock *EHPad) {
+ return LIS.isLiveInToMBB(CurLI, EHPad);
+ }))
+ return LIP.first;
+
+ // Find the value leaving MBB.
+ const VNInfo *VNI = CurLI.getVNInfoBefore(MBBEnd);
+ if (!VNI)
+ return LIP.first;
+
+ // The def of a statepoint instruction is a gc relocation and it should be
+ // alive in the landing pad, so we cannot split the interval after the
+ // statepoint instruction.
+ if (SlotIndex::isSameInstr(VNI->def, LIP.second))
+ if (auto *I = LIS.getInstructionFromIndex(LIP.second))
+ if (I->getOpcode() == TargetOpcode::STATEPOINT)
+ return LIP.second;
+
+ // If the value leaving MBB was defined after the call in MBB, it can't
+ // really be live-in to the landing pad. This can happen if the landing pad
+ // has a PHI, and this register is undef on the exceptional edge.
+ // <rdar://problem/10664933>
+ if (!SlotIndex::isEarlierInstr(VNI->def, LIP.second) && VNI->def < MBBEnd)
+ return LIP.first;
+
+ // Value is properly live-in to the landing pad.
+ // Only allow inserts before the call.
+ return LIP.second;
+}
+
+MachineBasicBlock::iterator
+InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
+ MachineBasicBlock &MBB) {
+ SlotIndex LIP = getLastInsertPoint(CurLI, MBB);
+ if (LIP == LIS.getMBBEndIdx(&MBB))
+ return MBB.end();
+ return LIS.getInstructionFromIndex(LIP);
+}
+
+//===----------------------------------------------------------------------===//
+// Split Analysis
+//===----------------------------------------------------------------------===//
+
+SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
+ const MachineLoopInfo &mli)
+ : MF(vrm.getMachineFunction()), VRM(vrm), LIS(lis), Loops(mli),
+ TII(*MF.getSubtarget().getInstrInfo()), IPA(lis, MF.getNumBlockIDs()) {}
+
+void SplitAnalysis::clear() {
+ UseSlots.clear();
+ UseBlocks.clear();
+ ThroughBlocks.clear();
+ CurLI = nullptr;
+}
+
+/// analyzeUses - Count instructions, basic blocks, and loops using CurLI.
+void SplitAnalysis::analyzeUses() {
+ assert(UseSlots.empty() && "Call clear first");
+
+ // First get all the defs from the interval values. This provides the correct
+ // slots for early clobbers.
+ for (const VNInfo *VNI : CurLI->valnos)
+ if (!VNI->isPHIDef() && !VNI->isUnused())
+ UseSlots.push_back(VNI->def);
+
+  // Get use slots from the use-def chain.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (MachineOperand &MO : MRI.use_nodbg_operands(CurLI->reg()))
+ if (!MO.isUndef())
+ UseSlots.push_back(LIS.getInstructionIndex(*MO.getParent()).getRegSlot());
+
+ array_pod_sort(UseSlots.begin(), UseSlots.end());
+
+ // Remove duplicates, keeping the smaller slot for each instruction.
+ // That is what we want for early clobbers.
+ UseSlots.erase(std::unique(UseSlots.begin(), UseSlots.end(),
+ SlotIndex::isSameInstr),
+ UseSlots.end());
+
+ // Compute per-live block info.
+ calcLiveBlockInfo();
+
+ LLVM_DEBUG(dbgs() << "Analyze counted " << UseSlots.size() << " instrs in "
+ << UseBlocks.size() << " blocks, through "
+ << NumThroughBlocks << " blocks.\n");
+}
+
+/// calcLiveBlockInfo - Fill the LiveBlocks array with information about blocks
+/// where CurLI is live.
+void SplitAnalysis::calcLiveBlockInfo() {
+ ThroughBlocks.resize(MF.getNumBlockIDs());
+ NumThroughBlocks = NumGapBlocks = 0;
+ if (CurLI->empty())
+ return;
+
+ LiveInterval::const_iterator LVI = CurLI->begin();
+ LiveInterval::const_iterator LVE = CurLI->end();
+
+ SmallVectorImpl<SlotIndex>::const_iterator UseI, UseE;
+ UseI = UseSlots.begin();
+ UseE = UseSlots.end();
+
+ // Loop over basic blocks where CurLI is live.
+ MachineFunction::iterator MFI =
+ LIS.getMBBFromIndex(LVI->start)->getIterator();
+ while (true) {
+ BlockInfo BI;
+ BI.MBB = &*MFI;
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+ // If the block contains no uses, the range must be live through. At one
+ // point, RegisterCoalescer could create dangling ranges that ended
+ // mid-block.
+ if (UseI == UseE || *UseI >= Stop) {
+ ++NumThroughBlocks;
+ ThroughBlocks.set(BI.MBB->getNumber());
+      // A range with no uses in the block should not end mid-block.
+ assert(LVI->end >= Stop && "range ends mid block with no uses");
+ } else {
+ // This block has uses. Find the first and last uses in the block.
+ BI.FirstInstr = *UseI;
+ assert(BI.FirstInstr >= Start);
+ do ++UseI;
+ while (UseI != UseE && *UseI < Stop);
+ BI.LastInstr = UseI[-1];
+ assert(BI.LastInstr < Stop);
+
+ // LVI is the first live segment overlapping MBB.
+ BI.LiveIn = LVI->start <= Start;
+
+ // When not live in, the first use should be a def.
+ if (!BI.LiveIn) {
+ assert(LVI->start == LVI->valno->def && "Dangling Segment start");
+ assert(LVI->start == BI.FirstInstr && "First instr should be a def");
+ BI.FirstDef = BI.FirstInstr;
+ }
+
+ // Look for gaps in the live range.
+ BI.LiveOut = true;
+ while (LVI->end < Stop) {
+ SlotIndex LastStop = LVI->end;
+ if (++LVI == LVE || LVI->start >= Stop) {
+ BI.LiveOut = false;
+ BI.LastInstr = LastStop;
+ break;
+ }
+
+ if (LastStop < LVI->start) {
+ // There is a gap in the live range. Create duplicate entries for the
+ // live-in snippet and the live-out snippet.
+ ++NumGapBlocks;
+
+ // Push the Live-in part.
+ BI.LiveOut = false;
+ UseBlocks.push_back(BI);
+ UseBlocks.back().LastInstr = LastStop;
+
+ // Set up BI for the live-out part.
+ BI.LiveIn = false;
+ BI.LiveOut = true;
+ BI.FirstInstr = BI.FirstDef = LVI->start;
+ }
+
+ // A Segment that starts in the middle of the block must be a def.
+ assert(LVI->start == LVI->valno->def && "Dangling Segment start");
+ if (!BI.FirstDef)
+ BI.FirstDef = LVI->start;
+ }
+
+ UseBlocks.push_back(BI);
+
+ // LVI is now at LVE or LVI->end >= Stop.
+ if (LVI == LVE)
+ break;
+ }
+
+ // Live segment ends exactly at Stop. Move to the next segment.
+ if (LVI->end == Stop && ++LVI == LVE)
+ break;
+
+ // Pick the next basic block.
+ if (LVI->start < Stop)
+ ++MFI;
+ else
+ MFI = LIS.getMBBFromIndex(LVI->start)->getIterator();
+ }
+
+ assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count");
+}
+
+unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const {
+ if (cli->empty())
+ return 0;
+ LiveInterval *li = const_cast<LiveInterval*>(cli);
+ LiveInterval::iterator LVI = li->begin();
+ LiveInterval::iterator LVE = li->end();
+ unsigned Count = 0;
+
+ // Loop over basic blocks where li is live.
+ MachineFunction::const_iterator MFI =
+ LIS.getMBBFromIndex(LVI->start)->getIterator();
+ SlotIndex Stop = LIS.getMBBEndIdx(&*MFI);
+ while (true) {
+ ++Count;
+ LVI = li->advanceTo(LVI, Stop);
+ if (LVI == LVE)
+ return Count;
+ do {
+ ++MFI;
+ Stop = LIS.getMBBEndIdx(&*MFI);
+ } while (Stop <= LVI->start);
+ }
+}
+
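+/// isOriginalEndpoint - Return true if a segment of the original
+/// (pre-splitting) live range starts or ends exactly at Idx. This is used to
+/// avoid isolating end points that were created by earlier splits.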
+bool SplitAnalysis::isOriginalEndpoint(SlotIndex Idx) const {
+ Register OrigReg = VRM.getOriginal(CurLI->reg());
+ const LiveInterval &Orig = LIS.getInterval(OrigReg);
+ assert(!Orig.empty() && "Splitting empty interval?");
+ LiveInterval::const_iterator I = Orig.find(Idx);
+
+ // Range containing Idx should begin at Idx.
+ if (I != Orig.end() && I->start <= Idx)
+ return I->start == Idx;
+
+  // The range does not contain Idx; the previous range must end at Idx.
+ return I != Orig.begin() && (--I)->end == Idx;
+}
+
+void SplitAnalysis::analyze(const LiveInterval *li) {
+ clear();
+ CurLI = li;
+ analyzeUses();
+}
+
+//===----------------------------------------------------------------------===//
+// Split Editor
+//===----------------------------------------------------------------------===//
+
+/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+SplitEditor::SplitEditor(SplitAnalysis &SA, LiveIntervals &LIS, VirtRegMap &VRM,
+ MachineDominatorTree &MDT,
+ MachineBlockFrequencyInfo &MBFI, VirtRegAuxInfo &VRAI)
+ : SA(SA), LIS(LIS), VRM(VRM), MRI(VRM.getMachineFunction().getRegInfo()),
+ MDT(MDT), TII(*VRM.getMachineFunction().getSubtarget().getInstrInfo()),
+ TRI(*VRM.getMachineFunction().getSubtarget().getRegisterInfo()),
+ MBFI(MBFI), VRAI(VRAI), RegAssign(Allocator) {}
+
+void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
+ Edit = &LRE;
+ SpillMode = SM;
+ OpenIdx = 0;
+ RegAssign.clear();
+ Values.clear();
+
+ // Reset the LiveIntervalCalc instances needed for this spill mode.
+ LICalc[0].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
+ if (SpillMode)
+ LICalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
+
+ Edit->anyRematerializable();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SplitEditor::dump() const {
+ if (RegAssign.empty()) {
+ dbgs() << " empty\n";
+ return;
+ }
+
+ for (RegAssignMap::const_iterator I = RegAssign.begin(); I.valid(); ++I)
+ dbgs() << " [" << I.start() << ';' << I.stop() << "):" << I.value();
+ dbgs() << '\n';
+}
+#endif
+
+/// Find a subrange corresponding to the exact lane mask @p LM in the live
+/// interval @p LI. The interval @p LI is assumed to contain such a subrange.
+/// This function is used to find corresponding subranges between the
+/// original interval and the new intervals.
+template <typename T> auto &getSubrangeImpl(LaneBitmask LM, T &LI) {
+ for (auto &S : LI.subranges())
+ if (S.LaneMask == LM)
+ return S;
+ llvm_unreachable("SubRange for this mask not found");
+}
+
+LiveInterval::SubRange &getSubRangeForMaskExact(LaneBitmask LM,
+ LiveInterval &LI) {
+ return getSubrangeImpl(LM, LI);
+}
+
+const LiveInterval::SubRange &getSubRangeForMaskExact(LaneBitmask LM,
+ const LiveInterval &LI) {
+ return getSubrangeImpl(LM, LI);
+}
+
+/// Find a subrange corresponding to the lane mask @p LM, or a superset of it,
+/// in the live interval @p LI. The interval @p LI is assumed to contain such
+/// a subrange. This function is used to find corresponding subranges between
+/// the original interval and the new intervals.
+const LiveInterval::SubRange &getSubRangeForMask(LaneBitmask LM,
+ const LiveInterval &LI) {
+ for (const LiveInterval::SubRange &S : LI.subranges())
+ if ((S.LaneMask & LM) == LM)
+ return S;
+ llvm_unreachable("SubRange for this mask not found");
+}
+
+void SplitEditor::addDeadDef(LiveInterval &LI, VNInfo *VNI, bool Original) {
+ if (!LI.hasSubRanges()) {
+ LI.createDeadDef(VNI);
+ return;
+ }
+
+ SlotIndex Def = VNI->def;
+ if (Original) {
+ // If we are transferring a def from the original interval, make sure
+ // to only update the subranges for which the original subranges had
+ // a def at this location.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ auto &PS = getSubRangeForMask(S.LaneMask, Edit->getParent());
+ VNInfo *PV = PS.getVNInfoAt(Def);
+ if (PV != nullptr && PV->def == Def)
+ S.createDeadDef(Def, LIS.getVNInfoAllocator());
+ }
+ } else {
+ // This is a new def: either from rematerialization, or from an inserted
+ // copy. Since rematerialization can regenerate a definition of a sub-
+ // register, we need to check which subranges need to be updated.
+ const MachineInstr *DefMI = LIS.getInstructionFromIndex(Def);
+ assert(DefMI != nullptr);
+ LaneBitmask LM;
+ for (const MachineOperand &DefOp : DefMI->defs()) {
+ Register R = DefOp.getReg();
+ if (R != LI.reg())
+ continue;
+ if (unsigned SR = DefOp.getSubReg())
+ LM |= TRI.getSubRegIndexLaneMask(SR);
+ else {
+ LM = MRI.getMaxLaneMaskForVReg(R);
+ break;
+ }
+ }
+ for (LiveInterval::SubRange &S : LI.subranges())
+ if ((S.LaneMask & LM).any())
+ S.createDeadDef(Def, LIS.getVNInfoAllocator());
+ }
+}
+
+VNInfo *SplitEditor::defValue(unsigned RegIdx,
+ const VNInfo *ParentVNI,
+ SlotIndex Idx,
+ bool Original) {
+ assert(ParentVNI && "Mapping NULL value");
+ assert(Idx.isValid() && "Invalid SlotIndex");
+ assert(Edit->getParent().getVNInfoAt(Idx) == ParentVNI && "Bad Parent VNI");
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+
+ // Create a new value.
+ VNInfo *VNI = LI->getNextValue(Idx, LIS.getVNInfoAllocator());
+
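+  // If the interval has subranges, every value must be complex-mapped so the
+  // subranges can be kept up to date as liveness is transferred.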
+ bool Force = LI->hasSubRanges();
+ ValueForcePair FP(Force ? nullptr : VNI, Force);
+ // Use insert for lookup, so we can add missing values with a second lookup.
+ std::pair<ValueMap::iterator, bool> InsP =
+ Values.insert(std::make_pair(std::make_pair(RegIdx, ParentVNI->id), FP));
+
+ // This was the first time (RegIdx, ParentVNI) was mapped, and it is not
+ // forced. Keep it as a simple def without any liveness.
+ if (!Force && InsP.second)
+ return VNI;
+
+ // If the previous value was a simple mapping, add liveness for it now.
+ if (VNInfo *OldVNI = InsP.first->second.getPointer()) {
+ addDeadDef(*LI, OldVNI, Original);
+
+ // No longer a simple mapping. Switch to a complex mapping. If the
+ // interval has subranges, make it a forced mapping.
+ InsP.first->second = ValueForcePair(nullptr, Force);
+ }
+
+  // This is a complex mapping; add liveness for VNI.
+ addDeadDef(*LI, VNI, Original);
+ return VNI;
+}
+
+void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) {
+ ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI.id)];
+ VNInfo *VNI = VFP.getPointer();
+
+ // ParentVNI was either unmapped or already complex mapped. Either way, just
+ // set the force bit.
+ if (!VNI) {
+ VFP.setInt(true);
+ return;
+ }
+
+ // This was previously a single mapping. Make sure the old def is represented
+ // by a trivial live range.
+ addDeadDef(LIS.getInterval(Edit->get(RegIdx)), VNI, false);
+
+ // Mark as complex mapped, forced.
+ VFP = ValueForcePair(nullptr, true);
+}
+
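+/// Insert a COPY of the SubIdx lanes of FromReg into ToReg before
+/// InsertBefore. The first copy of a sequence gets its own slot index; later
+/// copies are bundled with their predecessor so the whole sequence maps to
+/// the single index Def, which is returned.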
+SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
+ const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ bool FirstCopy = !Def.isValid();
+ MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
+ .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy)
+ | getInternalReadRegState(!FirstCopy), SubIdx)
+ .addReg(FromReg, 0, SubIdx);
+
+ SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ if (FirstCopy) {
+ Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+ } else {
+ CopyMI->bundleWithPred();
+ }
+ return Def;
+}
+
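+/// Copy the LaneMask lanes of FromReg into ToReg before InsertBefore. A
+/// single full-register COPY is used when the mask covers the whole register;
+/// otherwise a bundled sequence of subregister COPYs is built and the
+/// subranges of the destination interval are updated. Returns the SlotIndex
+/// of the new definition.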
+SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg,
+ LaneBitmask LaneMask, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
+ const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
+ // The full vreg is copied.
+ MachineInstr *CopyMI =
+ BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
+ return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+ }
+
+ // Only a subset of lanes needs to be copied. The following is a simple
+ // heuristic to construct a sequence of COPYs. We could add a target
+ // specific callback if this turns out to be suboptimal.
+ LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx));
+
+  // First pass: Try to find a perfectly matching subregister index. If none
+  // exists, find the one covering the most lane mask bits.
+ const TargetRegisterClass *RC = MRI.getRegClass(FromReg);
+ assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class");
+
+ SmallVector<unsigned, 8> SubIndexes;
+
+ // Abort if we cannot possibly implement the COPY with the given indexes.
+ if (!TRI.getCoveringSubRegIndexes(MRI, RC, LaneMask, SubIndexes))
+ report_fatal_error("Impossible to implement partial COPY");
+
+ SlotIndex Def;
+ for (unsigned BestIdx : SubIndexes) {
+ Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
+ DestLI, Late, Def);
+ }
+
+ BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+ DestLI.refineSubRanges(
+ Allocator, LaneMask,
+ [Def, &Allocator](LiveInterval::SubRange &SR) {
+ SR.createDeadDef(Def, Allocator);
+ },
+ Indexes, TRI);
+
+ return Def;
+}
+
+VNInfo *SplitEditor::defFromParent(unsigned RegIdx, const VNInfo *ParentVNI,
+ SlotIndex UseIdx, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ SlotIndex Def;
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+
+ // We may be trying to avoid interference that ends at a deleted instruction,
+ // so always begin RegIdx 0 early and all others late.
+ bool Late = RegIdx != 0;
+
+ // Attempt cheap-as-a-copy rematerialization.
+ Register Original = VRM.getOriginal(Edit->get(RegIdx));
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
+
+ Register Reg = LI->reg();
+ bool DidRemat = false;
+ if (OrigVNI) {
+ LiveRangeEdit::Remat RM(ParentVNI);
+ RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+ if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
+ Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);
+ ++NumRemats;
+ DidRemat = true;
+ }
+ }
+ if (!DidRemat) {
+ LaneBitmask LaneMask;
+ if (OrigLI.hasSubRanges()) {
+ LaneMask = LaneBitmask::getNone();
+ for (LiveInterval::SubRange &S : OrigLI.subranges()) {
+ if (S.liveAt(UseIdx))
+ LaneMask |= S.LaneMask;
+ }
+ } else {
+ LaneMask = LaneBitmask::getAll();
+ }
+
+ if (LaneMask.none()) {
+ const MCInstrDesc &Desc = TII.get(TargetOpcode::IMPLICIT_DEF);
+ MachineInstr *ImplicitDef = BuildMI(MBB, I, DebugLoc(), Desc, Reg);
+ SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ Def = Indexes.insertMachineInstrInMaps(*ImplicitDef, Late).getRegSlot();
+ } else {
+ ++NumCopies;
+ Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx);
+ }
+ }
+
+ // Define the value in Reg.
+ return defValue(RegIdx, ParentVNI, Def, false);
+}
+
+/// Create a new virtual register and live interval.
+unsigned SplitEditor::openIntv() {
+ // Create the complement as index 0.
+ if (Edit->empty())
+ Edit->createEmptyInterval();
+
+ // Create the open interval.
+ OpenIdx = Edit->size();
+ Edit->createEmptyInterval();
+ return OpenIdx;
+}
+
+void SplitEditor::selectIntv(unsigned Idx) {
+ assert(Idx != 0 && "Cannot select the complement interval");
+ assert(Idx < Edit->size() && "Can only select previously opened interval");
+ LLVM_DEBUG(dbgs() << " selectIntv " << OpenIdx << " -> " << Idx << '\n');
+ OpenIdx = Idx;
+}
+
+SlotIndex SplitEditor::enterIntvBefore(SlotIndex Idx) {
+ assert(OpenIdx && "openIntv not called before enterIntvBefore");
+ LLVM_DEBUG(dbgs() << " enterIntvBefore " << Idx);
+ Idx = Idx.getBaseIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+ if (!ParentVNI) {
+ LLVM_DEBUG(dbgs() << ": not live\n");
+ return Idx;
+ }
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+ assert(MI && "enterIntvBefore called with invalid index");
+
+ VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Idx, *MI->getParent(), MI);
+ return VNI->def;
+}
+
+SlotIndex SplitEditor::enterIntvAfter(SlotIndex Idx) {
+ assert(OpenIdx && "openIntv not called before enterIntvAfter");
+ LLVM_DEBUG(dbgs() << " enterIntvAfter " << Idx);
+ Idx = Idx.getBoundaryIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+ if (!ParentVNI) {
+ LLVM_DEBUG(dbgs() << ": not live\n");
+ return Idx;
+ }
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+ assert(MI && "enterIntvAfter called with invalid index");
+
+ VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Idx, *MI->getParent(),
+ std::next(MachineBasicBlock::iterator(MI)));
+ return VNI->def;
+}
+
+SlotIndex SplitEditor::enterIntvAtEnd(MachineBasicBlock &MBB) {
+ assert(OpenIdx && "openIntv not called before enterIntvAtEnd");
+ SlotIndex End = LIS.getMBBEndIdx(&MBB);
+ SlotIndex Last = End.getPrevSlot();
+ LLVM_DEBUG(dbgs() << " enterIntvAtEnd " << printMBBReference(MBB) << ", "
+ << Last);
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Last);
+ if (!ParentVNI) {
+ LLVM_DEBUG(dbgs() << ": not live\n");
+ return End;
+ }
+ SlotIndex LSP = SA.getLastSplitPoint(&MBB);
+ if (LSP < Last) {
+    // The use after LSP may be a def, in which case the ParentVNI we just
+    // selected starts at that def. For this case to exist, the def must be
+    // part of a tied def/use pair (otherwise we would have split the
+    // distinct live ranges into individual live intervals), so we can fold
+    // the def into the VNI of the use and keep the tied def/use pair in the
+    // resulting interval.
+ Last = LSP;
+ ParentVNI = Edit->getParent().getVNInfoAt(Last);
+ if (!ParentVNI) {
+ // undef use --> undef tied def
+ LLVM_DEBUG(dbgs() << ": tied use not live\n");
+ return End;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id);
+ VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Last, MBB,
+ SA.getLastSplitPointIter(&MBB));
+ RegAssign.insert(VNI->def, End, OpenIdx);
+ LLVM_DEBUG(dump());
+ return VNI->def;
+}
+
+/// useIntv - indicate that all instructions in MBB should use OpenLI.
+void SplitEditor::useIntv(const MachineBasicBlock &MBB) {
+ useIntv(LIS.getMBBStartIdx(&MBB), LIS.getMBBEndIdx(&MBB));
+}
+
+void SplitEditor::useIntv(SlotIndex Start, SlotIndex End) {
+ assert(OpenIdx && "openIntv not called before useIntv");
+ LLVM_DEBUG(dbgs() << " useIntv [" << Start << ';' << End << "):");
+ RegAssign.insert(Start, End, OpenIdx);
+ LLVM_DEBUG(dump());
+}
+
+SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
+ assert(OpenIdx && "openIntv not called before leaveIntvAfter");
+ LLVM_DEBUG(dbgs() << " leaveIntvAfter " << Idx);
+
+ // The interval must be live beyond the instruction at Idx.
+ SlotIndex Boundary = Idx.getBoundaryIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Boundary);
+ if (!ParentVNI) {
+ LLVM_DEBUG(dbgs() << ": not live\n");
+ return Boundary.getNextSlot();
+ }
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ MachineInstr *MI = LIS.getInstructionFromIndex(Boundary);
+ assert(MI && "No instruction at index");
+
+ // In spill mode, make live ranges as short as possible by inserting the copy
+ // before MI. This is only possible if that instruction doesn't redefine the
+ // value. The inserted COPY is not a kill, and we don't need to recompute
+ // the source live range. The spiller also won't try to hoist this copy.
+ if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) &&
+ MI->readsVirtualRegister(Edit->getReg())) {
+ forceRecompute(0, *ParentVNI);
+ defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
+ return Idx;
+ }
+
+ VNInfo *VNI = defFromParent(0, ParentVNI, Boundary, *MI->getParent(),
+ std::next(MachineBasicBlock::iterator(MI)));
+ return VNI->def;
+}
+
+SlotIndex SplitEditor::leaveIntvBefore(SlotIndex Idx) {
+ assert(OpenIdx && "openIntv not called before leaveIntvBefore");
+ LLVM_DEBUG(dbgs() << " leaveIntvBefore " << Idx);
+
+ // The interval must be live into the instruction at Idx.
+ Idx = Idx.getBaseIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+ if (!ParentVNI) {
+ LLVM_DEBUG(dbgs() << ": not live\n");
+ return Idx.getNextSlot();
+ }
+ LLVM_DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+
+ MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+ assert(MI && "No instruction at index");
+ VNInfo *VNI = defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
+ return VNI->def;
+}
+
+SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
+ assert(OpenIdx && "openIntv not called before leaveIntvAtTop");
+ SlotIndex Start = LIS.getMBBStartIdx(&MBB);
+ LLVM_DEBUG(dbgs() << " leaveIntvAtTop " << printMBBReference(MBB) << ", "
+ << Start);
+
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Start);
+ if (!ParentVNI) {
+ LLVM_DEBUG(dbgs() << ": not live\n");
+ return Start;
+ }
+
+ VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB,
+ MBB.SkipPHIsLabelsAndDebug(MBB.begin()));
+ RegAssign.insert(Start, VNI->def, OpenIdx);
+ LLVM_DEBUG(dump());
+ return VNI->def;
+}
+
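+/// Return true if MI has a def of Reg that is tied to one of its use
+/// operands, i.e. MI reads Reg through a tied def/use pair.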
+static bool hasTiedUseOf(MachineInstr &MI, unsigned Reg) {
+ return any_of(MI.defs(), [Reg](const MachineOperand &MO) {
+ return MO.isReg() && MO.isTied() && MO.getReg() == Reg;
+ });
+}
+
+void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
+ assert(OpenIdx && "openIntv not called before overlapIntv");
+ const VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Start);
+ assert(ParentVNI == Edit->getParent().getVNInfoBefore(End) &&
+ "Parent changes value in extended range");
+ assert(LIS.getMBBFromIndex(Start) == LIS.getMBBFromIndex(End) &&
+ "Range cannot span basic blocks");
+
+ // The complement interval will be extended as needed by LICalc.extend().
+ if (ParentVNI)
+ forceRecompute(0, *ParentVNI);
+
+ // If the last use is tied to a def, we can't mark it as live for the
+ // interval which includes only the use. That would cause the tied pair
+ // to end up in two different intervals.
+ if (auto *MI = LIS.getInstructionFromIndex(End))
+ if (hasTiedUseOf(*MI, Edit->getReg())) {
+ LLVM_DEBUG(dbgs() << "skip overlap due to tied def at end\n");
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
+ RegAssign.insert(Start, End, OpenIdx);
+ LLVM_DEBUG(dump());
+}
+
+//===----------------------------------------------------------------------===//
+// Spill modes
+//===----------------------------------------------------------------------===//
+
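+/// removeBackCopies - Remove the back-copy values in Copies from the
+/// complement interval, erase the corresponding COPY instructions, and trim
+/// or recompute any RegAssign entry that was killed at an erased copy.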
+void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+ LLVM_DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
+ RegAssignMap::iterator AssignI;
+ AssignI.setMap(RegAssign);
+
+ for (const VNInfo *C : Copies) {
+ SlotIndex Def = C->def;
+ MachineInstr *MI = LIS.getInstructionFromIndex(Def);
+ assert(MI && "No instruction for back-copy");
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::iterator MBBI(MI);
+ bool AtBegin;
+ do AtBegin = MBBI == MBB->begin();
+ while (!AtBegin && (--MBBI)->isDebugOrPseudoInstr());
+
+ LLVM_DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
+ LIS.removeVRegDefAt(*LI, Def);
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+
+ // Adjust RegAssign if a register assignment is killed at Def. We want to
+ // avoid calculating the live range of the source register if possible.
+ AssignI.find(Def.getPrevSlot());
+ if (!AssignI.valid() || AssignI.start() >= Def)
+ continue;
+ // If MI doesn't kill the assigned register, just leave it.
+ if (AssignI.stop() != Def)
+ continue;
+ unsigned RegIdx = AssignI.value();
+      // We could have hoisted a back-copy right after another back-copy. As a
+      // result, MBBI points to a copy instruction which is actually dead now.
+      // We cannot set the stop to MBBI's index because it would equal the
+      // start, and the interval map does not support that.
+ SlotIndex Kill =
+ AtBegin ? SlotIndex() : LIS.getInstructionIndex(*MBBI).getRegSlot();
+ if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg()) ||
+ Kill <= AssignI.start()) {
+ LLVM_DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx
+ << '\n');
+ forceRecompute(RegIdx, *Edit->getParent().getVNInfoAt(Def));
+ } else {
+ LLVM_DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
+ AssignI.setStop(Kill);
+ }
+ }
+}
+
+MachineBasicBlock*
+SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB) {
+ if (MBB == DefMBB)
+ return MBB;
+ assert(MDT.dominates(DefMBB, MBB) && "MBB must be dominated by the def.");
+
+ const MachineLoopInfo &Loops = SA.Loops;
+ const MachineLoop *DefLoop = Loops.getLoopFor(DefMBB);
+ MachineDomTreeNode *DefDomNode = MDT[DefMBB];
+
+ // Best candidate so far.
+ MachineBasicBlock *BestMBB = MBB;
+ unsigned BestDepth = std::numeric_limits<unsigned>::max();
+
+ while (true) {
+ const MachineLoop *Loop = Loops.getLoopFor(MBB);
+
+    // MBB isn't in a loop, so it doesn't get any better. All dominators have
+    // a higher frequency by definition.
+ if (!Loop) {
+ LLVM_DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB)
+ << " dominates " << printMBBReference(*MBB)
+ << " at depth 0\n");
+ return MBB;
+ }
+
+ // We'll never be able to exit the DefLoop.
+ if (Loop == DefLoop) {
+ LLVM_DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB)
+ << " dominates " << printMBBReference(*MBB)
+ << " in the same loop\n");
+ return MBB;
+ }
+
+ // Least busy dominator seen so far.
+ unsigned Depth = Loop->getLoopDepth();
+ if (Depth < BestDepth) {
+ BestMBB = MBB;
+ BestDepth = Depth;
+ LLVM_DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB)
+ << " dominates " << printMBBReference(*MBB)
+ << " at depth " << Depth << '\n');
+ }
+
+ // Leave loop by going to the immediate dominator of the loop header.
+ // This is a bigger stride than simply walking up the dominator tree.
+ MachineDomTreeNode *IDom = MDT[Loop->getHeader()]->getIDom();
+
+ // Too far up the dominator tree?
+ if (!IDom || !MDT.dominates(DefDomNode, IDom))
+ return BestMBB;
+
+ MBB = IDom->getBlock();
+ }
+}
+
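+/// For each parent value in NotToHoistSet, find back-copies that are
+/// dominated by another back-copy of the same value and add them to
+/// BackCopies so they can be removed.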
+void SplitEditor::computeRedundantBackCopies(
+ DenseSet<unsigned> &NotToHoistSet, SmallVectorImpl<VNInfo *> &BackCopies) {
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+ const LiveInterval *Parent = &Edit->getParent();
+ SmallVector<SmallPtrSet<VNInfo *, 8>, 8> EqualVNs(Parent->getNumValNums());
+ SmallPtrSet<VNInfo *, 8> DominatedVNIs;
+
+ // Aggregate VNIs having the same value as ParentVNI.
+ for (VNInfo *VNI : LI->valnos) {
+ if (VNI->isUnused())
+ continue;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ EqualVNs[ParentVNI->id].insert(VNI);
+ }
+
+  // For each ParentVNI's aggregated VNIs, collect the dominated, i.e.
+  // redundant, VNIs into BackCopies.
+ for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+ const VNInfo *ParentVNI = Parent->getValNumInfo(i);
+ if (!NotToHoistSet.count(ParentVNI->id))
+ continue;
+ SmallPtrSetIterator<VNInfo *> It1 = EqualVNs[ParentVNI->id].begin();
+ SmallPtrSetIterator<VNInfo *> It2 = It1;
+ for (; It1 != EqualVNs[ParentVNI->id].end(); ++It1) {
+ It2 = It1;
+ for (++It2; It2 != EqualVNs[ParentVNI->id].end(); ++It2) {
+ if (DominatedVNIs.count(*It1) || DominatedVNIs.count(*It2))
+ continue;
+
+ MachineBasicBlock *MBB1 = LIS.getMBBFromIndex((*It1)->def);
+ MachineBasicBlock *MBB2 = LIS.getMBBFromIndex((*It2)->def);
+ if (MBB1 == MBB2) {
+ DominatedVNIs.insert((*It1)->def < (*It2)->def ? (*It2) : (*It1));
+ } else if (MDT.dominates(MBB1, MBB2)) {
+ DominatedVNIs.insert(*It2);
+ } else if (MDT.dominates(MBB2, MBB1)) {
+ DominatedVNIs.insert(*It1);
+ }
+ }
+ }
+ if (!DominatedVNIs.empty()) {
+ forceRecompute(0, *ParentVNI);
+ append_range(BackCopies, DominatedVNIs);
+ DominatedVNIs.clear();
+ }
+ }
+}
+
+/// For SM_Size mode, find a common dominator for all the back-copies of the
+/// same ParentVNI and hoist the back-copies to the dominator BB.
+/// For SM_Speed mode, if the common dominator is hot and hoisting is not
+/// beneficial, simply remove the dominated back-copies of the same ParentVNI.
+void SplitEditor::hoistCopies() {
+ // Get the complement interval, always RegIdx 0.
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+ const LiveInterval *Parent = &Edit->getParent();
+
+ // Track the nearest common dominator for all back-copies for each ParentVNI,
+ // indexed by ParentVNI->id.
+ using DomPair = std::pair<MachineBasicBlock *, SlotIndex>;
+ SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+ // The total cost of all the back-copies for each ParentVNI.
+ SmallVector<BlockFrequency, 8> Costs(Parent->getNumValNums());
+  // The set of ParentVNI->ids for which hoisting back-copies is not
+  // beneficial in speed mode.
+ DenseSet<unsigned> NotToHoistSet;
+
+ // Find the nearest common dominator for parent values with multiple
+ // back-copies. If a single back-copy dominates, put it in DomPair.second.
+ for (VNInfo *VNI : LI->valnos) {
+ if (VNI->isUnused())
+ continue;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ assert(ParentVNI && "Parent not live at complement def");
+
+ // Don't hoist remats. The complement is probably going to disappear
+ // completely anyway.
+ if (Edit->didRematerialize(ParentVNI))
+ continue;
+
+ MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def);
+
+ DomPair &Dom = NearestDom[ParentVNI->id];
+
+ // Keep directly defined parent values. This is either a PHI or an
+ // instruction in the complement range. All other copies of ParentVNI
+ // should be eliminated.
+ if (VNI->def == ParentVNI->def) {
+ LLVM_DEBUG(dbgs() << "Direct complement def at " << VNI->def << '\n');
+ Dom = DomPair(ValMBB, VNI->def);
+ continue;
+ }
+ // Skip the singly mapped values. There is nothing to gain from hoisting a
+ // single back-copy.
+ if (Values.lookup(std::make_pair(0, ParentVNI->id)).getPointer()) {
+ LLVM_DEBUG(dbgs() << "Single complement def at " << VNI->def << '\n');
+ continue;
+ }
+
+ if (!Dom.first) {
+ // First time we see ParentVNI. VNI dominates itself.
+ Dom = DomPair(ValMBB, VNI->def);
+ } else if (Dom.first == ValMBB) {
+ // Two defs in the same block. Pick the earlier def.
+ if (!Dom.second.isValid() || VNI->def < Dom.second)
+ Dom.second = VNI->def;
+ } else {
+ // Different basic blocks. Check if one dominates.
+ MachineBasicBlock *Near =
+ MDT.findNearestCommonDominator(Dom.first, ValMBB);
+ if (Near == ValMBB)
+ // Def ValMBB dominates.
+ Dom = DomPair(ValMBB, VNI->def);
+ else if (Near != Dom.first)
+        // Neither dominates. Hoist to the common dominator; a new def is
+        // needed.
+ Dom = DomPair(Near, SlotIndex());
+ Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB);
+ }
+
+ LLVM_DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@'
+ << VNI->def << " for parent " << ParentVNI->id << '@'
+ << ParentVNI->def << " hoist to "
+ << printMBBReference(*Dom.first) << ' ' << Dom.second
+ << '\n');
+ }
+
+ // Insert the hoisted copies.
+ for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+ DomPair &Dom = NearestDom[i];
+ if (!Dom.first || Dom.second.isValid())
+ continue;
+ // This value needs a hoisted copy inserted at the end of Dom.first.
+ const VNInfo *ParentVNI = Parent->getValNumInfo(i);
+ MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def);
+ // Get a less loopy dominator than Dom.first.
+ Dom.first = findShallowDominator(Dom.first, DefMBB);
+ if (SpillMode == SM_Speed &&
+ MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]) {
+ NotToHoistSet.insert(ParentVNI->id);
+ continue;
+ }
+ SlotIndex LSP = SA.getLastSplitPoint(Dom.first);
+ if (LSP <= ParentVNI->def) {
+ NotToHoistSet.insert(ParentVNI->id);
+ continue;
+ }
+ Dom.second = defFromParent(0, ParentVNI, LSP, *Dom.first,
+ SA.getLastSplitPointIter(Dom.first))->def;
+ }
+
+ // Remove redundant back-copies that are now known to be dominated by another
+ // def with the same value.
+ SmallVector<VNInfo*, 8> BackCopies;
+ for (VNInfo *VNI : LI->valnos) {
+ if (VNI->isUnused())
+ continue;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ const DomPair &Dom = NearestDom[ParentVNI->id];
+ if (!Dom.first || Dom.second == VNI->def ||
+ NotToHoistSet.count(ParentVNI->id))
+ continue;
+ BackCopies.push_back(VNI);
+ forceRecompute(0, *ParentVNI);
+ }
+
+ // If it is not beneficial to hoist all the BackCopies, simply remove
+ // redundant BackCopies in speed mode.
+ if (SpillMode == SM_Speed && !NotToHoistSet.empty())
+ computeRedundantBackCopies(NotToHoistSet, BackCopies);
+
+ removeBackCopies(BackCopies);
+}
+
+/// transferValues - Transfer all possible values to the new live ranges.
+/// Values that were rematerialized are left alone, they need LICalc.extend().
+bool SplitEditor::transferValues() {
+ bool Skipped = false;
+ RegAssignMap::const_iterator AssignI = RegAssign.begin();
+ for (const LiveRange::Segment &S : Edit->getParent()) {
+ LLVM_DEBUG(dbgs() << " blit " << S << ':');
+ VNInfo *ParentVNI = S.valno;
+ // RegAssign has holes where RegIdx 0 should be used.
+ SlotIndex Start = S.start;
+ AssignI.advanceTo(Start);
+ do {
+ unsigned RegIdx;
+ SlotIndex End = S.end;
+ if (!AssignI.valid()) {
+ RegIdx = 0;
+ } else if (AssignI.start() <= Start) {
+ RegIdx = AssignI.value();
+ if (AssignI.stop() < End) {
+ End = AssignI.stop();
+ ++AssignI;
+ }
+ } else {
+ RegIdx = 0;
+ End = std::min(End, AssignI.start());
+ }
+
+ // The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
+ LLVM_DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx << '('
+ << printReg(Edit->get(RegIdx)) << ')');
+ LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
+
+ // Check for a simply defined value that can be blitted directly.
+ ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
+ if (VNInfo *VNI = VFP.getPointer()) {
+ LLVM_DEBUG(dbgs() << ':' << VNI->id);
+ LI.addSegment(LiveInterval::Segment(Start, End, VNI));
+ Start = End;
+ continue;
+ }
+
+ // Skip values with forced recomputation.
+ if (VFP.getInt()) {
+ LLVM_DEBUG(dbgs() << "(recalc)");
+ Skipped = true;
+ Start = End;
+ continue;
+ }
+
+ LiveIntervalCalc &LIC = getLICalc(RegIdx);
+
+ // This value has multiple defs in RegIdx, but it wasn't rematerialized,
+ // so the live range is accurate. Add live-in blocks in [Start;End) to the
+ // LiveInBlocks.
+ MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator();
+ SlotIndex BlockStart, BlockEnd;
+ std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(&*MBB);
+
+ // The first block may be live-in, or it may have its own def.
+ if (Start != BlockStart) {
+ VNInfo *VNI = LI.extendInBlock(BlockStart, std::min(BlockEnd, End));
+ assert(VNI && "Missing def for complex mapped value");
+ LLVM_DEBUG(dbgs() << ':' << VNI->id << "*" << printMBBReference(*MBB));
+ // MBB has its own def. Is it also live-out?
+ if (BlockEnd <= End)
+ LIC.setLiveOutValue(&*MBB, VNI);
+
+ // Skip to the next block for live-in.
+ ++MBB;
+ BlockStart = BlockEnd;
+ }
+
+ // Handle the live-in blocks covered by [Start;End).
+ assert(Start <= BlockStart && "Expected live-in block");
+ while (BlockStart < End) {
+ LLVM_DEBUG(dbgs() << ">" << printMBBReference(*MBB));
+ BlockEnd = LIS.getMBBEndIdx(&*MBB);
+ if (BlockStart == ParentVNI->def) {
+ // This block has the def of a parent PHI, so it isn't live-in.
+ assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
+ VNInfo *VNI = LI.extendInBlock(BlockStart, std::min(BlockEnd, End));
+ assert(VNI && "Missing def for complex mapped parent PHI");
+ if (End >= BlockEnd)
+ LIC.setLiveOutValue(&*MBB, VNI); // Live-out as well.
+ } else {
+ // This block needs a live-in value. The last block covered may not
+ // be live-out.
+ if (End < BlockEnd)
+ LIC.addLiveInBlock(LI, MDT[&*MBB], End);
+ else {
+ // Live-through, and we don't know the value.
+ LIC.addLiveInBlock(LI, MDT[&*MBB]);
+ LIC.setLiveOutValue(&*MBB, nullptr);
+ }
+ }
+ BlockStart = BlockEnd;
+ ++MBB;
+ }
+ Start = End;
+ } while (Start != S.end);
+ LLVM_DEBUG(dbgs() << '\n');
+ }
+
+ LICalc[0].calculateValues();
+ if (SpillMode)
+ LICalc[1].calculateValues();
+
+ return Skipped;
+}
+
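+/// removeDeadSegment - Return true if LR has no segment containing Def, or if
+/// the segment containing Def is a dead def (it ends at Def's dead slot), in
+/// which case the dead segment is also removed. Return false if the value
+/// defined at Def is live past its dead slot.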
+static bool removeDeadSegment(SlotIndex Def, LiveRange &LR) {
+ const LiveRange::Segment *Seg = LR.getSegmentContaining(Def);
+ if (Seg == nullptr)
+ return true;
+ if (Seg->end != Def.getDeadSlot())
+ return false;
+ // This is a dead PHI. Remove it.
+ LR.removeSegment(*Seg, true);
+ return true;
+}
+
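+/// Extend LR to the end of each predecessor of B where the parent value,
+/// restricted to the lane mask LM, is live out. This supplies the incoming
+/// values for the PHI def at the start of B.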
+void SplitEditor::extendPHIRange(MachineBasicBlock &B, LiveIntervalCalc &LIC,
+ LiveRange &LR, LaneBitmask LM,
+ ArrayRef<SlotIndex> Undefs) {
+ for (MachineBasicBlock *P : B.predecessors()) {
+ SlotIndex End = LIS.getMBBEndIdx(P);
+ SlotIndex LastUse = End.getPrevSlot();
+ // The predecessor may not have a live-out value. That is OK, like an
+ // undef PHI operand.
+ const LiveInterval &PLI = Edit->getParent();
+ // Need the cast because the inputs to ?: would otherwise be deemed
+ // "incompatible": SubRange vs LiveInterval.
+ const LiveRange &PSR = !LM.all() ? getSubRangeForMaskExact(LM, PLI)
+ : static_cast<const LiveRange &>(PLI);
+ if (PSR.liveAt(LastUse))
+ LIC.extend(LR, End, /*PhysReg=*/0, Undefs);
+ }
+}
+
+void SplitEditor::extendPHIKillRanges() {
+ // Extend live ranges to be live-out for successor PHI values.
+
+ // Visit each PHI def slot in the parent live interval. If the def is dead,
+ // remove it. Otherwise, extend the live interval to reach the end indexes
+ // of all predecessor blocks.
+
+ const LiveInterval &ParentLI = Edit->getParent();
+ for (const VNInfo *V : ParentLI.valnos) {
+ if (V->isUnused() || !V->isPHIDef())
+ continue;
+
+ unsigned RegIdx = RegAssign.lookup(V->def);
+ LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
+ LiveIntervalCalc &LIC = getLICalc(RegIdx);
+ MachineBasicBlock &B = *LIS.getMBBFromIndex(V->def);
+ if (!removeDeadSegment(V->def, LI))
+ extendPHIRange(B, LIC, LI, LaneBitmask::getAll(), /*Undefs=*/{});
+ }
+
+ SmallVector<SlotIndex, 4> Undefs;
+ LiveIntervalCalc SubLIC;
+
+ for (const LiveInterval::SubRange &PS : ParentLI.subranges()) {
+ for (const VNInfo *V : PS.valnos) {
+ if (V->isUnused() || !V->isPHIDef())
+ continue;
+ unsigned RegIdx = RegAssign.lookup(V->def);
+ LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
+ LiveInterval::SubRange &S = getSubRangeForMaskExact(PS.LaneMask, LI);
+ if (removeDeadSegment(V->def, S))
+ continue;
+
+ MachineBasicBlock &B = *LIS.getMBBFromIndex(V->def);
+ SubLIC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
+ Undefs.clear();
+ LI.computeSubRangeUndefs(Undefs, PS.LaneMask, MRI, *LIS.getSlotIndexes());
+ extendPHIRange(B, SubLIC, S, PS.LaneMask, Undefs);
+ }
+ }
+}
+
+/// rewriteAssigned - Rewrite all uses of Edit->getReg().
+void SplitEditor::rewriteAssigned(bool ExtendRanges) {
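+  // A use of Edit->getReg() whose subrange liveness extension is deferred
+  // until all operands of the register have been rewritten (see the subrange
+  // comment further down).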
+ struct ExtPoint {
+ ExtPoint(const MachineOperand &O, unsigned R, SlotIndex N)
+ : MO(O), RegIdx(R), Next(N) {}
+
+ MachineOperand MO;
+ unsigned RegIdx;
+ SlotIndex Next;
+ };
+
+ SmallVector<ExtPoint,4> ExtPoints;
+
+ for (MachineOperand &MO :
+ llvm::make_early_inc_range(MRI.reg_operands(Edit->getReg()))) {
+ MachineInstr *MI = MO.getParent();
+ // LiveDebugVariables should have handled all DBG_VALUE instructions.
+ if (MI->isDebugValue()) {
+ LLVM_DEBUG(dbgs() << "Zapping " << *MI);
+ MO.setReg(0);
+ continue;
+ }
+
+ // <undef> operands don't really read the register, so it doesn't matter
+ // which register we choose. When the use operand is tied to a def, we must
+ // use the same register as the def, so just do that always.
+ SlotIndex Idx = LIS.getInstructionIndex(*MI);
+ if (MO.isDef() || MO.isUndef())
+ Idx = Idx.getRegSlot(MO.isEarlyClobber());
+
+ // Rewrite to the mapped register at Idx.
+ unsigned RegIdx = RegAssign.lookup(Idx);
+ LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
+ MO.setReg(LI.reg());
+ LLVM_DEBUG(dbgs() << " rewr " << printMBBReference(*MI->getParent())
+ << '\t' << Idx << ':' << RegIdx << '\t' << *MI);
+
+ // Extend liveness to Idx if the instruction reads reg.
+ if (!ExtendRanges || MO.isUndef())
+ continue;
+
+ // Skip instructions that don't read Reg.
+ if (MO.isDef()) {
+ if (!MO.getSubReg() && !MO.isEarlyClobber())
+ continue;
+ // We may want to extend a live range for a partial redef, or for a use
+ // tied to an early clobber.
+ if (!Edit->getParent().liveAt(Idx.getPrevSlot()))
+ continue;
+ } else {
+ assert(MO.isUse());
+ bool IsEarlyClobber = false;
+ if (MO.isTied()) {
+        // We want to extend the live range to the `e` slot rather than the
+        // `r` slot if the tied def is an early clobber, because the `e` slot
+        // is already contained in the live range of the early-clobber
+        // tied-def operand. For example:
+        //   0  %0 = ...
+        //   16 early-clobber %0 = Op %0 (tied-def 0), ...
+        //   32 ... = Op %0
+        // Before the extension:
+        //   %0 = [0r, 0d) [16e, 32d)
+        // Here we want to extend [0r, 0d) up to 16e, not 16r: if we used 16r,
+        // nothing would be extended because that point is already contained
+        // in [16e, 32d).
+ unsigned OpIdx = MO.getOperandNo();
+ unsigned DefOpIdx = MI->findTiedOperandIdx(OpIdx);
+ const MachineOperand &DefOp = MI->getOperand(DefOpIdx);
+ IsEarlyClobber = DefOp.isEarlyClobber();
+ }
+
+ Idx = Idx.getRegSlot(IsEarlyClobber);
+ }
+
+ SlotIndex Next = Idx;
+ if (LI.hasSubRanges()) {
+ // We have to delay extending subranges until we have seen all operands
+ // defining the register. This is because a <def,read-undef> operand
+ // will create an "undef" point, and we cannot extend any subranges
+ // until all of them have been accounted for.
+ if (MO.isUse())
+ ExtPoints.push_back(ExtPoint(MO, RegIdx, Next));
+ } else {
+ LiveIntervalCalc &LIC = getLICalc(RegIdx);
+ LIC.extend(LI, Next, 0, ArrayRef<SlotIndex>());
+ }
+ }
+
+ for (ExtPoint &EP : ExtPoints) {
+ LiveInterval &LI = LIS.getInterval(Edit->get(EP.RegIdx));
+ assert(LI.hasSubRanges());
+
+ LiveIntervalCalc SubLIC;
+ Register Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg();
+ LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub)
+ : MRI.getMaxLaneMaskForVReg(Reg);
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & LM).none())
+ continue;
+      // The problem here is that the new register may have been created for
+      // a partially defined original register. For example:
+ // %0:subreg_hireg<def,read-undef> = ...
+ // ...
+ // %1 = COPY %0
+ if (S.empty())
+ continue;
+ SubLIC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
+ SmallVector<SlotIndex, 4> Undefs;
+ LI.computeSubRangeUndefs(Undefs, S.LaneMask, MRI, *LIS.getSlotIndexes());
+ SubLIC.extend(S, EP.Next, 0, Undefs);
+ }
+ }
+
+ for (Register R : *Edit) {
+ LiveInterval &LI = LIS.getInterval(R);
+ if (!LI.hasSubRanges())
+ continue;
+ LI.clear();
+ LI.removeEmptySubRanges();
+ LIS.constructMainRangeFromSubranges(LI);
+ }
+}
+
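+/// deleteRematVictims - Find defs in the new intervals whose value was
+/// rematerialized everywhere, leaving only a dead def behind. Mark the
+/// register dead on the defining instruction and queue the instruction for
+/// deletion once all of its defs are dead.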
+void SplitEditor::deleteRematVictims() {
+ SmallVector<MachineInstr*, 8> Dead;
+ for (const Register &R : *Edit) {
+ LiveInterval *LI = &LIS.getInterval(R);
+ for (const LiveRange::Segment &S : LI->segments) {
+ // Dead defs end at the dead slot.
+ if (S.end != S.valno->def.getDeadSlot())
+ continue;
+ if (S.valno->isPHIDef())
+ continue;
+ MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def);
+ assert(MI && "Missing instruction for dead def");
+ MI->addRegisterDead(LI->reg(), &TRI);
+
+ if (!MI->allDefsAreDead())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "All defs dead: " << *MI);
+ Dead.push_back(MI);
+ }
+ }
+
+ if (Dead.empty())
+ return;
+
+ Edit->eliminateDeadDefs(Dead, std::nullopt);
+}
+
+void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
+ // Fast-path for common case.
+ if (!ParentVNI.isPHIDef()) {
+ for (unsigned I = 0, E = Edit->size(); I != E; ++I)
+ forceRecompute(I, ParentVNI);
+ return;
+ }
+
+ // Trace value through phis.
+ SmallPtrSet<const VNInfo *, 8> Visited; ///< whether VNI was/is in worklist.
+ SmallVector<const VNInfo *, 4> WorkList;
+ Visited.insert(&ParentVNI);
+ WorkList.push_back(&ParentVNI);
+
+ const LiveInterval &ParentLI = Edit->getParent();
+ const SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ do {
+ const VNInfo &VNI = *WorkList.back();
+ WorkList.pop_back();
+ for (unsigned I = 0, E = Edit->size(); I != E; ++I)
+ forceRecompute(I, VNI);
+ if (!VNI.isPHIDef())
+ continue;
+
+ MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(VNI.def);
+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+ SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
+ VNInfo *PredVNI = ParentLI.getVNInfoBefore(PredEnd);
+ assert(PredVNI && "Value available in PhiVNI predecessor");
+ if (Visited.insert(PredVNI).second)
+ WorkList.push_back(PredVNI);
+ }
+ } while(!WorkList.empty());
+}
+
+void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
+ ++NumFinished;
+
+ // At this point, the live intervals in Edit contain VNInfos corresponding to
+ // the inserted copies.
+
+ // Add the original defs from the parent interval.
+ for (const VNInfo *ParentVNI : Edit->getParent().valnos) {
+ if (ParentVNI->isUnused())
+ continue;
+ unsigned RegIdx = RegAssign.lookup(ParentVNI->def);
+ defValue(RegIdx, ParentVNI, ParentVNI->def, true);
+
+ // Force rematted values to be recomputed everywhere.
+ // The new live ranges may be truncated.
+ if (Edit->didRematerialize(ParentVNI))
+ forceRecomputeVNI(*ParentVNI);
+ }
+
+ // Hoist back-copies to the complement interval when in spill mode.
+ switch (SpillMode) {
+ case SM_Partition:
+ // Leave all back-copies as is.
+ break;
+ case SM_Size:
+ case SM_Speed:
+ // hoistCopies will behave differently between size and speed.
+ hoistCopies();
+ }
+
+ // Transfer the simply mapped values, check if any are skipped.
+ bool Skipped = transferValues();
+
+ // Rewrite virtual registers, possibly extending ranges.
+ rewriteAssigned(Skipped);
+
+ if (Skipped)
+ extendPHIKillRanges();
+ else
+ ++NumSimple;
+
+ // Delete defs that were rematted everywhere.
+ if (Skipped)
+ deleteRematVictims();
+
+ // Get rid of unused values and set phi-kill flags.
+ for (Register Reg : *Edit) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ LI.removeEmptySubRanges();
+ LI.RenumberValues();
+ }
+
+ // Provide a reverse mapping from original indices to Edit ranges.
+ if (LRMap) {
+ auto Seq = llvm::seq<unsigned>(0, Edit->size());
+ LRMap->assign(Seq.begin(), Seq.end());
+ }
+
+ // Now check if any registers were separated into multiple components.
+ ConnectedVNInfoEqClasses ConEQ(LIS);
+ for (unsigned i = 0, e = Edit->size(); i != e; ++i) {
+    // Don't use iterators; they are invalidated by create() below.
+ Register VReg = Edit->get(i);
+ LiveInterval &LI = LIS.getInterval(VReg);
+ SmallVector<LiveInterval*, 8> SplitLIs;
+ LIS.splitSeparateComponents(LI, SplitLIs);
+ Register Original = VRM.getOriginal(VReg);
+ for (LiveInterval *SplitLI : SplitLIs)
+ VRM.setIsSplitFromReg(SplitLI->reg(), Original);
+
+ // The new intervals all map back to i.
+ if (LRMap)
+ LRMap->resize(Edit->size(), i);
+ }
+
+ // Calculate spill weight and allocation hints for new intervals.
+ Edit->calculateRegClassAndHint(VRM.getMachineFunction(), VRAI);
+
+ assert(!LRMap || LRMap->size() == Edit->size());
+}
+
+//===----------------------------------------------------------------------===//
+// Single Block Splitting
+//===----------------------------------------------------------------------===//
+
+bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI,
+ bool SingleInstrs) const {
+ // Always split for multiple instructions.
+ if (!BI.isOneInstr())
+ return true;
+ // Don't split for single instructions unless explicitly requested.
+ if (!SingleInstrs)
+ return false;
+ // Splitting a live-through range always makes progress.
+ if (BI.LiveIn && BI.LiveOut)
+ return true;
+ // No point in isolating a copy. It has no register class constraints.
+ if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike())
+ return false;
+ // Finally, don't isolate an end point that was created by earlier splits.
+ return isOriginalEndpoint(BI.FirstInstr);
+}
+
+void SplitEditor::splitSingleBlock(const SplitAnalysis::BlockInfo &BI) {
+ openIntv();
+ SlotIndex LastSplitPoint = SA.getLastSplitPoint(BI.MBB);
+ SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstInstr,
+ LastSplitPoint));
+ if (!BI.LiveOut || BI.LastInstr < LastSplitPoint) {
+ useIntv(SegStart, leaveIntvAfter(BI.LastInstr));
+ } else {
+ // The last use is after the last valid split point.
+ SlotIndex SegStop = leaveIntvBefore(LastSplitPoint);
+ useIntv(SegStart, SegStop);
+ overlapIntv(SegStop, BI.LastInstr);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Global Live Range Splitting Support
+//===----------------------------------------------------------------------===//
+
+// These methods support global live range splitting, where a global
+// algorithm decides which interval to use for each CFG edge. They insert
+// split points and color intervals in basic blocks while avoiding
+// interference.
+//
+// Note that splitSingleBlock is also useful for blocks where both CFG edges
+// are on the stack.
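+//
+// As a rough usage sketch (not code from this file; SA, LIS, VRM, MDT, MBFI,
+// VRAI and LRE stand for the usual analyses and a LiveRangeEdit, and UseIdx /
+// LastUseIdx are placeholder slot indexes), a client typically drives the
+// editor like this:
+//
+//   SplitEditor SE(SA, LIS, VRM, MDT, MBFI, VRAI);
+//   SE.reset(LRE, SplitEditor::SM_Partition);
+//   SE.openIntv();
+//   SlotIndex Start = SE.enterIntvBefore(UseIdx);
+//   SE.useIntv(Start, SE.leaveIntvAfter(LastUseIdx));
+//   SE.finish();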
+
+void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
+ unsigned IntvIn, SlotIndex LeaveBefore,
+ unsigned IntvOut, SlotIndex EnterAfter){
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(MBBNum);
+
+ LLVM_DEBUG(dbgs() << "%bb." << MBBNum << " [" << Start << ';' << Stop
+ << ") intf " << LeaveBefore << '-' << EnterAfter
+ << ", live-through " << IntvIn << " -> " << IntvOut);
+
+ assert((IntvIn || IntvOut) && "Use splitSingleBlock for isolated blocks");
+
+ assert((!LeaveBefore || LeaveBefore < Stop) && "Interference after block");
+ assert((!IntvIn || !LeaveBefore || LeaveBefore > Start) && "Impossible intf");
+ assert((!EnterAfter || EnterAfter >= Start) && "Interference before block");
+
+ MachineBasicBlock *MBB = VRM.getMachineFunction().getBlockNumbered(MBBNum);
+
+ if (!IntvOut) {
+ LLVM_DEBUG(dbgs() << ", spill on entry.\n");
+ //
+ // <<<<<<<<< Possible LeaveBefore interference.
+ // |-----------| Live through.
+ // -____________ Spill on entry.
+ //
+ selectIntv(IntvIn);
+ SlotIndex Idx = leaveIntvAtTop(*MBB);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+ (void)Idx;
+ return;
+ }
+
+ if (!IntvIn) {
+ LLVM_DEBUG(dbgs() << ", reload on exit.\n");
+ //
+ // >>>>>>> Possible EnterAfter interference.
+ // |-----------| Live through.
+ // ___________-- Reload on exit.
+ //
+ selectIntv(IntvOut);
+ SlotIndex Idx = enterIntvAtEnd(*MBB);
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+ (void)Idx;
+ return;
+ }
+
+ if (IntvIn == IntvOut && !LeaveBefore && !EnterAfter) {
+ LLVM_DEBUG(dbgs() << ", straight through.\n");
+ //
+ // |-----------| Live through.
+ // ------------- Straight through, same intv, no interference.
+ //
+ selectIntv(IntvOut);
+ useIntv(Start, Stop);
+ return;
+ }
+
+ // We cannot legally insert splits after LSP.
+ SlotIndex LSP = SA.getLastSplitPoint(MBBNum);
+ assert((!IntvOut || !EnterAfter || EnterAfter < LSP) && "Impossible intf");
+
+ if (IntvIn != IntvOut && (!LeaveBefore || !EnterAfter ||
+ LeaveBefore.getBaseIndex() > EnterAfter.getBoundaryIndex())) {
+ LLVM_DEBUG(dbgs() << ", switch avoiding interference.\n");
+ //
+ // >>>> <<<< Non-overlapping EnterAfter/LeaveBefore interference.
+ // |-----------| Live through.
+ // ------======= Switch intervals between interference.
+ //
+ selectIntv(IntvOut);
+ SlotIndex Idx;
+ if (LeaveBefore && LeaveBefore < LSP) {
+ Idx = enterIntvBefore(LeaveBefore);
+ useIntv(Idx, Stop);
+ } else {
+ Idx = enterIntvAtEnd(*MBB);
+ }
+ selectIntv(IntvIn);
+ useIntv(Start, Idx);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << ", create local intv for interference.\n");
+ //
+ // >>><><><><<<< Overlapping EnterAfter/LeaveBefore interference.
+ // |-----------| Live through.
+ // ==---------== Switch intervals before/after interference.
+ //
+ assert(LeaveBefore <= EnterAfter && "Missed case");
+
+ selectIntv(IntvOut);
+ SlotIndex Idx = enterIntvAfter(EnterAfter);
+ useIntv(Idx, Stop);
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+
+ selectIntv(IntvIn);
+ Idx = leaveIntvBefore(LeaveBefore);
+ useIntv(Start, Idx);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+}
+
+void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
+ unsigned IntvIn, SlotIndex LeaveBefore) {
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+ LLVM_DEBUG(dbgs() << printMBBReference(*BI.MBB) << " [" << Start << ';'
+ << Stop << "), uses " << BI.FirstInstr << '-'
+ << BI.LastInstr << ", reg-in " << IntvIn
+ << ", leave before " << LeaveBefore
+ << (BI.LiveOut ? ", stack-out" : ", killed in block"));
+
+ assert(IntvIn && "Must have register in");
+ assert(BI.LiveIn && "Must be live-in");
+ assert((!LeaveBefore || LeaveBefore > Start) && "Bad interference");
+
+ if (!BI.LiveOut && (!LeaveBefore || LeaveBefore >= BI.LastInstr)) {
+ LLVM_DEBUG(dbgs() << " before interference.\n");
+ //
+ // <<< Interference after kill.
+ // |---o---x | Killed in block.
+ // ========= Use IntvIn everywhere.
+ //
+ selectIntv(IntvIn);
+ useIntv(Start, BI.LastInstr);
+ return;
+ }
+
+ SlotIndex LSP = SA.getLastSplitPoint(BI.MBB);
+
+ if (!LeaveBefore || LeaveBefore > BI.LastInstr.getBoundaryIndex()) {
+ //
+ // <<< Possible interference after last use.
+ // |---o---o---| Live-out on stack.
+ // =========____ Leave IntvIn after last use.
+ //
+ // < Interference after last use.
+ // |---o---o--o| Live-out on stack, late last use.
+ // ============ Copy to stack after LSP, overlap IntvIn.
+ // \_____ Stack interval is live-out.
+ //
+ if (BI.LastInstr < LSP) {
+ LLVM_DEBUG(dbgs() << ", spill after last use before interference.\n");
+ selectIntv(IntvIn);
+ SlotIndex Idx = leaveIntvAfter(BI.LastInstr);
+ useIntv(Start, Idx);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+ } else {
+ LLVM_DEBUG(dbgs() << ", spill before last split point.\n");
+ selectIntv(IntvIn);
+ SlotIndex Idx = leaveIntvBefore(LSP);
+ overlapIntv(Idx, BI.LastInstr);
+ useIntv(Start, Idx);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+ }
+ return;
+ }
+
+ // The interference is overlapping somewhere we wanted to use IntvIn. That
+ // means we need to create a local interval that can be allocated a
+ // different register.
+ unsigned LocalIntv = openIntv();
+ (void)LocalIntv;
+ LLVM_DEBUG(dbgs() << ", creating local interval " << LocalIntv << ".\n");
+
+ if (!BI.LiveOut || BI.LastInstr < LSP) {
+ //
+ // <<<<<<< Interference overlapping uses.
+ // |---o---o---| Live-out on stack.
+ // =====----____ Leave IntvIn before interference, then spill.
+ //
+ SlotIndex To = leaveIntvAfter(BI.LastInstr);
+ SlotIndex From = enterIntvBefore(LeaveBefore);
+ useIntv(From, To);
+ selectIntv(IntvIn);
+ useIntv(Start, From);
+ assert((!LeaveBefore || From <= LeaveBefore) && "Interference");
+ return;
+ }
+
+ // <<<<<<< Interference overlapping uses.
+ // |---o---o--o| Live-out on stack, late last use.
+ // =====------- Copy to stack before LSP, overlap LocalIntv.
+ // \_____ Stack interval is live-out.
+ //
+ SlotIndex To = leaveIntvBefore(LSP);
+ overlapIntv(To, BI.LastInstr);
+ SlotIndex From = enterIntvBefore(std::min(To, LeaveBefore));
+ useIntv(From, To);
+ selectIntv(IntvIn);
+ useIntv(Start, From);
+ assert((!LeaveBefore || From <= LeaveBefore) && "Interference");
+}
+
+void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
+ unsigned IntvOut, SlotIndex EnterAfter) {
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+ LLVM_DEBUG(dbgs() << printMBBReference(*BI.MBB) << " [" << Start << ';'
+ << Stop << "), uses " << BI.FirstInstr << '-'
+ << BI.LastInstr << ", reg-out " << IntvOut
+ << ", enter after " << EnterAfter
+ << (BI.LiveIn ? ", stack-in" : ", defined in block"));
+
+ SlotIndex LSP = SA.getLastSplitPoint(BI.MBB);
+
+ assert(IntvOut && "Must have register out");
+ assert(BI.LiveOut && "Must be live-out");
+ assert((!EnterAfter || EnterAfter < LSP) && "Bad interference");
+
+ if (!BI.LiveIn && (!EnterAfter || EnterAfter <= BI.FirstInstr)) {
+ LLVM_DEBUG(dbgs() << " after interference.\n");
+ //
+ // >>>> Interference before def.
+ // | o---o---| Defined in block.
+ // ========= Use IntvOut everywhere.
+ //
+ selectIntv(IntvOut);
+ useIntv(BI.FirstInstr, Stop);
+ return;
+ }
+
+ if (!EnterAfter || EnterAfter < BI.FirstInstr.getBaseIndex()) {
+ LLVM_DEBUG(dbgs() << ", reload after interference.\n");
+ //
+ // >>>> Interference before def.
+ // |---o---o---| Live-through, stack-in.
+ // ____========= Enter IntvOut before first use.
+ //
+ selectIntv(IntvOut);
+ SlotIndex Idx = enterIntvBefore(std::min(LSP, BI.FirstInstr));
+ useIntv(Idx, Stop);
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+ return;
+ }
+
+ // The interference is overlapping somewhere we wanted to use IntvOut. That
+ // means we need to create a local interval that can be allocated a
+ // different register.
+ LLVM_DEBUG(dbgs() << ", interference overlaps uses.\n");
+ //
+ // >>>>>>> Interference overlapping uses.
+ // |---o---o---| Live-through, stack-in.
+ // ____---====== Create local interval for interference range.
+ //
+ selectIntv(IntvOut);
+ SlotIndex Idx = enterIntvAfter(EnterAfter);
+ useIntv(Idx, Stop);
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+
+ openIntv();
+ SlotIndex From = enterIntvBefore(std::min(Idx, BI.FirstInstr));
+ useIntv(From, Idx);
+}
+
+void SplitAnalysis::BlockInfo::print(raw_ostream &OS) const {
+ OS << "{" << printMBBReference(*MBB) << ", "
+ << "uses " << FirstInstr << " to " << LastInstr << ", "
+ << "1st def " << FirstDef << ", "
+ << (LiveIn ? "live in" : "dead in") << ", "
+ << (LiveOut ? "live out" : "dead out") << "}";
+}
+
+void SplitAnalysis::BlockInfo::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SplitKit.h b/contrib/llvm-project/llvm/lib/CodeGen/SplitKit.h
new file mode 100644
index 000000000000..f764ffd4750c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SplitKit.h
@@ -0,0 +1,557 @@
+//===- SplitKit.h - Toolkit for splitting live ranges -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SPLITKIT_H
+#define LLVM_LIB_CODEGEN_SPLITKIT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveIntervalCalc.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Compiler.h"
+#include <utility>
+
+namespace llvm {
+
+class LiveInterval;
+class LiveRange;
+class LiveIntervals;
+class LiveRangeEdit;
+class MachineBlockFrequencyInfo;
+class MachineDominatorTree;
+class MachineLoopInfo;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+class VirtRegMap;
+class VirtRegAuxInfo;
+
+/// Determines the latest safe point in a block in which we can insert a split,
+/// spill, or other instruction related to CurLI.
+class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
+private:
+ const LiveIntervals &LIS;
+
+ /// Last legal insert point in each basic block in the current function.
+ /// The first entry is the first terminator, the second entry is the
+ /// last valid point to insert a split or spill for a variable that is
+ /// live into a landing pad or inlineasm_br successor.
+ SmallVector<std::pair<SlotIndex, SlotIndex>, 8> LastInsertPoint;
+
+ SlotIndex computeLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB);
+
+public:
+ InsertPointAnalysis(const LiveIntervals &lis, unsigned BBNum);
+
+  /// Return the base index of the last valid insert point for \p CurLI in \p MBB.
+ SlotIndex getLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB) {
+ unsigned Num = MBB.getNumber();
+ // Inline the common simple case.
+ if (LastInsertPoint[Num].first.isValid() &&
+ !LastInsertPoint[Num].second.isValid())
+ return LastInsertPoint[Num].first;
+ return computeLastInsertPoint(CurLI, MBB);
+ }
+
+  /// Returns the last insert point as an iterator for \p CurLI in \p MBB.
+ MachineBasicBlock::iterator getLastInsertPointIter(const LiveInterval &CurLI,
+ MachineBasicBlock &MBB);
+
+  /// Return the base index of the first insert point in \p MBB.
+ SlotIndex getFirstInsertPoint(MachineBasicBlock &MBB) {
+ SlotIndex Res = LIS.getMBBStartIdx(&MBB);
+ if (!MBB.empty()) {
+ MachineBasicBlock::iterator MII = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
+ if (MII != MBB.end())
+ Res = LIS.getInstructionIndex(*MII);
+ }
+ return Res;
+ }
+
+};
+
+/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
+/// opportunities.
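+///
+/// A hypothetical sketch of driving the analysis (VRM, LIS, MLI, CurLI and
+/// Candidates are placeholders owned by the caller, not part of this header):
+/// \code
+///   SplitAnalysis SA(VRM, LIS, MLI);
+///   SA.analyze(&CurLI);
+///   SmallVector<SplitAnalysis::BlockInfo, 8> Candidates;
+///   for (const SplitAnalysis::BlockInfo &BI : SA.getUseBlocks())
+///     if (SA.shouldSplitSingleBlock(BI, /*SingleInstrs=*/false))
+///       Candidates.push_back(BI);  // blocks worth isolating via SplitEditor
+/// \endcode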
+class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
+public:
+ const MachineFunction &MF;
+ const VirtRegMap &VRM;
+ const LiveIntervals &LIS;
+ const MachineLoopInfo &Loops;
+ const TargetInstrInfo &TII;
+
+ /// Additional information about basic blocks where the current variable is
+ /// live. Such a block will look like one of these templates:
+ ///
+ /// 1. | o---x | Internal to block. Variable is only live in this block.
+ /// 2. |---x | Live-in, kill.
+ /// 3. | o---| Def, live-out.
+ /// 4. |---x o---| Live-in, kill, def, live-out. Counted by NumGapBlocks.
+ /// 5. |---o---o---| Live-through with uses or defs.
+ /// 6. |-----------| Live-through without uses. Counted by NumThroughBlocks.
+ ///
+ /// Two BlockInfo entries are created for template 4. One for the live-in
+ /// segment, and one for the live-out segment. These entries look as if the
+ /// block were split in the middle where the live range isn't live.
+ ///
+ /// Live-through blocks without any uses don't get BlockInfo entries. They
+ /// are simply listed in ThroughBlocks instead.
+ ///
+ struct BlockInfo {
+ MachineBasicBlock *MBB;
+ SlotIndex FirstInstr; ///< First instr accessing current reg.
+ SlotIndex LastInstr; ///< Last instr accessing current reg.
+ SlotIndex FirstDef; ///< First non-phi valno->def, or SlotIndex().
+ bool LiveIn; ///< Current reg is live in.
+ bool LiveOut; ///< Current reg is live out.
+
+ /// isOneInstr - Returns true when this BlockInfo describes a single
+ /// instruction.
+ bool isOneInstr() const {
+ return SlotIndex::isSameInstr(FirstInstr, LastInstr);
+ }
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+ };
+
+private:
+ // Current live interval.
+ const LiveInterval *CurLI = nullptr;
+
+ /// Insert Point Analysis.
+ InsertPointAnalysis IPA;
+
+ // Sorted slot indexes of using instructions.
+ SmallVector<SlotIndex, 8> UseSlots;
+
+ /// UseBlocks - Blocks where CurLI has uses.
+ SmallVector<BlockInfo, 8> UseBlocks;
+
+ /// NumGapBlocks - Number of duplicate entries in UseBlocks for blocks where
+ /// the live range has a gap.
+ unsigned NumGapBlocks = 0u;
+
+ /// ThroughBlocks - Block numbers where CurLI is live through without uses.
+ BitVector ThroughBlocks;
+
+ /// NumThroughBlocks - Number of live-through blocks.
+ unsigned NumThroughBlocks = 0u;
+
+  // Summarize statistics by counting instructions using CurLI.
+ void analyzeUses();
+
+ /// calcLiveBlockInfo - Compute per-block information about CurLI.
+ void calcLiveBlockInfo();
+
+public:
+ SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
+ const MachineLoopInfo &mli);
+
+ /// analyze - set CurLI to the specified interval, and analyze how it may be
+ /// split.
+ void analyze(const LiveInterval *li);
+
+ /// clear - clear all data structures so SplitAnalysis is ready to analyze a
+ /// new interval.
+ void clear();
+
+ /// getParent - Return the last analyzed interval.
+ const LiveInterval &getParent() const { return *CurLI; }
+
+ /// isOriginalEndpoint - Return true if the original live range was killed or
+ /// (re-)defined at Idx. Idx should be the 'def' slot for a normal kill/def,
+ /// and 'use' for an early-clobber def.
+ /// This can be used to recognize code inserted by earlier live range
+ /// splitting.
+ bool isOriginalEndpoint(SlotIndex Idx) const;
+
+ /// getUseSlots - Return an array of SlotIndexes of instructions using CurLI.
+  /// This includes both use and def operands, at most one entry per instruction.
+ ArrayRef<SlotIndex> getUseSlots() const { return UseSlots; }
+
+ /// getUseBlocks - Return an array of BlockInfo objects for the basic blocks
+ /// where CurLI has uses.
+ ArrayRef<BlockInfo> getUseBlocks() const { return UseBlocks; }
+
+ /// getNumThroughBlocks - Return the number of through blocks.
+ unsigned getNumThroughBlocks() const { return NumThroughBlocks; }
+
+ /// isThroughBlock - Return true if CurLI is live through MBB without uses.
+ bool isThroughBlock(unsigned MBB) const { return ThroughBlocks.test(MBB); }
+
+ /// getThroughBlocks - Return the set of through blocks.
+ const BitVector &getThroughBlocks() const { return ThroughBlocks; }
+
+ /// getNumLiveBlocks - Return the number of blocks where CurLI is live.
+ unsigned getNumLiveBlocks() const {
+ return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks();
+ }
+
+ /// countLiveBlocks - Return the number of blocks where li is live. This is
+ /// guaranteed to return the same number as getNumLiveBlocks() after calling
+ /// analyze(li).
+ unsigned countLiveBlocks(const LiveInterval *li) const;
+
+ using BlockPtrSet = SmallPtrSet<const MachineBasicBlock *, 16>;
+
+ /// shouldSplitSingleBlock - Returns true if it would help to create a local
+ /// live range for the instructions in BI. There is normally no benefit to
+ /// creating a live range for a single instruction, but it does enable
+ /// register class inflation if the instruction has a restricted register
+ /// class.
+ ///
+ /// @param BI The block to be isolated.
+ /// @param SingleInstrs True when single instructions should be isolated.
+ bool shouldSplitSingleBlock(const BlockInfo &BI, bool SingleInstrs) const;
+
+ SlotIndex getLastSplitPoint(unsigned Num) {
+ return IPA.getLastInsertPoint(*CurLI, *MF.getBlockNumbered(Num));
+ }
+
+ SlotIndex getLastSplitPoint(MachineBasicBlock *BB) {
+ return IPA.getLastInsertPoint(*CurLI, *BB);
+ }
+
+ MachineBasicBlock::iterator getLastSplitPointIter(MachineBasicBlock *BB) {
+ return IPA.getLastInsertPointIter(*CurLI, *BB);
+ }
+
+ SlotIndex getFirstSplitPoint(unsigned Num) {
+ return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num));
+ }
+};
+
+/// SplitEditor - Edit machine code and LiveIntervals for live range
+/// splitting.
+///
+/// - Create a SplitEditor from a SplitAnalysis.
+/// - Start a new live interval with openIntv.
+/// - Mark the places where the new interval is entered using enterIntv*
+/// - Mark the ranges where the new interval is used with useIntv*
+/// - Mark the places where the interval is exited with leaveIntv*.
+/// - Repeat from the openIntv step for each additional interval.
+/// - Rewrite instructions with finish().
+///
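+/// A hypothetical end-to-end sketch of this protocol (the SplitAnalysis,
+/// LiveRangeEdit, and SlotIndex values below are placeholders supplied by the
+/// caller, not part of this header):
+///
+/// \code
+///   SplitEditor SE(SA, LIS, VRM, MDT, MBFI, VRAI);
+///   SE.reset(LREdit);                    // LREdit: LiveRangeEdit for the parent
+///   SE.openIntv();                       // interval 1; 0 is the complement
+///   SlotIndex Enter = SE.enterIntvBefore(FirstUseIdx);
+///   SlotIndex Leave = SE.leaveIntvAfter(LastUseIdx);
+///   SE.useIntv(Enter, Leave);
+///   SE.finish();                         // build complement, rewrite uses
+/// \endcode
+///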
+class LLVM_LIBRARY_VISIBILITY SplitEditor {
+ SplitAnalysis &SA;
+ LiveIntervals &LIS;
+ VirtRegMap &VRM;
+ MachineRegisterInfo &MRI;
+ MachineDominatorTree &MDT;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const MachineBlockFrequencyInfo &MBFI;
+ VirtRegAuxInfo &VRAI;
+
+public:
+ /// ComplementSpillMode - Select how the complement live range should be
+ /// created. SplitEditor automatically creates interval 0 to contain
+ /// anything that isn't added to another interval. This complement interval
+ /// can get quite complicated, and it can sometimes be an advantage to allow
+ /// it to overlap the other intervals. If it is going to spill anyway, no
+ /// registers are wasted by keeping a value in two places at the same time.
+ enum ComplementSpillMode {
+    /// SM_Partition (default) - Try to create the complement interval so it
+ /// doesn't overlap any other intervals, and the original interval is
+ /// partitioned. This may require a large number of back copies and extra
+ /// PHI-defs. Only segments marked with overlapIntv will be overlapping.
+ SM_Partition,
+
+ /// SM_Size - Overlap intervals to minimize the number of inserted COPY
+ /// instructions. Copies to the complement interval are hoisted to their
+ /// common dominator, so only one COPY is required per value in the
+ /// complement interval. This also means that no extra PHI-defs need to be
+ /// inserted in the complement interval.
+ SM_Size,
+
+ /// SM_Speed - Overlap intervals to minimize the expected execution
+ /// frequency of the inserted copies. This is very similar to SM_Size, but
+ /// the complement interval may get some extra PHI-defs.
+ SM_Speed
+ };
+
+private:
+ /// Edit - The current parent register and new intervals created.
+ LiveRangeEdit *Edit = nullptr;
+
+ /// Index into Edit of the currently open interval.
+ /// The index 0 is used for the complement, so the first interval started by
+ /// openIntv will be 1.
+ unsigned OpenIdx = 0;
+
+ /// The current spill mode, selected by reset().
+ ComplementSpillMode SpillMode = SM_Partition;
+
+ using RegAssignMap = IntervalMap<SlotIndex, unsigned>;
+
+ /// Allocator for the interval map. This will eventually be shared with
+ /// SlotIndexes and LiveIntervals.
+ RegAssignMap::Allocator Allocator;
+
+ /// RegAssign - Map of the assigned register indexes.
+ /// Edit.get(RegAssign.lookup(Idx)) is the register that should be live at
+ /// Idx.
+ RegAssignMap RegAssign;
+
+ using ValueForcePair = PointerIntPair<VNInfo *, 1>;
+ using ValueMap = DenseMap<std::pair<unsigned, unsigned>, ValueForcePair>;
+
+ /// Values - keep track of the mapping from parent values to values in the new
+ /// intervals. Given a pair (RegIdx, ParentVNI->id), Values contains:
+ ///
+ /// 1. No entry - the value is not mapped to Edit.get(RegIdx).
+ /// 2. (Null, false) - the value is mapped to multiple values in
+ /// Edit.get(RegIdx). Each value is represented by a minimal live range at
+ /// its def. The full live range can be inferred exactly from the range
+ /// of RegIdx in RegAssign.
+ /// 3. (Null, true). As above, but the ranges in RegAssign are too large, and
+ /// the live range must be recomputed using ::extend().
+ /// 4. (VNI, false) The value is mapped to a single new value.
+ /// The new value has no live ranges anywhere.
+ ValueMap Values;
+
+ /// LICalc - Cache for computing live ranges and SSA update. Each instance
+ /// can only handle non-overlapping live ranges, so use a separate
+ /// LiveIntervalCalc instance for the complement interval when in spill mode.
+ LiveIntervalCalc LICalc[2];
+
+ /// getLICalc - Return the LICalc to use for RegIdx. In spill mode, the
+ /// complement interval can overlap the other intervals, so it gets its own
+ /// LICalc instance. When not in spill mode, all intervals can share one.
+ LiveIntervalCalc &getLICalc(unsigned RegIdx) {
+ return LICalc[SpillMode != SM_Partition && RegIdx != 0];
+ }
+
+ /// Add a segment to the interval LI for the value number VNI. If LI has
+ /// subranges, corresponding segments will be added to them as well, but
+  /// with newly created value numbers. If Original is true, a dead def will
+  /// only be added to a subrange of LI if the corresponding subrange of the
+ /// original interval has a def at this index. Otherwise, all subranges
+ /// of LI will be updated.
+ void addDeadDef(LiveInterval &LI, VNInfo *VNI, bool Original);
+
+ /// defValue - define a value in RegIdx from ParentVNI at Idx.
+ /// Idx does not have to be ParentVNI->def, but it must be contained within
+ /// ParentVNI's live range in ParentLI. The new value is added to the value
+ /// map. The value being defined may either come from rematerialization
+ /// (or an inserted copy), or it may be coming from the original interval.
+ /// The parameter Original should be true in the latter case, otherwise
+ /// it should be false.
+ /// Return the new LI value.
+ VNInfo *defValue(unsigned RegIdx, const VNInfo *ParentVNI, SlotIndex Idx,
+ bool Original);
+
+ /// forceRecompute - Force the live range of ParentVNI in RegIdx to be
+ /// recomputed by LiveRangeCalc::extend regardless of the number of defs.
+ /// This is used for values whose live range doesn't match RegAssign exactly.
+ /// They could have rematerialized, or back-copies may have been moved.
+ void forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI);
+
+ /// Calls forceRecompute() on any affected regidx and on ParentVNI
+ /// predecessors in case of a phi definition.
+ void forceRecomputeVNI(const VNInfo &ParentVNI);
+
+ /// defFromParent - Define Reg from ParentVNI at UseIdx using either
+ /// rematerialization or a COPY from parent. Return the new value.
+ VNInfo *defFromParent(unsigned RegIdx, const VNInfo *ParentVNI,
+ SlotIndex UseIdx, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I);
+
+  /// removeBackCopies - Remove the copy instructions that define the values
+ /// in the vector in the complement interval.
+ void removeBackCopies(SmallVectorImpl<VNInfo*> &Copies);
+
+  /// findShallowDominator - Returns the least busy dominator of MBB that is
+ /// also dominated by DefMBB. Busy is measured by loop depth.
+ MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB);
+
+ /// Find out all the backCopies dominated by others.
+ void computeRedundantBackCopies(DenseSet<unsigned> &NotToHoistSet,
+ SmallVectorImpl<VNInfo *> &BackCopies);
+
+ /// Hoist back-copies to the complement interval. It tries to hoist all
+ /// the back-copies to one BB if it is beneficial, or else simply remove
+ /// redundant backcopies dominated by others.
+ void hoistCopies();
+
+ /// transferValues - Transfer values to the new ranges.
+ /// Return true if any ranges were skipped.
+ bool transferValues();
+
+ /// Live range @p LR corresponding to the lane Mask @p LM has a live
+ /// PHI def at the beginning of block @p B. Extend the range @p LR of
+ /// all predecessor values that reach this def. If @p LR is a subrange,
+ /// the array @p Undefs is the set of all locations where it is undefined
+ /// via <def,read-undef> in other subranges for the same register.
+ void extendPHIRange(MachineBasicBlock &B, LiveIntervalCalc &LIC,
+ LiveRange &LR, LaneBitmask LM,
+ ArrayRef<SlotIndex> Undefs);
+
+ /// extendPHIKillRanges - Extend the ranges of all values killed by original
+ /// parent PHIDefs.
+ void extendPHIKillRanges();
+
+ /// rewriteAssigned - Rewrite all uses of Edit.getReg() to assigned registers.
+ void rewriteAssigned(bool ExtendRanges);
+
+ /// deleteRematVictims - Delete defs that are dead after rematerializing.
+ void deleteRematVictims();
+
+ /// Add a copy instruction copying \p FromReg to \p ToReg before
+ /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it
+ /// necessary to construct a sequence of copies to cover it exactly.
+ SlotIndex buildCopy(Register FromReg, Register ToReg, LaneBitmask LaneMask,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ bool Late, unsigned RegIdx);
+
+ SlotIndex buildSingleSubRegCopy(Register FromReg, Register ToReg,
+ MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
+ unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def);
+
+public:
+ /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+ /// Newly created intervals will be appended to newIntervals.
+ SplitEditor(SplitAnalysis &SA, LiveIntervals &LIS, VirtRegMap &VRM,
+ MachineDominatorTree &MDT, MachineBlockFrequencyInfo &MBFI,
+ VirtRegAuxInfo &VRAI);
+
+ /// reset - Prepare for a new split.
+ void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);
+
+ /// Create a new virtual register and live interval.
+ /// Return the interval index, starting from 1. Interval index 0 is the
+ /// implicit complement interval.
+ unsigned openIntv();
+
+ /// currentIntv - Return the current interval index.
+ unsigned currentIntv() const { return OpenIdx; }
+
+ /// selectIntv - Select a previously opened interval index.
+ void selectIntv(unsigned Idx);
+
+ /// enterIntvBefore - Enter the open interval before the instruction at Idx.
+ /// If the parent interval is not live before Idx, a COPY is not inserted.
+ /// Return the beginning of the new live range.
+ SlotIndex enterIntvBefore(SlotIndex Idx);
+
+ /// enterIntvAfter - Enter the open interval after the instruction at Idx.
+ /// Return the beginning of the new live range.
+ SlotIndex enterIntvAfter(SlotIndex Idx);
+
+ /// enterIntvAtEnd - Enter the open interval at the end of MBB.
+ /// Use the open interval from the inserted copy to the MBB end.
+ /// Return the beginning of the new live range.
+ SlotIndex enterIntvAtEnd(MachineBasicBlock &MBB);
+
+ /// useIntv - indicate that all instructions in MBB should use OpenLI.
+ void useIntv(const MachineBasicBlock &MBB);
+
+ /// useIntv - indicate that all instructions in range should use OpenLI.
+ void useIntv(SlotIndex Start, SlotIndex End);
+
+ /// leaveIntvAfter - Leave the open interval after the instruction at Idx.
+ /// Return the end of the live range.
+ SlotIndex leaveIntvAfter(SlotIndex Idx);
+
+ /// leaveIntvBefore - Leave the open interval before the instruction at Idx.
+ /// Return the end of the live range.
+ SlotIndex leaveIntvBefore(SlotIndex Idx);
+
+ /// leaveIntvAtTop - Leave the interval at the top of MBB.
+ /// Add liveness from the MBB top to the copy.
+ /// Return the end of the live range.
+ SlotIndex leaveIntvAtTop(MachineBasicBlock &MBB);
+
+  /// overlapIntv - Indicate that all instructions in range should use the open
+  /// interval, unless End has a tied-def use of the register, in which case
+  /// the complement interval is used instead. The complement interval is left
+  /// live over the range as well.
+ ///
+ /// This doubles the register pressure, but is sometimes required to deal with
+ /// register uses after the last valid split point.
+ ///
+ /// The Start index should be a return value from a leaveIntv* call, and End
+ /// should be in the same basic block. The parent interval must have the same
+ /// value across the range.
+ ///
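+  /// A hypothetical sketch of the intended call pattern (the SlotIndex values
+  /// here are placeholders; compare the use in splitRegInBlock):
+  /// \code
+  ///   SlotIndex Start = leaveIntvBefore(LastSplitPoint);
+  ///   overlapIntv(Start, LastUseIdx);
+  /// \endcode
+  ///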
+ void overlapIntv(SlotIndex Start, SlotIndex End);
+
+ /// finish - after all the new live ranges have been created, compute the
+ /// remaining live range, and rewrite instructions to use the new registers.
+ /// @param LRMap When not null, this vector will map each live range in Edit
+ /// back to the indices returned by openIntv.
+ /// There may be extra indices created by dead code elimination.
+ void finish(SmallVectorImpl<unsigned> *LRMap = nullptr);
+
+ /// dump - print the current interval mapping to dbgs().
+ void dump() const;
+
+ // ===--- High level methods ---===
+
+ /// splitSingleBlock - Split CurLI into a separate live interval around the
+ /// uses in a single block. This is intended to be used as part of a larger
+ /// split, and doesn't call finish().
+ void splitSingleBlock(const SplitAnalysis::BlockInfo &BI);
+
+ /// splitLiveThroughBlock - Split CurLI in the given block such that it
+ /// enters the block in IntvIn and leaves it in IntvOut. There may be uses in
+ /// the block, but they will be ignored when placing split points.
+ ///
+ /// @param MBBNum Block number.
+ /// @param IntvIn Interval index entering the block.
+ /// @param LeaveBefore When set, leave IntvIn before this point.
+ /// @param IntvOut Interval index leaving the block.
+ /// @param EnterAfter When set, enter IntvOut after this point.
+ void splitLiveThroughBlock(unsigned MBBNum,
+ unsigned IntvIn, SlotIndex LeaveBefore,
+ unsigned IntvOut, SlotIndex EnterAfter);
+
+ /// splitRegInBlock - Split CurLI in the given block such that it enters the
+ /// block in IntvIn and leaves it on the stack (or not at all). Split points
+ /// are placed in a way that avoids putting uses in the stack interval. This
+ /// may require creating a local interval when there is interference.
+ ///
+ /// @param BI Block descriptor.
+ /// @param IntvIn Interval index entering the block. Not 0.
+ /// @param LeaveBefore When set, leave IntvIn before this point.
+ void splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
+ unsigned IntvIn, SlotIndex LeaveBefore);
+
+ /// splitRegOutBlock - Split CurLI in the given block such that it enters the
+ /// block on the stack (or isn't live-in at all) and leaves it in IntvOut.
+ /// Split points are placed to avoid interference and such that the uses are
+ /// not in the stack interval. This may require creating a local interval
+ /// when there is interference.
+ ///
+ /// @param BI Block descriptor.
+ /// @param IntvOut Interval index leaving the block.
+ /// @param EnterAfter When set, enter IntvOut after this point.
+ void splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
+ unsigned IntvOut, SlotIndex EnterAfter);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SPLITKIT_H
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm-project/llvm/lib/CodeGen/StackColoring.cpp
new file mode 100644
index 000000000000..66b9086e1d88
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/StackColoring.cpp
@@ -0,0 +1,1379 @@
+//===- StackColoring.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the stack-coloring optimization that looks for
+// lifetime marker machine instructions (LIFETIME_START and LIFETIME_END),
+// which represent the possible lifetime of stack slots. It attempts to
+// merge disjoint stack slots and reduce the used stack space.
+// NOTE: This pass is not StackSlotColoring, which optimizes spill slots.
+//
+// TODO: In the future we plan to improve stack coloring in the following ways:
+// 1. Allow merging multiple small slots into a single larger slot at different
+// offsets.
+// 2. Merge this pass with StackSlotColoring and allow merging of allocas with
+// spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stack-coloring"
+
+static cl::opt<bool>
+DisableColoring("no-stack-coloring",
+ cl::init(false), cl::Hidden,
+ cl::desc("Disable stack coloring"));
+
+/// The user may write code that uses allocas outside of the declared lifetime
+/// zone. This can happen when the user returns a reference to a local
+/// data-structure. We can detect these cases and decide not to optimize the
+/// code. If this flag is enabled, we try to save the user. This option
+/// is treated as overriding LifetimeStartOnFirstUse below.
+static cl::opt<bool>
+ProtectFromEscapedAllocas("protect-from-escaped-allocas",
+ cl::init(false), cl::Hidden,
+ cl::desc("Do not optimize lifetime zones that "
+ "are broken"));
+
+/// Enable enhanced dataflow scheme for lifetime analysis (treat first
+/// use of stack slot as start of slot lifetime, as opposed to looking
+/// for LIFETIME_START marker). See "Implementation notes" below for
+/// more info.
+static cl::opt<bool>
+LifetimeStartOnFirstUse("stackcoloring-lifetime-start-on-first-use",
+ cl::init(true), cl::Hidden,
+ cl::desc("Treat stack lifetimes as starting on first use, not on START marker."));
+
+
+STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
+STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
+STATISTIC(StackSlotMerged, "Number of stack slots merged.");
+STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region.");
+
+//===----------------------------------------------------------------------===//
+// StackColoring Pass
+//===----------------------------------------------------------------------===//
+//
+// Stack Coloring reduces stack usage by merging stack slots when they
+// can't be used together. For example, consider the following C program:
+//
+// void bar(char *, int);
+// void foo(bool var) {
+// A: {
+// char z[4096];
+// bar(z, 0);
+// }
+//
+// char *p;
+// char x[4096];
+// char y[4096];
+// if (var) {
+// p = x;
+// } else {
+// bar(y, 1);
+// p = y + 1024;
+// }
+// B:
+// bar(p, 2);
+// }
+//
+// Naively-compiled, this program would use 12k of stack space. However, the
+// stack slot corresponding to `z` is always destroyed before either of the
+// stack slots for `x` or `y` are used, and then `x` is only used if `var`
+// is true, while `y` is only used if `var` is false. So at no time are two
+// of the stack slots used together, and therefore we can merge them,
+// compiling the function using only a single 4k alloca:
+//
+// void foo(bool var) { // equivalent
+// char x[4096];
+// char *p;
+// bar(x, 0);
+// if (var) {
+// p = x;
+// } else {
+// bar(x, 1);
+// p = x + 1024;
+// }
+// bar(p, 2);
+// }
+//
+// This is an important optimization if we want stack space to be under
+// control in large functions, both open-coded ones and ones created by
+// inlining.
+//
+// Implementation Notes:
+// ---------------------
+//
+// An important part of the above reasoning is that `z` can't be accessed
+// while the latter 2 calls to `bar` are running. This is justified because
+// `z`'s lifetime is over after we exit from block `A:`, so any further
+// accesses to it would be UB. The way we represent this information
+// in LLVM is by having frontends delimit blocks with `lifetime.start`
+// and `lifetime.end` intrinsics.
+//
+// The effect of these intrinsics seems to be as follows (maybe I should
+// specify this in the reference?):
+//
+// L1) at start, each stack-slot is marked as *out-of-scope*, unless no
+// lifetime intrinsic refers to that stack slot, in which case
+// it is marked as *in-scope*.
+// L2) on a `lifetime.start`, a stack slot is marked as *in-scope* and
+// the stack slot is overwritten with `undef`.
+// L3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*.
+// L4) on function exit, all stack slots are marked as *out-of-scope*.
+// L5) `lifetime.end` is a no-op when called on a slot that is already
+// *out-of-scope*.
+// L6) memory accesses to *out-of-scope* stack slots are UB.
+// L7) when a stack-slot is marked as *out-of-scope*, all pointers to it
+// are invalidated, unless the slot is "degenerate". This is used to
+// justify not marking slots as in-use until the pointer to them is
+// used, but feels a bit hacky in the presence of things like LICM. See
+// the "Degenerate Slots" section for more details.
+//
+// Now, let's ground stack coloring on these rules. We'll define a slot
+// as *in-use* at a (dynamic) point in execution if it either can be
+// written to at that point, or if it has a live and non-undef content
+// at that point.
+//
+// Obviously, slots that are never *in-use* together can be merged, and
+// in our example `foo`, the slots for `x`, `y` and `z` are never
+// in-use together (of course, sometimes slots that *are* in-use together
+// might still be mergable, but we don't care about that here).
+//
+// In this implementation, we successively merge pairs of slots that are
+// not *in-use* together. We could be smarter - for example, we could merge
+// a single large slot with 2 small slots, or we could construct the
+// interference graph and run a "smart" graph coloring algorithm, but with
+// that aside, how do we find out whether a pair of slots might be *in-use*
+// together?
+//
+// From our rules, we see that *out-of-scope* slots are never *in-use*,
+// and from (L7) we see that "non-degenerate" slots remain non-*in-use*
+// until their address is taken. Therefore, we can approximate slot activity
+// using dataflow.
+//
+// A subtle point: naively, we might try to figure out which pairs of
+// stack-slots interfere by propagating `S in-use` through the CFG for every
+// stack-slot `S`, and having `S` and `T` interfere if there is a CFG point in
+// which they are both *in-use*.
+//
+// That is sound, but overly conservative in some cases: in our (artificial)
+// example `foo`, either `x` or `y` might be in use at the label `B:`, but
+// as `x` is only in use if we came in from the `var` edge and `y` only
+// if we came from the `!var` edge, they still can't be in use together.
+// See PR32488 for an important real-life case.
+//
+// If we wanted to find all points of interference precisely, we could
+// propagate `S in-use` and `S&T in-use` predicates through the CFG. That
+// would be precise, but requires propagating `O(n^2)` dataflow facts.
+//
+// However, we aren't interested in the *set* of points of interference
+// between 2 stack slots, only *whether* there *is* such a point. So we
+// can rely on a little trick: for `S` and `T` to be in-use together,
+// one of them needs to become in-use while the other is in-use (or
+// they might both become in use simultaneously). We can check this
+// by also keeping track of the points at which a stack slot might *start*
+// being in-use.
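+//
+// As a minimal illustrative sketch of that check (these names are
+// placeholders, not the ones used by this pass): given each slot's live
+// interval and the points where it may start being in-use, two slots can
+// only interfere if some start point of one lies inside a live segment of
+// the other:
+//
+//   static bool slotsInterfere(const LiveInterval &S, ArrayRef<SlotIndex> SStarts,
+//                              const LiveInterval &T, ArrayRef<SlotIndex> TStarts) {
+//     for (SlotIndex Idx : SStarts)
+//       if (T.liveAt(Idx))    // S starts while T is in-use.
+//         return true;
+//     for (SlotIndex Idx : TStarts)
+//       if (S.liveAt(Idx))    // T starts while S is in-use.
+//         return true;
+//     return false;
+//   }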
+//
+// Exact first use:
+// ----------------
+//
+// Consider the following motivating example:
+//
+// int foo() {
+// char b1[1024], b2[1024];
+// if (...) {
+// char b3[1024];
+// <uses of b1, b3>;
+// return x;
+// } else {
+// char b4[1024], b5[1024];
+// <uses of b2, b4, b5>;
+// return y;
+// }
+// }
+//
+// In the code above, "b3" and "b4" are declared in distinct lexical
+// scopes, meaning that it is easy to prove that they can share the
+// same stack slot. Variables "b1" and "b2" are declared in the same
+// scope, meaning that from a lexical point of view, their lifetimes
+// overlap. From a control-flow point of view, however, the two
+// variables are accessed in disjoint regions of the CFG, thus it
+// should be possible for them to share the same stack slot. An ideal
+// stack allocation for the function above would look like:
+//
+// slot 0: b1, b2
+// slot 1: b3, b4
+// slot 2: b5
+//
+// Achieving this allocation is tricky, however, due to the way
+// lifetime markers are inserted. Here is a simplified view of the
+// control flow graph for the code above:
+//
+// +------ block 0 -------+
+// 0| LIFETIME_START b1, b2 |
+// 1| <test 'if' condition> |
+// +-----------------------+
+// ./ \.
+// +------ block 1 -------+ +------ block 2 -------+
+// 2| LIFETIME_START b3 | 5| LIFETIME_START b4, b5 |
+// 3| <uses of b1, b3> | 6| <uses of b2, b4, b5> |
+// 4| LIFETIME_END b3 | 7| LIFETIME_END b4, b5 |
+// +-----------------------+ +-----------------------+
+// \. /.
+// +------ block 3 -------+
+// 8| <cleanupcode> |
+// 9| LIFETIME_END b1, b2 |
+// 10| return |
+// +-----------------------+
+//
+// If we create live intervals for the variables above strictly based
+// on the lifetime markers, we'll get the set of intervals on the
+// left. If we ignore the lifetime start markers and instead treat a
+// variable's lifetime as beginning with the first reference to the
+// var, then we get the intervals on the right.
+//
+// LIFETIME_START First Use
+// b1: [0,9] [3,4] [8,9]
+// b2: [0,9] [6,9]
+// b3: [2,4] [3,4]
+// b4: [5,7] [6,7]
+// b5: [5,7] [6,7]
+//
+// For the intervals on the left, the best we can do is overlap two
+// variables (b3 and b4, for example); this gives us a stack size of
+// 4*1024 bytes, not ideal. When treating first-use as the start of a
+// lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
+// byte stack (better).
+//
+// Degenerate Slots:
+// -----------------
+//
+// Relying entirely on first-use of stack slots is problematic,
+// however, due to the fact that optimizations can sometimes migrate
+// uses of a variable outside of its lifetime start/end region. Here
+// is an example:
+//
+// int bar() {
+// char b1[1024], b2[1024];
+// if (...) {
+// <uses of b2>
+// return y;
+// } else {
+// <uses of b1>
+// while (...) {
+// char b3[1024];
+// <uses of b3>
+// }
+// }
+// }
+//
+// Before optimization, the control flow graph for the code above
+// might look like the following:
+//
+// +------ block 0 -------+
+// 0| LIFETIME_START b1, b2 |
+// 1| <test 'if' condition> |
+// +-----------------------+
+// ./ \.
+// +------ block 1 -------+ +------- block 2 -------+
+// 2| <uses of b2> | 3| <uses of b1> |
+// +-----------------------+ +-----------------------+
+// | |
+// | +------- block 3 -------+ <-\.
+// | 4| <while condition> | |
+// | +-----------------------+ |
+// | / | |
+// | / +------- block 4 -------+
+// \ / 5| LIFETIME_START b3 | |
+// \ / 6| <uses of b3> | |
+// \ / 7| LIFETIME_END b3 | |
+// \ | +------------------------+ |
+// \ | \ /
+// +------ block 5 -----+ \---------------
+// 8| <cleanupcode> |
+// 9| LIFETIME_END b1, b2 |
+// 10| return |
+// +---------------------+
+//
+// During optimization, however, it can happen that an instruction
+// computing an address in "b3" (for example, a loop-invariant GEP) is
+// hoisted up out of the loop from block 4 to block 2. [Note that
+// this is not an actual load from the stack, only an instruction that
+// computes the address to be loaded]. If this happens, there is now a
+// path leading from the first use of b3 to the return instruction
+// that does not encounter the b3 LIFETIME_END, hence b3's lifetime is
+// now larger than if we were computing live intervals strictly based
+// on lifetime markers. In the example above, this lengthened lifetime
+// would mean that it would appear illegal to overlap b3 with b2.
+//
+// To deal with such cases, the code in ::collectMarkers() below
+// tries to identify "degenerate" slots -- those slots where on a single
+// forward pass through the CFG we encounter a first reference to slot
+// K before we hit the slot K lifetime start marker. For such slots,
+// we fall back on using the lifetime start marker as the beginning of
+// the variable's lifetime. NB: with this implementation, slots can
+// appear degenerate in cases where there is unstructured control flow:
+//
+// if (q) goto mid;
+// if (x > 9) {
+// int b[100];
+// memcpy(&b[0], ...);
+// mid: b[k] = ...;
+// abc(&b);
+// }
+//
+// If, in the RPO ordering chosen to walk the CFG, we happen to visit the b[k]
+// use before visiting the memcpy block (which contains the lifetime start
+// for "b"), then it will appear that 'b' has a degenerate lifetime.
+//
+// Handle Windows Exception with LifetimeStartOnFirstUse:
+// -------------------------------------------------------
+//
+// There was a bug when using LifetimeStartOnFirstUse on win32:
+// class Type1 {
+// ...
+// ~Type1(){ write memory;}
+// }
+// ...
+// try{
+// Type1 V
+// ...
+// } catch (Type2 X){
+// ...
+// }
+// For variable X in catch(X), we put the pointer slot pX=&(&X) into
+// ConservativeSlots to prevent using LifetimeStartOnFirstUse, because pX may
+// be merged with object V, whose destructor may run after pX has been
+// implicitly written. All of this happens inside the C++ EH runtime
+// (through CxxThrowException), so it cannot easily be checked at the IR level.
+//
+// The load of pX, which has no obvious corresponding write in the IR, is
+// usually the first load MI in the EH pad, for example:
+// bb.x.catch.i (landing-pad, ehfunclet-entry):
+// ; predecessors: %bb...
+// successors: %bb...
+// %n:gr32 = MOV32rm %stack.pX ...
+// ...
+// The Type2** %stack.pX will only be written in EH runtime libs, so we
+// check the StoreSlots to screen it out.
+
+namespace {
+
+/// StackColoring - A machine pass for merging disjoint stack allocations,
+/// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
+class StackColoring : public MachineFunctionPass {
+ MachineFrameInfo *MFI = nullptr;
+ MachineFunction *MF = nullptr;
+
+ /// A class representing liveness information for a single basic block.
+ /// Each bit in the BitVector represents the liveness property
+ /// for a different stack slot.
+ struct BlockLifetimeInfo {
+    /// Which slots BEGIN in each basic block.
+ BitVector Begin;
+
+    /// Which slots END in each basic block.
+ BitVector End;
+
+ /// Which slots are marked as LIVE_IN, coming into each basic block.
+ BitVector LiveIn;
+
+ /// Which slots are marked as LIVE_OUT, coming out of each basic block.
+ BitVector LiveOut;
+ };
+
+ /// Maps active slots (per bit) for each basic block.
+ using LivenessMap = DenseMap<const MachineBasicBlock *, BlockLifetimeInfo>;
+ LivenessMap BlockLiveness;
+
+  /// Maps basic blocks to their serial numbers.
+ DenseMap<const MachineBasicBlock *, int> BasicBlocks;
+
+  /// Maps serial numbers back to basic blocks.
+ SmallVector<const MachineBasicBlock *, 8> BasicBlockNumbering;
+
+  /// Maps slots to their use interval. Outside of this interval, slot
+ /// values are either dead or `undef` and they will not be written to.
+ SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals;
+
+ /// Maps slots to the points where they can become in-use.
+ SmallVector<SmallVector<SlotIndex, 4>, 16> LiveStarts;
+
+ /// VNInfo is used for the construction of LiveIntervals.
+ VNInfo::Allocator VNInfoAllocator;
+
+ /// SlotIndex analysis object.
+ SlotIndexes *Indexes = nullptr;
+
+ /// The list of lifetime markers found. These markers are to be removed
+ /// once the coloring is done.
+ SmallVector<MachineInstr*, 8> Markers;
+
+ /// Record the FI slots for which we have seen some sort of
+ /// lifetime marker (either start or end).
+ BitVector InterestingSlots;
+
+ /// FI slots that need to be handled conservatively (for these
+ /// slots lifetime-start-on-first-use is disabled).
+ BitVector ConservativeSlots;
+
+ /// Record the FI slots referenced by a 'may write to memory'.
+ BitVector StoreSlots;
+
+ /// Number of iterations taken during data flow analysis.
+ unsigned NumIterations;
+
+public:
+ static char ID;
+
+ StackColoring() : MachineFunctionPass(ID) {
+ initializeStackColoringPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &Func) override;
+
+private:
+ /// Used in collectMarkers
+ using BlockBitVecMap = DenseMap<const MachineBasicBlock *, BitVector>;
+
+ /// Debug.
+ void dump() const;
+ void dumpIntervals() const;
+ void dumpBB(MachineBasicBlock *MBB) const;
+ void dumpBV(const char *tag, const BitVector &BV) const;
+
+ /// Removes all of the lifetime marker instructions from the function.
+ /// \returns true if any markers were removed.
+ bool removeAllMarkers();
+
+ /// Scan the machine function and find all of the lifetime markers.
+ /// Record the findings in the BEGIN and END vectors.
+ /// \returns the number of markers found.
+ unsigned collectMarkers(unsigned NumSlot);
+
+ /// Perform the dataflow calculation and calculate the lifetime for each of
+  /// the slots, based on the BEGIN/END vectors. Set the LiveIn and LiveOut
+  /// bit vectors that represent which stack slots are live coming into and
+  /// out of blocks.
+ void calculateLocalLiveness();
+
+ /// Returns TRUE if we're using the first-use-begins-lifetime method for
+ /// this slot (if FALSE, then the start marker is treated as start of lifetime).
+ bool applyFirstUse(int Slot) {
+ if (!LifetimeStartOnFirstUse || ProtectFromEscapedAllocas)
+ return false;
+ if (ConservativeSlots.test(Slot))
+ return false;
+ return true;
+ }
+
+ /// Examines the specified instruction and returns TRUE if the instruction
+ /// represents the start or end of an interesting lifetime. The slot or slots
+ /// starting or ending are added to the vector "slots" and "isStart" is set
+ /// accordingly.
+ /// \returns True if inst contains a lifetime start or end
+ bool isLifetimeStartOrEnd(const MachineInstr &MI,
+ SmallVector<int, 4> &slots,
+ bool &isStart);
+
+ /// Construct the LiveIntervals for the slots.
+ void calculateLiveIntervals(unsigned NumSlots);
+
+ /// Go over the machine function and change instructions which use stack
+ /// slots to use the joint slots.
+ void remapInstructions(DenseMap<int, int> &SlotRemap);
+
+ /// The input program may contain instructions which are not inside lifetime
+ /// markers. This can happen due to a bug in the compiler or due to a bug in
+ /// user code (for example, returning a reference to a local variable).
+ /// This procedure checks all of the instructions in the function and
+ /// invalidates lifetime ranges which do not contain all of the instructions
+ /// which access that frame slot.
+ void removeInvalidSlotRanges();
+
+ /// Map entries which point to other entries to their destination.
+ /// A->B->C becomes A->C.
+ void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);
+};
+
+} // end anonymous namespace
+
+char StackColoring::ID = 0;
+
+char &llvm::StackColoringID = StackColoring::ID;
+
+INITIALIZE_PASS_BEGIN(StackColoring, DEBUG_TYPE,
+ "Merge disjoint stack slots", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE,
+ "Merge disjoint stack slots", false, false)
+
+void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,
+ const BitVector &BV) const {
+ dbgs() << tag << " : { ";
+ for (unsigned I = 0, E = BV.size(); I != E; ++I)
+ dbgs() << BV.test(I) << " ";
+ dbgs() << "}\n";
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
+ LivenessMap::const_iterator BI = BlockLiveness.find(MBB);
+ assert(BI != BlockLiveness.end() && "Block not found");
+ const BlockLifetimeInfo &BlockInfo = BI->second;
+
+ dumpBV("BEGIN", BlockInfo.Begin);
+ dumpBV("END", BlockInfo.End);
+ dumpBV("LIVE_IN", BlockInfo.LiveIn);
+ dumpBV("LIVE_OUT", BlockInfo.LiveOut);
+}
+
+LLVM_DUMP_METHOD void StackColoring::dump() const {
+ for (MachineBasicBlock *MBB : depth_first(MF)) {
+ dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
+ << MBB->getName() << "]\n";
+ dumpBB(MBB);
+ }
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
+ for (unsigned I = 0, E = Intervals.size(); I != E; ++I) {
+ dbgs() << "Interval[" << I << "]:\n";
+ Intervals[I]->dump();
+ }
+}
+#endif
+
+static inline int getStartOrEndSlot(const MachineInstr &MI)
+{
+ assert((MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) &&
+ "Expected LIFETIME_START or LIFETIME_END op");
+ const MachineOperand &MO = MI.getOperand(0);
+ int Slot = MO.getIndex();
+ if (Slot >= 0)
+ return Slot;
+ return -1;
+}
+
+// At the moment the only way to end a variable lifetime is with
+// a LIFETIME_END op (which can't also contain a start). If things
+// change and the IR allows for a single inst that both begins
+// and ends lifetime(s), this interface will need to be reworked.
+bool StackColoring::isLifetimeStartOrEnd(const MachineInstr &MI,
+ SmallVector<int, 4> &slots,
+ bool &isStart) {
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ int Slot = getStartOrEndSlot(MI);
+ if (Slot < 0)
+ return false;
+ if (!InterestingSlots.test(Slot))
+ return false;
+ slots.push_back(Slot);
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ isStart = false;
+ return true;
+ }
+ if (!applyFirstUse(Slot)) {
+ isStart = true;
+ return true;
+ }
+ } else if (LifetimeStartOnFirstUse && !ProtectFromEscapedAllocas) {
+ if (!MI.isDebugInstr()) {
+ bool found = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+ int Slot = MO.getIndex();
+ if (Slot<0)
+ continue;
+ if (InterestingSlots.test(Slot) && applyFirstUse(Slot)) {
+ slots.push_back(Slot);
+ found = true;
+ }
+ }
+ if (found) {
+ isStart = true;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+unsigned StackColoring::collectMarkers(unsigned NumSlot) {
+ unsigned MarkersFound = 0;
+ BlockBitVecMap SeenStartMap;
+ InterestingSlots.clear();
+ InterestingSlots.resize(NumSlot);
+ ConservativeSlots.clear();
+ ConservativeSlots.resize(NumSlot);
+ StoreSlots.clear();
+ StoreSlots.resize(NumSlot);
+
+ // number of start and end lifetime ops for each slot
+ SmallVector<int, 8> NumStartLifetimes(NumSlot, 0);
+ SmallVector<int, 8> NumEndLifetimes(NumSlot, 0);
+ SmallVector<int, 8> NumLoadInCatchPad(NumSlot, 0);
+
+ // Step 1: collect markers and populate the "InterestingSlots"
+ // and "ConservativeSlots" sets.
+ for (MachineBasicBlock *MBB : depth_first(MF)) {
+ // Compute the set of slots for which we've seen a START marker but have
+ // not yet seen an END marker at this point in the walk (e.g. on entry
+ // to this bb).
+ BitVector BetweenStartEnd;
+ BetweenStartEnd.resize(NumSlot);
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ BlockBitVecMap::const_iterator I = SeenStartMap.find(Pred);
+ if (I != SeenStartMap.end()) {
+ BetweenStartEnd |= I->second;
+ }
+ }
+
+ // Walk the instructions in the block to look for start/end ops.
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ int Slot = getStartOrEndSlot(MI);
+ if (Slot < 0)
+ continue;
+ InterestingSlots.set(Slot);
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START) {
+ BetweenStartEnd.set(Slot);
+ NumStartLifetimes[Slot] += 1;
+ } else {
+ BetweenStartEnd.reset(Slot);
+ NumEndLifetimes[Slot] += 1;
+ }
+ const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
+ if (Allocation) {
+ LLVM_DEBUG(dbgs() << "Found a lifetime ");
+ LLVM_DEBUG(dbgs() << (MI.getOpcode() == TargetOpcode::LIFETIME_START
+ ? "start"
+ : "end"));
+ LLVM_DEBUG(dbgs() << " marker for slot #" << Slot);
+ LLVM_DEBUG(dbgs()
+ << " with allocation: " << Allocation->getName() << "\n");
+ }
+ Markers.push_back(&MI);
+ MarkersFound += 1;
+ } else {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+ int Slot = MO.getIndex();
+ if (Slot < 0)
+ continue;
+ if (! BetweenStartEnd.test(Slot)) {
+ ConservativeSlots.set(Slot);
+ }
+          // Here we check StoreSlots to screen out the catch pointer. For more
+          // information, please refer to "Handle Windows Exception with
+ // LifetimeStartOnFirstUse" at the head of this file.
+ if (MI.mayStore())
+ StoreSlots.set(Slot);
+ if (MF->getWinEHFuncInfo() && MBB->isEHPad() && MI.mayLoad())
+ NumLoadInCatchPad[Slot] += 1;
+ }
+ }
+ }
+ BitVector &SeenStart = SeenStartMap[MBB];
+ SeenStart |= BetweenStartEnd;
+ }
+ if (!MarkersFound) {
+ return 0;
+ }
+
+ // 1) PR27903: slots with multiple start or end lifetime ops are not
+ // safe to enable for "lifetime-start-on-first-use".
+ // 2) And also not safe for variable X in catch(X) in windows.
+ for (unsigned slot = 0; slot < NumSlot; ++slot) {
+ if (NumStartLifetimes[slot] > 1 || NumEndLifetimes[slot] > 1 ||
+ (NumLoadInCatchPad[slot] > 1 && !StoreSlots.test(slot)))
+ ConservativeSlots.set(slot);
+ }
+ LLVM_DEBUG(dumpBV("Conservative slots", ConservativeSlots));
+
+ // Step 2: compute begin/end sets for each block
+
+ // NOTE: We use a depth-first iteration to ensure that we obtain a
+ // deterministic numbering.
+ for (MachineBasicBlock *MBB : depth_first(MF)) {
+ // Assign a serial number to this basic block.
+ BasicBlocks[MBB] = BasicBlockNumbering.size();
+ BasicBlockNumbering.push_back(MBB);
+
+ // Keep a reference to avoid repeated lookups.
+ BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB];
+
+ BlockInfo.Begin.resize(NumSlot);
+ BlockInfo.End.resize(NumSlot);
+
+ SmallVector<int, 4> slots;
+ for (MachineInstr &MI : *MBB) {
+ bool isStart = false;
+ slots.clear();
+ if (isLifetimeStartOrEnd(MI, slots, isStart)) {
+ if (!isStart) {
+ assert(slots.size() == 1 && "unexpected: MI ends multiple slots");
+ int Slot = slots[0];
+ if (BlockInfo.Begin.test(Slot)) {
+ BlockInfo.Begin.reset(Slot);
+ }
+ BlockInfo.End.set(Slot);
+ } else {
+ for (auto Slot : slots) {
+ LLVM_DEBUG(dbgs() << "Found a use of slot #" << Slot);
+ LLVM_DEBUG(dbgs()
+ << " at " << printMBBReference(*MBB) << " index ");
+ LLVM_DEBUG(Indexes->getInstructionIndex(MI).print(dbgs()));
+ const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
+ if (Allocation) {
+ LLVM_DEBUG(dbgs()
+ << " with allocation: " << Allocation->getName());
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ if (BlockInfo.End.test(Slot)) {
+ BlockInfo.End.reset(Slot);
+ }
+ BlockInfo.Begin.set(Slot);
+ }
+ }
+ }
+ }
+ }
+
+ // Update statistics.
+ NumMarkerSeen += MarkersFound;
+ return MarkersFound;
+}
+
+void StackColoring::calculateLocalLiveness() {
+ unsigned NumIters = 0;
+ bool changed = true;
+ while (changed) {
+ changed = false;
+ ++NumIters;
+
+ for (const MachineBasicBlock *BB : BasicBlockNumbering) {
+ // Use an iterator to avoid repeated lookups.
+ LivenessMap::iterator BI = BlockLiveness.find(BB);
+ assert(BI != BlockLiveness.end() && "Block not found");
+ BlockLifetimeInfo &BlockInfo = BI->second;
+
+ // Compute LiveIn by unioning together the LiveOut sets of all preds.
+ BitVector LocalLiveIn;
+ for (MachineBasicBlock *Pred : BB->predecessors()) {
+ LivenessMap::const_iterator I = BlockLiveness.find(Pred);
+ // PR37130: transformations prior to stack coloring can
+ // sometimes leave behind statically unreachable blocks; these
+ // can be safely skipped here.
+ if (I != BlockLiveness.end())
+ LocalLiveIn |= I->second.LiveOut;
+ }
+
+ // Compute LiveOut by subtracting out lifetimes that end in this
+ // block, then adding in lifetimes that begin in this block. If
+ // we have both BEGIN and END markers in the same basic block
+ // then we know that the BEGIN marker comes after the END,
+ // because we already handle the case where the BEGIN comes
+ // before the END when collecting the markers (and building the
+ // BEGIN/END vectors).
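+      // In dataflow terms:
+      //   LiveIn(BB)  = union of LiveOut(Pred) over all predecessors of BB
+      //   LiveOut(BB) = (LiveIn(BB) - End(BB)) | Begin(BB)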
+ BitVector LocalLiveOut = LocalLiveIn;
+ LocalLiveOut.reset(BlockInfo.End);
+ LocalLiveOut |= BlockInfo.Begin;
+
+ // Update block LiveIn set, noting whether it has changed.
+ if (LocalLiveIn.test(BlockInfo.LiveIn)) {
+ changed = true;
+ BlockInfo.LiveIn |= LocalLiveIn;
+ }
+
+ // Update block LiveOut set, noting whether it has changed.
+ if (LocalLiveOut.test(BlockInfo.LiveOut)) {
+ changed = true;
+ BlockInfo.LiveOut |= LocalLiveOut;
+ }
+ }
+ } // while changed.
+
+ NumIterations = NumIters;
+}
+
+void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
+ SmallVector<SlotIndex, 16> Starts;
+ SmallVector<bool, 16> DefinitelyInUse;
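+  // Starts[Slot] holds the index at which the currently open segment for Slot
+  // began (invalid when no segment is open); DefinitelyInUse[Slot] records
+  // whether a start index for Slot has already been pushed onto LiveStarts
+  // within the current block.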
+
+ // For each block, find which slots are active within this block
+ // and update the live intervals.
+ for (const MachineBasicBlock &MBB : *MF) {
+ Starts.clear();
+ Starts.resize(NumSlots);
+ DefinitelyInUse.clear();
+ DefinitelyInUse.resize(NumSlots);
+
+ // Start the interval of the slots that we previously found to be 'in-use'.
+ BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
+ for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
+ pos = MBBLiveness.LiveIn.find_next(pos)) {
+ Starts[pos] = Indexes->getMBBStartIdx(&MBB);
+ }
+
+ // Create the interval for the basic blocks containing lifetime begin/end.
+ for (const MachineInstr &MI : MBB) {
+ SmallVector<int, 4> slots;
+ bool IsStart = false;
+ if (!isLifetimeStartOrEnd(MI, slots, IsStart))
+ continue;
+ SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
+ for (auto Slot : slots) {
+ if (IsStart) {
+          // If a slot is already definitely in use, we don't have to record
+          // a new start index because one already exists.
+ if (!DefinitelyInUse[Slot]) {
+ LiveStarts[Slot].push_back(ThisIndex);
+ DefinitelyInUse[Slot] = true;
+ }
+ if (!Starts[Slot].isValid())
+ Starts[Slot] = ThisIndex;
+ } else {
+ if (Starts[Slot].isValid()) {
+ VNInfo *VNI = Intervals[Slot]->getValNumInfo(0);
+ Intervals[Slot]->addSegment(
+ LiveInterval::Segment(Starts[Slot], ThisIndex, VNI));
+ Starts[Slot] = SlotIndex(); // Invalidate the start index
+ DefinitelyInUse[Slot] = false;
+ }
+ }
+ }
+ }
+
+ // Finish up started segments
+ for (unsigned i = 0; i < NumSlots; ++i) {
+ if (!Starts[i].isValid())
+ continue;
+
+ SlotIndex EndIdx = Indexes->getMBBEndIdx(&MBB);
+ VNInfo *VNI = Intervals[i]->getValNumInfo(0);
+ Intervals[i]->addSegment(LiveInterval::Segment(Starts[i], EndIdx, VNI));
+ }
+ }
+}
+
+bool StackColoring::removeAllMarkers() {
+ unsigned Count = 0;
+ for (MachineInstr *MI : Markers) {
+ MI->eraseFromParent();
+ Count++;
+ }
+ Markers.clear();
+
+ LLVM_DEBUG(dbgs() << "Removed " << Count << " markers.\n");
+ return Count;
+}
+
+void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
+ unsigned FixedInstr = 0;
+ unsigned FixedMemOp = 0;
+ unsigned FixedDbg = 0;
+
+ // Remap debug information that refers to stack slots.
+ for (auto &VI : MF->getVariableDbgInfo()) {
+ if (!VI.Var || !VI.inStackSlot())
+ continue;
+ int Slot = VI.getStackSlot();
+ if (SlotRemap.count(Slot)) {
+ LLVM_DEBUG(dbgs() << "Remapping debug info for ["
+ << cast<DILocalVariable>(VI.Var)->getName() << "].\n");
+ VI.updateStackSlot(SlotRemap[Slot]);
+ FixedDbg++;
+ }
+ }
+
+ // Keep a list of *allocas* which need to be remapped.
+ DenseMap<const AllocaInst*, const AllocaInst*> Allocas;
+
+  // Keep a list of allocas which have been affected by the remap.
+ SmallPtrSet<const AllocaInst*, 32> MergedAllocas;
+
+ for (const std::pair<int, int> &SI : SlotRemap) {
+ const AllocaInst *From = MFI->getObjectAllocation(SI.first);
+ const AllocaInst *To = MFI->getObjectAllocation(SI.second);
+ assert(To && From && "Invalid allocation object");
+ Allocas[From] = To;
+
+    // If From comes before To, it's possible that there is a use of From
+    // between them; moving To before From ensures that To still dominates
+    // all of From's former uses after the replaceAllUsesWith below.
+ if (From->comesBefore(To))
+ const_cast<AllocaInst*>(To)->moveBefore(const_cast<AllocaInst*>(From));
+
+ // AA might be used later for instruction scheduling, and we need it to be
+    // able to deduce the correct aliasing relationships between pointers
+ // derived from the alloca being remapped and the target of that remapping.
+ // The only safe way, without directly informing AA about the remapping
+ // somehow, is to directly update the IR to reflect the change being made
+ // here.
+ Instruction *Inst = const_cast<AllocaInst *>(To);
+ if (From->getType() != To->getType()) {
+ BitCastInst *Cast = new BitCastInst(Inst, From->getType());
+ Cast->insertAfter(Inst);
+ Inst = Cast;
+ }
+
+ // We keep both slots to maintain AliasAnalysis metadata later.
+ MergedAllocas.insert(From);
+ MergedAllocas.insert(To);
+
+ // Transfer the stack protector layout tag, but make sure that SSPLK_AddrOf
+ // does not overwrite SSPLK_SmallArray or SSPLK_LargeArray, and make sure
+ // that SSPLK_SmallArray does not overwrite SSPLK_LargeArray.
+ MachineFrameInfo::SSPLayoutKind FromKind
+ = MFI->getObjectSSPLayout(SI.first);
+ MachineFrameInfo::SSPLayoutKind ToKind = MFI->getObjectSSPLayout(SI.second);
+ if (FromKind != MachineFrameInfo::SSPLK_None &&
+ (ToKind == MachineFrameInfo::SSPLK_None ||
+ (ToKind != MachineFrameInfo::SSPLK_LargeArray &&
+ FromKind != MachineFrameInfo::SSPLK_AddrOf)))
+ MFI->setObjectSSPLayout(SI.second, FromKind);
+
+ // The new alloca might not be valid in a llvm.dbg.declare for this
+ // variable, so undef out the use to make the verifier happy.
+ AllocaInst *FromAI = const_cast<AllocaInst *>(From);
+ if (FromAI->isUsedByMetadata())
+ ValueAsMetadata::handleRAUW(FromAI, UndefValue::get(FromAI->getType()));
+ for (auto &Use : FromAI->uses()) {
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Use.get()))
+ if (BCI->isUsedByMetadata())
+ ValueAsMetadata::handleRAUW(BCI, UndefValue::get(BCI->getType()));
+ }
+
+ // Note that this will not replace uses in MMOs (which we'll update below),
+ // or anywhere else (which is why we won't delete the original
+ // instruction).
+ FromAI->replaceAllUsesWith(Inst);
+ }
+
+ // Remap all instructions to the new stack slots.
+ std::vector<std::vector<MachineMemOperand *>> SSRefs(
+ MFI->getObjectIndexEnd());
+ for (MachineBasicBlock &BB : *MF)
+ for (MachineInstr &I : BB) {
+ // Skip lifetime markers. We'll remove them soon.
+ if (I.getOpcode() == TargetOpcode::LIFETIME_START ||
+ I.getOpcode() == TargetOpcode::LIFETIME_END)
+ continue;
+
+ // Update the MachineMemOperand to use the new alloca.
+ for (MachineMemOperand *MMO : I.memoperands()) {
+ // We've replaced IR-level uses of the remapped allocas, so we only
+ // need to replace direct uses here.
+ const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(MMO->getValue());
+ if (!AI)
+ continue;
+
+ if (!Allocas.count(AI))
+ continue;
+
+ MMO->setValue(Allocas[AI]);
+ FixedMemOp++;
+ }
+
+ // Update all of the machine instruction operands.
+ for (MachineOperand &MO : I.operands()) {
+ if (!MO.isFI())
+ continue;
+ int FromSlot = MO.getIndex();
+
+ // Don't touch arguments.
+ if (FromSlot<0)
+ continue;
+
+ // Only look at mapped slots.
+ if (!SlotRemap.count(FromSlot))
+ continue;
+
+ // In a debug build, check that the instruction that we are modifying is
+ // inside the expected live range. If the instruction is not inside
+ // the calculated range then it means that the alloca usage moved
+ // outside of the lifetime markers, or that the user has a bug.
+        // NOTE: Alloca address calculations which happen outside the lifetime
+        //       zone are okay, despite the fact that we don't have a good way
+        //       to validate all of the uses of the calculation.
+#ifndef NDEBUG
+ bool TouchesMemory = I.mayLoadOrStore();
+ // If we *don't* protect the user from escaped allocas, don't bother
+ // validating the instructions.
+ if (!I.isDebugInstr() && TouchesMemory && ProtectFromEscapedAllocas) {
+ SlotIndex Index = Indexes->getInstructionIndex(I);
+ const LiveInterval *Interval = &*Intervals[FromSlot];
+ assert(Interval->find(Index) != Interval->end() &&
+ "Found instruction usage outside of live range.");
+ }
+#endif
+
+ // Fix the machine instructions.
+ int ToSlot = SlotRemap[FromSlot];
+ MO.setIndex(ToSlot);
+ FixedInstr++;
+ }
+
+ // We adjust AliasAnalysis information for merged stack slots.
+ SmallVector<MachineMemOperand *, 2> NewMMOs;
+ bool ReplaceMemOps = false;
+ for (MachineMemOperand *MMO : I.memoperands()) {
+ // Collect MachineMemOperands which reference
+ // FixedStackPseudoSourceValues with old frame indices.
+ if (const auto *FSV = dyn_cast_or_null<FixedStackPseudoSourceValue>(
+ MMO->getPseudoValue())) {
+ int FI = FSV->getFrameIndex();
+ auto To = SlotRemap.find(FI);
+ if (To != SlotRemap.end())
+ SSRefs[FI].push_back(MMO);
+ }
+
+        // If this memory location may refer to a slot that is remapped here,
+        // we remove its AA information.
+ bool MayHaveConflictingAAMD = false;
+ if (MMO->getAAInfo()) {
+ if (const Value *MMOV = MMO->getValue()) {
+ SmallVector<Value *, 4> Objs;
+ getUnderlyingObjectsForCodeGen(MMOV, Objs);
+
+ if (Objs.empty())
+ MayHaveConflictingAAMD = true;
+ else
+ for (Value *V : Objs) {
+              // If this memory location comes from a known stack slot
+              // that is not remapped, we continue checking.
+              // Otherwise, we need to invalidate AA information.
+ const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V);
+ if (AI && MergedAllocas.count(AI)) {
+ MayHaveConflictingAAMD = true;
+ break;
+ }
+ }
+ }
+ }
+ if (MayHaveConflictingAAMD) {
+ NewMMOs.push_back(MF->getMachineMemOperand(MMO, AAMDNodes()));
+ ReplaceMemOps = true;
+ } else {
+ NewMMOs.push_back(MMO);
+ }
+ }
+
+ // If any memory operand is updated, set memory references of
+ // this instruction.
+ if (ReplaceMemOps)
+ I.setMemRefs(*MF, NewMMOs);
+ }
+
+ // Rewrite MachineMemOperands that reference old frame indices.
+ for (auto E : enumerate(SSRefs))
+ if (!E.value().empty()) {
+ const PseudoSourceValue *NewSV =
+ MF->getPSVManager().getFixedStack(SlotRemap.find(E.index())->second);
+ for (MachineMemOperand *Ref : E.value())
+ Ref->setValue(NewSV);
+ }
+
+ // Update the location of C++ catch objects for the MSVC personality routine.
+ if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo())
+ for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap)
+ for (WinEHHandlerType &H : TBME.HandlerArray)
+ if (H.CatchObj.FrameIndex != std::numeric_limits<int>::max() &&
+ SlotRemap.count(H.CatchObj.FrameIndex))
+ H.CatchObj.FrameIndex = SlotRemap[H.CatchObj.FrameIndex];
+
+ LLVM_DEBUG(dbgs() << "Fixed " << FixedMemOp << " machine memory operands.\n");
+ LLVM_DEBUG(dbgs() << "Fixed " << FixedDbg << " debug locations.\n");
+ LLVM_DEBUG(dbgs() << "Fixed " << FixedInstr << " machine instructions.\n");
+ (void) FixedMemOp;
+ (void) FixedDbg;
+ (void) FixedInstr;
+}
+
+void StackColoring::removeInvalidSlotRanges() {
+ for (MachineBasicBlock &BB : *MF)
+ for (MachineInstr &I : BB) {
+ if (I.getOpcode() == TargetOpcode::LIFETIME_START ||
+ I.getOpcode() == TargetOpcode::LIFETIME_END || I.isDebugInstr())
+ continue;
+
+      // Some intervals are suspicious! In some cases we find address
+      // calculations outside of the lifetime zone, but no actual memory
+      // read or write. Memory accesses outside of the lifetime zone are a
+      // clear violation, but address calculations are okay. This can happen
+      // when GEPs are hoisted outside of the lifetime zone.
+      // So here we only check instructions which can read or write memory.
+ if (!I.mayLoad() && !I.mayStore())
+ continue;
+
+ // Check all of the machine operands.
+ for (const MachineOperand &MO : I.operands()) {
+ if (!MO.isFI())
+ continue;
+
+ int Slot = MO.getIndex();
+
+ if (Slot<0)
+ continue;
+
+ if (Intervals[Slot]->empty())
+ continue;
+
+ // Check that the used slot is inside the calculated lifetime range.
+ // If it is not, warn about it and invalidate the range.
+ LiveInterval *Interval = &*Intervals[Slot];
+ SlotIndex Index = Indexes->getInstructionIndex(I);
+ if (Interval->find(Index) == Interval->end()) {
+ Interval->clear();
+ LLVM_DEBUG(dbgs() << "Invalidating range #" << Slot << "\n");
+ EscapedAllocas++;
+ }
+ }
+ }
+}
+
+void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
+ unsigned NumSlots) {
+ // Expunge slot remap map.
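+  // For example, the chain {1 -> 2, 2 -> 3} is collapsed to {1 -> 3, 2 -> 3},
+  // so that every remapped slot points directly at a slot that is not itself
+  // remapped.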
+ for (unsigned i=0; i < NumSlots; ++i) {
+    // If we are remapping slot i, follow the remap chain to its final target.
+ if (SlotRemap.count(i)) {
+ int Target = SlotRemap[i];
+ // As long as our target is mapped to something else, follow it.
+ while (SlotRemap.count(Target)) {
+ Target = SlotRemap[Target];
+ SlotRemap[i] = Target;
+ }
+ }
+ }
+}
+
+bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
+ LLVM_DEBUG(dbgs() << "********** Stack Coloring **********\n"
+ << "********** Function: " << Func.getName() << '\n');
+ MF = &Func;
+ MFI = &MF->getFrameInfo();
+ Indexes = &getAnalysis<SlotIndexes>();
+ BlockLiveness.clear();
+ BasicBlocks.clear();
+ BasicBlockNumbering.clear();
+ Markers.clear();
+ Intervals.clear();
+ LiveStarts.clear();
+ VNInfoAllocator.Reset();
+
+ unsigned NumSlots = MFI->getObjectIndexEnd();
+
+ // If there are no stack slots then there are no markers to remove.
+ if (!NumSlots)
+ return false;
+
+ SmallVector<int, 8> SortedSlots;
+ SortedSlots.reserve(NumSlots);
+ Intervals.reserve(NumSlots);
+ LiveStarts.resize(NumSlots);
+
+ unsigned NumMarkers = collectMarkers(NumSlots);
+
+ unsigned TotalSize = 0;
+ LLVM_DEBUG(dbgs() << "Found " << NumMarkers << " markers and " << NumSlots
+ << " slots\n");
+ LLVM_DEBUG(dbgs() << "Slot structure:\n");
+
+ for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
+ LLVM_DEBUG(dbgs() << "Slot #" << i << " - " << MFI->getObjectSize(i)
+ << " bytes.\n");
+ TotalSize += MFI->getObjectSize(i);
+ }
+
+ LLVM_DEBUG(dbgs() << "Total Stack size: " << TotalSize << " bytes\n\n");
+
+  // Don't continue if there are not enough lifetime markers, the stack is too
+  // small, or we were told not to optimize the slots.
+ if (NumMarkers < 2 || TotalSize < 16 || DisableColoring ||
+ skipFunction(Func.getFunction())) {
+ LLVM_DEBUG(dbgs() << "Will not try to merge slots.\n");
+ return removeAllMarkers();
+ }
+
+ for (unsigned i=0; i < NumSlots; ++i) {
+ std::unique_ptr<LiveInterval> LI(new LiveInterval(i, 0));
+ LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
+ Intervals.push_back(std::move(LI));
+ SortedSlots.push_back(i);
+ }
+
+ // Calculate the liveness of each block.
+ calculateLocalLiveness();
+ LLVM_DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n");
+ LLVM_DEBUG(dump());
+
+ // Propagate the liveness information.
+ calculateLiveIntervals(NumSlots);
+ LLVM_DEBUG(dumpIntervals());
+
+ // Search for allocas which are used outside of the declared lifetime
+ // markers.
+ if (ProtectFromEscapedAllocas)
+ removeInvalidSlotRanges();
+
+ // Maps old slots to new slots.
+ DenseMap<int, int> SlotRemap;
+ unsigned RemovedSlots = 0;
+ unsigned ReducedSize = 0;
+
+ // Do not bother looking at empty intervals.
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ if (Intervals[SortedSlots[I]]->empty())
+ SortedSlots[I] = -1;
+ }
+
+ // This is a simple greedy algorithm for merging allocas. First, sort the
+ // slots, placing the largest slots first. Next, perform an n^2 scan and look
+ // for disjoint slots. When you find disjoint slots, merge the smaller one
+ // into the bigger one and update the live interval. Remove the small alloca
+ // and continue.
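+  // For example, if a 32-byte slot and an 8-byte slot have disjoint live
+  // ranges, the 8-byte slot is merged into the 32-byte slot and the frame
+  // shrinks by 8 bytes.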
+
+ // Sort the slots according to their size. Place unused slots at the end.
+ // Use stable sort to guarantee deterministic code generation.
+ llvm::stable_sort(SortedSlots, [this](int LHS, int RHS) {
+    // We use -1 to denote an uninteresting slot. Place these slots at the end.
+ if (LHS == -1)
+ return false;
+ if (RHS == -1)
+ return true;
+ // Sort according to size.
+ return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
+ });
+
+ for (auto &s : LiveStarts)
+ llvm::sort(s);
+
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ if (SortedSlots[I] == -1)
+ continue;
+
+ for (unsigned J=I+1; J < NumSlots; ++J) {
+ if (SortedSlots[J] == -1)
+ continue;
+
+ int FirstSlot = SortedSlots[I];
+ int SecondSlot = SortedSlots[J];
+
+ // Objects with different stack IDs cannot be merged.
+ if (MFI->getStackID(FirstSlot) != MFI->getStackID(SecondSlot))
+ continue;
+
+ LiveInterval *First = &*Intervals[FirstSlot];
+ LiveInterval *Second = &*Intervals[SecondSlot];
+ auto &FirstS = LiveStarts[FirstSlot];
+ auto &SecondS = LiveStarts[SecondSlot];
+ assert(!First->empty() && !Second->empty() && "Found an empty range");
+
+ // Merge disjoint slots. This is a little bit tricky - see the
+ // Implementation Notes section for an explanation.
+ if (!First->isLiveAtIndexes(SecondS) &&
+ !Second->isLiveAtIndexes(FirstS)) {
+ Changed = true;
+ First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
+
+ int OldSize = FirstS.size();
+ FirstS.append(SecondS.begin(), SecondS.end());
+ auto Mid = FirstS.begin() + OldSize;
+ std::inplace_merge(FirstS.begin(), Mid, FirstS.end());
+
+ SlotRemap[SecondSlot] = FirstSlot;
+ SortedSlots[J] = -1;
+          LLVM_DEBUG(dbgs() << "Merging #" << FirstSlot << " and slot #"
+                            << SecondSlot << " together.\n");
+ Align MaxAlignment = std::max(MFI->getObjectAlign(FirstSlot),
+ MFI->getObjectAlign(SecondSlot));
+
+ assert(MFI->getObjectSize(FirstSlot) >=
+ MFI->getObjectSize(SecondSlot) &&
+ "Merging a small object into a larger one");
+
+ RemovedSlots+=1;
+ ReducedSize += MFI->getObjectSize(SecondSlot);
+ MFI->setObjectAlignment(FirstSlot, MaxAlignment);
+ MFI->RemoveStackObject(SecondSlot);
+ }
+ }
+ }
+ }// While changed.
+
+ // Record statistics.
+ StackSpaceSaved += ReducedSize;
+ StackSlotMerged += RemovedSlots;
+  LLVM_DEBUG(dbgs() << "Merged " << RemovedSlots << " slots. Saved "
+ << ReducedSize << " bytes\n");
+
+ // Scan the entire function and update all machine operands that use frame
+ // indices to use the remapped frame index.
+ expungeSlotMap(SlotRemap, NumSlots);
+ remapInstructions(SlotRemap);
+
+ return removeAllMarkers();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
new file mode 100644
index 000000000000..5d3903ed84ce
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -0,0 +1,254 @@
+//===-- StackFrameLayoutAnalysisPass.cpp
+//------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// StackFrameLayoutAnalysisPass implementation. Outputs information about the
+// layout of the stack frame, using the remarks interface. On the CLI it prints
+// a textual representation of the stack frame. When possible it prints the
+// values that occupy a stack slot using any available debug information. Since
+// output is remarks based, it is also available in a machine readable file
+// format, such as YAML.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/PrintPasses.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stack-frame-layout"
+
+namespace {
+
+/// StackFrameLayoutAnalysisPass - This is a pass to dump the stack frame of a
+/// MachineFunction.
+///
+struct StackFrameLayoutAnalysisPass : public MachineFunctionPass {
+ using SlotDbgMap = SmallDenseMap<int, SetVector<const DILocalVariable *>>;
+ static char ID;
+
+ enum SlotType {
+ Spill, // a Spill slot
+ StackProtector, // Stack Protector slot
+    Variable, // a slot used to store local data (could be a temporary)
+ Invalid // It's an error for a slot to have this type
+ };
+
+ struct SlotData {
+ int Slot;
+ int Size;
+ int Align;
+ int Offset;
+ SlotType SlotTy;
+
+ SlotData(const MachineFrameInfo &MFI, const int ValOffset, const int Idx)
+ : Slot(Idx), Size(MFI.getObjectSize(Idx)),
+ Align(MFI.getObjectAlign(Idx).value()),
+ Offset(MFI.getObjectOffset(Idx) - ValOffset), SlotTy(Invalid) {
+ if (MFI.isSpillSlotObjectIndex(Idx))
+ SlotTy = SlotType::Spill;
+ else if (Idx == MFI.getStackProtectorIndex())
+ SlotTy = SlotType::StackProtector;
+ else
+ SlotTy = SlotType::Variable;
+ }
+
+    // We use this to sort in reverse order, so that the layout is displayed
+    // correctly.
+ bool operator<(const SlotData &Rhs) const { return Offset > Rhs.Offset; }
+ };
+
+ StackFrameLayoutAnalysisPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Stack Frame Layout Analysis";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ // TODO: We should implement a similar filter for remarks:
+ // -Rpass-func-filter=<regex>
+ if (!isFunctionInPrintList(MF.getName()))
+ return false;
+
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(DEBUG_TYPE))
+ return false;
+
+ MachineOptimizationRemarkAnalysis Rem(DEBUG_TYPE, "StackLayout",
+ MF.getFunction().getSubprogram(),
+ &MF.front());
+ Rem << ("\nFunction: " + MF.getName()).str();
+ emitStackFrameLayoutRemarks(MF, Rem);
+ getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE().emit(Rem);
+ return false;
+ }
+
+ std::string getTypeString(SlotType Ty) {
+ switch (Ty) {
+ case SlotType::Spill:
+ return "Spill";
+ case SlotType::StackProtector:
+ return "Protector";
+ case SlotType::Variable:
+ return "Variable";
+ default:
+ llvm_unreachable("bad slot type for stack layout");
+ }
+ }
+
+ void emitStackSlotRemark(const MachineFunction &MF, const SlotData &D,
+ MachineOptimizationRemarkAnalysis &Rem) {
+ // To make it easy to understand the stack layout from the CLI, we want to
+ // print each slot like the following:
+ //
+ // Offset: [SP+8], Type: Spill, Align: 8, Size: 16
+ // foo @ /path/to/file.c:25
+ // bar @ /path/to/file.c:35
+ //
+ // Which prints the size, alignment, and offset from the SP at function
+ // entry.
+ //
+ // But we also want the machine readable remarks data to be nicely
+ // organized. So we print some additional data as strings for the CLI
+ // output, but maintain more structured data for the YAML.
+ //
+ // For example we store the Offset in YAML as:
+ // ...
+ // - Offset: -8
+ //
+ // But we print it to the CLI as
+ // Offset: [SP-8]
+
+ // Negative offsets will print a leading `-`, so only add `+`
+ std::string Prefix =
+ formatv("\nOffset: [SP{0}", (D.Offset < 0) ? "" : "+").str();
+ Rem << Prefix << ore::NV("Offset", D.Offset)
+ << "], Type: " << ore::NV("Type", getTypeString(D.SlotTy))
+ << ", Align: " << ore::NV("Align", D.Align)
+ << ", Size: " << ore::NV("Size", D.Size);
+ }
+
+ void emitSourceLocRemark(const MachineFunction &MF, const DILocalVariable *N,
+ MachineOptimizationRemarkAnalysis &Rem) {
+ std::string Loc =
+ formatv("{0} @ {1}:{2}", N->getName(), N->getFilename(), N->getLine())
+ .str();
+ Rem << "\n " << ore::NV("DataLoc", Loc);
+ }
+
+ void emitStackFrameLayoutRemarks(MachineFunction &MF,
+ MachineOptimizationRemarkAnalysis &Rem) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.hasStackObjects())
+ return;
+
+ // ValOffset is the offset to the local area from the SP at function entry.
+ // To display the true offset from SP, we need to subtract ValOffset from
+ // MFI's ObjectOffset.
+ const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering();
+ const int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
+
+ LLVM_DEBUG(dbgs() << "getStackProtectorIndex =="
+ << MFI.getStackProtectorIndex() << "\n");
+
+ std::vector<SlotData> SlotInfo;
+
+ const unsigned int NumObj = MFI.getNumObjects();
+ SlotInfo.reserve(NumObj);
+ // initialize slot info
+ for (int Idx = MFI.getObjectIndexBegin(), EndIdx = MFI.getObjectIndexEnd();
+ Idx != EndIdx; ++Idx) {
+ if (MFI.isDeadObjectIndex(Idx))
+ continue;
+ SlotInfo.emplace_back(MFI, ValOffset, Idx);
+ }
+
+    // Sort the slots to match the actual layout in memory.
+ llvm::sort(SlotInfo);
+
+ SlotDbgMap SlotMap = genSlotDbgMapping(MF);
+
+ for (const SlotData &Info : SlotInfo) {
+ emitStackSlotRemark(MF, Info, Rem);
+ for (const DILocalVariable *N : SlotMap[Info.Slot])
+ emitSourceLocRemark(MF, N, Rem);
+ }
+ }
+
+ // We need to generate a mapping of slots to the values that are stored to
+ // them. This information is lost by the time we need to print out the frame,
+ // so we reconstruct it here by walking the CFG, and generating the mapping.
+ SlotDbgMap genSlotDbgMapping(MachineFunction &MF) {
+ SlotDbgMap SlotDebugMap;
+
+ // add variables to the map
+ for (MachineFunction::VariableDbgInfo &DI :
+ MF.getInStackSlotVariableDbgInfo())
+ SlotDebugMap[DI.getStackSlot()].insert(DI.Var);
+
+ // Then add all the spills that have debug data
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineMemOperand *MO : MI.memoperands()) {
+ if (!MO->isStore())
+ continue;
+ auto *FI = dyn_cast_or_null<FixedStackPseudoSourceValue>(
+ MO->getPseudoValue());
+ if (!FI)
+ continue;
+ int FrameIdx = FI->getFrameIndex();
+ SmallVector<MachineInstr *> Dbg;
+ MI.collectDebugValues(Dbg);
+
+ for (MachineInstr *MI : Dbg)
+ SlotDebugMap[FrameIdx].insert(MI->getDebugVariable());
+ }
+ }
+ }
+
+ return SlotDebugMap;
+ }
+};
+
+char StackFrameLayoutAnalysisPass::ID = 0;
+} // namespace
+
+char &llvm::StackFrameLayoutAnalysisPassID = StackFrameLayoutAnalysisPass::ID;
+INITIALIZE_PASS(StackFrameLayoutAnalysisPass, "stack-frame-layout",
+ "Stack Frame Layout", false, false)
+
+namespace llvm {
+/// Returns a newly-created StackFrameLayout pass.
+MachineFunctionPass *createStackFrameLayoutAnalysisPass() {
+ return new StackFrameLayoutAnalysisPass();
+}
+
+} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/contrib/llvm-project/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
new file mode 100644
index 000000000000..778ac1f5701c
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -0,0 +1,171 @@
+//===-- StackMapLivenessAnalysis.cpp - StackMap live Out Analysis ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StackMap Liveness analysis pass. The pass calculates
+// the liveness for each basic block in a function and attaches the register
+// live-out information to a stackmap or patchpoint intrinsic if present.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stackmaps"
+
+static cl::opt<bool> EnablePatchPointLiveness(
+ "enable-patchpoint-liveness", cl::Hidden, cl::init(true),
+ cl::desc("Enable PatchPoint Liveness Analysis Pass"));
+
+STATISTIC(NumStackMapFuncVisited, "Number of functions visited");
+STATISTIC(NumStackMapFuncSkipped, "Number of functions skipped");
+STATISTIC(NumBBsVisited, "Number of basic blocks visited");
+STATISTIC(NumBBsHaveNoStackmap, "Number of basic blocks with no stackmap");
+STATISTIC(NumStackMaps, "Number of StackMaps visited");
+
+namespace {
+/// This pass calculates the liveness information for each basic block in
+/// a function and attaches the register live-out information to a patchpoint
+/// intrinsic if present.
+///
+/// This pass can be disabled via the -enable-patchpoint-liveness=false flag.
+/// The pass skips functions that don't have any patchpoint intrinsics. The
+/// information provided by this pass is optional and not required by the
+/// aforementioned intrinsic to function.
+class StackMapLiveness : public MachineFunctionPass {
+ const TargetRegisterInfo *TRI = nullptr;
+ LivePhysRegs LiveRegs;
+
+public:
+ static char ID;
+
+ /// Default construct and initialize the pass.
+ StackMapLiveness();
+
+ /// Tell the pass manager which passes we depend on and what
+ /// information we preserve.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ /// Calculate the liveness information for the given machine function.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// Performs the actual liveness calculation for the function.
+ bool calculateLiveness(MachineFunction &MF);
+
+ /// Add the current register live set to the instruction.
+ void addLiveOutSetToMI(MachineFunction &MF, MachineInstr &MI);
+
+ /// Create a register mask and initialize it with the registers from
+ /// the register live set.
+ uint32_t *createRegisterMask(MachineFunction &MF) const;
+};
+} // namespace
+
+char StackMapLiveness::ID = 0;
+char &llvm::StackMapLivenessID = StackMapLiveness::ID;
+INITIALIZE_PASS(StackMapLiveness, "stackmap-liveness",
+ "StackMap Liveness Analysis", false, false)
+
+/// Default construct and initialize the pass.
+StackMapLiveness::StackMapLiveness() : MachineFunctionPass(ID) {
+ initializeStackMapLivenessPass(*PassRegistry::getPassRegistry());
+}
+
+/// Tell the pass manager which passes we depend on and what information we
+/// preserve.
+void StackMapLiveness::getAnalysisUsage(AnalysisUsage &AU) const {
+ // We preserve all information.
+ AU.setPreservesAll();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Calculate the liveness information for the given machine function.
+bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
+ if (!EnablePatchPointLiveness)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: "
+ << MF.getName() << " **********\n");
+ TRI = MF.getSubtarget().getRegisterInfo();
+ ++NumStackMapFuncVisited;
+
+ // Skip this function if there are no patchpoints to process.
+ if (!MF.getFrameInfo().hasPatchPoint()) {
+ ++NumStackMapFuncSkipped;
+ return false;
+ }
+ return calculateLiveness(MF);
+}
+
+/// Performs the actual liveness calculation for the function.
+bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
+ bool HasChanged = false;
+ // For all basic blocks in the function.
+ for (auto &MBB : MF) {
+ LLVM_DEBUG(dbgs() << "****** BB " << MBB.getName() << " ******\n");
+ LiveRegs.init(*TRI);
+ // FIXME: This should probably be addLiveOuts().
+ LiveRegs.addLiveOutsNoPristines(MBB);
+ bool HasStackMap = false;
+ // Reverse iterate over all instructions and add the current live register
+ // set to an instruction if we encounter a patchpoint instruction.
+ for (MachineInstr &MI : llvm::reverse(MBB)) {
+ if (MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ addLiveOutSetToMI(MF, MI);
+ HasChanged = true;
+ HasStackMap = true;
+ ++NumStackMaps;
+ }
+ LLVM_DEBUG(dbgs() << " " << LiveRegs << " " << MI);
+ LiveRegs.stepBackward(MI);
+ }
+ ++NumBBsVisited;
+ if (!HasStackMap)
+ ++NumBBsHaveNoStackmap;
+ }
+ return HasChanged;
+}
+
+/// Add the current register live set to the instruction.
+void StackMapLiveness::addLiveOutSetToMI(MachineFunction &MF,
+ MachineInstr &MI) {
+ uint32_t *Mask = createRegisterMask(MF);
+ MachineOperand MO = MachineOperand::CreateRegLiveOut(Mask);
+ MI.addOperand(MF, MO);
+}
+
+/// Create a register mask and initialize it with the registers from the
+/// register live set.
+uint32_t *StackMapLiveness::createRegisterMask(MachineFunction &MF) const {
+ // The mask is owned and cleaned up by the Machine Function.
+ uint32_t *Mask = MF.allocateRegMask();
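+  // One bit per physical register: register Reg maps to bit (Reg % 32) of the
+  // 32-bit word at index (Reg / 32).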
+ for (auto Reg : LiveRegs)
+ Mask[Reg / 32] |= 1U << (Reg % 32);
+
+ // Give the target a chance to adjust the mask.
+ TRI->adjustStackMapLiveOutMask(Mask);
+
+ return Mask;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/StackMaps.cpp b/contrib/llvm-project/llvm/lib/CodeGen/StackMaps.cpp
new file mode 100644
index 000000000000..f9115e434878
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/StackMaps.cpp
@@ -0,0 +1,760 @@
+//===- StackMaps.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stackmaps"
+
+static cl::opt<int> StackMapVersion(
+ "stackmap-version", cl::init(3), cl::Hidden,
+ cl::desc("Specify the stackmap encoding version (default = 3)"));
+
+const char *StackMaps::WSMP = "Stack Maps: ";
+
+static uint64_t getConstMetaVal(const MachineInstr &MI, unsigned Idx) {
+ assert(MI.getOperand(Idx).isImm() &&
+ MI.getOperand(Idx).getImm() == StackMaps::ConstantOp);
+ const auto &MO = MI.getOperand(Idx + 1);
+ assert(MO.isImm());
+ return MO.getImm();
+}
+
+StackMapOpers::StackMapOpers(const MachineInstr *MI)
+ : MI(MI) {
+ assert(getVarIdx() <= MI->getNumOperands() &&
+ "invalid stackmap definition");
+}
+
+PatchPointOpers::PatchPointOpers(const MachineInstr *MI)
+ : MI(MI), HasDef(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
+ !MI->getOperand(0).isImplicit()) {
+#ifndef NDEBUG
+ unsigned CheckStartIdx = 0, e = MI->getNumOperands();
+ while (CheckStartIdx < e && MI->getOperand(CheckStartIdx).isReg() &&
+ MI->getOperand(CheckStartIdx).isDef() &&
+ !MI->getOperand(CheckStartIdx).isImplicit())
+ ++CheckStartIdx;
+
+ assert(getMetaIdx() == CheckStartIdx &&
+ "Unexpected additional definition in Patchpoint intrinsic.");
+#endif
+}
+
+unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const {
+ if (!StartIdx)
+ StartIdx = getVarIdx();
+
+ // Find the next scratch register (implicit def and early clobber)
+ unsigned ScratchIdx = StartIdx, e = MI->getNumOperands();
+ while (ScratchIdx < e &&
+ !(MI->getOperand(ScratchIdx).isReg() &&
+ MI->getOperand(ScratchIdx).isDef() &&
+ MI->getOperand(ScratchIdx).isImplicit() &&
+ MI->getOperand(ScratchIdx).isEarlyClobber()))
+ ++ScratchIdx;
+
+ assert(ScratchIdx != e && "No scratch register available");
+ return ScratchIdx;
+}
+
+unsigned StatepointOpers::getNumGcMapEntriesIdx() {
+ // Take index of num of allocas and skip all allocas records.
+ unsigned CurIdx = getNumAllocaIdx();
+ unsigned NumAllocas = getConstMetaVal(*MI, CurIdx - 1);
+ CurIdx++;
+ while (NumAllocas--)
+ CurIdx = StackMaps::getNextMetaArgIdx(MI, CurIdx);
+ return CurIdx + 1; // skip <StackMaps::ConstantOp>
+}
+
+unsigned StatepointOpers::getNumAllocaIdx() {
+ // Take index of num of gc ptrs and skip all gc ptr records.
+ unsigned CurIdx = getNumGCPtrIdx();
+ unsigned NumGCPtrs = getConstMetaVal(*MI, CurIdx - 1);
+ CurIdx++;
+ while (NumGCPtrs--)
+ CurIdx = StackMaps::getNextMetaArgIdx(MI, CurIdx);
+ return CurIdx + 1; // skip <StackMaps::ConstantOp>
+}
+
+unsigned StatepointOpers::getNumGCPtrIdx() {
+ // Take index of num of deopt args and skip all deopt records.
+ unsigned CurIdx = getNumDeoptArgsIdx();
+ unsigned NumDeoptArgs = getConstMetaVal(*MI, CurIdx - 1);
+ CurIdx++;
+ while (NumDeoptArgs--) {
+ CurIdx = StackMaps::getNextMetaArgIdx(MI, CurIdx);
+ }
+ return CurIdx + 1; // skip <StackMaps::ConstantOp>
+}
+
+int StatepointOpers::getFirstGCPtrIdx() {
+ unsigned NumGCPtrsIdx = getNumGCPtrIdx();
+ unsigned NumGCPtrs = getConstMetaVal(*MI, NumGCPtrsIdx - 1);
+ if (NumGCPtrs == 0)
+ return -1;
+ ++NumGCPtrsIdx; // skip <num gc ptrs>
+ assert(NumGCPtrsIdx < MI->getNumOperands());
+ return (int)NumGCPtrsIdx;
+}
+
+unsigned StatepointOpers::getGCPointerMap(
+ SmallVectorImpl<std::pair<unsigned, unsigned>> &GCMap) {
+ unsigned CurIdx = getNumGcMapEntriesIdx();
+ unsigned GCMapSize = getConstMetaVal(*MI, CurIdx - 1);
+ CurIdx++;
+ for (unsigned N = 0; N < GCMapSize; ++N) {
+ unsigned B = MI->getOperand(CurIdx++).getImm();
+ unsigned D = MI->getOperand(CurIdx++).getImm();
+ GCMap.push_back(std::make_pair(B, D));
+ }
+
+ return GCMapSize;
+}
+
+bool StatepointOpers::isFoldableReg(Register Reg) const {
+ unsigned FoldableAreaStart = getVarIdx();
+ for (const MachineOperand &MO : MI->uses()) {
+ if (MO.getOperandNo() >= FoldableAreaStart)
+ break;
+ if (MO.isReg() && MO.getReg() == Reg)
+ return false;
+ }
+ return true;
+}
+
+bool StatepointOpers::isFoldableReg(const MachineInstr *MI, Register Reg) {
+ if (MI->getOpcode() != TargetOpcode::STATEPOINT)
+ return false;
+ return StatepointOpers(MI).isFoldableReg(Reg);
+}
+
+StackMaps::StackMaps(AsmPrinter &AP) : AP(AP) {
+ if (StackMapVersion != 3)
+ llvm_unreachable("Unsupported stackmap version!");
+}
+
+unsigned StackMaps::getNextMetaArgIdx(const MachineInstr *MI, unsigned CurIdx) {
+ assert(CurIdx < MI->getNumOperands() && "Bad meta arg index");
+ const auto &MO = MI->getOperand(CurIdx);
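+  // A meta operand is a marker immediate followed by its payload:
+  // DirectMemRefOp is followed by <Reg, Offset>, IndirectMemRefOp by
+  // <Size, Reg, Offset>, and ConstantOp by a single immediate.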
+ if (MO.isImm()) {
+ switch (MO.getImm()) {
+ default:
+ llvm_unreachable("Unrecognized operand type.");
+ case StackMaps::DirectMemRefOp:
+ CurIdx += 2;
+ break;
+ case StackMaps::IndirectMemRefOp:
+ CurIdx += 3;
+ break;
+ case StackMaps::ConstantOp:
+ ++CurIdx;
+ break;
+ }
+ }
+ ++CurIdx;
+ assert(CurIdx < MI->getNumOperands() && "points past operand list");
+ return CurIdx;
+}
+
+/// Go up the super-register chain until we hit a valid dwarf register number.
+static unsigned getDwarfRegNum(unsigned Reg, const TargetRegisterInfo *TRI) {
+ int RegNum;
+ for (MCPhysReg SR : TRI->superregs_inclusive(Reg)) {
+ RegNum = TRI->getDwarfRegNum(SR, false);
+ if (RegNum >= 0)
+ break;
+ }
+
+ assert(RegNum >= 0 && "Invalid Dwarf register number.");
+ return (unsigned)RegNum;
+}
+
+MachineInstr::const_mop_iterator
+StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE, LocationVec &Locs,
+ LiveOutVec &LiveOuts) const {
+ const TargetRegisterInfo *TRI = AP.MF->getSubtarget().getRegisterInfo();
+ if (MOI->isImm()) {
+ switch (MOI->getImm()) {
+ default:
+ llvm_unreachable("Unrecognized operand type.");
+ case StackMaps::DirectMemRefOp: {
+ auto &DL = AP.MF->getDataLayout();
+
+ unsigned Size = DL.getPointerSizeInBits();
+ assert((Size % 8) == 0 && "Need pointer size in bytes.");
+ Size /= 8;
+ Register Reg = (++MOI)->getReg();
+ int64_t Imm = (++MOI)->getImm();
+ Locs.emplace_back(StackMaps::Location::Direct, Size,
+ getDwarfRegNum(Reg, TRI), Imm);
+ break;
+ }
+ case StackMaps::IndirectMemRefOp: {
+ int64_t Size = (++MOI)->getImm();
+ assert(Size > 0 && "Need a valid size for indirect memory locations.");
+ Register Reg = (++MOI)->getReg();
+ int64_t Imm = (++MOI)->getImm();
+ Locs.emplace_back(StackMaps::Location::Indirect, Size,
+ getDwarfRegNum(Reg, TRI), Imm);
+ break;
+ }
+ case StackMaps::ConstantOp: {
+ ++MOI;
+ assert(MOI->isImm() && "Expected constant operand.");
+ int64_t Imm = MOI->getImm();
+ Locs.emplace_back(Location::Constant, sizeof(int64_t), 0, Imm);
+ break;
+ }
+ }
+ return ++MOI;
+ }
+
+ // The physical register number will ultimately be encoded as a DWARF regno.
+ // The stack map also records the size of a spill slot that can hold the
+ // register content. (The runtime can track the actual size of the data type
+ // if it needs to.)
+ if (MOI->isReg()) {
+ // Skip implicit registers (this includes our scratch registers)
+ if (MOI->isImplicit())
+ return ++MOI;
+
+ if (MOI->isUndef()) {
+ // Record `undef` register as constant. Use same value as ISel uses.
+ Locs.emplace_back(Location::Constant, sizeof(int64_t), 0, 0xFEFEFEFE);
+ return ++MOI;
+ }
+
+ assert(MOI->getReg().isPhysical() &&
+ "Virtreg operands should have been rewritten before now.");
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(MOI->getReg());
+ assert(!MOI->getSubReg() && "Physical subreg still around.");
+
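+    // If the operand names a sub-register, record the byte offset of that
+    // sub-register within the register the DWARF number refers to.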
+ unsigned Offset = 0;
+ unsigned DwarfRegNum = getDwarfRegNum(MOI->getReg(), TRI);
+ unsigned LLVMRegNum = *TRI->getLLVMRegNum(DwarfRegNum, false);
+ unsigned SubRegIdx = TRI->getSubRegIndex(LLVMRegNum, MOI->getReg());
+ if (SubRegIdx)
+ Offset = TRI->getSubRegIdxOffset(SubRegIdx);
+
+ Locs.emplace_back(Location::Register, TRI->getSpillSize(*RC),
+ DwarfRegNum, Offset);
+ return ++MOI;
+ }
+
+ if (MOI->isRegLiveOut())
+ LiveOuts = parseRegisterLiveOutMask(MOI->getRegLiveOut());
+
+ return ++MOI;
+}
+
+void StackMaps::print(raw_ostream &OS) {
+ const TargetRegisterInfo *TRI =
+ AP.MF ? AP.MF->getSubtarget().getRegisterInfo() : nullptr;
+ OS << WSMP << "callsites:\n";
+ for (const auto &CSI : CSInfos) {
+ const LocationVec &CSLocs = CSI.Locations;
+ const LiveOutVec &LiveOuts = CSI.LiveOuts;
+
+ OS << WSMP << "callsite " << CSI.ID << "\n";
+ OS << WSMP << " has " << CSLocs.size() << " locations\n";
+
+ unsigned Idx = 0;
+ for (const auto &Loc : CSLocs) {
+ OS << WSMP << "\t\tLoc " << Idx << ": ";
+ switch (Loc.Type) {
+ case Location::Unprocessed:
+ OS << "<Unprocessed operand>";
+ break;
+ case Location::Register:
+ OS << "Register ";
+ if (TRI)
+ OS << printReg(Loc.Reg, TRI);
+ else
+ OS << Loc.Reg;
+ break;
+ case Location::Direct:
+ OS << "Direct ";
+ if (TRI)
+ OS << printReg(Loc.Reg, TRI);
+ else
+ OS << Loc.Reg;
+ if (Loc.Offset)
+ OS << " + " << Loc.Offset;
+ break;
+ case Location::Indirect:
+ OS << "Indirect ";
+ if (TRI)
+ OS << printReg(Loc.Reg, TRI);
+ else
+ OS << Loc.Reg;
+ OS << "+" << Loc.Offset;
+ break;
+ case Location::Constant:
+ OS << "Constant " << Loc.Offset;
+ break;
+ case Location::ConstantIndex:
+ OS << "Constant Index " << Loc.Offset;
+ break;
+ }
+ OS << "\t[encoding: .byte " << Loc.Type << ", .byte 0"
+ << ", .short " << Loc.Size << ", .short " << Loc.Reg << ", .short 0"
+ << ", .int " << Loc.Offset << "]\n";
+ Idx++;
+ }
+
+ OS << WSMP << "\thas " << LiveOuts.size() << " live-out registers\n";
+
+ Idx = 0;
+ for (const auto &LO : LiveOuts) {
+ OS << WSMP << "\t\tLO " << Idx << ": ";
+ if (TRI)
+ OS << printReg(LO.Reg, TRI);
+ else
+ OS << LO.Reg;
+ OS << "\t[encoding: .short " << LO.DwarfRegNum << ", .byte 0, .byte "
+ << LO.Size << "]\n";
+ Idx++;
+ }
+ }
+}
+
+/// Create a live-out register record for the given register Reg.
+StackMaps::LiveOutReg
+StackMaps::createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const {
+ unsigned DwarfRegNum = getDwarfRegNum(Reg, TRI);
+ unsigned Size = TRI->getSpillSize(*TRI->getMinimalPhysRegClass(Reg));
+ return LiveOutReg(Reg, DwarfRegNum, Size);
+}
+
+/// Parse the register live-out mask and return a vector of live-out registers
+/// that need to be recorded in the stackmap.
+StackMaps::LiveOutVec
+StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
+ assert(Mask && "No register mask specified");
+ const TargetRegisterInfo *TRI = AP.MF->getSubtarget().getRegisterInfo();
+ LiveOutVec LiveOuts;
+
+ // Create a LiveOutReg for each bit that is set in the register mask.
+ for (unsigned Reg = 0, NumRegs = TRI->getNumRegs(); Reg != NumRegs; ++Reg)
+ if ((Mask[Reg / 32] >> (Reg % 32)) & 1)
+ LiveOuts.push_back(createLiveOutReg(Reg, TRI));
+
+ // We don't need to keep track of a register if its super-register is already
+ // in the list. Merge entries that refer to the same dwarf register and use
+ // the maximum size that needs to be spilled.
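+  // After sorting, entries sharing a dwarf register number are adjacent; the
+  // nested loop below folds each such run into one entry and zeroes the Reg
+  // field of the rest so they can be erased afterwards.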
+
+ llvm::sort(LiveOuts, [](const LiveOutReg &LHS, const LiveOutReg &RHS) {
+ // Only sort by the dwarf register number.
+ return LHS.DwarfRegNum < RHS.DwarfRegNum;
+ });
+
+ for (auto I = LiveOuts.begin(), E = LiveOuts.end(); I != E; ++I) {
+ for (auto *II = std::next(I); II != E; ++II) {
+ if (I->DwarfRegNum != II->DwarfRegNum) {
+ // Skip all the now invalid entries.
+ I = --II;
+ break;
+ }
+ I->Size = std::max(I->Size, II->Size);
+ if (I->Reg && TRI->isSuperRegister(I->Reg, II->Reg))
+ I->Reg = II->Reg;
+ II->Reg = 0; // mark for deletion.
+ }
+ }
+
+ llvm::erase_if(LiveOuts, [](const LiveOutReg &LO) { return LO.Reg == 0; });
+
+ return LiveOuts;
+}
+
+// See statepoint MI format description in StatepointOpers' class comment
+// in include/llvm/CodeGen/StackMaps.h
+void StackMaps::parseStatepointOpers(const MachineInstr &MI,
+ MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE,
+ LocationVec &Locations,
+ LiveOutVec &LiveOuts) {
+ LLVM_DEBUG(dbgs() << "record statepoint : " << MI << "\n");
+ StatepointOpers SO(&MI);
+ MOI = parseOperand(MOI, MOE, Locations, LiveOuts); // CC
+ MOI = parseOperand(MOI, MOE, Locations, LiveOuts); // Flags
+ MOI = parseOperand(MOI, MOE, Locations, LiveOuts); // Num Deopts
+
+ // Record Deopt Args.
+ unsigned NumDeoptArgs = Locations.back().Offset;
+ assert(Locations.back().Type == Location::Constant);
+ assert(NumDeoptArgs == SO.getNumDeoptArgs());
+
+ while (NumDeoptArgs--)
+ MOI = parseOperand(MOI, MOE, Locations, LiveOuts);
+
+ // Record gc base/derived pairs
+ assert(MOI->isImm() && MOI->getImm() == StackMaps::ConstantOp);
+ ++MOI;
+ assert(MOI->isImm());
+ unsigned NumGCPointers = MOI->getImm();
+ ++MOI;
+ if (NumGCPointers) {
+ // Map logical index of GC ptr to MI operand index.
+ SmallVector<unsigned, 8> GCPtrIndices;
+ unsigned GCPtrIdx = (unsigned)SO.getFirstGCPtrIdx();
+ assert((int)GCPtrIdx != -1);
+ assert(MOI - MI.operands_begin() == GCPtrIdx + 0LL);
+ while (NumGCPointers--) {
+ GCPtrIndices.push_back(GCPtrIdx);
+ GCPtrIdx = StackMaps::getNextMetaArgIdx(&MI, GCPtrIdx);
+ }
+
+ SmallVector<std::pair<unsigned, unsigned>, 8> GCPairs;
+ unsigned NumGCPairs = SO.getGCPointerMap(GCPairs);
+ (void)NumGCPairs;
+ LLVM_DEBUG(dbgs() << "NumGCPairs = " << NumGCPairs << "\n");
+
+ auto MOB = MI.operands_begin();
+ for (auto &P : GCPairs) {
+ assert(P.first < GCPtrIndices.size() && "base pointer index not found");
+ assert(P.second < GCPtrIndices.size() &&
+ "derived pointer index not found");
+ unsigned BaseIdx = GCPtrIndices[P.first];
+ unsigned DerivedIdx = GCPtrIndices[P.second];
+ LLVM_DEBUG(dbgs() << "Base : " << BaseIdx << " Derived : " << DerivedIdx
+ << "\n");
+ (void)parseOperand(MOB + BaseIdx, MOE, Locations, LiveOuts);
+ (void)parseOperand(MOB + DerivedIdx, MOE, Locations, LiveOuts);
+ }
+
+ MOI = MOB + GCPtrIdx;
+ }
+
+ // Record gc allocas
+ assert(MOI < MOE);
+ assert(MOI->isImm() && MOI->getImm() == StackMaps::ConstantOp);
+ ++MOI;
+ unsigned NumAllocas = MOI->getImm();
+ ++MOI;
+ while (NumAllocas--) {
+ MOI = parseOperand(MOI, MOE, Locations, LiveOuts);
+ assert(MOI < MOE);
+ }
+}
+
+void StackMaps::recordStackMapOpers(const MCSymbol &MILabel,
+ const MachineInstr &MI, uint64_t ID,
+ MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE,
+ bool recordResult) {
+ MCContext &OutContext = AP.OutStreamer->getContext();
+
+ LocationVec Locations;
+ LiveOutVec LiveOuts;
+
+ if (recordResult) {
+ assert(PatchPointOpers(&MI).hasDef() && "Stackmap has no return value.");
+ parseOperand(MI.operands_begin(), std::next(MI.operands_begin()), Locations,
+ LiveOuts);
+ }
+
+ // Parse operands.
+ if (MI.getOpcode() == TargetOpcode::STATEPOINT)
+ parseStatepointOpers(MI, MOI, MOE, Locations, LiveOuts);
+ else
+ while (MOI != MOE)
+ MOI = parseOperand(MOI, MOE, Locations, LiveOuts);
+
+ // Move large constants into the constant pool.
+ for (auto &Loc : Locations) {
+ // Constants are encoded as sign-extended integers.
+ // -1 is directly encoded as .long 0xFFFFFFFF with no constant pool.
+ if (Loc.Type == Location::Constant && !isInt<32>(Loc.Offset)) {
+ Loc.Type = Location::ConstantIndex;
+ // ConstPool is intentionally a MapVector of 'uint64_t's (as
+ // opposed to 'int64_t's). We should never be in a situation
+ // where we have to insert either the tombstone or the empty
+ // keys into a map, and for a DenseMap<uint64_t, T> these are
+ // (uint64_t)0 and (uint64_t)-1. They can be and are
+ // represented using 32 bit integers.
+ assert((uint64_t)Loc.Offset != DenseMapInfo<uint64_t>::getEmptyKey() &&
+ (uint64_t)Loc.Offset !=
+ DenseMapInfo<uint64_t>::getTombstoneKey() &&
+ "empty and tombstone keys should fit in 32 bits!");
+ auto Result = ConstPool.insert(std::make_pair(Loc.Offset, Loc.Offset));
+ Loc.Offset = Result.first - ConstPool.begin();
+ }
+ }
+
+ // Create an expression to calculate the offset of the callsite from function
+ // entry.
+ const MCExpr *CSOffsetExpr = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(&MILabel, OutContext),
+ MCSymbolRefExpr::create(AP.CurrentFnSymForSize, OutContext), OutContext);
+
+ CSInfos.emplace_back(CSOffsetExpr, ID, std::move(Locations),
+ std::move(LiveOuts));
+
+ // Record the stack size of the current function and update callsite count.
+ const MachineFrameInfo &MFI = AP.MF->getFrameInfo();
+ const TargetRegisterInfo *RegInfo = AP.MF->getSubtarget().getRegisterInfo();
+ bool HasDynamicFrameSize =
+ MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(*(AP.MF));
+ uint64_t FrameSize = HasDynamicFrameSize ? UINT64_MAX : MFI.getStackSize();
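+  // UINT64_MAX is recorded as a sentinel frame size when the frame has
+  // variable-sized objects or requires stack realignment, since its true size
+  // is not a compile-time constant.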
+
+ auto CurrentIt = FnInfos.find(AP.CurrentFnSym);
+ if (CurrentIt != FnInfos.end())
+ CurrentIt->second.RecordCount++;
+ else
+ FnInfos.insert(std::make_pair(AP.CurrentFnSym, FunctionInfo(FrameSize)));
+}
+
+void StackMaps::recordStackMap(const MCSymbol &L, const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::STACKMAP && "expected stackmap");
+
+ StackMapOpers opers(&MI);
+ const int64_t ID = MI.getOperand(PatchPointOpers::IDPos).getImm();
+ recordStackMapOpers(L, MI, ID, std::next(MI.operands_begin(),
+ opers.getVarIdx()),
+ MI.operands_end());
+}
+
+void StackMaps::recordPatchPoint(const MCSymbol &L, const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::PATCHPOINT && "expected patchpoint");
+
+ PatchPointOpers opers(&MI);
+ const int64_t ID = opers.getID();
+ auto MOI = std::next(MI.operands_begin(), opers.getStackMapStartIdx());
+ recordStackMapOpers(L, MI, ID, MOI, MI.operands_end(),
+ opers.isAnyReg() && opers.hasDef());
+
+#ifndef NDEBUG
+ // verify anyregcc
+ auto &Locations = CSInfos.back().Locations;
+ if (opers.isAnyReg()) {
+ unsigned NArgs = opers.getNumCallArgs();
+ for (unsigned i = 0, e = (opers.hasDef() ? NArgs + 1 : NArgs); i != e; ++i)
+ assert(Locations[i].Type == Location::Register &&
+ "anyreg arg must be in reg.");
+ }
+#endif
+}
+
+void StackMaps::recordStatepoint(const MCSymbol &L, const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint");
+
+ StatepointOpers opers(&MI);
+ const unsigned StartIdx = opers.getVarIdx();
+ recordStackMapOpers(L, MI, opers.getID(), MI.operands_begin() + StartIdx,
+ MI.operands_end(), false);
+}
+
+/// Emit the stackmap header.
+///
+/// Header {
+/// uint8 : Stack Map Version (currently 3)
+/// uint8 : Reserved (expected to be 0)
+/// uint16 : Reserved (expected to be 0)
+/// }
+/// uint32 : NumFunctions
+/// uint32 : NumConstants
+/// uint32 : NumRecords
+void StackMaps::emitStackmapHeader(MCStreamer &OS) {
+ // Header.
+ OS.emitIntValue(StackMapVersion, 1); // Version.
+ OS.emitIntValue(0, 1); // Reserved.
+ OS.emitInt16(0); // Reserved.
+
+ // Num functions.
+ LLVM_DEBUG(dbgs() << WSMP << "#functions = " << FnInfos.size() << '\n');
+ OS.emitInt32(FnInfos.size());
+ // Num constants.
+ LLVM_DEBUG(dbgs() << WSMP << "#constants = " << ConstPool.size() << '\n');
+ OS.emitInt32(ConstPool.size());
+ // Num callsites.
+ LLVM_DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << '\n');
+ OS.emitInt32(CSInfos.size());
+}
+
+/// Emit the function frame record for each function.
+///
+/// StkSizeRecord[NumFunctions] {
+/// uint64 : Function Address
+/// uint64 : Stack Size
+/// uint64 : Record Count
+/// }
+void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) {
+ // Function Frame records.
+ LLVM_DEBUG(dbgs() << WSMP << "functions:\n");
+ for (auto const &FR : FnInfos) {
+ LLVM_DEBUG(dbgs() << WSMP << "function addr: " << FR.first
+ << " frame size: " << FR.second.StackSize
+ << " callsite count: " << FR.second.RecordCount << '\n');
+ OS.emitSymbolValue(FR.first, 8);
+ OS.emitIntValue(FR.second.StackSize, 8);
+ OS.emitIntValue(FR.second.RecordCount, 8);
+ }
+}
+
+/// Emit the constant pool.
+///
+/// int64 : Constants[NumConstants]
+void StackMaps::emitConstantPoolEntries(MCStreamer &OS) {
+ // Constant pool entries.
+ LLVM_DEBUG(dbgs() << WSMP << "constants:\n");
+ for (const auto &ConstEntry : ConstPool) {
+ LLVM_DEBUG(dbgs() << WSMP << ConstEntry.second << '\n');
+ OS.emitIntValue(ConstEntry.second, 8);
+ }
+}
+
+/// Emit the callsite info for each callsite.
+///
+/// StkMapRecord[NumRecords] {
+/// uint64 : PatchPoint ID
+/// uint32 : Instruction Offset
+/// uint16 : Reserved (record flags)
+/// uint16 : NumLocations
+/// Location[NumLocations] {
+/// uint8 : Register | Direct | Indirect | Constant | ConstantIndex
+/// uint8 : Size in Bytes
+/// uint16 : Dwarf RegNum
+/// int32 : Offset
+/// }
+/// uint16 : Padding
+/// uint16 : NumLiveOuts
+/// LiveOuts[NumLiveOuts] {
+/// uint16 : Dwarf RegNum
+/// uint8 : Reserved
+/// uint8 : Size in Bytes
+/// }
+/// uint32 : Padding (only if required to align to 8 byte)
+/// }
+///
+/// Location Encoding, Type, Value:
+/// 0x1, Register, Reg (value in register)
+/// 0x2, Direct, Reg + Offset (frame index)
+/// 0x3, Indirect, [Reg + Offset] (spilled value)
+/// 0x4, Constant, Offset (small constant)
+/// 0x5, ConstIndex, Constants[Offset] (large constant)
+void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
+ LLVM_DEBUG(print(dbgs()));
+ // Callsite entries.
+ for (const auto &CSI : CSInfos) {
+ const LocationVec &CSLocs = CSI.Locations;
+ const LiveOutVec &LiveOuts = CSI.LiveOuts;
+
+ // Verify stack map entry. It's better to communicate a problem to the
+ // runtime than crash in case of in-process compilation. Currently, we do
+ // simple overflow checks, but we may eventually communicate other
+ // compilation errors this way.
+ if (CSLocs.size() > UINT16_MAX || LiveOuts.size() > UINT16_MAX) {
+ OS.emitIntValue(UINT64_MAX, 8); // Invalid ID.
+ OS.emitValue(CSI.CSOffsetExpr, 4);
+ OS.emitInt16(0); // Reserved.
+ OS.emitInt16(0); // 0 locations.
+ OS.emitInt16(0); // padding.
+ OS.emitInt16(0); // 0 live-out registers.
+ OS.emitInt32(0); // padding.
+ continue;
+ }
+
+ OS.emitIntValue(CSI.ID, 8);
+ OS.emitValue(CSI.CSOffsetExpr, 4);
+
+ // Reserved for flags.
+ OS.emitInt16(0);
+ OS.emitInt16(CSLocs.size());
+
+ for (const auto &Loc : CSLocs) {
+ OS.emitIntValue(Loc.Type, 1);
+ OS.emitIntValue(0, 1); // Reserved
+ OS.emitInt16(Loc.Size);
+ OS.emitInt16(Loc.Reg);
+ OS.emitInt16(0); // Reserved
+ OS.emitInt32(Loc.Offset);
+ }
+
+      // Emit alignment to 8 bytes.
+ OS.emitValueToAlignment(Align(8));
+
+ // Num live-out registers and padding to align to 4 byte.
+ OS.emitInt16(0);
+ OS.emitInt16(LiveOuts.size());
+
+ for (const auto &LO : LiveOuts) {
+ OS.emitInt16(LO.DwarfRegNum);
+ OS.emitIntValue(0, 1);
+ OS.emitIntValue(LO.Size, 1);
+ }
+ // Emit alignment to 8 byte.
+ OS.emitValueToAlignment(Align(8));
+ }
+}
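
For reference when reading the emission loop above, each Location record occupies 12 bytes and each LiveOut record 4 bytes before the trailing 8-byte alignment. The following is a minimal, illustrative C++ sketch of those layouts; the struct names are my own and not LLVM types, and on common ABIs these structs happen to have no padding (checked by the static_asserts), though a robust consumer should still read field by field with the target's endianness.

#include <cstdint>

// Illustrative mirror of the 12 bytes emitted per location above
// (Type, Reserved, Size, DwarfRegNum, Reserved, Offset).
struct StackMapLocation {
  uint8_t Type;        // 0x1 Register ... 0x5 ConstIndex
  uint8_t Reserved0;   // expected to be 0
  uint16_t SizeInBytes;
  uint16_t DwarfRegNum;
  uint16_t Reserved1;  // expected to be 0
  int32_t Offset;      // interpretation depends on Type (see table above)
};

// Illustrative mirror of the 4 bytes emitted per live-out register above.
struct StackMapLiveOut {
  uint16_t DwarfRegNum;
  uint8_t Reserved;
  uint8_t SizeInBytes;
};

static_assert(sizeof(StackMapLocation) == 12, "no padding on common ABIs");
static_assert(sizeof(StackMapLiveOut) == 4, "no padding on common ABIs");
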
+
+/// Serialize the stackmap data.
+void StackMaps::serializeToStackMapSection() {
+ (void)WSMP;
+ // Bail out if there's no stack map data.
+ assert((!CSInfos.empty() || ConstPool.empty()) &&
+ "Expected empty constant pool too!");
+ assert((!CSInfos.empty() || FnInfos.empty()) &&
+ "Expected empty function record too!");
+ if (CSInfos.empty())
+ return;
+
+ MCContext &OutContext = AP.OutStreamer->getContext();
+ MCStreamer &OS = *AP.OutStreamer;
+
+ // Create the section.
+ MCSection *StackMapSection =
+ OutContext.getObjectFileInfo()->getStackMapSection();
+ OS.switchSection(StackMapSection);
+
+ // Emit a dummy symbol to force section inclusion.
+ OS.emitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_StackMaps")));
+
+ // Serialize data.
+ LLVM_DEBUG(dbgs() << "********** Stack Map Output **********\n");
+ emitStackmapHeader(OS);
+ emitFunctionFrameRecords(OS);
+ emitConstantPoolEntries(OS);
+ emitCallsiteEntries(OS);
+ OS.addBlankLine();
+
+ // Clean up.
+ CSInfos.clear();
+ ConstPool.clear();
+}
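
Putting the pieces of this file together, the section payload is simply the three counts followed by the three arrays, in the order the emit* helpers are called above. The sketch below summarizes that order and mirrors the 24-byte function frame record; the struct name is illustrative rather than an LLVM type, and the version/reserved header bytes that precede the counts are emitted earlier in emitStackmapHeader (not shown in this hunk), so they are only assumed here.

#include <cstdint>

// Payload order written by serializeToStackMapSection via the helpers above:
//   uint32 NumFunctions, uint32 NumConstants, uint32 NumRecords,
//   StkSizeRecord[NumFunctions], int64 Constants[NumConstants],
//   StkMapRecord[NumRecords] (variable-length, 8-byte aligned).
// Illustrative mirror of the function frame record.
struct StkSizeRecord {
  uint64_t FunctionAddress; // OS.emitSymbolValue(FR.first, 8)
  uint64_t StackSize;       // FR.second.StackSize
  uint64_t RecordCount;     // FR.second.RecordCount
};
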
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm-project/llvm/lib/CodeGen/StackProtector.cpp
new file mode 100644
index 000000000000..387b653f8815
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/StackProtector.cpp
@@ -0,0 +1,660 @@
+//===- StackProtector.cpp - Stack Protector Insertion ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass inserts stack protectors into functions which need them. A variable
+// with a random value in it is stored onto the stack before the local variables
+// are allocated. Upon exiting the block, the stored value is checked. If it's
+// changed, then there was some sort of violation and the program aborts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <optional>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stack-protector"
+
+STATISTIC(NumFunProtected, "Number of functions protected");
+STATISTIC(NumAddrTaken, "Number of local variables that have their address"
+ " taken.");
+
+static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
+ cl::init(true), cl::Hidden);
+static cl::opt<bool> DisableCheckNoReturn("disable-check-noreturn-call",
+ cl::init(false), cl::Hidden);
+
+char StackProtector::ID = 0;
+
+StackProtector::StackProtector() : FunctionPass(ID) {
+ initializeStackProtectorPass(*PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(StackProtector, DEBUG_TYPE,
+ "Insert stack protectors", false, true)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(StackProtector, DEBUG_TYPE,
+ "Insert stack protectors", false, true)
+
+FunctionPass *llvm::createStackProtectorPass() { return new StackProtector(); }
+
+void StackProtector::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+bool StackProtector::runOnFunction(Function &Fn) {
+ F = &Fn;
+ M = F->getParent();
+ if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
+ DTU.emplace(DTWP->getDomTree(), DomTreeUpdater::UpdateStrategy::Lazy);
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ Trip = TM->getTargetTriple();
+ TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+ HasPrologue = false;
+ HasIRCheck = false;
+
+ SSPBufferSize = Fn.getFnAttributeAsParsedInteger(
+ "stack-protector-buffer-size", DefaultSSPBufferSize);
+ if (!requiresStackProtector(F, &Layout))
+ return false;
+
+ // TODO(etienneb): Functions with funclets are not correctly supported now.
+ // Do nothing if this is a funclet-based personality.
+ if (Fn.hasPersonalityFn()) {
+ EHPersonality Personality = classifyEHPersonality(Fn.getPersonalityFn());
+ if (isFuncletEHPersonality(Personality))
+ return false;
+ }
+
+ ++NumFunProtected;
+ bool Changed = InsertStackProtectors();
+#ifdef EXPENSIVE_CHECKS
+ assert((!DTU ||
+ DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full)) &&
+ "Failed to maintain validity of domtree!");
+#endif
+ DTU.reset();
+ return Changed;
+}
+
+/// \param [out] IsLarge is set to true if a protectable array is found and
+/// it is "large" ( >= ssp-buffer-size). In the case of a structure with
+/// multiple arrays, this gets set if any of them is large.
+static bool ContainsProtectableArray(Type *Ty, Module *M, unsigned SSPBufferSize,
+ bool &IsLarge, bool Strong,
+ bool InStruct) {
+ if (!Ty)
+ return false;
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ if (!AT->getElementType()->isIntegerTy(8)) {
+ // If we're on a non-Darwin platform or we're inside of a structure, don't
+ // add stack protectors unless the array is a character array.
+ // However, in strong mode any array, regardless of type and size,
+ // triggers a protector.
+ if (!Strong && (InStruct || !Triple(M->getTargetTriple()).isOSDarwin()))
+ return false;
+ }
+
+ // If an array has at least SSPBufferSize bytes of allocated space, then we
+ // emit stack protectors.
+ if (SSPBufferSize <= M->getDataLayout().getTypeAllocSize(AT)) {
+ IsLarge = true;
+ return true;
+ }
+
+ if (Strong)
+ // Require a protector for all arrays in strong mode
+ return true;
+ }
+
+ const StructType *ST = dyn_cast<StructType>(Ty);
+ if (!ST)
+ return false;
+
+ bool NeedsProtector = false;
+ for (Type *ET : ST->elements())
+ if (ContainsProtectableArray(ET, M, SSPBufferSize, IsLarge, Strong, true)) {
+ // If the element is a protectable array and is large (>= SSPBufferSize)
+ // then we are done. If the protectable array is not large, then
+ // keep looking in case a subsequent element is a large array.
+ if (IsLarge)
+ return true;
+ NeedsProtector = true;
+ }
+
+ return NeedsProtector;
+}
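
As a concrete illustration of this heuristic, consider the hypothetical locals below (assuming they are lowered to ordinary allocas of the corresponding types, and that the stack-protector-buffer-size threshold is no larger than 64 bytes); the comments restate how ContainsProtectableArray would classify each one.

// Hypothetical source-level locals and how ContainsProtectableArray would
// classify their alloca'd types (assuming a buffer-size threshold <= 64).
void example() {
  char big[64];              // char array >= threshold: protected, IsLarge.
  char tiny[2];              // small char array: protected only under Strong.
  int nums[64];              // non-char array: protected under Strong
                             // (and, when large, on Darwin targets).
  struct { char s[64]; } w;  // aggregate with a large char buffer: protected.
  (void)big; (void)tiny; (void)nums; (void)w;
}
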
+
+/// Check whether a stack allocation has its address taken.
+static bool HasAddressTaken(const Instruction *AI, TypeSize AllocSize,
+ Module *M,
+ SmallPtrSet<const PHINode *, 16> &VisitedPHIs) {
+ const DataLayout &DL = M->getDataLayout();
+ for (const User *U : AI->users()) {
+ const auto *I = cast<Instruction>(U);
+ // If this instruction accesses memory make sure it doesn't access beyond
+ // the bounds of the allocated object.
+ std::optional<MemoryLocation> MemLoc = MemoryLocation::getOrNone(I);
+ if (MemLoc && MemLoc->Size.hasValue() &&
+ !TypeSize::isKnownGE(AllocSize,
+ TypeSize::getFixed(MemLoc->Size.getValue())))
+ return true;
+ switch (I->getOpcode()) {
+ case Instruction::Store:
+ if (AI == cast<StoreInst>(I)->getValueOperand())
+ return true;
+ break;
+ case Instruction::AtomicCmpXchg:
+ // cmpxchg conceptually includes both a load and store from the same
+ // location. So, like store, the value being stored is what matters.
+ if (AI == cast<AtomicCmpXchgInst>(I)->getNewValOperand())
+ return true;
+ break;
+ case Instruction::PtrToInt:
+ if (AI == cast<PtrToIntInst>(I)->getOperand(0))
+ return true;
+ break;
+ case Instruction::Call: {
+ // Ignore intrinsics that do not become real instructions.
+ // TODO: Narrow this to intrinsics that have store-like effects.
+ const auto *CI = cast<CallInst>(I);
+ if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd())
+ return true;
+ break;
+ }
+ case Instruction::Invoke:
+ return true;
+ case Instruction::GetElementPtr: {
+ // If the GEP offset is out-of-bounds, or is non-constant and so has to be
+ // assumed to be potentially out-of-bounds, then any memory access that
+ // would use it could also be out-of-bounds meaning stack protection is
+ // required.
+ const GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ unsigned IndexSize = DL.getIndexTypeSizeInBits(I->getType());
+ APInt Offset(IndexSize, 0);
+ if (!GEP->accumulateConstantOffset(DL, Offset))
+ return true;
+ TypeSize OffsetSize = TypeSize::Fixed(Offset.getLimitedValue());
+ if (!TypeSize::isKnownGT(AllocSize, OffsetSize))
+ return true;
+ // Adjust AllocSize to be the space remaining after this offset.
+ // We can't subtract a fixed size from a scalable one, so in that case
+ // assume the scalable value is of minimum size.
+ TypeSize NewAllocSize =
+ TypeSize::Fixed(AllocSize.getKnownMinValue()) - OffsetSize;
+ if (HasAddressTaken(I, NewAllocSize, M, VisitedPHIs))
+ return true;
+ break;
+ }
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ if (HasAddressTaken(I, AllocSize, M, VisitedPHIs))
+ return true;
+ break;
+ case Instruction::PHI: {
+ // Keep track of what PHI nodes we have already visited to ensure
+ // they are only visited once.
+ const auto *PN = cast<PHINode>(I);
+ if (VisitedPHIs.insert(PN).second)
+ if (HasAddressTaken(PN, AllocSize, M, VisitedPHIs))
+ return true;
+ break;
+ }
+ case Instruction::Load:
+ case Instruction::AtomicRMW:
+ case Instruction::Ret:
+ // These instructions take an address operand, but have load-like or
+ // other innocuous behavior that should not trigger a stack protector.
+ // atomicrmw conceptually has both load and store semantics, but the
+ // value being stored must be integer; so if a pointer is being stored,
+ // we'll catch it in the PtrToInt case above.
+ break;
+ default:
+ // Conservatively return true for any instruction that takes an address
+ // operand, but is not handled above.
+ return true;
+ }
+ }
+ return false;
+}
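
For intuition about what this helper flags (it is only consulted under the strong heuristic in requiresStackProtector below), here is a hypothetical source-level example; the analysis itself walks the IR users of the alloca, so the comments describe the lowered form assumed for each local.

// Hypothetical illustration of HasAddressTaken: the alloca's address must be
// stored, passed to a real call, converted to an integer, etc.; plain loads
// and stores through the variable itself do not count.
void sink(int *);                // assumed external function
void example() {
  int a = 0, b = 0, c = 0;
  int *p = &a;                   // address of 'a' is stored: address taken.
  sink(&b);                      // address of 'b' passed to a call: taken.
  c = 42;                        // 'c' is only accessed directly: not taken.
  (void)p; (void)c;
}
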
+
+/// Search for the first call to the llvm.stackprotector intrinsic and return it
+/// if present.
+static const CallInst *findStackProtectorIntrinsic(Function &F) {
+ for (const BasicBlock &BB : F)
+ for (const Instruction &I : BB)
+ if (const auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::stackprotector)
+ return II;
+ return nullptr;
+}
+
+/// Check whether or not this function needs a stack protector based
+/// upon the stack protector level.
+///
+/// We use two heuristics: a standard (ssp) and strong (sspstrong).
+/// The standard heuristic will add a guard variable to functions that
+/// call alloca with either a variable size or a size >= SSPBufferSize,
+/// functions with character buffers larger than SSPBufferSize, and functions
+/// with aggregates containing character buffers larger than SSPBufferSize. The
+/// strong heuristic will add a guard variable to functions that call alloca
+/// regardless of size, functions with any buffer regardless of type and size,
+/// functions with aggregates that contain any buffer regardless of type and
+/// size, and functions that contain stack-based variables that have had their
+/// address taken.
+bool StackProtector::requiresStackProtector(Function *F, SSPLayoutMap *Layout) {
+ Module *M = F->getParent();
+ bool Strong = false;
+ bool NeedsProtector = false;
+
+ // The set of PHI nodes visited when determining if a variable's reference has
+ // been taken. This set is maintained to ensure we don't visit the same PHI
+ // node multiple times.
+ SmallPtrSet<const PHINode *, 16> VisitedPHIs;
+
+ unsigned SSPBufferSize = F->getFnAttributeAsParsedInteger(
+ "stack-protector-buffer-size", DefaultSSPBufferSize);
+
+ if (F->hasFnAttribute(Attribute::SafeStack))
+ return false;
+
+ // We are constructing the OptimizationRemarkEmitter on the fly rather than
+ // using the analysis pass to avoid building DominatorTree and LoopInfo which
+ // are not available this late in the IR pipeline.
+ OptimizationRemarkEmitter ORE(F);
+
+ if (F->hasFnAttribute(Attribute::StackProtectReq)) {
+ if (!Layout)
+ return true;
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a function attribute or command-line switch";
+ });
+ NeedsProtector = true;
+ Strong = true; // Use the same heuristic as strong to determine SSPLayout
+ } else if (F->hasFnAttribute(Attribute::StackProtectStrong))
+ Strong = true;
+ else if (!F->hasFnAttribute(Attribute::StackProtect))
+ return false;
+
+ for (const BasicBlock &BB : *F) {
+ for (const Instruction &I : BB) {
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ if (AI->isArrayAllocation()) {
+ auto RemarkBuilder = [&]() {
+ return OptimizationRemark(DEBUG_TYPE, "StackProtectorAllocaOrArray",
+ &I)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a call to alloca or use of a variable length "
+ "array";
+ };
+ if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
+ if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
+ // A call to alloca with size >= SSPBufferSize requires
+ // stack protectors.
+ if (!Layout)
+ return true;
+ Layout->insert(
+ std::make_pair(AI, MachineFrameInfo::SSPLK_LargeArray));
+ ORE.emit(RemarkBuilder);
+ NeedsProtector = true;
+ } else if (Strong) {
+ // Require protectors for all alloca calls in strong mode.
+ if (!Layout)
+ return true;
+ Layout->insert(
+ std::make_pair(AI, MachineFrameInfo::SSPLK_SmallArray));
+ ORE.emit(RemarkBuilder);
+ NeedsProtector = true;
+ }
+ } else {
+ // A call to alloca with a variable size requires protectors.
+ if (!Layout)
+ return true;
+ Layout->insert(
+ std::make_pair(AI, MachineFrameInfo::SSPLK_LargeArray));
+ ORE.emit(RemarkBuilder);
+ NeedsProtector = true;
+ }
+ continue;
+ }
+
+ bool IsLarge = false;
+ if (ContainsProtectableArray(AI->getAllocatedType(), M, SSPBufferSize,
+ IsLarge, Strong, false)) {
+ if (!Layout)
+ return true;
+ Layout->insert(std::make_pair(
+ AI, IsLarge ? MachineFrameInfo::SSPLK_LargeArray
+ : MachineFrameInfo::SSPLK_SmallArray));
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a stack allocated buffer or struct containing a "
+ "buffer";
+ });
+ NeedsProtector = true;
+ continue;
+ }
+
+ if (Strong &&
+ HasAddressTaken(
+ AI, M->getDataLayout().getTypeAllocSize(AI->getAllocatedType()),
+ M, VisitedPHIs)) {
+ ++NumAddrTaken;
+ if (!Layout)
+ return true;
+ Layout->insert(std::make_pair(AI, MachineFrameInfo::SSPLK_AddrOf));
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken",
+ &I)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to the address of a local variable being taken";
+ });
+ NeedsProtector = true;
+ }
+ // Clear any PHIs that we visited, to make sure we examine all uses of
+ // any subsequent allocas that we look at.
+ VisitedPHIs.clear();
+ }
+ }
+ }
+
+ return NeedsProtector;
+}
+
+/// Create a stack guard loading and populate whether SelectionDAG SSP is
+/// supported.
+static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M,
+ IRBuilder<> &B,
+ bool *SupportsSelectionDAGSP = nullptr) {
+ Value *Guard = TLI->getIRStackGuard(B);
+ StringRef GuardMode = M->getStackProtectorGuard();
+ if ((GuardMode == "tls" || GuardMode.empty()) && Guard)
+ return B.CreateLoad(B.getInt8PtrTy(), Guard, true, "StackGuard");
+
+ // Use SelectionDAG SSP handling, since there isn't an IR guard.
+ //
+ // This is somewhat awkward, because we optionally report here whether a
+ // SelectionDAG stack protector should be used. The reason is that this bit
+ // is strictly defined as !TLI->getIRStackGuard(B), and getIRStackGuard may
+ // mutate the IR. There is no way to compute the bit without mutating the
+ // IR, so it has to be computed at exactly this point.
+ //
+ // We could have defined a new function TLI::supportsSelectionDAGSP(), but
+ // that would put more burden on the backends' overriding work, especially
+ // when it conveys the same information getIRStackGuard() already gives.
+ if (SupportsSelectionDAGSP)
+ *SupportsSelectionDAGSP = true;
+ TLI->insertSSPDeclarations(*M);
+ return B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard));
+}
+
+/// Insert code into the entry block that stores the stack guard
+/// variable onto the stack:
+///
+/// entry:
+/// StackGuardSlot = alloca i8*
+/// StackGuard = <stack guard>
+/// call void @llvm.stackprotector(StackGuard, StackGuardSlot)
+///
+/// Returns true if the platform/triple supports the stackprotectorcreate pseudo
+/// node.
+static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc,
+ const TargetLoweringBase *TLI, AllocaInst *&AI) {
+ bool SupportsSelectionDAGSP = false;
+ IRBuilder<> B(&F->getEntryBlock().front());
+ PointerType *PtrTy = Type::getInt8PtrTy(CheckLoc->getContext());
+ AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot");
+
+ Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP);
+ B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
+ {GuardSlot, AI});
+ return SupportsSelectionDAGSP;
+}
+
+/// InsertStackProtectors - Insert code into the prologue and epilogue of the
+/// function.
+///
+/// - The prologue code loads and stores the stack guard onto the stack.
+/// - The epilogue checks the value stored in the prologue against the original
+/// value. It calls __stack_chk_fail if they differ.
+bool StackProtector::InsertStackProtectors() {
+ // If the target wants to XOR the frame pointer into the guard value, it's
+ // impossible to emit the check in IR, so the target *must* support stack
+ // protection in SDAG.
+ bool SupportsSelectionDAGSP =
+ TLI->useStackGuardXorFP() ||
+ (EnableSelectionDAGSP && !TM->Options.EnableFastISel);
+ AllocaInst *AI = nullptr; // Place on stack that stores the stack guard.
+ BasicBlock *FailBB = nullptr;
+
+ for (BasicBlock &BB : llvm::make_early_inc_range(*F)) {
+ // This is the stack-protector auto-generated check BB; skip it.
+ if (&BB == FailBB)
+ continue;
+ Instruction *CheckLoc = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!CheckLoc && !DisableCheckNoReturn)
+ for (auto &Inst : BB)
+ if (auto *CB = dyn_cast<CallBase>(&Inst))
+ // Do stack check before noreturn calls that aren't nounwind (e.g:
+ // __cxa_throw).
+ if (CB->doesNotReturn() && !CB->doesNotThrow()) {
+ CheckLoc = CB;
+ break;
+ }
+
+ if (!CheckLoc)
+ continue;
+
+ // Generate prologue instrumentation if not already generated.
+ if (!HasPrologue) {
+ HasPrologue = true;
+ SupportsSelectionDAGSP &= CreatePrologue(F, M, CheckLoc, TLI, AI);
+ }
+
+ // SelectionDAG based code generation. Nothing else needs to be done here.
+ // The epilogue instrumentation is postponed to SelectionDAG.
+ if (SupportsSelectionDAGSP)
+ break;
+
+ // Find the stack guard slot if the prologue was not created by this pass
+ // itself via a previous call to CreatePrologue().
+ if (!AI) {
+ const CallInst *SPCall = findStackProtectorIntrinsic(*F);
+ assert(SPCall && "Call to llvm.stackprotector is missing");
+ AI = cast<AllocaInst>(SPCall->getArgOperand(1));
+ }
+
+ // Set HasIRCheck to true, so that SelectionDAG will not generate its own
+ // version. SelectionDAG calls 'shouldEmitSDCheck' to check whether
+ // instrumentation has already been generated.
+ HasIRCheck = true;
+
+ // If we're instrumenting a block with a tail call, the check has to be
+ // inserted before the call rather than between it and the return. The
+ // verifier guarantees that a tail call is either directly before the
+ // return or with a single correct bitcast of the return value in between so
+ // we don't need to worry about many situations here.
+ Instruction *Prev = CheckLoc->getPrevNonDebugInstruction();
+ if (Prev && isa<CallInst>(Prev) && cast<CallInst>(Prev)->isTailCall())
+ CheckLoc = Prev;
+ else if (Prev) {
+ Prev = Prev->getPrevNonDebugInstruction();
+ if (Prev && isa<CallInst>(Prev) && cast<CallInst>(Prev)->isTailCall())
+ CheckLoc = Prev;
+ }
+
+ // Generate epilogue instrumentation. The epilogue instrumentation can be
+ // function-based or inlined depending on which mechanism the target is
+ // providing.
+ if (Function *GuardCheck = TLI->getSSPStackGuardCheck(*M)) {
+ // Generate the function-based epilogue instrumentation.
+ // The target provides a guard check function, generate a call to it.
+ IRBuilder<> B(CheckLoc);
+ LoadInst *Guard = B.CreateLoad(B.getInt8PtrTy(), AI, true, "Guard");
+ CallInst *Call = B.CreateCall(GuardCheck, {Guard});
+ Call->setAttributes(GuardCheck->getAttributes());
+ Call->setCallingConv(GuardCheck->getCallingConv());
+ } else {
+ // Generate the epilogue with inline instrumentation.
+ // If we do not support SelectionDAG based calls, generate IR level
+ // calls.
+ //
+ // For each block with a return instruction, convert this:
+ //
+ // return:
+ // ...
+ // ret ...
+ //
+ // into this:
+ //
+ // return:
+ // ...
+ // %1 = <stack guard>
+ // %2 = load StackGuardSlot
+ // %3 = icmp ne i1 %1, %2
+ // br i1 %3, label %CallStackCheckFailBlk, label %SP_return
+ //
+ // SP_return:
+ // ret ...
+ //
+ // CallStackCheckFailBlk:
+ // call void @__stack_chk_fail()
+ // unreachable
+
+ // Create the FailBB. We duplicate the BB every time since the MI tail
+ // merge pass will merge together all of the various BB into one including
+ // fail BB generated by the stack protector pseudo instruction.
+ if (!FailBB)
+ FailBB = CreateFailBB();
+
+ IRBuilder<> B(CheckLoc);
+ Value *Guard = getStackGuard(TLI, M, B);
+ LoadInst *LI2 = B.CreateLoad(B.getInt8PtrTy(), AI, true);
+ auto *Cmp = cast<ICmpInst>(B.CreateICmpNE(Guard, LI2));
+ auto SuccessProb =
+ BranchProbabilityInfo::getBranchProbStackProtector(true);
+ auto FailureProb =
+ BranchProbabilityInfo::getBranchProbStackProtector(false);
+ MDNode *Weights = MDBuilder(F->getContext())
+ .createBranchWeights(FailureProb.getNumerator(),
+ SuccessProb.getNumerator());
+
+ SplitBlockAndInsertIfThen(Cmp, CheckLoc,
+ /*Unreachable=*/false, Weights,
+ DTU ? &*DTU : nullptr,
+ /*LI=*/nullptr, /*ThenBlock=*/FailBB);
+
+ auto *BI = cast<BranchInst>(Cmp->getParent()->getTerminator());
+ BasicBlock *NewBB = BI->getSuccessor(1);
+ NewBB->setName("SP_return");
+ NewBB->moveAfter(&BB);
+
+ Cmp->setPredicate(Cmp->getInversePredicate());
+ BI->swapSuccessors();
+ }
+ }
+
+ // Return if we didn't modify any basic blocks. i.e., there are no return
+ // statements in the function.
+ return HasPrologue;
+}
+
+/// CreateFailBB - Create a basic block to jump to when the stack protector
+/// check fails.
+BasicBlock *StackProtector::CreateFailBB() {
+ LLVMContext &Context = F->getContext();
+ BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F);
+ IRBuilder<> B(FailBB);
+ if (F->getSubprogram())
+ B.SetCurrentDebugLocation(
+ DILocation::get(Context, 0, 0, F->getSubprogram()));
+ FunctionCallee StackChkFail;
+ SmallVector<Value *, 1> Args;
+ if (Trip.isOSOpenBSD()) {
+ StackChkFail = M->getOrInsertFunction("__stack_smash_handler",
+ Type::getVoidTy(Context),
+ Type::getInt8PtrTy(Context));
+ Args.push_back(B.CreateGlobalStringPtr(F->getName(), "SSH"));
+ } else {
+ StackChkFail =
+ M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context));
+ }
+ cast<Function>(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn);
+ B.CreateCall(StackChkFail, Args);
+ B.CreateUnreachable();
+ return FailBB;
+}
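
The failure path therefore resolves to one of two runtime entry points. In C++ terms their expected shapes are roughly the following; this is a sketch of the contract implied by the code above (the pass marks the callee noreturn and follows the call with unreachable), not a declaration copied from any particular C library.

// Runtime hooks targeted by CreateFailBB (sketch of the implied contract).
extern "C" {
[[noreturn]] void __stack_chk_fail(void);              // default target
[[noreturn]] void __stack_smash_handler(const char *); // OpenBSD; receives the
                                                       // function name string
}
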
+
+bool StackProtector::shouldEmitSDCheck(const BasicBlock &BB) const {
+ return HasPrologue && !HasIRCheck && isa<ReturnInst>(BB.getTerminator());
+}
+
+void StackProtector::copyToMachineFrameInfo(MachineFrameInfo &MFI) const {
+ if (Layout.empty())
+ return;
+
+ for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ const AllocaInst *AI = MFI.getObjectAllocation(I);
+ if (!AI)
+ continue;
+
+ SSPLayoutMap::const_iterator LI = Layout.find(AI);
+ if (LI == Layout.end())
+ continue;
+
+ MFI.setObjectSSPLayout(I, LI->second);
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp
new file mode 100644
index 000000000000..6d933ab12041
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -0,0 +1,550 @@
+//===- StackSlotColoring.cpp - Stack slot coloring pass. ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the stack slot coloring pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stack-slot-coloring"
+
+static cl::opt<bool>
+DisableSharing("no-stack-slot-sharing",
+ cl::init(false), cl::Hidden,
+ cl::desc("Suppress slot sharing during stack coloring"));
+
+static cl::opt<int> DCELimit("ssc-dce-limit", cl::init(-1), cl::Hidden);
+
+STATISTIC(NumEliminated, "Number of stack slots eliminated due to coloring");
+STATISTIC(NumDead, "Number of trivially dead stack accesses eliminated");
+
+namespace {
+
+ class StackSlotColoring : public MachineFunctionPass {
+ LiveStacks *LS = nullptr;
+ MachineFrameInfo *MFI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const MachineBlockFrequencyInfo *MBFI = nullptr;
+
+ // SSIntervals - Spill slot intervals.
+ std::vector<LiveInterval*> SSIntervals;
+
+ // SSRefs - Keep a list of MachineMemOperands for each spill slot.
+ // MachineMemOperands can be shared between instructions, so we need
+ // to be careful that renames like [FI0, FI1] -> [FI1, FI2] do not
+ // become FI0 -> FI1 -> FI2.
+ SmallVector<SmallVector<MachineMemOperand *, 8>, 16> SSRefs;
+
+ // OrigAlignments - Alignments of stack objects before coloring.
+ SmallVector<Align, 16> OrigAlignments;
+
+ // OrigSizes - Sizes of stack objects before coloring.
+ SmallVector<unsigned, 16> OrigSizes;
+
+ // AllColors - If index is set, it's a spill slot, i.e. color.
+ // FIXME: This assumes PEI locates spill slots with smaller indices
+ // closest to the stack pointer / frame pointer. Therefore, a smaller
+ // index == a better color. This is per stack ID.
+ SmallVector<BitVector, 2> AllColors;
+
+ // NextColor - Next "color" that's not yet used. This is per stack ID.
+ SmallVector<int, 2> NextColors = { -1 };
+
+ // UsedColors - "Colors" that have been assigned. This is per stack ID
+ SmallVector<BitVector, 2> UsedColors;
+
+ // Join all intervals sharing one color into a single LiveIntervalUnion to
+ // speedup range overlap test.
+ class ColorAssignmentInfo {
+ // Single liverange (used to avoid creation of LiveIntervalUnion).
+ LiveInterval *SingleLI = nullptr;
+ // LiveIntervalUnion to perform overlap test.
+ LiveIntervalUnion *LIU = nullptr;
+ // LiveIntervalUnion's constructor takes an allocator parameter, so we
+ // construct it lazily with placement new into LIUPad ("dirty magic").
+ uint8_t LIUPad[sizeof(LiveIntervalUnion)];
+
+ public:
+ ~ColorAssignmentInfo() {
+ if (LIU)
+ LIU->~LiveIntervalUnion(); // Dirty magic again.
+ }
+
+ // Return true if LiveInterval overlaps with any
+ // intervals that have already been assigned to this color.
+ bool overlaps(LiveInterval *LI) const {
+ if (LIU)
+ return LiveIntervalUnion::Query(*LI, *LIU).checkInterference();
+ return SingleLI ? SingleLI->overlaps(*LI) : false;
+ }
+
+ // Add new LiveInterval to this color.
+ void add(LiveInterval *LI, LiveIntervalUnion::Allocator &Alloc) {
+ assert(!overlaps(LI));
+ if (LIU) {
+ LIU->unify(*LI, *LI);
+ } else if (SingleLI) {
+ LIU = new (LIUPad) LiveIntervalUnion(Alloc);
+ LIU->unify(*SingleLI, *SingleLI);
+ LIU->unify(*LI, *LI);
+ SingleLI = nullptr;
+ } else
+ SingleLI = LI;
+ }
+ };
+
+ LiveIntervalUnion::Allocator LIUAlloc;
+
+ // Assignments - Color to intervals mapping.
+ SmallVector<ColorAssignmentInfo, 16> Assignments;
+
+ public:
+ static char ID; // Pass identification
+
+ StackSlotColoring() : MachineFunctionPass(ID) {
+ initializeStackSlotColoringPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveStacks>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ void InitializeSlots();
+ void ScanForSpillSlotRefs(MachineFunction &MF);
+ int ColorSlot(LiveInterval *li);
+ bool ColorSlots(MachineFunction &MF);
+ void RewriteInstruction(MachineInstr &MI, SmallVectorImpl<int> &SlotMapping,
+ MachineFunction &MF);
+ bool RemoveDeadStores(MachineBasicBlock* MBB);
+ };
+
+} // end anonymous namespace
+
+char StackSlotColoring::ID = 0;
+
+char &llvm::StackSlotColoringID = StackSlotColoring::ID;
+
+INITIALIZE_PASS_BEGIN(StackSlotColoring, DEBUG_TYPE,
+ "Stack Slot Coloring", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(StackSlotColoring, DEBUG_TYPE,
+ "Stack Slot Coloring", false, false)
+
+namespace {
+
+// IntervalSorter - Comparison predicate that sort live intervals by
+// their weight.
+struct IntervalSorter {
+ bool operator()(LiveInterval* LHS, LiveInterval* RHS) const {
+ return LHS->weight() > RHS->weight();
+ }
+};
+
+} // end anonymous namespace
+
+/// ScanForSpillSlotRefs - Scan all the machine instructions for spill slot
+/// references and update spill slot weights.
+void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
+ SSRefs.resize(MFI->getObjectIndexEnd());
+
+ // FIXME: Need the equivalent of MachineRegisterInfo for frameindex operands.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+ int FI = MO.getIndex();
+ if (FI < 0)
+ continue;
+ if (!LS->hasInterval(FI))
+ continue;
+ LiveInterval &li = LS->getInterval(FI);
+ if (!MI.isDebugInstr())
+ li.incrementWeight(
+ LiveIntervals::getSpillWeight(false, true, MBFI, MI));
+ }
+ for (MachineInstr::mmo_iterator MMOI = MI.memoperands_begin(),
+ EE = MI.memoperands_end();
+ MMOI != EE; ++MMOI) {
+ MachineMemOperand *MMO = *MMOI;
+ if (const FixedStackPseudoSourceValue *FSV =
+ dyn_cast_or_null<FixedStackPseudoSourceValue>(
+ MMO->getPseudoValue())) {
+ int FI = FSV->getFrameIndex();
+ if (FI >= 0)
+ SSRefs[FI].push_back(MMO);
+ }
+ }
+ }
+ }
+}
+
+/// InitializeSlots - Process all spill stack slot liveintervals and add them
+/// to a sorted (by weight) list.
+void StackSlotColoring::InitializeSlots() {
+ int LastFI = MFI->getObjectIndexEnd();
+
+ // There is always at least one stack ID.
+ AllColors.resize(1);
+ UsedColors.resize(1);
+
+ OrigAlignments.resize(LastFI);
+ OrigSizes.resize(LastFI);
+ AllColors[0].resize(LastFI);
+ UsedColors[0].resize(LastFI);
+ Assignments.resize(LastFI);
+
+ using Pair = std::iterator_traits<LiveStacks::iterator>::value_type;
+
+ SmallVector<Pair *, 16> Intervals;
+
+ Intervals.reserve(LS->getNumIntervals());
+ for (auto &I : *LS)
+ Intervals.push_back(&I);
+ llvm::sort(Intervals,
+ [](Pair *LHS, Pair *RHS) { return LHS->first < RHS->first; });
+
+ // Gather all spill slots into a list.
+ LLVM_DEBUG(dbgs() << "Spill slot intervals:\n");
+ for (auto *I : Intervals) {
+ LiveInterval &li = I->second;
+ LLVM_DEBUG(li.dump());
+ int FI = Register::stackSlot2Index(li.reg());
+ if (MFI->isDeadObjectIndex(FI))
+ continue;
+
+ SSIntervals.push_back(&li);
+ OrigAlignments[FI] = MFI->getObjectAlign(FI);
+ OrigSizes[FI] = MFI->getObjectSize(FI);
+
+ auto StackID = MFI->getStackID(FI);
+ if (StackID != 0) {
+ AllColors.resize(StackID + 1);
+ UsedColors.resize(StackID + 1);
+ AllColors[StackID].resize(LastFI);
+ UsedColors[StackID].resize(LastFI);
+ }
+
+ AllColors[StackID].set(FI);
+ }
+ LLVM_DEBUG(dbgs() << '\n');
+
+ // Sort them by weight.
+ llvm::stable_sort(SSIntervals, IntervalSorter());
+
+ NextColors.resize(AllColors.size());
+
+ // Get first "color".
+ for (unsigned I = 0, E = AllColors.size(); I != E; ++I)
+ NextColors[I] = AllColors[I].find_first();
+}
+
+/// ColorSlot - Assign a "color" (stack slot) to the specified stack slot.
+int StackSlotColoring::ColorSlot(LiveInterval *li) {
+ int Color = -1;
+ bool Share = false;
+ int FI = Register::stackSlot2Index(li->reg());
+ uint8_t StackID = MFI->getStackID(FI);
+
+ if (!DisableSharing) {
+
+ // Check if it's possible to reuse any of the used colors.
+ Color = UsedColors[StackID].find_first();
+ while (Color != -1) {
+ if (!Assignments[Color].overlaps(li)) {
+ Share = true;
+ ++NumEliminated;
+ break;
+ }
+ Color = UsedColors[StackID].find_next(Color);
+ }
+ }
+
+ if (Color != -1 && MFI->getStackID(Color) != MFI->getStackID(FI)) {
+ LLVM_DEBUG(dbgs() << "cannot share FIs with different stack IDs\n");
+ Share = false;
+ }
+
+ // Assign it to the first available color (assumed to be the best) if it's
+ // not possible to share a used color with other objects.
+ if (!Share) {
+ assert(NextColors[StackID] != -1 && "No more spill slots?");
+ Color = NextColors[StackID];
+ UsedColors[StackID].set(Color);
+ NextColors[StackID] = AllColors[StackID].find_next(NextColors[StackID]);
+ }
+
+ assert(MFI->getStackID(Color) == MFI->getStackID(FI));
+
+ // Record the assignment.
+ Assignments[Color].add(li, LIUAlloc);
+ LLVM_DEBUG(dbgs() << "Assigning fi#" << FI << " to fi#" << Color << "\n");
+
+ // Change size and alignment of the allocated slot. If there are multiple
+ // objects sharing the same slot, then make sure the size and alignment
+ // are large enough for all.
+ Align Alignment = OrigAlignments[FI];
+ if (!Share || Alignment > MFI->getObjectAlign(Color))
+ MFI->setObjectAlignment(Color, Alignment);
+ int64_t Size = OrigSizes[FI];
+ if (!Share || Size > MFI->getObjectSize(Color))
+ MFI->setObjectSize(Color, Size);
+ return Color;
+}
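
The assignment strategy above is essentially first-fit over intervals sorted by weight: reuse an already-used color whose assigned intervals do not overlap the candidate, otherwise open the next fresh color. A self-contained toy restatement of that idea follows; it ignores stack IDs, sizes and alignments, and uses a naive pairwise overlap test in place of LiveIntervalUnion, so it is a sketch of the approach rather than the pass itself.

#include <algorithm>
#include <cstddef>
#include <vector>

// Toy model: an interval is [Start, End) with a spill weight.
struct ToyInterval { unsigned Start, End; float Weight; };

static bool overlaps(const ToyInterval &A, const ToyInterval &B) {
  return A.Start < B.End && B.Start < A.End;
}

// Returns Colors[i] = slot assigned to Intervals[i], first-fit by weight.
std::vector<unsigned> colorSlots(std::vector<ToyInterval> Intervals) {
  std::vector<size_t> Order(Intervals.size());
  for (size_t i = 0; i != Order.size(); ++i) Order[i] = i;
  std::stable_sort(Order.begin(), Order.end(), [&](size_t L, size_t R) {
    return Intervals[L].Weight > Intervals[R].Weight; // heaviest first
  });

  std::vector<std::vector<ToyInterval>> Assigned; // intervals per color
  std::vector<unsigned> Colors(Intervals.size());
  for (size_t Idx : Order) {
    unsigned Color = 0;
    for (; Color != Assigned.size(); ++Color)
      if (std::none_of(Assigned[Color].begin(), Assigned[Color].end(),
                       [&](const ToyInterval &I) {
                         return overlaps(I, Intervals[Idx]);
                       }))
        break;                     // reuse this color
    if (Color == Assigned.size())
      Assigned.emplace_back();     // open a fresh color
    Assigned[Color].push_back(Intervals[Idx]);
    Colors[Idx] = Color;
  }
  return Colors;
}
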
+
+/// ColorSlots - Color all spill stack slots and rewrite all frameindex machine
+/// operands in the function.
+bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
+ unsigned NumObjs = MFI->getObjectIndexEnd();
+ SmallVector<int, 16> SlotMapping(NumObjs, -1);
+ SmallVector<float, 16> SlotWeights(NumObjs, 0.0);
+ SmallVector<SmallVector<int, 4>, 16> RevMap(NumObjs);
+ BitVector UsedColors(NumObjs);
+
+ LLVM_DEBUG(dbgs() << "Color spill slot intervals:\n");
+ bool Changed = false;
+ for (LiveInterval *li : SSIntervals) {
+ int SS = Register::stackSlot2Index(li->reg());
+ int NewSS = ColorSlot(li);
+ assert(NewSS >= 0 && "Stack coloring failed?");
+ SlotMapping[SS] = NewSS;
+ RevMap[NewSS].push_back(SS);
+ SlotWeights[NewSS] += li->weight();
+ UsedColors.set(NewSS);
+ Changed |= (SS != NewSS);
+ }
+
+ LLVM_DEBUG(dbgs() << "\nSpill slots after coloring:\n");
+ for (LiveInterval *li : SSIntervals) {
+ int SS = Register::stackSlot2Index(li->reg());
+ li->setWeight(SlotWeights[SS]);
+ }
+ // Sort them by new weight.
+ llvm::stable_sort(SSIntervals, IntervalSorter());
+
+#ifndef NDEBUG
+ for (LiveInterval *li : SSIntervals)
+ LLVM_DEBUG(li->dump());
+ LLVM_DEBUG(dbgs() << '\n');
+#endif
+
+ if (!Changed)
+ return false;
+
+ // Rewrite all MachineMemOperands.
+ for (unsigned SS = 0, SE = SSRefs.size(); SS != SE; ++SS) {
+ int NewFI = SlotMapping[SS];
+ if (NewFI == -1 || (NewFI == (int)SS))
+ continue;
+
+ const PseudoSourceValue *NewSV = MF.getPSVManager().getFixedStack(NewFI);
+ SmallVectorImpl<MachineMemOperand *> &RefMMOs = SSRefs[SS];
+ for (unsigned i = 0, e = RefMMOs.size(); i != e; ++i)
+ RefMMOs[i]->setValue(NewSV);
+ }
+
+ // Rewrite all MO_FrameIndex operands. Look for dead stores.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB)
+ RewriteInstruction(MI, SlotMapping, MF);
+ RemoveDeadStores(&MBB);
+ }
+
+ // Delete unused stack slots.
+ for (int StackID = 0, E = AllColors.size(); StackID != E; ++StackID) {
+ int NextColor = NextColors[StackID];
+ while (NextColor != -1) {
+ LLVM_DEBUG(dbgs() << "Removing unused stack object fi#" << NextColor << "\n");
+ MFI->RemoveStackObject(NextColor);
+ NextColor = AllColors[StackID].find_next(NextColor);
+ }
+ }
+
+ return true;
+}
+
+/// RewriteInstruction - Rewrite specified instruction by replacing references
+/// to old frame index with new one.
+void StackSlotColoring::RewriteInstruction(MachineInstr &MI,
+ SmallVectorImpl<int> &SlotMapping,
+ MachineFunction &MF) {
+ // Update the operands.
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+ int OldFI = MO.getIndex();
+ if (OldFI < 0)
+ continue;
+ int NewFI = SlotMapping[OldFI];
+ if (NewFI == -1 || NewFI == OldFI)
+ continue;
+
+ assert(MFI->getStackID(OldFI) == MFI->getStackID(NewFI));
+ MO.setIndex(NewFI);
+ }
+
+ // The MachineMemOperands have already been updated.
+}
+
+/// RemoveDeadStores - Scan through a basic block and look for loads followed
+/// by stores. If they're both using the same stack slot, then the store is
+/// definitely dead. This could obviously be much more aggressive (consider
+/// pairs with instructions between them), but such extensions might have a
+/// considerable compile time impact.
+bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
+ // FIXME: This could be much more aggressive, but we need to investigate
+ // the compile time impact of doing so.
+ bool changed = false;
+
+ SmallVector<MachineInstr*, 4> toErase;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ if (DCELimit != -1 && (int)NumDead >= DCELimit)
+ break;
+ int FirstSS, SecondSS;
+ if (TII->isStackSlotCopy(*I, FirstSS, SecondSS) && FirstSS == SecondSS &&
+ FirstSS != -1) {
+ ++NumDead;
+ changed = true;
+ toErase.push_back(&*I);
+ continue;
+ }
+
+ MachineBasicBlock::iterator NextMI = std::next(I);
+ MachineBasicBlock::iterator ProbableLoadMI = I;
+
+ unsigned LoadReg = 0;
+ unsigned StoreReg = 0;
+ unsigned LoadSize = 0;
+ unsigned StoreSize = 0;
+ if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS, LoadSize)))
+ continue;
+ // Skip the ...pseudo debugging... instructions between a load and store.
+ while ((NextMI != E) && NextMI->isDebugInstr()) {
+ ++NextMI;
+ ++I;
+ }
+ if (NextMI == E) continue;
+ if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS, StoreSize)))
+ continue;
+ if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1 ||
+ LoadSize != StoreSize)
+ continue;
+
+ ++NumDead;
+ changed = true;
+
+ if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) {
+ ++NumDead;
+ toErase.push_back(&*ProbableLoadMI);
+ }
+
+ toErase.push_back(&*NextMI);
+ ++I;
+ }
+
+ for (MachineInstr *MI : toErase)
+ MI->eraseFromParent();
+
+ return changed;
+}
+
+bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG({
+ dbgs() << "********** Stack Slot Coloring **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ MFI = &MF.getFrameInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ LS = &getAnalysis<LiveStacks>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+
+ bool Changed = false;
+
+ unsigned NumSlots = LS->getNumIntervals();
+ if (NumSlots == 0)
+ // Nothing to do!
+ return false;
+
+ // If there are calls to setjmp or sigsetjmp, don't perform stack slot
+ // coloring. The stack could be modified before the longjmp is executed,
+ // resulting in the wrong value being used afterwards. (See
+ // <rdar://problem/8007500>.)
+ if (MF.exposesReturnsTwice())
+ return false;
+
+ // Gather spill slot references
+ ScanForSpillSlotRefs(MF);
+ InitializeSlots();
+ Changed = ColorSlots(MF);
+
+ for (int &Next : NextColors)
+ Next = -1;
+
+ SSIntervals.clear();
+ for (unsigned i = 0, e = SSRefs.size(); i != e; ++i)
+ SSRefs[i].clear();
+ SSRefs.clear();
+ OrigAlignments.clear();
+ OrigSizes.clear();
+ AllColors.clear();
+ UsedColors.clear();
+ Assignments.clear();
+
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
new file mode 100644
index 000000000000..83a7063de112
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
@@ -0,0 +1,311 @@
+//===-- SwiftErrorValueTracking.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a limited mem2reg-like analysis to promote uses of function
+// arguments and allocas marked with swifterror from memory into virtual
+// registers tracked by this class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SwiftErrorValueTracking.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/Value.h"
+
+using namespace llvm;
+
+Register SwiftErrorValueTracking::getOrCreateVReg(const MachineBasicBlock *MBB,
+ const Value *Val) {
+ auto Key = std::make_pair(MBB, Val);
+ auto It = VRegDefMap.find(Key);
+ // If this is the first use of this swifterror value in this basic block,
+ // create a new virtual register.
+ // After we have processed all basic blocks, we will satisfy this "upwards exposed
+ // use" by inserting a copy or phi at the beginning of this block.
+ if (It == VRegDefMap.end()) {
+ auto &DL = MF->getDataLayout();
+ const TargetRegisterClass *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ auto VReg = MF->getRegInfo().createVirtualRegister(RC);
+ VRegDefMap[Key] = VReg;
+ VRegUpwardsUse[Key] = VReg;
+ return VReg;
+ } else
+ return It->second;
+}
+
+void SwiftErrorValueTracking::setCurrentVReg(const MachineBasicBlock *MBB,
+ const Value *Val, Register VReg) {
+ VRegDefMap[std::make_pair(MBB, Val)] = VReg;
+}
+
+Register SwiftErrorValueTracking::getOrCreateVRegDefAt(
+ const Instruction *I, const MachineBasicBlock *MBB, const Value *Val) {
+ auto Key = PointerIntPair<const Instruction *, 1, bool>(I, true);
+ auto It = VRegDefUses.find(Key);
+ if (It != VRegDefUses.end())
+ return It->second;
+
+ auto &DL = MF->getDataLayout();
+ const TargetRegisterClass *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ Register VReg = MF->getRegInfo().createVirtualRegister(RC);
+ VRegDefUses[Key] = VReg;
+ setCurrentVReg(MBB, Val, VReg);
+ return VReg;
+}
+
+Register SwiftErrorValueTracking::getOrCreateVRegUseAt(
+ const Instruction *I, const MachineBasicBlock *MBB, const Value *Val) {
+ auto Key = PointerIntPair<const Instruction *, 1, bool>(I, false);
+ auto It = VRegDefUses.find(Key);
+ if (It != VRegDefUses.end())
+ return It->second;
+
+ Register VReg = getOrCreateVReg(MBB, Val);
+ VRegDefUses[Key] = VReg;
+ return VReg;
+}
+
+/// Set up SwiftErrorVals by going through the function. If the function has
+/// a swifterror argument, it will be the first entry.
+void SwiftErrorValueTracking::setFunction(MachineFunction &mf) {
+ MF = &mf;
+ Fn = &MF->getFunction();
+ TLI = MF->getSubtarget().getTargetLowering();
+ TII = MF->getSubtarget().getInstrInfo();
+
+ if (!TLI->supportSwiftError())
+ return;
+
+ SwiftErrorVals.clear();
+ VRegDefMap.clear();
+ VRegUpwardsUse.clear();
+ VRegDefUses.clear();
+ SwiftErrorArg = nullptr;
+
+ // Check if function has a swifterror argument.
+ bool HaveSeenSwiftErrorArg = false;
+ for (Function::const_arg_iterator AI = Fn->arg_begin(), AE = Fn->arg_end();
+ AI != AE; ++AI)
+ if (AI->hasSwiftErrorAttr()) {
+ assert(!HaveSeenSwiftErrorArg &&
+ "Must have only one swifterror parameter");
+ (void)HaveSeenSwiftErrorArg; // silence warning.
+ HaveSeenSwiftErrorArg = true;
+ SwiftErrorArg = &*AI;
+ SwiftErrorVals.push_back(&*AI);
+ }
+
+ for (const auto &LLVMBB : *Fn)
+ for (const auto &Inst : LLVMBB) {
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&Inst))
+ if (Alloca->isSwiftError())
+ SwiftErrorVals.push_back(Alloca);
+ }
+}
+
+bool SwiftErrorValueTracking::createEntriesInEntryBlock(DebugLoc DbgLoc) {
+ if (!TLI->supportSwiftError())
+ return false;
+
+ // We only need to do this when we have swifterror parameter or swifterror
+ // alloc.
+ if (SwiftErrorVals.empty())
+ return false;
+
+ MachineBasicBlock *MBB = &*MF->begin();
+ auto &DL = MF->getDataLayout();
+ auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ bool Inserted = false;
+ for (const auto *SwiftErrorVal : SwiftErrorVals) {
+ // We will always generate a copy from the argument. It is always used at
+ // least by the 'return' of the swifterror.
+ if (SwiftErrorArg && SwiftErrorArg == SwiftErrorVal)
+ continue;
+ Register VReg = MF->getRegInfo().createVirtualRegister(RC);
+ // Assign Undef to Vreg. We construct MI directly to make sure it works
+ // with FastISel.
+ BuildMI(*MBB, MBB->getFirstNonPHI(), DbgLoc,
+ TII->get(TargetOpcode::IMPLICIT_DEF), VReg);
+
+ setCurrentVReg(MBB, SwiftErrorVal, VReg);
+ Inserted = true;
+ }
+
+ return Inserted;
+}
+
+/// Propagate swifterror values through the machine function CFG.
+void SwiftErrorValueTracking::propagateVRegs() {
+ if (!TLI->supportSwiftError())
+ return;
+
+ // We only need to do this when we have swifterror parameter or swifterror
+ // alloc.
+ if (SwiftErrorVals.empty())
+ return;
+
+ // For each machine basic block in reverse post order.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(MF);
+ for (MachineBasicBlock *MBB : RPOT) {
+ // For each swifterror value in the function.
+ for (const auto *SwiftErrorVal : SwiftErrorVals) {
+ auto Key = std::make_pair(MBB, SwiftErrorVal);
+ auto UUseIt = VRegUpwardsUse.find(Key);
+ auto VRegDefIt = VRegDefMap.find(Key);
+ bool UpwardsUse = UUseIt != VRegUpwardsUse.end();
+ Register UUseVReg = UpwardsUse ? UUseIt->second : Register();
+ bool DownwardDef = VRegDefIt != VRegDefMap.end();
+ assert(!(UpwardsUse && !DownwardDef) &&
+ "We can't have an upwards use but no downwards def");
+
+ // If there is no upwards exposed use and an entry for the swifterror in
+ // the def map for this value we don't need to do anything: We already
+ // have a downward def for this basic block.
+ if (!UpwardsUse && DownwardDef)
+ continue;
+
+ // Otherwise we either have an upwards exposed use vreg that we need to
+ // materialize or need to forward the downward def from predecessors.
+
+ // Check whether we have a single vreg def from all predecessors.
+ // Otherwise we need a phi.
+ SmallVector<std::pair<MachineBasicBlock *, Register>, 4> VRegs;
+ SmallSet<const MachineBasicBlock *, 8> Visited;
+ for (auto *Pred : MBB->predecessors()) {
+ if (!Visited.insert(Pred).second)
+ continue;
+ VRegs.push_back(std::make_pair(
+ Pred, getOrCreateVReg(Pred, SwiftErrorVal)));
+ if (Pred != MBB)
+ continue;
+ // We have a self-edge.
+ // If there was no upwards use in this basic block, there is now one: the
+ // phi needs to use itself.
+ if (!UpwardsUse) {
+ UpwardsUse = true;
+ UUseIt = VRegUpwardsUse.find(Key);
+ assert(UUseIt != VRegUpwardsUse.end());
+ UUseVReg = UUseIt->second;
+ }
+ }
+
+ // We need a phi node if we have more than one predecessor with different
+ // downward defs.
+ bool needPHI =
+ VRegs.size() >= 1 &&
+ llvm::any_of(
+ VRegs,
+ [&](const std::pair<const MachineBasicBlock *, Register> &V)
+ -> bool { return V.second != VRegs[0].second; });
+
+ // If there is no upwards exposed use and we don't need a phi, just
+ // forward the swifterror vreg from the predecessor(s).
+ if (!UpwardsUse && !needPHI) {
+ assert(!VRegs.empty() &&
+ "No predecessors? The entry block should bail out earlier");
+ // Just forward the swifterror vreg from the predecessor(s).
+ setCurrentVReg(MBB, SwiftErrorVal, VRegs[0].second);
+ continue;
+ }
+
+ auto DLoc = isa<Instruction>(SwiftErrorVal)
+ ? cast<Instruction>(SwiftErrorVal)->getDebugLoc()
+ : DebugLoc();
+ const auto *TII = MF->getSubtarget().getInstrInfo();
+
+ // If we don't need a phi create a copy to the upward exposed vreg.
+ if (!needPHI) {
+ assert(UpwardsUse);
+ assert(!VRegs.empty() &&
+ "No predecessors? Is the Calling Convention correct?");
+ Register DestReg = UUseVReg;
+ BuildMI(*MBB, MBB->getFirstNonPHI(), DLoc, TII->get(TargetOpcode::COPY),
+ DestReg)
+ .addReg(VRegs[0].second);
+ continue;
+ }
+
+ // We need a phi: if there is an upwards exposed use we already have a
+ // destination virtual register number otherwise we generate a new one.
+ auto &DL = MF->getDataLayout();
+ auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ Register PHIVReg =
+ UpwardsUse ? UUseVReg : MF->getRegInfo().createVirtualRegister(RC);
+ MachineInstrBuilder PHI =
+ BuildMI(*MBB, MBB->getFirstNonPHI(), DLoc,
+ TII->get(TargetOpcode::PHI), PHIVReg);
+ for (auto BBRegPair : VRegs) {
+ PHI.addReg(BBRegPair.second).addMBB(BBRegPair.first);
+ }
+
+ // We did not have a definition in this block before: store the phi's vreg
+ // as this block downward exposed def.
+ if (!UpwardsUse)
+ setCurrentVReg(MBB, SwiftErrorVal, PHIVReg);
+ }
+ }
+}
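
The core decision in the loop above, forwarding a single predecessor vreg versus placing a PHI because the predecessors disagree, boils down to a small predicate. A toy restatement follows; the name and types are illustrative, not LLVM API.

#include <vector>

// A PHI is needed only when the incoming swifterror vregs differ between
// predecessors; with zero or one distinct value a plain forward/copy suffices.
bool needsPhi(const std::vector<unsigned> &PredVRegs) {
  for (unsigned VReg : PredVRegs)   // loop body never runs for an empty list
    if (VReg != PredVRegs.front())
      return true;
  return false;
}
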
+
+void SwiftErrorValueTracking::preassignVRegs(
+ MachineBasicBlock *MBB, BasicBlock::const_iterator Begin,
+ BasicBlock::const_iterator End) {
+ if (!TLI->supportSwiftError() || SwiftErrorVals.empty())
+ return;
+
+ // Iterate over instructions and assign vregs to swifterror defs and uses.
+ for (auto It = Begin; It != End; ++It) {
+ if (auto *CB = dyn_cast<CallBase>(&*It)) {
+ // A call-site with a swifterror argument is both use and def.
+ const Value *SwiftErrorAddr = nullptr;
+ for (const auto &Arg : CB->args()) {
+ if (!Arg->isSwiftError())
+ continue;
+ // Use of swifterror.
+ assert(!SwiftErrorAddr && "Cannot have multiple swifterror arguments");
+ SwiftErrorAddr = &*Arg;
+ assert(SwiftErrorAddr->isSwiftError() &&
+ "Must have a swifterror value argument");
+ getOrCreateVRegUseAt(&*It, MBB, SwiftErrorAddr);
+ }
+ if (!SwiftErrorAddr)
+ continue;
+
+ // Def of swifterror.
+ getOrCreateVRegDefAt(&*It, MBB, SwiftErrorAddr);
+
+ // A load is a use.
+ } else if (const LoadInst *LI = dyn_cast<const LoadInst>(&*It)) {
+ const Value *V = LI->getOperand(0);
+ if (!V->isSwiftError())
+ continue;
+
+ getOrCreateVRegUseAt(LI, MBB, V);
+
+ // A store is a def.
+ } else if (const StoreInst *SI = dyn_cast<const StoreInst>(&*It)) {
+ const Value *SwiftErrorAddr = SI->getOperand(1);
+ if (!SwiftErrorAddr->isSwiftError())
+ continue;
+
+ // Def of swifterror.
+ getOrCreateVRegDefAt(&*It, MBB, SwiftErrorAddr);
+
+ // A return in a swifterror-returning function is a use.
+ } else if (const ReturnInst *R = dyn_cast<const ReturnInst>(&*It)) {
+ const Function *F = R->getParent()->getParent();
+ if (!F->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ continue;
+
+ getOrCreateVRegUseAt(R, MBB, SwiftErrorArg);
+ }
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/contrib/llvm-project/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
new file mode 100644
index 000000000000..36a02d5beb4b
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
@@ -0,0 +1,494 @@
+//===- SwitchLoweringUtils.cpp - Switch Lowering --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains switch inst lowering optimizations and utilities for
+// codegen, so that it can be used for both SelectionDAG and GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SwitchLoweringUtils.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace SwitchCG;
+
+uint64_t SwitchCG::getJumpTableRange(const CaseClusterVector &Clusters,
+ unsigned First, unsigned Last) {
+ assert(Last >= First);
+ const APInt &LowCase = Clusters[First].Low->getValue();
+ const APInt &HighCase = Clusters[Last].High->getValue();
+ assert(LowCase.getBitWidth() == HighCase.getBitWidth());
+
+ // FIXME: A range of consecutive cases has 100% density, but only requires one
+ // comparison to lower. We should discriminate against such consecutive ranges
+ // in jump tables.
+ return (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100) + 1;
+}
+
+uint64_t
+SwitchCG::getJumpTableNumCases(const SmallVectorImpl<unsigned> &TotalCases,
+ unsigned First, unsigned Last) {
+ assert(Last >= First);
+ assert(TotalCases[Last] >= TotalCases[First]);
+ uint64_t NumCases =
+ TotalCases[Last] - (First == 0 ? 0 : TotalCases[First - 1]);
+ return NumCases;
+}
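
As a concrete (made-up) example of these two quantities: clusters covering the case values 1-2, 5, and 9-10 give NumCases = 5 but Range = 10, so a jump table over them would be only 50% dense. The sketch below recomputes both numbers by hand rather than through the cluster/TotalCases machinery used by the pass.

#include <cassert>
#include <cstdint>

int main() {
  struct Cluster { int64_t Low, High; };
  const Cluster Clusters[] = {{1, 2}, {5, 5}, {9, 10}};
  uint64_t NumCases = 0;
  for (const Cluster &C : Clusters)
    NumCases += uint64_t(C.High - C.Low) + 1;           // 2 + 1 + 2 = 5
  uint64_t Range =
      uint64_t(Clusters[2].High - Clusters[0].Low) + 1; // 10 - 1 + 1 = 10
  assert(NumCases == 5 && Range == 10);                 // 50% dense
  (void)NumCases; (void)Range;
  return 0;
}
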
+
+void SwitchCG::SwitchLowering::findJumpTables(CaseClusterVector &Clusters,
+ const SwitchInst *SI,
+ MachineBasicBlock *DefaultMBB,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+#ifndef NDEBUG
+ // Clusters must be non-empty, sorted, and only contain Range clusters.
+ assert(!Clusters.empty());
+ for (CaseCluster &C : Clusters)
+ assert(C.Kind == CC_Range);
+ for (unsigned i = 1, e = Clusters.size(); i < e; ++i)
+ assert(Clusters[i - 1].High->getValue().slt(Clusters[i].Low->getValue()));
+#endif
+
+ assert(TLI && "TLI not set!");
+ if (!TLI->areJTsAllowed(SI->getParent()->getParent()))
+ return;
+
+ const unsigned MinJumpTableEntries = TLI->getMinimumJumpTableEntries();
+ const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2;
+
+ // Bail if not enough cases.
+ const int64_t N = Clusters.size();
+ if (N < 2 || N < MinJumpTableEntries)
+ return;
+
+ // Accumulated number of cases in each cluster and those prior to it.
+ SmallVector<unsigned, 8> TotalCases(N);
+ for (unsigned i = 0; i < N; ++i) {
+ const APInt &Hi = Clusters[i].High->getValue();
+ const APInt &Lo = Clusters[i].Low->getValue();
+ TotalCases[i] = (Hi - Lo).getLimitedValue() + 1;
+ if (i != 0)
+ TotalCases[i] += TotalCases[i - 1];
+ }
+
+ uint64_t Range = getJumpTableRange(Clusters, 0, N - 1);
+ uint64_t NumCases = getJumpTableNumCases(TotalCases, 0, N - 1);
+ assert(NumCases < UINT64_MAX / 100);
+ assert(Range >= NumCases);
+
+ // Cheap case: the whole range may be suitable for jump table.
+ if (TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI)) {
+ CaseCluster JTCluster;
+ if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {
+ Clusters[0] = JTCluster;
+ Clusters.resize(1);
+ return;
+ }
+ }
+
+ // The algorithm below is not suitable for -O0.
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ return;
+
+ // Split Clusters into minimum number of dense partitions. The algorithm uses
+ // the same idea as Kannan & Proebsting "Correction to 'Producing Good Code
+ // for the Case Statement'" (1994), but builds the MinPartitions array in
+ // reverse order to make it easier to reconstruct the partitions in ascending
+ // order. In the choice between two optimal partitionings, it picks the one
+ // which yields more jump tables.
+
+ // MinPartitions[i] is the minimum number of partitions of Clusters[i..N-1].
+ SmallVector<unsigned, 8> MinPartitions(N);
+ // LastElement[i] is the last element of the partition starting at i.
+ SmallVector<unsigned, 8> LastElement(N);
+ // PartitionsScore[i] is used to break ties when choosing between two
+ // partitionings resulting in the same number of partitions.
+ SmallVector<unsigned, 8> PartitionsScore(N);
+ // For PartitionsScore, a small number of comparisons is considered as good as
+ // a jump table and a single comparison is considered better than a jump
+ // table.
+ enum PartitionScores : unsigned {
+ NoTable = 0,
+ Table = 1,
+ FewCases = 1,
+ SingleCase = 2
+ };
+
+ // Base case: There is only one way to partition Clusters[N-1].
+ MinPartitions[N - 1] = 1;
+ LastElement[N - 1] = N - 1;
+ PartitionsScore[N - 1] = PartitionScores::SingleCase;
+
+ // Note: loop indexes are signed to avoid underflow.
+ for (int64_t i = N - 2; i >= 0; i--) {
+ // Find optimal partitioning of Clusters[i..N-1].
+ // Baseline: Put Clusters[i] into a partition on its own.
+ MinPartitions[i] = MinPartitions[i + 1] + 1;
+ LastElement[i] = i;
+ PartitionsScore[i] = PartitionsScore[i + 1] + PartitionScores::SingleCase;
+
+ // Search for a solution that results in fewer partitions.
+ for (int64_t j = N - 1; j > i; j--) {
+ // Try building a partition from Clusters[i..j].
+ Range = getJumpTableRange(Clusters, i, j);
+ NumCases = getJumpTableNumCases(TotalCases, i, j);
+ assert(NumCases < UINT64_MAX / 100);
+ assert(Range >= NumCases);
+
+ if (TLI->isSuitableForJumpTable(SI, NumCases, Range, PSI, BFI)) {
+ unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
+ unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1];
+ int64_t NumEntries = j - i + 1;
+
+ if (NumEntries == 1)
+ Score += PartitionScores::SingleCase;
+ else if (NumEntries <= SmallNumberOfEntries)
+ Score += PartitionScores::FewCases;
+ else if (NumEntries >= MinJumpTableEntries)
+ Score += PartitionScores::Table;
+
+ // If this leads to fewer partitions, or to the same number of
+ // partitions with better score, it is a better partitioning.
+ if (NumPartitions < MinPartitions[i] ||
+ (NumPartitions == MinPartitions[i] && Score > PartitionsScore[i])) {
+ MinPartitions[i] = NumPartitions;
+ LastElement[i] = j;
+ PartitionsScore[i] = Score;
+ }
+ }
+ }
+ }
+
+ // Iterate over the partitions, replacing some with jump tables in-place.
+ unsigned DstIndex = 0;
+ for (unsigned First = 0, Last; First < N; First = Last + 1) {
+ Last = LastElement[First];
+ assert(Last >= First);
+ assert(DstIndex <= First);
+ unsigned NumClusters = Last - First + 1;
+
+ CaseCluster JTCluster;
+ if (NumClusters >= MinJumpTableEntries &&
+ buildJumpTable(Clusters, First, Last, SI, DefaultMBB, JTCluster)) {
+ Clusters[DstIndex++] = JTCluster;
+ } else {
+ for (unsigned I = First; I <= Last; ++I)
+ std::memmove(&Clusters[DstIndex++], &Clusters[I], sizeof(Clusters[I]));
+ }
+ }
+ Clusters.resize(DstIndex);
+}
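+
+// Illustrative walk-through of findJumpTables with made-up values: given five
+// single-case clusters for {0, 1, 2, 3, 1000}, each targeting a distinct
+// block, and a target with MinJumpTableEntries = 4, the full range 0..1000 is
+// far too sparse, but the inner j-loop finds that Clusters[0..3] form a fully
+// dense partition, so MinPartitions[0] = 2 with LastElement[0] = 3 and the
+// final pass emits one jump-table cluster for 0..3 plus a plain range cluster
+// for {1000}.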
+
+bool SwitchCG::SwitchLowering::buildJumpTable(const CaseClusterVector &Clusters,
+ unsigned First, unsigned Last,
+ const SwitchInst *SI,
+ MachineBasicBlock *DefaultMBB,
+ CaseCluster &JTCluster) {
+ assert(First <= Last);
+
+ auto Prob = BranchProbability::getZero();
+ unsigned NumCmps = 0;
+ std::vector<MachineBasicBlock*> Table;
+ DenseMap<MachineBasicBlock*, BranchProbability> JTProbs;
+
+ // Initialize probabilities in JTProbs.
+ for (unsigned I = First; I <= Last; ++I)
+ JTProbs[Clusters[I].MBB] = BranchProbability::getZero();
+
+ for (unsigned I = First; I <= Last; ++I) {
+ assert(Clusters[I].Kind == CC_Range);
+ Prob += Clusters[I].Prob;
+ const APInt &Low = Clusters[I].Low->getValue();
+ const APInt &High = Clusters[I].High->getValue();
+ NumCmps += (Low == High) ? 1 : 2;
+ if (I != First) {
+ // Fill the gap between this and the previous cluster.
+ const APInt &PreviousHigh = Clusters[I - 1].High->getValue();
+ assert(PreviousHigh.slt(Low));
+ uint64_t Gap = (Low - PreviousHigh).getLimitedValue() - 1;
+ for (uint64_t J = 0; J < Gap; J++)
+ Table.push_back(DefaultMBB);
+ }
+ uint64_t ClusterSize = (High - Low).getLimitedValue() + 1;
+ for (uint64_t J = 0; J < ClusterSize; ++J)
+ Table.push_back(Clusters[I].MBB);
+ JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
+ }
+
+ unsigned NumDests = JTProbs.size();
+ if (TLI->isSuitableForBitTests(NumDests, NumCmps,
+ Clusters[First].Low->getValue(),
+ Clusters[Last].High->getValue(), *DL)) {
+ // Clusters[First..Last] should be lowered as bit tests instead.
+ return false;
+ }
+
+ // Create the MBB that will load from and jump through the table.
+ // Note: We create it here, but it's not inserted into the function yet.
+ MachineFunction *CurMF = FuncInfo.MF;
+ MachineBasicBlock *JumpTableMBB =
+ CurMF->CreateMachineBasicBlock(SI->getParent());
+
+ // Add successors. Note: use table order for determinism.
+ SmallPtrSet<MachineBasicBlock *, 8> Done;
+ for (MachineBasicBlock *Succ : Table) {
+ if (Done.count(Succ))
+ continue;
+ addSuccessorWithProb(JumpTableMBB, Succ, JTProbs[Succ]);
+ Done.insert(Succ);
+ }
+ JumpTableMBB->normalizeSuccProbs();
+
+ unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI->getJumpTableEncoding())
+ ->createJumpTableIndex(Table);
+
+ // Set up the jump table info.
+ JumpTable JT(-1U, JTI, JumpTableMBB, nullptr);
+ JumpTableHeader JTH(Clusters[First].Low->getValue(),
+ Clusters[Last].High->getValue(), SI->getCondition(),
+ nullptr, false);
+ JTCases.emplace_back(std::move(JTH), std::move(JT));
+
+ JTCluster = CaseCluster::jumpTable(Clusters[First].Low, Clusters[Last].High,
+ JTCases.size() - 1, Prob);
+ return true;
+}
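+
+// Illustrative example with made-up clusters: if buildJumpTable is invoked for
+// {1 -> A, 2 -> A, 5 -> B}, the table built above becomes
+// [A, A, Default, Default, B]; the hole 3..4 is filled with DefaultMBB, and
+// JTProbs[A] accumulates the probability of both clusters targeting A before
+// the successors are added in table order.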
+
+void SwitchCG::SwitchLowering::findBitTestClusters(CaseClusterVector &Clusters,
+ const SwitchInst *SI) {
+ // Partition Clusters into as few subsets as possible, where each subset has a
+ // range that fits in a machine word and has <= 3 unique destinations.
+
+#ifndef NDEBUG
+ // Clusters must be sorted and contain Range or JumpTable clusters.
+ assert(!Clusters.empty());
+ assert(Clusters[0].Kind == CC_Range || Clusters[0].Kind == CC_JumpTable);
+ for (const CaseCluster &C : Clusters)
+ assert(C.Kind == CC_Range || C.Kind == CC_JumpTable);
+ for (unsigned i = 1; i < Clusters.size(); ++i)
+ assert(Clusters[i-1].High->getValue().slt(Clusters[i].Low->getValue()));
+#endif
+
+ // The algorithm below is not suitable for -O0.
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ return;
+
+ // If target does not have legal shift left, do not emit bit tests at all.
+ EVT PTy = TLI->getPointerTy(*DL);
+ if (!TLI->isOperationLegal(ISD::SHL, PTy))
+ return;
+
+ int BitWidth = PTy.getSizeInBits();
+ const int64_t N = Clusters.size();
+
+ // MinPartitions[i] is the minimum number of partitions of Clusters[i..N-1].
+ SmallVector<unsigned, 8> MinPartitions(N);
+ // LastElement[i] is the last element of the partition starting at i.
+ SmallVector<unsigned, 8> LastElement(N);
+
+ // FIXME: This might not be the best algorithm for finding bit test clusters.
+
+ // Base case: There is only one way to partition Clusters[N-1].
+ MinPartitions[N - 1] = 1;
+ LastElement[N - 1] = N - 1;
+
+ // Note: loop indexes are signed to avoid underflow.
+ for (int64_t i = N - 2; i >= 0; --i) {
+ // Find optimal partitioning of Clusters[i..N-1].
+ // Baseline: Put Clusters[i] into a partition on its own.
+ MinPartitions[i] = MinPartitions[i + 1] + 1;
+ LastElement[i] = i;
+
+ // Search for a solution that results in fewer partitions.
+ // Note: the search is limited by BitWidth, reducing time complexity.
+ for (int64_t j = std::min(N - 1, i + BitWidth - 1); j > i; --j) {
+ // Try building a partition from Clusters[i..j].
+
+ // Check the range.
+ if (!TLI->rangeFitsInWord(Clusters[i].Low->getValue(),
+ Clusters[j].High->getValue(), *DL))
+ continue;
+
+ // Check the number of destinations and cluster types.
+ // FIXME: This works, but doesn't seem very efficient.
+ bool RangesOnly = true;
+ BitVector Dests(FuncInfo.MF->getNumBlockIDs());
+ for (int64_t k = i; k <= j; k++) {
+ if (Clusters[k].Kind != CC_Range) {
+ RangesOnly = false;
+ break;
+ }
+ Dests.set(Clusters[k].MBB->getNumber());
+ }
+ if (!RangesOnly || Dests.count() > 3)
+ break;
+
+ // Check if it's a better partition.
+ unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
+ if (NumPartitions < MinPartitions[i]) {
+ // Found a better partition.
+ MinPartitions[i] = NumPartitions;
+ LastElement[i] = j;
+ }
+ }
+ }
+
+ // Iterate over the partitions, replacing with bit-test clusters in-place.
+ unsigned DstIndex = 0;
+ for (unsigned First = 0, Last; First < N; First = Last + 1) {
+ Last = LastElement[First];
+ assert(First <= Last);
+ assert(DstIndex <= First);
+
+ CaseCluster BitTestCluster;
+ if (buildBitTests(Clusters, First, Last, SI, BitTestCluster)) {
+ Clusters[DstIndex++] = BitTestCluster;
+ } else {
+ size_t NumClusters = Last - First + 1;
+ std::memmove(&Clusters[DstIndex], &Clusters[First],
+ sizeof(Clusters[0]) * NumClusters);
+ DstIndex += NumClusters;
+ }
+ }
+ Clusters.resize(DstIndex);
+}
+
+bool SwitchCG::SwitchLowering::buildBitTests(CaseClusterVector &Clusters,
+ unsigned First, unsigned Last,
+ const SwitchInst *SI,
+ CaseCluster &BTCluster) {
+ assert(First <= Last);
+ if (First == Last)
+ return false;
+
+ BitVector Dests(FuncInfo.MF->getNumBlockIDs());
+ unsigned NumCmps = 0;
+ for (int64_t I = First; I <= Last; ++I) {
+ assert(Clusters[I].Kind == CC_Range);
+ Dests.set(Clusters[I].MBB->getNumber());
+ NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
+ }
+ unsigned NumDests = Dests.count();
+
+ APInt Low = Clusters[First].Low->getValue();
+ APInt High = Clusters[Last].High->getValue();
+ assert(Low.slt(High));
+
+ if (!TLI->isSuitableForBitTests(NumDests, NumCmps, Low, High, *DL))
+ return false;
+
+ APInt LowBound;
+ APInt CmpRange;
+
+ const int BitWidth = TLI->getPointerTy(*DL).getSizeInBits();
+ assert(TLI->rangeFitsInWord(Low, High, *DL) &&
+ "Case range must fit in bit mask!");
+
+ // Check if the clusters cover a contiguous range such that no value in the
+ // range will jump to the default statement.
+ bool ContiguousRange = true;
+ for (int64_t I = First + 1; I <= Last; ++I) {
+ if (Clusters[I].Low->getValue() != Clusters[I - 1].High->getValue() + 1) {
+ ContiguousRange = false;
+ break;
+ }
+ }
+
+ if (Low.isStrictlyPositive() && High.slt(BitWidth)) {
+ // Optimize the case where all the case values fit in a word without having
+ // to subtract minValue. In this case, we can optimize away the subtraction.
+ LowBound = APInt::getZero(Low.getBitWidth());
+ CmpRange = High;
+ ContiguousRange = false;
+ } else {
+ LowBound = Low;
+ CmpRange = High - Low;
+ }
+
+ CaseBitsVector CBV;
+ auto TotalProb = BranchProbability::getZero();
+ for (unsigned i = First; i <= Last; ++i) {
+ // Find the CaseBits for this destination.
+ unsigned j;
+ for (j = 0; j < CBV.size(); ++j)
+ if (CBV[j].BB == Clusters[i].MBB)
+ break;
+ if (j == CBV.size())
+ CBV.push_back(
+ CaseBits(0, Clusters[i].MBB, 0, BranchProbability::getZero()));
+ CaseBits *CB = &CBV[j];
+
+ // Update Mask, Bits and ExtraProb.
+ uint64_t Lo = (Clusters[i].Low->getValue() - LowBound).getZExtValue();
+ uint64_t Hi = (Clusters[i].High->getValue() - LowBound).getZExtValue();
+ assert(Hi >= Lo && Hi < 64 && "Invalid bit case!");
+ CB->Mask |= (-1ULL >> (63 - (Hi - Lo))) << Lo;
+ CB->Bits += Hi - Lo + 1;
+ CB->ExtraProb += Clusters[i].Prob;
+ TotalProb += Clusters[i].Prob;
+ }
+
+ BitTestInfo BTI;
+ llvm::sort(CBV, [](const CaseBits &a, const CaseBits &b) {
+ // Sort by probability first, number of bits second, bit mask third.
+ if (a.ExtraProb != b.ExtraProb)
+ return a.ExtraProb > b.ExtraProb;
+ if (a.Bits != b.Bits)
+ return a.Bits > b.Bits;
+ return a.Mask < b.Mask;
+ });
+
+ for (auto &CB : CBV) {
+ MachineBasicBlock *BitTestBB =
+ FuncInfo.MF->CreateMachineBasicBlock(SI->getParent());
+ BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraProb));
+ }
+ BitTestCases.emplace_back(std::move(LowBound), std::move(CmpRange),
+ SI->getCondition(), -1U, MVT::Other, false,
+ ContiguousRange, nullptr, nullptr, std::move(BTI),
+ TotalProb);
+
+ BTCluster = CaseCluster::bitTests(Clusters[First].Low, Clusters[Last].High,
+ BitTestCases.size() - 1, TotalProb);
+ return true;
+}
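+
+// Illustrative example of the mask arithmetic above, with made-up numbers: for
+// a 64-bit pointer type and a cluster covering the values 4..6, LowBound is 0
+// (the small-values fast path), so Lo = 4, Hi = 6 and
+// (-1ULL >> (63 - (Hi - Lo))) << Lo == 0b111 << 4 == 0x70, i.e. bits 4..6 of
+// CB->Mask are set and CB->Bits grows by 3.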
+
+void SwitchCG::sortAndRangeify(CaseClusterVector &Clusters) {
+#ifndef NDEBUG
+ for (const CaseCluster &CC : Clusters)
+ assert(CC.Low == CC.High && "Input clusters must be single-case");
+#endif
+
+ llvm::sort(Clusters, [](const CaseCluster &a, const CaseCluster &b) {
+ return a.Low->getValue().slt(b.Low->getValue());
+ });
+
+ // Merge adjacent clusters with the same destination.
+ const unsigned N = Clusters.size();
+ unsigned DstIndex = 0;
+ for (unsigned SrcIndex = 0; SrcIndex < N; ++SrcIndex) {
+ CaseCluster &CC = Clusters[SrcIndex];
+ const ConstantInt *CaseVal = CC.Low;
+ MachineBasicBlock *Succ = CC.MBB;
+
+ if (DstIndex != 0 && Clusters[DstIndex - 1].MBB == Succ &&
+ (CaseVal->getValue() - Clusters[DstIndex - 1].High->getValue()) == 1) {
+ // If this case has the same successor and is a neighbour, merge it into
+ // the previous cluster.
+ Clusters[DstIndex - 1].High = CaseVal;
+ Clusters[DstIndex - 1].Prob += CC.Prob;
+ } else {
+ std::memmove(&Clusters[DstIndex++], &Clusters[SrcIndex],
+ sizeof(Clusters[SrcIndex]));
+ }
+ }
+ Clusters.resize(DstIndex);
+}
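+
+// Illustrative example with a made-up input: the single-case clusters
+// {2 -> A, 1 -> A, 3 -> B, 4 -> B} are first sorted by case value and then
+// merged into the two range clusters [1,2] -> A and [3,4] -> B, each merged
+// cluster's probability being the sum of its members'.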
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TailDuplication.cpp
new file mode 100644
index 000000000000..bf3d2088e196
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TailDuplication.cpp
@@ -0,0 +1,102 @@
+//===- TailDuplication.cpp - Duplicate blocks into predecessors' tails ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass duplicates basic blocks ending in unconditional branches
+/// into the tails of their predecessors, using the TailDuplicator utility
+/// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tailduplication"
+
+namespace {
+
+class TailDuplicateBase : public MachineFunctionPass {
+ TailDuplicator Duplicator;
+ std::unique_ptr<MBFIWrapper> MBFIW;
+ bool PreRegAlloc;
+public:
+ TailDuplicateBase(char &PassID, bool PreRegAlloc)
+ : MachineFunctionPass(PassID), PreRegAlloc(PreRegAlloc) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+class TailDuplicate : public TailDuplicateBase {
+public:
+ static char ID;
+ TailDuplicate() : TailDuplicateBase(ID, false) {
+ initializeTailDuplicatePass(*PassRegistry::getPassRegistry());
+ }
+};
+
+class EarlyTailDuplicate : public TailDuplicateBase {
+public:
+ static char ID;
+ EarlyTailDuplicate() : TailDuplicateBase(ID, true) {
+ initializeEarlyTailDuplicatePass(*PassRegistry::getPassRegistry());
+ }
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::NoPHIs);
+ }
+};
+
+} // end anonymous namespace
+
+char TailDuplicate::ID;
+char EarlyTailDuplicate::ID;
+
+char &llvm::TailDuplicateID = TailDuplicate::ID;
+char &llvm::EarlyTailDuplicateID = EarlyTailDuplicate::ID;
+
+INITIALIZE_PASS(TailDuplicate, DEBUG_TYPE, "Tail Duplication", false, false)
+INITIALIZE_PASS(EarlyTailDuplicate, "early-tailduplication",
+ "Early Tail Duplication", false, false)
+
+bool TailDuplicateBase::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ auto MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+ if (MBFI)
+ MBFIW = std::make_unique<MBFIWrapper>(*MBFI);
+ Duplicator.initMF(MF, PreRegAlloc, MBPI, MBFI ? MBFIW.get() : nullptr, PSI,
+ /*LayoutMode=*/false);
+
+ bool MadeChange = false;
+ while (Duplicator.tailDuplicateBlocks())
+ MadeChange = true;
+
+ return MadeChange;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp
new file mode 100644
index 000000000000..5ed67bd0a121
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -0,0 +1,1071 @@
+//===- TailDuplicator.cpp - Duplicate blocks into predecessors' tails -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This utility class duplicates basic blocks ending in unconditional branches
+// into the tails of their predecessors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tailduplication"
+
+STATISTIC(NumTails, "Number of tails duplicated");
+STATISTIC(NumTailDups, "Number of tail duplicated blocks");
+STATISTIC(NumTailDupAdded,
+ "Number of instructions added due to tail duplication");
+STATISTIC(NumTailDupRemoved,
+ "Number of instructions removed due to tail duplication");
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumAddedPHIs, "Number of phis added");
+
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDuplicateSize(
+ "tail-dup-size",
+ cl::desc("Maximum instructions to consider tail duplicating"), cl::init(2),
+ cl::Hidden);
+
+static cl::opt<unsigned> TailDupIndirectBranchSize(
+ "tail-dup-indirect-size",
+ cl::desc("Maximum instructions to consider tail duplicating blocks that "
+ "end with indirect branches."), cl::init(20),
+ cl::Hidden);
+
+static cl::opt<bool>
+ TailDupVerify("tail-dup-verify",
+ cl::desc("Verify sanity of PHI instructions during taildup"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> TailDupLimit("tail-dup-limit", cl::init(~0U),
+ cl::Hidden);
+
+void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc,
+ const MachineBranchProbabilityInfo *MBPIin,
+ MBFIWrapper *MBFIin,
+ ProfileSummaryInfo *PSIin,
+ bool LayoutModeIn, unsigned TailDupSizeIn) {
+ MF = &MFin;
+ TII = MF->getSubtarget().getInstrInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ MMI = &MF->getMMI();
+ MBPI = MBPIin;
+ MBFI = MBFIin;
+ PSI = PSIin;
+ TailDupSize = TailDupSizeIn;
+
+ assert(MBPI != nullptr && "Machine Branch Probability Info required");
+
+ LayoutMode = LayoutModeIn;
+ this->PreRegAlloc = PreRegAlloc;
+}
+
+static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
+ for (MachineBasicBlock &MBB : llvm::drop_begin(MF)) {
+ SmallSetVector<MachineBasicBlock *, 8> Preds(MBB.pred_begin(),
+ MBB.pred_end());
+ MachineBasicBlock::iterator MI = MBB.begin();
+ while (MI != MBB.end()) {
+ if (!MI->isPHI())
+ break;
+ for (MachineBasicBlock *PredBB : Preds) {
+ bool Found = false;
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+ MachineBasicBlock *PHIBB = MI->getOperand(i + 1).getMBB();
+ if (PHIBB == PredBB) {
+ Found = true;
+ break;
+ }
+ }
+ if (!Found) {
+ dbgs() << "Malformed PHI in " << printMBBReference(MBB) << ": "
+ << *MI;
+ dbgs() << " missing input from predecessor "
+ << printMBBReference(*PredBB) << '\n';
+ llvm_unreachable(nullptr);
+ }
+ }
+
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+ MachineBasicBlock *PHIBB = MI->getOperand(i + 1).getMBB();
+ if (CheckExtra && !Preds.count(PHIBB)) {
+ dbgs() << "Warning: malformed PHI in " << printMBBReference(MBB)
+ << ": " << *MI;
+ dbgs() << " extra input from predecessor "
+ << printMBBReference(*PHIBB) << '\n';
+ llvm_unreachable(nullptr);
+ }
+ if (PHIBB->getNumber() < 0) {
+ dbgs() << "Malformed PHI in " << printMBBReference(MBB) << ": "
+ << *MI;
+ dbgs() << " non-existing " << printMBBReference(*PHIBB) << '\n';
+ llvm_unreachable(nullptr);
+ }
+ }
+ ++MI;
+ }
+ }
+}
+
+/// Tail duplicate the block and cleanup.
+/// \p IsSimple - return value of isSimpleBB
+/// \p MBB - block to be duplicated
+/// \p ForcedLayoutPred - If non-null, treat this block as the layout
+/// predecessor, instead of using the ordering in MF
+/// \p DuplicatedPreds - if non-null, \p DuplicatedPreds will contain a list of
+/// all Preds that received a copy of \p MBB.
+/// \p RemovalCallback - if non-null, called just before MBB is deleted.
+bool TailDuplicator::tailDuplicateAndUpdate(
+ bool IsSimple, MachineBasicBlock *MBB,
+ MachineBasicBlock *ForcedLayoutPred,
+ SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds,
+ function_ref<void(MachineBasicBlock *)> *RemovalCallback,
+ SmallVectorImpl<MachineBasicBlock *> *CandidatePtr) {
+ // Save the successors list.
+ SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
+ MBB->succ_end());
+
+ SmallVector<MachineBasicBlock *, 8> TDBBs;
+ SmallVector<MachineInstr *, 16> Copies;
+ if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred,
+ TDBBs, Copies, CandidatePtr))
+ return false;
+
+ ++NumTails;
+
+ SmallVector<MachineInstr *, 8> NewPHIs;
+ MachineSSAUpdater SSAUpdate(*MF, &NewPHIs);
+
+ // TailBB's immediate successors are now successors of those predecessors
+ // which duplicated TailBB. Add the predecessors as sources to the PHI
+ // instructions.
+ bool isDead = MBB->pred_empty() && !MBB->hasAddressTaken();
+ if (PreRegAlloc)
+ updateSuccessorsPHIs(MBB, isDead, TDBBs, Succs);
+
+ // If it is dead, remove it.
+ if (isDead) {
+ NumTailDupRemoved += MBB->size();
+ removeDeadBlock(MBB, RemovalCallback);
+ ++NumDeadBlocks;
+ }
+
+ // Update SSA form.
+ if (!SSAUpdateVRs.empty()) {
+ for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) {
+ unsigned VReg = SSAUpdateVRs[i];
+ SSAUpdate.Initialize(VReg);
+
+ // If the original definition is still around, add it as an available
+ // value.
+ MachineInstr *DefMI = MRI->getVRegDef(VReg);
+ MachineBasicBlock *DefBB = nullptr;
+ if (DefMI) {
+ DefBB = DefMI->getParent();
+ SSAUpdate.AddAvailableValue(DefBB, VReg);
+ }
+
+ // Add the new vregs as available values.
+ DenseMap<Register, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(VReg);
+ for (std::pair<MachineBasicBlock *, Register> &J : LI->second) {
+ MachineBasicBlock *SrcBB = J.first;
+ Register SrcReg = J.second;
+ SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
+ }
+
+ SmallVector<MachineOperand *> DebugUses;
+ // Rewrite uses that are outside of the original def's block.
+ for (MachineOperand &UseMO :
+ llvm::make_early_inc_range(MRI->use_operands(VReg))) {
+ MachineInstr *UseMI = UseMO.getParent();
+ // Rewrite debug uses last so that they can take advantage of any
+ // register mappings introduced by other users in its BB, since we
+ // cannot create new register definitions specifically for the debug
+ // instruction (as debug instructions should not affect CodeGen).
+ if (UseMI->isDebugValue()) {
+ DebugUses.push_back(&UseMO);
+ continue;
+ }
+ if (UseMI->getParent() == DefBB && !UseMI->isPHI())
+ continue;
+ SSAUpdate.RewriteUse(UseMO);
+ }
+ for (auto *UseMO : DebugUses) {
+ MachineInstr *UseMI = UseMO->getParent();
+ UseMO->setReg(
+ SSAUpdate.GetValueInMiddleOfBlock(UseMI->getParent(), true));
+ }
+ }
+
+ SSAUpdateVRs.clear();
+ SSAUpdateVals.clear();
+ }
+
+ // Eliminate some of the copies inserted by tail duplication to maintain
+ // SSA form.
+ for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+ MachineInstr *Copy = Copies[i];
+ if (!Copy->isCopy())
+ continue;
+ Register Dst = Copy->getOperand(0).getReg();
+ Register Src = Copy->getOperand(1).getReg();
+ if (MRI->hasOneNonDBGUse(Src) &&
+ MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) {
+ // Copy is the only use. Do trivial copy propagation here.
+ MRI->replaceRegWith(Dst, Src);
+ Copy->eraseFromParent();
+ }
+ }
+
+ if (NewPHIs.size())
+ NumAddedPHIs += NewPHIs.size();
+
+ if (DuplicatedPreds)
+ *DuplicatedPreds = std::move(TDBBs);
+
+ return true;
+}
+
+/// Look for small blocks that are unconditionally branched to and do not fall
+/// through. Tail-duplicate their instructions into their predecessors to
+/// eliminate (dynamic) branches.
+bool TailDuplicator::tailDuplicateBlocks() {
+ bool MadeChange = false;
+
+ if (PreRegAlloc && TailDupVerify) {
+ LLVM_DEBUG(dbgs() << "\n*** Before tail-duplicating\n");
+ VerifyPHIs(*MF, true);
+ }
+
+ for (MachineBasicBlock &MBB :
+ llvm::make_early_inc_range(llvm::drop_begin(*MF))) {
+ if (NumTails == TailDupLimit)
+ break;
+
+ bool IsSimple = isSimpleBB(&MBB);
+
+ if (!shouldTailDuplicate(IsSimple, MBB))
+ continue;
+
+ MadeChange |= tailDuplicateAndUpdate(IsSimple, &MBB, nullptr);
+ }
+
+ if (PreRegAlloc && TailDupVerify)
+ VerifyPHIs(*MF, false);
+
+ return MadeChange;
+}
+
+static bool isDefLiveOut(Register Reg, MachineBasicBlock *BB,
+ const MachineRegisterInfo *MRI) {
+ for (MachineInstr &UseMI : MRI->use_instructions(Reg)) {
+ if (UseMI.isDebugValue())
+ continue;
+ if (UseMI.getParent() != BB)
+ return true;
+ }
+ return false;
+}
+
+static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
+ if (MI->getOperand(i + 1).getMBB() == SrcBB)
+ return i;
+ return 0;
+}
+
+// Remember which registers are used by phis in this block. This is
+// used to determine which registers are liveout while modifying the
+// block (which is why we need to copy the information).
+static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
+ DenseSet<Register> *UsedByPhi) {
+ for (const auto &MI : BB) {
+ if (!MI.isPHI())
+ break;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ Register SrcReg = MI.getOperand(i).getReg();
+ UsedByPhi->insert(SrcReg);
+ }
+ }
+}
+
+/// Add a definition and source virtual registers pair for SSA update.
+void TailDuplicator::addSSAUpdateEntry(Register OrigReg, Register NewReg,
+ MachineBasicBlock *BB) {
+ DenseMap<Register, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(OrigReg);
+ if (LI != SSAUpdateVals.end())
+ LI->second.push_back(std::make_pair(BB, NewReg));
+ else {
+ AvailableValsTy Vals;
+ Vals.push_back(std::make_pair(BB, NewReg));
+ SSAUpdateVals.insert(std::make_pair(OrigReg, Vals));
+ SSAUpdateVRs.push_back(OrigReg);
+ }
+}
+
+/// Process PHI node in TailBB by turning it into a copy in PredBB. Remember the
+/// source register that's contributed by PredBB and update SSA update map.
+void TailDuplicator::processPHI(
+ MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
+ DenseMap<Register, RegSubRegPair> &LocalVRMap,
+ SmallVectorImpl<std::pair<Register, RegSubRegPair>> &Copies,
+ const DenseSet<Register> &RegsUsedByPhi, bool Remove) {
+ Register DefReg = MI->getOperand(0).getReg();
+ unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
+ assert(SrcOpIdx && "Unable to find matching PHI source?");
+ Register SrcReg = MI->getOperand(SrcOpIdx).getReg();
+ unsigned SrcSubReg = MI->getOperand(SrcOpIdx).getSubReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ LocalVRMap.insert(std::make_pair(DefReg, RegSubRegPair(SrcReg, SrcSubReg)));
+
+ // Insert a copy from source to the end of the block. The def register is the
+ // available value liveout of the block.
+ Register NewDef = MRI->createVirtualRegister(RC);
+ Copies.push_back(std::make_pair(NewDef, RegSubRegPair(SrcReg, SrcSubReg)));
+ if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg))
+ addSSAUpdateEntry(DefReg, NewDef, PredBB);
+
+ if (!Remove)
+ return;
+
+ // Remove PredBB from the PHI node.
+ MI->removeOperand(SrcOpIdx + 1);
+ MI->removeOperand(SrcOpIdx);
+ if (MI->getNumOperands() == 1 && !TailBB->hasAddressTaken())
+ MI->eraseFromParent();
+ else if (MI->getNumOperands() == 1)
+ MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+}
+
+/// Duplicate a TailBB instruction to PredBB and update
+/// the source operands due to earlier PHI translation.
+void TailDuplicator::duplicateInstruction(
+ MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
+ DenseMap<Register, RegSubRegPair> &LocalVRMap,
+ const DenseSet<Register> &UsedByPhi) {
+ // Allow duplication of CFI instructions.
+ if (MI->isCFIInstruction()) {
+ BuildMI(*PredBB, PredBB->end(), PredBB->findDebugLoc(PredBB->begin()),
+ TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(MI->getOperand(0).getCFIIndex())
+ .setMIFlags(MI->getFlags());
+ return;
+ }
+ MachineInstr &NewMI = TII->duplicate(*PredBB, PredBB->end(), *MI);
+ if (PreRegAlloc) {
+ for (unsigned i = 0, e = NewMI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = NewMI.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
+ continue;
+ if (MO.isDef()) {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ Register NewReg = MRI->createVirtualRegister(RC);
+ MO.setReg(NewReg);
+ LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
+ if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
+ addSSAUpdateEntry(Reg, NewReg, PredBB);
+ } else {
+ auto VI = LocalVRMap.find(Reg);
+ if (VI != LocalVRMap.end()) {
+ // Need to make sure that the register class of the mapped register
+ // will satisfy the constraints of the class of the register being
+ // replaced.
+ auto *OrigRC = MRI->getRegClass(Reg);
+ auto *MappedRC = MRI->getRegClass(VI->second.Reg);
+ const TargetRegisterClass *ConstrRC;
+ if (VI->second.SubReg != 0) {
+ ConstrRC = TRI->getMatchingSuperRegClass(MappedRC, OrigRC,
+ VI->second.SubReg);
+ if (ConstrRC) {
+ // The actual constraining (as in "find appropriate new class")
+ // is done by getMatchingSuperRegClass, so now we only need to
+ // change the class of the mapped register.
+ MRI->setRegClass(VI->second.Reg, ConstrRC);
+ }
+ } else {
+ // For mapped registers that do not have sub-registers, simply
+ // restrict their class to match the original one.
+
+ // We don't want debug instructions affecting the resulting code so
+ // if we're cloning a debug instruction then just use MappedRC
+ // rather than constraining the register class further.
+ ConstrRC = NewMI.isDebugInstr()
+ ? MappedRC
+ : MRI->constrainRegClass(VI->second.Reg, OrigRC);
+ }
+
+ if (ConstrRC) {
+ // If the class constraining succeeded, we can simply replace
+ // the old register with the mapped one.
+ MO.setReg(VI->second.Reg);
+ // We have Reg -> VI.Reg:VI.SubReg, so if Reg is used with a
+ // sub-register, we need to compose the sub-register indices.
+ MO.setSubReg(
+ TRI->composeSubRegIndices(VI->second.SubReg, MO.getSubReg()));
+ } else {
+ // The direct replacement is not possible, due to failing register
+ // class constraints. An explicit COPY is necessary. Create one
+ // that can be reused.
+ Register NewReg = MRI->createVirtualRegister(OrigRC);
+ BuildMI(*PredBB, NewMI, NewMI.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(VI->second.Reg, 0, VI->second.SubReg);
+ LocalVRMap.erase(VI);
+ LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
+ MO.setReg(NewReg);
+ // The composed VI.Reg:VI.SubReg is replaced with NewReg, which
+ // is equivalent to the whole register Reg. Hence, Reg:subreg
+ // is same as NewReg:subreg, so keep the sub-register index
+ // unchanged.
+ }
+ // Clear any kill flags from this operand. The new register could
+ // have uses after this one, so kills are not valid here.
+ MO.setIsKill(false);
+ }
+ }
+ }
+ }
+}
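+
+// Illustrative example with hypothetical vregs: if a PHI in the tail block
+// mapped %1 -> %5:sub_32 in LocalVRMap and a cloned instruction uses %1, the
+// code above either constrains %5's register class and composes the
+// sub-register indices onto the use, or, when no matching class exists, emits
+// "%6 = COPY %5.sub_32" in PredBB and rewrites the use to the plain %6.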
+
+/// After FromBB is tail duplicated into its predecessor blocks, the successors
+/// have gained new predecessors. Update the PHI instructions in them
+/// accordingly.
+void TailDuplicator::updateSuccessorsPHIs(
+ MachineBasicBlock *FromBB, bool isDead,
+ SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ SmallSetVector<MachineBasicBlock *, 8> &Succs) {
+ for (MachineBasicBlock *SuccBB : Succs) {
+ for (MachineInstr &MI : *SuccBB) {
+ if (!MI.isPHI())
+ break;
+ MachineInstrBuilder MIB(*FromBB->getParent(), MI);
+ unsigned Idx = 0;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i + 1);
+ if (MO.getMBB() == FromBB) {
+ Idx = i;
+ break;
+ }
+ }
+
+ assert(Idx != 0);
+ MachineOperand &MO0 = MI.getOperand(Idx);
+ Register Reg = MO0.getReg();
+ if (isDead) {
+ // Folded into the previous BB.
+ // There could be duplicate phi source entries. FIXME: Should sdisel
+ // or an earlier pass have fixed this?
+ for (unsigned i = MI.getNumOperands() - 2; i != Idx; i -= 2) {
+ MachineOperand &MO = MI.getOperand(i + 1);
+ if (MO.getMBB() == FromBB) {
+ MI.removeOperand(i + 1);
+ MI.removeOperand(i);
+ }
+ }
+ } else
+ Idx = 0;
+
+ // If Idx is set, the operands at Idx and Idx+1 must be removed.
+ // We reuse the location to avoid expensive removeOperand calls.
+
+ DenseMap<Register, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(Reg);
+ if (LI != SSAUpdateVals.end()) {
+ // This register is defined in the tail block.
+ for (const std::pair<MachineBasicBlock *, Register> &J : LI->second) {
+ MachineBasicBlock *SrcBB = J.first;
+ // If we didn't duplicate a bb into a particular predecessor, we
+ // might still have added an entry to SSAUpdateVals to correctly
+ // recompute SSA. In that case, avoid adding a dummy extra argument
+ // to this PHI.
+ if (!SrcBB->isSuccessor(SuccBB))
+ continue;
+
+ Register SrcReg = J.second;
+ if (Idx != 0) {
+ MI.getOperand(Idx).setReg(SrcReg);
+ MI.getOperand(Idx + 1).setMBB(SrcBB);
+ Idx = 0;
+ } else {
+ MIB.addReg(SrcReg).addMBB(SrcBB);
+ }
+ }
+ } else {
+ // Live in tail block, must also be live in predecessors.
+ for (MachineBasicBlock *SrcBB : TDBBs) {
+ if (Idx != 0) {
+ MI.getOperand(Idx).setReg(Reg);
+ MI.getOperand(Idx + 1).setMBB(SrcBB);
+ Idx = 0;
+ } else {
+ MIB.addReg(Reg).addMBB(SrcBB);
+ }
+ }
+ }
+ if (Idx != 0) {
+ MI.removeOperand(Idx + 1);
+ MI.removeOperand(Idx);
+ }
+ }
+ }
+}
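+
+// Illustrative example with hypothetical blocks: when FromBB becomes dead and
+// a successor PHI held the operand pair "(%v, %FromBB)", that slot is reused
+// for the first duplicated predecessor and further predecessors are appended,
+// e.g. yielding "(%v1, %P1), (%v2, %P2)" without paying a removeOperand call
+// per entry.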
+
+/// Determine if it is profitable to duplicate this block.
+bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
+ MachineBasicBlock &TailBB) {
+ // When doing tail-duplication during layout, the block ordering is in flux,
+ // so canFallThrough returns a result based on incorrect information and
+ // should just be ignored.
+ if (!LayoutMode && TailBB.canFallThrough())
+ return false;
+
+ // Don't try to tail-duplicate single-block loops.
+ if (TailBB.isSuccessor(&TailBB))
+ return false;
+
+ // Set the limit on the cost to duplicate. When optimizing for size,
+ // duplicate only one, because one branch instruction can be eliminated to
+ // compensate for the duplication.
+ unsigned MaxDuplicateCount;
+ bool OptForSize = MF->getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(&TailBB, PSI, MBFI);
+ if (TailDupSize == 0)
+ MaxDuplicateCount = TailDuplicateSize;
+ else
+ MaxDuplicateCount = TailDupSize;
+ if (OptForSize)
+ MaxDuplicateCount = 1;
+
+ // If the block to be duplicated ends in an unanalyzable fallthrough, don't
+ // duplicate it.
+ // A similar check is necessary in MachineBlockPlacement to make sure pairs of
+ // blocks with unanalyzable fallthrough get laid out contiguously.
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(TailBB, PredTBB, PredFBB, PredCond) &&
+ TailBB.canFallThrough())
+ return false;
+
+ // If the target has hardware branch prediction that can handle indirect
+ // branches, duplicating them can often make them predictable when there
+ // are common paths through the code. The limit needs to be high enough
+ // to allow undoing the effects of tail merging and other optimizations
+ // that rearrange the predecessors of the indirect branch.
+
+ bool HasIndirectbr = false;
+ if (!TailBB.empty())
+ HasIndirectbr = TailBB.back().isIndirectBranch();
+
+ if (HasIndirectbr && PreRegAlloc)
+ MaxDuplicateCount = TailDupIndirectBranchSize;
+
+ // Check the instructions in the block to determine whether tail-duplication
+ // is invalid or unlikely to be profitable.
+ unsigned InstrCount = 0;
+ for (MachineInstr &MI : TailBB) {
+ // Non-duplicable things shouldn't be tail-duplicated.
+ // CFI instructions are marked as non-duplicable, because Darwin compact
+ // unwind info emission can't handle multiple prologue setups. In the case of
+ // DWARF, allow them to be duplicated, so that their existence doesn't prevent
+ // tail duplication of basic blocks that would otherwise be duplicated.
+ if (MI.isNotDuplicable() &&
+ (TailBB.getParent()->getTarget().getTargetTriple().isOSDarwin() ||
+ !MI.isCFIInstruction()))
+ return false;
+
+ // Convergent instructions can be duplicated only if doing so doesn't add
+ // new control dependencies, which is exactly what duplicating them here
+ // would do.
+ if (MI.isConvergent())
+ return false;
+
+ // Do not duplicate 'return' instructions if this is a pre-regalloc run.
+ // A return may expand into a lot more instructions (e.g. reload of callee
+ // saved registers) after PEI.
+ if (PreRegAlloc && MI.isReturn())
+ return false;
+
+ // Avoid duplicating calls before register allocation. Calls present a
+ // barrier to register allocation, so duplicating them may end up increasing
+ // spills.
+ if (PreRegAlloc && MI.isCall())
+ return false;
+
+ // TailDuplicator::appendCopies will erroneously place COPYs after
+ // INLINEASM_BR instructions after 4b0aa5724fea, which demonstrates the same
+ // bug that was fixed in f7a53d82c090.
+ // FIXME: Use findPHICopyInsertPoint() to find the correct insertion point
+ // for the COPY when replacing PHIs.
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return false;
+
+ if (MI.isBundle())
+ InstrCount += MI.getBundleSize();
+ else if (!MI.isPHI() && !MI.isMetaInstruction())
+ InstrCount += 1;
+
+ if (InstrCount > MaxDuplicateCount)
+ return false;
+ }
+
+ // Check if any of the successors of TailBB has a PHI node in which the
+ // value corresponding to TailBB uses a subregister.
+ // If a phi node uses a register paired with a subregister, the actual
+ // "value type" of the phi may differ from the type of the register without
+ // any subregisters. Due to a bug, tail duplication may add a new operand
+ // without a necessary subregister, producing invalid code. This is
+ // demonstrated by test/CodeGen/Hexagon/tail-dup-subreg-abort.ll.
+ // Disable tail duplication for this case for now, until the problem is
+ // fixed.
+ for (auto *SB : TailBB.successors()) {
+ for (auto &I : *SB) {
+ if (!I.isPHI())
+ break;
+ unsigned Idx = getPHISrcRegOpIdx(&I, &TailBB);
+ assert(Idx != 0);
+ MachineOperand &PU = I.getOperand(Idx);
+ if (PU.getSubReg() != 0)
+ return false;
+ }
+ }
+
+ if (HasIndirectbr && PreRegAlloc)
+ return true;
+
+ if (IsSimple)
+ return true;
+
+ if (!PreRegAlloc)
+ return true;
+
+ return canCompletelyDuplicateBB(TailBB);
+}
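+
+// Illustrative example under the default thresholds: with -tail-dup-size left
+// at 2 and no optimize-for-size constraint, a tail block consisting of one ADD
+// plus an unconditional branch stays within MaxDuplicateCount, whereas a
+// pre-RA block ending in an indirect branch is measured against the larger
+// -tail-dup-indirect-size limit of 20 instead.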
+
+/// True if this BB has only one unconditional jump.
+bool TailDuplicator::isSimpleBB(MachineBasicBlock *TailBB) {
+ if (TailBB->succ_size() != 1)
+ return false;
+ if (TailBB->pred_empty())
+ return false;
+ MachineBasicBlock::iterator I = TailBB->getFirstNonDebugInstr(true);
+ if (I == TailBB->end())
+ return true;
+ return I->isUnconditionalBranch();
+}
+
+static bool bothUsedInPHI(const MachineBasicBlock &A,
+ const SmallPtrSet<MachineBasicBlock *, 8> &SuccsB) {
+ for (MachineBasicBlock *BB : A.successors())
+ if (SuccsB.count(BB) && !BB->empty() && BB->begin()->isPHI())
+ return true;
+
+ return false;
+}
+
+bool TailDuplicator::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
+ for (MachineBasicBlock *PredBB : BB.predecessors()) {
+ if (PredBB->succ_size() > 1)
+ return false;
+
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond))
+ return false;
+
+ if (!PredCond.empty())
+ return false;
+ }
+ return true;
+}
+
+bool TailDuplicator::duplicateSimpleBB(
+ MachineBasicBlock *TailBB, SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ const DenseSet<Register> &UsedByPhi) {
+ SmallPtrSet<MachineBasicBlock *, 8> Succs(TailBB->succ_begin(),
+ TailBB->succ_end());
+ SmallVector<MachineBasicBlock *, 8> Preds(TailBB->predecessors());
+ bool Changed = false;
+ for (MachineBasicBlock *PredBB : Preds) {
+ if (PredBB->hasEHPadSuccessor() || PredBB->mayHaveInlineAsmBr())
+ continue;
+
+ if (bothUsedInPHI(*PredBB, Succs))
+ continue;
+
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond))
+ continue;
+
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+ << "From simple Succ: " << *TailBB);
+
+ MachineBasicBlock *NewTarget = *TailBB->succ_begin();
+ MachineBasicBlock *NextBB = PredBB->getNextNode();
+
+ // Make PredFBB explicit.
+ if (PredCond.empty())
+ PredFBB = PredTBB;
+
+ // Make fall through explicit.
+ if (!PredTBB)
+ PredTBB = NextBB;
+ if (!PredFBB)
+ PredFBB = NextBB;
+
+ // Redirect
+ if (PredFBB == TailBB)
+ PredFBB = NewTarget;
+ if (PredTBB == TailBB)
+ PredTBB = NewTarget;
+
+ // Make the branch unconditional if possible
+ if (PredTBB == PredFBB) {
+ PredCond.clear();
+ PredFBB = nullptr;
+ }
+
+ // Avoid adding fall through branches.
+ if (PredFBB == NextBB)
+ PredFBB = nullptr;
+ if (PredTBB == NextBB && PredFBB == nullptr)
+ PredTBB = nullptr;
+
+ auto DL = PredBB->findBranchDebugLoc();
+ TII->removeBranch(*PredBB);
+
+ if (!PredBB->isSuccessor(NewTarget))
+ PredBB->replaceSuccessor(TailBB, NewTarget);
+ else {
+ PredBB->removeSuccessor(TailBB, true);
+ assert(PredBB->succ_size() <= 1);
+ }
+
+ if (PredTBB)
+ TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL);
+
+ TDBBs.push_back(PredBB);
+ }
+ return Changed;
+}
+
+bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB,
+ MachineBasicBlock *PredBB) {
+ // EH edges are ignored by analyzeBranch.
+ if (PredBB->succ_size() > 1)
+ return false;
+
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond))
+ return false;
+ if (!PredCond.empty())
+ return false;
+ // FIXME: This is overly conservative; it may be ok to relax this in the
+ // future under more specific conditions. If TailBB is an INLINEASM_BR
+ // indirect target, we need to see if the edge from PredBB to TailBB is from
+ // an INLINEASM_BR in PredBB, and then also if that edge was from the
+ // indirect target list, fallthrough/default target, or potentially both. If
+ // it's both, TailDuplicator::tailDuplicate will remove the edge, corrupting
+ // the successor list in PredBB and predecessor list in TailBB.
+ if (TailBB->isInlineAsmBrIndirectTarget())
+ return false;
+ return true;
+}
+
+/// If it is profitable, duplicate TailBB's contents in each
+/// of its predecessors.
+/// \p IsSimple result of isSimpleBB
+/// \p TailBB Block to be duplicated.
+/// \p ForcedLayoutPred When non-null, use this block as the layout predecessor
+/// instead of the previous block in MF's order.
+/// \p TDBBs A vector to keep track of all blocks tail-duplicated
+/// into.
+/// \p Copies A vector of copy instructions inserted. Used later to
+/// walk all the inserted copies and remove redundant ones.
+bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
+ MachineBasicBlock *ForcedLayoutPred,
+ SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ SmallVectorImpl<MachineInstr *> &Copies,
+ SmallVectorImpl<MachineBasicBlock *> *CandidatePtr) {
+ LLVM_DEBUG(dbgs() << "\n*** Tail-duplicating " << printMBBReference(*TailBB)
+ << '\n');
+
+ bool ShouldUpdateTerminators = TailBB->canFallThrough();
+
+ DenseSet<Register> UsedByPhi;
+ getRegsUsedByPHIs(*TailBB, &UsedByPhi);
+
+ if (IsSimple)
+ return duplicateSimpleBB(TailBB, TDBBs, UsedByPhi);
+
+ // Iterate through all the unique predecessors and tail-duplicate this
+ // block into them, if possible. Copying the list ahead of time also
+ // avoids trouble with the predecessor list reallocating.
+ bool Changed = false;
+ SmallSetVector<MachineBasicBlock *, 8> Preds;
+ if (CandidatePtr)
+ Preds.insert(CandidatePtr->begin(), CandidatePtr->end());
+ else
+ Preds.insert(TailBB->pred_begin(), TailBB->pred_end());
+
+ for (MachineBasicBlock *PredBB : Preds) {
+ assert(TailBB != PredBB &&
+ "Single-block loop should have been rejected earlier!");
+
+ if (!canTailDuplicate(TailBB, PredBB))
+ continue;
+
+ // Don't duplicate into a fall-through predecessor (at least for now).
+ // If a profile is available, findDuplicateCandidates can choose a better
+ // fall-through predecessor.
+ if (!(MF->getFunction().hasProfileData() && LayoutMode)) {
+ bool IsLayoutSuccessor = false;
+ if (ForcedLayoutPred)
+ IsLayoutSuccessor = (ForcedLayoutPred == PredBB);
+ else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+ IsLayoutSuccessor = true;
+ if (IsLayoutSuccessor)
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+ << "From Succ: " << *TailBB);
+
+ TDBBs.push_back(PredBB);
+
+ // Remove PredBB's unconditional branch.
+ TII->removeBranch(*PredBB);
+
+ // Clone the contents of TailBB into PredBB.
+ DenseMap<Register, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
+ for (MachineInstr &MI : llvm::make_early_inc_range(*TailBB)) {
+ if (MI.isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ processPHI(&MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
+ } else {
+ // Replace def of virtual registers with new registers, and update
+ // uses with PHI source register or the new registers.
+ duplicateInstruction(&MI, TailBB, PredBB, LocalVRMap, UsedByPhi);
+ }
+ }
+ appendCopies(PredBB, CopyInfos, Copies);
+
+ NumTailDupAdded += TailBB->size() - 1; // subtract one for removed branch
+
+ // Update the CFG.
+ PredBB->removeSuccessor(PredBB->succ_begin());
+ assert(PredBB->succ_empty() &&
+ "TailDuplicate called on block with multiple successors!");
+ for (MachineBasicBlock *Succ : TailBB->successors())
+ PredBB->addSuccessor(Succ, MBPI->getEdgeProbability(TailBB, Succ));
+
+ // Update branches in pred to jump to tail's layout successor if needed.
+ if (ShouldUpdateTerminators)
+ PredBB->updateTerminator(TailBB->getNextNode());
+
+ Changed = true;
+ ++NumTailDups;
+ }
+
+ // If TailBB was duplicated into all its predecessors except for the prior
+ // block, which falls through unconditionally, move the contents of this
+ // block into the prior block.
+ MachineBasicBlock *PrevBB = ForcedLayoutPred;
+ if (!PrevBB)
+ PrevBB = &*std::prev(TailBB->getIterator());
+ MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
+ SmallVector<MachineOperand, 4> PriorCond;
+ // This has to check PrevBB->succ_size() because EH edges are ignored by
+ // analyzeBranch.
+ if (PrevBB->succ_size() == 1 &&
+ // Layout preds are not always CFG preds. Check.
+ *PrevBB->succ_begin() == TailBB &&
+ !TII->analyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond) &&
+ PriorCond.empty() &&
+ (!PriorTBB || PriorTBB == TailBB) &&
+ TailBB->pred_size() == 1 &&
+ !TailBB->hasAddressTaken()) {
+ LLVM_DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
+ << "From MBB: " << *TailBB);
+ // There may be a branch to the layout successor. This is unlikely but it
+ // happens. The correct thing to do is to remove the branch before
+ // duplicating the instructions in all cases.
+ bool RemovedBranches = TII->removeBranch(*PrevBB) != 0;
+
+ // If there are still tail instructions, abort the merge
+ if (PrevBB->getFirstTerminator() == PrevBB->end()) {
+ if (PreRegAlloc) {
+ DenseMap<Register, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
+ MachineBasicBlock::iterator I = TailBB->begin();
+ // Process PHI instructions first.
+ while (I != TailBB->end() && I->isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ MachineInstr *MI = &*I++;
+ processPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi,
+ true);
+ }
+
+ // Now copy the non-PHI instructions.
+ while (I != TailBB->end()) {
+ // Replace def of virtual registers with new registers, and update
+ // uses with PHI source register or the new registers.
+ MachineInstr *MI = &*I++;
+ assert(!MI->isBundle() && "Not expecting bundles before regalloc!");
+ duplicateInstruction(MI, TailBB, PrevBB, LocalVRMap, UsedByPhi);
+ MI->eraseFromParent();
+ }
+ appendCopies(PrevBB, CopyInfos, Copies);
+ } else {
+ TII->removeBranch(*PrevBB);
+ // No PHIs to worry about, just splice the instructions over.
+ PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
+ }
+ PrevBB->removeSuccessor(PrevBB->succ_begin());
+ assert(PrevBB->succ_empty());
+ PrevBB->transferSuccessors(TailBB);
+
+ // Update branches in PrevBB based on Tail's layout successor.
+ if (ShouldUpdateTerminators)
+ PrevBB->updateTerminator(TailBB->getNextNode());
+
+ TDBBs.push_back(PrevBB);
+ Changed = true;
+ } else {
+ LLVM_DEBUG(dbgs() << "Abort merging blocks, the predecessor still "
+ "contains terminator instructions");
+ // Return early if no changes were made
+ if (!Changed)
+ return RemovedBranches;
+ }
+ Changed |= RemovedBranches;
+ }
+
+ // If this is after register allocation, there are no phis to fix.
+ if (!PreRegAlloc)
+ return Changed;
+
+ // If we made no changes so far, we are safe.
+ if (!Changed)
+ return Changed;
+
+ // Handle the nasty case in which we duplicated a block that is part of a
+ // loop into some but not all of its predecessors. For example:
+ //    1 -> 2 <-> 3                 |
+ //          \                      |
+ //           \---> rest            |
+ // if we duplicate 2 into 1 but not into 3, we end up with
+ //    12 -> 3 <-> 2 -> rest        |
+ //             \             /     |
+ //              \----->-----/      |
+ // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced
+ // with a phi in 3 (which now dominates 2).
+ // What we do here is introduce a copy in 3 of the register defined by the
+ // phi, just like when we are duplicating 2 into 3, but we don't copy any
+ // real instructions or remove the 3 -> 2 edge from the phi in 2.
+ for (MachineBasicBlock *PredBB : Preds) {
+ if (is_contained(TDBBs, PredBB))
+ continue;
+
+ // EH edges
+ if (PredBB->succ_size() != 1)
+ continue;
+
+ DenseMap<Register, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
+ // Process PHI instructions first.
+ for (MachineInstr &MI : make_early_inc_range(TailBB->phis())) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ processPHI(&MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false);
+ }
+ appendCopies(PredBB, CopyInfos, Copies);
+ }
+
+ return Changed;
+}
+
+/// At the end of the block \p MBB generate COPY instructions between registers
+/// described by \p CopyInfos. Append resulting instructions to \p Copies.
+void TailDuplicator::appendCopies(MachineBasicBlock *MBB,
+ SmallVectorImpl<std::pair<Register, RegSubRegPair>> &CopyInfos,
+ SmallVectorImpl<MachineInstr*> &Copies) {
+ MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
+ const MCInstrDesc &CopyD = TII->get(TargetOpcode::COPY);
+ for (auto &CI : CopyInfos) {
+ auto C = BuildMI(*MBB, Loc, DebugLoc(), CopyD, CI.first)
+ .addReg(CI.second.Reg, 0, CI.second.SubReg);
+ Copies.push_back(C);
+ }
+}
+
+/// Remove the specified dead machine basic block from the function, updating
+/// the CFG.
+void TailDuplicator::removeDeadBlock(
+ MachineBasicBlock *MBB,
+ function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
+ assert(MBB->pred_empty() && "MBB must be dead!");
+ LLVM_DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+ MachineFunction *MF = MBB->getParent();
+ // Update the call site info.
+ for (const MachineInstr &MI : *MBB)
+ if (MI.shouldUpdateCallSiteInfo())
+ MF->eraseCallSiteInfo(&MI);
+
+ if (RemovalCallback)
+ (*RemovalCallback)(MBB);
+
+ // Remove all successors.
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_end() - 1);
+
+ // Remove the block.
+ MBB->eraseFromParent();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
new file mode 100644
index 000000000000..48a2094f5d45
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -0,0 +1,168 @@
+//===- TargetFrameLoweringImpl.cpp - Implement target frame interface ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the layout of a stack frame on the target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+TargetFrameLowering::~TargetFrameLowering() = default;
+
+bool TargetFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const {
+ assert(MF.getFunction().hasFnAttribute(Attribute::NoReturn) &&
+ MF.getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ !MF.getFunction().hasFnAttribute(Attribute::UWTable));
+ return false;
+}
+
+bool TargetFrameLowering::enableCFIFixup(MachineFunction &MF) const {
+ return MF.needsFrameMoves() &&
+ !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+}
+
+/// Returns the displacement from the frame register to the stack
+/// frame of the specified index, along with the frame register used
+/// (in output arg FrameReg). This is the default implementation which
+/// is overridden for some targets.
+StackOffset
+TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+ // By default, assume all frame indices are referenced via whatever
+ // getFrameRegister() says. The target can override this if it's doing
+ // something different.
+ FrameReg = RI->getFrameRegister(MF);
+
+ return StackOffset::getFixed(MFI.getObjectOffset(FI) + MFI.getStackSize() -
+ getOffsetOfLocalArea() +
+ MFI.getOffsetAdjustment());
+}
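+
+// For illustration: with an object offset of -8, a stack size of 64, a local
+// area offset of 0, and no offset adjustment, the default implementation
+// returns a fixed displacement of 56 bytes from the frame register.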
+
+bool TargetFrameLowering::needsFrameIndexResolution(
+ const MachineFunction &MF) const {
+ return MF.getFrameInfo().hasStackObjects();
+}
+
+void TargetFrameLowering::getCalleeSaves(const MachineFunction &MF,
+ BitVector &CalleeSaves) const {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ CalleeSaves.resize(TRI.getNumRegs());
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid())
+ return;
+
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ CalleeSaves.set(Info.getReg());
+}
+
+void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ // Resize before the early returns. Some backends expect that
+ // SavedRegs.size() == TRI.getNumRegs() after this call even if there are no
+ // saved registers.
+ SavedRegs.resize(TRI.getNumRegs());
+
+ // When interprocedural register allocation is enabled caller saved registers
+ // are preferred over callee saved registers.
+ if (MF.getTarget().Options.EnableIPRA &&
+ isSafeForNoCSROpt(MF.getFunction()) &&
+ isProfitableForNoCSROpt(MF.getFunction()))
+ return;
+
+ // Get the callee saved register list...
+ const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
+
+ // Early exit if there are no callee saved registers.
+ if (!CSRegs || CSRegs[0] == 0)
+ return;
+
+ // In Naked functions we aren't going to save any registers.
+ if (MF.getFunction().hasFnAttribute(Attribute::Naked))
+ return;
+
+ // Noreturn+nounwind functions never restore CSR, so no saves are needed.
+ // Purely noreturn functions may still return through throws, so those must
+ // save CSR for caller exception handlers.
+ //
+ // If the function uses longjmp to break out of its current path of
+ // execution we do not need the CSR spills either: setjmp stores all CSRs
+ // it was called with into the jmp_buf, which longjmp then restores.
+ if (MF.getFunction().hasFnAttribute(Attribute::NoReturn) &&
+ MF.getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ !MF.getFunction().hasFnAttribute(Attribute::UWTable) &&
+ enableCalleeSaveSkip(MF))
+ return;
+
+ // Functions which call __builtin_unwind_init get all their registers saved.
+ bool CallsUnwindInit = MF.callsUnwindInit();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ if (CallsUnwindInit || MRI.isPhysRegModified(Reg))
+ SavedRegs.set(Reg);
+ }
+}
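+
+// For illustration, on a hypothetical target whose callee-saved list is
+// {r4, r5, r6}: a function that clobbers only r5 ends up with SavedRegs
+// containing just r5, while a function that calls __builtin_unwind_init has
+// all three bits set regardless of which registers it actually modifies.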
+
+bool TargetFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
+ const MachineFunction &MF) const {
+ if (!hasFP(MF))
+ return false;
+
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ return RegInfo->useFPForScavengingIndex(MF) &&
+ !RegInfo->hasStackRealignment(MF);
+}
+
+bool TargetFrameLowering::isSafeForNoCSROpt(const Function &F) {
+ if (!F.hasLocalLinkage() || F.hasAddressTaken() ||
+ !F.hasFnAttribute(Attribute::NoRecurse))
+ return false;
+ // Function should not be optimized as tail call.
+ for (const User *U : F.users())
+ if (auto *CB = dyn_cast<CallBase>(U))
+ if (CB->isTailCall())
+ return false;
+ return true;
+}
+
+int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
+ llvm_unreachable("getInitialCFAOffset() not implemented!");
+}
+
+Register
+TargetFrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
+ llvm_unreachable("getInitialCFARegister() not implemented!");
+}
+
+TargetFrameLowering::DwarfFrameBase
+TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF)}};
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp
new file mode 100644
index 000000000000..b29404b42519
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -0,0 +1,1726 @@
+//===-- TargetInstrInfo.cpp - Target Instruction Information --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableHazardRecognizer(
+ "disable-sched-hazard", cl::Hidden, cl::init(false),
+ cl::desc("Disable hazard detection during preRA scheduling"));
+
+TargetInstrInfo::~TargetInstrInfo() = default;
+
+const TargetRegisterClass*
+TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) const {
+ if (OpNum >= MCID.getNumOperands())
+ return nullptr;
+
+ short RegClass = MCID.operands()[OpNum].RegClass;
+ if (MCID.operands()[OpNum].isLookupPtrRegClass())
+ return TRI->getPointerRegClass(MF, RegClass);
+
+ // Instructions like INSERT_SUBREG do not have fixed register classes.
+ if (RegClass < 0)
+ return nullptr;
+
+ // Otherwise just look it up normally.
+ return TRI->getRegClass(RegClass);
+}
+
+/// insertNoop - Insert a noop into the instruction stream at the specified
+/// point.
+void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ llvm_unreachable("Target didn't implement insertNoop!");
+}
+
+/// insertNoops - Insert noops into the instruction stream at the specified
+/// point.
+void TargetInstrInfo::insertNoops(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned Quantity) const {
+ for (unsigned i = 0; i < Quantity; ++i)
+ insertNoop(MBB, MI);
+}
+
+static bool isAsmComment(const char *Str, const MCAsmInfo &MAI) {
+ return strncmp(Str, MAI.getCommentString().data(),
+ MAI.getCommentString().size()) == 0;
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overridden in the target code to do that.
+/// We implement a special case of the .space directive which takes only a
+/// single integer argument in base 10 that is the size in bytes. This is a
+/// restricted form of the GAS directive in that we only interpret
+/// simple--i.e. not a logical or arithmetic expression--size values without
+/// the optional fill value. This is primarily used for creating arbitrary
+/// sized inline asm blocks for testing purposes.
+unsigned TargetInstrInfo::getInlineAsmLength(
+ const char *Str,
+ const MCAsmInfo &MAI, const TargetSubtargetInfo *STI) const {
+ // Count the number of instructions in the asm.
+ bool AtInsnStart = true;
+ unsigned Length = 0;
+ const unsigned MaxInstLength = MAI.getMaxInstLength(STI);
+ for (; *Str; ++Str) {
+ if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+ strlen(MAI.getSeparatorString())) == 0) {
+ AtInsnStart = true;
+ } else if (isAsmComment(Str, MAI)) {
+ // Stop counting as an instruction after a comment until the next
+ // separator.
+ AtInsnStart = false;
+ }
+
+ if (AtInsnStart && !isSpace(static_cast<unsigned char>(*Str))) {
+ unsigned AddLength = MaxInstLength;
+ if (strncmp(Str, ".space", 6) == 0) {
+ char *EStr;
+ int SpaceSize;
+ SpaceSize = strtol(Str + 6, &EStr, 10);
+ SpaceSize = SpaceSize < 0 ? 0 : SpaceSize;
+ while (*EStr != '\n' && isSpace(static_cast<unsigned char>(*EStr)))
+ ++EStr;
+ if (*EStr == '\0' || *EStr == '\n' ||
+ isAsmComment(EStr, MAI)) // Successfully parsed .space argument
+ AddLength = SpaceSize;
+ }
+ Length += AddLength;
+ AtInsnStart = false;
+ }
+ }
+
+ return Length;
+}
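+
+// Worked example (illustrative): assuming MaxInstLength is 4, the separator
+// string is "\n", and the comment string is "#", the inline asm
+//   "add r0, r0, r1\n.space 8\n# just a note"
+// is measured as 4 bytes for the instruction plus 8 bytes for the .space
+// payload; the comment line contributes nothing, giving a total of 12.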
+
+/// ReplaceTailWithBranchTo - Delete the instruction Tail and everything
+/// after it, replacing it with an unconditional branch to NewDest.
+void
+TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+ MachineBasicBlock *NewDest) const {
+ MachineBasicBlock *MBB = Tail->getParent();
+
+ // Remove all the old successors of MBB from the CFG.
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_begin());
+
+ // Save off the debug loc before erasing the instruction.
+ DebugLoc DL = Tail->getDebugLoc();
+
+ // Update call site info and remove all the dead instructions
+ // from the end of MBB.
+ while (Tail != MBB->end()) {
+ auto MI = Tail++;
+ if (MI->shouldUpdateCallSiteInfo())
+ MBB->getParent()->eraseCallSiteInfo(&*MI);
+ MBB->erase(MI);
+ }
+
+ // If NewDest isn't immediately after MBB in layout order, insert a branch
+ // to it.
+ if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))
+ insertBranch(*MBB, NewDest, nullptr, SmallVector<MachineOperand, 0>(), DL);
+ MBB->addSuccessor(NewDest);
+}
+
+MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+ bool NewMI, unsigned Idx1,
+ unsigned Idx2) const {
+ const MCInstrDesc &MCID = MI.getDesc();
+ bool HasDef = MCID.getNumDefs();
+ if (HasDef && !MI.getOperand(0).isReg())
+ // No idea how to commute this instruction. Target should implement its own.
+ return nullptr;
+
+ unsigned CommutableOpIdx1 = Idx1; (void)CommutableOpIdx1;
+ unsigned CommutableOpIdx2 = Idx2; (void)CommutableOpIdx2;
+ assert(findCommutedOpIndices(MI, CommutableOpIdx1, CommutableOpIdx2) &&
+ CommutableOpIdx1 == Idx1 && CommutableOpIdx2 == Idx2 &&
+ "TargetInstrInfo::CommuteInstructionImpl(): not commutable operands.");
+ assert(MI.getOperand(Idx1).isReg() && MI.getOperand(Idx2).isReg() &&
+ "This only knows how to commute register operands so far");
+
+ Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
+ Register Reg1 = MI.getOperand(Idx1).getReg();
+ Register Reg2 = MI.getOperand(Idx2).getReg();
+ unsigned SubReg0 = HasDef ? MI.getOperand(0).getSubReg() : 0;
+ unsigned SubReg1 = MI.getOperand(Idx1).getSubReg();
+ unsigned SubReg2 = MI.getOperand(Idx2).getSubReg();
+ bool Reg1IsKill = MI.getOperand(Idx1).isKill();
+ bool Reg2IsKill = MI.getOperand(Idx2).isKill();
+ bool Reg1IsUndef = MI.getOperand(Idx1).isUndef();
+ bool Reg2IsUndef = MI.getOperand(Idx2).isUndef();
+ bool Reg1IsInternal = MI.getOperand(Idx1).isInternalRead();
+ bool Reg2IsInternal = MI.getOperand(Idx2).isInternalRead();
+ // Avoid calling isRenamable for virtual registers since we assert that
+ // renamable property is only queried/set for physical registers.
+ bool Reg1IsRenamable =
+ Reg1.isPhysical() ? MI.getOperand(Idx1).isRenamable() : false;
+ bool Reg2IsRenamable =
+ Reg2.isPhysical() ? MI.getOperand(Idx2).isRenamable() : false;
+ // If destination is tied to either of the commuted source register, then
+ // it must be updated.
+ if (HasDef && Reg0 == Reg1 &&
+ MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO) == 0) {
+ Reg2IsKill = false;
+ Reg0 = Reg2;
+ SubReg0 = SubReg2;
+ } else if (HasDef && Reg0 == Reg2 &&
+ MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO) == 0) {
+ Reg1IsKill = false;
+ Reg0 = Reg1;
+ SubReg0 = SubReg1;
+ }
+
+ MachineInstr *CommutedMI = nullptr;
+ if (NewMI) {
+ // Create a new instruction.
+ MachineFunction &MF = *MI.getMF();
+ CommutedMI = MF.CloneMachineInstr(&MI);
+ } else {
+ CommutedMI = &MI;
+ }
+
+ if (HasDef) {
+ CommutedMI->getOperand(0).setReg(Reg0);
+ CommutedMI->getOperand(0).setSubReg(SubReg0);
+ }
+ CommutedMI->getOperand(Idx2).setReg(Reg1);
+ CommutedMI->getOperand(Idx1).setReg(Reg2);
+ CommutedMI->getOperand(Idx2).setSubReg(SubReg1);
+ CommutedMI->getOperand(Idx1).setSubReg(SubReg2);
+ CommutedMI->getOperand(Idx2).setIsKill(Reg1IsKill);
+ CommutedMI->getOperand(Idx1).setIsKill(Reg2IsKill);
+ CommutedMI->getOperand(Idx2).setIsUndef(Reg1IsUndef);
+ CommutedMI->getOperand(Idx1).setIsUndef(Reg2IsUndef);
+ CommutedMI->getOperand(Idx2).setIsInternalRead(Reg1IsInternal);
+ CommutedMI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal);
+ // Avoid calling setIsRenamable for virtual registers since we assert that
+ // renamable property is only queried/set for physical registers.
+ if (Reg1.isPhysical())
+ CommutedMI->getOperand(Idx2).setIsRenamable(Reg1IsRenamable);
+ if (Reg2.isPhysical())
+ CommutedMI->getOperand(Idx1).setIsRenamable(Reg2IsRenamable);
+ return CommutedMI;
+}
+
+MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ // If OpIdx1 or OpIdx2 is not specified, then this method is free to choose
+ // any commutable operand, which is done in findCommutedOpIndices() method
+ // called below.
+ if ((OpIdx1 == CommuteAnyOperandIndex || OpIdx2 == CommuteAnyOperandIndex) &&
+ !findCommutedOpIndices(MI, OpIdx1, OpIdx2)) {
+ assert(MI.isCommutable() &&
+ "Precondition violation: MI must be commutable.");
+ return nullptr;
+ }
+ return commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
+bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1,
+ unsigned &ResultIdx2,
+ unsigned CommutableOpIdx1,
+ unsigned CommutableOpIdx2) {
+ if (ResultIdx1 == CommuteAnyOperandIndex &&
+ ResultIdx2 == CommuteAnyOperandIndex) {
+ ResultIdx1 = CommutableOpIdx1;
+ ResultIdx2 = CommutableOpIdx2;
+ } else if (ResultIdx1 == CommuteAnyOperandIndex) {
+ if (ResultIdx2 == CommutableOpIdx1)
+ ResultIdx1 = CommutableOpIdx2;
+ else if (ResultIdx2 == CommutableOpIdx2)
+ ResultIdx1 = CommutableOpIdx1;
+ else
+ return false;
+ } else if (ResultIdx2 == CommuteAnyOperandIndex) {
+ if (ResultIdx1 == CommutableOpIdx1)
+ ResultIdx2 = CommutableOpIdx2;
+ else if (ResultIdx1 == CommutableOpIdx2)
+ ResultIdx2 = CommutableOpIdx1;
+ else
+ return false;
+ } else
+ // Check that the result operand indices match the given commutable
+ // operand indices.
+ return (ResultIdx1 == CommutableOpIdx1 && ResultIdx2 == CommutableOpIdx2) ||
+ (ResultIdx1 == CommutableOpIdx2 && ResultIdx2 == CommutableOpIdx1);
+
+ return true;
+}
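+
+// For example, with CommutableOpIdx1 = 1 and CommutableOpIdx2 = 2:
+//   (CommuteAnyOperandIndex, CommuteAnyOperandIndex) resolves to (1, 2),
+//   (1, CommuteAnyOperandIndex) resolves to (1, 2),
+//   (2, 1) is accepted as-is, and
+//   (1, 3) is rejected because operand 3 is not one of the commutable pair.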
+
+bool TargetInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ assert(!MI.isBundle() &&
+ "TargetInstrInfo::findCommutedOpIndices() can't handle bundles");
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (!MCID.isCommutable())
+ return false;
+
+ // This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this
+ // is not true, then the target must implement this.
+ unsigned CommutableOpIdx1 = MCID.getNumDefs();
+ unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1;
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+
+ if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
+ // No idea.
+ return false;
+ return true;
+}
+
+bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+ if (!MI.isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (MI.isBranch() && !MI.isBarrier())
+ return true;
+ if (!MI.isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+bool TargetInstrInfo::PredicateInstruction(
+ MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+ bool MadeChange = false;
+
+ assert(!MI.isBundle() &&
+ "TargetInstrInfo::PredicateInstruction() can't handle bundles");
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (!MI.isPredicable())
+ return false;
+
+ for (unsigned j = 0, i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (MCID.operands()[i].isPredicate()) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ MO.setReg(Pred[j].getReg());
+ MadeChange = true;
+ } else if (MO.isImm()) {
+ MO.setImm(Pred[j].getImm());
+ MadeChange = true;
+ } else if (MO.isMBB()) {
+ MO.setMBB(Pred[j].getMBB());
+ MadeChange = true;
+ }
+ ++j;
+ }
+ }
+ return MadeChange;
+}
+
+bool TargetInstrInfo::hasLoadFromStackSlot(
+ const MachineInstr &MI,
+ SmallVectorImpl<const MachineMemOperand *> &Accesses) const {
+ size_t StartSize = Accesses.size();
+ for (MachineInstr::mmo_iterator o = MI.memoperands_begin(),
+ oe = MI.memoperands_end();
+ o != oe; ++o) {
+ if ((*o)->isLoad() &&
+ isa_and_nonnull<FixedStackPseudoSourceValue>((*o)->getPseudoValue()))
+ Accesses.push_back(*o);
+ }
+ return Accesses.size() != StartSize;
+}
+
+bool TargetInstrInfo::hasStoreToStackSlot(
+ const MachineInstr &MI,
+ SmallVectorImpl<const MachineMemOperand *> &Accesses) const {
+ size_t StartSize = Accesses.size();
+ for (MachineInstr::mmo_iterator o = MI.memoperands_begin(),
+ oe = MI.memoperands_end();
+ o != oe; ++o) {
+ if ((*o)->isStore() &&
+ isa_and_nonnull<FixedStackPseudoSourceValue>((*o)->getPseudoValue()))
+ Accesses.push_back(*o);
+ }
+ return Accesses.size() != StartSize;
+}
+
+bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
+ unsigned SubIdx, unsigned &Size,
+ unsigned &Offset,
+ const MachineFunction &MF) const {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!SubIdx) {
+ Size = TRI->getSpillSize(*RC);
+ Offset = 0;
+ return true;
+ }
+ unsigned BitSize = TRI->getSubRegIdxSize(SubIdx);
+ // Convert bit size to byte size.
+ if (BitSize % 8)
+ return false;
+
+ int BitOffset = TRI->getSubRegIdxOffset(SubIdx);
+ if (BitOffset < 0 || BitOffset % 8)
+ return false;
+
+ Size = BitSize / 8;
+ Offset = (unsigned)BitOffset / 8;
+
+ assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range");
+
+ if (!MF.getDataLayout().isLittleEndian()) {
+ Offset = TRI->getSpillSize(*RC) - (Offset + Size);
+ }
+ return true;
+}
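+
+// For example, a register class spilled as 16 bytes with a sub-register of
+// 64 bits at bit offset 64 yields Size = 8 and Offset = 8 on a little-endian
+// target; on a big-endian target the offset is flipped to 16 - (8 + 8) = 0.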
+
+void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ Register DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+ MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+ MBB.insert(I, MI);
+}
+
+bool TargetInstrInfo::produceSameValue(const MachineInstr &MI0,
+ const MachineInstr &MI1,
+ const MachineRegisterInfo *MRI) const {
+ return MI0.isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+}
+
+MachineInstr &TargetInstrInfo::duplicate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore, const MachineInstr &Orig) const {
+ assert(!Orig.isNotDuplicable() && "Instruction cannot be duplicated");
+ MachineFunction &MF = *MBB.getParent();
+ return MF.cloneMachineInstrBundle(MBB, InsertBefore, Orig);
+}
+
+// If the COPY instruction in MI can be folded to a stack operation, return
+// the register class to use.
+static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI,
+ unsigned FoldIdx) {
+ assert(MI.isCopy() && "MI must be a COPY instruction");
+ if (MI.getNumOperands() != 2)
+ return nullptr;
+ assert(FoldIdx < 2 && "FoldIdx refers to a nonexistent operand");
+
+ const MachineOperand &FoldOp = MI.getOperand(FoldIdx);
+ const MachineOperand &LiveOp = MI.getOperand(1 - FoldIdx);
+
+ if (FoldOp.getSubReg() || LiveOp.getSubReg())
+ return nullptr;
+
+ Register FoldReg = FoldOp.getReg();
+ Register LiveReg = LiveOp.getReg();
+
+ assert(FoldReg.isVirtual() && "Cannot fold physregs");
+
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(FoldReg);
+
+ if (LiveOp.getReg().isPhysical())
+ return RC->contains(LiveOp.getReg()) ? RC : nullptr;
+
+ if (RC->hasSubClassEq(MRI.getRegClass(LiveReg)))
+ return RC;
+
+ // FIXME: Allow folding when register classes are memory compatible.
+ return nullptr;
+}
+
+MCInst TargetInstrInfo::getNop() const { llvm_unreachable("Not implemented"); }
+
+std::pair<unsigned, unsigned>
+TargetInstrInfo::getPatchpointUnfoldableRange(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::STACKMAP:
+ // StackMapLiveValues are foldable
+ return std::make_pair(0, StackMapOpers(&MI).getVarIdx());
+ case TargetOpcode::PATCHPOINT:
+ // For PatchPoint, the call args are not foldable (even if reported in the
+ // stackmap e.g. via anyregcc).
+ return std::make_pair(0, PatchPointOpers(&MI).getVarIdx());
+ case TargetOpcode::STATEPOINT:
+ // For statepoints, fold deopt and gc arguments, but not call arguments.
+ return std::make_pair(MI.getNumDefs(), StatepointOpers(&MI).getVarIdx());
+ default:
+ llvm_unreachable("unexpected stackmap opcode");
+ }
+}
+
+static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops, int FrameIndex,
+ const TargetInstrInfo &TII) {
+ unsigned StartIdx = 0;
+ unsigned NumDefs = 0;
+ // getPatchpointUnfoldableRange asserts if MI is not a stackmap, patchpoint,
+ // or statepoint.
+ std::tie(NumDefs, StartIdx) = TII.getPatchpointUnfoldableRange(MI);
+
+ unsigned DefToFoldIdx = MI.getNumOperands();
+
+ // Return nullptr if any operands requested for folding are not foldable (not
+ // part of the stackmap's live values).
+ for (unsigned Op : Ops) {
+ if (Op < NumDefs) {
+ assert(DefToFoldIdx == MI.getNumOperands() && "Folding multiple defs");
+ DefToFoldIdx = Op;
+ } else if (Op < StartIdx) {
+ return nullptr;
+ }
+ if (MI.getOperand(Op).isTied())
+ return nullptr;
+ }
+
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(MI.getOpcode()), MI.getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+ // The return value, the metadata, and the function arguments are not
+ // folded; copy them unchanged, skipping a def that is itself being folded.
+ for (unsigned i = 0; i < StartIdx; ++i)
+ if (i != DefToFoldIdx)
+ MIB.add(MI.getOperand(i));
+
+ for (unsigned i = StartIdx, e = MI.getNumOperands(); i < e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ unsigned TiedTo = e;
+ (void)MI.isRegTiedToDefOperand(i, &TiedTo);
+
+ if (is_contained(Ops, i)) {
+ assert(TiedTo == e && "Cannot fold tied operands");
+ unsigned SpillSize;
+ unsigned SpillOffset;
+ // Compute the spill slot size and offset.
+ const TargetRegisterClass *RC =
+ MF.getRegInfo().getRegClass(MO.getReg());
+ bool Valid =
+ TII.getStackSlotRange(RC, MO.getSubReg(), SpillSize, SpillOffset, MF);
+ if (!Valid)
+ report_fatal_error("cannot spill patchpoint subregister operand");
+ MIB.addImm(StackMaps::IndirectMemRefOp);
+ MIB.addImm(SpillSize);
+ MIB.addFrameIndex(FrameIndex);
+ MIB.addImm(SpillOffset);
+ } else {
+ MIB.add(MO);
+ if (TiedTo < e) {
+ assert(TiedTo < NumDefs && "Bad tied operand");
+ if (TiedTo > DefToFoldIdx)
+ --TiedTo;
+ NewMI->tieOperands(TiedTo, NewMI->getNumOperands() - 1);
+ }
+ }
+ }
+ return NewMI;
+}
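+
+// For illustration: folding a live-value register operand of a STACKMAP into
+// frame index FI replaces that single operand with the four operands
+//   StackMaps::IndirectMemRefOp, <spill size>, FI, <spill offset>
+// so the stackmap records the value's stack location instead of a register.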
+
+MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
+ ArrayRef<unsigned> Ops, int FI,
+ LiveIntervals *LIS,
+ VirtRegMap *VRM) const {
+ auto Flags = MachineMemOperand::MONone;
+ for (unsigned OpIdx : Ops)
+ Flags |= MI.getOperand(OpIdx).isDef() ? MachineMemOperand::MOStore
+ : MachineMemOperand::MOLoad;
+
+ MachineBasicBlock *MBB = MI.getParent();
+ assert(MBB && "foldMemoryOperand needs an inserted instruction");
+ MachineFunction &MF = *MBB->getParent();
+
+ // If we're not folding a load into a subreg, the size of the load is the
+ // size of the spill slot. But if we are, we need to figure out what the
+ // actual load size is.
+ int64_t MemSize = 0;
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ if (Flags & MachineMemOperand::MOStore) {
+ MemSize = MFI.getObjectSize(FI);
+ } else {
+ for (unsigned OpIdx : Ops) {
+ int64_t OpSize = MFI.getObjectSize(FI);
+
+ if (auto SubReg = MI.getOperand(OpIdx).getSubReg()) {
+ unsigned SubRegSize = TRI->getSubRegIdxSize(SubReg);
+ if (SubRegSize > 0 && !(SubRegSize % 8))
+ OpSize = SubRegSize / 8;
+ }
+
+ MemSize = std::max(MemSize, OpSize);
+ }
+ }
+
+ assert(MemSize && "Did not expect a zero-sized stack slot");
+
+ MachineInstr *NewMI = nullptr;
+
+ if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT ||
+ MI.getOpcode() == TargetOpcode::STATEPOINT) {
+ // Fold stackmap/patchpoint.
+ NewMI = foldPatchpoint(MF, MI, Ops, FI, *this);
+ if (NewMI)
+ MBB->insert(MI, NewMI);
+ } else {
+ // Ask the target to do the actual folding.
+ NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS, VRM);
+ }
+
+ if (NewMI) {
+ NewMI->setMemRefs(MF, MI.memoperands());
+ // Add a memory operand, foldMemoryOperandImpl doesn't do that.
+ assert((!(Flags & MachineMemOperand::MOStore) ||
+ NewMI->mayStore()) &&
+ "Folded a def to a non-store!");
+ assert((!(Flags & MachineMemOperand::MOLoad) ||
+ NewMI->mayLoad()) &&
+ "Folded a use to a non-load!");
+ assert(MFI.getObjectOffset(FI) != -1);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+ Flags, MemSize, MFI.getObjectAlign(FI));
+ NewMI->addMemOperand(MF, MMO);
+
+ // The pass "x86 speculative load hardening" always attaches symbols to
+ // call instructions. Copy any instruction symbols from the old instruction.
+ NewMI->cloneInstrSymbols(MF, MI);
+
+ return NewMI;
+ }
+
+ // Straight COPY may fold as load/store.
+ if (!MI.isCopy() || Ops.size() != 1)
+ return nullptr;
+
+ const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
+ if (!RC)
+ return nullptr;
+
+ const MachineOperand &MO = MI.getOperand(1 - Ops[0]);
+ MachineBasicBlock::iterator Pos = MI;
+
+ if (Flags == MachineMemOperand::MOStore)
+ storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI,
+ Register());
+ else
+ loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register());
+ return &*--Pos;
+}
+
+MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineInstr &LoadMI,
+ LiveIntervals *LIS) const {
+ assert(LoadMI.canFoldAsLoad() && "LoadMI isn't foldable!");
+#ifndef NDEBUG
+ for (unsigned OpIdx : Ops)
+ assert(MI.getOperand(OpIdx).isUse() && "Folding load into def!");
+#endif
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+
+ // Ask the target to do the actual folding.
+ MachineInstr *NewMI = nullptr;
+ int FrameIndex = 0;
+
+ if ((MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT ||
+ MI.getOpcode() == TargetOpcode::STATEPOINT) &&
+ isLoadFromStackSlot(LoadMI, FrameIndex)) {
+ // Fold stackmap/patchpoint.
+ NewMI = foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
+ if (NewMI)
+ NewMI = &*MBB.insert(MI, NewMI);
+ } else {
+ // Ask the target to do the actual folding.
+ NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, LIS);
+ }
+
+ if (!NewMI)
+ return nullptr;
+
+ // Copy the memoperands from the load to the folded instruction.
+ if (MI.memoperands_empty()) {
+ NewMI->setMemRefs(MF, LoadMI.memoperands());
+ } else {
+ // Handle the rare case of folding multiple loads.
+ NewMI->setMemRefs(MF, MI.memoperands());
+ for (MachineInstr::mmo_iterator I = LoadMI.memoperands_begin(),
+ E = LoadMI.memoperands_end();
+ I != E; ++I) {
+ NewMI->addMemOperand(MF, *I);
+ }
+ }
+ return NewMI;
+}
+
+/// transferImplicitOperands - MI is a pseudo-instruction, and the lowered
+/// replacement instructions immediately precede it. Copy any implicit
+/// operands from MI to the replacement instruction.
+static void transferImplicitOperands(MachineInstr *MI,
+ const TargetRegisterInfo *TRI) {
+ MachineBasicBlock::iterator CopyMI = MI;
+ --CopyMI;
+
+ Register DstReg = MI->getOperand(0).getReg();
+ for (const MachineOperand &MO : MI->implicit_operands()) {
+ CopyMI->addOperand(MO);
+
+ // Be conservative about preserving kills when subregister defs are
+ // involved. If there was an implicit kill of a super-register overlapping
+ // the copy result, we would otherwise kill the sub-registers that previous
+ // copies defined.
+
+ if (MO.isKill() && TRI->regsOverlap(DstReg, MO.getReg()))
+ CopyMI->getOperand(CopyMI->getNumOperands() - 1).setIsKill(false);
+ }
+}
+
+void TargetInstrInfo::lowerCopy(MachineInstr *MI,
+ const TargetRegisterInfo *TRI) const {
+ if (MI->allDefsAreDead()) {
+ MI->setDesc(get(TargetOpcode::KILL));
+ return;
+ }
+
+ MachineOperand &DstMO = MI->getOperand(0);
+ MachineOperand &SrcMO = MI->getOperand(1);
+
+ bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg());
+ if (IdentityCopy || SrcMO.isUndef()) {
+ // No need to insert an identity copy instruction, but replace with a KILL
+ // if liveness is changed.
+ if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
+ // We must make sure the super-register gets killed. Replace the
+ // instruction with KILL.
+ MI->setDesc(get(TargetOpcode::KILL));
+ return;
+ }
+ // Vanilla identity copy.
+ MI->eraseFromParent();
+ return;
+ }
+
+ copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(), DstMO.getReg(),
+ SrcMO.getReg(), SrcMO.isKill());
+
+ if (MI->getNumOperands() > 2)
+ transferImplicitOperands(MI, TRI);
+ MI->eraseFromParent();
+ return;
+}
+
+bool TargetInstrInfo::hasReassociableOperands(
+ const MachineInstr &Inst, const MachineBasicBlock *MBB) const {
+ const MachineOperand &Op1 = Inst.getOperand(1);
+ const MachineOperand &Op2 = Inst.getOperand(2);
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ // We need virtual register definitions for the operands that we will
+ // reassociate.
+ MachineInstr *MI1 = nullptr;
+ MachineInstr *MI2 = nullptr;
+ if (Op1.isReg() && Op1.getReg().isVirtual())
+ MI1 = MRI.getUniqueVRegDef(Op1.getReg());
+ if (Op2.isReg() && Op2.getReg().isVirtual())
+ MI2 = MRI.getUniqueVRegDef(Op2.getReg());
+
+ // And at least one operand must be defined in MBB.
+ return MI1 && MI2 && (MI1->getParent() == MBB || MI2->getParent() == MBB);
+}
+
+bool TargetInstrInfo::areOpcodesEqualOrInverse(unsigned Opcode1,
+ unsigned Opcode2) const {
+ return Opcode1 == Opcode2 || getInverseOpcode(Opcode1) == Opcode2;
+}
+
+bool TargetInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
+ bool &Commuted) const {
+ const MachineBasicBlock *MBB = Inst.getParent();
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
+ MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+ unsigned Opcode = Inst.getOpcode();
+
+ // If only one operand has the same or inverse opcode and it's the second
+ // source operand, the operands must be commuted.
+ Commuted = !areOpcodesEqualOrInverse(Opcode, MI1->getOpcode()) &&
+ areOpcodesEqualOrInverse(Opcode, MI2->getOpcode());
+ if (Commuted)
+ std::swap(MI1, MI2);
+
+ // 1. The previous instruction must be the same type as Inst.
+ // 2. The previous instruction must also be associative/commutative or be the
+ // inverse of such an operation (this can be different even for
+ // instructions with the same opcode if traits like fast-math-flags are
+ // included).
+ // 3. The previous instruction must have virtual register definitions for its
+ // operands in the same basic block as Inst.
+ // 4. The previous instruction's result must only be used by Inst.
+ return areOpcodesEqualOrInverse(Opcode, MI1->getOpcode()) &&
+ (isAssociativeAndCommutative(*MI1) ||
+ isAssociativeAndCommutative(*MI1, /* Invert */ true)) &&
+ hasReassociableOperands(*MI1, MBB) &&
+ MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg());
+}
+
+// 1. The operation must be associative and commutative or be the inverse of
+// such an operation.
+// 2. The instruction must have virtual register definitions for its
+// operands in the same basic block.
+// 3. The instruction must have a reassociable sibling.
+bool TargetInstrInfo::isReassociationCandidate(const MachineInstr &Inst,
+ bool &Commuted) const {
+ return (isAssociativeAndCommutative(Inst) ||
+ isAssociativeAndCommutative(Inst, /* Invert */ true)) &&
+ hasReassociableOperands(Inst, Inst.getParent()) &&
+ hasReassociableSibling(Inst, Commuted);
+}
+
+// The concept of the reassociation pass is that these operations can benefit
+// from this kind of transformation:
+//
+// A = ? op ?
+// B = A op X (Prev)
+// C = B op Y (Root)
+// -->
+// A = ? op ?
+// B = X op Y
+// C = A op B
+//
+// breaking the dependency between A and B, allowing them to be executed in
+// parallel (or back-to-back in a pipeline) instead of depending on each other.
+
+// FIXME: This has the potential to be expensive (compile time) while not
+// improving the code at all. Some ways to limit the overhead:
+// 1. Track successful transforms; bail out if hit rate gets too low.
+// 2. Only enable at -O3 or some other non-default optimization level.
+// 3. Pre-screen pattern candidates here: if an operand of the previous
+// instruction is known to not increase the critical path, then don't match
+// that pattern.
+bool TargetInstrInfo::getMachineCombinerPatterns(
+ MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+ bool DoRegPressureReduce) const {
+ bool Commute;
+ if (isReassociationCandidate(Root, Commute)) {
+ // We found a sequence of instructions that may be suitable for a
+ // reassociation of operands to increase ILP. Specify each commutation
+ // possibility for the Prev instruction in the sequence and let the
+ // machine combiner decide if changing the operands is worthwhile.
+ if (Commute) {
+ Patterns.push_back(MachineCombinerPattern::REASSOC_AX_YB);
+ Patterns.push_back(MachineCombinerPattern::REASSOC_XA_YB);
+ } else {
+ Patterns.push_back(MachineCombinerPattern::REASSOC_AX_BY);
+ Patterns.push_back(MachineCombinerPattern::REASSOC_XA_BY);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/// Return true when a code sequence can improve loop throughput.
+bool
+TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+ return false;
+}
+
+std::pair<unsigned, unsigned>
+TargetInstrInfo::getReassociationOpcodes(MachineCombinerPattern Pattern,
+ const MachineInstr &Root,
+ const MachineInstr &Prev) const {
+ bool AssocCommutRoot = isAssociativeAndCommutative(Root);
+ bool AssocCommutPrev = isAssociativeAndCommutative(Prev);
+
+ // Early exit if both opcodes are associative and commutative. It's a trivial
+ // reassociation when we only change operands order. In this case opcodes are
+ // not required to have inverse versions.
+ if (AssocCommutRoot && AssocCommutPrev) {
+ assert(Root.getOpcode() == Prev.getOpcode() && "Expected to be equal");
+ return std::make_pair(Root.getOpcode(), Root.getOpcode());
+ }
+
+ // At least one instruction is not associative or commutative.
+ // Since we have matched one of the reassociation patterns, we expect that the
+ // instructions' opcodes are equal or one of them is the inversion of the
+ // other.
+ assert(areOpcodesEqualOrInverse(Root.getOpcode(), Prev.getOpcode()) &&
+ "Incorrectly matched pattern");
+ unsigned AssocCommutOpcode = Root.getOpcode();
+ unsigned InverseOpcode = *getInverseOpcode(Root.getOpcode());
+ if (!AssocCommutRoot)
+ std::swap(AssocCommutOpcode, InverseOpcode);
+
+ // The transformation rule (`+` is any associative and commutative binary
+ // operation, `-` is the inverse):
+ // REASSOC_AX_BY:
+ // (A + X) + Y => A + (X + Y)
+ // (A + X) - Y => A + (X - Y)
+ // (A - X) + Y => A - (X - Y)
+ // (A - X) - Y => A - (X + Y)
+ // REASSOC_XA_BY:
+ // (X + A) + Y => (X + Y) + A
+ // (X + A) - Y => (X - Y) + A
+ // (X - A) + Y => (X + Y) - A
+ // (X - A) - Y => (X - Y) - A
+ // REASSOC_AX_YB:
+ // Y + (A + X) => (Y + X) + A
+ // Y - (A + X) => (Y - X) - A
+ // Y + (A - X) => (Y - X) + A
+ // Y - (A - X) => (Y + X) - A
+ // REASSOC_XA_YB:
+ // Y + (X + A) => (Y + X) + A
+ // Y - (X + A) => (Y - X) - A
+ // Y + (X - A) => (Y + X) - A
+ // Y - (X - A) => (Y - X) + A
+ switch (Pattern) {
+ default:
+ llvm_unreachable("Unexpected pattern");
+ case MachineCombinerPattern::REASSOC_AX_BY:
+ if (!AssocCommutRoot && AssocCommutPrev)
+ return {AssocCommutOpcode, InverseOpcode};
+ if (AssocCommutRoot && !AssocCommutPrev)
+ return {InverseOpcode, InverseOpcode};
+ if (!AssocCommutRoot && !AssocCommutPrev)
+ return {InverseOpcode, AssocCommutOpcode};
+ break;
+ case MachineCombinerPattern::REASSOC_XA_BY:
+ if (!AssocCommutRoot && AssocCommutPrev)
+ return {AssocCommutOpcode, InverseOpcode};
+ if (AssocCommutRoot && !AssocCommutPrev)
+ return {InverseOpcode, AssocCommutOpcode};
+ if (!AssocCommutRoot && !AssocCommutPrev)
+ return {InverseOpcode, InverseOpcode};
+ break;
+ case MachineCombinerPattern::REASSOC_AX_YB:
+ if (!AssocCommutRoot && AssocCommutPrev)
+ return {InverseOpcode, InverseOpcode};
+ if (AssocCommutRoot && !AssocCommutPrev)
+ return {AssocCommutOpcode, InverseOpcode};
+ if (!AssocCommutRoot && !AssocCommutPrev)
+ return {InverseOpcode, AssocCommutOpcode};
+ break;
+ case MachineCombinerPattern::REASSOC_XA_YB:
+ if (!AssocCommutRoot && AssocCommutPrev)
+ return {InverseOpcode, InverseOpcode};
+ if (AssocCommutRoot && !AssocCommutPrev)
+ return {InverseOpcode, AssocCommutOpcode};
+ if (!AssocCommutRoot && !AssocCommutPrev)
+ return {AssocCommutOpcode, InverseOpcode};
+ break;
+ }
+ llvm_unreachable("Unhandled combination");
+}
+
+// Return a pair of boolean flags showing if the new root and new prev operands
+// must be swapped. See visual example of the rule in
+// TargetInstrInfo::getReassociationOpcodes.
+static std::pair<bool, bool> mustSwapOperands(MachineCombinerPattern Pattern) {
+ switch (Pattern) {
+ default:
+ llvm_unreachable("Unexpected pattern");
+ case MachineCombinerPattern::REASSOC_AX_BY:
+ return {false, false};
+ case MachineCombinerPattern::REASSOC_XA_BY:
+ return {true, false};
+ case MachineCombinerPattern::REASSOC_AX_YB:
+ return {true, true};
+ case MachineCombinerPattern::REASSOC_XA_YB:
+ return {true, true};
+ }
+}
+
+/// Attempt the reassociation transformation to reduce critical path length.
+/// See the above comments before getMachineCombinerPatterns().
+void TargetInstrInfo::reassociateOps(
+ MachineInstr &Root, MachineInstr &Prev,
+ MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ MachineFunction *MF = Root.getMF();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+ // This array encodes the operand index for each parameter because the
+ // operands may be commuted. Each row corresponds to a pattern value,
+ // and each column specifies the index of A, B, X, Y.
+ unsigned OpIdx[4][4] = {
+ { 1, 1, 2, 2 },
+ { 1, 2, 2, 1 },
+ { 2, 1, 1, 2 },
+ { 2, 2, 1, 1 }
+ };
+
+ int Row;
+ switch (Pattern) {
+ case MachineCombinerPattern::REASSOC_AX_BY: Row = 0; break;
+ case MachineCombinerPattern::REASSOC_AX_YB: Row = 1; break;
+ case MachineCombinerPattern::REASSOC_XA_BY: Row = 2; break;
+ case MachineCombinerPattern::REASSOC_XA_YB: Row = 3; break;
+ default: llvm_unreachable("unexpected MachineCombinerPattern");
+ }
+
+ MachineOperand &OpA = Prev.getOperand(OpIdx[Row][0]);
+ MachineOperand &OpB = Root.getOperand(OpIdx[Row][1]);
+ MachineOperand &OpX = Prev.getOperand(OpIdx[Row][2]);
+ MachineOperand &OpY = Root.getOperand(OpIdx[Row][3]);
+ MachineOperand &OpC = Root.getOperand(0);
+
+ Register RegA = OpA.getReg();
+ Register RegB = OpB.getReg();
+ Register RegX = OpX.getReg();
+ Register RegY = OpY.getReg();
+ Register RegC = OpC.getReg();
+
+ if (RegA.isVirtual())
+ MRI.constrainRegClass(RegA, RC);
+ if (RegB.isVirtual())
+ MRI.constrainRegClass(RegB, RC);
+ if (RegX.isVirtual())
+ MRI.constrainRegClass(RegX, RC);
+ if (RegY.isVirtual())
+ MRI.constrainRegClass(RegY, RC);
+ if (RegC.isVirtual())
+ MRI.constrainRegClass(RegC, RC);
+
+ // Create a new virtual register for the result of (X op Y) instead of
+ // recycling RegB because the MachineCombiner's computation of the critical
+ // path requires a new register definition rather than an existing one.
+ Register NewVR = MRI.createVirtualRegister(RC);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+
+ auto [NewRootOpc, NewPrevOpc] = getReassociationOpcodes(Pattern, Root, Prev);
+ bool KillA = OpA.isKill();
+ bool KillX = OpX.isKill();
+ bool KillY = OpY.isKill();
+ bool KillNewVR = true;
+
+ auto [SwapRootOperands, SwapPrevOperands] = mustSwapOperands(Pattern);
+
+ if (SwapPrevOperands) {
+ std::swap(RegX, RegY);
+ std::swap(KillX, KillY);
+ }
+
+ // Create new instructions for insertion.
+ MachineInstrBuilder MIB1 =
+ BuildMI(*MF, MIMetadata(Prev), TII->get(NewPrevOpc), NewVR)
+ .addReg(RegX, getKillRegState(KillX))
+ .addReg(RegY, getKillRegState(KillY))
+ .setMIFlags(Prev.getFlags());
+
+ if (SwapRootOperands) {
+ std::swap(RegA, NewVR);
+ std::swap(KillA, KillNewVR);
+ }
+
+ MachineInstrBuilder MIB2 =
+ BuildMI(*MF, MIMetadata(Root), TII->get(NewRootOpc), RegC)
+ .addReg(RegA, getKillRegState(KillA))
+ .addReg(NewVR, getKillRegState(KillNewVR))
+ .setMIFlags(Root.getFlags());
+
+ setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2);
+
+ // Record new instructions for insertion and old instructions for deletion.
+ InsInstrs.push_back(MIB1);
+ InsInstrs.push_back(MIB2);
+ DelInstrs.push_back(&Prev);
+ DelInstrs.push_back(&Root);
+
+ // We transformed:
+ // B = A op X (Prev)
+ // C = B op Y (Root)
+ // Into:
+ // B = X op Y (MIB1)
+ // C = A op B (MIB2)
+ // C has the same value as before, B doesn't; as such, keep the debug number
+ // of C but not of B.
+ if (unsigned OldRootNum = Root.peekDebugInstrNum())
+ MIB2.getInstr()->setDebugInstrNum(OldRootNum);
+}
+
+void TargetInstrInfo::genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const {
+ MachineRegisterInfo &MRI = Root.getMF()->getRegInfo();
+
+ // Select the previous instruction in the sequence based on the input pattern.
+ MachineInstr *Prev = nullptr;
+ switch (Pattern) {
+ case MachineCombinerPattern::REASSOC_AX_BY:
+ case MachineCombinerPattern::REASSOC_XA_BY:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ break;
+ case MachineCombinerPattern::REASSOC_AX_YB:
+ case MachineCombinerPattern::REASSOC_XA_YB:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+ break;
+ default:
+ llvm_unreachable("Unknown pattern for machine combiner");
+ }
+
+ // Don't reassociate if Prev and Root are in different blocks.
+ if (Prev->getParent() != Root.getParent())
+ return;
+
+ reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
+}
+
+MachineTraceStrategy TargetInstrInfo::getMachineCombinerTraceStrategy() const {
+ return MachineTraceStrategy::TS_MinInstrCount;
+}
+
+bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
+ const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getMF();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Remat clients assume operand 0 is the defined register.
+ if (!MI.getNumOperands() || !MI.getOperand(0).isReg())
+ return false;
+ Register DefReg = MI.getOperand(0).getReg();
+
+ // A sub-register definition can only be rematerialized if the instruction
+ // doesn't read the other parts of the register. Otherwise it is really a
+ // read-modify-write operation on the full virtual register which cannot be
+ // moved safely.
+ if (DefReg.isVirtual() && MI.getOperand(0).getSubReg() &&
+ MI.readsVirtualRegister(DefReg))
+ return false;
+
+ // A load from a fixed stack slot can be rematerialized. This may be
+ // redundant with subsequent checks, but it's target-independent,
+ // simple, and a common case.
+ int FrameIdx = 0;
+ if (isLoadFromStackSlot(MI, FrameIdx) &&
+ MF.getFrameInfo().isImmutableObjectIndex(FrameIdx))
+ return true;
+
+ // Avoid instructions obviously unsafe for remat.
+ if (MI.isNotDuplicable() || MI.mayStore() || MI.mayRaiseFPException() ||
+ MI.hasUnmodeledSideEffects())
+ return false;
+
+ // Don't remat inline asm. We have no idea how expensive it is
+ // even if it's side effect free.
+ if (MI.isInlineAsm())
+ return false;
+
+ // Avoid instructions which load from potentially varying memory.
+ if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad())
+ return false;
+
+ // If any of the registers accessed are non-constant, conservatively assume
+ // the instruction is not rematerializable.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg()) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ // Check for a well-behaved physical register.
+ if (Reg.isPhysical()) {
+ if (MO.isUse()) {
+ // If the physreg has no defs anywhere, it's just an ambient register
+ // and we can freely move its uses. Alternatively, if it's allocatable,
+ // it could get allocated to something with a def during allocation.
+ if (!MRI.isConstantPhysReg(Reg))
+ return false;
+ } else {
+ // A physreg def. We can't remat it.
+ return false;
+ }
+ continue;
+ }
+
+ // Only allow one virtual-register def. There may be multiple defs of the
+ // same virtual register, though.
+ if (MO.isDef() && Reg != DefReg)
+ return false;
+
+ // Don't allow any virtual-register uses. Rematting an instruction with
+ // virtual register uses would lengthen the live ranges of the uses, which
+ // is not necessarily a good idea, certainly not "trivial".
+ if (MO.isUse())
+ return false;
+ }
+
+ // Everything checked out.
+ return true;
+}
+
+int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getMF();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ bool StackGrowsDown =
+ TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
+
+ if (!isFrameInstr(MI))
+ return 0;
+
+ int SPAdj = TFI->alignSPAdjust(getFrameSize(MI));
+
+ if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) ||
+ (StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode))
+ SPAdj = -SPAdj;
+
+ return SPAdj;
+}
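+
+// For example, on a target whose stack grows down, a 16-byte call frame setup
+// pseudo (e.g. ADJCALLSTACKDOWN) yields an adjustment of +16 and the matching
+// call frame destroy pseudo yields -16, both after alignSPAdjust.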
+
+/// isSchedulingBoundary - Test if the given instruction should be
+/// considered a scheduling boundary. This primarily includes labels
+/// and terminators.
+bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ // Terminators and labels can't be scheduled around.
+ if (MI.isTerminator() || MI.isPosition())
+ return true;
+
+ // INLINEASM_BR can jump to another block
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return true;
+
+ // Don't attempt to schedule around any instruction that defines
+ // a stack-oriented pointer, as it's unlikely to be profitable. This
+ // saves compile time, because it doesn't require every single
+ // stack slot reference to depend on the instruction that does the
+ // modification.
+ const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI);
+}
+
+// Provide a global flag for disabling the PreRA hazard recognizer that targets
+// may choose to honor.
+bool TargetInstrInfo::usePreRAHazardRecognizer() const {
+ return !DisableHazardRecognizer;
+}
+
+// Default implementation of CreateTargetHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::
+CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const {
+ // Dummy hazard recognizer allows all instructions to issue.
+ return new ScheduleHazardRecognizer();
+}
+
+// Default implementation of CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::CreateTargetMIHazardRecognizer(
+ const InstrItineraryData *II, const ScheduleDAGMI *DAG) const {
+ return new ScoreboardHazardRecognizer(II, DAG, "machine-scheduler");
+}
+
+// Default implementation of CreateTargetPostRAHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const {
+ return new ScoreboardHazardRecognizer(II, DAG, "post-RA-sched");
+}
+
+// Default implementation of getMemOperandWithOffset.
+bool TargetInstrInfo::getMemOperandWithOffset(
+ const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset,
+ bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ unsigned Width;
+ if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable,
+ Width, TRI) ||
+ BaseOps.size() != 1)
+ return false;
+ BaseOp = BaseOps.front();
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAG latency interface.
+//===----------------------------------------------------------------------===//
+
+int
+TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ SDNode *DefNode, unsigned DefIdx,
+ SDNode *UseNode, unsigned UseIdx) const {
+ if (!ItinData || ItinData->isEmpty())
+ return -1;
+
+ if (!DefNode->isMachineOpcode())
+ return -1;
+
+ unsigned DefClass = get(DefNode->getMachineOpcode()).getSchedClass();
+ if (!UseNode->isMachineOpcode())
+ return ItinData->getOperandCycle(DefClass, DefIdx);
+ unsigned UseClass = get(UseNode->getMachineOpcode()).getSchedClass();
+ return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ SDNode *N) const {
+ if (!ItinData || ItinData->isEmpty())
+ return 1;
+
+ if (!N->isMachineOpcode())
+ return 1;
+
+ return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass());
+}
+
+//===----------------------------------------------------------------------===//
+// MachineInstr latency interface.
+//===----------------------------------------------------------------------===//
+
+unsigned TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) const {
+ if (!ItinData || ItinData->isEmpty())
+ return 1;
+
+ unsigned Class = MI.getDesc().getSchedClass();
+ int UOps = ItinData->Itineraries[Class].NumMicroOps;
+ if (UOps >= 0)
+ return UOps;
+
+ // The # of u-ops is dynamically determined. The specific target should
+ // override this function to return the right number.
+ return 1;
+}
+
+/// Return the default expected latency for a def based on its opcode.
+unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel &SchedModel,
+ const MachineInstr &DefMI) const {
+ if (DefMI.isTransient())
+ return 0;
+ if (DefMI.mayLoad())
+ return SchedModel.LoadLatency;
+ if (isHighLatencyDef(DefMI.getOpcode()))
+ return SchedModel.HighLatency;
+ return 1;
+}
+
+unsigned TargetInstrInfo::getPredicationCost(const MachineInstr &) const {
+ return 0;
+}
+
+unsigned TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI,
+ unsigned *PredCost) const {
+ // Default to one cycle for no itinerary. However, an "empty" itinerary may
+ // still have a MinLatency property, which getStageLatency checks.
+ if (!ItinData)
+ return MI.mayLoad() ? 2 : 1;
+
+ return ItinData->getStageLatency(MI.getDesc().getSchedClass());
+}
+
+bool TargetInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
+ const MachineInstr &DefMI,
+ unsigned DefIdx) const {
+ const InstrItineraryData *ItinData = SchedModel.getInstrItineraries();
+ if (!ItinData || ItinData->isEmpty())
+ return false;
+
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
+ int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+ return (DefCycle != -1 && DefCycle <= 1);
+}
+
+std::optional<ParamLoadedValue>
+TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const {
+ const MachineFunction *MF = MI.getMF();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {});
+ int64_t Offset;
+ bool OffsetIsScalable;
+
+ // To simplify the sub-register handling, verify that we only need to
+ // consider physical registers.
+ assert(MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::NoVRegs));
+
+ if (auto DestSrc = isCopyInstr(MI)) {
+ Register DestReg = DestSrc->Destination->getReg();
+
+ // If the copy destination is the forwarding reg, describe the forwarding
+ // reg using the copy source as the backup location. Example:
+ //
+ // x0 = MOV x7
+ // call callee(x0) ; x0 described as x7
+ if (Reg == DestReg)
+ return ParamLoadedValue(*DestSrc->Source, Expr);
+
+ // If the target's hook couldn't describe this copy, give up.
+ return std::nullopt;
+ } else if (auto RegImm = isAddImmediate(MI, Reg)) {
+ Register SrcReg = RegImm->Reg;
+ Offset = RegImm->Imm;
+ Expr = DIExpression::prepend(Expr, DIExpression::ApplyOffset, Offset);
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
+ } else if (MI.hasOneMemOperand()) {
+ // Only describe memory which provably does not escape the function. As
+ // described in llvm.org/PR43343, escaped memory may be clobbered by the
+ // callee (or by another thread).
+ const auto &TII = MF->getSubtarget().getInstrInfo();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ const MachineMemOperand *MMO = MI.memoperands()[0];
+ const PseudoSourceValue *PSV = MMO->getPseudoValue();
+
+ // If the address points to "special" memory (e.g. a spill slot), it's
+ // sufficient to check that it isn't aliased by any high-level IR value.
+ if (!PSV || PSV->mayAlias(&MFI))
+ return std::nullopt;
+
+ const MachineOperand *BaseOp;
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable,
+ TRI))
+ return std::nullopt;
+
+ // FIXME: Scalable offsets are not yet handled in the offset code below.
+ if (OffsetIsScalable)
+ return std::nullopt;
+
+ // TODO: Can currently only handle mem instructions with a single define.
+ // An example from the x86 target:
+ // ...
+ // DIV64m $rsp, 1, $noreg, 24, $noreg, implicit-def dead $rax, implicit-def $rdx
+ // ...
+ //
+ if (MI.getNumExplicitDefs() != 1)
+ return std::nullopt;
+
+ // TODO: In what way do we need to take Reg into consideration here?
+
+ SmallVector<uint64_t, 8> Ops;
+ DIExpression::appendOffset(Ops, Offset);
+ Ops.push_back(dwarf::DW_OP_deref_size);
+ Ops.push_back(MMO->getSize());
+ Expr = DIExpression::prependOpcodes(Expr, Ops);
+ return ParamLoadedValue(*BaseOp, Expr);
+ }
+
+ return std::nullopt;
+}
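+
+// For illustration, in the same pseudo-notation as above:
+//
+//   x0 = ADD x7, 16
+//   call callee(x0)   ; x0 described as x7 + 16 (DW_OP_plus_uconst 16)
+//
+// A parameter loaded from a non-escaping stack slot is instead described as a
+// dereference of the base register at the given offset (DW_OP_deref_size).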
+
+/// Both DefMI and UseMI must be valid. By default, call directly to the
+/// itinerary. This may be overridden by the target.
+int TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
+ unsigned UseClass = UseMI.getDesc().getSchedClass();
+ return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+bool TargetInstrInfo::getRegSequenceInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+ assert((MI.isRegSequence() ||
+ MI.isRegSequenceLike()) && "Instruction does not have the proper type");
+
+ if (!MI.isRegSequence())
+ return getRegSequenceLikeInputs(MI, DefIdx, InputRegs);
+
+ // We are looking at:
+ // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
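+ // e.g. with the illustrative operands above, InputRegs receives
+ // (v0, no subreg, sub0) and (v1, no subreg, sub1): each source register
+ // paired with the sub-register index of the lane it defines.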
+ assert(DefIdx == 0 && "REG_SEQUENCE only has one def");
+ for (unsigned OpIdx = 1, EndOpIdx = MI.getNumOperands(); OpIdx != EndOpIdx;
+ OpIdx += 2) {
+ const MachineOperand &MOReg = MI.getOperand(OpIdx);
+ if (MOReg.isUndef())
+ continue;
+ const MachineOperand &MOSubIdx = MI.getOperand(OpIdx + 1);
+ assert(MOSubIdx.isImm() &&
+ "One of the subindex of the reg_sequence is not an immediate");
+ // Record Reg:SubReg, SubIdx.
+ InputRegs.push_back(RegSubRegPairAndIdx(MOReg.getReg(), MOReg.getSubReg(),
+ (unsigned)MOSubIdx.getImm()));
+ }
+ return true;
+}
+
+bool TargetInstrInfo::getExtractSubregInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ RegSubRegPairAndIdx &InputReg) const {
+ assert((MI.isExtractSubreg() ||
+ MI.isExtractSubregLike()) && "Instruction does not have the proper type");
+
+ if (!MI.isExtractSubreg())
+ return getExtractSubregLikeInputs(MI, DefIdx, InputReg);
+
+ // We are looking at:
+ // Def = EXTRACT_SUBREG v0.sub1, sub0.
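+ // e.g. with the illustrative operands above, InputReg ends up as
+ // {Reg = v0, SubReg = sub1, SubIdx = sub0}.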
+ assert(DefIdx == 0 && "EXTRACT_SUBREG only has one def");
+ const MachineOperand &MOReg = MI.getOperand(1);
+ if (MOReg.isUndef())
+ return false;
+ const MachineOperand &MOSubIdx = MI.getOperand(2);
+ assert(MOSubIdx.isImm() &&
+ "The subindex of the extract_subreg is not an immediate");
+
+ InputReg.Reg = MOReg.getReg();
+ InputReg.SubReg = MOReg.getSubReg();
+ InputReg.SubIdx = (unsigned)MOSubIdx.getImm();
+ return true;
+}
+
+bool TargetInstrInfo::getInsertSubregInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ RegSubRegPair &BaseReg, RegSubRegPairAndIdx &InsertedReg) const {
+ assert((MI.isInsertSubreg() ||
+ MI.isInsertSubregLike()) && "Instruction does not have the proper type");
+
+ if (!MI.isInsertSubreg())
+ return getInsertSubregLikeInputs(MI, DefIdx, BaseReg, InsertedReg);
+
+ // We are looking at:
+ // Def = INSERT_SUBREG v0, v1, sub0.
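+ // e.g. with the illustrative operands above, BaseReg is filled in from v0
+ // and InsertedReg from v1 together with the insertion index sub0.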
+ assert(DefIdx == 0 && "INSERT_SUBREG only has one def");
+ const MachineOperand &MOBaseReg = MI.getOperand(1);
+ const MachineOperand &MOInsertedReg = MI.getOperand(2);
+ if (MOInsertedReg.isUndef())
+ return false;
+ const MachineOperand &MOSubIdx = MI.getOperand(3);
+ assert(MOSubIdx.isImm() &&
+ "One of the subindex of the reg_sequence is not an immediate");
+ BaseReg.Reg = MOBaseReg.getReg();
+ BaseReg.SubReg = MOBaseReg.getSubReg();
+
+ InsertedReg.Reg = MOInsertedReg.getReg();
+ InsertedReg.SubReg = MOInsertedReg.getSubReg();
+ InsertedReg.SubIdx = (unsigned)MOSubIdx.getImm();
+ return true;
+}
+
+// Returns a MIRPrinter comment for this machine operand.
+std::string TargetInstrInfo::createMIROperandComment(
+ const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx,
+ const TargetRegisterInfo *TRI) const {
+
+ if (!MI.isInlineAsm())
+ return "";
+
+ std::string Flags;
+ raw_string_ostream OS(Flags);
+
+ if (OpIdx == InlineAsm::MIOp_ExtraInfo) {
+ // Print HasSideEffects, MayLoad, MayStore, IsAlignStack
+ unsigned ExtraInfo = Op.getImm();
+ bool First = true;
+ for (StringRef Info : InlineAsm::getExtraInfoNames(ExtraInfo)) {
+ if (!First)
+ OS << " ";
+ First = false;
+ OS << Info;
+ }
+
+ return OS.str();
+ }
+
+ int FlagIdx = MI.findInlineAsmFlagIdx(OpIdx);
+ if (FlagIdx < 0 || (unsigned)FlagIdx != OpIdx)
+ return "";
+
+ assert(Op.isImm() && "Expected flag operand to be an immediate");
+ // Pretty print the inline asm operand descriptor.
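+ // For illustration only: a register-class-constrained def may be rendered
+ // roughly as "regdef:GR32" and a tied register use roughly as
+ // "reguse tiedto:$0"; the exact strings depend on InlineAsm::getKindName
+ // and the target's register-class names.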
+ unsigned Flag = Op.getImm();
+ unsigned Kind = InlineAsm::getKind(Flag);
+ OS << InlineAsm::getKindName(Kind);
+
+ unsigned RCID = 0;
+ if (!InlineAsm::isImmKind(Flag) && !InlineAsm::isMemKind(Flag) &&
+ InlineAsm::hasRegClassConstraint(Flag, RCID)) {
+ if (TRI) {
+ OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID));
+ } else
+ OS << ":RC" << RCID;
+ }
+
+ if (InlineAsm::isMemKind(Flag)) {
+ unsigned MCID = InlineAsm::getMemoryConstraintID(Flag);
+ OS << ":" << InlineAsm::getMemConstraintName(MCID);
+ }
+
+ unsigned TiedTo = 0;
+ if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo))
+ OS << " tiedto:$" << TiedTo;
+
+ return OS.str();
+}
+
+TargetInstrInfo::PipelinerLoopInfo::~PipelinerLoopInfo() = default;
+
+void TargetInstrInfo::mergeOutliningCandidateAttributes(
+ Function &F, std::vector<outliner::Candidate> &Candidates) const {
+ // Include target features from an arbitrary candidate for the outlined
+ // function. This makes sure the outlined function knows what kinds of
+ // instructions are going into it. This is fine, since all parent functions
+ // must necessarily support the instructions that are in the outlined region.
+ outliner::Candidate &FirstCand = Candidates.front();
+ const Function &ParentFn = FirstCand.getMF()->getFunction();
+ if (ParentFn.hasFnAttribute("target-features"))
+ F.addFnAttr(ParentFn.getFnAttribute("target-features"));
+ if (ParentFn.hasFnAttribute("target-cpu"))
+ F.addFnAttr(ParentFn.getFnAttribute("target-cpu"));
+
+ // Set nounwind, so we don't generate eh_frame.
+ if (llvm::all_of(Candidates, [](const outliner::Candidate &C) {
+ return C.getMF()->getFunction().hasFnAttribute(Attribute::NoUnwind);
+ }))
+ F.addFnAttr(Attribute::NoUnwind);
+}
+
+outliner::InstrType TargetInstrInfo::getOutliningType(
+ MachineBasicBlock::iterator &MIT, unsigned Flags) const {
+ MachineInstr &MI = *MIT;
+
+ // NOTE: MI.isMetaInstruction() will match CFI_INSTRUCTION, but some targets
+ // have support for outlining those. Special-case that here.
+ if (MI.isCFIInstruction())
+ // Just go right to the target implementation.
+ return getOutliningTypeImpl(MIT, Flags);
+
+ // Be conservative about inline assembly.
+ if (MI.isInlineAsm())
+ return outliner::InstrType::Illegal;
+
+ // Labels generally can't safely be outlined.
+ if (MI.isLabel())
+ return outliner::InstrType::Illegal;
+
+ // Don't let debug instructions impact analysis.
+ if (MI.isDebugInstr())
+ return outliner::InstrType::Invisible;
+
+ // Some other special cases.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::LIFETIME_START:
+ case TargetOpcode::LIFETIME_END:
+ return outliner::InstrType::Invisible;
+ default:
+ break;
+ }
+
+ // Is this a terminator for a basic block?
+ if (MI.isTerminator()) {
+ // If this is a branch to another block, we can't outline it.
+ if (!MI.getParent()->succ_empty())
+ return outliner::InstrType::Illegal;
+
+ // Don't outline if the branch is not unconditional.
+ if (isPredicated(MI))
+ return outliner::InstrType::Illegal;
+ }
+
+ // Make sure none of the operands of this instruction do anything that
+ // might break if they're moved outside their current function.
+ // This includes MachineBasicBlock references, BlockAddresses,
+ // Constant pool indices and jump table indices.
+ //
+ // A quick note on MO_TargetIndex:
+ // This doesn't seem to be used in any of the architectures that the
+ // MachineOutliner supports, yet nearly all of them still filtered it out.
+ // The one exception was RISC-V, where MO_TargetIndex isn't used either.
+ // As such, this check is removed both here and in the target-specific
+ // implementations. Instead, we assert to make sure this doesn't
+ // catch anyone off-guard somewhere down the line.
+ for (const MachineOperand &MOP : MI.operands()) {
+ // If you hit this assertion, please remove it and adjust
+ // `getOutliningTypeImpl` for your target appropriately if necessary.
+ // Adding the assertion back to other supported architectures
+ // would be nice too :)
+ assert(!MOP.isTargetIndex() && "This isn't used quite yet!");
+
+ // CFI instructions should already have been filtered out at this point.
+ assert(!MOP.isCFIIndex() && "CFI instructions handled elsewhere!");
+
+ // PrologEpilogInserter should've already run at this point.
+ assert(!MOP.isFI() && "FrameIndex instructions should be gone by now!");
+
+ if (MOP.isMBB() || MOP.isBlockAddress() || MOP.isCPI() || MOP.isJTI())
+ return outliner::InstrType::Illegal;
+ }
+
+ // If we don't know, delegate to the target-specific hook.
+ return getOutliningTypeImpl(MIT, Flags);
+}
+
+bool TargetInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
+ unsigned &Flags) const {
+ // Some instrumentations create a special TargetOpcode at the start which
+ // expands to a special code sequence that must be present.
+ auto First = MBB.getFirstNonDebugInstr();
+ if (First == MBB.end())
+ return true;
+
+ if (First->getOpcode() == TargetOpcode::FENTRY_CALL ||
+ First->getOpcode() == TargetOpcode::PATCHABLE_FUNCTION_ENTER)
+ return false;
+
+ // Some instrumentations create special pseudo-instructions at or just before
+ // the end that must be present.
+ auto Last = MBB.getLastNonDebugInstr();
+ if (Last->getOpcode() == TargetOpcode::PATCHABLE_RET ||
+ Last->getOpcode() == TargetOpcode::PATCHABLE_TAIL_CALL)
+ return false;
+
+ if (Last != First && Last->isReturn()) {
+ --Last;
+ if (Last->getOpcode() == TargetOpcode::PATCHABLE_FUNCTION_EXIT ||
+ Last->getOpcode() == TargetOpcode::PATCHABLE_TAIL_CALL)
+ return false;
+ }
+ return true;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringBase.cpp
new file mode 100644
index 000000000000..10c54560da5a
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -0,0 +1,2405 @@
+//===- TargetLoweringBase.cpp - Implement the TargetLoweringBase class ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetLoweringBase class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+static cl::opt<bool> JumpIsExpensiveOverride(
+ "jump-is-expensive", cl::init(false),
+ cl::desc("Do not create extra branches to split comparison logic."),
+ cl::Hidden);
+
+static cl::opt<unsigned> MinimumJumpTableEntries
+ ("min-jump-table-entries", cl::init(4), cl::Hidden,
+ cl::desc("Set minimum number of entries to use a jump table."));
+
+static cl::opt<unsigned> MaximumJumpTableSize
+ ("max-jump-table-size", cl::init(UINT_MAX), cl::Hidden,
+ cl::desc("Set maximum size of jump tables."));
+
+/// Minimum jump table density for normal functions.
+static cl::opt<unsigned>
+ JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
+ cl::desc("Minimum density for building a jump table in "
+ "a normal function"));
+
+/// Minimum jump table density for -Os or -Oz functions.
+static cl::opt<unsigned> OptsizeJumpTableDensity(
+ "optsize-jump-table-density", cl::init(40), cl::Hidden,
+ cl::desc("Minimum density for building a jump table in "
+ "an optsize function"));
+
+// FIXME: This option only exists to test whether strict FP operations are
+// processed correctly, by preventing strict FP operations from being mutated
+// into normal FP operations during development. Once the backend supports
+// strict FP operations, this option will be meaningless.
+static cl::opt<bool> DisableStrictNodeMutation("disable-strictnode-mutation",
+ cl::desc("Don't mutate strict-float node to a legalize node"),
+ cl::init(false), cl::Hidden);
+
+static bool darwinHasSinCos(const Triple &TT) {
+ assert(TT.isOSDarwin() && "should be called with darwin triple");
+ // Don't bother with 32 bit x86.
+ if (TT.getArch() == Triple::x86)
+ return false;
+ // macOS < 10.9 has no sincos_stret.
+ if (TT.isMacOSX())
+ return !TT.isMacOSXVersionLT(10, 9) && TT.isArch64Bit();
+ // iOS < 7.0 has no sincos_stret.
+ if (TT.isiOS())
+ return !TT.isOSVersionLT(7, 0);
+ // Any other Darwin platform, such as watchOS or tvOS, is new enough.
+ return true;
+}
+
+void TargetLoweringBase::InitLibcalls(const Triple &TT) {
+#define HANDLE_LIBCALL(code, name) \
+ setLibcallName(RTLIB::code, name);
+#include "llvm/IR/RuntimeLibcalls.def"
+#undef HANDLE_LIBCALL
+ // Initialize calling conventions to their default.
+ for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
+ setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);
+
+ // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf".
+ if (TT.isPPC()) {
+ setLibcallName(RTLIB::ADD_F128, "__addkf3");
+ setLibcallName(RTLIB::SUB_F128, "__subkf3");
+ setLibcallName(RTLIB::MUL_F128, "__mulkf3");
+ setLibcallName(RTLIB::DIV_F128, "__divkf3");
+ setLibcallName(RTLIB::POWI_F128, "__powikf2");
+ setLibcallName(RTLIB::FPEXT_F32_F128, "__extendsfkf2");
+ setLibcallName(RTLIB::FPEXT_F64_F128, "__extenddfkf2");
+ setLibcallName(RTLIB::FPROUND_F128_F32, "__trunckfsf2");
+ setLibcallName(RTLIB::FPROUND_F128_F64, "__trunckfdf2");
+ setLibcallName(RTLIB::FPTOSINT_F128_I32, "__fixkfsi");
+ setLibcallName(RTLIB::FPTOSINT_F128_I64, "__fixkfdi");
+ setLibcallName(RTLIB::FPTOSINT_F128_I128, "__fixkfti");
+ setLibcallName(RTLIB::FPTOUINT_F128_I32, "__fixunskfsi");
+ setLibcallName(RTLIB::FPTOUINT_F128_I64, "__fixunskfdi");
+ setLibcallName(RTLIB::FPTOUINT_F128_I128, "__fixunskfti");
+ setLibcallName(RTLIB::SINTTOFP_I32_F128, "__floatsikf");
+ setLibcallName(RTLIB::SINTTOFP_I64_F128, "__floatdikf");
+ setLibcallName(RTLIB::SINTTOFP_I128_F128, "__floattikf");
+ setLibcallName(RTLIB::UINTTOFP_I32_F128, "__floatunsikf");
+ setLibcallName(RTLIB::UINTTOFP_I64_F128, "__floatundikf");
+ setLibcallName(RTLIB::UINTTOFP_I128_F128, "__floatuntikf");
+ setLibcallName(RTLIB::OEQ_F128, "__eqkf2");
+ setLibcallName(RTLIB::UNE_F128, "__nekf2");
+ setLibcallName(RTLIB::OGE_F128, "__gekf2");
+ setLibcallName(RTLIB::OLT_F128, "__ltkf2");
+ setLibcallName(RTLIB::OLE_F128, "__lekf2");
+ setLibcallName(RTLIB::OGT_F128, "__gtkf2");
+ setLibcallName(RTLIB::UO_F128, "__unordkf2");
+ }
+
+ // A few names are different on particular architectures or environments.
+ if (TT.isOSDarwin()) {
+ // For f16/f32 conversions, Darwin uses the standard naming scheme, instead
+ // of the gnueabi-style __gnu_*_ieee.
+ // FIXME: What about other targets?
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+
+ // Some Darwin versions have an optimized __bzero/bzero function.
+ switch (TT.getArch()) {
+ case Triple::x86:
+ case Triple::x86_64:
+ if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
+ setLibcallName(RTLIB::BZERO, "__bzero");
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_32:
+ setLibcallName(RTLIB::BZERO, "bzero");
+ break;
+ default:
+ break;
+ }
+
+ if (darwinHasSinCos(TT)) {
+ setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret");
+ setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret");
+ if (TT.isWatchABI()) {
+ setLibcallCallingConv(RTLIB::SINCOS_STRET_F32,
+ CallingConv::ARM_AAPCS_VFP);
+ setLibcallCallingConv(RTLIB::SINCOS_STRET_F64,
+ CallingConv::ARM_AAPCS_VFP);
+ }
+ }
+ } else {
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee");
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee");
+ }
+
+ if (TT.isGNUEnvironment() || TT.isOSFuchsia() ||
+ (TT.isAndroid() && !TT.isAndroidVersionLT(9))) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ setLibcallName(RTLIB::SINCOS_F80, "sincosl");
+ setLibcallName(RTLIB::SINCOS_F128, "sincosl");
+ setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl");
+ }
+
+ if (TT.isPS()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ }
+
+ if (TT.isOSOpenBSD()) {
+ setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);
+ }
+
+ if (TT.isOSWindows() && !TT.isOSCygMing()) {
+ setLibcallName(RTLIB::LDEXP_F32, nullptr);
+ setLibcallName(RTLIB::LDEXP_F80, nullptr);
+ setLibcallName(RTLIB::LDEXP_F128, nullptr);
+ setLibcallName(RTLIB::LDEXP_PPCF128, nullptr);
+
+ setLibcallName(RTLIB::FREXP_F32, nullptr);
+ setLibcallName(RTLIB::FREXP_F80, nullptr);
+ setLibcallName(RTLIB::FREXP_F128, nullptr);
+ setLibcallName(RTLIB::FREXP_PPCF128, nullptr);
+ }
+}
+
+/// GetFPLibCall - Helper to return the right libcall for the given floating
+/// point type, or UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPLibCall(EVT VT,
+ RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128) {
+ return
+ VT == MVT::f32 ? Call_F32 :
+ VT == MVT::f64 ? Call_F64 :
+ VT == MVT::f80 ? Call_F80 :
+ VT == MVT::f128 ? Call_F128 :
+ VT == MVT::ppcf128 ? Call_PPCF128 :
+ RTLIB::UNKNOWN_LIBCALL;
+}
+
+/// getFPEXT - Return the FPEXT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f16) {
+ if (RetVT == MVT::f32)
+ return FPEXT_F16_F32;
+ if (RetVT == MVT::f64)
+ return FPEXT_F16_F64;
+ if (RetVT == MVT::f80)
+ return FPEXT_F16_F80;
+ if (RetVT == MVT::f128)
+ return FPEXT_F16_F128;
+ } else if (OpVT == MVT::f32) {
+ if (RetVT == MVT::f64)
+ return FPEXT_F32_F64;
+ if (RetVT == MVT::f128)
+ return FPEXT_F32_F128;
+ if (RetVT == MVT::ppcf128)
+ return FPEXT_F32_PPCF128;
+ } else if (OpVT == MVT::f64) {
+ if (RetVT == MVT::f128)
+ return FPEXT_F64_F128;
+ else if (RetVT == MVT::ppcf128)
+ return FPEXT_F64_PPCF128;
+ } else if (OpVT == MVT::f80) {
+ if (RetVT == MVT::f128)
+ return FPEXT_F80_F128;
+ }
+
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPROUND - Return the FPROUND_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) {
+ if (RetVT == MVT::f16) {
+ if (OpVT == MVT::f32)
+ return FPROUND_F32_F16;
+ if (OpVT == MVT::f64)
+ return FPROUND_F64_F16;
+ if (OpVT == MVT::f80)
+ return FPROUND_F80_F16;
+ if (OpVT == MVT::f128)
+ return FPROUND_F128_F16;
+ if (OpVT == MVT::ppcf128)
+ return FPROUND_PPCF128_F16;
+ } else if (RetVT == MVT::bf16) {
+ if (OpVT == MVT::f32)
+ return FPROUND_F32_BF16;
+ if (OpVT == MVT::f64)
+ return FPROUND_F64_BF16;
+ } else if (RetVT == MVT::f32) {
+ if (OpVT == MVT::f64)
+ return FPROUND_F64_F32;
+ if (OpVT == MVT::f80)
+ return FPROUND_F80_F32;
+ if (OpVT == MVT::f128)
+ return FPROUND_F128_F32;
+ if (OpVT == MVT::ppcf128)
+ return FPROUND_PPCF128_F32;
+ } else if (RetVT == MVT::f64) {
+ if (OpVT == MVT::f80)
+ return FPROUND_F80_F64;
+ if (OpVT == MVT::f128)
+ return FPROUND_F128_F64;
+ if (OpVT == MVT::ppcf128)
+ return FPROUND_PPCF128_F64;
+ } else if (RetVT == MVT::f80) {
+ if (OpVT == MVT::f128)
+ return FPROUND_F128_F80;
+ }
+
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f16) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F16_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F16_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F16_I128;
+ } else if (OpVT == MVT::f32) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F32_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F32_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F32_I128;
+ } else if (OpVT == MVT::f64) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F64_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F64_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F64_I128;
+ } else if (OpVT == MVT::f80) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F80_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F80_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F80_I128;
+ } else if (OpVT == MVT::f128) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F128_I128;
+ } else if (OpVT == MVT::ppcf128) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_PPCF128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_PPCF128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_PPCF128_I128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f16) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F16_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F16_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F16_I128;
+ } else if (OpVT == MVT::f32) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F32_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F32_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F32_I128;
+ } else if (OpVT == MVT::f64) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F64_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F64_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F64_I128;
+ } else if (OpVT == MVT::f80) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F80_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F80_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F80_I128;
+ } else if (OpVT == MVT::f128) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F128_I128;
+ } else if (OpVT == MVT::ppcf128) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_PPCF128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_PPCF128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_PPCF128_I128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::i32) {
+ if (RetVT == MVT::f16)
+ return SINTTOFP_I32_F16;
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I32_F32;
+ if (RetVT == MVT::f64)
+ return SINTTOFP_I32_F64;
+ if (RetVT == MVT::f80)
+ return SINTTOFP_I32_F80;
+ if (RetVT == MVT::f128)
+ return SINTTOFP_I32_F128;
+ if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I32_PPCF128;
+ } else if (OpVT == MVT::i64) {
+ if (RetVT == MVT::f16)
+ return SINTTOFP_I64_F16;
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I64_F32;
+ if (RetVT == MVT::f64)
+ return SINTTOFP_I64_F64;
+ if (RetVT == MVT::f80)
+ return SINTTOFP_I64_F80;
+ if (RetVT == MVT::f128)
+ return SINTTOFP_I64_F128;
+ if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I64_PPCF128;
+ } else if (OpVT == MVT::i128) {
+ if (RetVT == MVT::f16)
+ return SINTTOFP_I128_F16;
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I128_F32;
+ if (RetVT == MVT::f64)
+ return SINTTOFP_I128_F64;
+ if (RetVT == MVT::f80)
+ return SINTTOFP_I128_F80;
+ if (RetVT == MVT::f128)
+ return SINTTOFP_I128_F128;
+ if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I128_PPCF128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::i32) {
+ if (RetVT == MVT::f16)
+ return UINTTOFP_I32_F16;
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I32_F32;
+ if (RetVT == MVT::f64)
+ return UINTTOFP_I32_F64;
+ if (RetVT == MVT::f80)
+ return UINTTOFP_I32_F80;
+ if (RetVT == MVT::f128)
+ return UINTTOFP_I32_F128;
+ if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I32_PPCF128;
+ } else if (OpVT == MVT::i64) {
+ if (RetVT == MVT::f16)
+ return UINTTOFP_I64_F16;
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I64_F32;
+ if (RetVT == MVT::f64)
+ return UINTTOFP_I64_F64;
+ if (RetVT == MVT::f80)
+ return UINTTOFP_I64_F80;
+ if (RetVT == MVT::f128)
+ return UINTTOFP_I64_F128;
+ if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I64_PPCF128;
+ } else if (OpVT == MVT::i128) {
+ if (RetVT == MVT::f16)
+ return UINTTOFP_I128_F16;
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I128_F32;
+ if (RetVT == MVT::f64)
+ return UINTTOFP_I128_F64;
+ if (RetVT == MVT::f80)
+ return UINTTOFP_I128_F80;
+ if (RetVT == MVT::f128)
+ return UINTTOFP_I128_F128;
+ if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I128_PPCF128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+RTLIB::Libcall RTLIB::getPOWI(EVT RetVT) {
+ return getFPLibCall(RetVT, POWI_F32, POWI_F64, POWI_F80, POWI_F128,
+ POWI_PPCF128);
+}
+
+RTLIB::Libcall RTLIB::getLDEXP(EVT RetVT) {
+ return getFPLibCall(RetVT, LDEXP_F32, LDEXP_F64, LDEXP_F80, LDEXP_F128,
+ LDEXP_PPCF128);
+}
+
+RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) {
+ return getFPLibCall(RetVT, FREXP_F32, FREXP_F64, FREXP_F80, FREXP_F128,
+ FREXP_PPCF128);
+}
+
+RTLIB::Libcall RTLIB::getOUTLINE_ATOMIC(unsigned Opc, AtomicOrdering Order,
+ MVT VT) {
+ unsigned ModeN, ModelN;
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ ModeN = 0;
+ break;
+ case MVT::i16:
+ ModeN = 1;
+ break;
+ case MVT::i32:
+ ModeN = 2;
+ break;
+ case MVT::i64:
+ ModeN = 3;
+ break;
+ case MVT::i128:
+ ModeN = 4;
+ break;
+ default:
+ return UNKNOWN_LIBCALL;
+ }
+
+ switch (Order) {
+ case AtomicOrdering::Monotonic:
+ ModelN = 0;
+ break;
+ case AtomicOrdering::Acquire:
+ ModelN = 1;
+ break;
+ case AtomicOrdering::Release:
+ ModelN = 2;
+ break;
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ ModelN = 3;
+ break;
+ default:
+ return UNKNOWN_LIBCALL;
+ }
+
+#define LCALLS(A, B) \
+ { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
+#define LCALL5(A) \
+ LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
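+ // Expanding the macros above as an example: an i32 (ModeN == 2) acquire
+ // (ModelN == 1) ISD::ATOMIC_SWAP resolves to OUTLINE_ATOMIC_SWP4_ACQ.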
+ switch (Opc) {
+ case ISD::ATOMIC_CMP_SWAP: {
+ const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_CAS)};
+ return LC[ModeN][ModelN];
+ }
+ case ISD::ATOMIC_SWAP: {
+ const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_SWP)};
+ return LC[ModeN][ModelN];
+ }
+ case ISD::ATOMIC_LOAD_ADD: {
+ const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDADD)};
+ return LC[ModeN][ModelN];
+ }
+ case ISD::ATOMIC_LOAD_OR: {
+ const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDSET)};
+ return LC[ModeN][ModelN];
+ }
+ case ISD::ATOMIC_LOAD_CLR: {
+ const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDCLR)};
+ return LC[ModeN][ModelN];
+ }
+ case ISD::ATOMIC_LOAD_XOR: {
+ const Libcall LC[5][4] = {LCALL5(OUTLINE_ATOMIC_LDEOR)};
+ return LC[ModeN][ModelN];
+ }
+ default:
+ return UNKNOWN_LIBCALL;
+ }
+#undef LCALLS
+#undef LCALL5
+}
+
+RTLIB::Libcall RTLIB::getSYNC(unsigned Opc, MVT VT) {
+#define OP_TO_LIBCALL(Name, Enum) \
+ case Name: \
+ switch (VT.SimpleTy) { \
+ default: \
+ return UNKNOWN_LIBCALL; \
+ case MVT::i8: \
+ return Enum##_1; \
+ case MVT::i16: \
+ return Enum##_2; \
+ case MVT::i32: \
+ return Enum##_4; \
+ case MVT::i64: \
+ return Enum##_8; \
+ case MVT::i128: \
+ return Enum##_16; \
+ }
+
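+ // For example, ISD::ATOMIC_LOAD_ADD on MVT::i32 maps to
+ // SYNC_FETCH_AND_ADD_4 (typically lowered to __sync_fetch_and_add_4).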
+ switch (Opc) {
+ OP_TO_LIBCALL(ISD::ATOMIC_SWAP, SYNC_LOCK_TEST_AND_SET)
+ OP_TO_LIBCALL(ISD::ATOMIC_CMP_SWAP, SYNC_VAL_COMPARE_AND_SWAP)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_ADD, SYNC_FETCH_AND_ADD)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_SUB, SYNC_FETCH_AND_SUB)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_AND, SYNC_FETCH_AND_AND)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_OR, SYNC_FETCH_AND_OR)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_XOR, SYNC_FETCH_AND_XOR)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_NAND, SYNC_FETCH_AND_NAND)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_MAX, SYNC_FETCH_AND_MAX)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_UMAX, SYNC_FETCH_AND_UMAX)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_MIN, SYNC_FETCH_AND_MIN)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_UMIN, SYNC_FETCH_AND_UMIN)
+ }
+
+#undef OP_TO_LIBCALL
+
+ return UNKNOWN_LIBCALL;
+}
+
+RTLIB::Libcall RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
+ switch (ElementSize) {
+ case 1:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_1;
+ case 2:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_2;
+ case 4:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_4;
+ case 8:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_8;
+ case 16:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_16;
+ default:
+ return UNKNOWN_LIBCALL;
+ }
+}
+
+RTLIB::Libcall RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
+ switch (ElementSize) {
+ case 1:
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1;
+ case 2:
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2;
+ case 4:
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4;
+ case 8:
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8;
+ case 16:
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16;
+ default:
+ return UNKNOWN_LIBCALL;
+ }
+}
+
+RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
+ switch (ElementSize) {
+ case 1:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_1;
+ case 2:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_2;
+ case 4:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_4;
+ case 8:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_8;
+ case 16:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_16;
+ default:
+ return UNKNOWN_LIBCALL;
+ }
+}
+
+/// InitCmpLibcallCCs - Set default comparison libcall CC.
+static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
+ std::fill(CCs, CCs + RTLIB::UNKNOWN_LIBCALL, ISD::SETCC_INVALID);
+ CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_F128] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ;
+ CCs[RTLIB::UNE_F32] = ISD::SETNE;
+ CCs[RTLIB::UNE_F64] = ISD::SETNE;
+ CCs[RTLIB::UNE_F128] = ISD::SETNE;
+ CCs[RTLIB::UNE_PPCF128] = ISD::SETNE;
+ CCs[RTLIB::OGE_F32] = ISD::SETGE;
+ CCs[RTLIB::OGE_F64] = ISD::SETGE;
+ CCs[RTLIB::OGE_F128] = ISD::SETGE;
+ CCs[RTLIB::OGE_PPCF128] = ISD::SETGE;
+ CCs[RTLIB::OLT_F32] = ISD::SETLT;
+ CCs[RTLIB::OLT_F64] = ISD::SETLT;
+ CCs[RTLIB::OLT_F128] = ISD::SETLT;
+ CCs[RTLIB::OLT_PPCF128] = ISD::SETLT;
+ CCs[RTLIB::OLE_F32] = ISD::SETLE;
+ CCs[RTLIB::OLE_F64] = ISD::SETLE;
+ CCs[RTLIB::OLE_F128] = ISD::SETLE;
+ CCs[RTLIB::OLE_PPCF128] = ISD::SETLE;
+ CCs[RTLIB::OGT_F32] = ISD::SETGT;
+ CCs[RTLIB::OGT_F64] = ISD::SETGT;
+ CCs[RTLIB::OGT_F128] = ISD::SETGT;
+ CCs[RTLIB::OGT_PPCF128] = ISD::SETGT;
+ CCs[RTLIB::UO_F32] = ISD::SETNE;
+ CCs[RTLIB::UO_F64] = ISD::SETNE;
+ CCs[RTLIB::UO_F128] = ISD::SETNE;
+ CCs[RTLIB::UO_PPCF128] = ISD::SETNE;
+}
+
+/// NOTE: The TargetMachine owns TLOF.
+TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
+ initActions();
+
+ // Perform these initializations only once.
+ MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
+ MaxLoadsPerMemcmp = 8;
+ MaxGluedStoresPerMemcpy = 0;
+ MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
+ MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
+ HasMultipleConditionRegisters = false;
+ HasExtractBitsInsn = false;
+ JumpIsExpensive = JumpIsExpensiveOverride;
+ PredictableSelectIsExpensive = false;
+ EnableExtLdPromotion = false;
+ StackPointerRegisterToSaveRestore = 0;
+ BooleanContents = UndefinedBooleanContent;
+ BooleanFloatContents = UndefinedBooleanContent;
+ BooleanVectorContents = UndefinedBooleanContent;
+ SchedPreferenceInfo = Sched::ILP;
+ GatherAllAliasesMaxDepth = 18;
+ IsStrictFPEnabled = DisableStrictNodeMutation;
+ MaxBytesForAlignment = 0;
+ // TODO: the default will be switched to 0 in the next commit, along
+ // with the Target-specific changes necessary.
+ MaxAtomicSizeInBitsSupported = 1024;
+
+ // Assume that even with libcalls, no target supports wider than 128 bit
+ // division.
+ MaxDivRemBitWidthSupported = 128;
+
+ MaxLargeFPConvertBitWidthSupported = llvm::IntegerType::MAX_INT_BITS;
+
+ MinCmpXchgSizeInBits = 0;
+ SupportsUnalignedAtomics = false;
+
+ std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr);
+
+ InitLibcalls(TM.getTargetTriple());
+ InitCmpLibcallCCs(CmpLibcallCCs);
+}
+
+void TargetLoweringBase::initActions() {
+ // All operations default to being supported.
+ memset(OpActions, 0, sizeof(OpActions));
+ memset(LoadExtActions, 0, sizeof(LoadExtActions));
+ memset(TruncStoreActions, 0, sizeof(TruncStoreActions));
+ memset(IndexedModeActions, 0, sizeof(IndexedModeActions));
+ memset(CondCodeActions, 0, sizeof(CondCodeActions));
+ std::fill(std::begin(RegClassForVT), std::end(RegClassForVT), nullptr);
+ std::fill(std::begin(TargetDAGCombineArray),
+ std::end(TargetDAGCombineArray), 0);
+
+ // We're somewhat special-casing MVT::i2 and MVT::i4. Ideally we would
+ // remove this, and targets would individually set these types if not legal.
+ for (ISD::NodeType NT : enum_seq(ISD::DELETED_NODE, ISD::BUILTIN_OP_END,
+ force_iteration_on_noniterable_enum)) {
+ for (MVT VT : {MVT::i2, MVT::i4})
+ OpActions[(unsigned)VT.SimpleTy][NT] = Expand;
+ }
+ for (MVT AVT : MVT::all_valuetypes()) {
+ for (MVT VT : {MVT::i2, MVT::i4, MVT::v128i2, MVT::v64i4}) {
+ setTruncStoreAction(AVT, VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, AVT, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, AVT, VT, Expand);
+ }
+ }
+ for (unsigned IM = (unsigned)ISD::PRE_INC;
+ IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
+ for (MVT VT : {MVT::i2, MVT::i4}) {
+ setIndexedLoadAction(IM, VT, Expand);
+ setIndexedStoreAction(IM, VT, Expand);
+ setIndexedMaskedLoadAction(IM, VT, Expand);
+ setIndexedMaskedStoreAction(IM, VT, Expand);
+ }
+ }
+
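+ // Promote ATOMIC_SWAP on floating-point types to the integer type of the
+ // same bit width, when one exists (e.g. an f32 swap is handled as i32).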
+ for (MVT VT : MVT::fp_valuetypes()) {
+ MVT IntVT = MVT::getIntegerVT(VT.getFixedSizeInBits());
+ if (IntVT.isValid()) {
+ setOperationAction(ISD::ATOMIC_SWAP, VT, Promote);
+ AddPromotedToType(ISD::ATOMIC_SWAP, VT, IntVT);
+ }
+ }
+
+ // Set default actions for various operations.
+ for (MVT VT : MVT::all_valuetypes()) {
+ // Default all indexed load / store to expand.
+ for (unsigned IM = (unsigned)ISD::PRE_INC;
+ IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
+ setIndexedLoadAction(IM, VT, Expand);
+ setIndexedStoreAction(IM, VT, Expand);
+ setIndexedMaskedLoadAction(IM, VT, Expand);
+ setIndexedMaskedStoreAction(IM, VT, Expand);
+ }
+
+ // Most backends expect to see the node which just returns the value loaded.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
+
+ // These operations default to expand.
+ setOperationAction({ISD::FGETSIGN, ISD::CONCAT_VECTORS,
+ ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
+ ISD::FMINIMUM, ISD::FMAXIMUM,
+ ISD::FMAD, ISD::SMIN,
+ ISD::SMAX, ISD::UMIN,
+ ISD::UMAX, ISD::ABS,
+ ISD::FSHL, ISD::FSHR,
+ ISD::SADDSAT, ISD::UADDSAT,
+ ISD::SSUBSAT, ISD::USUBSAT,
+ ISD::SSHLSAT, ISD::USHLSAT,
+ ISD::SMULFIX, ISD::SMULFIXSAT,
+ ISD::UMULFIX, ISD::UMULFIXSAT,
+ ISD::SDIVFIX, ISD::SDIVFIXSAT,
+ ISD::UDIVFIX, ISD::UDIVFIXSAT,
+ ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
+ ISD::IS_FPCLASS},
+ VT, Expand);
+
+ // Overflow operations default to expand
+ setOperationAction({ISD::SADDO, ISD::SSUBO, ISD::UADDO, ISD::USUBO,
+ ISD::SMULO, ISD::UMULO},
+ VT, Expand);
+
+ // Carry-using overflow operations default to expand.
+ setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY, ISD::SETCCCARRY,
+ ISD::SADDO_CARRY, ISD::SSUBO_CARRY},
+ VT, Expand);
+
+ // ADDC/ADDE/SUBC/SUBE default to expand.
+ setOperationAction({ISD::ADDC, ISD::ADDE, ISD::SUBC, ISD::SUBE}, VT,
+ Expand);
+
+ // Halving adds
+ setOperationAction(
+ {ISD::AVGFLOORS, ISD::AVGFLOORU, ISD::AVGCEILS, ISD::AVGCEILU}, VT,
+ Expand);
+
+ // Absolute difference
+ setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Expand);
+
+ // These default to Expand so they will be expanded to CTLZ/CTTZ by default.
+ setOperationAction({ISD::CTLZ_ZERO_UNDEF, ISD::CTTZ_ZERO_UNDEF}, VT,
+ Expand);
+
+ setOperationAction({ISD::BITREVERSE, ISD::PARITY}, VT, Expand);
+
+ // These library functions default to expand.
+ setOperationAction(
+ {ISD::FROUND, ISD::FROUNDEVEN, ISD::FPOWI, ISD::FLDEXP, ISD::FFREXP},
+ VT, Expand);
+
+ // These operations default to expand for vector types.
+ if (VT.isVector())
+ setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG,
+ ISD::ANY_EXTEND_VECTOR_INREG,
+ ISD::SIGN_EXTEND_VECTOR_INREG,
+ ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR},
+ VT, Expand);
+
+ // Constrained floating-point operations default to expand.
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ setOperationAction(ISD::STRICT_##DAGN, VT, Expand);
+#include "llvm/IR/ConstrainedOps.def"
+
+ // For most targets @llvm.get.dynamic.area.offset just returns 0.
+ setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand);
+
+ // Vector reduction default to expand.
+ setOperationAction(
+ {ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMUL, ISD::VECREDUCE_ADD,
+ ISD::VECREDUCE_MUL, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR,
+ ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN,
+ ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_FMAX,
+ ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM,
+ ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_SEQ_FMUL},
+ VT, Expand);
+
+ // Named vector shuffles default to expand.
+ setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
+
+ // VP operations default to expand.
+#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) \
+ setOperationAction(ISD::SDOPC, VT, Expand);
+#include "llvm/IR/VPIntrinsics.def"
+
+ // FP environment operations default to expand.
+ setOperationAction(ISD::GET_FPENV, VT, Expand);
+ setOperationAction(ISD::SET_FPENV, VT, Expand);
+ setOperationAction(ISD::RESET_FPENV, VT, Expand);
+ }
+
+ // Most targets ignore the @llvm.prefetch intrinsic.
+ setOperationAction(ISD::PREFETCH, MVT::Other, Expand);
+
+ // Most targets also ignore the @llvm.readcyclecounter intrinsic.
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Expand);
+
+ // ConstantFP nodes default to expand. Targets can either change this to
+ // Legal, in which case all fp constants are legal, or use isFPImmLegal()
+ // to optimize expansions for certain constants.
+ setOperationAction(ISD::ConstantFP,
+ {MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128},
+ Expand);
+
+ // These library functions default to expand.
+ setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP,
+ ISD::FEXP2, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
+ ISD::FRINT, ISD::FTRUNC, ISD::LROUND, ISD::LLROUND,
+ ISD::LRINT, ISD::LLRINT},
+ {MVT::f32, MVT::f64, MVT::f128}, Expand);
+
+ // Default ISD::TRAP to expand (which turns it into abort).
+ setOperationAction(ISD::TRAP, MVT::Other, Expand);
+
+ // On most systems, DEBUGTRAP and TRAP are no different. The "Expand"
+ // here informs the DAG Legalizer to replace DEBUGTRAP with TRAP.
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
+
+ setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
+
+ setOperationAction(ISD::GET_FPENV_MEM, MVT::Other, Expand);
+ setOperationAction(ISD::SET_FPENV_MEM, MVT::Other, Expand);
+}
+
+MVT TargetLoweringBase::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT) const {
+ return MVT::getIntegerVT(DL.getPointerSizeInBits(0));
+}
+
+EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
+ bool LegalTypes) const {
+ assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
+ if (LHSTy.isVector())
+ return LHSTy;
+ MVT ShiftVT =
+ LegalTypes ? getScalarShiftAmountTy(DL, LHSTy) : getPointerTy(DL);
+ // If any possible shift value won't fit in the preferred type, just use
+ // something safe. Assume it will be legalized when the shift is expanded.
+ if (ShiftVT.getSizeInBits() < Log2_32_Ceil(LHSTy.getSizeInBits()))
+ ShiftVT = MVT::i32;
+ assert(ShiftVT.getSizeInBits() >= Log2_32_Ceil(LHSTy.getSizeInBits()) &&
+ "ShiftVT is still too small!");
+ return ShiftVT;
+}
+
+bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const {
+ assert(isTypeLegal(VT));
+ switch (Op) {
+ default:
+ return false;
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ return true;
+ }
+}
+
+bool TargetLoweringBase::isFreeAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
+}
+
+void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) {
+ // If the command-line option was specified, ignore this request.
+ if (!JumpIsExpensiveOverride.getNumOccurrences())
+ JumpIsExpensive = isExpensive;
+}
+
+TargetLoweringBase::LegalizeKind
+TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
+ // If this is a simple type, use the ComputeRegisterProp mechanism.
+ if (VT.isSimple()) {
+ MVT SVT = VT.getSimpleVT();
+ assert((unsigned)SVT.SimpleTy < std::size(TransformToType));
+ MVT NVT = TransformToType[SVT.SimpleTy];
+ LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
+
+ assert((LA == TypeLegal || LA == TypeSoftenFloat ||
+ LA == TypeSoftPromoteHalf ||
+ (NVT.isVector() ||
+ ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)) &&
+ "Promote may not follow Expand or Promote");
+
+ if (LA == TypeSplitVector)
+ return LegalizeKind(LA, EVT(SVT).getHalfNumVectorElementsVT(Context));
+ if (LA == TypeScalarizeVector)
+ return LegalizeKind(LA, SVT.getVectorElementType());
+ return LegalizeKind(LA, NVT);
+ }
+
+ // Handle Extended Scalar Types.
+ if (!VT.isVector()) {
+ assert(VT.isInteger() && "Float types must be simple");
+ unsigned BitSize = VT.getSizeInBits();
+ // First promote to a power-of-two size, then expand if necessary.
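+ // e.g. an extended type such as i33 is rounded up and promoted to i64
+ // here, while an already power-of-two type such as i256 skips this and
+ // is expanded into two i128 halves below.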
+ if (BitSize < 8 || !isPowerOf2_32(BitSize)) {
+ EVT NVT = VT.getRoundIntegerType(Context);
+ assert(NVT != VT && "Unable to round integer VT");
+ LegalizeKind NextStep = getTypeConversion(Context, NVT);
+ // Avoid multi-step promotion.
+ if (NextStep.first == TypePromoteInteger)
+ return NextStep;
+ // Return rounded integer type.
+ return LegalizeKind(TypePromoteInteger, NVT);
+ }
+
+ return LegalizeKind(TypeExpandInteger,
+ EVT::getIntegerVT(Context, VT.getSizeInBits() / 2));
+ }
+
+ // Handle vector types.
+ ElementCount NumElts = VT.getVectorElementCount();
+ EVT EltVT = VT.getVectorElementType();
+
+ // Vectors with only one element are always scalarized.
+ if (NumElts.isScalar())
+ return LegalizeKind(TypeScalarizeVector, EltVT);
+
+ // Try to widen vector elements until the element type is a power of two and
+ // promote it to a legal type later on, for example:
+ // <3 x i8> -> <4 x i8> -> <4 x i32>
+ if (EltVT.isInteger()) {
+ // Vectors with a number of elements that is not a power of two are always
+ // widened, for example <3 x i8> -> <4 x i8>.
+ if (!VT.isPow2VectorType()) {
+ NumElts = NumElts.coefficientNextPowerOf2();
+ EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
+ return LegalizeKind(TypeWidenVector, NVT);
+ }
+
+ // Examine the element type.
+ LegalizeKind LK = getTypeConversion(Context, EltVT);
+
+ // If type is to be expanded, split the vector.
+ // <4 x i140> -> <2 x i140>
+ if (LK.first == TypeExpandInteger) {
+ if (VT.getVectorElementCount().isScalable())
+ return LegalizeKind(TypeScalarizeScalableVector, EltVT);
+ return LegalizeKind(TypeSplitVector,
+ VT.getHalfNumVectorElementsVT(Context));
+ }
+
+ // Promote the integer element types until a legal vector type is found
+ // or until the element integer type is too big. If a legal type was not
+ // found, fall back to the usual mechanism of widening/splitting the
+ // vector.
+ EVT OldEltVT = EltVT;
+ while (true) {
+ // Increase the bitwidth of the element to the next pow-of-two
+ // (which is greater than 8 bits).
+ EltVT = EVT::getIntegerVT(Context, 1 + EltVT.getSizeInBits())
+ .getRoundIntegerType(Context);
+
+ // Stop trying when getting a non-simple element type.
+ // Note that vector elements may be greater than legal vector element
+ // types. Example: X86 XMM registers hold 64-bit elements on 32-bit
+ // systems.
+ if (!EltVT.isSimple())
+ break;
+
+ // Build a new vector type and check if it is legal.
+ MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+ // Found a legal promoted vector type.
+ if (NVT != MVT() && ValueTypeActions.getTypeAction(NVT) == TypeLegal)
+ return LegalizeKind(TypePromoteInteger,
+ EVT::getVectorVT(Context, EltVT, NumElts));
+ }
+
+ // Reset the type to the unexpanded type if we did not find a legal vector
+ // type with a promoted vector element type.
+ EltVT = OldEltVT;
+ }
+
+ // Try to widen the vector until a legal type is found.
+ // If there is no wider legal type, split the vector.
+ while (true) {
+ // Round up to the next power of 2.
+ NumElts = NumElts.coefficientNextPowerOf2();
+
+ // If there is no simple vector type with this many elements then there
+ // cannot be a larger legal vector type. Note that this assumes that
+ // there are no skipped intermediate vector types in the simple types.
+ if (!EltVT.isSimple())
+ break;
+ MVT LargerVector = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+ if (LargerVector == MVT())
+ break;
+
+ // If this type is legal then widen the vector.
+ if (ValueTypeActions.getTypeAction(LargerVector) == TypeLegal)
+ return LegalizeKind(TypeWidenVector, LargerVector);
+ }
+
+ // Widen odd vectors to next power of two.
+ if (!VT.isPow2VectorType()) {
+ EVT NVT = VT.getPow2VectorType(Context);
+ return LegalizeKind(TypeWidenVector, NVT);
+ }
+
+ if (VT.getVectorElementCount() == ElementCount::getScalable(1))
+ return LegalizeKind(TypeScalarizeScalableVector, EltVT);
+
+ // Vectors with illegal element types are expanded.
+ EVT NVT = EVT::getVectorVT(Context, EltVT,
+ VT.getVectorElementCount().divideCoefficientBy(2));
+ return LegalizeKind(TypeSplitVector, NVT);
+}
+
+static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT,
+ TargetLoweringBase *TLI) {
+ // Figure out the right, legal destination reg to copy into.
+ ElementCount EC = VT.getVectorElementCount();
+ MVT EltTy = VT.getVectorElementType();
+
+ unsigned NumVectorRegs = 1;
+
+ // Scalable vectors cannot be scalarized, so splitting or widening is
+ // required.
+ if (VT.isScalableVector() && !isPowerOf2_32(EC.getKnownMinValue()))
+ llvm_unreachable(
+ "Splitting or widening of non-power-of-2 MVTs is not implemented.");
+
+ // FIXME: We don't support non-power-of-2-sized vectors for now.
+ // Ideally we could break down into LHS/RHS like LegalizeDAG does.
+ if (!isPowerOf2_32(EC.getKnownMinValue())) {
+ // Split EC to unit size (scalable property is preserved).
+ NumVectorRegs = EC.getKnownMinValue();
+ EC = ElementCount::getFixed(1);
+ }
+
+ // Divide the input until we get to a supported size. This will
+ // always end up with an EC that represents a scalar or a scalable
+ // scalar.
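+ // e.g. if <8 x i32> is not legal for the target but <4 x i32> is, this
+ // loop stops with EC == 4 and NumVectorRegs == 2.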
+ while (EC.getKnownMinValue() > 1 &&
+ !TLI->isTypeLegal(MVT::getVectorVT(EltTy, EC))) {
+ EC = EC.divideCoefficientBy(2);
+ NumVectorRegs <<= 1;
+ }
+
+ NumIntermediates = NumVectorRegs;
+
+ MVT NewVT = MVT::getVectorVT(EltTy, EC);
+ if (!TLI->isTypeLegal(NewVT))
+ NewVT = EltTy;
+ IntermediateVT = NewVT;
+
+ unsigned LaneSizeInBits = NewVT.getScalarSizeInBits();
+
+ // Convert sizes such as i33 to i64.
+ LaneSizeInBits = llvm::bit_ceil(LaneSizeInBits);
+
+ MVT DestVT = TLI->getRegisterType(NewVT);
+ RegisterVT = DestVT;
+ if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16.
+ return NumVectorRegs * (LaneSizeInBits / DestVT.getScalarSizeInBits());
+
+ // Otherwise, promotion or legal types use the same number of registers as
+ // the vector decimated to the appropriate level.
+ return NumVectorRegs;
+}
+
+/// isLegalRC - Return true if the value types that can be represented by the
+/// specified register class are all legal.
+bool TargetLoweringBase::isLegalRC(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass &RC) const {
+ for (const auto *I = TRI.legalclasstypes_begin(RC); *I != MVT::Other; ++I)
+ if (isTypeLegal(*I))
+ return true;
+ return false;
+}
+
+/// Replace/modify any TargetFrameIndex operands with a target-dependent
+/// sequence of memory operands that is recognized by PrologEpilogInserter.
+MachineBasicBlock *
+TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
+ MachineBasicBlock *MBB) const {
+ MachineInstr *MI = &InitialMI;
+ MachineFunction &MF = *MI->getMF();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // We're handling multiple types of operands here:
+ // PATCHPOINT MetaArgs - live-in, read only, direct
+ // STATEPOINT Deopt Spill - live-through, read only, indirect
+ // STATEPOINT Deopt Alloca - live-through, read only, direct
+ // (We're currently conservative and mark the deopt slots read/write in
+ // practice.)
+ // STATEPOINT GC Spill - live-through, read/write, indirect
+ // STATEPOINT GC Alloca - live-through, read/write, direct
+ // The live-in vs live-through is handled already (the live through ones are
+ // all stack slots), but we need to handle the different types of stackmap
+ // operands and memory effects here.
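+ // For illustration, a statepoint spill-slot frame index is rewritten below
+ // into (IndirectMemRefOp, object size, frame index, 0), while a direct
+ // reference becomes (DirectMemRefOp, frame index, 0).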
+
+ if (llvm::none_of(MI->operands(),
+ [](MachineOperand &Operand) { return Operand.isFI(); }))
+ return MBB;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), MI->getDesc());
+
+ // Inherit previous memory operands.
+ MIB.cloneMemRefs(*MI);
+
+ for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isFI()) {
+ // Index of the Def operand this Use is tied to.
+ // Since Defs come before Uses, if a Use is tied, the index of
+ // its Def must be smaller than the index of that Use.
+ // Also, Defs preserve their position in the new MI.
+ unsigned TiedTo = i;
+ if (MO.isReg() && MO.isTied())
+ TiedTo = MI->findTiedOperandIdx(i);
+ MIB.add(MO);
+ if (TiedTo < i)
+ MIB->tieOperands(TiedTo, MIB->getNumOperands() - 1);
+ continue;
+ }
+
+ // foldMemoryOperand builds a new MI after replacing a single FI operand
+ // with the canonical set of five x86 addressing-mode operands.
+ int FI = MO.getIndex();
+
+ // Add frame index operands recognized by stackmaps.cpp
+ if (MFI.isStatepointSpillSlotObjectIndex(FI)) {
+ // indirect-mem-ref tag, size, #FI, offset.
+ // Used for spills inserted by StatepointLowering. This codepath is not
+ // used for patchpoints/stackmaps at all; for those, spilling is done via
+ // the foldMemoryOperand callback only.
+ assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity");
+ MIB.addImm(StackMaps::IndirectMemRefOp);
+ MIB.addImm(MFI.getObjectSize(FI));
+ MIB.add(MO);
+ MIB.addImm(0);
+ } else {
+ // direct-mem-ref tag, #FI, offset.
+ // Used by patchpoint, and direct alloca arguments to statepoints
+ MIB.addImm(StackMaps::DirectMemRefOp);
+ MIB.add(MO);
+ MIB.addImm(0);
+ }
+
+ assert(MIB->mayLoad() && "Folded a stackmap use to a non-load!");
+
+ // Add a new memory operand for this FI.
+ assert(MFI.getObjectOffset(FI) != -1);
+
+ // Note: STATEPOINT MMOs are added during SelectionDAG. STACKMAP and
+ // PATCHPOINT should be updated to do the same. (TODO)
+ if (MI->getOpcode() != TargetOpcode::STATEPOINT) {
+ auto Flags = MachineMemOperand::MOLoad;
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), Flags,
+ MF.getDataLayout().getPointerSize(), MFI.getObjectAlign(FI));
+ MIB->addMemOperand(MF, MMO);
+ }
+ }
+ MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+ MI->eraseFromParent();
+ return MBB;
+}
+
+/// findRepresentativeClass - Return the largest legal super-reg register class
+/// of the register class for the specified type and its associated "cost".
+// This function is in TargetLowering because it uses RegClassForVT which would
+// need to be moved to TargetRegisterInfo and would necessitate moving
+// isTypeLegal over as well - a massive change that would just require
+// TargetLowering having a TargetRegisterInfo class member that it would use.
+std::pair<const TargetRegisterClass *, uint8_t>
+TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
+ const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
+ if (!RC)
+ return std::make_pair(RC, 0);
+
+ // Compute the set of all super-register classes.
+ BitVector SuperRegRC(TRI->getNumRegClasses());
+ for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI)
+ SuperRegRC.setBitsInMask(RCI.getMask());
+
+ // Find the first legal register class with the largest spill size.
+ const TargetRegisterClass *BestRC = RC;
+ for (unsigned i : SuperRegRC.set_bits()) {
+ const TargetRegisterClass *SuperRC = TRI->getRegClass(i);
+ // We want the largest possible spill size.
+ if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC))
+ continue;
+ if (!isLegalRC(*TRI, *SuperRC))
+ continue;
+ BestRC = SuperRC;
+ }
+ return std::make_pair(BestRC, 1);
+}
+
+/// computeRegisterProperties - Once all of the register classes are added,
+/// this allows us to compute derived properties we expose.
+void TargetLoweringBase::computeRegisterProperties(
+ const TargetRegisterInfo *TRI) {
+ static_assert(MVT::VALUETYPE_SIZE <= MVT::MAX_ALLOWED_VALUETYPE,
+ "Too many value types for ValueTypeActions to hold!");
+
+ // Everything defaults to needing one register.
+ for (unsigned i = 0; i != MVT::VALUETYPE_SIZE; ++i) {
+ NumRegistersForVT[i] = 1;
+ RegisterTypeForVT[i] = TransformToType[i] = (MVT::SimpleValueType)i;
+ }
+ // ...except isVoid, which doesn't need any registers.
+ NumRegistersForVT[MVT::isVoid] = 0;
+
+ // Find the largest integer register class.
+ unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE;
+ for (; RegClassForVT[LargestIntReg] == nullptr; --LargestIntReg)
+ assert(LargestIntReg != MVT::i1 && "No integer registers defined!");
+
+ // Every integer value type larger than this largest register takes twice as
+ // many registers to represent as the previous ValueType.
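+ // e.g. if i64 is the largest legal integer type, i128 is recorded as
+ // needing twice as many registers and is expanded in terms of i64.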
+ for (unsigned ExpandedReg = LargestIntReg + 1;
+ ExpandedReg <= MVT::LAST_INTEGER_VALUETYPE; ++ExpandedReg) {
+ NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1];
+ RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg;
+ TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1);
+ ValueTypeActions.setTypeAction((MVT::SimpleValueType)ExpandedReg,
+ TypeExpandInteger);
+ }
+
+ // Inspect all of the ValueTypes smaller than the largest integer
+ // register to see which ones need promotion.
+ unsigned LegalIntReg = LargestIntReg;
+ for (unsigned IntReg = LargestIntReg - 1;
+ IntReg >= (unsigned)MVT::i1; --IntReg) {
+ MVT IVT = (MVT::SimpleValueType)IntReg;
+ if (isTypeLegal(IVT)) {
+ LegalIntReg = IntReg;
+ } else {
+ RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
+ (MVT::SimpleValueType)LegalIntReg;
+ ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
+ }
+ }
+
+ // The ppcf128 type is really two f64s.
+ if (!isTypeLegal(MVT::ppcf128)) {
+ if (isTypeLegal(MVT::f64)) {
+ NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
+ RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
+ TransformToType[MVT::ppcf128] = MVT::f64;
+ ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat);
+ } else {
+ NumRegistersForVT[MVT::ppcf128] = NumRegistersForVT[MVT::i128];
+ RegisterTypeForVT[MVT::ppcf128] = RegisterTypeForVT[MVT::i128];
+ TransformToType[MVT::ppcf128] = MVT::i128;
+ ValueTypeActions.setTypeAction(MVT::ppcf128, TypeSoftenFloat);
+ }
+ }
+
+ // Decide how to handle f128. If the target does not have native f128 support,
+ // expand it to i128 and we will be generating soft float library calls.
+ if (!isTypeLegal(MVT::f128)) {
+ NumRegistersForVT[MVT::f128] = NumRegistersForVT[MVT::i128];
+ RegisterTypeForVT[MVT::f128] = RegisterTypeForVT[MVT::i128];
+ TransformToType[MVT::f128] = MVT::i128;
+ ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+ }
+
+ // Decide how to handle f80. If the target does not have native f80 support,
+ // expand it to i96 and we will be generating soft float library calls.
+ if (!isTypeLegal(MVT::f80)) {
+ NumRegistersForVT[MVT::f80] = 3*NumRegistersForVT[MVT::i32];
+ RegisterTypeForVT[MVT::f80] = RegisterTypeForVT[MVT::i32];
+ TransformToType[MVT::f80] = MVT::i32;
+ ValueTypeActions.setTypeAction(MVT::f80, TypeSoftenFloat);
+ }
+
+ // Decide how to handle f64. If the target does not have native f64 support,
+ // expand it to i64 and we will be generating soft float library calls.
+ if (!isTypeLegal(MVT::f64)) {
+ NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64];
+ RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64];
+ TransformToType[MVT::f64] = MVT::i64;
+ ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat);
+ }
+
+ // Decide how to handle f32. If the target does not have native f32 support,
+ // expand it to i32 and we will be generating soft float library calls.
+ if (!isTypeLegal(MVT::f32)) {
+ NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32];
+ RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32];
+ TransformToType[MVT::f32] = MVT::i32;
+ ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat);
+ }
+
+ // Decide how to handle f16. If the target does not have native f16 support,
+ // promote it to f32, because there are no f16 library calls (except for
+ // conversions).
+ if (!isTypeLegal(MVT::f16)) {
+ // Allow targets to control how we legalize half.
+ if (softPromoteHalfType()) {
+ NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16];
+ RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16];
+ TransformToType[MVT::f16] = MVT::f32;
+ ValueTypeActions.setTypeAction(MVT::f16, TypeSoftPromoteHalf);
+ } else {
+ NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32];
+ RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32];
+ TransformToType[MVT::f16] = MVT::f32;
+ ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat);
+ }
+ }
+
+ // Decide how to handle bf16. If the target does not have native bf16 support,
+ // promote it to f32, because there are no bf16 library calls (except for
+ // converting from f32 to bf16).
+ if (!isTypeLegal(MVT::bf16)) {
+ NumRegistersForVT[MVT::bf16] = NumRegistersForVT[MVT::f32];
+ RegisterTypeForVT[MVT::bf16] = RegisterTypeForVT[MVT::f32];
+ TransformToType[MVT::bf16] = MVT::f32;
+ ValueTypeActions.setTypeAction(MVT::bf16, TypeSoftPromoteHalf);
+ }
+
+ // Loop over all of the vector value types to see which need transformations.
+ for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType) i;
+ if (isTypeLegal(VT))
+ continue;
+
+ MVT EltVT = VT.getVectorElementType();
+ ElementCount EC = VT.getVectorElementCount();
+ bool IsLegalWiderType = false;
+ bool IsScalable = VT.isScalableVector();
+ LegalizeTypeAction PreferredAction = getPreferredVectorAction(VT);
+ switch (PreferredAction) {
+ case TypePromoteInteger: {
+ MVT::SimpleValueType EndVT = IsScalable ?
+ MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE :
+ MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE;
+ // Try to promote the elements of integer vectors. If no legal
+ // promotion was found, fall through to the widen-vector method.
+ for (unsigned nVT = i + 1;
+ (MVT::SimpleValueType)nVT <= EndVT; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType) nVT;
+ // Promote vectors of integers to vectors with the same number
+ // of elements, with a wider element type.
+ if (SVT.getScalarSizeInBits() > EltVT.getFixedSizeInBits() &&
+ SVT.getVectorElementCount() == EC && isTypeLegal(SVT)) {
+ TransformToType[i] = SVT;
+ RegisterTypeForVT[i] = SVT;
+ NumRegistersForVT[i] = 1;
+ ValueTypeActions.setTypeAction(VT, TypePromoteInteger);
+ IsLegalWiderType = true;
+ break;
+ }
+ }
+ if (IsLegalWiderType)
+ break;
+ [[fallthrough]];
+ }
+
+ case TypeWidenVector:
+ if (isPowerOf2_32(EC.getKnownMinValue())) {
+ // Try to widen the vector.
+ for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType) nVT;
+ if (SVT.getVectorElementType() == EltVT &&
+ SVT.isScalableVector() == IsScalable &&
+ SVT.getVectorElementCount().getKnownMinValue() >
+ EC.getKnownMinValue() &&
+ isTypeLegal(SVT)) {
+ TransformToType[i] = SVT;
+ RegisterTypeForVT[i] = SVT;
+ NumRegistersForVT[i] = 1;
+ ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+ IsLegalWiderType = true;
+ break;
+ }
+ }
+ if (IsLegalWiderType)
+ break;
+ } else {
+ // Only widen to the next power of 2 to keep consistency with EVT.
+ MVT NVT = VT.getPow2VectorType();
+ if (isTypeLegal(NVT)) {
+ TransformToType[i] = NVT;
+ ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+ RegisterTypeForVT[i] = NVT;
+ NumRegistersForVT[i] = 1;
+ break;
+ }
+ }
+ [[fallthrough]];
+
+ case TypeSplitVector:
+ case TypeScalarizeVector: {
+ MVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ unsigned NumRegisters = getVectorTypeBreakdownMVT(VT, IntermediateVT,
+ NumIntermediates, RegisterVT, this);
+ NumRegistersForVT[i] = NumRegisters;
+ assert(NumRegistersForVT[i] == NumRegisters &&
+ "NumRegistersForVT size cannot represent NumRegisters!");
+ RegisterTypeForVT[i] = RegisterVT;
+
+ MVT NVT = VT.getPow2VectorType();
+ if (NVT == VT) {
+ // Type is already a power of 2. The default action is to split.
+ TransformToType[i] = MVT::Other;
+ if (PreferredAction == TypeScalarizeVector)
+ ValueTypeActions.setTypeAction(VT, TypeScalarizeVector);
+ else if (PreferredAction == TypeSplitVector)
+ ValueTypeActions.setTypeAction(VT, TypeSplitVector);
+ else if (EC.getKnownMinValue() > 1)
+ ValueTypeActions.setTypeAction(VT, TypeSplitVector);
+ else
+ ValueTypeActions.setTypeAction(VT, EC.isScalable()
+ ? TypeScalarizeScalableVector
+ : TypeScalarizeVector);
+ } else {
+ TransformToType[i] = NVT;
+ ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown vector legalization action!");
+ }
+ }
+
+ // Determine the 'representative' register class for each value type.
+ // A representative register class is the largest (meaning one which is
+ // not a sub-register class / subreg register class) legal register class for
+ // a group of value types. For example, on i386 the representative class for
+ // i8, i16, and i32 would be GR32; on x86_64 it would be GR64.
+ for (unsigned i = 0; i != MVT::VALUETYPE_SIZE; ++i) {
+ const TargetRegisterClass* RRC;
+ uint8_t Cost;
+ std::tie(RRC, Cost) = findRepresentativeClass(TRI, (MVT::SimpleValueType)i);
+ RepRegClassForVT[i] = RRC;
+ RepRegClassCostForVT[i] = Cost;
+ }
+}
+
+EVT TargetLoweringBase::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
+ assert(!VT.isVector() && "No default SetCC type for vectors!");
+ return getPointerTy(DL).SimpleTy;
+}
+
+MVT::SimpleValueType TargetLoweringBase::getCmpLibcallReturnType() const {
+ return MVT::i32; // return the default value
+}
+
+/// getVectorTypeBreakdown - Vector types are broken down into some number of
+/// legal first class types. For example, MVT::v8f32 maps to 2 MVT::v4f32
+/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack.
+/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86.
+///
+/// This method returns the number of registers needed, and the VT for each
+/// register. It also returns the VT and quantity of the intermediate values
+/// before they are promoted/expanded.
+unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context,
+ EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT) const {
+ ElementCount EltCnt = VT.getVectorElementCount();
+
+ // If there is a wider vector type with the same element type as this one,
+ // or a promoted vector type that has the same number of elements which
+ // are wider, then we should convert to that legal vector type.
+ // This handles things like <2 x float> -> <4 x float> and
+ // <4 x i1> -> <4 x i32>.
+ LegalizeTypeAction TA = getTypeAction(Context, VT);
+ if (!EltCnt.isScalar() &&
+ (TA == TypeWidenVector || TA == TypePromoteInteger)) {
+ EVT RegisterEVT = getTypeToTransformTo(Context, VT);
+ if (isTypeLegal(RegisterEVT)) {
+ IntermediateVT = RegisterEVT;
+ RegisterVT = RegisterEVT.getSimpleVT();
+ NumIntermediates = 1;
+ return 1;
+ }
+ }
+
+ // Figure out the right, legal destination reg to copy into.
+ EVT EltTy = VT.getVectorElementType();
+
+ unsigned NumVectorRegs = 1;
+
+ // Scalable vectors cannot be scalarized, so handle the legalisation of the
+ // types as is done elsewhere in SelectionDAG.
+ if (EltCnt.isScalable()) {
+ LegalizeKind LK;
+ EVT PartVT = VT;
+ do {
+ // Iterate until we've found a legal (part) type to hold VT.
+ LK = getTypeConversion(Context, PartVT);
+ PartVT = LK.second;
+ } while (LK.first != TypeLegal);
+
+ if (!PartVT.isVector()) {
+ report_fatal_error(
+ "Don't know how to legalize this scalable vector type");
+ }
+
+ NumIntermediates =
+ divideCeil(VT.getVectorElementCount().getKnownMinValue(),
+ PartVT.getVectorElementCount().getKnownMinValue());
+ IntermediateVT = PartVT;
+ RegisterVT = getRegisterType(Context, IntermediateVT);
+ return NumIntermediates;
+ }
+
+ // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally
+ // we could break down into LHS/RHS like LegalizeDAG does.
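+ // For illustration: assuming f32 is legal but <1 x f32> is not, a v3f32
+ // value is broken down here into 3 separate f32 parts.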
+ if (!isPowerOf2_32(EltCnt.getKnownMinValue())) {
+ NumVectorRegs = EltCnt.getKnownMinValue();
+ EltCnt = ElementCount::getFixed(1);
+ }
+
+ // Divide the input until we get to a supported size. This will always
+ // end with a scalar if the target doesn't support vectors.
+ while (EltCnt.getKnownMinValue() > 1 &&
+ !isTypeLegal(EVT::getVectorVT(Context, EltTy, EltCnt))) {
+ EltCnt = EltCnt.divideCoefficientBy(2);
+ NumVectorRegs <<= 1;
+ }
+
+ NumIntermediates = NumVectorRegs;
+
+ EVT NewVT = EVT::getVectorVT(Context, EltTy, EltCnt);
+ if (!isTypeLegal(NewVT))
+ NewVT = EltTy;
+ IntermediateVT = NewVT;
+
+ MVT DestVT = getRegisterType(Context, NewVT);
+ RegisterVT = DestVT;
+
+ if (EVT(DestVT).bitsLT(NewVT)) { // Value is expanded, e.g. i64 -> i16.
+ TypeSize NewVTSize = NewVT.getSizeInBits();
+ // Convert sizes such as i33 to i64.
+ if (!llvm::has_single_bit<uint32_t>(NewVTSize.getKnownMinValue()))
+ NewVTSize = NewVTSize.coefficientNextPowerOf2();
+ return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
+ }
+
+ // Otherwise, promotion or legal types use the same number of registers as
+ // the vector decimated to the appropriate level.
+ return NumVectorRegs;
+}
+
+bool TargetLoweringBase::isSuitableForJumpTable(const SwitchInst *SI,
+ uint64_t NumCases,
+ uint64_t Range,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) const {
+ // FIXME: This function checks the maximum table size and density, but the
+ // minimum size is not checked. It would be nice if the minimum size check
+ // were also folded into this function. Currently, the minimum size check is
+ // performed in findJumpTable() in SelectionDAGBuilder and
+ // getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
+ const bool OptForSize =
+ SI->getParent()->getParent()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI);
+ const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize);
+ const unsigned MaxJumpTableSize = getMaximumJumpTableSize();
+
+ // Check whether the number of cases is small enough and
+ // the range is dense enough for a jump table.
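+ // Worked example with hypothetical numbers: with MinDensity = 40, a switch
+ // with 4 cases spread over a range of 101 values gives 4 * 100 = 400, which
+ // is below 101 * 40 = 4040, so the switch is too sparse for a jump table.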
+ return (OptForSize || Range <= MaxJumpTableSize) &&
+ (NumCases * 100 >= Range * MinDensity);
+}
+
+MVT TargetLoweringBase::getPreferredSwitchConditionType(LLVMContext &Context,
+ EVT ConditionVT) const {
+ return getRegisterType(Context, ConditionVT);
+}
+
+/// Get the EVTs and ArgFlags collections that represent the legalized return
+/// type of the given function. This does not require a DAG or a return value,
+/// and is suitable for use before any DAGs for the function are constructed.
+/// TODO: Move this out of TargetLowering.cpp.
+void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
+ AttributeList attr,
+ SmallVectorImpl<ISD::OutputArg> &Outs,
+ const TargetLowering &TLI, const DataLayout &DL) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DL, ReturnType, ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0) return;
+
+ for (unsigned j = 0, f = NumValues; j != f; ++j) {
+ EVT VT = ValueVTs[j];
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+ if (attr.hasRetAttr(Attribute::SExt))
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (attr.hasRetAttr(Attribute::ZExt))
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ // FIXME: C calling convention requires the return type to be promoted to
+ // at least 32-bit. But this is not necessary for non-C calling
+ // conventions. The frontend should mark functions whose return values
+ // require promoting with signext or zeroext attributes.
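+ // For illustration: an i8 return value marked 'signext' is widened here to
+ // at least the target's register type for i32.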
+ if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) {
+ MVT MinVT = TLI.getRegisterType(MVT::i32);
+ if (VT.bitsLT(MinVT))
+ VT = MinVT;
+ }
+
+ unsigned NumParts =
+ TLI.getNumRegistersForCallingConv(ReturnType->getContext(), CC, VT);
+ MVT PartVT =
+ TLI.getRegisterTypeForCallingConv(ReturnType->getContext(), CC, VT);
+
+ // 'inreg' on function refers to return value
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ if (attr.hasRetAttr(Attribute::InReg))
+ Flags.setInReg();
+
+ // Propagate extension type if any
+ if (attr.hasRetAttr(Attribute::SExt))
+ Flags.setSExt();
+ else if (attr.hasRetAttr(Attribute::ZExt))
+ Flags.setZExt();
+
+ for (unsigned i = 0; i < NumParts; ++i)
+ Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isfixed=*/true, 0, 0));
+ }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. This is the actual
+/// alignment, not its logarithm.
+uint64_t TargetLoweringBase::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
+ return DL.getABITypeAlign(Ty).value();
+}
+
+bool TargetLoweringBase::allowsMemoryAccessForAlignment(
+ LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
+ Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const {
+ // Check if the specified alignment is sufficient based on the data layout.
+ // TODO: While using the data layout works in practice, a better solution
+ // would be to implement this check directly (make this a virtual function).
+ // For example, the ABI alignment may change based on software platform while
+ // this function should only be affected by hardware implementation.
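+ // For illustration: an i32 access with 2-byte alignment on a target whose
+ // ABI alignment for i32 is 4 bytes falls through to
+ // allowsMisalignedMemoryAccesses() below.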
+ Type *Ty = VT.getTypeForEVT(Context);
+ if (VT.isZeroSized() || Alignment >= DL.getABITypeAlign(Ty)) {
+ // Assume that an access that meets the ABI-specified alignment is fast.
+ if (Fast != nullptr)
+ *Fast = 1;
+ return true;
+ }
+
+ // This is a misaligned access.
+ return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, Fast);
+}
+
+bool TargetLoweringBase::allowsMemoryAccessForAlignment(
+ LLVMContext &Context, const DataLayout &DL, EVT VT,
+ const MachineMemOperand &MMO, unsigned *Fast) const {
+ return allowsMemoryAccessForAlignment(Context, DL, VT, MMO.getAddrSpace(),
+ MMO.getAlign(), MMO.getFlags(), Fast);
+}
+
+bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
+ const DataLayout &DL, EVT VT,
+ unsigned AddrSpace, Align Alignment,
+ MachineMemOperand::Flags Flags,
+ unsigned *Fast) const {
+ return allowsMemoryAccessForAlignment(Context, DL, VT, AddrSpace, Alignment,
+ Flags, Fast);
+}
+
+bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
+ const DataLayout &DL, EVT VT,
+ const MachineMemOperand &MMO,
+ unsigned *Fast) const {
+ return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(), MMO.getAlign(),
+ MMO.getFlags(), Fast);
+}
+
+bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
+ const DataLayout &DL, LLT Ty,
+ const MachineMemOperand &MMO,
+ unsigned *Fast) const {
+ EVT VT = getApproximateEVTForLLT(Ty, DL, Context);
+ return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(), MMO.getAlign(),
+ MMO.getFlags(), Fast);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetTransformInfo Helpers
+//===----------------------------------------------------------------------===//
+
+int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
+ enum InstructionOpcodes {
+#define HANDLE_INST(NUM, OPCODE, CLASS) OPCODE = NUM,
+#define LAST_OTHER_INST(NUM) InstructionOpcodesCount = NUM
+#include "llvm/IR/Instruction.def"
+ };
+ switch (static_cast<InstructionOpcodes>(Opcode)) {
+ case Ret: return 0;
+ case Br: return 0;
+ case Switch: return 0;
+ case IndirectBr: return 0;
+ case Invoke: return 0;
+ case CallBr: return 0;
+ case Resume: return 0;
+ case Unreachable: return 0;
+ case CleanupRet: return 0;
+ case CatchRet: return 0;
+ case CatchPad: return 0;
+ case CatchSwitch: return 0;
+ case CleanupPad: return 0;
+ case FNeg: return ISD::FNEG;
+ case Add: return ISD::ADD;
+ case FAdd: return ISD::FADD;
+ case Sub: return ISD::SUB;
+ case FSub: return ISD::FSUB;
+ case Mul: return ISD::MUL;
+ case FMul: return ISD::FMUL;
+ case UDiv: return ISD::UDIV;
+ case SDiv: return ISD::SDIV;
+ case FDiv: return ISD::FDIV;
+ case URem: return ISD::UREM;
+ case SRem: return ISD::SREM;
+ case FRem: return ISD::FREM;
+ case Shl: return ISD::SHL;
+ case LShr: return ISD::SRL;
+ case AShr: return ISD::SRA;
+ case And: return ISD::AND;
+ case Or: return ISD::OR;
+ case Xor: return ISD::XOR;
+ case Alloca: return 0;
+ case Load: return ISD::LOAD;
+ case Store: return ISD::STORE;
+ case GetElementPtr: return 0;
+ case Fence: return 0;
+ case AtomicCmpXchg: return 0;
+ case AtomicRMW: return 0;
+ case Trunc: return ISD::TRUNCATE;
+ case ZExt: return ISD::ZERO_EXTEND;
+ case SExt: return ISD::SIGN_EXTEND;
+ case FPToUI: return ISD::FP_TO_UINT;
+ case FPToSI: return ISD::FP_TO_SINT;
+ case UIToFP: return ISD::UINT_TO_FP;
+ case SIToFP: return ISD::SINT_TO_FP;
+ case FPTrunc: return ISD::FP_ROUND;
+ case FPExt: return ISD::FP_EXTEND;
+ case PtrToInt: return ISD::BITCAST;
+ case IntToPtr: return ISD::BITCAST;
+ case BitCast: return ISD::BITCAST;
+ case AddrSpaceCast: return ISD::ADDRSPACECAST;
+ case ICmp: return ISD::SETCC;
+ case FCmp: return ISD::SETCC;
+ case PHI: return 0;
+ case Call: return 0;
+ case Select: return ISD::SELECT;
+ case UserOp1: return 0;
+ case UserOp2: return 0;
+ case VAArg: return 0;
+ case ExtractElement: return ISD::EXTRACT_VECTOR_ELT;
+ case InsertElement: return ISD::INSERT_VECTOR_ELT;
+ case ShuffleVector: return ISD::VECTOR_SHUFFLE;
+ case ExtractValue: return ISD::MERGE_VALUES;
+ case InsertValue: return ISD::MERGE_VALUES;
+ case LandingPad: return 0;
+ case Freeze: return ISD::FREEZE;
+ }
+
+ llvm_unreachable("Unknown instruction type encountered!");
+}
+
+Value *
+TargetLoweringBase::getDefaultSafeStackPointerLocation(IRBuilderBase &IRB,
+ bool UseTLS) const {
+ // compiler-rt provides a variable with a magic name. Targets that do not
+ // link with compiler-rt may also provide such a variable.
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ const char *UnsafeStackPtrVar = "__safestack_unsafe_stack_ptr";
+ auto UnsafeStackPtr =
+ dyn_cast_or_null<GlobalVariable>(M->getNamedValue(UnsafeStackPtrVar));
+
+ Type *StackPtrTy = Type::getInt8PtrTy(M->getContext());
+
+ if (!UnsafeStackPtr) {
+ auto TLSModel = UseTLS ?
+ GlobalValue::InitialExecTLSModel :
+ GlobalValue::NotThreadLocal;
+ // The global variable is not defined yet, define it ourselves.
+ // We use the initial-exec TLS model because we do not support the
+ // variable living anywhere other than in the main executable.
+ UnsafeStackPtr = new GlobalVariable(
+ *M, StackPtrTy, false, GlobalValue::ExternalLinkage, nullptr,
+ UnsafeStackPtrVar, nullptr, TLSModel);
+ } else {
+ // The variable exists, check its type and attributes.
+ if (UnsafeStackPtr->getValueType() != StackPtrTy)
+ report_fatal_error(Twine(UnsafeStackPtrVar) + " must have void* type");
+ if (UseTLS != UnsafeStackPtr->isThreadLocal())
+ report_fatal_error(Twine(UnsafeStackPtrVar) + " must " +
+ (UseTLS ? "" : "not ") + "be thread-local");
+ }
+ return UnsafeStackPtr;
+}
+
+Value *
+TargetLoweringBase::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
+ if (!TM.getTargetTriple().isAndroid())
+ return getDefaultSafeStackPointerLocation(IRB, true);
+
+ // Android provides a libc function to retrieve the address of the current
+ // thread's unsafe stack pointer.
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Type *StackPtrTy = Type::getInt8PtrTy(M->getContext());
+ FunctionCallee Fn = M->getOrInsertFunction("__safestack_pointer_address",
+ StackPtrTy->getPointerTo(0));
+ return IRB.CreateCall(Fn);
+}
+
+//===----------------------------------------------------------------------===//
+// Loop Strength Reduction hooks
+//===----------------------------------------------------------------------===//
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool TargetLoweringBase::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS, Instruction *I) const {
+ // The default implementation supports a conservative RISC-style addressing
+ // mode: r+r and r+i.
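+ // For example, 'reg', 'reg + 1234', 'reg + reg', and '2*reg' are accepted,
+ // while 'reg + reg + 8' and '4*reg' are rejected.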
+
+ // Allows a sign-extended 16-bit immediate field.
+ if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+ return false;
+
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // Only support r+r,
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
+ return false;
+ // Otherwise we have r+r or r+i.
+ break;
+ case 2:
+ if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
+ return false;
+ // Allow 2*r as r+r.
+ break;
+ default: // Don't allow n * r
+ return false;
+ }
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Protector
+//===----------------------------------------------------------------------===//
+
+// For OpenBSD, return its special guard variable. Otherwise return nullptr,
+// so that SelectionDAG handles SSP.
+Value *TargetLoweringBase::getIRStackGuard(IRBuilderBase &IRB) const {
+ if (getTargetMachine().getTargetTriple().isOSOpenBSD()) {
+ Module &M = *IRB.GetInsertBlock()->getParent()->getParent();
+ PointerType *PtrTy = Type::getInt8PtrTy(M.getContext());
+ Constant *C = M.getOrInsertGlobal("__guard_local", PtrTy);
+ if (GlobalVariable *G = dyn_cast_or_null<GlobalVariable>(C))
+ G->setVisibility(GlobalValue::HiddenVisibility);
+ return C;
+ }
+ return nullptr;
+}
+
+// Currently only support "standard" __stack_chk_guard.
+// TODO: add LOAD_STACK_GUARD support.
+void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
+ if (!M.getNamedValue("__stack_chk_guard")) {
+ auto *GV = new GlobalVariable(M, Type::getInt8PtrTy(M.getContext()), false,
+ GlobalVariable::ExternalLinkage, nullptr,
+ "__stack_chk_guard");
+
+ // FreeBSD has "__stack_chk_guard" defined externally in libc.so
+ if (M.getDirectAccessExternalData() &&
+ !TM.getTargetTriple().isWindowsGNUEnvironment() &&
+ !(TM.getTargetTriple().isPPC64() && TM.getTargetTriple().isOSFreeBSD()) &&
+ !TM.getTargetTriple().isOSDarwin())
+ GV->setDSOLocal(true);
+ }
+}
+
+// Currently only support "standard" __stack_chk_guard.
+// TODO: add LOAD_STACK_GUARD support.
+Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {
+ return M.getNamedValue("__stack_chk_guard");
+}
+
+Function *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const {
+ return nullptr;
+}
+
+unsigned TargetLoweringBase::getMinimumJumpTableEntries() const {
+ return MinimumJumpTableEntries;
+}
+
+void TargetLoweringBase::setMinimumJumpTableEntries(unsigned Val) {
+ MinimumJumpTableEntries = Val;
+}
+
+unsigned TargetLoweringBase::getMinimumJumpTableDensity(bool OptForSize) const {
+ return OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;
+}
+
+unsigned TargetLoweringBase::getMaximumJumpTableSize() const {
+ return MaximumJumpTableSize;
+}
+
+void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) {
+ MaximumJumpTableSize = Val;
+}
+
+bool TargetLoweringBase::isJumpTableRelative() const {
+ return getTargetMachine().isPositionIndependent();
+}
+
+Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
+ if (TM.Options.LoopAlignment)
+ return Align(TM.Options.LoopAlignment);
+ return PrefLoopAlignment;
+}
+
+unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment(
+ MachineBasicBlock *MBB) const {
+ return MaxBytesForAlignment;
+}
+
+//===----------------------------------------------------------------------===//
+// Reciprocal Estimates
+//===----------------------------------------------------------------------===//
+
+/// Get the reciprocal estimate attribute string for a function that will
+/// override the target defaults.
+static StringRef getRecipEstimateForFunc(MachineFunction &MF) {
+ const Function &F = MF.getFunction();
+ return F.getFnAttribute("reciprocal-estimates").getValueAsString();
+}
+
+/// Construct a string for the given reciprocal operation of the given type.
+/// This string should match the corresponding option to the front-end's
+/// "-mrecip" flag assuming those strings have been passed through in an
+/// attribute string. For example, "vec-divf" for a division of a vXf32.
+static std::string getReciprocalOpName(bool IsSqrt, EVT VT) {
+ std::string Name = VT.isVector() ? "vec-" : "";
+
+ Name += IsSqrt ? "sqrt" : "div";
+
+ // TODO: Handle other float types?
+ if (VT.getScalarType() == MVT::f64) {
+ Name += "d";
+ } else if (VT.getScalarType() == MVT::f16) {
+ Name += "h";
+ } else {
+ assert(VT.getScalarType() == MVT::f32 &&
+ "Unexpected FP type for reciprocal estimate");
+ Name += "f";
+ }
+
+ return Name;
+}
+
+/// Return the character position and value (a single numeric character) of a
+/// customized refinement operation in the input string if it exists. Return
+/// false if there is no customized refinement step count.
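+/// For example, given the input "vec-sqrtf:3", Position is set to 9 and Value
+/// to 3.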
+static bool parseRefinementStep(StringRef In, size_t &Position,
+ uint8_t &Value) {
+ const char RefStepToken = ':';
+ Position = In.find(RefStepToken);
+ if (Position == StringRef::npos)
+ return false;
+
+ StringRef RefStepString = In.substr(Position + 1);
+ // Allow exactly one numeric character for the additional refinement
+ // step parameter.
+ if (RefStepString.size() == 1) {
+ char RefStepChar = RefStepString[0];
+ if (isDigit(RefStepChar)) {
+ Value = RefStepChar - '0';
+ return true;
+ }
+ }
+ report_fatal_error("Invalid refinement step for -recip.");
+}
+
+/// For the input attribute string, return one of the ReciprocalEstimate enum
+/// status values (enabled, disabled, or not specified) for this operation on
+/// the specified data type.
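+/// For illustration, a hypothetical attribute string "sqrtf:2,!divd" enables
+/// f32 square-root estimates (the ":2" refinement count is parsed separately)
+/// and disables f64 division estimates.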
+static int getOpEnabled(bool IsSqrt, EVT VT, StringRef Override) {
+ if (Override.empty())
+ return TargetLoweringBase::ReciprocalEstimate::Unspecified;
+
+ SmallVector<StringRef, 4> OverrideVector;
+ Override.split(OverrideVector, ',');
+ unsigned NumArgs = OverrideVector.size();
+
+ // Check if "all", "none", or "default" was specified.
+ if (NumArgs == 1) {
+ // Look for an optional setting of the number of refinement steps needed
+ // for this type of reciprocal operation.
+ size_t RefPos;
+ uint8_t RefSteps;
+ if (parseRefinementStep(Override, RefPos, RefSteps)) {
+ // Split the string for further processing.
+ Override = Override.substr(0, RefPos);
+ }
+
+ // All reciprocal types are enabled.
+ if (Override == "all")
+ return TargetLoweringBase::ReciprocalEstimate::Enabled;
+
+ // All reciprocal types are disabled.
+ if (Override == "none")
+ return TargetLoweringBase::ReciprocalEstimate::Disabled;
+
+ // Target defaults for enablement are used.
+ if (Override == "default")
+ return TargetLoweringBase::ReciprocalEstimate::Unspecified;
+ }
+
+ // The attribute string may omit the size suffix ('f'/'d').
+ std::string VTName = getReciprocalOpName(IsSqrt, VT);
+ std::string VTNameNoSize = VTName;
+ VTNameNoSize.pop_back();
+ static const char DisabledPrefix = '!';
+
+ for (StringRef RecipType : OverrideVector) {
+ size_t RefPos;
+ uint8_t RefSteps;
+ if (parseRefinementStep(RecipType, RefPos, RefSteps))
+ RecipType = RecipType.substr(0, RefPos);
+
+ // Ignore the disablement token for string matching.
+ bool IsDisabled = RecipType[0] == DisabledPrefix;
+ if (IsDisabled)
+ RecipType = RecipType.substr(1);
+
+ if (RecipType.equals(VTName) || RecipType.equals(VTNameNoSize))
+ return IsDisabled ? TargetLoweringBase::ReciprocalEstimate::Disabled
+ : TargetLoweringBase::ReciprocalEstimate::Enabled;
+ }
+
+ return TargetLoweringBase::ReciprocalEstimate::Unspecified;
+}
+
+/// For the input attribute string, return the customized refinement step count
+/// for this operation on the specified data type. If the step count does not
+/// exist, return the ReciprocalEstimate enum value for unspecified.
+static int getOpRefinementSteps(bool IsSqrt, EVT VT, StringRef Override) {
+ if (Override.empty())
+ return TargetLoweringBase::ReciprocalEstimate::Unspecified;
+
+ SmallVector<StringRef, 4> OverrideVector;
+ Override.split(OverrideVector, ',');
+ unsigned NumArgs = OverrideVector.size();
+
+ // Check if "all", "default", or "none" was specified.
+ if (NumArgs == 1) {
+ // Look for an optional setting of the number of refinement steps needed
+ // for this type of reciprocal operation.
+ size_t RefPos;
+ uint8_t RefSteps;
+ if (!parseRefinementStep(Override, RefPos, RefSteps))
+ return TargetLoweringBase::ReciprocalEstimate::Unspecified;
+
+ // Split the string for further processing.
+ Override = Override.substr(0, RefPos);
+ assert(Override != "none" &&
+ "Disabled reciprocals, but specifed refinement steps?");
+
+ // If this is a general override, return the specified number of steps.
+ if (Override == "all" || Override == "default")
+ return RefSteps;
+ }
+
+ // The attribute string may omit the size suffix ('f'/'d').
+ std::string VTName = getReciprocalOpName(IsSqrt, VT);
+ std::string VTNameNoSize = VTName;
+ VTNameNoSize.pop_back();
+
+ for (StringRef RecipType : OverrideVector) {
+ size_t RefPos;
+ uint8_t RefSteps;
+ if (!parseRefinementStep(RecipType, RefPos, RefSteps))
+ continue;
+
+ RecipType = RecipType.substr(0, RefPos);
+ if (RecipType.equals(VTName) || RecipType.equals(VTNameNoSize))
+ return RefSteps;
+ }
+
+ return TargetLoweringBase::ReciprocalEstimate::Unspecified;
+}
+
+int TargetLoweringBase::getRecipEstimateSqrtEnabled(EVT VT,
+ MachineFunction &MF) const {
+ return getOpEnabled(true, VT, getRecipEstimateForFunc(MF));
+}
+
+int TargetLoweringBase::getRecipEstimateDivEnabled(EVT VT,
+ MachineFunction &MF) const {
+ return getOpEnabled(false, VT, getRecipEstimateForFunc(MF));
+}
+
+int TargetLoweringBase::getSqrtRefinementSteps(EVT VT,
+ MachineFunction &MF) const {
+ return getOpRefinementSteps(true, VT, getRecipEstimateForFunc(MF));
+}
+
+int TargetLoweringBase::getDivRefinementSteps(EVT VT,
+ MachineFunction &MF) const {
+ return getOpRefinementSteps(false, VT, getRecipEstimateForFunc(MF));
+}
+
+bool TargetLoweringBase::isLoadBitCastBeneficial(
+ EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
+ // Single-element vectors are scalarized, so we should generally avoid having
+ // any memory operations on such types, as they would get scalarized too.
+ if (LoadVT.isFixedLengthVector() && BitcastVT.isFixedLengthVector() &&
+ BitcastVT.getVectorNumElements() == 1)
+ return false;
+
+ // Don't do this if we could do an indexed load on the original type, but not
+ // on the new one.
+ if (!LoadVT.isSimple() || !BitcastVT.isSimple())
+ return true;
+
+ MVT LoadMVT = LoadVT.getSimpleVT();
+
+ // Don't bother doing this if it's just going to be promoted again later, as
+ // doing so might interfere with other combines.
+ if (getOperationAction(ISD::LOAD, LoadMVT) == Promote &&
+ getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT())
+ return false;
+
+ unsigned Fast = 0;
+ return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
+ MMO, &Fast) &&
+ Fast;
+}
+
+void TargetLoweringBase::finalizeLowering(MachineFunction &MF) const {
+ MF.getRegInfo().freezeReservedRegs(MF);
+}
+
+MachineMemOperand::Flags TargetLoweringBase::getLoadMemOperandFlags(
+ const LoadInst &LI, const DataLayout &DL, AssumptionCache *AC,
+ const TargetLibraryInfo *LibInfo) const {
+ MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad;
+ if (LI.isVolatile())
+ Flags |= MachineMemOperand::MOVolatile;
+
+ if (LI.hasMetadata(LLVMContext::MD_nontemporal))
+ Flags |= MachineMemOperand::MONonTemporal;
+
+ if (LI.hasMetadata(LLVMContext::MD_invariant_load))
+ Flags |= MachineMemOperand::MOInvariant;
+
+ if (isDereferenceableAndAlignedPointer(LI.getPointerOperand(), LI.getType(),
+ LI.getAlign(), DL, &LI, AC,
+ /*DT=*/nullptr, LibInfo))
+ Flags |= MachineMemOperand::MODereferenceable;
+
+ Flags |= getTargetMMOFlags(LI);
+ return Flags;
+}
+
+MachineMemOperand::Flags
+TargetLoweringBase::getStoreMemOperandFlags(const StoreInst &SI,
+ const DataLayout &DL) const {
+ MachineMemOperand::Flags Flags = MachineMemOperand::MOStore;
+
+ if (SI.isVolatile())
+ Flags |= MachineMemOperand::MOVolatile;
+
+ if (SI.hasMetadata(LLVMContext::MD_nontemporal))
+ Flags |= MachineMemOperand::MONonTemporal;
+
+ // FIXME: Not preserving dereferenceable
+ Flags |= getTargetMMOFlags(SI);
+ return Flags;
+}
+
+MachineMemOperand::Flags
+TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
+ const DataLayout &DL) const {
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+ if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&AI)) {
+ if (RMW->isVolatile())
+ Flags |= MachineMemOperand::MOVolatile;
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(&AI)) {
+ if (CmpX->isVolatile())
+ Flags |= MachineMemOperand::MOVolatile;
+ } else
+ llvm_unreachable("not an atomic instruction");
+
+ // FIXME: Not preserving dereferenceable
+ Flags |= getTargetMMOFlags(AI);
+ return Flags;
+}
+
+Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
+ return Builder.CreateFence(Ord);
+ else
+ return nullptr;
+}
+
+Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (isAcquireOrStronger(Ord))
+ return Builder.CreateFence(Ord);
+ else
+ return nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// GlobalISel Hooks
+//===----------------------------------------------------------------------===//
+
+bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI,
+ const TargetTransformInfo *TTI) const {
+ auto &MF = *MI.getMF();
+ auto &MRI = MF.getRegInfo();
+ // Assuming a spill and reload of a value has a cost of 1 instruction each,
+ // this helper function computes the maximum number of uses we should consider
+ // for remat. E.g. on arm64 global addresses take 2 insts to materialize. We
+ // break even in terms of code size when the original MI has 2 users vs
+ // choosing to potentially spill. With more than 2 users we have a net code
+ // size increase. This doesn't take into account register pressure though.
+ auto maxUses = [](unsigned RematCost) {
+ // A cost of 1 means remats are basically free.
+ if (RematCost == 1)
+ return std::numeric_limits<unsigned>::max();
+ if (RematCost == 2)
+ return 2U;
+
+ // Remat is too expensive, only sink if there's one user.
+ if (RematCost > 2)
+ return 1U;
+ llvm_unreachable("Unexpected remat cost");
+ };
+
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ // Constant-like instructions should be close to their users.
+ // We don't want long live-ranges for them.
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_FRAME_INDEX:
+ case TargetOpcode::G_INTTOPTR:
+ return true;
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ unsigned RematCost = TTI->getGISelRematGlobalCost();
+ Register Reg = MI.getOperand(0).getReg();
+ unsigned MaxUses = maxUses(RematCost);
+ if (MaxUses == UINT_MAX)
+ return true; // Remats are "free" so always localize.
+ return MRI.hasAtMostUserInstrs(Reg, MaxUses);
+ }
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
new file mode 100644
index 000000000000..55fb522554fa
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -0,0 +1,2680 @@
+//===- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/CodeGen/BasicBlockSectionUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PseudoProbe.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionGOFF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSectionXCOFF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Base64.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+#include <cassert>
+#include <string>
+
+using namespace llvm;
+using namespace dwarf;
+
+static cl::opt<bool> JumpTableInFunctionSection(
+ "jumptable-in-function-section", cl::Hidden, cl::init(false),
+ cl::desc("Putting Jump Table in function section"));
+
+static void GetObjCImageInfo(Module &M, unsigned &Version, unsigned &Flags,
+ StringRef &Section) {
+ SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+ M.getModuleFlagsMetadata(ModuleFlags);
+
+ for (const auto &MFE: ModuleFlags) {
+ // Ignore flags with 'Require' behaviour.
+ if (MFE.Behavior == Module::Require)
+ continue;
+
+ StringRef Key = MFE.Key->getString();
+ if (Key == "Objective-C Image Info Version") {
+ Version = mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue();
+ } else if (Key == "Objective-C Garbage Collection" ||
+ Key == "Objective-C GC Only" ||
+ Key == "Objective-C Is Simulated" ||
+ Key == "Objective-C Class Properties" ||
+ Key == "Objective-C Image Swift Version") {
+ Flags |= mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue();
+ } else if (Key == "Objective-C Image Info Section") {
+ Section = cast<MDString>(MFE.Val)->getString();
+ }
+ // The backend generates L_OBJC_IMAGE_INFO from the Swift ABI version plus
+ // the Swift major/minor versions and "Objective-C Garbage Collection".
+ else if (Key == "Swift ABI Version") {
+ Flags |= (mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue()) << 8;
+ } else if (Key == "Swift Major Version") {
+ Flags |= (mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue()) << 24;
+ } else if (Key == "Swift Minor Version") {
+ Flags |= (mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue()) << 16;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// ELF
+//===----------------------------------------------------------------------===//
+
+TargetLoweringObjectFileELF::TargetLoweringObjectFileELF() {
+ SupportDSOLocalEquivalentLowering = true;
+}
+
+void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
+ const TargetMachine &TgtM) {
+ TargetLoweringObjectFile::Initialize(Ctx, TgtM);
+
+ CodeModel::Model CM = TgtM.getCodeModel();
+ InitializeELF(TgtM.Options.UseInitArray);
+
+ switch (TgtM.getTargetTriple().getArch()) {
+ case Triple::arm:
+ case Triple::armeb:
+ case Triple::thumb:
+ case Triple::thumbeb:
+ if (Ctx.getAsmInfo()->getExceptionHandlingType() == ExceptionHandling::ARM)
+ break;
+ // Fallthrough if not using EHABI
+ [[fallthrough]];
+ case Triple::ppc:
+ case Triple::ppcle:
+ case Triple::x86:
+ PersonalityEncoding = isPositionIndependent()
+ ? dwarf::DW_EH_PE_indirect |
+ dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ LSDAEncoding = isPositionIndependent()
+ ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = isPositionIndependent()
+ ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ break;
+ case Triple::x86_64:
+ if (isPositionIndependent()) {
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ ((CM == CodeModel::Small || CM == CodeModel::Medium)
+ ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel |
+ (CM == CodeModel::Small
+ ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ ((CM == CodeModel::Small || CM == CodeModel::Medium)
+ ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+ } else {
+ PersonalityEncoding =
+ (CM == CodeModel::Small || CM == CodeModel::Medium)
+ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+ LSDAEncoding = (CM == CodeModel::Small)
+ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = (CM == CodeModel::Small)
+ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+ }
+ break;
+ case Triple::hexagon:
+ PersonalityEncoding = dwarf::DW_EH_PE_absptr;
+ LSDAEncoding = dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+ if (isPositionIndependent()) {
+ PersonalityEncoding |= dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel;
+ LSDAEncoding |= dwarf::DW_EH_PE_pcrel;
+ TTypeEncoding |= dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel;
+ }
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ case Triple::aarch64_32:
+ // The small model guarantees static code/data size < 4GB, but not where it
+ // will be in memory. Most of these could end up >2GB away so even a signed
+ // pc-relative 32-bit address is insufficient, theoretically.
+ //
+ // Use DW_EH_PE_indirect even for -fno-pic to avoid copy relocations.
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel |
+ (TgtM.getTargetTriple().getEnvironment() == Triple::GNUILP32
+ ? dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_sdata8);
+ PersonalityEncoding = LSDAEncoding | dwarf::DW_EH_PE_indirect;
+ TTypeEncoding = LSDAEncoding | dwarf::DW_EH_PE_indirect;
+ break;
+ case Triple::lanai:
+ LSDAEncoding = dwarf::DW_EH_PE_absptr;
+ PersonalityEncoding = dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+ break;
+ case Triple::mips:
+ case Triple::mipsel:
+ case Triple::mips64:
+ case Triple::mips64el:
+ // MIPS uses an indirect pointer to refer to personality functions and types,
+ // so that the eh_frame section can be read-only. DW.ref.personality will be
+ // generated for relocation.
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect;
+ // FIXME: The N64 ABI probably ought to use DW_EH_PE_sdata8 but we can't
+ // identify N64 from just a triple.
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ // We don't support PC-relative LSDA references in GAS so we use the default
+ // DW_EH_PE_absptr for those.
+
+ // FreeBSD must be explicit about the data size and use pcrel since its
+ // assembler/linker won't do the automatic conversion that the Linux tools
+ // do.
+ if (TgtM.getTargetTriple().isOSFreeBSD()) {
+ PersonalityEncoding |= dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ }
+ break;
+ case Triple::ppc64:
+ case Triple::ppc64le:
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_udata8;
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8;
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_udata8;
+ break;
+ case Triple::sparcel:
+ case Triple::sparc:
+ if (isPositionIndependent()) {
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ } else {
+ LSDAEncoding = dwarf::DW_EH_PE_absptr;
+ PersonalityEncoding = dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+ }
+ CallSiteEncoding = dwarf::DW_EH_PE_udata4;
+ break;
+ case Triple::riscv32:
+ case Triple::riscv64:
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ CallSiteEncoding = dwarf::DW_EH_PE_udata4;
+ break;
+ case Triple::sparcv9:
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ if (isPositionIndependent()) {
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ } else {
+ PersonalityEncoding = dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+ }
+ break;
+ case Triple::systemz:
+ // All currently-defined code models guarantee that 4-byte PC-relative
+ // values will be in range.
+ if (isPositionIndependent()) {
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ } else {
+ PersonalityEncoding = dwarf::DW_EH_PE_absptr;
+ LSDAEncoding = dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+ }
+ break;
+ case Triple::loongarch32:
+ case Triple::loongarch64:
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+ break;
+ default:
+ break;
+ }
+}
+
+void TargetLoweringObjectFileELF::getModuleMetadata(Module &M) {
+ SmallVector<GlobalValue *, 4> Vec;
+ collectUsedGlobalVariables(M, Vec, false);
+ for (GlobalValue *GV : Vec)
+ if (auto *GO = dyn_cast<GlobalObject>(GV))
+ Used.insert(GO);
+}
+
+void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer,
+ Module &M) const {
+ auto &C = getContext();
+
+ if (NamedMDNode *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) {
+ auto *S = C.getELFSection(".linker-options", ELF::SHT_LLVM_LINKER_OPTIONS,
+ ELF::SHF_EXCLUDE);
+
+ Streamer.switchSection(S);
+
+ for (const auto *Operand : LinkerOptions->operands()) {
+ if (cast<MDNode>(Operand)->getNumOperands() != 2)
+ report_fatal_error("invalid llvm.linker.options");
+ for (const auto &Option : cast<MDNode>(Operand)->operands()) {
+ Streamer.emitBytes(cast<MDString>(Option)->getString());
+ Streamer.emitInt8(0);
+ }
+ }
+ }
+
+ if (NamedMDNode *DependentLibraries = M.getNamedMetadata("llvm.dependent-libraries")) {
+ auto *S = C.getELFSection(".deplibs", ELF::SHT_LLVM_DEPENDENT_LIBRARIES,
+ ELF::SHF_MERGE | ELF::SHF_STRINGS, 1);
+
+ Streamer.switchSection(S);
+
+ for (const auto *Operand : DependentLibraries->operands()) {
+ Streamer.emitBytes(
+ cast<MDString>(cast<MDNode>(Operand)->getOperand(0))->getString());
+ Streamer.emitInt8(0);
+ }
+ }
+
+ if (NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName)) {
+ // Emit a descriptor for every function, including functions that have an
+ // available external linkage. We may not want this for imported functions
+ // that have code in another thinLTO module, but we don't have a good way to
+ // tell them apart from inline functions defined in header files. Therefore
+ // we put each descriptor in a separate comdat section and rely on the
+ // linker to deduplicate.
+ for (const auto *Operand : FuncInfo->operands()) {
+ const auto *MD = cast<MDNode>(Operand);
+ auto *GUID = mdconst::dyn_extract<ConstantInt>(MD->getOperand(0));
+ auto *Hash = mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
+ auto *Name = cast<MDString>(MD->getOperand(2));
+ auto *S = C.getObjectFileInfo()->getPseudoProbeDescSection(
+ TM->getFunctionSections() ? Name->getString() : StringRef());
+
+ Streamer.switchSection(S);
+ Streamer.emitInt64(GUID->getZExtValue());
+ Streamer.emitInt64(Hash->getZExtValue());
+ Streamer.emitULEB128IntValue(Name->getString().size());
+ Streamer.emitBytes(Name->getString());
+ }
+ }
+
+ if (NamedMDNode *LLVMStats = M.getNamedMetadata("llvm.stats")) {
+ // Emit the metadata for LLVM statistics into the .llvm_stats section, which
+ // is formatted as a list of key/value pairs; each value is Base64 encoded.
+ auto *S = C.getObjectFileInfo()->getLLVMStatsSection();
+ Streamer.switchSection(S);
+ for (const auto *Operand : LLVMStats->operands()) {
+ const auto *MD = cast<MDNode>(Operand);
+ assert(MD->getNumOperands() % 2 == 0 &&
+ ("Operand num should be even for a list of key/value pair"));
+ for (size_t I = 0; I < MD->getNumOperands(); I += 2) {
+ // Encode the key string size.
+ auto *Key = cast<MDString>(MD->getOperand(I));
+ Streamer.emitULEB128IntValue(Key->getString().size());
+ Streamer.emitBytes(Key->getString());
+ // Encode the value into a Base64 string.
+ std::string Value = encodeBase64(
+ Twine(mdconst::dyn_extract<ConstantInt>(MD->getOperand(I + 1))
+ ->getZExtValue())
+ .str());
+ Streamer.emitULEB128IntValue(Value.size());
+ Streamer.emitBytes(Value);
+ }
+ }
+ }
+
+ unsigned Version = 0;
+ unsigned Flags = 0;
+ StringRef Section;
+
+ GetObjCImageInfo(M, Version, Flags, Section);
+ if (!Section.empty()) {
+ auto *S = C.getELFSection(Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ Streamer.switchSection(S);
+ Streamer.emitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
+ Streamer.emitInt32(Version);
+ Streamer.emitInt32(Flags);
+ Streamer.addBlankLine();
+ }
+
+ emitCGProfileMetadata(Streamer, M);
+}
+
+MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol(
+ const GlobalValue *GV, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ unsigned Encoding = getPersonalityEncoding();
+ if ((Encoding & 0x80) == DW_EH_PE_indirect)
+ return getContext().getOrCreateSymbol(StringRef("DW.ref.") +
+ TM.getSymbol(GV)->getName());
+ if ((Encoding & 0x70) == DW_EH_PE_absptr)
+ return TM.getSymbol(GV);
+ report_fatal_error("We do not support this DWARF encoding yet!");
+}
+
+void TargetLoweringObjectFileELF::emitPersonalityValue(
+ MCStreamer &Streamer, const DataLayout &DL, const MCSymbol *Sym) const {
+ SmallString<64> NameData("DW.ref.");
+ NameData += Sym->getName();
+ MCSymbolELF *Label =
+ cast<MCSymbolELF>(getContext().getOrCreateSymbol(NameData));
+ Streamer.emitSymbolAttribute(Label, MCSA_Hidden);
+ Streamer.emitSymbolAttribute(Label, MCSA_Weak);
+ unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP;
+ MCSection *Sec = getContext().getELFNamedSection(".data", Label->getName(),
+ ELF::SHT_PROGBITS, Flags, 0);
+ unsigned Size = DL.getPointerSize();
+ Streamer.switchSection(Sec);
+ Streamer.emitValueToAlignment(DL.getPointerABIAlignment(0));
+ Streamer.emitSymbolAttribute(Label, MCSA_ELF_TypeObject);
+ const MCExpr *E = MCConstantExpr::create(Size, getContext());
+ Streamer.emitELFSize(Label, E);
+ Streamer.emitLabel(Label);
+
+ Streamer.emitSymbolValue(Sym, Size);
+}
+
+const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+ MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ if (Encoding & DW_EH_PE_indirect) {
+ MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM);
+
+ // Add information about the stub reference to ELFMMI so that the stub
+ // gets emitted by the asmprinter.
+ MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
+ if (!StubSym.getPointer()) {
+ MCSymbol *Sym = TM.getSymbol(GV);
+ StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+ }
+
+ return TargetLoweringObjectFile::
+ getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
+ Encoding & ~DW_EH_PE_indirect, Streamer);
+ }
+
+ return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM,
+ MMI, Streamer);
+}
+
+static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) {
+ // N.B.: The defaults used here are not the same ones used in MC.
+ // We follow gcc, MC follows gas. For example, given ".section .eh_frame",
+ // both gas and MC will produce a section with no flags. Given
+ // section(".eh_frame") gcc will produce:
+ //
+ // .section .eh_frame,"a",@progbits
+
+ if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF,
+ /*AddSegmentInfo=*/false) ||
+ Name == getInstrProfSectionName(IPSK_covfun, Triple::ELF,
+ /*AddSegmentInfo=*/false) ||
+ Name == ".llvmbc" || Name == ".llvmcmd")
+ return SectionKind::getMetadata();
+
+ if (Name.empty() || Name[0] != '.') return K;
+
+ // Default implementation based on some magic section names.
+ if (Name == ".bss" ||
+ Name.startswith(".bss.") ||
+ Name.startswith(".gnu.linkonce.b.") ||
+ Name.startswith(".llvm.linkonce.b.") ||
+ Name == ".sbss" ||
+ Name.startswith(".sbss.") ||
+ Name.startswith(".gnu.linkonce.sb.") ||
+ Name.startswith(".llvm.linkonce.sb."))
+ return SectionKind::getBSS();
+
+ if (Name == ".tdata" ||
+ Name.startswith(".tdata.") ||
+ Name.startswith(".gnu.linkonce.td.") ||
+ Name.startswith(".llvm.linkonce.td."))
+ return SectionKind::getThreadData();
+
+ if (Name == ".tbss" ||
+ Name.startswith(".tbss.") ||
+ Name.startswith(".gnu.linkonce.tb.") ||
+ Name.startswith(".llvm.linkonce.tb."))
+ return SectionKind::getThreadBSS();
+
+ return K;
+}
+
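+// For example, hasPrefix(".init_array.ctors", ".init_array") is true, while
+// hasPrefix(".init_array2", ".init_array") is false.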
+static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
+ return SectionName.consume_front(Prefix) &&
+ (SectionName.empty() || SectionName[0] == '.');
+}
+
+static unsigned getELFSectionType(StringRef Name, SectionKind K) {
+ // Use SHT_NOTE for sections whose names start with ".note" to allow
+ // emitting ELF notes from a C variable declaration.
+ // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77609
+ if (Name.startswith(".note"))
+ return ELF::SHT_NOTE;
+
+ if (hasPrefix(Name, ".init_array"))
+ return ELF::SHT_INIT_ARRAY;
+
+ if (hasPrefix(Name, ".fini_array"))
+ return ELF::SHT_FINI_ARRAY;
+
+ if (hasPrefix(Name, ".preinit_array"))
+ return ELF::SHT_PREINIT_ARRAY;
+
+ if (hasPrefix(Name, ".llvm.offloading"))
+ return ELF::SHT_LLVM_OFFLOADING;
+
+ if (K.isBSS() || K.isThreadBSS())
+ return ELF::SHT_NOBITS;
+
+ return ELF::SHT_PROGBITS;
+}
+
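+// For illustration, a read-only mergeable 1-byte C string section receives
+// SHF_ALLOC | SHF_MERGE | SHF_STRINGS from the mapping below.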
+static unsigned getELFSectionFlags(SectionKind K) {
+ unsigned Flags = 0;
+
+ if (!K.isMetadata() && !K.isExclude())
+ Flags |= ELF::SHF_ALLOC;
+
+ if (K.isExclude())
+ Flags |= ELF::SHF_EXCLUDE;
+
+ if (K.isText())
+ Flags |= ELF::SHF_EXECINSTR;
+
+ if (K.isExecuteOnly())
+ Flags |= ELF::SHF_ARM_PURECODE;
+
+ if (K.isWriteable())
+ Flags |= ELF::SHF_WRITE;
+
+ if (K.isThreadLocal())
+ Flags |= ELF::SHF_TLS;
+
+ if (K.isMergeableCString() || K.isMergeableConst())
+ Flags |= ELF::SHF_MERGE;
+
+ if (K.isMergeableCString())
+ Flags |= ELF::SHF_STRINGS;
+
+ return Flags;
+}
+
+static const Comdat *getELFComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return nullptr;
+
+ if (C->getSelectionKind() != Comdat::Any &&
+ C->getSelectionKind() != Comdat::NoDeduplicate)
+ report_fatal_error("ELF COMDATs only support SelectionKind::Any and "
+ "SelectionKind::NoDeduplicate, '" +
+ C->getName() + "' cannot be lowered.");
+
+ return C;
+}
+
+static const MCSymbolELF *getLinkedToSymbol(const GlobalObject *GO,
+ const TargetMachine &TM) {
+ MDNode *MD = GO->getMetadata(LLVMContext::MD_associated);
+ if (!MD)
+ return nullptr;
+
+ auto *VM = cast<ValueAsMetadata>(MD->getOperand(0).get());
+ auto *OtherGV = dyn_cast<GlobalValue>(VM->getValue());
+ return OtherGV ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGV)) : nullptr;
+}
+
+static unsigned getEntrySizeForKind(SectionKind Kind) {
+ if (Kind.isMergeable1ByteCString())
+ return 1;
+ else if (Kind.isMergeable2ByteCString())
+ return 2;
+ else if (Kind.isMergeable4ByteCString())
+ return 4;
+ else if (Kind.isMergeableConst4())
+ return 4;
+ else if (Kind.isMergeableConst8())
+ return 8;
+ else if (Kind.isMergeableConst16())
+ return 16;
+ else if (Kind.isMergeableConst32())
+ return 32;
+ else {
+ // We shouldn't have mergeable C strings or mergeable constants that we
+ // didn't handle above.
+ assert(!Kind.isMergeableCString() && "unknown string width");
+ assert(!Kind.isMergeableConst() && "unknown data width");
+ return 0;
+ }
+}
+
+/// Return the section prefix name used by options FunctionsSections and
+/// DataSections.
+static StringRef getSectionPrefixForGlobal(SectionKind Kind, bool IsLarge) {
+ if (Kind.isText())
+ return ".text";
+ if (Kind.isReadOnly())
+ return IsLarge ? ".lrodata" : ".rodata";
+ if (Kind.isBSS())
+ return IsLarge ? ".lbss" : ".bss";
+ if (Kind.isThreadData())
+ return ".tdata";
+ if (Kind.isThreadBSS())
+ return ".tbss";
+ if (Kind.isData())
+ return IsLarge ? ".ldata" : ".data";
+ if (Kind.isReadOnlyWithRel())
+ return IsLarge ? ".ldata.rel.ro" : ".data.rel.ro";
+ llvm_unreachable("Unknown section kind");
+}
+
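+// For example, a 1-byte mergeable string global with preferred alignment 1
+// is placed in ".rodata.str1.1", an 8-byte mergeable constant in
+// ".rodata.cst8", and with -ffunction-sections a function such as foo() gets
+// ".text.foo" (or ".text.hot.foo" if it carries the "hot" section prefix).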
+static SmallString<128>
+getELFSectionNameForGlobal(const GlobalObject *GO, SectionKind Kind,
+ Mangler &Mang, const TargetMachine &TM,
+ unsigned EntrySize, bool UniqueSectionName) {
+ SmallString<128> Name;
+ if (Kind.isMergeableCString()) {
+ // We also need alignment here.
+ // FIXME: this is getting the alignment of the character, not the
+ // alignment of the global!
+ Align Alignment = GO->getParent()->getDataLayout().getPreferredAlign(
+ cast<GlobalVariable>(GO));
+
+ std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + ".";
+ Name = SizeSpec + utostr(Alignment.value());
+ } else if (Kind.isMergeableConst()) {
+ Name = ".rodata.cst";
+ Name += utostr(EntrySize);
+ } else {
+ bool IsLarge = false;
+ if (isa<GlobalVariable>(GO))
+ IsLarge = TM.isLargeData();
+ Name = getSectionPrefixForGlobal(Kind, IsLarge);
+ }
+
+ bool HasPrefix = false;
+ if (const auto *F = dyn_cast<Function>(GO)) {
+ if (std::optional<StringRef> Prefix = F->getSectionPrefix()) {
+ raw_svector_ostream(Name) << '.' << *Prefix;
+ HasPrefix = true;
+ }
+ }
+
+ if (UniqueSectionName) {
+ Name.push_back('.');
+ TM.getNameWithPrefix(Name, GO, Mang, /*MayAlwaysUsePrivate*/true);
+ } else if (HasPrefix)
+ // For distinguishing between .text.${text-section-prefix}. (with trailing
+ // dot) and .text.${function-name}
+ Name.push_back('.');
+ return Name;
+}
+
+namespace {
+class LoweringDiagnosticInfo : public DiagnosticInfo {
+ const Twine &Msg;
+
+public:
+ LoweringDiagnosticInfo(const Twine &DiagMsg,
+ DiagnosticSeverity Severity = DS_Error)
+ : DiagnosticInfo(DK_Lowering, Severity), Msg(DiagMsg) {}
+ void print(DiagnosticPrinter &DP) const override { DP << Msg; }
+};
+}
+
+/// Calculate an appropriate unique ID for a section, and update Flags,
+/// EntrySize and NextUniqueID where appropriate.
+static unsigned
+calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName,
+ SectionKind Kind, const TargetMachine &TM,
+ MCContext &Ctx, Mangler &Mang, unsigned &Flags,
+ unsigned &EntrySize, unsigned &NextUniqueID,
+ const bool Retain, const bool ForceUnique) {
+ // Increment the unique ID if we are forced to emit a unique section.
+ // This works fine with the section attribute or #pragma section, as
+ // sections with the same name are grouped together by the assembler.
+ if (ForceUnique)
+ return NextUniqueID++;
+
+ // A section can have at most one associated section. Put each global with
+ // MD_associated in a unique section.
+ const bool Associated = GO->getMetadata(LLVMContext::MD_associated);
+ if (Associated) {
+ Flags |= ELF::SHF_LINK_ORDER;
+ return NextUniqueID++;
+ }
+
+ if (Retain) {
+ if (TM.getTargetTriple().isOSSolaris())
+ Flags |= ELF::SHF_SUNW_NODISCARD;
+ else if (Ctx.getAsmInfo()->useIntegratedAssembler() ||
+ Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36))
+ Flags |= ELF::SHF_GNU_RETAIN;
+ return NextUniqueID++;
+ }
+
+ // If two symbols with differing sizes end up in the same mergeable section,
+ // that section can be assigned an incorrect entry size. To avoid this we
+ // usually put symbols of the same size into distinct mergeable sections with
+ // the same name. Doing so relies on the ",unique," assembly feature, which
+ // is not available until binutils version 2.35
+ // (https://sourceware.org/bugzilla/show_bug.cgi?id=25380).
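+ // With that feature the integrated assembler emits something like
+ // ".section .rodata.str1.1,"aMS",@progbits,1,unique,1" so that each
+ // incompatible symbol gets its own section despite the shared name.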
+ const bool SupportsUnique = Ctx.getAsmInfo()->useIntegratedAssembler() ||
+ Ctx.getAsmInfo()->binutilsIsAtLeast(2, 35);
+ if (!SupportsUnique) {
+ Flags &= ~ELF::SHF_MERGE;
+ EntrySize = 0;
+ return MCContext::GenericSectionID;
+ }
+
+ const bool SymbolMergeable = Flags & ELF::SHF_MERGE;
+ const bool SeenSectionNameBefore =
+ Ctx.isELFGenericMergeableSection(SectionName);
+ // If this is the first occurrence of this section name, treat it as the
+ // generic section.
+ if (!SymbolMergeable && !SeenSectionNameBefore)
+ return MCContext::GenericSectionID;
+
+ // Symbols must be placed into sections with compatible entry sizes. Generate
+ // unique sections for symbols that have not been assigned to compatible
+ // sections.
+ const auto PreviousID =
+ Ctx.getELFUniqueIDForEntsize(SectionName, Flags, EntrySize);
+ if (PreviousID)
+ return *PreviousID;
+
+ // If the user has specified the same section name as would be created
+ // implicitly for this symbol e.g. .rodata.str1.1, then we don't need
+ // to unique the section as the entry size for this symbol will be
+ // compatible with implicitly created sections.
+ SmallString<128> ImplicitSectionNameStem =
+ getELFSectionNameForGlobal(GO, Kind, Mang, TM, EntrySize, false);
+ if (SymbolMergeable &&
+ Ctx.isELFImplicitMergeableSectionNamePrefix(SectionName) &&
+ SectionName.startswith(ImplicitSectionNameStem))
+ return MCContext::GenericSectionID;
+
+ // We have seen this section name before, but with different flags or entry
+ // size. Create a new unique ID.
+ return NextUniqueID++;
+}
+
+static MCSection *selectExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM,
+ MCContext &Ctx, Mangler &Mang, unsigned &NextUniqueID,
+ bool Retain, bool ForceUnique) {
+ StringRef SectionName = GO->getSection();
+
+ // Check if a '#pragma clang section' name is applicable.
+ // Note that the pragma directive overrides -ffunction-sections and
+ // -fdata-sections, so the section name is exactly as the user specified and
+ // is not uniqued.
+ const GlobalVariable *GV = dyn_cast<GlobalVariable>(GO);
+ if (GV && GV->hasImplicitSection()) {
+ auto Attrs = GV->getAttributes();
+ if (Attrs.hasAttribute("bss-section") && Kind.isBSS()) {
+ SectionName = Attrs.getAttribute("bss-section").getValueAsString();
+ } else if (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly()) {
+ SectionName = Attrs.getAttribute("rodata-section").getValueAsString();
+ } else if (Attrs.hasAttribute("relro-section") && Kind.isReadOnlyWithRel()) {
+ SectionName = Attrs.getAttribute("relro-section").getValueAsString();
+ } else if (Attrs.hasAttribute("data-section") && Kind.isData()) {
+ SectionName = Attrs.getAttribute("data-section").getValueAsString();
+ }
+ }
+ const Function *F = dyn_cast<Function>(GO);
+ if (F && F->hasFnAttribute("implicit-section-name")) {
+ SectionName = F->getFnAttribute("implicit-section-name").getValueAsString();
+ }
+
+ // Infer section flags from the section name if we can.
+ Kind = getELFKindForNamedSection(SectionName, Kind);
+
+ StringRef Group = "";
+ bool IsComdat = false;
+ unsigned Flags = getELFSectionFlags(Kind);
+ if (const Comdat *C = getELFComdat(GO)) {
+ Group = C->getName();
+ IsComdat = C->getSelectionKind() == Comdat::Any;
+ Flags |= ELF::SHF_GROUP;
+ }
+
+ unsigned EntrySize = getEntrySizeForKind(Kind);
+ const unsigned UniqueID = calcUniqueIDUpdateFlagsAndSize(
+ GO, SectionName, Kind, TM, Ctx, Mang, Flags, EntrySize, NextUniqueID,
+ Retain, ForceUnique);
+
+ const MCSymbolELF *LinkedToSym = getLinkedToSymbol(GO, TM);
+ MCSectionELF *Section = Ctx.getELFSection(
+ SectionName, getELFSectionType(SectionName, Kind), Flags, EntrySize,
+ Group, IsComdat, UniqueID, LinkedToSym);
+ // Make sure that we did not get some other section with an incompatible
+ // sh_link. This should not be possible due to the UniqueID code above.
+ assert(Section->getLinkedToSymbol() == LinkedToSym &&
+ "Associated symbol mismatch between sections");
+
+ if (!(Ctx.getAsmInfo()->useIntegratedAssembler() ||
+ Ctx.getAsmInfo()->binutilsIsAtLeast(2, 35))) {
+ // If we are using GNU as before 2.35, then this symbol might have
+ // been placed in an incompatible mergeable section. Emit an error if this
+ // is the case to avoid creating broken output.
+ if ((Section->getFlags() & ELF::SHF_MERGE) &&
+ (Section->getEntrySize() != getEntrySizeForKind(Kind)))
+ GO->getContext().diagnose(LoweringDiagnosticInfo(
+ "Symbol '" + GO->getName() + "' from module '" +
+ (GO->getParent() ? GO->getParent()->getSourceFileName() : "unknown") +
+ "' required a section with entry-size=" +
+ Twine(getEntrySizeForKind(Kind)) + " but was placed in section '" +
+ SectionName + "' with entry-size=" + Twine(Section->getEntrySize()) +
+ ": Explicit assignment by pragma or attribute of an incompatible "
+ "symbol to this section?"));
+ }
+
+ return Section;
+}
+
+MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ return selectExplicitSectionGlobal(GO, Kind, TM, getContext(), getMangler(),
+ NextUniqueID, Used.count(GO),
+ /* ForceUnique = */false);
+}
+
+static MCSectionELF *selectELFSectionForGlobal(
+ MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
+ unsigned *NextUniqueID, const MCSymbolELF *AssociatedSymbol) {
+
+ StringRef Group = "";
+ bool IsComdat = false;
+ if (const Comdat *C = getELFComdat(GO)) {
+ Flags |= ELF::SHF_GROUP;
+ Group = C->getName();
+ IsComdat = C->getSelectionKind() == Comdat::Any;
+ }
+ if (isa<GlobalVariable>(GO)) {
+ if (TM.isLargeData()) {
+ assert(TM.getTargetTriple().getArch() == Triple::x86_64);
+ Flags |= ELF::SHF_X86_64_LARGE;
+ }
+ }
+
+ // Get the section entry size based on the kind.
+ unsigned EntrySize = getEntrySizeForKind(Kind);
+
+ bool UniqueSectionName = false;
+ unsigned UniqueID = MCContext::GenericSectionID;
+ if (EmitUniqueSection) {
+ if (TM.getUniqueSectionNames()) {
+ UniqueSectionName = true;
+ } else {
+ UniqueID = *NextUniqueID;
+ (*NextUniqueID)++;
+ }
+ }
+ SmallString<128> Name = getELFSectionNameForGlobal(
+ GO, Kind, Mang, TM, EntrySize, UniqueSectionName);
+
+ // Use 0 as the unique ID for execute-only text.
+ if (Kind.isExecuteOnly())
+ UniqueID = 0;
+ return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags,
+ EntrySize, Group, IsComdat, UniqueID,
+ AssociatedSymbol);
+}
+
+static MCSection *selectELFSectionForGlobal(
+ MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM, bool Retain, bool EmitUniqueSection,
+ unsigned Flags, unsigned *NextUniqueID) {
+ const MCSymbolELF *LinkedToSym = getLinkedToSymbol(GO, TM);
+ if (LinkedToSym) {
+ EmitUniqueSection = true;
+ Flags |= ELF::SHF_LINK_ORDER;
+ }
+ if (Retain) {
+ if (TM.getTargetTriple().isOSSolaris()) {
+ EmitUniqueSection = true;
+ Flags |= ELF::SHF_SUNW_NODISCARD;
+ } else if (Ctx.getAsmInfo()->useIntegratedAssembler() ||
+ Ctx.getAsmInfo()->binutilsIsAtLeast(2, 36)) {
+ EmitUniqueSection = true;
+ Flags |= ELF::SHF_GNU_RETAIN;
+ }
+ }
+
+ MCSectionELF *Section = selectELFSectionForGlobal(
+ Ctx, GO, Kind, Mang, TM, EmitUniqueSection, Flags,
+ NextUniqueID, LinkedToSym);
+ assert(Section->getLinkedToSymbol() == LinkedToSym);
+ return Section;
+}
+
+MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ unsigned Flags = getELFSectionFlags(Kind);
+
+ // If we have -ffunction-sections or -fdata-sections then we should emit the
+ // global value to a uniqued section specifically for it.
+ bool EmitUniqueSection = false;
+ if (!(Flags & ELF::SHF_MERGE) && !Kind.isCommon()) {
+ if (Kind.isText())
+ EmitUniqueSection = TM.getFunctionSections();
+ else
+ EmitUniqueSection = TM.getDataSections();
+ }
+ EmitUniqueSection |= GO->hasComdat();
+ return selectELFSectionForGlobal(getContext(), GO, Kind, getMangler(), TM,
+ Used.count(GO), EmitUniqueSection, Flags,
+ &NextUniqueID);
+}
+
+MCSection *TargetLoweringObjectFileELF::getUniqueSectionForFunction(
+ const Function &F, const TargetMachine &TM) const {
+ SectionKind Kind = SectionKind::getText();
+ unsigned Flags = getELFSectionFlags(Kind);
+ // If the function's section name is pre-determined via a pragma or a
+ // section attribute, call selectExplicitSectionGlobal.
+ if (F.hasSection() || F.hasFnAttribute("implicit-section-name"))
+ return selectExplicitSectionGlobal(
+ &F, Kind, TM, getContext(), getMangler(), NextUniqueID,
+ Used.count(&F), /* ForceUnique = */true);
+ else
+ return selectELFSectionForGlobal(
+ getContext(), &F, Kind, getMangler(), TM, Used.count(&F),
+ /*EmitUniqueSection=*/true, Flags, &NextUniqueID);
+}
+
+MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
+ const Function &F, const TargetMachine &TM) const {
+ // If the function can be removed, produce a unique section so that
+ // the table doesn't prevent the removal.
+ const Comdat *C = F.getComdat();
+ bool EmitUniqueSection = TM.getFunctionSections() || C;
+ if (!EmitUniqueSection)
+ return ReadOnlySection;
+
+ return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(),
+ getMangler(), TM, EmitUniqueSection,
+ ELF::SHF_ALLOC, &NextUniqueID,
+ /* AssociatedSymbol */ nullptr);
+}
+
+MCSection *TargetLoweringObjectFileELF::getSectionForLSDA(
+ const Function &F, const MCSymbol &FnSym, const TargetMachine &TM) const {
+ // If neither COMDAT nor function sections, use the monolithic LSDA section.
+ // Re-use this path if LSDASection is null as in the Arm EHABI.
+ if (!LSDASection || (!F.hasComdat() && !TM.getFunctionSections()))
+ return LSDASection;
+
+ const auto *LSDA = cast<MCSectionELF>(LSDASection);
+ unsigned Flags = LSDA->getFlags();
+ const MCSymbolELF *LinkedToSym = nullptr;
+ StringRef Group;
+ bool IsComdat = false;
+ if (const Comdat *C = getELFComdat(&F)) {
+ Flags |= ELF::SHF_GROUP;
+ Group = C->getName();
+ IsComdat = C->getSelectionKind() == Comdat::Any;
+ }
+ // Use SHF_LINK_ORDER to facilitate --gc-sections if we can use GNU ld>=2.36
+ // or LLD, which support mixed SHF_LINK_ORDER & non-SHF_LINK_ORDER.
+ if (TM.getFunctionSections() &&
+ (getContext().getAsmInfo()->useIntegratedAssembler() &&
+ getContext().getAsmInfo()->binutilsIsAtLeast(2, 36))) {
+ Flags |= ELF::SHF_LINK_ORDER;
+ LinkedToSym = cast<MCSymbolELF>(&FnSym);
+ }
+
+ // Append the function name as the suffix like GCC, assuming
+ // -funique-section-names applies to .gcc_except_table sections.
+ return getContext().getELFSection(
+ (TM.getUniqueSectionNames() ? LSDA->getName() + "." + F.getName()
+ : LSDA->getName()),
+ LSDA->getType(), Flags, 0, Group, IsComdat, MCSection::NonUniqueID,
+ LinkedToSym);
+}
+
+bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ // We can always create relative relocations, so use another section
+ // that can be marked non-executable.
+ return false;
+}
+
+/// Given a mergeable constant with the specified size and relocation
+/// information, return a section that it should be placed in.
+MCSection *TargetLoweringObjectFileELF::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ Align &Alignment) const {
+ if (Kind.isMergeableConst4() && MergeableConst4Section)
+ return MergeableConst4Section;
+ if (Kind.isMergeableConst8() && MergeableConst8Section)
+ return MergeableConst8Section;
+ if (Kind.isMergeableConst16() && MergeableConst16Section)
+ return MergeableConst16Section;
+ if (Kind.isMergeableConst32() && MergeableConst32Section)
+ return MergeableConst32Section;
+ if (Kind.isReadOnly())
+ return ReadOnlySection;
+
+ assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+ return DataRelROSection;
+}
+
+/// Returns a unique section for the given machine basic block.
+MCSection *TargetLoweringObjectFileELF::getSectionForMachineBasicBlock(
+ const Function &F, const MachineBasicBlock &MBB,
+ const TargetMachine &TM) const {
+ assert(MBB.isBeginSection() && "Basic block does not start a section!");
+ unsigned UniqueID = MCContext::GenericSectionID;
+
+ // For cold sections use the .text.split. prefix along with the parent
+ // function name. All cold blocks for the same function go to the same
+ // section. Similarly, all exception blocks are grouped by symbol name
+ // under the .text.eh. prefix. For regular sections, we either use a unique
+ // name, or a unique ID for the section.
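+ // For example, with the default cold-text prefix, a cold block of a function
+ // foo ends up in ".text.split.foo" and an exception block in ".text.eh.foo".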
+ SmallString<128> Name;
+ if (MBB.getSectionID() == MBBSectionID::ColdSectionID) {
+ Name += BBSectionsColdTextPrefix;
+ Name += MBB.getParent()->getName();
+ } else if (MBB.getSectionID() == MBBSectionID::ExceptionSectionID) {
+ Name += ".text.eh.";
+ Name += MBB.getParent()->getName();
+ } else {
+ Name += MBB.getParent()->getSection()->getName();
+ if (TM.getUniqueBasicBlockSectionNames()) {
+ if (!Name.endswith("."))
+ Name += ".";
+ Name += MBB.getSymbol()->getName();
+ } else {
+ UniqueID = NextUniqueID++;
+ }
+ }
+
+ unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
+ std::string GroupName;
+ if (F.hasComdat()) {
+ Flags |= ELF::SHF_GROUP;
+ GroupName = F.getComdat()->getName().str();
+ }
+ return getContext().getELFSection(Name, ELF::SHT_PROGBITS, Flags,
+ 0 /* Entry Size */, GroupName,
+ F.hasComdat(), UniqueID, nullptr);
+}
+
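+// For example, with .init_array a constructor at priority 101 is placed in
+// ".init_array.101" (the default priority 65535 uses plain ".init_array"),
+// while the legacy scheme places it in ".ctors.65434" because the numbering
+// is inverted.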
+static MCSectionELF *getStaticStructorSection(MCContext &Ctx, bool UseInitArray,
+ bool IsCtor, unsigned Priority,
+ const MCSymbol *KeySym) {
+ std::string Name;
+ unsigned Type;
+ unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE;
+ StringRef Comdat = KeySym ? KeySym->getName() : "";
+
+ if (KeySym)
+ Flags |= ELF::SHF_GROUP;
+
+ if (UseInitArray) {
+ if (IsCtor) {
+ Type = ELF::SHT_INIT_ARRAY;
+ Name = ".init_array";
+ } else {
+ Type = ELF::SHT_FINI_ARRAY;
+ Name = ".fini_array";
+ }
+ if (Priority != 65535) {
+ Name += '.';
+ Name += utostr(Priority);
+ }
+ } else {
+ // The default scheme is .ctors / .dtors, so we have to invert the priority
+ // numbering.
+ if (IsCtor)
+ Name = ".ctors";
+ else
+ Name = ".dtors";
+ if (Priority != 65535)
+ raw_string_ostream(Name) << format(".%05u", 65535 - Priority);
+ Type = ELF::SHT_PROGBITS;
+ }
+
+ return Ctx.getELFSection(Name, Type, Flags, 0, Comdat, /*IsComdat=*/true);
+}
+
+MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getStaticStructorSection(getContext(), UseInitArray, true, Priority,
+ KeySym);
+}
+
+MCSection *TargetLoweringObjectFileELF::getStaticDtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getStaticStructorSection(getContext(), UseInitArray, false, Priority,
+ KeySym);
+}
+
+const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const {
+ // We may only use a PLT-relative relocation to refer to unnamed_addr
+ // functions.
+ if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy())
+ return nullptr;
+
+ // Basic correctness checks.
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() ||
+ RHS->isThreadLocal())
+ return nullptr;
+
+ return MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(TM.getSymbol(LHS), PLTRelativeVariantKind,
+ getContext()),
+ MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext());
+}
+
+const MCExpr *TargetLoweringObjectFileELF::lowerDSOLocalEquivalent(
+ const DSOLocalEquivalent *Equiv, const TargetMachine &TM) const {
+ assert(supportDSOLocalEquivalentLowering());
+
+ const auto *GV = Equiv->getGlobalValue();
+
+ // A PLT entry is not needed for dso_local globals.
+ if (GV->isDSOLocal() || GV->isImplicitDSOLocal())
+ return MCSymbolRefExpr::create(TM.getSymbol(GV), getContext());
+
+ return MCSymbolRefExpr::create(TM.getSymbol(GV), PLTRelativeVariantKind,
+ getContext());
+}
+
+MCSection *TargetLoweringObjectFileELF::getSectionForCommandLines() const {
+ // Use ".GCC.command.line" since this feature is to support clang's
+ // -frecord-gcc-switches which in turn attempts to mimic GCC's switch of the
+ // same name.
+ return getContext().getELFSection(".GCC.command.line", ELF::SHT_PROGBITS,
+ ELF::SHF_MERGE | ELF::SHF_STRINGS, 1);
+}
+
+void
+TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) {
+ UseInitArray = UseInitArray_;
+ MCContext &Ctx = getContext();
+ if (!UseInitArray) {
+ StaticCtorSection = Ctx.getELFSection(".ctors", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE);
+
+ StaticDtorSection = Ctx.getELFSection(".dtors", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_WRITE);
+ return;
+ }
+
+ StaticCtorSection = Ctx.getELFSection(".init_array", ELF::SHT_INIT_ARRAY,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ StaticDtorSection = Ctx.getELFSection(".fini_array", ELF::SHT_FINI_ARRAY,
+ ELF::SHF_WRITE | ELF::SHF_ALLOC);
+}
+
+//===----------------------------------------------------------------------===//
+// MachO
+//===----------------------------------------------------------------------===//
+
+TargetLoweringObjectFileMachO::TargetLoweringObjectFileMachO() {
+ SupportIndirectSymViaGOTPCRel = true;
+}
+
+void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFile::Initialize(Ctx, TM);
+ if (TM.getRelocationModel() == Reloc::Static) {
+ StaticCtorSection = Ctx.getMachOSection("__TEXT", "__constructor", 0,
+ SectionKind::getData());
+ StaticDtorSection = Ctx.getMachOSection("__TEXT", "__destructor", 0,
+ SectionKind::getData());
+ } else {
+ StaticCtorSection = Ctx.getMachOSection("__DATA", "__mod_init_func",
+ MachO::S_MOD_INIT_FUNC_POINTERS,
+ SectionKind::getData());
+ StaticDtorSection = Ctx.getMachOSection("__DATA", "__mod_term_func",
+ MachO::S_MOD_TERM_FUNC_POINTERS,
+ SectionKind::getData());
+ }
+
+ PersonalityEncoding =
+ dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel;
+ TTypeEncoding =
+ dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+}
+
+MCSection *TargetLoweringObjectFileMachO::getStaticDtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ // In userspace, we lower global destructors via atexit(), but kernel/kext
+ // environments do not provide this function, so we still need to support the
+ // legacy way here.
+ // See the -disable-atexit-based-global-dtor-lowering CodeGen flag for more
+ // context.
+ return StaticDtorSection;
+}
+
+void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer,
+ Module &M) const {
+ // Emit the linker options if present.
+ if (auto *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) {
+ for (const auto *Option : LinkerOptions->operands()) {
+ SmallVector<std::string, 4> StrOptions;
+ for (const auto &Piece : cast<MDNode>(Option)->operands())
+ StrOptions.push_back(std::string(cast<MDString>(Piece)->getString()));
+ Streamer.emitLinkerOptions(StrOptions);
+ }
+ }
+
+ unsigned VersionVal = 0;
+ unsigned ImageInfoFlags = 0;
+ StringRef SectionVal;
+
+ GetObjCImageInfo(M, VersionVal, ImageInfoFlags, SectionVal);
+ emitCGProfileMetadata(Streamer, M);
+
+ // The section is mandatory. If we don't have it, then we don't have GC info.
+ if (SectionVal.empty())
+ return;
+
+ StringRef Segment, Section;
+ unsigned TAA = 0, StubSize = 0;
+ bool TAAParsed;
+ if (Error E = MCSectionMachO::ParseSectionSpecifier(
+ SectionVal, Segment, Section, TAA, TAAParsed, StubSize)) {
+ // If invalid, report the error with report_fatal_error.
+ report_fatal_error("Invalid section specifier '" + Section +
+ "': " + toString(std::move(E)) + ".");
+ }
+
+ // Get the section.
+ MCSectionMachO *S = getContext().getMachOSection(
+ Segment, Section, TAA, StubSize, SectionKind::getData());
+ Streamer.switchSection(S);
+ Streamer.emitLabel(getContext().
+ getOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO")));
+ Streamer.emitInt32(VersionVal);
+ Streamer.emitInt32(ImageInfoFlags);
+ Streamer.addBlankLine();
+}
+
+static void checkMachOComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return;
+
+ report_fatal_error("MachO doesn't support COMDATs, '" + C->getName() +
+ "' cannot be lowered.");
+}
+
+MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+
+ StringRef SectionName = GO->getSection();
+
+ const GlobalVariable *GV = dyn_cast<GlobalVariable>(GO);
+ if (GV && GV->hasImplicitSection()) {
+ auto Attrs = GV->getAttributes();
+ if (Attrs.hasAttribute("bss-section") && Kind.isBSS()) {
+ SectionName = Attrs.getAttribute("bss-section").getValueAsString();
+ } else if (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly()) {
+ SectionName = Attrs.getAttribute("rodata-section").getValueAsString();
+ } else if (Attrs.hasAttribute("relro-section") && Kind.isReadOnlyWithRel()) {
+ SectionName = Attrs.getAttribute("relro-section").getValueAsString();
+ } else if (Attrs.hasAttribute("data-section") && Kind.isData()) {
+ SectionName = Attrs.getAttribute("data-section").getValueAsString();
+ }
+ }
+
+ const Function *F = dyn_cast<Function>(GO);
+ if (F && F->hasFnAttribute("implicit-section-name")) {
+ SectionName = F->getFnAttribute("implicit-section-name").getValueAsString();
+ }
+
+ // Parse the section specifier and create it if valid.
+ StringRef Segment, Section;
+ unsigned TAA = 0, StubSize = 0;
+ bool TAAParsed;
+
+ checkMachOComdat(GO);
+
+ if (Error E = MCSectionMachO::ParseSectionSpecifier(
+ SectionName, Segment, Section, TAA, TAAParsed, StubSize)) {
+ // If invalid, report the error with report_fatal_error.
+ report_fatal_error("Global variable '" + GO->getName() +
+ "' has an invalid section specifier '" +
+ GO->getSection() + "': " + toString(std::move(E)) + ".");
+ }
+
+ // Get the section.
+ MCSectionMachO *S =
+ getContext().getMachOSection(Segment, Section, TAA, StubSize, Kind);
+
+ // If TAA wasn't set by ParseSectionSpecifier() above,
+ // use the value returned by getMachOSection() as a default.
+ if (!TAAParsed)
+ TAA = S->getTypeAndAttributes();
+
+ // Okay, now that we got the section, verify that the TAA & StubSize agree.
+ // If the user declared multiple globals with different section flags, we need
+ // to reject it here.
+ if (S->getTypeAndAttributes() != TAA || S->getStubSize() != StubSize) {
+ // If invalid, report the error with report_fatal_error.
+ report_fatal_error("Global variable '" + GO->getName() +
+ "' section type or attributes does not match previous"
+ " section specifier");
+ }
+
+ return S;
+}
+
+MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ checkMachOComdat(GO);
+
+ // Handle thread local data.
+ if (Kind.isThreadBSS()) return TLSBSSSection;
+ if (Kind.isThreadData()) return TLSDataSection;
+
+ if (Kind.isText())
+ return GO->isWeakForLinker() ? TextCoalSection : TextSection;
+
+ // If this is weak/linkonce, put this in a coalescable section, either in text
+ // or data depending on if it is writable.
+ if (GO->isWeakForLinker()) {
+ if (Kind.isReadOnly())
+ return ConstTextCoalSection;
+ if (Kind.isReadOnlyWithRel())
+ return ConstDataCoalSection;
+ return DataCoalSection;
+ }
+
+ // FIXME: Alignment check should be handled by section classifier.
+ if (Kind.isMergeable1ByteCString() &&
+ GO->getParent()->getDataLayout().getPreferredAlign(
+ cast<GlobalVariable>(GO)) < Align(32))
+ return CStringSection;
+
+ // Do not put 16-bit arrays in the UString section if they have an
+ // externally visible label; this runs into issues with certain linker
+ // versions.
+ if (Kind.isMergeable2ByteCString() && !GO->hasExternalLinkage() &&
+ GO->getParent()->getDataLayout().getPreferredAlign(
+ cast<GlobalVariable>(GO)) < Align(32))
+ return UStringSection;
+
+ // With MachO, only variables whose corresponding symbols start with 'l' or
+ // 'L' can be merged, so we only try merging GVs with private linkage.
+ if (GO->hasPrivateLinkage() && Kind.isMergeableConst()) {
+ if (Kind.isMergeableConst4())
+ return FourByteConstantSection;
+ if (Kind.isMergeableConst8())
+ return EightByteConstantSection;
+ if (Kind.isMergeableConst16())
+ return SixteenByteConstantSection;
+ }
+
+ // Otherwise, if it is readonly, but not something we can specially optimize,
+ // just drop it in .const.
+ if (Kind.isReadOnly())
+ return ReadOnlySection;
+
+ // If this is marked const, put it into a const section. But if the dynamic
+ // linker needs to write to it, put it in the data segment.
+ if (Kind.isReadOnlyWithRel())
+ return ConstDataSection;
+
+ // Put zero initialized globals with strong external linkage in the
+ // DATA, __common section with the .zerofill directive.
+ if (Kind.isBSSExtern())
+ return DataCommonSection;
+
+ // Put zero initialized globals with local linkage in __DATA,__bss directive
+ // with the .zerofill directive (aka .lcomm).
+ if (Kind.isBSSLocal())
+ return DataBSSSection;
+
+ // Otherwise, just drop the variable in the normal data section.
+ return DataSection;
+}
+
+MCSection *TargetLoweringObjectFileMachO::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ Align &Alignment) const {
+ // If this constant requires a relocation, we have to put it in the data
+ // segment, not in the text segment.
+ if (Kind.isData() || Kind.isReadOnlyWithRel())
+ return ConstDataSection;
+
+ if (Kind.isMergeableConst4())
+ return FourByteConstantSection;
+ if (Kind.isMergeableConst8())
+ return EightByteConstantSection;
+ if (Kind.isMergeableConst16())
+ return SixteenByteConstantSection;
+ return ReadOnlySection; // .const
+}
+
+MCSection *TargetLoweringObjectFileMachO::getSectionForCommandLines() const {
+ return getContext().getMachOSection("__TEXT", "__command_line", 0,
+ SectionKind::getReadOnly());
+}
+
+const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
+ MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ // The mach-o version of this method defaults to returning a stub reference.
+
+ if (Encoding & DW_EH_PE_indirect) {
+ MachineModuleInfoMachO &MachOMMI =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr", TM);
+
+ // Add information about the stub reference to MachOMMI so that the stub
+ // gets emitted by the asmprinter.
+ MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
+ if (!StubSym.getPointer()) {
+ MCSymbol *Sym = TM.getSymbol(GV);
+ StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+ }
+
+ return TargetLoweringObjectFile::
+ getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
+ Encoding & ~DW_EH_PE_indirect, Streamer);
+ }
+
+ return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM,
+ MMI, Streamer);
+}
+
+MCSymbol *TargetLoweringObjectFileMachO::getCFIPersonalitySymbol(
+ const GlobalValue *GV, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ // The mach-o version of this method defaults to returning a stub reference.
+ MachineModuleInfoMachO &MachOMMI =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr", TM);
+
+ // Add information about the stub reference to MachOMMI so that the stub
+ // gets emitted by the asmprinter.
+ MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
+ if (!StubSym.getPointer()) {
+ MCSymbol *Sym = TM.getSymbol(GV);
+ StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+ }
+
+ return SSym;
+}
+
+const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel(
+ const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV,
+ int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ // Although MachO 32-bit targets do not explicitly have a GOTPCREL relocation
+ // as 64-bit ones do, we replace the GOT equivalent by accessing the final symbol
+ // through a non_lazy_ptr stub instead. One advantage is that it allows the
+ // computation of deltas to final external symbols. Example:
+ //
+ // _extgotequiv:
+ // .long _extfoo
+ //
+ // _delta:
+ // .long _extgotequiv-_delta
+ //
+ // is transformed to:
+ //
+ // _delta:
+ // .long L_extfoo$non_lazy_ptr-(_delta+0)
+ //
+ // .section __IMPORT,__pointers,non_lazy_symbol_pointers
+ // L_extfoo$non_lazy_ptr:
+ // .indirect_symbol _extfoo
+ // .long 0
+ //
+ // The indirect symbol table (and sections of non_lazy_symbol_pointers type)
+ // may point to both local (same translation unit) and global (other
+ // translation units) symbols. Example:
+ //
+ // .section __DATA,__pointers,non_lazy_symbol_pointers
+ // L1:
+ // .indirect_symbol _myGlobal
+ // .long 0
+ // L2:
+ // .indirect_symbol _myLocal
+ // .long _myLocal
+ //
+ // If the symbol is local, instead of the symbol's index, the assembler
+ // places the constant INDIRECT_SYMBOL_LOCAL into the indirect symbol table.
+ // Then the linker will notice the constant in the table and will look at the
+ // content of the symbol.
+ MachineModuleInfoMachO &MachOMMI =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+ MCContext &Ctx = getContext();
+
+ // The offset must consider the original displacement from the base symbol
+ // since 32-bit targets don't have a GOTPCREL to fold the PC displacement.
+ Offset = -MV.getConstant();
+ const MCSymbol *BaseSym = &MV.getSymB()->getSymbol();
+
+ // Access the final symbol via sym$non_lazy_ptr and generate the appropriate
+ // non_lazy_ptr stubs.
+ SmallString<128> Name;
+ StringRef Suffix = "$non_lazy_ptr";
+ Name += MMI->getModule()->getDataLayout().getPrivateGlobalPrefix();
+ Name += Sym->getName();
+ Name += Suffix;
+ MCSymbol *Stub = Ctx.getOrCreateSymbol(Name);
+
+ MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(Stub);
+
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::StubValueTy(const_cast<MCSymbol *>(Sym),
+ !GV->hasLocalLinkage());
+
+ const MCExpr *BSymExpr =
+ MCSymbolRefExpr::create(BaseSym, MCSymbolRefExpr::VK_None, Ctx);
+ const MCExpr *LHS =
+ MCSymbolRefExpr::create(Stub, MCSymbolRefExpr::VK_None, Ctx);
+
+ if (!Offset)
+ return MCBinaryExpr::createSub(LHS, BSymExpr, Ctx);
+
+ const MCExpr *RHS =
+ MCBinaryExpr::createAdd(BSymExpr, MCConstantExpr::create(Offset, Ctx), Ctx);
+ return MCBinaryExpr::createSub(LHS, RHS, Ctx);
+}
+
+static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo,
+ const MCSection &Section) {
+ if (!AsmInfo.isSectionAtomizableBySymbols(Section))
+ return true;
+
+ // FIXME: we should be able to use private labels for sections that can't be
+ // dead-stripped (there's no issue with blocking atomization there), but `ld
+ // -r` sometimes drops the no_dead_strip attribute from sections so for safety
+ // we don't allow it.
+ return false;
+}
+
+void TargetLoweringObjectFileMachO::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const {
+ bool CannotUsePrivateLabel = true;
+ if (auto *GO = GV->getAliaseeObject()) {
+ SectionKind GOKind = TargetLoweringObjectFile::getKindForGlobal(GO, TM);
+ const MCSection *TheSection = SectionForGlobal(GO, GOKind, TM);
+ CannotUsePrivateLabel =
+ !canUsePrivateLabel(*TM.getMCAsmInfo(), *TheSection);
+ }
+ getMangler().getNameWithPrefix(OutName, GV, CannotUsePrivateLabel);
+}
+
+//===----------------------------------------------------------------------===//
+// COFF
+//===----------------------------------------------------------------------===//
+
+static unsigned
+getCOFFSectionFlags(SectionKind K, const TargetMachine &TM) {
+ unsigned Flags = 0;
+ bool isThumb = TM.getTargetTriple().getArch() == Triple::thumb;
+
+ if (K.isMetadata())
+ Flags |=
+ COFF::IMAGE_SCN_MEM_DISCARDABLE;
+ else if (K.isExclude())
+ Flags |=
+ COFF::IMAGE_SCN_LNK_REMOVE | COFF::IMAGE_SCN_MEM_DISCARDABLE;
+ else if (K.isText())
+ Flags |=
+ COFF::IMAGE_SCN_MEM_EXECUTE |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_CNT_CODE |
+ (isThumb ? COFF::IMAGE_SCN_MEM_16BIT : (COFF::SectionCharacteristics)0);
+ else if (K.isBSS())
+ Flags |=
+ COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE;
+ else if (K.isThreadLocal())
+ Flags |=
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE;
+ else if (K.isReadOnly() || K.isReadOnlyWithRel())
+ Flags |=
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ;
+ else if (K.isWriteable())
+ Flags |=
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE;
+
+ return Flags;
+}
+
+static const GlobalValue *getComdatGVForCOFF(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ assert(C && "expected GV to have a Comdat!");
+
+ StringRef ComdatGVName = C->getName();
+ const GlobalValue *ComdatGV = GV->getParent()->getNamedValue(ComdatGVName);
+ if (!ComdatGV)
+ report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
+ "' does not exist.");
+
+ if (ComdatGV->getComdat() != C)
+ report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
+ "' is not a key for its COMDAT.");
+
+ return ComdatGV;
+}
+
+static int getSelectionForCOFF(const GlobalValue *GV) {
+ if (const Comdat *C = GV->getComdat()) {
+ const GlobalValue *ComdatKey = getComdatGVForCOFF(GV);
+ if (const auto *GA = dyn_cast<GlobalAlias>(ComdatKey))
+ ComdatKey = GA->getAliaseeObject();
+ if (ComdatKey == GV) {
+ switch (C->getSelectionKind()) {
+ case Comdat::Any:
+ return COFF::IMAGE_COMDAT_SELECT_ANY;
+ case Comdat::ExactMatch:
+ return COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH;
+ case Comdat::Largest:
+ return COFF::IMAGE_COMDAT_SELECT_LARGEST;
+ case Comdat::NoDeduplicate:
+ return COFF::IMAGE_COMDAT_SELECT_NODUPLICATES;
+ case Comdat::SameSize:
+ return COFF::IMAGE_COMDAT_SELECT_SAME_SIZE;
+ }
+ } else {
+ return COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
+ }
+ }
+ return 0;
+}
+
+MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ int Selection = 0;
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
+ StringRef Name = GO->getSection();
+ StringRef COMDATSymName = "";
+ if (GO->hasComdat()) {
+ Selection = getSelectionForCOFF(GO);
+ const GlobalValue *ComdatGV;
+ if (Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+ ComdatGV = getComdatGVForCOFF(GO);
+ else
+ ComdatGV = GO;
+
+ if (!ComdatGV->hasPrivateLinkage()) {
+ MCSymbol *Sym = TM.getSymbol(ComdatGV);
+ COMDATSymName = Sym->getName();
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ } else {
+ Selection = 0;
+ }
+ }
+
+ return getContext().getCOFFSection(Name, Characteristics, Kind, COMDATSymName,
+ Selection);
+}
+
+static StringRef getCOFFSectionNameForUniqueGlobal(SectionKind Kind) {
+ if (Kind.isText())
+ return ".text";
+ if (Kind.isBSS())
+ return ".bss";
+ if (Kind.isThreadLocal())
+ return ".tls$";
+ if (Kind.isReadOnly() || Kind.isReadOnlyWithRel())
+ return ".rdata";
+ return ".data";
+}
+
+MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ // If we have -ffunction-sections then we should emit the global value to a
+ // uniqued section specifically for it.
+ bool EmitUniquedSection;
+ if (Kind.isText())
+ EmitUniquedSection = TM.getFunctionSections();
+ else
+ EmitUniquedSection = TM.getDataSections();
+
+ if ((EmitUniquedSection && !Kind.isCommon()) || GO->hasComdat()) {
+ SmallString<256> Name = getCOFFSectionNameForUniqueGlobal(Kind);
+
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
+
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ int Selection = getSelectionForCOFF(GO);
+ if (!Selection)
+ Selection = COFF::IMAGE_COMDAT_SELECT_NODUPLICATES;
+ const GlobalValue *ComdatGV;
+ if (GO->hasComdat())
+ ComdatGV = getComdatGVForCOFF(GO);
+ else
+ ComdatGV = GO;
+
+ unsigned UniqueID = MCContext::GenericSectionID;
+ if (EmitUniquedSection)
+ UniqueID = NextUniqueID++;
+
+ if (!ComdatGV->hasPrivateLinkage()) {
+ MCSymbol *Sym = TM.getSymbol(ComdatGV);
+ StringRef COMDATSymName = Sym->getName();
+
+ if (const auto *F = dyn_cast<Function>(GO))
+ if (std::optional<StringRef> Prefix = F->getSectionPrefix())
+ raw_svector_ostream(Name) << '$' << *Prefix;
+
+ // Append "$symbol" to the section name *before* IR-level mangling is
+ // applied when targeting mingw. This is what GCC does, and the ld.bfd
+ // COFF linker will not properly handle comdats otherwise.
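+ // For example, a function named foo then ends up in ".text$foo" rather
+ // than in a plain ".text" comdat.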
+ if (getContext().getTargetTriple().isWindowsGNUEnvironment())
+ raw_svector_ostream(Name) << '$' << ComdatGV->getName();
+
+ return getContext().getCOFFSection(Name, Characteristics, Kind,
+ COMDATSymName, Selection, UniqueID);
+ } else {
+ SmallString<256> TmpData;
+ getMangler().getNameWithPrefix(TmpData, GO, /*CannotUsePrivateLabel=*/true);
+ return getContext().getCOFFSection(Name, Characteristics, Kind, TmpData,
+ Selection, UniqueID);
+ }
+ }
+
+ if (Kind.isText())
+ return TextSection;
+
+ if (Kind.isThreadLocal())
+ return TLSDataSection;
+
+ if (Kind.isReadOnly() || Kind.isReadOnlyWithRel())
+ return ReadOnlySection;
+
+ // Note: we claim that common symbols are put in BSSSection, but they are
+ // really emitted with the magic .comm directive, which creates a symbol table
+ // entry but not a section.
+ if (Kind.isBSS() || Kind.isCommon())
+ return BSSSection;
+
+ return DataSection;
+}
+
+void TargetLoweringObjectFileCOFF::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const {
+ bool CannotUsePrivateLabel = false;
+ if (GV->hasPrivateLinkage() &&
+ ((isa<Function>(GV) && TM.getFunctionSections()) ||
+ (isa<GlobalVariable>(GV) && TM.getDataSections())))
+ CannotUsePrivateLabel = true;
+
+ getMangler().getNameWithPrefix(OutName, GV, CannotUsePrivateLabel);
+}
+
+MCSection *TargetLoweringObjectFileCOFF::getSectionForJumpTable(
+ const Function &F, const TargetMachine &TM) const {
+ // If the function can be removed, produce a unique section so that
+ // the table doesn't prevent the removal.
+ const Comdat *C = F.getComdat();
+ bool EmitUniqueSection = TM.getFunctionSections() || C;
+ if (!EmitUniqueSection)
+ return ReadOnlySection;
+
+ // FIXME: we should produce a symbol for F instead.
+ if (F.hasPrivateLinkage())
+ return ReadOnlySection;
+
+ MCSymbol *Sym = TM.getSymbol(&F);
+ StringRef COMDATSymName = Sym->getName();
+
+ SectionKind Kind = SectionKind::getReadOnly();
+ StringRef SecName = getCOFFSectionNameForUniqueGlobal(Kind);
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ unsigned UniqueID = NextUniqueID++;
+
+ return getContext().getCOFFSection(
+ SecName, Characteristics, Kind, COMDATSymName,
+ COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
+}
+
+bool TargetLoweringObjectFileCOFF::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ if (TM->getTargetTriple().getArch() == Triple::x86_64) {
+ if (!JumpTableInFunctionSection) {
+ // We can always create relative relocations, so use another section
+ // that can be marked non-executable.
+ return false;
+ }
+ }
+ return TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
+ UsesLabelDifference, F);
+}
+
+void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer,
+ Module &M) const {
+ emitLinkerDirectives(Streamer, M);
+
+ unsigned Version = 0;
+ unsigned Flags = 0;
+ StringRef Section;
+
+ GetObjCImageInfo(M, Version, Flags, Section);
+ if (!Section.empty()) {
+ auto &C = getContext();
+ auto *S = C.getCOFFSection(Section,
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ Streamer.switchSection(S);
+ Streamer.emitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
+ Streamer.emitInt32(Version);
+ Streamer.emitInt32(Flags);
+ Streamer.addBlankLine();
+ }
+
+ emitCGProfileMetadata(Streamer, M);
+}
+
+void TargetLoweringObjectFileCOFF::emitLinkerDirectives(
+ MCStreamer &Streamer, Module &M) const {
+ if (NamedMDNode *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) {
+ // Emit the linker options to the linker .drectve section. According to the
+ // spec, this section is a space-separated string containing flags for the
+ // linker.
+ MCSection *Sec = getDrectveSection();
+ Streamer.switchSection(Sec);
+ for (const auto *Option : LinkerOptions->operands()) {
+ for (const auto &Piece : cast<MDNode>(Option)->operands()) {
+ // Lead with a space for consistency with our dllexport implementation.
+ std::string Directive(" ");
+ Directive.append(std::string(cast<MDString>(Piece)->getString()));
+ Streamer.emitBytes(Directive);
+ }
+ }
+ }
+
+ // Emit /EXPORT: flags for each exported global as necessary.
+ std::string Flags;
+ for (const GlobalValue &GV : M.global_values()) {
+ raw_string_ostream OS(Flags);
+ emitLinkerFlagsForGlobalCOFF(OS, &GV, getContext().getTargetTriple(),
+ getMangler());
+ OS.flush();
+ if (!Flags.empty()) {
+ Streamer.switchSection(getDrectveSection());
+ Streamer.emitBytes(Flags);
+ }
+ Flags.clear();
+ }
+
+ // Emit /INCLUDE: flags for each used global as necessary.
+ if (const auto *LU = M.getNamedGlobal("llvm.used")) {
+ assert(LU->hasInitializer() && "expected llvm.used to have an initializer");
+ assert(isa<ArrayType>(LU->getValueType()) &&
+ "expected llvm.used to be an array type");
+ if (const auto *A = cast<ConstantArray>(LU->getInitializer())) {
+ for (const Value *Op : A->operands()) {
+ const auto *GV = cast<GlobalValue>(Op->stripPointerCasts());
+ // Global symbols with internal or private linkage are not visible to
+ // the linker, and thus would cause an error when the linker tried to
+ // preserve the symbol due to the `/include:` directive.
+ if (GV->hasLocalLinkage())
+ continue;
+
+ raw_string_ostream OS(Flags);
+ emitLinkerFlagsForUsedCOFF(OS, GV, getContext().getTargetTriple(),
+ getMangler());
+ OS.flush();
+
+ if (!Flags.empty()) {
+ Streamer.switchSection(getDrectveSection());
+ Streamer.emitBytes(Flags);
+ }
+ Flags.clear();
+ }
+ }
+ }
+}
+
+void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFile::Initialize(Ctx, TM);
+ this->TM = &TM;
+ const Triple &T = TM.getTargetTriple();
+ if (T.isWindowsMSVCEnvironment() || T.isWindowsItaniumEnvironment()) {
+ StaticCtorSection =
+ Ctx.getCOFFSection(".CRT$XCU", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ StaticDtorSection =
+ Ctx.getCOFFSection(".CRT$XTX", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ } else {
+ StaticCtorSection = Ctx.getCOFFSection(
+ ".ctors", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getData());
+ StaticDtorSection = Ctx.getCOFFSection(
+ ".dtors", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getData());
+ }
+}
+
+static MCSectionCOFF *getCOFFStaticStructorSection(MCContext &Ctx,
+ const Triple &T, bool IsCtor,
+ unsigned Priority,
+ const MCSymbol *KeySym,
+ MCSectionCOFF *Default) {
+ if (T.isWindowsMSVCEnvironment() || T.isWindowsItaniumEnvironment()) {
+ // If the priority is the default, use .CRT$XCU, possibly associative.
+ if (Priority == 65535)
+ return Ctx.getAssociativeCOFFSection(Default, KeySym, 0);
+
+ // Otherwise, we need to compute a new section name. Low priorities should
+ // run earlier. The linker will sort sections ASCII-betically, and we need a
+ // string that sorts between .CRT$XCA and .CRT$XCU. In the general case, we
+ // make a name like ".CRT$XCT12345", since that runs before .CRT$XCU. Really
+ // low priorities need to sort before 'L', since the CRT uses that
+ // internally, so we use ".CRT$XCA00001" for them. We have a contract with
+ // the frontend that "init_seg(compiler)" corresponds to priority 200 and
+ // "init_seg(lib)" corresponds to priority 400, and those respectively use
+ // 'C' and 'L' without the priority suffix. Priorities between 200 and 400
+ // use 'C' with the priority as a suffix.
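+ // For example, a constructor at priority 300 goes to ".CRT$XCC00300", one
+ // at priority 200 to ".CRT$XCC", one at priority 400 to ".CRT$XCL", and one
+ // at priority 150 to ".CRT$XCA00150".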
+ SmallString<24> Name;
+ char LastLetter = 'T';
+ bool AddPrioritySuffix = Priority != 200 && Priority != 400;
+ if (Priority < 200)
+ LastLetter = 'A';
+ else if (Priority < 400)
+ LastLetter = 'C';
+ else if (Priority == 400)
+ LastLetter = 'L';
+ raw_svector_ostream OS(Name);
+ OS << ".CRT$X" << (IsCtor ? "C" : "T") << LastLetter;
+ if (AddPrioritySuffix)
+ OS << format("%05u", Priority);
+ MCSectionCOFF *Sec = Ctx.getCOFFSection(
+ Name, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ return Ctx.getAssociativeCOFFSection(Sec, KeySym, 0);
+ }
+
+ std::string Name = IsCtor ? ".ctors" : ".dtors";
+ if (Priority != 65535)
+ raw_string_ostream(Name) << format(".%05u", 65535 - Priority);
+
+ return Ctx.getAssociativeCOFFSection(
+ Ctx.getCOFFSection(Name, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getData()),
+ KeySym, 0);
+}
+
+MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getCOFFStaticStructorSection(
+ getContext(), getContext().getTargetTriple(), true, Priority, KeySym,
+ cast<MCSectionCOFF>(StaticCtorSection));
+}
+
+MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getCOFFStaticStructorSection(
+ getContext(), getContext().getTargetTriple(), false, Priority, KeySym,
+ cast<MCSectionCOFF>(StaticDtorSection));
+}
+
+const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const {
+ const Triple &T = TM.getTargetTriple();
+ if (T.isOSCygMing())
+ return nullptr;
+
+ // Our symbols should exist in address space zero; cowardly no-op
+ // otherwise.
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0)
+ return nullptr;
+
+ // Both ptrtoint instructions must wrap global objects:
+ // - Only global variables are eligible for image relative relocations.
+ // - The subtrahend refers to the special symbol __ImageBase, a GlobalVariable.
+ // We expect __ImageBase to be a global variable without a section, externally
+ // defined.
+ //
+ // It should look something like this: @__ImageBase = external constant i8
+ if (!isa<GlobalObject>(LHS) || !isa<GlobalVariable>(RHS) ||
+ LHS->isThreadLocal() || RHS->isThreadLocal() ||
+ RHS->getName() != "__ImageBase" || !RHS->hasExternalLinkage() ||
+ cast<GlobalVariable>(RHS)->hasInitializer() || RHS->hasSection())
+ return nullptr;
+
+ return MCSymbolRefExpr::create(TM.getSymbol(LHS),
+ MCSymbolRefExpr::VK_COFF_IMGREL32,
+ getContext());
+}
+
+static std::string APIntToHexString(const APInt &AI) {
+ unsigned Width = (AI.getBitWidth() / 8) * 2;
+ std::string HexString = toString(AI, 16, /*Signed=*/false);
+ llvm::transform(HexString, HexString.begin(), tolower);
+ unsigned Size = HexString.size();
+ assert(Width >= Size && "hex string is too large!");
+ HexString.insert(HexString.begin(), Width - Size, '0');
+
+ return HexString;
+}
+
+static std::string scalarConstantToHexString(const Constant *C) {
+ Type *Ty = C->getType();
+ if (isa<UndefValue>(C)) {
+ return APIntToHexString(APInt::getZero(Ty->getPrimitiveSizeInBits()));
+ } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) {
+ return APIntToHexString(CFP->getValueAPF().bitcastToAPInt());
+ } else if (const auto *CI = dyn_cast<ConstantInt>(C)) {
+ return APIntToHexString(CI->getValue());
+ } else {
+ unsigned NumElements;
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ NumElements = cast<FixedVectorType>(VTy)->getNumElements();
+ else
+ NumElements = Ty->getArrayNumElements();
+ std::string HexString;
+ for (int I = NumElements - 1, E = -1; I != E; --I)
+ HexString += scalarConstantToHexString(C->getAggregateElement(I));
+ return HexString;
+ }
+}
+
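+// For example, the float constant 1.0f hashes to "3f800000", so the comdat
+// below is named "__real@3f800000"; a <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// vector becomes "__xmm@00000004000000030000000200000001".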
+MCSection *TargetLoweringObjectFileCOFF::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ Align &Alignment) const {
+ if (Kind.isMergeableConst() && C &&
+ getContext().getAsmInfo()->hasCOFFComdatConstants()) {
+ // This creates comdat sections with the given symbol name, but unless
+ // AsmPrinter::GetCPISymbol actually makes the symbol global, the symbol
+ // will be created with a null storage class, which makes GNU binutils
+ // error out.
+ const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_LNK_COMDAT;
+ std::string COMDATSymName;
+ if (Kind.isMergeableConst4()) {
+ if (Alignment <= 4) {
+ COMDATSymName = "__real@" + scalarConstantToHexString(C);
+ Alignment = Align(4);
+ }
+ } else if (Kind.isMergeableConst8()) {
+ if (Alignment <= 8) {
+ COMDATSymName = "__real@" + scalarConstantToHexString(C);
+ Alignment = Align(8);
+ }
+ } else if (Kind.isMergeableConst16()) {
+ // FIXME: These may not be appropriate for non-x86 architectures.
+ if (Alignment <= 16) {
+ COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
+ Alignment = Align(16);
+ }
+ } else if (Kind.isMergeableConst32()) {
+ if (Alignment <= 32) {
+ COMDATSymName = "__ymm@" + scalarConstantToHexString(C);
+ Alignment = Align(32);
+ }
+ }
+
+ if (!COMDATSymName.empty())
+ return getContext().getCOFFSection(".rdata", Characteristics, Kind,
+ COMDATSymName,
+ COFF::IMAGE_COMDAT_SELECT_ANY);
+ }
+
+ return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C,
+ Alignment);
+}
+
+//===----------------------------------------------------------------------===//
+// Wasm
+//===----------------------------------------------------------------------===//
+
+static const Comdat *getWasmComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return nullptr;
+
+ if (C->getSelectionKind() != Comdat::Any)
+ report_fatal_error("WebAssembly COMDATs only support "
+ "SelectionKind::Any, '" + C->getName() + "' cannot be "
+ "lowered.");
+
+ return C;
+}
+
+static unsigned getWasmSectionFlags(SectionKind K) {
+ unsigned Flags = 0;
+
+ if (K.isThreadLocal())
+ Flags |= wasm::WASM_SEG_FLAG_TLS;
+
+ if (K.isMergeableCString())
+ Flags |= wasm::WASM_SEG_FLAG_STRINGS;
+
+ // TODO(sbc): Add support for K.isMergeableConst()
+
+ return Flags;
+}
+
+MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ // We don't support explicit section names for functions in the wasm object
+ // format. Each function has to be in its own unique section.
+ if (isa<Function>(GO)) {
+ return SelectSectionForGlobal(GO, Kind, TM);
+ }
+
+ StringRef Name = GO->getSection();
+
+ // Certain data sections we treat as named custom sections rather than
+ // segments within the data section.
+ // This could be avoided if all data segments (in the wasm sense) were
+ // represented as their own sections (in the llvm sense).
+ // TODO(sbc): https://github.com/WebAssembly/tool-conventions/issues/138
+ if (Name == ".llvmcmd" || Name == ".llvmbc")
+ Kind = SectionKind::getMetadata();
+
+ StringRef Group = "";
+ if (const Comdat *C = getWasmComdat(GO)) {
+ Group = C->getName();
+ }
+
+ unsigned Flags = getWasmSectionFlags(Kind);
+ MCSectionWasm *Section = getContext().getWasmSection(
+ Name, Kind, Flags, Group, MCContext::GenericSectionID);
+
+ return Section;
+}
+
+static MCSectionWasm *selectWasmSectionForGlobal(
+ MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM, bool EmitUniqueSection, unsigned *NextUniqueID) {
+ StringRef Group = "";
+ if (const Comdat *C = getWasmComdat(GO)) {
+ Group = C->getName();
+ }
+
+ bool UniqueSectionNames = TM.getUniqueSectionNames();
+ SmallString<128> Name = getSectionPrefixForGlobal(Kind, /*IsLarge=*/false);
+
+ if (const auto *F = dyn_cast<Function>(GO)) {
+ const auto &OptionalPrefix = F->getSectionPrefix();
+ if (OptionalPrefix)
+ raw_svector_ostream(Name) << '.' << *OptionalPrefix;
+ }
+
+ if (EmitUniqueSection && UniqueSectionNames) {
+ Name.push_back('.');
+ TM.getNameWithPrefix(Name, GO, Mang, true);
+ }
+ unsigned UniqueID = MCContext::GenericSectionID;
+ if (EmitUniqueSection && !UniqueSectionNames) {
+ UniqueID = *NextUniqueID;
+ (*NextUniqueID)++;
+ }
+
+ unsigned Flags = getWasmSectionFlags(Kind);
+ return Ctx.getWasmSection(Name, Kind, Flags, Group, UniqueID);
+}
+
+MCSection *TargetLoweringObjectFileWasm::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+
+ if (Kind.isCommon())
+ report_fatal_error("mergeable sections not supported yet on wasm");
+
+ // If we have -ffunction-section or -fdata-section then we should emit the
+ // global value to a uniqued section specifically for it.
+ bool EmitUniqueSection = false;
+ if (Kind.isText())
+ EmitUniqueSection = TM.getFunctionSections();
+ else
+ EmitUniqueSection = TM.getDataSections();
+ EmitUniqueSection |= GO->hasComdat();
+
+ return selectWasmSectionForGlobal(getContext(), GO, Kind, getMangler(), TM,
+ EmitUniqueSection, &NextUniqueID);
+}
+
+bool TargetLoweringObjectFileWasm::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ // We can always create relative relocations, so use another section
+ // that can be marked non-executable.
+ return false;
+}
+
+const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const {
+ // We may only use a PLT-relative relocation to refer to unnamed_addr
+ // functions.
+ if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy())
+ return nullptr;
+
+ // Basic correctness checks.
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() ||
+ RHS->isThreadLocal())
+ return nullptr;
+
+ return MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(TM.getSymbol(LHS), MCSymbolRefExpr::VK_None,
+ getContext()),
+ MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext());
+}
+
+void TargetLoweringObjectFileWasm::InitializeWasm() {
+ StaticCtorSection =
+ getContext().getWasmSection(".init_array", SectionKind::getData());
+
+ // We don't use PersonalityEncoding and LSDAEncoding because we don't emit
+ // .cfi directives. We use TTypeEncoding to encode typeinfo global variables.
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+}
+
+MCSection *TargetLoweringObjectFileWasm::getStaticCtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return Priority == UINT16_MAX ?
+ StaticCtorSection :
+ getContext().getWasmSection(".init_array." + utostr(Priority),
+ SectionKind::getData());
+}
+
+MCSection *TargetLoweringObjectFileWasm::getStaticDtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ report_fatal_error("@llvm.global_dtors should have been lowered already");
+}
+
+//===----------------------------------------------------------------------===//
+// XCOFF
+//===----------------------------------------------------------------------===//
+bool TargetLoweringObjectFileXCOFF::ShouldEmitEHBlock(
+ const MachineFunction *MF) {
+ if (!MF->getLandingPads().empty())
+ return true;
+
+ const Function &F = MF->getFunction();
+ if (!F.hasPersonalityFn() || !F.needsUnwindTableEntry())
+ return false;
+
+ const GlobalValue *Per =
+ dyn_cast<GlobalValue>(F.getPersonalityFn()->stripPointerCasts());
+ assert(Per && "Personality routine is not a GlobalValue type.");
+ if (isNoOpWithoutInvoke(classifyEHPersonality(Per)))
+ return false;
+
+ return true;
+}
+
+bool TargetLoweringObjectFileXCOFF::ShouldSetSSPCanaryBitInTB(
+ const MachineFunction *MF) {
+ const Function &F = MF->getFunction();
+ if (!F.hasStackProtectorFnAttr())
+ return false;
+ // FIXME: check presence of canary word
+ // There are cases that the stack protectors are not really inserted even if
+ // the attributes are on.
+ return true;
+}
+
+MCSymbol *
+TargetLoweringObjectFileXCOFF::getEHInfoTableSymbol(const MachineFunction *MF) {
+ return MF->getMMI().getContext().getOrCreateSymbol(
+ "__ehinfo." + Twine(MF->getFunctionNumber()));
+}
+
+MCSymbol *
+TargetLoweringObjectFileXCOFF::getTargetSymbol(const GlobalValue *GV,
+ const TargetMachine &TM) const {
+ // We always use a qualname symbol for a GV that represents
+ // a declaration, a function descriptor, or a common symbol.
+ // If a GV represents a GlobalVariable and -fdata-sections is enabled, we
+ // also return a qualname so that a label symbol could be avoided.
+ // It is inherently ambiguous when the GO represents the address of a
+ // function, as the GO could either represent a function descriptor or a
+ // function entry point. We choose to always return a function descriptor
+ // here.
+ if (const GlobalObject *GO = dyn_cast<GlobalObject>(GV)) {
+ if (GO->isDeclarationForLinker())
+ return cast<MCSectionXCOFF>(getSectionForExternalReference(GO, TM))
+ ->getQualNameSymbol();
+
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->hasAttribute("toc-data"))
+ return cast<MCSectionXCOFF>(
+ SectionForGlobal(GVar, SectionKind::getData(), TM))
+ ->getQualNameSymbol();
+
+ SectionKind GOKind = getKindForGlobal(GO, TM);
+ if (GOKind.isText())
+ return cast<MCSectionXCOFF>(
+ getSectionForFunctionDescriptor(cast<Function>(GO), TM))
+ ->getQualNameSymbol();
+ if ((TM.getDataSections() && !GO->hasSection()) || GO->hasCommonLinkage() ||
+ GOKind.isBSSLocal() || GOKind.isThreadBSSLocal())
+ return cast<MCSectionXCOFF>(SectionForGlobal(GO, GOKind, TM))
+ ->getQualNameSymbol();
+ }
+
+ // For all other cases, fall back to getSymbol to return the unqualified name.
+ return nullptr;
+}
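+
+// Hedged illustration (not upstream text): with -fdata-sections enabled, a
+// defined global variable "bar" without an explicit section resolves above to
+// the qualname symbol of its csect (written as "bar[RW]" in XCOFF listing
+// notation, an assumed example), while cases not covered above fall back to
+// the unqualified name via getSymbol().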
+
+MCSection *TargetLoweringObjectFileXCOFF::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ if (!GO->hasSection())
+ report_fatal_error("#pragma clang section is not yet supported");
+
+ StringRef SectionName = GO->getSection();
+
+ // Handle the XCOFF::TD case first, then deal with the rest.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GO))
+ if (GVar->hasAttribute("toc-data"))
+ return getContext().getXCOFFSection(
+ SectionName, Kind,
+ XCOFF::CsectProperties(/*MappingClass*/ XCOFF::XMC_TD, XCOFF::XTY_SD),
+ /* MultiSymbolsAllowed*/ true);
+
+ XCOFF::StorageMappingClass MappingClass;
+ if (Kind.isText())
+ MappingClass = XCOFF::XMC_PR;
+ else if (Kind.isData() || Kind.isBSS())
+ MappingClass = XCOFF::XMC_RW;
+ else if (Kind.isReadOnlyWithRel())
+ MappingClass =
+ TM.Options.XCOFFReadOnlyPointers ? XCOFF::XMC_RO : XCOFF::XMC_RW;
+ else if (Kind.isReadOnly())
+ MappingClass = XCOFF::XMC_RO;
+ else
+ report_fatal_error("XCOFF other section types not yet implemented.");
+
+ return getContext().getXCOFFSection(
+ SectionName, Kind, XCOFF::CsectProperties(MappingClass, XCOFF::XTY_SD),
+ /* MultiSymbolsAllowed*/ true);
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForExternalReference(
+ const GlobalObject *GO, const TargetMachine &TM) const {
+ assert(GO->isDeclarationForLinker() &&
+ "Tried to get ER section for a defined global.");
+
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+
+ XCOFF::StorageMappingClass SMC =
+ isa<Function>(GO) ? XCOFF::XMC_DS : XCOFF::XMC_UA;
+ if (GO->isThreadLocal())
+ SMC = XCOFF::XMC_UL;
+
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GO))
+ if (GVar->hasAttribute("toc-data"))
+ SMC = XCOFF::XMC_TD;
+
+ // Externals go into a csect of type ER.
+ return getContext().getXCOFFSection(
+ Name, SectionKind::getMetadata(),
+ XCOFF::CsectProperties(SMC, XCOFF::XTY_ER));
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ // Handle the XCOFF::TD case first, then deal with the rest.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GO))
+ if (GVar->hasAttribute("toc-data")) {
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+ return getContext().getXCOFFSection(
+ Name, Kind, XCOFF::CsectProperties(XCOFF::XMC_TD, XCOFF::XTY_SD),
+ /* MultiSymbolsAllowed*/ true);
+ }
+
+ // Common symbols go into a csect with a matching name, which will get mapped
+ // into the .bss section.
+ // Zero-initialized local TLS symbols go into a csect with a matching name,
+ // which will get mapped into the .tbss section.
+ if (Kind.isBSSLocal() || GO->hasCommonLinkage() || Kind.isThreadBSSLocal()) {
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+ XCOFF::StorageMappingClass SMC = Kind.isBSSLocal() ? XCOFF::XMC_BS
+ : Kind.isCommon() ? XCOFF::XMC_RW
+ : XCOFF::XMC_UL;
+ return getContext().getXCOFFSection(
+ Name, Kind, XCOFF::CsectProperties(SMC, XCOFF::XTY_CM));
+ }
+
+ if (Kind.isText()) {
+ if (TM.getFunctionSections()) {
+ return cast<MCSymbolXCOFF>(getFunctionEntryPointSymbol(GO, TM))
+ ->getRepresentedCsect();
+ }
+ return TextSection;
+ }
+
+ if (TM.Options.XCOFFReadOnlyPointers && Kind.isReadOnlyWithRel()) {
+ if (!TM.getDataSections())
+ report_fatal_error(
+ "ReadOnlyPointers is supported only if data sections is turned on");
+
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+ return getContext().getXCOFFSection(
+ Name, SectionKind::getReadOnly(),
+ XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD));
+ }
+
+ // For BSS kind, zero-initialized data must be emitted to the .data section
+ // because external linkage control sections that get mapped to the .bss
+ // section will be linked as tentative definitions, which is only appropriate
+ // for SectionKind::Common.
+ if (Kind.isData() || Kind.isReadOnlyWithRel() || Kind.isBSS()) {
+ if (TM.getDataSections()) {
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+ return getContext().getXCOFFSection(
+ Name, SectionKind::getData(),
+ XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD));
+ }
+ return DataSection;
+ }
+
+ if (Kind.isReadOnly()) {
+ if (TM.getDataSections()) {
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+ return getContext().getXCOFFSection(
+ Name, SectionKind::getReadOnly(),
+ XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD));
+ }
+ return ReadOnlySection;
+ }
+
+ // External/weak TLS data and initialized local TLS data are not eligible
+ // to be put into common csect. If data sections are enabled, thread
+ // data are emitted into separate sections. Otherwise, thread data
+ // are emitted into the .tdata section.
+ if (Kind.isThreadLocal()) {
+ if (TM.getDataSections()) {
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+ return getContext().getXCOFFSection(
+ Name, Kind, XCOFF::CsectProperties(XCOFF::XMC_TL, XCOFF::XTY_SD));
+ }
+ return TLSDataSection;
+ }
+
+ report_fatal_error("XCOFF other section types not yet implemented.");
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForJumpTable(
+ const Function &F, const TargetMachine &TM) const {
+ assert(!F.getComdat() && "Comdat not supported on XCOFF.");
+
+ if (!TM.getFunctionSections())
+ return ReadOnlySection;
+
+ // If the function can be removed, produce a unique section so that
+ // the table doesn't prevent the removal.
+ SmallString<128> NameStr(".rodata.jmp..");
+ getNameWithPrefix(NameStr, &F, TM);
+ return getContext().getXCOFFSection(
+ NameStr, SectionKind::getReadOnly(),
+ XCOFF::CsectProperties(XCOFF::XMC_RO, XCOFF::XTY_SD));
+}
+
+bool TargetLoweringObjectFileXCOFF::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ return false;
+}
+
+/// Given a mergeable constant with the specified size and relocation
+/// information, return a section that it should be placed in.
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ Align &Alignment) const {
+ // TODO: Enable emitting constant pool entries to unique sections when we
+ // support it.
+ if (Alignment > Align(16))
+ report_fatal_error("Alignments greater than 16 not yet supported.");
+
+ if (Alignment == Align(8)) {
+ assert(ReadOnly8Section && "Section should always be initialized.");
+ return ReadOnly8Section;
+ }
+
+ if (Alignment == Align(16)) {
+ assert(ReadOnly16Section && "Section should always be initialized.");
+ return ReadOnly16Section;
+ }
+
+ return ReadOnlySection;
+}
+
+void TargetLoweringObjectFileXCOFF::Initialize(MCContext &Ctx,
+ const TargetMachine &TgtM) {
+ TargetLoweringObjectFile::Initialize(Ctx, TgtM);
+ TTypeEncoding =
+ dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_datarel |
+ (TgtM.getTargetTriple().isArch32Bit() ? dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_sdata8);
+ PersonalityEncoding = 0;
+ LSDAEncoding = 0;
+ CallSiteEncoding = dwarf::DW_EH_PE_udata4;
+
+ // Debug support for thread-local variable locations is not ready on AIX. In
+ // integrated-as mode, the relocatable address for a thread-local variable
+ // also causes a linker error. So disable location attribute generation for
+ // thread-local variables for now.
+ // FIXME: when TLS debug on AIX is ready, remove this setting.
+ SupportDebugThreadLocalLocation = false;
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getStaticCtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ report_fatal_error("no static constructor section on AIX");
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getStaticDtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ report_fatal_error("no static destructor section on AIX");
+}
+
+const MCExpr *TargetLoweringObjectFileXCOFF::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const {
+ /* Not implemented yet, but don't crash, return nullptr. */
+ return nullptr;
+}
+
+XCOFF::StorageClass
+TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(const GlobalValue *GV) {
+ assert(!isa<GlobalIFunc>(GV) && "GlobalIFunc is not supported on AIX.");
+
+ switch (GV->getLinkage()) {
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ return XCOFF::C_HIDEXT;
+ case GlobalValue::ExternalLinkage:
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::AvailableExternallyLinkage:
+ return XCOFF::C_EXT;
+ case GlobalValue::ExternalWeakLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ return XCOFF::C_WEAKEXT;
+ case GlobalValue::AppendingLinkage:
+ report_fatal_error(
+ "There is no mapping that implements AppendingLinkage for XCOFF.");
+ }
+ llvm_unreachable("Unknown linkage type!");
+}
+
+MCSymbol *TargetLoweringObjectFileXCOFF::getFunctionEntryPointSymbol(
+ const GlobalValue *Func, const TargetMachine &TM) const {
+ assert((isa<Function>(Func) ||
+ (isa<GlobalAlias>(Func) &&
+ isa_and_nonnull<Function>(
+ cast<GlobalAlias>(Func)->getAliaseeObject()))) &&
+ "Func must be a function or an alias which has a function as base "
+ "object.");
+
+ SmallString<128> NameStr;
+ NameStr.push_back('.');
+ getNameWithPrefix(NameStr, Func, TM);
+
+ // When -function-sections is enabled and no explicit section is specified,
+ // it is no longer necessary to emit a function entry point label; the
+ // function entry point csect is used instead. For function declarations, the
+ // undefined symbol is treated as a csect with the XTY_ER property.
+ if (((TM.getFunctionSections() && !Func->hasSection()) ||
+ Func->isDeclarationForLinker()) &&
+ isa<Function>(Func)) {
+ return getContext()
+ .getXCOFFSection(
+ NameStr, SectionKind::getText(),
+ XCOFF::CsectProperties(XCOFF::XMC_PR, Func->isDeclarationForLinker()
+ ? XCOFF::XTY_ER
+ : XCOFF::XTY_SD))
+ ->getQualNameSymbol();
+ }
+
+ return getContext().getOrCreateSymbol(NameStr);
+}
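+
+// Editorial example (a sketch following the AIX naming convention assumed
+// here): for a function named foo, the entry-point symbol produced above is
+// ".foo", while the descriptor csect created by getSectionForFunctionDescriptor
+// below keeps the plain name "foo".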
+
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForFunctionDescriptor(
+ const Function *F, const TargetMachine &TM) const {
+ SmallString<128> NameStr;
+ getNameWithPrefix(NameStr, F, TM);
+ return getContext().getXCOFFSection(
+ NameStr, SectionKind::getData(),
+ XCOFF::CsectProperties(XCOFF::XMC_DS, XCOFF::XTY_SD));
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
+ const MCSymbol *Sym, const TargetMachine &TM) const {
+ // Use TE storage-mapping class when large code model is enabled so that
+ // the chance of needing -bbigtoc is decreased.
+ return getContext().getXCOFFSection(
+ cast<MCSymbolXCOFF>(Sym)->getSymbolTableName(), SectionKind::getData(),
+ XCOFF::CsectProperties(
+ TM.getCodeModel() == CodeModel::Large ? XCOFF::XMC_TE : XCOFF::XMC_TC,
+ XCOFF::XTY_SD));
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForLSDA(
+ const Function &F, const MCSymbol &FnSym, const TargetMachine &TM) const {
+ auto *LSDA = cast<MCSectionXCOFF>(LSDASection);
+ if (TM.getFunctionSections()) {
+ // If option -ffunction-sections is on, append the function name to the
+ // name of the LSDA csect so that each function has its own LSDA csect.
+ // This helps the linker to garbage-collect EH info of unused functions.
+ SmallString<128> NameStr = LSDA->getName();
+ raw_svector_ostream(NameStr) << '.' << F.getName();
+ LSDA = getContext().getXCOFFSection(NameStr, LSDA->getKind(),
+ LSDA->getCsectProp());
+ }
+ return LSDA;
+}
+//===----------------------------------------------------------------------===//
+// GOFF
+//===----------------------------------------------------------------------===//
+TargetLoweringObjectFileGOFF::TargetLoweringObjectFileGOFF() = default;
+
+MCSection *TargetLoweringObjectFileGOFF::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ return SelectSectionForGlobal(GO, Kind, TM);
+}
+
+MCSection *TargetLoweringObjectFileGOFF::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ auto *Symbol = TM.getSymbol(GO);
+ if (Kind.isBSS())
+ return getContext().getGOFFSection(Symbol->getName(), SectionKind::getBSS(),
+ nullptr, nullptr);
+
+ return getContext().getObjectFileInfo()->getTextSection();
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetOptionsImpl.cpp
new file mode 100644
index 000000000000..af5d10103f78
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -0,0 +1,56 @@
+//===-- TargetOptionsImpl.cpp - Options that apply to all targets ----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the methods in the TargetOptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+/// DisableFramePointerElim - This returns true if frame pointer elimination
+/// optimization should be disabled for the given machine function.
+bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
+ // Check to see if the target wants to forcibly keep the frame pointer.
+ if (MF.getSubtarget().getFrameLowering()->keepFramePointer(MF))
+ return true;
+
+ const Function &F = MF.getFunction();
+
+ if (!F.hasFnAttribute("frame-pointer"))
+ return false;
+ StringRef FP = F.getFnAttribute("frame-pointer").getValueAsString();
+ if (FP == "all")
+ return true;
+ if (FP == "non-leaf")
+ return MF.getFrameInfo().hasCalls();
+ if (FP == "none")
+ return false;
+ llvm_unreachable("unknown frame pointer flag");
+}
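+
+// Illustrative IR (editorial sketch, not part of this file): a function whose
+// attribute group contains
+//   attributes #0 = { "frame-pointer"="non-leaf" }
+// keeps its frame pointer only when MF.getFrameInfo().hasCalls() is true, per
+// the logic above; "all" always keeps it and "none" never does.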
+
+/// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
+/// that the rounding mode of the FPU can change from its default.
+bool TargetOptions::HonorSignDependentRoundingFPMath() const {
+ return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
+}
+
+/// NOTE: Some targets still do not support producing debug entry values; that
+/// is controlled by SupportsDebugEntryValues. In addition, the SCE debugger
+/// does not have the feature implemented, so prefer not to emit debug entry
+/// values in that case. EnableDebugEntryValues can be used for testing
+/// purposes.
+bool TargetOptions::ShouldEmitDebugEntryValues() const {
+ return (SupportsDebugEntryValues && DebuggerTuning != DebuggerKind::SCE) ||
+ EnableDebugEntryValues;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetPassConfig.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetPassConfig.cpp
new file mode 100644
index 000000000000..98ea2f21b3c8
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -0,0 +1,1569 @@
+//===- TargetPassConfig.cpp - Target independent code generation passes ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines interfaces to access the target independent code
+// generation passes provided by the LLVM backend.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/CodeGen/BasicBlockSectionsProfileReader.h"
+#include "llvm/CodeGen/CSEConfigBase.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PassInstrumentation.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Discriminator.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Target/CGPassBuilderOption.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include <cassert>
+#include <optional>
+#include <string>
+
+using namespace llvm;
+
+static cl::opt<bool>
+ EnableIPRA("enable-ipra", cl::init(false), cl::Hidden,
+ cl::desc("Enable interprocedural register allocation "
+ "to reduce load/store at procedure calls."));
+static cl::opt<bool> DisablePostRASched("disable-post-ra", cl::Hidden,
+ cl::desc("Disable Post Regalloc Scheduler"));
+static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
+ cl::desc("Disable branch folding"));
+static cl::opt<bool> DisableTailDuplicate("disable-tail-duplicate", cl::Hidden,
+ cl::desc("Disable tail duplication"));
+static cl::opt<bool> DisableEarlyTailDup("disable-early-taildup", cl::Hidden,
+ cl::desc("Disable pre-register allocation tail duplication"));
+static cl::opt<bool> DisableBlockPlacement("disable-block-placement",
+ cl::Hidden, cl::desc("Disable probability-driven block placement"));
+static cl::opt<bool> EnableBlockPlacementStats("enable-block-placement-stats",
+ cl::Hidden, cl::desc("Collect probability-driven block placement stats"));
+static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
+ cl::desc("Disable Stack Slot Coloring"));
+static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden,
+ cl::desc("Disable Machine Dead Code Elimination"));
+static cl::opt<bool> DisableEarlyIfConversion("disable-early-ifcvt", cl::Hidden,
+ cl::desc("Disable Early If-conversion"));
+static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
+ cl::desc("Disable Machine LICM"));
+static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden,
+ cl::desc("Disable Machine Common Subexpression Elimination"));
+static cl::opt<cl::boolOrDefault> OptimizeRegAlloc(
+ "optimize-regalloc", cl::Hidden,
+ cl::desc("Enable optimized register allocation compilation path."));
+static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
+ cl::Hidden,
+ cl::desc("Disable Machine LICM"));
+static cl::opt<bool> DisableMachineSink("disable-machine-sink", cl::Hidden,
+ cl::desc("Disable Machine Sinking"));
+static cl::opt<bool> DisablePostRAMachineSink("disable-postra-machine-sink",
+ cl::Hidden,
+ cl::desc("Disable PostRA Machine Sinking"));
+static cl::opt<bool> DisableLSR("disable-lsr", cl::Hidden,
+ cl::desc("Disable Loop Strength Reduction Pass"));
+static cl::opt<bool> DisableConstantHoisting("disable-constant-hoisting",
+ cl::Hidden, cl::desc("Disable ConstantHoisting"));
+static cl::opt<bool> DisableCGP("disable-cgp", cl::Hidden,
+ cl::desc("Disable Codegen Prepare"));
+static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden,
+ cl::desc("Disable Copy Propagation pass"));
+static cl::opt<bool> DisablePartialLibcallInlining("disable-partial-libcall-inlining",
+ cl::Hidden, cl::desc("Disable Partial Libcall Inlining"));
+static cl::opt<bool> DisableAtExitBasedGlobalDtorLowering(
+ "disable-atexit-based-global-dtor-lowering", cl::Hidden,
+ cl::desc("For MachO, disable atexit()-based global destructor lowering"));
+static cl::opt<bool> EnableImplicitNullChecks(
+ "enable-implicit-null-checks",
+ cl::desc("Fold null checks into faulting memory operations"),
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableMergeICmps("disable-mergeicmps",
+ cl::desc("Disable MergeICmps Pass"),
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
+ cl::desc("Print LLVM IR produced by the loop-reduce pass"));
+static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
+ cl::desc("Print LLVM IR input to isel pass"));
+static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
+ cl::desc("Dump garbage collector data"));
+static cl::opt<cl::boolOrDefault>
+ VerifyMachineCode("verify-machineinstrs", cl::Hidden,
+ cl::desc("Verify generated machine code"));
+static cl::opt<cl::boolOrDefault>
+ DebugifyAndStripAll("debugify-and-strip-all-safe", cl::Hidden,
+ cl::desc("Debugify MIR before and Strip debug after "
+ "each pass except those known to be unsafe "
+ "when debug info is present"));
+static cl::opt<cl::boolOrDefault> DebugifyCheckAndStripAll(
+ "debugify-check-and-strip-all-safe", cl::Hidden,
+ cl::desc(
+ "Debugify MIR before, by checking and stripping the debug info after, "
+ "each pass except those known to be unsafe when debug info is "
+ "present"));
+// Enable or disable the MachineOutliner.
+static cl::opt<RunOutliner> EnableMachineOutliner(
+ "enable-machine-outliner", cl::desc("Enable the machine outliner"),
+ cl::Hidden, cl::ValueOptional, cl::init(RunOutliner::TargetDefault),
+ cl::values(clEnumValN(RunOutliner::AlwaysOutline, "always",
+ "Run on all functions guaranteed to be beneficial"),
+ clEnumValN(RunOutliner::NeverOutline, "never",
+ "Disable all outlining"),
+ // Sentinel value for unspecified option.
+ clEnumValN(RunOutliner::AlwaysOutline, "", "")));
+// Disable the pass that fixes up unwind information. Whether the pass is
+// included in the pipeline is controlled via the target options; this option
+// serves as a manual override.
+static cl::opt<bool> DisableCFIFixup("disable-cfi-fixup", cl::Hidden,
+ cl::desc("Disable the CFI fixup pass"));
+// Enable or disable FastISel. Both options are needed, because
+// FastISel is enabled by default with -fast, and we wish to be
+// able to enable or disable fast-isel independently from -O0.
+static cl::opt<cl::boolOrDefault>
+EnableFastISelOption("fast-isel", cl::Hidden,
+ cl::desc("Enable the \"fast\" instruction selector"));
+
+static cl::opt<cl::boolOrDefault> EnableGlobalISelOption(
+ "global-isel", cl::Hidden,
+ cl::desc("Enable the \"global\" instruction selector"));
+
+// FIXME: remove this after switching to NPM or GlobalISel, whichever gets there
+// first...
+static cl::opt<bool>
+ PrintAfterISel("print-after-isel", cl::init(false), cl::Hidden,
+ cl::desc("Print machine instrs after ISel"));
+
+static cl::opt<GlobalISelAbortMode> EnableGlobalISelAbort(
+ "global-isel-abort", cl::Hidden,
+ cl::desc("Enable abort calls when \"global\" instruction selection "
+ "fails to lower/select an instruction"),
+ cl::values(
+ clEnumValN(GlobalISelAbortMode::Disable, "0", "Disable the abort"),
+ clEnumValN(GlobalISelAbortMode::Enable, "1", "Enable the abort"),
+ clEnumValN(GlobalISelAbortMode::DisableWithDiag, "2",
+ "Disable the abort but emit a diagnostic on failure")));
+
+// Disable MIRProfileLoader before RegAlloc. This is for debugging and tuning
+// purposes.
+static cl::opt<bool> DisableRAFSProfileLoader(
+ "disable-ra-fsprofile-loader", cl::init(false), cl::Hidden,
+ cl::desc("Disable MIRProfileLoader before RegAlloc"));
+// Disable MIRProfileLoader before BlockPlacement. This is for debugging and
+// tuning purposes.
+static cl::opt<bool> DisableLayoutFSProfileLoader(
+ "disable-layout-fsprofile-loader", cl::init(false), cl::Hidden,
+ cl::desc("Disable MIRProfileLoader before BlockPlacement"));
+// Specify FSProfile file name.
+static cl::opt<std::string>
+ FSProfileFile("fs-profile-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Flow Sensitive profile file name."), cl::Hidden);
+// Specify Remapping file for FSProfile.
+static cl::opt<std::string> FSRemappingFile(
+ "fs-remapping-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Flow Sensitive profile remapping file name."), cl::Hidden);
+
+// Temporary option to allow experimenting with MachineScheduler as a post-RA
+// scheduler. Targets can "properly" enable this with
+// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID).
+// Targets can return true in targetSchedulesPostRAScheduling() and insert a
+// PostRA scheduling pass wherever they want.
+static cl::opt<bool> MISchedPostRA(
+ "misched-postra", cl::Hidden,
+ cl::desc(
+ "Run MachineScheduler post regalloc (independent of preRA sched)"));
+
+// Experimental option to run live interval analysis early.
+static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
+ cl::desc("Run live interval analysis earlier in the pipeline"));
+
+/// Option names for limiting the codegen pipeline.
+/// These are used in error reporting, and we did not want to duplicate their
+/// names all over the place.
+static const char StartAfterOptName[] = "start-after";
+static const char StartBeforeOptName[] = "start-before";
+static const char StopAfterOptName[] = "stop-after";
+static const char StopBeforeOptName[] = "stop-before";
+
+static cl::opt<std::string>
+ StartAfterOpt(StringRef(StartAfterOptName),
+ cl::desc("Resume compilation after a specific pass"),
+ cl::value_desc("pass-name"), cl::init(""), cl::Hidden);
+
+static cl::opt<std::string>
+ StartBeforeOpt(StringRef(StartBeforeOptName),
+ cl::desc("Resume compilation before a specific pass"),
+ cl::value_desc("pass-name"), cl::init(""), cl::Hidden);
+
+static cl::opt<std::string>
+ StopAfterOpt(StringRef(StopAfterOptName),
+ cl::desc("Stop compilation after a specific pass"),
+ cl::value_desc("pass-name"), cl::init(""), cl::Hidden);
+
+static cl::opt<std::string>
+ StopBeforeOpt(StringRef(StopBeforeOptName),
+ cl::desc("Stop compilation before a specific pass"),
+ cl::value_desc("pass-name"), cl::init(""), cl::Hidden);
+
+/// Enable the machine function splitter pass.
+static cl::opt<bool> EnableMachineFunctionSplitter(
+ "enable-split-machine-functions", cl::Hidden,
+ cl::desc("Split out cold blocks from machine functions based on profile "
+ "information."));
+
+/// Disable the expand reductions pass for testing.
+static cl::opt<bool> DisableExpandReductions(
+ "disable-expand-reductions", cl::init(false), cl::Hidden,
+ cl::desc("Disable the expand reduction intrinsics pass from running"));
+
+/// Disable the select optimization pass.
+static cl::opt<bool> DisableSelectOptimize(
+ "disable-select-optimize", cl::init(true), cl::Hidden,
+ cl::desc("Disable the select-optimization pass from running"));
+
+/// Allow standard passes to be disabled by command line options. This supports
+/// simple binary flags that either suppress the pass or do nothing.
+/// i.e. -disable-mypass=false has no effect.
+/// These should be converted to boolOrDefault in order to use applyOverride.
+static IdentifyingPassPtr applyDisable(IdentifyingPassPtr PassID,
+ bool Override) {
+ if (Override)
+ return IdentifyingPassPtr();
+ return PassID;
+}
+
+/// Allow standard passes to be disabled by the command line, regardless of who
+/// is adding the pass.
+///
+/// StandardID is the pass identified in the standard pass pipeline and provided
+/// to addPass(). It may be a target-specific ID if the target directly adds its
+/// own pass, in which case we harmlessly fall through.
+///
+/// TargetID is the pass that the target has configured to override StandardID.
+///
+/// StandardID may be a pseudo ID. In that case TargetID is the name of the real
+/// pass to run. This allows multiple options to control a single pass depending
+/// on where in the pipeline that pass is added.
+static IdentifyingPassPtr overridePass(AnalysisID StandardID,
+ IdentifyingPassPtr TargetID) {
+ if (StandardID == &PostRASchedulerID)
+ return applyDisable(TargetID, DisablePostRASched);
+
+ if (StandardID == &BranchFolderPassID)
+ return applyDisable(TargetID, DisableBranchFold);
+
+ if (StandardID == &TailDuplicateID)
+ return applyDisable(TargetID, DisableTailDuplicate);
+
+ if (StandardID == &EarlyTailDuplicateID)
+ return applyDisable(TargetID, DisableEarlyTailDup);
+
+ if (StandardID == &MachineBlockPlacementID)
+ return applyDisable(TargetID, DisableBlockPlacement);
+
+ if (StandardID == &StackSlotColoringID)
+ return applyDisable(TargetID, DisableSSC);
+
+ if (StandardID == &DeadMachineInstructionElimID)
+ return applyDisable(TargetID, DisableMachineDCE);
+
+ if (StandardID == &EarlyIfConverterID)
+ return applyDisable(TargetID, DisableEarlyIfConversion);
+
+ if (StandardID == &EarlyMachineLICMID)
+ return applyDisable(TargetID, DisableMachineLICM);
+
+ if (StandardID == &MachineCSEID)
+ return applyDisable(TargetID, DisableMachineCSE);
+
+ if (StandardID == &MachineLICMID)
+ return applyDisable(TargetID, DisablePostRAMachineLICM);
+
+ if (StandardID == &MachineSinkingID)
+ return applyDisable(TargetID, DisableMachineSink);
+
+ if (StandardID == &PostRAMachineSinkingID)
+ return applyDisable(TargetID, DisablePostRAMachineSink);
+
+ if (StandardID == &MachineCopyPropagationID)
+ return applyDisable(TargetID, DisableCopyProp);
+
+ return TargetID;
+}
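+
+// Editorial example (a sketch, not upstream text): passing -disable-machine-dce
+// makes overridePass() return an invalid IdentifyingPassPtr for
+// DeadMachineInstructionElimID, so the addPass(AnalysisID) overload later in
+// this file simply skips scheduling that pass.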
+
+// Find the FSProfile file name. The internal option takes precedence over the
+// value obtained from the TargetMachine.
+static std::string getFSProfileFile(const TargetMachine *TM) {
+ if (!FSProfileFile.empty())
+ return FSProfileFile.getValue();
+ const std::optional<PGOOptions> &PGOOpt = TM->getPGOOption();
+ if (PGOOpt == std::nullopt || PGOOpt->Action != PGOOptions::SampleUse)
+ return std::string();
+ return PGOOpt->ProfileFile;
+}
+
+// Find the profile remapping file name. The internal option takes precedence
+// over the value obtained from the TargetMachine.
+static std::string getFSRemappingFile(const TargetMachine *TM) {
+ if (!FSRemappingFile.empty())
+ return FSRemappingFile.getValue();
+ const std::optional<PGOOptions> &PGOOpt = TM->getPGOOption();
+ if (PGOOpt == std::nullopt || PGOOpt->Action != PGOOptions::SampleUse)
+ return std::string();
+ return PGOOpt->ProfileRemappingFile;
+}
+
+//===---------------------------------------------------------------------===//
+/// TargetPassConfig
+//===---------------------------------------------------------------------===//
+
+INITIALIZE_PASS(TargetPassConfig, "targetpassconfig",
+ "Target Pass Configuration", false, false)
+char TargetPassConfig::ID = 0;
+
+namespace {
+
+struct InsertedPass {
+ AnalysisID TargetPassID;
+ IdentifyingPassPtr InsertedPassID;
+
+ InsertedPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID)
+ : TargetPassID(TargetPassID), InsertedPassID(InsertedPassID) {}
+
+ Pass *getInsertedPass() const {
+ assert(InsertedPassID.isValid() && "Illegal Pass ID!");
+ if (InsertedPassID.isInstance())
+ return InsertedPassID.getInstance();
+ Pass *NP = Pass::createPass(InsertedPassID.getID());
+ assert(NP && "Pass ID not registered");
+ return NP;
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+extern cl::opt<bool> EnableFSDiscriminator;
+
+class PassConfigImpl {
+public:
+ // List of passes explicitly substituted by this target. Normally this is
+ // empty, but it is a convenient way to suppress or replace specific passes
+ // that are part of a standard pass pipeline without overriding the entire
+ // pipeline. This mechanism allows target options to inherit a standard pass's
+ // user interface. For example, a target may disable a standard pass by
+ // default by substituting a pass ID of zero, and the user may still enable
+ // that standard pass with an explicit command line option.
+ DenseMap<AnalysisID,IdentifyingPassPtr> TargetPasses;
+
+ /// Store the pairs of <AnalysisID, AnalysisID> of which the second pass
+ /// is inserted after each instance of the first one.
+ SmallVector<InsertedPass, 4> InsertedPasses;
+};
+
+} // end namespace llvm
+
+// Out of line virtual method.
+TargetPassConfig::~TargetPassConfig() {
+ delete Impl;
+}
+
+static const PassInfo *getPassInfo(StringRef PassName) {
+ if (PassName.empty())
+ return nullptr;
+
+ const PassRegistry &PR = *PassRegistry::getPassRegistry();
+ const PassInfo *PI = PR.getPassInfo(PassName);
+ if (!PI)
+ report_fatal_error(Twine('\"') + Twine(PassName) +
+ Twine("\" pass is not registered."));
+ return PI;
+}
+
+static AnalysisID getPassIDFromName(StringRef PassName) {
+ const PassInfo *PI = getPassInfo(PassName);
+ return PI ? PI->getTypeInfo() : nullptr;
+}
+
+static std::pair<StringRef, unsigned>
+getPassNameAndInstanceNum(StringRef PassName) {
+ StringRef Name, InstanceNumStr;
+ std::tie(Name, InstanceNumStr) = PassName.split(',');
+
+ unsigned InstanceNum = 0;
+ if (!InstanceNumStr.empty() && InstanceNumStr.getAsInteger(10, InstanceNum))
+ report_fatal_error("invalid pass instance specifier " + PassName);
+
+ return std::make_pair(Name, InstanceNum);
+}
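+
+// Editorial note (a sketch based on the parsing above; the pass name is only an
+// assumed example): the start/stop options accept an optional instance number
+// after a comma, e.g. -stop-after=dead-mi-elimination,1 stops after the second
+// run of that pass, while a bare pass name defaults to instance 0, the first
+// occurrence.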
+
+void TargetPassConfig::setStartStopPasses() {
+ StringRef StartBeforeName;
+ std::tie(StartBeforeName, StartBeforeInstanceNum) =
+ getPassNameAndInstanceNum(StartBeforeOpt);
+
+ StringRef StartAfterName;
+ std::tie(StartAfterName, StartAfterInstanceNum) =
+ getPassNameAndInstanceNum(StartAfterOpt);
+
+ StringRef StopBeforeName;
+ std::tie(StopBeforeName, StopBeforeInstanceNum)
+ = getPassNameAndInstanceNum(StopBeforeOpt);
+
+ StringRef StopAfterName;
+ std::tie(StopAfterName, StopAfterInstanceNum)
+ = getPassNameAndInstanceNum(StopAfterOpt);
+
+ StartBefore = getPassIDFromName(StartBeforeName);
+ StartAfter = getPassIDFromName(StartAfterName);
+ StopBefore = getPassIDFromName(StopBeforeName);
+ StopAfter = getPassIDFromName(StopAfterName);
+ if (StartBefore && StartAfter)
+ report_fatal_error(Twine(StartBeforeOptName) + Twine(" and ") +
+ Twine(StartAfterOptName) + Twine(" specified!"));
+ if (StopBefore && StopAfter)
+ report_fatal_error(Twine(StopBeforeOptName) + Twine(" and ") +
+ Twine(StopAfterOptName) + Twine(" specified!"));
+ Started = (StartAfter == nullptr) && (StartBefore == nullptr);
+}
+
+CGPassBuilderOption llvm::getCGPassBuilderOption() {
+ CGPassBuilderOption Opt;
+
+#define SET_OPTION(Option) \
+ if (Option.getNumOccurrences()) \
+ Opt.Option = Option;
+
+ SET_OPTION(EnableFastISelOption)
+ SET_OPTION(EnableGlobalISelAbort)
+ SET_OPTION(EnableGlobalISelOption)
+ SET_OPTION(EnableIPRA)
+ SET_OPTION(OptimizeRegAlloc)
+ SET_OPTION(VerifyMachineCode)
+
+#define SET_BOOLEAN_OPTION(Option) Opt.Option = Option;
+
+ SET_BOOLEAN_OPTION(EarlyLiveIntervals)
+ SET_BOOLEAN_OPTION(EnableBlockPlacementStats)
+ SET_BOOLEAN_OPTION(EnableImplicitNullChecks)
+ SET_BOOLEAN_OPTION(EnableMachineOutliner)
+ SET_BOOLEAN_OPTION(MISchedPostRA)
+ SET_BOOLEAN_OPTION(DisableMergeICmps)
+ SET_BOOLEAN_OPTION(DisableLSR)
+ SET_BOOLEAN_OPTION(DisableConstantHoisting)
+ SET_BOOLEAN_OPTION(DisableCGP)
+ SET_BOOLEAN_OPTION(DisablePartialLibcallInlining)
+ SET_BOOLEAN_OPTION(DisableSelectOptimize)
+ SET_BOOLEAN_OPTION(PrintLSR)
+ SET_BOOLEAN_OPTION(PrintISelInput)
+ SET_BOOLEAN_OPTION(PrintGCInfo)
+
+ return Opt;
+}
+
+static void registerPartialPipelineCallback(PassInstrumentationCallbacks &PIC,
+ LLVMTargetMachine &LLVMTM) {
+ StringRef StartBefore;
+ StringRef StartAfter;
+ StringRef StopBefore;
+ StringRef StopAfter;
+
+ unsigned StartBeforeInstanceNum = 0;
+ unsigned StartAfterInstanceNum = 0;
+ unsigned StopBeforeInstanceNum = 0;
+ unsigned StopAfterInstanceNum = 0;
+
+ std::tie(StartBefore, StartBeforeInstanceNum) =
+ getPassNameAndInstanceNum(StartBeforeOpt);
+ std::tie(StartAfter, StartAfterInstanceNum) =
+ getPassNameAndInstanceNum(StartAfterOpt);
+ std::tie(StopBefore, StopBeforeInstanceNum) =
+ getPassNameAndInstanceNum(StopBeforeOpt);
+ std::tie(StopAfter, StopAfterInstanceNum) =
+ getPassNameAndInstanceNum(StopAfterOpt);
+
+ if (StartBefore.empty() && StartAfter.empty() && StopBefore.empty() &&
+ StopAfter.empty())
+ return;
+
+ std::tie(StartBefore, std::ignore) =
+ LLVMTM.getPassNameFromLegacyName(StartBefore);
+ std::tie(StartAfter, std::ignore) =
+ LLVMTM.getPassNameFromLegacyName(StartAfter);
+ std::tie(StopBefore, std::ignore) =
+ LLVMTM.getPassNameFromLegacyName(StopBefore);
+ std::tie(StopAfter, std::ignore) =
+ LLVMTM.getPassNameFromLegacyName(StopAfter);
+ if (!StartBefore.empty() && !StartAfter.empty())
+ report_fatal_error(Twine(StartBeforeOptName) + Twine(" and ") +
+ Twine(StartAfterOptName) + Twine(" specified!"));
+ if (!StopBefore.empty() && !StopAfter.empty())
+ report_fatal_error(Twine(StopBeforeOptName) + Twine(" and ") +
+ Twine(StopAfterOptName) + Twine(" specified!"));
+
+ PIC.registerShouldRunOptionalPassCallback(
+ [=, EnableCurrent = StartBefore.empty() && StartAfter.empty(),
+ EnableNext = std::optional<bool>(), StartBeforeCount = 0u,
+ StartAfterCount = 0u, StopBeforeCount = 0u,
+ StopAfterCount = 0u](StringRef P, Any) mutable {
+ bool StartBeforePass = !StartBefore.empty() && P.contains(StartBefore);
+ bool StartAfterPass = !StartAfter.empty() && P.contains(StartAfter);
+ bool StopBeforePass = !StopBefore.empty() && P.contains(StopBefore);
+ bool StopAfterPass = !StopAfter.empty() && P.contains(StopAfter);
+
+ // Implement -start-after/-stop-after
+ if (EnableNext) {
+ EnableCurrent = *EnableNext;
+ EnableNext.reset();
+ }
+
+ // Using PIC.registerAfterPassCallback won't work because if this
+ // callback returns false, AfterPassCallback is also skipped.
+ if (StartAfterPass && StartAfterCount++ == StartAfterInstanceNum) {
+ assert(!EnableNext && "Error: assign to EnableNext more than once");
+ EnableNext = true;
+ }
+ if (StopAfterPass && StopAfterCount++ == StopAfterInstanceNum) {
+ assert(!EnableNext && "Error: assign to EnableNext more than once");
+ EnableNext = false;
+ }
+
+ if (StartBeforePass && StartBeforeCount++ == StartBeforeInstanceNum)
+ EnableCurrent = true;
+ if (StopBeforePass && StopBeforeCount++ == StopBeforeInstanceNum)
+ EnableCurrent = false;
+ return EnableCurrent;
+ });
+}
+
+void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC,
+ LLVMTargetMachine &LLVMTM) {
+
+ // Register a callback for disabling passes.
+ PIC.registerShouldRunOptionalPassCallback([](StringRef P, Any) {
+
+#define DISABLE_PASS(Option, Name) \
+ if (Option && P.contains(#Name)) \
+ return false;
+ DISABLE_PASS(DisableBlockPlacement, MachineBlockPlacementPass)
+ DISABLE_PASS(DisableBranchFold, BranchFolderPass)
+ DISABLE_PASS(DisableCopyProp, MachineCopyPropagationPass)
+ DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterPass)
+ DISABLE_PASS(DisableEarlyTailDup, EarlyTailDuplicatePass)
+ DISABLE_PASS(DisableMachineCSE, MachineCSEPass)
+ DISABLE_PASS(DisableMachineDCE, DeadMachineInstructionElimPass)
+ DISABLE_PASS(DisableMachineLICM, EarlyMachineLICMPass)
+ DISABLE_PASS(DisableMachineSink, MachineSinkingPass)
+ DISABLE_PASS(DisablePostRAMachineLICM, MachineLICMPass)
+ DISABLE_PASS(DisablePostRAMachineSink, PostRAMachineSinkingPass)
+ DISABLE_PASS(DisablePostRASched, PostRASchedulerPass)
+ DISABLE_PASS(DisableSSC, StackSlotColoringPass)
+ DISABLE_PASS(DisableTailDuplicate, TailDuplicatePass)
+
+ return true;
+ });
+
+ registerPartialPipelineCallback(PIC, LLVMTM);
+}
+
+// Out of line constructor provides default values for pass options and
+// registers all common codegen passes.
+TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm)
+ : ImmutablePass(ID), PM(&pm), TM(&TM) {
+ Impl = new PassConfigImpl();
+
+ // Register all target independent codegen passes to activate their PassIDs,
+ // including this pass itself.
+ initializeCodeGen(*PassRegistry::getPassRegistry());
+
+ // Also register alias analysis passes required by codegen passes.
+ initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry());
+ initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry());
+
+ if (EnableIPRA.getNumOccurrences())
+ TM.Options.EnableIPRA = EnableIPRA;
+ else {
+ // If not explicitly specified, use target default.
+ TM.Options.EnableIPRA |= TM.useIPRA();
+ }
+
+ if (TM.Options.EnableIPRA)
+ setRequiresCodeGenSCCOrder();
+
+ if (EnableGlobalISelAbort.getNumOccurrences())
+ TM.Options.GlobalISelAbort = EnableGlobalISelAbort;
+
+ setStartStopPasses();
+}
+
+CodeGenOpt::Level TargetPassConfig::getOptLevel() const {
+ return TM->getOptLevel();
+}
+
+/// Insert InsertedPassID pass after TargetPassID.
+void TargetPassConfig::insertPass(AnalysisID TargetPassID,
+ IdentifyingPassPtr InsertedPassID) {
+ assert(((!InsertedPassID.isInstance() &&
+ TargetPassID != InsertedPassID.getID()) ||
+ (InsertedPassID.isInstance() &&
+ TargetPassID != InsertedPassID.getInstance()->getPassID())) &&
+ "Insert a pass after itself!");
+ Impl->InsertedPasses.emplace_back(TargetPassID, InsertedPassID);
+}
+
+/// createPassConfig - Create a pass configuration object to be used by
+/// addPassToEmitX methods for generating a pipeline of CodeGen passes.
+///
+/// Targets may override this to extend TargetPassConfig.
+TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new TargetPassConfig(*this, PM);
+}
+
+TargetPassConfig::TargetPassConfig()
+ : ImmutablePass(ID) {
+ report_fatal_error("Trying to construct TargetPassConfig without a target "
+ "machine. Scheduling a CodeGen pass without a target "
+ "triple set?");
+}
+
+bool TargetPassConfig::willCompleteCodeGenPipeline() {
+ return StopBeforeOpt.empty() && StopAfterOpt.empty();
+}
+
+bool TargetPassConfig::hasLimitedCodeGenPipeline() {
+ return !StartBeforeOpt.empty() || !StartAfterOpt.empty() ||
+ !willCompleteCodeGenPipeline();
+}
+
+std::string
+TargetPassConfig::getLimitedCodeGenPipelineReason(const char *Separator) {
+ if (!hasLimitedCodeGenPipeline())
+ return std::string();
+ std::string Res;
+ static cl::opt<std::string> *PassNames[] = {&StartAfterOpt, &StartBeforeOpt,
+ &StopAfterOpt, &StopBeforeOpt};
+ static const char *OptNames[] = {StartAfterOptName, StartBeforeOptName,
+ StopAfterOptName, StopBeforeOptName};
+ bool IsFirst = true;
+ for (int Idx = 0; Idx < 4; ++Idx)
+ if (!PassNames[Idx]->empty()) {
+ if (!IsFirst)
+ Res += Separator;
+ IsFirst = false;
+ Res += OptNames[Idx];
+ }
+ return Res;
+}
+
+// Helper to verify the analysis is really immutable.
+void TargetPassConfig::setOpt(bool &Opt, bool Val) {
+ assert(!Initialized && "PassConfig is immutable");
+ Opt = Val;
+}
+
+void TargetPassConfig::substitutePass(AnalysisID StandardID,
+ IdentifyingPassPtr TargetID) {
+ Impl->TargetPasses[StandardID] = TargetID;
+}
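+
+// Editorial example (hedged, not upstream text): a target's TargetPassConfig
+// subclass may replace the default post-RA scheduler with the machine
+// scheduler via
+//   substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+// which is the pattern the MISchedPostRA option comment above refers to.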
+
+IdentifyingPassPtr TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
+ DenseMap<AnalysisID, IdentifyingPassPtr>::const_iterator
+ I = Impl->TargetPasses.find(ID);
+ if (I == Impl->TargetPasses.end())
+ return ID;
+ return I->second;
+}
+
+bool TargetPassConfig::isPassSubstitutedOrOverridden(AnalysisID ID) const {
+ IdentifyingPassPtr TargetID = getPassSubstitution(ID);
+ IdentifyingPassPtr FinalPtr = overridePass(ID, TargetID);
+ return !FinalPtr.isValid() || FinalPtr.isInstance() ||
+ FinalPtr.getID() != ID;
+}
+
+/// Add a pass to the PassManager if that pass is supposed to be run. If the
+/// Started/Stopped flags indicate either that the compilation should start at
+/// a later pass or that it should stop after an earlier pass, then do not add
+/// the pass. Finally, compare the current pass against the StartAfter
+/// and StopAfter options and change the Started/Stopped flags accordingly.
+void TargetPassConfig::addPass(Pass *P) {
+ assert(!Initialized && "PassConfig is immutable");
+
+ // Cache the Pass ID here in case the pass manager finds this pass is
+ // redundant with ones already scheduled / available, and deletes it.
+ // Fundamentally, once we add the pass to the manager, we no longer own it
+ // and shouldn't reference it.
+ AnalysisID PassID = P->getPassID();
+
+ if (StartBefore == PassID && StartBeforeCount++ == StartBeforeInstanceNum)
+ Started = true;
+ if (StopBefore == PassID && StopBeforeCount++ == StopBeforeInstanceNum)
+ Stopped = true;
+ if (Started && !Stopped) {
+ if (AddingMachinePasses) {
+ // Construct banner message before PM->add() as that may delete the pass.
+ std::string Banner =
+ std::string("After ") + std::string(P->getPassName());
+ addMachinePrePasses();
+ PM->add(P);
+ addMachinePostPasses(Banner);
+ } else {
+ PM->add(P);
+ }
+
+ // Add the passes after the pass P if there is any.
+ for (const auto &IP : Impl->InsertedPasses)
+ if (IP.TargetPassID == PassID)
+ addPass(IP.getInsertedPass());
+ } else {
+ delete P;
+ }
+
+ if (StopAfter == PassID && StopAfterCount++ == StopAfterInstanceNum)
+ Stopped = true;
+
+ if (StartAfter == PassID && StartAfterCount++ == StartAfterInstanceNum)
+ Started = true;
+ if (Stopped && !Started)
+ report_fatal_error("Cannot stop compilation after pass that is not run");
+}
+
+/// Add a CodeGen pass at this point in the pipeline after checking for target
+/// and command line overrides.
+///
+/// addPass cannot return a pointer to the pass instance because the pass is
+/// internal to the PassManager and the instance we create here may already be
+/// freed.
+AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
+ IdentifyingPassPtr TargetID = getPassSubstitution(PassID);
+ IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID);
+ if (!FinalPtr.isValid())
+ return nullptr;
+
+ Pass *P;
+ if (FinalPtr.isInstance())
+ P = FinalPtr.getInstance();
+ else {
+ P = Pass::createPass(FinalPtr.getID());
+ if (!P)
+ llvm_unreachable("Pass ID not registered");
+ }
+ AnalysisID FinalID = P->getPassID();
+ addPass(P); // Ends the lifetime of P.
+
+ return FinalID;
+}
+
+void TargetPassConfig::printAndVerify(const std::string &Banner) {
+ addPrintPass(Banner);
+ addVerifyPass(Banner);
+}
+
+void TargetPassConfig::addPrintPass(const std::string &Banner) {
+ if (PrintAfterISel)
+ PM->add(createMachineFunctionPrinterPass(dbgs(), Banner));
+}
+
+void TargetPassConfig::addVerifyPass(const std::string &Banner) {
+ bool Verify = VerifyMachineCode == cl::BOU_TRUE;
+#ifdef EXPENSIVE_CHECKS
+ if (VerifyMachineCode == cl::BOU_UNSET)
+ Verify = TM->isMachineVerifierClean();
+#endif
+ if (Verify)
+ PM->add(createMachineVerifierPass(Banner));
+}
+
+void TargetPassConfig::addDebugifyPass() {
+ PM->add(createDebugifyMachineModulePass());
+}
+
+void TargetPassConfig::addStripDebugPass() {
+ PM->add(createStripDebugMachineModulePass(/*OnlyDebugified=*/true));
+}
+
+void TargetPassConfig::addCheckDebugPass() {
+ PM->add(createCheckDebugMachineModulePass());
+}
+
+void TargetPassConfig::addMachinePrePasses(bool AllowDebugify) {
+ if (AllowDebugify && DebugifyIsSafe &&
+ (DebugifyAndStripAll == cl::BOU_TRUE ||
+ DebugifyCheckAndStripAll == cl::BOU_TRUE))
+ addDebugifyPass();
+}
+
+void TargetPassConfig::addMachinePostPasses(const std::string &Banner) {
+ if (DebugifyIsSafe) {
+ if (DebugifyCheckAndStripAll == cl::BOU_TRUE) {
+ addCheckDebugPass();
+ addStripDebugPass();
+ } else if (DebugifyAndStripAll == cl::BOU_TRUE)
+ addStripDebugPass();
+ }
+ addVerifyPass(Banner);
+}
+
+/// Add common target configurable passes that perform LLVM IR to IR transforms
+/// following machine independent optimization.
+void TargetPassConfig::addIRPasses() {
+ // Before running any passes, run the verifier to determine if the input
+ // coming from the front-end and/or optimizer is valid.
+ if (!DisableVerify)
+ addPass(createVerifierPass());
+
+ if (getOptLevel() != CodeGenOpt::None) {
+ // Basic AliasAnalysis support.
+ // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+ // BasicAliasAnalysis wins if they disagree. This is intended to help
+ // support "obvious" type-punning idioms.
+ addPass(createTypeBasedAAWrapperPass());
+ addPass(createScopedNoAliasAAWrapperPass());
+ addPass(createBasicAAWrapperPass());
+
+ // Run loop strength reduction before anything else.
+ if (!DisableLSR) {
+ addPass(createCanonicalizeFreezeInLoopsPass());
+ addPass(createLoopStrengthReducePass());
+ if (PrintLSR)
+ addPass(createPrintFunctionPass(dbgs(),
+ "\n\n*** Code after LSR ***\n"));
+ }
+
+ // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
+ // loads and compares. ExpandMemCmpPass then tries to expand those calls
+ // into optimally-sized loads and compares. The transforms are enabled by a
+ // target lowering hook.
+ if (!DisableMergeICmps)
+ addPass(createMergeICmpsLegacyPass());
+ addPass(createExpandMemCmpPass());
+ }
+
+ // Run GC lowering passes for builtin collectors
+ // TODO: add a pass insertion point here
+ addPass(&GCLoweringID);
+ addPass(&ShadowStackGCLoweringID);
+ addPass(createLowerConstantIntrinsicsPass());
+
+ // For MachO, lower @llvm.global_dtors into @llvm.global_ctors with
+ // __cxa_atexit() calls to avoid emitting the deprecated __mod_term_func.
+ if (TM->getTargetTriple().isOSBinFormatMachO() &&
+ !DisableAtExitBasedGlobalDtorLowering)
+ addPass(createLowerGlobalDtorsLegacyPass());
+
+ // Make sure that no unreachable blocks are instruction selected.
+ addPass(createUnreachableBlockEliminationPass());
+
+ // Prepare expensive constants for SelectionDAG.
+ if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting)
+ addPass(createConstantHoistingPass());
+
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createReplaceWithVeclibLegacyPass());
+
+ if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining)
+ addPass(createPartiallyInlineLibCallsPass());
+
+ // Expand vector predication intrinsics into standard IR instructions.
+ // This pass has to run before ScalarizeMaskedMemIntrin and ExpandReduction
+ // passes since it emits those kinds of intrinsics.
+ addPass(createExpandVectorPredicationPass());
+
+ // Add scalarization of the target's unsupported masked memory intrinsics.
+ // Each unsupported intrinsic is replaced with a chain of basic blocks that
+ // stores/loads elements one by one if the appropriate mask bit is set.
+ addPass(createScalarizeMaskedMemIntrinLegacyPass());
+
+ // Expand reduction intrinsics into shuffle sequences if the target wants to.
+ // Allow disabling it for testing purposes.
+ if (!DisableExpandReductions)
+ addPass(createExpandReductionsPass());
+
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createTLSVariableHoistPass());
+
+ // Convert conditional moves to conditional jumps when profitable.
+ if (getOptLevel() != CodeGenOpt::None && !DisableSelectOptimize)
+ addPass(createSelectOptimizePass());
+}
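The common IR schedule above is typically extended, not replaced, by a target: the override runs its own IR-level lowering and then delegates to the base implementation. A minimal sketch, assuming a hypothetical MyTargetPassConfig subclass and an invented createMyTargetIntrinsicLoweringPass() (the TargetPassConfig constructor signature is abbreviated to the common form):

    class MyTargetPassConfig : public TargetPassConfig {
    public:
      MyTargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
          : TargetPassConfig(TM, PM) {}

      void addIRPasses() override {
        // Hypothetical target-specific IR lowering, run before the common passes.
        addPass(createMyTargetIntrinsicLoweringPass());
        // Then the shared schedule defined above.
        TargetPassConfig::addIRPasses();
      }
    };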
+
+/// Turn exception handling constructs into something the code generators can
+/// handle.
+void TargetPassConfig::addPassesToHandleExceptions() {
+ const MCAsmInfo *MCAI = TM->getMCAsmInfo();
+ assert(MCAI && "No MCAsmInfo");
+ switch (MCAI->getExceptionHandlingType()) {
+ case ExceptionHandling::SjLj:
+ // SjLj piggy-backs on dwarf for this bit, so the cleanups done here apply to
+ // both. Dwarf EH prepare needs to be run after SjLj prepare. Otherwise,
+ // catch info can get misplaced when a selector ends up more than one block
+ // removed from the parent invoke(s). This could happen when a landing
+ // pad is shared by multiple invokes and is also a target of a normal
+ // edge from elsewhere.
+ addPass(createSjLjEHPreparePass(TM));
+ [[fallthrough]];
+ case ExceptionHandling::DwarfCFI:
+ case ExceptionHandling::ARM:
+ case ExceptionHandling::AIX:
+ addPass(createDwarfEHPass(getOptLevel()));
+ break;
+ case ExceptionHandling::WinEH:
+ // We support using both GCC-style and MSVC-style exceptions on Windows, so
+ // add both preparation passes. Each pass will only actually run if it
+ // recognizes the personality function.
+ addPass(createWinEHPass());
+ addPass(createDwarfEHPass(getOptLevel()));
+ break;
+ case ExceptionHandling::Wasm:
+ // Wasm EH uses Windows EH instructions, but it does not need to demote PHIs
+ // on catchpads and cleanuppads because it does not outline them into
+ // funclets. Catchswitch blocks are not lowered in SelectionDAG, so we
+ // should remove PHIs there.
+ addPass(createWinEHPass(/*DemoteCatchSwitchPHIOnly=*/false));
+ addPass(createWasmEHPass());
+ break;
+ case ExceptionHandling::None:
+ addPass(createLowerInvokePass());
+
+ // The lower invoke pass may create unreachable code. Remove it.
+ addPass(createUnreachableBlockEliminationPass());
+ break;
+ }
+}
+
+/// Add pass to prepare the LLVM IR for code generation. This should be done
+/// before exception handling preparation passes.
+void TargetPassConfig::addCodeGenPrepare() {
+ if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
+ addPass(createCodeGenPreparePass());
+}
+
+/// Add common passes that perform LLVM IR to IR transforms in preparation for
+/// instruction selection.
+void TargetPassConfig::addISelPrepare() {
+ addPreISel();
+
+ // Force codegen to run according to the callgraph.
+ if (requiresCodeGenSCCOrder())
+ addPass(new DummyCGSCCPass);
+
+ addPass(createCallBrPass());
+
+ // Add both the safe stack and the stack protection passes: each of them will
+ // only protect functions that have corresponding attributes.
+ addPass(createSafeStackPass());
+ addPass(createStackProtectorPass());
+
+ if (PrintISelInput)
+ addPass(createPrintFunctionPass(
+ dbgs(), "\n\n*** Final LLVM Code input to ISel ***\n"));
+
+ // All passes which modify the LLVM IR are now complete; run the verifier
+ // to ensure that the IR is valid.
+ if (!DisableVerify)
+ addPass(createVerifierPass());
+}
+
+bool TargetPassConfig::addCoreISelPasses() {
+ // Enable FastISel with -fast-isel, but allow that to be overridden.
+ TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
+
+ // Determine an instruction selector.
+ enum class SelectorType { SelectionDAG, FastISel, GlobalISel };
+ SelectorType Selector;
+
+ if (EnableFastISelOption == cl::BOU_TRUE)
+ Selector = SelectorType::FastISel;
+ else if (EnableGlobalISelOption == cl::BOU_TRUE ||
+ (TM->Options.EnableGlobalISel &&
+ EnableGlobalISelOption != cl::BOU_FALSE))
+ Selector = SelectorType::GlobalISel;
+ else if (TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel())
+ Selector = SelectorType::FastISel;
+ else
+ Selector = SelectorType::SelectionDAG;
+
+ // Set TM->Options.EnableFastISel and EnableGlobalISel consistently.
+ if (Selector == SelectorType::FastISel) {
+ TM->setFastISel(true);
+ TM->setGlobalISel(false);
+ } else if (Selector == SelectorType::GlobalISel) {
+ TM->setFastISel(false);
+ TM->setGlobalISel(true);
+ }
+
+ // FIXME: Injecting into the DAGISel pipeline seems to cause issues with
+ // analyses needing to be re-run. This can result in being unable to
+ // schedule passes (particularly with 'Function Alias Analysis
+ // Results'). It's not entirely clear why but AFAICT this seems to be
+ // due to one FunctionPassManager not being able to use analyses from a
+ // previous one. As we're injecting a ModulePass we break the usual
+ // pass manager into two. GlobalISel with the fallback path disabled
+ // and -run-pass seem to be unaffected. The majority of GlobalISel
+ // testing uses -run-pass so this probably isn't too bad.
+ SaveAndRestore SavedDebugifyIsSafe(DebugifyIsSafe);
+ if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled())
+ DebugifyIsSafe = false;
+
+ // Add instruction selector passes.
+ if (Selector == SelectorType::GlobalISel) {
+ SaveAndRestore SavedAddingMachinePasses(AddingMachinePasses, true);
+ if (addIRTranslator())
+ return true;
+
+ addPreLegalizeMachineIR();
+
+ if (addLegalizeMachineIR())
+ return true;
+
+ // Before running the register bank selector, ask the target if it
+ // wants to run some passes.
+ addPreRegBankSelect();
+
+ if (addRegBankSelect())
+ return true;
+
+ addPreGlobalInstructionSelect();
+
+ if (addGlobalInstructionSelect())
+ return true;
+
+ // Pass to reset the MachineFunction if the ISel failed.
+ addPass(createResetMachineFunctionPass(
+ reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled()));
+
+ // Provide a fallback path when we do not want to abort on
+ // not-yet-supported input.
+ if (!isGlobalISelAbortEnabled() && addInstSelector())
+ return true;
+
+ } else if (addInstSelector())
+ return true;
+
+ // Expand pseudo-instructions emitted by ISel. Don't run the verifier before
+ // FinalizeISel.
+ addPass(&FinalizeISelID);
+
+ // Print the instruction selected machine code...
+ printAndVerify("After Instruction Selection");
+
+ return false;
+}
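The addIRTranslator/addLegalizeMachineIR/addRegBankSelect/addGlobalInstructionSelect calls above are target hooks; a target that supports GlobalISel overrides them to schedule the four generic passes. A hedged sketch patterned on what in-tree targets typically do (MyTargetPassConfig is hypothetical; returning false from a hook means it succeeded):

    bool MyTargetPassConfig::addIRTranslator() {
      addPass(new IRTranslator(getOptLevel()));
      return false;
    }
    bool MyTargetPassConfig::addLegalizeMachineIR() {
      addPass(new Legalizer());
      return false;
    }
    bool MyTargetPassConfig::addRegBankSelect() {
      addPass(new RegBankSelect());
      return false;
    }
    bool MyTargetPassConfig::addGlobalInstructionSelect() {
      addPass(new InstructionSelect(getOptLevel()));
      return false;
    }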
+
+bool TargetPassConfig::addISelPasses() {
+ if (TM->useEmulatedTLS())
+ addPass(createLowerEmuTLSPass());
+
+ PM->add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+ addPass(createPreISelIntrinsicLoweringPass());
+ addPass(createExpandLargeDivRemPass());
+ addPass(createExpandLargeFpConvertPass());
+ addIRPasses();
+ addCodeGenPrepare();
+ addPassesToHandleExceptions();
+ addISelPrepare();
+
+ return addCoreISelPasses();
+}
+
+/// -regalloc=... command line option.
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<RegisterRegAlloc>>
+ RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use"));
+
+/// Add the complete set of target-independent postISel code generator passes.
+///
+/// This can be read as the standard order of major LLVM CodeGen stages. Stages
+/// with nontrivial configuration or multiple passes are broken out below in
+/// add%Stage routines.
+///
+/// Any TargetPassConfig::addXX routine may be overridden by the Target. The
+/// addPre/Post methods with empty header implementations allow injecting
+/// target-specific fixups just before or after major stages. Additionally,
+/// targets have the flexibility to change pass order within a stage by
+/// overriding default implementation of add%Stage routines below. Each
+/// technique has maintainability tradeoffs because alternate pass orders are
+/// not well supported. addPre/Post works better if the target pass is easily
+/// tied to a common pass. But if it has subtle dependencies on multiple passes,
+/// the target should override the stage instead.
+///
+/// TODO: We could use a single addPre/Post(ID) hook to allow pass injection
+/// before/after any target-independent pass. But it's currently overkill.
+void TargetPassConfig::addMachinePasses() {
+ AddingMachinePasses = true;
+
+ // Add passes that optimize machine instructions in SSA form.
+ if (getOptLevel() != CodeGenOpt::None) {
+ addMachineSSAOptimization();
+ } else {
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ addPass(&LocalStackSlotAllocationID);
+ }
+
+ if (TM->Options.EnableIPRA)
+ addPass(createRegUsageInfoPropPass());
+
+ // Run pre-ra passes.
+ addPreRegAlloc();
+
+ // Debugifying the register allocator passes seems to provoke some
+ // non-determinism that affects CodeGen, and there doesn't seem to be a point
+ // where it becomes safe again, so stop debugifying here.
+ DebugifyIsSafe = false;
+
+ // Add an FSDiscriminator pass right before RA so that we can get a
+ // more precise SampleFDO profile for RA.
+ if (EnableFSDiscriminator) {
+ addPass(createMIRAddFSDiscriminatorsPass(
+ sampleprof::FSDiscriminatorPass::Pass1));
+ const std::string ProfileFile = getFSProfileFile(TM);
+ if (!ProfileFile.empty() && !DisableRAFSProfileLoader)
+ addPass(createMIRProfileLoaderPass(ProfileFile, getFSRemappingFile(TM),
+ sampleprof::FSDiscriminatorPass::Pass1,
+ nullptr));
+ }
+
+ // Run register allocation and passes that are tightly coupled with it,
+ // including phi elimination and scheduling.
+ if (getOptimizeRegAlloc())
+ addOptimizedRegAlloc();
+ else
+ addFastRegAlloc();
+
+ // Run post-ra passes.
+ addPostRegAlloc();
+
+ addPass(&RemoveRedundantDebugValuesID);
+
+ addPass(&FixupStatepointCallerSavedID);
+
+ // Insert prolog/epilog code. Eliminate abstract frame index references...
+ if (getOptLevel() != CodeGenOpt::None) {
+ addPass(&PostRAMachineSinkingID);
+ addPass(&ShrinkWrapID);
+ }
+
+ // Prolog/Epilog inserter needs a TargetMachine to instantiate. But only
+ // do so if it hasn't been disabled, substituted, or overridden.
+ if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID))
+ addPass(createPrologEpilogInserterPass());
+
+ /// Add passes that optimize machine instructions after register allocation.
+ if (getOptLevel() != CodeGenOpt::None)
+ addMachineLateOptimization();
+
+ // Expand pseudo instructions before second scheduling pass.
+ addPass(&ExpandPostRAPseudosID);
+
+ // Run pre-sched2 passes.
+ addPreSched2();
+
+ if (EnableImplicitNullChecks)
+ addPass(&ImplicitNullChecksID);
+
+ // Second pass scheduler.
+ // Let the target optionally insert this pass by itself at some other
+ // point.
+ if (getOptLevel() != CodeGenOpt::None &&
+ !TM->targetSchedulesPostRAScheduling()) {
+ if (MISchedPostRA)
+ addPass(&PostMachineSchedulerID);
+ else
+ addPass(&PostRASchedulerID);
+ }
+
+ // GC
+ if (addGCPasses()) {
+ if (PrintGCInfo)
+ addPass(createGCInfoPrinter(dbgs()));
+ }
+
+ // Basic block placement.
+ if (getOptLevel() != CodeGenOpt::None)
+ addBlockPlacement();
+
+ // Insert before XRay Instrumentation.
+ addPass(&FEntryInserterID);
+
+ addPass(&XRayInstrumentationID);
+ addPass(&PatchableFunctionID);
+
+ addPreEmitPass();
+
+ if (TM->Options.EnableIPRA)
+ // Collect register usage information and produce a register mask of
+ // clobbered registers, to be used to optimize call sites.
+ addPass(createRegUsageInfoCollector());
+
+ // FIXME: Some backends are incompatible with running the verifier after
+ // addPreEmitPass. Maybe only pass "false" here for those targets?
+ addPass(&FuncletLayoutID);
+
+ addPass(&StackMapLivenessID);
+ addPass(&LiveDebugValuesID);
+ addPass(&MachineSanitizerBinaryMetadataID);
+
+ if (TM->Options.EnableMachineOutliner && getOptLevel() != CodeGenOpt::None &&
+ EnableMachineOutliner != RunOutliner::NeverOutline) {
+ bool RunOnAllFunctions =
+ (EnableMachineOutliner == RunOutliner::AlwaysOutline);
+ bool AddOutliner =
+ RunOnAllFunctions || TM->Options.SupportsDefaultOutlining;
+ if (AddOutliner)
+ addPass(createMachineOutlinerPass(RunOnAllFunctions));
+ }
+
+ if (EnableFSDiscriminator)
+ addPass(createMIRAddFSDiscriminatorsPass(
+ sampleprof::FSDiscriminatorPass::PassLast));
+
+ // The machine function splitter uses the basic block sections feature, so
+ // both cannot be enabled at the same time; basic block sections takes
+ // precedence.
+ // FIXME: In principle, BasicBlockSection::Labels and splitting can be used
+ // together. Update this check once we have addressed any issues.
+ if (TM->getBBSectionsType() != llvm::BasicBlockSection::None) {
+ if (TM->getBBSectionsType() == llvm::BasicBlockSection::List) {
+ addPass(llvm::createBasicBlockSectionsProfileReaderPass(
+ TM->getBBSectionsFuncListBuf()));
+ }
+ addPass(llvm::createBasicBlockSectionsPass());
+ } else if (TM->Options.EnableMachineFunctionSplitter ||
+ EnableMachineFunctionSplitter) {
+ const std::string ProfileFile = getFSProfileFile(TM);
+ if (!ProfileFile.empty()) {
+ if (EnableFSDiscriminator) {
+ addPass(createMIRProfileLoaderPass(
+ ProfileFile, getFSRemappingFile(TM),
+ sampleprof::FSDiscriminatorPass::PassLast, nullptr));
+ } else {
+ // A sample profile is given, but FSDiscriminator is not
+ // enabled; this may result in a performance regression.
+ WithColor::warning()
+ << "Using AutoFDO without FSDiscriminator for MFS may regress "
+ "performance.";
+ }
+ }
+ addPass(createMachineFunctionSplitterPass());
+ }
+
+ addPostBBSections();
+
+ if (!DisableCFIFixup && TM->Options.EnableCFIFixup)
+ addPass(createCFIFixup());
+
+ PM->add(createStackFrameLayoutAnalysisPass());
+
+ // Add passes that directly emit MI after all other MI passes.
+ addPreEmitPass2();
+
+ AddingMachinePasses = false;
+}
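As the comment block above addMachinePasses notes, the usual way for a target to inject a fixup is to override one of the empty addPre*/addPost* hooks rather than a whole stage. A minimal sketch under invented names (MyTargetPassConfig and createMyTargetBranchRelaxationPass are hypothetical):

    void MyTargetPassConfig::addPreEmitPass() {
      // Late machine-level fixup, scheduled just before the final emission passes.
      if (getOptLevel() != CodeGenOpt::None)
        addPass(createMyTargetBranchRelaxationPass()); // hypothetical target pass
    }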
+
+/// Add passes that optimize machine instructions in SSA form.
+void TargetPassConfig::addMachineSSAOptimization() {
+ // Pre-ra tail duplication.
+ addPass(&EarlyTailDuplicateID);
+
+ // Optimize PHIs before DCE: removing dead PHI cycles may make more
+ // instructions dead.
+ addPass(&OptimizePHIsID);
+
+ // This pass merges large allocas. StackSlotColoring is a different pass
+ // which merges spill slots.
+ addPass(&StackColoringID);
+
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ addPass(&LocalStackSlotAllocationID);
+
+ // With optimization, dead code should already be eliminated. However
+ // there is one known exception: lowered code for arguments that are only
+ // used by tail calls, where the tail calls reuse the incoming stack
+ // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+ addPass(&DeadMachineInstructionElimID);
+
+ // Allow targets to insert passes that improve instruction level parallelism,
+ // like if-conversion. Such passes will typically need dominator trees and
+ // loop info, just like LICM and CSE below.
+ addILPOpts();
+
+ addPass(&EarlyMachineLICMID);
+ addPass(&MachineCSEID);
+
+ addPass(&MachineSinkingID);
+
+ addPass(&PeepholeOptimizerID);
+ // Clean-up the dead code that may have been generated by peephole
+ // rewriting.
+ addPass(&DeadMachineInstructionElimID);
+}
+
+//===---------------------------------------------------------------------===//
+/// Register Allocation Pass Configuration
+//===---------------------------------------------------------------------===//
+
+bool TargetPassConfig::getOptimizeRegAlloc() const {
+ switch (OptimizeRegAlloc) {
+ case cl::BOU_UNSET: return getOptLevel() != CodeGenOpt::None;
+ case cl::BOU_TRUE: return true;
+ case cl::BOU_FALSE: return false;
+ }
+ llvm_unreachable("Invalid optimize-regalloc state");
+}
+
+/// A dummy default pass factory indicates whether the register allocator is
+/// overridden on the command line.
+static llvm::once_flag InitializeDefaultRegisterAllocatorFlag;
+
+static RegisterRegAlloc
+defaultRegAlloc("default",
+ "pick register allocator based on -O option",
+ useDefaultRegisterAllocator);
+
+static void initializeDefaultRegisterAllocatorOnce() {
+ if (!RegisterRegAlloc::getDefault())
+ RegisterRegAlloc::setDefault(RegAlloc);
+}
+
+/// Instantiate the default register allocator pass for this target for either
+/// the optimized or unoptimized allocation path. This will be added to the pass
+/// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc
+/// in the optimized case.
+///
+/// A target that uses the standard regalloc pass order for fast or optimized
+/// allocation may still override this for per-target regalloc
+/// selection. But -regalloc=... always takes precedence.
+FunctionPass *TargetPassConfig::createTargetRegisterAllocator(bool Optimized) {
+ if (Optimized)
+ return createGreedyRegisterAllocator();
+ else
+ return createFastRegisterAllocator();
+}
+
+/// Find and instantiate the register allocation pass requested by this target
+/// at the current optimization level. Different register allocators are
+/// defined as separate passes because they may require different analyses.
+///
+/// This helper ensures that the regalloc= option is always available,
+/// even for targets that override the default allocator.
+///
+/// FIXME: When MachinePassRegistry registers pass IDs instead of function ptrs,
+/// this can be folded into addPass.
+FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
+ // Initialize the global default.
+ llvm::call_once(InitializeDefaultRegisterAllocatorFlag,
+ initializeDefaultRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
+ if (Ctor != useDefaultRegisterAllocator)
+ return Ctor();
+
+ // With no -regalloc= override, ask the target for a regalloc pass.
+ return createTargetRegisterAllocator(Optimized);
+}
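The -regalloc= mechanism above is driven by the RegisterRegAlloc registry: the defaultRegAlloc entry registers the "default" name, and any additional allocator makes itself selectable the same way. A hedged sketch of such a registration; the "toy" allocator is invented and simply reuses the fast allocator as a stand-in:

    // Illustrative out-of-tree registration; not part of this file.
    static FunctionPass *createToyRegisterAllocator() {
      // Stand-in implementation for illustration only.
      return createFastRegisterAllocator();
    }

    static RegisterRegAlloc
        ToyRegAlloc("toy", "illustrative toy register allocator",
                    createToyRegisterAllocator);

With that in place, createRegAllocPass() would return the registered constructor whenever -regalloc=toy is given, instead of asking the target for its default.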
+
+bool TargetPassConfig::isCustomizedRegAlloc() {
+ return RegAlloc !=
+ (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator;
+}
+
+bool TargetPassConfig::addRegAssignAndRewriteFast() {
+ if (RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator &&
+ RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&createFastRegisterAllocator)
+ report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
+
+ addPass(createRegAllocPass(false));
+
+ // Allow targets to change the register assignments after
+ // fast register allocation.
+ addPostFastRegAllocRewrite();
+ return true;
+}
+
+bool TargetPassConfig::addRegAssignAndRewriteOptimized() {
+ // Add the selected register allocation pass.
+ addPass(createRegAllocPass(true));
+
+ // Allow targets to change the register assignments before rewriting.
+ addPreRewrite();
+
+ // Finally rewrite virtual registers.
+ addPass(&VirtRegRewriterID);
+
+ // Regalloc scoring for ML-driven eviction - noop except when learning a new
+ // eviction policy.
+ addPass(createRegAllocScoringPass());
+ return true;
+}
+
+/// Return true if the default global register allocator is in use and
+/// has not been overridden on the command line with '-regalloc=...'.
+bool TargetPassConfig::usingDefaultRegAlloc() const {
+ return RegAlloc.getNumOccurrences() == 0;
+}
+
+/// Add the minimum set of target-independent passes that are required for
+/// register allocation. No coalescing or scheduling.
+void TargetPassConfig::addFastRegAlloc() {
+ addPass(&PHIEliminationID);
+ addPass(&TwoAddressInstructionPassID);
+
+ addRegAssignAndRewriteFast();
+}
+
+/// Add standard target-independent passes that are tightly coupled with
+/// optimized register allocation, including coalescing, machine instruction
+/// scheduling, and register allocation itself.
+void TargetPassConfig::addOptimizedRegAlloc() {
+ addPass(&DetectDeadLanesID);
+
+ addPass(&ProcessImplicitDefsID);
+
+ // LiveVariables currently requires pure SSA form.
+ //
+ // FIXME: Once TwoAddressInstruction pass no longer uses kill flags,
+ // LiveVariables can be removed completely, and LiveIntervals can be directly
+ // computed. (We still either need to regenerate kill flags after regalloc, or
+ // preferably fix the scavenger to not depend on them).
+ // FIXME: UnreachableMachineBlockElim is a dependent pass of LiveVariables.
+ // When LiveVariables is removed, this has to be removed/moved as well.
+ // Explicit addition of UnreachableMachineBlockElim allows stopping before or
+ // after it with -stop-before/-stop-after.
+ addPass(&UnreachableMachineBlockElimID);
+ addPass(&LiveVariablesID);
+
+ // Edge splitting is smarter with machine loop info.
+ addPass(&MachineLoopInfoID);
+ addPass(&PHIEliminationID);
+
+ // Eventually, we want to run LiveIntervals before PHI elimination.
+ if (EarlyLiveIntervals)
+ addPass(&LiveIntervalsID);
+
+ addPass(&TwoAddressInstructionPassID);
+ addPass(&RegisterCoalescerID);
+
+ // The machine scheduler may accidentally create disconnected components
+ // when moving subregister definitions around; avoid this by splitting them into
+ // separate vregs beforehand. Splitting can also improve register allocation quality.
+ addPass(&RenameIndependentSubregsID);
+
+ // PreRA instruction scheduling.
+ addPass(&MachineSchedulerID);
+
+ if (addRegAssignAndRewriteOptimized()) {
+ // Perform stack slot coloring and post-ra machine LICM.
+ addPass(&StackSlotColoringID);
+
+ // Allow targets to expand pseudo instructions depending on the choice of
+ // registers before MachineCopyPropagation.
+ addPostRewrite();
+
+ // Copy propagate to forward register uses and try to eliminate COPYs that
+ // were not coalesced.
+ addPass(&MachineCopyPropagationID);
+
+ // Run post-ra machine LICM to hoist reloads / remats.
+ //
+ // FIXME: can this move into MachineLateOptimization?
+ addPass(&MachineLICMID);
+ }
+}
+
+//===---------------------------------------------------------------------===//
+/// Post RegAlloc Pass Configuration
+//===---------------------------------------------------------------------===//
+
+/// Add passes that optimize machine instructions after register allocation.
+void TargetPassConfig::addMachineLateOptimization() {
+ // Cleanup of redundant immediate/address loads.
+ addPass(&MachineLateInstrsCleanupID);
+
+ // Branch folding must be run after regalloc and prolog/epilog insertion.
+ addPass(&BranchFolderPassID);
+
+ // Tail duplication.
+ // Note that tail duplication just increases code size and degrades
+ // performance for targets that require structured control flow.
+ // In addition, it can make the CFG irreducible. Thus we disable it.
+ if (!TM->requiresStructuredCFG())
+ addPass(&TailDuplicateID);
+
+ // Copy propagation.
+ addPass(&MachineCopyPropagationID);
+}
+
+/// Add standard GC passes.
+bool TargetPassConfig::addGCPasses() {
+ addPass(&GCMachineCodeAnalysisID);
+ return true;
+}
+
+/// Add standard basic block placement passes.
+void TargetPassConfig::addBlockPlacement() {
+ if (EnableFSDiscriminator) {
+ addPass(createMIRAddFSDiscriminatorsPass(
+ sampleprof::FSDiscriminatorPass::Pass2));
+ const std::string ProfileFile = getFSProfileFile(TM);
+ if (!ProfileFile.empty() && !DisableLayoutFSProfileLoader)
+ addPass(createMIRProfileLoaderPass(ProfileFile, getFSRemappingFile(TM),
+ sampleprof::FSDiscriminatorPass::Pass2,
+ nullptr));
+ }
+ if (addPass(&MachineBlockPlacementID)) {
+ // Run a separate pass to collect block placement statistics.
+ if (EnableBlockPlacementStats)
+ addPass(&MachineBlockPlacementStatsID);
+ }
+}
+
+//===---------------------------------------------------------------------===//
+/// GlobalISel Configuration
+//===---------------------------------------------------------------------===//
+bool TargetPassConfig::isGlobalISelAbortEnabled() const {
+ return TM->Options.GlobalISelAbort == GlobalISelAbortMode::Enable;
+}
+
+bool TargetPassConfig::reportDiagnosticWhenGlobalISelFallback() const {
+ return TM->Options.GlobalISelAbort == GlobalISelAbortMode::DisableWithDiag;
+}
+
+bool TargetPassConfig::isGISelCSEEnabled() const {
+ return true;
+}
+
+std::unique_ptr<CSEConfigBase> TargetPassConfig::getCSEConfig() const {
+ return std::make_unique<CSEConfigBase>();
+}
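Targets that want the optimization-level-aware CSE configuration for GlobalISel typically override getCSEConfig instead of using the base CSEConfigBase returned above. A short sketch of the common pattern (MyTargetPassConfig is hypothetical; getStandardCSEConfigForOpt is the helper in-tree targets generally use for this):

    std::unique_ptr<CSEConfigBase> MyTargetPassConfig::getCSEConfig() const {
      // Enable the full CSE configuration when optimizing, a minimal one at -O0.
      return getStandardCSEConfigForOpt(TM->getOptLevel());
    }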
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetRegisterInfo.cpp
new file mode 100644
index 000000000000..77d2dfcf2323
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -0,0 +1,678 @@
+//==- TargetRegisterInfo.cpp - Target Register Information Implementation --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetRegisterInfo interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Printable.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
+#define DEBUG_TYPE "target-reg-info"
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+ HugeSizeForSplit("huge-size-for-split", cl::Hidden,
+ cl::desc("A threshold of live range size which may cause "
+ "high compile time cost in global splitting."),
+ cl::init(5000));
+
+TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
+ regclass_iterator RCB, regclass_iterator RCE,
+ const char *const *SRINames,
+ const LaneBitmask *SRILaneMasks,
+ LaneBitmask SRICoveringLanes,
+ const RegClassInfo *const RCIs,
+ unsigned Mode)
+ : InfoDesc(ID), SubRegIndexNames(SRINames),
+ SubRegIndexLaneMasks(SRILaneMasks),
+ RegClassBegin(RCB), RegClassEnd(RCE),
+ CoveringLanes(SRICoveringLanes),
+ RCInfos(RCIs), HwMode(Mode) {
+}
+
+TargetRegisterInfo::~TargetRegisterInfo() = default;
+
+bool TargetRegisterInfo::shouldRegionSplitForVirtReg(
+ const MachineFunction &MF, const LiveInterval &VirtReg) const {
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineInstr *MI = MRI.getUniqueVRegDef(VirtReg.reg());
+ if (MI && TII->isTriviallyReMaterializable(*MI) &&
+ VirtReg.size() > HugeSizeForSplit)
+ return false;
+ return true;
+}
+
+void TargetRegisterInfo::markSuperRegs(BitVector &RegisterSet,
+ MCRegister Reg) const {
+ for (MCPhysReg SR : superregs_inclusive(Reg))
+ RegisterSet.set(SR);
+}
+
+bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
+ ArrayRef<MCPhysReg> Exceptions) const {
+ // Check that all super registers of reserved regs are reserved as well.
+ BitVector Checked(getNumRegs());
+ for (unsigned Reg : RegisterSet.set_bits()) {
+ if (Checked[Reg])
+ continue;
+ for (MCPhysReg SR : superregs(Reg)) {
+ if (!RegisterSet[SR] && !is_contained(Exceptions, Reg)) {
+ dbgs() << "Error: Super register " << printReg(SR, this)
+ << " of reserved register " << printReg(Reg, this)
+ << " is not reserved.\n";
+ return false;
+ }
+
+ // We check superregs transitively, so we can remember this for later
+ // to avoid a compile-time explosion in deep register hierarchies.
+ Checked.set(SR);
+ }
+ }
+ return true;
+}
+
+namespace llvm {
+
+Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
+ unsigned SubIdx, const MachineRegisterInfo *MRI) {
+ return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) {
+ if (!Reg)
+ OS << "$noreg";
+ else if (Register::isStackSlot(Reg))
+ OS << "SS#" << Register::stackSlot2Index(Reg);
+ else if (Reg.isVirtual()) {
+ StringRef Name = MRI ? MRI->getVRegName(Reg) : "";
+ if (Name != "") {
+ OS << '%' << Name;
+ } else {
+ OS << '%' << Register::virtReg2Index(Reg);
+ }
+ } else if (!TRI)
+ OS << '$' << "physreg" << Reg;
+ else if (Reg < TRI->getNumRegs()) {
+ OS << '$';
+ printLowerCase(TRI->getName(Reg), OS);
+ } else
+ llvm_unreachable("Register kind is unsupported.");
+
+ if (SubIdx) {
+ if (TRI)
+ OS << ':' << TRI->getSubRegIndexName(SubIdx);
+ else
+ OS << ":sub(" << SubIdx << ')';
+ }
+ });
+}
+
+Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+ return Printable([Unit, TRI](raw_ostream &OS) {
+ // Generic printout when TRI is missing.
+ if (!TRI) {
+ OS << "Unit~" << Unit;
+ return;
+ }
+
+ // Check for invalid register units.
+ if (Unit >= TRI->getNumRegUnits()) {
+ OS << "BadUnit~" << Unit;
+ return;
+ }
+
+ // Normal units have at least one root.
+ MCRegUnitRootIterator Roots(Unit, TRI);
+ assert(Roots.isValid() && "Unit has no roots.");
+ OS << TRI->getName(*Roots);
+ for (++Roots; Roots.isValid(); ++Roots)
+ OS << '~' << TRI->getName(*Roots);
+ });
+}
+
+Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+ return Printable([Unit, TRI](raw_ostream &OS) {
+ if (Register::isVirtualRegister(Unit)) {
+ OS << '%' << Register::virtReg2Index(Unit);
+ } else {
+ OS << printRegUnit(Unit, TRI);
+ }
+ });
+}
+
+Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI) {
+ return Printable([Reg, &RegInfo, TRI](raw_ostream &OS) {
+ if (RegInfo.getRegClassOrNull(Reg))
+ OS << StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
+ else if (RegInfo.getRegBankOrNull(Reg))
+ OS << StringRef(RegInfo.getRegBankOrNull(Reg)->getName()).lower();
+ else {
+ OS << "_";
+ assert((RegInfo.def_empty(Reg) || RegInfo.getType(Reg).isValid()) &&
+ "Generic registers must have a valid type");
+ }
+ });
+}
+
+} // end namespace llvm
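The printers above return Printable objects meant to be streamed directly into debug output. A typical illustrative use inside a machine pass that defines DEBUG_TYPE, assuming SrcReg, DstReg, TRI, and MRI are already in scope:

    LLVM_DEBUG(dbgs() << "rewriting " << printReg(SrcReg, TRI) << " -> "
                      << printReg(DstReg, TRI, /*SubIdx=*/0, &MRI)
                      << " in class " << printRegClassOrBank(DstReg, MRI, TRI)
                      << '\n');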
+
+/// getAllocatableClass - Return the maximal subclass of the given register
+/// class that is allocatable, or NULL.
+const TargetRegisterClass *
+TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
+ if (!RC || RC->isAllocatable())
+ return RC;
+
+ for (BitMaskClassIterator It(RC->getSubClassMask(), *this); It.isValid();
+ ++It) {
+ const TargetRegisterClass *SubRC = getRegClass(It.getID());
+ if (SubRC->isAllocatable())
+ return SubRC;
+ }
+ return nullptr;
+}
+
+/// getMinimalPhysRegClass - Returns the Register Class of a physical
+/// register of the given type, picking the smallest (most specific) sub
+/// register class of the right type that contains this physreg.
+const TargetRegisterClass *
+TargetRegisterInfo::getMinimalPhysRegClass(MCRegister reg, MVT VT) const {
+ assert(Register::isPhysicalRegister(reg) &&
+ "reg must be a physical register");
+
+ // Pick the smallest sub register class of the right type that contains
+ // this physreg.
+ const TargetRegisterClass* BestRC = nullptr;
+ for (const TargetRegisterClass* RC : regclasses()) {
+ if ((VT == MVT::Other || isTypeLegalForClass(*RC, VT)) &&
+ RC->contains(reg) && (!BestRC || BestRC->hasSubClass(RC)))
+ BestRC = RC;
+ }
+
+ assert(BestRC && "Couldn't find the register class");
+ return BestRC;
+}
+
+const TargetRegisterClass *
+TargetRegisterInfo::getMinimalPhysRegClassLLT(MCRegister reg, LLT Ty) const {
+ assert(Register::isPhysicalRegister(reg) &&
+ "reg must be a physical register");
+
+ // Pick the smallest sub register class of the right type that contains
+ // this physreg.
+ const TargetRegisterClass *BestRC = nullptr;
+ for (const TargetRegisterClass *RC : regclasses()) {
+ if ((!Ty.isValid() || isTypeLegalForClass(*RC, Ty)) && RC->contains(reg) &&
+ (!BestRC || BestRC->hasSubClass(RC)))
+ BestRC = RC;
+ }
+
+ return BestRC;
+}
+
+/// getAllocatableSetForRC - Toggle the bits that represent allocatable
+/// registers for the specific register class.
+static void getAllocatableSetForRC(const MachineFunction &MF,
+ const TargetRegisterClass *RC, BitVector &R){
+ assert(RC->isAllocatable() && "invalid for nonallocatable sets");
+ ArrayRef<MCPhysReg> Order = RC->getRawAllocationOrder(MF);
+ for (MCPhysReg PR : Order)
+ R.set(PR);
+}
+
+BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
+ const TargetRegisterClass *RC) const {
+ BitVector Allocatable(getNumRegs());
+ if (RC) {
+ // A register class with no allocatable subclass returns an empty set.
+ const TargetRegisterClass *SubClass = getAllocatableClass(RC);
+ if (SubClass)
+ getAllocatableSetForRC(MF, SubClass, Allocatable);
+ } else {
+ for (const TargetRegisterClass *C : regclasses())
+ if (C->isAllocatable())
+ getAllocatableSetForRC(MF, C, Allocatable);
+ }
+
+ // Mask out the reserved registers
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const BitVector &Reserved = MRI.getReservedRegs();
+ Allocatable.reset(Reserved);
+
+ return Allocatable;
+}
+
+static inline
+const TargetRegisterClass *firstCommonClass(const uint32_t *A,
+ const uint32_t *B,
+ const TargetRegisterInfo *TRI) {
+ for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32)
+ if (unsigned Common = *A++ & *B++)
+ return TRI->getRegClass(I + llvm::countr_zero(Common));
+ return nullptr;
+}
+
+const TargetRegisterClass *
+TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B) const {
+ // First take care of the trivial cases.
+ if (A == B)
+ return A;
+ if (!A || !B)
+ return nullptr;
+
+ // Register classes are ordered topologically, so the largest common
+ // sub-class is the common sub-class with the smallest ID.
+ return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this);
+}
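The scan in firstCommonClass walks the sub-class bit masks one 32-bit word at a time; because classes are topologically ordered, the first common set bit is the largest common sub-class. A self-contained sketch of the same scan on raw masks, with the mask layout (32 classes per word, lowest ID first) assumed to mirror the code above:

    #include <bit>
    #include <cstdint>

    // Returns the lowest class ID present in both masks, or NumClasses if none.
    unsigned firstCommonID(const uint32_t *A, const uint32_t *B,
                           unsigned NumClasses) {
      for (unsigned I = 0; I < NumClasses; I += 32)
        if (uint32_t Common = *A++ & *B++)
          return I + std::countr_zero(Common);
      return NumClasses;
    }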
+
+const TargetRegisterClass *
+TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned Idx) const {
+ assert(A && B && "Missing register class");
+ assert(Idx && "Bad sub-register index");
+
+ // Find Idx in the list of super-register indices.
+ for (SuperRegClassIterator RCI(B, this); RCI.isValid(); ++RCI)
+ if (RCI.getSubReg() == Idx)
+ // The bit mask contains all register classes that are projected into B
+ // by Idx. Find a class that is also a sub-class of A.
+ return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this);
+ return nullptr;
+}
+
+const TargetRegisterClass *TargetRegisterInfo::
+getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
+ const TargetRegisterClass *RCB, unsigned SubB,
+ unsigned &PreA, unsigned &PreB) const {
+ assert(RCA && SubA && RCB && SubB && "Invalid arguments");
+
+ // Search all pairs of sub-register indices that project into RCA and RCB
+ // respectively. This is quadratic, but usually the sets are very small. On
+ // most targets like X86, there will only be a single sub-register index
+ // (e.g., sub_16bit projecting into GR16).
+ //
+ // The worst case is a register class like DPR on ARM.
+ // We have indices dsub_0..dsub_7 projecting into that class.
+ //
+ // It is very common that one register class is a sub-register of the other.
+ // Arrange for RCA to be the larger register so the answer will be found in
+ // the first iteration. This makes the search linear for the most common
+ // case.
+ const TargetRegisterClass *BestRC = nullptr;
+ unsigned *BestPreA = &PreA;
+ unsigned *BestPreB = &PreB;
+ if (getRegSizeInBits(*RCA) < getRegSizeInBits(*RCB)) {
+ std::swap(RCA, RCB);
+ std::swap(SubA, SubB);
+ std::swap(BestPreA, BestPreB);
+ }
+
+ // Also terminate the search once we have found a register class as small as
+ // RCA.
+ unsigned MinSize = getRegSizeInBits(*RCA);
+
+ for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) {
+ unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA);
+ for (SuperRegClassIterator IB(RCB, this, true); IB.isValid(); ++IB) {
+ // Check if a common super-register class exists for this index pair.
+ const TargetRegisterClass *RC =
+ firstCommonClass(IA.getMask(), IB.getMask(), this);
+ if (!RC || getRegSizeInBits(*RC) < MinSize)
+ continue;
+
+ // The indexes must compose identically: PreA+SubA == PreB+SubB.
+ unsigned FinalB = composeSubRegIndices(IB.getSubReg(), SubB);
+ if (FinalA != FinalB)
+ continue;
+
+ // Is RC a better candidate than BestRC?
+ if (BestRC && getRegSizeInBits(*RC) >= getRegSizeInBits(*BestRC))
+ continue;
+
+ // Yes, RC is the smallest super-register seen so far.
+ BestRC = RC;
+ *BestPreA = IA.getSubReg();
+ *BestPreB = IB.getSubReg();
+
+ // Bail early if we reached MinSize. We won't find a better candidate.
+ if (getRegSizeInBits(*BestRC) == MinSize)
+ return BestRC;
+ }
+ }
+ return BestRC;
+}
+
+/// Check if the registers defined by the pair (RegisterClass, SubReg)
+/// share the same register file.
+static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) {
+ // Same register class.
+ if (DefRC == SrcRC)
+ return true;
+
+ // Both operands are sub registers. Check if they share a register class.
+ unsigned SrcIdx, DefIdx;
+ if (SrcSubReg && DefSubReg) {
+ return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
+ SrcIdx, DefIdx) != nullptr;
+ }
+
+ // At most one of the registers is a sub register; make it Src to avoid
+ // duplicating the test.
+ if (!SrcSubReg) {
+ std::swap(DefSubReg, SrcSubReg);
+ std::swap(DefRC, SrcRC);
+ }
+
+ // One of the registers is a sub register; check if we can get a superclass.
+ if (SrcSubReg)
+ return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr;
+
+ // Plain copy.
+ return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
+bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ // If this source does not incur a cross register bank copy, use it.
+ return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg);
+}
+
+// Compute target-independent register allocator hints to help eliminate copies.
+bool TargetRegisterInfo::getRegAllocationHints(
+ Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+ const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const std::pair<unsigned, SmallVector<Register, 4>> &Hints_MRI =
+ MRI.getRegAllocationHints(VirtReg);
+
+ SmallSet<Register, 32> HintedRegs;
+ // First hint may be a target hint.
+ bool Skip = (Hints_MRI.first != 0);
+ for (auto Reg : Hints_MRI.second) {
+ if (Skip) {
+ Skip = false;
+ continue;
+ }
+
+ // Target-independent hints are either a physical or a virtual register.
+ Register Phys = Reg;
+ if (VRM && Phys.isVirtual())
+ Phys = VRM->getPhys(Phys);
+
+ // Don't add the same reg twice (Hints_MRI may contain multiple virtual
+ // registers allocated to the same physreg).
+ if (!HintedRegs.insert(Phys).second)
+ continue;
+ // Check that Phys is a valid hint in VirtReg's register class.
+ if (!Phys.isPhysical())
+ continue;
+ if (MRI.isReserved(Phys))
+ continue;
+ // Check that Phys is in the allocation order. We shouldn't heed hints
+ // from VirtReg's register class if they aren't in the allocation order. The
+ // target probably has a reason for removing the register.
+ if (!is_contained(Order, Phys))
+ continue;
+
+ // All clear, tell the register allocator to prefer this register.
+ Hints.push_back(Phys);
+ }
+ return false;
+}
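Targets commonly layer their own hinting policy on top of this generic implementation by overriding the hook and then delegating to it. A hedged sketch; MyTargetRegisterInfo and its "prefer the first register in the allocation order" policy are invented for illustration:

    bool MyTargetRegisterInfo::getRegAllocationHints(
        Register VirtReg, ArrayRef<MCPhysReg> Order,
        SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
        const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
      // Translate the generic copy hints from MRI first.
      TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM,
                                                Matrix);
      // Then append an invented target preference.
      if (!Order.empty() && !is_contained(Hints, Order[0]))
        Hints.push_back(Order[0]);
      return false; // Hints are soft preferences, not a mandatory order.
    }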
+
+bool TargetRegisterInfo::isCalleeSavedPhysReg(
+ MCRegister PhysReg, const MachineFunction &MF) const {
+ if (PhysReg == 0)
+ return false;
+ const uint32_t *callerPreservedRegs =
+ getCallPreservedMask(MF, MF.getFunction().getCallingConv());
+ if (callerPreservedRegs) {
+ assert(Register::isPhysicalRegister(PhysReg) &&
+ "Expected physical register");
+ return (callerPreservedRegs[PhysReg / 32] >> PhysReg % 32) & 1;
+ }
+ return false;
+}
+
+bool TargetRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ return !MF.getFunction().hasFnAttribute("no-realign-stack");
+}
+
+bool TargetRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const Function &F = MF.getFunction();
+ return F.hasFnAttribute("stackrealign") ||
+ (MFI.getMaxAlign() > TFI->getStackAlign()) ||
+ F.hasFnAttribute(Attribute::StackAlignment);
+}
+
+bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,
+ const uint32_t *mask1) const {
+ unsigned N = (getNumRegs()+31) / 32;
+ for (unsigned I = 0; I < N; ++I)
+ if ((mask0[I] & mask1[I]) != mask0[I])
+ return false;
+ return true;
+}
+
+unsigned
+TargetRegisterInfo::getRegSizeInBits(Register Reg,
+ const MachineRegisterInfo &MRI) const {
+ const TargetRegisterClass *RC{};
+ if (Reg.isPhysical()) {
+ // The size is not directly available for physical registers.
+ // Instead, we need to access a register class that contains Reg and
+ // get the size of that register class.
+ RC = getMinimalPhysRegClass(Reg);
+ } else {
+ LLT Ty = MRI.getType(Reg);
+ unsigned RegSize = Ty.isValid() ? Ty.getSizeInBits() : 0;
+ // If Reg is not a generic register, query the register class to
+ // get its size.
+ if (RegSize)
+ return RegSize;
+ // Since Reg is not a generic register, it must have a register class.
+ RC = MRI.getRegClass(Reg);
+ }
+ assert(RC && "Unable to deduce the register class");
+ return getRegSizeInBits(*RC);
+}
+
+bool TargetRegisterInfo::getCoveringSubRegIndexes(
+ const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
+ LaneBitmask LaneMask, SmallVectorImpl<unsigned> &NeededIndexes) const {
+ SmallVector<unsigned, 8> PossibleIndexes;
+ unsigned BestIdx = 0;
+ unsigned BestCover = 0;
+
+ for (unsigned Idx = 1, E = getNumSubRegIndices(); Idx < E; ++Idx) {
+ // Is this index even compatible with the given class?
+ if (getSubClassWithSubReg(RC, Idx) != RC)
+ continue;
+ LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LaneMask) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // The index must not cover any lanes outside \p LaneMask.
+ if ((SubRegMask & ~LaneMask).any())
+ continue;
+
+ unsigned PopCount = SubRegMask.getNumLanes();
+ PossibleIndexes.push_back(Idx);
+ if (PopCount > BestCover) {
+ BestCover = PopCount;
+ BestIdx = Idx;
+ }
+ }
+
+ // Abort if we cannot possibly implement the COPY with the given indexes.
+ if (BestIdx == 0)
+ return false;
+
+ NeededIndexes.push_back(BestIdx);
+
+ // Greedy heuristic: keep iterating, taking the best covering subreg index
+ // each time.
+ LaneBitmask LanesLeft = LaneMask & ~getSubRegIndexLaneMask(BestIdx);
+ while (LanesLeft.any()) {
+ unsigned BestIdx = 0;
+ int BestCover = std::numeric_limits<int>::min();
+ for (unsigned Idx : PossibleIndexes) {
+ LaneBitmask SubRegMask = getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LanesLeft) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // Do not cover already-covered lanes to avoid creating cycles
+ // in copy bundles (= bundle contains copies that write to the
+ // registers).
+ if ((SubRegMask & ~LanesLeft).any())
+ continue;
+
+ // Try to cover as many of the remaining lanes as possible.
+ const int Cover = (SubRegMask & LanesLeft).getNumLanes();
+ if (Cover > BestCover) {
+ BestCover = Cover;
+ BestIdx = Idx;
+ }
+ }
+
+ if (BestIdx == 0)
+ return false; // Impossible to handle
+
+ NeededIndexes.push_back(BestIdx);
+
+ LanesLeft &= ~getSubRegIndexLaneMask(BestIdx);
+ }
+
+ return BestIdx;
+}
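The loop above is a plain greedy set cover over lane masks: pick the index that covers the most remaining lanes without touching lanes that are already covered, then repeat until nothing is left. A self-contained sketch of the same idea on raw 64-bit masks; the types and names are illustrative, not an LLVM API:

    #include <bit>
    #include <cstdint>
    #include <vector>

    // Pick candidate masks until Target is covered; record the chosen indices.
    bool greedyCover(uint64_t Target, const std::vector<uint64_t> &Candidates,
                     std::vector<unsigned> &Picked) {
      uint64_t Left = Target;
      while (Left) {
        int Best = -1, BestCover = 0;
        for (unsigned I = 0, E = Candidates.size(); I != E; ++I) {
          uint64_t M = Candidates[I];
          if (M & ~Left)                        // would re-cover or overshoot
            continue;
          int Cover = std::popcount(M & Left);  // lanes this pick adds
          if (Cover > BestCover) {
            BestCover = Cover;
            Best = (int)I;
          }
        }
        if (Best < 0)
          return false;                         // the mask cannot be covered
        Picked.push_back((unsigned)Best);
        Left &= ~Candidates[Best];
      }
      return true;
    }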
+
+Register
+TargetRegisterInfo::lookThruCopyLike(Register SrcReg,
+ const MachineRegisterInfo *MRI) const {
+ while (true) {
+ const MachineInstr *MI = MRI->getVRegDef(SrcReg);
+ if (!MI->isCopyLike())
+ return SrcReg;
+
+ Register CopySrcReg;
+ if (MI->isCopy())
+ CopySrcReg = MI->getOperand(1).getReg();
+ else {
+ assert(MI->isSubregToReg() && "Bad opcode for lookThruCopyLike");
+ CopySrcReg = MI->getOperand(2).getReg();
+ }
+
+ if (!CopySrcReg.isVirtual())
+ return CopySrcReg;
+
+ SrcReg = CopySrcReg;
+ }
+}
+
+Register TargetRegisterInfo::lookThruSingleUseCopyChain(
+ Register SrcReg, const MachineRegisterInfo *MRI) const {
+ while (true) {
+ const MachineInstr *MI = MRI->getVRegDef(SrcReg);
+ // Found the real definition, return it if it has a single use.
+ if (!MI->isCopyLike())
+ return MRI->hasOneNonDBGUse(SrcReg) ? SrcReg : Register();
+
+ Register CopySrcReg;
+ if (MI->isCopy())
+ CopySrcReg = MI->getOperand(1).getReg();
+ else {
+ assert(MI->isSubregToReg() && "Bad opcode for lookThruCopyLike");
+ CopySrcReg = MI->getOperand(2).getReg();
+ }
+
+ // Continue only if the next definition in the chain is for a virtual
+ // register that has a single use.
+ if (!CopySrcReg.isVirtual() || !MRI->hasOneNonDBGUse(CopySrcReg))
+ return Register();
+
+ SrcReg = CopySrcReg;
+ }
+}
+
+void TargetRegisterInfo::getOffsetOpcodes(
+ const StackOffset &Offset, SmallVectorImpl<uint64_t> &Ops) const {
+ assert(!Offset.getScalable() && "Scalable offsets are not handled");
+ DIExpression::appendOffset(Ops, Offset.getFixed());
+}
+
+DIExpression *
+TargetRegisterInfo::prependOffsetExpression(const DIExpression *Expr,
+ unsigned PrependFlags,
+ const StackOffset &Offset) const {
+ assert((PrependFlags &
+ ~(DIExpression::DerefBefore | DIExpression::DerefAfter |
+ DIExpression::StackValue | DIExpression::EntryValue)) == 0 &&
+ "Unsupported prepend flag");
+ SmallVector<uint64_t, 16> OffsetExpr;
+ if (PrependFlags & DIExpression::DerefBefore)
+ OffsetExpr.push_back(dwarf::DW_OP_deref);
+ getOffsetOpcodes(Offset, OffsetExpr);
+ if (PrependFlags & DIExpression::DerefAfter)
+ OffsetExpr.push_back(dwarf::DW_OP_deref);
+ return DIExpression::prependOpcodes(Expr, OffsetExpr,
+ PrependFlags & DIExpression::StackValue,
+ PrependFlags & DIExpression::EntryValue);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex,
+ const TargetRegisterInfo *TRI) {
+ dbgs() << printReg(Reg, TRI, SubRegIndex) << "\n";
+}
+#endif
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetSchedule.cpp
new file mode 100644
index 000000000000..dba84950f49d
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -0,0 +1,343 @@
+//===- llvm/Target/TargetSchedule.cpp - Sched Machine Model ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a wrapper around MCSchedModel that allows the interface
+// to benefit from information currently only available in TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <numeric>
+
+using namespace llvm;
+
+static cl::opt<bool> EnableSchedModel("schedmodel", cl::Hidden, cl::init(true),
+ cl::desc("Use TargetSchedModel for latency lookup"));
+
+static cl::opt<bool> EnableSchedItins("scheditins", cl::Hidden, cl::init(true),
+ cl::desc("Use InstrItineraryData for latency lookup"));
+
+bool TargetSchedModel::hasInstrSchedModel() const {
+ return EnableSchedModel && SchedModel.hasInstrSchedModel();
+}
+
+bool TargetSchedModel::hasInstrItineraries() const {
+ return EnableSchedItins && !InstrItins.isEmpty();
+}
+
+void TargetSchedModel::init(const TargetSubtargetInfo *TSInfo) {
+ STI = TSInfo;
+ SchedModel = TSInfo->getSchedModel();
+ TII = TSInfo->getInstrInfo();
+ STI->initInstrItins(InstrItins);
+
+ unsigned NumRes = SchedModel.getNumProcResourceKinds();
+ ResourceFactors.resize(NumRes);
+ ResourceLCM = SchedModel.IssueWidth;
+ for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+ unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+ if (NumUnits > 0)
+ ResourceLCM = std::lcm(ResourceLCM, NumUnits);
+ }
+ MicroOpFactor = ResourceLCM / SchedModel.IssueWidth;
+ for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+ unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+ ResourceFactors[Idx] = NumUnits ? (ResourceLCM / NumUnits) : 0;
+ }
+}
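To make the factor computation above concrete, a small invented configuration: with IssueWidth = 4 and two resources having 2 and 3 units respectively, ResourceLCM = lcm(4, 2, 3) = 12, MicroOpFactor = 12 / 4 = 3, and the per-resource factors are 12 / 2 = 6 and 12 / 3 = 4. Scaling everything by the least common multiple of the unit counts lets later cycle accounting compare micro-op and resource usage on one common integer scale without fractions.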
+
+/// Returns true only if the instruction is specified as single issue.
+bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->BeginGroup;
+ }
+ return false;
+}
+
+bool TargetSchedModel::mustEndGroup(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->EndGroup;
+ }
+ return false;
+}
+
+unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrItineraries()) {
+ int UOps = InstrItins.getNumMicroOps(MI->getDesc().getSchedClass());
+ return (UOps >= 0) ? UOps : TII->getNumMicroOps(&InstrItins, *MI);
+ }
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->NumMicroOps;
+ }
+ return MI->isTransient() ? 0 : 1;
+}
+
+// The machine model may explicitly specify an invalid latency, which
+// effectively means infinite latency. Since users of the TargetSchedule API
+// don't know how to handle this, we convert it to a very large latency that is
+// easy to distinguish when debugging the DAG but won't induce overflow.
+static unsigned capLatency(int Cycles) {
+ return Cycles >= 0 ? Cycles : 1000;
+}
+
+/// Return the MCSchedClassDesc for this instruction. Some SchedClasses require
+/// evaluation of predicates that depend on instruction operands or flags.
+const MCSchedClassDesc *TargetSchedModel::
+resolveSchedClass(const MachineInstr *MI) const {
+ // Get the definition's scheduling class descriptor from this machine model.
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+ if (!SCDesc->isValid())
+ return SCDesc;
+
+#ifndef NDEBUG
+ unsigned NIter = 0;
+#endif
+ while (SCDesc->isVariant()) {
+ assert(++NIter < 6 && "Variants are nested deeper than the magic number");
+
+ SchedClass = STI->resolveSchedClass(SchedClass, MI, this);
+ SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+ }
+ return SCDesc;
+}
+
+/// Find the def index of this operand. This index maps to the machine model and
+/// is independent of use operands. Def operands may be reordered with uses or
+/// merged with uses without affecting the def index (e.g. before/after
+/// regalloc). However, an instruction's def operands must never be reordered
+/// with respect to each other.
+static unsigned findDefIdx(const MachineInstr *MI, unsigned DefOperIdx) {
+ unsigned DefIdx = 0;
+ for (unsigned i = 0; i != DefOperIdx; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef())
+ ++DefIdx;
+ }
+ return DefIdx;
+}
+
+/// Find the use index of this operand. This is independent of the instruction's
+/// def operands.
+///
+/// Note that uses are not determined by the operand's isUse property, which
+/// is simply the inverse of isDef. Here we consider any readsReg operand to be
+/// a "use". The machine model allows an operand to be both a Def and Use.
+static unsigned findUseIdx(const MachineInstr *MI, unsigned UseOperIdx) {
+ unsigned UseIdx = 0;
+ for (unsigned i = 0; i != UseOperIdx; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.readsReg() && !MO.isDef())
+ ++UseIdx;
+ }
+ return UseIdx;
+}
+
+// Top-level API for clients that know the operand indices.
+unsigned TargetSchedModel::computeOperandLatency(
+ const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *UseMI, unsigned UseOperIdx) const {
+
+ if (!hasInstrSchedModel() && !hasInstrItineraries())
+ return TII->defaultDefLatency(SchedModel, *DefMI);
+
+ if (hasInstrItineraries()) {
+ int OperLatency = 0;
+ if (UseMI) {
+ OperLatency = TII->getOperandLatency(&InstrItins, *DefMI, DefOperIdx,
+ *UseMI, UseOperIdx);
+ }
+ else {
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ OperLatency = InstrItins.getOperandCycle(DefClass, DefOperIdx);
+ }
+ if (OperLatency >= 0)
+ return OperLatency;
+
+ // No operand latency was found.
+ unsigned InstrLatency = TII->getInstrLatency(&InstrItins, *DefMI);
+
+ // Expected latency is the max of the stage latency and itinerary props.
+ // Rather than directly querying InstrItins stage latency, we call a TII
+ // hook to allow subtargets to specialize latency. This hook is only
+ // applicable to the InstrItins model. InstrSchedModel should model all
+ // special cases without TII hooks.
+ InstrLatency =
+ std::max(InstrLatency, TII->defaultDefLatency(SchedModel, *DefMI));
+ return InstrLatency;
+ }
+ // hasInstrSchedModel()
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+ unsigned DefIdx = findDefIdx(DefMI, DefOperIdx);
+ if (DefIdx < SCDesc->NumWriteLatencyEntries) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry =
+ STI->getWriteLatencyEntry(SCDesc, DefIdx);
+ unsigned WriteID = WLEntry->WriteResourceID;
+ unsigned Latency = capLatency(WLEntry->Cycles);
+ if (!UseMI)
+ return Latency;
+
+ // Lookup the use's latency adjustment in SubtargetInfo.
+ const MCSchedClassDesc *UseDesc = resolveSchedClass(UseMI);
+ if (UseDesc->NumReadAdvanceEntries == 0)
+ return Latency;
+ unsigned UseIdx = findUseIdx(UseMI, UseOperIdx);
+ int Advance = STI->getReadAdvanceCycles(UseDesc, UseIdx, WriteID);
+ if (Advance > 0 && (unsigned)Advance > Latency) // unsigned wrap
+ return 0;
+ return Latency - Advance;
+ }
+ // If DefIdx does not exist in the model (e.g. implicit defs), then return
+ // unit latency (defaultDefLatency may be too conservative).
+#ifndef NDEBUG
+ if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit() &&
+ !DefMI->getDesc().operands()[DefOperIdx].isOptionalDef() &&
+ SchedModel.isComplete()) {
+ errs() << "DefIdx " << DefIdx << " exceeds machine model writes for "
+ << *DefMI << " (Try with MCSchedModel.CompleteModel set to false)";
+ llvm_unreachable("incomplete machine model");
+ }
+#endif
+ // FIXME: Automatically giving all implicit defs defaultDefLatency is
+ // undesirable. We should only do it for defs that are known to the MC
+ // desc like flags. Truly implicit defs should get 1 cycle latency.
+ return DefMI->isTransient() ? 0 : TII->defaultDefLatency(SchedModel, *DefMI);
+}
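Condensed, the scheduling-model branch above computes the def's write latency minus the use's ReadAdvance, with invalid write latencies capped first and the result clamped at zero. A toy restatement under those assumptions; the names and the free function are illustrative, not an LLVM API:

    // WriteCycles comes from the def's WriteLatencyEntry, ReadAdvance from the
    // use's ReadAdvance table; negative WriteCycles means "invalid/infinite".
    unsigned operandLatency(int WriteCycles, int ReadAdvance) {
      unsigned Latency = WriteCycles >= 0 ? (unsigned)WriteCycles : 1000u; // capLatency()
      if (ReadAdvance > 0 && (unsigned)ReadAdvance > Latency)
        return 0;                    // clamp at zero instead of wrapping
      return Latency - ReadAdvance;  // a negative ReadAdvance increases latency
    }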
+
+unsigned
+TargetSchedModel::computeInstrLatency(const MCSchedClassDesc &SCDesc) const {
+ return capLatency(MCSchedModel::computeInstrLatency(*STI, SCDesc));
+}
+
+unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const {
+ assert(hasInstrSchedModel() && "Only call this function with a SchedModel");
+ unsigned SCIdx = TII->get(Opcode).getSchedClass();
+ return capLatency(SchedModel.computeInstrLatency(*STI, SCIdx));
+}
+
+unsigned TargetSchedModel::computeInstrLatency(const MCInst &Inst) const {
+ if (hasInstrSchedModel())
+ return capLatency(SchedModel.computeInstrLatency(*STI, *TII, Inst));
+ return computeInstrLatency(Inst.getOpcode());
+}
+
+unsigned
+TargetSchedModel::computeInstrLatency(const MachineInstr *MI,
+ bool UseDefaultDefLatency) const {
+ // For the itinerary model, fall back to the old subtarget hook.
+ // Allow subtargets to compute Bundle latencies outside the machine model.
+ if (hasInstrItineraries() || MI->isBundle() ||
+ (!hasInstrSchedModel() && !UseDefaultDefLatency))
+ return TII->getInstrLatency(&InstrItins, *MI);
+
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(MI);
+ if (SCDesc->isValid())
+ return computeInstrLatency(*SCDesc);
+ }
+ return TII->defaultDefLatency(SchedModel, *MI);
+}
+
+unsigned TargetSchedModel::
+computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *DepMI) const {
+ if (!SchedModel.isOutOfOrder())
+ return 1;
+
+ // Out-of-order processor can dispatch WAW dependencies in the same cycle.
+
+ // Treat predication as a data dependency for out-of-order cpus. In-order
+ // cpus do not need to treat predicated writes specially.
+ //
+ // TODO: The following hack exists because predication passes do not
+ // correctly append imp-use operands, and readsReg() strangely returns false
+ // for predicated defs.
+ Register Reg = DefMI->getOperand(DefOperIdx).getReg();
+ const MachineFunction &MF = *DefMI->getMF();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(*DepMI))
+ return computeInstrLatency(DefMI);
+
+  // If we have a per-operand scheduling model, check if this def is writing
+  // an unbuffered resource. If so, it is treated like an in-order CPU.
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+ if (SCDesc->isValid()) {
+ for (const MCWriteProcResEntry *PRI = STI->getWriteProcResBegin(SCDesc),
+ *PRE = STI->getWriteProcResEnd(SCDesc); PRI != PRE; ++PRI) {
+ if (!SchedModel.getProcResource(PRI->ProcResourceIdx)->BufferSize)
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
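+// Reciprocal throughput is the average number of cycles between issues of
+// independent copies of MI; e.g. a value of 0.5 means two such instructions
+// can start per cycle, while 0.0 means the model has no estimate.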
+double
+TargetSchedModel::computeReciprocalThroughput(const MachineInstr *MI) const {
+ if (hasInstrItineraries()) {
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ return MCSchedModel::getReciprocalThroughput(SchedClass,
+ *getInstrItineraries());
+ }
+
+ if (hasInstrSchedModel())
+ return MCSchedModel::getReciprocalThroughput(*STI, *resolveSchedClass(MI));
+
+ return 0.0;
+}
+
+double
+TargetSchedModel::computeReciprocalThroughput(unsigned Opcode) const {
+ unsigned SchedClass = TII->get(Opcode).getSchedClass();
+ if (hasInstrItineraries())
+ return MCSchedModel::getReciprocalThroughput(SchedClass,
+ *getInstrItineraries());
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc &SCDesc = *SchedModel.getSchedClassDesc(SchedClass);
+ if (SCDesc.isValid() && !SCDesc.isVariant())
+ return MCSchedModel::getReciprocalThroughput(*STI, SCDesc);
+ }
+
+ return 0.0;
+}
+
+double
+TargetSchedModel::computeReciprocalThroughput(const MCInst &MI) const {
+ if (hasInstrSchedModel())
+ return SchedModel.getReciprocalThroughput(*STI, *TII, MI);
+ return computeReciprocalThroughput(MI.getOpcode());
+}
+
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
new file mode 100644
index 000000000000..ba2c8dda7de5
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -0,0 +1,60 @@
+//===- TargetSubtargetInfo.cpp - General Target Information ----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file describes the general parts of a Subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+TargetSubtargetInfo::TargetSubtargetInfo(
+ const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
+ ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetSubTypeKV> PD,
+ const MCWriteProcResEntry *WPR, const MCWriteLatencyEntry *WL,
+ const MCReadAdvanceEntry *RA, const InstrStage *IS, const unsigned *OC,
+ const unsigned *FP)
+ : MCSubtargetInfo(TT, CPU, TuneCPU, FS, PF, PD, WPR, WL, RA, IS, OC, FP) {}
+
+TargetSubtargetInfo::~TargetSubtargetInfo() = default;
+
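+// The hooks below return conservative defaults; targets override them in
+// their own TargetSubtargetInfo subclasses.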
+bool TargetSubtargetInfo::enableAtomicExpand() const {
+ return true;
+}
+
+bool TargetSubtargetInfo::enableIndirectBrExpand() const {
+ return false;
+}
+
+bool TargetSubtargetInfo::enableMachineScheduler() const {
+ return false;
+}
+
+bool TargetSubtargetInfo::enableJoinGlobalCopies() const {
+ return enableMachineScheduler();
+}
+
+bool TargetSubtargetInfo::enableRALocalReassignment(
+ CodeGenOpt::Level OptLevel) const {
+ return true;
+}
+
+bool TargetSubtargetInfo::enablePostRAScheduler() const {
+ return getSchedModel().PostRAScheduler;
+}
+
+bool TargetSubtargetInfo::enablePostRAMachineScheduler() const {
+ return enableMachineScheduler() && enablePostRAScheduler();
+}
+
+bool TargetSubtargetInfo::useAA() const {
+ return false;
+}
+
+void TargetSubtargetInfo::mirFileLoaded(MachineFunction &MF) const { }
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
new file mode 100644
index 000000000000..c3ea76bf8cea
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -0,0 +1,1967 @@
+//===- TwoAddressInstructionPass.cpp - Two-Address instruction pass -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TwoAddress instruction pass which is used
+// by most register allocators. Two-Address instructions are rewritten
+// from:
+//
+// A = B op C
+//
+// to:
+//
+// A = B
+// A op= C
+//
+// Note that if a register allocator chooses to use this pass, it has to be
+// capable of handling the non-SSA nature of these rewritten virtual
+// registers.
+//
+// It is also worth noting that the duplicate operand of the two-address
+// instruction is removed.
+//
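+// Schematically, on a target such as X86 this turns
+//
+//   %2 = ADD32rr %0, %1      (destination tied to the first source)
+//
+// into
+//
+//   %2 = COPY %0
+//   %2 = ADD32rr %2, %1
+//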
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "twoaddressinstruction"
+
+STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions");
+STATISTIC(NumCommuted , "Number of instructions commuted to coalesce");
+STATISTIC(NumAggrCommuted , "Number of instructions aggressively commuted");
+STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address");
+STATISTIC(NumReSchedUps, "Number of instructions re-scheduled up");
+STATISTIC(NumReSchedDowns, "Number of instructions re-scheduled down");
+
+// Temporary flag to disable rescheduling.
+static cl::opt<bool>
+EnableRescheduling("twoaddr-reschedule",
+ cl::desc("Coalesce copies by rescheduling (default=true)"),
+ cl::init(true), cl::Hidden);
+
+// Limit the number of dataflow edges to traverse when evaluating the benefit
+// of commuting operands.
+static cl::opt<unsigned> MaxDataFlowEdge(
+ "dataflow-edge-limit", cl::Hidden, cl::init(3),
+ cl::desc("Maximum number of dataflow edges to traverse when evaluating "
+ "the benefit of commuting operands"));
+
+namespace {
+
+class TwoAddressInstructionPass : public MachineFunctionPass {
+ MachineFunction *MF = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const InstrItineraryData *InstrItins = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ LiveVariables *LV = nullptr;
+ LiveIntervals *LIS = nullptr;
+ AliasAnalysis *AA = nullptr;
+ CodeGenOpt::Level OptLevel = CodeGenOpt::None;
+
+ // The current basic block being processed.
+ MachineBasicBlock *MBB = nullptr;
+
+  // Keep track of the distance of an MI from the start of the current basic block.
+ DenseMap<MachineInstr*, unsigned> DistanceMap;
+
+ // Set of already processed instructions in the current block.
+ SmallPtrSet<MachineInstr*, 8> Processed;
+
+  // A map from virtual registers to the physical registers they are likely to
+  // be coalesced with, due to copies from physical registers to virtual
+  // registers. e.g. v1024 = move r0.
+ DenseMap<Register, Register> SrcRegMap;
+
+  // A map from virtual registers to the physical registers they are likely to
+  // be coalesced with, due to copies to physical registers from virtual
+  // registers. e.g. r1 = move v1024.
+ DenseMap<Register, Register> DstRegMap;
+
+ void removeClobberedSrcRegMap(MachineInstr *MI);
+
+ bool isRevCopyChain(Register FromReg, Register ToReg, int Maxlen);
+
+ bool noUseAfterLastDef(Register Reg, unsigned Dist, unsigned &LastDef);
+
+ bool isProfitableToCommute(Register RegA, Register RegB, Register RegC,
+ MachineInstr *MI, unsigned Dist);
+
+ bool commuteInstruction(MachineInstr *MI, unsigned DstIdx,
+ unsigned RegBIdx, unsigned RegCIdx, unsigned Dist);
+
+ bool isProfitableToConv3Addr(Register RegA, Register RegB);
+
+ bool convertInstTo3Addr(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi, Register RegA,
+ Register RegB, unsigned &Dist);
+
+ bool isDefTooClose(Register Reg, unsigned Dist, MachineInstr *MI);
+
+ bool rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi, Register Reg);
+ bool rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi, Register Reg);
+
+ bool tryInstructionTransform(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned SrcIdx, unsigned DstIdx,
+ unsigned &Dist, bool shouldOnlyCommute);
+
+ bool tryInstructionCommute(MachineInstr *MI,
+ unsigned DstOpIdx,
+ unsigned BaseOpIdx,
+ bool BaseOpKilled,
+ unsigned Dist);
+ void scanUses(Register DstReg);
+
+ void processCopy(MachineInstr *MI);
+
+ using TiedPairList = SmallVector<std::pair<unsigned, unsigned>, 4>;
+ using TiedOperandMap = SmallDenseMap<unsigned, TiedPairList>;
+
+ bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&);
+ void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist);
+ void eliminateRegSequence(MachineBasicBlock::iterator&);
+ bool processStatepoint(MachineInstr *MI, TiedOperandMap &TiedOperands);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ TwoAddressInstructionPass() : MachineFunctionPass(ID) {
+ initializeTwoAddressInstructionPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
+ AU.addUsedIfAvailable<LiveVariables>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// Pass entry point.
+ bool runOnMachineFunction(MachineFunction&) override;
+};
+
+} // end anonymous namespace
+
+char TwoAddressInstructionPass::ID = 0;
+
+char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID;
+
+INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, DEBUG_TYPE,
+ "Two-Address instruction pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE,
+ "Two-Address instruction pass", false, false)
+
+/// Return the MachineInstr* if it is the single def of the Reg in the current BB.
+static MachineInstr *getSingleDef(Register Reg, MachineBasicBlock *BB,
+ const MachineRegisterInfo *MRI) {
+ MachineInstr *Ret = nullptr;
+ for (MachineInstr &DefMI : MRI->def_instructions(Reg)) {
+ if (DefMI.getParent() != BB || DefMI.isDebugValue())
+ continue;
+ if (!Ret)
+ Ret = &DefMI;
+ else if (Ret != &DefMI)
+ return nullptr;
+ }
+ return Ret;
+}
+
+/// Check if there is a reversed copy chain from FromReg to ToReg:
+/// %Tmp1 = copy %Tmp2;
+/// %FromReg = copy %Tmp1;
+/// %ToReg = add %FromReg ...
+/// %Tmp2 = copy %ToReg;
+/// Maxlen specifies the maximum length of the copy chain the function
+/// can walk through.
+bool TwoAddressInstructionPass::isRevCopyChain(Register FromReg, Register ToReg,
+ int Maxlen) {
+ Register TmpReg = FromReg;
+ for (int i = 0; i < Maxlen; i++) {
+ MachineInstr *Def = getSingleDef(TmpReg, MBB, MRI);
+ if (!Def || !Def->isCopy())
+ return false;
+
+ TmpReg = Def->getOperand(1).getReg();
+
+ if (TmpReg == ToReg)
+ return true;
+ }
+ return false;
+}
+
+/// Return true if there are no intervening uses between the last instruction
+/// in the MBB that defines the specified register and the two-address
+/// instruction which is being processed. It also returns the last def location
+/// by reference.
+bool TwoAddressInstructionPass::noUseAfterLastDef(Register Reg, unsigned Dist,
+ unsigned &LastDef) {
+ LastDef = 0;
+ unsigned LastUse = Dist;
+ for (MachineOperand &MO : MRI->reg_operands(Reg)) {
+ MachineInstr *MI = MO.getParent();
+ if (MI->getParent() != MBB || MI->isDebugValue())
+ continue;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+ if (DI == DistanceMap.end())
+ continue;
+ if (MO.isUse() && DI->second < LastUse)
+ LastUse = DI->second;
+ if (MO.isDef() && DI->second > LastDef)
+ LastDef = DI->second;
+ }
+
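+  // The only disqualifying case is a use strictly between the last def and
+  // this instruction (LastDef < LastUse < Dist).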
+ return !(LastUse > LastDef && LastUse < Dist);
+}
+
+/// Return true if the specified MI is a copy instruction or an extract_subreg
+/// instruction. It also returns the source and destination registers and
+/// whether they are physical registers by reference.
+static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
+ Register &SrcReg, Register &DstReg, bool &IsSrcPhys,
+ bool &IsDstPhys) {
+ SrcReg = 0;
+ DstReg = 0;
+ if (MI.isCopy()) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ } else if (MI.isInsertSubreg() || MI.isSubregToReg()) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(2).getReg();
+ } else {
+ return false;
+ }
+
+ IsSrcPhys = SrcReg.isPhysical();
+ IsDstPhys = DstReg.isPhysical();
+ return true;
+}
+
+/// Test if the given register value, which is used by the
+/// given instruction, is killed by the given instruction.
+static bool isPlainlyKilled(const MachineInstr *MI, Register Reg,
+ LiveIntervals *LIS) {
+ if (LIS && Reg.isVirtual() && !LIS->isNotInMIMap(*MI)) {
+ // FIXME: Sometimes tryInstructionTransform() will add instructions and
+ // test whether they can be folded before keeping them. In this case it
+ // sets a kill before recursively calling tryInstructionTransform() again.
+ // If there is no interval available, we assume that this instruction is
+ // one of those. A kill flag is manually inserted on the operand so the
+ // check below will handle it.
+ LiveInterval &LI = LIS->getInterval(Reg);
+ // This is to match the kill flag version where undefs don't have kill
+ // flags.
+ if (!LI.hasAtLeastOneValue())
+ return false;
+
+ SlotIndex useIdx = LIS->getInstructionIndex(*MI);
+ LiveInterval::const_iterator I = LI.find(useIdx);
+ assert(I != LI.end() && "Reg must be live-in to use.");
+ return !I->end.isBlock() && SlotIndex::isSameInstr(I->end, useIdx);
+ }
+
+ return MI->killsRegister(Reg);
+}
+
+/// Test if the register used by the given operand is killed by the operand's
+/// instruction.
+static bool isPlainlyKilled(const MachineOperand &MO, LiveIntervals *LIS) {
+ return MO.isKill() || isPlainlyKilled(MO.getParent(), MO.getReg(), LIS);
+}
+
+/// Test if the given register value, which is used by the given
+/// instruction, is killed by the given instruction. This looks through
+/// coalescable copies to see if the original value is potentially not killed.
+///
+/// For example, in this code:
+///
+/// %reg1034 = copy %reg1024
+/// %reg1035 = copy killed %reg1025
+/// %reg1036 = add killed %reg1034, killed %reg1035
+///
+/// %reg1034 is not considered to be killed, since it is copied from a
+/// register which is not killed. Treating it as not killed lets the
+/// normal heuristics commute the (two-address) add, which lets
+/// coalescing eliminate the extra copy.
+///
+/// If allowFalsePositives is true then likely kills are treated as kills even
+/// if it can't be proven that they are kills.
+static bool isKilled(MachineInstr &MI, Register Reg,
+ const MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+ LiveIntervals *LIS, bool allowFalsePositives) {
+ MachineInstr *DefMI = &MI;
+ while (true) {
+ // All uses of physical registers are likely to be kills.
+ if (Reg.isPhysical() && (allowFalsePositives || MRI->hasOneUse(Reg)))
+ return true;
+ if (!isPlainlyKilled(DefMI, Reg, LIS))
+ return false;
+ if (Reg.isPhysical())
+ return true;
+ MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
+ // If there are multiple defs, we can't do a simple analysis, so just
+ // go with what the kill flag says.
+ if (std::next(Begin) != MRI->def_end())
+ return true;
+ DefMI = Begin->getParent();
+ bool IsSrcPhys, IsDstPhys;
+ Register SrcReg, DstReg;
+ // If the def is something other than a copy, then it isn't going to
+ // be coalesced, so follow the kill flag.
+ if (!isCopyToReg(*DefMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+ return true;
+ Reg = SrcReg;
+ }
+}
+
+/// Return true if the specified MI uses the specified register as a two-address
+/// use. If so, return the destination register by reference.
+static bool isTwoAddrUse(MachineInstr &MI, Register Reg, Register &DstReg) {
+ for (unsigned i = 0, NumOps = MI.getNumOperands(); i != NumOps; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
+ continue;
+ unsigned ti;
+ if (MI.isRegTiedToDefOperand(i, &ti)) {
+ DstReg = MI.getOperand(ti).getReg();
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Given a register, if all its uses are in the same basic block, return the
+/// last use instruction if it's a copy or a two-address use.
+static MachineInstr *
+findOnlyInterestingUse(Register Reg, MachineBasicBlock *MBB,
+ MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+ bool &IsCopy, Register &DstReg, bool &IsDstPhys,
+ LiveIntervals *LIS) {
+ MachineOperand *UseOp = nullptr;
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ MachineInstr *MI = MO.getParent();
+ if (MI->getParent() != MBB)
+ return nullptr;
+ if (isPlainlyKilled(MI, Reg, LIS))
+ UseOp = &MO;
+ }
+ if (!UseOp)
+ return nullptr;
+ MachineInstr &UseMI = *UseOp->getParent();
+
+ Register SrcReg;
+ bool IsSrcPhys;
+ if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
+ IsCopy = true;
+ return &UseMI;
+ }
+ IsDstPhys = false;
+ if (isTwoAddrUse(UseMI, Reg, DstReg)) {
+ IsDstPhys = DstReg.isPhysical();
+ return &UseMI;
+ }
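+  // Even when Reg is not the tied use itself, commuting UseMI may turn it
+  // into one; if so, report the two-address destination of the commuted form.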
+ if (UseMI.isCommutable()) {
+ unsigned Src1 = TargetInstrInfo::CommuteAnyOperandIndex;
+ unsigned Src2 = UseOp->getOperandNo();
+ if (TII->findCommutedOpIndices(UseMI, Src1, Src2)) {
+ MachineOperand &MO = UseMI.getOperand(Src1);
+ if (MO.isReg() && MO.isUse() &&
+ isTwoAddrUse(UseMI, MO.getReg(), DstReg)) {
+ IsDstPhys = DstReg.isPhysical();
+ return &UseMI;
+ }
+ }
+ }
+ return nullptr;
+}
+
+/// Return the physical register the specified virtual register might be mapped
+/// to.
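+/// Follows chains of virtual-to-virtual entries in RegMap until either a
+/// physical register is reached or the chain ends, in which case 0 is
+/// returned.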
+static MCRegister getMappedReg(Register Reg,
+ DenseMap<Register, Register> &RegMap) {
+ while (Reg.isVirtual()) {
+ DenseMap<Register, Register>::iterator SI = RegMap.find(Reg);
+ if (SI == RegMap.end())
+ return 0;
+ Reg = SI->second;
+ }
+ if (Reg.isPhysical())
+ return Reg;
+ return 0;
+}
+
+/// Return true if the two registers are equal or aliased.
+static bool regsAreCompatible(Register RegA, Register RegB,
+ const TargetRegisterInfo *TRI) {
+ if (RegA == RegB)
+ return true;
+ if (!RegA || !RegB)
+ return false;
+ return TRI->regsOverlap(RegA, RegB);
+}
+
+/// From RegMap remove entries mapped to a physical register which overlaps MO.
+static void removeMapRegEntry(const MachineOperand &MO,
+ DenseMap<Register, Register> &RegMap,
+ const TargetRegisterInfo *TRI) {
+ assert(
+ (MO.isReg() || MO.isRegMask()) &&
+ "removeMapRegEntry must be called with a register or regmask operand.");
+
+ SmallVector<Register, 2> Srcs;
+ for (auto SI : RegMap) {
+ Register ToReg = SI.second;
+ if (ToReg.isVirtual())
+ continue;
+
+ if (MO.isReg()) {
+ Register Reg = MO.getReg();
+ if (TRI->regsOverlap(ToReg, Reg))
+ Srcs.push_back(SI.first);
+ } else if (MO.clobbersPhysReg(ToReg))
+ Srcs.push_back(SI.first);
+ }
+
+ for (auto SrcReg : Srcs)
+ RegMap.erase(SrcReg);
+}
+
+/// If a physical register is clobbered, old entries mapped to it should be
+/// deleted. For example
+///
+/// %2:gr64 = COPY killed $rdx
+/// MUL64r %3:gr64, implicit-def $rax, implicit-def $rdx
+///
+/// After the MUL instruction, $rdx contains different value than in the COPY
+/// instruction. So %2 should not map to $rdx after MUL.
+void TwoAddressInstructionPass::removeClobberedSrcRegMap(MachineInstr *MI) {
+ if (MI->isCopy()) {
+ // If a virtual register is copied to its mapped physical register, it
+ // doesn't change the potential coalescing between them, so we don't remove
+ // entries mapped to the physical register. For example
+ //
+ // %100 = COPY $r8
+ // ...
+ // $r8 = COPY %100
+ //
+ // The first copy constructs SrcRegMap[%100] = $r8, the second copy doesn't
+ // destroy the content of $r8, and should not impact SrcRegMap.
+ Register Dst = MI->getOperand(0).getReg();
+ if (!Dst || Dst.isVirtual())
+ return;
+
+ Register Src = MI->getOperand(1).getReg();
+ if (regsAreCompatible(Dst, getMappedReg(Src, SrcRegMap), TRI))
+ return;
+ }
+
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isRegMask()) {
+ removeMapRegEntry(MO, SrcRegMap, TRI);
+ continue;
+ }
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (!Reg || Reg.isVirtual())
+ continue;
+ removeMapRegEntry(MO, SrcRegMap, TRI);
+ }
+}
+
+// Returns true if Reg is equal or aliased to at least one register in Set.
+static bool regOverlapsSet(const SmallVectorImpl<Register> &Set, Register Reg,
+ const TargetRegisterInfo *TRI) {
+ for (unsigned R : Set)
+ if (TRI->regsOverlap(R, Reg))
+ return true;
+
+ return false;
+}
+
+/// Return true if it's potentially profitable to commute the two-address
+/// instruction that's being processed.
+bool TwoAddressInstructionPass::isProfitableToCommute(Register RegA,
+ Register RegB,
+ Register RegC,
+ MachineInstr *MI,
+ unsigned Dist) {
+ if (OptLevel == CodeGenOpt::None)
+ return false;
+
+ // Determine if it's profitable to commute this two address instruction. In
+ // general, we want no uses between this instruction and the definition of
+ // the two-address register.
+ // e.g.
+ // %reg1028 = EXTRACT_SUBREG killed %reg1027, 1
+ // %reg1029 = COPY %reg1028
+ // %reg1029 = SHR8ri %reg1029, 7, implicit dead %eflags
+ // insert => %reg1030 = COPY %reg1028
+ // %reg1030 = ADD8rr killed %reg1028, killed %reg1029, implicit dead %eflags
+ // In this case, it might not be possible to coalesce the second COPY
+ // instruction if the first one is coalesced. So it would be profitable to
+ // commute it:
+ // %reg1028 = EXTRACT_SUBREG killed %reg1027, 1
+ // %reg1029 = COPY %reg1028
+ // %reg1029 = SHR8ri %reg1029, 7, implicit dead %eflags
+ // insert => %reg1030 = COPY %reg1029
+ // %reg1030 = ADD8rr killed %reg1029, killed %reg1028, implicit dead %eflags
+
+ if (!isPlainlyKilled(MI, RegC, LIS))
+ return false;
+
+ // Ok, we have something like:
+ // %reg1030 = ADD8rr killed %reg1028, killed %reg1029, implicit dead %eflags
+ // let's see if it's worth commuting it.
+
+ // Look for situations like this:
+ // %reg1024 = MOV r1
+ // %reg1025 = MOV r0
+ // %reg1026 = ADD %reg1024, %reg1025
+ // r0 = MOV %reg1026
+ // Commute the ADD to hopefully eliminate an otherwise unavoidable copy.
+ MCRegister ToRegA = getMappedReg(RegA, DstRegMap);
+ if (ToRegA) {
+ MCRegister FromRegB = getMappedReg(RegB, SrcRegMap);
+ MCRegister FromRegC = getMappedReg(RegC, SrcRegMap);
+ bool CompB = FromRegB && regsAreCompatible(FromRegB, ToRegA, TRI);
+ bool CompC = FromRegC && regsAreCompatible(FromRegC, ToRegA, TRI);
+
+ // Compute if any of the following are true:
+ // -RegB is not tied to a register and RegC is compatible with RegA.
+ // -RegB is tied to the wrong physical register, but RegC is.
+ // -RegB is tied to the wrong physical register, and RegC isn't tied.
+ if ((!FromRegB && CompC) || (FromRegB && !CompB && (!FromRegC || CompC)))
+ return true;
+ // Don't compute if any of the following are true:
+ // -RegC is not tied to a register and RegB is compatible with RegA.
+ // -RegC is tied to the wrong physical register, but RegB is.
+ // -RegC is tied to the wrong physical register, and RegB isn't tied.
+ if ((!FromRegC && CompB) || (FromRegC && !CompC && (!FromRegB || CompB)))
+ return false;
+ }
+
+ // If there is a use of RegC between its last def (could be livein) and this
+ // instruction, then bail.
+ unsigned LastDefC = 0;
+ if (!noUseAfterLastDef(RegC, Dist, LastDefC))
+ return false;
+
+ // If there is a use of RegB between its last def (could be livein) and this
+ // instruction, then go ahead and make this transformation.
+ unsigned LastDefB = 0;
+ if (!noUseAfterLastDef(RegB, Dist, LastDefB))
+ return true;
+
+ // Look for situation like this:
+ // %reg101 = MOV %reg100
+ // %reg102 = ...
+ // %reg103 = ADD %reg102, %reg101
+ // ... = %reg103 ...
+ // %reg100 = MOV %reg103
+ // If there is a reversed copy chain from reg101 to reg103, commute the ADD
+ // to eliminate an otherwise unavoidable copy.
+  // FIXME:
+  // We can extend the logic further: if a pair of operands in an instruction
+  // has been merged, the instruction could be regarded as a virtual copy, and
+  // that virtual copy could also be used to construct a copy chain.
+  // To minimize register copies more generally, the logic of the two-address
+  // instruction pass should ideally be integrated with the register
+  // allocation pass, where the interference graph is available.
+ if (isRevCopyChain(RegC, RegA, MaxDataFlowEdge))
+ return true;
+
+ if (isRevCopyChain(RegB, RegA, MaxDataFlowEdge))
+ return false;
+
+ // Look for other target specific commute preference.
+ bool Commute;
+ if (TII->hasCommutePreference(*MI, Commute))
+ return Commute;
+
+ // Since there are no intervening uses for both registers, then commute
+ // if the def of RegC is closer. Its live interval is shorter.
+ return LastDefB && LastDefC && LastDefC > LastDefB;
+}
+
+/// Commute a two-address instruction and update the basic block, distance map,
+/// and live variables if needed. Return true if it is successful.
+bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI,
+ unsigned DstIdx,
+ unsigned RegBIdx,
+ unsigned RegCIdx,
+ unsigned Dist) {
+ Register RegC = MI->getOperand(RegCIdx).getReg();
+ LLVM_DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);
+ MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx);
+
+ if (NewMI == nullptr) {
+ LLVM_DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI);
+ assert(NewMI == MI &&
+ "TargetInstrInfo::commuteInstruction() should not return a new "
+ "instruction unless it was requested.");
+
+ // Update source register map.
+ MCRegister FromRegC = getMappedReg(RegC, SrcRegMap);
+ if (FromRegC) {
+ Register RegA = MI->getOperand(DstIdx).getReg();
+ SrcRegMap[RegA] = FromRegC;
+ }
+
+ return true;
+}
+
+/// Return true if it is profitable to convert the given 2-address instruction
+/// to a 3-address one.
+bool TwoAddressInstructionPass::isProfitableToConv3Addr(Register RegA,
+ Register RegB) {
+ // Look for situations like this:
+ // %reg1024 = MOV r1
+ // %reg1025 = MOV r0
+ // %reg1026 = ADD %reg1024, %reg1025
+ // r2 = MOV %reg1026
+ // Turn ADD into a 3-address instruction to avoid a copy.
+ MCRegister FromRegB = getMappedReg(RegB, SrcRegMap);
+ if (!FromRegB)
+ return false;
+ MCRegister ToRegA = getMappedReg(RegA, DstRegMap);
+ return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI));
+}
+
+/// Convert the specified two-address instruction into a three address one.
+/// Return true if this transformation was successful.
+bool TwoAddressInstructionPass::convertInstTo3Addr(
+ MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi,
+ Register RegA, Register RegB, unsigned &Dist) {
+ MachineInstrSpan MIS(mi, MBB);
+ MachineInstr *NewMI = TII->convertToThreeAddress(*mi, LV, LIS);
+ if (!NewMI)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+ LLVM_DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+
+ // If the old instruction is debug value tracked, an update is required.
+ if (auto OldInstrNum = mi->peekDebugInstrNum()) {
+ assert(mi->getNumExplicitDefs() == 1);
+ assert(NewMI->getNumExplicitDefs() == 1);
+
+ // Find the old and new def location.
+ unsigned OldIdx = mi->defs().begin()->getOperandNo();
+ unsigned NewIdx = NewMI->defs().begin()->getOperandNo();
+
+ // Record that one def has been replaced by the other.
+ unsigned NewInstrNum = NewMI->getDebugInstrNum();
+ MF->makeDebugValueSubstitution(std::make_pair(OldInstrNum, OldIdx),
+ std::make_pair(NewInstrNum, NewIdx));
+ }
+
+ MBB->erase(mi); // Nuke the old inst.
+
+ for (MachineInstr &MI : MIS)
+ DistanceMap.insert(std::make_pair(&MI, Dist++));
+ Dist--;
+ mi = NewMI;
+ nmi = std::next(mi);
+
+ // Update source and destination register maps.
+ SrcRegMap.erase(RegA);
+ DstRegMap.erase(RegB);
+ return true;
+}
+
+/// Scan forward recursively through sole uses; update the maps if a use is a
+/// copy or a two-address instruction.
+void TwoAddressInstructionPass::scanUses(Register DstReg) {
+ SmallVector<Register, 4> VirtRegPairs;
+ bool IsDstPhys;
+ bool IsCopy = false;
+ Register NewReg;
+ Register Reg = DstReg;
+  while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII, IsCopy,
+ NewReg, IsDstPhys, LIS)) {
+ if (IsCopy && !Processed.insert(UseMI).second)
+ break;
+
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
+ if (DI != DistanceMap.end())
+      // Earlier in the same MBB. Reached via a back edge.
+ break;
+
+ if (IsDstPhys) {
+ VirtRegPairs.push_back(NewReg);
+ break;
+ }
+ SrcRegMap[NewReg] = Reg;
+ VirtRegPairs.push_back(NewReg);
+ Reg = NewReg;
+ }
+
+ if (!VirtRegPairs.empty()) {
+ unsigned ToReg = VirtRegPairs.back();
+ VirtRegPairs.pop_back();
+ while (!VirtRegPairs.empty()) {
+ unsigned FromReg = VirtRegPairs.pop_back_val();
+ bool isNew = DstRegMap.insert(std::make_pair(FromReg, ToReg)).second;
+ if (!isNew)
+        assert(DstRegMap[FromReg] == ToReg && "Can't map to two dst registers!");
+ ToReg = FromReg;
+ }
+ bool isNew = DstRegMap.insert(std::make_pair(DstReg, ToReg)).second;
+ if (!isNew)
+ assert(DstRegMap[DstReg] == ToReg && "Can't map to two dst registers!");
+ }
+}
+
+/// If the specified instruction is not yet processed, process it if it's a
+/// copy. For a copy instruction, we find the physical registers the
+/// source and destination registers might be mapped to. These are kept in
+/// point-to maps used to determine future optimizations. e.g.
+/// v1024 = mov r0
+/// v1025 = mov r1
+/// v1026 = add v1024, v1025
+/// r1 = mov v1026
+/// If 'add' is a two-address instruction, v1024, v1026 are both potentially
+/// coalesced to r0 (from the input side). v1025 is mapped to r1. v1026 is
+/// potentially joined with r1 on the output side. It's worthwhile to commute
+/// 'add' to eliminate a copy.
+void TwoAddressInstructionPass::processCopy(MachineInstr *MI) {
+ if (Processed.count(MI))
+ return;
+
+ bool IsSrcPhys, IsDstPhys;
+ Register SrcReg, DstReg;
+ if (!isCopyToReg(*MI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+ return;
+
+ if (IsDstPhys && !IsSrcPhys) {
+ DstRegMap.insert(std::make_pair(SrcReg, DstReg));
+ } else if (!IsDstPhys && IsSrcPhys) {
+ bool isNew = SrcRegMap.insert(std::make_pair(DstReg, SrcReg)).second;
+ if (!isNew)
+ assert(SrcRegMap[DstReg] == SrcReg &&
+ "Can't map to two src physical registers!");
+
+ scanUses(DstReg);
+ }
+
+ Processed.insert(MI);
+}
+
+/// If there is one more local instruction that reads 'Reg' and it kills 'Reg',
+/// consider moving the instruction below the kill instruction in order to
+/// eliminate the need for the copy.
+bool TwoAddressInstructionPass::rescheduleMIBelowKill(
+ MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi,
+ Register Reg) {
+ // Bail immediately if we don't have LV or LIS available. We use them to find
+ // kills efficiently.
+ if (!LV && !LIS)
+ return false;
+
+ MachineInstr *MI = &*mi;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+ if (DI == DistanceMap.end())
+ // Must be created from unfolded load. Don't waste time trying this.
+ return false;
+
+ MachineInstr *KillMI = nullptr;
+ if (LIS) {
+ LiveInterval &LI = LIS->getInterval(Reg);
+ assert(LI.end() != LI.begin() &&
+ "Reg should not have empty live interval.");
+
+ SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
+ LiveInterval::const_iterator I = LI.find(MBBEndIdx);
+ if (I != LI.end() && I->start < MBBEndIdx)
+ return false;
+
+ --I;
+ KillMI = LIS->getInstructionFromIndex(I->end);
+ } else {
+ KillMI = LV->getVarInfo(Reg).findKill(MBB);
+ }
+ if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike())
+ // Don't mess with copies, they may be coalesced later.
+ return false;
+
+ if (KillMI->hasUnmodeledSideEffects() || KillMI->isCall() ||
+ KillMI->isBranch() || KillMI->isTerminator())
+    // Don't move past calls, etc.
+ return false;
+
+ Register DstReg;
+ if (isTwoAddrUse(*KillMI, Reg, DstReg))
+ return false;
+
+ bool SeenStore = true;
+ if (!MI->isSafeToMove(AA, SeenStore))
+ return false;
+
+ if (TII->getInstrLatency(InstrItins, *MI) > 1)
+ // FIXME: Needs more sophisticated heuristics.
+ return false;
+
+ SmallVector<Register, 2> Uses;
+ SmallVector<Register, 2> Kills;
+ SmallVector<Register, 2> Defs;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MO.isDef())
+ Defs.push_back(MOReg);
+ else {
+ Uses.push_back(MOReg);
+ if (MOReg != Reg && isPlainlyKilled(MO, LIS))
+ Kills.push_back(MOReg);
+ }
+ }
+
+ // Move the copies connected to MI down as well.
+ MachineBasicBlock::iterator Begin = MI;
+ MachineBasicBlock::iterator AfterMI = std::next(Begin);
+ MachineBasicBlock::iterator End = AfterMI;
+ while (End != MBB->end()) {
+ End = skipDebugInstructionsForward(End, MBB->end());
+ if (End->isCopy() && regOverlapsSet(Defs, End->getOperand(1).getReg(), TRI))
+ Defs.push_back(End->getOperand(0).getReg());
+ else
+ break;
+ ++End;
+ }
+
+ // Check if the reschedule will not break dependencies.
+ unsigned NumVisited = 0;
+ MachineBasicBlock::iterator KillPos = KillMI;
+ ++KillPos;
+ for (MachineInstr &OtherMI : make_range(End, KillPos)) {
+ // Debug or pseudo instructions cannot be counted against the limit.
+ if (OtherMI.isDebugOrPseudoInstr())
+ continue;
+ if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
+ return false;
+ ++NumVisited;
+ if (OtherMI.hasUnmodeledSideEffects() || OtherMI.isCall() ||
+ OtherMI.isBranch() || OtherMI.isTerminator())
+      // Don't move past calls, etc.
+ return false;
+ for (const MachineOperand &MO : OtherMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MO.isDef()) {
+ if (regOverlapsSet(Uses, MOReg, TRI))
+ // Physical register use would be clobbered.
+ return false;
+ if (!MO.isDead() && regOverlapsSet(Defs, MOReg, TRI))
+ // May clobber a physical register def.
+ // FIXME: This may be too conservative. It's ok if the instruction
+        // is sunk completely below the use.
+ return false;
+ } else {
+ if (regOverlapsSet(Defs, MOReg, TRI))
+ return false;
+ bool isKill = isPlainlyKilled(MO, LIS);
+ if (MOReg != Reg && ((isKill && regOverlapsSet(Uses, MOReg, TRI)) ||
+ regOverlapsSet(Kills, MOReg, TRI)))
+ // Don't want to extend other live ranges and update kills.
+ return false;
+ if (MOReg == Reg && !isKill)
+ // We can't schedule across a use of the register in question.
+ return false;
+        // Ensure that if this is the register in question, it's the kill we expect.
+ assert((MOReg != Reg || &OtherMI == KillMI) &&
+ "Found multiple kills of a register in a basic block");
+ }
+ }
+ }
+
+ // Move debug info as well.
+ while (Begin != MBB->begin() && std::prev(Begin)->isDebugInstr())
+ --Begin;
+
+ nmi = End;
+ MachineBasicBlock::iterator InsertPos = KillPos;
+ if (LIS) {
+ // We have to move the copies (and any interleaved debug instructions)
+ // first so that the MBB is still well-formed when calling handleMove().
+ for (MachineBasicBlock::iterator MBBI = AfterMI; MBBI != End;) {
+ auto CopyMI = MBBI++;
+ MBB->splice(InsertPos, MBB, CopyMI);
+ if (!CopyMI->isDebugOrPseudoInstr())
+ LIS->handleMove(*CopyMI);
+ InsertPos = CopyMI;
+ }
+ End = std::next(MachineBasicBlock::iterator(MI));
+ }
+
+ // Copies following MI may have been moved as well.
+ MBB->splice(InsertPos, MBB, Begin, End);
+ DistanceMap.erase(DI);
+
+ // Update live variables
+ if (LIS) {
+ LIS->handleMove(*MI);
+ } else {
+ LV->removeVirtualRegisterKilled(Reg, *KillMI);
+ LV->addVirtualRegisterKilled(Reg, *MI);
+ }
+
+ LLVM_DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI);
+ return true;
+}
+
+/// Return true if the re-scheduling will put the given instruction too close
+/// to the defs of its register dependencies.
+bool TwoAddressInstructionPass::isDefTooClose(Register Reg, unsigned Dist,
+ MachineInstr *MI) {
+ for (MachineInstr &DefMI : MRI->def_instructions(Reg)) {
+ if (DefMI.getParent() != MBB || DefMI.isCopy() || DefMI.isCopyLike())
+ continue;
+ if (&DefMI == MI)
+ return true; // MI is defining something KillMI uses
+ DenseMap<MachineInstr*, unsigned>::iterator DDI = DistanceMap.find(&DefMI);
+ if (DDI == DistanceMap.end())
+ return true; // Below MI
+ unsigned DefDist = DDI->second;
+ assert(Dist > DefDist && "Visited def already?");
+ if (TII->getInstrLatency(InstrItins, DefMI) > (Dist - DefDist))
+ return true;
+ }
+ return false;
+}
+
+/// If there is one more local instruction that reads 'Reg' and it kills 'Reg',
+/// consider moving the kill instruction above the current two-address
+/// instruction in order to eliminate the need for the copy.
+bool TwoAddressInstructionPass::rescheduleKillAboveMI(
+ MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi,
+ Register Reg) {
+ // Bail immediately if we don't have LV or LIS available. We use them to find
+ // kills efficiently.
+ if (!LV && !LIS)
+ return false;
+
+ MachineInstr *MI = &*mi;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+ if (DI == DistanceMap.end())
+ // Must be created from unfolded load. Don't waste time trying this.
+ return false;
+
+ MachineInstr *KillMI = nullptr;
+ if (LIS) {
+ LiveInterval &LI = LIS->getInterval(Reg);
+ assert(LI.end() != LI.begin() &&
+ "Reg should not have empty live interval.");
+
+ SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
+ LiveInterval::const_iterator I = LI.find(MBBEndIdx);
+ if (I != LI.end() && I->start < MBBEndIdx)
+ return false;
+
+ --I;
+ KillMI = LIS->getInstructionFromIndex(I->end);
+ } else {
+ KillMI = LV->getVarInfo(Reg).findKill(MBB);
+ }
+ if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike())
+ // Don't mess with copies, they may be coalesced later.
+ return false;
+
+ Register DstReg;
+ if (isTwoAddrUse(*KillMI, Reg, DstReg))
+ return false;
+
+ bool SeenStore = true;
+ if (!KillMI->isSafeToMove(AA, SeenStore))
+ return false;
+
+ SmallVector<Register, 2> Uses;
+ SmallVector<Register, 2> Kills;
+ SmallVector<Register, 2> Defs;
+ SmallVector<Register, 2> LiveDefs;
+ for (const MachineOperand &MO : KillMI->operands()) {
+ if (!MO.isReg())
+ continue;
+ Register MOReg = MO.getReg();
+ if (MO.isUse()) {
+ if (!MOReg)
+ continue;
+ if (isDefTooClose(MOReg, DI->second, MI))
+ return false;
+ bool isKill = isPlainlyKilled(MO, LIS);
+ if (MOReg == Reg && !isKill)
+ return false;
+ Uses.push_back(MOReg);
+ if (isKill && MOReg != Reg)
+ Kills.push_back(MOReg);
+ } else if (MOReg.isPhysical()) {
+ Defs.push_back(MOReg);
+ if (!MO.isDead())
+ LiveDefs.push_back(MOReg);
+ }
+ }
+
+  // Check if the reschedule will not break dependencies.
+ unsigned NumVisited = 0;
+ for (MachineInstr &OtherMI :
+ make_range(mi, MachineBasicBlock::iterator(KillMI))) {
+ // Debug or pseudo instructions cannot be counted against the limit.
+ if (OtherMI.isDebugOrPseudoInstr())
+ continue;
+ if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
+ return false;
+ ++NumVisited;
+ if (OtherMI.hasUnmodeledSideEffects() || OtherMI.isCall() ||
+ OtherMI.isBranch() || OtherMI.isTerminator())
+      // Don't move past calls, etc.
+ return false;
+ SmallVector<Register, 2> OtherDefs;
+ for (const MachineOperand &MO : OtherMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ Register MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MO.isUse()) {
+ if (regOverlapsSet(Defs, MOReg, TRI))
+ // Moving KillMI can clobber the physical register if the def has
+ // not been seen.
+ return false;
+ if (regOverlapsSet(Kills, MOReg, TRI))
+ // Don't want to extend other live ranges and update kills.
+ return false;
+ if (&OtherMI != MI && MOReg == Reg && !isPlainlyKilled(MO, LIS))
+ // We can't schedule across a use of the register in question.
+ return false;
+ } else {
+ OtherDefs.push_back(MOReg);
+ }
+ }
+
+ for (unsigned i = 0, e = OtherDefs.size(); i != e; ++i) {
+ Register MOReg = OtherDefs[i];
+ if (regOverlapsSet(Uses, MOReg, TRI))
+ return false;
+ if (MOReg.isPhysical() && regOverlapsSet(LiveDefs, MOReg, TRI))
+ return false;
+ // Physical register def is seen.
+ llvm::erase_value(Defs, MOReg);
+ }
+ }
+
+ // Move the old kill above MI, don't forget to move debug info as well.
+ MachineBasicBlock::iterator InsertPos = mi;
+ while (InsertPos != MBB->begin() && std::prev(InsertPos)->isDebugInstr())
+ --InsertPos;
+ MachineBasicBlock::iterator From = KillMI;
+ MachineBasicBlock::iterator To = std::next(From);
+ while (std::prev(From)->isDebugInstr())
+ --From;
+ MBB->splice(InsertPos, MBB, From, To);
+
+ nmi = std::prev(InsertPos); // Backtrack so we process the moved instr.
+ DistanceMap.erase(DI);
+
+ // Update live variables
+ if (LIS) {
+ LIS->handleMove(*KillMI);
+ } else {
+ LV->removeVirtualRegisterKilled(Reg, *KillMI);
+ LV->addVirtualRegisterKilled(Reg, *MI);
+ }
+
+ LLVM_DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);
+ return true;
+}
+
+/// Tries to commute the operand 'BaseOpIdx' and some other operand in the
+/// given machine instruction to improve opportunities for coalescing and
+/// elimination of a register to register copy.
+///
+/// 'DstOpIdx' specifies the index of MI def operand.
+/// 'BaseOpKilled' specifies if the register associated with 'BaseOpIdx'
+/// operand is killed by the given instruction.
+/// The 'Dist' argument provides the distance of MI from the start of the
+/// current basic block and it is used to determine if it is profitable
+/// to commute operands in the instruction.
+///
+/// Returns true if the transformation happened. Otherwise, returns false.
+bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
+ unsigned DstOpIdx,
+ unsigned BaseOpIdx,
+ bool BaseOpKilled,
+ unsigned Dist) {
+ if (!MI->isCommutable())
+ return false;
+
+ bool MadeChange = false;
+ Register DstOpReg = MI->getOperand(DstOpIdx).getReg();
+ Register BaseOpReg = MI->getOperand(BaseOpIdx).getReg();
+ unsigned OpsNum = MI->getDesc().getNumOperands();
+ unsigned OtherOpIdx = MI->getDesc().getNumDefs();
+ for (; OtherOpIdx < OpsNum; OtherOpIdx++) {
+ // The call of findCommutedOpIndices below only checks if BaseOpIdx
+ // and OtherOpIdx are commutable, it does not really search for
+ // other commutable operands and does not change the values of passed
+ // variables.
+ if (OtherOpIdx == BaseOpIdx || !MI->getOperand(OtherOpIdx).isReg() ||
+ !TII->findCommutedOpIndices(*MI, BaseOpIdx, OtherOpIdx))
+ continue;
+
+ Register OtherOpReg = MI->getOperand(OtherOpIdx).getReg();
+ bool AggressiveCommute = false;
+
+ // If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp
+ // operands. This makes the live ranges of DstOp and OtherOp joinable.
+ bool OtherOpKilled = isKilled(*MI, OtherOpReg, MRI, TII, LIS, false);
+ bool DoCommute = !BaseOpKilled && OtherOpKilled;
+
+ if (!DoCommute &&
+ isProfitableToCommute(DstOpReg, BaseOpReg, OtherOpReg, MI, Dist)) {
+ DoCommute = true;
+ AggressiveCommute = true;
+ }
+
+ // If it's profitable to commute, try to do so.
+ if (DoCommute && commuteInstruction(MI, DstOpIdx, BaseOpIdx, OtherOpIdx,
+ Dist)) {
+ MadeChange = true;
+ ++NumCommuted;
+ if (AggressiveCommute)
+ ++NumAggrCommuted;
+
+ // There might be more than two commutable operands, update BaseOp and
+ // continue scanning.
+ // FIXME: This assumes that the new instruction's operands are in the
+ // same positions and were simply swapped.
+ BaseOpReg = OtherOpReg;
+ BaseOpKilled = OtherOpKilled;
+ // Resamples OpsNum in case the number of operands was reduced. This
+ // happens with X86.
+ OpsNum = MI->getDesc().getNumOperands();
+ }
+ }
+ return MadeChange;
+}
+
+/// For the case where an instruction has a single pair of tied register
+/// operands, attempt some transformations that may either eliminate the tied
+/// operands or improve the opportunities for coalescing away the register copy.
+/// Returns true if no copy needs to be inserted to untie mi's operands
+/// (either because they were untied, or because mi was rescheduled, and will
+/// be visited again later). If the shouldOnlyCommute flag is true, only
+/// instruction commutation is attempted.
+bool TwoAddressInstructionPass::
+tryInstructionTransform(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned SrcIdx, unsigned DstIdx,
+ unsigned &Dist, bool shouldOnlyCommute) {
+ if (OptLevel == CodeGenOpt::None)
+ return false;
+
+ MachineInstr &MI = *mi;
+ Register regA = MI.getOperand(DstIdx).getReg();
+ Register regB = MI.getOperand(SrcIdx).getReg();
+
+ assert(regB.isVirtual() && "cannot make instruction into two-address form");
+ bool regBKilled = isKilled(MI, regB, MRI, TII, LIS, true);
+
+ if (regA.isVirtual())
+ scanUses(regA);
+
+ bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+
+  // If the instruction is convertible to 3-address form, don't return yet;
+  // instead, try the 3-address transformation aggressively and check this
+  // variable later, because the result might be better.
+ // For example, we can just use `leal (%rsi,%rdi), %eax` and `ret`
+ // instead of the following code.
+ // addl %esi, %edi
+ // movl %edi, %eax
+ // ret
+ if (Commuted && !MI.isConvertibleTo3Addr())
+ return false;
+
+ if (shouldOnlyCommute)
+ return false;
+
+ // If there is one more use of regB later in the same MBB, consider
+  // re-scheduling this MI below it.
+ if (!Commuted && EnableRescheduling && rescheduleMIBelowKill(mi, nmi, regB)) {
+ ++NumReSchedDowns;
+ return true;
+ }
+
+ // If we commuted, regB may have changed so we should re-sample it to avoid
+ // confusing the three address conversion below.
+ if (Commuted) {
+ regB = MI.getOperand(SrcIdx).getReg();
+ regBKilled = isKilled(MI, regB, MRI, TII, LIS, true);
+ }
+
+ if (MI.isConvertibleTo3Addr()) {
+ // This instruction is potentially convertible to a true
+ // three-address instruction. Check if it is profitable.
+ if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
+ // Try to convert it.
+ if (convertInstTo3Addr(mi, nmi, regA, regB, Dist)) {
+ ++NumConvertedTo3Addr;
+ return true; // Done with this instruction.
+ }
+ }
+ }
+
+  // Return if it was commuted but the 3-address conversion failed.
+ if (Commuted)
+ return false;
+
+ // If there is one more use of regB later in the same MBB, consider
+  // re-scheduling it before this MI if it's legal.
+ if (EnableRescheduling && rescheduleKillAboveMI(mi, nmi, regB)) {
+ ++NumReSchedUps;
+ return true;
+ }
+
+ // If this is an instruction with a load folded into it, try unfolding
+ // the load, e.g. avoid this:
+ // movq %rdx, %rcx
+ // addq (%rax), %rcx
+ // in favor of this:
+ // movq (%rax), %rcx
+ // addq %rdx, %rcx
+  // because it's preferable to schedule a load rather than a register copy.
+ if (MI.mayLoad() && !regBKilled) {
+ // Determine if a load can be unfolded.
+ unsigned LoadRegIndex;
+ unsigned NewOpc =
+ TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(),
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false,
+ &LoadRegIndex);
+ if (NewOpc != 0) {
+ const MCInstrDesc &UnfoldMCID = TII->get(NewOpc);
+ if (UnfoldMCID.getNumDefs() == 1) {
+ // Unfold the load.
+ LLVM_DEBUG(dbgs() << "2addr: UNFOLDING: " << MI);
+ const TargetRegisterClass *RC =
+ TRI->getAllocatableClass(
+ TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF));
+ Register Reg = MRI->createVirtualRegister(RC);
+ SmallVector<MachineInstr *, 2> NewMIs;
+ if (!TII->unfoldMemoryOperand(*MF, MI, Reg,
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false, NewMIs)) {
+ LLVM_DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+ return false;
+ }
+ assert(NewMIs.size() == 2 &&
+ "Unfolded a load into multiple instructions!");
+ // The load was previously folded, so this is the only use.
+ NewMIs[1]->addRegisterKilled(Reg, TRI);
+
+ // Tentatively insert the instructions into the block so that they
+ // look "normal" to the transformation logic.
+ MBB->insert(mi, NewMIs[0]);
+ MBB->insert(mi, NewMIs[1]);
+ DistanceMap.insert(std::make_pair(NewMIs[0], Dist++));
+ DistanceMap.insert(std::make_pair(NewMIs[1], Dist));
+
+ LLVM_DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0]
+ << "2addr: NEW INST: " << *NewMIs[1]);
+
+ // Transform the instruction, now that it no longer has a load.
+ unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA);
+ unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB);
+ MachineBasicBlock::iterator NewMI = NewMIs[1];
+ bool TransformResult =
+ tryInstructionTransform(NewMI, mi, NewSrcIdx, NewDstIdx, Dist, true);
+ (void)TransformResult;
+ assert(!TransformResult &&
+ "tryInstructionTransform() should return false.");
+ if (NewMIs[1]->getOperand(NewSrcIdx).isKill()) {
+ // Success, or at least we made an improvement. Keep the unfolded
+ // instructions and discard the original.
+ if (LV) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg().isVirtual()) {
+ if (MO.isUse()) {
+ if (MO.isKill()) {
+ if (NewMIs[0]->killsRegister(MO.getReg()))
+ LV->replaceKillInstruction(MO.getReg(), MI, *NewMIs[0]);
+ else {
+ assert(NewMIs[1]->killsRegister(MO.getReg()) &&
+ "Kill missing after load unfold!");
+ LV->replaceKillInstruction(MO.getReg(), MI, *NewMIs[1]);
+ }
+ }
+ } else if (LV->removeVirtualRegisterDead(MO.getReg(), MI)) {
+ if (NewMIs[1]->registerDefIsDead(MO.getReg()))
+ LV->addVirtualRegisterDead(MO.getReg(), *NewMIs[1]);
+ else {
+ assert(NewMIs[0]->registerDefIsDead(MO.getReg()) &&
+ "Dead flag missing after load unfold!");
+ LV->addVirtualRegisterDead(MO.getReg(), *NewMIs[0]);
+ }
+ }
+ }
+ }
+ LV->addVirtualRegisterKilled(Reg, *NewMIs[1]);
+ }
+
+ SmallVector<Register, 4> OrigRegs;
+ if (LIS) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg())
+ OrigRegs.push_back(MO.getReg());
+ }
+
+ LIS->RemoveMachineInstrFromMaps(MI);
+ }
+
+ MI.eraseFromParent();
+ DistanceMap.erase(&MI);
+
+ // Update LiveIntervals.
+ if (LIS) {
+ MachineBasicBlock::iterator Begin(NewMIs[0]);
+ MachineBasicBlock::iterator End(NewMIs[1]);
+ LIS->repairIntervalsInRange(MBB, Begin, End, OrigRegs);
+ }
+
+ mi = NewMIs[1];
+ } else {
+ // Transforming didn't eliminate the tie and didn't lead to an
+ // improvement. Clean up the unfolded instructions and keep the
+ // original.
+ LLVM_DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+ NewMIs[0]->eraseFromParent();
+ NewMIs[1]->eraseFromParent();
+ DistanceMap.erase(NewMIs[0]);
+ DistanceMap.erase(NewMIs[1]);
+ Dist--;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+// Collect tied operands of MI that need to be handled.
+// Rewrite trivial cases immediately.
+// Return true if any tied operands were found, including the trivial ones.
+bool TwoAddressInstructionPass::
+collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
+ bool AnyOps = false;
+ unsigned NumOps = MI->getNumOperands();
+
+ for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
+ unsigned DstIdx = 0;
+ if (!MI->isRegTiedToDefOperand(SrcIdx, &DstIdx))
+ continue;
+ AnyOps = true;
+ MachineOperand &SrcMO = MI->getOperand(SrcIdx);
+ MachineOperand &DstMO = MI->getOperand(DstIdx);
+ Register SrcReg = SrcMO.getReg();
+ Register DstReg = DstMO.getReg();
+ // Tied constraint already satisfied?
+ if (SrcReg == DstReg)
+ continue;
+
+ assert(SrcReg && SrcMO.isUse() && "two address instruction invalid");
+
+ // Deal with undef uses immediately - simply rewrite the src operand.
+ if (SrcMO.isUndef() && !DstMO.getSubReg()) {
+ // Constrain the DstReg register class if required.
+ if (DstReg.isVirtual()) {
+ const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+ MRI->constrainRegClass(DstReg, RC);
+ }
+ SrcMO.setReg(DstReg);
+ SrcMO.setSubReg(0);
+ LLVM_DEBUG(dbgs() << "\t\trewrite undef:\t" << *MI);
+ continue;
+ }
+ TiedOperands[SrcReg].push_back(std::make_pair(SrcIdx, DstIdx));
+ }
+ return AnyOps;
+}
+
+// Process a list of tied MI operands that all use the same source register.
+// The tied pairs are of the form (SrcIdx, DstIdx).
+void
+TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
+ TiedPairList &TiedPairs,
+ unsigned &Dist) {
+ bool IsEarlyClobber = llvm::any_of(TiedPairs, [MI](auto const &TP) {
+ return MI->getOperand(TP.second).isEarlyClobber();
+ });
+
+ bool RemovedKillFlag = false;
+ bool AllUsesCopied = true;
+ unsigned LastCopiedReg = 0;
+ SlotIndex LastCopyIdx;
+ Register RegB = 0;
+ unsigned SubRegB = 0;
+ for (auto &TP : TiedPairs) {
+ unsigned SrcIdx = TP.first;
+ unsigned DstIdx = TP.second;
+
+ const MachineOperand &DstMO = MI->getOperand(DstIdx);
+ Register RegA = DstMO.getReg();
+
+ // Grab RegB from the instruction because it may have changed if the
+ // instruction was commuted.
+ RegB = MI->getOperand(SrcIdx).getReg();
+ SubRegB = MI->getOperand(SrcIdx).getSubReg();
+
+ if (RegA == RegB) {
+ // The register is tied to multiple destinations (or else we would
+ // not have continued this far), but this use of the register
+ // already matches the tied destination. Leave it.
+ AllUsesCopied = false;
+ continue;
+ }
+ LastCopiedReg = RegA;
+
+ assert(RegB.isVirtual() && "cannot make instruction into two-address form");
+
+#ifndef NDEBUG
+ // First, verify that we don't have a use of "a" in the instruction
+ // (a = b + a for example) because our transformation will not
+ // work. This should never occur because we are in SSA form.
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i)
+ assert(i == DstIdx ||
+ !MI->getOperand(i).isReg() ||
+ MI->getOperand(i).getReg() != RegA);
+#endif
+
+ // Emit a copy.
+ MachineInstrBuilder MIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), RegA);
+ // If this operand is folding a truncation, the truncation now moves to the
+ // copy so that the register classes remain valid for the operands.
+ MIB.addReg(RegB, 0, SubRegB);
+ const TargetRegisterClass *RC = MRI->getRegClass(RegB);
+ if (SubRegB) {
+ if (RegA.isVirtual()) {
+ assert(TRI->getMatchingSuperRegClass(RC, MRI->getRegClass(RegA),
+ SubRegB) &&
+ "tied subregister must be a truncation");
+ // The superreg class will not be used to constrain the subreg class.
+ RC = nullptr;
+ } else {
+ assert(TRI->getMatchingSuperReg(RegA, SubRegB, MRI->getRegClass(RegB))
+ && "tied subregister must be a truncation");
+ }
+ }
+
+ // Update DistanceMap.
+ MachineBasicBlock::iterator PrevMI = MI;
+ --PrevMI;
+ DistanceMap.insert(std::make_pair(&*PrevMI, Dist));
+ DistanceMap[MI] = ++Dist;
+
+ if (LIS) {
+ LastCopyIdx = LIS->InsertMachineInstrInMaps(*PrevMI).getRegSlot();
+
+ SlotIndex endIdx =
+ LIS->getInstructionIndex(*MI).getRegSlot(IsEarlyClobber);
+ if (RegA.isVirtual()) {
+ LiveInterval &LI = LIS->getInterval(RegA);
+ VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
+ LI.addSegment(LiveRange::Segment(LastCopyIdx, endIdx, VNI));
+ for (auto &S : LI.subranges()) {
+ VNI = S.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
+ S.addSegment(LiveRange::Segment(LastCopyIdx, endIdx, VNI));
+ }
+ } else {
+ for (MCRegUnit Unit : TRI->regunits(RegA)) {
+ if (LiveRange *LR = LIS->getCachedRegUnit(Unit)) {
+ VNInfo *VNI =
+ LR->getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
+ LR->addSegment(LiveRange::Segment(LastCopyIdx, endIdx, VNI));
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "\t\tprepend:\t" << *MIB);
+
+ MachineOperand &MO = MI->getOperand(SrcIdx);
+ assert(MO.isReg() && MO.getReg() == RegB && MO.isUse() &&
+ "inconsistent operand info for 2-reg pass");
+ if (MO.isKill()) {
+ MO.setIsKill(false);
+ RemovedKillFlag = true;
+ }
+
+ // Make sure regA is a legal regclass for the SrcIdx operand.
+ if (RegA.isVirtual() && RegB.isVirtual())
+ MRI->constrainRegClass(RegA, RC);
+ MO.setReg(RegA);
+ // The getMatchingSuper asserts guarantee that the register class projected
+ // by SubRegB is compatible with RegA with no subregister. So regardless of
+ // whether the destination operand writes a subreg, the source operand
+ // should not.
+ MO.setSubReg(0);
+ }
+
+ if (AllUsesCopied) {
+ LaneBitmask RemainingUses = LaneBitmask::getNone();
+ // Replace other (un-tied) uses of regB with LastCopiedReg.
+ for (MachineOperand &MO : MI->all_uses()) {
+ if (MO.getReg() == RegB) {
+ if (MO.getSubReg() == SubRegB && !IsEarlyClobber) {
+ if (MO.isKill()) {
+ MO.setIsKill(false);
+ RemovedKillFlag = true;
+ }
+ MO.setReg(LastCopiedReg);
+ MO.setSubReg(0);
+ } else {
+ RemainingUses |= TRI->getSubRegIndexLaneMask(MO.getSubReg());
+ }
+ }
+ }
+
+ // Update live variables for regB.
+ if (RemovedKillFlag && RemainingUses.none() && LV &&
+ LV->getVarInfo(RegB).removeKill(*MI)) {
+ MachineBasicBlock::iterator PrevMI = MI;
+ --PrevMI;
+ LV->addVirtualRegisterKilled(RegB, *PrevMI);
+ }
+
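+ // Record RegB as the source of LastCopiedReg so later copy-related
+ // heuristics can see the relationship between the two registers.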
+ if (RemovedKillFlag && RemainingUses.none())
+ SrcRegMap[LastCopiedReg] = RegB;
+
+ // Update LiveIntervals.
+ if (LIS) {
+ SlotIndex UseIdx = LIS->getInstructionIndex(*MI);
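+ // Shrink trims RegB's (sub)range so it ends at the inserted COPY when this
+ // instruction no longer reads the corresponding lanes of RegB.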
+ auto Shrink = [=](LiveRange &LR, LaneBitmask LaneMask) {
+ LiveRange::Segment *S = LR.getSegmentContaining(LastCopyIdx);
+ if (!S)
+ return true;
+ if ((LaneMask & RemainingUses).any())
+ return false;
+ if (S->end.getBaseIndex() != UseIdx)
+ return false;
+ S->end = LastCopyIdx;
+ return true;
+ };
+
+ LiveInterval &LI = LIS->getInterval(RegB);
+ bool ShrinkLI = true;
+ for (auto &S : LI.subranges())
+ ShrinkLI &= Shrink(S, S.LaneMask);
+ if (ShrinkLI)
+ Shrink(LI, LaneBitmask::getAll());
+ }
+ } else if (RemovedKillFlag) {
+ // Some tied uses of regB matched their destination registers, so
+ // regB is still used in this instruction, but a kill flag was
+ // removed from a different tied use of regB, so now we need to add
+ // a kill flag to one of the remaining uses of regB.
+ for (MachineOperand &MO : MI->all_uses()) {
+ if (MO.getReg() == RegB) {
+ MO.setIsKill(true);
+ break;
+ }
+ }
+ }
+}
+
+// For every tied operand pair, this function transforms a statepoint from
+// RegA = STATEPOINT ... RegB(tied-def N)
+// to
+// RegB = STATEPOINT ... RegB(tied-def N)
+// and replaces all uses of RegA with RegB.
+// No extra COPY instruction is necessary because the tied use is killed at
+// the STATEPOINT.
+bool TwoAddressInstructionPass::processStatepoint(
+ MachineInstr *MI, TiedOperandMap &TiedOperands) {
+
+ bool NeedCopy = false;
+ for (auto &TO : TiedOperands) {
+ Register RegB = TO.first;
+ if (TO.second.size() != 1) {
+ NeedCopy = true;
+ continue;
+ }
+
+ unsigned SrcIdx = TO.second[0].first;
+ unsigned DstIdx = TO.second[0].second;
+
+ MachineOperand &DstMO = MI->getOperand(DstIdx);
+ Register RegA = DstMO.getReg();
+
+ assert(RegB == MI->getOperand(SrcIdx).getReg());
+
+ if (RegA == RegB)
+ continue;
+
+ // CodeGenPrepare can sink a pointer compare past a statepoint, which
+ // breaks the assumption that a statepoint kills its tied-use register
+ // when in SSA form (see the note in IR/SafepointIRVerifier.cpp). Fall
+ // back to generic tied register handling to avoid assertion failures.
+ // TODO: Recompute LIS/LV information for new range here.
+ if (LIS) {
+ const auto &UseLI = LIS->getInterval(RegB);
+ const auto &DefLI = LIS->getInterval(RegA);
+ if (DefLI.overlaps(UseLI)) {
+ LLVM_DEBUG(dbgs() << "LIS: " << printReg(RegB, TRI, 0)
+ << " UseLI overlaps with DefLI\n");
+ NeedCopy = true;
+ continue;
+ }
+ } else if (LV && LV->getVarInfo(RegB).findKill(MI->getParent()) != MI) {
+ // Note that MachineOperand::isKill does not work here, because it is
+ // set only on the first register use in an instruction, and for a
+ // statepoint the tied-use register will usually be found in a preceding
+ // deopt bundle.
+ LLVM_DEBUG(dbgs() << "LV: " << printReg(RegB, TRI, 0)
+ << " not killed by statepoint\n");
+ NeedCopy = true;
+ continue;
+ }
+
+ if (!MRI->constrainRegClass(RegB, MRI->getRegClass(RegA))) {
+ LLVM_DEBUG(dbgs() << "MRI: couldn't constrain" << printReg(RegB, TRI, 0)
+ << " to register class of " << printReg(RegA, TRI, 0)
+ << '\n');
+ NeedCopy = true;
+ continue;
+ }
+ MRI->replaceRegWith(RegA, RegB);
+
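+ // Fold RegA's live interval into RegB's: copy its value numbers and
+ // segments into RegB's interval, then drop the interval for RegA.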
+ if (LIS) {
+ VNInfo::Allocator &A = LIS->getVNInfoAllocator();
+ LiveInterval &LI = LIS->getInterval(RegB);
+ LiveInterval &Other = LIS->getInterval(RegA);
+ SmallVector<VNInfo *> NewVNIs;
+ for (const VNInfo *VNI : Other.valnos) {
+ assert(VNI->id == NewVNIs.size() && "assumed");
+ NewVNIs.push_back(LI.createValueCopy(VNI, A));
+ }
+ for (auto &S : Other) {
+ VNInfo *VNI = NewVNIs[S.valno->id];
+ LiveRange::Segment NewSeg(S.start, S.end, VNI);
+ LI.addSegment(NewSeg);
+ }
+ LIS->removeInterval(RegA);
+ }
+
+ if (LV) {
+ if (MI->getOperand(SrcIdx).isKill())
+ LV->removeVirtualRegisterKilled(RegB, *MI);
+ LiveVariables::VarInfo &SrcInfo = LV->getVarInfo(RegB);
+ LiveVariables::VarInfo &DstInfo = LV->getVarInfo(RegA);
+ SrcInfo.AliveBlocks |= DstInfo.AliveBlocks;
+ DstInfo.AliveBlocks.clear();
+ for (auto *KillMI : DstInfo.Kills)
+ LV->addVirtualRegisterKilled(RegB, *KillMI, false);
+ }
+ }
+ return !NeedCopy;
+}
+
+/// Reduce two-address instructions to two operands.
+bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ const TargetMachine &TM = MF->getTarget();
+ MRI = &MF->getRegInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ InstrItins = MF->getSubtarget().getInstrItineraryData();
+ LV = getAnalysisIfAvailable<LiveVariables>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+ if (auto *AAPass = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAPass->getAAResults();
+ else
+ AA = nullptr;
+ OptLevel = TM.getOptLevel();
+ // Disable optimizations if requested. We cannot skip the whole pass as some
+ // fixups are necessary for correctness.
+ if (skipFunction(Func.getFunction()))
+ OptLevel = CodeGenOpt::None;
+
+ bool MadeChange = false;
+
+ LLVM_DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << MF->getName() << '\n');
+
+ // This pass takes the function out of SSA form.
+ MRI->leaveSSA();
+
+ // This pass will rewrite the tied-def to meet the RegConstraint.
+ MF->getProperties()
+ .set(MachineFunctionProperties::Property::TiedOpsRewritten);
+
+ TiedOperandMap TiedOperands;
+ for (MachineBasicBlock &MBBI : *MF) {
+ MBB = &MBBI;
+ unsigned Dist = 0;
+ DistanceMap.clear();
+ SrcRegMap.clear();
+ DstRegMap.clear();
+ Processed.clear();
+ for (MachineBasicBlock::iterator mi = MBB->begin(), me = MBB->end();
+ mi != me; ) {
+ MachineBasicBlock::iterator nmi = std::next(mi);
+ // Skip debug instructions.
+ if (mi->isDebugInstr()) {
+ mi = nmi;
+ continue;
+ }
+
+ // Expand REG_SEQUENCE instructions. This will position mi at the first
+ // expanded instruction.
+ if (mi->isRegSequence())
+ eliminateRegSequence(mi);
+
+ DistanceMap.insert(std::make_pair(&*mi, ++Dist));
+
+ processCopy(&*mi);
+
+ // First scan through all the tied register uses in this instruction
+ // and record a list of pairs of tied operands for each register.
+ if (!collectTiedOperands(&*mi, TiedOperands)) {
+ removeClobberedSrcRegMap(&*mi);
+ mi = nmi;
+ continue;
+ }
+
+ ++NumTwoAddressInstrs;
+ MadeChange = true;
+ LLVM_DEBUG(dbgs() << '\t' << *mi);
+
+ // If the instruction has a single pair of tied operands, try some
+ // transformations that may either eliminate the tied operands or
+ // improve the opportunities for coalescing away the register copy.
+ if (TiedOperands.size() == 1) {
+ SmallVectorImpl<std::pair<unsigned, unsigned>> &TiedPairs
+ = TiedOperands.begin()->second;
+ if (TiedPairs.size() == 1) {
+ unsigned SrcIdx = TiedPairs[0].first;
+ unsigned DstIdx = TiedPairs[0].second;
+ Register SrcReg = mi->getOperand(SrcIdx).getReg();
+ Register DstReg = mi->getOperand(DstIdx).getReg();
+ if (SrcReg != DstReg &&
+ tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) {
+ // The tied operands have been eliminated or shifted further down
+ // the block to ease elimination. Continue processing with 'nmi'.
+ TiedOperands.clear();
+ removeClobberedSrcRegMap(&*mi);
+ mi = nmi;
+ continue;
+ }
+ }
+ }
+
+ if (mi->getOpcode() == TargetOpcode::STATEPOINT &&
+ processStatepoint(&*mi, TiedOperands)) {
+ TiedOperands.clear();
+ LLVM_DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
+ mi = nmi;
+ continue;
+ }
+
+ // Now iterate over the information collected above.
+ for (auto &TO : TiedOperands) {
+ processTiedPairs(&*mi, TO.second, Dist);
+ LLVM_DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
+ }
+
+ // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form.
+ if (mi->isInsertSubreg()) {
+ // From %reg = INSERT_SUBREG %reg, %subreg, subidx
+ // To %reg:subidx = COPY %subreg
+ unsigned SubIdx = mi->getOperand(3).getImm();
+ mi->removeOperand(3);
+ assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx");
+ mi->getOperand(0).setSubReg(SubIdx);
+ mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef());
+ mi->removeOperand(1);
+ mi->setDesc(TII->get(TargetOpcode::COPY));
+ LLVM_DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
+
+ // Update LiveIntervals.
+ if (LIS) {
+ Register Reg = mi->getOperand(0).getReg();
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ // The COPY no longer defines subregs of %reg except for
+ // %reg.subidx.
+ LaneBitmask LaneMask =
+ TRI->getSubRegIndexLaneMask(mi->getOperand(0).getSubReg());
+ SlotIndex Idx = LIS->getInstructionIndex(*mi);
+ for (auto &S : LI.subranges()) {
+ if ((S.LaneMask & LaneMask).none()) {
+ LiveRange::iterator UseSeg = S.FindSegmentContaining(Idx);
+ LiveRange::iterator DefSeg = std::next(UseSeg);
+ S.MergeValueNumberInto(DefSeg->valno, UseSeg->valno);
+ }
+ }
+
+ // The COPY no longer has a use of %reg.
+ LIS->shrinkToUses(&LI);
+ } else {
+ // The live interval for Reg did not have subranges but now it needs
+ // them because we have introduced a subreg def. Recompute it.
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
+ }
+
+ // Clear TiedOperands here instead of at the top of the loop
+ // since most instructions do not have tied operands.
+ TiedOperands.clear();
+ removeClobberedSrcRegMap(&*mi);
+ mi = nmi;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// Eliminate a REG_SEQUENCE instruction as part of the de-ssa process.
+///
+/// The instruction is turned into a sequence of sub-register copies:
+///
+/// %dst = REG_SEQUENCE %v1, ssub0, %v2, ssub1
+///
+/// Becomes:
+///
+/// undef %dst:ssub0 = COPY %v1
+/// %dst:ssub1 = COPY %v2
+void TwoAddressInstructionPass::
+eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstReg = MI.getOperand(0).getReg();
+
+ SmallVector<Register, 4> OrigRegs;
+ if (LIS) {
+ OrigRegs.push_back(MI.getOperand(0).getReg());
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2)
+ OrigRegs.push_back(MI.getOperand(i).getReg());
+ }
+
+ bool DefEmitted = false;
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) {
+ MachineOperand &UseMO = MI.getOperand(i);
+ Register SrcReg = UseMO.getReg();
+ unsigned SubIdx = MI.getOperand(i+1).getImm();
+ // Nothing needs to be inserted for undef operands.
+ if (UseMO.isUndef())
+ continue;
+
+ // Defer any kill flag to the last operand using SrcReg. Otherwise, we
+ // might insert a COPY that uses SrcReg after it was killed.
+ bool isKill = UseMO.isKill();
+ if (isKill)
+ for (unsigned j = i + 2; j < e; j += 2)
+ if (MI.getOperand(j).getReg() == SrcReg) {
+ MI.getOperand(j).setIsKill();
+ UseMO.setIsKill(false);
+ isKill = false;
+ break;
+ }
+
+ // Insert the sub-register copy.
+ MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::COPY))
+ .addReg(DstReg, RegState::Define, SubIdx)
+ .add(UseMO);
+
+ // The first def needs an undef flag because there is no live register
+ // before it.
+ if (!DefEmitted) {
+ CopyMI->getOperand(0).setIsUndef(true);
+ // Return an iterator pointing to the first inserted instr.
+ MBBI = CopyMI;
+ }
+ DefEmitted = true;
+
+ // Update LiveVariables' kill info.
+ if (LV && isKill && !SrcReg.isPhysical())
+ LV->replaceKillInstruction(SrcReg, MI, *CopyMI);
+
+ LLVM_DEBUG(dbgs() << "Inserted: " << *CopyMI);
+ }
+
+ MachineBasicBlock::iterator EndMBBI =
+ std::next(MachineBasicBlock::iterator(MI));
+
+ if (!DefEmitted) {
+ LLVM_DEBUG(dbgs() << "Turned: " << MI << " into an IMPLICIT_DEF");
+ MI.setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+ for (int j = MI.getNumOperands() - 1, ee = 0; j > ee; --j)
+ MI.removeOperand(j);
+ } else {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+
+ LLVM_DEBUG(dbgs() << "Eliminated: " << MI);
+ MI.eraseFromParent();
+ }
+
+ // Update LiveIntervals.
+ if (LIS)
+ LIS->repairIntervalsInRange(MBB, MBBI, EndMBBI, OrigRegs);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/TypePromotion.cpp b/contrib/llvm-project/llvm/lib/CodeGen/TypePromotion.cpp
new file mode 100644
index 000000000000..426292345a14
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/TypePromotion.cpp
@@ -0,0 +1,1047 @@
+//===----- TypePromotion.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This is an opcode based type promotion pass for small types that would
+/// otherwise be promoted during legalisation. This works around the limitations
+/// of selection dag for cyclic regions. The search begins from the operands
+/// of icmp instructions, from which a tree consisting of non-wrapping or
+/// safely wrapping instructions is built, checked and promoted if possible.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TypePromotion.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "type-promotion"
+#define PASS_NAME "Type Promotion"
+
+using namespace llvm;
+
+static cl::opt<bool> DisablePromotion("disable-type-promotion", cl::Hidden,
+ cl::init(false),
+ cl::desc("Disable type promotion pass"));
+
+// The goal of this pass is to enable more efficient code generation for
+// operations on narrow types (i.e. types narrower than 32 bits), and this is
+// a motivating IR code example:
+//
+// define hidden i32 @cmp(i8 zeroext) {
+// %2 = add i8 %0, -49
+// %3 = icmp ult i8 %2, 3
+// ..
+// }
+//
+// The issue here is that i8 is type-legalized to i32 because i8 is not a
+// legal type. Thus, the arithmetic is done in 32-bit precision, but then the
+// byte value is masked out as follows:
+//
+// t19: i32 = add t4, Constant:i32<-49>
+// t24: i32 = and t19, Constant:i32<255>
+//
+// Consequently, we generate code like this:
+//
+// subs r0, #49
+// uxtb r1, r0
+// cmp r1, #3
+//
+// This shows that masking out the byte value results in generation of
+// the UXTB instruction. This is not optimal as r0 already contains the byte
+// value we need, and so instead we can just generate:
+//
+// sub.w r1, r0, #49
+// cmp r1, #3
+//
+// We achieve this by type promoting the IR to i32 like so for this example:
+//
+// define i32 @cmp(i8 zeroext %c) {
+// %0 = zext i8 %c to i32
+// %c.off = add i32 %0, -49
+// %1 = icmp ult i32 %c.off, 3
+// ..
+// }
+//
+// For this to be valid and legal, we need to prove that the i32 add is
+// producing the same value as the i8 addition, and that e.g. no overflow
+// happens.
+//
+// A brief sketch of the algorithm and some terminology.
+// We pattern match interesting IR patterns:
+// - which have "sources": instructions producing narrow values (i8, i16), and
+// - which have "sinks": instructions consuming these narrow values.
+//
+// We collect all instructions connecting sources and sinks in a worklist, so
+// that we can mutate these instructions and perform type promotion when it is
+// legal to do so.
+
+namespace {
+class IRPromoter {
+ LLVMContext &Ctx;
+ unsigned PromotedWidth = 0;
+ SetVector<Value *> &Visited;
+ SetVector<Value *> &Sources;
+ SetVector<Instruction *> &Sinks;
+ SmallPtrSetImpl<Instruction *> &SafeWrap;
+ SmallPtrSetImpl<Instruction *> &InstsToRemove;
+ IntegerType *ExtTy = nullptr;
+ SmallPtrSet<Value *, 8> NewInsts;
+ DenseMap<Value *, SmallVector<Type *, 4>> TruncTysMap;
+ SmallPtrSet<Value *, 8> Promoted;
+
+ void ReplaceAllUsersOfWith(Value *From, Value *To);
+ void ExtendSources();
+ void ConvertTruncs();
+ void PromoteTree();
+ void TruncateSinks();
+ void Cleanup();
+
+public:
+ IRPromoter(LLVMContext &C, unsigned Width, SetVector<Value *> &visited,
+ SetVector<Value *> &sources, SetVector<Instruction *> &sinks,
+ SmallPtrSetImpl<Instruction *> &wrap,
+ SmallPtrSetImpl<Instruction *> &instsToRemove)
+ : Ctx(C), PromotedWidth(Width), Visited(visited), Sources(sources),
+ Sinks(sinks), SafeWrap(wrap), InstsToRemove(instsToRemove) {
+ ExtTy = IntegerType::get(Ctx, PromotedWidth);
+ }
+
+ void Mutate();
+};
+
+class TypePromotionImpl {
+ unsigned TypeSize = 0;
+ LLVMContext *Ctx = nullptr;
+ unsigned RegisterBitWidth = 0;
+ SmallPtrSet<Value *, 16> AllVisited;
+ SmallPtrSet<Instruction *, 8> SafeToPromote;
+ SmallPtrSet<Instruction *, 4> SafeWrap;
+ SmallPtrSet<Instruction *, 4> InstsToRemove;
+
+ // Does V have the same size result type as TypeSize.
+ bool EqualTypeSize(Value *V);
+ // Does V have the same size, or narrower, result type as TypeSize.
+ bool LessOrEqualTypeSize(Value *V);
+ // Does V have a result type that is wider than TypeSize.
+ bool GreaterThanTypeSize(Value *V);
+ // Does V have a result type that is narrower than TypeSize.
+ bool LessThanTypeSize(Value *V);
+ // Should V be a leaf in the promote tree?
+ bool isSource(Value *V);
+ // Should V be a root in the promotion tree?
+ bool isSink(Value *V);
+ // Should we change the result type of V? It will result in the users of V
+ // being visited.
+ bool shouldPromote(Value *V);
+ // Is I an add or a sub, which isn't marked as nuw, but where a wrapping
+ // result won't affect the computation?
+ bool isSafeWrap(Instruction *I);
+ // Can V have its integer type promoted, or can the type be ignored.
+ bool isSupportedType(Value *V);
+ // Is V an instruction with a supported opcode or another value that we can
+ // handle, such as constants and basic blocks.
+ bool isSupportedValue(Value *V);
+ // Is V an instruction whose result can be trivially promoted, or one that
+ // has safe wrapping.
+ bool isLegalToPromote(Value *V);
+ bool TryToPromote(Value *V, unsigned PromotedWidth, const LoopInfo &LI);
+
+public:
+ bool run(Function &F, const TargetMachine *TM,
+ const TargetTransformInfo &TTI, const LoopInfo &LI);
+};
+
+class TypePromotionLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ TypePromotionLegacy() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+
+ StringRef getPassName() const override { return PASS_NAME; }
+
+ bool runOnFunction(Function &F) override;
+};
+
+} // namespace
+
+static bool GenerateSignBits(Instruction *I) {
+ unsigned Opc = I->getOpcode();
+ return Opc == Instruction::AShr || Opc == Instruction::SDiv ||
+ Opc == Instruction::SRem || Opc == Instruction::SExt;
+}
+
+bool TypePromotionImpl::EqualTypeSize(Value *V) {
+ return V->getType()->getScalarSizeInBits() == TypeSize;
+}
+
+bool TypePromotionImpl::LessOrEqualTypeSize(Value *V) {
+ return V->getType()->getScalarSizeInBits() <= TypeSize;
+}
+
+bool TypePromotionImpl::GreaterThanTypeSize(Value *V) {
+ return V->getType()->getScalarSizeInBits() > TypeSize;
+}
+
+bool TypePromotionImpl::LessThanTypeSize(Value *V) {
+ return V->getType()->getScalarSizeInBits() < TypeSize;
+}
+
+/// Return true if the given value is a source in the use-def chain, producing
+/// a narrow 'TypeSize' value. These values will be zext to start the promotion
+/// of the tree to i32. We guarantee that these won't populate the upper bits
+/// of the register. ZExt on the loads will be free, and the same for call
+/// return values because we only accept ones that guarantee a zeroext ret val.
+/// Many arguments will have the zeroext attribute too, so those would be free
+/// too.
+bool TypePromotionImpl::isSource(Value *V) {
+ if (!isa<IntegerType>(V->getType()))
+ return false;
+
+ // TODO Allow zext to be sources.
+ if (isa<Argument>(V))
+ return true;
+ else if (isa<LoadInst>(V))
+ return true;
+ else if (auto *Call = dyn_cast<CallInst>(V))
+ return Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ else if (auto *Trunc = dyn_cast<TruncInst>(V))
+ return EqualTypeSize(Trunc);
+ return false;
+}
+
+/// Return true if V will require any promoted values to be truncated for
+/// the IR to remain valid. We can't mutate the value type of these
+/// instructions.
+bool TypePromotionImpl::isSink(Value *V) {
+ // TODO The truncate also isn't actually necessary because we would have
+ // already proved that the data value is kept within the range of the
+ // original data type. We currently remove any truncs inserted for handling
+ // zext sinks.
+
+ // Sinks are:
+ // - points where the value in the register is being observed, such as an
+ // icmp, switch or store.
+ // - points where value types have to match, such as calls and returns.
+ // - zext are included to ease the transformation and are generally removed
+ // later on.
+ if (auto *Store = dyn_cast<StoreInst>(V))
+ return LessOrEqualTypeSize(Store->getValueOperand());
+ if (auto *Return = dyn_cast<ReturnInst>(V))
+ return LessOrEqualTypeSize(Return->getReturnValue());
+ if (auto *ZExt = dyn_cast<ZExtInst>(V))
+ return GreaterThanTypeSize(ZExt);
+ if (auto *Switch = dyn_cast<SwitchInst>(V))
+ return LessThanTypeSize(Switch->getCondition());
+ if (auto *ICmp = dyn_cast<ICmpInst>(V))
+ return ICmp->isSigned() || LessThanTypeSize(ICmp->getOperand(0));
+
+ return isa<CallInst>(V);
+}
+
+/// Return whether this instruction can safely wrap.
+bool TypePromotionImpl::isSafeWrap(Instruction *I) {
+ // We can support a potentially wrapping instruction (I) if:
+ // - It is only used by an unsigned icmp.
+ // - The icmp uses a constant.
+ // - The wrapping value (I) is decreasing, i.e. would underflow, wrapping
+ // around zero to become a larger number than before.
+ // - The wrapping instruction (I) also uses a constant.
+ //
+ // We can then use the two constants to calculate whether the result would
+ // wrap with respect to itself in the original bitwidth. If it doesn't wrap,
+ // but just underflows the range, the icmp would give the same result whether
+ // the result has been truncated or not. We calculate this by:
+ // - Zero extending both constants, if needed, to RegisterBitWidth.
+ // - Take the absolute value of I's constant, adding this to the icmp const.
+ // - Check that this value is not out of range for the small type. If it is,
+ // it means that it has underflowed enough to wrap around the icmp constant.
+ //
+ // For example:
+ //
+ // %sub = sub i8 %a, 2
+ // %cmp = icmp ule i8 %sub, 254
+ //
+ // If %a = 0, %sub = -2 == FE == 254
+ // But if this is evaluated as an i32
+ // %sub = -2 == FF FF FF FE == 4294967294
+ // So the unsigned compares (i8 and i32) would not yield the same result.
+ //
+ // Another way to look at it is:
+ // %a - 2 <= 254
+ // %a + 2 <= 254 + 2
+ // %a <= 256
+ // And we can't represent 256 in the i8 format, so we don't support it.
+ //
+ // Whereas:
+ //
+ // %sub = sub i8 %a, 1
+ // %cmp = icmp ule i8 %sub, 254
+ //
+ // If %a = 0, %sub = -1 == FF == 255
+ // As i32:
+ // %sub = -1 == FF FF FF FF == 4294967295
+ //
+ // In this case, the unsigned compare results would be the same and this
+ // would also be true for ult, uge and ugt:
+ // - (255 < 254) == (0xFFFFFFFF < 254) == false
+ // - (255 <= 254) == (0xFFFFFFFF <= 254) == false
+ // - (255 > 254) == (0xFFFFFFFF > 254) == true
+ // - (255 >= 254) == (0xFFFFFFFF >= 254) == true
+ //
+ // To demonstrate why we can't handle increasing values:
+ //
+ // %add = add i8 %a, 2
+ // %cmp = icmp ult i8 %add, 127
+ //
+ // If %a = 254, %add = 256 == (i8 1)
+ // As i32:
+ // %add = 256
+ //
+ // (1 < 127) != (256 < 127)
+
+ unsigned Opc = I->getOpcode();
+ if (Opc != Instruction::Add && Opc != Instruction::Sub)
+ return false;
+
+ if (!I->hasOneUse() || !isa<ICmpInst>(*I->user_begin()) ||
+ !isa<ConstantInt>(I->getOperand(1)))
+ return false;
+
+ // Don't support an icmp that deals with sign bits.
+ auto *CI = cast<ICmpInst>(*I->user_begin());
+ if (CI->isSigned() || CI->isEquality())
+ return false;
+
+ ConstantInt *ICmpConstant = nullptr;
+ if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(0)))
+ ICmpConstant = Const;
+ else if (auto *Const = dyn_cast<ConstantInt>(CI->getOperand(1)))
+ ICmpConstant = Const;
+ else
+ return false;
+
+ const APInt &ICmpConst = ICmpConstant->getValue();
+ APInt OverflowConst = cast<ConstantInt>(I->getOperand(1))->getValue();
+ if (Opc == Instruction::Sub)
+ OverflowConst = -OverflowConst;
+ if (!OverflowConst.isNonPositive())
+ return false;
+
+ // Using C1 = OverflowConst and C2 = ICmpConst, we can either prove that:
+ // zext(x) + sext(C1) <u zext(C2) if C1 < 0 and C1 >s C2
+ // zext(x) + sext(C1) <u sext(C2) if C1 < 0 and C1 <=s C2
+ if (OverflowConst.sgt(ICmpConst)) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
+ << "const of " << *I << "\n");
+ SafeWrap.insert(I);
+ return true;
+ } else {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for sext "
+ << "const of " << *I << " and " << *CI << "\n");
+ SafeWrap.insert(I);
+ SafeWrap.insert(CI);
+ return true;
+ }
+ return false;
+}
+
+bool TypePromotionImpl::shouldPromote(Value *V) {
+ if (!isa<IntegerType>(V->getType()) || isSink(V))
+ return false;
+
+ if (isSource(V))
+ return true;
+
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ if (isa<ICmpInst>(I))
+ return false;
+
+ return true;
+}
+
+/// Return whether we can safely mutate I's type to ExtTy without having to be
+/// concerned with zero extending or truncation.
+static bool isPromotedResultSafe(Instruction *I) {
+ if (GenerateSignBits(I))
+ return false;
+
+ if (!isa<OverflowingBinaryOperator>(I))
+ return true;
+
+ return I->hasNoUnsignedWrap();
+}
+
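+// Replace all uses of From with To, skipping any user that is structurally
+// identical to To. If every use was replaced, queue From for removal.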
+void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) {
+ SmallVector<Instruction *, 4> Users;
+ Instruction *InstTo = dyn_cast<Instruction>(To);
+ bool ReplacedAll = true;
+
+ LLVM_DEBUG(dbgs() << "IR Promotion: Replacing " << *From << " with " << *To
+ << "\n");
+
+ for (Use &U : From->uses()) {
+ auto *User = cast<Instruction>(U.getUser());
+ if (InstTo && User->isIdenticalTo(InstTo)) {
+ ReplacedAll = false;
+ continue;
+ }
+ Users.push_back(User);
+ }
+
+ for (auto *U : Users)
+ U->replaceUsesOfWith(From, To);
+
+ if (ReplacedAll)
+ if (auto *I = dyn_cast<Instruction>(From))
+ InstsToRemove.insert(I);
+}
+
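+// Insert a zext to ExtTy for each source and rewrite the source's users to
+// consume the extended value instead.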
+void IRPromoter::ExtendSources() {
+ IRBuilder<> Builder{Ctx};
+
+ auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
+ assert(V->getType() != ExtTy && "zext already extends to i32");
+ LLVM_DEBUG(dbgs() << "IR Promotion: Inserting ZExt for " << *V << "\n");
+ Builder.SetInsertPoint(InsertPt);
+ if (auto *I = dyn_cast<Instruction>(V))
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+ Value *ZExt = Builder.CreateZExt(V, ExtTy);
+ if (auto *I = dyn_cast<Instruction>(ZExt)) {
+ if (isa<Argument>(V))
+ I->moveBefore(InsertPt);
+ else
+ I->moveAfter(InsertPt);
+ NewInsts.insert(I);
+ }
+
+ ReplaceAllUsersOfWith(V, ZExt);
+ };
+
+ // Now, insert extending instructions between the sources and their users.
+ LLVM_DEBUG(dbgs() << "IR Promotion: Promoting sources:\n");
+ for (auto *V : Sources) {
+ LLVM_DEBUG(dbgs() << " - " << *V << "\n");
+ if (auto *I = dyn_cast<Instruction>(V))
+ InsertZExt(I, I);
+ else if (auto *Arg = dyn_cast<Argument>(V)) {
+ BasicBlock &BB = Arg->getParent()->front();
+ InsertZExt(Arg, &*BB.getFirstInsertionPt());
+ } else {
+ llvm_unreachable("unhandled source that needs extending");
+ }
+ Promoted.insert(V);
+ }
+}
+
+void IRPromoter::PromoteTree() {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Mutating the tree..\n");
+
+ // Mutate the types of the instructions within the tree. Here we handle
+ // constant operands.
+ for (auto *V : Visited) {
+ if (Sources.count(V))
+ continue;
+
+ auto *I = cast<Instruction>(V);
+ if (Sinks.count(I))
+ continue;
+
+ for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
+ Value *Op = I->getOperand(i);
+ if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType()))
+ continue;
+
+ if (auto *Const = dyn_cast<ConstantInt>(Op)) {
+ // For subtract, we don't need to sext the constant. We only put it in
+ // SafeWrap because SafeWrap.size() is used elsewhere.
+ // For cmp, we need to sign extend a constant appearing in either
+ // operand. For add, we should only sign extend the RHS.
+ Constant *NewConst = (SafeWrap.contains(I) &&
+ (I->getOpcode() == Instruction::ICmp || i == 1) &&
+ I->getOpcode() != Instruction::Sub)
+ ? ConstantExpr::getSExt(Const, ExtTy)
+ : ConstantExpr::getZExt(Const, ExtTy);
+ I->setOperand(i, NewConst);
+ } else if (isa<UndefValue>(Op))
+ I->setOperand(i, ConstantInt::get(ExtTy, 0));
+ }
+
+ // Mutate the result type, unless this is an icmp or switch.
+ if (!isa<ICmpInst>(I) && !isa<SwitchInst>(I)) {
+ I->mutateType(ExtTy);
+ Promoted.insert(I);
+ }
+ }
+}
+
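+// Insert truncs so that sinks (calls, stores, returns, switches, ...) still
+// receive operands of their original, narrower types.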
+void IRPromoter::TruncateSinks() {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Fixing up the sinks:\n");
+
+ IRBuilder<> Builder{Ctx};
+
+ auto InsertTrunc = [&](Value *V, Type *TruncTy) -> Instruction * {
+ if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
+ return nullptr;
+
+ if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources.count(V))
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "IR Promotion: Creating " << *TruncTy << " Trunc for "
+ << *V << "\n");
+ Builder.SetInsertPoint(cast<Instruction>(V));
+ auto *Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
+ if (Trunc)
+ NewInsts.insert(Trunc);
+ return Trunc;
+ };
+
+ // Fix up any stores or returns that use the results of the promoted
+ // chain.
+ for (auto *I : Sinks) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: For Sink: " << *I << "\n");
+
+ // Handle calls separately as we need to iterate over arg operands.
+ if (auto *Call = dyn_cast<CallInst>(I)) {
+ for (unsigned i = 0; i < Call->arg_size(); ++i) {
+ Value *Arg = Call->getArgOperand(i);
+ Type *Ty = TruncTysMap[Call][i];
+ if (Instruction *Trunc = InsertTrunc(Arg, Ty)) {
+ Trunc->moveBefore(Call);
+ Call->setArgOperand(i, Trunc);
+ }
+ }
+ continue;
+ }
+
+ // Special case switches because we need to truncate the condition.
+ if (auto *Switch = dyn_cast<SwitchInst>(I)) {
+ Type *Ty = TruncTysMap[Switch][0];
+ if (Instruction *Trunc = InsertTrunc(Switch->getCondition(), Ty)) {
+ Trunc->moveBefore(Switch);
+ Switch->setCondition(Trunc);
+ }
+ continue;
+ }
+
+ // Don't insert a trunc for a zext that can still legally be promoted, nor
+ // when the input value to that trunc has the same width as the zext we are
+ // inserting it for. When this happens, the input operand for the zext will
+ // be promoted to the same width as the zext's return type, rendering that
+ // zext unnecessary. This zext gets removed before the end of the pass.
+ if (auto ZExt = dyn_cast<ZExtInst>(I))
+ if (ZExt->getType()->getScalarSizeInBits() >= PromotedWidth)
+ continue;
+
+ // Now handle the others.
+ for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+ Type *Ty = TruncTysMap[I][i];
+ if (Instruction *Trunc = InsertTrunc(I->getOperand(i), Ty)) {
+ Trunc->moveBefore(I);
+ I->setOperand(i, Trunc);
+ }
+ }
+ }
+}
+
+void IRPromoter::Cleanup() {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Cleanup..\n");
+ // Some zexts will now have become redundant, along with their trunc
+ // operands, so remove them.
+ for (auto *V : Visited) {
+ if (!isa<ZExtInst>(V))
+ continue;
+
+ auto ZExt = cast<ZExtInst>(V);
+ if (ZExt->getDestTy() != ExtTy)
+ continue;
+
+ Value *Src = ZExt->getOperand(0);
+ if (ZExt->getSrcTy() == ZExt->getDestTy()) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Removing unnecessary cast: " << *ZExt
+ << "\n");
+ ReplaceAllUsersOfWith(ZExt, Src);
+ continue;
+ }
+
+ // We've inserted a trunc for a zext sink, but we already know that the
+ // input is in range, negating the need for the trunc.
+ if (NewInsts.count(Src) && isa<TruncInst>(Src)) {
+ auto *Trunc = cast<TruncInst>(Src);
+ assert(Trunc->getOperand(0)->getType() == ExtTy &&
+ "expected inserted trunc to be operating on i32");
+ ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
+ }
+ }
+
+ for (auto *I : InstsToRemove) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Removing " << *I << "\n");
+ I->dropAllReferences();
+ }
+}
+
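+// Replace non-source truncs with an AND mask that keeps only the destination
+// type's bits, e.g. (illustrative) 'trunc i32 %x to i8' becomes
+// 'and i32 %x, 255', plus a trunc to ExtTy when the source type is wider.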
+void IRPromoter::ConvertTruncs() {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Converting truncs..\n");
+ IRBuilder<> Builder{Ctx};
+
+ for (auto *V : Visited) {
+ if (!isa<TruncInst>(V) || Sources.count(V))
+ continue;
+
+ auto *Trunc = cast<TruncInst>(V);
+ Builder.SetInsertPoint(Trunc);
+ IntegerType *SrcTy = cast<IntegerType>(Trunc->getOperand(0)->getType());
+ IntegerType *DestTy = cast<IntegerType>(TruncTysMap[Trunc][0]);
+
+ unsigned NumBits = DestTy->getScalarSizeInBits();
+ ConstantInt *Mask =
+ ConstantInt::get(SrcTy, APInt::getMaxValue(NumBits).getZExtValue());
+ Value *Masked = Builder.CreateAnd(Trunc->getOperand(0), Mask);
+ if (SrcTy != ExtTy)
+ Masked = Builder.CreateTrunc(Masked, ExtTy);
+
+ if (auto *I = dyn_cast<Instruction>(Masked))
+ NewInsts.insert(I);
+
+ ReplaceAllUsersOfWith(Trunc, Masked);
+ }
+}
+
+void IRPromoter::Mutate() {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Promoting use-def chains to "
+ << PromotedWidth << "-bits\n");
+
+ // Cache original types of the values that will likely need truncating
+ for (auto *I : Sinks) {
+ if (auto *Call = dyn_cast<CallInst>(I)) {
+ for (Value *Arg : Call->args())
+ TruncTysMap[Call].push_back(Arg->getType());
+ } else if (auto *Switch = dyn_cast<SwitchInst>(I))
+ TruncTysMap[I].push_back(Switch->getCondition()->getType());
+ else {
+ for (unsigned i = 0; i < I->getNumOperands(); ++i)
+ TruncTysMap[I].push_back(I->getOperand(i)->getType());
+ }
+ }
+ for (auto *V : Visited) {
+ if (!isa<TruncInst>(V) || Sources.count(V))
+ continue;
+ auto *Trunc = cast<TruncInst>(V);
+ TruncTysMap[Trunc].push_back(Trunc->getDestTy());
+ }
+
+ // Insert zext instructions between sources and their users.
+ ExtendSources();
+
+ // Promote visited instructions, mutating their types in place.
+ PromoteTree();
+
+ // Convert any truncs, that aren't sources, into AND masks.
+ ConvertTruncs();
+
+ // Insert trunc instructions for use by calls, stores etc...
+ TruncateSinks();
+
+ // Finally, remove unnecessary zexts and truncs, delete old instructions and
+ // clear the data structures.
+ Cleanup();
+
+ LLVM_DEBUG(dbgs() << "IR Promotion: Mutation complete\n");
+}
+
+/// We disallow booleans to make life easier when dealing with icmps but allow
+/// any other integer that fits in a scalar register. Void types are accepted
+/// so we can handle switches.
+bool TypePromotionImpl::isSupportedType(Value *V) {
+ Type *Ty = V->getType();
+
+ // Allow voids and pointers, these won't be promoted.
+ if (Ty->isVoidTy() || Ty->isPointerTy())
+ return true;
+
+ if (!isa<IntegerType>(Ty) || cast<IntegerType>(Ty)->getBitWidth() == 1 ||
+ cast<IntegerType>(Ty)->getBitWidth() > RegisterBitWidth)
+ return false;
+
+ return LessOrEqualTypeSize(V);
+}
+
+/// We accept most instructions, as well as Arguments and ConstantInts. We
+/// disallow casts other than zext and truncs and only allow calls if their
+/// return value is zeroext. We don't allow opcodes that can introduce sign
+/// bits.
+bool TypePromotionImpl::isSupportedValue(Value *V) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ switch (I->getOpcode()) {
+ default:
+ return isa<BinaryOperator>(I) && isSupportedType(I) &&
+ !GenerateSignBits(I);
+ case Instruction::GetElementPtr:
+ case Instruction::Store:
+ case Instruction::Br:
+ case Instruction::Switch:
+ return true;
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::Ret:
+ case Instruction::Load:
+ case Instruction::Trunc:
+ return isSupportedType(I);
+ case Instruction::BitCast:
+ return I->getOperand(0)->getType() == I->getType();
+ case Instruction::ZExt:
+ return isSupportedType(I->getOperand(0));
+ case Instruction::ICmp:
+ // Now that we allow smaller types than TypeSize, only allow icmps of
+ // TypeSize because smaller ones would require a trunc to be legalised.
+ // TODO: Allow icmp of smaller types, and calculate at the end
+ // whether the transform would be beneficial.
+ if (isa<PointerType>(I->getOperand(0)->getType()))
+ return true;
+ return EqualTypeSize(I->getOperand(0));
+ case Instruction::Call: {
+ // Special cases for calls as we need to check for zeroext
+ // TODO We should accept calls even if they don't have zeroext, as they
+ // can still be sinks.
+ auto *Call = cast<CallInst>(I);
+ return isSupportedType(Call) &&
+ Call->hasRetAttr(Attribute::AttrKind::ZExt);
+ }
+ }
+ } else if (isa<Constant>(V) && !isa<ConstantExpr>(V)) {
+ return isSupportedType(V);
+ } else if (isa<Argument>(V))
+ return isSupportedType(V);
+
+ return isa<BasicBlock>(V);
+}
+
+/// Check that the type of V would be promoted and that the original type is
+/// smaller than the targeted promoted type. Check that we're not trying to
+/// promote something larger than our base 'TypeSize' type.
+bool TypePromotionImpl::isLegalToPromote(Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+
+ if (SafeToPromote.count(I))
+ return true;
+
+ if (isPromotedResultSafe(I) || isSafeWrap(I)) {
+ SafeToPromote.insert(I);
+ return true;
+ }
+ return false;
+}
+
+bool TypePromotionImpl::TryToPromote(Value *V, unsigned PromotedWidth,
+ const LoopInfo &LI) {
+ Type *OrigTy = V->getType();
+ TypeSize = OrigTy->getPrimitiveSizeInBits().getFixedValue();
+ SafeToPromote.clear();
+ SafeWrap.clear();
+
+ if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IR Promotion: TryToPromote: " << *V << ", from "
+ << TypeSize << " bits to " << PromotedWidth << "\n");
+
+ SetVector<Value *> WorkList;
+ SetVector<Value *> Sources;
+ SetVector<Instruction *> Sinks;
+ SetVector<Value *> CurrentVisited;
+ WorkList.insert(V);
+
+ // Return true if V was added to the worklist as a supported instruction,
+ // if it was already visited, or if we don't need to explore it (e.g.
+ // pointer values and GEPs), and false otherwise.
+ auto AddLegalInst = [&](Value *V) {
+ if (CurrentVisited.count(V))
+ return true;
+
+ // Ignore GEPs because they don't need promoting and the constant indices
+ // will prevent the transformation.
+ if (isa<GetElementPtrInst>(V))
+ return true;
+
+ if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Can't handle: " << *V << "\n");
+ return false;
+ }
+
+ WorkList.insert(V);
+ return true;
+ };
+
+ // Iterate through, and add to, a tree of operands and users in the use-def.
+ while (!WorkList.empty()) {
+ Value *V = WorkList.pop_back_val();
+ if (CurrentVisited.count(V))
+ continue;
+
+ // Ignore non-instructions, other than arguments.
+ if (!isa<Instruction>(V) && !isSource(V))
+ continue;
+
+ // If we've already visited this value from somewhere, bail now because
+ // the tree has already been explored.
+ // TODO: This could limit the transform, i.e. if we try to promote something
+ // from an i8 and fail first, before trying an i16.
+ if (AllVisited.count(V))
+ return false;
+
+ CurrentVisited.insert(V);
+ AllVisited.insert(V);
+
+ // Calls can be both sources and sinks.
+ if (isSink(V))
+ Sinks.insert(cast<Instruction>(V));
+
+ if (isSource(V))
+ Sources.insert(V);
+
+ if (!isSink(V) && !isSource(V)) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // Visit operands of any instruction visited.
+ for (auto &U : I->operands()) {
+ if (!AddLegalInst(U))
+ return false;
+ }
+ }
+ }
+
+ // Don't visit users of a node which isn't going to be mutated unless it's
+ // a source.
+ if (isSource(V) || shouldPromote(V)) {
+ for (Use &U : V->uses()) {
+ if (!AddLegalInst(U.getUser()))
+ return false;
+ }
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "IR Promotion: Visited nodes:\n";
+ for (auto *I : CurrentVisited)
+ I->dump();
+ });
+
+ unsigned ToPromote = 0;
+ unsigned NonFreeArgs = 0;
+ unsigned NonLoopSources = 0, LoopSinks = 0;
+ SmallPtrSet<BasicBlock *, 4> Blocks;
+ for (auto *CV : CurrentVisited) {
+ if (auto *I = dyn_cast<Instruction>(CV))
+ Blocks.insert(I->getParent());
+
+ if (Sources.count(CV)) {
+ if (auto *Arg = dyn_cast<Argument>(CV))
+ if (!Arg->hasZExtAttr() && !Arg->hasSExtAttr())
+ ++NonFreeArgs;
+ if (!isa<Instruction>(CV) ||
+ !LI.getLoopFor(cast<Instruction>(CV)->getParent()))
+ ++NonLoopSources;
+ continue;
+ }
+
+ if (isa<PHINode>(CV))
+ continue;
+ if (LI.getLoopFor(cast<Instruction>(CV)->getParent()))
+ ++LoopSinks;
+ if (Sinks.count(cast<Instruction>(CV)))
+ continue;
+ ++ToPromote;
+ }
+
+ // DAG optimizations should be able to handle these cases better, especially
+ // for function arguments.
+ if (!isa<PHINode>(V) && !(LoopSinks && NonLoopSources) &&
+ (ToPromote < 2 || (Blocks.size() == 1 && NonFreeArgs > SafeWrap.size())))
+ return false;
+
+ IRPromoter Promoter(*Ctx, PromotedWidth, CurrentVisited, Sources, Sinks,
+ SafeWrap, InstsToRemove);
+ Promoter.Mutate();
+ return true;
+}
+
+bool TypePromotionImpl::run(Function &F, const TargetMachine *TM,
+ const TargetTransformInfo &TTI,
+ const LoopInfo &LI) {
+ if (DisablePromotion)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "IR Promotion: Running on " << F.getName() << "\n");
+
+ AllVisited.clear();
+ SafeToPromote.clear();
+ SafeWrap.clear();
+ bool MadeChange = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetSubtargetInfo *SubtargetInfo = TM->getSubtargetImpl(F);
+ const TargetLowering *TLI = SubtargetInfo->getTargetLowering();
+ RegisterBitWidth =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar).getFixedValue();
+ Ctx = &F.getParent()->getContext();
+
+ // Return the preferred integer width of the instruction, or zero if we
+ // shouldn't try.
+ auto GetPromoteWidth = [&](Instruction *I) -> uint32_t {
+ if (!isa<IntegerType>(I->getType()))
+ return 0;
+
+ EVT SrcVT = TLI->getValueType(DL, I->getType());
+ if (SrcVT.isSimple() && TLI->isTypeLegal(SrcVT.getSimpleVT()))
+ return 0;
+
+ if (TLI->getTypeAction(*Ctx, SrcVT) != TargetLowering::TypePromoteInteger)
+ return 0;
+
+ EVT PromotedVT = TLI->getTypeToTransformTo(*Ctx, SrcVT);
+ if (RegisterBitWidth < PromotedVT.getFixedSizeInBits()) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Couldn't find target register "
+ << "for promoted type\n");
+ return 0;
+ }
+
+ // TODO: Should we prefer to use RegisterBitWidth instead?
+ return PromotedVT.getFixedSizeInBits();
+ };
+
+ auto BBIsInLoop = [&](BasicBlock *BB) -> bool {
+ for (auto *L : LI)
+ if (L->contains(BB))
+ return true;
+ return false;
+ };
+
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (AllVisited.count(&I))
+ continue;
+
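+ // Search roots: a zext of a loop PHI, or (below) an unsigned icmp whose
+ // integer operands may be promotable.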
+ if (isa<ZExtInst>(&I) && isa<PHINode>(I.getOperand(0)) &&
+ isa<IntegerType>(I.getType()) && BBIsInLoop(&BB)) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Searching from: "
+ << *I.getOperand(0) << "\n");
+ EVT ZExtVT = TLI->getValueType(DL, I.getType());
+ Instruction *Phi = static_cast<Instruction *>(I.getOperand(0));
+ auto PromoteWidth = ZExtVT.getFixedSizeInBits();
+ if (RegisterBitWidth < PromoteWidth) {
+ LLVM_DEBUG(dbgs() << "IR Promotion: Couldn't find target "
+ << "register for ZExt type\n");
+ continue;
+ }
+ MadeChange |= TryToPromote(Phi, PromoteWidth, LI);
+ } else if (auto *ICmp = dyn_cast<ICmpInst>(&I)) {
+ // Search up from icmps to try to promote their operands.
+ // Skip signed or pointer compares
+ if (ICmp->isSigned())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "IR Promotion: Searching from: " << *ICmp << "\n");
+
+ for (auto &Op : ICmp->operands()) {
+ if (auto *OpI = dyn_cast<Instruction>(Op)) {
+ if (auto PromotedWidth = GetPromoteWidth(OpI)) {
+ MadeChange |= TryToPromote(OpI, PromotedWidth, LI);
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (!InstsToRemove.empty()) {
+ for (auto *I : InstsToRemove)
+ I->eraseFromParent();
+ InstsToRemove.clear();
+ }
+ }
+
+ AllVisited.clear();
+ SafeToPromote.clear();
+ SafeWrap.clear();
+
+ return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(TypePromotionLegacy, DEBUG_TYPE, PASS_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(TypePromotionLegacy, DEBUG_TYPE, PASS_NAME, false, false)
+
+char TypePromotionLegacy::ID = 0;
+
+bool TypePromotionLegacy::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto *TM = &TPC->getTM<TargetMachine>();
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ TypePromotionImpl TP;
+ return TP.run(F, TM, TTI, LI);
+}
+
+FunctionPass *llvm::createTypePromotionLegacyPass() {
+ return new TypePromotionLegacy();
+}
+
+PreservedAnalyses TypePromotionPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ TypePromotionImpl TP;
+
+ bool Changed = TP.run(F, TM, TTI, LI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/contrib/llvm-project/llvm/lib/CodeGen/UnreachableBlockElim.cpp
new file mode 100644
index 000000000000..f17450d264ba
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -0,0 +1,196 @@
+//===-- UnreachableBlockElim.cpp - Remove unreachable blocks for codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is an extremely simple version of the SimplifyCFG pass. Its sole
+// job is to delete LLVM basic blocks that are not reachable from the entry
+// node. To do this, it performs a simple depth first traversal of the CFG,
+// then deletes any unvisited nodes.
+//
+// Note that this pass is really a hack. In particular, the instruction
+// selectors for various targets should just not generate code for unreachable
+// blocks. Until LLVM has a more systematic way of defining instruction
+// selectors, however, we cannot really expect them to handle additional
+// complexity.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/UnreachableBlockElim.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+namespace {
+class UnreachableBlockElimLegacyPass : public FunctionPass {
+ bool runOnFunction(Function &F) override {
+ return llvm::EliminateUnreachableBlocks(F);
+ }
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ UnreachableBlockElimLegacyPass() : FunctionPass(ID) {
+ initializeUnreachableBlockElimLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+}
+char UnreachableBlockElimLegacyPass::ID = 0;
+INITIALIZE_PASS(UnreachableBlockElimLegacyPass, "unreachableblockelim",
+ "Remove unreachable blocks from the CFG", false, false)
+
+FunctionPass *llvm::createUnreachableBlockEliminationPass() {
+ return new UnreachableBlockElimLegacyPass();
+}
+
+PreservedAnalyses UnreachableBlockElimPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = llvm::EliminateUnreachableBlocks(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+namespace {
+ class UnreachableMachineBlockElim : public MachineFunctionPass {
+ bool runOnMachineFunction(MachineFunction &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ UnreachableMachineBlockElim() : MachineFunctionPass(ID) {}
+ };
+}
+char UnreachableMachineBlockElim::ID = 0;
+
+INITIALIZE_PASS(UnreachableMachineBlockElim, "unreachable-mbb-elimination",
+ "Remove unreachable machine basic blocks", false, false)
+
+char &llvm::UnreachableMachineBlockElimID = UnreachableMachineBlockElim::ID;
+
+void UnreachableMachineBlockElim::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
+ df_iterator_default_set<MachineBasicBlock*> Reachable;
+ bool ModifiedPHI = false;
+
+ MachineDominatorTree *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+
+ // Mark all reachable blocks.
+ for (MachineBasicBlock *BB : depth_first_ext(&F, Reachable))
+ (void)BB; // Mark all reachable blocks.
+
+ // Loop over all dead blocks, remembering them and deleting all instructions
+ // in them.
+ std::vector<MachineBasicBlock*> DeadBlocks;
+ for (MachineBasicBlock &BB : F) {
+ // Test for deadness.
+ if (!Reachable.count(&BB)) {
+ DeadBlocks.push_back(&BB);
+
+ // Update dominator and loop info.
+ if (MLI) MLI->removeBlock(&BB);
+ if (MDT && MDT->getNode(&BB)) MDT->eraseNode(&BB);
+
+ while (BB.succ_begin() != BB.succ_end()) {
+ MachineBasicBlock* succ = *BB.succ_begin();
+
+ for (MachineInstr &Phi : succ->phis()) {
+ for (unsigned i = Phi.getNumOperands() - 1; i >= 2; i -= 2) {
+ if (Phi.getOperand(i).isMBB() &&
+ Phi.getOperand(i).getMBB() == &BB) {
+ Phi.removeOperand(i);
+ Phi.removeOperand(i - 1);
+ }
+ }
+ }
+
+ BB.removeSuccessor(BB.succ_begin());
+ }
+ }
+ }
+
+ // Actually remove the blocks now.
+ for (MachineBasicBlock *BB : DeadBlocks) {
+ // Remove any call site information for calls in the block.
+ for (auto &I : BB->instrs())
+ if (I.shouldUpdateCallSiteInfo())
+ BB->getParent()->eraseCallSiteInfo(&I);
+
+ BB->eraseFromParent();
+ }
+
+ // Cleanup PHI nodes.
+ for (MachineBasicBlock &BB : F) {
+ // Prune unneeded PHI entries.
+ SmallPtrSet<MachineBasicBlock*, 8> preds(BB.pred_begin(),
+ BB.pred_end());
+ for (MachineInstr &Phi : make_early_inc_range(BB.phis())) {
+ for (unsigned i = Phi.getNumOperands() - 1; i >= 2; i -= 2) {
+ if (!preds.count(Phi.getOperand(i).getMBB())) {
+ Phi.removeOperand(i);
+ Phi.removeOperand(i - 1);
+ ModifiedPHI = true;
+ }
+ }
+
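+ // A PHI left with a single incoming value (operands: def, value, block)
+ // can be folded: reuse the input register directly, or fall back to a
+ // COPY when the input cannot simply replace the output.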
+ if (Phi.getNumOperands() == 3) {
+ const MachineOperand &Input = Phi.getOperand(1);
+ const MachineOperand &Output = Phi.getOperand(0);
+ Register InputReg = Input.getReg();
+ Register OutputReg = Output.getReg();
+ assert(Output.getSubReg() == 0 && "Cannot have output subregister");
+ ModifiedPHI = true;
+
+ if (InputReg != OutputReg) {
+ MachineRegisterInfo &MRI = F.getRegInfo();
+ unsigned InputSub = Input.getSubReg();
+ if (InputSub == 0 &&
+ MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg)) &&
+ !Input.isUndef()) {
+ MRI.replaceRegWith(OutputReg, InputReg);
+ } else {
+ // The input register to the PHI has a subregister or it can't be
+ // constrained to the proper register class or it is undef:
+ // insert a COPY instead of simply replacing the output
+ // with the input.
+ const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo();
+ BuildMI(BB, BB.getFirstNonPHI(), Phi.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), OutputReg)
+ .addReg(InputReg, getRegState(Input), InputSub);
+ }
+ Phi.eraseFromParent();
+ }
+ }
+ }
+ }
+
+ F.RenumberBlocks();
+
+ return (!DeadBlocks.empty() || ModifiedPHI);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/contrib/llvm-project/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
new file mode 100644
index 000000000000..fc1cbfefb0db
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
@@ -0,0 +1,1007 @@
+//===- VLIWMachineScheduler.cpp - VLIW-Focused Scheduling Pass ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/VLIWMachineScheduler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iomanip>
+#include <limits>
+#include <memory>
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> UseNewerCandidate("use-newer-candidate", cl::Hidden,
+ cl::init(true));
+
+static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level",
+ cl::Hidden, cl::init(1));
+
+// Check if the scheduler should penalize instructions that are available too
+// early due to a zero-latency dependence.
+static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden,
+ cl::init(true));
+
+// This value is used to determine if a register class is a high pressure set.
+// We compute the maximum number of registers needed and divide it by the total
+// available. Then, we compare the result to this value.
+static cl::opt<float> RPThreshold("vliw-misched-reg-pressure", cl::Hidden,
+ cl::init(0.75f),
+                                  cl::desc("High register pressure threshold."));
+
+VLIWResourceModel::VLIWResourceModel(const TargetSubtargetInfo &STI,
+ const TargetSchedModel *SM)
+ : TII(STI.getInstrInfo()), SchedModel(SM) {
+ ResourcesModel = createPacketizer(STI);
+
+ // This hard requirement could be relaxed,
+ // but for now do not let it proceed.
+ assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+ Packet.reserve(SchedModel->getIssueWidth());
+ Packet.clear();
+ ResourcesModel->clearResources();
+}
+
+void VLIWResourceModel::reset() {
+ Packet.clear();
+ ResourcesModel->clearResources();
+}
+
+VLIWResourceModel::~VLIWResourceModel() { delete ResourcesModel; }
+
+/// Return true if there is a dependence between SUd and SUu.
+bool VLIWResourceModel::hasDependence(const SUnit *SUd, const SUnit *SUu) {
+ if (SUd->Succs.size() == 0)
+ return false;
+
+ for (const auto &S : SUd->Succs) {
+    // Since we do not add pseudos to packets, we might as well
+    // ignore order dependencies.
+ if (S.isCtrl())
+ continue;
+
+ if (S.getSUnit() == SUu && S.getLatency() > 0)
+ return true;
+ }
+ return false;
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+/// It is _not_ precise (stateful); it is more like
+/// another heuristic. Many corner cases are handled
+/// empirically.
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) {
+ if (!SU || !SU->getInstr())
+ return false;
+
+ // First see if the pipeline could receive this instruction
+ // in the current cycle.
+ switch (SU->getInstr()->getOpcode()) {
+ default:
+ if (!ResourcesModel->canReserveResources(*SU->getInstr()))
+ return false;
+ break;
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR:
+ break;
+ }
+
+  // Now check that there are no dependencies on instructions already
+  // in the packet.
+ if (IsTop) {
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i)
+ if (hasDependence(Packet[i], SU))
+ return false;
+ } else {
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i)
+ if (hasDependence(SU, Packet[i]))
+ return false;
+ }
+ return true;
+}
+
+/// Keep track of available resources.
+bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) {
+ bool startNewCycle = false;
+ // Artificially reset state.
+ if (!SU) {
+ reset();
+ TotalPackets++;
+ return false;
+ }
+  // If this SU does not fit in the packet or the packet is now full,
+  // start a new one.
+ if (!isResourceAvailable(SU, IsTop) ||
+ Packet.size() >= SchedModel->getIssueWidth()) {
+ reset();
+ TotalPackets++;
+ startNewCycle = true;
+ }
+
+ switch (SU->getInstr()->getOpcode()) {
+ default:
+ ResourcesModel->reserveResources(*SU->getInstr());
+ break;
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR:
+ break;
+ }
+ Packet.push_back(SU);
+
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+ LLVM_DEBUG(dbgs() << "\t[" << i << "] SU(");
+ LLVM_DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+ LLVM_DEBUG(Packet[i]->getInstr()->dump());
+ }
+#endif
+
+ return startNewCycle;
+}
+
+DFAPacketizer *
+VLIWResourceModel::createPacketizer(const TargetSubtargetInfo &STI) const {
+ return STI.getInstrInfo()->CreateTargetScheduleState(STI);
+}
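For reference, the packetizer used above is supplied by the target through TargetInstrInfo::CreateTargetScheduleState. The sketch below shows the usual shape of that hook on a VLIW target; the MyTarget names and the TableGen-generated createDFAPacketizer helper are hypothetical placeholders, not part of this change.

// Sketch only: MyTargetInstrInfo/MyTargetSubtarget and the TableGen-generated
// createDFAPacketizer helper stand in for a real VLIW target's classes.
DFAPacketizer *MyTargetInstrInfo::CreateTargetScheduleState(
    const TargetSubtargetInfo &STI) const {
  const InstrItineraryData *II = STI.getInstrItineraryData();
  return static_cast<const MyTargetSubtarget &>(STI).createDFAPacketizer(II);
}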
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+void VLIWMachineScheduler::schedule() {
+ LLVM_DEBUG(dbgs() << "********** MI Converging Scheduling VLIW "
+ << printMBBReference(*BB) << " " << BB->getName()
+ << " in_func " << BB->getParent()->getName()
+ << " at loop depth " << MLI->getLoopDepth(BB) << " \n");
+
+ buildDAGWithRegPressure();
+
+ Topo.InitDAGTopologicalSorting();
+
+ // Postprocess the DAG to add platform-specific artificial dependencies.
+ postProcessDAG();
+
+ SmallVector<SUnit *, 8> TopRoots, BotRoots;
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+
+ // Initialize the strategy before modifying the DAG.
+ SchedImpl->initialize(this);
+
+ LLVM_DEBUG({
+ unsigned maxH = 0;
+ for (const SUnit &SU : SUnits)
+ if (SU.getHeight() > maxH)
+ maxH = SU.getHeight();
+ dbgs() << "Max Height " << maxH << "\n";
+ });
+ LLVM_DEBUG({
+ unsigned maxD = 0;
+ for (const SUnit &SU : SUnits)
+ if (SU.getDepth() > maxD)
+ maxD = SU.getDepth();
+ dbgs() << "Max Depth " << maxD << "\n";
+ });
+ LLVM_DEBUG(dump());
+ if (ViewMISchedDAGs)
+ viewGraph();
+
+ initQueues(TopRoots, BotRoots);
+
+ bool IsTopNode = false;
+ while (true) {
+ LLVM_DEBUG(
+ dbgs() << "** VLIWMachineScheduler::schedule picking next node\n");
+ SUnit *SU = SchedImpl->pickNode(IsTopNode);
+ if (!SU)
+ break;
+
+ if (!checkSchedLimit())
+ break;
+
+ scheduleMI(SU, IsTopNode);
+
+ // Notify the scheduling strategy after updating the DAG.
+ SchedImpl->schedNode(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ LLVM_DEBUG({
+ dbgs() << "*** Final schedule for "
+ << printMBBReference(*begin()->getParent()) << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
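A target is expected to instantiate this scheduler from its createMachineScheduler hook and, optionally, register it by name. The sketch below is one plausible wiring, not part of this patch: the function and registry names are hypothetical, and the copy-constrain mutation is only an example of what postProcessDAG() would then run.

// Sketch only, assuming the usual MachineScheduler registry pattern.
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/VLIWMachineScheduler.h"
#include <memory>
using namespace llvm;

static ScheduleDAGInstrs *createMyVLIWMachineSched(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = new VLIWMachineScheduler(
      C, std::make_unique<ConvergingVLIWScheduler>());
  // Optional: extra DAG mutations are picked up by postProcessDAG().
  DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static MachineSchedRegistry
    MyVLIWSchedRegistry("my-vliw", "Run the VLIW converging scheduler.",
                        createMyVLIWMachineSched);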
+
+void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
+ DAG = static_cast<VLIWMachineScheduler *>(dag);
+ SchedModel = DAG->getSchedModel();
+
+ Top.init(DAG, SchedModel);
+ Bot.init(DAG, SchedModel);
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+ // are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries();
+ const TargetSubtargetInfo &STI = DAG->MF.getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ delete Top.HazardRec;
+ delete Bot.HazardRec;
+ Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG);
+ Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+ delete Top.ResourceModel;
+ delete Bot.ResourceModel;
+ Top.ResourceModel = createVLIWResourceModel(STI, DAG->getSchedModel());
+ Bot.ResourceModel = createVLIWResourceModel(STI, DAG->getSchedModel());
+
+ const std::vector<unsigned> &MaxPressure =
+ DAG->getRegPressure().MaxSetPressure;
+ HighPressureSets.assign(MaxPressure.size(), false);
+ for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) {
+ unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i);
+ HighPressureSets[i] =
+ ((float)MaxPressure[i] > ((float)Limit * RPThreshold));
+ }
+
+ assert((!ForceTopDown || !ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+}
+
+VLIWResourceModel *ConvergingVLIWScheduler::createVLIWResourceModel(
+ const TargetSubtargetInfo &STI, const TargetSchedModel *SchedModel) const {
+ return new VLIWResourceModel(STI, SchedModel);
+}
+
+void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
+ for (const SDep &PI : SU->Preds) {
+ unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle;
+ unsigned MinLatency = PI.getLatency();
+#ifndef NDEBUG
+ Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+#endif
+ if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+ SU->TopReadyCycle = PredReadyCycle + MinLatency;
+ }
+
+ if (!SU->isScheduled)
+ Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); I != E;
+ ++I) {
+ unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+ unsigned MinLatency = I->getLatency();
+#ifndef NDEBUG
+ Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+#endif
+ if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+ SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+ }
+
+ if (!SU->isScheduled)
+ Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+ConvergingVLIWScheduler::VLIWSchedBoundary::~VLIWSchedBoundary() {
+ delete ResourceModel;
+ delete HazardRec;
+}
+
+/// Does this SU have a hazard within the current instruction group?
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) {
+ if (HazardRec->isEnabled())
+ return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+ unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+ if (IssueCount + uops > SchedModel->getIssueWidth())
+ return true;
+
+ return false;
+}
+
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode(
+ SUnit *SU, unsigned ReadyCycle) {
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ // Check for interlocks first. For the purpose of other heuristics, an
+ // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+    Pending.push(SU);
+ else
+ Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() {
+ unsigned Width = SchedModel->getIssueWidth();
+ IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+ assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
+ "MinReadyCycle uninitialized");
+ unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+ if (!HazardRec->isEnabled()) {
+ // Bypass HazardRec virtual calls.
+ CurrCycle = NextCycle;
+ } else {
+ // Bypass getHazardType calls in case of long latency.
+ for (; CurrCycle != NextCycle; ++CurrCycle) {
+ if (isTop())
+ HazardRec->AdvanceCycle();
+ else
+ HazardRec->RecedeCycle();
+ }
+ }
+ CheckPending = true;
+
+ LLVM_DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle "
+ << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) {
+ bool startNewCycle = false;
+
+ // Update the reservation table.
+ if (HazardRec->isEnabled()) {
+ if (!isTop() && SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+ HazardRec->EmitInstruction(SU);
+ }
+
+ // Update DFA model.
+ startNewCycle = ResourceModel->reserveResources(SU, isTop());
+
+ // Check the instruction group dispatch limit.
+ // TODO: Check if this SU must end a dispatch group.
+ IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+ if (startNewCycle) {
+ LLVM_DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ bumpCycle();
+ } else
+ LLVM_DEBUG(dbgs() << "*** IssueCount " << IssueCount << " at cycle "
+ << CurrCycle << '\n');
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() {
+ // If the available queue is empty, it is safe to reset MinReadyCycle.
+ if (Available.empty())
+ MinReadyCycle = std::numeric_limits<unsigned>::max();
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+ SUnit *SU = *(Pending.begin() + i);
+ unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ if (ReadyCycle > CurrCycle)
+ continue;
+
+ if (checkHazard(SU))
+ continue;
+
+ Available.push(SU);
+ Pending.remove(Pending.begin() + i);
+ --i;
+ --e;
+ }
+ CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) {
+ if (Available.isInQueue(SU))
+ Available.remove(Available.find(SU));
+ else {
+ assert(Pending.isInQueue(SU) && "bad ready count");
+ Pending.remove(Pending.find(SU));
+ }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return nullptr.
+SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
+ if (CheckPending)
+ releasePending();
+
+ auto AdvanceCycle = [this]() {
+ if (Available.empty())
+ return true;
+ if (Available.size() == 1 && Pending.size() > 0)
+ return !ResourceModel->isResourceAvailable(*Available.begin(), isTop()) ||
+ getWeakLeft(*Available.begin(), isTop()) != 0;
+ return false;
+ };
+ for (unsigned i = 0; AdvanceCycle(); ++i) {
+ assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+ "permanent hazard");
+ (void)i;
+ ResourceModel->reserveResources(nullptr, isTop());
+ bumpCycle();
+ releasePending();
+ }
+ if (Available.size() == 1)
+ return *Available.begin();
+ return nullptr;
+}
+
+#ifndef NDEBUG
+void ConvergingVLIWScheduler::traceCandidate(const char *Label,
+ const ReadyQueue &Q, SUnit *SU,
+ int Cost, PressureChange P) {
+ dbgs() << Label << " " << Q.getName() << " ";
+ if (P.isValid())
+ dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":"
+ << P.getUnitInc() << " ";
+ else
+ dbgs() << " ";
+ dbgs() << "cost(" << Cost << ")\t";
+ DAG->dumpNode(*SU);
+}
+
+// Very detailed queue dump, to be used with higher verbosity levels.
+void ConvergingVLIWScheduler::readyQueueVerboseDump(
+ const RegPressureTracker &RPTracker, SchedCandidate &Candidate,
+ ReadyQueue &Q) {
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
+
+ dbgs() << ">>> " << Q.getName() << "\n";
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ RegPressureDelta RPDelta;
+ TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ std::stringstream dbgstr;
+ dbgstr << "SU(" << std::setw(3) << (*I)->NodeNum << ")";
+ dbgs() << dbgstr.str();
+ SchedulingCost(Q, *I, Candidate, RPDelta, true);
+ dbgs() << "\t";
+ (*I)->getInstr()->dump();
+ }
+ dbgs() << "\n";
+}
+#endif
+
+/// isSingleUnscheduledPred - If SU2 is the only unscheduled predecessor
+/// of SU, return true (we may have duplicates)
+static inline bool isSingleUnscheduledPred(SUnit *SU, SUnit *SU2) {
+ if (SU->NumPredsLeft == 0)
+ return false;
+
+ for (auto &Pred : SU->Preds) {
+ // We found an available, but not scheduled, predecessor.
+ if (!Pred.getSUnit()->isScheduled && (Pred.getSUnit() != SU2))
+ return false;
+ }
+
+ return true;
+}
+
+/// isSingleUnscheduledSucc - If SU2 is the only unscheduled successor
+/// of SU, return true (we may have duplicates)
+static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) {
+ if (SU->NumSuccsLeft == 0)
+ return false;
+
+ for (auto &Succ : SU->Succs) {
+ // We found an available, but not scheduled, successor.
+ if (!Succ.getSUnit()->isScheduled && (Succ.getSUnit() != SU2))
+ return false;
+ }
+ return true;
+}
+
+/// Check if the instruction changes the register pressure of a register in a
+/// high pressure set. The function returns a negative value if the pressure
+/// decreases and a positive value if the pressure increases. If the instruction
+/// doesn't use a high pressure register or doesn't change the register
+/// pressure, then return 0.
+int ConvergingVLIWScheduler::pressureChange(const SUnit *SU, bool isBotUp) {
+ PressureDiff &PD = DAG->getPressureDiff(SU);
+ for (const auto &P : PD) {
+ if (!P.isValid())
+ continue;
+ // The pressure differences are computed bottom-up, so the comparison for
+ // an increase is positive in the bottom direction, but negative in the
+ // top-down direction.
+ if (HighPressureSets[P.getPSet()])
+ return (isBotUp ? P.getUnitInc() : -P.getUnitInc());
+ }
+ return 0;
+}
+
+/// Single point to compute overall scheduling cost.
+/// TODO: More heuristics will be used soon.
+int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
+ SchedCandidate &Candidate,
+ RegPressureDelta &Delta,
+ bool verbose) {
+ // Initial trivial priority.
+ int ResCount = 1;
+
+ // Do not waste time on a node that is already scheduled.
+ if (!SU || SU->isScheduled)
+ return ResCount;
+
+ LLVM_DEBUG(if (verbose) dbgs()
+ << ((Q.getID() == TopQID) ? "(top|" : "(bot|"));
+ // Forced priority is high.
+ if (SU->isScheduleHigh) {
+ ResCount += PriorityOne;
+ LLVM_DEBUG(dbgs() << "H|");
+ }
+
+ unsigned IsAvailableAmt = 0;
+ // Critical path first.
+ if (Q.getID() == TopQID) {
+ if (Top.isLatencyBound(SU)) {
+ LLVM_DEBUG(if (verbose) dbgs() << "LB|");
+ ResCount += (SU->getHeight() * ScaleTwo);
+ }
+
+ LLVM_DEBUG(if (verbose) {
+ std::stringstream dbgstr;
+ dbgstr << "h" << std::setw(3) << SU->getHeight() << "|";
+ dbgs() << dbgstr.str();
+ });
+
+    // If resources are available for it, increase the
+    // chance of scheduling.
+ if (Top.ResourceModel->isResourceAvailable(SU, true)) {
+ IsAvailableAmt = (PriorityTwo + PriorityThree);
+ ResCount += IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) dbgs() << "A|");
+ } else
+ LLVM_DEBUG(if (verbose) dbgs() << " |");
+ } else {
+ if (Bot.isLatencyBound(SU)) {
+ LLVM_DEBUG(if (verbose) dbgs() << "LB|");
+ ResCount += (SU->getDepth() * ScaleTwo);
+ }
+
+ LLVM_DEBUG(if (verbose) {
+ std::stringstream dbgstr;
+ dbgstr << "d" << std::setw(3) << SU->getDepth() << "|";
+ dbgs() << dbgstr.str();
+ });
+
+    // If resources are available for it, increase the
+    // chance of scheduling.
+ if (Bot.ResourceModel->isResourceAvailable(SU, false)) {
+ IsAvailableAmt = (PriorityTwo + PriorityThree);
+ ResCount += IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) dbgs() << "A|");
+ } else
+ LLVM_DEBUG(if (verbose) dbgs() << " |");
+ }
+
+ unsigned NumNodesBlocking = 0;
+ if (Q.getID() == TopQID) {
+    // How many SUs does it block from scheduling?
+    // Look at all of the successors of this node and
+    // count the ones for which this node is
+    // their sole unscheduled predecessor.
+ if (Top.isLatencyBound(SU))
+ for (const SDep &SI : SU->Succs)
+ if (isSingleUnscheduledPred(SI.getSUnit(), SU))
+ ++NumNodesBlocking;
+ } else {
+    // Count the predecessors for which this node is
+    // their sole unscheduled successor.
+ if (Bot.isLatencyBound(SU))
+ for (const SDep &PI : SU->Preds)
+ if (isSingleUnscheduledSucc(PI.getSUnit(), SU))
+ ++NumNodesBlocking;
+ }
+ ResCount += (NumNodesBlocking * ScaleTwo);
+
+ LLVM_DEBUG(if (verbose) {
+ std::stringstream dbgstr;
+ dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|";
+ dbgs() << dbgstr.str();
+ });
+
+ // Factor in reg pressure as a heuristic.
+ if (!IgnoreBBRegPressure) {
+ // Decrease priority by the amount that register pressure exceeds the limit.
+ ResCount -= (Delta.Excess.getUnitInc() * PriorityOne);
+ // Decrease priority if register pressure exceeds the limit.
+ ResCount -= (Delta.CriticalMax.getUnitInc() * PriorityOne);
+ // Decrease priority slightly if register pressure would increase over the
+ // current maximum.
+ ResCount -= (Delta.CurrentMax.getUnitInc() * PriorityTwo);
+ // If there are register pressure issues, then we remove the value added for
+ // the instruction being available. The rationale is that we really don't
+ // want to schedule an instruction that causes a spill.
+ if (IsAvailableAmt && pressureChange(SU, Q.getID() != TopQID) > 0 &&
+ (Delta.Excess.getUnitInc() || Delta.CriticalMax.getUnitInc() ||
+ Delta.CurrentMax.getUnitInc()))
+ ResCount -= IsAvailableAmt;
+ LLVM_DEBUG(if (verbose) {
+ dbgs() << "RP " << Delta.Excess.getUnitInc() << "/"
+ << Delta.CriticalMax.getUnitInc() << "/"
+ << Delta.CurrentMax.getUnitInc() << ")|";
+ });
+ }
+
+ // Give preference to a zero latency instruction if the dependent
+ // instruction is in the current packet.
+ if (Q.getID() == TopQID && getWeakLeft(SU, true) == 0) {
+ for (const SDep &PI : SU->Preds) {
+ if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() &&
+ PI.getLatency() == 0 &&
+ Top.ResourceModel->isInPacket(PI.getSUnit())) {
+ ResCount += PriorityThree;
+ LLVM_DEBUG(if (verbose) dbgs() << "Z|");
+ }
+ }
+ } else if (Q.getID() == BotQID && getWeakLeft(SU, false) == 0) {
+ for (const SDep &SI : SU->Succs) {
+ if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() &&
+ SI.getLatency() == 0 &&
+ Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+ ResCount += PriorityThree;
+ LLVM_DEBUG(if (verbose) dbgs() << "Z|");
+ }
+ }
+ }
+
+ // If the instruction has a non-zero latency dependence with an instruction in
+  // the current packet, then it should not be scheduled yet. This case occurs
+ // when the dependent instruction is scheduled in a new packet, so the
+ // scheduler updates the current cycle and pending instructions become
+ // available.
+ if (CheckEarlyAvail) {
+ if (Q.getID() == TopQID) {
+ for (const auto &PI : SU->Preds) {
+ if (PI.getLatency() > 0 &&
+ Top.ResourceModel->isInPacket(PI.getSUnit())) {
+ ResCount -= PriorityOne;
+ LLVM_DEBUG(if (verbose) dbgs() << "D|");
+ }
+ }
+ } else {
+ for (const auto &SI : SU->Succs) {
+ if (SI.getLatency() > 0 &&
+ Bot.ResourceModel->isInPacket(SI.getSUnit())) {
+ ResCount -= PriorityOne;
+ LLVM_DEBUG(if (verbose) dbgs() << "D|");
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(if (verbose) {
+ std::stringstream dbgstr;
+ dbgstr << "Total " << std::setw(4) << ResCount << ")";
+ dbgs() << dbgstr.str();
+ });
+
+ return ResCount;
+}
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingVLIWScheduler::CandResult
+ConvergingVLIWScheduler::pickNodeFromQueue(VLIWSchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate) {
+ ReadyQueue &Q = Zone.Available;
+ LLVM_DEBUG(if (SchedDebugVerboseLevel > 1)
+ readyQueueVerboseDump(RPTracker, Candidate, Q);
+ else Q.dump(););
+
+ // getMaxPressureDelta temporarily modifies the tracker.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
+
+  // Candidate.SU remains nullptr if no candidate beats the best existing one.
+ CandResult FoundCandidate = NoCand;
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ RegPressureDelta RPDelta;
+ TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+
+ int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false);
+
+ // Initialize the candidate if needed.
+ if (!Candidate.SU) {
+ LLVM_DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
+ continue;
+ }
+
+ // Choose node order for negative cost candidates. There is no good
+ // candidate in this case.
+ if (CurrentCost < 0 && Candidate.SCost < 0) {
+ if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) ||
+ (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
+ LLVM_DEBUG(traceCandidate("NCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
+ }
+ continue;
+ }
+
+ // Best cost.
+ if (CurrentCost > Candidate.SCost) {
+ LLVM_DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ continue;
+ }
+
+ // Choose an instruction that does not depend on an artificial edge.
+ unsigned CurrWeak = getWeakLeft(*I, (Q.getID() == TopQID));
+ unsigned CandWeak = getWeakLeft(Candidate.SU, (Q.getID() == TopQID));
+ if (CurrWeak != CandWeak) {
+ if (CurrWeak < CandWeak) {
+ LLVM_DEBUG(traceCandidate("WCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = Weak;
+ }
+ continue;
+ }
+
+ if (CurrentCost == Candidate.SCost && Zone.isLatencyBound(*I)) {
+ unsigned CurrSize, CandSize;
+ if (Q.getID() == TopQID) {
+ CurrSize = (*I)->Succs.size();
+ CandSize = Candidate.SU->Succs.size();
+ } else {
+ CurrSize = (*I)->Preds.size();
+ CandSize = Candidate.SU->Preds.size();
+ }
+ if (CurrSize > CandSize) {
+ LLVM_DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ }
+      // Keep the old candidate if it is better; that is, don't fall through to
+      // the subsequent tie breaker.
+ if (CurrSize != CandSize)
+ continue;
+ }
+
+ // Tie breaker.
+ // To avoid scheduling indeterminism, we need a tie breaker
+ // for the case when cost is identical for two nodes.
+ if (UseNewerCandidate && CurrentCost == Candidate.SCost) {
+ if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) ||
+ (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
+ LLVM_DEBUG(traceCandidate("TCAND", Q, *I, CurrentCost));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
+ continue;
+ }
+ }
+
+ // Fall through to original instruction order.
+ // Only consider node order if Candidate was chosen from this Q.
+ if (FoundCandidate == NoCand)
+ continue;
+ }
+ return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
+ // Schedule as far as possible in the direction of no choice. This is most
+ // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) {
+ LLVM_DEBUG(dbgs() << "Picked only Bottom\n");
+ IsTopNode = false;
+ return SU;
+ }
+ if (SUnit *SU = Top.pickOnlyChoice()) {
+ LLVM_DEBUG(dbgs() << "Picked only Top\n");
+ IsTopNode = true;
+ return SU;
+ }
+ SchedCandidate BotCand;
+ // Prefer bottom scheduling when heuristics are silent.
+ CandResult BotResult =
+ pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+ assert(BotResult != NoCand && "failed to find the first candidate");
+
+ // If either Q has a single candidate that provides the least increase in
+ // Excess pressure, we can immediately schedule from that Q.
+ //
+ // RegionCriticalPSets summarizes the pressure within the scheduled region and
+ // affects picking from either Q. If scheduling in one direction must
+ // increase pressure for one of the excess PSets, then schedule in that
+ // direction first to provide more freedom in the other direction.
+ if (BotResult == SingleExcess || BotResult == SingleCritical) {
+    LLVM_DEBUG(dbgs() << "Preferred Bottom Node\n");
+ IsTopNode = false;
+ return BotCand.SU;
+ }
+ // Check if the top Q has a better candidate.
+ SchedCandidate TopCand;
+ CandResult TopResult =
+ pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+ assert(TopResult != NoCand && "failed to find the first candidate");
+
+ if (TopResult == SingleExcess || TopResult == SingleCritical) {
+    LLVM_DEBUG(dbgs() << "Preferred Top Node\n");
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // If either Q has a single candidate that minimizes pressure above the
+ // original region's pressure pick it.
+ if (BotResult == SingleMax) {
+    LLVM_DEBUG(dbgs() << "Preferred Bottom Node SingleMax\n");
+ IsTopNode = false;
+ return BotCand.SU;
+ }
+ if (TopResult == SingleMax) {
+    LLVM_DEBUG(dbgs() << "Preferred Top Node SingleMax\n");
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ if (TopCand.SCost > BotCand.SCost) {
+    LLVM_DEBUG(dbgs() << "Preferred Top Node Cost\n");
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // Otherwise prefer the bottom candidate in node order.
+  LLVM_DEBUG(dbgs() << "Preferred Bottom in Node order\n");
+ IsTopNode = false;
+ return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return nullptr;
+ }
+ SUnit *SU;
+ if (ForceTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ SchedCandidate TopCand;
+ CandResult TopResult =
+ pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+ assert(TopResult != NoCand && "failed to find the first candidate");
+ (void)TopResult;
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } else if (ForceBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ SchedCandidate BotCand;
+ CandResult BotResult =
+ pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+ assert(BotResult != NoCand && "failed to find the first candidate");
+ (void)BotResult;
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
+ } else {
+ SU = pickNodeBidrectional(IsTopNode);
+ }
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ LLVM_DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+ << " Scheduling instruction in cycle "
+ << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << " ("
+ << reportPackets() << ")\n";
+ DAG->dumpNode(*SU));
+ return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, VLIWMachineScheduler needs
+/// to update its state based on the current cycle before MachineSchedStrategy
+/// does.
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ if (IsTopNode) {
+ Top.bumpNode(SU);
+ SU->TopReadyCycle = Top.CurrCycle;
+ } else {
+ Bot.bumpNode(SU);
+ SU->BotReadyCycle = Bot.CurrCycle;
+ }
+}
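Targets that need finer control are expected to subclass the strategy and the resource model rather than change this file. A hedged sketch of those extension points follows; the My* class names and the override bodies are hypothetical, and it assumes the hooks are virtual as declared in VLIWMachineScheduler.h.

// Sketch only: illustrates the intended extension points, not a real target.
#include "llvm/CodeGen/VLIWMachineScheduler.h"
using namespace llvm;

class MyVLIWResourceModel : public VLIWResourceModel {
public:
  using VLIWResourceModel::VLIWResourceModel;
  bool hasDependence(const SUnit *SUd, const SUnit *SUu) override {
    // A target could treat specific zero-latency pairs as packetizable here;
    // this placeholder just defers to the base implementation.
    return VLIWResourceModel::hasDependence(SUd, SUu);
  }
};

class MyConvergingVLIWScheduler : public ConvergingVLIWScheduler {
protected:
  VLIWResourceModel *
  createVLIWResourceModel(const TargetSubtargetInfo &STI,
                          const TargetSchedModel *SchedModel) const override {
    return new MyVLIWResourceModel(STI, SchedModel);
  }
};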
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/ValueTypes.cpp b/contrib/llvm-project/llvm/lib/CodeGen/ValueTypes.cpp
new file mode 100644
index 000000000000..d514e1642e29
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/ValueTypes.cpp
@@ -0,0 +1,642 @@
+//===----------- ValueTypes.cpp - Implementation of EVT methods -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TypeSize.h"
+#include "llvm/Support/WithColor.h"
+using namespace llvm;
+
+EVT EVT::changeExtendedTypeToInteger() const {
+ assert(isExtended() && "Type is not extended!");
+ LLVMContext &Context = LLVMTy->getContext();
+ return getIntegerVT(Context, getSizeInBits());
+}
+
+EVT EVT::changeExtendedVectorElementTypeToInteger() const {
+ assert(isExtended() && "Type is not extended!");
+ LLVMContext &Context = LLVMTy->getContext();
+ EVT IntTy = getIntegerVT(Context, getScalarSizeInBits());
+ return getVectorVT(Context, IntTy, getVectorElementCount());
+}
+
+EVT EVT::changeExtendedVectorElementType(EVT EltVT) const {
+ assert(isExtended() && "Type is not extended!");
+ LLVMContext &Context = LLVMTy->getContext();
+ return getVectorVT(Context, EltVT, getVectorElementCount());
+}
+
+EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) {
+ EVT VT;
+ VT.LLVMTy = IntegerType::get(Context, BitWidth);
+ assert(VT.isExtended() && "Type is not extended!");
+ return VT;
+}
+
+EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements,
+ bool IsScalable) {
+ EVT ResultVT;
+ ResultVT.LLVMTy =
+ VectorType::get(VT.getTypeForEVT(Context), NumElements, IsScalable);
+ assert(ResultVT.isExtended() && "Type is not extended!");
+ return ResultVT;
+}
+
+EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) {
+ EVT ResultVT;
+ ResultVT.LLVMTy = VectorType::get(VT.getTypeForEVT(Context), EC);
+ assert(ResultVT.isExtended() && "Type is not extended!");
+ return ResultVT;
+}
+
+bool EVT::isExtendedFloatingPoint() const {
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy->isFPOrFPVectorTy();
+}
+
+bool EVT::isExtendedInteger() const {
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy->isIntOrIntVectorTy();
+}
+
+bool EVT::isExtendedScalarInteger() const {
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy->isIntegerTy();
+}
+
+bool EVT::isExtendedVector() const {
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy->isVectorTy();
+}
+
+bool EVT::isExtended16BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 16;
+}
+
+bool EVT::isExtended32BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 32;
+}
+
+bool EVT::isExtended64BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 64;
+}
+
+bool EVT::isExtended128BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 128;
+}
+
+bool EVT::isExtended256BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 256;
+}
+
+bool EVT::isExtended512BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 512;
+}
+
+bool EVT::isExtended1024BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 1024;
+}
+
+bool EVT::isExtended2048BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 2048;
+}
+
+bool EVT::isExtendedFixedLengthVector() const {
+ return isExtendedVector() && isa<FixedVectorType>(LLVMTy);
+}
+
+bool EVT::isExtendedScalableVector() const {
+ return isExtendedVector() && isa<ScalableVectorType>(LLVMTy);
+}
+
+EVT EVT::getExtendedVectorElementType() const {
+ assert(isExtended() && "Type is not extended!");
+ return EVT::getEVT(cast<VectorType>(LLVMTy)->getElementType());
+}
+
+unsigned EVT::getExtendedVectorNumElements() const {
+ assert(isExtended() && "Type is not extended!");
+ ElementCount EC = cast<VectorType>(LLVMTy)->getElementCount();
+ if (EC.isScalable()) {
+ WithColor::warning()
+ << "The code that requested the fixed number of elements has made the "
+ "assumption that this vector is not scalable. This assumption was "
+ "not correct, and this may lead to broken code\n";
+ }
+ return EC.getKnownMinValue();
+}
+
+ElementCount EVT::getExtendedVectorElementCount() const {
+ assert(isExtended() && "Type is not extended!");
+ return cast<VectorType>(LLVMTy)->getElementCount();
+}
+
+TypeSize EVT::getExtendedSizeInBits() const {
+ assert(isExtended() && "Type is not extended!");
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LLVMTy))
+ return TypeSize::Fixed(ITy->getBitWidth());
+ if (VectorType *VTy = dyn_cast<VectorType>(LLVMTy))
+ return VTy->getPrimitiveSizeInBits();
+ llvm_unreachable("Unrecognized extended type!");
+}
+
+/// getEVTString - This function returns the value type as a string, e.g. "i32".
+std::string EVT::getEVTString() const {
+ switch (V.SimpleTy) {
+ default:
+ if (isVector())
+ return (isScalableVector() ? "nxv" : "v") +
+ utostr(getVectorElementCount().getKnownMinValue()) +
+ getVectorElementType().getEVTString();
+ if (isInteger())
+ return "i" + utostr(getSizeInBits());
+ if (isFloatingPoint())
+ return "f" + utostr(getSizeInBits());
+ llvm_unreachable("Invalid EVT!");
+ case MVT::bf16: return "bf16";
+ case MVT::ppcf128: return "ppcf128";
+ case MVT::isVoid: return "isVoid";
+ case MVT::Other: return "ch";
+ case MVT::Glue: return "glue";
+ case MVT::x86mmx: return "x86mmx";
+ case MVT::x86amx: return "x86amx";
+ case MVT::i64x8: return "i64x8";
+ case MVT::Metadata: return "Metadata";
+ case MVT::Untyped: return "Untyped";
+ case MVT::funcref: return "funcref";
+ case MVT::externref: return "externref";
+ case MVT::aarch64svcount:
+ return "aarch64svcount";
+ case MVT::spirvbuiltin:
+ return "spirvbuiltin";
+ }
+}
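The returned strings follow the MVT naming scheme. As a small hedged illustration (the helper name is hypothetical and not part of this file), the snippet below prints both simple and extended type names:

// Sketch only: prints "i32", "v4f32" and "i37" (the last is an extended type).
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

static void printSomeEVTNames(llvm::LLVMContext &Ctx) {
  using namespace llvm;
  outs() << EVT(MVT::i32).getEVTString() << '\n';
  outs() << EVT(MVT::v4f32).getEVTString() << '\n';
  outs() << EVT::getIntegerVT(Ctx, 37).getEVTString() << '\n';
}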
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void EVT::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+#endif
+
+/// getTypeForEVT - This method returns an LLVM type corresponding to the
+/// specified EVT. For integer types, this returns an unsigned type. Note
+/// that this will abort for types that cannot be represented.
+Type *EVT::getTypeForEVT(LLVMContext &Context) const {
+ // clang-format off
+ switch (V.SimpleTy) {
+ default:
+ assert(isExtended() && "Type is not extended!");
+ return LLVMTy;
+ case MVT::isVoid: return Type::getVoidTy(Context);
+ case MVT::i1: return Type::getInt1Ty(Context);
+ case MVT::i2: return Type::getIntNTy(Context, 2);
+ case MVT::i4: return Type::getIntNTy(Context, 4);
+ case MVT::i8: return Type::getInt8Ty(Context);
+ case MVT::i16: return Type::getInt16Ty(Context);
+ case MVT::i32: return Type::getInt32Ty(Context);
+ case MVT::i64: return Type::getInt64Ty(Context);
+ case MVT::i128: return IntegerType::get(Context, 128);
+ case MVT::f16: return Type::getHalfTy(Context);
+ case MVT::bf16: return Type::getBFloatTy(Context);
+ case MVT::f32: return Type::getFloatTy(Context);
+ case MVT::f64: return Type::getDoubleTy(Context);
+ case MVT::f80: return Type::getX86_FP80Ty(Context);
+ case MVT::f128: return Type::getFP128Ty(Context);
+ case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
+ case MVT::x86mmx: return Type::getX86_MMXTy(Context);
+ case MVT::aarch64svcount:
+ return TargetExtType::get(Context, "aarch64.svcount");
+ case MVT::x86amx: return Type::getX86_AMXTy(Context);
+ case MVT::i64x8: return IntegerType::get(Context, 512);
+ case MVT::externref: return Type::getWasm_ExternrefTy(Context);
+ case MVT::funcref: return Type::getWasm_FuncrefTy(Context);
+ case MVT::v1i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 1);
+ case MVT::v2i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 2);
+ case MVT::v4i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 4);
+ case MVT::v8i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 8);
+ case MVT::v16i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 16);
+ case MVT::v32i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 32);
+ case MVT::v64i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 64);
+ case MVT::v128i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 128);
+ case MVT::v256i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 256);
+ case MVT::v512i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 512);
+ case MVT::v1024i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 1024);
+ case MVT::v2048i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 2048);
+ case MVT::v128i2:
+ return FixedVectorType::get(Type::getIntNTy(Context, 2), 128);
+ case MVT::v256i2:
+ return FixedVectorType::get(Type::getIntNTy(Context, 2), 256);
+ case MVT::v64i4:
+ return FixedVectorType::get(Type::getIntNTy(Context, 4), 64);
+ case MVT::v128i4:
+ return FixedVectorType::get(Type::getIntNTy(Context, 4), 128);
+ case MVT::v1i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 1);
+ case MVT::v2i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 2);
+ case MVT::v4i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 4);
+ case MVT::v8i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 8);
+ case MVT::v16i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 16);
+ case MVT::v32i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 32);
+ case MVT::v64i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 64);
+ case MVT::v128i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 128);
+ case MVT::v256i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 256);
+ case MVT::v512i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 512);
+ case MVT::v1024i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 1024);
+ case MVT::v1i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 1);
+ case MVT::v2i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 2);
+ case MVT::v3i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 3);
+ case MVT::v4i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 4);
+ case MVT::v8i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 8);
+ case MVT::v16i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 16);
+ case MVT::v32i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 32);
+ case MVT::v64i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 64);
+ case MVT::v128i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 128);
+ case MVT::v256i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 256);
+ case MVT::v512i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 512);
+ case MVT::v1i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 1);
+ case MVT::v2i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 2);
+ case MVT::v3i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 3);
+ case MVT::v4i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 4);
+ case MVT::v5i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 5);
+ case MVT::v6i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 6);
+ case MVT::v7i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 7);
+ case MVT::v8i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 8);
+ case MVT::v9i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 9);
+ case MVT::v10i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 10);
+ case MVT::v11i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 11);
+ case MVT::v12i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 12);
+ case MVT::v16i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 16);
+ case MVT::v32i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 32);
+ case MVT::v64i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 64);
+ case MVT::v128i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 128);
+ case MVT::v256i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 256);
+ case MVT::v512i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 512);
+ case MVT::v1024i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 1024);
+ case MVT::v2048i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 2048);
+ case MVT::v1i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 1);
+ case MVT::v2i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 2);
+ case MVT::v3i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 3);
+ case MVT::v4i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 4);
+ case MVT::v8i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 8);
+ case MVT::v16i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 16);
+ case MVT::v32i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 32);
+ case MVT::v64i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 64);
+ case MVT::v128i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 128);
+ case MVT::v256i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 256);
+ case MVT::v1i128:
+ return FixedVectorType::get(Type::getInt128Ty(Context), 1);
+ case MVT::v1f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 1);
+ case MVT::v2f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 2);
+ case MVT::v3f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 3);
+ case MVT::v4f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 4);
+ case MVT::v8f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 8);
+ case MVT::v16f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 16);
+ case MVT::v32f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 32);
+ case MVT::v64f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 64);
+ case MVT::v128f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 128);
+ case MVT::v256f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 256);
+ case MVT::v512f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 512);
+ case MVT::v2bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 2);
+ case MVT::v3bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 3);
+ case MVT::v4bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 4);
+ case MVT::v8bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 8);
+ case MVT::v16bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 16);
+ case MVT::v32bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 32);
+ case MVT::v64bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 64);
+ case MVT::v128bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 128);
+ case MVT::v1f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 1);
+ case MVT::v2f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 2);
+ case MVT::v3f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 3);
+ case MVT::v4f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 4);
+ case MVT::v5f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 5);
+ case MVT::v6f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 6);
+ case MVT::v7f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 7);
+ case MVT::v8f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 8);
+ case MVT::v9f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 9);
+ case MVT::v10f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 10);
+ case MVT::v11f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 11);
+ case MVT::v12f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 12);
+ case MVT::v16f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 16);
+ case MVT::v32f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 32);
+ case MVT::v64f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 64);
+ case MVT::v128f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 128);
+ case MVT::v256f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 256);
+ case MVT::v512f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 512);
+ case MVT::v1024f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 1024);
+ case MVT::v2048f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 2048);
+ case MVT::v1f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 1);
+ case MVT::v2f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 2);
+ case MVT::v3f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 3);
+ case MVT::v4f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 4);
+ case MVT::v8f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 8);
+ case MVT::v16f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 16);
+ case MVT::v32f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 32);
+ case MVT::v64f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 64);
+ case MVT::v128f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 128);
+ case MVT::v256f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 256);
+ case MVT::nxv1i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 1);
+ case MVT::nxv2i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 2);
+ case MVT::nxv4i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 4);
+ case MVT::nxv8i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 8);
+ case MVT::nxv16i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 16);
+ case MVT::nxv32i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 32);
+ case MVT::nxv64i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 64);
+ case MVT::nxv1i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 1);
+ case MVT::nxv2i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 2);
+ case MVT::nxv4i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 4);
+ case MVT::nxv8i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 8);
+ case MVT::nxv16i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 16);
+ case MVT::nxv32i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 32);
+ case MVT::nxv64i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 64);
+ case MVT::nxv1i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 1);
+ case MVT::nxv2i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 2);
+ case MVT::nxv4i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 4);
+ case MVT::nxv8i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 8);
+ case MVT::nxv16i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 16);
+ case MVT::nxv32i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 32);
+ case MVT::nxv1i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 1);
+ case MVT::nxv2i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 2);
+ case MVT::nxv4i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 4);
+ case MVT::nxv8i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 8);
+ case MVT::nxv16i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 16);
+ case MVT::nxv32i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 32);
+ case MVT::nxv1i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 1);
+ case MVT::nxv2i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 2);
+ case MVT::nxv4i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 4);
+ case MVT::nxv8i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 8);
+ case MVT::nxv16i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 16);
+ case MVT::nxv32i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 32);
+ case MVT::nxv1f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 1);
+ case MVT::nxv2f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 2);
+ case MVT::nxv4f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 4);
+ case MVT::nxv8f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 8);
+ case MVT::nxv16f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 16);
+ case MVT::nxv32f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 32);
+ case MVT::nxv1bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 1);
+ case MVT::nxv2bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 2);
+ case MVT::nxv4bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 4);
+ case MVT::nxv8bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 8);
+ case MVT::nxv16bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 16);
+ case MVT::nxv32bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 32);
+ case MVT::nxv1f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 1);
+ case MVT::nxv2f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 2);
+ case MVT::nxv4f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 4);
+ case MVT::nxv8f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 8);
+ case MVT::nxv16f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 16);
+ case MVT::nxv1f64:
+ return ScalableVectorType::get(Type::getDoubleTy(Context), 1);
+ case MVT::nxv2f64:
+ return ScalableVectorType::get(Type::getDoubleTy(Context), 2);
+ case MVT::nxv4f64:
+ return ScalableVectorType::get(Type::getDoubleTy(Context), 4);
+ case MVT::nxv8f64:
+ return ScalableVectorType::get(Type::getDoubleTy(Context), 8);
+ case MVT::Metadata: return Type::getMetadataTy(Context);
+ }
+ // clang-format on
+}
+
+/// Return the value type corresponding to the specified type. This returns all
+/// pointers as MVT::iPTR. If HandleUnknown is true, unknown types are returned
+/// as Other, otherwise they are invalid.
+MVT MVT::getVT(Type *Ty, bool HandleUnknown) {
+ assert(Ty != nullptr && "Invalid type");
+ switch (Ty->getTypeID()) {
+ default:
+ if (HandleUnknown) return MVT(MVT::Other);
+ llvm_unreachable("Unknown type!");
+ case Type::VoidTyID:
+ return MVT::isVoid;
+ case Type::IntegerTyID:
+ return getIntegerVT(cast<IntegerType>(Ty)->getBitWidth());
+ case Type::HalfTyID: return MVT(MVT::f16);
+ case Type::BFloatTyID: return MVT(MVT::bf16);
+ case Type::FloatTyID: return MVT(MVT::f32);
+ case Type::DoubleTyID: return MVT(MVT::f64);
+ case Type::X86_FP80TyID: return MVT(MVT::f80);
+ case Type::X86_MMXTyID: return MVT(MVT::x86mmx);
+ case Type::TargetExtTyID: {
+ TargetExtType *TargetExtTy = cast<TargetExtType>(Ty);
+ if (TargetExtTy->getName() == "aarch64.svcount")
+ return MVT(MVT::aarch64svcount);
+ else if (TargetExtTy->getName().starts_with("spirv."))
+ return MVT(MVT::spirvbuiltin);
+ if (HandleUnknown)
+ return MVT(MVT::Other);
+ llvm_unreachable("Unknown target ext type!");
+ }
+ case Type::X86_AMXTyID: return MVT(MVT::x86amx);
+ case Type::FP128TyID: return MVT(MVT::f128);
+ case Type::PPC_FP128TyID: return MVT(MVT::ppcf128);
+ case Type::PointerTyID: return MVT(MVT::iPTR);
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
+ VectorType *VTy = cast<VectorType>(Ty);
+ return getVectorVT(
+ getVT(VTy->getElementType(), /*HandleUnknown=*/ false),
+ VTy->getElementCount());
+ }
+ }
+}
+
+/// getEVT - Return the value type corresponding to the specified type. This
+/// returns all pointers as MVT::iPTR. If HandleUnknown is true, unknown types
+/// are returned as Other, otherwise they are invalid.
+EVT EVT::getEVT(Type *Ty, bool HandleUnknown) {
+ switch (Ty->getTypeID()) {
+ default:
+ return MVT::getVT(Ty, HandleUnknown);
+ case Type::IntegerTyID:
+ return getIntegerVT(Ty->getContext(), cast<IntegerType>(Ty)->getBitWidth());
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
+ VectorType *VTy = cast<VectorType>(Ty);
+ return getVectorVT(Ty->getContext(),
+ getEVT(VTy->getElementType(), /*HandleUnknown=*/ false),
+ VTy->getElementCount());
+ }
+ }
+}
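As a quick check of how getEVT pairs with getTypeForEVT above, the sketch below (helper name hypothetical, not part of this file) round-trips a fixed vector type; since vector types are uniqued per LLVMContext, pointer equality holds on the way back.

// Sketch only: <4 x float> maps to MVT::v4f32 and back to the same IR type.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

static bool roundTripsV4F32(llvm::LLVMContext &Ctx) {
  using namespace llvm;
  Type *Ty = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  EVT VT = EVT::getEVT(Ty, /*HandleUnknown=*/false);
  return VT.getSimpleVT() == MVT::v4f32 && VT.getTypeForEVT(Ctx) == Ty;
}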
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MVT::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+#endif
+
+void MVT::print(raw_ostream &OS) const {
+ OS << EVT(*this).getEVTString();
+}
+
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm-project/llvm/lib/CodeGen/VirtRegMap.cpp
new file mode 100644
index 000000000000..a816bd5b52de
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -0,0 +1,647 @@
+//===- llvm/CodeGen/VirtRegMap.cpp - Virtual Register Map -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VirtRegMap class.
+//
+// It also contains implementations of the Spiller interface, which, given a
+// virtual register map and a machine function, eliminates all virtual
+// references by replacing them with physical register references - adding spill
+// code as necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "LiveDebugVariables.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumSpillSlots, "Number of spill slots allocated");
+STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting");
+
+//===----------------------------------------------------------------------===//
+// VirtRegMap implementation
+//===----------------------------------------------------------------------===//
+
+char VirtRegMap::ID = 0;
+
+INITIALIZE_PASS(VirtRegMap, "virtregmap", "Virtual Register Map", false, false)
+
+bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
+ MRI = &mf.getRegInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ TRI = mf.getSubtarget().getRegisterInfo();
+ MF = &mf;
+
+ Virt2PhysMap.clear();
+ Virt2StackSlotMap.clear();
+ Virt2SplitMap.clear();
+ Virt2ShapeMap.clear();
+
+ grow();
+ return false;
+}
+
+void VirtRegMap::grow() {
+ unsigned NumRegs = MF->getRegInfo().getNumVirtRegs();
+ Virt2PhysMap.resize(NumRegs);
+ Virt2StackSlotMap.resize(NumRegs);
+ Virt2SplitMap.resize(NumRegs);
+}
+
+void VirtRegMap::assignVirt2Phys(Register virtReg, MCPhysReg physReg) {
+ assert(virtReg.isVirtual() && Register::isPhysicalRegister(physReg));
+ assert(Virt2PhysMap[virtReg.id()] == NO_PHYS_REG &&
+ "attempt to assign physical register to already mapped "
+ "virtual register");
+ assert(!getRegInfo().isReserved(physReg) &&
+ "Attempt to map virtReg to a reserved physReg");
+ Virt2PhysMap[virtReg.id()] = physReg;
+}
+
+unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
+ unsigned Size = TRI->getSpillSize(*RC);
+ Align Alignment = TRI->getSpillAlign(*RC);
+ // Set preferred alignment if we are still able to realign the stack
+ auto &ST = MF->getSubtarget();
+ Align CurrentAlign = ST.getFrameLowering()->getStackAlign();
+ if (Alignment > CurrentAlign && !ST.getRegisterInfo()->canRealignStack(*MF)) {
+ Alignment = CurrentAlign;
+ }
+ int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Alignment);
+ ++NumSpillSlots;
+ return SS;
+}
+
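+// A worked example of the clamping above, with made-up numbers: if
+// getSpillAlign(*RC) == Align(32) but getStackAlign() == Align(16) and
+// canRealignStack(*MF) is false, the spill slot is created with Align(16);
+// otherwise the preferred Align(32) is kept.
+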
+bool VirtRegMap::hasPreferredPhys(Register VirtReg) const {
+ Register Hint = MRI->getSimpleHint(VirtReg);
+ if (!Hint.isValid())
+ return false;
+ if (Hint.isVirtual())
+ Hint = getPhys(Hint);
+ return Register(getPhys(VirtReg)) == Hint;
+}
+
+bool VirtRegMap::hasKnownPreference(Register VirtReg) const {
+ std::pair<unsigned, Register> Hint = MRI->getRegAllocationHint(VirtReg);
+ if (Hint.second.isPhysical())
+ return true;
+ if (Hint.second.isVirtual())
+ return hasPhys(Hint.second);
+ return false;
+}
+
+int VirtRegMap::assignVirt2StackSlot(Register virtReg) {
+ assert(virtReg.isVirtual());
+ assert(Virt2StackSlotMap[virtReg.id()] == NO_STACK_SLOT &&
+ "attempt to assign stack slot to already spilled register");
+ const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
+ return Virt2StackSlotMap[virtReg.id()] = createSpillSlot(RC);
+}
+
+void VirtRegMap::assignVirt2StackSlot(Register virtReg, int SS) {
+ assert(virtReg.isVirtual());
+ assert(Virt2StackSlotMap[virtReg.id()] == NO_STACK_SLOT &&
+ "attempt to assign stack slot to already spilled register");
+ assert((SS >= 0 ||
+ (SS >= MF->getFrameInfo().getObjectIndexBegin())) &&
+ "illegal fixed frame index");
+ Virt2StackSlotMap[virtReg.id()] = SS;
+}
+
+void VirtRegMap::print(raw_ostream &OS, const Module*) const {
+ OS << "********** REGISTER MAP **********\n";
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) {
+ OS << '[' << printReg(Reg, TRI) << " -> "
+ << printReg(Virt2PhysMap[Reg], TRI) << "] "
+ << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
+ }
+ }
+
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ Register Reg = Register::index2VirtReg(i);
+ if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) {
+ OS << '[' << printReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg]
+ << "] " << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
+ }
+ }
+ OS << '\n';
+}
+
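+// For illustration, the output of print() looks roughly like the following
+// (register names and classes are hypothetical x86-64 examples):
+//
+//   ********** REGISTER MAP **********
+//   [%0 -> $rdi] GR64
+//   [%1 -> $eax] GR32
+//   [%2 -> fi#0] GR32
+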
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void VirtRegMap::dump() const {
+ print(dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// VirtRegRewriter
+//===----------------------------------------------------------------------===//
+//
+// The VirtRegRewriter is the last of the register allocator passes.
+// It rewrites virtual registers to physical registers as specified in the
+// VirtRegMap analysis. It also updates live-in information on basic blocks
+// according to LiveIntervals.
+//
+namespace {
+
+class VirtRegRewriter : public MachineFunctionPass {
+ MachineFunction *MF = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ SlotIndexes *Indexes = nullptr;
+ LiveIntervals *LIS = nullptr;
+ VirtRegMap *VRM = nullptr;
+ LiveDebugVariables *DebugVars = nullptr;
+ DenseSet<Register> RewriteRegs;
+ bool ClearVirtRegs;
+
+ void rewrite();
+ void addMBBLiveIns();
+ bool readsUndefSubreg(const MachineOperand &MO) const;
+ void addLiveInsForSubRanges(const LiveInterval &LI, MCRegister PhysReg) const;
+ void handleIdentityCopy(MachineInstr &MI);
+ void expandCopyBundle(MachineInstr &MI) const;
+ bool subRegLiveThrough(const MachineInstr &MI, MCRegister SuperPhysReg) const;
+
+public:
+ static char ID;
+ VirtRegRewriter(bool ClearVirtRegs_ = true) :
+ MachineFunctionPass(ID),
+ ClearVirtRegs(ClearVirtRegs_) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction&) override;
+
+ MachineFunctionProperties getSetProperties() const override {
+ if (ClearVirtRegs) {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ return MachineFunctionProperties();
+ }
+};
+
+} // end anonymous namespace
+
+char VirtRegRewriter::ID = 0;
+
+char &llvm::VirtRegRewriterID = VirtRegRewriter::ID;
+
+INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter",
+ "Virtual Register Rewriter", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter",
+ "Virtual Register Rewriter", false, false)
+
+void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveDebugVariables>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
+ AU.addRequired<VirtRegMap>();
+
+ if (!ClearVirtRegs)
+ AU.addPreserved<LiveDebugVariables>();
+
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
+ MF = &fn;
+ TRI = MF->getSubtarget().getRegisterInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ MRI = &MF->getRegInfo();
+ Indexes = &getAnalysis<SlotIndexes>();
+ LIS = &getAnalysis<LiveIntervals>();
+ VRM = &getAnalysis<VirtRegMap>();
+ DebugVars = getAnalysisIfAvailable<LiveDebugVariables>();
+ LLVM_DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
+ << "********** Function: " << MF->getName() << '\n');
+ LLVM_DEBUG(VRM->dump());
+
+ // Add kill flags while we still have virtual registers.
+ LIS->addKillFlags(VRM);
+
+ // Live-in lists on basic blocks are required for physregs.
+ addMBBLiveIns();
+
+ // Rewrite virtual registers.
+ rewrite();
+
+ if (DebugVars && ClearVirtRegs) {
+ // Write out new DBG_VALUE instructions.
+
+ // We only do this if ClearVirtRegs is specified since this should be the
+ // final run of the pass and we don't want to emit them multiple times.
+ DebugVars->emitDebugValues(VRM);
+
+ // All machine operands and other references to virtual registers have been
+ // replaced. Remove the virtual registers and release all the transient data.
+ VRM->clearAllVirt();
+ MRI->clearVirtRegs();
+ }
+
+ return true;
+}
+
+void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI,
+ MCRegister PhysReg) const {
+ assert(!LI.empty());
+ assert(LI.hasSubRanges());
+
+ using SubRangeIteratorPair =
+ std::pair<const LiveInterval::SubRange *, LiveInterval::const_iterator>;
+
+ SmallVector<SubRangeIteratorPair, 4> SubRanges;
+ SlotIndex First;
+ SlotIndex Last;
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ SubRanges.push_back(std::make_pair(&SR, SR.begin()));
+ if (!First.isValid() || SR.segments.front().start < First)
+ First = SR.segments.front().start;
+ if (!Last.isValid() || SR.segments.back().end > Last)
+ Last = SR.segments.back().end;
+ }
+
+ // Check all mbb start positions between First and Last while
+ // simultaneously advancing an iterator for each subrange.
+ for (SlotIndexes::MBBIndexIterator MBBI = Indexes->findMBBIndex(First);
+ MBBI != Indexes->MBBIndexEnd() && MBBI->first <= Last; ++MBBI) {
+ SlotIndex MBBBegin = MBBI->first;
+ // Advance all subrange iterators so that their end position is just
+ // behind MBBBegin (or the iterator is at the end).
+ LaneBitmask LaneMask;
+ for (auto &RangeIterPair : SubRanges) {
+ const LiveInterval::SubRange *SR = RangeIterPair.first;
+ LiveInterval::const_iterator &SRI = RangeIterPair.second;
+ while (SRI != SR->end() && SRI->end <= MBBBegin)
+ ++SRI;
+ if (SRI == SR->end())
+ continue;
+ if (SRI->start <= MBBBegin)
+ LaneMask |= SR->LaneMask;
+ }
+ if (LaneMask.none())
+ continue;
+ MachineBasicBlock *MBB = MBBI->second;
+ MBB->addLiveIn(PhysReg, LaneMask);
+ }
+}
+
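+// A sketch with hypothetical lane masks: if only the subrange covering the
+// low lanes (say LaneMask 0x3) is live across a block boundary while the high
+// lanes (LaneMask 0xC) are not, the loop above accumulates only 0x3 and the
+// block receives
+//   MBB->addLiveIn(PhysReg, LaneBitmask(0x3));
+// rather than a live-in covering all lanes of the register.
+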
+// Compute MBB live-in lists from virtual register live ranges and their
+// assignments.
+void VirtRegRewriter::addMBBLiveIns() {
+ for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+ Register VirtReg = Register::index2VirtReg(Idx);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ if (LI.empty() || LIS->intervalIsInOneMBB(LI))
+ continue;
+ // This is a virtual register that is live across basic blocks. Its
+ // assigned PhysReg must be marked as live-in to those blocks.
+ Register PhysReg = VRM->getPhys(VirtReg);
+ if (PhysReg == VirtRegMap::NO_PHYS_REG) {
+ // There may be no physical register assigned if only some register
+ // classes were already allocated.
+ assert(!ClearVirtRegs && "Unmapped virtual register");
+ continue;
+ }
+
+ if (LI.hasSubRanges()) {
+ addLiveInsForSubRanges(LI, PhysReg);
+ } else {
+ // Go over MBB begin positions and see if we have segments covering them.
+ // The following works because segments and the MBBIndex list are both
+ // sorted by slot indexes.
+ SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin();
+ for (const auto &Seg : LI) {
+ I = Indexes->advanceMBBIndex(I, Seg.start);
+ for (; I != Indexes->MBBIndexEnd() && I->first < Seg.end; ++I) {
+ MachineBasicBlock *MBB = I->second;
+ MBB->addLiveIn(PhysReg);
+ }
+ }
+ }
+ }
+
+ // Sort and unique MBB LiveIns as we've not checked if SubReg/PhysReg were in
+ // each MBB's LiveIns set before calling addLiveIn on them.
+ for (MachineBasicBlock &MBB : *MF)
+ MBB.sortUniqueLiveIns();
+}
+
+/// Returns true if the given machine operand \p MO only reads undefined lanes.
+/// The function only works for use operands with a subregister set.
+bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
+ // Shortcut if the operand is already marked undef.
+ if (MO.isUndef())
+ return true;
+
+ Register Reg = MO.getReg();
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex BaseIndex = LIS->getInstructionIndex(MI);
+ // This code is only meant to handle reading undefined subregisters which
+ // we couldn't properly detect before.
+ assert(LI.liveAt(BaseIndex) &&
+ "Reads of completely dead register should be marked undef already");
+ unsigned SubRegIdx = MO.getSubReg();
+ assert(SubRegIdx != 0 && LI.hasSubRanges());
+ LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ // See if any of the relevant subregister liveranges is defined at this point.
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((SR.LaneMask & UseMask).any() && SR.liveAt(BaseIndex))
+ return false;
+ }
+ return true;
+}
+
+void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) {
+ if (!MI.isIdentityCopy())
+ return;
+ LLVM_DEBUG(dbgs() << "Identity copy: " << MI);
+ ++NumIdCopies;
+
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // We may have deferred allocation of the virtual register, and the rewrite
+ // regs code doesn't handle the liveness update.
+ if (DstReg.isVirtual())
+ return;
+
+ RewriteRegs.insert(DstReg);
+
+ // Copies like:
+ // %r0 = COPY undef %r0
+ // %al = COPY %al, implicit-def %eax
+ // give us additional liveness information: The target (super-)register
+ // must not be valid before this point. Replace the COPY with a KILL
+ // instruction to maintain this information.
+ if (MI.getOperand(1).isUndef() || MI.getNumOperands() > 2) {
+ MI.setDesc(TII->get(TargetOpcode::KILL));
+ LLVM_DEBUG(dbgs() << " replace by: " << MI);
+ return;
+ }
+
+ if (Indexes)
+ Indexes->removeSingleMachineInstrFromMaps(MI);
+ MI.eraseFromBundle();
+ LLVM_DEBUG(dbgs() << " deleted.\n");
+}
+
+/// The liverange splitting logic sometimes produces bundles of copies when
+/// subregisters are involved. Expand these into a sequence of copy instructions
+/// after processing the last in the bundle. Does not update LiveIntervals,
+/// which we shouldn't need for this instruction anymore.
+void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
+ if (!MI.isCopy() && !MI.isKill())
+ return;
+
+ if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) {
+ SmallVector<MachineInstr *, 2> MIs({&MI});
+
+ // Only do this when the complete bundle is made out of COPYs and KILLs.
+ MachineBasicBlock &MBB = *MI.getParent();
+ for (MachineBasicBlock::reverse_instr_iterator I =
+ std::next(MI.getReverseIterator()), E = MBB.instr_rend();
+ I != E && I->isBundledWithSucc(); ++I) {
+ if (!I->isCopy() && !I->isKill())
+ return;
+ MIs.push_back(&*I);
+ }
+ MachineInstr *FirstMI = MIs.back();
+
+ auto anyRegsAlias = [](const MachineInstr *Dst,
+ ArrayRef<MachineInstr *> Srcs,
+ const TargetRegisterInfo *TRI) {
+ for (const MachineInstr *Src : Srcs)
+ if (Src != Dst)
+ if (TRI->regsOverlap(Dst->getOperand(0).getReg(),
+ Src->getOperand(1).getReg()))
+ return true;
+ return false;
+ };
+
+ // If any of the destination registers in the bundle of copies alias any of
+ // the source registers, try to schedule the instructions to avoid any
+ // clobbering.
+ for (int E = MIs.size(), PrevE = E; E > 1; PrevE = E) {
+ for (int I = E; I--; )
+ if (!anyRegsAlias(MIs[I], ArrayRef(MIs).take_front(E), TRI)) {
+ if (I + 1 != E)
+ std::swap(MIs[I], MIs[E - 1]);
+ --E;
+ }
+ if (PrevE == E) {
+ MF->getFunction().getContext().emitError(
+ "register rewriting failed: cycle in copy bundle");
+ break;
+ }
+ }
+
+ MachineInstr *BundleStart = FirstMI;
+ for (MachineInstr *BundledMI : llvm::reverse(MIs)) {
+ // If instruction is in the middle of the bundle, move it before the
+ // bundle starts, otherwise, just unbundle it. When we get to the last
+ // instruction, the bundle will have been completely undone.
+ if (BundledMI != BundleStart) {
+ BundledMI->removeFromBundle();
+ MBB.insert(BundleStart, BundledMI);
+ } else if (BundledMI->isBundledWithSucc()) {
+ BundledMI->unbundleFromSucc();
+ BundleStart = &*std::next(BundledMI->getIterator());
+ }
+
+ if (Indexes && BundledMI != FirstMI)
+ Indexes->insertMachineInstrInMaps(*BundledMI);
+ }
+ }
+}
+
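+// A sketch of the reordering above with hypothetical registers: for a bundle
+// of parallel copies
+//
+//   $r1 = COPY $r2
+//   $r0 = COPY $r1
+//
+// the destination of the first copy ($r1) aliases the source of the second,
+// so the scheduling loop emits "$r0 = COPY $r1" first and the expanded
+// sequence reads the old value of $r1 before it is overwritten. If every
+// remaining destination aliases some remaining source, the "cycle in copy
+// bundle" error above is reported instead.
+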
+/// Check whether (part of) \p SuperPhysReg is live through \p MI.
+/// \pre \p MI defines a subregister of a virtual register that
+/// has been assigned to \p SuperPhysReg.
+bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
+ MCRegister SuperPhysReg) const {
+ SlotIndex MIIndex = LIS->getInstructionIndex(MI);
+ SlotIndex BeforeMIUses = MIIndex.getBaseIndex();
+ SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex();
+ for (MCRegUnit Unit : TRI->regunits(SuperPhysReg)) {
+ const LiveRange &UnitRange = LIS->getRegUnit(Unit);
+ // If the regunit is live both before and after MI,
+ // we assume it is live through.
+ // Generally speaking, this is not true, because something like
+ // "RU = op RU" would match that description.
+ // However, we know that we are trying to assess whether
+ // a def of a virtual reg, vreg, is live at the same time of RU.
+ // If we are in the "RU = op RU" situation, that means that vreg
+ // is defined at the same time as RU (i.e., "vreg, RU = op RU").
+ // Thus, vreg and RU interfere and vreg cannot be assigned to
+ // SuperPhysReg. Therefore, this situation cannot happen.
+ if (UnitRange.liveAt(AfterMIDefs) && UnitRange.liveAt(BeforeMIUses))
+ return true;
+ }
+ return false;
+}
+
+void VirtRegRewriter::rewrite() {
+ bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
+ SmallVector<Register, 8> SuperDeads;
+ SmallVector<Register, 8> SuperDefs;
+ SmallVector<Register, 8> SuperKills;
+
+ for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+ MBBI != MBBE; ++MBBI) {
+ LLVM_DEBUG(MBBI->print(dbgs(), Indexes));
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBBI->instrs())) {
+ for (MachineOperand &MO : MI.operands()) {
+ // Make sure MRI knows about registers clobbered by regmasks.
+ if (MO.isRegMask())
+ MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
+
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+ Register VirtReg = MO.getReg();
+ MCRegister PhysReg = VRM->getPhys(VirtReg);
+ if (PhysReg == VirtRegMap::NO_PHYS_REG)
+ continue;
+
+ assert(Register(PhysReg).isPhysical());
+
+ RewriteRegs.insert(PhysReg);
+ assert(!MRI->isReserved(PhysReg) && "Reserved register assignment");
+
+ // Preserve semantics of sub-register operands.
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ if (NoSubRegLiveness || !MRI->shouldTrackSubRegLiveness(VirtReg)) {
+ // A virtual register kill refers to the whole register, so we may
+ // have to add implicit killed operands for the super-register. A
+ // partial redef always kills and redefines the super-register.
+ if ((MO.readsReg() && (MO.isDef() || MO.isKill())) ||
+ (MO.isDef() && subRegLiveThrough(MI, PhysReg)))
+ SuperKills.push_back(PhysReg);
+
+ if (MO.isDef()) {
+ // Also add implicit defs for the super-register.
+ if (MO.isDead())
+ SuperDeads.push_back(PhysReg);
+ else
+ SuperDefs.push_back(PhysReg);
+ }
+ } else {
+ if (MO.isUse()) {
+ if (readsUndefSubreg(MO))
+ // We need to add an <undef> flag if the subregister is
+ // completely undefined (and we are not adding super-register
+ // defs).
+ MO.setIsUndef(true);
+ } else if (!MO.isDead()) {
+ assert(MO.isDef());
+ }
+ }
+
+ // The def undef and def internal flags only make sense for
+ // sub-register defs, and we are substituting a full physreg. An
+ // implicit killed operand from the SuperKills list will represent the
+ // partial read of the super-register.
+ if (MO.isDef()) {
+ MO.setIsUndef(false);
+ MO.setIsInternalRead(false);
+ }
+
+ // PhysReg operands cannot have subregister indexes.
+ PhysReg = TRI->getSubReg(PhysReg, SubReg);
+ assert(PhysReg.isValid() && "Invalid SubReg for physical register");
+ MO.setSubReg(0);
+ }
+ // Rewrite. Note we could have used MachineOperand::substPhysReg(), but
+ // we need the inlining here.
+ MO.setReg(PhysReg);
+ MO.setIsRenamable(true);
+ }
+
+ // Add any missing super-register kills after rewriting the whole
+ // instruction.
+ while (!SuperKills.empty())
+ MI.addRegisterKilled(SuperKills.pop_back_val(), TRI, true);
+
+ while (!SuperDeads.empty())
+ MI.addRegisterDead(SuperDeads.pop_back_val(), TRI, true);
+
+ while (!SuperDefs.empty())
+ MI.addRegisterDefined(SuperDefs.pop_back_val(), TRI);
+
+ LLVM_DEBUG(dbgs() << "> " << MI);
+
+ expandCopyBundle(MI);
+
+ // We can remove identity copies right now.
+ handleIdentityCopy(MI);
+ }
+ }
+
+ if (LIS) {
+ // Don't bother maintaining accurate LiveIntervals for registers which were
+ // already allocated.
+ for (Register PhysReg : RewriteRegs) {
+ for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+ LIS->removeRegUnit(Unit);
+ }
+ }
+ }
+
+ RewriteRegs.clear();
+}
+
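+// A sketch of the rewrite on hypothetical x86-64 code, with %0 (a 64-bit
+// vreg) assigned to $rax, %1 to $ecx, and %2 to $edx:
+//
+//   %1:gr32 = MOV32ri 1                ->  renamable $ecx = MOV32ri 1
+//   %2:gr32 = COPY killed %0.sub_32bit ->  renamable $edx = COPY killed $eax,
+//                                            implicit killed $rax
+//
+// For the sub-register use, the index is folded into the physical register
+// ($rax:sub_32bit becomes $eax) and, when sub-register liveness is not
+// tracked, an implicit operand on the super-register is added as described
+// in rewrite().
+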
+FunctionPass *llvm::createVirtRegRewriter(bool ClearVirtRegs) {
+ return new VirtRegRewriter(ClearVirtRegs);
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/WasmEHPrepare.cpp b/contrib/llvm-project/llvm/lib/CodeGen/WasmEHPrepare.cpp
new file mode 100644
index 000000000000..cc04807e8455
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/WasmEHPrepare.cpp
@@ -0,0 +1,377 @@
+//===-- WasmEHPrepare - Prepare exception handling for WebAssembly -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators that use the
+// WebAssembly exception handling scheme. It currently supports C++
+// exceptions.
+//
+// WebAssembly exception handling uses Windows exception IR for the middle level
+// representation. This pass does the following transformation for every
+// catchpad block:
+// (In C-style pseudocode)
+//
+// - Before:
+// catchpad ...
+// exn = wasm.get.exception();
+// selector = wasm.get.selector();
+// ...
+//
+// - After:
+// catchpad ...
+// exn = wasm.catch(WebAssembly::CPP_EXCEPTION);
+// // Only add below in case it's not a single catch (...)
+// wasm.landingpad.index(index);
+// __wasm_lpad_context.lpad_index = index;
+// __wasm_lpad_context.lsda = wasm.lsda();
+// _Unwind_CallPersonality(exn);
+// selector = __wasm_lpad_context.selector;
+// ...
+//
+//
+// * Background: Direct personality function call
+// In WebAssembly EH, the VM is responsible for unwinding the stack once an
+// exception is thrown. After the stack is unwound, the control flow is
+// transferred to the WebAssembly 'catch' instruction.
+//
+// Unwinding the stack is not done by libunwind but by the VM, so the personality
+// function in libcxxabi cannot be called from libunwind during the unwinding
+// process. So after a catch instruction, we insert a call to a wrapper function
+// in libunwind that in turn calls the real personality function.
+//
+// In Itanium EH, if the personality function decides there is no matching catch
+// clause in a call frame and no cleanup action to perform, the unwinder doesn't
+// stop there and continues unwinding. But in Wasm EH, the unwinder stops at
+// every call frame with a catch instruction, after which the personality
+// function is called from the compiler-generated user code here.
+//
+// In libunwind, we have this struct that serves as a communication channel
+// between the compiler-generated user code and the personality function in
+// libcxxabi.
+//
+// struct _Unwind_LandingPadContext {
+// uintptr_t lpad_index;
+// uintptr_t lsda;
+// uintptr_t selector;
+// };
+// struct _Unwind_LandingPadContext __wasm_lpad_context = ...;
+//
+// And this wrapper in libunwind calls the personality function.
+//
+// _Unwind_Reason_Code _Unwind_CallPersonality(void *exception_ptr) {
+// struct _Unwind_Exception *exception_obj =
+// (struct _Unwind_Exception *)exception_ptr;
+// _Unwind_Reason_Code ret = __gxx_personality_v0(
+// 1, _UA_CLEANUP_PHASE, exception_obj->exception_class, exception_obj,
+// (struct _Unwind_Context *)__wasm_lpad_context);
+// return ret;
+// }
+//
+// We pass a landing pad index, and the address of LSDA for the current function
+// to the wrapper function _Unwind_CallPersonality in libunwind, and we retrieve
+// the selector after it returns.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/WasmEHFuncInfo.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsWebAssembly.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasmehprepare"
+
+namespace {
+class WasmEHPrepare : public FunctionPass {
+ Type *LPadContextTy = nullptr; // type of 'struct _Unwind_LandingPadContext'
+ GlobalVariable *LPadContextGV = nullptr; // __wasm_lpad_context
+
+ // Field addresses of struct _Unwind_LandingPadContext
+ Value *LPadIndexField = nullptr; // lpad_index field
+ Value *LSDAField = nullptr; // lsda field
+ Value *SelectorField = nullptr; // selector
+
+ Function *ThrowF = nullptr; // wasm.throw() intrinsic
+ Function *LPadIndexF = nullptr; // wasm.landingpad.index() intrinsic
+ Function *LSDAF = nullptr; // wasm.lsda() intrinsic
+ Function *GetExnF = nullptr; // wasm.get.exception() intrinsic
+ Function *CatchF = nullptr; // wasm.catch() intrinsic
+ Function *GetSelectorF = nullptr; // wasm.get.ehselector() intrinsic
+ FunctionCallee CallPersonalityF =
+ nullptr; // _Unwind_CallPersonality() wrapper
+
+ bool prepareThrows(Function &F);
+ bool prepareEHPads(Function &F);
+ void prepareEHPad(BasicBlock *BB, bool NeedPersonality, unsigned Index = 0);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ WasmEHPrepare() : FunctionPass(ID) {}
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "WebAssembly Exception handling preparation";
+ }
+};
+} // end anonymous namespace
+
+char WasmEHPrepare::ID = 0;
+INITIALIZE_PASS_BEGIN(WasmEHPrepare, DEBUG_TYPE,
+ "Prepare WebAssembly exceptions", false, false)
+INITIALIZE_PASS_END(WasmEHPrepare, DEBUG_TYPE, "Prepare WebAssembly exceptions",
+ false, false)
+
+FunctionPass *llvm::createWasmEHPass() { return new WasmEHPrepare(); }
+
+bool WasmEHPrepare::doInitialization(Module &M) {
+ IRBuilder<> IRB(M.getContext());
+ LPadContextTy = StructType::get(IRB.getInt32Ty(), // lpad_index
+ IRB.getInt8PtrTy(), // lsda
+ IRB.getInt32Ty() // selector
+ );
+ return false;
+}
+
+// Erase each of the specified BBs if it no longer has any predecessors, and
+// also erase all of its children that become dead as a result.
+template <typename Container>
+static void eraseDeadBBsAndChildren(const Container &BBs) {
+ SmallVector<BasicBlock *, 8> WL(BBs.begin(), BBs.end());
+ while (!WL.empty()) {
+ auto *BB = WL.pop_back_val();
+ if (!pred_empty(BB))
+ continue;
+ WL.append(succ_begin(BB), succ_end(BB));
+ DeleteDeadBlock(BB);
+ }
+}
+
+bool WasmEHPrepare::runOnFunction(Function &F) {
+ bool Changed = false;
+ Changed |= prepareThrows(F);
+ Changed |= prepareEHPads(F);
+ return Changed;
+}
+
+bool WasmEHPrepare::prepareThrows(Function &F) {
+ Module &M = *F.getParent();
+ IRBuilder<> IRB(F.getContext());
+ bool Changed = false;
+
+ // wasm.throw() intrinsic, which will be lowered to the wasm 'throw' instruction.
+ ThrowF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_throw);
+ // Insert an unreachable instruction after a call to @llvm.wasm.throw and
+ // delete all following instructions within the BB, and delete all the dead
+ // children of the BB as well.
+ for (User *U : ThrowF->users()) {
+ // A call to @llvm.wasm.throw() is only generated from __cxa_throw()
+ // builtin call within libcxxabi, and cannot be an InvokeInst.
+ auto *ThrowI = cast<CallInst>(U);
+ if (ThrowI->getFunction() != &F)
+ continue;
+ Changed = true;
+ auto *BB = ThrowI->getParent();
+ SmallVector<BasicBlock *, 4> Succs(successors(BB));
+ BB->erase(std::next(BasicBlock::iterator(ThrowI)), BB->end());
+ IRB.SetInsertPoint(BB);
+ IRB.CreateUnreachable();
+ eraseDeadBBsAndChildren(Succs);
+ }
+
+ return Changed;
+}
+
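+// Schematically, in IR form (operands abbreviated), prepareThrows turns
+//
+//   call void @llvm.wasm.throw(i32 %tag, ptr %obj)
+//   br label %next
+//
+// into
+//
+//   call void @llvm.wasm.throw(i32 %tag, ptr %obj)
+//   unreachable
+//
+// and %next, along with any successors left without predecessors, is erased
+// by eraseDeadBBsAndChildren.
+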
+bool WasmEHPrepare::prepareEHPads(Function &F) {
+ Module &M = *F.getParent();
+ IRBuilder<> IRB(F.getContext());
+
+ SmallVector<BasicBlock *, 16> CatchPads;
+ SmallVector<BasicBlock *, 16> CleanupPads;
+ for (BasicBlock &BB : F) {
+ if (!BB.isEHPad())
+ continue;
+ auto *Pad = BB.getFirstNonPHI();
+ if (isa<CatchPadInst>(Pad))
+ CatchPads.push_back(&BB);
+ else if (isa<CleanupPadInst>(Pad))
+ CleanupPads.push_back(&BB);
+ }
+ if (CatchPads.empty() && CleanupPads.empty())
+ return false;
+
+ if (!F.hasPersonalityFn() ||
+ !isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
+ report_fatal_error("Function '" + F.getName() +
+ "' does not have a correct Wasm personality function "
+ "'__gxx_wasm_personality_v0'");
+ }
+ assert(F.hasPersonalityFn() && "Personality function not found");
+
+ // __wasm_lpad_context global variable.
+ // This variable should be thread local. If the target does not support TLS,
+ // we depend on CoalesceFeaturesAndStripAtomics to downgrade it to
+ // a non-thread-local one, in which case we don't allow this object to be
+ // linked with other objects using shared memory.
+ LPadContextGV = cast<GlobalVariable>(
+ M.getOrInsertGlobal("__wasm_lpad_context", LPadContextTy));
+ LPadContextGV->setThreadLocalMode(GlobalValue::GeneralDynamicTLSModel);
+
+ LPadIndexField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 0,
+ "lpad_index_gep");
+ LSDAField =
+ IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 1, "lsda_gep");
+ SelectorField = IRB.CreateConstGEP2_32(LPadContextTy, LPadContextGV, 0, 2,
+ "selector_gep");
+
+ // wasm.landingpad.index() intrinsic, used to specify the landingpad index.
+ LPadIndexF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_landingpad_index);
+ // wasm.lsda() intrinsic. Returns the address of LSDA table for the current
+ // function.
+ LSDAF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_lsda);
+ // wasm.get.exception() and wasm.get.ehselector() intrinsics. Calls to these
+ // are generated in clang.
+ GetExnF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_exception);
+ GetSelectorF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_ehselector);
+
+ // wasm.catch() will be lowered down to wasm 'catch' instruction in
+ // instruction selection.
+ CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch);
+
+ // _Unwind_CallPersonality() wrapper, which calls the real personality function.
+ CallPersonalityF = M.getOrInsertFunction(
+ "_Unwind_CallPersonality", IRB.getInt32Ty(), IRB.getInt8PtrTy());
+ if (Function *F = dyn_cast<Function>(CallPersonalityF.getCallee()))
+ F->setDoesNotThrow();
+
+ unsigned Index = 0;
+ for (auto *BB : CatchPads) {
+ auto *CPI = cast<CatchPadInst>(BB->getFirstNonPHI());
+ // In case of a single catch (...), we don't need to emit a personality
+ // function call.
+ if (CPI->arg_size() == 1 &&
+ cast<Constant>(CPI->getArgOperand(0))->isNullValue())
+ prepareEHPad(BB, false);
+ else
+ prepareEHPad(BB, true, Index++);
+ }
+
+ // Cleanup pads don't need a personality function call.
+ for (auto *BB : CleanupPads)
+ prepareEHPad(BB, false);
+
+ return true;
+}
+
+// Prepare an EH pad for Wasm EH handling. If NeedPersonality is false, Index is
+// ignored.
+void WasmEHPrepare::prepareEHPad(BasicBlock *BB, bool NeedPersonality,
+ unsigned Index) {
+ assert(BB->isEHPad() && "BB is not an EHPad!");
+ IRBuilder<> IRB(BB->getContext());
+ IRB.SetInsertPoint(&*BB->getFirstInsertionPt());
+
+ auto *FPI = cast<FuncletPadInst>(BB->getFirstNonPHI());
+ Instruction *GetExnCI = nullptr, *GetSelectorCI = nullptr;
+ for (auto &U : FPI->uses()) {
+ if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
+ if (CI->getCalledOperand() == GetExnF)
+ GetExnCI = CI;
+ if (CI->getCalledOperand() == GetSelectorF)
+ GetSelectorCI = CI;
+ }
+ }
+
+ // Cleanup pads do not have any wasm.get.exception() or
+ // wasm.get.ehselector() calls, so there is nothing to do.
+ if (!GetExnCI) {
+ assert(!GetSelectorCI &&
+ "wasm.get.ehselector() cannot exist w/o wasm.get.exception()");
+ return;
+ }
+
+ // Replace wasm.get.exception intrinsic with wasm.catch intrinsic, which will
+ // be lowered to wasm 'catch' instruction. We do this mainly because
+ // instruction selection cannot handle wasm.get.exception intrinsic's token
+ // argument.
+ Instruction *CatchCI =
+ IRB.CreateCall(CatchF, {IRB.getInt32(WebAssembly::CPP_EXCEPTION)}, "exn");
+ GetExnCI->replaceAllUsesWith(CatchCI);
+ GetExnCI->eraseFromParent();
+
+ // In case it is a catchpad with a single catch (...) or a cleanuppad, we don't
+ // need to call personality function because we don't need a selector.
+ if (!NeedPersonality) {
+ if (GetSelectorCI) {
+ assert(GetSelectorCI->use_empty() &&
+ "wasm.get.ehselector() still has uses!");
+ GetSelectorCI->eraseFromParent();
+ }
+ return;
+ }
+ IRB.SetInsertPoint(CatchCI->getNextNode());
+
+ // This is to create a map of <landingpad EH label, landingpad index> in
+ // SelectionDAGISel, which is to be used in EHStreamer to emit LSDA tables.
+ // Pseudocode: wasm.landingpad.index(Index);
+ IRB.CreateCall(LPadIndexF, {FPI, IRB.getInt32(Index)});
+
+ // Pseudocode: __wasm_lpad_context.lpad_index = index;
+ IRB.CreateStore(IRB.getInt32(Index), LPadIndexField);
+
+ auto *CPI = cast<CatchPadInst>(FPI);
+ // TODO Sometimes storing the LSDA address every time is not necessary, in
+ // case it is already set in a dominating EH pad and there is no function call
+ // between that EH pad and here. Consider optimizing those cases.
+ // Pseudocode: __wasm_lpad_context.lsda = wasm.lsda();
+ IRB.CreateStore(IRB.CreateCall(LSDAF), LSDAField);
+
+ // Pseudocode: _Unwind_CallPersonality(exn);
+ CallInst *PersCI = IRB.CreateCall(CallPersonalityF, CatchCI,
+ OperandBundleDef("funclet", CPI));
+ PersCI->setDoesNotThrow();
+
+ // Pseudocode: int selector = __wasm_lpad_context.selector;
+ Instruction *Selector =
+ IRB.CreateLoad(IRB.getInt32Ty(), SelectorField, "selector");
+
+ // Replace the return value from wasm.get.ehselector() with the selector value
+ // loaded from __wasm_lpad_context.selector.
+ assert(GetSelectorCI && "wasm.get.ehselector() call does not exist");
+ GetSelectorCI->replaceAllUsesWith(Selector);
+ GetSelectorCI->eraseFromParent();
+}
+
+void llvm::calculateWasmEHInfo(const Function *F, WasmEHFuncInfo &EHInfo) {
+ // If an exception is not caught by a catchpad (i.e., it is a foreign
+ // exception), it will unwind to its parent catchswitch's unwind destination.
+ // We don't record an unwind destination for cleanuppads because every
+ // exception should be caught by them.
+ for (const auto &BB : *F) {
+ if (!BB.isEHPad())
+ continue;
+ const Instruction *Pad = BB.getFirstNonPHI();
+
+ if (const auto *CatchPad = dyn_cast<CatchPadInst>(Pad)) {
+ const auto *UnwindBB = CatchPad->getCatchSwitch()->getUnwindDest();
+ if (!UnwindBB)
+ continue;
+ const Instruction *UnwindPad = UnwindBB->getFirstNonPHI();
+ if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UnwindPad))
+ // Currently there should be only one handler per catchswitch.
+ EHInfo.setUnwindDest(&BB, *CatchSwitch->handlers().begin());
+ else // cleanuppad
+ EHInfo.setUnwindDest(&BB, UnwindBB);
+ }
+ }
+}
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/WinEHPrepare.cpp b/contrib/llvm-project/llvm/lib/CodeGen/WinEHPrepare.cpp
new file mode 100644
index 000000000000..11597b119893
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/WinEHPrepare.cpp
@@ -0,0 +1,1396 @@
+//===-- WinEHPrepare - Prepare exception handling for code generation ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers LLVM IR exception handling into something closer to what the
+// backend wants for functions using a personality function from a runtime
+// provided by MSVC. Functions with other personality functions are left alone
+// and may be prepared by other passes. In particular, all supported MSVC
+// personality functions require cleanup code to be outlined, and the C++
+// personality requires catch handler code to be outlined.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/EHPersonalities.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "winehprepare"
+
+static cl::opt<bool> DisableDemotion(
+ "disable-demotion", cl::Hidden,
+ cl::desc(
+ "Clone multicolor basic blocks but do not demote cross scopes"),
+ cl::init(false));
+
+static cl::opt<bool> DisableCleanups(
+ "disable-cleanups", cl::Hidden,
+ cl::desc("Do not remove implausible terminators or other similar cleanups"),
+ cl::init(false));
+
+static cl::opt<bool> DemoteCatchSwitchPHIOnlyOpt(
+ "demote-catchswitch-only", cl::Hidden,
+ cl::desc("Demote catchswitch BBs only (for wasm EH)"), cl::init(false));
+
+namespace {
+
+class WinEHPrepare : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ WinEHPrepare(bool DemoteCatchSwitchPHIOnly = false)
+ : FunctionPass(ID), DemoteCatchSwitchPHIOnly(DemoteCatchSwitchPHIOnly) {}
+
+ bool runOnFunction(Function &Fn) override;
+
+ bool doFinalization(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ StringRef getPassName() const override {
+ return "Windows exception handling preparation";
+ }
+
+private:
+ void insertPHIStores(PHINode *OriginalPHI, AllocaInst *SpillSlot);
+ void
+ insertPHIStore(BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot,
+ SmallVectorImpl<std::pair<BasicBlock *, Value *>> &Worklist);
+ AllocaInst *insertPHILoads(PHINode *PN, Function &F);
+ void replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
+ DenseMap<BasicBlock *, Value *> &Loads, Function &F);
+ bool prepareExplicitEH(Function &F);
+ void colorFunclets(Function &F);
+
+ void demotePHIsOnFunclets(Function &F, bool DemoteCatchSwitchPHIOnly);
+ void cloneCommonBlocks(Function &F);
+ void removeImplausibleInstructions(Function &F);
+ void cleanupPreparedFunclets(Function &F);
+ void verifyPreparedFunclets(Function &F);
+
+ bool DemoteCatchSwitchPHIOnly;
+
+ // All fields are reset by runOnFunction.
+ EHPersonality Personality = EHPersonality::Unknown;
+
+ const DataLayout *DL = nullptr;
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks;
+};
+
+} // end anonymous namespace
+
+char WinEHPrepare::ID = 0;
+INITIALIZE_PASS(WinEHPrepare, DEBUG_TYPE, "Prepare Windows exceptions",
+ false, false)
+
+FunctionPass *llvm::createWinEHPass(bool DemoteCatchSwitchPHIOnly) {
+ return new WinEHPrepare(DemoteCatchSwitchPHIOnly);
+}
+
+bool WinEHPrepare::runOnFunction(Function &Fn) {
+ if (!Fn.hasPersonalityFn())
+ return false;
+
+ // Classify the personality to see what kind of preparation we need.
+ Personality = classifyEHPersonality(Fn.getPersonalityFn());
+
+ // Do nothing if this is not a scope-based personality.
+ if (!isScopedEHPersonality(Personality))
+ return false;
+
+ DL = &Fn.getParent()->getDataLayout();
+ return prepareExplicitEH(Fn);
+}
+
+bool WinEHPrepare::doFinalization(Module &M) { return false; }
+
+void WinEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {}
+
+static int addUnwindMapEntry(WinEHFuncInfo &FuncInfo, int ToState,
+ const BasicBlock *BB) {
+ CxxUnwindMapEntry UME;
+ UME.ToState = ToState;
+ UME.Cleanup = BB;
+ FuncInfo.CxxUnwindMap.push_back(UME);
+ return FuncInfo.getLastStateNumber();
+}
+
+static void addTryBlockMapEntry(WinEHFuncInfo &FuncInfo, int TryLow,
+ int TryHigh, int CatchHigh,
+ ArrayRef<const CatchPadInst *> Handlers) {
+ WinEHTryBlockMapEntry TBME;
+ TBME.TryLow = TryLow;
+ TBME.TryHigh = TryHigh;
+ TBME.CatchHigh = CatchHigh;
+ assert(TBME.TryLow <= TBME.TryHigh);
+ for (const CatchPadInst *CPI : Handlers) {
+ WinEHHandlerType HT;
+ Constant *TypeInfo = cast<Constant>(CPI->getArgOperand(0));
+ if (TypeInfo->isNullValue())
+ HT.TypeDescriptor = nullptr;
+ else
+ HT.TypeDescriptor = cast<GlobalVariable>(TypeInfo->stripPointerCasts());
+ HT.Adjectives = cast<ConstantInt>(CPI->getArgOperand(1))->getZExtValue();
+ HT.Handler = CPI->getParent();
+ if (auto *AI =
+ dyn_cast<AllocaInst>(CPI->getArgOperand(2)->stripPointerCasts()))
+ HT.CatchObj.Alloca = AI;
+ else
+ HT.CatchObj.Alloca = nullptr;
+ TBME.HandlerArray.push_back(HT);
+ }
+ FuncInfo.TryBlockMap.push_back(TBME);
+}
+
+static BasicBlock *getCleanupRetUnwindDest(const CleanupPadInst *CleanupPad) {
+ for (const User *U : CleanupPad->users())
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
+ return CRI->getUnwindDest();
+ return nullptr;
+}
+
+static void calculateStateNumbersForInvokes(const Function *Fn,
+ WinEHFuncInfo &FuncInfo) {
+ auto *F = const_cast<Function *>(Fn);
+ DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(*F);
+ for (BasicBlock &BB : *F) {
+ auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
+ if (!II)
+ continue;
+
+ auto &BBColors = BlockColors[&BB];
+ assert(BBColors.size() == 1 && "multi-color BB not removed by preparation");
+ BasicBlock *FuncletEntryBB = BBColors.front();
+
+ BasicBlock *FuncletUnwindDest;
+ auto *FuncletPad =
+ dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI());
+ assert(FuncletPad || FuncletEntryBB == &Fn->getEntryBlock());
+ if (!FuncletPad)
+ FuncletUnwindDest = nullptr;
+ else if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
+ FuncletUnwindDest = CatchPad->getCatchSwitch()->getUnwindDest();
+ else if (auto *CleanupPad = dyn_cast<CleanupPadInst>(FuncletPad))
+ FuncletUnwindDest = getCleanupRetUnwindDest(CleanupPad);
+ else
+ llvm_unreachable("unexpected funclet pad!");
+
+ BasicBlock *InvokeUnwindDest = II->getUnwindDest();
+ int BaseState = -1;
+ if (FuncletUnwindDest == InvokeUnwindDest) {
+ auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad);
+ if (BaseStateI != FuncInfo.FuncletBaseStateMap.end())
+ BaseState = BaseStateI->second;
+ }
+
+ if (BaseState != -1) {
+ FuncInfo.InvokeStateMap[II] = BaseState;
+ } else {
+ Instruction *PadInst = InvokeUnwindDest->getFirstNonPHI();
+ assert(FuncInfo.EHPadStateMap.count(PadInst) && "EH Pad has no state!");
+ FuncInfo.InvokeStateMap[II] = FuncInfo.EHPadStateMap[PadInst];
+ }
+ }
+}
+
+// See comments below for calculateSEHStateForAsynchEH().
+// State - the incoming state along normal (non-exceptional) paths
+struct WorkItem {
+ const BasicBlock *Block;
+ int State;
+ WorkItem(const BasicBlock *BB, int St) {
+ Block = BB;
+ State = St;
+ }
+};
+void llvm::calculateCXXStateForAsynchEH(const BasicBlock *BB, int State,
+ WinEHFuncInfo &EHInfo) {
+ SmallVector<struct WorkItem *, 8> WorkList;
+ struct WorkItem *WI = new WorkItem(BB, State);
+ WorkList.push_back(WI);
+
+ while (!WorkList.empty()) {
+ WI = WorkList.pop_back_val();
+ const BasicBlock *BB = WI->Block;
+ int State = WI->State;
+ delete WI;
+ if (EHInfo.BlockToStateMap.count(BB) && EHInfo.BlockToStateMap[BB] <= State)
+ continue; // skip blocks already visited by lower State
+
+ const llvm::Instruction *I = BB->getFirstNonPHI();
+ const llvm::Instruction *TI = BB->getTerminator();
+ if (I->isEHPad())
+ State = EHInfo.EHPadStateMap[I];
+ EHInfo.BlockToStateMap[BB] = State; // Record state, also flag visiting
+
+ if ((isa<CleanupReturnInst>(TI) || isa<CatchReturnInst>(TI)) && State > 0) {
+ // Retrieve the new State
+ State = EHInfo.CxxUnwindMap[State].ToState; // Retrieve next State
+ } else if (isa<InvokeInst>(TI)) {
+ auto *Call = cast<CallBase>(TI);
+ const Function *Fn = Call->getCalledFunction();
+ if (Fn && Fn->isIntrinsic() &&
+ (Fn->getIntrinsicID() == Intrinsic::seh_scope_begin ||
+ Fn->getIntrinsicID() == Intrinsic::seh_try_begin))
+ // Retrieve the new State from seh_scope_begin
+ State = EHInfo.InvokeStateMap[cast<InvokeInst>(TI)];
+ else if (Fn && Fn->isIntrinsic() &&
+ (Fn->getIntrinsicID() == Intrinsic::seh_scope_end ||
+ Fn->getIntrinsicID() == Intrinsic::seh_try_end)) {
+ // In case of conditional ctor, let's retrieve State from Invoke
+ State = EHInfo.InvokeStateMap[cast<InvokeInst>(TI)];
+ // end of the current state; retrieve the new state from the UnwindMap
+ State = EHInfo.CxxUnwindMap[State].ToState;
+ }
+ }
+ // Continue pushing successors onto the worklist
+ for (auto *SuccBB : successors(BB)) {
+ WI = new WorkItem(SuccBB, State);
+ WorkList.push_back(WI);
+ }
+ }
+}
+
+// The central theory of this routine is based on the following:
+// A _try scope is always a SEME (Single Entry Multiple Exits) region
+// as jumping into a _try is not allowed
+// The single entry must start with a seh_try_begin() invoke with a
+// correct State number that is the initial state of the SEME.
+// Through control-flow, state number is propagated into all blocks.
+// Side exits marked by seh_try_end() will unwind to parent state via
+// existing SEHUnwindMap[].
+// Side exits can ONLY jump into parent scopes (lower state number).
+// Thus, when a block inherits various states from its predecessors,
+// the lowest State trumps the others.
+// If some exits flow to unreachable, propagation on those paths terminates,
+// not affecting remaining blocks.
+void llvm::calculateSEHStateForAsynchEH(const BasicBlock *BB, int State,
+ WinEHFuncInfo &EHInfo) {
+ SmallVector<struct WorkItem *, 8> WorkList;
+ struct WorkItem *WI = new WorkItem(BB, State);
+ WorkList.push_back(WI);
+
+ while (!WorkList.empty()) {
+ WI = WorkList.pop_back_val();
+ const BasicBlock *BB = WI->Block;
+ int State = WI->State;
+ delete WI;
+ if (EHInfo.BlockToStateMap.count(BB) && EHInfo.BlockToStateMap[BB] <= State)
+ continue; // skip blocks already visited by lower State
+
+ const llvm::Instruction *I = BB->getFirstNonPHI();
+ const llvm::Instruction *TI = BB->getTerminator();
+ if (I->isEHPad())
+ State = EHInfo.EHPadStateMap[I];
+ EHInfo.BlockToStateMap[BB] = State; // Record state
+
+ if (isa<CatchPadInst>(I) && isa<CatchReturnInst>(TI)) {
+ const Constant *FilterOrNull = cast<Constant>(
+ cast<CatchPadInst>(I)->getArgOperand(0)->stripPointerCasts());
+ const Function *Filter = dyn_cast<Function>(FilterOrNull);
+ if (!Filter || !Filter->getName().startswith("__IsLocalUnwind"))
+ State = EHInfo.SEHUnwindMap[State].ToState; // Retrieve next State
+ } else if ((isa<CleanupReturnInst>(TI) || isa<CatchReturnInst>(TI)) &&
+ State > 0) {
+ // Retrieve the new State.
+ State = EHInfo.SEHUnwindMap[State].ToState; // Retrieve next State
+ } else if (isa<InvokeInst>(TI)) {
+ auto *Call = cast<CallBase>(TI);
+ const Function *Fn = Call->getCalledFunction();
+ if (Fn && Fn->isIntrinsic() &&
+ Fn->getIntrinsicID() == Intrinsic::seh_try_begin)
+ // Retrieve the new State from seh_try_begin
+ State = EHInfo.InvokeStateMap[cast<InvokeInst>(TI)];
+ else if (Fn && Fn->isIntrinsic() &&
+ Fn->getIntrinsicID() == Intrinsic::seh_try_end)
+ // end of the current state; retrieve the new state from the UnwindMap
+ State = EHInfo.SEHUnwindMap[State].ToState;
+ }
+ // Continue pushing successors onto the worklist
+ for (auto *SuccBB : successors(BB)) {
+ WI = new WorkItem(SuccBB, State);
+ WorkList.push_back(WI);
+ }
+ }
+}
+
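+// A worked sketch of the propagation above on a hypothetical function: the
+// entry block starts in state -1; an invoke of seh_try_begin looks up the
+// __try's state in InvokeStateMap (say 0), so blocks inside the __try are
+// recorded in BlockToStateMap with state 0; at seh_try_end (or a catchret
+// with State > 0) the state reverts to SEHUnwindMap[0].ToState, i.e. -1. A
+// block reachable with both states ends up with the lower one, per the
+// "lowest State trumps" rule described above.
+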
+// Given BB which ends in an unwind edge, return the EHPad that this BB belongs
+// to. If the unwind edge came from an invoke, return null.
+static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB,
+ Value *ParentPad) {
+ const Instruction *TI = BB->getTerminator();
+ if (isa<InvokeInst>(TI))
+ return nullptr;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
+ if (CatchSwitch->getParentPad() != ParentPad)
+ return nullptr;
+ return BB;
+ }
+ assert(!TI->isEHPad() && "unexpected EHPad!");
+ auto *CleanupPad = cast<CleanupReturnInst>(TI)->getCleanupPad();
+ if (CleanupPad->getParentPad() != ParentPad)
+ return nullptr;
+ return CleanupPad->getParent();
+}
+
+// Starting from an EHPad, walk backward through the control-flow graph
+// to produce two primary outputs:
+// FuncInfo.EHPadStateMap[] and FuncInfo.CxxUnwindMap[]
+static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo,
+ const Instruction *FirstNonPHI,
+ int ParentState) {
+ const BasicBlock *BB = FirstNonPHI->getParent();
+ assert(BB->isEHPad() && "not a funclet!");
+
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FirstNonPHI)) {
+ assert(FuncInfo.EHPadStateMap.count(CatchSwitch) == 0 &&
+ "shouldn't revisit catch funclets!");
+
+ SmallVector<const CatchPadInst *, 2> Handlers;
+ for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
+ auto *CatchPad = cast<CatchPadInst>(CatchPadBB->getFirstNonPHI());
+ Handlers.push_back(CatchPad);
+ }
+ int TryLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr);
+ FuncInfo.EHPadStateMap[CatchSwitch] = TryLow;
+ for (const BasicBlock *PredBlock : predecessors(BB))
+ if ((PredBlock = getEHPadFromPredecessor(PredBlock,
+ CatchSwitch->getParentPad())))
+ calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(),
+ TryLow);
+ int CatchLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr);
+
+ // catchpads are separate funclets in C++ EH due to the way rethrow works.
+ int TryHigh = CatchLow - 1;
+
+ // MSVC FrameHandler3/4 on x64&Arm64 expect Catch Handlers in $tryMap$
+ // stored in pre-order (outer first, inner next), not post-order.
+ // Add to the map here. Fix the CatchHigh after children are processed.
+ const Module *Mod = BB->getParent()->getParent();
+ bool IsPreOrder = Triple(Mod->getTargetTriple()).isArch64Bit();
+ if (IsPreOrder)
+ addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchLow, Handlers);
+ unsigned TBMEIdx = FuncInfo.TryBlockMap.size() - 1;
+
+ for (const auto *CatchPad : Handlers) {
+ FuncInfo.FuncletBaseStateMap[CatchPad] = CatchLow;
+ FuncInfo.EHPadStateMap[CatchPad] = CatchLow;
+ for (const User *U : CatchPad->users()) {
+ const auto *UserI = cast<Instruction>(U);
+ if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) {
+ BasicBlock *UnwindDest = InnerCatchSwitch->getUnwindDest();
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
+ calculateCXXStateNumbers(FuncInfo, UserI, CatchLow);
+ }
+ if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) {
+ BasicBlock *UnwindDest = getCleanupRetUnwindDest(InnerCleanupPad);
+ // If a nested cleanup pad reports a null unwind destination and the
+ // enclosing catch pad doesn't, it must be post-dominated by an
+ // unreachable instruction.
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
+ calculateCXXStateNumbers(FuncInfo, UserI, CatchLow);
+ }
+ }
+ }
+ int CatchHigh = FuncInfo.getLastStateNumber();
+ // Now that child catches are processed, update CatchHigh
+ if (IsPreOrder)
+ FuncInfo.TryBlockMap[TBMEIdx].CatchHigh = CatchHigh;
+ else // PostOrder
+ addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchHigh, Handlers);
+
+ LLVM_DEBUG(dbgs() << "TryLow[" << BB->getName() << "]: " << TryLow << '\n');
+ LLVM_DEBUG(dbgs() << "TryHigh[" << BB->getName() << "]: " << TryHigh
+ << '\n');
+ LLVM_DEBUG(dbgs() << "CatchHigh[" << BB->getName() << "]: " << CatchHigh
+ << '\n');
+ } else {
+ auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI);
+
+ // It's possible for a cleanup to be visited twice: it might have multiple
+ // cleanupret instructions.
+ if (FuncInfo.EHPadStateMap.count(CleanupPad))
+ return;
+
+ int CleanupState = addUnwindMapEntry(FuncInfo, ParentState, BB);
+ FuncInfo.EHPadStateMap[CleanupPad] = CleanupState;
+ LLVM_DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB "
+ << BB->getName() << '\n');
+ for (const BasicBlock *PredBlock : predecessors(BB)) {
+ if ((PredBlock = getEHPadFromPredecessor(PredBlock,
+ CleanupPad->getParentPad()))) {
+ calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(),
+ CleanupState);
+ }
+ }
+ for (const User *U : CleanupPad->users()) {
+ const auto *UserI = cast<Instruction>(U);
+ if (UserI->isEHPad())
+ report_fatal_error("Cleanup funclets for the MSVC++ personality cannot "
+ "contain exceptional actions");
+ }
+ }
+}
+
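+// A worked sketch for a single top-level try/catch: the catchswitch gets
+//   TryLow   = addUnwindMapEntry(FuncInfo, -1, nullptr)   -> state 0
+//   CatchLow = addUnwindMapEntry(FuncInfo, -1, nullptr)   -> state 1
+// so TryHigh = CatchLow - 1 = 0, the lone catchpad is numbered 1, and
+// CatchHigh = getLastStateNumber() = 1. The resulting TryBlockMap entry is
+// {TryLow = 0, TryHigh = 0, CatchHigh = 1}, and invokes inside the try body
+// are numbered with state 0 by calculateStateNumbersForInvokes.
+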
+static int addSEHExcept(WinEHFuncInfo &FuncInfo, int ParentState,
+ const Function *Filter, const BasicBlock *Handler) {
+ SEHUnwindMapEntry Entry;
+ Entry.ToState = ParentState;
+ Entry.IsFinally = false;
+ Entry.Filter = Filter;
+ Entry.Handler = Handler;
+ FuncInfo.SEHUnwindMap.push_back(Entry);
+ return FuncInfo.SEHUnwindMap.size() - 1;
+}
+
+static int addSEHFinally(WinEHFuncInfo &FuncInfo, int ParentState,
+ const BasicBlock *Handler) {
+ SEHUnwindMapEntry Entry;
+ Entry.ToState = ParentState;
+ Entry.IsFinally = true;
+ Entry.Filter = nullptr;
+ Entry.Handler = Handler;
+ FuncInfo.SEHUnwindMap.push_back(Entry);
+ return FuncInfo.SEHUnwindMap.size() - 1;
+}
+
+// Starting from an EHPad, walk backward through the control-flow graph
+// to produce two primary outputs:
+// FuncInfo.EHPadStateMap[] and FuncInfo.SEHUnwindMap[]
+static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo,
+ const Instruction *FirstNonPHI,
+ int ParentState) {
+ const BasicBlock *BB = FirstNonPHI->getParent();
+ assert(BB->isEHPad() && "not a funclet!");
+
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FirstNonPHI)) {
+ assert(FuncInfo.EHPadStateMap.count(CatchSwitch) == 0 &&
+ "shouldn't revisit catch funclets!");
+
+ // Extract the filter function and the __except basic block and create a
+ // state for them.
+ assert(CatchSwitch->getNumHandlers() == 1 &&
+ "SEH doesn't have multiple handlers per __try");
+ const auto *CatchPad =
+ cast<CatchPadInst>((*CatchSwitch->handler_begin())->getFirstNonPHI());
+ const BasicBlock *CatchPadBB = CatchPad->getParent();
+ const Constant *FilterOrNull =
+ cast<Constant>(CatchPad->getArgOperand(0)->stripPointerCasts());
+ const Function *Filter = dyn_cast<Function>(FilterOrNull);
+ assert((Filter || FilterOrNull->isNullValue()) &&
+ "unexpected filter value");
+ int TryState = addSEHExcept(FuncInfo, ParentState, Filter, CatchPadBB);
+
+ // Everything in the __try block uses TryState as its parent state.
+ FuncInfo.EHPadStateMap[CatchSwitch] = TryState;
+ FuncInfo.EHPadStateMap[CatchPad] = TryState;
+ LLVM_DEBUG(dbgs() << "Assigning state #" << TryState << " to BB "
+ << CatchPadBB->getName() << '\n');
+ for (const BasicBlock *PredBlock : predecessors(BB))
+ if ((PredBlock = getEHPadFromPredecessor(PredBlock,
+ CatchSwitch->getParentPad())))
+ calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(),
+ TryState);
+
+ // Everything in the __except block unwinds to ParentState, just like code
+ // outside the __try.
+ for (const User *U : CatchPad->users()) {
+ const auto *UserI = cast<Instruction>(U);
+ if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) {
+ BasicBlock *UnwindDest = InnerCatchSwitch->getUnwindDest();
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
+ calculateSEHStateNumbers(FuncInfo, UserI, ParentState);
+ }
+ if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) {
+ BasicBlock *UnwindDest = getCleanupRetUnwindDest(InnerCleanupPad);
+ // If a nested cleanup pad reports a null unwind destination and the
+ // enclosing catch pad doesn't, it must be post-dominated by an
+ // unreachable instruction.
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
+ calculateSEHStateNumbers(FuncInfo, UserI, ParentState);
+ }
+ }
+ } else {
+ auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI);
+
+ // It's possible for a cleanup to be visited twice: it might have multiple
+ // cleanupret instructions.
+ if (FuncInfo.EHPadStateMap.count(CleanupPad))
+ return;
+
+ int CleanupState = addSEHFinally(FuncInfo, ParentState, BB);
+ FuncInfo.EHPadStateMap[CleanupPad] = CleanupState;
+ LLVM_DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB "
+ << BB->getName() << '\n');
+ for (const BasicBlock *PredBlock : predecessors(BB))
+ if ((PredBlock =
+ getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad())))
+ calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(),
+ CleanupState);
+ for (const User *U : CleanupPad->users()) {
+ const auto *UserI = cast<Instruction>(U);
+ if (UserI->isEHPad())
+ report_fatal_error("Cleanup funclets for the SEH personality cannot "
+ "contain exceptional actions");
+ }
+ }
+}
+
+static bool isTopLevelPadForMSVC(const Instruction *EHPad) {
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(EHPad))
+ return isa<ConstantTokenNone>(CatchSwitch->getParentPad()) &&
+ CatchSwitch->unwindsToCaller();
+ if (auto *CleanupPad = dyn_cast<CleanupPadInst>(EHPad))
+ return isa<ConstantTokenNone>(CleanupPad->getParentPad()) &&
+ getCleanupRetUnwindDest(CleanupPad) == nullptr;
+ if (isa<CatchPadInst>(EHPad))
+ return false;
+ llvm_unreachable("unexpected EHPad!");
+}
+
+void llvm::calculateSEHStateNumbers(const Function *Fn,
+ WinEHFuncInfo &FuncInfo) {
+ // Don't compute state numbers twice.
+ if (!FuncInfo.SEHUnwindMap.empty())
+ return;
+
+ for (const BasicBlock &BB : *Fn) {
+ if (!BB.isEHPad())
+ continue;
+ const Instruction *FirstNonPHI = BB.getFirstNonPHI();
+ if (!isTopLevelPadForMSVC(FirstNonPHI))
+ continue;
+ ::calculateSEHStateNumbers(FuncInfo, FirstNonPHI, -1);
+ }
+
+ calculateStateNumbersForInvokes(Fn, FuncInfo);
+
+ bool IsEHa = Fn->getParent()->getModuleFlag("eh-asynch");
+ if (IsEHa) {
+ const BasicBlock *EntryBB = &(Fn->getEntryBlock());
+ calculateSEHStateForAsynchEH(EntryBB, -1, FuncInfo);
+ }
+}
+
+void llvm::calculateWinCXXEHStateNumbers(const Function *Fn,
+ WinEHFuncInfo &FuncInfo) {
+ // Return if it's already been done.
+ if (!FuncInfo.EHPadStateMap.empty())
+ return;
+
+ for (const BasicBlock &BB : *Fn) {
+ if (!BB.isEHPad())
+ continue;
+ const Instruction *FirstNonPHI = BB.getFirstNonPHI();
+ if (!isTopLevelPadForMSVC(FirstNonPHI))
+ continue;
+ calculateCXXStateNumbers(FuncInfo, FirstNonPHI, -1);
+ }
+
+ calculateStateNumbersForInvokes(Fn, FuncInfo);
+
+ bool IsEHa = Fn->getParent()->getModuleFlag("eh-asynch");
+ if (IsEHa) {
+ const BasicBlock *EntryBB = &(Fn->getEntryBlock());
+ calculateCXXStateForAsynchEH(EntryBB, -1, FuncInfo);
+ }
+}
+
+static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int HandlerParentState,
+ int TryParentState, ClrHandlerType HandlerType,
+ uint32_t TypeToken, const BasicBlock *Handler) {
+ ClrEHUnwindMapEntry Entry;
+ Entry.HandlerParentState = HandlerParentState;
+ Entry.TryParentState = TryParentState;
+ Entry.Handler = Handler;
+ Entry.HandlerType = HandlerType;
+ Entry.TypeToken = TypeToken;
+ FuncInfo.ClrEHUnwindMap.push_back(Entry);
+ return FuncInfo.ClrEHUnwindMap.size() - 1;
+}
+
+void llvm::calculateClrEHStateNumbers(const Function *Fn,
+ WinEHFuncInfo &FuncInfo) {
+ // Return if it's already been done.
+ if (!FuncInfo.EHPadStateMap.empty())
+ return;
+
+ // This numbering assigns one state number to each catchpad and cleanuppad.
+ // It also computes two tree-like relations over states:
+ // 1) Each state has a "HandlerParentState", which is the state of the next
+ // outer handler enclosing this state's handler (same as nearest ancestor
+ // per the ParentPad linkage on EH pads, but skipping over catchswitches).
+ // 2) Each state has a "TryParentState", which:
+ // a) for a catchpad that's not the last handler on its catchswitch, is
+ // the state of the next catchpad on that catchswitch
+ // b) for all other pads, is the state of the pad whose try region is the
+ // next outer try region enclosing this state's try region. The "try
+ // regions" are not present as such in the IR, but will be inferred
+ // based on the placement of invokes and pads which reach each other
+ // by exceptional exits.
+ // Catchswitches do not get their own states, but each gets mapped to the
+ // state of its first catchpad.
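+ //
+ // Illustrative sketch (not taken from this file): for a top-level
+ // catchswitch with two handlers C1 and C2, both catchpads get
+ // HandlerParentState = -1. Step one below walks the handlers in reverse,
+ // so C1 (not the last handler) has its TryParentState set to C2's state
+ // immediately, while C2's TryParentState is left at the sentinel -1 and
+ // filled in by step two from the catchswitch's unwind destination.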
+
+ // Step one: walk down from outermost to innermost funclets, assigning each
+ // catchpad and cleanuppad a state number. Add an entry to the
+ // ClrEHUnwindMap for each state, recording its HandlerParentState and
+ // handler attributes. Record the TryParentState as well for each catchpad
+ // that's not the last on its catchswitch, but initialize all other entries'
+ // TryParentStates to a sentinel -1 value that the next pass will update.
+
+ // Seed a worklist with pads that have no parent.
+ SmallVector<std::pair<const Instruction *, int>, 8> Worklist;
+ for (const BasicBlock &BB : *Fn) {
+ const Instruction *FirstNonPHI = BB.getFirstNonPHI();
+ const Value *ParentPad;
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(FirstNonPHI))
+ ParentPad = CPI->getParentPad();
+ else if (const auto *CSI = dyn_cast<CatchSwitchInst>(FirstNonPHI))
+ ParentPad = CSI->getParentPad();
+ else
+ continue;
+ if (isa<ConstantTokenNone>(ParentPad))
+ Worklist.emplace_back(FirstNonPHI, -1);
+ }
+
+ // Use the worklist to visit all pads, from outer to inner. Record
+ // HandlerParentState for all pads. Record TryParentState only for catchpads
+ // that aren't the last on their catchswitch (setting all other entries'
+ // TryParentStates to an initial value of -1). This loop is also responsible
+ // for setting the EHPadStateMap entry for all catchpads, cleanuppads, and
+ // catchswitches.
+ while (!Worklist.empty()) {
+ const Instruction *Pad;
+ int HandlerParentState;
+ std::tie(Pad, HandlerParentState) = Worklist.pop_back_val();
+
+ if (const auto *Cleanup = dyn_cast<CleanupPadInst>(Pad)) {
+ // Create the entry for this cleanup with the appropriate handler
+ // properties. Finally and fault handlers are distinguished by arity.
+ ClrHandlerType HandlerType =
+ (Cleanup->arg_size() ? ClrHandlerType::Fault
+ : ClrHandlerType::Finally);
+ int CleanupState = addClrEHHandler(FuncInfo, HandlerParentState, -1,
+ HandlerType, 0, Pad->getParent());
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Cleanup->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CleanupState);
+ // Remember this pad's state.
+ FuncInfo.EHPadStateMap[Cleanup] = CleanupState;
+ } else {
+ // Walk the handlers of this catchswitch in reverse order since all but
+ // the last need to set the following one as its TryParentState.
+ const auto *CatchSwitch = cast<CatchSwitchInst>(Pad);
+ int CatchState = -1, FollowerState = -1;
+ SmallVector<const BasicBlock *, 4> CatchBlocks(CatchSwitch->handlers());
+ for (const BasicBlock *CatchBlock : llvm::reverse(CatchBlocks)) {
+ // Create the entry for this catch with the appropriate handler
+ // properties.
+ const auto *Catch = cast<CatchPadInst>(CatchBlock->getFirstNonPHI());
+ uint32_t TypeToken = static_cast<uint32_t>(
+ cast<ConstantInt>(Catch->getArgOperand(0))->getZExtValue());
+ CatchState =
+ addClrEHHandler(FuncInfo, HandlerParentState, FollowerState,
+ ClrHandlerType::Catch, TypeToken, CatchBlock);
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Catch->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CatchState);
+ // Remember this catch's state.
+ FuncInfo.EHPadStateMap[Catch] = CatchState;
+ FollowerState = CatchState;
+ }
+ // Associate the catchswitch with the state of its first catch.
+ assert(CatchSwitch->getNumHandlers());
+ FuncInfo.EHPadStateMap[CatchSwitch] = CatchState;
+ }
+ }
+
+ // Step two: record the TryParentState of each state. For cleanuppads that
+ // don't have cleanuprets, we may need to infer this from their child pads,
+ // so visit pads in descendant-most to ancestor-most order.
+ for (ClrEHUnwindMapEntry &Entry : llvm::reverse(FuncInfo.ClrEHUnwindMap)) {
+ const Instruction *Pad =
+ cast<const BasicBlock *>(Entry.Handler)->getFirstNonPHI();
+ // For most pads, the TryParentState is the state associated with the
+ // unwind dest of exceptional exits from it.
+ const BasicBlock *UnwindDest;
+ if (const auto *Catch = dyn_cast<CatchPadInst>(Pad)) {
+ // If a catch is not the last in its catchswitch, its TryParentState is
+ // the state associated with the next catch in the switch, even though
+ // that's not the unwind dest of exceptions escaping the catch. Those
+ // cases were already assigned a TryParentState in the first pass, so
+ // skip them.
+ if (Entry.TryParentState != -1)
+ continue;
+ // Otherwise, get the unwind dest from the catchswitch.
+ UnwindDest = Catch->getCatchSwitch()->getUnwindDest();
+ } else {
+ const auto *Cleanup = cast<CleanupPadInst>(Pad);
+ UnwindDest = nullptr;
+ for (const User *U : Cleanup->users()) {
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
+ // Common and unambiguous case -- cleanupret indicates cleanup's
+ // unwind dest.
+ UnwindDest = CleanupRet->getUnwindDest();
+ break;
+ }
+
+ // Get an unwind dest for the user
+ const BasicBlock *UserUnwindDest = nullptr;
+ if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
+ UserUnwindDest = Invoke->getUnwindDest();
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(U)) {
+ UserUnwindDest = CatchSwitch->getUnwindDest();
+ } else if (auto *ChildCleanup = dyn_cast<CleanupPadInst>(U)) {
+ int UserState = FuncInfo.EHPadStateMap[ChildCleanup];
+ int UserUnwindState =
+ FuncInfo.ClrEHUnwindMap[UserState].TryParentState;
+ if (UserUnwindState != -1)
+ UserUnwindDest = cast<const BasicBlock *>(
+ FuncInfo.ClrEHUnwindMap[UserUnwindState].Handler);
+ }
+
+ // Not having an unwind dest for this user might indicate that it
+ // doesn't unwind, so can't be taken as proof that the cleanup itself
+ // may unwind to caller (see e.g. SimplifyUnreachable and
+ // RemoveUnwindEdge).
+ if (!UserUnwindDest)
+ continue;
+
+ // Now we have an unwind dest for the user, but we need to see if it
+ // unwinds all the way out of the cleanup or if it stays within it.
+ const Instruction *UserUnwindPad = UserUnwindDest->getFirstNonPHI();
+ const Value *UserUnwindParent;
+ if (auto *CSI = dyn_cast<CatchSwitchInst>(UserUnwindPad))
+ UserUnwindParent = CSI->getParentPad();
+ else
+ UserUnwindParent =
+ cast<CleanupPadInst>(UserUnwindPad)->getParentPad();
+
+ // The unwind stays within the cleanup iff it targets a child of the
+ // cleanup.
+ if (UserUnwindParent == Cleanup)
+ continue;
+
+ // This unwind exits the cleanup, so its dest is the cleanup's dest.
+ UnwindDest = UserUnwindDest;
+ break;
+ }
+ }
+
+ // Record the state of the unwind dest as the TryParentState.
+ int UnwindDestState;
+
+ // If UnwindDest is null at this point, either the pad in question can
+ // be exited by unwind to caller, or it cannot be exited by unwind. In
+ // either case, reporting such cases as unwinding to caller is correct.
+ // This can lead to EH tables that "look strange" -- if this pad is in
+ // a parent funclet which has other children that do unwind to an enclosing
+ // pad, the try region for this pad will be missing the "duplicate" EH
+ // clause entries that you'd expect to see covering the whole parent. That
+ // should be benign, since the unwind never actually happens. If it were
+ // an issue, we could add a subsequent pass that pushes unwind dests down
+ // from parents that have them to children that appear to unwind to caller.
+ if (!UnwindDest) {
+ UnwindDestState = -1;
+ } else {
+ UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()];
+ }
+
+ Entry.TryParentState = UnwindDestState;
+ }
+
+ // Step three: transfer information from pads to invokes.
+ calculateStateNumbersForInvokes(Fn, FuncInfo);
+}
+
+void WinEHPrepare::colorFunclets(Function &F) {
+ BlockColors = colorEHFunclets(F);
+
+ // Invert the map from BB to colors to color to BBs.
+ for (BasicBlock &BB : F) {
+ ColorVector &Colors = BlockColors[&BB];
+ for (BasicBlock *Color : Colors)
+ FuncletBlocks[Color].push_back(&BB);
+ }
+}
+
+void WinEHPrepare::demotePHIsOnFunclets(Function &F,
+ bool DemoteCatchSwitchPHIOnly) {
+ // Strip PHI nodes off of EH pads.
+ SmallVector<PHINode *, 16> PHINodes;
+ for (BasicBlock &BB : make_early_inc_range(F)) {
+ if (!BB.isEHPad())
+ continue;
+ if (DemoteCatchSwitchPHIOnly && !isa<CatchSwitchInst>(BB.getFirstNonPHI()))
+ continue;
+
+ for (Instruction &I : make_early_inc_range(BB)) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ // Stop at the first non-PHI.
+ if (!PN)
+ break;
+
+ AllocaInst *SpillSlot = insertPHILoads(PN, F);
+ if (SpillSlot)
+ insertPHIStores(PN, SpillSlot);
+
+ PHINodes.push_back(PN);
+ }
+ }
+
+ for (auto *PN : PHINodes) {
+ // There may be lingering uses on other EH PHIs being removed
+ PN->replaceAllUsesWith(PoisonValue::get(PN->getType()));
+ PN->eraseFromParent();
+ }
+}
+
+void WinEHPrepare::cloneCommonBlocks(Function &F) {
+ // We need to clone all blocks which belong to multiple funclets. Values are
+ // remapped throughout the funclet to propagate both the new instructions
+ // *and* the new basic blocks themselves.
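+ //
+ // Illustrative sketch (not taken from this file): if a block %shared is
+ // colored with two funclet entries %cleanup and %catch, it is cloned
+ // (e.g. as %shared.for.catch) while visiting one of those funclets; the
+ // original keeps the other color, so afterwards every block belongs to
+ // exactly one funclet.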
+ for (auto &Funclets : FuncletBlocks) {
+ BasicBlock *FuncletPadBB = Funclets.first;
+ std::vector<BasicBlock *> &BlocksInFunclet = Funclets.second;
+ Value *FuncletToken;
+ if (FuncletPadBB == &F.getEntryBlock())
+ FuncletToken = ConstantTokenNone::get(F.getContext());
+ else
+ FuncletToken = FuncletPadBB->getFirstNonPHI();
+
+ std::vector<std::pair<BasicBlock *, BasicBlock *>> Orig2Clone;
+ ValueToValueMapTy VMap;
+ for (BasicBlock *BB : BlocksInFunclet) {
+ ColorVector &ColorsForBB = BlockColors[BB];
+ // We don't need to do anything if the block is monochromatic.
+ size_t NumColorsForBB = ColorsForBB.size();
+ if (NumColorsForBB == 1)
+ continue;
+
+ DEBUG_WITH_TYPE("winehprepare-coloring",
+ dbgs() << " Cloning block \'" << BB->getName()
+ << "\' for funclet \'" << FuncletPadBB->getName()
+ << "\'.\n");
+
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB =
+ CloneBasicBlock(BB, VMap, Twine(".for.", FuncletPadBB->getName()));
+ // Insert the clone immediately after the original to ensure determinism
+ // and to keep the same relative ordering of any funclet's blocks.
+ CBB->insertInto(&F, BB->getNextNode());
+
+ // Add basic block mapping.
+ VMap[BB] = CBB;
+
+ // Record delta operations that we need to perform to our color mappings.
+ Orig2Clone.emplace_back(BB, CBB);
+ }
+
+ // If nothing was cloned, we're done cloning in this funclet.
+ if (Orig2Clone.empty())
+ continue;
+
+ // Update our color mappings to reflect that one block has lost a color and
+ // another has gained a color.
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+
+ BlocksInFunclet.push_back(NewBlock);
+ ColorVector &NewColors = BlockColors[NewBlock];
+ assert(NewColors.empty() && "A new block should only have one color!");
+ NewColors.push_back(FuncletPadBB);
+
+ DEBUG_WITH_TYPE("winehprepare-coloring",
+ dbgs() << " Assigned color \'" << FuncletPadBB->getName()
+ << "\' to block \'" << NewBlock->getName()
+ << "\'.\n");
+
+ llvm::erase_value(BlocksInFunclet, OldBlock);
+ ColorVector &OldColors = BlockColors[OldBlock];
+ llvm::erase_value(OldColors, FuncletPadBB);
+
+ DEBUG_WITH_TYPE("winehprepare-coloring",
+ dbgs() << " Removed color \'" << FuncletPadBB->getName()
+ << "\' from block \'" << OldBlock->getName()
+ << "\'.\n");
+ }
+
+ // Loop over all of the instructions in this funclet, fixing up operand
+ // references as we go. This uses VMap to do all the hard work.
+ for (BasicBlock *BB : BlocksInFunclet)
+ // Loop over all instructions, fixing each one as we find it...
+ for (Instruction &I : *BB)
+ RemapInstruction(&I, VMap,
+ RF_IgnoreMissingLocals | RF_NoModuleLevelChanges);
+
+ // Catchrets targeting cloned blocks need to be updated separately from
+ // the loop above because they are not in the current funclet.
+ SmallVector<CatchReturnInst *, 2> FixupCatchrets;
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+
+ FixupCatchrets.clear();
+ for (BasicBlock *Pred : predecessors(OldBlock))
+ if (auto *CatchRet = dyn_cast<CatchReturnInst>(Pred->getTerminator()))
+ if (CatchRet->getCatchSwitchParentPad() == FuncletToken)
+ FixupCatchrets.push_back(CatchRet);
+
+ for (CatchReturnInst *CatchRet : FixupCatchrets)
+ CatchRet->setSuccessor(NewBlock);
+ }
+
+ auto UpdatePHIOnClonedBlock = [&](PHINode *PN, bool IsForOldBlock) {
+ unsigned NumPreds = PN->getNumIncomingValues();
+ for (unsigned PredIdx = 0, PredEnd = NumPreds; PredIdx != PredEnd;
+ ++PredIdx) {
+ BasicBlock *IncomingBlock = PN->getIncomingBlock(PredIdx);
+ bool EdgeTargetsFunclet;
+ if (auto *CRI =
+ dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
+ EdgeTargetsFunclet = (CRI->getCatchSwitchParentPad() == FuncletToken);
+ } else {
+ ColorVector &IncomingColors = BlockColors[IncomingBlock];
+ assert(!IncomingColors.empty() && "Block not colored!");
+ assert((IncomingColors.size() == 1 ||
+ !llvm::is_contained(IncomingColors, FuncletPadBB)) &&
+ "Cloning should leave this funclet's blocks monochromatic");
+ EdgeTargetsFunclet = (IncomingColors.front() == FuncletPadBB);
+ }
+ if (IsForOldBlock != EdgeTargetsFunclet)
+ continue;
+ PN->removeIncomingValue(IncomingBlock, /*DeletePHIIfEmpty=*/false);
+ // Revisit the next entry.
+ --PredIdx;
+ --PredEnd;
+ }
+ };
+
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+ for (PHINode &OldPN : OldBlock->phis()) {
+ UpdatePHIOnClonedBlock(&OldPN, /*IsForOldBlock=*/true);
+ }
+ for (PHINode &NewPN : NewBlock->phis()) {
+ UpdatePHIOnClonedBlock(&NewPN, /*IsForOldBlock=*/false);
+ }
+ }
+
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to
+ // the PHI nodes for NewBB now.
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+ for (BasicBlock *SuccBB : successors(NewBlock)) {
+ for (PHINode &SuccPN : SuccBB->phis()) {
+ // Ok, we have a PHI node. Figure out what the incoming value was for
+ // the OldBlock.
+ int OldBlockIdx = SuccPN.getBasicBlockIndex(OldBlock);
+ if (OldBlockIdx == -1)
+ break;
+ Value *IV = SuccPN.getIncomingValue(OldBlockIdx);
+
+ // Remap the value if necessary.
+ if (auto *Inst = dyn_cast<Instruction>(IV)) {
+ ValueToValueMapTy::iterator I = VMap.find(Inst);
+ if (I != VMap.end())
+ IV = I->second;
+ }
+
+ SuccPN.addIncoming(IV, NewBlock);
+ }
+ }
+ }
+
+ for (ValueToValueMapTy::value_type VT : VMap) {
+ // If there were values defined in BB that are used outside the funclet,
+ // then we now have to update all uses of the value to use either the
+ // original value, the cloned value, or some PHI derived value. This can
+ // require arbitrary PHI insertion, which we are prepared to do; clean
+ // these up now.
+ SmallVector<Use *, 16> UsesToRename;
+
+ auto *OldI = dyn_cast<Instruction>(const_cast<Value *>(VT.first));
+ if (!OldI)
+ continue;
+ auto *NewI = cast<Instruction>(VT.second);
+ // Scan all uses of this instruction to see if it is used outside of its
+ // funclet, and if so, record them in UsesToRename.
+ for (Use &U : OldI->uses()) {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ BasicBlock *UserBB = UserI->getParent();
+ ColorVector &ColorsForUserBB = BlockColors[UserBB];
+ assert(!ColorsForUserBB.empty());
+ if (ColorsForUserBB.size() > 1 ||
+ *ColorsForUserBB.begin() != FuncletPadBB)
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this
+ // instruction.
+ if (UsesToRename.empty())
+ continue;
+
+ // We found a use of OldI outside of the funclet. Rename all uses of OldI
+ // that are outside its funclet to be uses of the appropriate PHI node
+ // etc.
+ SSAUpdater SSAUpdate;
+ SSAUpdate.Initialize(OldI->getType(), OldI->getName());
+ SSAUpdate.AddAvailableValue(OldI->getParent(), OldI);
+ SSAUpdate.AddAvailableValue(NewI->getParent(), NewI);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUseAfterInsertions(*UsesToRename.pop_back_val());
+ }
+ }
+}
+
+void WinEHPrepare::removeImplausibleInstructions(Function &F) {
+ // Remove implausible terminators and replace them with UnreachableInst.
+ for (auto &Funclet : FuncletBlocks) {
+ BasicBlock *FuncletPadBB = Funclet.first;
+ std::vector<BasicBlock *> &BlocksInFunclet = Funclet.second;
+ Instruction *FirstNonPHI = FuncletPadBB->getFirstNonPHI();
+ auto *FuncletPad = dyn_cast<FuncletPadInst>(FirstNonPHI);
+ auto *CatchPad = dyn_cast_or_null<CatchPadInst>(FuncletPad);
+ auto *CleanupPad = dyn_cast_or_null<CleanupPadInst>(FuncletPad);
+
+ for (BasicBlock *BB : BlocksInFunclet) {
+ for (Instruction &I : *BB) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+
+ Value *FuncletBundleOperand = nullptr;
+ if (auto BU = CB->getOperandBundle(LLVMContext::OB_funclet))
+ FuncletBundleOperand = BU->Inputs.front();
+
+ if (FuncletBundleOperand == FuncletPad)
+ continue;
+
+ // Skip call sites which are nounwind intrinsics or inline asm.
+ auto *CalledFn =
+ dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
+ if (CalledFn && ((CalledFn->isIntrinsic() && CB->doesNotThrow()) ||
+ CB->isInlineAsm()))
+ continue;
+
+ // This call site was not part of this funclet, remove it.
+ if (isa<InvokeInst>(CB)) {
+ // Remove the unwind edge if it was an invoke.
+ removeUnwindEdge(BB);
+ // Get a pointer to the new call.
+ BasicBlock::iterator CallI =
+ std::prev(BB->getTerminator()->getIterator());
+ auto *CI = cast<CallInst>(&*CallI);
+ changeToUnreachable(CI);
+ } else {
+ changeToUnreachable(&I);
+ }
+
+ // There are no more instructions in the block (except for unreachable),
+ // so we are done.
+ break;
+ }
+
+ Instruction *TI = BB->getTerminator();
+ // CatchPadInst and CleanupPadInst can't transfer control to a ReturnInst.
+ bool IsUnreachableRet = isa<ReturnInst>(TI) && FuncletPad;
+ // The token consumed by a CatchReturnInst must match the funclet token.
+ bool IsUnreachableCatchret = false;
+ if (auto *CRI = dyn_cast<CatchReturnInst>(TI))
+ IsUnreachableCatchret = CRI->getCatchPad() != CatchPad;
+ // The token consumed by a CleanupReturnInst must match the funclet token.
+ bool IsUnreachableCleanupret = false;
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(TI))
+ IsUnreachableCleanupret = CRI->getCleanupPad() != CleanupPad;
+ if (IsUnreachableRet || IsUnreachableCatchret ||
+ IsUnreachableCleanupret) {
+ changeToUnreachable(TI);
+ } else if (isa<InvokeInst>(TI)) {
+ if (Personality == EHPersonality::MSVC_CXX && CleanupPad) {
+ // Invokes within a cleanuppad for the MSVC++ personality never
+ // transfer control to their unwind edge: the personality will
+ // terminate the program.
+ removeUnwindEdge(BB);
+ }
+ }
+ }
+ }
+}
+
+void WinEHPrepare::cleanupPreparedFunclets(Function &F) {
+ // Clean up some of the mess we made by removing useless PHI nodes, trivial
+ // branches, etc.
+ for (BasicBlock &BB : llvm::make_early_inc_range(F)) {
+ SimplifyInstructionsInBlock(&BB);
+ ConstantFoldTerminator(&BB, /*DeleteDeadConditions=*/true);
+ MergeBlockIntoPredecessor(&BB);
+ }
+
+ // We might have some unreachable blocks after cleaning up some impossible
+ // control flow.
+ removeUnreachableBlocks(F);
+}
+
+#ifndef NDEBUG
+void WinEHPrepare::verifyPreparedFunclets(Function &F) {
+ for (BasicBlock &BB : F) {
+ size_t NumColors = BlockColors[&BB].size();
+ assert(NumColors == 1 && "Expected monochromatic BB!");
+ if (NumColors == 0)
+ report_fatal_error("Uncolored BB!");
+ if (NumColors > 1)
+ report_fatal_error("Multicolor BB!");
+ assert((DisableDemotion || !(BB.isEHPad() && isa<PHINode>(BB.begin()))) &&
+ "EH Pad still has a PHI!");
+ }
+}
+#endif
+
+bool WinEHPrepare::prepareExplicitEH(Function &F) {
+ // Remove unreachable blocks. It is not valuable to assign them a color and
+ // their existence can trick us into thinking values are alive when they are
+ // not.
+ removeUnreachableBlocks(F);
+
+ // Determine which blocks are reachable from which funclet entries.
+ colorFunclets(F);
+
+ cloneCommonBlocks(F);
+
+ if (!DisableDemotion)
+ demotePHIsOnFunclets(F, DemoteCatchSwitchPHIOnly ||
+ DemoteCatchSwitchPHIOnlyOpt);
+
+ if (!DisableCleanups) {
+ assert(!verifyFunction(F, &dbgs()));
+ removeImplausibleInstructions(F);
+
+ assert(!verifyFunction(F, &dbgs()));
+ cleanupPreparedFunclets(F);
+ }
+
+ LLVM_DEBUG(verifyPreparedFunclets(F));
+ // Recolor the CFG to verify that all is well.
+ LLVM_DEBUG(colorFunclets(F));
+ LLVM_DEBUG(verifyPreparedFunclets(F));
+
+ BlockColors.clear();
+ FuncletBlocks.clear();
+
+ return true;
+}
+
+// TODO: Share loads when one use dominates another, or when a catchpad exit
+// dominates uses (needs dominators).
+AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) {
+ BasicBlock *PHIBlock = PN->getParent();
+ AllocaInst *SpillSlot = nullptr;
+ Instruction *EHPad = PHIBlock->getFirstNonPHI();
+
+ if (!EHPad->isTerminator()) {
+ // If the EHPad isn't a terminator, then we can insert a load in this block
+ // that will dominate all uses.
+ SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr,
+ Twine(PN->getName(), ".wineh.spillslot"),
+ &F.getEntryBlock().front());
+ Value *V = new LoadInst(PN->getType(), SpillSlot,
+ Twine(PN->getName(), ".wineh.reload"),
+ &*PHIBlock->getFirstInsertionPt());
+ PN->replaceAllUsesWith(V);
+ return SpillSlot;
+ }
+
+ // Otherwise, we have a PHI on a terminator EHPad, and we give up and insert
+ // loads of the slot before every use.
+ DenseMap<BasicBlock *, Value *> Loads;
+ for (Use &U : llvm::make_early_inc_range(PN->uses())) {
+ auto *UsingInst = cast<Instruction>(U.getUser());
+ if (isa<PHINode>(UsingInst) && UsingInst->getParent()->isEHPad()) {
+ // Use is on an EH pad phi. Leave it alone; we'll insert loads and
+ // stores for it separately.
+ continue;
+ }
+ replaceUseWithLoad(PN, U, SpillSlot, Loads, F);
+ }
+ return SpillSlot;
+}
+
+// TODO: improve store placement. Inserting at def is probably good, but need
+// to be careful not to introduce interfering stores (needs liveness analysis).
+// TODO: identify related phi nodes that can share spill slots, and share them
+// (also needs liveness).
+void WinEHPrepare::insertPHIStores(PHINode *OriginalPHI,
+ AllocaInst *SpillSlot) {
+ // Use a worklist of (Block, Value) pairs -- the given Value needs to be
+ // stored to the spill slot by the end of the given Block.
+ SmallVector<std::pair<BasicBlock *, Value *>, 4> Worklist;
+
+ Worklist.push_back({OriginalPHI->getParent(), OriginalPHI});
+
+ while (!Worklist.empty()) {
+ BasicBlock *EHBlock;
+ Value *InVal;
+ std::tie(EHBlock, InVal) = Worklist.pop_back_val();
+
+ PHINode *PN = dyn_cast<PHINode>(InVal);
+ if (PN && PN->getParent() == EHBlock) {
+ // The value is defined by another PHI we need to remove, with no room to
+ // insert a store after the PHI, so each predecessor needs to store its
+ // incoming value.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) {
+ Value *PredVal = PN->getIncomingValue(i);
+
+ // Undef can safely be skipped.
+ if (isa<UndefValue>(PredVal))
+ continue;
+
+ insertPHIStore(PN->getIncomingBlock(i), PredVal, SpillSlot, Worklist);
+ }
+ } else {
+ // We need to store InVal, which dominates EHBlock, but can't put a store
+ // in EHBlock, so need to put stores in each predecessor.
+ for (BasicBlock *PredBlock : predecessors(EHBlock)) {
+ insertPHIStore(PredBlock, InVal, SpillSlot, Worklist);
+ }
+ }
+ }
+}
+
+void WinEHPrepare::insertPHIStore(
+ BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot,
+ SmallVectorImpl<std::pair<BasicBlock *, Value *>> &Worklist) {
+
+ if (PredBlock->isEHPad() && PredBlock->getFirstNonPHI()->isTerminator()) {
+ // Pred is unsplittable, so we need to queue it on the worklist.
+ Worklist.push_back({PredBlock, PredVal});
+ return;
+ }
+
+ // Otherwise, insert the store at the end of the basic block.
+ new StoreInst(PredVal, SpillSlot, PredBlock->getTerminator());
+}
+
+void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
+ DenseMap<BasicBlock *, Value *> &Loads,
+ Function &F) {
+ // Lazily create the spill slot.
+ if (!SpillSlot)
+ SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr,
+ Twine(V->getName(), ".wineh.spillslot"),
+ &F.getEntryBlock().front());
+
+ auto *UsingInst = cast<Instruction>(U.getUser());
+ if (auto *UsingPHI = dyn_cast<PHINode>(UsingInst)) {
+ // If this is a PHI node, we can't insert a load of the value before
+ // the use. Instead insert the load in the predecessor block
+ // corresponding to the incoming value.
+ //
+ // Note that if there are multiple edges from a basic block to this
+ // PHI node, we cannot have multiple loads. The problem is that
+ // the resulting PHI node will have multiple values (from each load)
+ // coming in from the same block, which is illegal SSA form.
+ // For this reason, we keep track of and reuse loads we insert.
+ BasicBlock *IncomingBlock = UsingPHI->getIncomingBlock(U);
+ if (auto *CatchRet =
+ dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
+ // Putting a load above a catchret and use on the phi would still leave
+ // a cross-funclet def/use. We need to split the edge, change the
+ // catchret to target the new block, and put the load there.
+ BasicBlock *PHIBlock = UsingInst->getParent();
+ BasicBlock *NewBlock = SplitEdge(IncomingBlock, PHIBlock);
+ // SplitEdge gives us:
+ // IncomingBlock:
+ // ...
+ // br label %NewBlock
+ // NewBlock:
+ // catchret label %PHIBlock
+ // But we need:
+ // IncomingBlock:
+ // ...
+ // catchret label %NewBlock
+ // NewBlock:
+ // br label %PHIBlock
+ // So move the terminators to each others' blocks and swap their
+ // successors.
+ BranchInst *Goto = cast<BranchInst>(IncomingBlock->getTerminator());
+ Goto->removeFromParent();
+ CatchRet->removeFromParent();
+ CatchRet->insertInto(IncomingBlock, IncomingBlock->end());
+ Goto->insertInto(NewBlock, NewBlock->end());
+ Goto->setSuccessor(0, PHIBlock);
+ CatchRet->setSuccessor(NewBlock);
+ // Update the color mapping for the newly split edge.
+ // Grab a reference to the ColorVector to be inserted before getting the
+ // reference to the vector we are copying because inserting the new
+ // element in BlockColors might cause the map to be reallocated.
+ ColorVector &ColorsForNewBlock = BlockColors[NewBlock];
+ ColorVector &ColorsForPHIBlock = BlockColors[PHIBlock];
+ ColorsForNewBlock = ColorsForPHIBlock;
+ for (BasicBlock *FuncletPad : ColorsForPHIBlock)
+ FuncletBlocks[FuncletPad].push_back(NewBlock);
+ // Treat the new block as incoming for load insertion.
+ IncomingBlock = NewBlock;
+ }
+ Value *&Load = Loads[IncomingBlock];
+ // Insert the load into the predecessor block
+ if (!Load)
+ Load = new LoadInst(V->getType(), SpillSlot,
+ Twine(V->getName(), ".wineh.reload"),
+ /*isVolatile=*/false, IncomingBlock->getTerminator());
+
+ U.set(Load);
+ } else {
+ // Reload right before the old use.
+ auto *Load = new LoadInst(V->getType(), SpillSlot,
+ Twine(V->getName(), ".wineh.reload"),
+ /*isVolatile=*/false, UsingInst);
+ U.set(Load);
+ }
+}
+
+void WinEHFuncInfo::addIPToStateRange(const InvokeInst *II,
+ MCSymbol *InvokeBegin,
+ MCSymbol *InvokeEnd) {
+ assert(InvokeStateMap.count(II) &&
+ "should get invoke with precomputed state");
+ LabelToStateMap[InvokeBegin] = std::make_pair(InvokeStateMap[II], InvokeEnd);
+}
+
+void WinEHFuncInfo::addIPToStateRange(int State, MCSymbol *InvokeBegin,
+ MCSymbol *InvokeEnd) {
+ LabelToStateMap[InvokeBegin] = std::make_pair(State, InvokeEnd);
+}
+
+WinEHFuncInfo::WinEHFuncInfo() = default;
diff --git a/contrib/llvm-project/llvm/lib/CodeGen/XRayInstrumentation.cpp b/contrib/llvm-project/llvm/lib/CodeGen/XRayInstrumentation.cpp
new file mode 100644
index 000000000000..d40725838c94
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/CodeGen/XRayInstrumentation.cpp
@@ -0,0 +1,269 @@
+//===- XRayInstrumentation.cpp - Adds XRay instrumentation to functions. --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a MachineFunctionPass that inserts the appropriate
+// XRay instrumentation instructions. We look for XRay-specific attributes
+// on the function to determine whether we should insert the replacement
+// operations.
+//
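+// As an illustrative sketch (not taken from this file), a function carrying
+// the attribute "function-instrument"="xray-always" is always instrumented,
+// while one carrying only "xray-instruction-threshold"="200" is instrumented
+// only if it is large enough or contains a loop; see runOnMachineFunction
+// below for the exact rules.
+//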
+//===---------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Triple.h"
+
+using namespace llvm;
+
+namespace {
+
+struct InstrumentationOptions {
+ // Whether to emit PATCHABLE_TAIL_CALL.
+ bool HandleTailcall;
+
+ // Whether to emit PATCHABLE_RET/PATCHABLE_FUNCTION_EXIT for all forms of
+ // return, e.g. conditional return.
+ bool HandleAllReturns;
+};
+
+struct XRayInstrumentation : public MachineFunctionPass {
+ static char ID;
+
+ XRayInstrumentation() : MachineFunctionPass(ID) {
+ initializeXRayInstrumentationPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ // Replace the original RET instruction with the exit sled code ("patchable
+ // ret" pseudo-instruction), so that at runtime XRay can replace the sled
+ // with a code jumping to XRay trampoline, which calls the tracing handler
+ // and, in the end, issues the RET instruction.
+ // This is the approach to take on CPUs which have a single RET instruction,
+ // like x86/x86_64.
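+ //
+ // Illustrative sketch (not taken from this file): a return such as RET64
+ // on x86_64 is rewritten into the pseudo
+ //   PATCHABLE_RET <original opcode>, <original operands>
+ // which is later lowered by the AsmPrinter into the XRay exit sled.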
+ void replaceRetWithPatchableRet(MachineFunction &MF,
+ const TargetInstrInfo *TII,
+ InstrumentationOptions);
+
+ // Prepend the original return instruction with the exit sled code ("patchable
+ // function exit" pseudo-instruction), preserving the original return
+ // instruction just after the exit sled code.
+ // This is the approach to take on CPUs which have multiple options for the
+ // return instruction, like ARM. For such CPUs we can't just jump into the
+ // XRay trampoline and issue a single return instruction there. We rather
+ // have to call the trampoline and return from it to the original return
+ // instruction of the function being instrumented.
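+ //
+ // Illustrative sketch (not taken from this file): on ARM a return such as
+ // BX_RET is kept, and the pseudo PATCHABLE_FUNCTION_EXIT is inserted
+ // immediately before it, so the sled can call into the XRay trampoline and
+ // then fall through to the original return.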
+ void prependRetWithPatchableExit(MachineFunction &MF,
+ const TargetInstrInfo *TII,
+ InstrumentationOptions);
+};
+
+} // end anonymous namespace
+
+void XRayInstrumentation::replaceRetWithPatchableRet(
+ MachineFunction &MF, const TargetInstrInfo *TII,
+ InstrumentationOptions op) {
+ // We look for *all* terminators and returns, then replace those with
+ // PATCHABLE_RET instructions.
+ SmallVector<MachineInstr *, 4> Terminators;
+ for (auto &MBB : MF) {
+ for (auto &T : MBB.terminators()) {
+ unsigned Opc = 0;
+ if (T.isReturn() &&
+ (op.HandleAllReturns || T.getOpcode() == TII->getReturnOpcode())) {
+ // Replace return instructions with:
+ // PATCHABLE_RET <Opcode>, <Operand>...
+ Opc = TargetOpcode::PATCHABLE_RET;
+ }
+ if (TII->isTailCall(T) && op.HandleTailcall) {
+ // Treat the tail call as a return instruction, which has a
+ // different-looking sled than the normal return case.
+ Opc = TargetOpcode::PATCHABLE_TAIL_CALL;
+ }
+ if (Opc != 0) {
+ auto MIB = BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc))
+ .addImm(T.getOpcode());
+ for (auto &MO : T.operands())
+ MIB.add(MO);
+ Terminators.push_back(&T);
+ if (T.shouldUpdateCallSiteInfo())
+ MF.eraseCallSiteInfo(&T);
+ }
+ }
+ }
+
+ for (auto &I : Terminators)
+ I->eraseFromParent();
+}
+
+void XRayInstrumentation::prependRetWithPatchableExit(
+ MachineFunction &MF, const TargetInstrInfo *TII,
+ InstrumentationOptions op) {
+ for (auto &MBB : MF)
+ for (auto &T : MBB.terminators()) {
+ unsigned Opc = 0;
+ if (T.isReturn() &&
+ (op.HandleAllReturns || T.getOpcode() == TII->getReturnOpcode())) {
+ Opc = TargetOpcode::PATCHABLE_FUNCTION_EXIT;
+ }
+ if (TII->isTailCall(T) && op.HandleTailcall) {
+ Opc = TargetOpcode::PATCHABLE_TAIL_CALL;
+ }
+ if (Opc != 0) {
+ // Prepend the return instruction with PATCHABLE_FUNCTION_EXIT or
+ // PATCHABLE_TAIL_CALL.
+ BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc));
+ }
+ }
+}
+
+bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
+ auto &F = MF.getFunction();
+ auto InstrAttr = F.getFnAttribute("function-instrument");
+ bool AlwaysInstrument = InstrAttr.isStringAttribute() &&
+ InstrAttr.getValueAsString() == "xray-always";
+ bool NeverInstrument = InstrAttr.isStringAttribute() &&
+ InstrAttr.getValueAsString() == "xray-never";
+ if (NeverInstrument && !AlwaysInstrument)
+ return false;
+ auto IgnoreLoopsAttr = F.getFnAttribute("xray-ignore-loops");
+
+ uint64_t XRayThreshold = 0;
+ if (!AlwaysInstrument) {
+ bool IgnoreLoops = IgnoreLoopsAttr.isValid();
+ XRayThreshold = F.getFnAttributeAsParsedInteger(
+ "xray-instruction-threshold", std::numeric_limits<uint64_t>::max());
+ if (XRayThreshold == std::numeric_limits<uint64_t>::max())
+ return false;
+
+ // Count the number of MachineInstrs in the MachineFunction.
+ uint64_t MICount = 0;
+ for (const auto &MBB : MF)
+ MICount += MBB.size();
+
+ bool TooFewInstrs = MICount < XRayThreshold;
+
+ if (!IgnoreLoops) {
+ // Get MachineDominatorTree or compute it on the fly if it's unavailable
+ auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ MachineDominatorTree ComputedMDT;
+ if (!MDT) {
+ ComputedMDT.getBase().recalculate(MF);
+ MDT = &ComputedMDT;
+ }
+
+ // Get MachineLoopInfo or compute it on the fly if it's unavailable
+ auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ MachineLoopInfo ComputedMLI;
+ if (!MLI) {
+ ComputedMLI.getBase().analyze(MDT->getBase());
+ MLI = &ComputedMLI;
+ }
+
+ // Check if we have a loop.
+ // FIXME: Maybe make this smarter, and see whether the loops are dependent
+ // on inputs or side-effects?
+ if (MLI->empty() && TooFewInstrs)
+ return false; // Function is too small and has no loops.
+ } else if (TooFewInstrs) {
+ // Function is too small
+ return false;
+ }
+ }
+
+ // We look for the first non-empty MachineBasicBlock, so that we can insert
+ // the function instrumentation in the appropriate place.
+ auto MBI = llvm::find_if(
+ MF, [&](const MachineBasicBlock &MBB) { return !MBB.empty(); });
+ if (MBI == MF.end())
+ return false; // The function is empty.
+
+ auto *TII = MF.getSubtarget().getInstrInfo();
+ auto &FirstMBB = *MBI;
+ auto &FirstMI = *FirstMBB.begin();
+
+ if (!MF.getSubtarget().isXRaySupported()) {
+ FirstMI.emitError("An attempt to perform XRay instrumentation for an"
+ " unsupported target.");
+ return false;
+ }
+
+ if (!F.hasFnAttribute("xray-skip-entry")) {
+ // First, insert a PATCHABLE_FUNCTION_ENTER as the first instruction of the
+ // MachineFunction.
+ BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_FUNCTION_ENTER));
+ }
+
+ if (!F.hasFnAttribute("xray-skip-exit")) {
+ switch (MF.getTarget().getTargetTriple().getArch()) {
+ case Triple::ArchType::arm:
+ case Triple::ArchType::thumb:
+ case Triple::ArchType::aarch64:
+ case Triple::ArchType::hexagon:
+ case Triple::ArchType::loongarch64:
+ case Triple::ArchType::mips:
+ case Triple::ArchType::mipsel:
+ case Triple::ArchType::mips64:
+ case Triple::ArchType::mips64el: {
+ // For the architectures which don't have a single return instruction
+ InstrumentationOptions op;
+ op.HandleTailcall = false;
+ op.HandleAllReturns = true;
+ prependRetWithPatchableExit(MF, TII, op);
+ break;
+ }
+ case Triple::ArchType::ppc64le: {
+ // PPC has conditional returns. Turn them into branch and plain returns.
+ InstrumentationOptions op;
+ op.HandleTailcall = false;
+ op.HandleAllReturns = true;
+ replaceRetWithPatchableRet(MF, TII, op);
+ break;
+ }
+ default: {
+ // For the architectures that have a single return instruction (such as
+ // RETQ on x86_64).
+ InstrumentationOptions op;
+ op.HandleTailcall = true;
+ op.HandleAllReturns = false;
+ replaceRetWithPatchableRet(MF, TII, op);
+ break;
+ }
+ }
+ }
+ return true;
+}
+
+char XRayInstrumentation::ID = 0;
+char &llvm::XRayInstrumentationID = XRayInstrumentation::ID;
+INITIALIZE_PASS_BEGIN(XRayInstrumentation, "xray-instrumentation",
+ "Insert XRay ops", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(XRayInstrumentation, "xray-instrumentation",
+ "Insert XRay ops", false, false)